1 
















































    1 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Tracing hooks
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
 *
 * This file defines hook entry points called by core code where
 * user tracing/debugging support might need to do something.  These
 * entry points are called tracehook_*().  Each hook declared below
 * has a detailed kerneldoc comment giving the context (locking et
 * al) from which it is called, and the meaning of its return value.
 *
 * Each function here typically has only one call site, so it is ok
 * to have some nontrivial tracehook_*() inlines.  In all cases, the
 * fast path when no tracing is enabled should be very short.
 *
 * The purpose of this file and the tracehook_* layer is to consolidate
 * the interface that the kernel core and arch code uses to enable any
 * user debugging or tracing facility (such as ptrace).  The interfaces
 * here are carefully documented so that maintainers of core and arch
 * code do not need to think about the implementation details of the
 * tracing facilities.  Likewise, maintainers of the tracing code do not
 * need to understand all the calling core or arch code in detail, just
 * documented circumstances of each call, such as locking conditions.
 *
 * If the calling core code changes so that locking is different, then
 * it is ok to change the interface documented here.  The maintainer of
 * core code changing should notify the maintainers of the tracing code
 * that they need to work out the change.
 *
 * Some tracehook_*() inlines take arguments that the current tracing
 * implementations might not necessarily use.  These function signatures
 * are chosen to pass in all the information that is on hand in the
 * caller and might conceivably be relevant to a tracer, so that the
 * core code won't have to be updated when tracing adds more features.
 * If a call site changes so that some of those parameters are no longer
 * already on hand without extra work, then the tracehook_* interface
 * can change so there is no make-work burden on the core code.  The
 * maintainer of core code changing should notify the maintainers of the
 * tracing code that they need to work out the change.
 */

#ifndef _LINUX_TRACEHOOK_H
#define _LINUX_TRACEHOOK_H        1

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/blk-cgroup.h>
struct linux_binprm;

/*
 * ptrace report for syscall entry and exit looks identical.
 */
/*
 * Common reporting path for syscall entry and syscall exit.
 *
 * @regs is accepted for symmetry with the callers but is not used here.
 * @message is published in current->ptrace_message for the duration of
 * the stop and reset to 0 afterwards.
 *
 * Returns 0 when the task is not being ptraced, otherwise the result of
 * fatal_signal_pending(current) after the stop.
 */
static inline int ptrace_report_syscall(struct pt_regs *regs,
                                        unsigned long message)
{
        int flags = current->ptrace;
        int trap = SIGTRAP;

        /* Nothing to report unless PT_PTRACED is set. */
        if (!(flags & PT_PTRACED))
                return 0;

        /* With PT_TRACESYSGOOD the reported trap is tagged with 0x80. */
        if (flags & PT_TRACESYSGOOD)
                trap |= 0x80;

        current->ptrace_message = message;
        ptrace_notify(trap);

        /*
         * Re-sending exit_code as a signal is not the same as continuing
         * with a signal, but it will do for normal use: strace only
         * continues with a signal if the stopping signal is not SIGTRAP.
         */
        if (current->exit_code) {
                send_sig(current->exit_code, current, 1);
                current->exit_code = 0;
        }

        current->ptrace_message = 0;
        return fatal_signal_pending(current);
}

/**
 * tracehook_report_syscall_entry - task is about to attempt a system call
 * @regs:                user register state of current task
 *
 * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set,
 * when the current task has just entered the kernel for a system call.
 * Full user register state is available here.  Changing the values
 * in @regs can affect the system call number and arguments to be tried.
 * It is safe to block here, preventing the system call from beginning.
 *
 * Returns zero normally, or nonzero if the calling arch code should abort
 * the system call.  That must prevent normal entry so no system call is
 * made.  If the current task ever returns to user mode after this, its
 * register state is unspecified, but should be something harmless like an
 * %ENOSYS error return.  It should preserve enough information so that
 * syscall_rollback() can work (see asm-generic/syscall.h).
 *
 * Called without locks, just after entering kernel mode.
 */
static inline __must_check int tracehook_report_syscall_entry(
        struct pt_regs *regs)
{
        /* Entry and exit share one reporting helper; only the message differs. */
        int abort_syscall = ptrace_report_syscall(regs,
                                                  PTRACE_EVENTMSG_SYSCALL_ENTRY);

        return abort_syscall;
}

/**
 * tracehook_report_syscall_exit - task has just finished a system call
 * @regs:                user register state of current task
 * @step:                nonzero if simulating single-step or block-step
 *
 * This will be called if %TIF_SYSCALL_TRACE has been set, when the
 * current task has just finished an attempted system call.  Full
 * user register state is available here.  It is safe to block here,
 * preventing signals from being processed.
 *
 * If @step is nonzero, this report is also in lieu of the normal
 * trap that would follow the system call instruction because
 * user_enable_block_step() or user_enable_single_step() was used.
 * In this case, %TIF_SYSCALL_TRACE might not be set.
 *
 * Called without locks, just before checking for pending signals.
 */
static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
{
        /* A simulated single-step/block-step replaces the syscall report. */
        if (step) {
                user_single_step_report(regs);
                return;
        }

        /* The helper's return value is deliberately not used on exit. */
        ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
}

/**
 * tracehook_signal_handler - signal handler setup is complete
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * Called by the arch code after a signal handler has been set up.
 * Register and stack state reflects the user handler about to run.
 * Signal mask changes have already been made.
 *
 * Called without locks, shortly before returning to user mode
 * (or handling more signals).
 */
static inline void tracehook_signal_handler(int stepping)
{
        /* Only a stepping debugger gets a notification here. */
        if (!stepping)
                return;

        ptrace_notify(SIGTRAP);
}

/**
 * set_notify_resume - cause tracehook_notify_resume() to be called
 * @task:                task that will call tracehook_notify_resume()
 *
 * Calling this arranges that @task will call tracehook_notify_resume()
 * before returning to user mode.  If it's already running in user mode,
 * it will enter the kernel and call tracehook_notify_resume() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
#ifdef TIF_NOTIFY_RESUME
        /* Flag was already set: whoever set it first has kicked the task. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
                return;

        kick_process(task);
#endif
}

/**
 * tracehook_notify_resume - report when about to return to user mode
 * @regs:                user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted.  %TIF_NOTIFY_RESUME is cleared here, at the
 * start of this function, before any pending work is looked at.  If
 * the flag gets set again asynchronously, this will be called again
 * before we return to user mode.
 *
 * Called without locks.
 */
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
        clear_thread_flag(TIF_NOTIFY_RESUME);
        /*
         * This barrier pairs with task_work_add()->set_notify_resume() after
         * hlist_add_head(task->task_works);
         */
        smp_mb__after_atomic();
        /* Run queued task_work callbacks, if there are any. */
        if (unlikely(current->task_works))
                task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
        /* Drop the reference held in the per-task request-key cache slot. */
        if (unlikely(current->cached_requested_key)) {
                key_put(current->cached_requested_key);
                current->cached_requested_key = NULL;
        }
#endif

        /* Finally call the memory-cgroup and blk-cgroup hooks. */
        mem_cgroup_handle_over_high();
        blkcg_maybe_throttle_current();
}

/*
 * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
 * is currently used by TWA_SIGNAL based task_work, which requires breaking
 * wait loops to ensure that task_work is noticed and run.
 */
static inline void tracehook_notify_signal(void)
{
        /*
         * Clear the flag before looking at the work list, so that a later
         * set_notify_signal() sees it unset and wakes or kicks the task again.
         */
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        /*
         * NOTE(review): same clear-flag / barrier / check sequence as
         * tracehook_notify_resume(); presumably this barrier likewise pairs
         * with the task_work_add() side - confirm.
         */
        smp_mb__after_atomic();
        if (current->task_works)
                task_work_run();
}

/*
 * Called when we have work to process from exit_to_user_mode_loop()
 */
static inline void set_notify_signal(struct task_struct *task)
{
        /* Already flagged: nothing more to do. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL))
                return;

        /* Try to wake an interruptible sleeper; otherwise kick the task. */
        if (!wake_up_state(task, TASK_INTERRUPTIBLE))
                kick_process(task);
}

#endif        /* <linux/tracehook.h> */
























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H

#include <linux/threads.h>
#include <linux/mm.h>                /* for struct page */
#include <linux/pagemap.h>

#define __HAVE_ARCH_PTE_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>

/* Default pgd-allocation hook: ignores @mm and always returns 0. */
static inline int __paravirt_pgd_alloc(struct mm_struct *mm)
{
        return 0;
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
/*
 * Fallbacks for builds without CONFIG_PARAVIRT_XXL: pgd allocation maps to
 * the default hook and every other paravirt page-table hook is an empty
 * inline function.
 */
#define paravirt_pgd_alloc(mm)        __paravirt_pgd_alloc(mm)

static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
}

static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
}

static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
}

static inline void paravirt_alloc_pmd_clone(unsigned long pfn,
                                            unsigned long clonepfn,
                                            unsigned long start,
                                            unsigned long count)
{
}

static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
}

static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
}

static inline void paravirt_release_pte(unsigned long pfn)
{
}

static inline void paravirt_release_pmd(unsigned long pfn)
{
}

static inline void paravirt_release_pud(unsigned long pfn)
{
}

static inline void paravirt_release_p4d(unsigned long pfn)
{
}
#endif

/*
 * Flags to use when allocating a user page table page.
 */
extern gfp_t __userpte_alloc_gfp;

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
 * both 8k in size and 8k-aligned.  That lets us just flip bit 12
 * in a pointer to swap between the two 4k halves.
 */
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif

/*
 * Allocate and free page tables.
 */
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

extern pgtable_t pte_alloc_one(struct mm_struct *);

extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);

/*
 * Hand a PTE page over to ___pte_free_tlb().  @address is part of the
 * signature but is not used by this implementation.
 */
static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
                                  unsigned long address)
{
        ___pte_free_tlb(tlb, pte);
}

/*
 * Point @pmd at the PTE page @pte (given by its kernel virtual address):
 * notify the paravirt hook of the page's pfn, then write a _PAGE_TABLE
 * entry with set_pmd().
 */
static inline void pmd_populate_kernel(struct mm_struct *mm,
                                       pmd_t *pmd, pte_t *pte)
{
        unsigned long pfn = __pa(pte) >> PAGE_SHIFT;
        pmd_t entry;

        paravirt_alloc_pte(mm, pfn);

        entry = __pmd(__pa(pte) | _PAGE_TABLE);
        set_pmd(pmd, entry);
}

/*
 * Same as pmd_populate_kernel(), except that the entry is written with
 * set_pmd_safe() instead of set_pmd().
 */
static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
                                       pmd_t *pmd, pte_t *pte)
{
        unsigned long pfn = __pa(pte) >> PAGE_SHIFT;
        pmd_t entry;

        paravirt_alloc_pte(mm, pfn);

        entry = __pmd(__pa(pte) | _PAGE_TABLE);
        set_pmd_safe(pmd, entry);
}

/*
 * Point @pmd at the PTE page described by the struct page @pte: notify the
 * paravirt hook of its pfn, then write a _PAGE_TABLE entry for that frame.
 */
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
                                struct page *pte)
{
        unsigned long frame = page_to_pfn(pte);
        pmd_t entry;

        paravirt_alloc_pte(mm, frame);

        /* Widen to pteval_t before shifting the frame number into place. */
        entry = __pmd(((pteval_t)frame << PAGE_SHIFT) | _PAGE_TABLE);
        set_pmd(pmd, entry);
}

#define pmd_pgtable(pmd) pmd_page(pmd)

#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);

/*
 * Hand a PMD table over to ___pmd_free_tlb().  @address is part of the
 * signature but is not used by this implementation.
 */
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
                                  unsigned long address)
{
        ___pmd_free_tlb(tlb, pmd);
}

#ifdef CONFIG_X86_PAE
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
#else        /* !CONFIG_X86_PAE */
/*
 * Non-PAE variant: notify the paravirt hook of the PMD table's pfn, then
 * write a _PAGE_TABLE entry for it into @pud with set_pud().
 */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pfn = __pa(pmd) >> PAGE_SHIFT;
        pud_t entry;

        paravirt_alloc_pmd(mm, pfn);

        entry = __pud(_PAGE_TABLE | __pa(pmd));
        set_pud(pud, entry);
}

/*
 * Same as pud_populate(), except that the entry is written with
 * set_pud_safe() instead of set_pud().
 */
static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pfn = __pa(pmd) >> PAGE_SHIFT;
        pud_t entry;

        paravirt_alloc_pmd(mm, pfn);

        entry = __pud(_PAGE_TABLE | __pa(pmd));
        set_pud_safe(pud, entry);
}
#endif        /* CONFIG_X86_PAE */

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Notify the paravirt hook of the PUD table's pfn, then write a
 * _PAGE_TABLE entry for it into @p4d with set_p4d().
 */
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pfn = __pa(pud) >> PAGE_SHIFT;
        p4d_t entry;

        paravirt_alloc_pud(mm, pfn);

        entry = __p4d(_PAGE_TABLE | __pa(pud));
        set_p4d(p4d, entry);
}

/*
 * Same as p4d_populate(), except that the entry is written with
 * set_p4d_safe() instead of set_p4d().
 */
static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pfn = __pa(pud) >> PAGE_SHIFT;
        p4d_t entry;

        paravirt_alloc_pud(mm, pfn);

        entry = __p4d(_PAGE_TABLE | __pa(pud));
        set_p4d_safe(p4d, entry);
}

extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);

/*
 * Hand a PUD table over to ___pud_free_tlb().  @address is part of the
 * signature but is not used by this implementation.
 */
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
                                  unsigned long address)
{
        ___pud_free_tlb(tlb, pud);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Install the P4D table @p4d into @pgd.  Does nothing unless
 * pgtable_l5_enabled() reports true.
 */
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long pfn = __pa(p4d) >> PAGE_SHIFT;
                pgd_t entry;

                paravirt_alloc_p4d(mm, pfn);

                entry = __pgd(_PAGE_TABLE | __pa(p4d));
                set_pgd(pgd, entry);
        }
}

/*
 * Same as pgd_populate(), except that the entry is written with
 * set_pgd_safe() instead of set_pgd().
 */
static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long pfn = __pa(p4d) >> PAGE_SHIFT;
                pgd_t entry;

                paravirt_alloc_p4d(mm, pfn);

                entry = __pgd(_PAGE_TABLE | __pa(p4d));
                set_pgd_safe(pgd, entry);
        }
}

/*
 * Allocate one zeroed page to serve as a P4D table.  @addr is not used.
 * Returns whatever get_zeroed_page() returned, cast to p4d_t *.
 */
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp;

        /* For init_mm the __GFP_ACCOUNT bit is masked out of the flags. */
        if (mm == &init_mm)
                gfp = GFP_KERNEL_ACCOUNT & ~__GFP_ACCOUNT;
        else
                gfp = GFP_KERNEL_ACCOUNT;

        return (p4d_t *)get_zeroed_page(gfp);
}

/*
 * Free a P4D table page.  Does nothing unless pgtable_l5_enabled()
 * reports true; @mm is not used.
 */
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                /* The table must start exactly on a page boundary. */
                BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
                free_page((unsigned long)p4d);
        }
}

extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);

/*
 * Hand a P4D table over to ___p4d_free_tlb(), but only when
 * pgtable_l5_enabled() reports true.  @address is not used.
 */
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
                                  unsigned long address)
{
        if (!pgtable_l5_enabled())
                return;

        ___p4d_free_tlb(tlb, p4d);
}

#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#endif /* _ASM_X86_PGALLOC_H */




































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_CRASH_DUMP_H
#define LINUX_CRASH_DUMP_H

#include <linux/kexec.h>
#include <linux/proc_fs.h>
#include <linux/elf.h>
#include <linux/pgtable.h>
#include <uapi/linux/vmcore.h>

#include <linux/pgtable.h> /* for pgprot_t */

#ifdef CONFIG_CRASH_DUMP
/* elfcorehdr_addr value that makes is_kdump_kernel() report false. */
#define ELFCORE_ADDR_MAX        (-1ULL)
/* elfcorehdr_addr value stored by vmcore_unusable() to flag a bad vmcore. */
#define ELFCORE_ADDR_ERR        (-2ULL)

extern unsigned long long elfcorehdr_addr;
extern unsigned long long elfcorehdr_size;

extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size);
extern void elfcorehdr_free(unsigned long long addr);
extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos);
extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos);
extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
                                  unsigned long from, unsigned long pfn,
                                  unsigned long size, pgprot_t prot);

extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
                                                unsigned long, int);
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
                                          size_t csize, unsigned long offset,
                                          int userbuf);

void vmcore_cleanup(void);

/* Architecture code defines this if there are other possible ELF
 * machine types, e.g. on bi-arch capable hardware. */
#ifndef vmcore_elf_check_arch_cross
#define vmcore_elf_check_arch_cross(x) 0
#endif

/*
 * Architecture code can redefine this if there are any special checks
 * needed for 32-bit ELF or 64-bit ELF vmcores.  In case of 32-bit
 * only architecture, vmcore_elf64_check_arch can be set to zero.
 */
#ifndef vmcore_elf32_check_arch
#define vmcore_elf32_check_arch(x) elf_check_arch(x)
#endif

#ifndef vmcore_elf64_check_arch
#define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
#endif

/*
 * is_kdump_kernel() checks whether this kernel is booting after a panic of
 * previous kernel or not. This is determined by checking if previous kernel
 * has passed the elf core header address on command line.
 *
 * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will
 * return true if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic
 * of previous kernel.
 */

static inline bool is_kdump_kernel(void)
{
        return elfcorehdr_addr != ELFCORE_ADDR_MAX;
}

/* is_vmcore_usable() checks if the kernel is booting after a panic and
 * the vmcore region is usable.
 *
 * This makes use of the fact that due to alignment -2ULL is not
 * a valid pointer, much in the vein of IS_ERR(), except
 * dealing directly with an unsigned long long rather than a pointer.
 */

static inline int is_vmcore_usable(void)
{
        /* Not a kdump kernel: there is no vmcore to use. */
        if (!is_kdump_kernel())
                return 0;

        /* Usable (1) unless the address holds the error marker (0). */
        return elfcorehdr_addr != ELFCORE_ADDR_ERR;
}

/* vmcore_unusable() marks the vmcore as unusable,
 * without disturbing the logic of is_kdump_kernel()
 */

static inline void vmcore_unusable(void)
{
        /* Leave the "not a kdump kernel" value alone. */
        if (!is_kdump_kernel())
                return;

        elfcorehdr_addr = ELFCORE_ADDR_ERR;
}

#define HAVE_OLDMEM_PFN_IS_RAM 1
extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn));
extern void unregister_oldmem_pfn_is_ram(void);

#else /* !CONFIG_CRASH_DUMP */
static inline bool is_kdump_kernel(void) { return 0; }
#endif /* CONFIG_CRASH_DUMP */

/*
 * Device Dump information to be filled by drivers.
 *
 * A pointer to a filled-in instance is what vmcore_add_device_dump()
 * takes as its argument.
 */
struct vmcoredd_data {
        char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
        unsigned int size;                       /* Size of the dump */
        /*
         * Driver's registered callback to be invoked to collect dump.
         * It receives this descriptor and a buffer.  NOTE(review): the
         * buffer is presumably @size bytes and the int return an error
         * code - confirm against the vmcore implementation.
         */
        int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf);
};

#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
int vmcore_add_device_dump(struct vmcoredd_data *data);
#else
/* Stub for builds without CONFIG_PROC_VMCORE_DEVICE_DUMP: always fails with -EOPNOTSUPP. */
static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */

#ifdef CONFIG_PROC_VMCORE
ssize_t read_from_oldmem(char *buf, size_t count,
                         u64 *ppos, int userbuf,
                         bool encrypted);
#else
/* Stub for builds without CONFIG_PROC_VMCORE: ignores all arguments and returns -EOPNOTSUPP. */
static inline ssize_t read_from_oldmem(char *buf, size_t count,
                                       u64 *ppos, int userbuf,
                                       bool encrypted)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_PROC_VMCORE */

#endif /* LINUX_CRASH_DUMP_H */

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

/*
 * Divide by repeated subtraction.
 *
 * @divisor is subtracted from @dividend for as long as @dividend is not
 * smaller than it.  The number of subtractions (a u32) is returned and
 * what is left of @dividend is stored in *@remainder.  A @divisor of 0
 * never terminates.
 */
static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
        u32 quotient;

        for (quotient = 0; dividend >= divisor; quotient++) {
                /*
                 * The empty asm() keeps the compiler from turning this
                 * loop into a modulo operation.
                 */
                asm("" : "+rm"(dividend));

                dividend -= divisor;
        }

        *remainder = dividend;

        return quotient;
}

#endif /* __VDSO_MATH64_H */















    1 









    1 

    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// SPDX-License-Identifier: GPL-2.0
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include "internal.h"

/*
 * /proc/thread-self:
 */
/*
 * Build the symlink target "<tgid>/task/<pid>" for the current task, with
 * both ids taken in the pid namespace of the proc superblock.
 *
 * Returns the kmalloc'ed string (freed later through @done by kfree_link),
 * ERR_PTR(-ENOENT) when the current task has no pid in that namespace, or
 * ERR_PTR(-ENOMEM) / ERR_PTR(-ECHILD) when the allocation fails with a
 * non-NULL / NULL @dentry respectively.
 */
static const char *proc_thread_self_get_link(struct dentry *dentry,
                                             struct inode *inode,
                                             struct delayed_call *done)
{
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        pid_t tgid = task_tgid_nr_ns(current, ns);
        pid_t pid = task_pid_nr_ns(current, ns);
        char *link;

        if (!pid)
                return ERR_PTR(-ENOENT);

        /*
         * Room for two 10-digit numbers, "/task/" and the trailing NUL.
         * A NULL @dentry switches the allocation to GFP_ATOMIC.
         */
        link = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
        if (unlikely(!link))
                return ERR_PTR(dentry ? -ENOMEM : -ECHILD);

        sprintf(link, "%u/task/%u", tgid, pid);
        set_delayed_call(done, kfree_link, link);
        return link;
}

/* Inode operations of the thread-self symlink: only ->get_link is provided. */
static const struct inode_operations proc_thread_self_inode_operations = {
        .get_link        = proc_thread_self_get_link,
};

/*
 * Inode number given to the thread-self inode: filled in by
 * proc_thread_self_init() and read by proc_setup_thread_self().
 */
static unsigned thread_self_inum __ro_after_init;

/*
 * Create the "thread-self" symlink under the root of proc superblock @s
 * and remember its dentry in the superblock's proc_fs_info.
 *
 * Returns 0 on success or -ENOMEM when either the dentry or the inode
 * cannot be allocated (an error is logged in that case).
 */
int proc_setup_thread_self(struct super_block *s)
{
        struct inode *root_inode = d_inode(s->s_root);
        struct proc_fs_info *fs_info = proc_sb_info(s);
        struct dentry *thread_self;
        struct inode *inode;
        int ret = -ENOMEM;

        inode_lock(root_inode);

        thread_self = d_alloc_name(s->s_root, "thread-self");
        if (!thread_self)
                goto unlock;

        inode = new_inode(s);
        if (!inode) {
                /* Give back the dentry allocated just above. */
                dput(thread_self);
                goto unlock;
        }

        /* World-accessible symlink owned by global root. */
        inode->i_ino = thread_self_inum;
        inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
        inode->i_mode = S_IFLNK | S_IRWXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_op = &proc_thread_self_inode_operations;
        d_add(thread_self, inode);
        ret = 0;

unlock:
        inode_unlock(root_inode);

        if (ret) {
                pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
                return ret;
        }

        fs_info->proc_thread_self = thread_self;
        return 0;
}

/*
 * Init-time setup: have proc_alloc_inum() fill in thread_self_inum.
 * NOTE(review): if proc_alloc_inum() can report failure, that result is
 * not checked here - confirm this is intended.
 */
void __init proc_thread_self_init(void)
{
        proc_alloc_inum(&thread_self_inum);
}




















































































































    1 










































    1 































































    1 

































































    1 













    1 



























    1 






















    1 
    1 
    1 












    1 







    1 






















    1 

































































































































































































    1 




    1 









    1 


    1 

    1 
    1 






















    1 












































































    1 




    1 











    1 



    1 





























    1 



































































































































































































    1 

























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <linux/bit_spinlock.h>

#include <linux/rhashtable-types.h>
/*
 * Objects in an rhashtable have an embedded struct rhash_head
 * which is linked into a hash chain from the hash table - or one
 * of two or more hash tables when the rhashtable is being resized.
 * The end of the chain is marked with a special nulls mark which has
 * the least significant bit set but otherwise stores the address of
 * the hash bucket.  This allows us to be sure we've found the end
 * of the right list.
 * The value stored in the hash bucket has BIT(0) used as a lock bit.
 * This bit must be atomically set before any changes are made to
 * the chain.  To avoid dereferencing this pointer without clearing
 * the bit first, we use an opaque 'struct rhash_lock_head *' for the
 * pointer stored in the bucket.  This struct needs to be defined so
 * that rcu_dereference() works on it, but it has no content so a
 * cast is needed for it to be useful.  This ensures it isn't
 * used by mistake without clearing the lock bit first.
 */
struct rhash_lock_head {};

/* Maximum chain length before rehash
 *
 * The maximum (not average) chain length grows with the size of the hash
 * table, at a rate of (log N)/(log log N).
 *
 * The value of 16 is selected so that even if the hash table grew to
 * 2^32 you would not expect the maximum chain length to exceed it
 * unless we are under attack (or extremely unlucky).
 *
 * As this limit is only to detect attacks, we don't need to set it to a
 * lower value as you'd need the chain length to vastly exceed 16 to have
 * any real effect on the system.
 *
 * __rhashtable_insert_fast() counts down from this value while walking a
 * chain and falls back to the slow insert path once it reaches zero.
 */
#define RHT_ELASTICITY        16u

/**
 * struct bucket_table - Table of hash buckets
 * @size: Number of hash buckets
 * @nest: Number of bits of first-level nested table.
 * @hash_rnd: Random seed to fold into hash
 * @walkers: List of active walkers
 * @rcu: RCU structure for freeing the table
 * @future_tbl: Table under construction during rehashing
 * @dep_map: lockdep map acquired/released around the per-bucket bit locks
 * @buckets: size * hash buckets
 */
struct bucket_table {
        unsigned int                size;
        unsigned int                nest;
        u32                        hash_rnd;
        struct list_head        walkers;
        struct rcu_head                rcu;

        struct bucket_table __rcu *future_tbl;

        struct lockdep_map        dep_map;

        struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};

/*
 * NULLS_MARKER() expects a hash value with the low
 * bits mostly likely to be significant, and it discards
 * the msb.
 * We give it an address, in which the bottom bit is
 * always 0, and the msb might be significant.
 * So we shift the address down one bit to align with
 * expectations and avoid losing a significant bit.
 *
 * We never store the NULLS_MARKER in the hash table
 * itself as we need the lsb for locking.
 * Instead we store a NULL
 */
#define        RHT_NULLS_MARKER(ptr)        \
        ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr)        \
        ((ptr) = NULL)

/* True when @ptr has its least significant bit set, i.e. is a nulls marker. */
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
        const unsigned long addr = (unsigned long)ptr;

        return (addr & 1UL) != 0;
}

/*
 * Map an embedded hash head back to the start of its containing object by
 * subtracting the table's configured head_offset.
 */
static inline void *rht_obj(const struct rhashtable *ht,
                            const struct rhash_head *he)
{
        char *head_addr = (char *)he;

        return head_addr - ht->p.head_offset;
}

/* Reduce a full hash value to a bucket index by masking with (size - 1). */
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
{
        const unsigned int mask = tbl->size - 1;

        return hash & mask;
}

/*
 * Hash @key with seed @hash_rnd.
 *
 * When params.key_len is not a compile-time constant the table's own hash
 * function and key length are used.  Otherwise the hash function is chosen
 * from @params: an explicit params.hashfn if set, else jhash2() for non-zero
 * key lengths that are a multiple of sizeof(u32), else jhash().
 */
static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
        const void *key, const struct rhashtable_params params,
        unsigned int hash_rnd)
{
        unsigned int len;

        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
                return ht->p.hashfn(key, ht->key_len, hash_rnd);

        if (!params.key_len) {
                /* No constant length supplied: use the table's length. */
                len = ht->p.key_len;

                if (params.hashfn)
                        return params.hashfn(key, len, hash_rnd);
                return jhash(key, len, hash_rnd);
        }

        len = params.key_len;

        if (params.hashfn)
                return params.hashfn(key, len, hash_rnd);
        if (len & (sizeof(u32) - 1))
                return jhash(key, len, hash_rnd);

        /* Length is a whole number of u32 words. */
        return jhash2(key, len / sizeof(u32), hash_rnd);
}

/* Hash @key with @tbl's seed and reduce the result to a bucket index in @tbl. */
static inline unsigned int rht_key_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const void *key, const struct rhashtable_params params)
{
        return rht_bucket_index(tbl,
                                rht_key_get_hash(ht, key, params,
                                                 tbl->hash_rnd));
}

/*
 * Compute the bucket index in @tbl for the object embedding @he.
 *
 * Uses params.obj_hashfn on the whole object when it is set; otherwise
 * hashes the key found at params.key_offset inside the object.
 */
static inline unsigned int rht_head_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const struct rhash_head *he, const struct rhashtable_params params)
{
        const char *object = rht_obj(ht, he);
        unsigned int len;

        if (unlikely(!params.obj_hashfn))
                return rht_key_hashfn(ht, tbl, object + params.key_offset,
                                      params);

        /* Fall back to the table's key length when params gives none. */
        len = params.key_len ?: ht->p.key_len;

        return rht_bucket_index(tbl,
                                params.obj_hashfn(object, len, tbl->hash_rnd));
}

/**
 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 * @ht:                hash table
 * @tbl:        current table
 */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
{
        /* Expand table when exceeding 75% load */
        return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) &&
               (!ht->p.max_size || tbl->size < ht->p.max_size);
}

/**
 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 * @ht:                hash table
 * @tbl:        current table
 */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
{
        /* Shrink table beneath 30% load */
        return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) &&
               tbl->size > ht->p.min_size;
}

/**
 * rht_grow_above_100 - returns true if nelems > table-size
 * @ht:                hash table
 * @tbl:        current table
 */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        return atomic_read(&ht->nelems) > tbl->size &&
                (!ht->p.max_size || tbl->size < ht->p.max_size);
}

/**
 * rht_grow_above_max - returns true if table is above maximum
 * @ht:                hash table
 * @tbl:        current table (not consulted by this check)
 *
 * Compares the element count against the table-wide max_elems limit.
 */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        const unsigned int limit = ht->max_elems;

        return atomic_read(&ht->nelems) >= limit;
}

/*
 * Lock-state predicates used as the condition argument of the
 * rht_dereference*() macros.  With CONFIG_PROVE_LOCKING they are real
 * out-of-line checks; without it they are stubs that always return 1 so
 * the condition is trivially satisfied.
 */
#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        return 1;
}

static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
                                             u32 hash)
{
        return 1;
}
#endif /* CONFIG_PROVE_LOCKING */

/*
 * Out-of-line insert path (defined outside this header).  The inline fast
 * path in __rhashtable_insert_fast() falls back to it and returns its result
 * unchanged.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj);

/*
 * Table walker API (defined outside this header).
 * rhashtable_walk_start_check() is annotated as acquiring RCU.
 */
void rhashtable_walk_enter(struct rhashtable *ht,
                           struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);

/*
 * Convenience wrapper around rhashtable_walk_start_check() for callers that
 * do not care about its return value, which is deliberately discarded.
 */
static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
        (void)rhashtable_walk_start_check(iter);
}

/*
 * Remaining walker entry points (defined outside this header).
 * rhashtable_walk_stop() is annotated as releasing RCU.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter);
void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);

/* Table teardown (defined outside this header). */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg);
void rhashtable_destroy(struct rhashtable *ht);

/*
 * Bucket lookup helpers for nested tables (tbl->nest != 0); used by
 * rht_bucket(), rht_bucket_var() and rht_bucket_insert() respectively.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);

/* Dereference @p where the update side is excluded by the table mutex. */
#define rht_dereference(p, ht) \
        rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))

/* Dereference @p under RCU, or with the table mutex held. */
#define rht_dereference_rcu(p, ht) \
        rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))

/* Dereference @p where the bucket for @hash in @tbl is held. */
#define rht_dereference_bucket(p, tbl, hash) \
        rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))

/* Dereference @p under RCU, or with the bucket for @hash in @tbl held. */
#define rht_dereference_bucket_rcu(p, tbl, hash) \
        rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash))

/*
 * Set @tpos to the object containing @pos (via @member) and evaluate to 1,
 * so it can be chained with && inside a for-loop condition.
 */
#define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })

/*
 * Return a read-only pointer to the bucket for @hash in @tbl, going through
 * rht_bucket_nested() when the table is nested.
 */
static inline struct rhash_lock_head __rcu *const *rht_bucket(
        const struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return a writable pointer to the bucket for @hash in @tbl, going through
 * __rht_bucket_nested() when the table is nested.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_var(
        struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return __rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return a writable pointer to the bucket for @hash in @tbl for insertion,
 * going through rht_bucket_nested_insert() when the table is nested.
 * Callers check the result for NULL before use.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested_insert(ht, tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * We lock a bucket by setting BIT(0) in the pointer - this is always
 * zero in real pointers.  The NULLS mark is never stored in the bucket,
 * rather we store NULL if the bucket is empty.
 * bit_spin_locks do not handle contention well, but the whole point
 * of the hashtable design is to achieve minimum per-bucket contention.
 * A nested hash table might not have a bucket pointer.  In that case
 * we cannot get a lock.  For remove and replace the bucket cannot be
 * interesting and doesn't need locking.
 * For insert we allocate the bucket if this is the last bucket_table,
 * and then take the lock.
 * Sometimes we unlock a bucket by writing a new pointer there.  In that
 * case we don't need to unlock, but we do need to reset state such as
 * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
 * provides the same release semantics that bit_spin_unlock() provides,
 * this is safe.
 * When we write to a bucket without unlocking, we use rht_assign_locked().
 */

/*
 * Lock bucket @bkt of @tbl: disable bottom halves, take the bit spinlock on
 * bit 0 of the bucket pointer, then record the acquisition in the table's
 * lockdep map.  Paired with rht_unlock() or rht_assign_unlock().
 */
static inline void rht_lock(struct bucket_table *tbl,
                            struct rhash_lock_head __rcu **bkt)
{
        local_bh_disable();
        bit_spin_lock(0, (unsigned long *)bkt);
        lock_map_acquire(&tbl->dep_map);
}

/*
 * Same as rht_lock(), but records the acquisition with lockdep under the
 * given @subclass instead of the default one.
 */
static inline void rht_lock_nested(struct bucket_table *tbl,
                                   struct rhash_lock_head __rcu **bucket,
                                   unsigned int subclass)
{
        local_bh_disable();
        bit_spin_lock(0, (unsigned long *)bucket);
        lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
}

/*
 * Unlock bucket @bkt of @tbl, undoing rht_lock() in reverse order: lockdep
 * release, bit spinlock release, then re-enable bottom halves.
 */
static inline void rht_unlock(struct bucket_table *tbl,
                              struct rhash_lock_head __rcu **bkt)
{
        lock_map_release(&tbl->dep_map);
        bit_spin_unlock(0, (unsigned long *)bkt);
        local_bh_enable();
}

/*
 * Convert the raw bucket value @p into a chain head: strip the lock bit and,
 * if nothing remains (empty bucket), substitute the nulls marker for @bkt.
 */
static inline struct rhash_head *__rht_ptr(
        struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
{
        unsigned long addr = (unsigned long)p & ~BIT(0);

        if (!addr)
                addr = (unsigned long)RHT_NULLS_MARKER(bkt);

        return (struct rhash_head *)addr;
}

/*
 * Where 'bkt' is a bucket and might be locked:
 *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
 *   rht_ptr() dereferences in a context where the bucket is locked.
 *   rht_ptr_exclusive() dereferences in a context where exclusive
 *            access is guaranteed, such as when destroying the table.
 */
/* Read the head of bucket @bkt under RCU, with the lock bit cleared. */
static inline struct rhash_head *rht_ptr_rcu(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference(*bkt);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the head of bucket @bkt in a context where the bucket for @hash in
 * @tbl is locked, with the lock bit cleared.
 */
static inline struct rhash_head *rht_ptr(
        struct rhash_lock_head __rcu *const *bkt,
        struct bucket_table *tbl,
        unsigned int hash)
{
        struct rhash_lock_head *raw = rht_dereference_bucket(*bkt, tbl, hash);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the head of bucket @bkt where the caller guarantees exclusive access,
 * with the lock bit cleared.
 */
static inline struct rhash_head *rht_ptr_exclusive(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference_protected(*bkt, 1);

        return __rht_ptr(raw, bkt);
}

/*
 * Publish @obj as the new head of bucket @bkt while keeping the bucket
 * locked (BIT(0) stays set).  A nulls marker is stored as NULL.
 */
static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        unsigned long val = 0;

        if (!rht_is_a_nulls(obj))
                val = (unsigned long)obj;

        rcu_assign_pointer(*bkt, (void *)(val | BIT(0)));
}

/*
 * Publish @obj as the new head of bucket @bkt and unlock the bucket in one
 * store: the value written has BIT(0) clear, so the assignment itself drops
 * the bit lock.  A nulls marker is stored as NULL.
 *
 * The remaining calls reset the state taken by rht_lock(): lockdep release,
 * preemption, the sparse lock annotation and bottom halves.
 * NOTE(review): preempt_enable()/__release(bitlock) presumably mirror what
 * bit_spin_unlock() does after clearing the bit - confirm against
 * <linux/bit_spinlock.h>.
 */
static inline void rht_assign_unlock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        if (rht_is_a_nulls(obj))
                obj = NULL;
        lock_map_release(&tbl->dep_map);
        rcu_assign_pointer(*bkt, (void *)obj);
        preempt_enable();
        __release(bitlock);
        local_bh_enable();
}

/**
 * rht_for_each_from - iterate over hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Stops when the cursor becomes a nulls marker.  Each step uses
 * rht_dereference_bucket(), so the bucket is expected to be held.
 */
#define rht_for_each_from(pos, head, tbl, hash) \
        for (pos = head;                        \
             !rht_is_a_nulls(pos);                \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each - iterate over hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Starts from the head of the bucket selected by @hash, read via rht_ptr().
 */
#define rht_for_each(pos, tbl, hash) \
        rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
                          tbl, hash)

/**
 * rht_for_each_entry_from - iterate over hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * @tpos is set to the containing object before each iteration of the body.
 * Each step uses rht_dereference_bucket(), so the bucket is expected to be
 * held.
 */
#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)        \
        for (pos = head;                                                \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each_entry - iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * Starts from the head of the bucket selected by @hash, read via rht_ptr().
 */
#define rht_for_each_entry(tpos, pos, tbl, hash, member)                \
        rht_for_each_entry_from(tpos, pos,                                \
                                rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
                                tbl, hash, member)

/**
 * rht_for_each_entry_safe - safely iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @nxt:        the &struct rhash_head to use as next in loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive allows for the looped code to
 * remove the loop cursor from the list: the successor is fetched into @nxt
 * before the loop body runs.
 *
 * The third macro parameter is deliberately not called "next": a parameter
 * of that name would also be substituted into the "->next" member access,
 * making the macro usable only with a cursor variable literally named "next".
 */
#define rht_for_each_entry_safe(tpos, pos, nxt, tbl, hash, member)              \
        for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                      \
             nxt = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket((pos)->next, tbl, hash) : NULL; \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);              \
             pos = nxt,                                                              \
             nxt = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket((pos)->next, tbl, hash) : NULL)

/**
 * rht_for_each_rcu_from - iterate over rcu hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * @tbl and @hash are accepted for symmetry with the other iterators but are
 * not used by the expansion.  Macro arguments are parenthesized so that any
 * lvalue expression can be passed as @pos.
 */
#define rht_for_each_rcu_from(pos, head, tbl, hash)                        \
        for (({barrier(); }),                                                \
             pos = (head);                                                \
             !rht_is_a_nulls(pos);                                        \
             pos = rcu_dereference_raw((pos)->next))

/**
 * rht_for_each_rcu - iterate over rcu hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of the bucket selected by @hash, read via
 * rht_ptr_rcu().  @pos is parenthesized in the member access so that any
 * lvalue expression can be passed.
 */
#define rht_for_each_rcu(pos, tbl, hash)                        \
        for (({barrier(); }),                                        \
             pos = rht_ptr_rcu(rht_bucket(tbl, hash));                \
             !rht_is_a_nulls(pos);                                \
             pos = rcu_dereference_raw((pos)->next))

/**
 * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * @tpos is set to the containing object before each iteration of the body.
 * Macro arguments are parenthesized so that any lvalue expression can be
 * passed as @pos.
 */
#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                                    \
             pos = (head);                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);            \
             pos = rht_dereference_bucket_rcu((pos)->next, tbl, hash))

/**
 * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of the bucket selected by @hash, read via
 * rht_ptr_rcu().
 */
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)                   \
        rht_for_each_entry_rcu_from(tpos, pos,                                   \
                                    rht_ptr_rcu(rht_bucket(tbl, hash)),           \
                                    tbl, hash, member)

/**
 * rhl_for_each_rcu - iterate over rcu hash table list
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list is NULL-terminated.
 *
 * Macro arguments are parenthesized so that any lvalue expression can be
 * passed as @pos.
 */
#define rhl_for_each_rcu(pos, list)                                        \
        for (pos = (list); pos; pos = rcu_dereference_raw((pos)->next))

/**
 * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 * @member:        name of the &struct rhlist_head within the hashable struct.
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list is NULL-terminated and
 * @tpos is set to the containing object before each iteration of the body.
 *
 * Macro arguments are parenthesized so that any lvalue expression can be
 * passed as @pos.
 */
#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
        for (pos = (list); pos && rht_entry(tpos, pos, member);                \
             pos = rcu_dereference_raw((pos)->next))

static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
{
        struct rhashtable *ht = arg->ht;
        const char *ptr = obj;

        return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}

/* Internal function, do not use.
 *
 * Looks up @key and returns the matching &struct rhash_head, or NULL if no
 * entry matches in the current table or in any table reachable through
 * ->future_tbl.  Entries are compared with params.obj_cmpfn when set,
 * otherwise with rhashtable_compare(); both return zero on a match.
 */
static inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu *const *bkt;
        struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;

        tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
        /* The bucket index depends on the table's size and seed. */
        hash = rht_key_hashfn(ht, tbl, key, params);
        bkt = rht_bucket(tbl, hash);
        do {
                rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
                        /* Non-zero compare result means "different key". */
                        if (params.obj_cmpfn ?
                            params.obj_cmpfn(&arg, rht_obj(ht, he)) :
                            rhashtable_compare(&arg, rht_obj(ht, he)))
                                continue;
                        return he;
                }
                /* An object might have been moved to a different hash chain,
                 * while we walk along it - better check and retry.
                 * The walk ended on the right chain only if the terminating
                 * nulls marker is the one belonging to this bucket.
                 */
        } while (he != RHT_NULLS_MARKER(bkt));

        /* Ensure we see any new tables. */
        smp_rmb();

        /* Not found here: continue in the table being built, if any. */
        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;

        return NULL;
}

/**
 * rhashtable_lookup - search hash table
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key. The first matching entry is returned.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the object containing the first matching entry, or NULL if there
 * is none.
 */
static inline void *rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(ht, key, params);

        if (!match)
                return NULL;

        return rht_obj(ht, match);
}

/**
 * rhashtable_lookup_fast - search hash table, without RCU read lock
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Wraps rhashtable_lookup() in its own rcu_read_lock()/rcu_read_unlock()
 * pair, so the caller does not need to hold the RCU read lock.
 *
 * Only use this function when you have other mechanisms guaranteeing
 * that the object won't go away after the RCU read lock is released.
 *
 * Returns the object containing the first matching entry, or NULL if there
 * is none.
 */
static inline void *rhashtable_lookup_fast(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        void *found;

        rcu_read_lock();
        found = rhashtable_lookup(ht, key, params);
        rcu_read_unlock();

        return found;
}

/**
 * rhltable_lookup - search hash list table
 * @hlt:        hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key.  All matching entries are returned
 * in a list.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the list of entries that match the given key, or NULL if there
 * is none.
 */
static inline struct rhlist_head *rhltable_lookup(
        struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(&hlt->ht, key, params);

        if (!match)
                return NULL;

        return container_of(match, struct rhlist_head, rhead);
}

/* Internal function, please use rhashtable_insert_fast() instead.
 *
 * Tries to insert @obj into the bucket it hashes to in the current table.
 * @key may be NULL, in which case no duplicate check is performed; @rhlist
 * selects hash-list semantics, where an object with a matching key is
 * chained in front of the existing one instead of being rejected.
 *
 * Returns:
 *   NULL               - @obj was inserted;
 *   existing object    - !@rhlist and an entry with the same key exists;
 *   ERR_PTR(-ENOMEM)   - no bucket could be obtained;
 *   ERR_PTR(-E2BIG)    - the table already holds max_elems elements;
 *   otherwise whatever rhashtable_insert_slow() returns, when a rehash is in
 *   progress, the chain is too long, or the table is more than 100% full.
 */
static inline void *__rhashtable_insert_fast(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct bucket_table *tbl;
        struct rhash_head *head;
        unsigned int hash;
        int elasticity;
        void *data;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);
        hash = rht_head_hashfn(ht, tbl, obj, params);
        elasticity = RHT_ELASTICITY;
        bkt = rht_bucket_insert(ht, tbl, hash);
        data = ERR_PTR(-ENOMEM);
        if (!bkt)
                goto out;
        pprev = NULL;
        rht_lock(tbl, bkt);

        /* A rehash is in progress: let the slow path handle it. */
        if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
                rht_unlock(tbl, bkt);
                rcu_read_unlock();
                return rhashtable_insert_slow(ht, key, obj);
        }

        /* Walk the chain, counting its length and looking for a duplicate. */
        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *plist;
                struct rhlist_head *list;

                elasticity--;
                /* No key, or key differs: remember the link and move on. */
                if (!key ||
                    (params.obj_cmpfn ?
                     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                /* Found an entry with the same key. */
                data = rht_obj(ht, head);

                if (!rhlist)
                        goto out_unlock;


                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                /* @obj replaces @head in the chain and points to it as the
                 * next element of the per-key list.
                 */
                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt);
                } else
                        rht_assign_unlock(tbl, bkt, obj);
                data = NULL;
                goto out;
        }

        /* Chain reached RHT_ELASTICITY entries. */
        if (elasticity <= 0)
                goto slow_path;

        data = ERR_PTR(-E2BIG);
        if (unlikely(rht_grow_above_max(ht, tbl)))
                goto out_unlock;

        if (unlikely(rht_grow_above_100(ht, tbl)))
                goto slow_path;

        /* Inserting at head of list makes unlocking free. */
        head = rht_ptr(bkt, tbl, hash);

        RCU_INIT_POINTER(obj->next, head);
        if (rhlist) {
                struct rhlist_head *list;

                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        atomic_inc(&ht->nelems);
        rht_assign_unlock(tbl, bkt, obj);

        /* Past 75% load: schedule the deferred resize worker. */
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        data = NULL;
out:
        rcu_read_unlock();

        return data;

out_unlock:
        rht_unlock(tbl, bkt);
        goto out;
}

/**
 * rhashtable_insert_fast - insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, a negative errno if the insertion failed, or
 * -EEXIST if the underlying insert handed back an existing object.
 */
static inline int rhashtable_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *res = __rhashtable_insert_fast(ht, NULL, obj, params, false);

        if (!res)
                return 0;
        if (IS_ERR(res))
                return PTR_ERR(res);

        return -EEXIST;
}

/**
 * rhltable_insert_key - insert object into hash list table
 * @hlt:        hash list table
 * @key:        the pointer to the key
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns the PTR_ERR() of the underlying insert result: zero when it
 * returned NULL, otherwise the encoded error.
 */
static inline int rhltable_insert_key(
        struct rhltable *hlt, const void *key, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        void *res;

        res = __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params,
                                       true);

        return PTR_ERR(res);
}

/**
 * rhltable_insert - insert object into hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Like rhltable_insert_key(), with the key taken from the object itself at
 * params.key_offset.
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 */
static inline int rhltable_insert(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        const char *object = rht_obj(&hlt->ht, &list->rhead);

        return rhltable_insert_key(hlt, object + params.key_offset, list,
                                   params);
}

/**
 * rhashtable_lookup_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * This lookup function may only be used for fixed key hash table (key_len
 * parameter set). It will BUG() if used inappropriately.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 70%.
 */
static inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *obj_base = rht_obj(ht, obj);
        void *found;

        /* Only valid when the table has no obj_hashfn. */
        BUG_ON(ht->p.obj_hashfn);

        found = __rhashtable_insert_fast(ht, obj_base + ht->p.key_offset, obj,
                                         params, false);
        if (IS_ERR(found))
                return PTR_ERR(found);

        /* A non-NULL, non-error result is reported as -EEXIST. */
        if (found)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_fast(), but this function returns the
 * object if it exists, NULL if it did not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *key;

        key = rht_obj(ht, obj);

        /* Only valid when the table has no obj_hashfn. */
        BUG_ON(ht->p.obj_hashfn);

        /* The key is embedded in the object at the table's key_offset. */
        key += ht->p.key_offset;

        return __rhashtable_insert_fast(ht, key, obj, params, false);
}

/**
 * rhashtable_lookup_insert_key - search and insert object to hash table
 *                                  with explicit key
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 70%.
 *
 * Returns zero on success.
 */
static inline int rhashtable_lookup_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *found;

        /* Requires both an obj_hashfn on the table and a non-NULL key. */
        BUG_ON(!(ht->p.obj_hashfn && key));

        found = __rhashtable_insert_fast(ht, key, obj, params, false);
        if (IS_ERR(found))
                return PTR_ERR(found);

        /* A non-NULL, non-error result is reported as -EEXIST. */
        if (found)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_key(), but this function returns the
 * object if it exists, NULL if it does not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *res;

        /* Requires both an obj_hashfn on the table and a non-NULL key. */
        BUG_ON(!(ht->p.obj_hashfn && key));

        /* Hand the raw result (object, NULL or ERR_PTR) back to the caller. */
        res = __rhashtable_insert_fast(ht, key, obj, params, false);

        return res;
}

/*
 * Internal function, please use rhashtable_remove_fast() instead.
 *
 * Tries to unlink @obj from the bucket of @tbl that @obj hashes to.  The
 * bucket is locked with rht_lock() while its chain is walked and modified.
 * With @rhlist set, every chain entry is treated as the head of a list of
 * rhlist_head elements, and @obj may be either such a head or a later
 * member of one of those lists.
 *
 * Returns 0 if @obj was found and removed, -ENOENT if the bucket does not
 * exist or @obj was not found in it.
 */
static inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj, const struct rhashtable_params params,
        bool rhlist)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned int hash;
        int err = -ENOENT;

        hash = rht_head_hashfn(ht, tbl, obj, params);
        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;
        /* NULL pprev means "the link to rewrite is the bucket head itself". */
        pprev = NULL;
        rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;

                list = container_of(he, struct rhlist_head, rhead);

                if (he != obj) {
                        struct rhlist_head __rcu **lpprev;

                        /* Remember this entry's link in case the next one matches. */
                        pprev = &he->next;

                        if (!rhlist)
                                continue;

                        /* Search the list hanging off this chain entry for obj. */
                        do {
                                lpprev = &list->next;
                                list = rht_dereference_bucket(list->next,
                                                              tbl, hash);
                        } while (list && obj != &list->rhead);

                        if (!list)
                                continue;

                        /*
                         * obj is a non-head list member: splice it out of the
                         * list.  err = 0 here, so the nelems accounting below
                         * is skipped.
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        RCU_INIT_POINTER(*lpprev, list);
                        err = 0;
                        break;
                }

                /*
                 * obj is a chain entry.  By default its successor replaces it
                 * and err = 1 requests the nelems decrement below.
                 */
                obj = rht_dereference_bucket(obj->next, tbl, hash);
                err = 1;

                if (rhlist) {
                        /*
                         * If obj heads a list with further members, promote
                         * the next member into the chain instead; err = 0
                         * then skips the nelems decrement.
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        if (list) {
                                RCU_INIT_POINTER(list->rhead.next, obj);
                                obj = &list->rhead;
                                err = 0;
                        }
                }

                /*
                 * Rewrite the predecessor's link, or, when obj was the first
                 * entry, store the replacement through rht_assign_unlock()
                 * instead of a separate rht_unlock().
                 */
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt);
                } else {
                        rht_assign_unlock(tbl, bkt, obj);
                }
                goto unlocked;
        }

        rht_unlock(tbl, bkt);
unlocked:
        /* err > 0: a chain entry went away; account for it and maybe shrink. */
        if (err > 0) {
                atomic_dec(&ht->nelems);
                if (unlikely(ht->p.automatic_shrinking &&
                             rht_shrink_below_30(ht, tbl)))
                        schedule_work(&ht->run_work);
                err = 0;
        }

        return err;
}

/* Internal function, please use rhashtable_remove_fast() instead */
static inline int __rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct bucket_table *cur;
        int err;

        rcu_read_lock();

        cur = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        for (;;) {
                err = __rhashtable_remove_fast_one(ht, cur, obj, params,
                                                   rhlist);
                if (!err)
                        break;

                /* Not removed from this table: retry in its successor, if any. */
                cur = rht_dereference_rcu(cur->future_tbl, ht);
                if (!cur)
                        break;
        }

        rcu_read_unlock();

        return err;
}

/**
 * rhashtable_remove_fast - remove object from hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slow if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        int err;

        /* Plain (non-rhlist) removal. */
        err = __rhashtable_remove_fast(ht, obj, params, false);

        return err;
}

/**
 * rhltable_remove - remove object from hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slow if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhltable_remove(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        int err;

        /* rhlist-mode removal on the table embedded in @hlt. */
        err = __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);

        return err;
}

/*
 * Internal function, please use rhashtable_replace_fast() instead.
 *
 * Looks for @obj_old in the bucket of @tbl it hashes to and, if found,
 * links @obj_new into the chain in its place.  The bucket is locked with
 * rht_lock() while the chain is walked and modified.
 *
 * Returns 0 on success, -EINVAL if the two objects hash differently in
 * @tbl, -ENOENT if the bucket does not exist or @obj_old is not in it.
 */
static inline int __rhashtable_replace_fast(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj_old, struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned int hash;
        int err = -ENOENT;

        /* Minimally, the old and new objects must have same hash
         * (which should mean identifiers are the same).
         */
        hash = rht_head_hashfn(ht, tbl, obj_old, params);
        if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
                return -EINVAL;

        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;

        /* NULL pprev means "the link to rewrite is the bucket head itself". */
        pprev = NULL;
        rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                if (he != obj_old) {
                        pprev = &he->next;
                        continue;
                }

                /* Point the new object at the old one's successor first ... */
                rcu_assign_pointer(obj_new->next, obj_old->next);
                /*
                 * ... then swing the predecessor's link (or the bucket head,
                 * via rht_assign_unlock()) over to the new object.
                 */
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj_new);
                        rht_unlock(tbl, bkt);
                } else {
                        rht_assign_unlock(tbl, bkt, obj_new);
                }
                err = 0;
                goto unlocked;
        }

        /* obj_old was not found in this bucket; err is still -ENOENT. */
        rht_unlock(tbl, bkt);

unlocked:
        return err;
}

/**
 * rhashtable_replace_fast - replace an object in hash table
 * @ht:                hash table
 * @obj_old:        pointer to hash head inside object being replaced
 * @obj_new:        pointer to hash head inside object which is new
 * @params:        hash table parameters
 *
 * Replacing an object doesn't affect the number of elements in the hash table
 * or bucket, so we don't need to worry about shrinking or expanding the
 * table here.
 *
 * Returns zero on success, -ENOENT if the entry could not be found,
 * -EINVAL if hash is not the same for the old and new objects.
 */
static inline int rhashtable_replace_fast(
        struct rhashtable *ht, struct rhash_head *obj_old,
        struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct bucket_table *cur;
        int err;

        rcu_read_lock();

        cur = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        for (;;) {
                err = __rhashtable_replace_fast(ht, cur, obj_old, obj_new,
                                                params);
                if (!err)
                        break;

                /* Any non-zero result: retry in the successor table, if any. */
                cur = rht_dereference_rcu(cur->future_tbl, ht);
                if (!cur)
                        break;
        }

        rcu_read_unlock();

        return err;
}

/**
 * rhltable_walk_enter - Initialise an iterator
 * @hlt:        Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
static inline void rhltable_walk_enter(struct rhltable *hlt,
                                       struct rhashtable_iter *iter)
{
        /*
         * Start the walk on the rhashtable embedded in @hlt.  This is a
         * plain call, not "return <expr>;": ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        rhashtable_walk_enter(&hlt->ht, iter);
}

/**
 * rhltable_free_and_destroy - free elements and destroy hash list table
 * @hlt:        the hash list table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * See documentation for rhashtable_free_and_destroy.
 */
static inline void rhltable_free_and_destroy(struct rhltable *hlt,
                                             void (*free_fn)(void *ptr,
                                                             void *arg),
                                             void *arg)
{
        /*
         * Forward to the rhashtable embedded in @hlt.  This is a plain
         * call, not "return <expr>;": ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
}

/* Destroy @hlt without a per-element free callback (free_fn and arg NULL). */
static inline void rhltable_destroy(struct rhltable *hlt)
{
        /*
         * Plain call, not "return <expr>;": ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        rhltable_free_and_destroy(hlt, NULL, NULL);
}

#endif /* _LINUX_RHASHTABLE_H */












































































    1 


    1 
    1 

    1 


    1 





    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// SPDX-License-Identifier: GPL-2.0
/*
 * Out-of-line refcount functions.
 */

#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/bug.h>

#define REFCOUNT_WARN(str)        WARN_ONCE(1, "refcount_t: " str ".\n")

/*
 * Set @r to REFCOUNT_SATURATED and emit the warning that corresponds to
 * the saturation event @t.
 */
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t)
{
        /* The counter is pinned before anything is reported. */
        refcount_set(r, REFCOUNT_SATURATED);

        switch (t) {
        case REFCOUNT_ADD_NOT_ZERO_OVF:
                REFCOUNT_WARN("saturated; leaking memory");
                break;
        case REFCOUNT_ADD_OVF:
                REFCOUNT_WARN("saturated; leaking memory");
                break;
        case REFCOUNT_ADD_UAF:
                REFCOUNT_WARN("addition on 0; use-after-free");
                break;
        case REFCOUNT_SUB_UAF:
                REFCOUNT_WARN("underflow; use-after-free");
                break;
        case REFCOUNT_DEC_LEAK:
                REFCOUNT_WARN("decrement hit 0; leaking memory");
                break;
        default:
                /* Any event type not listed above. */
                REFCOUNT_WARN("unknown saturation event!?");
        }
}
EXPORT_SYMBOL(refcount_warn_saturate);

/**
 * refcount_dec_if_one - decrement a refcount if it is 1
 * @r: the refcount
 *
 * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
 * success thereof.
 *
 * Like all decrement operations, it provides release memory order and provides
 * a control dependency.
 *
 * It can be used like a try-delete operator; this explicit case is provided
 * and not cmpxchg in generic, because that would allow implementing unsafe
 * operations.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
bool refcount_dec_if_one(refcount_t *r)
{
        /* Value the counter must currently hold for the exchange to happen. */
        int expected = 1;
        bool swapped;

        swapped = atomic_try_cmpxchg_release(&r->refs, &expected, 0);

        return swapped;
}
EXPORT_SYMBOL(refcount_dec_if_one);

/**
 * refcount_dec_not_one - decrement a refcount if it is not 1
 * @r: the refcount
 *
 * No atomic_t counterpart, it decrements unless the value is 1, in which case
 * it will return false.
 *
 * Was often done like: atomic_add_unless(&var, -1, 1)
 *
 * Return: true if the decrement operation was successful, false otherwise
 */
bool refcount_dec_not_one(refcount_t *r)
{
        unsigned int new, val = atomic_read(&r->refs);

        for (;;) {
                /* A saturated counter is left untouched and reported as success. */
                if (unlikely(val == REFCOUNT_SATURATED))
                        return true;

                /* Never perform the 1 -> 0 transition here. */
                if (val == 1)
                        return false;

                new = val - 1;
                /* Unsigned wrap-around: val was 0. */
                if (new > val) {
                        WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n");
                        return true;
                }

                /* On failure val holds the value just observed; try again. */
                if (atomic_try_cmpxchg_release(&r->refs, &val, new))
                        break;
        }

        return true;
}
EXPORT_SYMBOL(refcount_dec_not_one);

/**
 * refcount_dec_and_mutex_lock - return holding mutex if able to decrement
 *                               refcount to 0
 * @r: the refcount
 * @lock: the mutex to be locked
 *
 * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
 * to decrement when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides a control dependency such that free() must come after.
 * See the comment on top.
 *
 * Return: true and hold mutex if able to decrement refcount to 0, false
 *         otherwise
 */
bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
{
        /* Fast path: decremented without reaching the 1 -> 0 transition. */
        if (refcount_dec_not_one(r))
                return false;

        /* Slow path: do the final decrement with the mutex held. */
        mutex_lock(lock);
        if (refcount_dec_and_test(r))
                return true;

        /* The count did not drop to zero after all; release the mutex. */
        mutex_unlock(lock);
        return false;
}
EXPORT_SYMBOL(refcount_dec_and_mutex_lock);

/**
 * refcount_dec_and_lock - return holding spinlock if able to decrement
 *                         refcount to 0
 * @r: the refcount
 * @lock: the spinlock to be locked
 *
 * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
 * decrement when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides a control dependency such that free() must come after.
 * See the comment on top.
 *
 * Return: true and hold spinlock if able to decrement refcount to 0, false
 *         otherwise
 */
bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
{
        /* Fast path: decremented without reaching the 1 -> 0 transition. */
        if (refcount_dec_not_one(r))
                return false;

        /* Slow path: do the final decrement with the spinlock held. */
        spin_lock(lock);
        if (refcount_dec_and_test(r))
                return true;

        /* The count did not drop to zero after all; release the lock. */
        spin_unlock(lock);
        return false;
}
EXPORT_SYMBOL(refcount_dec_and_lock);

/**
 * refcount_dec_and_lock_irqsave - return holding spinlock with disabled
 *                                 interrupts if able to decrement refcount to 0
 * @r: the refcount
 * @lock: the spinlock to be locked
 * @flags: saved IRQ-flags if the lock is acquired
 *
 * Same as refcount_dec_and_lock() above except that the spinlock is acquired
 * with disabled interrupts.
 *
 * Return: true and hold spinlock if able to decrement refcount to 0, false
 *         otherwise
 */
bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock,
                                   unsigned long *flags)
{
        /* Fast path: decremented without reaching the 1 -> 0 transition. */
        if (refcount_dec_not_one(r))
                return false;

        /* Slow path: final decrement under the lock, IRQ state saved in *flags. */
        spin_lock_irqsave(lock, *flags);
        if (refcount_dec_and_test(r))
                return true;

        /* The count did not drop to zero after all; unlock and restore IRQs. */
        spin_unlock_irqrestore(lock, *flags);
        return false;
}
EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);






























































    1 













    1 






























































































































    1 




    1 






    1 















    1 















    1 








    1 



    1 









    1 






    1 
    1 






    1 


    1 



    1 
    1 
    1 











    1 









    1 




    1 
    1 













    1 








    1 
    1 
















    1 
    1 




    1 




















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
// SPDX-License-Identifier: GPL-2.0
/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"

/* Held by elevator_get() around its elevator_find() lookups of elv_list. */
static DEFINE_SPINLOCK(elv_list_lock);
/* List of elevator types, walked by elevator_find(). */
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 *
 * The hash key of a request is its start position plus its sector count;
 * elv_merge() looks up a bio's starting sector under this key to find a
 * back-merge candidate.
 */
#define rq_hash_key(rq)                (blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *eq = q->elevator;

        /* Without an allow_merge hook the scheduler has no objection. */
        if (!eq->type->ops.allow_merge)
                return 1;

        return eq->type->ops.allow_merge(q, rq, bio);
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
        /*
         * Both checks must pass; the scheduler is only consulted once the
         * generic blk_rq_merge_ok() test has succeeded.
         */
        return blk_rq_merge_ok(rq, bio) &&
               elv_iosched_allow_bio_merge(rq, bio);
}
EXPORT_SYMBOL(elv_bio_merge_ok);

/* True when every bit set in @required_features is also set in @elv_features. */
static inline bool elv_support_features(unsigned int elv_features,
                                        unsigned int required_features)
{
        unsigned int missing = required_features & ~elv_features;

        return missing == 0;
}

/**
 * elevator_match - Test an elevator name and features
 * @e: Scheduler to test
 * @name: Elevator name to test
 * @required_features: Features that the elevator must provide
 *
 * Return true if the elevator @e name matches @name and if @e provides all
 * the features specified by @required_features.
 */
static bool elevator_match(const struct elevator_type *e, const char *name,
                           unsigned int required_features)
{
        /* The feature check comes first; names are only compared after it. */
        if (!elv_support_features(e->elevator_features, required_features))
                return false;

        if (strcmp(e->elevator_name, name) == 0)
                return true;

        /* The alias is optional; fall back to it when the name differs. */
        return e->elevator_alias && strcmp(e->elevator_alias, name) == 0;
}

/**
 * elevator_find - Find an elevator
 * @name: Name of the elevator to find
 * @required_features: Features that the elevator must provide
 *
 * Return the first registered scheduler with name @name and supporting the
 * features @required_features and NULL otherwise.
 */
static struct elevator_type *elevator_find(const char *name,
                                           unsigned int required_features)
{
        struct elevator_type *e;

        list_for_each_entry(e, &elv_list, list) {
                if (elevator_match(e, name, required_features))
                        return e;
        }

        return NULL;
}

/*
 * Drop a module reference on the owner of @e; elevator_get() takes one
 * with try_module_get() on the same elevator_owner field.
 */
static void elevator_put(struct elevator_type *e)
{
        module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(struct request_queue *q,
                                          const char *name, bool try_loading)
{
        struct elevator_type *found;

        spin_lock(&elv_list_lock);

        found = elevator_find(name, q->required_elevator_features);
        if (found == NULL && try_loading) {
                /*
                 * Unknown scheduler: drop the list lock around the module
                 * request, then look the name up once more.
                 */
                spin_unlock(&elv_list_lock);
                request_module("%s-iosched", name);
                spin_lock(&elv_list_lock);
                found = elevator_find(name, q->required_elevator_features);
        }

        /* A type whose owning module cannot be pinned counts as not found. */
        if (found != NULL && !try_module_get(found->elevator_owner))
                found = NULL;

        spin_unlock(&elv_list_lock);
        return found;
}

static struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
                                  struct elevator_type *e)
{
        struct elevator_queue *new_eq;

        /* Zeroed allocation on the queue's node; NULL is returned on failure. */
        new_eq = kzalloc_node(sizeof(*new_eq), GFP_KERNEL, q->node);
        if (likely(new_eq)) {
                new_eq->type = e;
                kobject_init(&new_eq->kobj, &elv_ktype);
                mutex_init(&new_eq->sysfs_lock);
                hash_init(new_eq->hash);
        }

        return new_eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
        /* Recover the elevator_queue that embeds @kobj. */
        struct elevator_queue *eq = container_of(kobj, struct elevator_queue,
                                                 kobj);

        /* Drop the reference on the elevator type, then free the queue. */
        elevator_put(eq->type);
        kfree(eq);
}

/*
 * Call blk_mq_exit_sched() for @q and @e with e->sysfs_lock held, then drop
 * a reference on the elevator's kobject.
 */
void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
        mutex_lock(&e->sysfs_lock);
        blk_mq_exit_sched(q, e);
        mutex_unlock(&e->sysfs_lock);

        /* Done only after sysfs_lock has been released. */
        kobject_put(&e->kobj);
}

/* Unhash @rq and clear RQF_HASHED; there is no ELV_ON_HASH() check here. */
static inline void __elv_rqhash_del(struct request *rq)
{
        hash_del(&rq->hash);
        rq->rq_flags = rq->rq_flags & ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
        /* Nothing to do for a request that is not on the hash. */
        if (!ELV_ON_HASH(rq))
                return;

        __elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *eq = q->elevator;

        /* Adding a request that is already hashed is a bug. */
        BUG_ON(ELV_ON_HASH(rq));

        hash_add(eq->hash, &rq->hash, rq_hash_key(rq));
        rq->rq_flags = rq->rq_flags | RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

/*
 * Remove @rq from the merge hash and add it back, so it is hashed under its
 * current rq_hash_key().  Unlike elv_rqhash_del(), the removal is not
 * guarded by an ELV_ON_HASH() check.
 */
void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
        __elv_rqhash_del(rq);
        elv_rqhash_add(q, rq);
}

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
        struct elevator_queue *eq = q->elevator;
        struct hlist_node *tmp;
        struct request *cand;

        hash_for_each_possible_safe(eq->hash, cand, tmp, hash, offset) {
                BUG_ON(!ELV_ON_HASH(cand));

                /*
                 * Requests that are no longer mergeable are dropped from the
                 * hash as they are met; the others are matched on their key.
                 */
                if (unlikely(!rq_mergeable(cand)))
                        __elv_rqhash_del(cand);
                else if (rq_hash_key(cand) == offset)
                        return cand;
        }

        return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;
        struct request *cur;

        /* Descend to the empty slot where rq belongs, ordered by position. */
        while (*link) {
                parent = *link;
                cur = rb_entry(parent, struct request, rb_node);

                /* Strictly smaller goes left; equal or larger goes right. */
                if (blk_rq_pos(rq) < blk_rq_pos(cur))
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        rb_link_node(&rq->rb_node, parent, link);
        rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

/*
 * Erase @rq from the tree at @root and mark its node empty again.
 * BUGs if the node is already marked empty on entry.
 */
void elv_rb_del(struct rb_root *root, struct request *rq)
{
        BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
        rb_erase(&rq->rb_node, root);
        RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
        struct rb_node *node = root->rb_node;
        struct request *cur;

        /* Binary search on request position; exact match only. */
        while (node) {
                cur = rb_entry(node, struct request, rb_node);

                if (sector == blk_rq_pos(cur))
                        return cur;

                if (sector < blk_rq_pos(cur))
                        node = node->rb_left;
                else
                        node = node->rb_right;
        }

        return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
                struct bio *bio)
{
        struct elevator_queue *e = q->elevator;
        struct request *__rq;

        /*
         * Levels of merges:
         *         nomerges:  No merges at all attempted
         *         noxmerges: Only simple one-hit cache try
         *         merges:           All merge tries attempted
         */
        if (blk_queue_nomerges(q) || !bio_mergeable(bio))
                return ELEVATOR_NO_MERGE;

        /*
         * First try one-hit cache.
         */
        if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
                enum elv_merge ret = blk_try_merge(q->last_merge, bio);

                if (ret != ELEVATOR_NO_MERGE) {
                        *req = q->last_merge;
                        return ret;
                }
        }

        if (blk_queue_noxmerges(q))
                return ELEVATOR_NO_MERGE;

        /*
         * See if our hash lookup can find a potential backmerge.
         */
        __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
        if (__rq && elv_bio_merge_ok(__rq, bio)) {
                *req = __rq;

                if (blk_discard_mergable(__rq))
                        return ELEVATOR_DISCARD_MERGE;
                return ELEVATOR_BACK_MERGE;
        }

        if (e->type->ops.request_merge)
                return e->type->ops.request_merge(q, req, bio);

        return ELEVATOR_NO_MERGE;
}

/*
 * Try to back-merge a request that is about to be inserted.  Only the
 * case where 'rq' can be appended to an already queued request is
 * considered, so that 'rq' itself can be discarded afterwards.
 *
 * Returns true if a merge happened, false otherwise.
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
        bool merged = false;

        if (blk_queue_nomerges(q))
                return false;

        /* One-hit cache first. */
        if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
                return true;

        if (blk_queue_noxmerges(q))
                return false;

        /*
         * Hash lookup for a back-merge candidate.  After a successful
         * merge the grown request may itself be mergeable, so repeat
         * the lookup with it.
         */
        for (;;) {
                struct request *target = elv_rqhash_find(q, blk_rq_pos(rq));

                if (!target)
                        break;
                if (!blk_attempt_req_merge(q, target, rq))
                        break;

                merged = true;
                rq = target;
        }

        return merged;
}

/*
 * Post-merge bookkeeping for @rq: call the scheduler's request_merged hook
 * if it has one, re-position @rq in the request hash after a back merge,
 * and remember @rq as the queue's last merge.
 */
void elv_merged_request(struct request_queue *q, struct request *rq,
                enum elv_merge type)
{
        struct elevator_queue *eq = q->elevator;

        if (eq->type->ops.request_merged)
                eq->type->ops.request_merged(q, rq, type);

        switch (type) {
        case ELEVATOR_BACK_MERGE:
                elv_rqhash_reposition(q, rq);
                break;
        default:
                break;
        }

        q->last_merge = rq;
}

/*
 * Bookkeeping after two requests were merged: call the scheduler's
 * requests_merged hook if present, re-position @rq in the request hash
 * and record @rq as the queue's last merge.
 */
void elv_merge_requests(struct request_queue *q, struct request *rq,
                             struct request *next)
{
        struct elevator_queue *eq = q->elevator;

        if (eq->type->ops.requests_merged)
                eq->type->ops.requests_merged(q, rq, next);

        elv_rqhash_reposition(q, rq);

        q->last_merge = rq;
}

/*
 * Ask the scheduler's next_request hook for the request following @rq.
 * Returns NULL when the scheduler has no such hook.
 */
struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *eq = q->elevator;

        if (!eq->type->ops.next_request)
                return NULL;

        return eq->type->ops.next_request(q, rq);
}

/*
 * Ask the scheduler's former_request hook for the request preceding @rq.
 * Returns NULL when the scheduler has no such hook.
 */
struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *eq = q->elevator;

        if (!eq->type->ops.former_request)
                return NULL;

        return eq->type->ops.former_request(q, rq);
}

/* Map a struct attribute pointer back to the struct elv_fs_entry embedding it. */
#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

/*
 * sysfs ->show for elevator attributes.
 *
 * Returns -EIO when the attribute has no show method and -ENOENT when the
 * elevator no longer has a type; otherwise the attribute's show() result.
 * The show method runs with the elevator's sysfs_lock held.
 */
static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct elv_fs_entry *entry = to_elv(attr);
        struct elevator_queue *eq;
        ssize_t ret;

        if (!entry->show)
                return -EIO;

        eq = container_of(kobj, struct elevator_queue, kobj);

        mutex_lock(&eq->sysfs_lock);
        if (eq->type)
                ret = entry->show(eq, page);
        else
                ret = -ENOENT;
        mutex_unlock(&eq->sysfs_lock);

        return ret;
}

/*
 * sysfs ->store for elevator attributes.
 *
 * Returns -EIO when the attribute has no store method and -ENOENT when the
 * elevator no longer has a type; otherwise the attribute's store() result.
 * The store method runs with the elevator's sysfs_lock held.
 */
static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
               const char *page, size_t length)
{
        struct elv_fs_entry *entry = to_elv(attr);
        struct elevator_queue *eq;
        ssize_t ret;

        if (!entry->store)
                return -EIO;

        eq = container_of(kobj, struct elevator_queue, kobj);

        mutex_lock(&eq->sysfs_lock);
        if (eq->type)
                ret = entry->store(eq, page, length);
        else
                ret = -ENOENT;
        mutex_unlock(&eq->sysfs_lock);

        return ret;
}

/* sysfs read/write entry points shared by all elevator attributes. */
static const struct sysfs_ops elv_sysfs_ops = {
        .show        = elv_attr_show,
        .store        = elv_attr_store,
};

/*
 * kobject type for an elevator_queue's kobj: attribute access goes through
 * elv_sysfs_ops, and elevator_release() is the release callback.
 */
static struct kobj_type elv_ktype = {
        .sysfs_ops        = &elv_sysfs_ops,
        .release        = elevator_release,
};

/*
 * Add the queue's elevator kobject as "iosched" below the queue's kobject
 * and create one sysfs file per entry of the elevator type's attribute
 * table (a table terminated by an entry with a NULL name).  Attribute
 * creation stops silently at the first failure.  When @uevent is set a
 * KOBJ_ADD uevent is emitted.
 *
 * Must be called with q->sysfs_lock held.  Returns 0 on success or the
 * error from kobject_add().
 */
int elv_register_queue(struct request_queue *q, bool uevent)
{
        struct elevator_queue *eq = q->elevator;
        struct elv_fs_entry *entry;
        int ret;

        lockdep_assert_held(&q->sysfs_lock);

        ret = kobject_add(&eq->kobj, &q->kobj, "%s", "iosched");
        if (ret)
                return ret;

        for (entry = eq->type->elevator_attrs;
             entry && entry->attr.name; entry++) {
                if (sysfs_create_file(&eq->kobj, &entry->attr))
                        break;
        }

        if (uevent)
                kobject_uevent(&eq->kobj, KOBJ_ADD);

        eq->registered = 1;

        return ret;
}

/*
 * Remove the queue's elevator kobject from sysfs: emit a KOBJ_REMOVE
 * uevent, delete the kobject and clear the elevator's registered flag.
 *
 * A NULL @q is tolerated and is a no-op.  Otherwise q->sysfs_lock must be
 * held by the caller.
 */
void elv_unregister_queue(struct request_queue *q)
{
        struct elevator_queue *e;

        /*
         * Check for NULL before the lockdep assertion: the assertion is
         * handed &q->sysfs_lock, so it must not run with a NULL @q.
         */
        if (!q)
                return;

        lockdep_assert_held(&q->sysfs_lock);

        e = q->elevator;

        kobject_uevent(&e->kobj, KOBJ_REMOVE);
        kobject_del(&e->kobj);

        e->registered = 0;
}

int elv_register(struct elevator_type *e)
{
        /* create icq_cache if requested */
        if (e->icq_size) {
                if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
                    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
                        return -EINVAL;

                snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
                         "%s_io_cq", e->elevator_name);
                e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
                                                 e->icq_align, 0, NULL);
                if (!e->icq_cache)
                        return -ENOMEM;
        }

        /* register, don't allow duplicate names */
        spin_lock(&elv_list_lock);
        if (elevator_find(e->elevator_name, 0)) {
                spin_unlock(&elv_list_lock);
                kmem_cache_destroy(e->icq_cache);
                return -EBUSY;
        }
        list_add_tail(&e->list, &elv_list);
        spin_unlock(&elv_list_lock);

        printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);

        return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

/*
 * Unregister an elevator type: unlink it from elv_list and, if it owns an
 * icq cache, destroy that cache.
 */
void elv_unregister(struct elevator_type *e)
{
        spin_lock(&elv_list_lock);
        list_del_init(&e->list);
        spin_unlock(&elv_list_lock);

        if (!e->icq_cache)
                return;

        /*
         * icq's are RCU managed, so wait for all outstanding RCU
         * operations to complete before tearing the cache down.
         */
        rcu_barrier();
        kmem_cache_destroy(e->icq_cache);
        e->icq_cache = NULL;
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * Replace the queue's elevator with @new_e (NULL selects "none").
 *
 * Any current elevator is unregistered from sysfs (if it was registered),
 * its io contexts are cleared and it is exited.  The new scheduler is then
 * initialised and, when non-NULL, registered in sysfs; if registration
 * fails the freshly initialised elevator is exited again.
 *
 * Must be called with q->sysfs_lock held.  Returns 0 or a negative error.
 */
int elevator_switch_mq(struct request_queue *q,
                              struct elevator_type *new_e)
{
        int ret;

        lockdep_assert_held(&q->sysfs_lock);

        if (q->elevator) {
                if (q->elevator->registered)
                        elv_unregister_queue(q);

                ioc_clear_queue(q);
                elevator_exit(q, q->elevator);
        }

        ret = blk_mq_init_sched(q, new_e);
        if (ret)
                return ret;

        if (!new_e) {
                blk_add_trace_msg(q, "elv switch: none");
                return ret;
        }

        ret = elv_register_queue(q, true);
        if (ret) {
                elevator_exit(q, q->elevator);
                return ret;
        }

        blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

        return ret;
}

/*
 * An I/O scheduler can be attached only to an mq queue whose tag set (if
 * it has one) does not carry BLK_MQ_F_NO_SCHED.
 */
static inline bool elv_support_iosched(struct request_queue *q)
{
        if (!queue_is_mq(q))
                return false;

        if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
                return false;

        return true;
}

/*
 * Default scheduler choice: "mq-deadline" for queues with exactly one
 * hardware queue.  Returns NULL (meaning "none") for any other hardware
 * queue count, or whatever elevator_get() returns for "mq-deadline".
 */
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
        if (q->nr_hw_queues == 1)
                return elevator_get(q, "mq-deadline", false);

        return NULL;
}

/*
 * Return the first registered elevator whose features satisfy the queue's
 * required_elevator_features, with a reference taken on its owning module.
 * Returns NULL (i.e. "none") when nothing matches or the module reference
 * cannot be obtained.
 */
static struct elevator_type *elevator_get_by_features(struct request_queue *q)
{
        struct elevator_type *cur, *match = NULL;

        spin_lock(&elv_list_lock);

        list_for_each_entry(cur, &elv_list, list) {
                if (!elv_support_features(cur->elevator_features,
                                          q->required_elevator_features))
                        continue;

                match = cur;
                break;
        }

        /* pin the module while still holding the list lock */
        if (match && !try_module_get(match->elevator_owner))
                match = NULL;

        spin_unlock(&elv_list_lock);

        return match;
}

/*
 * Pick and initialise the initial scheduler for a queue.
 *
 * A queue without required features gets the default elevator; otherwise
 * the first registered elevator matching the required features is used.
 * If no suitable elevator is found, or if initialising the chosen one
 * fails, the queue stays on "none" (no elevator).
 */
void elevator_init_mq(struct request_queue *q)
{
        struct elevator_type *et;
        int ret;

        if (!elv_support_iosched(q))
                return;

        WARN_ON_ONCE(blk_queue_registered(q));

        /* nothing to do if an elevator is already attached */
        if (unlikely(q->elevator))
                return;

        if (q->required_elevator_features)
                et = elevator_get_by_features(q);
        else
                et = elevator_get_default(q);
        if (!et)
                return;

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        ret = blk_mq_init_sched(q, et);

        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);

        if (!ret)
                return;

        pr_warn("\"%s\" elevator initialization failed, "
                "falling back to \"none\"\n", et->elevator_name);
        elevator_put(et);
}


/*
 * Switch the queue to the @new_e io scheduler (NULL for none).
 *
 * The queue is frozen and quiesced around the actual switch, which is
 * carried out by elevator_switch_mq().  Must be called with q->sysfs_lock
 * held.  Returns the result of elevator_switch_mq().
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
        int ret;

        lockdep_assert_held(&q->sysfs_lock);

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        ret = elevator_switch_mq(q, new_e);

        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);

        return ret;
}

/*
 * Switch this queue to the IO scheduler called @name.
 *
 * @name may carry surrounding whitespace (e.g. the trailing newline of a
 * sysfs write); it is copied and stripped before being looked up.  A name
 * starting with "none" turns scheduling off.
 *
 * Returns 0 on success or when the requested scheduler is already active,
 * -ENOENT if the queue is not registered, -EINVAL if no such scheduler
 * exists, or the error from elevator_switch().
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
        char elevator_name[ELV_NAME_MAX];
        char *stripped;
        struct elevator_type *e;

        /* Make sure queue is not in the middle of being removed */
        if (!blk_queue_registered(q))
                return -ENOENT;

        /*
         * Special case for mq, turn off scheduling
         */
        if (!strncmp(name, "none", 4)) {
                if (!q->elevator)
                        return 0;
                return elevator_switch(q, NULL);
        }

        strlcpy(elevator_name, name, sizeof(elevator_name));
        stripped = strstrip(elevator_name);
        e = elevator_get(q, stripped, true);
        if (!e)
                return -EINVAL;

        /*
         * Compare against the stripped name, i.e. the same string that was
         * used for the lookup above.  strstrip() returns a pointer past any
         * leading whitespace, so the start of the buffer is not necessarily
         * the start of the name.
         */
        if (q->elevator &&
            elevator_match(q->elevator->type, stripped, 0)) {
                elevator_put(e);
                return 0;
        }

        return elevator_switch(q, e);
}

/*
 * Handle a write of a scheduler name for this queue.
 *
 * Queues that cannot take a scheduler accept and ignore the write.
 * Returns @count on success, or the negative error from
 * __elevator_change().
 */
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
                          size_t count)
{
        if (elv_support_iosched(q)) {
                int ret = __elevator_change(q, name);

                if (ret)
                        return ret;
        }

        return count;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
        struct elevator_queue *e = q->elevator;
        struct elevator_type *elv = NULL;
        struct elevator_type *__e;
        int len = 0;

        if (!queue_is_mq(q))
                return sprintf(name, "none\n");

        if (!q->elevator)
                len += sprintf(name+len, "[none] ");
        else
                elv = e->type;

        spin_lock(&elv_list_lock);
        list_for_each_entry(__e, &elv_list, list) {
                if (elv && elevator_match(elv, __e->elevator_name, 0)) {
                        len += sprintf(name+len, "[%s] ", elv->elevator_name);
                        continue;
                }
                if (elv_support_iosched(q) &&
                    elevator_match(__e, __e->elevator_name,
                                   q->required_elevator_features))
                        len += sprintf(name+len, "%s ", __e->elevator_name);
        }
        spin_unlock(&elv_list_lock);

        if (q->elevator)
                len += sprintf(name+len, "none");

        len += sprintf(len+name, "\n");
        return len;
}

/*
 * Return the request whose rb node precedes @rq's node (per rb_prev()),
 * or NULL when @rq's node has no predecessor.  @q is unused.
 */
struct request *elv_rb_former_request(struct request_queue *q,
                                      struct request *rq)
{
        struct rb_node *prev = rb_prev(&rq->rb_node);

        return prev ? rb_entry_rq(prev) : NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

/*
 * Return the request whose rb node follows @rq's node (per rb_next()),
 * or NULL when @rq's node has no successor.  @q is unused.
 */
struct request *elv_rb_latter_request(struct request_queue *q,
                                      struct request *rq)
{
        struct rb_node *next = rb_next(&rq->rb_node);

        return next ? rb_entry_rq(next) : NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);

/*
 * Handler for the "elevator=" kernel command line parameter.  The value
 * in @str is ignored: the handler only prints a warning that the
 * parameter no longer has any effect and returns 1.
 */
static int __init elevator_setup(char *str)
{
        pr_warn("Kernel parameter elevator= does not have any effect anymore.\n"
                "Please use sysfs to set IO scheduler for individual devices.\n");
        return 1;
}

__setup("elevator=", elevator_setup);





























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>

#ifdef CONFIG_FREEZER
extern atomic_t system_freezing_cnt;        /* nr of freezing conds in effect */
extern bool pm_freezing;                /* PM freezing in effect */
extern bool pm_nosig_freezing;                /* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen, i.e. whether PF_FROZEN is set in
 * its flags.
 */
static inline bool frozen(struct task_struct *p)
{
        return (p->flags & PF_FROZEN) != 0;
}

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a process.
 *
 * Fast path: while system_freezing_cnt is zero no freezing condition is
 * in effect and the answer is false; otherwise defer to
 * freezing_slow_path().
 */
static inline bool freezing(struct task_struct *p)
{
        if (unlikely(atomic_read(&system_freezing_cnt)))
                return freezing_slow_path(p);

        return false;
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 * If try_to_freeze causes a lockdep warning it means the caller may deadlock
 *
 * Enters __refrigerator() when a freeze of %current is requested and
 * returns its result; returns false otherwise.  May sleep.
 */
static inline bool try_to_freeze_unsafe(void)
{
        might_sleep();

        if (unlikely(freezing(current)))
                return __refrigerator(false);

        return false;
}

/*
 * Checked variant of try_to_freeze_unsafe(): unless %current has
 * PF_NOFREEZE set, debug_check_no_locks_held() is called before the
 * freeze attempt.
 */
static inline bool try_to_freeze(void)
{
        if ((current->flags & PF_NOFREEZE) == 0)
                debug_check_no_locks_held();

        return try_to_freeze_unsafe();
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

/*
 * cgroup_freezing() is provided externally when CONFIG_CGROUP_FREEZER is
 * set; without it the inline stub below always reports false.
 */
#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
static inline bool cgroup_freezing(struct task_struct *task)
{
        return false;
}
#endif /* !CONFIG_CGROUP_FREEZER */

/*
 * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
 * calls wait_for_completion(&vfork) and reset right after it returns from this
 * function.  Next, the parent should call try_to_freeze() to freeze itself
 * appropriately in case the child has exited before the freezing of tasks is
 * complete.  However, we don't want kernel threads to be frozen in unexpected
 * places, so we allow them to block freeze_processes() instead or to set
 * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the
 * parent won't really block freeze_processes(), since ____call_usermodehelper()
 * (the child) does a little before exec/exit and it can't be frozen before
 * waking up the parent.
 */


/**
 * freezer_do_not_count - tell freezer to ignore %current
 *
 * Tell freezers to ignore the current task when determining whether the
 * target frozen state is reached.  IOW, the current task will be
 * considered frozen enough by freezers.
 *
 * The caller shouldn't do anything which isn't allowed for a frozen task
 * until freezer_count() is called.  Usually, freezer[_do_not]_count() pair
 * wrap a scheduling operation and nothing much else.
 */
static inline void freezer_do_not_count(void)
{
        /* freezer_should_skip() tests this flag */
        current->flags |= PF_FREEZER_SKIP;
}

/**
 * freezer_count - tell freezer to stop ignoring %current
 *
 * Undo freezer_do_not_count().  It tells freezers that %current should be
 * considered again and tries to freeze if freezing condition is already in
 * effect.
 */
static inline void freezer_count(void)
{
        /* the flag must be cleared before the barrier below */
        current->flags &= ~PF_FREEZER_SKIP;
        /*
         * If freezing is in progress, the following paired with smp_mb()
         * in freezer_should_skip() ensures that either we see %true
         * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
         */
        smp_mb();
        try_to_freeze();
}

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 *
 * Same sequence as freezer_count(), but it ends in try_to_freeze_unsafe()
 * and therefore skips the debug_check_no_locks_held() call that
 * try_to_freeze() makes.
 */
static inline void freezer_count_unsafe(void)
{
        current->flags &= ~PF_FREEZER_SKIP;
        /* orders the flag clear before the freezing() check, as in freezer_count() */
        smp_mb();
        try_to_freeze_unsafe();
}

/**
 * freezer_should_skip - whether to skip a task when determining frozen
 *                         state is reached
 * @p: task in question
 *
 * This function is used by freezers after establishing %true freezing() to
 * test whether a task should be skipped when determining the target frozen
 * state is reached.  IOW, if this function returns %true, @p is considered
 * frozen enough.
 *
 * Return: %true if %PF_FREEZER_SKIP is set in @p's flags, %false otherwise.
 */
static inline bool freezer_should_skip(struct task_struct *p)
{
        /*
         * The following smp_mb() paired with the one in freezer_count()
         * ensures that either freezer_count() sees %true freezing() or we
         * see cleared %PF_FREEZER_SKIP and return %false.  This makes it
         * impossible for a task to slip frozen state testing after
         * clearing %PF_FREEZER_SKIP.
         */
        smp_mb();
        return p->flags & PF_FREEZER_SKIP;
}

/*
 * These functions are intended to be used whenever you want to allow a
 * sleeping task to be frozen. Note that none of them returns any clear
 * indication of whether a freeze event happened while in the function.
 */

/* Like schedule(), but should not block the freezer. */
static inline void freezable_schedule(void)
{
        /* set PF_FREEZER_SKIP so freezers ignore us while we are scheduled out */
        freezer_do_not_count();
        schedule();
        /* clear the flag again and freeze right away if a freeze is pending */
        freezer_count();
}

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 *
 * Like freezable_schedule(), but finishes with freezer_count_unsafe()
 * instead of freezer_count().
 */
static inline void freezable_schedule_unsafe(void)
{
        freezer_do_not_count();
        schedule();
        freezer_count_unsafe();
}

/*
 * Like schedule_timeout(), but should not block the freezer: the sleep is
 * bracketed by freezer_do_not_count()/freezer_count().  Do not call this
 * with locks held.  Returns the value of schedule_timeout().
 */
static inline long freezable_schedule_timeout(long timeout)
{
        long ret;

        freezer_do_not_count();
        ret = schedule_timeout(timeout);
        freezer_count();

        return ret;
}

/*
 * Like schedule_timeout_interruptible(), but should not block the
 * freezer: the sleep is bracketed by
 * freezer_do_not_count()/freezer_count().  Do not call this with locks
 * held.  Returns the value of schedule_timeout_interruptible().
 */
static inline long freezable_schedule_timeout_interruptible(long timeout)
{
        long ret;

        freezer_do_not_count();
        ret = schedule_timeout_interruptible(timeout);
        freezer_count();

        return ret;
}

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 *
 * Variant of freezable_schedule_timeout_interruptible() that finishes
 * with freezer_count_unsafe() instead of freezer_count().
 */
static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout)
{
        long ret;

        freezer_do_not_count();
        ret = schedule_timeout_interruptible(timeout);
        freezer_count_unsafe();

        return ret;
}

/*
 * Like schedule_timeout_killable(), but should not block the freezer: the
 * sleep is bracketed by freezer_do_not_count()/freezer_count().  Returns
 * the value of schedule_timeout_killable().
 */
static inline long freezable_schedule_timeout_killable(long timeout)
{
        long ret;

        freezer_do_not_count();
        ret = schedule_timeout_killable(timeout);
        freezer_count();

        return ret;
}

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 *
 * Variant of freezable_schedule_timeout_killable() that finishes with
 * freezer_count_unsafe() instead of freezer_count().
 */
static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
{
        long ret;

        freezer_do_not_count();
        ret = schedule_timeout_killable(timeout);
        freezer_count_unsafe();

        return ret;
}

/*
 * Like schedule_hrtimeout_range(), but should not block the freezer: the
 * sleep is bracketed by freezer_do_not_count()/freezer_count().  Do not
 * call this with locks held.  Returns the value of
 * schedule_hrtimeout_range().
 */
static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
                u64 delta, const enum hrtimer_mode mode)
{
        int ret;

        freezer_do_not_count();
        ret = schedule_hrtimeout_range(expires, delta, mode);
        freezer_count();

        return ret;
}

/*
 * Freezer-friendly wrappers around wait_event_interruptible(),
 * wait_event_killable() and wait_event_interruptible_timeout(), originally
 * defined in <linux/wait.h>
 */

/*
 * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
 *
 * wait_event_killable() bracketed by freezer_do_not_count() and
 * freezer_count_unsafe().  The statement expression evaluates to the
 * result of wait_event_killable().
 */
#define wait_event_freezekillable_unsafe(wq, condition)                        \
({                                                                        \
        int __retval;                                                        \
        freezer_do_not_count();                                                \
        __retval = wait_event_killable(wq, (condition));                \
        freezer_count_unsafe();                                                \
        __retval;                                                        \
})

#else /* !CONFIG_FREEZER */
/*
 * !CONFIG_FREEZER stubs: no task is ever frozen or freezing, and the
 * system-wide freeze entry points report -ENOSYS.
 */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}

static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}

/* NOTE(review): try_to_freeze_nowarn() has no CONFIG_FREEZER counterpart in this header */
static inline bool try_to_freeze_nowarn(void) { return false; }
static inline bool try_to_freeze(void) { return false; }

/* !CONFIG_FREEZER: there is no freezer, so nothing is counted or skipped. */
static inline void freezer_do_not_count(void) {}
static inline void freezer_count(void) {}
/*
 * The next two return bool, matching the CONFIG_FREEZER variants of
 * freezer_should_skip() and set_freezable(), so callers see the same
 * signatures in both configurations.
 */
static inline bool freezer_should_skip(struct task_struct *p) { return false; }
static inline bool set_freezable(void) { return false; }

/*
 * !CONFIG_FREEZER: each freezable_*() and wait_event_freeze*() helper
 * expands directly to the plain scheduling or wait primitive it wraps.
 */
#define freezable_schedule()  schedule()

#define freezable_schedule_unsafe()  schedule()

#define freezable_schedule_timeout(timeout)  schedule_timeout(timeout)

#define freezable_schedule_timeout_interruptible(timeout)                \
        schedule_timeout_interruptible(timeout)

#define freezable_schedule_timeout_interruptible_unsafe(timeout)        \
        schedule_timeout_interruptible(timeout)

#define freezable_schedule_timeout_killable(timeout)                        \
        schedule_timeout_killable(timeout)

#define freezable_schedule_timeout_killable_unsafe(timeout)                \
        schedule_timeout_killable(timeout)

#define freezable_schedule_hrtimeout_range(expires, delta, mode)        \
        schedule_hrtimeout_range(expires, delta, mode)

#define wait_event_freezekillable_unsafe(wq, condition)                        \
                wait_event_killable(wq, condition)

#endif /* !CONFIG_FREEZER */

#endif        /* FREEZER_H_INCLUDED */




















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KBD_KERN_H
#define _KBD_KERN_H

#include <linux/tty.h>
#include <linux/interrupt.h>
#include <linux/keyboard.h>

extern struct tasklet_struct keyboard_tasklet;

extern char *func_table[MAX_NR_FUNC];
extern char func_buf[];
extern char *funcbufptr;
extern int funcbufsize, funcbufleft;

/*
 * kbd->xxx contains the VC-local things (flag settings etc..)
 *
 * Note: externally visible are LED_SCR, LED_NUM, LED_CAP defined in kd.h
 *       The code in KDGETLED / KDSETLED depends on the internal and
 *       external order being the same.
 *
 * Note: lockstate is used as index in the array key_map.
 */
struct kbd_struct {

        /* toggled bit-wise by chg_vc_kbd_lock() */
        unsigned char lockstate;
/* 8 modifiers - the names do not have any meaning at all;
   they can be associated to arbitrarily chosen keys */
#define VC_SHIFTLOCK        KG_SHIFT        /* shift lock mode */
#define VC_ALTGRLOCK        KG_ALTGR        /* altgr lock mode */
#define VC_CTRLLOCK        KG_CTRL         /* control lock mode */
#define VC_ALTLOCK        KG_ALT          /* alt lock mode */
#define VC_SHIFTLLOCK        KG_SHIFTL        /* shiftl lock mode */
#define VC_SHIFTRLOCK        KG_SHIFTR        /* shiftr lock mode */
#define VC_CTRLLLOCK        KG_CTRLL         /* ctrll lock mode */
#define VC_CTRLRLOCK        KG_CTRLR         /* ctrlr lock mode */
        unsigned char slockstate;         /* for `sticky' Shift, Ctrl, etc. */

        /* one of the LED_SHOW_* values below */
        unsigned char ledmode:1;
#define LED_SHOW_FLAGS 0        /* traditional state */
#define LED_SHOW_IOCTL 1        /* only change leds upon ioctl */

        /*
         * The VC_*LOCK values below are bit numbers within ledflagstate,
         * used as shift counts by the *_vc_kbd_led() helpers.
         */
        unsigned char ledflagstate:4;        /* flags, not lights */
        unsigned char default_ledflagstate:4;
#define VC_SCROLLOCK        0        /* scroll-lock mode */
#define VC_NUMLOCK        1        /* numeric lock mode */
#define VC_CAPSLOCK        2        /* capslock mode */
#define VC_KANALOCK        3        /* kanalock mode */

        unsigned char kbdmode:3;        /* one 3-bit value */
#define VC_XLATE        0        /* translate keycodes using keymap */
#define VC_MEDIUMRAW        1        /* medium raw (keycode) mode */
#define VC_RAW                2        /* raw (scancode) mode */
#define VC_UNICODE        3        /* Unicode mode */
#define VC_OFF                4        /* disabled mode */

        /*
         * The values below are bit numbers within modeflags, used as
         * shift counts by the *_vc_kbd_mode() helpers.
         */
        unsigned char modeflags:5;
#define VC_APPLIC        0        /* application key mode */
#define VC_CKMODE        1        /* cursor key mode */
#define VC_REPEAT        2        /* keyboard repeat */
#define VC_CRLF                3        /* 0 - enter sends CR, 1 - enter sends CRLF */
#define VC_META                4        /* 0 - meta, 1 - meta=prefix with ESC */
};

extern int kbd_init(void);

extern void setledstate(struct kbd_struct *kbd, unsigned int led);

extern int do_poke_blanked_console;

extern void (*kbd_ledfunc)(unsigned int led);

extern int set_console(int nr);
extern void schedule_console_callback(void);

/*
 * Schedule keyboard_tasklet.
 *
 * FIXME: review locking for vt.c callers
 */
static inline void set_leds(void)
{
        tasklet_schedule(&keyboard_tasklet);
}

/* Return 1 if bit number @flag is set in kbd->modeflags, 0 otherwise. */
static inline int vc_kbd_mode(struct kbd_struct * kbd, int flag)
{
        const unsigned int bits = kbd->modeflags;

        return (bits >> flag) & 1;
}

/* Return 1 if bit number @flag is set in kbd->ledflagstate, 0 otherwise. */
static inline int vc_kbd_led(struct kbd_struct * kbd, int flag)
{
        const unsigned int bits = kbd->ledflagstate;

        return (bits >> flag) & 1;
}

/* Set bit number @flag in kbd->modeflags. */
static inline void set_vc_kbd_mode(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->modeflags |= mask;
}

/* Set bit number @flag in kbd->ledflagstate. */
static inline void set_vc_kbd_led(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->ledflagstate |= mask;
}

/* Clear bit number @flag in kbd->modeflags. */
static inline void clr_vc_kbd_mode(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->modeflags &= ~mask;
}

/* Clear bit number @flag in kbd->ledflagstate. */
static inline void clr_vc_kbd_led(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->ledflagstate &= ~mask;
}

/* Toggle bit number @flag in kbd->lockstate. */
static inline void chg_vc_kbd_lock(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->lockstate ^= mask;
}

/* Toggle bit number @flag in kbd->slockstate. */
static inline void chg_vc_kbd_slock(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->slockstate ^= mask;
}

/* Toggle bit number @flag in kbd->modeflags. */
static inline void chg_vc_kbd_mode(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->modeflags ^= mask;
}

/* Toggle bit number @flag in kbd->ledflagstate. */
static inline void chg_vc_kbd_led(struct kbd_struct * kbd, int flag)
{
        const int mask = 1 << flag;

        kbd->ledflagstate ^= mask;
}

/* XOR @x with 0xf000 (flips the top four bits of a 16-bit value). */
#define U(x) ((x) ^ 0xf000)

/* NOTE(review): presumably the base code point of the Unicode Braille patterns row - confirm */
#define BRL_UC_ROW 0x2800

/* keyboard.c */

struct console;

void compute_shiftstate(void);

/* defkeymap.c */

extern unsigned int keymap_count;

#endif






































    1 

































    1 

    1 
    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * Out-of-line scheduler entry points.  Only the prototypes are here; the
 * bodies are not part of this header.
 */
void blk_mq_sched_assign_ioc(struct request *rq);

void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request);
/*
 * Unchecked variant: blk_mq_sched_bio_merge() below calls this only after
 * its own quick "may this merge at all" tests pass.
 */
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

/* Insertion: one request, or a list of requests for a given ctx/hctx pair. */
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
                                  struct blk_mq_ctx *ctx,
                                  struct list_head *list, bool run_queue_async);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

/* Scheduler setup and teardown for a request queue. */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q);

/*
 * Try to merge @bio through the scheduler path.
 *
 * Returns false straight away when merging is disabled on @q or @bio is
 * not mergeable; otherwise returns whatever __blk_mq_sched_bio_merge()
 * reports.
 */
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        if (blk_queue_nomerges(q))
                return false;
        if (!bio_mergeable(bio))
                return false;

        return __blk_mq_sched_bio_merge(q, bio, nr_segs);
}

/*
 * Ask the queue's elevator whether @bio may be merged into @rq.
 *
 * With no elevator attached, or one that has no ->allow_merge hook, the
 * merge is allowed (returns true).
 */
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
{
        struct elevator_queue *e = q->elevator;

        if (!e || !e->type->ops.allow_merge)
                return true;

        return e->type->ops.allow_merge(q, rq, bio);
}

/*
 * Forward (@rq, @now) to the elevator's ->completed_request hook.
 * Does nothing when the queue has no elevator or the hook is not provided.
 */
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
        struct elevator_queue *e = rq->q->elevator;

        if (!e || !e->type->ops.completed_request)
                return;

        e->type->ops.completed_request(rq, now);
}

/*
 * Forward @rq to the elevator's ->requeue_request hook.
 *
 * The hook is called only when @rq has RQF_ELVPRIV set, the queue has an
 * elevator, and that elevator provides the hook.
 */
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
        struct elevator_queue *e = rq->q->elevator;

        if (!(rq->rq_flags & RQF_ELVPRIV))
                return;
        if (!e || !e->type->ops.requeue_request)
                return;

        e->type->ops.requeue_request(rq);
}

/*
 * Return the answer of the elevator's ->has_work hook for @hctx.
 * Returns false when there is no elevator or no such hook.
 */
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;

        if (!e || !e->type->ops.has_work)
                return false;

        return e->type->ops.has_work(hctx);
}

/* Report whether the BLK_MQ_S_SCHED_RESTART bit is set in hctx->state. */
static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
        return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

#endif
































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* -*- mode: c; c-basic-offset:8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * configfs_internal.h - Internal stuff for configfs
 *
 * Based on sysfs:
 *         sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
 *
 * configfs Copyright (C) 2005 Oracle.  All rights reserved.
 */

#ifdef pr_fmt
#undef pr_fmt
#endif

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Object referenced from struct configfs_dirent (s_frag).
 * NOTE(review): field meanings below are read off the types and names only;
 * confirm against get_fragment()/put_fragment() and the users of frag_sem.
 */
struct configfs_fragment {
        /* atomic counter; presumably the reference count */
        atomic_t frag_count;
        /* read/write semaphore associated with this fragment */
        struct rw_semaphore frag_sem;
        /* boolean state flag */
        bool frag_dead;
};

/*
 * Fragment get/put helpers, defined elsewhere.  release_configfs_dirent()
 * in this header calls put_fragment() on the dirent's s_frag.
 */
void put_fragment(struct configfs_fragment *);
struct configfs_fragment *get_fragment(struct configfs_fragment *);

/*
 * Per-entry bookkeeping object; a dentry reaches it through d_fsdata
 * (see to_item()/to_attr() in this header).
 */
struct configfs_dirent {
        /* reference count, managed by configfs_get()/configfs_put() */
        atomic_t                s_count;
        int                        s_dependent_count;
        struct list_head        s_sibling;
        struct list_head        s_children;
        int                        s_links;
        /* payload: read as a config_item or a configfs_attribute by the helpers */
        void                        * s_element;
        /* CONFIGFS_* flag bits */
        int                        s_type;
        umode_t                        s_mode;
        struct dentry                * s_dentry;
        /* released with kfree() when the dirent is released */
        struct iattr                * s_iattr;
#ifdef CONFIG_LOCKDEP
        /* present only in CONFIG_LOCKDEP builds */
        int                        s_depth;
#endif
        /* dropped with put_fragment() when the dirent is released */
        struct configfs_fragment *s_frag;
};

/*
 * Flag bits for configfs_dirent.s_type.  Each value is a distinct bit;
 * 0x0010 is not used by this list.
 */
#define CONFIGFS_ROOT                0x0001
#define CONFIGFS_DIR                0x0002
#define CONFIGFS_ITEM_ATTR        0x0004
#define CONFIGFS_ITEM_BIN_ATTR        0x0008
#define CONFIGFS_ITEM_LINK        0x0020
#define CONFIGFS_USET_DIR        0x0040
#define CONFIGFS_USET_DEFAULT        0x0080
#define CONFIGFS_USET_DROPPING        0x0100
#define CONFIGFS_USET_IN_MKDIR        0x0200
#define CONFIGFS_USET_CREATING        0x0400
/* Mask covering both attribute kinds (plain and binary). */
#define CONFIGFS_NOT_PINNED        (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)

/* Global locks, defined elsewhere. */
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;

/* Slab cache that release_configfs_dirent() returns dirents to. */
extern struct kmem_cache *configfs_dir_cachep;

extern int configfs_is_root(struct config_item *item);

/* Inode creation helpers. */
extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
extern struct inode *configfs_create(struct dentry *, umode_t mode);

/* Attribute file and dirent creation helpers. */
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_create_bin_file(struct config_item *,
                                    const struct configfs_bin_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
                                void *, umode_t, int, struct configfs_fragment *);
extern int configfs_dirent_is_ready(struct configfs_dirent *);

extern void configfs_hash_and_remove(struct dentry * dir, const char * name);

extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);

extern struct dentry *configfs_pin_fs(void);
extern void configfs_release_fs(void);

/* Operation tables, defined elsewhere. */
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
extern const struct file_operations configfs_bin_file_operations;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
extern const struct dentry_operations configfs_dentry_ops;

/* Symlink handling. */
extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
                            const char *symname);
extern int configfs_unlink(struct inode *dir, struct dentry *dentry);

int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
                struct dentry *dentry, char *body);

/*
 * Return the s_element of @dentry's configfs_dirent as a config_item.
 * No type check is done; the caller must know the dirent holds an item.
 */
static inline struct config_item * to_item(struct dentry * dentry)
{
        struct configfs_dirent *dirent = dentry->d_fsdata;
        struct config_item *item = dirent->s_element;

        return item;
}

/*
 * Return the s_element of @dentry's configfs_dirent as a configfs_attribute.
 * No type check is done; the caller must know the dirent holds an attribute.
 */
static inline struct configfs_attribute * to_attr(struct dentry * dentry)
{
        struct configfs_dirent *dirent = dentry->d_fsdata;
        struct configfs_attribute *attr = dirent->s_element;

        return attr;
}

/*
 * Return the configfs_bin_attribute that embeds (as its cb_attr member)
 * the attribute attached to @dentry.
 */
static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
{
        return container_of(to_attr(dentry), struct configfs_bin_attribute,
                            cb_attr);
}

/*
 * Fetch the config_item behind @dentry via config_item_get().
 *
 * The lookup runs under dentry->d_lock.  Returns NULL when the dentry is
 * unhashed; otherwise returns what config_item_get() returns for the
 * dirent's s_element.
 */
static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
{
        struct config_item *found = NULL;
        struct configfs_dirent *sd;

        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry))
                goto unlock;

        sd = dentry->d_fsdata;
        found = config_item_get(sd->s_element);
unlock:
        spin_unlock(&dentry->d_lock);

        return found;
}

/*
 * Free @sd and what it owns: s_iattr goes to kfree(), s_frag to
 * put_fragment(), and the dirent itself back to configfs_dir_cachep.
 * A dirent with CONFIGFS_ROOT set in s_type is left untouched.
 */
static inline void release_configfs_dirent(struct configfs_dirent * sd)
{
        if (sd->s_type & CONFIGFS_ROOT)
                return;

        kfree(sd->s_iattr);
        put_fragment(sd->s_frag);
        kmem_cache_free(configfs_dir_cachep, sd);
}

/*
 * Take a reference on @sd and return it.  A NULL @sd is returned as is.
 * Warns if the count is zero before the increment.
 */
static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
{
        if (!sd)
                return sd;

        WARN_ON(!atomic_read(&sd->s_count));
        atomic_inc(&sd->s_count);

        return sd;
}

/*
 * Drop a reference on @sd; the dirent is released when the count reaches
 * zero.  Warns if the count is already zero on entry.
 */
static inline void configfs_put(struct configfs_dirent * sd)
{
        WARN_ON(!atomic_read(&sd->s_count));

        if (!atomic_dec_and_test(&sd->s_count))
                return;

        release_configfs_dirent(sd);
}






















    5 


















    1 






























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/numa.h>

/**
 * cpumask_next - find the next cpu set in a cpumask
 * @n: search starts strictly after this cpu; -1 searches from cpu 0
 * @srcp: the cpumask to scan
 *
 * Returns the next set cpu, or a value >= nr_cpu_ids if there is none.
 */
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 is the legal "start from the beginning" value; skip the check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit(bits, nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next);

/**
 * cpumask_next_and - find the next cpu set in both of two cpumasks
 * @n: search starts strictly after this cpu; -1 searches from cpu 0
 * @src1p: the first cpumask
 * @src2p: the second cpumask
 *
 * Returns the next cpu set in *@src1p & *@src2p, or a value >= nr_cpu_ids
 * if there is none.
 */
int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* -1 is the legal "start from the beginning" value; skip the check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit(bits1, bits2, nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next_and);

/**
 * cpumask_any_but - return a "random" in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * Returns >= nr_cpu_ids if no cpus set.
 */
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
        unsigned int i;

        cpumask_check(cpu);
        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}
EXPORT_SYMBOL(cpumask_any_but);

/**
 * cpumask_next_wrap - iteration step for for_each_cpu_wrap
 * @n: the cpu prior to the place to search
 * @mask: the cpumask pointer
 * @start: the cpu the iteration started from
 * @wrap: treat @n crossing @start as the end of the iteration
 *
 * Returns >= nr_cpu_ids once the iteration is complete.
 *
 * Note: @wrap is needed for the very first step, because @start is not
 * guaranteed to be set in @mask.
 */
int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
{
        int next;

        for (;;) {
                next = cpumask_next(n, mask);

                /* Stepped over @start after wrapping: every cpu was visited. */
                if (wrap && n < start && next >= start)
                        return nr_cpumask_bits;

                if (next < nr_cpumask_bits)
                        return next;

                /* Fell off the end of the mask: restart from the beginning. */
                wrap = true;
                n = -1;
        }
}
EXPORT_SYMBOL(cpumask_next_wrap);

/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
 * alloc_cpumask_var_node - allocate a struct cpumask on a given node
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 * @node: node passed to kmalloc_node() for the allocation
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 * Returns TRUE if memory allocation succeeded, FALSE otherwise.
 *
 * On failure *@mask is NULL.  Note that gcc is usually smart enough to
 * know that mask can never be NULL if CONFIG_CPUMASK_OFFSTACK=n, so it
 * eliminates the failure handling in that case too.
 */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        bool allocated;

        *mask = kmalloc_node(cpumask_size(), flags, node);
        allocated = *mask != NULL;

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (!allocated) {
                printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
                dump_stack();
        }
#endif

        return allocated;
}
EXPORT_SYMBOL(alloc_cpumask_var_node);

/**
 * zalloc_cpumask_var_node - alloc_cpumask_var_node() with __GFP_ZERO added
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags; __GFP_ZERO is OR'ed in before the call
 * @node: passed through to alloc_cpumask_var_node()
 *
 * Returns what alloc_cpumask_var_node() returns.
 */
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
}
EXPORT_SYMBOL(zalloc_cpumask_var_node);

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 *
 * Equivalent to alloc_cpumask_var_node() with NUMA_NO_NODE as the node;
 * see that function for the return value and failure behaviour.
 */
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(alloc_cpumask_var);

/**
 * zalloc_cpumask_var - alloc_cpumask_var() with __GFP_ZERO added
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags; __GFP_ZERO is OR'ed in before the call
 *
 * Returns what alloc_cpumask_var() returns.
 */
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return alloc_cpumask_var(mask, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(zalloc_cpumask_var);

/**
 * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop (in <linux/cpumask.h>).
 * The mask comes from memblock_alloc(); this function either returns with
 * an allocated (zero-filled) cpumask in *@mask or panics.
 */
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
        *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES);
        if (*mask)
                return;

        panic("%s: Failed to allocate %u bytes\n", __func__,
              cpumask_size());
}

/**
 * free_cpumask_var - frees memory allocated for a struct cpumask.
 * @mask: cpumask to free
 *
 * The mask is handed straight to kfree(), so this is safe on a NULL mask.
 */
void free_cpumask_var(cpumask_var_t mask)
{
        kfree(mask);
}
EXPORT_SYMBOL(free_cpumask_var);

/**
 * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var
 * @mask: cpumask to free
 *
 * Passes the physical address of @mask (__pa()) and cpumask_size() to
 * memblock_free_early().
 */
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
        memblock_free_early(__pa(mask), cpumask_size());
}
#endif

/**
 * cpumask_local_spread - select the i'th cpu with local numa cpu's first
 * @i: index number
 * @node: local numa_node, or NUMA_NO_NODE for no locality preference
 *
 * Picks an online CPU using a numa aware ordering: online cpus of @node
 * come first, then the remaining online cpus.  @i is taken modulo the
 * number of online cpus, so the selection wraps around.
 *
 * It's not very efficient, but useful for setup.
 */
unsigned int cpumask_local_spread(unsigned int i, int node)
{
        /* Wrap: we always want a cpu. */
        unsigned int nth = i % num_online_cpus();
        int cpu;

        if (node != NUMA_NO_NODE) {
                /* First pass: online cpus that belong to @node. */
                for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) {
                        if (nth-- == 0)
                                return cpu;
                }

                /* Second pass: online cpus outside @node (the rest were counted). */
                for_each_cpu(cpu, cpu_online_mask) {
                        if (!cpumask_test_cpu(cpu, cpumask_of_node(node)) &&
                            nth-- == 0)
                                return cpu;
                }
        } else {
                for_each_cpu(cpu, cpu_online_mask) {
                        if (nth-- == 0)
                                return cpu;
                }
        }

        BUG();
}
EXPORT_SYMBOL(cpumask_local_spread);

/*
 * Per-cpu record of the cpu most recently returned by
 * cpumask_any_and_distribute() when called on this cpu.
 */
static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);

/**
 * cpumask_any_and_distribute - pick a cpu from the intersection of two masks
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Returns an arbitrary cpu within *@src1p & *@src2p.
 *
 * Iterated calls using the same @src1p and @src2p will be distributed
 * within their intersection: each call resumes after the cpu chosen by the
 * previous call on this cpu.
 *
 * Returns >= nr_cpu_ids if the intersection is empty.
 */
int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        /* NOTE: our first selection will skip 0. */
        int prev = __this_cpu_read(distribute_cpu_mask_prev);
        int next = cpumask_next_and(prev, src1p, src2p);

        if (next >= nr_cpu_ids) {
                /* Nothing after prev: wrap to the first cpu in the intersection. */
                next = cpumask_first_and(src1p, src2p);
                if (next >= nr_cpu_ids)
                        return next;        /* empty: keep the remembered cpu */
        }

        __this_cpu_write(distribute_cpu_mask_prev, next);

        return next;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);










































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * User-mode machine state access
 *
 * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
 *
 * Red Hat Author: Roland McGrath.
 */

#ifndef _LINUX_REGSET_H
#define _LINUX_REGSET_H        1

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/uaccess.h>
struct task_struct;
struct user_regset;

/*
 * Output cursor over a bounded buffer: @p is the next byte to write and
 * @left is how many bytes may still be written.
 */
struct membuf {
        void *p;
        size_t left;
};

/*
 * Write up to @size zero bytes at the cursor, clamped to the space left,
 * and advance the cursor.  Returns the space remaining afterwards
 * (0 means the buffer is full).
 */
static inline int membuf_zero(struct membuf *s, size_t size)
{
        size_t chunk;

        if (!s->left)
                return 0;

        chunk = size > s->left ? s->left : size;
        memset(s->p, 0, chunk);
        s->p += chunk;
        s->left -= chunk;

        return s->left;
}

/*
 * Copy up to @size bytes from @v to the cursor, clamped to the space left,
 * and advance the cursor.  Returns the space remaining afterwards
 * (0 means the buffer is full).
 */
static inline int membuf_write(struct membuf *s, const void *v, size_t size)
{
        size_t chunk;

        if (!s->left)
                return 0;

        chunk = size > s->left ? s->left : size;
        memcpy(s->p, v, chunk);
        s->p += chunk;
        s->left -= chunk;

        return s->left;
}

/*
 * membuf_store(s, v) - store the scalar @v at the cursor of membuf @s.
 *
 * current s->p must be aligned for v; v must be a scalar
 *
 * @s and @v are each evaluated at most once (@v not at all when the buffer
 * is already full).  If fewer than sizeof(v) bytes are left, only the first
 * s->left bytes of the value are copied with memcpy(); otherwise the value
 * is stored directly through a cast of s->p.  The cursor advances by the
 * number of bytes accounted for, and the expression evaluates to the space
 * remaining afterwards.
 *
 * NOTE(review): the direct store uses typeof(__v + 0), which for types
 * narrower than int is the promoted type, so the store would be wider than
 * the amount the cursor advances -- confirm callers only pass int-sized or
 * wider scalars.
 */
#define membuf_store(s, v)                                \
({                                                        \
        struct membuf *__s = (s);                        \
        if (__s->left) {                                \
                typeof(v) __v = (v);                        \
                size_t __size = sizeof(__v);                \
                if (unlikely(__size > __s->left)) {        \
                        __size = __s->left;                \
                        memcpy(__s->p, &__v, __size);        \
                } else {                                \
                        *(typeof(__v + 0) *)__s->p = __v;        \
                }                                        \
                __s->p += __size;                        \
                __s->left -= __size;                        \
        }                                                \
        __s->left;})

/**
 * user_regset_active_fn - type of @active function in &struct user_regset
 * @target:        thread being examined
 * @regset:        regset being examined
 *
 * Return -%ENODEV if not available on the hardware found.
 * Return %0 if no interesting state in this thread.
 * Return >%0 number of @size units of interesting state.
 * Any get call fetching state beyond that number will
 * see the default initialization state for this data,
 * so a caller that knows what the default state is need
 * not copy it all out.
 * This call is optional; the pointer is %NULL if there
 * is no inexpensive check to yield a value < @n.
 */
typedef int user_regset_active_fn(struct task_struct *target,
                                  const struct user_regset *regset);
/* This names a function type, not a pointer; struct user_regset holds a pointer to it. */

/*
 * user_regset_get2_fn - type of the regset_get member of struct user_regset
 * (the "fetch values" callback).  The destination is a struct membuf passed
 * by value, so the callee advances its own copy of the cursor.
 * NOTE(review): the meaning of the int return value is not documented here;
 * confirm against the callers.
 */
typedef int user_regset_get2_fn(struct task_struct *target,
                               const struct user_regset *regset,
                               struct membuf to);

/**
 * user_regset_set_fn - type of @set function in &struct user_regset
 * @target:        thread being examined
 * @regset:        regset being examined
 * @pos:        offset into the regset data to access, in bytes
 * @count:        amount of data to copy, in bytes
 * @kbuf:        if not %NULL, a kernel-space pointer to copy from
 * @ubuf:        if @kbuf is %NULL, a user-space pointer to copy from
 *
 * Store register values.  Return %0 on success; -%EIO or -%ENODEV
 * are usual failure returns.  The @pos and @count values are in
 * bytes, but must be properly aligned.  If @kbuf is non-null, that
 * buffer is used and @ubuf is ignored.  If @kbuf is %NULL, then
 * ubuf gives a userland pointer to access directly, and an -%EFAULT
 * return value is possible.
 */
typedef int user_regset_set_fn(struct task_struct *target,
                               const struct user_regset *regset,
                               unsigned int pos, unsigned int count,
                               const void *kbuf, const void __user *ubuf);
/* This names a function type, not a pointer; struct user_regset holds a pointer to it. */

/**
 * user_regset_writeback_fn - type of @writeback function in &struct user_regset
 * @target:        thread being examined
 * @regset:        regset being examined
 * @immediate:        zero if writeback at completion of next context switch is OK
 *
 * This call is optional; usually the pointer is %NULL.  When
 * provided, there is some user memory associated with this regset's
 * hardware, such as memory backing cached register data on register
 * window machines; the regset's data controls what user memory is
 * used (e.g. via the stack pointer value).
 *
 * Write register data back to user memory.  If the @immediate flag
 * is nonzero, it must be written to the user memory so uaccess or
 * access_process_vm() can see it when this call returns; if zero,
 * then it must be written back by the time the task completes a
 * context switch (as synchronized with wait_task_inactive()).
 * Return %0 on success or if there was nothing to do, -%EFAULT for
 * a memory problem (bad stack pointer or whatever), or -%EIO for a
 * hardware problem.
 */
typedef int user_regset_writeback_fn(struct task_struct *target,
                                     const struct user_regset *regset,
                                     int immediate);
/* This names a function type, not a pointer; struct user_regset holds a pointer to it. */

/**
 * struct user_regset - accessible thread CPU state
 * @n:                        Number of slots (registers).
 * @size:                Size in bytes of a slot (register).
 * @align:                Required alignment, in bytes.
 * @bias:                Bias from natural indexing.
 * @core_note_type:        ELF note @n_type value used in core dumps.
 * @regset_get:                Function to fetch values.
 * @set:                Function to store values.
 * @active:                Function to report if regset is active, or %NULL.
 * @writeback:                Function to write data back to user memory, or %NULL.
 *
 * This data structure describes a machine resource we call a register set.
 * This is part of the state of an individual thread, not necessarily
 * actual CPU registers per se.  A register set consists of a number of
 * similar slots, given by @n.  Each slot is @size bytes, and aligned to
 * @align bytes (which is at least @size).  For dynamically-sized
 * regsets, @n must contain the maximum possible number of slots for the
 * regset.
 *
 * For backward compatibility, the @regset_get and @set methods must pad to,
 * or accept, @n * @size bytes, even if the current regset size is smaller.
 * The precise semantics of these operations depend on the regset being
 * accessed.
 *
 * The functions to which &struct user_regset members point must be
 * called only on the current thread or on a thread that is in
 * %TASK_STOPPED or %TASK_TRACED state, that we are guaranteed will not
 * be woken up and return to user mode, and that we have called
 * wait_task_inactive() on.  (The target thread always might wake up for
 * SIGKILL while these functions are working, in which case that
 * thread's user_regset state might be scrambled.)
 *
 * The @pos argument must be aligned according to @align; the @count
 * argument must be a multiple of @size.  These functions are not
 * responsible for checking for invalid arguments.
 *
 * When there is a natural value to use as an index, @bias gives the
 * difference between the natural index and the slot index for the
 * register set.  For example, x86 GDT segment descriptors form a regset;
 * the segment selector produces a natural index, but only a subset of
 * that index space is available as a regset (the TLS slots); subtracting
 * @bias from a segment selector index value computes the regset slot.
 *
 * If nonzero, @core_note_type gives the n_type field (NT_* value)
 * of the core file note in which this regset's data appears.
 * NT_PRSTATUS is a special case in that the regset data starts at
 * offsetof(struct elf_prstatus, pr_reg) into the note data; that is
 * part of the per-machine ELF formats userland knows about.  In
 * other cases, the core file note contains exactly the whole regset
 * (@n * @size) and nothing else.  The core file note is normally
 * omitted when there is an @active function and it returns zero.
 */
struct user_regset {
        /* callbacks: pointers to the user_regset_*_fn function types */
        user_regset_get2_fn                *regset_get;
        user_regset_set_fn                *set;
        user_regset_active_fn                *active;
        user_regset_writeback_fn        *writeback;
        /* geometry: slot count, slot size and alignment in bytes, index bias */
        unsigned int                        n;
        unsigned int                         size;
        unsigned int                         align;
        unsigned int                         bias;
        /* ELF note n_type for core dumps */
        unsigned int                         core_note_type;
};

/**
 * struct user_regset_view - available regsets
 * @name:        Identifier, e.g. UTS_MACHINE string.
 * @regsets:        Array of @n regsets available in this view.
 * @n:                Number of elements in @regsets.
 * @e_machine:        ELF header @e_machine %EM_* value written in core dumps.
 * @e_flags:        ELF header @e_flags value written in core dumps.
 * @ei_osabi:        ELF header @e_ident[%EI_OSABI] value written in core dumps.
 *
 * A regset view is a collection of regsets (&struct user_regset,
 * above).  This describes all the state of a thread that can be seen
 * from a given architecture/ABI environment.  More than one view might
 * refer to the same &struct user_regset, or more than one regset
 * might refer to the same machine-specific state in the thread.  For
 * example, a 32-bit thread's state could be examined from the 32-bit
 * view or from the 64-bit view.  Either method reaches the same thread
 * register state, doing appropriate widening or truncation.
 */
struct user_regset_view {
        const char *name;
        /* array of @n entries */
        const struct user_regset *regsets;
        unsigned int n;
        /* ELF header values, declared widest first (u32, u16, u8) */
        u32 e_flags;
        u16 e_machine;
        u8 ei_osabi;
};

/*
 * This is documented here rather than at the definition sites because its
 * implementation is machine-dependent but its interface is universal.
 */
/**
 * task_user_regset_view - Return the process's native regset view.
 * @tsk: a thread of the process in question
 *
 * Return the &struct user_regset_view that is native for the given process,
 * i.e. the view the process itself would access when it called ptrace().
 * Throughout the life of the process, this only changes at exec.
 */
const struct user_regset_view *task_user_regset_view(struct task_struct *tsk);

/*
 * Copy the part of an incoming regset write that falls inside the window
 * [@start_pos, @end_pos) into @data; @end_pos < 0 means the window has no
 * upper bound.
 *
 * The source is *@kbuf when it is non-NULL, otherwise the user pointer
 * *@ubuf.  On success *@pos, *@count and the source pointer in use are all
 * advanced by the number of bytes consumed.  *@pos below @start_pos is a
 * BUG.
 *
 * Returns 0 on success, including when there is nothing to copy, and
 * -EFAULT if the user-space copy fails.
 */
static inline int user_regset_copyin(unsigned int *pos, unsigned int *count,
                                     const void **kbuf,
                                     const void __user **ubuf, void *data,
                                     const int start_pos, const int end_pos)
{
        unsigned int copy;

        if (*count == 0)
                return 0;
        BUG_ON(*pos < start_pos);

        /* Bounded window that *pos has already reached: nothing for us. */
        if (end_pos >= 0 && *pos >= end_pos)
                return 0;

        if (end_pos < 0)
                copy = *count;
        else
                copy = min(*count, end_pos - *pos);

        /* @data corresponds to @start_pos; move to where *pos lands in it. */
        data += *pos - start_pos;
        if (*kbuf) {
                memcpy(data, *kbuf, copy);
                *kbuf += copy;
        } else {
                if (__copy_from_user(data, *ubuf, copy))
                        return -EFAULT;
                *ubuf += copy;
        }

        *pos += copy;
        *count -= copy;
        return 0;
}

/*
 * Skip over the part of an incoming write that falls in [start_pos, end_pos)
 * without storing it anywhere.  A negative @end_pos means the window has no
 * upper bound.
 *
 * Whichever of *@kbuf / *@ubuf is in use (kernel buffer when *@kbuf is
 * non-NULL) is advanced together with *@pos, and *@count is reduced by the
 * number of bytes skipped.  Always returns 0; BUGs if *@pos is below
 * @start_pos.
 */
static inline int user_regset_copyin_ignore(unsigned int *pos,
                                            unsigned int *count,
                                            const void **kbuf,
                                            const void __user **ubuf,
                                            const int start_pos,
                                            const int end_pos)
{
        unsigned int skip;

        if (!*count)
                return 0;

        BUG_ON(*pos < start_pos);

        /* Already past a bounded window: nothing to discard */
        if (end_pos >= 0 && *pos >= end_pos)
                return 0;

        if (end_pos < 0)
                skip = *count;
        else
                skip = min(*count, end_pos - *pos);

        if (*kbuf)
                *kbuf += skip;
        else
                *ubuf += skip;

        *pos += skip;
        *count -= skip;
        return 0;
}

/*
 * Out-of-line regset helpers; only their prototypes are visible here.
 * NOTE(review): copy_regset_to_user() takes the same parameter list as
 * copy_regset_from_user() with a writable user pointer, so it is presumably
 * the read-direction counterpart — confirm against the implementation.
 */
extern int regset_get(struct task_struct *target,
                      const struct user_regset *regset,
                      unsigned int size, void *data);

extern int regset_get_alloc(struct task_struct *target,
                            const struct user_regset *regset,
                            unsigned int size,
                            void **data);

extern int copy_regset_to_user(struct task_struct *target,
                               const struct user_regset_view *view,
                               unsigned int setno, unsigned int offset,
                               unsigned int size, void __user *data);

/**
 * copy_regset_from_user - write user-supplied data into a thread's regset
 * @target:        thread whose regset is written
 * @view:        &struct user_regset_view describing user thread machine state
 * @setno:        index in @view->regsets
 * @offset:        byte offset into the regset data
 * @size:        number of bytes to copy
 * @data:        user-mode pointer to read from
 *
 * Return: -EOPNOTSUPP if the selected regset has no ->set() method, -EFAULT
 * if access_ok() rejects the user range, otherwise the result of ->set().
 * @setno is not range-checked here.
 */
static inline int copy_regset_from_user(struct task_struct *target,
                                        const struct user_regset_view *view,
                                        unsigned int setno,
                                        unsigned int offset, unsigned int size,
                                        const void __user *data)
{
        const struct user_regset *regset = view->regsets + setno;

        if (regset->set == NULL)
                return -EOPNOTSUPP;

        /* Validate the user range once, before handing it to ->set() */
        if (access_ok(data, size))
                return regset->set(target, regset, offset, size, NULL, data);

        return -EFAULT;
}

#endif        /* <linux/regset.h> */


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (C) 2014 Intel Corporation

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

#include <asm/unaligned.h>

/*
 * Take / release the req_lock mutex of @hdev.  The argument is parenthesised
 * so that any pointer-valued expression (not just a plain identifier) can be
 * passed.
 */
#define hci_req_sync_lock(hdev)   mutex_lock(&(hdev)->req_lock)
#define hci_req_sync_unlock(hdev) mutex_unlock(&(hdev)->req_lock)

/* State kept while an HCI request is being built. */
struct hci_request {
        /* Device the request refers to. */
        struct hci_dev                *hdev;
        /* NOTE(review): presumably the queue of command skbs added to this request — confirm. */
        struct sk_buff_head        cmd_q;

        /* If something goes wrong when building the HCI request, the error
         * value is stored in this field.
         */
        int                        err;
};

void hci_req_init(struct hci_request *req, struct hci_dev *hdev);
void hci_req_purge(struct hci_request *req);
bool hci_req_status_pend(struct hci_dev *hdev);
int hci_req_run(struct hci_request *req, hci_req_complete_t complete);
int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete);
void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
                 const void *param);
void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
                    const void *param, u8 event);
void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
                          hci_req_complete_t *req_complete,
                          hci_req_complete_skb_t *req_complete_skb);

int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req,
                                                  unsigned long opt),
                 unsigned long opt, u32 timeout, u8 *hci_status);
int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
                                                     unsigned long opt),
                   unsigned long opt, u32 timeout, u8 *hci_status);
void hci_req_sync_cancel(struct hci_dev *hdev, int err);

struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
                                const void *param);

int __hci_req_hci_power_on(struct hci_dev *hdev);

void __hci_req_write_fast_connectable(struct hci_request *req, bool enable);
void __hci_req_update_name(struct hci_request *req);
void __hci_req_update_eir(struct hci_request *req);

void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn);
void hci_req_add_le_passive_scan(struct hci_request *req);

void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);

void hci_req_disable_address_resolution(struct hci_dev *hdev);
void hci_req_reenable_advertising(struct hci_dev *hdev);
void __hci_req_enable_advertising(struct hci_request *req);
void __hci_req_disable_advertising(struct hci_request *req);
void __hci_req_update_adv_data(struct hci_request *req, u8 instance);
int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);

int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
                                    bool force);
void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
                                struct hci_request *req, u8 instance,
                                bool force);

int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance);
int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance);
int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance);
void __hci_req_clear_ext_adv_sets(struct hci_request *req);
int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
                           bool use_rpa, struct adv_info *adv_instance,
                           u8 *own_addr_type, bdaddr_t *rand_addr);

void __hci_req_update_class(struct hci_request *req);

/* Returns true if HCI commands were queued */
bool hci_req_stop_discovery(struct hci_request *req);

/*
 * Queue @hdev's scan_update work item on its req_workqueue.  The return
 * value of queue_work() is ignored.
 */
static inline void hci_req_update_scan(struct hci_dev *hdev)
{
        queue_work(hdev->req_workqueue, &hdev->scan_update);
}

void __hci_req_update_scan(struct hci_request *req);

int hci_update_random_address(struct hci_request *req, bool require_privacy,
                              bool use_rpa, u8 *own_addr_type);

int hci_abort_conn(struct hci_conn *conn, u8 reason);
void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
                      u8 reason);

/*
 * Queue @hdev's bg_scan_update work item on its req_workqueue.  The return
 * value of queue_work() is ignored.
 */
static inline void hci_update_background_scan(struct hci_dev *hdev)
{
        queue_work(hdev->req_workqueue, &hdev->bg_scan_update);
}

void hci_request_setup(struct hci_dev *hdev);
void hci_request_cancel_all(struct hci_dev *hdev);

u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len);

/**
 * eir_append_data - append one length/type/value field to an EIR buffer
 * @eir:        buffer being filled
 * @eir_len:        offset in @eir at which the new field starts
 * @type:        field type byte
 * @data:        payload bytes (read only)
 * @data_len:        number of payload bytes
 *
 * Writes a length byte (covering the type byte plus the payload), the type
 * byte, and then the payload.  No bounds checking is done: the caller must
 * make sure @eir has room for @data_len + 2 more bytes and that
 * @data_len + 1 fits in the single length byte.
 *
 * Return: the offset just past the appended field.
 */
static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
                                  const u8 *data, u8 data_len)
{
        eir[eir_len++] = sizeof(type) + data_len;
        eir[eir_len++] = type;
        memcpy(&eir[eir_len], data, data_len);
        eir_len += data_len;

        return eir_len;
}

/**
 * eir_append_le16 - append a field carrying one little-endian 16-bit value
 * @eir:        buffer being filled
 * @eir_len:        offset in @eir at which the new field starts
 * @type:        field type byte
 * @data:        value to store, written little-endian
 *
 * Writes a length byte (type byte plus two value bytes), the type byte and
 * the value.  No bounds checking is done on @eir.
 *
 * Return: the offset just past the appended field.
 */
static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
{
        eir[eir_len] = sizeof(type) + sizeof(data);
        eir_len++;

        eir[eir_len] = type;
        eir_len++;

        put_unaligned_le16(data, eir + eir_len);

        return eir_len + sizeof(data);
}




























































































































    9 




    9 
















    3 

    3 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>

/*
 * An errseq_t is a way of recording errors in one place, and allowing any
 * number of "subscribers" to tell whether it has changed since a previous
 * point where it was sampled.
 *
 * It's implemented as an unsigned 32-bit value. The low order bits are
 * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
 * are used as a counter. This is done with atomics instead of locking so that
 * these functions can be called from any context.
 *
 * The general idea is for consumers to sample an errseq_t value. That value
 * can later be used to tell whether any new errors have occurred since that
 * sampling was done.
 *
 * Note that there is a risk of collisions if new errors are being recorded
 * frequently, since we have so few bits to use as a counter.
 *
 * To mitigate this, one bit is used as a flag to tell whether the value has
 * been sampled since a new value was recorded. That allows us to avoid bumping
 * the counter if no one has sampled it since the last time an error was
 * recorded.
 *
 * A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
 * is the special (but common) case where there has never been an error. An all
 * zero value thus serves as the "epoch" if one wishes to know whether there
 * has ever been an error set since it was first initialized.
 */

/*
 * Number of low-order bits reserved for the error code (values up to
 * MAX_ERRNO).  This is also the bit position of the SEEN flag below.
 */
#define ERRSEQ_SHIFT                ilog2(MAX_ERRNO + 1)

/*
 * Flag bit sitting directly above the error bits; set once the value has
 * been seen (sampled) by someone.
 */
#define ERRSEQ_SEEN                (1 << ERRSEQ_SHIFT)

/*
 * The lowest bit of the counter, one position above the SEEN flag; adding
 * this value advances the counter by one.
 */
#define ERRSEQ_CTR_INC                (1 << (ERRSEQ_SHIFT + 1))

/**
 * errseq_set - record an error in an errseq_t for later reporting
 * @eseq: errseq_t field that should be updated
 * @err: error to record (must be between -1 and -MAX_ERRNO)
 *
 * Stores @err in the error bits of @eseq, replacing any error already there.
 * The sequence counter is advanced only when the current value carries the
 * SEEN flag, i.e. it has been sampled since the last error was recorded.
 *
 * Return: the value observed in @eseq by this call (the old value when the
 * swap succeeds).  It is meant for debugging only and must not be used as a
 * previously sampled value in later calls.
 */
errseq_t errseq_set(errseq_t *eseq, int err)
{
        errseq_t prev;

        /* The error mask only works if MAX_ERRNO + 1 is a power of two */
        BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);

        prev = READ_ONCE(*eseq);

        /*
         * Refuse, with a warning, anything that does not fit in the error
         * bits.  Zero is refused as well, because storing it would wipe out
         * an error recorded earlier.
         */
        if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
                                "err = %d\n", err))
                return prev;

        while (1) {
                errseq_t found, next;

                /* Drop the old error and the SEEN flag, then install @err */
                next = (prev & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;

                /* Somebody sampled the old value, so bump the counter */
                if (prev & ERRSEQ_SEEN)
                        next += ERRSEQ_CTR_INC;

                /* Nothing would change: no store needed */
                if (next == prev)
                        return next;

                found = cmpxchg(eseq, prev, next);

                /*
                 * Finished if our swap went through, or if a racing writer
                 * already stored exactly the value we wanted.
                 */
                if (likely(found == prev || found == next))
                        return found;

                /* Lost the race to a different update: retry from its value */
                prev = found;
        }
}
EXPORT_SYMBOL(errseq_set);

/**
 * errseq_sample() - Grab current errseq_t value.
 * @eseq: Pointer to errseq_t to be sampled.
 *
 * Gives callers a starting point for their own errseq_t cursor.  When the
 * current value has already been seen, it is returned unchanged, so the
 * caller will not be told about that old error.  When it has not been seen
 * yet, 0 is returned instead, so the pending error shows up the next time
 * the caller checks.
 *
 * Context: Any context.
 * Return: The current errseq value, or 0 if it has not been seen yet.
 */
errseq_t errseq_sample(errseq_t *eseq)
{
        errseq_t cur = READ_ONCE(*eseq);

        /* An unseen error is left for this caller to report first */
        return (cur & ERRSEQ_SEEN) ? cur : 0;
}
EXPORT_SYMBOL(errseq_sample);

/**
 * errseq_check() - Has an error occurred since a particular sample point?
 * @eseq: Pointer to errseq_t value to be checked.
 * @since: Previously-sampled errseq_t from which to check.
 *
 * Reads the value @eseq points to and compares it with @since.  @since is
 * passed by value and is not advanced, and the value is not marked as seen.
 *
 * Return: The latest error set in the errseq_t or 0 if it hasn't changed.
 */
int errseq_check(errseq_t *eseq, errseq_t since)
{
        errseq_t cur = READ_ONCE(*eseq);

        if (unlikely(cur != since))
                return -(cur & MAX_ERRNO);

        return 0;
}
EXPORT_SYMBOL(errseq_check);

/**
 * errseq_check_and_advance() - Check an errseq_t and advance to current value.
 * @eseq: Pointer to value being checked and reported.
 * @since: Pointer to previously-sampled errseq_t to check against and advance.
 *
 * Reads @eseq and compares it with the value @since points to.  If they are
 * equal, nothing has happened and 0 is returned.
 *
 * Otherwise the value has changed: the "seen" flag is added, an attempt is
 * made to store the flagged value back into @eseq, the flagged value becomes
 * the new *@since, and its error portion is returned.
 *
 * No locking is done here for concurrent updates of *@since; the caller must
 * provide that if necessary.  Callers may therefore want to do a lockless
 * errseq_check() first and only take their lock and call this when that
 * reports a change.
 *
 * Return: Negative errno if one has been stored, or 0 if no new error has
 * occurred.
 */
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
        errseq_t cur, marked;

        /*
         * The common no-error case is normally filtered out by the caller
         * before the lock protecting *@since is taken.
         */
        cur = READ_ONCE(*eseq);
        if (cur == *since)
                return 0;

        /*
         * Flag the value as seen and try to publish that.  The result of the
         * swap is deliberately ignored: if it fails, either a writer changed
         * the value (counter bump or new error) or another reader set the
         * flag first.  In both cases advancing *@since and reporting the
         * error from what was read here is fine.
         */
        marked = cur | ERRSEQ_SEEN;
        if (marked != cur)
                cmpxchg(eseq, cur, marked);

        *since = marked;
        return -(marked & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check_and_advance);


























































































































































    7 




    9 





    5 

    5 



















    6 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  NSA Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux security data structures for kernel objects.
 *
 *  Author(s):  Stephen Smalley, <sds@tycho.nsa.gov>
 *                Chris Vance, <cvance@nai.com>
 *                Wayne Salamon, <wsalamon@nai.com>
 *                James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *  Copyright (C) 2016 Mellanox Technologies
 */
#ifndef _SELINUX_OBJSEC_H_
#define _SELINUX_OBJSEC_H_

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/binfmts.h>
#include <linux/in.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
#include "flask.h"
#include "avc.h"

/* SIDs kept in a credential's security blob; see selinux_cred(). */
struct task_security_struct {
        u32 osid;                /* SID prior to last execve */
        u32 sid;                /* current SID */
        u32 exec_sid;                /* exec SID */
        u32 create_sid;                /* fscreate SID */
        u32 keycreate_sid;        /* keycreate SID */
        u32 sockcreate_sid;        /* sockcreate SID */
} __randomize_layout;

/*
 * Label initialization states.
 * NOTE(review): presumably the values stored in
 * inode_security_struct.initialized — confirm.
 */
enum label_initialized {
        LABEL_INVALID,                /* invalid or not initialized */
        LABEL_INITIALIZED,        /* initialized */
        LABEL_PENDING                /* NOTE(review): undocumented; presumably initialization in progress — confirm */
};

/* SELinux data kept in an inode's security blob; see selinux_inode(). */
struct inode_security_struct {
        struct inode *inode;        /* back pointer to inode object */
        struct list_head list;        /* list of inode_security_struct */
        u32 task_sid;                /* SID of creating task */
        u32 sid;                /* SID of this object */
        u16 sclass;                /* security class of this object */
        unsigned char initialized;        /* initialization flag */
        spinlock_t lock;        /* NOTE(review): presumably protects the fields above — confirm */
};

/* SELinux data kept in a file's security blob; see selinux_file(). */
struct file_security_struct {
        u32 sid;                /* SID of open file description */
        u32 fown_sid;                /* SID of file owner (for SIGIO) */
        u32 isid;                /* SID of inode at the time of file open */
        u32 pseqno;                /* Policy seqno at the time of file open */
};

/* SELinux data for a super_block. */
struct superblock_security_struct {
        struct super_block *sb;                /* back pointer to sb object */
        u32 sid;                        /* SID of file system superblock */
        u32 def_sid;                        /* default SID for labeling */
        u32 mntpoint_sid;                /* SECURITY_FS_USE_MNTPOINT context for files */
        unsigned short behavior;        /* labeling behavior */
        unsigned short flags;                /* which mount options were specified */
        struct mutex lock;
        /*
         * NOTE(review): isec_head is presumably a list of
         * inode_security_struct entries (linked via their 'list' member)
         * guarded by isec_lock — confirm.
         */
        struct list_head isec_head;
        spinlock_t isec_lock;
};

/* SELinux data kept in a msg_msg's security blob; see selinux_msg_msg(). */
struct msg_security_struct {
        u32 sid;        /* SID of message */
};

/* SELinux data kept in a kern_ipc_perm's security blob; see selinux_ipc(). */
struct ipc_security_struct {
        u16 sclass;        /* security class of this object */
        u32 sid;        /* SID of IPC resource */
};

/* SID record for a network interface, identified by (ns, ifindex). */
struct netif_security_struct {
        struct net *ns;                        /* network namespace */
        int ifindex;                        /* device index */
        u32 sid;                        /* SID for this interface */
};

/* SID record for a network node address. */
struct netnode_security_struct {
        union {
                __be32 ipv4;                /* IPv4 node address */
                struct in6_addr ipv6;        /* IPv6 node address */
        } addr;
        u32 sid;                        /* SID for this node */
        /* NOTE(review): presumably selects which member of addr is valid — confirm. */
        u16 family;                        /* address family */
};

/* SID record for a network port, identified by (protocol, port). */
struct netport_security_struct {
        u32 sid;                        /* SID for this port */
        u16 port;                        /* port number */
        u8 protocol;                        /* transport protocol */
};

/*
 * SELinux data for a socket.  The NetLabel members exist only when
 * CONFIG_NETLABEL is enabled.
 */
struct sk_security_struct {
#ifdef CONFIG_NETLABEL
        enum {                                /* NetLabel state */
                NLBL_UNSET = 0,
                NLBL_REQUIRE,
                NLBL_LABELED,
                NLBL_REQSKB,
                NLBL_CONNLABELED,
        } nlbl_state;
        struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */
#endif
        u32 sid;                        /* SID of this object */
        u32 peer_sid;                        /* SID of peer */
        u16 sclass;                        /* sock security class */
        enum {                                /* SCTP association state */
                SCTP_ASSOC_UNSET = 0,
                SCTP_ASSOC_SET,
        } sctp_assoc_state;
};

/* SELinux data for a tun device. */
struct tun_security_struct {
        u32 sid;                        /* SID for the tun device sockets */
};

/* SELinux data for a key. */
struct key_security_struct {
        u32 sid;        /* SID of key */
};

/* SELinux data for an InfiniBand queue pair or MAD agent (per the field comment). */
struct ib_security_struct {
        u32 sid;        /* SID of the queue pair or MAD agent */
};

/* SID record for a PKey, identified by (subnet_prefix, pkey). */
struct pkey_security_struct {
        u64        subnet_prefix; /* Port subnet prefix */
        u16        pkey;        /* PKey number */
        u32        sid;        /* SID of pkey */
};

/* SELinux data for a bpf object. */
struct bpf_security_struct {
        u32 sid;  /* SID of bpf obj creator */
};

/* SELinux data for a perf_event object. */
struct perf_event_security_struct {
        u32 sid;  /* SID of perf_event obj creator */
};

/* Offsets of the SELinux portion inside each object's security blob. */
extern struct lsm_blob_sizes selinux_blob_sizes;

/*
 * Return the SELinux part of @cred's security blob: the blob base plus the
 * lbs_cred offset.  cred->security is not checked for NULL.
 */
static inline struct task_security_struct *selinux_cred(const struct cred *cred)
{
        return cred->security + selinux_blob_sizes.lbs_cred;
}

/*
 * Return the SELinux part of @file's security blob: the blob base plus the
 * lbs_file offset.  file->f_security is not checked for NULL.
 */
static inline struct file_security_struct *selinux_file(const struct file *file)
{
        return file->f_security + selinux_blob_sizes.lbs_file;
}

/*
 * Return the SELinux part of @inode's security blob (blob base plus the
 * lbs_inode offset), or NULL when the inode has no security blob.
 */
static inline struct inode_security_struct *selinux_inode(
                                                const struct inode *inode)
{
        /* Unlike the other accessors here, a missing blob is tolerated */
        if (unlikely(!inode->i_security))
                return NULL;
        return inode->i_security + selinux_blob_sizes.lbs_inode;
}

/*
 * Return the SELinux part of @msg_msg's security blob: the blob base plus
 * the lbs_msg_msg offset.  msg_msg->security is not checked for NULL.
 */
static inline struct msg_security_struct *selinux_msg_msg(
                                                const struct msg_msg *msg_msg)
{
        return msg_msg->security + selinux_blob_sizes.lbs_msg_msg;
}

/*
 * Return the SELinux part of @ipc's security blob: the blob base plus the
 * lbs_ipc offset.  ipc->security is not checked for NULL.
 */
static inline struct ipc_security_struct *selinux_ipc(
                                                const struct kern_ipc_perm *ipc)
{
        return ipc->security + selinux_blob_sizes.lbs_ipc;
}

/*
 * get the subjective security ID of the current task
 */
static inline u32 current_sid(void)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());

        return tsec->sid;
}

#endif /* _SELINUX_OBJSEC_H_ */











































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_INETDEVICE_H
#define _LINUX_INETDEVICE_H

#ifdef __KERNEL__

#include <linux/bitmap.h>
#include <linux/if.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sysctl.h>
#include <linux/rtnetlink.h>
#include <linux/refcount.h>

/* Table of IPv4 devconf values. */
struct ipv4_devconf {
        void        *sysctl;        /* NOTE(review): opaque here; presumably sysctl registration state — confirm */
        /* Values, stored at index IPV4_DEVCONF_<attr> - 1 (see ipv4_devconf_get()). */
        int        data[IPV4_DEVCONF_MAX];
        /* One bit per slot; ipv4_devconf_set() sets the bit when it writes the slot. */
        DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
};

/* NOTE(review): presumably log2 of the number of in_device.mc_hash buckets — confirm. */
#define MC_HASH_SZ_LOG 9

/* IPv4 state attached to a network device. */
struct in_device {
        /* Owning device; the IN_DEV_*CONF macros pass it to dev_net(). */
        struct net_device        *dev;
        refcount_t                refcnt;
        int                        dead;
        struct in_ifaddr        __rcu *ifa_list;/* IP ifaddr chain                */

        struct ip_mc_list __rcu        *mc_list;        /* IP multicast filter chain    */
        struct ip_mc_list __rcu        * __rcu *mc_hash;

        int                        mc_count;        /* Number of installed mcasts        */
        spinlock_t                mc_tomb_lock;
        struct ip_mc_list        *mc_tomb;
        unsigned long                mr_v1_seen;
        unsigned long                mr_v2_seen;
        unsigned long                mr_maxdelay;
        unsigned long                mr_qi;                /* Query Interval */
        unsigned long                mr_qri;                /* Query Response Interval */
        unsigned char                mr_qrv;                /* Query Robustness Variable */
        unsigned char                mr_gq_running;
        u32                        mr_ifc_count;
        struct timer_list        mr_gq_timer;        /* general query timer */
        struct timer_list        mr_ifc_timer;        /* interface change timer */

        struct neigh_parms        *arp_parms;
        /* Per-device settings read and written by ipv4_devconf_get()/ipv4_devconf_set(). */
        struct ipv4_devconf        cnf;
        struct rcu_head                rcu_head;
};

/*
 * IPV4_DEVCONF() yields the data[] slot of @cnf for IPV4_DEVCONF_<attr>;
 * slots are stored at the attribute's value minus one.
 * IPV4_DEVCONF_ALL() applies it to @net's ipv4.devconf_all table.
 */
#define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
#define IPV4_DEVCONF_ALL(net, attr) \
        IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)

/*
 * Read the per-device value for devconf attribute @index.  Attribute numbers
 * are one higher than their slot in cnf.data[].  @index is not range-checked.
 */
static inline int ipv4_devconf_get(struct in_device *in_dev, int index)
{
        return in_dev->cnf.data[index - 1];
}

/*
 * Store @val as the per-device value for devconf attribute @index and set
 * the matching bit in cnf.state.  Attribute numbers are one higher than
 * their slot; @index is not range-checked.
 */
static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
                                    int val)
{
        const int slot = index - 1;

        set_bit(slot, in_dev->cnf.state);
        in_dev->cnf.data[slot] = val;
}

/* Set every one of the IPV4_DEVCONF_MAX bits in @in_dev's cnf.state bitmap. */
static inline void ipv4_devconf_setall(struct in_device *in_dev)
{
        bitmap_fill(in_dev->cnf.state, IPV4_DEVCONF_MAX);
}

/*
 * Accessors for a single devconf attribute; @attr is the IPV4_DEVCONF_
 * suffix.  Every use of a macro argument is parenthesised so that any
 * pointer-valued expression may be passed as @in_dev.
 */
#define IN_DEV_CONF_GET(in_dev, attr) \
        ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr)
#define IN_DEV_CONF_SET(in_dev, attr, val) \
        ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))

/* Non-zero only if both the "all" value of the device's net and the device value are set */
#define IN_DEV_ANDCONF(in_dev, attr) \
        (IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), attr) && \
         IN_DEV_CONF_GET((in_dev), attr))

/* Non-zero if either the "all" value of @net or the device value is set */
#define IN_DEV_NET_ORCONF(in_dev, net, attr) \
        (IPV4_DEVCONF_ALL(net, attr) || \
         IN_DEV_CONF_GET((in_dev), attr))

/* As IN_DEV_NET_ORCONF, using the net that the device belongs to */
#define IN_DEV_ORCONF(in_dev, attr) \
        IN_DEV_NET_ORCONF(in_dev, dev_net((in_dev)->dev), attr)

/* The larger of the "all" value of the device's net and the device value */
#define IN_DEV_MAXCONF(in_dev, attr) \
        (max(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), attr), \
             IN_DEV_CONF_GET((in_dev), attr)))

/*
 * Per-attribute shorthands.  Each one fixes how the device value is combined
 * with the "all" value: IN_DEV_CONF_GET uses the device value alone,
 * IN_DEV_ANDCONF needs both set, IN_DEV_ORCONF accepts either, and
 * IN_DEV_MAXCONF takes the larger of the two.
 */
#define IN_DEV_FORWARD(in_dev)                IN_DEV_CONF_GET((in_dev), FORWARDING)
#define IN_DEV_MFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
#define IN_DEV_BFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
#define IN_DEV_RPFILTER(in_dev)                IN_DEV_MAXCONF((in_dev), RP_FILTER)
#define IN_DEV_SRC_VMARK(in_dev)            IN_DEV_ORCONF((in_dev), SRC_VMARK)
#define IN_DEV_SOURCE_ROUTE(in_dev)        IN_DEV_ANDCONF((in_dev), \
                                                       ACCEPT_SOURCE_ROUTE)
#define IN_DEV_ACCEPT_LOCAL(in_dev)        IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL)
#define IN_DEV_BOOTP_RELAY(in_dev)        IN_DEV_ANDCONF((in_dev), BOOTP_RELAY)

#define IN_DEV_LOG_MARTIANS(in_dev)        IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
#define IN_DEV_PROXY_ARP(in_dev)        IN_DEV_ORCONF((in_dev), PROXY_ARP)
#define IN_DEV_PROXY_ARP_PVLAN(in_dev)        IN_DEV_CONF_GET(in_dev, PROXY_ARP_PVLAN)
#define IN_DEV_SHARED_MEDIA(in_dev)        IN_DEV_ORCONF((in_dev), SHARED_MEDIA)
#define IN_DEV_TX_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), SEND_REDIRECTS)
#define IN_DEV_SEC_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), \
                                                      SECURE_REDIRECTS)
#define IN_DEV_IDTAG(in_dev)                IN_DEV_CONF_GET(in_dev, TAG)
#define IN_DEV_MEDIUM_ID(in_dev)        IN_DEV_CONF_GET(in_dev, MEDIUM_ID)
#define IN_DEV_PROMOTE_SECONDARIES(in_dev) \
                                        IN_DEV_ORCONF((in_dev), \
                                                      PROMOTE_SECONDARIES)
#define IN_DEV_ROUTE_LOCALNET(in_dev)        IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET)
#define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)        \
        IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET)

/*
 * ACCEPT_REDIRECTS is evaluated differently depending on forwarding:
 * when IN_DEV_FORWARD() is true the ANDCONF form is used, otherwise the
 * ORCONF form.  Note that @in_dev is evaluated more than once.
 */
#define IN_DEV_RX_REDIRECTS(in_dev) \
        ((IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \
         || (!IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))

/* Device-only value of IGNORE_ROUTES_WITH_LINKDOWN (used by ip_ignore_linkdown()). */
#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
        IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)

/* ARP attributes: the first two use the ORCONF form, the rest the MAXCONF form. */
#define IN_DEV_ARPFILTER(in_dev)        IN_DEV_ORCONF((in_dev), ARPFILTER)
#define IN_DEV_ARP_ACCEPT(in_dev)        IN_DEV_ORCONF((in_dev), ARP_ACCEPT)
#define IN_DEV_ARP_ANNOUNCE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
#define IN_DEV_ARP_IGNORE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
#define IN_DEV_ARP_NOTIFY(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)

/*
 * One IPv4 address entry attached to an in_device.
 *
 * Entries are chained through ifa_next (walked by
 * in_dev_for_each_ifa_rtnl()/in_dev_for_each_ifa_rcu()) and refer back to
 * their in_device through ifa_dev.  ifa_address and ifa_mask are the pair
 * tested by inet_ifa_match().  The address fields are __be32, i.e. kept in
 * network byte order.
 */
struct in_ifaddr {
        struct hlist_node        hash;
        struct in_ifaddr        __rcu *ifa_next;
        struct in_device        *ifa_dev;
        struct rcu_head                rcu_head;
        __be32                        ifa_local;
        __be32                        ifa_address;
        __be32                        ifa_mask;
        __u32                        ifa_rt_priority;
        __be32                        ifa_broadcast;
        unsigned char                ifa_scope;
        unsigned char                ifa_prefixlen;
        __u32                        ifa_flags;
        /* Fixed-size buffer of IFNAMSIZ bytes. */
        char                        ifa_label[IFNAMSIZ];

        /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
        __u32                        ifa_valid_lft;
        __u32                        ifa_preferred_lft;
        unsigned long                ifa_cstamp; /* created timestamp */
        unsigned long                ifa_tstamp; /* updated timestamp */
};

/*
 * Bundle of an address, its in_device and a netlink extended-ack pointer.
 * NOTE(review): presumably the payload passed to the notifiers registered
 * with register_inetaddr_validator_notifier() - confirm against the callers.
 */
struct in_validator_info {
        __be32                        ivi_addr;
        struct in_device        *ivi_dev;
        struct netlink_ext_ack        *extack;
};

/*
 * Register/unregister a notifier_block on the inetaddr chain and on the
 * separate inetaddr validator chain.  Each returns an int status.
 */
int register_inetaddr_notifier(struct notifier_block *nb);
int unregister_inetaddr_notifier(struct notifier_block *nb);
int register_inetaddr_validator_notifier(struct notifier_block *nb);
int unregister_inetaddr_validator_notifier(struct notifier_block *nb);

/* Send a netconf notification for @devconf; implemented outside this header. */
void inet_netconf_notify_devconf(struct net *net, int event, int type,
                                 int ifindex, struct ipv4_devconf *devconf);

struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref);
/*
 * Convenience form of __ip_dev_find() that always passes devref = true.
 * NOTE(review): devref presumably means the returned device carries a
 * reference the caller must drop - confirm in __ip_dev_find().
 */
static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
{
        return __ip_dev_find(net, addr, true);
}

int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *);
/* Without CONFIG_INET, inet_gifconf() is a stub that always returns 0. */
#ifdef CONFIG_INET
int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size);
#else
static inline int inet_gifconf(struct net_device *dev, char __user *buf,
                               int len, int size)
{
        return 0;
}
#endif
void devinet_init(void);
struct in_device *inetdev_by_index(struct net *, int);
/* Address selection and lookup helpers; all are implemented outside this header. */
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope);
__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
                         __be32 local, int scope);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
                                    __be32 mask);
struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
/* True when @addr and ifa->ifa_address agree on every bit set in ifa->ifa_mask. */
static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
{
        __be32 diff = addr ^ ifa->ifa_address;

        return (diff & ifa->ifa_mask) == 0;
}

/*
 *        Check whether a netmask is unacceptable for an address.
 *
 *        Returns true when @addr has bits set outside @mask, or when the
 *        host-order complement of @mask is not of the form 2^n - 1 (the
 *        mask bits are not one contiguous run from the top bit).
 */

static __inline__ bool bad_mask(__be32 mask, __be32 addr)
{
        __be32 host_part = ~mask;
        __u32 inv;

        if (addr & host_part)
                return true;

        inv = ntohl(host_part);
        /* inv & (inv + 1) is zero only for 0, 1, 3, 7, ..., 0xffffffff */
        return (inv & (inv + 1)) != 0;
}

/*
 * Walk the address list of @in_dev, starting at ->ifa_list and following
 * ->ifa_next until NULL.  The two variants differ only in the accessor
 * used to load each pointer: rtnl_dereference() vs. rcu_dereference().
 */
#define in_dev_for_each_ifa_rtnl(ifa, in_dev)                        \
        for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rtnl_dereference(ifa->ifa_next))

#define in_dev_for_each_ifa_rcu(ifa, in_dev)                        \
        for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rcu_dereference(ifa->ifa_next))

/*
 * Load dev->ip_ptr with rcu_dereference().  No reference is taken here;
 * in_dev_get() is the variant that also bumps the refcount.
 */
static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
{
        return rcu_dereference(dev->ip_ptr);
}

static inline struct in_device *in_dev_get(const struct net_device *dev)
{
        struct in_device *in_dev;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(dev);
        if (in_dev)
                refcount_inc(&in_dev->refcnt);
        rcu_read_unlock();
        return in_dev;
}

/* Load dev->ip_ptr with rtnl_dereference(); no reference is taken. */
static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
{
        return rtnl_dereference(dev->ip_ptr);
}

/*
 * called with rcu_read_lock or rtnl held
 *
 * True only when @dev has an in_device and that in_device has
 * IGNORE_ROUTES_WITH_LINKDOWN set.
 */
static inline bool ip_ignore_linkdown(const struct net_device *dev)
{
        struct in_device *idev = rcu_dereference_rtnl(dev->ip_ptr);

        return idev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(idev);
}

/* arp_parms of @dev's in_device (looked up via __in_dev_get_rcu()), or NULL if it has none. */
static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev)
{
        struct in_device *idev = __in_dev_get_rcu(dev);

        if (!idev)
                return NULL;
        return idev->arp_parms;
}

void in_dev_finish_destroy(struct in_device *idev);

/*
 * Drop one reference on @idev.  When refcount_dec_and_test() reports that
 * this was the last one, hand the object to in_dev_finish_destroy().
 */
static inline void in_dev_put(struct in_device *idev)
{
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in_dev_finish_destroy(idev);
}

/*
 * __in_dev_put() decrements the refcount without testing for zero, so,
 * unlike in_dev_put(), it never calls in_dev_finish_destroy().
 * in_dev_hold() increments the refcount.
 */
#define __in_dev_put(idev)  refcount_dec(&(idev)->refcnt)
#define in_dev_hold(idev)   refcount_inc(&(idev)->refcnt)

#endif /* __KERNEL__ */

/*
 * Build the network-byte-order netmask for prefix length @logmask.
 * A prefix length of 0 yields an all-zero mask.
 */
static __inline__ __be32 inet_make_mask(int logmask)
{
        __u32 host_bits;

        if (!logmask)
                return 0;

        /* low (32 - logmask) bits set: the host part; the mask is its complement */
        host_bits = (1U << (32 - logmask)) - 1;
        return htonl(~host_bits);
}

/*
 * Prefix length of a network-byte-order netmask.
 * An all-zero mask has length 0; otherwise the result is 32 minus the
 * value ffz() reports for the complemented host-order mask.
 */
static __inline__ int inet_mask_len(__be32 mask)
{
        __u32 hmask = ntohl(mask);

        return hmask ? 32 - ffz(~hmask) : 0;
}


#endif /* _LINUX_INETDEVICE_H */

























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct skb_array' datastructure.
 *
 *        Author:
 *                Michael S. Tsirkin <mst@redhat.com>
 *
 *        Copyright (C) 2016 Red Hat, Inc.
 *
 *        Limited-size FIFO of skbs. Can be used more or less whenever
 *        sk_buff_head can be used, except you need to know the queue size in
 *        advance.
 *        Implemented as a type-safe wrapper around ptr_ring.
 */

#ifndef _LINUX_SKB_ARRAY_H
#define _LINUX_SKB_ARRAY_H 1

#ifdef __KERNEL__
#include <linux/ptr_ring.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#endif

/*
 * Wrapper around a ptr_ring.  The ring must remain the first (offset 0)
 * member: skb_array_resize_multiple() casts an array of skb_array pointers
 * to ptr_ring pointers and enforces this with a BUILD_BUG_ON().
 */
struct skb_array {
        struct ptr_ring ring;
};

/* Might be slightly faster than skb_array_full below, but callers invoking
 * this in a loop must use a compiler barrier, for example cpu_relax().
 * Forwards to __ptr_ring_full() on the embedded ring.
 */
static inline bool __skb_array_full(struct skb_array *a)
{
        return __ptr_ring_full(&a->ring);
}

/* Forwards to ptr_ring_full() on the embedded ring. */
static inline bool skb_array_full(struct skb_array *a)
{
        return ptr_ring_full(&a->ring);
}

/*
 * Producer side.  Each helper hands @skb to the ptr_ring_produce*() function
 * with the same suffix and returns its int result unchanged.
 * NOTE(review): the _irq/_bh/_any suffixes presumably select the locking
 * context expected by ptr_ring - confirm in ptr_ring.h.
 */
static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb)
{
        return ptr_ring_produce(&a->ring, skb);
}

static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb)
{
        return ptr_ring_produce_irq(&a->ring, skb);
}

static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb)
{
        return ptr_ring_produce_bh(&a->ring, skb);
}

static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb)
{
        return ptr_ring_produce_any(&a->ring, skb);
}

/* Might be slightly faster than skb_array_empty below, but only safe if the
 * array is never resized. Also, callers invoking this in a loop must take care
 * to use a compiler barrier, for example cpu_relax().
 */
static inline bool __skb_array_empty(struct skb_array *a)
{
        return __ptr_ring_empty(&a->ring);
}

/* Returns whatever __ptr_ring_peek() yields for the embedded ring, typed as an skb. */
static inline struct sk_buff *__skb_array_peek(struct skb_array *a)
{
        return __ptr_ring_peek(&a->ring);
}

/* The following forward to the ptr_ring_empty*() function with the same suffix. */
static inline bool skb_array_empty(struct skb_array *a)
{
        return ptr_ring_empty(&a->ring);
}

static inline bool skb_array_empty_bh(struct skb_array *a)
{
        return ptr_ring_empty_bh(&a->ring);
}

static inline bool skb_array_empty_irq(struct skb_array *a)
{
        return ptr_ring_empty_irq(&a->ring);
}

static inline bool skb_array_empty_any(struct skb_array *a)
{
        return ptr_ring_empty_any(&a->ring);
}

/*
 * Consumer side.  The single-item helpers return the pointer produced by
 * the matching ptr_ring_consume*() call, typed as an skb.  The batched
 * helpers pass @array (cast to void **) and @n to the matching
 * ptr_ring_consume_batched*() call and return its int result.
 */
static inline struct sk_buff *__skb_array_consume(struct skb_array *a)
{
        return __ptr_ring_consume(&a->ring);
}

static inline struct sk_buff *skb_array_consume(struct skb_array *a)
{
        return ptr_ring_consume(&a->ring);
}

static inline int skb_array_consume_batched(struct skb_array *a,
                                            struct sk_buff **array, int n)
{
        return ptr_ring_consume_batched(&a->ring, (void **)array, n);
}

static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
{
        return ptr_ring_consume_irq(&a->ring);
}

static inline int skb_array_consume_batched_irq(struct skb_array *a,
                                                struct sk_buff **array, int n)
{
        return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n);
}

static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
{
        return ptr_ring_consume_any(&a->ring);
}

static inline int skb_array_consume_batched_any(struct skb_array *a,
                                                struct sk_buff **array, int n)
{
        return ptr_ring_consume_batched_any(&a->ring, (void **)array, n);
}


static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
{
        return ptr_ring_consume_bh(&a->ring);
}

static inline int skb_array_consume_batched_bh(struct skb_array *a,
                                               struct sk_buff **array, int n)
{
        return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n);
}

/*
 * Length of @skb, plus VLAN_HLEN when skb_vlan_tag_present() is true.
 * A NULL @skb yields 0.
 */
static inline int __skb_array_len_with_tag(struct sk_buff *skb)
{
        int total;

        if (!likely(skb))
                return 0;

        total = skb->len;
        if (skb_vlan_tag_present(skb))
                total += VLAN_HLEN;

        return total;
}

/*
 * Each helper applies __skb_array_len_with_tag() through the
 * PTR_RING_PEEK_CALL*() macro with the same suffix and returns the result.
 * Since __skb_array_len_with_tag() maps a NULL skb to 0, a NULL peek
 * result yields 0.
 */
static inline int skb_array_peek_len(struct skb_array *a)
{
        return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag);
}

static inline int skb_array_peek_len_irq(struct skb_array *a)
{
        return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag);
}

static inline int skb_array_peek_len_bh(struct skb_array *a)
{
        return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag);
}

static inline int skb_array_peek_len_any(struct skb_array *a)
{
        return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag);
}

/* Initialise the embedded ring via ptr_ring_init(@size, @gfp); returns its int result. */
static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
{
        return ptr_ring_init(&a->ring, size, gfp);
}

/*
 * Destructor callback handed to the ptr_ring helpers below: frees one skb
 * with kfree_skb().  Takes void * to match the callback signature.
 *
 * Marked inline like every other helper in this header, so a file that
 * includes the header without using the function does not get an
 * unused-function warning.
 */
static inline void __skb_array_destroy_skb(void *ptr)
{
        kfree_skb(ptr);
}

/*
 * Pass @n skbs from @skbs to ptr_ring_unconsume(), with
 * __skb_array_destroy_skb() as the destroy callback.
 */
static inline void skb_array_unconsume(struct skb_array *a,
                                       struct sk_buff **skbs, int n)
{
        ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb);
}

/*
 * Resize the ring via ptr_ring_resize(), with __skb_array_destroy_skb()
 * as the destroy callback; returns ptr_ring_resize()'s result.
 */
static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
{
        return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
}

/*
 * Resize @nrings arrays in one ptr_ring_resize_multiple() call.
 *
 * The skb_array pointer array is cast directly to a ptr_ring pointer
 * array; the BUILD_BUG_ON() guarantees at compile time that 'ring' sits at
 * offset 0 of struct skb_array, which is what makes that cast valid.
 */
static inline int skb_array_resize_multiple(struct skb_array **rings,
                                            int nrings, unsigned int size,
                                            gfp_t gfp)
{
        BUILD_BUG_ON(offsetof(struct skb_array, ring));
        return ptr_ring_resize_multiple((struct ptr_ring **)rings,
                                        nrings, size, gfp,
                                        __skb_array_destroy_skb);
}

/* Tear down the ring via ptr_ring_cleanup(), with __skb_array_destroy_skb() as the destroy callback. */
static inline void skb_array_cleanup(struct skb_array *a)
{
        ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);
}

#endif /* _LINUX_SKB_ARRAY_H  */











































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pm_qos.h>

/*
 * One-time initialisation of the PM fields shared by all configurations:
 * the power lock and the qos pointer.  power.early_init records that this
 * has been done, so later calls are no-ops.
 */
static inline void device_pm_init_common(struct device *dev)
{
        if (dev->power.early_init)
                return;

        spin_lock_init(&dev->power.lock);
        dev->power.qos = NULL;
        dev->power.early_init = true;
}

#ifdef CONFIG_PM

/*
 * CONFIG_PM variant: set power.disable_depth to 1, then run the common
 * one-time initialisation.
 * NOTE(review): a disable_depth of 1 presumably means runtime PM starts
 * out disabled - confirm in the runtime PM core.
 */
static inline void pm_runtime_early_init(struct device *dev)
{
        dev->power.disable_depth = 1;
        device_pm_init_common(dev);
}

/* Runtime PM entry points implemented outside this header. */
extern void pm_runtime_init(struct device *dev);
extern void pm_runtime_reinit(struct device *dev);
extern void pm_runtime_remove(struct device *dev);
extern u64 pm_runtime_active_time(struct device *dev);

/*
 * Flag bits, presumably for wake_irq.status (confirm in wakeirq.c).
 * Bits 0-2 are grouped by WAKE_IRQ_DEDICATED_MASK; WAKE_IRQ_DEDICATED_ENABLED
 * (bit 3) is deliberately outside that mask.
 */
#define WAKE_IRQ_DEDICATED_ALLOCATED        BIT(0)
#define WAKE_IRQ_DEDICATED_MANAGED        BIT(1)
#define WAKE_IRQ_DEDICATED_REVERSE        BIT(2)
#define WAKE_IRQ_DEDICATED_MASK                (WAKE_IRQ_DEDICATED_ALLOCATED | \
                                         WAKE_IRQ_DEDICATED_MANAGED | \
                                         WAKE_IRQ_DEDICATED_REVERSE)
#define WAKE_IRQ_DEDICATED_ENABLED        BIT(3)

/* Wake IRQ descriptor: owning device, status word, IRQ number and a name. */
struct wake_irq {
        struct device *dev;
        unsigned int status;
        int irq;
        const char *name;
};

/* Wake IRQ helpers implemented outside this header. */
extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_enable_wake_irq_check(struct device *dev,
                                         bool can_change_status);
extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable);
extern void dev_pm_enable_wake_irq_complete(struct device *dev);

/*
 * With CONFIG_PM_SLEEP the wakeup/IRQ glue is provided elsewhere; without
 * it, attach and detach are empty stubs (the arm/disarm helpers have no
 * stub in this branch).
 */
#ifdef CONFIG_PM_SLEEP

extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq);
extern void device_wakeup_detach_irq(struct device *dev);
extern void device_wakeup_arm_wake_irqs(void);
extern void device_wakeup_disarm_wake_irqs(void);

#else

static inline void device_wakeup_attach_irq(struct device *dev,
                                            struct wake_irq *wakeirq) {}

static inline void device_wakeup_detach_irq(struct device *dev)
{
}

#endif /* CONFIG_PM_SLEEP */

/*
 * sysfs.c
 */

/*
 * sysfs attribute management (see the "sysfs.c" marker above).  The
 * add/change functions return int, the remove functions return void.
 */
extern int dpm_sysfs_add(struct device *dev);
extern void dpm_sysfs_remove(struct device *dev);
extern void rpm_sysfs_remove(struct device *dev);
extern int wakeup_sysfs_add(struct device *dev);
extern void wakeup_sysfs_remove(struct device *dev);
extern int pm_qos_sysfs_add_resume_latency(struct device *dev);
extern void pm_qos_sysfs_remove_resume_latency(struct device *dev);
extern int pm_qos_sysfs_add_flags(struct device *dev);
extern void pm_qos_sysfs_remove_flags(struct device *dev);
extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev);
extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev);
extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

#else /* CONFIG_PM */

/* !CONFIG_PM variant: only the common one-time initialisation is performed. */
static inline void pm_runtime_early_init(struct device *dev)
{
        device_pm_init_common(dev);
}

/* !CONFIG_PM stubs: the runtime PM hooks do nothing. */
static inline void pm_runtime_init(struct device *dev) {}
static inline void pm_runtime_reinit(struct device *dev) {}
static inline void pm_runtime_remove(struct device *dev) {}

/* !CONFIG_PM stubs: the sysfs hooks do nothing; the int-returning ones return 0. */
static inline int dpm_sysfs_add(struct device *dev) { return 0; }
static inline void dpm_sysfs_remove(struct device *dev) {}
static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid,
                                         kgid_t kgid) { return 0; }

#endif

#ifdef CONFIG_PM_SLEEP

/* kernel/power/main.c */
extern int pm_async_enabled;

/* drivers/base/power/main.c */
extern struct list_head dpm_list;        /* The active device list */

/* Map a list node embedded as power.entry back to its containing struct device. */
static inline struct device *to_device(struct list_head *entry)
{
        return container_of(entry, struct device, power.entry);
}

/* Sleep-PM list management, implemented outside this header. */
extern void device_pm_sleep_init(struct device *dev);
extern void device_pm_add(struct device *);
extern void device_pm_remove(struct device *);
extern void device_pm_move_before(struct device *, struct device *);
extern void device_pm_move_after(struct device *, struct device *);
extern void device_pm_move_last(struct device *);
extern void device_pm_check_callbacks(struct device *dev);

/* CONFIG_PM_SLEEP variant: reports the power.in_dpm_list flag. */
static inline bool device_pm_initialized(struct device *dev)
{
        return dev->power.in_dpm_list;
}

/* drivers/base/power/wakeup_stats.c */
extern int wakeup_source_sysfs_add(struct device *parent,
                                   struct wakeup_source *ws);
extern void wakeup_source_sysfs_remove(struct wakeup_source *ws);

extern int pm_wakeup_source_sysfs_add(struct device *parent);

#else /* !CONFIG_PM_SLEEP */

/* !CONFIG_PM_SLEEP stubs: sleep-PM initialisation and list insertion do nothing. */
static inline void device_pm_sleep_init(struct device *dev) {}

static inline void device_pm_add(struct device *dev) {}

/* Not an empty stub: even without sleep support, removal still calls pm_runtime_remove(). */
static inline void device_pm_remove(struct device *dev)
{
        pm_runtime_remove(dev);
}

static inline void device_pm_move_before(struct device *deva,
                                         struct device *devb) {}
static inline void device_pm_move_after(struct device *deva,
                                        struct device *devb) {}
static inline void device_pm_move_last(struct device *dev) {}

static inline void device_pm_check_callbacks(struct device *dev) {}

/* !CONFIG_PM_SLEEP variant: falls back to device_is_registered(). */
static inline bool device_pm_initialized(struct device *dev)
{
        return device_is_registered(dev);
}

/* !CONFIG_PM_SLEEP stub: nothing to add, returns 0. */
static inline int pm_wakeup_source_sysfs_add(struct device *parent)
{
        return 0;
}

#endif /* !CONFIG_PM_SLEEP */

/*
 * Full PM initialisation of @dev: the common one-time setup (a no-op if
 * pm_runtime_early_init() already ran it), then the sleep-PM part, then
 * the runtime-PM part.  The latter two are empty stubs when the
 * corresponding config option is off.
 */
static inline void device_pm_init(struct device *dev)
{
        device_pm_init_common(dev);
        device_pm_sleep_init(dev);
        pm_runtime_init(dev);
}










































































































































































































































































































































    2 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_GENHD_H
#define _LINUX_GENHD_H

/*
 *         genhd.h Copyright (C) 1992 Drew Eckhardt
 *        Generic hard disk header file by  
 *                 Drew Eckhardt
 *
 *                <drew@colorado.edu>
 */

#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/uuid.h>
#include <linux/blk_types.h>
#include <asm/local.h>

/*
 * Conversions between a struct device and the object embedding it:
 * a gendisk embeds its device as part0.__dev, an hd_struct as __dev.
 */
#define dev_to_disk(device)        container_of((device), struct gendisk, part0.__dev)
#define dev_to_part(device)        container_of((device), struct hd_struct, __dev)
#define disk_to_dev(disk)        (&(disk)->part0.__dev)
#define part_to_dev(part)        (&((part)->__dev))

extern const struct device_type disk_type;
extern struct device_type part_type;
extern struct class block_class;

/* DISK_MAX_PARTS is what disk_max_parts() returns for GENHD_FL_EXT_DEVT disks. */
#define DISK_MAX_PARTS                        256
/* Size of gendisk.disk_name[]. */
#define DISK_NAME_LEN                        32

#include <linux/major.h>
#include <linux/device.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/workqueue.h>

#define PARTITION_META_INFO_VOLNAMELTH        64
/*
 * Enough for the string representation of any kind of UUID plus the
 * terminating NUL. EFI UUID is 36 characters. MSDOS UUID is 11 characters.
 */
#define PARTITION_META_INFO_UUIDLTH        (UUID_STRING_LEN + 1)

/* Partition metadata: a UUID string buffer and a volume-name buffer of fixed sizes. */
struct partition_meta_info {
        char uuid[PARTITION_META_INFO_UUIDLTH];
        u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};

/*
 * Per-partition state.  A gendisk embeds one of these as part0;
 * part_to_disk() treats partno == 0 as that whole-disk entry and any other
 * partno as a partition whose device parent is the disk.
 */
struct hd_struct {
        sector_t start_sect;
        /*
         * nr_sects is protected by sequence counter. One might extend a
         * partition while IO is happening to it and update of nr_sects
         * can be non-atomic on 32bit machines with 64bit sector_t.
         */
        sector_t nr_sects;
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        seqcount_t nr_sects_seq;
#endif
        unsigned long stamp;
        struct disk_stats __percpu *dkstats;
        struct percpu_ref ref;

        /* Embedded device; see part_to_dev()/dev_to_part(). */
        struct device __dev;
        struct kobject *holder_dir;
        /* For part0, policy is the value returned by get_disk_ro(). */
        int policy, partno;
        struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
        int make_it_fail;
#endif
        struct rcu_work rcu_work;
};

/**
 * DOC: genhd capability flags
 *
 * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device
 * gives access to removable media.
 * When set, the device remains present even when media is not
 * inserted.
 * Must not be set for devices which are removed entirely when the
 * media is removed.
 *
 * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style
 * device.
 * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl.
 *
 * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up",
 * with a similar meaning to network interfaces.
 *
 * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include
 * partition information in ``/proc/partitions`` or in the output of
 * printk_all_partitions().
 * Used for the null block device and some MMC devices.
 *
 * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended
 * dynamic ``dev_t``, i.e. it wants extended device numbers
 * (``BLOCK_EXT_MAJOR``).
 * This affects the maximum number of partitions.
 *
 * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the
 * partition table, the device's capacity has been extended to its
 * native capacity; i.e. the device has hidden capacity used by one
 * of the partitions (this is a flag used so that native capacity is
 * only ever unlocked once).
 *
 * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is
 * blocked whenever a writer holds an exclusive lock.
 *
 * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled.
 * Used for loop devices in their default settings and some MMC
 * devices.
 *
 * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it
 * doesn't produce events, doesn't appear in sysfs, and doesn't have
 * an associated ``bdev``.
 * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and
 * ``GENHD_FL_NO_PART_SCAN``.
 * Used for multipath devices.
 */
/* Bit values tested against gendisk.flags (see disk_max_parts(), disk_part_scan_enabled()). */
#define GENHD_FL_REMOVABLE                        0x0001
/* 2 is unused (used to be GENHD_FL_DRIVERFS) */
/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */
#define GENHD_FL_CD                                0x0008
#define GENHD_FL_UP                                0x0010
#define GENHD_FL_SUPPRESS_PARTITION_INFO        0x0020
#define GENHD_FL_EXT_DEVT                        0x0040
#define GENHD_FL_NATIVE_CAPACITY                0x0080
#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE        0x0100
#define GENHD_FL_NO_PART_SCAN                        0x0200
#define GENHD_FL_HIDDEN                                0x0400

/* Disk event bits. NOTE(review): presumably the values stored in gendisk.events - confirm. */
enum {
        DISK_EVENT_MEDIA_CHANGE                        = 1 << 0, /* media changed */
        DISK_EVENT_EJECT_REQUEST                = 1 << 1, /* eject requested */
};

/* Event-processing flag bits. NOTE(review): presumably for gendisk.event_flags - confirm. */
enum {
        /* Poll even if events_poll_msecs is unset */
        DISK_EVENT_FLAG_POLL                        = 1 << 0,
        /* Forward events to udev */
        DISK_EVENT_FLAG_UEVENT                        = 1 << 1,
};

/*
 * Partition table of a disk: @len and a trailing flexible array of
 * RCU-annotated hd_struct pointers, plus a last_lookup pointer.
 * NOTE(review): @len is presumably the number of slots in part[] and
 * last_lookup a one-entry lookup cache - confirm in block/genhd.c.
 */
struct disk_part_tbl {
        struct rcu_head rcu_head;
        int len;
        struct hd_struct __rcu *last_lookup;
        struct hd_struct __rcu *part[];
};

/* Opaque here; only pointers to these are used in this header. */
struct disk_events;
struct badblocks;

/* Integrity description: a profile pointer plus four byte-sized parameters. */
struct blk_integrity {
        const struct blk_integrity_profile        *profile;
        unsigned char                                flags;
        unsigned char                                tuple_size;
        unsigned char                                interval_exp;
        unsigned char                                tag_size;
};

/*
 * Per-disk state.  The disk's own struct device lives inside part0
 * (see disk_to_dev()/dev_to_disk()); capacity and the read-only policy
 * are likewise read from part0 by get_capacity() and get_disk_ro().
 */
struct gendisk {
        /* major, first_minor and minors are input parameters only,
         * don't use directly.  Use disk_devt() and disk_max_parts().
         */
        int major;                        /* major number of driver */
        int first_minor;
        int minors;                     /* maximum number of minors, =1 for
                                         * disks that can't be partitioned. */

        char disk_name[DISK_NAME_LEN];        /* name of major driver */

        unsigned short events;                /* supported events */
        unsigned short event_flags;        /* flags related to event processing */

        /* Array of pointers to partitions indexed by partno.
         * Protected with matching bdev lock but stat and other
         * non-critical accesses use RCU.  Always access through
         * helpers.
         */
        struct disk_part_tbl __rcu *part_tbl;
        struct hd_struct part0;

        const struct block_device_operations *fops;
        struct request_queue *queue;
        void *private_data;

        /* GENHD_FL_* bits */
        int flags;
        unsigned long state;
#define GD_NEED_PART_SCAN                0
        struct rw_semaphore lookup_sem;
        struct kobject *slave_dir;

        struct timer_rand_state *random;
        atomic_t sync_io;                /* RAID */
        struct disk_events *ev;
#ifdef  CONFIG_BLK_DEV_INTEGRITY
        struct kobject integrity_kobj;
#endif        /* CONFIG_BLK_DEV_INTEGRITY */
#if IS_ENABLED(CONFIG_CDROM)
        struct cdrom_device_info *cdi;
#endif
        int node_id;
        struct badblocks *bb;
        struct lockdep_map lockdep_map;
};

/*
 * Accessor for gendisk.cdi.  The field exists under IS_ENABLED(CONFIG_CDROM),
 * while this accessor tests IS_REACHABLE(CONFIG_CDROM) and otherwise
 * evaluates to NULL.
 */
#if IS_REACHABLE(CONFIG_CDROM)
#define disk_to_cdi(disk)        ((disk)->cdi)
#else
#define disk_to_cdi(disk)        NULL
#endif

static inline struct gendisk *part_to_disk(struct hd_struct *part)
{
        if (likely(part)) {
                if (part->partno)
                        return dev_to_disk(part_to_dev(part)->parent);
                else
                        return dev_to_disk(part_to_dev(part));
        }
        return NULL;
}

/*
 * Partition limit of @disk: DISK_MAX_PARTS when GENHD_FL_EXT_DEVT is set,
 * otherwise the disk's own 'minors' value.
 */
static inline int disk_max_parts(struct gendisk *disk)
{
        return (disk->flags & GENHD_FL_EXT_DEVT) ? DISK_MAX_PARTS
                                                 : disk->minors;
}

/*
 * True when @disk can hold more than one partition and
 * GENHD_FL_NO_PART_SCAN is not set.
 */
static inline bool disk_part_scan_enabled(struct gendisk *disk)
{
        if (disk_max_parts(disk) <= 1)
                return false;

        return !(disk->flags & GENHD_FL_NO_PART_SCAN);
}

/* Device number of the whole disk, built from major and first_minor. */
static inline dev_t disk_devt(struct gendisk *disk)
{
        return MKDEV(disk->major, disk->first_minor);
}

/* Device number of a partition, read from its embedded device's devt. */
static inline dev_t part_devt(struct hd_struct *part)
{
        return part_to_dev(part)->devt;
}

/*
 * Look up partition @partno of @disk; implemented outside this header.
 * NOTE(review): disk_get_part() presumably returns a referenced partition
 * to be released with disk_put_part() - confirm in block/genhd.c.
 */
extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);

/* Drop a device reference on @part via put_device(); a NULL @part is ignored. */
static inline void disk_put_part(struct hd_struct *part)
{
        if (!likely(part))
                return;

        put_device(part_to_dev(part));
}

/*
 * Initialise the nr_sects sequence counter.  The counter only exists on
 * 32-bit SMP builds (same condition as in struct hd_struct); elsewhere
 * this function is empty.
 */
static inline void hd_sects_seq_init(struct hd_struct *p)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        seqcount_init(&p->nr_sects_seq);
#endif
}

/*
 * Smarter partition iterator without context limits.
 *
 * The DISK_PITER_* bits below are the flags accepted by
 * disk_part_iter_init().
 */
#define DISK_PITER_REVERSE        (1 << 0) /* iterate in the reverse direction */
#define DISK_PITER_INCL_EMPTY        (1 << 1) /* include 0-sized parts */
#define DISK_PITER_INCL_PART0        (1 << 2) /* include partition 0 */
#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */

/* Iterator state: the disk being walked, a partition pointer, an index and the flags. */
struct disk_part_iter {
        struct gendisk                *disk;
        struct hd_struct        *part;
        int                        idx;
        unsigned int                flags;
};

/* Iterator operations (init / next / exit); implemented outside this header. */
extern void disk_part_iter_init(struct disk_part_iter *piter,
                                 struct gendisk *disk, unsigned int flags);
extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter);
extern void disk_part_iter_exit(struct disk_part_iter *piter);
extern bool disk_has_partitions(struct gendisk *disk);

/* block/genhd.c */
extern void device_add_disk(struct device *parent, struct gendisk *disk,
                            const struct attribute_group **groups);
/* Shorthand for device_add_disk() with no parent device and no attribute groups. */
static inline void add_disk(struct gendisk *disk)
{
        device_add_disk(NULL, disk, NULL);
}
extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
/* Shorthand for device_add_disk_no_queue_reg() with no parent device. */
static inline void add_disk_no_queue_reg(struct gendisk *disk)
{
        device_add_disk_no_queue_reg(NULL, disk);
}

extern void del_gendisk(struct gendisk *gp);
extern struct gendisk *get_gendisk(dev_t dev, int *partno);
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);

extern void set_device_ro(struct block_device *bdev, int flag);
extern void set_disk_ro(struct gendisk *disk, int flag);

/* Read-only state of the whole disk: the policy field of part0. */
static inline int get_disk_ro(struct gendisk *disk)
{
        return disk->part0.policy;
}

/* Disk event control and capacity update; implemented outside this header. */
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
bool set_capacity_and_notify(struct gendisk *disk, sector_t size);

/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk) __latent_entropy;
extern void rand_initialize_disk(struct gendisk *disk);

static inline sector_t get_start_sect(struct block_device *bdev)
{
        return bdev->bd_part->start_sect;
}
/*
 * get_capacity - size of the disk as recorded on part0
 * @disk: disk to query
 *
 * Returns disk->part0.nr_sects.
 * NOTE(review): this is a plain read; hd_struct also carries an
 * nr_sects_seq seqcount on 32-bit SMP builds -- confirm callers are fine
 * with an unprotected read here.
 */
static inline sector_t get_capacity(struct gendisk *disk)
{
        sector_t nr_sectors = disk->part0.nr_sects;

        return nr_sectors;
}
/*
 * set_capacity - record a new size on the disk's part0
 * @disk: disk to update
 * @size: value stored into part0.nr_sects
 */
static inline void set_capacity(struct gendisk *disk, sector_t size)
{
        struct hd_struct *whole_disk = &disk->part0;

        whole_disk->nr_sects = size;
}

int bdev_disk_changed(struct block_device *bdev, bool invalidate);
int blk_add_partitions(struct gendisk *disk, struct block_device *bdev);
int blk_drop_partitions(struct block_device *bdev);

extern struct gendisk *__alloc_disk_node(int minors, int node_id);
extern struct kobject *get_disk_and_module(struct gendisk *disk);
extern void put_disk(struct gendisk *disk);
extern void put_disk_and_module(struct gendisk *disk);
extern void blk_register_region(dev_t devt, unsigned long range,
                        struct module *module,
                        struct kobject *(*probe)(dev_t, int *, void *),
                        int (*lock)(dev_t, void *),
                        void *data);
extern void blk_unregister_region(dev_t devt, unsigned long range);

/*
 * alloc_disk_node - allocate a gendisk and set up its lockdep map
 * @minors:  forwarded to __alloc_disk_node()
 * @node_id: forwarded to __alloc_disk_node()
 *
 * Statement-expression macro rather than a function so that every
 * expansion site gets its own static lock_class_key.  The lockdep name is
 * a string literal built from the stringified macro arguments.  When
 * __alloc_disk_node() returns a non-NULL disk, its lockdep_map is
 * initialised with that name and key.  The macro evaluates to the pointer
 * returned by __alloc_disk_node(), so callers must still check for NULL.
 */
#define alloc_disk_node(minors, node_id)                                \
({                                                                        \
        static struct lock_class_key __key;                                \
        const char *__name;                                                \
        struct gendisk *__disk;                                                \
                                                                        \
        __name = "(gendisk_completion)"#minors"("#node_id")";                \
                                                                        \
        __disk = __alloc_disk_node(minors, node_id);                        \
                                                                        \
        if (__disk)                                                        \
                lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \
                                                                        \
        __disk;                                                                \
})

#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)

int register_blkdev(unsigned int major, const char *name);
void unregister_blkdev(unsigned int major, const char *name);

void revalidate_disk_size(struct gendisk *disk, bool verbose);
bool bdev_check_media_change(struct block_device *bdev);
int __invalidate_device(struct block_device *bdev, bool kill_dirty);
void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors);

/* for drivers/char/raw.c: */
int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);

#ifdef CONFIG_SYSFS
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
#else
/*
 * Stub used when CONFIG_SYSFS is not set: does nothing and reports
 * success (0) so callers need no #ifdef of their own.
 */
static inline int bd_link_disk_holder(struct block_device *bdev,
                                      struct gendisk *disk)
{
        return 0;
}
/* Stub used when CONFIG_SYSFS is not set: intentionally a no-op. */
static inline void bd_unlink_disk_holder(struct block_device *bdev,
                                         struct gendisk *disk)
{
}
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_BLOCK
void printk_all_partitions(void);
dev_t blk_lookup_devt(const char *name, int partno);
#else /* CONFIG_BLOCK */
/* Stub used when CONFIG_BLOCK is not set: intentionally a no-op. */
static inline void printk_all_partitions(void)
{
}
/*
 * Stub used when CONFIG_BLOCK is not set: ignores both arguments and
 * always yields MKDEV(0, 0).
 */
static inline dev_t blk_lookup_devt(const char *name, int partno)
{
        return MKDEV(0, 0);
}
#endif /* CONFIG_BLOCK */

#endif /* _LINUX_GENHD_H */


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 























































    1 









































    1 




    1 











    1 









    1 








    1 











    1 





    1 











    1 


    1 

    1 














    1 


























    1 







    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
// SPDX-License-Identifier: GPL-2.0-only
/*
 *      sd.c Copyright (C) 1992 Drew Eckhardt
 *           Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale
 *
 *      Linux scsi disk driver
 *              Initial versions: Drew Eckhardt
 *              Subsequent revisions: Eric Youngdale
 *        Modification history:
 *       - Drew Eckhardt <drew@colorado.edu> original
 *       - Eric Youngdale <eric@andante.org> add scatter-gather, multiple 
 *         outstanding request, and other enhancements.
 *         Support loadable low-level scsi drivers.
 *       - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using 
 *         eight major numbers.
 *       - Richard Gooch <rgooch@atnf.csiro.au> support devfs.
 *         - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in 
 *           sd_init and cleanups.
 *         - Alex Davis <letmein@erols.com> Fix problem where partition info
 *           not being read in sd_open. Fix problem where removable media 
 *           could be ejected after sd_open.
 *         - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x
 *         - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox 
 *           <willy@debian.org>, Kurt Garloff <garloff@suse.de>: 
 *           Support 32k/1M disks.
 *
 *        Logging policy (needs CONFIG_SCSI_LOGGING defined):
 *         - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2
 *         - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1
 *         - entering sd_ioctl: SCSI_LOG_IOCTL level 1
 *         - entering other commands: SCSI_LOG_HLQUEUE level 3
 *        Note: when the logging level is set by the user, it must be greater
 *        than the level indicated above to trigger output.        
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/errno.h>
#include <linux/idr.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/blk-pm.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/string_helpers.h>
#include <linux/async.h>
#include <linux/slab.h>
#include <linux/sed-opal.h>
#include <linux/pm_runtime.h>
#include <linux/pr.h>
#include <linux/t10-pi.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsicam.h>

#include "sd.h"
#include "scsi_priv.h"
#include "scsi_logging.h"

MODULE_AUTHOR("Eric Youngdale");
MODULE_DESCRIPTION("SCSI disk (sd) driver");
MODULE_LICENSE("GPL");

MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR);
MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);

#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
#define SD_MINORS        16
#else
#define SD_MINORS        0
#endif

static void sd_config_discard(struct scsi_disk *, unsigned int);
static void sd_config_write_same(struct scsi_disk *);
static int  sd_revalidate_disk(struct gendisk *);
static void sd_unlock_native_capacity(struct gendisk *disk);
static int  sd_probe(struct device *);
static int  sd_remove(struct device *);
static void sd_shutdown(struct device *);
static int sd_suspend_system(struct device *);
static int sd_suspend_runtime(struct device *);
static int sd_resume(struct device *);
static void sd_rescan(struct device *);
static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt);
static void sd_uninit_command(struct scsi_cmnd *SCpnt);
static int sd_done(struct scsi_cmnd *);
static void sd_eh_reset(struct scsi_cmnd *);
static int sd_eh_action(struct scsi_cmnd *, int);
static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
static void scsi_disk_release(struct device *cdev);

static DEFINE_IDA(sd_index_ida);

/* This mutex is used to mediate the 0->1 reference get in the
 * face of object destruction (i.e. we can't allow a get on an
 * object after last put) */
static DEFINE_MUTEX(sd_ref_mutex);

static struct kmem_cache *sd_cdb_cache;
static mempool_t *sd_cdb_pool;
static mempool_t *sd_page_pool;

/*
 * Names accepted by cache_type_store(), matched with sysfs_match_string().
 * The array index is significant: cache_type_store() takes bit 0 of the
 * matched index as the RCD value and bit 1 as the WCE request, so the
 * order of the entries must not change.
 */
static const char *sd_cache_types[] = {
        "write through", "none", "write back",
        "write back, no read (daft)"
};

/*
 * sd_set_flush_flag - push the disk's cache settings to its request queue
 * @sdkp: SCSI disk whose WCE and DPOFUA state is consulted
 *
 * The write-cache flag follows WCE.  FUA is reported only when WCE and
 * DPOFUA are both set.  Both values are passed to blk_queue_write_cache()
 * for the queue of sdkp->disk.
 */
static void sd_set_flush_flag(struct scsi_disk *sdkp)
{
        bool write_cache = sdkp->WCE;
        bool fua = write_cache && sdkp->DPOFUA;

        blk_queue_write_cache(sdkp->disk->queue, write_cache, fua);
}

/*
 * sysfs store handler for "cache_type".
 *
 * Accepts one of the strings in sd_cache_types[], optionally prefixed with
 * "temporary ".  With the prefix, only the driver's WCE/RCD state and the
 * queue flush flags are updated.  Without it, the caching mode page (0x08)
 * is read with MODE SENSE, its WCE/RCD bits are rewritten, the page is
 * written back with MODE SELECT and the disk is revalidated.
 *
 * Returns @count on success and -EINVAL on any failure.
 */
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
                 const char *buf, size_t count)
{
        int ct, rcd, wce, sp;
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;
        char buffer[64];
        char *buffer_data;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
        static const char temp[] = "temporary ";
        unsigned int offset;
        int len;

        if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
                /* no cache control on RBC devices; theoretically they
                 * can do it, but there's probably so many exceptions
                 * it's not worth the risk */
                return -EINVAL;

        if (strncmp(buf, temp, sizeof(temp) - 1) == 0) {
                buf += sizeof(temp) - 1;
                sdkp->cache_override = 1;
        } else {
                sdkp->cache_override = 0;
        }

        ct = sysfs_match_string(sd_cache_types, buf);
        if (ct < 0)
                return -EINVAL;

        /* Bit 0 of the index selects RCD, bit 1 selects WCE. */
        rcd = ct & 0x01 ? 1 : 0;
        /* The write cache is never enabled on a write-protected disk. */
        wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0;

        if (sdkp->cache_override) {
                sdkp->WCE = wce;
                sdkp->RCD = rcd;
                sd_set_flush_flag(sdkp);
                return count;
        }

        if (scsi_mode_sense(sdp, 0x08, 8, buffer, sizeof(buffer), SD_TIMEOUT,
                            sdkp->max_retries, &data, NULL))
                return -EINVAL;

        /*
         * The mode page starts after the header and block descriptors.
         * Bytes 0 and 2 of the page are modified below and up to len bytes
         * are sent back from that position, so the page must start inside
         * buffer[] with at least three bytes available, and len must be
         * limited to what is left of buffer[] after the offset.  Without
         * these checks a device reporting large header/descriptor/data
         * lengths would make us access memory past the on-stack buffer.
         */
        offset = data.header_length + data.block_descriptor_length;
        if (offset + 3 > sizeof(buffer) || data.length < offset + 3)
                return -EINVAL;

        len = min_t(size_t, sizeof(buffer) - offset, data.length - offset);
        buffer_data = buffer + offset;
        buffer_data[2] &= ~0x05;
        buffer_data[2] |= wce << 2 | rcd;
        /* Bit 7 of byte 0 is passed as the "sp" argument and then cleared. */
        sp = buffer_data[0] & 0x80 ? 1 : 0;
        buffer_data[0] &= ~0x80;

        /*
         * Ensure WP, DPOFUA, and RESERVED fields are cleared in
         * received mode parameter buffer before doing MODE SELECT.
         */
        data.device_specific = 0;

        if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT,
                             sdkp->max_retries, &data, &sshdr)) {
                if (scsi_sense_valid(&sshdr))
                        sd_print_sense_hdr(sdkp, &sshdr);
                return -EINVAL;
        }
        sd_revalidate_disk(sdkp->disk);
        return count;
}

static ssize_t
manage_start_stop_show(struct device *dev, struct device_attribute *attr,
                       char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;

        return sprintf(buf, "%u\n", sdp->manage_start_stop);
}

/*
 * sysfs store handler: parses a boolean and stores it in the SCSI device's
 * manage_start_stop flag.  Requires CAP_SYS_ADMIN (-EACCES otherwise);
 * an unparsable value yields -EINVAL.
 */
static ssize_t
manage_start_stop_store(struct device *dev, struct device_attribute *attr,
                        const char *buf, size_t count)
{
        struct scsi_device *sdev = to_scsi_disk(dev)->device;
        bool enable;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (kstrtobool(buf, &enable) != 0)
                return -EINVAL;

        sdev->manage_start_stop = enable;
        return count;
}
static DEVICE_ATTR_RW(manage_start_stop);

static ssize_t
allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->device->allow_restart);
}

/*
 * sysfs store handler: parses a boolean and stores it in the SCSI device's
 * allow_restart flag.  Requires CAP_SYS_ADMIN (-EACCES otherwise); only
 * TYPE_DISK and TYPE_ZBC devices are accepted, and an unsupported device
 * type or unparsable value yields -EINVAL.
 */
static ssize_t
allow_restart_store(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        struct scsi_device *sdev = to_scsi_disk(dev)->device;
        bool enable;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(sdev->type == TYPE_DISK || sdev->type == TYPE_ZBC))
                return -EINVAL;

        if (kstrtobool(buf, &enable) != 0)
                return -EINVAL;

        sdev->allow_restart = enable;
        return count;
}
static DEVICE_ATTR_RW(allow_restart);

/*
 * sysfs show handler: prints the sd_cache_types[] entry selected by the
 * disk's RCD (bit 0) and WCE (bit 1) state.
 */
static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        const char *type_name = sd_cache_types[sdkp->RCD + 2*sdkp->WCE];

        return sprintf(buf, "%s\n", type_name);
}
static DEVICE_ATTR_RW(cache_type);

static ssize_t
FUA_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->DPOFUA);
}
static DEVICE_ATTR_RO(FUA);

static ssize_t
protection_type_show(struct device *dev, struct device_attribute *attr,
                     char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->protection_type);
}

/*
 * sysfs store handler: parses a decimal unsigned integer and, if it does
 * not exceed T10_PI_TYPE3_PROTECTION, stores it as the disk's protection
 * type.  Larger values are accepted but leave the setting untouched.
 * Requires CAP_SYS_ADMIN (-EACCES otherwise); a parse failure returns the
 * kstrtouint() error.
 */
static ssize_t
protection_type_store(struct device *dev, struct device_attribute *attr,
                      const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        unsigned int type;
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        rc = kstrtouint(buf, 10, &type);
        if (rc != 0)
                return rc;

        if (type <= T10_PI_TYPE3_PROTECTION)
                sdkp->protection_type = type;

        return count;
}
static DEVICE_ATTR_RW(protection_type);

/*
 * sysfs show handler: prints "none" when the host reports neither DIF nor
 * DIX capability, otherwise "dix<N>" or "dif<N>", where N is the DIF
 * capability value.  If the host is not DIX capable for the disk's
 * protection type but is DIX capable for T10_PI_TYPE0_PROTECTION, the
 * result is "dix0".
 */
static ssize_t
protection_mode_show(struct device *dev, struct device_attribute *attr,
                     char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct Scsi_Host *shost = sdkp->device->host;
        unsigned int dif = scsi_host_dif_capable(shost, sdkp->protection_type);
        unsigned int dix = scsi_host_dix_capable(shost, sdkp->protection_type);
        const char *prefix;

        if (!dix && scsi_host_dix_capable(shost, T10_PI_TYPE0_PROTECTION)) {
                dix = 1;
                dif = 0;
        }

        if (!(dif || dix))
                return sprintf(buf, "none\n");

        if (dix)
                prefix = "dix";
        else
                prefix = "dif";

        return sprintf(buf, "%s%u\n", prefix, dif);
}
static DEVICE_ATTR_RO(protection_mode);

static ssize_t
app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->ATO);
}
static DEVICE_ATTR_RO(app_tag_own);

static ssize_t
thin_provisioning_show(struct device *dev, struct device_attribute *attr,
                       char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->lbpme);
}
static DEVICE_ATTR_RO(thin_provisioning);

/*
 * Names of the provisioning modes, indexed by SD_LBP_* value.  Used by
 * provisioning_mode_show() to print the current mode and by
 * provisioning_mode_store() to parse a new one.
 *
 * sysfs_match_string() requires dense arrays
 */
static const char *lbp_mode[] = {
        [SD_LBP_FULL]                = "full",
        [SD_LBP_UNMAP]                = "unmap",
        [SD_LBP_WS16]                = "writesame_16",
        [SD_LBP_WS10]                = "writesame_10",
        [SD_LBP_ZERO]                = "writesame_zero",
        [SD_LBP_DISABLE]        = "disabled",
};

/* sysfs show handler: prints the lbp_mode[] name of the current mode. */
static ssize_t
provisioning_mode_show(struct device *dev, struct device_attribute *attr,
                       char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        const char *mode_name = lbp_mode[sdkp->provisioning_mode];

        return sprintf(buf, "%s\n", mode_name);
}

/*
 * sysfs store handler: selects the provisioning (discard) mode by name.
 *
 * Requires CAP_SYS_ADMIN (-EACCES otherwise).  For zoned disks the input
 * is ignored and discard is configured as SD_LBP_DISABLE.  Otherwise the
 * device must be TYPE_DISK and the input must match an lbp_mode[] entry,
 * else -EINVAL is returned.
 */
static ssize_t
provisioning_mode_store(struct device *dev, struct device_attribute *attr,
                        const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int new_mode;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (sd_is_zoned(sdkp)) {
                sd_config_discard(sdkp, SD_LBP_DISABLE);
                return count;
        }

        if (sdkp->device->type != TYPE_DISK)
                return -EINVAL;

        new_mode = sysfs_match_string(lbp_mode, buf);
        if (new_mode < 0)
                return -EINVAL;

        sd_config_discard(sdkp, new_mode);
        return count;
}
static DEVICE_ATTR_RW(provisioning_mode);

/*
 * Names of the zeroing modes, indexed by SD_ZERO_* value.  Used by
 * zeroing_mode_show() to print the current mode and by
 * zeroing_mode_store() to parse a new one.
 *
 * sysfs_match_string() requires dense arrays
 */
static const char *zeroing_mode[] = {
        [SD_ZERO_WRITE]                = "write",
        [SD_ZERO_WS]                = "writesame",
        [SD_ZERO_WS16_UNMAP]        = "writesame_16_unmap",
        [SD_ZERO_WS10_UNMAP]        = "writesame_10_unmap",
};

/* sysfs show handler: prints the zeroing_mode[] name of the current mode. */
static ssize_t
zeroing_mode_show(struct device *dev, struct device_attribute *attr,
                  char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        const char *mode_name = zeroing_mode[sdkp->zeroing_mode];

        return sprintf(buf, "%s\n", mode_name);
}

/*
 * sysfs store handler: sets the disk's zeroing mode to the zeroing_mode[]
 * entry matching the input.  Requires CAP_SYS_ADMIN (-EACCES otherwise);
 * an unknown name yields -EINVAL.
 */
static ssize_t
zeroing_mode_store(struct device *dev, struct device_attribute *attr,
                   const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int new_mode;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        new_mode = sysfs_match_string(zeroing_mode, buf);
        if (new_mode < 0)
                return -EINVAL;

        sdkp->zeroing_mode = new_mode;
        return count;
}
static DEVICE_ATTR_RW(zeroing_mode);

static ssize_t
max_medium_access_timeouts_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts);
}

/*
 * sysfs store handler: parses a decimal unsigned integer directly into the
 * disk's max_medium_access_timeouts field.  Requires CAP_SYS_ADMIN
 * (-EACCES otherwise); a parse failure returns the kstrtouint() error.
 */
static ssize_t
max_medium_access_timeouts_store(struct device *dev,
                                 struct device_attribute *attr, const char *buf,
                                 size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        rc = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts);
        if (rc)
                return rc;

        return count;
}
static DEVICE_ATTR_RW(max_medium_access_timeouts);

static ssize_t
max_write_same_blocks_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%u\n", sdkp->max_ws_blocks);
}

/*
 * sysfs store handler for the WRITE SAME block limit.
 *
 * A value of 0 sets the device's no_write_same flag; a value up to
 * SD_MAX_WS16_BLOCKS clears that flag and becomes the new max_ws_blocks;
 * larger values change neither.  In every case sd_config_write_same() is
 * then called to re-apply the limits.
 *
 * Requires CAP_SYS_ADMIN (-EACCES otherwise).  Only TYPE_DISK and TYPE_ZBC
 * devices are accepted (-EINVAL otherwise); a parse failure returns the
 * kstrtoul() error.
 */
static ssize_t
max_write_same_blocks_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdev = sdkp->device;
        unsigned long limit;
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(sdev->type == TYPE_DISK || sdev->type == TYPE_ZBC))
                return -EINVAL;

        rc = kstrtoul(buf, 10, &limit);
        if (rc)
                return rc;

        if (limit == 0) {
                sdev->no_write_same = 1;
        } else if (limit <= SD_MAX_WS16_BLOCKS) {
                sdev->no_write_same = 0;
                sdkp->max_ws_blocks = limit;
        }

        sd_config_write_same(sdkp);
        return count;
}
static DEVICE_ATTR_RW(max_write_same_blocks);

/*
 * sysfs show handler: prints "host-managed" for TYPE_ZBC devices;
 * otherwise "host-aware" when the disk's zoned field is 1, "drive-managed"
 * when it is 2 and "none" for any other value.
 */
static ssize_t
zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        if (sdkp->device->type == TYPE_ZBC)
                return sprintf(buf, "host-managed\n");

        switch (sdkp->zoned) {
        case 1:
                return sprintf(buf, "host-aware\n");
        case 2:
                return sprintf(buf, "drive-managed\n");
        default:
                return sprintf(buf, "none\n");
        }
}
static DEVICE_ATTR_RO(zoned_cap);

/*
 * sysfs store handler for the disk's maximum command retry count.
 *
 * Accepts SCSI_CMD_RETRIES_NO_LIMIT (the -1 named in the error message)
 * or a value in the range 0..SD_MAX_RETRIES.  Anything else, including
 * negative values other than SCSI_CMD_RETRIES_NO_LIMIT, is rejected with
 * -EINVAL and an error is logged.  A parse failure returns the kstrtoint()
 * error.  Returns @count on success.
 */
static ssize_t
max_retries_store(struct device *dev, struct device_attribute *attr,
                  const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdev = sdkp->device;
        int retries, err;

        err = kstrtoint(buf, 10, &retries);
        if (err)
                return err;

        /*
         * The lower bound must be checked explicitly: testing only
         * "retries <= SD_MAX_RETRIES" would let every negative number
         * through, not just the "no limit" marker.
         */
        if (retries == SCSI_CMD_RETRIES_NO_LIMIT ||
            (retries >= 0 && retries <= SD_MAX_RETRIES)) {
                sdkp->max_retries = retries;
                return count;
        }

        sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n",
                    SD_MAX_RETRIES);
        return -EINVAL;
}

static ssize_t
max_retries_show(struct device *dev, struct device_attribute *attr,
                 char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sprintf(buf, "%d\n", sdkp->max_retries);
}

static DEVICE_ATTR_RW(max_retries);

/*
 * NULL-terminated list of the device attributes defined above.
 * ATTRIBUTE_GROUPS(sd_disk) wraps it to provide the sd_disk_groups
 * referenced by sd_disk_class.
 */
static struct attribute *sd_disk_attrs[] = {
        &dev_attr_cache_type.attr,
        &dev_attr_FUA.attr,
        &dev_attr_allow_restart.attr,
        &dev_attr_manage_start_stop.attr,
        &dev_attr_protection_type.attr,
        &dev_attr_protection_mode.attr,
        &dev_attr_app_tag_own.attr,
        &dev_attr_thin_provisioning.attr,
        &dev_attr_provisioning_mode.attr,
        &dev_attr_zeroing_mode.attr,
        &dev_attr_max_write_same_blocks.attr,
        &dev_attr_max_medium_access_timeouts.attr,
        &dev_attr_zoned_cap.attr,
        &dev_attr_max_retries.attr,
        NULL,
};
ATTRIBUTE_GROUPS(sd_disk);

/*
 * The "scsi_disk" device class: its devices carry the sd_disk attribute
 * groups and are released through scsi_disk_release().
 */
static struct class sd_disk_class = {
        .name                = "scsi_disk",
        .owner                = THIS_MODULE,
        .dev_release        = scsi_disk_release,
        .dev_groups        = sd_disk_groups,
};

/*
 * Power management callbacks.  System suspend and poweroff share
 * sd_suspend_system(); runtime suspend uses sd_suspend_runtime(); resume,
 * restore and runtime resume all use sd_resume().
 */
static const struct dev_pm_ops sd_pm_ops = {
        .suspend                = sd_suspend_system,
        .resume                        = sd_resume,
        .poweroff                = sd_suspend_system,
        .restore                = sd_resume,
        .runtime_suspend        = sd_suspend_runtime,
        .runtime_resume                = sd_resume,
};

/*
 * The "sd" SCSI driver description: the embedded generic driver supplies
 * probe/remove/shutdown and the sd_pm_ops power management callbacks
 * (probing prefers asynchronous mode), while the remaining members are the
 * sd hooks for rescan, command setup/teardown, command completion and
 * error handling.
 */
static struct scsi_driver sd_template = {
        .gendrv = {
                .name                = "sd",
                .owner                = THIS_MODULE,
                .probe                = sd_probe,
                .probe_type        = PROBE_PREFER_ASYNCHRONOUS,
                .remove                = sd_remove,
                .shutdown        = sd_shutdown,
                .pm                = &sd_pm_ops,
        },
        .rescan                        = sd_rescan,
        .init_command                = sd_init_command,
        .uninit_command                = sd_uninit_command,
        .done                        = sd_done,
        .eh_action                = sd_eh_action,
        .eh_reset                = sd_eh_reset,
};

/*
 * Placeholder kobj_map ->probe callback that never finds anything.
 * The default ->probe would invoke modprobe, which is pointless because
 * this module is already loaded.  All arguments are ignored and NULL is
 * always returned.
 */
static struct kobject *sd_default_probe(dev_t devt, int *partno, void *data)
{
        struct kobject *no_kobj = NULL;

        return no_kobj;
}

/*
 * Device number to disk mapping:
 *
 *       major         disc2     disc  p1
 *   |............|.............|....|....| <- dev_t
 *    31        20 19          8 7  4 3  0
 *
 * Inside a major we have 16k disks, mapped non-contiguously: the first
 * 16 disks are for major0, the next ones for major1, and so on.  Disk 256
 * is for major0 again, disk 272 for major1, ...
 * As we stay compatible with our numbering scheme, we can reuse the
 * well-known SCSI majors 8, 65--71, 136--143.
 *
 * sd_major() maps a major index in the range 0..15 to its major number;
 * any other index triggers BUG().
 */
static int sd_major(int major_idx)
{
        if (major_idx == 0)
                return SCSI_DISK0_MAJOR;

        if (major_idx >= 1 && major_idx <= 7)
                return SCSI_DISK1_MAJOR + major_idx - 1;

        if (major_idx >= 8 && major_idx <= 15)
                return SCSI_DISK8_MAJOR + major_idx - 8;

        BUG();
        return 0;        /* shut up gcc */
}

/*
 * Take a reference on the scsi_disk behind @disk.
 *
 * Under sd_ref_mutex: if @disk has private data and a reference on the
 * underlying SCSI device can be obtained, a reference on the scsi_disk's
 * embedded device is taken as well and the scsi_disk is returned.
 * Otherwise NULL is returned and no reference is held.
 */
static struct scsi_disk *scsi_disk_get(struct gendisk *disk)
{
        struct scsi_disk *found = NULL;

        mutex_lock(&sd_ref_mutex);

        if (!disk->private_data)
                goto unlock;

        found = scsi_disk(disk);
        if (scsi_device_get(found->device) != 0) {
                found = NULL;
                goto unlock;
        }
        get_device(&found->dev);

unlock:
        mutex_unlock(&sd_ref_mutex);
        return found;
}

/*
 * Drop the references taken by scsi_disk_get(): first the one on the
 * scsi_disk's embedded device, then the one on the SCSI device, both
 * under sd_ref_mutex.
 */
static void scsi_disk_put(struct scsi_disk *sdkp)
{
        /*
         * Fetched before put_device(); presumably because that call may
         * drop the last reference to sdkp - confirm against
         * scsi_disk_release().
         */
        struct scsi_device *scsi_dev = sdkp->device;

        mutex_lock(&sd_ref_mutex);

        put_device(&sdkp->dev);
        scsi_device_put(scsi_dev);

        mutex_unlock(&sd_ref_mutex);
}

#ifdef CONFIG_BLK_SED_OPAL
/*
 * Issue a SECURITY PROTOCOL IN or OUT command.
 *
 * @data:   the scsi_disk to send the command to
 * @spsp:   value stored big-endian in CDB bytes 2-3
 * @secp:   value stored in CDB byte 1
 * @buffer: data buffer of @len bytes
 * @len:    transfer length, stored big-endian in CDB bytes 6-9
 * @send:   true for SECURITY PROTOCOL OUT (data to the device), false for
 *          SECURITY PROTOCOL IN (data from the device)
 *
 * Returns the scsi_execute() result when it is zero or negative, and -EIO
 * when it is positive.
 */
static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
                size_t len, bool send)
{
        struct scsi_disk *sdkp = data;
        u8 cdb[12] = { 0, };
        int direction;
        int ret;

        if (send) {
                cdb[0] = SECURITY_PROTOCOL_OUT;
                direction = DMA_TO_DEVICE;
        } else {
                cdb[0] = SECURITY_PROTOCOL_IN;
                direction = DMA_FROM_DEVICE;
        }
        cdb[1] = secp;
        put_unaligned_be16(spsp, &cdb[2]);
        put_unaligned_be32(len, &cdb[6]);

        ret = scsi_execute(sdkp->device, cdb, direction, buffer, len,
                NULL, NULL, SD_TIMEOUT, sdkp->max_retries, 0,
                RQF_PM, NULL);
        if (ret > 0)
                return -EIO;

        return ret;
}
#endif /* CONFIG_BLK_SED_OPAL */

/*
 * Select the protection operation for a command from three inputs:
 * whether it is a write, and whether DIX and DIF are enabled.
 */
static unsigned int sd_prot_op(bool write, bool dix, bool dif)
{
        /* Indexed by (write << 2) | (dix << 1) | dif. */
        static const unsigned int op_table[] = {
                SCSI_PROT_NORMAL,                /* read,  no dix, no dif */
                SCSI_PROT_READ_STRIP,                /* read,  no dix, dif    */
                SCSI_PROT_READ_INSERT,                /* read,  dix,    no dif */
                SCSI_PROT_READ_PASS,                /* read,  dix,    dif    */
                SCSI_PROT_NORMAL,                /* write, no dix, no dif */
                SCSI_PROT_WRITE_INSERT,                /* write, no dix, dif    */
                SCSI_PROT_WRITE_STRIP,                /* write, dix,    no dif */
                SCSI_PROT_WRITE_PASS,                /* write, dix,    dif    */
        };
        unsigned int idx = 0;

        if (write)
                idx |= 4;
        if (dix)
                idx |= 2;
        if (dif)
                idx |= 1;

        return op_table[idx];
}

/*
 * Return the set of protection flags that may remain set for the given
 * protection operation.  @prot_op indexes the table directly.
 */
static unsigned int sd_prot_flag_mask(unsigned int prot_op)
{
        /* Flag combinations shared by several table entries. */
        enum {
                SD_PROT_CHECKS        = SCSI_PROT_GUARD_CHECK |
                                  SCSI_PROT_REF_CHECK,
                SD_PROT_ALL        = SCSI_PROT_TRANSFER_PI |
                                  SD_PROT_CHECKS |
                                  SCSI_PROT_REF_INCREMENT |
                                  SCSI_PROT_IP_CHECKSUM,
        };
        static const unsigned int valid_flags[] = {
                [SCSI_PROT_NORMAL]        = 0,

                [SCSI_PROT_READ_STRIP]        = SCSI_PROT_TRANSFER_PI |
                                          SD_PROT_CHECKS |
                                          SCSI_PROT_REF_INCREMENT,

                [SCSI_PROT_READ_INSERT]        = SCSI_PROT_REF_INCREMENT |
                                          SCSI_PROT_IP_CHECKSUM,

                [SCSI_PROT_READ_PASS]        = SD_PROT_ALL,

                [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI |
                                          SCSI_PROT_REF_INCREMENT,

                [SCSI_PROT_WRITE_STRIP]        = SD_PROT_CHECKS |
                                          SCSI_PROT_REF_INCREMENT |
                                          SCSI_PROT_IP_CHECKSUM,

                [SCSI_PROT_WRITE_PASS]        = SD_PROT_ALL,
        };

        return valid_flags[prot_op];
}

/*
 * Configure the protection settings of @scmd.
 *
 * Collects protection flags from @dix, @dif and the integrity flags of the
 * request's bio, records the protection operation and type on the command,
 * and limits the command's prot_flags to those valid for the selected
 * operation.
 *
 * Returns the value built for the protect field: 0 when @dif is zero,
 * otherwise 3 << 5 when the bio carries BIP_DISK_NOCHECK and 1 << 5 when
 * it does not.
 */
static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
                                           unsigned int dix, unsigned int dif)
{
        struct bio *bio = scmd->request->bio;
        unsigned int op = sd_prot_op(rq_data_dir(scmd->request), dix, dif);
        unsigned int wanted = 0;
        unsigned int protect = 0;

        /* DIX Type 0, 1, 2, 3 */
        if (dix) {
                if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
                        wanted |= SCSI_PROT_IP_CHECKSUM;

                if (!bio_integrity_flagged(bio, BIP_CTRL_NOCHECK))
                        wanted |= SCSI_PROT_GUARD_CHECK;
        }

        /* DIX/DIF Type 0, 1, 2 */
        if (dif != T10_PI_TYPE3_PROTECTION) {
                wanted |= SCSI_PROT_REF_INCREMENT;

                if (!bio_integrity_flagged(bio, BIP_CTRL_NOCHECK))
                        wanted |= SCSI_PROT_REF_CHECK;
        }

        /* DIX/DIF Type 1, 2, 3 */
        if (dif) {
                wanted |= SCSI_PROT_TRANSFER_PI;

                /* 3 << 5 disables target PI checking, 1 << 5 enables it */
                protect = bio_integrity_flagged(bio, BIP_DISK_NOCHECK) ?
                          3 << 5 : 1 << 5;
        }

        scsi_set_prot_op(scmd, op);
        scsi_set_prot_type(scmd, dif);
        scmd->prot_flags = (scmd->prot_flags | wanted) & sd_prot_flag_mask(op);

        return protect;
}

static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
{
        struct request_queue *q = sdkp->disk->queue;
        unsigned int logical_block_size = sdkp->device->sector_size;
        unsigned int max_blocks = 0;

        q->limits.discard_alignment =
                sdkp->unmap_alignment * logical_block_size;
        q->limits.discard_granularity =
                max(sdkp->physical_block_size,
                    sdkp->unmap_granularity * logical_block_size);
        sdkp->provisioning_mode = mode;

        switch (mode) {

        case SD_LBP_FULL:
        case SD_LBP_DISABLE:
                blk_queue_max_discard_sectors(q, 0);
                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
                return;

        case SD_LBP_UNMAP:
                max_blocks = min_not_zero(sdkp->max_unmap_blocks,
                                          (u32)SD_MAX_WS16_BLOCKS);
                break;

        case SD_LBP_WS16:
                if (sdkp->device->unmap_limit_for_ws)
                        max_blocks = sdkp->max_unmap_blocks;
                else
                        max_blocks = sdkp->max_ws_blocks;

                max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS);
                break;

        case SD_LBP_WS10:
                if (sdkp->device->unmap_limit_for_ws)
                        max_blocks = sdkp->max_unmap_blocks;
                else
                        max_blocks = sdkp->max_ws_blocks;

                max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS);
                break;

        case SD_LBP_ZERO:
                max_blocks = min_not_zero(sdkp->max_ws_blocks,
                                          (u32)SD_MAX_WS10_BLOCKS);
                break;
        }

        blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9));
        blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
}

/*
 * Build an UNMAP command for the range covered by the request.
 *
 * A zeroed page from sd_page_pool is attached to the request as its
 * special payload and filled with a 24-byte parameter list holding a
 * single descriptor (starting LBA and block count).  Returns
 * BLK_STS_RESOURCE if no page is available, otherwise the result of
 * scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdev = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        u64 start_lba = sectors_to_logical(sdev, blk_rq_pos(rq));
        u32 block_count = sectors_to_logical(sdev, blk_rq_sectors(rq));
        unsigned int param_len = 24;
        struct page *page;
        char *param;

        page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
        rq->special_vec.bv_page = page;
        if (!page)
                return BLK_STS_RESOURCE;

        clear_highpage(page);
        rq->special_vec.bv_offset = 0;
        rq->special_vec.bv_len = param_len;
        rq->rq_flags |= RQF_SPECIAL_PAYLOAD;

        cmd->cmd_len = 10;
        cmd->cmnd[0] = UNMAP;
        cmd->cmnd[8] = 24;

        /* Two 16-bit length fields, then one descriptor at offset 8. */
        param = page_address(page);
        put_unaligned_be16(6 + 16, &param[0]);
        put_unaligned_be16(16, &param[2]);
        put_unaligned_be64(start_lba, &param[8]);
        put_unaligned_be32(block_count, &param[16]);

        cmd->allowed = sdkp->max_retries;
        cmd->transfersize = param_len;
        rq->timeout = SD_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

/*
 * Build a WRITE SAME(16) command for the range covered by the request,
 * using a zeroed page from sd_page_pool as a one-logical-block payload.
 *
 * @cmd:   command to prepare
 * @unmap: when true, set the UNMAP bit (0x8) in CDB byte 1 and use
 *         SD_TIMEOUT; otherwise use SD_WRITE_SAME_TIMEOUT
 *
 * Returns BLK_STS_RESOURCE if no page is available, otherwise the result
 * of scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,
                bool unmap)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdev = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        u64 start_lba = sectors_to_logical(sdev, blk_rq_pos(rq));
        u32 block_count = sectors_to_logical(sdev, blk_rq_sectors(rq));
        u32 payload_len = sdev->sector_size;
        struct page *page;

        page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
        rq->special_vec.bv_page = page;
        if (!page)
                return BLK_STS_RESOURCE;

        clear_highpage(page);
        rq->special_vec.bv_offset = 0;
        rq->special_vec.bv_len = payload_len;
        rq->rq_flags |= RQF_SPECIAL_PAYLOAD;

        cmd->cmd_len = 16;
        cmd->cmnd[0] = WRITE_SAME_16;
        if (unmap)
                cmd->cmnd[1] = 0x8; /* UNMAP */
        put_unaligned_be64(start_lba, &cmd->cmnd[2]);
        put_unaligned_be32(block_count, &cmd->cmnd[10]);

        cmd->allowed = sdkp->max_retries;
        cmd->transfersize = payload_len;
        if (unmap)
                rq->timeout = SD_TIMEOUT;
        else
                rq->timeout = SD_WRITE_SAME_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

/*
 * Build a WRITE SAME(10) command for the range covered by the request,
 * using a zeroed page from sd_page_pool as a one-logical-block payload.
 * The LBA is stored as 32 bits and the block count as 16 bits.
 *
 * @cmd:   command to prepare
 * @unmap: when true, set the UNMAP bit (0x8) in CDB byte 1 and use
 *         SD_TIMEOUT; otherwise use SD_WRITE_SAME_TIMEOUT
 *
 * Returns BLK_STS_RESOURCE if no page is available, otherwise the result
 * of scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,
                bool unmap)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdev = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        u64 start_lba = sectors_to_logical(sdev, blk_rq_pos(rq));
        u32 block_count = sectors_to_logical(sdev, blk_rq_sectors(rq));
        u32 payload_len = sdev->sector_size;
        struct page *page;

        page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
        rq->special_vec.bv_page = page;
        if (!page)
                return BLK_STS_RESOURCE;

        clear_highpage(page);
        rq->special_vec.bv_offset = 0;
        rq->special_vec.bv_len = payload_len;
        rq->rq_flags |= RQF_SPECIAL_PAYLOAD;

        cmd->cmd_len = 10;
        cmd->cmnd[0] = WRITE_SAME;
        if (unmap)
                cmd->cmnd[1] = 0x8; /* UNMAP */
        put_unaligned_be32(start_lba, &cmd->cmnd[2]);
        put_unaligned_be16(block_count, &cmd->cmnd[7]);

        cmd->allowed = sdkp->max_retries;
        cmd->transfersize = payload_len;
        if (unmap)
                rq->timeout = SD_TIMEOUT;
        else
                rq->timeout = SD_WRITE_SAME_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));

        if (!(rq->cmd_flags & REQ_NOUNMAP)) {
                switch (sdkp->zeroing_mode) {
                case SD_ZERO_WS16_UNMAP:
                        return sd_setup_write_same16_cmnd(cmd, true);
                case SD_ZERO_WS10_UNMAP:
                        return sd_setup_write_same10_cmnd(cmd, true);
                }
        }

        if (sdp->no_write_same) {
                rq->rq_flags |= RQF_QUIET;
                return BLK_STS_TARGET;
        }

        if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff)
                return sd_setup_write_same16_cmnd(cmd, false);

        return sd_setup_write_same10_cmnd(cmd, false);
}

static void sd_config_write_same(struct scsi_disk *sdkp)
{
        struct request_queue *q = sdkp->disk->queue;
        unsigned int logical_block_size = sdkp->device->sector_size;

        if (sdkp->device->no_write_same) {
                sdkp->max_ws_blocks = 0;
                goto out;
        }

        /* Some devices can not handle block counts above 0xffff despite
         * supporting WRITE SAME(16). Consequently we default to 64k
         * blocks per I/O unless the device explicitly advertises a
         * bigger limit.
         */
        if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS)
                sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks,
                                                   (u32)SD_MAX_WS16_BLOCKS);
        else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes)
                sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks,
                                                   (u32)SD_MAX_WS10_BLOCKS);
        else {
                sdkp->device->no_write_same = 1;
                sdkp->max_ws_blocks = 0;
        }

        if (sdkp->lbprz && sdkp->lbpws)
                sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
        else if (sdkp->lbprz && sdkp->lbpws10)
                sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
        else if (sdkp->max_ws_blocks)
                sdkp->zeroing_mode = SD_ZERO_WS;
        else
                sdkp->zeroing_mode = SD_ZERO_WRITE;

        if (sdkp->max_ws_blocks &&
            sdkp->physical_block_size > logical_block_size) {
                /*
                 * Reporting a maximum number of blocks that is not aligned
                 * on the device physical size would cause a large write same
                 * request to be split into physically unaligned chunks by
                 * __blkdev_issue_write_zeroes() and __blkdev_issue_write_same()
                 * even if the caller of these functions took care to align the
                 * large request. So make sure the maximum reported is aligned
                 * to the device physical block size. This is only an optional
                 * optimization for regular disks, but this is mandatory to
                 * avoid failure of large write same requests directed at
                 * sequential write required zones of host-managed ZBC disks.
                 */
                sdkp->max_ws_blocks =
                        round_down(sdkp->max_ws_blocks,
                                   bytes_to_logical(sdkp->device,
                                                    sdkp->physical_block_size));
        }

out:
        blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
                                         (logical_block_size >> 9));
        blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
                                         (logical_block_size >> 9));
}

/**
 * sd_setup_write_same_cmnd - write the same data to multiple blocks
 * @cmd: command to prepare
 *
 * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on
 * the preference indicated by the target device.
 **/
static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        struct bio *bio = rq->bio;
        u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));
        unsigned int nr_bytes = blk_rq_bytes(rq);
        blk_status_t ret;

        if (sdkp->device->no_write_same)
                return BLK_STS_TARGET;

        BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);

        rq->timeout = SD_WRITE_SAME_TIMEOUT;

        if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) {
                cmd->cmd_len = 16;
                cmd->cmnd[0] = WRITE_SAME_16;
                put_unaligned_be64(lba, &cmd->cmnd[2]);
                put_unaligned_be32(nr_blocks, &cmd->cmnd[10]);
        } else {
                cmd->cmd_len = 10;
                cmd->cmnd[0] = WRITE_SAME;
                put_unaligned_be32(lba, &cmd->cmnd[2]);
                put_unaligned_be16(nr_blocks, &cmd->cmnd[7]);
        }

        cmd->transfersize = sdp->sector_size;
        cmd->allowed = sdkp->max_retries;

        /*
         * For WRITE SAME the data transferred via the DATA OUT buffer is
         * different from the amount of data actually written to the target.
         *
         * We set up __data_len to the amount of data transferred via the
         * DATA OUT buffer so that blk_rq_map_sg sets up the proper S/G list
         * to transfer a single sector of data first, but then reset it to
         * the amount of data to be written right after so that the I/O path
         * knows how much to actually write.
         */
        rq->__data_len = sdp->sector_size;
        ret = scsi_alloc_sgtables(cmd);
        rq->__data_len = nr_bytes;

        return ret;
}

/*
 * Prepare a 10-byte SYNCHRONIZE CACHE command for a flush request.
 * No data is transferred; the request timeout is the queue default
 * scaled by SD_FLUSH_TIMEOUT_MULTIPLIER.  Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
{
        struct request *req = cmd->request;
        struct scsi_disk *disk = scsi_disk(req->rq_disk);

        /* A flush carries no payload, so start from an empty S/G table. */
        memset(&cmd->sdb, 0, sizeof(cmd->sdb));
        cmd->transfersize = 0;

        cmd->cmd_len = 10;
        cmd->cmnd[0] = SYNCHRONIZE_CACHE;
        cmd->allowed = disk->max_retries;

        req->timeout = req->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER;

        return BLK_STS_OK;
}

/*
 * Build a variable-length READ(32)/WRITE(32) CDB.
 *
 * The CDB buffer is taken from sd_cdb_pool and installed in cmd->cmnd;
 * sd_uninit_command() hands it back to the pool.  Returns
 * BLK_STS_RESOURCE if the pool allocation fails, BLK_STS_OK otherwise.
 */
static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags)
{
        unsigned char *cdb = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);

        cmd->cmnd = cdb;
        if (unlikely(!cdb))
                return BLK_STS_RESOURCE;

        memset(cdb, 0, SD_EXT_CDB_SIZE);
        cmd->cmd_len = SD_EXT_CDB_SIZE;

        cdb[0]  = VARIABLE_LENGTH_CMD;
        cdb[7]  = 0x18; /* Additional CDB len */
        cdb[9]  = write ? WRITE_32 : READ_32;
        cdb[10] = flags;
        put_unaligned_be64(lba, &cdb[12]);
        put_unaligned_be32(lba, &cdb[20]); /* Expected Indirect LBA */
        put_unaligned_be32(nr_blocks, &cdb[28]);

        return BLK_STS_OK;
}

/*
 * Build a 16-byte READ/WRITE CDB: 64-bit LBA at byte 2, 32-bit block
 * count at byte 10, @flags in byte 1.  Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags)
{
        unsigned char *cdb = cmd->cmnd;

        cmd->cmd_len = 16;

        cdb[0] = write ? WRITE_16 : READ_16;
        cdb[1] = flags;
        put_unaligned_be64(lba, &cdb[2]);
        put_unaligned_be32(nr_blocks, &cdb[10]);
        cdb[14] = 0;
        cdb[15] = 0;

        return BLK_STS_OK;
}

/*
 * Build a 10-byte READ/WRITE CDB: 32-bit LBA at byte 2, 16-bit block
 * count at byte 7, @flags in byte 1.  Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags)
{
        unsigned char *cdb = cmd->cmnd;

        cmd->cmd_len = 10;

        cdb[0] = write ? WRITE_10 : READ_10;
        cdb[1] = flags;
        put_unaligned_be32(lba, &cdb[2]);
        cdb[6] = 0;
        put_unaligned_be16(nr_blocks, &cdb[7]);
        cdb[9] = 0;

        return BLK_STS_OK;
}

/*
 * Build a 6-byte READ/WRITE CDB.
 *
 * Returns BLK_STS_IOERR for a zero-length transfer or when the FUA bit
 * (0x8) is set in @flags, BLK_STS_OK otherwise.
 */
static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write,
                                      sector_t lba, unsigned int nr_blocks,
                                      unsigned char flags)
{
        unsigned char *cdb = cmd->cmnd;

        /* Avoid that 0 blocks gets translated into 256 blocks. */
        if (WARN_ON_ONCE(nr_blocks == 0))
                return BLK_STS_IOERR;

        /*
         * A FUA request can only end up here when the drive rejected a
         * 10-byte read/write with ILLEGAL_REQUEST at run time, which
         * turned use_10_for_rw off.
         */
        if (unlikely(flags & 0x8)) {
                scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n");
                return BLK_STS_IOERR;
        }

        cmd->cmd_len = 6;

        cdb[0] = write ? WRITE_6 : READ_6;
        /* 21-bit LBA spread over bytes 1..3, most significant bits first. */
        cdb[1] = (lba >> 16) & 0x1f;
        cdb[2] = (lba >> 8) & 0xff;
        cdb[3] = lba & 0xff;
        cdb[4] = nr_blocks;
        cdb[5] = 0;

        return BLK_STS_OK;
}

/*
 * Prepare a READ or WRITE command for the request attached to @cmd.
 *
 * Allocates the S/G tables, validates the request against the device
 * state, capacity and logical block alignment, then picks the 32-, 16-,
 * 10- or 6-byte CDB variant and fills it in.
 *
 * Returns BLK_STS_OK when the command is ready to be queued.  On any
 * failure after the S/G tables were allocated they are released again
 * with scsi_free_sgtables() and the error status is returned.
 */
static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        sector_t threshold;
        unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));
        /* sectors per logical block, minus one; used as an alignment mask */
        unsigned int mask = logical_to_sectors(sdp, 1) - 1;
        bool write = rq_data_dir(rq) == WRITE;
        unsigned char protect, fua;
        blk_status_t ret;
        unsigned int dif;
        bool dix;

        ret = scsi_alloc_sgtables(cmd);
        if (ret != BLK_STS_OK)
                return ret;

        /* Every validation failure below reports a generic I/O error. */
        ret = BLK_STS_IOERR;
        if (!scsi_device_online(sdp) || sdp->changed) {
                scmd_printk(KERN_ERR, cmd, "device offline or changed\n");
                goto fail;
        }

        if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk)) {
                scmd_printk(KERN_ERR, cmd, "access beyond end of device\n");
                goto fail;
        }

        if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) {
                scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n");
                goto fail;
        }

        /*
         * Some SD card readers can't handle accesses which touch the
         * last one or two logical blocks. Split accesses as needed.
         */
        threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS;

        if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) {
                if (lba < threshold) {
                        /* Access up to the threshold but not beyond */
                        nr_blocks = threshold - lba;
                } else {
                        /* Access only a single logical block */
                        nr_blocks = 1;
                }
        }

        /* Zone append may rewrite lba; it is passed by address. */
        if (req_op(rq) == REQ_OP_ZONE_APPEND) {
                ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
                if (ret)
                        goto fail;
        }

        /* 0x8 is the FUA bit as placed in the CDB flags byte. */
        fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
        dix = scsi_prot_sg_count(cmd);
        dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);

        if (dif || dix)
                protect = sd_setup_protect_cmnd(cmd, dix, dif);
        else
                protect = 0;

        /*
         * CDB size selection, first match wins:
         *  - 32 byte: protection bits set on a Type 2 protected disk
         *  - 16 byte: device prefers it, or count exceeds 16 bits
         *  - 10 byte: count or LBA too large for the 6-byte form, device
         *             prefers it, or protection bits are set
         *  -  6 byte: everything else
         */
        if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
                ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua);
        } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) {
                ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua);
        } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) ||
                   sdp->use_10_for_rw || protect) {
                ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua);
        } else {
                ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks,
                                        protect | fua);
        }

        if (unlikely(ret != BLK_STS_OK))
                goto fail;

        /*
         * We shouldn't disconnect in the middle of a sector, so with a dumb
         * host adapter, it's safe to assume that we can at least transfer
         * this many bytes between each connect / disconnect.
         */
        cmd->transfersize = sdp->sector_size;
        cmd->underflow = nr_blocks << 9;
        cmd->allowed = sdkp->max_retries;
        /* nr_blocks may have been trimmed above, so recompute the length. */
        cmd->sdb.length = nr_blocks * sdp->sector_size;

        SCSI_LOG_HLQUEUE(1,
                         scmd_printk(KERN_INFO, cmd,
                                     "%s: block=%llu, count=%d\n", __func__,
                                     (unsigned long long)blk_rq_pos(rq),
                                     blk_rq_sectors(rq)));
        SCSI_LOG_HLQUEUE(2,
                         scmd_printk(KERN_INFO, cmd,
                                     "%s %d/%u 512 byte blocks.\n",
                                     write ? "writing" : "reading", nr_blocks,
                                     blk_rq_sectors(rq)));

        /*
         * This indicates that the command is ready from our end to be queued.
         */
        return BLK_STS_OK;
fail:
        /* Undo the S/G table allocation done at the top. */
        scsi_free_sgtables(cmd);
        return ret;
}

static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;

        switch (req_op(rq)) {
        case REQ_OP_DISCARD:
                switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
                case SD_LBP_UNMAP:
                        return sd_setup_unmap_cmnd(cmd);
                case SD_LBP_WS16:
                        return sd_setup_write_same16_cmnd(cmd, true);
                case SD_LBP_WS10:
                        return sd_setup_write_same10_cmnd(cmd, true);
                case SD_LBP_ZERO:
                        return sd_setup_write_same10_cmnd(cmd, false);
                default:
                        return BLK_STS_TARGET;
                }
        case REQ_OP_WRITE_ZEROES:
                return sd_setup_write_zeroes_cmnd(cmd);
        case REQ_OP_WRITE_SAME:
                return sd_setup_write_same_cmnd(cmd);
        case REQ_OP_FLUSH:
                return sd_setup_flush_cmnd(cmd);
        case REQ_OP_READ:
        case REQ_OP_WRITE:
        case REQ_OP_ZONE_APPEND:
                return sd_setup_read_write_cmnd(cmd);
        case REQ_OP_ZONE_RESET:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
                                                   false);
        case REQ_OP_ZONE_RESET_ALL:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
                                                   true);
        case REQ_OP_ZONE_OPEN:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
        case REQ_OP_ZONE_CLOSE:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
        case REQ_OP_ZONE_FINISH:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);
        default:
                WARN_ON_ONCE(1);
                return BLK_STS_NOTSUPP;
        }
}

/*
 * Release per-command resources: the special payload page (returned to
 * sd_page_pool) and, when the command is not using the CDB embedded in
 * the scsi_request, the CDB buffer (returned to sd_cdb_pool).
 */
static void sd_uninit_command(struct scsi_cmnd *SCpnt)
{
        struct request *rq = SCpnt->request;
        u8 *ext_cdb;

        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
                mempool_free(rq->special_vec.bv_page, sd_page_pool);

        /* Nothing more to do when the built-in CDB is in use. */
        if (SCpnt->cmnd == scsi_req(rq)->cmd)
                return;

        /* Detach the buffer from the command before freeing it. */
        ext_cdb = SCpnt->cmnd;
        SCpnt->cmnd = NULL;
        SCpnt->cmd_len = 0;
        mempool_free(ext_cdb, sd_cdb_pool);
}

/*
 * Decide whether sd_open() should revalidate the disk: either a media
 * change was detected on a removable or write-protected device, or a
 * partition rescan has been requested on the gendisk.
 */
static bool sd_need_revalidate(struct block_device *bdev,
                struct scsi_disk *sdkp)
{
        bool check_media = sdkp->device->removable || sdkp->write_prot;

        if (check_media && bdev_check_media_change(bdev))
                return true;

        /*
         * ioctl(BLKRRPART) is, for historical reasons, also used to
         * force a full revalidate (e.g. after a format), even though
         * the disk state has nothing to do with partitions.
         */
        return test_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
}

/**
 *        sd_open - open a scsi disk device
 *        @bdev: Block device of the scsi disk to open
 *        @mode: FMODE_* mask
 *
 *        Returns 0 if successful. Returns a negated errno value in case
 *        of error.
 *
 *        Note: This can be called from a user context (e.g. fsck(1) )
 *        or from within the kernel (e.g. as a result of a mount(1) ).
 *
 *        Locking: called with bdev->bd_mutex held.
 **/
static int sd_open(struct block_device *bdev, fmode_t mode)
{
        struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk);
        struct scsi_device *sdev;
        int retval;

        if (!sdkp)
                return -ENXIO;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n"));

        sdev = sdkp->device;

        /*
         * Wait for any error recovery in progress to finish; refuse
         * access if the device ended up offline.
         */
        if (!scsi_block_when_processing_errors(sdev)) {
                retval = -ENXIO;
                goto put_disk;
        }

        if (sd_need_revalidate(bdev, sdkp))
                sd_revalidate_disk(bdev->bd_disk);

        /* An empty drive fails the open unless FMODE_NDELAY was given. */
        if (sdev->removable && !sdkp->media_present && !(mode & FMODE_NDELAY)) {
                retval = -ENOMEDIUM;
                goto put_disk;
        }

        /* Write-protected media cannot be opened for writing. */
        if (sdkp->write_prot && (mode & FMODE_WRITE)) {
                retval = -EROFS;
                goto put_disk;
        }

        /*
         * The media change handling above may have taken the device
         * offline.  Report that instead of pretending the open worked.
         */
        if (!scsi_device_online(sdev)) {
                retval = -ENXIO;
                goto put_disk;
        }

        /* First opener of a removable device: lock the medium in. */
        if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) {
                if (scsi_block_when_processing_errors(sdev))
                        scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
        }

        return 0;

put_disk:
        scsi_disk_put(sdkp);
        return retval;
}

/**
 *        sd_release - invoked when the (last) close(2) is called on this
 *        scsi disk.
 *        @disk: disk to release
 *        @mode: FMODE_* mask
 *
 *        Drops one opener; when the count reaches zero on a removable
 *        device, medium removal is allowed again.  Also drops the
 *        scsi_disk reference via scsi_disk_put().
 *
 *        Note: may block (uninterruptible) if error recovery is underway
 *        on this disk.
 *
 *        Locking: called with bdev->bd_mutex held.
 **/
static void sd_release(struct gendisk *disk, fmode_t mode)
{
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdev = sdkp->device;
        bool last_close;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n"));

        last_close = atomic_dec_return(&sdkp->openers) == 0;
        if (last_close && sdev->removable &&
            scsi_block_when_processing_errors(sdev))
                scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);

        scsi_disk_put(sdkp);
}

/*
 * Report a BIOS-style geometry for the disk.  Starts from 64 heads,
 * 32 sectors and capacity/2048 cylinders, then lets the host template's
 * bios_param() hook (or scsicam_bios_param() if there is none) override
 * those values.  Always returns 0.
 */
static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
        struct scsi_device *sdp = sdkp->device;
        struct Scsi_Host *host = sdp->host;
        sector_t capacity = logical_to_sectors(sdp, sdkp->capacity);
        int geom[4];

        /* default to most commonly used values */
        geom[0] = 0x40;        /* heads:   1 << 6 */
        geom[1] = 0x20;        /* sectors: 1 << 5 */
        geom[2] = capacity >> 11;

        /* override with calculated, extended default, or driver values */
        if (!host->hostt->bios_param)
                scsicam_bios_param(bdev, capacity, geom);
        else
                host->hostt->bios_param(sdp, bdev, capacity, geom);

        geo->heads = geom[0];
        geo->sectors = geom[1];
        geo->cylinders = geom[2];

        return 0;
}

/**
 *        sd_ioctl_common - process an ioctl
 *        @bdev: target block device
 *        @mode: FMODE_* mask
 *        @cmd: ioctl command number
 *        @p: this is third argument given to ioctl(2) system call.
 *        Often contains a pointer.
 *
 *        Returns 0 if successful (some ioctls return positive numbers on
 *        success as well). Returns a negated errno value in case of error.
 *
 *        Note: most ioctls are forwarded onto the block subsystem or
 *        further down in the scsi subsystem.
 **/
static int sd_ioctl_common(struct block_device *bdev, fmode_t mode,
                           unsigned int cmd, void __user *p)
{
        struct gendisk *disk = bdev->bd_disk;
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdp = sdkp->device;
        int error;

        SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, "
                                    "cmd=0x%x\n", disk->disk_name, cmd));

        error = scsi_verify_blk_ioctl(bdev, cmd);
        if (error < 0)
                return error;

        /*
         * Keep everyone else off the device while error recovery is
         * running.  If recovery fails the device may be taken offline,
         * after which no further access is permitted.
         */
        error = scsi_ioctl_block_when_processing_errors(sdp, cmd,
                        (mode & FMODE_NDELAY) != 0);
        if (error)
                return error;

        if (is_sed_ioctl(cmd))
                return sed_ioctl(sdkp->opal_dev, cmd, p);

        /*
         * SCSI addressing ioctls go straight to the mid level; anything
         * else goes to the block level first, which passes it on to the
         * mid level if it cannot resolve it.
         */
        if (cmd == SCSI_IOCTL_GET_IDLUN || cmd == SCSI_IOCTL_GET_BUS_NUMBER)
                return scsi_ioctl(sdp, cmd, p);

        return scsi_cmd_blk_ioctl(bdev, mode, cmd, p);
}

static void set_media_not_present(struct scsi_disk *sdkp)
{
        if (sdkp->media_present)
                sdkp->device->changed = 1;

        if (sdkp->device->removable) {
                sdkp->media_present = 0;
                sdkp->capacity = 0;
        }
}

static int media_not_present(struct scsi_disk *sdkp,
                             struct scsi_sense_hdr *sshdr)
{
        if (!scsi_sense_valid(sshdr))
                return 0;

        /* not invoked for commands that could return deferred errors */
        switch (sshdr->sense_key) {
        case UNIT_ATTENTION:
        case NOT_READY:
                /* medium not present */
                if (sshdr->asc == 0x3A) {
                        set_media_not_present(sdkp);
                        return 1;
                }
        }
        return 0;
}

/**
 *        sd_check_events - check media events
 *        @disk: kernel device descriptor
 *        @clearing: disk events currently being cleared
 *
 *        Returns mask of DISK_EVENT_*.
 *
 *        Note: this function is invoked from the block subsystem.
 **/
static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
{
        struct scsi_disk *sdkp = scsi_disk_get(disk);
        struct scsi_sense_hdr sshdr = { 0, };
        struct scsi_device *sdp;
        int retval;

        if (!sdkp)
                return 0;

        sdp = sdkp->device;
        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n"));

        /*
         * Never send commands to an offline device; behave as if the
         * command had failed.  Devices only go offline because of
         * unrecoverable errors, and if one ever comes back online it
         * can be dealt with then.
         */
        if (!scsi_device_online(sdp)) {
                set_media_not_present(sdkp);
                goto report;
        }

        /*
         * TEST UNIT READY tells apart a drive with no cartridge loaded
         * (NOT READY), one whose cartridge changed (UNIT ATTENTION) and
         * one with the same cartridge (GOOD STATUS).
         *
         * Drives that auto spin down. eg iomega jaz 1G, will be started
         * by sd_spinup_disk() from sd_revalidate_disk(), which happens
         * whenever sd_revalidate() is called.
         */
        if (scsi_block_when_processing_errors(sdp)) {
                retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries,
                                              &sshdr);

                /* failed to execute TUR, assume media not present */
                if (host_byte(retval)) {
                        set_media_not_present(sdkp);
                        goto report;
                }

                if (media_not_present(sdkp, &sshdr))
                        goto report;
        }

        /*
         * A medium is present.  If it was absent before, that counts
         * as a change.
         */
        if (!sdkp->media_present)
                sdp->changed = 1;
        sdkp->media_present = 1;

report:
        /*
         * sdp->changed is set under the following conditions:
         *
         *        Medium present state has changed in either direction.
         *        Device has indicated UNIT_ATTENTION.
         */
        retval = sdp->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
        sdp->changed = 0;
        scsi_disk_put(sdkp);

        return retval;
}

static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
{
        int retries, res;
        struct scsi_device *sdp = sdkp->device;
        const int timeout = sdp->request_queue->rq_timeout
                * SD_FLUSH_TIMEOUT_MULTIPLIER;
        struct scsi_sense_hdr my_sshdr;

        if (!scsi_device_online(sdp))
                return -ENODEV;

        /* caller might not be interested in sense, but we need it */
        if (!sshdr)
                sshdr = &my_sshdr;

        for (retries = 3; retries > 0; --retries) {
                unsigned char cmd[10] = { 0 };

                cmd[0] = SYNCHRONIZE_CACHE;
                /*
                 * Leave the rest of the command zero to indicate
                 * flush everything.
                 */
                res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, sshdr,
                                timeout, sdkp->max_retries, 0, RQF_PM, NULL);
                if (res == 0)
                        break;
        }

        if (res) {
                sd_print_result(sdkp, "Synchronize Cache(10) failed", res);

                if (driver_byte(res) == DRIVER_SENSE)
                        sd_print_sense_hdr(sdkp, sshdr);

                /* we need to evaluate the error return  */
                if (scsi_sense_valid(sshdr) &&
                        (sshdr->asc == 0x3a ||        /* medium not present */
                         sshdr->asc == 0x20 ||        /* invalid command */
                         (sshdr->asc == 0x74 && sshdr->ascq == 0x71)))        /* drive is password locked */
                                /* this is no error here */
                                return 0;

                switch (host_byte(res)) {
                /* ignore errors due to racing a disconnection */
                case DID_BAD_TARGET:
                case DID_NO_CONNECT:
                        return 0;
                /* signal the upper layer it might try again */
                case DID_BUS_BUSY:
                case DID_IMM_RETRY:
                case DID_REQUEUE:
                case DID_SOFT_ERROR:
                        return -EBUSY;
                default:
                        return -EIO;
                }
        }
        return 0;
}

/* Revalidate the disk attached to @dev as driver data. */
static void sd_rescan(struct device *dev)
{
        struct scsi_disk *disk_data = dev_get_drvdata(dev);
        struct gendisk *gd = disk_data->disk;

        sd_revalidate_disk(gd);
}

/*
 * Native ioctl entry point.  Tries the common handler first and hands
 * the request to scsi_ioctl() only if that returned -ENOTTY.
 */
static int sd_ioctl(struct block_device *bdev, fmode_t mode,
                    unsigned int cmd, unsigned long arg)
{
        void __user *p = (void __user *)arg;
        int ret = sd_ioctl_common(bdev, mode, cmd, p);

        if (ret == -ENOTTY)
                ret = scsi_ioctl(scsi_disk(bdev->bd_disk)->device, cmd, p);

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point.  Converts the argument with compat_ptr(),
 * tries the common handler first and hands the request to
 * scsi_compat_ioctl() only if that returned -ENOTTY.
 */
static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        void __user *p = compat_ptr(arg);
        int ret = sd_ioctl_common(bdev, mode, cmd, p);

        if (ret == -ENOTTY)
                ret = scsi_compat_ioctl(scsi_disk(bdev->bd_disk)->device,
                                        cmd, p);

        return ret;
}
#endif

/*
 * Translate a block layer persistent reservation type into the type
 * code that sd_pr_command() places in the PERSISTENT RESERVE OUT CDB.
 * Unrecognised types map to 0.
 */
static char sd_pr_type(enum pr_type type)
{
        switch (type) {
        case PR_WRITE_EXCLUSIVE:
                return 0x01;
        case PR_EXCLUSIVE_ACCESS:
                return 0x03;
        case PR_WRITE_EXCLUSIVE_REG_ONLY:
                return 0x05;
        case PR_EXCLUSIVE_ACCESS_REG_ONLY:
                return 0x06;
        case PR_WRITE_EXCLUSIVE_ALL_REGS:
                return 0x07;
        case PR_EXCLUSIVE_ACCESS_ALL_REGS:
                return 0x08;
        default:
                return 0;
        }
}

/*
 * Send a PERSISTENT RESERVE OUT command.
 * @sa:     service action, placed in CDB byte 1
 * @key:    written at offset 0 of the parameter data
 * @sa_key: written at offset 8 of the parameter data
 * @type:   placed in CDB byte 2
 * @flags:  placed in byte 20 of the parameter data
 *
 * Returns the result of scsi_execute_req().  When the result carries
 * valid sense data, the failure and the sense header are logged.
 */
static int sd_pr_command(struct block_device *bdev, u8 sa,
                u64 key, u64 sa_key, u8 type, u8 flags)
{
        struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
        struct scsi_device *sdev = sdkp->device;
        struct scsi_sense_hdr sshdr;
        u8 cdb[16] = { 0, };
        u8 payload[24] = { 0, };
        int result;

        /* Parameter data: the two keys followed by the flags byte. */
        put_unaligned_be64(key, &payload[0]);
        put_unaligned_be64(sa_key, &payload[8]);
        payload[20] = flags;

        cdb[0] = PERSISTENT_RESERVE_OUT;
        cdb[1] = sa;
        cdb[2] = type;
        put_unaligned_be32(sizeof(payload), &cdb[5]);

        result = scsi_execute_req(sdev, cdb, DMA_TO_DEVICE, payload,
                        sizeof(payload), &sshdr, SD_TIMEOUT,
                        sdkp->max_retries, NULL);

        if (driver_byte(result) != DRIVER_SENSE || !scsi_sense_valid(&sshdr))
                return result;

        sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result);
        scsi_print_sense_hdr(sdev, NULL, &sshdr);

        return result;
}

/*
 * pr_ops register hook.  Only PR_FL_IGNORE_KEY is accepted in @flags;
 * any other bit yields -EOPNOTSUPP.  Uses service action 0x06 when the
 * flag is set and 0x00 otherwise, always with the APTPL bit set.
 */
static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
                u32 flags)
{
        u8 sa;

        if (flags & ~PR_FL_IGNORE_KEY)
                return -EOPNOTSUPP;

        if (flags & PR_FL_IGNORE_KEY)
                sa = 0x06;
        else
                sa = 0x00;

        return sd_pr_command(bdev, sa, old_key, new_key, 0,
                        (1 << 0) /* APTPL */);
}

/*
 * pr_ops reserve hook: service action 0x01 with the translated
 * reservation type.  No flags are supported; a non-zero @flags yields
 * -EOPNOTSUPP.
 */
static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
                u32 flags)
{
        u8 scsi_type;

        if (flags)
                return -EOPNOTSUPP;

        scsi_type = sd_pr_type(type);

        return sd_pr_command(bdev, 0x01, key, 0, scsi_type, 0);
}

/* pr_ops release hook: service action 0x02 with the translated type. */
static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
        u8 scsi_type = sd_pr_type(type);

        return sd_pr_command(bdev, 0x02, key, 0, scsi_type, 0);
}

/*
 * pr_ops preempt hook: service action 0x05 when @abort is set, 0x04
 * otherwise, with the translated reservation type.
 */
static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
                enum pr_type type, bool abort)
{
        u8 sa = 0x04;

        if (abort)
                sa = 0x05;

        return sd_pr_command(bdev, sa, old_key, new_key, sd_pr_type(type), 0);
}

/* pr_ops clear hook: service action 0x03, no type, no second key. */
static int sd_pr_clear(struct block_device *bdev, u64 key)
{
        const u8 sa = 0x03;

        return sd_pr_command(bdev, sa, key, 0, 0, 0);
}

/* Persistent reservation operations, hooked up through sd_fops.pr_ops. */
static const struct pr_ops sd_pr_ops = {
        /* establishing and taking over reservations */
        .pr_register = sd_pr_register,
        .pr_reserve = sd_pr_reserve,
        .pr_preempt = sd_pr_preempt,

        /* giving reservations up */
        .pr_release = sd_pr_release,
        .pr_clear = sd_pr_clear,
};

/* Block device operations for SCSI disks. */
static const struct block_device_operations sd_fops = {
        .owner = THIS_MODULE,

        /* open/close */
        .open = sd_open,
        .release = sd_release,

        /* ioctl paths */
        .ioctl = sd_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = sd_compat_ioctl,
#endif
        .pr_ops = &sd_pr_ops,

        /* geometry, media and zone queries */
        .getgeo = sd_getgeo,
        .check_events = sd_check_events,
        .unlock_native_capacity = sd_unlock_native_capacity,
        .report_zones = sd_zbc_report_zones,
};

/**
 *        sd_eh_reset - reset error handling callback
 *        @scmd:                sd-issued command that has failed
 *
 *        This function is called by the SCSI midlayer before starting
 *        SCSI EH. Medium access failures must be counted only once per
 *        device and SCSI EH run: several commands may have timed out,
 *        and counting each of them would let the
 *        'max_medium_access_timeouts' counter trigger after the first
 *        SCSI EH run already and set the device to offline.
 *        So this function re-arms the counting gate before SCSI EH
 *        starts.
 **/
static void sd_eh_reset(struct scsi_cmnd *scmd)
{
        struct gendisk *gd = scmd->request->rq_disk;
        struct scsi_disk *sdkp = scsi_disk(gd);

        /* New SCSI EH run: allow one more timeout to be counted. */
        sdkp->ignore_medium_access_errors = false;
}

/**
 *        sd_eh_action - error handling callback
 *        @scmd:                sd-issued command that has failed
 *        @eh_disp:        The recovery disposition suggested by the midlayer
 *
 *        This function is called by the SCSI midlayer upon completion of an
 *        error test command (currently TEST UNIT READY). The result of sending
 *        the eh command is passed in eh_disp.  We're looking for devices that
 *        fail medium access commands but are OK with non access commands like
 *        test unit ready (so wrongly see the device as having a successful
 *        recovery)
 *
 *        Returns @eh_disp unchanged, or SUCCESS after the device has been
 *        set offline because too many medium access timeouts accumulated.
 **/
static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
{
        struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
        struct scsi_device *sdev = scmd->device;

        /* Only timed-out medium access commands on online devices count. */
        if (!scsi_device_online(sdev))
                return eh_disp;
        if (!scsi_medium_access_command(scmd))
                return eh_disp;
        if (host_byte(scmd->result) != DID_TIME_OUT)
                return eh_disp;
        if (eh_disp != SUCCESS)
                return eh_disp;

        /*
         * A medium access command timed out, yet the TEST UNIT READY
         * sent during error handling completed successfully.  Either
         * the device is in the process of recovering or it has
         * suffered an internal failure that prevents access to the
         * storage medium.  Count this once per EH run.
         */
        if (!sdkp->ignore_medium_access_errors) {
                sdkp->medium_access_timed_out++;
                sdkp->ignore_medium_access_errors = true;
        }

        if (sdkp->medium_access_timed_out < sdkp->max_medium_access_timeouts)
                return eh_disp;

        /*
         * Read/write commands keep failing while TEST UNIT READY always
         * succeeds: assume the medium can no longer be accessed and
         * take the device offline.
         */
        scmd_printk(KERN_ERR, scmd,
                    "Medium access timeout failure. Offlining disk!\n");
        mutex_lock(&sdev->state_mutex);
        scsi_device_set_state(sdev, SDEV_OFFLINE);
        mutex_unlock(&sdev->state_mutex);

        return SUCCESS;
}

/*
 * Work out how many bytes of a failed command can be treated as good,
 * based on the bad LBA reported in the sense information field.
 *
 * Returns 0 when the payload is no larger than one logical block, when
 * no information field is available, or when the reported LBA lies
 * outside the request.  Otherwise returns the number of bytes before
 * the bad LBA, capped at the number of bytes actually transferred.
 */
static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
{
        struct request *req = scmd->request;
        struct scsi_device *sdev = scmd->device;
        unsigned int buflen = scsi_bufflen(scmd);
        unsigned int transferred, good_bytes;
        u64 start_lba, end_lba, bad_lba;

        /*
         * Some commands have a payload smaller than the device logical
         * block size (e.g. INQUIRY on a 4K disk).
         */
        if (buflen <= sdev->sector_size)
                return 0;

        /* Is there 'bad_lba' information in the sense data at all? */
        if (!scsi_get_sense_info_fld(scmd->sense_buffer,
                                     SCSI_SENSE_BUFFERSIZE,
                                     &bad_lba))
                return 0;

        /*
         * A bad LBA outside the request means it was reported
         * incorrectly, so the location of the error is unknown.
         */
        start_lba = sectors_to_logical(sdev, blk_rq_pos(req));
        end_lba = start_lba + bytes_to_logical(sdev, buflen);
        if (!(bad_lba >= start_lba && bad_lba < end_lba))
                return 0;

        /*
         * resid is optional but mostly filled in.  When it's unused,
         * its value is zero, so we assume the whole buffer transferred
         */
        transferred = buflen - scsi_get_resid(scmd);

        /*
         * This computation should always be done in terms of the
         * resolution of the device's medium.
         */
        good_bytes = logical_to_bytes(sdev, bad_lba - start_lba);

        if (good_bytes < transferred)
                return good_bytes;

        return transferred;
}

/**
 *        sd_done - bottom half handler: called when the lower level
 *        driver has completed (successfully or otherwise) a scsi command.
 *        @SCpnt: mid-level's per command structure.
 *
 *        Adjusts the command's residual where needed, inspects the sense
 *        data of failed commands and returns the number of bytes that are
 *        considered to have completed successfully.
 *
 *        Note: potentially run from within an ISR. Must not block.
 **/
static int sd_done(struct scsi_cmnd *SCpnt)
{
        int result = SCpnt->result;
        /* Default verdict: everything on success, nothing on any error. */
        unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
        unsigned int sector_size = SCpnt->device->sector_size;
        unsigned int resid;
        struct scsi_sense_hdr sshdr;
        struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
        struct request *req = SCpnt->request;
        int sense_valid = 0;
        int sense_deferred = 0;

        switch (req_op(req)) {
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_RESET_ALL:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                /*
                 * These operations are accounted all-or-nothing against the
                 * block request size rather than the SCSI data buffer:
                 * the whole request with zero residual on success, nothing
                 * with a full residual on failure.
                 */
                if (!result) {
                        good_bytes = blk_rq_bytes(req);
                        scsi_set_resid(SCpnt, 0);
                } else {
                        good_bytes = 0;
                        scsi_set_resid(SCpnt, blk_rq_bytes(req));
                }
                break;
        default:
                /*
                 * In case of bogus fw or device, we could end up having
                 * an unaligned partial completion. Check this here and force
                 * alignment.
                 */
                resid = scsi_get_resid(SCpnt);
                if (resid & (sector_size - 1)) {
                        sd_printk(KERN_INFO, sdkp,
                                "Unaligned partial completion (resid=%u, sector_sz=%u)\n",
                                resid, sector_size);
                        scsi_print_command(SCpnt);
                        /*
                         * Round the residual up to a whole number of
                         * sectors, but never beyond the buffer length.
                         */
                        resid = min(scsi_bufflen(SCpnt),
                                    round_up(resid, sector_size));
                        scsi_set_resid(SCpnt, resid);
                }
        }

        /* sshdr is only filled in here, i.e. only for failed commands. */
        if (result) {
                sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr);
                if (sense_valid)
                        sense_deferred = scsi_sense_is_deferred(&sshdr);
        }
        /* Reset on every completion, whatever its outcome. */
        sdkp->medium_access_timed_out = 0;

        /*
         * Only look at the sense key when the driver byte flags sense
         * data, or when valid, non-deferred sense data is present.
         */
        if (driver_byte(result) != DRIVER_SENSE &&
            (!sense_valid || sense_deferred))
                goto out;

        switch (sshdr.sense_key) {
        case HARDWARE_ERROR:
        case MEDIUM_ERROR:
                /* Credit the data transferred before the failing block. */
                good_bytes = sd_completed_bytes(SCpnt);
                break;
        case RECOVERED_ERROR:
                /* The whole buffer counts as completed. */
                good_bytes = scsi_bufflen(SCpnt);
                break;
        case NO_SENSE:
                /* This indicates a false check condition, so ignore it.  An
                 * unknown amount of data was transferred so treat it as an
                 * error.
                 */
                SCpnt->result = 0;
                memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
                break;
        case ABORTED_COMMAND:
                if (sshdr.asc == 0x10)  /* DIF: Target detected corruption */
                        good_bytes = sd_completed_bytes(SCpnt);
                break;
        case ILLEGAL_REQUEST:
                switch (sshdr.asc) {
                case 0x10:        /* DIX: Host detected corruption */
                        good_bytes = sd_completed_bytes(SCpnt);
                        break;
                case 0x20:        /* INVALID COMMAND OPCODE */
                case 0x24:        /* INVALID FIELD IN CDB */
                        /*
                         * The device rejected the command itself: turn off
                         * the corresponding feature based on the opcode.
                         */
                        switch (SCpnt->cmnd[0]) {
                        case UNMAP:
                                sd_config_discard(sdkp, SD_LBP_DISABLE);
                                break;
                        case WRITE_SAME_16:
                        case WRITE_SAME:
                                if (SCpnt->cmnd[1] & 8) { /* UNMAP */
                                        sd_config_discard(sdkp, SD_LBP_DISABLE);
                                } else {
                                        /*
                                         * Plain WRITE SAME is unsupported:
                                         * record that on the device,
                                         * reconfigure, and mark the request
                                         * quiet.
                                         */
                                        sdkp->device->no_write_same = 1;
                                        sd_config_write_same(sdkp);
                                        req->rq_flags |= RQF_QUIET;
                                }
                                break;
                        }
                }
                break;
        default:
                break;
        }

 out:
        /* Zoned disks get a chance to adjust the byte count. */
        if (sd_is_zoned(sdkp))
                good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr);

        SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
                                           "sd_done: completed %d of %d bytes\n",
                                           good_bytes, scsi_bufflen(SCpnt)));

        return good_bytes;
}

/*
 * spinup disk - called only in sd_revalidate_disk()
 *
 * Polls the device with TEST UNIT READY and, when it reports NOT READY,
 * issues a START STOP UNIT (start) command once and keeps polling at one
 * second intervals until the unit is ready, the wait deadline passes, or
 * a sense code shows that waiting is pointless.  Returns early without
 * further polling when the device reports that no medium is present.
 */
static void
sd_spinup_disk(struct scsi_disk *sdkp)
{
        unsigned char cmd[10];
        unsigned long spintime_expire = 0;
        int retries, spintime;
        unsigned int the_result;
        struct scsi_sense_hdr sshdr;
        int sense_valid = 0;

        /* Non-zero once a wait deadline (spintime_expire) has been armed. */
        spintime = 0;

        /* Spin up drives, as required.  Only do this at boot time */
        /* Spinup needs to be done for module loads too. */
        do {
                retries = 0;

                /*
                 * Issue TEST UNIT READY up to three times, repeating while
                 * the status is not good or the device answers with a
                 * UNIT ATTENTION.
                 */
                do {
                        cmd[0] = TEST_UNIT_READY;
                        memset((void *) &cmd[1], 0, 9);

                        the_result = scsi_execute_req(sdkp->device, cmd,
                                                      DMA_NONE, NULL, 0,
                                                      &sshdr, SD_TIMEOUT,
                                                      sdkp->max_retries, NULL);

                        /*
                         * If the drive has indicated to us that it
                         * doesn't have any media in it, don't bother
                         * with any more polling.
                         */
                        if (media_not_present(sdkp, &sshdr))
                                return;

                        if (the_result)
                                sense_valid = scsi_sense_valid(&sshdr);
                        retries++;
                } while (retries < 3 && 
                         (!scsi_status_is_good(the_result) ||
                          ((driver_byte(the_result) == DRIVER_SENSE) &&
                          sense_valid && sshdr.sense_key == UNIT_ATTENTION)));

                if (driver_byte(the_result) != DRIVER_SENSE) {
                        /* no sense, TUR either succeeded or failed
                         * with a status error */
                        if(!spintime && !scsi_status_is_good(the_result)) {
                                sd_print_result(sdkp, "Test Unit Ready failed",
                                                the_result);
                        }
                        break;
                }

                /*
                 * The device does not want the automatic start to be issued.
                 */
                if (sdkp->device->no_start_on_add)
                        break;

                if (sense_valid && sshdr.sense_key == NOT_READY) {
                        /* NOT READY states that a start command cannot fix. */
                        if (sshdr.asc == 4 && sshdr.ascq == 3)
                                break;        /* manual intervention required */
                        if (sshdr.asc == 4 && sshdr.ascq == 0xb)
                                break;        /* standby */
                        if (sshdr.asc == 4 && sshdr.ascq == 0xc)
                                break;        /* unavailable */
                        if (sshdr.asc == 4 && sshdr.ascq == 0x1b)
                                break;        /* sanitize in progress */
                        /*
                         * Issue command to spin up drive when not ready
                         */
                        if (!spintime) {
                                /*
                                 * No newline: the dots and the final status
                                 * are appended with KERN_CONT below.
                                 */
                                sd_printk(KERN_NOTICE, sdkp, "Spinning up disk...");
                                cmd[0] = START_STOP;
                                cmd[1] = 1;        /* Return immediately */
                                memset((void *) &cmd[2], 0, 8);
                                cmd[4] = 1;        /* Start spin cycle */
                                if (sdkp->device->start_stop_pwr_cond)
                                        cmd[4] |= 1 << 4;
                                scsi_execute_req(sdkp->device, cmd, DMA_NONE,
                                                 NULL, 0, &sshdr,
                                                 SD_TIMEOUT, sdkp->max_retries,
                                                 NULL);
                                /* Allow up to 100 seconds for the spin-up. */
                                spintime_expire = jiffies + 100 * HZ;
                                spintime = 1;
                        }
                        /* Wait 1 second for next try */
                        msleep(1000);
                        printk(KERN_CONT ".");

                /*
                 * Wait for USB flash devices with slow firmware.
                 * Yes, this sense key/ASC combination shouldn't
                 * occur here.  It's characteristic of these devices.
                 */
                } else if (sense_valid &&
                                sshdr.sense_key == UNIT_ATTENTION &&
                                sshdr.asc == 0x28) {
                        if (!spintime) {
                                /* Shorter deadline here: 5 seconds. */
                                spintime_expire = jiffies + 5 * HZ;
                                spintime = 1;
                        }
                        /* Wait 1 second for next try */
                        msleep(1000);
                } else {
                        /* we don't understand the sense code, so it's
                         * probably pointless to loop */
                        if(!spintime) {
                                sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
                                sd_print_sense_hdr(sdkp, &sshdr);
                        }
                        break;
                }
                                
        } while (spintime && time_before_eq(jiffies, spintime_expire));

        /*
         * If a wait was started, finish the log line according to the
         * status of the last TEST UNIT READY.
         */
        if (spintime) {
                if (scsi_status_is_good(the_result))
                        printk(KERN_CONT "ready\n");
                else
                        printk(KERN_CONT "not responding...\n");
        }
}

/*
 * Determine whether disk supports Data Integrity Field.
 *
 * @buffer holds READ CAPACITY(16) data; byte 12 carries the protection
 * enable bit (bit 0) and the protection type field (bits 1-3).
 *
 * Stores the detected type in sdkp->protection_type and returns:
 *   0        protection is not in use (not enabled on the device/disk, or
 *            the host cannot do DIF of that type),
 *   1        the host is DIF capable for the detected type,
 *   -ENODEV  the disk is formatted with a type beyond Type 3.
 */
static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        int status;
        u8 pi_type;

        if (!scsi_device_protection(sdp) || !(buffer[12] & 1)) {
                sdkp->protection_type = 0;
                return 0;
        }

        /* P_TYPE 0 = Type 1 */
        pi_type = 1 + ((buffer[12] >> 1) & 7);

        if (pi_type > T10_PI_TYPE3_PROTECTION)
                status = -ENODEV;
        else
                status = scsi_host_dif_capable(sdp->host, pi_type) ? 1 : 0;

        /* Only report on the first scan or when the type has changed. */
        if (sdkp->first_scan || pi_type != sdkp->protection_type) {
                if (status == -ENODEV)
                        sd_printk(KERN_ERR, sdkp, "formatted with unsupported"
                                  " protection type %u. Disabling disk!\n",
                                  pi_type);
                else if (status == 1)
                        sd_printk(KERN_NOTICE, sdkp,
                                  "Enabling DIF Type %u protection\n",
                                  pi_type);
                else
                        sd_printk(KERN_NOTICE, sdkp,
                                  "Disabling DIF Type %u protection\n",
                                  pi_type);
        }

        sdkp->protection_type = pi_type;

        return status;
}

/*
 * Common failure handling for READ CAPACITY(10) and READ CAPACITY(16):
 * log whatever sense information is available, flag missing media on
 * removable devices that answered NOT READY, and zero the capacity.
 */
static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
                        struct scsi_sense_hdr *sshdr, int sense_valid,
                        int the_result)
{
        if (driver_byte(the_result) != DRIVER_SENSE)
                sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n");
        else
                sd_print_sense_hdr(sdkp, sshdr);

        /*
         * Removable devices that are not ready get their media marked as
         * not present - sometimes drives will not report this properly.
         */
        if (sense_valid && sdp->removable && sshdr->sense_key == NOT_READY)
                set_media_not_present(sdkp);

        /*
         * media_present is deliberately not cleared for every failure:
         * some drives fail read capacity even with media present.
         * An unknown capacity is mapped to zero - as usual.
         */
        sdkp->capacity = 0;
}

/*
 * Number of bytes of READ CAPACITY(16) data requested from the device.
 * read_capacity_16() uses it both as the allocation length in the CDB
 * and as the transfer length, so it must fit in the SD_BUF_SIZE buffer.
 */
#define RC16_LEN 32
#if RC16_LEN > SD_BUF_SIZE
#error RC16_LEN must not be more than SD_BUF_SIZE
#endif

/*
 * Budget of READ CAPACITY attempts that may be answered with a
 * UNIT ATTENTION (ASC 0x29, ASCQ 0x00) without consuming one of the
 * regular retries.
 */
#define READ_CAPACITY_RETRIES_ON_RESET        10

/*
 * Issue READ CAPACITY(16) and record the results in @sdkp: capacity,
 * physical block size, RC basis, alignment offset, protection type and
 * the logical block provisioning bits.
 *
 * @buffer must be able to hold at least RC16_LEN bytes.
 *
 * Returns the logical block size reported by the device on success,
 * -ENODEV when no medium is present or the protection type is
 * unsupported, and -EINVAL when the command is disabled for this device,
 * rejected by it, or keeps failing.
 */
static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
                                                unsigned char *buffer)
{
        struct scsi_sense_hdr sshdr;
        unsigned char cmd[16];
        unsigned long long lba;
        unsigned int alignment, lowest_aligned_lba, pb_exponent;
        unsigned sector_size;
        int attempts_left = 3;
        int resets_left = READ_CAPACITY_RETRIES_ON_RESET;
        int sense_valid = 0;
        int the_result;

        if (sdp->no_read_capacity_16)
                return -EINVAL;

        for (;;) {
                memset(cmd, 0, sizeof(cmd));
                cmd[0] = SERVICE_ACTION_IN_16;
                cmd[1] = SAI_READ_CAPACITY_16;
                cmd[13] = RC16_LEN;
                memset(buffer, 0, RC16_LEN);

                the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
                                        buffer, RC16_LEN, &sshdr,
                                        SD_TIMEOUT, sdkp->max_retries, NULL);

                if (media_not_present(sdkp, &sshdr))
                        return -ENODEV;

                if (!the_result)
                        break;

                sense_valid = scsi_sense_valid(&sshdr);

                /*
                 * Invalid Command Operation Code or Invalid Field in CDB:
                 * let the caller retry silently with RC10.
                 */
                if (sense_valid &&
                    sshdr.sense_key == ILLEGAL_REQUEST &&
                    (sshdr.asc == 0x20 || sshdr.asc == 0x24) &&
                    sshdr.ascq == 0x00)
                        return -EINVAL;

                /*
                 * Device reset might occur several times; such attempts
                 * are charged to the reset budget, not the retry count.
                 */
                if (sense_valid &&
                    sshdr.sense_key == UNIT_ATTENTION &&
                    sshdr.asc == 0x29 && sshdr.ascq == 0x00 &&
                    --resets_left > 0)
                        continue;

                if (--attempts_left == 0)
                        break;
        }

        if (the_result) {
                sd_print_result(sdkp, "Read Capacity(16) failed", the_result);
                read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result);
                return -EINVAL;
        }

        /* Bytes 0-7: last logical block address; bytes 8-11: block size. */
        sector_size = get_unaligned_be32(&buffer[8]);
        lba = get_unaligned_be64(&buffer[0]);

        if (sd_read_protection_type(sdkp, buffer) < 0) {
                sdkp->capacity = 0;
                return -ENODEV;
        }

        /* Logical blocks per physical block exponent */
        pb_exponent = buffer[13] & 0xf;
        sdkp->physical_block_size = (1 << pb_exponent) * sector_size;

        /* RC basis */
        sdkp->rc_basis = (buffer[12] >> 4) & 0x3;

        /* Lowest aligned logical block */
        lowest_aligned_lba = ((buffer[14] & 0x3f) << 8) | buffer[15];
        alignment = lowest_aligned_lba * sector_size;
        blk_queue_alignment_offset(sdp->request_queue, alignment);
        if (alignment && sdkp->first_scan)
                sd_printk(KERN_NOTICE, sdkp,
                          "physical block alignment offset: %u\n", alignment);

        /* LBPME set: note it (and LBPRZ) and configure discard. */
        if (buffer[14] & 0x80) {
                sdkp->lbpme = 1;

                if (buffer[14] & 0x40)
                        sdkp->lbprz = 1;

                sd_config_discard(sdkp, SD_LBP_WS16);
        }

        /* The device reports the last LBA; capacity is one more. */
        sdkp->capacity = lba + 1;
        return sector_size;
}

/*
 * Issue READ CAPACITY(10) and record the capacity and physical block
 * size in @sdkp.
 *
 * @buffer must be able to hold at least 8 bytes.
 *
 * Returns the logical block size reported by the device on success,
 * -ENODEV when no medium is present and -EINVAL when the command keeps
 * failing.
 */
static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
                                                unsigned char *buffer)
{
        struct scsi_sense_hdr sshdr;
        unsigned char cmd[16];
        unsigned sector_size;
        sector_t lba;
        int attempts_left = 3;
        int resets_left = READ_CAPACITY_RETRIES_ON_RESET;
        int sense_valid = 0;
        int the_result;

        for (;;) {
                /* Only the first 10 bytes of cmd form the CDB. */
                memset(cmd, 0, 10);
                cmd[0] = READ_CAPACITY;
                memset(buffer, 0, 8);

                the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
                                        buffer, 8, &sshdr,
                                        SD_TIMEOUT, sdkp->max_retries, NULL);

                if (media_not_present(sdkp, &sshdr))
                        return -ENODEV;

                if (!the_result)
                        break;

                sense_valid = scsi_sense_valid(&sshdr);

                /*
                 * Device reset might occur several times; such attempts
                 * are charged to the reset budget, not the retry count.
                 */
                if (sense_valid &&
                    sshdr.sense_key == UNIT_ATTENTION &&
                    sshdr.asc == 0x29 && sshdr.ascq == 0x00 &&
                    --resets_left > 0)
                        continue;

                if (--attempts_left == 0)
                        break;
        }

        if (the_result) {
                sd_print_result(sdkp, "Read Capacity(10) failed", the_result);
                read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result);
                return -EINVAL;
        }

        /* Bytes 0-3: last logical block address; bytes 4-7: block size. */
        sector_size = get_unaligned_be32(&buffer[4]);
        lba = get_unaligned_be32(&buffer[0]);

        sdkp->physical_block_size = sector_size;

        /*
         * Some buggy (usb cardreader) devices return an lba of 0xffffffff
         * when they want to report a size of 0 (with which they really
         * mean no media is present).
         */
        if (sdp->no_read_capacity_16 && lba == 0xffffffff)
                sdkp->capacity = 0;
        else
                sdkp->capacity = lba + 1;

        return sector_size;
}

/*
 * Decide whether READ CAPACITY(16) should be tried before
 * READ CAPACITY(10).  Returns 1 if so, 0 otherwise.
 */
static int sd_try_rc16_first(struct scsi_device *sdp)
{
        /* The host cannot send 16-byte CDBs, or RC10 was asked for first. */
        if (sdp->host->max_cmd_len < 16 || sdp->try_rc_10_first)
                return 0;

        /* Devices newer than SPC-2, or with protection enabled, use RC16. */
        return sdp->scsi_level > SCSI_SPC_2 || scsi_device_protection(sdp);
}

/*
 * read disk capacity
 *
 * Picks READ CAPACITY(16) or READ CAPACITY(10) as the first command,
 * falls back to the other one where that makes sense, applies the
 * capacity quirks, validates the logical block size and pushes the
 * resulting block sizes to the request queue.  On a hard failure the
 * function returns without touching the queue limits.
 */
static void
sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        int sector_size;

        if (!sd_try_rc16_first(sdp)) {
                sector_size = read_capacity_10(sdkp, sdp, buffer);
                if (sector_size == -EOVERFLOW)
                        goto got_data;
                if (sector_size < 0)
                        return;

                /* RC10 cannot express this size: ask again with RC16. */
                if (sizeof(sdkp->capacity) > 4 &&
                    sdkp->capacity > 0xffffffffULL) {
                        int rc10_sector_size = sector_size;

                        sd_printk(KERN_NOTICE, sdkp, "Very big device. "
                                        "Trying to use READ CAPACITY(16).\n");
                        sector_size = read_capacity_16(sdkp, sdp, buffer);
                        if (sector_size < 0) {
                                sd_printk(KERN_NOTICE, sdkp,
                                        "Using 0xffffffff as device size\n");
                                sdkp->capacity = 1 + (sector_t) 0xffffffff;
                                sector_size = rc10_sector_size;
                                goto got_data;
                        }
                        /* Remember that READ CAPACITY(16) succeeded */
                        sdp->try_rc_10_first = 0;
                }
        } else {
                sector_size = read_capacity_16(sdkp, sdp, buffer);
                if (sector_size == -EOVERFLOW)
                        goto got_data;
                if (sector_size == -ENODEV)
                        return;
                /* Any other RC16 failure: fall back to RC10. */
                if (sector_size < 0)
                        sector_size = read_capacity_10(sdkp, sdp, buffer);
                if (sector_size < 0)
                        return;
        }

        /*
         * Some devices are known to return the total number of blocks,
         * not the highest block number.  Some devices have versions
         * which do this and others which do not.  Some devices we might
         * suspect of doing this but we don't know for certain.
         *
         * If we know the reported capacity is wrong, decrement it.  If
         * we can only guess, then assume the number of blocks is even
         * (usually true but not always) and err on the side of lowering
         * the capacity.
         */
        if (sdp->fix_capacity ||
            (sdp->guess_capacity && (sdkp->capacity & 0x01))) {
                sd_printk(KERN_INFO, sdkp, "Adjusting the sector count "
                                "from its reported value: %llu\n",
                                (unsigned long long) sdkp->capacity);
                --sdkp->capacity;
        }

got_data:
        if (!sector_size) {
                sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, "
                          "assuming 512.\n");
                sector_size = 512;
        }

        switch (sector_size) {
        case 512:
        case 1024:
        case 2048:
        case 4096:
                break;
        default:
                sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n",
                          sector_size);
                /*
                 * The user might want to re-format the drive with
                 * a supported sectorsize.  Once this happens, it
                 * would be relatively trivial to set the thing up.
                 * For this reason, we leave the thing in the table.
                 */
                sdkp->capacity = 0;
                /*
                 * set a bogus sector size so the normal read/write
                 * logic in the block layer will eventually refuse any
                 * request on this device without tripping over power
                 * of two sector size assumptions
                 */
                sector_size = 512;
                break;
        }

        blk_queue_logical_block_size(sdp->request_queue, sector_size);
        blk_queue_physical_block_size(sdp->request_queue,
                                      sdkp->physical_block_size);
        sdp->sector_size = sector_size;

        /* Capacities beyond 32 bits of LBA need 16-byte READ/WRITE. */
        if (sdkp->capacity > 0xffffffff)
                sdp->use_16_for_rw = 1;
}

/*
 * Print disk capacity
 *
 * Logs the number of logical blocks together with the size in decimal
 * and binary units, plus the physical block size when it differs from
 * the logical one.  Nothing is printed on a rescan that found the same
 * capacity as before (@old_capacity).
 */
static void
sd_print_capacity(struct scsi_disk *sdkp,
                  sector_t old_capacity)
{
        char binary_str[10], decimal_str[10];
        int blk_size = sdkp->device->sector_size;

        if (!sdkp->first_scan && sdkp->capacity == old_capacity)
                return;

        string_get_size(sdkp->capacity, blk_size,
                        STRING_UNITS_10, decimal_str, sizeof(decimal_str));
        string_get_size(sdkp->capacity, blk_size,
                        STRING_UNITS_2, binary_str, sizeof(binary_str));

        sd_printk(KERN_NOTICE, sdkp,
                  "%llu %d-byte logical blocks: (%s/%s)\n",
                  (unsigned long long)sdkp->capacity,
                  blk_size, decimal_str, binary_str);

        if (sdkp->physical_block_size != blk_size)
                sd_printk(KERN_NOTICE, sdkp,
                          "%u-byte physical blocks\n",
                          sdkp->physical_block_size);
}

/* called with buffer of length 512 */
static inline int
sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage,
                 unsigned char *buffer, int len, struct scsi_mode_data *data,
                 struct scsi_sense_hdr *sshdr)
{
        /*
         * If we must use MODE SENSE(10), make sure that the buffer length
         * is at least 8 bytes so that the mode sense header fits.
         */
        if (sdkp->device->use_10_for_ms && len < 8)
                len = 8;

        return scsi_mode_sense(sdkp->device, dbd, modepage, buffer, len,
                               SD_TIMEOUT, sdkp->max_retries, data,
                               sshdr);
}

/*
 * read write protect setting, if possible - called only in sd_revalidate_disk()
 * called with buffer of length SD_BUF_SIZE
 *
 * The disk is first marked writable; if a MODE SENSE succeeds, the write
 * protect bit of the device-specific parameter decides the final
 * read-only state.
 */
static void
sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        struct scsi_mode_data data;
        int prev_wp = sdkp->write_prot;
        int res;

        set_disk_ro(sdkp->disk, 0);
        if (sdp->skip_ms_page_3f) {
                sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n");
                return;
        }

        if (!sdp->use_192_bytes_for_3f) {
                /*
                 * First attempt: ask for all pages (0x3F), but only 4 bytes.
                 * We have to start carefully: some devices hang if we ask
                 * for more than is available.
                 */
                res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL);

                /*
                 * Second attempt: ask for page 0.  When only page 0 is
                 * implemented, a request for page 3F may return Sense Key
                 * 5: Illegal Request, Sense Code 24: Invalid field in
                 * CDB.
                 */
                if (res < 0)
                        res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL);

                /*
                 * Third attempt: ask 255 bytes, as we did earlier.
                 */
                if (res < 0)
                        res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255,
                                               &data, NULL);
        } else {
                res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL);
        }

        if (res < 0) {
                sd_first_printk(KERN_WARNING, sdkp,
                          "Test WP failed, assume Write Enabled\n");
                return;
        }

        /* Bit 7 of the device-specific parameter is the write protect bit. */
        sdkp->write_prot = (data.device_specific & 0x80) ? 1 : 0;
        set_disk_ro(sdkp->disk, sdkp->write_prot);

        /* Only report on the first scan or when the state has changed. */
        if (sdkp->first_scan || prev_wp != sdkp->write_prot) {
                sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n",
                          sdkp->write_prot ? "on" : "off");
                sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer);
        }
}

/*
 * sd_read_cache_type - called only from sd_revalidate_disk()
 * called with buffer of length SD_BUF_SIZE
 *
 * Reads the caching information with MODE SENSE and updates sdkp->WCE,
 * sdkp->RCD and sdkp->DPOFUA.  When the page cannot be obtained or
 * parsed, defaults are applied instead (see the "defaults" label).
 * Does nothing when sdkp->cache_override is set.
 */
static void
sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer)
{
        int len = 0, res;
        struct scsi_device *sdp = sdkp->device;

        int dbd;
        int modepage;
        int first_len;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
        /* Previous values, used to decide whether to log a change. */
        int old_wce = sdkp->WCE;
        int old_rcd = sdkp->RCD;
        int old_dpofua = sdkp->DPOFUA;


        if (sdkp->cache_override)
                return;

        /*
         * Choose which mode page to ask for and how many bytes to request
         * in the first, cautious MODE SENSE.
         */
        first_len = 4;
        if (sdp->skip_ms_page_8) {
                if (sdp->type == TYPE_RBC)
                        goto defaults;
                else {
                        /* Page 8 must be skipped: fall back to page 0x3F. */
                        if (sdp->skip_ms_page_3f)
                                goto defaults;
                        modepage = 0x3F;
                        if (sdp->use_192_bytes_for_3f)
                                first_len = 192;
                        dbd = 0;
                }
        } else if (sdp->type == TYPE_RBC) {
                modepage = 6;
                dbd = 8;
        } else {
                modepage = 8;
                dbd = 0;
        }

        /* cautiously ask */
        res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len,
                        &data, &sshdr);

        if (res < 0)
                goto bad_sense;

        if (!data.header_length) {
                /*
                 * No header came back: switch to page 6 and zero first_len
                 * so that the "len > first_len" test below issues a second
                 * request.
                 */
                modepage = 6;
                first_len = 0;
                sd_first_printk(KERN_ERR, sdkp,
                                "Missing header in MODE_SENSE response\n");
        }

        /* that went OK, now ask for the proper length */
        len = data.length;

        /*
         * We're only interested in the first three bytes, actually.
         * But the data cache page is defined for the first 20.
         */
        if (len < 3)
                goto bad_sense;
        else if (len > SD_BUF_SIZE) {
                sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter "
                          "data from %d to %d bytes\n", len, SD_BUF_SIZE);
                len = SD_BUF_SIZE;
        }
        if (modepage == 0x3F && sdp->use_192_bytes_for_3f)
                len = 192;

        /* Get the data */
        if (len > first_len)
                res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len,
                                &data, &sshdr);

        if (!res) {
                /* The mode pages follow the header and block descriptors. */
                int offset = data.header_length + data.block_descriptor_length;

                /* Walk the returned pages looking for page 8 or page 6. */
                while (offset < len) {
                        u8 page_code = buffer[offset] & 0x3F;
                        u8 spf       = buffer[offset] & 0x40;

                        if (page_code == 8 || page_code == 6) {
                                /* We're interested only in the first 3 bytes.
                                 */
                                if (len - offset <= 2) {
                                        sd_first_printk(KERN_ERR, sdkp,
                                                "Incomplete mode parameter "
                                                        "data\n");
                                        goto defaults;
                                } else {
                                        modepage = page_code;
                                        goto Page_found;
                                }
                        } else {
                                /* Go to the next page */
                                /*
                                 * With the 0x40 bit set the page has a
                                 * 4-byte header with a 16-bit length in
                                 * bytes 2-3; otherwise a 2-byte header
                                 * with the length in byte 1.
                                 */
                                if (spf && len - offset > 3)
                                        offset += 4 + (buffer[offset+2] << 8) +
                                                buffer[offset+3];
                                else if (!spf && len - offset > 1)
                                        offset += 2 + buffer[offset+1];
                                else {
                                        sd_first_printk(KERN_ERR, sdkp,
                                                        "Incomplete mode "
                                                        "parameter data\n");
                                        goto defaults;
                                }
                        }
                }

                sd_first_printk(KERN_ERR, sdkp, "No Caching mode page found\n");
                goto defaults;

        Page_found:
                if (modepage == 8) {
                        /* Page 8: byte 2 bit 2 is WCE, bit 0 is RCD. */
                        sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0);
                        sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0);
                } else {
                        /* Page 6: write cache is on when byte 2 bit 0 is clear. */
                        sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0);
                        sdkp->RCD = 0;
                }

                /* DPOFUA comes from bit 4 of the device-specific parameter. */
                sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
                if (sdp->broken_fua) {
                        sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n");
                        sdkp->DPOFUA = 0;
                } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw &&
                           !sdkp->device->use_16_for_rw) {
                        sd_first_printk(KERN_NOTICE, sdkp,
                                  "Uses READ/WRITE(6), disabling FUA\n");
                        sdkp->DPOFUA = 0;
                }

                /* No cache flush allowed for write protected devices */
                if (sdkp->WCE && sdkp->write_prot)
                        sdkp->WCE = 0;

                /* Only report on the first scan or when something changed. */
                if (sdkp->first_scan || old_wce != sdkp->WCE ||
                    old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA)
                        sd_printk(KERN_NOTICE, sdkp,
                                  "Write cache: %s, read cache: %s, %s\n",
                                  sdkp->WCE ? "enabled" : "disabled",
                                  sdkp->RCD ? "disabled" : "enabled",
                                  sdkp->DPOFUA ? "supports DPO and FUA"
                                  : "doesn't support DPO or FUA");

                return;
        }

        /* Reached when a MODE SENSE failed or reported fewer than 3 bytes. */
bad_sense:
        if (scsi_sense_valid(&sshdr) &&
            sshdr.sense_key == ILLEGAL_REQUEST &&
            sshdr.asc == 0x24 && sshdr.ascq == 0x0)
                /* Invalid field in CDB */
                sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n");
        else
                sd_first_printk(KERN_ERR, sdkp,
                                "Asking for cache data failed\n");

        /*
         * Fallback settings: write cache as dictated by wce_default_on,
         * read cache enabled (RCD = 0), no DPO/FUA.
         */
defaults:
        if (sdp->wce_default_on) {
                sd_first_printk(KERN_NOTICE, sdkp,
                                "Assuming drive cache: write back\n");
                sdkp->WCE = 1;
        } else {
                sd_first_printk(KERN_ERR, sdkp,
                                "Assuming drive cache: write through\n");
                sdkp->WCE = 0;
        }
        sdkp->RCD = 0;
        sdkp->DPOFUA = 0;
}

/*
 * The ATO bit indicates whether the DIF application tag is available
 * for use by the operating system.
 *
 * Reads the Control mode page (0x0a) and sets sdkp->ATO when bit 7 of
 * byte 5 of that page is set.  Nothing is done for device types other
 * than TYPE_DISK/TYPE_ZBC or when no protection type is in effect.
 *
 * @buffer: scratch buffer; only the first 36 bytes are requested from
 *          the device.
 */
static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
{
        /* Number of bytes of mode data requested from the device. */
        const int mode_len = 36;
        int res, offset;
        struct scsi_device *sdp = sdkp->device;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;

        if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
                return;

        if (sdkp->protection_type == 0)
                return;

        res = scsi_mode_sense(sdp, 1, 0x0a, buffer, mode_len, SD_TIMEOUT,
                              sdkp->max_retries, &data, &sshdr);

        if (res < 0 || !data.header_length ||
            data.length < 6) {
                sd_first_printk(KERN_WARNING, sdkp,
                          "getting Control mode page failed, assume no ATO\n");

                if (scsi_sense_valid(&sshdr))
                        sd_print_sense_hdr(sdkp, &sshdr);

                return;
        }

        offset = data.header_length + data.block_descriptor_length;

        /*
         * The header and block descriptor lengths come from the device.
         * Only mode_len bytes were requested, so make sure bytes 0..5 of
         * the page lie inside that window before looking at them; a
         * device reporting oversized descriptors would otherwise make us
         * read data it never transferred.
         */
        if (mode_len - offset <= 5) {
                sd_first_printk(KERN_ERR, sdkp,
                                "Incomplete mode parameter data\n");
                return;
        }

        if ((buffer[offset] & 0x3f) != 0x0a) {
                sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n");
                return;
        }

        /* ATO is bit 7 of byte 5 of the Control mode page. */
        if (buffer[offset + 5] & 0x80)
                sdkp->ATO = 1;
}

/**
 * sd_read_block_limits - Query disk device for preferred I/O sizes.
 * @sdkp: disk to query
 *
 * Reads VPD page 0xb0 (Block Limits) and records the minimum I/O size plus
 * the maximum and optimal transfer lengths.  When byte 3 of the page is
 * 0x3c the WRITE SAME limit is stored as well and, for devices with
 * sdkp->lbpme set, the unmap limits are stored and the discard method is
 * selected.  Nothing is changed when the page cannot be read.
 */
static void sd_read_block_limits(struct scsi_disk *sdkp)
{
        const int vpd_len = 64;
        unsigned int sector_sz = sdkp->device->sector_size;
        unsigned char *vpd = kmalloc(vpd_len, GFP_KERNEL);
        unsigned int max_lba, max_desc, mode;

        if (!vpd)
                goto out;

        /* Block Limits VPD */
        if (scsi_get_vpd_page(sdkp->device, 0xb0, vpd, vpd_len))
                goto out;

        blk_queue_io_min(sdkp->disk->queue,
                         get_unaligned_be16(&vpd[6]) * sector_sz);

        sdkp->max_xfer_blocks = get_unaligned_be32(&vpd[8]);
        sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd[12]);

        /* The remaining fields are only parsed when byte 3 is 0x3c. */
        if (vpd[3] != 0x3c)
                goto out;

        sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd[36]);

        /* Unmap limits and discard setup only apply when lbpme is set. */
        if (!sdkp->lbpme)
                goto out;

        max_lba = get_unaligned_be32(&vpd[20]);
        max_desc = get_unaligned_be32(&vpd[24]);
        if (max_lba && max_desc)
                sdkp->max_unmap_blocks = max_lba;

        sdkp->unmap_granularity = get_unaligned_be32(&vpd[28]);

        /* Top bit of byte 32 gates the alignment and is masked off the value. */
        if (vpd[32] & 0x80)
                sdkp->unmap_alignment =
                        get_unaligned_be32(&vpd[32]) & ~(1 << 31);

        if (!sdkp->lbpvpd) {
                /* LBP VPD page not provided */
                mode = sdkp->max_unmap_blocks ? SD_LBP_UNMAP : SD_LBP_WS16;
        } else if (sdkp->lbpu && sdkp->max_unmap_blocks) {
                /* LBP VPD page tells us what to use */
                mode = SD_LBP_UNMAP;
        } else if (sdkp->lbpws) {
                mode = SD_LBP_WS16;
        } else if (sdkp->lbpws10) {
                mode = SD_LBP_WS10;
        } else {
                mode = SD_LBP_DISABLE;
        }
        sd_config_discard(sdkp, mode);

 out:
        kfree(vpd);
}

/**
 * sd_read_block_characteristics - Query block dev. characteristics
 * @sdkp: disk to query
 *
 * Reads VPD page 0xb1 and uses it to update the queue's rotational flags
 * and zoned model.  On the first scan a notice describing the zoned model
 * is logged.  Nothing is changed when the page cannot be read.
 */
static void sd_read_block_characteristics(struct scsi_disk *sdkp)
{
        const int vpd_len = 64;
        struct request_queue *q = sdkp->disk->queue;
        unsigned char *vpd = kmalloc(vpd_len, GFP_KERNEL);
        const char *model;

        if (!vpd)
                goto out;

        /* Block Device Characteristics VPD */
        if (scsi_get_vpd_page(sdkp->device, 0xb1, vpd, vpd_len))
                goto out;

        /* A big-endian 16-bit value of 1 at byte 4 marks a non-rotational device. */
        if (get_unaligned_be16(&vpd[4]) == 1) {
                blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
                blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
        }

        if (sdkp->device->type != TYPE_ZBC) {
                /*
                 * Two-bit field from byte 8: 1 is host-aware, anything else
                 * is a regular disk or drive managed disk.
                 */
                sdkp->zoned = (vpd[8] >> 4) & 3;
                blk_queue_set_zoned(sdkp->disk, sdkp->zoned == 1 ?
                                    BLK_ZONED_HA : BLK_ZONED_NONE);
        } else {
                /*
                 * Host-managed: Per ZBC and ZAC specifications, writes in
                 * sequential write required zones of host-managed devices must
                 * be aligned to the device physical block size.
                 */
                blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HM);
                blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
        }

        /* Only report the model once, on the first scan. */
        if (!sdkp->first_scan)
                goto out;

        if (blk_queue_is_zoned(q)) {
                model = "aware";
                if (q->limits.zoned == BLK_ZONED_HM)
                        model = "managed";
                sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n",
                          model);
                goto out;
        }

        switch (sdkp->zoned) {
        case 1:
                sd_printk(KERN_NOTICE, sdkp,
                          "Host-aware SMR disk used as regular disk\n");
                break;
        case 2:
                sd_printk(KERN_NOTICE, sdkp,
                          "Drive-managed SMR disk\n");
                break;
        default:
                break;
        }

 out:
        kfree(vpd);
}

/**
 * sd_read_block_provisioning - Query provisioning VPD page
 * @sdkp: disk to query
 *
 * Skipped entirely unless sdkp->lbpme is set.  When VPD page 0xb2 can be
 * read, sdkp->lbpvpd is set and the three capability bits in byte 5 are
 * copied into sdkp->lbpu, sdkp->lbpws and sdkp->lbpws10.
 */
static void sd_read_block_provisioning(struct scsi_disk *sdkp)
{
        const int vpd_len = 8;
        unsigned char *vpd;
        unsigned char caps;

        if (!sdkp->lbpme)
                return;

        vpd = kmalloc(vpd_len, GFP_KERNEL);
        if (vpd && !scsi_get_vpd_page(sdkp->device, 0xb2, vpd, vpd_len)) {
                caps = vpd[5];

                sdkp->lbpvpd = 1;
                /* UNMAP */
                sdkp->lbpu = (caps >> 7) & 1;
                /* WRITE SAME(16) with UNMAP */
                sdkp->lbpws = (caps >> 6) & 1;
                /* WRITE SAME(10) with UNMAP */
                sdkp->lbpws10 = (caps >> 5) & 1;
        }

        kfree(vpd);
}

/*
 * sd_read_write_same - probe the device for WRITE SAME support
 * @sdkp: disk to query; sdkp->ws16 / sdkp->ws10 are set when supported
 * @buffer: scratch buffer of at least SD_BUF_SIZE bytes
 *
 * If the host forbids WRITE SAME the device is flagged accordingly and no
 * commands are sent.
 */
static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdev = sdkp->device;
        int rc;

        if (sdev->host->no_write_same) {
                sdev->no_write_same = 1;
                return;
        }

        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY);
        if (rc < 0) {
                /* too large values might cause issues with arcmsr */
                int vpd_buf_len = 64;

                sdev->no_report_opcodes = 1;

                /*
                 * Disable WRITE SAME if REPORT SUPPORTED OPERATION
                 * CODES is unsupported and the device has an ATA
                 * Information VPD page (SAT).
                 */
                rc = scsi_get_vpd_page(sdev, 0x89, buffer, vpd_buf_len);
                if (rc == 0)
                        sdev->no_write_same = 1;
        }

        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16);
        if (rc == 1)
                sdkp->ws16 = 1;

        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME);
        if (rc == 1)
                sdkp->ws10 = 1;
}

/*
 * sd_read_security - detect SECURITY PROTOCOL IN/OUT support
 * @sdkp: disk to query; sdkp->security is set when both opcodes are reported
 * @buffer: scratch buffer of at least SD_BUF_SIZE bytes
 *
 * Does nothing unless the device has security_supported set.
 */
static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdev = sdkp->device;

        if (!sdev->security_supported)
                return;

        /* Both directions must be reported; stop at the first miss. */
        if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE,
                               SECURITY_PROTOCOL_IN) != 1)
                return;

        if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE,
                               SECURITY_PROTOCOL_OUT) != 1)
                return;

        sdkp->security = 1;
}

/*
 * Decide whether the device's reported optimal transfer size can be used.
 * It is rejected when it is zero, larger than @dev_max or the sd driver
 * limit, smaller than PAGE_SIZE, or not a multiple of the physical block
 * size.  Every rejection except the zero case is logged on the first scan.
 *
 * Returns true when sdkp->opt_xfer_blocks passed all checks.
 */
static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp,
                                      unsigned int dev_max)
{
        const unsigned int opt_blocks = sdkp->opt_xfer_blocks;
        const unsigned int opt_bytes =
                logical_to_bytes(sdkp->device, opt_blocks);

        /* Zero means nothing was reported; no message for that. */
        if (!opt_blocks)
                return false;

        if (opt_blocks > dev_max) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u logical blocks > dev_max (%u logical blocks)\n",
                                opt_blocks, dev_max);
                return false;
        }

        if (opt_blocks > SD_DEF_XFER_BLOCKS) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u logical blocks > sd driver limit (%u logical blocks)\n",
                                opt_blocks, SD_DEF_XFER_BLOCKS);
                return false;
        }

        if (opt_bytes < PAGE_SIZE) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u bytes < PAGE_SIZE (%u bytes)\n",
                                opt_bytes, (unsigned int)PAGE_SIZE);
                return false;
        }

        /* Mask test: relies on physical_block_size being a power of two. */
        if (opt_bytes & (sdkp->physical_block_size - 1)) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u bytes not a multiple of physical block size (%u bytes)\n",
                                opt_bytes, sdkp->physical_block_size);
                return false;
        }

        sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n",
                        opt_bytes);
        return true;
}

/**
 *        sd_revalidate_disk - (re)read the properties of a disk
 *        @disk: struct gendisk we care about
 *
 *        Performs disk spin up, read_capacity, etc., then derives the queue
 *        transfer limits and publishes the capacity.  sd_probe() calls this
 *        both before and after device_add_disk().
 *
 *        Returns 0 in all cases, including when the device is offline or the
 *        scratch buffer cannot be allocated (nothing is updated then).
 **/
static int sd_revalidate_disk(struct gendisk *disk)
{
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdp = sdkp->device;
        struct request_queue *q = sdkp->disk->queue;
        /* Remembered so sd_print_capacity() can compare old and new values. */
        sector_t old_capacity = sdkp->capacity;
        unsigned char *buffer;
        unsigned int dev_max, rw_max;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
                                      "sd_revalidate_disk\n"));

        /*
         * If the device is offline, don't try and read capacity or any
         * of the other niceties.
         */
        if (!scsi_device_online(sdp))
                goto out;

        /* Scratch buffer shared by all the sd_read_*() helpers below. */
        buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL);
        if (!buffer) {
                sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory "
                          "allocation failure.\n");
                goto out;
        }

        sd_spinup_disk(sdkp);

        /*
         * Without media there is no reason to ask; moreover, some devices
         * react badly if we do.
         */
        if (sdkp->media_present) {
                sd_read_capacity(sdkp, buffer);

                /*
                 * set the default to rotational.  All non-rotational devices
                 * support the block characteristics VPD page, which will
                 * cause this to be updated correctly and any device which
                 * doesn't support it should be treated as rotational.
                 */
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
                blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q);

                /*
                 * Provisioning is read first: sd_read_block_limits() consults
                 * the lbpvpd/lbpu/lbpws flags it sets.
                 */
                if (scsi_device_supports_vpd(sdp)) {
                        sd_read_block_provisioning(sdkp);
                        sd_read_block_limits(sdkp);
                        sd_read_block_characteristics(sdkp);
                        sd_zbc_read_zones(sdkp, buffer);
                }

                sd_print_capacity(sdkp, old_capacity);

                sd_read_write_protect_flag(sdkp, buffer);
                sd_read_cache_type(sdkp, buffer);
                sd_read_app_tag_own(sdkp, buffer);
                sd_read_write_same(sdkp, buffer);
                sd_read_security(sdkp, buffer);
        }

        /*
         * We now have all cache related info, determine how we deal
         * with flush requests.
         */
        sd_set_flush_flag(sdkp);

        /* Initial block count limit based on CDB TRANSFER LENGTH field size. */
        dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS;

        /* Some devices report a maximum block count for READ/WRITE requests. */
        dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks);
        q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);

        /*
         * Use the device's optimal transfer size when it is sane; otherwise
         * clear io_opt and fall back to the smaller of the device maximum
         * and the block layer default.
         */
        if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
                q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
                rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
        } else {
                q->limits.io_opt = 0;
                rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
                                      (sector_t)BLK_DEF_MAX_SECTORS);
        }

        /* Do not exceed controller limit */
        rw_max = min(rw_max, queue_max_hw_sectors(q));

        /*
         * Only update max_sectors if previously unset or if the current value
         * exceeds the capabilities of the hardware.
         */
        if (sdkp->first_scan ||
            q->limits.max_sectors > q->limits.max_dev_sectors ||
            q->limits.max_sectors > q->limits.max_hw_sectors)
                q->limits.max_sectors = rw_max;

        /* From here on the sd_first_printk() style messages stay quiet. */
        sdkp->first_scan = 0;

        set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity));
        sd_config_write_same(sdkp);
        kfree(buffer);

        /*
         * For a zoned drive, revalidating the zones can be done only once
         * the gendisk capacity is set. So if this fails, set back the gendisk
         * capacity to 0.
         */
        if (sd_zbc_revalidate_zones(sdkp))
                set_capacity_and_notify(disk, 0);

 out:
        return 0;
}

/**
 *        sd_unlock_native_capacity - unlock native capacity
 *        @disk: struct gendisk to set capacity for
 *
 *        Invoked by the block layer when partitions on @disk extend past
 *        the end of the device.  The request is forwarded to the SCSI host
 *        template's ->unlock_native_capacity() method when one is provided,
 *        so the host can adjust the device capacity; otherwise this is a
 *        no-op.
 *
 *        CONTEXT:
 *        Defined by block layer.  Might sleep.
 */
static void sd_unlock_native_capacity(struct gendisk *disk)
{
        struct scsi_device *sdev = scsi_disk(disk)->device;

        /* The hook is optional. */
        if (!sdev->host->hostt->unlock_native_capacity)
                return;

        sdev->host->hostt->unlock_native_capacity(sdev);
}

/**
 *        sd_format_disk_name - format disk name
 *        @prefix: name prefix - ie. "sd" for SCSI disks
 *        @index: index of the disk to format name for (must be >= 0)
 *        @buf: output buffer
 *        @buflen: length of the output buffer
 *
 *        SCSI disk names starts at sda.  The 26th device is sdz and the
 *        27th is sdaa.  The last one for two lettered suffix is sdzz
 *        which is followed by sdaaa.
 *
 *        This is basically 26 base counting with one extra 'nil' entry
 *        at the beginning from the second digit on and can be
 *        determined using similar method as 26 base conversion with the
 *        index shifted -1 after each digit is computed.
 *
 *        The suffix is built right-to-left at the end of @buf and then
 *        moved down to sit directly after the prefix.
 *
 *        CONTEXT:
 *        Don't care.
 *
 *        RETURNS:
 *        0 on success, -EINVAL if @index is negative or @buf cannot hold
 *        the prefix, the suffix and the terminating NUL.
 */
static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
{
        const int base = 'z' - 'a' + 1;
        const size_t prefix_len = strlen(prefix);
        char *begin, *end, *p;

        /*
         * Without this check a buffer no longer than the prefix would put
         * the write cursor below 'begin', the loop guard would never match
         * and the suffix would be written outside of @buf.
         */
        if (index < 0 || buflen <= 0 || (size_t)buflen <= prefix_len)
                return -EINVAL;

        begin = buf + prefix_len;
        end = buf + buflen;

        p = end - 1;
        *p = '\0';
        do {
                /* No room left between the prefix and the suffix so far. */
                if (p == begin)
                        return -EINVAL;
                *--p = 'a' + (index % base);
                index = (index / base) - 1;
        } while (index >= 0);

        /* Slide suffix plus NUL down to the prefix, then drop the prefix in. */
        memmove(begin, p, end - p);
        memcpy(buf, prefix, prefix_len);

        return 0;
}

/**
 *        sd_probe - called during driver initialization and whenever a
 *        new scsi device is attached to the system. It is called once
 *        for each scsi device (not just disks) present.
 *        @dev: pointer to device object
 *
 *        Returns 0 if successful, or a negative errno otherwise: -ENODEV
 *        when the device is not of a type handled here (e.g. scanner),
 *        -ENOMEM when an allocation fails, or the error reported by
 *        sd_format_disk_name() / device_add().
 *
 *        Note: this function is invoked from the scsi mid-level.
 *        This function sets up the mapping between a given
 *        <host,channel,id,lun> (found in sdp) and new device name
 *        (e.g. /dev/sda). More precisely it is the block device major
 *        and minor number that is chosen here.
 *
 *        Assume sd_probe is not re-entrant (for time being)
 *        Also think about sd_probe() and sd_remove() running concurrently.
 **/
static int sd_probe(struct device *dev)
{
        struct scsi_device *sdp = to_scsi_device(dev);
        struct scsi_disk *sdkp;
        struct gendisk *gd;
        int index;
        int error;

        /* Balanced by scsi_autopm_put_device() on every exit path. */
        scsi_autopm_get_device(sdp);
        error = -ENODEV;
        if (sdp->type != TYPE_DISK &&
            sdp->type != TYPE_ZBC &&
            sdp->type != TYPE_MOD &&
            sdp->type != TYPE_RBC)
                goto out;

#ifndef CONFIG_BLK_DEV_ZONED
        /* Zoned devices are refused when zoned block support is not built. */
        if (sdp->type == TYPE_ZBC)
                goto out;
#endif
        SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
                                        "sd_probe\n"));

        error = -ENOMEM;
        sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
        if (!sdkp)
                goto out;

        gd = alloc_disk(SD_MINORS);
        if (!gd)
                goto out_free;

        /* error is still -ENOMEM if the index allocation fails. */
        index = ida_alloc(&sd_index_ida, GFP_KERNEL);
        if (index < 0) {
                sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n");
                goto out_put;
        }

        error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
        if (error) {
                sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n");
                goto out_free_index;
        }

        sdkp->device = sdp;
        sdkp->driver = &sd_template;
        sdkp->disk = gd;
        sdkp->index = index;
        sdkp->max_retries = SD_MAX_RETRIES;
        atomic_set(&sdkp->openers, 0);
        atomic_set(&sdkp->device->ioerr_cnt, 0);

        /* Only install a default timeout if none has been configured yet. */
        if (!sdp->request_queue->rq_timeout) {
                if (sdp->type != TYPE_MOD)
                        blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
                else
                        blk_queue_rq_timeout(sdp->request_queue,
                                             SD_MOD_TIMEOUT);
        }

        device_initialize(&sdkp->dev);
        /* Reference on the parent; scsi_disk_release() drops it again. */
        sdkp->dev.parent = get_device(dev);
        sdkp->dev.class = &sd_disk_class;
        dev_set_name(&sdkp->dev, "%s", dev_name(dev));

        /*
         * Once sdkp->dev is initialized, cleanup goes through put_device()
         * (presumably ending in scsi_disk_release(), which frees the index,
         * the gendisk and sdkp), hence the jump to "out" rather than to the
         * manual unwind labels.
         * NOTE(review): gd->queue has not been assigned yet at this point.
         */
        error = device_add(&sdkp->dev);
        if (error) {
                put_device(&sdkp->dev);
                goto out;
        }

        dev_set_drvdata(dev, sdkp);

        /* Split the index across the major number and the minor range. */
        gd->major = sd_major((index & 0xf0) >> 4);
        gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);

        gd->fops = &sd_fops;
        gd->private_data = &sdkp->driver;
        gd->queue = sdkp->device->request_queue;

        /* defaults, until the device tells us otherwise */
        sdp->sector_size = 512;
        sdkp->capacity = 0;
        sdkp->media_present = 1;
        sdkp->write_prot = 0;
        sdkp->cache_override = 0;
        sdkp->WCE = 0;
        sdkp->RCD = 0;
        sdkp->ATO = 0;
        sdkp->first_scan = 1;
        sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

        /* First pass: read device properties before the disk is added. */
        sd_revalidate_disk(gd);

        gd->flags = GENHD_FL_EXT_DEVT;
        if (sdp->removable) {
                gd->flags |= GENHD_FL_REMOVABLE;
                gd->events |= DISK_EVENT_MEDIA_CHANGE;
                gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
        }

        blk_pm_runtime_init(sdp->request_queue, dev);
        if (sdp->rpm_autosuspend) {
                pm_runtime_set_autosuspend_delay(dev,
                        sdp->host->hostt->rpm_autosuspend_delay);
        }
        device_add_disk(dev, gd, NULL);
        if (sdkp->capacity)
                sd_dif_config_host(sdkp);

        /* Second pass, now that the gendisk has been added. */
        sd_revalidate_disk(gd);

        if (sdkp->security) {
                sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit);
                if (sdkp->opal_dev)
                        sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n");
        }

        sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
                  sdp->removable ? "removable " : "");
        scsi_autopm_put_device(sdp);

        return 0;

        /* Manual unwind, used only before device_initialize(&sdkp->dev). */
 out_free_index:
        ida_free(&sd_index_ida, index);
 out_put:
        put_disk(gd);
 out_free:
        kfree(sdkp);
 out:
        scsi_autopm_put_device(sdp);
        return error;
}

/**
 *        sd_remove - detach a scsi disk previously set up by sd_probe()
 *        @dev: pointer to device object
 *
 *        Called whenever such a disk is detached from the system, and
 *        (potentially multiple times) during sd module unload.
 *
 *        Note: this function is invoked from the scsi mid-level.
 *        This function potentially frees up a device name (e.g. /dev/sdc)
 *        that could be re-used by a subsequent sd_probe().
 *        This function is not called when the built-in sd driver is "exit-ed".
 *
 *        Always returns 0.
 **/
static int sd_remove(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        /* Captured up front; needed after the gendisk has been deleted. */
        dev_t devt = disk_devt(sdkp->disk);

        scsi_autopm_get_device(sdkp->device);
        async_synchronize_full_domain(&scsi_sd_pm_domain);

        device_del(&sdkp->dev);
        del_gendisk(sdkp->disk);
        sd_shutdown(dev);

        free_opal_dev(sdkp->opal_dev);

        /* Hand the minor range back to the default probe. */
        blk_register_region(devt, SD_MINORS, NULL,
                            sd_default_probe, NULL, NULL);

        /* Clear drvdata and drop the reference under sd_ref_mutex. */
        mutex_lock(&sd_ref_mutex);
        dev_set_drvdata(dev, NULL);
        put_device(&sdkp->dev);
        mutex_unlock(&sd_ref_mutex);

        return 0;
}

/**
 *        scsi_disk_release - Called to free the scsi_disk structure
 *        @dev: pointer to embedded class device
 *
 *        sd_ref_mutex must be held entering this routine.  Because it is
 *        called on last put, you should always use the scsi_disk_get()
 *        scsi_disk_put() helpers which take the mutex themselves
 *        and never do a direct put_device.
 *
 *        Releases the disk index, the gendisk, the reference on the parent
 *        scsi device and finally the scsi_disk itself.
 **/
static void scsi_disk_release(struct device *dev)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct gendisk *disk = sdkp->disk;
        struct request_queue *q = disk->queue;

        ida_free(&sd_index_ida, sdkp->index);

        /*
         * Wait until all requests that are in progress have completed.
         * This is necessary to avoid that e.g. scsi_end_request() crashes
         * due to clearing the disk->private_data pointer. Wait from inside
         * scsi_disk_release() instead of from sd_release() to avoid that
         * freezing and unfreezing the request queue affects user space I/O
         * in case multiple processes open a /dev/sd... node concurrently.
         *
         * sd_probe() drops its reference when device_add() fails, which
         * happens before it assigns disk->queue.  In that case there is no
         * queue to drain, and freezing a NULL queue would crash.
         */
        if (q) {
                blk_mq_freeze_queue(q);
                blk_mq_unfreeze_queue(q);
        }

        disk->private_data = NULL;
        put_disk(disk);
        /* Pairs with the get_device() on the parent taken in sd_probe(). */
        put_device(&sdkp->device->sdev_gendev);

        sd_zbc_release_disk(sdkp);

        kfree(sdkp);
}

/*
 * sd_start_stop_device - issue a START STOP UNIT command
 * @sdkp: disk to start or stop
 * @start: non-zero to start the unit, zero to stop it
 *
 * Returns 0 on success or when the command failed with ASC 0x3a (medium
 * not present), -ENODEV if the device is offline, and -EIO for any other
 * failure.
 */
static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
{
        unsigned char cmd[6] = { START_STOP };        /* START_VALID */
        struct scsi_device *sdp = sdkp->device;
        struct scsi_sense_hdr sshdr;
        int res;

        if (start)
                cmd[4] |= 1;        /* START */

        /* Active or Standby */
        if (sdp->start_stop_pwr_cond)
                cmd[4] |= (start ? 1 : 3) << 4;

        if (!scsi_device_online(sdp))
                return -ENODEV;

        res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr,
                        SD_TIMEOUT, sdkp->max_retries, 0, RQF_PM, NULL);
        if (!res)
                return 0;

        sd_print_result(sdkp, "Start/Stop Unit failed", res);
        if (driver_byte(res) == DRIVER_SENSE)
                sd_print_sense_hdr(sdkp, &sshdr);

        /* 0x3a is medium not present */
        if (scsi_sense_valid(&sshdr) && sshdr.asc == 0x3a)
                return 0;

        /* SCSI error codes must not go to the generic layer */
        return -EIO;
}

/*
 * Flush the device's write cache by sending a SYNCHRONIZE CACHE command
 * through the normal SCSI command structure and waiting for it to
 * complete, then stop the disk if the driver manages start/stop for it.
 * The stop is skipped while the system is restarting.
 */
static void sd_shutdown(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);

        /* No drvdata (this can happen) or already runtime suspended. */
        if (!sdkp || pm_runtime_suspended(dev))
                return;

        if (sdkp->WCE && sdkp->media_present) {
                sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
                sd_sync_cache(sdkp, NULL);
        }

        if (system_state == SYSTEM_RESTART ||
            !sdkp->device->manage_start_stop)
                return;

        sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
        sd_start_stop_device(sdkp, 0);
}

/*
 * sd_suspend_common - flush the cache and stop the disk for suspend
 * @dev: device being suspended
 * @ignore_stop_errors: when true, a failure to stop the disk is not reported
 *
 * Returns 0 on success, when there is no scsi_disk attached, when the
 * device is offline, or when the device rejects the cache flush with
 * ILLEGAL REQUEST.  Other cache flush errors are returned as-is, as are
 * stop errors unless @ignore_stop_errors is set.
 */
static int sd_suspend_common(struct device *dev, bool ignore_stop_errors)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        struct scsi_sense_hdr sshdr;
        int rc;

        if (!sdkp)        /* E.g.: runtime suspend following sd_remove() */
                return 0;

        if (sdkp->WCE && sdkp->media_present) {
                sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
                rc = sd_sync_cache(sdkp, &sshdr);

                /* ignore OFFLINE device */
                if (rc == -ENODEV)
                        return 0;

                /*
                 * sshdr.sense_key == ILLEGAL_REQUEST means this drive
                 * doesn't support sync. There's not much to do and
                 * suspend shouldn't fail.  Anything else is a real error.
                 */
                if (rc && !(scsi_sense_valid(&sshdr) &&
                            sshdr.sense_key == ILLEGAL_REQUEST))
                        return rc;
        }

        if (!sdkp->device->manage_start_stop)
                return 0;

        sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
        rc = sd_start_stop_device(sdkp, 0);

        /* an error is not worth aborting a system sleep */
        return ignore_stop_errors ? 0 : rc;
}

/*
 * Suspend variant that ignores errors from stopping the disk
 * (presumably the system sleep callback, going by its name).
 */
static int sd_suspend_system(struct device *dev)
{
        return sd_suspend_common(dev, true);
}

/*
 * Suspend variant that reports errors from stopping the disk to the caller
 * (presumably the runtime PM callback, going by its name).
 */
static int sd_suspend_runtime(struct device *dev)
{
        return sd_suspend_common(dev, false);
}

/*
 * sd_resume - start the disk again after a suspend
 * @dev: device being resumed
 *
 * Only acts when the driver manages start/stop for the device.  After a
 * successful start the Opal device is asked to unlock from suspend.
 *
 * Returns 0 when there is nothing to do or the start succeeded, otherwise
 * the error from sd_start_stop_device().
 */
static int sd_resume(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        int rc;

        /* No sdkp, e.g.: runtime resume at the start of sd_probe() */
        if (!sdkp || !sdkp->device->manage_start_stop)
                return 0;

        sd_printk(KERN_NOTICE, sdkp, "Starting disk\n");
        rc = sd_start_stop_device(sdkp, 1);
        if (rc)
                return rc;

        opal_unlock_from_suspend(sdkp->opal_dev);
        return 0;
}

/**
 *        init_sd - entry point for this driver (both when built in or when
 *        a module).
 *
 *        Registers the sd block majors and their probe regions, the disk
 *        class, the extended CDB cache and pools, and finally the driver
 *        itself with the scsi mid-level.
 *
 *        Returns 0 on success, -ENODEV if no block major could be
 *        registered, -ENOMEM if a cache or pool cannot be created, or the
 *        error from class_register() / scsi_register_driver().  On failure
 *        everything registered so far is undone.
 **/
static int __init init_sd(void)
{
        int majors = 0, i, err;

        SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));

        /* Failing to get an individual major is tolerated. */
        for (i = 0; i < SD_MAJORS; i++) {
                if (register_blkdev(sd_major(i), "sd") != 0)
                        continue;
                majors++;
                blk_register_region(sd_major(i), SD_MINORS, NULL,
                                    sd_default_probe, NULL, NULL);
        }

        if (!majors)
                return -ENODEV;

        err = class_register(&sd_disk_class);
        if (err)
                goto err_out;

        sd_cdb_cache = kmem_cache_create("sd_ext_cdb", SD_EXT_CDB_SIZE,
                                         0, 0, NULL);
        if (!sd_cdb_cache) {
                printk(KERN_ERR "sd: can't init extended cdb cache\n");
                err = -ENOMEM;
                goto err_out_class;
        }

        sd_cdb_pool = mempool_create_slab_pool(SD_MEMPOOL_SIZE, sd_cdb_cache);
        if (!sd_cdb_pool) {
                printk(KERN_ERR "sd: can't init extended cdb pool\n");
                err = -ENOMEM;
                goto err_out_cache;
        }

        sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0);
        if (!sd_page_pool) {
                printk(KERN_ERR "sd: can't init discard page pool\n");
                err = -ENOMEM;
                goto err_out_ppool;
        }

        err = scsi_register_driver(&sd_template.gendrv);
        if (err)
                goto err_out_driver;

        return 0;

err_out_driver:
        mempool_destroy(sd_page_pool);

err_out_ppool:
        mempool_destroy(sd_cdb_pool);

err_out_cache:
        kmem_cache_destroy(sd_cdb_cache);

err_out_class:
        class_unregister(&sd_disk_class);
err_out:
        /*
         * Mirror exit_sd(): drop the probe regions as well as the majors.
         * Leaving a region behind would keep a pointer to
         * sd_default_probe registered after a failed initialization.
         */
        for (i = 0; i < SD_MAJORS; i++) {
                blk_unregister_region(sd_major(i), SD_MINORS);
                unregister_blkdev(sd_major(i), "sd");
        }
        return err;
}

/**
 *        exit_sd - exit point for this driver (when it is a module).
 *
 *        Note: this function unregisters this driver from the scsi mid-level,
 *        then releases the pools, cache, class, probe regions and block
 *        majors that init_sd() set up.
 **/
static void __exit exit_sd(void)
{
        int i;

        SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));

        /* Unregister the driver first, before its pools are destroyed. */
        scsi_unregister_driver(&sd_template.gendrv);
        mempool_destroy(sd_cdb_pool);
        mempool_destroy(sd_page_pool);
        kmem_cache_destroy(sd_cdb_cache);

        class_unregister(&sd_disk_class);

        /* Every possible major is unregistered, along with its probe region. */
        for (i = 0; i < SD_MAJORS; i++) {
                blk_unregister_region(sd_major(i), SD_MINORS);
                unregister_blkdev(sd_major(i), "sd");
        }
}

/* Hook init_sd() and exit_sd() up as the driver's entry and exit points. */
module_init(init_sd);
module_exit(exit_sd);

/*
 * sd_print_sense_hdr - log a decoded sense header for a disk
 * @sdkp: disk the sense data belongs to
 * @sshdr: sense header to print
 *
 * The disk name is passed along when a gendisk is attached, NULL otherwise.
 */
void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
{
        const char *name = NULL;

        if (sdkp->disk)
                name = sdkp->disk->disk_name;

        scsi_print_sense_hdr(sdkp->device, name, sshdr);
}

/*
 * sd_print_result - log the host and driver bytes of a SCSI result
 * @sdkp: disk the result belongs to
 * @msg: text printed in front of the result
 * @result: SCSI result value to decode
 *
 * Symbolic names are printed when at least one of the two bytes has one
 * (the other is shown as "invalid"); otherwise both are printed in hex.
 */
void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result)
{
        const char *host_str = scsi_hostbyte_string(result);
        const char *drv_str = scsi_driverbyte_string(result);

        if (!host_str && !drv_str) {
                sd_printk(KERN_INFO, sdkp,
                          "%s: Result: hostbyte=0x%02x driverbyte=0x%02x\n",
                          msg, host_byte(result), driver_byte(result));
                return;
        }

        if (!host_str)
                host_str = "invalid";
        if (!drv_str)
                drv_str = "invalid";

        sd_printk(KERN_INFO, sdkp,
                  "%s: Result: hostbyte=%s driverbyte=%s\n", msg,
                  host_str, drv_str);
}















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM 9p

#if !defined(_TRACE_9P_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_9P_H

#include <linux/tracepoint.h>

/*
 * List of 9P message-type enumerators, each paired with the string used to
 * print it.  Every entry but the last is wrapped in EM(); the last one uses
 * EMe().  The list expands to nothing useful on its own: EM()/EMe() must be
 * defined by the user of the list before P9_MSG_T is expanded.
 */
#define P9_MSG_T                                                        \
                EM( P9_TLERROR,                "P9_TLERROR" )                        \
                EM( P9_RLERROR,                "P9_RLERROR" )                        \
                EM( P9_TSTATFS,                "P9_TSTATFS" )                        \
                EM( P9_RSTATFS,                "P9_RSTATFS" )                        \
                EM( P9_TLOPEN,                "P9_TLOPEN" )                        \
                EM( P9_RLOPEN,                "P9_RLOPEN" )                        \
                EM( P9_TLCREATE,        "P9_TLCREATE" )                        \
                EM( P9_RLCREATE,        "P9_RLCREATE" )                        \
                EM( P9_TSYMLINK,        "P9_TSYMLINK" )                        \
                EM( P9_RSYMLINK,        "P9_RSYMLINK" )                        \
                EM( P9_TMKNOD,                "P9_TMKNOD" )                        \
                EM( P9_RMKNOD,                "P9_RMKNOD" )                        \
                EM( P9_TRENAME,                "P9_TRENAME" )                        \
                EM( P9_RRENAME,                "P9_RRENAME" )                        \
                EM( P9_TREADLINK,        "P9_TREADLINK" )                \
                EM( P9_RREADLINK,        "P9_RREADLINK" )                \
                EM( P9_TGETATTR,        "P9_TGETATTR" )                        \
                EM( P9_RGETATTR,        "P9_RGETATTR" )                        \
                EM( P9_TSETATTR,        "P9_TSETATTR" )                        \
                EM( P9_RSETATTR,        "P9_RSETATTR" )                        \
                EM( P9_TXATTRWALK,        "P9_TXATTRWALK" )                \
                EM( P9_RXATTRWALK,        "P9_RXATTRWALK" )                \
                EM( P9_TXATTRCREATE,        "P9_TXATTRCREATE" )                \
                EM( P9_RXATTRCREATE,        "P9_RXATTRCREATE" )                \
                EM( P9_TREADDIR,        "P9_TREADDIR" )                        \
                EM( P9_RREADDIR,        "P9_RREADDIR" )                        \
                EM( P9_TFSYNC,                "P9_TFSYNC" )                        \
                EM( P9_RFSYNC,                "P9_RFSYNC" )                        \
                EM( P9_TLOCK,                "P9_TLOCK" )                        \
                EM( P9_RLOCK,                "P9_RLOCK" )                        \
                EM( P9_TGETLOCK,        "P9_TGETLOCK" )                        \
                EM( P9_RGETLOCK,        "P9_RGETLOCK" )                        \
                EM( P9_TLINK,                "P9_TLINK" )                        \
                EM( P9_RLINK,                "P9_RLINK" )                        \
                EM( P9_TMKDIR,                "P9_TMKDIR" )                        \
                EM( P9_RMKDIR,                "P9_RMKDIR" )                        \
                EM( P9_TRENAMEAT,        "P9_TRENAMEAT" )                \
                EM( P9_RRENAMEAT,        "P9_RRENAMEAT" )                \
                EM( P9_TUNLINKAT,        "P9_TUNLINKAT" )                \
                EM( P9_RUNLINKAT,        "P9_RUNLINKAT" )                \
                EM( P9_TVERSION,        "P9_TVERSION" )                        \
                EM( P9_RVERSION,        "P9_RVERSION" )                        \
                EM( P9_TAUTH,                "P9_TAUTH" )                        \
                EM( P9_RAUTH,                "P9_RAUTH" )                        \
                EM( P9_TATTACH,                "P9_TATTACH" )                        \
                EM( P9_RATTACH,                "P9_RATTACH" )                        \
                EM( P9_TERROR,                "P9_TERROR" )                        \
                EM( P9_RERROR,                "P9_RERROR" )                        \
                EM( P9_TFLUSH,                "P9_TFLUSH" )                        \
                EM( P9_RFLUSH,                "P9_RFLUSH" )                        \
                EM( P9_TWALK,                "P9_TWALK" )                        \
                EM( P9_RWALK,                "P9_RWALK" )                        \
                EM( P9_TOPEN,                "P9_TOPEN" )                        \
                EM( P9_ROPEN,                "P9_ROPEN" )                        \
                EM( P9_TCREATE,                "P9_TCREATE" )                        \
                EM( P9_RCREATE,                "P9_RCREATE" )                        \
                EM( P9_TREAD,                "P9_TREAD" )                        \
                EM( P9_RREAD,                "P9_RREAD" )                        \
                EM( P9_TWRITE,                "P9_TWRITE" )                        \
                EM( P9_RWRITE,                "P9_RWRITE" )                        \
                EM( P9_TCLUNK,                "P9_TCLUNK" )                        \
                EM( P9_RCLUNK,                "P9_RCLUNK" )                        \
                EM( P9_TREMOVE,                "P9_TREMOVE" )                        \
                EM( P9_RREMOVE,                "P9_RREMOVE" )                        \
                EM( P9_TSTAT,                "P9_TSTAT" )                        \
                EM( P9_RSTAT,                "P9_RSTAT" )                        \
                EM( P9_TWSTAT,                "P9_TWSTAT" )                        \
                EMe(P9_RWSTAT,                "P9_RWSTAT" )

/*
 * First expansion of P9_MSG_T: define EM() and EMe() so that every list
 * entry becomes a TRACE_DEFINE_ENUM() statement, which exports the enums to
 * userspace.  Only the enumerator (a) is used here; the string (b) is
 * ignored.
 */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)        TRACE_DEFINE_ENUM(a);

P9_MSG_T

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.  Each entry becomes a { value, name }
 * initializer; EM() appends a trailing comma and EMe() (the last entry)
 * does not.
 */
#undef EM
#undef EMe
#define EM(a, b)        { a, b },
#define EMe(a, b)        { a, b }

/*
 * show_9p_op - render a message type via __print_symbolic(), using the
 * { value, name } pairs that P9_MSG_T expands to under the EM()/EMe()
 * definitions above.
 */
#define show_9p_op(type)                                                \
        __print_symbolic(type, P9_MSG_T)

/*
 * 9p_client_req - tracepoint recording a client pointer, a message type and
 * a tag.
 *
 * @clnt: client; only the pointer value is stored
 * @type: message type, stored as a __u8 and printed through show_9p_op()
 * @tag:  tag, stored as a __u32
 *
 * The client pointer is printed with %lu, so it is cast to unsigned long to
 * match the conversion specifier (as 9p_protocol_dump does).
 */
TRACE_EVENT(9p_client_req,
            TP_PROTO(struct p9_client *clnt, int8_t type, int tag),

            TP_ARGS(clnt, type, tag),

            TP_STRUCT__entry(
                    __field(    void *,                clnt                             )
                    __field(        __u8,                type                             )
                    __field(        __u32,                tag                             )
                    ),

            TP_fast_assign(
                    __entry->clnt    =  clnt;
                    __entry->type    =  type;
                    __entry->tag     =  tag;
                    ),

            TP_printk("client %lu request %s tag  %d",
                    (unsigned long)__entry->clnt, show_9p_op(__entry->type),
                    __entry->tag)
 );

/*
 * 9p_client_res - tracepoint recording a client pointer, a message type, a
 * tag and an error value.
 *
 * @clnt: client; only the pointer value is stored
 * @type: message type, stored as a __u8 and printed through show_9p_op()
 * @tag:  tag, stored as a __u32
 * @err:  error value, stored as a __u32 and printed with %d
 *
 * The client pointer is printed with %lu, so it is cast to unsigned long to
 * match the conversion specifier (as 9p_protocol_dump does).
 */
TRACE_EVENT(9p_client_res,
            TP_PROTO(struct p9_client *clnt, int8_t type, int tag, int err),

            TP_ARGS(clnt, type, tag, err),

            TP_STRUCT__entry(
                    __field(    void *,                clnt                             )
                    __field(        __u8,                type                             )
                    __field(        __u32,                tag                             )
                    __field(        __u32,                err                             )
                    ),

            TP_fast_assign(
                    __entry->clnt    =  clnt;
                    __entry->type    =  type;
                    __entry->tag     =  tag;
                    __entry->err     =  err;
                    ),

            TP_printk("client %lu response %s tag  %d err %d",
                      (unsigned long)__entry->clnt, show_9p_op(__entry->type),
                      __entry->tag, __entry->err)
);

/* dump 32 bytes of protocol data */
#define P9_PROTO_DUMP_SZ 32
/*
 * 9p_protocol_dump - tracepoint recording the client pointer, the id and tag
 * of @pdu, and the first P9_PROTO_DUMP_SZ bytes found at pdu->sdata.
 *
 * The captured bytes are printed as two rows of 16 hex bytes, each row
 * prefixed with its offset (0 and 16).
 *
 * NOTE(review): the copy below always reads P9_PROTO_DUMP_SZ bytes from
 * pdu->sdata, whatever amount of data the pdu actually holds -- confirm that
 * every buffer passed here is at least that large.
 */
TRACE_EVENT(9p_protocol_dump,
            TP_PROTO(struct p9_client *clnt, struct p9_fcall *pdu),

            TP_ARGS(clnt, pdu),

            TP_STRUCT__entry(
                    __field(        void *,                clnt                                )
                    __field(        __u8,                type                                )
                    __field(        __u16,                tag                                )
                    __array(        unsigned char,        line,        P9_PROTO_DUMP_SZ        )
                    ),

            TP_fast_assign(
                    __entry->clnt   =  clnt;
                    __entry->type   =  pdu->id;
                    __entry->tag    =  pdu->tag;
                    memcpy(__entry->line, pdu->sdata, P9_PROTO_DUMP_SZ);
                    ),
            TP_printk("clnt %lu %s(tag = %d)\n%.3x: %16ph\n%.3x: %16ph\n",
                      (unsigned long)__entry->clnt, show_9p_op(__entry->type),
                      __entry->tag, 0, __entry->line, 16, __entry->line + 16)
 );

#endif /* _TRACE_9P_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





































    6 




    6 






    1 




























































    6 









    6 














    6 





    6 













    6 








































































    6 











    6 




























    6 




















    6 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
 * which are designed to protect kernel memory from needless exposure
 * and overwrite under many unintended conditions. This code is based
 * on PAX_USERCOPY, which is:
 *
 * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
 * Security Inc.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>

/*
 * Classify the range [obj, obj + len) against the current task's stack
 * (and, where the architecture supports it, against its stack frames).
 *
 * Returns:
 *        NOT_STACK: the range does not touch the stack at all
 *        BAD_STACK: the range only partially lies on the stack
 *        GOOD_STACK: the range is fully on the stack and
 *                    arch_within_stack_frames() returned 0
 *        otherwise: whatever non-zero verdict arch_within_stack_frames()
 *                   produced (e.g. GOOD_FRAME or BAD_STACK)
 */
static noinline int check_stack_object(const void *obj, unsigned long len)
{
        const void * const stack_lo = task_stack_page(current);
        const void * const stack_hi = stack_lo + THREAD_SIZE;
        int frame_verdict;

        /* Entirely below or entirely above the stack: not a stack object. */
        if (obj + len <= stack_lo || obj >= stack_hi)
                return NOT_STACK;

        /*
         * At this point some part of the object is on the stack, so any
         * part sticking out on either side means a partial overlap.
         */
        if (obj < stack_lo || obj + len > stack_hi)
                return BAD_STACK;

        /* Let the architecture judge the frame; 0 means "no opinion". */
        frame_verdict = arch_within_stack_frames(stack_lo, stack_hi, obj, len);

        return frame_verdict ? frame_verdict : GOOD_STACK;
}

/*
 * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found
 * an unexpected state during a copy_from_user() or copy_to_user() call.
 * There are several checks being performed on the buffer by the
 * __check_object_size() function. Normal stack buffer usage should never
 * trip the checks, and kernel text addressing will always trip the check.
 * For cache objects, it is checking that only the whitelisted range of
 * bytes for a given cache is being accessed (via the cache's usersize and
 * useroffset fields). To adjust a cache whitelist, use the usercopy-aware
 * kmem_cache_create_usercopy() function to create the cache (and
 * carefully audit the whitelist range).
 */
/*
 * Emit a one-time warning describing a suspicious usercopy.
 *
 * @name:    what kind of object was hit; "unknown?!" is printed when NULL
 * @detail:  optional extra text, printed in single quotes when non-NULL
 * @to_user: true for a copy towards userspace (reported as "exposure"),
 *           false for a copy from userspace (reported as "overwrite")
 * @offset:  offset reported in the message
 * @len:     size reported in the message
 */
void usercopy_warn(const char *name, const char *detail, bool to_user,
                   unsigned long offset, unsigned long len)
{
        const char *kind = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *object = name ? name : "unknown?!";
        const char *quote_open = detail ? " '" : "";
        const char *quoted = detail ? detail : "";
        const char *quote_close = detail ? "'" : "";

        WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                  kind, direction, object,
                  quote_open, quoted, quote_close,
                  offset, len);
}

/*
 * Report a rejected usercopy at emergency log level and then BUG().
 *
 * @name:    what kind of object was hit; "unknown?!" is printed when NULL
 * @detail:  optional extra text, printed in single quotes when non-NULL
 * @to_user: true for a copy towards userspace (reported as "exposure"),
 *           false for a copy from userspace (reported as "overwrite")
 * @offset:  offset reported in the message
 * @len:     size reported in the message
 *
 * Does not return.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len)
{
        const char *kind = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *object = name ? name : "unknown?!";
        const char *quote_open = detail ? " '" : "";
        const char *quoted = detail ? detail : "";
        const char *quote_close = detail ? "'" : "";

        pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 kind, direction, object,
                 quote_open, quoted, quote_close,
                 offset, len);

        /*
         * do_group_exit() would arguably be the nicer way out, but BUG()
         * is what hooks all the lock-breaking and per-arch Oops code, so
         * it is used here instead.
         */
        BUG();
}

/*
 * Returns true if any portion of [ptr, ptr + n) overlaps with [low, high).
 *
 * Two half-open ranges intersect exactly when each one starts before the
 * other one ends.
 */
static bool overlaps(const unsigned long ptr, unsigned long n,
                     unsigned long low, unsigned long high)
{
        const unsigned long end = ptr + n;

        return ptr < high && end > low;
}

/*
 * Abort the copy if [ptr, ptr + n) touches the kernel text, either through
 * its primary mapping [_stext, _etext) or through its linear-map alias.
 */
static inline void check_kernel_text_object(const unsigned long ptr,
                                            unsigned long n, bool to_user)
{
        unsigned long text_start = (unsigned long)_stext;
        unsigned long text_end = (unsigned long)_etext;
        unsigned long alias_start, alias_end;

        if (overlaps(ptr, n, text_start, text_end))
                usercopy_abort("kernel text", NULL, to_user,
                               ptr - text_start, n);

        /*
         * On some architectures the kernel image is reachable through a
         * second virtual address range: there is a separate linear mapping
         * of physical memory and __pa() is not simply the inverse of
         * __va().  lm_alias() yields that other address; if it is the same
         * as the primary one there is nothing further to check.
         */
        alias_start = (unsigned long)lm_alias(text_start);
        if (alias_start != text_start) {
                /* A distinct alias exists: apply the same test to it. */
                alias_end = (unsigned long)lm_alias(text_end);
                if (overlaps(ptr, n, alias_start, alias_end))
                        usercopy_abort("linear kernel text", NULL, to_user,
                                       ptr - alias_start, n);
        }
}

/*
 * Abort the copy for addresses that can never be valid: a range whose last
 * byte wraps around the end of the address space, or a pointer for which
 * ZERO_OR_NULL_PTR() is true.
 */
static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
                                       bool to_user)
{
        const unsigned long last_byte = ptr + (n - 1);

        /* The last byte sits below the first one: the range wrapped. */
        if (last_byte < ptr)
                usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

        /* NULL pointers and zero-size allocations are rejected too. */
        if (ZERO_OR_NULL_PTR(ptr))
                usercopy_abort("null address", NULL, to_user, ptr, n);
}

/*
 * Checks for allocs that are marked in some way as spanning multiple pages.
 *
 * @ptr:     start of the object
 * @n:       size of the object in bytes
 * @page:    page handed in by the caller for @ptr (check_heap_object() passes
 *           the compound head page)
 * @to_user: direction of the copy; only forwarded to usercopy_abort(), except
 *           for the rodata case below where writes are rejected
 *
 * The whole body is compiled only with CONFIG_HARDENED_USERCOPY_PAGESPAN;
 * without it this function does nothing.
 */
static inline void check_page_span(const void *ptr, unsigned long n,
                                   struct page *page, bool to_user)
{
#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
        /* Address of the last byte of the object (inclusive end). */
        const void *end = ptr + n - 1;
        struct page *endpage;
        bool is_reserved, is_cma;

        /*
         * Sometimes the kernel data regions are not marked Reserved (see
         * check below). And sometimes [_sdata,_edata) does not cover
         * rodata and/or bss, so check each range explicitly.
         */

        /*
         * Allow reads of kernel rodata region (if not marked as Reserved);
         * a copy in the other direction (!to_user) into rodata is aborted.
         */
        if (ptr >= (const void *)__start_rodata &&
            end <= (const void *)__end_rodata) {
                if (!to_user)
                        usercopy_abort("rodata", NULL, to_user, 0, n);
                return;
        }

        /* Allow kernel data region (if not marked as Reserved). */
        if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
                return;

        /* Allow kernel bss region (if not marked as Reserved). */
        if (ptr >= (const void *)__bss_start &&
            end <= (const void *)__bss_stop)
                return;

        /* Is the object wholly within one base page? */
        if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
                   ((unsigned long)end & (unsigned long)PAGE_MASK)))
                return;

        /* Allow if fully inside the same compound (__GFP_COMP) page. */
        endpage = virt_to_head_page(end);
        if (likely(endpage == page))
                return;

        /*
         * Allow if range is entirely either Reserved (i.e. special or
         * device memory), or CMA. Otherwise, reject since the object spans
         * several independently allocated pages.
         */
        is_reserved = PageReserved(page);
        is_cma = is_migrate_cma_page(page);
        if (!is_reserved && !is_cma)
                usercopy_abort("spans multiple pages", NULL, to_user, 0, n);

        /*
         * The first page is Reserved and/or CMA: walk the remaining pages
         * and abort as soon as one of them does not share that property.
         */
        for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
                page = virt_to_head_page(ptr);
                if (is_reserved && !PageReserved(page))
                        usercopy_abort("spans Reserved and non-Reserved pages",
                                       NULL, to_user, 0, n);
                if (is_cma && !is_migrate_cma_page(page))
                        usercopy_abort("spans CMA and non-CMA pages", NULL,
                                       to_user, 0, n);
        }
#endif
}

/*
 * Validate an object that lives in page-backed memory.
 *
 * Addresses for which virt_addr_valid() fails are skipped.  Slab pages are
 * handed to the slab allocator's own check; every other page goes through
 * the page-span check.
 */
static inline void check_heap_object(const void *ptr, unsigned long n,
                                     bool to_user)
{
        struct page *head;

        if (!virt_addr_valid(ptr))
                return;

        /*
         * A highmem-aware virt_to_head_page(): with CONFIG_HIGHMEM=y,
         * kmap_to_page() returns either the highmem page or falls back to
         * virt_to_page().
         */
        head = compound_head(kmap_to_page((void *)ptr));

        if (!PageSlab(head)) {
                /* Not slab: make sure the object stays within its pages. */
                check_page_span(ptr, n, head, to_user);
                return;
        }

        /* Slab object: let the allocator check flags and size. */
        __check_heap_object(ptr, n, head, to_user);
}

/*
 * Static key, false by default.  Once enabled, __check_object_size() returns
 * before performing any check; set_hardened_usercopy() enables it when the
 * checks were turned off.
 */
static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);

/*
 * Entry point of the hardened usercopy checks for [ptr, ptr + n).
 *
 * Unless the checks are bypassed or @n is zero, the object must:
 * - not be a bogus address (wrapping, NULL or zero-size allocation)
 * - if it touches the process stack, be fully contained by the stack
 *   (or by a stack frame, when that can be checked)
 * - pass the heap checks (slab object / whitelist area, or page span)
 * - not overlap the kernel text
 *
 * A violation ends in usercopy_abort().
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
        int stack_verdict;

        if (static_branch_unlikely(&bypass_usercopy_checks))
                return;

        /* A zero-length copy cannot touch anything. */
        if (!n)
                return;

        /* Weed out invalid addresses first. */
        check_bogus_address((const unsigned long)ptr, n, to_user);

        stack_verdict = check_stack_object(ptr, n);

        /*
         * The object sits in the right frame (when frames can be checked)
         * or at least fully on the process stack: nothing more to verify.
         */
        if (stack_verdict == GOOD_FRAME || stack_verdict == GOOD_STACK)
                return;

        /* Anything other than "not on the stack at all" is a bad object. */
        if (stack_verdict != NOT_STACK)
                usercopy_abort("process stack", NULL, to_user, 0, n);

        /* Off-stack object: run the heap checks ... */
        check_heap_object(ptr, n, to_user);

        /* ... and make sure kernel text is never exposed or overwritten. */
        check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

/* Checks are on unless "hardened_usercopy=" on the command line says otherwise. */
static bool enable_checks __initdata = true;

/*
 * Handler for the "hardened_usercopy=" boot parameter: parse @str as a
 * boolean into enable_checks and warn when it cannot be parsed.
 * Always returns 1.
 */
static int __init parse_hardened_usercopy(char *str)
{
        int err = strtobool(str, &enable_checks);

        if (err)
                pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
                        str);

        return 1;
}

__setup("hardened_usercopy=", parse_hardened_usercopy);

/*
 * Late initcall: when the checks were disabled, enable the
 * bypass_usercopy_checks static key.  Always returns 1.
 */
static int __init set_hardened_usercopy(void)
{
        if (!enable_checks)
                static_branch_enable(&bypass_usercopy_checks);

        return 1;
}

late_initcall(set_hardened_usercopy);




















































































































































































































































































































































































































    1 














































































    1 












    1 











































    1 











    1 





















































































































































































































































































































































    1 











    1 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions.
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

/*
 * css_task_iter flag bits.
 * NOTE(review): presumably the values accepted as @flags by
 * css_task_iter_start() -- confirm against its definition.
 */

/* walk only threadgroup leaders */
#define CSS_TASK_ITER_PROCS                (1U << 0)
/* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED                (1U << 1)

/* internal flags */
#define CSS_TASK_ITER_SKIPPED                (1U << 16)

/*
 * a css_task_iter should be treated as an opaque object
 *
 * It is the iterator state taken by css_task_iter_start(),
 * css_task_iter_next() and css_task_iter_end(), which are declared further
 * down in this header.  Users are not expected to touch the fields directly.
 */
struct css_task_iter {
        struct cgroup_subsys                *ss;
        unsigned int                        flags;

        struct list_head                *cset_pos;
        struct list_head                *cset_head;

        struct list_head                *tcset_pos;
        struct list_head                *tcset_head;

        struct list_head                *task_pos;

        struct list_head                *cur_tasks_head;
        struct css_set                        *cur_cset;
        struct css_set                        *cur_dcset;
        struct task_struct                *cur_task;
        struct list_head                iters_node;        /* css_set->task_iters */
};

/* Objects defined outside this header. */
extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;

/*
 * <linux/cgroup_subsys.h> is included twice with different definitions of
 * SUBSYS().  First pass: every SUBSYS(x) becomes an extern declaration of
 * "struct cgroup_subsys x_cgrp_subsys".
 */
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/*
 * Second pass: every SUBSYS(x) declares the two static keys
 * x_cgrp_subsys_enabled_key and x_cgrp_subsys_on_dfl_key, which the
 * cgroup_subsys_enabled() and cgroup_subsys_on_dfl() macros test.
 */
#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss must be the bare identifier of a subsystem object (of the form
 * x_cgrp_subsys): "_enabled_key" is token-pasted onto it to name the static
 * key that is tested with static_branch_likely().
 */
#define cgroup_subsys_enabled(ss)                                                \
        static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss must be the bare identifier of a subsystem object (of the form
 * x_cgrp_subsys): "_on_dfl_key" is token-pasted onto it to name the static
 * key that is tested with static_branch_likely().
 */
#define cgroup_subsys_on_dfl(ss)                                                \
        static_branch_likely(&ss ## _on_dfl_key)

/*
 * css and cgroup lookup helpers.  Prototypes only; the definitions are not
 * part of this header.
 */
bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

/* Obtain a cgroup from a path string or from a file descriptor. */
struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);

/*
 * Task attach/transfer helpers.  Prototypes only; the definitions are not
 * part of this header.  Every parameter is named so the prototypes document
 * themselves.
 * NOTE(review): @tsk is presumably the task being attached, with @from as
 * the reference task -- confirm against the definition.
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

/*
 * cftype registration and removal, plus file notification.  Prototypes
 * only; the definitions are not part of this header.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);

/* Path and statistics reporting; the int return values are defined by the implementations. */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

/*
 * Per-task hooks.  Each takes the task @p; the can_fork/cancel_fork/
 * post_fork variants additionally take the clone arguments @kargs.
 * Prototypes only; the definitions are not part of this header.
 */
void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

/* Subsystem initialization entry points. */
int cgroup_init_early(void);
int cgroup_init(void);

/*
 * Parse @input using @dec_shift and store the result in *@v.
 * NOTE(review): @dec_shift is presumably the number of decimal places kept
 * in the fixed-point result -- confirm against the definition.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

/*
 * Cursor-advancing helpers behind the css_for_each_child(),
 * css_for_each_descendant_pre() and css_for_each_descendant_post() macros
 * in this header.  The macros pass a NULL @pos to obtain the first element
 * and stop when NULL is returned.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

/*
 * Taskset cursor helpers used by cgroup_taskset_for_each() and
 * cgroup_taskset_for_each_leader(); both also report a css through
 * @dst_cssp.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

/* Explicit start/next/end iterator over tasks, driven by a css_task_iter. */
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * The macro expands to a for statement driven by css_next_child(): the
 * walk starts from a %NULL cursor and ends once %NULL is returned.  @pos
 * and @parent are evaluated on every step, so they should be free of side
 * effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * The address of @dst_css is handed to cgroup_taskset_first() and
 * cgroup_taskset_next() on every step, so @dst_css must be an lvalue of
 * type struct cgroup_subsys_state *.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * The walk itself is the same as cgroup_taskset_for_each(); the trailing
 * "if (...) ; else" turns the caller's loop body into the else branch, so
 * tasks that are not their own ->group_leader are skipped.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

/* Return the id stored in the kernfs node (@cgrp->kn) of @cgrp. */
static inline u64 cgroup_id(struct cgroup *cgrp)
{
        u64 id = cgrp->kn->id;

        return id;
}

/**
 * css_get - obtain a reference on the specified css
 * @css: target css
 *
 * Takes one more reference on @css->refcnt.  A css flagged %CSS_NO_REF is
 * not reference counted and is left untouched.  The caller must already
 * hold a reference.
 */
static inline void css_get(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}

/**
 * css_get_many - obtain references on the specified css
 * @css: target css
 * @n: number of references to get
 *
 * Takes @n references on @css->refcnt in one go.  A css flagged
 * %CSS_NO_REF is not reference counted and is left untouched.  The caller
 * must already hold a reference.
 */
static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}

/**
 * css_tryget - try to obtain a reference on the specified css
 * @css: target css
 *
 * Take a reference on @css unless its count has already dropped to zero
 * and it is being released.  Whether @css is online or offline makes no
 * difference.  The caller has to make sure @css is accessible but need not
 * hold a reference - RCU protected access is sufficient.
 *
 * A css flagged %CSS_NO_REF is not reference counted; the call then
 * succeeds without touching the refcnt.
 *
 * Returns %true if a reference was obtained; %false otherwise.
 */
static inline bool css_tryget(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}

/**
 * css_tryget_online - try to obtain a reference on the specified css if online
 * @css: target css
 *
 * Take a reference on @css only while it is online.  The caller has to
 * make sure @css is accessible but need not hold a reference - RCU
 * protected access is sufficient.
 *
 * A css flagged %CSS_NO_REF is not reference counted; the call then
 * succeeds without touching the refcnt.
 *
 * Returns %true if a reference was obtained; %false otherwise.
 */
static inline bool css_tryget_online(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Test whether @css is in the process of offlining or already offline.  In
 * most cases, ->css_online() and ->css_offline() callbacks should be
 * enough; however, the actual offline operations are RCU delayed and this
 * test returns %true also when @css is scheduled to be offlined.
 *
 * This is useful, for example, when the use case requires synchronous
 * behavior with respect to cgroup removal.  cgroup removal schedules css
 * offlining but the css can seem alive while the operation is being
 * delayed.  If the delay affects user visible semantics, this test can be
 * used to resolve the situation.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt);
}

/**
 * css_put - put a css reference
 * @css: target css
 *
 * Drop one reference on @css->refcnt, e.g. one taken via css_get() or
 * css_tryget_online().  A css flagged %CSS_NO_REF is not reference counted
 * and is left untouched.
 */
static inline void css_put(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}

/**
 * css_put_many - put css references
 * @css: target css
 * @n: number of references to put
 *
 * Drop @n references on @css->refcnt, e.g. ones taken via css_get() or
 * css_tryget_online().  A css flagged %CSS_NO_REF is not reference counted
 * and is left untouched.
 */
static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}

/* Take a reference on @cgrp by way of its embedded css (@cgrp->self). */
static inline void cgroup_get(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_get(self);
}

/*
 * Try to take a reference on @cgrp by way of its embedded css
 * (@cgrp->self); returns whatever css_tryget() reports.
 */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        return css_tryget(self);
}

/* Drop a reference on @cgrp by way of its embedded css (@cgrp->self). */
static inline void cgroup_put(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_put(self);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 *
 * With CONFIG_PROVE_RCU the dereference is accepted when any of the
 * following holds: rcu_read_lock_sched_held(), cgroup_mutex is held,
 * css_set_lock is held, @task has %PF_EXITING set, or @__c is true.
 * Without CONFIG_PROVE_RCU the macro is a plain rcu_dereference() and
 * @__c is not evaluated at all, so it must not have side effects.
 */
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * Expands to the @subsys_id'th entry of the subsys[] array in the css_set
 * obtained through task_css_set_check(); @subsys_id is not range checked
 * here.
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * Function form of task_css_set_check() with no extra access condition
 * (false is passed as the condition).  See task_css_set_check() for the
 * synchronization rules.
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Function form of task_css_check() with no extra access condition (false
 * is passed as the condition).  See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        struct cgroup_subsys_state *css = task_css_check(task, subsys_id, false);

        return css;
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Look up the css for the (@task, @subsys_id) pair under rcu_read_lock()
 * and take a reference on it with css_tryget(), repeating the lookup until
 * the reference is obtained.  The css handed back is therefore always
 * referenced, though it may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = task_css(task, subsys_id);
        /*
         * css_tryget_online() must not be used here: a task with
         * PF_EXITING set may remain attached to an offline css, and for
         * such a task css_tryget_online() would fail forever.
         */
        while (!likely(css_tryget(css))) {
                cpu_relax();
                css = task_css(task, subsys_id);
        }
        rcu_read_unlock();

        return css;
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Compare @task's css for @subsys_id against the one recorded in
 * init_css_set for the same subsystem.  The css pointer is only compared,
 * never dereferenced, and the lookup passes true as the access condition,
 * so this may be invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css = task_css_check(task, subsys_id, true);

        return css == init_css_set.subsys[subsys_id];
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/*
 * Return the cgroup whose embedded css is @cgrp->self.parent, or NULL when
 * @cgrp has no parent css.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *up = cgrp->self.parent;

        return up ? container_of(up, struct cgroup, self) : NULL;
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Returns %true when @cgrp is a descendant of @ancestor, and also when
 * @cgrp == @ancestor.  Cgroups on different roots, or a @cgrp that sits
 * above @ancestor's level, are never related; otherwise the id recorded in
 * @cgrp->ancestor_ids[] at @ancestor's level decides.  Safe to call as
 * long as @cgrp and @ancestor are accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        if (cgrp->root != ancestor->root)
                return false;

        if (cgrp->level < ancestor->level)
                return false;

        return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Climb from @cgrp towards the root via cgroup_parent() until a cgroup at
 * @ancestor_level is reached and return it; @cgrp itself is returned when
 * it already sits at that level.  Returns NULL if @cgrp is above
 * @ancestor_level or if the climb runs out of parents.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        struct cgroup *pos;

        if (cgrp->level < ancestor_level)
                return NULL;

        for (pos = cgrp; pos && pos->level > ancestor_level;
             pos = cgroup_parent(pos))
                ;

        return pos;
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
 * It follows all the same rules as cgroup_is_descendant, and only applies
 * to the default hierarchy.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        struct css_set *cset = task_css_set(task);

        return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}

/*
 * True when the sum of @cgrp's three nr_populated_* counters is non-zero.
 * The counters are read without any synchronization, so the result is
 * only a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* returns ino associated with a cgroup */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        return kernfs_ino(cgrp->kn);
}

/*
 * cft accessor for cftype->write() operations: the cftype is whatever is
 * stored in the ->priv field of the open file's kernfs node.
 */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        struct cftype *cft = of->kn->priv;

        return cft;
}

/* css counterpart of of_cft(); declared here and defined outside this header. */
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/*
 * cft accessor for cftype->seq_*() operations: @seq->private is treated as
 * the kernfs_open_file and handed to of_cft().
 */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/*
 * css accessor for cftype->seq_*() operations: @seq->private is treated as
 * the kernfs_open_file and handed to of_css().
 */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/*
 * Thin wrapper: forwards @buf and @buflen to kernfs_name() for @cgrp's
 * kernfs node and returns its result unchanged.
 */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        int ret = kernfs_name(cgrp->kn, buf, buflen);

        return ret;
}

/*
 * Thin wrapper: forwards @buf and @buflen to kernfs_path() for @cgrp's
 * kernfs node and returns its result unchanged.
 */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        int ret = kernfs_path(cgrp->kn, buf, buflen);

        return ret;
}

/* Thin wrapper: hands @cgrp's kernfs node to pr_cont_kernfs_name(). */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        pr_cont_kernfs_name(cgrp->kn);
}

/* Thin wrapper: hands @cgrp's kernfs node to pr_cont_kernfs_path(). */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        pr_cont_kernfs_path(cgrp->kn);
}

/*
 * Return the address of the psi_group embedded in @cgrp.  The
 * !CONFIG_CGROUPS stub of this function returns NULL instead.
 */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
        struct psi_group *psi = &cgrp->psi;

        return psi;
}

/*
 * Set current->no_cgroup_migration.  cgroup_kthread_ready() is the
 * counterpart that clears the flag again.
 */
static inline void cgroup_init_kthreadd(void)
{
        /*
         * kthreadd is inherited by all kthreads, keep it in the root so
         * that the new kthreads are guaranteed to stay in the root until
         * initialization is finished.
         */
        current->no_cgroup_migration = 1;
}

/*
 * Clear current->no_cgroup_migration, the flag that
 * cgroup_init_kthreadd() sets.
 */
static inline void cgroup_kthread_ready(void)
{
        /*
         * This kthread finished initialization.  The creator should have
         * set PF_NO_SETAFFINITY if this kthread should stay in the root.
         */
        current->no_cgroup_migration = 0;
}

/*
 * Declared here, defined outside this header.  The !CONFIG_CGROUPS branch
 * supplies an empty stub that leaves @buf untouched.
 */
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
#else /* !CONFIG_CGROUPS */

/*
 * Forward declarations so the stub signatures below compile without the
 * full cgroup type definitions.
 */
struct cgroup_subsys_state;
struct cgroup;

/*
 * Stubs for kernels built without CONFIG_CGROUPS.  Reference counting is
 * a no-op, cgroup_id() always reports 1, cgroup_attach_task_all() reports
 * success (0) and cgroupstats_build() fails with -EINVAL.
 */
static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/* Task lifecycle hooks: nothing to do; cgroup_can_fork() always allows (0). */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

/* Initialisation and kthread hooks: succeed / do nothing. */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

/* !CONFIG_CGROUPS stub: there is no hierarchy, so no cgroup has a parent. */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        return NULL;
}

/* !CONFIG_CGROUPS stub: no psi_group is available; callers get NULL. */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
        return NULL;
}

/* !CONFIG_CGROUPS stub: every task is reported as being under @ancestor. */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return true;
}

/* !CONFIG_CGROUPS stub: does nothing and leaves @buf unmodified. */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 *
 * Only the prototypes live here; the implementation is outside this
 * header.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(void);

/*
 * Basic resource stats.
 *
 * The cpuacct hooks are real functions with CONFIG_CGROUP_CPUACCT and
 * empty inline stubs otherwise, so the inline wrappers that follow can
 * call them unconditionally.
 */
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

/*
 * Out-of-line workers; cgroup_account_cputime() and
 * cgroup_account_cputime_field() call them only for cgroups that have a
 * parent.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Charge @delta_exec of CPU time consumed by @task.  The amount is always
 * forwarded to cpuacct_charge(), and additionally to
 * __cgroup_account_cputime() for @task's default-hierarchy cgroup when
 * that cgroup has a parent.
 *
 * NOTE(review): unlike cgroup_account_cputime_field(), no rcu_read_lock()
 * is taken around task_dfl_cgroup() here; presumably every caller already
 * runs in a context that satisfies task_css_set_check() - confirm against
 * the call sites.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *cgrp;

        cpuacct_charge(task, delta_exec);

        cgrp = task_dfl_cgroup(task);
        if (!cgroup_parent(cgrp))
                return;

        __cgroup_account_cputime(cgrp, delta_exec);
}

/*
 * Charge @delta_exec of CPU time of kind @index consumed by @task.  The
 * amount is always forwarded to cpuacct_account_field(), and additionally
 * to __cgroup_account_cputime_field() for @task's default-hierarchy cgroup
 * when that cgroup has a parent.  The cgroup lookup and the second call
 * run under rcu_read_lock().
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *cgrp;
        bool has_parent;

        cpuacct_account_field(task, index, delta_exec);

        rcu_read_lock();
        cgrp = task_dfl_cgroup(task);
        has_parent = cgroup_parent(cgrp) != NULL;
        if (has_parent)
                __cgroup_account_cputime_field(cgrp, index, delta_exec);
        rcu_read_unlock();
}

#else        /* !CONFIG_CGROUPS */

/* Without CONFIG_CGROUPS there is nothing to account; both hooks are no-ops. */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* !CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 *
 * With CONFIG_SOCK_CGROUP_DATA the three hooks below are real functions
 * defined outside this header; otherwise they are empty inline stubs.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/* Return the cgroup pointer stored in @skcd->cgroup. */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = skcd->cgroup;

        return cgrp;
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

/* No per-socket cgroup data in this configuration: all hooks are no-ops. */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

/*
 * A cgroup namespace.  The structure is defined regardless of
 * CONFIG_CGROUPS; lifetime is managed through get_cgroup_ns() and
 * put_cgroup_ns().
 */
struct cgroup_namespace {
        /* reference count; incremented by get_cgroup_ns(), dropped by put_cgroup_ns() */
        refcount_t                count;
        struct ns_common        ns;
        /* NOTE(review): presumably the user namespace that owns this one - confirm */
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        /* NOTE(review): presumably the css_set that acts as this namespace's root - confirm */
        struct css_set          *root_cset;
};

/* The initial cgroup namespace; defined outside this header. */
extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

/* Called by put_cgroup_ns() once the last reference has been dropped. */
void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

/* Like cgroup_path(), but takes an additional cgroup namespace argument. */
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/*
 * Without CONFIG_CGROUPS freeing is a no-op and copying a namespace just
 * hands back @old_ns unchanged.
 */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/* Take a reference on @ns by bumping @ns->count; a NULL @ns is ignored. */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        refcount_inc(&ns->count);
}

/*
 * Drop a reference on @ns and hand it to free_cgroup_ns() when that was
 * the last one; a NULL @ns is ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->count))
                free_cgroup_ns(ns);
}

#ifdef CONFIG_CGROUPS

/*
 * Freezer entry points; defined outside this header.  Only
 * cgroup_enter_frozen() and cgroup_leave_frozen() have stubs in the
 * !CONFIG_CGROUPS branch below.
 */
void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

static inline bool cgroup_task_freeze(struct task_struct *task)
{
        bool ret;

        if (task->flags & PF_KTHREAD)
                return false;

        rcu_read_lock();
        ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
        rcu_read_unlock();

        return ret;
}

/* Return the current value of @task->frozen. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        bool frozen = task->frozen;

        return frozen;
}

#else /* !CONFIG_CGROUPS */

/*
 * Without CONFIG_CGROUPS entering and leaving the frozen state is a no-op,
 * and no task is ever reported as needing to freeze or as frozen.
 */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
static inline bool cgroup_task_freeze(struct task_struct *task)
{
        return false;
}
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return false;
}

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp->bpf.refcnt. */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        percpu_ref_get(&cgrp->bpf.refcnt);
}

/* Drop a reference on @cgrp->bpf.refcnt. */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        percpu_ref_put(&cgrp->bpf.refcnt);
}

#else /* !CONFIG_CGROUP_BPF */

/* Without CONFIG_CGROUP_BPF there is no bpf refcount to manage. */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

#endif /* _LINUX_CGROUP_H */




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/signalfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */
#ifndef _LINUX_SIGNALFD_H
#define _LINUX_SIGNALFD_H

#include <uapi/linux/signalfd.h>
#include <linux/sched/signal.h>

#ifdef CONFIG_SIGNALFD

/*
 * Wake up anything sleeping on @tsk's sighand->signalfd_wqh, i.e. the
 * listening signalfd.  The wait queue is only touched when it has
 * waiters, which is treated as the unlikely case.  @sig is not used.
 */
static inline void signalfd_notify(struct task_struct *tsk, int sig)
{
        if (!unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
                return;

        wake_up(&tsk->sighand->signalfd_wqh);
}

/* Defined outside this header when CONFIG_SIGNALFD is enabled. */
extern void signalfd_cleanup(struct sighand_struct *sighand);

#else /* CONFIG_SIGNALFD */

/* Without CONFIG_SIGNALFD there is no signalfd to notify or clean up. */
static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

static inline void signalfd_cleanup(struct sighand_struct *sighand) { }

#endif /* CONFIG_SIGNALFD */

#endif /* _LINUX_SIGNALFD_H */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (C) 2000-2001 Qualcomm Incorporated
   Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
   Copyright (C) 2010 Google Inc.

   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

#ifndef __L2CAP_H
#define __L2CAP_H

#include <asm/unaligned.h>
#include <linux/atomic.h>

/* L2CAP defaults
 *
 * Plain integer defaults. Units are only stated where this header shows
 * them: the retransmission and monitor timeouts carry "seconds" notes for
 * values of 2000 and 12000 (i.e. milliseconds), and L2CAP_DEFAULT_ACK_TO is
 * fed to msecs_to_jiffies() by __set_ack_timer(), so it is milliseconds too.
 */
#define L2CAP_DEFAULT_MTU                672
#define L2CAP_DEFAULT_MIN_MTU                48
#define L2CAP_DEFAULT_FLUSH_TO                0xFFFF
#define L2CAP_EFS_DEFAULT_FLUSH_TO        0xFFFFFFFF
#define L2CAP_DEFAULT_TX_WINDOW                63
#define L2CAP_DEFAULT_EXT_WINDOW        0x3FFF
#define L2CAP_DEFAULT_MAX_TX                3
#define L2CAP_DEFAULT_RETRANS_TO        2000    /* 2 seconds */
#define L2CAP_DEFAULT_MONITOR_TO        12000   /* 12 seconds */
#define L2CAP_DEFAULT_MAX_PDU_SIZE        1492    /* Sized for AMP packet */
#define L2CAP_DEFAULT_ACK_TO                200     /* ms, see __set_ack_timer() */
#define L2CAP_DEFAULT_MAX_SDU_SIZE        0xFFFF
#define L2CAP_DEFAULT_SDU_ITIME                0xFFFFFFFF
#define L2CAP_DEFAULT_ACC_LAT                0xFFFFFFFF
#define L2CAP_BREDR_MAX_PAYLOAD                1019    /* 3-DH5 packet */
#define L2CAP_LE_MIN_MTU                23
#define L2CAP_ECRED_CONN_SCID_MAX        5

/*
 * Timeouts expressed in jiffies. Each macro expands to a msecs_to_jiffies()
 * call with the millisecond value shown, so it is evaluated at the point of
 * use and must not be treated as a preprocessor constant (e.g. in #if).
 */
#define L2CAP_DISC_TIMEOUT                msecs_to_jiffies(100)
#define L2CAP_DISC_REJ_TIMEOUT                msecs_to_jiffies(5000)
#define L2CAP_ENC_TIMEOUT                msecs_to_jiffies(5000)
#define L2CAP_CONN_TIMEOUT                msecs_to_jiffies(40000)
#define L2CAP_INFO_TIMEOUT                msecs_to_jiffies(4000)
#define L2CAP_MOVE_TIMEOUT                msecs_to_jiffies(4000)
#define L2CAP_MOVE_ERTX_TIMEOUT                msecs_to_jiffies(60000)
#define L2CAP_WAIT_ACK_POLL_PERIOD        msecs_to_jiffies(200)
#define L2CAP_WAIT_ACK_TIMEOUT                msecs_to_jiffies(10000)

#define L2CAP_A2MP_DEFAULT_MTU                670

/* L2CAP socket address */
struct sockaddr_l2 {
        sa_family_t        l2_family;
        __le16                l2_psm;
        bdaddr_t        l2_bdaddr;
        __le16                l2_cid;
        __u8                l2_bdaddr_type;
};

/* L2CAP socket options */
#define L2CAP_OPTIONS        0x01
struct l2cap_options {
        __u16 omtu;
        __u16 imtu;
        __u16 flush_to;
        __u8  mode;
        __u8  fcs;
        __u8  max_tx;
        __u16 txwin_size;
};

#define L2CAP_CONNINFO        0x02
struct l2cap_conninfo {
        __u16 hci_handle;
        __u8  dev_class[3];
};

#define L2CAP_LM        0x03
#define L2CAP_LM_MASTER                0x0001
#define L2CAP_LM_AUTH                0x0002
#define L2CAP_LM_ENCRYPT        0x0004
#define L2CAP_LM_TRUSTED        0x0008
#define L2CAP_LM_RELIABLE        0x0010
#define L2CAP_LM_SECURE                0x0020
#define L2CAP_LM_FIPS                0x0040

/* L2CAP command codes */
#define L2CAP_COMMAND_REJ        0x01
#define L2CAP_CONN_REQ                0x02
#define L2CAP_CONN_RSP                0x03
#define L2CAP_CONF_REQ                0x04
#define L2CAP_CONF_RSP                0x05
#define L2CAP_DISCONN_REQ        0x06
#define L2CAP_DISCONN_RSP        0x07
#define L2CAP_ECHO_REQ                0x08
#define L2CAP_ECHO_RSP                0x09
#define L2CAP_INFO_REQ                0x0a
#define L2CAP_INFO_RSP                0x0b
#define L2CAP_CREATE_CHAN_REQ        0x0c
#define L2CAP_CREATE_CHAN_RSP        0x0d
#define L2CAP_MOVE_CHAN_REQ        0x0e
#define L2CAP_MOVE_CHAN_RSP        0x0f
#define L2CAP_MOVE_CHAN_CFM        0x10
#define L2CAP_MOVE_CHAN_CFM_RSP        0x11
#define L2CAP_CONN_PARAM_UPDATE_REQ        0x12
#define L2CAP_CONN_PARAM_UPDATE_RSP        0x13
#define L2CAP_LE_CONN_REQ        0x14
#define L2CAP_LE_CONN_RSP        0x15
#define L2CAP_LE_CREDITS        0x16
#define L2CAP_ECRED_CONN_REQ        0x17
#define L2CAP_ECRED_CONN_RSP        0x18
#define L2CAP_ECRED_RECONF_REQ        0x19
#define L2CAP_ECRED_RECONF_RSP        0x1a

/* L2CAP extended feature mask */
#define L2CAP_FEAT_FLOWCTL        0x00000001
#define L2CAP_FEAT_RETRANS        0x00000002
#define L2CAP_FEAT_BIDIR_QOS        0x00000004
#define L2CAP_FEAT_ERTM                0x00000008
#define L2CAP_FEAT_STREAMING        0x00000010
#define L2CAP_FEAT_FCS                0x00000020
#define L2CAP_FEAT_EXT_FLOW        0x00000040
#define L2CAP_FEAT_FIXED_CHAN        0x00000080
#define L2CAP_FEAT_EXT_WINDOW        0x00000100
#define L2CAP_FEAT_UCD                0x00000200

/* L2CAP checksum option */
#define L2CAP_FCS_NONE                0x00
#define L2CAP_FCS_CRC16                0x01

/* L2CAP fixed channels */
#define L2CAP_FC_SIG_BREDR        0x02
#define L2CAP_FC_CONNLESS        0x04
#define L2CAP_FC_A2MP                0x08
#define L2CAP_FC_ATT                0x10
#define L2CAP_FC_SIG_LE                0x20
#define L2CAP_FC_SMP_LE                0x40
#define L2CAP_FC_SMP_BREDR        0x80

/* L2CAP Control Field bit masks */
#define L2CAP_CTRL_SAR                        0xC000
#define L2CAP_CTRL_REQSEQ                0x3F00
#define L2CAP_CTRL_TXSEQ                0x007E
#define L2CAP_CTRL_SUPERVISE                0x000C

#define L2CAP_CTRL_RETRANS                0x0080
#define L2CAP_CTRL_FINAL                0x0080
#define L2CAP_CTRL_POLL                        0x0010
#define L2CAP_CTRL_FRAME_TYPE                0x0001 /* I- or S-Frame */

#define L2CAP_CTRL_TXSEQ_SHIFT                1
#define L2CAP_CTRL_SUPER_SHIFT                2
#define L2CAP_CTRL_POLL_SHIFT                4
#define L2CAP_CTRL_FINAL_SHIFT                7
#define L2CAP_CTRL_REQSEQ_SHIFT                8
#define L2CAP_CTRL_SAR_SHIFT                14

/* L2CAP Extended Control Field bit mask */
#define L2CAP_EXT_CTRL_TXSEQ                0xFFFC0000
#define L2CAP_EXT_CTRL_SAR                0x00030000
#define L2CAP_EXT_CTRL_SUPERVISE        0x00030000
#define L2CAP_EXT_CTRL_REQSEQ                0x0000FFFC

#define L2CAP_EXT_CTRL_POLL                0x00040000
#define L2CAP_EXT_CTRL_FINAL                0x00000002
#define L2CAP_EXT_CTRL_FRAME_TYPE        0x00000001 /* I- or S-Frame */

#define L2CAP_EXT_CTRL_FINAL_SHIFT        1
#define L2CAP_EXT_CTRL_REQSEQ_SHIFT        2
#define L2CAP_EXT_CTRL_SAR_SHIFT        16
#define L2CAP_EXT_CTRL_SUPER_SHIFT        16
#define L2CAP_EXT_CTRL_POLL_SHIFT        18
#define L2CAP_EXT_CTRL_TXSEQ_SHIFT        18

/* L2CAP Supervisory Function */
#define L2CAP_SUPER_RR                0x00
#define L2CAP_SUPER_REJ                0x01
#define L2CAP_SUPER_RNR                0x02
#define L2CAP_SUPER_SREJ        0x03

/* L2CAP Segmentation and Reassembly */
#define L2CAP_SAR_UNSEGMENTED        0x00
#define L2CAP_SAR_START                0x01
#define L2CAP_SAR_END                0x02
#define L2CAP_SAR_CONTINUE        0x03

/* L2CAP Command rej. reasons */
#define L2CAP_REJ_NOT_UNDERSTOOD        0x0000
#define L2CAP_REJ_MTU_EXCEEDED                0x0001
#define L2CAP_REJ_INVALID_CID                0x0002

/* L2CAP structures */

/*
 * Basic L2CAP header: two little-endian 16-bit fields, packed so the struct
 * occupies exactly L2CAP_HDR_SIZE (4) bytes.
 */
struct l2cap_hdr {
        __le16     len;
        __le16     cid;
} __packed;
/*
 * Header sizes in bytes. L2CAP_ENH_HDR_SIZE and L2CAP_EXT_HDR_SIZE are
 * numerically L2CAP_HDR_SIZE plus L2CAP_ENH_CTRL_SIZE and
 * L2CAP_EXT_CTRL_SIZE respectively.
 */
#define L2CAP_HDR_SIZE                4
#define L2CAP_ENH_HDR_SIZE        6
#define L2CAP_EXT_HDR_SIZE        8

/* Sizes, in bytes, of the individual optional fields. */
#define L2CAP_FCS_SIZE                2
#define L2CAP_SDULEN_SIZE        2
#define L2CAP_PSMLEN_SIZE        2
#define L2CAP_ENH_CTRL_SIZE        2
#define L2CAP_EXT_CTRL_SIZE        4

/*
 * Header of a signalling command: one-byte code (see the L2CAP command
 * codes above), one-byte identifier and a little-endian 16-bit length.
 * Packed, so it occupies exactly L2CAP_CMD_HDR_SIZE (4) bytes.
 */
struct l2cap_cmd_hdr {
        __u8       code;
        __u8       ident;
        __le16     len;
} __packed;
#define L2CAP_CMD_HDR_SIZE        4

struct l2cap_cmd_rej_unk {
        __le16     reason;
} __packed;

struct l2cap_cmd_rej_mtu {
        __le16     reason;
        __le16     max_mtu;
} __packed;

struct l2cap_cmd_rej_cid {
        __le16     reason;
        __le16     scid;
        __le16     dcid;
} __packed;

struct l2cap_conn_req {
        __le16     psm;
        __le16     scid;
} __packed;

struct l2cap_conn_rsp {
        __le16     dcid;
        __le16     scid;
        __le16     result;
        __le16     status;
} __packed;

/* protocol/service multiplexer (PSM) */
#define L2CAP_PSM_SDP                0x0001
#define L2CAP_PSM_RFCOMM        0x0003
#define L2CAP_PSM_3DSP                0x0021
#define L2CAP_PSM_IPSP                0x0023 /* 6LoWPAN */

#define L2CAP_PSM_DYN_START        0x1001
#define L2CAP_PSM_DYN_END        0xffff
#define L2CAP_PSM_AUTO_END        0x10ff
#define L2CAP_PSM_LE_DYN_START  0x0080
#define L2CAP_PSM_LE_DYN_END        0x00ff

/* channel identifier */
#define L2CAP_CID_SIGNALING        0x0001
#define L2CAP_CID_CONN_LESS        0x0002
#define L2CAP_CID_A2MP                0x0003
#define L2CAP_CID_ATT                0x0004
#define L2CAP_CID_LE_SIGNALING        0x0005
#define L2CAP_CID_SMP                0x0006
#define L2CAP_CID_SMP_BREDR        0x0007
#define L2CAP_CID_DYN_START        0x0040
#define L2CAP_CID_DYN_END        0xffff
#define L2CAP_CID_LE_DYN_END        0x007f

/* connect/create channel results */
#define L2CAP_CR_SUCCESS        0x0000
#define L2CAP_CR_PEND                0x0001
#define L2CAP_CR_BAD_PSM        0x0002
#define L2CAP_CR_SEC_BLOCK        0x0003
#define L2CAP_CR_NO_MEM                0x0004
#define L2CAP_CR_BAD_AMP        0x0005
#define L2CAP_CR_INVALID_SCID        0x0006
#define L2CAP_CR_SCID_IN_USE        0x0007

/* credit based connect results
 *
 * Result codes for credit based connection responses. The numeric values
 * are unchanged; only the hexadecimal prefix of the last three entries is
 * normalised to the lower-case "0x" used everywhere else in this header.
 */
#define L2CAP_CR_LE_SUCCESS                0x0000
#define L2CAP_CR_LE_BAD_PSM                0x0002
#define L2CAP_CR_LE_NO_MEM                0x0004
#define L2CAP_CR_LE_AUTHENTICATION        0x0005
#define L2CAP_CR_LE_AUTHORIZATION        0x0006
#define L2CAP_CR_LE_BAD_KEY_SIZE        0x0007
#define L2CAP_CR_LE_ENCRYPTION                0x0008
#define L2CAP_CR_LE_INVALID_SCID        0x0009
#define L2CAP_CR_LE_SCID_IN_USE                0x000A
#define L2CAP_CR_LE_UNACCEPT_PARAMS        0x000B
#define L2CAP_CR_LE_INVALID_PARAMS        0x000C

/* connect/create channel status */
#define L2CAP_CS_NO_INFO        0x0000
#define L2CAP_CS_AUTHEN_PEND        0x0001
#define L2CAP_CS_AUTHOR_PEND        0x0002

struct l2cap_conf_req {
        __le16     dcid;
        __le16     flags;
        __u8       data[];
} __packed;

struct l2cap_conf_rsp {
        __le16     scid;
        __le16     flags;
        __le16     result;
        __u8       data[];
} __packed;

#define L2CAP_CONF_SUCCESS        0x0000
#define L2CAP_CONF_UNACCEPT        0x0001
#define L2CAP_CONF_REJECT        0x0002
#define L2CAP_CONF_UNKNOWN        0x0003
#define L2CAP_CONF_PENDING        0x0004
#define L2CAP_CONF_EFS_REJECT        0x0005

/* configuration req/rsp continuation flag */
#define L2CAP_CONF_FLAG_CONTINUATION        0x0001

struct l2cap_conf_opt {
        __u8       type;
        __u8       len;
        __u8       val[];
} __packed;
#define L2CAP_CONF_OPT_SIZE        2

#define L2CAP_CONF_HINT                0x80
#define L2CAP_CONF_MASK                0x7f

#define L2CAP_CONF_MTU                0x01
#define L2CAP_CONF_FLUSH_TO        0x02
#define L2CAP_CONF_QOS                0x03
#define L2CAP_CONF_RFC                0x04
#define L2CAP_CONF_FCS                0x05
#define L2CAP_CONF_EFS                0x06
#define L2CAP_CONF_EWS                0x07

#define L2CAP_CONF_MAX_SIZE        22

struct l2cap_conf_rfc {
        __u8       mode;
        __u8       txwin_size;
        __u8       max_transmit;
        __le16     retrans_timeout;
        __le16     monitor_timeout;
        __le16     max_pdu_size;
} __packed;

#define L2CAP_MODE_BASIC        0x00
#define L2CAP_MODE_RETRANS        0x01
#define L2CAP_MODE_FLOWCTL        0x02
#define L2CAP_MODE_ERTM                0x03
#define L2CAP_MODE_STREAMING        0x04

/* Unlike the above this one doesn't actually map to anything that would
 * ever be sent over the air. Therefore, use a value that's unlikely to
 * ever be used in the BR/EDR configuration phase.
 */
#define L2CAP_MODE_LE_FLOWCTL        0x80
#define L2CAP_MODE_EXT_FLOWCTL        0x81

struct l2cap_conf_efs {
        __u8        id;
        __u8        stype;
        __le16        msdu;
        __le32        sdu_itime;
        __le32        acc_lat;
        __le32        flush_to;
} __packed;

#define L2CAP_SERV_NOTRAFIC        0x00
#define L2CAP_SERV_BESTEFFORT        0x01
#define L2CAP_SERV_GUARANTEED        0x02

#define L2CAP_BESTEFFORT_ID        0x01

struct l2cap_disconn_req {
        __le16     dcid;
        __le16     scid;
} __packed;

struct l2cap_disconn_rsp {
        __le16     dcid;
        __le16     scid;
} __packed;

struct l2cap_info_req {
        __le16      type;
} __packed;

struct l2cap_info_rsp {
        __le16      type;
        __le16      result;
        __u8        data[];
} __packed;

struct l2cap_create_chan_req {
        __le16      psm;
        __le16      scid;
        __u8        amp_id;
} __packed;

struct l2cap_create_chan_rsp {
        __le16      dcid;
        __le16      scid;
        __le16      result;
        __le16      status;
} __packed;

struct l2cap_move_chan_req {
        __le16      icid;
        __u8        dest_amp_id;
} __packed;

struct l2cap_move_chan_rsp {
        __le16      icid;
        __le16      result;
} __packed;

#define L2CAP_MR_SUCCESS        0x0000
#define L2CAP_MR_PEND                0x0001
#define L2CAP_MR_BAD_ID                0x0002
#define L2CAP_MR_SAME_ID        0x0003
#define L2CAP_MR_NOT_SUPP        0x0004
#define L2CAP_MR_COLLISION        0x0005
#define L2CAP_MR_NOT_ALLOWED        0x0006

struct l2cap_move_chan_cfm {
        __le16      icid;
        __le16      result;
} __packed;

#define L2CAP_MC_CONFIRMED        0x0000
#define L2CAP_MC_UNCONFIRMED        0x0001

struct l2cap_move_chan_cfm_rsp {
        __le16      icid;
} __packed;

/* info type */
#define L2CAP_IT_CL_MTU                0x0001
#define L2CAP_IT_FEAT_MASK        0x0002
#define L2CAP_IT_FIXED_CHAN        0x0003

/* info result */
#define L2CAP_IR_SUCCESS        0x0000
#define L2CAP_IR_NOTSUPP        0x0001

struct l2cap_conn_param_update_req {
        __le16      min;
        __le16      max;
        __le16      latency;
        __le16      to_multiplier;
} __packed;

struct l2cap_conn_param_update_rsp {
        __le16      result;
} __packed;

/* Connection Parameters result */
#define L2CAP_CONN_PARAM_ACCEPTED        0x0000
#define L2CAP_CONN_PARAM_REJECTED        0x0001

struct l2cap_le_conn_req {
        __le16     psm;
        __le16     scid;
        __le16     mtu;
        __le16     mps;
        __le16     credits;
} __packed;

struct l2cap_le_conn_rsp {
        __le16     dcid;
        __le16     mtu;
        __le16     mps;
        __le16     credits;
        __le16     result;
} __packed;

struct l2cap_le_credits {
        __le16     cid;
        __le16     credits;
} __packed;

#define L2CAP_ECRED_MIN_MTU                64
#define L2CAP_ECRED_MIN_MPS                64
#define L2CAP_ECRED_MAX_CID                5

struct l2cap_ecred_conn_req {
        __le16 psm;
        __le16 mtu;
        __le16 mps;
        __le16 credits;
        __le16 scid[];
} __packed;

/*
 * Payload of an enhanced credit based connection response: four
 * little-endian 16-bit fields followed by a flexible array of destination
 * channel ids.
 *
 * Marked __packed like every other signalling structure in this header
 * (including the matching l2cap_ecred_conn_req). All members are 16 bits
 * wide, so the layout is unchanged, but the attribute stops the compiler
 * from assuming 2-byte alignment when the struct is overlaid on packet data.
 */
struct l2cap_ecred_conn_rsp {
        __le16 mtu;
        __le16 mps;
        __le16 credits;
        __le16 result;
        __le16 dcid[];
} __packed;

struct l2cap_ecred_reconf_req {
        __le16 mtu;
        __le16 mps;
        __le16 scid[];
} __packed;

#define L2CAP_RECONF_SUCCESS                0x0000
#define L2CAP_RECONF_INVALID_MTU        0x0001
#define L2CAP_RECONF_INVALID_MPS        0x0002
#define L2CAP_RECONF_INVALID_CID        0x0003
#define L2CAP_RECONF_INVALID_PARAMS        0x0004

struct l2cap_ecred_reconf_rsp {
        __le16 result;
} __packed;

/* ----- L2CAP channels and connections ----- */
struct l2cap_seq_list {
        __u16        head;
        __u16        tail;
        __u16        mask;
        __u16        *list;
};

#define L2CAP_SEQ_LIST_CLEAR        0xFFFF
#define L2CAP_SEQ_LIST_TAIL        0x8000

/*
 * State of a single L2CAP channel.
 *
 * Only relationships visible in this header are documented here; the
 * meaning of the remaining fields is defined by their users elsewhere.
 */
struct l2cap_chan {
        struct l2cap_conn        *conn;
        struct hci_conn                *hs_hcon;
        struct hci_chan                *hs_hchan;
        /* NOTE(review): presumably the count behind l2cap_chan_hold()/_put() */
        struct kref        kref;
        /* lockdep subclass handed to mutex_lock_nested() by l2cap_chan_lock() */
        atomic_t        nesting;

        /* printed through state_to_string() by l2cap_set_timer() */
        __u8                state;

        bdaddr_t        dst;
        __u8                dst_type;
        bdaddr_t        src;
        __u8                src_type;
        __le16                psm;
        __le16                sport;
        __u16                dcid;
        __u16                scid;

        __u16                imtu;
        __u16                omtu;
        __u16                flush_to;
        __u8                mode;
        __u8                chan_type;
        __u8                chan_policy;

        __u8                sec_level;

        __u8                ident;

        __u8                conf_req[64];
        __u8                conf_len;
        __u8                num_conf_req;
        __u8                num_conf_rsp;

        __u8                fcs;

        __u16                tx_win;
        /* sequence numbers wrap modulo tx_win_max + 1, see __next_seq() */
        __u16                tx_win_max;
        __u16                ack_win;
        __u8                max_tx;
        __u16                retrans_timeout;
        __u16                monitor_timeout;
        __u16                mps;

        __u16                tx_credits;
        __u16                rx_credits;

        __u8                tx_state;
        __u8                rx_state;

        /*
         * NOTE(review): conf_state and conn_state look like bitmaps indexed
         * by the CONF_* and CONN_* enums below - confirm against the users.
         * The FLAG_* enum is documented below as the definitions for flags.
         */
        unsigned long        conf_state;
        unsigned long        conn_state;
        unsigned long        flags;

        __u8                remote_amp_id;
        __u8                local_amp_id;
        __u8                move_id;
        __u8                move_state;
        __u8                move_role;

        __u16                next_tx_seq;
        __u16                expected_ack_seq;
        __u16                expected_tx_seq;
        __u16                buffer_seq;
        __u16                srej_save_reqseq;
        __u16                last_acked_seq;
        __u16                frames_sent;
        __u16                unacked_frames;
        __u8                retry_count;
        __u16                sdu_len;
        struct sk_buff        *sdu;
        struct sk_buff        *sdu_last_frag;

        __u16                remote_tx_win;
        __u8                remote_max_tx;
        __u16                remote_mps;

        __u8                local_id;
        __u8                local_stype;
        __u16                local_msdu;
        __u32                local_sdu_itime;
        __u32                local_acc_lat;
        __u32                local_flush_to;

        __u8                remote_id;
        __u8                remote_stype;
        __u16                remote_msdu;
        __u32                remote_sdu_itime;
        __u32                remote_acc_lat;
        __u32                remote_flush_to;

        /*
         * Delayed work items armed and cancelled with l2cap_set_timer() and
         * l2cap_clear_timer(), normally through the __set_*_timer() and
         * __clear_*_timer() macros below.
         */
        struct delayed_work        chan_timer;
        struct delayed_work        retrans_timer;
        struct delayed_work        monitor_timer;
        struct delayed_work        ack_timer;

        struct sk_buff                *tx_send_head;
        struct sk_buff_head        tx_q;
        struct sk_buff_head        srej_q;
        struct l2cap_seq_list        srej_list;
        struct l2cap_seq_list        retrans_list;

        struct list_head        list;
        struct list_head        global_l;

        void                        *data;
        /* callback table, see struct l2cap_ops */
        const struct l2cap_ops        *ops;
        /* taken by l2cap_chan_lock(), released by l2cap_chan_unlock() */
        struct mutex                lock;
};

/*
 * Callback table attached to a channel through l2cap_chan.ops.
 *
 * This header also provides l2cap_chan_no_*() inline stubs whose signatures
 * match new_connection, recv, teardown, close, state_change, ready, defer,
 * resume, suspend, set_shutdown, get_sndtimeo and alloc_skb.
 * NOTE(review): those stubs are presumably meant for users that do not
 * implement the corresponding callback - confirm against the callers.
 */
struct l2cap_ops {
        char                        *name;

        struct l2cap_chan        *(*new_connection) (struct l2cap_chan *chan);
        int                        (*recv) (struct l2cap_chan * chan,
                                         struct sk_buff *skb);
        void                        (*teardown) (struct l2cap_chan *chan, int err);
        void                        (*close) (struct l2cap_chan *chan);
        void                        (*state_change) (struct l2cap_chan *chan,
                                                 int state, int err);
        void                        (*ready) (struct l2cap_chan *chan);
        void                        (*defer) (struct l2cap_chan *chan);
        void                        (*resume) (struct l2cap_chan *chan);
        void                        (*suspend) (struct l2cap_chan *chan);
        void                        (*set_shutdown) (struct l2cap_chan *chan);
        long                        (*get_sndtimeo) (struct l2cap_chan *chan);
        struct pid                *(*get_peer_pid) (struct l2cap_chan *chan);
        struct sk_buff                *(*alloc_skb) (struct l2cap_chan *chan,
                                               unsigned long hdr_len,
                                               unsigned long len, int nb);
        int                        (*filter) (struct l2cap_chan * chan,
                                           struct sk_buff *skb);
};

struct l2cap_conn {
        struct hci_conn                *hcon;
        struct hci_chan                *hchan;

        unsigned int                mtu;

        __u32                        feat_mask;
        __u8                        remote_fixed_chan;
        __u8                        local_fixed_chan;

        __u8                        info_state;
        __u8                        info_ident;

        struct delayed_work        info_timer;

        struct sk_buff                *rx_skb;
        __u32                        rx_len;
        __u8                        tx_ident;
        struct mutex                ident_lock;

        struct sk_buff_head        pending_rx;
        struct work_struct        pending_rx_work;

        struct work_struct        id_addr_update_work;

        __u8                        disc_reason;

        struct l2cap_chan        *smp;

        struct list_head        chan_l;
        struct mutex                chan_lock;
        struct kref                ref;
        struct list_head        users;
};

struct l2cap_user {
        struct list_head list;
        int (*probe) (struct l2cap_conn *conn, struct l2cap_user *user);
        void (*remove) (struct l2cap_conn *conn, struct l2cap_user *user);
};

#define L2CAP_INFO_CL_MTU_REQ_SENT        0x01
#define L2CAP_INFO_FEAT_MASK_REQ_SENT        0x04
#define L2CAP_INFO_FEAT_MASK_REQ_DONE        0x08

#define L2CAP_CHAN_RAW                        1
#define L2CAP_CHAN_CONN_LESS                2
#define L2CAP_CHAN_CONN_ORIENTED        3
#define L2CAP_CHAN_FIXED                4

/* ----- L2CAP socket info ----- */
/*
 * Reinterpret a socket pointer as its enclosing struct l2cap_pinfo.
 * The argument is parenthesised so that the cast applies to the whole
 * expression passed in, not just to its first operand.
 */
#define l2cap_pi(sk) ((struct l2cap_pinfo *)(sk))

/*
 * L2CAP view of a socket. l2cap_pi() obtains it by casting the socket
 * pointer directly, so `bt` has to remain the first member.
 * NOTE(review): this relies on struct bt_sock itself starting with the
 * generic socket structure - confirm in the bt_sock definition.
 */
struct l2cap_pinfo {
        struct bt_sock                bt;
        struct l2cap_chan        *chan;
        struct sk_buff                *rx_busy_skb;
};

enum {
        CONF_REQ_SENT,
        CONF_INPUT_DONE,
        CONF_OUTPUT_DONE,
        CONF_MTU_DONE,
        CONF_MODE_DONE,
        CONF_CONNECT_PEND,
        CONF_RECV_NO_FCS,
        CONF_STATE2_DEVICE,
        CONF_EWS_RECV,
        CONF_LOC_CONF_PEND,
        CONF_REM_CONF_PEND,
        CONF_NOT_COMPLETE,
};

#define L2CAP_CONF_MAX_CONF_REQ 2
#define L2CAP_CONF_MAX_CONF_RSP 2

enum {
        CONN_SREJ_SENT,
        CONN_WAIT_F,
        CONN_SREJ_ACT,
        CONN_SEND_PBIT,
        CONN_REMOTE_BUSY,
        CONN_LOCAL_BUSY,
        CONN_REJ_ACT,
        CONN_SEND_FBIT,
        CONN_RNR_SENT,
};

/* Definitions for flags in l2cap_chan */
enum {
        FLAG_ROLE_SWITCH,
        FLAG_FORCE_ACTIVE,
        FLAG_FORCE_RELIABLE,
        FLAG_FLUSHABLE,
        FLAG_EXT_CTRL,
        FLAG_EFS_ENABLE,
        FLAG_DEFER_SETUP,
        FLAG_LE_CONN_REQ_SENT,
        FLAG_ECRED_CONN_REQ_SENT,
        FLAG_PENDING_SECURITY,
        FLAG_HOLD_HCI_CONN,
};

/* Lock nesting levels for L2CAP channels. We need these because lockdep
 * otherwise considers all channels equal and will e.g. complain about a
 * connection oriented channel triggering SMP procedures or a listening
 * channel creating and locking a child channel.
 */
enum {
        L2CAP_NESTING_SMP,
        L2CAP_NESTING_NORMAL,
        L2CAP_NESTING_PARENT,
};

enum {
        L2CAP_TX_STATE_XMIT,
        L2CAP_TX_STATE_WAIT_F,
};

enum {
        L2CAP_RX_STATE_RECV,
        L2CAP_RX_STATE_SREJ_SENT,
        L2CAP_RX_STATE_MOVE,
        L2CAP_RX_STATE_WAIT_P,
        L2CAP_RX_STATE_WAIT_F,
};

enum {
        L2CAP_TXSEQ_EXPECTED,
        L2CAP_TXSEQ_EXPECTED_SREJ,
        L2CAP_TXSEQ_UNEXPECTED,
        L2CAP_TXSEQ_UNEXPECTED_SREJ,
        L2CAP_TXSEQ_DUPLICATE,
        L2CAP_TXSEQ_DUPLICATE_SREJ,
        L2CAP_TXSEQ_INVALID,
        L2CAP_TXSEQ_INVALID_IGNORE,
};

enum {
        L2CAP_EV_DATA_REQUEST,
        L2CAP_EV_LOCAL_BUSY_DETECTED,
        L2CAP_EV_LOCAL_BUSY_CLEAR,
        L2CAP_EV_RECV_REQSEQ_AND_FBIT,
        L2CAP_EV_RECV_FBIT,
        L2CAP_EV_RETRANS_TO,
        L2CAP_EV_MONITOR_TO,
        L2CAP_EV_EXPLICIT_POLL,
        L2CAP_EV_RECV_IFRAME,
        L2CAP_EV_RECV_RR,
        L2CAP_EV_RECV_REJ,
        L2CAP_EV_RECV_RNR,
        L2CAP_EV_RECV_SREJ,
        L2CAP_EV_RECV_FRAME,
};

enum {
        L2CAP_MOVE_ROLE_NONE,
        L2CAP_MOVE_ROLE_INITIATOR,
        L2CAP_MOVE_ROLE_RESPONDER,
};

enum {
        L2CAP_MOVE_STABLE,
        L2CAP_MOVE_WAIT_REQ,
        L2CAP_MOVE_WAIT_RSP,
        L2CAP_MOVE_WAIT_RSP_SUCCESS,
        L2CAP_MOVE_WAIT_CONFIRM,
        L2CAP_MOVE_WAIT_CONFIRM_RSP,
        L2CAP_MOVE_WAIT_LOGICAL_COMP,
        L2CAP_MOVE_WAIT_LOGICAL_CFM,
        L2CAP_MOVE_WAIT_LOCAL_BUSY,
        L2CAP_MOVE_WAIT_PREPARE,
};

/*
 * Channel reference handling. Within this header, l2cap_set_timer() calls
 * l2cap_chan_hold() when it arms a timer that was not already pending and
 * l2cap_clear_timer() calls l2cap_chan_put() when it cancels a pending one.
 * NOTE(review): l2cap_chan_hold_unless_zero() presumably returns NULL when
 * the count has already dropped to zero - confirm in the implementation.
 */
void l2cap_chan_hold(struct l2cap_chan *c);
struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c);
void l2cap_chan_put(struct l2cap_chan *c);

/*
 * Acquire the channel mutex. The lockdep subclass is read from
 * chan->nesting (one of the L2CAP_NESTING_* levels) so that legitimately
 * nested channel locks are not reported as recursive locking.
 */
static inline void l2cap_chan_lock(struct l2cap_chan *chan)
{
        mutex_lock_nested(&chan->lock, atomic_read(&chan->nesting));
}

/* Release the channel mutex taken by l2cap_chan_lock(). */
static inline void l2cap_chan_unlock(struct l2cap_chan *chan)
{
        struct mutex *chan_mutex = &chan->lock;

        mutex_unlock(chan_mutex);
}

/*
 * (Re)arm a channel timer.
 *
 * @chan:    channel the timer belongs to
 * @work:    delayed work item to schedule
 * @timeout: delay handed to schedule_delayed_work()
 *
 * A timer that is still pending already owns a channel reference from the
 * call that armed it, so a new reference is taken only when
 * cancel_delayed_work() reports that nothing was cancelled.
 */
static inline void l2cap_set_timer(struct l2cap_chan *chan,
                                   struct delayed_work *work, long timeout)
{
        bool was_pending;

        BT_DBG("chan %p state %s timeout %ld", chan,
               state_to_string(chan->state), timeout);

        was_pending = cancel_delayed_work(work);
        if (!was_pending)
                l2cap_chan_hold(chan);

        schedule_delayed_work(work, timeout);
}

/*
 * Cancel a channel timer.
 *
 * @chan: channel the timer belongs to
 * @work: delayed work item to cancel
 *
 * Returns the result of cancel_delayed_work(). When the work was cancelled
 * the reference held on its behalf is dropped here; otherwise dropping it
 * is left to the delayed work function.
 */
static inline bool l2cap_clear_timer(struct l2cap_chan *chan,
                                     struct delayed_work *work)
{
        if (!cancel_delayed_work(work))
                return false;

        l2cap_chan_put(chan);
        return true;
}

/*
 * Shorthands for arming/cancelling the per-channel timers.
 *
 * Every use of the macro argument is parenthesised so that any pointer
 * expression may be passed. __set_ack_timer() now takes the work item from
 * its own argument instead of a caller variable that had to be named
 * "chan", and no longer ends in a semicolon, so it behaves like an ordinary
 * expression (callers supply the terminating ';' as for the other macros).
 */
#define __set_chan_timer(c, t) l2cap_set_timer((c), &(c)->chan_timer, (t))
#define __clear_chan_timer(c) l2cap_clear_timer((c), &(c)->chan_timer)
#define __clear_retrans_timer(c) l2cap_clear_timer((c), &(c)->retrans_timer)
#define __clear_monitor_timer(c) l2cap_clear_timer((c), &(c)->monitor_timer)
#define __set_ack_timer(c) l2cap_set_timer((c), &(c)->ack_timer, \
                msecs_to_jiffies(L2CAP_DEFAULT_ACK_TO))
#define __clear_ack_timer(c) l2cap_clear_timer((c), &(c)->ack_timer)

/*
 * Distance from seq2 forward to seq1 in a sequence space that wraps after
 * chan->tx_win_max: the plain difference when seq1 is not behind seq2,
 * otherwise the difference with one full cycle (tx_win_max + 1) added.
 */
static inline int __seq_offset(struct l2cap_chan *chan, __u16 seq1, __u16 seq2)
{
        if (seq1 < seq2)
                return chan->tx_win_max + 1 - seq2 + seq1;

        return seq1 - seq2;
}

/* Successor of @seq, wrapping to 0 after chan->tx_win_max. */
static inline __u16 __next_seq(struct l2cap_chan *chan, __u16 seq)
{
        const int modulus = chan->tx_win_max + 1;

        return (seq + 1) % modulus;
}

/*
 * Stub with the signature of l2cap_ops.new_connection: ignores @chan and
 * always returns NULL.
 */
static inline struct l2cap_chan *
l2cap_chan_no_new_connection(struct l2cap_chan *chan)
{
        return NULL;
}

/*
 * Stub with the signature of l2cap_ops.recv: ignores both arguments and
 * always fails with -ENOSYS.
 */
static inline int l2cap_chan_no_recv(struct l2cap_chan *chan,
                                     struct sk_buff *skb)
{
        return -ENOSYS;
}

/*
 * Stub with the signature of l2cap_ops.alloc_skb: ignores its arguments
 * and always returns ERR_PTR(-ENOSYS), never a buffer.
 */
static inline struct sk_buff *
l2cap_chan_no_alloc_skb(struct l2cap_chan *chan, unsigned long hdr_len,
                        unsigned long len, int nb)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub with the signature of l2cap_ops.teardown: does nothing. */
static inline void l2cap_chan_no_teardown(struct l2cap_chan *chan,
                                          int err)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.close: does nothing. */
static inline void
l2cap_chan_no_close(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.ready: does nothing. */
static inline void
l2cap_chan_no_ready(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.state_change: does nothing. */
static inline void
l2cap_chan_no_state_change(struct l2cap_chan *chan, int state, int err)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.defer: does nothing. */
static inline void
l2cap_chan_no_defer(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.suspend: does nothing. */
static inline void
l2cap_chan_no_suspend(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.resume: does nothing. */
static inline void
l2cap_chan_no_resume(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/* Stub with the signature of l2cap_ops.set_shutdown: does nothing. */
static inline void
l2cap_chan_no_set_shutdown(struct l2cap_chan *chan)
{
        /* intentionally empty */
}

/*
 * Stub with the signature of l2cap_ops.get_sndtimeo: ignores @chan and
 * always reports a timeout of 0.
 */
static inline long
l2cap_chan_no_get_sndtimeo(struct l2cap_chan *chan)
{
        return 0;
}

extern bool disable_ertm;
extern bool enable_ecred;

int l2cap_init_sockets(void);
void l2cap_cleanup_sockets(void);
bool l2cap_is_socket(struct socket *sock);

void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan);
void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan);
void __l2cap_connect_rsp_defer(struct l2cap_chan *chan);

int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm);
int l2cap_add_scid(struct l2cap_chan *chan,  __u16 scid);

struct l2cap_chan *l2cap_chan_create(void);
void l2cap_chan_close(struct l2cap_chan *chan, int reason);
int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
                       bdaddr_t *dst, u8 dst_type);
int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu);
int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len);
void l2cap_chan_busy(struct l2cap_chan *chan, int busy);
int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator);
void l2cap_chan_set_defaults(struct l2cap_chan *chan);
int l2cap_ertm_init(struct l2cap_chan *chan);
void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan);
void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan);
typedef void (*l2cap_chan_func_t)(struct l2cap_chan *chan, void *data);
void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
                     void *data);
void l2cap_chan_del(struct l2cap_chan *chan, int err);
void l2cap_send_conn_req(struct l2cap_chan *chan);
void l2cap_move_start(struct l2cap_chan *chan);
void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan,
                       u8 status);
void __l2cap_physical_cfm(struct l2cap_chan *chan, int result);

struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn);
void l2cap_conn_put(struct l2cap_conn *conn);

int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user);
void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user);

#endif /* __L2CAP_H */


































































































    2 




    2 
















    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c.
 *
 * Copyright (C) 2004 Paul Mackerras, IBM Corp.
 */

#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/uaccess.h>
#include <linux/extable.h>

/*
 * ex_to_insn() yields the instruction address recorded in an exception
 * table entry. With ARCH_HAS_RELATIVE_EXTABLE the insn field holds an
 * offset from the field's own address, so that address is added back;
 * otherwise the field is used as it is.
 */
#ifdef ARCH_HAS_RELATIVE_EXTABLE
static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
{
        const unsigned long field_addr = (unsigned long)&x->insn;

        return field_addr + x->insn;
}
#else
#define ex_to_insn(x)        ((x)->insn)
#endif

#ifndef ARCH_HAS_SORT_EXTABLE
#ifdef ARCH_HAS_RELATIVE_EXTABLE
/*
 * Swap callback for sort() when entries hold self-relative offsets.
 * Moving an entry changes the address its offsets are relative to, so each
 * stored offset is corrected by the byte distance between the two slots.
 * An architecture may supply swap_ex_entry_fixup() to take over the
 * handling of the fixup field. @size is unused.
 */
static void swap_ex(void *a, void *b, int size)
{
        struct exception_table_entry *first = a;
        struct exception_table_entry *second = b;
        struct exception_table_entry saved = *first;
        int delta = (char *)b - (char *)a;

        first->insn = second->insn + delta;
        second->insn = saved.insn - delta;

#ifdef swap_ex_entry_fixup
        swap_ex_entry_fixup(first, second, saved, delta);
#else
        first->fixup = second->fixup + delta;
        second->fixup = saved.fixup - delta;
#endif
}
#else
/* Absolute entries need no adjustment: sort() is given a NULL swap callback. */
#define swap_ex                NULL
#endif /* ARCH_HAS_RELATIVE_EXTABLE */

/*
 * The exception table needs to be sorted so that the binary
 * search that we use to find entries in it works properly.
 * This is used both for the kernel exception table and for
 * the exception tables of modules that get loaded.
 */
/*
 * sort() comparison callback: orders two exception table entries by the
 * instruction address they describe.  Returns -1, 0 or 1.
 */
static int cmp_ex_sort(const void *a, const void *b)
{
        const struct exception_table_entry *lhs = a;
        const struct exception_table_entry *rhs = b;

        /* Explicit comparisons instead of a subtraction: avoids overflow. */
        if (ex_to_insn(lhs) < ex_to_insn(rhs))
                return -1;

        return ex_to_insn(lhs) > ex_to_insn(rhs) ? 1 : 0;
}

/*
 * Sort the exception table [@start, @finish) by instruction address,
 * using cmp_ex_sort() for ordering and swap_ex for element exchange.
 */
void sort_extable(struct exception_table_entry *start,
                  struct exception_table_entry *finish)
{
        sort(start, finish - start, sizeof(*start), cmp_ex_sort, swap_ex);
}

#ifdef CONFIG_MODULES
/*
 * Drop the exception table entries that refer to a module's init
 * section.  With a sorted table those entries can only sit at the
 * very beginning or at the very end, so both ends are trimmed until
 * an entry outside the init section is reached.
 */
void trim_init_extable(struct module *m)
{
        /* Trim the beginning: advance the table start past init entries. */
        for (; m->num_exentries; m->extable++, m->num_exentries--) {
                if (!within_module_init(ex_to_insn(&m->extable[0]), m))
                        break;
        }

        /* Trim the end: shrink the count while the last entry is in init. */
        while (m->num_exentries) {
                if (!within_module_init(
                                ex_to_insn(&m->extable[m->num_exentries - 1]),
                                m))
                        break;
                m->num_exentries--;
        }
}
#endif /* CONFIG_MODULES */
#endif /* !ARCH_HAS_SORT_EXTABLE */

#ifndef ARCH_HAS_SEARCH_EXTABLE

/*
 * bsearch() comparison callback used by search_extable().
 *
 * @key points to the unsigned long instruction address being looked up,
 * @elt to the table entry it is compared against.  Returns 1 if the key
 * lies above the entry's instruction address, -1 if below, 0 on a match.
 */
static int cmp_ex_search(const void *key, const void *elt)
{
        const struct exception_table_entry *_elt = elt;
        /* The key is only read, so keep its const qualifier in the cast. */
        unsigned long _key = *(const unsigned long *)key;

        /* Explicit comparisons instead of a subtraction: avoids overflow. */
        if (_key > ex_to_insn(_elt))
                return 1;
        if (_key < ex_to_insn(_elt))
                return -1;
        return 0;
}

/*
 * Search one exception table for an entry corresponding to the
 * given instruction address, and return the address of the entry,
 * or NULL if none is found.
 * We use a binary search, and thus we assume that the table is
 * already sorted.
 */
const struct exception_table_entry *
search_extable(const struct exception_table_entry *base,
               const size_t num,
               unsigned long value)
{
        return bsearch(&value, base, num,
                       sizeof(struct exception_table_entry), cmp_ex_search);
}
#endif















































































































































































































































































































































































    1 





































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 * A cpumask is a bitmap with room for NR_CPUS bits.
 * Don't assign or return these by value: may not be this big!
 */
typedef struct cpumask {
        DECLARE_BITMAP(bits, NR_CPUS);
} cpumask_t;

/**
 * cpumask_bits - get the bits in a cpumask
 * @maskp: the struct cpumask *
 *
 * Expands to the 'bits' bitmap member of *@maskp.
 *
 * You should only assume nr_cpu_ids bits of this mask are valid.  This is
 * a macro (not a function) so it's const-correct: the result is const
 * exactly when @maskp points to a const cpumask.
 */
#define cpumask_bits(maskp) ((maskp)->bits)

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 * Expands to two comma-separated arguments: the field width (nr_cpu_ids)
 * followed by the bitmap of @maskp.
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

/*
 * nr_cpu_ids: a compile-time constant on single-CPU builds, otherwise a
 * variable defined elsewhere.
 */
#if NR_CPUS == 1
#define nr_cpu_ids                1U
#else
extern unsigned int nr_cpu_ids;
#endif

/*
 * nr_cpumask_bits: the number of bits the cpumask operations in this
 * header work on.
 */
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
 * not all bits may be allocated. */
#define nr_cpumask_bits        nr_cpu_ids
#else
/* Masks are full-size here, so the compile-time constant is used. */
#define nr_cpumask_bits        ((unsigned int)NR_CPUS)
#endif

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU ids
 *  that could possibly ever be plugged in at any time during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_possible_mask is forced to have
 *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
 *  ACPI reports present at boot.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

/*
 * Backing storage for the system masks (defined elsewhere).  The
 * unprefixed names below are const-qualified views of the same objects,
 * so code using them cannot modify the masks through those names.
 */
extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)

/* Counter read by num_online_cpus() on NR_CPUS > 1 builds; defined elsewhere. */
extern atomic_t __num_online_cpus;

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * The value is read from the atomic_t __num_online_cpus, but it is still
 * only a momentary snapshot: it is not protected against concurrent CPU
 * hotplug operations unless invoked from a cpuhp_lock held region.
 */
static inline unsigned int num_online_cpus(void)
{
        int snapshot = atomic_read(&__num_online_cpus);

        return snapshot;
}
/* NR_CPUS > 1: counts are the weight of, and tests are bit lookups in, the system masks. */
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)
#define cpu_online(cpu)                cpumask_test_cpu((cpu), cpu_online_mask)
#define cpu_possible(cpu)        cpumask_test_cpu((cpu), cpu_possible_mask)
#define cpu_present(cpu)        cpumask_test_cpu((cpu), cpu_present_mask)
#define cpu_active(cpu)                cpumask_test_cpu((cpu), cpu_active_mask)
#else
/* NR_CPUS == 1: hardcoded answers; the masks are never consulted and only cpu 0 qualifies. */
#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U
#define cpu_online(cpu)                ((cpu) == 0)
#define cpu_possible(cpu)        ((cpu) == 0)
#define cpu_present(cpu)        ((cpu) == 0)
#define cpu_active(cpu)                ((cpu) == 0)
#endif

/* Defined elsewhere.  NOTE(review): presumably the CPUs that have been booted at least once - confirm. */
extern cpumask_t cpus_booted_once_mask;

/*
 * Debug aid: with CONFIG_DEBUG_PER_CPU_MAPS, warn once when @cpu does not
 * fit into a mask of @bits bits.  Without that option this does nothing.
 */
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        WARN_ON_ONCE(bits <= cpu);
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
}

/*
 * Sanity-check the cpu argument handed to a cpumask_* operator against
 * nr_cpumask_bits, then hand the same value back so the call can be
 * used inline as an argument.
 */
static inline unsigned int cpumask_check(unsigned int cpu)
{
        cpu_max_bits_warn(cpu, nr_cpumask_bits);

        return cpu;
}

#if NR_CPUS == 1
/* Uniprocessor.  Assume all masks are "1". */
/* UP: cpu 0 is always reported as the first cpu; @srcp is not examined. */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        (void)srcp;

        return 0U;
}

/* UP: cpu 0 is always reported as the last cpu; @srcp is not examined. */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        (void)srcp;

        return 0U;
}

/*
 * UP: the "next" cpu is simply n + 1; @srcp is not examined.
 * Valid inputs for n are -1 and 0.
 */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const int next = n + 1;

        (void)srcp;
        return next;
}

/* UP: the next unset cpu is reported as n + 1; @srcp is not examined. */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const int next = n + 1;

        (void)srcp;
        return next;
}

/* UP: the next cpu in both masks is reported as n + 1; neither mask is examined. */
static inline unsigned int cpumask_next_and(int n,
                                            const struct cpumask *srcp,
                                            const struct cpumask *andp)
{
        const int next = n + 1;

        (void)srcp;
        (void)andp;
        return next;
}

/*
 * UP: returns cpu 0, except in the stop condition (@wrap set and @n
 * already at cpu 0), where 1 - i.e. nr_cpumask_bits - is returned.
 * @mask and @start are not examined.
 */
static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
                                             int start, bool wrap)
{
        (void)mask;
        (void)start;

        if (wrap && n == 0)
                return 1;

        return 0;
}

/*
 * UP: @cpu must be a valid cpu, i.e. 0, so there is no other cpu to
 * pick; 1 is always returned and neither argument is examined.
 */
static inline unsigned int cpumask_any_but(const struct cpumask *mask,
                                           unsigned int cpu)
{
        (void)mask;
        (void)cpu;

        return 1U;
}

/* UP: every request resolves to cpu 0, whatever @i and @node are. */
static inline unsigned int cpumask_local_spread(unsigned int i, int node)
{
        (void)i;
        (void)node;

        return 0U;
}

/* UP: same as asking cpumask_next_and() for the first cpu (search from -1). */
static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
                                             const struct cpumask *src2p)
{
        const int before_first = -1;

        return cpumask_next_and(before_first, src1p, src2p);
}

/*
 * Uniprocessor iterators: the loop body runs exactly once with @cpu == 0
 * and @cpu is 1 after the loop.  The mask (and start) arguments are not
 * consulted; they are only evaluated and discarded in the increment
 * clause.
 *
 * Every discarded argument is parenthesized before the (void) cast so
 * that a compound expression such as "cond ? a : b" is discarded as a
 * whole rather than having the cast bind to its first operand only.
 */
#define for_each_cpu(cpu, mask)                        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))
#define for_each_cpu_not(cpu, mask)                \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))
#define for_each_cpu_wrap(cpu, mask, start)        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask), (void)(start))
#define for_each_cpu_and(cpu, mask1, mask2)        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask1), (void)(mask2))
#else
/**
 * cpumask_first - get the first cpu in a cpumask
 * @srcp: the cpumask pointer
 *
 * Searches the first nr_cpumask_bits bits of @srcp for a set bit.
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_first_bit(bits, nr_cpumask_bits);
}

/**
 * cpumask_last - get the last CPU in a cpumask
 * @srcp: the cpumask pointer
 *
 * Searches the first nr_cpumask_bits bits of @srcp for the highest set bit.
 *
 * Returns >= nr_cpumask_bits if no CPUs set.
 */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_last_bit(bits, nr_cpumask_bits);
}

/*
 * Out-of-line on SMP builds.  for_each_cpu() below starts it at n == -1
 * and stops once the returned value is >= nr_cpu_ids.
 */
unsigned int cpumask_next(int n, const struct cpumask *srcp);

/**
 * cpumask_next_zero - get the next unset cpu in a cpumask
 * @n: the cpu prior to the place to search (ie. return will be > @n)
 * @srcp: the cpumask pointer
 *
 * The search starts at bit @n + 1.  Passing -1 for @n starts at the
 * first bit and is the only value exempt from the cpumask_check().
 *
 * Returns >= nr_cpu_ids if no further cpus unset.
 */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 is a legal arg here; everything else gets validated. */
        if (n != -1)
                cpumask_check(n);

        return find_next_zero_bit(bits, nr_cpumask_bits, n + 1);
}

/*
 * Out-of-line SMP implementations; the NR_CPUS == 1 build provides
 * inline stand-ins for each of these instead.
 */
int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * The iterator starts at -1 and is advanced with cpumask_next() inside
 * the loop condition (comma operator), so the third for-clause is empty.
 * @mask is evaluated again on every iteration.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for ((cpu) = -1;                                \
                (cpu) = cpumask_next((cpu), (mask)),        \
                (cpu) < nr_cpu_ids;)

/**
 * for_each_cpu_not - iterate over every cpu in a complemented mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Visits the cpus whose bit is clear in @mask, advancing with
 * cpumask_next_zero() inside the loop condition.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_not(cpu, mask)                                \
        for ((cpu) = -1;                                        \
                (cpu) = cpumask_next_zero((cpu), (mask)),        \
                (cpu) < nr_cpu_ids;)

/* Out-of-line helper behind for_each_cpu_wrap(). */
extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * The first lookup is made with wrap == false from @start - 1; all later
 * ones pass wrap == true.  Note that the loop condition compares against
 * nr_cpumask_bits, whereas the other iterators compare against nr_cpu_ids.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                        \
        for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false);        \
             (cpu) < nr_cpumask_bits;                                                \
             (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * The iterator starts at -1 and is advanced with cpumask_next_and()
 * inside the loop condition.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for ((cpu) = -1;                                                \
                (cpu) = cpumask_next_and((cpu), (mask1), (mask2)),        \
                (cpu) < nr_cpu_ids;)
#endif /* SMP */

/* Bitmap initializer with every word of an NR_CPUS-sized bitmap set to zero. */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/* Bitmap initializer with only bit 0 (cpu 0) set in the first word. */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * set_bit() wrapper for cpumasks; @cpu goes through cpumask_check().
 */
static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        set_bit(cpumask_check(cpu), bits);
}

/* Same as cpumask_set_cpu(), but built on __set_bit() instead of set_bit(). */
static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __set_bit(cpumask_check(cpu), bits);
}


/**
 * cpumask_clear_cpu - clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * clear_bit() wrapper for cpumasks; @cpu goes through cpumask_check().
 */
static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        clear_bit(cpumask_check(cpu), bits);
}

/* Same as cpumask_clear_cpu(), but built on __clear_bit() instead of clear_bit(). */
static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_cpu - test for a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_bit() wrapper for cpumasks.
 *
 * Returns 1 if @cpu is set in @cpumask, else returns 0
 */
static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        const unsigned long *bits = cpumask_bits(cpumask);

        return test_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_set_bit wrapper for cpumasks.
 *
 * Returns 1 if @cpu was set in the old bitmap of @cpumask, else returns 0
 */
static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_set_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_clear_bit wrapper for cpumasks.
 *
 * Returns 1 if @cpu was set in the old bitmap of @cpumask, else returns 0
 */
static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Fills the first nr_cpumask_bits bits of @dstp via bitmap_fill().
 */
static inline void cpumask_setall(struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        bitmap_fill(bits, nr_cpumask_bits);
}

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Zeroes the first nr_cpumask_bits bits of @dstp via bitmap_zero().
 */
static inline void cpumask_clear(struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        bitmap_zero(bits, nr_cpumask_bits);
}

/**
 * cpumask_and - *dstp = *src1p & *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_and() wrapper over nr_cpumask_bits bits.
 *
 * If *@dstp is empty, returns 0, else returns 1
 */
static inline int cpumask_and(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_and(dst, lhs, rhs, nr_cpumask_bits);
}

/**
 * cpumask_or - *dstp = *src1p | *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_or() wrapper over nr_cpumask_bits bits.
 */
static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_or(dst, lhs, rhs, nr_cpumask_bits);
}

/**
 * cpumask_xor - *dstp = *src1p ^ *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_xor() wrapper over nr_cpumask_bits bits.
 */
static inline void cpumask_xor(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_xor(dst, lhs, rhs, nr_cpumask_bits);
}

/**
 * cpumask_andnot - *dstp = *src1p & ~*src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_andnot() wrapper over nr_cpumask_bits bits.
 *
 * If *@dstp is empty, returns 0, else returns 1
 */
static inline int cpumask_andnot(struct cpumask *dstp,
                                  const struct cpumask *src1p,
                                  const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *keep = cpumask_bits(src1p);
        const unsigned long *drop = cpumask_bits(src2p);

        return bitmap_andnot(dst, keep, drop, nr_cpumask_bits);
}

/**
 * cpumask_complement - *dstp = ~*srcp
 * @dstp: the cpumask result
 * @srcp: the input to invert
 *
 * bitmap_complement() wrapper over nr_cpumask_bits bits.
 */
static inline void cpumask_complement(struct cpumask *dstp,
                                      const struct cpumask *srcp)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_complement(dst, src, nr_cpumask_bits);
}

/**
 * cpumask_equal - *src1p == *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_equal() wrapper; compares nr_cpumask_bits bits.
 */
static inline bool cpumask_equal(const struct cpumask *src1p,
                                const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_equal(lhs, rhs, nr_cpumask_bits);
}

/**
 * cpumask_or_equal - *src1p | *src2p == *src3p
 * @src1p: the first input
 * @src2p: the second input
 * @src3p: the third input
 *
 * bitmap_or_equal() wrapper; compares nr_cpumask_bits bits.
 */
static inline bool cpumask_or_equal(const struct cpumask *src1p,
                                    const struct cpumask *src2p,
                                    const struct cpumask *src3p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);
        const unsigned long *expected = cpumask_bits(src3p);

        return bitmap_or_equal(lhs, rhs, expected, nr_cpumask_bits);
}

/**
 * cpumask_intersects - (*src1p & *src2p) != 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_intersects() wrapper; examines nr_cpumask_bits bits.
 */
static inline bool cpumask_intersects(const struct cpumask *src1p,
                                     const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_intersects(lhs, rhs, nr_cpumask_bits);
}

/**
 * cpumask_subset - (*src1p & ~*src2p) == 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * bitmap_subset() wrapper; examines nr_cpumask_bits bits.
 *
 * Returns 1 if *@src1p is a subset of *@src2p, else returns 0
 */
static inline int cpumask_subset(const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        const unsigned long *candidate = cpumask_bits(src1p);
        const unsigned long *superset = cpumask_bits(src2p);

        return bitmap_subset(candidate, superset, nr_cpumask_bits);
}

/**
 * cpumask_empty - *srcp == 0
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are clear.
 *
 * bitmap_empty() wrapper; examines nr_cpumask_bits bits.
 */
static inline bool cpumask_empty(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_empty(bits, nr_cpumask_bits);
}

/**
 * cpumask_full - *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are set.
 *
 * bitmap_full() wrapper; examines nr_cpumask_bits bits.
 */
static inline bool cpumask_full(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_full(bits, nr_cpumask_bits);
}

/**
 * cpumask_weight - Count of bits in *srcp
 * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * bitmap_weight() wrapper; counts within nr_cpumask_bits bits.
 */
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_weight(bits, nr_cpumask_bits);
}

/**
 * cpumask_shift_right - *dstp = *srcp >> n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * bitmap_shift_right() wrapper over nr_cpumask_bits bits.
 */
static inline void cpumask_shift_right(struct cpumask *dstp,
                                       const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_right(dst, src, n, nr_cpumask_bits);
}

/**
 * cpumask_shift_left - *dstp = *srcp << n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * bitmap_shift_left() wrapper over nr_cpumask_bits bits.
 */
static inline void cpumask_shift_left(struct cpumask *dstp,
                                      const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_left(dst, src, n, nr_cpumask_bits);
}

/**
 * cpumask_copy - *dstp = *srcp
 * @dstp: the result
 * @srcp: the input cpumask
 *
 * Copies nr_cpumask_bits bits via bitmap_copy().
 */
static inline void cpumask_copy(struct cpumask *dstp,
                                const struct cpumask *srcp)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_copy(dst, src, nr_cpumask_bits);
}

/**
 * cpumask_any - pick a "random" cpu from *srcp
 * @srcp: the input cpumask
 *
 * Currently just an alias for cpumask_first(), so the choice is not
 * actually random.
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_first_and - return the first cpu from *src1p & *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * Implemented as cpumask_next_and() starting the search from -1.
 *
 * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
 */
#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))

/**
 * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Currently just an alias for cpumask_first_and().
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 *
 * Forwards to get_cpu_mask(), which is declared outside this part of the
 * file.  NOTE(review): the "<=" bound above looks like it should be
 * "< nr_cpu_ids", matching the other helpers - confirm.
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - extract a cpumask from a user string
 * @buf: the buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parse_user() wrapper; fills nr_cpumask_bits bits of @dstp.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parse_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parselist_user - extract a cpumask from a user string
 * @buf: the buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist_user() wrapper; fills nr_cpumask_bits bits of @dstp.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parselist_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parse - extract a cpumask from a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parse() wrapper.  No buffer length is known here, so UINT_MAX
 * is passed as the length argument.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse(buf, UINT_MAX, bits, nr_cpumask_bits);
}

/**
 * cpulist_parse - extract a cpumask from a user string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist() wrapper; fills nr_cpumask_bits bits of @dstp.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist(buf, bits, nr_cpumask_bits);
}

/**
 * cpumask_size - size to allocate for a 'struct cpumask' in bytes
 *
 * Computed as bitmap_size() of nr_cpumask_bits.
 */
static inline unsigned int cpumask_size(void)
{
        const unsigned int bytes = bitmap_size(nr_cpumask_bits);

        return bytes;
}

/*
 * cpumask_var_t: struct cpumask for stack usage.
 *
 * Oh, the wicked games we play!  In order to make kernel coding a
 * little more difficult, we typedef cpumask_var_t to an array or a
 * pointer: doing &mask on an array is a noop, so it still works.
 *
 * ie.
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *          ... use 'tmpmask' like a normal struct cpumask * ...
 *
 *        free_cpumask_var(tmpmask);
 *
 *
 * However, there is one notable exception. alloc_cpumask_var() allocates
 * only nr_cpumask_bits bits (on the other hand, a real cpumask_t always has
 * NR_CPUS bits). Therefore you must not dereference a cpumask_var_t:
 *
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *        var = *tmpmask;
 *
 * This code performs an NR_CPUS-length memory copy and leads to memory
 * corruption. cpumask_copy() provides safe copy functionality.
 *
 * Note that there is another evil here: If you define a cpumask_var_t
 * as a percpu variable then the way to obtain the address of the cpumask
 * structure differently influences what this_cpu_* operation needs to be
 * used. Please use this_cpu_cpumask_var_t in those cases. The direct use
 * of this_cpu_ptr() or this_cpu_read() will lead to failures when the
 * other type of cpumask_var_t implementation is configured.
 *
 * Please also note that __cpumask_var_read_mostly can be used to declare
 * a cpumask_var_t variable itself (not its content) as read mostly.
 */
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Off-stack variant: a cpumask_var_t is a pointer to a separately allocated mask. */
typedef struct cpumask *cpumask_var_t;

/* A per-cpu cpumask_var_t holds a pointer here, so its value is read. */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
#define __cpumask_var_read_mostly        __read_mostly

/*
 * Allocation and release helpers, implemented out of line.  The bool
 * returning ones are checked with "if (!alloc_cpumask_var(...))" in the
 * usage example above, i.e. false means failure.
 */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/* Off-stack variant: a mask is available once its pointer is non-NULL. */
static inline bool cpumask_available(cpumask_var_t mask)
{
        if (mask == NULL)
                return false;

        return true;
}

#else
/*
 * On-stack variant: a cpumask_var_t is a one-element array, so it decays
 * to a struct cpumask pointer when used and needs no allocation.
 */
typedef struct cpumask cpumask_var_t[1];

/* A per-cpu cpumask_var_t is the mask itself here, so its address is taken. */
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
/* Expands to nothing in this configuration. */
#define __cpumask_var_read_mostly

/*
 * On-stack allocation stubs: nothing is allocated here, so every variant
 * reports success unconditionally.  @flags and @node are ignored.
 */
static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        return true;
}

/* The zeroing variants clear the caller's mask before reporting success. */
static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        cpumask_clear(*mask);
        return true;
}

static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        cpumask_clear(*mask);
        return true;
}

/*
 * On-stack variant: the bootmem allocator and both release helpers are
 * empty, since there is no separately allocated storage to manage.
 */
static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

static inline void free_cpumask_var(cpumask_var_t mask)
{
}

static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/* An on-stack mask is always reported as available. */
static inline bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

/* It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer. */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/* First bits of cpu_bit_bitmap are in fact unset. */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

/*
 * Convenience wrappers around for_each_cpu() bound to the possible,
 * online and present CPU masks respectively.
 */
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)

/* Wrappers for arch boot code to manipulate normally-constant masks */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);

/* Zero the NR_CPUS bits backing __cpu_possible_mask. */
static inline void reset_cpu_possible_mask(void)
{
        bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
}

/* Set (@possible true) or clear (@possible false) @cpu in __cpu_possible_mask. */
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
        if (!possible) {
                cpumask_clear_cpu(cpu, &__cpu_possible_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_possible_mask);
}

/* Set (@present true) or clear (@present false) @cpu in __cpu_present_mask. */
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
        if (!present) {
                cpumask_clear_cpu(cpu, &__cpu_present_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_present_mask);
}

void set_cpu_online(unsigned int cpu, bool online);

/* Set (@active true) or clear (@active false) @cpu in __cpu_active_mask. */
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
        if (!active) {
                cpumask_clear_cpu(cpu, &__cpu_active_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_active_mask);
}


/**
 * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-checking helper for to_cpumask(): the macro names this function
 * inside sizeof() so that its argument is checked against
 * 'const unsigned long *'.  The return value itself is always 1.
 */
static inline int __check_is_bitmap(const unsigned long *bitmap)
{
        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * Return a constant cpumask for @cpu: select the cpu_bit_bitmap row for the
 * bit position within a long, then step the pointer back by the index of
 * the long that @cpu falls in.
 */
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        unsigned int bit = cpu % BITS_PER_LONG;
        unsigned int word = cpu / BITS_PER_LONG;

        return to_cpumask(cpu_bit_bitmap[1 + bit] - word);
}

#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL: array initializer for a bitmap of NR_CPUS bits.  The last
 * word is set to BITMAP_LAST_WORD_MASK(NR_CPUS); when the bitmap spans
 * more than one long, every preceding word is set to ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - copies the cpumask into the buffer either
 *        as comma-separated list of cpus or hex values of cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * Only the first nr_cpu_ids bits of @mask are handed to
 * bitmap_print_to_pagebuf().
 *
 * Returns the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_to_pagebuf(list, buf, bits, nr_cpu_ids);
}

/*
 * CPU_MASK_ALL: cpumask_t compound literal whose last word is
 * BITMAP_LAST_WORD_MASK(NR_CPUS) and, when the mask spans more than one
 * long, whose preceding words are all ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

/* CPU_MASK_NONE: cpumask_t compound literal with every word zero. */
#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

/* CPU_MASK_CPU0: cpumask_t compound literal with only bit 0 set. */
#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

#endif /* __LINUX_CPUMASK_H */






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
/* taskstats_kern.h - kernel header for per-task statistics interface
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 */

#ifndef _LINUX_TASKSTATS_KERN_H
#define _LINUX_TASKSTATS_KERN_H

#include <linux/taskstats.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#ifdef CONFIG_TASKSTATS
extern struct kmem_cache *taskstats_cache;
extern struct mutex taskstats_exit_mutex;

/* Return @sig's stats object, if it has one, to taskstats_cache. */
static inline void taskstats_tgid_free(struct signal_struct *sig)
{
        if (!sig->stats)
                return;

        kmem_cache_free(taskstats_cache, sig->stats);
}

extern void taskstats_exit(struct task_struct *, int group_dead);
extern void taskstats_init_early(void);
#else
/* CONFIG_TASKSTATS is disabled: the hooks compile to empty inlines. */
static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
{}
static inline void taskstats_tgid_free(struct signal_struct *sig)
{}
static inline void taskstats_init_early(void)
{}
#endif /* CONFIG_TASKSTATS */

#endif















































































































    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NSPROXY_H
#define _LINUX_NSPROXY_H

#include <linux/spinlock.h>
#include <linux/sched.h>

struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
struct fs_struct;

/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
        atomic_t count;                 /* references held, see get/put_nsproxy() */
        struct uts_namespace *uts_ns;
        struct ipc_namespace *ipc_ns;
        struct mnt_namespace *mnt_ns;
        /* pid namespace that children will use, not the task's active one */
        struct pid_namespace *pid_ns_for_children;
        struct net              *net_ns;
        struct time_namespace *time_ns;
        /* NOTE(review): presumably the time analogue of pid_ns_for_children */
        struct time_namespace *time_ns_for_children;
        struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;

/*
 * A structure to encompass all bits needed to install
 * a partial or complete new set of namespaces.
 *
 * If a new user namespace is requested cred will
 * point to a modifiable set of credentials. If a pointer
 * to a modifiable set is needed nsset_cred() must be
 * used and tested.
 */
struct nsset {
        unsigned flags;                 /* CLONE_* bits; nsset_cred() tests CLONE_NEWUSER */
        struct nsproxy *nsproxy;
        struct fs_struct *fs;
        const struct cred *cred;        /* only modifiable via nsset_cred() */
};

/*
 * Return @set's credentials as a modifiable pointer, but only when
 * CLONE_NEWUSER is set in @set->flags; otherwise return NULL.
 */
static inline struct cred *nsset_cred(struct nsset *set)
{
        if (!(set->flags & CLONE_NEWUSER))
                return NULL;

        return (struct cred *)set->cred;
}

/*
 * the namespaces access rules are:
 *
 *  1. only current task is allowed to change tsk->nsproxy pointer or
 *     any pointer on the nsproxy itself.  Current must hold the task_lock
 *     when changing tsk->nsproxy.
 *
 *  2. when accessing (i.e. reading) current task's namespaces - no
 *     precautions need to be taken - just dereference the pointers
 *
 *  3. the access to other task namespaces is performed like this
 *     task_lock(task);
 *     nsproxy = task->nsproxy;
 *     if (nsproxy != NULL) {
 *             / *
 *               * work with the namespaces here
 *               * e.g. get the reference on one of them
 *               * /
 *     } / *
 *         * NULL task->nsproxy means that this task is
 *         * almost dead (zombie)
 *         * /
 *     task_unlock(task);
 *
 */

int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
        struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);

/* Drop one reference on @ns and free it once the count reaches zero. */
static inline void put_nsproxy(struct nsproxy *ns)
{
        if (!atomic_dec_and_test(&ns->count))
                return;

        free_nsproxy(ns);
}

/* Take an additional reference on @ns by incrementing its count. */
static inline void get_nsproxy(struct nsproxy *ns)
{
        atomic_inc(&ns->count);
}

#endif





































































































































































    1 





    1 












































































































































































    1 

    1 

    1 





    1 




































    1 






















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#include <linux/uidgid.h>

/*
 * kdebug() - debug trace prefixed with the current task's comm and pid.
 *
 * Change the "#if 0" below to "#if 1" to get real printk() output.  In the
 * default branch the call sits behind "if (0)" and goes to no_printk(), so
 * the arguments still appear in a call expression but nothing is printed.
 */
#if 0
#define kdebug(FMT, ...)                                                \
        printk("[%-5.5s%5u] " FMT "\n",                                        \
               current->comm, current->pid, ##__VA_ARGS__)
#else
#define kdebug(FMT, ...)                                                \
do {                                                                        \
        if (0)                                                                \
                no_printk("[%-5.5s%5u] " FMT "\n",                        \
                          current->comm, current->pid, ##__VA_ARGS__);        \
} while (0)
#endif

static struct kmem_cache *cred_jar;

/* init to 2 - one for init_task, one to ensure it is never freed */
struct group_info init_groups = { .usage = ATOMIC_INIT(2) };

/*
 * The initial credentials for the initial task
 *
 * ->usage is accessed with the atomic_long_*() helpers throughout this
 * file, so it is initialised with ATOMIC_LONG_INIT() rather than the
 * atomic_t initialiser.  ->subscribers (debug builds only) is accessed with
 * the plain atomic_*() helpers and keeps ATOMIC_INIT().
 */
struct cred init_cred = {
        .usage                  = ATOMIC_LONG_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
        .subscribers            = ATOMIC_INIT(2),
        .magic                  = CRED_MAGIC,
#endif
        .uid                    = GLOBAL_ROOT_UID,
        .gid                    = GLOBAL_ROOT_GID,
        .suid                   = GLOBAL_ROOT_UID,
        .sgid                   = GLOBAL_ROOT_GID,
        .euid                   = GLOBAL_ROOT_UID,
        .egid                   = GLOBAL_ROOT_GID,
        .fsuid                  = GLOBAL_ROOT_UID,
        .fsgid                  = GLOBAL_ROOT_GID,
        .securebits             = SECUREBITS_DEFAULT,
        .cap_inheritable        = CAP_EMPTY_SET,
        .cap_permitted          = CAP_FULL_SET,
        .cap_effective          = CAP_FULL_SET,
        .cap_bset               = CAP_FULL_SET,
        .user                   = INIT_USER,
        .user_ns                = &init_user_ns,
        .group_info             = &init_groups,
};

/*
 * Set @cred's subscriber count to @n.  The counter only exists with
 * CONFIG_DEBUG_CREDENTIALS; otherwise this is a no-op.
 */
static inline void set_cred_subscribers(struct cred *cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
        atomic_set(&cred->subscribers, n);
#endif
}

/*
 * Read @cred's subscriber count.  Without CONFIG_DEBUG_CREDENTIALS there is
 * no counter and 0 is returned.
 */
static inline int read_cred_subscribers(const struct cred *cred)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
        return atomic_read(&cred->subscribers);
#else
        return 0;
#endif
}

/*
 * Add @n (which may be negative) to the subscriber count of @_cred.  The
 * const qualifier is cast away because callers hold const pointers.  This
 * is a no-op without CONFIG_DEBUG_CREDENTIALS.
 */
static inline void alter_cred_subscribers(const struct cred *_cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
        atomic_add(n, &((struct cred *) _cred)->subscribers);
#endif
}

/*
 * The RCU callback to actually dispose of a set of credentials
 *
 * Reached either through call_rcu() or by a direct call from __put_cred()
 * when the cred is flagged non_rcu.  @rcu is the rcu_head embedded in the
 * struct cred being destroyed.
 */
static void put_cred_rcu(struct rcu_head *rcu)
{
        struct cred *cred = container_of(rcu, struct cred, rcu);

        kdebug("put_cred_rcu(%p)", cred);

        /*
         * Sanity check: by now the usage count must be zero (and, in debug
         * builds, the cred must be marked dead with no subscribers left).
         * Anything else is treated as fatal.
         */
#ifdef CONFIG_DEBUG_CREDENTIALS
        if (cred->magic != CRED_MAGIC_DEAD ||
            atomic_long_read(&cred->usage) != 0 ||
            read_cred_subscribers(cred) != 0)
                panic("CRED: put_cred_rcu() sees %p with"
                      " mag %x, put %p, usage %ld, subscr %d\n",
                      cred, cred->magic, cred->put_addr,
                      atomic_long_read(&cred->usage),
                      read_cred_subscribers(cred));
#else
        if (atomic_long_read(&cred->usage) != 0)
                panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
                      cred, atomic_long_read(&cred->usage));
#endif

        /* Release everything the cred holds, then the cred itself. */
        security_cred_free(cred);
        key_put(cred->session_keyring);
        key_put(cred->process_keyring);
        key_put(cred->thread_keyring);
        key_put(cred->request_key_auth);
        if (cred->group_info)
                put_group_info(cred->group_info);
        free_uid(cred->user);
        put_user_ns(cred->user_ns);
        kmem_cache_free(cred_jar, cred);
}

/**
 * __put_cred - Destroy a set of credentials
 * @cred: The record to release
 *
 * Destroy a set of credentials on which no references remain.  The usage
 * count must already be zero and @cred must not be either of the current
 * task's credential pointers.  Disposal goes through call_rcu() unless the
 * cred is flagged non_rcu, in which case it is torn down immediately.
 */
void __put_cred(struct cred *cred)
{
        kdebug("__put_cred(%p{%ld,%d})", cred,
               atomic_long_read(&cred->usage),
               read_cred_subscribers(cred));

        BUG_ON(atomic_long_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
        BUG_ON(read_cred_subscribers(cred) != 0);
        cred->magic = CRED_MAGIC_DEAD;
        cred->put_addr = __builtin_return_address(0);
#endif
        BUG_ON(cred == current->cred);
        BUG_ON(cred == current->real_cred);

        if (!cred->non_rcu) {
                call_rcu(&cred->rcu, put_cred_rcu);
                return;
        }

        put_cred_rcu(&cred->rcu);
}
EXPORT_SYMBOL(__put_cred);

/*
 * Clean up a task's credentials when it exits
 *
 * Both credential pointers of @tsk are cleared; for each one the subscriber
 * count is decremented and the task's reference dropped.  The cached
 * request key, where configured, is released as well.
 */
void exit_creds(struct task_struct *tsk)
{
        struct cred *real, *subj;

        kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
               atomic_long_read(&tsk->cred->usage),
               read_cred_subscribers(tsk->cred));

        /* objective credentials first */
        real = (struct cred *) tsk->real_cred;
        tsk->real_cred = NULL;
        validate_creds(real);
        alter_cred_subscribers(real, -1);
        put_cred(real);

        /* then the subjective credentials */
        subj = (struct cred *) tsk->cred;
        tsk->cred = NULL;
        validate_creds(subj);
        alter_cred_subscribers(subj, -1);
        put_cred(subj);

#ifdef CONFIG_KEYS_REQUEST_CACHE
        key_put(tsk->cached_requested_key);
        tsk->cached_requested_key = NULL;
#endif
}

/**
 * get_task_cred - Get another task's objective credentials
 * @task: The task to query
 *
 * Get the objective credentials of a task, pinning them so that they can't go
 * away.  Accessing a task's credentials directly is not permitted.
 *
 * The lookup runs under rcu_read_lock() and is retried for as long as
 * get_cred_rcu() fails to take a reference on the credentials found.
 *
 * The caller must also make sure task doesn't get deleted, either by holding a
 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
 */
const struct cred *get_task_cred(struct task_struct *task)
{
        const struct cred *cred;

        rcu_read_lock();

        for (;;) {
                cred = __task_cred((task));
                BUG_ON(!cred);
                if (get_cred_rcu(cred))
                        break;
        }

        rcu_read_unlock();
        return cred;
}
EXPORT_SYMBOL(get_task_cred);

/*
 * Allocate blank credentials, such that the credentials can be filled in at a
 * later date without risk of ENOMEM.
 *
 * Returns a zeroed cred with a usage count of 1, or NULL if either the
 * allocation or the security hook fails.
 */
struct cred *cred_alloc_blank(void)
{
        struct cred *blank = kmem_cache_zalloc(cred_jar, GFP_KERNEL);

        if (!blank)
                return NULL;

        atomic_long_set(&blank->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
        blank->magic = CRED_MAGIC;
#endif

        if (security_cred_alloc_blank(blank, GFP_KERNEL_ACCOUNT) < 0) {
                /* drop the only reference, which disposes of the cred */
                abort_creds(blank);
                return NULL;
        }

        return blank;
}

/**
 * prepare_creds - Prepare a new set of credentials for modification
 *
 * Prepare a new set of task credentials for modification.  A task's creds
 * shouldn't generally be modified directly, therefore this function is used to
 * prepare a new copy, which the caller then modifies and then commits by
 * calling commit_creds().
 *
 * Preparation involves making a copy of the current task's ->cred and taking
 * a reference on everything the copy points at.
 *
 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
 *
 * Call commit_creds() or abort_creds() to clean up.
 */
struct cred *prepare_creds(void)
{
        const struct cred *cur;
        struct cred *copy;

        validate_process_creds();

        copy = kmem_cache_alloc(cred_jar, GFP_KERNEL);
        if (!copy)
                return NULL;

        kdebug("prepare_creds() alloc %p", copy);

        cur = current->cred;
        memcpy(copy, cur, sizeof(*copy));

        /* the copy starts life with its own counts and flags */
        copy->non_rcu = 0;
        atomic_long_set(&copy->usage, 1);
        set_cred_subscribers(copy, 0);

        /* pin everything the copied pointers refer to */
        get_group_info(copy->group_info);
        get_uid(copy->user);
        get_user_ns(copy->user_ns);

#ifdef CONFIG_KEYS
        key_get(copy->session_keyring);
        key_get(copy->process_keyring);
        key_get(copy->thread_keyring);
        key_get(copy->request_key_auth);
#endif

#ifdef CONFIG_SECURITY
        /* do not inherit the source's security pointer */
        copy->security = NULL;
#endif

        if (security_prepare_creds(copy, cur, GFP_KERNEL_ACCOUNT) < 0) {
                abort_creds(copy);
                return NULL;
        }

        validate_creds(copy);
        return copy;
}
EXPORT_SYMBOL(prepare_creds);

/*
 * Prepare credentials for current to perform an execve()
 * - The caller must hold ->cred_guard_mutex
 */
struct cred *prepare_exec_creds(void)
{
        struct cred *new;

        new = prepare_creds();
        if (!new)
                return new;

#ifdef CONFIG_KEYS
        /* newly exec'd tasks don't get a thread keyring */
        key_put(new->thread_keyring);
        new->thread_keyring = NULL;

        /* inherit the session keyring; new process keyring */
        key_put(new->process_keyring);
        new->process_keyring = NULL;
#endif

        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        return new;
}

/*
 * Copy credentials for the new process created by fork()
 *
 * We share if we can, but under some circumstances we have to generate a new
 * set.
 *
 * The new process gets the current process's subjective credentials as its
 * objective and subjective credentials
 *
 * @p is the task being set up and @clone_flags are the CLONE_* flags of the
 * request.  Returns 0 on success, -ENOMEM if new credentials could not be
 * prepared, or the error from create_user_ns().
 */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
        struct cred *new;
        int ret;

#ifdef CONFIG_KEYS_REQUEST_CACHE
        p->cached_requested_key = NULL;
#endif

        /*
         * Fast path: a CLONE_THREAD child (with no thread keyring to split
         * off, when keys are configured) shares p->cred.  Two references
         * and two subscribers are added, one each for ->real_cred and
         * ->cred.
         */
        if (
#ifdef CONFIG_KEYS
                !p->cred->thread_keyring &&
#endif
                clone_flags & CLONE_THREAD
            ) {
                p->real_cred = get_cred(p->cred);
                get_cred(p->cred);
                alter_cred_subscribers(p->cred, 2);
                kdebug("share_creds(%p{%ld,%d})",
                       p->cred, atomic_long_read(&p->cred->usage),
                       read_cred_subscribers(p->cred));
                atomic_inc(&p->cred->user->processes);
                return 0;
        }

        /* Slow path: the child gets its own copy of the credentials. */
        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (clone_flags & CLONE_NEWUSER) {
                ret = create_user_ns(new);
                if (ret < 0)
                        goto error_put;
        }

#ifdef CONFIG_KEYS
        /* new threads get their own thread keyrings if their parent already
         * had one */
        if (new->thread_keyring) {
                key_put(new->thread_keyring);
                new->thread_keyring = NULL;
                if (clone_flags & CLONE_THREAD)
                        install_thread_keyring_to_cred(new);
        }

        /* The process keyring is only shared between the threads in a process;
         * anything outside of those threads doesn't inherit.
         */
        if (!(clone_flags & CLONE_THREAD)) {
                key_put(new->process_keyring);
                new->process_keyring = NULL;
        }
#endif

        atomic_inc(&new->user->processes);
        /*
         * prepare_creds() returned one reference; get_cred() adds the
         * second so that ->cred and ->real_cred each own one.
         */
        p->cred = p->real_cred = get_cred(new);
        alter_cred_subscribers(new, 2);
        validate_creds(new);
        return 0;

error_put:
        put_cred(new);
        return ret;
}

/*
 * Decide whether the capabilities of @subset are covered by those of @set.
 *
 * Within one user namespace this is a plain comparison of the permitted
 * sets.  Across namespaces it holds only if @set's namespace is the parent
 * of @subset's namespace, or of one of its ancestors, and that child
 * namespace is owned by @set->euid.
 */
static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
{
        const struct user_namespace *outer = set->user_ns;
        const struct user_namespace *ns = subset->user_ns;

        if (outer == ns)
                return cap_issubset(subset->cap_permitted, set->cap_permitted);

        /* walk from subset's namespace up towards init_user_ns */
        while (ns != &init_user_ns) {
                if (outer == ns->parent && uid_eq(ns->owner, set->euid))
                        return true;
                ns = ns->parent;
        }

        return false;
}

/**
 * commit_creds - Install new credentials upon the current task
 * @new: The credentials to be assigned
 *
 * Install a new set of credentials to the current task, using RCU to replace
 * the old set.  Both the objective and the subjective credentials pointers are
 * updated.  This function may not be called if the subjective credentials are
 * in an overridden state.
 *
 * This function eats the caller's reference to the new credentials.
 *
 * Always returns 0 thus allowing this function to be tail-called at the end
 * of, say, sys_setgid().
 */
int commit_creds(struct cred *new)
{
        struct task_struct *task = current;
        const struct cred *old = task->real_cred;

        kdebug("commit_creds(%p{%ld,%d})", new,
               atomic_long_read(&new->usage),
               read_cred_subscribers(new));

        /* ->cred must still equal ->real_cred, i.e. no override is active */
        BUG_ON(task->cred != old);
#ifdef CONFIG_DEBUG_CREDENTIALS
        BUG_ON(read_cred_subscribers(old) < 2);
        validate_creds(old);
        validate_creds(new);
#endif
        BUG_ON(atomic_long_read(&new->usage) < 1);

        get_cred(new); /* we will require a ref for the subj creds too */

        /* dumpability changes */
        if (!uid_eq(old->euid, new->euid) ||
            !gid_eq(old->egid, new->egid) ||
            !uid_eq(old->fsuid, new->fsuid) ||
            !gid_eq(old->fsgid, new->fsgid) ||
            !cred_cap_issubset(old, new)) {
                if (task->mm)
                        set_dumpable(task->mm, suid_dumpable);
                task->pdeath_signal = 0;
                /*
                 * If a task drops privileges and becomes nondumpable,
                 * the dumpability change must become visible before
                 * the credential change; otherwise, a __ptrace_may_access()
                 * racing with this change may be able to attach to a task it
                 * shouldn't be able to attach to (as if the task had dropped
                 * privileges without becoming nondumpable).
                 * Pairs with a read barrier in __ptrace_may_access().
                 */
                smp_wmb();
        }

        /* alter the thread keyring */
        if (!uid_eq(new->fsuid, old->fsuid))
                key_fsuid_changed(new);
        if (!gid_eq(new->fsgid, old->fsgid))
                key_fsgid_changed(new);

        /* do it
         * RLIMIT_NPROC limits on user->processes have already been checked
         * in set_user().
         */
        alter_cred_subscribers(new, 2);
        /*
         * The new user's process count is raised before the pointers are
         * switched; the old user's count is lowered only afterwards.
         */
        if (new->user != old->user)
                atomic_inc(&new->user->processes);
        rcu_assign_pointer(task->real_cred, new);
        rcu_assign_pointer(task->cred, new);
        if (new->user != old->user)
                atomic_dec(&old->user->processes);
        alter_cred_subscribers(old, -2);

        /* send notifications */
        if (!uid_eq(new->uid,   old->uid)  ||
            !uid_eq(new->euid,  old->euid) ||
            !uid_eq(new->suid,  old->suid) ||
            !uid_eq(new->fsuid, old->fsuid))
                proc_id_connector(task, PROC_EVENT_UID);

        if (!gid_eq(new->gid,   old->gid)  ||
            !gid_eq(new->egid,  old->egid) ||
            !gid_eq(new->sgid,  old->sgid) ||
            !gid_eq(new->fsgid, old->fsgid))
                proc_id_connector(task, PROC_EVENT_GID);

        /* release the old obj and subj refs both */
        put_cred(old);
        put_cred(old);
        return 0;
}
EXPORT_SYMBOL(commit_creds);

/**
 * abort_creds - Discard a set of credentials that were under construction
 * @new: The credentials that were going to be applied
 *
 * Discard a set of credentials that were under construction by dropping the
 * caller's reference to them.  @new must still hold at least one reference
 * and, in debug builds, must have no subscribers.
 *
 * NOTE(review): this function takes or releases no lock itself; any locking
 * around credential preparation is the caller's concern.
 */
void abort_creds(struct cred *new)
{
        kdebug("abort_creds(%p{%ld,%d})", new,
               atomic_long_read(&new->usage),
               read_cred_subscribers(new));

#ifdef CONFIG_DEBUG_CREDENTIALS
        BUG_ON(read_cred_subscribers(new) != 0);
#endif
        BUG_ON(atomic_long_read(&new->usage) < 1);
        put_cred(new);
}
EXPORT_SYMBOL(abort_creds);

/**
 * override_creds - Temporarily replace the current task's subjective creds
 * @new: The credentials to be assigned
 *
 * Point current->cred at @new, taking a usage reference on @new, and hand
 * back the credentials that were installed before so that the caller can
 * restore them later.
 *
 * Return: the previously installed subjective credentials.
 */
const struct cred *override_creds(const struct cred *new)
{
        const struct cred *prev = current->cred;

        kdebug("override_creds(%p{%ld,%d})", new,
               atomic_long_read(&new->usage),
               read_cred_subscribers(new));

        validate_creds(prev);
        validate_creds(new);

        /*
         * get_new_cred() is used on purpose instead of get_cred(): the
         * 'non_rcu' flag is left untouched because @new only goes into
         * the thread-synchronous '->cred' pointer and not into
         * '->real_cred', which other threads can see under RCU.
         *
         * The validation that get_cred() would have performed was done
         * by hand just above.
         */
        get_new_cred((struct cred *)new);
        alter_cred_subscribers(new, 1);
        rcu_assign_pointer(current->cred, new);
        alter_cred_subscribers(prev, -1);

        kdebug("override_creds() = %p{%ld,%d}", prev,
               atomic_long_read(&prev->usage),
               read_cred_subscribers(prev));

        return prev;
}
EXPORT_SYMBOL(override_creds);

/**
 * revert_creds - Undo a temporary subjective credentials override
 * @old: The credentials to be restored
 *
 * Put @old back into current->cred and drop one reference on the set that
 * was installed until now.  No new reference is taken on @old here.
 */
void revert_creds(const struct cred *old)
{
        const struct cred *temporary = current->cred;

        kdebug("revert_creds(%p{%ld,%d})", old,
               atomic_long_read(&old->usage),
               read_cred_subscribers(old));

        validate_creds(old);
        validate_creds(temporary);

        /* swap the pointer back, moving the subscription along with it */
        alter_cred_subscribers(old, 1);
        rcu_assign_pointer(current->cred, old);
        alter_cred_subscribers(temporary, -1);

        /* release the set that had been overriding */
        put_cred(temporary);
}
EXPORT_SYMBOL(revert_creds);

/**
 * cred_fscmp - Compare two credentials with respect to filesystem access.
 * @a: The first credential
 * @b: The second credential
 *
 * cred_cmp() will return zero if both credentials have the same
 * fsuid, fsgid, and supplementary groups.  That is, if they will both
 * provide the same access to files based on mode/uid/gid.
 * If the credentials are different, then either -1 or 1 will
 * be returned depending on whether @a comes before or after @b
 * respectively in an arbitrary, but stable, ordering of credentials.
 *
 * Return: -1, 0, or 1 depending on comparison
 */
int cred_fscmp(const struct cred *a, const struct cred *b)
{
        struct group_info *ga, *gb;
        int g;

        if (a == b)
                return 0;
        if (uid_lt(a->fsuid, b->fsuid))
                return -1;
        if (uid_gt(a->fsuid, b->fsuid))
                return 1;

        if (gid_lt(a->fsgid, b->fsgid))
                return -1;
        if (gid_gt(a->fsgid, b->fsgid))
                return 1;

        ga = a->group_info;
        gb = b->group_info;
        if (ga == gb)
                return 0;
        if (ga == NULL)
                return -1;
        if (gb == NULL)
                return 1;
        if (ga->ngroups < gb->ngroups)
                return -1;
        if (ga->ngroups > gb->ngroups)
                return 1;

        for (g = 0; g < ga->ngroups; g++) {
                if (gid_lt(ga->gid[g], gb->gid[g]))
                        return -1;
                if (gid_gt(ga->gid[g], gb->gid[g]))
                        return 1;
        }
        return 0;
}
EXPORT_SYMBOL(cred_fscmp);

/*
 * Set up the credentials subsystem by creating "cred_jar", the slab cache
 * that struct cred objects are allocated from.
 *
 * NOTE(review): the result is not checked for NULL; SLAB_PANIC is passed,
 * which presumably makes a failed creation fatal - confirm.
 */
void __init cred_init(void)
{
        cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
                                     SLAB_HWCACHE_ALIGN |
                                     SLAB_PANIC |
                                     SLAB_ACCOUNT,
                                     NULL);
}

/**
 * prepare_kernel_cred - Build a set of credentials for a kernel service
 * @daemon: A userspace daemon to be used as a reference, or NULL
 *
 * Allocate a fresh set of credentials that a kernel service can install as
 * an override in order to act with a different subjective context.
 *
 * The new set starts out as a copy of @daemon's credentials or, when
 * @daemon is NULL, of init_cred.  The copy gets its own references on the
 * user, user namespace and group info, has all keyring pointers cleared and
 * receives a security blob from security_prepare_creds().
 *
 * The caller may change these controls afterwards if desired.
 *
 * Return: the new credentials (usage count 1), or NULL if the allocation or
 * the LSM preparation failed.
 */
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
        const struct cred *old;
        struct cred *new;

        new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
        if (new == NULL)
                return NULL;

        kdebug("prepare_kernel_cred() alloc %p", new);

        /* choose the template and pin it while it is being copied */
        old = daemon ? get_task_cred(daemon) : get_cred(&init_cred);
        validate_creds(old);

        /* start from a plain copy, then fix up counters and references */
        *new = *old;
        new->non_rcu = 0;
        atomic_long_set(&new->usage, 1);
        set_cred_subscribers(new, 0);
        get_uid(new->user);
        get_user_ns(new->user_ns);
        get_group_info(new->group_info);

#ifdef CONFIG_KEYS
        /* the copy does not share the template's keyrings */
        new->session_keyring = NULL;
        new->process_keyring = NULL;
        new->thread_keyring = NULL;
        new->request_key_auth = NULL;
        new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif

#ifdef CONFIG_SECURITY
        new->security = NULL;
#endif
        if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) {
                /* undo: drop the half-built set and the template pin */
                put_cred(new);
                put_cred(old);
                return NULL;
        }

        put_cred(old);
        validate_creds(new);
        return new;
}
EXPORT_SYMBOL(prepare_kernel_cred);

/**
 * set_security_override - Set the security ID in a set of credentials
 * @new: The credentials to alter
 * @secid: The LSM security ID to set
 *
 * Ask the LSM, through security_kernel_act_as(), to make @new act with
 * @secid as its subjective security ID.
 *
 * Return: whatever security_kernel_act_as() returns.
 */
int set_security_override(struct cred *new, u32 secid)
{
        int rc = security_kernel_act_as(new, secid);

        return rc;
}
EXPORT_SYMBOL(set_security_override);

/**
 * set_security_override_from_ctx - Set the security ID from a context string
 * @new: The credentials to alter
 * @secctx: The LSM security context to generate the security ID from.
 *
 * Translate the NUL-terminated context string @secctx into a security ID
 * with security_secctx_to_secid() and apply that ID to @new by way of
 * set_security_override().
 *
 * Return: a negative value from the translation step if it fails, otherwise
 * the result of set_security_override().
 */
int set_security_override_from_ctx(struct cred *new, const char *secctx)
{
        u32 secid;
        int err = security_secctx_to_secid(secctx, strlen(secctx), &secid);

        if (err < 0)
                return err;

        return set_security_override(new, secid);
}
EXPORT_SYMBOL(set_security_override_from_ctx);

/**
 * set_create_files_as - Make new files take their identity from an inode
 * @new: The credentials to alter
 * @inode: The inode to take the context from
 *
 * Copy @inode's owner and group into the fsuid and fsgid of @new, then let
 * the LSM (security_kernel_create_files_as()) set the file creation context
 * from the same inode.
 *
 * Return: -EINVAL if the inode's uid or gid is not valid (in which case
 * @new is left untouched), otherwise the LSM hook's return value.
 */
int set_create_files_as(struct cred *new, struct inode *inode)
{
        if (!(uid_valid(inode->i_uid) && gid_valid(inode->i_gid)))
                return -EINVAL;

        new->fsuid = inode->i_uid;
        new->fsgid = inode->i_gid;

        return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);

#ifdef CONFIG_DEBUG_CREDENTIALS

/*
 * Tell whether a set of credentials fails the magic-number check.
 * Returns true when cred->magic differs from CRED_MAGIC.
 */
bool creds_are_invalid(const struct cred *cred)
{
        return cred->magic != CRED_MAGIC;
}
EXPORT_SYMBOL(creds_are_invalid);

/*
 * Dump a suspect set of credentials to the kernel log.
 *
 * @cred:  the credentials to describe
 * @label: tag printed in front of the dump (the callers pass "Specified",
 *         "Real" or "Effective")
 * @tsk:   task whose ->real_cred / ->cred pointers are compared against
 *         @cred so the dump can flag it as "[real]" and/or "[eff]"
 */
static void dump_invalid_creds(const struct cred *cred, const char *label,
                               const struct task_struct *tsk)
{
        printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
               label, cred,
               cred == &init_cred ? "[init]" : "",
               cred == tsk->real_cred ? "[real]" : "",
               cred == tsk->cred ? "[eff]" : "");
        printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
               cred->magic, cred->put_addr);
        printk(KERN_ERR "CRED: ->usage=%ld, subscr=%d\n",
               atomic_long_read(&cred->usage),
               read_cred_subscribers(cred));

        /*
         * The IDs are unsigned (uid_t/gid_t), so they are printed with %u;
         * %d would show large values as negative numbers.
         */
        printk(KERN_ERR "CRED: ->*uid = { %u,%u,%u,%u }\n",
                from_kuid_munged(&init_user_ns, cred->uid),
                from_kuid_munged(&init_user_ns, cred->euid),
                from_kuid_munged(&init_user_ns, cred->suid),
                from_kuid_munged(&init_user_ns, cred->fsuid));
        printk(KERN_ERR "CRED: ->*gid = { %u,%u,%u,%u }\n",
                from_kgid_munged(&init_user_ns, cred->gid),
                from_kgid_munged(&init_user_ns, cred->egid),
                from_kgid_munged(&init_user_ns, cred->sgid),
                from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
        printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
        /*
         * Only peek behind ->security when it does not point into the
         * first page and bits 8..31 do not carry the POISON_FREE pattern.
         */
        if ((unsigned long) cred->security >= PAGE_SIZE &&
            (((unsigned long) cred->security & 0xffffff00) !=
             (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
                printk(KERN_ERR "CRED: ->security {%x, %x}\n",
                       ((u32*)cred->security)[0],
                       ((u32*)cred->security)[1]);
#endif
}

/*
 * Report the use of invalid credentials and stop with BUG().
 *
 * @cred: the offending credentials
 * @file: source file name of the reporting call site
 * @line: source line number of the reporting call site
 *
 * The credentials are dumped relative to the current task.
 */
void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
        printk(KERN_ERR "CRED: Invalid credentials\n");
        printk(KERN_ERR "CRED: At %s:%u\n", file, line);

        dump_invalid_creds(cred, "Specified", current);

        BUG();
}
EXPORT_SYMBOL(__invalid_creds);

/*
 * Sanity-check the credentials attached to a task; dump them and BUG() if
 * they look wrong.
 *
 * @tsk:  task to inspect
 * @file: source file name of the call site, for the report
 * @line: source line number of the call site, for the report
 *
 * When ->cred and ->real_cred are the same object it must have at least two
 * subscribers; otherwise each of the two needs at least one.  In both cases
 * every set involved has to pass creds_are_invalid().
 */
void __validate_process_creds(struct task_struct *tsk,
                              const char *file, unsigned line)
{
        int bad;

        if (tsk->cred == tsk->real_cred)
                bad = read_cred_subscribers(tsk->cred) < 2 ||
                      creds_are_invalid(tsk->cred);
        else
                bad = read_cred_subscribers(tsk->real_cred) < 1 ||
                      read_cred_subscribers(tsk->cred) < 1 ||
                      creds_are_invalid(tsk->real_cred) ||
                      creds_are_invalid(tsk->cred);

        if (!unlikely(bad))
                return;

        printk(KERN_ERR "CRED: Invalid process credentials\n");
        printk(KERN_ERR "CRED: At %s:%u\n", file, line);

        dump_invalid_creds(tsk->real_cred, "Real", tsk);
        if (tsk->cred == tsk->real_cred)
                printk(KERN_ERR "CRED: Effective creds == Real creds\n");
        else
                dump_invalid_creds(tsk->cred, "Effective", tsk);

        BUG();
}
EXPORT_SYMBOL(__validate_process_creds);

/*
 * Credential check for do_exit(): log the task's credential pointers and
 * counters through kdebug(), then run the regular process-credential
 * validation, reporting this file and line as the call site.
 */
void validate_creds_for_do_exit(struct task_struct *tsk)
{
        kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
               tsk->real_cred, tsk->cred,
               atomic_long_read(&tsk->cred->usage),
               read_cred_subscribers(tsk->cred));

        __validate_process_creds(tsk, __FILE__, __LINE__);
}

#endif /* CONFIG_DEBUG_CREDENTIALS */

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/include/linux/sunrpc/addr.h
 *
 * Various routines for copying and comparing sockaddrs and for
 * converting them to and from presentation format.
 */
#ifndef _LINUX_SUNRPC_ADDR_H
#define _LINUX_SUNRPC_ADDR_H

#include <linux/socket.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/ipv6.h>

/*
 * Out-of-line helpers: only their prototypes appear in this header.
 * NOTE(review): going by the names and the header comment above, these
 * convert sockaddrs to and from presentation / universal-address strings -
 * confirm parameter meaning and return values against the definitions.
 */
size_t                rpc_ntop(const struct sockaddr *, char *, const size_t);
size_t                rpc_pton(struct net *, const char *, const size_t,
                         struct sockaddr *, const size_t);
char *                rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t);
size_t                rpc_uaddr2sockaddr(struct net *, const char *, const size_t,
                                   struct sockaddr *, const size_t);

/**
 * rpc_get_port - read the port number stored in a sockaddr
 * @sap: sockaddr to inspect
 *
 * Returns the port in host byte order for AF_INET and AF_INET6 addresses,
 * and 0 for every other address family.
 */
static inline unsigned short rpc_get_port(const struct sockaddr *sap)
{
        /* the casts keep the const qualifier of @sap */
        switch (sap->sa_family) {
        case AF_INET:
                return ntohs(((const struct sockaddr_in *)sap)->sin_port);
        case AF_INET6:
                return ntohs(((const struct sockaddr_in6 *)sap)->sin6_port);
        default:
                return 0;
        }
}

/**
 * rpc_set_port - store a port number in a sockaddr
 * @sap: sockaddr to modify
 * @port: port number in host byte order
 *
 * Writes @port, converted with htons(), into AF_INET and AF_INET6
 * addresses.  Any other address family is left untouched.
 */
static inline void rpc_set_port(struct sockaddr *sap,
                                const unsigned short port)
{
        if (sap->sa_family == AF_INET) {
                struct sockaddr_in *sin = (struct sockaddr_in *)sap;

                sin->sin_port = htons(port);
        } else if (sap->sa_family == AF_INET6) {
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;

                sin6->sin6_port = htons(port);
        }
}

/*
 * NOTE(review): by its name, the character that introduces the scope ID in
 * a textual IPv6 address - confirm against the users of this macro.
 */
#define IPV6_SCOPE_DELIMITER                '%'
/* size of a '%' plus ten placeholder characters, terminating NUL included */
#define IPV6_SCOPE_ID_LEN                sizeof("%nnnnnnnnnn")

/*
 * Compare the IPv4 address part of two sockaddrs.  Both are treated as
 * struct sockaddr_in without looking at the family; the port is ignored.
 * Returns true when the two sin_addr values are equal.
 */
static inline bool rpc_cmp_addr4(const struct sockaddr *sap1,
                                 const struct sockaddr *sap2)
{
        return ((const struct sockaddr_in *)sap1)->sin_addr.s_addr ==
               ((const struct sockaddr_in *)sap2)->sin_addr.s_addr;
}

/*
 * Copy the family and the IPv4 address from @src to @dst.  Both are treated
 * as struct sockaddr_in; the port of @dst is not written.  Always returns
 * true.
 */
static inline bool __rpc_copy_addr4(struct sockaddr *dst,
                                    const struct sockaddr *src)
{
        /* const-preserving cast: @src is only ever read */
        const struct sockaddr_in *ssin = (const struct sockaddr_in *) src;
        struct sockaddr_in *dsin = (struct sockaddr_in *) dst;

        dsin->sin_family = ssin->sin_family;
        dsin->sin_addr.s_addr = ssin->sin_addr.s_addr;
        return true;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Compare the IPv6 address part of two sockaddrs.  Both are treated as
 * struct sockaddr_in6; the port is ignored.  The scope IDs are compared
 * as well when the address is link-local.
 */
static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
                                 const struct sockaddr *sap2)
{
        const struct sockaddr_in6 *lhs = (const struct sockaddr_in6 *)sap1;
        const struct sockaddr_in6 *rhs = (const struct sockaddr_in6 *)sap2;

        if (!ipv6_addr_equal(&lhs->sin6_addr, &rhs->sin6_addr))
                return false;

        /* the scope ID only takes part for link-local addresses */
        if (!(ipv6_addr_type(&lhs->sin6_addr) & IPV6_ADDR_LINKLOCAL))
                return true;

        return lhs->sin6_scope_id == rhs->sin6_scope_id;
}

/*
 * Copy the family, the IPv6 address and the scope ID from @src to @dst.
 * Both are treated as struct sockaddr_in6; the port of @dst is not written.
 * Always returns true.
 */
static inline bool __rpc_copy_addr6(struct sockaddr *dst,
                                    const struct sockaddr *src)
{
        const struct sockaddr_in6 *from = (const struct sockaddr_in6 *)src;
        struct sockaddr_in6 *to = (struct sockaddr_in6 *)dst;

        to->sin6_family = from->sin6_family;
        to->sin6_addr = from->sin6_addr;
        to->sin6_scope_id = from->sin6_scope_id;

        return true;
}
#else        /* !(IS_ENABLED(CONFIG_IPV6) */
/*
 * Variant used when IPv6 support is not enabled: the arguments are ignored
 * and no two addresses are ever reported as equal.
 */
static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
                                 const struct sockaddr *sap2)
{
        return false;
}

/*
 * Variant used when IPv6 support is not enabled: nothing is copied and the
 * failure is reported by returning false.
 */
static inline bool __rpc_copy_addr6(struct sockaddr *dst,
                                    const struct sockaddr *src)
{
        return false;
}
#endif        /* !(IS_ENABLED(CONFIG_IPV6) */

/**
 * rpc_cmp_addr - compare the address portion of two sockaddrs.
 * @sap1: first sockaddr
 * @sap2: second sockaddr
 *
 * The families have to match, and the family has to be AF_INET or
 * AF_INET6; the comparison is then handed to rpc_cmp_addr4() or
 * rpc_cmp_addr6().  The port is not looked at.
 *
 * Returns true if the addrs are equal, false if they aren't or if the
 * family is not one of the two handled here.
 */
static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
                                const struct sockaddr *sap2)
{
        if (sap1->sa_family != sap2->sa_family)
                return false;

        switch (sap1->sa_family) {
        case AF_INET:
                return rpc_cmp_addr4(sap1, sap2);
        case AF_INET6:
                return rpc_cmp_addr6(sap1, sap2);
        }

        return false;
}

/**
 * rpc_cmp_addr_port - compare the address and port number of two sockaddrs.
 * @sap1: first sockaddr
 * @sap2: second sockaddr
 *
 * Returns true only if rpc_cmp_addr() reports the addresses as equal and
 * rpc_get_port() yields the same port for both.
 */
static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1,
                                     const struct sockaddr *sap2)
{
        return rpc_cmp_addr(sap1, sap2) &&
               rpc_get_port(sap1) == rpc_get_port(sap2);
}

/**
 * rpc_copy_addr - copy the address portion of one sockaddr to another
 * @dst: destination sockaddr
 * @src: source sockaddr
 *
 * Dispatches on the family of @src to __rpc_copy_addr4() or
 * __rpc_copy_addr6(), which copy the family and the address.  The port is
 * not copied.
 * Caller is responsible for making certain that dst is large enough to hold
 * the address in src. Returns true if address family is supported. Returns
 * false otherwise.
 */
static inline bool rpc_copy_addr(struct sockaddr *dst,
                                 const struct sockaddr *src)
{
        if (src->sa_family == AF_INET)
                return __rpc_copy_addr4(dst, src);
        if (src->sa_family == AF_INET6)
                return __rpc_copy_addr6(dst, src);

        return false;
}

/**
 * rpc_get_scope_id - return scopeid for a given sockaddr
 * @sa: sockaddr to get scopeid from
 *
 * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if
 * not an AF_INET6 address.
 */
static inline u32 rpc_get_scope_id(const struct sockaddr *sa)
{
        if (sa->sa_family != AF_INET6)
                return 0;

        /* const-preserving cast: @sa is only read */
        return ((const struct sockaddr_in6 *) sa)->sin6_scope_id;
}

#endif /* _LINUX_SUNRPC_ADDR_H */



























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
// SPDX-License-Identifier: GPL-2.0
/*
 * This is a maximally equidistributed combined Tausworthe generator
 * based on code from GNU Scientific Library 1.5 (30 Jun 2004)
 *
 * lfsr113 version:
 *
 * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n)
 *
 * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n <<  6) ^ s1_n) >> 13))
 * s2_{n+1} = (((s2_n & 4294967288) <<  2) ^ (((s2_n <<  2) ^ s2_n) >> 27))
 * s3_{n+1} = (((s3_n & 4294967280) <<  7) ^ (((s3_n << 13) ^ s3_n) >> 21))
 * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n <<  3) ^ s4_n) >> 12))
 *
 * The period of this generator is about 2^113 (see erratum paper).
 *
 * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe
 * Generators", Mathematics of Computation, 65, 213 (1996), 203--213:
 * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps
 * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps
 *
 * There is an erratum in the paper "Tables of Maximally Equidistributed
 * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999),
 * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps
 *
 *      ... the k_j most significant bits of z_j must be non-zero,
 *      for each j. (Note: this restriction also applies to the
 *      computer code given in [4], but was mistakenly not mentioned
 *      in that paper.)
 *
 * This affects the seeding procedure by imposing the requirement
 * s1 > 1, s2 > 7, s3 > 15, s4 > 127.
 */

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <asm/unaligned.h>

/**
 * prandom_u32_state - seeded pseudo-random number generator.
 * @state: pointer to state structure holding seeded state.
 *
 * Advances each of the four component registers (s1..s4) of @state by one
 * Tausworthe step and returns the XOR of the four updated values.  @state
 * is modified in place.
 *
 * This is used for pseudo-randomness with no outside seeding.
 * For more random results, use prandom_u32().
 */
u32 prandom_u32_state(struct rnd_state *state)
{
/*
 * One step of a component generator.  Every parameter and the whole
 * expansion are parenthesized so the macro expands safely whatever
 * expressions are passed in and whatever context it is used in.
 */
#define TAUSWORTHE(s, a, b, c, d) \
        ((((s) & (c)) << (d)) ^ ((((s) << (a)) ^ (s)) >> (b)))
        state->s1 = TAUSWORTHE(state->s1,  6U, 13U, 4294967294U, 18U);
        state->s2 = TAUSWORTHE(state->s2,  2U, 27U, 4294967288U,  2U);
        state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U,  7U);
        state->s4 = TAUSWORTHE(state->s4,  3U, 12U, 4294967168U, 13U);

        return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
}
EXPORT_SYMBOL(prandom_u32_state);

/**
 *        prandom_bytes_state - get the requested number of pseudo-random bytes
 *
 *        @state: pointer to state structure holding seeded state.
 *        @buf: where to copy the pseudo-random bytes to
 *        @bytes: the requested number of bytes
 *
 *        This is used for pseudo-randomness with no outside seeding.
 *        For more random results, use prandom_bytes().
 */
void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes)
{
        u8 *ptr = buf;

        while (bytes >= sizeof(u32)) {
                put_unaligned(prandom_u32_state(state), (u32 *) ptr);
                ptr += sizeof(u32);
                bytes -= sizeof(u32);
        }

        if (bytes > 0) {
                u32 rem = prandom_u32_state(state);
                do {
                        *ptr++ = (u8) rem;
                        bytes--;
                        rem >>= BITS_PER_BYTE;
                } while (bytes > 0);
        }
}
EXPORT_SYMBOL(prandom_bytes_state);

/*
 * Run the generator ten times and throw the results away, so that the
 * recurrence condition is satisfied before the state is put to use.
 */
static void prandom_warmup(struct rnd_state *state)
{
        int round;

        for (round = 0; round < 10; round++)
                prandom_u32_state(state);
}

/*
 * Seed the per-CPU generator state of every possible CPU.
 *
 * For each CPU, four 32-bit values are drawn with get_random_bytes() and
 * passed through __seed() with the minimum required for the respective
 * register (2, 8, 16, 128); the state is then warmed up.
 */
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                u32 entropy[4];
                struct rnd_state *state = per_cpu_ptr(pcpu_state, cpu);

                get_random_bytes(&entropy, sizeof(entropy));

                state->s1 = __seed(entropy[0],   2U);
                state->s2 = __seed(entropy[1],   8U);
                state->s3 = __seed(entropy[2],  16U);
                state->s4 = __seed(entropy[3], 128U);

                prandom_warmup(state);
        }
}
EXPORT_SYMBOL(prandom_seed_full_state);

#ifdef CONFIG_RANDOM32_SELFTEST
/*
 * Seed boundary vectors for the self test: the first output after seeding
 * with @seed and warming up has to equal @result.
 */
static struct prandom_test1 {
        u32 seed;       /* value the state is seeded from */
        u32 result;     /* expected first output after warmup */
} test1[] = {
        { .seed = 1U, .result = 3484351685U },
        { .seed = 2U, .result = 2623130059U },
        { .seed = 3U, .result = 3125133893U },
        { .seed = 4U, .result =  984847254U },
};

/*
 * Reference vectors for the self test.  For each entry the state is seeded
 * from @seed and warmed up, @iteration - 1 outputs are discarded, and the
 * next output has to equal @result.
 */
static struct prandom_test2 {
        u32 seed;
        u32 iteration;
        u32 result;
} test2[] = {
        /* Test cases against taus113 from GSL library. */
        {  931557656U, 959U, 2975593782U },
        { 1339693295U, 876U, 3887776532U },
        { 1545556285U, 961U, 1615538833U },
        {  601730776U, 723U, 1776162651U },
        { 1027516047U, 687U,  511983079U },
        {  416526298U, 700U,  916156552U },
        { 1395522032U, 652U, 2222063676U },
        {  366221443U, 617U, 2992857763U },
        { 1539836965U, 714U, 3783265725U },
        {  556206671U, 994U,  799626459U },
        {  684907218U, 799U,  367789491U },
        { 2121230701U, 931U, 2115467001U },
        { 1668516451U, 644U, 3620590685U },
        {  768046066U, 883U, 2034077390U },
        { 1989159136U, 833U, 1195767305U },
        {  536585145U, 996U, 3577259204U },
        { 1008129373U, 642U, 1478080776U },
        { 1740775604U, 939U, 1264980372U },
        { 1967883163U, 508U,   10734624U },
        { 1923019697U, 730U, 3821419629U },
        {  442079932U, 560U, 3440032343U },
        { 1961302714U, 845U,  841962572U },
        { 2030205964U, 962U, 1325144227U },
        { 1160407529U, 507U,  240940858U },
        {  635482502U, 779U, 4200489746U },
        { 1252788931U, 699U,  867195434U },
        { 1961817131U, 719U,  668237657U },
        { 1071468216U, 983U,  917876630U },
        { 1281848367U, 932U, 1003100039U },
        {  582537119U, 780U, 1127273778U },
        { 1973672777U, 853U, 1071368872U },
        { 1896756996U, 762U, 1127851055U },
        {  847917054U, 500U, 1717499075U },
        { 1240520510U, 951U, 2849576657U },
        { 1685071682U, 567U, 1961810396U },
        { 1516232129U, 557U,    3173877U },
        { 1208118903U, 612U, 1613145022U },
        { 1817269927U, 693U, 4279122573U },
        { 1510091701U, 717U,  638191229U },
        {  365916850U, 807U,  600424314U },
        {  399324359U, 702U, 1803598116U },
        { 1318480274U, 779U, 2074237022U },
        {  697758115U, 840U, 1483639402U },
        { 1696507773U, 840U,  577415447U },
        { 2081979121U, 981U, 3041486449U },
        {  955646687U, 742U, 3846494357U },
        { 1250683506U, 749U,  836419859U },
        {  595003102U, 534U,  366794109U },
        {   47485338U, 558U, 3521120834U },
        {  619433479U, 610U, 3991783875U },
        {  704096520U, 518U, 4139493852U },
        { 1712224984U, 606U, 2393312003U },
        { 1318233152U, 922U, 3880361134U },
        {  855572992U, 761U, 1472974787U },
        {   64721421U, 703U,  683860550U },
        {  678931758U, 840U,  380616043U },
        {  692711973U, 778U, 1382361947U },
        {  677703619U, 530U, 2826914161U },
        {   92393223U, 586U, 1522128471U },
        { 1222592920U, 743U, 3466726667U },
        {  358288986U, 695U, 1091956998U },
        { 1935056945U, 958U,  514864477U },
        {  735675993U, 990U, 1294239989U },
        { 1560089402U, 897U, 2238551287U },
        {   70616361U, 829U,   22483098U },
        {  368234700U, 731U, 2913875084U },
        {   20221190U, 879U, 1564152970U },
        {  539444654U, 682U, 1835141259U },
        { 1314987297U, 840U, 1801114136U },
        { 2019295544U, 645U, 3286438930U },
        {  469023838U, 716U, 1637918202U },
        { 1843754496U, 653U, 2562092152U },
        {  400672036U, 809U, 4264212785U },
        {  404722249U, 965U, 2704116999U },
        {  600702209U, 758U,  584979986U },
        {  519953954U, 667U, 2574436237U },
        { 1658071126U, 694U, 2214569490U },
        {  420480037U, 749U, 3430010866U },
        {  690103647U, 969U, 3700758083U },
        { 1029424799U, 937U, 3787746841U },
        { 2012608669U, 506U, 3362628973U },
        { 1535432887U, 998U,   42610943U },
        { 1330635533U, 857U, 3040806504U },
        { 1223800550U, 539U, 3954229517U },
        { 1322411537U, 680U, 3223250324U },
        { 1877847898U, 945U, 2915147143U },
        { 1646356099U, 874U,  965988280U },
        {  805687536U, 744U, 4032277920U },
        { 1948093210U, 633U, 1346597684U },
        {  392609744U, 783U, 1636083295U },
        {  690241304U, 770U, 1201031298U },
        { 1360302965U, 696U, 1665394461U },
        { 1220090946U, 780U, 1316922812U },
        {  447092251U, 500U, 3438743375U },
        { 1613868791U, 592U,  828546883U },
        {  523430951U, 548U, 2552392304U },
        {  726692899U, 810U, 1656872867U },
        { 1364340021U, 836U, 3710513486U },
        { 1986257729U, 931U,  935013962U },
        {  407983964U, 921U,  728767059U },
};

/*
 * Try to obtain a 32-bit seed from the architecture: first via
 * arch_get_random_seed_int() and, only if that reports failure, via
 * arch_get_random_int().  The local starts out as 0 and is returned
 * whatever the outcome of the two calls.
 */
static u32 __extract_hwseed(void)
{
        unsigned int hw = 0;

        (void)(arch_get_random_seed_int(&hw) || arch_get_random_int(&hw));

        return hw;
}

/*
 * Derive all four state registers from a single 32-bit seed.
 *
 * Each register is the previous value (starting with @seed) multiplied by
 * 69069, optionally XORed with a fresh hardware seed when @mix_with_hwseed
 * is set, and passed through __seed() with that register's minimum
 * (2, 8, 16, 128).
 */
static void prandom_seed_early(struct rnd_state *state, u32 seed,
                               bool mix_with_hwseed)
{
        u32 chain;

#define LCG(x)         ((x) * 69069U)        /* super-duper LCG */
#define HWSEED() (mix_with_hwseed ? __extract_hwseed() : 0)
        chain = __seed(HWSEED() ^ LCG(seed), 2U);
        state->s1 = chain;

        chain = __seed(HWSEED() ^ LCG(chain), 8U);
        state->s2 = chain;

        chain = __seed(HWSEED() ^ LCG(chain), 16U);
        state->s3 = chain;

        chain = __seed(HWSEED() ^ LCG(chain), 128U);
        state->s4 = chain;
}

/*
 * Self test of the rnd_state generator, registered as a core initcall.
 * Two table-driven passes are run and the outcome of each is logged;
 * the function itself always returns 0.
 */
static int __init prandom_state_selftest(void)
{
        struct rnd_state state;
        int errors = 0, runs = 0;
        bool error = false;
        int i, j;

        /* Pass 1: the first output after seeding + warmup must match test1[]. */
        for (i = 0; i < ARRAY_SIZE(test1); i++) {
                prandom_seed_early(&state, test1[i].seed, false);
                prandom_warmup(&state);

                if (prandom_u32_state(&state) != test1[i].result)
                        error = true;
        }

        if (!error)
                pr_info("prandom: seed boundary self test passed\n");
        else
                pr_warn("prandom: seed boundary self test failed\n");

        /* Pass 2: the output at position test2[].iteration must match. */
        for (i = 0; i < ARRAY_SIZE(test2); i++) {
                prandom_seed_early(&state, test2[i].seed, false);
                prandom_warmup(&state);

                /* Discard the outputs that precede the one under test. */
                for (j = 0; j < test2[i].iteration - 1; j++)
                        prandom_u32_state(&state);

                if (prandom_u32_state(&state) != test2[i].result)
                        errors++;

                runs++;
                cond_resched();
        }

        if (!errors)
                pr_info("prandom: %d self tests passed\n", runs);
        else
                pr_warn("prandom: %d/%d self tests failed\n", errors, runs);

        return 0;
}
core_initcall(prandom_state_selftest);
#endif

/*
 * The prandom_u32() implementation is now completely separate from the
 * prandom_state() functions, which are retained (for now) for compatibility.
 *
 * Because of (ab)use in the networking code for choosing random TCP/UDP port
 * numbers, which open DoS possibilities if guessable, we want something
 * stronger than a standard PRNG.  But the performance requirements of
 * the network code do not allow robust crypto for this application.
 *
 * So this is a homebrew Junior Spaceman implementation, based on the
 * lowest-latency trustworthy crypto primitive available, SipHash.
 * (The authors of SipHash have not been consulted about this abuse of
 * their work.)
 *
 * Standard SipHash-2-4 uses 2n+4 rounds to hash n words of input to
 * one word of output.  This abbreviated version uses 2 rounds per word
 * of output.
 */

/*
 * Generator state: four machine words.  siprand_u32() loads them, runs
 * them through two PRND_SIPROUND() passes and stores them back.
 */
struct siprand_state {
        unsigned long v0;
        unsigned long v1;
        unsigned long v2;
        unsigned long v3;
};

/* One generator state per CPU, private to this file. */
static DEFINE_PER_CPU(struct siprand_state, net_rand_state) __latent_entropy;
/*
 * Per-CPU word that siprand_u32() XORs into the state on every call.
 * It is exported; nothing in this part of the file writes to it.
 */
DEFINE_PER_CPU(unsigned long, net_rand_noise);
EXPORT_PER_CPU_SYMBOL(net_rand_noise);

/*
 * This is the core CPRNG function.  As "pseudorandom", this is not used
 * for truly valuable things, just intended to be a PITA to guess.
 * For maximum speed, we do just two SipHash rounds per word.  This is
 * the same rate as 4 rounds per 64 bits that SipHash normally uses,
 * so hopefully it's reasonably secure.
 *
 * There are two changes from the official SipHash finalization:
 * - We omit some constants XORed with v2 in the SipHash spec as irrelevant;
 *   they are there only to make the output rounds distinct from the input
 *   rounds, and this application has no input rounds.
 * - Rather than returning v0^v1^v2^v3, return v1+v3.
 *   If you look at the SipHash round, the last operation on v3 is
 *   "v3 ^= v0", so "v0 ^ v3" just undoes that, a waste of time.
 *   Likewise "v1 ^= v2".  (The rotate of v2 makes a difference, but
 *   it still cancels out half of the bits in v2 for no benefit.)
 *   Second, since the last combining operation was xor, continue the
 *   pattern of alternating xor/add for a tiny bit of extra non-linearity.
 */
static inline u32 siprand_u32(struct siprand_state *s)
{
        unsigned long a = s->v0;
        unsigned long b = s->v1;
        unsigned long c = s->v2;
        unsigned long d = s->v3;
        unsigned long noise = raw_cpu_read(net_rand_noise);

        /* The noise word is folded in on both sides of the two rounds. */
        d ^= noise;
        PRND_SIPROUND(a, b, c, d);
        PRND_SIPROUND(a, b, c, d);
        a ^= noise;

        /* Store the advanced state, then derive the output from it. */
        s->v0 = a;
        s->v1 = b;
        s->v2 = c;
        s->v3 = d;

        return b + d;
}


/**
 *        prandom_u32 - pseudo random number generator
 *
 *        A 32 bit pseudo-random number is generated using a fast
 *        algorithm suitable for simulation. This algorithm is NOT
 *        considered safe for cryptographic use.
 */
u32 prandom_u32(void)
{
        struct siprand_state *state;
        u32 res;

        /* The get/put pair brackets the use of this CPU's generator state. */
        state = get_cpu_ptr(&net_rand_state);
        res = siprand_u32(state);
        put_cpu_ptr(&net_rand_state);

        return res;
}
EXPORT_SYMBOL(prandom_u32);

/**
 *        prandom_bytes - get the requested number of pseudo-random bytes
 *        @buf: where to copy the pseudo-random bytes to
 *        @bytes: the requested number of bytes
 */
void prandom_bytes(void *buf, size_t bytes)
{
        struct siprand_state *state = get_cpu_ptr(&net_rand_state);
        u8 *ptr = buf;

        while (bytes >= sizeof(u32)) {
                put_unaligned(siprand_u32(state), (u32 *)ptr);
                ptr += sizeof(u32);
                bytes -= sizeof(u32);
        }

        if (bytes > 0) {
                u32 rem = siprand_u32(state);

                do {
                        *ptr++ = (u8)rem;
                        rem >>= BITS_PER_BYTE;
                } while (--bytes > 0);
        }
        put_cpu_ptr(&net_rand_state);
}
EXPORT_SYMBOL(prandom_bytes);

/**
 *        prandom_seed - add entropy to pseudo random number generator
 *        @entropy: entropy value
 *
 *        Add some additional seed material to the prandom pool.
 *        The "entropy" is actually our IP address (the only caller is
 *        the network code), not for unpredictability, but to ensure that
 *        different machines are initialized differently.
 */
void prandom_seed(u32 entropy)
{
        int cpu;

        add_device_randomness(&entropy, sizeof(entropy));

        for_each_possible_cpu(cpu) {
                struct siprand_state *state = per_cpu_ptr(&net_rand_state, cpu);
                unsigned long a = state->v0;
                unsigned long b = state->v1;
                unsigned long c = state->v2;
                unsigned long d = state->v3;

                /* Mix @entropy in; go around again if any word came out zero. */
                for (;;) {
                        d ^= entropy;
                        PRND_SIPROUND(a, b, c, d);
                        PRND_SIPROUND(a, b, c, d);
                        a ^= entropy;

                        if (unlikely(!a || !b || !c || !d))
                                continue;
                        break;
                }

                WRITE_ONCE(state->v0, a);
                WRITE_ONCE(state->v1, b);
                WRITE_ONCE(state->v2, c);
                WRITE_ONCE(state->v3, d);
        }
}
EXPORT_SYMBOL(prandom_seed);

/*
 *        Generate some initially weak seeding values to allow
 *        the prandom_u32() engine to be started.
 */
static int __init prandom_init_early(void)
{
        unsigned long a, b, c, d;
        int cpu;

        /* Use arch-provided words when offered, else jiffies / random_get_entropy(). */
        if (!arch_get_random_long(&a))
                a = jiffies;
        if (!arch_get_random_long(&b))
                b = random_get_entropy();
        c = a ^ PRND_K0;
        d = b ^ PRND_K1;

        /*
         * The running values are carried from one CPU to the next; the CPU
         * number is mixed in so that every CPU gets a different state.
         */
        for_each_possible_cpu(cpu) {
                struct siprand_state *state = per_cpu_ptr(&net_rand_state, cpu);

                d ^= cpu;
                PRND_SIPROUND(a, b, c, d);
                PRND_SIPROUND(a, b, c, d);
                a ^= cpu;

                state->v0 = a;
                state->v1 = b;
                state->v2 = c;
                state->v3 = d;
        }

        return 0;
}
core_initcall(prandom_init_early);


/* Stronger reseeding when available, and periodically thereafter. */
static void prandom_reseed(struct timer_list *unused);

static DEFINE_TIMER(seed_timer, prandom_reseed);

static void prandom_reseed(struct timer_list *unused)
{
        unsigned long expires;
        int cpu;

        /*
         * Give every possible CPU a fresh state built from get_random_long().
         * The target CPUs are not locked against; a concurrent update only
         * makes the result somewhat more random.
         */
        for_each_possible_cpu(cpu) {
                struct siprand_state *state = per_cpu_ptr(&net_rand_state, cpu);
                unsigned long a, b, c, d;

                a = get_random_long();
                c = a ^ PRND_K0;
                b = get_random_long();
                d = b ^ PRND_K1;
#if BITS_PER_LONG == 32
                {
                        int round;

                        /*
                         * With 32-bit longs two more random words are hashed
                         * in, so that the key is not open to a trivial 64-bit
                         * brute force.
                         */
                        for (round = 0; round < 2; round++) {
                                unsigned long extra = get_random_long();

                                d ^= extra;
                                PRND_SIPROUND(a, b, c, d);
                                PRND_SIPROUND(a, b, c, d);
                                a ^= extra;
                        }
                }
#endif
                /*
                 * Never publish a zero word: a race with the target CPU
                 * writing back its own state could otherwise, in theory,
                 * produce the all-zero SipHash fixed point.
                 */
                if (!a)
                        a = -1ul;
                if (!b)
                        b = -1ul;
                if (!c)
                        c = -1ul;
                if (!d)
                        d = -1ul;

                WRITE_ONCE(state->v0, a);
                WRITE_ONCE(state->v1, b);
                WRITE_ONCE(state->v2, c);
                WRITE_ONCE(state->v3, d);
        }

        /* reseed every ~60 seconds, in [40 .. 80) interval with slack */
        expires = round_jiffies(jiffies + 40 * HZ + prandom_u32_max(40 * HZ));
        mod_timer(&seed_timer, expires);
}

/*
 * The random ready callback can be called from almost any interrupt.
 * To avoid worrying about whether it's safe to delay that interrupt
 * long enough to seed all CPUs, just schedule an immediate timer event.
 */
/*
 * Notifier callback (also called directly by prandom_init_late()).
 * @nb, @action and @data are all unused.  Always returns 0.
 */
static int prandom_timer_start(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        /* Arm seed_timer for the current jiffy so prandom_reseed() runs from the timer. */
        mod_timer(&seed_timer, jiffies);
        return 0;
}

#ifdef CONFIG_RANDOM32_SELFTEST
/* Principle: True 32-bit random numbers will all have 16 differing bits on
 * average. For each 32-bit number, there are 601M numbers differing by 16
 * bits, and 89% of the numbers differ by at least 12 bits. Note that more
 * than 16 differing bits also implies a correlation with inverted bits. Thus
 * we take 1024 random numbers and compare each of them to the other ones,
 * counting the deviation of correlated bits to 16. Constants report 32,
 * counters 32-log2(TEST_SIZE), and pure randoms, around 6 or lower. With the
 * u32 total, TEST_SIZE may be as large as 4096 samples.
 */
#define TEST_SIZE 1024
/*
 * Self test of prandom_u32(), registered as a core initcall.  TEST_SIZE
 * samples are drawn and every ordered pair is compared by the number of
 * differing bits.  The result is only logged; the function always returns 0
 * (also when the sample buffer cannot be allocated, in which case the test
 * is silently skipped).
 */
static int __init prandom32_state_selftest(void)
{
        unsigned int x, y, bits, samples;
        u32 xor, flip;
        u32 total;
        u32 *data;

        /* kmalloc_array() checks the count * size product for overflow. */
        data = kmalloc_array(TEST_SIZE, sizeof(*data), GFP_KERNEL);
        if (!data)
                return 0;

        for (samples = 0; samples < TEST_SIZE; samples++)
                data[samples] = prandom_u32();

        /*
         * flip collects every bit position that differed in at least one
         * pair; total sums the squared distance of each pair's differing
         * bit count from 16.
         */
        flip = total = 0;
        for (x = 0; x < samples; x++) {
                for (y = 0; y < samples; y++) {
                        if (x == y)
                                continue;
                        xor = data[x] ^ data[y];
                        flip |= xor;
                        bits = hweight32(xor);
                        total += (bits - 16) * (bits - 16);
                }
        }

        /* We'll return the average deviation as 2*sqrt(corr/samples), which
         * is also sqrt(4*corr/samples) which provides a better resolution.
         *
         * NOTE(review): as written the division is carried out before the
         * multiplication by 4, so the quotient is truncated first.  Left
         * unchanged here because it affects the pass/fail threshold.
         */
        bits = int_sqrt(total / (samples * (samples - 1)) * 4);
        if (bits > 6)
                pr_warn("prandom32: self test failed (at least %u bits"
                        " correlated, fixed_mask=%#x fixed_value=%#x)\n",
                        bits, ~flip, data[0] & ~flip);
        else
                pr_info("prandom32: self test passed (less than %u bits"
                        " correlated)\n",
                        bits+1);
        kfree(data);
        return 0;
}
core_initcall(prandom32_state_selftest);
#endif /*  CONFIG_RANDOM32_SELFTEST */

/*
 * Start periodic full reseeding as soon as strong
 * random numbers are available.
 */
static int __init prandom_init_late(void)
{
        static struct notifier_block random_ready = {
                .notifier_call = prandom_timer_start
        };
        int ret;

        ret = register_random_ready_notifier(&random_ready);
        if (ret != -EALREADY)
                return ret;

        /* -EALREADY: start the timer directly and report success. */
        prandom_timer_start(&random_ready, 0, NULL);
        return 0;
}
late_initcall(prandom_init_late);




















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions for diskquota-operations. When diskquota is configured these
 * macros expand to the right source-code.
 *
 * Author:  Marco van Wieringen <mvw@planets.elm.net>
 */
#ifndef _LINUX_QUOTAOPS_
#define _LINUX_QUOTAOPS_

#include <linux/fs.h>

#define DQUOT_SPACE_WARN        0x1
#define DQUOT_SPACE_RESERVE        0x2
#define DQUOT_SPACE_NOFAIL        0x4

/* Return the quota_info embedded in @sb (its s_dquot member). */
static inline struct quota_info *sb_dqopt(struct super_block *sb)
{
        struct quota_info *dqopt = &sb->s_dquot;

        return dqopt;
}

/*
 * Tell whether the attribute change @ia is one that matters for quota:
 * a size change, or a uid/gid change to a value different from the one
 * @inode currently has.  i_mutex must be held.
 */
static inline bool is_quota_modification(struct inode *inode, struct iattr *ia)
{
        if (ia->ia_valid & ATTR_SIZE)
                return true;

        if ((ia->ia_valid & ATTR_UID) && !uid_eq(ia->ia_uid, inode->i_uid))
                return true;

        return (ia->ia_valid & ATTR_GID) && !gid_eq(ia->ia_gid, inode->i_gid);
}

#if defined(CONFIG_QUOTA)

#define quota_error(sb, fmt, args...) \
        __quota_error((sb), __func__, fmt , ## args)

extern __printf(3, 4)
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...);

/*
 * declaration of quota_function calls in kernel.
 */
int dquot_initialize(struct inode *inode);
bool dquot_initialize_needed(struct inode *inode);
void dquot_drop(struct inode *inode);
struct dquot *dqget(struct super_block *sb, struct kqid qid);
/*
 * Take an additional reference on @dquot and return it.  A one-time
 * warning is raised if the reference count is currently zero or if
 * DQ_ACTIVE_B is not set.
 */
static inline struct dquot *dqgrab(struct dquot *dquot)
{
        /* Make sure someone else has active reference to dquot */
        const bool unreferenced = !atomic_read(&dquot->dq_count);
        const bool inactive = !test_bit(DQ_ACTIVE_B, &dquot->dq_flags);

        WARN_ON_ONCE(unreferenced);
        WARN_ON_ONCE(inactive);

        atomic_inc(&dquot->dq_count);

        return dquot;
}

/* A dquot is busy when DQ_MOD_B is set or its reference count is above zero. */
static inline bool dquot_is_busy(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags) ||
               atomic_read(&dquot->dq_count) > 0;
}

void dqput(struct dquot *dquot);
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv);
struct dquot *dquot_alloc(struct super_block *sb, int type);
void dquot_destroy(struct dquot *dquot);

int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags);
void __dquot_free_space(struct inode *inode, qsize_t number, int flags);

int dquot_alloc_inode(struct inode *inode);

int dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
void dquot_free_inode(struct inode *inode);
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);

int dquot_disable(struct super_block *sb, int type, unsigned int flags);
/*
 * Suspend quotas on remount RO: dquot_disable() called with
 * DQUOT_SUSPENDED as the flags argument.
 */
static inline int dquot_suspend(struct super_block *sb, int type)
{
        const unsigned int how = DQUOT_SUSPENDED;

        return dquot_disable(sb, type, how);
}
int dquot_resume(struct super_block *sb, int type);

int dquot_commit(struct dquot *dquot);
int dquot_acquire(struct dquot *dquot);
int dquot_release(struct dquot *dquot);
int dquot_commit_info(struct super_block *sb, int type);
int dquot_get_next_id(struct super_block *sb, struct kqid *qid);
int dquot_mark_dquot_dirty(struct dquot *dquot);

int dquot_file_open(struct inode *inode, struct file *file);

int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags);
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags);
int dquot_quota_on(struct super_block *sb, int type, int format_id,
        const struct path *path);
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
         int format_id, int type);
int dquot_quota_off(struct super_block *sb, int type);
int dquot_writeback_dquots(struct super_block *sb, int type);
int dquot_quota_sync(struct super_block *sb, int type);
int dquot_get_state(struct super_block *sb, struct qc_state *state);
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii);
int dquot_get_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
                struct qc_dqblk *di);
int dquot_set_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);

int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
int dquot_transfer(struct inode *inode, struct iattr *iattr);

static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->info + type;
}

/*
 * Functions for checking status of quota
 */

static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_USAGE_ENABLED, type);
}

static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
}

static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_SUSPENDED, type);
}

static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED);
}

/* Does kernel know about any quota information for given sb + type? */
static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
        /* Currently if anything is on, then quota usage is on as well */
        return sb_has_quota_usage_enabled(sb, type);
}

static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED);
}

static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
        return sb_has_quota_loaded(sb, type) &&
               !sb_has_quota_suspended(sb, type);
}

/*
 * Operations supported for diskquotas.
 */
extern const struct dquot_operations dquot_operations;
extern const struct quotactl_ops dquot_quotactl_sysfile_ops;

#else

/*
 * Variants used when CONFIG_QUOTA is not defined: no quota state exists,
 * so every query reports "no".  The return types match those of the
 * CONFIG_QUOTA variants earlier in this header (bool for the per-type
 * checks, unsigned for the "any type" checks), so both configurations
 * present the same signatures to callers.
 */
static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return false;
}

static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return false;
}

static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
        return false;
}

static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
        return 0;
}

/* Does kernel know about any quota information for given sb + type? */
static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
        return false;
}

static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
        return 0;
}

static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
        return false;
}

/*
 * Stubs for the case where CONFIG_QUOTA is not defined.  They do no work:
 * the int-returning ones return 0, dquot_initialize_needed() returns
 * false and the void ones have empty bodies.
 */
static inline int dquot_initialize(struct inode *inode)
{
        return 0;
}

static inline bool dquot_initialize_needed(struct inode *inode)
{
        return false;
}

static inline void dquot_drop(struct inode *inode)
{
}

static inline int dquot_alloc_inode(struct inode *inode)
{
        return 0;
}

static inline void dquot_free_inode(struct inode *inode)
{
}

static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
        return 0;
}

/*
 * Variants used when CONFIG_QUOTA is not defined.  Only the inode's byte
 * count is adjusted (inode_add_bytes() / inode_sub_bytes()).  A request
 * carrying DQUOT_SPACE_RESERVE changes nothing.
 */
static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return 0;

        inode_add_bytes(inode, number);
        return 0;
}

static inline void __dquot_free_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return;

        inode_sub_bytes(inode, number);
}

/* Adds @number bytes to @inode; always returns 0. */
static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        inode_add_bytes(inode, number);

        return 0;
}

/*
 * Subtracts @number bytes from @inode; always returns 0.
 *
 * NOTE(review): this variant returns int, while the prototype of
 * dquot_reclaim_space_nodirty() in the CONFIG_QUOTA part of this header
 * returns void.  Portable callers must not use the return value.
 */
static inline int dquot_reclaim_space_nodirty(struct inode *inode,
                                              qsize_t number)
{
        inode_sub_bytes(inode, number);

        return 0;
}

/*
 * Further stubs for the case where CONFIG_QUOTA is not defined: each
 * function ignores its arguments and returns 0.
 */
static inline int dquot_disable(struct super_block *sb, int type,
                unsigned int flags)
{
        return 0;
}

static inline int dquot_suspend(struct super_block *sb, int type)
{
        return 0;
}

static inline int dquot_resume(struct super_block *sb, int type)
{
        return 0;
}

/* In this configuration dquot_file_open is simply another name for generic_file_open. */
#define dquot_file_open                generic_file_open

static inline int dquot_writeback_dquots(struct super_block *sb, int type)
{
        return 0;
}

#endif /* CONFIG_QUOTA */

/*
 * __dquot_alloc_space() with DQUOT_SPACE_WARN; the inode is not marked
 * dirty here.  Returns whatever __dquot_alloc_space() returns.
 */
static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
        const int flags = DQUOT_SPACE_WARN;

        return __dquot_alloc_space(inode, nr, flags);
}

/*
 * __dquot_alloc_space() with DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL; its
 * return value is not looked at.  The inode is then marked dirty with
 * mark_inode_dirty_sync().
 */
static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
{
        const int flags = DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL;

        __dquot_alloc_space(inode, nr, flags);
        mark_inode_dirty_sync(inode);
}

/*
 * dquot_alloc_space_nodirty() followed, on success only, by
 * mark_inode_dirty().  A non-zero result is passed back unchanged.
 */
static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
{
        int ret = dquot_alloc_space_nodirty(inode, nr);

        if (ret)
                return ret;

        /*
         * Mark inode fully dirty. Since we are allocating blocks, inode
         * would become fully dirty soon anyway and it reportedly
         * reduces lock contention.
         */
        mark_inode_dirty(inode);
        return 0;
}

/*
 * Block-granular wrappers: @nr is shifted left by inode->i_blkbits and the
 * result is handed to the byte-based helper named in each body.
 */
static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space_nodirty(inode, bytes);
}

static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_alloc_space_nofail(inode, bytes);
}

static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space(inode, bytes);
}

/* Same conversion, but __dquot_alloc_space() is called with no flags set. */
static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes, 0);
}

/* As above, and on success the inode is marked dirty with mark_inode_dirty_sync(). */
static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
{
        int ret = dquot_prealloc_block_nodirty(inode, nr);

        if (ret)
                return ret;

        mark_inode_dirty_sync(inode);
        return 0;
}

/* __dquot_alloc_space() with DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE. */
static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes,
                                   DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE);
}

/*
 * dquot_claim_space_nodirty() for @nr blocks (converted to bytes with
 * inode->i_blkbits).  On success the inode is marked dirty with
 * mark_inode_dirty_sync(); a non-zero result is returned unchanged.
 */
static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;
        int ret = dquot_claim_space_nodirty(inode, bytes);

        if (ret)
                return ret;

        mark_inode_dirty_sync(inode);
        return 0;
}

/*
 * dquot_reclaim_space_nodirty() for @nr blocks, then the inode is marked
 * dirty with mark_inode_dirty_sync() unconditionally.
 */
static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_reclaim_space_nodirty(inode, bytes);
        mark_inode_dirty_sync(inode);
}

/* __dquot_free_space() with no flags; the inode is not marked dirty here. */
static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
{
        const int flags = 0;

        __dquot_free_space(inode, nr, flags);
}

/* dquot_free_space_nodirty() followed by mark_inode_dirty_sync(). */
static inline void dquot_free_space(struct inode *inode, qsize_t nr)
{
        dquot_free_space_nodirty(inode, nr);

        mark_inode_dirty_sync(inode);
}

/* Block-granular form: @nr is shifted left by inode->i_blkbits first. */
static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space_nodirty(inode, bytes);
}

static inline void dquot_free_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space(inode, bytes);
}

/* __dquot_free_space() with DQUOT_SPACE_RESERVE for @nr blocks. */
static inline void dquot_release_reservation_block(struct inode *inode,
                qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        __dquot_free_space(inode, bytes, DQUOT_SPACE_RESERVE);
}

unsigned int qtype_enforce_flag(int type);

#endif /* _LINUX_QUOTAOPS_ */




































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sysctl.h: General linux system control interface
 *
 * Begun 24 March 1995, Stephen Tweedie
 *
 ****************************************************************
 ****************************************************************
 **
 **  WARNING:
 **  The values in this file are exported to user space via 
 **  the sysctl() binary interface.  Do *NOT* change the
 **  numbering of any existing values here, and do not change
 **  any numbers within any one set of values.  If you have to
 **  redefine an existing interface, use a new number for it.
 **  The kernel will then return -ENOTDIR to any application using
 **  the old binary interface.
 **
 ****************************************************************
 ****************************************************************
 */
#ifndef _LINUX_SYSCTL_H
#define _LINUX_SYSCTL_H

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/uidgid.h>
#include <uapi/linux/sysctl.h>

/* For the /proc/sys support */
struct completion;
struct ctl_table;
struct nsproxy;
struct ctl_table_root;
struct ctl_table_header;
struct ctl_dir;

/*
 * Each macro below is the address of one element of sysctl_vals[], cast to
 * void *.  The array itself is only declared here; its contents are
 * presumably the values the macro names suggest -- confirm against the
 * definition.
 *
 * Keep the same order as in fs/proc/proc_sysctl.c
 */
#define SYSCTL_NEG_ONE                        ((void *)&sysctl_vals[0])
#define SYSCTL_ZERO                        ((void *)&sysctl_vals[1])
#define SYSCTL_ONE                        ((void *)&sysctl_vals[2])
#define SYSCTL_TWO                        ((void *)&sysctl_vals[3])
#define SYSCTL_FOUR                        ((void *)&sysctl_vals[4])
#define SYSCTL_ONE_HUNDRED                ((void *)&sysctl_vals[5])
#define SYSCTL_TWO_HUNDRED                ((void *)&sysctl_vals[6])
#define SYSCTL_ONE_THOUSAND                ((void *)&sysctl_vals[7])
#define SYSCTL_THREE_THOUSAND                ((void *)&sysctl_vals[8])
#define SYSCTL_INT_MAX                        ((void *)&sysctl_vals[9])

extern const int sysctl_vals[];

typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer,
                size_t *lenp, loff_t *ppos);

int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_dobool(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos);
int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos);
int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer,
                        size_t *lenp, loff_t *ppos);
int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *,
                size_t *, loff_t *);
int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_do_static_key(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos);

/*
 * Register a set of sysctl names by calling register_sysctl_table
 * with an initialised array of struct ctl_table's.  An entry with 
 * NULL procname terminates the table.  table->de will be
 * set up by the registration and need not be initialised in advance.
 *
 * sysctl names can be mirrored automatically under /proc/sys.  The
 * procname supplied controls /proc naming.
 *
 * The table's mode will be honoured for proc-fs access.
 *
 * Leaf nodes in the sysctl tree will be represented by a single file
 * under /proc; non-leaf nodes will be represented by directories.  A
 * null procname disables /proc mirroring at this node.
 *
 * The data and maxlen fields of the ctl_table
 * struct enable minimal validation of the values being written to be
 * performed, and the mode field allows minimal authentication.
 * 
 * There must be a proc_handler routine for any terminal nodes
 * mirrored under /proc/sys (non-terminals are handled by a built-in
 * directory handler).  Several default handlers are available to
 * cover common cases.
 */

/* Support for userspace poll() to watch for changes */
struct ctl_table_poll {
        atomic_t event;                /* value returned (as a pointer) by proc_sys_poll_event() */
        wait_queue_head_t wait;        /* wait queue head, set up by __CTL_TABLE_POLL_INITIALIZER() */
};

/* Read @poll->event and return the value converted to a pointer. */
static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
{
        unsigned long snapshot = atomic_read(&poll->event);

        return (void *)snapshot;
}

/*
 * Static initialiser for the struct ctl_table_poll variable called @name:
 * the event counter starts at zero and the wait queue head embedded in
 * @name is initialised in place.
 */
#define __CTL_TABLE_POLL_INITIALIZER(name) {                                \
        .event = ATOMIC_INIT(0),                                        \
        .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) }

/* Define a struct ctl_table_poll variable @name with the initialiser above. */
#define DEFINE_CTL_TABLE_POLL(name)                                        \
        struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)

/*
 * A sysctl table is an array of struct ctl_table.  Per the registration
 * notes earlier in this header, an entry with a NULL procname terminates
 * the array.
 */
struct ctl_table {
        const char *procname;                /* Text ID for /proc/sys, or zero */
        void *data;                     /* with maxlen: allows minimal validation of written values */
        int maxlen;
        umode_t mode;                   /* honoured for proc-fs access */
        struct ctl_table *child;        /* Deprecated */
        proc_handler *proc_handler;        /* Callback for text formatting */
        struct ctl_table_poll *poll;    /* poll() support, see struct ctl_table_poll */
        void *extra1;                   /* NOTE(review): meaning is handler-specific, not shown here */
        void *extra2;
} __randomize_layout;

/*
 * An rb-tree node paired with a pointer to a table header.
 * NOTE(review): presumably linked into ctl_dir.root - confirm against the
 * users of this structure.
 */
struct ctl_node {
        struct rb_node node;
        struct ctl_table_header *header;
};

/*
 * struct ctl_table_header is used to maintain dynamic lists of
 * struct ctl_table trees.
 *
 * The table pointer and the three counters share storage with an rcu_head
 * through the anonymous union, so only one of the two views is meaningful
 * at any given time.
 */
struct ctl_table_header {
        union {
                struct {
                        struct ctl_table *ctl_table;
                        int used;
                        int count;
                        int nreg;
                };
                struct rcu_head rcu;
        };
        struct completion *unregistering;
        struct ctl_table *ctl_table_arg;
        struct ctl_table_root *root;
        struct ctl_table_set *set;
        struct ctl_dir *parent;
        struct ctl_node *node;
        struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
};

/* A table header together with the root of an rb-tree. */
struct ctl_dir {
        /* Header must be at the start of ctl_dir */
        struct ctl_table_header header;
        struct rb_root root;
};

/*
 * A set of sysctl tables: an embedded directory plus an is_seen() callback
 * that is handed the set itself.  The callback is supplied through
 * setup_sysctl_set().
 */
struct ctl_table_set {
        int (*is_seen)(struct ctl_table_set *);
        struct ctl_dir dir;
};

/*
 * Root of a sysctl hierarchy: a default table set plus callbacks to look up
 * a set, report ownership (uid/gid out-parameters) and check permissions
 * for a given header/table pair.
 */
struct ctl_table_root {
        struct ctl_table_set default_set;
        struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
        void (*set_ownership)(struct ctl_table_header *head,
                              struct ctl_table *table,
                              kuid_t *uid, kgid_t *gid);
        int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
};

/*
 * struct ctl_path describes where in the hierarchy a table is added.
 * Each element carries one name; register_sysctl_paths() takes a pointer
 * to such elements.
 */
struct ctl_path {
        const char *procname;
};

#ifdef CONFIG_SYSCTL

void proc_sys_poll_notify(struct ctl_table_poll *poll);

/* Table-set lifetime: set one up with a root and an is_seen() callback, retire it later. */
extern void setup_sysctl_set(struct ctl_table_set *p,
        struct ctl_table_root *root,
        int (*is_seen)(struct ctl_table_set *));
extern void retire_sysctl_set(struct ctl_table_set *set);

/*
 * Registration entry points.  Each returns a struct ctl_table_header
 * pointer, which is the type unregister_sysctl_table() accepts.
 */
struct ctl_table_header *__register_sysctl_table(
        struct ctl_table_set *set,
        const char *path, struct ctl_table *table);
struct ctl_table_header *__register_sysctl_paths(
        struct ctl_table_set *set,
        const struct ctl_path *path, struct ctl_table *table);
struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table);
struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
                                                struct ctl_table *table);

void unregister_sysctl_table(struct ctl_table_header * table);

extern int sysctl_init(void);
extern void __register_sysctl_init(const char *path, struct ctl_table *table,
                                 const char *table_name);
/* Passes the stringified @table expression as the table_name argument. */
#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table)
void do_sysctl_args(void);

/* Integer variables defined elsewhere. */
extern int pwrsw_enabled;
extern int unaligned_enabled;
extern int unaligned_dump_stack;
extern int no_unaligned_warning;

/* sysctl tables defined elsewhere. */
extern struct ctl_table sysctl_mount_point[];
extern struct ctl_table random_table[];
extern struct ctl_table firmware_config_table[];
extern struct ctl_table epoll_table[];

#else /* CONFIG_SYSCTL */
/* Stub for builds without CONFIG_SYSCTL: registers nothing, returns NULL. */
static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
{
        return NULL;
}

/* Stub for builds without CONFIG_SYSCTL: ignores @path and @table, returns NULL. */
static inline struct ctl_table_header *register_sysctl_paths(
                        const struct ctl_path *path, struct ctl_table *table)
{
        return NULL;
}

/* Stub for builds without CONFIG_SYSCTL: ignores @path and @table, returns NULL. */
static inline struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
{
        return NULL;
}

/* Stub for builds without CONFIG_SYSCTL: nothing was registered, so nothing to undo. */
static inline void unregister_sysctl_table(struct ctl_table_header * table)
{
}

/* Stub for builds without CONFIG_SYSCTL: leaves @p untouched. */
static inline void setup_sysctl_set(struct ctl_table_set *p,
        struct ctl_table_root *root,
        int (*is_seen)(struct ctl_table_set *))
{
}

/* Stub for builds without CONFIG_SYSCTL: does nothing. */
static inline void do_sysctl_args(void)
{
}
#endif /* CONFIG_SYSCTL */

/*
 * Declared regardless of CONFIG_SYSCTL.  Takes the same five arguments as
 * the proc_do*() handlers declared earlier in this header.
 */
int sysctl_max_threads(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos);

#endif /* _LINUX_SYSCTL_H */



















































































































































































































    1 

    1 




















































































































    1 
    1 






























































































































































































































































































































































    1 







































































    1 
























































































































































































    1 







    1 


    1 

































































    1 




























































    1 




    1 






    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */
#define pr_fmt(fmt) "seccomp: " fmt

#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>

/*
 * Not exposed in headers: strictly internal use only.  The value is the
 * one immediately after SECCOMP_MODE_FILTER.
 */
#define SECCOMP_MODE_DEAD        (SECCOMP_MODE_FILTER + 1)

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/capability.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>

/*
 * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
 * wrong direction flag in the ioctl number. This macro is that original,
 * broken command number; the kernel has to keep supporting it until all
 * userspace stops using it.
 */
#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR        SECCOMP_IOR(2, __u64)

/*
 * Lifecycle of a struct seccomp_knotif; the transitions are described on
 * that structure's @state field.
 */
enum notify_state {
        SECCOMP_NOTIFY_INIT,        /* freshly created, not yet read by the handler */
        SECCOMP_NOTIFY_SENT,        /* the handler has read it off the FD */
        SECCOMP_NOTIFY_REPLIED,     /* the userspace handler has replied */
};

/* Kernel-side record of one pending user notification. */
struct seccomp_knotif {
        /* The task whose filter triggered the notification */
        struct task_struct *task;

        /* The "cookie" for this request; this is unique for this filter. */
        u64 id;

        /*
         * The seccomp data. This pointer is valid the entire time this
         * notification is active, since it comes from __seccomp_filter which
         * eclipses the entire lifecycle here.
         */
        const struct seccomp_data *data;

        /*
         * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
         * struct seccomp_knotif is created and starts out in INIT. Once the
         * handler reads the notification off of an FD, it transitions to SENT.
         * If a signal is received the state transitions back to INIT and
         * another message is sent. When the userspace handler replies, state
         * transitions to REPLIED.
         */
        enum notify_state state;

        /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
        int error;
        long val;
        u32 flags;

        /*
         * Signals when this has changed states, such as the listener
         * dying, a new seccomp addfd message, or changing to REPLIED
         */
        struct completion ready;

        /* NOTE(review): presumably the link into notification->notifications - confirm */
        struct list_head list;

        /* outstanding addfd requests */
        struct list_head addfd;
};

/**
 * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
 *
 * @file: A reference to the file to install in the other task
 * @fd: The fd number to install it at. If the fd number is -1, it means the
 *      installing process should allocate the fd as normal.
 * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
 *         is allowed.
 * @ret: The return value of the installing process. It is set to the fd num
 *       upon success (>= 0).
 * @completion: Indicates that the installing process has completed fd
 *              installation, or gone away (either due to successful
 *              reply, or signal)
 * @list: List linkage. NOTE(review): presumably chains this request onto
 *        seccomp_knotif.addfd - confirm.
 *
 */
struct seccomp_kaddfd {
        struct file *file;
        int fd;
        unsigned int flags;

        /* To only be set on reply */
        int ret;
        struct completion completion;
        struct list_head list;
};

/**
 * struct notification - container for seccomp userspace notifications
 *
 * Since most seccomp filters will not have notification listeners attached
 * and this structure is fairly large, the notification-specific state is
 * kept in this separate structure, which struct seccomp_filter only points
 * to.
 *
 * @request: A semaphore that users of this notification can wait on for
 *           changes. Actual reads and writes are still controlled with
 *           filter->notify_lock.
 * @next_id: The id of the next request.
 * @notifications: A list of struct seccomp_knotif elements.
 */
struct notification {
        struct semaphore request;
        u64 next_id;
        struct list_head notifications;
};

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @refs: Reference count to manage the object lifetime.
 *        A filter's reference count is incremented for each directly
 *        attached task, once for the dependent filter, and if
 *        requested for the user notifier. When @refs reaches zero,
 *        the filter can be freed.
 * @users: A filter's @users count is incremented for each directly
 *         attached task (filter installation, fork(), thread_sync),
 *         and once for the dependent filter (tracked in filter->prev).
 *         When it reaches zero it indicates that no direct or indirect
 *         users of that filter exist. No new tasks can get associated with
 *         this filter after reaching 0. The @users count is always smaller
 *         or equal to @refs. Hence, reaching 0 for @users does not mean
 *         the filter can be freed.
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 * @notif: the struct that holds all notification related information
 * @notify_lock: A lock for all notification-related accesses.
 * @wqh: A wait queue for poll if a notifier is in use.
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @refs).
 */
struct seccomp_filter {
        refcount_t refs;
        refcount_t users;
        bool log;
        struct seccomp_filter *prev;
        struct bpf_prog *prog;
        struct notification *notif;
        struct mutex notify_lock;
        wait_queue_head_t wqh;
};

/*
 * Limit any path through the tree to 256KB worth of instructions:
 * (1 << 18) bytes divided by the size of one struct sock_filter.
 */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/*
 * Fill @sd with the syscall number, architecture, six syscall arguments and
 * instruction pointer of the current task.
 *
 * No endianness handling is done here on purpose; BPF program authors have
 * to deal with it for their specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
        /*
         * "current" is fetched once and that one task pointer is reused by
         * every accessor below, including the register lookup.
         */
        struct task_struct *tsk = current;
        struct pt_regs *ptregs = task_pt_regs(tsk);
        unsigned long raw[6];
        int i;

        sd->nr = syscall_get_nr(tsk, ptregs);
        sd->arch = syscall_get_arch(tsk);

        syscall_get_arguments(tsk, ptregs, raw);
        for (i = 0; i < 6; i++)
                sd->args[i] = raw[i];

        sd->instruction_pointer = KSTK_EIP(tsk);
}

/**
 * seccomp_check_filter - verify and rewrite seccomp filter code
 * @filter: array of classic BPF instructions, modified in place
 * @flen: number of instructions in @filter
 *
 * Walks every instruction and sorts it into one of three groups:
 *
 *  - BPF_LD | BPF_W | BPF_ABS is rewritten to BPF_LDX | BPF_W | BPF_ABS and
 *    its offset must be 32-bit aligned and inside struct seccomp_data;
 *  - the two BPF_W | BPF_LEN loads become immediate loads of
 *    sizeof(struct seccomp_data);
 *  - an explicit list of return, ALU, load/store and jump opcodes is
 *    accepted unchanged.
 *
 * Any other opcode is rejected.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
        int idx;

        for (idx = 0; idx < flen; idx++) {
                struct sock_filter *insn = filter + idx;
                u16 opcode = insn->code;
                u32 operand = insn->k;

                switch (opcode) {
                case BPF_LD | BPF_W | BPF_ABS:
                        insn->code = BPF_LDX | BPF_W | BPF_ABS;
                        /* Offset has to be word aligned and in bounds. */
                        if ((operand & 3) ||
                            operand >= sizeof(struct seccomp_data))
                                return -EINVAL;
                        break;

                case BPF_LD | BPF_W | BPF_LEN:
                        insn->code = BPF_LD | BPF_IMM;
                        insn->k = sizeof(struct seccomp_data);
                        break;

                case BPF_LDX | BPF_W | BPF_LEN:
                        insn->code = BPF_LDX | BPF_IMM;
                        insn->k = sizeof(struct seccomp_data);
                        break;

                /* Opcodes that are allowed through untouched. */
                case BPF_RET | BPF_K:
                case BPF_RET | BPF_A:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                case BPF_MISC | BPF_TAX:
                case BPF_MISC | BPF_TXA:
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                case BPF_JMP | BPF_JA:
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        break;

                default:
                        return -EINVAL;
                }
        }

        return 0;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: seccomp data handed to every filter program
 * @match: set to the struct seccomp_filter whose result was selected; left
 *         untouched when no filter returns an action lower than
 *         SECCOMP_RET_ALLOW.
 *
 * Starts at current->seccomp.filter and follows the ->prev links, running
 * each program. The result with the lowest action value wins; the data
 * part of the return value is ignored for the comparison.
 *
 * Returns valid seccomp BPF response codes.
 */
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
                               struct seccomp_filter **match)
{
        /* Read once so a cross-thread sync cannot change it under us. */
        struct seccomp_filter *filter = READ_ONCE(current->seccomp.filter);
        u32 verdict = SECCOMP_RET_ALLOW;

        /* Never fail open: no filter at this point is a bug, so kill. */
        if (WARN_ON(!filter))
                return SECCOMP_RET_KILL_PROCESS;

        while (filter) {
                u32 result = bpf_prog_run_pin_on_cpu(filter->prog, sd);

                /* Keep whichever result carries the lowest action. */
                if (ACTION_ONLY(result) < ACTION_ONLY(verdict)) {
                        *match = filter;
                        verdict = result;
                }

                filter = filter->prev;
        }

        return verdict;
}
#endif /* CONFIG_SECCOMP_FILTER */

/*
 * Tell whether the current task may be put into @seccomp_mode: allowed when
 * no mode is set yet, or when the requested mode equals the one already set.
 * The caller must hold current's sighand->siglock.
 */
static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
        assert_spin_locked(&current->sighand->siglock);

        if (!current->seccomp.mode)
                return true;

        return current->seccomp.mode == seccomp_mode;
}

/* Weak default that does nothing; called from seccomp_assign_mode(). */
void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }

/*
 * Put @task into @seccomp_mode and set its TIF_SECCOMP thread flag.
 *
 * The caller must hold @task's sighand->siglock (asserted below).  Unless
 * @flags contains SECCOMP_FILTER_FLAG_SPEC_ALLOW, arch_seccomp_spec_mitigate()
 * is called for the task before the thread flag is set.
 */
static inline void seccomp_assign_mode(struct task_struct *task,
                                       unsigned long seccomp_mode,
                                       unsigned long flags)
{
        assert_spin_locked(&task->sighand->siglock);

        task->seccomp.mode = seccomp_mode;
        /*
         * Make sure TIF_SECCOMP cannot be set before the mode (and
         * filter) is set.
         */
        smp_mb__before_atomic();
        /* Assume default seccomp processes want spec flaw mitigation. */
        if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
                arch_seccomp_spec_mitigate(task);
        set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/*
 * Report whether @parent is reachable from @child by following ->prev links
 * (a filter counts as its own ancestor). Returns 1 if so, 0 otherwise.
 */
static int is_ancestor(struct seccomp_filter *parent,
                       struct seccomp_filter *child)
{
        struct seccomp_filter *walk = child;

        /* A NULL parent stands for the root, an ancestor of everything. */
        if (!parent)
                return 1;

        while (walk) {
                if (walk == parent)
                        return 1;
                walk = walk->prev;
        }

        return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * A thread other than the caller is eligible when seccomp is disabled for
 * it, or when it is in filter mode and its filter is an ancestor of the
 * caller's filter.
 *
 * Returns 0 on success, -ve on error, or the pid of the first thread found
 * that was either not in the correct seccomp mode or did not have an
 * ancestral seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
        struct task_struct *peer, *self;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        self = current;
        for_each_thread(self, peer) {
                pid_t vpid;

                /* The initiating thread itself never needs checking. */
                if (peer == self)
                        continue;

                /* No seccomp at all: nothing stands in the way. */
                if (peer->seccomp.mode == SECCOMP_MODE_DISABLED)
                        continue;

                /* Filter mode is fine if its filter is one of our ancestors. */
                if (peer->seccomp.mode == SECCOMP_MODE_FILTER &&
                    is_ancestor(peer->seccomp.filter, self->seccomp.filter))
                        continue;

                /* First offender found: report it by pid. */
                vpid = task_pid_vnr(peer);
                /* A pid that does not resolve is reported as -ESRCH. */
                if (WARN_ON(vpid == 0))
                        return -ESRCH;
                return vpid;
        }

        return 0;
}

/* Destroy @filter's BPF program and free the filter itself; NULL is a no-op. */
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
        if (!filter)
                return;

        bpf_prog_destroy(filter->prog);
        kfree(filter);
}

/*
 * Walk from @orig towards the root, dropping one ->users count per filter.
 * Each filter whose count reaches zero gets an EPOLLHUP wakeup on its wait
 * queue (if anyone is waiting); the walk stops at the first filter that
 * still has users.
 */
static void __seccomp_filter_orphan(struct seccomp_filter *orig)
{
        struct seccomp_filter *cur;

        for (cur = orig; cur; cur = cur->prev) {
                if (!refcount_dec_and_test(&cur->users))
                        break;

                if (waitqueue_active(&cur->wqh))
                        wake_up_poll(&cur->wqh, EPOLLHUP);
        }
}

/*
 * Drop one ->refs reference on @orig.  Whenever that was the last reference
 * the filter is freed and the same is repeated for its ->prev, so that
 * single-reference branches are cleaned up iteratively.
 */
static void __put_seccomp_filter(struct seccomp_filter *orig)
{
        while (orig) {
                struct seccomp_filter *parent;

                if (!refcount_dec_and_test(&orig->refs))
                        return;

                /* Read ->prev before the filter is freed. */
                parent = orig->prev;
                seccomp_filter_free(orig);
                orig = parent;
        }
}

/*
 * Give up one task's hold on the filter chain headed by @orig.  The ->users
 * counts are dropped first (which may wake pollers of filters that became
 * unused), and only then the ->refs counts (which may free filters).
 */
static void __seccomp_filter_release(struct seccomp_filter *orig)
{
        /* Notify about any unused filters in the task's former filter tree. */
        __seccomp_filter_orphan(orig);
        /* Finally drop all references to the task's former tree. */
        __put_seccomp_filter(orig);
}

/**
 * seccomp_filter_release - Detach the task from its filter tree,
 *                            drop its reference count, and notify
 *                            about unused filters
 * @tsk: task whose seccomp.filter pointer is cleared and released
 *
 * This function should only be called when the task is exiting as
 * it detaches it from its filter tree. As such, READ_ONCE() and
 * barriers are not needed here, as would normally be needed.
 */
void seccomp_filter_release(struct task_struct *tsk)
{
        /* Remember the chain head before the task's pointer is cleared. */
        struct seccomp_filter *orig = tsk->seccomp.filter;

        /* Detach task from its filter tree. */
        tsk->seccomp.filter = NULL;
        __seccomp_filter_release(orig);
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 * @flags: filter flags, forwarded to seccomp_assign_mode() for every
 *         thread that is moved out of SECCOMP_MODE_DISABLED
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 *
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
        struct task_struct *thread, *caller;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        /* Synchronize all threads. */
        caller = current;
        for_each_thread(caller, thread) {
                /* Skip current, since it needs no changes. */
                if (thread == caller)
                        continue;

                /* Get a task reference for the new leaf node. */
                get_seccomp_filter(caller);

                /*
                 * Drop the task reference to the shared ancestor since
                 * current's path will hold a reference.  (This also
                 * allows a put before the assignment.)
                 */
                __seccomp_filter_release(thread->seccomp.filter);

                /* Make our new filter tree visible. */
                smp_store_release(&thread->seccomp.filter,
                                  caller->seccomp.filter);
                /* Mirror the caller's filter count onto the thread. */
                atomic_set(&thread->seccomp.filter_count,
                           atomic_read(&caller->seccomp.filter_count));

                /*
                 * Don't let an unprivileged task work around
                 * the no_new_privs restriction by creating
                 * a thread that sets it up, enters seccomp,
                 * then dies.
                 */
                if (task_no_new_privs(caller))
                        task_set_no_new_privs(thread);

                /*
                 * Opt the other thread into seccomp if needed.
                 * As threads are considered to be trust-realm
                 * equivalent (see ptrace_may_access), it is safe to
                 * allow one thread to transition the other.
                 */
                if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
                        seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
                                            flags);
        }
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
        const bool keep_orig_prog = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
        struct seccomp_filter *new_filter;
        int err;

        /* Empty and oversized programs are rejected outright. */
        if (fprog->len == 0)
                return ERR_PTR(-EINVAL);
        if (fprog->len > BPF_MAXINSNS)
                return ERR_PTR(-EINVAL);

        BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

        /*
         * Installing a seccomp filter requires that the task has
         * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
         * This avoids scenarios where unprivileged tasks can affect the
         * behavior of privileged children.
         */
        if (!task_no_new_privs(current)) {
                if (!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);
        }

        /* Zeroed allocation for the new filter node. */
        new_filter = kzalloc(sizeof(*new_filter), GFP_KERNEL | __GFP_NOWARN);
        if (!new_filter)
                return ERR_PTR(-ENOMEM);

        mutex_init(&new_filter->notify_lock);

        /* Copy in and check the user program; undo the allocation on failure. */
        err = bpf_prog_create_from_user(&new_filter->prog, fprog,
                                        seccomp_check_filter, keep_orig_prog);
        if (err < 0) {
                kfree(new_filter);
                return ERR_PTR(err);
        }

        /* The returned filter starts with one reference and one user. */
        refcount_set(&new_filter->refs, 1);
        refcount_set(&new_filter->users, 1);
        init_waitqueue_head(&new_filter->wqh);

        return new_filter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success, or an ERR_PTR on failure
 * (-EFAULT when the sock_fprog cannot be copied from userspace, otherwise
 * whatever seccomp_prepare_filter() reports).
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
        struct sock_fprog fprog;

#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sock_fprog fprog32;

                /* Compat callers pass the 32-bit layout; widen it by hand. */
                if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
                        return ERR_PTR(-EFAULT);
                fprog.len = fprog32.len;
                fprog.filter = compat_ptr(fprog32.filter);
        } else /* the native copy below is the body of this else */
#endif
        if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
                return ERR_PTR(-EFAULT);

        return seccomp_prepare_filter(&fprog);
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock lock.
 *
 * Returns 0 on success, -ve on error, or
 *   - in TSYNC mode: the pid of a thread which was either not in the correct
 *     seccomp mode or did not have an ancestral seccomp filter
 *   - in NEW_LISTENER mode: the fd of the new listener
 */
static long seccomp_attach_filter(unsigned int flags,
                                  struct seccomp_filter *filter)
{
        struct seccomp_filter *ancestor;
        unsigned long insn_total;

        assert_spin_locked(&current->sighand->siglock);

        /*
         * Bound the length of the whole path: the new program plus every
         * already-attached program, each of the latter charged a 4
         * instruction penalty.
         */
        insn_total = filter->prog->len;
        ancestor = current->seccomp.filter;
        while (ancestor) {
                insn_total += ancestor->prog->len + 4;
                ancestor = ancestor->prev;
        }
        if (insn_total > MAX_INSNS_PER_PATH)
                return -ENOMEM;

        /* With TSYNC, refuse up front if some thread cannot be synced. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
                int ret = seccomp_can_sync_threads();

                if (ret)
                        return (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) ?
                                -ESRCH : ret;
        }

        /* Honour the log flag on the new filter. */
        if (flags & SECCOMP_FILTER_FLAG_LOG)
                filter->log = true;

        /*
         * Chain the new filter in front of any existing one; the existing
         * filter keeps its task reference and becomes ->prev.
         */
        filter->prev = current->seccomp.filter;
        current->seccomp.filter = filter;
        atomic_inc(&current->seccomp.filter_count);

        /* The new filter is in place; propagate it to the other threads. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
                seccomp_sync_threads(flags);

        return 0;
}

/*
 * Take one additional ->refs reference on @filter.  @filter is dereferenced
 * unconditionally, so it must not be NULL.
 */
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
        refcount_inc(&filter->refs);
}

/*
 * get_seccomp_filter - take a task reference on the filter attached to @tsk
 *
 * Bumps both the ->refs and the ->users count of @tsk's current filter.
 * Does nothing when @tsk has no filter.
 */
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *filter = tsk->seccomp.filter;

        if (filter) {
                __get_seccomp_filter(filter);
                refcount_inc(&filter->users);
        }
}

/*
 * Fill @info with a SIGSYS / SYS_SECCOMP record describing @syscall, with
 * @reason stored in si_errno.
 */
static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
{
        /* Start from a cleared siginfo before any field is set. */
        clear_siginfo(info);

        /* Generic signal identification. */
        info->si_signo = SIGSYS;
        info->si_code = SYS_SECCOMP;
        info->si_errno = reason;

        /* Details of the offending syscall, taken from the current task. */
        info->si_syscall = syscall;
        info->si_arch = syscall_get_arch(current);
        info->si_call_addr = (void __user *)KSTK_EIP(current);
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
        struct kernel_siginfo info;
        seccomp_init_siginfo(&info, syscall, reason);
        force_sig_info(&info);
}
#endif        /* CONFIG_SECCOMP_FILTER */

/*
 * For use with seccomp_actions_logged: one bit per seccomp action whose
 * logging can be switched on or off.
 */
#define SECCOMP_LOG_KILL_PROCESS        (1 << 0)
#define SECCOMP_LOG_KILL_THREAD                (1 << 1)
#define SECCOMP_LOG_TRAP                (1 << 2)
#define SECCOMP_LOG_ERRNO                (1 << 3)
#define SECCOMP_LOG_TRACE                (1 << 4)
#define SECCOMP_LOG_LOG                        (1 << 5)
#define SECCOMP_LOG_ALLOW                (1 << 6)
#define SECCOMP_LOG_USER_NOTIF                (1 << 7)

/*
 * Mask of actions that may currently be logged.  The initial value enables
 * every bit above except SECCOMP_LOG_ALLOW.
 */
static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
                                    SECCOMP_LOG_KILL_THREAD  |
                                    SECCOMP_LOG_TRAP  |
                                    SECCOMP_LOG_ERRNO |
                                    SECCOMP_LOG_USER_NOTIF |
                                    SECCOMP_LOG_TRACE |
                                    SECCOMP_LOG_LOG;

/*
 * Emit an audit record for @action on @syscall if logging of that action is
 * enabled in seccomp_actions_logged.
 *
 * RET_KILL_* (and any unknown action, which is treated as KILL_PROCESS) and
 * RET_LOG are logged whenever their bit is set.  RET_TRAP, RET_ERRNO,
 * RET_TRACE and RET_USER_NOTIF are additionally gated on @requested.
 * RET_ALLOW is never logged.  The admin has the ability to silence any
 * action from being logged by removing the action name from the
 * seccomp_actions_logged sysctl.
 */
static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
                               bool requested)
{
        bool needs_request = false;
        u32 log_bit;

        switch (action) {
        case SECCOMP_RET_ALLOW:
                return;
        case SECCOMP_RET_TRAP:
                log_bit = SECCOMP_LOG_TRAP;
                needs_request = true;
                break;
        case SECCOMP_RET_ERRNO:
                log_bit = SECCOMP_LOG_ERRNO;
                needs_request = true;
                break;
        case SECCOMP_RET_TRACE:
                log_bit = SECCOMP_LOG_TRACE;
                needs_request = true;
                break;
        case SECCOMP_RET_USER_NOTIF:
                log_bit = SECCOMP_LOG_USER_NOTIF;
                needs_request = true;
                break;
        case SECCOMP_RET_LOG:
                log_bit = SECCOMP_LOG_LOG;
                break;
        case SECCOMP_RET_KILL_THREAD:
                log_bit = SECCOMP_LOG_KILL_THREAD;
                break;
        case SECCOMP_RET_KILL_PROCESS:
        default:
                log_bit = SECCOMP_LOG_KILL_PROCESS;
                break;
        }

        /* Actions that need an explicit request stay silent without one. */
        if (needs_request && !requested)
                return;

        /* The action's bit must be enabled in the global mask. */
        if (!(seccomp_actions_logged & log_bit))
                return;

        audit_seccomp(syscall, signr, action);
}

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 *
 * The table is terminated by -1, which is what the strict-mode lookup
 * loop stops on.
 */
static const int mode1_syscalls[] = {
        __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
        -1, /* negative terminated */
};

/*
 * Strict-mode check for @this_syscall.  Returns normally when the syscall is
 * found in the allowed table (the compat table for compat syscalls when
 * CONFIG_COMPAT is set).  Otherwise the task's seccomp mode is set to
 * SECCOMP_MODE_DEAD, the event is logged and do_exit(SIGKILL) is called.
 */
static void __secure_computing_strict(int this_syscall)
{
        const int *allowed_syscalls = mode1_syscalls;
#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                allowed_syscalls = get_compat_mode1_syscalls();
#endif
        /*
         * The first entry is compared before the terminator is looked at;
         * later entries are compared only while they are not -1.
         */
        do {
                if (*allowed_syscalls == this_syscall)
                        return;
        } while (*++allowed_syscalls != -1);

#ifdef SECCOMP_DEBUG
        dump_stack();
#endif
        /* Mark the task dead before logging and exiting. */
        current->seccomp.mode = SECCOMP_MODE_DEAD;
        seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
        do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
/*
 * Entry point used when the architecture has no seccomp filter support:
 * only the disabled and strict modes are valid here, anything else is a BUG.
 */
void secure_computing_strict(int this_syscall)
{
        int mode = current->seccomp.mode;

        /*
         * Skip all checks while PT_SUSPEND_SECCOMP is set in
         * current->ptrace (only honoured with CONFIG_CHECKPOINT_RESTORE).
         */
        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return;

        switch (mode) {
        case SECCOMP_MODE_DISABLED:
                return;
        case SECCOMP_MODE_STRICT:
                __secure_computing_strict(this_syscall);
                return;
        default:
                BUG();
        }
}
#else

#ifdef CONFIG_SECCOMP_FILTER
/*
 * Hand out the next notification id for @filter: returns the current value
 * of filter->notif->next_id and post-increments it.  Must be called with
 * filter->notify_lock held.
 */
static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
{
        /*
         * Note: overflow is ok here, the id just needs to be unique per
         * filter.
         */
        lockdep_assert_held(&filter->notify_lock);
        return filter->notif->next_id++;
}

/*
 * Process one queued addfd request: unlink it, install the file via
 * receive_fd_replace() and store the result in addfd->ret, then signal
 * addfd->completion so the requester can pick the result up.
 */
static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
{
        /*
         * Remove the notification, and reset the list pointers, indicating
         * that it has been handled.
         */
        list_del_init(&addfd->list);
        addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
        complete(&addfd->completion);
}

/*
 * Queue a notification on @match's listener and sleep until a reply has been
 * recorded or the wait is interrupted.
 *
 * @this_syscall: syscall number (not used by the body itself)
 * @match: filter whose ->notif queue receives the notification
 * @sd: syscall data exposed to the listener through the notification
 *
 * Returns 0 when the reply carries SECCOMP_USER_NOTIF_FLAG_CONTINUE, i.e.
 * the syscall should be executed.  Otherwise the task's syscall return value
 * is set (from the reply, to -ENOSYS when @match has no listener, or to the
 * error of the interrupted wait) and -1 is returned.
 */
static int seccomp_do_user_notification(int this_syscall,
                                        struct seccomp_filter *match,
                                        const struct seccomp_data *sd)
{
        int err;
        u32 flags = 0;
        long ret = 0;
        struct seccomp_knotif n = {};
        struct seccomp_kaddfd *addfd, *tmp;

        mutex_lock(&match->notify_lock);
        /* Without a listener there is nobody to ask. */
        err = -ENOSYS;
        if (!match->notif)
                goto out;

        n.task = current;
        n.state = SECCOMP_NOTIFY_INIT;
        n.data = sd;
        n.id = seccomp_next_notify_id(match);
        init_completion(&n.ready);
        list_add(&n.list, &match->notif->notifications);
        INIT_LIST_HEAD(&n.addfd);

        /* Announce the new notification to the listener side. */
        up(&match->notif->request);
        wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);

        /*
         * This is where we wait for a reply from userspace.
         */
        do {
                /* The lock is dropped only for the duration of the sleep. */
                mutex_unlock(&match->notify_lock);
                err = wait_for_completion_interruptible(&n.ready);
                mutex_lock(&match->notify_lock);
                if (err != 0)
                        goto interrupted;

                addfd = list_first_entry_or_null(&n.addfd,
                                                 struct seccomp_kaddfd, list);
                /* Check if we were woken up by a addfd message */
                if (addfd)
                        seccomp_handle_addfd(addfd);

        }  while (n.state != SECCOMP_NOTIFY_REPLIED);

        ret = n.val;
        err = n.error;
        flags = n.flags;

interrupted:
        /* If there were any pending addfd calls, clear them out */
        list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
                /* The process went away before we got a chance to handle it */
                addfd->ret = -ESRCH;
                list_del_init(&addfd->list);
                complete(&addfd->completion);
        }

        /*
         * Note that it's possible the listener died in between the time when
         * we were notified of a response (or a signal) and when we were able to
         * re-acquire the lock, so only delete from the list if the
         * notification actually exists.
         *
         * Also note that this test is only valid because there's no way to
         * *reattach* to a notifier right now. If one is added, we'll need to
         * keep track of the notif itself and make sure they match here.
         */
        if (match->notif)
                list_del(&n.list);
out:
        mutex_unlock(&match->notify_lock);

        /* Userspace requests to continue the syscall. */
        if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
                return 0;

        syscall_set_return_value(current, current_pt_regs(),
                                 err, ret);
        return -1;
}

/*
 * Run the task's filters on the current syscall and carry out the resulting
 * action.
 *
 * @this_syscall: syscall number being checked
 * @sd: pre-populated syscall data, or NULL to have it populated here
 * @recheck_after_trace: true on the recursive re-check that follows a tracer
 *                       event; a SECCOMP_RET_TRACE result is then allowed
 *                       without notifying the tracer again
 *
 * Returns 0 if the syscall may proceed and -1 if it must be skipped.  The
 * kill actions exit the thread or the thread group and do not return.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        u32 filter_ret, action;
        struct seccomp_filter *match = NULL;
        int data;
        struct seccomp_data sd_local;

        /*
         * Make sure that any changes to mode from another thread have
         * been seen after TIF_SECCOMP was seen.
         */
        rmb();

        if (!sd) {
                populate_seccomp_data(&sd_local);
                sd = &sd_local;
        }

        /* Split the filter verdict into its action and its data part. */
        filter_ret = seccomp_run_filters(sd, &match);
        data = filter_ret & SECCOMP_RET_DATA;
        action = filter_ret & SECCOMP_RET_ACTION_FULL;

        switch (action) {
        case SECCOMP_RET_ERRNO:
                /* Set low-order bits as an errno, capped at MAX_ERRNO. */
                if (data > MAX_ERRNO)
                        data = MAX_ERRNO;
                syscall_set_return_value(current, current_pt_regs(),
                                         -data, 0);
                goto skip;

        case SECCOMP_RET_TRAP:
                /* Show the handler the original registers. */
                syscall_rollback(current, current_pt_regs());
                /* Let the filter pass back 16 bits of data. */
                seccomp_send_sigsys(this_syscall, data);
                goto skip;

        case SECCOMP_RET_TRACE:
                /* We've been put in this state by the ptracer already. */
                if (recheck_after_trace)
                        return 0;

                /* ENOSYS these calls if there is no tracer attached. */
                if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
                        syscall_set_return_value(current,
                                                 current_pt_regs(),
                                                 -ENOSYS, 0);
                        goto skip;
                }

                /* Allow the BPF to provide the event message */
                ptrace_event(PTRACE_EVENT_SECCOMP, data);
                /*
                 * The delivery of a fatal signal during event
                 * notification may silently skip tracer notification,
                 * which could leave us with a potentially unmodified
                 * syscall that the tracer would have liked to have
                 * changed. Since the process is about to die, we just
                 * force the syscall to be skipped and let the signal
                 * kill the process and correctly handle any tracer exit
                 * notifications.
                 */
                if (fatal_signal_pending(current))
                        goto skip;
                /* Check if the tracer forced the syscall to be skipped. */
                this_syscall = syscall_get_nr(current, current_pt_regs());
                if (this_syscall < 0)
                        goto skip;

                /*
                 * Recheck the syscall, since it may have changed. This
                 * intentionally uses a NULL struct seccomp_data to force
                 * a reload of all registers. This does not goto skip since
                 * a skip would have already been reported.
                 */
                if (__seccomp_filter(this_syscall, NULL, true))
                        return -1;

                return 0;

        case SECCOMP_RET_USER_NOTIF:
                /* Non-zero means the listener did not ask to continue. */
                if (seccomp_do_user_notification(this_syscall, match, sd))
                        goto skip;

                return 0;

        case SECCOMP_RET_LOG:
                seccomp_log(this_syscall, 0, action, true);
                return 0;

        case SECCOMP_RET_ALLOW:
                /*
                 * Note that the "match" filter will always be NULL for
                 * this action since SECCOMP_RET_ALLOW is the starting
                 * state in seccomp_run_filters().
                 */
                return 0;

        case SECCOMP_RET_KILL_THREAD:
        case SECCOMP_RET_KILL_PROCESS:
        default:
                /* Unknown actions are handled like SECCOMP_RET_KILL_PROCESS. */
                current->seccomp.mode = SECCOMP_MODE_DEAD;
                seccomp_log(this_syscall, SIGSYS, action, true);
                /* Dump core only if this is the last remaining thread. */
                if (action != SECCOMP_RET_KILL_THREAD ||
                    get_nr_threads(current) == 1) {
                        kernel_siginfo_t info;

                        /* Show the original registers in the dump. */
                        syscall_rollback(current, current_pt_regs());
                        /* Trigger a manual coredump since do_exit skips it. */
                        seccomp_init_siginfo(&info, this_syscall, data);
                        do_coredump(&info);
                }
                if (action == SECCOMP_RET_KILL_THREAD)
                        do_exit(SIGSYS);
                else
                        do_group_exit(SIGSYS);
        }

        unreachable();

skip:
        /* Skipped syscalls are logged only if the matching filter asks for it. */
        seccomp_log(this_syscall, 0, action, match ? match->log : false);
        return -1;
}
#else
/*
 * Stub used when CONFIG_SECCOMP_FILTER is not set: filter mode cannot be
 * entered in that configuration, so reaching this function is a BUG.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        BUG();

        return -1;
}
#endif

/*
 * Dispatch the current syscall to the handler for the task's seccomp mode.
 *
 * @sd: pre-populated syscall data, or NULL; when NULL the syscall number is
 *      read from the task's registers instead of sd->nr
 *
 * Returns 0 when the syscall may proceed (including when seccomp is
 * suspended via PT_SUSPEND_SECCOMP under CONFIG_CHECKPOINT_RESTORE) and the
 * result of __seccomp_filter() in filter mode.  A task found in
 * SECCOMP_MODE_DEAD triggers a one-time warning and exits with SIGKILL; any
 * other mode is a BUG.
 */
int __secure_computing(const struct seccomp_data *sd)
{
        int mode = current->seccomp.mode;
        int this_syscall;

        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return 0;

        this_syscall = sd ? sd->nr :
                syscall_get_nr(current, current_pt_regs());

        switch (mode) {
        case SECCOMP_MODE_STRICT:
                __secure_computing_strict(this_syscall);  /* may call do_exit */
                return 0;
        case SECCOMP_MODE_FILTER:
                return __seccomp_filter(this_syscall, sd, false);
        /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
        case SECCOMP_MODE_DEAD:
                WARN_ON_ONCE(1);
                do_exit(SIGKILL);
                return -1;
        default:
                BUG();
        }
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

/* Returns the current task's seccomp mode (current->seccomp.mode). */
long prctl_get_seccomp(void)
{
        return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
        const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
        long ret = -EINVAL;

        /* The mode check and the mode change happen under siglock. */
        spin_lock_irq(&current->sighand->siglock);

        if (seccomp_may_assign_mode(seccomp_mode)) {
#ifdef TIF_NOTSC
                disable_TSC();
#endif
                seccomp_assign_mode(current, seccomp_mode, 0);
                ret = 0;
        }

        spin_unlock_irq(&current->sighand->siglock);

        return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
/*
 * Free @filter's notification state and clear the pointer, so that later
 * "if (filter->notif)" checks see that no listener is attached.
 */
static void seccomp_notify_free(struct seccomp_filter *filter)
{
        kfree(filter->notif);
        filter->notif = NULL;
}

/*
 * Detach the listener from @filter: every notification that has not been
 * replied to yet is completed with -ENOSYS, then the notification state is
 * freed.  A NULL @filter is ignored.
 */
static void seccomp_notify_detach(struct seccomp_filter *filter)
{
        struct seccomp_knotif *pending;

        if (!filter)
                return;

        mutex_lock(&filter->notify_lock);

        /*
         * If this file is being closed because e.g. the task who owned it
         * died, let's wake everyone up who was waiting on us.
         */
        list_for_each_entry(pending, &filter->notif->notifications, list) {
                if (pending->state != SECCOMP_NOTIFY_REPLIED) {
                        pending->state = SECCOMP_NOTIFY_REPLIED;
                        pending->error = -ENOSYS;
                        pending->val = 0;

                        /*
                         * We do not need to wake up any pending addfd
                         * messages, as the notifier will do that for us,
                         * as this just looks like a standard reply.
                         */
                        complete(&pending->ready);
                }
        }

        seccomp_notify_free(filter);
        mutex_unlock(&filter->notify_lock);
}

/*
 * Release handler for the notification file: detaches the listener from the
 * filter stored in file->private_data and drops the reference on that
 * filter.  @inode is unused.  Always returns 0.
 */
static int seccomp_notify_release(struct inode *inode, struct file *file)
{
        struct seccomp_filter *filter = file->private_data;

        seccomp_notify_detach(filter);
        __put_seccomp_filter(filter);
        return 0;
}

/*
 * Look up the pending notification with the given @id on @filter.
 * Returns the matching entry or NULL.  Must be called with
 * filter->notify_lock held.
 */
static inline struct seccomp_knotif *
find_notification(struct seccomp_filter *filter, u64 id)
{
        struct seccomp_knotif *entry;
        struct seccomp_knotif *found = NULL;

        lockdep_assert_held(&filter->notify_lock);

        list_for_each_entry(entry, &filter->notif->notifications, list) {
                if (entry->id == id) {
                        found = entry;
                        break;
                }
        }

        return found;
}


/*
 * Hand the oldest not-yet-delivered notification of @filter to userspace.
 *
 * @filter: filter owning the notification queue
 * @buf: user buffer for a struct seccomp_notif; it has to pass the
 *       check_zeroed_user() test on entry
 *
 * Returns 0 on success, -EINVAL if @buf fails the zero check, -ENOENT if no
 * notification in INIT state is queued, -EFAULT if the copy to userspace
 * fails, or the error from check_zeroed_user() / down_interruptible().
 */
static long seccomp_notify_recv(struct seccomp_filter *filter,
                                void __user *buf)
{
        struct seccomp_knotif *knotif = NULL, *cur;
        struct seccomp_notif unotif;
        ssize_t ret;

        /* Verify that we're not given garbage to keep struct extensible. */
        ret = check_zeroed_user(buf, sizeof(unotif));
        if (ret < 0)
                return ret;
        if (!ret)
                return -EINVAL;

        memset(&unotif, 0, sizeof(unotif));

        /* Sleep until a notification has been announced. */
        ret = down_interruptible(&filter->notif->request);
        if (ret < 0)
                return ret;

        mutex_lock(&filter->notify_lock);
        list_for_each_entry(cur, &filter->notif->notifications, list) {
                if (cur->state == SECCOMP_NOTIFY_INIT) {
                        knotif = cur;
                        break;
                }
        }

        /*
         * If we didn't find a notification, it could be that the task was
         * interrupted by a fatal signal between the time we were woken and
         * when we were able to acquire the notify_lock.
         */
        if (!knotif) {
                ret = -ENOENT;
                goto out;
        }

        unotif.id = knotif->id;
        unotif.pid = task_pid_vnr(knotif->task);
        unotif.data = *(knotif->data);

        knotif->state = SECCOMP_NOTIFY_SENT;
        wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
        ret = 0;
out:
        mutex_unlock(&filter->notify_lock);

        /* The copy to userspace is done without holding the lock. */
        if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
                ret = -EFAULT;

                /*
                 * Userspace screwed up. To make sure that we keep this
                 * notification alive, let's reset it back to INIT. It
                 * may have died when we released the lock, so we need to make
                 * sure it's still around.
                 */
                mutex_lock(&filter->notify_lock);
                knotif = find_notification(filter, unotif.id);
                if (knotif) {
                        knotif->state = SECCOMP_NOTIFY_INIT;
                        up(&filter->notif->request);
                }
                mutex_unlock(&filter->notify_lock);
        }

        return ret;
}

/*
 * Record the listener's reply for one notification and wake the waiting task.
 *
 * @filter: filter owning the notification queue
 * @buf: user pointer to a struct seccomp_notif_resp
 *
 * Returns 0 on success, -EFAULT if the response cannot be copied in,
 * -EINVAL for unknown flags or for a CONTINUE reply that also carries an
 * error or value, -ENOENT if the id is unknown, -EINPROGRESS if the
 * notification is not in SENT state, or the error from taking the lock.
 */
static long seccomp_notify_send(struct seccomp_filter *filter,
                                void __user *buf)
{
        struct seccomp_notif_resp resp = {};
        struct seccomp_knotif *target;
        long ret;

        if (copy_from_user(&resp, buf, sizeof(resp)))
                return -EFAULT;

        /* Only the CONTINUE flag is understood. */
        if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
                return -EINVAL;

        /* A CONTINUE reply must not carry an error or a return value. */
        if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
            (resp.error || resp.val))
                return -EINVAL;

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                return ret;

        target = find_notification(filter, resp.id);
        if (!target) {
                ret = -ENOENT;
        } else if (target->state != SECCOMP_NOTIFY_SENT) {
                /* Allow exactly one reply. */
                ret = -EINPROGRESS;
        } else {
                ret = 0;
                target->state = SECCOMP_NOTIFY_REPLIED;
                target->error = resp.error;
                target->val = resp.val;
                target->flags = resp.flags;
                complete(&target->ready);
        }

        mutex_unlock(&filter->notify_lock);
        return ret;
}

/*
 * Check whether the notification id stored at @buf still refers to a
 * notification of @filter that is in SENT state.
 *
 * Returns 0 if so, -ENOENT if not, -EFAULT if the id cannot be copied in,
 * or the error from taking the lock.
 */
static long seccomp_notify_id_valid(struct seccomp_filter *filter,
                                    void __user *buf)
{
        struct seccomp_knotif *entry;
        long ret;
        u64 id;

        if (copy_from_user(&id, buf, sizeof(id)))
                return -EFAULT;

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                return ret;

        entry = find_notification(filter, id);
        ret = (entry && entry->state == SECCOMP_NOTIFY_SENT) ? 0 : -ENOENT;

        mutex_unlock(&filter->notify_lock);
        return ret;
}

/*
 * Queue a request to install one of the caller's files into the task that is
 * blocked on a notification, and wait for that task to process it.
 *
 * @filter: filter owning the notification queue
 * @uaddfd: user pointer to the (extensible) struct seccomp_notif_addfd
 * @size: size of the user structure as encoded in the ioctl command
 *
 * Returns the result stored by the target side in kaddfd.ret when the
 * request was processed.  Otherwise returns -EINVAL for a bad size or bad
 * flags, -EBADF if srcfd does not resolve to a file, -ENOENT if the
 * notification id is unknown, -EINPROGRESS if the notification is not in
 * SENT state, or the error from copying the argument, taking the lock or
 * the interrupted wait.
 */
static long seccomp_notify_addfd(struct seccomp_filter *filter,
                                 struct seccomp_notif_addfd __user *uaddfd,
                                 unsigned int size)
{
        struct seccomp_notif_addfd addfd;
        struct seccomp_knotif *knotif;
        struct seccomp_kaddfd kaddfd;
        int ret;

        BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
        BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);

        if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
                return -EINVAL;

        ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
        if (ret)
                return ret;

        if (addfd.newfd_flags & ~O_CLOEXEC)
                return -EINVAL;

        if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
                return -EINVAL;

        /* A specific target fd is only meaningful together with SETFD. */
        if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
                return -EINVAL;

        /* This file reference is dropped at "out" on every later path. */
        kaddfd.file = fget(addfd.srcfd);
        if (!kaddfd.file)
                return -EBADF;

        kaddfd.flags = addfd.newfd_flags;
        kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
                    addfd.newfd : -1;
        init_completion(&kaddfd.completion);

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                goto out;

        knotif = find_notification(filter, addfd.id);
        if (!knotif) {
                ret = -ENOENT;
                goto out_unlock;
        }

        /*
         * We do not want to allow for FD injection to occur before the
         * notification has been picked up by a userspace handler, or after
         * the notification has been replied to.
         */
        if (knotif->state != SECCOMP_NOTIFY_SENT) {
                ret = -EINPROGRESS;
                goto out_unlock;
        }

        /* Queue the request and wake the task waiting on the notification. */
        list_add(&kaddfd.list, &knotif->addfd);
        complete(&knotif->ready);
        mutex_unlock(&filter->notify_lock);

        /* Now we wait for it to be processed or be interrupted */
        ret = wait_for_completion_interruptible(&kaddfd.completion);
        if (ret == 0) {
                /*
                 * We had a successful completion. The other side has already
                 * removed us from the addfd queue, and
                 * wait_for_completion_interruptible has a memory barrier upon
                 * success that lets us read this value directly without
                 * locking.
                 */
                ret = kaddfd.ret;
                goto out;
        }

        mutex_lock(&filter->notify_lock);
        /*
         * Even though we were woken up by a signal and not a successful
         * completion, a completion may have happened in the mean time.
         *
         * We need to check again if the addfd request has been handled,
         * and if not, we will remove it from the queue.
         */
        if (list_empty(&kaddfd.list))
                ret = kaddfd.ret;
        else
                list_del(&kaddfd.list);

out_unlock:
        mutex_unlock(&filter->notify_lock);
out:
        fput(kaddfd.file);

        return ret;
}

static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
{
        struct seccomp_filter *filter = file->private_data;
        void __user *buf = (void __user *)arg;

        /* Fixed-size ioctls */
        switch (cmd) {
        case SECCOMP_IOCTL_NOTIF_RECV:
                return seccomp_notify_recv(filter, buf);
        case SECCOMP_IOCTL_NOTIF_SEND:
                return seccomp_notify_send(filter, buf);
        case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
        case SECCOMP_IOCTL_NOTIF_ID_VALID:
                return seccomp_notify_id_valid(filter, buf);
        }

        /* Extensible Argument ioctls */
#define EA_IOCTL(cmd)        ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
        switch (EA_IOCTL(cmd)) {
        case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
                return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
        default:
                return -EINVAL;
        }
}

/*
 * poll handler of the seccomp notification listener file.
 *
 * Sets EPOLLIN | EPOLLRDNORM while any queued notification is in
 * SECCOMP_NOTIFY_INIT, EPOLLOUT | EPOLLWRNORM while any is in
 * SECCOMP_NOTIFY_SENT, and EPOLLHUP once filter->users reads as zero.
 * Returns EPOLLERR if taking notify_lock is interrupted.
 */
static __poll_t seccomp_notify_poll(struct file *file,
                                    struct poll_table_struct *poll_tab)
{
        struct seccomp_filter *filter = file->private_data;
        struct seccomp_knotif *knotif;
        __poll_t events = 0;

        poll_wait(file, &filter->wqh, poll_tab);

        if (mutex_lock_interruptible(&filter->notify_lock) < 0)
                return EPOLLERR;

        list_for_each_entry(knotif, &filter->notif->notifications, list) {
                if (knotif->state == SECCOMP_NOTIFY_INIT)
                        events |= EPOLLIN | EPOLLRDNORM;
                if (knotif->state == SECCOMP_NOTIFY_SENT)
                        events |= EPOLLOUT | EPOLLWRNORM;

                /* Both directions already reported: nothing more to learn. */
                if ((events & EPOLLIN) && (events & EPOLLOUT))
                        break;
        }

        mutex_unlock(&filter->notify_lock);

        if (!refcount_read(&filter->users))
                events |= EPOLLHUP;

        return events;
}

/*
 * File operations of the listener file created by init_listener().
 * Native and compat ioctls share one handler.
 */
static const struct file_operations seccomp_notify_ops = {
        .unlocked_ioctl = seccomp_notify_ioctl,
        .compat_ioctl = seccomp_notify_ioctl,
        .poll = seccomp_notify_poll,
        .release = seccomp_notify_release,
};

/*
 * Allocate and initialise @filter's notification state and create the
 * anonymous "seccomp notify" file that exposes it.
 *
 * On success the file is returned and an extra reference is taken on
 * @filter on the file's behalf.  If the notification state cannot be
 * allocated, ERR_PTR(-ENOMEM) is returned.  If the file cannot be created,
 * the notification state is released with seccomp_notify_free() and the
 * error pointer from anon_inode_getfile() is returned.
 */
static struct file *init_listener(struct seccomp_filter *filter)
{
        struct file *listener_file;

        filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
        if (!filter->notif)
                return ERR_PTR(-ENOMEM);

        sema_init(&filter->notif->request, 0);
        filter->notif->next_id = get_random_u64();
        INIT_LIST_HEAD(&filter->notif->notifications);

        listener_file = anon_inode_getfile("seccomp notify",
                                           &seccomp_notify_ops,
                                           filter, O_RDWR);
        if (IS_ERR(listener_file)) {
                seccomp_notify_free(filter);
                return listener_file;
        }

        /* The file has a reference to it now */
        __get_seccomp_filter(filter);

        return listener_file;
}

/*
 * Report whether @new_child carries a listener while a filter already
 * installed on the current task carries one too; such a filter is to be
 * rejected by the caller.
 * Checking the current process is sufficient even for TSYNC, because TSYNC
 * installs @new_child with the same parent on all threads.
 * @new_child is not linked to its parent yet, hence the walk starts at
 * current->seccomp.filter.
 */
static bool has_duplicate_listener(struct seccomp_filter *new_child)
{
        struct seccomp_filter *ancestor;

        /* must be protected against concurrent TSYNC */
        lockdep_assert_held(&current->sighand->siglock);

        if (!new_child->notif)
                return false;

        ancestor = current->seccomp.filter;
        while (ancestor) {
                if (ancestor->notif)
                        return true;
                ancestor = ancestor->prev;
        }

        return false;
}

/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success, or the newly installed listener fd when
 * SECCOMP_FILTER_FLAG_NEW_LISTENER was requested.  On failure a negative
 * errno is returned (-EINVAL, -EBUSY, or whatever filter preparation, fd
 * allocation or listener creation reported); as described in the body, a
 * failing TSYNC may instead report the thread that died.
 */
static long seccomp_set_mode_filter(unsigned int flags,
                                    const char __user *filter)
{
        const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
        struct seccomp_filter *prepared = NULL;
        long ret = -EINVAL;
        int listener = -1;
        struct file *listener_f = NULL;

        /* Validate flags. */
        if (flags & ~SECCOMP_FILTER_FLAG_MASK)
                return -EINVAL;

        /*
         * In the successful case, NEW_LISTENER returns the new listener fd.
         * But in the failure case, TSYNC returns the thread that died. If you
         * combine these two flags, there's no way to tell whether something
         * succeeded or failed. So, let's disallow this combination if the user
         * has not explicitly requested no errors from TSYNC.
         */
        if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
            (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
            ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
                return -EINVAL;

        /* Prepare the new filter before holding any locks. */
        prepared = seccomp_prepare_user_filter(filter);
        if (IS_ERR(prepared))
                return PTR_ERR(prepared);

        /*
         * Reserve the fd number and build the listener file up front; the
         * fd is only published with fd_install() once the filter has been
         * attached successfully.
         */
        if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
                listener = get_unused_fd_flags(O_CLOEXEC);
                if (listener < 0) {
                        ret = listener;
                        goto out_free;
                }

                listener_f = init_listener(prepared);
                if (IS_ERR(listener_f)) {
                        put_unused_fd(listener);
                        ret = PTR_ERR(listener_f);
                        goto out_free;
                }
        }

        /*
         * Make sure we cannot change seccomp or nnp state via TSYNC
         * while another thread is in the middle of calling exec.
         */
        /* Note: if the killable lock fails, ret still holds -EINVAL here. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
            mutex_lock_killable(&current->signal->cred_guard_mutex))
                goto out_put_fd;

        spin_lock_irq(&current->sighand->siglock);

        if (!seccomp_may_assign_mode(seccomp_mode))
                goto out;

        if (has_duplicate_listener(prepared)) {
                ret = -EBUSY;
                goto out;
        }

        ret = seccomp_attach_filter(flags, prepared);
        if (ret)
                goto out;
        /* Do not free the successfully attached filter. */
        prepared = NULL;

        seccomp_assign_mode(current, seccomp_mode, flags);
out:
        spin_unlock_irq(&current->sighand->siglock);
        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
                mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
        if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
                if (ret) {
                        /*
                         * Failure: undo the listener setup.
                         * NOTE(review): private_data is presumably cleared so
                         * that the file's release handler does not act on the
                         * filter - confirm against seccomp_notify_release().
                         */
                        listener_f->private_data = NULL;
                        fput(listener_f);
                        put_unused_fd(listener);
                        seccomp_notify_detach(prepared);
                } else {
                        /* Success: publish the fd and return it. */
                        fd_install(listener, listener_f);
                        ret = listener;
                }
        }
out_free:
        /* prepared is NULL here once the filter has been attached. */
        seccomp_filter_free(prepared);
        return ret;
}
#else
/*
 * Stub used on the #else side of the surrounding conditional: both
 * arguments are ignored and the request is always rejected with -EINVAL.
 */
static inline long seccomp_set_mode_filter(unsigned int flags,
                                           const char __user *filter)
{
        return -EINVAL;
}
#endif

/*
 * Tell userspace whether a seccomp return action is known to this kernel.
 *
 * Reads a u32 action value from @uaction.  Returns 0 if it is one of the
 * actions listed below, -EOPNOTSUPP if it is not, and -EFAULT if the value
 * cannot be copied from userspace.
 */
static long seccomp_get_action_avail(const char __user *uaction)
{
        u32 requested;

        if (copy_from_user(&requested, uaction, sizeof(requested)))
                return -EFAULT;

        switch (requested) {
        case SECCOMP_RET_KILL_PROCESS:
        case SECCOMP_RET_KILL_THREAD:
        case SECCOMP_RET_TRAP:
        case SECCOMP_RET_ERRNO:
        case SECCOMP_RET_USER_NOTIF:
        case SECCOMP_RET_TRACE:
        case SECCOMP_RET_LOG:
        case SECCOMP_RET_ALLOW:
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

/*
 * Copy the kernel's sizes of struct seccomp_notif, struct
 * seccomp_notif_resp and struct seccomp_data to @usizes.
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
static long seccomp_get_notif_sizes(void __user *usizes)
{
        struct seccomp_notif_sizes ksizes = {
                .seccomp_notif = sizeof(struct seccomp_notif),
                .seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
                .seccomp_data = sizeof(struct seccomp_data),
        };

        return copy_to_user(usizes, &ksizes, sizeof(ksizes)) ? -EFAULT : 0;
}

/*
 * Common entry point for both prctl and syscall.
 *
 * Dispatches on @op.  Strict mode requires @flags == 0 and @uargs == NULL;
 * the two query operations require @flags == 0.  Unknown operations and
 * violated argument constraints yield -EINVAL.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
                       void __user *uargs)
{
        if (op == SECCOMP_SET_MODE_STRICT) {
                if (flags || uargs)
                        return -EINVAL;
                return seccomp_set_mode_strict();
        }

        if (op == SECCOMP_SET_MODE_FILTER)
                return seccomp_set_mode_filter(flags, uargs);

        if (op == SECCOMP_GET_ACTION_AVAIL) {
                if (flags)
                        return -EINVAL;
                return seccomp_get_action_avail(uargs);
        }

        if (op == SECCOMP_GET_NOTIF_SIZES) {
                if (flags)
                        return -EINVAL;
                return seccomp_get_notif_sizes(uargs);
        }

        return -EINVAL;
}

/* seccomp(2) system call: forwards its three arguments to do_seccomp(). */
SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
                         void __user *, uargs)
{
        return do_seccomp(op, flags, uargs);
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Translates the prctl mode into the matching do_seccomp() operation.  The
 * prctl interface has no flags, so they are always passed as zero.
 *
 * Returns -EINVAL for an unknown @seccomp_mode, otherwise whatever
 * do_seccomp() returns for the selected operation.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
        /*
         * Setting strict mode through prctl always ignored filter, so pass
         * NULL to satisfy the argument check inside do_seccomp().
         */
        if (seccomp_mode == SECCOMP_MODE_STRICT)
                return do_seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);

        if (seccomp_mode == SECCOMP_MODE_FILTER)
                return do_seccomp(SECCOMP_SET_MODE_FILTER, 0, filter);

        return -EINVAL;
}

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
/*
 * Look up one filter in @task's seccomp filter chain and return it with a
 * reference held; the caller drops it with __put_seccomp_filter().
 *
 * @filter_off is counted from the far end of the ->prev chain: offset 0
 * selects the last filter reachable through ->prev, and the highest valid
 * offset selects task->seccomp.filter itself.
 *
 * Returns ERR_PTR(-EINVAL) if @task is not in SECCOMP_MODE_FILTER and
 * ERR_PTR(-ENOENT) if @filter_off is not smaller than the chain length.
 */
static struct seccomp_filter *get_nth_filter(struct task_struct *task,
                                             unsigned long filter_off)
{
        struct seccomp_filter *orig, *filter;
        unsigned long count;

        /*
         * Note: this is only correct because the caller should be the (ptrace)
         * tracer of the task, otherwise lock_task_sighand is needed.
         */
        spin_lock_irq(&task->sighand->siglock);

        if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
                spin_unlock_irq(&task->sighand->siglock);
                return ERR_PTR(-EINVAL);
        }

        /* Take a reference on the chain head before dropping the lock. */
        orig = task->seccomp.filter;
        __get_seccomp_filter(orig);
        spin_unlock_irq(&task->sighand->siglock);

        /* First pass: measure the length of the chain. */
        count = 0;
        for (filter = orig; filter; filter = filter->prev)
                count++;

        if (filter_off >= count) {
                filter = ERR_PTR(-ENOENT);
                goto out;
        }

        /* Second pass: step (count - filter_off - 1) links away from orig. */
        count -= filter_off;
        for (filter = orig; filter && count > 1; filter = filter->prev)
                count--;

        if (WARN_ON(count != 1 || !filter)) {
                filter = ERR_PTR(-ENOENT);
                goto out;
        }

        /* Reference handed to the caller. */
        __get_seccomp_filter(filter);

out:
        /* Drop the reference taken on the chain head above. */
        __put_seccomp_filter(orig);
        return filter;
}

/*
 * Copy the saved original program of one of @task's filters to userspace.
 *
 * The caller needs CAP_SYS_ADMIN and must itself have seccomp disabled,
 * otherwise -EACCES is returned.  @filter_off selects the filter as
 * described for get_nth_filter(), whose errors are passed through.
 *
 * Returns the program length (fprog->len) on success; when @data is NULL
 * only that length is reported and nothing is copied.  Returns
 * -EMEDIUMTYPE if the filter has no saved original program and -EFAULT if
 * the copy to @data fails.
 */
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
                        void __user *data)
{
        struct seccomp_filter *target;
        struct sock_fprog_kern *saved;
        long ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        if (current->seccomp.mode != SECCOMP_MODE_DISABLED)
                return -EACCES;

        target = get_nth_filter(task, filter_off);
        if (IS_ERR(target))
                return PTR_ERR(target);

        saved = target->prog->orig_prog;
        if (!saved) {
                /* This must be a new non-cBPF filter, since we save
                 * every cBPF filter's orig_prog above when
                 * CONFIG_CHECKPOINT_RESTORE is enabled.
                 */
                ret = -EMEDIUMTYPE;
        } else {
                ret = saved->len;
                if (data && copy_to_user(data, saved->filter,
                                         bpf_classic_proglen(saved)))
                        ret = -EFAULT;
        }

        __put_seccomp_filter(target);
        return ret;
}

/*
 * Report metadata about one of @task's filters to userspace.
 *
 * The caller needs CAP_SYS_ADMIN and must itself have seccomp disabled,
 * otherwise -EACCES is returned.  @size is clamped to
 * sizeof(struct seccomp_metadata) and must cover at least the filter_off
 * field (-EINVAL otherwise).  filter_off is read from @data and selects the
 * filter via get_nth_filter(), whose errors are passed through.
 *
 * Returns the number of bytes written back to @data, or -EFAULT if either
 * user copy fails.
 */
long seccomp_get_metadata(struct task_struct *task,
                          unsigned long size, void __user *data)
{
        struct seccomp_metadata kmd = {};
        struct seccomp_filter *target;
        long ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        if (current->seccomp.mode != SECCOMP_MODE_DISABLED)
                return -EACCES;

        if (size > sizeof(kmd))
                size = sizeof(kmd);

        if (size < sizeof(kmd.filter_off))
                return -EINVAL;

        if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
                return -EFAULT;

        target = get_nth_filter(task, kmd.filter_off);
        if (IS_ERR(target))
                return PTR_ERR(target);

        if (target->log)
                kmd.flags |= SECCOMP_FILTER_FLAG_LOG;

        if (copy_to_user(data, &kmd, size))
                ret = -EFAULT;
        else
                ret = size;

        __put_seccomp_filter(target);
        return ret;
}
#endif

#ifdef CONFIG_SYSCTL

/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME        "kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME        "kill_thread"
#define SECCOMP_RET_TRAP_NAME                "trap"
#define SECCOMP_RET_ERRNO_NAME                "errno"
#define SECCOMP_RET_USER_NOTIF_NAME        "user_notif"
#define SECCOMP_RET_TRACE_NAME                "trace"
#define SECCOMP_RET_LOG_NAME                "log"
#define SECCOMP_RET_ALLOW_NAME                "allow"

/*
 * All action names joined by single spaces into one string literal.
 * Exposed read-only through the "actions_avail" sysctl entry; its size also
 * dimensions the scratch buffers used by the "actions_logged" helpers.
 */
static const char seccomp_actions_avail[] =
                                SECCOMP_RET_KILL_PROCESS_NAME        " "
                                SECCOMP_RET_KILL_THREAD_NAME        " "
                                SECCOMP_RET_TRAP_NAME                " "
                                SECCOMP_RET_ERRNO_NAME                " "
                                SECCOMP_RET_USER_NOTIF_NAME     " "
                                SECCOMP_RET_TRACE_NAME                " "
                                SECCOMP_RET_LOG_NAME                " "
                                SECCOMP_RET_ALLOW_NAME;

/* Associates one SECCOMP_LOG_* bit with its human readable action name. */
struct seccomp_log_name {
        u32                log;        /* SECCOMP_LOG_* bit */
        const char        *name;        /* matching action name string */
};

/*
 * Lookup table used to translate between logged-action bits and names.
 * The empty last entry (name == NULL) terminates the table.
 */
static const struct seccomp_log_name seccomp_log_names[] = {
        { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
        { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
        { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
        { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
        { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
        { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
        { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
        { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
        { }
};

/*
 * Render the bits set in @actions_logged as action names, joined by @sep,
 * into the buffer @names of @size bytes.
 *
 * Names are emitted in seccomp_log_names table order.  Returns false as
 * soon as a strscpy() into the remaining space reports an error, true
 * otherwise.
 */
static bool seccomp_names_from_actions_logged(char *names, size_t size,
                                              u32 actions_logged,
                                              const char *sep)
{
        const struct seccomp_log_name *entry = seccomp_log_names;
        bool first = true;

        for (; entry->name && size; entry++) {
                ssize_t copied;

                if (!(actions_logged & entry->log))
                        continue;

                /* Every name but the first is preceded by the separator. */
                if (first) {
                        first = false;
                } else {
                        copied = strscpy(names, sep, size);
                        if (copied < 0)
                                return false;

                        names += copied;
                        size -= copied;
                }

                copied = strscpy(names, entry->name, size);
                if (copied < 0)
                        return false;

                names += copied;
                size -= copied;
        }

        return true;
}

/*
 * Translate a single action name into its SECCOMP_LOG_* bit.
 *
 * On an exact match in seccomp_log_names the bit is stored in
 * @action_logged and true is returned; otherwise @action_logged is left
 * untouched and false is returned.
 */
static bool seccomp_action_logged_from_name(u32 *action_logged,
                                            const char *name)
{
        const struct seccomp_log_name *entry;

        for (entry = seccomp_log_names; entry->name; entry++) {
                if (strcmp(entry->name, name) != 0)
                        continue;

                *action_logged = entry->log;
                return true;
        }

        return false;
}

/*
 * Parse a space-separated list of action names into a bitmask.
 *
 * @names is split in place with strsep().  @actions_logged is zeroed first
 * and each recognised name ORs its bit in; parsing stops at the end of the
 * string or at the first empty token.  Returns false on the first unknown
 * name (bits accumulated so far stay in @actions_logged), true otherwise.
 */
static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
{
        *actions_logged = 0;

        for (;;) {
                char *token = strsep(&names, " ");
                u32 bit = 0;

                if (!token || !*token)
                        break;

                if (!seccomp_action_logged_from_name(&bit, token))
                        return false;

                *actions_logged |= bit;
        }

        return true;
}

static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
                               size_t *lenp, loff_t *ppos)
{
        char names[sizeof(seccomp_actions_avail)];
        struct ctl_table table;

        memset(names, 0, sizeof(names));

        if (!seccomp_names_from_actions_logged(names, sizeof(names),
                                               seccomp_actions_logged, " "))
                return -EINVAL;

        table = *ro_table;
        table.data = names;
        table.maxlen = sizeof(names);
        return proc_dostring(&table, 0, buffer, lenp, ppos);
}

static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
                                size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
        char names[sizeof(seccomp_actions_avail)];
        struct ctl_table table;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        memset(names, 0, sizeof(names));

        table = *ro_table;
        table.data = names;
        table.maxlen = sizeof(names);
        ret = proc_dostring(&table, 1, buffer, lenp, ppos);
        if (ret)
                return ret;

        if (!seccomp_actions_logged_from_names(actions_logged, table.data))
                return -EINVAL;

        if (*actions_logged & SECCOMP_LOG_ALLOW)
                return -EINVAL;

        seccomp_actions_logged = *actions_logged;
        return 0;
}

/*
 * Emit an audit record for an attempted change of the logged-actions mask.
 *
 * Does nothing when auditing is disabled.  Both masks are rendered as
 * comma-separated action names; an empty mask is reported as "(none)".
 * The new value is reported as "?" when the write failed (@ret != 0) or
 * when rendering fails; the old value is reported as "?" only when
 * rendering fails.  The record's result argument is !@ret.
 */
static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
                                 int ret)
{
        char names[sizeof(seccomp_actions_avail)];
        char old_names[sizeof(seccomp_actions_avail)];
        const char *new = names;
        const char *old = old_names;

        if (!audit_enabled)
                return;

        memset(names, 0, sizeof(names));
        memset(old_names, 0, sizeof(old_names));

        if (ret)
                new = "?";
        else if (!actions_logged)
                new = "(none)";
        else if (!seccomp_names_from_actions_logged(names, sizeof(names),
                                                    actions_logged, ","))
                new = "?";

        if (!old_actions_logged)
                old = "(none)";
        else if (!seccomp_names_from_actions_logged(old_names,
                                                    sizeof(old_names),
                                                    old_actions_logged, ","))
                old = "?";

        /* Plain call: a void function must not "return" an expression. */
        audit_seccomp_actions_logged(new, old, !ret);
}

/*
 * proc_handler of the "actions_logged" sysctl entry.
 *
 * Reads are served by read_actions_logged().  Writes go through
 * write_actions_logged() and are always followed by an audit record that
 * carries the requested mask, the mask in effect before the write, and the
 * outcome.  Returns the result of the read or write helper.
 */
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
                                          void *buffer, size_t *lenp,
                                          loff_t *ppos)
{
        u32 requested = 0;
        u32 previous;
        int ret;

        if (!write)
                return read_actions_logged(ro_table, buffer, lenp, ppos);

        /* Snapshot the current mask before the write may replace it. */
        previous = seccomp_actions_logged;

        ret = write_actions_logged(ro_table, buffer, lenp, ppos, &requested);
        audit_actions_logged(requested, previous, ret);

        return ret;
}

/*
 * Path components ("kernel", then "seccomp") passed to
 * register_sysctl_paths() together with seccomp_sysctl_table; the empty
 * last entry terminates the list.
 */
static struct ctl_path seccomp_sysctl_path[] = {
        { .procname = "kernel", },
        { .procname = "seccomp", },
        { }
};

/*
 * The seccomp sysctl entries; the empty last entry terminates the table.
 *
 * "actions_avail" (mode 0444) serves the constant seccomp_actions_avail
 * string through proc_dostring().  "actions_logged" (mode 0644) has no
 * static data pointer: its handler substitutes a temporary buffer on each
 * access.
 */
static struct ctl_table seccomp_sysctl_table[] = {
        {
                .procname        = "actions_avail",
                .data                = (void *) &seccomp_actions_avail,
                .maxlen                = sizeof(seccomp_actions_avail),
                .mode                = 0444,
                .proc_handler        = proc_dostring,
        },
        {
                .procname        = "actions_logged",
                .mode                = 0644,
                .proc_handler        = seccomp_actions_logged_handler,
        },
        { }
};

/*
 * Register the seccomp sysctl entries at device_initcall time.
 *
 * A failed registration is only warned about; the initcall always reports
 * success.  On success the returned header is marked with
 * kmemleak_not_leak().
 */
static int __init seccomp_sysctl_init(void)
{
        struct ctl_table_header *header;

        header = register_sysctl_paths(seccomp_sysctl_path,
                                       seccomp_sysctl_table);
        if (header)
                kmemleak_not_leak(header);
        else
                pr_warn("sysctl registration failed\n");

        return 0;
}

device_initcall(seccomp_sysctl_init)

#endif /* CONFIG_SYSCTL */
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 





    2 
    2 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
// SPDX-License-Identifier: GPL-2.0
/*
 * ring buffer based function tracer
 *
 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Originally taken from the RT patch by:
 *    Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 Nadia Yvette Chambers
 */
#include <linux/ring_buffer.h>
#include <generated/utsrelease.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>

#include "trace.h"
#include "trace_output.h"

/*
 * On boot up, the ring buffer is set to the minimum size, so that
 * we do not waste memory on systems that are not using tracing.
 */
bool ring_buffer_expanded;

/*
 * We need to change this state when a selftest is running.
 * A selftest will look into the ring-buffer to count the
 * entries inserted during the selftest, but concurrent
 * insertions into the ring-buffer (such as trace_printk) could occur
 * at the same time, giving false positive or negative results.
 */
static bool __read_mostly tracing_selftest_running;

/*
 * If boot-time tracing including tracers/events via kernel cmdline
 * is running, we do not want to run SELFTEST.
 */
bool __read_mostly tracing_selftest_disabled;

#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
 * Switch the ftrace startup selftests off and log @reason.
 * Only the first call does anything: once tracing_selftest_disabled is
 * set, later calls return without printing again.
 */
void __init disable_tracing_selftest(const char *reason)
{
        if (tracing_selftest_disabled)
                return;

        tracing_selftest_disabled = true;
        pr_info("Ftrace startup test is disabled due to %s\n", reason);
}
#endif

/*
 * Pipe tracepoints to printk.
 *
 * tracepoint_printk is set to 1 by the "tp_printk" boot parameter (see
 * set_tracepoint_printk()) unless the parameter is given as "=0" or "=off".
 * NOTE(review): tracepoint_print_iter and tracepoint_printk_key are
 * presumably the iterator and the static branch used by that output
 * path; their users are not in this part of the file -- confirm.
 */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);

/*
 * For tracers that don't implement custom flags.
 * The table holds a single zero-initialised entry and nothing else.
 */
static struct tracer_opt dummy_tracer_opt[] = {
        { }
};

/*
 * Flag callback that does nothing: every argument is ignored and 0 is
 * always returned.
 * NOTE(review): presumably the set_flag counterpart of dummy_tracer_opt
 * for tracers without custom flags -- confirm at the point of use.
 */
static int
dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
        return 0;
}

/*
 * To prevent the comm cache from being overwritten when no
 * tracing is active, only save the comm when a trace event
 * occurred.
 */
static DEFINE_PER_CPU(bool, trace_taskinfo_save);

/*
 * Kill all tracing for good (never come back).
 * It is initialized to 1 but will turn to zero if the initialization
 * of the tracer is successful. But that is the only place that sets
 * this back to zero.
 */
static int tracing_disabled = 1;

cpumask_var_t __read_mostly        tracing_buffer_mask;

/*
 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
 *
 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
 * is set, then ftrace_dump is called. This will output the contents
 * of the ftrace buffers to the console.  This is very useful for
 * capturing traces that lead to crashes and outputting them to a
 * serial console.
 *
 * It is default off, but you can enable it with either specifying
 * "ftrace_dump_on_oops" in the kernel command line, or setting
 * /proc/sys/kernel/ftrace_dump_on_oops
 * Set 1 if you want to dump buffers of all CPUs
 * Set 2 if you want to dump the buffer of the CPU that triggered oops
 */

enum ftrace_dump_mode ftrace_dump_on_oops;

/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
struct trace_eval_map_head {
        struct module                        *mod;
        unsigned long                        length;
};

/* Forward declaration: the tail element points at the next saved array. */
union trace_eval_map_item;

/*
 * Extra element stored at the end of each saved array of eval maps; it
 * links to the next saved array.
 */
struct trace_eval_map_tail {
        /*
         * "end" is first and points to NULL as it must be different
         * than "mod" or "eval_string"
         *
         * NOTE(review): "next" is declared before "end" in this struct,
         * so the '"end" is first' wording does not match the member
         * order shown here -- confirm which member is meant to overlay
         * "mod"/"eval_string" in union trace_eval_map_item.
         */
        union trace_eval_map_item        *next;
        const char                        *end;        /* points to NULL */
};

/*
 * NOTE(review): presumably serialises access to trace_eval_maps below;
 * the lock's users are not in this part of the file -- confirm.
 */
static DEFINE_MUTEX(trace_eval_mutex);

/*
 * The trace_eval_maps are saved in an array with two extra elements,
 * one at the beginning, and one at the end. The beginning item contains
 * the count of the saved maps (head.length), and the module they
 * belong to if not built in (head.mod). The ending item contains a
 * pointer to the next array of saved eval_map items.
 */
union trace_eval_map_item {
        /* a regular saved map entry */
        struct trace_eval_map                map;
        /* first element of an array: count and owning module */
        struct trace_eval_map_head        head;
        /* last element of an array: link to the next array */
        struct trace_eval_map_tail        tail;
};

/* First saved array of eval maps; further arrays hang off each tail.next. */
static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */

/* Forward declarations for functions defined later in this file. */
int tracing_set_tracer(struct trace_array *tr, const char *buf);
static void ftrace_trace_userstack(struct trace_array *tr,
                                   struct trace_buffer *buffer,
                                   unsigned long flags, int pc);

/* Size of the buffers that hold boot-parameter strings copied below. */
#define MAX_TRACER_SIZE                100
/* Copy of the "ftrace=" boot parameter value (see set_cmdline_ftrace()). */
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
/* Points at bootup_tracer_buf once "ftrace=" has been seen. */
static char *default_bootup_tracer;

/* Set by the "alloc_snapshot" boot parameter (see boot_alloc_snapshot()). */
static bool allocate_snapshot;

/*
 * Handle the "ftrace=<tracer>" boot parameter.
 *
 * The tracer name is copied into bootup_tracer_buf (truncated to fit and
 * always NUL-terminated) and default_bootup_tracer is pointed at the copy.
 * The ring buffer is also flagged as expanded because tracing is wanted
 * early.
 *
 * Always returns 1.
 */
static int __init set_cmdline_ftrace(char *str)
{
        /*
         * strscpy() rather than the deprecated strlcpy(): the same
         * truncating, NUL-terminating copy, but it never reads @str
         * beyond the size of the destination.
         */
        strscpy(bootup_tracer_buf, str, sizeof(bootup_tracer_buf));
        default_bootup_tracer = bootup_tracer_buf;
        /* We are using ftrace early, expand it */
        ring_buffer_expanded = true;
        return 1;
}
__setup("ftrace=", set_cmdline_ftrace);

/*
 * Handle the "ftrace_dump_on_oops" boot parameter.
 *
 * @str is whatever follows the parameter name. A bare parameter, a
 * parameter with an empty value ("="), or anything that does not start
 * with '=' selects DUMP_ALL. "=orig_cpu" selects DUMP_ORIG.
 *
 * Returns 1 when a mode was selected, 0 for any other "=<value>".
 */
static int __init set_ftrace_dump_on_oops(char *str)
{
        const char *value;

        if (str[0] != '=' || str[1] == '\0') {
                ftrace_dump_on_oops = DUMP_ALL;
                return 1;
        }

        /* Skip the '=' and look at the value itself. */
        value = str + 1;
        if (strcmp("orig_cpu", value) == 0) {
                ftrace_dump_on_oops = DUMP_ORIG;
                return 1;
        }

        return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);

/*
 * Handle the "traceoff_on_warning" boot parameter.
 *
 * __disable_trace_on_warning is set unless the parameter was given
 * exactly as "=0" or "=off". Always returns 1.
 */
static int __init stop_trace_on_warning(char *str)
{
        /* Explicitly switched off on the command line: leave the flag alone. */
        if (!strcmp(str, "=0") || !strcmp(str, "=off"))
                return 1;

        __disable_trace_on_warning = 1;
        return 1;
}
__setup("traceoff_on_warning", stop_trace_on_warning);

/*
 * Handle the "alloc_snapshot" boot parameter.
 *
 * @str is not looked at: seeing the parameter at all requests the
 * snapshot, and the main ring buffer is marked as expanded together
 * with it. Always returns 1.
 */
static int __init boot_alloc_snapshot(char *str)
{
        /* The main ring buffer has to be expanded as well */
        ring_buffer_expanded = true;
        allocate_snapshot = true;

        return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);


/* Copy of the "trace_options=" boot parameter value. */
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;

/*
 * Handle the "trace_options=<options>" boot parameter by saving the value
 * in trace_boot_options_buf (truncated to fit and always NUL-terminated).
 *
 * Always returns 1.
 */
static int __init set_trace_boot_options(char *str)
{
        /*
         * strscpy() rather than the deprecated strlcpy(): the same
         * truncating, NUL-terminating copy, but it never reads @str
         * beyond the size of the destination.
         */
        strscpy(trace_boot_options_buf, str, sizeof(trace_boot_options_buf));
        return 1;
}
__setup("trace_options=", set_trace_boot_options);

/* Copy of the "trace_clock=" boot parameter value. */
static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
/* Points at trace_boot_clock_buf once "trace_clock=" has been seen. */
static char *trace_boot_clock __initdata;

/*
 * Handle the "trace_clock=<clock>" boot parameter by saving the value in
 * trace_boot_clock_buf (truncated to fit and always NUL-terminated) and
 * pointing trace_boot_clock at the copy.
 *
 * Always returns 1.
 */
static int __init set_trace_boot_clock(char *str)
{
        /*
         * strscpy() rather than the deprecated strlcpy(): the same
         * truncating, NUL-terminating copy, but it never reads @str
         * beyond the size of the destination.
         */
        strscpy(trace_boot_clock_buf, str, sizeof(trace_boot_clock_buf));
        trace_boot_clock = trace_boot_clock_buf;
        return 1;
}
__setup("trace_clock=", set_trace_boot_clock);

/*
 * Handle the "tp_printk" boot parameter.
 *
 * tracepoint_printk is set unless the parameter was given exactly as
 * "=0" or "=off".
 *
 * Returns 0 when the text after "tp_printk" starts with '_' (that is a
 * different parameter, "tp_printk_stop_on_boot"), 1 otherwise.
 */
static int __init set_tracepoint_printk(char *str)
{
        /* Ignore the "tp_printk_stop_on_boot" param */
        if (str[0] == '_')
                return 0;

        /* Explicitly switched off on the command line: leave the flag alone. */
        if (!strcmp(str, "=0") || !strcmp(str, "=off"))
                return 1;

        tracepoint_printk = 1;
        return 1;
}
__setup("tp_printk", set_tracepoint_printk);

/*
 * Convert nanoseconds to microseconds, rounding to the nearest
 * microsecond (500 is added before the division by 1000).
 */
unsigned long long ns2usecs(u64 nsec)
{
        u64 usecs = nsec + 500;

        do_div(usecs, 1000);

        return usecs;
}

/*
 * Hand one ring buffer event to @export, but only if @flag is set in
 * export->flags. The event's data and length are passed to the
 * export's ->write() callback.
 */
static void
trace_process_export(struct trace_export *export,
               struct ring_buffer_event *event, int flag)
{
        struct trace_entry *entry;
        unsigned int size;

        /* This export did not ask for this kind of event. */
        if (!(export->flags & flag))
                return;

        entry = ring_buffer_event_data(event);
        size = ring_buffer_event_length(event);
        export->write(export, entry, size);
}

/* Held by register_ftrace_export()/unregister_ftrace_export() while the list is changed. */
static DEFINE_MUTEX(ftrace_export_lock);

/* Head of the list of registered exports; walked by ftrace_exports(). */
static struct trace_export __rcu *ftrace_exports_list __read_mostly;

/*
 * One static key per export type. Each is incremented in
 * ftrace_exports_enable() and decremented in ftrace_exports_disable()
 * for every export that has the matching TRACE_EXPORT_* flag set.
 */
static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled);

/*
 * Increment the static key of every export type that @export asked for
 * in export->flags (function, event and/or marker).
 */
static inline void ftrace_exports_enable(struct trace_export *export)
{
        if (export->flags & TRACE_EXPORT_FUNCTION)
                static_branch_inc(&trace_function_exports_enabled);

        if (export->flags & TRACE_EXPORT_EVENT)
                static_branch_inc(&trace_event_exports_enabled);

        if (export->flags & TRACE_EXPORT_MARKER)
                static_branch_inc(&trace_marker_exports_enabled);
}

/*
 * Counterpart of ftrace_exports_enable(): decrement the static key of
 * every export type that is set in export->flags.
 */
static inline void ftrace_exports_disable(struct trace_export *export)
{
        if (export->flags & TRACE_EXPORT_FUNCTION)
                static_branch_dec(&trace_function_exports_enabled);

        if (export->flags & TRACE_EXPORT_EVENT)
                static_branch_dec(&trace_event_exports_enabled);

        if (export->flags & TRACE_EXPORT_MARKER)
                static_branch_dec(&trace_marker_exports_enabled);
}

/*
 * Offer @event to every registered export. The list is walked with
 * preemption disabled; trace_process_export() decides for each export
 * whether @flag matches what it subscribed to.
 */
static void ftrace_exports(struct ring_buffer_event *event, int flag)
{
        struct trace_export *cur;

        preempt_disable_notrace();

        for (cur = rcu_dereference_raw_check(ftrace_exports_list); cur;
             cur = rcu_dereference_raw_check(cur->next))
                trace_process_export(cur, event, flag);

        preempt_enable_notrace();
}

/*
 * Insert @export at the head of the list whose head pointer is @list.
 * The two stores below must stay in this order, see the comment between
 * them.
 */
static inline void
add_trace_export(struct trace_export **list, struct trace_export *export)
{
        /* First make the new entry point at the current head ... */
        rcu_assign_pointer(export->next, *list);
        /*
         * We are entering export into the list but another
         * CPU might be walking that list. We need to make sure
         * the export->next pointer is valid before another CPU sees
         * the export pointer included into the list.
         */
        /* ... then publish it as the new head. */
        rcu_assign_pointer(*list, export);
}

/*
 * Unlink @export from the list whose head pointer is @list.
 *
 * Returns 0 when @export was found and unlinked, -1 when it is not on
 * the list.
 */
static inline int
rm_trace_export(struct trace_export **list, struct trace_export *export)
{
        struct trace_export **link = list;

        /* Find the pointer that currently refers to @export, if any. */
        while (*link != NULL && *link != export)
                link = &(*link)->next;

        if (*link != export)
                return -1;

        /* Make that pointer skip over @export. */
        rcu_assign_pointer(*link, (*link)->next);

        return 0;
}

/*
 * Register @export on @list: the static keys for its export types are
 * incremented first, and only then is it linked into the list.
 */
static inline void
add_ftrace_export(struct trace_export **list, struct trace_export *export)
{
        ftrace_exports_enable(export);

        add_trace_export(list, export);
}

/*
 * Unlink @export from @list and drop the static-key counts it holds.
 *
 * The static keys are only decremented when @export was actually found
 * and unlinked. Decrementing them for an export that is not on the list
 * would unbalance the counts taken in ftrace_exports_enable().
 *
 * Returns 0 on success, -1 if @export is not on @list.
 */
static inline int
rm_ftrace_export(struct trace_export **list, struct trace_export *export)
{
        int ret;

        ret = rm_trace_export(list, export);
        if (!ret)
                ftrace_exports_disable(export);

        return ret;
}

/**
 * register_ftrace_export - add an export to the list of trace exports
 * @export: export to add; its ->write callback must be set
 *
 * Warns once and fails if @export has no ->write callback. Otherwise the
 * export is added to ftrace_exports_list under ftrace_export_lock.
 *
 * Returns 0 on success, -1 if ->write is NULL.
 */
int register_ftrace_export(struct trace_export *export)
{
        if (WARN_ON_ONCE(!export->write))
                return -1;

        mutex_lock(&ftrace_export_lock);

        add_ftrace_export(&ftrace_exports_list, export);

        mutex_unlock(&ftrace_export_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(register_ftrace_export);

/**
 * unregister_ftrace_export - remove an export from the list of trace exports
 * @export: export to remove
 *
 * The removal is done under ftrace_export_lock.
 *
 * Returns the result of rm_ftrace_export(): 0 when @export was unlinked,
 * -1 when it was not found on ftrace_exports_list.
 */
int unregister_ftrace_export(struct trace_export *export)
{
        int ret;

        mutex_lock(&ftrace_export_lock);

        ret = rm_ftrace_export(&ftrace_exports_list, export);

        mutex_unlock(&ftrace_export_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_export);

/*
 * trace_flags holds trace_options default values.
 * This mask is what global_trace.trace_flags is initialised to.
 */
#define TRACE_DEFAULT_FLAGS                                                \
        (FUNCTION_DEFAULT_FLAGS |                                        \
         TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |                        \
         TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO |                \
         TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |                        \
         TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)

/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |                        \
               TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)

/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
        (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)

/*
 * The global_trace is the descriptor that holds the top-level tracing
 * buffers for the live tracing.
 * Its trace_flags start out as TRACE_DEFAULT_FLAGS.
 */
static struct trace_array global_trace = {
        .trace_flags = TRACE_DEFAULT_FLAGS,
};

/*
 * List of trace arrays. trace_array_get() searches it under
 * trace_types_lock before taking a reference.
 */
LIST_HEAD(ftrace_trace_arrays);

/*
 * Take a reference on @this_tr, but only if it is currently on the
 * ftrace_trace_arrays list. The lookup and the increment are done under
 * trace_types_lock.
 *
 * Returns 0 when the reference was taken, -ENODEV when @this_tr is not
 * on the list.
 */
int trace_array_get(struct trace_array *this_tr)
{
        struct trace_array *cur;
        int err = -ENODEV;

        mutex_lock(&trace_types_lock);
        list_for_each_entry(cur, &ftrace_trace_arrays, list) {
                if (cur != this_tr)
                        continue;

                this_tr->ref++;
                err = 0;
                break;
        }
        mutex_unlock(&trace_types_lock);

        return err;
}

/*
 * Drop one reference on @this_tr. Takes no lock itself;
 * trace_array_put() calls it with trace_types_lock held.
 * Warns if the count is already zero, and decrements it regardless.
 */
static void __trace_array_put(struct trace_array *this_tr)
{
        WARN_ON(!this_tr->ref);
        this_tr->ref--;
}

/**
 * trace_array_put - Decrement the reference counter for this trace array.
 * @this_tr: trace array to release; a NULL pointer is ignored
 *
 * NOTE: Use this when we no longer need the trace array returned by
 * trace_array_get_by_name(). This ensures the trace array can be later
 * destroyed.
 *
 * The counter is decremented with trace_types_lock held.
 */
void trace_array_put(struct trace_array *this_tr)
{
        if (this_tr) {
                mutex_lock(&trace_types_lock);
                __trace_array_put(this_tr);
                mutex_unlock(&trace_types_lock);
        }
}
EXPORT_SYMBOL_GPL(trace_array_put);

/*
 * Common checks before a tracing file is opened, optionally taking a
 * reference on @tr.
 *
 * The checks run in this order:
 *  - security_locked_down(LOCKDOWN_TRACEFS); a non-zero result is
 *    returned as is,
 *  - tracing_disabled; -ENODEV if set,
 *  - if @tr is not NULL, trace_array_get(tr); -ENODEV if that fails.
 *
 * Returns 0 when every check passed (and, for a non-NULL @tr, a
 * reference is now held).
 */
int tracing_check_open_get_tr(struct trace_array *tr)
{
        int err = security_locked_down(LOCKDOWN_TRACEFS);

        if (err)
                return err;

        if (tracing_disabled || (tr && trace_array_get(tr) < 0))
                return -ENODEV;

        return 0;
}

/*
 * Apply the filter of @call to the record @rec, if @call has one.
 *
 * When TRACE_EVENT_FL_FILTERED is set in call->flags and
 * filter_match_preds() rejects @rec, the commit of @event in @buffer is
 * discarded.
 *
 * Returns 1 if the event was discarded, 0 otherwise.
 */
int call_filter_check_discard(struct trace_event_call *call, void *rec,
                              struct trace_buffer *buffer,
                              struct ring_buffer_event *event)
{
        /* Common case: no filter attached, nothing to check. */
        if (!unlikely(call->flags & TRACE_EVENT_FL_FILTERED))
                return 0;

        if (filter_match_preds(call->filter, rec))
                return 0;

        __trace_event_discard_commit(buffer, event);
        return 1;
}

/*
 * trace_free_pid_list - free a pid list and its pid bitmap
 *
 * Releases @pid_list->pids with vfree() and then @pid_list itself with
 * kfree(). @pid_list is dereferenced, so it must not be NULL.
 */
void trace_free_pid_list(struct trace_pid_list *pid_list)
{
        vfree(pid_list->pids);
        kfree(pid_list);
}

/**
 * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
 * @filtered_pids: The list of pids to check
 * @search_pid: The PID to find in @filtered_pids
 *
 * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
 */
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
{
        /*
         * Only pids below the pid_max recorded in the list can be set.
         * If pid_max changed after filtered_pids was created, all pids
         * at or above the previous pid_max are treated as not listed.
         */
        if (search_pid < filtered_pids->pid_max)
                return test_bit(search_pid, filtered_pids->pids);

        return false;
}

/**
 * trace_ignore_this_task - should a task be ignored for tracing
 * @filtered_pids: The list of pids to check
 * @task: The task that should be ignored if not filtered
 *
 * Checks if @task should be traced or not from @filtered_pids.
 * Returns true if @task should *NOT* be traced.
 * Returns false if @task should be traced.
 */
bool
trace_ignore_this_task(struct trace_pid_list *filtered_pids,
                       struct trace_pid_list *filtered_no_pids,
                       struct task_struct *task)
{
        /*
         * If filterd_no_pids is not empty, and the task's pid is listed
         * in filtered_no_pids, then return true.
         * Otherwise, if filtered_pids is empty, that means we can
         * trace all tasks. If it has content, then only trace pids
         * within filtered_pids.
         */

        return (filtered_pids &&
                !trace_find_filtered_pid(filtered_pids, task->pid)) ||
                (filtered_no_pids &&
                 trace_find_filtered_pid(filtered_no_pids, task->pid));
}

/**
 * trace_filter_add_remove_task - Add or remove a task from a pid_list
 * @pid_list: The list to modify (nothing is done if it is NULL)
 * @self: The current task for fork or NULL for exit
 * @task: The task to add or remove
 *
 * If adding a task, if @self is defined, the task is only added if @self
 * is also included in @pid_list. This happens on fork and tasks should
 * only be added when the parent is listed. If @self is NULL, then the
 * @task pid will be removed from the list, which would happen on exit
 * of a task.
 */
void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
                                  struct task_struct *self,
                                  struct task_struct *task)
{
        if (!pid_list)
                return;

        /* For forks, we only add if the forking task is listed */
        if (self && !trace_find_filtered_pid(pid_list, self->pid))
                return;

        /* Sorry, but we don't support pid_max changing after setting */
        if (task->pid >= pid_list->pid_max)
                return;

        /* "self" is set for forks, and NULL for exits */
        if (!self) {
                clear_bit(task->pid, pid_list->pids);
                return;
        }

        set_bit(task->pid, pid_list->pids);
}

/**
 * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
 * @pid_list: The pid list to show
 * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
 * @pos: The position of the file
 *
 * This is used by the seq_file "next" operation to iterate the pids
 * listed in a trace_pid_list structure. @pos is always advanced by one.
 *
 * Returns the pid+1 as we want to display pid of zero, but NULL would
 * stop the iteration.
 */
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
{
        unsigned long next;

        (*pos)++;

        /*
         * @v already is +1 of the actual previous bit, so it is the
         * first bit number that has not been looked at yet.
         */
        next = find_next_bit(pid_list->pids, pid_list->pid_max,
                             (unsigned long)v);

        /* No more bits set below pid_max: end of the iteration */
        if (next >= pid_list->pid_max)
                return NULL;

        /* Return pid + 1 to allow zero to be represented */
        return (void *)(next + 1);
}

/**
 * trace_pid_start - Used for seq_file to start reading pid lists
 * @pid_list: The pid list to show
 * @pos: The position of the file
 *
 * This is used by seq_file "start" operation to start the iteration
 * of listing pids. The list is walked from its first set pid until
 * @pos entries have been skipped or the list ends.
 *
 * Returns the pid+1 as we want to display pid of zero, but NULL would
 * stop the iteration.
 */
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
{
        unsigned long cur;
        loff_t skipped = 0;

        cur = find_first_bit(pid_list->pids, pid_list->pid_max);
        if (cur >= pid_list->pid_max)
                return NULL;

        /* Work with pid + 1 so that zero can be the exit value */
        cur++;
        while (cur && skipped < *pos)
                cur = (unsigned long)trace_pid_next(pid_list, (void *)cur,
                                                    &skipped);

        return (void *)cur;
}

/**
 * trace_pid_show - show the current pid in seq_file processing
 * @m: The seq_file structure to write into
 * @v: A void pointer of the pid (+1) value to display
 *
 * Can be directly used by seq_file operations to display the current
 * pid value. The +1 encoding of @v is undone before printing.
 *
 * Always returns 0.
 */
int trace_pid_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%lu\n", (unsigned long)v - 1);

        return 0;
}

/* The parser buffer is PID_BUF_SIZE + 1 = 128 bytes, much more than enough */
#define PID_BUF_SIZE                127

/**
 * trace_pid_write - build a new pid list from a user space write
 * @filtered_pids: The current pid list, or NULL if there is none
 * @new_pid_list: Where the newly allocated list is stored (NULL if empty)
 * @ubuf: The user space buffer holding the pids to add
 * @cnt: The number of bytes in @ubuf
 *
 * A new list is always allocated, seeded with the pids already set in
 * @filtered_pids (if any) and extended with the pids parsed from @ubuf.
 * The write is all or nothing: on failure neither @filtered_pids nor
 * *@new_pid_list is modified. If the resulting list holds no pid, it is
 * freed and *@new_pid_list is set to NULL.
 *
 * Returns the number of bytes consumed from @ubuf on success, -ENOMEM on
 * allocation failure, -EINVAL for a value that is not a number or is not
 * below the list's pid_max, or the error returned by trace_get_user().
 */
int trace_pid_write(struct trace_pid_list *filtered_pids,
                    struct trace_pid_list **new_pid_list,
                    const char __user *ubuf, size_t cnt)
{
        struct trace_pid_list *pid_list;
        struct trace_parser parser;
        unsigned long val;
        int nr_pids = 0;
        ssize_t read = 0;
        ssize_t ret = 0;
        loff_t pos;
        pid_t pid;

        if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
                return -ENOMEM;

        /*
         * Always recreate a new array. The write is an all or nothing
         * operation. Always create a new array when adding new pids by
         * the user. If the operation fails, then the current list is
         * not modified.
         */
        pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
        if (!pid_list) {
                trace_parser_put(&parser);
                return -ENOMEM;
        }

        pid_list->pid_max = READ_ONCE(pid_max);

        /* Only truncating will shrink pid_max */
        if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
                pid_list->pid_max = filtered_pids->pid_max;

        /* One bit per possible pid, rounded up to whole bytes */
        pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
        if (!pid_list->pids) {
                trace_parser_put(&parser);
                kfree(pid_list);
                return -ENOMEM;
        }

        if (filtered_pids) {
                /* copy the current bits to the new max */
                for_each_set_bit(pid, filtered_pids->pids,
                                 filtered_pids->pid_max) {
                        set_bit(pid, pid_list->pids);
                        nr_pids++;
                }
        }

        while (cnt > 0) {

                pos = 0;

                ret = trace_get_user(&parser, ubuf, cnt, &pos);
                if (ret < 0)
                        break;

                /*
                 * Account for every byte trace_get_user() consumed, even
                 * when it did not load a token (e.g. the input ended in
                 * whitespace). Otherwise those bytes are not reported as
                 * written, and a later write made only of them returns 0
                 * while a list exists, which user space retries forever.
                 */
                read += ret;
                ubuf += ret;
                cnt -= ret;

                if (!trace_parser_loaded(&parser))
                        break;

                ret = -EINVAL;
                if (kstrtoul(parser.buffer, 0, &val))
                        break;
                if (val >= pid_list->pid_max)
                        break;

                pid = (pid_t)val;

                set_bit(pid, pid_list->pids);
                nr_pids++;

                trace_parser_clear(&parser);
                ret = 0;
        }
        trace_parser_put(&parser);

        if (ret < 0) {
                trace_free_pid_list(pid_list);
                return ret;
        }

        if (!nr_pids) {
                /* Cleared the list of pids */
                trace_free_pid_list(pid_list);
                read = ret;
                pid_list = NULL;
        }

        *new_pid_list = pid_list;

        return read;
}

/*
 * Return the current time stamp of @buf for @cpu, normalized by the ring
 * buffer. Falls back to trace_clock_local() while @buf has no ring buffer.
 */
static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
        u64 now;

        if (buf->buffer) {
                now = ring_buffer_time_stamp(buf->buffer, cpu);
                ring_buffer_normalize_time_stamp(buf->buffer, cpu, &now);
                return now;
        }

        /* Early boot up does not have a buffer yet */
        return trace_clock_local();
}

/*
 * ftrace_now - current time stamp of the global trace buffer for @cpu
 *
 * Thin wrapper around buffer_ftrace_now() for global_trace.array_buffer.
 */
u64 ftrace_now(int cpu)
{
        return buffer_ftrace_now(&global_trace.array_buffer, cpu);
}

/**
 * tracing_is_enabled - Show if global_trace has been disabled
 *
 * Shows if the global trace has been enabled or not. It uses the
 * mirror flag "buffer_disabled" to be used in fast paths such as for
 * the irqsoff tracer. But it may be inaccurate due to races. If you
 * need to know the accurate state, use tracing_is_on() which is a little
 * slower, but accurate.
 *
 * Returns non-zero if global_trace.buffer_disabled is clear, 0 otherwise.
 */
int tracing_is_enabled(void)
{
        /*
         * For quick access (irqsoff uses this in fast path), just
         * return the mirror variable of the state of the ring buffer.
         * It's a little racy, but we don't really care.
         */
        smp_rmb();
        return !global_trace.buffer_disabled;
}

/*
 * trace_buf_size is the size in bytes that is allocated
 * for a buffer. Note, the number of bytes is always rounded
 * to page size.
 *
 * The default is purposely kept low: 16384 entries of 88 bytes
 * each, i.e. 1441792 bytes.
 * If the dump on oops happens, it will be much appreciated
 * to not have to wait for all that output. Anyway this can be
 * boot time and run time configurable.
 */
#define TRACE_BUF_SIZE_DEFAULT        1441792UL /* 16384 * 88 (sizeof(entry)) */

static unsigned long                trace_buf_size = TRACE_BUF_SIZE_DEFAULT;

/* trace_types holds a linked list of available tracers. */
static struct tracer                *trace_types __read_mostly;

/*
 * trace_types_lock is used to protect the trace_types list.
 * In this file it is also held while walking ftrace_trace_arrays and
 * while changing a trace array's reference count.
 */
DEFINE_MUTEX(trace_types_lock);

/*
 * serialize the access of the ring buffer
 *
 * The ring buffer serializes readers, but that is only low level
 * protection. The validity of the events (as returned by
 * ring_buffer_peek() etc.) is not protected by the ring buffer.
 *
 * The content of events may become garbage if we allow another process
 * to consume these events concurrently:
 *   A) the page of the consumed events may become a normal page
 *      (not the reader page) in the ring buffer, and this page will be
 *      rewritten by the events producer.
 *   B) the page of the consumed events may become a page for splice_read,
 *      and this page will be returned to the system.
 *
 * These primitives allow multiple processes to access different cpu
 * ring buffers concurrently.
 *
 * These primitives don't distinguish read-only and read-consume access.
 * Multiple read-only accesses are also serialized.
 */

#ifdef CONFIG_SMP
/*
 * all_cpu_access_lock is taken for write to access all cpu buffers at
 * once and for read to access a single cpu buffer; cpu_access_lock then
 * serializes the accesses to that one cpu buffer.
 */
static DECLARE_RWSEM(all_cpu_access_lock);
static DEFINE_PER_CPU(struct mutex, cpu_access_lock);

/*
 * Lock the ring buffer of @cpu for access, or the ring buffers of all
 * cpus when @cpu is RING_BUFFER_ALL_CPUS.
 */
static inline void trace_access_lock(int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                /* gain it for accessing a cpu ring buffer. */

                /* First block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
                down_read(&all_cpu_access_lock);

                /* Then block other access to this @cpu ring buffer. */
                mutex_lock(&per_cpu(cpu_access_lock, cpu));
                return;
        }

        /* gain it for accessing the whole ring buffer. */
        down_write(&all_cpu_access_lock);
}

/*
 * Release what trace_access_lock() took for the same @cpu value, in the
 * reverse order of acquisition.
 */
static inline void trace_access_unlock(int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                mutex_unlock(&per_cpu(cpu_access_lock, cpu));
                up_read(&all_cpu_access_lock);
                return;
        }

        up_write(&all_cpu_access_lock);
}

/* Initialize the per cpu cpu_access_lock mutex of every possible cpu. */
static inline void trace_access_lock_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                mutex_init(&per_cpu(cpu_access_lock, cpu));
}

#else

static DEFINE_MUTEX(access_lock);

/* !CONFIG_SMP: a single mutex serializes all accesses; @cpu is unused. */
static inline void trace_access_lock(int cpu)
{
        (void)cpu;
        mutex_lock(&access_lock);
}

/* !CONFIG_SMP: release the single access_lock mutex; @cpu is unused. */
static inline void trace_access_unlock(int cpu)
{
        (void)cpu;
        mutex_unlock(&access_lock);
}

/* !CONFIG_SMP: access_lock is statically defined, nothing to initialize. */
static inline void trace_access_lock_init(void)
{
}

#endif

#ifdef CONFIG_STACKTRACE
/* Forward declarations; the definitions are not in this part of the file. */
static void __ftrace_trace_stack(struct trace_buffer *buffer,
                                 unsigned long flags,
                                 int skip, int pc, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned long flags,
                                      int skip, int pc, struct pt_regs *regs);

#else
/* !CONFIG_STACKTRACE: empty stub, no stack trace is recorded. */
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
                                        unsigned long flags,
                                        int skip, int pc, struct pt_regs *regs)
{
}
/* !CONFIG_STACKTRACE: empty stub, no stack trace is recorded. */
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned long flags,
                                      int skip, int pc, struct pt_regs *regs)
{
}

#endif

/*
 * Fill in the generic trace_entry fields found at the start of @event's
 * data from @type, @flags and @pc.
 */
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
                  int type, unsigned long flags, int pc)
{
        tracing_generic_entry_update(ring_buffer_event_data(event),
                                     type, flags, pc);
}

/*
 * Reserve @len bytes in @buffer and initialize the generic entry header
 * of the reserved event.
 *
 * Returns the reserved event, or NULL if the reservation failed.
 */
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
                          int type,
                          unsigned long len,
                          unsigned long flags, int pc)
{
        struct ring_buffer_event *reserved;

        reserved = ring_buffer_lock_reserve(buffer, len);
        if (!reserved)
                return NULL;

        trace_event_setup(reserved, type, flags, pc);
        return reserved;
}

/*
 * tracer_tracing_on - enable recording into the buffer of @tr
 *
 * Turns recording on for @tr's ring buffer if it has been allocated, and
 * clears the buffer_disabled mirror flag in either case.
 */
void tracer_tracing_on(struct trace_array *tr)
{
        if (tr->array_buffer.buffer)
                ring_buffer_record_on(tr->array_buffer.buffer);
        /*
         * This flag is looked at when buffers haven't been allocated
         * yet, or by some tracers (like irqsoff), that just want to
         * know if the ring buffer has been disabled, but it can handle
         * races of where it gets disabled but we still do a record.
         * As the check is in the fast path of the tracers, it is more
         * important to be fast than accurate.
         */
        tr->buffer_disabled = 0;
        /* Make the flag seen by readers */
        smp_wmb();
}

/**
 * tracing_on - enable tracing buffers
 *
 * This function enables tracing buffers that may have been
 * disabled with tracing_off. It acts on the global trace array only.
 */
void tracing_on(void)
{
        tracer_tracing_on(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_on);


/*
 * Commit @event to @buffer. An event living in this cpu's temporary
 * trace_buffered_event is copied into @buffer with ring_buffer_write();
 * any other event is committed in place.
 */
static __always_inline void
__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
{
        __this_cpu_write(trace_taskinfo_save, true);

        /* The common case: the event was reserved in the ring buffer */
        if (this_cpu_read(trace_buffered_event) != event) {
                ring_buffer_unlock_commit(buffer, event);
                return;
        }

        /*
         * This is the temp buffer, we need to commit fully.
         * Length is in event->array[0], the data starts at array[1].
         */
        ring_buffer_write(buffer, event->array[0], &event->array[1]);
        /* Release the temp buffer */
        this_cpu_dec(trace_buffered_event_cnt);
}

/**
 * __trace_puts - write a constant string into the trace buffer.
 * @ip:           The address of the caller
 * @str:   The constant string to write
 * @size:  The size of the string.
 *
 * The string is copied into a TRACE_PRINT event of the global trace
 * buffer, and a newline is appended if @str does not end with one.
 *
 * Returns @size if the string was written, 0 if printk tracing is off,
 * tracing is disabled, a selftest is running, or no event could be
 * reserved.
 */
int __trace_puts(unsigned long ip, const char *str, int size)
{
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct print_entry *entry;
        unsigned long irq_flags;
        int alloc;
        int pc;

        if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
                return 0;

        pc = preempt_count();

        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;

        alloc = sizeof(*entry) + size + 2; /* possible \n added */

        local_save_flags(irq_flags);
        buffer = global_trace.array_buffer.buffer;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
                                            irq_flags, pc);
        if (!event) {
                size = 0;
                goto out;
        }

        entry = ring_buffer_event_data(event);
        entry->ip = ip;

        memcpy(&entry->buf, str, size);

        /*
         * Add a newline if necessary. An empty string has no last
         * character to look at (buf[size - 1] would index before the
         * start of buf), so it always gets the newline.
         */
        if (size == 0 || entry->buf[size - 1] != '\n') {
                entry->buf[size] = '\n';
                entry->buf[size + 1] = '\0';
        } else
                entry->buf[size] = '\0';

        __buffer_unlock_commit(buffer, event);
        ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
 out:
        ring_buffer_nest_end(buffer);
        return size;
}
EXPORT_SYMBOL_GPL(__trace_puts);

/**
 * __trace_bputs - write the pointer to a constant string into trace buffer
 * @ip:           The address of the caller
 * @str:   The constant string whose address is written to the buffer
 *
 * Only the pointer @str is stored in the TRACE_BPUTS event, the string
 * itself is not copied.
 *
 * Returns 1 if the event was written, 0 otherwise.
 */
int __trace_bputs(unsigned long ip, const char *str)
{
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct bputs_entry *entry;
        unsigned long irq_flags;
        int written = 0;
        int pc;

        if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
                return 0;

        pc = preempt_count();

        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;

        local_save_flags(irq_flags);
        buffer = global_trace.array_buffer.buffer;

        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS,
                                            sizeof(struct bputs_entry),
                                            irq_flags, pc);
        if (event) {
                entry = ring_buffer_event_data(event);
                entry->ip = ip;
                entry->str = str;

                __buffer_unlock_commit(buffer, event);
                ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc,
                                   NULL);
                written = 1;
        }
        ring_buffer_nest_end(buffer);

        return written;
}
EXPORT_SYMBOL_GPL(__trace_bputs);

#ifdef CONFIG_TRACER_SNAPSHOT
/*
 * Take a snapshot of @tr by calling update_max_tr() with interrupts
 * disabled, passing @cond_data along. Nothing is swapped (only a message
 * is logged) when called from NMI context, when the snapshot buffer is
 * not allocated (tracing is also turned off in that case), or when the
 * current tracer uses the max buffer itself.
 */
static void tracing_snapshot_instance_cond(struct trace_array *tr,
                                           void *cond_data)
{
        struct tracer *cur_tracer = tr->current_trace;
        unsigned long irq_flags;

        if (in_nmi()) {
                internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
                internal_trace_puts("*** snapshot is being ignored        ***\n");
                return;
        }

        if (!tr->allocated_snapshot) {
                internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
                internal_trace_puts("*** stopping trace here!   ***\n");
                tracing_off();
                return;
        }

        /* Note, snapshot can not be used when the tracer uses it */
        if (cur_tracer->use_max_tr) {
                internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
                internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
                return;
        }

        local_irq_save(irq_flags);
        update_max_tr(tr, current, smp_processor_id(), cond_data);
        local_irq_restore(irq_flags);
}

/* Take an unconditional snapshot of @tr (no cond_data is passed). */
void tracing_snapshot_instance(struct trace_array *tr)
{
        tracing_snapshot_instance_cond(tr, NULL);
}

/**
 * tracing_snapshot - take a snapshot of the current buffer.
 *
 * This causes a swap between the snapshot buffer and the current live
 * tracing buffer. You can use this to take snapshots of the live
 * trace when some condition is triggered, but continue to trace.
 *
 * Note, make sure to allocate the snapshot with either
 * a tracing_snapshot_alloc(), or by doing it manually
 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
 *
 * If the snapshot buffer is not allocated, it will stop tracing.
 * Basically making a permanent snapshot.
 *
 * This always acts on the global trace array.
 */
void tracing_snapshot(void)
{
        tracing_snapshot_instance(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_snapshot);

/**
 * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
 * @tr:                The tracing instance to snapshot
 * @cond_data:        The data to be tested conditionally, and possibly saved
 *
 * This is the same as tracing_snapshot() except that it acts on @tr and
 * the snapshot is conditional - the snapshot will only happen if the
 * cond_snapshot.update() implementation receiving the cond_data
 * returns true, which means that the trace array's cond_snapshot
 * update() operation used the cond_data to determine whether the
 * snapshot should be taken, and if it was, presumably saved it along
 * with the snapshot.
 */
void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
{
        tracing_snapshot_instance_cond(tr, cond_data);
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);

/**
 * tracing_cond_snapshot_data - get the user data associated with a snapshot
 * @tr:                The tracing instance
 *
 * When the user enables a conditional snapshot using
 * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
 * with the snapshot.  This accessor is used to retrieve it.
 *
 * Should not be called from cond_snapshot.update(), since it takes
 * the tr->max_lock lock, which the code calling
 * cond_snapshot.update() has already done.
 *
 * Returns the cond_data associated with the trace array's snapshot,
 * or NULL if @tr has no conditional snapshot.
 */
void *tracing_cond_snapshot_data(struct trace_array *tr)
{
        void *data;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);

        data = tr->cond_snapshot ? tr->cond_snapshot->cond_data : NULL;

        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        return data;
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);

/* Forward declarations of helpers used by the snapshot code below. */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
                                        struct array_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct array_buffer *buf, unsigned long val);

/*
 * tracing_alloc_snapshot_instance - allocate the snapshot buffer of @tr
 *
 * Does nothing if the snapshot is already allocated. Otherwise the max
 * buffer is resized after the main buffer, for all cpus, and the
 * snapshot is marked as allocated.
 *
 * Returns 0 on success, or the negative error from the resize.
 */
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
        int err;

        if (tr->allocated_snapshot)
                return 0;

        /* allocate spare buffer */
        err = resize_buffer_duplicate_size(&tr->max_buffer,
                                           &tr->array_buffer,
                                           RING_BUFFER_ALL_CPUS);
        if (err < 0)
                return err;

        tr->allocated_snapshot = true;
        return 0;
}

/*
 * Release the snapshot of @tr: shrink the max buffer to a minimal size,
 * reset it and mark the snapshot as not allocated.
 */
static void free_snapshot(struct trace_array *tr)
{
        /*
         * We don't free the ring buffer; instead, we resize it, because
         * the max_tr ring buffer has some state (e.g. ring->clock) and
         * we want to preserve it.
         */
        ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
        set_buffer_entries(&tr->max_buffer, 1);
        tracing_reset_online_cpus(&tr->max_buffer);
        tr->allocated_snapshot = false;
}

/**
 * tracing_alloc_snapshot - allocate snapshot buffer.
 *
 * This only allocates the snapshot buffer if it isn't already
 * allocated - it doesn't also take a snapshot.
 *
 * This is meant to be used in cases where the snapshot buffer needs
 * to be set up for events that can't sleep but need to be able to
 * trigger a snapshot.
 *
 * Returns 0 on success; a failure triggers a warning and its negative
 * error code is returned.
 */
int tracing_alloc_snapshot(void)
{
        int ret = tracing_alloc_snapshot_instance(&global_trace);

        WARN_ON(ret < 0);
        return ret;
}
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);

/**
 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
 *
 * This is similar to tracing_snapshot(), but it will allocate the
 * snapshot buffer if it isn't already allocated. Use this only
 * where it is safe to sleep, as the allocation may sleep.
 *
 * This causes a swap between the snapshot buffer and the current live
 * tracing buffer. You can use this to take snapshots of the live
 * trace when some condition is triggered, but continue to trace.
 *
 * No snapshot is taken if the allocation fails.
 */
void tracing_snapshot_alloc(void)
{
        if (tracing_alloc_snapshot() < 0)
                return;

        tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);

/**
 * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
 * @tr:                The tracing instance
 * @cond_data:        User data to associate with the snapshot
 * @update:        Implementation of the cond_snapshot update function
 *
 * Check whether the conditional snapshot for the given instance has
 * already been enabled, or if the current tracer is already using a
 * snapshot; if so, return -EBUSY, else create a cond_snapshot and
 * save the cond_data and update function inside.
 *
 * Returns 0 if successful, -ENOMEM if the cond_snapshot could not be
 * allocated, -EBUSY as described above, or the error from allocating
 * the snapshot buffer.
 */
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
                                 cond_update_fn_t update)
{
        struct cond_snapshot *new_cond;
        int err;

        new_cond = kzalloc(sizeof(*new_cond), GFP_KERNEL);
        if (!new_cond)
                return -ENOMEM;

        new_cond->cond_data = cond_data;
        new_cond->update = update;

        mutex_lock(&trace_types_lock);

        err = tracing_alloc_snapshot_instance(tr);
        if (err)
                goto out_free;

        /*
         * Besides a tracer that uses the max buffer itself, an already
         * installed cond_snapshot also makes us busy.
         *
         * The cond_snapshot can only change to NULL without the
         * trace_types_lock. We don't care if we race with it going
         * to NULL, but we want to make sure that it's not set to
         * something other than NULL when we get here, which we can
         * do safely with only holding the trace_types_lock and not
         * having to take the max_lock.
         */
        if (tr->current_trace->use_max_tr || tr->cond_snapshot) {
                err = -EBUSY;
                goto out_free;
        }

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        tr->cond_snapshot = new_cond;
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        mutex_unlock(&trace_types_lock);

        return 0;

 out_free:
        mutex_unlock(&trace_types_lock);
        kfree(new_cond);
        return err;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);

/**
 * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
 * @tr:                The tracing instance
 *
 * Check whether the conditional snapshot for the given instance is
 * enabled; if so, free the cond_snapshot associated with it,
 * otherwise return -EINVAL.
 *
 * The check and the release are done under tr->max_lock with
 * interrupts disabled.
 *
 * Returns 0 if successful, error otherwise.
 */
int tracing_snapshot_cond_disable(struct trace_array *tr)
{
        int err = 0;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);

        if (tr->cond_snapshot) {
                kfree(tr->cond_snapshot);
                tr->cond_snapshot = NULL;
        } else {
                err = -EINVAL;
        }

        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        return err;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
/* !CONFIG_TRACER_SNAPSHOT stub: only warns (once) that it was called. */
void tracing_snapshot(void)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
/* !CONFIG_TRACER_SNAPSHOT stub: only warns (once) that it was called. */
void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
/* !CONFIG_TRACER_SNAPSHOT stub: warns (once) and always fails with -ENODEV. */
int tracing_alloc_snapshot(void)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
        return -ENODEV;
}
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
/* !CONFIG_TRACER_SNAPSHOT stub: defers to the tracing_snapshot() stub. */
void tracing_snapshot_alloc(void)
{
        /* Give warning */
        tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
/* !CONFIG_TRACER_SNAPSHOT stub: there is never any cond_data, returns NULL. */
void *tracing_cond_snapshot_data(struct trace_array *tr)
{
        return NULL;
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
/* !CONFIG_TRACER_SNAPSHOT stub: always fails with -ENODEV. */
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
{
        return -ENODEV;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
/*
 * !CONFIG_TRACER_SNAPSHOT stub: nothing to disable, always returns 0.
 * NOTE(review): the tracing_snapshot_cond_enable() stub returns -ENODEV;
 * confirm that reporting success here is intended.
 */
int tracing_snapshot_cond_disable(struct trace_array *tr)
{
        return 0;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#endif /* CONFIG_TRACER_SNAPSHOT */

/*
 * tracer_tracing_off - stop recording into the buffer of @tr
 *
 * Turns recording off for @tr's ring buffer if it has been allocated,
 * and sets the buffer_disabled mirror flag in either case.
 */
void tracer_tracing_off(struct trace_array *tr)
{
        if (tr->array_buffer.buffer)
                ring_buffer_record_off(tr->array_buffer.buffer);
        /*
         * This flag is looked at when buffers haven't been allocated
         * yet, or by some tracers (like irqsoff), that just want to
         * know if the ring buffer has been disabled, but it can handle
         * races of where it gets disabled but we still do a record.
         * As the check is in the fast path of the tracers, it is more
         * important to be fast than accurate.
         */
        tr->buffer_disabled = 1;
        /* Make the flag seen by readers */
        smp_wmb();
}

/**
 * tracing_off - turn off tracing buffers
 *
 * This function stops the tracing buffers from recording data.
 * It does not disable any overhead the tracers themselves may
 * be causing. This function simply causes all recording to
 * the ring buffers to fail. It acts on the global trace array only.
 */
void tracing_off(void)
{
        tracer_tracing_off(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_off);

/*
 * If __disable_trace_on_warning is set, log a message into the global
 * trace buffer and then turn tracing off.
 */
void disable_trace_on_warning(void)
{
        if (!__disable_trace_on_warning)
                return;

        trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
                               "Disabling tracing due to warning\n");
        tracing_off();
}

/**
 * tracer_tracing_is_on - show real state of ring buffer enabled
 * @tr : the trace array to know if ring buffer is enabled
 *
 * Shows real state of the ring buffer if it is enabled or not. While
 * the ring buffer is not allocated, the buffer_disabled flag of @tr is
 * reported instead.
 */
bool tracer_tracing_is_on(struct trace_array *tr)
{
        /* No ring buffer yet: fall back to the mirror flag */
        if (!tr->array_buffer.buffer)
                return !tr->buffer_disabled;

        return ring_buffer_record_is_on(tr->array_buffer.buffer);
}

/**
 * tracing_is_on - show state of ring buffers enabled
 *
 * Returns the result of tracer_tracing_is_on() for the global trace array.
 */
int tracing_is_on(void)
{
        return tracer_tracing_is_on(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_is_on);

/*
 * Handler of the "trace_buf_size=" boot parameter: parse the size with
 * memparse() and store it, raised to at least 4096, in trace_buf_size.
 *
 * Returns 1 if the parameter was handled, 0 if @str is NULL.
 */
static int __init set_buf_size(char *str)
{
        unsigned long bytes;

        if (!str)
                return 0;

        bytes = memparse(str, &str);

        /*
         * nr_entries can not be zero and the startup
         * tests require some buffer space. Therefore
         * ensure we have at least 4096 bytes of buffer.
         */
        if (bytes < 4096UL)
                bytes = 4096UL;

        trace_buf_size = bytes;
        return 1;
}
__setup("trace_buf_size=", set_buf_size);

/*
 * Handler of the "tracing_thresh=" boot parameter: parse the number and
 * store it, multiplied by 1000, in tracing_thresh.
 * NOTE(review): the factor presumably converts microseconds to
 * nanoseconds - confirm against the users of tracing_thresh.
 *
 * Returns 1 if the parameter was handled, 0 if @str is NULL or is not a
 * valid number.
 */
static int __init set_tracing_thresh(char *str)
{
        unsigned long thresh;

        if (!str)
                return 0;

        if (kstrtoul(str, 0, &thresh) < 0)
                return 0;

        tracing_thresh = thresh * 1000;
        return 1;
}
__setup("tracing_thresh=", set_tracing_thresh);

/* Convert @nsecs nanoseconds to whole microseconds, rounding down. */
unsigned long nsecs_to_usecs(unsigned long nsecs)
{
        const unsigned long nsecs_per_usec = 1000;

        return nsecs / nsecs_per_usec;
}

/*
 * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
 * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
 * of strings in the order that the evals (enum) were defined.
 */
#undef C
#define C(a, b) b

/*
 * These must match the bit positions in trace_iterator_flags.
 * The array is NULL terminated.
 */
static const char *trace_options[] = {
        TRACE_FLAGS
        NULL
};

/*
 * Table of the clocks a trace array can use. Each entry holds the
 * function that reads the clock, the clock's name and whether the clock
 * counts in nanoseconds. A trace array selects an entry through its
 * clock_id index (see trace_clock_in_ns()).
 *
 * ARCH_TRACE_CLOCKS is expanded at the end of the table; it is defined
 * outside this file section (presumably architecture specific entries,
 * verify against its definition).
 */
static struct {
        u64 (*func)(void);
        const char *name;
        int in_ns;                /* is this clock in nanoseconds? */
} trace_clocks[] = {
        { trace_clock_local,                "local",        1 },
        { trace_clock_global,                "global",        1 },
        { trace_clock_counter,                "counter",        0 },
        { trace_clock_jiffies,                "uptime",        0 },
        { trace_clock,                        "perf",                1 },
        { ktime_get_mono_fast_ns,        "mono",                1 },
        { ktime_get_raw_fast_ns,        "mono_raw",        1 },
        { ktime_get_boot_fast_ns,        "boot",                1 },
        ARCH_TRACE_CLOCKS
};

/*
 * trace_clock_in_ns - tell whether the clock used by @tr counts nanoseconds
 *
 * Looks up the in_ns flag of the trace_clocks[] entry selected by
 * @tr->clock_id.
 */
bool trace_clock_in_ns(struct trace_array *tr)
{
        return trace_clocks[tr->clock_id].in_ns ? true : false;
}

/*
 * trace_parser_get_init - set up a trace parser and allocate its buffer
 *
 * The parser is zeroed first, then a buffer of @size bytes is allocated
 * with GFP_KERNEL. Returns 0 on success and 1 if the allocation failed
 * (the parser is then left zeroed).
 */
int trace_parser_get_init(struct trace_parser *parser, int size)
{
        memset(parser, 0, sizeof(*parser));

        parser->buffer = kmalloc(size, GFP_KERNEL);
        if (parser->buffer) {
                parser->size = size;
                return 0;
        }

        return 1;
}

/*
 * trace_parser_put - release the buffer owned by a trace parser
 *
 * The buffer pointer is reset to NULL after it has been freed.
 */
void trace_parser_put(struct trace_parser *parser)
{
        kfree(parser->buffer);

        /* Do not leave a dangling pointer behind */
        parser->buffer = NULL;
}

/*
 * trace_get_user - read one whitespace separated word from user space
 * (whitespace is whatever isspace(ch) matches)
 *
 * @parser: parser state; its buffer receives the word
 * @ubuf:   user buffer to read from
 * @cnt:    number of bytes available in @ubuf
 * @ppos:   file position; the parser is cleared when *ppos is zero, and
 *          *ppos is advanced by the number of bytes consumed on success
 *
 * For each string found the 'struct trace_parser' is updated,
 * and the function returns. If the input ends in the middle of a word,
 * parser->cont is set so that the next call continues that word instead
 * of starting a new one.
 *
 * Returns number of bytes read on success. On failure trace_parser_fail()
 * is called and either the get_user() error or -EINVAL (the word does not
 * fit into the parser buffer) is returned.
 *
 * See kernel/trace/trace.h for 'struct trace_parser' details.
 */
int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
        size_t cnt, loff_t *ppos)
{
        char ch;
        size_t read = 0;
        ssize_t ret;

        /* A write at position zero starts a fresh parse */
        if (!*ppos)
                trace_parser_clear(parser);

        /*
         * The first byte is always fetched.
         * NOTE(review): cnt is decremented without being checked first,
         * so this assumes cnt >= 1 on entry - confirm against callers.
         */
        ret = get_user(ch, ubuf++);
        if (ret)
                goto fail;

        read++;
        cnt--;

        /*
         * The parser is not finished with the last write,
         * continue reading the user input without skipping spaces.
         */
        if (!parser->cont) {
                /* skip white space */
                while (cnt && isspace(ch)) {
                        ret = get_user(ch, ubuf++);
                        if (ret)
                                goto fail;
                        read++;
                        cnt--;
                }

                /* A new word starts at the beginning of the buffer */
                parser->idx = 0;

                /* only spaces were written */
                if (isspace(ch) || !ch) {
                        *ppos += read;
                        return read;
                }
        }

        /* read the non-space input */
        while (cnt && !isspace(ch) && ch) {
                /* One byte is always kept free for the terminating '\0' */
                if (parser->idx < parser->size - 1)
                        parser->buffer[parser->idx++] = ch;
                else {
                        ret = -EINVAL;
                        goto fail;
                }

                ret = get_user(ch, ubuf++);
                if (ret)
                        goto fail;
                read++;
                cnt--;
        }

        /* We either got finished input or we have to wait for another call. */
        if (isspace(ch) || !ch) {
                /* The word is complete: terminate it */
                parser->buffer[parser->idx] = 0;
                parser->cont = false;
        } else if (parser->idx < parser->size - 1) {
                /* Input ran out mid-word: store the last byte and continue later */
                parser->cont = true;
                parser->buffer[parser->idx++] = ch;
                /* Make sure the parsed string always terminates with '\0'. */
                parser->buffer[parser->idx] = 0;
        } else {
                ret = -EINVAL;
                goto fail;
        }

        *ppos += read;
        return read;
fail:
        trace_parser_fail(parser);
        return ret;
}

/*
 * trace_seq_to_buffer - copy unread trace_seq data into a plain buffer
 *
 * Copies at most @cnt bytes, starting at the current read position of @s,
 * into @buf and advances the read position by the amount copied.
 * Returns the number of bytes copied, or -EBUSY when there is nothing
 * left to read.
 *
 * TODO add a seq_buf_to_buffer()
 */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
        int avail;

        if (trace_seq_used(s) <= s->seq.readpos)
                return -EBUSY;

        /* Never hand out more than what is still unread */
        avail = trace_seq_used(s) - s->seq.readpos;
        if (cnt > avail)
                cnt = avail;

        memcpy(buf, &s->buffer[s->seq.readpos], cnt);
        s->seq.readpos += cnt;

        return cnt;
}

/*
 * Tracing threshold. set_tracing_thresh() stores the value of the
 * "tracing_thresh=" boot parameter here, multiplied by 1000.
 */
unsigned long __read_mostly        tracing_thresh;
/* Forward declaration; used below when creating "tracing_max_latency" */
static const struct file_operations tracing_max_lat_fops;

#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
        defined(CONFIG_FSNOTIFY)

/*
 * Workqueue used to deliver the tracing_max_latency fsnotify events.
 * Allocated by latency_fsnotify_init(); NULL until then.
 */
static struct workqueue_struct *fsnotify_wq;

/*
 * Work function: send an FS_MODIFY notification for the inode of the
 * trace array's tracing_max_latency file.
 */
static void latency_fsnotify_workfn(struct work_struct *work)
{
        struct trace_array *tr;

        tr = container_of(work, struct trace_array, fsnotify_work);
        fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}

/*
 * irq_work callback: hand the notification over to fsnotify_wq, where
 * latency_fsnotify_workfn() will run.
 */
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
{
        struct trace_array *tr;

        tr = container_of(iwork, struct trace_array, fsnotify_irqwork);
        queue_work(fsnotify_wq, &tr->fsnotify_work);
}

/*
 * Create the "tracing_max_latency" file of @tr below @d_tracer and
 * prepare the work items used to send fsnotify events for it.
 */
static void trace_create_maxlat_file(struct trace_array *tr,
                                     struct dentry *d_tracer)
{
        struct dentry *dentry;

        /* The notification machinery must be ready before the file exists */
        INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
        init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);

        dentry = trace_create_file("tracing_max_latency", 0644, d_tracer,
                                   &tr->max_latency, &tracing_max_lat_fops);
        tr->d_max_latency = dentry;
}

/*
 * Allocate the workqueue that delivers tracing_max_latency notifications.
 * Returns 0 on success or -ENOMEM if the workqueue could not be allocated.
 */
__init static int latency_fsnotify_init(void)
{
        fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
                                      WQ_UNBOUND | WQ_HIGHPRI, 0);
        if (fsnotify_wq)
                return 0;

        pr_err("Unable to allocate tr_max_lat_wq\n");
        return -ENOMEM;
}

late_initcall_sync(latency_fsnotify_init);

/*
 * latency_fsnotify - request an fsnotify event for tracing_max_latency
 *
 * Does nothing until the notification workqueue has been allocated.
 */
void latency_fsnotify(struct trace_array *tr)
{
        /*
         * We cannot call queue_work(&tr->fsnotify_work) from here because it's
         * possible that we are called from __schedule() or do_idle(), which
         * could cause a deadlock. Go through irq_work instead.
         */
        if (fsnotify_wq)
                irq_work_queue(&tr->fsnotify_irqwork);
}

/*
 * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
 *  defined(CONFIG_FSNOTIFY)
 */
#else

/*
 * Without fsnotify support for the max latency file, creating it is a
 * plain trace_create_file() call with no notification work to set up.
 */
#define trace_create_maxlat_file(tr, d_tracer)                                \
        trace_create_file("tracing_max_latency", 0644, d_tracer,        \
                          &tr->max_latency, &tracing_max_lat_fops)

#endif

#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Record the details of a new maximum latency in the max buffer of @tr:
 * the latency and critical section bounds taken from the live buffer of
 * @cpu, and the identity and scheduling parameters of @tsk. (This way the
 * maximum trace is permanently saved, for later retrieval via
 * /sys/kernel/tracing/tracing_max_latency)
 */
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
        struct array_buffer *live_buf = &tr->array_buffer;
        struct array_buffer *snap_buf = &tr->max_buffer;
        struct trace_array_cpu *src = per_cpu_ptr(live_buf->data, cpu);
        struct trace_array_cpu *dst = per_cpu_ptr(snap_buf->data, cpu);

        snap_buf->cpu = cpu;
        snap_buf->time_start = src->preempt_timestamp;

        /* Latency and the critical section it was measured over */
        dst->saved_latency = tr->max_latency;
        dst->critical_start = src->critical_start;
        dst->critical_end = src->critical_end;

        /* Scheduling parameters of the task */
        dst->policy = tsk->policy;
        dst->rt_priority = tsk->rt_priority;
        dst->nice = tsk->static_prio - 20 - MAX_RT_PRIO;

        /* Identity of the task */
        strncpy(dst->comm, tsk->comm, TASK_COMM_LEN);
        dst->pid = tsk->pid;

        /*
         * task_uid() is only used for tasks other than current:
         * current_uid() does not use RCU, and the irq tracer can be
         * called out of RCU scope.
         */
        if (tsk != current)
                dst->uid = task_uid(tsk);
        else
                dst->uid = current_uid();

        /* record this tasks comm */
        tracing_record_cmdline(tsk);
        latency_fsnotify(tr);
}

/**
 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
 * @tr: tracer
 * @tsk: the task with the latency
 * @cpu: The cpu that initiated the trace.
 * @cond_data: User data associated with a conditional snapshot
 *
 * Flip the buffers between the @tr and the max_tr and record information
 * about which task was the cause of this latency.
 *
 * Nothing is done while @tr is stopped (stop_count set) or when no
 * snapshot buffer has been allocated. Interrupts are expected to be
 * disabled by the caller; a warning is issued once otherwise.
 */
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
              void *cond_data)
{
        if (tr->stop_count)
                return;

        WARN_ON_ONCE(!irqs_disabled());

        if (!tr->allocated_snapshot) {
                /* Only the nop tracer should hit this when disabling */
                WARN_ON_ONCE(tr->current_trace != &nop_trace);
                return;
        }

        arch_spin_lock(&tr->max_lock);

        /*
         * Inherit the recordable setting from array_buffer. This is done
         * before the conditional check below, so it also applies when
         * the swap ends up being skipped.
         */
        if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
                ring_buffer_record_on(tr->max_buffer.buffer);
        else
                ring_buffer_record_off(tr->max_buffer.buffer);

#ifdef CONFIG_TRACER_SNAPSHOT
        /* A conditional snapshot may veto the swap through its update() */
        if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
                goto out_unlock;
#endif
        swap(tr->array_buffer.buffer, tr->max_buffer.buffer);

        __update_max_tr(tr, tsk, cpu);

        /*
         * NOTE(review): update_max_tr_single() wakes the ring buffer
         * waiters after its swap, this function does not - confirm that
         * the difference is intended.
         */
 out_unlock:
        arch_spin_unlock(&tr->max_lock);
}

/**
 * update_max_tr_single - only copy one trace over, and reset the rest
 * @tr: tracer
 * @tsk: task with the latency
 * @cpu: the cpu of the buffer to copy.
 *
 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
 *
 * Nothing is done while @tr is stopped (stop_count set) or when no
 * snapshot buffer has been allocated. Interrupts are expected to be
 * disabled by the caller; a warning is issued once otherwise.
 */
void
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
        int ret;

        if (tr->stop_count)
                return;

        WARN_ON_ONCE(!irqs_disabled());
        if (!tr->allocated_snapshot) {
                /* Only the nop tracer should hit this when disabling */
                WARN_ON_ONCE(tr->current_trace != &nop_trace);
                return;
        }

        arch_spin_lock(&tr->max_lock);

        ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);

        if (ret == -EBUSY) {
                /*
                 * We failed to swap the buffer due to a commit taking
                 * place on this CPU. We fail to record, but we reset
                 * the max trace buffer (no one writes directly to it)
                 * and flag that it failed.
                 * Another reason is resize is in progress.
                 */
                trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
                        "Failed to swap buffers due to commit or resize in progress\n");
        }

        /* -EAGAIN and -EBUSY are the only failures that are tolerated silently */
        WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);

        /* The latency information is recorded even when the swap failed */
        __update_max_tr(tr, tsk, cpu);
        arch_spin_unlock(&tr->max_lock);

        /* Any waiters on the old snapshot buffer need to wake up */
        ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
#endif /* CONFIG_TRACER_MAX_TRACE */

/*
 * Wait for data on the ring buffer that @iter reads from. Returns 0
 * right away when @iter uses a buffer iterator, otherwise the result of
 * ring_buffer_wait().
 */
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
        int status = 0;

        /* Iterators are static, they should be filled or empty */
        if (!trace_buffer_iter(iter, iter->cpu_file)) {
                status = ring_buffer_wait(iter->array_buffer->buffer,
                                          iter->cpu_file, full);

#ifdef CONFIG_TRACER_MAX_TRACE
                /*
                 * Make sure this is still the snapshot buffer, as if a
                 * snapshot were to happen, this would now be the main
                 * buffer.
                 */
                if (iter->snapshot)
                        iter->array_buffer = &iter->tr->max_buffer;
#endif
        }

        return status;
}

#ifdef CONFIG_FTRACE_STARTUP_TEST
/* Set by init_trace_selftests(); until then selftests are postponed */
static bool selftests_can_run;

/* One postponed selftest: the tracer whose selftest still has to run */
struct trace_selftests {
        struct list_head                list;
        struct tracer                        *type;
};

/* List of struct trace_selftests queued by save_selftest() */
static LIST_HEAD(postponed_selftests);

/*
 * Queue the selftest of @type on postponed_selftests so that it can be
 * run later. Returns 0 on success or -ENOMEM if the list entry could not
 * be allocated.
 */
static int save_selftest(struct tracer *type)
{
        struct trace_selftests *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return -ENOMEM;

        entry->type = type;
        list_add(&entry->list, &postponed_selftests);

        return 0;
}

/*
 * run_tracer_selftest - run (or postpone) the selftest of a tracer
 * @type: the tracer to test
 *
 * Returns 0 when there is nothing to run (no selftest, or selftests are
 * disabled) or the test passed, the result of save_selftest() when the
 * test had to be postponed, and -1 when the test failed.
 */
static int run_tracer_selftest(struct tracer *type)
{
        struct trace_array *tr = &global_trace;
        struct tracer *saved_tracer = tr->current_trace;
        int ret;

        if (!type->selftest || tracing_selftest_disabled)
                return 0;

        /*
         * If a tracer registers early in boot up (before scheduling is
         * initialized and such), then do not run its selftests yet.
         * Instead, run it a little later in the boot process.
         */
        if (!selftests_can_run)
                return save_selftest(type);

        /*
         * Run a selftest on this tracer.
         * Here we reset the trace buffer, and set the current
         * tracer to be this tracer. The tracer can then run some
         * internal tracing to verify that everything is in order.
         * If we fail, we do not register this tracer.
         */
        tracing_reset_online_cpus(&tr->array_buffer);

        tr->current_trace = type;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (type->use_max_tr) {
                /* If we expanded the buffers, make sure the max is expanded too */
                if (ring_buffer_expanded)
                        ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
                                           RING_BUFFER_ALL_CPUS);
                tr->allocated_snapshot = true;
        }
#endif

        /* the test is responsible for initializing and enabling */
        pr_info("Testing tracer %s: ", type->name);
        ret = type->selftest(type, tr);
        /* the test is responsible for resetting too */
        tr->current_trace = saved_tracer;
        if (ret) {
                /* Continues the "Testing tracer" line printed above */
                printk(KERN_CONT "FAILED!\n");
                /* Add the warning after printing 'FAILED' */
                WARN_ON(1);
                return -1;
        }
        /* Only reset on passing, to avoid touching corrupted buffers */
        tracing_reset_online_cpus(&tr->array_buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (type->use_max_tr) {
                /* Undo the snapshot setup done before the test */
                tr->allocated_snapshot = false;

                /* Shrink the max buffer again */
                if (ring_buffer_expanded)
                        ring_buffer_resize(tr->max_buffer.buffer, 1,
                                           RING_BUFFER_ALL_CPUS);
        }
#endif

        printk(KERN_CONT "PASSED\n");
        return 0;
}

static __init int init_trace_selftests(void)
{
        struct trace_selftests *p, *n;
        struct tracer *t, **last;
        int ret;

        selftests_can_run = true;

        mutex_lock(&trace_types_lock);

        if (list_empty(&postponed_selftests))
                goto out;

        pr_info("Running postponed tracer tests:\n");

        tracing_selftest_running = true;
        list_for_each_entry_safe(p, n, &postponed_selftests, list) {
                /* This loop can take minutes when sanitizers are enabled, so
                 * lets make sure we allow RCU processing.
                 */
                cond_resched();
                ret = run_tracer_selftest(p->type);
                /* If the test fails, then warn and remove from available_tracers */
                if (ret < 0) {
                        WARN(1, "tracer: %s failed selftest, disabling\n",
                             p->type->name);
                        last = &trace_types;
                        for (t = trace_types; t; t = t->next) {
                                if (t == p->type) {
                                        *last = t->next;
                                        break;
                                }
                                last = &t->next;
                        }
                }
                list_del(&p->list);
                kfree(p);
        }
        tracing_selftest_running = false;

 out:
        mutex_unlock(&trace_types_lock);

        return 0;
}
core_initcall(init_trace_selftests);
#else
/* Startup selftests are not built in: every tracer trivially passes. */
static inline int run_tracer_selftest(struct tracer *type)
{
        return 0;
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */

/* Forward declarations of helpers used by register_tracer() below */
static void add_tracer_options(struct trace_array *tr, struct tracer *t);

static void __init apply_trace_boot_options(void);

/**
 * register_tracer - register a tracer with the ftrace system.
 * @type: the plugin for the tracer
 *
 * Register a new plugin tracer: validate its name, fill in default flag
 * handling where the tracer provides none, run its selftest and add it
 * to the list of tracer types. If @type is the tracer selected by
 * default_bootup_tracer it is started right away.
 *
 * Returns 0 on success, -EPERM when tracefs is locked down, -ENOMEM when
 * the default flags can not be allocated, the selftest result when that
 * is negative, and -1 for a missing, too long or already registered name.
 */
int __init register_tracer(struct tracer *type)
{
        struct tracer *cur;
        int ret = 0;

        if (!type->name) {
                pr_info("Tracer must have a name\n");
                return -1;
        }

        if (strlen(type->name) >= MAX_TRACER_SIZE) {
                pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
                return -1;
        }

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Can not register tracer %s due to lockdown\n",
                           type->name);
                return -EPERM;
        }

        mutex_lock(&trace_types_lock);

        tracing_selftest_running = true;

        /* Refuse a second tracer with the same name */
        for (cur = trace_types; cur; cur = cur->next) {
                if (strcmp(type->name, cur->name) != 0)
                        continue;

                pr_info("Tracer %s already registered\n",
                        type->name);
                ret = -1;
                goto out;
        }

        if (!type->set_flag)
                type->set_flag = &dummy_set_flag;

        if (type->flags) {
                if (!type->flags->opts)
                        type->flags->opts = dummy_tracer_opt;
        } else {
                /* allocate a dummy tracer_flags */
                type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
                if (!type->flags) {
                        ret = -ENOMEM;
                        goto out;
                }
                type->flags->val = 0;
                type->flags->opts = dummy_tracer_opt;
        }

        /* store the tracer for __set_tracer_option */
        type->flags->trace = type;

        ret = run_tracer_selftest(type);
        if (ret < 0)
                goto out;

        /* Push the tracer on the front of the list of known tracers */
        type->next = trace_types;
        trace_types = type;
        add_tracer_options(&global_trace, type);

 out:
        tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);

        if (ret || !default_bootup_tracer)
                return ret;

        /* Do we want this tracer to start on bootup? */
        if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE) != 0)
                return 0;

        printk(KERN_INFO "Starting tracer '%s'\n", type->name);
        tracing_set_tracer(&global_trace, type->name);
        default_bootup_tracer = NULL;

        apply_trace_boot_options();

        /* disable other selftests, since this will break it. */
        disable_tracing_selftest("running a tracer");

        return 0;
}

/*
 * Reset the ring buffer of a single @cpu in @buf. Recording is disabled
 * around the reset. Does nothing when @buf has no ring buffer.
 */
static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
        struct trace_buffer *rb = buf->buffer;

        if (rb) {
                ring_buffer_record_disable(rb);

                /* Make sure all commits have finished */
                synchronize_rcu();
                ring_buffer_reset_cpu(rb, cpu);

                ring_buffer_record_enable(rb);
        }
}

/*
 * Reset the ring buffers of all online CPUs in @buf and restart its
 * time_start. Recording is disabled around the reset. Does nothing when
 * @buf has no ring buffer.
 */
void tracing_reset_online_cpus(struct array_buffer *buf)
{
        struct trace_buffer *rb = buf->buffer;

        if (rb) {
                ring_buffer_record_disable(rb);

                /* Make sure all commits have finished */
                synchronize_rcu();

                buf->time_start = buffer_ftrace_now(buf, buf->cpu);

                ring_buffer_reset_online_cpus(rb);

                ring_buffer_record_enable(rb);
        }
}

/*
 * Reset the buffers of every trace array that has clear_trace set, and
 * clear that flag.
 *
 * Must have trace_types_lock held.
 */
void tracing_reset_all_online_cpus_unlocked(void)
{
        struct trace_array *tr;

        lockdep_assert_held(&trace_types_lock);

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (tr->clear_trace) {
                        tr->clear_trace = false;
                        tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
                        tracing_reset_online_cpus(&tr->max_buffer);
#endif
                }
        }
}

/*
 * Locked variant of tracing_reset_all_online_cpus_unlocked(): takes
 * trace_types_lock around the reset.
 */
void tracing_reset_all_online_cpus(void)
{
        mutex_lock(&trace_types_lock);

        tracing_reset_all_online_cpus_unlocked();

        mutex_unlock(&trace_types_lock);
}

/*
 * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
 * is the tgid last observed corresponding to pid=i.
 */
static int *tgid_map;

/* The maximum valid index into tgid_map. */
static size_t tgid_map_max;

/* Number of cmdline slots requested when the buffer is first created */
#define SAVED_CMDLINES_DEFAULT 128
/* Marks a map entry that does not refer to any saved cmdline */
#define NO_CMDLINE_MAP UINT_MAX
/*
 * Preemption must be disabled before acquiring trace_cmdline_lock.
 * The various trace_arrays' max_lock must be acquired in a context
 * where interrupt is disabled.
 */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/*
 * Storage for the saved task comms, and the maps between pids and the
 * slots the comms are stored in.
 */
struct saved_cmdlines_buffer {
        /* slot for a pid, indexed by pid & (PID_MAX_DEFAULT - 1) */
        unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
        /* pid stored in each slot; lives right after saved_cmdlines */
        unsigned *map_cmdline_to_pid;
        /* number of slots */
        unsigned cmdline_num;
        /* slot that was handed out last */
        int cmdline_idx;
        /* cmdline_num entries of TASK_COMM_LEN bytes each */
        char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;

/* Holds the size of a cmdline and pid element */
#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)                        \
        (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))

/* Return the start of the comm stored in slot @idx of savedcmd. */
static inline char *get_saved_cmdlines(int idx)
{
        return savedcmd->saved_cmdlines + idx * TASK_COMM_LEN;
}

/* Copy at most TASK_COMM_LEN bytes of @cmdline into slot @idx. */
static inline void set_cmdline(int idx, const char *cmdline)
{
        char *slot = get_saved_cmdlines(idx);

        strncpy(slot, cmdline, TASK_COMM_LEN);
}

/*
 * free_saved_cmdlines_buffer - release a buffer obtained from
 * allocate_cmdlines_buffer()
 *
 * The page order is recomputed from the number of elements. Every element
 * occupies a comm slot plus a map_cmdline_to_pid entry, which is how the
 * allocation side sizes the buffer, so the same element size is used here
 * to arrive at the order that was allocated.
 */
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
        int order = get_order(sizeof(*s) +
                              s->cmdline_num * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s));

        kmemleak_free(s);
        free_pages((unsigned long)s, order);
}

/*
 * allocate_cmdlines_buffer - allocate a saved cmdlines buffer
 * @val: minimum number of cmdlines the buffer has to hold
 *
 * The header, the saved comms and the map_cmdline_to_pid array share one
 * page allocation. The number of elements is raised to whatever fits in
 * the pages actually allocated. Both maps start out filled with
 * NO_CMDLINE_MAP.
 *
 * Returns the new buffer, or NULL if the pages could not be allocated.
 */
static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
        struct saved_cmdlines_buffer *s;
        struct page *page;
        int orig_size, size;
        int order;

        /* Figure out how much is needed to hold the given number of cmdlines */
        orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
        order = get_order(orig_size);
        /* Size in bytes of the 2^order pages that get allocated */
        size = 1 << (order + PAGE_SHIFT);
        page = alloc_pages(GFP_KERNEL, order);
        if (!page)
                return NULL;

        s = page_address(page);
        kmemleak_alloc(s, size, 1, GFP_KERNEL);
        memset(s, 0, sizeof(*s));

        /* Round up to actual allocation */
        val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
        s->cmdline_num = val;

        /* Place map_cmdline_to_pid array right after saved_cmdlines */
        s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];

        s->cmdline_idx = 0;
        /* memset() stores only the low byte of NO_CMDLINE_MAP in every byte */
        memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
               sizeof(s->map_pid_to_cmdline));
        memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
               val * sizeof(*s->map_cmdline_to_pid));

        return s;
}

/*
 * Allocate savedcmd with room for at least SAVED_CMDLINES_DEFAULT
 * cmdlines. Returns 0 on success or -ENOMEM.
 */
static int trace_create_savedcmd(void)
{
        savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
        if (!savedcmd)
                return -ENOMEM;

        return 0;
}

int is_tracing_stopped(void)
{
        return global_trace.stop_count;
}

/*
 * tracing_start_tr - undo one tracing_stop_tr() on a trace array
 * @tr: the trace array to start
 *
 * Decrements the stop count of @tr. Recording on the ring buffers is
 * enabled again only when the count drops to zero. Nothing is done when
 * tracing is disabled altogether.
 */
static void tracing_start_tr(struct trace_array *tr)
{
        struct trace_buffer *buffer;
        unsigned long flags;

        if (tracing_disabled)
                return;

        raw_spin_lock_irqsave(&tr->start_lock, flags);
        /* Still stopped by someone else, or more starts than stops */
        if (--tr->stop_count) {
                if (WARN_ON_ONCE(tr->stop_count < 0)) {
                        /* Someone screwed up their debugging */
                        tr->stop_count = 0;
                }
                goto out;
        }

        /* Prevent the buffers from switching */
        arch_spin_lock(&tr->max_lock);

        buffer = tr->array_buffer.buffer;
        if (buffer)
                ring_buffer_record_enable(buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        buffer = tr->max_buffer.buffer;
        if (buffer)
                ring_buffer_record_enable(buffer);
#endif

        arch_spin_unlock(&tr->max_lock);

 out:
        raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}

/**
 * tracing_start - quick start of the tracer
 *
 * If tracing is enabled but was stopped by tracing_stop,
 * this will start the tracer back up. Operates on global_trace.
 */
void tracing_start(void)
{
        tracing_start_tr(&global_trace);
}

/*
 * tracing_stop_tr - stop recording on a trace array
 * @tr: the trace array to stop
 *
 * Increments the stop count of @tr. Recording on the ring buffers is
 * disabled only by the first stop; nested stops just bump the count.
 */
static void tracing_stop_tr(struct trace_array *tr)
{
        struct trace_buffer *buffer;
        unsigned long flags;

        raw_spin_lock_irqsave(&tr->start_lock, flags);
        /* Already stopped: only the count changes */
        if (tr->stop_count++)
                goto out;

        /* Prevent the buffers from switching */
        arch_spin_lock(&tr->max_lock);

        buffer = tr->array_buffer.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        buffer = tr->max_buffer.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);
#endif

        arch_spin_unlock(&tr->max_lock);

 out:
        raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}

/**
 * tracing_stop - quick stop of the tracer
 *
 * Light weight way to stop tracing. Use in conjunction with
 * tracing_start. Operates on global_trace.
 */
void tracing_stop(void)
{
        tracing_stop_tr(&global_trace);
}

/*
 * Save the comm of @tsk in savedcmd so that its pid can be resolved to a
 * name later. Returns 1 when the comm was recorded (the idle task counts
 * as recorded) and 0 when trace_cmdline_lock could not be taken.
 */
static int trace_save_cmdline(struct task_struct *tsk)
{
        unsigned masked_pid, slot;

        /* treat recording of idle task as a success */
        if (!tsk->pid)
                return 1;

        masked_pid = tsk->pid & (PID_MAX_DEFAULT - 1);

        /*
         * It's not the end of the world if we don't get the lock, but we
         * also don't want to spin nor do we want to disable interrupts,
         * so if we miss here, then better luck next time.
         *
         * This is called within the scheduler and wake up, so interrupts
         * had better been disabled and run queue lock been held.
         */
        lockdep_assert_preemption_disabled();
        if (!arch_spin_trylock(&trace_cmdline_lock))
                return 0;

        slot = savedcmd->map_pid_to_cmdline[masked_pid];
        if (slot == NO_CMDLINE_MAP) {
                /* No slot for this pid yet: take the next one in turn */
                slot = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;

                savedcmd->map_pid_to_cmdline[masked_pid] = slot;
                savedcmd->cmdline_idx = slot;
        }

        savedcmd->map_cmdline_to_pid[slot] = tsk->pid;
        set_cmdline(slot, tsk->comm);

        arch_spin_unlock(&trace_cmdline_lock);

        return 1;
}

/*
 * Look up the saved comm of @pid and copy it into @comm. Pid 0 yields
 * "<idle>", a negative pid "<XXX>" (with a one time warning) and a pid
 * without a saved comm "<...>".
 */
static void __trace_find_cmdline(int pid, char comm[])
{
        unsigned slot;
        int owner;

        if (!pid) {
                strcpy(comm, "<idle>");
                return;
        }

        if (WARN_ON_ONCE(pid < 0)) {
                strcpy(comm, "<XXX>");
                return;
        }

        slot = savedcmd->map_pid_to_cmdline[pid & (PID_MAX_DEFAULT - 1)];
        if (slot == NO_CMDLINE_MAP)
                goto unknown;

        /* The slot is only valid if it still holds the pid asked for */
        owner = savedcmd->map_cmdline_to_pid[slot];
        if (owner != pid)
                goto unknown;

        strlcpy(comm, get_saved_cmdlines(slot), TASK_COMM_LEN);
        return;

unknown:
        strcpy(comm, "<...>");
}

/*
 * trace_find_cmdline - copy the saved comm of @pid into @comm
 *
 * Wrapper around __trace_find_cmdline() that disables preemption and
 * takes trace_cmdline_lock for the lookup.
 */
void trace_find_cmdline(int pid, char comm[])
{
        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);

        __trace_find_cmdline(pid, comm);

        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
}

/*
 * Return the tgid_map entry of @pid, or NULL when there is no map or
 * @pid is beyond tgid_map_max.
 */
static int *trace_find_tgid_ptr(int pid)
{
        /*
         * Pairs with the smp_store_release in set_tracer_flag() to ensure that
         * if we observe a non-NULL tgid_map then we also observe the correct
         * tgid_map_max.
         */
        int *map = smp_load_acquire(&tgid_map);

        if (unlikely(!map || pid > tgid_map_max))
                map = NULL;
        else
                map += pid;

        return map;
}

/* Return the tgid recorded for @pid, or 0 when @pid has no map entry. */
int trace_find_tgid(int pid)
{
        int *entry = trace_find_tgid_ptr(pid);

        if (!entry)
                return 0;

        return *entry;
}

/*
 * Record the tgid of @tsk in tgid_map. Returns 1 when it was recorded
 * (the idle task counts as recorded) and 0 when @tsk has no map entry.
 */
static int trace_save_tgid(struct task_struct *tsk)
{
        int *entry;

        /* treat recording of idle task as a success */
        if (!tsk->pid)
                return 1;

        entry = trace_find_tgid_ptr(tsk->pid);
        if (entry) {
                *entry = tsk->tgid;
                return 1;
        }

        return 0;
}

/*
 * Tell whether recording task info can be skipped: either @flags asks
 * for neither the cmdline nor the tgid, or this CPU has no pending save
 * request (trace_taskinfo_save is clear).
 */
static bool tracing_record_taskinfo_skip(int flags)
{
        const int wanted = TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID;

        return unlikely(!(flags & wanted)) ||
               !__this_cpu_read(trace_taskinfo_save);
}

/**
 * tracing_record_taskinfo - record the task info of a task
 *
 * @task:  task to record
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * The per CPU save request is only cleared once everything asked for in
 * @flags has been recorded.
 */
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
        bool cmdline_ok, tgid_ok;

        if (tracing_record_taskinfo_skip(flags))
                return;

        /*
         * Record as much task information as possible. If some fail, continue
         * to try to record the others.
         */
        cmdline_ok = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
        tgid_ok = !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);

        /* If recording any information failed, retry again soon. */
        if (cmdline_ok && tgid_ok)
                __this_cpu_write(trace_taskinfo_save, false);
}

/**
 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
 *
 * @prev: previous task during sched_switch
 * @next: next task during sched_switch
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * The per CPU save request is only cleared once everything asked for in
 * @flags has been recorded for both tasks.
 */
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
                                          struct task_struct *next, int flags)
{
        bool prev_cmdline_ok, next_cmdline_ok;
        bool prev_tgid_ok, next_tgid_ok;

        if (tracing_record_taskinfo_skip(flags))
                return;

        /*
         * Record as much task information as possible. If some fail, continue
         * to try to record the others.
         */
        prev_cmdline_ok = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
        next_cmdline_ok = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
        prev_tgid_ok = !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
        next_tgid_ok = !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);

        /* If recording any information failed, retry again soon. */
        if (prev_cmdline_ok && next_cmdline_ok && prev_tgid_ok && next_tgid_ok)
                __this_cpu_write(trace_taskinfo_save, false);
}

/* Helper to record only the comm of @task */
void tracing_record_cmdline(struct task_struct *task)
{
        const int what = TRACE_RECORD_CMDLINE;

        tracing_record_taskinfo(task, what);
}

/* Helper to record only the tgid of @task */
void tracing_record_tgid(struct task_struct *task)
{
        const int what = TRACE_RECORD_TGID;

        tracing_record_taskinfo(task, what);
}

/*
 * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
 * overflowed, and TRACE_TYPE_HANDLED otherwise. Having that decision in
 * one helper simplifies those functions and keeps them in sync.
 */
enum print_line_t trace_handle_return(struct trace_seq *s)
{
        if (trace_seq_has_overflowed(s))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}
EXPORT_SYMBOL_GPL(trace_handle_return);

/*
 * tracing_generic_entry_update - fill in the common fields of a trace entry
 * @entry: the entry to fill in
 * @type:  stored in entry->type
 * @flags: irq flags, used to tell whether interrupts were off
 * @pc:    preempt count; its low byte is stored and its NMI, hardirq and
 *         softirq bits are translated into entry flags
 *
 * The pid is taken from current.
 */
void
tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
                             unsigned long flags, int pc)
{
        struct task_struct *tsk = current;

        /* Only the low 8 bits of the preempt count are kept */
        entry->preempt_count                = pc & 0xff;
        entry->pid                        = (tsk) ? tsk->pid : 0;
        entry->type                        = type;
        /* Build the flags from the irq state, the context bits and resched state */
        entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
#else
                TRACE_FLAG_IRQS_NOSUPPORT |
#endif
                ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
                ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
                ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
                (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
                (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);

/*
 * Non-static wrapper around __trace_buffer_lock_reserve(): every argument
 * is forwarded unchanged and the helper's result is returned as is.
 */
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
                          int type,
                          unsigned long len,
                          unsigned long flags, int pc)
{
        return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
}

/* Per-CPU temporary event page; set by trace_buffered_event_enable() */
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
/*
 * Per-CPU use counter for the page above. trace_event_buffer_lock_reserve()
 * only takes the page when its increment brings this to exactly 1.
 */
DEFINE_PER_CPU(int, trace_buffered_event_cnt);
/*
 * Count of trace_buffered_event_enable() calls not yet balanced by
 * trace_buffered_event_disable(). Both WARN if event_mutex is not held.
 */
static int trace_buffered_event_ref;

/**
 * trace_buffered_event_enable - enable buffering events
 *
 * When events are being filtered, it is quicker to use a temporary
 * buffer to write the event data into if there's a likely chance
 * that it will not be committed. The discard of the ring buffer
 * is not as fast as committing, and is much slower than copying
 * a commit.
 *
 * When an event is to be filtered, allocate per cpu buffers to
 * write the event data into, and if the event is filtered and discarded
 * it is simply dropped, otherwise, the entire data is to be committed
 * in one shot.
 */
void trace_buffered_event_enable(void)
{
        int cpu;

        WARN_ON_ONCE(!mutex_is_locked(&event_mutex));

        /* Only the first user has to allocate the per-CPU pages. */
        if (trace_buffered_event_ref++ != 0)
                return;

        for_each_tracing_cpu(cpu) {
                struct ring_buffer_event *event;
                struct page *page;

                page = alloc_pages_node(cpu_to_node(cpu),
                                        GFP_KERNEL | __GFP_NORETRY, 0);
                if (!page) {
                        /*
                         * The buffered event is only an optimization, so a
                         * failed allocation just ends the setup early.
                         */
                        pr_err("Failed to allocate event buffer\n");
                        break;
                }

                event = page_address(page);
                memset(event, 0, sizeof(*event));
                per_cpu(trace_buffered_event, cpu) = event;

                /*
                 * Sanity check: on the CPU running this code, the
                 * this_cpu view and the per_cpu view must agree.
                 */
                preempt_disable();
                if (cpu == smp_processor_id()) {
                        if (__this_cpu_read(trace_buffered_event) !=
                            per_cpu(trace_buffered_event, cpu))
                                WARN_ON_ONCE(1);
                }
                preempt_enable();
        }
}

/*
 * Cross-CPU callback (run through on_each_cpu_mask() by
 * trace_buffered_event_disable()): drops the per-CPU use count that
 * disable_trace_buffered_event() raised. @data is unused.
 */
static void enable_trace_buffered_event(void *data)
{
        /* Probably not needed, but do it anyway */
        smp_rmb();
        this_cpu_dec(trace_buffered_event_cnt);
}

/*
 * Cross-CPU callback (run through on_each_cpu_mask() by
 * trace_buffered_event_disable()): raises the per-CPU use count so that
 * the "val == 1" test in trace_event_buffer_lock_reserve() fails and the
 * per-CPU page is not picked up. @data is unused.
 */
static void disable_trace_buffered_event(void *data)
{
        this_cpu_inc(trace_buffered_event_cnt);
}

/**
 * trace_buffered_event_disable - disable buffering events
 *
 * When a filter is removed, it is faster to not use the buffered
 * events, and to commit directly into the ring buffer. Free up
 * the temp buffers when there are no more users. This requires
 * special synchronization with current events.
 */
void trace_buffered_event_disable(void)
{
        int cpu;

        /* Callers are expected to hold event_mutex. */
        WARN_ON_ONCE(!mutex_is_locked(&event_mutex));

        /* Unbalanced disable: complain once and change nothing. */
        if (WARN_ON_ONCE(!trace_buffered_event_ref))
                return;

        /* Other users remain; only the last one tears the buffers down. */
        if (--trace_buffered_event_ref)
                return;

        /* For each CPU, set the buffer as used. */
        on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
                         NULL, true);

        /* Wait for all current users to finish */
        synchronize_rcu();

        /* Release every per-CPU page and clear the pointer to it. */
        for_each_tracing_cpu(cpu) {
                free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
                per_cpu(trace_buffered_event, cpu) = NULL;
        }

        /*
         * Wait for all CPUs that potentially started checking if they can use
         * their event buffer only after the previous synchronize_rcu() call and
         * they still read a valid pointer from trace_buffered_event. It must be
         * ensured they don't see cleared trace_buffered_event_cnt else they
         * could wrongly decide to use the pointed-to buffer which is now freed.
         */
        synchronize_rcu();

        /* For each CPU, relinquish the buffer */
        on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
                         true);
}

/*
 * Fallback buffer: used below when the reserve on the instance buffer
 * fails but the event file has EVENT_FILE_FL_TRIGGER_COND set.
 */
static struct trace_buffer *temp_buffer;

/*
 * Reserve space for an event of @type and @len bytes on behalf of
 * @trace_file.
 *
 * *@current_rb is set to the buffer the returned event belongs to: the
 * instance's array buffer, or temp_buffer when the fallback is taken.
 * When the per-CPU buffered event is handed out, *@current_rb still
 * names the instance buffer.
 *
 * Returns the reserved event, or NULL if nothing could be reserved.
 */
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
                          struct trace_event_file *trace_file,
                          int type, unsigned long len,
                          unsigned long flags, int pc)
{
        struct ring_buffer_event *entry;
        int val;

        *current_rb = trace_file->tr->array_buffer.buffer;

        /*
         * The per-CPU page is only considered when the buffer is not using
         * absolute timestamps, the file is soft-disabled or filtered, and
         * a page has actually been set up for this CPU.
         */
        if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
             (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
            (entry = this_cpu_read(trace_buffered_event))) {
                /* Try to use the per cpu buffer first */
                val = this_cpu_inc_return(trace_buffered_event_cnt);
                /*
                 * The event must fit in the page after the event header and
                 * the length word, and this must be the only user (val == 1).
                 */
                if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) {
                        trace_event_setup(entry, type, flags, pc);
                        entry->array[0] = len;
                        return entry;
                }
                /* Not usable this time: undo the increment and fall through. */
                this_cpu_dec(trace_buffered_event_cnt);
        }

        entry = __trace_buffer_lock_reserve(*current_rb,
                                            type, len, flags, pc);
        /*
         * If tracing is off, but we have triggers enabled
         * we still need to look at the event data. Use the temp_buffer
         * to store the trace event for the trigger to use. It's recursive
         * safe and will not be recorded anywhere.
         */
        if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
                *current_rb = temp_buffer;
                entry = __trace_buffer_lock_reserve(*current_rb,
                                                    type, len, flags, pc);
        }
        return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);

/* Held by output_printk() while it formats into and prints tracepoint_print_iter */
static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
/* Held by tracepoint_printk_sysctl() across its update of tracepoint_printk */
static DEFINE_MUTEX(tracepoint_printk_mutex);

/*
 * Format the event held in @fbuffer with the event's own ->trace()
 * callback and send the resulting text to printk(). Soft-disabled
 * events and events rejected by the file's filter are skipped.
 */
static void output_printk(struct trace_event_buffer *fbuffer)
{
        struct trace_iterator *iter = tracepoint_print_iter;
        struct trace_event_call *event_call;
        struct trace_event_file *file;
        unsigned long flags;

        /* We should never get here if iter is NULL */
        if (WARN_ON_ONCE(!iter))
                return;

        file = fbuffer->trace_file;
        event_call = file->event_call;

        /* Without a ->trace() callback there is nothing to format with. */
        if (!event_call)
                return;
        if (!event_call->event.funcs || !event_call->event.funcs->trace)
                return;

        if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
                return;

        if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
            !filter_match_preds(file->filter, fbuffer->entry))
                return;

        /* The shared iterator is filled and printed under the lock. */
        raw_spin_lock_irqsave(&tracepoint_iter_lock, flags);

        trace_seq_init(&iter->seq);
        iter->ent = fbuffer->entry;
        event_call->event.funcs->trace(iter, 0, &event_call->event);
        trace_seq_putc(&iter->seq, 0);
        printk("%s", iter->seq.buffer);

        raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}

/*
 * sysctl handler for tracepoint_printk: lets proc_dointvec() process the
 * request, then brings the tracepoint_printk_key static key in line with
 * the resulting value. Returns whatever proc_dointvec() returned.
 */
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
                             void *buffer, size_t *lenp,
                             loff_t *ppos)
{
        int old_value;
        int ret;

        mutex_lock(&tracepoint_printk_mutex);

        old_value = tracepoint_printk;
        ret = proc_dointvec(table, write, buffer, lenp, ppos);

        /*
         * tracepoint_printk is always zero when tracepoint_print_iter is
         * not allocated; forcing it here makes the comparison below see
         * "no change" in that case.
         */
        if (!tracepoint_print_iter)
                tracepoint_printk = 0;

        /* Touch the static key only when the value really changed. */
        if (tracepoint_printk != old_value) {
                if (tracepoint_printk)
                        static_key_enable(&tracepoint_printk_key.key);
                else
                        static_key_disable(&tracepoint_printk_key.key);
        }

        mutex_unlock(&tracepoint_printk_mutex);

        return ret;
}

/*
 * Commit the event described by @fbuffer. Before the commit itself the
 * event is optionally echoed through printk (tracepoint_printk_key) and
 * handed to ftrace_exports() (trace_event_exports_enabled).
 */
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
        if (static_key_false(&tracepoint_printk_key.key))
                output_printk(fbuffer);

        if (static_branch_unlikely(&trace_event_exports_enabled))
                ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
        /* Pass everything collected in @fbuffer on to the commit helper. */
        event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
                                    fbuffer->event, fbuffer->entry,
                                    fbuffer->flags, fbuffer->pc, fbuffer->regs);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);

/*
 * Frames dropped from a stack trace that is taken without pt_regs:
 *
 *   trace_buffer_unlock_commit_regs()
 *   trace_event_buffer_commit()
 *   trace_event_raw_event_xxx()
 */
# define STACK_SKIP 3

/*
 * Commit @event to @buffer, then record the kernel and user stack traces
 * for it (each subject to the checks made by the respective helper).
 */
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
                                     struct trace_buffer *buffer,
                                     struct ring_buffer_event *event,
                                     unsigned long flags, int pc,
                                     struct pt_regs *regs)
{
        int skip;

        __buffer_unlock_commit(buffer, event);

        /*
         * With @regs the trace starts at the saved registers and nothing
         * has to be skipped. Without it, skip the functions listed above.
         * Note, we can still get here via blktrace, wakeup tracer
         * and mmiotrace, but that's ok if they lose a function or
         * two. They are not that meaningful.
         */
        if (regs)
                skip = 0;
        else
                skip = STACK_SKIP;

        ftrace_trace_stack(tr, buffer, flags, skip, pc, regs);
        ftrace_trace_userstack(tr, buffer, flags, pc);
}

/*
 * Similar to trace_buffer_unlock_commit_regs() but do not dump stack:
 * only __buffer_unlock_commit() is called for @event on @buffer.
 */
void
trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
                                   struct ring_buffer_event *event)
{
        __buffer_unlock_commit(buffer, event);
}

/*
 * Record a TRACE_FN event holding @ip and @parent_ip in the array buffer
 * of @tr. The event is silently dropped if no space can be reserved or
 * if the function event's filter discards it.
 */
void
trace_function(struct trace_array *tr,
               unsigned long ip, unsigned long parent_ip, unsigned long flags,
               int pc)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;
        struct trace_event_call *call = &event_function;
        struct ring_buffer_event *event;
        struct ftrace_entry *entry;

        event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
                                            flags, pc);
        if (!event)
                return;

        entry = ring_buffer_event_data(event);
        entry->ip = ip;
        entry->parent_ip = parent_ip;

        /* A discarded event is neither exported nor committed. */
        if (call_filter_check_discard(call, entry, buffer, event))
                return;

        if (static_branch_unlikely(&trace_function_exports_enabled))
                ftrace_exports(event, TRACE_EXPORT_FUNCTION);

        __buffer_unlock_commit(buffer, event);
}

#ifdef CONFIG_STACKTRACE

/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
#define FTRACE_KSTACK_NESTING        4

/* Number of return addresses a single nesting level can hold */
#define FTRACE_KSTACK_ENTRIES        (PAGE_SIZE / FTRACE_KSTACK_NESTING)

/* Scratch array that __ftrace_trace_stack() saves one stack trace into */
struct ftrace_stack {
        unsigned long                calls[FTRACE_KSTACK_ENTRIES];
};


/* One scratch array per nesting level */
struct ftrace_stacks {
        struct ftrace_stack        stacks[FTRACE_KSTACK_NESTING];
};

/* Per-CPU scratch arrays, indexed by the nesting level taken below */
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
/* Per-CPU count of nesting levels currently in use */
static DEFINE_PER_CPU(int, ftrace_stack_reserve);

/*
 * Save the current kernel stack trace (starting from @regs when given)
 * into this CPU's scratch array and record it in @buffer as a
 * TRACE_STACK event. @skip is the number of leading frames to drop.
 * Nothing is recorded if the nesting limit is exceeded, if no space can
 * be reserved, or if the event's filter discards it.
 */
static void __ftrace_trace_stack(struct trace_buffer *buffer,
                                 unsigned long flags,
                                 int skip, int pc, struct pt_regs *regs)
{
        struct trace_event_call *call = &event_kernel_stack;
        struct ring_buffer_event *event;
        unsigned int size, nr_entries;
        struct ftrace_stack *fstack;
        struct stack_entry *entry;
        int stackidx;

        /*
         * Add one, for this function and the call to save_stack_trace()
         * If regs is set, then these functions will not be in the way.
         */
#ifndef CONFIG_UNWINDER_ORC
        if (!regs)
                skip++;
#endif

        preempt_disable_notrace();

        /* Claim the next nesting level on this CPU (0-based index). */
        stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;

        /* This should never happen. If it does, yell once and skip */
        if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
                goto out;

        /*
         * The above __this_cpu_inc_return() is 'atomic' cpu local. An
         * interrupt will either see the value pre increment or post
         * increment. If the interrupt happens pre increment it will have
         * restored the counter when it returns.  We just need a barrier to
         * keep gcc from moving things around.
         */
        barrier();

        fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
        size = ARRAY_SIZE(fstack->calls);

        if (regs) {
                nr_entries = stack_trace_save_regs(regs, fstack->calls,
                                                   size, skip);
        } else {
                nr_entries = stack_trace_save(fstack->calls, size, skip);
        }

        /* From here on 'size' is the byte size of the saved addresses. */
        size = nr_entries * sizeof(unsigned long);
        /* Reserve the entry without its declared caller[] plus the real data. */
        event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
                                    (sizeof(*entry) - sizeof(entry->caller)) + size,
                                    flags, pc);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);

        memcpy(&entry->caller, fstack->calls, size);
        entry->size = nr_entries;

        if (!call_filter_check_discard(call, entry, buffer, event))
                __buffer_unlock_commit(buffer, event);

 out:
        /* Again, don't let gcc optimize things here */
        barrier();
        /* Give the nesting level back. */
        __this_cpu_dec(ftrace_stack_reserve);
        preempt_enable_notrace();

}

/*
 * Record a kernel stack trace in @buffer, but only when @tr has the
 * TRACE_ITER_STACKTRACE flag set.
 */
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned long flags,
                                      int skip, int pc, struct pt_regs *regs)
{
        if (tr->trace_flags & TRACE_ITER_STACKTRACE)
                __ftrace_trace_stack(buffer, flags, skip, pc, regs);
}

/*
 * Record a kernel stack trace in the array buffer of @tr without
 * checking the instance's stacktrace flag. If RCU is not watching, the
 * trace is wrapped in rcu_irq_enter_irqson()/rcu_irq_exit_irqson(),
 * except in NMI context, where nothing is recorded.
 */
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
                   int pc)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;

        /* Common case: RCU is already watching, trace directly. */
        if (rcu_is_watching()) {
                __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
                return;
        }

        /*
         * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
         * but if the above rcu_is_watching() failed, then the NMI
         * triggered someplace critical, and rcu_irq_enter() should
         * not be called from NMI.
         */
        if (unlikely(in_nmi()))
                return;

        rcu_irq_enter_irqson();
        __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
        rcu_irq_exit_irqson();
}

/**
 * trace_dump_stack - record a stack back trace in the trace buffer
 * @skip: Number of functions to skip (helper handlers)
 */
void trace_dump_stack(int skip)
{
        unsigned long flags;

        /* Nothing is recorded while tracing is disabled or a selftest runs. */
        if (tracing_disabled || tracing_selftest_running)
                return;

        local_save_flags(flags);

#ifndef CONFIG_UNWINDER_ORC
        /* Skip 1 to skip this function. */
        skip++;
#endif
        /* The trace always goes into the buffer of global_trace. */
        __ftrace_trace_stack(global_trace.array_buffer.buffer,
                             flags, skip, preempt_count(), NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);

#ifdef CONFIG_USER_STACKTRACE_SUPPORT
/* Per-CPU recursion guard for ftrace_trace_userstack() */
static DEFINE_PER_CPU(int, user_stack_count);

/*
 * Record the user-space stack of the current task in @buffer as a
 * TRACE_USER_STACK event. Does nothing unless @tr has the
 * TRACE_ITER_USERSTACKTRACE flag set, and nothing in NMI context or when
 * already running on this CPU (recursion).
 */
static void
ftrace_trace_userstack(struct trace_array *tr,
                       struct trace_buffer *buffer, unsigned long flags, int pc)
{
        struct trace_event_call *call = &event_user_stack;
        struct ring_buffer_event *event;
        struct userstack_entry *entry;

        if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
                return;

        /*
         * NMIs can not handle page faults, even with fix ups.
         * The save user stack can (and often does) fault.
         */
        if (unlikely(in_nmi()))
                return;

        /*
         * prevent recursion, since the user stack tracing may
         * trigger other kernel events.
         */
        preempt_disable();
        if (__this_cpu_read(user_stack_count))
                goto out;

        __this_cpu_inc(user_stack_count);

        event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                            sizeof(*entry), flags, pc);
        if (!event)
                goto out_drop_count;
        entry        = ring_buffer_event_data(event);

        entry->tgid                = current->tgid;
        /* Clear caller[] first so slots the save does not fill stay zero. */
        memset(&entry->caller, 0, sizeof(entry->caller));

        stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
        if (!call_filter_check_discard(call, entry, buffer, event))
                __buffer_unlock_commit(buffer, event);

 out_drop_count:
        /* Leave the recursion guard. */
        __this_cpu_dec(user_stack_count);
 out:
        preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
/* Empty stand-in used when CONFIG_USER_STACKTRACE_SUPPORT is not set. */
static void ftrace_trace_userstack(struct trace_array *tr,
                                   struct trace_buffer *buffer,
                                   unsigned long flags, int pc)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */

#endif /* CONFIG_STACKTRACE */

/* created for use with alloc_percpu */
struct trace_buffer_struct {
        /* Number of buffer[] slots currently handed out by get_trace_buf() */
        int nesting;
        /* One TRACE_BUF_SIZE scratch buffer per nesting level (at most 4) */
        char buffer[4][TRACE_BUF_SIZE];
};

/* Per-CPU scratch buffers; NULL until alloc_percpu_trace_buffer() succeeds */
static struct trace_buffer_struct __percpu *trace_percpu_buffer;

/*
 * This allows for lockless recording.  If we're nested too deeply, then
 * this returns NULL. NULL is also returned when the per-CPU buffers have
 * not been allocated. A successful call must be paired with
 * put_trace_buf().
 */
static char *get_trace_buf(void)
{
        struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);

        /*
         * Test the global pointer rather than 'buffer': 'buffer' is only
         * dereferenced once the allocation is known to exist.
         */
        if (!trace_percpu_buffer || buffer->nesting >= 4)
                return NULL;

        buffer->nesting++;

        /* Interrupts must see nesting incremented before we use the buffer */
        barrier();
        /* Hand out the slot that belongs to the level just taken. */
        return &buffer->buffer[buffer->nesting - 1][0];
}

/* Release the nesting level taken by the matching get_trace_buf() call. */
static void put_trace_buf(void)
{
        /* Don't let the decrement of nesting leak before this */
        barrier();
        this_cpu_dec(trace_percpu_buffer->nesting);
}

static int alloc_percpu_trace_buffer(void)
{
        struct trace_buffer_struct __percpu *buffers;

        if (trace_percpu_buffer)
                return 0;

        buffers = alloc_percpu(struct trace_buffer_struct);
        if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
                return -ENOMEM;

        trace_percpu_buffer = buffers;
        return 0;
}

/* Set to 1 once trace_printk_init_buffers() has completed its setup */
static int buffers_allocated;

/*
 * One-time setup for trace_printk(): allocates the per-CPU scratch
 * buffers, prints a prominent warning, expands the ring buffers via
 * tracing_update_buffers() and, if the global buffer already exists,
 * starts cmdline recording. Later calls return immediately.
 */
void trace_printk_init_buffers(void)
{
        if (buffers_allocated)
                return;

        /* Give up quietly if the scratch buffers cannot be allocated. */
        if (alloc_percpu_trace_buffer())
                return;

        /* trace_printk() is for debug use only. Don't use it in production. */

        pr_warn("\n");
        pr_warn("**********************************************************\n");
        pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** trace_printk() being used. Allocating extra memory.  **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** This means that this is a DEBUG kernel and it is     **\n");
        pr_warn("** unsafe for production use.                           **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** If you see this message and you are not debugging    **\n");
        pr_warn("** the kernel, report this immediately to your vendor!  **\n");
        pr_warn("**                                                      **\n");
        pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
        pr_warn("**********************************************************\n");

        /* Expand the buffers to set size */
        tracing_update_buffers();

        buffers_allocated = 1;

        /*
         * trace_printk_init_buffers() can be called by modules.
         * If that happens, then we need to start cmdline recording
         * directly here. If the global_trace.buffer is already
         * allocated here, then this was called by module code.
         */
        if (global_trace.array_buffer.buffer)
                tracing_start_cmdline_record();
}
EXPORT_SYMBOL_GPL(trace_printk_init_buffers);

/* Start recording comms, but only if the trace_printk() buffers are set up. */
void trace_printk_start_comm(void)
{
        if (buffers_allocated)
                tracing_start_cmdline_record();
}

/*
 * Start (@enabled non-zero) or stop (@enabled zero) comm recording.
 * Does nothing while the trace_printk() buffers are not set up.
 */
static void trace_printk_start_stop_comm(int enabled)
{
        if (!buffers_allocated)
                return;

        if (!enabled) {
                tracing_stop_cmdline_record();
                return;
        }

        tracing_start_cmdline_record();
}

/**
 * trace_vbprintk - write binary msg to tracing buffer
 * @ip:    The address of the caller
 * @fmt:   The string format to write to the buffer
 * @args:  Arguments for @fmt
 */
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
        struct trace_event_call *call = &event_bprint;
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct trace_array *tr = &global_trace;
        struct bprint_entry *entry;
        unsigned long flags;
        char *tbuffer;
        int len = 0, size, pc;

        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;

        /* Don't pollute graph traces with trace_vprintk internals */
        pause_graph_tracing();

        /* Sample the preempt count before it is raised just below. */
        pc = preempt_count();
        preempt_disable_notrace();

        /* Per-CPU scratch buffer; NULL if unavailable or nested too deeply. */
        tbuffer = get_trace_buf();
        if (!tbuffer) {
                len = 0;
                goto out_nobuffer;
        }

        /* 'len' is counted in 32-bit words, as is the size limit passed in. */
        len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);

        /* Nothing is recorded if the arguments did not fit or on error. */
        if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
                goto out_put;

        local_save_flags(flags);
        size = sizeof(*entry) + sizeof(u32) * len;
        buffer = tr->array_buffer.buffer;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
                                            flags, pc);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);
        entry->ip                        = ip;
        /* Only the pointer to the format is stored, not a copy of the text. */
        entry->fmt                        = fmt;

        memcpy(entry->buf, tbuffer, sizeof(u32) * len);
        if (!call_filter_check_discard(call, entry, buffer, event)) {
                __buffer_unlock_commit(buffer, event);
                ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
        }

out:
        ring_buffer_nest_end(buffer);
out_put:
        put_trace_buf();

out_nobuffer:
        preempt_enable_notrace();
        unpause_graph_tracing();

        /* 'len' is returned on every path that got past the early check. */
        return len;
}
EXPORT_SYMBOL_GPL(trace_vbprintk);

/*
 * Format @fmt/@args into a per-CPU scratch buffer and record the text in
 * @buffer as a TRACE_PRINT event tagged with @ip.
 *
 * Returns the length of the formatted text (without the terminating
 * NUL), or 0 if tracing is disabled, a selftest is running or no scratch
 * buffer is available. The length is returned even if the event could
 * not be reserved or was discarded by the filter.
 */
__printf(3, 0)
static int
__trace_array_vprintk(struct trace_buffer *buffer,
                      unsigned long ip, const char *fmt, va_list args)
{
        struct trace_event_call *call = &event_print;
        struct ring_buffer_event *event;
        int len = 0, size, pc;
        struct print_entry *entry;
        unsigned long flags;
        char *tbuffer;

        if (tracing_disabled || tracing_selftest_running)
                return 0;

        /* Don't pollute graph traces with trace_vprintk internals */
        pause_graph_tracing();

        /* Sample the preempt count before it is raised just below. */
        pc = preempt_count();
        preempt_disable_notrace();


        tbuffer = get_trace_buf();
        if (!tbuffer) {
                len = 0;
                goto out_nobuffer;
        }

        len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);

        local_save_flags(flags);
        /* Room for the entry header, the text and its terminating NUL. */
        size = sizeof(*entry) + len + 1;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                            flags, pc);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);
        entry->ip = ip;

        memcpy(&entry->buf, tbuffer, len + 1);
        if (!call_filter_check_discard(call, entry, buffer, event)) {
                __buffer_unlock_commit(buffer, event);
                /* NOTE(review): the stack-trace flag of global_trace is used here, whatever instance @buffer belongs to. */
                ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
        }

out:
        ring_buffer_nest_end(buffer);
        put_trace_buf();

out_nobuffer:
        preempt_enable_notrace();
        unpause_graph_tracing();

        return len;
}

/*
 * va_list front end: writes the formatted message into the array buffer
 * of @tr and returns the result of __trace_array_vprintk().
 */
__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
                        unsigned long ip, const char *fmt, va_list args)
{
        return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}

/**
 * trace_array_printk - Print a message to a specific instance
 * @tr: The instance trace_array descriptor
 * @ip: The instruction pointer that this is called from.
 * @fmt: The format to print (printf format)
 *
 * If a subsystem sets up its own instance, they have the right to
 * printk strings into their tracing instance buffer using this
 * function. Note, this function will not write into the top level
 * buffer (use trace_printk() for that), as writing into the top level
 * buffer should only have events that can be individually disabled.
 * trace_printk() is only used for debugging a kernel, and should not
 * be ever incorporated in normal use.
 *
 * trace_array_printk() can be used, as it will not add noise to the
 * top level tracing buffer.
 *
 * Note, trace_array_init_printk() must be called on @tr before this
 * can be used.
 *
 * Return: -ENOENT if @tr is NULL; 0 if @tr is the top level trace array
 * or if @tr does not have the printk trace flag set; otherwise the value
 * returned by trace_array_vprintk().
 */
/*
 * The arguments start at position 4 (this function is variadic), so let
 * the compiler check them against @fmt, as trace_array_printk_buf() does.
 */
__printf(3, 4)
int trace_array_printk(struct trace_array *tr,
                       unsigned long ip, const char *fmt, ...)
{
        int ret;
        va_list ap;

        if (!tr)
                return -ENOENT;

        /* This is only allowed for created instances */
        if (tr == &global_trace)
                return 0;

        /* Printing into this instance has been switched off. */
        if (!(tr->trace_flags & TRACE_ITER_PRINTK))
                return 0;

        va_start(ap, fmt);
        ret = trace_array_vprintk(tr, ip, fmt, ap);
        va_end(ap);
        return ret;
}
EXPORT_SYMBOL_GPL(trace_array_printk);

/**
 * trace_array_init_printk - Initialize buffers for trace_array_printk()
 * @tr: The trace array to initialize the buffers for
 *
 * As trace_array_printk() only writes into instances, they are OK to
 * have in the kernel (unlike trace_printk()). This needs to be called
 * before trace_array_printk() can be used on a trace_array.
 *
 * Return: -ENOENT if @tr is NULL, -EINVAL if @tr is the top level trace
 * array, otherwise the result of alloc_percpu_trace_buffer() (0 on
 * success, -ENOMEM if the allocation failed).
 */
int trace_array_init_printk(struct trace_array *tr)
{
        if (!tr)
                return -ENOENT;

        /* This is only allowed for created instances */
        if (tr == &global_trace)
                return -EINVAL;

        return alloc_percpu_trace_buffer();
}
EXPORT_SYMBOL_GPL(trace_array_init_printk);

/*
 * printf-style write of a message straight into @buffer. Nothing is
 * written, and 0 is returned, unless global_trace has the printk trace
 * flag set; otherwise the result of __trace_array_vprintk() is returned.
 */
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
                           unsigned long ip, const char *fmt, ...)
{
        va_list ap;
        int ret = 0;

        if (global_trace.trace_flags & TRACE_ITER_PRINTK) {
                va_start(ap, fmt);
                ret = __trace_array_vprintk(buffer, ip, fmt, ap);
                va_end(ap);
        }

        return ret;
}

/* va_list write of a message into the top level (global_trace) buffer. */
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
        return trace_array_vprintk(&global_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);

/*
 * Step @iter forward by one entry: bump its index and, if there is a
 * ring buffer iterator for iter->cpu, advance that as well.
 */
static void trace_iterator_increment(struct trace_iterator *iter)
{
        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);

        iter->idx++;
        if (buf_iter)
                ring_buffer_iter_advance(buf_iter);
}

/*
 * Look at the next entry of @cpu without consuming it.
 *
 * The entry's timestamp is passed back through @ts. If @lost_events is
 * not NULL it receives the lost-event information (with a ring buffer
 * iterator: (unsigned long)-1 when events were dropped, else 0).
 * iter->ent_size is set to the entry's length, or to 0 if there is none.
 *
 * Returns the entry data, or NULL when nothing is available.
 */
static struct trace_entry *
peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
                unsigned long *lost_events)
{
        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
        struct ring_buffer_event *event;

        if (!buf_iter) {
                /* No iterator for this CPU: peek at the buffer itself. */
                event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
                                         lost_events);
        } else {
                event = ring_buffer_iter_peek(buf_iter, ts);
                if (lost_events) {
                        if (ring_buffer_iter_dropped(buf_iter))
                                *lost_events = (unsigned long)-1;
                        else
                                *lost_events = 0;
                }
        }

        if (!event) {
                iter->ent_size = 0;
                return NULL;
        }

        iter->ent_size = ring_buffer_event_length(event);
        return ring_buffer_event_data(event);
}

/*
 * Find the next entry to show without advancing the iterator.
 *
 * For a per-CPU trace file only that CPU is examined. Otherwise every
 * tracing CPU is peeked and the entry with the smallest timestamp wins.
 *
 * @ent_cpu, if not NULL, receives the CPU of the returned entry (-1 if
 * the scan over all CPUs found nothing). @missing_events and @ent_ts, if
 * not NULL, receive the lost-event value and timestamp of that entry.
 * iter->ent_size is left describing the returned entry.
 *
 * Returns the entry, or NULL if there is none.
 */
static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
                  unsigned long *missing_events, u64 *ent_ts)
{
        struct trace_buffer *buffer = iter->array_buffer->buffer;
        struct trace_entry *ent, *next = NULL;
        unsigned long lost_events = 0, next_lost = 0;
        int cpu_file = iter->cpu_file;
        u64 next_ts = 0, ts;
        int next_cpu = -1;
        int next_size = 0;
        int cpu;

        /*
         * If we are in a per_cpu trace file, don't bother by iterating over
         * all cpu and peek directly.
         */
        if (cpu_file > RING_BUFFER_ALL_CPUS) {
                if (ring_buffer_empty_cpu(buffer, cpu_file))
                        return NULL;
                /* The caller's output pointers are filled in by the peek itself. */
                ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
                if (ent_cpu)
                        *ent_cpu = cpu_file;

                return ent;
        }

        for_each_tracing_cpu(cpu) {

                if (ring_buffer_empty_cpu(buffer, cpu))
                        continue;

                ent = peek_next_entry(iter, cpu, &ts, &lost_events);

                /*
                 * Pick the entry with the smallest timestamp:
                 */
                if (ent && (!next || ts < next_ts)) {
                        next = ent;
                        next_cpu = cpu;
                        next_ts = ts;
                        next_lost = lost_events;
                        /* Remember the size; later peeks overwrite iter->ent_size. */
                        next_size = iter->ent_size;
                }
        }

        /* Restore the size that belongs to the chosen entry. */
        iter->ent_size = next_size;

        if (ent_cpu)
                *ent_cpu = next_cpu;

        if (ent_ts)
                *ent_ts = next_ts;

        if (missing_events)
                *missing_events = next_lost;

        return next;
}

#define STATIC_FMT_BUF_SIZE        128
static char static_fmt_buf[STATIC_FMT_BUF_SIZE];

/*
 * Grow iter->fmt by STATIC_FMT_BUF_SIZE bytes.
 *
 * Returns the new buffer (iter->fmt and iter->fmt_size are updated), or
 * NULL if the iterator uses the static buffer or if the reallocation
 * failed, in which case the iterator is left untouched.
 */
static char *trace_iter_expand_format(struct trace_iterator *iter)
{
        char *grown;

        /* The static buffer is not a heap allocation and cannot grow. */
        if (iter->fmt == static_fmt_buf)
                return NULL;

        grown = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
                         GFP_KERNEL);
        if (!grown)
                return NULL;

        iter->fmt_size += STATIC_FMT_BUF_SIZE;
        iter->fmt = grown;

        return grown;
}

/*
 * trace_event_format - copy a format string, rewriting "%p" as "%px"
 * @iter: iterator whose ->fmt scratch buffer receives the rewritten string
 * @fmt:  format string to rewrite
 *
 * "%%" is copied through as is, and a "%p" that is followed by an
 * alphanumeric character (e.g. "%pS") is left untouched.
 *
 * Returns the rewritten string stored in iter->fmt. Returns @fmt itself
 * when @fmt is NULL, when the scratch buffer can not be grown, or when
 * @fmt is empty and no scratch buffer has been allocated yet.
 */
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
{
        const char *p, *new_fmt;
        char *q;

        if (WARN_ON_ONCE(!fmt))
                return fmt;

        p = fmt;
        new_fmt = q = iter->fmt;
        while (*p) {
                /*
                 * One pass emits at most three bytes ("%px"). Keep a
                 * fourth one free for the terminating NUL that is written
                 * after the loop, otherwise an output that exactly fills
                 * the buffer would be terminated one byte past its end.
                 */
                if (unlikely(q - new_fmt + 4 > iter->fmt_size)) {
                        /*
                         * krealloc() may move the buffer; remember the
                         * write offset rather than doing arithmetic on
                         * the stale pointer afterwards.
                         */
                        long used = q - new_fmt;

                        if (!trace_iter_expand_format(iter))
                                return fmt;

                        new_fmt = iter->fmt;
                        q = iter->fmt + used;
                }

                *q++ = *p++;

                /* Replace %p with %px */
                if (p[-1] == '%') {
                        if (p[0] == '%') {
                                *q++ = *p++;
                        } else if (p[0] == 'p' && !isalnum(p[1])) {
                                *q++ = *p++;
                                *q++ = 'x';
                        }
                }
        }

        /*
         * An empty @fmt never enters the loop, so the scratch buffer may
         * still be unallocated. There is nothing to rewrite in that case.
         */
        if (unlikely(!q))
                return fmt;

        *q = '\0';

        return new_fmt;
}

#define STATIC_TEMP_BUF_SIZE        128
static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);

/*
 * Find the next real entry, without updating the iterator itself.
 *
 * @iter:    iterator to peek with
 * @ent_cpu: handed to __find_next_entry()
 * @ent_ts:  handed to __find_next_entry()
 *
 * The current iter->ent is copied into iter->temp first so that it stays
 * readable after the peek, and iter->ent_size is restored before returning.
 *
 * Returns what __find_next_entry() returns, or NULL when the current entry
 * can not be saved (it is larger than static_temp_buf, or kmalloc() failed).
 */
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts)
{
        /* __find_next_entry will reset ent_size */
        int ent_size = iter->ent_size;
        struct trace_entry *entry;

        /*
         * If called from ftrace_dump(), then the iter->temp buffer
         * will be the static_temp_buf and not created from kmalloc.
         * If the entry size is greater than the buffer, we can
         * not save it. Just return NULL in that case. This is only
         * used to add markers when two consecutive events' time
         * stamps have a large delta. See trace_print_lat_context()
         */
        if (iter->temp == static_temp_buf &&
            STATIC_TEMP_BUF_SIZE < ent_size)
                return NULL;

        /*
         * The __find_next_entry() may call peek_next_entry(), which may
         * call ring_buffer_peek() that may make the contents of iter->ent
         * undefined. Need to copy iter->ent now.
         */
        if (iter->ent && iter->ent != iter->temp) {
                /*
                 * Replace iter->temp with a bigger buffer when it is missing
                 * or too small. The static buffer is never replaced; hitting
                 * that case here is unexpected, hence the warning.
                 */
                if ((!iter->temp || iter->temp_size < iter->ent_size) &&
                    !WARN_ON_ONCE(iter->temp == static_temp_buf)) {
                        void *temp;
                        temp = kmalloc(iter->ent_size, GFP_KERNEL);
                        if (!temp)
                                return NULL;
                        kfree(iter->temp);
                        iter->temp = temp;
                        iter->temp_size = iter->ent_size;
                }
                memcpy(iter->temp, iter->ent, iter->ent_size);
                iter->ent = iter->temp;
        }
        /* No missing_events pointer is passed: lost events are not reported here */
        entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts);
        /* Put back the original ent_size */
        iter->ent_size = ent_size;

        return entry;
}

/*
 * Find the next real entry and advance the iterator past it.
 *
 * iter->ent, iter->cpu, iter->lost_events and iter->ts are updated from
 * __find_next_entry(). Returns @iter when an entry was found, otherwise NULL.
 */
void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
        iter->ent = __find_next_entry(iter, &iter->cpu,
                                      &iter->lost_events, &iter->ts);

        /* Only step the iterator when there is something to step over */
        if (iter->ent)
                trace_iterator_increment(iter);

        if (!iter->ent)
                return NULL;

        return iter;
}

/*
 * Consume the next event of iter->cpu from the iterator's ring buffer.
 *
 * ring_buffer_consume() is handed &iter->ts and &iter->lost_events;
 * presumably it stores the consumed event's time stamp and lost-event
 * count there -- confirm against the ring buffer API.
 */
static void trace_consume(struct trace_iterator *iter)
{
        ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts,
                            &iter->lost_events);
}

/*
 * seq_file ->next callback: move the iterator forward to the entry at the
 * index *pos had on entry, then store the incremented *pos in iter->pos.
 *
 * Returns the iterator, or NULL when the requested index lies behind the
 * iterator or the trace ran out of entries.
 */
static void *s_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct trace_iterator *iter = m->private;
        int target = (int)*pos;
        void *cur = iter;

        WARN_ON_ONCE(iter->leftover);

        (*pos)++;

        /* The iterator only moves forward */
        if (iter->idx > target)
                return NULL;

        /* A negative index means nothing has been fetched yet */
        if (iter->idx < 0)
                cur = trace_find_next_entry_inc(iter);

        while (cur && iter->idx < target)
                cur = trace_find_next_entry_inc(iter);

        iter->pos = *pos;

        return cur;
}

/*
 * Rewind the ring buffer iterator of @cpu and skip every entry whose time
 * stamp is older than the buffer's time_start. The number of entries
 * skipped is recorded in the per-CPU skipped_entries counter (zero when
 * @cpu has no buffer iterator).
 */
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
        struct ring_buffer_iter *rb_iter;
        unsigned long skipped = 0;
        u64 stamp;

        per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0;

        rb_iter = trace_buffer_iter(iter, cpu);
        if (!rb_iter)
                return;

        ring_buffer_iter_reset(rb_iter);

        /*
         * With the max latency tracers a reset may never have happened on
         * a cpu, which shows as time stamps that are before the start of
         * the buffer. Walk past those entries.
         */
        while (ring_buffer_iter_peek(rb_iter, &stamp) &&
               stamp < iter->array_buffer->time_start) {
                skipped++;
                ring_buffer_iter_advance(rb_iter);
                /* This could be a big loop */
                cond_resched();
        }

        per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = skipped;
}

/*
 * seq_file ->start callback for the trace iterator.
 *
 * The current tracer is copied to avoid a global locking
 * all around.
 *
 * If tr->current_trace no longer matches the copy in iter->trace, the copy
 * is refreshed (close old, copy, open new) under trace_types_lock. The
 * iterator is then positioned at *pos: with a full reset and replay through
 * s_next() when *pos differs from iter->pos, otherwise by reusing pending
 * leftover output or by taking a single s_next() step.
 *
 * Returns the iterator, NULL when there is no entry at *pos, or
 * ERR_PTR(-EBUSY) for a snapshot iterator whose tracer has use_max_tr set
 * (CONFIG_TRACER_MAX_TRACE only).
 *
 * Except for the -EBUSY return, trace_event_read_lock() and
 * trace_access_lock() are held on return.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        int cpu_file = iter->cpu_file;
        void *p = NULL;
        loff_t l = 0;
        int cpu;

        /*
         * copy the tracer to avoid using a global lock all around.
         * iter->trace is a copy of current_trace, the pointer to the
         * name may be used instead of a strcmp(), as iter->trace->name
         * will point to the same string as current_trace->name.
         */
        mutex_lock(&trace_types_lock);
        if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
                /* Close iter->trace before switching to the new current tracer */
                if (iter->trace->close)
                        iter->trace->close(iter);
                *iter->trace = *tr->current_trace;
                /* Reopen the new current tracer */
                if (iter->trace->open)
                        iter->trace->open(iter);
        }
        mutex_unlock(&trace_types_lock);

#ifdef CONFIG_TRACER_MAX_TRACE
        /* Bail out before any read lock is taken; s_stop() checks the same condition */
        if (iter->snapshot && iter->trace->use_max_tr)
                return ERR_PTR(-EBUSY);
#endif

        if (*pos != iter->pos) {
                /* Position changed: start over and replay up to *pos */
                iter->ent = NULL;
                iter->cpu = 0;
                iter->idx = -1;

                if (cpu_file == RING_BUFFER_ALL_CPUS) {
                        for_each_tracing_cpu(cpu)
                                tracing_iter_reset(iter, cpu);
                } else
                        tracing_iter_reset(iter, cpu_file);

                iter->leftover = 0;
                /* s_next() advances l; stop at *pos or when the trace runs out */
                for (p = iter; p && l < *pos; p = s_next(m, p, &l))
                        ;

        } else {
                /*
                 * If we overflowed the seq_file before, then we want
                 * to just reuse the trace_seq buffer again.
                 */
                if (iter->leftover)
                        p = iter;
                else {
                        /* s_next() increments l, so hand it the previous position */
                        l = *pos - 1;
                        p = s_next(m, p, &l);
                }
        }

        trace_event_read_lock();
        trace_access_lock(cpu_file);
        return p;
}

/*
 * seq_file ->stop callback: release the locks s_start() takes before it
 * returns, in the reverse order of their acquisition.
 */
static void s_stop(struct seq_file *m, void *p)
{
        struct trace_iterator *iter = m->private;

#ifdef CONFIG_TRACER_MAX_TRACE
        /*
         * s_start() returns -EBUSY for this case before taking any lock,
         * so there is nothing to release.
         */
        if (iter->snapshot && iter->trace->use_max_tr)
                return;
#endif

        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
}

static void
get_total_entries_cpu(struct array_buffer *buf, unsigned long *total,
                      unsigned long *entries, int cpu)
{
        unsigned long count;

        count = ring_buffer_entries_cpu(buf->buffer, cpu);
        /*
         * If this buffer has skipped entries, then we hold all
         * entries for the trace and we need to ignore the
         * ones before the time stamp.
         */
        if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
                count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
                /* total is the same as the entries */
                *total = count;
        } else
                *total = count +
                        ring_buffer_overrun_cpu(buf->buffer, cpu);
        *entries = count;
}

/*
 * Sum the per-CPU counts from get_total_entries_cpu() over all tracing
 * CPUs into *total and *entries.
 */
static void
get_total_entries(struct array_buffer *buf,
                  unsigned long *total, unsigned long *entries)
{
        unsigned long cpu_total, cpu_entries;
        int cpu;

        *total = 0;
        *entries = 0;

        for_each_tracing_cpu(cpu) {
                get_total_entries_cpu(buf, &cpu_total, &cpu_entries, cpu);
                *total += cpu_total;
                *entries += cpu_entries;
        }
}

/*
 * Number of entries currently held for @cpu in the array buffer of @tr.
 * A NULL @tr selects global_trace.
 */
unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
{
        struct trace_array *target = tr ? tr : &global_trace;
        unsigned long written, held;

        get_total_entries_cpu(&target->array_buffer, &written, &held, cpu);

        return held;
}

/*
 * Number of entries currently held, over all tracing CPUs, in the array
 * buffer of @tr. A NULL @tr selects global_trace.
 */
unsigned long trace_total_entries(struct trace_array *tr)
{
        struct trace_array *target = tr ? tr : &global_trace;
        unsigned long written, held;

        get_total_entries(&target->array_buffer, &written, &held);

        return held;
}

/* Emit the column legend used by the latency trace format. */
static void print_lat_help_header(struct seq_file *m)
{
        static const char legend[] =
                "#                    _------=> CPU#            \n"
                "#                   / _-----=> irqs-off        \n"
                "#                  | / _----=> need-resched    \n"
                "#                  || / _---=> hardirq/softirq \n"
                "#                  ||| / _--=> preempt-depth   \n"
                "#                  |||| /     delay            \n"
                "#  cmd     pid     ||||| time  |   caller      \n"
                "#     \\   /        |||||  \\    |   /         \n";

        seq_puts(m, legend);
}

/*
 * Print the "entries-in-buffer/entries-written" summary line for @buf,
 * followed by an empty comment line.
 */
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
{
        unsigned long written, in_buffer;

        get_total_entries(buf, &written, &in_buffer);

        seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",
                   in_buffer, written, num_online_cpus());
        seq_puts(m, "#\n");
}

/*
 * Print the event summary and the two column header lines of the default
 * trace format. A TGID column is added when TRACE_ITER_RECORD_TGID is set
 * in @flags.
 */
static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
                                   unsigned int flags)
{
        bool tgid = flags & TRACE_ITER_RECORD_TGID;
        const char *tgid_title = tgid ? "   TGID   " : "";
        const char *tgid_ticks = tgid ? "     |    " : "";

        print_event_info(buf, m);

        seq_printf(m, "#           TASK-PID    %s CPU#     TIMESTAMP  FUNCTION\n", tgid_title);
        seq_printf(m, "#              | |      %s   |         |         |\n", tgid_ticks);
}

/*
 * Print the event summary and the column header of the default trace
 * format including the irq-info columns. The precision passed to "%.*s"
 * widens the layout by ten characters when TRACE_ITER_RECORD_TGID is set
 * in @flags, which makes room for (and reveals) the TGID column.
 */
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
                                       unsigned int flags)
{
        bool tgid = flags & TRACE_ITER_RECORD_TGID;
        const char *pad = "            ";
        const char *tgid_title = "     TGID   ";
        const char *tgid_ticks = "       |    ";
        int width = tgid ? 12 : 2;

        print_event_info(buf, m);

        seq_printf(m, "#                            %.*s  _-----=> irqs-off\n", width, pad);
        seq_printf(m, "#                            %.*s / _----=> need-resched\n", width, pad);
        seq_printf(m, "#                            %.*s| / _---=> hardirq/softirq\n", width, pad);
        seq_printf(m, "#                            %.*s|| / _--=> preempt-depth\n", width, pad);
        seq_printf(m, "#                            %.*s||| /     delay\n", width, pad);
        seq_printf(m, "#           TASK-PID  %.*s CPU#  ||||   TIMESTAMP  FUNCTION\n", width, tgid_title);
        seq_printf(m, "#              | |    %.*s   |   ||||      |         |\n", width, tgid_ticks);
}

/*
 * Print the latency trace banner: tracer name and kernel release, the
 * saved latency with entry counts and preemption model, the recorded task,
 * and (when set) the start and end of the critical section.
 */
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
        unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
        struct array_buffer *buf = iter->array_buffer;
        struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
        struct tracer *type = iter->trace;
        const char *name = type->name;
        const char *preempt_model;
        unsigned long entries;
        unsigned long total;

#if defined(CONFIG_PREEMPT_NONE)
        preempt_model = "server";
#elif defined(CONFIG_PREEMPT_VOLUNTARY)
        preempt_model = "desktop";
#elif defined(CONFIG_PREEMPT)
        preempt_model = "preempt";
#elif defined(CONFIG_PREEMPT_RT)
        preempt_model = "preempt_rt";
#else
        preempt_model = "unknown";
#endif

        get_total_entries(buf, &total, &entries);

        seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
                   name, UTS_RELEASE);
        seq_puts(m, "# -----------------------------------"
                 "---------------------------------\n");
        /* The four trailing zeros are reserved for later use */
        seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
                   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
                   nsecs_to_usecs(data->saved_latency),
                   entries,
                   total,
                   buf->cpu,
                   preempt_model,
                   0, 0, 0, 0);
#ifdef CONFIG_SMP
        seq_printf(m, " #P:%d)\n", num_online_cpus());
#else
        seq_puts(m, ")\n");
#endif
        seq_puts(m, "#    -----------------\n");
        seq_printf(m, "#    | task: %.16s-%d "
                   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
                   data->comm, data->pid,
                   from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
                   data->policy, data->rt_priority);
        seq_puts(m, "#    -----------------\n");

        if (data->critical_start) {
                seq_puts(m, "#  => started at: ");
                seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
                trace_print_seq(m, &iter->seq);
                seq_puts(m, "\n#  => ended at:   ");
                seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
                trace_print_seq(m, &iter->seq);
                seq_puts(m, "\n#\n");
        }

        seq_puts(m, "#\n");
}

/*
 * Emit a "CPU buffer started" marker the first time an entry of iter->cpu
 * is printed. This only happens when annotation is enabled on both the
 * trace array and the iterator, and the CPU has no skipped entries.
 */
static void test_cpu_buff_start(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        bool have_mask;

        if (!(tr->trace_flags & TRACE_ITER_ANNOTATE) ||
            !(iter->iter_flags & TRACE_FILE_ANNOTATE))
                return;

        /* Already announced this CPU */
        have_mask = cpumask_available(iter->started);
        if (have_mask && cpumask_test_cpu(iter->cpu, iter->started))
                return;

        if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries)
                return;

        if (have_mask)
                cpumask_set_cpu(iter->cpu, iter->started);

        /* Don't print started cpu buffer for the first entry of the trace */
        if (iter->idx <= 1)
                return;

        trace_seq_printf(&iter->seq, "##### CPU %u buffer started ####\n",
                         iter->cpu);
}

/*
 * Print iter->ent in the default text format: optional context columns
 * followed by the output of the event's ->trace() callback, or an
 * "Unknown type" line when no event is registered for the entry type.
 */
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        struct trace_seq *seq = &iter->seq;
        unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK);
        struct trace_entry *ent = iter->ent;
        struct trace_event *ev;

        test_cpu_buff_start(iter);

        ev = ftrace_find_event(ent->type);

        if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                if (iter->iter_flags & TRACE_FILE_LAT_FMT)
                        trace_print_lat_context(iter);
                else
                        trace_print_context(iter);
        }

        if (trace_seq_has_overflowed(seq))
                return TRACE_TYPE_PARTIAL_LINE;

        if (!ev) {
                trace_seq_printf(seq, "Unknown type %d\n", ent->type);
                return trace_handle_return(seq);
        }

        return ev->funcs->trace(iter, sym_flags, ev);
}

/*
 * Print iter->ent in raw format: optional "pid cpu ts" prefix followed by
 * the output of the event's ->raw() callback, or "<type> ?" when no event
 * is registered for the entry type.
 */
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
        struct trace_seq *seq = &iter->seq;
        struct trace_entry *ent = iter->ent;
        struct trace_event *ev;

        if (iter->tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
                trace_seq_printf(seq, "%d %d %llu ",
                                 ent->pid, iter->cpu, iter->ts);

        if (trace_seq_has_overflowed(seq))
                return TRACE_TYPE_PARTIAL_LINE;

        ev = ftrace_find_event(ent->type);
        if (!ev) {
                trace_seq_printf(seq, "%d ?\n", ent->type);
                return trace_handle_return(seq);
        }

        return ev->funcs->raw(iter, 0, ev);
}

static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        unsigned char newline = '\n';
        struct trace_entry *entry;
        struct trace_event *event;

        entry = iter->ent;

        if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                SEQ_PUT_HEX_FIELD(s, entry->pid);
                SEQ_PUT_HEX_FIELD(s, iter->cpu);
                SEQ_PUT_HEX_FIELD(s, iter->ts);
                if (trace_seq_has_overflowed(s))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        event = ftrace_find_event(entry->type);
        if (event) {
                enum print_line_t ret = event->funcs->hex(iter, 0, event);
                if (ret != TRACE_TYPE_HANDLED)
                        return ret;
        }

        SEQ_PUT_FIELD(s, newline);

        return trace_handle_return(s);
}

/*
 * Print iter->ent in binary format: optional pid/cpu/ts fields followed by
 * the output of the event's ->binary() callback. An entry type without a
 * registered event is reported as handled.
 */
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
        struct trace_seq *seq = &iter->seq;
        struct trace_entry *ent = iter->ent;
        struct trace_event *ev;

        if (iter->tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                SEQ_PUT_FIELD(seq, ent->pid);
                SEQ_PUT_FIELD(seq, iter->cpu);
                SEQ_PUT_FIELD(seq, iter->ts);
                if (trace_seq_has_overflowed(seq))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        ev = ftrace_find_event(ent->type);
        if (!ev)
                return TRACE_TYPE_HANDLED;

        return ev->funcs->binary(iter, 0, ev);
}

/*
 * Return true when @cpu has nothing left to read: checked through the
 * buffer iterator when one exists, otherwise on the ring buffer itself.
 */
static bool trace_cpu_buffer_empty(struct trace_iterator *iter, int cpu)
{
        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);

        if (buf_iter)
                return ring_buffer_iter_empty(buf_iter);

        return ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu);
}

/*
 * Return 1 when the iterator has nothing to read, 0 otherwise. Only
 * iter->cpu_file is checked for a per-CPU file, every tracing CPU
 * otherwise.
 */
int trace_empty(struct trace_iterator *iter)
{
        int cpu;

        /* If we are looking at one CPU buffer, only check that one */
        if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
                return trace_cpu_buffer_empty(iter, iter->cpu_file) ? 1 : 0;

        for_each_tracing_cpu(cpu) {
                if (!trace_cpu_buffer_empty(iter, cpu))
                        return 0;
        }

        return 1;
}

/*  Called with trace_event_read_lock() held. */
enum print_line_t print_trace_line(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        unsigned long trace_flags = tr->trace_flags;
        enum print_line_t ret;

        if (iter->lost_events) {
                if (iter->lost_events == (unsigned long)-1)
                        trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n",
                                         iter->cpu);
                else
                        trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
                                         iter->cpu, iter->lost_events);
                if (trace_seq_has_overflowed(&iter->seq))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        if (iter->trace && iter->trace->print_line) {
                ret = iter->trace->print_line(iter);
                if (ret != TRACE_TYPE_UNHANDLED)
                        return ret;
        }

        if (iter->ent->type == TRACE_BPUTS &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_bputs_msg_only(iter);

        if (iter->ent->type == TRACE_BPRINT &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_bprintk_msg_only(iter);

        if (iter->ent->type == TRACE_PRINT &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_printk_msg_only(iter);

        if (trace_flags & TRACE_ITER_BIN)
                return print_bin_fmt(iter);

        if (trace_flags & TRACE_ITER_HEX)
                return print_hex_fmt(iter);

        if (trace_flags & TRACE_ITER_RAW)
                return print_raw_fmt(iter);

        return print_trace_fmt(iter);
}

/*
 * Print the latency header for the iterator behind @m: the trace banner
 * when the file uses the latency format, and the column legend unless the
 * verbose flag is set. Nothing is printed for an empty trace.
 */
void trace_latency_header(struct seq_file *m)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;

        /* print nothing if the buffers are empty */
        if (trace_empty(iter))
                return;

        if (iter->iter_flags & TRACE_FILE_LAT_FMT)
                print_trace_header(m, iter);

        if (tr->trace_flags & TRACE_ITER_VERBOSE)
                return;

        print_lat_help_header(m);
}

/*
 * Print the default header for the iterator behind @m. Nothing is printed
 * without TRACE_ITER_CONTEXT_INFO. Latency format files get the latency
 * banner (unless the trace is empty); all others get the function help
 * header. The column legends are left out in verbose mode.
 */
void trace_default_header(struct seq_file *m)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        unsigned long trace_flags = tr->trace_flags;

        if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
                return;

        if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                /* print nothing if the buffers are empty */
                if (trace_empty(iter))
                        return;

                print_trace_header(m, iter);
                if (!(trace_flags & TRACE_ITER_VERBOSE))
                        print_lat_help_header(m);
                return;
        }

        if (trace_flags & TRACE_ITER_VERBOSE)
                return;

        if (trace_flags & TRACE_ITER_IRQ_INFO)
                print_func_help_header_irq(iter->array_buffer, m, trace_flags);
        else
                print_func_help_header(iter->array_buffer, m, trace_flags);
}

/* Print a warning banner into @m when ftrace_is_dead() reports true. */
static void test_ftrace_alive(struct seq_file *m)
{
        if (ftrace_is_dead())
                seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
                            "#          MAY BE MISSING FUNCTION EVENTS\n");
}

#ifdef CONFIG_TRACER_MAX_TRACE
/* Print the usage text of the top level snapshot file. */
static void show_snapshot_main_help(struct seq_file *m)
{
        static const char usage[] =
                "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
                "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
                "#                      Takes a snapshot of the main buffer.\n"
                "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
                "#                      (Doesn't have to be '2' works with any number that\n"
                "#                       is not a '0' or '1')\n";

        seq_puts(m, usage);
}

/*
 * Print the usage text of a per_cpu snapshot file. What "echo 1" is
 * documented to do depends on CONFIG_RING_BUFFER_ALLOW_SWAP.
 */
static void show_snapshot_percpu_help(struct seq_file *m)
{
        const char *echo_one_help;

#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
        echo_one_help =
                "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
                "#                      Takes a snapshot of the main buffer for this cpu.\n";
#else
        echo_one_help =
                "# echo 1 > snapshot : Not supported with this kernel.\n"
                "#                     Must use main snapshot file to allocate.\n";
#endif

        seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
        seq_puts(m, echo_one_help);
        seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
                    "#                      (Doesn't have to be '2' works with any number that\n"
                    "#                       is not a '0' or '1')\n");
}

/*
 * Print the snapshot allocation state followed by the command help that
 * matches the file type (all CPUs or a single CPU).
 */
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
        seq_puts(m, iter->tr->allocated_snapshot ?
                    "#\n# * Snapshot is allocated *\n#\n" :
                    "#\n# * Snapshot is freed *\n#\n");

        seq_puts(m, "# Snapshot commands:\n");

        if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
                show_snapshot_percpu_help(m);
        else
                show_snapshot_main_help(m);
}
#else
/*
 * Empty stand-in used when CONFIG_TRACER_MAX_TRACE is not set, so that
 * callers compile unchanged. Should never be called.
 */
static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
#endif

/*
 * seq_file ->show callback.
 *
 * Without a current entry the header is printed. Otherwise the current
 * entry is formatted into iter->seq (unless output from a previous call is
 * still pending there) and iter->seq is flushed into @m. The flush result
 * is kept in iter->leftover so that pending output is retried on the next
 * call. Always returns 0.
 */
static int s_show(struct seq_file *m, void *v)
{
        struct trace_iterator *iter = v;
        int ret;

        if (!iter->ent) {
                if (iter->tr) {
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                        test_ftrace_alive(m);
                }

                if (iter->snapshot && trace_empty(iter))
                        print_snapshot_help(m, iter);
                else if (iter->trace && iter->trace->print_header)
                        iter->trace->print_header(m);
                else
                        trace_default_header(m);

                return 0;
        }

        /*
         * If we filled the seq_file buffer earlier, the line is already
         * sitting in iter->seq and only needs to be shown.
         */
        if (!iter->leftover) {
                ret = print_trace_line(iter);
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        iter->seq.full = 0;
                        trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
                }
        }

        /*
         * If we overflow the seq_file buffer, then it will ask us for
         * this data again at start up; leftover remembers that.
         *  0 if the seq_file write succeeded.
         * -1 otherwise.
         */
        iter->leftover = trace_print_seq(m, &iter->seq);

        return 0;
}

/*
 * Return the CPU a tracing file refers to, or RING_BUFFER_ALL_CPUS when
 * the inode carries no CPU number.
 *
 * Should be used after trace_array_get(), trace_types_lock
 * ensures that i_cdev was already initialized.
 */
static inline int tracing_get_cpu(struct inode *inode)
{
        if (!inode->i_cdev)
                return RING_BUFFER_ALL_CPUS;

        /* i_cdev holds "cpu + 1", see trace_create_cpu_file() */
        return (long)inode->i_cdev - 1;
}

/*
 * seq_file operations used to read the trace; installed on the file by
 * __tracing_open() through __seq_open_private().
 */
static const struct seq_operations tracer_seq_ops = {
        .start                = s_start,
        .next                = s_next,
        .stop                = s_stop,
        .show                = s_show,
};

/*
 * Allocate and set up a trace_iterator for reading the trace through
 * tracer_seq_ops.
 *
 * @inode:    inode being opened; ->i_private is the trace_array and
 *            tracing_get_cpu() tells whether one CPU or all are read
 * @file:     file the seq_file and its private iterator are attached to
 * @snapshot: true when the "snapshot" file is being opened
 *
 * Returns the iterator on success, ERR_PTR(-ENODEV) when tracing is
 * disabled, or ERR_PTR(-ENOMEM) when an allocation fails.
 */
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int cpu;

        if (tracing_disabled)
                return ERR_PTR(-ENODEV);

        /* The iterator is the seq_file's private data */
        iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
        if (!iter)
                return ERR_PTR(-ENOMEM);

        /* One ring buffer iterator slot per possible CPU id */
        iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
                                    GFP_KERNEL);
        if (!iter->buffer_iter)
                goto release;

        /*
         * trace_find_next_entry() may need to save off iter->ent.
         * It will place it into the iter->temp buffer. As most
         * events are less than 128, allocate a buffer of that size.
         * If one is greater, then trace_find_next_entry() will
         * allocate a new buffer to adjust for the bigger iter->ent.
         * It's not critical if it fails to get allocated here.
         */
        iter->temp = kmalloc(128, GFP_KERNEL);
        if (iter->temp)
                iter->temp_size = 128;

        /*
         * trace_event_printf() may need to modify given format
         * string to replace %p with %px so that it shows real address
         * instead of hash value. However, that is only for the event
         * tracing, other tracer may not need. Defer the allocation
         * until it is needed.
         */
        iter->fmt = NULL;
        iter->fmt_size = 0;

        /*
         * We make a copy of the current tracer to avoid concurrent
         * changes on it while we are reading.
         */
        mutex_lock(&trace_types_lock);
        iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
        if (!iter->trace)
                goto fail;

        *iter->trace = *tr->current_trace;

        if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
                goto fail;

        iter->tr = tr;

#ifdef CONFIG_TRACER_MAX_TRACE
        /* Currently only the top directory has a snapshot */
        if (tr->current_trace->print_max || snapshot)
                iter->array_buffer = &tr->max_buffer;
        else
#endif
                iter->array_buffer = &tr->array_buffer;
        iter->snapshot = snapshot;
        /* -1 never equals a seq_file position, so the first s_start() resets */
        iter->pos = -1;
        iter->cpu_file = tracing_get_cpu(inode);
        mutex_init(&iter->mutex);

        /* Notify the tracer early; before we stop tracing. */
        if (iter->trace->open)
                iter->trace->open(iter);

        /* Annotate start of buffers if we had overruns */
        if (ring_buffer_overruns(iter->array_buffer->buffer))
                iter->iter_flags |= TRACE_FILE_ANNOTATE;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        /*
         * If pause-on-trace is enabled, then stop the trace while
         * dumping, unless this is the "snapshot" file
         */
        if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
                tracing_stop_tr(tr);

        /*
         * Prepare the readers, do one ring_buffer_read_prepare_sync(),
         * then start and rewind each reader.
         */
        if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
                for_each_tracing_cpu(cpu) {
                        iter->buffer_iter[cpu] =
                                ring_buffer_read_prepare(iter->array_buffer->buffer,
                                                         cpu, GFP_KERNEL);
                }
                ring_buffer_read_prepare_sync();
                for_each_tracing_cpu(cpu) {
                        ring_buffer_read_start(iter->buffer_iter[cpu]);
                        tracing_iter_reset(iter, cpu);
                }
        } else {
                cpu = iter->cpu_file;
                iter->buffer_iter[cpu] =
                        ring_buffer_read_prepare(iter->array_buffer->buffer,
                                                 cpu, GFP_KERNEL);
                ring_buffer_read_prepare_sync();
                ring_buffer_read_start(iter->buffer_iter[cpu]);
                tracing_iter_reset(iter, cpu);
        }

        mutex_unlock(&trace_types_lock);

        return iter;

        /* Reached with trace_types_lock held; frees what was allocated above */
 fail:
        mutex_unlock(&trace_types_lock);
        kfree(iter->trace);
        kfree(iter->temp);
        kfree(iter->buffer_iter);
release:
        /* Undo __seq_open_private() */
        seq_release_private(inode, file);
        return ERR_PTR(-ENOMEM);
}

/*
 * Generic open handler.
 *
 * Asks tracing_check_open_get_tr() (with no trace_array) whether the open
 * is allowed; on success the inode's private pointer is handed to the file.
 * Returns 0 on success or the error reported by the check.
 */
int tracing_open_generic(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (!err)
                filp->private_data = inode->i_private;

        return err;
}

/* Report whether the tracing_disabled flag is currently non-zero. */
bool tracing_is_disabled(void)
{
        return tracing_disabled != 0;
}

/*
 * Open handler for files whose inode private data is a trace_array.
 *
 * The trace_array ref count is updated through tracing_check_open_get_tr();
 * a matching release handler (e.g. tracing_release_generic_tr()) has to
 * call trace_array_put().  Returns 0 on success or the error from the check.
 */
int tracing_open_generic_tr(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        int err = tracing_check_open_get_tr(tr);

        if (err)
                return err;

        filp->private_data = tr;
        return 0;
}

/*
 * Open handler for files whose inode private data is a trace_event_file.
 *
 * Updates the ref count of the trace_array the event file belongs to and,
 * under event_mutex, takes a reference on the event file itself.  If the
 * event file is already flagged EVENT_FILE_FL_FREED the trace_array
 * reference is dropped again and -ENODEV is returned.
 */
int tracing_open_file_tr(struct inode *inode, struct file *filp)
{
        struct trace_event_file *ev_file = inode->i_private;
        int err;

        err = tracing_check_open_get_tr(ev_file->tr);
        if (err)
                return err;

        mutex_lock(&event_mutex);
        if (!(ev_file->flags & EVENT_FILE_FL_FREED)) {
                event_file_get(ev_file);
        } else {
                /* The file is marked for removal: undo and fail the open */
                trace_array_put(ev_file->tr);
                err = -ENODEV;
        }
        mutex_unlock(&event_mutex);

        if (!err)
                filp->private_data = ev_file;

        return err;
}

/*
 * Release counterpart of tracing_open_file_tr(): drops the trace_array
 * reference first, then the reference on the event file.  Always returns 0.
 */
int tracing_release_file_tr(struct inode *inode, struct file *filp)
{
        struct trace_event_file *ev_file = inode->i_private;
        struct trace_array *tr = ev_file->tr;

        trace_array_put(tr);
        event_file_put(ev_file);

        return 0;
}

/*
 * Release handler used together with tracing_open() in tracing_fops.
 *
 * A file that was not opened with FMODE_READ has no iterator, so only the
 * trace_array reference is dropped.  Otherwise, under trace_types_lock, the
 * per-CPU ring buffer iterators are finished, the tracer's ->close() hook
 * (if any) is called, tracing is restarted when this is not the snapshot
 * iterator and tr->stop_count is non-zero, and the trace_array reference is
 * dropped.  The iterator's resources are freed after the lock is released.
 *
 * Always returns 0.
 */
static int tracing_release(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct seq_file *m = file->private_data;
        struct trace_iterator *iter;
        int cpu;

        /* No read mode means no seq_file iterator: just drop the tr reference */
        if (!(file->f_mode & FMODE_READ)) {
                trace_array_put(tr);
                return 0;
        }

        /* Writes do not use seq_file */
        iter = m->private;
        mutex_lock(&trace_types_lock);

        /* Only CPUs that actually got a buffer iterator need finishing */
        for_each_tracing_cpu(cpu) {
                if (iter->buffer_iter[cpu])
                        ring_buffer_read_finish(iter->buffer_iter[cpu]);
        }

        if (iter->trace && iter->trace->close)
                iter->trace->close(iter);

        if (!iter->snapshot && tr->stop_count)
                /* reenable tracing if it was previously enabled */
                tracing_start_tr(tr);

        /*
         * NOTE(review): presumably the variant of trace_array_put() meant
         * for callers that already hold trace_types_lock - confirm.
         */
        __trace_array_put(tr);

        mutex_unlock(&trace_types_lock);

        /* Free everything hanging off the iterator, then the seq_file itself */
        mutex_destroy(&iter->mutex);
        free_cpumask_var(iter->started);
        kfree(iter->fmt);
        kfree(iter->temp);
        kfree(iter->trace);
        kfree(iter->buffer_iter);
        seq_release_private(inode, file);

        return 0;
}

/*
 * Release handler for files whose inode private data is a trace_array:
 * drops one reference on it.  Always returns 0.
 */
static int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return 0;
}

/*
 * Release handler for single_open() based files tied to a trace_array:
 * drops the trace_array reference and returns whatever single_release()
 * returns.
 */
static int tracing_single_release_tr(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return single_release(inode, file);
}

/*
 * Open handler of tracing_fops.
 *
 * Updates the trace_array ref count, erases the buffer contents when the
 * file is opened for writing with O_TRUNC, and sets up a trace iterator
 * when the file is opened for reading.  If creating the iterator fails, the
 * trace_array reference is dropped again and the error is returned.
 */
static int tracing_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        /* Opening for write with O_TRUNC erases the buffer contents */
        if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
                struct array_buffer *buf = &tr->array_buffer;
                int cpu = tracing_get_cpu(inode);

#ifdef CONFIG_TRACER_MAX_TRACE
                if (tr->current_trace->print_max)
                        buf = &tr->max_buffer;
#endif

                if (cpu != RING_BUFFER_ALL_CPUS)
                        tracing_reset_cpu(buf, cpu);
                else
                        tracing_reset_online_cpus(buf);
        }

        /* Nothing more to set up unless the file will be read */
        if (!(file->f_mode & FMODE_READ))
                return 0;

        iter = __tracing_open(inode, file, false);
        if (IS_ERR(iter)) {
                trace_array_put(tr);
                return PTR_ERR(iter);
        }

        if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
                iter->iter_flags |= TRACE_FILE_LAT_FMT;

        return 0;
}

/*
 * Some tracers are not suitable for instance buffers.
 * A tracer is always available for the global array (toplevel)
 * or if it explicitly states that it is.
 */
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
        return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}

/*
 * Starting at @t, walk the ->next chain and return the first tracer that
 * @tr is allowed to use, or NULL when the chain runs out.
 */
static struct tracer *
get_tracer_for_array(struct trace_array *tr, struct tracer *t)
{
        for (; t; t = t->next) {
                if (trace_ok_for_array(t, tr))
                        break;
        }

        return t;
}

/*
 * seq_file ->next: advance the position and return the next tracer after
 * @v that the trace array in m->private may use (NULL when there is none
 * or when @v is NULL).
 */
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct tracer *cur = v;

        ++*pos;

        if (!cur)
                return NULL;

        return get_tracer_for_array(m->private, cur->next);
}

/*
 * seq_file ->start: take trace_types_lock (released in t_stop()) and step
 * from the first usable tracer to the one at position *pos.  Returns NULL
 * when *pos is past the end of the list.
 */
static void *t_start(struct seq_file *m, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct tracer *t;
        loff_t idx = 0;

        mutex_lock(&trace_types_lock);

        t = get_tracer_for_array(tr, trace_types);
        while (t && idx < *pos)
                t = t_next(m, t, &idx);

        return t;
}

/* seq_file ->stop: release trace_types_lock, which t_start() acquired. */
static void t_stop(struct seq_file *m, void *p)
{
        mutex_unlock(&trace_types_lock);
}

/*
 * seq_file ->show: print the tracer's name, followed by a space when the
 * tracer has a ->next entry and by a newline when it is the last one.
 * A NULL entry prints nothing.  Always returns 0.
 */
static int t_show(struct seq_file *m, void *v)
{
        struct tracer *t = v;

        if (t) {
                seq_puts(m, t->name);
                seq_putc(m, t->next ? ' ' : '\n');
        }

        return 0;
}

/* seq_file iterator over the tracers usable by a trace array. */
static const struct seq_operations show_traces_seq_ops = {
        .start  = t_start,
        .stop   = t_stop,
        .next   = t_next,
        .show   = t_show,
};

/*
 * Open handler of show_traces_fops: updates the trace_array ref count,
 * opens the seq_file with show_traces_seq_ops and stores the trace_array
 * in the seq_file's private pointer.  The reference is dropped again if
 * seq_open() fails.
 */
static int show_traces_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int err;

        err = tracing_check_open_get_tr(tr);
        if (err)
                return err;

        err = seq_open(file, &show_traces_seq_ops);
        if (err) {
                trace_array_put(tr);
                return err;
        }

        /* seq_open() left the seq_file in file->private_data */
        ((struct seq_file *)file->private_data)->private = tr;

        return 0;
}

/*
 * Release counterpart of show_traces_open(): drops the trace_array
 * reference and returns the result of seq_release().
 */
static int show_traces_release(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return seq_release(inode, file);
}

/*
 * Write handler that discards its input: it reports all @count bytes as
 * written without reading @ubuf or touching @ppos.
 */
static ssize_t
tracing_write_stub(struct file *filp, const char __user *ubuf,
                   size_t count, loff_t *ppos)
{
        return count;
}

/*
 * llseek handler shared by tracing files.
 *
 * Files opened for reading are seq_file based, so the seek is delegated to
 * seq_lseek().  Files without FMODE_READ have no seq_file; their position
 * is simply reset to 0.
 *
 * Returns the resulting file position, or a negative error code from
 * seq_lseek().  The result is kept in a loff_t so that neither large
 * offsets nor error codes are truncated on the way out.
 */
loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
{
        loff_t ret;

        if (file->f_mode & FMODE_READ)
                ret = seq_lseek(file, offset, whence);
        else
                file->f_pos = ret = 0;

        return ret;
}

/*
 * File operations built around tracing_open()/tracing_release(): reads go
 * through seq_file, writes are accepted and discarded by
 * tracing_write_stub().
 */
static const struct file_operations tracing_fops = {
        .open           = tracing_open,
        .release        = tracing_release,
        .read           = seq_read,
        .read_iter      = seq_read_iter,
        .splice_read    = generic_file_splice_read,
        .write          = tracing_write_stub,
        .llseek         = tracing_lseek,
};

/* Read-only seq_file listing of the tracers usable by a trace array. */
static const struct file_operations show_traces_fops = {
        .open           = show_traces_open,
        .release        = show_traces_release,
        .read           = seq_read,
        .llseek         = seq_lseek,
};

/*
 * Read handler that formats tr->tracing_cpumask with "%*pb" followed by a
 * newline and copies it to user space.
 *
 * Returns the number of bytes copied, -ENOMEM if the temporary buffer
 * cannot be allocated, or -EINVAL when the formatted mask does not fit in
 * fewer than @count bytes.
 */
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
                     size_t count, loff_t *ppos)
{
        struct trace_array *tr = file_inode(filp)->i_private;
        char *buf;
        int size;

        /* Measuring pass: formatted length plus room for the trailing NUL */
        size = snprintf(NULL, 0, "%*pb\n",
                        cpumask_pr_args(tr->tracing_cpumask)) + 1;
        buf = kmalloc(size, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        size = snprintf(buf, size, "%*pb\n",
                        cpumask_pr_args(tr->tracing_cpumask));
        if (size < count)
                count = simple_read_from_buffer(ubuf, count, ppos, buf, size);
        else
                count = -EINVAL;

        kfree(buf);

        return count;
}

/*
 * Install @tracing_cpumask_new as the tracing cpumask of @tr.
 *
 * With interrupts disabled and tr->max_lock held, every tracing CPU whose
 * bit differs between the old and the new mask gets its per-CPU "disabled"
 * counter adjusted and ring buffer recording switched off or on.  The new
 * mask is copied into tr->tracing_cpumask afterwards.
 *
 * Returns -EINVAL when @tr is NULL, 0 otherwise.
 */
int tracing_set_cpumask(struct trace_array *tr,
                        cpumask_var_t tracing_cpumask_new)
{
        int cpu;

        if (!tr)
                return -EINVAL;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        for_each_tracing_cpu(cpu) {
                bool was_set = cpumask_test_cpu(cpu, tr->tracing_cpumask);
                bool now_set = cpumask_test_cpu(cpu, tracing_cpumask_new);

                /* Only a bit that is about to flip needs any work */
                if (was_set == now_set)
                        continue;

                if (was_set) {
                        /* CPU leaves the mask: stop recording on it */
                        atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                        ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
                        ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
#endif
                } else {
                        /* CPU joins the mask: allow recording on it again */
                        atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                        ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
                        ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
#endif
                }
        }
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);

        return 0;
}

/*
 * Write handler that parses a cpumask from user space and applies it with
 * tracing_set_cpumask().
 *
 * Returns @count on success, -EINVAL for an empty or oversized write,
 * -ENOMEM if the temporary mask cannot be allocated, or the error from
 * parsing or applying the mask.
 */
static ssize_t
tracing_cpumask_write(struct file *filp, const char __user *ubuf,
                      size_t count, loff_t *ppos)
{
        struct trace_array *tr = file_inode(filp)->i_private;
        cpumask_var_t new_mask;
        int err;

        if (count == 0 || count > KMALLOC_MAX_SIZE)
                return -EINVAL;

        if (!zalloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse_user(ubuf, count, new_mask);
        if (!err)
                err = tracing_set_cpumask(tr, new_mask);

        /* The temporary mask is no longer needed on any path */
        free_cpumask_var(new_mask);

        if (err)
                return err;

        return count;
}

/* File operations for reading and writing a trace array's tracing cpumask. */
static const struct file_operations tracing_cpumask_fops = {
        .open           = tracing_open_generic_tr,
        .release        = tracing_release_generic_tr,
        .read           = tracing_cpumask_read,
        .write          = tracing_cpumask_write,
        .llseek         = generic_file_llseek,
};

static int tracing_trace_options_show(struct seq_file *m, void *v)
{
        struct tracer_opt *trace_opts;
        struct trace_array *tr = m->private;
        u32 tracer_flags;
        int i;

        mutex_lock(&trace_types_lock);
        tracer_flags = tr->current_trace->flags->val;
        trace_opts = tr->current_trace->flags->opts;

        for (i = 0; trace_options[i]; i++) {
                if (tr->trace_flags & (1 << i))
                        seq_printf(m, "%s\n", trace_options[i]);
                else
                        seq_printf(m, "no%s\n", trace_options[i]);
        }

        for (i = 0; trace_opts[i].name; i++) {
                if (tracer_flags & trace_opts[i].bit)
                        seq_printf(m, "%s\n", trace_opts[i].name);
                else
                        seq_printf(m, "no%s\n", trace_opts[i].name);
        }
        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Ask the tracer that owns @tracer_flags to set (or, with @neg, clear) the
 * option @opts through its ->set_flag() callback.  The cached value in
 * tracer_flags->val is updated only when the callback returns 0; otherwise
 * the callback's return value is passed back.
 */
static int __set_tracer_option(struct trace_array *tr,
                               struct tracer_flags *tracer_flags,
                               struct tracer_opt *opts, int neg)
{
        struct tracer *owner = tracer_flags->trace;
        int err;

        err = owner->set_flag(tr, tracer_flags->val, opts->bit, !neg);
        if (err)
                return err;

        if (!neg)
                tracer_flags->val |= opts->bit;
        else
                tracer_flags->val &= ~opts->bit;

        return 0;
}

/* Try to assign a tracer specific option */
static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
        struct tracer *trace = tr->current_trace;
        struct tracer_flags *tracer_flags = trace->flags;
        struct tracer_opt *opts = NULL;
        int i;

        for (i = 0; tracer_flags->opts[i].name; i++) {
                opts = &tracer_flags->opts[i];

                if (strcmp(cmp, opts->name) == 0)
                        return __set_tracer_option(tr, trace->flags, opts, neg);
        }

        return -EINVAL;
}

/*
 * Some tracers need overwrite mode to stay on while they are enabled.
 * Returns -1 when an enabled tracer is asked to clear
 * TRACE_ITER_OVERWRITE, 0 in every other case.
 */
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
{
        /* Turning a flag on is always fine */
        if (set)
                return 0;

        /* Only the overwrite flag is protected */
        if (!(mask & TRACE_ITER_OVERWRITE))
                return 0;

        return tracer->enabled ? -1 : 0;
}

/*
 * set_tracer_flag - set or clear one global trace option bit of @tr
 * @tr: trace array whose trace_flags are updated
 * @mask: the option bit to change (compared against the TRACE_ITER_* masks
 *        below)
 * @enabled: non-zero to set the bit, zero to clear it
 *
 * After the bit is updated, the side effect that belongs to the option is
 * applied (cmdline/tgid recording, fork following, ring buffer overwrite
 * mode, trace_printk control).
 *
 * For TRACE_ITER_RECORD_TGID and TRACE_ITER_RECORD_CMD the caller must
 * hold event_mutex; this is checked with lockdep.
 *
 * Returns 0 on success or when the flag already has the requested state,
 * -EINVAL when the current tracer's ->flag_changed() callback rejects the
 * change, and -ENOMEM when the tgid map cannot be allocated.
 */
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
        int *map;

        if ((mask == TRACE_ITER_RECORD_TGID) ||
            (mask == TRACE_ITER_RECORD_CMD))
                lockdep_assert_held(&event_mutex);

        /* do nothing if flag is already set */
        if (!!(tr->trace_flags & mask) == !!enabled)
                return 0;

        /* Give the tracer a chance to approve the change */
        if (tr->current_trace->flag_changed)
                if (tr->current_trace->flag_changed(tr, mask, !!enabled))
                        return -EINVAL;

        if (enabled)
                tr->trace_flags |= mask;
        else
                tr->trace_flags &= ~mask;

        if (mask == TRACE_ITER_RECORD_CMD)
                trace_event_enable_cmd_record(enabled);

        if (mask == TRACE_ITER_RECORD_TGID) {
                /* Allocate the pid -> tgid map the first time it is needed */
                if (!tgid_map) {
                        tgid_map_max = pid_max;
                        map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
                                       GFP_KERNEL);

                        /*
                         * Pairs with smp_load_acquire() in
                         * trace_find_tgid_ptr() to ensure that if it observes
                         * the tgid_map we just allocated then it also observes
                         * the corresponding tgid_map_max value.
                         */
                        smp_store_release(&tgid_map, map);
                }
                /* Still NULL: the allocation failed, so undo the flag change */
                if (!tgid_map) {
                        tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
                        return -ENOMEM;
                }

                trace_event_enable_tgid_record(enabled);
        }

        if (mask == TRACE_ITER_EVENT_FORK)
                trace_event_follow_fork(tr, enabled);

        if (mask == TRACE_ITER_FUNC_FORK)
                ftrace_pid_follow_fork(tr, enabled);

        /* Keep the ring buffer(s) in step with the overwrite option */
        if (mask == TRACE_ITER_OVERWRITE) {
                ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
                ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
#endif
        }

        if (mask == TRACE_ITER_PRINTK) {
                trace_printk_start_stop_comm(enabled);
                trace_printk_control(enabled);
        }

        return 0;
}

/*
 * Apply one option string to @tr.
 *
 * Surrounding whitespace is stripped and a leading "no" means the option
 * is to be cleared.  The name is looked up in the global trace_options[]
 * first; if it is not found there, the current tracer's own options are
 * tried.  Both event_mutex and trace_types_lock are held while the option
 * is applied.
 *
 * Returns the result of set_tracer_flag() or set_tracer_option().
 */
int trace_set_options(struct trace_array *tr, char *option)
{
        size_t orig_len = strlen(option);
        char *name;
        int skip;
        int idx;
        int neg;
        int ret;

        name = strstrip(option);

        /* Non-zero (the prefix length) only when the name starts with "no" */
        skip = str_has_prefix(name, "no");
        neg = skip ? 1 : 0;
        name += skip;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        idx = match_string(trace_options, -1, name);
        if (idx >= 0)
                ret = set_tracer_flag(tr, 1 << idx, !neg);
        else
                /* Not a global option: try the tracer specific ones */
                ret = set_tracer_option(tr, name, neg);

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        /*
         * strstrip() may have overwritten the first trailing whitespace
         * character with '\0'; if the string got shorter, put a space back.
         */
        if (strlen(option) < orig_len)
                option[strlen(option)] = ' ';

        return ret;
}

/*
 * Apply every comma separated option in trace_boot_options_buf to the
 * global trace array.  The commas that strsep() overwrites are restored,
 * so the buffer is left intact and the function can be called again.
 */
static void __init apply_trace_boot_options(void)
{
        char *rest = trace_boot_options_buf;
        char *option;

        while ((option = strsep(&rest, ",")) != NULL) {
                /* Skip empty entries such as those produced by ",," */
                if (*option)
                        trace_set_options(&global_trace, option);

                /* Put back the comma to allow this to be called again */
                if (rest)
                        rest[-1] = ',';
        }
}

/*
 * Write handler that takes a single option string from user space and
 * applies it with trace_set_options().
 *
 * Returns @cnt on success, -EINVAL when the input does not fit the local
 * buffer together with its terminating NUL, -EFAULT when the user copy
 * fails, or the negative error from trace_set_options().
 */
static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
{
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        char kbuf[64];
        int err;

        /* One byte must stay free for the terminator */
        if (cnt > sizeof(kbuf) - 1)
                return -EINVAL;

        if (copy_from_user(kbuf, ubuf, cnt))
                return -EFAULT;

        kbuf[cnt] = '\0';

        err = trace_set_options(tr, kbuf);
        if (err < 0)
                return err;

        *ppos += cnt;

        return cnt;
}

/*
 * Open handler of tracing_iter_fops: updates the trace_array ref count and
 * sets up a single_open() seq_file that shows the options.  The reference
 * is dropped again if single_open() fails.
 */
static int tracing_trace_options_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int err = tracing_check_open_get_tr(tr);

        if (err)
                return err;

        err = single_open(file, tracing_trace_options_show, tr);
        if (err < 0)
                trace_array_put(tr);

        return err;
}

/* File operations for listing (seq_file) and changing the trace options. */
static const struct file_operations tracing_iter_fops = {
        .open           = tracing_trace_options_open,
        .release        = tracing_single_release_tr,
        .read           = seq_read,
        .write          = tracing_trace_options_write,
        .llseek         = seq_lseek,
};

/*
 * Help text handed out verbatim by tracing_readme_read().
 *
 * The text is assembled at compile time: each #ifdef section below is only
 * included when the corresponding CONFIG_* option is set, so the help only
 * lists the sections selected by the configuration.
 *
 * The shell examples for removing triggers carry a closing quote so that
 * they can be pasted as they are.
 */
static const char readme_msg[] =
        "tracing mini-HOWTO:\n\n"
        "# echo 0 > tracing_on : quick way to disable tracing\n"
        "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
        " Important files:\n"
        "  trace\t\t\t- The static contents of the buffer\n"
        "\t\t\t  To clear the buffer write into this file: echo > trace\n"
        "  trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
        "  current_tracer\t- function and latency tracers\n"
        "  available_tracers\t- list of configured tracers for current_tracer\n"
        "  error_log\t- error log for failed commands (that support it)\n"
        "  buffer_size_kb\t- view and modify size of per cpu buffer\n"
        "  buffer_total_size_kb  - view total size of all cpu buffers\n\n"
        "  trace_clock\t\t-change the clock used to order events\n"
        "       local:   Per cpu clock but may not be synced across CPUs\n"
        "      global:   Synced across CPUs but slows tracing down.\n"
        "     counter:   Not a clock, but just an increment\n"
        "      uptime:   Jiffy counter from time of boot\n"
        "        perf:   Same clock that perf events use\n"
#ifdef CONFIG_X86_64
        "     x86-tsc:   TSC cycle counter\n"
#endif
        "\n  timestamp_mode\t-view the mode used to timestamp events\n"
        "       delta:   Delta difference against a buffer-wide timestamp\n"
        "    absolute:   Absolute (standalone) timestamp\n"
        "\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
        "\n  trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
        "  tracing_cpumask\t- Limit which CPUs to trace\n"
        "  instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
        "\t\t\t  Remove sub-buffer with rmdir\n"
        "  trace_options\t\t- Set format or modify how tracing happens\n"
        "\t\t\t  Disable an option by prefixing 'no' to the\n"
        "\t\t\t  option name\n"
        "  saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
#ifdef CONFIG_DYNAMIC_FTRACE
        "\n  available_filter_functions - list of functions that can be filtered on\n"
        "  set_ftrace_filter\t- echo function name in here to only trace these\n"
        "\t\t\t  functions\n"
        "\t     accepts: func_full_name or glob-matching-pattern\n"
        "\t     modules: Can select a group via module\n"
        "\t      Format: :mod:<module-name>\n"
        "\t     example: echo :mod:ext3 > set_ftrace_filter\n"
        "\t    triggers: a command to perform when function is hit\n"
        "\t      Format: <function>:<trigger>[:count]\n"
        "\t     trigger: traceon, traceoff\n"
        "\t\t      enable_event:<system>:<event>\n"
        "\t\t      disable_event:<system>:<event>\n"
#ifdef CONFIG_STACKTRACE
        "\t\t      stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t\t      snapshot\n"
#endif
        "\t\t      dump\n"
        "\t\t      cpudump\n"
        "\t     example: echo do_fault:traceoff > set_ftrace_filter\n"
        "\t              echo do_trap:traceoff:3 > set_ftrace_filter\n"
        "\t     The first one will disable tracing every time do_fault is hit\n"
        "\t     The second will disable tracing at most 3 times when do_trap is hit\n"
        "\t       The first time do_trap is hit and it disables tracing, the\n"
        "\t       counter will decrement to 2. If tracing is already disabled,\n"
        "\t       the counter will not decrement. It only decrements when the\n"
        "\t       trigger did work\n"
        "\t     To remove trigger without count:\n"
        "\t       echo '!<function>:<trigger>' > set_ftrace_filter\n"
        "\t     To remove trigger with a count:\n"
        "\t       echo '!<function>:<trigger>:0' > set_ftrace_filter\n"
        "  set_ftrace_notrace\t- echo function name in here to never trace.\n"
        "\t    accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
        "\t    modules: Can select a group via module command :mod:\n"
        "\t    Does not accept triggers\n"
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_TRACER
        "  set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
        "\t\t    (function)\n"
        "  set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n"
        "\t\t    (function)\n"
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        "  set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
        "  set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
        "  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\n  snapshot\t\t- Like 'trace' but shows the content of the static\n"
        "\t\t\t  snapshot buffer. Read the contents for more\n"
        "\t\t\t  information\n"
#endif
#ifdef CONFIG_STACK_TRACER
        "  stack_trace\t\t- Shows the max stack trace when active\n"
        "  stack_max_size\t- Shows current max stack size that was traced\n"
        "\t\t\t  Write into this file to reset the max size (trigger a\n"
        "\t\t\t  new trace)\n"
#ifdef CONFIG_DYNAMIC_FTRACE
        "  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n"
        "\t\t\t  traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
#ifdef CONFIG_DYNAMIC_EVENTS
        "  dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_KPROBE_EVENTS
        "  kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
        "  uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
        "\t  accepts: event-definitions (one definition per line)\n"
        "\t   Format: p[:[<group>/]<event>] <place> [<args>]\n"
        "\t           r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
#ifdef CONFIG_HIST_TRIGGERS
        "\t           s:[synthetic/]<event> <field> [<field>]\n"
#endif
        "\t           -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
        "\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
  "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
  "   place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
        "\t     args: <name>=fetcharg[:type]\n"
        "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
        "\t           $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#else
        "\t           $stack<index>, $stack, $retval, $comm,\n"
#endif
        "\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
        "\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
        "\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
        "\t           <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
        "\t    field: <stype> <name>;\n"
        "\t    stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
        "\t           [unsigned] char/int/long\n"
#endif
#endif
        "  events/\t\t- Directory containing all trace event subsystems:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
        "  events/<system>/\t- Directory containing all trace events for <system>:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n"
        "\t\t\t  events\n"
        "      filter\t\t- If set, only events passing filter are traced\n"
        "  events/<system>/<event>/\t- Directory containing control files for\n"
        "\t\t\t  <event>:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of <event>\n"
        "      filter\t\t- If set, only events passing filter are traced\n"
        "      trigger\t\t- If set, a command to perform when event is hit\n"
        "\t    Format: <trigger>[:count][if <filter>]\n"
        "\t   trigger: traceon, traceoff\n"
        "\t            enable_event:<system>:<event>\n"
        "\t            disable_event:<system>:<event>\n"
#ifdef CONFIG_HIST_TRIGGERS
        "\t            enable_hist:<system>:<event>\n"
        "\t            disable_hist:<system>:<event>\n"
#endif
#ifdef CONFIG_STACKTRACE
        "\t\t    stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t\t    snapshot\n"
#endif
#ifdef CONFIG_HIST_TRIGGERS
        "\t\t    hist (see below)\n"
#endif
        "\t   example: echo traceoff > events/block/block_unplug/trigger\n"
        "\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
        "\t            echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
        "\t                  events/block/block_unplug/trigger\n"
        "\t   The first disables tracing every time block_unplug is hit.\n"
        "\t   The second disables tracing the first 3 times block_unplug is hit.\n"
        "\t   The third enables the kmalloc event the first 3 times block_unplug\n"
        "\t     is hit and has value of greater than 1 for the 'nr_rq' event field.\n"
        "\t   Like function triggers, the counter is only decremented if it\n"
        "\t    enabled or disabled tracing.\n"
        "\t   To remove a trigger without a count:\n"
        "\t     echo '!<trigger>' > <system>/<event>/trigger\n"
        "\t   To remove a trigger with a count:\n"
        "\t     echo '!<trigger>:0' > <system>/<event>/trigger\n"
        "\t   Filters can be ignored when removing a trigger.\n"
#ifdef CONFIG_HIST_TRIGGERS
        "      hist trigger\t- If set, event hits are aggregated into a hash table\n"
        "\t    Format: hist:keys=<field1[,field2,...]>\n"
        "\t            [:values=<field1[,field2,...]>]\n"
        "\t            [:sort=<field1[,field2,...]>]\n"
        "\t            [:size=#entries]\n"
        "\t            [:pause][:continue][:clear]\n"
        "\t            [:name=histname1]\n"
        "\t            [:<handler>.<action>]\n"
        "\t            [if <filter>]\n\n"
        "\t    Note, special fields can be used as well:\n"
        "\t            common_timestamp - to record current timestamp\n"
        "\t            common_cpu - to record the CPU the event happened on\n"
        "\n"
        "\t    When a matching event is hit, an entry is added to a hash\n"
        "\t    table using the key(s) and value(s) named, and the value of a\n"
        "\t    sum called 'hitcount' is incremented.  Keys and values\n"
        "\t    correspond to fields in the event's format description.  Keys\n"
        "\t    can be any field, or the special string 'stacktrace'.\n"
        "\t    Compound keys consisting of up to two fields can be specified\n"
        "\t    by the 'keys' keyword.  Values must correspond to numeric\n"
        "\t    fields.  Sort keys consisting of up to two fields can be\n"
        "\t    specified using the 'sort' keyword.  The sort direction can\n"
        "\t    be modified by appending '.descending' or '.ascending' to a\n"
        "\t    sort field.  The 'size' parameter can be used to specify more\n"
        "\t    or fewer than the default 2048 entries for the hashtable size.\n"
        "\t    If a hist trigger is given a name using the 'name' parameter,\n"
        "\t    its histogram data will be shared with other triggers of the\n"
        "\t    same name, and trigger hits will update this common data.\n\n"
        "\t    Reading the 'hist' file for the event will dump the hash\n"
        "\t    table in its entirety to stdout.  If there are multiple hist\n"
        "\t    triggers attached to an event, there will be a table for each\n"
        "\t    trigger in the output.  The table displayed for a named\n"
        "\t    trigger will be the same as any other instance having the\n"
        "\t    same name.  The default format used to display a given field\n"
        "\t    can be modified by appending any of the following modifiers\n"
        "\t    to the field name, as applicable:\n\n"
        "\t            .hex        display a number as a hex value\n"
        "\t            .sym        display an address as a symbol\n"
        "\t            .sym-offset display an address as a symbol and offset\n"
        "\t            .execname   display a common_pid as a program name\n"
        "\t            .syscall    display a syscall id as a syscall name\n"
        "\t            .log2       display log2 value rather than raw number\n"
        "\t            .usecs      display a common_timestamp in microseconds\n\n"
        "\t    The 'pause' parameter can be used to pause an existing hist\n"
        "\t    trigger or to start a hist trigger but not log any events\n"
        "\t    until told to do so.  'continue' can be used to start or\n"
        "\t    restart a paused hist trigger.\n\n"
        "\t    The 'clear' parameter will clear the contents of a running\n"
        "\t    hist trigger and leave its current paused/active state\n"
        "\t    unchanged.\n\n"
        "\t    The enable_hist and disable_hist triggers can be used to\n"
        "\t    have one event conditionally start and stop another event's\n"
        "\t    already-attached hist trigger.  The syntax is analogous to\n"
        "\t    the enable_event and disable_event triggers.\n\n"
        "\t    Hist trigger handlers and actions are executed whenever a\n"
        "\t    histogram entry is added or updated.  They take the form:\n\n"
        "\t        <handler>.<action>\n\n"
        "\t    The available handlers are:\n\n"
        "\t        onmatch(matching.event)  - invoke on addition or update\n"
        "\t        onmax(var)               - invoke if var exceeds current max\n"
        "\t        onchange(var)            - invoke action if var changes\n\n"
        "\t    The available actions are:\n\n"
        "\t        trace(<synthetic_event>,param list)  - generate synthetic event\n"
        "\t        save(field,...)                      - save current event fields\n"
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t        snapshot()                           - snapshot the trace buffer\n\n"
#endif
#ifdef CONFIG_SYNTH_EVENTS
        "  events/synthetic_events\t- Create/append/remove/show synthetic events\n"
        "\t  Write into this file to define/undefine new synthetic events.\n"
        "\t     example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n"
#endif
#endif
;

/*
 * read() handler that serves the static readme_msg text, honouring the
 * caller's file offset (@ppos) and requested length (@cnt).
 */
static ssize_t
tracing_readme_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
{
        size_t msg_len = strlen(readme_msg);

        return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, msg_len);
}

/* File operations for the file whose contents come from tracing_readme_read(). */
static const struct file_operations tracing_readme_fops = {
        .open = tracing_open_generic,
        .llseek = generic_file_llseek,
        .read = tracing_readme_read,
};

/*
 * seq_file ->next callback: advance the position by one and look up the
 * tgid slot for the pid equal to the new position.
 */
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
        *pos += 1;

        return trace_find_tgid_ptr((int)*pos);
}

/*
 * seq_file ->start callback: the file position is used directly as the
 * pid whose tgid slot is looked up.
 */
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
        return trace_find_tgid_ptr((int)*pos);
}

/*
 * seq_file ->stop callback for the saved tgids iterator.  Intentionally
 * empty: saved_tgids_start() acquires nothing that needs releasing.
 */
static void saved_tgids_stop(struct seq_file *m, void *v)
{
}

/*
 * seq_file ->show callback: @v points at one slot of tgid_map.  The slot's
 * index is the pid and its value is the tgid; slots holding 0 are skipped.
 */
static int saved_tgids_show(struct seq_file *m, void *v)
{
        int *slot = v;
        int tgid = *slot;

        if (!tgid)
                return SEQ_SKIP;

        seq_printf(m, "%d %d\n", (int)(slot - tgid_map), tgid);
        return 0;
}

/* seq_file iterator callbacks for the saved pid -> tgid listing. */
static const struct seq_operations tracing_saved_tgids_seq_ops = {
        .start = saved_tgids_start,
        .next = saved_tgids_next,
        .show = saved_tgids_show,
        .stop = saved_tgids_stop,
};

/*
 * open() handler: run the common tracing open check (no trace_array is
 * associated, hence NULL), then attach the saved tgids seq_file iterator.
 */
static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        return seq_open(filp, &tracing_saved_tgids_seq_ops);
}


/* File operations for the seq_file based saved tgids listing. */
static const struct file_operations tracing_saved_tgids_fops = {
        .open = tracing_saved_tgids_open,
        .release = seq_release,
        .llseek = seq_lseek,
        .read = seq_read,
};

/*
 * seq_file ->next callback: return the next slot of
 * savedcmd->map_cmdline_to_pid[] that holds a real pid, or NULL once the
 * end of the array is reached.  Slots holding -1 or NO_CMDLINE_MAP are
 * passed over.
 */
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
        unsigned int *slot = v;
        unsigned int *end;

        /* Step off the current slot unless both *pos and m->count are zero. */
        if (*pos || m->count)
                slot++;

        (*pos)++;

        end = &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
        while (slot < end) {
                if (*slot != -1 && *slot != NO_CMDLINE_MAP)
                        return slot;
                slot++;
        }

        return NULL;
}

/*
 * seq_file ->start callback: disable preemption, take trace_cmdline_lock
 * and walk forward from the first slot until the entry for *pos is
 * reached.  The lock is released in saved_cmdlines_stop(), so it is still
 * held when this returns NULL.
 */
static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
        void *cur;
        loff_t idx;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);

        cur = &savedcmd->map_cmdline_to_pid[0];
        for (idx = 0; idx <= *pos; ) {
                cur = saved_cmdlines_next(m, cur, &idx);
                if (!cur)
                        break;
        }

        return cur;
}

/*
 * seq_file ->stop callback: drop trace_cmdline_lock and re-enable
 * preemption, undoing what saved_cmdlines_start() did, in reverse order.
 */
static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
}

/*
 * seq_file ->show callback: @v points at a pid stored in the saved
 * cmdlines map; print that pid followed by the command name found for it.
 */
static int saved_cmdlines_show(struct seq_file *m, void *v)
{
        unsigned int *pid_slot = v;
        char comm[TASK_COMM_LEN];

        __trace_find_cmdline(*pid_slot, comm);
        seq_printf(m, "%d %s\n", *pid_slot, comm);

        return 0;
}

/* seq_file iterator callbacks for the saved pid -> command name listing. */
static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
        .start = saved_cmdlines_start,
        .next = saved_cmdlines_next,
        .show = saved_cmdlines_show,
        .stop = saved_cmdlines_stop,
};

/*
 * open() handler: run the common tracing open check (no trace_array is
 * associated, hence NULL), then attach the saved cmdlines seq_file iterator.
 */
static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}

/* File operations for the seq_file based saved cmdlines listing. */
static const struct file_operations tracing_saved_cmdlines_fops = {
        .open = tracing_saved_cmdlines_open,
        .release = seq_release,
        .llseek = seq_lseek,
        .read = seq_read,
};

/*
 * read() handler reporting savedcmd->cmdline_num as a decimal number
 * followed by a newline.  The value is formatted while holding
 * trace_cmdline_lock with preemption disabled.
 */
static ssize_t
tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
                                 size_t cnt, loff_t *ppos)
{
        char text[64];
        int len;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);

        len = scnprintf(text, sizeof(text), "%u\n", savedcmd->cmdline_num);

        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();

        return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
}

static int tracing_resize_saved_cmdlines(unsigned int val)
{
        struct saved_cmdlines_buffer *s, *savedcmd_temp;

        s = allocate_cmdlines_buffer(val);
        if (!s)
                return -ENOMEM;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);
        savedcmd_temp = savedcmd;
        savedcmd = s;
        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
        free_saved_cmdlines_buffer(savedcmd_temp);

        return 0;
}

/*
 * write() handler: parse a decimal entry count from user space and resize
 * the saved cmdlines buffer to it.
 *
 * Returns @cnt on success, the parse error from kstrtoul_from_user(),
 * -EINVAL when the value is 0 or above PID_MAX_DEFAULT, or the error
 * returned by tracing_resize_saved_cmdlines().
 */
static ssize_t
tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
                                  size_t cnt, loff_t *ppos)
{
        unsigned long entries;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &entries);
        if (err)
                return err;

        /* Accept only 1 .. PID_MAX_DEFAULT entries */
        if (entries == 0 || entries > PID_MAX_DEFAULT)
                return -EINVAL;

        err = tracing_resize_saved_cmdlines((unsigned int)entries);
        if (err < 0)
                return err;

        *ppos += cnt;

        return cnt;
}

/* File operations for reading and changing the saved cmdlines entry count. */
static const struct file_operations tracing_saved_cmdlines_size_fops = {
        .open = tracing_open_generic,
        .write = tracing_saved_cmdlines_size_write,
        .read = tracing_saved_cmdlines_size_read,
};

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/*
 * Normalise @ptr so that it refers to a real map item.  An item without an
 * eval_string is treated as a tail: follow its next pointer and step over
 * the head of the following array, or return NULL when there is no next
 * array.
 */
static union trace_eval_map_item *
update_eval_map(union trace_eval_map_item *ptr)
{
        /* Already on a real item */
        if (ptr->map.eval_string)
                return ptr;

        /* Tail of the last array: nothing further */
        if (!ptr->tail.next)
                return NULL;

        /* Jump to the next array and skip its head */
        return ptr->tail.next + 1;
}

/*
 * seq_file ->next callback: bump the position and return the eval map item
 * after @v, crossing into the next array when needed.  Returns NULL at the
 * end of the list.
 */
static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
{
        union trace_eval_map_item *cur;

        (*pos)++;

        /*
         * Paranoid! If @v already points to the end, we don't want to
         * increment past it.  This really should never happen.
         */
        cur = update_eval_map(v);
        if (WARN_ON_ONCE(!cur))
                return NULL;

        return update_eval_map(cur + 1);
}

/*
 * seq_file ->start callback: take trace_eval_mutex and advance from the
 * first item (the one after the head of trace_eval_maps) to the item at
 * *pos.  The mutex is released in eval_map_stop().
 */
static void *eval_map_start(struct seq_file *m, loff_t *pos)
{
        union trace_eval_map_item *item;
        loff_t idx;

        mutex_lock(&trace_eval_mutex);

        item = trace_eval_maps;
        if (item)
                item++;

        for (idx = 0; item && idx < *pos; )
                item = eval_map_next(m, item, &idx);

        return item;
}

/* seq_file ->stop callback: release the mutex taken by eval_map_start(). */
static void eval_map_stop(struct seq_file *m, void *v)
{
        mutex_unlock(&trace_eval_mutex);
}

/*
 * seq_file ->show callback: print one eval map entry as
 * "<eval_string> <eval_value> (<system>)".
 */
static int eval_map_show(struct seq_file *m, void *v)
{
        union trace_eval_map_item *item = v;

        seq_printf(m, "%s %ld (%s)\n", item->map.eval_string,
                   item->map.eval_value, item->map.system);

        return 0;
}

/* seq_file iterator callbacks for the eval map listing. */
static const struct seq_operations tracing_eval_map_seq_ops = {
        .start = eval_map_start,
        .next = eval_map_next,
        .show = eval_map_show,
        .stop = eval_map_stop,
};

/*
 * open() handler: run the common tracing open check (no trace_array is
 * associated, hence NULL), then attach the eval map seq_file iterator.
 */
static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        return seq_open(filp, &tracing_eval_map_seq_ops);
}

/* File operations for the seq_file based eval map listing. */
static const struct file_operations tracing_eval_map_fops = {
        .open = tracing_eval_map_open,
        .release = seq_release,
        .llseek = seq_lseek,
        .read = seq_read,
};

/*
 * Given the head item of an eval map array, return its tail item: the head
 * is followed by head.length map items, and the tail comes right after.
 */
static inline union trace_eval_map_item *
trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
{
        union trace_eval_map_item *last_map = ptr + ptr->head.length;

        return last_map + 1;
}

/*
 * Copy @len eval maps (pointed to by the @start array) into a newly
 * allocated array and append that array to the trace_eval_maps list.
 *
 * On allocation failure a warning is printed and nothing is added.
 */
static void
trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
                           int len)
{
        struct trace_eval_map **stop;
        struct trace_eval_map **map;
        union trace_eval_map_item *map_array;
        union trace_eval_map_item *ptr;

        stop = start + len;

        /*
         * The trace_eval_maps contains the map plus a head and tail item,
         * where the head holds the module and length of array, and the
         * tail holds a pointer to the next list.
         */
        map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL);
        if (!map_array) {
                pr_warn("Unable to allocate trace eval mapping\n");
                return;
        }

        mutex_lock(&trace_eval_mutex);

        if (!trace_eval_maps)
                trace_eval_maps = map_array;
        else {
                /* Walk head -> tail -> next head until the last tail is found */
                ptr = trace_eval_maps;
                for (;;) {
                        ptr = trace_eval_jmp_to_tail(ptr);
                        if (!ptr->tail.next)
                                break;
                        ptr = ptr->tail.next;

                }
                ptr->tail.next = map_array;
        }
        /* Fill in the head, then step past it to the first map slot */
        map_array->head.mod = mod;
        map_array->head.length = len;
        map_array++;

        for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
                map_array->map = **map;
                map_array++;
        }
        /* Zero the tail item so that its next pointer is NULL */
        memset(map_array, 0, sizeof(*map_array));

        mutex_unlock(&trace_eval_mutex);
}

/*
 * Create the "eval_map" file (mode 0444) under @d_tracer, served by
 * tracing_eval_map_fops with no private data attached.
 */
static void trace_create_eval_file(struct dentry *d_tracer)
{
        trace_create_file("eval_map", 0444, d_tracer,
                          NULL, &tracing_eval_map_fops);
}

#else /* CONFIG_TRACE_EVAL_MAP_FILE */
/* No-op stand-ins used when CONFIG_TRACE_EVAL_MAP_FILE is not enabled. */
static inline void trace_create_eval_file(struct dentry *d_tracer) { }
static inline void trace_insert_eval_map_file(struct module *mod,
                              struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */

/*
 * Register @len eval maps starting at @start: first pass them to
 * trace_event_eval_update(), then record them for the eval_map file.
 * Does nothing when @len is zero or negative.
 */
static void trace_insert_eval_map(struct module *mod,
                                  struct trace_eval_map **start, int len)
{
        if (len <= 0)
                return;

        trace_event_eval_update(start, len);
        trace_insert_eval_map_file(mod, start, len);
}

/*
 * read() handler reporting the name of the tracer currently attached to the
 * trace_array stored in filp->private_data, followed by a newline.
 */
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[MAX_TRACER_SIZE+2];
        int r;

        /* tr->current_trace is sampled under trace_types_lock */
        mutex_lock(&trace_types_lock);
        /*
         * Use the bounded scnprintf() so that the on-stack buffer can never
         * be overrun, whatever the length of the tracer name.
         */
        r = scnprintf(buf, sizeof(buf), "%s\n", tr->current_trace->name);
        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

/*
 * Reset @tr's main buffer for the online CPUs and then run tracer @t's
 * ->init() callback on @tr.  Returns whatever ->init() returns.
 *
 * ->init is called unconditionally here; tracing_set_tracer() checks that
 * it is non-NULL before calling this function.
 */
int tracer_init(struct tracer *t, struct trace_array *tr)
{
        tracing_reset_online_cpus(&tr->array_buffer);
        return t->init(tr);
}

/* Record @val as the ->entries value of every tracing CPU's data in @buf. */
static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
        int cpu;

        for_each_tracing_cpu(cpu)
                per_cpu_ptr(buf->data, cpu)->entries = val;
}

/*
 * Refresh the cached ->entries value(s) of @buf from the ring buffer's
 * current size.  For RING_BUFFER_ALL_CPUS the size reported for CPU 0 is
 * recorded for every tracing CPU; otherwise only @cpu is updated.
 */
static void update_buffer_entries(struct array_buffer *buf, int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
                return;
        }

        set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
}

#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Resize @trace_buf's ring buffer so that each selected CPU gets the number
 * of entries recorded in @size_buf, updating @trace_buf's recorded entries
 * as each resize succeeds.  @cpu_id selects one CPU or, with
 * RING_BUFFER_ALL_CPUS, every tracing CPU; in the latter case the walk
 * stops at the first failing CPU.
 *
 * Returns the result of the last ring_buffer_resize() call.
 */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
                                        struct array_buffer *size_buf, int cpu_id)
{
        int cpu;
        int ret = 0;

        /* Single CPU request */
        if (cpu_id != RING_BUFFER_ALL_CPUS) {
                ret = ring_buffer_resize(trace_buf->buffer,
                                 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
                if (!ret)
                        per_cpu_ptr(trace_buf->data, cpu_id)->entries =
                                per_cpu_ptr(size_buf->data, cpu_id)->entries;
                return ret;
        }

        /* All CPUs: stop at the first failure */
        for_each_tracing_cpu(cpu) {
                ret = ring_buffer_resize(trace_buf->buffer,
                                 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
                if (ret < 0)
                        return ret;
                per_cpu_ptr(trace_buf->data, cpu)->entries =
                        per_cpu_ptr(size_buf->data, cpu)->entries;
        }

        return ret;
}
#endif /* CONFIG_TRACER_MAX_TRACE */

/*
 * Resize @tr's main ring buffer to @size for @cpu and, when
 * CONFIG_TRACER_MAX_TRACE is set and a snapshot is allocated, resize the
 * max (snapshot) buffer to match.  Tracing on @tr is stopped while the
 * buffers are resized and restarted before returning.
 *
 * Returns 0 when the main buffer has not been allocated yet; otherwise the
 * value returned by the last ring_buffer_resize() call (negative on
 * failure).  The callers in this file invoke it with trace_types_lock held.
 */
static int __tracing_resize_ring_buffer(struct trace_array *tr,
                                        unsigned long size, int cpu)
{
        int ret;

        /*
         * If kernel or user changes the size of the ring buffer
         * we use the size that was given, and we can forget about
         * expanding it later.
         */
        ring_buffer_expanded = true;

        /* May be called before buffers are initialized */
        if (!tr->array_buffer.buffer)
                return 0;

        /* Do not allow tracing while resizing ring buffer */
        tracing_stop_tr(tr);

        ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
        if (ret < 0)
                goto out_start;

#ifdef CONFIG_TRACER_MAX_TRACE
        /* No snapshot buffer in use: only the main buffer needs updating */
        if (!tr->allocated_snapshot)
                goto out;

        ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
        if (ret < 0) {
                /*
                 * The recorded ->entries of the main buffer have not been
                 * updated yet, so duplicating the buffer's own recorded
                 * size asks for it to be put back to its previous size.
                 */
                int r = resize_buffer_duplicate_size(&tr->array_buffer,
                                                     &tr->array_buffer, cpu);
                if (r < 0) {
                        /*
                         * AARGH! We are left with different
                         * size max buffer!!!!
                         * The max buffer is our "snapshot" buffer.
                         * When a tracer needs a snapshot (one of the
                         * latency tracers), it swaps the max buffer
                         * with the saved snap shot. We succeeded to
                         * update the size of the main buffer, but failed to
                         * update the size of the max buffer. But when we tried
                         * to reset the main buffer to the original size, we
                         * failed there too. This is very unlikely to
                         * happen, but if it does, warn and kill all
                         * tracing.
                         */
                        WARN_ON(1);
                        tracing_disabled = 1;
                }
                goto out_start;
        }

        update_buffer_entries(&tr->max_buffer, cpu);

 out:
#endif /* CONFIG_TRACER_MAX_TRACE */

        update_buffer_entries(&tr->array_buffer, cpu);
 out_start:
        /* Reached on both success and failure: always restart tracing */
        tracing_start_tr(tr);
        return ret;
}

/*
 * Locked entry point for resizing @tr's ring buffer to @size.
 *
 * @cpu_id is either RING_BUFFER_ALL_CPUS or a CPU that must be set in
 * tracing_buffer_mask.  trace_types_lock is held across the resize.
 *
 * Returns -EINVAL for a CPU outside the mask, -ENOMEM when the resize
 * fails, and otherwise the non-negative value returned by
 * __tracing_resize_ring_buffer().
 */
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
                                  unsigned long size, int cpu_id)
{
        /*
         * No initialiser: every path below assigns ret before it is read,
         * and seeding it from @size would narrow an unsigned long to int.
         */
        int ret;

        mutex_lock(&trace_types_lock);

        /* make sure, this cpu is enabled in the mask */
        if (cpu_id != RING_BUFFER_ALL_CPUS &&
            !cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
                ret = -EINVAL;
                goto out;
        }

        ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
        if (ret < 0)
                ret = -ENOMEM;

out:
        mutex_unlock(&trace_types_lock);

        return ret;
}


/**
 * tracing_update_buffers - used by tracing facility to expand ring buffers
 *
 * To save memory when tracing is configured in but never used on a system,
 * the ring buffers are initially set to a minimum size. Once a user starts
 * to use the tracing facility, they need to grow to their default size.
 *
 * This function is to be called when a tracer is about to be used.
 *
 * Return: 0 if the buffers were already expanded, otherwise the result of
 * resizing the global trace buffer to trace_buf_size on all CPUs.
 */
int tracing_update_buffers(void)
{
        int ret = 0;

        mutex_lock(&trace_types_lock);
        if (!ring_buffer_expanded)
                ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
                                                RING_BUFFER_ALL_CPUS);
        mutex_unlock(&trace_types_lock);

        return ret;
}

struct trace_option_dentry;

static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer);

/*
 * Used to clear out the tracer before deletion of an instance.
 * Must have trace_types_lock held.
 */
static void tracing_set_nop(struct trace_array *tr)
{
        if (tr->current_trace == &nop_trace)
                return;
        
        tr->current_trace->enabled--;

        if (tr->current_trace->reset)
                tr->current_trace->reset(tr);

        tr->current_trace = &nop_trace;
}

static bool tracer_options_updated;

/*
 * Create the option files of tracer @t for @tr, but only once @tr's
 * directory exists and tracer_options_updated has been set (i.e. after
 * update_tracer_options has finished).
 */
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
        if (!tr->dir || !tracer_options_updated)
                return;

        create_trace_option_files(tr, t);
}

/*
 * Switch @tr over to the registered tracer whose name equals @buf.
 *
 * Runs entirely under trace_types_lock.  Returns 0 on success, and also
 * when the requested tracer is already current or is skipped because it is
 * not allowed before the system is running.  Returns -EINVAL when no
 * registered tracer matches @buf or the tracer is not acceptable for @tr,
 * -EBUSY when a conditional snapshot is in use or trace pipe readers exist,
 * or the error returned by the buffer resize, snapshot allocation or
 * tracer init.
 *
 * Note that if snapshot allocation or tracer init fails, the previous
 * tracer has already been torn down and @tr is left running nop_trace.
 */
int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
        struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
        bool had_max_tr;
#endif
        int ret = 0;

        mutex_lock(&trace_types_lock);

        /* First use of a tracer: grow the ring buffer to trace_buf_size */
        if (!ring_buffer_expanded) {
                ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
                                                RING_BUFFER_ALL_CPUS);
                if (ret < 0)
                        goto out;
                ret = 0;
        }

        /* Look the tracer up by exact name */
        for (t = trace_types; t; t = t->next) {
                if (strcmp(t->name, buf) == 0)
                        break;
        }
        if (!t) {
                ret = -EINVAL;
                goto out;
        }
        /* Nothing to do if it is already the current tracer */
        if (t == tr->current_trace)
                goto out;

#ifdef CONFIG_TRACER_SNAPSHOT
        /* A tracer using the max buffer cannot coexist with a cond snapshot */
        if (t->use_max_tr) {
                local_irq_disable();
                arch_spin_lock(&tr->max_lock);
                if (tr->cond_snapshot)
                        ret = -EBUSY;
                arch_spin_unlock(&tr->max_lock);
                local_irq_enable();
                if (ret)
                        goto out;
        }
#endif
        /* Some tracers won't work on kernel command line */
        if (system_state < SYSTEM_RUNNING && t->noboot) {
                pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
                        t->name);
                goto out;
        }

        /* Some tracers are only allowed for the top level buffer */
        if (!trace_ok_for_array(t, tr)) {
                ret = -EINVAL;
                goto out;
        }

        /* If trace pipe files are being read, we can't change the tracer */
        if (tr->trace_ref) {
                ret = -EBUSY;
                goto out;
        }

        trace_branch_disable();

        /* Tear down the tracer that is currently attached */
        tr->current_trace->enabled--;

        if (tr->current_trace->reset)
                tr->current_trace->reset(tr);

#ifdef CONFIG_TRACER_MAX_TRACE
        had_max_tr = tr->current_trace->use_max_tr;

        /* Current trace needs to be nop_trace before synchronize_rcu */
        tr->current_trace = &nop_trace;

        if (had_max_tr && !t->use_max_tr) {
                /*
                 * We need to make sure that the update_max_tr sees that
                 * current_trace changed to nop_trace to keep it from
                 * swapping the buffers after we resize it.
                 * The update_max_tr is called from interrupts disabled
                 * so a synchronize_rcu() is sufficient.
                 */
                synchronize_rcu();
                free_snapshot(tr);
        }

        /* The new tracer needs the max buffer: make sure it is allocated */
        if (t->use_max_tr && !tr->allocated_snapshot) {
                ret = tracing_alloc_snapshot_instance(tr);
                if (ret < 0)
                        goto out;
        }
#else
        tr->current_trace = &nop_trace;
#endif

        if (t->init) {
                ret = tracer_init(t, tr);
                if (ret)
                        goto out;
        }

        /* Everything succeeded: publish the new tracer */
        tr->current_trace = t;
        tr->current_trace->enabled++;
        trace_branch_enable(tr);
 out:
        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * write() handler that selects a tracer by name.  At most MAX_TRACER_SIZE
 * bytes of the user's data are used; trailing whitespace is stripped before
 * the name is handed to tracing_set_tracer().
 *
 * On success the full original @cnt is reported as written, even if the
 * input was truncated.
 */
static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char name[MAX_TRACER_SIZE+1];
        size_t written = cnt;
        int err;
        int i;

        if (cnt > MAX_TRACER_SIZE)
                cnt = MAX_TRACER_SIZE;

        if (copy_from_user(name, ubuf, cnt))
                return -EFAULT;

        name[cnt] = 0;

        /* strip ending whitespace (index 0 is never touched) */
        i = cnt - 1;
        while (i > 0 && isspace(name[i])) {
                name[i] = 0;
                i--;
        }

        err = tracing_set_tracer(tr, name);
        if (err)
                return err;

        *ppos += written;

        return written;
}

/*
 * Report the nanosecond value at @ptr to user space in microseconds, as a
 * decimal number followed by a newline.  The special value
 * (unsigned long)-1 is reported as -1.
 */
static ssize_t
tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
                   size_t cnt, loff_t *ppos)
{
        char buf[64];
        int r;

        /*
         * scnprintf() returns the number of characters actually stored
         * (excluding the NUL), so r can never exceed what buf holds and no
         * separate truncation clamp is needed.
         */
        r = scnprintf(buf, sizeof(buf), "%ld\n",
                      *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

/*
 * Parse a decimal value from user space and store it at @ptr multiplied by
 * 1000 (the read side converts the stored nanoseconds back to
 * microseconds).  The multiplication is not checked for overflow.
 *
 * Returns @cnt on success or the parse error from kstrtoul_from_user().
 */
static ssize_t
tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        unsigned long usecs;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &usecs);
        if (err)
                return err;

        *ptr = usecs * 1000;

        return cnt;
}

/* read() handler for tracing_thresh: reports it via tracing_nsecs_read(). */
static ssize_t
tracing_thresh_read(struct file *filp, char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
}

/*
 * write() handler for tracing_thresh: store the new value and, if the
 * current tracer of the trace_array in filp->private_data provides an
 * ->update_thresh() callback, notify it.  All done under trace_types_lock.
 *
 * Returns @cnt on success, or the negative error from parsing or from the
 * tracer callback.
 */
static ssize_t
tracing_thresh_write(struct file *filp, const char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        int ret;

        mutex_lock(&trace_types_lock);

        ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);

        /* Only notify the tracer when the new value was stored */
        if (ret >= 0 && tr->current_trace->update_thresh)
                ret = tr->current_trace->update_thresh(tr);

        if (ret >= 0)
                ret = cnt;

        mutex_unlock(&trace_types_lock);

        return ret;
}

#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)

/*
 * read() handler: filp->private_data is handed to tracing_nsecs_read() as
 * the pointer to the nanosecond value to report.
 */
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
}

/*
 * write() handler: filp->private_data is handed to tracing_nsecs_write() as
 * the pointer to the nanosecond value to update.
 */
static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
                      size_t cnt, loff_t *ppos)
{
        return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
}

#endif

/*
 * Claim the trace pipe for @cpu in @tr->pipe_cpumask.
 *
 * RING_BUFFER_ALL_CPUS can only be claimed while no CPU is claimed, and
 * then marks every CPU; a single CPU can only be claimed while its bit is
 * clear.  Returns 0 on success and -EBUSY otherwise.
 */
static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
{
        if (cpu == RING_BUFFER_ALL_CPUS) {
                if (!cpumask_empty(tr->pipe_cpumask))
                        return -EBUSY;

                cpumask_setall(tr->pipe_cpumask);
                return 0;
        }

        if (cpumask_test_cpu(cpu, tr->pipe_cpumask))
                return -EBUSY;

        cpumask_set_cpu(cpu, tr->pipe_cpumask);
        return 0;
}

/*
 * Release the claim made by open_pipe_on_cpu() for @cpu, warning if the
 * mask is not in the state that claim would have left it in.
 */
static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
                cpumask_clear_cpu(cpu, tr->pipe_cpumask);
                return;
        }

        WARN_ON(!cpumask_full(tr->pipe_cpumask));
        cpumask_clear(tr->pipe_cpumask);
}

/*
 * open() handler for a trace pipe file.
 *
 * Runs the common open check on the trace_array in inode->i_private,
 * claims the pipe for the inode's CPU, and allocates and initialises a
 * trace_iterator that is stored in filp->private_data.  On success
 * tr->trace_ref is incremented.
 *
 * Returns 0 on success, the error from tracing_check_open_get_tr(), -EBUSY
 * when the pipe is already claimed for this CPU, or -ENOMEM.
 */
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int cpu;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        mutex_lock(&trace_types_lock);
        cpu = tracing_get_cpu(inode);
        ret = open_pipe_on_cpu(tr, cpu);
        if (ret)
                goto fail_pipe_on_cpu;

        /* create a buffer to store the information to pass to userspace */
        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
                ret = -ENOMEM;
                goto fail_alloc_iter;
        }

        trace_seq_init(&iter->seq);
        iter->trace = tr->current_trace;

        if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
                ret = -ENOMEM;
                goto fail;
        }

        /* trace pipe does not show start of buffer */
        cpumask_setall(iter->started);

        if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
                iter->iter_flags |= TRACE_FILE_LAT_FMT;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        iter->tr = tr;
        iter->array_buffer = &tr->array_buffer;
        iter->cpu_file = cpu;
        mutex_init(&iter->mutex);
        filp->private_data = iter;

        if (iter->trace->pipe_open)
                iter->trace->pipe_open(iter);

        nonseekable_open(inode, filp);

        /* A non-zero trace_ref makes tracing_set_tracer() return -EBUSY */
        tr->trace_ref++;

        mutex_unlock(&trace_types_lock);
        return ret;

        /* Error unwinding: undo the steps above in reverse order */
fail:
        kfree(iter);
fail_alloc_iter:
        close_pipe_on_cpu(tr, cpu);
fail_pipe_on_cpu:
        __trace_array_put(tr);
        mutex_unlock(&trace_types_lock);
        return ret;
}

/*
 * release() handler for a trace pipe file: undoes tracing_open_pipe().
 *
 * Under trace_types_lock it drops tr->trace_ref, lets the tracer run its
 * ->pipe_close() callback and releases the per-CPU pipe claim.  The
 * iterator's resources are then freed and the trace_array reference is put.
 * Always returns 0.
 */
static int tracing_release_pipe(struct inode *inode, struct file *file)
{
        struct trace_iterator *iter = file->private_data;
        struct trace_array *tr = inode->i_private;

        mutex_lock(&trace_types_lock);

        tr->trace_ref--;

        if (iter->trace->pipe_close)
                iter->trace->pipe_close(iter);
        close_pipe_on_cpu(tr, iter->cpu_file);
        mutex_unlock(&trace_types_lock);

        /* Free everything hanging off the iterator, then the iterator itself */
        free_cpumask_var(iter->started);
        kfree(iter->temp);
        mutex_destroy(&iter->mutex);
        kfree(iter);

        trace_array_put(tr);

        return 0;
}

/*
 * Common poll helper for trace files.  Reports the file as readable right
 * away when the iterator has a buffer iterator for its CPU file, or when
 * TRACE_ITER_BLOCK is set in the trace flags; otherwise defers to the ring
 * buffer's poll support using the trace_array's buffer_percent.
 */
static __poll_t
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
        struct trace_array *tr = iter->tr;

        /*
         * Iterators are static, they should be filled or empty; and in
         * blocking mode the file is always selected as readable.
         */
        if (trace_buffer_iter(iter, iter->cpu_file) ||
            (tr->trace_flags & TRACE_ITER_BLOCK))
                return EPOLLIN | EPOLLRDNORM;

        return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
                                     filp, poll_table, tr->buffer_percent);
}

/* poll() handler for trace pipe files: the iterator is in private_data. */
static __poll_t
tracing_poll_pipe(struct file *filp, poll_table *poll_table)
{
        return trace_poll(filp->private_data, filp, poll_table);
}

/*
 * Wait until the pipe iterator in filp->private_data has data to read.
 *
 * Must be called with iter->mutex held.  The mutex is dropped around the
 * actual wait and re-taken before returning.
 *
 * Returns 1 when the caller should go on and read (the buffer may still be
 * empty if tracing was turned off after something had been read), -EAGAIN
 * for an empty buffer on an O_NONBLOCK file, or the non-zero value returned
 * by wait_on_pipe().
 */
static int tracing_wait_pipe(struct file *filp)
{
        struct trace_iterator *iter = filp->private_data;
        int ret;

        while (trace_empty(iter)) {

                if ((filp->f_flags & O_NONBLOCK)) {
                        return -EAGAIN;
                }

                /*
                 * We block until we read something and tracing is disabled.
                 * We still block if tracing is disabled, but we have never
                 * read anything. This allows a user to cat this file, and
                 * then enable tracing. But after we have read something,
                 * we give an EOF when tracing is again disabled.
                 *
                 * iter->pos will be 0 if we haven't read anything.
                 */
                if (!tracer_tracing_is_on(iter->tr) && iter->pos)
                        break;

                /* Do not hold iter->mutex while sleeping */
                mutex_unlock(&iter->mutex);

                ret = wait_on_pipe(iter, 0);

                mutex_lock(&iter->mutex);

                if (ret)
                        return ret;
        }

        return 1;
}

/*
 * Consumer reader.
 *
 * read() handler for trace pipe files.  Leftover formatted output from a
 * previous call is returned first.  Otherwise the tracer's own ->read()
 * callback (if any) gets a chance, and failing that the function waits for
 * entries, formats as many as fit into iter->seq, consumes them from the
 * ring buffer and copies the text to user space.
 *
 * Returns the number of bytes copied, 0 when there is nothing more to read
 * after the wait, or a negative error.
 */
static ssize_t
tracing_read_pipe(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
{
        struct trace_iterator *iter = filp->private_data;
        ssize_t sret;

        /*
         * Avoid more than one consumer on a single file descriptor
         * This is just a matter of traces coherency, the ring buffer itself
         * is protected.
         */
        mutex_lock(&iter->mutex);

        /* return any leftover data */
        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
        if (sret != -EBUSY)
                goto out;

        trace_seq_init(&iter->seq);

        /* Let the tracer supply the data itself if it wants to */
        if (iter->trace->read) {
                sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
                if (sret)
                        goto out;
        }

waitagain:
        sret = tracing_wait_pipe(filp);
        if (sret <= 0)
                goto out;

        /* stop when tracing is finished */
        if (trace_empty(iter)) {
                sret = 0;
                goto out;
        }

        if (cnt >= PAGE_SIZE)
                cnt = PAGE_SIZE - 1;

        /* reset all but tr, trace, and overruns */
        memset(&iter->seq, 0,
               sizeof(struct trace_iterator) -
               offsetof(struct trace_iterator, seq));
        cpumask_clear(iter->started);
        trace_seq_init(&iter->seq);
        iter->pos = -1;

        trace_event_read_lock();
        trace_access_lock(iter->cpu_file);
        while (trace_find_next_entry_inc(iter) != NULL) {
                enum print_line_t ret;
                int save_len = iter->seq.seq.len;

                ret = print_trace_line(iter);
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        /*
                         * If one print_trace_line() fills the entire trace_seq in
                         * one shot, trace_seq_to_user() will return -EBUSY because
                         * save_len == 0. In this case, we need to consume the event,
                         * otherwise the loop will peek at this event again next
                         * time, resulting in an infinite loop.
                         */
                        if (save_len == 0) {
                                iter->seq.full = 0;
                                trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
                                trace_consume(iter);
                                break;
                        }

                        /* In other cases, don't print partial lines */
                        iter->seq.seq.len = save_len;
                        break;
                }
                if (ret != TRACE_TYPE_NO_CONSUME)
                        trace_consume(iter);

                /* Enough text has been formatted to satisfy the request */
                if (trace_seq_used(&iter->seq) >= cnt)
                        break;

                /*
                 * Setting the full flag means we reached the trace_seq buffer
                 * size and we should leave by partial output condition above.
                 * One of the trace_seq_* functions is not used properly.
                 */
                WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
                          iter->ent->type);
        }
        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();

        /* Now copy what we have to the user */
        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
        /* Everything was handed out: start from an empty seq next time */
        if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
                trace_seq_init(&iter->seq);

        /*
         * If there was nothing to send to user, in spite of consuming trace
         * entries, go back to wait for more entries.
         */
        if (sret == -EBUSY)
                goto waitagain;

out:
        mutex_unlock(&iter->mutex);

        return sret;
}

/*
 * spd_release callback installed by tracing_splice_read_pipe(): hands the
 * page stored in slot @idx of @spd back to the page allocator.
 */
static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
                                     unsigned int idx)
{
        struct page *pg = spd->pages[idx];

        __free_page(pg);
}

/*
 * Format as many whole trace lines as possible into iter->seq.
 *
 * @rem:  number of bytes the caller still wants
 * @iter: iterator whose current entry is printed first
 *
 * Each pass remembers the seq length, prints one line, and rolls the seq
 * back to the remembered length if that line cannot be kept (overflow,
 * partial line, or more bytes than @rem allows).
 *
 * Returns the number of bytes still wanted after the lines kept; 0 when the
 * request was exhausted or no further entry is available.
 */
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
        size_t count;
        int save_len;
        int ret;

        /* Seq buffer is page-sized, exactly what we need. */
        for (;;) {
                /* Rollback point in case this line does not fit. */
                save_len = iter->seq.seq.len;
                ret = print_trace_line(iter);

                /* Line did not fit: drop it and stop, leaving rem untouched. */
                if (trace_seq_has_overflowed(&iter->seq)) {
                        iter->seq.seq.len = save_len;
                        break;
                }

                /*
                 * This should not be hit, because it should only
                 * be set if the iter->seq overflowed. But check it
                 * anyway to be safe.
                 */
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        iter->seq.seq.len = save_len;
                        break;
                }

                /* Bytes added by this line alone. */
                count = trace_seq_used(&iter->seq) - save_len;
                if (rem < count) {
                        /* Line exceeds what the caller asked for: drop it and finish. */
                        rem = 0;
                        iter->seq.seq.len = save_len;
                        break;
                }

                /* The line is kept; consume the entry unless the printer said not to. */
                if (ret != TRACE_TYPE_NO_CONSUME)
                        trace_consume(iter);
                rem -= count;
                /* No further entry: report the request as finished and clear iter->ent. */
                if (!trace_find_next_entry_inc(iter))        {
                        rem = 0;
                        iter->ent = NULL;
                        break;
                }
        }

        return rem;
}

/*
 * splice_read handler: formats trace entries into freshly allocated pages
 * and hands those pages to @pipe.
 *
 * @filp:  file whose private_data is the trace_iterator
 * @ppos:  only forwarded to the tracer's own splice_read hook
 * @pipe:  destination pipe
 * @len:   number of bytes requested
 * @flags: only forwarded to the tracer's own splice_read hook
 *
 * Returns the result of splice_to_pipe() when at least one page was filled,
 * 0 when none was, -ENOMEM if the splice descriptor could not be grown,
 * -EFAULT if no entry could be found, or whatever non-zero/non-positive
 * value the tracer hook or tracing_wait_pipe() produced.
 */
static ssize_t tracing_splice_read_pipe(struct file *filp,
                                        loff_t *ppos,
                                        struct pipe_inode_info *pipe,
                                        size_t len,
                                        unsigned int flags)
{
        struct page *pages_def[PIPE_DEF_BUFFERS];
        struct partial_page partial_def[PIPE_DEF_BUFFERS];
        struct trace_iterator *iter = filp->private_data;
        struct splice_pipe_desc spd = {
                .pages                = pages_def,
                .partial        = partial_def,
                .nr_pages        = 0, /* This gets updated below. */
                .nr_pages_max        = PIPE_DEF_BUFFERS,
                .ops                = &default_pipe_buf_ops,
                .spd_release        = tracing_spd_release_pipe,
        };
        ssize_t ret;
        size_t rem;
        unsigned int i;

        if (splice_grow_spd(pipe, &spd))
                return -ENOMEM;

        mutex_lock(&iter->mutex);

        /* A tracer may supply its own hook; any non-zero result ends the call. */
        if (iter->trace->splice_read) {
                ret = iter->trace->splice_read(iter, filp,
                                               ppos, pipe, len, flags);
                if (ret)
                        goto out_err;
        }

        ret = tracing_wait_pipe(filp);
        if (ret <= 0)
                goto out_err;

        /* Need a current entry to start from; fetch one if the iterator has none. */
        if (!iter->ent && !trace_find_next_entry_inc(iter)) {
                ret = -EFAULT;
                goto out_err;
        }

        trace_event_read_lock();
        trace_access_lock(iter->cpu_file);

        /* Fill as many pages as possible. */
        for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
                spd.pages[i] = alloc_page(GFP_KERNEL);
                if (!spd.pages[i])
                        break;

                rem = tracing_fill_pipe_page(rem, iter);

                /* Copy the data into the page, so we can start over. */
                ret = trace_seq_to_buffer(&iter->seq,
                                          page_address(spd.pages[i]),
                                          min((size_t)trace_seq_used(&iter->seq),
                                                  (size_t)PAGE_SIZE));
                if (ret < 0) {
                        /*
                         * Page i is not handed out; i is left unchanged so
                         * only the pages filled before it are counted below.
                         */
                        __free_page(spd.pages[i]);
                        break;
                }
                spd.partial[i].offset = 0;
                spd.partial[i].len = ret;

                trace_seq_init(&iter->seq);
        }

        /* Released in the reverse order of acquisition, before touching the pipe. */
        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        mutex_unlock(&iter->mutex);

        spd.nr_pages = i;

        if (i)
                ret = splice_to_pipe(pipe, &spd);
        else
                ret = 0;
out:
        splice_shrink_spd(&spd);
        return ret;

out_err:
        /* Error paths taken while only iter->mutex is held. */
        mutex_unlock(&iter->mutex);
        goto out;
}

/*
 * Read handler that reports ring buffer sizes, shifted right by 10 (KB).
 *
 * For a single CPU the output is that CPU's size.  For the all-CPUs file:
 *   "<size>\n"                       every CPU has the same size
 *   "<size> (expanded: <size>)\n"    same, but ring_buffer_expanded is unset
 *   "X\n"                            the per-CPU sizes differ
 *
 * Returns what simple_read_from_buffer() returns for the formatted text.
 */
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        int cpu = tracing_get_cpu(inode);
        char buf[64];
        int r = 0;

        mutex_lock(&trace_types_lock);

        if (cpu != RING_BUFFER_ALL_CPUS) {
                r = sprintf(buf, "%lu\n",
                            per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10);
        } else {
                unsigned long size = 0;
                int uniform = 1;
                int i;

                /* Compare every CPU against the first non-zero size seen. */
                for_each_tracing_cpu(i) {
                        if (size == 0)
                                size = per_cpu_ptr(tr->array_buffer.data, i)->entries;
                        if (size != per_cpu_ptr(tr->array_buffer.data, i)->entries) {
                                uniform = 0;
                                break;
                        }
                }

                if (!uniform)
                        r = sprintf(buf, "X\n");
                else if (ring_buffer_expanded)
                        r = sprintf(buf, "%lu\n", size >> 10);
                else
                        r = sprintf(buf, "%lu (expanded: %lu)\n",
                                    size >> 10,
                                    trace_buf_size >> 10);
        }

        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

/*
 * Write handler that resizes the ring buffer.
 *
 * @filp: file whose inode carries the trace_array and the CPU selector
 * @ubuf: user buffer holding a decimal size in KB
 * @cnt:  number of bytes in @ubuf
 * @ppos: file position, advanced by @cnt on success
 *
 * Returns @cnt on success, the kstrtoul_from_user() error for unparsable
 * input, -EINVAL for a size of zero or one too large to express in bytes,
 * or the negative value returned by tracing_resize_ring_buffer().
 */
static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
                      size_t cnt, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        unsigned long val;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* must have at least 1 entry */
        if (!val)
                return -EINVAL;

        /*
         * value is in KB.  Refuse anything whose byte count does not fit in
         * an unsigned long: the shift below would silently drop the high
         * bits and could even produce 0, defeating the check above.
         */
        if (val > (~0UL >> 10))
                return -EINVAL;

        val <<= 10;
        ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
        if (ret < 0)
                return ret;

        *ppos += cnt;

        return cnt;
}

/*
 * Read handler that reports the sum over all tracing CPUs of each CPU's
 * buffer size shifted right by 10 (KB).  While ring_buffer_expanded is
 * unset, a second figure is appended: trace_buf_size >> 10 added up once
 * per CPU.
 */
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
                                size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long total_kb = 0, expanded_kb = 0;
        char buf[64];
        int len, cpu;

        mutex_lock(&trace_types_lock);

        for_each_tracing_cpu(cpu) {
                total_kb += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
                if (!ring_buffer_expanded)
                        expanded_kb += trace_buf_size >> 10;
        }

        if (!ring_buffer_expanded)
                len = sprintf(buf, "%lu (expanded: %lu)\n", total_kb, expanded_kb);
        else
                len = sprintf(buf, "%lu\n", total_kb);

        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler that accepts and discards everything.  The user data is
 * never read; the handler exists only so that writing with "echo" does not
 * report an error.  Advances @ppos and returns @cnt.
 */
static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
                          size_t cnt, loff_t *ppos)
{
        *ppos = *ppos + cnt;
        return cnt;
}

/*
 * Release handler: optionally turns tracing off (when the
 * TRACE_ITER_STOP_ON_FREE flag is set), asks for the ring buffer of every
 * CPU to be resized to 0, and calls trace_array_put() on the trace_array.
 * The result of the resize is not checked.  Always returns 0.
 */
static int
tracing_free_buffer_release(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;

        if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) {
                /* stop tracing before the buffer goes away */
                tracer_tracing_off(tr);
        }

        /* shrink the ring buffer to 0 on all CPUs */
        tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);

        trace_array_put(tr);

        return 0;
}

/*
 * Write handler that records user text as a TRACE_PRINT event.
 *
 * @filp: file whose private_data is the trace_array
 * @ubuf: user buffer holding the text
 * @cnt:  number of bytes in @ubuf; clamped to TRACE_BUF_SIZE
 * @fpos: file position, advanced by the number of bytes accepted
 *
 * The text is copied straight into the reserved ring buffer event.  If the
 * copy faults, the event carries "<faulted>" instead and -EFAULT is
 * returned.  The stored string is NUL terminated and gets a trailing '\n'
 * when the text does not already end with one.
 *
 * Returns the (possibly clamped) byte count, -EINVAL when tracing or
 * markers are disabled, -EBADF when no event could be reserved, or -EFAULT.
 */
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
{
        struct trace_array *tr = filp->private_data;
        struct ring_buffer_event *event;
        enum event_trigger_type tt = ETT_NONE;
        struct trace_buffer *buffer;
        struct print_entry *entry;
        unsigned long irq_flags;
        ssize_t written;
        int size;
        int len;

/* Used in tracing_mark_raw_write() as well */
#define FAULTED_STR "<faulted>"
#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */

        if (tracing_disabled)
                return -EINVAL;

        if (!(tr->trace_flags & TRACE_ITER_MARKERS))
                return -EINVAL;

        if (cnt > TRACE_BUF_SIZE)
                cnt = TRACE_BUF_SIZE;

        BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);

        local_save_flags(irq_flags);
        size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */

        /* If less than "<faulted>", then make sure we can still add that */
        if (cnt < FAULTED_SIZE)
                size += FAULTED_SIZE - cnt;

        buffer = tr->array_buffer.buffer;
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                            irq_flags, preempt_count());
        if (unlikely(!event))
                /* Ring buffer disabled, return as if not open for write */
                return -EBADF;

        entry = ring_buffer_event_data(event);
        entry->ip = _THIS_IP_;

        len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
        if (len) {
                /* Record a placeholder; the reservation was sized to hold it. */
                memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
                cnt = FAULTED_SIZE;
                written = -EFAULT;
        } else
                written = cnt;

        if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
                /* do not add \n before testing triggers, but add \0 */
                entry->buf[cnt] = '\0';
                tt = event_triggers_call(tr->trace_marker_file, entry, event);
        }

        /*
         * Terminate the string, adding '\n' unless the text already ends
         * with one.  An empty write has no last byte to look at:
         * buf[cnt - 1] would index before the start of buf, so treat it
         * as "no trailing newline".
         */
        if (!cnt || entry->buf[cnt - 1] != '\n') {
                entry->buf[cnt] = '\n';
                entry->buf[cnt + 1] = '\0';
        } else
                entry->buf[cnt] = '\0';

        if (static_branch_unlikely(&trace_marker_exports_enabled))
                ftrace_exports(event, TRACE_EXPORT_MARKER);
        __buffer_unlock_commit(buffer, event);

        /* Post-call triggers run only after the event has been committed. */
        if (tt)
                event_triggers_post_call(tr->trace_marker_file, tt);

        if (written > 0)
                *fpos += written;

        return written;
}

/* Limit it for now to 3K (including tag) */
#define RAW_DATA_MAX_SIZE (1024*3)

/*
 * Write handler that records user bytes as a TRACE_RAW_DATA event.
 *
 * @filp: file whose private_data is the trace_array
 * @ubuf: user buffer; copied starting at entry->id, so its leading bytes
 *        supply the tag id
 * @cnt:  number of bytes in @ubuf
 * @fpos: file position, advanced by the number of bytes accepted
 *
 * Returns the number of bytes recorded (clamped to TRACE_BUF_SIZE),
 * -EINVAL when tracing or markers are disabled or @cnt is smaller than an
 * unsigned int or larger than RAW_DATA_MAX_SIZE, -EBADF when no event could
 * be reserved, or -EFAULT when the user copy failed.
 */
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
{
        struct trace_array *tr = filp->private_data;
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct raw_data_entry *entry;
        unsigned long irq_flags;
        ssize_t written;
        int size;
        int len;

/* Bytes needed to store the fault marker: the "<faulted>" text plus an int id. */
#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))

        if (tracing_disabled)
                return -EINVAL;

        if (!(tr->trace_flags & TRACE_ITER_MARKERS))
                return -EINVAL;

        /* The marker must at least have a tag id */
        if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
                return -EINVAL;

        /* Accepted lengths above TRACE_BUF_SIZE are truncated, not rejected. */
        if (cnt > TRACE_BUF_SIZE)
                cnt = TRACE_BUF_SIZE;

        BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);

        local_save_flags(irq_flags);
        size = sizeof(*entry) + cnt;
        /* Pad short writes so the fault marker below always fits. */
        if (cnt < FAULT_SIZE_ID)
                size += FAULT_SIZE_ID - cnt;

        buffer = tr->array_buffer.buffer;
        event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
                                            irq_flags, preempt_count());
        if (!event)
                /* Ring buffer disabled, return as if not open for write */
                return -EBADF;

        entry = ring_buffer_event_data(event);

        len = copy_from_user_nofault(&entry->id, ubuf, cnt);
        if (len) {
                /* Copy failed: store id -1 followed by "<faulted>" (no NUL is written). */
                entry->id = -1;
                memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
                written = -EFAULT;
        } else
                written = cnt;

        /* The event is committed on both the success and the fault path. */
        __buffer_unlock_commit(buffer, event);

        if (written > 0)
                *fpos += written;

        return written;
}

/*
 * seq_file show callback: prints every name in trace_clocks[] on one line,
 * separated by single spaces, with the entry selected by tr->clock_id
 * wrapped in square brackets.  Always returns 0.
 */
static int tracing_clock_show(struct seq_file *m, void *v)
{
        struct trace_array *tr = m->private;
        int i;

        for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
                int selected = (i == tr->clock_id);
                const char *sep = i ? " " : "";
                const char *lbracket = selected ? "[" : "";
                const char *rbracket = selected ? "]" : "";

                seq_printf(m, "%s%s%s%s", sep, lbracket,
                           trace_clocks[i].name, rbracket);
        }
        seq_putc(m, '\n');

        return 0;
}

/*
 * Select the trace clock called @clockstr for @tr.
 *
 * The name must match an entry of trace_clocks[] exactly.  On a match the
 * clock is installed on the main buffer (and on the max buffer when
 * CONFIG_TRACER_MAX_TRACE is set and that buffer exists), and the buffers
 * are reset.
 *
 * Returns 0 on success or -EINVAL when no clock has that name.
 */
int tracing_set_clock(struct trace_array *tr, const char *clockstr)
{
        int i = 0;

        while (i < ARRAY_SIZE(trace_clocks) &&
               strcmp(trace_clocks[i].name, clockstr) != 0)
                i++;

        if (i == ARRAY_SIZE(trace_clocks))
                return -EINVAL;

        mutex_lock(&trace_types_lock);

        tr->clock_id = i;
        ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func);

        /*
         * Timestamps taken with the old clock cannot be compared with ones
         * taken with the new clock, so throw the old contents away.
         */
        tracing_reset_online_cpus(&tr->array_buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (tr->max_buffer.buffer) {
                ring_buffer_set_clock(tr->max_buffer.buffer,
                                      trace_clocks[i].func);
        }
        /* The reset is requested whether or not the max buffer exists. */
        tracing_reset_online_cpus(&tr->max_buffer);
#endif

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Write handler: takes a clock name from user space, strips surrounding
 * whitespace and passes it to tracing_set_clock().
 *
 * Returns @cnt on success, -EINVAL when the input does not fit in the
 * 64-byte local buffer (terminator included), -EFAULT when the user copy
 * fails, or the error from tracing_set_clock().
 */
static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
                                   size_t cnt, loff_t *fpos)
{
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        char buf[64];
        int ret;

        /* one byte must stay free for the terminator */
        if (cnt >= sizeof(buf))
                return -EINVAL;

        if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = '\0';

        ret = tracing_set_clock(tr, strstrip(buf));
        if (ret)
                return ret;

        *fpos += cnt;

        return cnt;
}

/*
 * Open handler: after tracing_check_open_get_tr() succeeds, sets up a
 * single_open() seq_file that shows the clocks of the inode's trace_array.
 * If single_open() fails, trace_array_put() is called on the trace_array
 * (presumably dropping what the check acquired) and the error is returned.
 */
static int tracing_clock_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int ret = tracing_check_open_get_tr(tr);

        if (ret)
                return ret;

        ret = single_open(file, tracing_clock_show, inode->i_private);
        if (ret >= 0)
                return ret;

        trace_array_put(tr);
        return ret;
}

/*
 * seq_file show callback: prints both timestamp modes and brackets the
 * active one, as reported by ring_buffer_time_stamp_abs() for the main
 * buffer.  Always returns 0.
 */
static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
{
        struct trace_array *tr = m->private;
        const char *line;

        mutex_lock(&trace_types_lock);

        line = ring_buffer_time_stamp_abs(tr->array_buffer.buffer) ?
                "delta [absolute]\n" : "[delta] absolute\n";
        seq_puts(m, line);

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Open handler: after tracing_check_open_get_tr() succeeds, sets up a
 * single_open() seq_file that shows the timestamp mode of the inode's
 * trace_array.  If single_open() fails, trace_array_put() is called on the
 * trace_array (presumably dropping what the check acquired) and the error
 * is returned.
 */
static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int ret = tracing_check_open_get_tr(tr);

        if (ret)
                return ret;

        ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
        if (ret >= 0)
                return ret;

        trace_array_put(tr);
        return ret;
}

/*
 * Request (@abs true) or release (@abs false) absolute timestamps on @tr.
 *
 * Requests are counted in tr->time_stamp_abs_ref.  The ring buffers are
 * only reprogrammed on the first request and on the release that brings
 * the count back to zero.
 *
 * Returns 0, or -EINVAL (with a one-time warning) when a release arrives
 * while the count is already zero.
 */
int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
{
        int ret = 0;

        mutex_lock(&trace_types_lock);

        if (abs) {
                /* Someone already switched the buffers over: just count. */
                if (tr->time_stamp_abs_ref++)
                        goto out;
        } else {
                if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
                        ret = -EINVAL;
                        goto out;
                }

                /* Other requesters remain: leave the mode alone. */
                if (--tr->time_stamp_abs_ref)
                        goto out;
        }

        ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (tr->max_buffer.buffer)
                ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
#endif
 out:
        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * Per-open state stored in file->private_data by tracing_buffers_open();
 * snapshot_raw_open() reads it back from there and adjusts the embedded
 * iterator.
 */
struct ftrace_buffer_info {
        /* iterator describing which trace_array/buffer this open file reads */
        struct trace_iterator        iter;
        /* NOTE(review): presumably a spare ring-buffer page used while reading; confirm in tracing_buffers_read() */
        void                        *spare;
        /* NOTE(review): presumably the CPU that @spare was obtained for; confirm */
        unsigned int                spare_cpu;
        /* NOTE(review): presumably the read offset within @spare; confirm */
        unsigned int                read;
};

#ifdef CONFIG_TRACER_SNAPSHOT
/*
 * Open handler for the snapshot file.
 *
 * Readers go through __tracing_open() with its last argument set to true.
 * Write-only opens get a zeroed seq_file whose ->private points at a zeroed
 * trace_iterator bound to the trace_array's max_buffer, because the write
 * handler reaches its state through that seq_file.
 *
 * Returns 0 on success or a negative errno; on failure trace_array_put()
 * is called on the trace_array.
 */
static int tracing_snapshot_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        struct seq_file *m;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        if (file->f_mode & FMODE_READ) {
                iter = __tracing_open(inode, file, true);
                if (IS_ERR(iter))
                        ret = PTR_ERR(iter);
                goto out;
        }

        /* Writes still need the seq_file to hold the private data */
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (!m) {
                ret = -ENOMEM;
                goto out;
        }

        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
                kfree(m);
                ret = -ENOMEM;
                goto out;
        }

        iter->tr = tr;
        iter->array_buffer = &tr->max_buffer;
        iter->cpu_file = tracing_get_cpu(inode);
        m->private = iter;
        file->private_data = m;
out:
        if (ret < 0)
                trace_array_put(tr);

        return ret;
}

/*
 * Cross-CPU callback used by tracing_snapshot_write(): calls
 * update_max_tr_single() for the CPU it is running on.  @tr is the
 * trace_array, passed as an untyped pointer.
 */
static void tracing_swap_cpu_buffer(void *tr)
{
        struct trace_array *array = (struct trace_array *)tr;

        update_max_tr_single(array, current, smp_processor_id());
}

/*
 * Write handler for the snapshot file.  The decimal value written selects
 * the action:
 *   0      free the snapshot buffer if one is allocated (all-CPUs file only)
 *   1      allocate the snapshot buffer, or resize an existing one to match
 *          the main buffer, then swap it with the live buffer
 *   other  clear the snapshot buffer contents if one is allocated
 *
 * Returns @cnt on success; -EBUSY when the current tracer uses the max
 * buffer or a conditional snapshot is set; -EINVAL for a per-CPU request
 * that is not allowed; or the error from parsing, tracing_update_buffers(),
 * or the allocate/resize step.
 */
static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
                       loff_t *ppos)
{
        struct seq_file *m = filp->private_data;
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        unsigned long val;
        int ret;

        ret = tracing_update_buffers();
        if (ret < 0)
                return ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        mutex_lock(&trace_types_lock);

        if (tr->current_trace->use_max_tr) {
                ret = -EBUSY;
                goto out;
        }

        /* tr->cond_snapshot is sampled under max_lock with interrupts off. */
        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        if (tr->cond_snapshot)
                ret = -EBUSY;
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();
        /* ret is still 0 from the parse above unless a conditional snapshot was found. */
        if (ret)
                goto out;

        switch (val) {
        case 0:
                /* Freeing is refused through a per-CPU file. */
                if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                        ret = -EINVAL;
                        break;
                }
                if (tr->allocated_snapshot)
                        free_snapshot(tr);
                break;
        case 1:
/* Only allow per-cpu swap if the ring buffer supports it */
#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
                if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                        ret = -EINVAL;
                        break;
                }
#endif
                if (tr->allocated_snapshot)
                        ret = resize_buffer_duplicate_size(&tr->max_buffer,
                                        &tr->array_buffer, iter->cpu_file);
                else
                        ret = tracing_alloc_snapshot_instance(tr);
                if (ret < 0)
                        break;
                /* Now, we're going to swap */
                if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
                        local_irq_disable();
                        update_max_tr(tr, current, smp_processor_id(), NULL);
                        local_irq_enable();
                } else {
                        /* Run the single-CPU swap on the selected CPU (last argument 1: wait). */
                        smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
                                                 (void *)tr, 1);
                }
                break;
        default:
                /* Any other value clears the snapshot, for all CPUs or just the selected one. */
                if (tr->allocated_snapshot) {
                        if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
                                tracing_reset_online_cpus(&tr->max_buffer);
                        else
                                tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
                }
                break;
        }

        if (ret >= 0) {
                *ppos += cnt;
                ret = cnt;
        }
out:
        mutex_unlock(&trace_types_lock);
        return ret;
}

/*
 * Release handler for the snapshot file.  tracing_release() is always
 * called.  For files opened without read access, the stub seq_file and the
 * iterator hanging off it (see tracing_snapshot_open()) are freed here and
 * 0 is returned; otherwise the result of tracing_release() is returned.
 */
static int tracing_snapshot_release(struct inode *inode, struct file *file)
{
        /* Fetched up front, before tracing_release() runs. */
        struct seq_file *stub = file->private_data;
        int ret = tracing_release(inode, file);

        if (!(file->f_mode & FMODE_READ)) {
                /* If write only, the seq_file is just a stub */
                if (stub)
                        kfree(stub->private);
                kfree(stub);
                ret = 0;
        }

        return ret;
}

/*
 * Forward declarations of the raw buffer file handlers, which are defined
 * elsewhere in this file.  snapshot_raw_open() and snapshot_raw_fops below
 * refer to them.
 */
static int tracing_buffers_open(struct inode *inode, struct file *filp);
static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
                                    size_t count, loff_t *ppos);
static int tracing_buffers_release(struct inode *inode, struct file *file);
static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                   struct pipe_inode_info *pipe, size_t len, unsigned int flags);

static int snapshot_raw_open(struct inode *inode, struct file *filp)
{
        struct ftrace_buffer_info *info;
        int ret;

        /* The following checks for tracefs lockdown */
        ret = tracing_buffers_open(inode, filp);
        if (ret < 0)
                return ret;

        info = filp->private_data;

        if (info->iter.trace->use_max_tr) {
                tracing_buffers_release(inode, filp);
                return -EBUSY;
        }

        info->iter.snapshot = true;
        info->iter.array_buffer = &info->iter.tr->max_buffer;

        return ret;
}

#endif /* CONFIG_TRACER_SNAPSHOT */


/* Seekable read/write file backed by tracing_thresh_read()/tracing_thresh_write(). */
static const struct file_operations tracing_thresh_fops = {
        .llseek = generic_file_llseek,
        .open   = tracing_open_generic,
        .read   = tracing_thresh_read,
        .write  = tracing_thresh_write,
};

#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
/* Seekable read/write file backed by tracing_max_lat_read()/tracing_max_lat_write(). */
static const struct file_operations tracing_max_lat_fops = {
        .llseek = generic_file_llseek,
        .open   = tracing_open_generic,
        .read   = tracing_max_lat_read,
        .write  = tracing_max_lat_write,
};
#endif

/* Read/write file backed by tracing_set_trace_read()/tracing_set_trace_write(). */
static const struct file_operations set_tracer_fops = {
        .llseek  = generic_file_llseek,
        .open    = tracing_open_generic_tr,
        .read    = tracing_set_trace_read,
        .release = tracing_release_generic_tr,
        .write   = tracing_set_trace_write,
};

/* Non-seekable, pollable pipe file supporting both read() and splice reads. */
static const struct file_operations tracing_pipe_fops = {
        .llseek      = no_llseek,
        .open        = tracing_open_pipe,
        .poll        = tracing_poll_pipe,
        .read        = tracing_read_pipe,
        .release     = tracing_release_pipe,
        .splice_read = tracing_splice_read_pipe,
};

/* Read/write file backed by tracing_entries_read()/tracing_entries_write(). */
static const struct file_operations tracing_entries_fops = {
        .llseek  = generic_file_llseek,
        .open    = tracing_open_generic_tr,
        .read    = tracing_entries_read,
        .release = tracing_release_generic_tr,
        .write   = tracing_entries_write,
};

/* Read-only file backed by tracing_total_entries_read(); no write handler. */
static const struct file_operations tracing_total_entries_fops = {
        .llseek  = generic_file_llseek,
        .open    = tracing_open_generic_tr,
        .read    = tracing_total_entries_read,
        .release = tracing_release_generic_tr,
};

/*
 * Write-only file: writes are accepted and ignored, the actual work is done
 * by tracing_free_buffer_release() when the file is closed.
 */
static const struct file_operations tracing_free_buffer_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_free_buffer_release,
        .write   = tracing_free_buffer_write,
};

/* Write-only file backed by tracing_mark_write(); no read handler. */
static const struct file_operations tracing_mark_fops = {
        .llseek  = generic_file_llseek,
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .write   = tracing_mark_write,
};

/* Write-only file backed by tracing_mark_raw_write(); no read handler. */
static const struct file_operations tracing_mark_raw_fops = {
        .llseek  = generic_file_llseek,
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .write   = tracing_mark_raw_write,
};

/* seq_file based file: reads list the clocks, writes select one. */
static const struct file_operations trace_clock_fops = {
        .llseek  = seq_lseek,
        .open    = tracing_clock_open,
        .read    = seq_read,
        .release = tracing_single_release_tr,
        .write   = tracing_clock_write,
};

/* Read-only seq_file based file showing the timestamp mode. */
static const struct file_operations trace_time_stamp_mode_fops = {
        .llseek  = seq_lseek,
        .open    = tracing_time_stamp_mode_open,
        .read    = seq_read,
        .release = tracing_single_release_tr,
};

#ifdef CONFIG_TRACER_SNAPSHOT
/* Snapshot file: seq_file reads, control writes via tracing_snapshot_write(). */
static const struct file_operations snapshot_fops = {
        .llseek  = tracing_lseek,
        .open    = tracing_snapshot_open,
        .read    = seq_read,
        .release = tracing_snapshot_release,
        .write   = tracing_snapshot_write,
};

/*
 * Raw snapshot file: non-seekable, read-only; everything except open is
 * shared with the tracing_buffers_* handlers.
 */
static const struct file_operations snapshot_raw_fops = {
        .llseek      = no_llseek,
        .open        = snapshot_raw_open,
        .read        = tracing_buffers_read,
        .release     = tracing_buffers_release,
        .splice_read = tracing_buffers_splice_read,
};

#endif /* CONFIG_TRACER_SNAPSHOT */

#define TRACING_LOG_ERRS_MAX        8
#define TRACING_LOG_LOC_MAX        128

#define CMD_PREFIX "  Command: "

/* Details of one logged tracing error; filled in by tracing_log_err(). */
struct err_info {
        const char        **errs;        /* ptr to loc-specific array of err strings */
        u8                type;        /* index into errs -> specific err string */
        u8                pos;        /* caret position within the command; MAX_FILTER_STR_VAL = 256 */
        u64                ts;        /* local_clock() value taken when the error was logged */
};

/* One entry of a trace_array's error log (linked on tr->err_log). */
struct tracing_log_err {
        struct list_head        list;        /* link in tr->err_log; new entries go at the tail */
        struct err_info                info;
        char                        loc[TRACING_LOG_LOC_MAX]; /* err location */
        char                        cmd[MAX_FILTER_STR_VAL]; /* what caused err */
};

/*
 * Held while an err_log list is modified (tracing_log_err(),
 * clear_tracing_err_log()) and for the whole of a seq_file walk over it.
 */
static DEFINE_MUTEX(tracing_err_log_lock);

/*
 * Obtain an error-log entry for @tr that is not on the list.
 *
 * While fewer than TRACING_LOG_ERRS_MAX entries exist, a zeroed one is
 * allocated and the count is bumped.  After that the entry at the head of
 * tr->err_log (the oldest, since new entries are added at the tail) is
 * unlinked and handed back for reuse, with its old contents intact.
 *
 * Returns the entry, or ERR_PTR(-ENOMEM) when the allocation fails.
 */
static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
{
        struct tracing_log_err *err;

        if (tr->n_err_log_entries >= TRACING_LOG_ERRS_MAX) {
                /* Log is full: recycle the oldest entry. */
                err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
                list_del(&err->list);
                return err;
        }

        err = kzalloc(sizeof(*err), GFP_KERNEL);
        if (!err)
                return ERR_PTR(-ENOMEM);

        tr->n_err_log_entries++;

        return err;
}

/**
 * err_pos - find the position of a string within a command for error careting
 * @cmd: The tracing command that caused the error
 * @str: The string to position the caret at within @cmd
 *
 * Finds the position of the first occurrence of @str within @cmd.  The
 * return value can be passed to tracing_log_err() for caret placement
 * within @cmd.  An empty @cmd triggers a warning.
 *
 * Returns the index within @cmd of the first occurrence of @str, or 0
 * if @str was not found or @cmd is empty.
 */
unsigned int err_pos(char *cmd, const char *str)
{
        char *hit;

        if (WARN_ON(!strlen(cmd)))
                return 0;

        hit = strstr(cmd, str);

        return hit ? hit - cmd : 0;
}

/**
 * tracing_log_err - write an error to the tracing error log
 * @tr: The associated trace array for the error (NULL for top level array)
 * @loc: A string describing where the error occurred
 * @cmd: The tracing command that caused the error
 * @errs: The array of loc-specific static error strings
 * @type: The index into errs[], which produces the specific static err string
 * @pos: The position the caret should be placed in the cmd
 *
 * Writes an error into tracing/error_log of the form:
 *
 * <loc>: error: <text>
 *   Command: <cmd>
 *              ^
 *
 * tracing/error_log is a small log file containing the last
 * TRACING_LOG_ERRS_MAX errors (8).  Memory for errors isn't allocated
 * unless there has been a tracing error, and the error log can be
 * cleared and have its memory freed by writing the empty string in
 * truncation mode to it i.e. echo > tracing/error_log.
 *
 * If no entry can be allocated the error is silently dropped.
 *
 * NOTE: the @errs array along with the @type param are used to
 * produce a static error string - this string is not copied and saved
 * when the error is logged - only a pointer to it is saved.  See
 * existing callers for examples of how static strings are typically
 * defined for use with tracing_log_err().
 */
void tracing_log_err(struct trace_array *tr,
                     const char *loc, const char *cmd,
                     const char **errs, u8 type, u8 pos)
{
        struct tracing_log_err *err;

        if (!tr)
                tr = &global_trace;

        mutex_lock(&tracing_err_log_lock);

        err = get_tracing_log_err(tr);
        if (PTR_ERR(err) == -ENOMEM)
                goto unlock;

        /* Both strings are truncated to their fixed-size buffers if too long. */
        snprintf(err->loc, sizeof(err->loc), "%s: error: ", loc);
        snprintf(err->cmd, sizeof(err->cmd), "\n" CMD_PREFIX "%s\n", cmd);

        err->info.errs = errs;
        err->info.type = type;
        err->info.pos = pos;
        err->info.ts = local_clock();

        /* Newest entries live at the tail of the list. */
        list_add_tail(&err->list, &tr->err_log);
unlock:
        mutex_unlock(&tracing_err_log_lock);
}

static void clear_tracing_err_log(struct trace_array *tr)
{
        struct tracing_log_err *err, *next;

        mutex_lock(&tracing_err_log_lock);
        list_for_each_entry_safe(err, next, &tr->err_log, list) {
                list_del(&err->list);
                kfree(err);
        }

        tr->n_err_log_entries = 0;
        mutex_unlock(&tracing_err_log_lock);
}

/*
 * seq_file start callback: takes tracing_err_log_lock (dropped again in
 * tracing_err_log_seq_stop()) and positions the walk at *@pos within the
 * trace_array's err_log list.
 */
static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct list_head *log = &tr->err_log;

        mutex_lock(&tracing_err_log_lock);

        return seq_list_start(log, *pos);
}

/* seq_file next callback: steps from @v to the following err_log entry. */
static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct list_head *log = &tr->err_log;

        return seq_list_next(v, log, pos);
}

/* seq_file stop callback: drops the lock taken in tracing_err_log_seq_start(). */
static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
{
        mutex_unlock(&tracing_err_log_lock);
}

/*
 * Emit a line holding a '^' marker, indented by the width of CMD_PREFIX
 * plus @pos columns.
 */
static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
{
        u8 col;

        /* Padding matching the command prefix (sizeof counts the NUL) */
        for (col = 0; col < sizeof(CMD_PREFIX) - 1; col++)
                seq_putc(m, ' ');

        /* Then @pos more columns of padding */
        for (; pos; pos--)
                seq_putc(m, ' ');

        seq_puts(m, "^\n");
}

/*
 * seq_file ->show: print one error log entry as a timestamped
 * "<loc><error text>" line, the stored command text, and a caret line
 * pointing at the recorded position. Always returns 0.
 */
static int tracing_err_log_seq_show(struct seq_file *m, void *v)
{
        struct tracing_log_err *err = v;
        const char *err_text;
        u64 sec;
        u32 nsec;

        if (!err)
                return 0;

        err_text = err->info.errs[err->info.type];

        /* Split the timestamp: sec ends up as the quotient, nsec the remainder */
        sec = err->info.ts;
        nsec = do_div(sec, NSEC_PER_SEC);

        seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000,
                   err->loc, err_text);
        seq_printf(m, "%s", err->cmd);
        tracing_err_log_show_pos(m, err->info.pos);

        return 0;
}

/* seq_file iteration callbacks for a trace array's error log */
static const struct seq_operations tracing_err_log_seq_ops = {
        .start = tracing_err_log_seq_start,
        .stop  = tracing_err_log_seq_stop,
        .next  = tracing_err_log_seq_next,
        .show  = tracing_err_log_seq_show,
};

/*
 * Open the error log file. A writable open with O_TRUNC clears the log;
 * a readable open additionally sets up seq_file iteration over it.
 *
 * Returns 0 on success, or the error from tracing_check_open_get_tr()
 * or seq_open().
 */
static int tracing_err_log_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct seq_file *m;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        /* If this file was opened for write, then erase contents */
        if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
                clear_tracing_err_log(tr);

        /* Nothing more to set up unless the file will be read */
        if (!(file->f_mode & FMODE_READ))
                return 0;

        ret = seq_open(file, &tracing_err_log_seq_ops);
        if (ret) {
                /* Give back the trace array reference (presumably taken above) */
                trace_array_put(tr);
                return ret;
        }

        m = file->private_data;
        m->private = tr;

        return 0;
}

/*
 * Write handler for the error log file: the data in @buffer is never
 * looked at; all @count bytes are simply reported as consumed.
 */
static ssize_t tracing_err_log_write(struct file *file,
                                     const char __user *buffer,
                                     size_t count, loff_t *ppos)
{
        return count;
}

static int tracing_err_log_release(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;

        trace_array_put(tr);

        if (file->f_mode & FMODE_READ)
                seq_release(inode, file);

        return 0;
}

/* File operations for the tracing error log file */
static const struct file_operations tracing_err_log_fops = {
        .open    = tracing_err_log_open,
        .release = tracing_err_log_release,
        .read    = seq_read,
        .write   = tracing_err_log_write,
        .llseek  = tracing_lseek,
};

/*
 * Open handler for tracing_buffers_fops.
 *
 * After tracing_check_open_get_tr() succeeds, allocates the per-open
 * ftrace_buffer_info, fills in its iterator, stores it in
 * @filp->private_data and bumps tr->trace_ref.
 *
 * Returns 0 on success, the error from tracing_check_open_get_tr(),
 * -ENOMEM when the info structure cannot be allocated, or the error
 * from nonseekable_open(). On every failure the trace array is put
 * again and nothing stays allocated.
 */
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        struct ftrace_buffer_info *info;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        info = kvzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                trace_array_put(tr);
                return -ENOMEM;
        }

        mutex_lock(&trace_types_lock);

        info->iter.tr                = tr;
        info->iter.cpu_file        = tracing_get_cpu(inode);
        info->iter.trace        = tr->current_trace;
        info->iter.array_buffer = &tr->array_buffer;
        info->spare                = NULL;
        /* Force reading ring buffer for first read */
        info->read                = (unsigned int)-1;

        filp->private_data = info;

        tr->trace_ref++;

        mutex_unlock(&trace_types_lock);

        ret = nonseekable_open(inode, filp);
        if (ret < 0) {
                /*
                 * ->release() is not called for an open that failed, so
                 * unwind here instead of leaking @info and leaving
                 * tr->trace_ref elevated.
                 */
                mutex_lock(&trace_types_lock);
                tr->trace_ref--;
                mutex_unlock(&trace_types_lock);

                filp->private_data = NULL;
                kvfree(info);
                trace_array_put(tr);
        }

        return ret;
}

/* Poll handler: defer to trace_poll() on the iterator embedded in the per-open info */
static __poll_t
tracing_buffers_poll(struct file *filp, poll_table *poll_table)
{
        struct ftrace_buffer_info *info = filp->private_data;

        return trace_poll(&info->iter, filp, poll_table);
}

/*
 * Read handler for tracing_buffers_fops.
 *
 * Data is staged through a single "spare" page kept in the per-open
 * info: a page is pulled from the ring buffer with
 * ring_buffer_read_page() and then handed out to user space across one
 * or more read() calls, with info->read tracking how far into the page
 * the reader has got.
 *
 * Returns the number of bytes copied, 0 for a zero-length read or when
 * the page read fails although the buffer is not empty, -EBUSY while a
 * snapshot iterator is used with a tracer that has use_max_tr set
 * (CONFIG_TRACER_MAX_TRACE only), -EAGAIN for a non-blocking read of an
 * empty buffer, -EFAULT if nothing could be copied, or the error from
 * the spare page allocation or wait_on_pipe().
 */
static ssize_t
tracing_buffers_read(struct file *filp, char __user *ubuf,
                     size_t count, loff_t *ppos)
{
        struct ftrace_buffer_info *info = filp->private_data;
        struct trace_iterator *iter = &info->iter;
        ssize_t ret = 0;
        ssize_t size;

        if (!count)
                return 0;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (iter->snapshot && iter->tr->current_trace->use_max_tr)
                return -EBUSY;
#endif

        /* Allocate the staging page lazily, on the first read */
        if (!info->spare) {
                info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
                                                          iter->cpu_file);
                if (IS_ERR(info->spare)) {
                        ret = PTR_ERR(info->spare);
                        info->spare = NULL;
                } else {
                        /* Remember the cpu; release passes it back when freeing */
                        info->spare_cpu = iter->cpu_file;
                }
        }
        /* Allocation failed: ret holds the error recorded above */
        if (!info->spare)
                return ret;

        /* Do we have previous read data to read? */
        if (info->read < PAGE_SIZE)
                goto read;

 again:
        trace_access_lock(iter->cpu_file);
        ret = ring_buffer_read_page(iter->array_buffer->buffer,
                                    &info->spare,
                                    count,
                                    iter->cpu_file, 0);
        trace_access_unlock(iter->cpu_file);

        if (ret < 0) {
                /* Nothing there yet: fail fast or sleep, depending on O_NONBLOCK */
                if (trace_empty(iter)) {
                        if ((filp->f_flags & O_NONBLOCK))
                                return -EAGAIN;

                        ret = wait_on_pipe(iter, 0);
                        if (ret)
                                return ret;

                        goto again;
                }
                return 0;
        }

        /* A fresh page was read; start handing it out from the beginning */
        info->read = 0;
 read:
        /* Hand out at most what is left of the page */
        size = PAGE_SIZE - info->read;
        if (size > count)
                size = count;

        /* ret is the number of bytes that could NOT be copied */
        ret = copy_to_user(ubuf, info->spare + info->read, size);
        if (ret == size)
                return -EFAULT;

        /* Account only for what actually reached user space */
        size -= ret;

        *ppos += size;
        info->read += size;

        return size;
}

static int tracing_buffers_release(struct inode *inode, struct file *file)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;

        mutex_lock(&trace_types_lock);

        iter->tr->trace_ref--;

        __trace_array_put(iter->tr);

        if (info->spare)
                ring_buffer_free_read_page(iter->array_buffer->buffer,
                                           info->spare_cpu, info->spare);
        kvfree(info);

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Reference-counted handle on one ring buffer read page that has been
 * handed to a pipe by tracing_buffers_splice_read().
 */
struct buffer_ref {
        /* Ring buffer the page was allocated from and is freed back to */
        struct trace_buffer        *buffer;
        /* Page obtained from ring_buffer_alloc_read_page() */
        void                        *page;
        /* CPU passed at allocation; passed again when freeing the page */
        int                        cpu;
        /* Page and handle are freed once this drops to zero */
        refcount_t                refcount;
};

/*
 * Drop one reference on @ref. The last reference returns the page to
 * the ring buffer and frees the handle itself.
 */
static void buffer_ref_release(struct buffer_ref *ref)
{
        if (refcount_dec_and_test(&ref->refcount)) {
                ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
                kfree(ref);
        }
}

/* Pipe buffer ->release: drop the buffer_ref stashed in @buf->private and clear it */
static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        buffer_ref_release((struct buffer_ref *)buf->private);
        buf->private = 0;
}

/*
 * Pipe buffer ->get: take another reference on the buffer_ref behind
 * @buf. Refuses (returns false) once the count exceeds INT_MAX/2.
 */
static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
                                struct pipe_buffer *buf)
{
        struct buffer_ref *ref = (struct buffer_ref *)buf->private;
        bool room = refcount_read(&ref->refcount) <= INT_MAX/2;

        if (room)
                refcount_inc(&ref->refcount);

        return room;
}

/* Pipe buffer callbacks for pages spliced out of the ring buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
        .get     = buffer_pipe_buf_get,
        .release = buffer_pipe_buf_release,
};

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        struct buffer_ref *ref =
                (struct buffer_ref *)spd->partial[i].private;

        buffer_ref_release(ref);
        spd->partial[i].private = 0;
}

/*
 * splice_read handler for tracing_buffers_fops.
 *
 * Moves whole pages from the ring buffer of iter->cpu_file into @pipe.
 * Each page is wrapped in a refcounted struct buffer_ref, which the pipe
 * buffer callbacks (buffer_pipe_buf_ops) and buffer_spd_release() drop
 * when the page is no longer needed.
 *
 * @*ppos must be page aligned and @len at least PAGE_SIZE; @len is
 * rounded down to a multiple of PAGE_SIZE.
 *
 * Returns the result of splice_to_pipe(), or -EINVAL for a bad
 * @*ppos/@len, -ENOMEM, -EBUSY (snapshot iterator with a use_max_tr
 * tracer, CONFIG_TRACER_MAX_TRACE only), -EAGAIN when nothing is
 * available and the caller asked not to block, or the error from
 * wait_on_pipe().
 */
static ssize_t
tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                            struct pipe_inode_info *pipe, size_t len,
                            unsigned int flags)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;
        /* On-stack default arrays backing the splice descriptor */
        struct partial_page partial_def[PIPE_DEF_BUFFERS];
        struct page *pages_def[PIPE_DEF_BUFFERS];
        struct splice_pipe_desc spd = {
                .pages                = pages_def,
                .partial        = partial_def,
                .nr_pages_max        = PIPE_DEF_BUFFERS,
                .ops                = &buffer_pipe_buf_ops,
                .spd_release        = buffer_spd_release,
        };
        struct buffer_ref *ref;
        int entries, i;
        ssize_t ret = 0;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (iter->snapshot && iter->tr->current_trace->use_max_tr)
                return -EBUSY;
#endif

        /* Only page-aligned offsets are accepted */
        if (*ppos & (PAGE_SIZE - 1))
                return -EINVAL;

        /* Work in whole pages: reject less than one, round the rest down */
        if (len & (PAGE_SIZE - 1)) {
                if (len < PAGE_SIZE)
                        return -EINVAL;
                len &= PAGE_MASK;
        }

        /* NOTE(review): presumably enlarges the spd arrays for bigger pipes - confirm */
        if (splice_grow_spd(pipe, &spd))
                return -ENOMEM;

 again:
        trace_access_lock(iter->cpu_file);
        entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);

        /* One page per iteration while there is room, length and data left */
        for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
                struct page *page;
                int r;

                ref = kzalloc(sizeof(*ref), GFP_KERNEL);
                if (!ref) {
                        ret = -ENOMEM;
                        break;
                }

                /* This initial reference is handed over to the spd slot below */
                refcount_set(&ref->refcount, 1);
                ref->buffer = iter->array_buffer->buffer;
                ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
                if (IS_ERR(ref->page)) {
                        ret = PTR_ERR(ref->page);
                        ref->page = NULL;
                        kfree(ref);
                        break;
                }
                ref->cpu = iter->cpu_file;

                r = ring_buffer_read_page(ref->buffer, &ref->page,
                                          len, iter->cpu_file, 1);
                if (r < 0) {
                        /* Nothing was read into the page: give it back and stop */
                        ring_buffer_free_read_page(ref->buffer, ref->cpu,
                                                   ref->page);
                        kfree(ref);
                        break;
                }

                page = virt_to_page(ref->page);

                spd.pages[i] = page;
                spd.partial[i].len = PAGE_SIZE;
                spd.partial[i].offset = 0;
                spd.partial[i].private = (unsigned long)ref;
                spd.nr_pages++;
                *ppos += PAGE_SIZE;

                entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
        }

        trace_access_unlock(iter->cpu_file);
        /* Every break above skips the i++, so i counts the filled slots */
        spd.nr_pages = i;

        /* did we read anything? */
        if (!spd.nr_pages) {
                /* An allocation error from the loop takes precedence */
                if (ret)
                        goto out;

                ret = -EAGAIN;
                if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
                        goto out;

                /* Block until data shows up, then retry from the top */
                ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
                if (ret)
                        goto out;

                goto again;
        }

        ret = splice_to_pipe(pipe, &spd);
out:
        splice_shrink_spd(&spd);

        return ret;
}

/* File operations built on the tracing_buffers_* handlers */
static const struct file_operations tracing_buffers_fops = {
        .open        = tracing_buffers_open,
        .release     = tracing_buffers_release,
        .read        = tracing_buffers_read,
        .splice_read = tracing_buffers_splice_read,
        .poll        = tracing_buffers_poll,
        .llseek      = no_llseek,
};

/*
 * Read handler for the "stats" file: format the ring buffer counters
 * and timestamps of one cpu into a temporary trace_seq and copy the
 * requested window of that text to user space.
 *
 * Returns the result of simple_read_from_buffer(), or -ENOMEM when the
 * trace_seq cannot be allocated.
 */
static ssize_t
tracing_stats_read(struct file *filp, char __user *ubuf,
                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        struct array_buffer *abuf = &tr->array_buffer;
        int cpu = tracing_get_cpu(inode);
        struct trace_seq *seq;
        unsigned long long ts;
        unsigned long value;
        unsigned long rem;
        ssize_t ret;

        seq = kmalloc(sizeof(*seq), GFP_KERNEL);
        if (!seq)
                return -ENOMEM;

        trace_seq_init(seq);

        value = ring_buffer_entries_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "entries: %ld\n", value);

        value = ring_buffer_overrun_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "overrun: %ld\n", value);

        value = ring_buffer_commit_overrun_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "commit overrun: %ld\n", value);

        value = ring_buffer_bytes_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "bytes: %ld\n", value);

        if (!trace_clocks[tr->clock_id].in_ns) {
                /* counter or tsc mode for trace_clock */
                trace_seq_printf(seq, "oldest event ts: %llu\n",
                                ring_buffer_oldest_event_ts(abuf->buffer, cpu));

                trace_seq_printf(seq, "now ts: %llu\n",
                                ring_buffer_time_stamp(abuf->buffer, cpu));
        } else {
                /* local or global for trace_clock: show seconds.microseconds */
                ts = ns2usecs(ring_buffer_oldest_event_ts(abuf->buffer, cpu));
                rem = do_div(ts, USEC_PER_SEC);
                trace_seq_printf(seq, "oldest event ts: %5llu.%06lu\n",
                                 ts, rem);

                ts = ns2usecs(ring_buffer_time_stamp(abuf->buffer, cpu));
                rem = do_div(ts, USEC_PER_SEC);
                trace_seq_printf(seq, "now ts: %5llu.%06lu\n", ts, rem);
        }

        value = ring_buffer_dropped_events_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "dropped events: %ld\n", value);

        value = ring_buffer_read_events_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "read events: %ld\n", value);

        ret = simple_read_from_buffer(ubuf, count, ppos,
                                      seq->buffer, trace_seq_used(seq));

        kfree(seq);

        return ret;
}

/* File operations for the read-only "stats" file */
static const struct file_operations tracing_stats_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .read    = tracing_stats_read,
        .llseek  = generic_file_llseek,
};

#ifdef CONFIG_DYNAMIC_FTRACE

/*
 * Read handler reporting the dynamic ftrace totals (update count,
 * number of pages, number of groups) as one line of text.
 *
 * Returns the result of simple_read_from_buffer(), or -ENOMEM.
 */
static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
{
        char *text;
        ssize_t copied;
        int len;

        /* 256 should be plenty to hold the amount needed */
        text = kmalloc(256, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        len = scnprintf(text, 256, "%ld pages:%ld groups: %ld\n",
                        ftrace_update_tot_cnt,
                        ftrace_number_of_pages,
                        ftrace_number_of_groups);

        copied = simple_read_from_buffer(ubuf, cnt, ppos, text, len);
        kfree(text);

        return copied;
}

/* File operations for the dynamic ftrace info file */
static const struct file_operations tracing_dyn_info_fops = {
        .open   = tracing_open_generic,
        .llseek = generic_file_llseek,
        .read   = tracing_read_dyn_info,
};
#endif /* CONFIG_DYNAMIC_FTRACE */

#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
/*
 * Function probe callback without a count: take a snapshot of @tr
 * every time it is called. @ip, @parent_ip, @ops and @data are unused.
 */
static void
ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
                struct trace_array *tr, struct ftrace_probe_ops *ops,
                void *data)
{
        tracing_snapshot_instance(tr);
}

/*
 * Function probe callback with a count: @data is the func mapper that
 * holds a per-ip counter. While the counter for @ip is positive it is
 * decremented and a snapshot of @tr is taken; once it is zero or below
 * nothing happens. Without a mapper or counter every call snapshots.
 */
static void
ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
                      struct trace_array *tr, struct ftrace_probe_ops *ops,
                      void *data)
{
        struct ftrace_func_mapper *mapper = data;
        long *remaining;

        remaining = mapper ? (long *)ftrace_func_mapper_find_ip(mapper, ip)
                           : NULL;

        if (remaining) {
                /* Budget used up: no further snapshots for this ip */
                if (*remaining <= 0)
                        return;

                (*remaining)--;
        }

        tracing_snapshot_instance(tr);
}

/*
 * Print one snapshot probe as "<symbol>:snapshot" followed by either
 * ":count=<n>" or ":unlimited". Always returns 0.
 */
static int
ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
                      struct ftrace_probe_ops *ops, void *data)
{
        struct ftrace_func_mapper *mapper = data;
        long *remaining = NULL;

        seq_printf(m, "%ps:", (void *)ip);

        seq_puts(m, "snapshot");

        if (mapper)
                remaining = (long *)ftrace_func_mapper_find_ip(mapper, ip);

        /* No counter recorded for this ip means the probe is not limited */
        if (!remaining) {
                seq_puts(m, ":unlimited\n");
                return 0;
        }

        seq_printf(m, ":count=%ld\n", *remaining);
        return 0;
}

/*
 * Probe ->init: make sure *@data holds a func mapper (allocating one on
 * first use) and record @init_data for @ip in it.
 *
 * Returns the result of ftrace_func_mapper_add_ip(), or -ENOMEM when
 * the mapper cannot be allocated.
 */
static int
ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
                     unsigned long ip, void *init_data, void **data)
{
        struct ftrace_func_mapper *mapper = *data;

        if (mapper)
                return ftrace_func_mapper_add_ip(mapper, ip, init_data);

        /* First probe: create the mapper and publish it through @data */
        mapper = allocate_ftrace_func_mapper();
        if (!mapper)
                return -ENOMEM;
        *data = mapper;

        return ftrace_func_mapper_add_ip(mapper, ip, init_data);
}

/*
 * Probe ->free: with a non-zero @ip, remove that ip from the mapper in
 * @data; with @ip == 0, free the mapper itself (if there is one).
 */
static void
ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
                     unsigned long ip, void *data)
{
        struct ftrace_func_mapper *mapper = data;

        if (ip) {
                ftrace_func_mapper_remove_ip(mapper, ip);
                return;
        }

        if (mapper)
                free_ftrace_func_mapper(mapper, NULL);
}

/* Probe ops used when the snapshot command is given without a count */
static struct ftrace_probe_ops snapshot_probe_ops = {
        .print = ftrace_snapshot_print,
        .func  = ftrace_snapshot,
};

/* Probe ops used when the snapshot command carries a count parameter */
static struct ftrace_probe_ops snapshot_count_probe_ops = {
        .init  = ftrace_snapshot_init,
        .free  = ftrace_snapshot_free,
        .func  = ftrace_count_snapshot,
        .print = ftrace_snapshot_print,
};

/*
 * Handler for the "snapshot" ftrace function command.
 *
 * A @glob starting with '!' unregisters the probe. Otherwise the
 * snapshot buffer is allocated and a probe is registered, using the
 * counting ops when @param is given. The text of @param before the
 * first ':' is parsed as the count; an empty count leaves it at -1.
 *
 * Returns 0 on success, -ENODEV without a trace array, -EINVAL when
 * @enable is zero, or the error from parsing, allocation, registration
 * or unregistration.
 */
static int
ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
                               char *glob, char *cmd, char *param, int enable)
{
        struct ftrace_probe_ops *ops;
        void *count = (void *)-1;
        char *number;
        int ret;

        if (!tr)
                return -ENODEV;

        /* hash funcs only work with set_ftrace_filter */
        if (!enable)
                return -EINVAL;

        if (param)
                ops = &snapshot_count_probe_ops;
        else
                ops = &snapshot_probe_ops;

        if (glob[0] == '!')
                return unregister_ftrace_function_probe_func(glob+1, tr, ops);

        if (param) {
                number = strsep(&param, ":");

                if (strlen(number)) {
                        /*
                         * We use the callback data field (which is a pointer)
                         * as our counter.
                         */
                        ret = kstrtoul(number, 0, (unsigned long *)&count);
                        if (ret)
                                return ret;
                }
        }

        ret = tracing_alloc_snapshot_instance(tr);
        if (ret < 0)
                return ret;

        ret = register_ftrace_function_probe(glob, tr, ops, count);
        if (ret < 0)
                return ret;

        return 0;
}

/* Binds the "snapshot" command name to its handler */
static struct ftrace_func_command ftrace_snapshot_cmd = {
        .func = ftrace_trace_snapshot_callback,
        .name = "snapshot",
};

/*
 * Register the "snapshot" ftrace function command.
 * Returns the result of register_ftrace_command().
 */
static __init int register_snapshot_cmd(void)
{
        return register_ftrace_command(&ftrace_snapshot_cmd);
}
#else
/* Stub for builds lacking CONFIG_TRACER_SNAPSHOT or CONFIG_DYNAMIC_FTRACE: nothing to register */
static inline __init int register_snapshot_cmd(void) { return 0; }
#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */

/*
 * Return the dentry to use as parent for @tr's tracefs entries:
 * NULL for the global trace array, tr->dir for any other one, or
 * ERR_PTR(-ENODEV) (with a warning) when tr->dir is not set.
 */
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
        if (WARN_ON(!tr->dir))
                return ERR_PTR(-ENODEV);

        /* Top directory uses NULL as the parent; all sub buffers have a descriptor */
        return (tr->flags & TRACE_ARRAY_FL_GLOBAL) ? NULL : tr->dir;
}

/*
 * Return @tr's "per_cpu" tracefs directory, creating and caching it in
 * tr->percpu_dir on first use. Returns NULL when the parent dentry is
 * unavailable or the directory cannot be created. @cpu is only used in
 * the failure message.
 */
static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
        struct dentry *parent;

        if (!tr->percpu_dir) {
                parent = tracing_get_dentry(tr);
                if (IS_ERR(parent))
                        return NULL;

                tr->percpu_dir = tracefs_create_dir("per_cpu", parent);

                MEM_FAIL(!tr->percpu_dir,
                          "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
        }

        return tr->percpu_dir;
}

/*
 * Create a tracefs file like trace_create_file() and tag its inode with
 * the cpu it belongs to, stored as @cpu + 1 in i_cdev.
 * Returns the new dentry, or NULL on failure.
 */
static struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
                      void *data, long cpu, const struct file_operations *fops)
{
        struct dentry *dentry;

        dentry = trace_create_file(name, mode, parent, data, fops);
        if (!dentry)
                return NULL;

        /* See tracing_get_cpu() */
        d_inode(dentry)->i_cdev = (void *)(cpu + 1);

        return dentry;
}

/*
 * Create the "per_cpu/cpu<N>" directory for @cpu under @tr and populate
 * it with the per-cpu trace files. Gives up silently when the per_cpu
 * directory is unavailable, and with a warning when the cpu directory
 * cannot be created.
 */
static void
tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
        char cpu_dir[30]; /* 30 characters should be more than enough */
        struct dentry *percpu_root;
        struct dentry *cpu_dentry;

        percpu_root = tracing_dentry_percpu(tr, cpu);
        if (!percpu_root)
                return;

        snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
        cpu_dentry = tracefs_create_dir(cpu_dir, percpu_root);
        if (!cpu_dentry) {
                pr_warn("Could not create tracefs '%s' entry\n", cpu_dir);
                return;
        }

        /* per cpu trace_pipe */
        trace_create_cpu_file("trace_pipe", 0444, cpu_dentry,
                              tr, cpu, &tracing_pipe_fops);

        /* per cpu trace */
        trace_create_cpu_file("trace", 0644, cpu_dentry,
                              tr, cpu, &tracing_fops);

        trace_create_cpu_file("trace_pipe_raw", 0444, cpu_dentry,
                              tr, cpu, &tracing_buffers_fops);

        trace_create_cpu_file("stats", 0444, cpu_dentry,
                              tr, cpu, &tracing_stats_fops);

        trace_create_cpu_file("buffer_size_kb", 0444, cpu_dentry,
                              tr, cpu, &tracing_entries_fops);

#ifdef CONFIG_TRACER_SNAPSHOT
        trace_create_cpu_file("snapshot", 0644, cpu_dentry,
                              tr, cpu, &snapshot_fops);

        trace_create_cpu_file("snapshot_raw", 0444, cpu_dentry,
                              tr, cpu, &snapshot_raw_fops);
#endif
}

#ifdef CONFIG_FTRACE_SELFTEST
/* Let selftest have access to static functions in this file */
#include "trace_selftest.c"
#endif

/*
 * Read handler for a tracer option file: yields "1\n" when the option's
 * bit is set in the tracer flags, "0\n" otherwise.
 */
static ssize_t
trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
                        loff_t *ppos)
{
        struct trace_option_dentry *topt = filp->private_data;
        const char *text;

        text = (topt->flags->val & topt->opt->bit) ? "1\n" : "0\n";

        return simple_read_from_buffer(ubuf, cnt, ppos, text, 2);
}

/*
 * Write handler for a tracer option file. Accepts "0" or "1"; when that
 * differs from the option's current state, __set_tracer_option() is
 * called under trace_types_lock.
 *
 * Returns @cnt on success, -EINVAL for any other value, or the error
 * from parsing or from __set_tracer_option().
 */
static ssize_t
trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
                         loff_t *ppos)
{
        struct trace_option_dentry *topt = filp->private_data;
        unsigned long val;
        bool currently_set;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* Only 0 and 1 are valid */
        if (val > 1)
                return -EINVAL;

        currently_set = topt->flags->val & topt->opt->bit;
        if (currently_set != val) {
                mutex_lock(&trace_types_lock);
                ret = __set_tracer_option(topt->tr, topt->flags,
                                          topt->opt, !val);
                mutex_unlock(&trace_types_lock);
                if (ret)
                        return ret;
        }

        *ppos += cnt;

        return cnt;
}

/*
 * Open handler for a tracer option file: after
 * tracing_check_open_get_tr() succeeds on the owning trace array, stash
 * the option descriptor in @filp->private_data.
 * Returns 0 or the error from tracing_check_open_get_tr().
 */
static int tracing_open_options(struct inode *inode, struct file *filp)
{
        struct trace_option_dentry *topt = inode->i_private;
        int ret = tracing_check_open_get_tr(topt->tr);

        if (!ret)
                filp->private_data = topt;

        return ret;
}

/*
 * Release handler for a tracer option file: put the trace array that
 * the option descriptor in @file->private_data points at.
 * Always returns 0.
 */
static int tracing_release_options(struct inode *inode, struct file *file)
{
        struct trace_option_dentry *topt = file->private_data;

        trace_array_put(topt->tr);
        return 0;
}

/* File operations for tracer-specific option files */
static const struct file_operations trace_options_fops = {
        .open    = tracing_open_options,
        .release = tracing_release_options,
        .read    = trace_options_read,
        .write   = trace_options_write,
        .llseek  = generic_file_llseek,
};

/*
 * In order to pass in both the trace_array descriptor as well as the index
 * to the flag that the trace option file represents, the trace_array
 * has a character array of trace_flags_index[], which holds the index
 * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc.
 * The address of this character array is passed to the flag option file
 * read/write callbacks.
 *
 * In order to extract both the index and the trace_array descriptor,
 * get_tr_index() uses the following algorithm.
 *
 *   idx = *ptr;
 *
 * As the pointer itself contains the address of the index (remember
 * index[1] == 1).
 *
 * Then to get the trace_array descriptor, by subtracting that index
 * from the ptr, we get to the start of the index array itself.
 *
 *   ptr - idx == &index[0]
 *
 * Then a simple container_of() from that pointer gets us to the
 * trace_array descriptor.
 */
static void get_tr_index(void *data, struct trace_array **ptr,
                         unsigned int *pindex)
{
        /* The byte @data points at holds its own index in the array */
        unsigned int idx = *(unsigned char *)data;

        *pindex = idx;

        /* Step back to element 0, then up to the enclosing trace_array */
        *ptr = container_of(data - idx, struct trace_array,
                            trace_flags_index);
}

/*
 * Read handler for a core trace option file: yields "1\n" when the
 * flag bit selected by the file's index is set in tr->trace_flags,
 * "0\n" otherwise.
 */
static ssize_t
trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
                        loff_t *ppos)
{
        struct trace_array *tr;
        unsigned int index;
        const char *text;

        get_tr_index(filp->private_data, &tr, &index);

        text = (tr->trace_flags & (1 << index)) ? "1\n" : "0\n";

        return simple_read_from_buffer(ubuf, cnt, ppos, text, 2);
}

/*
 * Write handler for a core trace option file. Accepts "0" or "1" and
 * applies it to the flag bit selected by the file's index through
 * set_tracer_flag(), holding event_mutex and trace_types_lock.
 *
 * Returns @cnt on success, -EINVAL for any other value, or the error
 * from parsing or from set_tracer_flag().
 */
static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
                         loff_t *ppos)
{
        struct trace_array *tr;
        unsigned int index;
        unsigned long val;
        int ret;

        get_tr_index(filp->private_data, &tr, &index);

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* Only 0 and 1 are valid */
        if (val > 1)
                return -EINVAL;

        /* event_mutex is taken first, then trace_types_lock */
        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);
        ret = set_tracer_flag(tr, 1 << index, val);
        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        if (ret < 0)
                return ret;

        *ppos += cnt;

        return cnt;
}

/* File operations for core trace option files */
static const struct file_operations trace_options_core_fops = {
        .open   = tracing_open_generic,
        .llseek = generic_file_llseek,
        .read   = trace_options_core_read,
        .write  = trace_options_core_write,
};

/*
 * Wrapper around tracefs_create_file() that logs a warning naming the
 * entry when creation fails. Returns the new dentry, or NULL.
 */
struct dentry *trace_create_file(const char *name,
                                 umode_t mode,
                                 struct dentry *parent,
                                 void *data,
                                 const struct file_operations *fops)
{
        struct dentry *dentry;

        dentry = tracefs_create_file(name, mode, parent, data, fops);
        if (dentry)
                return dentry;

        pr_warn("Could not create tracefs '%s' entry\n", name);
        return NULL;
}


/*
 * Return @tr's "options" tracefs directory, creating and caching it in
 * tr->options on first use. Returns NULL when the parent dentry is
 * unavailable or the directory cannot be created.
 */
static struct dentry *trace_options_init_dentry(struct trace_array *tr)
{
        struct dentry *parent;

        if (!tr->options) {
                parent = tracing_get_dentry(tr);
                if (IS_ERR(parent))
                        return NULL;

                tr->options = tracefs_create_dir("options", parent);
                if (!tr->options)
                        pr_warn("Could not create tracefs directory 'options'\n");
        }

        /* Still NULL here if the creation above failed */
        return tr->options;
}

/*
 * Fill in @topt for option @opt of @flags and create its file in @tr's
 * "options" directory. Leaves @topt untouched when that directory is
 * unavailable; topt->entry is NULL if the file could not be created.
 */
static void
create_trace_option_file(struct trace_array *tr,
                         struct trace_option_dentry *topt,
                         struct tracer_flags *flags,
                         struct tracer_opt *opt)
{
        struct dentry *options_dir = trace_options_init_dentry(tr);

        if (!options_dir)
                return;

        topt->tr = tr;
        topt->flags = flags;
        topt->opt = opt;

        topt->entry = trace_create_file(opt->name, 0644, options_dir, topt,
                                        &trace_options_fops);
}

static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
        struct trace_option_dentry *topts;
        struct trace_options *tr_topts;
        struct tracer_flags *flags;
        struct tracer_opt *opts;
        int cnt;
        int i;

        if (!tracer)
                return;

        flags = tracer->flags;

        if (!flags || !flags->opts)
                return;

        /*
         * If this is an instance, only create flags for tracers
         * the instance may have.
         */
        if (!trace_ok_for_array(tracer, tr))
                return;

        for (i = 0; i < tr->nr_topts; i++) {
                /* Make sure there's no duplicate flags. */
                if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
                        return;
        }

        opts = flags->opts;

        for (cnt = 0; opts[cnt].name; cnt++)
                ;

        topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
        if (!topts)
                return;

        tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
                            GFP_KERNEL);
        if (!tr_topts) {
                kfree(topts);
                return;
        }

        tr->topts = tr_topts;
        tr->topts[tr->nr_topts].tracer = tracer;
        tr->topts[tr->nr_topts].topts = topts;
        tr->nr_topts++;

        for (cnt = 0; opts[cnt].name; cnt++) {
                create_trace_option_file(tr, &topts[cnt], flags,
                                         &opts[cnt]);
                MEM_FAIL(topts[cnt].entry == NULL,
                          "Failed to create trace option: %s",
                          opts[cnt].name);
        }
}

/*
 * Create the file for core trace option @option in @tr's "options"
 * directory. The file's private data is the address of
 * tr->trace_flags_index[@index]; see get_tr_index().
 * Returns the new dentry, or NULL on failure.
 */
static struct dentry *
create_trace_option_core_file(struct trace_array *tr,
                              const char *option, long index)
{
        struct dentry *options_dir = trace_options_init_dentry(tr);
        void *flag_slot;

        if (!options_dir)
                return NULL;

        flag_slot = (void *)&tr->trace_flags_index[index];

        return trace_create_file(option, 0644, options_dir, flag_slot,
                                 &trace_options_core_fops);
}

static void create_trace_options_dir(struct trace_array *tr)
{
        struct dentry *t_options;
        bool top_level = tr == &global_trace;
        int i;

        t_options = trace_options_init_dentry(tr);
        if (!t_options)
                return;

        for (i = 0; trace_options[i]; i++) {
                if (top_level ||
                    !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
                        create_trace_option_core_file(tr, trace_options[i], i);
        }
}

/*
 * Read handler of rb_simple_fops: formats the value returned by
 * tracer_tracing_is_on() for this instance as a decimal number followed
 * by a newline and copies it to user space.
 */
static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
               size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[64];
        int is_on;
        int len;

        is_on = tracer_tracing_is_on(tr);
        len = sprintf(buf, "%d\n", is_on);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler of rb_simple_fops: parses a decimal number from user
 * space and switches tracing for the instance on (non-zero) or off
 * (zero).  When the state actually changes, the current tracer's
 * optional start()/stop() callback is invoked as well.  Nothing is done
 * when the instance has no ring buffer or is already in the requested
 * state.
 *
 * Returns @cnt on success or the error from kstrtoul_from_user().
 */
static ssize_t
rb_simple_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        struct trace_buffer *buffer = tr->array_buffer.buffer;
        unsigned long val;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        if (buffer) {
                mutex_lock(&trace_types_lock);
                /* Only act when the requested state differs from the current one */
                if (!!val != tracer_tracing_is_on(tr)) {
                        if (val) {
                                tracer_tracing_on(tr);
                                if (tr->current_trace->start)
                                        tr->current_trace->start(tr);
                        } else {
                                tracer_tracing_off(tr);
                                if (tr->current_trace->stop)
                                        tr->current_trace->stop(tr);
                        }
                }
                mutex_unlock(&trace_types_lock);
        }

        (*ppos)++;

        return cnt;
}

/*
 * File operations backing the per-instance "tracing_on" file (see
 * init_tracer_tracefs()): reads report the on/off state, writes toggle it.
 */
static const struct file_operations rb_simple_fops = {
        .open                = tracing_open_generic_tr,
        .read                = rb_simple_read,
        .write                = rb_simple_write,
        .release        = tracing_release_generic_tr,
        .llseek                = default_llseek,
};

/*
 * Read handler of buffer_percent_fops: prints tr->buffer_percent as a
 * decimal number followed by a newline.
 */
static ssize_t
buffer_percent_read(struct file *filp, char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[64];
        int percent;
        int len;

        percent = tr->buffer_percent;
        len = sprintf(buf, "%d\n", percent);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler of buffer_percent_fops: parses a decimal value from user
 * space and stores it in tr->buffer_percent.
 *
 * Returns @cnt on success, the kstrtoul_from_user() error on a parse
 * failure, or -EINVAL when the value is greater than 100.
 */
static ssize_t
buffer_percent_write(struct file *filp, const char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long percent;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &percent);
        if (err)
                return err;

        /* A percentage above 100 makes no sense */
        if (percent > 100)
                return -EINVAL;

        tr->buffer_percent = percent;
        (*ppos)++;

        return cnt;
}

/*
 * File operations backing the per-instance "buffer_percent" file (see
 * init_tracer_tracefs()): reads show tr->buffer_percent, writes update it.
 */
static const struct file_operations buffer_percent_fops = {
        .open                = tracing_open_generic_tr,
        .read                = buffer_percent_read,
        .write                = buffer_percent_write,
        .release        = tracing_release_generic_tr,
        .llseek                = default_llseek,
};

/*
 * Dentry of the "instances" directory.  It is set by
 * create_trace_instances() and used as the parent of every instance
 * directory in trace_array_create_dir(); while it is still NULL,
 * trace_array_create() skips directory creation.
 */
static struct dentry *trace_instance_dir;

/* Defined further down; needed earlier by trace_array_create_dir(). */
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);

static int
allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, unsigned long size)
{
        enum ring_buffer_flags rb_flags;

        rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;

        buf->tr = tr;

        buf->buffer = ring_buffer_alloc(size, rb_flags);
        if (!buf->buffer)
                return -ENOMEM;

        buf->data = alloc_percpu(struct trace_array_cpu);
        if (!buf->data) {
                ring_buffer_free(buf->buffer);
                buf->buffer = NULL;
                return -ENOMEM;
        }

        /* Allocate the first page for all buffers */
        set_buffer_entries(&tr->array_buffer,
                           ring_buffer_size(tr->array_buffer.buffer, 0));

        return 0;
}

/*
 * Allocate every buffer a trace array needs: the main array_buffer and,
 * with CONFIG_TRACER_MAX_TRACE, the max (snapshot) buffer.  The max
 * buffer only gets the full @size when allocate_snapshot is set,
 * otherwise it is allocated with a size of 1.
 *
 * Returns 0 on success or a negative errno; on failure nothing remains
 * allocated.
 */
static int allocate_trace_buffers(struct trace_array *tr, unsigned long size)
{
        int err = allocate_trace_buffer(tr, &tr->array_buffer, size);

        if (err)
                return err;

#ifdef CONFIG_TRACER_MAX_TRACE
        err = allocate_trace_buffer(tr, &tr->max_buffer,
                                    allocate_snapshot ? size : 1);
        if (MEM_FAIL(err, "Failed to allocate trace buffer\n")) {
                /* Roll back the main buffer allocated above */
                ring_buffer_free(tr->array_buffer.buffer);
                tr->array_buffer.buffer = NULL;
                free_percpu(tr->array_buffer.data);
                tr->array_buffer.data = NULL;
                return -ENOMEM;
        }
        tr->allocated_snapshot = allocate_snapshot;

        /*
         * The snapshot request from the kernel command line applies to
         * the top level trace array only, so clear it for later callers.
         */
        allocate_snapshot = false;
#endif

        return 0;
}

/*
 * Release the ring buffer and per-CPU data of @buf and reset both
 * pointers.  Does nothing when no ring buffer is attached.
 */
static void free_trace_buffer(struct array_buffer *buf)
{
        if (!buf->buffer)
                return;

        ring_buffer_free(buf->buffer);
        buf->buffer = NULL;

        free_percpu(buf->data);
        buf->data = NULL;
}

/*
 * Free all buffers owned by @tr (main buffer and, when configured, the
 * max buffer).  A NULL @tr is accepted and ignored.
 */
static void free_trace_buffers(struct trace_array *tr)
{
        if (tr) {
                free_trace_buffer(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
                free_trace_buffer(&tr->max_buffer);
#endif
        }
}

/*
 * Fill tr->trace_flags_index[] so that slot N holds the value N.  The
 * trace option files are handed pointers into this table (see
 * create_trace_option_core_file()).
 */
static void init_trace_flags_index(struct trace_array *tr)
{
        int bit = 0;

        while (bit < TRACE_FLAGS_MAX_SIZE) {
                tr->trace_flags_index[bit] = bit;
                bit++;
        }
}

/*
 * Call add_tracer_options() on @tr for every tracer on the trace_types
 * list.  The locked wrapper update_tracer_options() takes
 * trace_types_lock around this call.
 */
static void __update_tracer_options(struct trace_array *tr)
{
        struct tracer *tracer = trace_types;

        while (tracer) {
                add_tracer_options(tr, tracer);
                tracer = tracer->next;
        }
}

/*
 * Locked variant of __update_tracer_options(): takes trace_types_lock,
 * records in tracer_options_updated that the update has run, and adds
 * the options of all registered tracers to @tr.
 */
static void update_tracer_options(struct trace_array *tr)
{
        mutex_lock(&trace_types_lock);
        tracer_options_updated = true;
        __update_tracer_options(tr);
        mutex_unlock(&trace_types_lock);
}

/*
 * Look up a trace array by name on ftrace_trace_arrays.  Arrays without
 * a name are never matched.  Returns the matching array or NULL.
 *
 * Must have trace_types_lock held.
 */
struct trace_array *trace_array_find(const char *instance)
{
        struct trace_array *tr;

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (!tr->name)
                        continue;
                if (!strcmp(tr->name, instance))
                        return tr;
        }

        return NULL;
}

struct trace_array *trace_array_find_get(const char *instance)
{
        struct trace_array *tr;

        mutex_lock(&trace_types_lock);
        tr = trace_array_find(instance);
        if (tr)
                tr->ref++;
        mutex_unlock(&trace_types_lock);

        return tr;
}

/*
 * Create the tracefs directory of instance @tr under trace_instance_dir
 * and populate it: event files, the generic tracer files and the
 * per-tracer option files.
 *
 * Returns 0 on success, -EINVAL when the directory cannot be created, or
 * the error from event_trace_add_tracer() (the directory is removed
 * again in that case).
 */
static int trace_array_create_dir(struct trace_array *tr)
{
        int err;

        tr->dir = tracefs_create_dir(tr->name, trace_instance_dir);
        if (!tr->dir)
                return -EINVAL;

        err = event_trace_add_tracer(tr->dir, tr);
        if (err) {
                tracefs_remove(tr->dir);
                return err;
        }

        init_tracer_tracefs(tr, tr->dir);
        __update_tracer_options(tr);

        return 0;
}

/*
 * Allocate and set up a new trace array (instance) called @name and add
 * it to ftrace_trace_arrays with a reference count of 1.
 *
 * Both callers visible in this file (instance_mkdir() and
 * trace_array_get_by_name()) hold event_mutex and trace_types_lock.
 *
 * Returns the new trace array, or an ERR_PTR(): -ENOMEM for every
 * failure except a failing trace_array_create_dir(), whose error code is
 * passed through.
 */
static struct trace_array *trace_array_create(const char *name)
{
        struct trace_array *tr;
        int ret;

        /* Error code used by every failure path that does not set its own */
        ret = -ENOMEM;
        tr = kzalloc(sizeof(*tr), GFP_KERNEL);
        if (!tr)
                return ERR_PTR(ret);

        tr->name = kstrdup(name, GFP_KERNEL);
        if (!tr->name)
                goto out_free_tr;

        if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
                goto out_free_tr;

        if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
                goto out_free_tr;

        /* Inherit the global flags, minus those in ZEROED_TRACE_FLAGS */
        tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;

        cpumask_copy(tr->tracing_cpumask, cpu_all_mask);

        raw_spin_lock_init(&tr->start_lock);

        tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

        /* A new instance starts out with the nop tracer */
        tr->current_trace = &nop_trace;

        INIT_LIST_HEAD(&tr->systems);
        INIT_LIST_HEAD(&tr->events);
        INIT_LIST_HEAD(&tr->hist_vars);
        INIT_LIST_HEAD(&tr->err_log);

        if (allocate_trace_buffers(tr, trace_buf_size) < 0)
                goto out_free_tr;

        if (ftrace_allocate_ftrace_ops(tr) < 0)
                goto out_free_tr;

        ftrace_init_trace_array(tr);

        init_trace_flags_index(tr);

        /*
         * Create the tracefs directory now if the "instances" directory
         * already exists; otherwise only add the events, and the directory
         * is created later by create_trace_instances().
         */
        if (trace_instance_dir) {
                ret = trace_array_create_dir(tr);
                if (ret)
                        goto out_free_tr;
        } else
                __trace_early_add_events(tr);

        list_add(&tr->list, &ftrace_trace_arrays);

        tr->ref++;

        return tr;

 out_free_tr:
        /*
         * Single cleanup path for all failures; tr was zero-allocated, so
         * members that were never set up are still zero here.
         */
        ftrace_free_ftrace_ops(tr);
        free_trace_buffers(tr);
        free_cpumask_var(tr->pipe_cpumask);
        free_cpumask_var(tr->tracing_cpumask);
        kfree(tr->name);
        kfree(tr);

        return ERR_PTR(ret);
}

/*
 * mkdir callback of the "instances" directory (registered in
 * create_trace_instances()): create a new trace array called @name.
 *
 * Returns 0 on success, -EEXIST when an instance with that name already
 * exists, or the error reported by trace_array_create().
 */
static int instance_mkdir(const char *name)
{
        int ret = -EEXIST;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        if (!trace_array_find(name)) {
                struct trace_array *tr = trace_array_create(name);

                ret = PTR_ERR_OR_ZERO(tr);
        }

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);
        return ret;
}

/**
 * trace_array_get_by_name - Create/Lookup a trace array, given its name.
 * @name: The name of the trace array to be looked up/created.
 *
 * Looks for an existing trace array called @name and creates one if none
 * is found.
 *
 * Returns pointer to trace array with given name.
 * NULL, if it cannot be created.
 *
 * NOTE: This function increments the reference counter associated with the
 * trace array returned. This makes sure it cannot be freed while in use.
 * Use trace_array_put() once the trace array is no longer needed.
 * If the trace_array is to be freed, trace_array_destroy() needs to
 * be called after the trace_array_put(), or simply let user space delete
 * it from the tracefs instances directory. But until the
 * trace_array_put() is called, user space can not delete it.
 *
 */
struct trace_array *trace_array_get_by_name(const char *name)
{
        struct trace_array *tr;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        tr = trace_array_find(name);
        if (!tr) {
                /* Not there yet: create it, mapping any error to NULL */
                tr = trace_array_create(name);
                if (IS_ERR(tr))
                        tr = NULL;
        }

        /* Reference handed to the caller */
        if (tr)
                tr->ref++;

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);
        return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);

/*
 * Tear down and free trace array @tr.
 *
 * Both callers visible in this file (trace_array_destroy() and
 * instance_rmdir()) hold event_mutex and trace_types_lock.
 *
 * Returns 0 on success, or -EBUSY (leaving @tr untouched) when @tr has
 * more references than the initial one or when it has a current tracer
 * and tr->trace_ref is non-zero.
 */
static int __remove_instance(struct trace_array *tr)
{
        int i;

        /* Reference counter for a newly created trace array = 1. */
        if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
                return -EBUSY;

        /* Unlink first so the instance can no longer be looked up */
        list_del(&tr->list);

        /* Disable all the flags that were enabled coming in */
        for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
                if ((1 << i) & ZEROED_TRACE_FLAGS)
                        set_tracer_flag(tr, 1 << i, 0);
        }

        /* Switch to the nop tracer, then release everything hanging off tr */
        tracing_set_nop(tr);
        clear_ftrace_function_probes(tr);
        event_trace_del_tracer(tr);
        ftrace_clear_pids(tr);
        ftrace_destroy_function_files(tr);
        tracefs_remove(tr->dir);
        free_trace_buffers(tr);
        clear_tracing_err_log(tr);

        /* Free each per-tracer option array, then the table holding them */
        for (i = 0; i < tr->nr_topts; i++) {
                kfree(tr->topts[i].topts);
        }
        kfree(tr->topts);

        free_cpumask_var(tr->pipe_cpumask);
        free_cpumask_var(tr->tracing_cpumask);
        kfree(tr->name);
        kfree(tr);

        return 0;
}

/*
 * Destroy trace array @this_tr if it is still registered.
 *
 * Returns -EINVAL for a NULL argument, -ENODEV when @this_tr is not on
 * ftrace_trace_arrays, otherwise the result of __remove_instance().
 */
int trace_array_destroy(struct trace_array *this_tr)
{
        struct trace_array *cur;
        int ret = -ENODEV;

        if (!this_tr)
                return -EINVAL;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        /* Only remove it if it is really one of the registered arrays */
        list_for_each_entry(cur, &ftrace_trace_arrays, list) {
                if (cur != this_tr)
                        continue;

                ret = __remove_instance(cur);
                break;
        }

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);

/*
 * rmdir callback of the "instances" directory (registered in
 * create_trace_instances()): remove the trace array called @name.
 *
 * Returns -ENODEV when no such instance exists, otherwise the result of
 * __remove_instance().
 */
static int instance_rmdir(const char *name)
{
        struct trace_array *tr;
        int ret;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        tr = trace_array_find(name);
        ret = tr ? __remove_instance(tr) : -ENODEV;

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        return ret;
}

/*
 * Create the "instances" directory under @d_tracer, hooking up
 * instance_mkdir()/instance_rmdir(), and then create the tracefs
 * directory of every named trace array that is already on
 * ftrace_trace_arrays.  The walk stops at the first instance whose
 * directory cannot be created.
 */
static __init void create_trace_instances(struct dentry *d_tracer)
{
        struct trace_array *inst;

        trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
                                                         instance_mkdir,
                                                         instance_rmdir);
        if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
                return;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        list_for_each_entry(inst, &ftrace_trace_arrays, list) {
                /* Arrays without a name get no directory under instances/ */
                if (!inst->name)
                        continue;

                if (MEM_FAIL(trace_array_create_dir(inst) < 0,
                             "Failed to create instance directory\n"))
                        break;
        }

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);
}

/*
 * Create the standard set of tracefs control files for trace array @tr
 * inside directory @d_tracer.  Used for the top level array (from
 * tracer_init_tracefs(), with a NULL @d_tracer) and for every instance
 * (from trace_array_create_dir()).
 *
 * As a side effect this records the "ftrace:print" event file in
 * tr->trace_marker_file and sets tr->buffer_percent to 50.
 */
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
        struct trace_event_file *file;
        int cpu;

        trace_create_file("available_tracers", 0444, d_tracer,
                        tr, &show_traces_fops);

        trace_create_file("current_tracer", 0644, d_tracer,
                        tr, &set_tracer_fops);

        trace_create_file("tracing_cpumask", 0644, d_tracer,
                          tr, &tracing_cpumask_fops);

        trace_create_file("trace_options", 0644, d_tracer,
                          tr, &tracing_iter_fops);

        trace_create_file("trace", 0644, d_tracer,
                          tr, &tracing_fops);

        trace_create_file("trace_pipe", 0444, d_tracer,
                          tr, &tracing_pipe_fops);

        trace_create_file("buffer_size_kb", 0644, d_tracer,
                          tr, &tracing_entries_fops);

        trace_create_file("buffer_total_size_kb", 0444, d_tracer,
                          tr, &tracing_total_entries_fops);

        trace_create_file("free_buffer", 0200, d_tracer,
                          tr, &tracing_free_buffer_fops);

        trace_create_file("trace_marker", 0220, d_tracer,
                          tr, &tracing_mark_fops);

        /* Give the ftrace:print event a trigger file if the event exists */
        file = __find_event_file(tr, "ftrace", "print");
        if (file && file->dir)
                trace_create_file("trigger", 0644, file->dir, file,
                                  &event_trigger_fops);
        tr->trace_marker_file = file;

        trace_create_file("trace_marker_raw", 0220, d_tracer,
                          tr, &tracing_mark_raw_fops);

        trace_create_file("trace_clock", 0644, d_tracer, tr,
                          &trace_clock_fops);

        trace_create_file("tracing_on", 0644, d_tracer,
                          tr, &rb_simple_fops);

        trace_create_file("timestamp_mode", 0444, d_tracer, tr,
                          &trace_time_stamp_mode_fops);

        tr->buffer_percent = 50;

        /*
         * buffer_percent_fops implements a write handler, so the file has
         * to be writable by its owner; a read-only mode would make it
         * unusable without CAP_DAC_OVERRIDE.
         */
        trace_create_file("buffer_percent", 0644, d_tracer,
                        tr, &buffer_percent_fops);

        create_trace_options_dir(tr);

#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
        trace_create_maxlat_file(tr, d_tracer);
#endif

        if (ftrace_create_function_files(tr, d_tracer))
                MEM_FAIL(1, "Could not allocate function filter files");

#ifdef CONFIG_TRACER_SNAPSHOT
        trace_create_file("snapshot", 0644, d_tracer,
                          tr, &snapshot_fops);
#endif

        trace_create_file("error_log", 0644, d_tracer,
                          tr, &tracing_err_log_fops);

        for_each_tracing_cpu(cpu)
                tracing_init_tracefs_percpu(tr, cpu);

        ftrace_init_tracefs(tr, d_tracer);
}

/*
 * Automount callback for the debugfs "tracing" directory (installed by
 * tracing_init_dentry()).
 *
 * To maintain backward compatibility for tools that mount debugfs to get
 * to the tracing facility, tracefs is automatically mounted to the
 * debugfs/tracing directory.
 *
 * Returns the new mount with an extra reference taken, or NULL when the
 * tracefs filesystem type is unavailable or the submount fails.  The
 * second argument is unused.
 */
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
        struct file_system_type *fs_type = get_fs_type("tracefs");
        struct vfsmount *submnt;

        if (!fs_type)
                return NULL;

        submnt = vfs_submount(mntpt, fs_type, "tracefs", NULL);
        put_filesystem(fs_type);
        if (IS_ERR(submnt))
                return NULL;

        mntget(submnt);

        return submnt;
}

/**
 * tracing_init_dentry - initialize top level trace array
 *
 * This is called when creating files or directories in the tracing
 * directory. It is called via fs_initcall() by any of the boot up code.
 * On the first successful call it sets up the debugfs "tracing"
 * automount point and stores it in the top level trace array; later
 * calls find it already set and return early.
 *
 * Returns 0 on success, -EPERM when tracefs is locked down, or -ENODEV
 * when tracefs has not been initialized.
 */
int tracing_init_dentry(void)
{
        struct trace_array *top = &global_trace;

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Tracing disabled due to lockdown\n");
                return -EPERM;
        }

        /* Already set up by an earlier call */
        if (top->dir)
                return 0;

        if (WARN_ON(!tracefs_initialized()))
                return -ENODEV;

        /*
         * As there may still be users that expect the tracing
         * files to exist in debugfs/tracing, we must automount
         * the tracefs file system there, so older tools still
         * work with the newer kernel.
         */
        top->dir = debugfs_create_automount("tracing", NULL,
                                            trace_automount, NULL);

        return 0;
}

/* Bounds of the array of built-in eval map pointers */
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];

/*
 * Register every eval map found between __start_ftrace_eval_maps and
 * __stop_ftrace_eval_maps, with no owning module.
 */
static void __init trace_eval_init(void)
{
        int nr_maps = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;

        trace_insert_eval_map(NULL, __start_ftrace_eval_maps, nr_maps);
}

#ifdef CONFIG_MODULES
/*
 * Register the eval maps carried by module @mod, if it has any.
 *
 * Modules with bad taint do not have events created, so their enums
 * are not registered either.
 */
static void trace_module_add_evals(struct module *mod)
{
        if (mod->num_trace_evals && !trace_module_has_bad_taint(mod))
                trace_insert_eval_map(mod, mod->trace_evals,
                                      mod->num_trace_evals);
}

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/*
 * Drop the eval map block that belongs to module @mod from the
 * trace_eval_maps chain and free it.  Does nothing when the module has
 * no eval maps or no block of the chain is owned by it.
 */
static void trace_module_remove_evals(struct module *mod)
{
        union trace_eval_map_item *map;
        /* Address of the link that points at the block being examined */
        union trace_eval_map_item **last = &trace_eval_maps;

        if (!mod->num_trace_evals)
                return;

        mutex_lock(&trace_eval_mutex);

        map = trace_eval_maps;

        /*
         * Walk block by block: each block starts with a head item naming
         * its module; trace_eval_jmp_to_tail() yields the item whose
         * tail.next links to the following block.
         */
        while (map) {
                if (map->head.mod == mod)
                        break;
                map = trace_eval_jmp_to_tail(map);
                last = &map->tail.next;
                map = map->tail.next;
        }
        if (!map)
                goto out;

        /* Unlink the block by pointing the previous link past it */
        *last = trace_eval_jmp_to_tail(map)->tail.next;
        kfree(map);
 out:
        mutex_unlock(&trace_eval_mutex);
}
#else
/* Without the eval map file there is nothing to remove on module unload */
static inline void trace_module_remove_evals(struct module *mod) { }
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */

/*
 * Module notifier callback: add a module's eval maps when it is coming
 * and remove them when it is going.  All other states are ignored.
 * Always returns NOTIFY_OK.
 */
static int trace_module_notify(struct notifier_block *self,
                               unsigned long val, void *data)
{
        struct module *mod = data;

        if (val == MODULE_STATE_COMING)
                trace_module_add_evals(mod);
        else if (val == MODULE_STATE_GOING)
                trace_module_remove_evals(mod);

        return NOTIFY_OK;
}

/* Module notifier registered by tracer_init_tracefs() */
static struct notifier_block trace_module_nb = {
        .notifier_call = trace_module_notify,
        .priority = 0,
};
#endif /* CONFIG_MODULES */

/*
 * fs_initcall that builds the top level tracefs hierarchy: the files of
 * global_trace, the top-level-only files, the eval maps, the module
 * notifier and the "instances" directory.
 *
 * Always returns 0; when tracing_init_dentry() fails the function simply
 * returns early without creating anything.
 */
static __init int tracer_init_tracefs(void)
{
        int ret;

        trace_access_lock_init();

        ret = tracing_init_dentry();
        if (ret)
                return 0;

        event_trace_init();

        /* A NULL parent is used for everything belonging to the top level */
        init_tracer_tracefs(&global_trace, NULL);
        ftrace_init_tracefs_toplevel(&global_trace, NULL);

        trace_create_file("tracing_thresh", 0644, NULL,
                        &global_trace, &tracing_thresh_fops);

        trace_create_file("README", 0444, NULL,
                        NULL, &tracing_readme_fops);

        trace_create_file("saved_cmdlines", 0444, NULL,
                        NULL, &tracing_saved_cmdlines_fops);

        trace_create_file("saved_cmdlines_size", 0644, NULL,
                          NULL, &tracing_saved_cmdlines_size_fops);

        trace_create_file("saved_tgids", 0444, NULL,
                        NULL, &tracing_saved_tgids_fops);

        /* Register the built-in eval maps, then create their file */
        trace_eval_init();

        trace_create_eval_file(NULL);

#ifdef CONFIG_MODULES
        register_module_notifier(&trace_module_nb);
#endif

#ifdef CONFIG_DYNAMIC_FTRACE
        trace_create_file("dyn_ftrace_total_info", 0444, NULL,
                        NULL, &tracing_dyn_info_fops);
#endif

        create_trace_instances(NULL);

        update_tracer_options(&global_trace);

        return 0;
}

/*
 * Panic notifier callback: dump the ftrace buffer when
 * ftrace_dump_on_oops is set, passing its value as the dump mode.
 * Always returns NOTIFY_OK.
 */
static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
{
        if (ftrace_dump_on_oops)
                ftrace_dump(ftrace_dump_on_oops);
        return NOTIFY_OK;
}

/* Registered on panic_notifier_list by tracer_alloc_buffers() */
static struct notifier_block trace_panic_notifier = {
        .notifier_call  = trace_panic_handler,
        .next           = NULL,
        .priority       = 150   /* priority: INT_MAX >= x >= 0 */
};

/*
 * Die notifier callback: on DIE_OOPS, dump the ftrace buffer when
 * ftrace_dump_on_oops is set.  Every other event is ignored.  Always
 * returns NOTIFY_OK.
 */
static int trace_die_handler(struct notifier_block *self,
                             unsigned long val,
                             void *data)
{
        if (val == DIE_OOPS && ftrace_dump_on_oops)
                ftrace_dump(ftrace_dump_on_oops);

        return NOTIFY_OK;
}

/* Registered through register_die_notifier() by tracer_alloc_buffers() */
static struct notifier_block trace_die_notifier = {
        .notifier_call = trace_die_handler,
        .priority = 200
};

/*
 * printk is set to max of 1024, we really don't need it that big.
 * Nothing should be printing 1000 characters anyway.
 */
#define TRACE_MAX_PRINT                1000

/*
 * Define here KERN_TRACE so that we have one place to modify
 * it if we decide to change what log level the ftrace dump
 * should be at.
 */
#define KERN_TRACE                KERN_EMERG

/*
 * Print the contents of trace_seq @s through printk() at KERN_TRACE
 * level and reset the sequence for reuse.  The output is truncated to
 * TRACE_MAX_PRINT characters and is always NUL-terminated before being
 * printed.
 */
void
trace_printk_seq(struct trace_seq *s)
{
        /* Probably should print a warning here. */
        if (s->seq.len >= TRACE_MAX_PRINT)
                s->seq.len = TRACE_MAX_PRINT;

        /*
         * More paranoid code. Although the buffer size is set to
         * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
         * an extra layer of protection.
         */
        if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
                s->seq.len = s->seq.size - 1;

        /* should be zero ended, but we are paranoid. */
        s->buffer[s->seq.len] = 0;

        printk(KERN_TRACE "%s", s->buffer);

        /* Leave the sequence empty for the next user */
        trace_seq_init(s);
}

/*
 * Set up @iter to walk the global trace buffer on all CPUs using the
 * current tracer of global_trace, and point its temp and format buffers
 * at the static ones.  The tracer's optional open() callback is invoked.
 */
void trace_init_global_iter(struct trace_iterator *iter)
{
        iter->tr = &global_trace;
        iter->trace = iter->tr->current_trace;
        iter->cpu_file = RING_BUFFER_ALL_CPUS;
        iter->array_buffer = &global_trace.array_buffer;

        if (iter->trace && iter->trace->open)
                iter->trace->open(iter);

        /* Annotate start of buffers if we had overruns */
        if (ring_buffer_overruns(iter->array_buffer->buffer))
                iter->iter_flags |= TRACE_FILE_ANNOTATE;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[iter->tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        /* Can not use kmalloc for iter.temp and iter.fmt */
        iter->temp = static_temp_buf;
        iter->temp_size = STATIC_TEMP_BUF_SIZE;
        iter->fmt = static_fmt_buf;
        iter->fmt_size = STATIC_FMT_BUF_SIZE;
}

/*
 * Dump the contents of the global trace buffer to the console via
 * printk() at KERN_TRACE level.
 *
 * @oops_dump_mode selects what is dumped: DUMP_ALL prints every CPU's
 * buffer, DUMP_ORIG only the buffer of the CPU running the dump,
 * DUMP_NONE prints nothing; any other value is reported and treated like
 * DUMP_ALL.
 *
 * Tracing is turned off and stays off afterwards.  Only one dump can run
 * at a time; a concurrent call returns immediately.  Entries are consumed
 * from the buffer as they are printed.
 */
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
        /* use static because iter can be a bit big for the stack */
        static struct trace_iterator iter;
        static atomic_t dump_running;
        struct trace_array *tr = &global_trace;
        unsigned int old_userobj;
        unsigned long flags;
        int cnt = 0, cpu;

        /* Only allow one dump user at a time. */
        if (atomic_inc_return(&dump_running) != 1) {
                atomic_dec(&dump_running);
                return;
        }

        /*
         * Always turn off tracing when we dump.
         * We don't need to show trace output of what happens
         * between multiple crashes.
         *
         * If the user does a sysrq-z, then they can re-enable
         * tracing with echo 1 > tracing_on.
         */
        tracing_off();

        local_irq_save(flags);
        printk_nmi_direct_enter();

        /* Simulate the iterator */
        trace_init_global_iter(&iter);

        /* Bump the per-CPU disabled counters for the duration of the dump */
        for_each_tracing_cpu(cpu) {
                atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
        }

        /* Remember the userobj setting so it can be restored at out_enable */
        old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;

        /* don't look at user memory in panic mode */
        tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;

        switch (oops_dump_mode) {
        case DUMP_ALL:
                iter.cpu_file = RING_BUFFER_ALL_CPUS;
                break;
        case DUMP_ORIG:
                iter.cpu_file = raw_smp_processor_id();
                break;
        case DUMP_NONE:
                goto out_enable;
        default:
                printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
                iter.cpu_file = RING_BUFFER_ALL_CPUS;
        }

        printk(KERN_TRACE "Dumping ftrace buffer:\n");

        /* Did function tracer already get disabled? */
        if (ftrace_is_dead()) {
                printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
                printk("#          MAY BE MISSING FUNCTION EVENTS\n");
        }

        /*
         * We need to stop all tracing on all CPUS to read
         * the next buffer. This is a bit expensive, but is
         * not done often. We fill all what we can read,
         * and then release the locks again.
         */

        while (!trace_empty(&iter)) {

                /* Separator line before the very first entry */
                if (!cnt)
                        printk(KERN_TRACE "---------------------------------\n");

                cnt++;

                trace_iterator_reset(&iter);
                iter.iter_flags |= TRACE_FILE_LAT_FMT;

                if (trace_find_next_entry_inc(&iter) != NULL) {
                        int ret;

                        ret = print_trace_line(&iter);
                        if (ret != TRACE_TYPE_NO_CONSUME)
                                trace_consume(&iter);

                        trace_printk_seq(&iter.seq);
                }
                /* A long dump must not trip the NMI watchdog */
                touch_nmi_watchdog();
        }

        if (!cnt)
                printk(KERN_TRACE "   (ftrace buffer empty)\n");
        else
                printk(KERN_TRACE "---------------------------------\n");

 out_enable:
        /* Undo the temporary state changes made above (tracing stays off) */
        tr->trace_flags |= old_userobj;

        for_each_tracing_cpu(cpu) {
                atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
        }
        atomic_dec(&dump_running);
        printk_nmi_direct_exit();
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);

/*
 * Split @buf into an argument vector and, unless it contains no
 * arguments at all, hand it to @createfn.
 *
 * Returns -ENOMEM when the argument vector cannot be allocated, 0 for an
 * empty command, otherwise whatever @createfn returned.
 */
int trace_run_command(const char *buf, int (*createfn)(int, char **))
{
        int argc = 0;
        int ret = 0;
        char **argv;

        argv = argv_split(GFP_KERNEL, buf, &argc);
        if (!argv)
                return -ENOMEM;

        if (argc)
                ret = createfn(argc, argv);

        argv_free(argv);

        return ret;
}

/* Size of the kernel bounce buffer used by trace_parse_run_command() */
#define WRITE_BUFSIZE  4096

/*
 * Parse a user space write consisting of newline separated commands and
 * run each of them through trace_run_command() with @createfn.
 *
 * The input is copied in chunks of at most WRITE_BUFSIZE - 1 bytes.
 * Everything from a '#' to the end of its line is dropped as a comment.
 *
 * Returns the number of bytes consumed, -ENOMEM or -EFAULT on
 * allocation or copy failure, -EINVAL when a single line does not fit
 * into the buffer, or the first non-zero value returned by
 * trace_run_command().
 */
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
                                size_t count, loff_t *ppos,
                                int (*createfn)(int, char **))
{
        char *kbuf, *buf, *tmp;
        int ret = 0;
        size_t done = 0;
        size_t size;

        kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        while (done < count) {
                size = count - done;

                /* Leave room for the terminating NUL added below */
                if (size >= WRITE_BUFSIZE)
                        size = WRITE_BUFSIZE - 1;

                if (copy_from_user(kbuf, buffer + done, size)) {
                        ret = -EFAULT;
                        goto out;
                }
                kbuf[size] = '\0';
                buf = kbuf;
                do {
                        tmp = strchr(buf, '\n');
                        if (tmp) {
                                /* Complete line: terminate it and count the newline */
                                *tmp = '\0';
                                size = tmp - buf + 1;
                        } else {
                                size = strlen(buf);
                                if (done + size < count) {
                                        /*
                                         * The line is cut off by the end of
                                         * this chunk. If it does not start at
                                         * the beginning of the buffer, re-read
                                         * it from user space starting at this
                                         * line.
                                         */
                                        if (buf != kbuf)
                                                break;
                                        /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
                                        pr_warn("Line length is too long: Should be less than %d\n",
                                                WRITE_BUFSIZE - 2);
                                        ret = -EINVAL;
                                        goto out;
                                }
                        }
                        done += size;

                        /* Remove comments */
                        tmp = strchr(buf, '#');

                        if (tmp)
                                *tmp = '\0';

                        ret = trace_run_command(buf, createfn);
                        if (ret)
                                goto out;
                        buf += size;

                } while (done < count);
        }
        ret = done;

out:
        kfree(kbuf);

        return ret;
}

/*
 * Boot time setup of the top level trace array (global_trace): cpumasks,
 * the CPU hotplug callback for the ring buffer, the temp buffer, the
 * saved cmdlines buffer and the ring buffers themselves.  It then
 * registers the nop tracer, enables tracing, installs the panic and die
 * notifiers and adds global_trace to ftrace_trace_arrays.
 *
 * Returns 0 on success, -EPERM when tracing is locked down, or a
 * negative errno with everything allocated so far released again.
 */
__init static int tracer_alloc_buffers(void)
{
        unsigned long ring_buf_size;
        int ret = -ENOMEM;


        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Tracing disabled due to lockdown\n");
                return -EPERM;
        }

        /*
         * Make sure we don't accidentally add more trace options
         * than we have bits for.
         */
        BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);

        if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
                return -ENOMEM;

        if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
                goto out_free_buffer_mask;

        /* Only allocate trace_printk buffers if a trace_printk exists */
        if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt)
                /* Must be called before global_trace.buffer is allocated */
                trace_printk_init_buffers();

        /* To save memory, keep the ring buffer size to its minimum */
        if (ring_buffer_expanded)
                ring_buf_size = trace_buf_size;
        else
                ring_buf_size = 1;

        cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
        cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);

        raw_spin_lock_init(&global_trace.start_lock);

        /*
         * The prepare callbacks allocates some memory for the ring buffer. We
         * don't free the buffer if the CPU goes down. If we were to free
         * the buffer, then the user would lose any trace that was in the
         * buffer. The memory will be removed once the "instance" is removed.
         *
         * NOTE(review): "preapre" in the state name below looks like a
         * misspelling of "prepare"; it is a runtime string, so confirm
         * nothing depends on it before changing it.
         */
        ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
                                      "trace/RB:preapre", trace_rb_cpu_prepare,
                                      NULL);
        if (ret < 0)
                goto out_free_cpumask;
        /* Used for event triggers */
        ret = -ENOMEM;
        temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
        if (!temp_buffer)
                goto out_rm_hp_state;

        if (trace_create_savedcmd() < 0)
                goto out_free_temp_buffer;

        if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
                goto out_free_savedcmd;

        /* TODO: make the number of buffers hot pluggable with CPUS */
        if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
                MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
                goto out_free_pipe_cpumask;
        }
        if (global_trace.buffer_disabled)
                tracing_off();

        /* A bad boot clock name is not fatal: warn and keep the default */
        if (trace_boot_clock) {
                ret = tracing_set_clock(&global_trace, trace_boot_clock);
                if (ret < 0)
                        pr_warn("Trace clock %s not defined, going back to default\n",
                                trace_boot_clock);
        }

        /*
         * register_tracer() might reference current_trace, so it
         * needs to be set before we register anything. This is
         * just a bootstrap of current_trace anyway.
         */
        global_trace.current_trace = &nop_trace;

        global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

        ftrace_init_global_array_ops(&global_trace);

        init_trace_flags_index(&global_trace);

        register_tracer(&nop_trace);

        /* Function tracing may start here (via kernel command line) */
        init_function_trace();

        /* All seems OK, enable tracing */
        tracing_disabled = 0;

        atomic_notifier_chain_register(&panic_notifier_list,
                                       &trace_panic_notifier);

        register_die_notifier(&trace_die_notifier);

        global_trace.flags = TRACE_ARRAY_FL_GLOBAL;

        INIT_LIST_HEAD(&global_trace.systems);
        INIT_LIST_HEAD(&global_trace.events);
        INIT_LIST_HEAD(&global_trace.hist_vars);
        INIT_LIST_HEAD(&global_trace.err_log);
        list_add(&global_trace.list, &ftrace_trace_arrays);

        apply_trace_boot_options();

        register_snapshot_cmd();

        return 0;

        /* Error unwinding: each label releases one resource, newest first */
out_free_pipe_cpumask:
        free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
        free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
        ring_buffer_free(temp_buffer);
out_rm_hp_state:
        cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE);
out_free_cpumask:
        free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
        free_cpumask_var(tracing_buffer_mask);
        return ret;
}

/*
 * early_trace_init - early boot setup of the tracing core
 *
 * When tracepoint_printk is set, allocate the iterator kept in
 * tracepoint_print_iter and enable tracepoint_printk_key.  If that
 * allocation fails, tracepoint_printk is cleared instead of enabling the
 * key.  tracer_alloc_buffers() and init_events() are then called
 * regardless of the outcome above.
 */
void __init early_trace_init(void)
{
        if (tracepoint_printk) {
                /*
                 * Use a zeroing allocation so that no field of the iterator
                 * is left holding stale heap contents before its first use.
                 */
                tracepoint_print_iter =
                        kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
                if (MEM_FAIL(!tracepoint_print_iter,
                             "Failed to allocate trace iterator\n"))
                        tracepoint_printk = 0;
                else
                        static_key_enable(&tracepoint_printk_key.key);
        }
        tracer_alloc_buffers();

        init_events();
}

/*
 * trace_init - boot-time entry point for trace event setup
 *
 * Does nothing beyond forwarding to trace_event_init().
 */
void __init trace_init(void)
{
        trace_event_init();
}

/*
 * clear_boot_tracer - forget a boot tracer name that was never registered
 *
 * Always returns 0.
 */
__init static int clear_boot_tracer(void)
{
        /*
         * The buffer holding the default boot-up tracer name lives in an
         * init section, and this function is called in lateinit.  A name
         * that is still set at this point means the boot tracer was never
         * found, so report it and clear the pointer; otherwise a later
         * registration could access the buffer that is about to be freed.
         */
        if (default_bootup_tracer) {
                printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
                       default_bootup_tracer);
                default_bootup_tracer = NULL;
        }

        return 0;
}

/*
 * Initcall registration: tracer_init_tracefs is registered at the
 * fs_initcall level, clear_boot_tracer at the late_initcall_sync level.
 */
fs_initcall(tracer_init_tracefs);
late_initcall_sync(clear_boot_tracer);

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * tracing_set_default_clock - fall back to the "global" trace clock
 *
 * If no trace clock was requested at boot and the scheduler clock is not
 * stable, switch global_trace to the "global" clock and tell the user how
 * to keep the local clock.
 *
 * Returns 0, or -EPERM when the switch is needed but tracefs is locked
 * down.
 */
__init static int tracing_set_default_clock(void)
{
        /*
         * Nothing to do when a boot clock was given or the clock is stable.
         * sched_clock_stable() is determined in late_initcall.
         */
        if (trace_boot_clock || sched_clock_stable())
                return 0;

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Can not set tracing clock due to lockdown\n");
                return -EPERM;
        }

        printk(KERN_WARNING
               "Unstable clock detected, switching default tracing clock to \"global\"\n"
               "If you want to keep using the local clock, then add:\n"
               "  \"trace_clock=local\"\n"
               "on the kernel command line\n");
        tracing_set_clock(&global_trace, "global");

        return 0;
}
late_initcall_sync(tracing_set_default_clock);
#endif























































    3 









































































































































































































































    2 





















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/* Keep unconverted code working */
#define radix_tree_root                xarray
#define radix_tree_node                xa_node

/*
 * Per-CPU preload state.  One instance per CPU is declared right after
 * the structure as radix_tree_preloads.
 */
struct radix_tree_preload {
        /* Local lock; radix_tree_preload_end() releases it with local_unlock() */
        local_lock_t lock;
        /* NOTE(review): presumably the count of nodes on the @nodes list - confirm */
        unsigned nr;
        /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * The bottom two bits of the slot determine how the remaining bits in the
 * slot are interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted.  While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that storing a NULL entry in the tree is the same as deleting
 * the entry from the tree.
 */
#define RADIX_TREE_ENTRY_MASK                3UL
#define RADIX_TREE_INTERNAL_NODE        2UL

/*
 * Classify a slot value by its bottom two bits: returns true only when
 * those bits equal RADIX_TREE_INTERNAL_NODE (binary 10).
 */
static inline bool radix_tree_is_internal_node(void *ptr)
{
        unsigned long low_bits = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

        return low_bits == RADIX_TREE_INTERNAL_NODE;
}

/*** radix-tree API starts here ***/

#define RADIX_TREE_MAP_SHIFT        XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE        (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK        (RADIX_TREE_MAP_SIZE-1)

#define RADIX_TREE_MAX_TAGS        XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS        XA_MARK_LONGS

#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
                                          RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR        ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT        (__GFP_BITS_SHIFT)

#define RADIX_TREE_INIT(name, mask)        XARRAY_INIT(name, mask)

#define RADIX_TREE(name, mask) \
        struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

/* Returns true when the tree's head pointer is NULL, i.e. it has no entries. */
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
        return !root->xa_head;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:        index of current slot
 * @next_index:        one beyond the last index for this chunk;
 *                radix_tree_chunk_size() is @next_index - @index
 * @tags:        bit-mask for tag-iterating; radix_tree_next_slot() shifts it
 *                right by one per step and tests bit 0 for the slot it is
 *                moving to.  Left uninitialized by radix_tree_iter_init().
 * @node:        node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
 * subinterval of slots contained within one radix tree leaf node.  It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size.  For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 */
struct radix_tree_iter {
        unsigned long        index;
        unsigned long        next_index;
        unsigned long        tags;
        struct radix_tree_node *node;
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held.  Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values.  If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */

/**
 * radix_tree_deref_slot - read the entry a slot currently holds
 * @slot: slot pointer, as returned by radix_tree_lookup_slot
 *
 * Meant to be paired with radix_tree_lookup_slot().  The caller must keep
 * the tree at least read locked from the slot lookup through this
 * dereference.  That is not required when the write lock is held (i.e.
 * items cannot be concurrently inserted).
 *
 * When only the read lock is held, the result must be validated with
 * radix_tree_deref_retry before it is used.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
        void *entry = rcu_dereference(*slot);

        return entry;
}

/**
 * radix_tree_deref_slot_protected - read a slot's entry under the tree lock
 * @slot: slot pointer, as returned by radix_tree_lookup_slot
 * @treelock: the tree lock held by the caller
 *
 * Counterpart of radix_tree_deref_slot for callers that do not hold the RCU
 * read lock; they must hold the tree lock instead so that parallel updates
 * are excluded.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
{
        void *entry;

        entry = rcu_dereference_protected(*slot, lockdep_is_held(treelock));
        return entry;
}

/**
 * radix_tree_deref_retry - does a dereferenced slot value call for a retry?
 * @arg:        pointer returned by radix_tree_deref_slot
 *
 * Must be used together with radix_tree_deref_slot.  A retry is signalled
 * when @arg is an internal entry rather than a data or value entry.
 *
 * Return: 0 if retry is not required, otherwise retry is required.
 */
static inline int radix_tree_deref_retry(void *arg)
{
        if (unlikely(radix_tree_is_internal_node(arg)))
                return 1;

        return 0;
}

/**
 * radix_tree_exception - is a dereferenced slot value anything but a plain pointer?
 * @arg:        value returned by radix_tree_deref_slot
 *
 * Checks whether either of the two low bits covered by
 * RADIX_TREE_ENTRY_MASK is set in @arg.
 *
 * Return: 0 if well-aligned pointer, non-0 if either kind of exception.
 */
static inline int radix_tree_exception(void *arg)
{
        unsigned long low_bits = (unsigned long)arg & RADIX_TREE_ENTRY_MASK;

        return unlikely(low_bits != 0);
}

int radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
                          struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
                                        unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
                          void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
                const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
                             void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
                        struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
                void **results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

/*
 * radix_tree_preload_end - release the per-CPU preload lock
 *
 * Unlocks radix_tree_preloads.lock with local_unlock().
 * NOTE(review): presumably the counterpart of radix_tree_preload() /
 * radix_tree_maybe_preload(), which are only declared here - confirm.
 */
static inline void radix_tree_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max);

/*
 * Values for the 'flags' argument of radix_tree_next_chunk() and
 * radix_tree_next_slot(): a tag index combined with the mode bits below.
 */
enum {
        RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
        RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
        RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
};

/**
 * radix_tree_iter_init - prepare an iterator for a new walk
 *
 * @iter:        pointer to iterator state
 * @start:        iteration starting index
 * Returns:        NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
        /*
         * iter->tags is deliberately left untouched: radix_tree_next_chunk()
         * fills it in after a successful tagged chunk lookup, and nobody
         * looks at it after an unsuccessful or non-tagged one.
         */
        iter->next_index = start;

        /*
         * A zero index bypasses the next_index overflow protection; see the
         * comment in radix_tree_next_chunk() for details.
         */
        iter->index = 0;

        return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if there are no more left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * Also it fills @iter with data about chunk: position in the tree (index),
 * its end (next_index), and constructs a bit mask for tagged iterating (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - find the slot for one index
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * Returns the slot containing @index and updates @iter to describe the
 * entry when @index is present in the radix tree; returns NULL when it is
 * not present.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
                        struct radix_tree_iter *iter, unsigned long index)
{
        void __rcu **slot;

        /* Start the iterator at @index, then fetch the chunk holding it */
        radix_tree_iter_init(iter, index);
        slot = radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);

        return slot;
}

/**
 * radix_tree_iter_retry - restart the iteration at the current index
 * @iter:        iterator state
 *
 * When a tree is walked under nothing but the RCU lock, racing with a
 * deletion or creation can expose a slot for which
 * radix_tree_deref_retry() returns true.  In that case call this function
 * and continue the iteration.
 *
 * Always returns NULL.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
        /* No tag bits remain for the abandoned chunk */
        iter->tags = 0;
        /* The next chunk lookup begins at the current index again */
        iter->next_index = iter->index;

        return NULL;
}

/* Compute the index that lies @slots past the iterator's current index. */
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
        unsigned long advanced = iter->index;

        advanced += slots;
        return advanced;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - number of slots from the current index to chunk end
 *
 * @iter:        pointer to radix tree iterator
 * Returns:        current chunk size
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
        unsigned long first = iter->index;
        unsigned long end = iter->next_index;

        return end - first;
}

/**
 * radix_tree_next_slot - advance to the next slot inside the current chunk
 *
 * @slot:        pointer to current slot
 * @iter:        pointer to iterator state
 * @flags:        RADIX_TREE_ITER_*, should be constant
 * Returns:        pointer to next slot, or NULL if there are no more left
 *
 * On a successful lookup @iter->index is updated.  A tagged lookup also
 * consumes bits of @iter->tags.
 *
 * 'slot' may legitimately be NULL on entry after radix_tree_iter_resume()
 * or radix_tree_iter_retry().  It is never dereferenced in those cases
 * because either:
 * a) the iteration is tagged and iter->tags has been set to 0, or
 * b) the iteration is non-tagged, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
{
        long remaining;

        if (flags & RADIX_TREE_ITER_TAGGED) {
                unsigned skip;

                /* Drop the bit belonging to the slot we are leaving */
                iter->tags >>= 1;
                if (unlikely(!iter->tags))
                        return NULL;

                /* Common case: the very next slot is tagged */
                if (likely(iter->tags & 1ul)) {
                        iter->index = __radix_tree_iter_add(iter, 1);
                        return slot + 1;
                }

                /* A contiguous walk ends at the first untagged slot */
                if (flags & RADIX_TREE_ITER_CONTIG)
                        return NULL;

                /* Jump straight to the next tagged slot in this chunk */
                skip = __ffs(iter->tags);
                iter->tags >>= skip;
                skip++;
                iter->index = __radix_tree_iter_add(iter, skip);
                return slot + skip;
        }

        for (remaining = radix_tree_chunk_size(iter) - 1; remaining > 0;
             remaining--) {
                slot++;
                iter->index = __radix_tree_iter_add(iter, 1);

                if (likely(*slot))
                        return slot;

                if (flags & RADIX_TREE_ITER_CONTIG) {
                        /* forbid switching to the next chunk */
                        iter->next_index = 0;
                        return NULL;
                }
        }

        return NULL;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * radix_tree_iter_init() returns NULL, so the first pass through the loop
 * condition fetches the first chunk with radix_tree_next_chunk().  Whenever
 * radix_tree_next_slot() returns NULL, the condition asks for the next
 * chunk; the loop ends once radix_tree_next_chunk() returns NULL.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)                \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;        \
             slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 * @tag:        tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * @tag is OR-ed with RADIX_TREE_ITER_TAGGED to build the flags passed to
 * radix_tree_next_chunk() and radix_tree_next_slot().  It is parenthesized
 * in the expansion so that an argument containing a lower-precedence
 * operator (for example a conditional expression) is combined as a whole.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)        \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter,                \
                              RADIX_TREE_ITER_TAGGED | (tag))) ;        \
             slot = radix_tree_next_slot(slot, iter,                        \
                                RADIX_TREE_ITER_TAGGED | (tag)))

#endif /* _LINUX_RADIX_TREE_H */












































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IO_H
#define _ASM_X86_IO_H

/*
 * This file contains the definitions for the x86 IO instructions
 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
 * versions of the single-IO instructions (inb_p/inw_p/..).
 *
 * This file is not meant to be obfuscating: it's just complicated
 * to (a) handle it all in a way that makes gcc able to optimize it
 * as well as possible and (b) trying to avoid writing the same thing
 * over and over again with slight variations and possibly making a
 * mistake somewhere.
 */

/*
 * Thanks to James van Artsdalen for a better timing-fix than
 * the two short jumps: using outb's to a nonexistent port seems
 * to guarantee better timings even on fast machines.
 *
 * On the other hand, I'd like to be sure of a non-existent port:
 * I feel a bit unsafe about using 0x80 (should be safe, though)
 *
 *                Linus
 */

 /*
  *  Bit simplified and optimized by Jan Hubicka
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
  *
  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
  *  isa_read[wl] and isa_write[wl] fixed
  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
  */

#define ARCH_HAS_IOREMAP_WC
#define ARCH_HAS_IOREMAP_WT

#include <linux/string.h>
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>

/*
 * build_mmio_read - generate an MMIO read accessor
 *
 * Expands to a static inline function @name that takes a
 * 'const volatile void __iomem *' and returns @type.  The body is one
 * "mov<size>" asm statement from the memory operand at @addr into an
 * output operand constrained by @reg.  @barrier is pasted after the input
 * operand list, so callers pass either nothing or a clobber list such as
 * :"memory".
 */
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
:"m" (*(volatile type __force *)addr) barrier); return ret; }

/*
 * build_mmio_write - generate an MMIO write accessor
 *
 * Expands to a static inline function @name that takes a value of @type
 * and a 'volatile void __iomem *' and returns nothing.  The body is one
 * "mov<size>" asm statement from an input operand constrained by @reg to
 * the memory operand at @addr; there are no output operands.  @barrier is
 * pasted after the input operand list, so callers pass either nothing or
 * a clobber list such as :"memory".
 */
#define build_mmio_write(name, size, type, reg, barrier) \
static inline void name(type val, volatile void __iomem *addr) \
{ asm volatile("mov" size " %0,%1": :reg (val), \
"m" (*(volatile type __force *)addr) barrier); }

/* 8/16/32-bit reads whose asm carries a "memory" clobber */
build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
build_mmio_read(readl, "l", unsigned int, "=r", :"memory")

/* Same reads, generated without the "memory" clobber */
build_mmio_read(__readb, "b", unsigned char, "=q", )
build_mmio_read(__readw, "w", unsigned short, "=r", )
build_mmio_read(__readl, "l", unsigned int, "=r", )

/* 8/16/32-bit writes whose asm carries a "memory" clobber */
build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
build_mmio_write(writew, "w", unsigned short, "r", :"memory")
build_mmio_write(writel, "l", unsigned int, "r", :"memory")

/* Same writes, generated without the "memory" clobber */
build_mmio_write(__writeb, "b", unsigned char, "q", )
build_mmio_write(__writew, "w", unsigned short, "r", )
build_mmio_write(__writel, "l", unsigned int, "r", )

#define readb readb
#define readw readw
#define readl readl
#define readb_relaxed(a) __readb(a)
#define readw_relaxed(a) __readw(a)
#define readl_relaxed(a) __readl(a)
#define __raw_readb __readb
#define __raw_readw __readw
#define __raw_readl __readl

#define writeb writeb
#define writew writew
#define writel writel
#define writeb_relaxed(v, a) __writeb(v, a)
#define writew_relaxed(v, a) __writew(v, a)
#define writel_relaxed(v, a) __writel(v, a)
#define __raw_writeb __writeb
#define __raw_writew __writew
#define __raw_writel __writel

#ifdef CONFIG_X86_64

build_mmio_read(readq, "q", u64, "=r", :"memory")
build_mmio_read(__readq, "q", u64, "=r", )
build_mmio_write(writeq, "q", u64, "r", :"memory")
build_mmio_write(__writeq, "q", u64, "r", )

#define readq_relaxed(a)        __readq(a)
#define writeq_relaxed(v, a)        __writeq(v, a)

#define __raw_readq                __readq
#define __raw_writeq                __writeq

/* Let people know that we have them */
#define readq                        readq
#define writeq                        writeq

#endif

#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);

/**
 *        virt_to_phys        -        map virtual addresses to physical
 *        @address: address to remap
 *
 *        The returned physical address is the physical (CPU) mapping for
 *        the memory address given. It is only valid to use this function on
 *        addresses directly mapped or allocated via kmalloc.
 *
 *        This function does not give bus mappings for DMA transfers. In
 *        almost all conceivable cases a device driver should not be using
 *        this function.
 *
 *        Return: the result of __pa() applied to @address.
 */

static inline phys_addr_t virt_to_phys(volatile void *address)
{
        return __pa(address);
}
#define virt_to_phys virt_to_phys

/**
 *        phys_to_virt        -        map physical address to virtual
 *        @address: address to remap
 *
 *        The returned virtual address is a current CPU mapping for
 *        the memory address given. It is only valid to use this function on
 *        addresses that have a kernel mapping.
 *
 *        This function does not handle bus mappings for DMA transfers. In
 *        almost all conceivable cases a device driver should not be using
 *        this function.
 *
 *        Return: the result of __va() applied to @address.
 */

static inline void *phys_to_virt(phys_addr_t address)
{
        return __va(address);
}
#define phys_to_virt phys_to_virt

/*
 * Change "struct page" to physical address.
 */
#define page_to_phys(page)    ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)

/*
 * ISA I/O bus memory addresses are 1:1 with the physical address.
 * However, we truncate the address to unsigned int to avoid undesirable
 * promitions in legacy drivers.
 */
static inline unsigned int isa_virt_to_bus(volatile void *address)
{
        return (unsigned int)virt_to_phys(address);
}
#define isa_bus_to_virt                phys_to_virt

/*
 * However PCI ones are not necessarily 1:1 and therefore these interfaces
 * are forbidden in portable PCI drivers.
 *
 * Allow them on x86 for legacy drivers, though.
 */
#define virt_to_bus virt_to_phys
#define bus_to_virt phys_to_virt

/*
 * The default ioremap() behavior is non-cached; if you need something
 * else, you probably want one of the following.
 */
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted

/**
 * ioremap     -   map bus memory into CPU space
 * @offset:    bus address of the memory
 * @size:      size of the resource to map
 *
 * ioremap performs a platform specific sequence of operations to
 * make bus memory CPU accessible via the readb/readw/readl/writeb/
 * writew/writel functions and the other mmio helpers. The returned
 * address is not guaranteed to be usable directly as a virtual
 * address.
 *
 * If the area you are trying to map is a PCI BAR you should have a
 * look at pci_iomap().
 */
void __iomem *ioremap(resource_size_t offset, unsigned long size);
#define ioremap ioremap

extern void iounmap(volatile void __iomem *addr);
#define iounmap iounmap

extern void set_iounmap_nonlazy(void);

#ifdef __KERNEL__

/*
 * Block copy/fill helpers for __iomem regions (implemented elsewhere).
 * The qualifiers show the direction: memcpy_fromio() reads through the
 * const __iomem pointer (2nd argument) into ordinary memory, memcpy_toio()
 * writes ordinary memory (2nd argument) out through the __iomem pointer,
 * and memset_io() fills an __iomem region with an int value.  The trailing
 * size_t is presumably a byte count, as with memcpy()/memset() -- confirm.
 */
void memcpy_fromio(void *, const volatile void __iomem *, size_t);
void memcpy_toio(volatile void __iomem *, const void *, size_t);
void memset_io(volatile void __iomem *, int, size_t);

/* Self-referential defines: make the names visible to the preprocessor. */
#define memcpy_fromio memcpy_fromio
#define memcpy_toio memcpy_toio
#define memset_io memset_io

#include <asm-generic/iomap.h>

/*
 * ISA space is 'always mapped' on a typical x86 system, no need to
 * explicitly ioremap() it. The fact that the ISA IO space is mapped
 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
 * are physical addresses. The following constant pointer can be
 * used as the IO-area pointer (it can be iounmapped as well, so the
 * analogy with PCI is quite large):
 *
 * The cast gives the constant the type 'char __iomem *', so byte offsets
 * can be added to it directly.
 */
#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))

#endif /* __KERNEL__ */

/*
 * I/O delay primitive, defined elsewhere.  When CONFIG_PARAVIRT is not
 * set, slow_down_io() below is built directly on top of it.
 */
extern void native_io_delay(void);

/*
 * NOTE(review): presumably io_delay_type selects the delay method and
 * io_delay_init() sets it up; neither is used in this header -- confirm
 * at their definitions.
 */
extern int io_delay_type;
extern void io_delay_init(void);

#if defined(CONFIG_PARAVIRT)
#include <asm/paravirt.h>
#else

/*
 * Pause after a port access: one native_io_delay() normally, four in
 * total when REALLY_SLOW_IO is defined.
 */
static inline void slow_down_io(void)
{
#ifdef REALLY_SLOW_IO
        int extra;

        /* Three additional delays on top of the unconditional one below. */
        for (extra = 0; extra < 3; extra++)
                native_io_delay();
#endif
        native_io_delay();
}

#endif

#ifdef CONFIG_AMD_MEM_ENCRYPT
#include <linux/jump_label.h>

extern struct static_key_false sev_enable_key;
static inline bool sev_key_active(void)
{
        return static_branch_unlikely(&sev_enable_key);
}

#else /* !CONFIG_AMD_MEM_ENCRYPT */

/* Without CONFIG_AMD_MEM_ENCRYPT the key does not exist: always inactive. */
static inline bool sev_key_active(void)
{
        return false;
}

#endif /* CONFIG_AMD_MEM_ENCRYPT */

/*
 * BUILDIO(bwl, bw, type) - generate the port I/O accessors for one width.
 *
 * @bwl:  suffix pasted onto both the function names and the instruction
 *        mnemonics (out##bwl, "out" #bwl, ...).
 * @bw:   operand-size modifier spliced into the asm template in front of
 *        operand 0 ("%b0", "%w0", or plain "%0" when left empty).
 * @type: C integer type name used after 'unsigned' for the data value.
 *
 * Functions produced for a given suffix X:
 *   outX(value, port) / inX(port)
 *       Single out/in instruction.  The data lives in the "a" register
 *       and the port is passed with the "Nd" constraint, formatted via
 *       the 'w' modifier.
 *   outX_p(value, port) / inX_p(port)
 *       Same access followed by slow_down_io().
 *   outsX(port, addr, count) / insX(port, addr, count)
 *       Transfer 'count' elements between memory at 'addr' and 'port'.
 *       Normally a single "rep; outs"/"rep; ins" with addr in the S/D
 *       register, count in the c register and a "memory" clobber.  When
 *       sev_key_active() returns true the transfer is instead done one
 *       element at a time with outX()/inX().
 *       NOTE(review): presumably because the rep string form cannot be
 *       used while SEV is active -- the reason is not stated here, confirm.
 *       NOTE(review): the outs fallback casts away the const on 'addr';
 *       it only reads through the pointer.
 */
#define BUILDIO(bwl, bw, type)                                                \
static inline void out##bwl(unsigned type value, int port)                \
{                                                                        \
        asm volatile("out" #bwl " %" #bw "0, %w1"                        \
                     : : "a"(value), "Nd"(port));                        \
}                                                                        \
                                                                        \
static inline unsigned type in##bwl(int port)                                \
{                                                                        \
        unsigned type value;                                                \
        asm volatile("in" #bwl " %w1, %" #bw "0"                        \
                     : "=a"(value) : "Nd"(port));                        \
        return value;                                                        \
}                                                                        \
                                                                        \
static inline void out##bwl##_p(unsigned type value, int port)                \
{                                                                        \
        out##bwl(value, port);                                                \
        slow_down_io();                                                        \
}                                                                        \
                                                                        \
static inline unsigned type in##bwl##_p(int port)                        \
{                                                                        \
        unsigned type value = in##bwl(port);                                \
        slow_down_io();                                                        \
        return value;                                                        \
}                                                                        \
                                                                        \
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{                                                                        \
        if (sev_key_active()) {                                                \
                unsigned type *value = (unsigned type *)addr;                \
                while (count) {                                                \
                        out##bwl(*value, port);                                \
                        value++;                                        \
                        count--;                                        \
                }                                                        \
        } else {                                                        \
                asm volatile("rep; outs" #bwl                                \
                             : "+S"(addr), "+c"(count)                        \
                             : "d"(port) : "memory");                        \
        }                                                                \
}                                                                        \
                                                                        \
static inline void ins##bwl(int port, void *addr, unsigned long count)        \
{                                                                        \
        if (sev_key_active()) {                                                \
                unsigned type *value = (unsigned type *)addr;                \
                while (count) {                                                \
                        *value = in##bwl(port);                                \
                        value++;                                        \
                        count--;                                        \
                }                                                        \
        } else {                                                        \
                asm volatile("rep; ins" #bwl                                \
                             : "+D"(addr), "+c"(count)                        \
                             : "d"(port) : "memory");                        \
        }                                                                \
}

/*
 * Instantiate the accessors for the three port widths:
 *   b -> unsigned char,  operand modifier 'b'
 *   w -> unsigned short, operand modifier 'w'
 *   l -> unsigned int,   no operand modifier
 */
BUILDIO(b, b, char)
BUILDIO(w, w, short)
BUILDIO(l, , int)

/*
 * Self-referential defines for every generated accessor, so each name is
 * also visible to the preprocessor.
 * NOTE(review): presumably consumed by <asm-generic/io.h>, included below,
 * to skip its own fallbacks -- confirm against that header.
 */
#define inb inb
#define inw inw
#define inl inl
#define inb_p inb_p
#define inw_p inw_p
#define inl_p inl_p
#define insb insb
#define insw insw
#define insl insl

#define outb outb
#define outw outw
#define outl outl
#define outb_p outb_p
#define outw_p outw_p
#define outl_p outl_p
#define outsb outsb
#define outsw outsw
#define outsl outsl

/*
 * Paired helpers, implemented elsewhere: xlate_dev_mem_ptr() turns a
 * physical address into a kernel pointer, and unxlate_dev_mem_ptr() takes
 * both the physical address and that pointer back.
 * NOTE(review): presumably the second call releases whatever the first
 * set up -- confirm at the definitions.
 */
extern void *xlate_dev_mem_ptr(phys_addr_t phys);
extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);

#define xlate_dev_mem_ptr xlate_dev_mem_ptr
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr

/*
 * Applies a page cache mode (@pcm) to the virtual range
 * [@vaddr, @vaddr + @size); returns an int.
 * NOTE(review): return convention (0 / negative errno?) is not visible
 * here -- confirm at the definition.
 */
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
                                enum page_cache_mode pcm);
/*
 * Further ioremap() variants with the same (offset, size) signature.
 * NOTE(review): "wc"/"wt" presumably stand for write-combining and
 * write-through -- confirm.
 */
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
#define ioremap_wc ioremap_wc
extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
#define ioremap_wt ioremap_wt

/* Predicate on a page-table entry pointer; implemented elsewhere. */
extern bool is_early_ioremap_ptep(pte_t *ptep);

/*
 * Largest 16-bit value.  The accessors generated by BUILDIO format the
 * port operand with the 16-bit 'w' modifier.
 * NOTE(review): presumably the highest valid port number -- confirm.
 */
#define IO_SPACE_LIMIT 0xffff

/*
 * Pulled in only after all the arch-specific names above are defined.
 * NOTE(review): the #undef presumably drops a fallback PCI_IOBASE that the
 * generic header defines but that has no meaning here -- confirm against
 * <asm-generic/io.h>.
 */
#include <asm-generic/io.h>
#undef PCI_IOBASE

/*
 * Declarations available only with CONFIG_MTRR.  Both int-returning
 * functions are __must_check, so callers may not ignore the result.
 * NOTE(review): presumably arch_phys_wc_add() returns a handle for the
 * (base, size) range that is later passed to arch_phys_wc_del() and
 * arch_phys_wc_index() -- confirm at the definitions.
 */
#ifdef CONFIG_MTRR
extern int __must_check arch_phys_wc_index(int handle);
#define arch_phys_wc_index arch_phys_wc_index

extern int __must_check arch_phys_wc_add(unsigned long base,
                                         unsigned long size);
extern void arch_phys_wc_del(int handle);
#define arch_phys_wc_add arch_phys_wc_add
#endif

/*
 * Declarations available only with CONFIG_X86_PAT: a reserve/free pair
 * operating on a (start, size) resource range.  Only the reserve call
 * returns a status.
 * NOTE(review): return convention is not visible here -- confirm.
 */
#ifdef CONFIG_X86_PAT
extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size);
extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size);
#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
#endif

/*
 * Boolean query over the range (@offset, @size) with caller-supplied
 * @flags; implemented elsewhere.
 * NOTE(review): presumably an arch hook consulted by memremap() -- confirm.
 */
extern bool arch_memremap_can_ram_remap(resource_size_t offset,
                                        unsigned long size,
                                        unsigned long flags);
#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap

/*
 * Boolean query over the physical range (@phys_addr, @size); implemented
 * elsewhere.
 * NOTE(review): presumably tells whether the range must be accessed as
 * encrypted memory -- confirm at the definition.
 */
extern bool phys_mem_access_encrypted(unsigned long phys_addr,
                                      unsigned long size);

/**
 * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
 * @dst: destination, in MMIO space (must be 512-bit aligned)
 * @src: source
 * @count: number of 512 bits quantities to submit
 *
 * Submit data from kernel space to MMIO space, in units of 512 bits at a
 * time.  Order of access is not guaranteed, nor is a memory barrier
 * performed afterwards.
 *
 * Warning: Do not use this helper unless your driver has checked that the CPU
 * instruction is supported on the platform.
 */
static inline void iosubmit_cmds512(void __iomem *dst, const void *src,
                                    size_t count)
{
        const u8 *chunk = src;
        const u8 *const limit = chunk + count * 64;

        /*
         * Step through the source 64 bytes (512 bits) at a time; every
         * chunk is submitted to the same destination address.
         */
        for (; chunk < limit; chunk += 64)
                movdir64b(dst, chunk);
}

#endif /* _ASM_X86_IO_H */



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

/*
 * NOTE(review): presumably tracks how many hstates[] slots are in use --
 * confirm against the code that registers hstates.
 */
int hugetlb_max_hstate __read_mostly;
/* NOTE(review): presumably the hstates[] index of the default size -- confirm. */
unsigned int default_hstate_idx;
/* Table of huge page state descriptors, sized for HUGE_MAX_HSTATE entries. */
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
/* CMA area pointers, one slot per node id (MAX_NUMNODES); CONFIG_CMA only. */
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
/*
 * Init-time only (__initdata).
 * NOTE(review): presumably the requested hugetlb CMA size -- confirm units.
 */
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.  Starts at UINT_MAX, i.e. "no size seen yet".
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

/* List head for huge pages gathered at boot; init-time only (__initdata). */
__initdata LIST_HEAD(huge_boot_pages);

/* State used while parsing the command line; all init-time only. */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 *
 * NOTE(review): num_fault_mutexes is presumably the number of entries in
 * hugetlb_fault_mutex_table -- confirm where the table is allocated.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/*
 * Test the "freed" marker of a huge page.  The marker is kept in the private
 * field of the page located four struct pages after @head; it is considered
 * set when that field holds -1UL.
 */
static inline bool PageHugeFreed(struct page *head)
{
        unsigned long marker = page_private(head + 4);

        return marker == -1UL;
}

/*
 * Set the "freed" marker of a huge page by storing -1UL in the private field
 * of the page located four struct pages after @head.
 */
static inline void SetPageHugeFreed(struct page *head)
{
        struct page *marker_page = head + 4;

        set_page_private(marker_page, -1UL);
}

/*
 * Clear the "freed" marker of a huge page by zeroing the private field of the
 * page located four struct pages after @head.
 */
static inline void ClearPageHugeFreed(struct page *head)
{
        struct page *marker_page = head + 4;

        set_page_private(marker_page, 0);
}

/*
 * Forward declarations of static functions that are referenced before their
 * definitions; hugetlb_acct_memory() is called by the subpool helpers below.
 */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, bool take_locks);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
        bool free = (spool->count == 0) && (spool->used_hpages == 0);

        spin_unlock(&spool->lock);

        /* If no pages are used, and no other handles to the subpool
         * remain, give up any reservations based on minimum size and
         * free the subpool */
        if (free) {
                if (spool->min_hpages != -1)
                        hugetlb_acct_memory(spool->hstate,
                                                -spool->min_hpages);
                kfree(spool);
        }
}

/*
 * Allocate and initialise a subpool for hstate @h.
 *
 * @max_hpages and @min_hpages are stored as given; -1 disables the
 * corresponding accounting.  When a minimum is requested it is charged via
 * hugetlb_acct_memory() up front and recorded as the initial reserve.
 *
 * Returns the new subpool holding one reference (count == 1), or NULL if the
 * allocation or the minimum-size charge fails.
 */
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages)
{
        struct hugepage_subpool *spool = kzalloc(sizeof(*spool), GFP_KERNEL);

        if (spool == NULL)
                return NULL;

        spin_lock_init(&spool->lock);
        spool->hstate = h;
        spool->count = 1;
        spool->max_hpages = max_hpages;
        spool->min_hpages = min_hpages;

        if (min_hpages != -1) {
                /* Charge the minimum size now; give up if it cannot be met. */
                if (hugetlb_acct_memory(h, min_hpages) != 0)
                        goto err_free;
        }
        spool->rsv_hpages = min_hpages;

        return spool;

err_free:
        kfree(spool);
        return NULL;
}

/*
 * Drop one handle on @spool.  It is a bug to call this when the handle count
 * is already zero.  The lock taken here is released by
 * unlock_or_release_subpool(), which may also free the subpool.
 */
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
        spin_lock(&spool->lock);

        BUG_ON(!spool->count);
        spool->count -= 1;

        unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                      long delta)
{
        long ret = delta;

        if (!spool)
                return ret;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1) {                /* maximum size accounting */
                if ((spool->used_hpages + delta) <= spool->max_hpages)
                        spool->used_hpages += delta;
                else {
                        ret = -ENOMEM;
                        goto unlock_ret;
                }
        }

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->rsv_hpages) {
                if (delta > spool->rsv_hpages) {
                        /*
                         * Asking for more reserves than those already taken on
                         * behalf of subpool.  Return difference.
                         */
                        ret = delta - spool->rsv_hpages;
                        spool->rsv_hpages = 0;
                } else {
                        ret = 0;        /* reserves already accounted for */
                        spool->rsv_hpages -= delta;
                }
        }

unlock_ret:
        spin_unlock(&spool->lock);
        return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        long ret = delta;

        if (!spool)
                return delta;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1)                /* maximum size accounting */
                spool->used_hpages -= delta;

         /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                if (spool->rsv_hpages + delta <= spool->min_hpages)
                        ret = 0;
                else
                        ret = spool->rsv_hpages + delta - spool->min_hpages;

                spool->rsv_hpages += delta;
                if (spool->rsv_hpages > spool->min_hpages)
                        spool->rsv_hpages = spool->min_hpages;
        }

        /*
         * If hugetlbfs_put_super couldn't free spool due to an outstanding
         * quota reference, free it now.
         */
        unlock_or_release_subpool(spool);

        return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
        return HUGETLBFS_SB(inode->i_sb)->spool;
}

/* Return the subpool of the inode behind the file mapped by @vma. */
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);

        return subpool_inode(inode);
}

/*
 * Take the first file_region off resv->region_cache, set it up to describe
 * [from, to) and hand it to the caller.  The cache must not be empty.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
        struct file_region *entry;

        VM_BUG_ON(resv->region_cache_count <= 0);

        entry = list_first_entry(&resv->region_cache, struct file_region, link);
        list_del(&entry->link);
        resv->region_cache_count -= 1;

        entry->to = to;
        entry->from = from;

        return entry;
}

/*
 * Copy the cgroup uncharge info (reservation counter and css) from @rg to
 * @nrg, taking an extra css reference for the copy when a css is present.
 * Does nothing unless CONFIG_CGROUP_HUGETLB is enabled.
 */
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
                                              struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        struct cgroup_subsys_state *css = rg->css;

        nrg->css = css;
        nrg->reservation_counter = rg->reservation_counter;
        if (css != NULL)
                css_get(css);
#endif
}

/*
 * Record in @nrg which hugetlb cgroup (if any) must be uncharged when the
 * region goes away.  With a NULL @h_cg the uncharge fields are cleared.
 * Does nothing unless CONFIG_CGROUP_HUGETLB is enabled.
 */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                                                struct hstate *h,
                                                struct resv_map *resv,
                                                struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (!h_cg) {
                nrg->css = NULL;
                nrg->reservation_counter = NULL;
                return;
        }

        nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)];
        nrg->css = &h_cg->css;

        /*
         * The caller holds exactly one h_cg->css reference for the whole
         * contiguous reservation range, yet that range may end up split
         * across several file_regions when other regions already sit inside
         * it.  So that every file_region owns exactly one css reference,
         * take one here per region and leave the caller's reference alone.
         */
        css_get(&h_cg->css);

        /* The first recorded entry fixes pages_per_hpage for the map ... */
        if (resv->pages_per_hpage == 0)
                resv->pages_per_hpage = pages_per_huge_page(h);

        /* ... and every entry in a resv_map has to agree with it. */
        VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
#endif
}

/*
 * Drop the css reference held by @rg, if it holds one.  Does nothing unless
 * CONFIG_CGROUP_HUGETLB is enabled.
 */
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        struct cgroup_subsys_state *css = rg->css;

        if (css != NULL)
                css_put(css);
#endif
}

/*
 * Tell whether two file_regions carry identical cgroup uncharge info (same
 * reservation counter and same css).  Either pointer being NULL yields false.
 * Without CONFIG_CGROUP_HUGETLB there is no such info and the answer is
 * always true.
 */
static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (!rg || !org)
                return false;
        if (rg->reservation_counter != org->reservation_counter)
                return false;

        return rg->css == org->css;
#else
        return true;
#endif
}

/* Unlink @rg from its list, drop its uncharge reference and free it. */
static void release_coalesced_region(struct file_region *rg)
{
        list_del(&rg->link);
        put_uncharge_info(rg);
        kfree(rg);
}

/*
 * Try to merge @rg with its list neighbours in resv->regions.
 *
 * A neighbour is merged when it is a real entry (not the list head), it is
 * exactly adjacent to @rg, and both carry the same uncharge info.  Merging
 * with the previous entry extends that entry and frees @rg; the merged
 * entry is then checked against the following one in the same way.  @rg may
 * therefore be freed by this call.
 */
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
        struct list_head *head = &resv->regions;
        struct file_region *left, *right;

        left = list_prev_entry(rg, link);
        if (&left->link != head && left->to == rg->from &&
            has_same_uncharge_info(left, rg)) {
                left->to = rg->to;
                release_coalesced_region(rg);

                /* Continue with the entry that absorbed rg. */
                rg = left;
        }

        right = list_next_entry(rg, link);
        if (&right->link != head && right->from == rg->to &&
            has_same_uncharge_info(right, rg)) {
                right->from = rg->from;
                release_coalesced_region(rg);
        }
}

/*
 * Walk resv->regions and account for the parts of [f, t) that are not yet
 * covered by an existing file_region.
 *
 * Must be called with resv->lock held.
 *
 * The function runs in one of two modes, selected by regions_needed:
 *
 *  - regions_needed != NULL (counting mode): the list is not modified.  The
 *    number of uncovered pages is returned and *regions_needed is set to the
 *    number of file_region entries that would have to be taken from the cache
 *    to add this range.
 *
 *  - regions_needed == NULL (modifying mode): for every uncovered gap a
 *    file_region is taken from resv->region_cache, tagged with the uncharge
 *    info for h_cg/h, inserted into the list and coalesced with its
 *    neighbours.  The cache must already hold enough entries.
 *
 * Returns the number of pages in [f, t) that were (or would be) added; this
 * is never negative.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                                     struct hugetlb_cgroup *h_cg,
                                     struct hstate *h, long *regions_needed)
{
        long add = 0;
        struct list_head *head = &resv->regions;
        /* Everything in [f, last_accounted_offset) has been dealt with. */
        long last_accounted_offset = f;
        struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

        if (regions_needed)
                *regions_needed = 0;

        /* In this loop, we essentially handle an entry for the range
         * [last_accounted_offset, rg->from), at every iteration, with some
         * bounds checking.
         */
        list_for_each_entry_safe(rg, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
                if (rg->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
                        if (rg->to > last_accounted_offset)
                                last_accounted_offset = rg->to;
                        continue;
                }

                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
                if (rg->from > t)
                        break;

                /* Add an entry for last_accounted_offset -> rg->from, and
                 * update last_accounted_offset.
                 */
                if (rg->from > last_accounted_offset) {
                        add += rg->from - last_accounted_offset;
                        if (!regions_needed) {
                                nrg = get_file_region_entry_from_cache(
                                        resv, last_accounted_offset, rg->from);
                                record_hugetlb_cgroup_uncharge_info(h_cg, h,
                                                                    resv, nrg);
                                /* Insert the new entry right before rg. */
                                list_add(&nrg->link, rg->link.prev);
                                coalesce_file_region(resv, nrg);
                        } else
                                *regions_needed += 1;
                }

                last_accounted_offset = rg->to;
        }

        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         *
         * At this point rg is either the region the loop broke out on (it
         * starts beyond t) or, if the loop ran to completion, the cursor
         * corresponding to the list head itself.  In both cases inserting
         * after rg->link.prev places the new entry right before rg, which
         * keeps the list ordered.
         */
        if (last_accounted_offset < t) {
                add += t - last_accounted_offset;
                if (!regions_needed) {
                        nrg = get_file_region_entry_from_cache(
                                resv, last_accounted_offset, t);
                        record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
                        list_add(&nrg->link, rg->link.prev);
                        coalesce_file_region(resv, nrg);
                } else
                        *regions_needed += 1;
        }

        VM_BUG_ON(add < 0);
        return add;
}

/*
 * Make sure resv->region_cache holds enough file_region entries for all adds
 * already in progress plus @regions_needed more.
 *
 * Must be called with resv->lock acquired.  The lock is dropped while new
 * entries are allocated, because the GFP_KERNEL allocation may sleep, and is
 * re-acquired before the cache is updated.
 *
 * Return: 0 on success, with resv->lock held.  -ENOMEM if an allocation
 * fails; in that case every entry allocated by the failing pass is freed and
 * the function returns with resv->lock NOT held, so callers must not unlock
 * it again.
 */
static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
{
        struct list_head allocated_regions;
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;

        VM_BUG_ON(regions_needed < 0);

        INIT_LIST_HEAD(&allocated_regions);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
         *
         * This is a while loop because when we drop the lock, some other call
         * to region_add or region_del may have consumed some region_entries,
         * so we keep looping here until we finally have enough entries for
         * (adds_in_progress + regions_needed).
         */
        while (resv->region_cache_count <
               (resv->adds_in_progress + regions_needed)) {
                to_allocate = resv->adds_in_progress + regions_needed -
                              resv->region_cache_count;

                /* At this point, we should have enough entries in the cache
                 * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

                spin_unlock(&resv->lock);
                for (i = 0; i < to_allocate; i++) {
                        trg = kmalloc(sizeof(*trg), GFP_KERNEL);
                        if (!trg)
                                goto out_of_memory;
                        list_add(&trg->link, &allocated_regions);
                }

                spin_lock(&resv->lock);

                /*
                 * Move the new entries into the cache and reset the local
                 * list head.  The head is reused if the loop runs again, and
                 * is walked on the error path, so it must not keep pointing
                 * at entries that now belong to resv->region_cache.
                 */
                list_splice_init(&allocated_regions, &resv->region_cache);
                resv->region_cache_count += to_allocate;
        }

        return 0;

out_of_memory:
        /* Only entries not yet spliced into the cache are on this list. */
        list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
                list_del(&rg->link);
                kfree(rg);
        }
        return -ENOMEM;
}

/*
 * Add the huge page range [f, t) to the reserve map, using file_region
 * entries from the cache to fill the gaps.
 *
 * A preceding region_chg() call for the same range normally leaves enough
 * entries in the cache.  The map may have changed since then through racing
 * region_add/region_del calls, in which case more entries are needed than
 * region_chg() predicted and the shortfall is allocated here.
 *
 * @in_regions_needed is the out value of that earlier region_chg() call.
 *
 * Returns the number of new huge pages added to the map (zero or more), or
 * -ENOMEM if extra file_region entries were required and could not be
 * allocated.  Adding a range of length 1 never allocates and cannot fail:
 * region_chg() always provides at least one entry and a one page add needs
 * at most one.
 */
static long region_add(struct resv_map *resv, long f, long t,
                       long in_regions_needed, struct hstate *h,
                       struct hugetlb_cgroup *h_cg)
{
        long add = 0, actual_regions_needed = 0;

        spin_lock(&resv->lock);

        for (;;) {
                long shortfall;

                /* Count how many regions this add really needs right now. */
                add_reservation_in_range(resv, f, t, NULL, NULL,
                                         &actual_regions_needed);

                if (actual_regions_needed <= in_regions_needed)
                        break;

                /*
                 * The map changed since region_chg() and more entries are
                 * needed than were set aside.  The cache must cover every
                 * add already in progress plus our excess.
                 */
                shortfall = actual_regions_needed - in_regions_needed;
                if (resv->region_cache_count >=
                    resv->adds_in_progress + shortfall)
                        break;

                /* region_add operation of range 1 should never need to
                 * allocate file_region entries.
                 */
                VM_BUG_ON(t - f <= 1);

                /* On failure the callee has already released resv->lock. */
                if (allocate_file_region_entries(resv, shortfall))
                        return -ENOMEM;

                /* The lock was dropped meanwhile, so recount from scratch. */
        }

        add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

        resv->adds_in_progress -= in_regions_needed;

        spin_unlock(&resv->lock);
        VM_BUG_ON(add < 0);
        return add;
}

/*
 * Work out how many huge pages of [f, t) are NOT yet represented in the
 * reserve map, in preparation for a later region_add() of the same range.
 *
 * The number of huge pages represented by the map is left unchanged.  What
 * does change is the region cache: enough placeholder file_region entries
 * (at least one) are made available for the follow-up region_add().
 *
 * *out_regions_needed receives the number of entries accounted in
 * resv->adds_in_progress.  It has to be passed on to the matching
 * region_add() or region_abort() call so the accounting stays balanced.
 *
 * Returns the number of huge pages that must be added to the map for
 * [f, t), which is zero or more, or -ENOMEM if the cache entries could not
 * be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
                       long *out_regions_needed)
{
        long missing, nr_regions;

        spin_lock(&resv->lock);

        /* Count how many hugepages in this range are NOT represented. */
        missing = add_reservation_in_range(resv, f, t, NULL, NULL,
                                           out_regions_needed);

        /* Always set aside at least one entry. */
        nr_regions = *out_regions_needed;
        if (nr_regions == 0) {
                nr_regions = 1;
                *out_regions_needed = nr_regions;
        }

        /* On failure the callee has already released resv->lock. */
        if (allocate_file_region_entries(resv, nr_regions) != 0)
                return -ENOMEM;

        resv->adds_in_progress += nr_regions;

        spin_unlock(&resv->lock);
        return missing;
}

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call, it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
                         long regions_needed)
{
        /* f and t are not consulted; only the in-progress count changes. */
        spin_lock(&resv->lock);

        /* Sanity check: the region cache is expected to be non-empty here. */
        VM_BUG_ON(resv->region_cache_count == 0);
        resv->adds_in_progress = resv->adds_in_progress - regions_needed;

        spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg, *trg;
        /* Spare descriptor for a split; survives across "goto retry". */
        struct file_region *nrg = NULL;
        /* Running total of huge pages removed from the map. */
        long del = 0;

retry:
        /*
         * The scan always (re)starts from the head of the list: the lock may
         * have been dropped below to allocate nrg.
         */
        spin_lock(&resv->lock);
        list_for_each_entry_safe(rg, trg, head, link) {
                /*
                 * Skip regions before the range to be deleted.  file_region
                 * ranges are normally of the form [from, to).  However, there
                 * may be a "placeholder" entry in the map which is of the form
                 * (from, to) with from == to.  Check for placeholder entries
                 * at the beginning of the range to be deleted.
                 */
                if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                        continue;

                /* This region starts at or after t: nothing left to do. */
                if (rg->from >= t)
                        break;

                if (f > rg->from && t < rg->to) { /* Must split region */
                        /*
                         * Check for an entry in the cache before dropping
                         * lock and attempting allocation.
                         */
                        if (!nrg &&
                            resv->region_cache_count > resv->adds_in_progress) {
                                nrg = list_first_entry(&resv->region_cache,
                                                        struct file_region,
                                                        link);
                                list_del(&nrg->link);
                                resv->region_cache_count--;
                        }

                        /*
                         * No spare cache entry: allocate one with the lock
                         * released, then redo the scan.
                         */
                        if (!nrg) {
                                spin_unlock(&resv->lock);
                                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                                if (!nrg)
                                        return -ENOMEM;
                                goto retry;
                        }

                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
                                resv, rg, t - f, false);

                        /* New entry for end of split region */
                        nrg->from = t;
                        nrg->to = rg->to;

                        copy_hugetlb_cgroup_uncharge_info(nrg, rg);

                        INIT_LIST_HEAD(&nrg->link);

                        /* Original entry is trimmed */
                        rg->to = f;

                        /* Link the tail piece right after the trimmed entry. */
                        list_add(&nrg->link, &rg->link);
                        /* nrg is now owned by the list; don't free it below. */
                        nrg = NULL;
                        break;
                }

                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        /* Last argument is true only here, where rg goes away. */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
                }

                if (f <= rg->from) {        /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            t - rg->from, false);

                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - f, false);

                        del += rg->to - f;
                        rg->to = f;
                }
        }

        spin_unlock(&resv->lock);
        /* Release the spare descriptor if no split consumed it (NULL is ok). */
        kfree(nrg);
        return del;
}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust = hugepage_subpool_get_pages(spool, 1);

        /* A zero adjustment needs no further accounting. */
        if (rsv_adjust == 0)
                return;

        /*
         * A positive adjustment must also be charged through
         * hugetlb_acct_memory(); a zero return from it is success.
         */
        if (rsv_adjust > 0 && !hugetlb_acct_memory(hstate_inode(inode), 1))
                return;

        /* Either the subpool call or the accounting call failed. */
        pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
        struct file_region *rg;
        long nr_pages = 0;

        spin_lock(&resv->lock);
        list_for_each_entry(rg, &resv->regions, link) {
                long start, end;

                /* Entirely below the range: keep looking. */
                if (rg->to <= f)
                        continue;
                /* At or beyond the end of the range: stop scanning. */
                if (rg->from >= t)
                        break;

                /* Clamp the region to [f, t) and add the overlap. */
                start = rg->from > f ? rg->from : f;
                end = rg->to < t ? rg->to : t;
                nr_pages += end - start;
        }
        spin_unlock(&resv->lock);

        return nr_pages;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
{
        /* Huge pages between the start of the VMA and the address. */
        unsigned long idx_in_vma =
                (address - vma->vm_start) >> huge_page_shift(h);
        /* vm_pgoff scaled down by the huge page order. */
        unsigned long vma_base = vma->vm_pgoff >> huge_page_order(h);

        return idx_in_vma + vma_base;
}

/* Exported wrapper: vma_hugecache_offset() using the hstate of the VMA. */
pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address)
{
        struct hstate *h = hstate_vma(vma);

        return vma_hugecache_offset(h, vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        /* Without a ->pagesize() hook the VMA uses PAGE_SIZE. */
        if (!vma->vm_ops || !vma->vm_ops->pagesize)
                return PAGE_SIZE;

        return vma->vm_ops->pagesize(vma);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        /* Default (weak) version: same value as the kernel page size. */
        return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
/*
 * Tested by vma_has_reserves() for private mappings.
 * NOTE(review): presumably set on the VMA of the task that called mmap();
 * confirm where the flag is set.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
/* NOTE(review): no user in the code visible here; semantics to be confirmed. */
#define HPAGE_RESV_UNMAPPED (1UL << 1)
/* All flag bits; masked off or preserved when handling the map pointer. */
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it, this region map represents those offsets which have consumed
 * reservation ie. where pages have been instantiated.
 */
/* Read vm_private_data as an integer (map pointer plus flag bits). */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
        void *raw = vma->vm_private_data;

        return (unsigned long)raw;
}

/* Store an integer value (map pointer plus flag bits) in vm_private_data. */
static void set_vma_private_data(struct vm_area_struct *vma,
                                 unsigned long value)
{
        void *raw = (void *)value;

        vma->vm_private_data = raw;
}

/*
 * Record in resv_map where hugetlb cgroup reservations are to be uncharged
 * from.  With either h_cg or h NULL the fields are cleared instead.  This
 * is a no-op when CONFIG_CGROUP_HUGETLB is not set.
 */
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
                                          struct hugetlb_cgroup *h_cg,
                                          struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (h_cg && h) {
                resv_map->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                resv_map->pages_per_hpage = pages_per_huge_page(h);
                resv_map->css = &h_cg->css;
                return;
        }

        /* Incomplete information: clear the uncharge fields. */
        resv_map->reservation_counter = NULL;
        resv_map->pages_per_hpage = 0;
        resv_map->css = NULL;
#endif
}

/*
 * Allocate and initialise an empty reserve map whose region cache is
 * pre-populated with a single file_region entry.  Returns NULL if either
 * allocation fails.
 */
struct resv_map *resv_map_alloc(void)
{
        struct resv_map *map;
        struct file_region *spare;

        map = kmalloc(sizeof(*map), GFP_KERNEL);
        spare = kmalloc(sizeof(*spare), GFP_KERNEL);
        if (!map || !spare)
                goto fail;

        kref_init(&map->refs);
        spin_lock_init(&map->lock);
        INIT_LIST_HEAD(&map->regions);
        map->adds_in_progress = 0;

        /*
         * Start with the cgroup uncharge fields cleared.  For shared
         * mappings the cleared state means no cgroup accounting is done
         * through these fields; private mappings re-initialise them later
         * so that reservations are uncharged from the right place.
         */
        resv_map_set_hugetlb_cgroup_uncharge_info(map, NULL, NULL);

        /* Seed the cache with the spare entry allocated above. */
        INIT_LIST_HEAD(&map->region_cache);
        list_add(&spare->link, &map->region_cache);
        map->region_cache_count = 1;

        return map;

fail:
        kfree(map);
        kfree(spare);
        return NULL;
}

void resv_map_release(struct kref *ref)
{
        struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
        struct list_head *head = &resv_map->region_cache;
        struct file_region *rg, *trg;

        /* Clear out any active regions before we release the map. */
        region_del(resv_map, 0, LONG_MAX);

        /* ... and any entries left in the cache */
        list_for_each_entry_safe(rg, trg, head, link) {
                list_del(&rg->link);
                kfree(rg);
        }

        VM_BUG_ON(resv_map->adds_in_progress);

        kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
        /*
         * At inode evict time, i_mapping may not point to the original
         * address space within the inode.  This original address space
         * contains the pointer to the resv_map.  So, always use the
         * address space embedded within the inode.
         * The VERY common case is inode->mapping == &inode->i_data but,
         * this may not be true for device special inodes.
         */
        return (struct resv_map *)(&inode->i_data)->private_data;
}

/*
 * Return the reserve map for a hugetlb VMA: the inode's map for
 * VM_MAYSHARE mappings, otherwise the pointer kept in vm_private_data
 * with the HPAGE_RESV_* flag bits masked off.
 */
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
        unsigned long map_bits;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

        if (vma->vm_flags & VM_MAYSHARE) {
                struct address_space *mapping = vma->vm_file->f_mapping;

                return inode_resv_map(mapping->host);
        }

        map_bits = get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
        return (struct resv_map *)map_bits;
}

/* Install a reserve map pointer on a private VMA, keeping its flag bits. */
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
        unsigned long flag_bits;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

        flag_bits = get_vma_private_data(vma) & HPAGE_RESV_MASK;
        set_vma_private_data(vma, flag_bits | (unsigned long)map);
}

/* OR the given flag bits into the private data of a private VMA. */
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
        unsigned long current_bits;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

        current_bits = get_vma_private_data(vma);
        set_vma_private_data(vma, current_bits | flags);
}

/* Return 1 if any bit of flag is set in the VMA private data, else 0. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

        return !!(get_vma_private_data(vma) & flag);
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        if (!(vma->vm_flags & VM_MAYSHARE))
                vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
        const bool may_share = vma->vm_flags & VM_MAYSHARE;

        if (vma->vm_flags & VM_NORESERVE) {
                /*
                 * Work-around: with chg == 0 the address has already been
                 * reserved by another process, so the reserve count has to
                 * be decremented.  Otherwise the count would linger after
                 * the inode is released, because the allocated page goes
                 * into the page cache and is treated as having come from
                 * the reserved pool at release time.  No better solution is
                 * known for this situation at present.
                 */
                return may_share && chg == 0;
        }

        /*
         * Shared mappings always use reserves.  VM_NORESERVE is known to be
         * clear here, so there SHOULD be a region map entry for every page.
         * The one exception is a hole punched via fallocate, in which case
         * there really are no reserves to use; that is signalled by
         * chg != 0.
         */
        if (may_share)
                return !chg;

        /*
         * For private mappings only the process that called mmap() has
         * reserves.
         *
         * As in the shared case, a hole punch or truncate may have been
         * performed on the mapping, so chg decides whether reserves still
         * exist or were consumed earlier.  Very subtle: chg was computed by
         * an earlier reservation check which has already accounted for the
         * different (opposite) semantics of private reserve maps, so chg
         * means the same thing here as in the shared case.  The two cases
         * are kept apart on purpose to draw attention to that difference.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return !chg;

        return false;
}

/* Put a huge page on the free list of its node and count it as free. */
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
        const int node = page_to_nid(page);

        list_move(&page->lru, &h->hugepage_freelists[node]);

        h->free_huge_pages++;
        h->free_huge_pages_node[node]++;

        SetPageHugeFreed(page);
}

/*
 * Take the first usable free huge page of node nid, move it to the active
 * list and update the free counters.  Returns NULL if none qualifies.
 */
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
        const bool skip_cma = current->flags & PF_MEMALLOC_NOCMA;
        struct page *candidate;

        list_for_each_entry(candidate, &h->hugepage_freelists[nid], lru) {
                /* Pass over CMA pages (when asked to) and poisoned pages. */
                if ((skip_cma && is_migrate_cma_page(candidate)) ||
                    PageHWPoison(candidate))
                        continue;

                list_move(&candidate->lru, &h->hugepage_activelist);
                set_page_refcounted(candidate);
                ClearPageHugeFreed(candidate);

                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;

                return candidate;
        }

        return NULL;
}

/*
 * Walk the zonelist of nid, restricted to nmask, and dequeue a free huge
 * page from the first node that has one.  The walk is repeated when
 * read_mems_allowed_retry() asks for it.
 */
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
                nodemask_t *nmask)
{
        struct zonelist *zonelist = node_zonelist(nid, gfp_mask);
        unsigned int cpuset_mems_cookie;
        struct zoneref *z;
        struct zone *zone;
        /* Last node tried; it is not reset between retries. */
        int last_node = NUMA_NO_NODE;

        do {
                cpuset_mems_cookie = read_mems_allowed_begin();

                for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                gfp_zone(gfp_mask), nmask) {
                        struct page *page;

                        if (!cpuset_zone_allowed(zone, gfp_mask))
                                continue;

                        /*
                         * The pool is node aware, not zone aware, so do not
                         * ask the same node twice in a row.
                         */
                        if (zone_to_nid(zone) == last_node)
                                continue;
                        last_node = zone_to_nid(zone);

                        page = dequeue_huge_page_node_exact(h, last_node);
                        if (page)
                                return page;
                }
        } while (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)));

        return NULL;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve,
                                long chg)
{
        struct page *page;
        struct mempolicy *mpol;
        gfp_t gfp_mask;
        nodemask_t *nodemask;
        int nid;

        /*
         * A child process with MAP_PRIVATE mappings created by their parent
         * have no page reserves. This check ensures that reservations are
         * not "stolen". The child may still get SIGKILLed
         */
        if (!vma_has_reserves(vma, chg) &&
                        h->free_huge_pages - h->resv_huge_pages == 0)
                goto err;

        /* If reserves cannot be used, ensure enough pages are in the pool */
        if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
                goto err;

        gfp_mask = htlb_alloc_mask(h);
        nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
        page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
        if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
                SetPagePrivate(page);
                h->resv_huge_pages--;
        }

        mpol_cond_put(mpol);
        return page;

err:
        return NULL;
}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
        int next = next_node_in(nid, *nodes_allowed);

        /* The result must be a valid node id. */
        VM_BUG_ON(next >= MAX_NUMNODES);

        return next;
}

/* Return nid if it is in *nodes_allowed, else the next allowed node. */
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
        if (node_isset(nid, *nodes_allowed))
                return nid;

        return next_node_allowed(nid, nodes_allowed);
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
                                        nodemask_t *nodes_allowed)
{
        int this_node;

        VM_BUG_ON(!nodes_allowed);

        /* Use the saved node if still allowed, then advance the cursor. */
        this_node = get_valid_node_allowed(h->next_nid_to_alloc,
                                           nodes_allowed);
        h->next_nid_to_alloc = next_node_allowed(this_node, nodes_allowed);

        return this_node;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
        int this_node;

        VM_BUG_ON(!nodes_allowed);

        /* Use the saved node if still allowed, then advance the cursor. */
        this_node = get_valid_node_allowed(h->next_nid_to_free,
                                           nodes_allowed);
        h->next_nid_to_free = next_node_allowed(this_node, nodes_allowed);

        return this_node;
}

/*
 * Run the loop body nodes_weight(*mask) times.  Before each iteration node
 * is assigned the value returned by hstate_next_node_to_alloc(hs, mask);
 * the "|| 1" keeps the loop going when that value is node 0.
 *
 * nr_nodes and node must be lvalues.  Every argument is parenthesized in
 * the expansion so that non-trivial expressions expand as intended.
 */
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)                \
        for ((nr_nodes) = nodes_weight(*(mask));                        \
                (nr_nodes) > 0 &&                                        \
                (((node) = hstate_next_node_to_alloc((hs), (mask))) || 1); \
                (nr_nodes)--)

/*
 * Run the loop body nodes_weight(*mask) times.  Before each iteration node
 * is assigned the value returned by hstate_next_node_to_free(hs, mask);
 * the "|| 1" keeps the loop going when that value is node 0.
 *
 * nr_nodes and node must be lvalues.  Every argument is parenthesized in
 * the expansion so that non-trivial expressions expand as intended.
 */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)                \
        for ((nr_nodes) = nodes_weight(*(mask));                        \
                (nr_nodes) > 0 &&                                        \
                (((node) = hstate_next_node_to_free((hs), (mask))) || 1); \
                (nr_nodes)--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
/*
 * Undo the compound setup of a gigantic page: reset the map/pin counts,
 * detach every tail page and give it a reference count, then clear the
 * order, compound_nr and the head flag.
 */
static void destroy_compound_gigantic_page(struct page *page,
                                        unsigned int order)
{
        struct page *tail = page + 1;
        int nr_pages = 1 << order;
        int i = 1;

        atomic_set(compound_mapcount_ptr(page), 0);
        atomic_set(compound_pincount_ptr(page), 0);

        while (i < nr_pages) {
                clear_compound_head(tail);
                set_page_refcounted(tail);

                /* Step to the next tail via mem_map_next(). */
                i++;
                tail = mem_map_next(tail, page, i);
        }

        set_compound_order(page, 0);
        (page + 1)->compound_nr = 0;
        __ClearPageHead(page);
}

/* Release a gigantic page: try the CMA area first, then the contig range. */
static void free_gigantic_page(struct page *page, unsigned int order)
{
        int nr_pages = 1 << order;

#ifdef CONFIG_CMA
        /*
         * cma_release() returns false when the page was not allocated
         * through the cma allocator; fall through in that case.
         */
        if (cma_release(hugetlb_cma[page_to_nid(page)], page, nr_pages))
                return;
#endif

        free_contig_range(page_to_pfn(page), nr_pages);
}

#ifdef CONFIG_CONTIG_ALLOC
/*
 * Allocate a gigantic page for hstate h.  With CONFIG_CMA the hugetlb CMA
 * area of nid is tried first, then (unless __GFP_THISNODE is set) the CMA
 * areas of the other nodes in *nodemask; alloc_contig_pages() is the final
 * fallback.
 */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
{
        unsigned int order = huge_page_order(h);
        unsigned long nr_pages = 1UL << order;

        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();

#ifdef CONFIG_CMA
        {
                struct page *page;
                int node;

                /* Preferred node's CMA area. */
                if (hugetlb_cma[nid]) {
                        page = cma_alloc(hugetlb_cma[nid], nr_pages,
                                         order, true);
                        if (page)
                                return page;
                }

                /* Other nodes, only if the caller did not pin the node. */
                if (!(gfp_mask & __GFP_THISNODE)) {
                        for_each_node_mask(node, *nodemask) {
                                if (node == nid || !hugetlb_cma[node])
                                        continue;

                                page = cma_alloc(hugetlb_cma[node], nr_pages,
                                                 order, true);
                                if (page)
                                        return page;
                        }
                }
        }
#endif

        return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}

#else /* !CONFIG_CONTIG_ALLOC */
/* Stub for !CONFIG_CONTIG_ALLOC: gigantic pages cannot be allocated. */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
{
        return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
/* Stubs for !CONFIG_ARCH_HAS_GIGANTIC_PAGE: allocation always fails ... */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
{
        return NULL;
}
/* ... and the free/destroy helpers do nothing. */
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
#endif

/*
 * Remove a huge page from the hstate's accounting and hand it back to the
 * underlying allocator.  hugetlb_lock is held on entry and on return; it is
 * dropped and re-taken around the gigantic page free below.
 */
static void update_and_free_page(struct hstate *h, struct page *page)
{
        int i;
        struct page *subpage = page;

        /* Nothing is done (not even accounting) in this case. */
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;

        h->nr_huge_pages--;
        h->nr_huge_pages_node[page_to_nid(page)]--;
        /* Clear the listed page flags on every subpage, head included. */
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
        /* The page must no longer be charged to any hugetlb cgroup. */
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
        /* Drop the hugetlb destructor before freeing the page. */
        set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
        set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
                /*
                 * Temporarily drop the hugetlb_lock, because
                 * we might block in free_gigantic_page().
                 */
                spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
                spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
}

/* Find the hstate whose huge page size equals size; NULL if none does. */
struct hstate *size_to_hstate(unsigned long size)
{
        struct hstate *h;

        for_each_hstate(h) {
                if (huge_page_size(h) != size)
                        continue;

                return h;
        }

        return NULL;
}

/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
 */
bool page_huge_active(struct page *page)
{
        return PageHeadHuge(page) && PagePrivate(&page[1]);
}

/* never called for tail page */
void set_page_huge_active(struct page *page)
{
        struct page *first_tail = page + 1;

        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);

        /* The active state is kept in PG_private of the first tail page. */
        SetPagePrivate(first_tail);
}

/* Clear the "active" marker; must be given a hugetlb head page. */
static void clear_page_huge_active(struct page *page)
{
        struct page *first_tail = page + 1;

        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);

        ClearPagePrivate(first_tail);
}

/*
 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
 * code
 */
static inline bool PageHugeTemporary(struct page *page)
{
        unsigned long marker;

        if (!PageHuge(page))
                return false;

        /* The flag lives in the mapping field of the second tail page. */
        marker = (unsigned long)(page + 2)->mapping;
        return marker == -1U;
}

/* Mark the huge page temporary via the second tail page's mapping field. */
static inline void SetPageHugeTemporary(struct page *page)
{
        struct page *second_tail = page + 2;

        second_tail->mapping = (void *)-1U;
}

/* Remove the temporary marker set by SetPageHugeTemporary(). */
static inline void ClearPageHugeTemporary(struct page *page)
{
        struct page *second_tail = page + 2;

        second_tail->mapping = NULL;
}

static void __free_huge_page(struct page *page)
{
        /*
         * Can't pass hstate in here because it is called from the
         * compound page destructor.
         */
        struct hstate *h = page_hstate(page);
        int nid = page_to_nid(page);
        /* page_private() holds the subpool pointer for this page. */
        struct hugepage_subpool *spool =
                (struct hugepage_subpool *)page_private(page);
        bool restore_reserve;

        /* The page must be unreferenced and unmapped by now. */
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(page_mapcount(page), page);

        set_page_private(page, 0);
        page->mapping = NULL;
        /* Sample PG_private before clearing it; it drives the logic below. */
        restore_reserve = PagePrivate(page);
        ClearPagePrivate(page);

        /*
         * If PagePrivate() was set on page, page allocation consumed a
         * reservation.  If the page was associated with a subpool, there
         * would have been a page reserved in the subpool before allocation
         * via hugepage_subpool_get_pages().  Since we are 'restoring' the
         * reservation, do not call hugepage_subpool_put_pages() as this will
         * remove the reserved page from the subpool.
         */
        if (!restore_reserve) {
                /*
                 * A return code of zero implies that the subpool will be
                 * under its minimum size if the reservation is not restored
                 * after page is free.  Therefore, force restore_reserve
                 * operation.
                 */
                if (hugepage_subpool_put_pages(spool, 1) == 0)
                        restore_reserve = true;
        }

        /* Everything from here on runs under hugetlb_lock. */
        spin_lock(&hugetlb_lock);
        clear_page_huge_active(page);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
                                     pages_per_huge_page(h), page);
        hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
                                          pages_per_huge_page(h), page);
        if (restore_reserve)
                h->resv_huge_pages++;

        if (PageHugeTemporary(page)) {
                /* Temporary pages are freed outright, not returned to the pool. */
                list_del(&page->lru);
                ClearPageHugeTemporary(page);
                update_and_free_page(h, page);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                list_del(&page->lru);
                update_and_free_page(h, page);
                h->surplus_huge_pages--;
                h->surplus_huge_pages_node[nid]--;
        } else {
                /* Otherwise the page goes back onto its node's free list. */
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
        }
        spin_unlock(&hugetlb_lock);
}

/*
 * As free_huge_page() can be called from a non-task context, we have
 * to defer the actual freeing in a workqueue to prevent potential
 * hugetlb_lock deadlock.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to
 * be freed and frees them one-by-one. As the page->mapping pointer is
 * going to be cleared in __free_huge_page() anyway, it is reused as the
 * llist_node structure of a lockless linked list of huge pages to be freed.
 */
/* Lockless list of huge pages whose freeing was deferred. */
static LLIST_HEAD(hpage_freelist);

/* Work function: detach the whole deferred list and free each page on it. */
static void free_hpage_workfn(struct work_struct *work)
{
        struct llist_node *pending = llist_del_all(&hpage_freelist);

        while (pending) {
                struct llist_node *cur = pending;
                struct page *page;

                /*
                 * Step to the next node before freeing: the node is stored
                 * in page->mapping, which __free_huge_page() clears.
                 */
                pending = cur->next;

                page = container_of((struct address_space **)cur,
                                    struct page, mapping);
                __free_huge_page(page);
        }
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

/*
 * Release a huge page.  In task context the page is freed immediately via
 * __free_huge_page(); in any other context it is queued on hpage_freelist
 * and freed later by free_hpage_work.
 */
void free_huge_page(struct page *page)
{
        struct llist_node *entry;

        if (in_task()) {
                __free_huge_page(page);
                return;
        }

        /*
         * Non-task context: defer the free to avoid a hugetlb_lock deadlock.
         * The work item only needs to be scheduled when hpage_freelist was
         * empty before this add; otherwise schedule_work() has already been
         * called and the workfn has not retrieved the list yet.
         */
        entry = (struct llist_node *)&page->mapping;
        if (llist_add(entry, &hpage_freelist))
                schedule_work(&free_hpage_work);
}

/*
 * Initialise a freshly allocated huge page and account it in @h.
 *
 * The page's lru list head is initialised, its compound destructor is set to
 * HUGETLB_PAGE_DTOR and both hugetlb cgroup pointers are reset to NULL.
 * Then, under hugetlb_lock, the global and per-node (@nid) huge page
 * counters of @h are incremented and the page's HugeFreed flag is cleared.
 */
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        set_hugetlb_cgroup(page, NULL);
        set_hugetlb_cgroup_rsvd(page, NULL);
        /* The hstate counters are only modified with hugetlb_lock held. */
        spin_lock(&hugetlb_lock);
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
        ClearPageHugeFreed(page);
        spin_unlock(&hugetlb_lock);
}

/*
 * Turn the (1 << @order) pages starting at @page into one compound page.
 *
 * The head page gets its compound order recorded, PG_reserved cleared and
 * PG_head set.  Every tail page has PG_reserved cleared, its reference count
 * set to 0 and is linked to the head.  Finally the compound mapcount is set
 * to -1 and the compound pincount to 0.
 */
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
        int i;
        int nr_pages = 1 << order;
        struct page *p = page + 1;

        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
        __ClearPageReserved(page);
        __SetPageHead(page);
        /* Tail pages are walked with mem_map_next() rather than plain p++. */
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                /*
                 * For gigantic hugepages allocated through bootmem at
                 * boot, it's safer to be consistent with the not-gigantic
                 * hugepages and clear the PG_reserved bit from all tail pages
                 * too.  Otherwise drivers using get_user_pages() to access tail
                 * pages may get the reference counting wrong if they see
                 * PG_reserved set on a tail page (despite the head page not
                 * having PG_reserved set).  Enforcing this consistency between
                 * head and tail pages allows drivers to optimize away a check
                 * on the head page when they need to know if put_page() is
                 * needed after get_user_pages().
                 */
                __ClearPageReserved(p);
                set_page_count(p, 0);
                set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
        atomic_set(compound_pincount_ptr(page), 0);
}

/*
 * PageHuge() tells whether @page (head or tail) belongs to a hugetlbfs huge
 * page.  It is false for normal pages and for transparent huge pages; see
 * the PageTransHuge() documentation for more details.
 */
int PageHuge(struct page *page)
{
        struct page *head;

        if (!PageCompound(page))
                return 0;

        /* The destructor id is kept in the first tail page of the head. */
        head = compound_head(page);
        return head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() is true only when @page_head is the head page of a
 * hugetlbfs huge page; normal pages and transparent huge pages yield 0.
 */
int PageHeadHuge(struct page *page_head)
{
        return PageHead(page_head) &&
               page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}

/*
 * Look up the address space (mapping) of @hpage and try to lock it in write
 * mode.
 *
 * The page is locked on entry, so page_mapping() is stable.  Because of the
 * locking order only a trylock is possible here.  Returns the locked mapping,
 * or NULL when the page has no mapping or the lock could not be taken.
 */
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
        struct address_space *mapping = page_mapping(hpage);

        if (mapping && !i_mmap_trylock_write(mapping))
                mapping = NULL;

        return mapping;
}

/*
 * Return the index of @page in base-page units: the head page's
 * page_index() shifted left by the compound order, plus the position of
 * @page inside its compound page.
 */
pgoff_t hugetlb_basepage_index(struct page *page)
{
        struct page *head = compound_head(page);
        pgoff_t head_index = page_index(head);
        unsigned long offset;

        /*
         * Below MAX_ORDER the offset is plain struct page pointer
         * arithmetic; at or above it the offset is taken from the pfns.
         */
        if (compound_order(head) < MAX_ORDER)
                offset = page - head;
        else
                offset = page_to_pfn(page) - page_to_pfn(head);

        return (head_index << compound_order(head)) + offset;
}

/*
 * Allocate one huge page of @h's order from the buddy allocator.
 *
 * @node_alloc_noretry, when non-NULL, records the nodes on which a
 * "try hard" allocation has already failed; it is consulted before and
 * updated after the allocation.  Returns the page or NULL.
 */
static struct page *alloc_buddy_huge_page(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
{
        int order = huge_page_order(h);
        struct page *page;
        bool try_hard;

        /*
         * The default is to try hard (__GFP_RETRY_MAYFAIL).  When pages are
         * allocated in a loop (to adjust global huge page counts) and an
         * earlier attempt on this node failed, its bit is set in
         * node_alloc_noretry and we do not insist on the same node again.
         */
        try_hard = !(node_alloc_noretry &&
                     node_isset(nid, *node_alloc_noretry));

        gfp_mask |= __GFP_COMP | __GFP_NOWARN;
        if (try_hard)
                gfp_mask |= __GFP_RETRY_MAYFAIL;
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();

        page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
        if (!page)
                __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
        else
                __count_vm_event(HTLB_BUDDY_PGALLOC);

        if (!node_alloc_noretry)
                return page;

        if (page && !try_hard) {
                /*
                 * Success without __GFP_RETRY_MAYFAIL indicates an overall
                 * state change: resume normal 'try hard' allocations.
                 */
                node_clear(nid, *node_alloc_noretry);
        } else if (!page && try_hard) {
                /*
                 * A hard try failed: later attempts on this node will not
                 * try as hard until there is an overall state change.
                 */
                node_set(nid, *node_alloc_noretry);
        }

        return page;
}

/*
 * Common helper that obtains a brand new hugetlb page.  All specific
 * allocators should go through this function.
 *
 * Gigantic hstates are served by alloc_gigantic_page() and additionally need
 * prep_compound_gigantic_page(); everything else comes from
 * alloc_buddy_huge_page().  Returns the prepared page or NULL.
 */
static struct page *alloc_fresh_huge_page(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
{
        struct page *fresh;

        if (hstate_is_gigantic(h)) {
                fresh = alloc_gigantic_page(h, gfp_mask, nid, nmask);
                if (!fresh)
                        return NULL;
                prep_compound_gigantic_page(fresh, huge_page_order(h));
        } else {
                fresh = alloc_buddy_huge_page(h, gfp_mask, nid, nmask,
                                              node_alloc_noretry);
                if (!fresh)
                        return NULL;
        }

        prep_new_huge_page(h, fresh, page_to_nid(fresh));

        return fresh;
}

/*
 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
 * manner.
 *
 * Nodes from @nodes_allowed are tried in turn with __GFP_THISNODE until one
 * allocation succeeds.  @node_alloc_noretry is passed through to
 * alloc_fresh_huge_page().
 *
 * Returns 1 if a page was allocated and released into the pool, 0 otherwise.
 */
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                                nodemask_t *node_alloc_noretry)
{
        /*
         * Start from NULL: if the node loop below never executes its body
         * (e.g. an empty @nodes_allowed), the check after the loop must see
         * "no page" instead of reading an indeterminate pointer.
         */
        struct page *page = NULL;
        int nr_nodes, node;
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
                page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
                                                node_alloc_noretry);
                if (page)
                        break;
        }

        if (!page)
                return 0;

        put_page(page); /* free it into the hugepage allocator */

        return 1;
}

/*
 * Free one huge page from the pool, taken from the next node to free, in an
 * attempt to keep persistent huge pages more or less balanced over the
 * allowed nodes.  With @acct_surplus set, the surplus counters are
 * decremented as well.
 *
 * Called with hugetlb_lock locked.  Returns 1 if a page was freed, else 0.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                                                         bool acct_surplus)
{
        int nr_nodes, node;

        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                struct page *victim;

                /*
                 * When returning unused surplus pages, only nodes that
                 * actually hold surplus pages are of interest.
                 */
                if (acct_surplus && !h->surplus_huge_pages_node[node])
                        continue;
                if (list_empty(&h->hugepage_freelists[node]))
                        continue;

                victim = list_entry(h->hugepage_freelists[node].next,
                                    struct page, lru);
                list_del(&victim->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[node]--;
                if (acct_surplus) {
                        h->surplus_huge_pages--;
                        h->surplus_huge_pages_node[node]--;
                }
                update_and_free_page(h, victim);
                return 1;
        }

        return 0;
}

/*
 * Dissolve a given free hugepage into free buddy pages. This function does
 * nothing for in-use hugepages and non-hugepages.
 * This function returns values like below:
 *
 *  -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
 *          (allocated or reserved.)
 *       0: successfully dissolved free hugepages or the page is not a
 *          hugepage (considered as already dissolved)
 */
int dissolve_free_huge_page(struct page *page)
{
        /* Stays -EBUSY unless the page is dissolved or is not a hugepage. */
        int rc = -EBUSY;

retry:
        /* Not to disrupt normal path by vainly holding hugetlb_lock */
        if (!PageHuge(page))
                return 0;

        spin_lock(&hugetlb_lock);
        /* Check again now that hugetlb_lock is held. */
        if (!PageHuge(page)) {
                rc = 0;
                goto out;
        }

        /* Only pages with a zero reference count are dissolved. */
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
                int nid = page_to_nid(head);
                /* No free page beyond the reserved ones: report -EBUSY. */
                if (h->free_huge_pages - h->resv_huge_pages == 0)
                        goto out;

                /*
                 * We should make sure that the page is already on the free list
                 * when it is dissolved.
                 */
                if (unlikely(!PageHugeFreed(head))) {
                        spin_unlock(&hugetlb_lock);
                        cond_resched();

                        /*
                         * Theoretically, we should return -EBUSY when we
                         * encounter this race. In fact, we have a chance
                         * to successfully dissolve the page if we do a
                         * retry, because the race window is quite small.
                         * If we seize this opportunity, it is an optimization
                         * for increasing the success rate of dissolving page.
                         */
                        goto retry;
                }

                /*
                 * Move PageHWPoison flag from head page to the raw error page,
                 * which makes any subpages rather than the error page reusable.
                 */
                if (PageHWPoison(head) && page != head) {
                        SetPageHWPoison(page);
                        ClearPageHWPoison(head);
                }
                /* Unlink the page and shrink the pool counters by one. */
                list_del(&head->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
                h->max_huge_pages--;
                update_and_free_page(h, head);
                rc = 0;
        }
out:
        spin_unlock(&hugetlb_lock);
        return rc;
}

/*
 * Dissolve the free hugepages found in the pfn range [@start_pfn, @end_pfn).
 * Used by memory hotplug to make specified memory blocks removable from the
 * system.
 *
 * Note that a free gigantic hugepage is dissolved completely if any part of
 * it lies within the given range.  Also note that when
 * dissolve_free_huge_page() fails, the walk stops and the free hugepages
 * dissolved before that error are lost.
 *
 * Returns 0 on success (or when hugepages are not supported), otherwise the
 * error returned by dissolve_free_huge_page().
 */
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn = start_pfn;
        int rc = 0;

        if (!hugepages_supported())
                return 0;

        /* Step through the range in units of the smallest hstate order. */
        while (pfn < end_pfn) {
                rc = dissolve_free_huge_page(pfn_to_page(pfn));
                if (rc)
                        return rc;
                pfn += 1 << minimum_order;
        }

        return rc;
}

/*
 * Allocates a fresh surplus page from the page allocator.
 *
 * Returns NULL for gigantic hstates, when the overcommit limit has been
 * reached, or when the allocation itself fails.  On success the surplus
 * counters of @h are incremented.
 */
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nmask)
{
        struct page *page;
        bool at_limit;

        if (hstate_is_gigantic(h))
                return NULL;

        spin_lock(&hugetlb_lock);
        at_limit = h->surplus_huge_pages >= h->nr_overcommit_huge_pages;
        spin_unlock(&hugetlb_lock);
        if (at_limit)
                return NULL;

        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;

        spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages < h->nr_overcommit_huge_pages) {
                h->surplus_huge_pages++;
                h->surplus_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
                return page;
        }

        /*
         * We could have raced with a pool size change while the lock was
         * dropped, and keeping the page would overcommit the surpluses.
         * Simply deallocate it again; marking it as a temporary page works
         * around the nasty free_huge_page codeflow.
         */
        SetPageHugeTemporary(page);
        spin_unlock(&hugetlb_lock);
        put_page(page);
        return NULL;
}

/*
 * Allocate a fresh huge page and mark it temporary.  Gigantic hstates are
 * not supported and yield NULL, as does an allocation failure.
 */
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
                                     int nid, nodemask_t *nmask)
{
        struct page *fresh;

        if (hstate_is_gigantic(h))
                return NULL;

        fresh = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);

        /*
         * These pages are not accounted as surplus because they are only
         * temporary and will be released properly on the last reference.
         */
        if (fresh)
                SetPageHugeTemporary(fresh);

        return fresh;
}

/*
 * Allocate a surplus huge page from the buddy allocator, with the node and
 * nodemask chosen by huge_node() for @addr in @vma (the VMA's mpolicy).
 */
static
struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *policy;
        nodemask_t *allowed;
        struct page *surplus;
        gfp_t gfp;
        int target;

        gfp = htlb_alloc_mask(h);
        target = huge_node(vma, addr, gfp, &policy, &allowed);
        surplus = alloc_surplus_huge_page(h, gfp, target, allowed);
        mpol_cond_put(policy);

        return surplus;
}

/*
 * Page migration callback function.
 *
 * Prefer a page from the free pool when there are free pages beyond the
 * reserved ones; otherwise fall back to alloc_migrate_huge_page().
 */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
{
        struct page *pooled = NULL;

        spin_lock(&hugetlb_lock);
        if (h->free_huge_pages - h->resv_huge_pages > 0)
                pooled = dequeue_huge_page_nodemask(h, gfp_mask,
                                                    preferred_nid, nmask);
        spin_unlock(&hugetlb_lock);

        if (pooled)
                return pooled;

        return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}

/*
 * Mempolicy aware migration callback: huge_node() picks the node and
 * nodemask for @address in @vma, and alloc_huge_page_nodemask() does the
 * actual allocation.
 */
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
                unsigned long address)
{
        gfp_t gfp = htlb_alloc_mask(h);
        struct mempolicy *policy;
        nodemask_t *allowed;
        struct page *result;
        int target;

        target = huge_node(vma, address, gfp, &policy, &allowed);
        result = alloc_huge_page_nodemask(h, target, allowed, gfp);
        mpol_cond_put(policy);

        return result;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 *
 * Called with hugetlb_lock held.  The lock is dropped while surplus pages
 * are allocated and while leftover pages are released, and is held again
 * on return.
 *
 * Returns 0 when the reservation was committed (resv_huge_pages increased
 * by 'delta'), or -ENOMEM when not enough surplus pages could be allocated.
 */
static int gather_surplus_pages(struct hstate *h, int delta)
        __must_hold(&hugetlb_lock)
{
        struct list_head surplus_list;
        struct page *page, *tmp;
        int ret, i;
        int needed, allocated;
        bool alloc_ok = true;

        /* Enough free pages already: just commit the reservation. */
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
                h->resv_huge_pages += delta;
                return 0;
        }

        allocated = 0;
        INIT_LIST_HEAD(&surplus_list);

        ret = -ENOMEM;
retry:
        /* Allocate with hugetlb_lock dropped. */
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
                        break;
                }
                list_add(&page->lru, &surplus_list);
                cond_resched();
        }
        /* 'i' is the number of pages obtained in this round. */
        allocated += i;

        /*
         * After retaking hugetlb_lock, we need to recalculate 'needed'
         * because either resv_huge_pages or free_huge_pages may have changed.
         */
        spin_lock(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) -
                        (h->free_huge_pages + allocated);
        if (needed > 0) {
                if (alloc_ok)
                        goto retry;
                /*
                 * We were not able to allocate enough pages to
                 * satisfy the entire reservation so we free what
                 * we've allocated so far.
                 */
                goto free;
        }
        /*
         * The surplus_list now contains _at_least_ the number of extra pages
         * needed to accommodate the reservation.  Add the appropriate number
         * of pages to the hugetlb pool and free the extras back to the buddy
         * allocator.  Commit the entire reservation here to prevent another
         * process from stealing the pages as they are added to the pool but
         * before they are reserved.
         */
        needed += allocated;
        h->resv_huge_pages += delta;
        ret = 0;

        /* Free the needed pages to the hugetlb pool */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                if ((--needed) < 0)
                        break;
                /*
                 * This page is now managed by the hugetlb allocator and has
                 * no users -- drop the buddy allocator's reference.
                 */
                put_page_testzero(page);
                VM_BUG_ON_PAGE(page_count(page), page);
                enqueue_huge_page(h, page);
        }
free:
        spin_unlock(&hugetlb_lock);

        /* Free unnecessary surplus pages to the buddy allocator */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru)
                put_page(page);
        /* Return with hugetlb_lock held again, as on entry. */
        spin_lock(&hugetlb_lock);

        return ret;
}

/*
 * This routine has two main purposes:
 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
 *    in unused_resv_pages.  This corresponds to the prior adjustments made
 *    to the associated reservation map.
 * 2) Free any unused surplus pages that may have been allocated to satisfy
 *    the reservation.  As many as unused_resv_pages may be freed.
 *
 * Called with hugetlb_lock held.  However, the lock could be dropped (and
 * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
 * we must make sure nobody else can claim pages we are in the process of
 * freeing.  Do this by ensuring resv_huge_page always is greater than the
 * number of huge pages we plan to free when dropping the lock.
 */
static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
{
        unsigned long nr_pages;

        /* Cannot return gigantic pages currently */
        if (hstate_is_gigantic(h))
                goto out;

        /*
         * Part (or even all) of the reservation could have been backed
         * by pre-allocated pages. Only free surplus pages.
         */
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

        /*
         * We want to release as many surplus pages as possible, spread
         * evenly across all nodes with memory. Iterate across these nodes
         * until we can no longer free unreserved surplus pages. This occurs
         * when the nodes with surplus pages have no free pages.
         * free_pool_huge_page() will balance the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
         *
         * Note that we decrement resv_huge_pages as we free the pages.  If
         * we drop the lock, resv_huge_pages will still be sufficiently large
         * to cover subsequent pages we may free.
         */
        while (nr_pages--) {
                h->resv_huge_pages--;
                unused_resv_pages--;
                if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
                        goto out;
                cond_resched_lock(&hugetlb_lock);
        }

out:
        /* Fully uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
}


/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed.  vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 *
 * vma_add_reservation is used in error paths where a reservation must
 * be restored when a newly allocated huge page must be freed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 */
/* Operation selector for __vma_reservation_common(). */
enum vma_resv_mode {
        /* region_chg() on the one-page range */
        VMA_NEEDS_RESV,
        /* region_add() on the one-page range */
        VMA_COMMIT_RESV,
        /* region_abort() on the one-page range */
        VMA_END_RESV,
        /* region_add() for VM_MAYSHARE vmas, else region_abort() + region_del() */
        VMA_ADD_RESV,
};
/*
 * Common backend of the vma_*_reservation() helpers: apply the reserve map
 * operation selected by @mode to the single huge page at @addr in @vma.
 *
 * Returns 1 when the vma has no reserve map.  Otherwise the result depends
 * on the kind of mapping:
 *  - VM_MAYSHARE: the value from the reserve map operation, unchanged;
 *  - private mapping with HPAGE_RESV_OWNER set and a non-negative result:
 *    the inverted result (0 if the operation returned non-zero, else 1);
 *  - anything else: the negative error from the operation, or 0.
 */
static long __vma_reservation_common(struct hstate *h,
                                struct vm_area_struct *vma, unsigned long addr,
                                enum vma_resv_mode mode)
{
        struct resv_map *resv;
        pgoff_t idx;
        long ret;
        long dummy_out_regions_needed;

        resv = vma_resv_map(vma);
        if (!resv)
                return 1;

        /* All operations below act on the one-page range [idx, idx + 1). */
        idx = vma_hugecache_offset(h, vma, addr);
        switch (mode) {
        case VMA_NEEDS_RESV:
                ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
                /* We assume that vma_reservation_* routines always operate on
                 * 1 page, and that adding to resv map a 1 page entry can only
                 * ever require 1 region.
                 */
                VM_BUG_ON(dummy_out_regions_needed != 1);
                break;
        case VMA_COMMIT_RESV:
                ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
                /* region_add calls of range 1 should never fail. */
                VM_BUG_ON(ret < 0);
                break;
        case VMA_END_RESV:
                region_abort(resv, idx, idx + 1, 1);
                ret = 0;
                break;
        case VMA_ADD_RESV:
                if (vma->vm_flags & VM_MAYSHARE) {
                        ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
                        /* region_add calls of range 1 should never fail. */
                        VM_BUG_ON(ret < 0);
                } else {
                        region_abort(resv, idx, idx + 1, 1);
                        ret = region_del(resv, idx, idx + 1);
                }
                break;
        default:
                BUG();
        }

        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
        else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
                /*
                 * In most cases, reserves always exist for private mappings.
                 * However, a file associated with mapping could have been
                 * hole punched or truncated after reserves were consumed.
                 * A subsequent fault on such a range will not use reserves.
                 * Subtle - The reserve map for private mappings has the
                 * opposite meaning than that of shared mappings.  If NO
                 * entry is in the reserve map, it means a reservation exists.
                 * If an entry exists in the reserve map, it means the
                 * reservation has already been consumed.  As a result, the
                 * return value of this routine is the opposite of the
                 * value returned from reserve map manipulation routines above.
                 */
                if (ret)
                        return 0;
                else
                        return 1;
        }
        else
                /* Only errors are propagated here; any other result is 0. */
                return ret < 0 ? ret : 0;
}

/*
 * Run __vma_reservation_common() in VMA_NEEDS_RESV mode for the huge page at
 * @addr in @vma and return its result unchanged.
 */
static long vma_needs_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}

/*
 * Run __vma_reservation_common() in VMA_COMMIT_RESV mode for the huge page at
 * @addr in @vma and return its result unchanged.
 */
static long vma_commit_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}

/*
 * Run __vma_reservation_common() in VMA_END_RESV mode for the huge page at
 * @addr in @vma.  The result is deliberately discarded.
 */
static void vma_end_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}

/*
 * Run __vma_reservation_common() in VMA_ADD_RESV mode for the huge page at
 * @addr in @vma and return its result unchanged.
 */
static long vma_add_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}

/*
 * Restore a reservation on error paths in which a huge page obtained from
 * alloc_huge_page is about to be freed.
 *
 * If a reservation existed for the page, alloc_huge_page consumed it and set
 * PagePrivate on the new page.  free_huge_page will increment the global
 * reservation count when it sees PagePrivate, but it can not adjust the
 * reserve map.  This routine adjusts the reserve map so that it stays
 * consistent with the global reserve count adjustment free_huge_page is
 * going to make.
 */
static void restore_reserve_on_error(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address,
                        struct page *page)
{
        long rc;

        if (likely(!PagePrivate(page)))
                return;

        rc = vma_needs_reservation(h, vma, address);
        if (rc == 0) {
                vma_end_reservation(h, vma, address);
                return;
        }

        if (rc > 0)
                rc = vma_add_reservation(h, vma, address);

        /*
         * Rare out of memory condition in reserve map manipulation, hit by
         * either call above.  Clear PagePrivate so that free_huge_page will
         * not increment the global reserve count.  This makes it appear as
         * though the reservation for this page was consumed, which may
         * prevent the task from faulting in the page at a later time, but
         * that is better than inconsistent global huge page accounting of
         * reserve counts.
         */
        if (unlikely(rc < 0))
                ClearPagePrivate(page);
}

/*
 * Allocate a huge page for the fault at @addr in @vma.
 *
 * The reserve map, the subpool and the hugetlb cgroup are charged as needed,
 * then a page is taken from the free pool or, failing that, allocated as a
 * surplus page from the buddy allocator.  With @avoid_reserve set, neither
 * the vma's nor the subpool's reservations are used.
 *
 * Returns the page on success, ERR_PTR(-ENOMEM) when the reserve map lookup
 * fails, or ERR_PTR(-ENOSPC) when the subpool, a cgroup charge or the page
 * allocation fails.
 */
struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
{
        struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct page *page;
        long map_chg, map_commit;
        long gbl_chg;
        int ret, idx;
        struct hugetlb_cgroup *h_cg;
        bool deferred_reserve;

        idx = hstate_index(h);
        /*
         * Examine the region/reserve map to determine if the process
         * has a reservation for the page to be allocated.  A return
         * code of zero indicates a reservation exists (no change).
         */
        map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
        if (map_chg < 0)
                return ERR_PTR(-ENOMEM);

        /*
         * Processes that did not create the mapping will have no
         * reserves as indicated by the region/reserve map. Check
         * that the allocation will not exceed the subpool limit.
         * Allocations for MAP_NORESERVE mappings also need to be
         * checked against any subpool limit.
         */
        if (map_chg || avoid_reserve) {
                gbl_chg = hugepage_subpool_get_pages(spool, 1);
                if (gbl_chg < 0) {
                        vma_end_reservation(h, vma, addr);
                        return ERR_PTR(-ENOSPC);
                }

                /*
                 * Even though there was no reservation in the region/reserve
                 * map, there could be reservations associated with the
                 * subpool that can be used.  This would be indicated if the
                 * return value of hugepage_subpool_get_pages() is zero.
                 * However, if avoid_reserve is specified we still avoid even
                 * the subpool reservations.
                 */
                if (avoid_reserve)
                        gbl_chg = 1;
        }

        /*
         * If this allocation is not consuming a reservation, charge it now.
         */
        deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
        if (deferred_reserve) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
                if (ret)
                        goto out_subpool_put;
        }

        ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
        if (ret)
                goto out_uncharge_cgroup_reservation;

        spin_lock(&hugetlb_lock);
        /*
         * gbl_chg is passed to indicate whether or not a page must be taken
         * from the global free pool (global change).  gbl_chg == 0 indicates
         * a reservation exists for the allocation.
         */
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                /* Nothing dequeued: allocate from the buddy without the lock. */
                spin_unlock(&hugetlb_lock);
                page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
                spin_lock(&hugetlb_lock);
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
                        SetPagePrivate(page);
                        h->resv_huge_pages--;
                }
                list_add(&page->lru, &h->hugepage_activelist);
                /* Fall through */
        }
        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
        /*
         * If allocation is not consuming a reservation, also store the
         * hugetlb_cgroup pointer on the page.
         */
        if (deferred_reserve) {
                hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
                                                  h_cg, page);
        }

        spin_unlock(&hugetlb_lock);

        /* Store the subpool pointer in the page's private field. */
        set_page_private(page, (unsigned long)spool);

        map_commit = vma_commit_reservation(h, vma, addr);
        if (unlikely(map_chg > map_commit)) {
                /*
                 * The page was added to the reservation map between
                 * vma_needs_reservation and vma_commit_reservation.
                 * This indicates a race with hugetlb_reserve_pages.
                 * Adjust for the subpool count incremented above AND
                 * in hugetlb_reserve_pages for the same page.  Also,
                 * the reservation count added in hugetlb_reserve_pages
                 * no longer applies.
                 */
                long rsv_adjust;

                rsv_adjust = hugepage_subpool_put_pages(spool, 1);
                hugetlb_acct_memory(h, -rsv_adjust);
                if (deferred_reserve)
                        hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
                                        pages_per_huge_page(h), page);
        }
        return page;

        /* Error unwinding: undo the charges in reverse order of acquisition. */
out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
        if (deferred_reserve)
                hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
                                                    h_cg);
out_subpool_put:
        if (map_chg || avoid_reserve)
                hugepage_subpool_put_pages(spool, 1);
        vma_end_reservation(h, vma, addr);
        return ERR_PTR(-ENOSPC);
}

int alloc_bootmem_huge_page(struct hstate *h)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
{
        struct huge_bootmem_page *m;
        int nr_nodes, node;

        for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
                void *addr;

                addr = memblock_alloc_try_nid_raw(
                                huge_page_size(h), huge_page_size(h),
                                0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
                if (addr) {
                        /*
                         * Use the beginning of the huge page to store the
                         * huge_bootmem_page struct (until gather_bootmem
                         * puts them into the mem_map).
                         */
                        m = addr;
                        goto found;
                }
        }
        return 0;

found:
        BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
        /* Put them into a private list first because mem_map is not up yet */
        INIT_LIST_HEAD(&m->list);
        list_add(&m->list, &huge_boot_pages);
        m->hstate = h;
        return 1;
}

/*
 * Once mem_map is up, hand every page parked on huge_boot_pages over to
 * the standard huge page lists.
 * Note: This only applies to gigantic (order >= MAX_ORDER) pages.
 */
static void __init gather_bootmem_prealloc(void)
{
        struct huge_bootmem_page *bpage;

        list_for_each_entry(bpage, &huge_boot_pages, list) {
                struct hstate *h = bpage->hstate;
                struct page *page = virt_to_page(bpage);

                VM_BUG_ON(!hstate_is_gigantic(h));
                WARN_ON(page_count(page) != 1);
                prep_compound_gigantic_page(page, huge_page_order(h));
                WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
                /* dropping the reference frees it into the hugepage allocator */
                put_page(page);

                /*
                 * Give the 'stolen' pages back to totalram_pages; otherwise
                 * free(1) reports confusing numbers and there are other
                 * side-effects, like CommitLimit going negative.
                 */
                adjust_managed_page_count(page, pages_per_huge_page(h));
                cond_resched();
        }
}

/*
 * Try to allocate h->max_huge_pages huge pages for @h at boot.
 *
 * Gigantic hstates take their pages from alloc_bootmem_huge_page() (and are
 * skipped entirely when hugetlb_cma_size is set); all other hstates use
 * alloc_pool_huge_page().  If fewer pages than requested could be allocated,
 * a warning is printed and h->max_huge_pages is lowered to the number
 * actually obtained.
 */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
        nodemask_t *node_alloc_noretry = NULL;
        unsigned long i;

        /*
         * Bit mask controlling how hard we retry per-node allocations.
         * Only non-gigantic hstates use it; gigantic allocations are done at
         * boot time.  A failed kmalloc is tolerated because the lower level
         * routines can deal with node_alloc_noretry == NULL, and if this
         * kmalloc fails at boot time we are likely in bigger trouble.
         */
        if (!hstate_is_gigantic(h))
                node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
                                                GFP_KERNEL);
        if (node_alloc_noretry)
                nodes_clear(*node_alloc_noretry);

        for (i = 0; i < h->max_huge_pages; ++i) {
                int allocated;

                if (hstate_is_gigantic(h)) {
                        if (hugetlb_cma_size) {
                                pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
                                goto free;
                        }
                        allocated = alloc_bootmem_huge_page(h);
                } else {
                        allocated = alloc_pool_huge_page(h,
                                                &node_states[N_MEMORY],
                                                node_alloc_noretry);
                }
                if (!allocated)
                        break;
                cond_resched();
        }

        /* the loop stopped early: report and record what we really got */
        if (i < h->max_huge_pages) {
                char buf[32];

                string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
                pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
                        h->max_huge_pages, buf, i);
                h->max_huge_pages = i;
        }
free:
        kfree(node_alloc_noretry);
}

/*
 * Walk all hstates: track the smallest huge page order in minimum_order and
 * allocate the boot-time pool of every non-gigantic hstate.
 */
static void __init hugetlb_init_hstates(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                if (huge_page_order(h) < minimum_order)
                        minimum_order = huge_page_order(h);

                /* oversize hugepages were init'ed in early boot */
                if (hstate_is_gigantic(h))
                        continue;

                hugetlb_hstate_alloc_pages(h);
        }
        VM_BUG_ON(minimum_order == UINT_MAX);
}

/*
 * Log, for every registered hstate, its page size (in binary units) and the
 * number of huge pages currently free, i.e. pre-allocated at this point.
 */
static void __init report_hugepages(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                char buf[32];

                /* size the output by the buffer itself, not a repeated literal */
                string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf,
                                sizeof(buf));
                /*
                 * free_huge_pages is handled as an unsigned long by the sysfs
                 * show routines in this file, so print it with %lu, not %ld.
                 */
                pr_info("HugeTLB registered %s page size, pre-allocated %lu pages\n",
                        buf, h->free_huge_pages);
        }
}

#ifdef CONFIG_HIGHMEM
/*
 * Release free huge pages that are not in highmem, node by node over
 * @nodes_allowed, stopping as soon as h->nr_huge_pages has dropped to
 * @count.  Gigantic hstates are left untouched.
 */
static void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
{
        int nid;

        if (hstate_is_gigantic(h))
                return;

        for_each_node_mask(nid, *nodes_allowed) {
                struct list_head *freelist = &h->hugepage_freelists[nid];
                struct page *page, *tmp;

                list_for_each_entry_safe(page, tmp, freelist, lru) {
                        if (count >= h->nr_huge_pages)
                                return;
                        if (!PageHighMem(page)) {
                                list_del(&page->lru);
                                update_and_free_page(h, page);
                                h->free_huge_pages--;
                                h->free_huge_pages_node[page_to_nid(page)]--;
                        }
                }
        }
}
#else
/* Without CONFIG_HIGHMEM there is nothing to do: empty stub. */
static inline void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
{
}
#endif

/*
 * Add one page to (delta == 1) or remove one page from (delta == -1) the
 * surplus accounting of @h.  The node whose counter is touched is chosen by
 * the round-robin node iterators, which keeps the node-specific counters
 * balanced.
 * Returns 1 if an adjustment was made, 0 if no node in @nodes_allowed
 * qualified.
 */
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
                                int delta)
{
        int nr_nodes, node;
        bool hit = false;

        VM_BUG_ON(delta != -1 && delta != 1);

        if (delta >= 0) {
                /* growing surplus: need a node with non-surplus pages left */
                for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                        if (h->surplus_huge_pages_node[node] <
                                        h->nr_huge_pages_node[node]) {
                                hit = true;
                                break;
                        }
                }
        } else {
                /* shrinking surplus: need a node that has surplus pages */
                for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
                        if (h->surplus_huge_pages_node[node]) {
                                hit = true;
                                break;
                        }
                }
        }

        if (!hit)
                return 0;

        h->surplus_huge_pages += delta;
        h->surplus_huge_pages_node[node] += delta;
        return 1;
}

/*
 * Number of persistent (non-surplus) huge pages in the hstate pointed to by
 * @h.  The argument is parenthesized so that any pointer expression, not
 * just a plain identifier, expands correctly.
 */
#define persistent_huge_pages(h) ((h)->nr_huge_pages - (h)->surplus_huge_pages)
/*
 * Resize the persistent huge page pool of @h towards @count pages.
 *
 * @h:             hstate whose pool is adjusted
 * @count:         requested number of persistent huge pages; for a node
 *                 specific request (@nid != NUMA_NO_NODE) it is the per-node
 *                 target and is converted to a global target below
 * @nid:           node the request applies to, or NUMA_NO_NODE
 * @nodes_allowed: nodes on which pages may be allocated or freed
 *
 * Returns 0 on completion (including when growing stopped early because an
 * allocation failed or a signal is pending), -ENOMEM if the noretry node
 * mask could not be allocated, or -EINVAL when asked to grow a gigantic
 * pool without CONFIG_CONTIG_ALLOC.  hugetlb_lock is taken and released
 * internally; on the paths that reach "out", h->max_huge_pages is updated
 * to the resulting persistent pool size.
 */
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                              nodemask_t *nodes_allowed)
{
        unsigned long min_count, ret;
        NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);

        /*
         * Bit mask controlling how hard we retry per-node allocations.
         * If we can not allocate the bit mask, do not attempt to allocate
         * the requested huge pages.
         */
        if (node_alloc_noretry)
                nodes_clear(*node_alloc_noretry);
        else
                return -ENOMEM;

        spin_lock(&hugetlb_lock);

        /*
         * Check for a node specific request.
         * Changing node specific huge page count may require a corresponding
         * change to the global count.  In any case, the passed node mask
         * (nodes_allowed) will restrict alloc/free to the specified node.
         */
        if (nid != NUMA_NO_NODE) {
                unsigned long old_count = count;

                count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
                /*
                 * User may have specified a large count value which caused the
                 * above calculation to overflow.  In this case, they wanted
                 * to allocate as many huge pages as possible.  Set count to
                 * largest possible value to align with their intention.
                 */
                if (count < old_count)
                        count = ULONG_MAX;
        }

        /*
         * Gigantic pages runtime allocation depend on the capability for large
         * page range allocation.
         * If the system does not provide this feature, return an error when
         * the user tries to allocate gigantic pages but let the user free the
         * boottime allocated gigantic pages.
         */
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
                        spin_unlock(&hugetlb_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
                /* Fall through to decrease pool */
        }

        /*
         * Increase the pool size
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
         *
         * We might race with alloc_surplus_huge_page() here and be unable
         * to convert a surplus huge page to a normal huge page. That is
         * not critical, though, it just means the overall size of the
         * pool might be one hugepage larger than it needs to be, but
         * within all the constraints specified by the sysctls.
         */
        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                        break;
        }

        while (count > persistent_huge_pages(h)) {
                /*
                 * If this allocation races such that we no longer need the
                 * page, free_huge_page will handle it by freeing the page
                 * and reducing the surplus.
                 */
                /* the lock is dropped around the allocation and re-taken after */
                spin_unlock(&hugetlb_lock);

                /* yield cpu to avoid soft lockup */
                cond_resched();

                ret = alloc_pool_huge_page(h, nodes_allowed,
                                                node_alloc_noretry);
                spin_lock(&hugetlb_lock);
                /* allocation failed: stop growing and record what we have */
                if (!ret)
                        goto out;

                /* Bail for signals. Probably ctrl-c from user */
                if (signal_pending(current))
                        goto out;
        }

        /*
         * Decrease the pool size
         * First return free pages to the buddy allocator (being careful
         * to keep enough around to satisfy reservations).  Then place
         * pages into surplus state as needed so the pool will shrink
         * to the desired size as pages become free.
         *
         * By placing pages into the surplus state independent of the
         * overcommit value, we are allowing the surplus pool size to
         * exceed overcommit. There are few sane options here. Since
         * alloc_surplus_huge_page() is checking the global counter,
         * though, we'll note that we're not allowed to exceed surplus
         * and won't grow the pool anywhere else. Not until one of the
         * sysctls are changed, or the surplus pages go out of use.
         */
        /* pages that are reserved or in use set a floor for the shrink */
        min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
        min_count = max(count, min_count);
        try_to_free_low(h, min_count, nodes_allowed);
        while (min_count < persistent_huge_pages(h)) {
                if (!free_pool_huge_page(h, nodes_allowed, 0))
                        break;
                cond_resched_lock(&hugetlb_lock);
        }
        /* whatever could not be freed is moved into surplus state */
        while (count < persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                        break;
        }
out:
        h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock(&hugetlb_lock);

        NODEMASK_FREE(node_alloc_noretry);

        return 0;
}

/* Define a read-only kobj_attribute <_name>_attr via __ATTR_RO(_name). */
#define HSTATE_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define a mode 0644 kobj_attribute <_name>_attr wired to <_name>_show and
 * <_name>_store.
 */
#define HSTATE_ATTR(_name) \
        static struct kobj_attribute _name##_attr = \
                __ATTR(_name, 0644, _name##_show, _name##_store)

/* "hugepages" kobject created under mm_kobj by hugetlb_sysfs_init() */
static struct kobject *hugepages_kobj;
/* global per-hstate kobjects, indexed by hstate index */
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];

/* forward declaration: defined differently with and without CONFIG_NUMA */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);

/*
 * Map a sysfs kobject back to its hstate.
 *
 * The global hstate kobjects are searched first; on a match *nidp (if
 * non-NULL) is set to NUMA_NO_NODE.  Anything else is handed to
 * kobj_to_node_hstate() for the per-node lookup.
 */
static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
        int idx;

        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
                if (hstate_kobjs[idx] != kobj)
                        continue;

                if (nidp)
                        *nidp = NUMA_NO_NODE;
                return &hstates[idx];
        }

        return kobj_to_node_hstate(kobj, nidp);
}

/*
 * Shared "show" backend for nr_hugepages and nr_hugepages_mempolicy: prints
 * the global huge page count of the hstate, or the per-node count when the
 * kobject belongs to a node device.
 */
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long nr_huge_pages;

        nr_huge_pages = (nid == NUMA_NO_NODE) ? h->nr_huge_pages
                                              : h->nr_huge_pages_node[nid];

        return sprintf(buf, "%lu\n", nr_huge_pages);
}

/*
 * Pick the node mask for a pool resize request and pass it on to
 * set_max_huge_pages().
 *
 * Returns @len on success, -EINVAL for a gigantic hstate when
 * gigantic_page_runtime_supported() is false, or the error returned by
 * set_max_huge_pages().
 */
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                                           struct hstate *h, int nid,
                                           unsigned long count, size_t len)
{
        nodemask_t nodes_allowed;
        nodemask_t *n_mask = &nodes_allowed;
        int err;

        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return -EINVAL;

        if (nid != NUMA_NO_NODE) {
                /*
                 * Node specific request.  count adjustment happens in
                 * set_max_huge_pages() after acquiring hugetlb_lock.
                 */
                init_nodemask_of_node(&nodes_allowed, nid);
        } else if (!obey_mempolicy ||
                   !init_nodemask_of_mempolicy(&nodes_allowed)) {
                /*
                 * Global hstate attribute, and either the mempolicy is not
                 * to be obeyed or no mask could be built from it: use all
                 * nodes with memory.
                 */
                n_mask = &node_states[N_MEMORY];
        }

        err = set_max_huge_pages(h, count, nid, n_mask);
        if (err)
                return err;

        return len;
}

/*
 * Shared "store" backend for nr_hugepages and nr_hugepages_mempolicy: parse
 * the decimal page count in @buf, resolve the hstate/node behind @kobj and
 * apply the request.  Returns the kstrtoul() error on malformed input.
 */
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                                         struct kobject *kobj, const char *buf,
                                         size_t len)
{
        unsigned long count;
        struct hstate *h;
        int nid;
        int err = kstrtoul(buf, 10, &count);

        if (err)
                return err;

        h = kobj_to_hstate(kobj, &nid);

        return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}

/* sysfs "show" for nr_hugepages: forwards to nr_hugepages_show_common(). */
static ssize_t nr_hugepages_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        return nr_hugepages_show_common(kobj, attr, buf);
}

/*
 * sysfs "store" for nr_hugepages: resizes the pool without consulting the
 * task mempolicy (obey_mempolicy == false).
 */
static ssize_t nr_hugepages_store(struct kobject *kobj,
               struct kobj_attribute *attr, const char *buf, size_t len)
{
        return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);

#ifdef CONFIG_NUMA

/*
 * hstate attribute for optionally mempolicy-based constraint on persistent
 * huge page alloc/free.
 *
 * The "show" side is the same as nr_hugepages: it forwards to
 * nr_hugepages_show_common().
 */
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        return nr_hugepages_show_common(kobj, attr, buf);
}

/*
 * sysfs "store" for nr_hugepages_mempolicy: same as nr_hugepages_store() but
 * asks the common code to obey the mempolicy (obey_mempolicy == true).
 */
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
               struct kobj_attribute *attr, const char *buf, size_t len)
{
        return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif


/* sysfs "show": print h->nr_overcommit_huge_pages of the hstate behind @kobj. */
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        /* node id is not needed here, hence the NULL nidp */
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}

/*
 * sysfs "store": set h->nr_overcommit_huge_pages from the decimal value in
 * @buf.  Rejected with -EINVAL for gigantic hstates; a kstrtoul() error is
 * returned for malformed input.  The update is done under hugetlb_lock.
 */
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        unsigned long limit;
        int rc;

        if (hstate_is_gigantic(h))
                return -EINVAL;

        rc = kstrtoul(buf, 10, &limit);
        if (rc)
                return rc;

        spin_lock(&hugetlb_lock);
        h->nr_overcommit_huge_pages = limit;
        spin_unlock(&hugetlb_lock);

        return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);

/*
 * sysfs "show": print the number of free huge pages, either for the whole
 * hstate or for one node when @kobj belongs to a node device.
 */
static ssize_t free_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long free_huge_pages;

        free_huge_pages = (nid == NUMA_NO_NODE) ? h->free_huge_pages
                                                : h->free_huge_pages_node[nid];

        return sprintf(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);

/* sysfs "show": print h->resv_huge_pages of the hstate behind @kobj. */
static ssize_t resv_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        /* node id is not needed here, hence the NULL nidp */
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);

/*
 * sysfs "show": print the number of surplus huge pages, either for the
 * whole hstate or for one node when @kobj belongs to a node device.
 */
static ssize_t surplus_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long surplus_huge_pages;

        surplus_huge_pages = (nid == NUMA_NO_NODE) ?
                                h->surplus_huge_pages :
                                h->surplus_huge_pages_node[nid];

        return sprintf(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);

/*
 * Attributes attached to each global hstate kobject.  The mempolicy variant
 * only exists with CONFIG_NUMA.  The list is NULL terminated.
 */
static struct attribute *hstate_attrs[] = {
        &nr_hugepages_attr.attr,
        &nr_overcommit_hugepages_attr.attr,
        &free_hugepages_attr.attr,
        &resv_hugepages_attr.attr,
        &surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
        &nr_hugepages_mempolicy_attr.attr,
#endif
        NULL,
};

/* Attribute group passed to sysfs_create_group() for global hstates. */
static const struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
};

/*
 * Create the sysfs directory for @h under @parent and populate it with
 * @hstate_attr_group.  The new kobject is stored in @hstate_kobjs at the
 * hstate's index; on failure that slot is left NULL.
 *
 * Returns 0 on success, -ENOMEM if the kobject could not be created, or
 * the error from sysfs_create_group().
 */
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
                                    struct kobject **hstate_kobjs,
                                    const struct attribute_group *hstate_attr_group)
{
        int hi = hstate_index(h);
        struct kobject *kobj;
        int retval;

        kobj = kobject_create_and_add(h->name, parent);
        hstate_kobjs[hi] = kobj;
        if (!kobj)
                return -ENOMEM;

        retval = sysfs_create_group(kobj, hstate_attr_group);
        if (!retval)
                return 0;

        /* undo the kobject creation so no half-initialized entry remains */
        kobject_put(kobj);
        hstate_kobjs[hi] = NULL;

        return retval;
}

/*
 * Create the "hugepages" directory under mm_kobj and add one sub-directory
 * with the global attribute group for every hstate.  A failure for one
 * hstate is logged and the remaining hstates are still processed; if the
 * top-level kobject cannot be created nothing is registered.
 */
static void __init hugetlb_sysfs_init(void)
{
        struct hstate *h;
        int err;

        hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
        if (!hugepages_kobj)
                return;

        for_each_hstate(h) {
                err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
                                         hstate_kobjs, &hstate_attr_group);
                /* terminate the message with a newline, as the per-node variant does */
                if (err)
                        pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
        }
}

#ifdef CONFIG_NUMA

/*
 * node_hstate/s - associate per node hstate attributes, via their kobjects,
 * with node devices in node_devices[] using a parallel array.  The array
 * index of a node device or _hstate == node id.
 * This is here to avoid any static dependency of the node device driver, in
 * the base kernel, on the hugetlb module.
 */
struct node_hstate {
        /* "hugepages" directory under the node device; NULL if unregistered */
        struct kobject                *hugepages_kobj;
        /* per-hstate kobjects of this node, indexed by hstate index */
        struct kobject                *hstate_kobjs[HUGE_MAX_HSTATE];
};
/* one entry per possible node, indexed by node id */
static struct node_hstate node_hstates[MAX_NUMNODES];

/*
 * A subset of global hstate attributes for node devices.
 * The list is NULL terminated.
 */
static struct attribute *per_node_hstate_attrs[] = {
        &nr_hugepages_attr.attr,
        &free_hugepages_attr.attr,
        &surplus_hugepages_attr.attr,
        NULL,
};

/* Attribute group passed to sysfs_create_group() for per-node hstates. */
static const struct attribute_group per_node_hstate_attr_group = {
        .attrs = per_node_hstate_attrs,
};

/*
 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
 * Returns node id via non-NULL nidp.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
        int nid;

        for (nid = 0; nid < nr_node_ids; nid++) {
                struct node_hstate *nhs = &node_hstates[nid];
                int i;
                for (i = 0; i < HUGE_MAX_HSTATE; i++)
                        if (nhs->hstate_kobjs[i] == kobj) {
                                if (nidp)
                                        *nidp = nid;
                                return &hstates[i];
                        }
        }

        BUG();
        return NULL;
}

/*
 * Drop all hstate kobjects of one node device, then the node's "hugepages"
 * kobject itself.  Does nothing when the node has no hstate attributes.
 */
static void hugetlb_unregister_node(struct node *node)
{
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        struct hstate *h;

        /* no hstate attributes */
        if (!nhs->hugepages_kobj)
                return;

        for_each_hstate(h) {
                struct kobject **slot = &nhs->hstate_kobjs[hstate_index(h)];

                if (!*slot)
                        continue;

                kobject_put(*slot);
                *slot = NULL;
        }

        kobject_put(nhs->hugepages_kobj);
        nhs->hugepages_kobj = NULL;
}


/*
 * Create the "hugepages" directory and the per-hstate attribute groups for
 * one node device.  Does nothing if the node is already registered.  If
 * adding any hstate fails, everything registered for the node so far is
 * torn down again.
 */
static void hugetlb_register_node(struct node *node)
{
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        struct hstate *h;

        /* already allocated */
        if (nhs->hugepages_kobj)
                return;

        nhs->hugepages_kobj = kobject_create_and_add("hugepages",
                                                        &node->dev.kobj);
        if (!nhs->hugepages_kobj)
                return;

        for_each_hstate(h) {
                int err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
                                                   nhs->hstate_kobjs,
                                                   &per_node_hstate_attr_group);

                if (!err)
                        continue;

                pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
                        h->name, node->dev.id);
                hugetlb_unregister_node(node);
                return;
        }
}

/*
 * hugetlb init time:  attach hstate attributes to the node devices of all
 * nodes that have memory, then hook into the node device driver for later
 * hotplug events.  All on-line nodes should have registered their
 * associated device by this time.
 */
static void __init hugetlb_register_all_nodes(void)
{
        int nid;

        for_each_node_state(nid, N_MEMORY) {
                struct node *node = node_devices[nid];

                if (node->dev.id != nid)
                        continue;

                hugetlb_register_node(node);
        }

        /*
         * Tell the node device driver about us so that it can
         * [un]register hstate attributes on node hotplug.
         */
        register_hugetlbfs_with_node(hugetlb_register_node,
                                     hugetlb_unregister_node);
}
#else        /* !CONFIG_NUMA */

/*
 * !CONFIG_NUMA variant: kobj_to_hstate() only falls back to this lookup when
 * @kobj matched no global hstate kobject, and this variant treats any such
 * call as a bug.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
        BUG();
        /* NOTE(review): presumably only reached if BUG() does not stop execution — confirm */
        if (nidp)
                *nidp = -1;
        return NULL;
}

/* !CONFIG_NUMA variant: empty stub. */
static void hugetlb_register_all_nodes(void) { }

#endif

/*
 * Subsystem init: make sure the HPAGE_SIZE hstate exists, settle the
 * default hstate, allocate the boot-time pools, publish the sysfs and
 * cgroup interfaces and create the fault mutex table.
 *
 * Always returns 0.  When huge pages are not supported nothing is set up;
 * a warning is printed if hugetlb command-line parameters were given.
 */
static int __init hugetlb_init(void)
{
        int i;

        if (!hugepages_supported()) {
                if (hugetlb_max_hstate || default_hstate_max_huge_pages)
                        pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
                return 0;
        }

        /*
         * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
         * architectures depend on setup being done here.
         */
        hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
        if (!parsed_default_hugepagesz) {
                /*
                 * If we did not parse a default huge page size, set
                 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
                 * number of huge pages for this default size was implicitly
                 * specified, set that here as well.
                 * Note that the implicit setting will overwrite an explicit
                 * setting.  A warning will be printed in this case.
                 */
                default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
                if (default_hstate_max_huge_pages) {
                        if (default_hstate.max_huge_pages) {
                                char buf[32];

                                string_get_size(huge_page_size(&default_hstate),
                                        1, STRING_UNITS_2, buf, 32);
                                pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
                                        default_hstate.max_huge_pages, buf);
                                pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
                                        default_hstate_max_huge_pages);
                        }
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
                }
        }

        /* pool allocation first, then reporting, then the user interfaces */
        hugetlb_cma_check();
        hugetlb_init_hstates();
        gather_bootmem_prealloc();
        report_hugepages();

        hugetlb_sysfs_init();
        hugetlb_register_all_nodes();
        hugetlb_cgroup_file_init();

        /* size the fault mutex table: a power of two >= 8 per possible CPU on SMP */
#ifdef CONFIG_SMP
        num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
        num_fault_mutexes = 1;
#endif
        hugetlb_fault_mutex_table =
                kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
                              GFP_KERNEL);
        /* failing to allocate the table is treated as fatal */
        BUG_ON(!hugetlb_fault_mutex_table);

        for (i = 0; i < num_fault_mutexes; i++)
                mutex_init(&hugetlb_fault_mutex_table[i]);
        return 0;
}
subsys_initcall(hugetlb_init);

/*
 * Default (weak) check for whether @size is a supported huge page size:
 * only HPAGE_SIZE is accepted.
 * Overwritten by architectures with more huge page sizes.
 */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
        return size == HPAGE_SIZE;
}

/*
 * Register a new hstate for huge pages of the given @order.
 *
 * Nothing happens if an hstate of that size already exists.  Otherwise the
 * next free slot of hstates[] is initialized (counters, free/active lists,
 * round-robin node cursors, sysfs name) and recorded in parsed_hstate.
 * Running out of hstate slots or passing order 0 is a bug.
 */
void __init hugetlb_add_hstate(unsigned int order)
{
        struct hstate *h;
        unsigned long nid;

        if (size_to_hstate(PAGE_SIZE << order))
                return;

        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);

        /* claim the next free slot */
        h = &hstates[hugetlb_max_hstate];
        hugetlb_max_hstate++;

        h->order = order;
        h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
        h->nr_huge_pages = 0;
        h->free_huge_pages = 0;

        for (nid = 0; nid < MAX_NUMNODES; ++nid)
                INIT_LIST_HEAD(&h->hugepage_freelists[nid]);
        INIT_LIST_HEAD(&h->hugepage_activelist);

        h->next_nid_to_alloc = first_memory_node;
        h->next_nid_to_free = first_memory_node;

        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);

        parsed_hstate = h;
}

/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification.  If not, ignore the hugepages value.  hugepages can also
 * be the first huge page command line option in which case it implicitly
 * specifies the number of huge pages for the default size.
 */
static int __init hugepages_setup(char *s)
{
        static unsigned long *last_mhp;
        unsigned long *mhp;

        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                parsed_valid_hugepagesz = true;
                return 0;
        }

        /*
         * hugetlb_max_hstate == 0 means no hugepagesz= parameter has been
         * parsed yet, so this hugepages= parameter goes to the "default
         * hstate".  Otherwise, it goes with the previously parsed hugepagesz
         * or default_hugepagesz.
         */
        mhp = hugetlb_max_hstate ? &parsed_hstate->max_huge_pages
                                 : &default_hstate_max_huge_pages;

        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
                return 0;
        }

        /* an unparsable value counts as zero pages */
        if (sscanf(s, "%lu", mhp) <= 0)
                *mhp = 0;

        /*
         * Global state is always initialized later in hugetlb_init.
         * But >= MAX_ORDER hstates have to be allocated here, early, while
         * the bootmem allocator can still be used.
         */
        if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
                hugetlb_hstate_alloc_pages(parsed_hstate);

        last_mhp = mhp;

        return 1;
}
__setup("hugepages=", hugepages_setup);

/*
 * hugepagesz command line processing
 * A specific huge page size can only be specified once with hugepagesz.
 * hugepagesz is followed by hugepages on the command line.  The global
 * variable 'parsed_valid_hugepagesz' records whether the most recent
 * hugepagesz argument was valid.
 */
static int __init hugepagesz_setup(char *s)
{
        struct hstate *h;
        unsigned long size;

        parsed_valid_hugepagesz = false;
        size = (unsigned long)memparse(s, NULL);

        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
                return 0;
        }

        h = size_to_hstate(size);
        if (!h) {
                /* first time this size is seen: register a new hstate */
                hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
                parsed_valid_hugepagesz = true;
                return 1;
        }

        /*
         * hstate for this size already exists.  This is normally
         * an error, but is allowed if the existing hstate is the
         * default hstate.  More specifically, it is only allowed if
         * the number of huge pages for the default hstate was not
         * previously specified.
         */
        if (!parsed_default_hugepagesz ||  h != &default_hstate ||
            default_hstate.max_huge_pages) {
                pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
                return 0;
        }

        /*
         * The hstate already exists, so hugetlb_add_hstate() is not needed.
         * Still point parsed_hstate at it so that a following hugepages=
         * parameter will be applied to this hstate.
         */
        parsed_hstate = h;
        parsed_valid_hugepagesz = true;
        return 1;
}
__setup("hugepagesz=", hugepagesz_setup);

/*
 * default_hugepagesz command line input
 * Only one instance of default_hugepagesz allowed on command line.
 */
static int __init default_hugepagesz_setup(char *s)
{
        unsigned long size;

        parsed_valid_hugepagesz = false;

        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
                return 0;
        }

        size = (unsigned long)memparse(s, NULL);
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
                return 0;
        }

        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
        parsed_valid_hugepagesz = true;
        parsed_default_hugepagesz = true;
        default_hstate_idx = hstate_index(size_to_hstate(size));

        /*
         * The number of default huge pages (for this size) could have been
         * specified as the first hugetlb parameter: hugepages=X.  If so,
         * then default_hstate_max_huge_pages is set.  If the default huge
         * page size is gigantic (>= MAX_ORDER), then the pages must be
         * allocated here from bootmem allocator.
         */
        if (!default_hstate_max_huge_pages)
                return 1;

        default_hstate.max_huge_pages = default_hstate_max_huge_pages;
        if (hstate_is_gigantic(&default_hstate))
                hugetlb_hstate_alloc_pages(&default_hstate);
        /* the implicit count has been consumed */
        default_hstate_max_huge_pages = 0;

        return 1;
}
__setup("default_hugepagesz=", default_hugepagesz_setup);

/*
 * Count the free huge pages of @h on the nodes in
 * cpuset_current_mems_allowed.  When policy_nodemask_current() returns a
 * nodemask, only nodes that are also set in that mask are counted.
 */
static unsigned int allowed_mems_nr(struct hstate *h)
{
        gfp_t gfp_mask = htlb_alloc_mask(h);
        nodemask_t *mpol_allowed = policy_nodemask_current(gfp_mask);
        unsigned int *free_per_node = h->free_huge_pages_node;
        unsigned int total = 0;
        int nid;

        for_each_node_mask(nid, cpuset_current_mems_allowed) {
                /* A NULL policy mask places no further restriction. */
                if (mpol_allowed && !node_isset(nid, *mpol_allowed))
                        continue;

                total += free_per_node[nid];
        }

        return total;
}

#ifdef CONFIG_SYSCTL
/*
 * Run proc_doulongvec_minmax() against @out instead of @table->data.
 *
 * The caller's @table is never modified: a copy on the stack gets its
 * ->data pointed at @out, which avoids races with
 * __do_proc_doulongvec_minmax().
 *
 * Returns whatever proc_doulongvec_minmax() returns.
 */
static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
                                          void *buffer, size_t *length,
                                          loff_t *ppos, unsigned long *out)
{
        struct ctl_table private_table = *table;

        private_table.data = out;

        return proc_doulongvec_minmax(&private_table, write, buffer, length,
                                      ppos);
}

/*
 * Shared body of the sysctl handlers in this file that act on the default
 * hstate's max_huge_pages.
 *
 * The current value is exchanged with user space through a local copy.  On
 * a successful write the parsed value is handed to
 * __nr_hugepages_store_common() with NUMA_NO_NODE, and that function's
 * result is returned.
 *
 * Returns -EOPNOTSUPP when huge pages are not supported, otherwise the
 * result of the proc helper (reads, or failed parses) or of the store.
 */
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
                         struct ctl_table *table, int write,
                         void *buffer, size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long nr_pages = h->max_huge_pages;
        int err;

        if (!hugepages_supported())
                return -EOPNOTSUPP;

        err = proc_hugetlb_doulongvec_minmax(table, write, buffer, length,
                                             ppos, &nr_pages);
        /* Nothing more to do for a read or a failed parse. */
        if (err || !write)
                return err;

        return __nr_hugepages_store_common(obey_mempolicy, h, NUMA_NO_NODE,
                                           nr_pages, *length);
}

/*
 * Sysctl handler: hugetlb_sysctl_handler_common() with obey_mempolicy
 * set to false.
 */
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
{
        const bool obey_mempolicy = false;

        return hugetlb_sysctl_handler_common(obey_mempolicy, table, write,
                                             buffer, length, ppos);
}

#ifdef CONFIG_NUMA
/*
 * Sysctl handler: hugetlb_sysctl_handler_common() with obey_mempolicy
 * set to true.
 */
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
{
        const bool obey_mempolicy = true;

        return hugetlb_sysctl_handler_common(obey_mempolicy, table, write,
                                             buffer, length, ppos);
}
#endif /* CONFIG_NUMA */

/*
 * Sysctl handler for the default hstate's nr_overcommit_huge_pages.
 *
 * Writes are refused with -EINVAL when the default hstate is gigantic.  A
 * successfully parsed write is stored under hugetlb_lock.
 *
 * Returns -EOPNOTSUPP when huge pages are not supported, -EINVAL as above,
 * otherwise the result of the proc helper.
 */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long nr_overcommit;
        int err;

        if (!hugepages_supported())
                return -EOPNOTSUPP;

        if (write && hstate_is_gigantic(h))
                return -EINVAL;

        /* Exchange the value with user space through a local copy. */
        nr_overcommit = h->nr_overcommit_huge_pages;
        err = proc_hugetlb_doulongvec_minmax(table, write, buffer, length,
                                             ppos, &nr_overcommit);
        if (err || !write)
                return err;

        spin_lock(&hugetlb_lock);
        h->nr_overcommit_huge_pages = nr_overcommit;
        spin_unlock(&hugetlb_lock);

        return 0;
}

#endif /* CONFIG_SYSCTL */

/*
 * Emit the hugetlb lines into seq_file @m.
 *
 * The per-pool counters (HugePages_* and Hugepagesize) are printed for the
 * default hstate only.  The final "Hugetlb:" line is the memory held by the
 * pools of every hstate, in kB.  Nothing is printed when huge pages are not
 * supported.
 */
void hugetlb_report_meminfo(struct seq_file *m)
{
        unsigned long total_bytes = 0;
        struct hstate *h;

        if (!hugepages_supported())
                return;

        for_each_hstate(h) {
                unsigned long nr = h->nr_huge_pages;
                unsigned long hpage_bytes = PAGE_SIZE << huge_page_order(h);

                total_bytes += hpage_bytes * nr;

                /* Detailed counters are reported for the default size only. */
                if (h != &default_hstate)
                        continue;

                seq_printf(m,
                           "HugePages_Total:   %5lu\n"
                           "HugePages_Free:    %5lu\n"
                           "HugePages_Rsvd:    %5lu\n"
                           "HugePages_Surp:    %5lu\n"
                           "Hugepagesize:   %8lu kB\n",
                           nr,
                           h->free_huge_pages,
                           h->resv_huge_pages,
                           h->surplus_huge_pages,
                           hpage_bytes / 1024);
        }

        seq_printf(m, "Hugetlb:        %8lu kB\n", total_bytes / 1024);
}

/*
 * Emit the default hstate's total, free and surplus huge page counts for
 * node @nid into @buf at offset @len, using sysfs_emit_at().
 *
 * Returns 0 when huge pages are not supported, otherwise the return value
 * of sysfs_emit_at().
 */
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        struct hstate *h;

        if (!hugepages_supported())
                return 0;

        h = &default_hstate;

        return sysfs_emit_at(buf, len,
                             "Node %d HugePages_Total: %5u\n"
                             "Node %d HugePages_Free:  %5u\n"
                             "Node %d HugePages_Surp:  %5u\n",
                             nid, h->nr_huge_pages_node[nid],
                             nid, h->free_huge_pages_node[nid],
                             nid, h->surplus_huge_pages_node[nid]);
}

void hugetlb_show_meminfo(void)
{
        struct hstate *h;
        int nid;

        if (!hugepages_supported())
                return;

        for_each_node_state(nid, N_MEMORY)
                for_each_hstate(h)
                        pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
                                nid,
                                h->nr_huge_pages_node[nid],
                                h->free_huge_pages_node[nid],
                                h->surplus_huge_pages_node[nid],
                                1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}

/*
 * Print @mm's hugetlb_usage counter to seq_file @m as "HugetlbPages", after
 * converting it from PAGE_SIZE units to kB.
 */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
        long nr_base_pages = atomic_long_read(&mm->hugetlb_usage);

        seq_printf(m, "HugetlbPages:\t%8lu kB\n",
                   nr_base_pages << (PAGE_SHIFT - 10));
}

/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        struct hstate *h;
        unsigned long nr_total_pages = 0;

        for_each_hstate(h)
                nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
        return nr_total_pages;
}

/*
 * Adjust the huge page accounting of @h by @delta pages, under hugetlb_lock.
 *
 * A positive @delta gathers surplus pages and then checks the request
 * against allowed_mems_nr(); a negative @delta returns unused surplus
 * pages; zero does nothing.
 *
 * Returns 0 on success, or -ENOMEM when a positive @delta cannot be
 * satisfied.
 */
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
        int ret = 0;

        spin_lock(&hugetlb_lock);
        /*
         * With cpusets configured, strict hugetlb page reservation breaks
         * down because the accounting is kept in a global variable: the
         * reservation is not checked against the pages available to the
         * current cpuset, so an application can still be OOM-killed for lack
         * of free htlb pages in the cpuset it runs in.  Enforcing strict
         * accounting per cpuset is almost impossible (or too ugly) because
         * cpusets are fluid: tasks and memory nodes can be moved between
         * them dynamically.
         *
         * Changing the semantics of shared hugetlb mappings under cpusets is
         * undesirable.  To preserve some of the semantics we fall back to
         * checking against current free page availability, as a best attempt
         * that hopefully minimizes the impact.
         *
         * Memory policy also determines which nodes the kernel allocates from
         * on a NUMA system, so the current task's memory policy is taken
         * into account in the same way as the cpuset.
         */
        if (delta > 0) {
                if (gather_surplus_pages(h, delta) < 0) {
                        ret = -ENOMEM;
                } else if (delta > allowed_mems_nr(h)) {
                        /* Not enough free pages on allowed nodes: undo. */
                        return_unused_surplus_pages(h, delta);
                        ret = -ENOMEM;
                }
        } else if (delta < 0) {
                return_unused_surplus_pages(h, (unsigned long) -delta);
        }

        spin_unlock(&hugetlb_lock);
        return ret;
}

/*
 * vm_operations_struct ->open hook for hugetlb VMAs.
 *
 * If @vma has a reservation map and is flagged HPAGE_RESV_OWNER, duplicate
 * the map's hugetlb cgroup uncharge info and take another reference on the
 * map.  Otherwise do nothing.
 */
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
        struct resv_map *map = vma_resv_map(vma);

        if (!map || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;

        /*
         * This new VMA should share its sibling's reservation map if present.
         * A VMA only ever has a valid reservation map pointer when it is
         * being copied from another, still existing VMA.  That VMA holds a
         * reference to the map, so the map cannot disappear until after this
         * open call completes, and taking a new reference here needs no
         * additional locking.
         */
        resv_map_dup_hugetlb_cgroup_uncharge_info(map);
        kref_get(&map->refs);
}

/*
 * vm_operations_struct ->close hook for hugetlb VMAs.
 *
 * Only acts on VMAs that have a reservation map and are flagged
 * HPAGE_RESV_OWNER.  For those it works out how many huge pages in the
 * VMA's range are not counted by region_count(), releases that many from
 * the subpool and the global accounting, and drops the VMA's reference on
 * the reservation map.
 */
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
        struct hstate *h = hstate_vma(vma);
        struct resv_map *map = vma_resv_map(vma);
        struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long first, last, unused;
        long gbl_reserve;

        if (!map || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;

        first = vma_hugecache_offset(h, vma, vma->vm_start);
        last = vma_hugecache_offset(h, vma, vma->vm_end);

        unused = (last - first) - region_count(map, first, last);
        hugetlb_cgroup_uncharge_counter(map, first, last);

        if (unused) {
                /*
                 * Decrement reserve counts.  The global reserve count may be
                 * adjusted if the subpool has a minimum size.
                 */
                gbl_reserve = hugepage_subpool_put_pages(spool, unused);
                hugetlb_acct_memory(h, -gbl_reserve);
        }

        kref_put(&map->refs, resv_map_release);
}

/*
 * vm_operations_struct ->split hook: a hugetlb VMA may only be split at an
 * address aligned to its huge page size.
 *
 * Returns 0 when @addr is suitably aligned, -EINVAL otherwise.
 */
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long misalignment = addr & ~(huge_page_mask(hstate_vma(vma)));

        return misalignment ? -EINVAL : 0;
}

/*
 * Unshare PMDs around @addr when splitting @vma there would break PUD_SIZE
 * alignment.
 *
 * PMD sharing is only possible for PUD_SIZE-aligned address ranges in
 * HugeTLB VMAs.  If the split at @addr loses that alignment, the PMDs in
 * the PUD_SIZE interval surrounding @addr are unshared now, provided the
 * interval lies entirely inside @vma.
 *
 * Called in the middle of a VMA split operation, with MM, VMA and rmap all
 * write-locked to prevent concurrent page table walks (except hardware and
 * gup_fast()).
 */
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long floor, ceil;

        mmap_assert_write_locked(vma->vm_mm);
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);

        /* A PUD_SIZE-aligned split point keeps alignment on both sides. */
        if (!(addr & ~PUD_MASK))
                return;

        floor = addr & PUD_MASK;
        ceil = floor + PUD_SIZE;

        /* Only act when the whole surrounding interval lies inside the VMA. */
        if (floor < vma->vm_start || ceil > vma->vm_end)
                return;

        /*
         * Locking:
         * Use take_locks=false here.
         * The file rmap lock is already held.
         * The hugetlb VMA lock can't be taken when we already hold the file
         * rmap lock, and we don't need it because its purpose is to
         * synchronize against concurrent page table walks, which are not
         * possible thanks to the locks held by our caller.
         */
        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
}

/*
 * vm_operations_struct ->pagesize hook: the size in bytes of the huge pages
 * backing @vma, computed as 1 << huge_page_shift() of its hstate.
 */
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
        return 1UL << huge_page_shift(hstate_vma(vma));
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
        BUG();
        /* Follows BUG(); supplies a value for the vm_fault_t return type. */
        return 0;
}

/*
 * When a new function is introduced to vm_operations_struct and added
 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
 * This is because under System V memory model, mappings created via
 * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
 * their original vm_ops are overwritten with shm_vm_ops.
 */
/* VMA operations table wiring up the hugetlb_vm_op_*() hooks in this file. */
const struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,           /* BUG()s: must never be reached */
        .open = hugetlb_vm_op_open,             /* takes a resv_map reference */
        .close = hugetlb_vm_op_close,           /* releases reserves, drops the reference */
        .split = hugetlb_vm_op_split,           /* rejects non huge-page-aligned splits */
        .pagesize = hugetlb_vm_op_pagesize,     /* huge page size in bytes */
};

/*
 * Build a huge PTE value mapping @page with @vma's page protection.
 *
 * A non-zero @writable yields an entry that is marked dirty and writable;
 * otherwise the entry is write-protected.  In both cases the entry is then
 * marked young and huge and passed through arch_make_huge_pte().
 *
 * Returns the constructed entry; no page table is touched here.
 */
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
{
        pte_t entry = mk_huge_pte(page, vma->vm_page_prot);

        if (writable)
                entry = huge_pte_mkwrite(huge_pte_mkdirty(entry));
        else
                entry = huge_pte_wrprotect(entry);

        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return arch_make_huge_pte(entry, vma, page, writable);
}

/*
 * Mark the huge PTE at @ptep dirty and writable.  update_mmu_cache() is
 * called when huge_ptep_set_access_flags() returns non-zero.
 */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry = huge_ptep_get(ptep);

        entry = huge_pte_mkdirty(entry);
        entry = huge_pte_mkwrite(entry);

        if (!huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
                return;

        update_mmu_cache(vma, address, ptep);
}

/*
 * Return true when @pte is neither none nor present and its swap entry is
 * a migration entry; false otherwise.
 */
bool is_hugetlb_entry_migration(pte_t pte)
{
        if (huge_pte_none(pte) || pte_present(pte))
                return false;

        return is_migration_entry(pte_to_swp_entry(pte));
}

/*
 * Return true when @pte is neither none nor present and its swap entry is
 * a hwpoison entry; false otherwise.
 */
static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
        if (huge_pte_none(pte) || pte_present(pte))
                return false;

        return is_hwpoison_entry(pte_to_swp_entry(pte));
}

/*
 * Copy the huge page table entries covering @vma from @src to @dst.
 *
 * The in-line comments speak of "parent and child", so this is presumably
 * the fork-time copy (confirm against the caller).  For a private mapping
 * that may be written (VM_MAYWRITE set, VM_SHARED clear) the source entries
 * are write-protected as they are copied, so both mms end up mapping the
 * same pages read-only.
 *
 * Returns 0 on success, or -ENOMEM if a destination page table entry cannot
 * be allocated; entries copied before the failure are left in place.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;

        /* COW applies only to private mappings that may become writable. */
        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        if (cow) {
                /* Source entries will be write-protected: tell notifiers. */
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
                                        vma->vm_start,
                                        vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
        } else {
                /*
                 * For shared mappings i_mmap_rwsem must be held to call
                 * huge_pte_alloc, otherwise the returned ptep could go
                 * away if part of a shared pmd and another thread calls
                 * huge_pmd_unshare.
                 */
                i_mmap_lock_read(mapping);
        }

        /* Walk the VMA one huge page at a time. */
        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
                /* No source page table here: nothing to copy. */
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
                }

#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
                /* If the pagetables are shared, there is nothing to do */
                if (atomic_read(&virt_to_page(dst_pte)->pt_share_count))
                        continue;
#endif

                /*
                 * Lock order: destination first, then source.  The source
                 * lock is taken with SINGLE_DEPTH_NESTING as the nested one.
                 */
                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
                if (huge_pte_none(entry)) {
                        /*
                         * Skip if src entry none.
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        /* Non-present entry: copy the swap-style entry. */
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);

                        if (is_write_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                 */
                                make_migration_entry_read(&swp_entry);
                                entry = swp_entry_to_pte(swp_entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
                        }
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
                } else {
                        if (cow) {
                                /*
                                 * No need to notify as we are downgrading page
                                 * table protection not changing it to point
                                 * to a new page.
                                 *
                                 * See Documentation/vm/mmu_notifier.rst
                                 */
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                        }
                        /* Re-read so the copy carries any write-protection. */
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        /* The new mapping holds its own reference and rmap. */
                        get_page(ptepage);
                        page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(pages_per_huge_page(h), dst);
                }
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
        }

        /* Undo whichever of the two setups was done before the loop. */
        if (cow)
                mmu_notifier_invalidate_range_end(&range);
        else
                i_mmap_unlock_read(mapping);

        return ret;
}

/*
 * Unmap the huge pages mapped in [@start, @end) of @vma, gathering the TLB
 * work and the pages to release in @tlb.
 *
 * @start and @end must be aligned to the huge page size of @vma (enforced
 * with BUG_ON).  When @ref_page is non-NULL only a mapping of that specific
 * page is removed, @vma is flagged HPAGE_RESV_UNMAPPED, and the walk stops
 * after it.
 *
 * unmap_ref_private() notes that this variant is called with the mapping
 * lock already held; no i_mmap lock is taken here.
 */
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                            unsigned long start, unsigned long end,
                            struct page *ref_page)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        spinlock_t *ptl;
        struct page *page;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mmu_notifier_range range;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));

        /*
         * This is a hugetlb vma, all the pte entries should point
         * to huge page.
         */
        tlb_change_page_size(tlb, sz);
        tlb_start_vma(tlb, vma);

        /*
         * If sharing possible, alert mmu notifiers of worst case.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
                                end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        mmu_notifier_invalidate_range_start(&range);
        address = start;
        for (; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address, sz);
                /* No page table at this address: nothing mapped. */
                if (!ptep)
                        continue;

                ptl = huge_pte_lock(h, mm, ptep);
                /*
                 * NOTE(review): huge_pmd_unshare() is handed &address, so it
                 * can presumably move the walk position when it drops a
                 * shared PMD -- confirm against its definition.
                 */
                if (huge_pmd_unshare(tlb, vma, &address, ptep)) {
                        spin_unlock(ptl);
                        continue;
                }

                pte = huge_ptep_get(ptep);
                if (huge_pte_none(pte)) {
                        spin_unlock(ptl);
                        continue;
                }

                /*
                 * Migrating hugepage or HWPoisoned hugepage is already
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
                        huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }

                page = pte_page(pte);
                /*
                 * If a reference page is supplied, it is because a specific
                 * page is being unmapped, not a range. Ensure the page we
                 * are about to unmap is the actual page of interest.
                 */
                if (ref_page) {
                        if (page != ref_page) {
                                spin_unlock(ptl);
                                continue;
                        }
                        /*
                         * Mark the VMA as having unmapped its page so that
                         * future faults in this VMA will fail rather than
                         * looking like data was lost
                         */
                        set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
                }

                /* Clear the entry and queue its TLB invalidation. */
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                /* Carry the dirty bit of the cleared entry over to the page. */
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);

                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, true);

                spin_unlock(ptl);
                /* Queue the page for release via the mmu_gather. */
                tlb_remove_page_size(tlb, page, huge_page_size(h));
                /*
                 * Bail out after unmapping reference page if supplied
                 */
                if (ref_page)
                        break;
        }
        mmu_notifier_invalidate_range_end(&range);
        tlb_end_vma(tlb, vma);

        huge_pmd_unshare_flush(tlb, vma);
}

/*
 * Unmap [@start, @end) of @vma with __unmap_hugepage_range() and then clear
 * VM_MAYSHARE in @vma's flags.  Per the comment below, this is for a VMA
 * that is about to be destroyed, with i_mmap_rwsem held by the caller.
 */
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
{
        __unmap_hugepage_range(tlb, vma, start, end, ref_page);

        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
         * test will fail on a vma being torn down, and not grab a page table
         * on its way out.  We're lucky that the flag has such an appropriate
         * name, and can in fact be safely cleared here. We could clear it
         * before the __unmap_hugepage_range above, but all that's necessary
         * is to clear it before releasing the i_mmap_rwsem. This works
         * because in the context this is called, the VMA is about to be
         * destroyed and the i_mmap_rwsem is held.
         */
        vma->vm_flags &= ~VM_MAYSHARE;
}

/*
 * Unmap [@start, @end) of @vma using a local mmu_gather.
 *
 * Wraps __unmap_hugepage_range() between tlb_gather_mmu() and
 * tlb_finish_mmu().  The range given to the gather is first widened by
 * adjust_range_if_pmd_sharing_possible(); the range actually unmapped is
 * still [@start, @end).
 */
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long flush_start = start;
        unsigned long flush_end = end;
        struct mmu_gather tlb;

        /*
         * If shared PMDs were possibly used within this vma range, adjust
         * start/end for worst case tlb flushing.  We cannot be sure whether
         * PMDs are shared until we try to unmap pages, but the TLB flush
         * must cover the largest possible range.
         */
        adjust_range_if_pmd_sharing_possible(vma, &flush_start, &flush_end);

        tlb_gather_mmu(&tlb, mm, flush_start, flush_end);
        __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
        tlb_finish_mmu(&tlb, flush_start, flush_end);
}

/*
 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 * mapping it owns the reserve page for. The intention is to unmap the page
 * from other VMAs and let the children be SIGKILLed if they are faulting the
 * same region.
 *
 * @page is unmapped at @address (rounded down to a huge page boundary) from
 * every VMA of the backing file that overlaps that offset, except @vma
 * itself, VM_MAYSHARE VMAs, and VMAs flagged HPAGE_RESV_OWNER.
 */
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                              struct page *page, unsigned long address)
{
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct vm_area_struct *other;
        unsigned long hpage_end;
        pgoff_t pgoff;

        address &= huge_page_mask(h);
        hpage_end = address + huge_page_size(h);

        /*
         * vm_pgoff is in PAGE_SIZE units, hence the different calculation
         * from page cache lookup which is in HPAGE_SIZE units.
         */
        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        /*
         * Take the mapping lock for the duration of the table walk, as this
         * mapping should be shared between all the VMAs.
         */
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(other, &mapping->i_mmap, pgoff, pgoff) {
                /* Do not unmap the current VMA */
                if (other == vma)
                        continue;

                /*
                 * Shared VMAs have their own reserves and do not affect
                 * MAP_PRIVATE accounting but it is possible that a shared
                 * VMA is using the same page so check and skip such VMAs.
                 */
                if (other->vm_flags & VM_MAYSHARE)
                        continue;

                /* VMAs that own reserves keep their mapping. */
                if (is_vma_resv_set(other, HPAGE_RESV_OWNER))
                        continue;

                /*
                 * Unmap the page from other VMAs without their own reserves.
                 * They get marked to be SIGKILLed if they fault in these
                 * areas. This is because a future no-page fault on this VMA
                 * could insert a zeroed page instead of the data existing
                 * from the time of fork. This would look like data corruption
                 */
                unmap_hugepage_range(other, address, hpage_end, page);
        }
        i_mmap_unlock_write(mapping);
}

/*
 * Hugetlb_cow() should be called with page lock of the original hugepage held.
 * Called with hugetlb_instantiation_mutex held and pte_page locked so we
 * cannot race with other handlers or page migration.
 * Keep the pte_same checks anyway to make transition from the mutex easier.
 *
 * NOTE(review): the body drops and retakes an entry of
 * hugetlb_fault_mutex_table[] and i_mmap_rwsem (read); the mutex name above
 * looks historical -- confirm.
 *
 * Break copy-on-write for the huge page mapped at @ptep.  If the old page
 * is anonymous and mapped exactly once, it is simply made writable in
 * place.  Otherwise a new huge page is allocated, the old contents are
 * copied into it, and the new page is installed writable.
 *
 * @ptl is held on entry and is held again on every return path, although
 * it is dropped in between.
 *
 * Returns 0 on success or when a racing page table update made the work
 * unnecessary, VM_FAULT_OOM if anon_vma_prepare() fails, or the
 * vmf_error() translation of the allocation error.
 */
static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                       unsigned long address, pte_t *ptep,
                       struct page *pagecache_page, spinlock_t *ptl)
{
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        int outside_reserve = 0;
        vm_fault_t ret = 0;
        /* Fault address rounded down to the start of its huge page. */
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;

        /* Snapshot of the entry; later pte_same() checks compare against it. */
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);

retry_avoidcopy:
        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
                page_move_anon_rmap(old_page, vma);
                set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }

        /*
         * If the process that created a MAP_PRIVATE mapping is about to
         * perform a COW due to a shared page count, attempt to satisfy
         * the allocation without using the existing reserves. The pagecache
         * page is used to determine if the reserve at this address was
         * consumed or not. If reserves were used, a partial faulted mapping
         * at the time of fork() could consume its reserves on COW instead
         * of the full address range.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
                        old_page != pagecache_page)
                outside_reserve = 1;

        /* Pin the old page; ptl is about to be dropped. */
        get_page(old_page);

        /*
         * Drop page table lock as buddy allocator may be called. It will
         * be acquired again before returning to the caller, as expected.
         */
        spin_unlock(ptl);
        new_page = alloc_huge_page(vma, haddr, outside_reserve);

        if (IS_ERR(new_page)) {
                /*
                 * If a process owning a MAP_PRIVATE mapping fails to COW,
                 * it is due to references held by a child and an insufficient
                 * huge page pool. To guarantee the original mappers
                 * reliability, unmap the page from child processes. The child
                 * may get SIGKILLed if it later faults.
                 */
                if (outside_reserve) {
                        struct address_space *mapping = vma->vm_file->f_mapping;
                        pgoff_t idx;
                        u32 hash;

                        /* Drop the pin taken above before unmapping. */
                        put_page(old_page);
                        BUG_ON(huge_pte_none(pte));
                        /*
                         * Drop hugetlb_fault_mutex and i_mmap_rwsem before
                         * unmapping.  unmapping needs to hold i_mmap_rwsem
                         * in write mode.  Dropping i_mmap_rwsem in read mode
                         * here is OK as COW mappings do not interact with
                         * PMD sharing.
                         *
                         * Reacquire both after unmap operation.
                         */
                        idx = vma_hugecache_offset(h, vma, haddr);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);

                        unmap_ref_private(mm, vma, old_page, haddr);

                        /* Retake the locks in the reverse order of release. */
                        i_mmap_lock_read(mapping);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        spin_lock(ptl);
                        /* The entry may have changed while unlocked: recheck. */
                        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
                        if (likely(ptep &&
                                   pte_same(huge_ptep_get(ptep), pte)))
                                goto retry_avoidcopy;
                        /*
                         * race occurs while re-acquiring page table
                         * lock, and our job is done.
                         */
                        return 0;
                }

                ret = vmf_error(PTR_ERR(new_page));
                goto out_release_old;
        }

        /*
         * When the original hugepage is shared one, it does not have
         * anon_vma prepared.
         */
        if (unlikely(anon_vma_prepare(vma))) {
                ret = VM_FAULT_OOM;
                goto out_release_all;
        }

        copy_user_huge_page(new_page, old_page, address, vma,
                            pages_per_huge_page(h));
        __SetPageUptodate(new_page);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
                                haddr + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Retake the page table lock to check for racing updates
         * before the page tables are altered
         */
        spin_lock(ptl);
        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
                ClearPagePrivate(new_page);

                /* Break COW */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                set_huge_pte_at(mm, haddr, ptep,
                                make_huge_pte(vma, new_page, 1));
                page_remove_rmap(old_page, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_page_huge_active(new_page);
                /* Make the old page be freed below */
                new_page = old_page;
        }
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
out_release_all:
        /*
         * new_page is either the unused new page (copy lost the race or
         * anon_vma_prepare() failed) or, after a successful install, the
         * old page.
         */
        restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
out_release_old:
        /* Drop the pin taken before ptl was released. */
        put_page(old_page);

        spin_lock(ptl); /* Caller expects lock to be held */
        return ret;
}

/*
 * Return the pagecache page at a given address within a VMA.
 *
 * The lookup goes through find_lock_page() on the VMA's backing file,
 * using the huge-page index of @address; its result is returned as is.
 */
static struct page *hugetlbfs_pagecache_page(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
{
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = vma_hugecache_offset(h, vma, address);

        return find_lock_page(mapping, idx);
}

/*
 * Return whether there is a pagecache page to back given address within VMA.
 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
 */
static bool hugetlbfs_pagecache_present(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
{
        struct address_space *mapping;
        pgoff_t idx;
        struct page *page;

        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);

        page = find_get_page(mapping, idx);
        if (page)
                put_page(page);
        return page != NULL;
}

/*
 * Insert huge @page into @mapping at index @idx and account its blocks to
 * the owning inode.
 *
 * Returns 0 on success, or the error reported by add_to_page_cache(), in
 * which case neither the page flags nor the inode are touched.
 */
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                           pgoff_t idx)
{
        struct inode *host = mapping->host;
        struct hstate *h = hstate_inode(host);
        int rc;

        rc = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
        if (rc)
                return rc;

        ClearPagePrivate(page);

        /*
         * Mark the page dirty so that code paths which are not hugetlbfs
         * specific will not remove it from the cache/file.
         */
        set_page_dirty(page);

        /* i_blocks is updated under the inode's i_lock. */
        spin_lock(&host->i_lock);
        host->i_blocks += blocks_per_huge_page(h);
        spin_unlock(&host->i_lock);

        return 0;
}

/*
 * Handle a fault on a huge pte that is none: find the backing page in the
 * page cache at @idx or allocate a fresh one, then map it at the huge page
 * aligned address derived from @address.
 *
 * The in-file caller hugetlb_fault() enters with the i_mmap_rwsem read lock
 * of @mapping and the fault mutex for (@mapping, @idx) held.  Both are
 * dropped in here before returning, on every path: through the "out" label
 * or, for userfaultfd, just before handle_userfault().
 *
 * Returns 0 when a page was mapped or when the pte turned out to be
 * populated by someone else; otherwise a VM_FAULT_* code (VM_FAULT_SIGBUS is
 * the default for the early exits), or whatever handle_userfault() returns.
 */
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
                        unsigned long address, pte_t *ptep, unsigned int flags)
{
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        int anon_rmap = 0;
        unsigned long size;
        struct page *page;
        pte_t new_pte;
        spinlock_t *ptl;
        unsigned long haddr = address & huge_page_mask(h);
        bool new_page = false;
        /* Same hash the caller used; needed to unlock the fault mutex. */
        u32 hash = hugetlb_fault_mutex_hash(mapping, idx);

        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
         * COW. Warn that such a situation has occurred as it may not be obvious
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
                goto out;
        }

        /*
         * We can not race with truncation due to holding i_mmap_rwsem.
         * i_size is modified when holding i_mmap_rwsem, so check here
         * once for faults beyond end of file.
         */
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        if (idx >= size)
                goto out;

retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                /*
                 * Check for page in userfault range
                 */
                if (userfaultfd_missing(vma)) {
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = haddr,
                                .flags = flags,
                                /*
                                 * Hard to debug if it ends up being
                                 * used by a callee that assumes
                                 * something about the other
                                 * uninitialized fields... same as in
                                 * memory.c
                                 */
                        };

                        /*
                         * i_mmap_rwsem and the hugetlb fault mutex must be
                         * dropped before handling userfault. Also mmap_lock
                         * will be dropped during handling userfault, any vma
                         * operation should be careful from here.
                         */
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);
                        return handle_userfault(&vmf, VM_UFFD_MISSING);
                }

                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
                        /*
                         * Returning error will result in faulting task being
                         * sent SIGBUS.  The hugetlb fault mutex prevents two
                         * tasks from racing to fault in the same page which
                         * could result in false unable to allocate errors.
                         * Page migration does not take the fault mutex, but
                         * does a clear then write of pte's under page table
                         * lock.  Page fault code could race with migration,
                         * notice the clear pte and try to allocate a page
                         * here.  Before returning error, get ptl and make
                         * sure there really is no pte entry.
                         */
                        ptl = huge_pte_lock(h, mm, ptep);
                        if (!huge_pte_none(huge_ptep_get(ptep))) {
                                ret = 0;
                                spin_unlock(ptl);
                                goto out;
                        }
                        spin_unlock(ptl);
                        ret = vmf_error(PTR_ERR(page));
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
                __SetPageUptodate(page);
                new_page = true;

                if (vma->vm_flags & VM_MAYSHARE) {
                        /*
                         * NOTE(review): the common exit paths unlock_page()
                         * this page, so this branch relies on
                         * add_to_page_cache() leaving it locked - confirm.
                         */
                        int err = huge_add_to_page_cache(page, mapping, idx);
                        if (err) {
                                put_page(page);
                                /*
                                 * -EEXIST: presumably a page is already in
                                 * the cache at idx; redo the lookup.
                                 */
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }
                } else {
                        /*
                         * Private mapping: the page stays out of the page
                         * cache.  Lock it so the exit paths below can
                         * unlock_page() it like a page cache page.
                         */
                        lock_page(page);
                        if (unlikely(anon_vma_prepare(vma))) {
                                ret = VM_FAULT_OOM;
                                goto backout_unlocked;
                        }
                        anon_rmap = 1;
                }
        } else {
                /*
                 * If memory error occurs between mmap() and fault, some process
                 * don't have hwpoisoned swap entry for errored virtual address.
                 * So we need to block hugepage fault by PG_hwpoison bit check.
                 */
                if (unlikely(PageHWPoison(page))) {
                        ret = VM_FAULT_HWPOISON_LARGE |
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }
        }

        /*
         * If we are going to COW a private mapping later, we examine the
         * pending reservations for this page now. This will ensure that
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
        }

        ptl = huge_pte_lock(h, mm, ptep);
        ret = 0;
        /*
         * The pte got populated while the page table lock was not held:
         * back our page out and return 0 rather than an error.
         */
        if (!huge_pte_none(huge_ptep_get(ptep)))
                goto backout;

        if (anon_rmap) {
                ClearPagePrivate(page);
                hugepage_add_new_anon_rmap(page, vma, haddr);
        } else
                page_dup_rmap(page, true);
        /*
         * Only shared writable mappings get a writable pte here; a write
         * fault on a private mapping goes through hugetlb_cow() below.
         */
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, haddr, ptep, new_pte);

        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
        }

        spin_unlock(ptl);

        /*
         * Only make newly allocated pages active.  Existing pages found
         * in the pagecache could be !page_huge_active() if they have been
         * isolated for migration.
         */
        if (new_page)
                set_page_huge_active(page);

        unlock_page(page);
out:
        /* Release the locks taken by the caller on its behalf. */
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        i_mmap_unlock_read(mapping);
        return ret;

backout:
        spin_unlock(ptl);
backout_unlocked:
        /* Page is locked and referenced here but was not mapped. */
        unlock_page(page);
        restore_reserve_on_error(h, vma, haddr, page);
        put_page(page);
        goto out;
}

#ifdef CONFIG_SMP
/*
 * Map a (mapping, idx) pair to an index into hugetlb_fault_mutex_table.
 *
 * The pointer value of @mapping and @idx are hashed together with jhash2()
 * and the result is reduced by masking with num_fault_mutexes - 1.
 * NOTE(review): the mask assumes num_fault_mutexes is a power of two -
 * confirm where it is initialised.
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
        unsigned long key[2] = { (unsigned long) mapping, idx };
        u32 hash = jhash2((u32 *)&key, sizeof(key) / sizeof(u32), 0);

        return hash & (num_fault_mutexes - 1);
}
#else
/*
 * Uniprocessor systems always use a single fault mutex, so skip the
 * hashing overhead and hand back index 0.
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
        return 0;
}
#endif

/*
 * Handle a page fault at @address inside hugetlb @vma.
 *
 * A lockless peek at the pte first deals with migration entries (wait, then
 * return 0 so the fault is retried) and hwpoisoned entries (return
 * VM_FAULT_HWPOISON_LARGE).  After that the i_mmap_rwsem read lock and the
 * fault mutex for this (mapping, index) are taken and:
 *  - a none pte is handed to hugetlb_no_page(), which releases both locks;
 *  - a non-present pte returns 0 and relies on a second fault;
 *  - a present pte gets hugetlb_cow() for a write to a read-only entry, or
 *    otherwise has its young/dirty bits updated.
 *
 * @flags: FAULT_FLAG_* bits; only FAULT_FLAG_WRITE is examined here, the
 *         rest is passed on to hugetlb_no_page().
 *
 * Returns 0 or a VM_FAULT_* code.
 */
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
{
        pte_t *ptep, entry;
        spinlock_t *ptl;
        vm_fault_t ret;
        u32 hash;
        pgoff_t idx;
        struct page *page = NULL;
        struct page *pagecache_page = NULL;
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
        int need_wait_lock = 0;
        unsigned long haddr = address & huge_page_mask(h);

        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
        if (ptep) {
                /*
                 * Since we hold no locks, ptep could be stale.  That is
                 * OK as we are only making decisions based on content and
                 * not actually modifying content here.
                 */
                entry = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_migration(entry))) {
                        migration_entry_wait_huge(vma, mm, ptep);
                        return 0;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                        return VM_FAULT_HWPOISON_LARGE |
                                VM_FAULT_SET_HINDEX(hstate_index(h));
        }

        /*
         * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
         * until finished with ptep.  This serves two purposes:
         * 1) It prevents huge_pmd_unshare from being called elsewhere
         *    and making the ptep no longer valid.
         * 2) It synchronizes us with i_size modifications during truncation.
         *
         * ptep could have already been assigned via huge_pte_offset.  That
         * is OK, as huge_pte_alloc will return the same value unless
         * something has changed.
         */
        mapping = vma->vm_file->f_mapping;
        i_mmap_lock_read(mapping);
        ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
        if (!ptep) {
                i_mmap_unlock_read(mapping);
                return VM_FAULT_OOM;
        }

        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
        idx = vma_hugecache_offset(h, vma, haddr);
        hash = hugetlb_fault_mutex_hash(mapping, idx);
        mutex_lock(&hugetlb_fault_mutex_table[hash]);

        entry = huge_ptep_get(ptep);
        if (huge_pte_none(entry))
                /*
                 * hugetlb_no_page will drop i_mmap_rwsem and the hugetlb
                 * fault mutex internally, which makes us return immediately.
                 */
                return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);

        ret = 0;

        /*
         * entry could be a migration/hwpoison entry at this point, so this
         * check prevents the kernel from going below assuming that we have
         * an active hugepage in pagecache. This goto expects the 2nd page
         * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
         * properly handle it.
         */
        if (!pte_present(entry))
                goto out_mutex;

        /*
         * If we are going to COW the mapping later, we examine the pending
         * reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
        if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);

                if (!(vma->vm_flags & VM_MAYSHARE))
                        pagecache_page = hugetlbfs_pagecache_page(h,
                                                                vma, haddr);
        }

        ptl = huge_pte_lock(h, mm, ptep);

        /* Check for a racing update before calling hugetlb_cow */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_ptl;

        /*
         * hugetlb_cow() requires page locks of pte_page(entry) and
         * pagecache_page, so here we need take the former one
         * when page != pagecache_page or !pagecache_page.
         */
        page = pte_page(entry);
        if (page != pagecache_page)
                /*
                 * Only a trylock is possible with ptl held; on failure
                 * give up and wait for the page lock after all locks have
                 * been dropped (see need_wait_lock below).
                 */
                if (!trylock_page(page)) {
                        need_wait_lock = 1;
                        goto out_ptl;
                }

        get_page(page);

        if (flags & FAULT_FLAG_WRITE) {
                if (!huge_pte_write(entry)) {
                        ret = hugetlb_cow(mm, vma, address, ptep,
                                          pagecache_page, ptl);
                        goto out_put_page;
                }
                entry = huge_pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
                                                flags & FAULT_FLAG_WRITE))
                update_mmu_cache(vma, haddr, ptep);
out_put_page:
        /* page == pagecache_page is unlocked once, further down. */
        if (page != pagecache_page)
                unlock_page(page);
        put_page(page);
out_ptl:
        spin_unlock(ptl);

        if (pagecache_page) {
                unlock_page(pagecache_page);
                put_page(pagecache_page);
        }
out_mutex:
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        i_mmap_unlock_read(mapping);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
         * the page is not used after unlocked before returning from the current
         * page fault. So we are safe from accessing freed page, even if we wait
         * here without taking refcount.
         */
        if (need_wait_lock)
                wait_on_page_locked(page);
        return ret;
}

/*
 * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
 * modifications for huge pages.
 *
 * Installs one huge page at @dst_addr in @dst_vma through @dst_pte, filled
 * from user memory at @src_addr.
 *
 * @pagep is an in/out parameter:
 *  - *@pagep == NULL on entry: a huge page is allocated and filled with
 *    copy_huge_page_from_user().  If that copy fails, the page is handed
 *    back in *@pagep (not freed) and -ENOENT is returned so the caller can
 *    do the copy outside mmap_lock and call again.
 *  - *@pagep != NULL on entry: that page is used as is and *@pagep is reset
 *    to NULL.
 *
 * For a shared mapping the page is also added to the page cache; as noted
 * further down, the caller must hold the matching hugetlb fault mutex.
 *
 * Returns 0 on success, -EEXIST if a page cache page or a pte is already
 * there, -ENOMEM if no huge page could be allocated, -ENOENT as described
 * above, -EFAULT if the index lies at or beyond i_size, or the error from
 * huge_add_to_page_cache().
 */
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            pte_t *dst_pte,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            struct page **pagep)
{
        struct address_space *mapping;
        pgoff_t idx;
        unsigned long size;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
        struct hstate *h = hstate_vma(dst_vma);
        pte_t _dst_pte;
        spinlock_t *ptl;
        int ret;
        struct page *page;

        if (!*pagep) {
                /* If a page already exists, then it's UFFDIO_COPY for
                 * a non-missing case. Return -EEXIST.
                 */
                if (vm_shared &&
                    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
                        ret = -EEXIST;
                        goto out;
                }

                page = alloc_huge_page(dst_vma, dst_addr, 0);
                if (IS_ERR(page)) {
                        ret = -ENOMEM;
                        goto out;
                }

                ret = copy_huge_page_from_user(page,
                                                (const void __user *) src_addr,
                                                pages_per_huge_page(h), false);

                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
                        *pagep = page;
                        /* don't free the page */
                        goto out;
                }
        } else {
                /* Second call: take over the page handed back earlier. */
                page = *pagep;
                *pagep = NULL;
        }

        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __SetPageUptodate(page);

        mapping = dst_vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, dst_vma, dst_addr);

        /*
         * If shared, add to page cache
         */
        if (vm_shared) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                ret = -EFAULT;
                if (idx >= size)
                        goto out_release_nounlock;

                /*
                 * Serialization between remove_inode_hugepages() and
                 * huge_add_to_page_cache() below happens through the
                 * hugetlb_fault_mutex_table that here must be held by
                 * the caller.
                 */
                ret = huge_add_to_page_cache(page, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
        }

        ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
        spin_lock(ptl);

        /*
         * Recheck the i_size after holding PT lock to make sure not
         * to leave any page mapped (as page_mapped()) beyond the end
         * of the i_size (remove_inode_hugepages() is strict about
         * enforcing that). If we bail out here, we'll also leave a
         * page in the radix tree in the vm_shared case beyond the end
         * of the i_size, but remove_inode_hugepages() will take care
         * of it as soon as we drop the hugetlb_fault_mutex_table.
         */
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        ret = -EFAULT;
        if (idx >= size)
                goto out_release_unlock;

        /* Someone else populated the pte in the meantime. */
        ret = -EEXIST;
        if (!huge_pte_none(huge_ptep_get(dst_pte)))
                goto out_release_unlock;

        if (vm_shared) {
                page_dup_rmap(page, true);
        } else {
                ClearPagePrivate(page);
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }

        /* Writable mappings get a writable, dirty pte right away. */
        _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
        if (dst_vma->vm_flags & VM_WRITE)
                _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);

        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
                                        dst_vma->vm_flags & VM_WRITE);
        hugetlb_count_add(pages_per_huge_page(h), dst_mm);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);

        spin_unlock(ptl);
        set_page_huge_active(page);
        /*
         * Only the shared path is unlocked here.
         * NOTE(review): presumably the page lock stems from
         * add_to_page_cache() inside huge_add_to_page_cache() - confirm.
         */
        if (vm_shared)
                unlock_page(page);
        ret = 0;
out:
        return ret;
out_release_unlock:
        spin_unlock(ptl);
        if (vm_shared)
                unlock_page(page);
out_release_nounlock:
        put_page(page);
        goto out;
}

/*
 * Walk hugetlb @vma from *@position and collect up to *@nr_pages base
 * (PAGE_SIZE) pages, calling hugetlb_fault() where the pte is empty, is a
 * swap-type entry, or is not writable for a FOLL_WRITE request.
 *
 * @pages:    if non-NULL, pages[i] receives each base page, grabbed via
 *            try_grab_page()
 * @vmas:     if non-NULL, vmas[i] is set to @vma for each base page
 * @position: in: first address to look at; out: address the walk reached
 * @nr_pages: in: number of base pages wanted; out: number still outstanding
 *            (set to 0 when the walk is abandoned)
 * @i:        index of the first slot to fill in @pages/@vmas; its running
 *            value is what gets returned
 * @flags:    FOLL_* flags (FOLL_WRITE, FOLL_DUMP, FOLL_NOWAIT and FOLL_TRIED
 *            are examined here)
 * @locked:   if non-NULL, faults are issued with FAULT_FLAG_ALLOW_RETRY |
 *            FAULT_FLAG_KILLABLE; *@locked is cleared when hugetlb_fault()
 *            returns VM_FAULT_RETRY and FAULT_FLAG_RETRY_NOWAIT was not set
 *
 * Returns the updated @i if it is non-zero, otherwise a negative errno
 * (-EFAULT unless a failed fault or a failed page grab set another value).
 * On VM_FAULT_RETRY, @i is returned as is, even if it is zero, and
 * *@position is left untouched.
 */
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
                         long i, unsigned int flags, int *locked)
{
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
        unsigned long remainder = *nr_pages;
        struct hstate *h = hstate_vma(vma);
        int err = -EFAULT;

        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                spinlock_t *ptl = NULL;
                int absent;
                struct page *page;

                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (fatal_signal_pending(current)) {
                        remainder = 0;
                        break;
                }

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
                 *
                 * Note that page table lock is not held when pte is null.
                 */
                pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
                                      huge_page_size(h));
                if (pte)
                        ptl = huge_pte_lock(h, mm, pte);
                absent = !pte || huge_pte_none(huge_ptep_get(pte));

                /*
                 * When coredumping, it suits get_dump_page if we just return
                 * an error where there's an empty slot with no huge pagecache
                 * to back it.  This way, we avoid allocating a hugepage, and
                 * the sparse dumpfile avoids allocating disk blocks, but its
                 * huge holes still show up with zeroes where they need to be.
                 */
                if (absent && (flags & FOLL_DUMP) &&
                    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
                        if (pte)
                                spin_unlock(ptl);
                        remainder = 0;
                        break;
                }

                /*
                 * We need call hugetlb_fault for both hugepages under migration
                 * (in which case hugetlb_fault waits for the migration,) and
                 * hwpoisoned hugepages (in which case we need to prevent the
                 * caller from accessing to them.) In order to do this, we use
                 * here is_swap_pte instead of is_hugetlb_entry_migration and
                 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
                 * both cases, and because we can't follow correct pages
                 * directly from any kind of swap entries.
                 */
                if (absent || is_swap_pte(huge_ptep_get(pte)) ||
                    ((flags & FOLL_WRITE) &&
                      !huge_pte_write(huge_ptep_get(pte)))) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;

                        /* hugetlb_fault() is called without the ptl held. */
                        if (pte)
                                spin_unlock(ptl);
                        /* Translate the FOLL_* request into FAULT_FLAG_*. */
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
                        if (locked)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
                        if (flags & FOLL_NOWAIT)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_RETRY_NOWAIT;
                        if (flags & FOLL_TRIED) {
                                /*
                                 * Note: FAULT_FLAG_ALLOW_RETRY and
                                 * FAULT_FLAG_TRIED can co-exist
                                 */
                                fault_flags |= FAULT_FLAG_TRIED;
                        }
                        ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
                        if (ret & VM_FAULT_ERROR) {
                                err = vm_fault_to_errno(ret, flags);
                                remainder = 0;
                                break;
                        }
                        if (ret & VM_FAULT_RETRY) {
                                if (locked &&
                                    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
                                        *locked = 0;
                                *nr_pages = 0;
                                /*
                                 * VM_FAULT_RETRY must not return an
                                 * error, it will return zero
                                 * instead.
                                 *
                                 * No need to update "position" as the
                                 * caller will not check it after
                                 * *nr_pages is set to 0.
                                 */
                                return i;
                        }
                        /* Fault handled: look at the same address again. */
                        continue;
                }

                /* Index of the base page within the huge page. */
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));

                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
                 */
                if (!pages && !vmas && !pfn_offset &&
                    (vaddr + huge_page_size(h) < vma->vm_end) &&
                    (remainder >= pages_per_huge_page(h))) {
                        vaddr += huge_page_size(h);
                        remainder -= pages_per_huge_page(h);
                        i += pages_per_huge_page(h);
                        spin_unlock(ptl);
                        continue;
                }

same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
                        /*
                         * try_grab_page() should always succeed here, because:
                         * a) we hold the ptl lock, and b) we've just checked
                         * that the huge page is present in the page tables. If
                         * the huge page is present, then the tail pages must
                         * also be present. The ptl prevents the head page and
                         * tail pages from being rearranged in any way. So this
                         * page must be available at this point, unless the page
                         * refcount overflowed:
                         */
                        if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
                                spin_unlock(ptl);
                                remainder = 0;
                                err = -ENOMEM;
                                break;
                        }
                }

                if (vmas)
                        vmas[i] = vma;

                /* Advance by one base page. */
                vaddr += PAGE_SIZE;
                ++pfn_offset;
                --remainder;
                ++i;
                if (vaddr < vma->vm_end && remainder &&
                                pfn_offset < pages_per_huge_page(h)) {
                        /*
                         * We use pfn_offset to avoid touching the pageframes
                         * of this compound page.
                         */
                        goto same_page;
                }
                spin_unlock(ptl);
        }
        *nr_pages = remainder;
        /*
         * setting position is actually required only if remainder is
         * not zero but it's faster not to add a "if (remainder)"
         * branch.
         */
        *position = vaddr;

        return i ? i : err;
}

#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
/*
 * Generic fallback: flushing a hugetlb range is a plain flush_tlb_range().
 *
 * ARCHes with special requirements for evicting HUGETLB backing TLB entries
 * can implement their own flush_hugetlb_tlb_range() and define
 * __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE to suppress this definition.
 */
#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#endif

/*
 * Apply @newprot to the huge ptes mapping [@address, @end) in @vma.
 *
 * The walk runs under the i_mmap_rwsem write lock of the backing file,
 * inside an MMU_NOTIFY_PROTECTION_VMA notifier range, and steps one huge
 * page at a time.  For each slot with a page table entry:
 *  - a shared PMD is unshared (huge_pmd_unshare()) and counted;
 *  - hwpoison entries are left alone;
 *  - write migration entries are downgraded to read and counted, other
 *    migration entries are left alone;
 *  - any remaining non-none pte is rewritten with @newprot and counted.
 *
 * @address must be below @end (BUG_ON otherwise).
 *
 * Returns the number of counted entries shifted left by h->order, i.e.
 * expressed in base pages.
 */
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        unsigned long pages = 0;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        /*
         * In the case of shared PMDs, the area to flush could be beyond
         * start/end.  Set range.start/range.end to cover the maximum possible
         * range if PMD sharing is possible.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
                                0, vma, mm, start, end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);

        BUG_ON(address >= end);
        flush_cache_range(vma, range.start, range.end);
        tlb_gather_mmu_vma(&tlb, vma, range.start, range.end);

        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (; address < end; address += huge_page_size(h)) {
                spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address, huge_page_size(h));
                /* Nothing mapped at this address. */
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                /*
                 * address is passed by pointer, so huge_pmd_unshare() can
                 * adjust the loop position.
                 */
                if (huge_pmd_unshare(&tlb, vma, &address, ptep)) {
                        pages++;
                        spin_unlock(ptl);
                        continue;
                }
                pte = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
                        spin_unlock(ptl);
                        continue;
                }
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);

                        if (is_write_migration_entry(entry)) {
                                pte_t newpte;

                                /* Downgrade the write entry to read. */
                                make_migration_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
                                set_huge_swap_pte_at(mm, address, ptep,
                                                     newpte, huge_page_size(h));
                                pages++;
                        }
                        spin_unlock(ptl);
                        continue;
                }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;

                        /* start/commit pair brackets the pte rewrite. */
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
                        pte = arch_make_huge_pte(pte, vma, NULL, 0);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
                        tlb_remove_huge_tlb_entry(h, &tlb, ptep, address);
                }
                spin_unlock(ptl);

                cond_resched();
        }

        /* Flush what was gathered before i_mmap_rwsem is released. */
        tlb_flush_mmu_tlbonly(&tlb);
        huge_pmd_unshare_flush(&tlb, vma);
        /*
         * No need to call mmu_notifier_invalidate_range() we are downgrading
         * page table protection not changing it to point to a new page.
         *
         * See Documentation/vm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb, range.start, range.end);

        /* Convert the count of huge entries into base pages. */
        return pages << h->order;
}

/*
 * hugetlb_reserve_pages - reserve huge pages for the range [from, to)
 * @inode:    hugetlbfs inode the range belongs to.
 * @from:     first index of the range.
 * @to:       end index of the range (exclusive); "to - from" is used
 *            directly as a count of huge pages.
 * @vma:      mapping the reservation is made for, or NULL (treated like a
 *            shared mapping).
 * @vm_flags: flags of the mapping; VM_NORESERVE skips the reservation.
 *
 * The reservation is charged, in order, against the hugetlb cgroup, the
 * subpool and the global pool.  Any failure unwinds the steps already taken.
 *
 * Return true if reservation was successful, false otherwise.
 */
bool hugetlb_reserve_pages(struct inode *inode,
                                        long from, long to,
                                        struct vm_area_struct *vma,
                                        vm_flags_t vm_flags)
{
        long chg, add = -1;
        struct hstate *h = hstate_inode(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map;
        struct hugetlb_cgroup *h_cg = NULL;
        long gbl_reserve, regions_needed = 0;

        /* This should never happen */
        if (from > to) {
                VM_WARN(1, "%s called with a negative range\n", __func__);
                return false;
        }

        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
         * without using reserves
         */
        if (vm_flags & VM_NORESERVE)
                return true;

        /*
         * Shared mappings base their reservation on the number of pages that
         * are already allocated on behalf of the file. Private mappings need
         * to reserve the full area even if read-only as mprotect() may be
         * called to make the mapping read-write. Assume !vma is a shm mapping
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
                /*
                 * resv_map can not be NULL as hugetlb_reserve_pages is only
                 * called for inodes for which resv_maps were created (see
                 * hugetlbfs_get_inode).
                 */
                resv_map = inode_resv_map(inode);

                chg = region_chg(resv_map, from, to, &regions_needed);

        } else {
                /* Private mapping. */
                resv_map = resv_map_alloc();
                if (!resv_map)
                        return false;

                chg = to - from;

                set_vma_resv_map(vma, resv_map);
                set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }

        if (chg < 0)
                goto out_err;

        if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
                                chg * pages_per_huge_page(h), &h_cg) < 0)
                goto out_err;

        if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
                /* For private mappings, the hugetlb_cgroup uncharge info hangs
                 * off the resv_map.
                 */
                resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
        }

        /*
         * There must be enough pages in the subpool for the mapping. If
         * the subpool has a minimum size, there may be some global
         * reservations already in place (gbl_reserve).
         */
        gbl_reserve = hugepage_subpool_get_pages(spool, chg);
        if (gbl_reserve < 0)
                goto out_uncharge_cgroup;

        /*
         * Check enough hugepages are available for the reservation.
         * Hand the pages back to the subpool if there are not
         */
        if (hugetlb_acct_memory(h, gbl_reserve) < 0)
                goto out_put_pages;

        /*
         * Account for the reservations made. Shared mappings record regions
         * that have reservations as they are shared by multiple VMAs.
         * When the last VMA disappears, the region map says how much
         * the reservation was and the page cache tells how much of
         * the reservation was consumed. Private mappings are per-VMA and
         * only the consumed reservations are tracked. When the VMA
         * disappears, the original reservation is the VMA size and the
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
                add = region_add(resv_map, from, to, regions_needed, h, h_cg);

                if (unlikely(add < 0)) {
                        hugetlb_acct_memory(h, -gbl_reserve);
                        goto out_put_pages;
                } else if (unlikely(chg > add)) {
                        /*
                         * pages in this range were added to the reserve
                         * map between region_chg and region_add.  This
                         * indicates a race with alloc_huge_page.  Adjust
                         * the subpool and reserve counts modified above
                         * based on the difference.
                         */
                        long rsv_adjust;

                        /*
                         * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
                         * reference to h_cg->css. See comment below for detail.
                         */
                        hugetlb_cgroup_uncharge_cgroup_rsvd(
                                hstate_index(h),
                                (chg - add) * pages_per_huge_page(h), h_cg);

                        rsv_adjust = hugepage_subpool_put_pages(spool,
                                                                chg - add);
                        hugetlb_acct_memory(h, -rsv_adjust);
                } else if (h_cg) {
                        /*
                         * The file_regions will hold their own reference to
                         * h_cg->css. So we should release the reference held
                         * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
                         * done.
                         */
                        hugetlb_cgroup_put_rsvd_cgroup(h_cg);
                }
        }
        return true;

out_put_pages:
        /* put back original number of pages, chg */
        (void)hugepage_subpool_put_pages(spool, chg);
out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
out_err:
        if (!vma || vma->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
                 */
                if (chg >= 0 && add < 0)
                        region_abort(resv_map, from, to, regions_needed);
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                kref_put(&resv_map->refs, resv_map_release);
                /*
                 * The map installed with set_vma_resv_map() above has just
                 * had its reference dropped.  Clear the VMA's pointer so
                 * that nothing can follow it to a released resv_map.
                 */
                set_vma_resv_map(vma, NULL);
        }
        return false;
}

/*
 * Drop the reservations recorded for [start, end) of @inode and account for
 * @freed huge pages that were removed from it.
 *
 * Returns 0 on success, or the negative value reported by region_del() when
 * the reserve map could not be updated.
 */
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                                long freed)
{
        struct hstate *h = hstate_inode(inode);
        struct resv_map *resv_map = inode_resv_map(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        long nr_unreserved = 0;
        long gbl_reserve;

        /*
         * Every hugetlbfs inode can come through here on the evict inode
         * path, so a NULL resv_map has to be tolerated.
         */
        if (resv_map) {
                /*
                 * In the rare case where a region must be split and no new
                 * region descriptor can be allocated, region_del() fails.
                 * It will not fail if end == LONG_MAX.
                 */
                nr_unreserved = region_del(resv_map, start, end);
                if (nr_unreserved < 0)
                        return nr_unreserved;
        }

        spin_lock(&inode->i_lock);
        inode->i_blocks -= freed * blocks_per_huge_page(h);
        spin_unlock(&inode->i_lock);

        /*
         * A subpool with a minimum size may adjust how many global
         * reservations actually get released.
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, nr_unreserved - freed);
        hugetlb_acct_memory(h, -gbl_reserve);

        return 0;
}

#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
/*
 * Check whether the PMD page table that @svma uses for file offset @idx can
 * be shared with @vma at @addr.
 *
 * Returns the address in @svma corresponding to @idx when sharing is
 * possible, 0 otherwise.
 */
static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                struct vm_area_struct *vma,
                                unsigned long addr, pgoff_t idx)
{
        /* Address at which @svma maps file offset @idx. */
        unsigned long saddr = svma->vm_start +
                                ((idx - svma->vm_pgoff) << PAGE_SHIFT);
        /* PUD_SIZE aligned window around saddr. */
        unsigned long pud_start = saddr & PUD_MASK;
        unsigned long pud_end = pud_start + PUD_SIZE;
        /* Flags are compared with the bits in VM_LOCKED_CLEAR_MASK only, so
         * segments may share if only one of them is marked locked.
         */
        unsigned long flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
        unsigned long sflags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;

        /* Both addresses must select the same slot of the pmd table. */
        if (pmd_index(addr) != pmd_index(saddr))
                return 0;

        /* Permissions have to match. */
        if (flags != sflags)
                return 0;

        /* The whole PUD_SIZE window must lie inside svma. */
        if (pud_start < svma->vm_start || pud_end > svma->vm_end)
                return 0;

        return saddr;
}

/*
 * A pmd table can only be shared for @addr when @vma has VM_MAYSHARE set and
 * the PUD_SIZE aligned window containing @addr lies entirely inside @vma.
 */
static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long pud_start = addr & PUD_MASK;
        unsigned long pud_end = pud_start + PUD_SIZE;

        return (vma->vm_flags & VM_MAYSHARE) &&
                range_in_vma(vma, pud_start, pud_end);
}

/*
 * If the range [*start, *end) of @vma could be mapped by a shared pmd, widen
 * *start and *end so that they cover the range associated with the possible
 * shared pmd mappings.  Otherwise both are left untouched.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
        /* PUD_SIZE aligned part of the vma. */
        unsigned long pud_lo = ALIGN(vma->vm_start, PUD_SIZE);
        unsigned long pud_hi = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

        if (!(vma->vm_flags & VM_MAYSHARE))
                return;

        /* The vma needs to span at least one aligned PUD_SIZE area ... */
        if (pud_hi <= pud_lo)
                return;

        /* ... and the range must be at least partially within it. */
        if (*end <= pud_lo || *start >= pud_hi)
                return;

        /* Worst case: extend both ends to PUD_SIZE boundaries. */
        if (*start > pud_lo)
                *start = ALIGN_DOWN(*start, PUD_SIZE);

        if (*end < pud_hi)
                *end = ALIGN(*end, PUD_SIZE);
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner.
 *
 * This routine must be called with i_mmap_rwsem held in at least read mode if
 * sharing is possible.  For hugetlbfs, this prevents removal of any page
 * table entries associated with the address space.  This is important as we
 * are setting up sharing based on existing page table entries (mappings).
 *
 * NOTE: This routine is only called from huge_pte_alloc.  Some callers of
 * huge_pte_alloc know that sharing is not possible and do not take
 * i_mmap_rwsem as a performance optimization.  This is handled by the
 * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
 * only required for subsequent processing.
 *
 * Returns whatever pmd_alloc() returns for @addr, cast to pte_t *.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
        struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        /* File offset (in PAGE_SIZE units) that @addr maps in @vma. */
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;
        pte_t *pte;
        spinlock_t *ptl;

        /* Sharing impossible: just allocate a private pmd. */
        if (!vma_shareable(vma, addr))
                return (pte_t *)pmd_alloc(mm, pud, addr);

        i_mmap_assert_locked(mapping);
        /* Look at every other vma that maps the same file offset. */
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;

                saddr = page_table_shareable(svma, vma, addr, idx);
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr,
                                               vma_mmu_pagesize(svma));
                        if (spte) {
                                /*
                                 * Found a candidate table: bump the share
                                 * count of the page holding it before
                                 * trying to install it below.
                                 */
                                atomic_inc(&virt_to_page(spte)->pt_share_count);
                                break;
                        }
                }
        }

        if (!spte)
                goto out;

        ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
        if (pud_none(*pud)) {
                /* Point our pud at the page containing the found entry. */
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));
                mm_inc_nr_pmds(mm);
        } else {
                /*
                 * The pud is no longer empty, so the candidate is not
                 * installed: undo the share count taken above.
                 */
                atomic_dec(&virt_to_page(spte)->pt_share_count);
        }
        spin_unlock(ptl);
out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
        return pte;
}

/**
 * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
 * @tlb: the current mmu_gather.
 * @vma: the vma covering the pmd table.
 * @addr: pointer to the address we are trying to unshare.
 * @ptep: pointer into the (pmd) page table.
 *
 * Called with the page table lock held, the i_mmap_rwsem held in write mode
 * and the hugetlb vma lock held in write mode.
 * NOTE(review): only the i_mmap_rwsem requirement is asserted by the code
 * below; confirm the other locking requirements against the callers.
 *
 * Note: The caller must call huge_pmd_unshare_flush() before dropping the
 * i_mmap_rwsem.
 *
 * On success *@addr is advanced to the last PMD_SIZE page of the PUD_SIZE
 * area that was unmapped.
 *
 * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
 *            was not a shared PMD table.
 */
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long *addr, pte_t *ptep)
{
        unsigned long sz = huge_page_size(hstate_vma(vma));
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd = pgd_offset(mm, *addr);
        p4d_t *p4d = p4d_offset(pgd, *addr);
        pud_t *pud = pud_offset(p4d, *addr);

        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        /* Nothing to do unless the huge page size is PMD_SIZE. */
        if (sz != PMD_SIZE)
                return 0;
        /* A zero share count is treated as "not a shared PMD table". */
        if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
                return 0;

        /* Detach the table from this mm and queue it on the mmu_gather. */
        pud_clear(pud);
        tlb_unshare_pmd_ptdesc(tlb, virt_to_page(ptep), *addr);

        mm_dec_nr_pmds(mm);
        /*
         * This update of passed address optimizes loops sequentially
         * processing addresses in increments of huge page size (PMD_SIZE
         * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
         * Update address to the 'last page' in the cleared area so that
         * calling loop can move to first page past this area.
         */
        *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
}

/*
 * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
 * @tlb: the current mmu_gather.
 * @vma: the vma covering the pmd table.
 *
 * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
 * unsharing with concurrent page table walkers.
 *
 * This function must be called after a sequence of huge_pmd_unshare()
 * calls while still holding the i_mmap_rwsem in write mode (asserted below).
 */
void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * We must synchronize page table unsharing such that nobody will
         * try reusing a previously-shared page table while it might still
         * be in use by previous sharers (TLB, GUP_fast).
         */
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);

        /* All of the flushing work is delegated to the mmu_gather helper. */
        tlb_flush_unshared_tables(tlb);
}

#define want_pmd_share()        (1)
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
/*
 * Stub for builds without CONFIG_ARCH_WANT_HUGE_PMD_SHARE: there is nothing
 * to share, so always return NULL.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
        return NULL;
}

/*
 * Stub for builds without CONFIG_ARCH_WANT_HUGE_PMD_SHARE: always returns 0
 * and leaves *@addr untouched.
 */
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long *addr, pte_t *ptep)
{
        return 0;
}

/* Stub for builds without CONFIG_ARCH_WANT_HUGE_PMD_SHARE: does nothing. */
void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
}

/*
 * Stub for builds without CONFIG_ARCH_WANT_HUGE_PMD_SHARE: *@start and *@end
 * are left unchanged.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}
#define want_pmd_share()        (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */

#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pte_t *pte = NULL;

        pgd = pgd_offset(mm, addr);
        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
                        if (want_pmd_share() && pud_none(*pud))
                                pte = huge_pmd_share(mm, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
        }
        BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));

        return pte;
}

/*
 * huge_pte_offset() - Walk the page table of @mm to find the hugepage entry
 * for @addr, given the huge page size @sz.
 *
 * Return: the pud entry when @sz is PUD_SIZE, otherwise the pmd entry.  NULL
 * is returned when the pgd or p4d entry is not present, or when a pmd is
 * wanted but the pud entry is not present.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, addr);

        /* PUD sized page: the entry is pud huge, non-present or none. */
        if (sz == PUD_SIZE)
                return (pte_t *)pud;

        /* Going one level down needs a valid pud entry. */
        if (!pud_present(*pud))
                return NULL;

        /* The pmd entry is pmd huge, non-present or none. */
        return (pte_t *)pmd_offset(pud, addr);
}

#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

/*
 * These functions are overwritable if your architecture needs its own
 * behavior.
 */
/*
 * Default (weak) implementation: address based huge page lookup is not
 * provided here, so ERR_PTR(-EINVAL) is always returned.
 */
struct page * __weak
follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write)
{
        return ERR_PTR(-EINVAL);
}

/*
 * Default (weak) implementation: warns that the hugepage directory format is
 * not supported and returns NULL.
 */
struct page * __weak
follow_huge_pd(struct vm_area_struct *vma,
               unsigned long address, hugepd_t hpd, int flags, int pdshift)
{
        WARN(1, "hugepd follow called with no support for hugepage directory format\n");
        return NULL;
}

/*
 * Default (weak) implementation: look up the page backing @address in the
 * hugetlb @vma.
 *
 * Returns the (sub)page for @address after grabbing it according to @flags,
 * or NULL when there is no page table entry, the entry is neither present
 * nor a migration entry, both FOLL_PIN and FOLL_GET are requested, or
 * try_grab_page() fails.  Migration entries are waited for and the lookup is
 * retried.
 */
struct page * __weak
follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
        struct hstate *h = hstate_vma(vma);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
        spinlock_t *ptl;
        pte_t *ptep, pte;

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
                         (FOLL_PIN | FOLL_GET)))
                return NULL;

retry:
        ptep = huge_pte_offset(mm, address, huge_page_size(h));
        if (!ptep)
                return NULL;

        ptl = huge_pte_lock(h, mm, ptep);
        pte = huge_ptep_get(ptep);
        if (pte_present(pte)) {
                /* Head page plus the offset of @address within the huge page. */
                page = pte_page(pte) +
                        ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                /*
                 * try_grab_page() should always succeed here, because: a) we
                 * hold the pmd (ptl) lock, and b) we've just checked that the
                 * huge pmd (head) page is present in the page tables. The ptl
                 * prevents the head page and tail pages from being rearranged
                 * in any way. So this page must be available at this point,
                 * unless the page refcount overflowed:
                 */
                if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
                        page = NULL;
                        goto out;
                }
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        /*
                         * Drop the lock before waiting for the migration
                         * entry, then redo the whole lookup.
                         */
                        spin_unlock(ptl);
                        __migration_entry_wait(mm, ptep, ptl);
                        goto retry;
                }
                /*
                 * hwpoisoned entry is treated as no_page_table in
                 * follow_page_mask().
                 */
        }
out:
        spin_unlock(ptl);
        return page;
}

/*
 * Default (weak) implementation: return the subpage of the pud mapped huge
 * page that contains @address.  Requests that need a reference (FOLL_GET) or
 * a pin (FOLL_PIN) are not handled and yield NULL.
 */
struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
{
        unsigned long subpage_idx;

        if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;

        /* Index of the PAGE_SIZE page inside the PUD sized area. */
        subpage_idx = (address & ~PUD_MASK) >> PAGE_SHIFT;

        return pte_page(*(pte_t *)pud) + subpage_idx;
}

/*
 * Default (weak) implementation: return the subpage of the pgd mapped huge
 * page that contains @address.  Requests that need a reference (FOLL_GET) or
 * a pin (FOLL_PIN) are not handled and yield NULL.
 */
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
        unsigned long subpage_idx;

        if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;

        /* Index of the PAGE_SIZE page inside the PGDIR sized area. */
        subpage_idx = (address & ~PGDIR_MASK) >> PAGE_SHIFT;

        return pte_page(*(pte_t *)pgd) + subpage_idx;
}

/*
 * Take an active hugetlb head page off its current list and move it to the
 * tail of @list, holding a new reference on it.
 *
 * Returns 0 on success, or -EBUSY when @page is not a hugetlb head page, is
 * not marked active, or its reference count is already zero.
 */
int isolate_hugetlb(struct page *page, struct list_head *list)
{
        int err = -EBUSY;

        spin_lock(&hugetlb_lock);
        if (PageHeadHuge(page) && page_huge_active(page) &&
            get_page_unless_zero(page)) {
                clear_page_huge_active(page);
                list_move_tail(&page->lru, list);
                err = 0;
        }
        spin_unlock(&hugetlb_lock);

        return err;
}

void putback_active_hugepage(struct page *page)
{
        VM_BUG_ON_PAGE(!PageHead(page), page);
        spin_lock(&hugetlb_lock);
        set_page_huge_active(page);
        list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
        spin_unlock(&hugetlb_lock);
        put_page(page);
}

/*
 * Carry hugetlb specific state over from @oldpage to @newpage: the hugetlb
 * cgroup information, the page owner migrate reason (@reason) and, if
 * @newpage is a temporary huge page, the temporary status together with the
 * per-node surplus accounting.
 */
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
        struct hstate *h = page_hstate(oldpage);
        int src_nid, dst_nid;

        hugetlb_cgroup_migrate(oldpage, newpage);
        set_page_owner_migrate_reason(newpage, reason);

        if (!PageHugeTemporary(newpage))
                return;

        /*
         * The temporary status travels in the opposite direction to the
         * other state: newpage is going to be final while oldpage will be
         * freed, so oldpage takes over the temporary status.
         *
         * The per-node surplus state has to be transferred as well,
         * otherwise the global surplus count will not match the per-node
         * counts.
         */
        src_nid = page_to_nid(oldpage);
        dst_nid = page_to_nid(newpage);

        SetPageHugeTemporary(oldpage);
        ClearPageHugeTemporary(newpage);

        spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages_node[src_nid]) {
                h->surplus_huge_pages_node[src_nid]--;
                h->surplus_huge_pages_node[dst_nid]++;
        }
        spin_unlock(&hugetlb_lock);
}

/*
 * Unshare every possibly shared pmd table of @vma in [start, end), stepping
 * through the range in PUD_SIZE increments.  Nothing is done for vmas
 * without VM_MAYSHARE or for an empty range.
 *
 * If @take_locks is false, the caller must ensure that no concurrent page table
 * access can happen (except for gup_fast() and hardware page walks); the
 * file rmap lock (i_mmap_rwsem) must already be held in write mode, which is
 * asserted.
 * If @take_locks is true, the file rmap lock (i_mmap_rwsem) is taken in write
 * mode here and released before returning.
 */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                   unsigned long start,
                                   unsigned long end,
                                   bool take_locks)
{
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
        unsigned long address;
        spinlock_t *ptl;
        pte_t *ptep;

        if (!(vma->vm_flags & VM_MAYSHARE))
                return;

        if (start >= end)
                return;

        flush_cache_range(vma, start, end);
        tlb_gather_mmu_vma(&tlb, vma, start, end);

        /*
         * No need to call adjust_range_if_pmd_sharing_possible(), because
         * we have already done the PUD_SIZE alignment.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                start, end);
        mmu_notifier_invalidate_range_start(&range);
        if (take_locks) {
                i_mmap_lock_write(vma->vm_file->f_mapping);
        } else {
                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        }
        for (address = start; address < end; address += PUD_SIZE) {
                /* Scratch copy: huge_pmd_unshare() may modify the address. */
                unsigned long tmp = address;

                ptep = huge_pte_offset(mm, address, sz);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
                /* We don't want 'address' to be changed */
                huge_pmd_unshare(&tlb, vma, &tmp, ptep);
                spin_unlock(ptl);
        }
        /* Must run while i_mmap_rwsem is still held in write mode. */
        huge_pmd_unshare_flush(&tlb, vma);
        if (take_locks) {
                i_mmap_unlock_write(vma->vm_file->f_mapping);
        }
        /*
         * No need to call mmu_notifier_invalidate_range(), see
         * Documentation/mm/mmu_notifier.rst.
         */
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb, start, end);
}

#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

/*
 * Parse the argument of the "hugetlb_cma" early parameter with memparse()
 * and store the result in hugetlb_cma_size.  Always returns 0.
 */
static int __init cmdline_parse_hugetlb_cma(char *p)
{
        hugetlb_cma_size = memparse(p, &p);
        return 0;
}

early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);

/*
 * hugetlb_cma_reserve - set up the per-node CMA areas requested through the
 * "hugetlb_cma" parameter.
 * @order: page order of the huge pages the areas are meant for; every area
 *         size is rounded up to a multiple of PAGE_SIZE << order.
 *
 * The requested size is split across the online nodes.  A node on which the
 * reservation fails is skipped with a warning.  The function records that it
 * was called so that hugetlb_cma_check() can tell whether it was.
 */
void __init hugetlb_cma_reserve(int order)
{
        unsigned long size, reserved, per_node;
        int nid;

        cma_reserve_called = true;

        if (!hugetlb_cma_size)
                return;

        if (hugetlb_cma_size < (PAGE_SIZE << order)) {
                pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
                        (PAGE_SIZE << order) / SZ_1M);
                return;
        }

        /*
         * If 3 GB area is requested on a machine with 4 numa nodes,
         * let's allocate 1 GB on first three nodes and ignore the last one.
         */
        per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
        pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
                hugetlb_cma_size / SZ_1M, per_node / SZ_1M);

        reserved = 0;
        for_each_node_state(nid, N_ONLINE) {
                int res;
                char name[CMA_MAX_NAME];

                /* Never ask for more than what is still outstanding. */
                size = min(per_node, hugetlb_cma_size - reserved);
                size = round_up(size, PAGE_SIZE << order);

                snprintf(name, sizeof(name), "hugetlb%d", nid);
                res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
                                                 0, false, name,
                                                 &hugetlb_cma[nid], nid);
                if (res) {
                        /* Terminate with '\n' like every other message here. */
                        pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
                                res, nid);
                        continue;
                }

                reserved += size;
                pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
                        size / SZ_1M, nid);

                /*
                 * Rounding up can push the total past the request; stop as
                 * soon as enough has been reserved.
                 */
                if (reserved >= hugetlb_cma_size)
                        break;
        }
}

/*
 * Warn when a hugetlb_cma size was requested but hugetlb_cma_reserve() was
 * never called.
 */
void __init hugetlb_cma_check(void)
{
        if (hugetlb_cma_size && !cma_reserve_called)
                pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}

#endif /* CONFIG_CMA */






















































































































































































    2 





















    2 
























































































    3 




























































































































































































































































































































































































































    1 
















































































































    1 







































    1 








































































    1 


























    1 


































































    1 


































































    1 

































































































































































































































































































































































































































































































































































































































    1 

















































    1 


























































    3 



























































































































































































































































































    1 























































































































































































































































    1 





    1 






























    1 





















    1 





























    3 





















    3 


























































































































































































































































































































































































































    1 
















    1 






































































































    2 



    2 





















    1 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

/*
 * EXT4_I - map a VFS inode pointer to its enclosing struct ext4_inode_info.
 * @inode: pointer to the vfs_inode member of a struct ext4_inode_info
 *
 * Thin wrapper around container_of() on the vfs_inode member.
 */
#define EXT4_I(inode)                                                   \
        (container_of(inode, struct ext4_inode_info, vfs_inode))

/*
 * show_mballoc_flags - symbolic rendering of an EXT4_MB_* flag word.
 * @flags: value made up of the EXT4_MB_* constants listed in the table
 *
 * Forwards @flags to __print_flags() with "|" as the delimiter argument and
 * a table that pairs every EXT4_MB_* constant with the label used for it.
 */
#define show_mballoc_flags(flags)                                  \
        __print_flags(flags, "|",                                  \
                { EXT4_MB_HINT_MERGE,        "HINT_MERGE" },       \
                { EXT4_MB_HINT_RESERVED,     "HINT_RESV" },        \
                { EXT4_MB_HINT_METADATA,     "HINT_MDATA" },       \
                { EXT4_MB_HINT_FIRST,        "HINT_FIRST" },       \
                { EXT4_MB_HINT_BEST,         "HINT_BEST" },        \
                { EXT4_MB_HINT_DATA,         "HINT_DATA" },        \
                { EXT4_MB_HINT_NOPREALLOC,   "HINT_NOPREALLOC" },  \
                { EXT4_MB_HINT_GROUP_ALLOC,  "HINT_GRP_ALLOC" },   \
                { EXT4_MB_HINT_GOAL_ONLY,    "HINT_GOAL_ONLY" },   \
                { EXT4_MB_HINT_TRY_GOAL,     "HINT_TRY_GOAL" },    \
                { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" },    \
                { EXT4_MB_STREAM_ALLOC,      "STREAM_ALLOC" },     \
                { EXT4_MB_USE_ROOT_BLOCKS,   "USE_ROOT_BLKS" },    \
                { EXT4_MB_USE_RESERVED,      "USE_RESV" },         \
                { EXT4_MB_STRICT_CHECK,      "STRICT_CHECK" })

/*
 * show_map_flags - symbolic rendering of a block-mapping flag word.
 * @flags: value made up of the EXT4_GET_BLOCKS_* / EXT4_EX_NOCACHE constants
 *         listed in the table
 *
 * Forwards @flags to __print_flags() with "|" as the delimiter argument and
 * a table that pairs every listed constant with the label used for it.
 */
#define show_map_flags(flags)                                                \
        __print_flags(flags, "|",                                            \
                { EXT4_GET_BLOCKS_CREATE,            "CREATE" },             \
                { EXT4_GET_BLOCKS_UNWRIT_EXT,        "UNWRIT" },             \
                { EXT4_GET_BLOCKS_DELALLOC_RESERVE,  "DELALLOC" },           \
                { EXT4_GET_BLOCKS_PRE_IO,            "PRE_IO" },             \
                { EXT4_GET_BLOCKS_CONVERT,           "CONVERT" },            \
                { EXT4_GET_BLOCKS_METADATA_NOFAIL,   "METADATA_NOFAIL" },    \
                { EXT4_GET_BLOCKS_NO_NORMALIZE,      "NO_NORMALIZE" },       \
                { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" },  \
                { EXT4_GET_BLOCKS_ZERO,              "ZERO" },               \
                { EXT4_GET_BLOCKS_IO_SUBMIT,         "IO_SUBMIT" },          \
                { EXT4_EX_NOCACHE,                   "EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
 *
 * NOTE(review): the BH_* values registered here are presumably the bits
 * behind the EXT4_MAP_* masks decoded by show_mflags() below -- confirm
 * against the EXT4_MAP_* definitions.
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

/*
 * show_mflags - compact rendering of EXT4_MAP_* mapping flags.
 * @flags: value made up of the EXT4_MAP_* constants listed in the table
 *
 * Forwards @flags to __print_flags() with an empty delimiter string and a
 * table that gives each EXT4_MAP_* constant a one-letter label.
 */
#define show_mflags(flags)                    \
        __print_flags(flags, "",              \
                { EXT4_MAP_NEW,       "N" },  \
                { EXT4_MAP_MAPPED,    "M" },  \
                { EXT4_MAP_UNWRITTEN, "U" },  \
                { EXT4_MAP_BOUNDARY,  "B" })

/*
 * show_free_flags - symbolic rendering of an EXT4_FREE_BLOCKS_* flag word.
 * @flags: value made up of the EXT4_FREE_BLOCKS_* constants listed in the
 *         table
 *
 * Forwards @flags to __print_flags() with "|" as the delimiter argument and
 * a table that pairs every EXT4_FREE_BLOCKS_* constant with its label.
 */
#define show_free_flags(flags)                                             \
        __print_flags(flags, "|",                                          \
                { EXT4_FREE_BLOCKS_METADATA,             "METADATA" },     \
                { EXT4_FREE_BLOCKS_FORGET,               "FORGET" },       \
                { EXT4_FREE_BLOCKS_VALIDATED,            "VALIDATED" },    \
                { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,       "NO_QUOTA" },     \
                { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER, "1ST_CLUSTER" },  \
                { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,  "LAST_CLUSTER" })

/*
 * Register the ES_*_B enum values through TRACE_DEFINE_ENUM.
 *
 * NOTE(review): presumably these are the bit positions behind the
 * EXTENT_STATUS_* masks decoded by show_extent_status() below -- confirm
 * against the EXTENT_STATUS_* definitions.
 */
TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

/*
 * show_extent_status - compact rendering of EXTENT_STATUS_* flags.
 * @status: value made up of the EXTENT_STATUS_* constants listed in the table
 *
 * Forwards @status to __print_flags() with an empty delimiter string and a
 * table that gives each EXTENT_STATUS_* constant a one-letter label.
 */
#define show_extent_status(status)                  \
        __print_flags(status, "",                   \
                { EXTENT_STATUS_WRITTEN,    "W" },  \
                { EXTENT_STATUS_UNWRITTEN,  "U" },  \
                { EXTENT_STATUS_DELAYED,    "D" },  \
                { EXTENT_STATUS_HOLE,       "H" },  \
                { EXTENT_STATUS_REFERENCED, "R" })

/*
 * show_falloc_mode - symbolic rendering of a FALLOC_FL_* mode word.
 * @mode: value made up of the FALLOC_FL_* constants listed in the table
 *
 * Forwards @mode to __print_flags() with "|" as the delimiter argument and a
 * table that pairs every listed FALLOC_FL_* constant with its label.
 */
#define show_falloc_mode(mode)                                   \
        __print_flags(mode, "|",                                 \
                { FALLOC_FL_KEEP_SIZE,      "KEEP_SIZE" },       \
                { FALLOC_FL_PUNCH_HOLE,     "PUNCH_HOLE" },      \
                { FALLOC_FL_NO_HIDE_STALE,  "NO_HIDE_STALE" },   \
                { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE" },  \
                { FALLOC_FL_ZERO_RANGE,     "ZERO_RANGE" })

/*
 * Register the EXT4_FC_REASON_* enum values through TRACE_DEFINE_ENUM.
 * Every value listed here except EXT4_FC_REASON_MAX has a matching label in
 * show_fc_reason() below.
 */
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

/*
 * show_fc_reason - label table for an EXT4_FC_REASON_* value.
 *
 * Hands @reason to __print_symbolic() with one { value, label } pair per
 * reason.  The label for EXT4_FC_REASON_NOMEM is spelled "NO_MEM", and
 * EXT4_FC_REASON_MAX has no entry.
 */
#define show_fc_reason(reason)                                                \
        __print_symbolic(reason,                                        \
                { EXT4_FC_REASON_XATTR,                "XATTR"},                \
                { EXT4_FC_REASON_CROSS_RENAME,        "CROSS_RENAME"},        \
                { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
                { EXT4_FC_REASON_NOMEM,        "NO_MEM"},                        \
                { EXT4_FC_REASON_SWAP_BOOT,        "SWAP_BOOT"},                \
                { EXT4_FC_REASON_RESIZE,        "RESIZE"},                \
                { EXT4_FC_REASON_RENAME_DIR,        "RENAME_DIR"},                \
                { EXT4_FC_REASON_FALLOC_RANGE,        "FALLOC_RANGE"},        \
                { EXT4_FC_REASON_INODE_JOURNAL_DATA,        "INODE_JOURNAL_DATA"}, \
                { EXT4_FC_REASON_ENCRYPTED_FILENAME,        "ENCRYPTED_FILENAME"})

/*
 * ext4_other_inode_update_time - trace event taking an inode and a second
 * inode number.
 *
 * Records @inode's device, inode number, uid, gid and mode, together with
 * the caller-supplied @orig_ino.
 */
TRACE_EVENT(ext4_other_inode_update_time,
        TP_PROTO(struct inode *inode, ino_t orig_ino),

        TP_ARGS(inode, orig_ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        orig_ino                )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->orig_ino = orig_ino;
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->mode        = inode->i_mode;
        ),

        /* mode is printed in octal with a leading 0 */
        TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->orig_ino,
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid)
);

/*
 * ext4_free_inode - trace event taking the inode being reported.
 *
 * Records @inode's device, inode number, uid, gid, i_blocks and mode.
 */
TRACE_EVENT(ext4_free_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u64, blocks                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->blocks        = inode->i_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid, __entry->blocks)
);

/*
 * ext4_request_inode - trace event taking a directory inode and a mode.
 *
 * Records @dir's device and inode number plus @mode.
 */
TRACE_EVENT(ext4_request_inode,
        TP_PROTO(struct inode *dir, int mode),

        TP_ARGS(dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        dir                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = dir->i_sb->s_dev;
                __entry->dir        = dir->i_ino;
                /* the int argument is stored in a 16-bit field */
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_allocate_inode - trace event taking an inode, a directory inode
 * and a mode.
 *
 * Records @inode's device and inode number, @dir's inode number and @mode.
 * The device is taken from @inode, not from @dir.
 */
TRACE_EVENT(ext4_allocate_inode,
        TP_PROTO(struct inode *inode, struct inode *dir, int mode),

        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        dir                        )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->dir        = dir->i_ino;
                /* the int argument is stored in a 16-bit field */
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_evict_inode - trace event taking an inode.
 *
 * Records @inode's device, inode number and i_nlink (stored as an int).
 */
TRACE_EVENT(ext4_evict_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        nlink                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->nlink        = inode->i_nlink;
        ),

        TP_printk("dev %d,%d ino %lu nlink %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nlink)
);

/*
 * ext4_drop_inode - trace event taking an inode and an integer @drop.
 *
 * Records @inode's device and inode number plus the caller-supplied @drop
 * value unchanged.
 */
TRACE_EVENT(ext4_drop_inode,
        TP_PROTO(struct inode *inode, int drop),

        TP_ARGS(inode, drop),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        drop                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->drop        = drop;
        ),

        TP_printk("dev %d,%d ino %lu drop %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->drop)
);

/*
 * ext4_nfs_commit_metadata - trace event taking an inode.
 *
 * Records only @inode's device and inode number.
 */
TRACE_EVENT(ext4_nfs_commit_metadata,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_mark_inode_dirty - trace event taking an inode and a code address.
 *
 * Records @inode's device and inode number plus @IP, which is printed as
 * "caller" using the %pS pointer format.
 */
TRACE_EVENT(ext4_mark_inode_dirty,
        TP_PROTO(struct inode *inode, unsigned long IP),

        TP_ARGS(inode, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        ip                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->ip        = IP;
        ),

        TP_printk("dev %d,%d ino %lu caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (void *)__entry->ip)
);

/*
 * ext4_begin_ordered_truncate - trace event taking an inode and a size.
 *
 * Records @inode's device and inode number plus @new_size (a loff_t,
 * printed as a signed 64-bit value).
 */
TRACE_EVENT(ext4_begin_ordered_truncate,
        TP_PROTO(struct inode *inode, loff_t new_size),

        TP_ARGS(inode, new_size),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        new_size                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->new_size        = new_size;
        ),

        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->new_size)
);

/*
 * ext4__write_begin - event class shared by the *_write_begin events
 * defined after it.
 *
 * Records @inode's device and inode number plus the @pos, @len and @flags
 * arguments as passed in.  @flags is printed as a plain unsigned number,
 * not decoded.
 */
DECLARE_EVENT_CLASS(ext4__write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->flags)
);

/* ext4_write_begin - event instance of the ext4__write_begin class. */
DEFINE_EVENT(ext4__write_begin, ext4_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags)
);

/* ext4_da_write_begin - event instance of the ext4__write_begin class. */
DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags)
);

/*
 * ext4__write_end - event class shared by the *_write_end events defined
 * after it.
 *
 * Records @inode's device and inode number plus the @pos, @len and
 * @copied arguments as passed in.
 */
DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                        unsigned int copied),

        TP_ARGS(inode, pos, len, copied),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
                __field(        unsigned int, copied                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
                __entry->copied        = copied;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
);

/* ext4_write_end - event instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/* ext4_journalled_write_end - event instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/* ext4_da_write_end - event instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_da_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_writepages - trace event taking an inode and its writeback_control.
 *
 * Records @inode's device and inode number, a snapshot of the @wbc fields
 * nr_to_write, pages_skipped, range_start, range_end, sync_mode,
 * for_kupdate and range_cyclic, and the writeback_index of
 * @inode->i_mapping.
 */
TRACE_EVENT(ext4_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        long,        nr_to_write                )
                __field(        long,        pages_skipped                )
                __field(        loff_t,        range_start                )
                __field(        loff_t,        range_end                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
                __field(        char,        for_kupdate                )
                __field(        char,        range_cyclic                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->range_start        = wbc->range_start;
                __entry->range_end        = wbc->range_end;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->range_cyclic        = wbc->range_cyclic;
        ),

        /* print order differs from field order: writeback_index comes last */
        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %lld range_end %lld sync_mode %d "
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
                  __entry->range_end, __entry->sync_mode,
                  __entry->for_kupdate, __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4_da_write_pages - trace event taking an inode, a page index and a
 * writeback_control.
 *
 * Records @inode's device and inode number, the caller-supplied
 * @first_page, and the nr_to_write and sync_mode fields of @wbc.
 */
TRACE_EVENT(ext4_da_write_pages,
        TP_PROTO(struct inode *inode, pgoff_t first_page,
                 struct writeback_control *wbc),

        TP_ARGS(inode, first_page, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(      pgoff_t,        first_page                )
                __field(         long,        nr_to_write                )
                __field(          int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->first_page        = first_page;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        /*
         * Cast the pgoff_t explicitly so the argument always matches %lu,
         * as the other events in this file do for their pgoff_t fields.
         */
        TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
                  "sync_mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->first_page,
                  __entry->nr_to_write, __entry->sync_mode)
);

/*
 * ext4_da_write_pages_extent - trace event taking an inode and a block
 * mapping.
 *
 * Records @inode's device and inode number plus @map's m_lblk, m_len and
 * m_flags.  The flags are rendered through show_mflags().
 */
TRACE_EVENT(ext4_da_write_pages_extent,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),

        TP_ARGS(inode, map),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        lblk                        )
                __field(        __u32,        len                        )
                __field(        __u32,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = map->m_lblk;
                __entry->len                = map->m_len;
                __entry->flags                = map->m_flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
                  show_mflags(__entry->flags))
);

/*
 * ext4_writepages_result - trace event taking an inode, its
 * writeback_control, a return code and a page count.
 *
 * Records @inode's device and inode number, the caller-supplied @ret and
 * @pages_written, the pages_skipped and sync_mode fields of @wbc, and the
 * writeback_index of @inode->i_mapping.
 */
TRACE_EVENT(ext4_writepages_result,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                        int ret, int pages_written),

        TP_ARGS(inode, wbc, ret, pages_written),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
                __field(        int,        pages_written                )
                __field(        long,        pages_skipped                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->ret                = ret;
                __entry->pages_written        = pages_written;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
                  "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4__page_op - event class for single-page events.
 *
 * Records the device and inode number of the inode that owns @page
 * (reached via page->mapping->host) plus page->index.
 * NOTE(review): page->mapping is dereferenced unconditionally; callers
 * presumably guarantee it is non-NULL - confirm at the call sites.
 */
DECLARE_EVENT_CLASS(ext4__page_op,
        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )

        ),

        TP_fast_assign(
                __entry->dev        = page->mapping->host->i_sb->s_dev;
                __entry->ino        = page->mapping->host->i_ino;
                __entry->index        = page->index;
        ),

        TP_printk("dev %d,%d ino %lu page_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index)
);

/* ext4_writepage - event instance of the ext4__page_op class. */
DEFINE_EVENT(ext4__page_op, ext4_writepage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/* ext4_readpage - event instance of the ext4__page_op class. */
DEFINE_EVENT(ext4__page_op, ext4_readpage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/* ext4_releasepage - event instance of the ext4__page_op class. */
DEFINE_EVENT(ext4__page_op, ext4_releasepage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/*
 * ext4_invalidatepage_op - event class for page invalidation events.
 *
 * Records the device and inode number of the inode that owns @page
 * (reached via page->mapping->host), page->index, and the @offset and
 * @length arguments as passed in.
 */
DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )
                __field(        unsigned int, offset                )
                __field(        unsigned int, length                )
        ),

        TP_fast_assign(
                __entry->dev        = page->mapping->host->i_sb->s_dev;
                __entry->ino        = page->mapping->host->i_ino;
                __entry->index        = page->index;
                __entry->offset        = offset;
                __entry->length        = length;
        ),

        TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index,
                  __entry->offset, __entry->length)
);

/* ext4_invalidatepage - event instance of the ext4_invalidatepage_op class. */
DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length)
);

/*
 * ext4_journalled_invalidatepage - event instance of the
 * ext4_invalidatepage_op class.
 */
DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length)
);

/*
 * ext4_discard_blocks - trace event taking a superblock, a block number
 * and a count.
 *
 * Records @sb's device plus the @blk and @count arguments as 64-bit
 * values.  No inode is involved.
 */
TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),

        TP_ARGS(sb, blk, count),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u64,        blk                        )
                __field(        __u64,        count                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->blk        = blk;
                __entry->count        = count;
        ),

        TP_printk("dev %d,%d blk %llu count %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blk, __entry->count)
);

/*
 * ext4__mb_new_pa - event class taking an allocation context and a
 * preallocation space.
 *
 * Records the device from @ac->ac_sb, the inode number from
 * @ac->ac_inode, and @pa's pa_pstart, pa_lstart and pa_len.
 * NOTE(review): ac->ac_inode is dereferenced unconditionally - confirm
 * it is always set at the call sites.
 */
DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        pa_pstart                )
                __field(        __u64,        pa_lstart                )
                __field(        __u32,        pa_len                        )

        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->pa_pstart        = pa->pa_pstart;
                __entry->pa_lstart        = pa->pa_lstart;
                __entry->pa_len                = pa->pa_len;
        ),

        /* printed as pstart, len, lstart - not in field declaration order */
        TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
);

/* ext4_mb_new_inode_pa - event instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/* ext4_mb_new_group_pa - event instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_release_inode_pa - trace event taking a preallocation space, a
 * block number and a count.
 *
 * Records the device and inode number of @pa->pa_inode plus the @block
 * and @count arguments as passed in.
 */
TRACE_EVENT(ext4_mb_release_inode_pa,
        TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),

        TP_ARGS(pa, block, count),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        __u32,        count                        )

        ),

        TP_fast_assign(
                __entry->dev                = pa->pa_inode->i_sb->s_dev;
                __entry->ino                = pa->pa_inode->i_ino;
                __entry->block                = block;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d ino %lu block %llu count %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->block, __entry->count)
);

/*
 * ext4_mb_release_group_pa - trace event taking a superblock and a
 * preallocation space.
 *
 * Records @sb's device plus @pa's pa_pstart and pa_len.  The device comes
 * from @sb; no inode is read from @pa.
 */
TRACE_EVENT(ext4_mb_release_group_pa,
        TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa),

        TP_ARGS(sb, pa),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u64,        pa_pstart                )
                __field(        __u32,        pa_len                        )

        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->pa_pstart        = pa->pa_pstart;
                __entry->pa_len                = pa->pa_len;
        ),

        TP_printk("dev %d,%d pstart %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->pa_pstart, __entry->pa_len)
);

/*
 * ext4_discard_preallocations - trace event taking an inode and two
 * unsigned counts.
 *
 * Records @inode's device and inode number plus the @len and @needed
 * arguments as passed in.  The output uses "len:" with a colon, unlike
 * the other keys in the format string.
 */
TRACE_EVENT(ext4_discard_preallocations,
        TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),

        TP_ARGS(inode, len, needed),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        needed                )

        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->len        = len;
                __entry->needed        = needed;
        ),

        TP_printk("dev %d,%d ino %lu len: %u needed %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->len,
                  __entry->needed)
);

/*
 * ext4_mb_discard_preallocations - trace event taking a superblock and a
 * signed count.
 *
 * Records @sb's device plus the @needed argument as passed in.
 */
TRACE_EVENT(ext4_mb_discard_preallocations,
        TP_PROTO(struct super_block *sb, int needed),

        TP_ARGS(sb, needed),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        needed                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->needed        = needed;
        ),

        TP_printk("dev %d,%d needed %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->needed)
);

/*
 * ext4_request_blocks - trace event taking an allocation request.
 *
 * Records the device and inode number of @ar->inode and copies @ar's len,
 * logical, goal, lleft, lright, pleft, pright and flags.  The flags are
 * rendered through show_mballoc_flags().
 */
TRACE_EVENT(ext4_request_blocks,
        TP_PROTO(struct ext4_allocation_request *ar),

        TP_ARGS(ar),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        unsigned int, len                )
                __field(        __u32,  logical                        )
                __field(        __u32,        lleft                        )
                __field(        __u32,        lright                        )
                __field(        __u64,        goal                        )
                __field(        __u64,        pleft                        )
                __field(        __u64,        pright                        )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = ar->inode->i_sb->s_dev;
                __entry->ino        = ar->inode->i_ino;
                __entry->len        = ar->len;
                __entry->logical = ar->logical;
                __entry->goal        = ar->goal;
                __entry->lleft        = ar->lleft;
                __entry->lright        = ar->lright;
                __entry->pleft        = ar->pleft;
                __entry->pright        = ar->pright;
                __entry->flags        = ar->flags;
        ),

        /* NOTE(review): the format string ends with a trailing space */
        TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
                  "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
                  __entry->len, __entry->logical, __entry->goal,
                  __entry->lleft, __entry->lright, __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_allocate_blocks - trace event taking an allocation request and a
 * block number.
 *
 * Records the same @ar fields as ext4_request_blocks (device, inode
 * number, len, logical, goal, lleft, lright, pleft, pright, flags) plus
 * the caller-supplied @block.
 */
TRACE_EVENT(ext4_allocate_blocks,
        TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block),

        TP_ARGS(ar, block),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        unsigned int, len                )
                __field(        __u32,  logical                        )
                __field(        __u32,        lleft                        )
                __field(        __u32,        lright                        )
                __field(        __u64,        goal                        )
                __field(        __u64,        pleft                        )
                __field(        __u64,        pright                        )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = ar->inode->i_sb->s_dev;
                __entry->ino        = ar->inode->i_ino;
                __entry->block        = block;
                __entry->len        = ar->len;
                __entry->logical = ar->logical;
                __entry->goal        = ar->goal;
                __entry->lleft        = ar->lleft;
                __entry->lright        = ar->lright;
                __entry->pleft        = ar->pleft;
                __entry->pright        = ar->pright;
                __entry->flags        = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
                  "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
                  __entry->len, __entry->block, __entry->logical,
                  __entry->goal,  __entry->lleft, __entry->lright,
                  __entry->pleft, __entry->pright)
);

/*
 * ext4_free_blocks - trace event taking an inode, a block number, a count
 * and a flag mask.
 *
 * Records @inode's device, inode number and mode plus the @block, @count
 * and @flags arguments.  The flags are rendered through
 * show_free_flags().
 */
TRACE_EVENT(ext4_free_blocks,
        TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
                 int flags),

        TP_ARGS(inode, block, count, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        unsigned long,        count                )
                __field(        int,        flags                        )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->block                = block;
                __entry->count                = count;
                __entry->flags                = flags;
                __entry->mode                = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->block, __entry->count,
                  show_free_flags(__entry->flags))
);

/*
 * ext4_sync_file_enter - trace event taking a file and a datasync flag.
 *
 * Works from @file's dentry (file->f_path.dentry): records the dentry's
 * superblock device, the inode number of the dentry's inode, the inode
 * number of the parent dentry's inode, and @datasync.
 */
TRACE_EVENT(ext4_sync_file_enter,
        TP_PROTO(struct file *file, int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        parent                        )
                __field(        int,        datasync                )
        ),

        TP_fast_assign(
                struct dentry *dentry = file->f_path.dentry;

                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->datasync        = datasync;
                /*
                 * NOTE(review): d_parent is read here with no locking
                 * visible in this block - confirm that is acceptable.
                 */
                __entry->parent                = d_inode(dentry->d_parent)->i_ino;
        ),

        /* NOTE(review): the format string ends with a trailing space */
        TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
);

/*
 * ext4_sync_file_exit: logs the device and inode number of @inode along
 * with the @ret value handed to the tracepoint.
 */
TRACE_EVENT(ext4_sync_file_exit,
        TP_PROTO(struct inode *inode, int ret),

        TP_ARGS(inode, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret)
);

/*
 * ext4_sync_fs: logs the super block's device number and the @wait
 * argument.
 */
TRACE_EVENT(ext4_sync_fs,
        TP_PROTO(struct super_block *sb, int wait),

        TP_ARGS(sb, wait),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, wait)
        ),

        TP_fast_assign(
                __entry->wait = wait;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d wait %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait)
);

/*
 * ext4_alloc_da_blocks: logs the device, the inode number and the inode's
 * i_reserved_data_blocks counter (taken from EXT4_I(inode)).
 */
TRACE_EVENT(ext4_alloc_da_blocks,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, data_blocks)
        ),

        TP_fast_assign(
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu reserved_data_blocks %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->data_blocks)
);

/*
 * ext4_mballoc_alloc: snapshot of an allocation context.
 *
 * Three extents are copied out of @ac: "orig" from ac_o_ex, "goal" from
 * ac_g_ex and "result" from ac_f_ex.  Each is printed as
 * group/start/len@logical.  The remaining output maps as follows:
 *   blks   <- ac_found
 *   grps   <- ac_groups_scanned
 *   cr     <- ac_criteria
 *   flags  <- ac_flags, rendered through show_mballoc_flags()
 *   tail   <- ac_tail
 *   broken <- 1 << ac_buddy, or 0 when ac_buddy is 0
 */
TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, goal_logical)
                __field(int, goal_start)
                __field(__u32, goal_group)
                __field(int, goal_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
                __field(__u16, found)
                __field(__u16, groups)
                __field(__u16, buddy)
                __field(__u16, flags)
                __field(__u16, tail)
                __field(__u8, cr)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Original request. */
                __entry->orig_logical = ac->ac_o_ex.fe_logical;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_len = ac->ac_o_ex.fe_len;

                /* Goal extent. */
                __entry->goal_logical = ac->ac_g_ex.fe_logical;
                __entry->goal_start = ac->ac_g_ex.fe_start;
                __entry->goal_group = ac->ac_g_ex.fe_group;
                __entry->goal_len = ac->ac_g_ex.fe_len;

                /* Result extent. */
                __entry->result_logical = ac->ac_f_ex.fe_logical;
                __entry->result_start = ac->ac_f_ex.fe_start;
                __entry->result_group = ac->ac_f_ex.fe_group;
                __entry->result_len = ac->ac_f_ex.fe_len;

                /* Scalar members, in record order. */
                __entry->found = ac->ac_found;
                __entry->groups = ac->ac_groups_scanned;
                __entry->buddy = ac->ac_buddy;
                __entry->flags = ac->ac_flags;
                __entry->tail = ac->ac_tail;
                __entry->cr = ac->ac_criteria;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %u flags %s "
                  "tail %u broken %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start, __entry->orig_len,
                  __entry->orig_logical,
                  __entry->goal_group, __entry->goal_start, __entry->goal_len,
                  __entry->goal_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical,
                  __entry->found, __entry->groups, __entry->cr,
                  show_mballoc_flags(__entry->flags), __entry->tail,
                  __entry->buddy ? 1 << __entry->buddy : 0)
);

/*
 * ext4_mballoc_prealloc: logs two extents from the allocation context,
 * "orig" copied from ac_o_ex and "result" copied from ac_b_ex.  Each is
 * printed as group/start/len@logical.
 */
TRACE_EVENT(ext4_mballoc_prealloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Original request. */
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_len = ac->ac_o_ex.fe_len;
                __entry->orig_logical = ac->ac_o_ex.fe_logical;

                /* Result, taken from ac_b_ex. */
                __entry->result_group = ac->ac_b_ex.fe_group;
                __entry->result_start = ac->ac_b_ex.fe_start;
                __entry->result_len = ac->ac_b_ex.fe_len;
                __entry->result_logical = ac->ac_b_ex.fe_logical;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start, __entry->orig_len,
                  __entry->orig_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical)
);

/*
 * ext4__mballoc event class: logs a device, an inode number and an extent
 * given as group/start/len.  @inode may be NULL, in which case the inode
 * number is recorded as 0.
 */
DECLARE_EVENT_CLASS(ext4__mballoc,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->ino = inode ? inode->i_ino : 0;
                __entry->result_group = group;
                __entry->result_start = start;
                __entry->result_len = len;
        ),

        TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->result_group,
                  __entry->result_start, __entry->result_len)
);

/* ext4_mballoc_discard: an event of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/* ext4_mballoc_free: an event of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_forget: logs the device, inode number and inode mode together with
 * the @is_metadata and @block arguments.
 */
TRACE_EVENT(ext4_forget,
        TP_PROTO(struct inode *inode, int is_metadata, __u64 block),

        TP_ARGS(inode, is_metadata, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(int, is_metadata)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;

                /* Values passed in by the caller. */
                __entry->is_metadata = is_metadata;
                __entry->block = block;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->is_metadata, __entry->block)
);

/*
 * ext4_da_update_reserve_space: logs the inode's identity (dev, ino, mode),
 * its i_blocks and i_reserved_data_blocks values, and the @used_blocks and
 * @quota_claim arguments.
 */
TRACE_EVENT(ext4_da_update_reserve_space,
        TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),

        TP_ARGS(inode, used_blocks, quota_claim),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, used_blocks)
                __field(int, reserved_data_blocks)
                __field(int, quota_claim)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks =
                        EXT4_I(inode)->i_reserved_data_blocks;

                /* Values passed in by the caller. */
                __entry->used_blocks = used_blocks;
                __entry->quota_claim = quota_claim;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d quota_claim %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->i_blocks, __entry->used_blocks,
                  __entry->reserved_data_blocks, __entry->quota_claim)
);

/*
 * ext4_da_reserve_space: logs the inode's identity (dev, ino, mode) plus
 * its i_blocks and i_reserved_data_blocks values.
 */
TRACE_EVENT(ext4_da_reserve_space,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, reserved_data_blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks =
                        EXT4_I(inode)->i_reserved_data_blocks;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->i_blocks, __entry->reserved_data_blocks)
);

/*
 * ext4_da_release_space: logs the inode's identity (dev, ino, mode), its
 * i_blocks and i_reserved_data_blocks values, and the @freed_blocks
 * argument.
 */
TRACE_EVENT(ext4_da_release_space,
        TP_PROTO(struct inode *inode, int freed_blocks),

        TP_ARGS(inode, freed_blocks),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, freed_blocks)
                __field(int, reserved_data_blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks =
                        EXT4_I(inode)->i_reserved_data_blocks;

                /* Value passed in by the caller. */
                __entry->freed_blocks = freed_blocks;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->i_blocks, __entry->freed_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4__bitmap_load event class: logs the super block's device number and
 * a group number.  @group arrives as unsigned long and is stored in a
 * __u32 field.
 */
DECLARE_EVENT_CLASS(ext4__bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
        ),

        TP_fast_assign(
                __entry->group = group;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);

/* ext4_mb_bitmap_load: an event of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group),
        TP_ARGS(sb, group)
);

/* ext4_mb_buddy_bitmap_load: an event of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group),
        TP_ARGS(sb, group)
);

/* ext4_load_inode_bitmap: an event of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
        TP_PROTO(struct super_block *sb, unsigned long group),
        TP_ARGS(sb, group)
);

/*
 * ext4_read_block_bitmap_load: logs the super block's device number, a
 * group number (stored as __u32) and the @prefetch flag, which is printed
 * with %d.
 */
TRACE_EVENT(ext4_read_block_bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch),

        TP_ARGS(sb, group, prefetch),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
                __field(bool, prefetch)
        ),

        TP_fast_assign(
                __entry->prefetch = prefetch;
                __entry->group = group;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d group %u prefetch %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group,
                  __entry->prefetch)
);

/*
 * ext4_direct_IO_enter: logs the device and inode number together with the
 * @offset (recorded as "pos"), @len and @rw arguments.
 */
TRACE_EVENT(ext4_direct_IO_enter,
        TP_PROTO(struct inode *inode, loff_t offset,
                 unsigned long len, int rw),

        TP_ARGS(inode, offset, len, rw),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned long, len)
                __field(int, rw)
        ),

        TP_fast_assign(
                /* Values passed in by the caller. */
                __entry->pos = offset;
                __entry->len = len;
                __entry->rw = rw;

                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->pos,
                  __entry->len, __entry->rw)
);

/*
 * ext4_direct_IO_exit: logs the device and inode number together with the
 * @offset (recorded as "pos"), @len, @rw and @ret arguments.
 */
TRACE_EVENT(ext4_direct_IO_exit,
        TP_PROTO(struct inode *inode, loff_t offset,
                 unsigned long len, int rw, int ret),

        TP_ARGS(inode, offset, len, rw, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned long, len)
                __field(int, rw)
                __field(int, ret)
        ),

        TP_fast_assign(
                /* Values passed in by the caller. */
                __entry->pos = offset;
                __entry->len = len;
                __entry->rw = rw;
                __entry->ret = ret;

                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->pos,
                  __entry->len, __entry->rw, __entry->ret)
);

/*
 * ext4__fallocate_mode event class: logs the device, inode number, @offset,
 * @len and @mode.  The mode is rendered through show_falloc_mode().
 */
DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, offset)
                __field(loff_t, len)
                __field(int, mode)
        ),

        TP_fast_assign(
                /* Values passed in by the caller. */
                __entry->offset = offset;
                __entry->len = len;
                __entry->mode = mode;

                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->offset,
                  __entry->len, show_falloc_mode(__entry->mode))
);

/* ext4_fallocate_enter: an event of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_punch_hole: an event of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_zero_range: an event of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_fallocate_exit: logs the device and inode number together with
 * @offset (recorded as "pos"), @max_blocks (recorded as "blocks") and @ret.
 */
TRACE_EVENT(ext4_fallocate_exit,
        TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks,
                 int ret),

        TP_ARGS(inode, offset, max_blocks, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, blocks)
                __field(int, ret)
        ),

        TP_fast_assign(
                /* Values passed in by the caller. */
                __entry->pos = offset;
                __entry->blocks = max_blocks;
                __entry->ret = ret;

                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->pos,
                  __entry->blocks, __entry->ret)
);

/*
 * ext4_unlink_enter: logs the device, the inode number and i_size of the
 * inode behind @dentry, and the inode number of @parent.
 */
TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent, struct dentry *dentry),

        TP_ARGS(parent, dentry),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, parent)
                __field(loff_t, size)
        ),

        TP_fast_assign(
                /* Everything about the target comes from the dentry. */
                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->size = d_inode(dentry)->i_size;

                __entry->parent = parent->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->size, (unsigned long) __entry->parent)
);

/*
 * ext4_unlink_exit: logs the device and inode number behind @dentry along
 * with the @ret value handed to the tracepoint.
 */
TRACE_EVENT(ext4_unlink_exit,
        TP_PROTO(struct dentry *dentry, int ret),

        TP_ARGS(dentry, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->dev = dentry->d_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret)
);

/*
 * ext4__truncate event class: logs the device, inode number and i_blocks
 * value of @inode.
 */
DECLARE_EVENT_CLASS(ext4__truncate,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, blocks)
        ),

        TP_fast_assign(
                __entry->blocks = inode->i_blocks;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->blocks)
);

/* ext4_truncate_enter: an event of the ext4__truncate class. */
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/* ext4_truncate_exit: an event of the ext4__truncate class. */
DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/*
 * ext4_ext_convert_to_initialized_enter
 *
 * @ux is the unwritten extent.  The record holds the device and inode
 * number, the m_lblk/m_len pair from @map, and the logical block, actual
 * length and physical block decoded from @ux.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux),

        TP_ARGS(inode, map, ux),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, m_lblk)
                __field(unsigned, m_len)
                __field(ext4_lblk_t, u_lblk)
                __field(unsigned, u_len)
                __field(ext4_fsblk_t, u_pblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Requested mapping. */
                __entry->m_lblk = map->m_lblk;
                __entry->m_len = map->m_len;

                /* Unwritten extent; ee_block is stored little-endian. */
                __entry->u_lblk = le32_to_cpu(ux->ee_block);
                __entry->u_len = ext4_ext_get_actual_len(ux);
                __entry->u_pblk = ext4_ext_pblock(ux);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
                  "u_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->m_lblk,
                  __entry->m_len, __entry->u_lblk, __entry->u_len,
                  __entry->u_pblk)
);

/*
 * ext4_ext_convert_to_initialized_fastpath
 *
 * @ux is the unwritten extent.
 * @ix is the initialized extent to which blocks are transferred.
 *
 * The record holds the device and inode number, the m_lblk/m_len pair from
 * @map, and the logical block, actual length and physical block decoded
 * from each of @ux and @ix.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux, struct ext4_extent *ix),

        TP_ARGS(inode, map, ux, ix),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, m_lblk)
                __field(unsigned, m_len)
                __field(ext4_lblk_t, u_lblk)
                __field(unsigned, u_len)
                __field(ext4_fsblk_t, u_pblk)
                __field(ext4_lblk_t, i_lblk)
                __field(unsigned, i_len)
                __field(ext4_fsblk_t, i_pblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Requested mapping. */
                __entry->m_lblk = map->m_lblk;
                __entry->m_len = map->m_len;

                /* Unwritten extent. */
                __entry->u_lblk = le32_to_cpu(ux->ee_block);
                __entry->u_len = ext4_ext_get_actual_len(ux);
                __entry->u_pblk = ext4_ext_pblock(ux);

                /* Initialized extent. */
                __entry->i_lblk = le32_to_cpu(ix->ee_block);
                __entry->i_len = ext4_ext_get_actual_len(ix);
                __entry->i_pblk = ext4_ext_pblock(ix);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
                  "u_lblk %u u_len %u u_pblk %llu "
                  "i_lblk %u i_len %u i_pblk %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->m_lblk,
                  __entry->m_len, __entry->u_lblk, __entry->u_len,
                  __entry->u_pblk, __entry->i_lblk, __entry->i_len,
                  __entry->i_pblk)
);

/*
 * ext4__map_blocks_enter event class: logs the device, inode number and
 * the @lblk, @len and @flags arguments.  The flags are rendered through
 * show_map_flags().
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, lblk, len, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                /* Values passed in by the caller. */
                __entry->lblk = lblk;
                __entry->len = len;
                __entry->flags = flags;

                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk,
                  __entry->len, show_map_flags(__entry->flags))
);

/* ext4_ext_map_blocks_enter: an event of the ext4__map_blocks_enter class. */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, lblk, len, flags)
);

/* ext4_ind_map_blocks_enter: an event of the ext4__map_blocks_enter class. */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4__map_blocks_exit event class: logs the device, inode number, the
 * @flags and @ret arguments, and the m_pblk, m_lblk, m_len and m_flags
 * members of @map.  @flags is rendered through show_map_flags() and
 * m_flags through show_mflags().
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned int flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, flags)
                __field(ext4_fsblk_t, pblk)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
                __field(unsigned int, mflags)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Members of the map structure. */
                __entry->lblk = map->m_lblk;
                __entry->pblk = map->m_pblk;
                __entry->len = map->m_len;
                __entry->mflags = map->m_flags;

                /* Scalar arguments. */
                __entry->flags = flags;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
                  "mflags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_map_flags(__entry->flags),
                  __entry->lblk, __entry->pblk, __entry->len,
                  show_mflags(__entry->mflags), __entry->ret)
);

/*
 * ext4_ext_map_blocks_exit: event built on the ext4__map_blocks_exit
 * class; arguments are (inode, flags, map, ret).
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
        TP_PROTO(struct inode *inode,
                 unsigned int flags,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ind_map_blocks_exit: second event built on the
 * ext4__map_blocks_exit class; arguments are (inode, flags, map, ret).
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
        TP_PROTO(struct inode *inode,
                 unsigned int flags,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ext_load_extent: records the device and inode number of @inode
 * together with the logical block @lblk and physical block @pblk passed
 * by the caller.
 */
TRACE_EVENT(ext4_ext_load_extent,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 ext4_fsblk_t pblk),

        TP_ARGS(inode, lblk, pblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pblk = pblk;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->pblk)
);

/*
 * ext4_load_inode: records the device of @sb and the inode number @ino.
 *
 * The inode number is cast to unsigned long for printing, so it is
 * formatted with %lu (as the other events in this file do) rather than
 * a signed conversion, which would misprint values above LONG_MAX.
 */
TRACE_EVENT(ext4_load_inode,
        TP_PROTO(struct super_block *sb, unsigned long ino),

        TP_ARGS(sb, ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        ino_t,        ino                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ino                = ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_journal_start: records the device of @sb, the three integer
 * counts @blocks, @rsv_blocks and @revoke_creds, and the address @IP,
 * which is printed symbolically with %pS as "caller".
 */
TRACE_EVENT(ext4_journal_start,
        TP_PROTO(struct super_block *sb,
                 int blocks,
                 int rsv_blocks,
                 int revoke_creds,
                 unsigned long IP),

        TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, IP),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, ip)
                __field(int, blocks)
                __field(int, rsv_blocks)
                __field(int, revoke_creds)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->ip = IP;
                __entry->blocks = blocks;
                __entry->rsv_blocks = rsv_blocks;
                __entry->revoke_creds = revoke_creds;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, "
                  "revoke_creds %d, caller %pS",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->blocks,
                  __entry->rsv_blocks,
                  __entry->revoke_creds,
                  (void *)__entry->ip)
);

/*
 * ext4_journal_start_reserved: records the device of @sb, the integer
 * @blocks and the address @IP (printed symbolically with %pS).
 *
 * The output uses "blocks %d, caller %pS" so that the label/value/comma
 * layout matches ext4_journal_start; the previous text had the comma
 * between the "blocks" label and its value.
 */
TRACE_EVENT(ext4_journal_start_reserved,
        TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),

        TP_ARGS(sb, blocks, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(unsigned long,        ip                        )
                __field(          int,        blocks                        )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
        ),

        TP_printk("dev %d,%d blocks %d, caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blocks, (void *)__entry->ip)
);

/*
 * ext4__trim: event class for the trim events.
 *
 * Unlike most events in this file, the device number is split into its
 * MAJOR()/MINOR() parts at record time and stored as two ints. The
 * @group, @start and @len arguments are stored as given.
 */
DECLARE_EVENT_CLASS(ext4__trim,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len),

        TP_STRUCT__entry(
                __field(int, dev_major)
                __field(int, dev_minor)
                __field(__u32, group)
                __field(int, start)
                __field(int, len)
        ),

        TP_fast_assign(
                __entry->dev_major = MAJOR(sb->s_dev);
                __entry->dev_minor = MINOR(sb->s_dev);
                __entry->group = group;
                __entry->start = start;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d group %u, start %d, len %d",
                  __entry->dev_major,
                  __entry->dev_minor,
                  __entry->group,
                  __entry->start,
                  __entry->len)
);

/*
 * ext4_trim_extent: event built on the ext4__trim class; arguments are
 * (sb, group, start, len).
 */
DEFINE_EVENT(ext4__trim, ext4_trim_extent,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_trim_all_free: second event built on the ext4__trim class;
 * arguments are (sb, group, start, len).
 */
DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_ext_handle_unwritten_extents: records the device and inode
 * number of @inode, the m_lblk/m_pblk/m_len members of @map, @flags
 * (rendered through show_map_flags()), @allocated and @newblock.
 *
 * "allocated" is an unsigned int field and is passed to the format as
 * unsigned int, so it is printed with %u rather than %d.
 */
TRACE_EVENT(ext4_ext_handle_unwritten_extents,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
                 unsigned int allocated, ext4_fsblk_t newblock),

        TP_ARGS(inode, map, flags, allocated, newblock),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        int,                flags                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        allocated        )
                __field(        ext4_fsblk_t,        newblk                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->flags                = flags;
                __entry->lblk                = map->m_lblk;
                __entry->pblk                = map->m_pblk;
                __entry->len                = map->m_len;
                __entry->allocated        = allocated;
                __entry->newblk                = newblock;
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
                  "allocated %u newblock %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_map_flags(__entry->flags),
                  (unsigned int) __entry->allocated,
                  (unsigned long long) __entry->newblk)
);

/*
 * ext4_get_implied_cluster_alloc_exit: records the device of @sb, the
 * m_flags/m_lblk/m_pblk/m_len members of @map and @ret. The flags are
 * rendered through show_mflags().
 */
TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
        TP_PROTO(struct super_block *sb,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(sb, map, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned int, flags)
                __field(ext4_lblk_t, lblk)
                __field(ext4_fsblk_t, pblk)
                __field(unsigned int, len)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->flags = map->m_flags;
                __entry->lblk = map->m_lblk;
                __entry->pblk = map->m_pblk;
                __entry->len = map->m_len;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d m_lblk %u m_pblk %llu "
                  "m_len %u m_flags %s ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  __entry->len,
                  show_mflags(__entry->flags),
                  __entry->ret)
);

/*
 * ext4_ext_put_in_cache: records the device and inode number of @inode
 * plus the caller supplied @lblk, @len and @start values.
 */
TRACE_EVENT(ext4_ext_put_in_cache,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 unsigned int len,
                 ext4_fsblk_t start),

        TP_ARGS(inode, lblk, len, start),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
                __field(ext4_fsblk_t, start)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->len = len;
                __entry->start = start;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (unsigned) __entry->lblk,
                  __entry->len, (unsigned long long) __entry->start)
);

/*
 * ext4_ext_in_cache: records the device and inode number of @inode,
 * the logical block @lblk and the integer @ret.
 */
TRACE_EVENT(ext4_ext_in_cache,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 int ret),

        TP_ARGS(inode, lblk, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (unsigned) __entry->lblk,
                  __entry->ret)
);

/*
 * ext4_find_delalloc_range: records the device and inode number of
 * @inode, the logical block range @from..@to, the integers @reverse
 * and @found, and the logical block @found_blk (printed as "(blk = N)").
 */
TRACE_EVENT(ext4_find_delalloc_range,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t from,
                 ext4_lblk_t to,
                 int reverse,
                 int found,
                 ext4_lblk_t found_blk),

        TP_ARGS(inode, from, to, reverse, found, found_blk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, from)
                __field(ext4_lblk_t, to)
                __field(int, reverse)
                __field(int, found)
                __field(ext4_lblk_t, found_blk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->from = from;
                __entry->to = to;
                __entry->reverse = reverse;
                __entry->found = found;
                __entry->found_blk = found_blk;
        ),

        TP_printk("dev %d,%d ino %lu from %u to %u "
                  "reverse %d found %d (blk = %u)",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  __entry->reverse,
                  __entry->found,
                  (unsigned) __entry->found_blk)
);

/*
 * ext4_get_reserved_cluster_alloc: records the device and inode number
 * of @inode together with the logical block @lblk and length @len.
 */
TRACE_EVENT(ext4_get_reserved_cluster_alloc,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 unsigned int len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (unsigned) __entry->lblk,
                  __entry->len)
);

/*
 * ext4_ext_show_extent: records the device and inode number of @inode
 * and the (lblk, pblk, len) triple supplied by the caller. @len is an
 * unsigned short.
 */
TRACE_EVENT(ext4_ext_show_extent,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 ext4_fsblk_t pblk,
                 unsigned short len),

        TP_ARGS(inode, lblk, pblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
                __field(ext4_lblk_t, lblk)
                __field(unsigned short, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pblk = pblk;
                __entry->lblk = lblk;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (unsigned) __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  (unsigned short) __entry->len)
);

/*
 * ext4_remove_blocks: records the device and inode number of @inode,
 * the @from/@to values, a description of extent @ex (physical block
 * from ext4_ext_pblock(), logical block from ee_block converted with
 * le32_to_cpu(), length from ext4_ext_get_actual_len()) and the
 * pclu/lblk/state members of partial cluster @pc.
 *
 * The two format literals are joined with a separating space so the
 * extent's closing bracket no longer runs into "from".
 *
 * NOTE(review): @to is declared ext4_fsblk_t in TP_PROTO but stored in
 * an ext4_lblk_t field and printed with %u - confirm the intended type
 * against the callers.
 */
TRACE_EVENT(ext4_remove_blocks,
        TP_PROTO(struct inode *inode, struct ext4_extent *ex,
                 ext4_lblk_t from, ext4_fsblk_t to,
                 struct partial_cluster *pc),

        TP_ARGS(inode, ex, from, to, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        from        )
                __field(        ext4_lblk_t,        to        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->from                = from;
                __entry->to                = to;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u] "
                  "from %u to %u partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_leaf: records the device and inode number of @inode, the
 * logical block @start, a description of extent @ex (logical block
 * from ee_block converted with le32_to_cpu(), physical block from
 * ext4_ext_pblock(), length from ext4_ext_get_actual_len()) and the
 * pclu/lblk/state members of partial cluster @pc.
 *
 * The two format literals are joined with a separating space so the
 * extent's closing bracket no longer runs into "partial".
 */
TRACE_EVENT(ext4_ext_rm_leaf,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 struct ext4_extent *ex,
                 struct partial_cluster *pc),

        TP_ARGS(inode, start, ex, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        short,                ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u] "
                  "partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_idx: records the device and inode number of @inode and
 * the physical block @pblk, printed as "index_pblk".
 */
TRACE_EVENT(ext4_ext_rm_idx,
        TP_PROTO(struct inode *inode,
                 ext4_fsblk_t pblk),

        TP_ARGS(inode, pblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pblk = pblk;
        ),

        TP_printk("dev %d,%d ino %lu index_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long long) __entry->pblk)
);

/*
 * ext4_ext_remove_space: records the device and inode number of
 * @inode, the logical block range @start..@end (printed as "since" and
 * "end") and the integer @depth.
 */
TRACE_EVENT(ext4_ext_remove_space,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t start,
                 ext4_lblk_t end,
                 int depth),

        TP_ARGS(inode, start, end, depth),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, start)
                __field(ext4_lblk_t, end)
                __field(int, depth)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->start = start;
                __entry->end = end;
                __entry->depth = depth;
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (unsigned) __entry->start,
                  (unsigned) __entry->end, __entry->depth)
);

/*
 * ext4_ext_remove_space_done: records the device and inode number of
 * @inode, the logical block range @start..@end, @depth, the
 * pclu/lblk/state members of partial cluster @pc, and @eh_entries.
 *
 * @eh_entries arrives as a little-endian 16-bit value and is converted
 * with le16_to_cpu() before being stored; it is printed as
 * "remaining_entries".
 */
TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t start,
                 ext4_lblk_t end,
                 int depth,
                 struct partial_cluster *pc,
                 __le16 eh_entries),

        TP_ARGS(inode, start, end, depth, pc, eh_entries),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, start)
                __field(ext4_lblk_t, end)
                __field(int, depth)
                __field(ext4_fsblk_t, pc_pclu)
                __field(ext4_lblk_t, pc_lblk)
                __field(int, pc_state)
                __field(unsigned short, eh_entries)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->start = start;
                __entry->end = end;
                __entry->depth = depth;
                /* Partial cluster state. */
                __entry->pc_pclu = pc->pclu;
                __entry->pc_lblk = pc->lblk;
                __entry->pc_state = pc->state;
                /* Store the entry count in CPU byte order. */
                __entry->eh_entries = le16_to_cpu(eh_entries);
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u "
                  "depth %d partial [pclu %lld lblk %u state %d] "
                  "remaining_entries %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start, (unsigned) __entry->end,
                  __entry->depth,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state,
                  (unsigned short) __entry->eh_entries)
);

/*
 * ext4__es_extent: event class describing one extent_status entry.
 *
 * Captures the device and inode number of @inode, the es_lblk and
 * es_len members of @es, the value returned by ext4_es_show_pblock(es)
 * and the value returned by ext4_es_status(es). The status is rendered
 * through show_extent_status().
 */
DECLARE_EVENT_CLASS(ext4__es_extent,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  __entry->pblk,
                  show_extent_status(__entry->status))
);

/*
 * ext4_es_insert_extent: event built on the ext4__es_extent class;
 * arguments are (inode, es).
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_cache_extent: second event built on the ext4__es_extent
 * class; arguments are (inode, es).
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_remove_extent: records the device and inode number of @inode
 * and the range given by @lblk and @len. Both range values are stored
 * in loff_t fields and printed with %lld.
 */
TRACE_EVENT(ext4_es_remove_extent,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 ext4_lblk_t len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, lblk)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len)
);

/*
 * ext4_es_find_extent_range_enter: records the device and inode number
 * of @inode and the logical block @lblk.
 */
TRACE_EVENT(ext4_es_find_extent_range_enter,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk)
);

/*
 * ext4_es_find_extent_range_exit: records the device and inode number
 * of @inode, the es_lblk and es_len members of @es, the value returned
 * by ext4_es_show_pblock(es) and the value returned by
 * ext4_es_status(es). The status is rendered through
 * show_extent_status().
 */
TRACE_EVENT(ext4_es_find_extent_range_exit,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  __entry->pblk,
                  show_extent_status(__entry->status))
);

/*
 * ext4_es_lookup_extent_enter: records the device and inode number of
 * @inode and the logical block @lblk.
 */
TRACE_EVENT(ext4_es_lookup_extent_enter,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk)
);

/*
 * ext4_es_lookup_extent_exit: records the device and inode number of
 * @inode, the es_lblk and es_len members of @es, the values returned
 * by ext4_es_show_pblock(es) and ext4_es_status(es), and @found.
 *
 * All fields are stored unconditionally. At print time, when found is
 * zero the physical block is shown as 0 and show_extent_status() is
 * given 0 instead of the stored status.
 */
TRACE_EVENT(ext4_es_lookup_extent_exit,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es,
                 int found),

        TP_ARGS(inode, es, found),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
                __field(int, found)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
                __entry->found = found;
        ),

        TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->found,
                  __entry->lblk,
                  __entry->len,
                  __entry->found ? __entry->pblk : 0,
                  show_extent_status(__entry->found ? __entry->status : 0))
);

/*
 * ext4__es_shrink_enter: event class recording the device of @sb and
 * the two integers @nr_to_scan and @cache_cnt.
 */
DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_to_scan)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nr_to_scan = nr_to_scan;
                __entry->cache_cnt = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_to_scan,
                  __entry->cache_cnt)
);

/*
 * ext4_es_shrink_count: event built on the ext4__es_shrink_enter
 * class; arguments are (sb, nr_to_scan, cache_cnt).
 */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_enter: second event built on the
 * ext4__es_shrink_enter class; arguments are
 * (sb, nr_to_scan, cache_cnt).
 */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_exit: records the device of @sb and the two
 * integers @nr_shrunk and @cache_cnt.
 */
TRACE_EVENT(ext4_es_shrink_scan_exit,
        TP_PROTO(struct super_block *sb,
                 int nr_shrunk,
                 int cache_cnt),

        TP_ARGS(sb, nr_shrunk, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_shrunk)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nr_shrunk = nr_shrunk;
                __entry->cache_cnt = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_shrunk,
                  __entry->cache_cnt)
);

/*
 * ext4_collapse_range - trace a collapse-range request on an inode.
 *
 * Records the inode's device and inode number together with the @offset
 * and @len arguments as passed by the caller (both loff_t, printed signed).
 */
TRACE_EVENT(ext4_collapse_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        /* ino is cast to unsigned long so it matches the %lu conversion. */
        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_insert_range - trace an insert-range request on an inode.
 *
 * Same record layout and output format as ext4_collapse_range: the inode's
 * device and inode number plus the @offset and @len arguments as passed.
 */
TRACE_EVENT(ext4_insert_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        /* ino is cast to unsigned long so it matches the %lu conversion. */
        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_es_shrink - summary record for one ext4_es_shrink pass.
 *
 * Records the device number of @sb and the @nr_shrunk, @nr_skipped and
 * @retried values as passed.  @scan_time is divided by 1000 before being
 * stored, so the reported value is in units 1000 times coarser than the
 * argument.
 * NOTE(review): presumably nanoseconds in, microseconds out - confirm
 * against the caller.
 */
TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),

        TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        int,                nr_shrunk        )
                __field(        unsigned long long, scan_time        )
                __field(        int,                nr_skipped        )
                __field(        int,                retried                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                /* div_u64() keeps the 64-bit division usable on 32-bit builds. */
                __entry->scan_time        = div_u64(scan_time, 1000);
                __entry->nr_skipped        = nr_skipped;
                __entry->retried        = retried;
        ),

        TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
                  "nr_skipped %d retried %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
                  __entry->scan_time, __entry->nr_skipped, __entry->retried)
);

/*
 * ext4_es_insert_delayed_block - trace insertion of a delayed extent.
 *
 * Records the inode's device and inode number, the logical start block and
 * length taken from @es, the physical block as reported by
 * ext4_es_show_pblock(), the status as reported by ext4_es_status(), and
 * the caller-supplied @allocated flag.
 */
TRACE_EVENT(ext4_es_insert_delayed_block,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 bool allocated),

        TP_ARGS(inode, es, allocated),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        bool,                allocated        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = es->es_lblk;
                __entry->len                = es->es_len;
                __entry->pblk                = ext4_es_show_pblock(es);
                __entry->status                = ext4_es_status(es);
                __entry->allocated        = allocated;
        ),

        /*
         * The extent is printed as "[lblk/len)"; the status value is
         * rendered as text by show_extent_status().
         */
        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
                  "allocated %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status),
                  __entry->allocated)
);

/* fsmap traces */
/*
 * Event class for the ext4_fsmap_* tracepoints.
 *
 * Records the device of the superblock's block device (sb->s_bdev->bd_dev,
 * not sb->s_dev), the key device decoded from @keydev with
 * new_decode_dev(), and the @agno, @bno, @len and @owner values as passed.
 *
 * The format string deliberately has no trailing newline: the trace core
 * appends one to every TP_printk() format, so adding another here would
 * emit a blank line after each event.
 */
DECLARE_EVENT_CLASS(ext4_fsmap_class,
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
                 u64 owner),
        TP_ARGS(sb, keydev, agno, bno, len, owner),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u32, agno)
                __field(u64, bno)
                __field(u64, len)
                __field(u64, owner)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(keydev);
                __entry->agno = agno;
                __entry->bno = bno;
                __entry->len = len;
                __entry->owner = owner;
        ),
        /* Both devices are printed as "major:minor"; owner is shown signed. */
        TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->agno,
                  __entry->bno,
                  __entry->len,
                  __entry->owner)
);
/*
 * DEFINE_FSMAP_EVENT(name) - define tracepoint @name as an instance of
 * ext4_fsmap_class, taking (sb, keydev, agno, bno, len, owner).
 * Used below for the low key, high key and mapping events.
 */
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(ext4_fsmap_class, name, \
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \
                 u64 owner), \
        TP_ARGS(sb, keydev, agno, bno, len, owner))
DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);

/*
 * Event class for the ext4_getfsmap_* tracepoints.
 *
 * Records the device of the superblock's block device (sb->s_bdev->bd_dev),
 * the key device decoded from fsmap->fmr_device with new_decode_dev(), and
 * the fmr_physical, fmr_length, fmr_owner and fmr_flags members of @fsmap.
 *
 * The format string deliberately has no trailing newline: the trace core
 * appends one to every TP_printk() format, so adding another here would
 * emit a blank line after each event.
 */
DECLARE_EVENT_CLASS(ext4_getfsmap_class,
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
        TP_ARGS(sb, fsmap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u64, block)
                __field(u64, len)
                __field(u64, owner)
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(fsmap->fmr_device);
                __entry->block = fsmap->fmr_physical;
                __entry->len = fsmap->fmr_length;
                __entry->owner = fsmap->fmr_owner;
                __entry->flags = fsmap->fmr_flags;
        ),
        /* Both devices are printed as "major:minor"; owner is shown signed. */
        TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->block,
                  __entry->len,
                  __entry->owner,
                  __entry->flags)
);
/*
 * DEFINE_GETFSMAP_EVENT(name) - define tracepoint @name as an instance of
 * ext4_getfsmap_class, taking (sb, fsmap).
 * Used below for the low key, high key and mapping events.
 */
#define DEFINE_GETFSMAP_EVENT(name) \
DEFINE_EVENT(ext4_getfsmap_class, name, \
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \
        TP_ARGS(sb, fsmap))
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);

/*
 * ext4_shutdown - trace a shutdown request for a filesystem.
 *
 * Records the device number of @sb and the @flags argument.
 * NOTE(review): @flags is passed as unsigned long but stored in an
 * "unsigned" field, so any bits above the width of unsigned int are
 * dropped - confirm callers only pass small values.
 */
TRACE_EVENT(ext4_shutdown,
        TP_PROTO(struct super_block *sb, unsigned long flags),

        TP_ARGS(sb, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(     unsigned,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->flags)
);

/*
 * ext4_error - trace the location from which an ext4 error was reported.
 *
 * Records the device number of @sb, the @function pointer and @line.
 * Only the pointer value of @function is stored; the string it points to
 * is not copied into the record and is dereferenced by "%s" when the
 * event is printed.
 * NOTE(review): this relies on the pointed-to string still being valid
 * when the trace is read - confirm callers only pass string constants
 * such as __func__.
 */
TRACE_EVENT(ext4_error,
        TP_PROTO(struct super_block *sb, const char *function,
                 unsigned int line),

        TP_ARGS(sb, function, line),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field( const char *,        function                )
                __field(     unsigned,        line                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->function = function;
                __entry->line        = line;
        ),

        TP_printk("dev %d,%d function %s line %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->function, __entry->line)
);

/*
 * ext4_prefetch_bitmaps - trace a bitmap prefetch request.
 *
 * Records the device number of @sb together with the @group, @next and
 * @prefetch_ios arguments as passed; @prefetch_ios is stored in the "ios"
 * field.
 */
TRACE_EVENT(ext4_prefetch_bitmaps,
            TP_PROTO(struct super_block *sb, ext4_group_t group,
                     ext4_group_t next, unsigned int prefetch_ios),

        TP_ARGS(sb, group, next, prefetch_ios),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        __u32,        next                        )
                __field(        __u32,        ios                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->next        = next;
                __entry->ios        = prefetch_ios;
        ),

        TP_printk("dev %d,%d group %u next %u ios %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->next, __entry->ios)
);

/*
 * ext4_lazy_itable_init - trace lazy inode-table initialisation of a group.
 *
 * Records the device number of @sb and the @group argument as passed.
 */
TRACE_EVENT(ext4_lazy_itable_init,
            TP_PROTO(struct super_block *sb, ext4_group_t group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);

/*
 * ext4_fc_replay_scan - trace the scan pass of fast-commit replay.
 *
 * Records the device number of @sb and the @error and @off values as
 * passed by the caller.
 */
TRACE_EVENT(ext4_fc_replay_scan,
        TP_PROTO(struct super_block *sb, int error, int off),

        TP_ARGS(sb, error, off),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, error)
                __field(int, off)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->error = error;
                __entry->off = off;
        ),

        TP_printk("FC scan pass on dev %d,%d: error %d, off %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->error, __entry->off)
);

/*
 * ext4_fc_replay - trace replay of one fast-commit tag.
 *
 * Records the device number of @sb and the @tag, @ino, @priv1 and @priv2
 * values as passed.  @priv1 and @priv2 are printed as "data1" and "data2";
 * their meaning is not defined here.
 */
TRACE_EVENT(ext4_fc_replay,
        TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2),

        TP_ARGS(sb, tag, ino, priv1, priv2),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, tag)
                __field(int, ino)
                __field(int, priv1)
                __field(int, priv2)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tag = tag;
                __entry->ino = ino;
                __entry->priv1 = priv1;
                __entry->priv2 = priv2;
        ),

        TP_printk("FC Replay %d,%d: tag %d, ino %d, data1 %d, data2 %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tag, __entry->ino, __entry->priv1, __entry->priv2)
);

/*
 * ext4_fc_commit_start - trace the start of a fast commit.
 *
 * Records only the device number of @sb.
 */
TRACE_EVENT(ext4_fc_commit_start,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
        ),

        TP_printk("fast_commit started on dev %d,%d",
                  MAJOR(__entry->dev), MINOR(__entry->dev))
);

/*
 * ext4_fc_commit_stop - trace the end of a fast commit.
 *
 * Records the device number of @sb and the @nblks and @reason arguments,
 * plus a snapshot of three counters read from EXT4_SB(sb)->s_fc_stats at
 * the time the event fires: fc_num_commits, fc_ineligible_commits and
 * fc_numblks.  The counters are stored in int fields.
 */
TRACE_EVENT(ext4_fc_commit_stop,
            TP_PROTO(struct super_block *sb, int nblks, int reason),

        TP_ARGS(sb, nblks, reason),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nblks)
                __field(int, reason)
                __field(int, num_fc)
                __field(int, num_fc_ineligible)
                __field(int, nblks_agg)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nblks = nblks;
                __entry->reason = reason;
                /* Running totals kept in the superblock's fast-commit stats. */
                __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->num_fc_ineligible =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        TP_printk("fc on [%d,%d] nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nblks, __entry->reason, __entry->num_fc,
                  __entry->num_fc_ineligible, __entry->nblks_agg)
);

/*
 * FC_REASON_NAME_STAT(reason) - expand to the two printk arguments for one
 * "%s:%u" pair: the name produced by show_fc_reason(reason) followed by the
 * recorded counter __entry->fc_ineligible_rc[reason].  Only usable inside a
 * TP_printk() argument list where __entry is in scope.
 */
#define FC_REASON_NAME_STAT(reason)                                        \
        show_fc_reason(reason),                                                \
        __entry->fc_ineligible_rc[reason]

/*
 * ext4_fc_stats - dump the fast-commit statistics of a filesystem.
 *
 * Records the device number of @sb and a snapshot of
 * EXT4_SB(sb)->s_fc_stats: every fc_ineligible_reason_count[] slot up to
 * EXT4_FC_REASON_MAX, plus fc_num_commits, fc_ineligible_commits and
 * fc_numblks.
 *
 * The output lists one "name:count" pair per ineligibility reason followed
 * by the three totals.  The format string is built from adjacent string
 * literals, so the separator before "num_commits" must be spelled out
 * explicitly; without it the last reason count runs into the next label.
 */
TRACE_EVENT(ext4_fc_stats,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX)
                __field(unsigned long, fc_commits)
                __field(unsigned long, fc_ineligible_commits)
                __field(unsigned long, fc_numblks)
        ),

        TP_fast_assign(
                int i;

                __entry->dev = sb->s_dev;
                /* Copy the per-reason counters slot by slot. */
                for (i = 0; i < EXT4_FC_REASON_MAX; i++) {
                        __entry->fc_ineligible_rc[i] =
                                EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i];
                }
                __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->fc_ineligible_commits =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        /* Ten "%s:%u" pairs, one per FC_REASON_NAME_STAT() use below. */
        TP_printk("dev %d,%d fc ineligible reasons:\n"
                  "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u"
                  ", num_commits:%lu, ineligible: %lu, numblks: %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
                  __entry->fc_commits, __entry->fc_ineligible_commits,
                  __entry->fc_numblks)
);

/*
 * DEFINE_TRACE_DENTRY_EVENT(__type) - define tracepoint
 * ext4_fc_track_<__type>, taking (inode, dentry, ret).
 *
 * Each generated event records the inode's device and inode number and
 * stores @ret in the "error" field; @dentry is accepted but not recorded.
 * The stringified __type is appended to the output as "fc_<__type>".
 *
 * The inode number is kept as ino_t and printed with %lu, like the other
 * events in this file.  Storing inode->i_ino (an unsigned long) in an int
 * would truncate large inode numbers or show them as negative.
 */
#define DEFINE_TRACE_DENTRY_EVENT(__type)                                \
        TRACE_EVENT(ext4_fc_track_##__type,                                \
            TP_PROTO(struct inode *inode, struct dentry *dentry, int ret), \
                                                                        \
            TP_ARGS(inode, dentry, ret),                                \
                                                                        \
            TP_STRUCT__entry(                                                \
                    __field(dev_t, dev)                                        \
                    __field(ino_t, ino)                                        \
                    __field(int, error)                                        \
                    ),                                                        \
                                                                        \
            TP_fast_assign(                                                \
                    __entry->dev = inode->i_sb->s_dev;                        \
                    __entry->ino = inode->i_ino;                        \
                    __entry->error = ret;                                \
                    ),                                                        \
                                                                        \
            TP_printk("dev %d:%d, inode %lu, error %d, fc_%s",                \
                      MAJOR(__entry->dev), MINOR(__entry->dev),                \
                      (unsigned long) __entry->ino, __entry->error,        \
                      #__type)                                                \
        )

DEFINE_TRACE_DENTRY_EVENT(create);
DEFINE_TRACE_DENTRY_EVENT(link);
DEFINE_TRACE_DENTRY_EVENT(unlink);

/*
 * ext4_fc_track_inode - trace fast-commit tracking of an inode.
 *
 * Records the inode's device and inode number and stores @ret in the
 * "error" field.
 *
 * The inode number is kept as ino_t and printed with %lu, like the other
 * events in this file.  Storing inode->i_ino (an unsigned long) in an int
 * would truncate large inode numbers or show them as negative.
 */
TRACE_EVENT(ext4_fc_track_inode,
            TP_PROTO(struct inode *inode, int ret),

            TP_ARGS(inode, ret),

            TP_STRUCT__entry(
                    __field(dev_t, dev)
                    __field(ino_t, ino)
                    __field(int, error)
                    ),

            TP_fast_assign(
                    __entry->dev = inode->i_sb->s_dev;
                    __entry->ino = inode->i_ino;
                    __entry->error = ret;
                    ),

            TP_printk("dev %d:%d, inode %lu, error %d",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino, __entry->error)
        );

/*
 * ext4_fc_track_range - trace fast-commit tracking of a range of an inode.
 *
 * Records the inode's device and inode number, the @start and @end values
 * as passed, and stores @ret in the "error" field.
 *
 * The inode number is kept as ino_t and printed with %lu, like the other
 * events in this file.  Storing inode->i_ino (an unsigned long) in an int
 * would truncate large inode numbers or show them as negative.
 */
TRACE_EVENT(ext4_fc_track_range,
            TP_PROTO(struct inode *inode, long start, long end, int ret),

            TP_ARGS(inode, start, end, ret),

            TP_STRUCT__entry(
                    __field(dev_t, dev)
                    __field(ino_t, ino)
                    __field(long, start)
                    __field(long, end)
                    __field(int, error)
                    ),

            TP_fast_assign(
                    __entry->dev = inode->i_sb->s_dev;
                    __entry->ino = inode->i_ino;
                    __entry->start = start;
                    __entry->end = end;
                    __entry->error = ret;
                    ),

            TP_printk("dev %d:%d, inode %lu, error %d, start %ld, end %ld",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino, __entry->error,
                      __entry->start, __entry->end)
        );

#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































































































































    4 







    4 














    4 



























    4 










    4 








    4 












    1 
    1 















    1 










    4 

    4 


    4 



    4 






































    4 



    4 






















































    1 













    1 








    1 
    1 
    4 













    4 
    3 
    1 






    4 












    4 
























    3 







    3 












    4 






    4 











    3 












    4 





































    4 











    4 
    1 


































    4 










    4 




    4 







    4 







    4 






    4 



    4 











    4 




    4 
    1 



    4 












    4 


    4 

    2 



























    3 



    3 


    3 

    3 


    3 

























    1 

    1 


    1 

    1 









    1 







    1 



















    2 

















    3 







    2 
    2 
    2 




    3 




















    3 
    3 

    1 
























    2 



    2 

    2 





    2 




    2 

    2 




    2 
    2 



    2 

    2 











    2 
    2 




































    1 

    1 













































































































































    1 









































































































































































































    2 
















































    2 

    2 


    2 







    2 






    2 
    2 

    2 


    2 















    2 










    2 



    2 

    2 







    2 







    1 
    1 
















    1 






















    1 

    1 

    1 






    1 



    1 











    2 









    2 

    1 
    2 


















    2 


    2 
    1 


    1 

    1 

    1 

    1 




    1 














    1 






    1 














    1 

    1 





















    4 
    4 



    1 




    1 


    1 


































    1 










    1 
    1 


    1 


    1 






    1 



    1 



    1 

    1 





    1 




    1 






    1 





















    1 
    1 

    2 
    1 



    2 


    2 

    1 

























































































    3 




















































    4 
    3 



    2 


    2 
    1 



    2 

    4 







































































































































































    1 


    4 








































































    3 
    1 
    4 

    4 
    2 
    4 









    4 


    4 


    3 



    4 







    4 










    4 



    4 







    4 

    4 


    4 
    4 





    1 



    4 

    4 



    1 



    4 













    4 


    4 




    4 


    4 


    4 
    4 


















    4 


    2 
    2 






    3 
    3 




    3 


    3 
































    3 


    4 










    1 


    1 













    1 











    1 




    1 

    1 



    1 



    1 










    1 







    1 


    1 



    1 
    1 






    1 

    1 
    1 

    1 



    1 
















    1 

    1 
    1 







































    1 
































































































































































































    1 

























































































    1 

    1 

    1 

    1 

    1 



























































































    2 

















































    1 
    1 




    3 






    3 



    1 

    1 














    2 




    3 






    3 







    3 









































    1 

    1 


    1 



    1 










































































    3 










    3 



    3 















    3 













    3 

    3 
    1 

    1 







    3 







    3 


    3 










    3 
    1 

    1 




    1 



    2 













    3 















    3 
    2 


    2 


    2 


    2 


    1 
    1 




    1 



    3 
    3 







    1 

    2 
    3 
    3 






    3 
    3 

    3 


    3 
    1 





    2 

    2 
    2 
    1 









    3 





    2 



    3 

    3 
    1 

    1 

    1 




    3 



    2 
    3 

    1 

    1 





    3 
    3 
    3 


    3 




































































































    3 



    3 

    3 


    3 
    3 




    3 

    1 
    3 




    1 






    1 






    3 






    3 


    3 







































    1 





    1 









    1 



    1 



















    1 



    1 



    1 
    1 























    1 














    1 
    1 





    1 
    1 


    1 











    1 
    1 







    1 






















    1 








    1 


    1 






    1 








    1 












    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * or under the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *        inside the path - always follow.
 *        in the last component in creation/removal/renaming - never follow.
 *        if LOOKUP_FOLLOW passed - follow.
 *        if the pathname has trailing slashes - follow.
 *        otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of bytes left for the pathname when a struct filename is embedded
 * at the front of a PATH_MAX-sized buffer: PATH_MAX minus everything that
 * precedes the iname[] member.
 */
#define EMBEDDED_NAME_MAX        (PATH_MAX - offsetof(struct filename, iname))

/*
 * getname_flags - copy a pathname from userspace into a struct filename
 * @filename:        userspace pointer to the pathname
 * @flags:        lookup flags; only LOOKUP_EMPTY is examined here
 * @empty:        optional out-parameter, set to 1 when the pathname is empty.
 *                It is never written otherwise, so a caller that relies on
 *                it has to initialise it first.
 *
 * A freshly built struct filename is returned with refcnt 1.  If
 * audit_reusename() hands back an entry, that entry is returned untouched
 * instead.  On failure an ERR_PTR() is returned: -ENOMEM if an allocation
 * fails, the negative value from strncpy_from_user() if the copy fails,
 * -ENAMETOOLONG if the copy fills all PATH_MAX bytes, and -ENOENT for an
 * empty name unless LOOKUP_EMPTY is set in @flags.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
        struct filename *result;
        char *kname;
        int len;

        /* Reuse an existing entry if the audit code provides one. */
        result = audit_reusename(filename);
        if (result)
                return result;

        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        /*
         * First, try to embed the struct filename inside the names_cache
         * allocation
         */
        kname = (char *)result->iname;
        result->name = kname;

        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
        if (unlikely(len < 0)) {
                __putname(result);
                return ERR_PTR(len);
        }

        /*
         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
         * separate struct filename so we can dedicate the entire
         * names_cache allocation for the pathname, and re-do the copy from
         * userland.
         */
        if (unlikely(len == EMBEDDED_NAME_MAX)) {
                const size_t size = offsetof(struct filename, iname[1]);
                /* The names_cache buffer is now used purely as string storage. */
                kname = (char *)result;

                /*
                 * size is chosen this way to guarantee that
                 * result->iname[0] is within the same object and that
                 * kname can't be equal to result->iname, no matter what.
                 */
                result = kzalloc(size, GFP_KERNEL);
                if (unlikely(!result)) {
                        __putname(kname);
                        return ERR_PTR(-ENOMEM);
                }
                result->name = kname;
                len = strncpy_from_user(kname, filename, PATH_MAX);
                /*
                 * From here on two objects are owned: the names_cache
                 * buffer (kname) and the small header (result).  Both are
                 * released on failure.
                 */
                if (unlikely(len < 0)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(len);
                }
                /* A copy that filled all PATH_MAX bytes did not fit. */
                if (unlikely(len == PATH_MAX)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(-ENAMETOOLONG);
                }
        }

        /*
         * Set the reference count before the empty-name check: the error
         * path below releases the name through putname(), which BUG()s on
         * a non-positive refcnt.
         */
        result->refcnt = 1;
        /* The empty path is special. */
        if (unlikely(!len)) {
                if (empty)
                        *empty = 1;
                if (!(flags & LOOKUP_EMPTY)) {
                        putname(result);
                        return ERR_PTR(-ENOENT);
                }
        }

        /* Remember the originating user pointer and hand the name to audit. */
        result->uptr = filename;
        result->aname = NULL;
        audit_getname(result);
        return result;
}

/*
 * getname - copy a userspace pathname using the default rules
 * @filename:        userspace pointer to the pathname
 *
 * Equivalent to getname_flags() with no flags and no "empty" out-parameter,
 * which means an empty pathname is rejected with -ENOENT.
 */
struct filename *getname(const char __user *filename)
{
        struct filename *name = getname_flags(filename, 0, NULL);

        return name;
}

/*
 * getname_kernel - build a struct filename from a kernel-space string
 * @filename:        NUL-terminated pathname in kernel memory
 *
 * Short names are stored inside the names_cache buffer together with the
 * struct filename itself.  Names too long for that, but still within
 * PATH_MAX (terminator included), get the whole buffer for the string and
 * a small separately allocated header.
 *
 * Returns the new name with refcnt 1, or ERR_PTR(-ENOMEM) /
 * ERR_PTR(-ENAMETOOLONG).
 */
struct filename *getname_kernel(const char *filename)
{
        struct filename *result;
        int len = strlen(filename) + 1;        /* includes the terminator */

        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        /* Too long even for a dedicated buffer. */
        if (len > PATH_MAX) {
                __putname(result);
                return ERR_PTR(-ENAMETOOLONG);
        }

        if (len > EMBEDDED_NAME_MAX) {
                /*
                 * Does not fit next to the header: allocate a header on
                 * its own and let it point at the names_cache buffer.
                 */
                const size_t hdr_size = offsetof(struct filename, iname[1]);
                struct filename *hdr = kmalloc(hdr_size, GFP_KERNEL);

                if (unlikely(!hdr)) {
                        __putname(result);
                        return ERR_PTR(-ENOMEM);
                }
                hdr->name = (char *)result;
                result = hdr;
        } else {
                /* Common case: string lives in the embedded array. */
                result->name = (char *)result->iname;
        }

        memcpy((char *)result->name, filename, len);
        result->uptr = NULL;
        result->aname = NULL;
        result->refcnt = 1;
        audit_getname(result);

        return result;
}

/*
 * putname - drop one reference to a struct filename
 * @name:        name obtained from getname_flags()/getname()/getname_kernel()
 *
 * Calling this with a non-positive reference count is a bug.  When the
 * last reference goes away the storage is released: either the single
 * names_cache buffer holding both header and string, or the names_cache
 * string buffer plus the separately allocated header.
 */
void putname(struct filename *name)
{
        BUG_ON(name->refcnt <= 0);

        name->refcnt--;
        if (name->refcnt > 0)
                return;

        if (name->name == name->iname) {
                /* Header and string share one names_cache buffer. */
                __putname(name);
                return;
        }

        /* String has its own buffer; the header was allocated separately. */
        __putname(name->name);
        kfree(name);
}

/*
 * Evaluate the access ACL of @inode against @mask.
 *
 * Returns the verdict of posix_acl_permission() when an ACL is available,
 * an error from get_acl(), -ECHILD when called with MAY_NOT_BLOCK and the
 * ACL is not cached, or -EAGAIN when there is no ACL to consult (always
 * the case without CONFIG_FS_POSIX_ACL).
 */
static int check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int error;

        if (!(mask & MAY_NOT_BLOCK)) {
                /* Blocking is allowed: fetch the ACL and drop it afterwards. */
                acl = get_acl(inode, ACL_TYPE_ACCESS);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (!acl)
                        return -EAGAIN;

                error = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
                return error;
        }

        /* Non-blocking: only the cached ACL may be looked at. */
        acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
        if (!acl)
                return -EAGAIN;
        /* ->get_acl() must not be called in this mode */
        if (is_uncached_acl(acl))
                return -ECHILD;
        return posix_acl_permission(inode, acl, mask);
#else
        return -EAGAIN;
#endif
}

/*
 * Classic owner/group/other permission check, with POSIX ACLs consulted
 * for non-owners.
 *
 * @mask is passed unmodified to check_acl(), so the MAY_NOT_BLOCK bit
 * used for RCU walking reaches the ACL code.
 *
 * Returns 0 if every requested rwx bit is granted, -EACCES if not, or
 * whatever check_acl() reports other than -EAGAIN.
 */
static int acl_permission_check(struct inode *inode, int mask)
{
        unsigned int perm = inode->i_mode;
        int want;

        /* The owner is judged by the owner bits alone; ACLs are ignored. */
        if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
                want = mask & 7;
                perm >>= 6;
                if (want & ~perm)
                        return -EACCES;
                return 0;
        }

        /* Give the ACL a chance to decide. */
        if (IS_POSIXACL(inode) && (perm & S_IRWXG)) {
                int acl_result = check_acl(inode, mask);

                if (acl_result != -EAGAIN)
                        return acl_result;
        }

        /* From here on only the rwx bits are of interest. */
        want = mask & 7;

        /*
         * Group membership only has to be looked up when the group and
         * other bits disagree on something that was actually requested.
         */
        if ((want & (perm ^ (perm >> 3))) && in_group_p(inode->i_gid))
                perm >>= 3;

        /* Any requested bit missing from the selected triplet? */
        return (want & ~perm) ? -EACCES : 0;
}

/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @inode:        inode to check access rights for
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *                %MAY_NOT_BLOCK ...)
 *
 * Checks read/write/execute permission on a file.  The "fsuid" is used for
 * the comparison, so filesystem access can be given its own identity
 * without touching the "normal" uids used elsewhere.
 *
 * If the mode bits and ACL deny access, capabilities may still grant it:
 * CAP_DAC_READ_SEARCH for non-write access to directories and for pure
 * reads of other inodes, CAP_DAC_OVERRIDE for directories in general and
 * for other inodes unless execution is requested and no exec bit is set.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 */
int generic_permission(struct inode *inode, int mask)
{
        int err = acl_permission_check(inode, mask);

        /* Anything but a plain denial is final. */
        if (err != -EACCES)
                return err;

        if (S_ISDIR(inode->i_mode)) {
                /* DACs are overridable for directories */
                if (!(mask & MAY_WRITE) &&
                    capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
                        return 0;
                return capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE) ?
                        0 : -EACCES;
        }

        /* Non-directories: a read-only request may use CAP_DAC_READ_SEARCH. */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ &&
            capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
                return 0;

        /*
         * Read/write denials can always be overridden; an exec denial only
         * when at least one exec bit is set on the inode.
         */
        if ((!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) &&
            capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
                return 0;

        return -EACCES;
}
EXPORT_SYMBOL(generic_permission);

/*
 * The common case should go straight to generic_permission() without
 * touching inode->i_op at all.  IOP_FASTPERM in inode->i_opflags caches
 * the fact that this inode has no special permission function.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
        /* Already known to have no ->permission(): take the fast path. */
        if (likely(inode->i_opflags & IOP_FASTPERM))
                return generic_permission(inode, mask);

        if (likely(inode->i_op->permission))
                return inode->i_op->permission(inode, mask);

        /* No method: record that, once, for the lifetime of the inode. */
        spin_lock(&inode->i_lock);
        inode->i_opflags |= IOP_FASTPERM;
        spin_unlock(&inode->i_lock);

        return generic_permission(inode, mask);
}

/**
 * sb_permission - filesystem-wide part of a permission check
 * @sb: Superblock the inode lives on
 * @inode: Inode being accessed
 * @mask: Requested access (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Kept apart from the inode-specific checks.  The one rule enforced here:
 * regular files, directories and symlinks on a read-only superblock cannot
 * be written.  Other inode types are not refused by this function.
 *
 * Returns -EROFS if that rule is violated, 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
        umode_t mode;

        /* Only write requests can be refused here. */
        if (likely(!(mask & MAY_WRITE)))
                return 0;

        if (!sb_rdonly(sb))
                return 0;

        /* Nobody gets write access to a read-only fs. */
        mode = inode->i_mode;
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
                return -EROFS;

        return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Runs, in order: the superblock check, the immutable / unmapped-id write
 * restrictions, the filesystem's (or the generic) permission check, the
 * device cgroup check and the security module hook.  The first failure is
 * returned; 0 means access is granted.  fs[ug]id is used for the identity
 * comparison rather than the "normal" UIDs.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
        int err;

        err = sb_permission(inode->i_sb, inode, mask);
        if (err)
                return err;

        if (unlikely(mask & MAY_WRITE)) {
                /* Immutable files are never writable. */
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                /*
                 * If the vfs does not know the real i_uid/i_gid, an mtime
                 * update would likely write them back incorrectly, so
                 * refuse the write.
                 */
                if (HAS_UNMAPPED_ID(inode))
                        return -EACCES;
        }

        err = do_inode_permission(inode, mask);
        if (!err)
                err = devcgroup_inode_permission(inode, mask);
        if (!err)
                err = security_inode_permission(inode, mask);

        return err;
}
EXPORT_SYMBOL(inode_permission);

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Takes one reference on the vfsmount (first) and one on the dentry
 * that @path points at.
 */
void path_get(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry = path->dentry;

        mntget(mnt);
        dget(dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Drops one reference on the dentry (first) and one on the vfsmount
 * that @path points at.
 */
void path_put(const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct vfsmount *mnt = path->mnt;

        dput(dentry);
        mntput(mnt);
}
EXPORT_SYMBOL(path_put);

/* Number of symlink stack entries embedded in struct nameidata itself. */
#define EMBEDDED_LEVELS 2
/*
 * Per-lookup path walk state.  The instance in use is published through
 * current->nameidata by set_nameidata() and unpublished again by
 * restore_nameidata(); nested lookups are chained through ->saved.
 */
struct nameidata {
        /* where the walk currently is */
        struct path        path;
        struct qstr        last;
        /* root used for the lookup; ->mnt is NULL while it is unset */
        struct path        root;
        struct inode        *inode; /* path.dentry.d_inode */
        /* LOOKUP_* flags */
        unsigned int        flags;
        /*
         * seq: ->d_seq sample that path.dentry is validated against;
         * m_seq: mount_lock sample that mounts are validated against.
         */
        unsigned        seq, m_seq, r_seq;
        int                last_type;
        /* number of entries of ->stack currently in use */
        unsigned        depth;
        /* shared with the enclosing lookup, see set/restore_nameidata() */
        int                total_link_count;
        /*
         * Symlink stack.  ->stack points at ->internal until
         * nd_alloc_stack() replaces it with a MAXSYMLINKS-sized array.
         */
        struct saved {
                struct path link;
                struct delayed_call done;
                const char *name;
                unsigned seq;
        } *stack, internal[EMBEDDED_LEVELS];
        struct filename        *name;
        /* nameidata that was current before this one was installed */
        struct nameidata *saved;
        /* ->d_seq sample that root.dentry is validated against */
        unsigned        root_seq;
        int                dfd;
        /* owner and mode that may_follow_link() treats as the parent directory's */
        kuid_t                dir_uid;
        umode_t                dir_mode;
} __randomize_layout;

/*
 * Prepare @p for a new lookup of @name relative to @dfd and make it the
 * task's current nameidata.  The previously current one (if any) is
 * remembered in p->saved and its symlink count is inherited.
 */
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
        struct nameidata *outer = current->nameidata;

        p->stack = p->internal;
        p->dfd = dfd;
        p->name = name;
        p->path.mnt = NULL;
        p->path.dentry = NULL;

        if (outer)
                p->total_link_count = outer->total_link_count;
        else
                p->total_link_count = 0;

        p->saved = outer;
        current->nameidata = p;
}

/*
 * Undo set_nameidata(): reinstate the enclosing nameidata, hand the
 * accumulated symlink count back to it and free the symlink stack if it
 * is no longer the embedded one.
 */
static void restore_nameidata(void)
{
        struct nameidata *cur = current->nameidata;
        struct nameidata *outer = cur->saved;

        current->nameidata = outer;
        if (outer)
                outer->total_link_count = cur->total_link_count;

        if (cur->stack != cur->internal)
                kfree(cur->stack);
}

/*
 * Replace the embedded symlink stack with an allocated array of
 * MAXSYMLINKS entries, carrying over the embedded entries.  The
 * allocation is atomic while in RCU mode.  Returns false if the
 * allocation fails, in which case nd->stack is left alone.
 */
static bool nd_alloc_stack(struct nameidata *nd)
{
        struct saved *stack;

        stack = kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
                              (nd->flags & LOOKUP_RCU) ? GFP_ATOMIC : GFP_KERNEL);
        if (unlikely(!stack))
                return false;

        memcpy(stack, nd->internal, sizeof(nd->internal));
        nd->stack = stack;
        return true;
}

/**
 * path_connected - Verify that a dentry is below mnt.mnt_root
 * @mnt: mount the dentry is expected to be reachable in
 * @dentry: dentry to check
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 *
 * Returns true if @mnt is rooted at its superblock's root, or if
 * @dentry is a subdirectory of @mnt's root.
 */
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
        /* Bind mounts can have disconnected paths */
        if (mnt->mnt_root != mnt->mnt_sb->s_root)
                return is_subdir(dentry, mnt->mnt_root);

        return true;
}

/*
 * Run and then clear the delayed call of every symlink stack entry in
 * use, from the most recently pushed one down to the first.
 */
static void drop_links(struct nameidata *nd)
{
        int remaining = nd->depth;

        while (remaining > 0) {
                struct saved *entry = &nd->stack[--remaining];

                do_delayed_call(&entry->done);
                clear_delayed_call(&entry->done);
        }
}

/*
 * Finish a path walk.  Pending symlink delayed calls are run first.
 * In RCU mode we then simply leave RCU; in ref-walk mode the references
 * on nd->path, on every stacked link and (if grabbed) on nd->root are
 * dropped.  Either way nd->depth and nd->path are reset.
 */
static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);

        if (nd->flags & LOOKUP_RCU) {
                nd->flags &= ~LOOKUP_RCU;
                rcu_read_unlock();
        } else {
                int i;

                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);

                if (nd->flags & LOOKUP_ROOT_GRABBED) {
                        path_put(&nd->root);
                        nd->flags &= ~LOOKUP_ROOT_GRABBED;
                }
        }

        nd->depth = 0;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
}

/*
 * Try to take real references on an RCU-sampled @path.  @mseq is passed
 * to __legitimize_mnt() for the mount, @seq is the ->d_seq sample that
 * path->dentry must still match.  Returns true on success.
 *
 * On failure parts of @path may have been set to NULL (presumably the
 * parts on which no reference ended up being held - confirm against
 * __legitimize_mnt()).
 *
 * path_put is needed afterwards regardless of success or failure
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
        int res = __legitimize_mnt(path->mnt, mseq);
        if (unlikely(res)) {
                /* mount failed: the dentry is never touched in this case */
                if (res > 0)
                        path->mnt = NULL;
                path->dentry = NULL;
                return false;
        }
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                /* dentry is dead; keep path->mnt for the caller's path_put */
                path->dentry = NULL;
                return false;
        }
        /* references taken; the result is only good if d_seq is unchanged */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Legitimize @path using the mount sequence number recorded in @nd.
 * Same return value and same "path_put afterwards" rule as
 * __legitimize_path().
 */
static inline bool legitimize_path(struct nameidata *nd,
                                   struct path *path, unsigned seq)
{
        unsigned mseq = nd->m_seq;

        return __legitimize_path(path, seq, mseq);
}

/*
 * Legitimize every link saved on the symlink stack, in stack order.
 * Returns true if all of them could be legitimized.
 *
 * On failure the delayed calls of all stacked links have been run and
 * nd->depth is trimmed to the entries that were attempted, which are
 * the ones terminate_walk() will path_put().
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        if (unlikely(nd->flags & LOOKUP_CACHED)) {
                /* LOOKUP_CACHED: fail without attempting any entry */
                drop_links(nd);
                nd->depth = 0;
                return false;
        }
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /* entries 0..i were attempted, later ones were not */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}

/*
 * Legitimize nd->root when leaving RCU mode.  Returns false if the
 * lookup has to be restarted, true otherwise.
 */
static bool legitimize_root(struct nameidata *nd)
{
        if (!nd->root.mnt) {
                /*
                 * For scoped-lookups (where nd->root has been zeroed), we
                 * need to restart the whole lookup from scratch -- because
                 * set_root() is wrong for these lookups (nd->dfd is the
                 * root, not the filesystem root).
                 *
                 * Otherwise there is nothing to do for a zero nd->root.
                 */
                return !(nd->flags & LOOKUP_IS_SCOPED);
        }

        /* Nothing to do if nd->root is managed by the VFS user. */
        if (nd->flags & LOOKUP_ROOT)
                return true;

        nd->flags |= LOOKUP_ROOT_GRABBED;
        return legitimize_path(nd, &nd->root, nd->root_seq);
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the stacked links, the current
 * nd->path and nd->root for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* from here on terminate_walk() will take its ref-walk branch */
        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out1;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out;
        if (unlikely(!legitimize_root(nd)))
                goto out;
        rcu_read_unlock();
        BUG_ON(nd->inode != parent->d_inode);
        return true;

out1:
        /* legitimizing nd->path was never attempted: clear it */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        rcu_read_unlock();
        return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* from here on terminate_walk() will take its ref-walk branch */
        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
                goto out2;
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        rcu_read_unlock();
        return true;

        /* out2: clear both nd->path members */
out2:
        nd->path.mnt = NULL;
        /* out1: clear only nd->path.dentry */
out1:
        nd->path.dentry = NULL;
        /* out: nd->path is kept; no reference on @dentry was obtained */
out:
        rcu_read_unlock();
        return false;
        /* out_dput: the reference taken on @dentry above must be dropped */
out_dput:
        rcu_read_unlock();
        dput(dentry);
        return false;
}

/*
 * Call the dentry's ->d_revalidate() if it has one (as flagged by
 * DCACHE_OP_REVALIDATE) and return its result; otherwise return 1.
 */
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
                return 1;

        return dentry->d_op->d_revalidate(dentry, flags);
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer to nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 *
 * Errors produced here: -ECHILD if leaving RCU mode failed, -EXDEV if a
 * scoped lookup ended outside nd->root, -ESTALE if ->d_weak_revalidate()
 * returned 0, or whatever negative value ->d_weak_revalidate() returned.
 */
static int complete_walk(struct nameidata *nd)
{
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                /*
                 * We don't want to zero nd->root for scoped-lookups or
                 * externally-managed nd->root.
                 */
                if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
                        nd->root.mnt = NULL;
                /* cleared first, so legitimize_links() will not refuse */
                nd->flags &= ~LOOKUP_CACHED;
                if (!try_to_unlazy(nd))
                        return -ECHILD;
        }

        if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
                /*
                 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
                 * ever step outside the root during lookup" and should already
                 * be guaranteed by the rest of namei, we want to avoid a namei
                 * BUG resulting in userspace being given a path that was not
                 * scoped within the root at some point during the lookup.
                 *
                 * So, do a final sanity-check to make sure that in the
                 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
                 * we won't silently return an fd completely outside of the
                 * requested root to userspace.
                 *
                 * Userspace could move the path outside the root after this
                 * check, but as discussed elsewhere this is not a concern (the
                 * resolved file was inside the root at some point).
                 */
                if (!path_is_under(&nd->path, &nd->root))
                        return -EXDEV;
        }

        /* only walks that jumped need the weak revalidation below */
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
                return 0;

        /* ... and only if the dentry is flagged as having the method */
        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* a zero result is reported to the caller as -ESTALE */
        if (!status)
                status = -ESTALE;

        return status;
}

/*
 * Set nd->root from current->fs->root.
 *
 * In RCU mode the root and the ->d_seq of its dentry are sampled inside
 * an fs->seq retry loop and no LOOKUP_ROOT_GRABBED is set; otherwise
 * get_fs_root() is used and LOOKUP_ROOT_GRABBED is set so that
 * terminate_walk() drops nd->root again.
 *
 * Returns 0 on success, or -ENOTRECOVERABLE (after a WARN) when called
 * for a scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        /*
         * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
         * still have to ensure it doesn't happen because it will cause a breakout
         * from the dirfd.
         */
        if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
                return -ENOTRECOVERABLE;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /* retry until fs->seq is unchanged across both reads */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                get_fs_root(fs, &nd->root);
                nd->flags |= LOOKUP_ROOT_GRABBED;
        }
        return 0;
}

/*
 * Move the walk to nd->root, setting nd->root up first if it is unset,
 * and mark the walk as LOOKUP_JUMPED.
 *
 * Returns 0 on success; -EXDEV if LOOKUP_BENEATH is set, or if
 * LOOKUP_NO_XDEV is set and the current mount differs from the root's;
 * the error from set_root(); or -ECHILD if, in RCU mode, the root
 * dentry changed since it was sampled.
 */
static int nd_jump_root(struct nameidata *nd)
{
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                /* Absolute path arguments to path_init() are allowed. */
                if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
                        return -EXDEV;
        }
        if (!nd->root.mnt) {
                int error = set_root(nd);
                if (error)
                        return error;
        }
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                /* no references are taken here; validate against root_seq */
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
                        return -ECHILD;
        } else {
                /* swap the reference on the old position for one on the root */
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        nd->flags |= LOOKUP_JUMPED;
        return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 */
int nd_jump_link(struct path *path)
{
        int error = -ELOOP;
        struct nameidata *nd = current->nameidata;

        if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
                goto err;

        error = -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                if (nd->path.mnt != path->mnt)
                        goto err;
        }
        /* Not currently safe for scoped-lookups. */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
                goto err;

        path_put(&nd->path);
        nd->path = *path;
        nd->inode = nd->path.dentry->d_inode;
        nd->flags |= LOOKUP_JUMPED;
        return 0;

err:
        path_put(path);
        return error;
}

/*
 * Pop the top entry off the symlink stack: run its delayed call and,
 * unless we are in RCU mode, drop the reference held on the link.
 */
static inline void put_link(struct nameidata *nd)
{
        struct saved *top;

        nd->depth--;
        top = &nd->stack[nd->depth];

        do_delayed_call(&top->done);
        if (nd->flags & LOOKUP_RCU)
                return;

        path_put(&top->link);
}

/*
 * Link and creation protection knobs.  A zero value switches the
 * corresponding check off: may_follow_link() for symlinks, the
 * "safe source" test of may_linkat() for hardlinks, and
 * may_create_in_sticky() for fifos and regular files (where a value
 * of 2 or more also covers group-writable sticky directories).
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: inode of the symlink about to be followed
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error:
 * -ECHILD when the denial is reached in RCU mode (nothing is audited
 * in that case), -EACCES otherwise.
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
        if (!sysctl_protected_symlinks)
                return 0;

        /*
         * Allowed if any of these holds, tested in this order:
         *  - owner and follower match;
         *  - parent directory is not both sticky and world-writable;
         *  - parent directory and link owner match.
         */
        if (uid_eq(current_cred()->fsuid, inode->i_uid) ||
            (nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH) ||
            (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)))
                return 0;

        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;

        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
        audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
        return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
        umode_t mode = inode->i_mode;
        bool pinnable;

        /*
         * Special files, setuid files and executable setgid files should
         * not get pinned to the filesystem.
         */
        pinnable = S_ISREG(mode) &&
                   !(mode & S_ISUID) &&
                   (mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP);
        if (!pinnable)
                return false;

        /* Hardlinking to unreadable or unwritable sources is dangerous. */
        return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * Independently of the sysctl, a source whose uid or gid is invalid is
 * refused with -EOVERFLOW.
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct path *link)
{
        struct inode *inode = link->dentry->d_inode;

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!(uid_valid(inode->i_uid) && gid_valid(inode->i_gid)))
                return -EOVERFLOW;

        /*
         * Source inode owner (or CAP_FOWNER) can hardlink all they like,
         * otherwise, it must be a safe source.
         */
        if (sysctl_protected_hardlinks &&
            !safe_hardlink_source(inode) &&
            !inode_owner_or_capable(inode)) {
                audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
                return -EPERM;
        }

        return 0;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *                          should be allowed, or not, on files that already
 *                          exist.
 * @dir_mode: mode bits of directory
 * @dir_uid: owner of directory
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
                                struct inode * const inode)
{
        umode_t mode = inode->i_mode;

        /* Protection switched off for this file type. */
        if (!sysctl_protected_fifos && S_ISFIFO(mode))
                return 0;
        if (!sysctl_protected_regular && S_ISREG(mode))
                return 0;

        /* Not a sticky directory. */
        if (likely(!(dir_mode & S_ISVTX)))
                return 0;

        /* File owned by the directory owner or by the caller. */
        if (uid_eq(inode->i_uid, dir_uid))
                return 0;
        if (uid_eq(current_fsuid(), inode->i_uid))
                return 0;

        /*
         * Not world writable: only denied if the directory is group
         * writable and the sysctl for this file type is at least 2.
         */
        if (unlikely(!(dir_mode & 0002))) {
                if (!(dir_mode & 0020))
                        return 0;
                if (!((sysctl_protected_fifos >= 2 && S_ISFIFO(mode)) ||
                      (sysctl_protected_regular >= 2 && S_ISREG(mode))))
                        return 0;
        }

        audit_log_path_denied(AUDIT_ANOM_CREAT,
                              S_ISFIFO(mode) ? "sticky_create_fifo" :
                                               "sticky_create_regular");
        return -EACCES;
}

/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * The references held through @path are exchanged: new ones are taken
 * on the parent mount and the mountpoint dentry, and the old ones are
 * dropped.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
        struct mount *mnt = real_mount(path->mnt);
        struct mount *parent;
        struct dentry *mountpoint;

        read_seqlock_excl(&mount_lock);
        parent = mnt->mnt_parent;
        /* a mount that is its own parent has nothing above it */
        if (parent == mnt) {
                read_sequnlock_excl(&mount_lock);
                return 0;
        }
        /* take the new references while mount_lock is still held */
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
        read_sequnlock_excl(&mount_lock);
        /* the old references are dropped only after unlocking */
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
        path->mnt = &parent->mnt;
        return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Walk up the chain of parent mounts starting at @m, looking for the
 * first mountpoint that is not the root of the mount it sits in.
 *
 * On success @path is set to that mountpoint in its parent mount,
 * *@seqp receives a ->d_seq sample of the mountpoint dentry and true is
 * returned; no references are taken.  Returns false if @root is reached
 * first or the chain of parents runs out.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
                                  struct path *path, unsigned *seqp)
{
        while (mnt_has_parent(m)) {
                struct dentry *mp = m->mnt_mountpoint;
                struct mount *parent = m->mnt_parent;

                m = parent;

                /* Never step above the given root. */
                if (unlikely(root->dentry == mp &&
                             root->mnt == &parent->mnt))
                        return false;

                /* Mounted on the parent's root: keep climbing. */
                if (mp == parent->mnt.mnt_root)
                        continue;

                path->mnt = &parent->mnt;
                path->dentry = mp;
                *seqp = read_seqcount_begin(&mp->d_seq);
                return true;
        }
        return false;
}

/*
 * Reference-taking wrapper around choose_mountpoint_rcu().
 *
 * Each attempt runs under rcu_read_lock() against a fresh mount_lock
 * sample.  A "not found" result is accepted only if mount_lock did not
 * change in the meantime; a "found" result is accepted only if @path
 * could be legitimized.  Otherwise the attempt is repeated.
 *
 * Returns true with @path legitimized, or false.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
                              struct path *path)
{
        bool found;

        rcu_read_lock();
        while (1) {
                unsigned seq, mseq = read_seqbegin(&mount_lock);

                found = choose_mountpoint_rcu(m, root, path, &seq);
                if (unlikely(!found)) {
                        if (!read_seqretry(&mount_lock, mseq))
                                break;
                } else {
                        if (likely(__legitimize_path(path, seq, mseq)))
                                break;
                        /*
                         * Failed legitimization still needs path_put();
                         * it is called with the RCU read lock dropped.
                         */
                        rcu_read_unlock();
                        path_put(path);
                        rcu_read_lock();
                }
        }
        rcu_read_unlock();
        return found;
}

/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 * - return -EXDEV if mountpoint crossing is disabled (LOOKUP_NO_XDEV).
 * - return -ELOOP once *@count (if @count is given) has reached MAXSYMLINKS;
 *   the counter is incremented on every call that gets that far.
 * - otherwise return the result of finish_automount() on what the
 *   dentry's ->d_automount() produced.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
        const unsigned mount_intent = LOOKUP_PARENT | LOOKUP_DIRECTORY |
                                      LOOKUP_OPEN | LOOKUP_CREATE |
                                      LOOKUP_AUTOMOUNT;
        struct dentry *dentry = path->dentry;

        /* We don't want to mount if someone's just doing a stat -
         * unless they're stat'ing a directory and appended a '/' to
         * the name.
         *
         * We do, however, want to mount if someone wants to open or
         * create a file of any type under the mountpoint, wants to
         * traverse through the mountpoint or wants to open the
         * mounted directory.  Also, autofs may mark negative dentries
         * as being automount points.  These will need the attentions
         * of the daemon to instantiate them before they can be used.
         */
        if (!(lookup_flags & mount_intent) && dentry->d_inode)
                return -EISDIR;

        /* No need to trigger automounts if mountpoint crossing is disabled. */
        if (lookup_flags & LOOKUP_NO_XDEV)
                return -EXDEV;

        if (count) {
                int seen = (*count)++;

                if (seen >= MAXSYMLINKS)
                        return -ELOOP;
        }

        return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * @path is updated in place as mounts are crossed.  @flags is the
 * caller's sample of path->dentry->d_flags.  *@jumped is set to whether
 * at least one mount was crossed.  @count and @lookup_flags are handed
 * to follow_automount().
 *
 * Returns 0 on success, -ENOENT if the walk otherwise succeeded but the
 * final flags say the dentry is negative, -EXDEV if a transit-managed
 * dentry is met under LOOKUP_NO_XDEV, or the negative value returned by
 * ->d_manage() or follow_automount() (-EISDIR is turned into 0).
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
                             int *count, unsigned lookup_flags)
{
        struct vfsmount *mnt = path->mnt;
        bool need_mntput = false;
        int ret = 0;

        while (flags & DCACHE_MANAGED_DENTRY) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (flags & DCACHE_MANAGE_TRANSIT) {
                        if (lookup_flags & LOOKUP_NO_XDEV) {
                                ret = -EXDEV;
                                break;
                        }
                        ret = path->dentry->d_op->d_manage(path, false);
                        flags = smp_load_acquire(&path->dentry->d_flags);
                        if (ret < 0)
                                break;
                }

                if (flags & DCACHE_MOUNTED) {        // something's mounted on it..
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {                // ... in our namespace
                                dput(path->dentry);
                                // the caller's original mount is not dropped here
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                // here we know it's positive
                                flags = path->dentry->d_flags;
                                need_mntput = true;
                                continue;
                        }
                }

                if (!(flags & DCACHE_NEED_AUTOMOUNT))
                        break;

                // uncovered automount point
                ret = follow_automount(path, count, lookup_flags);
                flags = smp_load_acquire(&path->dentry->d_flags);
                if (ret < 0)
                        break;
        }

        // -EISDIR only means "stop here"; it is not reported as an error
        if (ret == -EISDIR)
                ret = 0;
        // possible if you race with several mount --move
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        if (!ret && unlikely(d_flags_negative(flags)))
                ret = -ENOENT;
        *jumped = need_mntput;
        return ret;
}

/*
 * Mount traversal - inline fast path.  A dentry that is not managed is
 * handled right here: *@jumped is cleared and the result is -ENOENT for
 * a negative dentry, 0 otherwise.  Everything else is left to
 * __traverse_mounts().
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
                                  int *count, unsigned lookup_flags)
{
        unsigned flags = smp_load_acquire(&path->dentry->d_flags);

        if (unlikely(flags & DCACHE_MANAGED_DENTRY))
                return __traverse_mounts(path, flags, jumped, count,
                                         lookup_flags);

        /* fastpath */
        *jumped = false;
        if (unlikely(d_flags_negative(flags)))
                return -ENOENT;
        return 0;
}

/*
 * Step down into whatever lookup_mnt() finds for @path, if anything.
 * On a hit the references held through @path are dropped, @path is
 * switched to the root of the found mount and 1 is returned.  Returns 0
 * and leaves @path alone otherwise.
 */
int follow_down_one(struct path *path)
{
        struct vfsmount *mounted = lookup_mnt(path);

        if (!mounted)
                return 0;

        dput(path->dentry);
        mntput(path->mnt);
        path->mnt = mounted;
        path->dentry = dget(mounted->mnt_root);
        return 1;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * If @path ends up on a different mount, the reference on the mount we
 * started from is dropped.  Returns the result of traverse_mounts().
 */
int follow_down(struct path *path)
{
        struct vfsmount *start_mnt = path->mnt;
        bool jumped;
        int ret;

        ret = traverse_mounts(path, &jumped, NULL, 0);
        if (start_mnt != path->mnt)
                mntput(start_mnt);

        return ret;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * @path, *@inode and *@seqp are updated as mounts are crossed, and
 * LOOKUP_JUMPED is set in nd->flags on each crossing.  A false return
 * means the caller has to fall back; @path may have been modified by
 * then.  Managed dentries always fail under LOOKUP_NO_XDEV.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                               struct inode **inode, unsigned *seqp)
{
        struct dentry *dentry = path->dentry;
        unsigned int flags = dentry->d_flags;

        if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
                return true;

        if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                return false;

        for (;;) {
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
                        int res = dentry->d_op->d_manage(path, true);
                        /* -EISDIR counts as success here, other errors fail */
                        if (res)
                                return res == -EISDIR;
                        flags = dentry->d_flags;
                }

                if (flags & DCACHE_MOUNTED) {
                        struct mount *mounted = __lookup_mnt(path->mnt, dentry);
                        if (mounted) {
                                path->mnt = &mounted->mnt;
                                dentry = path->dentry = mounted->mnt.mnt_root;
                                nd->flags |= LOOKUP_JUMPED;
                                *seqp = read_seqcount_begin(&dentry->d_seq);
                                *inode = dentry->d_inode;
                                /*
                                 * We don't need to re-check ->d_seq after this
                                 * ->d_inode read - there will be an RCU delay
                                 * between mount hash removal and ->mnt_root
                                 * becoming unpinned.
                                 */
                                flags = dentry->d_flags;
                                /* fail if mount_lock changed since nd->m_seq */
                                if (read_seqretry(&mount_lock, nd->m_seq))
                                        return false;
                                continue;
                        }
                        /* no mount found: only trust that if mount_lock is unchanged */
                        if (read_seqretry(&mount_lock, nd->m_seq))
                                return false;
                }
                /* an automount point cannot be handled from here */
                return !(flags & DCACHE_NEED_AUTOMOUNT);
        }
}

/*
 * Follow whatever is mounted on (or otherwise managing) @dentry, starting
 * from nd->path.mnt, and leave the resulting location in @path.
 *
 * In RCU mode a NULL *@inode fails with -ENOENT.  Otherwise
 * __follow_mount_rcu() is tried first; if it declines, the walk leaves
 * RCU mode through try_to_unlazy_next() (-ECHILD if that fails) and
 * falls back to traverse_mounts().
 *
 * Returns 0 on success or a negative errno.  After a successful
 * traverse_mounts() *@inode is refreshed from @path and *@seqp is zeroed.
 * When the traverse_mounts() path ends in an error, the dentry in @path is
 * dput() and its mount is mntput() if it differs from nd->path.mnt.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
                          struct path *path, struct inode **inode,
                          unsigned int *seqp)
{
        bool jumped;
        int ret;

        /* start from the current mount and the dentry we were handed */
        path->mnt = nd->path.mnt;
        path->dentry = dentry;
        if (nd->flags & LOOKUP_RCU) {
                /* keep the incoming value; *seqp is handed to __follow_mount_rcu() */
                unsigned int seq = *seqp;
                if (unlikely(!*inode))
                        return -ENOENT;
                if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
                        return 0;
                if (!try_to_unlazy_next(nd, dentry, seq))
                        return -ECHILD;
                // *path might've been clobbered by __follow_mount_rcu()
                path->mnt = nd->path.mnt;
                path->dentry = dentry;
        }
        ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
        if (jumped) {
                /* under LOOKUP_NO_XDEV a jump becomes -EXDEV, replacing ret */
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        ret = -EXDEV;
                else
                        nd->flags |= LOOKUP_JUMPED;
        }
        if (unlikely(ret)) {
                /* failure: release what @path holds (mount only if it changed) */
                dput(path->dentry);
                if (path->mnt != nd->path.mnt)
                        mntput(path->mnt);
        } else {
                *inode = d_backing_inode(path->dentry);
                *seqp = 0; /* out of RCU mode, so the value doesn't matter */
        }
        return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry = d_lookup(dir, name);
        if (dentry) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error)
                                d_invalidate(dentry);
                        dput(dentry);
                        return ERR_PTR(error);
                }
        }
        return dentry;
}

/*
 * Parent directory has inode locked exclusive.  This is the one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as a matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
 */
static struct dentry *__lookup_hash(const struct qstr *name,
                struct dentry *base, unsigned int flags)
{
        struct dentry *dentry, *replacement;
        struct inode *dir;

        /* a dcache answer (hit or error pointer) is returned as-is */
        dentry = lookup_dcache(name, base, flags);
        if (dentry)
                return dentry;

        /* no new children for a directory that is already dead */
        dir = base->d_inode;
        if (unlikely(IS_DEADDIR(dir)))
                return ERR_PTR(-ENOENT);

        dentry = d_alloc(base, name);
        if (unlikely(!dentry))
                return ERR_PTR(-ENOMEM);

        /* ->lookup() may hand back a different dentry (or error) to use */
        replacement = dir->i_op->lookup(dir, dentry, flags);
        if (likely(!replacement))
                return dentry;

        dput(dentry);
        return replacement;
}

/*
 * Look nd->last up in the dcache under nd->path.dentry; ->lookup() is
 * never called from here.
 *
 * Returns:
 *   - the dentry, when it was found and d_revalidate() returned > 0;
 *   - NULL, when the name is not in the dcache (in RCU mode only after
 *     try_to_unlazy() succeeded);
 *   - ERR_PTR(-ECHILD), when an RCU-mode seqcount check or an attempt to
 *     leave RCU mode fails;
 *   - ERR_PTR(status) for a non-positive d_revalidate() status; a zero
 *     status also invalidates the dentry before the reference is dropped.
 *
 * *@inode and *@seqp are written only on the RCU path.
 */
static struct dentry *lookup_fast(struct nameidata *nd,
                                  struct inode **inode,
                                  unsigned *seqp)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                if (unlikely(!dentry)) {
                        /* miss: leave RCU mode before reporting "not cached" */
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the inode matches
                 * the dentry name information from lookup.
                 */
                *inode = d_backing_inode(dentry);
                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                        return ERR_PTR(-ECHILD);

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 *
                 * The memory barrier in read_seqcount_begin of child is
                 *  enough, we can use __read_seqcount_retry here.
                 */
                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
                        return ERR_PTR(-ECHILD);

                *seqp = seq;
                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /* revalidation did not pass in RCU mode: switch to ref-walk */
                if (!try_to_unlazy_next(nd, dentry, seq))
                        return ERR_PTR(-ECHILD);
                if (unlikely(status == -ECHILD))
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(dentry, nd->flags);
        }
        /* shared tail for every non-RCU revalidation result */
        if (unlikely(status <= 0)) {
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                return ERR_PTR(status);
        }
        return dentry;
}

/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry, *old;
        struct inode *inode = dir->d_inode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Don't go there if it's already dead */
        if (unlikely(IS_DEADDIR(inode)))
                return ERR_PTR(-ENOENT);
again:
        dentry = d_alloc_parallel(dir, name, &wq);
        if (IS_ERR(dentry))
                return dentry;
        if (unlikely(!d_in_lookup(dentry))) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error) {
                                d_invalidate(dentry);
                                dput(dentry);
                                goto again;
                        }
                        dput(dentry);
                        dentry = ERR_PTR(error);
                }
        } else {
                old = inode->i_op->lookup(inode, dentry, flags);
                d_lookup_done(dentry);
                if (unlikely(old)) {
                        dput(dentry);
                        dentry = old;
                }
        }
        return dentry;
}

static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
{
        struct inode *dir_inode = dir->d_inode;
        struct dentry *found;

        /* __lookup_slow() is run with the directory inode locked shared */
        inode_lock_shared(dir_inode);
        found = __lookup_slow(name, dir, flags);
        inode_unlock_shared(dir_inode);

        return found;
}

static inline int may_lookup(struct nameidata *nd)
{
        if (nd->flags & LOOKUP_RCU) {
                int rcu_err = inode_permission(nd->inode,
                                               MAY_EXEC | MAY_NOT_BLOCK);

                /* anything but -ECHILD is a final answer */
                if (rcu_err != -ECHILD)
                        return rcu_err;
                /* -ECHILD: retry below, but only if we can leave RCU mode */
                if (!try_to_unlazy(nd))
                        return rcu_err;
        }
        return inode_permission(nd->inode, MAY_EXEC);
}

static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
        bool grabbed_link, unlazied;

        /* every call counts against the per-walk symlink budget */
        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
                return -ELOOP;

        /* nothing to do unless depth == EMBEDDED_LEVELS on the internal stack */
        if (likely(nd->depth != EMBEDDED_LEVELS))
                return 0;
        if (likely(nd->stack != nd->internal))
                return 0;

        if (likely(nd_alloc_stack(nd)))
                return 0;

        /* the allocation failed; outside RCU mode that is final */
        if (!(nd->flags & LOOKUP_RCU))
                return -ENOMEM;

        // we need to grab link before we do unlazy.  And we can't skip
        // unlazy even if we fail to grab the link - cleanup needs it
        grabbed_link = legitimize_path(nd, link, seq);
        unlazied = try_to_unlazy(nd);
        if (!unlazied || !grabbed_link)
                return -ECHILD;

        /* out of RCU mode now: try the allocation once more */
        return nd_alloc_stack(nd) ? 0 : -ENOMEM;
}

/*
 * Flag bits for the 'flags' argument of walk_component(), step_into()
 * and pick_link():
 *
 *   WALK_TRAILING - passed by lookup_last(); step_into() then follows a
 *                   symlink only if LOOKUP_FOLLOW is set, and pick_link()
 *                   calls may_follow_link().
 *   WALK_MORE     - passed by link_path_walk() for a non-final component;
 *                   when it is clear and nd->depth is non-zero,
 *                   walk_component() calls put_link().
 *   WALK_NOFOLLOW - step_into() never follows a symlink; used by
 *                   handle_dots() and handle_lookup_down().
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push the symlink @link onto nd's link stack and fetch its body.
 *
 * Returns:
 *   - the symlink body still to be walked (for a body starting with '/',
 *     after nd_jump_root() and with the leading slashes skipped);
 *   - NULL when nothing is left to walk (->get_link() returned NULL, or
 *     the remaining body is empty); the stack entry is then popped with
 *     put_link();
 *   - an ERR_PTR() on failure.
 *
 * If reserve_stack() fails while not in RCU mode, @link is released with
 * path_put() here.  On the other error returns the entry that was pushed
 * stays on the stack.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, unsigned seq, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link, seq);

        if (unlikely(error)) {
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* claim the next stack slot and record the link in it */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* symlink following disabled by lookup flag or by mount flag */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* an atime update is wanted: leave RCU mode before doing it */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* use inode->i_link when it is set, otherwise ask ->get_link() */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /*
                         * RCU mode: call with a NULL dentry first; on
                         * -ECHILD retry with the dentry, provided we
                         * managed to leave RCU mode.
                         */
                        res = get(NULL, inode, &last->done);
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* absolute target: restart from root, skip the slashes */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry, struct inode *inode, unsigned seq)
{
        struct path path;
        int err = handle_mounts(nd, dentry, &path, &inode, &seq);

        if (err < 0)
                return ERR_PTR(err);
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (!(nd->flags & LOOKUP_RCU)) {
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, seq))
                        return ERR_PTR(-ECHILD);
        } else {
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, seq, flags);
}

/*
 * RCU-mode handling of "..": find the parent of nd->path.  No dget() or
 * mntget() is done in this function.
 *
 * Returns the parent dentry with *@inodep and *@seqp filled in for it;
 * NULL when nd->path equals nd->root, or choose_mountpoint_rcu() returned
 * false, and no LOOKUP_BENEATH restriction applies; or ERR_PTR(-ECHILD).
 * Every failure in this function - seqcount/mount_lock retry,
 * LOOKUP_NO_XDEV, LOOKUP_BENEATH, !path_connected() - is reported as
 * -ECHILD.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
                                        struct inode **inodep,
                                        unsigned *seqp)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of this mount: step to the mountpoint first */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        *inodep = parent->d_inode;
        *seqp = read_seqcount_begin(&parent->d_seq);
        /* recheck the child's seqcount after having sampled the parent */
        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        return NULL;
}

static struct dentry *follow_dotdot(struct nameidata *nd,
                                 struct inode **inodep,
                                 unsigned *seqp)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        *seqp = 0;
        *inodep = parent->d_inode;
        return parent;

in_root:
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        dget(nd->path.dentry);
        return NULL;
}

static const char *handle_dots(struct nameidata *nd, int type)
{
        if (type == LAST_DOTDOT) {
                const char *error = NULL;
                struct dentry *parent;
                struct inode *inode;
                unsigned seq;

                if (!nd->root.mnt) {
                        error = ERR_PTR(set_root(nd));
                        if (error)
                                return error;
                }
                if (nd->flags & LOOKUP_RCU)
                        parent = follow_dotdot_rcu(nd, &inode, &seq);
                else
                        parent = follow_dotdot(nd, &inode, &seq);
                if (IS_ERR(parent))
                        return ERR_CAST(parent);
                if (unlikely(!parent))
                        error = step_into(nd, WALK_NOFOLLOW,
                                         nd->path.dentry, nd->inode, nd->seq);
                else
                        error = step_into(nd, WALK_NOFOLLOW,
                                         parent, inode, seq);
                if (unlikely(error))
                        return error;

                if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
                        /*
                         * If there was a racing rename or mount along our
                         * path, then we can't be sure that ".." hasn't jumped
                         * above nd->root (and so userspace should retry or use
                         * some fallback).
                         */
                        smp_rmb();
                        if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
                                return ERR_PTR(-EAGAIN);
                        if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
                                return ERR_PTR(-EAGAIN);
                }
        }
        return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        struct inode *inode;
        unsigned seq;

        if (likely(nd->last_type == LAST_NORM)) {
                /* ordinary name: dcache first, then the slow path on a miss */
                dentry = lookup_fast(nd, &inode, &seq);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (unlikely(!dentry)) {
                        dentry = lookup_slow(&nd->last, nd->path.dentry,
                                             nd->flags);
                        if (IS_ERR(dentry))
                                return ERR_CAST(dentry);
                }
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return step_into(nd, flags, dentry, inode, seq);
        }

        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        return handle_dots(nd, nd->last_type);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 */
/*
 * Fold the word (a) into the two-word state (x, y).  Expands to one
 * parenthesized comma expression that assigns to both x and y; each of
 * them is named several times in the expansion, so pass plain lvalues
 * without side effects.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol64(x,12),\
        x += y,        y = rol64(y,45),\
        y *= 9                        )

/*
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        unsigned long folded = y ^ (x * GOLDEN_RATIO_64);

        folded *= GOLDEN_RATIO_64;
        /* the result is the product shifted right by 32 bits */
        return folded >> 32;
}

#else        /* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
/*
 * Fold the word (a) into the two-word state (x, y), using 32-bit rotates.
 * Expands to one parenthesized comma expression that assigns to both x
 * and y; each of them is named several times in the expansion, so pass
 * plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol32(x, 7),\
        x += y,        y = rol32(y,20),\
        y *= 9                        )

static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned long mixed = y ^ __hash_32(x);

        return __hash_32(mixed);
}

#endif

/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long x = 0, y = (unsigned long)salt;
        unsigned long word;

        /* mix in every complete word of the name */
        while (len >= sizeof(unsigned long)) {
                word = load_unaligned_zeropad(name);
                HASH_MIX(x, y, word);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }

        /* a partial last word is masked to its 'len' bytes and xor-ed into x */
        if (len) {
                word = load_unaligned_zeropad(name);
                x ^= word & bytemask_from_count(len);
        }

        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long a = 0, x = 0, y = (unsigned long)salt;
        unsigned long adata, mask, len = 0;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        /* mix whole words until one of them contains a zero byte */
        for (;;) {
                a = load_unaligned_zeropad(name + len);
                if (has_zero(a, &adata, &constants))
                        break;
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
        }

        /* xor in only the bytes of the last word that precede the zero */
        adata = prep_zero_mask(a, adata, &constants);
        mask = create_zero_mask(adata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component, and
 * return the "hash_len" as the result.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
        unsigned long adata, bdata, mask, len = 0;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        /* mix whole words until one holds a NUL or a '/' byte */
        for (;;) {
                a = load_unaligned_zeropad(name + len);
                b = a ^ REPEAT_BYTE('/');
                /* bitwise '|' on purpose: both adata and bdata get filled in */
                if (has_zero(a, &adata, &constants) |
                    has_zero(b, &bdata, &constants))
                        break;
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
        }

        /* xor in only the bytes of the last word that precede the delimiter */
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        mask = create_zero_mask(adata | bdata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}

#else        /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */

/* Return the hash of a string of known length */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long hash = init_name_hash(salt);
        unsigned int i;

        /* feed the 'len' bytes of the name through the hash, one at a time */
        for (i = 0; i < len; i++)
                hash = partial_name_hash((unsigned char)name[i], hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len;

        /* hash every byte up to, but not including, the terminating NUL */
        for (len = 0; name[len]; len++)
                hash = partial_name_hash((unsigned char)name[len], hash);

        return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);

/*
 * We know there's a real path component here of at least
 * one character.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len = 0;
        unsigned char ch = *name;

        /* the first byte is always hashed; stop at the next NUL or '/' */
        for (;;) {
                hash = partial_name_hash(ch, hash);
                ch = name[++len];
                if (!ch || ch == '/')
                        break;
        }
        return hashlen_create(end_name_hash(hash), len);
}

#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        /*
         * Walk @name one component at a time.  Every component except the
         * final one of the outermost name is resolved through
         * walk_component(); the final one is only recorded in nd->last /
         * nd->last_type before returning 0.  Names of symlinks that are
         * still being walked are kept in nd->stack[].name, with 'depth'
         * counting how many are saved.
         */
        int depth = 0; // depth <= nd->depth
        int err;

        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        /* an error pointer passed in as the name is returned as its errno */
        if (IS_ERR(name))
                return PTR_ERR(name);
        /* skip leading slashes; an empty remainder leaves nothing to walk */
        while (*name=='/')
                name++;
        if (!*name)
                return 0;

        /* At this point we know we have a real path component. */
        for(;;) {
                const char *link;
                u64 hash_len;
                int type;

                err = may_lookup(nd);
                if (err)
                        return err;

                hash_len = hash_name(nd->path.dentry, name);

                /* classify the component: ".", ".." or a normal name */
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->flags |= LOOKUP_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->flags &= ~LOOKUP_JUMPED;
                        /* parent has DCACHE_OP_HASH: let its ->d_hash() redo the hash */
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                struct qstr this = { { .hash_len = hash_len }, .name = name };
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        return err;
                                hash_len = this.hash_len;
                                name = this.name;
                        }
                }

                nd->last.hash_len = hash_len;
                nd->last.name = name;
                nd->last_type = type;

                /* advance past the component just recorded */
                name += hashlen_len(hash_len);
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                nd->dir_uid = nd->inode->i_uid;
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /* more to walk, so the current location must allow lookups */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        if (nd->flags & LOOKUP_RCU) {
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}

/*
 * Set up @nd for a walk of nd->name->name: initialise nd->flags, nd->depth
 * and the mount/rename sequence snapshots, then pick the starting point
 * (nd->root for LOOKUP_ROOT, the root via nd_jump_root() for an absolute
 * name, the cwd for AT_FDCWD, otherwise the file behind nd->dfd).
 *
 * Returns the pathname string to walk, or an ERR_PTR():
 *   -EAGAIN  LOOKUP_CACHED was given without LOOKUP_RCU;
 *   -ENOTDIR the starting point cannot be looked up in and the name is
 *            non-empty;
 *   -EBADF   nd->dfd does not refer to an open file;
 *   or whatever nd_jump_root() returned.
 *
 * With LOOKUP_RCU (and a non-empty name) rcu_read_lock() is taken here and
 * is not released on the error returns below.
 *
 * must be paired with terminate_walk()
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->name->name;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* an empty pathname is never walked in RCU mode */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();

        nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;

        /* snapshot the mount and rename sequence counts for later rechecks */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        if (flags & LOOKUP_ROOT) {
                /* caller-supplied root: start the walk right there */
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /* sample fs->pwd, retrying if fs->seq changed meanwhile */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
                struct dentry *dentry;

                if (!f.file)
                        return ERR_PTR(-EBADF);

                dentry = f.file->f_path.dentry;

                if (*s && unlikely(!d_can_lookup(dentry))) {
                        fdput(f);
                        return ERR_PTR(-ENOTDIR);
                }

                nd->path = f.file->f_path;
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
                fdput(f);
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* ref-walk: take a reference on the root and flag that we did */
                        path_get(&nd->root);
                        nd->flags |= LOOKUP_ROOT_GRABBED;
                }
        }
        return s;
}

static inline const char *lookup_last(struct nameidata *nd)
{
        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

        return walk_component(nd, WALK_TRAILING);
}

/*
 * Step into the location @nd currently points at, without following a
 * trailing symlink (WALK_NOFOLLOW).
 *
 * Outside of RCU mode an extra reference is taken on the current dentry
 * before it is handed to step_into().  The pointer returned by step_into()
 * is converted to an int with PTR_ERR().
 */
static int handle_lookup_down(struct nameidata *nd)
{
        struct dentry *here = nd->path.dentry;
        const char *res;

        if (!(nd->flags & LOOKUP_RCU))
                dget(here);

        res = step_into(nd, WALK_NOFOLLOW, here, nd->inode, nd->seq);
        return PTR_ERR(res);
}

/*
 * Resolve the pathname held in @nd and store the resulting location in @path.
 *
 * Returns 0 on success, in which case *path has been filled in; returns a
 * negative error code otherwise.
 */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *s = path_init(nd, flags);
        int err;

        /* LOOKUP_DOWN: step into the starting point before walking the name */
        if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
                err = handle_lookup_down(nd);
                if (unlikely(err < 0))
                        s = ERR_PTR(err);
        }

        /*
         * Walk everything up to the last component, then the last component
         * itself; keep going for as long as lookup_last() hands back another
         * string to walk.  The loop body is intentionally empty.
         */
        while (!(err = link_path_walk(s, nd)) &&
               (s = lookup_last(nd)) != NULL)
                ;
        if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
                err = handle_lookup_down(nd);
                nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
        }
        if (!err)
                err = complete_walk(nd);

        /* LOOKUP_DIRECTORY: the result must be something we can look up in */
        if (!err && nd->flags & LOOKUP_DIRECTORY)
                if (!d_can_lookup(nd->path.dentry))
                        err = -ENOTDIR;
        if (!err) {
                /*
                 * Hand the path over to the caller and clear nd's copy
                 * (presumably so that terminate_walk() does not release what
                 * *path now holds - confirm against terminate_walk()).
                 */
                *path = nd->path;
                nd->path.mnt = NULL;
                nd->path.dentry = NULL;
        }
        terminate_walk(nd);
        return err;
}

/*
 * Look up @name relative to @dfd and store the result in @path.
 *
 * @name may be an ERR_PTR(), in which case its error is returned right away.
 * If @root is non-NULL it is copied into the nameidata and LOOKUP_ROOT is
 * added to @flags.  The lookup is first attempted with LOOKUP_RCU, retried
 * without it on -ECHILD, and retried with LOOKUP_REVAL on -ESTALE.
 *
 * @name is always released with putname() before returning (except in the
 * ERR_PTR() case).  Returns 0 or a negative error code.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return PTR_ERR(name);

        if (unlikely(root)) {
                nd.root = *root;
                flags |= LOOKUP_ROOT;
        }
        set_nameidata(&nd, dfd, name);

        err = path_lookupat(&nd, flags | LOOKUP_RCU, path);
        if (unlikely(err == -ECHILD))
                err = path_lookupat(&nd, flags, path);
        if (unlikely(err == -ESTALE))
                err = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

        if (likely(err == 0)) {
                unsigned int aflags = 0;

                if (flags & LOOKUP_MOUNTPOINT)
                        aflags = AUDIT_INODE_NOEVAL;
                audit_inode(name, path->dentry, aflags);
        }

        restore_nameidata();
        putname(name);
        return err;
}

/*
 * Walk the pathname in @nd up to, but not including, its last component.
 *
 * Returns 0 on success, with the parent location stored in @parent and
 * removed from nd->path; returns a negative error code otherwise.
 * terminate_walk() is called on every path out of this function.
 */
static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
{
        int err;

        err = link_path_walk(path_init(nd, flags), nd);
        if (err)
                goto out;

        err = complete_walk(nd);
        if (err)
                goto out;

        *parent = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/*
 * Look up the parent of @name relative to @dfd.
 *
 * An ERR_PTR() @name is returned as is.  The walk is tried with LOOKUP_RCU
 * first, again without it on -ECHILD, and again with LOOKUP_REVAL on
 * -ESTALE.
 *
 * On success @parent, @last and @type are filled in and @name is returned,
 * still owned by the caller.  On failure @name is released with putname()
 * and the error is returned as an ERR_PTR().
 */
static struct filename *filename_parentat(int dfd, struct filename *name,
                                unsigned int flags, struct path *parent,
                                struct qstr *last, int *type)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return name;

        set_nameidata(&nd, dfd, name);

        err = path_parentat(&nd, flags | LOOKUP_RCU, parent);
        if (unlikely(err == -ECHILD))
                err = path_parentat(&nd, flags, parent);
        if (unlikely(err == -ESTALE))
                err = path_parentat(&nd, flags | LOOKUP_REVAL, parent);

        if (unlikely(err)) {
                putname(name);
                name = ERR_PTR(err);
        } else {
                *last = nd.last;
                *type = nd.last_type;
                audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
        }

        restore_nameidata();
        return name;
}

/*
 * Look up the kernel-space pathname @name and return the dentry of its last
 * component with the parent directory's inode locked (I_MUTEX_PARENT).
 *
 * On success @path refers to the parent directory.  On failure an ERR_PTR()
 * is returned, the parent is not locked, and any reference placed in @path
 * has been dropped.  -EINVAL is returned when the last component is not a
 * normal name.
 */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
        struct filename *fname;
        struct dentry *child;
        struct inode *dir;
        struct qstr last;
        int type;

        fname = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
                                  &last, &type);
        if (IS_ERR(fname))
                return ERR_CAST(fname);

        if (unlikely(type != LAST_NORM)) {
                path_put(path);
                child = ERR_PTR(-EINVAL);
                goto out;
        }

        dir = path->dentry->d_inode;
        inode_lock_nested(dir, I_MUTEX_PARENT);
        child = __lookup_hash(&last, path->dentry, 0);
        if (IS_ERR(child)) {
                /* nothing to hand back: undo the lock and the path reference */
                inode_unlock(dir);
                path_put(path);
        }
out:
        putname(fname);
        return child;
}

/*
 * Look up the kernel-space pathname @name, starting from AT_FDCWD, and
 * store the result in @path.  Returns whatever filename_lookup() returns.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
        struct filename *fname = getname_kernel(name);

        return filename_lookup(AT_FDCWD, fname, flags, path, NULL);
}
EXPORT_SYMBOL(kern_path);

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
{
        struct path root = {.mnt = mnt, .dentry = dentry};
        /* the first argument of filename_lookup() is ignored with root */
        return filename_lookup(AT_FDCWD, getname_kernel(name),
                               flags , path, &root);
}
EXPORT_SYMBOL(vfs_path_lookup);

/*
 * Shared front end of the lookup_one_len() family.
 *
 * Fills @this with @name, @len and the name hash, then validates the name:
 * an empty name, "." and "..", and any name containing '/' or a NUL byte
 * within its first @len bytes are rejected with -EACCES.  If @base has
 * DCACHE_OP_HASH set, its ->d_hash() is called and a negative result is
 * returned.  Finally the result of the MAY_EXEC permission check on the
 * base inode is returned.
 */
static int lookup_one_len_common(const char *name, struct dentry *base,
                                 int len, struct qstr *this)
{
        const unsigned char *p = (const unsigned char *)name;
        int remaining;

        this->name = name;
        this->len = len;
        this->hash = full_name_hash(base, name, len);

        if (len == 0)
                return -EACCES;

        /* "." and ".." are not acceptable here */
        if (unlikely(name[0] == '.')) {
                if (len < 2 || (len == 2 && name[1] == '.'))
                        return -EACCES;
        }

        for (remaining = len; remaining != 0; remaining--, p++) {
                unsigned int c = *p;

                if (c == '/' || c == '\0')
                        return -EACCES;
        }

        /*
         * Give the low-level filesystem a chance to substitute its own
         * hash for the name.
         */
        if (base->d_flags & DCACHE_OP_HASH) {
                int rc = base->d_op->d_hash(base, this);

                if (rc < 0)
                        return rc;
        }

        return inode_permission(base->d_inode, MAY_EXEC);
}

/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct dentry *dentry;
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        dentry = lookup_dcache(&this, base, 0);
        return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);

/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct qstr this;
        int err;
        struct dentry *ret;

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        ret = lookup_dcache(&this, base, 0);
        if (!ret)
                ret = lookup_slow(&this, base, 0);
        return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

/*
 * Same as lookup_one_len_unlocked(), except that a negative dentry is
 * dropped and reported as ERR_PTR(-ENOENT).  The result is therefore either
 * a known positive dentry or an ERR_PTR(), which is what most users want.
 *
 * A pinned negative dentry with an unlocked parent _can_ become positive at
 * any time, so callers of lookup_one_len_unlocked() have to be very careful;
 * pinned positives have a stable ->d_inode, so this helper avoids that
 * class of problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct dentry *found = lookup_one_len_unlocked(name, base, len);

        if (IS_ERR(found))
                return found;

        if (!d_flags_negative(smp_load_acquire(&found->d_flags)))
                return found;

        dput(found);
        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
        /* Find something mounted on "pts" in the same directory as
         * the input path.
         */
        struct dentry *parent = dget_parent(path->dentry);
        struct dentry *child;
        struct qstr this = QSTR_INIT("pts", 3);

        if (unlikely(!path_connected(path->mnt, parent))) {
                dput(parent);
                return -ENOENT;
        }
        dput(path->dentry);
        path->dentry = parent;
        child = d_hash_and_lookup(parent, &this);
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        path->dentry = child;
        dput(parent);
        follow_down(path);
        return 0;
}
#endif

/*
 * Look up the user-space pathname @name relative to @dfd and store the
 * result in @path.  @flags and @empty are handed to getname_flags() when
 * the name is copied in; @flags is also used for the lookup itself.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
{
        struct filename *fname = getname_flags(name, flags, empty);

        return filename_lookup(dfd, fname, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);

/*
 * Ownership check behind the sticky-bit rule.
 *
 * Returns 0 when the current fsuid owns @inode or @dir, or when the caller
 * is capable of CAP_FOWNER with respect to @inode; returns 1 otherwise.
 */
int __check_sticky(struct inode *dir, struct inode *inode)
{
        const kuid_t me = current_fsuid();

        if (uid_eq(inode->i_uid, me) || uid_eq(dir->i_uid, me))
                return 0;

        return capable_wrt_inode_uidgid(inode, CAP_FOWNER) ? 0 : 1;
}
EXPORT_SYMBOL(__check_sticky);

/*
 *  Check whether we can remove a link victim from directory dir, and check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim is a
 *     directory - EISDIR.
 * 10. We can't remove a root or mountpoint (only the root check is visible
 *     in this function).
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 *
 * Returns 0 if the removal may proceed, a negative error code otherwise.
 * The order of the checks below determines which error is reported when
 * several apply.
 */
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
{
        struct inode *inode = d_backing_inode(victim);
        int error;

        /* nothing to remove */
        if (d_is_negative(victim))
                return -ENOENT;
        BUG_ON(!inode);

        /* the caller must pass the inode of victim's actual parent */
        BUG_ON(victim->d_parent->d_inode != dir);

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
                return -EOVERFLOW;

        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (IS_APPEND(dir))
                return -EPERM;

        /* sticky-bit rule plus per-inode flags that forbid removal */
        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
                return -EPERM;
        /* the victim's type must match what the caller asked to remove */
        if (isdir) {
                if (!d_is_dir(victim))
                        return -ENOTDIR;
                if (IS_ROOT(victim))
                        return -EBUSY;
        } else if (d_is_dir(victim))
                return -EISDIR;
        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;
        return 0;
}

/*
 * Check whether we can create an object with dentry child in directory dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK) - EEXIST
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid - EOVERFLOW
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 *
 * A dead directory yields -ENOENT.  Returns 0 if creation may proceed.
 */
static inline int may_create(struct inode *dir, struct dentry *child)
{
        struct user_namespace *fs_ns;

        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);

        if (child->d_inode)
                return -EEXIST;

        if (IS_DEADDIR(dir))
                return -ENOENT;

        fs_ns = dir->i_sb->s_user_ns;
        if (!(kuid_has_mapping(fs_ns, current_fsuid()) &&
              kgid_has_mapping(fs_ns, current_fsgid())))
                return -EOVERFLOW;

        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Lock two directories for a rename.  p1 and p2 should be directories on
 * the same fs.
 *
 * If both are the same directory only that one inode is locked and NULL is
 * returned.  Otherwise the superblock's s_vfs_rename_mutex is taken first.
 * When p2 is found to be an ancestor of p1, p2 is locked before p1;
 * in every other case p1 is locked before p2.  The return value is the
 * result of the d_ancestor() call that decided the order (NULL when neither
 * directory is an ancestor of the other).
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        struct dentry *first, *second;
        struct dentry *ancestor;

        if (p1 == p2) {
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                return NULL;
        }

        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);

        ancestor = d_ancestor(p2, p1);
        if (ancestor) {
                first = p2;
                second = p1;
        } else {
                ancestor = d_ancestor(p1, p2);
                first = p1;
                second = p2;
        }

        inode_lock_nested(first->d_inode, I_MUTEX_PARENT);
        inode_lock_nested(second->d_inode, I_MUTEX_PARENT2);
        return ancestor;
}
EXPORT_SYMBOL(lock_rename);

/*
 * Undo lock_rename(): unlock p1's inode and, when p1 and p2 differ, also
 * unlock p2's inode and release the superblock's s_vfs_rename_mutex.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
        inode_unlock(p1->d_inode);
        if (p1 == p2)
                return;

        inode_unlock(p2->d_inode);
        mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
EXPORT_SYMBOL(unlock_rename);

/**
 * mode_strip_umask - handle vfs umask stripping
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode to be created in @dir
 *
 * Whether the umask is stripped here depends on POSIX ACL support: if
 * IS_POSIXACL(@dir) is false the current umask is cleared from @mode in
 * this helper.  If it is true @mode is returned untouched, and umask
 * stripping is deferred until the filesystem calls posix_acl_create().
 *
 * Returns: mode
 */
static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
{
        if (IS_POSIXACL(dir))
                return mode;

        return mode & ~current_umask();
}

/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode
 * @mask_perms:        allowed permission by the vfs
 * @type:        type of file to be created
 *
 * This helper consolidates and enforces vfs restrictions on the @mode of a new
 * object to be created: setgid stripping, then umask stripping, then the vfs
 * permission mask, and finally the file type bits.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Doing umask stripping
 * after setgid stripping gives the same ordering for both non-POSIX ACL and
 * POSIX ACL supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode,
                                       umode_t mask_perms, umode_t type)
{
        umode_t stripped = mode_strip_umask(dir, mode_strip_sgid(dir, mode));

        /*
         * Keep only the permission bits the vfs allows, then set the type
         * of file to be created before we call into the filesystem.
         */
        return (stripped & (mask_perms & ~S_IFMT)) | (type & S_IFMT);
}

/*
 * Create a regular file for @dentry in directory @dir.
 *
 * Runs may_create(), requires the directory to implement ->create()
 * (-EACCES otherwise), prepares @mode for a regular file, consults
 * security_inode_create() and finally calls ->create().  A creation
 * notification is sent on success.
 *
 * Returns 0 on success or a negative error code.
 */
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                bool want_excl)
{
        int err;

        err = may_create(dir, dentry);
        if (err)
                return err;

        if (!dir->i_op->create)
                return -EACCES;        /* shouldn't it be ENOSYS? */

        mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG);

        err = security_inode_create(dir, dentry, mode);
        if (err)
                return err;

        err = dir->i_op->create(dir, dentry, mode, want_excl);
        if (err)
                return err;

        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_create);

/*
 * Create an object for @dentry using the caller-supplied constructor @f.
 *
 * The parent directory is taken from dentry->d_parent.  After may_create()
 * succeeds, @mode is reduced to its S_IALLUGO bits and marked S_IFREG,
 * security_inode_create() is consulted, and @f is called with @dentry, the
 * resulting mode and @arg.  A creation notification is sent on success.
 *
 * Returns 0 on success or a negative error code.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
                int (*f)(struct dentry *, umode_t, void *),
                void *arg)
{
        struct inode *dir = dentry->d_parent->d_inode;
        int err;

        err = may_create(dir, dentry);
        if (err)
                return err;

        mode = (mode & S_IALLUGO) | S_IFREG;

        err = security_inode_create(dir, dentry, mode);
        if (err)
                return err;

        err = f(dentry, mode, arg);
        if (err)
                return err;

        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mkobj);

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

/*
 * Check whether the object at @path may be opened with the access mode
 * @acc_mode (MAY_* bits) and the open flags @flag.
 *
 * Returns 0 if the open is allowed, a negative error code otherwise.
 * @flag is a by-value copy: clearing O_TRUNC below only affects the checks
 * made later in this function.
 */
static int may_open(const struct path *path, int acc_mode, int flag)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        if (!inode)
                return -ENOENT;

        /* type-specific restrictions */
        switch (inode->i_mode & S_IFMT) {
        case S_IFLNK:
                return -ELOOP;
        case S_IFDIR:
                if (acc_mode & MAY_WRITE)
                        return -EISDIR;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                break;
        case S_IFBLK:
        case S_IFCHR:
                if (!may_open_dev(path))
                        return -EACCES;
                fallthrough;
        case S_IFIFO:
        case S_IFSOCK:
                /* devices, FIFOs and sockets: never executable, never truncated */
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                flag &= ~O_TRUNC;
                break;
        case S_IFREG:
                if ((acc_mode & MAY_EXEC) && path_noexec(path))
                        return -EACCES;
                break;
        }

        error = inode_permission(inode, MAY_OPEN | acc_mode);
        if (error)
                return error;

        /*
         * An append-only file must be opened in append mode for writing.
         */
        if (IS_APPEND(inode)) {
                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
                        return -EPERM;
                if (flag & O_TRUNC)
                        return -EPERM;
        }

        /* O_NOATIME can only be set by the owner or superuser */
        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                return -EPERM;

        return 0;
}

/*
 * Truncate the file behind @filp to zero length as part of an open.
 *
 * Write access to the inode is held for the duration of the operation and
 * dropped again on every path once it has been obtained.  Returns 0 on
 * success or the first error encountered.
 */
static int handle_truncate(struct file *filp)
{
        const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int err;

        err = get_write_access(inode);
        if (err)
                return err;

        /*
         * Refuse to truncate files with mandatory locks held on them.
         */
        err = locks_verify_locked(filp);
        if (err)
                goto out;

        err = security_path_truncate(path);
        if (err)
                goto out;

        err = do_truncate(path->dentry, 0,
                          ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                          filp);
out:
        put_write_access(inode);
        return err;
}

/*
 * Normalise the access-mode bits of @flag: when (flag & O_ACCMODE) equals 3
 * the value is decremented by one, which turns those bits into 2.  Any
 * other value is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        const int acc = flag & O_ACCMODE;

        return acc == 3 ? flag - 1 : flag;
}

/*
 * Permission checks for creating @dentry in directory @dir on behalf of an
 * O_CREAT open.
 *
 * Consults security_path_mknod(), verifies that the current fsuid and fsgid
 * have a mapping in the filesystem's user namespace (-EOVERFLOW otherwise),
 * checks write and exec permission on the directory and finally asks
 * security_inode_create().  Returns 0 or the first error encountered.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
        struct dentry *parent = dir->dentry;
        struct user_namespace *fs_ns;
        int err;

        err = security_path_mknod(dir, dentry, mode, 0);
        if (err)
                return err;

        fs_ns = parent->d_sb->s_user_ns;
        if (!(kuid_has_mapping(fs_ns, current_fsuid()) &&
              kgid_has_mapping(fs_ns, current_fsgid())))
                return -EOVERFLOW;

        err = inode_permission(parent->d_inode, MAY_WRITE | MAY_EXEC);
        if (err)
                return err;

        return security_inode_create(parent->d_inode, dentry, mode);
}

/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry, using the directory's ->atomic_open() method.
 *
 * Returns a dentry on success.  If FMODE_OPENED is set in file->f_mode the
 * file has been opened and the returned dentry is the one recorded in
 * file->f_path.dentry (with an extra reference taken if it differs from
 * the @dentry passed in).
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set and the caller will need to perform the open themselves.  In that
 * case the returned dentry is file->f_path.dentry if the filesystem set one,
 * otherwise the original @dentry; a negative dentry is reported as -ENOENT.
 *
 * Returns an ERR_PTR()-encoded error otherwise; the dentry reference has
 * been dropped in that case.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* sentinel used to detect a filesystem that never set f_path.dentry */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /* opened: follow whatever dentry the file now uses */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* neither opened nor looked up: filesystem bug */
                        error = -EIO;
                } else {
                        /* lookup only: take over the dentry the fs handed back */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry might be negative if O_CREAT
 * hadn't been specified.
 *
 * An ERR_PTR()-encoded error is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold either a dentry that is still in lookup or one
         * that passed revalidation; a dentry that fails revalidation is
         * invalidated, dropped and replaced by a freshly allocated one.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode);
                if (likely(got_write))
                        create_error = may_o_create(&nd->path, dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* creation not permitted: fall back to a plain lookup/open */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /* report why creation was refused rather than a bare ENOENT */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() supplied a different dentry; use it */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }
                error = dir_inode->i_op->create(dir_inode, dentry, mode,
                                                open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* still negative and creation had been refused: report that error */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}

/*
 * Handle the lookup (and possible creation) of the last component for an
 * open.
 *
 * Returns NULL when lookup_open() has already created or opened the file
 * (nd->path.dentry is switched to the resulting dentry in that case), an
 * ERR_PTR() on failure, and otherwise whatever handle_dots() or step_into()
 * returns - presumably NULL or a further string for the caller to walk;
 * confirm against those helpers.
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        unsigned seq;
        struct inode *inode;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* the last component is not a normal name: let handle_dots() deal with it */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        if (!(open_flag & O_CREAT)) {
                /* a non-NUL byte after the last component: demand a directory */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                dentry = lookup_fast(nd, &inode, &seq);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (likely(dentry))
                        goto finish_lookup;

                BUG_ON(nd->flags & LOOKUP_RCU);
        } else {
                /* create side of things */
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return ERR_PTR(-EISDIR);
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* exclusive directory lock when creating, shared otherwise */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
                fsnotify_create(dir->d_inode, dentry);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                /* already opened or created: adopt the dentry, nothing to walk */
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
        /* a non-NULL result: drop the open intent flags before returning it */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}

/*
 * Handle the last step of open()
 *
 * Called by path_openat() once the lookup loop has finished without
 * error.  Unless the file was already opened or created (FMODE_OPENED /
 * FMODE_CREATED set in file->f_mode), the walk is first finished with
 * complete_walk().  The O_CREAT/O_EXCL, directory and sticky-directory
 * checks follow, then may_open(), vfs_open() (skipped when FMODE_OPENED
 * is already set), ima_file_check() and, for O_TRUNC on an existing
 * regular file, handle_truncate().
 *
 * Returns 0 on success or a negative errno; a positive value coming
 * back from the helpers triggers a warning and is turned into -EINVAL.
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        if (open_flag & O_CREAT) {
                /* O_EXCL is only satisfied if this very call created the file */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /*
                 * Truncation needs write access to the mount; it is
                 * dropped again at the end of this function.
                 */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        error = may_open(&nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        /* note: uses op->acc_mode, not the possibly cleared local acc_mode */
        if (!error)
                error = ima_file_check(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(file);
        /* only 0 or a negative errno may leave this function */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}

/*
 * Create an unnamed temporary file in the directory @dentry.
 *
 * @dentry:    directory in which the file is created
 * @mode:      requested mode, filtered through vfs_prepare_mode()
 * @open_flag: open flags; without O_EXCL the new inode gets I_LINKABLE
 *
 * Returns the new child dentry, or an ERR_PTR(): the error from
 * inode_permission() or ->tmpfile(), -EOPNOTSUPP when the directory has
 * no ->tmpfile method, -ENOMEM when d_alloc() fails, or -ENOENT when
 * ->tmpfile() left the child without an inode.
 */
struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
{
        struct inode *dir = dentry->d_inode;
        struct dentry *child = NULL;
        struct inode *inode;
        int error;

        /* we want directory to be writable */
        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_err;

        if (!dir->i_op->tmpfile) {
                error = -EOPNOTSUPP;
                goto out_err;
        }

        child = d_alloc(dentry, &slash_name);
        if (unlikely(!child)) {
                error = -ENOMEM;
                goto out_err;
        }

        mode = vfs_prepare_mode(dir, mode, mode, mode);
        error = dir->i_op->tmpfile(dir, child, mode);
        if (error)
                goto out_err;

        inode = child->d_inode;
        if (unlikely(!inode)) {
                error = -ENOENT;
                goto out_err;
        }

        /* i_state is updated under i_lock */
        if (!(open_flag & O_EXCL)) {
                spin_lock(&inode->i_lock);
                inode->i_state |= I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        ima_post_create_tmpfile(inode);
        return child;

out_err:
        /* child is still NULL when we fail before d_alloc() */
        dput(child);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

static int do_tmpfile(struct nameidata *nd, unsigned flags,
                const struct open_flags *op,
                struct file *file)
{
        struct dentry *child;
        struct path path;
        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
        if (unlikely(error))
                return error;
        error = mnt_want_write(path.mnt);
        if (unlikely(error))
                goto out;
        child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
        error = PTR_ERR(child);
        if (IS_ERR(child))
                goto out2;
        dput(path.dentry);
        path.dentry = child;
        audit_inode(nd->name, child, 0);
        /* Don't check for other permissions, the inode was just created */
        error = may_open(&path, 0, op->open_flag);
        if (error)
                goto out2;
        file->f_path.mnt = path.mnt;
        error = finish_open(file, child, NULL);
out2:
        mnt_drop_write(path.mnt);
out:
        path_put(&path);
        return error;
}

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags, &path);
        if (!error) {
                audit_inode(nd->name, path.dentry, 0);
                error = vfs_open(&path, file);
                path_put(&path);
        }
        return error;
}

static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
{
        struct file *file;
        int error;

        file = alloc_empty_file(op->open_flag, current_cred());
        if (IS_ERR(file))
                return file;

        if (unlikely(file->f_flags & __O_TMPFILE)) {
                error = do_tmpfile(nd, flags, op, file);
        } else if (unlikely(file->f_flags & O_PATH)) {
                error = do_o_path(nd, flags, file);
        } else {
                const char *s = path_init(nd, flags);
                while (!(error = link_path_walk(s, nd)) &&
                       (s = open_last_lookups(nd, file, op)) != NULL)
                        ;
                if (!error)
                        error = do_open(nd, file, op);
                terminate_walk(nd);
        }
        if (likely(!error)) {
                if (likely(file->f_mode & FMODE_OPENED))
                        return file;
                WARN_ON(1);
                error = -EINVAL;
        }
        fput(file);
        if (error == -EOPENSTALE) {
                if (flags & LOOKUP_RCU)
                        error = -ECHILD;
                else
                        error = -ESTALE;
        }
        return ERR_PTR(error);
}

/*
 * Open @pathname relative to @dfd as described by @op.
 *
 * path_openat() is tried with LOOKUP_RCU first; an -ECHILD result is
 * retried without it, and an -ESTALE result is retried with
 * LOOKUP_REVAL.  Returns the file or an ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op)
{
        const int lookup = op->lookup_flags;
        struct nameidata nd;
        struct file *file;

        set_nameidata(&nd, dfd, pathname);

        file = path_openat(&nd, op, lookup | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup | LOOKUP_REVAL);

        restore_nameidata();
        return file;
}

/*
 * Open the kernel-space path @name with @mnt/@dentry installed as the
 * lookup root (LOOKUP_ROOT).
 *
 * Fails with -ELOOP when @dentry is a symlink and the open intent has
 * LOOKUP_OPEN set.  Otherwise the same three-stage path_openat() retry
 * as do_filp_open() is used.  Returns the file or an ERR_PTR().
 */
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                const char *name, const struct open_flags *op)
{
        const int lookup = op->lookup_flags | LOOKUP_ROOT;
        struct filename *fname;
        struct nameidata nd;
        struct file *res;

        nd.root.mnt = mnt;
        nd.root.dentry = dentry;

        if (d_is_symlink(dentry) && (op->intent & LOOKUP_OPEN))
                return ERR_PTR(-ELOOP);

        fname = getname_kernel(name);
        if (IS_ERR(fname))
                return ERR_CAST(fname);

        set_nameidata(&nd, -1, fname);

        res = path_openat(&nd, op, lookup | LOOKUP_RCU);
        if (unlikely(res == ERR_PTR(-ECHILD)))
                res = path_openat(&nd, op, lookup);
        if (unlikely(res == ERR_PTR(-ESTALE)))
                res = path_openat(&nd, op, lookup | LOOKUP_REVAL);

        restore_nameidata();
        putname(fname);
        return res;
}

/*
 * Look up the parent of @name and prepare its last component for
 * creation.
 *
 * On success the dentry of the last component is returned (not
 * positive, see the d_is_positive() check) and the caller is left
 * holding: the parent's inode lock, write access to path->mnt and the
 * reference in @path.  done_path_create() releases exactly these.
 *
 * On failure an ERR_PTR() is returned and everything acquired here has
 * been released again.  Once filename_parentat() has succeeded,
 * putname(name) is called on every return path.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                struct path *path, unsigned int lookup_flags)
{
        /* result for the "goto out" case below (type != LAST_NORM) */
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        int type;
        int err2;
        int error;
        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

        /*
         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
         * other flags passed in are ignored!
         */
        lookup_flags &= LOOKUP_REVAL;

        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
        if (IS_ERR(name))
                return ERR_CAST(name);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.
         */
        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
        if (IS_ERR(dentry))
                goto unlock;

        error = -EEXIST;
        if (d_is_positive(dentry))
                goto fail;

        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
         * all is fine. Let's be bastards - you had / on the end, you've
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!is_dir && last.name[last.len])) {
                error = -ENOENT;
                goto fail;
        }
        /* the deferred mnt_want_write() failure is reported last */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        putname(name);
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* only drop write access if it was actually obtained */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        putname(name);
        return dentry;
}

/*
 * Kernel-space front end of filename_create(): @pathname is wrapped
 * with getname_kernel() and the result (including a possible error
 * value) is handed straight to filename_create().
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname_kernel(pathname);

        return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

/*
 * Release what a successful filename_create() left held: the reference
 * on @dentry, the lock on the parent directory's inode, write access to
 * the mount and the reference in @path.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
        dput(dentry);
        inode_unlock(path->dentry->d_inode);
        mnt_drop_write(path->mnt);
        path_put(path);
}
EXPORT_SYMBOL(done_path_create);

/*
 * User-space front end of filename_create(): @pathname is fetched with
 * getname() and the result (including a possible error value) is handed
 * straight to filename_create().
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname(pathname);

        return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

/*
 * Create a filesystem node for @dentry in directory @dir.
 *
 * @dir:    inode of the parent directory
 * @dentry: dentry of the node to create
 * @mode:   type and permission bits, filtered through vfs_prepare_mode()
 * @dev:    device number for character/block nodes
 *
 * Returns 0 on success; -EPERM when a non-whiteout device node is
 * requested without CAP_MKNOD or when @dir has no ->mknod method;
 * otherwise the error from may_create(), devcgroup_inode_mknod(),
 * security_inode_mknod() or ->mknod().
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        const bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
        int error;

        error = may_create(dir, dentry);
        if (error)
                return error;

        /* device nodes other than whiteouts need CAP_MKNOD */
        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                if (!is_whiteout && !capable(CAP_MKNOD))
                        return -EPERM;
        }

        if (!dir->i_op->mknod)
                return -EPERM;

        mode = vfs_prepare_mode(dir, mode, mode, mode);

        error = devcgroup_inode_mknod(mode, dev);
        if (!error)
                error = security_inode_mknod(dir, dentry, mode, dev);
        if (error)
                return error;

        error = dir->i_op->mknod(dir, dentry, mode, dev);
        if (error)
                return error;

        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mknod);

/*
 * Check that the file type encoded in @mode may be created via mknod.
 *
 * Returns 0 for regular files, character and block devices, FIFOs,
 * sockets and a zero type; -EPERM for directories; -EINVAL for any
 * other type.
 */
static int may_mknod(umode_t mode)
{
        const unsigned int type = mode & S_IFMT;

        /* zero mode translates to S_IFREG */
        if (type == 0 || type == S_IFREG || type == S_IFCHR ||
            type == S_IFBLK || type == S_IFIFO || type == S_IFSOCK)
                return 0;

        return type == S_IFDIR ? -EPERM : -EINVAL;
}

/*
 * Common implementation of mknod(2) and mknodat(2).
 *
 * After may_mknod() has accepted the file type, the target is prepared
 * with user_path_create() and the node is made with vfs_create() for
 * regular files or vfs_mknod() for devices, FIFOs and sockets.  The
 * whole sequence is repeated with LOOKUP_REVAL when retry_estale()
 * asks for it.  Returns 0 or a negative errno.
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
                unsigned int dev)
{
        unsigned int lookup_flags = 0;
        struct dentry *dentry;
        struct path path;
        int error;

        error = may_mknod(mode);
        if (error)
                return error;

        for (;;) {
                dentry = user_path_create(dfd, filename, &path, lookup_flags);
                if (IS_ERR(dentry))
                        return PTR_ERR(dentry);

                error = security_path_mknod(&path, dentry,
                                mode_strip_umask(path.dentry->d_inode, mode), dev);
                if (!error) {
                        struct inode *dir = path.dentry->d_inode;

                        switch (mode & S_IFMT) {
                        case 0:
                        case S_IFREG:
                                error = vfs_create(dir, dentry, mode, true);
                                if (!error)
                                        ima_post_path_mknod(dentry);
                                break;
                        case S_IFCHR:
                        case S_IFBLK:
                                error = vfs_mknod(dir, dentry, mode,
                                                new_decode_dev(dev));
                                break;
                        case S_IFIFO:
                        case S_IFSOCK:
                                error = vfs_mknod(dir, dentry, mode, 0);
                                break;
                        }
                }

                done_path_create(&path, dentry);
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* mknodat system call: forwards all arguments unchanged to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        return do_mknodat(dfd, filename, mode, dev);
}

/* mknod system call: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        return do_mknodat(AT_FDCWD, filename, mode, dev);
}

/*
 * Create the directory @dentry inside directory @dir.
 *
 * @dir:    inode of the parent directory
 * @dentry: dentry of the directory to create
 * @mode:   requested mode; vfs_prepare_mode() is asked to keep only
 *          S_IRWXUGO | S_ISVTX
 *
 * Returns 0 on success; -EPERM when @dir has no ->mkdir method; -EMLINK
 * when the superblock defines a link limit and @dir has reached it;
 * otherwise the error from may_create(), security_inode_mkdir() or
 * ->mkdir().
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        const unsigned max_links = dir->i_sb->s_max_links;
        int error;

        error = may_create(dir, dentry);
        if (error)
                return error;

        if (!dir->i_op->mkdir)
                return -EPERM;

        mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0);

        error = security_inode_mkdir(dir, dentry, mode);
        if (error)
                return error;

        /* a max_links of zero means no limit is enforced here */
        if (max_links && dir->i_nlink >= max_links)
                return -EMLINK;

        error = dir->i_op->mkdir(dir, dentry, mode);
        if (error)
                return error;

        fsnotify_mkdir(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * Common implementation of mkdir(2) and mkdirat(2).
 *
 * The target is prepared with user_path_create() (LOOKUP_DIRECTORY
 * set), checked with security_path_mkdir() and created with
 * vfs_mkdir().  The sequence is repeated with LOOKUP_REVAL added when
 * retry_estale() asks for it.  Returns 0 or a negative errno.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_DIRECTORY;
        struct dentry *dentry;
        struct path path;
        int error;

        for (;;) {
                dentry = user_path_create(dfd, pathname, &path, lookup_flags);
                if (IS_ERR(dentry))
                        return PTR_ERR(dentry);

                error = security_path_mkdir(&path, dentry,
                                mode_strip_umask(path.dentry->d_inode, mode));
                if (!error)
                        error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
                done_path_create(&path, dentry);

                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* mkdirat system call: forwards all arguments unchanged to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(dfd, pathname, mode);
}

/* mkdir system call: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(AT_FDCWD, pathname, mode);
}

/*
 * Remove the directory @dentry from the directory inode @dir.
 *
 * The victim's inode is locked and an extra dentry reference is held
 * (dget/dput) around the removal itself.
 *
 * Returns 0 on success; -EPERM when @dir has no ->rmdir method; -EBUSY
 * when @dentry is a local mountpoint; otherwise the error from
 * may_delete(), security_inode_rmdir() or ->rmdir().
 */
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
        int error = may_delete(dir, dentry, 1);

        if (error)
                return error;

        if (!dir->i_op->rmdir)
                return -EPERM;

        dget(dentry);
        inode_lock(dentry->d_inode);

        error = -EBUSY;
        if (is_local_mountpoint(dentry))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        /* the directory is gone: clean up below it and mark it dead */
        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        /* done only after the victim's inode lock has been dropped */
        if (!error)
                d_delete_notify(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);

/*
 * Remove the directory named by @name, looked up relative to @dfd.
 *
 * The parent is looked up with filename_parentat(), write access to the
 * mount is taken, the parent's inode is locked and the last component
 * is looked up and handed to vfs_rmdir().  The whole sequence is redone
 * with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Returns 0 or a negative errno.  Once filename_parentat() has
 * succeeded, putname(name) is called before returning.
 */
long do_rmdir(int dfd, struct filename *name)
{
        int error = 0;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        name = filename_parentat(dfd, name, lookup_flags,
                                &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);

        /* these kinds of last component are refused outright */
        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit1;
        case LAST_DOT:
                error = -EINVAL;
                goto exit1;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit1;
        }

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit1;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
        /* nothing to remove if the lookup produced a dentry without inode */
        if (!dentry->d_inode) {
                error = -ENOENT;
                goto exit3;
        }
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit3;
        error = vfs_rmdir(path.dentry->d_inode, dentry);
        /* unwind in reverse order of acquisition */
exit3:
        dput(dentry);
exit2:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit1:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        putname(name);
        return error;
}

/*
 * rmdir system call: do_rmdir() with AT_FDCWD as the directory fd and
 * the result of getname() on the user pointer as the name.
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        return do_rmdir(AT_FDCWD, getname(pathname));
}

/**
 * vfs_unlink - unlink a filesystem object
 * @dir:        parent directory
 * @dentry:        victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Return: 0 on success, -EPERM if @dir has no ->unlink method, -EBUSY
 * if @dentry is a local mountpoint, otherwise the error reported by
 * may_delete(), security_inode_unlink(), try_break_deleg() or ->unlink().
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
        struct inode *target = dentry->d_inode;
        int error = may_delete(dir, dentry, 0);

        if (error)
                return error;

        if (!dir->i_op->unlink)
                return -EPERM;

        /* the victim's inode stays locked across the actual unlink */
        inode_lock(target);
        if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                fsnotify_unlink(dir, dentry);
        } else if (!error) {
                fsnotify_link_count(target);
                d_delete_notify(dir, dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);

/*
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 *
 * To that end an extra inode reference is taken with ihold() while the
 * parent is locked and dropped with iput() only after the parent has
 * been unlocked.
 *
 * Returns 0 or a negative errno.  Once filename_parentat() has
 * succeeded, putname(name) is called before returning.
 */
long do_unlinkat(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        struct inode *inode = NULL;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
retry:
        name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);

        error = -EISDIR;
        if (type != LAST_NORM)
                goto exit1;

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit1;
retry_deleg:
        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
                if (last.name[last.len])
                        goto slashes;
                inode = dentry->d_inode;
                if (d_is_negative(dentry))
                        goto slashes;
                ihold(inode);
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit2;
                error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
exit2:
                dput(dentry);
        }
        inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);        /* truncate the inode here */
        inode = NULL;
        /*
         * vfs_unlink() handed back an inode with a delegation: wait for
         * the break and, if that worked, redo the locked section.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path.mnt);
exit1:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                inode = NULL;
                goto retry;
        }
        putname(name);
        return error;

        /*
         * Reached for a name with trailing characters after the last
         * component or for a negative dentry: pick the error code, then
         * rejoin the common cleanup at exit2.
         */
slashes:
        if (d_is_negative(dentry))
                error = -ENOENT;
        else if (d_is_dir(dentry))
                error = -EISDIR;
        else
                error = -ENOTDIR;
        goto exit2;
}

/*
 * unlinkat system call.  AT_REMOVEDIR is the only flag accepted; any
 * other bit yields -EINVAL.  With AT_REMOVEDIR the request goes to
 * do_rmdir(), otherwise to do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
        if (flag & ~AT_REMOVEDIR)
                return -EINVAL;

        return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, getname(pathname))
                                     : do_unlinkat(dfd, getname(pathname));
}

/*
 * unlink system call: do_unlinkat() with AT_FDCWD as the directory fd
 * and the result of getname() on the user pointer as the name.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
        return do_unlinkat(AT_FDCWD, getname(pathname));
}

int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
{
        int error = may_create(dir, dentry);

        if (error)
                return error;

        if (!dir->i_op->symlink)
                return -EPERM;

        error = security_inode_symlink(dir, dentry, oldname);
        if (error)
                return error;

        error = dir->i_op->symlink(dir, dentry, oldname);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_symlink);

/*
 * Common implementation of symlink(2) and symlinkat(2).
 *
 * @oldname is fetched once with getname(); the new name is prepared
 * with user_path_create(), checked with security_path_symlink() and
 * created with vfs_symlink().  The create step is repeated with
 * LOOKUP_REVAL when retry_estale() asks for it.  Returns 0 or a
 * negative errno; the fetched name is released before returning.
 */
static long do_symlinkat(const char __user *oldname, int newdfd,
                  const char __user *newname)
{
        unsigned int lookup_flags = 0;
        struct filename *target;
        struct dentry *link;
        struct path parent;
        int error;

        target = getname(oldname);
        if (IS_ERR(target))
                return PTR_ERR(target);

        for (;;) {
                link = user_path_create(newdfd, newname, &parent, lookup_flags);
                if (IS_ERR(link)) {
                        error = PTR_ERR(link);
                        break;
                }

                error = security_path_symlink(&parent, link, target->name);
                if (!error)
                        error = vfs_symlink(parent.dentry->d_inode, link,
                                            target->name);
                done_path_create(&parent, link);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        putname(target);
        return error;
}

/* symlinkat system call: forwards all arguments unchanged to do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_symlinkat(oldname, newdfd, newname);
}

/* symlink system call: do_symlinkat() with AT_FDCWD as the new directory fd. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
        return do_symlinkat(oldname, AT_FDCWD, newname);
}

/**
 * vfs_link - create a new link
 * @old_dentry:        object to be linked
 * @dir:        new parent
 * @new_dentry:        where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Return: 0 on success; -ENOENT if @old_dentry has no inode or the
 * inode is unlinked and not I_LINKABLE; -EXDEV if source and @dir are
 * on different superblocks; -EPERM for append-only or immutable inodes,
 * inodes with unmapped ids, directories, or a @dir without ->link;
 * -EMLINK if the superblock's link limit is reached; otherwise the
 * error from may_create(), security_inode_link(), try_break_deleg() or
 * ->link().
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
        struct inode *inode = old_dentry->d_inode;
        unsigned max_links = dir->i_sb->s_max_links;
        int error;

        if (!inode)
                return -ENOENT;

        error = may_create(dir, new_dentry);
        if (error)
                return error;

        if (dir->i_sb != inode->i_sb)
                return -EXDEV;

        /*
         * A link to an append-only or immutable file cannot be created.
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
        /*
         * Updating the link count will likely cause i_uid and i_gid to
         * be written back improperly if their true value is unknown to
         * the vfs.
         */
        if (HAS_UNMAPPED_ID(inode))
                return -EPERM;
        if (!dir->i_op->link)
                return -EPERM;
        if (S_ISDIR(inode->i_mode))
                return -EPERM;

        error = security_inode_link(old_dentry, dir, new_dentry);
        if (error)
                return error;

        /* link count checks and ->link() run under the source inode's lock */
        inode_lock(inode);
        /* Make sure we don't allow creating hardlink to an unlinked file */
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
                error =  -ENOENT;
        else if (max_links && inode->i_nlink >= max_links)
                error = -EMLINK;
        else {
                error = try_break_deleg(inode, delegated_inode);
                if (!error)
                        error = dir->i_op->link(old_dentry, dir, new_dentry);
        }

        /* once linked successfully, I_LINKABLE is cleared (under i_lock) */
        if (!error && (inode->i_state & I_LINKABLE)) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        inode_unlock(inode);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
        return error;
}
EXPORT_SYMBOL(vfs_link);

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 *
 * Symlinks on the oldname are followed only when AT_SYMLINK_FOLLOW is
 * passed in @flags.  Returns 0 or a negative errno; -EINVAL for any
 * flag other than AT_SYMLINK_FOLLOW and AT_EMPTY_PATH.
 */
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
              const char __user *newname, int flags)
{
        struct dentry *new_dentry;
        struct path old_path, new_path;
        struct inode *delegated_inode = NULL;
        int how = 0;
        int error;

        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
        /*
         * To use null names we require CAP_DAC_READ_SEARCH
         * This ensures that not everyone will be able to create
         * hardlink using the passed filedescriptor.
         */
        if (flags & AT_EMPTY_PATH) {
                if (!capable(CAP_DAC_READ_SEARCH))
                        return -ENOENT;
                how = LOOKUP_EMPTY;
        }

        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
retry:
        error = user_path_at(olddfd, oldname, how, &old_path);
        if (error)
                return error;

        /* only the LOOKUP_REVAL bit of "how" is passed on for the new name */
        new_dentry = user_path_create(newdfd, newname, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out;

        /* both names must be on the same mount */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
        error = may_linkat(&old_path);
        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
        done_path_create(&new_path, new_dentry);
        /*
         * vfs_link() handed back an inode with a delegation: wait for
         * the break and, if that worked, start over from the lookup.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error) {
                        path_put(&old_path);
                        goto retry;
                }
        }
        if (retry_estale(error, how)) {
                path_put(&old_path);
                how |= LOOKUP_REVAL;
                goto retry;
        }
out:
        path_put(&old_path);

        return error;
}

/* linkat system call: forwards all arguments unchanged to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        return do_linkat(olddfd, oldname, newdfd, newname, flags);
}

/* link system call: do_linkat() with AT_FDCWD for both names and no flags. */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

/**
 * vfs_rename - rename a filesystem object
 * @old_dir:        parent of source
 * @old_dentry:        source
 * @new_dir:        parent of destination
 * @new_dentry:        destination
 * @delegated_inode: returns an inode needing a delegation break
 * @flags:        rename flags
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *        a) we can get into loop creation.
 *        b) race potential - two innocent renames can create a loop together.
 *           That's where 4.4BSD screws up. Current fix: serialization on
 *           sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *           story.
 *        c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *           and source (if it's a non-directory or a subdirectory that moves to
 *           different parent).
 *           And that - after we got ->i_mutex on parents (until then we don't know
 *           whether the target exists).  Solution: try to be smart with locking
 *           order for inodes.  We rely on the fact that tree topology may change
 *           only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *           move will be locked.  Thus we can rank directories by the tree
 *           (ancestors first) and rank all non-directories after them.
 *           That works since everybody except rename does "lock parent, lookup,
 *           lock child" and rename is under ->s_vfs_rename_mutex.
 *           HOWEVER, it relies on the assumption that any object with ->lookup()
 *           has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *           we'd better make sure that there's no link(2) for them.
 *        d) conversion from fhandle to dentry may come in the wrong moment - when
 *           we are removing the target. Solution: we will have to grab ->i_mutex
 *           in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *           ->i_mutex on parents, which works but leads to some truly excessive
 *           locking].
 */
int vfs_rename(struct renamedata *rd)
{
        int error;
        struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
        struct dentry *old_dentry = rd->old_dentry;
        struct dentry *new_dentry = rd->new_dentry;
        struct inode **delegated_inode = rd->delegated_inode;
        unsigned int flags = rd->flags;
        bool is_dir = d_is_dir(old_dentry);
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        struct name_snapshot old_name;
        bool lock_old_subdir, lock_new_subdir;

        /* Source and target are the same inode: nothing to do, report success. */
        if (source == target)
                return 0;

        error = may_delete(old_dir, old_dentry, is_dir);
        if (error)
                return error;

        if (!target) {
                /* No existing target: only creation in new_dir is checked. */
                error = may_create(new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                /*
                 * An existing target is checked with may_delete().  For a
                 * plain rename the "is a directory" argument is the type of
                 * the source; for an exchange it is the target's own type.
                 */
                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(new_dir, new_dentry, is_dir);
                else
                        error = may_delete(new_dir, new_dentry, new_is_dir);
        }
        if (error)
                return error;

        /* The filesystem must provide a ->rename() method. */
        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(source, MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(target, MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /*
         * Everything from here on must leave through "out" (or fall into
         * it): the name snapshot and the extra reference on new_dentry
         * taken below are both released there.  The snapshot of the old
         * name is what gets passed to fsnotify_move() after the rename.
         */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Lock children.
         * The source subdirectory needs to be locked on cross-directory
         * rename or cross-directory exchange since its parent changes.
         * The target subdirectory needs to be locked on cross-directory
         * exchange due to parent change and on any rename due to becoming
         * a victim.
         * Non-directories need locking in all cases (for NFS reasons);
         * they get locked after any subdirectories (in inode address order).
         *
         * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
         * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
         */
        lock_old_subdir = new_dir != old_dir;
        lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
        if (is_dir) {
                if (lock_old_subdir)
                        inode_lock_nested(source, I_MUTEX_CHILD);
                if (target && (!new_is_dir || lock_new_subdir))
                        inode_lock(target);
        } else if (new_is_dir) {
                if (lock_new_subdir)
                        inode_lock_nested(target, I_MUTEX_CHILD);
                inode_lock(source);
        } else {
                lock_two_nondirectories(source, target);
        }

        /* Neither end of the rename may be a local mountpoint. */
        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit, checked only for cross-directory operations on
         * filesystems that set s_max_links: fail with -EMLINK when a
         * directory would arrive in a parent that is already at the limit
         * without a directory leaving that parent in return.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegation breaks are attempted on non-directories only. */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(old_dir, old_dentry,
                                       new_dir, new_dentry, flags);
        if (error)
                goto out;

        /*
         * A plain rename over an existing target: a victim directory is
         * marked S_DEAD after shrinking its dcache subtree, and the victim
         * dentry has mounting disabled and existing mounts detached.
         */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /* Update the dcache here unless the filesystem flags that it does so itself. */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* The unlock conditions mirror the lock conditions above. */
        if (!is_dir || lock_old_subdir)
                inode_unlock(source);
        if (target && (!new_is_dir || lock_new_subdir))
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                /* Notify for the source; an exchange also notifies for the target. */
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);

/*
 * do_renameat2 - common implementation behind the rename family of syscalls
 * @olddfd: directory fd the source path is resolved against
 * @from:   source name; may be an ERR_PTR, in which case that error is returned
 * @newdfd: directory fd the destination path is resolved against
 * @to:     destination name; may be an ERR_PTR
 * @flags:  any of RENAME_NOREPLACE, RENAME_EXCHANGE, RENAME_WHITEOUT
 *
 * Resolves both parent directories, looks up the final components under
 * lock_rename(), performs the path-level checks and hands the operation to
 * vfs_rename().  Both names are released before returning on every path
 * visible here (via putname() unless they are error pointers).
 *
 * Returns 0 on success or a negative errno.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
                 struct filename *to, unsigned int flags)
{
        struct renamedata rd;
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
        bool should_retry = false;
        int error = -EINVAL;

        /* Unknown flag bits are rejected with -EINVAL. */
        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                goto put_both;

        /* RENAME_EXCHANGE cannot be combined with NOREPLACE or WHITEOUT. */
        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                goto put_both;

        /* For an exchange the target is looked up without LOOKUP_RENAME_TARGET. */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;

retry:
        from = filename_parentat(olddfd, from, lookup_flags, &old_path,
                                        &old_last, &old_type);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto put_new;
        }

        to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
                                &new_type);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
        }

        /* Both parents must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /*
         * Both last components must be of type LAST_NORM.  A bad source
         * gives -EBUSY; a bad target gives -EEXIST under RENAME_NOREPLACE
         * and -EBUSY otherwise.
         */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

retry_deleg:
        /* The value returned by lock_rename() is compared against both dentries below. */
        trap = lock_rename(new_path.dentry, old_path.dentry);

        old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        /* source must exist */
        error = -ENOENT;
        if (d_is_negative(old_dentry))
                goto exit4;
        new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        error = -EEXIST;
        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
                goto exit5;
        if (flags & RENAME_EXCHANGE) {
                /* An exchange needs an existing target. */
                error = -ENOENT;
                if (d_is_negative(new_dentry))
                        goto exit5;

                /*
                 * A non-zero byte right after the last component (e.g. a
                 * trailing slash) is only acceptable for a directory.
                 */
                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /* target should not be an ancestor of source */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;

        rd.old_dir           = old_path.dentry->d_inode;
        rd.old_dentry           = old_dentry;
        rd.new_dir           = new_path.dentry->d_inode;
        rd.new_dentry           = new_dentry;
        rd.delegated_inode = &delegated_inode;
        rd.flags           = flags;
        error = vfs_rename(&rd);
        /* Unwind in reverse order of acquisition. */
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
        /*
         * If vfs_rename() reported an inode through delegated_inode, wait
         * for the delegation break and, if the wait succeeds, redo the
         * locked section.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
exit1:
        path_put(&old_path);
        if (should_retry) {
                /* Redo the whole lookup once more with LOOKUP_REVAL set. */
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
put_both:
        if (!IS_ERR(from))
                putname(from);
put_new:
        if (!IS_ERR(to))
                putname(to);
        return error;
}

/*
 * renameat2 system call: fetches both user-supplied names with getname()
 * and forwards them, the two directory fds and the caller's flags to
 * do_renameat2().
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                flags);
}

/*
 * renameat system call: same as renameat2 with no flags (flags == 0).
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                0);
}

/*
 * rename system call: both paths are resolved relative to AT_FDCWD and no
 * flags are passed.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
                                getname(newname), 0);
}

/*
 * readlink_copy - copy a symlink body to a user buffer
 * @buffer: destination in user memory
 * @buflen: capacity of @buffer
 * @link:   NUL-terminated link body, or an ERR_PTR
 *
 * If @link is an error pointer its error code is returned unchanged.
 * Otherwise at most @buflen bytes of the body are copied (no terminating
 * NUL is written) and the number of bytes copied is returned, or -EFAULT
 * if the copy to user space fails.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
        int copied;

        if (IS_ERR(link))
                return PTR_ERR(link);

        /* Truncate to the caller's buffer size. */
        copied = strlen(link);
        if (copied > (unsigned) buflen)
                copied = buflen;

        return copy_to_user(buffer, link, copied) ? -EFAULT : copied;
}

/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry of the symbolic link to read
 * @buffer: user memory pointer receiving the link body
 * @buflen: size of @buffer
 *
 * Uses the filesystem's own ->readlink() when one is provided; otherwise
 * takes the body from inode->i_link or ->get_link() and copies it out with
 * readlink_copy().  Returns -EINVAL for a non-symlink without ->readlink().
 *
 * Neither updates atime nor calls the security hook; both are left to the
 * caller.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(done);
        const char *body;
        int copied;

        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
                /* A filesystem-supplied ->readlink() handles the request entirely. */
                if (unlikely(inode->i_op->readlink))
                        return inode->i_op->readlink(dentry, buffer, buflen);

                if (!d_is_symlink(dentry))
                        return -EINVAL;

                /* Record the result so later calls skip the checks above. */
                spin_lock(&inode->i_lock);
                inode->i_opflags |= IOP_DEFAULT_READLINK;
                spin_unlock(&inode->i_lock);
        }

        /* Prefer a body already attached to the inode. */
        body = READ_ONCE(inode->i_link);
        if (!body) {
                body = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(body))
                        return PTR_ERR(body);
        }

        copied = readlink_copy(buffer, buflen, body);
        do_delayed_call(&done);
        return copied;
}
EXPORT_SYMBOL(vfs_readlink);

/**
 * vfs_get_link - get symlink body
 * @dentry: dentry of the symbolic link
 * @done: delayed call the caller must run to release the returned data
 *
 * Returns ERR_PTR(-EINVAL) if @dentry is not a symlink.  Otherwise runs
 * security_inode_readlink() and, if that allows it, returns whatever the
 * inode's ->get_link() returns; a hook failure is returned as an ERR_PTR.
 *
 * atime is not touched; that is up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
        struct inode *inode = d_inode(dentry);
        const char *denied;

        if (!d_is_symlink(dentry))
                return ERR_PTR(-EINVAL);

        /* ERR_PTR(0) is NULL, so a non-NULL value here is a hook error. */
        denied = ERR_PTR(security_inode_readlink(dentry));
        if (denied)
                return denied;

        return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

/*
 * Get the link contents via page 0 of the inode's mapping.
 *
 * With a dentry the page is read in through read_mapping_page().  Without
 * one (NOTE(review): presumably the RCU-walk case - confirm with callers)
 * only an already cached, up-to-date page is accepted and anything else
 * yields ERR_PTR(-ECHILD).  On success @callback is armed to drop the page
 * reference via page_put_link() and the page's kernel address is returned.
 */
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
                             struct delayed_call *callback)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;

        if (dentry) {
                page = read_mapping_page(mapping, 0, NULL);
                if (IS_ERR(page))
                        return (char*)page;
        } else {
                page = find_get_page(mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(page)) {
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
        }

        set_delayed_call(callback, page_put_link, page);
        /* page_address() is used directly, so highmem mappings are not allowed. */
        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
        return page_address(page);
}

/*
 * page_get_link_raw - return the link page contents exactly as found;
 * unlike page_get_link() no nd_terminate_link() call is made on the result.
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
                              struct delayed_call *callback)
{
        const char *kaddr = __page_get_link(dentry, inode, callback);

        return kaddr;
}
EXPORT_SYMBOL_GPL(page_get_link_raw);

/*
 * page_get_link - fetch the link body through __page_get_link() and, on
 * success, pass it to nd_terminate_link() with inode->i_size and an upper
 * bound of PAGE_SIZE - 1.  Errors from __page_get_link() are returned as is.
 */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
                                        struct delayed_call *callback)
{
        char *body = __page_get_link(dentry, inode, callback);

        if (IS_ERR(body))
                return body;

        nd_terminate_link(body, inode->i_size, PAGE_SIZE - 1);
        return body;
}

EXPORT_SYMBOL(page_get_link);

/*
 * page_put_link - delayed-call handler installed by __page_get_link();
 * @arg is the page whose reference is dropped.
 */
void page_put_link(void *arg)
{
        struct page *page = arg;

        put_page(page);
}
EXPORT_SYMBOL(page_put_link);

/*
 * page_readlink - read a page-backed symlink into a user buffer.
 *
 * Obtains the body with page_get_link(), copies it out with
 * readlink_copy() (which also passes through an error pointer from
 * page_get_link() as its error code), then runs the delayed call.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        DEFINE_DELAYED_CALL(done);
        const char *body = page_get_link(dentry, d_inode(dentry), &done);
        int copied = readlink_copy(buffer, buflen, body);

        do_delayed_call(&done);
        return copied;
}
EXPORT_SYMBOL(page_readlink);

/*
 * __page_symlink - store a symlink body in page 0 of the inode's mapping
 * @inode:   inode to write to
 * @symname: link body
 * @len:     length to store plus one; len - 1 bytes are written
 * @nofs:    when non-zero, AOP_FLAG_NOFS is passed to pagecache_write_begin
 *
 * The write_begin/copy/write_end sequence is repeated until write_end
 * reports that all len - 1 bytes were accepted.  Returns 0 on success
 * (after marking the inode dirty) or the error from write_begin/write_end.
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
        struct address_space *mapping = inode->i_mapping;
        unsigned int flags = nofs ? AOP_FLAG_NOFS : 0;
        void *fsdata = NULL;
        struct page *page;
        int err;

        for (;;) {
                err = pagecache_write_begin(NULL, mapping, 0, len-1,
                                        flags, &page, &fsdata);
                if (err)
                        return err;

                memcpy(page_address(page), symname, len-1);

                err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
                                                                page, fsdata);
                if (err < 0)
                        return err;
                /* A short write means the whole sequence is tried again. */
                if (err >= len-1)
                        break;
        }

        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(__page_symlink);

/*
 * page_symlink - __page_symlink() with the nofs argument derived from the
 * mapping: nofs is set when the mapping's gfp constraint excludes __GFP_FS.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
        int nofs = !mapping_gfp_constraint(inode->i_mapping, __GFP_FS);

        return __page_symlink(inode, symname, len, nofs);
}
EXPORT_SYMBOL(page_symlink);

/*
 * Inode operations for symlinks whose body is read through page_get_link();
 * .get_link is the only method set here.
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link        = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);



























































    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/*
 * PSI interface: real declarations when CONFIG_PSI is set, inline stubs
 * otherwise.
 */
#ifndef _LINUX_PSI_H
#define _LINUX_PSI_H

#include <linux/jump_label.h>
#include <linux/psi_types.h>
#include <linux/sched.h>
#include <linux/poll.h>

/* Forward declarations: only pointers to these types are used below. */
struct seq_file;
struct css_set;

#ifdef CONFIG_PSI

extern struct static_key_false psi_disabled;
extern struct psi_group psi_system;

void psi_init(void);

/* Task state change hooks. */
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                     bool sleep);

/* Memory stall accounting; enter/leave are called as a pair sharing @flags. */
void psi_memstall_tick(struct task_struct *task, int cpu);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);

#ifdef CONFIG_CGROUPS
/* Per-cgroup PSI state and trigger handling. */
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);

struct psi_trigger *psi_trigger_create(struct psi_group *group,
                        char *buf, size_t nbytes, enum psi_res res);
void psi_trigger_destroy(struct psi_trigger *t);

__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
                        poll_table *wait);
#endif

#else /* CONFIG_PSI */

/* Without CONFIG_PSI the hooks below compile to nothing. */
static inline void psi_init(void) {}

static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}

#ifdef CONFIG_CGROUPS
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
{
        return 0;
}
static inline void psi_cgroup_free(struct cgroup *cgrp)
{
}
/* Without PSI, moving a task only publishes the new css_set pointer. */
static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
        rcu_assign_pointer(p->cgroups, to);
}
#endif

#endif /* CONFIG_PSI */

#endif /* _LINUX_PSI_H */












    3 


    3 

    2 

    2 

    2 



    2 


















    3 























    3 






    3 










    3 










    3 
    3 




    3 

    3 














    3 





    3 




    3 






































































    3 
























    3 





    3 




    3 











































































    1 




    1 






    3 

    3 




    3 

    3 


















































    1 





    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
// SPDX-License-Identifier: GPL-2.0
/*
 * Interface between ext4 and JBD
 */

#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

int ext4_inode_journal_mode(struct inode *inode)
{
        if (EXT4_JOURNAL(inode) == NULL)
                return EXT4_INODE_WRITEBACK_DATA_MODE;        /* writeback */
        /* We do not support data journalling with delayed allocation */
        if (!S_ISREG(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
            test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
            !test_opt(inode->i_sb, DELALLOC))) {
                /* We do not support data journalling for encrypted data */
                if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
                        return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
                return EXT4_INODE_JOURNAL_DATA_MODE;        /* journal data */
        }
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                return EXT4_INODE_ORDERED_DATA_MODE;        /* ordered */
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                return EXT4_INODE_WRITEBACK_DATA_MODE;        /* writeback */
        BUG();
}

/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
        handle_t *handle = current->journal_info;
        unsigned long ref_cnt = (unsigned long)handle;

        BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);

        ref_cnt++;
        handle = (handle_t *)ref_cnt;

        current->journal_info = handle;
        return handle;
}


/*
 * Drop one reference from a no-journal handle: the handle value is treated
 * as a counter, decremented and stored back in current->journal_info.
 * BUG if the count is already zero.
 */
static void ext4_put_nojournal(handle_t *handle)
{
        unsigned long refs = (unsigned long)handle;

        BUG_ON(refs == 0);

        current->journal_info = (handle_t *)(refs - 1);
}

/*
 * Wrappers for jbd2_journal_start/end.
 */
static int ext4_journal_check_start(struct super_block *sb)
{
        journal_t *journal;

        might_sleep();

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return -EIO;

        if (sb_rdonly(sb))
                return -EROFS;
        WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
        journal = EXT4_SB(sb)->s_journal;
        /*
         * Special case here: if the journal has aborted behind our
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly.
         */
        if (journal && is_journal_aborted(journal)) {
                ext4_abort(sb, -journal->j_errno, "Detected aborted journal");
                return -EROFS;
        }
        return 0;
}

handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
                                  int type, int blocks, int rsv_blocks,
                                  int revoke_creds)
{
        journal_t *journal;
        int err;

        trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
                                 _RET_IP_);
        err = ext4_journal_check_start(sb);
        if (err < 0)
                return ERR_PTR(err);

        journal = EXT4_SB(sb)->s_journal;
        if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return ext4_get_nojournal();
        return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
                                   GFP_NOFS, type, line);
}

/*
 * __ext4_journal_stop - stop a handle
 *
 * A no-journal handle just has its reference dropped and 0 is returned.
 * Otherwise the handle is stopped with jbd2_journal_stop(); the result is
 * the error already recorded in handle->h_err, or failing that the return
 * value of jbd2_journal_stop().  If the handle was attached to a
 * transaction, a non-zero result is also reported via __ext4_std_error().
 */
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
        struct super_block *sb = NULL;
        bool had_transaction;
        int result;
        int stop_rc;

        if (!ext4_handle_valid(handle)) {
                ext4_put_nojournal(handle);
                return 0;
        }

        /* Everything needed from the handle is read before it is stopped. */
        result = handle->h_err;
        had_transaction = handle->h_transaction != NULL;
        if (had_transaction)
                sb = handle->h_transaction->t_journal->j_private;

        stop_rc = jbd2_journal_stop(handle);
        if (!result)
                result = stop_rc;

        if (had_transaction && result)
                __ext4_std_error(sb, where, line, result);
        return result;
}

/*
 * __ext4_journal_start_reserved - start a previously reserved handle
 *
 * For a no-journal handle a fresh no-journal handle is returned.
 * Otherwise the start checks are run against the handle's superblock; if
 * they fail the reservation is freed and the error returned as an ERR_PTR.
 * A failure of jbd2_journal_start_reserved() is likewise returned as an
 * ERR_PTR; on success @handle itself is returned.
 */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type)
{
        struct super_block *sb;
        int ret;

        if (!ext4_handle_valid(handle))
                return ext4_get_nojournal();

        sb = handle->h_journal->j_private;
        trace_ext4_journal_start_reserved(sb,
                                jbd2_handle_buffer_credits(handle), _RET_IP_);

        ret = ext4_journal_check_start(sb);
        if (ret < 0) {
                jbd2_journal_free_reserved(handle);
                return ERR_PTR(ret);
        }

        ret = jbd2_journal_start_reserved(handle, type, line);
        return ret < 0 ? ERR_PTR(ret) : handle;
}

/*
 * __ext4_journal_ensure_credits - make sure a handle has enough credits
 * @handle:      handle to check
 * @check_cred:  buffer credits that must be available to skip extending
 * @extend_cred: buffer credits wanted after extending
 * @revoke_cred: revoke credits wanted
 *
 * Returns 0 for a no-journal handle or when the handle already has at
 * least @check_cred buffer credits and @revoke_cred revoke credits, and
 * -EROFS for an aborted handle.  Otherwise returns the result of
 * ext4_journal_extend() called with the shortfalls (clamped at 0).
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred)
{
        if (!ext4_handle_valid(handle))
                return 0;
        if (is_handle_aborted(handle))
                return -EROFS;

        if (jbd2_handle_buffer_credits(handle) < check_cred ||
            handle->h_revoke_credits < revoke_cred) {
                /* Ask only for what is missing on top of the current credits. */
                extend_cred = max(0, extend_cred -
                                     jbd2_handle_buffer_credits(handle));
                revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
                return ext4_journal_extend(handle, extend_cred, revoke_cred);
        }

        return 0;
}

/*
 * Record @err on a journalled handle and abort it.
 *
 * The first error seen is kept in handle->h_err.  If the handle is not
 * already aborted, a message naming @caller:@line and @err_fn is logged
 * and the handle is aborted via jbd2.  BUGs on a no-journal handle.
 */
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
                                      const char *err_fn,
                                      struct buffer_head *bh,
                                      handle_t *handle, int err)
{
        char scratch[16];
        const char *reason = ext4_decode_error(NULL, err, scratch);

        BUG_ON(!ext4_handle_valid(handle));

        if (bh)
                BUFFER_TRACE(bh, "abort");

        if (!handle->h_err)
                handle->h_err = err;

        if (!is_handle_aborted(handle)) {
                printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
                       caller, line, reason, err_fn);

                jbd2_journal_abort_handle(handle);
        }
}

/*
 * Report a writeback error recorded on the block device's mapping.
 *
 * If the block device has write error flag, it may have failed to
 * async write out metadata buffers in the background. In this case,
 * we could read old data from disk and write it out again, which
 * may lead to on-disk filesystem inconsistency.
 *
 * A lockless errseq_check() filters the common no-error case; the
 * check-and-advance of the saved cursor is done under s_bdev_wb_lock.
 */
static void ext4_check_bdev_write_error(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping;
        int wb_err;

        if (!errseq_check(&bdev_mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err)))
                return;

        spin_lock(&sbi->s_bdev_wb_lock);
        wb_err = errseq_check_and_advance(&bdev_mapping->wb_err,
                                          &sbi->s_bdev_wb_err);
        spin_unlock(&sbi->s_bdev_wb_lock);

        if (wb_err)
                ext4_error_err(sb, -wb_err,
                               "Error while async write back metadata");
}

/*
 * __ext4_journal_get_write_access - get journal write access to a buffer
 *
 * First checks the buffer's block device for a recorded writeback error
 * (when the device has a superblock attached).  With a no-journal handle
 * nothing more is done and 0 is returned.  Otherwise the result of
 * jbd2_journal_get_write_access() is returned, aborting the handle on
 * failure.  May sleep.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh)
{
        int ret;

        might_sleep();

        if (bh->b_bdev->bd_super)
                ext4_check_bdev_write_error(bh->b_bdev->bd_super);

        if (!ext4_handle_valid(handle))
                return 0;

        ret = jbd2_journal_get_write_access(handle, bh);
        if (ret)
                ext4_journal_abort_handle(where, line, __func__, bh,
                                          handle, ret);
        return ret;
}

/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
 * If the handle isn't valid we're not journaling; the buffer is simply
 * dropped with bforget() and 0 is returned.
 *
 * Returns 0 on success or the error from jbd2_journal_forget() /
 * jbd2_journal_revoke(); on error the handle is aborted, and a failed
 * revoke additionally aborts the filesystem.
 */
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr)
{
        bool forget_only;
        int ret;

        might_sleep();

        trace_ext4_forget(inode, is_metadata, blocknr);
        BUFFER_TRACE(bh, "enter");

        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
                  "data mode %x\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* No journal: a bforget is all that is needed. */
        if (!ext4_handle_valid(handle)) {
                bforget(bh);
                return 0;
        }

        /*
         * The revoke function is never used under full data journaling:
         * there is no need to, and a V1 superblock won't support it.
         * Apart from that, the revoke is skipped only for un-journaled
         * data blocks.
         */
        forget_only =
                test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
                (!is_metadata && !ext4_should_journal_data(inode));

        if (forget_only) {
                if (!bh)
                        return 0;

                BUFFER_TRACE(bh, "call jbd2_journal_forget");
                ret = jbd2_journal_forget(handle, bh);
                if (ret)
                        ext4_journal_abort_handle(where, line, __func__,
                                                  bh, handle, ret);
                return ret;
        }

        /*
         * data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
        ret = jbd2_journal_revoke(handle, blocknr, bh);
        if (ret) {
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, ret);
                __ext4_abort(inode->i_sb, where, line, -ret,
                           "error %d when attempting revoke", ret);
        }
        BUFFER_TRACE(bh, "exit");
        return ret;
}

/*
 * __ext4_journal_get_create_access - get journal create access to a buffer
 *
 * Returns 0 for a no-journal handle.  Otherwise returns the result of
 * jbd2_journal_get_create_access(), aborting the handle on failure.
 */
int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct buffer_head *bh)
{
        int ret;

        if (!ext4_handle_valid(handle))
                return 0;

        ret = jbd2_journal_get_create_access(handle, bh);
        if (ret)
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, ret);
        return ret;
}

/*
 * Mark the metadata buffer @bh dirty on behalf of @handle.
 *
 * @where/@line identify the caller and are forwarded to the error reporting
 * helpers.  @inode may be NULL.
 *
 * The buffer is always flagged with set_buffer_meta() and set_buffer_prio().
 *
 * With a valid journal handle the buffer is handed to
 * jbd2_journal_dirty_metadata().  If that fails while the handle is not
 * aborted, a one-time warning is raised, ext4_journal_abort_handle() is
 * called and the failure is logged: through pr_err() when there is no
 * inode, through ext4_error_inode() otherwise.
 *
 * Without a valid handle the buffer is marked uptodate and dirty directly;
 * if the inode needs sync it is also written out synchronously.
 *
 * Returns 0 on success, the error from jbd2_journal_dirty_metadata(), or
 * -EIO when the buffer is not uptodate after the synchronous write.
 */
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh)
{
        int err = 0;

        might_sleep();

        set_buffer_meta(bh);
        set_buffer_prio(bh);
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
                /* Errors can only happen due to aborted journal or a nasty bug */
                if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
                        if (inode == NULL) {
                                /*
                                 * No inode to attribute the error to, so
                                 * log it directly.  The message is
                                 * newline-terminated so that it forms a
                                 * complete log record on its own.
                                 */
                                pr_err("EXT4: jbd2_journal_dirty_metadata "
                                       "failed: handle type %u started at "
                                       "line %u, credits %u/%u, errcode %d\n",
                                       handle->h_type,
                                       handle->h_line_no,
                                       handle->h_requested_credits,
                                       jbd2_handle_buffer_credits(handle), err);
                                return err;
                        }
                        ext4_error_inode(inode, where, line,
                                         bh->b_blocknr,
                                         "journal_dirty_metadata failed: "
                                         "handle type %u started at line %u, "
                                         "credits %u/%u, errcode %d",
                                         handle->h_type,
                                         handle->h_line_no,
                                         handle->h_requested_credits,
                                         jbd2_handle_buffer_credits(handle),
                                         err);
                }
        } else {
                /* No journal: dirty the buffer in the buffer cache directly. */
                set_buffer_uptodate(bh);
                if (inode)
                        mark_buffer_dirty_inode(bh, inode);
                else
                        mark_buffer_dirty(bh);
                if (inode && inode_needs_sync(inode)) {
                        sync_dirty_buffer(bh);
                        /* I/O was issued but the buffer is not uptodate */
                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
                                ext4_error_inode_err(inode, where, line,
                                                     bh->b_blocknr, EIO,
                                        "IO error syncing itable block");
                                err = -EIO;
                        }
                }
        }
        return err;
}

/*
 * Refresh the superblock checksum of @sb and mark the superblock buffer
 * dirty.
 *
 * Without a valid journal handle the buffer is simply marked dirty and 0 is
 * returned.  Otherwise it is passed to jbd2_journal_dirty_metadata(); a
 * failure is handed to ext4_journal_abort_handle() along with the caller's
 * @where/@line and then returned.
 */
int __ext4_handle_dirty_super(const char *where, unsigned int line,
                              handle_t *handle, struct super_block *sb)
{
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int ret;

        ext4_superblock_csum_set(sb);

        if (!ext4_handle_valid(handle)) {
                mark_buffer_dirty(sbh);
                return 0;
        }

        ret = jbd2_journal_dirty_metadata(handle, sbh);
        if (ret)
                ext4_journal_abort_handle(where, line, __func__,
                                          sbh, handle, ret);
        return ret;
}




































    3 





























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * Release a page that read_cache_pages() will not use.
 *
 * The caller of read_cache_pages() may have set PG_private or PG_fscache
 * before calling (NFS does this for pages that are cached locally on disk),
 * so the filesystem gets a chance to clean up: if the page has private
 * state it is locked, invalidated over its full PAGE_SIZE range and
 * unlocked again.  In every case one reference to the page is dropped.
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
                                             struct page *page)
{
        if (!page_has_private(page))
                goto release;

        /* Failing to take the page lock here is treated as a fatal bug. */
        if (!trylock_page(page))
                BUG();

        /* ->mapping is set only for the duration of the invalidate call. */
        page->mapping = mapping;
        do_invalidatepage(page, 0, PAGE_SIZE);
        page->mapping = NULL;
        unlock_page(page);

release:
        put_page(page);
}

/*
 * Empty the list @pages: each page is unlinked from the list and handed to
 * read_cache_pages_invalidate_page(), which invalidates it if needed and
 * drops a reference.
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
                                              struct list_head *pages)
{
        for (;;) {
                struct page *page;

                if (list_empty(pages))
                        break;

                page = lru_to_page(pages);
                list_del(&page->lru);
                read_cache_pages_invalidate_page(mapping, page);
        }
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Pages are removed from @pages one at a time.  A page that cannot be added
 * to the page cache is released and skipped.  As soon as @filler reports an
 * error, all pages still on the list are released and processing stops.
 *
 * Returns: %0 on success, error return by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
{
        int err;

        while (!list_empty(pages)) {
                struct page *page = lru_to_page(pages);

                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping, page->index,
                                readahead_gfp_mask(mapping)) != 0) {
                        /* Could not insert this one: clean it up, move on. */
                        read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
                put_page(page);

                err = filler(data, page);
                if (unlikely(err)) {
                        /* Give up on everything that is still queued. */
                        read_cache_pages_invalidate_pages(mapping, pages);
                        return err;
                }
                task_io_account_read(PAGE_SIZE);
        }
        return 0;
}

EXPORT_SYMBOL(read_cache_pages);

/*
 * Start reads for the batch of pages described by @rac.
 *
 * The mapping's ->readahead operation is used if present, else
 * ->readpages (which works from the list @pages), else ->readpage is
 * called once per page.  When @rac holds no pages, no read is issued.
 *
 * In all cases, if @skip_page is true, @rac->_index is advanced by one
 * additional page before returning.
 */
static void read_pages(struct readahead_control *rac, struct list_head *pages,
                bool skip_page)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct page *page;
        struct blk_plug plug;

        /* Empty batch: nothing to submit, only the optional index skip. */
        if (!readahead_count(rac))
                goto out;

        blk_start_plug(&plug);

        if (aops->readahead) {
                aops->readahead(rac);
                /* Clean up the remaining pages */
                while ((page = readahead_page(rac))) {
                        unlock_page(page);
                        put_page(page);
                }
        } else if (aops->readpages) {
                aops->readpages(rac->file, rac->mapping, pages,
                                readahead_count(rac));
                /* Clean up the remaining pages */
                put_pages_list(pages);
                /*
                 * ->readpages was given the list rather than @rac, so the
                 * batch is marked as consumed in @rac by hand.
                 */
                rac->_index += rac->_nr_pages;
                rac->_nr_pages = 0;
        } else {
                /* Fall back to reading the pages one by one. */
                while ((page = readahead_page(rac))) {
                        aops->readpage(rac->file, page);
                        put_page(page);
                }
        }

        blk_finish_plug(&plug);

        /* Both the list and the batch must be fully consumed by now. */
        BUG_ON(!list_empty(pages));
        BUG_ON(readahead_count(rac));

out:
        if (skip_page)
                rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        /* Holds the allocated pages when the mapping uses ->readpages. */
        LIST_HEAD(page_pool);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct page *page = xa_load(&mapping->i_pages, index + i);

                /*
                 * The index under consideration must directly follow the
                 * batch accumulated in @ractl so far.
                 */
                BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

                if (page && !xa_is_value(page)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl, &page_pool, true);
                        continue;
                }

                page = __page_cache_alloc(gfp_mask);
                /* Allocation failed: submit what has been gathered so far. */
                if (!page)
                        break;
                if (mapping->a_ops->readpages) {
                        /*
                         * ->readpages path: only record the index and queue
                         * the page on the private list; it is not inserted
                         * into the page cache here.
                         */
                        page->index = index + i;
                        list_add(&page->lru, &page_pool);
                } else if (add_to_page_cache_lru(page, mapping, index + i,
                                        gfp_mask) < 0) {
                        /*
                         * Insertion failed: drop the new page, submit the
                         * current batch and skip over this index.
                         */
                        put_page(page);
                        read_pages(ractl, &page_pool, true);
                        continue;
                }
                /* Flag the page that sits lookahead_size before the end. */
                if (i == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the page is not
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
        read_pages(ractl, &page_pool, false);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * The request is bounded by the file size: nothing is read for an empty
 * file or when the start index lies beyond the last page of the file, and
 * @nr_to_read is trimmed so that the read ends at that last page.
 */
void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        unsigned long first = readahead_index(ractl);
        loff_t size = i_size_read(ractl->mapping->host);
        pgoff_t last;        /* index of the page holding the final byte */

        if (!size)
                return;

        last = (size - 1) >> PAGE_SHIFT;
        if (first > last)
                return;

        /* Don't read past the page containing the last byte of the file */
        if (last - first < nr_to_read)
                nr_to_read = last - first + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages, index;

        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
                        !mapping->a_ops->readahead))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        index = readahead_index(ractl);
        max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
        while (nr_to_read) {
                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                ractl->_index = index;
                do_page_cache_ra(ractl, this_chunk, 0);

                index += this_chunk;
                nr_to_read -= this_chunk;
        }
}

/*
 * Compute the initial readahead window size, in the same unit as @size
 * and @max.
 *
 * @size is first rounded up to a power of two.  The result is then
 *   - multiplied by 4 if it is no larger than max / 32,
 *   - multiplied by 2 if it is no larger than max / 4,
 *   - replaced by @max otherwise.
 *
 * Example for max = 32: a request of 1 gives 4; requests of 2..8 give
 * 4..16; anything larger gives 32.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long pow2 = roundup_pow_of_two(size);

        if (pow2 <= max / 32)
                return pow2 * 4;
        if (pow2 <= max / 4)
                return pow2 * 2;
        return max;
}

/*
 *  Derive the next window size from the previous one stored in @ra:
 *  four times the previous size while it is below max / 16, twice the
 *  previous size while it is at most max / 2, and @max beyond that.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long prev = ra->size;

        if (prev >= max / 16) {
                if (prev > max / 2)
                        return max;
                return 2 * prev;
        }
        return 4 * prev;
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @index-1 to @index-@max.
 * The count is a conservative estimate of
 *         - the length of the sequential read sequence, or
 *         - the thrashing threshold in memory tight systems
 *
 * The page cache lookup is done under rcu_read_lock().
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        pgoff_t last = index - 1;
        pgoff_t miss;

        rcu_read_lock();
        miss = page_cache_prev_miss(mapping, last, max);
        rcu_read_unlock();

        return last - miss;
}

/*
 * Page cache context based read-ahead.
 *
 * Looks at how many pages immediately before @index are cached and, if
 * that history is longer than the request, sets up @ra for a readahead
 * window starting at @index.
 *
 * Returns 1 when @ra has been set up, 0 when the history is too short
 * (in which case @ra is left untouched).
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t history = count_history_pages(mapping, index, max);

        /* Too little history: this could just as well be a random read. */
        if (history <= req_size)
                return 0;

        /*
         * History reaches back to the beginning of the file: a strong
         * indication of a long-run stream (or whole-file-read).
         */
        if (history >= index)
                history *= 2;

        ra->start = index;
        ra->size = min(history + req_size, max);
        ra->async_size = 1;

        return 1;
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
/*
 * @ractl: readahead control; its current index is the page being accessed.
 * @ra: readahead state of the file.  It is updated in place on every path
 *      except the early return after a marker hit and the standalone
 *      random read case.
 * @hit_readahead_marker: passed as true by page_cache_async_ra() and as
 *      false by page_cache_sync_ra().
 * @req_size: size of the request, in pages.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct file_ra_state *ra, bool hit_readahead_marker,
                unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        unsigned long index = readahead_index(ractl);
        pgoff_t prev_index;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        if ((index == (ra->start + ra->size - ra->async_size) ||
             index == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked page without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (hit_readahead_marker) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                /*
                 * Give up without touching @ra.  NOTE(review): presumably
                 * this means no usable gap was found within max_pages --
                 * confirm against page_cache_next_miss().
                 */
                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;        /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        /* prev_pos is a byte position; convert it to a page index. */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        /*
         * Whatever the window holds beyond the request is the async part;
         * if the window is not larger than the request, all of it is.
         */
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        /* Issue the read for the window now described by @ra. */
        ractl->_index = ra->start;
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

void page_cache_sync_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long req_count)
{
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

        /*
         * Even if read-ahead is disabled, issue this request as read-ahead
         * as we'll need it to satisfy the requested range. The forced
         * read-ahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ra->ra_pages || blk_cgroup_congested()) {
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, ra, req_count);
                return;
        }

        /* do read-ahead */
        ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

/*
 * Asynchronous readahead entry point, called for a @page that carries the
 * readahead flag.
 *
 * Does nothing when readahead is disabled for the file or the page is
 * under writeback.  Otherwise the flag is cleared on @page and, unless the
 * inode or the block cgroup is congested, the on-demand readahead logic is
 * run with its marker-hit argument set.
 */
void page_cache_async_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, struct page *page,
                unsigned long req_count)
{
        /* Readahead is switched off for this file. */
        if (!ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (PageWriteback(page))
                return;

        ClearPageReadahead(page);

        /*
         * Defer asynchronous read-ahead on IO congestion, whether it is
         * reported for the inode or for the block cgroup.
         */
        if (inode_read_congested(ractl->mapping->host) ||
            blk_cgroup_congested())
                return;

        ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

/*
 * Kernel-side implementation of readahead(2).
 *
 * Returns -EBADF if @fd does not refer to a file opened for reading,
 * -EINVAL if the file has no mapping or address space operations or is
 * neither a regular file nor a block device, and otherwise the result of
 * vfs_fadvise() with POSIX_FADV_WILLNEED over [@offset, @offset + @count).
 */
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        struct fd f = fdget(fd);
        struct inode *inode;
        ssize_t ret;

        if (!f.file || !(f.file->f_mode & FMODE_READ)) {
                ret = -EBADF;
                goto out;
        }

        /*
         * The readahead() syscall is intended to run only on files
         * that can execute readahead. If readahead is not possible
         * on this file, then we must return -EINVAL.
         */
        if (!f.file->f_mapping || !f.file->f_mapping->a_ops) {
                ret = -EINVAL;
                goto out;
        }

        inode = file_inode(f.file);
        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) {
                ret = -EINVAL;
                goto out;
        }

        ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
        fdput(f);
        return ret;
}

/*
 * readahead system call entry point: forwards its three arguments
 * unchanged to ksys_readahead() and returns its result.
 */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}
































































































































































































   14 



































































    1 




































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SET_FS
/*
 * Point the uaccess routines at real userspace by switching to USER_DS,
 * regardless of any set_fs(KERNEL_DS) that may still be in effect.
 *
 * Returns the segment that was active before the switch; pass it to
 * force_uaccess_end() below to undo the change.
 */
static inline mm_segment_t force_uaccess_begin(void)
{
        const mm_segment_t saved = get_fs();

        set_fs(USER_DS);
        return saved;
}

/*
 * Undo force_uaccess_begin(): reinstate @oldfs, the value that
 * force_uaccess_begin() returned.
 */
static inline void force_uaccess_end(mm_segment_t oldfs)
{
        set_fs(oldfs);
}
#else /* CONFIG_SET_FS */
/*
 * Without CONFIG_SET_FS, mm_segment_t is an empty placeholder type.  It
 * lets force_uaccess_begin()/force_uaccess_end() keep the same prototypes
 * as in the CONFIG_SET_FS case.
 */
typedef struct {
        /* empty dummy */
} mm_segment_t;

/* Use TASK_SIZE when TASK_SIZE_MAX has not been defined already. */
#ifndef TASK_SIZE_MAX
#define TASK_SIZE_MAX                        TASK_SIZE
#endif

/* In this configuration uaccess_kernel() is constant false ... */
#define uaccess_kernel()                (false)
/* ... and the user address limit is the constant TASK_SIZE_MAX. */
#define user_addr_max()                        (TASK_SIZE_MAX)

static inline mm_segment_t force_uaccess_begin(void)
{
        return (mm_segment_t) { };
}

/* !CONFIG_SET_FS variant: nothing to restore, @oldfs is ignored. */
static inline void force_uaccess_end(mm_segment_t oldfs)
{
}
#endif /* CONFIG_SET_FS */

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), the "to" argument always points to kernel memory
 * and no faults on store should happen.  Interpretation of "from" is affected
 * by set_fs().  For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to each architecture whether it wants to
 * bother with that.  They should not be used directly; they are used to
 * implement the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user()
 * and __copy_{to,from}_user_inatomic()) that are used instead.  Out of
 * those, __... ones are inlined.  Plain
 * copy_{to,from}_user() might or might not be inlined.  If you want them
 * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * Copy @n bytes from user space to kernel space without calling
 * might_fault().  The caller must already have validated @from with
 * access_ok().  Nothing is zero-padded on a short copy, so the return value
 * (number of bytes left uncopied) must be checked.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long left;

        instrument_copy_from_user(to, from, n);
        check_object_size(to, n, false);
        left = raw_copy_from_user(to, from, n);

        return left;
}

/*
 * Copy @n bytes from user space to kernel space.  The caller must already
 * have validated @from with access_ok().  Returns the number of bytes left
 * uncopied; all of @n is reported as uncopied when should_fail_usercopy()
 * asks for a failure.  Nothing is zero-padded on a short copy.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        might_fault();

        if (!should_fail_usercopy()) {
                instrument_copy_from_user(to, from, n);
                check_object_size(to, n, false);
                n = raw_copy_from_user(to, from, n);
        }

        return n;
}

/**
 * __copy_to_user_inatomic - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  The caller must check the
 * specified block with access_ok() before calling this function, and
 * should also pin the user space address so that the copy does not take a
 * page fault and sleep.
 *
 * Return: the number of bytes that were not copied; @n when
 * should_fail_usercopy() asks for a failure.
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        unsigned long uncopied = n;

        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                uncopied = raw_copy_to_user(to, from, n);
        }

        return uncopied;
}

/*
 * Copy @n bytes from kernel space to user space.  The caller must already
 * have validated @to with access_ok().  Returns the number of bytes left
 * uncopied; all of @n is reported as uncopied when should_fail_usercopy()
 * asks for a failure.
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                return raw_copy_to_user(to, from, n);
        }

        return n;
}

#ifdef INLINE_COPY_FROM_USER
/*
 * Inline implementation behind copy_from_user(), built when asm/uaccess.h
 * defines INLINE_COPY_FROM_USER.  Checks @from with access_ok(), copies, and
 * zero-fills whatever part of @to could not be filled from user space.
 * Returns the number of bytes that were not copied.
 */
static inline __must_check unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(from, n))) {
                instrument_copy_from_user(to, from, n);
                left = raw_copy_from_user(to, from, n);
        }

        /* Short copy (or no copy at all): clear the uncopied tail of @to. */
        if (unlikely(left)) {
                void *tail = to + (n - left);

                memset(tail, 0, left);
        }

        return left;
}
#else
/*
 * Out-of-line _copy_from_user(), declared here when INLINE_COPY_FROM_USER
 * is not defined; the definition lives outside this header.
 */
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);
#endif

#ifdef INLINE_COPY_TO_USER
/*
 * Inline implementation behind copy_to_user(), built when asm/uaccess.h
 * defines INLINE_COPY_TO_USER.  Checks @to with access_ok() and copies.
 * Returns the number of bytes that were not copied; @n if the copy was
 * never attempted.
 */
static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (!should_fail_usercopy() && access_ok(to, n)) {
                instrument_copy_to_user(to, from, n);
                n = raw_copy_to_user(to, from, n);
        }

        return n;
}
#else
/*
 * Out-of-line _copy_to_user(), declared here when INLINE_COPY_TO_USER is
 * not defined; the definition lives outside this header.
 */
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);
#endif

/*
 * Copy @n bytes from user space to kernel space.  The copy is attempted
 * only if check_copy_size() accepts the destination; otherwise all of @n is
 * reported as uncopied.  Returns the number of bytes that were not copied.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        if (likely(check_copy_size(to, n, false)))
                left = _copy_from_user(to, from, n);

        return left;
}

/*
 * Copy @n bytes from kernel space to user space.  The copy is attempted
 * only if check_copy_size() accepts the source; otherwise all of @n is
 * reported as uncopied.  Returns the number of bytes that were not copied.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        unsigned long left = n;

        if (likely(check_copy_size(from, n, true)))
                left = _copy_to_user(to, from, n);

        return left;
}
#ifdef CONFIG_COMPAT
/*
 * Copy @n bytes between two user space buffers (CONFIG_COMPAT only).  Both
 * ranges must pass access_ok(); if either does not, nothing is copied and
 * @n is returned.  Otherwise returns the number of bytes left uncopied.
 */
static __always_inline unsigned long __must_check
copy_in_user(void __user *to, const void __user *from, unsigned long n)
{
        might_fault();

        if (!access_ok(to, n) || !access_ok(from, n))
                return n;

        return raw_copy_in_user(to, from, n);
}
#endif

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * The generic version is a plain memcpy() of @cnt bytes from @src to @dst
 * and always returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Increment the current task's pagefault_disabled nesting counter. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled++;
}

/* Decrement the current task's pagefault_disabled nesting counter. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled--;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 *
 * pagefault_disable() bumps the per-task counter; calls nest, and each one
 * must be matched by a pagefault_enable().
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/*
 * Counterpart of pagefault_disable(): drops the per-task counter by one.
 * The compiler barrier comes first so that accesses made inside the
 * disabled section are not moved past the decrement.
 */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Report whether the pagefault handler is currently disabled for this task,
 * i.e. whether its pagefault_disabled counter is non-zero.  While it is,
 * user access methods will not sleep.
 */
static inline bool pagefault_disabled(void)
{
        if (current->pagefault_disabled)
                return true;

        return false;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This macro should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 *
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, preempt_disable() is like a NOP, so the handler
 * won't be disabled, and in_atomic() will report different values depending
 * on CONFIG_PREEMPT_COUNT.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Generic fallback used when the architecture does not define
 * ARCH_HAS_NOCACHE_UACCESS: simply forwards to __copy_from_user_inatomic()
 * and returns its result (number of bytes left uncopied).
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        return __copy_from_user_inatomic(to, from, n);
}

#endif                /* ARCH_HAS_NOCACHE_UACCESS */

/*
 * Defined outside this header.  As used by copy_struct_from_user(): a
 * negative return is passed on as an error code, 0 is turned into -E2BIG
 * (non-zero bytes found), and a positive return lets the copy proceed.
 * NOTE(review): confirm the exact contract against the definition.
 */
extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Bytes both sides know about, and the excess on the larger side. */
        size_t common = min(ksize, usize);
        size_t trailing = max(ksize, usize) - common;

        /* Refuse a @ksize larger than the object size known for @dst. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize > ksize) {
                /* Newer userspace: every byte the kernel ignores must be 0. */
                int ret = check_zeroed_user(src + common, trailing);

                if (ret < 0)
                        return ret;
                if (ret == 0)
                        return -E2BIG;
        } else if (usize < ksize) {
                /* Older userspace: zero the fields it did not supply. */
                memset(dst + common, 0, trailing);
        }

        /* Copy the part of the struct that both sides share. */
        return copy_from_user(dst, src, common) ? -EFAULT : 0;
}

/*
 * Prototypes for the *_nofault() accessors; their implementations are not
 * part of this header.  The get_kernel_nofault() kerneldoc in this file
 * documents a result of 0 on success or -EFAULT for the wrapped
 * copy_from_kernel_nofault() call.
 * NOTE(review): the other helpers presumably use a similar negative-errno
 * convention - confirm against their definitions.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * Reads sizeof(@val) bytes from @ptr into @val by calling
 * copy_from_kernel_nofault().  @ptr is first assigned to a local pointer of
 * type "const typeof(@val) *", which lets the compiler check that @ptr
 * matches the type of @val.  The value of the macro is the return value of
 * copy_from_kernel_nofault().
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

/*
 * Generic fallbacks, used only when user_access_begin has not been defined
 * already: user_access_begin() is just access_ok(), user_access_end() does
 * nothing, and the unsafe_*() accessors wrap __get_user(), __put_user() and
 * __copy_to_user(), jumping to the label @e / @err when the wrapped
 * operation returns non-zero.  user_access_save() returns 0 and
 * user_access_restore() ignores its argument.
 */
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/* Write-only access section: defaults to the generic begin/end pair. */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
/* Read-only access section: defaults to the generic begin/end pair. */
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

/*
 * Reporting hooks that exist only with CONFIG_HARDENED_USERCOPY; both are
 * defined outside this header.  usercopy_abort() is marked __noreturn.
 */
#ifdef CONFIG_HARDENED_USERCOPY
void usercopy_warn(const char *name, const char *detail, bool to_user,
                   unsigned long offset, unsigned long len);
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */





































    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

/*
 * Tracepoint for freeing an sk_buff.
 *
 * Records the skb address, the @location pointer supplied by the caller and
 * skb->protocol converted with ntohs(), and prints all three.
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        void *,                location        )
                __field(        unsigned short,        protocol        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->protocol = ntohs(skb->protocol);
        ),

        TP_printk("skbaddr=%p protocol=%u location=%p",
                __entry->skbaddr, __entry->protocol, __entry->location)
);

/*
 * Tracepoint consume_skb: records and prints only the address of the
 * sk_buff passed in.
 */
TRACE_EVENT(consume_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,        skbaddr        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
        ),

        TP_printk("skbaddr=%p", __entry->skbaddr)
);

/*
 * Tracepoint skb_copy_datagram_iovec: records and prints the address of the
 * sk_buff together with the @len value supplied by the caller.
 */
TRACE_EVENT(skb_copy_datagram_iovec,

        TP_PROTO(const struct sk_buff *skb, int len),

        TP_ARGS(skb, len),

        TP_STRUCT__entry(
                __field(        const void *,                skbaddr                )
                __field(        int,                        len                )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = len;
        ),

        TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































    1 
































































    2 


































    2 


















































































































































    3 




































    2 








































    1 




























































    1 



    1 











    1 












































    1 
















    1 






































































































































































































































































































    3 













    9 












    3 








    1 




    2 




    1 












    1 






























    7 
   10 


































































    1 
















    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>
#include <linux/kernel.h>

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/*
 * Static initializer for a list_head called @name: both members are set to
 * the address of @name itself, which is the representation of an empty list.
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a struct list_head variable @name, initialized as an empty list. */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        /* ->next is stored with WRITE_ONCE(); list_empty() reads it with READ_ONCE(). */
        WRITE_ONCE(list->next, list);
        list->prev = list;
}

/*
 * Validation hooks called by __list_add() and __list_del_entry(); a false
 * return makes the caller skip the list operation.  With CONFIG_DEBUG_LIST
 * they are out-of-line functions defined outside this header; without it
 * they are inline stubs that always return true.
 */
#ifdef CONFIG_DEBUG_LIST
extern bool __list_add_valid(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next);
extern bool __list_del_entry_valid(struct list_head *entry);
#else
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * Nothing is modified if __list_add_valid() rejects the insertion.
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        /* prev->next is written last, after @new's own links are set. */
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Links @new in directly behind @head, i.e. between @head and whatever
 * currently follows it.  Handy for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        __list_add(new, head, first);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Links @new in directly in front of @head, i.e. between the entry that
 * currently precedes @head and @head itself.  Handy for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *last = head->prev;

        __list_add(new, last, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The removed entry itself is not touched here.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        next->prev = prev;
        WRITE_ONCE(prev->next, next);
}

/*
 * Unlink a list entry and set its 'prev' pointer to NULL.
 *
 * Special-purpose variant used by the networking code for lists allocated
 * as per-cpu, where the extra WRITE_ONCE() overhead of a regular
 * list_del_init() is unwanted.  Users must test the node's 'prev' pointer
 * instead of calling list_empty().
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        __list_del(before, after);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its list, provided __list_del_entry_valid() accepts
 * it.  The entry's own next/prev pointers are left as they were.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        /* Overwrite the stale links with the LIST_POISON1/LIST_POISON2 values. */
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * @old's own next/prev pointers are not reset by this function.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        new->next = old->next;
        new->next->prev = new;
        /*
         * old->prev is read only after the store above, so when @old is an
         * empty head (pointing at itself) @new ends up pointing at itself.
         */
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 * Afterwards @old is re-initialized to point to itself.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        /* Remember the entry in front of entry2 before unlinking it. */
        struct list_head *pos = entry2->prev;

        list_del(entry2);
        list_replace(entry1, entry2);
        /*
         * If entry1 was directly in front of entry2, it is no longer
         * linked; entry2 now sits in that spot, so insert after entry2.
         */
        if (pos == entry1)
                pos = entry2;
        list_add(entry1, pos);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * After the call @entry points to itself, so list_empty() on it is true.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * @list is unlinked from wherever it currently is and re-inserted directly
 * after @head.
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * @list is unlinked from wherever it currently is and re-inserted directly
 * before @head.
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap left by the [first, last] run. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Hook @first behind the entry currently in front of @head. */
        head->prev->next = first;
        first->prev = head->prev;

        /* Make @last the new entry directly in front of @head. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first - tests whether @list is the first entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Returns non-zero when the entry in front of @list is @head.
 */
static inline int list_is_first(const struct list_head *list,
                                        const struct list_head *head)
{
        const struct list_head *before = list->prev;

        return before == head;
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Returns non-zero when the entry following @list is @head.
 */
static inline int list_is_last(const struct list_head *list,
                                const struct list_head *head)
{
        const struct list_head *after = list->next;

        return after == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * A list is empty when @head's next pointer refers back to @head; the
 * pointer is sampled once with READ_ONCE().
 */
static inline int list_empty(const struct list_head *head)
{
        const struct list_head *first = READ_ONCE(head->next);

        return first == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        entry->prev = entry;
        /* Release store; list_empty_careful() loads ->next with acquire. */
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        /* Acquire load; list_del_init_careful() stores ->next with release. */
        struct list_head *next = smp_load_acquire(&head->next);
        /* Empty only if both next and prev refer back to @head. */
        return (next == head) && (next == head->prev);
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * Moves the first entry to the tail of the list.  An empty list is left
 * untouched.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 * Note that it is @head, not @list, that gets unlinked and re-inserted.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Deletes the list head from the list denoted by @head and
         * places it as the tail of @list, this effectively rotates the
         * list so that @list is at the front.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 *
 * Returns 1 when the list is non-empty and @head's next and prev refer to
 * the same entry, 0 otherwise.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Move the entries from head->next up to and including @entry onto @list,
 * overwriting @list's previous links; @head keeps whatever followed @entry.
 * No argument checking is done here.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* Saved first: entry->next is overwritten below. */
        struct list_head *new_first = entry->next;
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry;
        entry->next = list;
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * Moves the leading part of @head, up to and including @entry, from
 * @head to @list.  @entry should be an element known to be on @head.
 * @list should be an empty list, or one whose current contents may be
 * lost.
 *
 * Nothing is done when @head is empty, or when @head has a single entry
 * and @entry is neither that entry nor @head.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        if (list_is_singular(head) &&
                !(head->next == entry || head == entry))
                return;

        if (entry != head) {
                __list_cut_position(list, head, entry);
                return;
        }

        /* Cutting at the head itself: nothing moves, @list becomes empty. */
        INIT_LIST_HEAD(list);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list that receives the removed entries
 * @head: a list with entries
 * @entry: an entry within @head; may be @head itself
 *
 * Moves the leading part of @head, up to but excluding @entry, from
 * @head to @list.  @entry should be an element known to be on @head.
 * @list should be empty or a list whose current contents may be lost.
 *
 * If @entry == @head, every entry on @head is moved to @list.  If
 * @entry is already the first entry, @list is initialised as empty.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        struct list_head *first = head->next;
        struct list_head *last;

        /* Nothing precedes @entry: hand back an empty list. */
        if (first == entry) {
                INIT_LIST_HEAD(list);
                return;
        }

        list->next = first;
        first->prev = list;

        last = entry->prev;
        list->prev = last;
        last->next = list;

        head->next = entry;
        entry->prev = head;
}

/*
 * Link every entry of @list in between @prev and @next.  The callers in
 * this file pass two neighbouring nodes and only call this for a
 * non-empty @list.  The head @list itself is not written to.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *run_head = list->next;
        struct list_head *run_tail = list->prev;

        /* Front of the run attaches behind @prev ... */
        run_head->prev = prev;
        prev->next = run_head;

        /* ... and its end attaches in front of @next. */
        run_tail->next = next;
        next->prev = run_tail;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head.  An empty @list
 * is a no-op.  @list itself is not reinitialised.
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right before @head.  An empty
 * @list is a no-op.  @list itself is not reinitialised.
 */
static inline void list_splice_tail(struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head and @list is
 * then reinitialised.  An empty @list is left as it is.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.  The entries of @list are inserted
 * right before @head and @list is then reinitialised.  An empty @list
 * is left as it is.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Expands directly to container_of(@ptr, @type, @member).
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * The macro is list_entry() applied to @ptr->next and performs no
 * emptiness check of its own.
 */
#define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)

/**
 * list_last_entry - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * The macro is list_entry() applied to @ptr->prev and performs no
 * emptiness check of its own.
 */
#define list_last_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated once and its ->next pointer is loaded a single time
 * with READ_ONCE(); the list counts as empty when that pointer is the
 * head itself.
 */
#define list_first_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->next); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_next_entry - get the next element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * The result is not compared against the list head; callers that may
 * already be at the last entry have to check for that themselves.
 */
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_prev_entry - get the prev element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * The result is not compared against the list head; callers that may
 * already be at the first entry have to check for that themselves.
 */
#define list_prev_entry(pos, member) \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos may be any assignable expression (for example ``*cursor``); it is
 * parenthesised where it is dereferenced so that operator precedence
 * cannot change the meaning of the expansion.  When the loop runs to
 * completion, @pos equals @head.
 */
#define list_for_each(pos, head) \
        for (pos = (head)->next; pos != (head); pos = (pos)->next)

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position.
 *
 * @pos may be any assignable expression (for example ``*cursor``); it is
 * parenthesised where it is dereferenced.  The node @pos refers to on
 * entry is not visited itself.
 */
#define list_for_each_continue(pos, head) \
        for (pos = (pos)->next; pos != (head); pos = (pos)->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos may be any assignable expression (for example ``*cursor``); it is
 * parenthesised where it is dereferenced.  When the loop runs to
 * completion, @pos equals @head.
 */
#define list_for_each_prev(pos, head) \
        for (pos = (head)->prev; pos != (head); pos = (pos)->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The successor of @pos is stored in @n before the loop body runs, so the
 * body may unlink @pos.  @pos may be any assignable expression (for
 * example ``*cursor``); it is parenthesised where it is dereferenced.
 */
#define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = (pos)->next; pos != (head); \
                pos = n, n = (pos)->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The predecessor of @pos is stored in @n before the loop body runs, so
 * the body may unlink @pos.  @pos may be any assignable expression (for
 * example ``*cursor``); it is parenthesised where it is dereferenced.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = (pos)->prev; \
             pos != (head); \
             pos = n, n = (pos)->prev)

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to cursor
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * True when the address of @pos->member is @head.  @pos is parenthesised
 * so that cursor expressions such as ``*cursor`` expand correctly.
 */
#define list_entry_is_head(pos, head, member)                                \
        (&(pos)->member == (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * The loop ends once list_entry_is_head() holds for @pos, so after a
 * complete traversal @pos does not refer to a real entry and must not
 * be dereferenced.
 */
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Starts at list_last_entry() and steps with list_prev_entry().  After a
 * complete traversal @pos does not refer to a real entry.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for (pos = list_last_entry(head, typeof(*pos), member);                \
             !list_entry_is_head(pos, head, member);                         \
             pos = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 *
 * A non-NULL @pos is returned as is (GNU ``?:`` with omitted middle
 * operand).  A NULL @pos is replaced by list_entry() applied to @head
 * itself, i.e. a pseudo entry whose @member is the list head.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 *
 * The entry @pos refers to on entry is skipped: the first entry visited
 * is list_next_entry() of it.
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for (pos = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.
 *
 * The entry @pos refers to on entry is skipped: the first entry visited
 * is list_prev_entry() of it.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for (pos = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 *
 * There is no initialisation step: the entry @pos refers to on entry is
 * the first one visited.
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, continuing from current position.
 *
 * There is no initialisation step: the entry @pos refers to on entry is
 * the first one visited.
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * @n always holds the entry after @pos and is fetched before the loop
 * body runs, which is what allows the body to remove @pos.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
                n = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 *
 * The entry @pos refers to on entry is skipped; @n is fetched before the
 * loop body runs.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for (pos = list_next_entry(pos, member),                                 \
                n = list_next_entry(pos, member);                                \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 *
 * @pos is not reassigned by the initialisation step, so the entry it
 * refers to on entry is the first one visited.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                         \
        for (n = list_next_entry(pos, member);                                        \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 *
 * @n always holds the entry before @pos and is fetched before the loop
 * body runs.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for (pos = list_last_entry(head, typeof(*pos), member),                \
                n = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * The macro is a plain assignment: @n is reloaded with
 * list_next_entry(@pos, @member).
 */
#define list_safe_reset_next(pos, n, member)                                \
        n = list_next_entry(pos, member)

/*
 * Double linked lists with a single pointer list head.
 * Mostly useful for hash tables where the two pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initialiser for an empty struct hlist_head. */
#define HLIST_HEAD_INIT { .first = NULL }
/* Define a struct hlist_head variable called @name, initialised as empty. */
#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
/* Make the hlist_head that @ptr points at empty. */
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
/*
 * Reset both link fields of @h to NULL.  A NULL ->pprev is exactly the
 * condition hlist_unhashed() tests for.
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * Returns 1 when @h->pprev is NULL and 0 otherwise.
 *
 * Note that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return h->pprev ? 0 : 1;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * Same test as hlist_unhashed(), but ->pprev is loaded with READ_ONCE().
 * Use this variant in lockless contexts to avoid potential load-tearing;
 * the READ_ONCE() pairs with the WRITE_ONCE() calls in the hlist helpers
 * defined below.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        if (READ_ONCE(h->pprev))
                return 0;

        return 1;
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 *
 * Returns 1 when @h->first, loaded with READ_ONCE(), is NULL and 0
 * otherwise.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        if (READ_ONCE(h->first))
                return 0;

        return 1;
}

/*
 * Unlink @n: the slot that pointed at @n is redirected to @n's successor
 * and, if there is a successor, its back-pointer is set to that slot.
 * @n's own ->next and ->pprev are left as they were.
 */
static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *succ = n->next;
        struct hlist_node **slot = n->pprev;

        WRITE_ONCE(*slot, succ);
        if (!succ)
                return;

        WRITE_ONCE(succ->pprev, slot);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * Note that this function leaves the node in hashed state.  Use
 * hlist_del_init() or similar instead to unhash @n.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        /*
         * Overwrite the stale links with the poison values.  ->pprev is
         * set to LIST_POISON2 rather than NULL, which is why the node is
         * documented above as still being "hashed".
         */
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * A node that is already unhashed is left alone.  Otherwise it is
 * unlinked and reset with INIT_HLIST_NODE(), so the node ends up in
 * unhashed state either way.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head, in front of whatever was
 * first before.  This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *old_first = h->first;

        /* @n is linked in front of the previous first node, if any ... */
        WRITE_ONCE(n->next, old_first);
        if (old_first)
                WRITE_ONCE(old_first->pprev, &n->next);

        /* ... and then hooked up to the head. */
        WRITE_ONCE(h->first, n);
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        /* @n takes over the slot that currently points at @next. */
        WRITE_ONCE(n->pprev, next->pprev);
        WRITE_ONCE(n->next, next);
        /* @next is now reached through @n->next. */
        WRITE_ONCE(next->pprev, &n->next);
        /* Last step: make the inherited slot point at @n. */
        WRITE_ONCE(*(n->pprev), n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        WRITE_ONCE(n->pprev, &prev->next);

        /* Fix up the back-pointer of the old successor, if there was one. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * This makes @n appear to be its own predecessor on a headless hlist.
 * The point of this is to allow things like hlist_del() to work correctly
 * in cases where there is no list.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        /* Self-reference; this is the state hlist_fake() checks for. */
        n->pprev = &n->next;
}

/**
 * hlist_fake - Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * True when @h->pprev points at @h's own ->next field.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        return h->pprev == &h->next;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Check whether the node is the only node of the head without
 * accessing head, thus avoiding unnecessary cache misses: only the
 * address of @h->first is used, and only @n is read.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        /* A node with a successor cannot be the only one. */
        if (n->next)
                return false;

        return n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/*
 * hlist_entry - get the struct that embeds the hlist_node @ptr.
 * Expands directly to container_of(); @ptr is not checked for NULL.
 */
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)

/**
 * hlist_for_each - iterate over an hlist
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Walks from @head->first along ->next until a NULL pointer is reached,
 * so @pos is NULL after a complete traversal.  @pos may be any assignable
 * expression (for example ``*cursor``); it is parenthesised where it is
 * dereferenced.
 */
#define hlist_for_each(pos, head) \
        for (pos = (head)->first; pos ; pos = (pos)->next)

/**
 * hlist_for_each_safe - iterate over an hlist safe against removal of entry
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @n:                another &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 *
 * @pos->next is saved in @n before the loop body runs, so the body may
 * unlink @pos.  @pos may be any assignable expression (for example
 * ``*cursor``); it is parenthesised where it is dereferenced.
 */
#define hlist_for_each_safe(pos, n, head) \
        for (pos = (head)->first; pos && ({ n = (pos)->next; 1; }); \
             pos = n)

/*
 * hlist_entry_safe - NULL-tolerant variant of hlist_entry().
 * A NULL @ptr yields NULL instead of being passed to hlist_entry().
 * @ptr is evaluated exactly once.
 */
#define hlist_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration ends when hlist_entry_safe() yields NULL, so @pos is NULL
 * after a complete traversal.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The entry @pos refers to on entry is skipped; @pos is NULL after a
 * complete traversal.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * There is no initialisation step: the entry @pos refers to on entry is
 * the first one visited, and a NULL @pos means the body never runs.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (; pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos->member.next is saved in @n before the loop body runs, so the body
 * may remove @pos.  @pos is NULL after a complete traversal.  As in the
 * other hlist_for_each_entry*() macros, @pos is parenthesised wherever it
 * is dereferenced.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos && ({ n = (pos)->member.next; 1; });                        \
             pos = hlist_entry_safe(n, typeof(*(pos)), member))

#endif






































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__

#include <stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/cache.h>
#include <linux/ratelimit_types.h>

extern const char linux_banner[];
extern const char linux_proc_banner[];

extern int oops_in_progress;        /* If set, an oops, panic(), BUG() or die() is in progress */

#define PRINTK_MAX_SINGLE_HEADER_LEN 2

/*
 * Return the level character of a printk header at the start of
 * @buffer, or 0 when there is none.  A header is KERN_SOH_ASCII
 * followed by '0'..'7' or by 'c' (KERN_CONT).
 */
static inline int printk_get_level(const char *buffer)
{
        char level;

        if (buffer[0] != KERN_SOH_ASCII)
                return 0;

        /* A NUL here fails both tests below, like any other non-level. */
        level = buffer[1];
        if ((level >= '0' && level <= '7') || level == 'c')
                return level;

        return 0;
}

/*
 * Step over one printk level header (two characters) if @buffer starts
 * with one; otherwise return @buffer unchanged.
 */
static inline const char *printk_skip_level(const char *buffer)
{
        return printk_get_level(buffer) ? buffer + 2 : buffer;
}

/*
 * Step over every consecutive printk level header at the start of
 * @buffer and return a pointer to the first character after them.
 */
static inline const char *printk_skip_headers(const char *buffer)
{
        const char *text = buffer;

        while (printk_get_level(text))
                text = printk_skip_level(text);

        return text;
}

#define CONSOLE_EXT_LOG_MAX        8192

/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT

/* We show everything that is MORE important than this.. */
#define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN         1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_DEBUG        10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15        /* You can't shut this one up */

/*
 * Default used to be hard-coded at 7, quiet used to be hardcoded at 4,
 * we're now allowing both to be set from kernel config.
 */
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET         CONFIG_CONSOLE_LOGLEVEL_QUIET

extern int console_printk[];

/* Named accessors for the first four slots of console_printk[]. */
#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])

/* Set console_loglevel to CONSOLE_LOGLEVEL_SILENT (0). */
static inline void console_silent(void)
{
        console_loglevel = CONSOLE_LOGLEVEL_SILENT;
}

/*
 * Raise console_loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH, unless it is
 * currently 0, in which case it is left alone.
 */
static inline void console_verbose(void)
{
        if (!console_loglevel)
                return;

        console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
}

/* strlen("ratelimit") + 1 */
#define DEVKMSG_STR_MAX_SIZE 10
extern char devkmsg_log_str[];
struct ctl_table;

extern int suppress_printk;

/* A format string bundled with a pointer to its argument list. */
struct va_format {
        const char *fmt;
        va_list *va;
};

/*
 * FW_BUG
 * Add this to a message where you are sure the firmware is buggy or behaves
 * really stupid or out of spec. Be aware that the responsible BIOS developer
 * should be able to fix this issue or at least get a concrete idea of the
 * problem by reading your message without the need of looking at the kernel
 * code.
 *
 * Use it for definite and high priority BIOS bugs.
 *
 * FW_WARN
 * Use it for less clear-cut cases (e.g. could the kernel have messed things up already?)
 * and medium priority BIOS bugs.
 *
 * FW_INFO
 * Use this one if you want to tell the user or vendor about something
 * suspicious, but generally harmless related to the firmware.
 *
 * Use it for information or very low priority BIOS bugs.
 */
#define FW_BUG                "[Firmware Bug]: "
#define FW_WARN                "[Firmware Warn]: "
#define FW_INFO                "[Firmware Info]: "

/*
 * HW_ERR
 * Add this to a message for hardware errors, so that user can report
 * it to hardware vendor instead of LKML or software vendor.
 */
#define HW_ERR                "[Hardware Error]: "

/*
 * DEPRECATED
 * Add this to a message whenever you want to warn user space about the use
 * of a deprecated aspect of an API so they can stop using it
 */
#define DEPRECATED        "[Deprecated]: "

/*
 * Dummy printk for disabled debugging statements to use whilst maintaining
 * gcc's format checking.
 *
 * The printk() call sits behind "if (0)": it is still compiled, so the
 * format string and arguments are checked, but it is never executed.
 * The whole expression evaluates to 0.
 */
#define no_printk(fmt, ...)                                \
({                                                        \
        if (0)                                                \
                printk(fmt, ##__VA_ARGS__);                \
        0;                                                \
})

/*
 * early_printk() is only declared as a real function when
 * CONFIG_EARLY_PRINTK is set; otherwise it is an empty inline that
 * keeps the printf-style format attribute.
 */
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

/*
 * The printk NMI enter/exit hooks are external functions with
 * CONFIG_PRINTK_NMI and empty inline stubs without it.
 */
#ifdef CONFIG_PRINTK_NMI
extern void printk_nmi_enter(void);
extern void printk_nmi_exit(void);
extern void printk_nmi_direct_enter(void);
extern void printk_nmi_direct_exit(void);
#else
static inline void printk_nmi_enter(void) { }
static inline void printk_nmi_exit(void) { }
static inline void printk_nmi_direct_enter(void) { }
static inline void printk_nmi_direct_exit(void) { }
#endif /* CONFIG_PRINTK_NMI */

struct dev_printk_info;

/*
 * Core printk API.
 *
 * With CONFIG_PRINTK the functions and variables below are only declared
 * here (they are defined elsewhere).  The #else branch provides inline
 * stubs under the same names so that callers build unchanged when printk
 * support is compiled out.
 */
#ifdef CONFIG_PRINTK
/* va_list flavours: the format argument is checked, the arguments are not. */
asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
                 const struct dev_printk_info *dev_info,
                 const char *fmt, va_list args);

asmlinkage __printf(1, 0)
int vprintk(const char *fmt, va_list args);

/* The variadic entry point that the pr_*() macros in this header expand to. */
asmlinkage __printf(1, 2) __cold
int printk(const char *fmt, ...);

/*
 * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
 */
__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);

/*
 * Please don't use printk_ratelimit(), because it shares ratelimiting state
 * with all other unrelated printk_ratelimit() callsites.  Instead use
 * printk_ratelimited() or plain old __ratelimit().
 *
 * The macro form passes the calling function's name (__func__) along.
 */
extern int __printk_ratelimit(const char *func);
#define printk_ratelimit() __printk_ratelimit(__func__)
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                   unsigned int interval_msec);

extern int printk_delay_msec;
extern int dmesg_restrict;

extern int
devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf,
                          size_t *lenp, loff_t *ppos);

extern void wake_up_klogd(void);

char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
extern void printk_safe_flush(void);
extern void printk_safe_flush_on_panic(void);
#else
/*
 * !CONFIG_PRINTK: no-op stubs.  Those that return a value return 0, false
 * or NULL; the printf-style ones keep their __printf() annotation so that
 * format strings are still checked.  Note that printk_ratelimit() is a
 * real inline function here rather than a macro.
 */
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
{
        return 0;
}
static inline __printf(1, 2) __cold
int printk(const char *s, ...)
{
        return 0;
}
static inline __printf(1, 2) __cold
int printk_deferred(const char *s, ...)
{
        return 0;
}
static inline int printk_ratelimit(void)
{
        return 0;
}
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                          unsigned int interval_msec)
{
        return false;
}

static inline void wake_up_klogd(void)
{
}

static inline char *log_buf_addr_get(void)
{
        return NULL;
}

static inline u32 log_buf_len_get(void)
{
        return 0;
}

static inline void log_buf_vmcoreinfo_setup(void)
{
}

static inline void setup_log_buf(int early)
{
}

static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
{
}

static inline void dump_stack_print_info(const char *log_lvl)
{
}

static inline void show_regs_print_info(const char *log_lvl)
{
}

static inline void dump_stack(void)
{
}

static inline void printk_safe_flush(void)
{
}

static inline void printk_safe_flush_on_panic(void)
{
}
#endif

extern int kptr_restrict;

/**
 * pr_fmt - used by the pr_*() macros to generate the printk format string
 * @fmt: format string passed from a pr_*() macro
 *
 * This macro can be used to generate a unified format string for pr_*()
 * macros. A common use is to prefix all pr_*() messages in a file with a common
 * string. For example, defining this at the top of a source file:
 *
 *        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 *
 * would prefix all pr_info, pr_emerg... messages in the file with the module
 * name.
 *
 * The fallback below is the identity (the format string is used unchanged).
 * It is guarded by #ifndef, so a definition made before this point is the
 * one that takes effect.
 */
#ifndef pr_fmt
#define pr_fmt(fmt) fmt
#endif

/*
 * Per-loglevel printk wrappers, pr_emerg() through pr_info().
 *
 * Each one pastes the matching KERN_* prefix in front of pr_fmt(fmt) and
 * forwards to printk(), so the value of the expression is whatever printk()
 * returns.  The "##__VA_ARGS__" form lets them be called with a format
 * string only.
 */

/**
 * pr_emerg - Print an emergency-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_emerg(fmt, ...) \
        printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_alert - Print an alert-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_alert(fmt, ...) \
        printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_crit - Print a critical-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_crit(fmt, ...) \
        printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_err - Print an error-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_err(fmt, ...) \
        printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_warn - Print a warning-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt()
 * to generate the format string.
 */
#define pr_warn(fmt, ...) \
        printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_notice - Print a notice-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_notice(fmt, ...) \
        printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_info - Print an info-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_info(fmt, ...) \
        printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

/**
 * pr_cont - Continues a previous log message in the same line.
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CONT loglevel. It should only be
 * used when continuing a log message with no newline ('\n') enclosed. Otherwise
 * it defaults back to KERN_DEFAULT loglevel.
 *
 * Unlike the other pr_*() macros, @fmt is used as given: it is NOT passed
 * through pr_fmt(), so a per-file prefix is not repeated in the middle of
 * a line.
 */
#define pr_cont(fmt, ...) \
        printk(KERN_CONT fmt, ##__VA_ARGS__)

/**
 * pr_devel - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is
 * defined. Otherwise it does nothing.
 *
 * It uses pr_fmt() to generate the format string.
 *
 * The "does nothing" variant goes through no_printk(), so the format string
 * and its arguments are still compiled and type-checked, and the expression
 * evaluates to 0.
 */
#ifdef DEBUG
#define pr_devel(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif


/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>

/**
 * pr_debug - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is
 * set (or if CONFIG_DYNAMIC_DEBUG_CORE is set and the including file defines
 * DYNAMIC_DEBUG_MODULE). Otherwise, if DEBUG is defined, it's equivalent to
 * a printk with KERN_DEBUG loglevel. If DEBUG is not defined it does nothing
 * (it expands to no_printk(), which keeps the format checking).
 *
 * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses
 * pr_fmt() internally).
 */
#define pr_debug(fmt, ...)                        \
        dynamic_pr_debug(fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * Print a one-time message (analogous to WARN_ONCE() et al):
 *
 * Each expansion owns a static flag, placed in the ".data.once" section.
 * The message is printed, and the flag set, only when the flag is found
 * clear.  The statement expression evaluates to true when the flag was
 * clear on entry and to false otherwise (the result is wrapped in
 * unlikely()).
 *
 * printk_deferred_once() is the same but prints through printk_deferred().
 *
 * The flag is read and written with plain accesses in these macros.
 * NOTE(review): presumably concurrent first calls may therefore print more
 * than once - confirm whether any caller relies on strict once-only output.
 *
 * Without CONFIG_PRINTK both macros reduce to no_printk(), i.e. they only
 * type-check their arguments and evaluate to 0.
 */

#ifdef CONFIG_PRINTK
#define printk_once(fmt, ...)                                        \
({                                                                \
        static bool __section(".data.once") __print_once;        \
        bool __ret_print_once = !__print_once;                        \
                                                                \
        if (!__print_once) {                                        \
                __print_once = true;                                \
                printk(fmt, ##__VA_ARGS__);                        \
        }                                                        \
        unlikely(__ret_print_once);                                \
})
#define printk_deferred_once(fmt, ...)                                \
({                                                                \
        static bool __section(".data.once") __print_once;        \
        bool __ret_print_once = !__print_once;                        \
                                                                \
        if (!__print_once) {                                        \
                __print_once = true;                                \
                printk_deferred(fmt, ##__VA_ARGS__);                \
        }                                                        \
        unlikely(__ret_print_once);                                \
})
#else
#define printk_once(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...)                                \
        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Once-only counterparts of the pr_*() macros: each prepends its KERN_*
 * loglevel to pr_fmt(fmt) and hands the result to printk_once().
 */
#define pr_emerg_once(fmt, ...)                                        \
        printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_once(fmt, ...)                                        \
        printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_once(fmt, ...)                                        \
        printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_once(fmt, ...)                                        \
        printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_once(fmt, ...)                                        \
        printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_once(fmt, ...)                                \
        printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_once(fmt, ...)                                        \
        printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_once, don't do that... */

/* Active only when DEBUG is defined; otherwise a type-checked no-op. */
#if defined(DEBUG)
#define pr_devel_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * If you are writing a driver, please use dev_dbg instead.
 *
 * Note that, unlike pr_debug(), this macro is selected by DEBUG alone; the
 * dynamic debug configuration options are not consulted here.
 */
#if defined(DEBUG)
#define pr_debug_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * ratelimited messages with local ratelimit_state,
 * no local ratelimit_state used in the !PRINTK case
 *
 * With CONFIG_PRINTK every expansion defines its own static ratelimit state
 * (initialised with DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST)
 * and calls printk() only when __ratelimit() on that state returns non-zero.
 * The statement expression ends in an "if" statement, so it yields no value;
 * use it as a statement only.  The !PRINTK variant is no_printk(), which
 * evaluates to 0.
 */
#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...)                                        \
({                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
                                                                        \
        if (__ratelimit(&_rs))                                                \
                printk(fmt, ##__VA_ARGS__);                                \
})
#else
#define printk_ratelimited(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Rate-limited counterparts of the pr_*() macros: each prepends its KERN_*
 * loglevel to pr_fmt(fmt) and hands the result to printk_ratelimited(), so
 * every call site gets its own rate-limit state.
 */
#define pr_emerg_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_ratelimited, don't do that... */

/* Active only when DEBUG is defined; otherwise a type-checked no-op. */
#if defined(DEBUG)
#define pr_devel_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_ratelimited(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * If you are writing a driver, please use dev_dbg instead.
 *
 * pr_debug_ratelimited() has three variants, chosen like pr_debug():
 *  - dynamic debug available: a per-call-site dynamic debug descriptor and
 *    a per-call-site ratelimit state are defined, and the message is
 *    emitted through __dynamic_pr_debug() only if the descriptor's branch is
 *    taken AND __ratelimit() allows it;
 *  - otherwise, DEBUG defined: printk_ratelimited() at KERN_DEBUG;
 *  - otherwise: no_printk(), i.e. a type-checked no-op.
 * The first variant is a do { } while (0) statement and has no value.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...)                                        \
do {                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));                \
        if (DYNAMIC_DEBUG_BRANCH(descriptor) &&                                \
            __ratelimit(&_rs))                                                \
                __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);        \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_ratelimited(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/* File operations object, defined elsewhere. */
extern const struct file_operations kmsg_fops;

/*
 * Values for the prefix_type argument of the hex dump helpers: no prefix,
 * an address prefix, or an offset prefix on each output line.
 */
enum {
        DUMP_PREFIX_NONE,
        DUMP_PREFIX_ADDRESS,
        DUMP_PREFIX_OFFSET
};
/* Declared unconditionally, i.e. independent of CONFIG_PRINTK. */
extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
                              int groupsize, char *linebuf, size_t linebuflen,
                              bool ascii);
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
                           int prefix_type, int rowsize, int groupsize,
                           const void *buf, size_t len, bool ascii);
#else
/* !CONFIG_PRINTK: hex dumping is compiled out; both helpers do nothing. */
static inline void print_hex_dump(const char *level, const char *prefix_str,
                                  int prefix_type, int rowsize, int groupsize,
                                  const void *buf, size_t len, bool ascii)
{
}
/*
 * Note: a function-like macro named print_hex_dump_bytes() is also defined
 * further down in this header, for every configuration.
 */
static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                        const void *buf, size_t len)
{
}

#endif

/*
 * print_hex_dump_debug() - debug-level hex dump, selected like pr_debug():
 *  - dynamic debug available: forwards to dynamic_hex_dump();
 *  - otherwise, DEBUG defined: print_hex_dump() with level KERN_DEBUG;
 *  - otherwise: an empty inline function (arguments are still type-checked
 *    against the prototype but nothing is printed).
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,        \
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,        \
                         groupsize, buf, len, ascii)
#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,                \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,        \
                       groupsize, buf, len, ascii)
#else
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
                                        int rowsize, int groupsize,
                                        const void *buf, size_t len, bool ascii)
{
}
#endif

/**
 * print_hex_dump_bytes - shorthand form of print_hex_dump_debug() with
 *  default params
 * @prefix_str: string to prefix each line with;
 *  caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 *
 * Calls print_hex_dump_debug() with rowsize of 16, groupsize of 1, and ASCII
 * output included.  What that does depends on which print_hex_dump_debug()
 * variant is in effect: a dynamic debug hex dump, print_hex_dump() at
 * KERN_DEBUG log level, or nothing at all.
 */
#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)        \
        print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)

/*
 * printk_deferred_enter() / printk_deferred_exit()
 *
 * With CONFIG_PRINTK these are object-like macros that simply rename
 * __printk_safe_enter() / __printk_safe_exit(); without it they are empty
 * inline functions.
 */
#ifdef CONFIG_PRINTK
extern void __printk_safe_enter(void);
extern void __printk_safe_exit(void);
/*
 * The printk_deferred_enter/exit macros are available only as a hack for
 * some code paths that need to defer all printk console printing. Interrupts
 * must be disabled for the deferred duration.
 */
#define printk_deferred_enter __printk_safe_enter
#define printk_deferred_exit __printk_safe_exit
#else
static inline void printk_deferred_enter(void)
{
}
static inline void printk_deferred_exit(void)
{
}
#endif

#endif

























































































































































































    4 

    4 
    4 

    4 

    3 
    4 
    2 
    2 
    2 


    4 

    2 


    2 

    2 














    2 

    4 
    4 

    4 

    1 
    2 
    1 
    1 
    1 


    1 

    1 


    1 

    1 



















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
/*
 * Implementation of the access vector table type.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */

/* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Copyright (C) 2003 Tresys Technology, LLC
 *        This program is free software; you can redistribute it and/or modify
 *        it under the terms of the GNU General Public License as published by
 *        the Free Software Foundation, version 2.
 *
 * Updated: Yuichi Nakamura <ynakam@hitachisoft.jp>
 *        Tuned number of hash slots for avtab to reduce memory usage
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include "avtab.h"
#include "policydb.h"

/*
 * Slab caches used by this file: one for struct avtab_node objects and one
 * for the extended-permission payloads attached to AVTAB_XPERMS nodes.
 * NOTE(review): they are not created in this part of the file - confirm
 * they are set up before the first insertion.
 */
static struct kmem_cache *avtab_node_cachep;
static struct kmem_cache *avtab_xperms_cachep;

/*
 * One mixing round: scramble the 32-bit word @v and fold it into the
 * running hash state @hash.  Returns the new state.
 */
static inline u32 avtab_hash_mix(u32 hash, u32 v)
{
        v *= 0xcc9e2d51;
        v = (v << 15) | (v >> (32 - 15));
        v *= 0x1b873593;

        hash ^= v;
        hash = (hash << 13) | (hash >> (32 - 13));
        return hash * 5 + 0xe6546b64;
}

/*
 * Map an access vector table key to a bucket index.
 *
 * Based on MurmurHash3, written by Austin Appleby and placed in the
 * public domain.
 *
 * The target class, target type and source type of @keyp are mixed into
 * the hash in that order; key->specified does not take part.  The result
 * is reduced with a bitwise AND against @mask.
 */
static inline int avtab_hash(struct avtab_key *keyp, u32 mask)
{
        u32 hash = 0;

        hash = avtab_hash_mix(hash, keyp->target_class);
        hash = avtab_hash_mix(hash, keyp->target_type);
        hash = avtab_hash_mix(hash, keyp->source_type);

        /* final bit mixing so that every input bit affects the low bits */
        hash ^= hash >> 16;
        hash *= 0x85ebca6b;
        hash ^= hash >> 13;
        hash *= 0xc2b2ae35;
        hash ^= hash >> 16;

        return hash & mask;
}

/*
 * Allocate a node holding a copy of @key and @datum and link it into bucket
 * @hvalue of @h: directly after @prev, or at the head of the bucket when
 * @prev is NULL.  For AVTAB_XPERMS keys the extended permissions that
 * @datum points to are copied into a separately allocated object.
 *
 * @cur is accepted for the callers' convenience but is not used.
 *
 * Returns the new node, or NULL if either allocation fails (nothing is
 * linked or leaked in that case).  h->nel is incremented on success.
 */
static struct avtab_node*
avtab_insert_node(struct avtab *h, int hvalue,
                  struct avtab_node *prev, struct avtab_node *cur,
                  struct avtab_key *key, struct avtab_datum *datum)
{
        struct avtab_node **link;
        struct avtab_node *node;

        node = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL);
        if (!node)
                return NULL;
        node->key = *key;

        if (!(key->specified & AVTAB_XPERMS)) {
                node->datum.u.data = datum->u.data;
        } else {
                struct avtab_extended_perms *copy;

                copy = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL);
                if (!copy) {
                        kmem_cache_free(avtab_node_cachep, node);
                        return NULL;
                }
                *copy = *(datum->u.xperms);
                node->datum.u.xperms = copy;
        }

        /* splice in through whichever pointer currently leads to the successor */
        link = prev ? &prev->next : &h->htable[hvalue];
        node->next = *link;
        *link = node;

        h->nel++;
        return node;
}

static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum)
{
        int hvalue;
        struct avtab_node *prev, *cur, *newnode;
        u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);

        if (!h || !h->nslot)
                return -EINVAL;

        hvalue = avtab_hash(key, h->mask);
        for (prev = NULL, cur = h->htable[hvalue];
             cur;
             prev = cur, cur = cur->next) {
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class == cur->key.target_class &&
                    (specified & cur->key.specified)) {
                        /* extended perms may not be unique */
                        if (specified & AVTAB_XPERMS)
                                break;
                        return -EEXIST;
                }
                if (key->source_type < cur->key.source_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type < cur->key.target_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class < cur->key.target_class)
                        break;
        }

        newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum);
        if (!newnode)
                return -ENOMEM;

        return 0;
}

/* Unlike avtab_insert(), this function allow multiple insertions of the same
 * key/specified mask into the table, as needed by the conditional avtab.
 * It also returns a pointer to the node inserted.
 */
struct avtab_node *
avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum)
{
        int hvalue;
        struct avtab_node *prev, *cur;
        u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);

        if (!h || !h->nslot)
                return NULL;
        hvalue = avtab_hash(key, h->mask);
        for (prev = NULL, cur = h->htable[hvalue];
             cur;
             prev = cur, cur = cur->next) {
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class == cur->key.target_class &&
                    (specified & cur->key.specified))
                        break;
                if (key->source_type < cur->key.source_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type < cur->key.target_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class < cur->key.target_class)
                        break;
        }
        return avtab_insert_node(h, hvalue, prev, cur, key, datum);
}

struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key)
{
        int hvalue;
        struct avtab_node *cur;
        u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);

        if (!h || !h->nslot)
                return NULL;

        hvalue = avtab_hash(key, h->mask);
        for (cur = h->htable[hvalue]; cur;
             cur = cur->next) {
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class == cur->key.target_class &&
                    (specified & cur->key.specified))
                        return &cur->datum;

                if (key->source_type < cur->key.source_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type < cur->key.target_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class < cur->key.target_class)
                        break;
        }

        return NULL;
}

/* This search function returns a node pointer, and can be used in
 * conjunction with avtab_search_next_node()
 */
struct avtab_node*
avtab_search_node(struct avtab *h, struct avtab_key *key)
{
        int hvalue;
        struct avtab_node *cur;
        u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);

        if (!h || !h->nslot)
                return NULL;

        hvalue = avtab_hash(key, h->mask);
        for (cur = h->htable[hvalue]; cur;
             cur = cur->next) {
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class == cur->key.target_class &&
                    (specified & cur->key.specified))
                        return cur;

                if (key->source_type < cur->key.source_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type < cur->key.target_type)
                        break;
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class < cur->key.target_class)
                        break;
        }
        return NULL;
}

struct avtab_node*
avtab_search_node_next(struct avtab_node *node, int specified)
{
        struct avtab_node *cur;

        if (!node)
                return NULL;

        specified &= ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
        for (cur = node->next; cur; cur = cur->next) {
                if (node->key.source_type == cur->key.source_type &&
                    node->key.target_type == cur->key.target_type &&
                    node->key.target_class == cur->key.target_class &&
                    (specified & cur->key.specified))
                        return cur;

                if (node->key.source_type < cur->key.source_type)
                        break;
                if (node->key.source_type == cur->key.source_type &&
                    node->key.target_type < cur->key.target_type)
                        break;
                if (node->key.source_type == cur->key.source_type &&
                    node->key.target_type == cur->key.target_type &&
                    node->key.target_class < cur->key.target_class)
                        break;
        }
        return NULL;
}

/*
 * Free every node of @h (including the extended permission objects owned
 * by AVTAB_XPERMS nodes), then the bucket array, and leave @h in the same
 * empty state avtab_init() produces.  A NULL @h is ignored.
 */
void avtab_destroy(struct avtab *h)
{
        int slot;

        if (!h)
                return;

        for (slot = 0; slot < h->nslot; slot++) {
                struct avtab_node *node = h->htable[slot];

                while (node) {
                        /* fetch the successor before the node is freed */
                        struct avtab_node *next = node->next;

                        if (node->key.specified & AVTAB_XPERMS)
                                kmem_cache_free(avtab_xperms_cachep,
                                                node->datum.u.xperms);
                        kmem_cache_free(avtab_node_cachep, node);
                        node = next;
                }
        }

        kvfree(h->htable);
        h->htable = NULL;
        h->nslot = 0;
        h->mask = 0;
        h->nel = 0;
}

/* Put @h into the empty state: no bucket array, no slots, no entries. */
void avtab_init(struct avtab *h)
{
        h->nel = 0;
        h->mask = 0;
        h->nslot = 0;
        h->htable = NULL;
}

/*
 * Allocate a zeroed bucket array of @nslot pointers for @h and record the
 * slot count and the hash mask (@nslot - 1).
 *
 * A request for zero slots is a successful no-op that leaves @h untouched.
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int avtab_alloc_common(struct avtab *h, u32 nslot)
{
        if (nslot == 0)
                return 0;

        h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL);
        if (h->htable == NULL)
                return -ENOMEM;

        h->mask = nslot - 1;
        h->nslot = nslot;
        return 0;
}

/*
 * Size and allocate the bucket array of @h for a table expected to hold
 * @nrules rules.
 *
 * The slot count is a power of two: 2 raised to (1 + the number of
 * significant bits in nrules / 8), capped at MAX_AVTAB_HASH_BUCKETS.
 * With @nrules == 0 nothing is allocated.
 *
 * Returns 0 on success or the error from avtab_alloc_common().
 */
int avtab_alloc(struct avtab *h, u32 nrules)
{
        u32 nslot = 0;

        if (nrules) {
                u32 shift = 1;
                u32 rest;
                int rc;

                for (rest = nrules >> 3; rest; rest >>= 1)
                        shift++;

                nslot = 1 << shift;
                if (nslot > MAX_AVTAB_HASH_BUCKETS)
                        nslot = MAX_AVTAB_HASH_BUCKETS;

                rc = avtab_alloc_common(h, nslot);
                if (rc)
                        return rc;
        }

        pr_debug("SELinux: %d avtab hash slots, %d rules.\n", nslot, nrules);
        return 0;
}

int avtab_alloc_dup(struct avtab *new, const struct avtab *orig)
{
        return avtab_alloc_common(new, orig->nslot);
}

/*
 * Emit a debug line describing how well the entries of @h are spread over
 * its buckets: entry count, buckets in use, the longest chain, and the sum
 * of the squared chain lengths.  @tag is printed as a prefix.
 */
void avtab_hash_eval(struct avtab *h, char *tag)
{
        int i, chain_len, slots_used, max_chain_len;
        unsigned long long chain2_len_sum;
        struct avtab_node *cur;

        slots_used = 0;
        max_chain_len = 0;
        chain2_len_sum = 0;
        for (i = 0; i < h->nslot; i++) {
                cur = h->htable[i];
                if (cur) {
                        slots_used++;
                        chain_len = 0;
                        while (cur) {
                                chain_len++;
                                cur = cur->next;
                        }

                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                        /*
                         * Widen before multiplying: squaring in int would
                         * overflow for chains longer than 46340 nodes.
                         */
                        chain2_len_sum +=
                                (unsigned long long)chain_len * chain_len;
                }
        }

        pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, "
               "longest chain length %d sum of chain length^2 %llu\n",
               tag, h->nel, slots_used, h->nslot, max_chain_len,
               chain2_len_sum);
}

/*
 * Specifier bits in the order avtab_read_item() scans them.  For the old
 * policy format this is also the order in which the per-specifier datum
 * words are consumed.  The table is only ever read, hence const.
 */
static const uint16_t spec_order[] = {
        AVTAB_ALLOWED,
        AVTAB_AUDITDENY,
        AVTAB_AUDITALLOW,
        AVTAB_TRANSITION,
        AVTAB_CHANGE,
        AVTAB_MEMBER,
        AVTAB_XPERMS_ALLOWED,
        AVTAB_XPERMS_AUDITALLOW,
        AVTAB_XPERMS_DONTAUDIT
};

/*
 * Read one access vector table entry from @fp and pass it to @insertf.
 *
 * @a:       table handed through to @insertf
 * @fp:      policy input, consumed with next_entry()
 * @pol:     policy database; supplies the policy version and is used to
 *           validate type and class values in the current format
 * @insertf: callback invoked as insertf(@a, &key, &datum, @p)
 * @p:       opaque cookie forwarded to @insertf
 *
 * Two on-disk layouts are handled:
 *  - policy versions below POLICYDB_VERSION_AVTAB: a 32-bit word count
 *    followed by that many 32-bit words; one entry may carry several
 *    specifiers, and @insertf is called once per specifier bit found;
 *  - later versions: four 16-bit key fields followed by either an extended
 *    permissions record or a single 32-bit datum; @insertf is called once.
 *
 * Returns 0 on success, the error from next_entry() or @insertf, or
 * -EINVAL when the entry is malformed.
 */
int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
                    int (*insertf)(struct avtab *a, struct avtab_key *k,
                                   struct avtab_datum *d, void *p),
                    void *p)
{
        __le16 buf16[4];
        u16 enabled;
        u32 items, items2, val, vers = pol->policyvers;
        struct avtab_key key;
        struct avtab_datum datum;
        struct avtab_extended_perms xperms;
        __le32 buf32[ARRAY_SIZE(xperms.perms.p)];
        int i, rc;
        unsigned set;

        memset(&key, 0, sizeof(struct avtab_key));
        memset(&datum, 0, sizeof(struct avtab_datum));

        /* Old layout: everything is stored as 32-bit words. */
        if (vers < POLICYDB_VERSION_AVTAB) {
                /* first word: number of 32-bit words that follow */
                rc = next_entry(buf32, fp, sizeof(u32));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                items2 = le32_to_cpu(buf32[0]);
                /* the words are read into buf32, so they must fit in it */
                if (items2 > ARRAY_SIZE(buf32)) {
                        pr_err("SELinux: avtab: entry overflow\n");
                        return -EINVAL;

                }
                rc = next_entry(buf32, fp, sizeof(u32)*items2);
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                items = 0;

                /*
                 * Key fields are 16 bits wide in memory; reject any stored
                 * value that does not survive the narrowing.
                 */
                val = le32_to_cpu(buf32[items++]);
                key.source_type = (u16)val;
                if (key.source_type != val) {
                        pr_err("SELinux: avtab: truncated source type\n");
                        return -EINVAL;
                }
                val = le32_to_cpu(buf32[items++]);
                key.target_type = (u16)val;
                if (key.target_type != val) {
                        pr_err("SELinux: avtab: truncated target type\n");
                        return -EINVAL;
                }
                val = le32_to_cpu(buf32[items++]);
                key.target_class = (u16)val;
                if (key.target_class != val) {
                        pr_err("SELinux: avtab: truncated target class\n");
                        return -EINVAL;
                }

                /* fourth word: specifier bits plus the old enabled flag */
                val = le32_to_cpu(buf32[items++]);
                enabled = (val & AVTAB_ENABLED_OLD) ? AVTAB_ENABLED : 0;

                if (!(val & (AVTAB_AV | AVTAB_TYPE))) {
                        pr_err("SELinux: avtab: null entry\n");
                        return -EINVAL;
                }
                if ((val & AVTAB_AV) &&
                    (val & AVTAB_TYPE)) {
                        pr_err("SELinux: avtab: entry has both access vectors and types\n");
                        return -EINVAL;
                }
                if (val & AVTAB_XPERMS) {
                        pr_err("SELinux: avtab: entry has extended permissions\n");
                        return -EINVAL;
                }

                /*
                 * One datum word follows for each specifier bit that is
                 * set, in spec_order order; each becomes its own insert.
                 */
                for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
                        if (val & spec_order[i]) {
                                key.specified = spec_order[i] | enabled;
                                datum.u.data = le32_to_cpu(buf32[items++]);
                                rc = insertf(a, &key, &datum, p);
                                if (rc)
                                        return rc;
                        }
                }

                /* words consumed must match the count stored in the entry */
                if (items != items2) {
                        pr_err("SELinux: avtab: entry only had %d items, expected %d\n",
                               items2, items);
                        return -EINVAL;
                }
                return 0;
        }

        /* Current layout: four 16-bit key fields come first. */
        rc = next_entry(buf16, fp, sizeof(u16)*4);
        if (rc) {
                pr_err("SELinux: avtab: truncated entry\n");
                return rc;
        }

        items = 0;
        key.source_type = le16_to_cpu(buf16[items++]);
        key.target_type = le16_to_cpu(buf16[items++]);
        key.target_class = le16_to_cpu(buf16[items++]);
        key.specified = le16_to_cpu(buf16[items++]);

        if (!policydb_type_isvalid(pol, key.source_type) ||
            !policydb_type_isvalid(pol, key.target_type) ||
            !policydb_class_isvalid(pol, key.target_class)) {
                pr_err("SELinux: avtab: invalid type or class\n");
                return -EINVAL;
        }

        /*
         * Exactly one specifier bit from spec_order must be set; the same
         * message is printed whether none or several are set.
         */
        set = 0;
        for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
                if (key.specified & spec_order[i])
                        set++;
        }
        if (!set || set > 1) {
                pr_err("SELinux:  avtab:  more than one specifier\n");
                return -EINVAL;
        }

        if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) &&
                        (key.specified & AVTAB_XPERMS)) {
                pr_err("SELinux:  avtab:  policy version %u does not "
                                "support extended permissions rules and one "
                                "was specified\n", vers);
                return -EINVAL;
        } else if (key.specified & AVTAB_XPERMS) {
                /* extended permissions: two single bytes, then the bitmap words */
                memset(&xperms, 0, sizeof(struct avtab_extended_perms));
                rc = next_entry(&xperms.specified, fp, sizeof(u8));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                rc = next_entry(&xperms.driver, fp, sizeof(u8));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                rc = next_entry(buf32, fp, sizeof(u32)*ARRAY_SIZE(xperms.perms.p));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++)
                        xperms.perms.p[i] = le32_to_cpu(buf32[i]);
                /*
                 * NOTE(review): this points at a stack local, so insertf
                 * presumably copies the record rather than keeping the
                 * pointer - confirm against the insertf implementations.
                 */
                datum.u.xperms = &xperms;
        } else {
                rc = next_entry(buf32, fp, sizeof(u32));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                datum.u.data = le32_to_cpu(*buf32);
        }
        /* for type rules the datum is itself a type value and must be valid */
        if ((key.specified & AVTAB_TYPE) &&
            !policydb_type_isvalid(pol, datum.u.data)) {
                pr_err("SELinux: avtab: invalid type\n");
                return -EINVAL;
        }
        return insertf(a, &key, &datum, p);
}

/*
 * Adapter giving avtab_insert() the callback signature expected by
 * avtab_read_item(); the cookie @p is not used.
 */
static int avtab_insertf(struct avtab *a, struct avtab_key *k,
                         struct avtab_datum *d, void *p)
{
        int rc = avtab_insert(a, k, d);

        return rc;
}

/*
 * Load a whole access vector table from @fp into @a.
 *
 * The stream starts with a 32-bit entry count, which must be non-zero,
 * followed by that many entries, each parsed by avtab_read_item() and
 * inserted through avtab_insertf().
 *
 * Returns 0 on success.  On any failure @a is torn down with
 * avtab_destroy() and the error code is returned.
 */
int avtab_read(struct avtab *a, void *fp, struct policydb *pol)
{
        __le32 raw_count[1];
        u32 count, idx;
        int err;

        err = next_entry(raw_count, fp, sizeof(u32));
        if (err < 0) {
                pr_err("SELinux: avtab: truncated table\n");
                goto fail;
        }

        count = le32_to_cpu(raw_count[0]);
        if (count == 0) {
                pr_err("SELinux: avtab: table is empty\n");
                err = -EINVAL;
                goto fail;
        }

        err = avtab_alloc(a, count);
        if (err)
                goto fail;

        for (idx = 0; idx < count; idx++) {
                err = avtab_read_item(a, fp, pol, avtab_insertf, NULL);
                if (!err)
                        continue;

                /* only these two errors get a message of their own here */
                if (err == -ENOMEM)
                        pr_err("SELinux: avtab: out of memory\n");
                else if (err == -EEXIST)
                        pr_err("SELinux: avtab: duplicate entry\n");
                goto fail;
        }

        return 0;

fail:
        avtab_destroy(a);
        return err;
}

int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
{
        __le16 buf16[4];
        __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)];
        int rc;
        unsigned int i;

        buf16[0] = cpu_to_le16(cur->key.source_type);
        buf16[1] = cpu_to_le16(cur->key.target_type);
        buf16[2] = cpu_to_le16(cur->key.target_class);
        buf16[3] = cpu_to_le16(cur->key.specified);
        rc = put_entry(buf16, sizeof(u16), 4, fp);
        if (rc)
                return rc;

        if (cur->key.specified & AVTAB_XPERMS) {
                rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp);
                if (rc)
                        return rc;
                rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp);
                if (rc)
                        return rc;
                for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++)
                        buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]);
                rc = put_entry(buf32, sizeof(u32),
                                ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp);
        } else {
                buf32[0] = cpu_to_le32(cur->datum.u.data);
                rc = put_entry(buf32, sizeof(u32), 1, fp);
        }
        if (rc)
                return rc;
        return 0;
}

/*
 * Write the whole table @a to @fp: the entry count as a little-endian
 * 32-bit word, then every node, bucket by bucket and in chain order.
 *
 * Returns 0 on success or the first error from put_entry() or
 * avtab_write_item().
 */
int avtab_write(struct policydb *p, struct avtab *a, void *fp)
{
        __le32 count[1];
        struct avtab_node *node;
        unsigned int slot;
        int err;

        count[0] = cpu_to_le32(a->nel);
        err = put_entry(count, sizeof(u32), 1, fp);
        if (err)
                return err;

        for (slot = 0; slot < a->nslot; slot++) {
                node = a->htable[slot];
                while (node) {
                        err = avtab_write_item(p, node, fp);
                        if (err)
                                return err;
                        node = node->next;
                }
        }

        return 0;
}

/*
 * Create the two slab caches used by this file: one for table nodes and
 * one for extended permissions records.  Both are created with SLAB_PANIC.
 */
void __init avtab_cache_init(void)
{
        /* cache backing struct avtab_node allocations */
        avtab_node_cachep =
                kmem_cache_create("avtab_node", sizeof(struct avtab_node),
                                  0, SLAB_PANIC, NULL);

        /* cache backing struct avtab_extended_perms allocations */
        avtab_xperms_cachep =
                kmem_cache_create("avtab_extended_perms",
                                  sizeof(struct avtab_extended_perms),
                                  0, SLAB_PANIC, NULL);
}


















































































































































    1 



    1 






    2 



    2 













































































































































































    1 


















    1 


    1 











































































    1 





    1 





    1 







    1 
    4 































    1 








    1 





    1 











    1 

    1 










    1 
    1 




    1 











    1 














































































    1 
    1 
    1 


























































































































































    1 


    1 
    1 
    1 









    1 
    1 






























































































    1 










































































































    1 






    1 








    1 







    1 



    1 




    1 





    1 




    1 







    1 

    1 


    1 


    1 










    3 



    3 










































































































    1 
    1 











    1 
    1 
    1 





























    1 




    1 



    1 











































    1 

    1 














    1 


    1 





    1 

    1 






    1 







    1 








































































































































































































































































    1 







    1 
    1 








    1 
    1 

    1 















































































    2 

    2 
    2 


















    2 

    2 

    2 





    2 



    2 
















    2 













    2 
    2 

    2 



    2 


    2 






    2 

    1 

    1 







































    1 


    1 


    1 

















    1 


    1 














    1 
    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/*
 * Walk the iovec array of @i for up to @n bytes, starting @skip bytes into
 * the first segment.  For every non-empty piece, @__v is set to that piece
 * (base and length) and STEP is evaluated; its value is taken as the number
 * of bytes of the piece that were NOT processed.  The walk stops at the
 * first piece that is not fully processed, or when @n bytes are done.
 *
 * On exit @n holds the number of bytes actually processed, @__p points at
 * the last segment visited and @skip is the offset reached inside it.
 * @i itself is not modified.
 */
#define iterate_iovec(i, n, __v, __p, skip, STEP) {        \
        size_t left;                                        \
        size_t wanted = n;                                \
        __p = i->iov;                                        \
        __v.iov_len = min(n, __p->iov_len - skip);        \
        if (likely(__v.iov_len)) {                        \
                __v.iov_base = __p->iov_base + skip;        \
                left = (STEP);                                \
                __v.iov_len -= left;                        \
                skip += __v.iov_len;                        \
                n -= __v.iov_len;                        \
        } else {                                        \
                left = 0;                                \
        }                                                \
        while (unlikely(!left && n)) {                        \
                __p++;                                        \
                __v.iov_len = min(n, __p->iov_len);        \
                if (unlikely(!__v.iov_len))                \
                        continue;                        \
                __v.iov_base = __p->iov_base;                \
                left = (STEP);                                \
                __v.iov_len -= left;                        \
                skip = __v.iov_len;                        \
                n -= __v.iov_len;                        \
        }                                                \
        n = wanted - n;                                        \
}

/*
 * Walk the kvec array of @i for @n bytes, starting @skip bytes into the
 * first segment.  For every non-empty piece, @__v is set to that piece and
 * STEP is evaluated; its value is discarded, so every piece counts as
 * fully processed and the walk only ends once @n bytes have been covered.
 *
 * On exit @n is restored to its original value, @__p points at the last
 * segment visited and @skip is the offset reached inside it.
 * @i itself is not modified.
 */
#define iterate_kvec(i, n, __v, __p, skip, STEP) {        \
        size_t wanted = n;                                \
        __p = i->kvec;                                        \
        __v.iov_len = min(n, __p->iov_len - skip);        \
        if (likely(__v.iov_len)) {                        \
                __v.iov_base = __p->iov_base + skip;        \
                (void)(STEP);                                \
                skip += __v.iov_len;                        \
                n -= __v.iov_len;                        \
        }                                                \
        while (unlikely(n)) {                                \
                __p++;                                        \
                __v.iov_len = min(n, __p->iov_len);        \
                if (unlikely(!__v.iov_len))                \
                        continue;                        \
                __v.iov_base = __p->iov_base;                \
                (void)(STEP);                                \
                skip = __v.iov_len;                        \
                n -= __v.iov_len;                        \
        }                                                \
        n = wanted;                                        \
}

/*
 * Walk the bio_vec array of @i with for_each_bvec(), using a start
 * position of segment 0, @skip bytes already done in it, and @n bytes to
 * cover.  For every non-empty piece, @__v holds that piece and STEP is
 * evaluated; its value is discarded.
 *
 * Neither @n, @skip nor @i is modified; @__bi is the iterator driven by
 * for_each_bvec() and stays available to the caller afterwards.
 */
#define iterate_bvec(i, n, __v, __bi, skip, STEP) {        \
        struct bvec_iter __start;                        \
        __start.bi_size = n;                                \
        __start.bi_bvec_done = skip;                        \
        __start.bi_idx = 0;                                \
        for_each_bvec(__v, i->bvec, __bi, __start) {        \
                if (!__v.bv_len)                        \
                        continue;                        \
                (void)(STEP);                                \
        }                                                \
}

/*
 * Run a step expression over up to @n bytes of @i without advancing @i.
 *
 * Nothing happens when @n is zero.  Otherwise the walker is chosen from the
 * flag bits in i->type: ITER_BVEC uses iterate_bvec() with step B,
 * ITER_KVEC uses iterate_kvec() with step K, ITER_DISCARD does nothing, and
 * anything else is walked as an iovec with iterate_iovec() and step I.
 * @v is declared here as the per-piece variable the step expressions see
 * (struct bio_vec, struct kvec or struct iovec respectively).
 *
 * In the iovec case @n is left holding the number of bytes processed (see
 * iterate_iovec); in the other cases @n is unchanged.
 */
#define iterate_all_kinds(i, n, v, I, B, K) {                        \
        if (likely(n)) {                                        \
                size_t skip = i->iov_offset;                        \
                if (unlikely(i->type & ITER_BVEC)) {                \
                        struct bio_vec v;                        \
                        struct bvec_iter __bi;                        \
                        iterate_bvec(i, n, v, __bi, skip, (B))        \
                } else if (unlikely(i->type & ITER_KVEC)) {        \
                        const struct kvec *kvec;                \
                        struct kvec v;                                \
                        iterate_kvec(i, n, v, kvec, skip, (K))        \
                } else if (unlikely(i->type & ITER_DISCARD)) {        \
                } else {                                        \
                        const struct iovec *iov;                \
                        struct iovec v;                                \
                        iterate_iovec(i, n, v, iov, skip, (I))        \
                }                                                \
        }                                                        \
}

/*
 * Run a step expression over up to @n bytes of @i and advance @i past the
 * bytes that were processed.
 *
 * @n is first clamped to i->count; nothing further happens when i->count
 * is zero.  The walker is chosen from the flag bits in i->type exactly as
 * in iterate_all_kinds (B for ITER_BVEC, K for ITER_KVEC, I for the iovec
 * fallback); for ITER_DISCARD no step is run and only the offset moves.
 *
 * Afterwards the segment pointer, i->nr_segs, i->count and i->iov_offset
 * are updated.  For kvec and iovec, a segment that was consumed exactly to
 * its end is stepped over so the stored offset restarts at 0.
 */
#define iterate_and_advance(i, n, v, I, B, K) {                        \
        if (unlikely(i->count < n))                                \
                n = i->count;                                        \
        if (i->count) {                                                \
                size_t skip = i->iov_offset;                        \
                if (unlikely(i->type & ITER_BVEC)) {                \
                        const struct bio_vec *bvec = i->bvec;        \
                        struct bio_vec v;                        \
                        struct bvec_iter __bi;                        \
                        iterate_bvec(i, n, v, __bi, skip, (B))        \
                        i->bvec = __bvec_iter_bvec(i->bvec, __bi);        \
                        i->nr_segs -= i->bvec - bvec;                \
                        skip = __bi.bi_bvec_done;                \
                } else if (unlikely(i->type & ITER_KVEC)) {        \
                        const struct kvec *kvec;                \
                        struct kvec v;                                \
                        iterate_kvec(i, n, v, kvec, skip, (K))        \
                        if (skip == kvec->iov_len) {                \
                                kvec++;                                \
                                skip = 0;                        \
                        }                                        \
                        i->nr_segs -= kvec - i->kvec;                \
                        i->kvec = kvec;                                \
                } else if (unlikely(i->type & ITER_DISCARD)) {        \
                        skip += n;                                \
                } else {                                        \
                        const struct iovec *iov;                \
                        struct iovec v;                                \
                        iterate_iovec(i, n, v, iov, skip, (I))        \
                        if (skip == iov->iov_len) {                \
                                iov++;                                \
                                skip = 0;                        \
                        }                                        \
                        i->nr_segs -= iov - i->iov;                \
                        i->iov = iov;                                \
                }                                                \
                i->count -= n;                                        \
                i->iov_offset = skip;                                \
        }                                                        \
}

/*
 * Copy @n bytes from kernel memory @from to user memory @to.
 *
 * Returns @n unchanged when should_fail_usercopy() fires or the
 * access_ok() check rejects the destination; otherwise returns whatever
 * raw_copy_to_user() reports (callers in this file treat the result as the
 * number of bytes left uncopied).
 */
static int copyout(void __user *to, const void *from, size_t n)
{
        size_t remaining = n;

        if (should_fail_usercopy())
                return remaining;
        if (!access_ok(to, n))
                return remaining;

        instrument_copy_to_user(to, from, n);
        remaining = raw_copy_to_user(to, from, n);
        return remaining;
}

/*
 * Copy @n bytes from user memory @from to kernel memory @to.
 *
 * Returns @n unchanged when should_fail_usercopy() fires or the
 * access_ok() check rejects the source; otherwise returns whatever
 * raw_copy_from_user() reports (callers in this file treat the result as
 * the number of bytes left uncopied).
 */
static int copyin(void *to, const void __user *from, size_t n)
{
        size_t remaining = n;

        if (should_fail_usercopy())
                return remaining;
        if (!access_ok(from, n))
                return remaining;

        instrument_copy_from_user(to, from, n);
        remaining = raw_copy_from_user(to, from, n);
        return remaining;
}

/*
 * Copy up to @bytes bytes from @page, starting at @offset within it, into
 * the user iovec segments of @i.
 *
 * @bytes is clamped to i->count.  The copy moves from segment to segment
 * and stops at the first copyout() that leaves bytes uncopied.  On return
 * i->iov, i->nr_segs, i->iov_offset and i->count have been advanced past
 * the bytes that were copied.
 *
 * Returns the number of bytes copied, which may be less than requested.
 */
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t skip, copy, left, wanted;
        const struct iovec *iov;
        char __user *buf;
        void *kaddr, *from;

        if (unlikely(bytes > i->count))
                bytes = i->count;

        if (unlikely(!bytes))
                return 0;

        might_fault();
        wanted = bytes;
        iov = i->iov;
        skip = i->iov_offset;
        buf = iov->iov_base + skip;
        copy = min(bytes, iov->iov_len - skip);

        /*
         * With CONFIG_HIGHMEM, and only if faulting in the first destination
         * chunk reported no error, try the copy under an atomic mapping.
         */
        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
                kaddr = kmap_atomic(page);
                from = kaddr + offset;

                /* first chunk, usually the only one */
                left = copyout(buf, from, copy);
                copy -= left;
                skip += copy;
                from += copy;
                bytes -= copy;

                /* keep going through later segments while copies complete */
                while (unlikely(!left && bytes)) {
                        iov++;
                        buf = iov->iov_base;
                        copy = min(bytes, iov->iov_len);
                        left = copyout(buf, from, copy);
                        copy -= left;
                        skip = copy;
                        from += copy;
                        bytes -= copy;
                }
                if (likely(!bytes)) {
                        kunmap_atomic(kaddr);
                        goto done;
                }
                /*
                 * A copy came up short: record how far the atomic attempt
                 * got so the non-atomic path below resumes from there.
                 */
                offset = from - kaddr;
                buf += copy;
                kunmap_atomic(kaddr);
                copy = min(bytes, iov->iov_len - skip);
        }
        /* Too bad - revert to non-atomic kmap */

        kaddr = kmap(page);
        from = kaddr + offset;
        left = copyout(buf, from, copy);
        copy -= left;
        skip += copy;
        from += copy;
        bytes -= copy;
        while (unlikely(!left && bytes)) {
                iov++;
                buf = iov->iov_base;
                copy = min(bytes, iov->iov_len);
                left = copyout(buf, from, copy);
                copy -= left;
                skip = copy;
                from += copy;
                bytes -= copy;
        }
        kunmap(page);

done:
        /* a segment consumed exactly to its end is stepped over */
        if (skip == iov->iov_len) {
                iov++;
                skip = 0;
        }
        /* bytes now holds what was NOT copied; wanted - bytes is the total done */
        i->count -= wanted - bytes;
        i->nr_segs -= iov - i->iov;
        i->iov = iov;
        i->iov_offset = skip;
        return wanted - bytes;
}

/*
 * Copy up to @bytes bytes from the user-space segments described by @i into
 * @page, starting @offset bytes into the page.
 *
 * The request is clamped to i->count.  When CONFIG_HIGHMEM is enabled and
 * fault_in_pages_readable() returns 0 for the first chunk, the copy is first
 * attempted under kmap_atomic(); if that does not transfer everything, the
 * remainder is copied under a regular kmap().  In every other case only the
 * kmap() path is used.
 *
 * On return @i (count, nr_segs, iov, iov_offset) has been moved past the
 * bytes that were copied.  Returns the number of bytes copied, which is
 * short when copyin() reports bytes it could not copy or the loop stops
 * early.
 */
static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t skip, copy, left, wanted;
        const struct iovec *iov;
        char __user *buf;
        void *kaddr, *to;

        /* never copy more than the iterator still describes */
        if (unlikely(bytes > i->count))
                bytes = i->count;

        if (unlikely(!bytes))
                return 0;

        might_fault();
        wanted = bytes;         /* remembered to compute the amount copied at the end */
        iov = i->iov;
        skip = i->iov_offset;   /* bytes of the current segment already consumed */
        buf = iov->iov_base + skip;
        copy = min(bytes, iov->iov_len - skip);

        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
                kaddr = kmap_atomic(page);
                to = kaddr + offset;

                /* first chunk, usually the only one */
                left = copyin(to, buf, copy);
                /* copyin()'s result is treated as the number of bytes NOT copied */
                copy -= left;
                skip += copy;
                to += copy;
                bytes -= copy;

                /* keep going through the following segments while copies are complete */
                while (unlikely(!left && bytes)) {
                        iov++;
                        buf = iov->iov_base;
                        copy = min(bytes, iov->iov_len);
                        left = copyin(to, buf, copy);
                        copy -= left;
                        skip = copy;
                        to += copy;
                        bytes -= copy;
                }
                if (likely(!bytes)) {
                        kunmap_atomic(kaddr);
                        goto done;
                }
                /*
                 * Short copy under the atomic mapping: record where we
                 * stopped in both the page and the user segment, then retry
                 * the rest below under kmap().
                 */
                offset = to - kaddr;
                buf += copy;
                kunmap_atomic(kaddr);
                copy = min(bytes, iov->iov_len - skip);
        }
        /* Too bad - revert to non-atomic kmap */

        kaddr = kmap(page);
        to = kaddr + offset;
        left = copyin(to, buf, copy);
        copy -= left;
        skip += copy;
        to += copy;
        bytes -= copy;
        while (unlikely(!left && bytes)) {
                iov++;
                buf = iov->iov_base;
                copy = min(bytes, iov->iov_len);
                left = copyin(to, buf, copy);
                copy -= left;
                skip = copy;
                to += copy;
                bytes -= copy;
        }
        kunmap(page);

done:
        /* a fully consumed segment is stepped over so iov_offset restarts at 0 */
        if (skip == iov->iov_len) {
                iov++;
                skip = 0;
        }
        /* wanted - bytes is the amount actually transferred */
        i->count -= wanted - bytes;
        i->nr_segs -= iov - i->iov;
        i->iov = iov;
        i->iov_offset = skip;
        return wanted - bytes;
}

#ifdef PIPE_PARANOIA
/*
 * Check that a pipe-backed iterator is positioned consistently with its
 * pipe.  With a non-zero iov_offset the iterator must sit on the last
 * occupied buffer, exactly at the end of its data; with a zero iov_offset
 * it must sit right after the last buffer.
 *
 * Returns true when consistent.  Otherwise dumps the iterator position and
 * every ring slot, warns, and returns false.
 */
static bool sanity(const struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int pipe_head = pipe->head;
        unsigned int pipe_tail = pipe->tail;
        unsigned int ring_mask = pipe->ring_size - 1;
        unsigned int in_use = pipe_occupancy(pipe_head, pipe_tail);
        unsigned int iter_head = i->head;
        unsigned int slot;
        bool ok;

        if (!i->iov_offset) {
                /* must be right after the last buffer */
                ok = (iter_head == pipe_head);
        } else {
                const struct pipe_buffer *last = &pipe->bufs[iter_head & ring_mask];

                /*
                 * pipe must be non-empty, we must be at the last buffer,
                 * and at the end of its segment (checked in that order)
                 */
                ok = in_use != 0 &&
                     iter_head == pipe_head - 1 &&
                     last->offset + last->len == i->iov_offset;
        }
        if (ok)
                return true;

        printk(KERN_ERR "idx = %d, offset = %zd\n", iter_head, i->iov_offset);
        printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
                        pipe_head, pipe_tail, pipe->ring_size);
        for (slot = 0; slot < pipe->ring_size; slot++) {
                const struct pipe_buffer *b = &pipe->bufs[slot];

                printk(KERN_ERR "[%p %p %d %d]\n",
                        b->ops, b->page, b->offset, b->len);
        }
        WARN_ON(1);
        return false;
}
#else
#define sanity(i) true
#endif

/*
 * Add @bytes bytes of @page, starting at @offset, to the pipe behind @i by
 * taking a reference on the page rather than copying its contents.
 *
 * If the data is contiguous with what the iterator's current buffer already
 * holds (same page, @offset equal to the iterator offset) that buffer is
 * simply extended.  Otherwise a new ring slot is filled in, unless the pipe
 * is full.
 *
 * Returns the number of bytes accounted (clamped to i->count), or 0 when
 * nothing could be added.
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int head = i->head;
        struct pipe_buffer *slot;
        size_t cur;

        if (unlikely(bytes > i->count))
                bytes = i->count;
        if (unlikely(!bytes))
                return 0;
        if (!sanity(i))
                return 0;

        cur = i->iov_offset;
        slot = &pipe->bufs[head & mask];
        if (cur) {
                if (cur == offset && slot->page == page) {
                        /* merge with the last one */
                        slot->len += bytes;
                        i->iov_offset += bytes;
                        i->count -= bytes;
                        return bytes;
                }
                /* current slot is in use by something else: take the next */
                head++;
                slot = &pipe->bufs[head & mask];
        }
        if (pipe_full(head, tail, pipe->max_usage))
                return 0;

        get_page(page);
        slot->ops = &page_cache_pipe_buf_ops;
        slot->flags = 0;
        slot->page = page;
        slot->offset = offset;
        slot->len = bytes;

        pipe->head = head + 1;
        i->head = head;
        i->iov_offset = offset + bytes;
        i->count -= bytes;
        return bytes;
}

/*
 * Fault in the user memory behind one or more iovecs of @i, covering at most
 * @bytes bytes.  fault_in_pages_readable() is invoked on each piece handed
 * out by iterate_iovec().
 *
 * Iterators that are not user-space iovec iterators are left untouched.
 *
 * Return 0 on success, or the non-zero result of fault_in_pages_readable()
 * if the memory could not be accessed (i.e. because it is an invalid
 * address).
 */
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
        size_t skip = i->iov_offset;
        const struct iovec *iov;
        struct iovec v;
        int err;

        if (!iter_is_iovec(i))
                return 0;

        iterate_iovec(i, bytes, v, iov, skip, ({
                err = fault_in_pages_readable(v.iov_base, v.iov_len);
                if (unlikely(err))
                        return err;
                0;
        }))
        return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

/*
 * Initialise @i over the @nr_segs segments of @iov, describing @count bytes
 * in total, positioned at the very start.
 *
 * @direction must be READ or WRITE; any other bits trigger a warning and
 * are masked off.  When uaccess_kernel() is true the segments are recorded
 * as kernel vectors (ITER_KVEC), otherwise as user vectors (ITER_IOVEC).
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        direction &= READ | WRITE;

        i->nr_segs = nr_segs;
        i->iov_offset = 0;
        i->count = count;

        /* It will get better.  Eventually... */
        if (!uaccess_kernel()) {
                i->type = ITER_IOVEC | direction;
                i->iov = iov;
                return;
        }
        i->type = ITER_KVEC | direction;
        i->kvec = (struct kvec *)iov;
}
EXPORT_SYMBOL(iov_iter_init);

/*
 * Zero @len bytes of @page starting @offset bytes in, through a temporary
 * kmap_atomic() mapping.
 */
static void memzero_page(struct page *page, size_t offset, size_t len)
{
        char *kaddr = kmap_atomic(page);

        memset(&kaddr[offset], 0, len);
        kunmap_atomic(kaddr);
}

static inline bool allocated(struct pipe_buffer *buf)
{
        return buf->ops == &default_pipe_buf_ops;
}

/*
 * Work out where the next byte written through the pipe iterator @i goes.
 *
 * Normally that is the iterator's current slot and offset.  If the offset is
 * non-zero but the slot cannot take more data (it is not one of our own
 * allocated() buffers, or it is already filled to PAGE_SIZE), writing starts
 * at offset 0 of the following slot instead.
 *
 * The slot index is stored in *@iter_headp and the offset in *@offp.
 */
static inline void data_start(const struct iov_iter *i,
                              unsigned int *iter_headp, size_t *offp)
{
        unsigned int mask = i->pipe->ring_size - 1;
        unsigned int head = i->head;
        size_t off = i->iov_offset;
        bool next_slot = false;

        if (off)
                next_slot = !allocated(&i->pipe->bufs[head & mask]) ||
                            off == PAGE_SIZE;
        if (next_slot) {
                head++;
                off = 0;
        }
        *iter_headp = head;
        *offp = off;
}

/*
 * Make room in the pipe behind @i for up to @size bytes (clamped to
 * i->count), starting at the position reported by data_start().
 *
 * Any free space left in the current partially filled buffer is used first;
 * after that fresh pages are allocated into successive ring slots until the
 * request is covered, the pipe is full, or a page allocation fails.  Buffer
 * lengths and pipe->head are updated here; the iterator itself is not
 * advanced.
 *
 * The starting slot and offset are stored in *@iter_headp and *@offp.
 * Returns the number of bytes for which room was made (possibly fewer than
 * requested, 0 if none).
 */
static size_t push_pipe(struct iov_iter *i, size_t size,
                        int *iter_headp, size_t *offp)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int head;
        size_t start_off;
        ssize_t remaining;

        if (unlikely(size > i->count))
                size = i->count;
        if (unlikely(!size))
                return 0;

        data_start(i, &head, &start_off);
        *iter_headp = head;
        *offp = start_off;

        remaining = size;
        if (start_off) {
                struct pipe_buffer *partial = &pipe->bufs[head & mask];

                /* use up what is left of the partially filled page */
                remaining -= PAGE_SIZE - start_off;
                if (remaining <= 0) {
                        /* the whole request fits into it */
                        partial->len += size;
                        return size;
                }
                partial->len = PAGE_SIZE;
                head++;
        }

        /* remaining is positive here; add whole pages until covered */
        while (remaining && !pipe_full(head, tail, pipe->max_usage)) {
                struct page *page = alloc_page(GFP_USER);
                struct pipe_buffer *buf;

                if (!page)
                        break;

                buf = &pipe->bufs[head & mask];
                buf->ops = &default_pipe_buf_ops;
                buf->flags = 0;
                buf->page = page;
                buf->offset = 0;
                buf->len = min_t(ssize_t, remaining, PAGE_SIZE);
                remaining -= buf->len;
                pipe->head = ++head;
        }
        return size - remaining;
}

/*
 * Copy @bytes bytes from the kernel buffer @addr into the pipe behind @i.
 *
 * push_pipe() makes the room; the data is then copied page by page with
 * memcpy_to_page(), moving the iterator position along as each chunk lands.
 *
 * Returns the number of bytes copied, which is whatever push_pipe() could
 * make room for (0 if the sanity check fails or no room was available).
 */
static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
                                struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int head;
        size_t todo, pg_off;

        if (!sanity(i))
                return 0;

        bytes = todo = push_pipe(i, bytes, &head, &pg_off);
        if (unlikely(!todo))
                return 0;

        /* todo is non-zero on entry, so at least one chunk is copied */
        while (todo) {
                size_t chunk = min_t(size_t, todo, PAGE_SIZE - pg_off);

                memcpy_to_page(pipe->bufs[head & mask].page, pg_off, addr, chunk);
                i->head = head;
                i->iov_offset = pg_off + chunk;

                todo -= chunk;
                addr += chunk;
                /* every page after the first is written from its start */
                pg_off = 0;
                head++;
        }
        i->count -= bytes;
        return bytes;
}

/*
 * Copy @len bytes from @from to @to with csum_partial_copy_nocheck() and
 * fold the checksum it returns into @sum via csum_block_add(), passing @off
 * as the block offset.  Returns the combined checksum.
 */
static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
                              __wsum sum, size_t off)
{
        return csum_block_add(sum, csum_partial_copy_nocheck(from, to, len), off);
}

/*
 * Copy @bytes bytes from the kernel buffer @addr into the pipe behind @i
 * while accumulating a checksum of the copied data.
 *
 * The running checksum and its byte offset are taken from @csstate and
 * written back on success.  Room is made with push_pipe(); each page is
 * mapped with kmap_atomic() for the duration of its chunk.
 *
 * Returns the number of bytes copied (0 if the sanity check fails or no room
 * was available, in which case @csstate is left unchanged).
 */
static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
                                         struct csum_state *csstate,
                                         struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int mask = pipe->ring_size - 1;
        __wsum csum = csstate->csum;
        size_t csum_off = csstate->off;
        unsigned int head;
        size_t todo, pg_off;

        if (!sanity(i))
                return 0;

        bytes = todo = push_pipe(i, bytes, &head, &pg_off);
        if (unlikely(!todo))
                return 0;

        /* todo is non-zero on entry, so at least one chunk is processed */
        while (todo) {
                size_t chunk = min_t(size_t, todo, PAGE_SIZE - pg_off);
                char *kaddr = kmap_atomic(pipe->bufs[head & mask].page);

                csum = csum_and_memcpy(kaddr + pg_off, addr, chunk, csum, csum_off);
                kunmap_atomic(kaddr);

                i->head = head;
                i->iov_offset = pg_off + chunk;

                todo -= chunk;
                csum_off += chunk;
                addr += chunk;
                pg_off = 0;
                head++;
        }
        i->count -= bytes;
        csstate->csum = csum;
        csstate->off = csum_off;
        return bytes;
}

/*
 * Copy @bytes bytes from the kernel buffer @addr into the iterator @i.
 *
 * Pipe-backed iterators are handed to copy_pipe_to_iter().  For the other
 * kinds, iterate_and_advance() is given one copy expression per segment
 * flavour: copyout() for v.iov_base in the first slot, memcpy_to_page() for
 * v.bv_page in the second, and memcpy() for v.iov_base in the third.
 *
 * Returns @bytes as it stands after iterate_and_advance().
 * NOTE(review): the macro is defined outside this excerpt; presumably it
 * trims @bytes to the amount actually processed and advances @i - confirm.
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        const char *from = addr;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_pipe_to_iter(addr, bytes, i);
        if (iter_is_iovec(i))
                might_fault();
        /*
         * "(from += len) - len" advances the source cursor by the segment
         * length and yields the cursor's previous value as the copy source.
         */
        iterate_and_advance(i, bytes, v,
                copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
                memcpy_to_page(v.bv_page, v.bv_offset,
                               (from += v.bv_len) - v.bv_len, v.bv_len),
                memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * Copy @n bytes from kernel memory @from to user memory @to using
 * copy_mc_to_user().
 *
 * Returns @n unchanged when access_ok() rejects the destination range,
 * otherwise whatever copy_mc_to_user() returns.
 */
static int copyout_mc(void __user *to, const void *from, size_t n)
{
        if (!access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        n = copy_mc_to_user((__force void *) to, from, n);
        return n;
}

/*
 * Copy @len bytes from @from into @page at @offset using copy_mc_to_kernel()
 * under a temporary kmap_atomic() mapping.
 *
 * Returns the value of copy_mc_to_kernel(); callers in this file treat it as
 * the number of bytes that were not copied.
 */
static unsigned long copy_mc_to_page(struct page *page, size_t offset,
                const char *from, size_t len)
{
        char *kaddr = kmap_atomic(page);
        unsigned long rem = copy_mc_to_kernel(kaddr + offset, from, len);

        kunmap_atomic(kaddr);
        return rem;
}

/*
 * Copy @bytes bytes from the kernel buffer @addr into the pipe behind @i
 * using copy_mc_to_page(), stopping at the first chunk that is not copied
 * in full.
 *
 * Returns the number of bytes actually transferred; the iterator position
 * and i->count reflect exactly that amount.
 *
 * NOTE(review): push_pipe() has already set the buffer lengths and
 * pipe->head for the full request.  On a short copy nothing here shrinks
 * the partially filled buffer or releases the buffers beyond it - confirm
 * whether callers are expected to handle that.
 */
static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
                                struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int head;
        size_t todo, pg_off, xfer = 0;

        if (!sanity(i))
                return 0;

        todo = push_pipe(i, bytes, &head, &pg_off);
        if (unlikely(!todo))
                return 0;

        /* todo is non-zero on entry, so at least one chunk is attempted */
        while (todo) {
                size_t chunk = min_t(size_t, todo, PAGE_SIZE - pg_off);
                unsigned long rem;
                size_t copied;

                rem = copy_mc_to_page(pipe->bufs[head & mask].page,
                                      pg_off, addr, chunk);
                copied = chunk - rem;

                i->head = head;
                i->iov_offset = pg_off + copied;
                xfer += copied;
                if (rem)
                        break;

                todo -= chunk;
                addr += chunk;
                pg_off = 0;
                head++;
        }
        i->count -= xfer;
        return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter().
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: the number of bytes copied.  Pipe iterators are delegated to
 * copy_mc_pipe_to_iter().
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        const char *from = addr;
        /* s_addr remembers the start of the source to compute progress on a short copy */
        unsigned long rem, curr_addr, s_addr = (unsigned long) addr;

        if (unlikely(iov_iter_is_pipe(i)))
                return copy_mc_pipe_to_iter(addr, bytes, i);
        if (iter_is_iovec(i))
                might_fault();
        /*
         * "(from += len) - len" advances the source cursor and yields its
         * previous value.  In the second and third steps a non-zero
         * remainder ends the function immediately: since "from" has already
         * been advanced past the whole segment, the bytes copied so far are
         * (from - addr) - rem.
         *
         * NOTE(review): those early returns leave from inside the
         * iterate_and_advance() expansion, which is not visible here;
         * presumably the iterator is then not advanced for this call -
         * confirm against the macro.
         */
        iterate_and_advance(i, bytes, v,
                copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
                           v.iov_len),
                ({
                rem = copy_mc_to_page(v.bv_page, v.bv_offset,
                                      (from += v.bv_len) - v.bv_len, v.bv_len);
                if (rem) {
                        curr_addr = (unsigned long) from;
                        bytes = curr_addr - s_addr - rem;
                        return bytes;
                }
                }),
                ({
                rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
                                        - v.iov_len, v.iov_len);
                if (rem) {
                        curr_addr = (unsigned long) from;
                        bytes = curr_addr - s_addr - rem;
                        return bytes;
                }
                })
        )

        return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/*
 * Copy @bytes bytes from the iterator @i into the kernel buffer @addr.
 *
 * Pipe-backed iterators are not supported: they trigger a warning and 0 is
 * returned.  For the other kinds, iterate_and_advance() is given copyin()
 * for v.iov_base in the first slot, memcpy_from_page() for v.bv_page in the
 * second, and memcpy() for v.iov_base in the third.
 *
 * Returns @bytes as it stands after iterate_and_advance().
 * NOTE(review): the macro is defined outside this excerpt; presumably it
 * trims @bytes to the amount actually processed and advances @i - confirm.
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        char *to = addr;
        if (unlikely(iov_iter_is_pipe(i))) {
                WARN_ON(1);
                return 0;
        }
        if (iter_is_iovec(i))
                might_fault();
        /*
         * "(to += len) - len" advances the destination cursor and yields its
         * previous value as the copy target.
         */
        iterate_and_advance(i, bytes, v,
                copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
                memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

/*
 * All-or-nothing variant of _copy_from_iter(): copy exactly @bytes bytes
 * from @i into @addr.
 *
 * Returns false without calling iov_iter_advance() when the iterator is
 * pipe-backed (with a warning), when it holds fewer than @bytes bytes, or
 * when copyin() returns non-zero for a segment.  Only after every segment
 * has been copied is the iterator advanced by @bytes and true returned.
 */
bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        char *to = addr;
        if (unlikely(iov_iter_is_pipe(i))) {
                WARN_ON(1);
                return false;
        }
        if (unlikely(i->count < bytes))
                return false;

        if (iter_is_iovec(i))
                might_fault();
        /*
         * "(to += len) - len" advances the destination cursor and yields its
         * previous value.  The first step returns from the function on a
         * failed copyin(), so the advance below is skipped.
         */
        iterate_all_kinds(i, bytes, v, ({
                if (copyin((to += v.iov_len) - v.iov_len,
                                      v.iov_base, v.iov_len))
                        return false;
                0;}),
                memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
        )

        iov_iter_advance(i, bytes);
        return true;
}
EXPORT_SYMBOL(_copy_from_iter_full);

/*
 * Like _copy_from_iter(), but the first step handed to
 * iterate_and_advance() (the one using the user-pointer v.iov_base) calls
 * __copy_from_user_inatomic_nocache() instead of copyin().  The other two
 * steps use memcpy_from_page() and memcpy() as usual.
 *
 * Pipe-backed iterators trigger a warning and return 0.
 *
 * Returns @bytes as it stands after iterate_and_advance().
 * NOTE(review): the macro is defined outside this excerpt; presumably it
 * trims @bytes to the amount actually processed - confirm.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        char *to = addr;
        if (unlikely(iov_iter_is_pipe(i))) {
                WARN_ON(1);
                return 0;
        }
        /* "(to += len) - len" advances the destination cursor and yields its old value */
        iterate_and_advance(i, bytes, v,
                __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
                                         v.iov_base, v.iov_len),
                memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Pipe-backed iterators are not supported: they trigger a warning and 0
 * is returned.
 *
 * Return: @bytes as it stands after iterate_and_advance().
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        char *to = addr;
        if (unlikely(iov_iter_is_pipe(i))) {
                WARN_ON(1);
                return 0;
        }
        /*
         * Every step uses a *_flushcache copy routine.  "(to += len) - len"
         * advances the destination cursor and yields its previous value.
         */
        iterate_and_advance(i, bytes, v,
                __copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
                                         v.iov_base, v.iov_len),
                memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
                        v.iov_len)
        )

        return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * All-or-nothing variant of _copy_from_iter_nocache(): copy exactly @bytes
 * bytes from @i into @addr, using __copy_from_user_inatomic_nocache() in
 * the first (user-pointer) step.
 *
 * Returns false without calling iov_iter_advance() when the iterator is
 * pipe-backed (with a warning), when it holds fewer than @bytes bytes, or
 * when the user copy returns non-zero for a segment.  Otherwise the
 * iterator is advanced by @bytes and true is returned.
 */
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        char *to = addr;
        if (unlikely(iov_iter_is_pipe(i))) {
                WARN_ON(1);
                return false;
        }
        if (unlikely(i->count < bytes))
                return false;
        /*
         * "(to += len) - len" advances the destination cursor and yields its
         * previous value.  A failed user copy returns from the function, so
         * the advance below is skipped.
         */
        iterate_all_kinds(i, bytes, v, ({
                if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
                                             v.iov_base, v.iov_len))
                        return false;
                0;}),
                memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
        )

        iov_iter_advance(i, bytes);
        return true;
}
EXPORT_SYMBOL(_copy_from_iter_full_nocache);

/*
 * Check that the range [@offset, @offset + @n) lies within @page, or within
 * the compound page that @page belongs to.
 *
 * Returns true when the range is acceptable; otherwise warns and returns
 * false.  The "n <= end" comparisons reject an @offset + @n sum that has
 * wrapped around.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        size_t end = n + offset;
        struct page *head;

        /*
         * The general case needs to access the page order in order
         * to compute the page size.
         * However, we mostly deal with order-0 pages and thus can
         * avoid a possible cache line miss for requests that fit all
         * page orders.
         */
        if (n <= end && end <= PAGE_SIZE)
                return true;

        /* measure the end of the range from the start of the compound page */
        head = compound_head(page);
        end += (page - head) << PAGE_SHIFT;

        if (unlikely(n > end || end > (page_size(head)))) {
                WARN_ON(1);
                return false;
        }
        return true;
}

/*
 * Copy @bytes bytes of @page, starting at @offset, into the iterator @i.
 *
 * The range is validated with page_copy_sane() first.  Dispatch then depends
 * on the iterator type:
 *   - bvec/kvec: map the page atomically and go through copy_to_iter();
 *   - discard:   only account for the bytes (clamped to i->count);
 *   - pipe:      copy_page_to_iter_pipe();
 *   - otherwise: copy_page_to_iter_iovec().
 *
 * Returns the number of bytes copied, 0 if the range check fails.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        if (unlikely(!page_copy_sane(page, offset, bytes)))
                return 0;

        if (i->type & (ITER_BVEC|ITER_KVEC)) {
                void *kaddr = kmap_atomic(page);
                size_t copied = copy_to_iter(kaddr + offset, bytes, i);

                kunmap_atomic(kaddr);
                return copied;
        }

        if (unlikely(iov_iter_is_discard(i))) {
                size_t dropped = unlikely(i->count < bytes) ? i->count : bytes;

                i->count -= dropped;
                return dropped;
        }

        if (unlikely(iov_iter_is_pipe(i)))
                return copy_page_to_iter_pipe(page, offset, bytes, i);

        return copy_page_to_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);

/*
 * Copy @bytes bytes from the iterator @i into @page, starting at @offset.
 *
 * The range is validated with page_copy_sane() first.  Pipe and discard
 * iterators cannot be a data source: they trigger a warning and 0 is
 * returned.  bvec/kvec iterators are copied through _copy_from_iter() under
 * an atomic mapping of the page; everything else goes through
 * copy_page_from_iter_iovec().
 *
 * Returns the number of bytes copied.
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        void *kaddr;
        size_t copied;

        if (unlikely(!page_copy_sane(page, offset, bytes)))
                return 0;
        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
                WARN_ON(1);
                return 0;
        }
        if (!(i->type & (ITER_BVEC|ITER_KVEC)))
                return copy_page_from_iter_iovec(page, offset, bytes, i);

        kaddr = kmap_atomic(page);
        copied = _copy_from_iter(kaddr + offset, bytes, i);
        kunmap_atomic(kaddr);
        return copied;
}
EXPORT_SYMBOL(copy_page_from_iter);

/*
 * Append @bytes zero bytes to the pipe behind @i.
 *
 * push_pipe() makes the room; each page is then cleared with memzero_page()
 * and the iterator position is moved along chunk by chunk.
 *
 * Returns the number of bytes zeroed, which is whatever push_pipe() could
 * make room for (0 if the sanity check fails or no room was available).
 */
static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int head;
        size_t todo, pg_off;

        if (!sanity(i))
                return 0;

        bytes = todo = push_pipe(i, bytes, &head, &pg_off);
        if (unlikely(!todo))
                return 0;

        /* todo is non-zero on entry, so at least one chunk is cleared */
        while (todo) {
                size_t chunk = min_t(size_t, todo, PAGE_SIZE - pg_off);

                memzero_page(pipe->bufs[head & mask].page, pg_off, chunk);
                i->head = head;
                i->iov_offset = pg_off + chunk;

                todo -= chunk;
                pg_off = 0;
                head++;
        }
        i->count -= bytes;
        return bytes;
}

/*
 * Write @bytes zero bytes into the iterator @i.
 *
 * Pipe-backed iterators are handled by pipe_zero().  For the other kinds,
 * iterate_and_advance() is given clear_user() for v.iov_base in the first
 * slot, memzero_page() for v.bv_page in the second, and memset() for
 * v.iov_base in the third.
 *
 * Returns @bytes as it stands after iterate_and_advance().
 * NOTE(review): the macro is defined outside this excerpt; presumably it
 * trims @bytes to the amount actually processed and advances @i - confirm.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        if (unlikely(iov_iter_is_pipe(i)))
                return pipe_zero(bytes, i);
        iterate_and_advance(i, bytes, v,
                clear_user(v.iov_base, v.iov_len),
                memzero_page(v.bv_page, v.bv_offset, v.bv_len),
                memset(v.iov_base, 0, v.iov_len)
        )

        return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Copy @bytes bytes from the iterator @i into @page at @offset while the
 * page is mapped with kmap_atomic().
 *
 * Returns 0 if the range fails page_copy_sane(), or (with a warning) if @i
 * is a pipe or discard iterator.  Otherwise returns @bytes as it stands
 * after iterate_all_kinds().
 *
 * The copy goes through iterate_all_kinds() and there is no explicit
 * iov_iter_advance() call in this function.
 * NOTE(review): presumably the caller is expected to advance @i itself -
 * confirm against the macro definition and the callers.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        /* p is the destination cursor inside the mapped page */
        char *kaddr = kmap_atomic(page), *p = kaddr + offset;
        if (unlikely(!page_copy_sane(page, offset, bytes))) {
                kunmap_atomic(kaddr);
                return 0;
        }
        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
                kunmap_atomic(kaddr);
                WARN_ON(1);
                return 0;
        }
        /* "(p += len) - len" advances the cursor and yields its previous value */
        iterate_all_kinds(i, bytes, v,
                copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
                memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
                memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
        )
        kunmap_atomic(kaddr);
        return bytes;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);

/*
 * Drop everything in the pipe that lies beyond the current position of the
 * pipe iterator @i.
 *
 * If the iterator is part-way through a buffer, that buffer's length is
 * trimmed to end at the iterator offset and it is kept.  All buffers after
 * that point, up to the pipe's head, are released and pipe->head is pulled
 * back accordingly.  Nothing is done when the pipe is empty.
 */
static inline void pipe_truncate(struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int tail = pipe->tail;
        unsigned int head = pipe->head;
        unsigned int mask = pipe->ring_size - 1;
        unsigned int first_unused;
        size_t off;

        if (pipe_empty(head, tail))
                return;

        first_unused = i->head;
        off = i->iov_offset;
        if (off) {
                struct pipe_buffer *last = &pipe->bufs[first_unused & mask];

                /* keep the partially used buffer, cut at the iterator offset */
                last->len = off - last->offset;
                first_unused++;
        }

        /* release from the newest buffer backwards */
        while (head != first_unused) {
                head--;
                pipe_buf_release(pipe, &pipe->bufs[head & mask]);
        }
        pipe->head = head;
}

/*
 * Advance the pipe iterator @i by @size bytes (clamped to i->count), walking
 * forward over the pipe buffers as needed, then discard whatever the pipe
 * holds beyond the new position via pipe_truncate().
 */
static void pipe_advance(struct iov_iter *i, size_t size)
{
        struct pipe_inode_info *pipe = i->pipe;

        if (unlikely(i->count < size))
                size = i->count;

        if (size) {
                unsigned int mask = pipe->ring_size - 1;
                unsigned int head = i->head;
                struct pipe_buffer *buf = &pipe->bufs[head & mask];
                size_t off = i->iov_offset;
                size_t left = size;

                if (off) /* make it relative to the beginning of buffer */
                        left += off - buf->offset;

                /* skip over every buffer that is consumed completely */
                while (left > buf->len) {
                        left -= buf->len;
                        head++;
                        buf = &pipe->bufs[head & mask];
                }
                i->head = head;
                i->iov_offset = buf->offset + left;
        }
        i->count -= size;
        /* ... and discard everything past that point */
        pipe_truncate(i);
}

/*
 * Advance the iterator @i by @size bytes.
 *
 * Pipe-backed iterators are handled by pipe_advance().  Discard iterators
 * carry no segments, so only the remaining byte count is reduced.  All other
 * kinds are stepped with iterate_and_advance() using no-op step
 * expressions.
 *
 * A @size larger than the bytes remaining is clamped for discard iterators,
 * matching what pipe_advance() and the discard branch of
 * copy_page_to_iter() do; without the clamp the unsigned i->count would wrap
 * around to a huge value.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
        if (unlikely(iov_iter_is_pipe(i))) {
                pipe_advance(i, size);
                return;
        }
        if (unlikely(iov_iter_is_discard(i))) {
                /* never advance past the end of the iterator */
                if (unlikely(i->count < size))
                        size = i->count;
                i->count -= size;
                return;
        }
        iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Move the iterator @i backwards by @unroll bytes, undoing part of an
 * earlier advance.
 *
 * Does nothing for @unroll == 0.  Warns and does nothing when @unroll
 * exceeds MAX_RW_COUNT.  Otherwise i->count grows by @unroll and the
 * position is rewound:
 *   - pipe iterators walk back through the pipe buffers and then call
 *     pipe_truncate() to drop what now lies beyond the position;
 *   - discard iterators have no position to adjust;
 *   - bvec and iovec/kvec iterators step back through their segment arrays,
 *     incrementing nr_segs for every segment re-entered.
 *
 * The backward segment walks have no lower bound check of their own.
 * NOTE(review): presumably callers only revert bytes this iterator was
 * previously advanced over - confirm against callers.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;
        i->count += unroll;
        if (unlikely(iov_iter_is_pipe(i))) {
                struct pipe_inode_info *pipe = i->pipe;
                unsigned int p_mask = pipe->ring_size - 1;
                unsigned int i_head = i->head;
                size_t off = i->iov_offset;
                while (1) {
                        struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
                        /* n: bytes of this buffer that lie before the current position */
                        size_t n = off - b->offset;
                        if (unroll < n) {
                                /* the new position is inside this buffer */
                                off -= unroll;
                                break;
                        }
                        unroll -= n;
                        if (!unroll && i_head == i->start_head) {
                                /* rewound all the way to where the iterator started */
                                off = 0;
                                break;
                        }
                        /* continue from the end of the previous buffer */
                        i_head--;
                        b = &pipe->bufs[i_head & p_mask];
                        off = b->offset + b->len;
                }
                i->iov_offset = off;
                i->head = i_head;
                pipe_truncate(i);
                return;
        }
        if (unlikely(iov_iter_is_discard(i)))
                return;
        /* fast path: the new position is still inside the current segment */
        if (unroll <= i->iov_offset) {
                i->iov_offset -= unroll;
                return;
        }
        unroll -= i->iov_offset;
        if (iov_iter_is_bvec(i)) {
                const struct bio_vec *bvec = i->bvec;
                while (1) {
                        /* step to the previous segment and take its length */
                        size_t n = (--bvec)->bv_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->bvec = bvec;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        } else { /* same logics for iovec and kvec */
                const struct iovec *iov = i->iov;
                while (1) {
                        /* step to the previous segment and take its length */
                        size_t n = (--iov)->iov_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->iov = iov;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the number of bytes available in just the current segment of @i.
 *
 * Pipe and discard iterators, and iterators with exactly one segment,
 * report the whole remaining count.  Otherwise the result is the smaller of
 * the remaining count and what is left of the current segment after the
 * iterator offset.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        size_t seg_left;

        if (unlikely(iov_iter_is_pipe(i)))
                return i->count;        // it is a silly place, anyway
        if (i->nr_segs == 1 || unlikely(iov_iter_is_discard(i)))
                return i->count;

        if (iov_iter_is_bvec(i))
                seg_left = i->bvec->bv_len - i->iov_offset;
        else
                seg_left = i->iov->iov_len - i->iov_offset;

        return min(i->count, seg_left);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Initialise @i as an ITER_KVEC iterator over the @nr_segs kernel vectors
 * in @kvec, describing @count bytes and positioned at the start.
 *
 * @direction must be READ or WRITE; any other bits trigger a warning and
 * are masked off.
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        direction &= READ | WRITE;

        i->type = ITER_KVEC | direction;
        i->kvec = kvec;
        i->nr_segs = nr_segs;
        i->count = count;
        i->iov_offset = 0;
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Initialise @i as an ITER_BVEC iterator over the @nr_segs bio_vec entries
 * in @bvec, describing @count bytes and positioned at the start.
 *
 * @direction must be READ or WRITE; any other bits trigger a warning and
 * are masked off.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        direction &= READ | WRITE;

        i->type = ITER_BVEC | direction;
        i->bvec = bvec;
        i->nr_segs = nr_segs;
        i->count = count;
        i->iov_offset = 0;
}
EXPORT_SYMBOL(iov_iter_bvec);

/*
 * Initialise @i as an ITER_PIPE iterator that fills @pipe with up to @count
 * bytes, starting at the pipe's current head.
 *
 * Only READ is accepted as @direction (anything else is a BUG).  A warning
 * is issued if the pipe is already full according to pipe_full().  The
 * starting head is remembered in start_head, which iov_iter_revert() uses
 * as its stopping point.
 */
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
                        struct pipe_inode_info *pipe,
                        size_t count)
{
        BUG_ON(direction != READ);
        WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));

        i->pipe = pipe;
        i->type = ITER_PIPE | READ;
        i->count = count;
        i->iov_offset = 0;
        i->head = pipe->head;
        i->start_head = i->head;
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator; any other @direction is a BUG.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        BUG_ON(direction != READ);

        i->iov_offset = 0;
        i->count = count;
        i->type = ITER_DISCARD | READ;
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * iov_iter_alignment - accumulate address and length bits of an iterator
 * @i: iterator to inspect (not modified)
 *
 * Returns the bitwise OR of the base address (or page offset, for bvec
 * segments) and length of each segment visited while walking i->count bytes.
 *
 * For a pipe iterator no segments are walked: the result is i->count, ORed
 * with i->iov_offset when both are non-zero and the buffer at the current
 * head slot passes allocated().
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
        unsigned long res = 0;
        size_t size = i->count;

        if (unlikely(iov_iter_is_pipe(i))) {
                /* ring_size - 1 is used as an index mask for pipe->bufs[] */
                unsigned int p_mask = i->pipe->ring_size - 1;

                if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
                        return size | i->iov_offset;
                return size;
        }
        /*
         * NOTE(review): the three step expressions are presumably the
         * iovec, bvec and kvec cases in that order - confirm against the
         * iterate_all_kinds() definition.
         */
        iterate_all_kinds(i, size, v,
                (res |= (unsigned long)v.iov_base | v.iov_len, 0),
                res |= v.bv_offset | v.bv_len,
                res |= (unsigned long)v.iov_base | v.iov_len
        )
        return res;
}
EXPORT_SYMBOL(iov_iter_alignment);

/*
 * iov_iter_gap_alignment - accumulate bits describing gaps between segments
 * @i: iterator to inspect (not modified)
 *
 * Pipe and discard iterators are not supported: a WARN_ON() fires and ~0U
 * is returned.
 *
 * For other iterators each visited segment ORs into the result its base
 * address (or page offset for bvec) - but only once the result is already
 * non-zero - and the value of 'size' whenever that differs from the
 * segment length.
 *
 * NOTE(review): 'size' is presumably updated by iterate_all_kinds() to the
 * bytes remaining, making the length term zero only for the final segment -
 * confirm against the macro definition.
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
        unsigned long res = 0;
        size_t size = i->count;

        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
                WARN_ON(1);
                return ~0U;
        }

        iterate_all_kinds(i, size, v,
                (res |= (!res ? 0 : (unsigned long)v.iov_base) |
                        (size != v.iov_len ? size : 0), 0),
                (res |= (!res ? 0 : (unsigned long)v.bv_offset) |
                        (size != v.bv_len ? size : 0)),
                (res |= (!res ? 0 : (unsigned long)v.iov_base) |
                        (size != v.iov_len ? size : 0))
                );
        return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Reserve up to @maxsize bytes in the pipe behind @i via push_pipe() and
 * take a page reference on every pipe buffer page covering the reserved
 * range, storing the pages into @pages in ring order starting at slot
 * @iter_head.
 *
 * Returns the number of bytes reserved, or -EFAULT when push_pipe()
 * reserved nothing.  *@start is filled in by push_pipe().
 */
static inline ssize_t __pipe_get_pages(struct iov_iter *i,
                                size_t maxsize,
                                struct page **pages,
                                int iter_head,
                                size_t *start)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int p_mask = pipe->ring_size - 1;
        ssize_t pushed = push_pipe(i, maxsize, &iter_head, start);
        ssize_t left;

        if (!pushed)
                return -EFAULT;

        /* Walk one page at a time over the in-page offset plus the payload. */
        for (left = pushed + *start; left > 0; left -= PAGE_SIZE) {
                struct page *page = pipe->bufs[iter_head & p_mask].page;

                *pages++ = page;
                get_page(page);
                iter_head++;
        }

        return pushed;
}

/*
 * Pipe flavour of iov_iter_get_pages(): clamp the request to what the pipe
 * ring can still hold and to @maxpages pages, then let __pipe_get_pages()
 * do the reservation and page referencing.
 *
 * Returns 0 for a zero-sized request, -EFAULT if sanity() rejects the
 * iterator, otherwise whatever __pipe_get_pages() returns.
 */
static ssize_t pipe_get_pages(struct iov_iter *i,
                   struct page **pages, size_t maxsize, unsigned maxpages,
                   size_t *start)
{
        unsigned int iter_head, npages;
        size_t capacity;

        if (!maxsize)
                return 0;
        if (!sanity(i))
                return -EFAULT;

        data_start(i, &iter_head, start);

        /* Free space: the rest of the current buffer plus all later ones. */
        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
        if (npages > maxpages)
                npages = maxpages;
        capacity = npages * PAGE_SIZE - *start;

        if (maxsize > capacity)
                maxsize = capacity;

        return __pipe_get_pages(i, maxsize, pages, iter_head, start);
}

/*
 * iov_iter_get_pages - grab page references for the data at the iterator
 * @i: iterator describing the data
 * @pages: caller-supplied array that receives the page pointers
 * @maxsize: upper bound on bytes to cover (clamped to i->count)
 * @maxpages: upper bound on entries written to @pages
 * @start: receives the byte offset of the data within the first page
 *
 * Pipe iterators are delegated to pipe_get_pages(); discard iterators
 * return -EFAULT.  Every step body below ends in 'return', so at most the
 * first segment visited is processed.  Returns 0 if no step body runs.
 *
 * NOTE(review): the three step bodies are presumably the iovec, bvec and
 * kvec cases in that order - confirm against iterate_all_kinds().
 */
ssize_t iov_iter_get_pages(struct iov_iter *i,
                   struct page **pages, size_t maxsize, unsigned maxpages,
                   size_t *start)
{
        if (maxsize > i->count)
                maxsize = i->count;

        if (unlikely(iov_iter_is_pipe(i)))
                return pipe_get_pages(i, pages, maxsize, maxpages, start);
        if (unlikely(iov_iter_is_discard(i)))
                return -EFAULT;

        iterate_all_kinds(i, maxsize, v, ({
                unsigned long addr = (unsigned long)v.iov_base;
                /* len covers the in-page offset plus the segment bytes */
                size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
                int n;
                int res;

                if (len > maxpages * PAGE_SIZE)
                        len = maxpages * PAGE_SIZE;
                /* round the address down to its page boundary */
                addr &= ~(PAGE_SIZE - 1);
                n = DIV_ROUND_UP(len, PAGE_SIZE);
                /* FOLL_WRITE is requested unless the iterator is a WRITE one */
                res = get_user_pages_fast(addr, n,
                                iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
                                pages);
                if (unlikely(res <= 0))
                        return res;
                /* fewer pages than asked for: report whole pages only */
                return (res == n ? len : res * PAGE_SIZE) - *start;
        0;}),({
                /* can't be more than PAGE_SIZE */
                *start = v.bv_offset;
                get_page(*pages = v.bv_page);
                return v.bv_len;
        }),({
                return -EFAULT;
        })
        )
        return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);

/*
 * Allocate an array able to hold @n page pointers using kvmalloc_array()
 * with GFP_KERNEL.  Returns NULL on allocation failure; callers in this
 * file release the array with kvfree().
 */
static struct page **get_pages_array(size_t n)
{
        struct page **array;

        array = kvmalloc_array(n, sizeof(*array), GFP_KERNEL);
        return array;
}

/*
 * Pipe flavour of iov_iter_get_pages_alloc(): size a page-pointer array for
 * the (clamped) request, allocate it, and fill it via __pipe_get_pages().
 *
 * On success (positive return) ownership of the array passes to the caller
 * through *@pages.  On any non-positive result the array is freed and
 * *@pages is left untouched.  Returns 0 for an empty request, -EFAULT if
 * sanity() rejects the iterator and -ENOMEM if the array cannot be
 * allocated.
 */
static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   size_t *start)
{
        struct page **array;
        unsigned int iter_head, npages;
        ssize_t n;

        if (!maxsize)
                return 0;
        if (!sanity(i))
                return -EFAULT;

        data_start(i, &iter_head, start);

        /* Free space: the rest of the current buffer plus all later ones. */
        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
        n = npages * PAGE_SIZE - *start;
        if (maxsize > n) {
                maxsize = n;
        } else {
                /* Request fits: only allocate as many slots as it spans. */
                npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
        }

        array = get_pages_array(npages);
        if (!array)
                return -ENOMEM;

        n = __pipe_get_pages(i, maxsize, array, iter_head, start);
        if (n <= 0) {
                kvfree(array);
                return n;
        }

        *pages = array;
        return n;
}

/*
 * iov_iter_get_pages_alloc - like iov_iter_get_pages(), but allocates the
 * page-pointer array itself
 * @i: iterator describing the data
 * @pages: receives the newly allocated array of page pointers
 * @maxsize: upper bound on bytes to cover (clamped to i->count)
 * @start: receives the byte offset of the data within the first page
 *
 * Pipe iterators are delegated to pipe_get_pages_alloc(); discard iterators
 * return -EFAULT.  Every step body below ends in 'return', so at most the
 * first segment visited is processed.  Returns 0 if no step body runs.
 *
 * The array comes from get_pages_array(); in the first step body it is
 * freed again (and *@pages set to NULL) when get_user_pages_fast() pins
 * nothing.
 *
 * NOTE(review): the three step bodies are presumably the iovec, bvec and
 * kvec cases in that order - confirm against iterate_all_kinds().
 */
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   size_t *start)
{
        struct page **p;

        if (maxsize > i->count)
                maxsize = i->count;

        if (unlikely(iov_iter_is_pipe(i)))
                return pipe_get_pages_alloc(i, pages, maxsize, start);
        if (unlikely(iov_iter_is_discard(i)))
                return -EFAULT;

        iterate_all_kinds(i, maxsize, v, ({
                unsigned long addr = (unsigned long)v.iov_base;
                /* len covers the in-page offset plus the segment bytes */
                size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
                int n;
                int res;

                /* round the address down to its page boundary */
                addr &= ~(PAGE_SIZE - 1);
                n = DIV_ROUND_UP(len, PAGE_SIZE);
                p = get_pages_array(n);
                if (!p)
                        return -ENOMEM;
                /* FOLL_WRITE is requested unless the iterator is a WRITE one */
                res = get_user_pages_fast(addr, n,
                                iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
                if (unlikely(res <= 0)) {
                        kvfree(p);
                        *pages = NULL;
                        return res;
                }
                *pages = p;
                /* fewer pages than asked for: report whole pages only */
                return (res == n ? len : res * PAGE_SIZE) - *start;
        0;}),({
                /* can't be more than PAGE_SIZE */
                *start = v.bv_offset;
                /* *pages is assigned even when the allocation fails (NULL) */
                *pages = p = get_pages_array(1);
                if (!p)
                        return -ENOMEM;
                get_page(*p = v.bv_page);
                return v.bv_len;
        }),({
                return -EFAULT;
        })
        )
        return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

/*
 * csum_and_copy_from_iter - copy from an iterator while checksumming
 * @addr: destination buffer
 * @bytes: number of bytes requested
 * @csum: running checksum; read on entry, updated before returning
 * @i: source iterator (advanced by iterate_and_advance())
 *
 * Pipe and discard iterators are not supported: a WARN_ON() fires and 0 is
 * returned without touching *@csum.
 *
 * 'off' tracks how many bytes have been folded into the checksum so far and
 * is passed to csum_block_add()/csum_and_memcpy() as the block offset.
 *
 * NOTE(review): 'bytes' is returned after the macro ran; presumably
 * iterate_and_advance() adjusts it to the amount actually processed -
 * confirm against the macro definition.
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
                               struct iov_iter *i)
{
        char *to = addr;
        __wsum sum, next;
        size_t off = 0;
        sum = *csum;
        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
                WARN_ON(1);
                return 0;
        }
        iterate_and_advance(i, bytes, v, ({
                /* 'to' is advanced first, then rewound to get the old value */
                next = csum_and_copy_from_user(v.iov_base,
                                               (to += v.iov_len) - v.iov_len,
                                               v.iov_len);
                /* a zero result is treated as failure: nothing is folded in */
                if (next) {
                        sum = csum_block_add(sum, next, off);
                        off += v.iov_len;
                }
                /* NOTE(review): presumably the count of bytes NOT copied */
                next ? 0 : v.iov_len;
        }), ({
                char *p = kmap_atomic(v.bv_page);
                sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
                                      p + v.bv_offset, v.bv_len,
                                      sum, off);
                kunmap_atomic(p);
                off += v.bv_len;
        }),({
                sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
                                      v.iov_base, v.iov_len,
                                      sum, off);
                off += v.iov_len;
        })
        )
        *csum = sum;
        return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

/*
 * csum_and_copy_from_iter_full - all-or-nothing variant of
 * csum_and_copy_from_iter()
 * @addr: destination buffer
 * @bytes: number of bytes that must be copied
 * @csum: running checksum; read on entry, updated only on success
 * @i: source iterator; advanced by @bytes only on success
 *
 * Returns false (leaving *@csum and @i's position untouched) when the
 * iterator is a pipe or discard iterator (after a WARN_ON()), when it holds
 * fewer than @bytes bytes, or when csum_and_copy_from_user() returns 0 for
 * a segment.  Returns true after storing the new checksum and calling
 * iov_iter_advance().
 */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
                               struct iov_iter *i)
{
        char *to = addr;
        __wsum sum, next;
        size_t off = 0;
        sum = *csum;
        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
                WARN_ON(1);
                return false;
        }
        if (unlikely(i->count < bytes))
                return false;
        /* walk without advancing; the iterator is advanced at the end */
        iterate_all_kinds(i, bytes, v, ({
                /* 'to' is advanced first, then rewound to get the old value */
                next = csum_and_copy_from_user(v.iov_base,
                                               (to += v.iov_len) - v.iov_len,
                                               v.iov_len);
                if (!next)
                        return false;
                sum = csum_block_add(sum, next, off);
                off += v.iov_len;
                0;
        }), ({
                char *p = kmap_atomic(v.bv_page);
                sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
                                      p + v.bv_offset, v.bv_len,
                                      sum, off);
                kunmap_atomic(p);
                off += v.bv_len;
        }),({
                sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
                                      v.iov_base, v.iov_len,
                                      sum, off);
                off += v.iov_len;
        })
        )
        *csum = sum;
        iov_iter_advance(i, bytes);
        return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);

/*
 * csum_and_copy_to_iter - copy into an iterator while checksumming
 * @addr: source buffer
 * @bytes: number of bytes requested
 * @_csstate: pointer to a struct csum_state carrying the running checksum
 *            and byte offset across calls
 * @i: destination iterator (advanced by iterate_and_advance())
 *
 * Pipe iterators are handed to csum_and_copy_to_pipe_iter().  Discard
 * iterators are not supported yet: a WARN_ON() fires and 0 is returned
 * without updating the state.
 *
 * NOTE(review): 'bytes' is returned after the macro ran; presumably
 * iterate_and_advance() adjusts it to the amount actually processed -
 * confirm against the macro definition.
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
                             struct iov_iter *i)
{
        struct csum_state *csstate = _csstate;
        const char *from = addr;
        __wsum sum, next;
        size_t off;

        if (unlikely(iov_iter_is_pipe(i)))
                return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

        sum = csstate->csum;
        off = csstate->off;
        if (unlikely(iov_iter_is_discard(i))) {
                WARN_ON(1);        /* for now */
                return 0;
        }
        iterate_and_advance(i, bytes, v, ({
                /* 'from' is advanced first, then rewound to get the old value */
                next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
                                             v.iov_base,
                                             v.iov_len);
                /* a zero result is treated as failure: nothing is folded in */
                if (next) {
                        sum = csum_block_add(sum, next, off);
                        off += v.iov_len;
                }
                /* NOTE(review): presumably the count of bytes NOT copied */
                next ? 0 : v.iov_len;
        }), ({
                char *p = kmap_atomic(v.bv_page);
                sum = csum_and_memcpy(p + v.bv_offset,
                                      (from += v.bv_len) - v.bv_len,
                                      v.bv_len, sum, off);
                kunmap_atomic(p);
                off += v.bv_len;
        }),({
                sum = csum_and_memcpy(v.iov_base,
                                     (from += v.iov_len) - v.iov_len,
                                     v.iov_len, sum, off);
                off += v.iov_len;
        })
        )
        /* persist the running state for the next call */
        csstate->csum = sum;
        csstate->off = off;
        return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

/*
 * Copy @bytes bytes from @addr into iterator @i and feed the bytes that
 * were actually copied to the ahash request passed in @hashp.
 *
 * Returns the number of bytes copied.  When CONFIG_CRYPTO_HASH is not set
 * nothing is copied or hashed and 0 is returned.  The result of
 * crypto_ahash_update() is not checked.
 */
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
                struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
        struct scatterlist sg;
        struct ahash_request *req = hashp;
        size_t done = copy_to_iter(addr, bytes, i);

        /* Hash exactly the prefix of @addr that made it into the iterator. */
        sg_init_one(&sg, addr, done);
        ahash_request_set_crypt(req, &sg, NULL, done);
        crypto_ahash_update(req);

        return done;
#else
        return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

/*
 * iov_iter_npages - count the pages touched by the data in an iterator
 * @i: iterator to inspect (not modified)
 * @maxpages: cap on the returned value
 *
 * Returns 0 for an empty or discard iterator, and for a pipe iterator that
 * fails sanity().  Otherwise returns the page count, limited to @maxpages.
 *
 * For a pipe iterator the count is what pipe_space_for_user() reports from
 * the current data position.  For other iterators each segment contributes
 * the number of pages its address range spans (a bvec segment counts as
 * one page), stopping early once @maxpages is reached.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
        size_t size = i->count;
        int npages = 0;

        if (!size)
                return 0;
        if (unlikely(iov_iter_is_discard(i)))
                return 0;

        if (unlikely(iov_iter_is_pipe(i))) {
                struct pipe_inode_info *pipe = i->pipe;
                unsigned int iter_head;
                size_t off;

                if (!sanity(i))
                        return 0;

                data_start(i, &iter_head, &off);
                /* some of this one + all after this one */
                npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
                if (npages >= maxpages)
                        return maxpages;
        } else iterate_all_kinds(i, size, v, ({
                unsigned long p = (unsigned long)v.iov_base;
                /* index of the page past the end minus index of the first page */
                npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
                        - p / PAGE_SIZE;
                if (npages >= maxpages)
                        return maxpages;
        0;}),({
                npages++;
                if (npages >= maxpages)
                        return maxpages;
        }),({
                unsigned long p = (unsigned long)v.iov_base;
                /* index of the page past the end minus index of the first page */
                npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
                        - p / PAGE_SIZE;
                if (npages >= maxpages)
                        return maxpages;
        })
        )
        return npages;
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Copy iterator @old into @new and give @new its own kmemdup()'d copy of
 * the segment array, allocated with @flags.
 *
 * Returns the new segment array (also stored in @new), or NULL when the
 * allocation fails.  Pipe iterators (with a WARN_ON()) and discard
 * iterators also return NULL; in those cases @new is still a plain struct
 * copy of @old.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
        *new = *old;

        if (unlikely(iov_iter_is_pipe(new))) {
                WARN_ON(1);
                return NULL;
        }
        if (unlikely(iov_iter_is_discard(new)))
                return NULL;

        if (iov_iter_is_bvec(new)) {
                new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
                return new->bvec;
        }

        /* iovec and kvec have identical layout */
        new->iov = kmemdup(new->iov,
                           new->nr_segs * sizeof(struct iovec),
                           flags);
        return new->iov;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Read @nr_segs compat-layout iovec entries from user space at @uvec and
 * store them, converted to native struct iovec, into @iov.
 *
 * Returns 0 on success, -EFAULT if the user range cannot be accessed or a
 * read faults, and -EINVAL if any length is negative when read as a
 * compat_ssize_t.  Entries before the failing one may already have been
 * written to @iov.
 */
static int copy_compat_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, unsigned long nr_segs)
{
        /* reinterpret the user pointer as an array of compat entries */
        const struct compat_iovec __user *uiov =
                (const struct compat_iovec __user *)uvec;
        /* ret stays -EFAULT for the unsafe_get_user() fault path */
        int ret = -EFAULT, i;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        for (i = 0; i < nr_segs; i++) {
                compat_uptr_t buf;
                compat_ssize_t len;

                /* on a fault these jump to uaccess_end */
                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

                /* check for compat_size_t not fitting in compat_ssize_t .. */
                if (len < 0) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov[i].iov_base = compat_ptr(buf);
                iov[i].iov_len = len;
        }

        ret = 0;
uaccess_end:
        /* every exit after user_access_begin() must pass through here */
        user_access_end();
        return ret;
}

/*
 * Copy @nr_segs native iovec entries from user space at @uvec into @iov
 * and validate their lengths.
 *
 * Returns 0 on success, -EFAULT if the copy from user space is incomplete,
 * and -EINVAL if any iov_len is negative when viewed as an ssize_t.
 */
static int copy_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, unsigned long nr_segs)
{
        const struct iovec *cur;
        const struct iovec *end = iov + nr_segs;

        if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
                return -EFAULT;

        for (cur = iov; cur != end; cur++) {
                if ((ssize_t)cur->iov_len < 0)
                        return -EINVAL;
        }

        return 0;
}

/*
 * Fetch an iovec array of @nr_segs entries from user space.
 *
 * @fast_iov (holding @fast_segs entries) is used when it is large enough;
 * otherwise an array is obtained from kmalloc_array().  @compat selects the
 * compat entry layout.
 *
 * Returns the array that was filled (either @fast_iov or the new
 * allocation), or an ERR_PTR(): -EINVAL when @nr_segs exceeds UIO_MAXIOV,
 * -ENOMEM on allocation failure, or the error from the copy helper.  A
 * freshly allocated array is freed again before an error is returned.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat)
{
        struct iovec *iov = fast_iov;
        int ret;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument was
         * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
         * traditionally returned zero for zero segments, so...
         */
        if (nr_segs == 0)
                return iov;
        if (nr_segs > UIO_MAXIOV)
                return ERR_PTR(-EINVAL);

        if (nr_segs > fast_segs) {
                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
                if (!iov)
                        return ERR_PTR(-ENOMEM);
        }

        ret = compat ? copy_compat_iovec_from_user(iov, uvec, nr_segs)
                     : copy_iovec_from_user(iov, uvec, nr_segs);
        if (!ret)
                return iov;

        /* Only release what this function allocated itself. */
        if (iov != fast_iov)
                kfree(iov);
        return ERR_PTR(ret);
}

/*
 * Import a user-space iovec array and initialise @i to iterate over it.
 *
 * The array is fetched with iovec_from_user() (using *@iovp as the fast
 * buffer), every segment is checked with access_ok(), and the cumulative
 * length is capped at MAX_RW_COUNT by shortening the segment that crosses
 * the limit (later segments end up with length 0).
 *
 * On return *@iovp is NULL if the caller's fast buffer was used or an error
 * occurred, otherwise it points at the allocated array.
 *
 * Returns the total number of bytes described by the iterator, or a
 * negative error code (-EFAULT for an inaccessible segment, or the error
 * from iovec_from_user()).
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat)
{
        struct iovec *iov;
        ssize_t total_len = 0;
        unsigned long seg;

        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
        if (IS_ERR(iov)) {
                *iovp = NULL;
                return PTR_ERR(iov);
        }

        /*
         * According to the Single Unix Specification we should return EINVAL if
         * an element length is < 0 when cast to ssize_t or if the total length
         * would overflow the ssize_t return value of the system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        for (seg = 0; seg < nr_segs; seg++) {
                struct iovec *cur = &iov[seg];
                ssize_t len = (ssize_t)cur->iov_len;

                if (!access_ok(cur->iov_base, len))
                        goto fault;

                if (len > MAX_RW_COUNT - total_len) {
                        len = MAX_RW_COUNT - total_len;
                        cur->iov_len = len;
                }
                total_len += len;
        }

        iov_iter_init(i, type, iov, nr_segs, total_len);
        /* Tell the caller whether there is an allocation to free later. */
        *iovp = (iov == *iovp) ? NULL : iov;
        return total_len;

fault:
        if (iov != *iovp)
                kfree(iov);
        *iovp = NULL;
        return -EFAULT;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array pointed to by *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * The compat entry layout is selected according to in_compat_syscall().
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iovp, struct iov_iter *i)
{
        bool compat = in_compat_syscall();

        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, compat);
}
EXPORT_SYMBOL(import_iovec);

/*
 * Describe a single user buffer with @iov and initialise @i over it.
 *
 * @len is capped at MAX_RW_COUNT before the access_ok() check.  Returns 0
 * on success, or -EFAULT (leaving @iov and @i untouched) if the range
 * fails access_ok().
 */
int import_single_range(int rw, void __user *buf, size_t len,
                 struct iovec *iov, struct iov_iter *i)
{
        size_t capped = len > MAX_RW_COUNT ? MAX_RW_COUNT : len;

        if (unlikely(!access_ok(buf, capped)))
                return -EFAULT;

        iov->iov_len = capped;
        iov->iov_base = buf;
        iov_iter_init(i, rw, iov, 1, capped);
        return 0;
}
EXPORT_SYMBOL(import_single_range);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC. Any other
 * iterator type triggers a one-time warning and is left unmodified.
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
        /*
         * All three supported types must be tested inside the WARN:
         * checking ITER_KVEC outside of it would emit the warning for a
         * perfectly valid kvec iterator.
         */
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
                         !iov_iter_is_kvec(i)))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
         * need to track both of these, just one is enough and we can deduce
         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
         * size, so we can just adjust the iov pointer as they are unionized.
         * ITER_BVEC _may_ be the same size on some archs, but on others it is
         * not. Be safe and handle it separately.
         */
        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
        /* Step the segment pointer back by the number of consumed segments. */
        if (iov_iter_is_bvec(i))
                i->bvec -= state->nr_segs - i->nr_segs;
        else
                i->iov -= state->nr_segs - i->nr_segs;
        i->nr_segs = state->nr_segs;
}







































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_

#include <linux/static_key.h>
#include <linux/objtool.h>
#include <linux/linkage.h>

#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>

#define RETPOLINE_THUNK_SIZE        32

/*
 * Fill the CPU return stack buffer.
 *
 * Each entry in the RSB, if used for a speculative 'ret', contains an
 * infinite 'pause; lfence; jmp' loop to capture speculative execution.
 *
 * This is required in various cases for retpoline and IBRS-based
 * mitigations for the Spectre variant 2 vulnerability. Sometimes to
 * eliminate potentially bogus entries from the RSB, and sometimes
 * purely to ensure that it doesn't get empty, which on some CPUs would
 * allow predictions from other (unwanted!) sources to be used.
 *
 * We define a CPP macro such that it can be used from both .S files and
 * inline assembly. It's possible to do a .macro and then include that
 * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
 */

#define RSB_CLEAR_LOOPS                32        /* To forcibly overwrite all entries */

/*
 * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
 *
 * Emits one annotated intra-function CALL to the local label 772 that
 * immediately follows an int3.  The return address pushed by the CALL
 * therefore points at the int3.  The macro does not pop that return
 * address; users adjust the stack pointer themselves.
 */
#define __FILL_RETURN_SLOT                        \
        ANNOTATE_INTRA_FUNCTION_CALL;                \
        call        772f;                                \
        int3;                                        \
772:

/*
 * Stuff the entire RSB.
 *
 * Google experimented with loop-unrolling and this turned out to be
 * the optimal version — two calls, each with their own speculation
 * trap should their return address end up getting used, in a loop.
 *
 * @reg: scratch register used as the loop counter (64-bit variant only)
 * @nr:  number of return slots to emit
 *
 * The 64-bit variant loops nr/2 times over two __FILL_RETURN_SLOTs and
 * drops the two pushed return addresses from the stack on every iteration.
 */
#ifdef CONFIG_X86_64
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        mov        $(nr/2), reg;                                \
771:                                                        \
        __FILL_RETURN_SLOT                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8) * 2, %_ASM_SP;        \
        dec        reg;                                        \
        jnz        771b;                                        \
        /* barrier for jnz misprediction */                \
        lfence;
#else
/*
 * i386 doesn't unconditionally have LFENCE, as such it can't
 * do a loop.
 *
 * Instead the slot is repeated @nr times with .rept and all @nr pushed
 * return addresses are dropped with a single stack pointer adjustment.
 * @reg is not used by this variant.
 */
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        .rept nr;                                        \
        __FILL_RETURN_SLOT;                                \
        .endr;                                                \
        add        $(BITS_PER_LONG/8) * nr, %_ASM_SP;
#endif

/*
 * Stuff a single RSB slot.
 *
 * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
 * forced to retire before letting a RET instruction execute.
 *
 * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
 * before this point.
 *
 * Expands to one __FILL_RETURN_SLOT, a stack pointer adjustment that drops
 * the single pushed return address, and an LFENCE.
 */
#define __FILL_ONE_RETURN                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8), %_ASM_SP;                \
        lfence;

#ifdef __ASSEMBLY__

/*
 * This should be used immediately before an indirect jump/call. It tells
 * objtool the subsequent indirect jump/call is vouched safe for retpoline
 * builds.
 *
 * Implementation: a unique local label (via \@) is placed at the current
 * location and its address is recorded in the .discard.retpoline_safe
 * section.
 */
.macro ANNOTATE_RETPOLINE_SAFE
        .Lannotate_\@:
        .pushsection .discard.retpoline_safe
        _ASM_PTR .Lannotate_\@
        .popsection
.endm

/*
 * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
 * vs RETBleed validation.
 *
 * This is a plain alias: it expands to ANNOTATE_RETPOLINE_SAFE.
 */
#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE

/*
 * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
 * eventually turn into its own annotation.
 *
 * Expands to nothing unless CONFIG_CPU_UNRET_ENTRY or CONFIG_CPU_SRSO is
 * enabled.
 */
.macro ANNOTATE_UNRET_END
#if (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
        ANNOTATE_RETPOLINE_SAFE
        nop
#endif
.endm

/*
 * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
 * indirect jmp/call which may be susceptible to the Spectre variant 2
 * attack.
 *
 * @reg: name of the register holding the jump target, without the '%'
 *
 * With CONFIG_RETPOLINE the default instruction is an annotated plain
 * indirect jmp, with two alternatives: a jmp to __x86_indirect_thunk_<reg>
 * for X86_FEATURE_RETPOLINE, and an lfence followed by the annotated
 * indirect jmp for X86_FEATURE_RETPOLINE_LFENCE.  Without CONFIG_RETPOLINE
 * it is a plain indirect jmp.
 */
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
        ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
                      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
                      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
        jmp        *%\reg
#endif
.endm

/*
 * Indirect-call counterpart of JMP_NOSPEC.
 *
 * @reg: name of the register holding the call target, without the '%'
 *
 * With CONFIG_RETPOLINE the default instruction is an annotated plain
 * indirect call, with two alternatives: a call to
 * __x86_indirect_thunk_<reg> for X86_FEATURE_RETPOLINE, and an lfence
 * followed by the annotated indirect call for
 * X86_FEATURE_RETPOLINE_LFENCE.  Without CONFIG_RETPOLINE it is a plain
 * indirect call.
 */
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
        ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
                      __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
                      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
        call        *%\reg
#endif
.endm

 /*
  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
  * monstrosity above, manually.
  *
  * @reg:  scratch register passed on to __FILL_RETURN_BUFFER
  * @nr:   number of RSB slots to fill
  * @ftr:  feature bit selecting the full __FILL_RETURN_BUFFER sequence
  * @ftr2: feature bit selecting the single-slot __FILL_ONE_RETURN sequence;
  *        defaults to ALT_NOT(X86_FEATURE_ALWAYS)
  *
  * The default instruction is a jmp over the whole sequence to the
  * .Lskip_rsb label emitted at the end of the macro.
  */
.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
        ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
                __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
                __stringify(__FILL_ONE_RETURN), \ftr2

.Lskip_rsb_\@:
.endm

/*
 * The CALL to srso_alias_untrain_ret() must be patched in directly at
 * the spot where untraining must be done, ie., srso_alias_untrain_ret()
 * must be the target of a CALL instruction instead of indirectly
 * jumping to a wrapper which then calls it. Therefore, this macro is
 * called outside of __UNTRAIN_RET below, for the time being, before the
 * kernel can support nested alternatives with arbitrary nesting.
 *
 * Expands to nothing unless CONFIG_CPU_UNRET_ENTRY is enabled.  When it is,
 * the default is no instruction, with a call to entry_untrain_ret for
 * X86_FEATURE_UNRET and a call to srso_alias_untrain_ret for
 * X86_FEATURE_SRSO_ALIAS as alternatives.
 */
.macro CALL_UNTRAIN_RET
#ifdef CONFIG_CPU_UNRET_ENTRY
        ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
                          "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
.endm

/*
 * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
 * return thunk isn't mapped into the userspace tables (then again, AMD
 * typically has NO_MELTDOWN).
 *
 * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
 * entry_ibpb() will clobber AX, CX, DX.
 *
 * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
 * where we have a stack but before any RET instruction.
 *
 * Expands to nothing unless one of CONFIG_CPU_UNRET_ENTRY,
 * CONFIG_CPU_IBPB_ENTRY or CONFIG_CPU_SRSO is enabled.  Otherwise it emits
 * ANNOTATE_UNRET_END, CALL_UNTRAIN_RET and, as an alternative for
 * X86_FEATURE_ENTRY_IBPB, a call to entry_ibpb.
 */
.macro UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
        defined(CONFIG_CPU_SRSO)
        ANNOTATE_UNRET_END
        CALL_UNTRAIN_RET
        ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_ENTRY_IBPB
#endif
.endm

/*
 * Macro to execute VERW insns that mitigate transient data sampling
 * attacks such as MDS or TSA. On affected systems a microcode update
 * overloaded VERW insns to also clear the CPU buffers. VERW clobbers
 * CFLAGS.ZF.
 * Note: Only the memory operand variant of VERW clears the CPU buffers.
 *
 * @feature: feature bit that enables the VERW.  The default instruction is
 *           a jmp over the VERW to the .Lskip_verw label; the alternative
 *           for @feature is empty, so the VERW is reached.
 */
.macro __CLEAR_CPU_BUFFERS feature
        ALTERNATIVE "jmp .Lskip_verw_\@", "", \feature
#ifdef CONFIG_X86_64
        /* RIP-relative memory operand */
        verw x86_verw_sel(%rip)
#else
        /*
         * In 32bit mode, the memory operand must be a %cs reference. The data
         * segments may not be usable (vm86 mode), and the stack segment may not
         * be flat (ESPFIX32).
         */
        verw %cs:x86_verw_sel
#endif
.Lskip_verw_\@:
.endm

/* __CLEAR_CPU_BUFFERS keyed on X86_FEATURE_CLEAR_CPU_BUF. */
#define CLEAR_CPU_BUFFERS \
        __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF

/* __CLEAR_CPU_BUFFERS keyed on X86_FEATURE_CLEAR_CPU_BUF_VM. */
#define VM_CLEAR_CPU_BUFFERS \
        __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM

#else /* __ASSEMBLY__ */

/*
 * Inline-asm string fragment: defines local label 999 at the current
 * position and records that label's address (as an _ASM_PTR-sized entry)
 * in the .discard.retpoline_safe section.
 * NOTE(review): presumably consumed by build-time tooling to whitelist the
 * indirect branch that follows - confirm against the tool that reads it.
 */
#define ANNOTATE_RETPOLINE_SAFE                                        \
        "999:\n\t"                                                \
        ".pushsection .discard.retpoline_safe\n\t"                \
        _ASM_PTR " 999b\n\t"                                        \
        ".popsection\n\t"

/* Generic return thunk; collapses to an empty stub without CONFIG_RETHUNK. */
#ifndef CONFIG_RETHUNK
static inline void __x86_return_thunk(void)
{
}
#else
void __x86_return_thunk(void);
#endif

/* ITS return thunk; collapses to an empty stub without CONFIG_MITIGATION_ITS. */
#ifndef CONFIG_MITIGATION_ITS
static inline void its_return_thunk(void)
{
}
#else
void its_return_thunk(void);
#endif

/* Return thunk variants, defined outside this header. */
void retbleed_return_thunk(void);
void srso_return_thunk(void);
void srso_alias_return_thunk(void);

/* Matching untraining sequences. */
void retbleed_untrain_ret(void);
void srso_untrain_ret(void);
void srso_alias_untrain_ret(void);

/* Entry-path helpers called from the UNTRAIN_RET assembly macros. */
void entry_untrain_ret(void);
void entry_ibpb(void);

/* Pointer to the return thunk currently in use (a variable, so extern stays). */
extern void (*x86_return_thunk)(void);

#ifdef CONFIG_RETPOLINE

/* One retpoline thunk: a byte array of RETPOLINE_THUNK_SIZE bytes. */
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];

/* One ITS thunk: a byte array of ITS_THUNK_SIZE (64) bytes. */
#define ITS_THUNK_SIZE        64
typedef u8 its_thunk_t[ITS_THUNK_SIZE];

/* Array of ITS thunks; its length is not visible from this declaration. */
extern its_thunk_t         __x86_indirect_its_thunk_array[];

/*
 * Declare one __x86_indirect_thunk_<reg> symbol per GEN(reg) invocation.
 * NOTE(review): presumably <asm/GEN-for-each-reg.h> invokes GEN() once per
 * general purpose register - confirm in that header.
 */
#define GEN(reg) \
        extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

/* The same thunks viewed as an array of retpoline_thunk_t. */
extern retpoline_thunk_t __x86_indirect_thunk_array[];

#ifdef CONFIG_X86_64

/*
 * Inline asm uses the %V modifier which is only in newer GCC
 * which is ensured when CONFIG_RETPOLINE is defined.
 *
 * CALL_NOSPEC expands to one of three call sequences:
 *  - default: a plain indirect call through %[thunk_target], annotated
 *    with ANNOTATE_RETPOLINE_SAFE;
 *  - X86_FEATURE_RETPOLINE: a direct call to the
 *    __x86_indirect_thunk_<reg> matching the target register;
 *  - X86_FEATURE_RETPOLINE_LFENCE: LFENCE followed by the annotated
 *    indirect call.
 *
 * THUNK_TARGET(addr) supplies the [thunk_target] operand with an "r"
 * (register) constraint.
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "call __x86_indirect_thunk_%V[thunk_target]\n",                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "r" (addr)

#else /* CONFIG_X86_32 */
/*
 * For i386 we use the original ret-equivalent retpoline, because
 * otherwise we'll run out of registers. We don't care about CET
 * here, anyway.
 *
 * Layout of the X86_FEATURE_RETPOLINE sequence:
 *  - jump to 904, which calls 901 (pushing the address after the
 *    sequence as the return address);
 *  - 901 calls 903; the code at 902 directly behind that call is a
 *    pause/lfence loop that only jumps back to itself;
 *  - 903 drops the return address pushed by the call at 901, pushes
 *    %[thunk_target] and executes RET, which transfers to the target.
 *
 * The default and X86_FEATURE_RETPOLINE_LFENCE variants are a plain
 * annotated indirect call, without and with a leading LFENCE.
 * THUNK_TARGET(addr) allows a register or memory operand ("rm").
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "       jmp    904f;\n"                                        \
        "       .align 16\n"                                        \
        "901:        call   903f;\n"                                        \
        "902:        pause;\n"                                        \
        "            lfence;\n"                                        \
        "       jmp    902b;\n"                                        \
        "       .align 16\n"                                        \
        "903:        lea    4(%%esp), %%esp;\n"                        \
        "       pushl  %[thunk_target];\n"                        \
        "       ret;\n"                                                \
        "       .align 16\n"                                        \
        "904:        call   901b;\n",                                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif /* CONFIG_X86_64 */
#else /* No retpoline for C / inline asm */
/* Without CONFIG_RETPOLINE, CALL_NOSPEC is a plain indirect call. */
# define CALL_NOSPEC "call *%[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif /* CONFIG_RETPOLINE */

/*
 * The Spectre V2 mitigation variants.
 * Enumerators are numbered consecutively starting at 0.
 */
enum spectre_v2_mitigation {
        SPECTRE_V2_NONE                 = 0,
        SPECTRE_V2_RETPOLINE            = 1,
        SPECTRE_V2_LFENCE               = 2,
        SPECTRE_V2_EIBRS                = 3,
        SPECTRE_V2_EIBRS_RETPOLINE      = 4,
        SPECTRE_V2_EIBRS_LFENCE         = 5,
        SPECTRE_V2_IBRS                 = 6,
};

/*
 * The indirect branch speculation control variants.
 * Enumerators are numbered consecutively starting at 0.
 */
enum spectre_v2_user_mitigation {
        SPECTRE_V2_USER_NONE                    = 0,
        SPECTRE_V2_USER_STRICT                  = 1,
        SPECTRE_V2_USER_STRICT_PREFERRED        = 2,
        SPECTRE_V2_USER_PRCTL                   = 3,
        SPECTRE_V2_USER_SECCOMP                 = 4,
};

/*
 * The Speculative Store Bypass disable variants.
 * Enumerators are numbered consecutively starting at 0.
 */
enum ssb_mitigation {
        SPEC_STORE_BYPASS_NONE          = 0,
        SPEC_STORE_BYPASS_DISABLE       = 1,
        SPEC_STORE_BYPASS_PRCTL         = 2,
        SPEC_STORE_BYPASS_SECCOMP       = 3,
};

/*
 * Start/end markers for the indirect thunk code.
 * NOTE(review): presumably provided by the linker script - confirm.
 */
extern char __indirect_thunk_start[];
extern char __indirect_thunk_end[];

/*
 * Conditionally write @val to MSR @msr.
 *
 * The instruction slot is empty by default and becomes WRMSR when the
 * alternatives mechanism applies @feature, so @feature must be a
 * compile-time constant ("i" constraint). ECX carries the MSR index,
 * EAX the low 32 bits of @val and EDX the high 32 bits.
 */
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
        const u32 lo = (u32)val;
        const u32 hi = (u32)(val >> 32);

        asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
                     : /* no outputs */
                     : "c" (msr), "a" (lo), "d" (hi), [feature] "i" (feature)
                     : "memory");
}

/* Value written to MSR_IA32_PRED_CMD by indirect_branch_prediction_barrier(). */
extern u64 x86_pred_cmd;

/* Per-CPU boolean flag; NOTE(review): semantics are defined by its users, not visible here. */
DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user);

/*
 * Write x86_pred_cmd to MSR_IA32_PRED_CMD, gated on X86_FEATURE_USE_IBPB
 * through alternative_msr_write().
 */
static inline void indirect_branch_prediction_barrier(void)
{
        alternative_msr_write(MSR_IA32_PRED_CMD,
                              x86_pred_cmd,
                              X86_FEATURE_USE_IBPB);
}

/* The Intel SPEC CTRL MSR base value cache */
extern u64 x86_spec_ctrl_base;
/* Per-CPU u64; NOTE(review): presumably the value returned by spec_ctrl_current() - confirm. */
DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
extern void update_spec_ctrl_cond(u64 val);
/* Returns the SPEC_CTRL value used by the firmware_restrict_branch_speculation_*() macros. */
extern u64 spec_ctrl_current(void);

/*
 * With retpoline, we must use IBRS to restrict branch prediction
 * before calling into firmware.
 *
 * (Implemented as CPP macros due to header hell.)
 *
 * The two macros must be used as a pair around the firmware call:
 *  - _start() disables preemption, writes spec_ctrl_current() with
 *    SPEC_CTRL_IBRS set to MSR_IA32_SPEC_CTRL (if X86_FEATURE_USE_IBRS_FW)
 *    and writes PRED_CMD_IBPB to MSR_IA32_PRED_CMD
 *    (if X86_FEATURE_USE_IBPB_FW);
 *  - _end() writes spec_ctrl_current() back to MSR_IA32_SPEC_CTRL
 *    (if X86_FEATURE_USE_IBRS_FW) and re-enables preemption.
 */
#define firmware_restrict_branch_speculation_start()                        \
do {                                                                        \
        preempt_disable();                                                \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current() | SPEC_CTRL_IBRS,        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,                \
                              X86_FEATURE_USE_IBPB_FW);                        \
} while (0)

#define firmware_restrict_branch_speculation_end()                        \
do {                                                                        \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current(),                        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        preempt_enable();                                                \
} while (0)

/* Static keys, all declared default-false; they are defined and toggled outside this header. */
DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);

/* Tested by x86_idle_clear_cpu_buffers() below. */
DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);

DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);

/* Selector used as the VERW memory operand by the __CLEAR_CPU_BUFFERS assembly macro. */
extern u16 x86_verw_sel;

/* Needed for __KERNEL_DS used below. */
#include <asm/segment.h>

/**
 * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns
 *
 * This uses the otherwise unused and obsolete VERW instruction in
 * combination with microcode which triggers a CPU buffer flush when the
 * instruction is executed.
 *
 * Takes no arguments and returns nothing; the only architectural effect
 * declared to the compiler is the flags clobber.
 */
static __always_inline void x86_clear_cpu_buffers(void)
{
        /* Kept in memory so that VERW can be given a memory operand. */
        static const u16 ds = __KERNEL_DS;

        /*
         * Has to be the memory-operand variant because only that
         * guarantees the CPU buffer flush functionality according to
         * documentation. The register-operand variant does not.
         * Works with any segment selector, but a valid writable
         * data segment is the fastest variant.
         *
         * "cc" clobber is required because VERW modifies ZF.
         */
        asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}

/**
 * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
 * and TSA vulnerabilities.
 *
 * Clear CPU buffers if the corresponding static key is enabled
 */
static __always_inline void x86_idle_clear_cpu_buffers(void)
{
        /* cpu_buf_idle_clear is a default-false static key; nothing happens while it is off. */
        if (static_branch_likely(&cpu_buf_idle_clear))
                x86_clear_cpu_buffers();
}

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SOCKET_H
#define _LINUX_SOCKET_H


#include <asm/socket.h>                        /* arch-dependent defines        */
#include <linux/sockios.h>                /* the SIOCxxx I/O controls        */
#include <linux/uio.h>                        /* iovec support                */
#include <linux/types.h>                /* pid_t                        */
#include <linux/compiler.h>                /* __user                        */
#include <uapi/linux/socket.h>

struct file;
struct pid;
struct cred;
struct socket;

/*
 * Build-time check: fails the build if @size is larger than
 * struct __kernel_sockaddr_storage.
 */
#define __sockaddr_check_size(size)        \
        BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))

#ifdef CONFIG_PROC_FS
struct seq_file;
/* Only declared when CONFIG_PROC_FS is enabled; writes to the given seq_file. */
extern void socket_seq_show(struct seq_file *seq);
#endif /* CONFIG_PROC_FS */

/* Kernel-internal name for the address family type. */
typedef __kernel_sa_family_t        sa_family_t;

/*
 *        1003.1g requires sa_family_t and that sa_data is char.
 *
 *        The protocol address is a union of a fixed 14-byte minimum
 *        (sa_data_min) and a flexible array (sa_data) that starts at the
 *        same offset, so sizeof(struct sockaddr) still accounts for
 *        14 address bytes.
 */

struct sockaddr {
        sa_family_t        sa_family;        /* address family, AF_xxx        */
        union {
                char sa_data_min[14];                /* Minimum 14 bytes of protocol address        */
                DECLARE_FLEX_ARRAY(char, sa_data);        /* same bytes, without a fixed bound */
        };
};

/*
 * Linger settings: an on/off switch plus a linger time.
 * NOTE(review): presumably the argument of the SO_LINGER socket option,
 * with l_linger in seconds - confirm against the option handler.
 */
struct linger {
        int                l_onoff;        /* Linger active                */
        int                l_linger;        /* How long to linger for        */
};

#define sockaddr_storage __kernel_sockaddr_storage

/*
 *        As we do 4.4BSD message passing we use a 4.4BSD message passing
 *        system, not 4.3. Thus msg_accrights(len) are now missing. They
 *        belong in an obscure libc emulation or the bin.
 */

/*
 * Message header as used inside the kernel: the payload is described by
 * an iov_iter rather than by a raw iovec array (compare struct
 * user_msghdr, which carries the __user pointers).
 */
struct msghdr {
        void                *msg_name;        /* ptr to socket address structure */
        int                msg_namelen;        /* size of socket address structure */
        struct iov_iter        msg_iter;        /* data */

        /*
         * Ancillary data. msg_control_user is the user buffer used for the
         * recv* side when msg_control_is_user is set, msg_control is the kernel
         * buffer used for all other cases.
         */
        union {
                void                *msg_control;
                void __user        *msg_control_user;
        };
        bool                msg_control_is_user : 1;        /* selects the valid union member above */
        __kernel_size_t        msg_controllen;        /* ancillary data buffer length */
        unsigned int        msg_flags;        /* flags on received message */
        struct kiocb        *msg_iocb;        /* ptr to iocb for async requests */
};

/*
 * Message header with __user pointers and a raw iovec array; this is the
 * type taken by the __sys_sendmsg()/__sys_recvmsg() prototypes below.
 */
struct user_msghdr {
        void                __user *msg_name;        /* ptr to socket address structure */
        int                msg_namelen;                /* size of socket address structure */
        struct iovec        __user *msg_iov;        /* scatter/gather array */
        __kernel_size_t        msg_iovlen;                /* # elements in msg_iov */
        void                __user *msg_control;        /* ancillary data */
        __kernel_size_t        msg_controllen;                /* ancillary data buffer length */
        unsigned int        msg_flags;                /* flags on received message */
};

/* For recvmmsg/sendmmsg: one entry of the message vector. */
struct mmsghdr {
        struct user_msghdr  msg_hdr;        /* the message itself */
        unsigned int        msg_len;        /* NOTE(review): presumably bytes transferred for this entry - confirm */
};

/*
 *        POSIX 1003.1g - ancillary data object information
 *        Ancillary data consists of a sequence of pairs of
 *        (cmsghdr, cmsg_data[])
 */

struct cmsghdr {
        __kernel_size_t        cmsg_len;        /* data byte count, including hdr */
        int                cmsg_level;        /* originating protocol */
        int                cmsg_type;        /* protocol-specific type */
};

/*
 *        Ancillary data object information MACROS
 *        Table 5-14 of POSIX 1003.1g
 */

/* Thin macro wrappers around the inline functions defined further down. */
#define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg))
#define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg))

/* Round @len up to the next multiple of sizeof(long). */
#define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) )

/* Payload pointer: the byte directly after the cmsghdr (kernel / __user flavours). */
#define CMSG_DATA(cmsg) \
        ((void *)(cmsg) + sizeof(struct cmsghdr))
#define CMSG_USER_DATA(cmsg) \
        ((void __user *)(cmsg) + sizeof(struct cmsghdr))
/* Header plus payload rounded up by CMSG_ALIGN(). */
#define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len))
/* Header plus unpadded payload: the value stored in cmsg_len. */
#define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len))

/* First header in a control buffer, or NULL if the buffer cannot hold one header. */
#define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \
                                  (struct cmsghdr *)(ctl) : \
                                  (struct cmsghdr *)NULL)
#define CMSG_FIRSTHDR(msg)        __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen)
/*
 * True if cmsg_len covers at least the header and does not reach past
 * the end of the control buffer, measured from @cmsg's offset in it.
 */
#define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \
                             (cmsg)->cmsg_len <= (unsigned long) \
                             ((mhdr)->msg_controllen - \
                              ((char *)(cmsg) - (char *)(mhdr)->msg_control)))
/* Walk every control message header of @msg; @cmsg is the loop cursor. */
#define for_each_cmsghdr(cmsg, msg) \
        for (cmsg = CMSG_FIRSTHDR(msg); \
             cmsg; \
             cmsg = CMSG_NXTHDR(msg, cmsg))

/*
 *        Get the next cmsg header
 *
 *        PLEASE, do not touch this function. If you think, that it is
 *        incorrect, grep kernel sources and think about consequences
 *        before trying to improve it.
 *
 *        Now it always returns valid, not truncated ancillary object
 *        HEADER. But caller still MUST check, that cmsg->cmsg_len is
 *        inside range, given by msg->msg_controllen before using
 *        ancillary object DATA.                                --ANK (980731)
 */

static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size,
                                               struct cmsghdr *__cmsg)
{
        /* Step over the current object, padded up by CMSG_ALIGN(). */
        unsigned char *__next = (unsigned char *) __cmsg + CMSG_ALIGN(__cmsg->cmsg_len);
        struct cmsghdr *__hdr = (struct cmsghdr *) __next;

        /* Hand out the candidate only if its complete header ends within __size bytes of __ctl. */
        if ((unsigned long)((char *)(__hdr + 1) - (char *) __ctl) <= __size)
                return __hdr;

        return (struct cmsghdr *)0;
}

/* Next control message header after @__cmsg in @__msg's control buffer, or NULL. */
static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg)
{
        void *__ctl = __msg->msg_control;

        return __cmsg_nxthdr(__ctl, __msg->msg_controllen, __cmsg);
}

/* Byte count reported by iov_iter_count() for @msg's data iterator. */
static inline size_t msg_data_left(struct msghdr *msg)
{
        struct iov_iter *iter = &msg->msg_iter;

        return iov_iter_count(iter);
}

/* "Socket"-level control message types: */

#define        SCM_RIGHTS        0x01                /* rw: access rights (array of int) */
#define SCM_CREDENTIALS 0x02                /* rw: struct ucred                */
#define SCM_SECURITY        0x03                /* rw: security label                */

/* Payload of an SCM_CREDENTIALS control message: three 32-bit ids. */
struct ucred {
        __u32        pid;
        __u32        uid;
        __u32        gid;
};

/*
 * Supported address families.
 *
 * Every AF_x has a PF_x alias further down; keep both lists in step.
 * AF_MAX is one more than the highest assigned family (AF_XDP, 44).
 */
#define AF_UNSPEC        0
#define AF_UNIX                1        /* Unix domain sockets                 */
#define AF_LOCAL        1        /* POSIX name for AF_UNIX        */
#define AF_INET                2        /* Internet IP Protocol         */
#define AF_AX25                3        /* Amateur Radio AX.25                 */
#define AF_IPX                4        /* Novell IPX                         */
#define AF_APPLETALK        5        /* AppleTalk DDP                 */
#define AF_NETROM        6        /* Amateur Radio NET/ROM         */
#define AF_BRIDGE        7        /* Multiprotocol bridge         */
#define AF_ATMPVC        8        /* ATM PVCs                        */
#define AF_X25                9        /* Reserved for X.25 project         */
#define AF_INET6        10        /* IP version 6                        */
#define AF_ROSE                11        /* Amateur Radio X.25 PLP        */
#define AF_DECnet        12        /* Reserved for DECnet project        */
#define AF_NETBEUI        13        /* Reserved for 802.2LLC project*/
#define AF_SECURITY        14        /* Security callback pseudo AF */
#define AF_KEY                15      /* PF_KEY key management API */
#define AF_NETLINK        16
#define AF_ROUTE        AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET        17        /* Packet family                */
#define AF_ASH                18        /* Ash                                */
#define AF_ECONET        19        /* Acorn Econet                        */
#define AF_ATMSVC        20        /* ATM SVCs                        */
#define AF_RDS                21        /* RDS sockets                         */
#define AF_SNA                22        /* Linux SNA Project (nutters!) */
#define AF_IRDA                23        /* IRDA sockets                        */
#define AF_PPPOX        24        /* PPPoX sockets                */
#define AF_WANPIPE        25        /* Wanpipe API Sockets */
#define AF_LLC                26        /* Linux LLC                        */
#define AF_IB                27        /* Native InfiniBand address        */
#define AF_MPLS                28        /* MPLS */
#define AF_CAN                29        /* Controller Area Network      */
#define AF_TIPC                30        /* TIPC sockets                        */
#define AF_BLUETOOTH        31        /* Bluetooth sockets                 */
#define AF_IUCV                32        /* IUCV sockets                        */
#define AF_RXRPC        33        /* RxRPC sockets                 */
#define AF_ISDN                34        /* mISDN sockets                 */
#define AF_PHONET        35        /* Phonet sockets                */
#define AF_IEEE802154        36        /* IEEE802154 sockets                */
#define AF_CAIF                37        /* CAIF sockets                        */
#define AF_ALG                38        /* Algorithm sockets                */
#define AF_NFC                39        /* NFC sockets                        */
#define AF_VSOCK        40        /* vSockets                        */
#define AF_KCM                41        /* Kernel Connection Multiplexor*/
#define AF_QIPCRTR        42        /* Qualcomm IPC Router          */
#define AF_SMC                43        /* smc sockets: reserve number for
                                 * PF_SMC protocol family that
                                 * reuses AF_INET address family
                                 */
#define AF_XDP                44        /* XDP sockets                        */

#define AF_MAX                45        /* For now.. */

/*
 * Protocol families, same as address families.
 * Each PF_x is a plain alias of the AF_x of the same name, so the two
 * spellings are interchangeable by value.
 */
#define PF_UNSPEC        AF_UNSPEC
#define PF_UNIX                AF_UNIX
#define PF_LOCAL        AF_LOCAL
#define PF_INET                AF_INET
#define PF_AX25                AF_AX25
#define PF_IPX                AF_IPX
#define PF_APPLETALK        AF_APPLETALK
#define        PF_NETROM        AF_NETROM
#define PF_BRIDGE        AF_BRIDGE
#define PF_ATMPVC        AF_ATMPVC
#define PF_X25                AF_X25
#define PF_INET6        AF_INET6
#define PF_ROSE                AF_ROSE
#define PF_DECnet        AF_DECnet
#define PF_NETBEUI        AF_NETBEUI
#define PF_SECURITY        AF_SECURITY
#define PF_KEY                AF_KEY
#define PF_NETLINK        AF_NETLINK
#define PF_ROUTE        AF_ROUTE
#define PF_PACKET        AF_PACKET
#define PF_ASH                AF_ASH
#define PF_ECONET        AF_ECONET
#define PF_ATMSVC        AF_ATMSVC
#define PF_RDS                AF_RDS
#define PF_SNA                AF_SNA
#define PF_IRDA                AF_IRDA
#define PF_PPPOX        AF_PPPOX
#define PF_WANPIPE        AF_WANPIPE
#define PF_LLC                AF_LLC
#define PF_IB                AF_IB
#define PF_MPLS                AF_MPLS
#define PF_CAN                AF_CAN
#define PF_TIPC                AF_TIPC
#define PF_BLUETOOTH        AF_BLUETOOTH
#define PF_IUCV                AF_IUCV
#define PF_RXRPC        AF_RXRPC
#define PF_ISDN                AF_ISDN
#define PF_PHONET        AF_PHONET
#define PF_IEEE802154        AF_IEEE802154
#define PF_CAIF                AF_CAIF
#define PF_ALG                AF_ALG
#define PF_NFC                AF_NFC
#define PF_VSOCK        AF_VSOCK
#define PF_KCM                AF_KCM
#define PF_QIPCRTR        AF_QIPCRTR
#define PF_SMC                AF_SMC
#define PF_XDP                AF_XDP
#define PF_MAX                AF_MAX

/* Maximum queue length specifiable by listen.  */
#define SOMAXCONN        4096

/* Flags we can use with send/ and recv.
   Those added for 1003.1g are not all supported yet.

   Two values are deliberately shared: MSG_TRYHARD equals MSG_DONTROUTE,
   and MSG_SENDPAGE_NOPOLICY equals MSG_WAITFORONE (0x10000).
 */

#define MSG_OOB                1
#define MSG_PEEK        2
#define MSG_DONTROUTE        4
#define MSG_TRYHARD     4       /* Synonym for MSG_DONTROUTE for DECnet */
#define MSG_CTRUNC        8
#define MSG_PROBE        0x10        /* Do not send. Only probe path, e.g. for MTU */
#define MSG_TRUNC        0x20
#define MSG_DONTWAIT        0x40        /* Nonblocking io                 */
#define MSG_EOR         0x80        /* End of record */
#define MSG_WAITALL        0x100        /* Wait for a full request */
#define MSG_FIN         0x200
#define MSG_SYN                0x400
#define MSG_CONFIRM        0x800        /* Confirm path validity */
#define MSG_RST                0x1000
#define MSG_ERRQUEUE        0x2000        /* Fetch message from error queue */
#define MSG_NOSIGNAL        0x4000        /* Do not generate SIGPIPE */
#define MSG_MORE        0x8000        /* Sender will send more */
#define MSG_WAITFORONE        0x10000        /* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do not apply policy */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
#define MSG_BATCH        0x40000 /* sendmmsg(): more messages coming */
#define MSG_EOF         MSG_FIN
#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
#define MSG_SENDPAGE_DECRYPTED        0x100000 /* sendpage() internal : page may carry
                                          * plain text and require encryption
                                          */

#define MSG_ZEROCOPY        0x4000000        /* Use user data in kernel path */
#define MSG_FASTOPEN        0x20000000        /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000        /* Set close_on_exec for file
                                           descriptor received through
                                           SCM_RIGHTS */
/* MSG_CMSG_COMPAT is only a real flag bit with CONFIG_COMPAT; otherwise it is 0. */
#if defined(CONFIG_COMPAT)
#define MSG_CMSG_COMPAT        0x80000000        /* This message needs 32 bit fixups */
#else
#define MSG_CMSG_COMPAT        0                /* We never have 32 bit fixups */
#endif


/*
 * Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx.
 * Levels from SOL_IPX (256) upwards are numbered consecutively.
 */
#define SOL_IP                0
/* #define SOL_ICMP        1        No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */
#define SOL_TCP                6
#define SOL_UDP                17
#define SOL_IPV6        41
#define SOL_ICMPV6        58
#define SOL_SCTP        132
#define SOL_UDPLITE        136     /* UDP-Lite (RFC 3828) */
#define SOL_RAW                255
#define SOL_IPX                256
#define SOL_AX25        257
#define SOL_ATALK        258
#define SOL_NETROM        259
#define SOL_ROSE        260
#define SOL_DECNET        261
#define        SOL_X25                262
#define SOL_PACKET        263
#define SOL_ATM                264        /* ATM layer (cell level) */
#define SOL_AAL                265        /* ATM Adaptation Layer (packet level) */
#define SOL_IRDA        266
#define SOL_NETBEUI        267
#define SOL_LLC                268
#define SOL_DCCP        269
#define SOL_NETLINK        270
#define SOL_TIPC        271
#define SOL_RXRPC        272
#define SOL_PPPOL2TP        273
#define SOL_BLUETOOTH        274
#define SOL_PNPIPE        275
#define SOL_RDS                276
#define SOL_IUCV        277
#define SOL_CAIF        278
#define SOL_ALG                279
#define SOL_NFC                280
#define SOL_KCM                281
#define SOL_TLS                282
#define SOL_XDP                283

/* IPX options */
#define IPX_TYPE        1

/*
 * Address and ancillary-data helpers, defined outside this header.
 * NOTE(review): exact semantics live with the definitions; only the
 * signatures are visible here.
 */
int move_addr_to_kernel(void __user *uaddr, int ulen,
                        struct sockaddr_storage *kaddr);
int put_cmsg(struct msghdr *, int level, int type, int len, void *data);

struct timespec64;
struct __kernel_timespec;
struct old_timespec32;

/* Kernel-internal timestamp set: three timespec64 values. */
struct scm_timestamping_internal {
        struct timespec64 ts[3];
};

/* Attach the timestamps in @tss to @msg; two variants, the first with a "64" suffix. */
void put_cmsg_scm_timestamping64(struct msghdr *msg,
                                 struct scm_timestamping_internal *tss);
void put_cmsg_scm_timestamping(struct msghdr *msg,
                               struct scm_timestamping_internal *tss);

/*
 * The __sys_...msg variants accept MSG_CMSG_COMPAT if and only if
 * forbid_cmsg_compat is false.
 */
long __sys_recvmsg(int fd, struct user_msghdr __user *msg,
                   unsigned int flags, bool forbid_cmsg_compat);
long __sys_sendmsg(int fd, struct user_msghdr __user *msg,
                   unsigned int flags, bool forbid_cmsg_compat);
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                   unsigned int vlen, unsigned int flags,
                   struct __kernel_timespec __user *timeout,
                   struct old_timespec32 __user *timeout32);
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
                   unsigned int vlen, unsigned int flags,
                   bool forbid_cmsg_compat);

/* Variants that operate on an already resolved struct socket. */
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
                        unsigned int flags);
long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        struct sockaddr __user *uaddr,
                        unsigned int flags);

/* Helpers that fill a kernel struct msghdr from a user_msghdr. */
int sendmsg_copy_msghdr(struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        unsigned int flags,
                        struct iovec **iov);
int recvmsg_copy_msghdr(struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        unsigned int flags,
                        struct sockaddr __user **uaddr,
                        struct iovec **iov);
int __copy_msghdr_from_user(struct msghdr *kmsg,
                            struct user_msghdr __user *umsg,
                            struct sockaddr __user **save_addr,
                            struct iovec __user **uiov,
                            size_t *nsegs);

/* Helpers which do the actual work for the socket syscalls. */
int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
                   unsigned int flags, struct sockaddr __user *addr,
                   int __user *addr_len);
int __sys_sendto(int fd, void __user *buff, size_t len,
                 unsigned int flags, struct sockaddr __user *addr,
                 int addr_len);

/* accept() family: by struct file, returning a struct file, and by fd. */
int __sys_accept4_file(struct file *file, unsigned int file_flags,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags,
                       unsigned long nofile);
struct file *do_accept(struct file *file, unsigned int file_flags,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags);
int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
                  int __user *upeer_addrlen, int flags);

/* Socket creation, naming and connection set-up. */
int __sys_socket(int family, int type, int protocol);
int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
                       int addrlen, int file_flags);
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen);
int __sys_listen(int fd, int backlog);

/* Address queries. */
int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len);
int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len);

/* Socket pairs and shutdown. */
int __sys_socketpair(int family, int type, int protocol,
                     int __user *usockvec);
int __sys_shutdown_sock(struct socket *sock, int how);
int __sys_shutdown(int fd, int how);
#endif /* _LINUX_SOCKET_H */





































































































    2 


















    1 




































    9 




























    1 








    1 





    1 



    1 















    1 



































    4 










































    4 







































    4 




    4 

















    4 



    4 























    1 












    1 
























































    1 



















































































































    4 
    4 





















    2 


















    2 































    1 





























































































    1 







    1 




    1 








    1 
    1 








    1 
    1 



























    1 
    1 













































    1 









    1 

    1 

































    1 






    1 

    1 

    1 

    1 











    1 

































    1 


    1 



    1 


    1 

    1 






    1 








































































































































































    4 



    4 











    4 



















































    4 




    4 






    9 

    9 
































































































































































































    1 




    1 




    1 











    1 







































































































































































































































































































































































































































































































































































































































    1 




    1 

    1 


    1 
    1 

    1 


    1 
    1 






















































































    1 
    1 




    1 












    1 




    1 







    1 
































    1 





    1 

    1 







    1 
    1 




























    1 


    1 



    1 







































































































































































































































































































    1 





















































































































































































































































































































    1 













    1 



    1 


    1 
    1 

















    1 







    1 





    1 






    1 


















    1 


    1 



    1 










    1 








    1 




    1 


    1 
    1 


    1 

    1 

    1 








































































































































































































    1 















    1 






    1 









    1 























    1 




    1 

    1 


    1 
    1 



    1 



    1 



    1 

    1 

    1 

    1 

    1 

    1 

    1 
    1 
    1 



    1 






    1 










    1 

    1 

    1 

    1 














    1 
    1 
























































































































































































    1 

















    1 








    1 

    1 








    1 








    1 





    1 








    1 

    1 

    1 

    1 

    1 
    1 
    1 


    1 






    1 





    1 




    1 



































































































































































































































































































































































































































































































































    1 


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>                /* init_rootfs */
#include <linux/fs_struct.h>        /* get_fs_root et.al. */
#include <linux/fsnotify.h>        /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

/*
 * Mask and shift used by m_hash() to fold a (vfsmount, dentry) pointer
 * pair into an index into mount_hashtable.
 */
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
/*
 * Mask and shift used by mp_hash() to fold a dentry pointer into an
 * index into mountpoint_hashtable.
 */
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

/* Number parsed from the "mhash_entries=" boot parameter. */
static __initdata unsigned long mhash_entries;

/*
 * Handler for the "mhash_entries=" boot parameter.
 *
 * @str: text following "mhash_entries=" on the command line.
 *
 * Parses @str with simple_strtoul() (base 0) into mhash_entries and
 * returns 1.  A NULL @str leaves mhash_entries untouched and returns 0.
 */
static int __init set_mhash_entries(char *str)
{
        if (str) {
                mhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("mhash_entries=", set_mhash_entries);

/* Number parsed from the "mphash_entries=" boot parameter. */
static __initdata unsigned long mphash_entries;

/*
 * Handler for the "mphash_entries=" boot parameter.
 *
 * @str: text following "mphash_entries=" on the command line.
 *
 * Parses @str with simple_strtoul() (base 0) into mphash_entries and
 * returns 1.  A NULL @str leaves mphash_entries untouched and returns 0.
 */
static int __init set_mphash_entries(char *str)
{
        if (str) {
                mphash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("mphash_entries=", set_mphash_entries);

/* String supplied with the "initramfs_options=" boot parameter, if any. */
static char * __initdata initramfs_options;

/*
 * Handler for the "initramfs_options=" boot parameter.
 *
 * @str: text following "initramfs_options=" on the command line.
 *
 * Only the pointer is recorded; the string itself is not copied.
 * Always returns 1.
 */
static int __init initramfs_options_setup(char *str)
{
        initramfs_options = str;

        return 1;
}
__setup("initramfs_options=", initramfs_options_setup);

static u64 event;
/* ID allocator behind mnt_alloc_id()/mnt_free_id() (mount->mnt_id). */
static DEFINE_IDA(mnt_id_ida);
/* ID allocator for peer group IDs (mount->mnt_group_id); IDs start at 1. */
static DEFINE_IDA(mnt_group_ida);

/* Array of hash chain heads indexed by m_hash(). */
static struct hlist_head *mount_hashtable __read_mostly;
/* Array of hash chain heads indexed by mp_hash(). */
static struct hlist_head *mountpoint_hashtable __read_mostly;
/* Slab cache from which alloc_vfsmnt() allocates struct mount. */
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);        /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/*
 * Map a (vfsmount, dentry) pair to its chain head in mount_hashtable.
 *
 * Both pointers are scaled down by L1_CACHE_BYTES and summed; the sum is
 * then mixed with itself shifted right by m_hash_shift and masked with
 * m_hash_mask to obtain the table index.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)mnt / L1_CACHE_BYTES +
              (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> m_hash_shift;

        return mount_hashtable + (key & m_hash_mask);
}

/*
 * Map a dentry to its chain head in mountpoint_hashtable.
 *
 * The pointer is scaled down by L1_CACHE_BYTES, mixed with itself shifted
 * right by mp_hash_shift and masked with mp_hash_mask.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long key = (unsigned long)dentry / L1_CACHE_BYTES;

        key += key >> mp_hash_shift;

        return mountpoint_hashtable + (key & mp_hash_mask);
}

/*
 * Allocate a mount ID from mnt_id_ida and store it in @mnt->mnt_id.
 *
 * Returns 0 on success, or the negative value returned by ida_alloc()
 * on failure (in which case @mnt->mnt_id is not written).
 */
static int mnt_alloc_id(struct mount *mnt)
{
        int id = ida_alloc(&mnt_id_ida, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_id = id;
                return 0;
        }

        return id;
}

/*
 * Hand @mnt->mnt_id back to mnt_id_ida.  The field itself is left as is.
 */
static void mnt_free_id(struct mount *mnt)
{
        ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID
 *
 * The ID is taken from mnt_group_ida with a minimum of 1 and stored in
 * @mnt->mnt_group_id.  Returns 0 on success, or the negative value
 * returned by ida_alloc_min() on failure (the field is then not written).
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        int id = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_group_id = id;
                return 0;
        }

        return id;
}

/*
 * Release a peer group ID
 *
 * Returns @mnt->mnt_group_id to mnt_group_ida and then resets the field
 * to 0.
 */
void mnt_release_group_id(struct mount *mnt)
{
        /* Free first: the ID is read from the field that is cleared below. */
        ida_free(&mnt_group_ida, mnt->mnt_group_id);

        mnt->mnt_group_id = 0;
}

/*
 * Add @n to the reference count of @mnt.
 *
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifndef CONFIG_SMP
        /* Single counter embedded in the mount; update with preemption off. */
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#else
        /* Per-CPU counter: only this CPU's slot is touched. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#endif
}

/*
 * Return the current reference count of @mnt.
 *
 * With CONFIG_SMP the count is the sum of the per-CPU mnt_count slots
 * over all possible CPUs; otherwise it is the single mnt_count field.
 *
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt)
{
#ifndef CONFIG_SMP
        return mnt->mnt_count;
#else
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;

        return total;
#endif
}

/*
 * Allocate and initialise a struct mount.
 *
 * @name: device name to record in mnt_devname; may be NULL, in which case
 *        mnt_devname is left NULL.
 *
 * The object comes zeroed from mnt_cache, gets a mount ID, an optional
 * copy of @name, a reference count of 1 and initialised list/hlist links.
 *
 * Returns the new mount, or NULL if any allocation step fails (everything
 * acquired up to that point is released again).
 */
static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt;

        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (!mnt)
                return NULL;

        if (mnt_alloc_id(mnt))
                goto out_free_cache;

        if (name) {
                mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }

#ifdef CONFIG_SMP
        mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
        if (!mnt->mnt_pcp)
                goto out_free_devname;

        /* Initial reference, accounted on the current CPU's slot. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
        mnt->mnt_count = 1;
        mnt->mnt_writers = 0;
#endif

        INIT_HLIST_NODE(&mnt->mnt_hash);
        INIT_LIST_HEAD(&mnt->mnt_child);
        INIT_LIST_HEAD(&mnt->mnt_mounts);
        INIT_LIST_HEAD(&mnt->mnt_list);
        INIT_LIST_HEAD(&mnt->mnt_expire);
        INIT_LIST_HEAD(&mnt->mnt_share);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        INIT_LIST_HEAD(&mnt->mnt_slave);
        INIT_HLIST_NODE(&mnt->mnt_mp_list);
        INIT_LIST_HEAD(&mnt->mnt_umounting);
        INIT_HLIST_HEAD(&mnt->mnt_stuck_children);

        return mnt;

        /* Error unwinding: release in reverse order of acquisition. */
#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/*
 * Bump the writer count of @mnt by one: this CPU's per-CPU slot with
 * CONFIG_SMP, the single mnt_writers field otherwise.
 */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifndef CONFIG_SMP
        mnt->mnt_writers++;
#else
        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#endif
}

/*
 * Drop the writer count of @mnt by one: this CPU's per-CPU slot with
 * CONFIG_SMP, the single mnt_writers field otherwise.
 */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifndef CONFIG_SMP
        mnt->mnt_writers--;
#else
        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#endif
}

/*
 * Return the writer count of @mnt.
 *
 * With CONFIG_SMP this is the sum of the per-CPU mnt_writers slots over
 * all possible CPUs; otherwise it is the single mnt_writers field.
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifndef CONFIG_SMP
        return mnt->mnt_writers;
#else
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;

        return total;
#endif
}

/*
 * Read-only check that also honours s_readonly_remount.  Returns 1 as soon
 * as the superblock has s_readonly_remount set; otherwise returns the result
 * of __mnt_is_readonly() after a read barrier.
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
        /* sb_prepare_remount_readonly() sets this once it has found no writers. */
        if (mnt->mnt_sb->s_readonly_remount)
                return 1;
        /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        /* Preemption stays off from the counter increment to the final check. */
        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        /* Busy-wait for as long as some slowpath holds writers off. */
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
                cpu_relax();
        /*
         * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
         * be set to match its requirements. So we must not load that until
         * MNT_WRITE_HOLD is cleared.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                /* Read-only: undo the increment taken above and fail. */
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        /* 0 with the writer count held, or -EROFS with it released. */
        return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
        int err;

        /* Freeze protection first, then the per-mount writer count. */
        sb_start_write(m->mnt_sb);
        err = __mnt_want_write(m);
        if (!err)
                return 0;

        /* No write access granted: give the freeze protection back. */
        sb_end_write(m->mnt_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        /* superblock may be r/o */
        if (__mnt_is_readonly(mnt))
                return -EROFS;

        /* Only the counter is bumped; no MNT_WRITE_HOLD wait on this path. */
        preempt_disable();
        mnt_inc_writers(m);
        preempt_enable();

        return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
        struct vfsmount *m = file->f_path.mnt;

        /* FMODE_WRITER set: take the cheaper extra-reference path. */
        if (file->f_mode & FMODE_WRITER)
                return mnt_clone_write(m);

        return __mnt_want_write(m);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
        int err;

        /* Freeze protection is taken on the superblock of the file's inode. */
        sb_start_write(file_inode(file)->i_sb);
        err = __mnt_want_write_file(file);
        if (!err)
                return 0;

        /* Failure: release the freeze protection again. */
        sb_end_write(file_inode(file)->i_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
        /* The counter update is done with preemption disabled. */
        preempt_disable();
        mnt_dec_writers(real_mount(mnt));
        preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        /* Drop the writer count first, then the freeze protection. */
        __mnt_drop_write(mnt);
        sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
 * File-based variant of __mnt_drop_write(): drops the writer count on the
 * mount that @file was opened on (file->f_path.mnt).
 */
void __mnt_drop_write_file(struct file *file)
{
        __mnt_drop_write(file->f_path.mnt);
}

/*
 * File-based variant of mnt_drop_write(): drops the writer count on the
 * file's mount, then ends the superblock write section on the superblock
 * of the file's inode.
 */
void mnt_drop_write_file(struct file *file)
{
        __mnt_drop_write_file(file);
        sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/*
 * Try to flip @mnt to read-only.  Returns 0 after setting MNT_READONLY, or
 * -EBUSY (leaving MNT_READONLY untouched) if mnt_get_writers() reports any
 * writers.  MNT_WRITE_HOLD is set for the duration of the check and cleared
 * again before returning; the whole sequence runs under lock_mount_hash().
 */
static int mnt_make_readonly(struct mount *mnt)
{
        int ret = 0;

        lock_mount_hash();
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        if (mnt_get_writers(mnt) > 0)
                ret = -EBUSY;
        else
                mnt->mnt.mnt_flags |= MNT_READONLY;
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        unlock_mount_hash();
        return ret;
}

/*
 * Clear MNT_READONLY on @mnt under lock_mount_hash().  Always returns 0.
 */
static int __mnt_unmake_readonly(struct mount *mnt)
{
        lock_mount_hash();
        mnt->mnt.mnt_flags &= ~MNT_READONLY;
        unlock_mount_hash();
        return 0;
}

/*
 * Check whether every mount of @sb is free of writers and, if so, set
 * sb->s_readonly_remount.  Returns 0 on success, or -EBUSY if
 * s_remove_count is non-zero or any not-already-read-only mount of @sb has
 * writers.  MNT_WRITE_HOLD is set on the mounts being examined and cleared
 * on all of them before returning.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                /* Mounts that are already read-only are not examined. */
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
                        /* Publish the hold before summing the writer counters. */
                        smp_mb();
                        if (mnt_get_writers(mnt) > 0) {
                                err = -EBUSY;
                                break;
                        }
                }
        }
        /* The recheck promised at the top of the function. */
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err) {
                sb->s_readonly_remount = 1;
                /* Issued before MNT_WRITE_HOLD is cleared in the loop below. */
                smp_wmb();
        }
        /* Release every mount that was put on hold, on success and failure. */
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

/*
 * Release the memory owned by @mnt: its device name, its per-cpu data (SMP
 * builds only) and finally the struct mount itself, which goes back to
 * mnt_cache.
 */
static void free_vfsmnt(struct mount *mnt)
{
        kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
#endif
        kmem_cache_free(mnt_cache, mnt);
}

/*
 * rcu_head callback: recover the struct mount that embeds @head as its
 * mnt_rcu member and free it.
 */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        free_vfsmnt(mnt);
}

/* call under rcu_read_lock */
/*
 * Try to turn a vfsmount pointer obtained under RCU into a counted reference.
 *
 * Returns:
 *   0  - success; if @bastard is non-NULL its count has been incremented
 *   1  - failure and no reference is held (mount_lock changed before the
 *        attempt, or the mount is flagged MNT_SYNC_UMOUNT / MNT_DOOMED)
 *  -1  - failure, but the count was incremented and is still held; the
 *        caller has to mntput() it
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        /* mount_lock already changed since @seq was sampled: nothing taken. */
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();                // see mntput_no_expire() and do_umount()
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        /* The sequence moved; inspect the flags under the mount hash lock. */
        lock_mount_hash();
        if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
                /* Back out the increment directly rather than via mntput(). */
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int status = __legitimize_mnt(bastard, seq);

        /* Zero: the mount (or NULL) can be used as it is. */
        if (likely(status == 0))
                return true;

        /*
         * Negative: a reference is still held and has to be dropped.  The
         * put is done outside the RCU read-side section, which is entered
         * again before returning.
         */
        if (unlikely(status < 0)) {
                rcu_read_unlock();
                mntput(bastard);
                rcu_read_lock();
        }

        return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct mount *child;
        struct vfsmount *m;
        unsigned seq;

        rcu_read_lock();
        /* Retry the lookup until the result can be legitimized. */
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                child = __lookup_mnt(path->mnt, path->dentry);
                if (child)
                        m = &child->mnt;
                else
                        m = NULL;
                if (legitimize_mnt(m, seq))
                        break;
        }
        rcu_read_unlock();

        return m;
}

/* Take the spinlock ns->ns_lock of @ns. */
static inline void lock_ns_list(struct mnt_namespace *ns)
{
        spin_lock(&ns->ns_lock);
}

/* Release the spinlock taken by lock_ns_list(). */
static inline void unlock_ns_list(struct mnt_namespace *ns)
{
        spin_unlock(&ns->ns_lock);
}

/*
 * True if @mnt carries MNT_CURSOR.  The list walkers in this file skip such
 * entries.
 */
static inline bool mnt_is_cursor(struct mount *mnt)
{
        return mnt->mnt.mnt_flags & MNT_CURSOR;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt;
        bool found = false;

        down_read(&namespace_sem);
        lock_ns_list(ns);
        /* Scan the namespace's mount list, ignoring cursor entries. */
        list_for_each_entry(mnt, &ns->list, mnt_list) {
                if (!mnt_is_cursor(mnt) && mnt->mnt_mountpoint == dentry) {
                        found = true;
                        break;
                }
        }
        unlock_ns_list(ns);
        up_read(&namespace_sem);

        return found;
}

/*
 * Find the struct mountpoint hashed for @dentry.  On a hit its m_count is
 * incremented and the entry is returned; NULL if there is none.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *bucket = mp_hash(dentry);
        struct mountpoint *mp;

        hlist_for_each_entry(mp, bucket, m_hash) {
                if (mp->m_dentry != dentry)
                        continue;
                mp->m_count++;
                return mp;
        }

        return NULL;
}

/*
 * Get a counted struct mountpoint for @dentry, creating and hashing a new
 * one if none exists yet.
 *
 * Returns the mountpoint, or an ERR_PTR: -ENOENT if @dentry is already a
 * mountpoint but unlinked, -ENOMEM if allocation fails, or whatever non-zero
 * error d_set_mounted() reports (other than -EBUSY, which causes a retry of
 * the lookup).
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* Allocate at most once; a retry via "mountpoint" reuses it. */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        new = NULL;
done:
        /* Frees an unused preallocation; new is NULL once it has been hashed. */
        kfree(new);
        return mp;
}

/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        struct dentry *dentry;

        /* Other users remain: dropping the count is all there is to do. */
        if (--mp->m_count)
                return;

        dentry = mp->m_dentry;
        BUG_ON(!hlist_empty(&mp->m_list));

        /* The dentry stops being marked as a mountpoint. */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MOUNTED;
        spin_unlock(&dentry->d_lock);

        /* The dentry reference goes onto @list rather than being put here. */
        dput_to_list(dentry, list);
        hlist_del(&mp->m_hash);
        kfree(mp);
}

/* called with namespace_lock and vfsmount lock */
static void put_mountpoint(struct mountpoint *mp)
{
        /* Any dentry released by this put is queued on ex_mountpoints. */
        __put_mountpoint(mp, &ex_mountpoints);
}

/*
 * Non-zero if @mnt belongs to the mount namespace of the current task.
 */
static inline int check_mnt(struct mount *mnt)
{
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns)
                return;

        /* Advance the global event counter and stamp the namespace with it. */
        ns->event = ++event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        /* Nothing to do without a namespace or if it is already up to date. */
        if (!ns || ns->event == event)
                return;

        /* Catch up with the global counter without advancing it. */
        ns->event = event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *mp;
        /* Make the mount its own parent, mounted on its own root. */
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        /* Unlink it from the parent's child list, the hash and the mountpoint's list. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);
        /* Hand the mountpoint to the caller; its count is not dropped here. */
        mp = mnt->mnt_mp;
        mnt->mnt_mp = NULL;
        return mp;
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        /* Detach the mount, then drop the mountpoint it was using. */
        struct mountpoint *mp = unhash_mnt(mnt);

        put_mountpoint(mp);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        /* @child_mnt now uses @mp and holds a reference on its parent @mnt. */
        mp->m_count++;
        mnt_add_count(mnt, 1);        /* essentially, that's mntget */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        /* Link the child into the list of mounts using this mountpoint. */
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/*
 * Insert @mnt into the mount hash, keyed by (@parent, mnt->mnt_mountpoint),
 * and append it to @parent's list of child mounts.
 */
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        struct hlist_head *bucket = m_hash(&parent->mnt, mnt->mnt_mountpoint);

        hlist_add_head_rcu(&mnt->mnt_hash, bucket);
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
                        struct mount *parent,
                        struct mountpoint *mp)
{
        /* Record parent and mountpoint on @mnt before hashing it. */
        mnt_set_mountpoint(parent, mp, mnt);
        __attach_mnt(mnt, parent);
}

/*
 * Move @mnt so that it is mounted on @mp under @parent, then release the
 * mountpoint and the parent reference it held before.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        /* Remember the old attachment; attach_mnt() overwrites both fields. */
        struct mountpoint *old_mp = mnt->mnt_mp;
        struct mount *old_parent = mnt->mnt_parent;

        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp);

        /* The old references are dropped only after the new ones are taken. */
        put_mountpoint(old_mp);
        mnt_add_count(old_parent, -1);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        struct mnt_namespace *n = parent->mnt_ns;

        /* A mount that is its own parent is not attached to anything. */
        BUG_ON(parent == mnt);

        /* Put a temporary head on the mnt_list chain that @mnt is part of. */
        list_add_tail(&head, &mnt->mnt_list);
        /* Every mount on that chain joins the parent's namespace. */
        list_for_each_entry(m, &head, mnt_list)
                m->mnt_ns = n;

        /* Append the chain at the tail of the namespace's mount list. */
        list_splice(&head, n->list.prev);

        /* Fold the pending mount count into the namespace's total. */
        n->mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}

/*
 * Step to the mount that follows @p in a walk of the tree rooted at @root:
 * the first child of @p if it has one, otherwise the next sibling of @p or
 * of its nearest ancestor that has one.  Returns NULL once the walk would
 * leave @root.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
        struct list_head *pos = p->mnt_mounts.next;

        /* @p has children: descend to the first one. */
        if (pos != &p->mnt_mounts)
                return list_entry(pos, struct mount, mnt_child);

        /* Otherwise climb until some level has a next sibling. */
        for (; p != root; p = p->mnt_parent) {
                pos = p->mnt_child.next;
                if (pos != &p->mnt_parent->mnt_mounts)
                        return list_entry(pos, struct mount, mnt_child);
        }

        return NULL;
}

/*
 * Follow the last child at each level, starting from @p, and return the
 * mount reached when there are no more children (@p itself if it has none).
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
        struct list_head *last;

        for (last = p->mnt_mounts.prev;
             last != &p->mnt_mounts;
             last = p->mnt_mounts.prev)
                p = list_entry(last, struct mount, mnt_child);

        return p;
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
 */
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
        const char *devname;
        struct mount *mnt;

        /* No root dentry in the context: nothing to mount. */
        if (!fc->root)
                return ERR_PTR(-EINVAL);

        devname = fc->source ? fc->source : "none";
        mnt = alloc_vfsmnt(devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (fc->sb_flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;

        /* The mount takes an active reference on the superblock. */
        atomic_inc(&fc->root->d_sb->s_active);
        mnt->mnt.mnt_sb = fc->root->d_sb;
        mnt->mnt.mnt_root = dget(fc->root);

        /* Not attached anywhere: own parent, mounted on its own root. */
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;

        /* Add it to the superblock's list of mounts. */
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();

        return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);

/*
 * Get the tree for @fc and create a mount for it.  Returns the new mount,
 * or an ERR_PTR carrying the vfs_get_tree() error.
 */
struct vfsmount *fc_mount(struct fs_context *fc)
{
        int err = vfs_get_tree(fc);

        if (err)
                return ERR_PTR(err);

        /* s_umount is released before the mount is created. */
        up_write(&fc->root->d_sb->s_umount);
        return vfs_create_mount(fc);
}
EXPORT_SYMBOL(fc_mount);

/*
 * Build a filesystem context for @type, feed it @name as the "source"
 * parameter (when given) and @data as monolithic mount data, then mount it.
 * Returns the new mount or an ERR_PTR; the context is always released.
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct fs_context *fc;
        struct vfsmount *mnt;
        int err = 0;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        /* Each step runs only if everything before it succeeded. */
        if (name)
                err = vfs_parse_fs_string(fc, "source",
                                          name, strlen(name));
        if (!err)
                err = parse_monolithic_mount_data(fc, data);

        mnt = err ? ERR_PTR(err) : fc_mount(fc);

        put_fs_context(fc);
        return mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
             const char *name, void *data)
{
        /*
         * Unprivileged mounts with submounts are not supported until it is
         * worked out how to pass the user namespace through from the parent
         * mount to the submount.
         */
        if (mountpoint->d_sb->s_user_ns == &init_user_ns)
                return vfs_kern_mount(type, SB_SUBMOUNT, name, data);

        return ERR_PTR(-EPERM);
}
EXPORT_SYMBOL_GPL(vfs_submount);

/*
 * Create a new mount of @old's superblock with @root as its root dentry.
 * The CL_* bits in @flag select the peer group id, the sharing/slave
 * relationship to @old, and whether the clone joins @old's expiry list.
 * Returns the new mount, or an ERR_PTR (-ENOMEM, or the error from
 * mnt_alloc_group_id()).
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        /* A shared clone without an inherited group id needs a fresh one. */
        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        /* Copy the flags, minus three bits that are never inherited. */
        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        /* Not attached yet: own parent, mounted on its own root. */
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* The clone becomes a slave of @old. */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* The clone mirrors @old's peer and slave relationships. */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        /* Reached only before any superblock or dentry reference was taken. */
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}

/*
 * Tear down @mnt: kill its pins, put the mounts parked on
 * mnt_stuck_children, drop the root dentry and the superblock reference,
 * release the mount id and free the structure through call_rcu().
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *p;
        struct mount *m;
        /*
         * The warning here probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this happens, the
         * filesystem was probably unable to make r/w->r/o transitions.
         * The locking used to deal with mnt_count decrement provides barriers,
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);
        /* Children that mntput_no_expire() parked here are put now. */
        hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        /* The memory itself is released by delayed_free_vfsmnt() via call_rcu(). */
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

/*
 * Callback form of cleanup_mnt(): recover the struct mount that embeds
 * @head as its mnt_rcu member and clean it up.
 */
static void __cleanup_mnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        cleanup_mnt(mnt);
}

/* Mounts queued by mntput_no_expire() for cleanup from the delayed work. */
static LLIST_HEAD(delayed_mntput_list);
/*
 * Work function: take everything off delayed_mntput_list in one go and run
 * cleanup_mnt() on each entry.
 */
static void delayed_mntput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_mntput_list);
        struct mount *m, *t;

        llist_for_each_entry_safe(m, t, node, mnt_llist)
                cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

/*
 * Drop one reference on @mnt.  When the count reaches zero the mount is
 * marked MNT_DOOMED, taken off its superblock's mount list, its remaining
 * children are unhashed and parked on mnt_stuck_children, and cleanup is
 * either queued (task work or delayed work) or, for MNT_INTERNAL mounts,
 * run directly.
 */
static void mntput_no_expire(struct mount *mnt)
{
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* Not the last reference (a negative count is a bug). */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        /* Already marked doomed: the teardown below must not run again. */
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                /* Detach each child; cleanup_mnt() puts them later. */
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        /* Dentries collected by __put_mountpoint() are released unlocked. */
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                /* Non-kthread callers defer the cleanup to task work. */
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /* Otherwise queue it; schedule the work when llist_add() says so. */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        /* MNT_INTERNAL mounts are cleaned up right here. */
        cleanup_mnt(mnt);
}

/*
 * Drop a reference on @mnt (NULL is accepted and ignored), clearing its
 * expiry mark first.
 */
void mntput(struct vfsmount *mnt)
{
        struct mount *m;

        if (!mnt)
                return;

        m = real_mount(mnt);
        /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
        if (unlikely(m->mnt_expiry_mark))
                m->mnt_expiry_mark = 0;
        mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);

/*
 * Take a reference on @mnt and return it.  NULL is passed through
 * untouched.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
        if (!mnt)
                return mnt;

        mnt_add_count(real_mount(mnt), 1);
        return mnt;
}
EXPORT_SYMBOL(mntget);

/* path_is_mountpoint() - Check if path is a mount in the current
 *                          namespace.
 *
 *  d_mountpoint() can only be used reliably to establish if a dentry is
 *  not mounted in any namespace and that common case is handled inline.
 *  d_mountpoint() isn't aware of the possibility there may be multiple
 *  mounts using a given dentry in a different namespace. This function
 *  checks if the passed in path is a mountpoint rather than the dentry
 *  alone.
 */
bool path_is_mountpoint(const struct path *path)
{
        unsigned seq;
        bool res;

        /* A dentry that is not a mountpoint settles the question. */
        if (!d_mountpoint(path->dentry))
                return false;

        rcu_read_lock();
        /* Repeat the check until it ran without mount_lock changing. */
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                res = __path_is_mountpoint(path);
                if (!read_seqretry(&mount_lock, seq))
                        break;
        }
        rcu_read_unlock();

        return res;
}
EXPORT_SYMBOL(path_is_mountpoint);

/*
 * Make a private (CL_PRIVATE) clone of the mount in @path, rooted at the
 * path's dentry, and mark it MNT_INTERNAL.  Returns the clone or the
 * ERR_PTR produced by clone_mnt().
 */
struct vfsmount *mnt_clone_internal(const struct path *path)
{
        struct mount *clone;

        clone = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
        if (IS_ERR(clone))
                return ERR_CAST(clone);

        clone->mnt.mnt_flags |= MNT_INTERNAL;
        return &clone->mnt;
}

#ifdef CONFIG_PROC_FS
/*
 * Return the first non-cursor mount that follows position @p on the mount
 * list of @ns, or NULL if there is none.  The scan runs under the
 * namespace's list lock.
 */
static struct mount *mnt_list_next(struct mnt_namespace *ns,
                                   struct list_head *p)
{
        struct mount *found = NULL;

        lock_ns_list(ns);
        list_for_each_continue(p, &ns->list) {
                struct mount *cand = list_entry(p, struct mount, mnt_list);

                if (mnt_is_cursor(cand))
                        continue;
                found = cand;
                break;
        }
        unlock_ns_list(ns);

        return found;
}

/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct proc_mounts *p = m->private;
        struct list_head *prev;

        /* Held for reading on return (also on the NULL return); m_stop() drops it. */
        down_read(&namespace_sem);
        if (!*pos) {
                /* Position zero: begin at the head of the namespace's list. */
                prev = &p->ns->list;
        } else {
                /* Otherwise continue from the cursor that m_stop() positioned. */
                prev = &p->cursor.mnt_list;

                /* Read after we'd reached the end? */
                if (list_empty(prev))
                        return NULL;
        }

        return mnt_list_next(p->ns, prev);
}

/*
 * seq_file ->next: advance *@pos by one and return the next non-cursor
 * mount after @v, or NULL when the list is exhausted.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct proc_mounts *p = m->private;
        struct mount *cur = v;

        *pos += 1;
        return mnt_list_next(p->ns, &cur->mnt_list);
}

/*
 * seq_file ->stop: record where iteration stopped and release the
 * namespace_sem read lock taken in m_start().
 */
static void m_stop(struct seq_file *m, void *v)
{
        struct proc_mounts *p = m->private;
        struct mount *mnt = v;

        lock_ns_list(p->ns);
        if (mnt)
                /* Park the cursor immediately before the current mount. */
                list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
        else
                /* Nothing current: unlink the cursor so m_start() sees it empty. */
                list_del_init(&p->cursor.mnt_list);
        unlock_ns_list(p->ns);
        up_read(&namespace_sem);
}

/*
 * seq_file ->show: hand the current mount's vfsmount to the show callback
 * stored in the proc_mounts private data and return its result.
 */
static int m_show(struct seq_file *m, void *v)
{
        struct proc_mounts *p = m->private;
        struct mount *r = v;
        return p->show(m, &r->mnt);
}

/* seq_file operations table wiring up the mount-list iterator above. */
const struct seq_operations mounts_op = {
        .start        = m_start,
        .next        = m_next,
        .stop        = m_stop,
        .show        = m_show,
};

/*
 * Unlink @cursor from the mount list of @ns, holding namespace_sem for
 * reading and the namespace's list lock.
 */
void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
{
        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_del(&cursor->mnt_list);
        unlock_ns_list(ns);
        up_read(&namespace_sem);
}
#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * Return: 1 if the reference counts summed over the whole tree do not
 * exceed two per mount, 0 if they do (i.e. the tree is busy).
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *mnt;
        int actual_refs = 0;
        int minimum_refs = 0;
        struct mount *p;

        /* validate the argument before deriving anything from it */
        BUG_ON(!m);
        mnt = real_mount(m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
        }
        unlock_mount_hash();

        if (actual_refs > minimum_refs)
                return 0;

        return 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 *
 * Return: 1 if the mount is not busy, 0 if it is.
 */
int may_umount(struct vfsmount *mnt)
{
        int busy;

        down_read(&namespace_sem);
        lock_mount_hash();
        busy = propagate_mount_busy(real_mount(mnt), 2);
        unlock_mount_hash();
        up_read(&namespace_sem);

        return busy ? 0 : 1;
}

EXPORT_SYMBOL(may_umount);

/*
 * Release namespace_sem (held for write) and finish the work that was
 * queued while it was held: the dentries collected on ex_mountpoints are
 * shrunk, and every mount queued on the unmounted list has a reference
 * dropped with mntput().
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* steal both global queues onto local heads before unlocking */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        up_write(&namespace_sem);

        shrink_dentry_list(&list);

        /* common case: nothing was unmounted under the lock */
        if (likely(hlist_empty(&head)))
                return;

        /* wait for an RCU grace period before dropping the mounts */
        synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}

/* Take namespace_sem for writing; namespace_unlock() calls up_write() on it. */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}

/* Bit flags controlling how umount_tree() tears a tree down. */
enum umount_tree_flags {
        UMOUNT_SYNC      = 1 << 0,      /* mark mounts MNT_SYNC_UMOUNT, always disconnect */
        UMOUNT_PROPAGATE = 1 << 1,      /* propagate the umount */
        UMOUNT_CONNECTED = 1 << 2,      /* request that mounts remain connected */
};

/*
 * Decide whether @mnt must be disconnected from its parent while it is
 * being unmounted according to @how.  Returns true to disconnect, false
 * to leave it attached.
 */
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
        /*
         * Leaving mounts connected is only valid for lazy umounts, and a
         * mount without a parent has nothing to be connected to.
         */
        if ((how & UMOUNT_SYNC) || !mnt_has_parent(mnt))
                return true;

        /*
         * Because the reference counting rules change when mounts are
         * unmounted and connected, umounted mounts may not be connected
         * to mounted mounts.
         */
        if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
                return true;

        /*
         * Stay connected if that was requested or if the mount is locked;
         * by default disconnect.
         */
        return !((how & UMOUNT_CONNECTED) || IS_MNT_LOCKED(mnt));
}

/*
 * umount_tree - take the tree rooted at @mnt out of its namespace
 * @mnt: root of the tree to unmount
 * @how: UMOUNT_* flags
 *
 * Every mount in the tree is flagged MNT_UMOUNT, removed from its
 * namespace's list and made private.  Mounts that get disconnected are
 * queued on the unmounted list.
 *
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        /* Process the gathered mounts one at a time until none are left */
        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                ns = p->mnt_ns;
                if (ns) {
                        /* one mount fewer in that namespace */
                        ns->mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                /* namespace_unlock() calls mntput() on everything queued here */
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);
        }
}

static void shrink_submounts(struct mount *mnt);

/*
 * "Unmounting" the root: reconfigure @sb read-only unless it already is.
 * Runs with sb->s_umount held for write.  Returns 0 or a negative errno.
 */
static int do_umount_root(struct super_block *sb)
{
        struct fs_context *fc;
        int err = 0;

        down_write(&sb->s_umount);
        if (sb_rdonly(sb))
                goto unlock;

        fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
        if (IS_ERR(fc)) {
                err = PTR_ERR(fc);
                goto unlock;
        }

        err = parse_monolithic_mount_data(fc, NULL);
        if (!err)
                err = reconfigure_super(fc);
        put_fs_context(fc);
unlock:
        up_write(&sb->s_umount);
        return err;
}

/*
 * do_umount - unmount @mnt according to @flags
 * @mnt:   mount to unmount
 * @flags: MNT_EXPIRE, MNT_FORCE and/or MNT_DETACH
 *
 * Returns 0 on success or a negative errno: whatever security_sb_umount()
 * refuses with, -EINVAL for bad flag combinations or a locked mount,
 * -EBUSY when the mount is in use, -EAGAIN when MNT_EXPIRE has only just
 * set the expiry mark, -EPERM when the caller may not remount the root
 * read-only.
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* first request only sets the mark; a later one unmounts */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee thats tricky lets do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. Thats for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /* lazy umount: never reports busy */
                if (!list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                smp_mb(); // paired with __legitimize_mnt()
                shrink_submounts(mnt);
                retval = -EBUSY;
                /* only proceed if the mount is not busy */
                if (!propagate_mount_busy(mnt, 2)) {
                        if (!list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * __detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;

        namespace_lock();
        lock_mount_hash();

        mp = lookup_mountpoint(dentry);
        if (mp) {
                event++;
                while (!hlist_empty(&mp->m_list)) {
                        struct mount *victim;

                        victim = hlist_entry(mp->m_list.first, struct mount,
                                             mnt_mp_list);
                        if (!(victim->mnt.mnt_flags & MNT_UMOUNT)) {
                                umount_tree(victim, UMOUNT_CONNECTED);
                                continue;
                        }
                        /* already being unmounted: just detach and queue it */
                        umount_mnt(victim);
                        hlist_add_head(&victim->mnt_umount, &unmounted);
                }
                put_mountpoint(mp);
        }

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Is the caller allowed to modify his namespace?
 *
 * True when the caller has CAP_SYS_ADMIN in the user namespace that owns
 * its current mount namespace.
 */
static inline bool may_mount(void)
{
        struct user_namespace *owner = current->nsproxy->mnt_ns->user_ns;

        return ns_capable(owner, CAP_SYS_ADMIN);
}

/*
 * may_mandlock - may the caller use the "mand" mount option?
 *
 * With CONFIG_MANDATORY_FILE_LOCKING a deprecation banner is printed once
 * and CAP_SYS_ADMIN is required.  Without it the option is always refused
 * and a warning is logged.
 */
#ifdef        CONFIG_MANDATORY_FILE_LOCKING
static bool may_mandlock(void)
{
        pr_warn_once("======================================================\n"
                     "WARNING: the mand mount option is being deprecated and\n"
                     "         will be removed in v5.15!\n"
                     "======================================================\n");
        return capable(CAP_SYS_ADMIN);
}
#else
static inline bool may_mandlock(void)
{
        /* terminate the message so it is emitted as a complete log line */
        pr_warn("VFS: \"mand\" mount option not supported\n");
        return false;
}
#endif

/*
 * Permission and sanity checks for umount of @path with @flags.
 * Returns 0 if the umount may proceed, -EPERM or -EINVAL otherwise.
 */
static int can_umount(const struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->dentry->d_sb;

        if (!may_mount())
                return -EPERM;

        /*
         * Must name the root of a mount that passes check_mnt() and is
         * not locked (the MNT_LOCKED test here is only optimistic).
         */
        if (path->dentry != path->mnt->mnt_root ||
            !check_mnt(mnt) ||
            (mnt->mnt.mnt_flags & MNT_LOCKED))
                return -EINVAL;

        /* forcing needs CAP_SYS_ADMIN over the superblock's user namespace */
        if ((flags & MNT_FORCE) && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

/*
 * Unmount what @path refers to and drop the references held in @path.
 * The caller is responsible for @flags being sane.
 * Returns 0 or a negative errno from can_umount()/do_umount().
 */
int path_umount(struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        int err = can_umount(path, flags);

        if (err == 0)
                err = do_umount(mnt, flags);

        /* we mustn't call path_put() as that would clear mnt_expiry_mark */
        dput(path->dentry);
        mntput_no_expire(mnt);

        return err;
}

/*
 * Common implementation of the umount syscalls: validate @flags, look up
 * the user-supplied @name and unmount it via path_umount().
 */
static int ksys_umount(char __user *name, int flags)
{
        const int valid = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
        struct path path;
        int lookup_flags;
        int err;

        // basic validity checks done first
        if (flags & ~valid)
                return -EINVAL;

        /* follow a trailing symlink unless told not to */
        lookup_flags = LOOKUP_MOUNTPOINT |
                       ((flags & UMOUNT_NOFOLLOW) ? 0 : LOOKUP_FOLLOW);

        err = user_path_at(AT_FDCWD, name, lookup_flags, &path);
        if (err)
                return err;

        return path_umount(&path, flags);
}

/* umount syscall entry point; all of the work is done by ksys_umount(). */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags: this is ksys_umount() called
 * with flags == 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        return ksys_umount(name, 0);
}

#endif

static bool is_mnt_ns_file(struct dentry *dentry)
{
        /* Is this a proxy for a mount namespace? */
        return dentry->d_op == &ns_dentry_operations &&
               dentry->d_fsdata == &mntns_operations;
}

/* Map an embedded ns_common back to the mnt_namespace that contains it. */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}

/* Return the ns_common embedded in the mount namespace @mnt. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}

/*
 * Could bind mounting the mount namespace inode cause a mount namespace
 * loop?  True when @dentry is a mount namespace file whose namespace has
 * a sequence number not greater than that of the caller's own namespace.
 */
static bool mnt_ns_loop(struct dentry *dentry)
{
        struct mnt_namespace *target;
        struct mnt_namespace *ours;

        if (!is_mnt_ns_file(dentry))
                return false;

        target = to_mnt_ns(get_proc_ns(dentry->d_inode));
        ours = current->nsproxy->mnt_ns;

        return ours->seq >= target->seq;
}

/*
 * copy_tree - clone the mount tree rooted at @mnt, as seen from @dentry
 * @mnt:    root of the tree to copy
 * @dentry: dentry within @mnt; only child mounts whose mountpoint is a
 *          subdirectory of it are copied
 * @flag:   CL_* flags passed on to clone_mnt()
 *
 * Unbindable mounts and mount namespace files are refused at the root and
 * skipped further down unless CL_COPY_UNBINDABLE / CL_COPY_MNT_NS_FILE is
 * set; an unbindable mount that is also locked fails the copy with -EPERM.
 *
 * Returns the root of the copy or an ERR_PTR().  On failure the partial
 * copy is torn down with umount_tree().
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *p, *q, *r, *parent;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        res = q = clone_mnt(mnt, dentry, flag);
        if (IS_ERR(q))
                return q;

        q->mnt_mountpoint = mnt->mnt_mountpoint;

        /* p walks the source tree, q the matching position in the copy */
        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                struct mount *s;
                if (!is_subdir(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                if (s->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        q = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        s = skip_mnt_tree(s);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(s->mnt.mnt_root)) {
                                s = skip_mnt_tree(s);
                                continue;
                        }
                        /* climb both trees in step until p is s's parent */
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        parent = q;
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (IS_ERR(q))
                                goto out;
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, parent, p->mnt_mp);
                        unlock_mount_hash();
                }
        }
        return res;
out:
        /* q holds the error to return; discard what was copied so far */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return q;
}

/*
 * Make a private copy (CL_COPY_ALL | CL_PRIVATE) of the mount tree at
 * @path.  Fails with -EINVAL if the mount does not pass check_mnt().
 *
 * Caller should check returned pointer for errors
 */
struct vfsmount *collect_mounts(const struct path *path)
{
        struct mount *tree = ERR_PTR(-EINVAL);
        struct mount *root;

        namespace_lock();
        root = real_mount(path->mnt);
        if (check_mnt(root))
                tree = copy_tree(root, path->dentry,
                                 CL_COPY_ALL | CL_PRIVATE);
        namespace_unlock();

        if (IS_ERR(tree))
                return ERR_CAST(tree);

        return &tree->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

/*
 * If @mnt still belongs to an anonymous namespace, unmount its tree and
 * free that namespace.  Mounts with no namespace, or in a non-anonymous
 * one, are left alone.
 */
void dissolve_on_fput(struct vfsmount *mnt)
{
        struct mnt_namespace *anon;

        namespace_lock();
        lock_mount_hash();

        anon = real_mount(mnt)->mnt_ns;
        if (anon && !is_anon_ns(anon))
                anon = NULL;
        if (anon)
                umount_tree(real_mount(mnt), UMOUNT_CONNECTED);

        unlock_mount_hash();
        namespace_unlock();

        /* the namespace itself is freed only after the locks are dropped */
        if (anon)
                free_mnt_ns(anon);
}

/* Unmount the tree rooted at @mnt, passing no UMOUNT_* flags. */
void drop_collected_mounts(struct vfsmount *mnt)
{
        struct mount *tree;

        namespace_lock();
        lock_mount_hash();
        tree = real_mount(mnt);
        umount_tree(tree, 0);
        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Does @mnt have a direct child mount that is MNT_LOCKED and whose
 * mountpoint is a subdirectory of @dentry?
 */
static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
        struct mount *kid;

        list_for_each_entry(kid, &mnt->mnt_mounts, mnt_child) {
                if (is_subdir(kid->mnt_mountpoint, dentry) &&
                    (kid->mnt.mnt_flags & MNT_LOCKED))
                        return true;
        }

        return false;
}

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * This creates a new vfsmount, which will be the clone of @path.  The new will
 * not be attached anywhere in the namespace and will be private (i.e. changes
 * to the originating mount won't be propagated into this).
 *
 * Release with mntput().
 *
 * Return: the new vfsmount, or an ERR_PTR(): -EINVAL if the source mount is
 * unbindable, fails check_mnt() or has locked children under @path;
 * otherwise whatever error clone_mnt() reports.
 */
struct vfsmount *clone_private_mount(const struct path *path)
{
        struct mount *old_mnt = real_mount(path->mnt);
        struct mount *new_mnt = ERR_PTR(-EINVAL);

        down_read(&namespace_sem);
        if (!IS_MNT_UNBINDABLE(old_mnt) &&
            check_mnt(old_mnt) &&
            !has_locked_children(old_mnt, path->dentry))
                new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
        up_read(&namespace_sem);

        if (IS_ERR(new_mnt))
                return ERR_CAST(new_mnt);

        /* Longterm mount to be removed by kern_unmount*() */
        new_mnt->mnt_ns = MNT_NS_INTERNAL;

        return &new_mnt->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);

/*
 * Call @f(mount, @arg) on @root and then on every mount linked on @root's
 * mnt_list.  Stops at, and returns, the first non-zero result; returns 0
 * if every call returned 0.
 */
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
                   struct vfsmount *root)
{
        struct mount *pos;
        int ret;

        ret = f(root, arg);
        if (ret)
                return ret;

        list_for_each_entry(pos, &real_mount(root)->mnt_list, mnt_list) {
                ret = f(&pos->mnt, arg);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Lock down every mount in the tree rooted at @mnt: atime and whichever of
 * the readonly/nodev/nosuid/noexec settings are in effect become locked,
 * and mounts not on an expiry list are marked MNT_LOCKED.
 */
static void lock_mnt_tree(struct mount *mnt)
{
        struct mount *m = mnt;

        while (m) {
                /* Don't allow unprivileged users to change mount flags */
                int locked = m->mnt.mnt_flags | MNT_LOCK_ATIME;

                if (locked & MNT_READONLY)
                        locked |= MNT_LOCK_READONLY;
                if (locked & MNT_NODEV)
                        locked |= MNT_LOCK_NODEV;
                if (locked & MNT_NOSUID)
                        locked |= MNT_LOCK_NOSUID;
                if (locked & MNT_NOEXEC)
                        locked |= MNT_LOCK_NOEXEC;

                /* Don't allow unprivileged users to reveal what is under a mount */
                if (list_empty(&m->mnt_expire))
                        locked |= MNT_LOCKED;

                m->mnt.mnt_flags = locked;
                m = next_mnt(m, mnt);
        }
}

/*
 * Walk the tree rooted at @mnt up to (not including) @end and release the
 * group id of every mount that has one but is not shared.  Passing a NULL
 * @end covers the whole tree.
 */
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
        struct mount *cur = mnt;

        while (cur != end) {
                if (cur->mnt_group_id && !IS_MNT_SHARED(cur))
                        mnt_release_group_id(cur);
                cur = next_mnt(cur, mnt);
        }
}

/*
 * Allocate a group id for @mnt (and, if @recurse, for every mount below
 * it) wherever one is missing and the mount is not already shared.  On
 * failure the ids handed out so far are released and the error returned.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
        struct mount *cur;

        for (cur = mnt; cur; cur = recurse ? next_mnt(cur, mnt) : NULL) {
                int err;

                if (cur->mnt_group_id || IS_MNT_SHARED(cur))
                        continue;

                err = mnt_alloc_group_id(cur);
                if (err) {
                        cleanup_group_ids(mnt, cur);
                        return err;
                }
        }

        return 0;
}

/*
 * Reserve room in @ns for every mount in the tree rooted at @mnt.
 * Returns -ENOSPC if adding them to the namespace's current and pending
 * mounts would exceed sysctl_mount_max (or the arithmetic would wrap);
 * otherwise adds them to ns->pending_mounts and returns 0.
 */
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
        unsigned int limit = READ_ONCE(sysctl_mount_max);
        unsigned int incoming = 0;
        unsigned int have, queued, total;
        struct mount *cur;

        for (cur = mnt; cur; cur = next_mnt(cur, mnt))
                incoming++;

        have = ns->mounts;
        queued = ns->pending_mounts;
        total = have + queued;

        /* did have + queued wrap around? */
        if (total < have || total < queued)
                return -ENOSPC;

        /* already over the limit, or not enough headroom left */
        if (total > limit || incoming > limit - total)
                return -ENOSPC;

        ns->pending_mounts = queued + incoming;
        return 0;
}

/*
 *  @source_mnt : mount tree to be attached
 *  @dest_mnt   : mount that @source_mnt is attached to
 *  @dest_mp    : mountpoint on @dest_mnt where @source_mnt is attached
 *  @moving     : true if @source_mnt is being moved, i.e. it is detached
 *                     from its current parent before being attached
 *
 *  NOTE: the tables below explain the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *          tree of the destination mount and the cloned mount is added to
 *          the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *          source mount.
 *
 * ---------------------------------------------------------------------------
 * |                         MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 *         all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 *         all the mounts belonging to the destination mount's propagation tree.
 *         the mount is marked as 'shared and slave'.
 * (*)        the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
                        struct mount *dest_mnt,
                        struct mountpoint *dest_mp,
                        bool moving)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = dest_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *p;
        struct hlist_node *n;
        int err;

        /* Preallocate a mountpoint in case the new mounts need
         * to be tucked under other mounts.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        if (IS_MNT_SHARED(dest_mnt)) {
                /* shared destination: the source tree needs group ids */
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                /* propagated copies are collected on tree_list */
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
                /* taken before the error check: out_cleanup_ids runs under it */
                lock_mount_hash();
                if (err)
                        goto out_cleanup_ids;
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        } else {
                lock_mount_hash();
        }
        if (moving) {
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, dest_mnt, dest_mp);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        /* move from anon - the caller will destroy */
                        list_del_init(&source_mnt->mnt_ns->list);
                }
                mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /* now commit each of the propagated copies */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                /*
                 * Something already mounted at the same place is moved on
                 * top of the new child, using the preallocated smp.
                 */
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* propagation failed: tear down whatever copies were created */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        /* reset the pending_mounts count of the destination namespace */
        ns->pending_mounts = 0;

        /* mount_lock is not held on this path, so take it for the put */
        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}

/*
 * lock_mount - lock the place where something is about to be mounted
 * @path: where to mount; updated to the topmost mount if already covered
 *
 * Takes the inode lock of the target dentry and namespace_sem for write.
 * If something is already mounted on @path, both are dropped, @path is
 * switched to the root of that mount and the whole thing is retried.
 *
 * Returns the mountpoint with both locks held, or an ERR_PTR() with
 * neither held: -ENOENT if the dentry can't be mounted on, or the error
 * from get_mountpoint().
 */
static struct mountpoint *lock_mount(struct path *path)
{
        struct dentry *dentry = path->dentry;

        for (;;) {
                struct vfsmount *covering;
                struct mountpoint *mp;

                inode_lock(dentry->d_inode);
                if (unlikely(cant_mount(dentry))) {
                        inode_unlock(dentry->d_inode);
                        return ERR_PTR(-ENOENT);
                }

                namespace_lock();
                covering = lookup_mnt(path);
                if (likely(!covering)) {
                        mp = get_mountpoint(dentry);
                        if (IS_ERR(mp)) {
                                namespace_unlock();
                                inode_unlock(dentry->d_inode);
                        }
                        return mp;
                }

                /* something is mounted here: step onto it and try again */
                namespace_unlock();
                inode_unlock(path->dentry->d_inode);
                path_put(path);
                path->mnt = covering;
                dentry = path->dentry = dget(covering->mnt_root);
        }
}

/*
 * Undo lock_mount(): drop the mountpoint reference, then release
 * namespace_sem and the inode lock of the mountpoint's dentry.
 */
static void unlock_mount(struct mountpoint *where)
{
        /*
         * Fetched up front; @where is not touched after put_mountpoint()
         * (presumably because the put may free it - confirm).
         */
        struct dentry *dentry = where->m_dentry;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
        inode_unlock(dentry->d_inode);
}

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
        if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
                return -EINVAL;

        if (d_is_dir(mp->m_dentry) !=
              d_is_dir(mnt->mnt.mnt_root))
                return -ENOTDIR;

        return attach_recursive_mnt(mnt, p, mp, false);
}

/*
 * May the caller change the propagation type of @m?
 * Returns -EINVAL if @m is not mounted in any namespace, -EPERM if the
 * caller lacks CAP_SYS_ADMIN in the user namespace of that mount
 * namespace, and 0 otherwise.
 */
static int may_change_propagation(const struct mount *m)
{
        struct mnt_namespace *ns = m->mnt_ns;

        /* it must be mounted in some namespace -- is_mounted() */
        if (IS_ERR_OR_NULL(ns))
                return -EINVAL;

        /* and the caller must be admin in userns of that namespace */
        return ns_capable(ns->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/*
 * Sanity check the flags to change_mnt_propagation.
 *
 * Strips MS_REC and MS_SILENT from @ms_flags and returns what is left if
 * it is exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE or MS_UNBINDABLE;
 * returns 0 for anything else.
 */

static int flags_to_propagation_type(int ms_flags)
{
        const int prop_mask = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
        int type = ms_flags & ~(MS_REC | MS_SILENT);

        /*
         * Fail if any non-propagation flags are set, or if not exactly
         * one propagation flag is set.
         */
        if ((type & ~prop_mask) || !is_power_of_2(type))
                return 0;

        return type;
}

/*
 * Change the propagation type of the mount at @path, recursively over its
 * whole tree when MS_REC is set in @ms_flags.
 *
 * Returns 0 on success; -EINVAL if @path is not the root of its mount or
 * the flags do not name exactly one propagation type; otherwise the error
 * from may_change_propagation() or invent_group_ids().
 */
static int do_change_type(struct path *path, int ms_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        const int recurse = ms_flags & MS_REC;
        struct mount *m;
        int type;
        int err;

        if (path->dentry != path->mnt->mnt_root)
                return -EINVAL;

        type = flags_to_propagation_type(ms_flags);
        if (!type)
                return -EINVAL;

        namespace_lock();

        err = may_change_propagation(mnt);
        /* making mounts shared requires group ids to exist first */
        if (!err && type == MS_SHARED)
                err = invent_group_ids(mnt, recurse);

        if (!err) {
                lock_mount_hash();
                m = mnt;
                while (m) {
                        change_mnt_propagation(m, type);
                        m = recurse ? next_mnt(m, mnt) : NULL;
                }
                unlock_mount_hash();
        }

        namespace_unlock();
        return err;
}

static struct mount *__do_loopback(struct path *old_path, int recurse)
{
        struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);

        if (IS_MNT_UNBINDABLE(old))
                return mnt;

        if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
                return mnt;

        if (!recurse && has_locked_children(old, old_path->dentry))
                return mnt;

        if (recurse)
                mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
        else
                mnt = clone_mnt(old, old_path->dentry, 0);

        if (!IS_ERR(mnt))
                mnt->mnt.mnt_flags &= ~MNT_LOCKED;

        return mnt;
}

/*
 * do loopback mount.
 *
 * Bind the object named by @old_name onto @path; when @recurse is non-zero
 * the whole subtree is copied (see __do_loopback()).
 *
 * Returns 0 on success or a negative errno: -EINVAL for an empty name, a
 * mnt_ns_loop() hit or a destination failing check_mnt(), otherwise the
 * error from kern_path(), lock_mount(), __do_loopback() or graft_tree().
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mount *mnt = NULL, *parent;
        struct mountpoint *mp;
        int err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        /* Default error for the checks below that jump out without setting one */
        err = -EINVAL;
        if (mnt_ns_loop(old_path.dentry))
                goto out;

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out;
        }

        parent = real_mount(path->mnt);
        if (!check_mnt(parent))
                goto out2;

        mnt = __do_loopback(&old_path, recurse);
        if (IS_ERR(mnt)) {
                err = PTR_ERR(mnt);
                goto out2;
        }

        err = graft_tree(mnt, parent, mp);
        if (err) {
                /* Attaching failed: tear down the copy that was just made */
                lock_mount_hash();
                umount_tree(mnt, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out2:
        unlock_mount(mp);
out:
        /* Drop the reference taken by kern_path() */
        path_put(&old_path);
        return err;
}

/*
 * Copy the mount at @path (the whole subtree when @recursive) into a newly
 * allocated anonymous mount namespace and open the copy as an O_PATH file.
 *
 * The namespace is allocated in the user namespace of the caller's current
 * mount namespace.  On success @path->mnt is switched to the copy and the
 * returned file has FMODE_NEED_UNMOUNT set.  Returns an ERR_PTR() if the
 * namespace allocation, __do_loopback() or dentry_open() fails.
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
        struct mount *mnt, *p;
        struct file *file;

        if (IS_ERR(ns))
                return ERR_CAST(ns);

        namespace_lock();
        mnt = __do_loopback(path, recursive);
        if (IS_ERR(mnt)) {
                namespace_unlock();
                free_mnt_ns(ns);
                return ERR_CAST(mnt);
        }

        lock_mount_hash();
        /* Hand every mount of the copy to the new namespace and count it */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt_ns = ns;
                ns->mounts++;
        }
        ns->root = mnt;
        list_add_tail(&ns->list, &mnt->mnt_list);
        /* NOTE(review): presumably the reference that @path holds below - confirm */
        mntget(&mnt->mnt);
        unlock_mount_hash();
        namespace_unlock();

        /* Swap the caller's mount reference for one on the copy */
        mntput(path->mnt);
        path->mnt = &mnt->mnt;
        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file))
                dissolve_on_fput(path->mnt);
        else
                file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}

/*
 * open_tree(2): open the object at @dfd/@filename as an O_PATH file
 * descriptor.  With OPEN_TREE_CLONE a detached copy of the mount (of the
 * whole subtree with AT_RECURSIVE) is opened instead; this requires
 * may_mount().
 *
 * Returns the new file descriptor, or -EINVAL for unknown flags or for
 * AT_RECURSIVE without OPEN_TREE_CLONE, -EPERM when cloning is not
 * permitted, or the error from fd allocation, path lookup or the open.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
        bool detached = flags & OPEN_TREE_CLONE;
        int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
        struct path path;
        struct file *file;
        int error;
        int fd;

        BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

        if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
                      OPEN_TREE_CLOEXEC))
                return -EINVAL;

        /* AT_RECURSIVE is only accepted together with OPEN_TREE_CLONE */
        if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
                return -EINVAL;

        /* Translate the AT_* flags into path lookup flags */
        if (flags & AT_NO_AUTOMOUNT)
                lookup_flags &= ~LOOKUP_AUTOMOUNT;
        if (flags & AT_SYMLINK_NOFOLLOW)
                lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (detached && !may_mount())
                return -EPERM;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (unlikely(error)) {
                put_unused_fd(fd);
                return error;
        }

        if (detached)
                file = open_detached_copy(&path, flags & AT_RECURSIVE);
        else
                file = dentry_open(&path, O_PATH, current_cred());
        path_put(&path);

        if (IS_ERR(file)) {
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        fd_install(fd, file);
        return fd;
}

/*
 * Don't allow locked mount flags to be cleared.
 *
 * Returns false if @mnt_flags would drop a restriction (read-only, nodev,
 * nosuid, noexec) whose MNT_LOCK_* bit is set on @mnt, or would change the
 * atime bits while MNT_LOCK_ATIME is set; true otherwise.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int cur = mnt->mnt.mnt_flags;
        bool drops_readonly, drops_nodev, drops_nosuid, drops_noexec;
        bool changes_atime;

        drops_readonly = (cur & MNT_LOCK_READONLY) &&
                         !(mnt_flags & MNT_READONLY);
        drops_nodev = (cur & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV);
        drops_nosuid = (cur & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID);
        drops_noexec = (cur & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC);
        changes_atime = (cur & MNT_LOCK_ATIME) &&
                        (cur & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK);

        return !(drops_readonly || drops_nodev || drops_nosuid ||
                 drops_noexec || changes_atime);
}

/*
 * Bring the read-only state of @mnt in line with MNT_READONLY in @mnt_flags.
 *
 * Nothing is done (and 0 is returned) when the state reported by
 * __mnt_is_readonly() already matches the request; otherwise the result of
 * mnt_make_readonly() or __mnt_unmake_readonly() is returned.
 */
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
        bool want_readonly = (mnt_flags & MNT_READONLY);

        if (want_readonly == __mnt_is_readonly(&mnt->mnt))
                return 0;

        if (!want_readonly)
                return __mnt_unmake_readonly(mnt);

        return mnt_make_readonly(mnt);
}

/*
 * Update the user-settable attributes on a mount.  The caller must hold
 * sb->s_umount for writing.
 *
 * The bits of @mnt outside MNT_USER_SETTABLE_MASK are preserved and merged
 * with @mnt_flags; the namespace of @mnt is then touched.  Everything
 * happens under the mount hash lock.
 */
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
        unsigned int preserved;

        lock_mount_hash();
        preserved = mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
        mnt->mnt.mnt_flags = mnt_flags | preserved;
        touch_mnt_namespace(mnt->mnt_ns);
        unlock_mount_hash();
}

/*
 * Warn once per superblock when a writable mount sits on a filesystem whose
 * maximum timestamp (sb->s_time_max) is less than TIME_UPTIME_SEC_MAX
 * seconds away from the current real time.
 *
 * @mountpoint is only used to print the path; if no page can be allocated
 * or d_path() fails, "(unknown)" is printed instead.
 */
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;
        char *buf, *mntpath;

        if (__mnt_is_readonly(mnt))
                return;
        /* Already warned for this superblock */
        if (sb->s_iflags & SB_I_TS_EXPIRY_WARNED)
                return;
        if (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX <= sb->s_time_max)
                return;

        buf = (char *)__get_free_page(GFP_KERNEL);
        mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
        if (IS_ERR(mntpath))
                mntpath = "(unknown)";

        pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
                sb->s_type->name,
                is_mounted(mnt) ? "remounted" : "mounted",
                mntpath, &sb->s_time_max,
                (unsigned long long)sb->s_time_max);

        sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
        if (buf)
                free_page((unsigned long)buf);
}

/*
 * Handle reconfiguration of the mountpoint only without alteration of the
 * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
 * to mount(2).
 *
 * Returns -EINVAL if the mount fails check_mnt() or @path is not the root
 * of its mount, -EPERM if a locked flag would be cleared, and otherwise the
 * result of change_mount_ro_state().
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        int ret;

        if (!check_mnt(mnt) || path->dentry != mnt->mnt.mnt_root)
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        down_write(&sb->s_umount);
        ret = change_mount_ro_state(mnt, mnt_flags);
        /* The attributes are only updated once the ro transition succeeded */
        if (!ret)
                set_mount_attributes(mnt, mnt_flags);
        up_write(&sb->s_umount);

        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        return ret;
}

/*
 * Change filesystem flags.  @path must be the root of its mount; if you've
 * mounted a non-root directory somewhere and want to remount through it -
 * tough luck.
 *
 * @sb_flags and @data are fed to the filesystem through a reconfigure
 * fs_context; @mnt_flags are applied to the mount once reconfigure_super()
 * succeeded.  @ms_flags is not used by this function.
 *
 * Returns 0 or a negative errno (-EINVAL, -EPERM, or the error from context
 * creation, option parsing or reconfigure_super()).
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
                      int mnt_flags, void *data)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        struct fs_context *fc;
        int err;

        if (!check_mnt(mnt) || path->dentry != path->mnt->mnt_root)
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Indicate to the filesystem that the remount request is coming
         * from the legacy mount system call.
         */
        fc->oldapi = true;

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        down_write(&sb->s_umount);
        if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                err = reconfigure_super(fc);
                if (!err)
                        set_mount_attributes(mnt, mnt_flags);
        } else {
                err = -EPERM;
        }
        up_write(&sb->s_umount);

out:
        /* The warning check runs whether or not the remount succeeded */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        put_fs_context(fc);
        return err;
}

/*
 * Return 1 if @mnt or any mount reached from it via next_mnt() is
 * unbindable, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                if (IS_MNT_UNBINDABLE(cur))
                        return 1;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree.  Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 *
 * Returns true when no mount of @subtree trips mnt_ns_loop(), false as soon
 * as one does.  The walk is done under the mount hash lock.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
        struct mount *p;
        bool clean = true;

        lock_mount_hash();
        for (p = subtree; p; p = next_mnt(p, subtree)) {
                if (mnt_ns_loop(p->mnt.mnt_root)) {
                        clean = false;
                        break;
                }
        }
        unlock_mount_hash();

        return clean;
}

/*
 * Make the mount at @to_path a member of the same propagation group(s) as
 * the mount at @from_path.
 *
 * If the source is a slave, the target is added to the slave list of the
 * source's master; if the source is shared, the target takes over its group
 * id and is linked into its share list.  All checks and changes are done
 * under the namespace lock.
 *
 * Returns 0 on success, the error from may_change_propagation() for either
 * mount, or -EINVAL if any of the checks below fails.
 */
static int do_set_group(struct path *from_path, struct path *to_path)
{
        struct mount *from, *to;
        int err;

        from = real_mount(from_path->mnt);
        to = real_mount(to_path->mnt);

        namespace_lock();

        /* The caller must be allowed to change propagation on both mounts */
        err = may_change_propagation(from);
        if (err)
                goto out;
        err = may_change_propagation(to);
        if (err)
                goto out;

        /* Every check from here on fails with -EINVAL */
        err = -EINVAL;
        /* To and From paths should be mount roots */
        if (from_path->dentry != from_path->mnt->mnt_root)
                goto out;
        if (to_path->dentry != to_path->mnt->mnt_root)
                goto out;

        /* Setting sharing groups is only allowed across same superblock */
        if (from->mnt.mnt_sb != to->mnt.mnt_sb)
                goto out;

        /* From mount root should be wider than To mount root */
        if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
                goto out;

        /* From mount should not have locked children in place of To's root */
        if (has_locked_children(from, to->mnt.mnt_root))
                goto out;

        /* Setting sharing groups is only allowed on private mounts */
        if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
                goto out;

        /* From should not be private */
        if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
                goto out;

        if (IS_MNT_SLAVE(from)) {
                /* Become a slave of the same master as From */
                struct mount *m = from->mnt_master;

                list_add(&to->mnt_slave, &m->mnt_slave_list);
                to->mnt_master = m;
        }

        if (IS_MNT_SHARED(from)) {
                /* Join From's peer group */
                to->mnt_group_id = from->mnt_group_id;
                list_add(&to->mnt_share, &from->mnt_share);
                lock_mount_hash();
                set_mnt_shared(to);
                unlock_mount_hash();
        }

        err = 0;
out:
        namespace_unlock();
        return err;
}

/*
 * Move the mount at @old_path so that it is attached at @new_path.
 *
 * The mount being moved is either attached (it has a parent) or is the
 * root of an anonymous namespace.  Returns 0 on success, the error from
 * lock_mount() or attach_recursive_mnt(), -EINVAL if one of the validity
 * checks fails, or -ELOOP if the move would create a cycle.
 */
static int do_move_mount(struct path *old_path, struct path *new_path)
{
        struct mnt_namespace *ns;
        struct mount *p;
        struct mount *old;
        struct mount *parent;
        struct mountpoint *mp, *old_mp;
        int err;
        bool attached;

        mp = lock_mount(new_path);
        if (IS_ERR(mp))
                return PTR_ERR(mp);

        /*
         * Snapshot the state of the old mount before attaching; parent,
         * old_mp and ns are used for the cleanup after a successful move.
         */
        old = real_mount(old_path->mnt);
        p = real_mount(new_path->mnt);
        parent = old->mnt_parent;
        attached = mnt_has_parent(old);
        old_mp = old->mnt_mp;
        ns = old->mnt_ns;

        err = -EINVAL;
        /* The mountpoint must be in our namespace. */
        if (!check_mnt(p))
                goto out;

        /* The thing moved must be mounted... */
        if (!is_mounted(&old->mnt))
                goto out;

        /* ... and either ours or the root of anon namespace */
        if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
                goto out;

        if (old->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        /* Only the root of a mount can be moved */
        if (old_path->dentry != old_path->mnt->mnt_root)
                goto out;

        /* Source and destination must both be directories or both not */
        if (d_is_dir(new_path->dentry) !=
            d_is_dir(old_path->dentry))
                goto out;
        /*
         * Don't move a mount residing in a shared parent.
         */
        if (attached && IS_MNT_SHARED(parent))
                goto out;
        /*
         * Don't move a mount tree containing unbindable mounts to a destination
         * mount which is shared.
         */
        if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
                goto out;
        err = -ELOOP;
        if (!check_for_nsfs_mounts(old))
                goto out;
        /* Refuse to move a mount underneath itself */
        for (; mnt_has_parent(p); p = p->mnt_parent)
                if (p == old)
                        goto out;

        err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
                                   attached);
        if (err)
                goto out;

        /* if the mount is moved, it should no longer expire
         * automatically */
        list_del_init(&old->mnt_expire);
        if (attached)
                put_mountpoint(old_mp);
out:
        unlock_mount(mp);
        if (!err) {
                /*
                 * Success: drop the old parent, or free the anonymous
                 * namespace the mount was the root of.
                 */
                if (attached)
                        mntput_no_expire(parent);
                else
                        free_mnt_ns(ns);
        }
        return err;
}

/*
 * Legacy entry for moving a mount: resolve @old_name (following symlinks)
 * and move the mount found there to @path via do_move_mount().
 *
 * Returns -EINVAL for a NULL or empty @old_name, otherwise the result of
 * kern_path() or do_move_mount().
 */
static int do_move_mount_old(struct path *path, const char *old_name)
{
        struct path old_path;
        int err;

        if (!old_name || !*old_name)
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
        if (!err) {
                err = do_move_mount(&old_path, path);
                path_put(&old_path);
        }

        return err;
}

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
                        struct path *path, int mnt_flags)
{
        struct mount *parent = real_mount(path->mnt);

        mnt_flags &= ~MNT_INTERNAL_FLAGS;

        if (unlikely(!check_mnt(parent))) {
                /* that's acceptable only for automounts done in private ns */
                if (!(mnt_flags & MNT_SHRINKABLE))
                        return -EINVAL;
                /* ... and for those we'd better have mountpoint still alive */
                if (!parent->mnt_ns)
                        return -EINVAL;
        }

        /* Refuse the same filesystem on the same mount point */
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
            path->mnt->mnt_root == path->dentry)
                return -EBUSY;

        if (d_is_symlink(newmnt->mnt.mnt_root))
                return -EINVAL;

        newmnt->mnt.mnt_flags = mnt_flags;
        return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 *
 * @fc: filesystem context whose root (fc->root) has already been set up.
 * @mountpoint: where the new mount is to be attached.
 * @mnt_flags: mount flags; mount_too_revealing() is given a pointer to them
 *             and so may adjust them.
 *
 * NOTE(review): the superblock's s_umount appears to be held on entry (it
 * is released here with up_write(), or via fc_drop_locked() on the early
 * error path) - confirm against the caller.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
                           unsigned int mnt_flags)
{
        struct vfsmount *mnt;
        struct mountpoint *mp;
        struct super_block *sb = fc->root->d_sb;
        int error;

        error = security_sb_kern_mount(sb);
        if (!error && mount_too_revealing(sb, &mnt_flags))
                error = -EPERM;

        if (unlikely(error)) {
                fc_drop_locked(fc);
                return error;
        }

        up_write(&sb->s_umount);

        mnt = vfs_create_mount(fc);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        mnt_warn_timestamp_expiry(mountpoint, mnt);

        mp = lock_mount(mountpoint);
        if (IS_ERR(mp)) {
                /* Could not lock the mountpoint: drop the new mount again */
                mntput(mnt);
                return PTR_ERR(mp);
        }
        error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
        unlock_mount(mp);
        /* The new mount was not attached: drop it */
        if (error < 0)
                mntput(mnt);
        return error;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 *
 * @fstype names the filesystem; for types flagged FS_HAS_SUBTYPE anything
 * after the first '.' is passed on as the "subtype" option (an empty
 * subtype is rejected with -EINVAL).  @name, when set, becomes the "source"
 * option and @data is parsed as monolithic mount data.
 *
 * Returns 0 on success, -EINVAL/-ENODEV for a missing or unknown type,
 * -EPERM if mount_capable() refuses, or the error of any step.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
                        int mnt_flags, const char *name, void *data)
{
        struct file_system_type *type;
        struct fs_context *fc;
        const char *subtype = NULL;
        int err;

        if (!fstype)
                return -EINVAL;

        type = get_fs_type(fstype);
        if (!type)
                return -ENODEV;

        if (type->fs_flags & FS_HAS_SUBTYPE) {
                const char *dot = strchr(fstype, '.');

                if (dot) {
                        if (!dot[1]) {
                                put_filesystem(type);
                                return -EINVAL;
                        }
                        subtype = dot + 1;
                }
        }

        fc = fs_context_for_mount(type, sb_flags);
        put_filesystem(type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Indicate to the filesystem that the mount request is coming
         * from the legacy mount system call.
         */
        fc->oldapi = true;

        if (subtype) {
                err = vfs_parse_fs_string(fc, "subtype",
                                          subtype, strlen(subtype));
                if (err)
                        goto out;
        }

        if (name) {
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
                if (err)
                        goto out;
        }

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        if (!mount_capable(fc)) {
                err = -EPERM;
                goto out;
        }

        err = vfs_get_tree(fc);
        if (err)
                goto out;

        err = do_new_mount_fc(fc, path, mnt_flags);
out:
        put_fs_context(fc);
        return err;
}

/*
 * Attach the automounted mount @m at @path.
 *
 * @m may be NULL (nothing to do, returns 0) or an ERR_PTR() (its error is
 * returned).  Otherwise @m is added with MNT_SHRINKABLE on top of the flags
 * of path->mnt.  If something is already mounted at @path the new mount is
 * discarded and 0 is returned.
 *
 * Returns 0 on success, -ELOOP if @m would be mounted onto its own root,
 * -ENOENT if cant_mount() rejects the dentry, or the error from
 * get_mountpoint() / do_add_mount().
 */
int finish_automount(struct vfsmount *m, struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct mountpoint *mp;
        struct mount *mnt;
        int err;

        if (!m)
                return 0;
        if (IS_ERR(m))
                return PTR_ERR(m);

        mnt = real_mount(m);
        /* The new mount record should have at least 2 refs to prevent it being
         * expired before we get a chance to add it
         */
        BUG_ON(mnt_get_count(mnt) < 2);

        if (m->mnt_sb == path->mnt->mnt_sb &&
            m->mnt_root == dentry) {
                err = -ELOOP;
                goto discard;
        }

        /*
         * We don't want to use lock_mount() - in this case finding something
         * that already overmounts our mountpoint means "quietly drop what
         * we've got", not "try to mount it on top".
         */
        inode_lock(dentry->d_inode);
        namespace_lock();
        if (unlikely(cant_mount(dentry))) {
                err = -ENOENT;
                goto discard_locked;
        }
        rcu_read_lock();
        if (unlikely(__lookup_mnt(path->mnt, dentry))) {
                /* Already overmounted: not an error, just drop ours */
                rcu_read_unlock();
                err = 0;
                goto discard_locked;
        }
        rcu_read_unlock();
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto discard_locked;
        }

        err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
        /* unlock_mount() also releases the namespace and inode locks */
        unlock_mount(mp);
        if (unlikely(err))
                goto discard;
        mntput(m);
        return 0;

discard_locked:
        namespace_unlock();
        inode_unlock(dentry->d_inode);
discard:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                namespace_lock();
                list_del_init(&mnt->mnt_expire);
                namespace_unlock();
        }
        /*
         * NOTE(review): two puts - presumably one per reference asserted by
         * the BUG_ON() above; confirm.
         */
        mntput(m);
        mntput(m);
        return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
        namespace_lock();

        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

        namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 *
 * @mounts: expiration list to scan; entries are linked through mnt_expire.
 *
 * Works in two passes under the namespace lock and the mount hash lock:
 * candidates are first moved to a local graveyard list, then each of them
 * is unmounted with umount_tree().
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct mount *mnt, *next;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        namespace_lock();
        lock_mount_hash();

        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
         * - only referenced by its parent vfsmount
         * - still marked for expiry (marked on the last call here; marks are
         *   cleared by mntput())
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                /*
                 * xchg() sets the mark and returns the previous one: a mount
                 * that was not marked yet is only marked on this pass.
                 */
                if (!xchg(&mnt->mnt_expiry_mark, 1) ||
                        propagate_mount_busy(mnt, 1))
                        continue;
                list_move(&mnt->mnt_expire, &graveyard);
        }
        /*
         * NOTE(review): the loop relies on umount_tree() taking the mount
         * off the graveyard list - confirm.
         */
        while (!list_empty(&graveyard)) {
                mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
        }
        unlock_mount_hash();
        namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * The tree below @parent is walked iteratively (no recursion): the walk
 * descends into shrinkable mounts that have children of their own and moves
 * shrinkable leaf mounts that propagate_mount_busy() does not report as busy
 * onto @graveyard.
 *
 * Returns the number of mounts moved to @graveyard.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
        struct mount *this_parent = parent;
        struct list_head *next;
        int found = 0;

repeat:
        /* Start scanning the children of the current level */
        next = this_parent->mnt_mounts.next;
resume:
        while (next != &this_parent->mnt_mounts) {
                struct list_head *tmp = next;
                struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

                /* Advance first: mnt may be moved to the graveyard below */
                next = tmp->next;
                if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
                        continue;
                /*
                 * Descend a level if the mnt_mounts list is non-empty.
                 */
                if (!list_empty(&mnt->mnt_mounts)) {
                        this_parent = mnt;
                        goto repeat;
                }

                if (!propagate_mount_busy(mnt, 1)) {
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
        }
        /*
         * All done at this level ... ascend and resume the search
         */
        if (this_parent != parent) {
                /* Continue with the sibling following the level just finished */
                next = this_parent->mnt_child.next;
                this_parent = this_parent->mnt_parent;
                goto resume;
        }
        return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * select_submounts() is called repeatedly; every batch it collects is
 * unmounted before the next call, until it finds nothing more.
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
        LIST_HEAD(graveyard);

        /* extract submounts of 'mountpoint' from the expiration list */
        while (select_submounts(mnt, &graveyard)) {
                while (!list_empty(&graveyard)) {
                        struct mount *victim;

                        victim = list_first_entry(&graveyard, struct mount,
                                                  mnt_expire);
                        touch_mnt_namespace(victim->mnt_ns);
                        umount_tree(victim, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                }
        }
}

/*
 * Copy up to PAGE_SIZE bytes of mount options from user space.
 *
 * @data: user pointer to the options, may be NULL.
 *
 * Returns NULL if @data is NULL, ERR_PTR(-ENOMEM) if the buffer cannot be
 * allocated, ERR_PTR(-EFAULT) if not even one byte could be copied, and
 * otherwise a PAGE_SIZE buffer allocated with kmalloc() that holds as many
 * bytes as could be read.  A short copy is not an error.
 */
static void *copy_mount_options(const void __user * data)
{
        char *copy;
        unsigned left, offset;

        if (!data)
                return NULL;

        copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        /* left = number of bytes copy_from_user() could not copy */
        left = copy_from_user(copy, data, PAGE_SIZE);

        /*
         * Not all architectures have an exact copy_from_user(). Resort to
         * byte at a time.
         */
        offset = PAGE_SIZE - left;
        while (left) {
                char c;
                /* Stop at the first byte that really cannot be read */
                if (get_user(c, (const char __user *)data + offset))
                        break;
                copy[offset] = c;
                left--;
                offset++;
        }

        /* Nothing at all was readable */
        if (left == PAGE_SIZE) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        return copy;
}

/*
 * Duplicate a user-space string of at most PATH_MAX bytes with
 * strndup_user().  A NULL @data yields NULL; otherwise the result of
 * strndup_user() is returned as is.
 */
static char *copy_mount_string(const void __user *data)
{
        if (!data)
                return NULL;

        return strndup_user(data, PATH_MAX);
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * @dev_name, @type_page and @data_page are kernel pointers; @path is the
 * already resolved mount target.  The MS_* flags are split into
 * per-mount flags (mnt_flags) and superblock flags (sb_flags), and the
 * request is then dispatched to the matching do_*() helper.
 *
 * Returns 0 on success or a negative errno.
 */
int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page)
{
        unsigned int mnt_flags = 0, sb_flags;
        int ret;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks */
        /* Force NUL termination at the last byte of the data page */
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;

        if (flags & MS_NOUSER)
                return -EINVAL;

        ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
        if (ret)
                return ret;
        if (!may_mount())
                return -EPERM;
        if ((flags & SB_MANDLOCK) && !may_mandlock())
                return -EPERM;

        /* Default to relatime unless overridden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;

        /* Separate the per-mountpoint flags */
        if (flags & MS_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (flags & MS_NODEV)
                mnt_flags |= MNT_NODEV;
        if (flags & MS_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (flags & MS_NOATIME)
                mnt_flags |= MNT_NOATIME;
        if (flags & MS_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;
        /* Strict atime wins over both relatime and noatime set above */
        if (flags & MS_STRICTATIME)
                mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
        if (flags & MS_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (flags & MS_NOSYMFOLLOW)
                mnt_flags |= MNT_NOSYMFOLLOW;

        /* The default atime for remount is preservation */
        if ((flags & MS_REMOUNT) &&
            ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
                       MS_STRICTATIME)) == 0)) {
                mnt_flags &= ~MNT_ATIME_MASK;
                mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
        }

        /* Flags that are handed on to the superblock */
        sb_flags = flags & (SB_RDONLY |
                            SB_SYNCHRONOUS |
                            SB_MANDLOCK |
                            SB_DIRSYNC |
                            SB_SILENT |
                            SB_POSIXACL |
                            SB_LAZYTIME |
                            SB_I_VERSION);

        /*
         * Dispatch on the operation; the order of the tests decides which
         * operation wins when several of these flags are set.
         */
        if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
                return do_reconfigure_mnt(path, mnt_flags);
        if (flags & MS_REMOUNT)
                return do_remount(path, flags, sb_flags, mnt_flags, data_page);
        if (flags & MS_BIND)
                return do_loopback(path, dev_name, flags & MS_REC);
        if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                return do_change_type(path, flags);
        if (flags & MS_MOVE)
                return do_move_mount_old(path, dev_name);

        return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                            data_page);
}

/*
 * Resolve the user-space path @dir_name (relative to the current working
 * directory, following symlinks) and perform the mount described by the
 * remaining arguments through path_mount().
 *
 * Returns the lookup error, or else the result of path_mount().
 */
long do_mount(const char *dev_name, const char __user *dir_name,
                const char *type_page, unsigned long flags, void *data_page)
{
        struct path path;
        int ret;

        ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
        if (!ret) {
                ret = path_mount(dev_name, &path, type_page, flags, data_page);
                path_put(&path);
        }

        return ret;
}

/*
 * Charge one mount namespace (UCOUNT_MNT_NAMESPACES) to the caller's
 * effective uid in user namespace @ns.  Returns the result of inc_ucount();
 * alloc_mnt_ns() treats a NULL result as -ENOSPC.
 */
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
}

/*
 * Undo inc_mnt_namespaces(): release one UCOUNT_MNT_NAMESPACES charge held
 * through @ucounts.
 */
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

/*
 * Release a mount namespace structure and the resources taken by
 * alloc_mnt_ns(): the inode number (not for anonymous namespaces), the
 * ucount charge and the user namespace reference.  @ns is freed and must
 * not be used afterwards.
 */
static void free_mnt_ns(struct mnt_namespace *ns)
{
        /* is_anon_ns() namespaces skip ns_free_inum(); alloc_mnt_ns() never allocates an inum for anon ones */
        if (!is_anon_ns(ns))
                ns_free_inum(&ns->ns);
        dec_mnt_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64-bit
 * number incrementing at 10 GHz will take 12,427 years to wrap, which
 * is effectively never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialise a mount namespace owned by @user_ns.
 *
 * The namespace is charged against the caller's UCOUNT_MNT_NAMESPACES
 * count first.  Unless @anon is true the namespace also gets an inum via
 * ns_alloc_inum() and a fresh sequence number from mnt_ns_seq.
 *
 * Returns the new namespace with a count of 1, or an ERR_PTR():
 *   -ENOSPC  the ucount charge failed
 *   -ENOMEM  the structure could not be allocated
 *   other    error returned by ns_alloc_inum()
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
        struct mnt_namespace *mnt_ns;
        struct ucounts *charge;
        int err;

        charge = inc_mnt_namespaces(user_ns);
        if (!charge)
                return ERR_PTR(-ENOSPC);

        err = -ENOMEM;
        mnt_ns = kzalloc(sizeof(*mnt_ns), GFP_KERNEL);
        if (!mnt_ns)
                goto out_uncharge;

        if (!anon) {
                err = ns_alloc_inum(&mnt_ns->ns);
                if (err)
                        goto out_free;
        }

        mnt_ns->ns.ops = &mntns_operations;
        if (!anon)
                mnt_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
        atomic_set(&mnt_ns->count, 1);
        INIT_LIST_HEAD(&mnt_ns->list);
        init_waitqueue_head(&mnt_ns->poll);
        spin_lock_init(&mnt_ns->ns_lock);
        mnt_ns->user_ns = get_user_ns(user_ns);
        mnt_ns->ucounts = charge;
        return mnt_ns;

out_free:
        kfree(mnt_ns);
out_uncharge:
        dec_mnt_namespaces(charge);
        return ERR_PTR(err);
}

/*
 * Produce the mount namespace for a new task.
 *
 * @flags:   clone flags; only CLONE_NEWNS is examined here
 * @ns:      the namespace to share or copy (must not be NULL)
 * @user_ns: user namespace that will own a newly created namespace
 * @new_fs:  optional fs_struct whose root/pwd mounts are switched over to
 *           the corresponding mounts in the copy
 *
 * Without CLONE_NEWNS, get_mnt_ns() is called on @ns and @ns is returned.
 * Otherwise a new namespace is allocated, the mount tree rooted at
 * @ns->root is duplicated with copy_tree() under namespace_lock(), and
 * every copied mount is attached to the new namespace.
 *
 * Returns the namespace to use, or an ERR_PTR() from alloc_mnt_ns() or
 * copy_tree().
 */
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                struct user_namespace *user_ns, struct fs_struct *new_fs)
{
        struct mnt_namespace *new_ns;
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct mount *p, *q;
        struct mount *old;
        struct mount *new;
        int copy_flags;

        BUG_ON(!ns);

        /* Common case: no new namespace requested, share the existing one. */
        if (likely(!(flags & CLONE_NEWNS))) {
                get_mnt_ns(ns);
                return ns;
        }

        old = ns->root;

        new_ns = alloc_mnt_ns(user_ns, false);
        if (IS_ERR(new_ns))
                return new_ns;

        namespace_lock();
        /* First pass: copy the tree topology */
        copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
        /* Crossing into a different user namespace: shared mounts become slaves. */
        if (user_ns != ns->user_ns)
                copy_flags |= CL_SHARED_TO_SLAVE;
        new = copy_tree(old, old->mnt.mnt_root, copy_flags);
        if (IS_ERR(new)) {
                namespace_unlock();
                free_mnt_ns(new_ns);
                return ERR_CAST(new);
        }
        /* Same user-namespace crossing: lock the copied tree under the mount hash lock. */
        if (user_ns != ns->user_ns) {
                lock_mount_hash();
                lock_mnt_tree(new);
                unlock_mount_hash();
        }
        new_ns->root = new;
        list_add_tail(&new_ns->list, &new->mnt_list);

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace.  We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         */
        p = old;
        q = new;
        while (p) {
                q->mnt_ns = new_ns;
                new_ns->mounts++;
                if (new_fs) {
                        /*
                         * Point new_fs at the copy (taking a reference on it)
                         * and remember the old mount so its reference can be
                         * dropped once namespace_lock is released.
                         */
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
                                rootmnt = &p->mnt;
                        }
                        if (&p->mnt == new_fs->pwd.mnt) {
                                new_fs->pwd.mnt = mntget(&q->mnt);
                                pwdmnt = &p->mnt;
                        }
                }
                p = next_mnt(p, old);
                q = next_mnt(q, new);
                if (!q)
                        break;
                /*
                 * Re-synchronise the two walks: advance p until it refers to
                 * the same mnt_root as q (presumably skipping source mounts
                 * that copy_tree() did not duplicate - confirm).
                 */
                while (p->mnt.mnt_root != q->mnt.mnt_root)
                        p = next_mnt(p, old);
        }
        namespace_unlock();

        /* Drop the references new_fs used to hold on the old mounts. */
        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);

        return new_ns;
}

struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
        struct mount *mnt = real_mount(m);
        struct mnt_namespace *ns;
        struct super_block *s;
        struct path path;
        int err;

        ns = alloc_mnt_ns(&init_user_ns, true);
        if (IS_ERR(ns)) {
                mntput(m);
                return ERR_CAST(ns);
        }
        mnt->mnt_ns = ns;
        ns->root = mnt;
        ns->mounts++;
        list_add(&mnt->mnt_list, &ns->list);

        err = vfs_path_lookup(m->mnt_root, m,
                        name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

        put_mnt_ns(ns);

        if (err)
                return ERR_PTR(err);

        /* trade a vfsmount reference for active sb one */
        s = path.mnt->mnt_sb;
        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

/*
 * mount(2) entry point: copy the filesystem type, device name and option
 * data in from user space, hand them to do_mount(), and free the kernel
 * copies again.  A failure to copy any argument is returned as-is.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
        char *fstype, *source;
        void *opts;
        int ret;

        fstype = copy_mount_string(type);
        if (IS_ERR(fstype))
                return PTR_ERR(fstype);

        source = copy_mount_string(dev_name);
        if (IS_ERR(source)) {
                ret = PTR_ERR(source);
                goto free_type;
        }

        opts = copy_mount_options(data);
        if (IS_ERR(opts)) {
                ret = PTR_ERR(opts);
                goto free_dev;
        }

        ret = do_mount(source, dir_name, fstype, flags, opts);

        kfree(opts);
free_dev:
        kfree(source);
free_type:
        kfree(fstype);
        return ret;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      file descriptor whose file uses fscontext_fops
 * @flags:      only FSMOUNT_CLOEXEC is accepted
 * @attr_flags: MOUNT_ATTR_* flags, translated to MNT_* flags for the mount
 *
 * Returns the new file descriptor on success or a negative errno:
 * -EPERM (may_mount() refused, mount too revealing, or SB_MANDLOCK
 * without may_mandlock()), -EINVAL (unknown flags, wrong kind of fd, or
 * no root in the context), -EBADF (bad @fs_fd), -EBUSY (context not in
 * FS_CONTEXT_AWAITING_MOUNT), or an error propagated from a callee.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                unsigned int, attr_flags)
{
        struct mnt_namespace *ns;
        struct fs_context *fc;
        struct file *file;
        struct path newmount;
        struct mount *mnt;
        struct fd f;
        unsigned int mnt_flags = 0;
        long ret;

        if (!may_mount())
                return -EPERM;

        if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
                return -EINVAL;

        /* Reject any attribute bit outside the supported set. */
        if (attr_flags & ~(MOUNT_ATTR_RDONLY |
                           MOUNT_ATTR_NOSUID |
                           MOUNT_ATTR_NODEV |
                           MOUNT_ATTR_NOEXEC |
                           MOUNT_ATTR__ATIME |
                           MOUNT_ATTR_NODIRATIME))
                return -EINVAL;

        /* Translate MOUNT_ATTR_* request bits into MNT_* mount flags. */
        if (attr_flags & MOUNT_ATTR_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (attr_flags & MOUNT_ATTR_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (attr_flags & MOUNT_ATTR_NODEV)
                mnt_flags |= MNT_NODEV;
        if (attr_flags & MOUNT_ATTR_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (attr_flags & MOUNT_ATTR_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;

        /* The atime mode is a multi-bit field: exactly one known value is allowed. */
        switch (attr_flags & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                mnt_flags |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                mnt_flags |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        f = fdget(fs_fd);
        if (!f.file)
                return -EBADF;

        /* Only files backed by fscontext_fops carry an fs_context. */
        ret = -EINVAL;
        if (f.file->f_op != &fscontext_fops)
                goto err_fsfd;

        fc = f.file->private_data;

        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                goto err_fsfd;

        /* There must be a valid superblock or we can't mount it */
        ret = -EINVAL;
        if (!fc->root)
                goto err_unlock;

        ret = -EPERM;
        if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
                pr_warn("VFS: Mount too revealing\n");
                goto err_unlock;
        }

        ret = -EBUSY;
        if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
                goto err_unlock;

        ret = -EPERM;
        if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
                goto err_unlock;

        newmount.mnt = vfs_create_mount(fc);
        if (IS_ERR(newmount.mnt)) {
                ret = PTR_ERR(newmount.mnt);
                goto err_unlock;
        }
        newmount.dentry = dget(fc->root);
        newmount.mnt->mnt_flags = mnt_flags;

        /* We've done the mount bit - now move the file context into more or
         * less the same state as if we'd done an fspick().  We don't want to
         * do any memory allocation or anything like that at this point as we
         * don't want to have to handle any errors incurred.
         */
        vfs_clean_context(fc);

        /* The new mount lives alone in its own anonymous namespace. */
        ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(ns)) {
                ret = PTR_ERR(ns);
                goto err_path;
        }
        mnt = real_mount(newmount.mnt);
        mnt->mnt_ns = ns;
        ns->root = mnt;
        ns->mounts = 1;
        list_add(&mnt->mnt_list, &ns->list);
        mntget(newmount.mnt);

        /* Attach to an apparent O_PATH fd with a note that we need to unmount
         * it, not just simply put it.
         */
        file = dentry_open(&newmount, O_PATH, fc->cred);
        if (IS_ERR(file)) {
                dissolve_on_fput(newmount.mnt);
                ret = PTR_ERR(file);
                goto err_path;
        }
        file->f_mode |= FMODE_NEED_UNMOUNT;

        /* On fd allocation failure the file is put again; ret carries the error. */
        ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
        if (ret >= 0)
                fd_install(ret, file);
        else
                fput(file);

        /* The labels below are also reached on success (fall-through). */
err_path:
        path_put(&newmount);
err_unlock:
        mutex_unlock(&fc->uapi_mutex);
err_fsfd:
        fdput(f);
        return ret;
}

/*
 * move_mount(2): relocate a mount.
 *
 * Together with fsopen()/fsmount() this installs a freshly created mount;
 * together with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can place a
 * copy of a mount subtree.
 *
 * @flags is a combination of MOVE_MOUNT_* bits.  The MOVE_MOUNT_F_* bits
 * control how the source path is looked up and the MOVE_MOUNT_T_* bits do
 * the same for the destination.  With MOVE_MOUNT_SET_GROUP the call is
 * routed to do_set_group() instead of do_move_mount().
 *
 * Returns 0 or a negative errno.
 */
SYSCALL_DEFINE5(move_mount,
                int, from_dfd, const char __user *, from_pathname,
                int, to_dfd, const char __user *, to_pathname,
                unsigned int, flags)
{
        struct path src, dst;
        unsigned int lookup;
        int err;

        if (!may_mount())
                return -EPERM;

        if (flags & ~MOVE_MOUNT__MASK)
                return -EINVAL;

        /* If someone gives a pathname, they aren't permitted to move
         * from an fd that requires unmount as we can't get at the flag
         * to clear it afterwards.
         */
        lookup = 0;
        if (flags & MOVE_MOUNT_F_SYMLINKS)
                lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
                lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_F_EMPTY_PATH)
                lookup |= LOOKUP_EMPTY;

        err = user_path_at(from_dfd, from_pathname, lookup, &src);
        if (err < 0)
                return err;

        /* same translation again, this time for the destination */
        lookup = 0;
        if (flags & MOVE_MOUNT_T_SYMLINKS)
                lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
                lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_T_EMPTY_PATH)
                lookup |= LOOKUP_EMPTY;

        err = user_path_at(to_dfd, to_pathname, lookup, &dst);
        if (err < 0)
                goto put_src;

        err = security_move_mount(&src, &dst);
        if (err < 0)
                goto put_dst;

        err = (flags & MOVE_MOUNT_SET_GROUP) ? do_set_group(&src, &dst)
                                             : do_move_mount(&src, &dst);

put_dst:
        path_put(&dst);
put_src:
        path_put(&src);
        return err;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                         const struct path *root)
{
        while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
                dentry = mnt->mnt_mountpoint;
                mnt = mnt->mnt_parent;
        }
        return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

/*
 * Report whether @path1 is reachable from @path2.  This wraps
 * is_path_reachable() with mount_lock held exclusively via
 * read_seqlock_excl().
 */
bool path_is_under(const struct path *path1, const struct path *path2)
{
        struct mount *start = real_mount(path1->mnt);
        bool under;

        read_seqlock_excl(&mount_lock);
        under = is_path_reachable(start, path1->dentry, path2);
        read_sequnlock_excl(&mount_lock);

        return under;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                const char __user *, put_old)
{
        struct path new, old, root;
        struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
        struct mountpoint *old_mp, *root_mp;
        int error;

        if (!may_mount())
                return -EPERM;

        /* Both arguments must resolve to directories; symlinks are followed. */
        error = user_path_at(AT_FDCWD, new_root,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
        if (error)
                goto out0;

        error = user_path_at(AT_FDCWD, put_old,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old, &new);
        if (error)
                goto out2;

        get_fs_root(current->fs, &root);
        old_mp = lock_mount(&old);
        error = PTR_ERR(old_mp);
        if (IS_ERR(old_mp))
                goto out3;

        /*
         * Validation phase.  "error" is preset before each group of checks
         * so that a plain "goto out4" reports the right errno.
         */
        error = -EINVAL;
        new_mnt = real_mount(new.mnt);
        root_mnt = real_mount(root.mnt);
        old_mnt = real_mount(old.mnt);
        /* Remember the current parents; new_mnt is detached further down. */
        ex_parent = new_mnt->mnt_parent;
        root_parent = root_mnt->mnt_parent;
        if (IS_MNT_SHARED(old_mnt) ||
                IS_MNT_SHARED(ex_parent) ||
                IS_MNT_SHARED(root_parent))
                goto out4;
        if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
                goto out4;
        if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out4;
        error = -ENOENT;
        if (d_unlinked(new.dentry))
                goto out4;
        error = -EBUSY;
        if (new_mnt == root_mnt || old_mnt == root_mnt)
                goto out4; /* loop, on the same file system  */
        error = -EINVAL;
        if (root.mnt->mnt_root != root.dentry)
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(root_mnt))
                goto out4; /* not attached */
        if (new.mnt->mnt_root != new.dentry)
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(new_mnt))
                goto out4; /* not attached */
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(old_mnt, old.dentry, &new))
                goto out4;
        /* make certain new is below the root */
        if (!is_path_reachable(new_mnt, new.dentry, &root))
                goto out4;
        /* All checks passed: rearrange the mount tree under the mount hash lock. */
        lock_mount_hash();
        umount_mnt(new_mnt);
        root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
        /* If the old root was locked in place, move that lock to the new root. */
        if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
                new_mnt->mnt.mnt_flags |= MNT_LOCKED;
                root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
        }
        /* mount old root on put_old */
        attach_mnt(root_mnt, old_mnt, old_mp);
        /* mount new_root on / */
        attach_mnt(new_mnt, root_parent, root_mp);
        /* NOTE(review): presumably balances a count taken on root_parent during the re-attach above - confirm */
        mnt_add_count(root_parent, -1);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
        put_mountpoint(root_mp);
        unlock_mount_hash();
        /* Switch root/cwd references that pointed at the old root over to the new one. */
        chroot_fs_refs(&root, &new);
        error = 0;
out4:
        unlock_mount(old_mp);
        /* Only on success: put new_mnt's former parent (presumably the reference the detached mount held - confirm). */
        if (!error)
                mntput_no_expire(ex_parent);
out3:
        path_put(&root);
out2:
        path_put(&old);
out1:
        path_put(&new);
out0:
        return error;
}

/*
 * Boot-time setup of the initial mount tree: mount rootfs, create the
 * initial (non-anonymous) mount namespace around it, install that
 * namespace in init_task's nsproxy, and make the rootfs root both the
 * cwd and the root of the current task.  Failure to create the mount or
 * the namespace panics.
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *rootfs_mnt;
        struct mnt_namespace *init_ns;
        struct mount *rm;
        struct path rootfs_root;

        rootfs_mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs",
                                    initramfs_options);
        if (IS_ERR(rootfs_mnt))
                panic("Can't create rootfs");

        init_ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(init_ns))
                panic("Can't allocate initial namespace");

        /* rootfs becomes the one and only mount of the initial namespace */
        rm = real_mount(rootfs_mnt);
        rm->mnt_ns = init_ns;
        init_ns->root = rm;
        init_ns->mounts = 1;
        list_add(&rm->mnt_list, &init_ns->list);

        init_task.nsproxy->mnt_ns = init_ns;
        get_mnt_ns(init_ns);

        rootfs_mnt->mnt_flags |= MNT_LOCKED;
        rootfs_root.mnt = rootfs_mnt;
        rootfs_root.dentry = rootfs_mnt->mnt_root;

        set_fs_pwd(current->fs, &rootfs_root);
        set_fs_root(current->fs, &rootfs_root);
}

/*
 * Boot-time initialisation of the mount layer: create the slab cache for
 * struct mount, allocate the mount and mountpoint hash tables (panicking
 * if either is missing), initialise kernfs and sysfs, create the "fs"
 * kobject, then bring up shmem, rootfs and the initial mount tree.
 * sysfs and kobject failures are only logged.
 */
void __init mnt_init(void)
{
        int rc;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                                      0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                      NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        /* both tables are mandatory */
        if (!(mount_hashtable && mountpoint_hashtable))
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        rc = sysfs_init();
        if (rc)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                       __func__, rc);

        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);

        shmem_init();
        init_rootfs();
        init_mount_tree();
}

/*
 * Drop one count on @ns.  When the count reaches zero the namespace's
 * mount tree is released with drop_collected_mounts() and the namespace
 * itself is freed.
 */
void put_mnt_ns(struct mnt_namespace *ns)
{
        if (atomic_dec_and_test(&ns->count)) {
                drop_collected_mounts(&ns->root->mnt);
                free_mnt_ns(ns);
        }
}

/*
 * Create a kernel-internal mount of filesystem @type using SB_KERNMOUNT.
 * On success the mount is tagged with MNT_NS_INTERNAL; it is a long-term
 * mount that should stay around until it is unmounted before the
 * filesystem is unregistered.  An ERR_PTR() from vfs_kern_mount() is
 * passed straight through.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
        struct vfsmount *kmnt;

        kmnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
        if (IS_ERR(kmnt))
                return kmnt;

        real_mount(kmnt)->mnt_ns = MNT_NS_INTERNAL;
        return kmnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

/*
 * Release a long-term kernel mount so that its mount point can be
 * released.  NULL and ERR_PTR() values are ignored.  The mount's
 * namespace pointer is cleared and an RCU grace period is waited out
 * before the reference is dropped.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR_OR_NULL(mnt))
                return;

        real_mount(mnt)->mnt_ns = NULL;
        synchronize_rcu();        /* yecchhh... */
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);

/*
 * Array form of kern_unmount() for @num entries of @mnt.  Every non-NULL
 * entry has its namespace pointer cleared first, then a single expedited
 * RCU grace period is waited out, and finally mntput() is called on each
 * slot of the array.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx;

        for (idx = 0; idx < num; idx++) {
                struct vfsmount *cur = mnt[idx];

                if (!cur)
                        continue;
                real_mount(cur)->mnt_ns = NULL;
        }

        /* one grace period covers the whole array */
        synchronize_rcu_expedited();

        for (idx = 0; idx < num; idx++)
                mntput(mnt[idx]);
}
EXPORT_SYMBOL(kern_unmount_array);

/*
 * Return the result of check_mnt() for the struct mount behind @mnt.
 */
bool our_mnt(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        return check_mnt(m);
}

bool current_chrooted(void)
{
        /* Does the current process have a non-standard root */
        struct path ns_root;
        struct path fs_root;
        bool chrooted;

        /* Find the namespace root */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);
        while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
                ;

        get_fs_root(current->fs, &fs_root);

        chrooted = !path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return chrooted;
}

/*
 * Check whether namespace @ns already contains a fully visible mount of
 * the same filesystem type as @sb whose flags permit the proposed new
 * mount flags in *@new_mnt_flags.
 *
 * Scans @ns->list with namespace_sem held for reading and the namespace
 * list locked.  On the first match the MNT_LOCK_READONLY and
 * MNT_LOCK_ATIME bits of the matching mount are OR-ed into *@new_mnt_flags
 * and true is returned; otherwise false is returned and *@new_mnt_flags
 * is left untouched.
 */
static bool mnt_already_visible(struct mnt_namespace *ns,
                                const struct super_block *sb,
                                int *new_mnt_flags)
{
        int new_flags = *new_mnt_flags;
        struct mount *mnt;
        bool visible = false;

        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_for_each_entry(mnt, &ns->list, mnt_list) {
                struct mount *child;
                int mnt_flags;

                /* Cursor entries on the list are not real mounts; skip them. */
                if (mnt_is_cursor(mnt))
                        continue;

                /* Only mounts of the same filesystem type are of interest. */
                if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                        continue;

                /* This mount is not fully visible if its root directory
                 * is not the root directory of the filesystem.
                 */
                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
                        continue;

                /* A local view of the mount flags */
                mnt_flags = mnt->mnt.mnt_flags;

                /* Don't miss readonly hidden in the superblock flags */
                if (sb_rdonly(mnt->mnt.mnt_sb))
                        mnt_flags |= MNT_LOCK_READONLY;

                /* Verify the mount flags are equal to or more permissive
                 * than the proposed new mount.
                 */
                if ((mnt_flags & MNT_LOCK_READONLY) &&
                    !(new_flags & MNT_READONLY))
                        continue;
                if ((mnt_flags & MNT_LOCK_ATIME) &&
                    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
                        continue;

                /* This mount is not fully visible if there are any
                 * locked child mounts that cover anything except for
                 * empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        /* Only worry about locked mounts */
                        if (!(child->mnt.mnt_flags & MNT_LOCKED))
                                continue;
                        /* Is the directory permanently empty? */
                        if (!is_empty_dir_inode(inode))
                                goto next;
                }
                /* Preserve the locked attributes */
                *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
                                               MNT_LOCK_ATIME);
                visible = true;
                goto found;
        /* Reached from the child loop: this candidate is rejected, try the next mount. */
        next:        ;
        }
found:
        unlock_ns_list(ns);
        up_read(&namespace_sem);
        return visible;
}

/*
 * Decide whether mounting @sb in the current mount namespace would be too
 * revealing.
 *
 * Returns false when the namespace is owned by init_user_ns or when @sb
 * does not have SB_I_USERNS_VISIBLE set.  A superblock that has
 * SB_I_USERNS_VISIBLE but lacks SB_I_NOEXEC or SB_I_NODEV triggers a
 * one-time warning and is treated as too revealing.  Otherwise the mount
 * is too revealing unless mnt_already_visible() finds a suitable existing
 * mount, in which case *@new_mnt_flags may be updated by that helper.
 */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
        const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        struct mnt_namespace *cur_ns = current->nsproxy->mnt_ns;
        unsigned long iflags;
        bool visible;

        if (cur_ns->user_ns == &init_user_ns)
                return false;

        /* Can this filesystem be too revealing? */
        iflags = sb->s_iflags;
        if (!(iflags & SB_I_USERNS_VISIBLE))
                return false;

        /* any required bit missing from the superblock? */
        if (~iflags & required_iflags) {
                WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
                          required_iflags);
                return true;
        }

        visible = mnt_already_visible(cur_ns, sb, new_mnt_flags);
        return !visible;
}

bool mnt_may_suid(struct vfsmount *mnt)
{
        /*
         * Foreign mounts (accessed via fchdir or through /proc
         * symlinks) are always treated as if they are nosuid.  This
         * prevents namespaces from trusting potentially unsafe
         * suid/sgid bits, file caps, or security labels that originate
         * in other namespaces.
         */
        return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
               current_in_userns(mnt->mnt_sb->s_user_ns);
}

/*
 * proc_ns_operations .get hook: return the ns_common of @task's mount
 * namespace after calling get_mnt_ns() on it, or NULL if the task has no
 * nsproxy.  The lookup is done under task_lock().
 */
static struct ns_common *mntns_get(struct task_struct *task)
{
        struct ns_common *found = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy) {
                found = &proxy->mnt_ns->ns;
                get_mnt_ns(to_mnt_ns(found));
        }
        task_unlock(task);

        return found;
}

/*
 * proc_ns_operations .put hook: call put_mnt_ns() on the mount namespace
 * that embeds @ns.
 */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}

/*
 * proc_ns_operations .install hook: switch @nsset over to the mount
 * namespace that embeds @ns.
 *
 * Requires CAP_SYS_ADMIN over the target namespace's user namespace and
 * both CAP_SYS_CHROOT and CAP_SYS_ADMIN over the user namespace of
 * @nsset's credentials.  Anonymous namespaces and an fs_struct with more
 * than one user are rejected.
 *
 * On success the nsproxy points at the new namespace, and both the root
 * and the working directory of the fs_struct are set to the path found by
 * looking up "/" from the namespace's root mount.
 *
 * Returns 0, -EPERM, -EINVAL, or the error from vfs_path_lookup().
 */
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct fs_struct *fs = nsset->fs;
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
        struct user_namespace *user_ns = nsset->cred->user_ns;
        struct path root;
        int err;

        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(user_ns, CAP_SYS_CHROOT) ||
            !ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (is_anon_ns(mnt_ns))
                return -EINVAL;

        /* root/pwd are rewritten below, so the fs_struct must have exactly one user. */
        if (fs->users != 1)
                return -EINVAL;

        /* Install the new namespace first; the old one is kept for a possible revert. */
        get_mnt_ns(mnt_ns);
        old_mnt_ns = nsproxy->mnt_ns;
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
                                "/", LOOKUP_DOWN, &root);
        if (err) {
                /* revert to old namespace */
                nsproxy->mnt_ns = old_mnt_ns;
                put_mnt_ns(mnt_ns);
                return err;
        }

        /* The switch is final: release the count that nsproxy held on the old namespace. */
        put_mnt_ns(old_mnt_ns);

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        path_put(&root);
        return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
        return to_mnt_ns(ns)->user_ns;
}

/*
 * Namespace operations table for mount namespaces ("mnt", CLONE_NEWNS).
 * alloc_mnt_ns() stores a pointer to this table in every namespace it
 * creates.
 */
const struct proc_ns_operations mntns_operations = {
        .name                = "mnt",
        .type                = CLONE_NEWNS,
        .get                = mntns_get,
        .put                = mntns_put,
        .install        = mntns_install,
        .owner                = mntns_owner,
};


















































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
   md.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

*/

#ifndef _MD_MD_H
#define _MD_MD_H

#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include "md-cluster.h"

/*
 * All-ones sector_t value, used as a sentinel (e.g. reshape_position ==
 * MaxSector means that no reshape is happening).
 */
#define MaxSector (~(sector_t)0)

/*
 * Request flags that md ORs together as its "fail fast" mask.
 *
 * These flags should really be called "NO_RETRY" rather than
 * "FAILFAST" because they make no promise about elapsed time,
 * only about the number of retries, which will be zero.
 * REQ_FAILFAST_DRIVER is not included because
 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
 * seems to suggest that the errors it avoids retrying should usually
 * be retried.
 */
#define        MD_FAILFAST        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)

/*
 * The struct embedded in rdev is used to serialize IO.
 */
struct serial_in_rdev {
        /* Cached rb-tree root.
         * NOTE(review): presumably holds 'struct serial_info' nodes - confirm.
         */
        struct rb_root_cached serial_rb;
        /* NOTE(review): presumably protects serial_rb - confirm against users. */
        spinlock_t serial_lock;
        /* Wait queue.
         * NOTE(review): presumably for IO waiting on a conflicting range - confirm.
         */
        wait_queue_head_t serial_io_wait;
};

/*
 * MD's 'extended' device
 */
struct md_rdev {
        /* Per-component-device state.  An rdev is linked into
         * mddev->disks through 'same_set' (see rdev_for_each()).
         */
        struct list_head same_set;        /* RAID devices within the same set */

        sector_t sectors;                /* Device size (in 512-byte sectors) */
        struct mddev *mddev;                /* RAID array if running */
        int last_events;                /* IO event timestamp */

        /*
         * If meta_bdev is non-NULL, it means that a separate device is
         * being used to store the metadata (superblock/bitmap) which
         * would otherwise be contained on the same device as the data (bdev).
         */
        struct block_device *meta_bdev;
        struct block_device *bdev;        /* block device handle */

        /* NOTE(review): presumably the pages holding the superblock (sb_page)
         * and the bad-block log (bb_page) - confirm against md.c.
         */
        struct page        *sb_page, *bb_page;
        int                sb_loaded;        /* NOTE(review): presumably non-zero once
                                         * sb_page has been read - confirm */
        __u64                sb_events;        /* NOTE(review): presumably the event count
                                         * read from this device's superblock
                                         * - confirm */
        sector_t        data_offset;        /* start of data in array */
        sector_t        new_data_offset;/* only relevant while reshaping */
        sector_t        sb_start;        /* offset of the super block (in 512byte sectors) */
        int                sb_size;        /* bytes in the superblock */
        int                preferred_minor;        /* autorun support */

        struct kobject        kobj;                /* embedded kobject; linked from the
                                         * array's sysfs directory by
                                         * sysfs_link_rdev() */

        /* A device can be in one of three states based on two flags:
         * Not working:   faulty==1 in_sync==0
         * Fully working: faulty==0 in_sync==1
         * Working, but not
         * in sync with array
         *                faulty==0 in_sync==0
         *
         * It can never have faulty==1, in_sync==1
         * This reduces the burden of testing multiple flags in many cases
         */

        unsigned long        flags;        /* bit set of 'enum flag_bits' bits. */
        wait_queue_head_t blocked_wait;        /* NOTE(review): presumably woken when
                                         * Blocked/BlockedBadBlocks is
                                         * cleared - confirm */

        int desc_nr;                        /* descriptor index in the superblock */
        int raid_disk;                        /* role of device in array */
        int new_raid_disk;                /* role that the device will have in
                                         * the array after a level-change completes.
                                         */
        int saved_raid_disk;                /* role that device used to have in the
                                         * array and could again if we did a partial
                                         * resync from the bitmap
                                         */
        /* The two members below share storage; which one is meaningful
         * depends on whether this device is a journal device.
         */
        union {
                sector_t recovery_offset;/* If this device has been partially
                                         * recovered, this is where we were
                                         * up to.
                                         */
                sector_t journal_tail;        /* If this device is a journal device,
                                         * this is the journal tail (journal
                                         * recovery start point)
                                         */
        };

        atomic_t        nr_pending;        /* number of pending requests.
                                         * only maintained for arrays that
                                         * support hot removal
                                         */
        atomic_t        read_errors;        /* number of consecutive read errors that
                                         * we have tried to ignore.
                                         */
        time64_t        last_read_error;        /* monotonic time since our
                                                 * last read error
                                                 */
        atomic_t        corrected_errors; /* number of corrected read errors,
                                           * for reporting to userspace and storing
                                           * in superblock.
                                           */

        struct serial_in_rdev *serial;  /* used for raid1 io serialization */

        struct work_struct del_work;        /* used for delayed sysfs removal */

        struct kernfs_node *sysfs_state; /* handle for 'state'
                                           * sysfs entry */
        /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
        struct kernfs_node *sysfs_unack_badblocks;
        /* handle for 'bad_blocks' sysfs dentry */
        struct kernfs_node *sysfs_badblocks;
        struct badblocks badblocks;        /* bad-block list consulted by
                                         * is_badblock() */

        /* Location of the PPL area on this device.
         * NOTE(review): PPL is presumably the "partial parity log" - confirm.
         */
        struct {
                short offset;        /* Offset from superblock to start of PPL.
                                 * Not used by external metadata. */
                unsigned int size;        /* Size in sectors of the PPL space */
                sector_t sector;        /* First sector of the PPL space */
        } ppl;
};
/* Bit numbers used in md_rdev.flags. */
enum flag_bits {
        Faulty,                        /* device is known to have a fault */
        In_sync,                /* device is in_sync with rest of array */
        Bitmap_sync,                /* ..actually, not quite In_sync.  Need a
                                 * bitmap-based recovery to get fully in sync.
                                 * The bit is only meaningful before device
                                 * has been passed to pers->hot_add_disk.
                                 */
        WriteMostly,                /* Avoid reading if at all possible */
        AutoDetected,                /* added by auto-detect */
        Blocked,                /* An error occurred but has not yet
                                 * been acknowledged by the metadata
                                 * handler, so don't allow writes
                                 * until it is cleared */
        WriteErrorSeen,                /* A write error has been seen on this
                                 * device
                                 */
        FaultRecorded,                /* Intermediate state for clearing
                                 * Blocked.  The Fault is/will-be
                                 * recorded in the metadata, but that
                                 * metadata hasn't been stored safely
                                 * on disk yet.
                                 */
        BlockedBadBlocks,        /* A writer is blocked because they
                                 * found an unacknowledged bad-block.
                                 * This can safely be cleared at any
                                 * time, and the writer will re-check.
                                 * It may be set at any time, and at
                                 * worst the writer will timeout and
                                 * re-check.  So setting it as
                                 * accurately as possible is good, but
                                 * not absolutely critical.
                                 */
        WantReplacement,        /* This device is a candidate to be
                                 * hot-replaced, either because it has
                                 * reported some faults, or because
                                 * of explicit request.
                                 */
        Replacement,                /* This device is a replacement for
                                 * a want_replacement device with same
                                 * raid_disk number.
                                 */
        Candidate,                /* For clustered environments only:
                                 * This device is seen locally but not
                                 * by the whole cluster
                                 */
        Journal,                /* This device is used as journal for
                                 * raid-5/6.
                                 * Usually, this device should be faster
                                 * than other devices in the array
                                 */
        ClusterRemove,                /* NOTE(review): undocumented in this
                                 * header; presumably marks a device
                                 * being removed in a clustered array
                                 * - confirm against md.c.
                                 */
        RemoveSynchronized,        /* synchronize_rcu() was called after
                                 * this device was known to be faulty,
                                 * so it is safe to remove without
                                 * another synchronize_rcu() call.
                                 */
        ExternalBbl,            /* External metadata provides bad
                                 * block management for a disk
                                 */
        FailFast,                /* Minimal retries should be attempted on
                                 * this device, so use REQ_FAILFAST_DEV.
                                 * Also don't try to repair failed reads.
                                 * It is expected that no bad block log
                                 * is present.
                                 */
        LastDev,                /* Seems to be the last working dev as
                                 * it didn't fail, so don't use FailFast
                                 * any more for metadata
                                 */
        CollisionCheck,                /*
                                 * check if there is a collision between
                                 * raid1 serial bios.
                                 */
};

/*
 * is_badblock - look up a sector range in @rdev's bad-block list.
 * @rdev:        device whose badblocks list is consulted
 * @s:           first sector of the range, relative to rdev->data_offset
 * @sectors:     length of the range in sectors
 * @first_bad:   passed through to badblocks_check(); on a non-zero result
 *               it is rebased so that it is relative to data_offset again
 * @bad_sectors: passed through to badblocks_check()
 *
 * Returns 0 without calling badblocks_check() when the list is empty,
 * otherwise whatever badblocks_check() returns.
 */
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
                              sector_t *first_bad, int *bad_sectors)
{
        int found;

        /* Common case: no bad blocks are recorded for this device. */
        if (!unlikely(rdev->badblocks.count))
                return 0;

        /* The badblocks list is indexed by device sector, not data sector. */
        found = badblocks_check(&rdev->badblocks, s + rdev->data_offset,
                                sectors, first_bad, bad_sectors);
        if (found)
                *first_bad -= rdev->data_offset;

        return found;
}
/*
 * Record / clear a bad range of @sectors sectors starting at @s on @rdev.
 * Both are implemented outside this header.
 * NOTE(review): the meaning of @is_new and of the return value is not
 * visible here - confirm against the definitions.
 */
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                              int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                                int is_new);
/* Forward declaration: only pointers to it are used in this header. */
struct md_cluster_info;

/*
 * Array-level flag numbers.
 * NOTE(review): presumably bit numbers in mddev->flags - confirm.
 *
 * Change UNSUPPORTED_MDDEV_FLAGS for each array type if a new flag is added.
 */
enum mddev_flags {
        MD_ARRAY_FIRST_USE,        /* First use of array, needs initialization */
        MD_CLOSING,                /* If set, we are closing the array, do not open
                                 * it then */
        MD_JOURNAL_CLEAN,        /* A raid with journal is already clean */
        MD_HAS_JOURNAL,                /* The raid array has journal feature set */
        MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
                                   * already took resync lock, need to
                                   * release the lock */
        MD_FAILFAST_SUPPORTED,        /* Using MD_FAILFAST on metadata writes is
                                 * supported as calls to md_error() will
                                 * never cause the array to become failed.
                                 */
        MD_HAS_PPL,                /* The raid array has PPL feature set */
        MD_HAS_MULTIPLE_PPLS,        /* The raid array has multiple PPLs feature set */
        MD_ALLOW_SB_UPDATE,        /* md_check_recovery is allowed to update
                                 * the metadata without taking reconfig_mutex.
                                 */
        MD_UPDATING_SB,                /* md_check_recovery is updating the metadata
                                 * without explicitly holding reconfig_mutex.
                                 */
        MD_NOT_READY,                /* do_md_run() is active, so 'array_state'
                                 * must not report that array is ready yet
                                 */
        MD_BROKEN,              /* This is used in RAID-0/LINEAR only, to stop
                                 * I/O in case an array member is gone/failed.
                                 */
};

/*
 * Superblock-update flag numbers.
 * NOTE(review): presumably bit numbers in mddev->sb_flags - confirm.
 */
enum mddev_sb_flags {
        MD_SB_CHANGE_DEVS,                /* Some device status has changed */
        MD_SB_CHANGE_CLEAN,        /* transition to or from 'clean' */
        MD_SB_CHANGE_PENDING,        /* switch from 'clean' to 'active' in progress */
        MD_SB_NEED_REWRITE,        /* metadata write needs to be repeated */
};

/* NOTE(review): presumably the number of serial_info objects reserved in
 * mddev->serial_info_pool - confirm against md.c.
 */
#define NR_SERIAL_INFOS                8
/* Records the sector range of one IO that is being serialized. */
struct serial_info {
        struct rb_node node;        /* rb-tree linkage */
        sector_t start;                /* start sector of rb node */
        sector_t last;                /* end sector of rb node */
        sector_t _subtree_last; /* highest sector in subtree of rb node */
};

/*
 * Per-array state.  Component devices (struct md_rdev) hang off 'disks';
 * reconfiguration is serialized by 'reconfig_mutex' (see mddev_lock()).
 */
struct mddev {
        void                                *private;
        struct md_personality                *pers;
        dev_t                                unit;
        int                                md_minor;
        struct list_head                disks;        /* md_rdev list, linked via
                                                 * md_rdev.same_set */
        unsigned long                        flags;        /* NOTE(review): presumably
                                                 * 'enum mddev_flags' bits
                                                 * - confirm */
        unsigned long                        sb_flags;        /* NOTE(review): presumably
                                                 * 'enum mddev_sb_flags'
                                                 * bits - confirm */

        int                                suspended;
        atomic_t                        active_io;
        int                                ro;
        int                                sysfs_active; /* set when sysfs deletes
                                                       * are happening, so run/
                                                       * takeover/stop are not safe
                                                       */
        struct gendisk                        *gendisk;        /* may be NULL; see
                                                         * mdname() */

        struct kobject                        kobj;
        int                                hold_active;
        /* NOTE(review): presumably the values stored in hold_active - confirm. */
#define        UNTIL_IOCTL        1
#define        UNTIL_STOP        2

        /* Superblock information */
        int                                major_version,
                                        minor_version,
                                        patch_version;
        int                                persistent;
        int                                external;        /* metadata is
                                                         * managed externally */
        char                                metadata_type[17]; /* externally set */
        int                                chunk_sectors;
        time64_t                        ctime, utime;
        int                                level, layout;
        char                                clevel[16];
        int                                raid_disks;
        int                                max_disks;
        sector_t                        dev_sectors;        /* used size of
                                                         * component devices */
        sector_t                        array_sectors; /* exported array size */
        int                                external_size; /* size managed
                                                        * externally */
        __u64                                events;
        /* If the last 'event' was simply a clean->dirty transition, and
         * we didn't write it to the spares, then it is safe and simple
         * to just decrement the event count on a dirty->clean transition.
         * So we record that possibility here.
         */
        int                                can_decrease_events;

        char                                uuid[16];

        /* If the array is being reshaped, we need to record the
         * new shape and an indication of where we are up to.
         * This is written to the superblock.
         * If reshape_position is MaxSector, then no reshape is happening (yet).
         */
        sector_t                        reshape_position;
        int                                delta_disks, new_level, new_layout;
        int                                new_chunk_sectors;
        int                                reshape_backwards;

        struct md_thread                *thread;        /* management thread */
        struct md_thread                *sync_thread;        /* doing resync or reconstruct */

        /* 'last_sync_action' is initialized to "none".  It is set when a
         * sync operation (i.e "data-check", "requested-resync", "resync",
         * "recovery", or "reshape") is started.  It holds this value even
         * when the sync thread is "frozen" (interrupted) or "idle" (stopped
         * or finished).  It is overwritten when a new sync operation is begun.
         */
        char                                *last_sync_action;
        sector_t                        curr_resync;        /* last block scheduled */
        /* As resync requests can complete out of order, we cannot easily track
         * how much resync has been completed.  So we occasionally pause until
         * everything completes, then set curr_resync_completed to curr_resync.
         * As such it may be well behind the real resync mark, but it is a value
         * we are certain of.
         */
        sector_t                        curr_resync_completed;
        unsigned long                        resync_mark;        /* a recent timestamp */
        sector_t                        resync_mark_cnt;/* blocks written at resync_mark */
        sector_t                        curr_mark_cnt; /* blocks scheduled now */

        sector_t                        resync_max_sectors; /* may be set by personality */

        atomic64_t                        resync_mismatches; /* count of sectors where
                                                            * parity/replica mismatch found
                                                            */

        /* allow user-space to request suspension of IO to regions of the array */
        sector_t                        suspend_lo;
        sector_t                        suspend_hi;
        /* if zero, use the system-wide default */
        int                                sync_speed_min;
        int                                sync_speed_max;

        /* resync even though the same disks are shared among md-devices */
        int                                parallel_resync;

        int                                ok_start_degraded;

        unsigned long                        recovery;        /* NOTE(review): presumably
                                                         * 'enum recovery_flags'
                                                         * bits - confirm */
        /* If a RAID personality determines that recovery (of a particular
         * device) will fail due to a read error on the source device, it
         * takes a copy of this number and does not attempt recovery again
         * until this number changes.
         */
        int                                recovery_disabled;

        int                                in_sync;        /* known not to need resync */
        /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
         * that we are never stopping an array while it is open.
         * 'reconfig_mutex' protects all other reconfiguration.
         * These locks are separate due to conflicting interactions
         * with bdev->bd_mutex.
         * Lock ordering is:
         *  reconfig_mutex -> bd_mutex
         *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
         */
        struct mutex                        open_mutex;
        struct mutex                        reconfig_mutex;
        atomic_t                        active;                /* general refcount */
        atomic_t                        openers;        /* number of active opens */

        int                                changed;        /* True if we might need to
                                                         * reread partition info */
        int                                degraded;        /* whether md should consider
                                                         * adding a spare
                                                         */

        atomic_t                        recovery_active; /* blocks scheduled, but not written */
        wait_queue_head_t                recovery_wait;
        sector_t                        recovery_cp;
        sector_t                        resync_min;        /* user requested sync
                                                         * starts here */
        sector_t                        resync_max;        /* resync should pause
                                                         * when it gets here */

        struct kernfs_node                *sysfs_state;        /* handle for 'array_state'
                                                         * file in sysfs.
                                                         */
        struct kernfs_node                *sysfs_action;  /* handle for 'sync_action' */
        struct kernfs_node                *sysfs_completed;        /* handle for 'sync_completed' */
        struct kernfs_node                *sysfs_degraded;        /* handle for 'degraded' */
        struct kernfs_node                *sysfs_level;                /* handle for 'level' */

        struct work_struct del_work;        /* used for delayed sysfs removal */

        /* "lock" protects:
         *   flush_bio transition from NULL to !NULL
         *   rdev superblocks, events
         *   clearing MD_CHANGE_*
         *   in_sync - and related safemode and MD_CHANGE changes
         *   pers (also protected by reconfig_mutex and pending IO).
         *   clearing ->bitmap
         *   clearing ->bitmap_info.file
         *   changing ->resync_{min,max}
         *   setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
         */
        spinlock_t                        lock;
        wait_queue_head_t                sb_wait;        /* for waiting on superblock updates */
        atomic_t                        pending_writes;        /* number of active superblock writes */

        unsigned int                        safemode;        /* if set, update "clean" superblock
                                                         * when no writes pending.
                                                         */
        unsigned int                        safemode_delay;
        struct timer_list                safemode_timer;
        struct percpu_ref                writes_pending;
        int                                sync_checkers;        /* # of threads checking writes_pending */
        struct request_queue                *queue;        /* for plugging ... */

        struct bitmap                        *bitmap; /* the bitmap for the device */
        struct {
                struct file                *file; /* the bitmap file */
                loff_t                        offset; /* offset from superblock of
                                                 * start of bitmap. May be
                                                 * negative, but not '0'
                                                 * For external metadata, offset
                                                 * from start of device.
                                                 */
                unsigned long                space; /* space available at this offset */
                loff_t                        default_offset; /* this is the offset to use when
                                                         * hot-adding a bitmap.  It should
                                                         * eventually be settable by sysfs.
                                                         */
                unsigned long                default_space; /* space available at
                                                        * default offset */
                struct mutex                mutex;
                unsigned long                chunksize;
                unsigned long                daemon_sleep; /* how many jiffies between updates? */
                unsigned long                max_write_behind; /* write-behind mode */
                int                        external;
                int                        nodes; /* Maximum number of nodes in the cluster */
                char                    cluster_name[64]; /* Name of the cluster */
        } bitmap_info;

        atomic_t                        max_corr_read_errors; /* max read retries */
        struct list_head                all_mddevs;

        struct attribute_group                *to_remove;

        struct bio_set                        bio_set;
        struct bio_set                        sync_set; /* for sync operations like
                                                   * metadata and bitmap writes
                                                   */

        /* Generic flush handling.
         * The last to finish preflush schedules a worker to submit
         * the rest of the request (without the REQ_PREFLUSH flag).
         */
        struct bio *flush_bio;
        atomic_t flush_pending;
        ktime_t start_flush, last_flush; /* last_flush is when the last completed
                                          * flush was started.
                                          */
        struct work_struct flush_work;
        struct work_struct event_work;        /* used by dm to report failure event */
        mempool_t *serial_info_pool;
        void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
        struct md_cluster_info                *cluster_info;
        unsigned int                        good_device_nr;        /* good device num within cluster raid */
        unsigned int                        noio_flag; /* for memalloc scope API */

        /* Single-bit boolean settings packed together. */
        bool        has_superblocks:1;
        bool        fail_last_dev:1;
        bool        serialize_policy:1;
};

/*
 * Sync/recovery state flag numbers.
 * NOTE(review): presumably bit numbers in mddev->recovery - confirm.
 */
enum recovery_flags {
        /*
         * If neither SYNC nor RESHAPE is set, then it is a recovery.
         */
        MD_RECOVERY_RUNNING,        /* a thread is running, or about to be started */
        MD_RECOVERY_SYNC,        /* actually doing a resync, not a recovery */
        MD_RECOVERY_RECOVER,        /* doing recovery, or need to try it. */
        MD_RECOVERY_INTR,        /* resync needs to be aborted for some reason */
        MD_RECOVERY_DONE,        /* thread is done and is waiting to be reaped */
        MD_RECOVERY_NEEDED,        /* we might need to start a resync/recover */
        MD_RECOVERY_REQUESTED,        /* user-space has requested a sync (used with SYNC) */
        MD_RECOVERY_CHECK,        /* user-space request for check-only, no repair */
        MD_RECOVERY_RESHAPE,        /* A reshape is happening */
        MD_RECOVERY_FROZEN,        /* User request to abort, and not restart, any action */
        MD_RECOVERY_ERROR,        /* sync-action interrupted because io-error */
        MD_RECOVERY_WAIT,        /* waiting for pers->start() to finish */
        MD_RESYNCING_REMOTE,        /* remote node is running resync thread */
};

/*
 * Take the array's reconfig_mutex with mutex_lock_interruptible().
 * The result of that call is returned unchanged and must be checked
 * by the caller (__must_check).
 */
static inline int __must_check mddev_lock(struct mddev *mddev)
{
        struct mutex *reconfig = &mddev->reconfig_mutex;

        return mutex_lock_interruptible(reconfig);
}

/*
 * Variant of mddev_lock() for call sites where failing because of an
 * interrupt is not acceptable: takes reconfig_mutex with plain
 * mutex_lock().
 */
static inline void mddev_lock_nointr(struct mddev *mddev)
{
        struct mutex *reconfig = &mddev->reconfig_mutex;

        mutex_lock(reconfig);
}

/*
 * Try to take the array's reconfig_mutex without sleeping.
 * Returns the result of mutex_trylock() unchanged.
 */
static inline int mddev_trylock(struct mddev *mddev)
{
        struct mutex *reconfig = &mddev->reconfig_mutex;

        return mutex_trylock(reconfig);
}
/* Counterpart of mddev_lock()/mddev_trylock(); implemented outside this
 * header.  NOTE(review): it is out of line, so it presumably does more
 * than a bare mutex_unlock() - confirm against the definition.
 */
extern void mddev_unlock(struct mddev *mddev);

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
        atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
}

static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
{
        atomic_add(nr_sectors, &bio->bi_disk->sync_io);
}

/*
 * Operations table describing one RAID personality.  Instances are passed
 * to register_md_personality()/unregister_md_personality().
 */
struct md_personality
{
        char *name;
        int level;
        struct list_head list;                /* NOTE(review): presumably links all
                                         * registered personalities - confirm */
        struct module *owner;
        /* Handle one bio for the array; the result must be checked by the
         * caller (__must_check).
         * NOTE(review): meaning of true/false is not visible here.
         */
        bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
        /*
         * Start-up work that does NOT require md_thread.  Tasks that
         * require md_thread should go into start().
         */
        int (*run)(struct mddev *mddev);
        /* Start-up work that requires md threads. */
        int (*start)(struct mddev *mddev);
        void (*free)(struct mddev *mddev, void *priv);
        void (*status)(struct seq_file *seq, struct mddev *mddev);
        /* error_handler must set ->faulty and clear ->in_sync
         * if appropriate, and should abort recovery if needed
         */
        void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
        int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
        int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
        int (*spare_active) (struct mddev *mddev);
        sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
        int (*resize) (struct mddev *mddev, sector_t sectors);
        sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
        int (*check_reshape) (struct mddev *mddev);
        int (*start_reshape) (struct mddev *mddev);
        void (*finish_reshape) (struct mddev *mddev);
        void (*update_reshape_pos) (struct mddev *mddev);
        /* quiesce suspends or resumes internal processing.
         * 1 - stop new actions and wait for action io to complete
         * 0 - return to normal behaviour
         */
        void (*quiesce) (struct mddev *mddev, int quiesce);
        /* takeover is used to transition an array from one
         * personality to another.  The new personality must be able
         * to handle the data in the current layout.
         * e.g. 2drive raid1 -> 2drive raid5
         *      ndrive raid5 -> degraded n+1drive raid6 with special layout
         * If the takeover succeeds, a new 'private' structure is returned.
         * This needs to be installed and then ->run used to activate the
         * array.
         */
        void *(*takeover) (struct mddev *mddev);
        /* Changes the consistency policy of an active array. */
        int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
};

/* A sysfs attribute whose show/store callbacks take the mddev directly. */
struct md_sysfs_entry {
        struct attribute attr;
        /* Format the attribute for @mddev into the supplied buffer. */
        ssize_t (*show)(struct mddev *, char *);
        /* Parse a user-supplied buffer of the given length for @mddev. */
        ssize_t (*store)(struct mddev *, const char *, size_t);
};
/* Defined outside this header.
 * NOTE(review): presumably the sysfs attribute group for the bitmap - confirm.
 */
extern struct attribute_group md_bitmap_group;

/*
 * NULL-tolerant wrapper around sysfs_get_dirent().
 * @sd:   parent node, may be NULL
 * @name: name of the entry to look up; only read, never modified
 *
 * Returns NULL when @sd is NULL, otherwise the result of
 * sysfs_get_dirent(@sd, @name).
 *
 * @name is const-qualified because it is only forwarded for lookup;
 * callers passing a plain 'char *' or a string literal are unaffected.
 */
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd,
                                                        const char *name)
{
        if (!sd)
                return NULL;

        return sysfs_get_dirent(sd, name);
}
/*
 * NULL-tolerant wrapper around sysfs_notify_dirent(): does nothing when
 * @sd is NULL.
 */
static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
{
        if (!sd)
                return;

        sysfs_notify_dirent(sd);
}

/*
 * Name to use for @mddev in messages: the gendisk's disk_name when a
 * gendisk is attached, otherwise the placeholder "mdX".
 */
static inline char *mdname(struct mddev *mddev)
{
        if (!mddev->gendisk)
                return "mdX";

        return mddev->gendisk->disk_name;
}

/*
 * Create the "rd<N>" symlink from the array's sysfs directory to @rdev's
 * kobject, where <N> is rdev->raid_disk.
 *
 * No link is created, and 0 is returned, when @rdev has the Replacement
 * or Journal flag set or when the array's kobject has no sysfs node yet.
 * Otherwise the result of sysfs_create_link() is returned.
 */
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
        char nm[20];

        if (test_bit(Replacement, &rdev->flags) ||
            test_bit(Journal, &rdev->flags) ||
            !mddev->kobj.sd)
                return 0;

        /* Bounded formatting: never write past nm[], whatever raid_disk is. */
        snprintf(nm, sizeof(nm), "rd%d", rdev->raid_disk);
        return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
}

/*
 * Remove the "rd<raid_disk>" symlink from the array's kobject.
 *
 * Skipped under the same conditions as in sysfs_link_rdev(): Replacement
 * or Journal devices, or an array kobject without a sysfs node.
 */
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
        char link_name[20];

        if (test_bit(Replacement, &rdev->flags))
                return;
        if (test_bit(Journal, &rdev->flags))
                return;
        if (!mddev->kobj.sd)
                return;

        sprintf(link_name, "rd%d", rdev->raid_disk);
        sysfs_remove_link(&mddev->kobj, link_name);
}

/*
 * Iterates through an arbitrary rdev list @head, linked via ->same_set.
 * It is safe to remove the current 'rdev' while iterating, but don't
 * touch 'tmp': it is the iterator's lookahead cursor.
 */
#define rdev_for_each_list(rdev, tmp, head)                                \
        list_for_each_entry_safe(rdev, tmp, head, same_set)

/*
 * Iterates through the 'same array disks' list, i.e. (mddev)->disks.
 */
#define rdev_for_each(rdev, mddev)                                \
        list_for_each_entry(rdev, &((mddev)->disks), same_set)

/*
 * Like rdev_for_each(), but built on list_for_each_entry_safe() with
 * 'tmp' as the lookahead cursor.
 */
#define rdev_for_each_safe(rdev, tmp, mddev)                                \
        list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)

/*
 * Like rdev_for_each(), but built on list_for_each_entry_rcu().
 */
#define rdev_for_each_rcu(rdev, mddev)                                \
        list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)

/*
 * Bookkeeping for one md helper thread.
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only -- confirm against md_register_thread() in md.c.
 */
struct md_thread {
        /* thread body; it is handed this md_thread */
        void                        (*run) (struct md_thread *thread);
        /* presumably the array this thread works for -- TODO confirm */
        struct mddev                *mddev;
        wait_queue_head_t        wqueue;
        /* presumably a bitmask indexed by THREAD_* bit numbers -- TODO confirm */
        unsigned long                flags;
        struct task_struct        *tsk;
        unsigned long                timeout;
        void                        *private;
};

/* presumably a bit number within md_thread.flags -- TODO confirm */
#define THREAD_WAKEUP  0

/* put_page() that tolerates a NULL page pointer. */
static inline void safe_put_page(struct page *p)
{
        if (!p)
                return;
        put_page(p);
}

extern int register_md_personality(struct md_personality *p);
extern int unregister_md_personality(struct md_personality *p);
extern int register_md_cluster_operations(struct md_cluster_operations *ops,
                struct module *module);
extern int unregister_md_cluster_operations(void);
extern int md_setup_cluster(struct mddev *mddev, int nodes);
extern void md_cluster_stop(struct mddev *mddev);
extern struct md_thread *md_register_thread(
        void (*run)(struct md_thread *thread),
        struct mddev *mddev,
        const char *name);
extern void md_unregister_thread(struct md_thread **threadp);
extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);

extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
                           sector_t sector, int size, struct page *page);
extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
                        struct page *page, int op, int op_flags,
                        bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);

extern void mddev_init(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);
extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);

extern void md_handle_request(struct mddev *mddev, struct bio *bio);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                                   struct mddev *mddev);

extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
                                     bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
                                      bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

/*
 * Report whether the disk behind @rdev has lost GENHD_FL_UP.
 *
 * When it has, MD_BROKEN is set on the owning array and true is returned;
 * the warning is printed only by the caller that actually flips the bit.
 * @md_type is used purely for the log message.
 */
static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
{
        int disk_flags = rdev->bdev->bd_disk->flags;

        if (disk_flags & GENHD_FL_UP)
                return false;

        if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
                pr_warn("md: %s: %s array has a missing/failed member\n",
                        mdname(rdev->mddev), md_type);
        return true;
}

static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
        int faulty = test_bit(Faulty, &rdev->flags);
        if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
}

extern struct md_cluster_operations *md_cluster_ops;
/* Non-zero when @mddev has cluster_info set and more than one bitmap node. */
static inline int mddev_is_clustered(struct mddev *mddev)
{
        if (!mddev->cluster_info)
                return 0;
        return mddev->bitmap_info.nodes > 1;
}

/* Clear every bit of @unsupported_flags from mddev->flags. */
static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
        unsigned long unsupported_flags)
{
        unsigned long keep_mask = ~unsupported_flags;

        mddev->flags &= keep_mask;
}

/*
 * If @bio is a WRITE_SAME and the queue of the disk it targets advertises
 * no max_write_same_sectors, zero the array queue's limit as well.
 */
static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
{
        if (bio_op(bio) != REQ_OP_WRITE_SAME)
                return;
        if (bio->bi_disk->queue->limits.max_write_same_sectors)
                return;

        mddev->queue->limits.max_write_same_sectors = 0;
}

/*
 * If @bio is a WRITE_ZEROES and the queue of the disk it targets advertises
 * no max_write_zeroes_sectors, zero the array queue's limit as well.
 */
static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
{
        if (bio_op(bio) != REQ_OP_WRITE_ZEROES)
                return;
        if (bio->bi_disk->queue->limits.max_write_zeroes_sectors)
                return;

        mddev->queue->limits.max_write_zeroes_sectors = 0;
}

struct mdu_array_info_s;
struct mdu_disk_info_s;

extern int mdp_major;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);

extern const struct block_device_operations md_fops;

#endif /* _MD_MD_H */













































    1 


    1 












































































































































































    1 
































































































































































































































































































































































































































































    1 




























































    1 







    1 







    1 



    1 





    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ioctl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>

#include "internal.h"

#include <asm/ioctls.h>

/* So that the fiemap access checks can't overflow on 32 bit machines. */
#define FIEMAP_MAX_EXTENTS        (UINT_MAX / sizeof(struct fiemap_extent))

/**
 * vfs_ioctl - call filesystem specific ioctl methods
 * @filp:        open file to invoke ioctl method on
 * @cmd:        ioctl command to execute
 * @arg:        command-specific argument for ioctl
 *
 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
 * returns -ENOTTY.
 *
 * Returns 0 on success, -errno on error.
 */
long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        int error = -ENOTTY;

        if (!filp->f_op->unlocked_ioctl)
                goto out;

        error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
        if (error == -ENOIOCTLCMD)
                error = -ENOTTY;
 out:
        return error;
}
EXPORT_SYMBOL(vfs_ioctl);

/*
 * FIBMAP: read a block number from user space at @p, run it through bmap()
 * and write the resulting block number back to @p.
 *
 * Needs CAP_SYS_RAWIO.  A negative input is rejected with -EINVAL.  A
 * result that does not fit in an int is reported as -ERANGE (with a
 * rate-limited warning).  On any error 0 is what gets stored back to @p.
 */
static int ioctl_fibmap(struct file *filp, int __user *p)
{
        struct inode *inode = file_inode(filp);
        struct super_block *sb = inode->i_sb;
        sector_t mapped;
        int user_blk;
        int err;

        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;

        err = get_user(user_blk, p);
        if (err)
                return err;
        if (user_blk < 0)
                return -EINVAL;

        mapped = user_blk;
        err = bmap(inode, &mapped);

        /* The user-visible slot is an int; refuse to silently truncate. */
        if (mapped > INT_MAX) {
                err = -ERANGE;
                pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n",
                                    current->comm, task_pid_nr(current),
                                    sb->s_id, filp);
        }

        user_blk = err ? 0 : mapped;

        if (put_user(user_blk, p))
                err = -EFAULT;

        return err;
}

#define SET_UNKNOWN_FLAGS        (FIEMAP_EXTENT_DELALLOC)
#define SET_NO_UNMOUNTED_IO_FLAGS        (FIEMAP_EXTENT_DATA_ENCRYPTED)
#define SET_NOT_ALIGNED_FLAGS        (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)

/**
 * fiemap_fill_next_extent - record one extent for a fiemap request
 * @fieinfo:        Fiemap context handed to ->fiemap
 * @logical:        Logical start offset of the extent, in bytes
 * @phys:        Physical start offset of the extent, in bytes
 * @len:        Length of the extent, in bytes
 * @flags:        FIEMAP_EXTENT flags describing the extent
 *
 * Meant to be called from a file system's ->fiemap callback.  When the
 * caller supplied room for extents, the extent is copied to the next free
 * slot of the user array and the mapped count is bumped; when
 * fi_extents_max is 0 the extent is only counted.
 *
 * Returns 0 on success, -EFAULT if the copy to user space fails, and 1
 * when no further extent should be reported (the array is full, or
 * @flags carries FIEMAP_EXTENT_LAST).
 */
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
                            u64 phys, u64 len, u32 flags)
{
        struct fiemap_extent __user *slot;
        struct fiemap_extent rec;

        /* Counting mode: nothing is copied out. */
        if (fieinfo->fi_extents_max == 0) {
                fieinfo->fi_extents_mapped++;
                return !!(flags & FIEMAP_EXTENT_LAST);
        }

        if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
                return 1;

        /* Add the summary flags implied by the more specific ones. */
        if (flags & SET_UNKNOWN_FLAGS)
                flags |= FIEMAP_EXTENT_UNKNOWN;
        if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
                flags |= FIEMAP_EXTENT_ENCODED;
        if (flags & SET_NOT_ALIGNED_FLAGS)
                flags |= FIEMAP_EXTENT_NOT_ALIGNED;

        /* Zero the whole record; fields not set below go out as zero. */
        memset(&rec, 0, sizeof(rec));
        rec.fe_logical = logical;
        rec.fe_physical = phys;
        rec.fe_length = len;
        rec.fe_flags = flags;

        slot = fieinfo->fi_extents_start + fieinfo->fi_extents_mapped;
        if (copy_to_user(slot, &rec, sizeof(rec)))
                return -EFAULT;

        fieinfo->fi_extents_mapped++;
        if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
                return 1;
        return !!(flags & FIEMAP_EXTENT_LAST);
}
EXPORT_SYMBOL(fiemap_fill_next_extent);

/**
 * fiemap_prep - validate a fiemap request before mapping
 * @inode:        Inode to operate on
 * @fieinfo:        Fiemap context handed to ->fiemap
 * @start:        Start of the range to map
 * @len:        Length of the range to map; may be shortened by this function
 * @supported_flags:        Fiemap flags the file system understands
 *
 * Every ->fiemap instance must call this to check the request against the
 * file system's limits.  The range is clamped to s_maxbytes, unsupported
 * flags are rejected (and handed back in fi_flags), and dirty data is
 * written back first when FIEMAP_FLAG_SYNC is requested.
 *
 * Returns 0 on success, or a negative error on failure.
 */
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 *len, u32 supported_flags)
{
        u64 maxbytes = inode->i_sb->s_maxbytes;
        u32 rejected;

        if (!*len)
                return -EINVAL;
        if (start >= maxbytes)
                return -EFBIG;

        /* Clamp the request to what the file system can address. */
        if (*len > maxbytes || (maxbytes - *len) < start)
                *len = maxbytes - start;

        /* FIEMAP_FLAG_SYNC is handled right here, so always accept it. */
        supported_flags = (supported_flags | FIEMAP_FLAG_SYNC) &
                          FIEMAP_FLAGS_COMPAT;

        rejected = fieinfo->fi_flags & ~supported_flags;
        if (rejected) {
                fieinfo->fi_flags = rejected;
                return -EBADR;
        }

        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
                return 0;
        return filemap_write_and_wait(inode->i_mapping);
}
EXPORT_SYMBOL(fiemap_prep);

/*
 * FS_IOC_FIEMAP: copy the request header in, let the inode's ->fiemap fill
 * the user's extent array, then copy the updated header (flags and mapped
 * extent count) back out.
 *
 * Returns -EOPNOTSUPP without ->fiemap, -EINVAL for an oversized extent
 * count, -EFAULT on a failed user copy, else whatever ->fiemap returned.
 */
static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
        struct inode *inode = file_inode(filp);
        struct fiemap_extent_info fieinfo = { 0, };
        struct fiemap req;
        int ret;

        if (!inode->i_op->fiemap)
                return -EOPNOTSUPP;
        if (copy_from_user(&req, ufiemap, sizeof(req)))
                return -EFAULT;
        if (req.fm_extent_count > FIEMAP_MAX_EXTENTS)
                return -EINVAL;

        fieinfo.fi_extents_start = ufiemap->fm_extents;
        fieinfo.fi_extents_max = req.fm_extent_count;
        fieinfo.fi_flags = req.fm_flags;

        ret = inode->i_op->fiemap(inode, &fieinfo, req.fm_start,
                                  req.fm_length);

        /* The header is written back even when ->fiemap reported an error. */
        req.fm_flags = fieinfo.fi_flags;
        req.fm_mapped_extents = fieinfo.fi_extents_mapped;
        if (copy_to_user(ufiemap, &req, sizeof(req)))
                ret = -EFAULT;

        return ret;
}

/*
 * Clone @olen bytes at @off of the file behind @srcfd into @dst_file at
 * @destoff.
 *
 * Returns -EBADF for a bad @srcfd, -EXDEV when the two files live on
 * different mounts, the error from vfs_clone_file_range() if it fails,
 * -EINVAL when a non-zero @olen was not cloned in full, and 0 otherwise.
 */
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
                             u64 off, u64 olen, u64 destoff)
{
        struct fd src_file = fdget(srcfd);
        loff_t cloned;
        int ret;

        if (!src_file.file)
                return -EBADF;

        if (src_file.file->f_path.mnt != dst_file->f_path.mnt) {
                ret = -EXDEV;
                goto out_put;
        }

        cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
                                      olen, 0);
        if (cloned < 0)
                ret = cloned;
        else
                ret = (olen && cloned != olen) ? -EINVAL : 0;

out_put:
        fdput(src_file);
        return ret;
}

/*
 * Copy a struct file_clone_range in from user space and forward its
 * fields to ioctl_file_clone().  Returns -EFAULT if the copy fails.
 */
static long ioctl_file_clone_range(struct file *file,
                                   struct file_clone_range __user *argp)
{
        struct file_clone_range req;

        if (copy_from_user(&req, argp, sizeof(req)))
                return -EFAULT;

        return ioctl_file_clone(file, req.src_fd, req.src_offset,
                                req.src_length, req.dest_offset);
}

#ifdef CONFIG_BLOCK

/* Convert a byte offset into a block number using the inode's i_blkbits. */
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
{
        return offset >> inode->i_blkbits;
}

/* Convert a block number into a byte offset using the inode's i_blkbits. */
static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
{
        return blk << inode->i_blkbits;
}

/**
 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
 * @inode: the inode to map
 * @fieinfo: the fiemap info struct that will be passed back to userspace
 * @start: where to start mapping in the inode
 * @len: how much space to map
 * @get_block: the fs's get_block function
 *
 * This does FIEMAP for block based inodes.  Basically it will just loop
 * through get_block until we hit the number of extents we want to map, or we
 * go past the end of the file and hit a hole.
 *
 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
 * please do not use this function, it will stop at the first unmapped block
 * beyond i_size.
 *
 * If you use this function directly, you need to do your own locking. Use
 * generic_block_fiemap if you want the locking done for you.
 */
static int __generic_block_fiemap(struct inode *inode,
                           struct fiemap_extent_info *fieinfo, loff_t start,
                           loff_t len, get_block_t *get_block)
{
        struct buffer_head map_bh;
        sector_t start_blk, last_blk;
        loff_t isize = i_size_read(inode);
        /*
         * logical/phys/size describe the extent found most recently but not
         * yet handed to fiemap_fill_next_extent(); size == 0 means there is
         * no such pending extent.
         */
        u64 logical = 0, phys = 0, size = 0;
        u32 flags = FIEMAP_EXTENT_MERGED;
        bool past_eof = false, whole_file = false;
        int ret = 0;

        /* Validate the request; FIEMAP_FLAG_SYNC is the only flag accepted. */
        ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
        if (ret)
                return ret;

        /*
         * Either the i_mutex or other appropriate locking needs to be held
         * since we expect isize to not change at all through the duration of
         * this call.
         */
        if (len >= isize) {
                whole_file = true;
                len = isize;
        }

        /*
         * Some filesystems can't deal with being asked to map less than
         * blocksize, so make sure our len is at least block length.
         */
        if (logical_to_blk(inode, len) == 0)
                len = blk_to_logical(inode, 1);

        start_blk = logical_to_blk(inode, start);
        last_blk = logical_to_blk(inode, start + len - 1);

        do {
                /*
                 * we set b_size to the total size we want so it will map as
                 * many contiguous blocks as possible at once
                 */
                memset(&map_bh, 0, sizeof(struct buffer_head));
                map_bh.b_size = len;

                /* last argument 0: look up only, do not create blocks */
                ret = get_block(inode, start_blk, &map_bh, 0);
                if (ret)
                        break;

                /* HOLE */
                if (!buffer_mapped(&map_bh)) {
                        start_blk++;

                        /*
                         * We want to handle the case where there is an
                         * allocated block at the front of the file, and then
                         * nothing but holes up to the end of the file properly,
                         * to make sure that extent at the front gets properly
                         * marked with FIEMAP_EXTENT_LAST
                         */
                        if (!past_eof &&
                            blk_to_logical(inode, start_blk) >= isize)
                                past_eof = 1;

                        /*
                         * First hole after going past the EOF, this is our
                         * last extent
                         */
                        if (past_eof && size) {
                                flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
                                ret = fiemap_fill_next_extent(fieinfo, logical,
                                                              phys, size,
                                                              flags);
                        } else if (size) {
                                ret = fiemap_fill_next_extent(fieinfo, logical,
                                                              phys, size, flags);
                                /* pending extent has been reported */
                                size = 0;
                        }

                        /* if we have holes up to/past EOF then we're done */
                        if (start_blk > last_blk || past_eof || ret)
                                break;
                } else {
                        /*
                         * We have gone over the length of what we wanted to
                         * map, and it wasn't the entire file, so add the extent
                         * we got last time and exit.
                         *
                         * This is for the case where say we want to map all the
                         * way up to the second to the last block in a file, but
                         * the last block is a hole, making the second to last
                         * block FIEMAP_EXTENT_LAST.  In this case we want to
                         * see if there is a hole after the second to last block
                         * so we can mark it properly.  If we found data after
                         * we exceeded the length we were requesting, then we
                         * are good to go, just add the extent to the fieinfo
                         * and break
                         */
                        if (start_blk > last_blk && !whole_file) {
                                ret = fiemap_fill_next_extent(fieinfo, logical,
                                                              phys, size,
                                                              flags);
                                break;
                        }

                        /*
                         * if size != 0 then we know we already have an extent
                         * to add, so add it.
                         */
                        if (size) {
                                ret = fiemap_fill_next_extent(fieinfo, logical,
                                                              phys, size,
                                                              flags);
                                if (ret)
                                        break;
                        }

                        /* remember the mapping just found as the pending extent */
                        logical = blk_to_logical(inode, start_blk);
                        phys = blk_to_logical(inode, map_bh.b_blocknr);
                        size = map_bh.b_size;
                        flags = FIEMAP_EXTENT_MERGED;

                        /* continue the walk right after this mapping */
                        start_blk += logical_to_blk(inode, size);

                        /*
                         * If we are past the EOF, then we need to make sure as
                         * soon as we find a hole that the last extent we found
                         * is marked with FIEMAP_EXTENT_LAST
                         */
                        if (!past_eof && logical + size >= isize)
                                past_eof = true;
                }
                /* offer to reschedule each pass; give up on a fatal signal */
                cond_resched();
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

        } while (1);

        /* If ret is 1 then we just hit the end of the extent array */
        if (ret == 1)
                ret = 0;

        return ret;
}

/**
 * generic_block_fiemap - FIEMAP for block based inodes
 * @inode: The inode to map
 * @fieinfo: The mapping information
 * @start: Byte offset at which to start mapping
 * @len: The length, in bytes, of the range to attempt to map
 * @get_block: The block mapping function for the fs
 *
 * Locked wrapper: runs __generic_block_fiemap() between inode_lock()
 * and inode_unlock() and returns its result.
 */
int generic_block_fiemap(struct inode *inode,
                         struct fiemap_extent_info *fieinfo, u64 start,
                         u64 len, get_block_t *get_block)
{
        int err;

        inode_lock(inode);
        err = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
        inode_unlock(inode);

        return err;
}
EXPORT_SYMBOL(generic_block_fiemap);

#endif  /*  CONFIG_BLOCK  */

/*
 * Compatibility path for the legacy XFS pre-allocation ioctls, which are
 * older than the fallocate syscall.
 *
 * Of 'struct space_resv' only l_start, l_len and l_whence are looked at.
 * l_whence selects what l_start is relative to (start of file, current
 * position, or current size); anything else yields -EINVAL.  The request
 * is then passed to vfs_fallocate() with FALLOC_FL_KEEP_SIZE added to
 * @mode.
 */
static int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
        struct inode *inode = file_inode(filp);
        struct space_resv sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        if (sr.l_whence == SEEK_CUR)
                sr.l_start += filp->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start,
                        sr.l_len);
}

/* on ia32 l_start is on a 32-bit boundary */
#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
/*
 * Same logic as ioctl_preallocate(), but reading the request through
 * 'struct space_resv_32' to account for the different alignment.
 */
static int compat_ioctl_preallocate(struct file *file, int mode,
                                    struct space_resv_32 __user *argp)
{
        struct inode *inode = file_inode(file);
        struct space_resv_32 sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        if (sr.l_whence == SEEK_CUR)
                sr.l_start += file->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
#endif

/*
 * Dispatch the ioctls handled in this file for @filp: FIBMAP goes to
 * ioctl_fibmap(), the space reservation commands go to
 * ioctl_preallocate() with the matching fallocate mode.  Any other
 * command returns -ENOIOCTLCMD.
 */
static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p)
{
        int mode;

        switch (cmd) {
        case FIBMAP:
                return ioctl_fibmap(filp, p);
        case FS_IOC_RESVSP:
        case FS_IOC_RESVSP64:
                mode = 0;
                break;
        case FS_IOC_UNRESVSP:
        case FS_IOC_UNRESVSP64:
                mode = FALLOC_FL_PUNCH_HOLE;
                break;
        case FS_IOC_ZERO_RANGE:
                mode = FALLOC_FL_ZERO_RANGE;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return ioctl_preallocate(filp, mode, p);
}

/*
 * FIONBIO: read an int from @argp and set (non-zero) or clear (zero) the
 * non-blocking flag in filp->f_flags, under filp->f_lock.
 */
static int ioctl_fionbio(struct file *filp, int __user *argp)
{
        unsigned int mask = O_NONBLOCK;
        int enable;
        int err;

        err = get_user(enable, argp);
        if (err)
                return err;

#ifdef __sparc__
        /* SunOS compatibility item. */
        if (O_NONBLOCK != O_NDELAY)
                mask |= O_NDELAY;
#endif

        spin_lock(&filp->f_lock);
        if (enable)
                filp->f_flags |= mask;
        else
                filp->f_flags &= ~mask;
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * FIOASYNC: read an int from @argp and, if the requested FASYNC state
 * differs from the current one, ask the file's ->fasync to switch it.
 *
 * Returns 0 when nothing changes or ->fasync succeeds, -ENOTTY when a
 * change is needed but the file has no ->fasync, or the negative error
 * from get_user()/->fasync.
 */
static int ioctl_fioasync(unsigned int fd, struct file *filp,
                          int __user *argp)
{
        unsigned int wanted;
        int on;
        int ret;

        ret = get_user(on, argp);
        if (ret)
                return ret;

        wanted = on ? FASYNC : 0;

        /* FASYNC already in the requested state: nothing to do. */
        if (!((wanted ^ filp->f_flags) & FASYNC))
                return 0;

        if (!filp->f_op->fasync)
                return -ENOTTY;

        /* fasync() adjusts filp->f_flags */
        ret = filp->f_op->fasync(fd, filp, on);
        return ret < 0 ? ret : 0;
}

/*
 * FIFREEZE: freeze the superblock that @filp lives on.
 *
 * Requires CAP_SYS_ADMIN in the superblock's user namespace.  Uses the
 * file system's ->freeze_super when present, the generic freeze_super()
 * when only ->freeze_fs exists, and returns -EOPNOTSUPP when neither
 * hook is provided.
 */
static int ioctl_fsfreeze(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (sb->s_op->freeze_super)
                return sb->s_op->freeze_super(sb);

        /* Without either hook the file system cannot be frozen. */
        if (!sb->s_op->freeze_fs)
                return -EOPNOTSUPP;

        return freeze_super(sb);
}

/*
 * FITHAW: thaw the superblock that @filp lives on.
 *
 * Requires CAP_SYS_ADMIN in the superblock's user namespace.  Uses the
 * file system's ->thaw_super when present, else the generic thaw_super().
 */
static int ioctl_fsthaw(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!sb->s_op->thaw_super)
                return thaw_super(sb);

        return sb->s_op->thaw_super(sb);
}

/*
 * FIDEDUPERANGE: run a dedupe request described by the user structure at
 * @argp against @file.
 *
 * dest_count is read first to size the variable-length request; requests
 * larger than PAGE_SIZE are refused with -ENOMEM.  The request is then
 * copied into the kernel, dest_count is overwritten with the value used
 * for sizing (so a concurrent change in user memory cannot alter it), and
 * vfs_dedupe_file_range() is called.  On success the whole structure is
 * copied back to user space.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, or the error from
 * memdup_user()/vfs_dedupe_file_range().
 */
static int ioctl_file_dedupe_range(struct file *file,
                                   struct file_dedupe_range __user *argp)
{
        struct file_dedupe_range *range;
        unsigned long nbytes;
        u16 count;
        int ret;

        if (get_user(count, &argp->dest_count))
                return -EFAULT;

        nbytes = offsetof(struct file_dedupe_range __user, info[count]);
        if (nbytes > PAGE_SIZE)
                return -ENOMEM;

        range = memdup_user(argp, nbytes);
        if (IS_ERR(range))
                return PTR_ERR(range);

        range->dest_count = count;
        ret = vfs_dedupe_file_range(file, range);
        if (!ret && copy_to_user(argp, range, nbytes))
                ret = -EFAULT;

        kfree(range);
        return ret;
}

/*
 * do_vfs_ioctl() - handle the ioctl commands common to all files.
 *
 * This is a helper for sys_ioctl and compat_sys_ioctl only; it is not meant
 * for drivers and must not be EXPORT_SYMBOL()'d.
 *
 * Returns the result of the command, or -ENOIOCTLCMD when @cmd is not one
 * of the generic commands handled here.
 *
 * When you add any new common ioctls to the switches above and below,
 * please ensure they have compatible arguments in compat mode.
 */
static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                        unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct inode *inode = file_inode(filp);

        switch (cmd) {
        case FIOCLEX:
        case FIONCLEX:
                /* FIOCLEX sets close-on-exec, FIONCLEX clears it. */
                set_close_on_exec(fd, cmd == FIOCLEX);
                return 0;

        case FIONBIO:
                return ioctl_fionbio(filp, argp);

        case FIOASYNC:
                return ioctl_fioasync(fd, filp, argp);

        case FIOQSIZE: {
                loff_t bytes;

                /* Only directories, regular files and symlinks qualify. */
                if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
                    !S_ISLNK(inode->i_mode))
                        return -ENOTTY;

                bytes = inode_get_bytes(inode);
                if (copy_to_user(argp, &bytes, sizeof(bytes)))
                        return -EFAULT;
                return 0;
        }

        case FIFREEZE:
                return ioctl_fsfreeze(filp);

        case FITHAW:
                return ioctl_fsthaw(filp);

        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, argp);

        case FIGETBSZ:
                /* anon_bdev filesystems may not have a block size */
                if (!inode->i_sb->s_blocksize)
                        return -EINVAL;

                return put_user(inode->i_sb->s_blocksize, (int __user *)argp);

        case FICLONE:
                return ioctl_file_clone(filp, arg, 0, 0, 0);

        case FICLONERANGE:
                return ioctl_file_clone_range(filp, argp);

        case FIDEDUPERANGE:
                return ioctl_file_dedupe_range(filp, argp);

        case FIONREAD:
                /* Regular files report the bytes left after f_pos. */
                if (S_ISREG(inode->i_mode))
                        return put_user(i_size_read(inode) - filp->f_pos,
                                        (int __user *)argp);

                return vfs_ioctl(filp, cmd, arg);

        default:
                if (S_ISREG(inode->i_mode))
                        return file_ioctl(filp, cmd, argp);
                break;
        }

        return -ENOIOCTLCMD;
}

/*
 * ioctl(2) entry point.
 *
 * Looks up @fd (-EBADF if it is not open), asks the security layer for
 * permission, then tries the generic commands in do_vfs_ioctl().  Commands
 * it does not recognise (-ENOIOCTLCMD) are passed on to vfs_ioctl().
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = security_file_ioctl(f.file, cmd, arg);
        if (!error) {
                error = do_vfs_ioctl(f.file, fd, cmd, arg);
                if (error == -ENOIOCTLCMD)
                        error = vfs_ioctl(f.file, cmd, arg);
        }

        fdput(f);
        return error;
}

#ifdef CONFIG_COMPAT
/**
 * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
 *
 * Drivers normally do not call this directly; they install it in their
 * struct file_operations:
 *
 *     .compat_ioctl = compat_ptr_ioctl,
 *
 * The argument is converted with compat_ptr() and handed to the file's
 * ->unlocked_ioctl handler.  On most architectures that conversion changes
 * nothing.  arch/s390 is the exception: compat_ptr() clears the top bit of
 * a 32-bit pointer value, so user space pointers to the second 2GB alias
 * the first 2GB, as is the case for native 32-bit s390 user space.
 *
 * Consequently this helper is only suitable for ioctl handlers that either
 * ignore the argument or take a pointer to a compatible data type.  If any
 * command handled by fops->unlocked_ioctl takes a plain integer rather
 * than a pointer, or uses a data type whose layout differs between 32-bit
 * and 64-bit architectures, a proper handler is required instead of
 * compat_ptr_ioctl.
 *
 * Returns -ENOIOCTLCMD when the file has no ->unlocked_ioctl method.
 */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        if (file->f_op->unlocked_ioctl)
                return file->f_op->unlocked_ioctl(file, cmd,
                                        (unsigned long)compat_ptr(arg));

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(compat_ptr_ioctl);

/*
 * Compat ioctl(2) entry point.
 *
 * After the fd lookup and the security check, a few commands are handled
 * here because their argument must not go through compat_ptr() or because
 * their layout differs in compat mode.  Everything else is offered to
 * do_vfs_ioctl() first and then to the file's ->compat_ioctl method.  A
 * command that nobody recognises is reported as -ENOTTY.
 */
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = security_file_ioctl_compat(f.file, cmd, arg);
        if (error) {
                fdput(f);
                return error;
        }

        switch (cmd) {
        /* FICLONE takes an int argument, so don't use compat_ptr() */
        case FICLONE:
                error = ioctl_file_clone(f.file, arg, 0, 0, 0);
                break;

#if defined(CONFIG_X86_64)
        /* these get messy on amd64 due to alignment differences */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
                error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
                break;
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
                                compat_ptr(arg));
                break;
        case FS_IOC_ZERO_RANGE_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
                                compat_ptr(arg));
                break;
#endif

        /*
         * everything else in do_vfs_ioctl() takes either a compatible
         * pointer argument or no argument -- call it with a modified
         * argument.
         */
        default:
                error = do_vfs_ioctl(f.file, fd, cmd,
                                     (unsigned long)compat_ptr(arg));
                if (error == -ENOIOCTLCMD) {
                        /* Not a generic command: try the file's handler. */
                        if (f.file->f_op->compat_ioctl)
                                error = f.file->f_op->compat_ioctl(f.file,
                                                                   cmd, arg);
                        if (error == -ENOIOCTLCMD)
                                error = -ENOTTY;
                }
                break;
        }

        fdput(f);

        return error;
}
#endif



















    2 




    2 




















































    1 










    2 
























































































    2 




    2 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <xen/xen.h>

#include <asm/fpu/internal.h>
#include <asm/sev-es.h>
#include <asm/traps.h>
#include <asm/kdebug.h>

/*
 * Signature shared by all exception fixup handlers in this file.  The
 * arguments are, in order: the matching exception table entry, the
 * register state at the time of the exception, the trap number, the
 * hardware error code and the faulting address.
 */
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
                            struct pt_regs *, int, unsigned long,
                            unsigned long);

/*
 * Resolve the fixup address of an exception table entry.  The fixup field
 * holds an offset relative to its own location, so the absolute address is
 * the field's address plus its contents.
 */
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
        unsigned long base = (unsigned long)&x->fixup;

        return base + x->fixup;
}
/*
 * Resolve the handler function of an exception table entry.  As with the
 * fixup field, the handler field is an offset relative to its own address.
 */
static inline ex_handler_t
ex_fixup_handler(const struct exception_table_entry *x)
{
        unsigned long base = (unsigned long)&x->handler;

        return (ex_handler_t)(base + x->handler);
}

/*
 * Plain fixup: resume execution at the entry's fixup address.  The trap
 * number, error code and fault address are not used.  Always returns true.
 */
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
                                  struct pt_regs *regs, int trapnr,
                                  unsigned long error_code,
                                  unsigned long fault_addr)
{
        unsigned long resume = ex_fixup_addr(fixup);

        regs->ip = resume;
        return true;
}
EXPORT_SYMBOL(ex_handler_default);

/*
 * Like the default fixup, but additionally stores the trap number in
 * regs->ax so that the code at the fixup address can see it.  Always
 * returns true.
 */
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
                                struct pt_regs *regs, int trapnr,
                                unsigned long error_code,
                                unsigned long fault_addr)
{
        regs->ax = trapnr;
        regs->ip = ex_fixup_addr(fixup);

        return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fault);

/*
 * Fixup for a failed restore of a task's FPU state.
 *
 * This is not expected to trigger: the FPU state of a task that uses the
 * FPU (task->thread.fpu.state) should always be valid.  However, past bugs
 * let userspace set reserved bits in the XSAVE area via PTRACE_SETREGSET or
 * sys_rt_sigreturn().  XRSTOR then failed when switching to the task and
 * the FPU registers of the previously running task leaked.  To mitigate
 * that class of vulnerability, warn once and load the initial FPU state
 * (essentially zeroing all FPU registers) before continuing at the fixup
 * address.
 */
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
                                    struct pt_regs *regs, int trapnr,
                                    unsigned long error_code,
                                    unsigned long fault_addr)
{
        /* Captured before the default handler rewrites regs->ip. */
        void *where = (void *)instruction_pointer(regs);

        WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
                  where);

        __copy_kernel_to_fpregs(&init_fpstate, -1);

        return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);

/*
 * Fixup for faults in user access code.  Warns once if the trap was a
 * general protection fault rather than a page fault, then resumes at the
 * fixup address.  Always returns true.
 */
__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
                                  struct pt_regs *regs, int trapnr,
                                  unsigned long error_code,
                                  unsigned long fault_addr)
{
        unsigned long resume;

        WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");

        resume = ex_fixup_addr(fixup);
        regs->ip = resume;
        return true;
}
EXPORT_SYMBOL(ex_handler_uaccess);

/*
 * Fixup for faults in user copy code.  Same as the uaccess fixup, but the
 * trap number is also placed in regs->ax for the code at the fixup address.
 * Always returns true.
 */
__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
                               struct pt_regs *regs, int trapnr,
                               unsigned long error_code,
                               unsigned long fault_addr)
{
        WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");

        regs->ax = trapnr;
        regs->ip = ex_fixup_addr(fixup);

        return true;
}
EXPORT_SYMBOL(ex_handler_copy);

/*
 * Fixup for a faulting unchecked RDMSR.  The first occurrence is logged
 * together with a register and stack dump; the MSR number is taken from
 * regs->cx.  Execution then continues at the fixup address with ax and dx
 * cleared.  Always returns true.
 */
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
                                       struct pt_regs *regs, int trapnr,
                                       unsigned long error_code,
                                       unsigned long fault_addr)
{
        bool first_report;

        first_report = pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
                                    (unsigned int)regs->cx, regs->ip,
                                    (void *)regs->ip);
        if (first_report)
                show_stack_regs(regs);

        /* Pretend that the read succeeded and returned 0. */
        regs->dx = 0;
        regs->ax = 0;
        regs->ip = ex_fixup_addr(fixup);

        return true;
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);

/*
 * Fixup for a faulting unchecked WRMSR.  The first occurrence is logged
 * together with a register and stack dump; the MSR number comes from
 * regs->cx and the attempted value from dx:ax.  Execution then continues at
 * the fixup address.  Always returns true.
 */
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
                                       struct pt_regs *regs, int trapnr,
                                       unsigned long error_code,
                                       unsigned long fault_addr)
{
        bool first_report;

        first_report = pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
                                    (unsigned int)regs->cx,
                                    (unsigned int)regs->dx,
                                    (unsigned int)regs->ax,
                                    regs->ip, (void *)regs->ip);
        if (first_report)
                show_stack_regs(regs);

        /* Pretend that the write succeeded. */
        regs->ip = ex_fixup_addr(fixup);

        return true;
}
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);

/*
 * Fixup that resets the FS segment register before resuming at the fixup
 * address.  FS is loaded with 0; on CPUs flagged with X86_BUG_NULL_SEG it
 * is first loaded with __USER_DS.  Returns the result of
 * ex_handler_default().
 */
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
                                   struct pt_regs *regs, int trapnr,
                                   unsigned long error_code,
                                   unsigned long fault_addr)
{
        /*
         * NOTE(review): presumably a NULL selector load alone does not fully
         * reset the segment on X86_BUG_NULL_SEG CPUs, hence the extra load
         * of __USER_DS first -- confirm against the bug's definition.
         */
        if (static_cpu_has(X86_BUG_NULL_SEG))
                asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
        asm volatile ("mov %0, %%fs" : : "rm" (0));
        return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
}
EXPORT_SYMBOL(ex_handler_clear_fs);

/*
 * Classify the exception fixup registered for the instruction at @ip.
 *
 * Returns EX_HANDLER_NONE when @ip has no exception table entry,
 * EX_HANDLER_FAULT for ex_handler_fault, EX_HANDLER_UACCESS for the
 * uaccess and copy handlers, and EX_HANDLER_OTHER for anything else.
 */
enum handler_type ex_get_fault_handler_type(unsigned long ip)
{
        const struct exception_table_entry *entry;
        ex_handler_t fn;

        entry = search_exception_tables(ip);
        if (!entry)
                return EX_HANDLER_NONE;

        fn = ex_fixup_handler(entry);

        if (fn == ex_handler_fault)
                return EX_HANDLER_FAULT;

        if (fn == ex_handler_uaccess || fn == ex_handler_copy)
                return EX_HANDLER_UACCESS;

        return EX_HANDLER_OTHER;
}

/*
 * Try to recover from an exception using the exception tables.
 *
 * Looks up regs->ip in the exception tables and, if an entry exists, runs
 * its handler with the given trap number, error code and fault address.
 *
 * Returns 0 when there is no entry for regs->ip, otherwise the handler's
 * return value (the handlers in this file return true).
 */
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
                    unsigned long fault_addr)
{
        const struct exception_table_entry *e;
        ex_handler_t handler;

#ifdef CONFIG_PNPBIOS
        /*
         * A fault while running in the PNP BIOS code segment: flag the BIOS
         * as broken, then restore the stack pointer and jump to the address
         * recorded in pnp_bios_fault_esp/pnp_bios_fault_eip.  The jump does
         * not come back here, hence the panic() if it ever does.
         */
        if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
                extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
                extern u32 pnp_bios_is_utter_crap;
                pnp_bios_is_utter_crap = 1;
                printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
                __asm__ volatile(
                        "movl %0, %%esp\n\t"
                        "jmp *%1\n\t"
                        : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
                panic("do_trap: can't hit this");
        }
#endif

        e = search_exception_tables(regs->ip);
        if (!e)
                return 0;

        handler = ex_fixup_handler(e);
        return handler(e, regs, trapnr, error_code, fault_addr);
}

/* Defined elsewhere; checked below to stop runaway nested early exceptions. */
extern unsigned int early_recursion_flag;

/*
 * Restricted version used during very early boot
 *
 * NMIs are ignored.  Otherwise the exception is offered to
 * fixup_exception(), and a #UD that report_bug() classifies as a warning
 * is skipped over.  Anything else prints a panic message with the register
 * state and halts the CPU forever.  When early_recursion_flag exceeds 2 the
 * CPU is halted straight away without printing anything.
 */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
        /* Ignore early NMIs. */
        if (trapnr == X86_TRAP_NMI)
                return;

        if (early_recursion_flag > 2)
                goto halt_loop;

        /*
         * Old CPUs leave the high bits of CS on the stack
         * undefined.  I'm not sure which CPUs do this, but at least
         * the 486 DX works this way.
         * Xen pv domains are not using the default __KERNEL_CS.
         */
        if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
                goto fail;

        /*
         * The full exception fixup machinery is available as soon as
         * the early IDT is loaded.  This means that it is the
         * responsibility of extable users to either function correctly
         * when handlers are invoked early or to simply avoid causing
         * exceptions before they're ready to handle them.
         *
         * This is better than filtering which handlers can be used,
         * because refusing to call a handler here is guaranteed to
         * result in a hard-to-debug panic.
         *
         * Keep in mind that not all vectors actually get here.  Early
         * page faults, for example, are special.
         */
        if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
                return;

        if (trapnr == X86_TRAP_UD) {
                if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
                        /* Skip the ud2. */
                        regs->ip += LEN_UD2;
                        return;
                }

                /*
                 * If this was a BUG and report_bug returns or if this
                 * was just a normal #UD, we want to continue onward and
                 * crash.
                 */
        }

fail:
        /* regs->orig_ax is printed as the error code. */
        early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
                     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
                     regs->orig_ax, read_cr2());

        show_regs(regs);

halt_loop:
        /* Unrecoverable: park the CPU; this function never returns from here. */
        while (true)
                halt();
}

































































    1 




































    1 














































    1 














    1 

    1 

























































    1 
    1 











    1 

    1 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 */

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

/*
 * Per-cgroup state of the pids controller.
 *
 * @css is the first member, so css_pids() maps a NULL css to a NULL
 * pids_cgroup; the hierarchy walks in this file rely on that to stop at
 * the root.
 */
struct pids_cgroup {
        struct cgroup_subsys_state        css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        /* Number of pids currently charged to this cgroup (pids.current). */
        atomic64_t                        counter;
        /* Maximum allowed value of @counter (pids.max); %PIDS_MAX if unlimited. */
        atomic64_t                        limit;

        /* Handle for "pids.events" */
        struct cgroup_file                events_file;

        /* Number of times fork failed because limit was hit. */
        atomic64_t                        events_limit;
};

/* Map a cgroup_subsys_state to the pids_cgroup that embeds it. */
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        struct pids_cgroup *pids = container_of(css, struct pids_cgroup, css);

        return pids;
}

/* Return the pids_cgroup corresponding to the parent css of @pids. */
static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        struct cgroup_subsys_state *parent_css = pids->css.parent;

        return css_pids(parent_css);
}

/*
 * Allocate the controller state for a new cgroup.  The new cgroup starts
 * with no pids charged, no limit (%PIDS_MAX) and a zero event count.
 * @parent is not used.
 *
 * Returns the embedded css, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
        struct pids_cgroup *pids = kzalloc(sizeof(*pids), GFP_KERNEL);

        if (!pids)
                return ERR_PTR(-ENOMEM);

        atomic64_set(&pids->limit, PIDS_MAX);
        atomic64_set(&pids->counter, 0);
        atomic64_set(&pids->events_limit, 0);

        return &pids->css;
}

/* Release the controller state allocated by pids_css_alloc(). */
static void pids_css_free(struct cgroup_subsys_state *css)
{
        struct pids_cgroup *pids = css_pids(css);

        kfree(pids);
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 *
 * Walks from @pids towards the root and uncharges every level that has a
 * parent; the topmost level is not touched.
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *level = pids;

        while (parent_pids(level)) {
                pids_cancel(level, num);
                level = parent_pids(level);
        }
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * The configured pid limit is *not* enforced here: the charge always
 * succeeds and the resulting count may exceed the limit.  This is only used
 * for reverting failed attaches, where there is no other way out than
 * violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *level = pids;

        while (parent_pids(level)) {
                atomic64_add(num, &level->counter);
                level = parent_pids(level);
        }
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * Unlike pids_charge(), this honours the configured limits.  Each level is
 * charged on the way up; as soon as one level ends up above its limit,
 * every charge made so far (that level included) is undone.
 *
 * Returns 0 if the charge succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *level, *undo;

        for (level = pids; parent_pids(level); level = parent_pids(level)) {
                int64_t charged = atomic64_add_return(num, &level->counter);

                /*
                 * Since the count is capped to the maximum number of pid_t,
                 * if the limit is %PIDS_MAX then we know that this test
                 * will never fail.
                 */
                if (charged <= atomic64_read(&level->limit))
                        continue;

                /* Over the limit: roll back everything charged so far. */
                for (undo = pids; undo != level; undo = parent_pids(undo))
                        pids_cancel(undo, num);
                pids_cancel(level, num);

                return -EAGAIN;
        }

        return 0;
}

/*
 * Move the pid charge of every task in @tset from its current pids cgroup
 * to the destination one.  The destination is charged without checking its
 * limit, so this always returns 0.
 */
static int pids_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *dst_css;
        struct task_struct *task;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *to = css_pids(dst_css);
                struct pids_cgroup *from;

                /*
                 * No need to pin the old css between here and
                 * cancel_attach() because cgroup core protects it from
                 * being freed before the migration completes or fails.
                 */
                from = css_pids(task_css(task, pids_cgrp_id));

                pids_charge(to, 1);
                pids_uncharge(from, 1);
        }

        return 0;
}

/*
 * Undo pids_can_attach(): give the charge of every task in @tset back to
 * the cgroup it still belongs to and remove it from the destination.
 */
static void pids_cancel_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *dst_css;
        struct task_struct *task;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *to = css_pids(dst_css);
                struct pids_cgroup *from;

                from = css_pids(task_css(task, pids_cgrp_id));

                pids_charge(from, 1);
                pids_uncharge(to, 1);
        }
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by the copy_process().
 */
/*
 * Charge one pid for a fork.  The charge goes to the pids cgroup of @cset
 * when one is given, otherwise to that of the current task.
 *
 * Returns 0 on success.  If the charge would exceed a limit the error from
 * pids_try_charge() is returned, events_limit is incremented and the
 * events file is notified; the first such failure is also logged.
 */
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;
        int err;

        css = cset ? cset->subsys[pids_cgrp_id]
                   : task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);

        err = pids_try_charge(pids, 1);
        if (!err)
                return 0;

        /* Only log the first time events_limit is incremented. */
        if (atomic64_inc_return(&pids->events_limit) == 1) {
                pr_info("cgroup: fork rejected by pids controller in ");
                pr_cont_cgroup_path(css->cgroup);
                pr_cont("\n");
        }
        cgroup_file_notify(&pids->events_file);

        return err;
}

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;

        if (cset)
                css = cset->subsys[pids_cgrp_id];
        else
                css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        pids_uncharge(pids, 1);
}

static void pids_release(struct task_struct *task)
{
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

        pids_uncharge(pids, 1);
}

/*
 * Write handler for pids.max.
 *
 * Accepts either the string "max" (no limit) or an integer in the range
 * [0, PIDS_MAX).  Surrounding whitespace is stripped first.
 *
 * Returns @nbytes on success, the kstrtoll() error for unparsable input,
 * or -EINVAL for an out-of-range number.
 */
static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct pids_cgroup *pids = css_pids(of_css(of));
        int64_t limit;
        int err;

        buf = strstrip(buf);
        if (strcmp(buf, PIDS_MAX_STR) == 0) {
                limit = PIDS_MAX;
        } else {
                err = kstrtoll(buf, 0, &limit);
                if (err)
                        return err;

                if (limit < 0 || limit >= PIDS_MAX)
                        return -EINVAL;
        }

        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        atomic64_set(&pids->limit, limit);

        return nbytes;
}

/*
 * Show handler for pids.max: prints the numeric limit, or "max" when the
 * limit is %PIDS_MAX or above.  Always returns 0.
 */
static int pids_max_show(struct seq_file *sf, void *v)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));
        int64_t limit = atomic64_read(&pids->limit);

        if (limit < PIDS_MAX)
                seq_printf(sf, "%lld\n", limit);
        else
                seq_printf(sf, "%s\n", PIDS_MAX_STR);

        return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return atomic64_read(&pids->counter);
}

static int pids_events_show(struct seq_file *sf, void *v)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));

        seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
        return 0;
}

/*
 * Control files exposed by the pids controller.  All of them are flagged
 * CFTYPE_NOT_ON_ROOT.
 */
static struct cftype pids_files[] = {
        {
                /* pids.max: read/write limit, "max" meaning unlimited */
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* pids.current: read-only count of charged pids */
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /*
                 * pids.events: read-only event counters; file_offset points
                 * at the cgroup_file handle used for notifications
                 */
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }        /* terminate */
};

/*
 * Subsystem descriptor wiring the callbacks above into the cgroup core.
 * The same file table is used for both the legacy and the default
 * hierarchy.
 */
struct cgroup_subsys pids_cgrp_subsys = {
        .css_alloc        = pids_css_alloc,
        .css_free        = pids_css_free,
        .can_attach         = pids_can_attach,
        .cancel_attach         = pids_cancel_attach,
        .can_fork        = pids_can_fork,
        .cancel_fork        = pids_cancel_fork,
        .release        = pids_release,
        .legacy_cftypes        = pids_files,
        .dfl_cftypes        = pids_files,
        .threaded        = true,
};














































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP router.
 *
 * Version:        @(#)route.h        1.0.4        05/27/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 * Fixes:
 *                Alan Cox        :        Reformatted. Added ip_rt_local()
 *                Alan Cox        :        Support for TCP parameters.
 *                Alexey Kuznetsov:        Major changes for new routing code.
 *                Mike McLagan    :        Routing by source
 *                Robert Olsson   :        Added rt_cache statistics
 */
#ifndef _ROUTE_H
#define _ROUTE_H

#include <net/dst.h>
#include <net/inetpeer.h>
#include <net/flow.h>
#include <net/inet_sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/route.h>
#include <linux/ip.h>
#include <linux/cache.h>
#include <linux/security.h>

/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU        0xFFFFU

#define RTO_ONLINK        0x01

#define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
#define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))

struct fib_nh;
struct fib_info;
struct uncached_list;
/*
 * IPv4 route entry.
 *
 * The embedded dst_entry must stay the first member: ip_rt_put() asserts
 * offsetof(struct rtable, dst) == 0 at build time.
 */
struct rtable {
        struct dst_entry        dst;

        int                        rt_genid;
        unsigned int                rt_flags;
        __u16                        rt_type;
        /* Non-zero for input routes; see rt_is_input_route(). */
        __u8                        rt_is_input;
        __u8                        rt_uses_gateway;

        /* Preferred over skb->skb_iif by inet_iif() when non-zero. */
        int                        rt_iif;

        /* AF_INET selects rt_gw4, AF_INET6 selects rt_gw6 (see ip_neigh_for_gw()). */
        u8                        rt_gw_family;
        /* Info on neighbour */
        union {
                __be32                rt_gw4;
                struct in6_addr        rt_gw6;
        };

        /* Miscellaneous cached information */
        u32                        rt_mtu_locked:1,
                                rt_pmtu:31;

        /* NOTE(review): presumably linkage used by rt_add/del_uncached_list() - confirm. */
        struct list_head        rt_uncached;
        struct uncached_list        *rt_uncached_list;
};

/* True when @rt has its rt_is_input marker set. */
static inline bool rt_is_input_route(const struct rtable *rt)
{
        return !!rt->rt_is_input;
}

/* True when @rt does not have its rt_is_input marker set. */
static inline bool rt_is_output_route(const struct rtable *rt)
{
        return !rt->rt_is_input;
}

/*
 * Pick the IPv4 next hop for @rt: the stored IPv4 gateway when the route
 * has one (rt_gw_family == AF_INET), otherwise the destination @daddr.
 */
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
        return rt->rt_gw_family == AF_INET ? rt->rt_gw4 : daddr;
}

/*
 * Routing accounting record; an instance is exposed per CPU through the
 * ip_rt_acct pointer declared below.
 * NOTE(review): the o_ and i_ prefixes presumably mean output/input - confirm against users.
 */
struct ip_rt_acct {
        __u32         o_bytes;
        __u32         o_packets;
        __u32         i_bytes;
        __u32         i_packets;
};

/*
 * Route lookup statistics counters.
 * NOTE(review): the in_ and out_ prefixes presumably separate input-path and
 * output-path events; the updaters are not visible here - confirm.
 */
struct rt_cache_stat {
        unsigned int in_slow_tot;
        unsigned int in_slow_mc;
        unsigned int in_no_route;
        unsigned int in_brd;
        unsigned int in_martian_dst;
        unsigned int in_martian_src;
        unsigned int out_slow_tot;
        unsigned int out_slow_mc;
};

extern struct ip_rt_acct __percpu *ip_rt_acct;

struct in_device;

int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
                                        const struct sk_buff *skb);
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp,
                                            struct fib_result *res,
                                            const struct sk_buff *skb);

/* Output route lookup for @flp with no packet attached (skb argument is NULL). */
static inline struct rtable *__ip_route_output_key(struct net *net,
                                                   struct flowi4 *flp)
{
        const struct sk_buff *no_skb = NULL;

        return ip_route_output_key_hash(net, flp, no_skb);
}

struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
                                    const struct sock *sk);
struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct net *net, __be32 *saddr,
                                      const struct ip_tunnel_info *info,
                                      u8 protocol, bool use_cache);

struct dst_entry *ipv4_blackhole_route(struct net *net,
                                       struct dst_entry *dst_orig);

/* Flow-based output route lookup for @flp with no socket attached (sk is NULL). */
static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
{
        const struct sock *no_sk = NULL;

        return ip_route_output_flow(net, flp, no_sk);
}

/*
 * Convenience lookup: build a flow key on the stack from the given
 * addresses, TOS and output interface (all other key fields are left
 * zero-initialized) and resolve it without a socket.
 */
static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
                                             __be32 saddr, u8 tos, int oif)
{
        struct flowi4 key = {
                .daddr = daddr,
                .saddr = saddr,
                .flowi4_tos = tos,
                .flowi4_oif = oif,
        };

        /* Same call ip_route_output_key() makes: flow lookup with sk == NULL. */
        return ip_route_output_flow(net, &key, NULL);
}

/*
 * Initialize @fl4 from the given addresses, ports, protocol, TOS and
 * output interface, then look up the output route.
 *
 * @sk may be NULL; in that case the mark and flow flags are zero and the
 * security classification step is skipped.
 */
static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
                                                   struct sock *sk,
                                                   __be32 daddr, __be32 saddr,
                                                   __be16 dport, __be16 sport,
                                                   __u8 proto, __u8 tos, int oif)
{
        __u32 mark = 0;
        __u8 flow_flags = 0;

        if (sk) {
                mark = sk->sk_mark;
                flow_flags = inet_sk_flowi_flags(sk);
        }

        flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, proto,
                           flow_flags, daddr, saddr, dport, sport,
                           sock_net_uid(net, sk));

        if (sk)
                security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));

        return ip_route_output_flow(net, fl4, sk);
}

/*
 * Reset @fl4 to zero, fill it in as a GRE flow (addresses, TOS, output
 * interface and GRE key) and look up the output route without a socket.
 */
static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
                                                 __be32 daddr, __be32 saddr,
                                                 __be32 gre_key, __u8 tos, int oif)
{
        memset(fl4, 0, sizeof(*fl4));

        fl4->flowi4_proto = IPPROTO_GRE;
        fl4->fl4_gre_key = gre_key;
        fl4->flowi4_tos = tos;
        fl4->flowi4_oif = oif;
        fl4->saddr = saddr;
        fl4->daddr = daddr;

        /* Same call ip_route_output_key() makes: flow lookup with sk == NULL. */
        return ip_route_output_flow(net, fl4, NULL);
}
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                          u8 tos, struct net_device *dev,
                          struct in_device *in_dev, u32 *itag);
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
                         u8 tos, struct net_device *devin);
int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
                       u8 tos, struct net_device *devin,
                       struct fib_result *res);

int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
                      u8 tos, struct net_device *devin,
                      const struct sk_buff *hint);

/*
 * Input route lookup for @skb, performed inside an RCU read-side section.
 *
 * On a successful lookup the dst is forced onto the skb via
 * skb_dst_force(); if the skb has no dst afterwards, -EINVAL is returned.
 * Otherwise the result of ip_route_input_noref() is returned unchanged.
 */
static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
                                 u8 tos, struct net_device *devin)
{
        int rc;

        rcu_read_lock();

        rc = ip_route_input_noref(skb, dst, src, tos, devin);
        if (rc)
                goto out_unlock;

        skb_dst_force(skb);
        if (!skb_dst(skb))
                rc = -EINVAL;

out_unlock:
        rcu_read_unlock();
        return rc;
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif,
                      u8 protocol);
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
void ip_rt_send_redirect(struct sk_buff *skb);

unsigned int inet_addr_type(struct net *net, __be32 addr);
unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
                                __be32 addr);
unsigned int inet_addr_type_dev_table(struct net *net,
                                      const struct net_device *dev,
                                      __be32 addr);
void ip_rt_multicast_event(struct in_device *);
int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt);
void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
struct rtable *rt_dst_alloc(struct net_device *dev,
                             unsigned int flags, u16 type,
                             bool nopolicy, bool noxfrm);
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt);

struct in_ifaddr;
void fib_add_ifaddr(struct in_ifaddr *);
void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);

void rt_add_uncached_list(struct rtable *rt);
void rt_del_uncached_list(struct rtable *rt);

int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
                       u32 table_id, struct fib_info *fi,
                       int *fa_index, int fa_start, unsigned int flags);

/* Drop a reference on the dst embedded in @rt. */
static inline void ip_rt_put(struct rtable *rt)
{
        struct dst_entry *dst;

        /*
         * dst_release() tolerates NULL, and because dst sits at offset 0
         * of struct rtable (enforced here at build time) the address
         * computed below is the same pointer value as @rt.
         */
        BUILD_BUG_ON(offsetof(struct rtable, dst) != 0);

        dst = &rt->dst;
        dst_release(dst);
}

#define IPTOS_RT_MASK        (IPTOS_TOS_MASK & ~3)

extern const __u8 ip_tos2prio[16];

/* Map an IP TOS byte to a priority via the 16-entry ip_tos2prio[] table. */
static inline char rt_tos2priority(u8 tos)
{
        unsigned int slot = IPTOS_TOS(tos) >> 1;

        return ip_tos2prio[slot];
}

/* ip_route_connect() and ip_route_newports() work in tandem whilst
 * binding a socket for a new outgoing connection.
 *
 * In order to use IPSEC properly, we must, in the end, have a
 * route that was looked up using all available keys including source
 * and destination ports.
 *
 * However, if a source port needs to be allocated (the user specified
 * a wildcard source port) we need to obtain addressing information
 * in order to perform that allocation.
 *
 * So ip_route_connect() looks up a route using wildcarded source and
 * destination ports in the key, simply so that we can get a pair of
 * addresses to use for port allocation.
 *
 * Later, once the ports are allocated, ip_route_newports() will make
 * another route lookup if needed to make sure we catch any IPSEC
 * rules keyed on the port information.
 *
 * The callers allocate the flow key on their stack, and must pass in
 * the same flowi4 object to both the ip_route_connect() and the
 * ip_route_newports() calls.
 */

/*
 * Fill in @fl4 for an outgoing connection on @sk.  Transparent sockets
 * get FLOWI_FLAG_ANYSRC set in the flow flags; mark and uid are taken
 * from the socket.
 */
static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
                                         u32 tos, int oif, u8 protocol,
                                         __be16 sport, __be16 dport,
                                         struct sock *sk)
{
        __u8 flow_flags = inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;

        flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
                           protocol, flow_flags, dst, src, dport, sport,
                           sk->sk_uid);
}

/*
 * First-stage route lookup for a connecting socket.
 *
 * When either address is still unset, a preliminary lookup is done and
 * its route released immediately; the flow key is then refreshed from
 * the addresses now held in @fl4 before the final lookup.  An error from
 * the preliminary lookup is returned as-is.
 */
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
                                              __be32 dst, __be32 src, u32 tos,
                                              int oif, u8 protocol,
                                              __be16 sport, __be16 dport,
                                              struct sock *sk)
{
        struct net *net = sock_net(sk);

        ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
                              sport, dport, sk);

        if (!(dst && src)) {
                struct rtable *probe = __ip_route_output_key(net, fl4);

                if (IS_ERR(probe))
                        return probe;

                ip_rt_put(probe);
                flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr);
        }

        security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));

        return ip_route_output_flow(net, fl4, sk);
}

/*
 * Second-stage lookup once ports are known.  If neither port changed the
 * existing route @rt is handed back untouched; otherwise @rt is released,
 * the ports in @fl4 are updated and a fresh flow lookup is returned.
 */
static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
                                               __be16 orig_sport, __be16 orig_dport,
                                               __be16 sport, __be16 dport,
                                               struct sock *sk)
{
        if (sport == orig_sport && dport == orig_dport)
                return rt;

        fl4->fl4_dport = dport;
        fl4->fl4_sport = sport;
        ip_rt_put(rt);

        flowi4_update_output(fl4, sk->sk_bound_dev_if,
                             RT_CONN_FLAGS(sk), fl4->daddr,
                             fl4->saddr);
        security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));

        return ip_route_output_flow(sock_net(sk), fl4, sk);
}

static inline int inet_iif(const struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);

        if (rt && rt->rt_iif)
                return rt->rt_iif;

        return skb->skb_iif;
}

/*
 * Hop limit for @dst: the RTAX_HOPLIMIT metric when it is non-zero,
 * otherwise the owning namespace's sysctl_ip_default_ttl.
 */
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
        struct net *net = dev_net(dst->dev);
        int metric = dst_metric_raw(dst, RTAX_HOPLIMIT);

        return metric ? metric : net->ipv4.sysctl_ip_default_ttl;
}

/*
 * Find the ARP neighbour entry for @daddr on @dev, creating one through
 * __neigh_create() when the no-reference lookup comes back empty.
 */
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
                                             __be32 daddr)
{
        struct neighbour *found;

        found = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr);
        if (likely(found))
                return found;

        return __neigh_create(&arp_tbl, &daddr, dev, false);
}

/*
 * Resolve the neighbour to use for @rt.
 *
 * An IPv4 gateway is looked up via ip_neigh_gw4(); an IPv6 gateway via
 * ip_neigh_gw6(), in which case *@is_v6gw is set to true.  With no
 * gateway family set, the packet's own IPv4 destination is used.
 * *@is_v6gw is left untouched on the non-IPv6 paths.
 */
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
                                                struct sk_buff *skb,
                                                bool *is_v6gw)
{
        struct net_device *dev = rt->dst.dev;

        if (likely(rt->rt_gw_family == AF_INET))
                return ip_neigh_gw4(dev, rt->rt_gw4);

        if (rt->rt_gw_family == AF_INET6) {
                struct neighbour *n6 = ip_neigh_gw6(dev, &rt->rt_gw6);

                *is_v6gw = true;
                return n6;
        }

        return ip_neigh_gw4(dev, ip_hdr(skb)->daddr);
}

#endif        /* _ROUTE_H */







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FIRMWARE_LOADER_H
#define __FIRMWARE_LOADER_H

#include <linux/bitops.h>
#include <linux/firmware.h>
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/completion.h>

#include <generated/utsrelease.h>

/**
 * enum fw_opt - options to control firmware loading behaviour
 *
 * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent
 *        when the firmware is not found. Userspace is in charge of loading the
 *        firmware using the sysfs loading facility.
 * @FW_OPT_NOWAIT: Used to describe that the firmware request is asynchronous.
 * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct
 *        filesystem lookup fails at finding the firmware.  For details refer to
 *        firmware_fallback_sysfs().
 * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages.
 * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to
 *        cache the firmware upon suspend, so that upon resume races against the
 *        firmware file lookup on storage are avoided. Used for calls where the
 *        file may be too big, or where the driver takes charge of its own
 *        firmware caching mechanism.
 * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes
 *        precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER.
 * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in
 *        the platform's main firmware. If both this fallback and the sysfs
 *        fallback are enabled, then this fallback will be tried first.
 * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read
 *        entire file.
 *
 * Each option is a distinct single bit, so options can be OR-ed together.
 */
enum fw_opt {
        FW_OPT_UEVENT                        = BIT(0),
        FW_OPT_NOWAIT                        = BIT(1),
        FW_OPT_USERHELPER                = BIT(2),
        FW_OPT_NO_WARN                        = BIT(3),
        FW_OPT_NOCACHE                        = BIT(4),
        FW_OPT_NOFALLBACK_SYSFS                = BIT(5),
        FW_OPT_FALLBACK_PLATFORM        = BIT(6),
        FW_OPT_PARTIAL                        = BIT(7),
};

/*
 * Loading state stored in struct fw_state.  FW_STATUS_UNKNOWN is the
 * zero value.  Setting FW_STATUS_DONE or FW_STATUS_ABORTED through
 * __fw_state_set() also signals the state's completion.
 */
enum fw_status {
        FW_STATUS_UNKNOWN,
        FW_STATUS_LOADING,
        FW_STATUS_DONE,
        FW_STATUS_ABORTED,
};

/*
 * Concurrent request_firmware() calls for the same firmware need to be
 * serialized.  struct fw_state is a simple state machine which holds the
 * state of the firmware loading.
 *
 * @completion is signalled by __fw_state_set() once @status becomes
 * FW_STATUS_DONE or FW_STATUS_ABORTED.
 */
struct fw_state {
        struct completion completion;
        enum fw_status status;
};

/*
 * Per-firmware private bookkeeping.
 * NOTE(review): most field semantics are defined by users outside this
 * header; the comments below only state what is visible here.
 */
struct fw_priv {
        struct kref ref;
        struct list_head list;
        struct firmware_cache *fwc;
        /* Loading state machine; driven via the fw_state_*() helpers. */
        struct fw_state fw_st;
        void *data;
        size_t size;
        size_t allocated_size;
        size_t offset;
        /* NOTE(review): presumably a mask of enum fw_opt bits - confirm. */
        u32 opt_flags;
#ifdef CONFIG_FW_LOADER_PAGED_BUF
        bool is_paged_buf;
        struct page **pages;
        int nr_pages;
        int page_array_size;
#endif
#ifdef CONFIG_FW_LOADER_USER_HELPER
        bool need_uevent;
        /* Unlinked by __fw_state_set() on every done/abort transition. */
        struct list_head pending_list;
#endif
        const char *fw_name;
};

extern struct mutex fw_lock;

/* Report whether @fw_priv's loading state currently equals @status. */
static inline bool __fw_state_check(struct fw_priv *fw_priv,
                                    enum fw_status status)
{
        return fw_priv->fw_st.status == status;
}

/*
 * Wait (killable, bounded by @timeout) for the loading state to complete.
 *
 * Returns -ETIMEDOUT when the wait returns 0, -ENOENT when the wait
 * returned non-zero and the state is FW_STATUS_ABORTED, the negative
 * wait result when the wait itself failed, and 0 otherwise.
 */
static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout)
{
        struct fw_state *state = &fw_priv->fw_st;
        long remaining;

        remaining = wait_for_completion_killable_timeout(&state->completion,
                                                         timeout);
        if (!remaining)
                return -ETIMEDOUT;

        /* Abort takes precedence over both success and a failed wait. */
        if (state->status == FW_STATUS_ABORTED)
                return -ENOENT;

        if (remaining < 0)
                return remaining;

        return 0;
}

/*
 * Record @status as the new loading state.  The two terminal states
 * (FW_STATUS_DONE, FW_STATUS_ABORTED) additionally wake every waiter on
 * the completion.
 */
static inline void __fw_state_set(struct fw_priv *fw_priv,
                                  enum fw_status status)
{
        struct fw_state *state = &fw_priv->fw_st;

        WRITE_ONCE(state->status, status);

        if (status != FW_STATUS_DONE && status != FW_STATUS_ABORTED)
                return;

#ifdef CONFIG_FW_LOADER_USER_HELPER
        /*
         * Unlinking here guarantees the fw_priv leaves the pending list
         * on every done/abort path.
         */
        list_del_init(&fw_priv->pending_list);
#endif
        complete_all(&state->completion);
}

static inline void fw_state_aborted(struct fw_priv *fw_priv)
{
        __fw_state_set(fw_priv, FW_STATUS_ABORTED);
}

static inline bool fw_state_is_aborted(struct fw_priv *fw_priv)
{
        return __fw_state_check(fw_priv, FW_STATUS_ABORTED);
}

static inline void fw_state_start(struct fw_priv *fw_priv)
{
        __fw_state_set(fw_priv, FW_STATUS_LOADING);
}

static inline void fw_state_done(struct fw_priv *fw_priv)
{
        __fw_state_set(fw_priv, FW_STATUS_DONE);
}

int assign_fw(struct firmware *fw, struct device *device);

/*
 * Paged-buffer helpers.  With CONFIG_FW_LOADER_PAGED_BUF they are real
 * functions defined elsewhere; without it they collapse to inline stubs:
 * free is a no-op, grow/map fail with -ENXIO, and fw_is_paged_buf()
 * always reports false.
 */
#ifdef CONFIG_FW_LOADER_PAGED_BUF
void fw_free_paged_buf(struct fw_priv *fw_priv);
int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed);
int fw_map_paged_buf(struct fw_priv *fw_priv);
bool fw_is_paged_buf(struct fw_priv *fw_priv);
#else
static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {}
static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; }
static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; }
static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; }
#endif

#endif /* __FIRMWARE_LOADER_H */














































    1 
















    1 



    1 
















































    1 

    1 
    1 
    1 






    1 




    1 


    1 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/slab.h>

#define SG_MEMPOOL_NR                ARRAY_SIZE(sg_pools)
#define SG_MEMPOOL_SIZE                2

/* One scatterlist pool: a slab cache plus the mempool layered on top of it. */
struct sg_pool {
        /* Number of struct scatterlist entries per pool element. */
        size_t                size;
        /* Slab cache name, also used in the init error messages. */
        char                *name;
        /* Backing cache created by sg_pool_init(). */
        struct kmem_cache        *slab;
        /* Mempool created on top of @slab by sg_pool_init(). */
        mempool_t        *pool;
};

/*
 * SP(x) builds one sg_pools[] entry: .size is x and the following
 * positional initializer lands in the next member, .name, as
 * "sgpool-" followed by the stringified x.
 */
#define SP(x) { .size = x, "sgpool-" __stringify(x) }
#if (SG_CHUNK_SIZE < 32)
#error SG_CHUNK_SIZE is too small (must be 32 or greater)
#endif
/*
 * Pool table in increasing size order: 8, 16, then each power of two
 * strictly below SG_CHUNK_SIZE, and finally SG_CHUNK_SIZE itself.
 * sg_pool_index() relies on this ordering.
 */
static struct sg_pool sg_pools[] = {
        SP(8),
        SP(16),
#if (SG_CHUNK_SIZE > 32)
        SP(32),
#if (SG_CHUNK_SIZE > 64)
        SP(64),
#if (SG_CHUNK_SIZE > 128)
        SP(128),
#if (SG_CHUNK_SIZE > 256)
#error SG_CHUNK_SIZE is too large (256 MAX)
#endif
#endif
#endif
#endif
        SP(SG_CHUNK_SIZE)
};
#undef SP

/*
 * Translate an entry count into an sg_pools[] index.  Counts up to 8 use
 * slot 0; larger counts use the order of the count minus 3.  Asking for
 * more than SG_CHUNK_SIZE entries is a BUG.
 */
static inline unsigned int sg_pool_index(unsigned short nents)
{
        BUG_ON(nents > SG_CHUNK_SIZE);

        return nents <= 8 ? 0 : get_count_order(nents) - 3;
}

static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
{
        struct sg_pool *sgp;

        sgp = sg_pools + sg_pool_index(nents);
        mempool_free(sgl, sgp->pool);
}

/* Allocate from the mempool that serves lists of @nents entries. */
static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
{
        return mempool_alloc(sg_pools[sg_pool_index(nents)].pool, gfp_mask);
}

/**
 * sg_free_table_chained - Free a previously mapped sg table
 * @table:        The sg table header to use
 * @nents_first_chunk: size of the first_chunk SGL passed to
 *                sg_alloc_table_chained
 *
 *  Description:
 *    Release an sg table that was allocated and set up with
 *    sg_alloc_table_chained().  Nothing is freed when the table fits
 *    entirely within the first chunk.
 *
 *    @nents_first_chunk must be the same value that was passed to
 *    sg_alloc_table_chained().
 *
 **/
void sg_free_table_chained(struct sg_table *table,
                unsigned nents_first_chunk)
{
        unsigned first = nents_first_chunk;

        if (table->orig_nents <= first)
                return;

        /* A first chunk of exactly one entry is treated as no first chunk. */
        __sg_free_table(table, SG_CHUNK_SIZE, first == 1 ? 0 : first,
                        sg_pool_free);
}
EXPORT_SYMBOL_GPL(sg_free_table_chained);

/**
 * sg_alloc_table_chained - Allocate and chain SGLs in an sg table
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @first_chunk: first SGL
 * @nents_first_chunk: number of the SGL of @first_chunk
 *
 *  Description:
 *    Allocate and chain SGLs in an sg table. If @nents is larger than
 *    @nents_first_chunk a chained sg table will be set up. @first_chunk is
 *    ignored if @nents_first_chunk <= 1 because the user expects the SGL
 *    to point to a non-chain SGL.
 *
 *    Returns 0 on success or the error from __sg_alloc_table(); on
 *    error anything partially allocated is freed again.
 *
 **/
int sg_alloc_table_chained(struct sg_table *table, int nents,
                struct scatterlist *first_chunk, unsigned nents_first_chunk)
{
        int rc;

        BUG_ON(!nents);

        /* Everything fits in the caller-provided first chunk. */
        if (first_chunk && nents_first_chunk && nents <= nents_first_chunk) {
                table->nents = table->orig_nents = nents;
                sg_init_table(table->sgl, nents);
                return 0;
        }

        /* User supposes that the 1st SGL includes real entry */
        if (nents_first_chunk <= 1) {
                first_chunk = NULL;
                nents_first_chunk = 0;
        }

        rc = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
                              first_chunk, nents_first_chunk,
                              GFP_ATOMIC, sg_pool_alloc);
        if (unlikely(rc))
                sg_free_table_chained(table, nents_first_chunk);

        return rc;
}
EXPORT_SYMBOL_GPL(sg_alloc_table_chained);

static __init int sg_pool_init(void)
{
        int i;

        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                struct sg_pool *sgp = sg_pools + i;
                int size = sgp->size * sizeof(struct scatterlist);

                sgp->slab = kmem_cache_create(sgp->name, size, 0,
                                SLAB_HWCACHE_ALIGN, NULL);
                if (!sgp->slab) {
                        printk(KERN_ERR "SG_POOL: can't init sg slab %s\n",
                                        sgp->name);
                        goto cleanup_sdb;
                }

                sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
                                                     sgp->slab);
                if (!sgp->pool) {
                        printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n",
                                        sgp->name);
                        goto cleanup_sdb;
                }
        }

        return 0;

cleanup_sdb:
        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                struct sg_pool *sgp = sg_pools + i;

                mempool_destroy(sgp->pool);
                kmem_cache_destroy(sgp->slab);
        }

        return -ENOMEM;
}

static __exit void sg_pool_exit(void)
{
        int i;

        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                struct sg_pool *sgp = sg_pools + i;
                mempool_destroy(sgp->pool);
                kmem_cache_destroy(sgp->slab);
        }
}

module_init(sg_pool_init);
module_exit(sg_pool_exit);




















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
// SPDX-License-Identifier: GPL-2.0
/*
 * Helper routines for building identity mapping page tables. This is
 * included by both the compressed kernel and the regular kernel.
 */

/*
 * Populate the PMD entries of @pmd_page covering [@addr, @end), starting
 * from @addr rounded down to a PMD boundary.  Entries that are already
 * present are left alone; new entries map (address - info->offset) with
 * info->page_flag.
 */
static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
                           unsigned long addr, unsigned long end)
{
        unsigned long va;

        for (va = addr & PMD_MASK; va < end; va += PMD_SIZE) {
                pmd_t *entry = &pmd_page[pmd_index(va)];

                if (!pmd_present(*entry))
                        set_pmd(entry, __pmd((va - info->offset) | info->page_flag));
        }
}

/*
 * Populate the PUD entries of @pud_page covering [@addr, @end).
 *
 * With info->direct_gbpages set, each missing PUD is written directly as
 * a mapping of the PUD-aligned address.  Otherwise the range is handed
 * down to ident_pmd_init(), allocating a PMD page through
 * info->alloc_pgt_page() when the PUD is not yet present.
 *
 * Returns 0 on success or -ENOMEM when a page-table page cannot be
 * allocated.
 */
static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
                          unsigned long addr, unsigned long end)
{
        unsigned long next;

        for (; addr < end; addr = next) {
                pud_t *pud = &pud_page[pud_index(addr)];
                pmd_t *pmd;

                /* Clamp this step to the end of the current PUD or the range. */
                next = (addr & PUD_MASK) + PUD_SIZE;
                if (next > end)
                        next = end;

                if (info->direct_gbpages) {
                        if (!pud_present(*pud)) {
                                unsigned long base = addr & PUD_MASK;

                                set_pud(pud, __pud((base - info->offset) |
                                                   info->page_flag));
                        }
                } else if (pud_present(*pud)) {
                        ident_pmd_init(info, pmd_offset(pud, 0), addr, next);
                } else {
                        pmd = (pmd_t *)info->alloc_pgt_page(info->context);
                        if (!pmd)
                                return -ENOMEM;

                        /* Fill the new PMD page before linking it in. */
                        ident_pmd_init(info, pmd, addr, next);
                        set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
                }
        }

        return 0;
}

/*
 * Populate the P4D entries of @p4d_page covering [@addr, @end), handing
 * each sub-range to ident_pud_init().  A PUD page is allocated through
 * info->alloc_pgt_page() for entries that are not yet present and is
 * linked in only after it has been filled.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * ident_pud_init().
 */
static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
                          unsigned long addr, unsigned long end)
{
        unsigned long next;
        int rc;

        for (; addr < end; addr = next) {
                p4d_t *p4d = &p4d_page[p4d_index(addr)];
                pud_t *pud;

                /* Clamp this step to the end of the current P4D or the range. */
                next = (addr & P4D_MASK) + P4D_SIZE;
                if (next > end)
                        next = end;

                if (p4d_present(*p4d)) {
                        rc = ident_pud_init(info, pud_offset(p4d, 0), addr, next);
                        if (rc)
                                return rc;
                } else {
                        pud = (pud_t *)info->alloc_pgt_page(info->context);
                        if (!pud)
                                return -ENOMEM;

                        rc = ident_pud_init(info, pud, addr, next);
                        if (rc)
                                return rc;

                        set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
                }
        }

        return 0;
}

/*
 * Build page-table entries in @pgd_page for the range
 * [pstart + info->offset, pend + info->offset).
 *
 * info->kernpg_flag defaults to _KERNPG_TABLE when the caller left it zero
 * and is always masked with __default_kernel_pte_mask.  Lower levels are
 * filled in by ident_p4d_init(); new tables come from info->alloc_pgt_page().
 *
 * Returns 0 on success, -ENOMEM if the allocator callback returns NULL, or
 * the non-zero value reported by ident_p4d_init().
 */
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
                              unsigned long pstart, unsigned long pend)
{
        unsigned long cur = pstart + info->offset;
        const unsigned long limit = pend + info->offset;

        /* Set the default pagetable flags if not supplied */
        if (!info->kernpg_flag)
                info->kernpg_flag = _KERNPG_TABLE;

        /* Filter out unsupported __PAGE_KERNEL_* bits: */
        info->kernpg_flag &= __default_kernel_pte_mask;

        while (cur < limit) {
                pgd_t *slot = pgd_page + pgd_index(cur);
                unsigned long stop = (cur & PGDIR_MASK) + PGDIR_SIZE;
                const int was_present = pgd_present(*slot);
                p4d_t *p4d;
                int err;

                /* Clamp the slice to the end of the requested range. */
                if (stop > limit)
                        stop = limit;

                if (was_present) {
                        p4d = p4d_offset(slot, 0);
                } else {
                        p4d = (p4d_t *)info->alloc_pgt_page(info->context);
                        if (!p4d)
                                return -ENOMEM;
                }

                err = ident_p4d_init(info, p4d, cur, stop);
                if (err)
                        return err;

                if (!was_present) {
                        if (pgtable_l5_enabled()) {
                                set_pgd(slot, __pgd(__pa(p4d) | info->kernpg_flag));
                        } else {
                                /*
                                 * With p4d folded, pgd is equal to p4d.
                                 * The pgd entry has to point to the pud page
                                 * table in this case.
                                 */
                                pud_t *pud = pud_offset(p4d, 0);

                                set_pgd(slot, __pgd(__pa(pud) | info->kernpg_flag));
                        }
                }

                cur = stop;
        }

        return 0;
}

























































































    1 










    1 
    1 




    1 





    1 
    1 







    1 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * Basic idea behind the notification queue: An fsnotify group (like inotify)
 * sends the userspace notification about events asynchronously some time after
 * the event happened.  When inotify gets an event it will need to add that
 * event to the group notify queue.  Since a single event might need to be on
 * multiple group's notification queues we can't add the event directly to each
 * queue and instead add a small "event_holder" to each queue.  This event_holder
 * has a pointer back to the original event.  Since the majority of events are
 * going to end up on one, and only one, notification queue we embed one
 * event_holder into each event.  This means we have a single allocation instead
 * of always needing two.  If the embedded event_holder is already in use by
 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
 * allocated and used.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);

/**
 * fsnotify_get_cookie - hand out the next event-synchronization cookie.
 *
 * Atomically increments the file-wide fsnotify_sync_cookie counter and
 * returns the incremented value.
 * Called from fsnotify_move, which is inlined into filesystem modules.
 */
u32 fsnotify_get_cookie(void)
{
        const u32 cookie = atomic_inc_return(&fsnotify_sync_cookie);

        return cookie;
}
EXPORT_SYMBOL_GPL(fsnotify_get_cookie);

/*
 * Release @event through the group's ->free_event() operation.
 *
 * A NULL @event and the group's own overflow event are ignored.  An event
 * that still appears to be on a list triggers a WARN_ON after re-checking
 * under the notification lock.
 */
void fsnotify_destroy_event(struct fsnotify_group *group,
                            struct fsnotify_event *event)
{
        if (!event)
                return;

        /* Overflow events are per-group and we don't want to free them */
        if (event == group->overflow_event)
                return;

        /*
         * The event should no longer be queued.  The first check is lockless
         * and therefore unreliable; it only exists to keep the common case
         * lock-free.  Taking the lock may be needed for permission events
         * that were unlinked by a different CPU than the one freeing them.
         */
        if (!list_empty(&event->list)) {
                int still_queued;

                spin_lock(&group->notification_lock);
                still_queued = !list_empty(&event->list);
                WARN_ON(still_queued);
                spin_unlock(&group->notification_lock);
        }

        group->ops->free_event(group, event);
}

/*
 * Queue @event on @group's notification list.
 *
 * The group can later pull this event off the queue to deal with.
 * @merge, when supplied and the queue is non-empty, may fold the event into
 * one that is already queued; @insert, when supplied, is called after the
 * event has been linked (e.g. to add it to a hash table).
 *
 * Return value:
 * 0 - the event was added to the queue
 * 1 - the event was merged with some other queued event (any non-zero value
 *     returned by @merge is passed through unchanged)
 * 2 - the event itself was not queued: the group is shutting down, or the
 *     queue overflowed (the group's overflow event is queued instead unless
 *     it is already pending)
 */
int fsnotify_insert_event(struct fsnotify_group *group,
                          struct fsnotify_event *event,
                          int (*merge)(struct fsnotify_group *,
                                       struct fsnotify_event *),
                          void (*insert)(struct fsnotify_group *,
                                         struct fsnotify_event *))
{
        struct list_head *queue = &group->notification_list;
        int status = 0;

        pr_debug("%s: group=%p event=%p\n", __func__, group, event);

        spin_lock(&group->notification_lock);

        if (group->shutdown) {
                status = 2;
                goto out_unlock;
        }

        if (event == group->overflow_event ||
            group->q_len >= group->max_events) {
                status = 2;
                /* Queue overflow event only if it isn't already queued */
                if (!list_empty(&group->overflow_event->list))
                        goto out_unlock;
                event = group->overflow_event;
        } else if (merge && !list_empty(queue)) {
                status = merge(group, event);
                if (status)
                        goto out_unlock;
        }

        group->q_len++;
        list_add_tail(&event->list, queue);
        if (insert)
                insert(group, event);
        spin_unlock(&group->notification_lock);

        /* Something new is readable: wake sleepers and signal async users. */
        wake_up(&group->notification_waitq);
        kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
        return status;

out_unlock:
        spin_unlock(&group->notification_lock);
        return status;
}

/*
 * Unlink @event from @group's notification list and drop the queue length.
 * The caller must hold group->notification_lock.
 */
void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                  struct fsnotify_event *event)
{
        assert_spin_locked(&group->notification_lock);

        group->q_len--;
        /*
         * Re-initialise the list head rather than just unlinking it: the
         * overflow event is reused, and fsnotify_insert_event() relies on
         * list_empty() to tell whether it is already queued.
         */
        list_del_init(&event->list);
}

/*
 * Look at the head of the notification list without dequeuing it.
 * Returns NULL when the queue is empty.  The caller must hold
 * group->notification_lock.
 */
struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
{
        assert_spin_locked(&group->notification_lock);

        return fsnotify_notify_queue_is_empty(group) ? NULL :
                list_first_entry(&group->notification_list,
                                 struct fsnotify_event, list);
}

/*
 * Dequeue and return the event at the head of the notification list, or
 * NULL if the list is empty.  Destroying the returned event is the caller's
 * responsibility.
 */
struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
        struct fsnotify_event *first = fsnotify_peek_first_event(group);

        if (first) {
                pr_debug("%s: group=%p event=%p\n", __func__, group, first);
                fsnotify_remove_queued_event(group, first);
        }

        return first;
}

/*
 * Drain @group's notification queue, destroying every pending event.
 * Called when a group is being torn down.
 */
void fsnotify_flush_notify(struct fsnotify_group *group)
{
        struct fsnotify_event *pending;

        spin_lock(&group->notification_lock);
        for (;;) {
                if (fsnotify_notify_queue_is_empty(group))
                        break;

                pending = fsnotify_remove_first_event(group);

                /* The event is destroyed with the notification lock dropped. */
                spin_unlock(&group->notification_lock);
                fsnotify_destroy_event(group, pending);
                spin_lock(&group->notification_lock);
        }
        spin_unlock(&group->notification_lock);
}





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

/*
 * One entry of an ACL.
 *
 * e_tag and e_perm hold the entry's tag and permission value (presumably
 * the ACL_* constants from <uapi/linux/posix_acl.h> -- confirm).
 * e_uid and e_gid share storage through the anonymous union, so only one
 * of them is meaningful for a given entry.
 */
struct posix_acl_entry {
        short                        e_tag;
        unsigned short                e_perm;
        union {
                kuid_t                e_uid;
                kgid_t                e_gid;
        };
};

/*
 * A reference-counted ACL.
 *
 * a_refcount is taken by posix_acl_dup() and dropped by posix_acl_release();
 * when it reaches zero the object is freed via kfree_rcu() using a_rcu.
 * a_count is the number of valid elements in the trailing flexible array
 * a_entries (it bounds the FOREACH_ACL_ENTRY() iteration).
 */
struct posix_acl {
        refcount_t                a_refcount;
        struct rcu_head                a_rcu;
        unsigned int                a_count;
        struct posix_acl_entry        a_entries[];
};

/*
 * Iterate over every entry of @acl.
 *
 * @pa:  cursor lvalue; points at the current entry inside the loop body
 * @acl: pointer to the ACL being walked (evaluated twice)
 * @pe:  scratch lvalue; set to one past the last entry
 *
 * Every argument is parenthesized so that expressions such as "*cursorp"
 * expand with the intended precedence (e.g. "(*cursorp)++" rather than
 * "*cursorp++").
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
        for ((pa) = (acl)->a_entries, (pe) = (pa) + (acl)->a_count; \
             (pa) < (pe); (pa)++)


/*
 * Take an additional reference on an ACL.
 * A NULL @acl is passed through untouched; otherwise a_refcount is
 * incremented and the same pointer is returned.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
        if (!acl)
                return NULL;

        refcount_inc(&acl->a_refcount);
        return acl;
}

/*
 * Drop a reference on an ACL.
 * NULL is accepted and ignored.  When the last reference goes away the
 * object is handed to kfree_rcu() through its a_rcu member.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
        if (!acl)
                return;

        if (!refcount_dec_and_test(&acl->a_refcount))
                return;

        kfree_rcu(acl, a_rcu);
}


/* posix_acl.c */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
extern int set_posix_acl(struct inode *, int, struct posix_acl *);

#ifdef CONFIG_FS_POSIX_ACL
extern int posix_acl_chmod(struct inode *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
                struct posix_acl **);
extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);

extern int simple_set_acl(struct inode *, struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

struct posix_acl *get_cached_acl(struct inode *inode, int type);
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);

/* Set both ACL pointers of @inode (i_acl and i_default_acl) to NULL. */
static inline void cache_no_acl(struct inode *inode)
{
        inode->i_acl = inode->i_default_acl = NULL;
}
#else
/* Stub used when CONFIG_FS_POSIX_ACL is not set: does nothing, returns 0. */
static inline int posix_acl_chmod(struct inode *inode, umode_t mode)
{
        return 0;
}

#define simple_set_acl                NULL

/* Stub used when CONFIG_FS_POSIX_ACL is not set: does nothing, returns 0. */
static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
        return 0;
}
/* Stub used when CONFIG_FS_POSIX_ACL is not set: intentionally empty. */
static inline void cache_no_acl(struct inode *inode)
{
}

/*
 * Stub used when CONFIG_FS_POSIX_ACL is not set: reports "no ACLs" by
 * storing NULL through both output pointers and returns 0.
 */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        *acl = NULL;
        *default_acl = NULL;

        return 0;
}

/* Stub used when CONFIG_FS_POSIX_ACL is not set: intentionally empty. */
static inline void forget_all_cached_acls(struct inode *inode)
{
}
#endif /* CONFIG_FS_POSIX_ACL */

struct posix_acl *get_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/writeback.h
 */
#ifndef WRITEBACK_H
#define WRITEBACK_H

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
#include <linux/blk-cgroup.h>

struct bio;

DECLARE_PER_CPU(int, dirty_throttle_leaks);

/*
 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
 *
 *        (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
 *
 * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
 * time) for the dirty pages to drop, unless written enough pages.
 *
 * The global dirty threshold is normally equal to the global dirty limit,
 * except when the system suddenly allocates a lot of anonymous memory and
 * knocks down the global dirty threshold quickly, in which case the global
 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
 */
#define DIRTY_SCOPE                8
#define DIRTY_FULL_SCOPE        (DIRTY_SCOPE / 2)

struct backing_dev_info;

/*
 * fs/fs-writeback.c
 */
/*
 * Selects whether writeback waits for completion.  Stored in
 * writeback_control.sync_mode; wbc_to_write_flags() maps WB_SYNC_ALL to
 * REQ_SYNC.
 */
enum writeback_sync_modes {
        WB_SYNC_NONE,        /* Don't wait on anything */
        WB_SYNC_ALL,        /* Wait on every mapping */
};

/*
 * A control structure which tells the writeback code what to do.  These are
 * always on the stack, and hence need no locking.  They are always initialised
 * in a manner such that unspecified fields are set to zero.
 *
 * wbc_to_write_flags() derives request flags from punt_to_cgroup, sync_mode,
 * for_kupdate and for_background.  The cgroup writeback members at the end
 * exist only when CONFIG_CGROUP_WRITEBACK is enabled.
 */
struct writeback_control {
        long nr_to_write;                /* Write this many pages, and decrement
                                           this for each page written */
        long pages_skipped;                /* Pages which were not written */

        /*
         * For a_ops->writepages(): if start or end are non-zero then this is
         * a hint that the filesystem need only write out the pages inside that
         * byterange.  The byte at `end' is included in the writeout request.
         */
        loff_t range_start;
        loff_t range_end;

        /* WB_SYNC_NONE or WB_SYNC_ALL, see enum writeback_sync_modes */
        enum writeback_sync_modes sync_mode;

        unsigned for_kupdate:1;                /* A kupdate writeback */
        unsigned for_background:1;        /* A background writeback */
        unsigned tagged_writepages:1;        /* tag-and-write to avoid livelock */
        unsigned for_reclaim:1;                /* Invoked from the page allocator */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned for_sync:1;                /* sync(2) WB_SYNC_ALL writeback */

        /*
         * When writeback IOs are bounced through async layers, only the
         * initial synchronous phase should be accounted towards inode
         * cgroup ownership arbitration to avoid confusion.  Later stages
         * can set the following flag to disable the accounting.
         */
        unsigned no_cgroup_owner:1;

        unsigned punt_to_cgroup:1;        /* cgrp punting, see __REQ_CGROUP_PUNT */

#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback *wb;        /* wb this writeback is issued under */
        struct inode *inode;                /* inode being written out */

        /* foreign inode detection, see wbc_detach_inode() */
        int wb_id;                        /* current wb id */
        int wb_lcand_id;                /* last foreign candidate wb id */
        int wb_tcand_id;                /* this foreign candidate wb id */
        size_t wb_bytes;                /* bytes written by current wb */
        size_t wb_lcand_bytes;                /* bytes written by last candidate */
        size_t wb_tcand_bytes;                /* bytes written by this candidate */
#endif
};

/*
 * Translate the state of @wbc into request flags.
 *
 * REQ_CGROUP_PUNT is set when wbc->punt_to_cgroup is set.  On top of that,
 * WB_SYNC_ALL writeback adds REQ_SYNC; otherwise kupdate or background
 * writeback adds REQ_BACKGROUND.
 */
static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
        int flags = wbc->punt_to_cgroup ? REQ_CGROUP_PUNT : 0;

        if (wbc->sync_mode == WB_SYNC_ALL)
                return flags | REQ_SYNC;

        if (wbc->for_kupdate || wbc->for_background)
                return flags | REQ_BACKGROUND;

        return flags;
}

/*
 * Return the blkcg css associated with @wbc.
 *
 * With CONFIG_CGROUP_WRITEBACK and a non-NULL wbc->wb this is the wb's
 * blkcg_css; in every other case it falls back to blkcg_root_css.
 */
static inline struct cgroup_subsys_state *
wbc_blkcg_css(struct writeback_control *wbc)
{
#ifdef CONFIG_CGROUP_WRITEBACK
        if (wbc->wb)
                return wbc->wb->blkcg_css;
#endif
        return blkcg_root_css;
}

/*
 * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
 * and are measured against each other in.  There always is one global
 * domain, global_wb_domain, that every wb in the system is a member of.
 * This allows measuring the relative bandwidth of each wb to distribute
 * dirtyable memory accordingly.
 */
struct wb_domain {
        /* Taken by wb_domain_size_changed() around the dirty_limit* reset. */
        spinlock_t lock;

        /*
         * Scale the writeback cache size proportional to the relative
         * writeout speed.
         *
         * We do this by keeping a floating proportion between BDIs, based
         * on page writeback completions [end_page_writeback()]. Those
         * devices that write out pages fastest will get the larger share,
         * while the slower will get a smaller share.
         *
         * We use page writeout completions because we are interested in
         * getting rid of dirty pages. Having them written out is the
         * primary goal.
         *
         * We introduce a concept of time, a period over which we measure
         * these events, because demand can/will vary over time. The length
         * of this period itself is measured in page writeback completions.
         */
        struct fprop_global completions;
        struct timer_list period_timer;        /* timer for aging of completions */
        unsigned long period_time;

        /*
         * The dirtyable memory and dirty threshold could be suddenly
         * knocked down by a large amount (eg. on the startup of KVM in a
         * swapless system). This may throw the system into deep dirty
         * exceeded state and throttle heavy/light dirtiers alike. To
         * retain good responsiveness, maintain global_dirty_limit for
         * tracking slowly down to the knocked down dirty threshold.
         *
         * Both fields are protected by ->lock.
         */
        unsigned long dirty_limit_tstamp;
        unsigned long dirty_limit;
};

/**
 * wb_domain_size_changed - memory available to a wb_domain has changed
 * @dom: wb_domain of interest
 *
 * Call this whenever the amount of memory available to @dom changes.
 * Under @dom->lock it clears the dirty limit and restamps it with the
 * current jiffies, so that values computed for the previous configuration
 * cannot skew dirty throttling.  Without this, when memory size of a
 * wb_domain is greatly reduced, the dirty throttling logic may allow too
 * many pages to be dirtied leading to consecutive unnecessary OOMs and may
 * get stuck in that situation.
 */
static inline void wb_domain_size_changed(struct wb_domain *dom)
{
        spin_lock(&dom->lock);
        dom->dirty_limit = 0;
        dom->dirty_limit_tstamp = jiffies;
        spin_unlock(&dom->lock);
}

/*
 * fs/fs-writeback.c
 */        
struct bdi_writeback;
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                                        enum wb_reason reason);
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void sync_inodes_sb(struct super_block *);
void wakeup_flusher_threads(enum wb_reason reason);
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason);
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);

/*
 * Wait, in TASK_UNINTERRUPTIBLE state, on the __I_NEW bit of inode->i_state.
 * May sleep (see might_sleep()), so it must not be called from atomic
 * context.  struct inode comes from <linux/fs.h>, which this header
 * includes above.
 */
static inline void wait_on_inode(struct inode *inode)
{
        might_sleep();
        wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/cgroup.h>
#include <linux/bio.h>

void __inode_attach_wb(struct inode *inode, struct page *page);
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
        __releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
                           enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);

/**
 * inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @page: page being dirtied (may be NULL)
 *
 * Nothing happens if @inode already has a wb.  Otherwise it is associated
 * with the wb matching the memcg of @page or, if @page is NULL, %current.
 * May be called w/ or w/o @inode->i_lock.
 */
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
        if (inode->i_wb)
                return;

        __inode_attach_wb(inode, page);
}

/**
 * inode_detach_wb - disassociate an inode from its wb
 * @inode: inode of interest
 *
 * @inode is being freed.  If it has a wb, drop the reference on it and
 * clear the pointer; warns once if I_CLEAR is not set in @inode->i_state.
 */
static inline void inode_detach_wb(struct inode *inode)
{
        if (!inode->i_wb)
                return;

        WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
        wb_put(inode->i_wb);
        inode->i_wb = NULL;
}

/**
 * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * This function is to be used by __filemap_fdatawrite_range(), which is an
 * alternative entry point into writeback code, and first ensures @inode is
 * associated with a bdi_writeback and attaches it to @wbc.
 *
 * Takes @inode->i_lock itself; the lock is released by
 * wbc_attach_and_unlock_inode() (annotated __releases(&inode->i_lock)).
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
        spin_lock(&inode->i_lock);
        inode_attach_wb(inode, NULL);
        /* Drops i_lock on our behalf. */
        wbc_attach_and_unlock_inode(wbc, inode);
}

/**
 * wbc_init_bio - writeback specific initialization of bio
 * @wbc: writeback_control for the writeback in progress
 * @bio: bio to be initialized
 *
 * @bio is a part of the writeback in progress controlled by @wbc.  Apply
 * the cgroup writeback context by associating @bio with the blkcg css of
 * @wbc->wb, if there is one.  Must be called after the bio has been
 * associated with a device.
 */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
        /*
         * pageout() path doesn't attach @wbc to the inode being written
         * out.  This is intentional as we don't want the function to block
         * behind a slow cgroup.  Ultimately, we want pageout() to kick off
         * regular writeback instead of writing things out itself.
         */
        if (!wbc->wb)
                return;

        bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
}

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void inode_detach_wb(struct inode *inode)
{
}

/*
 * Stub used when CONFIG_CGROUP_WRITEBACK is not set: there is nothing to
 * attach, so it only releases @inode->i_lock, as the __releases()
 * annotation promises.
 */
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                               struct inode *inode)
        __releases(&inode->i_lock)
{
        spin_unlock(&inode->i_lock);
}

/*
 * Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty
 * (no lock is taken here).
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
}

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void wbc_detach_inode(struct writeback_control *wbc)
{
}

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
                                            struct page *page, size_t bytes)
{
}

/* Stub used when CONFIG_CGROUP_WRITEBACK is not set: intentionally empty. */
static inline void cgroup_writeback_umount(void)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * mm/page-writeback.c
 */
#ifdef CONFIG_BLOCK
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
void laptop_mode_sync(struct work_struct *work);
void laptop_mode_timer_fn(struct timer_list *t);
#else
/* Stub used when CONFIG_BLOCK is not set: intentionally empty. */
static inline void laptop_sync_completion(void) { }
#endif
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
#endif

extern struct wb_domain global_wb_domain;

/* These are exported to sysctl. */
extern int dirty_background_ratio;
extern unsigned long dirty_background_bytes;
extern int vm_dirty_ratio;
extern unsigned long vm_dirty_bytes;
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern unsigned int dirtytime_expire_interval;
extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;

int dirty_background_ratio_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int dirty_background_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int dirty_ratio_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int dirty_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int dirtytime_interval_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);

void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
bool wb_over_bg_thresh(struct bdi_writeback *wb);

/*
 * Callback type taken by write_cache_pages(): receives a page, the
 * writeback_control in effect and the caller's opaque @data pointer.
 */
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                void *data);

/* Each function is declared exactly once; definitions live elsewhere. */
int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc);
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);
int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);

void account_page_redirty(struct page *page);

void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);

#endif                /* WRITEBACK_H */





























































    1 










    1 







    1 












































































    1 
    1 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFRM_HASH_H
#define _XFRM_HASH_H

#include <linux/xfrm.h>
#include <linux/socket.h>
#include <linux/jhash.h>

/* Hash of an IPv4 address: the address itself, converted with ntohl(). */
static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
{
        unsigned int host_order = ntohl(addr->a4);

        return host_order;
}

static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
{
        return jhash2((__force u32 *)addr->a6, 4, 0);
}

static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
        return ntohl((__force __be32)sum);
}

/* Hash an IPv6 destination/source pair: XOR of the two per-address hashes. */
static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        const unsigned int dst_hash = __xfrm6_addr_hash(daddr);
        const unsigned int src_hash = __xfrm6_addr_hash(saddr);

        return dst_hash ^ src_hash;
}

/*
 * Build a 32-bit mask with the top @bits bits set.
 *
 * 0 yields an empty mask and any value of 32 or more yields all ones; both
 * are handled explicitly so the shift count always stays in 1..31.
 */
static inline u32 __bits2mask32(__u8 bits)
{
        if (bits == 0)
                return 0;
        if (bits >= 32)
                return 0xffffffff;

        return (u32)0xffffffff << (32 - bits);
}

/*
 * Hash an IPv4 destination/source prefix pair.  Each address is converted
 * with ntohl(), masked down to its leading @dbits / @sbits bits, and the two
 * masked words are mixed with jhash_2words() using an initval of 0.
 */
static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        const u32 dst_prefix = ntohl(daddr->a4) & __bits2mask32(dbits);
        const u32 src_prefix = ntohl(saddr->a4) & __bits2mask32(sbits);

        return jhash_2words(dst_prefix, src_prefix, 0);
}

/*
 * Hash the leading @prefixlen bits of an IPv6 xfrm address.
 *
 * The whole 32-bit words covered by the prefix are fed to jhash2(); the
 * masked bits of a trailing, partially covered word (if any) are used as
 * the jhash2() initval.
 */
static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr,
                                             __u8 prefixlen)
{
        /* Number of complete 32-bit words inside the prefix. */
        const unsigned int whole_words = prefixlen >> 5;
        /* Number of prefix bits in the following, partially covered word. */
        const unsigned int tail_bits = prefixlen & 0x1f;
        u32 seed = 0;

        if (tail_bits != 0) {
                const __be32 tail_mask = htonl((0xffffffff) << (32 - tail_bits));

                seed = (__force u32)(addr->a6[whole_words] & tail_mask);
        }

        return jhash2((__force const u32 *)addr->a6, whole_words, seed);
}

/* Hash an IPv6 destination/source prefix pair: XOR of the two prefix hashes. */
static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        const unsigned int dst_hash = __xfrm6_pref_hash(daddr, dbits);
        const unsigned int src_hash = __xfrm6_pref_hash(saddr, sbits);

        return dst_hash ^ src_hash;
}

/*
 * Hash on (daddr, saddr, reqid, family).
 *
 * Starts from family ^ reqid, mixes in the per-family address-pair hash for
 * AF_INET / AF_INET6 (other families contribute nothing), folds the upper
 * 16 bits into the lower ones and masks the result with @hmask.
 */
static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           u32 reqid, unsigned short family,
                                           unsigned int hmask)
{
        unsigned int h = family ^ reqid;

        if (family == AF_INET)
                h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        h ^= h >> 16;
        return h & hmask;
}

/*
 * Hash on (daddr, saddr, family).
 *
 * Starts from the family value, mixes in the per-family address-pair hash
 * for AF_INET / AF_INET6 (other families contribute nothing), folds the
 * upper 16 bits into the lower ones and masks the result with @hmask.
 */
static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           unsigned short family,
                                           unsigned int hmask)
{
        unsigned int h = family;

        if (family == AF_INET)
                h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        h ^= h >> 16;
        return h & hmask;
}

/*
 * Hash on (daddr, spi, proto, family).
 *
 * Starts from spi ^ proto, mixes in the per-family destination address hash
 * for AF_INET / AF_INET6, folds in the value shifted right by 10 and by 20
 * bits, and masks the result with @hmask.
 */
static inline unsigned int
__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
                unsigned short family, unsigned int hmask)
{
        unsigned int h = (__force u32)spi ^ proto;

        if (family == AF_INET)
                h ^= __xfrm4_addr_hash(daddr);
        else if (family == AF_INET6)
                h ^= __xfrm6_addr_hash(daddr);

        h ^= (h >> 10) ^ (h >> 20);
        return h & hmask;
}

/* Hash an index: XOR it with itself shifted right by 8, then mask with @hmask. */
static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
{
        const u32 folded = index ^ (index >> 8);

        return folded & hmask;
}

/*
 * Hash a selector's destination/source prefixes.
 *
 * For AF_INET and AF_INET6, a selector whose destination or source prefix
 * length is shorter than @dbits / @sbits is not hashed: @hmask + 1 is
 * returned instead, which lies outside the 0..@hmask range of ordinary
 * results.  Any other family hashes to 0.
 */
static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
                                      unsigned short family, unsigned int hmask,
                                      u8 dbits, u8 sbits)
{
        const xfrm_address_t *daddr = &sel->daddr;
        const xfrm_address_t *saddr = &sel->saddr;
        unsigned int h = 0;

        if (family == AF_INET || family == AF_INET6) {
                if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits)
                        return hmask + 1;

                if (family == AF_INET)
                        h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
                else
                        h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);
        }

        return (h ^ (h >> 16)) & hmask;
}

/*
 * Hash a destination/source address pair on their leading @dbits / @sbits
 * bits.  AF_INET and AF_INET6 use the per-family prefix hash; any other
 * family hashes to 0.  The upper 16 bits are folded into the lower ones and
 * the result is masked with @hmask.
 */
static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
                                       const xfrm_address_t *saddr,
                                       unsigned short family,
                                       unsigned int hmask,
                                       u8 dbits, u8 sbits)
{
        unsigned int h = 0;

        if (family == AF_INET)
                h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
        else if (family == AF_INET6)
                h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);

        return (h ^ (h >> 16)) & hmask;
}

struct hlist_head *xfrm_hash_alloc(unsigned int sz);
void xfrm_hash_free(struct hlist_head *n, unsigned int sz);

#endif /* _XFRM_HASH_H */


































































































    1 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include "internal.h"

/*
 * set_fs_root - install @path as the root of @fs.
 *
 * Takes a reference on @path with path_get(), then swaps it into fs->root
 * while holding fs->lock and inside a write section of fs->seq.  The
 * reference held through the previous root is dropped with path_put() after
 * the lock has been released, and only if the old root had a dentry.
 * It can block.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
        struct path old_root;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        /* The old reference is dropped only once the spinlock is released. */
        if (old_root.dentry)
                path_put(&old_root);
}

/*
 * set_fs_pwd - install @path as the current working directory of @fs.
 *
 * Takes a reference on @path with path_get(), then swaps it into fs->pwd
 * while holding fs->lock and inside a write section of fs->seq.  The
 * reference held through the previous pwd is dropped with path_put() after
 * the lock has been released, and only if the old pwd had a dentry.
 * It can block.
 */
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
        struct path old_pwd;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);

        /* The old reference is dropped only once the spinlock is released. */
        if (old_pwd.dentry)
                path_put(&old_pwd);
}

/*
 * If *@p refers to the same dentry and mount as @old, overwrite it with
 * @new.  No reference counts are touched here.
 *
 * Returns 1 when *@p was replaced and 0 when it was left alone; the
 * no-match case is the one hinted as likely.
 */
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
        if (unlikely(p->dentry == old->dentry && p->mnt == old->mnt)) {
                *p = *new;
                return 1;
        }

        return 0;
}

/*
 * chroot_fs_refs - repoint every task's root/pwd from @old_root to @new_root.
 *
 * Walks all threads under a read lock on tasklist_lock.  For each task that
 * has an fs_struct, fs->root and fs->pwd are replaced with @new_root wherever
 * they match @old_root.  One reference on @new_root is taken per replacement;
 * the same number of references on @old_root is dropped at the end, after
 * tasklist_lock has been released.
 */
void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
{
        struct task_struct *g, *p;
        struct fs_struct *fs;
        /* Total number of replacements, i.e. old_root references to drop. */
        int count = 0;

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                task_lock(p);
                fs = p->fs;
                if (fs) {
                        /* Replacements made in this fs_struct (0, 1 or 2). */
                        int hits = 0;
                        spin_lock(&fs->lock);
                        write_seqcount_begin(&fs->seq);
                        hits += replace_path(&fs->root, old_root, new_root);
                        hits += replace_path(&fs->pwd, old_root, new_root);
                        write_seqcount_end(&fs->seq);
                        /* Take one new_root reference per replaced path. */
                        while (hits--) {
                                count++;
                                path_get(new_root);
                        }
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
        /* Drop the old_root references now that no lock is held. */
        while (count--)
                path_put(old_root);
}

/*
 * free_fs_struct - release an fs_struct.
 *
 * Drops the references held through fs->root and fs->pwd, then returns the
 * structure to the fs_cachep slab cache.
 */
void free_fs_struct(struct fs_struct *fs)
{
        path_put(&fs->root);
        path_put(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
}

/*
 * exit_fs - detach @tsk from its fs_struct.
 *
 * Does nothing when the task has no fs_struct.  Otherwise, with the task
 * lock and fs->lock held, clears tsk->fs and decrements the user count; if
 * that was the last user the structure is freed after both locks have been
 * released.
 */
void exit_fs(struct task_struct *tsk)
{
        struct fs_struct *fs = tsk->fs;
        int last_user;

        if (!fs)
                return;

        task_lock(tsk);
        spin_lock(&fs->lock);
        tsk->fs = NULL;
        fs->users--;
        last_user = (fs->users == 0);
        spin_unlock(&fs->lock);
        task_unlock(tsk);

        if (last_user)
                free_fs_struct(fs);
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                fs->users = 1;
                fs->in_exec = 0;
                spin_lock_init(&fs->lock);
                seqcount_spinlock_init(&fs->seq, &fs->lock);
                fs->umask = old->umask;

                spin_lock(&old->lock);
                fs->root = old->root;
                path_get(&fs->root);
                fs->pwd = old->pwd;
                path_get(&fs->pwd);
                spin_unlock(&old->lock);
        }
        return fs;
}

int unshare_fs_struct(void)
{
        struct fs_struct *fs = current->fs;
        struct fs_struct *new_fs = copy_fs_struct(fs);
        int kill;

        if (!new_fs)
                return -ENOMEM;

        task_lock(current);
        spin_lock(&fs->lock);
        kill = !--fs->users;
        current->fs = new_fs;
        spin_unlock(&fs->lock);
        task_unlock(current);

        if (kill)
                free_fs_struct(fs);

        return 0;
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);

int current_umask(void)
{
        return current->fs->umask;
}
EXPORT_SYMBOL(current_umask);

/*
 * Statically initialised fs_struct: one user, unlocked lock, zeroed seqcount
 * tied to that lock, and a umask of 0022.  Root and pwd are not set here.
 * To be mentioned only in INIT_TASK.
 */
struct fs_struct init_fs = {
        .users                = 1,
        .lock                = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .seq                = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
        .umask                = 0022,
};




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

/* GFP flags for kernel page-table pages: GFP_KERNEL plus __GFP_ZERO. */
#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
/* GFP flags for user page-table pages: the kernel flags plus __GFP_ACCOUNT. */
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
{
        return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
}

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic version: forwards to __pte_alloc_one_kernel().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
        pte_t *table = __pte_alloc_one_kernel(mm);

        return table;
}
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table page
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pte: pointer to the memory containing the page table
 *
 * Hands the page back with free_page().
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        const unsigned long table = (unsigned long)pte;

        free_page(table);
}

/**
 * __pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context (not used by this helper)
 * @gfp: GFP flags to use for the allocation
 *
 * Allocates a page and runs the pgtable_pte_page_ctor() on it.  If the
 * constructor fails, the page is freed again.
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation or must have custom GFP flags.
 *
 * Return: `struct page` initialized as page table or %NULL on error
 */
static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
        struct page *page = alloc_page(gfp);

        if (page && !pgtable_pte_page_ctor(page)) {
                /* Constructor failed: give the page back and report failure. */
                __free_page(page);
                page = NULL;
        }

        return page;
}

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Generic version: calls __pte_alloc_one() with %GFP_PGTABLE_USER, which
 * allocates a page and runs the pgtable_pte_page_ctor().
 *
 * Return: `struct page` initialized as page table or %NULL on error
 */
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        pgtable_t table = __pte_alloc_one(mm, GFP_PGTABLE_USER);

        return table;
}
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table page
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pte_page: the `struct page` representing the page table
 *
 * Runs pgtable_pte_page_dtor() on the page and then frees it.
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        pgtable_pte_page_dtor(pte_page);
        __free_page(pte_page);
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate a page for PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocates a page and runs the pgtable_pmd_page_ctor(); if the constructor
 * fails the page is freed again.
 * Allocations use %GFP_PGTABLE_KERNEL when @mm is &init_mm and
 * %GFP_PGTABLE_USER otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct page *page = alloc_pages(gfp, 0);

        if (!page)
                return NULL;

        if (pgtable_pmd_page_ctor(page))
                return (pmd_t *)page_address(page);

        __free_pages(page, 0);
        return NULL;
}
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/**
 * pmd_free - free PMD-level page table page
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pmd: pointer to the memory containing the page table
 *
 * BUGs if @pmd is not page aligned, then runs pgtable_pmd_page_dtor() on the
 * backing page and frees it.
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        const unsigned long table = (unsigned long)pmd;

        BUG_ON(table & (PAGE_SIZE-1));
        pgtable_pmd_page_dtor(virt_to_page(pmd));
        free_page(table);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate a page for PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocates a zeroed page using %GFP_PGTABLE_KERNEL when @mm is &init_mm
 * and %GFP_PGTABLE_USER otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;

        return (pud_t *)get_zeroed_page(gfp);
}
#endif

/**
 * pud_free - free PUD-level page table page
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pud: pointer to the memory containing the page table
 *
 * BUGs if @pud is not page aligned, then frees the page.
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        const unsigned long table = (unsigned long)pud;

        BUG_ON(table & (PAGE_SIZE-1));
        free_page(table);
}

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#ifndef __HAVE_ARCH_PGD_FREE
/**
 * pgd_free - free PGD-level page table page
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pgd: pointer to the memory containing the page table
 */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        const unsigned long table = (unsigned long)pgd;

        free_page(table);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */











































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/* SPDX-License-Identifier: GPL-2.0 */
/* Rewritten and vastly simplified by Rusty Russell for in-kernel
 * module loader:
 *   Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 */
#ifndef _LINUX_KALLSYMS_H
#define _LINUX_KALLSYMS_H

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/module.h>

#include <asm/sections.h>

/* Length limit used for kernel symbol names. */
#define KSYM_NAME_LEN 128
/*
 * Size computed for a symbol rendered in the form "%s+%#lx/%#lx [%s]": the
 * size of that format string, plus KSYM_NAME_LEN - 1 and MODULE_NAME_LEN - 1
 * for the two names, plus 2 * (BITS_PER_LONG * 3 / 10) for the two numbers,
 * plus one.
 */
#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
                         2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)

struct cred;
struct module;

/*
 * is_kernel_inittext - test whether @addr lies in the kernel's init text.
 *
 * The range is half-open: _einittext is the end label of the section, i.e.
 * the first address past it, so it must not be reported as init text.
 *
 * Return: 1 if @addr is in [_sinittext, _einittext), 0 otherwise.
 */
static inline int is_kernel_inittext(unsigned long addr)
{
        if (addr >= (unsigned long)_sinittext
            && addr < (unsigned long)_einittext)
                return 1;
        return 0;
}

/*
 * is_kernel_text - test whether @addr lies in kernel text.
 *
 * An address counts as kernel text when it is in [_stext, _etext), when
 * arch_is_kernel_text() accepts it, or when it is in the gate area.  The
 * range is half-open: _etext is the end label of the text section, i.e.
 * the first address past it, so it is excluded.
 *
 * Return: non-zero if @addr is kernel text, 0 otherwise.
 */
static inline int is_kernel_text(unsigned long addr)
{
        if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext) ||
            arch_is_kernel_text(addr))
                return 1;
        return in_gate_area_no_mm(addr);
}

/*
 * is_kernel - test whether @addr lies anywhere in the kernel image.
 *
 * An address counts when it is in [_stext, _end) or in the gate area.  The
 * range is half-open: _end is the end label of the image, i.e. the first
 * address past it, so it is excluded.
 *
 * Return: non-zero if @addr is in the kernel image, 0 otherwise.
 */
static inline int is_kernel(unsigned long addr)
{
        if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
                return 1;
        return in_gate_area_no_mm(addr);
}

/*
 * is_ksym_addr - test whether @addr is one kallsyms is expected to cover.
 *
 * With CONFIG_KALLSYMS_ALL the whole kernel image counts (is_kernel());
 * otherwise only kernel text and init text do.
 */
static inline int is_ksym_addr(unsigned long addr)
{
        if (!IS_ENABLED(CONFIG_KALLSYMS_ALL))
                return is_kernel_text(addr) || is_kernel_inittext(addr);

        return is_kernel(addr);
}

/*
 * dereference_symbol_descriptor - resolve a possible function descriptor.
 *
 * Without HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR, @ptr is returned unchanged.
 *
 * With it, @ptr is first passed through
 * dereference_kernel_function_descriptor(); if the result satisfies
 * is_ksym_addr() it is returned.  Otherwise the owning module is looked up
 * with preemption disabled and, if one is found, the pointer is passed
 * through dereference_module_function_descriptor().
 */
static inline void *dereference_symbol_descriptor(void *ptr)
{
#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR
        struct module *mod;

        ptr = dereference_kernel_function_descriptor(ptr);
        if (is_ksym_addr((unsigned long)ptr))
                return ptr;

        /* Preemption stays off across the module lookup and its use. */
        preempt_disable();
        mod = __module_address((unsigned long)ptr);

        if (mod)
                ptr = dereference_module_function_descriptor(mod, ptr);
        preempt_enable();
#endif
        return ptr;
}

#ifdef CONFIG_KALLSYMS
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                      unsigned long),
                            void *data);

/* Lookup the address for a symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name);

extern int kallsyms_lookup_size_offset(unsigned long addr,
                                  unsigned long *symbolsize,
                                  unsigned long *offset);

/* Lookup an address.  modname is set to NULL if it's in the kernel. */
const char *kallsyms_lookup(unsigned long addr,
                            unsigned long *symbolsize,
                            unsigned long *offset,
                            char **modname, char *namebuf);

/* Look up a kernel symbol and return it in a text buffer. */
extern int sprint_symbol(char *buffer, unsigned long address);
extern int sprint_symbol_no_offset(char *buffer, unsigned long address);
extern int sprint_backtrace(char *buffer, unsigned long address);

int lookup_symbol_name(unsigned long addr, char *symname);
int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);

/* How and when do we show kallsyms values? */
extern bool kallsyms_show_value(const struct cred *cred);

#else /* !CONFIG_KALLSYMS */

/* !CONFIG_KALLSYMS stub: no symbol can be found, so always return 0. */
static inline unsigned long kallsyms_lookup_name(const char *name)
{
        return 0UL;
}

/*
 * !CONFIG_KALLSYMS stub: always returns 0 and leaves *@symbolsize and
 * *@offset untouched.
 */
static inline int
kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
                            unsigned long *offset)
{
        return 0;
}

/*
 * !CONFIG_KALLSYMS stub: always returns NULL and writes none of its output
 * arguments.
 */
static inline const char *
kallsyms_lookup(unsigned long addr, unsigned long *symbolsize,
                unsigned long *offset, char **modname, char *namebuf)
{
        return NULL;
}

/*
 * !CONFIG_KALLSYMS stub: stores an empty string in @buffer and returns 0.
 */
static inline int sprint_symbol(char *buffer, unsigned long addr)
{
        buffer[0] = '\0';

        return 0;
}

/*
 * !CONFIG_KALLSYMS stub: stores an empty string in @buffer and returns 0.
 */
static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr)
{
        buffer[0] = '\0';

        return 0;
}

/*
 * !CONFIG_KALLSYMS stub: stores an empty string in @buffer and returns 0.
 */
static inline int sprint_backtrace(char *buffer, unsigned long addr)
{
        buffer[0] = '\0';

        return 0;
}

/* !CONFIG_KALLSYMS stub: always fails with -ERANGE; @symname is not written. */
static inline int lookup_symbol_name(unsigned long addr, char *symname)
{
        const int err = -ERANGE;

        return err;
}

/*
 * !CONFIG_KALLSYMS stub: always fails with -ERANGE; none of the output
 * arguments is written.
 */
static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
                                      unsigned long *offset, char *modname,
                                      char *name)
{
        const int err = -ERANGE;

        return err;
}

/* !CONFIG_KALLSYMS stub: values are never shown, whatever @cred is. */
static inline bool kallsyms_show_value(const struct cred *cred)
{
        const bool show = false;

        return show;
}

/*
 * !CONFIG_KALLSYMS stub: there is nothing to iterate over, so @fn is never
 * called and -EOPNOTSUPP is returned.
 */
static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                          unsigned long), void *data)
{
        const int err = -EOPNOTSUPP;

        return err;
}
#endif /*CONFIG_KALLSYMS*/

/*
 * Print one line for instruction pointer @ip: @loglvl, the raw address in
 * "[<%px>]" form, then the address again through %pS.
 */
static inline void print_ip_sym(const char *loglvl, unsigned long ip)
{
        void *addr = (void *) ip;

        printk("%s[<%px>] %pS\n", loglvl, addr, addr);
}

#endif /*_LINUX_KALLSYMS_H*/































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UNWIND_H
#define _ASM_X86_UNWIND_H

#include <linux/sched.h>
#include <linux/ftrace.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>

/* Byte offset of the ip member inside struct pt_regs. */
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
/* Number of bytes of struct pt_regs from the ip member to its end. */
#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)

/*
 * State carried through a stack unwind.  The first five members are always
 * present; the remaining ones depend on which unwinder is configured
 * (CONFIG_UNWINDER_ORC, CONFIG_UNWINDER_FRAME_POINTER, or neither).
 */
struct unwind_state {
        /* A type of STACK_TYPE_UNKNOWN here is what unwind_done() tests for. */
        struct stack_info stack_info;
        unsigned long stack_mask;
        struct task_struct *task;
        int graph_idx;
        /* Value reported by unwind_error(). */
        bool error;
#if defined(CONFIG_UNWINDER_ORC)
        /* full_regs: when false, unwind_get_entry_regs() reports the regs as partial. */
        bool signal, full_regs;
        unsigned long sp, bp, ip;
        struct pt_regs *regs, *prev_regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
        bool got_irq;
        unsigned long *bp, *orig_sp, ip;
        /*
         * If non-NULL: The current frame is incomplete and doesn't contain a
         * valid BP. When looking for the next frame, use this instead of the
         * non-existent saved BP.
         */
        unsigned long *next_bp;
        struct pt_regs *regs;
#else
        unsigned long *sp;
#endif
};

void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_frame);
bool unwind_next_frame(struct unwind_state *state);
unsigned long unwind_get_return_address(struct unwind_state *state);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);

static inline bool unwind_done(struct unwind_state *state)
{
        return state->stack_info.type == STACK_TYPE_UNKNOWN;
}

/* Report the error flag recorded in @state. */
static inline bool unwind_error(struct unwind_state *state)
{
        return state->error;
}

/*
 * Begin an unwind of @task.  When the caller passes no @first_frame, the
 * value of get_stack_pointer(task, regs) is used instead; everything is
 * then handed to __unwind_start().
 */
static inline
void unwind_start(struct unwind_state *state, struct task_struct *task,
                  struct pt_regs *regs, unsigned long *first_frame)
{
        if (!first_frame)
                first_frame = get_stack_pointer(task, regs);

        __unwind_start(state, task, regs, first_frame);
}

#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
 * Return the pt_regs recorded in @state, or NULL once the unwind is done.
 *
 * If @partial is non-NULL it is filled in: with the ORC unwinder it is set
 * to true when full_regs is false, otherwise it is always set to false.
 * If 'partial' returns true, only the iret frame registers are valid.
 */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        if (unwind_done(state))
                return NULL;

        if (partial) {
#ifdef CONFIG_UNWINDER_ORC
                *partial = !state->full_regs;
#else
                *partial = false;
#endif
        }

        return state->regs;
}
#else
/*
 * Neither unwinder above is configured: no regs are available, so this
 * always returns NULL and leaves *@partial untouched.
 */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        return NULL;
}
#endif

#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size);
#else
/* Empty stubs used when CONFIG_UNWINDER_ORC is not set. */
static inline void unwind_init(void) {}
static inline
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size) {}
#endif

/*
 * This disables KASAN checking when reading a value from another task's stack,
 * since the other task could be running on another CPU and could have poisoned
 * the stack in the meantime.
 *
 * @task is parenthesized so that any expression can be passed for it, and the
 * temporary has a macro-private name so that an @x which mentions a variable
 * called "val" is not captured by the macro's own local.
 *
 * Evaluates to the value read, as an unsigned long.
 */
#define READ_ONCE_TASK_STACK(task, x)                        \
({                                                        \
        unsigned long __rots_val;                        \
        if ((task) == current)                                \
                __rots_val = READ_ONCE(x);                \
        else                                                \
                __rots_val = READ_ONCE_NOCHECK(x);        \
        __rots_val;                                        \
})

/*
 * Report whether @task is a task other than current that has on_cpu set.
 * Without CONFIG_SMP this is always false.
 */
static inline bool task_on_another_cpu(struct task_struct *task)
{
#ifdef CONFIG_SMP
        return task != current && task->on_cpu;
#else
        return false;
#endif
}

#endif /* _ASM_X86_UNWIND_H */






































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_KEXEC_H
#define LINUX_KEXEC_H

/*
 * Bit numbers of the IND_* flags.  These and the masks below are defined
 * ahead of the !__ASSEMBLY__ section that follows.
 */
#define IND_DESTINATION_BIT 0
#define IND_INDIRECTION_BIT 1
#define IND_DONE_BIT        2
#define IND_SOURCE_BIT      3

/* Single-bit masks built from the bit numbers, and the OR of all four. */
#define IND_DESTINATION  (1 << IND_DESTINATION_BIT)
#define IND_INDIRECTION  (1 << IND_INDIRECTION_BIT)
#define IND_DONE         (1 << IND_DONE_BIT)
#define IND_SOURCE       (1 << IND_SOURCE_BIT)
#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)

#if !defined(__ASSEMBLY__)

#include <linux/crash_core.h>
#include <asm/io.h>

#include <uapi/linux/kexec.h>

#ifdef CONFIG_KEXEC_CORE
#include <linux/list.h>
#include <linux/compat.h>
#include <linux/ioport.h>
#include <linux/module.h>
#include <asm/kexec.h>

/* Verify architecture specific macros are defined */

#ifndef KEXEC_SOURCE_MEMORY_LIMIT
#error KEXEC_SOURCE_MEMORY_LIMIT not defined
#endif

#ifndef KEXEC_DESTINATION_MEMORY_LIMIT
#error KEXEC_DESTINATION_MEMORY_LIMIT not defined
#endif

#ifndef KEXEC_CONTROL_MEMORY_LIMIT
#error KEXEC_CONTROL_MEMORY_LIMIT not defined
#endif

#ifndef KEXEC_CONTROL_MEMORY_GFP
#define KEXEC_CONTROL_MEMORY_GFP (GFP_KERNEL | __GFP_NORETRY)
#endif

#ifndef KEXEC_CONTROL_PAGE_SIZE
#error KEXEC_CONTROL_PAGE_SIZE not defined
#endif

#ifndef KEXEC_ARCH
#error KEXEC_ARCH not defined
#endif

#ifndef KEXEC_CRASH_CONTROL_MEMORY_LIMIT
#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT KEXEC_CONTROL_MEMORY_LIMIT
#endif

#ifndef KEXEC_CRASH_MEM_ALIGN
#define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE
#endif

#define KEXEC_CORE_NOTE_NAME        CRASH_CORE_NOTE_NAME

/*
 * This structure is used to hold the arguments that are used when loading
 * kernel binaries.
 */

typedef unsigned long kimage_entry_t;

/*
 * One kexec segment: a buffer pointer with its size, plus a mem/memsz pair.
 * NOTE(review): mem/memsz presumably describe where the segment is placed
 * in memory - confirm against the users of this structure.
 */
struct kexec_segment {
        /*
         * This pointer can point to user memory if kexec_load() system
         * call is used or will point to kernel memory if
         * kexec_file_load() system call is used.
         *
         * Use ->buf when expecting to deal with user memory and use ->kbuf
         * when expecting to deal with kernel memory.
         */
        union {
                void __user *buf;
                void *kbuf;
        };
        size_t bufsz;
        unsigned long mem;
        size_t memsz;
};

#ifdef CONFIG_COMPAT
/*
 * Compat-typed counterpart of struct kexec_segment: the same four members,
 * expressed with compat_* types.  Only built with CONFIG_COMPAT.
 */
struct compat_kexec_segment {
        compat_uptr_t buf;
        compat_size_t bufsz;
        compat_ulong_t mem;        /* User space sees this as a (void *) ... */
        compat_size_t memsz;
};
#endif

#ifdef CONFIG_KEXEC_FILE
/*
 * Pointers describing the purgatory: its read-only ELF header and two
 * temporary, modifiable buffers used for relocation.
 */
struct purgatory_info {
        /*
         * Pointer to elf header at the beginning of kexec_purgatory.
         * Note: kexec_purgatory is read only
         */
        const Elf_Ehdr *ehdr;
        /*
         * Temporary, modifiable buffer for sechdrs used for relocation.
         * This memory can be freed post image load.
         */
        Elf_Shdr *sechdrs;
        /*
         * Temporary, modifiable buffer for stripped purgatory used for
         * relocation. This memory can be freed post image load.
         */
        void *purgatory_buf;
};

struct kimage;

/*
 * Function types implemented by an image loader and collected in
 * struct kexec_file_ops below.
 * NOTE(review): the exact contracts (return conventions, buffer ownership)
 * are not visible in this header; confirm against the loader
 * implementations.
 */
/* Probe: takes the kernel image buffer and its size, returns an int. */
typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size);
/*
 * Load: takes the image plus kernel, initrd and command-line buffers with
 * their lengths; returns a void pointer (presumably the loader's private
 * data, cf. kimage::image_loader_data - TODO confirm).
 */
typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf,
                             unsigned long kernel_len, char *initrd,
                             unsigned long initrd_len, char *cmdline,
                             unsigned long cmdline_len);
/* Cleanup: takes a loader_data pointer, returns an int. */
typedef int (kexec_cleanup_t)(void *loader_data);

#ifdef CONFIG_KEXEC_SIG
/* Signature verification hook; only declared with CONFIG_KEXEC_SIG. */
typedef int (kexec_verify_sig_t)(const char *kernel_buf,
                                 unsigned long kernel_len);
#endif

/* Table of image-loader callbacks; verify_sig exists only with CONFIG_KEXEC_SIG. */
struct kexec_file_ops {
        kexec_probe_t *probe;
        kexec_load_t *load;
        kexec_cleanup_t *cleanup;
#ifdef CONFIG_KEXEC_SIG
        kexec_verify_sig_t *verify_sig;
#endif
};

extern const struct kexec_file_ops * const kexec_file_loaders[];

int kexec_image_probe_default(struct kimage *image, void *buf,
                              unsigned long buf_len);
int kexec_image_post_load_cleanup_default(struct kimage *image);

/*
 * If kexec_buf.mem is set to this value, kexec_locate_mem_hole()
 * will try to allocate free memory. Arch may overwrite it.
 */
#ifndef KEXEC_BUF_MEM_UNKNOWN
#define KEXEC_BUF_MEM_UNKNOWN 0
#endif

/**
 * struct kexec_buf - parameters for finding a place for a buffer in memory
 * @image:        kexec image in which memory to search.
 * @buffer:        Contents which will be copied to the allocated memory.
 * @bufsz:        Size of @buffer.
 * @mem:        On return will have address of the buffer in memory.
 *                If set to KEXEC_BUF_MEM_UNKNOWN on entry,
 *                kexec_locate_mem_hole() will try to allocate free memory.
 * @memsz:        Size for the buffer in memory.
 * @buf_align:        Minimum alignment needed.
 * @buf_min:        The buffer can't be placed below this address.
 * @buf_max:        The buffer can't be placed above this address.
 * @top_down:        Allocate from top of memory.
 */
struct kexec_buf {
        struct kimage *image;
        void *buffer;
        unsigned long bufsz;
        unsigned long mem;
        unsigned long memsz;
        unsigned long buf_align;
        unsigned long buf_min;
        unsigned long buf_max;
        bool top_down;
};

int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf);
int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
                                   void *buf, unsigned int size,
                                   bool get_value);
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);

/* Architectures may override the below functions */
int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
                                  unsigned long buf_len);
void *arch_kexec_kernel_image_load(struct kimage *image);
int arch_kimage_file_post_load_cleanup(struct kimage *image);
#ifdef CONFIG_KEXEC_SIG
int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
                                 unsigned long buf_len);
#endif
int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);

extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);

/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN   4096

/*
 * A single memory range described by two 64-bit bounds.
 * NOTE(review): whether @end is inclusive or exclusive is not visible
 * here; confirm against crash_exclude_mem_range().
 */
struct crash_mem_range {
        u64 start, end;
};

/*
 * A variable-length list of crash_mem_range entries; @ranges is a flexible
 * array member, so the structure must be allocated with room for it.
 * NOTE(review): presumably @max_nr_ranges is the allocated capacity of
 * @ranges and @nr_ranges the number of entries in use - confirm.
 */
struct crash_mem {
        unsigned int max_nr_ranges;
        unsigned int nr_ranges;
        struct crash_mem_range ranges[];
};

extern int crash_exclude_mem_range(struct crash_mem *mem,
                                   unsigned long long mstart,
                                   unsigned long long mend);
extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
                                       void **addr, unsigned long *sz);

#ifndef arch_kexec_apply_relocations_add
/*
 * arch_kexec_apply_relocations_add - apply relocations of type RELA
 * @pi:                Purgatory to be relocated.
 * @section:        Section relocations applying to.
 * @relsec:        Section containing RELAs.
 * @symtab:        Corresponding symtab.
 *
 * Return: 0 on success, negative errno on error.
 */
static inline int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
                                                   Elf_Shdr *section,
                                                   const Elf_Shdr *relsec,
                                                   const Elf_Shdr *symtab)
{
        /*
         * Fallback compiled only when no arch_kexec_apply_relocations_add
         * macro is defined: log the problem and refuse the relocation.
         */
        const int err = -ENOEXEC;

        pr_err("RELA relocation unsupported.\n");

        return err;
}
#endif

#ifndef arch_kexec_apply_relocations
/*
 * arch_kexec_apply_relocations - apply relocations of type REL
 * @pi:                Purgatory to be relocated.
 * @section:        Section relocations applying to.
 * @relsec:        Section containing RELs.
 * @symtab:        Corresponding symtab.
 *
 * Return: 0 on success, negative errno on error.
 */
static inline int arch_kexec_apply_relocations(struct purgatory_info *pi,
                                               Elf_Shdr *section,
                                               const Elf_Shdr *relsec,
                                               const Elf_Shdr *symtab)
{
        /*
         * Fallback compiled only when no arch_kexec_apply_relocations
         * macro is defined: log the problem and refuse the relocation.
         */
        const int err = -ENOEXEC;

        pr_err("REL relocation unsupported.\n");

        return err;
}
#endif
#endif /* CONFIG_KEXEC_FILE */

#ifdef CONFIG_KEXEC_ELF
/*
 * Handle on an ELF image used by the kexec_build_elf_info() /
 * kexec_elf_load() / kexec_free_elf_info() helpers declared below.
 * NOTE(review): presumably @ehdr and @proghdrs refer to the ELF header and
 * program headers of the image in @buffer - confirm in the implementation.
 */
struct kexec_elf_info {
        /*
         * Where the ELF binary contents are kept.
         * Memory managed by the user of the struct.
         */
        const char *buffer;

        const struct elfhdr *ehdr;
        const struct elf_phdr *proghdrs;
};

int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
                               struct kexec_elf_info *elf_info);

int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
                         struct kexec_elf_info *elf_info,
                         struct kexec_buf *kbuf,
                         unsigned long *lowest_load_addr);

void kexec_free_elf_info(struct kexec_elf_info *elf_info);
int kexec_elf_probe(const char *buf, unsigned long len);
#endif
/*
 * Describes one kexec image: its entry list, control pages, segments,
 * mode flags and, depending on configuration, the extra state used by the
 * file-based syscall and by IMA.
 */
struct kimage {
        kimage_entry_t head;
        kimage_entry_t *entry;
        kimage_entry_t *last_entry;

        unsigned long start;
        struct page *control_code_page;
        struct page *swap_page;
        void *vmcoreinfo_data_copy; /* located in the crash memory */

        /* Segment table; at most KEXEC_SEGMENT_MAX entries fit. */
        unsigned long nr_segments;
        struct kexec_segment segment[KEXEC_SEGMENT_MAX];

        struct list_head control_pages;
        struct list_head dest_pages;
        struct list_head unusable_pages;

        /* Address of next control page to allocate for crash kernels. */
        unsigned long control_page;

        /* Flags to indicate special processing */
        unsigned int type : 1;
        /* Values stored in the one-bit ->type field above. */
#define KEXEC_TYPE_DEFAULT 0
#define KEXEC_TYPE_CRASH   1
        unsigned int preserve_context : 1;
        /* If set, we are using file mode kexec syscall */
        unsigned int file_mode:1;

        /* Architecture-private state, present only if the arch asks for it. */
#ifdef ARCH_HAS_KIMAGE_ARCH
        struct kimage_arch arch;
#endif

#ifdef CONFIG_KEXEC_FILE
        /* Additional fields for file based kexec syscall */
        void *kernel_buf;
        unsigned long kernel_buf_len;

        void *initrd_buf;
        unsigned long initrd_buf_len;

        char *cmdline_buf;
        unsigned long cmdline_buf_len;

        /* File operations provided by image loader */
        const struct kexec_file_ops *fops;

        /* Image loader handling the kernel can store a pointer here */
        void *image_loader_data;

        /* Information for loading purgatory */
        struct purgatory_info purgatory_info;
#endif

#ifdef CONFIG_IMA_KEXEC
        /* Virtual address of IMA measurement buffer for kexec syscall */
        void *ima_buffer;
#endif
};

/* kexec interface functions */
extern void machine_kexec(struct kimage *image);
extern int machine_kexec_prepare(struct kimage *image);
extern void machine_kexec_cleanup(struct kimage *image);
extern int kernel_kexec(void);
extern struct page *kimage_alloc_control_pages(struct kimage *image,
                                                unsigned int order);
extern void __crash_kexec(struct pt_regs *);
extern void crash_kexec(struct pt_regs *);
int kexec_should_crash(struct task_struct *);
int kexec_crash_loaded(void);
void crash_save_cpu(struct pt_regs *regs, int cpu);
extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);

extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
extern int kexec_load_disabled;

#ifndef kexec_flush_icache_page
#define kexec_flush_icache_page(page)
#endif

/* List of defined/legal kexec flags */
#ifndef CONFIG_KEXEC_JUMP
#define KEXEC_FLAGS    KEXEC_ON_CRASH
#else
#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
#endif

/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS        (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
                                 KEXEC_FILE_NO_INITRAMFS)

/* Location of a reserved region to hold the crash kernel.
 */
extern struct resource crashk_res;
extern struct resource crashk_low_res;
extern note_buf_t __percpu *crash_notes;

/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;

int crash_shrink_memory(unsigned long new_size);
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
ssize_t crash_get_memory_size(void);

void arch_kexec_protect_crashkres(void);
void arch_kexec_unprotect_crashkres(void);

#ifndef page_to_boot_pfn
/* Default used when no page_to_boot_pfn macro exists: same as page_to_pfn(). */
static inline unsigned long page_to_boot_pfn(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);

        return pfn;
}
#endif

#ifndef boot_pfn_to_page
/* Default used when no boot_pfn_to_page macro exists: same as pfn_to_page(). */
static inline struct page *boot_pfn_to_page(unsigned long boot_pfn)
{
        struct page *pg = pfn_to_page(boot_pfn);

        return pg;
}
#endif

#ifndef phys_to_boot_phys
/*
 * Default used when no phys_to_boot_phys macro exists: the address is
 * passed through, converted to unsigned long.
 */
static inline unsigned long phys_to_boot_phys(phys_addr_t phys)
{
        unsigned long boot_addr = phys;

        return boot_addr;
}
#endif

#ifndef boot_phys_to_phys
/*
 * Default used when no boot_phys_to_phys macro exists: the address is
 * passed through, converted to phys_addr_t.
 */
static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys)
{
        phys_addr_t addr = boot_phys;

        return addr;
}
#endif

/* Convert @addr with __pa() and then map the result through phys_to_boot_phys(). */
static inline unsigned long virt_to_boot_phys(void *addr)
{
        unsigned long vaddr = (unsigned long)addr;

        return phys_to_boot_phys(__pa(vaddr));
}

static inline void *boot_phys_to_virt(unsigned long entry)
{
        return phys_to_virt(boot_phys_to_phys(entry));
}

#ifndef arch_kexec_post_alloc_pages
/* Default hook used when the macro is not defined: does nothing, reports success. */
static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
                                              gfp_t gfp)
{
        return 0;
}
#endif

#ifndef arch_kexec_pre_free_pages
/* Default hook used when the macro is not defined: intentionally a no-op. */
static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
}
#endif

#else /* !CONFIG_KEXEC_CORE */
/*
 * Stubs for builds without CONFIG_KEXEC_CORE: the crash entry points do
 * nothing, the query helpers return 0 and kexec_in_progress is the
 * constant false.
 */
struct pt_regs;
struct task_struct;
static inline void __crash_kexec(struct pt_regs *regs) { }
static inline void crash_kexec(struct pt_regs *regs) { }
static inline int kexec_should_crash(struct task_struct *p) { return 0; }
static inline int kexec_crash_loaded(void) { return 0; }
#define kexec_in_progress false
#endif /* CONFIG_KEXEC_CORE */

/*
 * set_kexec_sig_enforced() is a real function only with CONFIG_KEXEC_SIG;
 * otherwise it is an empty inline stub so callers need no #ifdef.
 */
#ifdef CONFIG_KEXEC_SIG
void set_kexec_sig_enforced(void);
#else
static inline void set_kexec_sig_enforced(void) {}
#endif

#endif /* !defined(__ASSEMBLY__) */

#endif /* LINUX_KEXEC_H */


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fcntl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>

#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>

#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)

/*
 * Apply the F_SETFL flag word @arg to @filp.
 *
 * Only the bits in SETFL_MASK are stored into f_flags; the FASYNC bit is
 * left to the file's ->fasync() method.  Returns 0 on success or a
 * negative errno.
 */
static int setfl(int fd, struct file *filp, unsigned long arg)
{
        struct inode *ino = file_inode(filp);
        int rc;

        /*
         * Any change to O_APPEND (the XOR below catches both setting and
         * clearing) is refused on an append-only inode.
         */
        if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(ino))
                return -EPERM;

        /* O_NOATIME can only be newly set by the owner or superuser */
        if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME) &&
            !inode_owner_or_capable(ino))
                return -EPERM;

        /* required for strict SunOS emulation */
        if (O_NONBLOCK != O_NDELAY && (arg & O_NDELAY))
                arg |= O_NONBLOCK;

        /* Pipe packetized mode is controlled by O_DIRECT flag */
        if (!S_ISFIFO(ino->i_mode) && (arg & O_DIRECT)) {
                if (!filp->f_mapping)
                        return -EINVAL;
                if (!filp->f_mapping->a_ops)
                        return -EINVAL;
                if (!filp->f_mapping->a_ops->direct_IO)
                        return -EINVAL;
        }

        /* Give the file's own ->check_flags() hook, if any, a veto. */
        if (filp->f_op->check_flags) {
                rc = filp->f_op->check_flags(arg);
                if (rc)
                        return rc;
        }

        /*
         * ->fasync() is responsible for setting the FASYNC bit.  Only a
         * negative result is an error; positive results count as success.
         */
        if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
                rc = filp->f_op->fasync(fd, filp, !!(arg & FASYNC));
                if (rc < 0)
                        return rc;
        }

        spin_lock(&filp->f_lock);
        filp->f_flags = (filp->f_flags & ~SETFL_MASK) | (arg & SETFL_MASK);
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * Record @pid (of kind @type) as the owner of @filp.
 *
 * An existing owner is only replaced when @force is non-zero.  When a
 * non-NULL @pid is installed, the caller's uid and euid are stored as
 * well.  Everything happens under the f_owner write lock.
 */
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
                int force)
{
        write_lock_irq(&filp->f_owner.lock);

        /* Keep the current owner unless the caller insists. */
        if (!force && filp->f_owner.pid)
                goto unlock;

        put_pid(filp->f_owner.pid);
        filp->f_owner.pid = get_pid(pid);
        filp->f_owner.pid_type = type;

        if (pid) {
                const struct cred *creds = current_cred();

                security_file_set_fowner(filp);
                filp->f_owner.uid = creds->uid;
                filp->f_owner.euid = creds->euid;
        }

unlock:
        write_unlock_irq(&filp->f_owner.lock);
}
EXPORT_SYMBOL(__f_setown);

int f_setown(struct file *filp, unsigned long arg, int force)
{
        enum pid_type type;
        struct pid *pid = NULL;
        int who = arg, ret = 0;

        type = PIDTYPE_TGID;
        if (who < 0) {
                /* avoid overflow below */
                if (who == INT_MIN)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                who = -who;
        }

        rcu_read_lock();
        if (who) {
                pid = find_vpid(who);
                if (!pid)
                        ret = -ESRCH;
        }

        if (!ret)
                __f_setown(filp, pid, type, force);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(f_setown);

/* Force the owner of @filp to a NULL pid of type PIDTYPE_TGID. */
void f_delown(struct file *filp)
{
        struct pid *nobody = NULL;

        __f_setown(filp, nobody, PIDTYPE_TGID, 1);
}

/*
 * Return the owner of @filp as seen from the caller's pid namespace.
 *
 * The result is 0 when pid_task() finds no task for the owner, and is
 * negated when the owner is a process group.
 */
pid_t f_getown(struct file *filp)
{
        pid_t nr = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();

        if (!pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
                goto unlock;

        nr = pid_vnr(filp->f_owner.pid);
        if (filp->f_owner.pid_type == PIDTYPE_PGID)
                nr = -nr;

unlock:
        rcu_read_unlock();
        read_unlock_irq(&filp->f_owner.lock);

        return nr;
}

static int f_setown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner;
        struct pid *pid;
        int type;
        int ret;

        ret = copy_from_user(&owner, owner_p, sizeof(owner));
        if (ret)
                return -EFAULT;

        switch (owner.type) {
        case F_OWNER_TID:
                type = PIDTYPE_PID;
                break;

        case F_OWNER_PID:
                type = PIDTYPE_TGID;
                break;

        case F_OWNER_PGRP:
                type = PIDTYPE_PGID;
                break;

        default:
                return -EINVAL;
        }

        rcu_read_lock();
        pid = find_vpid(owner.pid);
        if (owner.pid && !pid)
                ret = -ESRCH;
        else
                 __f_setown(filp, pid, type, 1);
        rcu_read_unlock();

        return ret;
}

static int f_getown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner = {};
        int ret = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();
        if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
                owner.pid = pid_vnr(filp->f_owner.pid);
        rcu_read_unlock();
        switch (filp->f_owner.pid_type) {
        case PIDTYPE_PID:
                owner.type = F_OWNER_TID;
                break;

        case PIDTYPE_TGID:
                owner.type = F_OWNER_PID;
                break;

        case PIDTYPE_PGID:
                owner.type = F_OWNER_PGRP;
                break;

        default:
                WARN_ON(1);
                ret = -EINVAL;
                break;
        }
        read_unlock_irq(&filp->f_owner.lock);

        if (!ret) {
                ret = copy_to_user(owner_p, &owner, sizeof(owner));
                if (ret)
                        ret = -EFAULT;
        }
        return ret;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * F_GETOWNER_UIDS: write the owner's uid and euid, translated into the
 * caller's user namespace, to the two consecutive uid_t slots at @arg.
 * Returns the OR of the two put_user() results (0 when both succeed).
 */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        struct user_namespace *ns = current_user_ns();
        uid_t __user *out = (void __user *)arg;
        uid_t uid, euid;
        int err_uid, err_euid;

        /* Snapshot both ids under the owner lock. */
        read_lock_irq(&filp->f_owner.lock);
        uid = from_kuid(ns, filp->f_owner.uid);
        euid = from_kuid(ns, filp->f_owner.euid);
        read_unlock_irq(&filp->f_owner.lock);

        /* Both stores are always attempted, even if the first one fails. */
        err_uid = put_user(uid, out);
        err_euid = put_user(euid, out + 1);

        return err_uid | err_euid;
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the command is rejected. */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        return -EINVAL;
}
#endif

/* Return true if @hint is one of the RWH_WRITE_LIFE_* values accepted here. */
static bool rw_hint_valid(enum rw_hint hint)
{
        return hint == RWH_WRITE_LIFE_NOT_SET ||
               hint == RWH_WRITE_LIFE_NONE ||
               hint == RWH_WRITE_LIFE_SHORT ||
               hint == RWH_WRITE_LIFE_MEDIUM ||
               hint == RWH_WRITE_LIFE_LONG ||
               hint == RWH_WRITE_LIFE_EXTREME;
}

static long fcntl_rw_hint(struct file *file, unsigned int cmd,
                          unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        enum rw_hint hint;
        u64 h;

        switch (cmd) {
        case F_GET_FILE_RW_HINT:
                h = file_write_hint(file);
                if (copy_to_user(argp, &h, sizeof(*argp)))
                        return -EFAULT;
                return 0;
        case F_SET_FILE_RW_HINT:
                if (copy_from_user(&h, argp, sizeof(h)))
                        return -EFAULT;
                hint = (enum rw_hint) h;
                if (!rw_hint_valid(hint))
                        return -EINVAL;

                spin_lock(&file->f_lock);
                file->f_write_hint = hint;
                spin_unlock(&file->f_lock);
                return 0;
        case F_GET_RW_HINT:
                h = inode->i_write_hint;
                if (copy_to_user(argp, &h, sizeof(*argp)))
                        return -EFAULT;
                return 0;
        case F_SET_RW_HINT:
                if (copy_from_user(&h, argp, sizeof(h)))
                        return -EFAULT;
                hint = (enum rw_hint) h;
                if (!rw_hint_valid(hint))
                        return -EINVAL;

                inode_lock(inode);
                inode->i_write_hint = hint;
                inode_unlock(inode);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * Dispatch one fcntl() command on an already looked-up file.
 *
 * @fd:   descriptor number the command was issued on
 * @cmd:  F_* command code
 * @arg:  command argument; the record-lock commands treat it as a user
 *        pointer (argp) to a struct flock
 * @filp: file the descriptor refers to
 *
 * Returns the command's result or a negative errno.  Commands that are not
 * handled here leave the initial -EINVAL in place.
 */
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                struct file *filp)
{
        void __user *argp = (void __user *)arg;
        struct flock flock;
        long err = -EINVAL;

        switch (cmd) {
        case F_DUPFD:
                err = f_dupfd(arg, filp, 0);
                break;
        case F_DUPFD_CLOEXEC:
                err = f_dupfd(arg, filp, O_CLOEXEC);
                break;
        case F_GETFD:
                err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
                break;
        case F_SETFD:
                err = 0;
                set_close_on_exec(fd, arg & FD_CLOEXEC);
                break;
        case F_GETFL:
                err = filp->f_flags;
                break;
        case F_SETFL:
                err = setfl(fd, filp, arg);
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_GETLK:
#endif
        case F_GETLK:
                /* The lock description is copied in, queried, and copied back. */
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_getlk(filp, cmd, &flock);
                if (!err && copy_to_user(argp, &flock, sizeof(flock)))
                        return -EFAULT;
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
#endif
                fallthrough;
        case F_SETLK:
        case F_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_setlk(fd, filp, cmd, &flock);
                break;
        case F_GETOWN:
                /*
                 * XXX If f_owner is a process group, the
                 * negative return value will get converted
                 * into an error.  Oops.  If we keep the
                 * current syscall conventions, the only way
                 * to fix this will be in libc.
                 */
                err = f_getown(filp);
                force_successful_syscall_return();
                break;
        case F_SETOWN:
                /* F_SETOWN always forces replacement of an existing owner. */
                err = f_setown(filp, arg, 1);
                break;
        case F_GETOWN_EX:
                err = f_getown_ex(filp, arg);
                break;
        case F_SETOWN_EX:
                err = f_setown_ex(filp, arg);
                break;
        case F_GETOWNER_UIDS:
                err = f_getowner_uids(filp, arg);
                break;
        case F_GETSIG:
                err = filp->f_owner.signum;
                break;
        case F_SETSIG:
                /* arg == 0 restores default behaviour. */
                /* An invalid signal number leaves err at -EINVAL. */
                if (!valid_signal(arg)) {
                        break;
                }
                err = 0;
                filp->f_owner.signum = arg;
                break;
        case F_GETLEASE:
                err = fcntl_getlease(filp);
                break;
        case F_SETLEASE:
                err = fcntl_setlease(fd, filp, arg);
                break;
        case F_NOTIFY:
                err = fcntl_dirnotify(fd, filp, arg);
                break;
        case F_SETPIPE_SZ:
        case F_GETPIPE_SZ:
                err = pipe_fcntl(filp, cmd, arg);
                break;
        case F_ADD_SEALS:
        case F_GET_SEALS:
                err = memfd_fcntl(filp, cmd, arg);
                break;
        case F_GET_RW_HINT:
        case F_SET_RW_HINT:
        case F_GET_FILE_RW_HINT:
        case F_SET_FILE_RW_HINT:
                err = fcntl_rw_hint(filp, cmd, arg);
                break;
        default:
                /* Unknown command: err is still -EINVAL. */
                break;
        }
        return err;
}

/*
 * Return 1 if @cmd is one of the commands the fcntl entry points still
 * allow on a file with FMODE_PATH set, 0 otherwise.
 */
static int check_fcntl_cmd(unsigned cmd)
{
        if (cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC)
                return 1;
        if (cmd == F_GETFD || cmd == F_SETFD)
                return 1;
        if (cmd == F_GETFL)
                return 1;

        return 0;
}

/*
 * fcntl() system call: look up @fd, run the security hook and hand the
 * command to do_fcntl().
 *
 * Returns -EBADF when @fd is not open, or when the file has FMODE_PATH set
 * and @cmd is not accepted by check_fcntl_cmd().
 */
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget_raw(fd);
        long ret;

        if (!f.file)
                return -EBADF;

        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd)) {
                ret = -EBADF;
        } else {
                ret = security_file_fcntl(f.file, cmd, arg);
                if (ret == 0)
                        ret = do_fcntl(fd, cmd, arg, f.file);
        }

        fdput(f);
        return ret;
}

#if BITS_PER_LONG == 32
/*
 * fcntl64() system call (only built when BITS_PER_LONG == 32).
 *
 * The record-lock commands are handled here with a struct flock64 copied
 * from and to user space; every other command goes to do_fcntl().
 * Returns -EBADF when @fd is not open, or when the file has FMODE_PATH set
 * and @cmd is not accepted by check_fcntl_cmd().
 */
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                unsigned long, arg)
{
        void __user *uptr = (void __user *)arg;
        struct fd f = fdget_raw(fd);
        struct flock64 lk;
        long ret;

        if (!f.file)
                return -EBADF;

        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd)) {
                ret = -EBADF;
                goto put;
        }

        ret = security_file_fcntl(f.file, cmd, arg);
        if (ret)
                goto put;

        switch (cmd) {
        case F_GETLK64:
        case F_OFD_GETLK:
                if (copy_from_user(&lk, uptr, sizeof(lk))) {
                        ret = -EFAULT;
                        break;
                }
                ret = fcntl_getlk64(f.file, cmd, &lk);
                /* Only a successful query is copied back to the caller. */
                if (ret == 0 && copy_to_user(uptr, &lk, sizeof(lk)))
                        ret = -EFAULT;
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                if (copy_from_user(&lk, uptr, sizeof(lk))) {
                        ret = -EFAULT;
                        break;
                }
                ret = fcntl_setlk64(fd, f.file, cmd, &lk);
                break;
        default:
                ret = do_fcntl(fd, cmd, arg, f.file);
                break;
        }

put:
        fdput(f);
        return ret;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Copy l_type, l_whence, l_start, l_len and l_pid from *@src to *@dst.
 * The two structures may be of different types as long as both have these
 * members.
 *
 * careful - don't use anywhere else: both arguments are evaluated several
 * times, so they must be free of side effects.  The body is wrapped in
 * do { } while (0) so that the macro behaves as a single statement, e.g.
 * under an unbraced if.
 */
#define copy_flock_fields(dst, src)                        \
        do {                                                \
                (dst)->l_type = (src)->l_type;                \
                (dst)->l_whence = (src)->l_whence;        \
                (dst)->l_start = (src)->l_start;        \
                (dst)->l_len = (src)->l_len;                \
                (dst)->l_pid = (src)->l_pid;                \
        } while (0)

/*
 * Read a struct compat_flock from user space at @ufl and copy its lock
 * fields into @kfl.  Returns 0, or -EFAULT if the copy from user fails.
 */
static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
{
        struct compat_flock cfl;

        if (copy_from_user(&cfl, ufl, sizeof(cfl)) != 0)
                return -EFAULT;

        copy_flock_fields(kfl, &cfl);

        return 0;
}

/*
 * Read a struct compat_flock64 from user space at @ufl and copy its lock
 * fields into @kfl.  Returns 0, or -EFAULT if the copy from user fails.
 */
static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
        struct compat_flock64 cfl;

        if (copy_from_user(&cfl, ufl, sizeof(cfl)) != 0)
                return -EFAULT;

        copy_flock_fields(kfl, &cfl);

        return 0;
}

/*
 * Copy the lock fields of @kfl into a zeroed struct compat_flock and write
 * it to user space at @ufl.  Returns 0, or -EFAULT if the copy fails.
 */
static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
        struct compat_flock cfl;

        /* Start from all-zero so no stale stack bytes reach user space. */
        memset(&cfl, 0, sizeof(cfl));
        copy_flock_fields(&cfl, kfl);

        return copy_to_user(ufl, &cfl, sizeof(cfl)) ? -EFAULT : 0;
}

/*
 * Convert the native struct flock at @kfl into a struct compat_flock64 and
 * write it to user space at @ufl.
 *
 * Returns 0 on success, or -EFAULT if the user buffer could not be written.
 */
static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
        struct compat_flock64 out;

        /*
         * Build-time guarantee that the compat l_start/l_len members are at
         * least as wide as the native ones, so the copy cannot truncate.
         */
        BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
        BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));

        /*
         * Zero the whole staging struct first so that any bytes not set by
         * the field copy below reach user space as zeroes.
         */
        memset(&out, 0, sizeof(out));
        copy_flock_fields(&out, kfl);

        return copy_to_user(ufl, &out, sizeof(out)) ? -EFAULT : 0;
}
#undef copy_flock_fields

/*
 * Map the 64-bit record-lock commands onto their plain counterparts
 * (F_GETLK64 -> F_GETLK, F_SETLK64 -> F_SETLK, F_SETLKW64 -> F_SETLKW).
 * Every other command value is returned unchanged.
 */
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
        if (cmd == F_GETLK64)
                return F_GETLK;
        if (cmd == F_SETLK64)
                return F_SETLK;
        if (cmd == F_SETLKW64)
                return F_SETLKW;

        return cmd;
}

/*
 * Called after a successful GETLK, when the result still has to be squeezed
 * into the compat structure.
 *
 * A start offset beyond COMPAT_OFF_T_MAX cannot be represented at all; that
 * only happens when the original start + end exceeded COMPAT_OFF_T_MAX, i.e.
 * the application was asking for trouble, so -EOVERFLOW is returned.  A
 * length beyond COMPAT_OFF_T_MAX is simply clamped, letting the application
 * see the part of the conflicting lock that can make sense to it.
 *
 * Returns 0 on success (possibly after clamping flock->l_len) or -EOVERFLOW.
 */
static int fixup_compat_flock(struct flock *flock)
{
        int ret = 0;

        if (flock->l_start > COMPAT_OFF_T_MAX)
                ret = -EOVERFLOW;
        else if (flock->l_len > COMPAT_OFF_T_MAX)
                flock->l_len = COMPAT_OFF_T_MAX;

        return ret;
}

/*
 * Common implementation behind the compat fcntl() and fcntl64() entry points.
 *
 * @fd:  file descriptor the command operates on
 * @cmd: fcntl command
 * @arg: command argument; for the locking commands it is a compat user
 *       pointer to a struct compat_flock or struct compat_flock64
 *
 * The record-locking commands are handled here because their argument has
 * to be translated between the compat and native struct flock layouts;
 * every other command is forwarded to do_fcntl().
 *
 * Returns the command's result, or a negative errno (-EBADF when @fd does
 * not resolve to an open file).
 */
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
                             compat_ulong_t arg)
{
        struct fd f = fdget_raw(fd);
        struct flock flock;
        long err = -EBADF;

        if (!f.file)
                return err;

        /*
         * For FMODE_PATH files only commands accepted by check_fcntl_cmd()
         * may proceed; anything else fails with the -EBADF set above.
         */
        if (unlikely(f.file->f_mode & FMODE_PATH)) {
                if (!check_fcntl_cmd(cmd))
                        goto out_put;
        }

        err = security_file_fcntl(f.file, cmd, arg);
        if (err)
                goto out_put;

        switch (cmd) {
        case F_GETLK:
                /* compat_flock in, query, then range-check before copy-out */
                err = get_compat_flock(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
                if (err)
                        break;
                err = fixup_compat_flock(&flock);
                if (!err)
                        err = put_compat_flock(&flock, compat_ptr(arg));
                break;
        case F_GETLK64:
        case F_OFD_GETLK:
                /* compat_flock64 in and out; no fixup step on this path */
                err = get_compat_flock64(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
                if (!err)
                        err = put_compat_flock64(&flock, compat_ptr(arg));
                break;
        case F_SETLK:
        case F_SETLKW:
                /* compat_flock in; nothing is copied back to user space */
                err = get_compat_flock(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                /* compat_flock64 in; nothing is copied back to user space */
                err = get_compat_flock64(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
                break;
        default:
                /* no layout translation needed: use the generic handler */
                err = do_fcntl(fd, cmd, arg, f.file);
                break;
        }
out_put:
        /* drop the reference taken by fdget_raw() */
        fdput(f);
        return err;
}

/*
 * Compat fcntl64() system call: accepts every command, including the
 * 64-bit and OFD locking ones, and hands it to do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        return do_compat_fcntl64(fd, cmd, arg);
}

/*
 * Compat fcntl() system call.  The 64-bit record-lock commands and the OFD
 * lock commands are rejected with -EINVAL on this entry point; all other
 * commands are passed on to do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        switch (cmd) {
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                return -EINVAL;
        }
        return do_compat_fcntl64(fd, cmd, arg);
}
#endif

/*
 * Table to convert sigio signal codes into poll band bitmaps.
 *
 * Indexed by (reason - POLL_IN); send_sigio_to_task() passes the selected
 * entry through mangle_poll() to fill si_band.  The trailing comment on each
 * row names the POLL_* code that row is meant for.
 */

static const __poll_t band_table[NSIGPOLL] = {
        EPOLLIN | EPOLLRDNORM,                        /* POLL_IN */
        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,        /* POLL_OUT */
        EPOLLIN | EPOLLRDNORM | EPOLLMSG,                /* POLL_MSG */
        EPOLLERR,                                /* POLL_ERR */
        EPOLLPRI | EPOLLRDBAND,                        /* POLL_PRI */
        EPOLLHUP | EPOLLERR                        /* POLL_HUP */
};

/*
 * Decide whether the file owner described by @fown may send signal @sig to
 * task @p.
 *
 * The owner passes the credential check when its euid is the global root
 * uid, or when its euid or uid equals the target's saved uid or real uid.
 * Only then is the security hook consulted, and it must not object.
 *
 * Returns 1 if the signal may be sent, 0 otherwise.  The target's
 * credentials are read under rcu_read_lock().
 */
static inline int sigio_perm(struct task_struct *p,
                             struct fown_struct *fown, int sig)
{
        const struct cred *tcred;
        int allowed = 0;

        rcu_read_lock();
        tcred = __task_cred(p);
        if (uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
            uid_eq(fown->euid, tcred->suid) ||
            uid_eq(fown->euid, tcred->uid) ||
            uid_eq(fown->uid, tcred->suid) ||
            uid_eq(fown->uid, tcred->uid))
                allowed = !security_file_send_sigiotask(p, fown, sig);
        rcu_read_unlock();

        return allowed;
}

/*
 * Deliver an I/O notification signal for @fd to task @p on behalf of the
 * file owner @fown.
 *
 * @reason: POLL_* code describing the event; used as si_code and to select
 *          the si_band value from band_table
 * @type:   pid type forwarded to do_send_sig_info()
 *
 * Nothing is sent unless sigio_perm() approves.  When the owner has set a
 * signal number, a signal carrying full siginfo (fd, band, code) is tried
 * first; if no signal number is set, or if that send fails, plain SIGIO is
 * sent instead.
 */
static void send_sigio_to_task(struct task_struct *p,
                               struct fown_struct *fown,
                               int fd, int reason, enum pid_type type)
{
        /*
         * F_SETSIG can change ->signum lockless in parallel, make
         * sure we read it once and use the same value throughout.
         */
        int signum = READ_ONCE(fown->signum);

        if (!sigio_perm(p, fown, signum))
                return;

        switch (signum) {
                default: {
                        kernel_siginfo_t si;

                        /* Queue a rt signal with the appropriate fd as its
                           value.  We use SI_SIGIO as the source, not
                           SI_KERNEL, since kernel signals always get
                           delivered even if we can't queue.  Failure to
                           queue in this case _should_ be reported; we fall
                           back to SIGIO in that case. --sct */
                        clear_siginfo(&si);
                        si.si_signo = signum;
                        si.si_errno = 0;
                        si.si_code  = reason;
                        /*
                         * POSIX defines POLL_IN and friends to be signal
                         * specific si_codes for SIGPOLL.  Linux extended
                         * these si_codes to other signals in a way that is
                         * ambiguous if other signals also have signal
                         * specific si_codes.  In that case use SI_SIGIO instead
                         * to remove the ambiguity.
                         */
                        if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
                                si.si_code = SI_SIGIO;

                        /* Make sure we are called with one of the POLL_*
                           reasons, otherwise we could leak kernel stack into
                           userspace.  */
                        BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
                        /*
                         * NOTE(review): with the BUG_ON above active, the
                         * first branch below cannot be taken; presumably it
                         * is kept as a fallback for builds where BUG_ON
                         * compiles to nothing - confirm.
                         */
                        if (reason - POLL_IN >= NSIGPOLL)
                                si.si_band  = ~0L;
                        else
                                si.si_band = mangle_poll(band_table[reason - POLL_IN]);
                        si.si_fd    = fd;
                        /* a zero return means the signal was sent: done */
                        if (!do_send_sig_info(signum, &si, p, type))
                                break;
                }
                        fallthrough;        /* fall back on the old plain SIGIO signal */
                case 0:
                        do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
        }
}

/*
 * Send the I/O notification for @fd to whoever owns the file according to
 * @fown.
 *
 * @band: POLL_* reason code, passed through to send_sigio_to_task()
 *
 * Does nothing when no owner pid is set.  The owner fields are read with
 * fown->lock held for reading and interrupts disabled.
 */
void send_sigio(struct fown_struct *fown, int fd, int band)
{
        struct task_struct *p;
        enum pid_type type;
        unsigned long flags;
        struct pid *pid;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        if (type <= PIDTYPE_TGID) {
                /*
                 * Single-target case: look the task up by PIDTYPE_PID
                 * under RCU and signal it if it still exists.
                 */
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigio_to_task(p, fown, fd, band, type);
                rcu_read_unlock();
        } else {
                /*
                 * Other pid types: walk every task attached to the pid for
                 * this type, holding tasklist_lock for reading.
                 */
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigio_to_task(p, fown, fd, band, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
}

/*
 * Send SIGURG to task @p on behalf of the file owner @fown, provided
 * sigio_perm() allows it; otherwise do nothing.  @type is forwarded to
 * do_send_sig_info().
 */
static void send_sigurg_to_task(struct task_struct *p,
                                struct fown_struct *fown, enum pid_type type)
{
        if (!sigio_perm(p, fown, SIGURG))
                return;

        do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}

/*
 * Send SIGURG to whoever owns the file according to @fown.
 *
 * Returns 0 when no owner pid is set and 1 otherwise; the return value
 * does not reflect whether any signal was actually delivered.  The owner
 * fields are read with fown->lock held for reading and interrupts disabled.
 */
int send_sigurg(struct fown_struct *fown)
{
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
        unsigned long flags;
        int ret = 0;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        /* an owner is configured, whatever happens to the delivery below */
        ret = 1;

        if (type <= PIDTYPE_TGID) {
                /*
                 * Single-target case: look the task up by PIDTYPE_PID
                 * under RCU and signal it if it still exists.
                 */
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigurg_to_task(p, fown, type);
                rcu_read_unlock();
        } else {
                /*
                 * Other pid types: walk every task attached to the pid for
                 * this type, holding tasklist_lock for reading.
                 */
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigurg_to_task(p, fown, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
        return ret;
}

/*
 * fasync_lock is held (inside filp->f_lock) while fasync_remove_entry() and
 * fasync_insert_entry() walk and modify a fasync list.
 */
static DEFINE_SPINLOCK(fasync_lock);
/* Slab cache for struct fasync_struct; created in fcntl_init(). */
static struct kmem_cache *fasync_cache __read_mostly;

/*
 * RCU callback (queued by fasync_remove_entry() via call_rcu()): recover the
 * fasync_struct that embeds @head and return it to fasync_cache.
 */
static void fasync_free_rcu(struct rcu_head *head)
{
        struct fasync_struct *fa;

        fa = container_of(head, struct fasync_struct, fa_rcu);
        kmem_cache_free(fasync_cache, fa);
}

/*
 * Remove a fasync entry. If successfully removed, return
 * positive and clear the FASYNC flag. If no entry exists,
 * do nothing and return 0.
 *
 * @filp: file whose entry should be removed
 * @fapp: head pointer of the fasync list to search
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 *
 * Takes filp->f_lock, then fasync_lock, for the whole operation.
 */
int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fa, **fp;
        int result = 0;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                /*
                 * Clear fa_file under the entry's write lock so a concurrent
                 * kill_fasync_rcu(), which checks fa_file under the read
                 * lock, stops signalling through this entry.
                 */
                write_lock_irq(&fa->fa_lock);
                fa->fa_file = NULL;
                write_unlock_irq(&fa->fa_lock);

                /* unlink now, free via call_rcu() later */
                *fp = fa->fa_next;
                call_rcu(&fa->fa_rcu, fasync_free_rcu);
                filp->f_flags &= ~FASYNC;
                result = 1;
                break;
        }
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return result;
}

/*
 * Allocate a fasync_struct from fasync_cache with GFP_KERNEL.
 * Returns whatever kmem_cache_alloc() returns; fasync_add_entry() treats a
 * NULL result as an allocation failure.
 */
struct fasync_struct *fasync_alloc(void)
{
        struct fasync_struct *fa;

        fa = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
        return fa;
}

/*
 * Return an entry to fasync_cache immediately.
 *
 * NOTE! This can be used only for unused fasync entries:
 * entries that actually got inserted on the fasync list
 * need to be released by rcu - see fasync_remove_entry.
 */
void fasync_free(struct fasync_struct *new)
{
        kmem_cache_free(fasync_cache, new);
}

/*
 * Insert a new entry into the fasync list.  Return the pointer to the
 * old one if we didn't use the new one.
 *
 * @fd:   descriptor number to record in the entry
 * @filp: file the entry belongs to
 * @fapp: head pointer of the fasync list
 * @new:  preallocated entry, consumed only when no entry for @filp exists
 *
 * If @filp already has an entry, only its fa_fd is updated and that entry
 * is returned; the caller still owns @new.  Otherwise @new is initialised,
 * linked at the head of the list, FASYNC is set in filp->f_flags and NULL
 * is returned.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
{
        struct fasync_struct *fa, **fp;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                /* existing entry: just refresh the recorded descriptor */
                write_lock_irq(&fa->fa_lock);
                fa->fa_fd = fd;
                write_unlock_irq(&fa->fa_lock);
                goto out;
        }

        /* the loop ended with fa == NULL, which is what gets returned below */
        rwlock_init(&new->fa_lock);
        new->magic = FASYNC_MAGIC;
        new->fa_file = filp;
        new->fa_fd = fd;
        new->fa_next = *fapp;
        /* initialise the entry fully before publishing it at the list head */
        rcu_assign_pointer(*fapp, new);
        filp->f_flags |= FASYNC;

out:
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return fa;
}

/*
 * Add a fasync entry for @filp to the list at @fapp.
 *
 * Returns -ENOMEM if no entry could be allocated, 1 if a new entry was
 * added, and 0 if an entry already existed and was merely updated.
 */
static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *new = fasync_alloc();
        int added = 1;

        if (!new)
                return -ENOMEM;

        /*
         * A non-NULL result from fasync_insert_entry() is the pre-existing
         * entry, which it updated in place.  Our freshly allocated entry was
         * never linked in that case, so it can be freed directly, and the
         * caller is told that nothing was added.
         */
        if (fasync_insert_entry(fd, filp, fapp, new)) {
                fasync_free(new);
                added = 0;
        }

        return added;
}

/*
 * fasync_helper() is used by almost all character device drivers
 * to set up the fasync queue, and for regular files by the file
 * lease code.
 *
 * A non-zero @on adds (or updates) the entry for @filp on the list at
 * @fapp; a zero @on removes it.  Returns negative on error, 0 if nothing
 * was added or deleted, and positive if an entry was added or deleted.
 */
int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
{
        if (on)
                return fasync_add_entry(fd, filp, fapp);

        return fasync_remove_entry(filp, fapp);
}

EXPORT_SYMBOL(fasync_helper);

/*
 * Walk the fasync list starting at @fa and notify the owner of every file
 * still attached to an entry.
 *
 * @sig:  signal the caller is raising; only compared against SIGURG here
 * @band: POLL_* reason code handed to send_sigio()
 *
 * rcu_read_lock() is held
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
        while (fa) {
                struct fown_struct *fown;
                unsigned long flags;

                /* sanity check: stop walking if the entry looks corrupted */
                if (fa->magic != FASYNC_MAGIC) {
                        printk(KERN_ERR "kill_fasync: bad magic number in "
                               "fasync_struct!\n");
                        return;
                }
                read_lock_irqsave(&fa->fa_lock, flags);
                /* fa_file is NULL once fasync_remove_entry() detached it */
                if (fa->fa_file) {
                        fown = &fa->fa_file->f_owner;
                        /* Don't send SIGURG to processes which have not set a
                           queued signum: SIGURG has its own default signalling
                           mechanism. */
                        if (!(sig == SIGURG && fown->signum == 0))
                                send_sigio(fown, fa->fa_fd, band);
                }
                read_unlock_irqrestore(&fa->fa_lock, flags);
                fa = rcu_dereference(fa->fa_next);
        }
}

/*
 * Notify every owner registered on the fasync list at @fp.
 *
 * @sig and @band are passed through to kill_fasync_rcu(), which walks the
 * list inside an RCU read-side critical section.
 */
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
        /*
         * Cheap unlocked peek first: the list is usually empty, in which
         * case there is nothing to do.
         */
        if (!*fp)
                return;

        rcu_read_lock();
        kill_fasync_rcu(rcu_dereference(*fp), sig, band);
        rcu_read_unlock();
}
EXPORT_SYMBOL(kill_fasync);

/*
 * Boot-time initialisation, registered through module_init() below:
 * verifies at build time that the open-flag bits do not overlap and creates
 * the slab cache used for fasync entries.  Always returns 0.
 */
static int __init fcntl_init(void)
{
        /*
         * Please add new bits here to ensure allocation uniqueness.
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         *
         * The check counts the set bits in the combined mask; the build
         * fails unless that count is exactly 21 - 1.
         */
        BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
                HWEIGHT32(
                        (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
                        __FMODE_EXEC | __FMODE_NONOTIFY));

        /*
         * NOTE(review): the result is not checked here; presumably
         * SLAB_PANIC makes cache creation failure fatal - confirm.
         */
        fasync_cache = kmem_cache_create("fasync_cache",
                sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
        return 0;
}

module_init(fcntl_init)










































































































































































































































































































































































































































































































































































    2 


























    1 








    1 




















































































































































































































































































































































    1 





































































































































    1 








































































































    1 







































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>

/* ========================================================================
 *         Netlink Messages and Attributes Interface (As Seen On TV)
 * ------------------------------------------------------------------------
 *                          Messages Interface
 * ------------------------------------------------------------------------
 *
 * Message Format:
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 *
 * Payload Format:
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 *
 * Data Structures:
 *   struct nlmsghdr                        netlink message header
 *
 * Message Construction:
 *   nlmsg_new()                        create a new netlink message
 *   nlmsg_put()                        add a netlink message to an skb
 *   nlmsg_put_answer()                        callback based nlmsg_put()
 *   nlmsg_end()                        finalize netlink message
 *   nlmsg_get_pos()                        return current position in message
 *   nlmsg_trim()                        trim part of message
 *   nlmsg_cancel()                        cancel message construction
 *   nlmsg_free()                        free a netlink message
 *
 * Message Sending:
 *   nlmsg_multicast()                        multicast message to several groups
 *   nlmsg_unicast()                        unicast a message to a single socket
 *   nlmsg_notify()                        send notification message
 *
 * Message Length Calculations:
 *   nlmsg_msg_size(payload)                length of message w/o padding
 *   nlmsg_total_size(payload)                length of message w/ padding
 *   nlmsg_padlen(payload)                length of padding at tail
 *
 * Message Payload Access:
 *   nlmsg_data(nlh)                        head of message payload
 *   nlmsg_len(nlh)                        length of message payload
 *   nlmsg_attrdata(nlh, hdrlen)        head of attributes data
 *   nlmsg_attrlen(nlh, hdrlen)                length of attributes data
 *
 * Message Parsing:
 *   nlmsg_ok(nlh, remaining)                does nlh fit into remaining bytes?
 *   nlmsg_next(nlh, remaining)                get next netlink message
 *   nlmsg_parse()                        parse attributes of a message
 *   nlmsg_find_attr()                        find an attribute in a message
 *   nlmsg_for_each_msg()                loop over all messages
 *   nlmsg_validate()                        validate netlink message incl. attrs
 *   nlmsg_for_each_attr()                loop over all attributes
 *
 * Misc:
 *   nlmsg_report()                        report back to application?
 *
 * ------------------------------------------------------------------------
 *                          Attributes Interface
 * ------------------------------------------------------------------------
 *
 * Attribute Format:
 *    <------- nla_total_size(payload) ------->
 *    <---- nla_attr_size(payload) ----->
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *   |  Header  | Pad |     Payload      | Pad |  Header
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *                     <- nla_len(nla) ->      ^
 *   nla_data(nla)----^                        |
 *   nla_next(nla)-----------------------------'
 *
 * Data Structures:
 *   struct nlattr                        netlink attribute header
 *
 * Attribute Construction:
 *   nla_reserve(skb, type, len)        reserve room for an attribute
 *   nla_reserve_nohdr(skb, len)        reserve room for an attribute w/o hdr
 *   nla_put(skb, type, len, data)        add attribute to skb
 *   nla_put_nohdr(skb, len, data)        add attribute w/o hdr
 *   nla_append(skb, len, data)                append data to skb
 *
 * Attribute Construction for Basic Types:
 *   nla_put_u8(skb, type, value)        add u8 attribute to skb
 *   nla_put_u16(skb, type, value)        add u16 attribute to skb
 *   nla_put_u32(skb, type, value)        add u32 attribute to skb
 *   nla_put_u64_64bit(skb, type,
 *                     value, padattr)        add u64 attribute to skb
 *   nla_put_s8(skb, type, value)        add s8 attribute to skb
 *   nla_put_s16(skb, type, value)        add s16 attribute to skb
 *   nla_put_s32(skb, type, value)        add s32 attribute to skb
 *   nla_put_s64(skb, type, value,
 *               padattr)                add s64 attribute to skb
 *   nla_put_string(skb, type, str)        add string attribute to skb
 *   nla_put_flag(skb, type)                add flag attribute to skb
 *   nla_put_msecs(skb, type, jiffies,
 *                 padattr)                add msecs attribute to skb
 *   nla_put_in_addr(skb, type, addr)        add IPv4 address attribute to skb
 *   nla_put_in6_addr(skb, type, addr)        add IPv6 address attribute to skb
 *
 * Nested Attributes Construction:
 *   nla_nest_start(skb, type)                start a nested attribute
 *   nla_nest_end(skb, nla)                finalize a nested attribute
 *   nla_nest_cancel(skb, nla)                cancel nested attribute construction
 *
 * Attribute Length Calculations:
 *   nla_attr_size(payload)                length of attribute w/o padding
 *   nla_total_size(payload)                length of attribute w/ padding
 *   nla_padlen(payload)                length of padding
 *
 * Attribute Payload Access:
 *   nla_data(nla)                        head of attribute payload
 *   nla_len(nla)                        length of attribute payload
 *
 * Attribute Payload Access for Basic Types:
 *   nla_get_u8(nla)                        get payload for a u8 attribute
 *   nla_get_u16(nla)                        get payload for a u16 attribute
 *   nla_get_u32(nla)                        get payload for a u32 attribute
 *   nla_get_u64(nla)                        get payload for a u64 attribute
 *   nla_get_s8(nla)                        get payload for a s8 attribute
 *   nla_get_s16(nla)                        get payload for a s16 attribute
 *   nla_get_s32(nla)                        get payload for a s32 attribute
 *   nla_get_s64(nla)                        get payload for a s64 attribute
 *   nla_get_flag(nla)                        return 1 if flag is true
 *   nla_get_msecs(nla)                        get payload for a msecs attribute
 *
 * Attribute Misc:
 *   nla_memcpy(dest, nla, count)        copy attribute into memory
 *   nla_memcmp(nla, data, size)        compare attribute with memory area
 *   nla_strlcpy(dst, nla, size)        copy attribute to a sized string
 *   nla_strcmp(nla, str)                compare attribute with string
 *
 * Attribute Parsing:
 *   nla_ok(nla, remaining)                does nla fit into remaining bytes?
 *   nla_next(nla, remaining)                get next netlink attribute
 *   nla_validate()                        validate a stream of attributes
 *   nla_validate_nested()                validate a stream of nested attributes
 *   nla_find()                                find attribute in stream of attributes
 *   nla_find_nested()                        find attribute in nested attributes
 *   nla_parse()                        parse and validate stream of attrs
 *   nla_parse_nested()                        parse nested attributes
 *   nla_for_each_attr()                loop over all attributes
 *   nla_for_each_nested()                loop over the nested attributes
 *=========================================================================
 */

 /**
  * Standard attribute types to specify validation policy
  */
enum {
        /* No specific type given in the policy entry */
        NLA_UNSPEC,
        /* Unsigned integers, read with nla_get_u8() .. nla_get_u64() */
        NLA_U8,
        NLA_U16,
        NLA_U32,
        NLA_U64,
        /* String; policy `len' is the maximum length of the string */
        NLA_STRING,
        /* Flag attribute, see nla_get_flag(); policy `len' is unused */
        NLA_FLAG,
        /* Read with nla_get_msecs() */
        NLA_MSECS,
        /* Nested attributes directly inside the attribute */
        NLA_NESTED,
        /* Like NLA_NESTED, but the nested attributes sit one level further down */
        NLA_NESTED_ARRAY,
        /* String; policy `len' is the maximum length excluding the NUL */
        NLA_NUL_STRING,
        /* Opaque payload; policy `len' is the maximum payload length */
        NLA_BINARY,
        /* Signed integers, read with nla_get_s8() .. nla_get_s64() */
        NLA_S8,
        NLA_S16,
        NLA_S32,
        NLA_S64,
        /* 32-bit bitmap/bitselector attribute */
        NLA_BITFIELD32,
        /* Attribute that is always rejected */
        NLA_REJECT,
        /* Treated as unsigned integer types by __NLA_IS_UINT_TYPE() */
        NLA_BE16,
        NLA_BE32,
        /* Terminator: one past the last real type, keep it last */
        __NLA_TYPE_MAX,
};

/* Highest valid attribute type value (the enumerator just before __NLA_TYPE_MAX) */
#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)

/*
 * Unsigned min/max pair for range validation.  Referenced through the
 * `range' pointer of struct nla_policy when validation_type is
 * NLA_VALIDATE_RANGE_PTR (see NLA_POLICY_FULL_RANGE()).
 */
struct netlink_range_validation {
        u64 min, max;
};

/*
 * Signed min/max pair for range validation.  Referenced through the
 * `range_signed' pointer of struct nla_policy when validation_type is
 * NLA_VALIDATE_RANGE_PTR (see NLA_POLICY_FULL_RANGE_SIGNED()).
 */
struct netlink_range_validation_signed {
        s64 min, max;
};

/*
 * Values for the `validation_type' member of struct nla_policy.  They
 * select which member of the policy's validation union is meaningful.
 */
enum nla_policy_validation {
        /* Value 0: what a policy entry gets when validation_type is not set */
        NLA_VALIDATE_NONE,
        /* `min' and `max' are both used, set by NLA_POLICY_RANGE() */
        NLA_VALIDATE_RANGE,
        /* `min' and `max' are both set, used by NLA_POLICY_EXACT_LEN_WARN() */
        NLA_VALIDATE_RANGE_WARN_TOO_LONG,
        /* Only `min' is used, set by NLA_POLICY_MIN() */
        NLA_VALIDATE_MIN,
        /* Only `max' is used, set by NLA_POLICY_MAX() */
        NLA_VALIDATE_MAX,
        /* `mask' is used, set by NLA_POLICY_MASK() */
        NLA_VALIDATE_MASK,
        /* `range' or `range_signed' pointer is used, NLA_POLICY_FULL_RANGE*() */
        NLA_VALIDATE_RANGE_PTR,
        /* `validate' callback is used, set by NLA_POLICY_VALIDATE_FN() */
        NLA_VALIDATE_FUNCTION,
};

/**
 * struct nla_policy - attribute validation policy
 * @type: Type of attribute or NLA_UNSPEC
 * @validation_type: type of attribute validation done in addition to
 *        type-specific validation (e.g. range, function call), see
 *        &enum nla_policy_validation
 * @len: Type specific length of payload
 *
 * Policies are defined as arrays of this struct, the array must be
 * accessible by attribute type up to the highest identifier to be expected.
 *
 * Meaning of `len' field:
 *    NLA_STRING           Maximum length of string
 *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
 *    NLA_FLAG             Unused
 *    NLA_BINARY           Maximum length of attribute payload
 *                         (but see also below with the validation type)
 *    NLA_NESTED,
 *    NLA_NESTED_ARRAY     Length verification is done by checking len of
 *                         nested header (or empty); len field is used if
 *                         nested_policy is also used, for the max attr
 *                         number in the nested policy.
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_BE16, NLA_BE32,
 *    NLA_MSECS            Leaving the length field zero will verify the
 *                         given type fits, using it verifies minimum length
 *                         just like "All other"
 *    NLA_BITFIELD32       Unused
 *    NLA_REJECT           Unused
 *    All other            Minimum length of attribute payload
 *
 * Meaning of validation union:
 *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
 *                         `bitfield32_valid' is the u32 value of valid flags
 *    NLA_REJECT           This attribute is always rejected and `reject_message'
 *                         may point to a string to report as the error instead
 *                         of the generic one in extended ACK.
 *    NLA_NESTED           `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED() macro.
 *                         Note that nla_parse() will validate, but of course not
 *                         parse, the nested sub-policies.
 *    NLA_NESTED_ARRAY     `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED_ARRAY() macro.
 *                         The difference to NLA_NESTED is the structure:
 *                         NLA_NESTED has the nested attributes directly inside
 *                         while an array has the nested attributes at another
 *                         level down and the attribute types directly in the
 *                         nesting don't matter.
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64,
 *    NLA_BE16,
 *    NLA_BE32,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              The `min' and `max' fields are used depending on the
 *                         validation_type field, if that is min/max/range then
 *                         the min, max or both are used (respectively) to check
 *                         the value of the integer attribute.
 *                         Note that in the interest of code simplicity and
 *                         struct size both limits are s16, so you cannot
 *                         enforce a range that doesn't fall within the range
 *                         of s16 - do that as usual in the code instead.
 *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
 *                         NLA_POLICY_RANGE() macros.
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
 *                         to a struct netlink_range_validation that indicates
 *                         the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE().
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
 *                         pointer to a struct netlink_range_validation_signed
 *                         that indicates the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE_SIGNED().
 *
 *    NLA_BINARY           If the validation type is like the ones for integers
 *                         above, then the min/max length (not value like for
 *                         integers) of the attribute is enforced.
 *
 *    All other            Unused - but note that it's a union
 *
 * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
 *    NLA_BINARY           Validation function called for the attribute.
 *    All other            Unused - but note that it's a union
 *
 * Example:
 *
 * static const u32 myvalidflags = 0xff231023;
 *
 * static const struct nla_policy my_policy[ATTR_MAX+1] = {
 *        [ATTR_FOO] = { .type = NLA_U16 },
 *        [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
 *        [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
 *        [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
 * };
 */
struct nla_policy {
        /* One of the NLA_* attribute types (NLA_UNSPEC .. NLA_TYPE_MAX) */
        u8                type;
        /* One of enum nla_policy_validation; selects the union member below */
        u8                validation_type;
        /* Type specific length of the payload */
        u16                len;
        union {
                /**
                 * @strict_start_type: first attribute to validate strictly
                 *
                 * This entry is special, and used for the attribute at index 0
                 * only, and specifies special data about the policy, namely it
                 * specifies the "boundary type" where strict length validation
                 * starts for any attribute types >= this value, also, strict
                 * nesting validation starts here.
                 *
                 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
                 * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
                 * get the previous pure { .len = xyz } behaviour. The advantage
                 * of this is that types not specified in the policy will be
                 * rejected.
                 *
                 * For completely new families it should be set to 1 so that the
                 * validation is enforced for all attributes. For existing ones
                 * it should be set at least when new attributes are added to
                 * the enum used by the policy, and be set to the new value that
                 * was added to enforce strict validation from thereon.
                 */
                u16 strict_start_type;

                /* private: use NLA_POLICY_*() to set */
                /* NLA_BITFIELD32: u32 value of the valid flags */
                const u32 bitfield32_valid;
                /* NLA_VALIDATE_MASK: set by NLA_POLICY_MASK() */
                const u32 mask;
                /* NLA_REJECT: optional message reported in the extended ACK */
                const char *reject_message;
                /* NLA_NESTED, NLA_NESTED_ARRAY: policy for the nested attributes */
                const struct nla_policy *nested_policy;
                /* NLA_VALIDATE_RANGE_PTR with unsigned types */
                struct netlink_range_validation *range;
                /* NLA_VALIDATE_RANGE_PTR with signed types */
                struct netlink_range_validation_signed *range_signed;
                /* NLA_VALIDATE_MIN/MAX/RANGE: limits, restricted to s16 */
                struct {
                        s16 min, max;
                };
                /* NLA_VALIDATE_FUNCTION: callback set by NLA_POLICY_VALIDATE_FN() */
                int (*validate)(const struct nlattr *attr,
                                struct netlink_ext_ack *extack);
        };
};

/*
 * Policies for NLA_BINARY attributes whose payload length must be ETH_ALEN.
 * The _COMPAT variant expands to NLA_POLICY_EXACT_LEN_WARN(), i.e. it uses
 * NLA_VALIDATE_RANGE_WARN_TOO_LONG instead of NLA_VALIDATE_RANGE.
 */
#define NLA_POLICY_ETH_ADDR                NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT        NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)

/*
 * Initializers for nested and bitfield policy entries.
 *
 * The _NLA_POLICY_NESTED*() forms take the maximum attribute number of the
 * nested policy explicitly and store it in `len'.  The NLA_POLICY_NESTED*()
 * forms derive it as ARRAY_SIZE(policy) - 1, so `policy' must be a real
 * array there, not a pointer.
 *
 * NLA_POLICY_BITFIELD32() stores `valid' as the set of valid flag bits.
 */
#define _NLA_POLICY_NESTED(maxattr, policy) \
        { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
        { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
        _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
        _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_BITFIELD32(valid) \
        { .type = NLA_BITFIELD32, .bitfield32_valid = valid }

/*
 * Build-time type checks used by the NLA_POLICY_*() initializers.
 *
 * __NLA_IS_UINT_TYPE()/__NLA_IS_SINT_TYPE() test whether `tp' is one of the
 * unsigned (including NLA_BE16/NLA_BE32) or signed integer attribute types.
 * `tp' is expanded several times without parentheses, so pass a plain NLA_*
 * constant.
 */
#define __NLA_IS_UINT_TYPE(tp)                                        \
        (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 ||        \
         tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32)
#define __NLA_IS_SINT_TYPE(tp)                                                \
        (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64)

/*
 * __NLA_ENSURE() wraps BUILD_BUG_ON_ZERO() on the negated condition; each
 * NLA_ENSURE_*() macro adds `tp' to that result, so it can be used directly
 * as the value of `.type' in an initializer.
 * NOTE(review): relies on BUILD_BUG_ON_ZERO() evaluating to 0 when the
 * check passes - confirm against its definition.
 */
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
/* Accepts only the unsigned integer types */
#define NLA_ENSURE_UINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp)
/* Accepts the unsigned integer types, NLA_MSECS and NLA_BINARY */
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||        \
                      tp == NLA_MSECS ||                \
                      tp == NLA_BINARY) + tp)
/* Accepts only the signed integer types */
#define NLA_ENSURE_SINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp)
/* Accepts signed and unsigned integer types, NLA_MSECS and NLA_BINARY */
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                \
                      __NLA_IS_SINT_TYPE(tp) ||                \
                      tp == NLA_MSECS ||                \
                      tp == NLA_BINARY) + tp)
/*
 * Rejects the types whose policy entries already use another member of the
 * validation union (bitfield32_valid, reject_message, nested_policy).
 */
#define NLA_ENSURE_NO_VALIDATION_PTR(tp)                \
        (__NLA_ENSURE(tp != NLA_BITFIELD32 &&                \
                      tp != NLA_REJECT &&                \
                      tp != NLA_NESTED &&                \
                      tp != NLA_NESTED_ARRAY) + tp)

/*
 * Policy entry with both a lower and an upper limit (NLA_VALIDATE_RANGE).
 * `tp' must be an integer type, NLA_MSECS or NLA_BINARY.  The limits are
 * stored in the s16 `min'/`max' members, so they must fit into s16.
 */
#define NLA_POLICY_RANGE(tp, _min, _max) {                \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE,                \
        .min = _min,                                        \
        .max = _max                                        \
}

/*
 * Policy entry whose limits live in a separate structure
 * (NLA_VALIDATE_RANGE_PTR).  `_range' is a pointer to a
 * struct netlink_range_validation; `tp' must be an unsigned integer type,
 * NLA_MSECS or NLA_BINARY.
 */
#define NLA_POLICY_FULL_RANGE(tp, _range) {                \
        .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range = _range,                                \
}

/*
 * Signed counterpart: `_range' is a pointer to a
 * struct netlink_range_validation_signed and `tp' must be a signed
 * integer type.
 */
#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) {        \
        .type = NLA_ENSURE_SINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range_signed = _range,                                \
}

/*
 * Policy entry with only a lower limit (NLA_VALIDATE_MIN).  `_min' is
 * stored in the s16 `min' member.
 */
#define NLA_POLICY_MIN(tp, _min) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MIN,                \
        .min = _min,                                        \
}

/*
 * Policy entry with only an upper limit (NLA_VALIDATE_MAX).  `_max' is
 * stored in the s16 `max' member.
 */
#define NLA_POLICY_MAX(tp, _max) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MAX,                \
        .max = _max,                                        \
}

/*
 * Policy entry carrying a u32 mask (NLA_VALIDATE_MASK); only unsigned
 * integer types are accepted.
 * NOTE(review): presumably the mask lists the bits an attribute value may
 * set - confirm against the validation code.
 */
#define NLA_POLICY_MASK(tp, _mask) {                        \
        .type = NLA_ENSURE_UINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_MASK,                \
        .mask = _mask,                                        \
}

/*
 * Policy entry validated by a callback (NLA_VALIDATE_FUNCTION).  `fn' must
 * match the `validate' member of struct nla_policy.  An optional third
 * argument sets `len'; when it is omitted the expression collapses to
 * `+ 0', i.e. len = 0.  Types that already use another union member
 * (bitfield32, reject, nested) are refused at build time.
 */
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {                \
        .type = NLA_ENSURE_NO_VALIDATION_PTR(tp),        \
        .validation_type = NLA_VALIDATE_FUNCTION,        \
        .validate = fn,                                        \
        .len = __VA_ARGS__ + 0,                                \
}

/*
 * Length policies for NLA_BINARY attributes.  For NLA_BINARY the min/max
 * limits apply to the attribute length rather than to a value.
 *
 * NLA_POLICY_EXACT_LEN()      min == max == _len, NLA_VALIDATE_RANGE
 * NLA_POLICY_EXACT_LEN_WARN() same limits, but with
 *                             NLA_VALIDATE_RANGE_WARN_TOO_LONG
 * NLA_POLICY_MIN_LEN()        only a minimum length, NLA_VALIDATE_MIN
 *
 * `_len' ends up in the s16 min/max members and must fit into s16.
 */
#define NLA_POLICY_EXACT_LEN(_len)        NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
#define NLA_POLICY_EXACT_LEN_WARN(_len) {                        \
        .type = NLA_BINARY,                                        \
        .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG,        \
        .min = _len,                                                \
        .max = _len                                                \
}
#define NLA_POLICY_MIN_LEN(_len)        NLA_POLICY_MIN(NLA_BINARY, _len)

/**
 * struct nl_info - netlink source information
 * @nlh: Netlink message header of original request
 * @nl_net: Network namespace
 * @portid: Netlink PORTID of requesting application
 * @skip_notify: Skip netlink notifications to user space
 * @skip_notify_kernel: Skip selected in-kernel notifications
 *
 * @skip_notify and @skip_notify_kernel are single-bit flags packed into
 * one u8.
 */
struct nl_info {
        struct nlmsghdr                *nlh;
        struct net                *nl_net;
        u32                        portid;
        u8                        skip_notify:1,
                                skip_notify_kernel:1;
};

/**
 * enum netlink_validation - netlink message/attribute validation levels
 * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
 *        extra data at the end of the message, attributes being longer than
 *        they should be, or unknown attributes being present.
 * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
 * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
 *        this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
 * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
 *        This can safely be set by the kernel when the given policy has no
 *        NLA_UNSPEC anymore, and can thus be used to ensure policy entries
 *        are enforced going forward.
 * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
 *        U8, U16, U32 must have exact size, etc.)
 * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
 *        and unset for other policies.
 *
 * Apart from @NL_VALIDATE_LIBERAL (which is 0), every value is a distinct
 * bit, so the levels can be ORed together and passed as the `validate'
 * argument of __nla_parse()/__nla_validate().
 */
enum netlink_validation {
        NL_VALIDATE_LIBERAL = 0,
        NL_VALIDATE_TRAILING = BIT(0),
        NL_VALIDATE_MAXTYPE = BIT(1),
        NL_VALIDATE_UNSPEC = BIT(2),
        NL_VALIDATE_STRICT_ATTRS = BIT(3),
        NL_VALIDATE_NESTED = BIT(4),
};

/*
 * Predefined combinations of enum netlink_validation bits.
 *
 * NL_VALIDATE_DEPRECATED_STRICT rejects trailing data and attributes above
 * the maximum type only; it is what the *_deprecated_strict() helpers pass.
 * NL_VALIDATE_STRICT sets every validation bit; it is what nla_parse(),
 * nla_validate() and nlmsg_parse() pass.
 */
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
                                       NL_VALIDATE_MAXTYPE)
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
                            NL_VALIDATE_MAXTYPE |\
                            NL_VALIDATE_UNSPEC |\
                            NL_VALIDATE_STRICT_ATTRS |\
                            NL_VALIDATE_NESTED)

/*
 * Out-of-line functions; only the prototypes live in this header.
 */

/* Message level entry points */
int netlink_rcv_skb(struct sk_buff *skb,
                    int (*cb)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags);

/*
 * Back ends of the nla_validate*(), nla_parse*() and nlmsg_parse*() inline
 * wrappers; `validate' is a mask of enum netlink_validation bits.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
                int len, const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack);
int nla_policy_len(const struct nla_policy *, int);
/* Find attribute in a stream of attributes */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
/* Copy attribute to a sized string */
size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
/* Copy attribute into memory */
int nla_memcpy(void *dest, const struct nlattr *src, int count);
/* Compare attribute with memory area */
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
/* Compare attribute with string */
int nla_strcmp(const struct nlattr *nla, const char *str);
/*
 * Attribute construction.
 * NOTE(review): by kernel convention the __-prefixed variants presumably
 * skip the tailroom check done by their unprefixed counterparts - confirm
 * against the implementation.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                 int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);

/**************************************************************************
 * Netlink Messages
 **************************************************************************/

/**
 * nlmsg_msg_size - size of a netlink message without trailing padding
 * @payload: number of payload bytes
 *
 * Returns @payload plus the netlink header length (NLMSG_HDRLEN).
 */
static inline int nlmsg_msg_size(int payload)
{
        int size = payload;

        size += NLMSG_HDRLEN;
        return size;
}

/**
 * nlmsg_total_size - size of a netlink message including trailing padding
 * @payload: number of payload bytes
 *
 * Returns nlmsg_msg_size(@payload) rounded up with NLMSG_ALIGN().
 */
static inline int nlmsg_total_size(int payload)
{
        int unpadded = nlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * nlmsg_padlen - number of padding bytes at the tail of a message
 * @payload: number of payload bytes
 *
 * Returns the difference between the padded and the unpadded message size.
 */
static inline int nlmsg_padlen(int payload)
{
        int unpadded = nlmsg_msg_size(payload);
        int padded = nlmsg_total_size(payload);

        return padded - unpadded;
}

/**
 * nlmsg_data - start of the message payload
 * @nlh: netlink message header
 *
 * Returns a pointer NLMSG_HDRLEN bytes past @nlh.
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
        unsigned char *hdr = (unsigned char *) nlh;

        return hdr + NLMSG_HDRLEN;
}

/**
 * nlmsg_len - length of the message payload
 * @nlh: netlink message header
 *
 * Returns the nlmsg_len field of @nlh minus NLMSG_HDRLEN.
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
        int payload = nlh->nlmsg_len - NLMSG_HDRLEN;

        return payload;
}

/**
 * nlmsg_attrdata - start of the attribute data
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header
 *
 * Returns a pointer NLMSG_ALIGN(@hdrlen) bytes past the start of the
 * message payload.
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
                                            int hdrlen)
{
        return (struct nlattr *) ((unsigned char *) nlmsg_data(nlh) +
                                  NLMSG_ALIGN(hdrlen));
}

/**
 * nlmsg_attrlen - length of the attribute data
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header
 *
 * Returns the payload length of @nlh minus NLMSG_ALIGN(@hdrlen).
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
        int payload = nlmsg_len(nlh);

        return payload - NLMSG_ALIGN(hdrlen);
}

/**
 * nlmsg_ok - does the netlink message fit into the remaining bytes?
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns 1 if @remaining can hold at least a message header and the
 * nlmsg_len field of @nlh is neither shorter than a header nor larger than
 * @remaining; returns 0 otherwise.  @nlh is only read once @remaining is
 * known to cover a full header.
 */
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
        if (remaining < (int) sizeof(struct nlmsghdr))
                return 0;

        if (nlh->nlmsg_len < sizeof(struct nlmsghdr))
                return 0;

        return nlh->nlmsg_len <= remaining;
}

/**
 * nlmsg_next - advance to the next netlink message in a stream
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Subtracts the aligned length of the current message from *@remaining and
 * returns a pointer to the message that follows it.  No bounds check is
 * done here; callers are expected to validate the result, e.g. with
 * nlmsg_ok().
 */
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
        int step = NLMSG_ALIGN(nlh->nlmsg_len);
        unsigned char *cur = (unsigned char *) nlh;

        *remaining = *remaining - step;

        return (struct nlmsghdr *) (cur + step);
}

/**
 * nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Walks the attribute stream and records a pointer to each attribute in
 * @tb, indexed by attribute type.  Parsing is done with NL_VALIDATE_STRICT:
 * attributes with a type above @maxtype are rejected, a policy must be
 * given, and attributes are validated in the strictest way possible.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse(struct nlattr **tb, int maxtype,
                            const struct nlattr *head, int len,
                            const struct nla_policy *policy,
                            struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Walks the attribute stream and records a pointer to each attribute in
 * @tb, indexed by attribute type.  Parsing is done with
 * NL_VALIDATE_LIBERAL: attributes with a type above @maxtype are ignored
 * and the policy is not always strictly enforced (only for new
 * attributes).
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
                                       const struct nlattr *head, int len,
                                       const struct nla_policy *policy,
                                       struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Walks the attribute stream and records a pointer to each attribute in
 * @tb, indexed by attribute type.  Parsing is done with
 * NL_VALIDATE_DEPRECATED_STRICT: attributes with a type above @maxtype and
 * trailing data are rejected, but the policy is not completely strictly
 * enforced (only for new attributes).
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
                                              const struct nlattr *head,
                                              int len,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * __nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL, and an "Invalid header length" message in @extack,
 * when the message is too short to hold the netlink header plus @hdrlen
 * bytes.  Otherwise the attributes behind the family specific header are
 * handed to __nla_parse().
 *
 * See nla_parse()
 */
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrlen;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrlen = nlmsg_attrlen(nlh, hdrlen);

        return __nla_parse(tb, maxtype, attrs, attrlen, policy, validate,
                           extack);
}

/**
 * nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_LIBERAL.
 *
 * See nla_parse_deprecated()
 */
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
                                         struct nlattr *tb[], int maxtype,
                                         const struct nla_policy *policy,
                                         struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_DEPRECATED_STRICT.
 *
 * See nla_parse_deprecated_strict()
 */
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_find_attr - find a specific attribute in a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @attrtype: type of attribute to look for
 *
 * Searches the attributes that follow the family specific header using
 * nla_find().
 *
 * Returns the first attribute which matches the specified type.
 */
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
                                             int hdrlen, int attrtype)
{
        const struct nlattr *attrs = nlmsg_attrdata(nlh, hdrlen);
        int attrlen = nlmsg_attrlen(nlh, hdrlen);

        return nla_find(attrs, attrlen, attrtype);
}

/**
 * nla_validate_deprecated - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy.  Validation is
 * done in liberal mode (NL_VALIDATE_LIBERAL).
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
                                          int maxtype,
                                          const struct nla_policy *policy,
                                          struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy.  Validation is
 * done in strict mode (NL_VALIDATE_STRICT).
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nlmsg_validate_deprecated - validate a netlink message including attributes
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Returns -EINVAL, without setting a message in @extack, when the message
 * is too short to hold the netlink header plus @hdrlen bytes.  Otherwise
 * returns the result of validating the attributes in liberal mode
 * (NL_VALIDATE_LIBERAL).
 */
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
                                            int hdrlen, int maxtype,
                                            const struct nla_policy *policy,
                                            struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrlen;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
                return -EINVAL;

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrlen = nlmsg_attrlen(nlh, hdrlen);

        return __nla_validate(attrs, attrlen, maxtype, policy,
                              NL_VALIDATE_LIBERAL, extack);
}



/**
 * nlmsg_report - need to report back to application?
 * @nlh: netlink message header
 *
 * Returns 1 if NLM_F_ECHO is set in the flags of @nlh, i.e. a report back
 * to the application is requested, and 0 otherwise.
 */
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
        return (nlh->nlmsg_flags & NLM_F_ECHO) ? 1 : 0;
}

/**
 * nlmsg_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the attributes that follow the
 * family specific header of @nlh.  @nlh and @hdrlen each appear twice in
 * the expansion, so avoid arguments with side effects.
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
        nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
                          nlmsg_attrlen(nlh, hdrlen), rem)

/**
 * nlmsg_put - Add a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @portid: netlink PORTID of requesting application
 * @seq: sequence number of message
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is too small for the padded
 * message (header plus payload); otherwise returns the result of
 * __nlmsg_put().
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                                         int type, int payload, int flags)
{
        int needed = nlmsg_total_size(payload);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nlmsg_put(skb, portid, seq, type, payload, flags);
}

/**
 * nlmsg_put_answer - Add a new callback based netlink message to an skb
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                int type, int payload,
                                                int flags)
{
        return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                         type, payload, flags);
}

/**
 * nlmsg_new - Allocate a new netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Allocates an skb sized for a netlink header plus @payload, including
 * padding, and returns whatever alloc_skb() returns.
 *
 * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
 * and a good default is needed.
 *
 * NOTE(review): @payload is a size_t but is passed to nlmsg_total_size(),
 * which takes an int.
 */
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
        int size = nlmsg_total_size(payload);

        return alloc_skb(size, flags);
}

/**
 * nlmsg_end - Finalize a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Rewrites nlmsg_len as the distance from @nlh to the current skb tail,
 * so the header covers the appended attributes. Only necessary if
 * attributes have been added to the message.
 */
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        unsigned char *msg_start = (unsigned char *)nlh;
        unsigned char *msg_end = skb_tail_pointer(skb);

        nlh->nlmsg_len = msg_end - msg_start;
}

/**
 * nlmsg_get_pos - return current position in netlink message
 * @skb: socket buffer the message is stored in
 *
 * Returns the skb tail pointer, i.e. the current tail of the message.
 */
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
        unsigned char *tail = skb_tail_pointer(skb);

        return tail;
}

/**
 * nlmsg_trim - Trim message to a mark
 * @skb: socket buffer the message is stored in
 * @mark: mark to trim to
 *
 * Trims the skb so that it ends at @mark. A NULL @mark is a no-op.
 * Warns if @mark lies before skb->data.
 */
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
        if (!mark)
                return;

        WARN_ON((unsigned char *) mark < skb->data);
        skb_trim(skb, (unsigned char *) mark - skb->data);
}

/**
 * nlmsg_cancel - Cancel construction of a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Removes the complete netlink message including all
 * attributes from the socket buffer again, by trimming the
 * skb back to @nlh with nlmsg_trim(). A NULL @nlh leaves the
 * skb untouched.
 */
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        nlmsg_trim(skb, nlh);
}

/**
 * nlmsg_free - free a netlink message
 * @skb: socket buffer of netlink message
 *
 * Thin wrapper that releases @skb through kfree_skb().
 */
static inline void nlmsg_free(struct sk_buff *skb)
{
        kfree_skb(skb);
}

/**
 * nlmsg_multicast - multicast a netlink message
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 *
 * Records @group as the destination group in the skb control block and
 * broadcasts via netlink_broadcast().
 *
 * Returns 0 when netlink_broadcast() returns a positive value, otherwise
 * its return value unchanged.
 */
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
                                  u32 portid, unsigned int group, gfp_t flags)
{
        int err;

        NETLINK_CB(skb).dst_group = group;
        err = netlink_broadcast(sk, skb, portid, group, flags);

        /* Positive results are collapsed to plain success. */
        return err > 0 ? 0 : err;
}

/**
 * nlmsg_unicast - unicast a netlink message
 * @sk: netlink socket to spread message to
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends with MSG_DONTWAIT through netlink_unicast().
 *
 * Returns 0 when netlink_unicast() returns a positive value, otherwise
 * its return value unchanged.
 */
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
        int err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);

        /* Positive results are collapsed to plain success. */
        return err > 0 ? 0 : err;
}

/**
 * nlmsg_for_each_msg - iterate over a stream of messages
 * @pos: loop counter, set to current message
 * @head: head of message stream
 * @len: length of message stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for-loop header: the loop runs while nlmsg_ok(@pos, @rem)
 * holds and advances with nlmsg_next(). @pos and @rem are assigned to and
 * @rem has its address taken, so both must be modifiable lvalues; @pos and
 * @rem are evaluated more than once.
 */
#define nlmsg_for_each_msg(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nlmsg_ok(pos, rem); \
             pos = nlmsg_next(pos, &(rem)))

/**
 * nl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @nlh: netlink message header to write the flag to
 *
 * Compares cb->seq with the value remembered in cb->prev_seq. If a previous
 * value exists (non-zero) and differs, NLM_F_DUMP_INTR is set in the message
 * header flags. cb->prev_seq is then updated to cb->seq in all cases.
 *
 * The correct way to use it is to set cb->seq to the generation counter when
 * all locks for dumping have been acquired, and then call this function for
 * each message that is generated.
 *
 * Note that due to initialisation concerns, 0 is an invalid sequence number
 * and must not be used by code that uses this functionality.
 */
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
                         struct nlmsghdr *nlh)
{
        if (cb->prev_seq) {
                /* A remembered generation exists; flag the dump if it moved. */
                if (cb->seq != cb->prev_seq)
                        nlh->nlmsg_flags |= NLM_F_DUMP_INTR;
        }

        cb->prev_seq = cb->seq;
}

/**************************************************************************
 * Netlink Attributes
 **************************************************************************/

/**
 * nla_attr_size - length of attribute not including padding
 * @payload: length of payload
 *
 * Returns @payload plus the attribute header length (NLA_HDRLEN).
 */
static inline int nla_attr_size(int payload)
{
        return payload + NLA_HDRLEN;
}

/**
 * nla_total_size - total length of attribute including padding
 * @payload: length of payload
 *
 * Returns nla_attr_size(@payload) rounded up with NLA_ALIGN().
 */
static inline int nla_total_size(int payload)
{
        int unpadded = nla_attr_size(payload);

        return NLA_ALIGN(unpadded);
}

/**
 * nla_padlen - length of padding at the tail of attribute
 * @payload: length of payload
 *
 * Returns the difference between the padded and the unpadded attribute
 * size for @payload.
 */
static inline int nla_padlen(int payload)
{
        int padded = nla_total_size(payload);
        int unpadded = nla_attr_size(payload);

        return padded - unpadded;
}

/**
 * nla_type - attribute type
 * @nla: netlink attribute
 *
 * Returns the nla_type field masked with NLA_TYPE_MASK.
 */
static inline int nla_type(const struct nlattr *nla)
{
        int masked = nla->nla_type & NLA_TYPE_MASK;

        return masked;
}

/**
 * nla_data - head of payload
 * @nla: netlink attribute
 *
 * Returns a pointer NLA_HDRLEN bytes past the start of @nla.
 */
static inline void *nla_data(const struct nlattr *nla)
{
        char *hdr = (char *) nla;

        return hdr + NLA_HDRLEN;
}

/**
 * nla_len - length of payload
 * @nla: netlink attribute
 *
 * Returns the nla_len field minus the header length (NLA_HDRLEN).
 */
static inline int nla_len(const struct nlattr *nla)
{
        int total = nla->nla_len;

        return total - NLA_HDRLEN;
}

/**
 * nla_ok - check if the netlink attribute fits into the remaining bytes
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns non-zero only if @remaining can hold an attribute header, the
 * attribute's nla_len is at least a header, and nla_len does not exceed
 * @remaining. @nla is not dereferenced unless the first check passes.
 */
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
        /* Not even room for a header: do not touch @nla. */
        if (remaining < (int) sizeof(*nla))
                return 0;

        /* Claimed length is shorter than the header itself. */
        if (nla->nla_len < sizeof(*nla))
                return 0;

        return nla->nla_len <= remaining;
}

/**
 * nla_next - next netlink attribute in attribute stream
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns the attribute located NLA_ALIGN(nla_len) bytes after @nla and
 * decrements *@remaining by that same aligned size.
 */
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
        unsigned int step = NLA_ALIGN(nla->nla_len);
        struct nlattr *following = (struct nlattr *) ((char *) nla + step);

        *remaining -= step;

        return following;
}

/**
 * nla_find_nested - find attribute in a set of nested attributes
 * @nla: attribute containing the nested attributes
 * @attrtype: type of attribute to look for
 *
 * Runs nla_find() over the payload of @nla.
 *
 * Returns the first attribute which matches the specified type.
 */
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return nla_find(head, len, attrtype);
}

/**
 * nla_parse_nested - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Rejects @nla with -EINVAL (and an extack message) when its type lacks
 * the NLA_F_NESTED flag; otherwise parses the payload of @nla with
 * NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
                                   const struct nlattr *nla,
                                   const struct nla_policy *policy,
                                   struct netlink_ext_ack *extack)
{
        if (nla->nla_type & NLA_F_NESTED)
                return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla),
                                   policy, NL_VALIDATE_STRICT, extack);

        NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
        return -EINVAL;
}

/**
 * nla_parse_nested_deprecated - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses the payload of @nla with NL_VALIDATE_LIBERAL. Unlike
 * nla_parse_nested(), the NLA_F_NESTED flag is not checked here.
 *
 * See nla_parse_deprecated()
 */
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
                                              const struct nlattr *nla,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return __nla_parse(tb, maxtype, head, len, policy,
                           NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_put_u8 - Add a u8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
        /* temporary variables to work around GCC PR81715 with asan-stack=1 */
        u8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u16 - Add a u16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
        /* Local copy so the payload is passed by address. */
        u16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
        /* Local copy so the payload is passed by address. */
        __be16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be16() but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be16 copy = value;

        return nla_put_be16(skb, flagged_type, copy);
}

/**
 * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
        /* Local copy so the payload is passed by address. */
        __le16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u32 - Add a u32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
        /* Local copy so the payload is passed by address. */
        u32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
        /* Local copy so the payload is passed by address. */
        __be32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be32() but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be32 copy = value;

        return nla_put_be32(skb, flagged_type, copy);
}

/**
 * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
        /* Local copy so the payload is passed by address. */
        __le32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
                                    u64 value, int padattr)
{
        /* Local copy so the payload is passed by address. */
        u64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
                               int padattr)
{
        /* Local copy so the payload is passed by address. */
        __be64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Same as nla_put_be64() but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
                                int padattr)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be64 copy = value;

        return nla_put_be64(skb, flagged_type, copy, padattr);
}

/**
 * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
                               int padattr)
{
        /* Local copy so the payload is passed by address. */
        __le64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_s8 - Add a s8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
        /* Local copy so the payload is passed by address. */
        s8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s16 - Add a s16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
        /* Local copy so the payload is passed by address. */
        s16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s32 - Add a s32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
        /* Local copy so the payload is passed by address. */
        s32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
                              int padattr)
{
        /* Local copy so the payload is passed by address. */
        s64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_string - Add a string netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @str: NUL terminated string
 *
 * The attribute length is strlen(@str) + 1, i.e. the terminating NUL is
 * part of the payload. Returns the result of nla_put().
 */
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
                                 const char *str)
{
        int len_with_nul = strlen(str) + 1;

        return nla_put(skb, attrtype, len_with_nul, str);
}

/**
 * nla_put_flag - Add a flag netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 *
 * A flag attribute carries no payload: nla_put() is called with a
 * length of 0 and a NULL data pointer. Returns the result of nla_put().
 */
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
        return nla_put(skb, attrtype, 0, NULL);
}

/**
 * nla_put_msecs - Add a msecs netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @njiffies: number of jiffies to convert to msecs
 * @padattr: attribute type for the padding
 *
 * Converts @njiffies with jiffies_to_msecs() and stores the result as a
 * u64 payload. Returns the result of nla_put_64bit().
 */
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
                                unsigned long njiffies, int padattr)
{
        u64 msecs = jiffies_to_msecs(njiffies);

        return nla_put_64bit(skb, attrtype, sizeof(msecs), &msecs, padattr);
}

/**
 * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv4 address
 *
 * Stores @addr as a __be32 attribute via nla_put_be32().
 */
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
                                  __be32 addr)
{
        __be32 copy = addr;

        return nla_put_be32(skb, attrtype, copy);
}

/**
 * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv6 address
 */
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}

/**
 * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: value carrying bits
 * @selector: selector of valid bits
 *
 * Packs @value and @selector into a struct nla_bitfield32 and stores that
 * struct as the payload. Returns the result of nla_put().
 */
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
                                     __u32 value, __u32 selector)
{
        struct nla_bitfield32 bits = {
                .value = value,
                .selector = selector,
        };

        return nla_put(skb, attrtype, sizeof(bits), &bits);
}

/**
 * nla_get_u32 - return payload of u32 attribute
 * @nla: u32 netlink attribute
 *
 * Reads the payload in place through a u32 pointer; no length check is
 * performed here.
 */
static inline u32 nla_get_u32(const struct nlattr *nla)
{
        const u32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be32 - return payload of __be32 attribute
 * @nla: __be32 netlink attribute
 *
 * Reads the payload in place through a __be32 pointer; no length check is
 * performed here.
 */
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
        const __be32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le32 - return payload of __le32 attribute
 * @nla: __le32 netlink attribute
 *
 * Reads the payload in place through a __le32 pointer; no length check is
 * performed here.
 */
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
        const __le32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u16 - return payload of u16 attribute
 * @nla: u16 netlink attribute
 *
 * Reads the payload in place through a u16 pointer; no length check is
 * performed here.
 */
static inline u16 nla_get_u16(const struct nlattr *nla)
{
        const u16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be16 - return payload of __be16 attribute
 * @nla: __be16 netlink attribute
 *
 * Reads the payload in place through a __be16 pointer; no length check is
 * performed here.
 */
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
        const __be16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le16 - return payload of __le16 attribute
 * @nla: __le16 netlink attribute
 *
 * Reads the payload in place through a __le16 pointer; no length check is
 * performed here.
 */
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
        const __le16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u8 - return payload of u8 attribute
 * @nla: u8 netlink attribute
 *
 * Reads the first payload byte; no length check is performed here.
 */
static inline u8 nla_get_u8(const struct nlattr *nla)
{
        const u8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u64 - return payload of u64 attribute
 * @nla: u64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than read through a
 * u64 pointer.
 */
static inline u64 nla_get_u64(const struct nlattr *nla)
{
        u64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_be64 - return payload of __be64 attribute
 * @nla: __be64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than read through a
 * __be64 pointer.
 */
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
        __be64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_le64 - return payload of __le64 attribute
 * @nla: __le64 netlink attribute
 *
 * The payload is copied out with nla_memcpy(), exactly as nla_get_u64(),
 * nla_get_be64() and nla_get_s64() do. Attribute headers are 4 bytes, so
 * the payload is not necessarily 8-byte aligned; dereferencing nla_data()
 * as a __le64 pointer would be a misaligned 64-bit load on architectures
 * without efficient unaligned access.
 */
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
        __le64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_s32 - return payload of s32 attribute
 * @nla: s32 netlink attribute
 *
 * Reads the payload in place through an s32 pointer; no length check is
 * performed here.
 */
static inline s32 nla_get_s32(const struct nlattr *nla)
{
        const s32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s16 - return payload of s16 attribute
 * @nla: s16 netlink attribute
 *
 * Reads the payload in place through an s16 pointer; no length check is
 * performed here.
 */
static inline s16 nla_get_s16(const struct nlattr *nla)
{
        const s16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s8 - return payload of s8 attribute
 * @nla: s8 netlink attribute
 *
 * Reads the first payload byte as an s8; no length check is performed here.
 */
static inline s8 nla_get_s8(const struct nlattr *nla)
{
        const s8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s64 - return payload of s64 attribute
 * @nla: s64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than read through an
 * s64 pointer.
 */
static inline s64 nla_get_s64(const struct nlattr *nla)
{
        s64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_flag - return payload of flag attribute
 * @nla: flag netlink attribute
 *
 * A flag is represented by the mere presence of the attribute: returns 1
 * if @nla is non-NULL and 0 otherwise. @nla is never dereferenced.
 */
static inline int nla_get_flag(const struct nlattr *nla)
{
        return nla ? 1 : 0;
}

/**
 * nla_get_msecs - return payload of msecs attribute
 * @nla: msecs netlink attribute
 *
 * Reads the payload as a u64 millisecond count, casts it to unsigned long
 * and converts it with msecs_to_jiffies().
 *
 * Returns the number of milliseconds in jiffies.
 */
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
        return msecs_to_jiffies((unsigned long) nla_get_u64(nla));
}

/**
 * nla_get_in_addr - return payload of IPv4 address attribute
 * @nla: IPv4 address netlink attribute
 *
 * Reads the payload in place as a __be32; no length check is performed
 * here.
 */
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
        const __be32 *addr = nla_data(nla);

        return *addr;
}

/**
 * nla_get_in6_addr - return payload of IPv6 address attribute
 * @nla: IPv6 address netlink attribute
 */
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_get_bitfield32 - return payload of 32 bitfield attribute
 * @nla: nla_bitfield32 attribute
 */
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
        struct nla_bitfield32 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_memdup - duplicate attribute memory (kmemdup)
 * @src: netlink attribute to duplicate from
 * @gfp: GFP mask
 *
 * Duplicates nla_len(@src) bytes of payload starting at nla_data(@src).
 * Returns whatever kmemdup() returns.
 */
static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
{
        const void *payload = nla_data(src);
        int len = nla_len(src);

        return kmemdup(payload, len, gfp);
}

/**
 * nla_nest_start_noflag - Start a new level of nested attributes
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Remembers the current skb tail as the container position and then adds
 * an empty attribute of type @attrtype there.
 *
 * This function exists for backward compatibility to use in APIs which never
 * marked their nest attributes with NLA_F_NESTED flag. New APIs should use
 * nla_nest_start() which sets the flag.
 *
 * Returns the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
                                                   int attrtype)
{
        /* Must be sampled before nla_put() moves the tail. */
        struct nlattr *container = (struct nlattr *)skb_tail_pointer(skb);
        int err = nla_put(skb, attrtype, 0, NULL);

        return err < 0 ? NULL : container;
}

/**
 * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED
 * flag. This is the preferred function to use in new code.
 *
 * Returns the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
        int nested_type = attrtype | NLA_F_NESTED;

        return nla_nest_start_noflag(skb, nested_type);
}

/**
 * nla_nest_end - Finalize nesting of attributes
 * @skb: socket buffer the attributes are stored in
 * @start: container attribute
 *
 * Sets the container's nla_len to the distance from @start to the current
 * skb tail, so that it covers all appended attributes.
 *
 * Returns the total data length of the skb.
 */
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
        unsigned char *nest_begin = (unsigned char *)start;
        unsigned char *nest_end = skb_tail_pointer(skb);

        start->nla_len = nest_end - nest_begin;

        return skb->len;
}

/**
 * nla_nest_cancel - Cancel nesting of attributes
 * @skb: socket buffer the message is stored in
 * @start: container attribute
 *
 * Removes the container attribute, including all nested attributes,
 * by trimming the skb back to @start with nlmsg_trim(). A NULL @start
 * leaves the skb untouched. This function does not return a value.
 */
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
        nlmsg_trim(skb, start);
}

/**
 * __nla_validate_nested - Validate a stream of nested attributes
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the nested attribute stream against the
 * specified policy. Attributes with a type exceeding maxtype will be
 * ignored. See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
                                        const struct nla_policy *policy,
                                        unsigned int validate,
                                        struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(start);
        int len = nla_len(start);

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate_nested - Validate nested attributes with strict checking
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nla_validate_nested() with NL_VALIDATE_STRICT and returns its
 * result.
 */
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
                    const struct nla_policy *policy,
                    struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_validate_nested(start, maxtype, policy, validate, extack);
}

/**
 * nla_validate_nested_deprecated - Validate nested attributes liberally
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nla_validate_nested() with NL_VALIDATE_LIBERAL and returns its
 * result.
 */
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_validate_nested(start, maxtype, policy, validate, extack);
}

/**
 * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
 * @skb: socket buffer the message is stored in
 *
 * Return true if padding is needed to align the next attribute (nla_data()) to
 * a 64-bit aligned area. Always false when
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 */
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        /* The nlattr header is 4 bytes in size, so an 8-byte aligned
         * tail would put nla_data() of the next attribute at offset 4.
         * A NOP attribute, plus nlattr header for next attribute,
         * will make nla_data() 8-byte aligned; hence padding is needed
         * exactly when the tail _is_ aligned.
         */
        return IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8);
#else
        return false;
#endif
}

/**
 * nla_align_64bit - 64-bit align the nla_data() of next attribute
 * @skb: socket buffer the message is stored in
 * @padattr: attribute type for the padding
 *
 * Conditionally emit a padding netlink attribute in order to make
 * the next attribute we emit have a 64-bit aligned nla_data() area.
 * This will only be done in architectures which do not have
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
 *
 * Returns zero when no padding is needed or the padding attribute was
 * reserved, and -EMSGSIZE when nla_reserve() fails.
 */
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
        if (!nla_need_padding_for_64bit(skb))
                return 0;

        /* Zero-length attribute of type @padattr acts as the pad. */
        if (nla_reserve(skb, padattr, 0))
                return 0;

        return -EMSGSIZE;
}

/**
 * nla_total_size_64bit - total length of attribute including padding
 * @payload: length of payload
 *
 * Returns the aligned attribute size for @payload. Unless
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined, the aligned size of
 * one empty attribute is added on top.
 */
static inline int nla_total_size_64bit(int payload)
{
        int total = NLA_ALIGN(nla_attr_size(payload));

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        total += NLA_ALIGN(nla_attr_size(0));
#endif
        return total;
}

/**
 * nla_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for-loop header: the loop runs while nla_ok(@pos, @rem)
 * holds and advances with nla_next(). @pos and @rem are assigned to and
 * @rem has its address taken, so both must be modifiable lvalues; @pos and
 * @rem are evaluated more than once.
 */
#define nla_for_each_attr(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nla_ok(pos, rem); \
             pos = nla_next(pos, &(rem)))

/**
 * nla_for_each_nested - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Shorthand for nla_for_each_attr() over the payload of @nla, i.e. with
 * nla_data(@nla) as head and nla_len(@nla) as length.
 */
#define nla_for_each_nested(pos, nla, rem) \
        nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)

/**
 * nla_is_last - Test if attribute is last in stream
 * @nla: attribute to test
 * @rem: bytes remaining in stream
 *
 * Returns true when the attribute's nla_len equals @rem exactly.
 */
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
        int attr_len = nla->nla_len;

        return attr_len == rem;
}

/*
 * Range helpers taking a policy entry @pt and an output @range.
 * Declarations only; the implementations are not part of this header.
 */
void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range);

/* Forward declaration only: this header handles the state by pointer. */
struct netlink_policy_dump_state;

/*
 * Policy dump interface. Declarations only; the implementations are not
 * part of this header.
 */
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
                                   const struct nla_policy *policy,
                                   unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
                                       const struct nla_policy *policy,
                                       unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
                              struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
                                   const struct nla_policy *pt,
                                   int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);

#endif










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _INET_COMMON_H
#define _INET_COMMON_H

#include <linux/indirect_call_wrapper.h>

extern const struct proto_ops inet_stream_ops;
extern const struct proto_ops inet_dgram_ops;

/*
 *        INET4 prototypes used by INET6
 */

struct msghdr;
struct sock;
struct sockaddr;
struct socket;

/* Socket-level entry points; prototypes only, no bodies in this header. */
int inet_release(struct socket *sock);
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags);
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                          int addr_len, int flags, int is_sendmsg);
int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags);
int inet_accept(struct socket *sock, struct socket *newsock, int flags,
                bool kern);
int inet_send_prepare(struct sock *sk);
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
                      size_t size, int flags);
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags);
int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog);
void inet_sock_destruct(struct sock *sk);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
/*
 * Single-bit flag values; presumably meant for the @flags argument of
 * __inet_bind() declared right below - confirm against its implementation.
 */
/* Don't allocate port at this moment, defer to connect. */
#define BIND_FORCE_ADDRESS_NO_PORT        (1 << 0)
/* Grab and release socket lock. */
#define BIND_WITH_LOCK                        (1 << 1)
/* Called from BPF program. */
#define BIND_FROM_BPF                        (1 << 2)
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
                u32 flags);
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                 int peer);
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
                         unsigned short type, unsigned char protocol,
                         struct net *net);
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
                    int *addr_len);

/* GRO/GSO entry points; prototypes only. */
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb);
int inet_gro_complete(struct sk_buff *skb, int nhoff);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                 netdev_features_t features);

/*
 * Hand sk->sk_socket to sock_release().
 * A NULL @sk is accepted and treated as a no-op.
 */
static inline void inet_ctl_sock_destroy(struct sock *sk)
{
        if (!sk)
                return;

        sock_release(sk->sk_socket);
}

/*
 * Expression macro: yields INDIRECT_CALL_2(cb, f2, f1, head, skb) unless
 * gro_recursion_inc_test(skb) is true (a case marked unlikely).  In that
 * case bit 0 of NAPI_GRO_CB(skb)->flush is set and the expression yields
 * NULL instead, without invoking the callback.
 * Note that @skb appears more than once in the expansion.
 */
#define indirect_call_gro_receive(f2, f1, cb, head, skb)        \
({                                                                \
        unlikely(gro_recursion_inc_test(skb)) ?                        \
                NAPI_GRO_CB(skb)->flush |= 1, NULL :                \
                INDIRECT_CALL_2(cb, f2, f1, head, skb);                \
})

#endif






































































































































































    4 
    4 

    4 








    4 






    4 











    4 
    4 

    4 


    4 





    1 

    1 

































































































































































































































































































    4 


    4 


    4 






    4 
    4 

    4 






















    4 



























    1 


    1 








































    1 
    1 

    1 

    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *
 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
 */
/*
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support to import/export the MLS label from NetLabel
 *
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <net/netlabel.h>
#include "sidtab.h"
#include "mls.h"
#include "policydb.h"
#include "services.h"

/*
 * Compute how many bytes the MLS part of the security context string
 * for `context' occupies (terminating NUL not counted).
 *
 * Returns 0 when MLS is disabled in policy `p'.  Otherwise the count
 * covers the leading ':', each sensitivity name, one separator plus name
 * for every category that starts or ends a run of consecutive set bits,
 * and one byte for the '-' when the two levels differ.
 */
int mls_compute_context_len(struct policydb *p, struct context *context)
{
        struct ebitmap_node *node;
        struct ebitmap *cats;
        int total = 1;        /* the leading ':' */
        int lvl, bit;

        if (!p->mls_enabled)
                return 0;

        for (lvl = 0; lvl < 2; lvl++) {
                int sens_idx = context->range.level[lvl].sens;
                int run_start = -2;        /* first bit of the current run */
                int last = -2;                /* most recently seen set bit */

                total += strlen(sym_name(p, SYM_LEVELS, sens_idx - 1));

                cats = &context->range.level[lvl].cat;
                ebitmap_for_each_positive_bit(cats, node, bit) {
                        if (bit - last > 1) {
                                /* a gap: close the previous run, open a new one */
                                if (run_start != last)
                                        total += strlen(sym_name(p, SYM_CATS, last)) + 1;
                                total += strlen(sym_name(p, SYM_CATS, bit)) + 1;
                                run_start = bit;
                        }
                        last = bit;
                }

                /* close a run that was still open when the bitmap ended */
                if (last != run_start)
                        total += strlen(sym_name(p, SYM_CATS, last)) + 1;

                if (lvl != 0)
                        continue;

                /* identical low and high levels are emitted only once */
                if (mls_level_eq(&context->range.level[0],
                                 &context->range.level[1]))
                        break;

                total++;        /* the '-' between the two levels */
        }

        return total;
}

/*
 * Store `sep' at `dst', copy the NUL-terminated `name' right after it,
 * and return a pointer to the NUL that ends the copy.
 */
static char *mls_put_name(char *dst, char sep, const char *name)
{
        *dst++ = sep;
        strcpy(dst, name);
        return dst + strlen(name);
}

/*
 * Emit the MLS part of the security context string for `context' at
 * `*scontext' and advance `*scontext' to the end of what was written.
 * Nothing is written and `*scontext' is left alone when MLS is disabled.
 *
 * The buffer is not bounds-checked here; the caller has to provide
 * enough room.
 */
void mls_sid_to_context(struct policydb *p,
                        struct context *context,
                        char **scontext)
{
        struct ebitmap_node *node;
        struct ebitmap *cats;
        char *out;
        int lvl, bit, run_start, last;

        if (!p->mls_enabled)
                return;

        out = *scontext;
        *out++ = ':';

        for (lvl = 0; lvl < 2; lvl++) {
                const char *sens = sym_name(p, SYM_LEVELS,
                                            context->range.level[lvl].sens - 1);

                strcpy(out, sens);
                out += strlen(sens);

                /* categories: runs of consecutive bits are written as first.last */
                run_start = last = -2;
                cats = &context->range.level[lvl].cat;
                ebitmap_for_each_positive_bit(cats, node, bit) {
                        if (bit - last > 1) {
                                /* a gap: finish the previous run first */
                                if (last != run_start)
                                        out = mls_put_name(out,
                                                           last - run_start > 1 ? '.' : ',',
                                                           sym_name(p, SYM_CATS, last));
                                /* ':' introduces the very first category */
                                out = mls_put_name(out,
                                                   last < 0 ? ':' : ',',
                                                   sym_name(p, SYM_CATS, bit));
                                run_start = bit;
                        }
                        last = bit;
                }

                /* finish a run that was still open when the bitmap ended */
                if (last != run_start)
                        out = mls_put_name(out,
                                           last - run_start > 1 ? '.' : ',',
                                           sym_name(p, SYM_CATS, last));

                if (lvl != 0)
                        continue;

                /* identical low and high levels are written only once */
                if (mls_level_eq(&context->range.level[0],
                                 &context->range.level[1]))
                        break;

                *out++ = '-';
        }

        *scontext = out;
}

int mls_level_isvalid(struct policydb *p, struct mls_level *l)
{
        struct level_datum *levdatum;

        if (!l->sens || l->sens > p->p_levels.nprim)
                return 0;
        levdatum = symtab_search(&p->p_levels,
                                 sym_name(p, SYM_LEVELS, l->sens - 1));
        if (!levdatum)
                return 0;

        /*
         * Return 1 iff all the bits set in l->cat are also be set in
         * levdatum->level->cat and no bit in l->cat is larger than
         * p->p_cats.nprim.
         */
        return ebitmap_contains(&levdatum->level->cat, &l->cat,
                                p->p_cats.nprim);
}

/*
 * Return 1 when both levels of range `r' pass mls_level_isvalid() and
 * level[1] dominates level[0] according to mls_level_dom(); 0 otherwise.
 */
int mls_range_isvalid(struct policydb *p, struct mls_range *r)
{
        if (!mls_level_isvalid(p, &r->level[0]))
                return 0;

        if (!mls_level_isvalid(p, &r->level[1]))
                return 0;

        return mls_level_dom(&r->level[1], &r->level[0]) ? 1 : 0;
}

/*
 * Return 1 if the MLS fields in the security context
 * structure `c' are valid.  Return 0 otherwise.
 *
 * With MLS disabled every context is accepted.  Otherwise the range must
 * pass mls_range_isvalid().  Contexts whose role is OBJECT_R_VAL need
 * nothing more; for all others the user value must be in bounds, must map
 * to a user entry, and that user's range must contain the context's
 * range.
 */
int mls_context_isvalid(struct policydb *p, struct context *c)
{
        struct user_datum *usrdatum;

        if (!p->mls_enabled)
                return 1;

        if (!mls_range_isvalid(p, &c->range))
                return 0;

        if (c->role == OBJECT_R_VAL)
                return 1;

        /*
         * User must be authorized for the MLS range.
         */
        if (!c->user || c->user > p->p_users.nprim)
                return 0;
        usrdatum = p->user_val_to_struct[c->user - 1];
        if (!usrdatum)
                return 0; /* no user entry behind this value: reject, don't dereference */
        if (!mls_range_contains(usrdatum->range, c->range))
                return 0; /* user may not be associated with range */

        return 1;
}

/*
 * Set the MLS fields in the security context structure
 * `context' based on the string representation in
 * the string `scontext'.
 *
 * This function modifies the string in place, inserting
 * NUL characters to terminate the MLS fields.
 *
 * If a def_sid is provided and no MLS field is present,
 * copy the MLS field of the associated default context.
 * Used for systems upgraded to MLS, where objects may lack
 * MLS fields.
 *
 * Policy read-lock must be held for sidtab lookup.
 *
 * `oldc' is only tested for zero/non-zero here: zero means the
 * security context carried no MLS component.
 *
 * Returns 0 on success and -EINVAL when a sensitivity or category name is
 * unknown, a category range is not ascending, or no usable default exists.
 * Errors from ebitmap_set_bit(), ebitmap_cpy() and mls_context_cpy() are
 * passed through.  On an error return, bits already set in `context' are
 * left as they are; nothing is undone here.
 */
int mls_context_to_sid(struct policydb *pol,
                       char oldc,
                       char *scontext,
                       struct context *context,
                       struct sidtab *s,
                       u32 def_sid)
{
        char *sensitivity, *cur_cat, *next_cat, *rngptr;
        struct level_datum *levdatum;
        struct cat_datum *catdatum, *rngdatum;
        int l, rc, i;
        char *rangep[2];

        if (!pol->mls_enabled) {
                /*
                 * With no MLS, only return -EINVAL if there is a MLS field
                 * and it did not come from an xattr.
                 */
                if (oldc && def_sid == SECSID_NULL)
                        return -EINVAL;
                return 0;
        }

        /*
         * No MLS component to the security context, try and map to
         * default if provided.
         */
        if (!oldc) {
                struct context *defcon;

                if (def_sid == SECSID_NULL)
                        return -EINVAL;

                defcon = sidtab_search(s, def_sid);
                if (!defcon)
                        return -EINVAL;

                return mls_context_cpy(context, defcon);
        }

        /*
         * If we're dealing with a range, figure out where the two parts
         * of the range begin.
         */
        rangep[0] = scontext;
        rangep[1] = strchr(scontext, '-');
        if (rangep[1]) {
                /* Cut the string at the first '-'; the high level starts after it. */
                rangep[1][0] = '\0';
                rangep[1]++;
        }

        /* For each part of the range: */
        for (l = 0; l < 2; l++) {
                /* Split sensitivity and category set. */
                sensitivity = rangep[l];
                if (sensitivity == NULL)
                        break;        /* no '-' was found, so there is no second level */
                next_cat = strchr(sensitivity, ':');
                if (next_cat)
                        *(next_cat++) = '\0';

                /* Parse sensitivity. */
                levdatum = symtab_search(&pol->p_levels, sensitivity);
                if (!levdatum)
                        return -EINVAL;
                context->range.level[l].sens = levdatum->level->sens;

                /* Extract category set. */
                while (next_cat != NULL) {
                        /* Take one comma-separated item and terminate it in place. */
                        cur_cat = next_cat;
                        next_cat = strchr(next_cat, ',');
                        if (next_cat != NULL)
                                *(next_cat++) = '\0';

                        /* Separate into range if exists */
                        rngptr = strchr(cur_cat, '.');
                        if (rngptr != NULL) {
                                /* Remove '.' */
                                *rngptr++ = '\0';
                        }

                        catdatum = symtab_search(&pol->p_cats, cur_cat);
                        if (!catdatum)
                                return -EINVAL;

                        /* Category values are 1-based; the bitmap is indexed from 0. */
                        rc = ebitmap_set_bit(&context->range.level[l].cat,
                                             catdatum->value - 1, 1);
                        if (rc)
                                return rc;

                        /* If range, set all categories in range */
                        if (rngptr == NULL)
                                continue;

                        rngdatum = symtab_search(&pol->p_cats, rngptr);
                        if (!rngdatum)
                                return -EINVAL;

                        /* The end of a range must lie strictly above its start. */
                        if (catdatum->value >= rngdatum->value)
                                return -EINVAL;

                        /*
                         * The start bit (value - 1) is already set; fill in bits
                         * catdatum->value .. rngdatum->value - 1, the last of
                         * which is the bit of the range's end category.
                         */
                        for (i = catdatum->value; i < rngdatum->value; i++) {
                                rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1);
                                if (rc)
                                        return rc;
                        }
                }
        }

        /* If we didn't see a '-', the range start is also the range end. */
        if (rangep[1] == NULL) {
                context->range.level[1].sens = context->range.level[0].sens;
                rc = ebitmap_cpy(&context->range.level[1].cat,
                                 &context->range.level[0].cat);
                if (rc)
                        return rc;
        }

        return 0;
}

/*
 * Fill in the MLS fields of `context' from the textual form in `str'.
 *
 * `str' itself is not modified: parsing happens on a private copy made
 * with kstrdup() under `gfp_mask' and freed before returning.
 *
 * Returns -EINVAL when MLS is disabled in `p', -ENOMEM when the copy
 * cannot be allocated, otherwise the result of mls_context_to_sid().
 */
int mls_from_string(struct policydb *p, char *str, struct context *context,
                    gfp_t gfp_mask)
{
        char *copy;
        int rc;

        if (!p->mls_enabled)
                return -EINVAL;

        copy = kstrdup(str, gfp_mask);
        if (copy == NULL)
                return -ENOMEM;

        rc = mls_context_to_sid(p, ':', copy, context, NULL, SECSID_NULL);
        kfree(copy);

        return rc;
}

/*
 * Copy both levels of the MLS range `range' into `context'.
 *
 * Returns 0 on success.  If ebitmap_cpy() fails for a level, its error is
 * returned at once; that level's sensitivity has already been stored by
 * then and any later level is left untouched.
 */
int mls_range_set(struct context *context,
                  struct mls_range *range)
{
        int idx;

        for (idx = 0; idx < 2; idx++) {
                int rc;

                context->range.level[idx].sens = range->level[idx].sens;
                rc = ebitmap_cpy(&context->range.level[idx].cat,
                                 &range->level[idx].cat);
                if (rc)
                        return rc;
        }

        return 0;
}

/*
 * Derive the MLS range of `usercon' from the range of `fromcon' and the
 * range/default level of `user'.  Does nothing and returns 0 when MLS is
 * disabled.
 *
 * The chosen levels are stored by plain struct assignment.  Returns
 * -EINVAL when no acceptable low level or clearance can be found; the low
 * level may already have been written when the clearance step fails.
 */
int mls_setup_user_range(struct policydb *p,
                         struct context *fromcon, struct user_datum *user,
                         struct context *usercon)
{
        struct mls_level *from_low, *from_high;
        struct mls_level *usr_low, *usr_high, *usr_dflt;
        struct mls_level *pick;

        if (!p->mls_enabled)
                return 0;

        from_low = &(fromcon->range.level[0]);
        from_high = &(fromcon->range.level[1]);
        usr_low = &(user->range.level[0]);
        usr_high = &(user->range.level[1]);
        usr_dflt = &(user->dfltlevel);

        /* Honor the user's default level if we can */
        if (mls_level_between(usr_dflt, from_low, from_high))
                pick = usr_dflt;
        else if (mls_level_between(from_low, usr_dflt, usr_high))
                pick = from_low;
        else if (mls_level_between(from_high, usr_low, usr_dflt))
                pick = usr_low;
        else
                return -EINVAL;
        usercon->range.level[0] = *pick;

        /*
         * Lower the clearance of available contexts if the clearance of
         * "fromcon" is lower than that of the user's default clearance
         * (but only if the "fromcon" clearance dominates the user's
         * computed sensitivity level).
         */
        if (mls_level_dom(usr_high, from_high))
                pick = from_high;
        else if (mls_level_dom(from_high, usr_high))
                pick = usr_high;
        else
                return -EINVAL;
        usercon->range.level[1] = *pick;

        return 0;
}

/*
 * Translate the MLS fields of `oldc', expressed in terms of policy
 * `oldp', into the values policy `newp' uses, storing them in `newc'.
 *
 * Sensitivities and categories are matched by name.  Returns 0 on success
 * or when either policy has MLS disabled (in which case `newc' is left
 * untouched), -EINVAL when a name from the old policy is unknown in the
 * new one, or the error from ebitmap_set_bit().
 */
int mls_convert_context(struct policydb *oldp,
                        struct policydb *newp,
                        struct context *oldc,
                        struct context *newc)
{
        struct ebitmap_node *node;
        int lvl, bit, rc;

        if (!(oldp->mls_enabled && newp->mls_enabled))
                return 0;

        for (lvl = 0; lvl < 2; lvl++) {
                struct level_datum *levdatum;
                struct cat_datum *catdatum;
                char *sens_name;

                /* sensitivity: look up the old name in the new policy */
                sens_name = sym_name(oldp, SYM_LEVELS,
                                     oldc->range.level[lvl].sens - 1);
                levdatum = symtab_search(&newp->p_levels, sens_name);
                if (!levdatum)
                        return -EINVAL;
                newc->range.level[lvl].sens = levdatum->level->sens;

                /* categories: same name-based lookup for every set bit */
                ebitmap_for_each_positive_bit(&oldc->range.level[lvl].cat,
                                              node, bit) {
                        catdatum = symtab_search(&newp->p_cats,
                                                 sym_name(oldp, SYM_CATS, bit));
                        if (!catdatum)
                                return -EINVAL;

                        rc = ebitmap_set_bit(&newc->range.level[lvl].cat,
                                             catdatum->value - 1, 1);
                        if (rc)
                                return rc;
                }
        }

        return 0;
}

/*
 * Fill in the MLS fields of `newcontext' for a newly computed context.
 *
 * `specified' selects the rule kind:
 *  - AVTAB_MEMBER: copy the low level of `scontext'.
 *  - AVTAB_TRANSITION: use a matching range transition rule if there is
 *    one, else the class's default_range setting if it names one of the
 *    DEFAULT_* behaviours, else fall back to the AVTAB_CHANGE handling.
 *  - AVTAB_CHANGE: copy the full range of `scontext' for the process
 *    class or when `sock' is set, otherwise only its low level.
 *
 * Returns 0 when MLS is disabled, -EINVAL for any other `specified'
 * value, otherwise whatever the copy helper that was used returns.
 */
int mls_compute_sid(struct policydb *p,
                    struct context *scontext,
                    struct context *tcontext,
                    u16 tclass,
                    u32 specified,
                    struct context *newcontext,
                    bool sock)
{
        struct range_trans key;
        struct mls_range *rule;
        struct class_datum *cladatum;
        int default_range = 0;

        if (!p->mls_enabled)
                return 0;

        if (specified == AVTAB_MEMBER) {
                /* Use the process effective MLS attributes. */
                return mls_context_cpy_low(newcontext, scontext);
        }

        if (specified != AVTAB_TRANSITION && specified != AVTAB_CHANGE)
                return -EINVAL;

        if (specified == AVTAB_TRANSITION) {
                /* Look for a range transition rule. */
                key.source_type = scontext->type;
                key.target_type = tcontext->type;
                key.target_class = tclass;
                rule = policydb_rangetr_search(p, &key);
                if (rule)
                        return mls_range_set(newcontext, rule);

                /* No rule: consult the class's default_range, if any. */
                if (tclass && tclass <= p->p_classes.nprim) {
                        cladatum = p->class_val_to_struct[tclass - 1];
                        if (cladatum)
                                default_range = cladatum->default_range;
                }

                switch (default_range) {
                case DEFAULT_SOURCE_LOW:
                        return mls_context_cpy_low(newcontext, scontext);
                case DEFAULT_SOURCE_HIGH:
                        return mls_context_cpy_high(newcontext, scontext);
                case DEFAULT_SOURCE_LOW_HIGH:
                        return mls_context_cpy(newcontext, scontext);
                case DEFAULT_TARGET_LOW:
                        return mls_context_cpy_low(newcontext, tcontext);
                case DEFAULT_TARGET_HIGH:
                        return mls_context_cpy_high(newcontext, tcontext);
                case DEFAULT_TARGET_LOW_HIGH:
                        return mls_context_cpy(newcontext, tcontext);
                case DEFAULT_GLBLUB:
                        return mls_context_glblub(newcontext,
                                                  scontext, tcontext);
                default:
                        /* nothing configured: handled like AVTAB_CHANGE below */
                        break;
                }
        }

        if (tclass == p->process_class || sock) {
                /* Use the process MLS attributes. */
                return mls_context_cpy(newcontext, scontext);
        }

        /* Use the process effective MLS attributes. */
        return mls_context_cpy_low(newcontext, scontext);
}

#ifdef CONFIG_NETLABEL
/**
 * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel
 * @p: the policy database
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Store the low sensitivity of @context, minus one, in the NetLabel MLS
 * level field of @secattr and set NETLBL_SECATTR_MLS_LVL in its flags.
 * Does nothing when MLS is disabled in @p.
 *
 */
void mls_export_netlbl_lvl(struct policydb *p,
                           struct context *context,
                           struct netlbl_lsm_secattr *secattr)
{
        if (p->mls_enabled) {
                secattr->attr.mls.lvl = context->range.level[0].sens - 1;
                secattr->flags |= NETLBL_SECATTR_MLS_LVL;
        }
}

/**
 * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels
 * @p: the policy database
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Set the low sensitivity of @context to the NetLabel MLS level of
 * @secattr plus one, and give the high sensitivity the same value.
 * Does nothing when MLS is disabled in @p.
 *
 */
void mls_import_netlbl_lvl(struct policydb *p,
                           struct context *context,
                           struct netlbl_lsm_secattr *secattr)
{
        if (p->mls_enabled) {
                context->range.level[0].sens = secattr->attr.mls.lvl + 1;
                /* NetLabel supplies a single level: high mirrors low */
                context->range.level[1].sens = context->range.level[0].sens;
        }
}

/**
 * mls_export_netlbl_cat - Export the MLS categories to NetLabel
 * @p: the policy database
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Export the low-level category bitmap of @context into the NetLabel MLS
 * category field of @secattr via ebitmap_netlbl_export().  When that
 * succeeds and the resulting field is non-NULL, NETLBL_SECATTR_MLS_CAT is
 * set in the flags.  Returns zero on success or when MLS is disabled in
 * @p, otherwise the error from ebitmap_netlbl_export().
 *
 */
int mls_export_netlbl_cat(struct policydb *p,
                          struct context *context,
                          struct netlbl_lsm_secattr *secattr)
{
        int rc;

        if (!p->mls_enabled)
                return 0;

        rc = ebitmap_netlbl_export(&context->range.level[0].cat,
                                   &secattr->attr.mls.cat);
        if (rc)
                return rc;

        if (secattr->attr.mls.cat != NULL)
                secattr->flags |= NETLBL_SECATTR_MLS_CAT;

        return 0;
}

/**
 * mls_import_netlbl_cat - Import the MLS categories from NetLabel
 * @p: the policy database
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Import the NetLabel MLS categories of @secattr into the low-level
 * category bitmap of @context; since NetLabel carries a single category
 * set, the high-level bitmap is then made a byte copy of the low one.
 * On import failure the low-level bitmap is destroyed before returning.
 * Returns zero on success or when MLS is disabled in @p, negative values
 * on failure.
 *
 */
int mls_import_netlbl_cat(struct policydb *p,
                          struct context *context,
                          struct netlbl_lsm_secattr *secattr)
{
        int rc;

        if (!p->mls_enabled)
                return 0;

        rc = ebitmap_netlbl_import(&context->range.level[0].cat,
                                   secattr->attr.mls.cat);
        if (rc) {
                ebitmap_destroy(&context->range.level[0].cat);
                return rc;
        }

        /*
         * NOTE(review): this duplicates the ebitmap structure itself, not
         * whatever it may point to -- confirm how callers release the two
         * levels.
         */
        memcpy(&context->range.level[1].cat, &context->range.level[0].cat,
               sizeof(context->range.level[0].cat));

        return 0;
}
#endif /* CONFIG_NETLABEL */










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _BLOCK_BLK_PM_H_
#define _BLOCK_BLK_PM_H_

#include <linux/pm_runtime.h>

#ifdef CONFIG_PM
/*
 * Returns 1 when nothing has to be done for @q (no device attached, or
 * the queue is not in pm-only mode) or when @pm is set and the queue is
 * not RPM_SUSPENDED.  In every other case pm_request_resume() is called
 * for the queue's device and 0 is returned.
 */
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
        if (q->dev && blk_queue_pm_only(q)) {
                if (!pm || q->rpm_status == RPM_SUSPENDED) {
                        pm_request_resume(q->dev);
                        return 0;
                }
                /* Request allowed */
        }

        /* Nothing to do */
        return 1;
}

/*
 * Call pm_runtime_mark_last_busy() on the device of @rq's queue, unless
 * the queue has no device or @rq carries RQF_PM.
 */
static inline void blk_pm_mark_last_busy(struct request *rq)
{
        if (!rq->q->dev)
                return;
        if (rq->rq_flags & RQF_PM)
                return;

        pm_runtime_mark_last_busy(rq->q->dev);
}

/*
 * Decrement nr_pending of @rq's queue, unless the queue has no device or
 * @rq carries RQF_PM.  The queue_lock must be held (lockdep-asserted).
 */
static inline void blk_pm_requeue_request(struct request *rq)
{
        lockdep_assert_held(&rq->q->queue_lock);

        if (!rq->q->dev)
                return;
        if (rq->rq_flags & RQF_PM)
                return;

        rq->q->nr_pending--;
}

/*
 * Increment nr_pending of @q for @rq, unless @q has no device or @rq
 * carries RQF_PM.  The queue_lock of @q must be held (lockdep-asserted).
 */
static inline void blk_pm_add_request(struct request_queue *q,
                                      struct request *rq)
{
        lockdep_assert_held(&q->queue_lock);

        if (!q->dev)
                return;
        if (rq->rq_flags & RQF_PM)
                return;

        q->nr_pending++;
}

/*
 * Decrement nr_pending of @rq's queue, unless the queue has no device or
 * @rq carries RQF_PM.  The queue_lock must be held (lockdep-asserted).
 */
static inline void blk_pm_put_request(struct request *rq)
{
        lockdep_assert_held(&rq->q->queue_lock);

        if (!rq->q->dev)
                return;
        if (rq->rq_flags & RQF_PM)
                return;

        rq->q->nr_pending--;
}
#else
/* Variant for builds without CONFIG_PM: ignores both arguments and always returns 1. */
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
        return 1;
}

/* Variant for builds without CONFIG_PM: intentionally does nothing. */
static inline void blk_pm_mark_last_busy(struct request *rq)
{
}

/* Variant for builds without CONFIG_PM: intentionally does nothing. */
static inline void blk_pm_requeue_request(struct request *rq)
{
}

/* Variant for builds without CONFIG_PM: intentionally does nothing. */
static inline void blk_pm_add_request(struct request_queue *q,
                                      struct request *rq)
{
}

/* Variant for builds without CONFIG_PM: intentionally does nothing. */
static inline void blk_pm_put_request(struct request *rq)
{
}
#endif

#endif /* _BLOCK_BLK_PM_H_ */


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 











    1 
























    1 
    1 
    1 





















    1 






































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API, aka timer wheel,
 *  hrtimers provide finer resolution and accuracy depending on system
 *  configuration and capabilities.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *        Based on the original timer wheel code
 *
 *        Help, testing, suggestions, bugfixes, improvements were
 *        provided by:
 *
 *        George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *        et. al.
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <trace/events/timer.h>

#include "tick-internal.h"

/*
 * Masks for selecting the soft and hard context timers from
 * cpu_base->active
 */
/* Bit distance between the hard half and the soft half of the mask. */
#define MASK_SHIFT                (HRTIMER_BASE_MONOTONIC_SOFT)
/* The low MASK_SHIFT bits: one bit per hard context clock base. */
#define HRTIMER_ACTIVE_HARD        ((1U << MASK_SHIFT) - 1)
/* The hard mask moved up by MASK_SHIFT: one bit per soft context clock base. */
#define HRTIMER_ACTIVE_SOFT        (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
/* Union of the hard and soft masks. */
#define HRTIMER_ACTIVE_ALL        (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        /*
         * Eight clock bases: the four hard bases come first, followed by
         * their _SOFT counterparts. Each soft base uses the same clockid
         * and the same get_time callback as the hard base of that clock.
         */
        .clock_base =
        {
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
                /* Soft context bases start here. */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
        }
};

/*
 * Lookup table from clockid to the hrtimer_base_type index of the matching
 * hard clock base. Every slot defaults to HRTIMER_MAX_CLOCK_BASES; only the
 * four clockids listed explicitly below map to a real base.
 */
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
        /* Make sure we catch unsupported clockids */
        [0 ... MAX_CLOCKS - 1]        = HRTIMER_MAX_CLOCK_BASES,

        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]        = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
        [CLOCK_TAI]                = HRTIMER_BASE_TAI,
};

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 * such that hrtimer_callback_running() can unconditionally dereference
 * timer->base->cpu_base
 */
static struct hrtimer_cpu_base migration_cpu_base = {
        /* Only the first clock base is set up; it points back at this cpu_base. */
        .clock_base = { {
                .cpu_base = &migration_cpu_base,
                .seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
                                                     &migration_cpu_base.lock),
        }, },
};

/*
 * Placeholder base stored in timer->base while a timer is moved between
 * CPU bases; lock_hrtimer_base() spins while it observes this value.
 */
#define migration_base        migration_cpu_base.clock_base[0]

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = &migration_base and drop the lock: the timer
 * remains locked.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                                             unsigned long *flags)
{
        struct hrtimer_clock_base *base;

        /*
         * Loop until a base was locked which is still the timer's base.
         * On return base->cpu_base->lock is held and the interrupt state
         * is saved in *flags.
         */
        for (;;) {
                /* Snapshot: timer->base may change until the lock is held. */
                base = READ_ONCE(timer->base);
                /* &migration_base marks a timer in transit: just retry. */
                if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        /* Recheck under the lock that the base is unchanged. */
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * We do not migrate the timer when it is expiring before the next
 * event on the target cpu. When high resolution is enabled, we cannot
 * reprogram the target cpu hardware and we would cause it to fire
 * late. To keep it simple, we handle the high resolution enabled and
 * disabled case similar.
 *
 * Called with cpu_base->lock of target cpu held.
 */
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
        ktime_t timer_expiry;

        /* Remove the base offset before comparing with expires_next. */
        timer_expiry = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

        /* Non-zero means: the timer would expire before the target's next event. */
        if (timer_expiry < new_base->cpu_base->expires_next)
                return 1;

        return 0;
}

/*
 * Select the CPU base a timer should be queued on.
 *
 * With NO_HZ_COMMON, enabled timer migration and a non-pinned timer this is
 * the per-CPU base of the CPU returned by get_nohz_timer_target(). In every
 * other case @base is handed back unchanged.
 */
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
                                         int pinned)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
        if (static_branch_likely(&timers_migration_enabled)) {
                if (!pinned) {
                        int target_cpu = get_nohz_timer_target();

                        return &per_cpu(hrtimer_bases, target_cpu);
                }
        }
#endif
        return base;
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *        - NO_HZ_COMMON is enabled
 *        - timer migration is enabled
 *        - the timer callback is not running
 *        - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
{
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
        /* The clock base index stays the same, only the CPU base may change. */
        int basenum = base->index;

        this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
        /* Re-evaluated after each fallback to this CPU's base. */
        new_base = &new_cpu_base->clock_base[basenum];

        if (base != new_base) {
                /*
                 * We are trying to move timer to new_base.
                 * However we can't change timer's base while it is running,
                 * so we keep it on the same CPU. No hassle vs. reprogramming
                 * the event source in the high resolution case. The softirq
                 * code will take care of this when the timer function has
                 * completed. There is no conflict as we hold the lock until
                 * the timer is enqueued.
                 */
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;

                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);

                /*
                 * The remote target was rejected by hrtimer_check_target():
                 * take the old base lock again, restore timer->base and
                 * retry with this CPU's base as the target.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
                        new_cpu_base = this_cpu_base;
                        WRITE_ONCE(timer->base, base);
                        goto again;
                }
                /* Migration done; new_base's cpu_base lock is held on return. */
                WRITE_ONCE(timer->base, new_base);
        } else {
                /*
                 * The timer already sits on the selected base. If that base
                 * is remote and hrtimer_check_target() rejects it, retry
                 * with this CPU's base.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        new_cpu_base = this_cpu_base;
                        goto again;
                }
        }
        return new_base;
}

#else /* CONFIG_SMP */

/*
 * UP variant: lock the cpu_base of the timer's clock base, saving the
 * interrupt state in *flags, and return that clock base.
 */
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
        struct hrtimer_clock_base *locked_base;

        locked_base = timer->base;
        raw_spin_lock_irqsave(&locked_base->cpu_base->lock, *flags);
        return locked_base;
}

/* !CONFIG_SMP: nothing to switch, the expression is just the passed-in base. */
# define switch_hrtimer_base(t, b, p)        (b)

#endif        /* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
/*
 * Divide a ktime value by a nanosecond value
 */
s64 __ktime_divns(const ktime_t kt, s64 div)
{
        s64 ns = ktime_to_ns(kt);
        u64 magnitude;
        int shift;

        /* Divide the absolute value; the sign is reapplied on return. */
        magnitude = ns < 0 ? -ns : ns;

        /*
         * do_div() is fed a 32-bit divisor: halve the divisor until it
         * fits and scale the dividend down by the same number of bits.
         */
        for (shift = 0; div >> 32; shift++)
                div >>= 1;

        magnitude >>= shift;
        do_div(magnitude, (u32) div);

        if (ns < 0)
                return -magnitude;

        return magnitude;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values and do a safety check for overflow:
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
        ktime_t sum = ktime_add_unsafe(lhs, rhs);

        /* A non-negative sum that is not below either operand is kept. */
        if (sum >= 0 && sum >= lhs && sum >= rhs)
                return sum;

        /*
         * Otherwise saturate at KTIME_SEC_MAX seconds, the maximum
         * timeout which can be returned to user space in a timespec.
         */
        return ktime_set(KTIME_SEC_MAX, 0);
}

EXPORT_SYMBOL_GPL(ktime_add_safe);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr hrtimer_debug_descr;

/* debug_hint callback: return the callback function of the timer at @addr. */
static void *hrtimer_debug_hint(void *addr)
{
        struct hrtimer *timer = addr;

        return timer->function;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        /* Only the ACTIVE state is fixed up. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Cancel the timer first, then redo the initialization. */
        hrtimer_cancel(timer);
        debug_object_init(timer, &hrtimer_debug_descr);
        return true;
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
        /* Activating an already active timer only triggers a warning. */
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        /* Nothing is repaired in any state. */
        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        /* Only the ACTIVE state is fixed up. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Cancel the timer first, then redo the free. */
        hrtimer_cancel(timer);
        debug_object_free(timer, &hrtimer_debug_descr);
        return true;
}

/*
 * debugobjects descriptor for hrtimers: the object name plus the hint and
 * fixup callbacks defined above.
 */
static const struct debug_obj_descr hrtimer_debug_descr = {
        .name                = "hrtimer",
        .debug_hint        = hrtimer_debug_hint,
        .fixup_init        = hrtimer_fixup_init,
        .fixup_activate        = hrtimer_fixup_activate,
        .fixup_free        = hrtimer_fixup_free,
};

/* Report the initialization of @timer to debugobjects. */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
        debug_object_init(timer, &hrtimer_debug_descr);
}

/* Report the activation of @timer to debugobjects. @mode is not used here. */
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode)
{
        debug_object_activate(timer, &hrtimer_debug_descr);
}

/* Report the deactivation of @timer to debugobjects. */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
        debug_object_deactivate(timer, &hrtimer_debug_descr);
}

static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode);

/*
 * hrtimer_init_on_stack - initialize a timer and register it with
 * debugobjects through debug_object_init_on_stack()
 * @timer:        the timer to be initialized
 * @clock_id:        passed on to __hrtimer_init()
 * @mode:        passed on to __hrtimer_init()
 */
void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        debug_object_init_on_stack(timer, &hrtimer_debug_descr);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);

static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode);

/*
 * hrtimer_init_sleeper_on_stack - initialize a sleeper and register its
 * embedded timer with debugobjects through debug_object_init_on_stack()
 * @sl:                the sleeper to be initialized
 * @clock_id:        passed on to __hrtimer_init_sleeper()
 * @mode:        passed on to __hrtimer_init_sleeper()
 */
void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);

/* Report to debugobjects that @timer is being freed. */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
        debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);

#else

/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debugobjects hooks are empty. */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

/* Timer initialization hook: debugobjects first, then the hrtimer_init tracepoint. */
static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
           enum hrtimer_mode mode)
{
        debug_hrtimer_init(timer);
        trace_hrtimer_init(timer, clockid, mode);
}

/* Timer activation hook: debugobjects first, then the hrtimer_start tracepoint. */
static inline void debug_activate(struct hrtimer *timer,
                                  enum hrtimer_mode mode)
{
        debug_hrtimer_activate(timer, mode);
        trace_hrtimer_start(timer, mode);
}

/* Timer deactivation hook: debugobjects first, then the hrtimer_cancel tracepoint. */
static inline void debug_deactivate(struct hrtimer *timer)
{
        debug_hrtimer_deactivate(timer);
        trace_hrtimer_cancel(timer);
}

/*
 * Return the clock base for the lowest set bit in *@active and clear that
 * bit, or NULL once *@active is empty.
 */
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
        if (*active) {
                unsigned int bit = __ffs(*active);

                /* Consume the bit so the next call moves on to the next base. */
                *active &= ~(1U << bit);
                return &cpu_base->clock_base[bit];
        }

        return NULL;
}

/*
 * Iterate @base over the clock bases of @cpu_base whose bits are set in
 * @active. @active is modified: __next_base() clears one bit per iteration.
 */
#define for_each_active_base(base, cpu_base, active)        \
        while ((base = __next_base((cpu_base), &(active))))

/*
 * Find the earliest expiry among the first timers of the clock bases
 * selected by @active.
 *
 * @cpu_base:        CPU base whose clock bases are scanned
 * @exclude:        timer to ignore, or NULL. If it is the first timer of a
 *                base, the following timer of that queue is used instead.
 *                When set, cpu_base->next_timer and
 *                cpu_base->softirq_next_timer are left untouched.
 * @active:        bitmask of the clock bases to look at
 * @expires_next: starting minimum; only earlier expiries replace it
 *
 * Returns the resulting minimum, with negative values replaced by 0.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
                                         const struct hrtimer *exclude,
                                         unsigned int active,
                                         ktime_t expires_next)
{
        struct hrtimer_clock_base *base;
        ktime_t expires;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;
                struct hrtimer *timer;

                next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                if (timer == exclude) {
                        /* Get to the next timer in the queue. */
                        next = timerqueue_iterate_next(next);
                        if (!next)
                                continue;

                        timer = container_of(next, struct hrtimer, node);
                }
                /* Subtract the base offset so all bases are comparable. */
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires < expires_next) {
                        expires_next = expires;

                        /* Skip cpu_base update if a timer is being excluded. */
                        if (exclude)
                                continue;

                        /* Remember the new earliest timer of its context. */
                        if (timer->is_soft)
                                cpu_base->softirq_next_timer = timer;
                        else
                                cpu_base->next_timer = timer;
                }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
         * the clock bases so the result might be negative. Fix it up
         * to prevent a false positive in clockevents_program_event().
         */
        if (expires_next < 0)
                expires_next = 0;
        return expires_next;
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
        unsigned int active;
        struct hrtimer *next_timer = NULL;
        ktime_t expires_next = KTIME_MAX;

        /* Soft bases are only scanned while no softirq is already activated. */
        if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                /* Reset first; the scan sets it again if a soft timer is found. */
                cpu_base->softirq_next_timer = NULL;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL,
                                                         active, KTIME_MAX);

                next_timer = cpu_base->softirq_next_timer;
        }

        if (active_mask & HRTIMER_ACTIVE_HARD) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
                /*
                 * Seed next_timer with the soft result (or NULL). The hard
                 * scan starts at the soft minimum, so a hard timer only
                 * takes over when it expires earlier.
                 */
                cpu_base->next_timer = next_timer;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
                                                         expires_next);
        }

        return expires_next;
}

/*
 * Recompute the next soft and hard expiry of @cpu_base and return the
 * earlier of the two. cpu_base->softirq_expires_next is refreshed unless the
 * softirq is already activated; cpu_base->expires_next is not written here.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t expires_next, soft = KTIME_MAX;

        /*
         * If the soft interrupt has already been activated, ignore the
         * soft bases. They will be handled in the already raised soft
         * interrupt.
         */
        if (!cpu_base->softirq_activated) {
                soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
                /*
                 * Update the soft expiry time. clock_settime() might have
                 * affected it.
                 */
                cpu_base->softirq_expires_next = soft;
        }

        expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
        /*
         * If a softirq timer is expiring first, update cpu_base->next_timer
         * and program the hardware with the soft expiry time.
         */
        if (expires_next > soft) {
                cpu_base->next_timer = cpu_base->softirq_next_timer;
                expires_next = soft;
        }

        return expires_next;
}

/*
 * Refresh the REALTIME, BOOTTIME and TAI offsets of @base through
 * ktime_get_update_offsets_now() and copy them to the matching soft clock
 * bases. Returns the time value delivered by that call.
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
        struct hrtimer_clock_base *bases = base->clock_base;
        ktime_t now;

        now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
                                           &bases[HRTIMER_BASE_REALTIME].offset,
                                           &bases[HRTIMER_BASE_BOOTTIME].offset,
                                           &bases[HRTIMER_BASE_TAI].offset);

        /* Keep the soft bases in sync with their hard counterparts. */
        bases[HRTIMER_BASE_REALTIME_SOFT].offset =
                bases[HRTIMER_BASE_REALTIME].offset;
        bases[HRTIMER_BASE_BOOTTIME_SOFT].offset =
                bases[HRTIMER_BASE_BOOTTIME].offset;
        bases[HRTIMER_BASE_TAI_SOFT].offset =
                bases[HRTIMER_BASE_TAI].offset;

        return now;
}

/*
 * Is the high resolution mode active ?
 */
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
        /* Without CONFIG_HIGH_RES_TIMERS the answer is always "no". */
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return cpu_base->hres_active;
}

static inline int hrtimer_hres_active(void)
{
        return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}

/*
 * Recompute the next expiry from both the hard and the soft queues and
 * reprogram the event source with it.
 *
 * @cpu_base:        the CPU base to reprogram
 * @skip_equal:        when non-zero, do nothing if the recomputed expiry equals
 *                the cached cpu_base->expires_next
 *
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
        ktime_t expires_next;

        expires_next = hrtimer_update_next_event(cpu_base);

        if (skip_equal && expires_next == cpu_base->expires_next)
                return;

        /* Cache the new value even if the hardware is not touched below. */
        cpu_base->expires_next = expires_next;

        /*
         * If hres is not active, hardware does not have to be
         * reprogrammed yet.
         *
         * If a hang was detected in the last timer interrupt then we
         * leave the hang delay active in the hardware. We want the
         * system to make progress. That also prevents the following
         * scenario:
         * T1 expires 50ms from now
         * T2 expires 5s from now
         *
         * T1 is removed, so this code is called and would reprogram
         * the hardware to 5s from now. Any hrtimer_start after that
         * will not reprogram the hardware due to hang_detected being
         * set. So we'd effectively block all timers until the T2 event
         * fires.
         */
        if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
                return;

        tick_program_event(cpu_base->expires_next, 1);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 *
 * Defaults to true and can be overridden with the "highres=" boot
 * parameter, see setup_hrtimer_hres().
 */
static bool hrtimer_hres_enabled __read_mostly  = true;
/*
 * Starts out as LOW_RES_NSEC; hrtimer_switch_to_hres() sets it to
 * HIGH_RES_NSEC once the switch to high resolution mode succeeded.
 */
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);

/*
 * Handler for the "highres=" boot parameter: enable / disable high
 * resolution mode.
 *
 * The parsed value is stored in hrtimer_hres_enabled. Returns 1 when
 * kstrtobool() reported success for @str, 0 otherwise.
 */
static int __init setup_hrtimer_hres(char *str)
{
        int err = kstrtobool(str, &hrtimer_hres_enabled);

        return !err;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
 *
 * Returns the current value of hrtimer_hres_enabled, i.e. whether high
 * resolution mode is allowed, not whether it is active on a CPU.
 */
static inline int hrtimer_is_hres_enabled(void)
{
        return hrtimer_hres_enabled;
}

/*
 * Retrigger next event is called after clock was set
 *
 * Refreshes the clock base offsets of this CPU and forces the event
 * source to be reprogrammed. Does nothing unless high resolution mode
 * is active on this CPU.
 *
 * @arg: unused
 *
 * Called with interrupts disabled via on_each_cpu()
 */
static void retrigger_next_event(void *arg)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);

        if (__hrtimer_hres_active(cpu_base)) {
                raw_spin_lock(&cpu_base->lock);
                hrtimer_update_base(cpu_base);
                hrtimer_force_reprogram(cpu_base, 0);
                raw_spin_unlock(&cpu_base->lock);
        }
}

/*
 * Switch the current CPU to high resolution mode
 *
 * If tick_init_highres() fails a warning is printed and the CPU base is
 * left untouched.
 */
static void hrtimer_switch_to_hres(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        int err = tick_init_highres();

        if (err) {
                pr_warn("Could not switch to high resolution mode on CPU %u\n",
                        cpu_base->cpu);
                return;
        }

        cpu_base->hres_active = 1;
        hrtimer_resolution = HIGH_RES_NSEC;

        tick_setup_sched_timer();

        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
}

#else

/*
 * CONFIG_HIGH_RES_TIMERS=n: high resolution mode is never enabled and
 * both switching to it and retriggering the next event are no-ops.
 */
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
static inline void retrigger_next_event(void *arg) { }

#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * When a timer is enqueued and expires earlier than the already enqueued
 * timers, we have to check, whether it expires earlier than the timer for
 * which the clock event device was armed.
 *
 * @timer:      the timer which has been enqueued
 * @reprogram:  only evaluated for soft timers: when false, processing
 *              stops after the softirq expiry bookkeeping was updated
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
        struct hrtimer_cpu_base *local_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *clock_base = timer->base;
        ktime_t expires;

        expires = ktime_sub(hrtimer_get_expires(timer), clock_base->offset);

        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

        /*
         * A CLOCK_REALTIME timer might be requested with an absolute
         * expiry time which is less than the base offset. Clamp it to 0.
         */
        if (expires < 0)
                expires = 0;

        if (timer->is_soft) {
                /*
                 * A soft hrtimer could be started on a remote CPU. In this
                 * case softirq_expires_next needs to be updated on the
                 * remote CPU. The soft hrtimer will not expire before the
                 * first hard hrtimer on the remote CPU -
                 * hrtimer_check_target() prevents this case.
                 */
                struct hrtimer_cpu_base *soft_base = clock_base->cpu_base;

                /* Nothing to update: softirq active or not the earliest */
                if (soft_base->softirq_activated ||
                    !ktime_before(expires, soft_base->softirq_expires_next))
                        return;

                soft_base->softirq_next_timer = timer;
                soft_base->softirq_expires_next = expires;

                if (!reprogram ||
                    !ktime_before(expires, soft_base->expires_next))
                        return;
        }

        /*
         * Leave the hardware alone when:
         *
         *  - the timer is not on the current cpu: we cannot reprogram
         *    the other cpus clock event device.
         *
         *  - the hrtimer interrupt is running: it will reevaluate the
         *    clock bases and reprogram the clock event device. The
         *    callbacks are always executed in hard interrupt context so
         *    we don't need an extra check for a running callback.
         *
         *  - the timer does not expire before the currently cached
         *    next expiry.
         */
        if (clock_base->cpu_base != local_base || local_base->in_hrtirq ||
            expires >= local_base->expires_next)
                return;

        /* This timer is the new first expiring one */
        local_base->next_timer = timer;
        local_base->expires_next = expires;

        /*
         * If hres is not active, hardware does not have to be
         * programmed yet.
         *
         * If a hang was detected in the last timer interrupt then we
         * do not schedule a timer which is earlier than the expiry
         * which we enforced in the hang detection. We want the system
         * to make progress.
         */
        if (!__hrtimer_hres_active(local_base) || local_base->hang_detected)
                return;

        /*
         * Program the timer hardware. We enforce the expiry for
         * events which are already in the past.
         */
        tick_program_event(expires, 1);
}

/*
 * Clock realtime was set
 *
 * Change the offset of the realtime clock vs. the monotonic
 * clock.
 *
 * We might have to reprogram the high resolution timer interrupt. On
 * SMP we call the architecture specific code to retrigger _all_ high
 * resolution timer interrupts. On UP we just disable interrupts and
 * call the high resolution interrupt code.
 */
void clock_was_set(void)
{
#ifdef CONFIG_HIGH_RES_TIMERS
        /* Retrigger the CPU local events everywhere */
        on_each_cpu(retrigger_next_event, NULL, 1);
#endif
        timerfd_clock_was_set();
}

/* Work callback of hrtimer_work: invokes clock_was_set(). @work is unused. */
static void clock_was_set_work(struct work_struct *work)
{
        clock_was_set();
}

/* Work item which is scheduled by clock_was_set_delayed() */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping and resume code to reprogram the hrtimer
 * interrupt device on all cpus and to notify timerfd.
 *
 * Nothing is reprogrammed here directly: hrtimer_work is scheduled and
 * its callback invokes clock_was_set().
 */
void clock_was_set_delayed(void)
{
        schedule_work(&hrtimer_work);
}

/*
 * During resume we might have to reprogram the high resolution timer
 * interrupt on all online CPUs.  However, all other CPUs will be
 * stopped with interrupts disabled so the clock_was_set() call
 * must be deferred.
 *
 * Must be called with interrupts disabled.
 */
void hrtimers_resume(void)
{
        lockdep_assert_irqs_disabled();
        /* Retrigger on the local CPU */
        retrigger_next_event(NULL);
        /* And schedule a retrigger for all others */
        clock_was_set_delayed();
}

/*
 * Counterpart to lock_hrtimer_base above:
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward - forward the timer expiry
 * @timer:        hrtimer to forward
 * @now:        forward past this time
 * @interval:        the interval to forward
 *
 * Forward the timer expiry so it will expire in the future.
 * Returns the number of overruns. 0 is returned without touching the
 * expiry when the timer has not expired yet relative to @now, or (with
 * a warning) when the timer is still enqueued.
 *
 * Can be safely called from the callback function of @timer. If
 * called from other contexts @timer must neither be enqueued nor
 * running the callback and the caller needs to take care of
 * serialization.
 *
 * Note: This only updates the timer expiry value and does not requeue
 * the timer.
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
        ktime_t behind = ktime_sub(now, hrtimer_get_expires(timer));
        u64 overruns = 1;

        if (behind < 0 || WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
                return 0;

        /* Never forward in steps smaller than the timer resolution */
        if (interval < hrtimer_resolution)
                interval = hrtimer_resolution;

        if (unlikely(behind >= interval)) {
                s64 step = ktime_to_ns(interval);

                /* Skip all the whole intervals which were missed */
                overruns = ktime_divns(behind, step);
                hrtimer_add_expires_ns(timer, step * overruns);
                if (hrtimer_get_expires_tv64(timer) > now)
                        return overruns;
                /*
                 * The expiry is still not past @now: account for one
                 * more interval, which is added below.
                 */
                overruns++;
        }
        hrtimer_add_expires(timer, interval);

        return overruns;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns 1 when the new timer is the leftmost timer in the tree.
 */
static int enqueue_hrtimer(struct hrtimer *timer,
                           struct hrtimer_clock_base *base,
                           enum hrtimer_mode mode)
{
        debug_activate(timer, mode);
        WARN_ON_ONCE(!base->cpu_base->online);

        base->cpu_base->active_bases |= 1 << base->index;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);

        return timerqueue_add(&base->active, &timer->node);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * Caller must hold the base lock.
 *
 * timer->state is set to @newstate in any case; the queue is only
 * touched when the timer was enqueued.
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * reprogram to zero. This is useful, when the context does a reprogramming
 * anyway (e.g. timer interrupt)
 */
static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
                             u8 newstate, int reprogram)
{
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        u8 old_state = timer->state;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, newstate);
        if (!(old_state & HRTIMER_STATE_ENQUEUED))
                return;

        /* Clear the active bit of this base when the removal emptied it */
        if (!timerqueue_del(&base->active, &timer->node))
                cpu_base->active_bases &= ~(1 << base->index);

        /*
         * Note: If reprogram is false we do not update
         * cpu_base->next_timer. This happens when we remove the first
         * timer on a remote cpu. No harm as we never dereference
         * cpu_base->next_timer. So the worst thing that can happen is
         * a superfluous call to hrtimer_force_reprogram() on the
         * remote cpu later on if the same timer gets enqueued again.
         */
        if (!reprogram || timer != cpu_base->next_timer)
                return;

        hrtimer_force_reprogram(cpu_base, 1);
}

/*
 * remove hrtimer, called with base lock held
 *
 * @restart:    the timer is about to be requeued; its state is preserved
 * @keep_local: together with @restart, suppress the reprogramming on
 *              removal
 *
 * Returns 1 when the timer was enqueued and has been removed, 0 otherwise.
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
               bool restart, bool keep_local)
{
        u8 state = timer->state;
        bool reprogram;

        if (!(state & HRTIMER_STATE_ENQUEUED))
                return 0;

        /*
         * Remove the timer and force reprogramming when high
         * resolution mode is active and the timer is on the current
         * CPU. If we remove a timer on another CPU, reprogramming is
         * skipped. The interrupt event on this CPU is fired and
         * reprogramming happens in the interrupt handler. This is a
         * rare case and less expensive than a smp call.
         */
        debug_deactivate(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

        /*
         * If the timer is local and about to be restarted, avoid
         * programming it twice (on removal and a moment later when
         * it's requeued). If it is not restarted then reprogramming is
         * required for a local timer and the timer becomes inactive.
         */
        if (restart)
                reprogram &= !keep_local;
        else
                state = HRTIMER_STATE_INACTIVE;

        __remove_hrtimer(timer, base, state, reprogram);
        return 1;
}

/*
 * Adjust the expiry time @tim for CONFIG_TIME_LOW_RES and record in
 * timer->is_rel whether @mode is relative. Without CONFIG_TIME_LOW_RES
 * @tim is returned unchanged and @timer is not touched.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
                                            const enum hrtimer_mode mode)
{
        if (!IS_ENABLED(CONFIG_TIME_LOW_RES))
                return tim;

        /*
         * CONFIG_TIME_LOW_RES indicates that the system has no way to return
         * granular time values. For relative timers we add hrtimer_resolution
         * (i.e. one jiffie) to prevent short timeouts.
         */
        timer->is_rel = mode & HRTIMER_MODE_REL;

        return timer->is_rel ? ktime_add_safe(tim, hrtimer_resolution) : tim;
}

/*
 * Look up the next expiring soft timer of @cpu_base and hand it to
 * hrtimer_reprogram(). Nothing is done when no soft timer is pending.
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
        /* Find the next SOFT expiration */
        ktime_t soft_expires = __hrtimer_get_next_event(cpu_base,
                                                        HRTIMER_ACTIVE_SOFT);

        /*
         * Reprogramming needs to be triggered, even if the next soft
         * hrtimer expires at the same time as the next hard
         * hrtimer. cpu_base->softirq_expires_next needs to be updated!
         *
         * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
         * cpu_base->*expires_next is only set by hrtimer_reprogram()
         */
        if (soft_expires != KTIME_MAX)
                hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

/*
 * (Re)start @timer on @base, whose lock must be held: dequeue it if
 * necessary, set the new expiry and enqueue it again.
 *
 * Returns non-zero when the timer became the first expiring timer of
 * its (possibly switched) base and the caller has to take care of
 * reprogramming. Returns 0 otherwise, including the case where the
 * reprogramming was already enforced here.
 */
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
{
        struct hrtimer_clock_base *new_base;
        bool force_local, first;

        /*
         * If the timer is on the local cpu base and is the first expiring
         * timer then this might end up reprogramming the hardware twice
         * (on removal and on enqueue). To avoid that, prevent the
         * reprogram on removal, keep the timer local to the current CPU
         * and enforce reprogramming after it is queued no matter whether
         * it is the new first expiring timer again or not.
         */
        force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
                      base->cpu_base->next_timer == timer;

        /*
         * Remove an active timer from the queue. In case it is not queued
         * on the current CPU, make sure that remove_hrtimer() updates the
         * remote data correctly.
         *
         * With force_local set the reprogramming on removal is skipped;
         * it is enforced once after the enqueue below instead.
         */
        remove_hrtimer(timer, base, true, force_local);

        /* Relative expiry: convert to absolute time of the base clock */
        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());

        tim = hrtimer_update_lowres(timer, tim, mode);

        hrtimer_set_expires_range_ns(timer, tim, delta_ns);

        /* Switch the timer base, if necessary: */
        new_base = force_local ? base :
                switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

        first = enqueue_hrtimer(timer, new_base, mode);

        if (force_local) {
                /*
                 * Timer was forced to stay on the current CPU to avoid
                 * reprogramming on removal and enqueue. Force reprogram
                 * the hardware by evaluating the new first expiring timer.
                 */
                hrtimer_force_reprogram(new_base->cpu_base, 1);
                return 0;
        }

        return first;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @delta_ns:        "slack" range for the timer
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * A timer without a callback function is not started; a warning is
 * emitted once instead.
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                            u64 delta_ns, const enum hrtimer_mode mode)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int first;

        if (WARN_ON_ONCE(!timer->function))
                return;

        /*
         * With PREEMPT_RT check the hard expiry mode because unmarked
         * timers are moved to softirq expiry. On CONFIG_PREEMPT_RT = n
         * check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
         * match.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
        else
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);

        base = lock_hrtimer_base(timer, &flags);

        first = __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
        if (first)
                hrtimer_reprogram(timer, true);

        unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:        hrtimer to stop
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int ret;

        /*
         * Check lockless first. If the timer is not active (neither
         * enqueued nor running the callback), there is nothing to do
         * here. The base lock does not serialize against a concurrent
         * enqueue, so we can avoid taking it.
         */
        if (!hrtimer_active(timer))
                return 0;

        base = lock_hrtimer_base(timer, &flags);

        if (hrtimer_callback_running(timer))
                ret = -1;
        else
                ret = remove_hrtimer(timer, base, false, false);

        unlock_hrtimer_base(timer, &flags);

        return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT: initialize the softirq expiry lock of @base */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
        spin_lock_init(&base->softirq_expiry_lock);
}

/* PREEMPT_RT: acquire the softirq expiry lock of @base */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
{
        spin_lock(&base->softirq_expiry_lock);
}

/* PREEMPT_RT: release the softirq expiry lock of @base */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
{
        spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop softirq_expiry_lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * cpu_base->lock is dropped (restoring @flags) around that handover and
 * taken again with interrupts disabled before returning.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
                                      unsigned long flags)
{
        /* timer_waiters is raised by hrtimer_cancel_wait_running() */
        if (atomic_read(&cpu_base->timer_waiters)) {
                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
                spin_unlock(&cpu_base->softirq_expiry_lock);
                spin_lock(&cpu_base->softirq_expiry_lock);
                raw_spin_lock_irq(&cpu_base->lock);
        }
}

#ifdef CONFIG_SMP
/* Check whether @base is the migration_base */
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return base == &migration_base;
}
#else
/* CONFIG_SMP=n: a timer is never on the migration base */
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return false;
}
#endif

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then cancelling the timer
 * synchronously (see hrtimer_cancel()) can lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        /* Lockless read. Prevent the compiler from reloading it below */
        struct hrtimer_clock_base *base = READ_ONCE(timer->base);

        /*
         * Just relax if the timer expires in hard interrupt context or if
         * it is currently on the migration base.
         */
        if (!timer->is_soft || is_migration_base(base)) {
                cpu_relax();
                return;
        }

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         *
         * The raised timer_waiters count is what makes
         * hrtimer_sync_wait_running() hand over the expiry lock.
         */
        atomic_inc(&base->cpu_base->timer_waiters);
        spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
        atomic_dec(&base->cpu_base->timer_waiters);
        spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
/*
 * CONFIG_PREEMPT_RT=n: there is no softirq expiry lock, so all expiry
 * lock operations are no-ops.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
                                             unsigned long flags) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:        the timer to be cancelled
 *
 * Retries hrtimer_try_to_cancel() for as long as it reports that the
 * callback is running, calling hrtimer_cancel_wait_running() in between.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
        for (;;) {
                int ret = hrtimer_try_to_cancel(timer);

                if (ret >= 0)
                        return ret;

                /* Callback is running: wait for it and try again */
                hrtimer_cancel_wait_running(timer);
        }
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 * @adjust:        adjust relative timers when CONFIG_TIME_LOW_RES=y
 *
 * The remaining time is read with the timer base locked.
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
        unsigned long flags;
        ktime_t remaining;

        lock_hrtimer_base(timer, &flags);
        remaining = (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) ?
                        hrtimer_expires_remaining_adjusted(timer) :
                        hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);

        return remaining;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the next expiry event of this CPU
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 * KTIME_MAX is also returned when high resolution mode is active on
 * this CPU, as the queues are not evaluated in that case.
 */
u64 hrtimer_get_next_event(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;
        u64 next_expiry;

        raw_spin_lock_irqsave(&cpu_base->lock, irqflags);

        if (__hrtimer_hres_active(cpu_base))
                next_expiry = KTIME_MAX;
        else
                next_expiry = __hrtimer_get_next_event(cpu_base,
                                                       HRTIMER_ACTIVE_ALL);

        raw_spin_unlock_irqrestore(&cpu_base->lock, irqflags);

        return next_expiry;
}

/**
 * hrtimer_next_event_without - next expiry event w/o one timer
 * @exclude:        timer to exclude
 *
 * Returns the next expiry time over all timers except for the @exclude one or
 * KTIME_MAX if none of them is pending. KTIME_MAX is also returned when
 * high resolution mode is not active on this CPU.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        u64 next_expiry = KTIME_MAX;
        unsigned long flags;
        unsigned int mask;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (!__hrtimer_hres_active(cpu_base))
                goto out_unlock;

        /* Soft bases are only considered while the softirq is not active */
        if (!cpu_base->softirq_activated) {
                mask = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                next_expiry = __hrtimer_next_event_base(cpu_base, exclude,
                                                        mask, KTIME_MAX);
        }

        /* Hard bases, seeded with the result of the soft bases */
        mask = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
        next_expiry = __hrtimer_next_event_base(cpu_base, exclude, mask,
                                                next_expiry);

out_unlock:
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next_expiry;
}
#endif

/*
 * Translate @clock_id into a clock base index by means of
 * hrtimer_clock_to_base_table[].
 *
 * Ids outside of [0, MAX_CLOCKS) and ids whose table entry is
 * HRTIMER_MAX_CLOCK_BASES are invalid: a warning is emitted and
 * HRTIMER_BASE_MONOTONIC is returned instead.
 */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
        /*
         * The lower bound has to be checked as well: @clock_id is handled
         * as a signed value (see the %d below), and a negative id must
         * never be used as an index into the table.
         */
        if (likely(clock_id >= 0 && clock_id < MAX_CLOCKS)) {
                int base = hrtimer_clock_to_base_table[clock_id];

                if (likely(base != HRTIMER_MAX_CLOCK_BASES))
                        return base;
        }
        WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
        return HRTIMER_BASE_MONOTONIC;
}

/*
 * Zero @timer and bind it to the clock base of the current CPU which
 * matches @clock_id and the hard/soft expiry mode derived from @mode.
 */
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        struct hrtimer_cpu_base *cpu_base;
        bool softtimer;
        int base;

        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context for latency reasons and because the callbacks
         * can invoke functions which might sleep on RT, e.g. spin_lock().
         */
        softtimer = (mode & HRTIMER_MODE_SOFT) ||
                    (IS_ENABLED(CONFIG_PREEMPT_RT) &&
                     !(mode & HRTIMER_MODE_HARD));

        memset(timer, 0, sizeof(*timer));

        cpu_base = raw_cpu_ptr(&hrtimer_bases);

        /*
         * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they need to become CLOCK_MONOTONIC to
         * ensure POSIX compliance.
         */
        if (clock_id == CLOCK_REALTIME && (mode & HRTIMER_MODE_REL))
                clock_id = CLOCK_MONOTONIC;

        /* The soft bases make up the second half of the clock base array */
        base = hrtimer_clockid_to_base(clock_id);
        if (softtimer)
                base += HRTIMER_MAX_CLOCK_BASES / 2;

        timer->is_soft = softtimer;
        timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
        timer->base = &cpu_base->clock_base[base];
        timerqueue_init(&timer->node);
}

/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:        the timer to be initialized
 * @clock_id:        the clock to be used
 * @mode:       The modes which are relevant for initialization:
 *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *              HRTIMER_MODE_REL_SOFT
 *
 *              The PINNED variants of the above can be handed in,
 *              but the PINNED bit is ignored as pinning happens
 *              when the hrtimer is started
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                  enum hrtimer_mode mode)
{
        /* Debug hook first, then the actual initialization */
        debug_init(timer, clock_id, mode);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);

/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * Lockless: the check is repeated when either the sequence count of the
 * base or timer->base itself changed while looking at the timer.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;

        do {
                base = READ_ONCE(timer->base);
                seq = raw_read_seqcount_begin(&base->seq);

                /* Any positive observation is conclusive, no retry needed */
                if (timer->state != HRTIMER_STATE_INACTIVE ||
                    base->running == timer)
                        return true;

        } while (read_seqcount_retry(&base->seq, seq) ||
                 base != READ_ONCE(timer->base));

        return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:        the timer is queued
 *  - callback:        the timer is being run
 *  - post:        the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 *
 * __run_hrtimer() is entered and left with cpu_base->lock held; the lock is
 * dropped (restoring @flags) while the timer callback runs.
 */

static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now,
                          unsigned long flags) __must_hold(&cpu_base->lock)
{
        enum hrtimer_restart (*fn)(struct hrtimer *);
        bool expires_in_hardirq;
        int restart;

        lockdep_assert_held(&cpu_base->lock);

        debug_hrtimer_deactivate(timer);
        base->running = timer;

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        /* Dequeue without reprogramming (reprogram == 0) */
        __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        fn = timer->function;

        /*
         * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
         * timer is restarted with a period then it becomes an absolute
         * timer. If it's not restarted it does not matter.
         */
        if (IS_ENABLED(CONFIG_TIME_LOW_RES))
                timer->is_rel = false;

        /*
         * The timer is marked as running in the CPU base, so it is
         * protected against migration to a different CPU even if the lock
         * is dropped.
         */
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        trace_hrtimer_expire_entry(timer, now);
        expires_in_hardirq = lockdep_hrtimer_enter(timer);

        /* Invoke the callback with cpu_base->lock dropped */
        restart = fn(timer);

        lockdep_hrtimer_exit(expires_in_hardirq);
        trace_hrtimer_expire_exit(timer);
        raw_spin_lock_irq(&cpu_base->lock);

        /*
         * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
         * hrtimer_start_range_ns() can have popped in and enqueued the timer
         * for us already.
         */
        if (restart != HRTIMER_NORESTART &&
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        WARN_ON_ONCE(base->running != timer);
        base->running = NULL;
}

/*
 * Expire the due timers on the clock bases of @cpu_base selected by
 * @active_mask.
 *
 * @cpu_base:    per-CPU hrtimer base whose queues are processed
 * @now:         current time; each clock base's offset is added to it
 * @flags:       saved interrupt flags, handed on to __run_hrtimer() and
 *               hrtimer_sync_wait_running()
 * @active_mask: mask of clock bases to visit (callers pass
 *               HRTIMER_ACTIVE_SOFT or HRTIMER_ACTIVE_HARD); only bases
 *               which are also set in cpu_base->active_bases are processed
 *
 * The callers in this file invoke this with cpu_base->lock held.
 * __run_hrtimer() drops that lock around the timer callback and takes it
 * again before returning.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
                                 unsigned long flags, unsigned int active_mask)
{
        struct hrtimer_clock_base *base;
        unsigned int active = cpu_base->active_bases & active_mask;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *node;
                ktime_t basenow;

                /* Current time as seen by this clock base */
                basenow = ktime_add(now, base->offset);

                /*
                 * Always look at the first queued timer again:
                 * __run_hrtimer() removes the timer it runs and may
                 * enqueue it again.
                 */
                while ((node = timerqueue_getnext(&base->active))) {
                        struct hrtimer *timer;

                        timer = container_of(node, struct hrtimer, node);

                        /*
                         * The immediate goal for using the softexpires is
                         * minimizing wakeups, not running timers at the
                         * earliest interrupt after their soft expiration.
                         * This allows us to avoid using a Priority Search
                         * Tree, which can answer a stabbing query for
                         * overlapping intervals and instead use the simple
                         * BST we already have.
                         * We don't add extra wakeups by delaying timers that
                         * are right-of a not yet expired timer, because that
                         * timer will have to trigger a wakeup anyway.
                         */
                        if (basenow < hrtimer_get_softexpires_tv64(timer))
                                break;

                        __run_hrtimer(cpu_base, base, timer, &basenow, flags);
                        /*
                         * Only done for the soft timer bases.
                         * NOTE(review): presumably gives tasks waiting for
                         * a running soft timer a chance to proceed - confirm
                         * against hrtimer_sync_wait_running().
                         */
                        if (active_mask == HRTIMER_ACTIVE_SOFT)
                                hrtimer_sync_wait_running(cpu_base, flags);
                }
        }
}

/*
 * HRTIMER_SOFTIRQ handler (registered in hrtimers_init()): expire the due
 * timers on the soft clock bases of this CPU.
 *
 * @h: softirq action, not used here
 */
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        /* The expiry lock is taken outside of cpu_base->lock */
        hrtimer_cpu_base_lock_expiry(cpu_base);
        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        now = hrtimer_update_base(cpu_base);
        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

        /*
         * Soft expiry is finished: clear the flag which was set when the
         * softirq was raised.
         * NOTE(review): hrtimer_update_softirq_timer() presumably re-arms
         * the next soft expiry - confirm against its definition.
         */
        cpu_base->softirq_activated = 0;
        hrtimer_update_softirq_timer(cpu_base, true);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        hrtimer_cpu_base_unlock_expiry(cpu_base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * @dev: clock event device; its next_event is reset to KTIME_MAX here
 *
 * Expires the due timers on the hard clock bases of this CPU, raises
 * HRTIMER_SOFTIRQ when the soft expiry time has been reached and programs
 * the next event. If programming fails because the next expiry is already
 * in the past, the whole sequence is retried. After three failed attempts
 * a hang is recorded and the next event is pushed into the future.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires_next, now, entry_time, delta;
        unsigned long flags;
        int retries = 0;

        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        /* entry_time is kept to measure the time spent in here on a hang */
        entry_time = now = hrtimer_update_base(cpu_base);
retry:
        cpu_base->in_hrtirq = 1;
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
         * the migration code. This does not affect enqueueing of
         * timers which run their callback and need to be requeued on
         * this CPU.
         */
        cpu_base->expires_next = KTIME_MAX;

        /* Soft timers are due: hand them over to the softirq */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

        /* Reevaluate the clock bases for the [soft] next expiry */
        expires_next = hrtimer_update_next_event(cpu_base);
        /*
         * Store the new expiry value so the migration code can verify
         * against it.
         */
        cpu_base->expires_next = expires_next;
        cpu_base->in_hrtirq = 0;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Reprogramming necessary ? */
        if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }

        /*
         * The next timer was already expired due to:
         * - tracing
         * - long lasting callbacks
         * - being scheduled away when running in a VM
         *
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
         *
         * Acquire base lock for updating the offsets and retrieving
         * the current time.
         */
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
        /*
         * Give the system a chance to do something else than looping
         * here. We stored the entry time, so we know exactly how long
         * we spent here. We schedule the next event this amount of
         * time away.
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        delta = ktime_sub(now, entry_time);
        /*
         * NOTE(review): the cast keeps only the low bits which fit into an
         * unsigned int - confirm that this is acceptable for the statistic.
         */
        if ((unsigned int)delta > cpu_base->max_hang_time)
                cpu_base->max_hang_time = (unsigned int) delta;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. The enforced delay is capped at 100ms.
         */
        if (delta > 100 * NSEC_PER_MSEC)
                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
        else
                expires_next = ktime_add(now, delta);
        tick_program_event(expires_next, 1);
        pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}

/*
 * Invoke hrtimer_interrupt() by hand on the clock event device of this
 * CPU's tick device. Does nothing unless high resolution mode is active
 * and an event device is present.
 *
 * Called with interrupts disabled.
 */
static inline void __hrtimer_peek_ahead_timers(void)
{
        struct tick_device *tick_dev;

        if (!hrtimer_hres_active())
                return;

        tick_dev = this_cpu_ptr(&tick_cpu_device);
        if (!tick_dev || !tick_dev->evtdev)
                return;

        hrtimer_interrupt(tick_dev->evtdev);
}

#else /* CONFIG_HIGH_RES_TIMERS */

/* Without CONFIG_HIGH_RES_TIMERS there is no hrtimer_interrupt() to invoke */
static inline void __hrtimer_peek_ahead_timers(void) { }

#endif        /* !CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy
 *
 * Does nothing when high resolution mode is active on this CPU. Otherwise
 * it first checks whether the switch to high resolution mode can be made
 * and, if not, expires the due timers on the hard clock bases and raises
 * HRTIMER_SOFTIRQ when the soft expiry time has been reached.
 */
void hrtimer_run_queues(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        if (__hrtimer_hres_active(cpu_base))
                return;

        /*
         * This _is_ ugly: We have to check periodically, whether we
         * can switch to highres and / or nohz mode. The clocksource
         * switch happens with xtime_lock held. Notification from
         * there only sets the check bit in the tick_oneshot code,
         * otherwise we might deadlock vs. xtime_lock.
         */
        if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
                return;
        }

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);

        /* Soft timers are due: hand them over to the softirq */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

/*
 * Sleep related functions:
 */
/*
 * Timer callback of a hrtimer_sleeper: clear the sleeper's task pointer
 * and wake that task up. The cleared pointer is how the sleeping side
 * recognizes that the timer has expired. The timer is never restarted.
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
        struct hrtimer_sleeper *sleeper;
        struct task_struct *waiter;

        sleeper = container_of(timer, struct hrtimer_sleeper, timer);

        /* Take a private copy before the pointer is cleared */
        waiter = sleeper->task;
        sleeper->task = NULL;

        if (waiter != NULL)
                wake_up_process(waiter);

        return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:                sleeper to be started
 * @mode:        timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode)
{
        /*
         * Make the enqueue delivery mode check work on RT. If the sleeper
         * was initialized for hard interrupt delivery, force the mode bit.
         * This is a special case for hrtimer_sleepers because
         * hrtimer_init_sleeper() determines the delivery mode on RT so the
         * fiddling with this decision is avoided at the call sites.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
                mode |= HRTIMER_MODE_HARD;

        hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);

/*
 * Set up the timer of @sl for @clock_id and @mode, install
 * hrtimer_wakeup() as its callback and make the current task the one
 * to be woken up.
 */
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context either for latency reasons or because the
         * hrtimer callback takes regular spinlocks or invokes other
         * functions which are not suitable for hard interrupt context on
         * PREEMPT_RT.
         *
         * The hrtimer_sleeper callback is RT compatible in hard interrupt
         * context, but there is a latency concern: Untrusted userspace can
         * spawn many threads which arm timers for the same expiry time on
         * the same CPU. That causes a latency spike due to the wakeup of
         * a gazillion threads.
         *
         * OTOH, privileged real-time user space applications rely on the
         * low latency of hard interrupt wakeups. If the current task is in
         * a real-time scheduling class and soft expiry was not requested,
         * mark the mode for hard interrupt expiry.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
            task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
                mode |= HRTIMER_MODE_HARD;

        __hrtimer_init(&sl->timer, clock_id, mode);

        /* On expiry hrtimer_wakeup() wakes the task recorded here */
        sl->task = current;
        sl->timer.function = hrtimer_wakeup;
}

/**
 * hrtimer_init_sleeper - initialize sleeper to the given clock
 * @sl:                sleeper to be initialized
 * @clock_id:        the clock to be used
 * @mode:        timer mode abs/rel
 */
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
                          enum hrtimer_mode mode)
{
        struct hrtimer *timer = &sl->timer;

        /* debug_init() runs first, then the sleeper itself is set up */
        debug_init(timer, clock_id, mode);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);

/*
 * Write the remaining sleep time @ts to the user buffer recorded in
 * @restart, using the layout selected by restart->nanosleep.type.
 *
 * Returns -EFAULT if the copy fails and -ERESTART_RESTARTBLOCK otherwise.
 * Any type other than TT_NATIVE (and TT_COMPAT where available) is a bug.
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
        int failed;

        switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
        case TT_COMPAT:
                failed = put_old_timespec32(ts, restart->nanosleep.compat_rmtp);
                break;
#endif
        case TT_NATIVE:
                failed = put_timespec64(ts, restart->nanosleep.rmtp);
                break;
        default:
                BUG();
        }

        return failed ? -EFAULT : -ERESTART_RESTARTBLOCK;
}

/*
 * Sleep on the already initialized sleeper @t until its timer expires or a
 * signal is pending.
 *
 * @t:    sleeper whose expiry time has been set by the caller
 * @mode: timer mode for the first start of the timer; any further start
 *        uses HRTIMER_MODE_ABS
 *
 * Returns 0 when the timer expired, or when no time is left while the
 * remaining time was requested. Otherwise returns -ERESTART_RESTARTBLOCK,
 * or the result of nanosleep_copyout() when the remaining time has to be
 * written to user space.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
        struct restart_block *restart;

        do {
                /* Set the task state before the timer is armed */
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_sleeper_start_expires(t, mode);

                /* hrtimer_wakeup() clears t->task when the timer expired */
                if (likely(t->task))
                        freezable_schedule();

                hrtimer_cancel(&t->timer);
                /* The expiry stored in the timer is used from now on */
                mode = HRTIMER_MODE_ABS;

        } while (t->task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        /* Timer expired: the sleep is complete */
        if (!t->task)
                return 0;

        /* A signal is pending: report the remaining time if requested */
        restart = &current->restart_block;
        if (restart->nanosleep.type != TT_NONE) {
                ktime_t rem = hrtimer_expires_remaining(&t->timer);
                struct timespec64 rmt;

                /* Nothing left to sleep, treat it as completed */
                if (rem <= 0)
                        return 0;
                rmt = ktime_to_timespec64(rem);

                return nanosleep_copyout(restart, &rmt);
        }
        return -ERESTART_RESTARTBLOCK;
}

/*
 * Restart function installed by hrtimer_nanosleep(): continue an
 * interrupted sleep with the clock and the expiry time saved in @restart.
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
        struct hrtimer_sleeper sleeper;
        int rc;

        /* The saved expiry is used as an absolute time */
        hrtimer_init_sleeper_on_stack(&sleeper, restart->nanosleep.clockid,
                                      HRTIMER_MODE_ABS);
        hrtimer_set_expires_tv64(&sleeper.timer, restart->nanosleep.expires);

        rc = do_nanosleep(&sleeper, HRTIMER_MODE_ABS);

        destroy_hrtimer_on_stack(&sleeper.timer);
        return rc;
}

/*
 * Put the current task to sleep until @rqtp on clock @clockid.
 *
 * @rqtp:    requested expiry, interpreted according to @mode
 * @mode:    timer mode
 * @clockid: clock to sleep on
 *
 * Returns the result of do_nanosleep(). If that is
 * -ERESTART_RESTARTBLOCK, an absolute sleep returns -ERESTARTNOHAND
 * instead, while any other sleep gets its restart block set up so that
 * hrtimer_nanosleep_restart() can continue it.
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                       const clockid_t clockid)
{
        struct hrtimer_sleeper sleeper;
        struct restart_block *rb;
        u64 slack;
        int ret;

        /* Deadline and real-time tasks sleep without timer slack */
        if (dl_task(current) || rt_task(current))
                slack = 0;
        else
                slack = current->timer_slack_ns;

        hrtimer_init_sleeper_on_stack(&sleeper, clockid, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, rqtp, slack);

        ret = do_nanosleep(&sleeper, mode);
        if (ret == -ERESTART_RESTARTBLOCK) {
                if (mode == HRTIMER_MODE_ABS) {
                        /* Absolute timers do not update the rmtp value and restart: */
                        ret = -ERESTARTNOHAND;
                } else {
                        /* Save clock and expiry for the restart function */
                        rb = &current->restart_block;
                        rb->nanosleep.clockid = sleeper.timer.base->clockid;
                        rb->nanosleep.expires = hrtimer_get_expires_tv64(&sleeper.timer);
                        set_restart_fn(rb, hrtimer_nanosleep_restart);
                }
        }

        destroy_hrtimer_on_stack(&sleeper.timer);
        return ret;
}

#ifdef CONFIG_64BIT

/*
 * nanosleep() with native time structures: a relative sleep on
 * CLOCK_MONOTONIC for the time read from @rqtp. A non-NULL @rmtp is
 * recorded in the restart block as the place for the remaining time.
 */
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_timespec64(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        /* TT_NONE: the remaining time is not reported */
        rb->nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
        rb->nanosleep.rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * nanosleep() with 32-bit time structures: a relative sleep on
 * CLOCK_MONOTONIC for the time read from @rqtp. A non-NULL @rmtp is
 * recorded in the restart block as the place for the remaining time.
 */
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
                       struct old_timespec32 __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_old_timespec32(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        /* TT_NONE: the remaining time is not reported */
        rb->nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
        rb->nanosleep.compat_rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;

        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];

                clock_b->cpu_base = cpu_base;
                seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
                timerqueue_init_head(&clock_b->active);
        }

        cpu_base->cpu = cpu;
        hrtimer_cpu_base_init_expiry_lock(cpu_base);
        return 0;
}

/*
 * Reset the hrtimer base of the CPU this runs on and mark it online.
 * @cpu is not used: the base is looked up with this_cpu_ptr().
 * Always returns 0.
 */
int hrtimers_cpu_starting(unsigned int cpu)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /* Clear out any left over state from a CPU down operation */
        base->next_timer = NULL;
        base->softirq_next_timer = NULL;
        base->expires_next = KTIME_MAX;
        base->softirq_expires_next = KTIME_MAX;
        base->active_bases = 0;
        base->hres_active = 0;
        base->hang_detected = 0;

        base->online = 1;
        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every timer which is queued on @old_base over to @new_base.
 * A timer whose callback is running at this point is a bug.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
{
        for (;;) {
                struct timerqueue_node *first;
                struct hrtimer *timer;

                first = timerqueue_getnext(&old_base->active);
                if (!first)
                        break;

                timer = container_of(first, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_deactivate(timer);

                /*
                 * Keep the state at ENQUEUED instead of INACTIVE while the
                 * timer is in transit. Otherwise it could be seen as
                 * !active and just vanish away under us on another CPU.
                 */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;

                /*
                 * Enqueue the timer on the new cpu. The event device is
                 * not reprogrammed here, even if the timer expires before
                 * the earliest one on that CPU. hrtimer_interrupt is run
                 * after everything has been migrated to sort out already
                 * expired timers and to reprogram the event device.
                 */
                enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
        }
}

/*
 * CPU hotplug: move the timers which are queued on the CPU this runs on
 * over to the first CPU in cpu_active_mask and mark the local base
 * offline.
 *
 * @dying_cpu: CPU going down; used to cancel its tick sched timer, the
 *             hrtimer base itself is looked up with this_cpu_ptr()
 *
 * Always returns 0.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
        struct hrtimer_cpu_base *old_base, *new_base;
        int i, ncpu = cpumask_first(cpu_active_mask);

        tick_cancel_sched_timer(dying_cpu);

        old_base = this_cpu_ptr(&hrtimer_bases);
        new_base = &per_cpu(hrtimer_bases, ncpu);

        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
        raw_spin_lock(&old_base->lock);
        raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

        /* Clock bases are migrated pairwise, index by index */
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }

        /*
         * The migration might have changed the first expiring softirq
         * timer on this CPU. Update it.
         */
        __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
        /* Tell the other CPU to retrigger the next event */
        smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

        raw_spin_unlock(&new_base->lock);
        /* Marked offline while old_base->lock is still held */
        old_base->online = 0;
        raw_spin_unlock(&old_base->lock);

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time setup: prepare and start the hrtimer base of the CPU this
 * runs on and register the handler for HRTIMER_SOFTIRQ.
 */
void __init hrtimers_init(void)
{
        unsigned int cpu = smp_processor_id();

        hrtimers_prepare_cpu(cpu);
        hrtimers_cpu_starting(cpu);

        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (ktime_t) for SCHED_OTHER tasks
 * @mode:        timer mode
 * @clock_id:        timer clock to be used
 */
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
                               const enum hrtimer_mode mode, clockid_t clock_id)
{
        struct hrtimer_sleeper sleeper;

        /*
         * A NULL parameter means "infinite"
         */
        if (!expires) {
                schedule();
                return -EINTR;
        }

        /*
         * Optimize when a zero timeout value is given. It does not
         * matter whether this is an absolute or a relative time.
         */
        if (*expires == 0) {
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        /*
         * Override any slack passed by the user if under
         * rt constraints.
         */
        if (rt_task(current))
                delta = 0;

        hrtimer_init_sleeper_on_stack(&sleeper, clock_id, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&sleeper, mode);

        /* hrtimer_wakeup() clears the task pointer when the timer expired */
        if (likely(sleeper.task))
                schedule();

        hrtimer_cancel(&sleeper.timer);
        destroy_hrtimer_on_stack(&sleeper.timer);

        __set_current_state(TASK_RUNNING);

        /* Task pointer still set: woken up by something else than the timer */
        return sleeper.task ? -EINTR : 0;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (u64, nanoseconds) for SCHED_OTHER
 *                tasks
 * @mode:        timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly
 * for regular (non RT/DL) tasks.
 * The kernel gives the normal best effort behavior for "@expires+@delta",
 * but may decide to fire the timer earlier, but no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least the time given by @expires is guaranteed
 * to pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * This is schedule_hrtimeout_range_clock() on CLOCK_MONOTONIC.
 *
 * Returns 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                     const enum hrtimer_mode mode)
{
        return schedule_hrtimeout_range_clock(expires, delta, mode,
                                              CLOCK_MONOTONIC);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @mode:        timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least the time given by @expires is guaranteed
 * to pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * This is schedule_hrtimeout_range() with a slack of zero.
 *
 * Returns 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout(ktime_t *expires,
                               const enum hrtimer_mode mode)
{
        return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);




























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * pm_runtime.h - Device run-time power management helper functions.
 *
 * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>
 */

#ifndef _LINUX_PM_RUNTIME_H
#define _LINUX_PM_RUNTIME_H

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pm.h>

#include <linux/jiffies.h>

/* Runtime PM flag argument bits */
#define RPM_ASYNC                0x01        /* Request is asynchronous */
#define RPM_NOWAIT                0x02        /* Don't wait for concurrent
                                            state change */
#define RPM_GET_PUT                0x04        /* Increment/decrement the
                                            usage_count */
#define RPM_AUTO                0x08        /* Use autosuspend_delay */

#ifdef CONFIG_PM
extern struct workqueue_struct *pm_wq;

/**
 * queue_pm_work - Queue a work item on the pm_wq workqueue.
 * @work: Work item to queue.
 *
 * Return the value returned by queue_work().
 */
static inline bool queue_pm_work(struct work_struct *work)
{
        return queue_work(pm_wq, work);
}

extern int pm_generic_runtime_suspend(struct device *dev);
extern int pm_generic_runtime_resume(struct device *dev);
extern bool pm_runtime_need_not_resume(struct device *dev);
extern int pm_runtime_force_suspend(struct device *dev);
extern int pm_runtime_force_resume(struct device *dev);

extern int __pm_runtime_idle(struct device *dev, int rpmflags);
extern int __pm_runtime_suspend(struct device *dev, int rpmflags);
extern int __pm_runtime_resume(struct device *dev, int rpmflags);
extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count);
extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
extern int pm_runtime_barrier(struct device *dev);
extern void pm_runtime_enable(struct device *dev);
extern void __pm_runtime_disable(struct device *dev, bool check_resume);
extern void pm_runtime_allow(struct device *dev);
extern void pm_runtime_forbid(struct device *dev);
extern void pm_runtime_no_callbacks(struct device *dev);
extern void pm_runtime_irq_safe(struct device *dev);
extern void __pm_runtime_use_autosuspend(struct device *dev, bool use);
extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
extern u64 pm_runtime_autosuspend_expiration(struct device *dev);
extern void pm_runtime_update_max_time_suspended(struct device *dev,
                                                 s64 delta_ns);
extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
extern void pm_runtime_get_suppliers(struct device *dev);
extern void pm_runtime_put_suppliers(struct device *dev);
extern void pm_runtime_new_link(struct device *dev);
extern void pm_runtime_drop_link(struct device_link *link);
extern void pm_runtime_release_supplier(struct device_link *link);

int devm_pm_runtime_set_active_enabled(struct device *dev);
extern int devm_pm_runtime_enable(struct device *dev);
int devm_pm_runtime_get_noresume(struct device *dev);

/**
 * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter.
 * @dev: Target device.
 *
 * Increment the runtime PM usage counter of @dev if its runtime PM status is
 * %RPM_ACTIVE and its runtime PM usage counter is greater than 0.
 *
 * Return the value returned by pm_runtime_get_if_active() when it is called
 * with @ign_usage_count set to %false.
 */
static inline int pm_runtime_get_if_in_use(struct device *dev)
{
        return pm_runtime_get_if_active(dev, false);
}

/**
 * pm_suspend_ignore_children - Set runtime PM behavior regarding children.
 * @dev: Target device.
 * @enable: Whether or not to ignore possible dependencies on children.
 *
 * The dependencies of @dev on its children will not be taken into account by
 * the runtime PM framework going forward if @enable is %true, or they will
 * be taken into account otherwise.
 *
 * This only stores @enable in the power.ignore_children flag of @dev.
 */
static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
{
        dev->power.ignore_children = enable;
}

/**
 * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device.
 * @dev: Target device.
 *
 * Atomically increment the runtime PM usage counter of @dev. Nothing else is
 * done here; in particular this helper does not start a resume of @dev.
 */
static inline void pm_runtime_get_noresume(struct device *dev)
{
        atomic_inc(&dev->power.usage_count);
}

/**
 * pm_runtime_put_noidle - Drop runtime PM usage counter of a device.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev unless it is 0 already.
 * Nothing else is done here; in particular this helper does not start an
 * idle check or a suspend of @dev.
 */
static inline void pm_runtime_put_noidle(struct device *dev)
{
        /* Atomically add -1 to the counter unless its value is 0 */
        atomic_add_unless(&dev->power.usage_count, -1, 0);
}

/**
 * pm_runtime_suspended - Check whether or not a device is runtime-suspended.
 * @dev: Target device.
 *
 * Return %true if runtime PM is enabled for @dev and its runtime PM status is
 * %RPM_SUSPENDED, or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev and its runtime PM
 * status cannot change.
 */
static inline bool pm_runtime_suspended(struct device *dev)
{
        return dev->power.runtime_status == RPM_SUSPENDED
                && !dev->power.disable_depth;
}

/**
 * pm_runtime_active - Check whether or not a device is runtime-active.
 * @dev: Target device.
 *
 * Return %true if runtime PM is disabled for @dev or its runtime PM status is
 * %RPM_ACTIVE, or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev and its runtime PM
 * status cannot change.
 */
static inline bool pm_runtime_active(struct device *dev)
{
        return dev->power.runtime_status == RPM_ACTIVE
                || dev->power.disable_depth;
}

/**
 * pm_runtime_status_suspended - Check if runtime PM status is "suspended".
 * @dev: Target device.
 *
 * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false
 * otherwise, regardless of whether or not runtime PM has been enabled for @dev.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which the
 * runtime PM status of @dev cannot change.
 */
static inline bool pm_runtime_status_suspended(struct device *dev)
{
        return dev->power.runtime_status == RPM_SUSPENDED;
}

/**
 * pm_runtime_enabled - Check if runtime PM is enabled.
 * @dev: Target device.
 *
 * Return %true if runtime PM is enabled for @dev or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev.
 */
static inline bool pm_runtime_enabled(struct device *dev)
{
        /* Enabled means the disable depth has dropped back to zero. */
        return dev->power.disable_depth == 0;
}

/**
 * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present.
 * @dev: Target device.
 *
 * Return %true if @dev is a special device without runtime PM callbacks or
 * %false otherwise.
 */
static inline bool pm_runtime_has_no_callbacks(struct device *dev)
{
        /* Report the no_callbacks flag as a plain true/false value. */
        return !!dev->power.no_callbacks;
}

/**
 * pm_runtime_mark_last_busy - Update the last access time of a device.
 * @dev: Target device.
 *
 * Update the last access time of @dev used by the runtime PM autosuspend
 * mechanism to the current time as returned by ktime_get_mono_fast_ns().
 */
static inline void pm_runtime_mark_last_busy(struct device *dev)
{
        /* Store the ktime_get_mono_fast_ns() reading via WRITE_ONCE(). */
        WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns());
}

/**
 * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context.
 * @dev: Target device.
 *
 * Return %true if @dev has been marked as an "IRQ-safe" device (with respect
 * to runtime PM), in which case its runtime PM callbacks can be expected to
 * work correctly when invoked from interrupt handlers.
 */
static inline bool pm_runtime_is_irq_safe(struct device *dev)
{
        /* Report the irq_safe flag as a plain true/false value. */
        return dev->power.irq_safe != 0;
}

extern u64 pm_runtime_suspended_time(struct device *dev);

#else /* !CONFIG_PM */

/* Stub: always reports false. */
static inline bool queue_pm_work(struct work_struct *work)
{
        return false;
}

/* Stubs: the generic and forced suspend/resume helpers all report 0. */
static inline int pm_generic_runtime_suspend(struct device *dev)
{
        return 0;
}

static inline int pm_generic_runtime_resume(struct device *dev)
{
        return 0;
}

/* Stub: always reports true. */
static inline bool pm_runtime_need_not_resume(struct device *dev)
{
        return true;
}

static inline int pm_runtime_force_suspend(struct device *dev)
{
        return 0;
}

static inline int pm_runtime_force_resume(struct device *dev)
{
        return 0;
}

/* Stubs: idle, suspend and scheduled suspend all fail with -ENOSYS. */
static inline int __pm_runtime_idle(struct device *dev, int rpmflags)
{
        return -ENOSYS;
}

static inline int __pm_runtime_suspend(struct device *dev, int rpmflags)
{
        return -ENOSYS;
}

/* Stub: resume always returns 1. */
static inline int __pm_runtime_resume(struct device *dev, int rpmflags)
{
        return 1;
}

static inline int pm_schedule_suspend(struct device *dev, unsigned int delay)
{
        return -ENOSYS;
}

/* Stubs: the conditional "get" helpers fail with -EINVAL. */
static inline int pm_runtime_get_if_in_use(struct device *dev)
{
        return -EINVAL;
}

static inline int pm_runtime_get_if_active(struct device *dev,
                                           bool ign_usage_count)
{
        return -EINVAL;
}

/* Stubs: status updates and the barrier report 0; the rest do nothing. */
static inline int __pm_runtime_set_status(struct device *dev,
                                          unsigned int status)
{
        return 0;
}

static inline int pm_runtime_barrier(struct device *dev)
{
        return 0;
}

static inline void pm_runtime_enable(struct device *dev)
{
}

static inline void __pm_runtime_disable(struct device *dev, bool c)
{
}

static inline void pm_runtime_allow(struct device *dev)
{
}

static inline void pm_runtime_forbid(struct device *dev)
{
}

/* Stubs: the device-managed helpers all report 0. */
static inline int devm_pm_runtime_set_active_enabled(struct device *dev)
{
        return 0;
}

static inline int devm_pm_runtime_enable(struct device *dev)
{
        return 0;
}

static inline int devm_pm_runtime_get_noresume(struct device *dev)
{
        return 0;
}

/* Stubs: the setters and counter helpers do nothing. */
static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
{
}

static inline void pm_runtime_get_noresume(struct device *dev)
{
}

static inline void pm_runtime_put_noidle(struct device *dev)
{
}

/* Stubs: fixed answers - not suspended, active, runtime PM not enabled. */
static inline bool pm_runtime_suspended(struct device *dev)
{
        return false;
}

static inline bool pm_runtime_active(struct device *dev)
{
        return true;
}

static inline bool pm_runtime_status_suspended(struct device *dev)
{
        return false;
}

static inline bool pm_runtime_enabled(struct device *dev)
{
        return false;
}

static inline void pm_runtime_no_callbacks(struct device *dev)
{
}

static inline void pm_runtime_irq_safe(struct device *dev)
{
}

static inline bool pm_runtime_is_irq_safe(struct device *dev)
{
        return false;
}

/* Stub: always reports false. */
static inline bool pm_runtime_has_no_callbacks(struct device *dev)
{
        return false;
}

/* Stubs: last-busy tracking and autosuspend configuration do nothing. */
static inline void pm_runtime_mark_last_busy(struct device *dev)
{
}

static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use)
{
}

static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
                                                    int delay)
{
}
/* Stub: always reports 0. */
static inline u64 pm_runtime_autosuspend_expiration(struct device *dev)
{
        return 0;
}

/* Stubs: memalloc-noio, supplier and device-link helpers do nothing. */
static inline void pm_runtime_set_memalloc_noio(struct device *dev,
                                                bool enable)
{
}

static inline void pm_runtime_get_suppliers(struct device *dev)
{
}

static inline void pm_runtime_put_suppliers(struct device *dev)
{
}

static inline void pm_runtime_new_link(struct device *dev)
{
}

static inline void pm_runtime_drop_link(struct device_link *link)
{
}

static inline void pm_runtime_release_supplier(struct device_link *link)
{
}

#endif /* !CONFIG_PM */

/**
 * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it.
 * @dev: Target device.
 *
 * Invoke the "idle check" callback of @dev and, depending on its return value,
 * set up autosuspend of @dev or suspend it (depending on whether or not
 * autosuspend has been enabled for it).
 */
static inline int pm_runtime_idle(struct device *dev)
{
        /* No RPM_* flags are passed for the plain idle check. */
        const int rpmflags = 0;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_runtime_suspend - Suspend a device synchronously.
 * @dev: Target device.
 */
static inline int pm_runtime_suspend(struct device *dev)
{
        /* No RPM_* flags are passed for the plain suspend. */
        const int rpmflags = 0;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it.
 * @dev: Target device.
 *
 * Set up autosuspend of @dev or suspend it (depending on whether or not
 * autosuspend is enabled for it) without engaging its "idle check" callback.
 */
static inline int pm_runtime_autosuspend(struct device *dev)
{
        /* RPM_AUTO selects the autosuspend flavour of the suspend path. */
        const int rpmflags = RPM_AUTO;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_resume - Resume a device synchronously.
 * @dev: Target device.
 */
static inline int pm_runtime_resume(struct device *dev)
{
        /* No RPM_* flags are passed for the plain resume. */
        const int rpmflags = 0;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_request_idle - Queue up "idle check" execution for a device.
 * @dev: Target device.
 *
 * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev
 * asynchronously.
 */
static inline int pm_request_idle(struct device *dev)
{
        /* RPM_ASYNC turns the idle check into a queued request. */
        const int rpmflags = RPM_ASYNC;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_request_resume - Queue up runtime-resume of a device.
 * @dev: Target device.
 */
static inline int pm_request_resume(struct device *dev)
{
        /* RPM_ASYNC turns the resume into a queued request. */
        const int rpmflags = RPM_ASYNC;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_request_autosuspend - Queue up autosuspend of a device.
 * @dev: Target device.
 *
 * Queue up a work item to run an equivalent of pm_runtime_autosuspend() for @dev
 * asynchronously.
 */
static inline int pm_request_autosuspend(struct device *dev)
{
        /* Queued (RPM_ASYNC) autosuspend (RPM_AUTO). */
        const int rpmflags = RPM_ASYNC | RPM_AUTO;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_get - Bump up usage counter and queue up resume of a device.
 * @dev: Target device.
 *
 * Bump up the runtime PM usage counter of @dev and queue up a work item to
 * carry out runtime-resume of it.
 */
static inline int pm_runtime_get(struct device *dev)
{
        /* Adjust the usage counter (RPM_GET_PUT) and queue the resume (RPM_ASYNC). */
        const int rpmflags = RPM_GET_PUT | RPM_ASYNC;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_runtime_get_sync - Bump up usage counter of a device and resume it.
 * @dev: Target device.
 *
 * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of
 * it synchronously.
 *
 * The possible return values of this function are the same as for
 * pm_runtime_resume() and the runtime PM usage counter of @dev remains
 * incremented in all cases, even if it returns an error code.
 */
static inline int pm_runtime_get_sync(struct device *dev)
{
        /* RPM_GET_PUT only: the resume is not queued. */
        const int rpmflags = RPM_GET_PUT;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it.
 * @dev: Target device.
 *
 * Resume @dev synchronously and if that is successful, increment its runtime
 * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been
 * incremented or a negative error code otherwise.
 */
static inline int pm_runtime_resume_and_get(struct device *dev)
{
        int ret;

        ret = __pm_runtime_resume(dev, RPM_GET_PUT);
        if (ret < 0) {
                pm_runtime_put_noidle(dev);
                return ret;
        }

        return 0;
}

/**
 * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, queue up a work item for @dev like in pm_request_idle().
 */
static inline int pm_runtime_put(struct device *dev)
{
        /* Adjust the usage counter (RPM_GET_PUT) and queue the idle check (RPM_ASYNC). */
        const int rpmflags = RPM_GET_PUT | RPM_ASYNC;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
 */
static inline int pm_runtime_put_autosuspend(struct device *dev)
{
        /* Counter adjustment plus a queued autosuspend. */
        const int rpmflags = RPM_GET_PUT | RPM_ASYNC | RPM_AUTO;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, invoke the "idle check" callback of @dev and, depending on its
 * return value, set up autosuspend of @dev or suspend it (depending on whether
 * or not autosuspend has been enabled for it).
 *
 * The possible return values of this function are the same as for
 * pm_runtime_idle() and the runtime PM usage counter of @dev remains
 * decremented in all cases, even if it returns an error code.
 */
static inline int pm_runtime_put_sync(struct device *dev)
{
        /* RPM_GET_PUT only: the idle check is not queued. */
        const int rpmflags = RPM_GET_PUT;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, carry out runtime-suspend of @dev synchronously.
 *
 * The possible return values of this function are the same as for
 * pm_runtime_suspend() and the runtime PM usage counter of @dev remains
 * decremented in all cases, even if it returns an error code.
 */
static inline int pm_runtime_put_sync_suspend(struct device *dev)
{
        /* RPM_GET_PUT only: the suspend is not queued. */
        const int rpmflags = RPM_GET_PUT;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending
 * on whether or not autosuspend has been enabled for it).
 *
 * The possible return values of this function are the same as for
 * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains
 * decremented in all cases, even if it returns an error code.
 */
static inline int pm_runtime_put_sync_autosuspend(struct device *dev)
{
        /* Counter adjustment plus a non-queued autosuspend. */
        const int rpmflags = RPM_GET_PUT | RPM_AUTO;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_set_active - Set runtime PM status to "active".
 * @dev: Target device.
 *
 * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies
 * of it will be taken into account.
 *
 * It is not valid to call this function for devices with runtime PM enabled.
 */
static inline int pm_runtime_set_active(struct device *dev)
{
        /* Status value handed to the common status-setting helper. */
        const unsigned int status = RPM_ACTIVE;

        return __pm_runtime_set_status(dev, status);
}

/**
 * pm_runtime_set_suspended - Set runtime PM status to "suspended".
 * @dev: Target device.
 *
 * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that
 * dependencies of it will be taken into account.
 *
 * It is not valid to call this function for devices with runtime PM enabled.
 */
static inline int pm_runtime_set_suspended(struct device *dev)
{
        /* Status value handed to the common status-setting helper. */
        const unsigned int status = RPM_SUSPENDED;

        return __pm_runtime_set_status(dev, status);
}

/**
 * pm_runtime_disable - Disable runtime PM for a device.
 * @dev: Target device.
 *
 * Prevent the runtime PM framework from working with @dev (by incrementing its
 * "blocking" counter).
 *
 * For each invocation of this function for @dev there must be a matching
 * pm_runtime_enable() call in order for runtime PM to be enabled for it.
 */
static inline void pm_runtime_disable(struct device *dev)
{
        __pm_runtime_disable(dev, true);
}

/**
 * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device.
 * @dev: Target device.
 *
 * Allow the runtime PM autosuspend mechanism to be used for @dev whenever
 * requested (or "autosuspend" will be handled as direct runtime-suspend for
 * it).
 *
 * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend()
 * at driver exit time unless your driver initially enabled pm_runtime
 * with devm_pm_runtime_enable() (which handles it for you).
 */
static inline void pm_runtime_use_autosuspend(struct device *dev)
{
        __pm_runtime_use_autosuspend(dev, true);
}

/**
 * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used.
 * @dev: Target device.
 *
 * Prevent the runtime PM autosuspend mechanism from being used for @dev which
 * means that "autosuspend" will be handled as direct runtime-suspend for it
 * going forward.
 */
static inline void pm_runtime_dont_use_autosuspend(struct device *dev)
{
        __pm_runtime_use_autosuspend(dev, false);
}

#endif




















































































































































































































    2 







    2 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_IVERSION_H
#define _LINUX_IVERSION_H

#include <linux/fs.h>

/*
 * The inode->i_version field:
 * ---------------------------
 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
 * appear different to observers if there was a change to the inode's data or
 * metadata since it was last queried.
 *
 * Observers see the i_version as a 64-bit number that never decreases. If it
 * remains the same since it was last checked, then nothing has changed in the
 * inode. If it's different then something has changed. Observers cannot infer
 * anything about the nature or magnitude of the changes from the value, only
 * that the inode has changed in some fashion.
 *
 * Not all filesystems properly implement the i_version counter. Subsystems that
 * want to use i_version field on an inode should first check whether the
 * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro).
 *
 * Those that set SB_I_VERSION will automatically have their i_version counter
 * incremented on writes to normal files. If the SB_I_VERSION is not set, then
 * the VFS will not touch it on writes, and the filesystem can use it how it
 * wishes. Note that the filesystem is always responsible for updating the
 * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.).
 * We consider these sorts of filesystems to have a kernel-managed i_version.
 *
 * It may be impractical for filesystems to keep i_version updates atomic with
 * respect to the changes that cause them.  They should, however, guarantee
 * that i_version updates are never visible before the changes that caused
 * them.  Also, i_version updates should never be delayed longer than it takes
 * the original change to reach disk.
 *
 * This implementation uses the low bit in the i_version field as a flag to
 * track when the value has been queried. If it has not been queried since it
 * was last incremented, we can skip the increment in most cases.
 *
 * In the event that we're updating the ctime, we will usually go ahead and
 * bump the i_version anyway. Since that has to go to stable storage in some
 * fashion, we might as well increment it as well.
 *
 * With this implementation, the value should always appear to observers to
 * increase over time if the file has changed. It's recommended to use
 * inode_eq_iversion() helper to compare values.
 *
 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
 * a server-provided value (for the most part). For that reason, those
 * filesystems do not set SB_I_VERSION. These filesystems are considered to
 * have a self-managed i_version.
 *
 * Persistently storing the i_version
 * ----------------------------------
 * Queries of the i_version field are not gated on them hitting the backing
 * store. It's always possible that the host could crash after allowing
 * a query of the value but before it has made it to disk.
 *
 * To mitigate this problem, filesystems should always use
 * inode_set_iversion_queried when loading an existing inode from disk. This
 * ensures that the next attempted inode increment will result in the value
 * changing.
 *
 * Storing the value to disk therefore does not count as a query, so those
 * filesystems should use inode_peek_iversion to grab the value to be stored.
 * There is no need to flag the value as having been queried in that case.
 */

/*
 * We borrow the lowest bit in the i_version to use as a flag to tell whether
 * it has been queried since we last incremented it. If it has, then we must
 * increment it on the next change. After that, we can clear the flag and
 * avoid incrementing it again until it has again been queried.
 */
/* Number of low-order bits set aside for the QUERIED flag. */
#define I_VERSION_QUERIED_SHIFT        (1)
/* Smallest step that changes the counter without touching the flag bit. */
#define I_VERSION_INCREMENT        (1ULL << I_VERSION_QUERIED_SHIFT)
/* The flag itself: the bit directly below the counter's lowest bit. */
#define I_VERSION_QUERIED        (I_VERSION_INCREMENT >> 1)

/**
 * inode_set_iversion_raw - set i_version to the specified raw value
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Set @inode's i_version field to @val. This function is for use by
 * filesystems that self-manage the i_version.
 *
 * For example, the NFS client stores its NFSv4 change attribute in this way,
 * and the AFS client stores the data_version from the server here.
 */
static inline void
inode_set_iversion_raw(struct inode *inode, u64 val)
{
        /* @val is stored as-is: no shifting and no QUERIED flag handling. */
        atomic64_set(&inode->i_version, val);
}

/**
 * inode_peek_iversion_raw - grab a "raw" iversion value
 * @inode: inode from which i_version should be read
 *
 * Grab a "raw" inode->i_version value and return it. The i_version is not
 * flagged or converted in any way. This is mostly used to access a self-managed
 * i_version.
 *
 * With those filesystems, we want to treat the i_version as an entirely
 * opaque value.
 */
static inline u64
inode_peek_iversion_raw(const struct inode *inode)
{
        /* Plain atomic read; the value is handed back unmodified. */
        u64 raw = atomic64_read(&inode->i_version);

        return raw;
}

/**
 * inode_set_max_iversion_raw - update i_version if the new value is larger
 * @inode: inode to set
 * @val: new i_version to set
 *
 * Some self-managed filesystems (e.g Ceph) will only update the i_version
 * value if the new value is larger than the one we already have.
 */
static inline void
inode_set_max_iversion_raw(struct inode *inode, u64 val)
{
        u64 seen = inode_peek_iversion_raw(inode);

        /* Keep trying until the swap lands or the stored value exceeds @val. */
        while (seen <= val) {
                u64 prev = atomic64_cmpxchg(&inode->i_version, seen, val);

                if (likely(prev == seen))
                        return;

                /* Someone else changed it first; retry against their value. */
                seen = prev;
        }
}

/**
 * inode_set_iversion - set i_version to a particular value
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Set @inode's i_version field to @val. This function is for filesystems with
 * a kernel-managed i_version, for initializing a newly-created inode from
 * scratch.
 *
 * In this case, we do not set the QUERIED flag since we know that this value
 * has never been queried.
 */
static inline void
inode_set_iversion(struct inode *inode, u64 val)
{
        /* Move @val above the flag bit; the QUERIED flag itself stays clear. */
        u64 raw = val << I_VERSION_QUERIED_SHIFT;

        inode_set_iversion_raw(inode, raw);
}

/**
 * inode_set_iversion_queried - set i_version to a particular value as queried
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Set @inode's i_version field to @val, and flag it for increment on the next
 * change.
 *
 * Filesystems that persistently store the i_version on disk should use this
 * when loading an existing inode from disk.
 *
 * When loading in an i_version value from a backing store, we can't be certain
 * that it wasn't previously viewed before being stored. Thus, we must assume
 * that it was, to ensure that we don't end up handing out the same value for
 * different versions of the same inode.
 */
static inline void
inode_set_iversion_queried(struct inode *inode, u64 val)
{
        /* Move @val above the flag bit, then mark it as already queried. */
        u64 raw = val << I_VERSION_QUERIED_SHIFT;

        raw |= I_VERSION_QUERIED;
        inode_set_iversion_raw(inode, raw);
}

/**
 * inode_maybe_inc_iversion - increments i_version
 * @inode: inode with the i_version that should be updated
 * @force: increment the counter even if it's not necessary?
 *
 * Every time the inode is modified, the i_version field must be seen to have
 * changed by any observer.
 *
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 */
static inline bool
inode_maybe_inc_iversion(struct inode *inode, bool force)
{
        u64 seen, prev, next;

        /*
         * i_version has no strict ordering against the rest of the inode,
         * but the legacy inode_inc_iversion code serialized increments with
         * a spinlock. A full barrier here keeps whatever de-facto ordering
         * existed; it pairs with the one in inode_query_iversion().
         */
        smp_mb();
        seen = inode_peek_iversion_raw(inode);
        for (;;) {
                bool queried = seen & I_VERSION_QUERIED;

                /* Nobody looked and nobody insists: leave the value alone. */
                if (!force && !queried)
                        return false;

                /* Drop the flag bit, then step over it to the next count. */
                next = (seen & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;

                prev = atomic64_cmpxchg(&inode->i_version, seen, next);
                if (likely(prev == seen))
                        return true;

                /* Lost a race; retry against the value that won. */
                seen = prev;
        }
}


/**
 * inode_inc_iversion - forcibly increment i_version
 * @inode: inode that needs to be updated
 *
 * Forcibly increment the i_version field. This always results in a change to
 * the observable value.
 */
static inline void
inode_inc_iversion(struct inode *inode)
{
        /* Forced mode; the helper's return value is deliberately dropped. */
        (void)inode_maybe_inc_iversion(inode, true);
}

/**
 * inode_iversion_need_inc - is the i_version in need of being incremented?
 * @inode: inode to check
 *
 * Returns whether the inode->i_version counter needs incrementing on the next
 * change. Just fetch the value and check the QUERIED flag.
 */
static inline bool
inode_iversion_need_inc(struct inode *inode)
{
        /* True exactly when the QUERIED bit is set in the raw value. */
        return (inode_peek_iversion_raw(inode) & I_VERSION_QUERIED) != 0;
}

/**
 * inode_inc_iversion_raw - forcibly increment raw i_version
 * @inode: inode that needs to be updated
 *
 * Forcibly increment the raw i_version field. This always results in a change
 * to the raw value.
 *
 * NFS will use the i_version field to store the value from the server. It
 * mostly treats it as opaque, but in the case where it holds a write
 * delegation, it must increment the value itself. This function does that.
 */
static inline void
inode_inc_iversion_raw(struct inode *inode)
{
        /* Bumps the raw value directly; the low bit gets no flag treatment. */
        atomic64_inc(&inode->i_version);
}

/**
 * inode_peek_iversion - read i_version without flagging it to be incremented
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter for an inode without registering it as a
 * query.
 *
 * This is typically used by local filesystems that need to store an i_version
 * on disk. In that situation, it's not necessary to flag it as having been
 * viewed, as the result won't be used to gauge changes from that point.
 */
static inline u64
inode_peek_iversion(const struct inode *inode)
{
        u64 raw = inode_peek_iversion_raw(inode);

        /* Shift the flag bit out so only the counter part is returned. */
        return raw >> I_VERSION_QUERIED_SHIFT;
}

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter. This should be used by callers that wish
 * to store the returned i_version for later comparison. This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
 *
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 */
static inline u64
inode_query_iversion(struct inode *inode)
{
        u64 seen, prev;

        seen = inode_peek_iversion_raw(inode);

        /* Try to set the QUERIED flag until it is observed as set. */
        while (!(seen & I_VERSION_QUERIED)) {
                prev = atomic64_cmpxchg(&inode->i_version, seen,
                                        seen | I_VERSION_QUERIED);
                /*
                 * A successful cmpxchg carries an implicit barrier, so no
                 * explicit one is needed on this path.
                 */
                if (likely(prev == seen))
                        return seen >> I_VERSION_QUERIED_SHIFT;

                /* Lost a race; re-examine the value that won. */
                seen = prev;
        }

        /*
         * The flag was already set, so no swap happened. This barrier (like
         * the implicit one in the cmpxchg above) pairs with the barrier in
         * inode_maybe_inc_iversion().
         */
        smp_mb();
        return seen >> I_VERSION_QUERIED_SHIFT;
}

/*
 * For filesystems without any sort of change attribute, the best we can
 * do is fake one up from the ctime:
 */
static inline u64 time_to_chattr(struct timespec64 *t)
{
        u64 chattr = t->tv_sec;

        chattr <<= 32;
        chattr += t->tv_nsec;
        return chattr;
}

/**
 * inode_eq_iversion_raw - check whether the raw i_version counter has changed
 * @inode: inode to check
 * @old: old value to check against its i_version
 *
 * Compare the current raw i_version counter with a previous one. Returns true
 * if they are the same or false if they are different.
 */
static inline bool
inode_eq_iversion_raw(const struct inode *inode, u64 old)
{
        /* Compare the untouched raw value, flag bit included. */
        u64 raw = inode_peek_iversion_raw(inode);

        return raw == old;
}

/**
 * inode_eq_iversion - check whether the i_version counter has changed
 * @inode: inode to check
 * @old: old value to check against its i_version
 *
 * Compare an i_version counter with a previous one. Returns true if they are
 * the same, and false if they are different.
 *
 * Note that we don't need to set the QUERIED flag in this case, as the value
 * in the inode is not being recorded for later use.
 */
static inline bool
inode_eq_iversion(const struct inode *inode, u64 old)
{
        /* Peek only: the comparison must not set the QUERIED flag. */
        u64 now = inode_peek_iversion(inode);

        return now == old;
}
#endif


























    4 


    4 

    4 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementations of the security context functions.
 *
 * Author: Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2020 Red Hat, Inc.
 */

#include <linux/jhash.h>

#include "context.h"
#include "mls.h"

u32 context_compute_hash(const struct context *c)
{
        u32 hash;

        /*
         * Under a given policy an invalid context is always represented by
         * a context struct with only len and str set (and vice versa).
         * Context structs from different policies should never meet, so
         * valid and invalid contexts can safely be hashed differently;
         * context_cmp() already relies on the same assumption.
         */
        if (c->len)
                return full_name_hash(NULL, c->str, c->len);

        /* Valid context: mix user/role/type, then fold in the MLS range. */
        hash = jhash_3words(c->user, c->role, c->type, 0);
        return mls_range_hash(&c->range, hash);
}















































































































































































































































































































































































































































































































































































































    1 





    1 
    1 

























    1 







    1 













































































    1 









































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK      Netlink attributes
 *
 *                 Authors:        Thomas Graf <tgraf@suug.ch>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/nospec.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>

/* Exact payload length expected for fixed-size attribute types, indexed
 * by NLA_* type. A zero entry means no exact length is enforced.
 *
 * For these data types, the attribute length should be exactly the given
 * size. However, to maintain compatibility with broken commands, a
 * mismatching length only triggers a (rate-limited) warning telling the
 * user that the command is sending invalid data and needs to be fixed;
 * validate_nla() rejects the attribute only when NL_VALIDATE_STRICT_ATTRS
 * is set.
 */
static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/* Minimum payload length per NLA_* type, indexed by attribute type.
 * validate_nla() falls back to this table when a policy entry does not
 * specify a length of its own, and nla_policy_len() uses it for size
 * estimation. A zero entry means no minimum is imposed.
 */
static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_MSECS]        = sizeof(u64),
        [NLA_NESTED]        = NLA_HDRLEN,
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/*
 * Nested policies might refer back to the original
 * policy in some cases, and userspace could try to
 * abuse that and recurse by nesting in the right
 * ways. Limit recursion to avoid this problem.
 *
 * __nla_validate_parse() fails with -EINVAL once its depth argument
 * reaches this value.
 */
#define MAX_POLICY_RECURSION_DEPTH        10

/*
 * Forward declaration: validate_nla() and nla_validate_array() call
 * __nla_validate_parse() for nested payloads, while __nla_validate_parse()
 * itself calls validate_nla() for every attribute.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth);

/*
 * validate_nla_bitfield32 - check a bitfield32 attribute against a mask
 * @nla: attribute whose payload is a struct nla_bitfield32
 * @valid_flags_mask: bits that may appear in the selector and the value
 *
 * Returns 0 if the payload is acceptable, -EINVAL otherwise (including
 * when @valid_flags_mask is zero).
 */
static int validate_nla_bitfield32(const struct nlattr *nla,
                                   const u32 valid_flags_mask)
{
        const struct nla_bitfield32 *field = nla_data(nla);

        /* a policy without any valid bit cannot accept anything */
        if (valid_flags_mask == 0)
                return -EINVAL;

        /* neither the selector nor the value may use bits outside the mask */
        if ((field->selector & ~valid_flags_mask) != 0 ||
            (field->value & ~valid_flags_mask) != 0)
                return -EINVAL;

        /* every bit set in the value must also be selected */
        if ((field->value & ~field->selector) != 0)
                return -EINVAL;

        return 0;
}

/*
 * nla_validate_array - validate the elements of a nested attribute array
 * @head: first element of the array
 * @len: length of the array in bytes
 * @maxtype: maximum attribute type expected inside each element
 * @policy: policy applied to the contents of each element
 * @extack: extended ACK report struct
 * @validate: validation strictness
 * @depth: current nesting depth; elements are validated at @depth + 1
 *
 * Elements with an empty payload are skipped. Returns 0 on success,
 * -ERANGE for an element shorter than an attribute header, or the error
 * returned by __nla_validate_parse() for the first failing element.
 */
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack,
                              unsigned int validate, unsigned int depth)
{
        const struct nlattr *elem;
        int remaining;

        nla_for_each_attr(elem, head, len, remaining) {
                int elem_len = nla_len(elem);
                int err;

                if (!elem_len)
                        continue;

                if (elem_len < NLA_HDRLEN) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, elem, policy,
                                                "Array element too short");
                        return -ERANGE;
                }

                err = __nla_validate_parse(nla_data(elem), elem_len,
                                           maxtype, policy, validate, extack,
                                           NULL, depth + 1);
                if (err < 0)
                        return err;
        }

        return 0;
}

void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range)
{
        WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
                     (pt->min < 0 || pt->max < 0));

        range->min = 0;

        switch (pt->type) {
        case NLA_U8:
                range->max = U8_MAX;
                break;
        case NLA_U16:
        case NLA_BE16:
        case NLA_BINARY:
                range->max = U16_MAX;
                break;
        case NLA_U32:
        case NLA_BE32:
                range->max = U32_MAX;
                break;
        case NLA_U64:
        case NLA_MSECS:
                range->max = U64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_range_unsigned(const struct nla_policy *pt,
                                       const struct nlattr *nla,
                                       struct netlink_ext_ack *extack,
                                       unsigned int validate)
{
        struct netlink_range_validation range;
        u64 value;

        switch (pt->type) {
        case NLA_U8:
                value = nla_get_u8(nla);
                break;
        case NLA_U16:
                value = nla_get_u16(nla);
                break;
        case NLA_U32:
                value = nla_get_u32(nla);
                break;
        case NLA_U64:
                value = nla_get_u64(nla);
                break;
        case NLA_MSECS:
                value = nla_get_u64(nla);
                break;
        case NLA_BINARY:
                value = nla_len(nla);
                break;
        case NLA_BE16:
                value = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                value = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_unsigned(pt, &range);

        if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
            pt->type == NLA_BINARY && value > range.max) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, pt->type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }

                /* this assumes min <= max (don't validate against min) */
                return 0;
        }

        if (value < range.min || value > range.max) {
                bool binary = pt->type == NLA_BINARY;

                if (binary)
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "binary attribute size out of range");
                else
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "integer out of range");

                return -ERANGE;
        }

        return 0;
}

void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range)
{
        switch (pt->type) {
        case NLA_S8:
                range->min = S8_MIN;
                range->max = S8_MAX;
                break;
        case NLA_S16:
                range->min = S16_MIN;
                range->max = S16_MAX;
                break;
        case NLA_S32:
                range->min = S32_MIN;
                range->max = S32_MAX;
                break;
        case NLA_S64:
                range->min = S64_MIN;
                range->max = S64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range_signed;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_int_range_signed(const struct nla_policy *pt,
                                         const struct nlattr *nla,
                                         struct netlink_ext_ack *extack)
{
        struct netlink_range_validation_signed range;
        s64 value;

        switch (pt->type) {
        case NLA_S8:
                value = nla_get_s8(nla);
                break;
        case NLA_S16:
                value = nla_get_s16(nla);
                break;
        case NLA_S32:
                value = nla_get_s32(nla);
                break;
        case NLA_S64:
                value = nla_get_s64(nla);
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_signed(pt, &range);

        if (value < range.min || value > range.max) {
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");
                return -ERANGE;
        }

        return 0;
}

/*
 * nla_validate_int_range - dispatch range validation by attribute type
 * @pt: policy entry describing the attribute
 * @nla: attribute to check
 * @extack: extended ACK report struct
 * @validate: validation strictness (only used by the unsigned checker)
 *
 * Returns the result of the signed or unsigned range check; any other
 * policy type triggers a warning and yields -EINVAL.
 */
static int nla_validate_int_range(const struct nla_policy *pt,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack,
                                  unsigned int validate)
{
        int ret;

        switch (pt->type) {
        /* signed integers */
        case NLA_S8:
        case NLA_S16:
        case NLA_S32:
        case NLA_S64:
                ret = nla_validate_int_range_signed(pt, nla, extack);
                break;

        /* unsigned integers, big endian integers and binary lengths */
        case NLA_U8:
        case NLA_U16:
        case NLA_U32:
        case NLA_U64:
        case NLA_MSECS:
        case NLA_BINARY:
        case NLA_BE16:
        case NLA_BE32:
                ret = nla_validate_range_unsigned(pt, nla, extack, validate);
                break;

        default:
                WARN_ON(1);
                ret = -EINVAL;
                break;
        }

        return ret;
}

static int nla_validate_mask(const struct nla_policy *pt,
                             const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        u64 value;

        switch (pt->type) {
        case NLA_U8:
                value = nla_get_u8(nla);
                break;
        case NLA_U16:
                value = nla_get_u16(nla);
                break;
        case NLA_U32:
                value = nla_get_u32(nla);
                break;
        case NLA_U64:
                value = nla_get_u64(nla);
                break;
        case NLA_BE16:
                value = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                value = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        if (value & ~(u64)pt->mask) {
                NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
                return -EINVAL;
        }

        return 0;
}

/*
 * validate_nla - validate a single attribute against its policy entry
 * @nla: attribute to validate
 * @maxtype: maximum attribute type covered by @policy
 * @policy: policy array, indexed by attribute type
 * @validate: validation strictness
 * @extack: extended ACK report struct
 * @depth: current nesting depth, passed on when validating nested payloads
 *
 * Attributes whose type is <= 0 or larger than @maxtype are accepted
 * without further checks. Otherwise the length rules of the policy type
 * are applied first, followed by the additional validation selected by
 * the policy's validation_type (range, mask or callback).
 *
 * Returns 0 if the attribute is acceptable or a negative error code.
 */
static int validate_nla(const struct nlattr *nla, int maxtype,
                        const struct nla_policy *policy, unsigned int validate,
                        struct netlink_ext_ack *extack, unsigned int depth)
{
        u16 strict_start_type = policy[0].strict_start_type;
        const struct nla_policy *pt;
        int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
        int err = -ERANGE;

        /* attributes at or above strict_start_type get strict validation */
        if (strict_start_type && type >= strict_start_type)
                validate |= NL_VALIDATE_STRICT;

        if (type <= 0 || type > maxtype)
                return 0;

        type = array_index_nospec(type, maxtype + 1);
        pt = &policy[type];

        BUG_ON(pt->type > NLA_TYPE_MAX);

        /* fixed-size types: a wrong length warns, and fails only if strict */
        if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
        }

        /* NLA_F_NESTED must be set exactly on nested attribute types */
        if (validate & NL_VALIDATE_NESTED) {
                if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) &&
                    !(nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED is missing");
                        return -EINVAL;
                }
                if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY &&
                    pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED not expected");
                        return -EINVAL;
                }
        }

        switch (pt->type) {
        case NLA_REJECT:
                /* prefer the policy's own message when it can be reported */
                if (extack && pt->reject_message) {
                        NL_SET_BAD_ATTR(extack, nla);
                        extack->_msg = pt->reject_message;
                        return -EINVAL;
                }
                err = -EINVAL;
                goto out_err;

        case NLA_FLAG:
                /* a flag carries no payload */
                if (attrlen > 0)
                        goto out_err;
                break;

        case NLA_BITFIELD32:
                if (attrlen != sizeof(struct nla_bitfield32))
                        goto out_err;

                err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
                if (err)
                        goto out_err;
                break;

        case NLA_NUL_STRING:
                /* a NUL must occur within the first pt->len + 1 bytes */
                if (pt->len)
                        minlen = min_t(int, attrlen, pt->len + 1);
                else
                        minlen = attrlen;

                if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
                        err = -EINVAL;
                        goto out_err;
                }
                /* fall through */

        case NLA_STRING:
                if (attrlen < 1)
                        goto out_err;

                if (pt->len) {
                        char *buf = nla_data(nla);

                        /* one trailing NUL does not count against pt->len */
                        if (buf[attrlen - 1] == '\0')
                                attrlen--;

                        if (attrlen > pt->len)
                                goto out_err;
                }
                break;

        case NLA_BINARY:
                if (pt->len && attrlen > pt->len)
                        goto out_err;
                break;

        case NLA_NESTED:
                /* a nested attribute is allowed to be empty; if it's not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        err = __nla_validate_parse(nla_data(nla), nla_len(nla),
                                                   pt->len, pt->nested_policy,
                                                   validate, extack, NULL,
                                                   depth + 1);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;
        case NLA_NESTED_ARRAY:
                /* a nested array attribute is allowed to be empty; if it's
                 * not, it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        /*
                         * Reuse the function-level err, as NLA_NESTED does;
                         * it is not read again before being reassigned.
                         */
                        err = nla_validate_array(nla_data(nla), nla_len(nla),
                                                 pt->len, pt->nested_policy,
                                                 extack, validate, depth);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;

        case NLA_UNSPEC:
                if (validate & NL_VALIDATE_UNSPEC) {
                        NL_SET_ERR_MSG_ATTR(extack, nla,
                                            "Unsupported attribute");
                        return -EINVAL;
                }
                if (attrlen < pt->len)
                        goto out_err;
                break;

        default:
                /* the policy's own length wins over the per-type minimum */
                if (pt->len)
                        minlen = pt->len;
                else
                        minlen = nla_attr_minlen[pt->type];

                if (attrlen < minlen)
                        goto out_err;
        }

        /* further validation */
        switch (pt->validation_type) {
        case NLA_VALIDATE_NONE:
                /* nothing to do */
                break;
        case NLA_VALIDATE_RANGE_PTR:
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
        case NLA_VALIDATE_MIN:
        case NLA_VALIDATE_MAX:
                err = nla_validate_int_range(pt, nla, extack, validate);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_MASK:
                err = nla_validate_mask(pt, nla, extack);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_FUNCTION:
                if (pt->validate) {
                        err = pt->validate(nla, extack);
                        if (err)
                                return err;
                }
                break;
        }

        return 0;
out_err:
        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                "Attribute failed policy validation");
        return err;
}

/*
 * __nla_validate_parse - validate and optionally index an attribute stream
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy, or NULL to skip per-attribute validation
 * @validate: validation strictness
 * @extack: extended ACK report struct
 * @tb: optional array of @maxtype + 1 entries receiving attribute pointers
 * @depth: current policy nesting depth
 *
 * When @tb is given it is cleared first and then filled so that each
 * entry points at the last attribute seen with that type. Attributes of
 * type 0 or above @maxtype are skipped unless NL_VALIDATE_MAXTYPE is set.
 *
 * Returns 0 on success or a negative error code.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth)
{
        const struct nlattr *attr;
        int leftover;

        if (depth >= MAX_POLICY_RECURSION_DEPTH) {
                NL_SET_ERR_MSG(extack,
                               "allowed policy recursion depth exceeded");
                return -EINVAL;
        }

        if (tb != NULL)
                memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));

        nla_for_each_attr(attr, head, len, leftover) {
                u16 type = nla_type(attr);

                if (!type || type > maxtype) {
                        /* unknown types are ignored unless told otherwise */
                        if (!(validate & NL_VALIDATE_MAXTYPE))
                                continue;

                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Unknown attribute type");
                        return -EINVAL;
                }

                type = array_index_nospec(type, maxtype + 1);

                if (policy != NULL) {
                        int ret = validate_nla(attr, maxtype, policy,
                                               validate, extack, depth);

                        if (ret < 0)
                                return ret;
                }

                if (tb != NULL)
                        tb[type] = (struct nlattr *)attr;
        }

        if (unlikely(leftover > 0)) {
                pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
                                    leftover, current->comm);
                NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");

                /* trailing garbage is fatal only when explicitly requested */
                return (validate & NL_VALIDATE_TRAILING) ? -EINVAL : 0;
        }

        return 0;
}

/**
 * __nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the specified attribute stream against the
 * specified policy. Validation depends on the validate flags passed, see
 * &enum netlink_validation for more details on that.
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack)
{
        /* validation only: no attribute table, start at nesting depth 0 */
        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, NULL, 0);
}
EXPORT_SYMBOL(__nla_validate);

/**
 * nla_policy_len - Determin the max. length of a policy
 * @policy: policy to use
 * @n: number of policies
 *
 * Determines the max. length of the policy.  It is currently used
 * to allocated Netlink buffers roughly the size of the actual
 * message.
 *
 * Returns 0 on success or a negative error code.
 */
int
nla_policy_len(const struct nla_policy *p, int n)
{
        int i, len = 0;

        for (i = 0; i < n; i++, p++) {
                if (p->len)
                        len += nla_total_size(p->len);
                else if (nla_attr_len[p->type])
                        len += nla_total_size(nla_attr_len[p->type]);
                else if (nla_attr_minlen[p->type])
                        len += nla_total_size(nla_attr_minlen[p->type]);
        }

        return len;
}
EXPORT_SYMBOL(nla_policy_len);

/**
 * __nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK pointer
 *
 * Parses a stream of attributes and stores a pointer to each attribute in
 * the tb array accessible via the attribute type.
 * Validation is controlled by the @validate parameter.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_parse(struct nlattr **tb, int maxtype,
                const struct nlattr *head, int len,
                const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack)
{
        /* same walk as __nla_validate(), but also filling in @tb */
        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, tb, 0);
}
EXPORT_SYMBOL(__nla_parse);

/**
 * nla_find - Find a specific attribute in a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute in the stream matching the specified type.
 */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
{
        const struct nlattr *nla;
        int rem;

        nla_for_each_attr(nla, head, len, rem)
                if (nla_type(nla) == attrtype)
                        return (struct nlattr *)nla;

        return NULL;
}
EXPORT_SYMBOL(nla_find);

/**
 * nla_strlcpy - Copy string attribute payload into a sized buffer
 * @dst: where to copy the string to
 * @nla: attribute to copy the string from
 * @dstsize: size of destination buffer
 *
 * Copies at most dstsize - 1 bytes into the destination buffer.
 * If @dstsize is non-zero the result is always a valid NUL-terminated
 * string and, unlike strlcpy, the whole destination buffer is zeroed
 * before the copy, so it ends up padded out. With a @dstsize of zero
 * nothing is written.
 *
 * Returns the length of the attribute payload, not counting one trailing
 * NUL byte if the payload ends in one.
 */
size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize)
{
        const char *payload = nla_data(nla);
        size_t payload_len = nla_len(nla);
        size_t ncopy;

        /* a single terminating NUL is not part of the string */
        if (payload_len != 0 && payload[payload_len - 1] == '\0')
                payload_len--;

        if (dstsize == 0)
                return payload_len;

        /* always leave room for the terminator */
        ncopy = (payload_len < dstsize) ? payload_len : dstsize - 1;

        memset(dst, 0, dstsize);
        memcpy(dst, payload, ncopy);

        return payload_len;
}
EXPORT_SYMBOL(nla_strlcpy);

/**
 * nla_strdup - Copy string attribute payload into a newly allocated buffer
 * @nla: attribute to copy the string from
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * One trailing NUL byte of the payload, if present, is not copied; the
 * new buffer is always NUL-terminated.
 *
 * Returns a pointer to the allocated buffer or NULL on error.
 */
char *nla_strdup(const struct nlattr *nla, gfp_t flags)
{
        const char *payload = nla_data(nla);
        size_t payload_len = nla_len(nla);
        char *copy;

        if (payload_len != 0 && payload[payload_len - 1] == '\0')
                payload_len--;

        copy = kmalloc(payload_len + 1, flags);
        if (!copy)
                return NULL;

        memcpy(copy, payload, payload_len);
        copy[payload_len] = '\0';

        return copy;
}
EXPORT_SYMBOL(nla_strdup);

/**
 * nla_memcpy - Copy a netlink attribute into another memory area
 * @dest: where to copy to
 * @src: netlink attribute to copy from
 * @count: size of the destination area
 *
 * Note: The number of bytes copied is limited by the length of the
 *       attribute's payload. If the payload is shorter than @count,
 *       the rest of the destination area is zeroed.
 *
 * Returns the number of bytes copied.
 */
int nla_memcpy(void *dest, const struct nlattr *src, int count)
{
        int ncopy = min_t(int, count, nla_len(src));
        unsigned char *out = dest;

        memcpy(out, nla_data(src), ncopy);

        /* pad whatever the payload did not cover */
        if (ncopy < count)
                memset(out + ncopy, 0, count - ncopy);

        return ncopy;
}
EXPORT_SYMBOL(nla_memcpy);

/**
 * nla_memcmp - Compare an attribute with sized memory area
 * @nla: netlink attribute
 * @data: memory area
 * @size: size of memory area
 *
 * Returns the difference between the attribute's payload length and
 * @size if the two differ, otherwise the result of memcmp() over the
 * payload and @data.
 */
int nla_memcmp(const struct nlattr *nla, const void *data,
                             size_t size)
{
        int diff = nla_len(nla) - size;

        /* contents only matter when the lengths agree */
        if (diff != 0)
                return diff;

        return memcmp(nla_data(nla), data, size);
}
EXPORT_SYMBOL(nla_memcmp);

/**
 * nla_strcmp - Compare a string attribute against a string
 * @nla: netlink string attribute
 * @str: another string
 *
 * Trailing NUL bytes of the attribute payload are ignored. Returns the
 * difference in length if the lengths differ, otherwise the result of
 * memcmp() over the payload and @str.
 */
int nla_strcmp(const struct nlattr *nla, const char *str)
{
        const char *payload = nla_data(nla);
        int str_len = strlen(str);
        int payload_len;

        /* strip every trailing NUL from the payload */
        for (payload_len = nla_len(nla); payload_len > 0; payload_len--) {
                if (payload[payload_len - 1] != '\0')
                        break;
        }

        if (payload_len != str_len)
                return payload_len - str_len;

        return memcmp(payload, str, str_len);
}
EXPORT_SYMBOL(nla_strcmp);

#ifdef CONFIG_NET
/**
 * __nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Appends an attribute header to the socket buffer, fills in its type and
 * length, and reserves room for the payload without writing it. The
 * padding bytes that follow the payload are zeroed.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        struct nlattr *attr = skb_put(skb, nla_total_size(attrlen));
        unsigned char *pad;

        attr->nla_type = attrtype;
        attr->nla_len = nla_attr_size(attrlen);

        /* The padding starts right after header plus payload. */
        pad = (unsigned char *) attr + attr->nla_len;
        memset(pad, 0, nla_padlen(attrlen));

        return attr;
}
EXPORT_SYMBOL(__nla_reserve);

/**
 * __nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Adds a netlink attribute header to a socket buffer and reserves
 * room for the payload but does not copy it. It also ensures that this
 * attribute will have a 64-bit aligned nla_data() area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr)
{
        struct nlattr *attr;

        /* Align first, then place the attribute itself. */
        nla_align_64bit(skb, padattr);
        attr = __nla_reserve(skb, attrtype, attrlen);

        return attr;
}
EXPORT_SYMBOL(__nla_reserve_64bit);

/**
 * __nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Reserves NLA_ALIGN(@attrlen) zeroed bytes on the socket buffer for an
 * attribute payload; no attribute header is written.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the payload.
 */
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        void *start = skb_put_zero(skb, NLA_ALIGN(attrlen));

        return start;
}
EXPORT_SYMBOL(__nla_reserve_nohdr);

/**
 * nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve(): the tailroom of the skb is compared
 * against the total attribute size before anything is reserved.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        return unlikely(skb_tailroom(skb) < nla_total_size(attrlen)) ?
                NULL : __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(nla_reserve);

/**
 * nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Adds a netlink attribute header to a socket buffer and reserves
 * room for the payload but does not copy it. It also ensures that this
 * attribute will have a 64-bit aligned nla_data() area.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                                 int padattr)
{
        /* The 64-bit total is only used when padding is actually needed. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);

        return unlikely(skb_tailroom(skb) < needed) ?
                NULL : __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
}
EXPORT_SYMBOL(nla_reserve_64bit);

/**
 * nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve_nohdr(): the tailroom of the skb is
 * compared against NLA_ALIGN(@attrlen) before anything is reserved.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute payload.
 */
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        return unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)) ?
                NULL : __nla_reserve_nohdr(skb, attrlen);
}
EXPORT_SYMBOL(nla_reserve_nohdr);

/**
 * __nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the attribute with __nla_reserve() and copies @attrlen bytes
 * from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
                             const void *data)
{
        struct nlattr *attr = __nla_reserve(skb, attrtype, attrlen);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put);

/**
 * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Reserves the attribute with __nla_reserve_64bit() and copies @attrlen
 * bytes from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr)
{
        struct nlattr *attr = __nla_reserve_64bit(skb, attrtype, attrlen,
                                                  padattr);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_64bit);

/**
 * __nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves room with __nla_reserve_nohdr() and copies @attrlen bytes from
 * @data into it.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute payload.
 */
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        memcpy(__nla_reserve_nohdr(skb, attrlen), data, attrlen);
}
EXPORT_SYMBOL(__nla_put_nohdr);

/**
 * nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put(): the tailroom of the skb is compared
 * against the total attribute size before anything is written.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
        int err;

        if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) {
                err = -EMSGSIZE;
        } else {
                __nla_put(skb, attrtype, attrlen, data);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(nla_put);

/**
 * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Checked variant of __nla_put_64bit(): the tailroom of the skb is
 * compared against the required size before anything is written.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr)
{
        /* The 64-bit total is only used when padding is actually needed. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);
        int err;

        if (unlikely(skb_tailroom(skb) < needed)) {
                err = -EMSGSIZE;
        } else {
                __nla_put_64bit(skb, attrtype, attrlen, data, padattr);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(nla_put_64bit);

/**
 * nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put_nohdr(): the tailroom of the skb is
 * compared against NLA_ALIGN(@attrlen) before anything is written.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        int err;

        if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) {
                err = -EMSGSIZE;
        } else {
                __nla_put_nohdr(skb, attrlen, data);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(nla_put_nohdr);

/**
 * nla_append - Add a netlink attribute without header or padding
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Appends exactly @attrlen bytes from @data to the skb. The tailroom
 * check, however, is made against NLA_ALIGN(@attrlen).
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
        int err;

        if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) {
                err = -EMSGSIZE;
        } else {
                skb_put_data(skb, data, attrlen);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(nla_append);
#endif



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 




    1 

































































































































































































































































































































































































    1 
















































































































































    1 

















    1 










































































































































    1 



































    1 


















    1 





































































































































































































































    1 
    1 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  Copyright 2003-2005 Red Hat, Inc.  All rights reserved.
 *  Copyright 2003-2005 Jeff Garzik
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/driver-api/libata.rst
 */

#ifndef __LINUX_LIBATA_H__
#define __LINUX_LIBATA_H__

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/interrupt.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/io.h>
#include <linux/ata.h>
#include <linux/workqueue.h>
#include <scsi/scsi_host.h>
#include <linux/acpi.h>
#include <linux/cdrom.h>
#include <linux/sched.h>
#include <linux/async.h>

/*
 * Define if arch has non-standard setup.  This is a _PCI_ standard
 * not a legacy or ISA standard.
 *
 * With CONFIG_ATA_NONSTANDARD set, <asm/libata-portmap.h> is included
 * instead of the defaults below (presumably it supplies the arch's own
 * ATA_PRIMARY_IRQ/ATA_SECONDARY_IRQ - confirm against that header).
 * Otherwise the primary and secondary IRQs are the fixed values 14 and
 * 15; the @dev argument is ignored.
 */
#ifdef CONFIG_ATA_NONSTANDARD
#include <asm/libata-portmap.h>
#else
#define ATA_PRIMARY_IRQ(dev)        14
#define ATA_SECONDARY_IRQ(dev)        15
#endif

/*
 * compile-time options: to be removed as soon as all the drivers are
 * converted to the new debugging mechanism
 *
 * All four switches are explicitly undefined here, so the debug macros
 * below expand to nothing unless one of these lines is changed to a
 * #define.
 */
#undef ATA_DEBUG                /* debugging output */
#undef ATA_VERBOSE_DEBUG        /* yet more debugging output */
#undef ATA_IRQ_TRAP                /* define to ack screaming irqs */
#undef ATA_NDEBUG                /* define to disable quick runtime checks */


/*
 * DPRINTK()/VPRINTK(): printf-style debug output.
 *
 * note: prints function name for you (the message is prefixed with
 * "<__func__>: " and emitted through printk() at KERN_ERR level).
 *
 * DPRINTK is active only when ATA_DEBUG is defined. VPRINTK additionally
 * requires ATA_VERBOSE_DEBUG. When inactive, a macro expands to nothing,
 * so its arguments are not evaluated.
 */
#ifdef ATA_DEBUG
#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
#ifdef ATA_VERBOSE_DEBUG
#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
#else
#define VPRINTK(fmt, args...)
#endif        /* ATA_VERBOSE_DEBUG */
#else
#define DPRINTK(fmt, args...)
#define VPRINTK(fmt, args...)
#endif        /* ATA_DEBUG */

/*
 * ata_print_version_once - call ata_print_version() at most once per use site
 * @dev: passed through to ata_print_version()
 * @version: passed through to ata_print_version()
 *
 * The guard flag is a static variable declared inside the statement
 * expression, so every place this macro is expanded gets its own flag.
 * The flag is tested and set without any locking.
 */
#define ata_print_version_once(dev, version)                        \
({                                                                \
        static bool __print_once;                                \
                                                                \
        if (!__print_once) {                                        \
                __print_once = true;                                \
                ata_print_version(dev, version);                \
        }                                                        \
})

/* NEW: debug levels */
#define HAVE_LIBATA_MSG 1

/*
 * Message-class bits. Each value is a distinct single bit, so classes can
 * be OR-ed together into one mask; the ata_msg_*() macros test them
 * against a msg_enable field.
 */
enum {
        ATA_MSG_DRV        = 0x0001,
        ATA_MSG_INFO        = 0x0002,
        ATA_MSG_PROBE        = 0x0004,
        ATA_MSG_WARN        = 0x0008,
        ATA_MSG_MALLOC        = 0x0010,
        ATA_MSG_CTL        = 0x0020,
        ATA_MSG_INTR        = 0x0040,
        ATA_MSG_ERR        = 0x0080,
};

/*
 * Message-class predicates: each evaluates to non-zero when the
 * corresponding ATA_MSG_* bit is set in (p)->msg_enable. @p may be a
 * pointer to any object that has a msg_enable member.
 */
#define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
#define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
#define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
#define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
#define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC)
#define ata_msg_ctl(p)    ((p)->msg_enable & ATA_MSG_CTL)
#define ata_msg_intr(p)   ((p)->msg_enable & ATA_MSG_INTR)
#define ata_msg_err(p)    ((p)->msg_enable & ATA_MSG_ERR)

/**
 * ata_msg_init - translate a debug level into a message-enable bitmask
 * @dval: requested debug level
 * @default_msg_enable_bits: mask to fall back to when @dval is out of range
 *
 * Returns @default_msg_enable_bits if @dval is negative or not smaller
 * than the number of bits in a u32, 0 if @dval is 0, and otherwise a mask
 * with the @dval lowest bits set.
 */
static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
{
        if (dval < 0 || dval >= (sizeof(u32) * 8))
                return default_msg_enable_bits; /* should be 0x1 - only driver info msgs */
        if (!dval)
                return 0;
        /*
         * Shift an unsigned one: dval may be 31 here, and shifting a signed
         * 1 into the sign bit (then subtracting 1) is undefined behaviour.
         */
        return (1U << dval) - 1;
}

/*
 * defines only for the constants which don't work well as enums
 *
 * ATA_TAG_POISON is larger than INT_MAX on a 32-bit int, so it is kept as
 * an unsigned #define rather than an enumerator.
 */
#define ATA_TAG_POISON                0xfafbfcfdU

/*
 * Anonymous enum collecting libata's integer constants: limits, flag
 * bits for the various structures (grouped by the comment above each
 * run), timeouts, state numbers and retry counts.  All enumerators
 * share one namespace; the flag groups are independent bit spaces, so
 * the same bit value is reused from group to group.
 */
enum {
        /* various global constants */
        LIBATA_MAX_PRD                = ATA_MAX_PRD / 2,
        LIBATA_DUMB_MAX_PRD        = ATA_MAX_PRD / 4,        /* Worst case */
        ATA_DEF_QUEUE                = 1,
        ATA_MAX_QUEUE                = 32,
        ATA_TAG_INTERNAL        = ATA_MAX_QUEUE,
        ATA_SHORT_PAUSE                = 16,

        ATAPI_MAX_DRAIN                = 16 << 10,

        ATA_ALL_DEVICES                = (1 << ATA_MAX_DEVICES) - 1,

        ATA_SHT_EMULATED        = 1,
        ATA_SHT_THIS_ID                = -1,

        /* struct ata_taskfile flags */
        ATA_TFLAG_LBA48                = (1 << 0), /* enable 48-bit LBA and "HOB" */
        ATA_TFLAG_ISADDR        = (1 << 1), /* enable r/w to nsect/lba regs */
        ATA_TFLAG_DEVICE        = (1 << 2), /* enable r/w to device reg */
        ATA_TFLAG_WRITE                = (1 << 3), /* data dir: host->dev==1 (write) */
        ATA_TFLAG_LBA                = (1 << 4), /* enable LBA */
        ATA_TFLAG_FUA                = (1 << 5), /* enable FUA */
        ATA_TFLAG_POLLING        = (1 << 6), /* set nIEN to 1 and use polling */

        /* struct ata_device stuff */
        ATA_DFLAG_LBA                = (1 << 0), /* device supports LBA */
        ATA_DFLAG_LBA48                = (1 << 1), /* device supports LBA48 */
        ATA_DFLAG_CDB_INTR        = (1 << 2), /* device asserts INTRQ when ready for CDB */
        ATA_DFLAG_NCQ                = (1 << 3), /* device supports NCQ */
        ATA_DFLAG_FLUSH_EXT        = (1 << 4), /* do FLUSH_EXT instead of FLUSH */
        ATA_DFLAG_ACPI_PENDING        = (1 << 5), /* ACPI resume action pending */
        ATA_DFLAG_ACPI_FAILED        = (1 << 6), /* ACPI on devcfg has failed */
        ATA_DFLAG_AN                = (1 << 7), /* AN configured */
        ATA_DFLAG_TRUSTED        = (1 << 8), /* device supports trusted send/recv */
        ATA_DFLAG_DMADIR        = (1 << 10), /* device requires DMADIR */
        ATA_DFLAG_CFG_MASK        = (1 << 12) - 1, /* covers bits 0-11 */

        ATA_DFLAG_PIO                = (1 << 12), /* device limited to PIO mode */
        ATA_DFLAG_NCQ_OFF        = (1 << 13), /* device limited to non-NCQ mode */
        ATA_DFLAG_SLEEPING        = (1 << 15), /* device is sleeping */
        ATA_DFLAG_DUBIOUS_XFER        = (1 << 16), /* data transfer not verified */
        ATA_DFLAG_NO_UNLOAD        = (1 << 17), /* device doesn't support unload */
        ATA_DFLAG_UNLOCK_HPA        = (1 << 18), /* unlock HPA */
        ATA_DFLAG_NCQ_SEND_RECV = (1 << 19), /* device supports NCQ SEND and RECV */
        ATA_DFLAG_NCQ_PRIO        = (1 << 20), /* device supports NCQ priority */
        ATA_DFLAG_NCQ_PRIO_ENABLE = (1 << 21), /* Priority cmds sent to dev */
        ATA_DFLAG_INIT_MASK        = (1 << 24) - 1, /* covers bits 0-23 */

        ATA_DFLAG_DETACH        = (1 << 24),
        ATA_DFLAG_DETACHED        = (1 << 25),

        ATA_DFLAG_DA                = (1 << 26), /* device supports Device Attention */
        ATA_DFLAG_DEVSLP        = (1 << 27), /* device supports Device Sleep */
        ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */
        ATA_DFLAG_D_SENSE        = (1 << 29), /* Descriptor sense requested */
        ATA_DFLAG_ZAC                = (1 << 30), /* ZAC device */

        ATA_DEV_UNKNOWN                = 0,        /* unknown device */
        ATA_DEV_ATA                = 1,        /* ATA device */
        ATA_DEV_ATA_UNSUP        = 2,        /* ATA device (unsupported) */
        ATA_DEV_ATAPI                = 3,        /* ATAPI device */
        ATA_DEV_ATAPI_UNSUP        = 4,        /* ATAPI device (unsupported) */
        ATA_DEV_PMP                = 5,        /* SATA port multiplier */
        ATA_DEV_PMP_UNSUP        = 6,        /* SATA port multiplier (unsupported) */
        ATA_DEV_SEMB                = 7,        /* SEMB */
        ATA_DEV_SEMB_UNSUP        = 8,        /* SEMB (unsupported) */
        ATA_DEV_ZAC                = 9,        /* ZAC device */
        ATA_DEV_ZAC_UNSUP        = 10,        /* ZAC device (unsupported) */
        ATA_DEV_NONE                = 11,        /* no device */

        /* struct ata_link flags */
        /* NOTE: struct ata_force_param currently stores lflags in u16 */
        ATA_LFLAG_NO_HRST        = (1 << 1), /* avoid hardreset */
        ATA_LFLAG_NO_SRST        = (1 << 2), /* avoid softreset */
        ATA_LFLAG_ASSUME_ATA        = (1 << 3), /* assume ATA class */
        ATA_LFLAG_ASSUME_SEMB        = (1 << 4), /* assume SEMB class */
        ATA_LFLAG_ASSUME_CLASS        = ATA_LFLAG_ASSUME_ATA | ATA_LFLAG_ASSUME_SEMB,
        ATA_LFLAG_NO_RETRY        = (1 << 5), /* don't retry this link */
        ATA_LFLAG_DISABLED        = (1 << 6), /* link is disabled */
        ATA_LFLAG_SW_ACTIVITY        = (1 << 7), /* keep activity stats */
        ATA_LFLAG_NO_LPM        = (1 << 8), /* disable LPM on this link */
        ATA_LFLAG_RST_ONCE        = (1 << 9), /* limit recovery to one reset */
        ATA_LFLAG_CHANGED        = (1 << 10), /* LPM state changed on this link */
        ATA_LFLAG_NO_DEBOUNCE_DELAY = (1 << 11), /* no debounce delay on link resume */

        /* struct ata_port flags */
        ATA_FLAG_SLAVE_POSS        = (1 << 0), /* host supports slave dev */
                                            /* (doesn't imply presence) */
        ATA_FLAG_SATA                = (1 << 1),
        ATA_FLAG_NO_LPM                = (1 << 2), /* host not happy with LPM */
        ATA_FLAG_NO_LOG_PAGE        = (1 << 5), /* do not issue log page read */
        ATA_FLAG_NO_ATAPI        = (1 << 6), /* No ATAPI support */
        ATA_FLAG_PIO_DMA        = (1 << 7), /* PIO cmds via DMA */
        ATA_FLAG_PIO_LBA48        = (1 << 8), /* Host DMA engine is LBA28 only */
        ATA_FLAG_PIO_POLLING        = (1 << 9), /* use polling PIO if LLD
                                             * doesn't handle PIO interrupts */
        ATA_FLAG_NCQ                = (1 << 10), /* host supports NCQ */
        ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */
        ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */
        ATA_FLAG_DEBUGMSG        = (1 << 13),
        ATA_FLAG_FPDMA_AA                = (1 << 14), /* driver supports Auto-Activate */
        ATA_FLAG_IGN_SIMPLEX        = (1 << 15), /* ignore SIMPLEX */
        ATA_FLAG_NO_IORDY        = (1 << 16), /* controller lacks iordy */
        ATA_FLAG_ACPI_SATA        = (1 << 17), /* need native SATA ACPI layout */
        ATA_FLAG_AN                = (1 << 18), /* controller supports AN */
        ATA_FLAG_PMP                = (1 << 19), /* controller supports PMP */
        ATA_FLAG_FPDMA_AUX        = (1 << 20), /* controller supports H2DFIS aux field */
        ATA_FLAG_EM                = (1 << 21), /* driver supports enclosure
                                              * management */
        ATA_FLAG_SW_ACTIVITY        = (1 << 22), /* driver supports sw activity
                                              * led */
        ATA_FLAG_NO_DIPM        = (1 << 23), /* host not happy with DIPM */
        ATA_FLAG_SAS_HOST        = (1 << 24), /* SAS host */

        /* bits 24:31 of ap->flags are reserved for LLD specific flags */
        /*
         * NOTE(review): ATA_FLAG_SAS_HOST above is defined as bit 24, which
         * falls inside the range this comment calls reserved - confirm
         * which of the two is intended.
         */


        /* struct ata_port pflags */
        ATA_PFLAG_EH_PENDING        = (1 << 0), /* EH pending */
        ATA_PFLAG_EH_IN_PROGRESS = (1 << 1), /* EH in progress */
        ATA_PFLAG_FROZEN        = (1 << 2), /* port is frozen */
        ATA_PFLAG_RECOVERED        = (1 << 3), /* recovery action performed */
        ATA_PFLAG_LOADING        = (1 << 4), /* boot/loading probe */
        ATA_PFLAG_SCSI_HOTPLUG        = (1 << 6), /* SCSI hotplug scheduled */
        ATA_PFLAG_INITIALIZING        = (1 << 7), /* being initialized, don't touch */
        ATA_PFLAG_RESETTING        = (1 << 8), /* reset in progress */
        ATA_PFLAG_UNLOADING        = (1 << 9), /* driver is being unloaded */
        ATA_PFLAG_UNLOADED        = (1 << 10), /* driver is unloaded */

        ATA_PFLAG_SUSPENDED        = (1 << 17), /* port is suspended (power) */
        ATA_PFLAG_PM_PENDING        = (1 << 18), /* PM operation pending */
        ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */

        ATA_PFLAG_PIO32                = (1 << 20),  /* 32bit PIO */
        ATA_PFLAG_PIO32CHANGE        = (1 << 21),  /* 32bit PIO can be turned on/off */
        ATA_PFLAG_EXTERNAL        = (1 << 22),  /* eSATA/external port */

        /* struct ata_queued_cmd flags */
        ATA_QCFLAG_ACTIVE        = (1 << 0), /* cmd not yet ack'd to scsi layer */
        ATA_QCFLAG_DMAMAP        = (1 << 1), /* SG table is DMA mapped */
        ATA_QCFLAG_IO                = (1 << 3), /* standard IO command */
        ATA_QCFLAG_RESULT_TF        = (1 << 4), /* result TF requested */
        ATA_QCFLAG_CLEAR_EXCL        = (1 << 5), /* clear excl_link on completion */
        ATA_QCFLAG_QUIET        = (1 << 6), /* don't report device error */
        ATA_QCFLAG_RETRY        = (1 << 7), /* retry after failure */

        ATA_QCFLAG_FAILED        = (1 << 16), /* cmd failed and is owned by EH */
        ATA_QCFLAG_SENSE_VALID        = (1 << 17), /* sense data valid */
        ATA_QCFLAG_EH_SCHEDULED = (1 << 18), /* EH scheduled (obsolete) */

        /* host set flags */
        ATA_HOST_SIMPLEX        = (1 << 0),        /* Host is simplex, one DMA channel per host only */
        ATA_HOST_STARTED        = (1 << 1),        /* Host started */
        ATA_HOST_PARALLEL_SCAN        = (1 << 2),        /* Ports on this host can be scanned in parallel */
        ATA_HOST_IGNORE_ATA        = (1 << 3),        /* Ignore ATA devices on this host. */

        ATA_HOST_NO_PART        = (1 << 4), /* Host does not support partial */
        ATA_HOST_NO_SSC                = (1 << 5), /* Host does not support slumber */
        ATA_HOST_NO_DEVSLP        = (1 << 6), /* Host does not support devslp */

        /* bits 24:31 of host->flags are reserved for LLD specific flags */

        /* various lengths of time */
        ATA_TMOUT_BOOT                = 30000,        /* heuristic */
        ATA_TMOUT_BOOT_QUICK        =  7000,        /* heuristic */
        ATA_TMOUT_INTERNAL_QUICK = 5000,
        ATA_TMOUT_MAX_PARK        = 30000,

        /*
         * GoVault needs 2s and iVDR disk HHD424020F7SV00 800ms.  2s
         * is too much without parallel probing.  Use 2s if parallel
         * probing is available, 800ms otherwise.
         */
        ATA_TMOUT_FF_WAIT_LONG        =  2000,
        ATA_TMOUT_FF_WAIT        =   800,

        /* Spec mandates to wait for ">= 2ms" before checking status
         * after reset.  We wait 150ms, because that was the magic
         * delay used for ATAPI devices in Hale Landis's ATADRVR, for
         * the period of time between when the ATA command register is
         * written, and then status is checked.  Because waiting for
         * "a while" before checking status is fine, post SRST, we
         * perform this magic delay here as well.
         *
         * Old drivers/ide uses the 2mS rule and then waits for ready.
         */
        ATA_WAIT_AFTER_RESET        =  150,

        /* If PMP is supported, we have to do follow-up SRST.  As some
         * PMPs don't send D2H Reg FIS after hardreset, LLDs are
         * advised to wait only for the following duration before
         * doing SRST.
         */
        ATA_TMOUT_PMP_SRST_WAIT        = 10000,

        /* When the LPM policy is set to ATA_LPM_MAX_POWER, there might
         * be a spurious PHY event, so ignore the first PHY event that
         * occurs within 10s after the policy change.
         */
        ATA_TMOUT_SPURIOUS_PHY        = 10000,

        /* ATA bus states */
        BUS_UNKNOWN                = 0,
        BUS_DMA                        = 1,
        BUS_IDLE                = 2,
        BUS_NOINTR                = 3,
        BUS_NODATA                = 4,
        BUS_TIMER                = 5,
        BUS_PIO                        = 6,
        BUS_EDD                        = 7,
        BUS_IDENTIFY                = 8,
        BUS_PACKET                = 9,

        /* SATA port states */
        PORT_UNKNOWN                = 0,
        PORT_ENABLED                = 1,
        PORT_DISABLED                = 2,

        /* encoding various smaller bitmaps into a single
         * unsigned long bitmap
         */
        ATA_NR_PIO_MODES        = 7,
        ATA_NR_MWDMA_MODES        = 5,
        ATA_NR_UDMA_MODES        = 8,

        ATA_SHIFT_PIO                = 0,
        ATA_SHIFT_MWDMA                = ATA_SHIFT_PIO + ATA_NR_PIO_MODES,
        ATA_SHIFT_UDMA                = ATA_SHIFT_MWDMA + ATA_NR_MWDMA_MODES,
        ATA_SHIFT_PRIO                = 6,

        ATA_PRIO_HIGH                = 2,
        /* size of buffer to pad xfers ending on unaligned boundaries */
        ATA_DMA_PAD_SZ                = 4,

        /* ering size */
        ATA_ERING_SIZE                = 32,

        /* return values for ->qc_defer */
        ATA_DEFER_LINK                = 1,
        ATA_DEFER_PORT                = 2,

        /* desc_len for ata_eh_info and context */
        ATA_EH_DESC_LEN                = 80,

        /* reset / recovery action types */
        ATA_EH_REVALIDATE        = (1 << 0),
        ATA_EH_SOFTRESET        = (1 << 1), /* meaningful only in ->prereset */
        ATA_EH_HARDRESET        = (1 << 2), /* meaningful only in ->prereset */
        ATA_EH_RESET                = ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
        ATA_EH_ENABLE_LINK        = (1 << 3),
        ATA_EH_PARK                = (1 << 5), /* unload heads and stop I/O */

        ATA_EH_PERDEV_MASK        = ATA_EH_REVALIDATE | ATA_EH_PARK,
        ATA_EH_ALL_ACTIONS        = ATA_EH_REVALIDATE | ATA_EH_RESET |
                                  ATA_EH_ENABLE_LINK,

        /* ata_eh_info->flags */
        ATA_EHI_HOTPLUGGED        = (1 << 0),  /* could have been hotplugged */
        ATA_EHI_NO_AUTOPSY        = (1 << 2),  /* no autopsy */
        ATA_EHI_QUIET                = (1 << 3),  /* be quiet */
        ATA_EHI_NO_RECOVERY        = (1 << 4),  /* no recovery */

        ATA_EHI_DID_SOFTRESET        = (1 << 16), /* already soft-reset this port */
        ATA_EHI_DID_HARDRESET        = (1 << 17), /* already hard-reset this port */
        ATA_EHI_PRINTINFO        = (1 << 18), /* print configuration info */
        ATA_EHI_SETMODE                = (1 << 19), /* configure transfer mode */
        ATA_EHI_POST_SETMODE        = (1 << 20), /* revalidating after setmode */

        ATA_EHI_DID_RESET        = ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET,

        /* mask of flags to transfer *to* the slave link */
        ATA_EHI_TO_SLAVE_MASK        = ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET,

        /* max tries if error condition is still set after ->error_handler */
        ATA_EH_MAX_TRIES        = 5,

        /* sometimes resuming a link requires several retries */
        ATA_LINK_RESUME_TRIES        = 5,

        /* how hard are we gonna try to probe/recover devices */
        ATA_PROBE_MAX_TRIES        = 3,
        ATA_EH_DEV_TRIES        = 3,
        ATA_EH_PMP_TRIES        = 5,
        ATA_EH_PMP_LINK_TRIES        = 3,

        SATA_PMP_RW_TIMEOUT        = 3000,                /* PMP read/write timeout */

        /* This should match the actual table size of
         * ata_eh_cmd_timeout_table in libata-eh.c.
         */
        ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7,

        /* Horkage types. May be set by libata or controller on drives
           (some horkage may be drive/controller pair dependent) */

        ATA_HORKAGE_DIAGNOSTIC        = (1 << 0),        /* Failed boot diag */
        ATA_HORKAGE_NODMA        = (1 << 1),        /* DMA problems */
        ATA_HORKAGE_NONCQ        = (1 << 2),        /* Don't use NCQ */
        ATA_HORKAGE_MAX_SEC_128        = (1 << 3),        /* Limit max sects to 128 */
        ATA_HORKAGE_BROKEN_HPA        = (1 << 4),        /* Broken HPA */
        ATA_HORKAGE_DISABLE        = (1 << 5),        /* Disable it */
        ATA_HORKAGE_HPA_SIZE        = (1 << 6),        /* native size off by one */
        ATA_HORKAGE_IVB                = (1 << 8),        /* cbl det validity bit bugs */
        ATA_HORKAGE_STUCK_ERR        = (1 << 9),        /* stuck ERR on next PACKET */
        ATA_HORKAGE_BRIDGE_OK        = (1 << 10),        /* no bridge limits */
        ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands
                                                    not multiple of 16 bytes */
        ATA_HORKAGE_FIRMWARE_WARN = (1 << 12),        /* firmware update warning */
        ATA_HORKAGE_1_5_GBPS        = (1 << 13),        /* force 1.5 Gbps */
        ATA_HORKAGE_NOSETXFER        = (1 << 14),        /* skip SETXFER, SATA only */
        ATA_HORKAGE_BROKEN_FPDMA_AA        = (1 << 15),        /* skip AA */
        ATA_HORKAGE_DUMP_ID        = (1 << 16),        /* dump IDENTIFY data */
        ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17),        /* Set max sects to 65535 */
        ATA_HORKAGE_ATAPI_DMADIR = (1 << 18),        /* device requires dmadir */
        ATA_HORKAGE_NO_NCQ_TRIM        = (1 << 19),        /* don't use queued TRIM */
        ATA_HORKAGE_NOLPM        = (1 << 20),        /* don't use LPM */
        ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),        /* some WDs have broken LPM */
        ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
        ATA_HORKAGE_NO_DMA_LOG        = (1 << 23),        /* don't use DMA for log read */
        ATA_HORKAGE_NOTRIM        = (1 << 24),        /* don't use TRIM */
        ATA_HORKAGE_MAX_SEC_1024 = (1 << 25),        /* Limit max sects to 1024 */
        ATA_HORKAGE_MAX_TRIM_128M = (1 << 26),        /* Limit max trim size to 128M */
        ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27),        /* Disable NCQ on ATI chipset */

         /* DMA mask for user DMA control: User visible values; DO NOT
            renumber */
        ATA_DMA_MASK_ATA        = (1 << 0),        /* DMA on ATA Disk */
        ATA_DMA_MASK_ATAPI        = (1 << 1),        /* DMA on ATAPI */
        ATA_DMA_MASK_CFA        = (1 << 2),        /* DMA on CF Card */

        /* ATAPI command types */
        ATAPI_READ                = 0,                /* READs */
        ATAPI_WRITE                = 1,                /* WRITEs */
        ATAPI_READ_CD                = 2,                /* READ CD [MSF] */
        ATAPI_PASS_THRU                = 3,                /* SAT pass-thru */
        ATAPI_MISC                = 4,                /* the rest */

        /* Timing constants */
        ATA_TIMING_SETUP        = (1 << 0),
        ATA_TIMING_ACT8B        = (1 << 1),
        ATA_TIMING_REC8B        = (1 << 2),
        ATA_TIMING_CYC8B        = (1 << 3),
        ATA_TIMING_8BIT                = ATA_TIMING_ACT8B | ATA_TIMING_REC8B |
                                  ATA_TIMING_CYC8B,
        ATA_TIMING_ACTIVE        = (1 << 4),
        ATA_TIMING_RECOVER        = (1 << 5),
        ATA_TIMING_DMACK_HOLD        = (1 << 6),
        ATA_TIMING_CYCLE        = (1 << 7),
        ATA_TIMING_UDMA                = (1 << 8),
        ATA_TIMING_ALL                = ATA_TIMING_SETUP | ATA_TIMING_ACT8B |
                                  ATA_TIMING_REC8B | ATA_TIMING_CYC8B |
                                  ATA_TIMING_ACTIVE | ATA_TIMING_RECOVER |
                                  ATA_TIMING_DMACK_HOLD | ATA_TIMING_CYCLE |
                                  ATA_TIMING_UDMA,

        /* ACPI constants */
        ATA_ACPI_FILTER_SETXFER        = 1 << 0,
        ATA_ACPI_FILTER_LOCK        = 1 << 1,
        ATA_ACPI_FILTER_DIPM        = 1 << 2,
        ATA_ACPI_FILTER_FPDMA_OFFSET = 1 << 3,        /* FPDMA non-zero offset */
        ATA_ACPI_FILTER_FPDMA_AA = 1 << 4,        /* FPDMA auto activate */

        ATA_ACPI_FILTER_DEFAULT        = ATA_ACPI_FILTER_SETXFER |
                                  ATA_ACPI_FILTER_LOCK |
                                  ATA_ACPI_FILTER_DIPM,
};

/*
 * Sub-field masks of the combined transfer-mode bitmap.  Each mask is
 * ATA_NR_<class>_MODES consecutive one-bits shifted left by
 * ATA_SHIFT_<class>, so the PIO, MWDMA and UDMA fields sit next to each
 * other without overlapping.
 */
enum ata_xfer_mask {
        ATA_MASK_PIO                = ((1LU << ATA_NR_PIO_MODES) - 1)
                                        << ATA_SHIFT_PIO,
        ATA_MASK_MWDMA                = ((1LU << ATA_NR_MWDMA_MODES) - 1)
                                        << ATA_SHIFT_MWDMA,
        ATA_MASK_UDMA                = ((1LU << ATA_NR_UDMA_MODES) - 1)
                                        << ATA_SHIFT_UDMA,
};

/* HSM task states, numbered explicitly in declaration order. */
enum hsm_task_states {
        /* no command on going */
        HSM_ST_IDLE        = 0,
        /* (waiting the device to) write CDB or first data block */
        HSM_ST_FIRST        = 1,
        /* (waiting the device to) transfer data */
        HSM_ST                = 2,
        /* (waiting the device to) complete command */
        HSM_ST_LAST        = 3,
        /* error */
        HSM_ST_ERR        = 4,
};

/*
 * Completion error bits.  AC_ERR_OK is the empty mask; every other
 * value is a distinct single bit so several can be OR-ed together.
 */
enum ata_completion_errors {
        AC_ERR_OK                = 0x000,        /* no error */
        AC_ERR_DEV                = 0x001,        /* device reported error */
        AC_ERR_HSM                = 0x002,        /* host state machine violation */
        AC_ERR_TIMEOUT                = 0x004,        /* timeout */
        AC_ERR_MEDIA                = 0x008,        /* media error */
        AC_ERR_ATA_BUS                = 0x010,        /* ATA bus error */
        AC_ERR_HOST_BUS                = 0x020,        /* host bus error */
        AC_ERR_SYSTEM                = 0x040,        /* system error */
        AC_ERR_INVALID                = 0x080,        /* invalid argument */
        AC_ERR_OTHER                = 0x100,        /* unknown */
        AC_ERR_NODEV_HINT        = 0x200,        /* polling device detection hint */
        AC_ERR_NCQ                = 0x400,        /* marker for offending NCQ qc */
};

/*
 * Link power management policy.
 *
 * The numeric values are spelled out because libata-scsi.c carries the
 * matching ascii descriptions: if you alter this enum, you also need to
 * alter libata-scsi.c.
 */
enum ata_lpm_policy {
        ATA_LPM_UNKNOWN                        = 0,
        ATA_LPM_MAX_POWER                = 1,
        ATA_LPM_MED_POWER                = 2,
        /* Med power + DIPM as win IRST does */
        ATA_LPM_MED_POWER_WITH_DIPM        = 3,
        /* Min Power + partial and slumber */
        ATA_LPM_MIN_POWER_WITH_PARTIAL        = 4,
        /* Min power + no partial (slumber only) */
        ATA_LPM_MIN_POWER                = 5,
};

/* LPM hint bits; each hint is a separate bit so they can be combined. */
enum ata_lpm_hints {
        ATA_LPM_EMPTY                = 0x1,        /* port empty/probing */
        ATA_LPM_HIPM                = 0x2,        /* may use HIPM */
        ATA_LPM_WAKE_ONLY        = 0x4,        /* only wake up link */
};

/*
 * forward declarations
 *
 * Incomplete struct types, declared here so that the typedefs and
 * structures that follow can use pointers to them.
 */
struct scsi_device;
struct ata_port_operations;
struct ata_port;
struct ata_link;
struct ata_queued_cmd;

/*
 * typedefs
 *
 * ata_qc_cb_t        - callback taking a queued command; this is the type of
 *                      ata_queued_cmd.complete_fn.
 * ata_prereset_fn_t  - takes a link and a deadline, returns int.
 * ata_reset_fn_t     - takes a link, a classes array and a deadline,
 *                      returns int.
 * ata_postreset_fn_t - takes a link and a classes array, returns nothing.
 *
 * NOTE(review): the unit of @deadline and the length of @classes are not
 * stated here - presumably jiffies and ATA_MAX_DEVICES entries; confirm
 * against the callers.
 */
typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
typedef int (*ata_prereset_fn_t)(struct ata_link *link, unsigned long deadline);
typedef int (*ata_reset_fn_t)(struct ata_link *link, unsigned int *classes,
                              unsigned long deadline);
typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes);

/*
 * Device attributes defined elsewhere.  dev_attr_unload_heads is always
 * declared; the remaining ones are declared only when CONFIG_SATA_HOST
 * is set.
 */
extern struct device_attribute dev_attr_unload_heads;
#ifdef CONFIG_SATA_HOST
extern struct device_attribute dev_attr_link_power_management_policy;
extern struct device_attribute dev_attr_ncq_prio_enable;
extern struct device_attribute dev_attr_em_message_type;
extern struct device_attribute dev_attr_em_message;
extern struct device_attribute dev_attr_sw_activity;
#endif

/* Software activity modes, numbered explicitly in declaration order. */
enum sw_activity {
        OFF                = 0,
        BLINK_ON        = 1,
        BLINK_OFF        = 2,
};

/*
 * struct ata_taskfile - register-level description of one ATA command.
 *
 * Holds the ATA_TFLAG_* flags, the protocol, the control register value
 * and one byte per taskfile register.  The hob_* members carry the
 * additional bytes used for LBA48.
 */
struct ata_taskfile {
        unsigned long                flags;                /* ATA_TFLAG_xxx */
        u8                        protocol;        /* ATA_PROT_xxx */

        u8                        ctl;                /* control reg */

        u8                        hob_feature;        /* additional data */
        u8                        hob_nsect;        /* to support LBA48 */
        u8                        hob_lbal;
        u8                        hob_lbam;
        u8                        hob_lbah;

        u8                        feature;
        u8                        nsect;
        u8                        lbal;
        u8                        lbam;
        u8                        lbah;

        u8                        device;

        u8                        command;        /* IO operation */

        u32                        auxiliary;        /* auxiliary field */
                                                /* from SATA 3.1 and */
                                                /* ATA-8 ACS-3 */
};

#ifdef CONFIG_ATA_SFF
/*
 * struct ata_ioports - __iomem addresses of a port's registers.
 *
 * Only defined when CONFIG_ATA_SFF is set.  There is one pointer per
 * register; bmdma_addr exists only when CONFIG_ATA_BMDMA is also set.
 */
struct ata_ioports {
        void __iomem                *cmd_addr;
        void __iomem                *data_addr;
        void __iomem                *error_addr;
        void __iomem                *feature_addr;
        void __iomem                *nsect_addr;
        void __iomem                *lbal_addr;
        void __iomem                *lbam_addr;
        void __iomem                *lbah_addr;
        void __iomem                *device_addr;
        void __iomem                *status_addr;
        void __iomem                *command_addr;
        void __iomem                *altstatus_addr;
        void __iomem                *ctl_addr;
#ifdef CONFIG_ATA_BMDMA
        void __iomem                *bmdma_addr;
#endif /* CONFIG_ATA_BMDMA */
        void __iomem                *scr_addr;
};
#endif /* CONFIG_ATA_SFF */

/*
 * struct ata_host - per-host state shared by the host's ports.
 *
 * ports[] is a flexible array member and must stay last.
 * NOTE(review): presumably it holds n_ports entries and flags holds
 * ATA_HOST_* bits - confirm against the allocation and users.
 */
struct ata_host {
        spinlock_t                lock;
        struct device                 *dev;
        void __iomem * const        *iomap;
        unsigned int                n_ports;
        unsigned int                n_tags;                        /* nr of NCQ tags */
        void                        *private_data;
        struct ata_port_operations *ops;
        unsigned long                flags;
        struct kref                kref;

        struct mutex                eh_mutex;
        struct task_struct        *eh_owner;

        struct ata_port                *simplex_claimed;        /* channel owning the DMA */
        struct ata_port                *ports[];
};

/*
 * struct ata_queued_cmd - one queued command.
 *
 * Ties together the port and device the command belongs to, the
 * originating SCSI command and its done callback, the taskfile to issue
 * (tf) and the result taskfile (result_tf), the scatter/gather state and
 * byte counters, the accumulated err_mask and the completion callback.
 */
struct ata_queued_cmd {
        struct ata_port                *ap;
        struct ata_device        *dev;

        struct scsi_cmnd        *scsicmd;
        void                        (*scsidone)(struct scsi_cmnd *);

        struct ata_taskfile        tf;
        u8                        cdb[ATAPI_CDB_LEN];

        unsigned long                flags;                /* ATA_QCFLAG_xxx */
        unsigned int                tag;                /* libata core tag */
        unsigned int                hw_tag;                /* driver tag */
        unsigned int                n_elem;
        unsigned int                orig_n_elem;

        int                        dma_dir;

        unsigned int                sect_size;

        unsigned int                nbytes;
        unsigned int                extrabytes;
        unsigned int                curbytes;

        struct scatterlist        sgent;

        struct scatterlist        *sg;

        struct scatterlist        *cursg;
        unsigned int                cursg_ofs;

        unsigned int                err_mask;
        struct ata_taskfile        result_tf;
        ata_qc_cb_t                complete_fn;

        void                        *private_data;
        void                        *lldd_task;
};

/* struct ata_port_stats - three unsigned long counters kept per port. */
struct ata_port_stats {
        unsigned long                unhandled_irq;
        unsigned long                idle_irq;
        unsigned long                rw_reqbuf;
};

/*
 * struct ata_ering_entry - one record of the error ring: flags, an
 * error mask and a 64-bit timestamp.
 */
struct ata_ering_entry {
        unsigned int                eflags;
        unsigned int                err_mask;
        u64                        timestamp;
};

/*
 * struct ata_ering - fixed-size error ring of ATA_ERING_SIZE entries
 * plus a cursor into it.
 */
struct ata_ering {
        int                        cursor;
        struct ata_ering_entry        ring[ATA_ERING_SIZE];
};

/*
 * struct ata_device - per-device state.
 *
 * Member order matters: ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END
 * are offsetof() values taken at n_sectors and ering, so moving members
 * across those two points changes which fields get cleared.
 */
struct ata_device {
        struct ata_link                *link;
        unsigned int                devno;                /* 0 or 1 */
        unsigned int                horkage;        /* List of broken features */
        unsigned long                flags;                /* ATA_DFLAG_xxx */
        struct scsi_device        *sdev;                /* attached SCSI device */
        void                        *private_data;
#ifdef CONFIG_ATA_ACPI
        union acpi_object        *gtf_cache;
        unsigned int                gtf_filter;
#endif
#ifdef CONFIG_SATA_ZPODD
        void                        *zpodd;
#endif
        struct device                tdev;
        /* n_sectors is ATA_DEVICE_CLEAR_BEGIN, see the comment above that define */
        u64                        n_sectors;        /* size of device, if ATA */
        u64                        n_native_sectors; /* native size, if ATA */
        unsigned int                class;                /* ATA_DEV_xxx */
        unsigned long                unpark_deadline;

        u8                        pio_mode;
        u8                        dma_mode;
        u8                        xfer_mode;
        unsigned int                xfer_shift;        /* ATA_SHIFT_xxx */

        unsigned int                multi_count;        /* sectors count for
                                                   READ/WRITE MULTIPLE */
        unsigned int                max_sectors;        /* per-device max sectors */
        unsigned int                cdb_len;

        /* per-dev xfer mask */
        unsigned long                pio_mask;
        unsigned long                mwdma_mask;
        unsigned long                udma_mask;

        /* for CHS addressing */
        u16                        cylinders;        /* Number of cylinders */
        u16                        heads;                /* Number of heads */
        u16                        sectors;        /* Number of sectors per track */

        /* id and gscr share storage; only one of the two is valid at a time */
        union {
                u16                id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
                u32                gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */
        } ____cacheline_aligned;

        /* DEVSLP Timing Variables from Identify Device Data Log */
        u8                        devslp_timing[ATA_LOG_DEVSLP_SIZE];

        /* NCQ send and receive log subcommand support */
        u8                        ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE];
        u8                        ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE];

        /* ZAC zone configuration */
        u32                        zac_zoned_cap;
        u32                        zac_zones_optimal_open;
        u32                        zac_zones_optimal_nonseq;
        u32                        zac_zones_max_open;

        /* error history */
        int                        spdn_cnt;
        /* ering is ATA_DEVICE_CLEAR_END, see the comment above that define */
        struct ata_ering        ering;
};

/* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are
 * cleared to zero on ata_dev_init().
 *
 * Both are byte offsets into struct ata_device: BEGIN is the offset of
 * n_sectors and END is the offset of ering.  NOTE(review): END being the
 * start of ering suggests ering itself is not part of the cleared
 * range - confirm in ata_dev_init().
 */
#define ATA_DEVICE_CLEAR_BEGIN                offsetof(struct ata_device, n_sectors)
#define ATA_DEVICE_CLEAR_END                offsetof(struct ata_device, ering)

/*
 * struct ata_eh_info - error information handed to error handling (EH).
 *
 * Carries the offending device, the SError value, the port-wide error
 * mask, the requested ATA_EH_* actions (port-wide and per device), the
 * ATA_EHI_* flags, a probe mask, and a description buffer of
 * ATA_EH_DESC_LEN bytes with its length counter.
 */
struct ata_eh_info {
        struct ata_device        *dev;                /* offending device */
        u32                        serror;                /* SError from LLDD */
        unsigned int                err_mask;        /* port-wide err_mask */
        unsigned int                action;                /* ATA_EH_* action mask */
        unsigned int                dev_action[ATA_MAX_DEVICES]; /* dev EH action */
        unsigned int                flags;                /* ATA_EHI_* flags */

        unsigned int                probe_mask;

        char                        desc[ATA_EH_DESC_LEN];
        int                        desc_len;
};

/*
 * EH working state; struct ata_link embeds one as ->eh_context.
 *
 * Most members are arrays of ATA_MAX_DEVICES entries.
 * NOTE(review): presumably indexed by device number on the link, and the
 * *_mask members presumably hold one bit per device - confirm.
 */
struct ata_eh_context {
        struct ata_eh_info        i;        /* sata_ehc_deb_timing() reads i.flags */
        int                        tries[ATA_MAX_DEVICES];
        int                        cmd_timeout_idx[ATA_MAX_DEVICES]
                                               [ATA_EH_CMD_TIMEOUT_TABLE_SIZE];
        unsigned int                classes[ATA_MAX_DEVICES];
        unsigned int                did_probe_mask;
        unsigned int                unloaded_mask;
        unsigned int                saved_ncq_enabled;
        u8                        saved_xfer_mode[ATA_MAX_DEVICES];
        /* timestamp for the last reset attempt or success */
        unsigned long                last_reset;
};

/*
 * One drive's entry inside struct ata_acpi_gtm: a PIO value followed by
 * a DMA value, packed with no padding.
 */
struct ata_acpi_drive {
        u32 pio;
        u32 dma;
} __packed;

/*
 * Packed block of two ata_acpi_drive entries followed by a flags word.
 * It is the type exchanged by ata_acpi_gtm() and ata_acpi_stm().
 * NOTE(review): looks like the buffer layout of the ACPI _GTM/_STM
 * methods - confirm against libata-acpi.c.
 */
struct ata_acpi_gtm {
        struct ata_acpi_drive drive[2];
        u32 flags;
} __packed;

/*
 * State of one ATA link.  A link points back to its port (->ap), carries
 * its own EH info and EH context, and holds up to ATA_MAX_DEVICES
 * devices.  struct ata_port embeds one as its host default link and can
 * reference further links (slave link, PMP links).
 *
 * Member order matters: ATA_LINK_CLEAR_BEGIN/END below are defined in
 * terms of the positions of ->active_tag and ->device[0].
 */
struct ata_link {
        struct ata_port                *ap;
        int                        pmp;                /* port multiplier port # */

        struct device                tdev;
        unsigned int                active_tag;        /* active tag on this link */
        u32                        sactive;        /* active NCQ commands */

        unsigned int                flags;                /* ATA_LFLAG_xxx */

        u32                        saved_scontrol;        /* SControl on probe */
        unsigned int                hw_sata_spd_limit;
        unsigned int                sata_spd_limit;
        unsigned int                sata_spd;        /* current SATA PHY speed */
        enum ata_lpm_policy        lpm_policy;

        /* record runtime error info, protected by host_set lock */
        struct ata_eh_info        eh_info;
        /* EH context */
        struct ata_eh_context        eh_context;

        struct ata_device        device[ATA_MAX_DEVICES];

        unsigned long                last_lpm_change; /* when last LPM change happened */
};
/*
 * Byte offsets delimiting the part of struct ata_link that runs from
 * ->active_tag up to ->device[0].
 * NOTE(review): presumably this range is zeroed when a link is
 * (re)initialised, mirroring ATA_DEVICE_CLEAR_* - confirm in libata-core.c.
 */
#define ATA_LINK_CLEAR_BEGIN                offsetof(struct ata_link, active_tag)
#define ATA_LINK_CLEAR_END                offsetof(struct ata_link, device[0])

/*
 * Per-port state.  Besides identification and flags it holds the
 * queued-command table (qcmd[]), the host default link (->link) plus
 * optional slave and PMP links, EH bookkeeping, and a sector-sized
 * scratch buffer owned by EH.  The SFF, BMDMA and ACPI members exist
 * only when the corresponding CONFIG_ option is enabled.
 */
struct ata_port {
        struct Scsi_Host        *scsi_host; /* our co-allocated scsi host */
        /* driver callbacks; ata_port_is_dummy() compares this with &ata_dummy_port_ops */
        struct ata_port_operations *ops;
        spinlock_t                *lock;
        /* Flags owned by the EH context. Only EH should touch these once the
           port is active */
        unsigned long                flags;        /* ATA_FLAG_xxx */
        /* Flags that change dynamically, protected by ap->lock */
        unsigned int                pflags; /* ATA_PFLAG_xxx */
        unsigned int                print_id; /* user visible unique port ID */
        unsigned int            local_port_no; /* host local port num */
        unsigned int                port_no; /* 0 based port no. inside the host */

#ifdef CONFIG_ATA_SFF
        struct ata_ioports        ioaddr;        /* ATA cmd/ctl/dma register blocks */
        u8                        ctl;        /* cache of ATA control register */
        u8                        last_ctl;        /* Cache last written value */
        struct ata_link*        sff_pio_task_link; /* link currently used */
        struct delayed_work        sff_pio_task;
#ifdef CONFIG_ATA_BMDMA
        struct ata_bmdma_prd        *bmdma_prd;        /* BMDMA SG list */
        dma_addr_t                bmdma_prd_dma;        /* and its DMA mapping */
#endif /* CONFIG_ATA_BMDMA */
#endif /* CONFIG_ATA_SFF */

        unsigned int                pio_mask;
        unsigned int                mwdma_mask;
        unsigned int                udma_mask;
        unsigned int                cbl;        /* cable type; ATA_CBL_xxx */

        /* one slot more than ATA_MAX_QUEUE; NOTE(review): presumably the extra slot serves the internal tag - confirm */
        struct ata_queued_cmd        qcmd[ATA_MAX_QUEUE + 1];
        unsigned long                sas_tag_allocated; /* for sas tag allocation only */
        u64                        qc_active;
        int                        nr_active_links; /* #links with active qcs */
        unsigned int                sas_last_tag;        /* track next tag hw expects */

        struct ata_link                link;                /* host default link */
        struct ata_link                *slave_link;        /* see ata_slave_link_init() */

        int                        nr_pmp_links;        /* nr of available PMP links */
        struct ata_link                *pmp_link;        /* array of PMP links */
        struct ata_link                *excl_link;        /* for PMP qc exclusion */

        struct ata_port_stats        stats;
        struct ata_host                *host;
        struct device                 *dev;
        struct device                tdev;

        struct mutex                scsi_scan_mutex;
        struct delayed_work        hotplug_task;
        struct work_struct        scsi_rescan_task;

        unsigned int                hsm_task_state;

        u32                        msg_enable;
        struct list_head        eh_done_q;
        wait_queue_head_t        eh_wait_q;
        int                        eh_tries;
        struct completion        park_req_pending;

        pm_message_t                pm_mesg;
        enum ata_lpm_policy        target_lpm_policy;

        struct timer_list        fastdrain_timer;
        unsigned long                fastdrain_cnt;

        async_cookie_t                cookie;

        int                        em_message_type;
        void                        *private_data;

#ifdef CONFIG_ATA_ACPI
        struct ata_acpi_gtm        __acpi_init_gtm; /* use ata_acpi_init_gtm() */
#endif
        /* owned by EH */
        u8                        sector_buf[ATA_SECT_SIZE] ____cacheline_aligned;
};

/* The following initializer overrides a method to NULL whether one of
 * its parents has the method defined or not.  This is equivalent to
 * ERR_PTR(-ENOENT).  Unfortunately, ERR_PTR doesn't render a constant
 * expression and thus can't be used as an initializer.
 *
 * The expansion is fully parenthesized so that it behaves as a single
 * operand wherever it is used (for example as the operand of sizeof).
 */
#define ATA_OP_NULL                ((void *)(unsigned long)(-ENOENT))

/*
 * Table of per-port driver callbacks, grouped by purpose (command
 * execution, configuration and exception handling, optional features,
 * power management, SFF/BMDMA register access, enclosure management).
 *
 * Every member except the last is a pointer to a function; the last
 * member, ->inherits, points to another ata_port_operations table.
 * NOTE(review): presumably methods left unset are taken from the
 * ->inherits chain when the host is set up, with ATA_OP_NULL forcing a
 * method to NULL - confirm in libata-core.c.
 */
struct ata_port_operations {
        /*
         * Command execution
         */
        int (*qc_defer)(struct ata_queued_cmd *qc);
        int (*check_atapi_dma)(struct ata_queued_cmd *qc);
        enum ata_completion_errors (*qc_prep)(struct ata_queued_cmd *qc);
        unsigned int (*qc_issue)(struct ata_queued_cmd *qc);
        bool (*qc_fill_rtf)(struct ata_queued_cmd *qc);

        /*
         * Configuration and exception handling
         */
        int  (*cable_detect)(struct ata_port *ap);
        unsigned long (*mode_filter)(struct ata_device *dev, unsigned long xfer_mask);
        void (*set_piomode)(struct ata_port *ap, struct ata_device *dev);
        void (*set_dmamode)(struct ata_port *ap, struct ata_device *dev);
        int  (*set_mode)(struct ata_link *link, struct ata_device **r_failed_dev);
        unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf, u16 *id);

        void (*dev_config)(struct ata_device *dev);

        void (*freeze)(struct ata_port *ap);
        void (*thaw)(struct ata_port *ap);
        ata_prereset_fn_t        prereset;
        ata_reset_fn_t                softreset;
        ata_reset_fn_t                hardreset;
        ata_postreset_fn_t        postreset;
        ata_prereset_fn_t        pmp_prereset;
        ata_reset_fn_t                pmp_softreset;
        ata_reset_fn_t                pmp_hardreset;
        ata_postreset_fn_t        pmp_postreset;
        void (*error_handler)(struct ata_port *ap);
        void (*lost_interrupt)(struct ata_port *ap);
        void (*post_internal_cmd)(struct ata_queued_cmd *qc);
        void (*sched_eh)(struct ata_port *ap);
        void (*end_eh)(struct ata_port *ap);

        /*
         * Optional features
         */
        int  (*scr_read)(struct ata_link *link, unsigned int sc_reg, u32 *val);
        int  (*scr_write)(struct ata_link *link, unsigned int sc_reg, u32 val);
        void (*pmp_attach)(struct ata_port *ap);
        void (*pmp_detach)(struct ata_port *ap);
        int  (*set_lpm)(struct ata_link *link, enum ata_lpm_policy policy,
                        unsigned hints);

        /*
         * Start, stop, suspend and resume
         */
        int  (*port_suspend)(struct ata_port *ap, pm_message_t mesg);
        int  (*port_resume)(struct ata_port *ap);
        int  (*port_start)(struct ata_port *ap);
        void (*port_stop)(struct ata_port *ap);
        void (*host_stop)(struct ata_host *host);

#ifdef CONFIG_ATA_SFF
        /*
         * SFF / taskfile oriented ops
         */
        void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
        void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
        u8   (*sff_check_status)(struct ata_port *ap);
        u8   (*sff_check_altstatus)(struct ata_port *ap);
        void (*sff_tf_load)(struct ata_port *ap, const struct ata_taskfile *tf);
        void (*sff_tf_read)(struct ata_port *ap, struct ata_taskfile *tf);
        void (*sff_exec_command)(struct ata_port *ap,
                                 const struct ata_taskfile *tf);
        unsigned int (*sff_data_xfer)(struct ata_queued_cmd *qc,
                        unsigned char *buf, unsigned int buflen, int rw);
        void (*sff_irq_on)(struct ata_port *);
        bool (*sff_irq_check)(struct ata_port *);
        void (*sff_irq_clear)(struct ata_port *);
        void (*sff_drain_fifo)(struct ata_queued_cmd *qc);

#ifdef CONFIG_ATA_BMDMA
        void (*bmdma_setup)(struct ata_queued_cmd *qc);
        void (*bmdma_start)(struct ata_queued_cmd *qc);
        void (*bmdma_stop)(struct ata_queued_cmd *qc);
        u8   (*bmdma_status)(struct ata_port *ap);
#endif /* CONFIG_ATA_BMDMA */
#endif /* CONFIG_ATA_SFF */

        /* Enclosure management and software activity LED hooks */
        ssize_t (*em_show)(struct ata_port *ap, char *buf);
        ssize_t (*em_store)(struct ata_port *ap, const char *message,
                            size_t size);
        ssize_t (*sw_activity_show)(struct ata_device *dev, char *buf);
        ssize_t (*sw_activity_store)(struct ata_device *dev,
                                     enum sw_activity val);
        ssize_t (*transmit_led_message)(struct ata_port *ap, u32 state,
                                        ssize_t size);

        /*
         * Obsolete
         */
        void (*phy_reset)(struct ata_port *ap);
        void (*eng_timeout)(struct ata_port *ap);

        /*
         * ->inherits must be the last field and all the preceding
         * fields must be pointers.
         */
        const struct ata_port_operations        *inherits;
};

/*
 * Per-port configuration template: flags, link flags, the three transfer
 * mode masks, the operations table and driver private data.  It is the
 * type taken by ata_host_alloc_pinfo() (as an array of pointers) and by
 * ata_sas_port_alloc().
 */
struct ata_port_info {
        unsigned long                flags;
        unsigned long                link_flags;
        unsigned long                pio_mask;
        unsigned long                mwdma_mask;
        unsigned long                udma_mask;
        struct ata_port_operations *port_ops;
        void                         *private_data;
};

/*
 * One set of timing values for a transfer mode.  The per-field comments
 * name the timing symbol each value corresponds to.
 * NOTE(review): the unit of the values is not stated here - confirm with
 * the timing code before relying on it.
 */
struct ata_timing {
        unsigned short mode;                /* ATA mode */
        unsigned short setup;                /* t1 */
        unsigned short act8b;                /* t2 for 8-bit I/O */
        unsigned short rec8b;                /* t2i for 8-bit I/O */
        unsigned short cyc8b;                /* t0 for 8-bit I/O */
        unsigned short active;                /* t2 or tD */
        unsigned short recover;                /* t2i or tK */
        unsigned short dmack_hold;        /* tj */
        unsigned short cycle;                /* t0 */
        unsigned short udma;                /* t2CYCTYP/2 */
};

/*
 * Core layer - drivers/ata/libata-core.c
 */
extern struct ata_port_operations ata_dummy_port_ops;
extern const struct ata_port_info ata_dummy_port_info;

/* True when protocol value @prot has the ATA_PROT_FLAG_ATAPI bit set. */
static inline bool ata_is_atapi(u8 prot)
{
        return (prot & ATA_PROT_FLAG_ATAPI) != 0;
}

/* True when protocol value @prot has the ATA_PROT_FLAG_PIO bit set. */
static inline bool ata_is_pio(u8 prot)
{
        return (prot & ATA_PROT_FLAG_PIO) != 0;
}

/* True when protocol value @prot has the ATA_PROT_FLAG_DMA bit set. */
static inline bool ata_is_dma(u8 prot)
{
        return (prot & ATA_PROT_FLAG_DMA) != 0;
}

/* True when protocol value @prot has the ATA_PROT_FLAG_NCQ bit set. */
static inline bool ata_is_ncq(u8 prot)
{
        return (prot & ATA_PROT_FLAG_NCQ) != 0;
}

/*
 * True when protocol value @prot has at least one of the PIO or DMA
 * protocol flag bits set.
 */
static inline bool ata_is_data(u8 prot)
{
        if (prot & ATA_PROT_FLAG_PIO)
                return true;

        return (prot & ATA_PROT_FLAG_DMA) != 0;
}

/*
 * Return 1 when the command in @tf is one of the READ/WRITE MULTIPLE
 * opcodes listed below (including the EXT and FUA EXT forms), else 0.
 */
static inline int is_multi_taskfile(struct ata_taskfile *tf)
{
        switch (tf->command) {
        case ATA_CMD_READ_MULTI:
        case ATA_CMD_WRITE_MULTI:
        case ATA_CMD_READ_MULTI_EXT:
        case ATA_CMD_WRITE_MULTI_EXT:
        case ATA_CMD_WRITE_MULTI_FUA_EXT:
                return 1;
        default:
                return 0;
        }
}

/* Return 1 when @ap uses the ata_dummy_port_ops operations table, else 0. */
static inline int ata_port_is_dummy(struct ata_port *ap)
{
        const struct ata_port_operations *ops = ap->ops;

        return (ops == &ata_dummy_port_ops) ? 1 : 0;
}

extern int ata_std_prereset(struct ata_link *link, unsigned long deadline);
extern int ata_wait_after_reset(struct ata_link *link, unsigned long deadline,
                                int (*check_ready)(struct ata_link *link));
extern int sata_std_hardreset(struct ata_link *link, unsigned int *class,
                              unsigned long deadline);
extern void ata_std_postreset(struct ata_link *link, unsigned int *classes);

extern struct ata_host *ata_host_alloc(struct device *dev, int max_ports);
extern struct ata_host *ata_host_alloc_pinfo(struct device *dev,
                        const struct ata_port_info * const * ppi, int n_ports);
extern void ata_host_get(struct ata_host *host);
extern void ata_host_put(struct ata_host *host);
extern int ata_host_start(struct ata_host *host);
extern int ata_host_register(struct ata_host *host,
                             struct scsi_host_template *sht);
extern int ata_host_activate(struct ata_host *host, int irq,
                             irq_handler_t irq_handler, unsigned long irq_flags,
                             struct scsi_host_template *sht);
extern void ata_host_detach(struct ata_host *host);
extern void ata_host_init(struct ata_host *, struct device *, struct ata_port_operations *);
extern int ata_scsi_detect(struct scsi_host_template *sht);
extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd,
                          void __user *arg);
/*
 * Initializer fragment for the .compat_ioctl member; it expands to
 * nothing when CONFIG_COMPAT is not set.  The trailing comma is part of
 * the expansion, so a user such as __ATA_BASE_SHT must not add its own.
 */
#ifdef CONFIG_COMPAT
#define ATA_SCSI_COMPAT_IOCTL .compat_ioctl = ata_scsi_ioctl,
#else
#define ATA_SCSI_COMPAT_IOCTL /* empty */
#endif
extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd);
/*
 * When CONFIG_ATA is not reachable the name expands to NULL, so that an
 * initializer such as ".dma_need_drain = ata_scsi_dma_need_drain" in
 * __ATA_BASE_SHT still compiles and simply leaves the hook unset.
 */
#if IS_REACHABLE(CONFIG_ATA)
bool ata_scsi_dma_need_drain(struct request *rq);
#else
#define ata_scsi_dma_need_drain NULL
#endif
extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev,
                            unsigned int cmd, void __user *arg);
extern bool ata_link_online(struct ata_link *link);
extern bool ata_link_offline(struct ata_link *link);
#ifdef CONFIG_PM
extern int ata_host_suspend(struct ata_host *host, pm_message_t mesg);
extern void ata_host_resume(struct ata_host *host);
extern void ata_sas_port_suspend(struct ata_port *ap);
extern void ata_sas_port_resume(struct ata_port *ap);
#else
/* Empty stand-in used when CONFIG_PM is not set. */
static inline void ata_sas_port_suspend(struct ata_port *ap) { }
/* Empty stand-in used when CONFIG_PM is not set. */
static inline void ata_sas_port_resume(struct ata_port *ap) { }
#endif
extern int ata_ratelimit(void);
extern void ata_msleep(struct ata_port *ap, unsigned int msecs);
extern u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask,
                        u32 val, unsigned long interval, unsigned long timeout);
extern int atapi_cmd_type(u8 opcode);
extern unsigned long ata_pack_xfermask(unsigned long pio_mask,
                        unsigned long mwdma_mask, unsigned long udma_mask);
extern void ata_unpack_xfermask(unsigned long xfer_mask,
                        unsigned long *pio_mask, unsigned long *mwdma_mask,
                        unsigned long *udma_mask);
extern u8 ata_xfer_mask2mode(unsigned long xfer_mask);
extern unsigned long ata_xfer_mode2mask(u8 xfer_mode);
extern int ata_xfer_mode2shift(unsigned long xfer_mode);
extern const char *ata_mode_string(unsigned long xfer_mask);
extern unsigned long ata_id_xfermask(const u16 *id);
extern int ata_std_qc_defer(struct ata_queued_cmd *qc);
extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc);
extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
                 unsigned int n_elem);
extern unsigned int ata_dev_classify(const struct ata_taskfile *tf);
extern void ata_dev_disable(struct ata_device *adev);
extern void ata_id_string(const u16 *id, unsigned char *s,
                          unsigned int ofs, unsigned int len);
extern void ata_id_c_string(const u16 *id, unsigned char *s,
                            unsigned int ofs, unsigned int len);
extern unsigned int ata_do_dev_read_id(struct ata_device *dev,
                                        struct ata_taskfile *tf, u16 *id);
extern void ata_qc_complete(struct ata_queued_cmd *qc);
extern u64 ata_qc_get_active(struct ata_port *ap);
extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd);
extern int ata_std_bios_param(struct scsi_device *sdev,
                              struct block_device *bdev,
                              sector_t capacity, int geom[]);
extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
extern int ata_scsi_slave_config(struct scsi_device *sdev);
extern void ata_scsi_slave_destroy(struct scsi_device *sdev);
extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
                                       int queue_depth);
extern int __ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev,
                                    int queue_depth);
extern struct ata_device *ata_dev_pair(struct ata_device *adev);
extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev);
extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap);
extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q);

/*
 * SATA specific code - drivers/ata/libata-sata.c
 */
#ifdef CONFIG_SATA_HOST
extern const unsigned long sata_deb_timing_normal[];
extern const unsigned long sata_deb_timing_hotplug[];
extern const unsigned long sata_deb_timing_long[];

/*
 * Select the debounce timing table for an EH context: the hotplug table
 * when ATA_EHI_HOTPLUGGED is set in the context's info flags, the normal
 * table otherwise.
 */
static inline const unsigned long *
sata_ehc_deb_timing(struct ata_eh_context *ehc)
{
        return (ehc->i.flags & ATA_EHI_HOTPLUGGED) ?
                sata_deb_timing_hotplug : sata_deb_timing_normal;
}

extern int sata_scr_valid(struct ata_link *link);
extern int sata_scr_read(struct ata_link *link, int reg, u32 *val);
extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val);
extern int sata_set_spd(struct ata_link *link);
extern int sata_link_hardreset(struct ata_link *link,
                        const unsigned long *timing, unsigned long deadline,
                        bool *online, int (*check_ready)(struct ata_link *));
extern int sata_link_resume(struct ata_link *link, const unsigned long *params,
                            unsigned long deadline);
extern void ata_eh_analyze_ncq_error(struct ata_link *link);
#else
/* Stand-in used when CONFIG_SATA_HOST is not set: there is no timing table. */
static inline const unsigned long *
sata_ehc_deb_timing(struct ata_eh_context *ehc) { return NULL; }
/* Stand-in used when CONFIG_SATA_HOST is not set: always reports 0. */
static inline int sata_scr_valid(struct ata_link *link)
{
        return 0;
}
/*
 * SCR accessor stand-ins used when CONFIG_SATA_HOST is not set.  None of
 * them touches its arguments; each fails with -EOPNOTSUPP.
 */
static inline int sata_scr_read(struct ata_link *link, int reg, u32 *val) { return -EOPNOTSUPP; }
static inline int sata_scr_write(struct ata_link *link, int reg, u32 val) { return -EOPNOTSUPP; }
static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val) { return -EOPNOTSUPP; }
/* Stand-in used when CONFIG_SATA_HOST is not set: always -EOPNOTSUPP. */
static inline int sata_set_spd(struct ata_link *link)
{
        return -EOPNOTSUPP;
}
/*
 * Stand-in used when CONFIG_SATA_HOST is not set.  If the caller passed
 * an @online pointer it is set to false; the call then fails with
 * -EOPNOTSUPP.  The remaining arguments are ignored.
 */
static inline int sata_link_hardreset(struct ata_link *link,
                                      const unsigned long *timing,
                                      unsigned long deadline,
                                      bool *online,
                                      int (*check_ready)(struct ata_link *))
{
        if (online != NULL)
                *online = false;

        return -EOPNOTSUPP;
}
/* Stand-in used when CONFIG_SATA_HOST is not set: always -EOPNOTSUPP. */
static inline int sata_link_resume(struct ata_link *link,
                                   const unsigned long *params,
                                   unsigned long deadline) { return -EOPNOTSUPP; }
/* Empty stand-in used when CONFIG_SATA_HOST is not set. */
static inline void ata_eh_analyze_ncq_error(struct ata_link *link)
{
}
#endif
extern int sata_link_debounce(struct ata_link *link,
                        const unsigned long *params, unsigned long deadline);
extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
                             bool spm_wakeup);
extern int ata_slave_link_init(struct ata_port *ap);
extern void ata_sas_port_destroy(struct ata_port *);
extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
                                           struct ata_port_info *, struct Scsi_Host *);
extern void ata_sas_async_probe(struct ata_port *ap);
extern int ata_sas_sync_probe(struct ata_port *ap);
extern int ata_sas_port_init(struct ata_port *);
extern int ata_sas_port_start(struct ata_port *ap);
extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
extern void ata_sas_tport_delete(struct ata_port *ap);
extern void ata_sas_port_stop(struct ata_port *ap);
extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
extern void ata_tf_to_fis(const struct ata_taskfile *tf,
                          u8 pmp, int is_cmd, u8 *fis);
extern void ata_tf_from_fis(const u8 *fis, struct ata_taskfile *tf);
extern int ata_qc_complete_multiple(struct ata_port *ap, u64 qc_active);
extern bool sata_lpm_ignore_phy_events(struct ata_link *link);
extern int sata_async_notification(struct ata_port *ap);

extern int ata_cable_40wire(struct ata_port *ap);
extern int ata_cable_80wire(struct ata_port *ap);
extern int ata_cable_sata(struct ata_port *ap);
extern int ata_cable_ignore(struct ata_port *ap);
extern int ata_cable_unknown(struct ata_port *ap);

/* Timing helpers */
extern unsigned int ata_pio_need_iordy(const struct ata_device *);
extern u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle);

/* PCI */
#ifdef CONFIG_PCI
struct pci_dev;

/*
 * Describes a PCI config space register, an access width, a mask and a
 * value.  pci_test_config_bits() takes a pointer to one of these.
 * NOTE(review): presumably the register is read at the given width,
 * ANDed with mask and compared with val - confirm in the implementation.
 */
struct pci_bits {
        unsigned int                reg;        /* PCI config register to read */
        unsigned int                width;        /* 1 (8 bit), 2 (16 bit), 4 (32 bit) */
        unsigned long                mask;
        unsigned long                val;
};

extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits);
extern void ata_pci_shutdown_one(struct pci_dev *pdev);
extern void ata_pci_remove_one(struct pci_dev *pdev);

#ifdef CONFIG_PM
extern void ata_pci_device_do_suspend(struct pci_dev *pdev, pm_message_t mesg);
extern int __must_check ata_pci_device_do_resume(struct pci_dev *pdev);
extern int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg);
extern int ata_pci_device_resume(struct pci_dev *pdev);
#endif /* CONFIG_PM */
#endif /* CONFIG_PCI */

struct platform_device;

extern int ata_platform_remove_one(struct platform_device *pdev);

/*
 * ACPI - drivers/ata/libata-acpi.c
 */
#ifdef CONFIG_ATA_ACPI
/*
 * Accessor for the port's __acpi_init_gtm member: returns its address
 * when ATA_PFLAG_INIT_GTM_VALID is set in @ap->pflags, NULL otherwise.
 */
static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap)
{
        return (ap->pflags & ATA_PFLAG_INIT_GTM_VALID) ?
                &ap->__acpi_init_gtm : NULL;
}
int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm);
int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm);
unsigned long ata_acpi_gtm_xfermask(struct ata_device *dev,
                                    const struct ata_acpi_gtm *gtm);
int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm);
#else
/* Stand-in used when CONFIG_ATA_ACPI is not set: there is never a GTM. */
static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap) { return NULL; }

/*
 * Stand-in used when CONFIG_ATA_ACPI is not set: always fails with
 * -ENOSYS and ignores its arguments.
 *
 * @stm is const-qualified like in the real prototype, so that callers
 * can pass the const pointer returned by ata_acpi_init_gtm() in both
 * configurations without discarding the qualifier.
 */
static inline int ata_acpi_stm(const struct ata_port *ap,
                               const struct ata_acpi_gtm *stm)
{
        return -ENOSYS;
}

/*
 * Stand-in used when CONFIG_ATA_ACPI is not set: always fails with
 * -ENOSYS and leaves *@stm untouched.
 */
static inline int ata_acpi_gtm(const struct ata_port *ap,
                               struct ata_acpi_gtm *stm) { return -ENOSYS; }

/*
 * Stand-in used when CONFIG_ATA_ACPI is not set: reports an empty
 * transfer mask.
 *
 * The return type is unsigned long, matching the real prototype, so the
 * function has the same type in both configurations.
 */
static inline unsigned long ata_acpi_gtm_xfermask(struct ata_device *dev,
                                        const struct ata_acpi_gtm *gtm)
{
        return 0;
}

/* Stand-in used when CONFIG_ATA_ACPI is not set: always returns 0. */
static inline int ata_acpi_cbl_80wire(struct ata_port *ap,
                                      const struct ata_acpi_gtm *gtm) { return 0; }
#endif

/*
 * EH - drivers/ata/libata-eh.c
 */
extern void ata_port_schedule_eh(struct ata_port *ap);
extern void ata_port_wait_eh(struct ata_port *ap);
extern int ata_link_abort(struct ata_link *link);
extern int ata_port_abort(struct ata_port *ap);
extern int ata_port_freeze(struct ata_port *ap);

extern void ata_eh_freeze_port(struct ata_port *ap);
extern void ata_eh_thaw_port(struct ata_port *ap);

extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);

extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
                      ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
                      ata_postreset_fn_t postreset);
extern void ata_std_error_handler(struct ata_port *ap);
extern void ata_std_sched_eh(struct ata_port *ap);
extern void ata_std_end_eh(struct ata_port *ap);
extern int ata_link_nr_enabled(struct ata_link *link);

/*
 * Base operations to inherit from and initializers for sht
 *
 * Operations
 *
 * base  : Common to all libata drivers.
 * sata  : SATA controllers w/ native interface.
 * pmp   : SATA controllers w/ PMP support.
 * sff   : SFF ATA controllers w/o BMDMA support.
 * bmdma : SFF ATA controllers w/ BMDMA support.
 *
 * sht initializers
 *
 * BASE  : Common to all libata drivers.  The user must set
 *           sg_tablesize and dma_boundary.
 * PIO   : SFF ATA controllers w/ only PIO support.
 * BMDMA : SFF ATA controllers w/ BMDMA support.  sg_tablesize and
 *           dma_boundary are set to BMDMA limits.
 * NCQ   : SATA controllers supporting NCQ.  The user must set
 *           sg_tablesize, dma_boundary and can_queue.
 */
extern const struct ata_port_operations ata_base_port_ops;
extern const struct ata_port_operations sata_port_ops;
extern struct device_attribute *ata_common_sdev_attrs[];

/*
 * All sht initializers (BASE, PIO, BMDMA, NCQ) must be instantiated
 * by the edge drivers, because the 'module' field of sht must be the
 * edge driver's module reference; otherwise the driver can be unloaded
 * even if the scsi_device is being accessed.
 *
 * __ATA_BASE_SHT expands to a list of designated initializers without a
 * trailing comma.  @drv_name is used for both .name and .proc_name.
 * ATA_SCSI_COMPAT_IOCTL supplies its own trailing comma (or nothing),
 * which is why no comma follows it below.
 */
#define __ATA_BASE_SHT(drv_name)                                \
        .module                        = THIS_MODULE,                        \
        .name                        = drv_name,                        \
        .ioctl                        = ata_scsi_ioctl,                \
        ATA_SCSI_COMPAT_IOCTL                                        \
        .queuecommand                = ata_scsi_queuecmd,                \
        .dma_need_drain                = ata_scsi_dma_need_drain,        \
        .can_queue                = ATA_DEF_QUEUE,                \
        .tag_alloc_policy        = BLK_TAG_ALLOC_RR,                \
        .this_id                = ATA_SHT_THIS_ID,                \
        .emulated                = ATA_SHT_EMULATED,                \
        .proc_name                = drv_name,                        \
        .slave_configure        = ata_scsi_slave_config,        \
        .slave_destroy                = ata_scsi_slave_destroy,        \
        .bios_param                = ata_std_bios_param,                \
        .unlock_native_capacity        = ata_scsi_unlock_native_capacity

/*
 * BASE sht initializer: __ATA_BASE_SHT(drv_name) plus the common sdev
 * attribute list.  Expands without a trailing comma.
 */
#define ATA_BASE_SHT(drv_name)                                        \
        __ATA_BASE_SHT(drv_name),                                \
        .sdev_attrs                = ata_common_sdev_attrs

#ifdef CONFIG_SATA_HOST
extern struct device_attribute *ata_ncq_sdev_attrs[];

/*
 * NCQ sht initializer: __ATA_BASE_SHT(drv_name) plus the NCQ sdev
 * attribute list and ata_scsi_change_queue_depth as the
 * .change_queue_depth hook.  Expands without a trailing comma.
 */
#define ATA_NCQ_SHT(drv_name)                                        \
        __ATA_BASE_SHT(drv_name),                                \
        .sdev_attrs                = ata_ncq_sdev_attrs,                \
        .change_queue_depth        = ata_scsi_change_queue_depth
#endif

/*
 * PMP helpers
 */
#ifdef CONFIG_SATA_PMP
/* True when ATA_FLAG_PMP is set in @ap->flags. */
static inline bool sata_pmp_supported(struct ata_port *ap)
{
        return (ap->flags & ATA_FLAG_PMP) != 0;
}

/* True when @ap->nr_pmp_links is non-zero. */
static inline bool sata_pmp_attached(struct ata_port *ap)
{
        if (ap->nr_pmp_links)
                return true;

        return false;
}

static inline bool ata_is_host_link(const struct ata_link *link)
{
        return link == &link->ap->link || link == link->ap->slave_link;
}
#else /* CONFIG_SATA_PMP */
/* Stand-in used when CONFIG_SATA_PMP is not set: PMP is never supported. */
static inline bool sata_pmp_supported(struct ata_port *ap) { return false; }

/* Stand-in used when CONFIG_SATA_PMP is not set: no PMP is ever attached. */
static inline bool sata_pmp_attached(struct ata_port *ap) { return false; }

/*
 * Stand-in used when CONFIG_SATA_PMP is not set: every link is reported
 * as a host link.
 */
static inline bool ata_is_host_link(const struct ata_link *link) { return true; }
#endif /* CONFIG_SATA_PMP */

/*
 * Return SATA_PMP_CTRL_PORT when the port supports PMP and @link is a
 * host link; otherwise return the link's own pmp number.
 */
static inline int sata_srst_pmp(struct ata_link *link)
{
        if (!sata_pmp_supported(link->ap) || !ata_is_host_link(link))
                return link->pmp;

        return SATA_PMP_CTRL_PORT;
}

/*
 * printk helpers
 */
__printf(3, 4)
void ata_port_printk(const struct ata_port *ap, const char *level,
                     const char *fmt, ...);
__printf(3, 4)
void ata_link_printk(const struct ata_link *link, const char *level,
                     const char *fmt, ...);
__printf(3, 4)
void ata_dev_printk(const struct ata_device *dev, const char *level,
                    const char *fmt, ...);

/*
 * Per-level wrappers around ata_port_printk(): each one fixes the log
 * level (KERN_ERR ... KERN_DEBUG) and passes @ap, @fmt and the variadic
 * arguments through unchanged.
 */
#define ata_port_err(ap, fmt, ...)                                \
        ata_port_printk(ap, KERN_ERR, fmt, ##__VA_ARGS__)
#define ata_port_warn(ap, fmt, ...)                                \
        ata_port_printk(ap, KERN_WARNING, fmt, ##__VA_ARGS__)
#define ata_port_notice(ap, fmt, ...)                                \
        ata_port_printk(ap, KERN_NOTICE, fmt, ##__VA_ARGS__)
#define ata_port_info(ap, fmt, ...)                                \
        ata_port_printk(ap, KERN_INFO, fmt, ##__VA_ARGS__)
#define ata_port_dbg(ap, fmt, ...)                                \
        ata_port_printk(ap, KERN_DEBUG, fmt, ##__VA_ARGS__)

/*
 * Per-level wrappers around ata_link_printk(): each one fixes the log
 * level (KERN_ERR ... KERN_DEBUG) and passes @link, @fmt and the
 * variadic arguments through unchanged.
 */
#define ata_link_err(link, fmt, ...)                                \
        ata_link_printk(link, KERN_ERR, fmt, ##__VA_ARGS__)
#define ata_link_warn(link, fmt, ...)                                \
        ata_link_printk(link, KERN_WARNING, fmt, ##__VA_ARGS__)
#define ata_link_notice(link, fmt, ...)                                \
        ata_link_printk(link, KERN_NOTICE, fmt, ##__VA_ARGS__)
#define ata_link_info(link, fmt, ...)                                \
        ata_link_printk(link, KERN_INFO, fmt, ##__VA_ARGS__)
#define ata_link_dbg(link, fmt, ...)                                \
        ata_link_printk(link, KERN_DEBUG, fmt, ##__VA_ARGS__)

/*
 * Per-level wrappers around ata_dev_printk(): each one fixes the log
 * level (KERN_ERR ... KERN_DEBUG) and passes @dev, @fmt and the variadic
 * arguments through unchanged.
 */
#define ata_dev_err(dev, fmt, ...)                                \
        ata_dev_printk(dev, KERN_ERR, fmt, ##__VA_ARGS__)
#define ata_dev_warn(dev, fmt, ...)                                \
        ata_dev_printk(dev, KERN_WARNING, fmt, ##__VA_ARGS__)
#define ata_dev_notice(dev, fmt, ...)                                \
        ata_dev_printk(dev, KERN_NOTICE, fmt, ##__VA_ARGS__)
#define ata_dev_info(dev, fmt, ...)                                \
        ata_dev_printk(dev, KERN_INFO, fmt, ##__VA_ARGS__)
#define ata_dev_dbg(dev, fmt, ...)                                \
        ata_dev_printk(dev, KERN_DEBUG, fmt, ##__VA_ARGS__)

void ata_print_version(const struct device *dev, const char *version);

/*
 * ata_eh_info helpers
 */
extern __printf(2, 3)
void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
extern __printf(2, 3)
void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
extern void ata_ehi_clear_desc(struct ata_eh_info *ehi);

/*
 * Record a hotplug event in @ehi: set the HOTPLUGGED flag, note an ATA bus
 * error, request a reset plus link enable, and mark all ATA_MAX_DEVICES
 * device slots for probing.
 */
static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi)
{
        ehi->flags |= ATA_EHI_HOTPLUGGED;
        ehi->err_mask |= AC_ERR_ATA_BUS;
        ehi->action |= ATA_EH_ENABLE_LINK | ATA_EH_RESET;
        /* one probe bit per possible device on the link */
        ehi->probe_mask |= (1 << ATA_MAX_DEVICES) - 1;
}

/*
 * port description helpers
 */
extern __printf(2, 3)
void ata_port_desc(struct ata_port *ap, const char *fmt, ...);
#ifdef CONFIG_PCI
extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
                               const char *name);
#endif

/* Return true when @tag is the tag reserved for internal commands. */
static inline bool ata_tag_internal(unsigned int tag)
{
        if (tag != ATA_TAG_INTERNAL)
                return false;
        return true;
}

/*
 * Return true when @tag is usable: either the internal tag or a regular
 * queue tag below ATA_MAX_QUEUE.
 */
static inline bool ata_tag_valid(unsigned int tag)
{
        if (ata_tag_internal(tag))
                return true;
        return tag < ATA_MAX_QUEUE;
}

/*
 * Core tag iterator.  For every @tag in [0, @max_tag), look the command up
 * with @fn(@ap, @tag) and store the result in @qc before the loop body runs.
 * The statement expression always evaluates to 1, so a NULL lookup does not
 * end the iteration; the loop body must cope with @qc being NULL.
 *
 * All arguments are parenthesized and the definition ends on its last line
 * (no trailing continuation), so the line after the macro is never spliced
 * into it.
 */
#define __ata_qc_for_each(ap, qc, tag, max_tag, fn)                \
        for ((tag) = 0; (tag) < (max_tag) &&                        \
             ({ (qc) = fn((ap), (tag)); 1; }); (tag)++)

/*
 * Internal use only: iterate over tags 0..ATA_MAX_QUEUE-1, fetching each
 * command with __ata_qc_from_tag(), i.e. ignoring error handling and the
 * status of 'qc'.
 */
#define ata_qc_for_each_raw(ap, qc, tag)                                        \
        __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, __ata_qc_from_tag)

/*
 * Iterate over all potential commands that can be queued (tags
 * 0..ATA_MAX_QUEUE-1), fetching each one with ata_qc_from_tag().  'qc' may
 * be NULL for a given tag.
 */
#define ata_qc_for_each(ap, qc, tag)                                        \
        __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, ata_qc_from_tag)

/*
 * Like ata_qc_for_each, but with the internal tag included: the upper bound
 * is ATA_MAX_QUEUE + 1, so one more tag is visited.
 */
#define ata_qc_for_each_with_internal(ap, qc, tag)                        \
        __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE + 1, ata_qc_from_tag)

/*
 * device helpers
 */
/*
 * Return 1 when @class is one of the usable device classes (ATA, ATAPI,
 * PMP, SEMB or ZAC), 0 otherwise.
 */
static inline unsigned int ata_class_enabled(unsigned int class)
{
        if (class == ATA_DEV_ATA || class == ATA_DEV_ATAPI)
                return 1;
        if (class == ATA_DEV_PMP || class == ATA_DEV_SEMB)
                return 1;
        return class == ATA_DEV_ZAC;
}

/*
 * Return 1 when @class is one of the *_UNSUP device classes (ATA, ATAPI,
 * PMP, SEMB or ZAC variant), 0 otherwise.
 */
static inline unsigned int ata_class_disabled(unsigned int class)
{
        if (class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP)
                return 1;
        if (class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP)
                return 1;
        return class == ATA_DEV_ZAC_UNSUP;
}

/* Return 1 when @class is neither an enabled nor a disabled class. */
static inline unsigned int ata_class_absent(unsigned int class)
{
        return !(ata_class_enabled(class) || ata_class_disabled(class));
}

/* Return non-zero when @dev's class is an enabled class. */
static inline unsigned int ata_dev_enabled(const struct ata_device *dev)
{
        unsigned int class = dev->class;

        return ata_class_enabled(class);
}

/* Return non-zero when @dev's class is a disabled (*_UNSUP) class. */
static inline unsigned int ata_dev_disabled(const struct ata_device *dev)
{
        unsigned int class = dev->class;

        return ata_class_disabled(class);
}

/* Return non-zero when @dev's class is neither enabled nor disabled. */
static inline unsigned int ata_dev_absent(const struct ata_device *dev)
{
        unsigned int class = dev->class;

        return ata_class_absent(class);
}

/*
 * link helpers
 */
/*
 * Return the number of device slots on @link: 2 for a host link whose port
 * has ATA_FLAG_SLAVE_POSS set, 1 in every other case.
 */
static inline int ata_link_max_devices(const struct ata_link *link)
{
        if (!ata_is_host_link(link))
                return 1;

        return (link->ap->flags & ATA_FLAG_SLAVE_POSS) ? 2 : 1;
}

/*
 * Return 1 when @link has a valid active tag or a non-zero sactive mask,
 * 0 otherwise.
 */
static inline int ata_link_active(struct ata_link *link)
{
        if (ata_tag_valid(link->active_tag))
                return 1;

        return link->sactive != 0;
}

/*
 * Iterators
 *
 * ATA_LITER_* constants are used to select link iteration mode and
 * ATA_DITER_* device iteration mode.
 *
 * For a custom iteration directly using ata_{link|dev}_next(), if
 * @link or @dev, respectively, is NULL, the first element is
 * returned.  @dev and @link can be any valid device or link and the
 * next element according to the iteration mode will be returned.
 * After the last element, NULL is returned.
 */
/* Link iteration mode, passed to ata_link_next(). */
enum ata_link_iter_mode {
        ATA_LITER_EDGE,                /* if present, PMP links only; otherwise,
                                 * the host link.  The slave link is not
                                 * included. */
        ATA_LITER_HOST_FIRST,        /* host link followed by PMP or slave links */
        ATA_LITER_PMP_FIRST,        /* PMP links followed by host link,
                                 * slave link still comes after host link */
};

/*
 * Device iteration mode, passed to ata_dev_next().
 * NOTE(review): the per-value meanings below are inferred from the names;
 * confirm against ata_dev_next().
 */
enum ata_dev_iter_mode {
        ATA_DITER_ENABLED,                /* presumably enabled devices only */
        ATA_DITER_ENABLED_REVERSE,        /* presumably same, reverse order */
        ATA_DITER_ALL,                        /* presumably every device */
        ATA_DITER_ALL_REVERSE,                /* presumably same, reverse order */
};

extern struct ata_link *ata_link_next(struct ata_link *link,
                                      struct ata_port *ap,
                                      enum ata_link_iter_mode mode);

extern struct ata_device *ata_dev_next(struct ata_device *dev,
                                       struct ata_link *link,
                                       enum ata_dev_iter_mode mode);

/*
 * Shortcut notation for iterations
 *
 * ata_for_each_link() iterates over each link of @ap according to
 * @mode.  @link points to the current link in the loop.  @link is
 * NULL after loop termination.  ata_for_each_dev() works the same way
 * except that it iterates over each device of @link.
 *
 * Note that the mode prefixes ATA_{L|D}ITER_ shouldn't need to be
 * specified when using the following shorthand notations.  Only the
 * mode itself (EDGE, HOST_FIRST, ENABLED, etc...) should be
 * specified.  This not only increases brevity but also makes it
 * impossible to use ATA_LITER_* for device iteration or vice-versa.
 */
/*
 * Loop over the links of @ap: start from ata_link_next(NULL, ...) and stop
 * when it returns NULL.  @mode is the suffix after ATA_LITER_.
 */
#define ata_for_each_link(link, ap, mode) \
        for ((link) = ata_link_next(NULL, (ap), ATA_LITER_##mode); (link); \
             (link) = ata_link_next((link), (ap), ATA_LITER_##mode))

/*
 * Loop over the devices of @link: start from ata_dev_next(NULL, ...) and
 * stop when it returns NULL.  @mode is the suffix after ATA_DITER_.
 */
#define ata_for_each_dev(dev, link, mode) \
        for ((dev) = ata_dev_next(NULL, (link), ATA_DITER_##mode); (dev); \
             (dev) = ata_dev_next((dev), (link), ATA_DITER_##mode))

/**
 *        ata_ncq_enabled - Test whether NCQ is enabled
 *        @dev: ATA device to test for
 *
 *        NCQ counts as enabled only when CONFIG_SATA_HOST is built in and,
 *        of the PIO, NCQ_OFF and NCQ device flags, exactly NCQ is set.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        1 if NCQ is enabled for @dev, 0 otherwise.
 */
static inline int ata_ncq_enabled(struct ata_device *dev)
{
        if (IS_ENABLED(CONFIG_SATA_HOST))
                return (dev->flags & (ATA_DFLAG_NCQ | ATA_DFLAG_NCQ_OFF |
                                      ATA_DFLAG_PIO)) == ATA_DFLAG_NCQ;

        return 0;
}

/*
 * Return true when @dev has ATA_DFLAG_NCQ_SEND_RECV set and the DSM TRIM
 * bit is present in its cached NCQ send/recv command data.
 */
static inline bool ata_fpdma_dsm_supported(struct ata_device *dev)
{
        if (!(dev->flags & ATA_DFLAG_NCQ_SEND_RECV))
                return false;

        return (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] &
                ATA_LOG_NCQ_SEND_RECV_DSM_TRIM) != 0;
}

/*
 * Return true when @dev has ATA_DFLAG_NCQ_SEND_RECV set and the read-log
 * supported bit is present in its cached NCQ send/recv command data.
 */
static inline bool ata_fpdma_read_log_supported(struct ata_device *dev)
{
        if (!(dev->flags & ATA_DFLAG_NCQ_SEND_RECV))
                return false;

        return (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET] &
                ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED) != 0;
}

/*
 * Return true when @dev has ATA_DFLAG_NCQ_SEND_RECV set and the ZAC
 * management-in supported bit is present in its cached NCQ send/recv
 * command data.
 */
static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev)
{
        if (!(dev->flags & ATA_DFLAG_NCQ_SEND_RECV))
                return false;

        return (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET] &
                ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED) != 0;
}

/*
 * Return true when the ZAC management-out bit is present in @dev's cached
 * NCQ non-data command data.  Unlike the send/recv helpers, no device flag
 * is consulted here.
 */
static inline bool ata_fpdma_zac_mgmt_out_supported(struct ata_device *dev)
{
        if (dev->ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET] &
            ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT)
                return true;

        return false;
}

/* Set the ATA_NIEN bit in the control field of @qc's taskfile. */
static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
{
        qc->tf.ctl = qc->tf.ctl | ATA_NIEN;
}

/*
 * Map @tag to its slot in @ap's command array.  Returns NULL when @tag is
 * not valid; no check is made on the state of the command itself.
 */
static inline struct ata_queued_cmd *__ata_qc_from_tag(struct ata_port *ap,
                                                       unsigned int tag)
{
        if (!ata_tag_valid(tag))
                return NULL;

        return &ap->qcmd[tag];
}

/*
 * Look up the command for @tag on @ap.
 *
 * Returns NULL for an invalid tag.  When the port has no error_handler
 * operation, the slot is returned unconditionally.  Otherwise the command
 * is returned only if it is ACTIVE and not FAILED; any other state yields
 * NULL.
 */
static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap,
                                                     unsigned int tag)
{
        struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);

        if (unlikely(!qc))
                return qc;
        if (!ap->ops->error_handler)
                return qc;

        if ((qc->flags & (ATA_QCFLAG_ACTIVE |
                          ATA_QCFLAG_FAILED)) != ATA_QCFLAG_ACTIVE)
                return NULL;

        return qc;
}

/*
 * Return @qc's byte count with the extra bytes removed, clamped so the
 * result never goes below zero.
 */
static inline unsigned int ata_qc_raw_nbytes(struct ata_queued_cmd *qc)
{
        if (qc->extrabytes >= qc->nbytes)
                return 0;

        return qc->nbytes - qc->extrabytes;
}

/*
 * Reset @tf to a zeroed taskfile for @dev, then fill in the control and
 * device fields: ctl comes from the port when CONFIG_ATA_SFF is set (else
 * ATA_DEVCTL_OBS), and the device field selects device 1 via ATA_DEV1 when
 * @dev is not device 0.
 */
static inline void ata_tf_init(struct ata_device *dev, struct ata_taskfile *tf)
{
        memset(tf, 0, sizeof(*tf));

#ifdef CONFIG_ATA_SFF
        tf->ctl = dev->link->ap->ctl;
#else
        tf->ctl = ATA_DEVCTL_OBS;
#endif
        tf->device = (dev->devno == 0) ? ATA_DEVICE_OBS
                                       : (ATA_DEVICE_OBS | ATA_DEV1);
}

/*
 * Return @qc to a pristine state: clear flags, error mask, scatter/gather
 * cursor and byte counters, reset the sector size to ATA_SECT_SIZE,
 * re-initialise the taskfile for the command's device and preset the
 * result taskfile.
 */
static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
{
        qc->flags = 0;
        qc->err_mask = 0;
        qc->dma_dir = DMA_NONE;

        /* scatter/gather state */
        qc->sg = NULL;
        qc->cursg = NULL;
        qc->cursg_ofs = 0;
        qc->n_elem = 0;

        /* transfer accounting */
        qc->curbytes = 0;
        qc->extrabytes = 0;
        qc->nbytes = 0;
        qc->sect_size = ATA_SECT_SIZE;

        ata_tf_init(qc->dev, &qc->tf);

        /* init result_tf such that it indicates normal completion */
        qc->result_tf.command = ATA_DRDY;
        qc->result_tf.feature = 0;
}

/*
 * Return 1 when @dev's identify data reports an enabled write cache, FLUSH
 * support or FLUSH EXT support; 0 when it reports none of them.
 */
static inline int ata_try_flush_cache(const struct ata_device *dev)
{
        if (ata_id_wcache_enabled(dev->id))
                return 1;
        if (ata_id_has_flush(dev->id))
                return 1;
        if (ata_id_has_flush_ext(dev->id))
                return 1;

        return 0;
}

/*
 * Translate a status register value into an AC_ERR_* mask: BUSY or DRQ
 * yields AC_ERR_HSM, otherwise ERR or DF yields AC_ERR_DEV, otherwise 0.
 */
static inline unsigned int ac_err_mask(u8 status)
{
        unsigned int mask = 0;

        if (status & (ATA_BUSY | ATA_DRQ))
                mask = AC_ERR_HSM;
        else if (status & (ATA_ERR | ATA_DF))
                mask = AC_ERR_DEV;

        return mask;
}

/*
 * Like ac_err_mask(), but never returns 0: a status that maps to no
 * specific error is reported as AC_ERR_OTHER.
 */
static inline unsigned int __ac_err_mask(u8 status)
{
        unsigned int mask = ac_err_mask(status);

        return mask ? mask : AC_ERR_OTHER;
}

/*
 * Return the ata_port pointer stored at the start of @host's hostdata
 * area.
 */
static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
{
        struct ata_port **slot = (struct ata_port **)&host->hostdata[0];

        return *slot;
}

/*
 * Classify a status register value: 1 when BUSY is clear, -ENODEV when the
 * value is 0xff, and 0 when BUSY is set with any other value.
 */
static inline int ata_check_ready(u8 status)
{
        if (status & ATA_BUSY) {
                /* 0xff indicates either no device or device not ready */
                return (status == 0xff) ? -ENODEV : 0;
        }

        return 1;
}

/*
 * Return the jiffies value that lies @timeout_msecs milliseconds after
 * @from_jiffies.
 */
static inline unsigned long ata_deadline(unsigned long from_jiffies,
                                         unsigned long timeout_msecs)
{
        unsigned long delta = msecs_to_jiffies(timeout_msecs);

        return from_jiffies + delta;
}

/* Don't open code these in drivers as there are traps. Firstly the range may
   change in future hardware and specs, secondly 0xFF means 'no DMA' but is
   > UDMA_0. Here be dragons. */

/*
 * Return 1 when @adev's DMA mode lies in the multiword DMA range
 * (XFER_MW_DMA_0..XFER_MW_DMA_4), 0 otherwise.
 */
static inline int ata_using_mwdma(struct ata_device *adev)
{
        return adev->dma_mode >= XFER_MW_DMA_0 &&
               adev->dma_mode <= XFER_MW_DMA_4;
}

/*
 * Return 1 when @adev's DMA mode lies in the Ultra DMA range
 * (XFER_UDMA_0..XFER_UDMA_7), 0 otherwise.
 */
static inline int ata_using_udma(struct ata_device *adev)
{
        return adev->dma_mode >= XFER_UDMA_0 &&
               adev->dma_mode <= XFER_UDMA_7;
}

/* Return 1 unless @adev's DMA mode is 0xFF, the 'no DMA' marker. */
static inline int ata_dma_enabled(struct ata_device *adev)
{
        return adev->dma_mode != 0xFF;
}

/**************************************************************************
 * PATA timings - drivers/ata/libata-pata-timings.c
 */
extern const struct ata_timing *ata_timing_find_mode(u8 xfer_mode);
extern int ata_timing_compute(struct ata_device *, unsigned short,
                              struct ata_timing *, int, int);
extern void ata_timing_merge(const struct ata_timing *,
                             const struct ata_timing *, struct ata_timing *,
                             unsigned int);

/**************************************************************************
 * PMP - drivers/ata/libata-pmp.c
 */
#ifdef CONFIG_SATA_PMP

extern const struct ata_port_operations sata_pmp_port_ops;

extern int sata_pmp_qc_defer_cmd_switch(struct ata_queued_cmd *qc);
extern void sata_pmp_error_handler(struct ata_port *ap);

#else /* CONFIG_SATA_PMP */

/*
 * Without CONFIG_SATA_PMP the PMP-specific names are aliased to their
 * non-PMP counterparts so callers need no conditional code.
 */
#define sata_pmp_port_ops                sata_port_ops
#define sata_pmp_qc_defer_cmd_switch        ata_std_qc_defer
#define sata_pmp_error_handler                ata_std_error_handler

#endif /* CONFIG_SATA_PMP */


/**************************************************************************
 * SFF - drivers/ata/libata-sff.c
 */
#ifdef CONFIG_ATA_SFF

extern const struct ata_port_operations ata_sff_port_ops;
extern const struct ata_port_operations ata_bmdma32_port_ops;

/*
 * Initializer fragment for a PIO host template: the ATA_BASE_SHT() fields
 * plus scatter/gather table size and DMA boundary limits.
 * PIO only, so the sg_tablesize and dma_boundary limits can be removed.
 */
#define ATA_PIO_SHT(drv_name)                                        \
        ATA_BASE_SHT(drv_name),                                        \
        .sg_tablesize                = LIBATA_MAX_PRD,                \
        .dma_boundary                = ATA_DMA_BOUNDARY

extern void ata_sff_dev_select(struct ata_port *ap, unsigned int device);
extern u8 ata_sff_check_status(struct ata_port *ap);
extern void ata_sff_pause(struct ata_port *ap);
extern void ata_sff_dma_pause(struct ata_port *ap);
extern int ata_sff_busy_sleep(struct ata_port *ap,
                              unsigned long timeout_pat, unsigned long timeout);
extern int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline);
extern void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf);
extern void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
extern void ata_sff_exec_command(struct ata_port *ap,
                                 const struct ata_taskfile *tf);
extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc,
                        unsigned char *buf, unsigned int buflen, int rw);
extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc,
                        unsigned char *buf, unsigned int buflen, int rw);
extern void ata_sff_irq_on(struct ata_port *ap);
extern void ata_sff_irq_clear(struct ata_port *ap);
extern int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
                            u8 status, int in_wq);
extern void ata_sff_queue_work(struct work_struct *work);
extern void ata_sff_queue_delayed_work(struct delayed_work *dwork,
                unsigned long delay);
extern void ata_sff_queue_pio_task(struct ata_link *link, unsigned long delay);
extern unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc);
extern bool ata_sff_qc_fill_rtf(struct ata_queued_cmd *qc);
extern unsigned int ata_sff_port_intr(struct ata_port *ap,
                                      struct ata_queued_cmd *qc);
extern irqreturn_t ata_sff_interrupt(int irq, void *dev_instance);
extern void ata_sff_lost_interrupt(struct ata_port *ap);
extern void ata_sff_freeze(struct ata_port *ap);
extern void ata_sff_thaw(struct ata_port *ap);
extern int ata_sff_prereset(struct ata_link *link, unsigned long deadline);
extern unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
                                          u8 *r_err);
extern int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask,
                                    unsigned long deadline);
extern int ata_sff_softreset(struct ata_link *link, unsigned int *classes,
                             unsigned long deadline);
extern int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
                               unsigned long deadline);
extern void ata_sff_postreset(struct ata_link *link, unsigned int *classes);
extern void ata_sff_drain_fifo(struct ata_queued_cmd *qc);
extern void ata_sff_error_handler(struct ata_port *ap);
extern void ata_sff_std_ports(struct ata_ioports *ioaddr);
#ifdef CONFIG_PCI
extern int ata_pci_sff_init_host(struct ata_host *host);
extern int ata_pci_sff_prepare_host(struct pci_dev *pdev,
                                    const struct ata_port_info * const * ppi,
                                    struct ata_host **r_host);
extern int ata_pci_sff_activate_host(struct ata_host *host,
                                     irq_handler_t irq_handler,
                                     struct scsi_host_template *sht);
extern int ata_pci_sff_init_one(struct pci_dev *pdev,
                const struct ata_port_info * const * ppi,
                struct scsi_host_template *sht, void *host_priv, int hflags);
#endif /* CONFIG_PCI */

#ifdef CONFIG_ATA_BMDMA

extern const struct ata_port_operations ata_bmdma_port_ops;

/*
 * Initializer fragment for a BMDMA host template: the ATA_BASE_SHT() fields
 * plus a scatter/gather table size of LIBATA_MAX_PRD and a DMA boundary of
 * ATA_DMA_BOUNDARY.
 */
#define ATA_BMDMA_SHT(drv_name)                                        \
        ATA_BASE_SHT(drv_name),                                        \
        .sg_tablesize                = LIBATA_MAX_PRD,                \
        .dma_boundary                = ATA_DMA_BOUNDARY

extern enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc);
extern unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc);
extern enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc);
extern unsigned int ata_bmdma_port_intr(struct ata_port *ap,
                                      struct ata_queued_cmd *qc);
extern irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance);
extern void ata_bmdma_error_handler(struct ata_port *ap);
extern void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc);
extern void ata_bmdma_irq_clear(struct ata_port *ap);
extern void ata_bmdma_setup(struct ata_queued_cmd *qc);
extern void ata_bmdma_start(struct ata_queued_cmd *qc);
extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
extern u8 ata_bmdma_status(struct ata_port *ap);
extern int ata_bmdma_port_start(struct ata_port *ap);
extern int ata_bmdma_port_start32(struct ata_port *ap);

#ifdef CONFIG_PCI
extern int ata_pci_bmdma_clear_simplex(struct pci_dev *pdev);
extern void ata_pci_bmdma_init(struct ata_host *host);
extern int ata_pci_bmdma_prepare_host(struct pci_dev *pdev,
                                      const struct ata_port_info * const * ppi,
                                      struct ata_host **r_host);
extern int ata_pci_bmdma_init_one(struct pci_dev *pdev,
                                  const struct ata_port_info * const * ppi,
                                  struct scsi_host_template *sht,
                                  void *host_priv, int hflags);
#endif /* CONFIG_PCI */
#endif /* CONFIG_ATA_BMDMA */

/**
 *        ata_sff_busy_wait - Wait for a port status register
 *        @ap: Port to wait for.
 *        @bits: bits that must be clear
 *        @max: maximum number of 10uS waits to perform
 *
 *        Delays 10 microseconds, reads the port's status register through
 *        the sff_check_status operation, and repeats until the selected
 *        bits are clear, the status reads 0xff, or @max reads have been
 *        made.  The register is always read at least once, so a @max of 0
 *        behaves like a @max of 1.
 *        Returns final value of status register.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static inline u8 ata_sff_busy_wait(struct ata_port *ap, unsigned int bits,
                                   unsigned int max)
{
        u8 status;

        do {
                udelay(10);
                status = ap->ops->sff_check_status(ap);
                /*
                 * Test @max before decrementing it: decrementing an
                 * unsigned 0 first would wrap around to UINT_MAX and keep
                 * polling for billions of iterations.
                 */
        } while (status != 0xff && (status & bits) && max && --max);

        return status;
}

/**
 *        ata_wait_idle - Wait for a port to be idle.
 *        @ap: Port to wait for.
 *
 *        Polls via ata_sff_busy_wait() with a limit of 1000 waits (10ms)
 *        for the port's BUSY and DRQ bits to clear.  With ATA_DEBUG
 *        defined, a debug message is logged if they are still set.
 *        Returns final value of status register.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static inline u8 ata_wait_idle(struct ata_port *ap)
{
        const unsigned int busy_bits = ATA_BUSY | ATA_DRQ;
        u8 status;

        status = ata_sff_busy_wait(ap, busy_bits, 1000);

#ifdef ATA_DEBUG
        if (status != 0xff && (status & busy_bits))
                ata_port_printk(ap, KERN_DEBUG, "abnormal Status 0x%X\n",
                                status);
#endif

        return status;
}
#endif /* CONFIG_ATA_SFF */

#endif /* __LINUX_LIBATA_H__ */











































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_LWTUNNEL_H
#define __NET_LWTUNNEL_H 1

#include <linux/lwtunnel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/route.h>

/* LWTUNNEL_HASH_SIZE is 2^LWTUNNEL_HASH_BITS, i.e. 128. */
#define LWTUNNEL_HASH_BITS   7
#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)

/* lw tunnel state flags */
#define LWTUNNEL_STATE_OUTPUT_REDIRECT        BIT(0)
#define LWTUNNEL_STATE_INPUT_REDIRECT        BIT(1)
#define LWTUNNEL_STATE_XMIT_REDIRECT        BIT(2)

/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return
 * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety.
 */
enum {
        LWTUNNEL_XMIT_DONE,
        LWTUNNEL_XMIT_CONTINUE = 0x100,
};


/*
 * Per-route lightweight tunnel state.  Reference counted through @refcnt
 * (see lwtstate_get()/lwtstate_put()); the encap-specific payload lives in
 * the trailing flexible array @data.
 */
struct lwtunnel_state {
        __u16                type;                /* presumably the encap type id - confirm */
        __u16                flags;                /* LWTUNNEL_STATE_*_REDIRECT bits */
        __u16                headroom;        /* reported by lwtunnel_headroom() */
        atomic_t        refcnt;
        /* original dst handlers, saved by lwtunnel_set_redirect() */
        int                (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int                (*orig_input)(struct sk_buff *);
        struct                rcu_head rcu;
        __u8            data[];                /* encap-specific data */
};

/*
 * Callback table supplied by an encapsulation type; passed to
 * lwtunnel_encap_add_ops() / lwtunnel_encap_del_ops().
 * NOTE(review): the role of each callback is inferred from its name and
 * signature; confirm against the lwtunnel core.
 */
struct lwtunnel_encap_ops {
        /* build a state object in *@ts from the netlink attribute @encap */
        int (*build_state)(struct net *net, struct nlattr *encap,
                           unsigned int family, const void *cfg,
                           struct lwtunnel_state **ts,
                           struct netlink_ext_ack *extack);
        void (*destroy_state)(struct lwtunnel_state *lws);
        /* packet path hooks */
        int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int (*input)(struct sk_buff *skb);
        /* netlink dump helpers */
        int (*fill_encap)(struct sk_buff *skb,
                          struct lwtunnel_state *lwtstate);
        int (*get_encap_size)(struct lwtunnel_state *lwtstate);
        int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
        int (*xmit)(struct sk_buff *skb);

        struct module *owner;
};

#ifdef CONFIG_LWTUNNEL
void lwtstate_free(struct lwtunnel_state *lws);

/*
 * Take a reference on @lws and return it.  A NULL @lws is passed through
 * unchanged.
 */
static inline struct lwtunnel_state *
lwtstate_get(struct lwtunnel_state *lws)
{
        if (!lws)
                return lws;

        atomic_inc(&lws->refcnt);
        return lws;
}

/*
 * Drop a reference on @lws and free it through lwtstate_free() when the
 * count reaches zero.  A NULL @lws is ignored.
 */
static inline void lwtstate_put(struct lwtunnel_state *lws)
{
        if (lws && atomic_dec_and_test(&lws->refcnt))
                lwtstate_free(lws);
}

/* True when @lwtstate is non-NULL and has the OUTPUT_REDIRECT flag set. */
static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate &&
               (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT);
}

/* True when @lwtstate is non-NULL and has the INPUT_REDIRECT flag set. */
static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate &&
               (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT);
}

/* True when @lwtstate is non-NULL and has the XMIT_REDIRECT flag set. */
static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate &&
               (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT);
}

/*
 * Return the headroom recorded in @lwtstate when the state redirects xmit
 * or output and that headroom is smaller than @mtu; otherwise return 0.
 */
static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
                                             unsigned int mtu)
{
        if (!lwtunnel_xmit_redirect(lwtstate) &&
            !lwtunnel_output_redirect(lwtstate))
                return 0;

        return (lwtstate->headroom < mtu) ? lwtstate->headroom : 0;
}

int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
                           unsigned int num);
int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
                           unsigned int num);
int lwtunnel_valid_encap_type(u16 encap_type,
                              struct netlink_ext_ack *extack);
int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
                                   struct netlink_ext_ack *extack);
int lwtunnel_build_state(struct net *net, u16 encap_type,
                         struct nlattr *encap,
                         unsigned int family, const void *cfg,
                         struct lwtunnel_state **lws,
                         struct netlink_ext_ack *extack);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
                        int encap_attr, int encap_type_attr);
int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int lwtunnel_input(struct sk_buff *skb);
int lwtunnel_xmit(struct sk_buff *skb);
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
                          bool ingress);

static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
        if (lwtunnel_output_redirect(dst->lwtstate)) {
                dst->lwtstate->orig_output = dst->output;
                dst->output = lwtunnel_output;
        }
        if (lwtunnel_input_redirect(dst->lwtstate)) {
                dst->lwtstate->orig_input = dst->input;
                dst->input = lwtunnel_input;
        }
}
#else

/*
 * Stubs used when CONFIG_LWTUNNEL is disabled.  State helpers are no-ops,
 * the redirect predicates are always false, and the operations either
 * report -EOPNOTSUPP or a neutral value (0 / NULL) as noted below.
 */

/* No-op: there is no tunnel state to free. */
static inline void lwtstate_free(struct lwtunnel_state *lws)
{
}

/* Returns @lws unchanged; no reference is taken. */
static inline struct lwtunnel_state *
lwtstate_get(struct lwtunnel_state *lws)
{
        return lws;
}

/* No-op: no reference to drop. */
static inline void lwtstate_put(struct lwtunnel_state *lws)
{
}

/* Redirect predicates: always false. */
static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

/* No-op: @dst's handlers are left untouched. */
static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
}

/* No extra headroom is ever needed. */
static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
                                             unsigned int mtu)
{
        return 0;
}

/* Registering encap ops is not supported. */
static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
                                         unsigned int num)
{
        return -EOPNOTSUPP;

}

/* Unregistering encap ops is not supported. */
static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
                                         unsigned int num)
{
        return -EOPNOTSUPP;
}

/* Every encap type is rejected, with an explanatory extack message. */
static inline int lwtunnel_valid_encap_type(u16 encap_type,
                                            struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel");
        return -EOPNOTSUPP;
}
static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
                                                 struct netlink_ext_ack *extack)
{
        /* return 0 since we are not walking attr looking for
         * RTA_ENCAP_TYPE attribute on nexthops.
         */
        return 0;
}

/* Building tunnel state is not supported; *@lws is not written. */
static inline int lwtunnel_build_state(struct net *net, u16 encap_type,
                                       struct nlattr *encap,
                                       unsigned int family, const void *cfg,
                                       struct lwtunnel_state **lws,
                                       struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Nothing is added to @skb; reports success. */
static inline int lwtunnel_fill_encap(struct sk_buff *skb,
                                      struct lwtunnel_state *lwtstate,
                                      int encap_attr, int encap_type_attr)
{
        return 0;
}

/* Encap attributes occupy no space. */
static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
{
        return 0;
}

/* Allocation always yields NULL. */
static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len)
{
        return NULL;
}

/* Always returns 0, whatever @a and @b are. */
static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
                                     struct lwtunnel_state *b)
{
        return 0;
}

/* Packet path entry points: not supported. */
static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

static inline int lwtunnel_input(struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

static inline int lwtunnel_xmit(struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

#endif /* CONFIG_LWTUNNEL */

#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))

#endif /* __NET_LWTUNNEL_H */






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_CLOCK_H
#define _LINUX_SCHED_CLOCK_H

#include <linux/smp.h>

/*
 * Do not use outside of architecture code which knows its limitations.
 *
 * sched_clock() makes no promise of monotonicity or bounded drift between
 * CPUs; using it (which you should not) requires disabling IRQs.
 *
 * Please use one of the three interfaces below.
 */
extern unsigned long long notrace sched_clock(void);

/*
 * See the comment in kernel/sched/clock.c
 */
extern u64 running_clock(void);
extern u64 sched_clock_cpu(int cpu);


extern void sched_clock_init(void);

#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * Without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK the tick, stability and idle
 * hooks are no-ops, and both clock accessors return sched_clock() directly.
 */
static inline void sched_clock_tick(void)
{
}

static inline void clear_sched_clock_stable(void)
{
}

static inline void sched_clock_idle_sleep_event(void)
{
}

static inline void sched_clock_idle_wakeup_event(void)
{
}

/* @cpu is ignored: the result is sched_clock() whatever CPU is named. */
static inline u64 cpu_clock(int cpu)
{
        return sched_clock();
}

static inline u64 local_clock(void)
{
        return sched_clock();
}
#else
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);

/*
 * When sched_clock_stable(), __sched_clock_offset provides the offset
 * between local_clock() and sched_clock().
 */
extern u64 __sched_clock_offset;

extern void sched_clock_tick(void);
extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(void);

/*
 * As outlined in clock.c, this is a fast, high resolution, nanosecond time
 * source.  It is monotonic for a given cpu argument and has bounded drift
 * between cpus.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 */
static inline u64 cpu_clock(int cpu)
{
        u64 now = sched_clock_cpu(cpu);

        return now;
}

/* sched_clock_cpu() for the CPU reported by raw_smp_processor_id(). */
static inline u64 local_clock(void)
{
        int this_cpu = raw_smp_processor_id();

        return sched_clock_cpu(this_cpu);
}
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
 * The reason for this explicit opt-in is not to have perf penalty with
 * slow sched_clocks.
 */
extern void enable_sched_clock_irqtime(void);
extern void disable_sched_clock_irqtime(void);
#else
/* Without CONFIG_IRQ_TIME_ACCOUNTING the opt-in/opt-out calls are no-ops. */
static inline void enable_sched_clock_irqtime(void) {}
static inline void disable_sched_clock_irqtime(void) {}
#endif

#endif /* _LINUX_SCHED_CLOCK_H */




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_RT_H
#define _LINUX_SCHED_RT_H

#include <linux/sched.h>

struct task_struct;

static inline int rt_prio(int prio)
{
        /* 1 when @prio is below MAX_RT_PRIO, 0 otherwise (hinted as rare). */
        return unlikely(prio < MAX_RT_PRIO) ? 1 : 0;
}

static inline int rt_task(struct task_struct *p)
{
        /* Apply the rt_prio() test to the task's current ->prio value. */
        int prio = p->prio;

        return rt_prio(prio);
}

static inline bool task_is_realtime(struct task_struct *tsk)
{
        int policy = tsk->policy;

        /* True for the SCHED_FIFO, SCHED_RR and SCHED_DEADLINE policies. */
        return policy == SCHED_FIFO ||
               policy == SCHED_RR ||
               policy == SCHED_DEADLINE;
}

#ifdef CONFIG_RT_MUTEXES
/*
 * Return the task recorded in p->pi_top_task.
 *
 * Must hold either p->pi_lock or task_rq(p)->lock.
 */
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
{
        struct task_struct *top = p->pi_top_task;

        return top;
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
/* True when @tsk has a non-NULL ->pi_blocked_on pointer. */
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
        if (tsk->pi_blocked_on)
                return true;
        return false;
}
#else
/* Without CONFIG_RT_MUTEXES there is never a top task to report. */
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
        return NULL;
}
# define rt_mutex_adjust_pi(p)                do { } while (0)
/* Without CONFIG_RT_MUTEXES this always reports "not blocked". */
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
        return false;
}
#endif

extern void normalize_rt_tasks(void);


/*
 * Default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 *
 * The expression multiplies by HZ before dividing by 1000, so the integer
 * division happens last.  NOTE(review): the result is presumably a count
 * of ticks, given the HZ scaling - confirm against the users.
 */
#define RR_TIMESLICE                (100 * HZ / 1000)

#endif /* _LINUX_SCHED_RT_H */









































































































































































































































































































































    3 

    3 







    3 



    3 




    3 




    3 













































































































































































































































































































































































































    3 

















































































































































































































































































































































































































































































































































































































    1 
    1 


















































































































    3 

































































































    3 










    1 






































































    3 




    3 

    3 


































    1 














    2 


    3 

























































    3 
    2 



    3 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * linux/include/linux/jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>
 *
 * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
 *
 * Definitions for transaction data structures for the buffer cache
 * filesystem journaling support.
 */

#ifndef _LINUX_JBD2_H
#define _LINUX_JBD2_H

/* Allow this file to be included directly into e2fsprogs */
#ifndef __KERNEL__
#include "jfs_compat.h"
#define JBD2_DEBUG
#else

#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
#include <linux/stddef.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/bit_spinlock.h>
#include <linux/blkdev.h>
#include <crypto/hash.h>
#endif

#define journal_oom_retry 1

/*
 * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds
 * certain classes of error which can occur due to failed IOs.  Under
 * normal use we want ext4 to continue after such errors, because
 * hardware _can_ fail, but for debugging purposes when running tests on
 * known-good hardware we may want to trap these errors.
 */
#undef JBD2_PARANOID_IOFAIL

/*
 * The default maximum commit age, in seconds.
 */
#define JBD2_DEFAULT_MAX_COMMIT_AGE 5

#ifdef CONFIG_JBD2_DEBUG
/*
 * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal
 * consistency checks.  By default we don't do this unless
 * CONFIG_JBD2_DEBUG is on.
 */
#define JBD2_EXPENSIVE_CHECKING
extern ushort jbd2_journal_enable_debug;
void __jbd2_debug(int level, const char *file, const char *func,
                  unsigned int line, const char *fmt, ...);

/*
 * jbd_debug(n, fmt, ...) - hand a printf-style message at level @n to
 * __jbd2_debug(), adding the call site's __FILE__, __func__ and __LINE__.
 */
#define jbd_debug(n, fmt, a...) \
        __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
#else
/*
 * Debugging compiled out: jbd_debug() expands to nothing, so its
 * arguments are never evaluated.
 */
#define jbd_debug(n, fmt, a...)    /**/
#endif

extern void *jbd2_alloc(size_t size, gfp_t flags);
extern void jbd2_free(void *ptr, size_t size);

#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_MIN_FC_BLOCKS        256

#ifdef __KERNEL__

/**
 * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
 *
 * All filesystem modifications made by the process go
 * through this handle.  Recursive operations (such as quota operations)
 * are gathered into a single update.
 *
 * The buffer credits field is used to account for journaled buffers
 * being modified by the running process.  To ensure that there is
 * enough log space for all outstanding operations, we need to limit the
 * number of outstanding buffers possible at any time.  When the
 * operation completes, any buffer credits not used are credited back to
 * the transaction, so that at all times we know how many buffers the
 * outstanding updates on a transaction might possibly touch.
 *
 * This is an opaque datatype.
 **/
typedef struct jbd2_journal_handle handle_t;        /* Atomic operation type */


/**
 * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
 *
 * journal_t is linked to from the fs superblock structure.
 *
 * We use the journal_t to keep track of all outstanding transaction
 * activity on the filesystem, and to manage the state of the log
 * writing process.
 *
 * This is an opaque datatype.
 **/
typedef struct journal_s        journal_t;        /* Journal control structure */
#endif

/*
 * Internal structures used by the logging mechanism:
 */

#define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */

/*
 * On-disk structures
 */

/*
 * Descriptor block types:
 */

#define JBD2_DESCRIPTOR_BLOCK        1
#define JBD2_COMMIT_BLOCK        2
#define JBD2_SUPERBLOCK_V1        3
#define JBD2_SUPERBLOCK_V2        4
#define JBD2_REVOKE_BLOCK        5

/*
 * Standard header for all descriptor blocks.
 *
 * All three fields are declared __be32, i.e. they are kept in big-endian
 * byte order.
 */
typedef struct journal_header_s
{
        __be32                h_magic;        /* presumably JBD2_MAGIC_NUMBER - confirm */
        __be32                h_blocktype;        /* presumably one of the JBD2_*_BLOCK /
                                         * JBD2_SUPERBLOCK_* values - confirm */
        __be32                h_sequence;
} journal_header_t;

/*
 * Checksum types.
 */
#define JBD2_CRC32_CHKSUM   1
#define JBD2_MD5_CHKSUM     2
#define JBD2_SHA1_CHKSUM    3
#define JBD2_CRC32C_CHKSUM  4

/* NOTE(review): presumably the size in bytes of a CRC32 checksum - confirm. */
#define JBD2_CRC32_CHKSUM_SIZE 4

/*
 * Despite the name this is an element count, not a byte count: it is the
 * number of u32-sized words in 32 bytes, and it is used as the array
 * length of the __be32 h_chksum[] member of struct commit_header.
 */
#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
/*
 * Commit block header for storing transactional checksums:
 *
 * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
 * fields are used to store a checksum of the descriptor and data blocks.
 *
 * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
 * field is used to store crc32c(uuid+commit_block).  Each journal metadata
 * block gets its own checksum, and data block checksums are stored in
 * journal_block_tag (in the descriptor).  The other h_chksum* fields are
 * not used.
 *
 * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses
 * journal_block_tag3_t to store a full 32-bit checksum.  Everything else
 * is the same as v2.
 *
 * Checksum v1, v2, and v3 are mutually exclusive features.
 */
struct commit_header {
        __be32                h_magic;
        __be32          h_blocktype;
        __be32          h_sequence;
        unsigned char   h_chksum_type;
        unsigned char   h_chksum_size;
        unsigned char         h_padding[2];
        __be32                 h_chksum[JBD2_CHECKSUM_BYTES];
        __be64                h_commit_sec;
        __be32                h_commit_nsec;
};

/*
 * The block tag: used to describe a single buffer in the journal.
 * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this
 * raw struct shouldn't be used for pointer math or sizeof() - use
 * journal_tag_bytes(journal) instead to compute this.
 */
typedef struct journal_block_tag3_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be32                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
        __be32                t_checksum;        /* crc32c(uuid+seq+block) */
} journal_block_tag3_t;

typedef struct journal_block_tag_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be16                t_checksum;        /* truncated crc32c(uuid+seq+block) */
        __be16                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
} journal_block_tag_t;

/* Tail of descriptor or revoke block, for checksumming */
struct jbd2_journal_block_tail {
        __be32                t_checksum;        /* crc32c(uuid+descr_block) */
};

/*
 * The revoke descriptor: used on disk to describe a series of blocks to
 * be revoked from the log
 */
typedef struct jbd2_journal_revoke_header_s
{
        journal_header_t r_header;
        __be32                 r_count;        /* Count of bytes used in the block */
} jbd2_journal_revoke_header_t;

/* Definitions for the journal tag flags word: */
#define JBD2_FLAG_ESCAPE                1        /* on-disk block is escaped */
#define JBD2_FLAG_SAME_UUID        2        /* block has same uuid as previous */
#define JBD2_FLAG_DELETED        4        /* block deleted by this transaction */
#define JBD2_FLAG_LAST_TAG        8        /* last tag in this descriptor block */


/*
 * The journal superblock.  All fields are in big-endian byte order.
 */
typedef struct journal_superblock_s
{
/* 0x0000 */
        journal_header_t s_header;

/* 0x000C */
        /* Static information describing the journal */
        __be32        s_blocksize;                /* journal device blocksize */
        __be32        s_maxlen;                /* total blocks in journal file */
        __be32        s_first;                /* first block of log information */

/* 0x0018 */
        /* Dynamic information describing the current state of the log */
        __be32        s_sequence;                /* first commit ID expected in log */
        __be32        s_start;                /* blocknr of start of log */

/* 0x0020 */
        /* Error value, as set by jbd2_journal_abort(). */
        __be32        s_errno;

/* 0x0024 */
        /* Remaining fields are only valid in a version-2 superblock */
        __be32        s_feature_compat;        /* compatible feature set */
        __be32        s_feature_incompat;        /* incompatible feature set */
        __be32        s_feature_ro_compat;        /* readonly-compatible feature set */
/* 0x0030 */
        __u8        s_uuid[16];                /* 128-bit uuid for journal */

/* 0x0040 */
        __be32        s_nr_users;                /* Nr of filesystems sharing log */

        __be32        s_dynsuper;                /* Blocknr of dynamic superblock copy*/

/* 0x0048 */
        __be32        s_max_transaction;        /* Limit of journal blocks per trans.*/
        __be32        s_max_trans_data;        /* Limit of data blocks per trans. */

/* 0x0050 */
        __u8        s_checksum_type;        /* checksum type */
        __u8        s_padding2[3];
/* 0x0054 */
        __be32        s_num_fc_blks;                /* Number of fast commit blocks */
/* 0x0058 */
        __u32        s_padding[41];
        __be32        s_checksum;                /* crc32c(superblock) */

/* 0x0100 */
        __u8        s_users[16*48];                /* ids of all fs'es sharing the log */
/* 0x0400 */
} journal_superblock_t;

/*
 * Use the jbd2_{has,set,clear}_feature_* helpers; these will be removed.
 *
 * Each macro evaluates to non-zero only when journal @j has
 * j_format_version >= 2 and at least one bit of @mask is set in the
 * corresponding superblock feature word.  @mask is given in CPU byte
 * order and is converted with cpu_to_be32() before being tested against
 * the big-endian superblock field.  Note that @j is evaluated twice.
 */
#define JBD2_HAS_COMPAT_FEATURE(j,mask)                                        \
        ((j)->j_format_version >= 2 &&                                        \
         ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
#define JBD2_HAS_RO_COMPAT_FEATURE(j,mask)                                \
        ((j)->j_format_version >= 2 &&                                        \
         ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
#define JBD2_HAS_INCOMPAT_FEATURE(j,mask)                                \
        ((j)->j_format_version >= 2 &&                                        \
         ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))

#define JBD2_FEATURE_COMPAT_CHECKSUM                0x00000001

#define JBD2_FEATURE_INCOMPAT_REVOKE                0x00000001
#define JBD2_FEATURE_INCOMPAT_64BIT                0x00000002
#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT        0x00000004
#define JBD2_FEATURE_INCOMPAT_CSUM_V2                0x00000008
#define JBD2_FEATURE_INCOMPAT_CSUM_V3                0x00000010
#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT        0x00000020

/* See "journal feature predicate functions" below */

/* Features known to this kernel version: */
#define JBD2_KNOWN_COMPAT_FEATURES        JBD2_FEATURE_COMPAT_CHECKSUM
#define JBD2_KNOWN_ROCOMPAT_FEATURES        0
#define JBD2_KNOWN_INCOMPAT_FEATURES        (JBD2_FEATURE_INCOMPAT_REVOKE | \
                                        JBD2_FEATURE_INCOMPAT_64BIT | \
                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
                                        JBD2_FEATURE_INCOMPAT_FAST_COMMIT)

#ifdef __KERNEL__

#include <linux/fs.h>
#include <linux/sched.h>

/*
 * Bit numbers for journaling-related buffer state.  Numbering starts at
 * BH_PrivateStart and the last enumerator, BH_JBDPrivateStart, marks the
 * first bit left over for the filesystem's own use.
 */
enum jbd_state_bits {
        BH_JBD                        /* Has an attached ext3 journal_head */
          = BH_PrivateStart,
        BH_JWrite,                /* Being written to log (@@@ DEBUGGING) */
        BH_Freed,                /* Has been freed (truncated) */
        BH_Revoked,                /* Has been revoked from the log */
        BH_RevokeValid,                /* Revoked flag is valid */
        BH_JBDDirty,                /* Is dirty but journaled */
        BH_JournalHead,                /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,                /* IO on shadow buffer is running */
        BH_Verified,                /* Metadata block has been verified ok */
        BH_JBDPrivateStart,        /* First bit available for private use by FS */
};

/*
 * Instantiate helper functions for the bits of enum jbd_state_bits.
 * BUFFER_FNS and TAS_BUFFER_FNS are defined outside this header.
 * NOTE(review): presumably they generate the set/clear/test and
 * test-and-set/clear accessors respectively - confirm in buffer_head.h.
 */
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Shadow, shadow)
BUFFER_FNS(Verified, verified)

static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
        /* A journal_head points at its buffer_head through ->b_bh. */
        struct buffer_head *bh = jh->b_bh;

        return bh;
}

static inline struct journal_head *bh2jh(struct buffer_head *bh)
{
        /* The buffer_head's ->b_private field is returned as the journal_head. */
        struct journal_head *jh = bh->b_private;

        return jh;
}

static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
        /* The BH_JournalHead bit of bh->b_state is taken as a bit spinlock. */
        unsigned long *state_word = &bh->b_state;

        bit_spin_lock(BH_JournalHead, state_word);
}

static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
        /* Release the BH_JournalHead bit spinlock held in bh->b_state. */
        unsigned long *state_word = &bh->b_state;

        bit_spin_unlock(BH_JournalHead, state_word);
}

/* J_ASSERT(cond): BUG_ON() when @cond is false. */
#define J_ASSERT(assert)        BUG_ON(!(assert))

/*
 * Buffer-head and journal-head flavours of J_ASSERT().  The first
 * argument is not used by the expansion; only @expr is checked.
 */
#define J_ASSERT_BH(bh, expr)        J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr)        J_ASSERT(expr)

/*
 * J_EXPECT*(expr, why...) - check a condition that is expected to hold.
 *
 * With JBD2_PARANOID_IOFAIL defined the check is a hard J_ASSERT().
 * Otherwise a failed check logs two KERN_ERR lines (the failing expression
 * with the calling function, then the caller's @why format and arguments)
 * and the macro evaluates to the int value of @expr so that the caller can
 * handle the failure.  The bh/jh argument of the _BH/_JH variants is not
 * used by the expansion.
 *
 * The temporary inside the statement expression has a reserved-style name
 * on purpose: with a plain name such as "val", a caller expression that
 * mentions its own variable "val" would silently refer to the macro's
 * still-uninitialised temporary instead.
 */
#if defined(JBD2_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...)                J_ASSERT(expr)
#define J_EXPECT_BH(bh, expr, why...)        J_ASSERT_BH(bh, expr)
#define J_EXPECT_JH(jh, expr, why...)        J_ASSERT_JH(jh, expr)
#else
#define __journal_expect(expr, why...)                                             \
        ({                                                                     \
                int __jexpect_val = (expr);                                     \
                if (!__jexpect_val) {                                             \
                        printk(KERN_ERR                                             \
                               "JBD2 unexpected failure: %s: %s;\n",             \
                               __func__, #expr);                             \
                        printk(KERN_ERR why "\n");                             \
                }                                                             \
                __jexpect_val;                                                     \
        })
#define J_EXPECT(expr, why...)                __journal_expect(expr, ## why)
#define J_EXPECT_BH(bh, expr, why...)        __journal_expect(expr, ## why)
#define J_EXPECT_JH(jh, expr, why...)        __journal_expect(expr, ## why)
#endif

/*
 * Flags in jbd_inode->i_flags.
 *
 * The __JI_* constants are bit numbers; the JI_* constants below are the
 * corresponding single-bit masks (1 << bit number).
 */
#define __JI_COMMIT_RUNNING 0
#define __JI_WRITE_DATA 1
#define __JI_WAIT_DATA 2

/*
 * Commit of the inode data in progress. We use this flag to protect us from
 * concurrent deletion of inode. We cannot use reference to inode for this
 * since we cannot afford doing last iput() on behalf of kjournald
 */
#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
/* Write allocated dirty buffers in this inode before commit */
#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
/* Wait for outstanding data writes for this inode before commit */
#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)

/**
 * struct jbd2_inode - The jbd_inode type is the structure linking inodes in
 * ordered mode present in a transaction so that we can sync them during commit.
 */
struct jbd2_inode {
        /**
         * @i_transaction:
         *
         * Which transaction does this inode belong to? Either the running
         * transaction or the committing one. [j_list_lock]
         */
        transaction_t *i_transaction;

        /**
         * @i_next_transaction:
         *
         * Pointer to the running transaction modifying inode's data in case
         * there is already a committing transaction touching it. [j_list_lock]
         */
        transaction_t *i_next_transaction;

        /**
         * @i_list: List of inodes in the i_transaction [j_list_lock]
         */
        struct list_head i_list;

        /**
         * @i_vfs_inode:
         *
         * VFS inode this inode belongs to [constant for lifetime of structure]
         */
        struct inode *i_vfs_inode;

        /**
         * @i_flags: Flags of inode [j_list_lock]
         */
        unsigned long i_flags;

        /**
         * @i_dirty_start:
         *
         * Offset in bytes where the dirty range for this inode starts.
         * [j_list_lock]
         */
        loff_t i_dirty_start;

        /**
         * @i_dirty_end:
         *
         * Inclusive offset in bytes where the dirty range for this inode
         * ends. [j_list_lock]
         */
        loff_t i_dirty_end;
};

struct jbd2_revoke_table_s;

/**
 * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete
 *     type associated with handle_t.
 * @h_transaction: Which compound transaction is this update a part of?
 * @h_journal: Which journal handle belongs to - used iff h_reserved set.
 * @h_rsv_handle: Handle reserved for finishing the logical operation.
 * @h_total_credits: Number of remaining buffers we are allowed to add to
 *        journal. These are dirty buffers and revoke descriptor blocks.
 * @h_revoke_credits: Number of remaining revoke records available for handle
 * @h_ref: Reference count on this handle.
 * @h_err: Field for caller's use to track errors through large fs operations.
 * @h_sync: Flag for sync-on-close.
 * @h_jdata: Flag to force data journaling.
 * @h_reserved: Flag for handle for reserved credits.
 * @h_aborted: Flag indicating fatal error on handle.
 * @h_type: For handle statistics.
 * @h_line_no: For handle statistics.
 * @h_start_jiffies: Handle Start time.
 * @h_requested_credits: Holds @h_total_credits after handle is started.
 * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
 * @saved_alloc_context: Saved context while transaction is open.
 **/

/* Docbook can't yet cope with the bit fields, but will leave the documentation
 * in so it can be fixed later.
 */

struct jbd2_journal_handle
{
        union {
                transaction_t        *h_transaction;
                /* Which journal handle belongs to - used iff h_reserved set */
                journal_t        *h_journal;
        };

        handle_t                *h_rsv_handle;
        int                        h_total_credits;
        int                        h_revoke_credits;
        int                        h_revoke_credits_requested;
        int                        h_ref;
        int                        h_err;

        /* Flags [no locking] */
        unsigned int        h_sync:                1;
        unsigned int        h_jdata:        1;
        unsigned int        h_reserved:        1;
        unsigned int        h_aborted:        1;
        unsigned int        h_type:                8;
        unsigned int        h_line_no:        16;

        unsigned long                h_start_jiffies;
        unsigned int                h_requested_credits;

        unsigned int                saved_alloc_context;
};


/*
 * Some stats for checkpoint phase
 */
struct transaction_chp_stats_s {
        unsigned long                cs_chp_time;
        __u32                        cs_forced_to_close;
        __u32                        cs_written;
        __u32                        cs_dropped;
};

/* The transaction_t type is the guts of the journaling mechanism.  It
 * tracks a compound transaction through its various states:
 *
 * RUNNING:        accepting new updates
 * LOCKED:        Updates still running but we don't accept new ones
 * RUNDOWN:        Updates are tidying up but have finished requesting
 *                new buffers to modify (state not used for now)
 * FLUSH:       All updates complete, but we are still writing to disk
 * COMMIT:      All data on disk, writing commit record
 * FINISHED:        We still have to keep the transaction for checkpointing.
 *
 * The transaction keeps track of all of the buffers modified by a
 * running transaction, and all of the buffers committed but not yet
 * flushed to home for finished transactions.
 */

/*
 * Lock ranking:
 *
 *    j_list_lock
 *      ->jbd_lock_bh_journal_head()        (This is "innermost")
 *
 *    j_state_lock
 *    ->b_state_lock
 *
 *    b_state_lock
 *    ->j_list_lock
 *
 *    j_state_lock
 *    ->t_handle_lock
 *
 *    j_state_lock
 *    ->j_list_lock                        (journal_unmap_buffer)
 *
 */

/*
 * struct transaction_s - state of one compound transaction.
 *
 * Each field comment ends with the lock that guards the field in square
 * brackets; "[no locking]" and "[none]" mark fields that are documented as
 * not being covered by a lock (the "[none]" fields are all atomic_t).
 */
struct transaction_s
{
        /* Pointer to the journal for this transaction. [no locking] */
        journal_t                *t_journal;

        /* Sequence number for this transaction [no locking] */
        tid_t                        t_tid;

        /*
         * Transaction's current state
         * [no locking - only kjournald2 alters this]
         * [j_list_lock] guards transition of a transaction into T_FINISHED
         * state and subsequent call of __jbd2_journal_drop_transaction()
         * FIXME: needs barriers
         * KLUDGE: [use j_state_lock]
         *
         * The enumerators are declared in ascending order from T_RUNNING (0)
         * to T_FINISHED, so their numeric values follow the declaration order.
         */
        enum {
                T_RUNNING,
                T_LOCKED,
                T_SWITCH,
                T_FLUSH,
                T_COMMIT,
                T_COMMIT_DFLUSH,
                T_COMMIT_JFLUSH,
                T_COMMIT_CALLBACK,
                T_FINISHED
        }                        t_state;

        /*
         * Where in the log does this transaction's commit start? [no locking]
         */
        unsigned long                t_log_start;

        /* Number of buffers on the t_buffers list [j_list_lock] */
        int                        t_nr_buffers;

        /*
         * Doubly-linked circular list of all buffers reserved but not yet
         * modified by this transaction [j_list_lock]
         */
        struct journal_head        *t_reserved_list;

        /*
         * Doubly-linked circular list of all metadata buffers owned by this
         * transaction [j_list_lock]
         */
        struct journal_head        *t_buffers;

        /*
         * Doubly-linked circular list of all forget buffers (superseded
         * buffers which we can un-checkpoint once this transaction commits)
         * [j_list_lock]
         */
        struct journal_head        *t_forget;

        /*
         * Doubly-linked circular list of all buffers still to be flushed before
         * this transaction can be checkpointed. [j_list_lock]
         */
        struct journal_head        *t_checkpoint_list;

        /*
         * Doubly-linked circular list of all buffers submitted for IO while
         * checkpointing. [j_list_lock]
         */
        struct journal_head        *t_checkpoint_io_list;

        /*
         * Doubly-linked circular list of metadata buffers being shadowed by log
         * IO.  The IO buffers on the iobuf list and the shadow buffers on this
         * list match each other one for one at all times. [j_list_lock]
         */
        struct journal_head        *t_shadow_list;

        /*
         * List of inodes associated with the transaction; e.g., ext4 uses
         * this to track inodes in data=ordered and data=journal mode that
         * need special handling on transaction commit; also used by ocfs2.
         * [j_list_lock]
         */
        struct list_head        t_inode_list;

        /*
         * Protects info related to handles
         */
        spinlock_t                t_handle_lock;

        /*
         * Longest time some handle had to wait for running transaction
         */
        unsigned long                t_max_wait;

        /*
         * When transaction started
         */
        unsigned long                t_start;

        /*
         * When commit was requested
         */
        unsigned long                t_requested;

        /*
         * Checkpointing stats [j_checkpoint_mutex]
         * NOTE(review): this was annotated with j_checkpoint_sem, a name that
         * struct journal_s does not declare; j_checkpoint_mutex is the
         * checkpoint lock it does declare — confirm it is the intended lock.
         */
        struct transaction_chp_stats_s t_chp_stats;

        /*
         * Number of outstanding updates running on this transaction
         * [none]
         */
        atomic_t                t_updates;

        /*
         * Number of blocks reserved for this transaction in the journal.
         * This is including all credits reserved when starting transaction
         * handles as well as all journal descriptor blocks needed for this
         * transaction. [none]
         */
        atomic_t                t_outstanding_credits;

        /*
         * Number of revoke records for this transaction added by already
         * stopped handles. [none]
         */
        atomic_t                t_outstanding_revokes;

        /*
         * How many handles used this transaction? [none]
         */
        atomic_t                t_handle_count;

        /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
        transaction_t                *t_cpnext, *t_cpprev;

        /*
         * When will the transaction expire (become due for commit), in jiffies?
         * [no locking]
         */
        unsigned long                t_expires;

        /*
         * When this transaction started, in nanoseconds [no locking]
         */
        ktime_t                        t_start_time;

        /*
         * This transaction is being forced and some process is
         * waiting for it to finish.  (Single-bit flag.)
         */
        unsigned int t_synchronous_commit:1;

        /* Disk flush needs to be sent to fs partition [no locking] */
        int                        t_need_data_flush;

        /*
         * For use by the filesystem to store fs-specific data
         * structures associated with the transaction
         */
        struct list_head        t_private_list;
};

/*
 * Per-run statistics for a transaction.  Embedded as the "run" member of
 * struct transaction_stats_s.
 *
 * NOTE(review): the rs_* time fields are presumably durations spent in the
 * phase named by each field, and the __u32 fields presumably counters —
 * neither the units nor the exact meaning are visible here; confirm against
 * the code that fills them in.
 */
struct transaction_run_stats_s {
        unsigned long                rs_wait;
        unsigned long                rs_request_delay;
        unsigned long                rs_running;
        unsigned long                rs_locked;
        unsigned long                rs_flushing;
        unsigned long                rs_logging;

        __u32                        rs_handle_count;
        __u32                        rs_blocks;
        __u32                        rs_blocks_logged;
};

/*
 * Transaction statistics record: two unsigned long header fields followed by
 * an embedded struct transaction_run_stats_s.  struct journal_s keeps one of
 * these as j_stats ("Overall statistics").
 */
struct transaction_stats_s {
        /* presumably the transaction id the record refers to — TODO confirm */
        unsigned long                ts_tid;
        /* presumably a "commit requested" count or time — TODO confirm */
        unsigned long                ts_requested;
        /* the per-run statistics themselves */
        struct transaction_run_stats_s run;
};

/*
 * jbd2_time_diff - distance from @start to @end.
 *
 * Returns @end - @start when @end has not gone below @start.  Otherwise the
 * value is treated as having wrapped and the result is
 * @end + (MAX_JIFFY_OFFSET - @start).
 */
static inline unsigned long
jbd2_time_diff(unsigned long start, unsigned long end)
{
        unsigned long delta;

        if (end < start)
                delta = end + (MAX_JIFFY_OFFSET - start);
        else
                delta = end - start;

        return delta;
}

#define JBD2_NR_BATCH        64

enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};

#define JBD2_FC_REPLAY_STOP        0
#define JBD2_FC_REPLAY_CONTINUE        1

/**
 * struct journal_s - The journal_s type is the concrete type associated with
 *     journal_t.
 *
 * Field descriptions name the lock guarding the field in square brackets
 * where one is documented.
 */
struct journal_s
{
        /**
         * @j_flags: General journaling state flags [j_state_lock]
         */
        unsigned long                j_flags;

        /**
         * @j_errno:
         *
         * Is there an outstanding uncleared error on the journal (from a prior
         * abort)? [j_state_lock]
         */
        int                        j_errno;

        /**
         * @j_abort_mutex: Lock the whole aborting procedure.
         */
        struct mutex                j_abort_mutex;

        /**
         * @j_sb_buffer: The first part of the superblock buffer.
         */
        struct buffer_head        *j_sb_buffer;

        /**
         * @j_superblock: The second part of the superblock buffer.
         */
        journal_superblock_t        *j_superblock;

        /**
         * @j_format_version: Version of the superblock format.
         */
        int                        j_format_version;

        /**
         * @j_state_lock: Protect the various scalars in the journal.
         */
        rwlock_t                j_state_lock;

        /**
         * @j_barrier_count:
         *
         * Number of processes waiting to create a barrier lock [j_state_lock]
         */
        int                        j_barrier_count;

        /**
         * @j_barrier: The barrier lock itself.
         */
        struct mutex                j_barrier;

        /**
         * @j_running_transaction:
         *
         * Transactions: The current running transaction...
         * [j_state_lock] [caller holding open handle]
         */
        transaction_t                *j_running_transaction;

        /**
         * @j_committing_transaction:
         *
         * the transaction we are pushing to disk
         * [j_state_lock] [caller holding open handle]
         */
        transaction_t                *j_committing_transaction;

        /**
         * @j_checkpoint_transactions:
         *
         * ... and a linked circular list of all transactions waiting for
         * checkpointing. [j_list_lock]
         */
        transaction_t                *j_checkpoint_transactions;

        /**
         * @j_wait_transaction_locked:
         *
         * Wait queue for waiting for a locked transaction to start committing,
         * or for a barrier lock to be released.
         */
        wait_queue_head_t        j_wait_transaction_locked;

        /**
         * @j_wait_done_commit: Wait queue for waiting for commit to complete.
         */
        wait_queue_head_t        j_wait_done_commit;

        /**
         * @j_wait_commit: Wait queue to trigger commit.
         */
        wait_queue_head_t        j_wait_commit;

        /**
         * @j_wait_updates: Wait queue to wait for updates to complete.
         */
        wait_queue_head_t        j_wait_updates;

        /**
         * @j_wait_reserved:
         *
         * Wait queue to wait for reserved buffer credits to drop.
         */
        wait_queue_head_t        j_wait_reserved;

        /**
         * @j_fc_wait:
         *
         * Wait queue to wait for completion of async fast commits.
         */
        wait_queue_head_t        j_fc_wait;

        /**
         * @j_checkpoint_mutex:
         *
         * Mutex for locking against concurrent checkpoints.
         */
        struct mutex                j_checkpoint_mutex;

        /**
         * @j_chkpt_bhs:
         *
         * List of buffer heads used by the checkpoint routine.  This
         * was moved from jbd2_log_do_checkpoint() to reduce stack
         * usage.  Access to this array is controlled by the
         * @j_checkpoint_mutex.  [j_checkpoint_mutex]
         */
        struct buffer_head        *j_chkpt_bhs[JBD2_NR_BATCH];

        /**
         * @j_head:
         *
         * Journal head: identifies the first unused block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_head;

        /**
         * @j_tail:
         *
         * Journal tail: identifies the oldest still-used block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_tail;

        /**
         * @j_free:
         *
         * Journal free: how many free blocks are there in the journal?
         * [j_state_lock]
         */
        unsigned long                j_free;

        /**
         * @j_first:
         *
         * The block number of the first usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_first;

        /**
         * @j_last:
         *
         * The block number one beyond the last usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_last;

        /**
         * @j_fc_first:
         *
         * The block number of the first fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_first;

        /**
         * @j_fc_off:
         *
         * Number of fast commit blocks currently allocated. Accessed only
         * during fast commit. Currently only one process can do fast commit,
         * so this field is not protected by any lock.
         */
        unsigned long                j_fc_off;

        /**
         * @j_fc_last:
         *
         * The block number one beyond the last fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_last;

        /**
         * @j_dev: Device where we store the journal.
         */
        struct block_device        *j_dev;

        /**
         * @j_blocksize: Block size for the location where we store the journal.
         */
        int                        j_blocksize;

        /**
         * @j_blk_offset:
         *
         * Starting block offset into the device where we store the journal.
         */
        unsigned long long        j_blk_offset;

        /**
         * @j_devname: Journal device name.
         */
        char                        j_devname[BDEVNAME_SIZE+24];

        /**
         * @j_fs_dev:
         *
         * Device which holds the client fs.  For internal journal this will be
         * equal to j_dev.
         */
        struct block_device        *j_fs_dev;

        /**
         * @j_total_len: Total maximum capacity of the journal region on disk.
         */
        unsigned int                j_total_len;

        /**
         * @j_reserved_credits:
         *
         * Number of buffers reserved from the running transaction.
         */
        atomic_t                j_reserved_credits;

        /**
         * @j_list_lock: Protects the buffer lists and internal buffer state.
         */
        spinlock_t                j_list_lock;

        /**
         * @j_inode:
         *
         * Optional inode where we store the journal.  If present, all
         * journal block numbers are mapped into this inode via bmap().
         */
        struct inode                *j_inode;

        /**
         * @j_tail_sequence:
         *
         * Sequence number of the oldest transaction in the log [j_state_lock]
         */
        tid_t                        j_tail_sequence;

        /**
         * @j_transaction_sequence:
         *
         * Sequence number of the next transaction to grant [j_state_lock]
         */
        tid_t                        j_transaction_sequence;

        /**
         * @j_commit_sequence:
         *
         * Sequence number of the most recently committed transaction
         * [j_state_lock].
         */
        tid_t                        j_commit_sequence;

        /**
         * @j_commit_request:
         *
         * Sequence number of the most recent transaction wanting commit
         * [j_state_lock]
         */
        tid_t                        j_commit_request;

        /**
         * @j_uuid:
         *
         * Journal uuid: identifies the object (filesystem, LVM volume etc)
         * backed by this journal.  This will eventually be replaced by an array
         * of uuids, allowing us to index multiple devices within a single
         * journal and to perform atomic updates across them.
         */
        __u8                        j_uuid[16];

        /**
         * @j_task: Pointer to the current commit thread for this journal.
         */
        struct task_struct        *j_task;

        /**
         * @j_max_transaction_buffers:
         *
         * Maximum number of metadata buffers to allow in a single compound
         * commit transaction.
         */
        int                        j_max_transaction_buffers;

        /**
         * @j_revoke_records_per_block:
         *
         * Number of revoke records that fit in one descriptor block.
         */
        int                        j_revoke_records_per_block;

        /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
         */
        unsigned long                j_commit_interval;

        /**
         * @j_commit_timer: The timer used to wakeup the commit thread.
         */
        struct timer_list        j_commit_timer;

        /**
         * @j_revoke_lock: Protect the revoke table.
         */
        spinlock_t                j_revoke_lock;

        /**
         * @j_revoke:
         *
         * The revoke table - maintains the list of revoked blocks in the
         * current transaction.
         */
        struct jbd2_revoke_table_s *j_revoke;

        /**
         * @j_revoke_table: Alternate revoke tables for j_revoke.
         */
        struct jbd2_revoke_table_s *j_revoke_table[2];

        /**
         * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction.
         */
        struct buffer_head        **j_wbuf;

        /**
         * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only
         * during a fast commit. Currently only one process can do fast commit,
         * so this field is not protected by any lock.
         */
        struct buffer_head        **j_fc_wbuf;

        /**
         * @j_wbufsize:
         *
         * Size of @j_wbuf array.
         */
        int                        j_wbufsize;

        /**
         * @j_fc_wbufsize:
         *
         * Size of @j_fc_wbuf array.
         */
        int                        j_fc_wbufsize;

        /**
         * @j_last_sync_writer:
         *
         * The pid of the last person to run a synchronous operation
         * through the journal.
         */
        pid_t                        j_last_sync_writer;

        /**
         * @j_average_commit_time:
         *
         * The average amount of time in nanoseconds it takes to commit a
         * transaction to disk. [j_state_lock]
         */
        u64                        j_average_commit_time;

        /**
         * @j_min_batch_time:
         *
         * Minimum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_min_batch_time;

        /**
         * @j_max_batch_time:
         *
         * Maximum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_max_batch_time;

        /**
         * @j_commit_callback:
         *
         * This function is called when a transaction is closed.
         */
        void                        (*j_commit_callback)(journal_t *,
                                                     transaction_t *);

        /**
         * @j_submit_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WRITE_DATA flag
         * before we start to write out the transaction to the journal.
         */
        int                        (*j_submit_inode_data_buffers)
                                        (struct jbd2_inode *);

        /**
         * @j_finish_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WAIT_DATA flag
         * after we have written the transaction to the journal
         * but before we write out the commit block.
         */
        int                        (*j_finish_inode_data_buffers)
                                        (struct jbd2_inode *);

        /*
         * Journal statistics
         */

        /**
         * @j_history_lock: Protect the transactions statistics history.
         */
        spinlock_t                j_history_lock;

        /**
         * @j_proc_entry: procfs entry for the jbd statistics directory.
         */
        struct proc_dir_entry        *j_proc_entry;

        /**
         * @j_stats: Overall statistics.
         */
        struct transaction_stats_s j_stats;

        /**
         * @j_failed_commit: Failed journal commit ID.
         */
        unsigned int                j_failed_commit;

        /**
         * @j_private:
         *
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here.
         */
        void *j_private;

        /**
         * @j_chksum_driver:
         *
         * Reference to checksum algorithm driver via cryptoapi.
         */
        struct crypto_shash *j_chksum_driver;

        /**
         * @j_csum_seed:
         *
         * Precomputed journal UUID checksum for seeding other checksums.
         */
        __u32 j_csum_seed;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /**
         * @j_trans_commit_map:
         *
         * Lockdep entity to track transaction commit dependencies. Handles
         * hold this "lock" for read, when we wait for commit, we acquire the
         * "lock" for writing. This matches the properties of jbd2 journalling
         * where the running transaction has to wait for all handles to be
         * dropped to commit that transaction and also acquiring a handle may
         * require transaction commit to finish.
         *
         * This member only exists when CONFIG_DEBUG_LOCK_ALLOC is set.
         */
        struct lockdep_map        j_trans_commit_map;
#endif

        /**
         * @j_fc_cleanup_callback:
         *
         * Clean-up after fast commit or full commit. JBD2 calls this function
         * after every commit operation.
         *
         * NOTE(review): the meaning of the trailing int argument is not
         * documented here — confirm with the callers.
         */
        void (*j_fc_cleanup_callback)(struct journal_s *journal, int);

        /**
         * @j_fc_replay_callback:
         *
         * File-system specific function that performs replay of a fast
         * commit. JBD2 calls this function for each fast commit block found in
         * the journal. This function should return JBD2_FC_REPLAY_CONTINUE
         * to indicate that the block was processed correctly and more fast
         * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP
         * indicates the end of replay (no more blocks remaining). A negative
         * return value indicates error.
         */
        int (*j_fc_replay_callback)(struct journal_s *journal,
                                    struct buffer_head *bh,
                                    enum passtype pass, int off,
                                    tid_t expected_commit_id);
};

/*
 * jbd2_might_wait_for_commit - annotate a point that may wait for a commit.
 * @j: expression yielding a pointer to the journal
 *
 * Acquires and immediately releases @j's j_trans_commit_map through
 * rwsem_acquire()/rwsem_release(), passing _THIS_IP_ as the call site.
 *
 * @j is parenthesized so that any pointer expression (e.g. "base + 1") binds
 * correctly to the "->" operator.  It appears twice in the expansion, so do
 * not pass an expression with side effects.
 *
 * NOTE(review): j_trans_commit_map is only declared under
 * CONFIG_DEBUG_LOCK_ALLOC; this macro presumably relies on
 * rwsem_acquire()/rwsem_release() discarding their arguments otherwise —
 * confirm against the lockdep headers.
 */
#define jbd2_might_wait_for_commit(j) \
        do { \
                rwsem_acquire(&(j)->j_trans_commit_map, 0, 0, _THIS_IP_); \
                rwsem_release(&(j)->j_trans_commit_map, _THIS_IP_); \
        } while (0)

/* journal feature predicate functions */
/*
 * JBD2_FEATURE_COMPAT_FUNCS - generate accessors for one compat feature bit.
 *
 * Expands to jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>().  The "has" helper reports false for journals
 * whose j_format_version is below 2 without looking at the superblock; the
 * set/clear helpers update s_feature_compat unconditionally.  The flag is
 * passed through cpu_to_be32() before it is applied to the superblock field.
 */
#define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (j->j_format_version < 2) \
                return false; \
        return (j->j_superblock->s_feature_compat & \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_compat |= \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_RO_COMPAT_FUNCS - generate accessors for one ro-compat
 * feature bit.
 *
 * Expands to jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>().  The "has" helper reports false for journals
 * whose j_format_version is below 2 without looking at the superblock; the
 * set/clear helpers update s_feature_ro_compat unconditionally.  The flag is
 * passed through cpu_to_be32() before it is applied to the superblock field.
 */
#define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (j->j_format_version < 2) \
                return false; \
        return (j->j_superblock->s_feature_ro_compat & \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_ro_compat |= \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_ro_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_INCOMPAT_FUNCS - generate accessors for one incompat feature
 * bit.
 *
 * Expands to jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>().  The "has" helper reports false for journals
 * whose j_format_version is below 2 without looking at the superblock; the
 * set/clear helpers update s_feature_incompat unconditionally.  The flag is
 * passed through cpu_to_be32() before it is applied to the superblock field.
 */
#define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (j->j_format_version < 2) \
                return false; \
        return (j->j_superblock->s_feature_incompat & \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_incompat |= \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *sb = j->j_superblock; \
 \
        sb->s_feature_incompat &= \
                ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate the feature helpers.  Each line below expands to three inline
 * functions, jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>(), for the JBD2_FEATURE_COMPAT_<FLAG> or
 * JBD2_FEATURE_INCOMPAT_<FLAG> bit named by the second argument.
 */
JBD2_FEATURE_COMPAT_FUNCS(checksum,                CHECKSUM)

JBD2_FEATURE_INCOMPAT_FUNCS(revoke,                REVOKE)
JBD2_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,        ASYNC_COMMIT)
JBD2_FEATURE_INCOMPAT_FUNCS(csum2,                CSUM_V2)
JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,        FAST_COMMIT)

/*
 * Journal flag definitions
 *
 * Each value is a distinct single bit; 0x080 is not assigned in this list.
 * NOTE(review): presumably these are the bits kept in journal_s.j_flags —
 * confirm against the users.
 */
#define JBD2_UNMOUNT        0x001        /* Journal thread is being destroyed */
#define JBD2_ABORT        0x002        /* Journaling has been aborted for errors. */
#define JBD2_ACK_ERR        0x004        /* The errno in the sb has been acked */
#define JBD2_FLUSHED        0x008        /* The journal superblock has been flushed */
#define JBD2_LOADED        0x010        /* The journal superblock has been loaded */
#define JBD2_BARRIER        0x020        /* Use IDE barriers */
#define JBD2_ABORT_ON_SYNCDATA_ERR        0x040        /* Abort the journal on file
                                                 * data write error in ordered
                                                 * mode */
#define JBD2_FAST_COMMIT_ONGOING        0x100        /* Fast commit is ongoing */
#define JBD2_FULL_COMMIT_ONGOING        0x200        /* Full commit is ongoing */

/*
 * Function declarations for the journaling transaction and buffer
 * management
 */

/* Filing buffers */
extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
extern bool __jbd2_journal_refile_buffer(struct journal_head *);
extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_free_buffer(struct journal_head *bh);
extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_clean_data_list(transaction_t *transaction);
/* Queue @bh at the tail of the list @head, linked via bh->b_assoc_buffers. */
static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
{
        struct list_head *link = &bh->b_assoc_buffers;

        list_add_tail(link, head);
}
/* Unlink @bh's b_assoc_buffers entry from its list and reinitialize it. */
static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
{
        struct list_head *link = &bh->b_assoc_buffers;

        list_del_init(link);
}

/* Log buffer allocation */
struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int);
void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *);
int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
                              unsigned long *block);
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);

/* Commit management */
extern void jbd2_journal_commit_transaction(journal_t *);

/* Checkpoint list management */
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);


/*
 * Triggers
 */

/*
 * Set of optional callbacks attached to a buffer; each callback receives the
 * trigger structure itself as its first argument.
 */
struct jbd2_buffer_trigger_type {
        /*
         * Fired at the moment data to write to the journal are known to be
         * stable - so either at the moment b_frozen_data is created or just
         * before a buffer is written to the journal.  mapped_data is a mapped
         * buffer that is the frozen data for commit.
         */
        void (*t_frozen)(struct jbd2_buffer_trigger_type *type,
                         struct buffer_head *bh, void *mapped_data,
                         size_t size);

        /*
         * Fired during journal abort for dirty buffers that will not be
         * committed.
         */
        void (*t_abort)(struct jbd2_buffer_trigger_type *type,
                        struct buffer_head *bh);
};

extern void jbd2_buffer_frozen_trigger(struct journal_head *jh,
                                       void *mapped_data,
                                       struct jbd2_buffer_trigger_type *triggers);
extern void jbd2_buffer_abort_trigger(struct journal_head *jh,
                                      struct jbd2_buffer_trigger_type *triggers);

/* Buffer IO */
extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                              struct journal_head *jh_in,
                                              struct buffer_head **bh_out,
                                              sector_t blocknr);

/* Transaction locking */
extern void                __wait_on_journal (journal_t *);

/* Transaction cache support */
extern void jbd2_journal_destroy_transaction_cache(void);
extern int __init jbd2_journal_init_transaction_cache(void);
extern void jbd2_journal_free_transaction(transaction_t *);

/*
 * Journal locking.
 *
 * We need to lock the journal during transaction state changes so that nobody
 * ever tries to take a handle on the running transaction while we are in the
 * middle of moving it to the commit phase.  j_state_lock does this.
 *
 * Note that the locking is completely interrupt unsafe.  We never touch
 * journal structures from interrupts.
 */

/* Return the handle stored in the current task's journal_info field. */
static inline handle_t *journal_current_handle(void)
{
        handle_t *handle = current->journal_info;

        return handle;
}

/* The journaling code user interface:
 *
 * Create and destroy handles
 * Register buffer modifications against the current transaction.
 */

extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
                                     int revoke_records, gfp_t gfp_mask,
                                     unsigned int type, unsigned int line_no);
extern int         jbd2_journal_restart(handle_t *, int nblocks);
extern int         jbd2__journal_restart(handle_t *, int nblocks,
                                       int revoke_records, gfp_t gfp_mask);
extern int         jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
extern void         jbd2_journal_free_reserved(handle_t *handle);
extern int         jbd2_journal_extend(handle_t *handle, int nblocks,
                                     int revoke_records);
extern int         jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
void                 jbd2_journal_set_triggers(struct buffer_head *,
                                           struct jbd2_buffer_trigger_type *type);
extern int         jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern int         jbd2_journal_forget (handle_t *, struct buffer_head *);
extern int         jbd2_journal_invalidatepage(journal_t *,
                                struct page *, unsigned int, unsigned int);
extern int         jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
extern int         jbd2_journal_stop(handle_t *);
extern int         jbd2_journal_flush (journal_t *);
extern void         jbd2_journal_lock_updates (journal_t *);
extern void         jbd2_journal_unlock_updates (journal_t *);

extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
                                struct block_device *fs_dev,
                                unsigned long long start, int len, int bsize);
extern journal_t * jbd2_journal_init_inode (struct inode *);
extern int           jbd2_journal_update_format (journal_t *);
extern int           jbd2_journal_check_used_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_check_available_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_set_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern void           jbd2_journal_clear_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_load       (journal_t *journal);
extern int           jbd2_journal_destroy    (journal_t *);
extern int           jbd2_journal_recover    (journal_t *journal);
extern int           jbd2_journal_wipe       (journal_t *, int);
extern int           jbd2_journal_skip_recovery        (journal_t *);
extern void           jbd2_journal_update_sb_errno(journal_t *);
extern int           jbd2_journal_update_sb_log_tail        (journal_t *, tid_t,
                                unsigned long, int);
extern void           jbd2_journal_abort      (journal_t *, int);
extern int           jbd2_journal_errno      (journal_t *);
extern void           jbd2_journal_ack_err    (journal_t *);
extern int           jbd2_journal_clear_err  (journal_t *);
extern int           jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int           jbd2_journal_force_commit(journal_t *);
extern int           jbd2_journal_force_commit_nested(journal_t *);
extern int           jbd2_journal_inode_ranged_write(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_inode_ranged_wait(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_submit_inode_data_buffers(
                        struct jbd2_inode *jinode);
extern int           jbd2_journal_finish_inode_data_buffers(
                        struct jbd2_inode *jinode);
extern int           jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                struct jbd2_inode *inode, loff_t new_size);
extern void           jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
extern void           jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);

/*
 * journal_head management
 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh);
void jbd2_journal_put_journal_head(struct journal_head *jh);

/*
 * handle management
 */
extern struct kmem_cache *jbd2_handle_cache;

/*
 * Allocate a handle from jbd2_handle_cache via kmem_cache_zalloc() using
 * the caller's gfp flags, and return whatever the allocator returned.
 */
static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
{
        handle_t *handle = kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);

        return handle;
}

/* Give @handle back to jbd2_handle_cache. */
static inline void jbd2_free_handle(handle_t *handle)
{
        kmem_cache_free(jbd2_handle_cache, handle);
}

/*
 * jbd2_inode management (optional, for those file systems that want to use
 * dynamically allocated jbd2_inode structures)
 */
extern struct kmem_cache *jbd2_inode_cache;

/*
 * Allocate a jbd2_inode from jbd2_inode_cache with the caller's gfp flags.
 * This goes through kmem_cache_alloc(), so this helper does not zero the
 * returned object.
 */
static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags)
{
        struct jbd2_inode *jinode = kmem_cache_alloc(jbd2_inode_cache,
                                                     gfp_flags);

        return jinode;
}

/* Give @jinode back to jbd2_inode_cache. */
static inline void jbd2_free_inode(struct jbd2_inode *jinode)
{
        kmem_cache_free(jbd2_inode_cache, jinode);
}

/* Primary revoke support */
#define JOURNAL_REVOKE_DEFAULT_HASH 256
extern int           jbd2_journal_init_revoke(journal_t *, int);
extern void           jbd2_journal_destroy_revoke_record_cache(void);
extern void           jbd2_journal_destroy_revoke_table_cache(void);
extern int __init jbd2_journal_init_revoke_record_cache(void);
extern int __init jbd2_journal_init_revoke_table_cache(void);

extern void           jbd2_journal_destroy_revoke(journal_t *);
extern int           jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
extern int           jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
extern void           jbd2_journal_write_revoke_records(transaction_t *transaction,
                                                     struct list_head *log_bufs);

/* Recovery revoke support */
extern int        jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
extern int        jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t);
extern void        jbd2_journal_clear_revoke(journal_t *);
extern void        jbd2_journal_switch_revoke_table(journal_t *journal);
extern void        jbd2_clear_buffer_revoked_flags(journal_t *journal);

/*
 * The log thread user interface:
 *
 * Request space in the current transaction, and force transaction commit
 * transitions on demand.
 */

int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);

void __jbd2_log_wait_for_space(journal_t *journal);
extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
extern int jbd2_cleanup_journal_tail(journal_t *);

/* Fast commit related APIs */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid);
int jbd2_fc_end_commit(journal_t *journal);
int jbd2_fc_end_commit_fallback(journal_t *journal);
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal);

/*
 * is_journal_aborted
 *
 * Simple test wrapper function to test the JBD2_ABORT state flag.  This
 * bit, when set, indicates that we have had a fatal error somewhere,
 * either inside the journaling layer or indicated to us by the client
 * (e.g. ext3), and that we should not commit any further
 * transactions.
 */

/*
 * Return the JBD2_ABORT bit of journal->j_flags: non-zero when the bit is
 * set, zero otherwise (the value is the masked bit, not normalised to 1).
 */
static inline int is_journal_aborted(journal_t *journal)
{
        int aborted = journal->j_flags & JBD2_ABORT;

        return aborted;
}

/*
 * A handle counts as aborted when its own h_aborted flag is set, when it
 * has no transaction attached, or when the journal behind its transaction
 * is aborted.  The checks run in that order.
 */
static inline int is_handle_aborted(handle_t *handle)
{
        if (handle->h_aborted)
                return 1;

        if (!handle->h_transaction)
                return 1;

        return is_journal_aborted(handle->h_transaction->t_journal);
}

/* Mark @handle as aborted by setting its h_aborted flag. */
static inline void jbd2_journal_abort_handle(handle_t *handle)
{
        handle->h_aborted = 1;
}

#endif /* __KERNEL__   */

/* Comparison functions for transaction IDs: perform comparisons using
 * modulo arithmetic so that they work over sequence number wraps. */

/* Non-zero when x is after y: the difference x - y, taken as an int, is > 0. */
static inline int tid_gt(tid_t x, tid_t y)
{
        int delta = x - y;

        return delta > 0;
}

/* Non-zero when x is y or after it: x - y, taken as an int, is >= 0. */
static inline int tid_geq(tid_t x, tid_t y)
{
        int delta = x - y;

        return delta >= 0;
}

extern int jbd2_journal_blocks_per_page(struct inode *inode);
extern size_t journal_tag_bytes(journal_t *journal);

/* True when the journal has either the csum2 or the csum3 feature set. */
static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j)
{
        if (jbd2_has_feature_csum2(j))
                return true;

        return jbd2_has_feature_csum3(j);
}

/*
 * Report whether a checksum driver is attached to the journal (1) or not
 * (0).  Warns once if a v2/v3 checksum feature is set while
 * j_chksum_driver is still NULL.
 */
static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
{
        const bool feature_set = jbd2_journal_has_csum_v2or3_feature(journal);

        WARN_ON_ONCE(feature_set && !journal->j_chksum_driver);

        return !!journal->j_chksum_driver;
}

/*
 * Return the number of free blocks in the log, clamped at zero.
 * Must be called under j_state_lock.
 *
 * The result is j_free minus a 32-block safety margin, further reduced by
 * the outstanding credits of the committing transaction when there is one.
 */
static inline unsigned long jbd2_log_space_left(journal_t *journal)
{
        transaction_t *committing = journal->j_committing_transaction;
        /* Allow for rounding errors */
        long space = journal->j_free - 32;

        if (committing)
                space -= atomic_read(&committing->t_outstanding_credits);

        return max_t(long, space, 0);
}

/*
 * Definitions which augment the buffer_head layer
 */

/* journaling buffer types */
#define BJ_None                0        /* Not journaled */
#define BJ_Metadata        1        /* Normal journaled metadata */
#define BJ_Forget        2        /* Buffer superseded by this transaction */
#define BJ_Shadow        3        /* Buffer contents being shadowed to the log */
#define BJ_Reserved        4        /* Buffer is reserved for access by journal */
#define BJ_Types        5

extern int jbd_blocks_per_page(struct inode *inode);

/* JBD uses a CRC32 checksum */
#define JBD_MAX_CHECKSUM_SIZE 4

/*
 * Feed @length bytes at @address into the journal's checksum driver,
 * starting from @crc, and return the updated 32-bit value.
 *
 * The seed is stored in the ctx bytes that follow the shash descriptor
 * before crypto_shash_update() runs, and the result is read back from the
 * same bytes afterwards.  NOTE(review): this presumably relies on the
 * driver keeping its state directly behind struct shash_desc -- confirm
 * against the crypto API.  BUGs if the driver's descsize exceeds
 * JBD_MAX_CHECKSUM_SIZE or if the update fails.
 */
static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[JBD_MAX_CHECKSUM_SIZE];
        } sdesc;
        u32 *state = (u32 *)sdesc.ctx;
        int rc;

        BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
                JBD_MAX_CHECKSUM_SIZE);

        sdesc.shash.tfm = journal->j_chksum_driver;
        *state = crc;

        rc = crypto_shash_update(&sdesc.shash, address, length);
        BUG_ON(rc);

        return *state;
}

/*
 * Return most recent uncommitted transaction: the running transaction's
 * tid when there is one, otherwise j_commit_request.  Both are sampled
 * under the j_state_lock read lock.
 */
static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
{
        transaction_t *running;
        tid_t latest;

        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        latest = running ? running->t_tid : journal->j_commit_request;
        read_unlock(&journal->j_state_lock);

        return latest;
}

/*
 * Return the handle's total credits minus the blocks needed for its
 * requested revoke credits (rounded up to whole blocks using the
 * journal's j_revoke_records_per_block).
 *
 * The journal is taken from h_journal for a reserved handle and from the
 * handle's transaction otherwise.
 */
static inline int jbd2_handle_buffer_credits(handle_t *handle)
{
        journal_t *journal = handle->h_reserved ?
                                handle->h_journal :
                                handle->h_transaction->t_journal;

        return handle->h_total_credits -
                DIV_ROUND_UP(handle->h_revoke_credits_requested,
                             journal->j_revoke_records_per_block);
}

#ifdef __KERNEL__

#define buffer_trace_init(bh)        do {} while (0)
#define print_buffer_fields(bh)        do {} while (0)
#define print_buffer_trace(bh)        do {} while (0)
#define BUFFER_TRACE(bh, info)        do {} while (0)
#define BUFFER_TRACE2(bh, bh2, info)        do {} while (0)
#define JBUFFER_TRACE(jh, info)        do {} while (0)

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _LINUX_JBD2_H */
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CFG80211_RDEV_OPS
#define __CFG80211_RDEV_OPS

#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "core.h"
#include "trace.h"

/*
 * Traced wrapper around rdev->ops->suspend(); returns the op's result.
 * The op is called without a NULL check.
 */
static inline int rdev_suspend(struct cfg80211_registered_device *rdev,
                               struct cfg80211_wowlan *wowlan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_suspend(wiphy, wowlan);
        err = rdev->ops->suspend(wiphy, wowlan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->resume(); returns the op's result. */
static inline int rdev_resume(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_resume(wiphy);
        err = rdev->ops->resume(wiphy);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->set_wakeup(); the op returns nothing. */
static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev,
                                   bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_set_wakeup(wiphy, enabled);
        rdev->ops->set_wakeup(wiphy, enabled);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->add_virtual_intf(); returns the
 * wireless_dev pointer produced by the op unchanged.
 */
static inline struct wireless_dev
*rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name,
                       unsigned char name_assign_type,
                       enum nl80211_iftype type,
                       struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        struct wireless_dev *wdev;

        trace_rdev_add_virtual_intf(wiphy, name, type);
        wdev = rdev->ops->add_virtual_intf(wiphy, name, name_assign_type,
                                           type, params);
        trace_rdev_return_wdev(wiphy, wdev);

        return wdev;
}

/* Traced wrapper around rdev->ops->del_virtual_intf(); returns its result. */
static inline int
rdev_del_virtual_intf(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_virtual_intf(wiphy, wdev);
        err = rdev->ops->del_virtual_intf(wiphy, wdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->change_virtual_intf(); returns its
 * result.  Only @dev and @type are passed to the entry tracepoint.
 */
static inline int
rdev_change_virtual_intf(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, enum nl80211_iftype type,
                         struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_virtual_intf(wiphy, dev, type);
        err = rdev->ops->change_virtual_intf(wiphy, dev, type, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->add_key(); returns its result.
 * @params is dereferenced (params->mode) for the entry tracepoint, so it
 * must not be NULL.
 */
static inline int rdev_add_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, u8 key_index,
                               bool pairwise, const u8 *mac_addr,
                               struct key_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_key(wiphy, netdev, key_index, pairwise,
                           mac_addr, params->mode);
        err = rdev->ops->add_key(wiphy, netdev, key_index, pairwise,
                                 mac_addr, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_key(); returns its result.
 * @cookie and @callback are handed to the op untouched.
 */
static inline int
rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev,
             u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie,
             void (*callback)(void *cookie, struct key_params*))
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_key(wiphy, netdev, key_index, pairwise, mac_addr);
        err = rdev->ops->get_key(wiphy, netdev, key_index, pairwise,
                                 mac_addr, cookie, callback);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->del_key(); returns the op's result. */
static inline int rdev_del_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, u8 key_index,
                               bool pairwise, const u8 *mac_addr)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_key(wiphy, netdev, key_index, pairwise, mac_addr);
        err = rdev->ops->del_key(wiphy, netdev, key_index, pairwise,
                                 mac_addr);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->set_default_key(); returns its result. */
static inline int
rdev_set_default_key(struct cfg80211_registered_device *rdev,
                     struct net_device *netdev, u8 key_index, bool unicast,
                     bool multicast)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_key(wiphy, netdev, key_index,
                                   unicast, multicast);
        err = rdev->ops->set_default_key(wiphy, netdev, key_index,
                                         unicast, multicast);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_default_mgmt_key(); returns its
 * result.
 */
static inline int
rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev,
                          struct net_device *netdev, u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_mgmt_key(wiphy, netdev, key_index);
        err = rdev->ops->set_default_mgmt_key(wiphy, netdev, key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_default_beacon_key(); returns its
 * result.
 */
static inline int
rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev,
                            struct net_device *netdev, u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_beacon_key(wiphy, netdev, key_index);
        err = rdev->ops->set_default_beacon_key(wiphy, netdev, key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->start_ap(); returns the op's result. */
static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct cfg80211_ap_settings *settings)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_ap(wiphy, dev, settings);
        err = rdev->ops->start_ap(wiphy, dev, settings);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->change_beacon(); returns its result. */
static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev,
                                     struct cfg80211_beacon_data *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_beacon(wiphy, dev, info);
        err = rdev->ops->change_beacon(wiphy, dev, info);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->stop_ap(); returns the op's result. */
static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev,
                               struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_stop_ap(wiphy, dev);
        err = rdev->ops->stop_ap(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->add_station(); returns its result. */
static inline int rdev_add_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, u8 *mac,
                                   struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_station(wiphy, dev, mac, params);
        err = rdev->ops->add_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->del_station(); returns its result. */
static inline int rdev_del_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct station_del_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_station(wiphy, dev, params);
        err = rdev->ops->del_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->change_station(); returns its result. */
static inline int rdev_change_station(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev, u8 *mac,
                                      struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_station(wiphy, dev, mac, params);
        err = rdev->ops->change_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_station(); returns its result.
 * The exit tracepoint receives both the return code and @sinfo,
 * whatever the return code is.
 */
static inline int rdev_get_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, const u8 *mac,
                                   struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_station(wiphy, dev, mac);
        err = rdev->ops->get_station(wiphy, dev, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/*
 * Traced wrapper around rdev->ops->dump_station(); returns its result.
 * The exit tracepoint receives both the return code and @sinfo.
 */
static inline int rdev_dump_station(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, int idx, u8 *mac,
                                    struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_station(wiphy, dev, idx, mac);
        err = rdev->ops->dump_station(wiphy, dev, idx, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/* Traced wrapper around rdev->ops->add_mpath(); returns the op's result. */
static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->add_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->del_mpath(); returns the op's result. */
static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_mpath(wiphy, dev, dst);
        err = rdev->ops->del_mpath(wiphy, dev, dst);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->change_mpath(); returns its result. */
static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, u8 *dst,
                                    u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->change_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_mpath(); returns its result.
 * The exit tracepoint receives both the return code and @pinfo.
 */
static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop,
                                 struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->get_mpath(wiphy, dev, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_mpp(); returns its result.
 * The exit tracepoint receives both the return code and @pinfo.
 */
static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, u8 *dst, u8 *mpp,
                               struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpp(wiphy, dev, dst, mpp);
        err = rdev->ops->get_mpp(wiphy, dev, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Traced wrapper around rdev->ops->dump_mpath(); returns its result.
 * The exit tracepoint receives both the return code and @pinfo.
 */
static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev, int idx, u8 *dst,
                                  u8 *next_hop, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpath(wiphy, dev, idx, dst, next_hop);
        err = rdev->ops->dump_mpath(wiphy, dev, idx, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Traced wrapper around rdev->ops->dump_mpp(); returns its result.
 * The exit tracepoint receives both the return code and @pinfo.
 */
static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, int idx, u8 *dst,
                                u8 *mpp, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpp(wiphy, dev, idx, dst, mpp);
        err = rdev->ops->dump_mpp(wiphy, dev, idx, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_mesh_config(); returns its result.
 * The exit tracepoint receives both the return code and @conf.
 */
static inline int
rdev_get_mesh_config(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, struct mesh_config *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mesh_config(wiphy, dev);
        err = rdev->ops->get_mesh_config(wiphy, dev, conf);
        trace_rdev_return_int_mesh_config(wiphy, err, conf);

        return err;
}

/*
 * Traced wrapper around rdev->ops->update_mesh_config(); returns its
 * result.
 */
static inline int
rdev_update_mesh_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 mask,
                        const struct mesh_config *nconf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_mesh_config(wiphy, dev, mask, nconf);
        err = rdev->ops->update_mesh_config(wiphy, dev, mask, nconf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Traced wrapper around rdev->ops->join_mesh(); returns the op's result. */
static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 const struct mesh_config *conf,
                                 const struct mesh_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_mesh(wiphy, dev, conf, setup);
        err = rdev->ops->join_mesh(wiphy, dev, conf, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}


/*
 * Traced wrapper around rdev->ops->leave_mesh().
 *
 * Emits the rdev_leave_mesh tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_leave_mesh(struct cfg80211_registered_device *rdev,
                struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_mesh(wiphy, dev);
        err = rdev->ops->leave_mesh(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->join_ocb().
 *
 * Emits the rdev_join_ocb tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_join_ocb(struct cfg80211_registered_device *rdev,
              struct net_device *dev,
              struct ocb_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ocb(wiphy, dev, setup);
        err = rdev->ops->join_ocb(wiphy, dev, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->leave_ocb().
 *
 * Emits the rdev_leave_ocb tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_leave_ocb(struct cfg80211_registered_device *rdev,
               struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ocb(wiphy, dev);
        err = rdev->ops->leave_ocb(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->change_bss().
 *
 * Emits the rdev_change_bss tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_change_bss(struct cfg80211_registered_device *rdev,
                struct net_device *dev,
                struct bss_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_bss(wiphy, dev, params);
        err = rdev->ops->change_bss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_txq_params().
 *
 * Emits the rdev_set_txq_params tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_txq_params(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    struct ieee80211_txq_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_txq_params(wiphy, dev, params);
        err = rdev->ops->set_txq_params(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->libertas_set_mesh_channel().
 *
 * Emits the rdev_libertas_set_mesh_channel tracepoint, invokes the op
 * (no NULL check is done here) and records the result through
 * rdev_return_int.  The op's return value is passed through unchanged.
 */
static inline int
rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct ieee80211_channel *chan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_libertas_set_mesh_channel(wiphy, dev, chan);
        err = rdev->ops->libertas_set_mesh_channel(wiphy, dev, chan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_monitor_channel().
 *
 * Emits the rdev_set_monitor_channel tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_monitor_channel(struct cfg80211_registered_device *rdev,
                         struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_monitor_channel(wiphy, chandef);
        err = rdev->ops->set_monitor_channel(wiphy, chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->scan().
 *
 * Emits the rdev_scan tracepoint, invokes the op (no NULL check is done
 * here) and records the result through rdev_return_int.  The op's return
 * value is passed through unchanged.
 */
static inline int
rdev_scan(struct cfg80211_registered_device *rdev,
          struct cfg80211_scan_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_scan(wiphy, request);
        err = rdev->ops->scan(wiphy, request);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->abort_scan().
 *
 * Emits the rdev_abort_scan tracepoint, invokes the op (no NULL check is
 * done here) and then emits rdev_return_void.
 */
static inline void
rdev_abort_scan(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_abort_scan(wiphy, wdev);
        rdev->ops->abort_scan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->auth().
 *
 * Emits the rdev_auth tracepoint, invokes the op (no NULL check is done
 * here) and records the result through rdev_return_int.  The op's return
 * value is passed through unchanged.
 */
static inline int
rdev_auth(struct cfg80211_registered_device *rdev,
          struct net_device *dev,
          struct cfg80211_auth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_auth(wiphy, dev, req);
        err = rdev->ops->auth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->assoc().
 *
 * Emits the rdev_assoc tracepoint, invokes the op (no NULL check is done
 * here) and records the result through rdev_return_int.  The op's return
 * value is passed through unchanged.
 */
static inline int
rdev_assoc(struct cfg80211_registered_device *rdev,
           struct net_device *dev,
           struct cfg80211_assoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_assoc(wiphy, dev, req);
        err = rdev->ops->assoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->deauth().
 *
 * Emits the rdev_deauth tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_deauth(struct cfg80211_registered_device *rdev,
            struct net_device *dev,
            struct cfg80211_deauth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_deauth(wiphy, dev, req);
        err = rdev->ops->deauth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->disassoc().
 *
 * Emits the rdev_disassoc tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_disassoc(struct cfg80211_registered_device *rdev,
              struct net_device *dev,
              struct cfg80211_disassoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disassoc(wiphy, dev, req);
        err = rdev->ops->disassoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->connect().
 *
 * Emits the rdev_connect tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_connect(struct cfg80211_registered_device *rdev,
             struct net_device *dev,
             struct cfg80211_connect_params *sme)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_connect(wiphy, dev, sme);
        err = rdev->ops->connect(wiphy, dev, sme);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->update_connect_params().
 *
 * Emits the rdev_update_connect_params tracepoint, invokes the op (no
 * NULL check is done here) and records the result through
 * rdev_return_int.  The op's return value is passed through unchanged.
 */
static inline int
rdev_update_connect_params(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_connect_params *sme, u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_connect_params(wiphy, dev, sme, changed);
        err = rdev->ops->update_connect_params(wiphy, dev, sme, changed);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->disconnect().
 *
 * Emits the rdev_disconnect tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_disconnect(struct cfg80211_registered_device *rdev,
                struct net_device *dev, u16 reason_code)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disconnect(wiphy, dev, reason_code);
        err = rdev->ops->disconnect(wiphy, dev, reason_code);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->join_ibss().
 *
 * Emits the rdev_join_ibss tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_join_ibss(struct cfg80211_registered_device *rdev,
               struct net_device *dev,
               struct cfg80211_ibss_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ibss(wiphy, dev, params);
        err = rdev->ops->join_ibss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->leave_ibss().
 *
 * Emits the rdev_leave_ibss tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_leave_ibss(struct cfg80211_registered_device *rdev,
                struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ibss(wiphy, dev);
        err = rdev->ops->leave_ibss(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->set_wiphy_params() op.
 *
 * When the op is not provided, -EOPNOTSUPP is returned and nothing is
 * traced.  Otherwise the rdev_set_wiphy_params tracepoint is emitted, the
 * op is invoked and its result is recorded through rdev_return_int and
 * returned.
 */
static inline int
rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err = -EOPNOTSUPP;

        if (rdev->ops->set_wiphy_params) {
                trace_rdev_set_wiphy_params(wiphy, changed);
                err = rdev->ops->set_wiphy_params(wiphy, changed);
                trace_rdev_return_int(wiphy, err);
        }

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_tx_power().
 *
 * Emits the rdev_set_tx_power tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_set_tx_power(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev,
                  enum nl80211_tx_power_setting type, int mbm)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_tx_power(wiphy, wdev, type, mbm);
        err = rdev->ops->set_tx_power(wiphy, wdev, type, mbm);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_tx_power - traced wrapper around rdev->ops->get_tx_power().
 * @rdev: registered device whose op is invoked (the op is not NULL-checked
 *        here)
 * @wdev: wireless device handed through to the op
 * @dbm: output location handed through to the op
 *
 * Emits the rdev_get_tx_power tracepoint and invokes the op.  *@dbm is
 * only included in the return trace when the op returned 0: on a non-zero
 * return the op may not have stored anything there, so only the return
 * code is traced (the same split rdev_get_antenna() uses) rather than
 * reading a possibly unwritten value.
 *
 * Returns the op's return value unchanged.
 */
static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, int *dbm)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int ret;

        trace_rdev_get_tx_power(wiphy, wdev);
        ret = rdev->ops->get_tx_power(wiphy, wdev, dbm);
        if (ret)
                trace_rdev_return_int(wiphy, ret);
        else
                trace_rdev_return_int_int(wiphy, ret, *dbm);

        return ret;
}

/*
 * Traced wrapper around rdev->ops->set_wds_peer().
 *
 * Emits the rdev_set_wds_peer tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
                  struct net_device *dev, const u8 *addr)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_wds_peer(wiphy, dev, addr);
        err = rdev->ops->set_wds_peer(wiphy, dev, addr);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_multicast_to_unicast().
 *
 * Emits the rdev_set_multicast_to_unicast tracepoint, invokes the op (no
 * NULL check is done here) and records the result through
 * rdev_return_int.  The op's return value is passed through unchanged.
 */
static inline int
rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
                              struct net_device *dev,
                              const bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_multicast_to_unicast(wiphy, dev, enabled);
        err = rdev->ops->set_multicast_to_unicast(wiphy, dev, enabled);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_txq_stats().
 *
 * Emits the rdev_get_txq_stats tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   struct cfg80211_txq_stats *txqstats)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_txq_stats(wiphy, wdev);
        err = rdev->ops->get_txq_stats(wiphy, wdev, txqstats);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->rfkill_poll().
 *
 * Emits the rdev_rfkill_poll tracepoint, invokes the op (no NULL check
 * is done here) and then emits rdev_return_void.
 */
static inline void
rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_rfkill_poll(wiphy);
        rdev->ops->rfkill_poll(wiphy);
        trace_rdev_return_void(wiphy);
}


#ifdef CONFIG_NL80211_TESTMODE
/*
 * Traced wrapper around rdev->ops->testmode_cmd().
 *
 * Emits the rdev_testmode_cmd tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_testmode_cmd(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev,
                  void *data, int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_testmode_cmd(wiphy, wdev);
        err = rdev->ops->testmode_cmd(wiphy, wdev, data, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->testmode_dump().
 *
 * Emits the rdev_testmode_dump tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_testmode_dump(struct cfg80211_registered_device *rdev,
                   struct sk_buff *skb,
                   struct netlink_callback *cb, void *data,
                   int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_testmode_dump(wiphy);
        err = rdev->ops->testmode_dump(wiphy, skb, cb, data, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}
#endif

/*
 * Traced wrapper around rdev->ops->set_bitrate_mask().
 *
 * Emits the rdev_set_bitrate_mask tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev,
                      struct net_device *dev, const u8 *peer,
                      const struct cfg80211_bitrate_mask *mask)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_bitrate_mask(wiphy, dev, peer, mask);
        err = rdev->ops->set_bitrate_mask(wiphy, dev, peer, mask);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->dump_survey().
 *
 * Emits the rdev_dump_survey tracepoint and invokes the op (no NULL
 * check is done here).  A non-negative result is traced together with
 * @info through rdev_return_int_survey_info; a negative result is traced
 * through rdev_return_int only.  The op's return value is passed through.
 */
static inline int
rdev_dump_survey(struct cfg80211_registered_device *rdev,
                 struct net_device *netdev, int idx,
                 struct survey_info *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_survey(wiphy, netdev, idx);
        err = rdev->ops->dump_survey(wiphy, netdev, idx, info);

        if (err >= 0)
                trace_rdev_return_int_survey_info(wiphy, err, info);
        else
                trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_pmksa().
 *
 * Emits the rdev_set_pmksa tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_set_pmksa(struct cfg80211_registered_device *rdev,
               struct net_device *netdev,
               struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_pmksa(wiphy, netdev, pmksa);
        err = rdev->ops->set_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->del_pmksa().
 *
 * Emits the rdev_del_pmksa tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_del_pmksa(struct cfg80211_registered_device *rdev,
               struct net_device *netdev,
               struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_pmksa(wiphy, netdev, pmksa);
        err = rdev->ops->del_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->flush_pmksa().
 *
 * Emits the rdev_flush_pmksa tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_flush_pmksa(struct cfg80211_registered_device *rdev,
                 struct net_device *netdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_flush_pmksa(wiphy, netdev);
        err = rdev->ops->flush_pmksa(wiphy, netdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_remain_on_channel - traced wrapper around
 * rdev->ops->remain_on_channel().
 * @rdev: registered device whose op is invoked (the op is not NULL-checked
 *        here)
 * @wdev: wireless device handed through to the op
 * @chan: channel handed through to the op
 * @duration: duration value handed through to the op
 * @cookie: output location handed through to the op
 *
 * Emits the rdev_remain_on_channel tracepoint and invokes the op.
 * *@cookie is only included in the return trace when the op returned 0:
 * on a non-zero return the op may not have stored a cookie, so only the
 * return code is traced instead of reading a possibly unwritten value.
 *
 * Returns the op's return value unchanged.
 */
static inline int
rdev_remain_on_channel(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev,
                       struct ieee80211_channel *chan,
                       unsigned int duration, u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int ret;

        trace_rdev_remain_on_channel(wiphy, wdev, chan, duration);
        ret = rdev->ops->remain_on_channel(wiphy, wdev, chan,
                                           duration, cookie);
        if (ret)
                trace_rdev_return_int(wiphy, ret);
        else
                trace_rdev_return_int_cookie(wiphy, ret, *cookie);

        return ret;
}

/*
 * Traced wrapper around rdev->ops->cancel_remain_on_channel().
 *
 * Emits the rdev_cancel_remain_on_channel tracepoint, invokes the op (no
 * NULL check is done here) and records the result through
 * rdev_return_int.  The op's return value is passed through unchanged.
 */
static inline int
rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_cancel_remain_on_channel(wiphy, wdev, cookie);
        err = rdev->ops->cancel_remain_on_channel(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_mgmt_tx - traced wrapper around rdev->ops->mgmt_tx().
 * @rdev: registered device whose op is invoked (the op is not NULL-checked
 *        here)
 * @wdev: wireless device handed through to the op
 * @params: transmit parameters handed through to the op
 * @cookie: output location handed through to the op
 *
 * Emits the rdev_mgmt_tx tracepoint and invokes the op.  *@cookie is only
 * included in the return trace when the op returned 0: on a non-zero
 * return the op may not have stored a cookie, so only the return code is
 * traced instead of reading a possibly unwritten value.
 *
 * Returns the op's return value unchanged.
 */
static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev,
                               struct wireless_dev *wdev,
                               struct cfg80211_mgmt_tx_params *params,
                               u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int ret;

        trace_rdev_mgmt_tx(wiphy, wdev, params);
        ret = rdev->ops->mgmt_tx(wiphy, wdev, params, cookie);
        if (ret)
                trace_rdev_return_int(wiphy, ret);
        else
                trace_rdev_return_int_cookie(wiphy, ret, *cookie);

        return ret;
}

/*
 * Traced wrapper around rdev->ops->tx_control_port().
 *
 * Emits the rdev_tx_control_port tracepoint and invokes the op (no NULL
 * check is done here).  @cookie may be NULL: in that case the result is
 * traced through rdev_return_int, otherwise *@cookie is traced alongside
 * it through rdev_return_int_cookie.  The op's return value is passed
 * through unchanged.
 */
static inline int
rdev_tx_control_port(struct cfg80211_registered_device *rdev,
                     struct net_device *dev,
                     const void *buf, size_t len,
                     const u8 *dest, __be16 proto,
                     const bool noencrypt, u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tx_control_port(wiphy, dev, buf, len,
                                   dest, proto, noencrypt);
        err = rdev->ops->tx_control_port(wiphy, dev, buf, len,
                                         dest, proto, noencrypt, cookie);

        if (!cookie)
                trace_rdev_return_int(wiphy, err);
        else
                trace_rdev_return_int_cookie(wiphy, err, *cookie);

        return err;
}

/*
 * Traced wrapper around rdev->ops->mgmt_tx_cancel_wait().
 *
 * Emits the rdev_mgmt_tx_cancel_wait tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev,
                         struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        err = rdev->ops->mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_power_mgmt().
 *
 * Emits the rdev_set_power_mgmt tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_power_mgmt(struct cfg80211_registered_device *rdev,
                    struct net_device *dev, bool enabled,
                    int timeout)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_power_mgmt(wiphy, dev, enabled, timeout);
        err = rdev->ops->set_power_mgmt(wiphy, dev, enabled, timeout);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_cqm_rssi_config().
 *
 * Emits the rdev_set_cqm_rssi_config tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, s32 rssi_thold, u32 rssi_hyst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_rssi_config(wiphy, dev, rssi_thold, rssi_hyst);
        err = rdev->ops->set_cqm_rssi_config(wiphy, dev, rssi_thold,
                                             rssi_hyst);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_cqm_rssi_range_config().
 *
 * Emits the rdev_set_cqm_rssi_range_config tracepoint, invokes the op
 * (no NULL check is done here) and records the result through
 * rdev_return_int.  The op's return value is passed through unchanged.
 */
static inline int
rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, s32 low, s32 high)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_rssi_range_config(wiphy, dev, low, high);
        err = rdev->ops->set_cqm_rssi_range_config(wiphy, dev, low, high);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_cqm_txe_config().
 *
 * Emits the rdev_set_cqm_txe_config tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 rate, u32 pkts, u32 intvl)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        err = rdev->ops->set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around the optional
 * rdev->ops->update_mgmt_frame_registrations() op.
 *
 * Annotated with might_sleep().  The entry tracepoint and
 * rdev_return_void are always emitted; the op itself is only invoked
 * when the driver provides it.
 */
static inline void
rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev,
                                     struct wireless_dev *wdev,
                                     struct mgmt_frame_regs *upd)
{
        struct wiphy *wiphy = &rdev->wiphy;

        might_sleep();

        trace_rdev_update_mgmt_frame_registrations(wiphy, wdev, upd);

        if (rdev->ops->update_mgmt_frame_registrations)
                rdev->ops->update_mgmt_frame_registrations(wiphy, wdev, upd);

        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->set_antenna().
 *
 * Emits the rdev_set_antenna tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_set_antenna(struct cfg80211_registered_device *rdev,
                 u32 tx_ant, u32 rx_ant)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_antenna(wiphy, tx_ant, rx_ant);
        err = rdev->ops->set_antenna(wiphy, tx_ant, rx_ant);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_antenna().
 *
 * Emits the rdev_get_antenna tracepoint and invokes the op (no NULL
 * check is done here).  When the op returns 0 the values stored in
 * *@tx_ant and *@rx_ant are traced through rdev_return_int_tx_rx; any
 * other result is traced through rdev_return_int only.  The op's return
 * value is passed through unchanged.
 */
static inline int
rdev_get_antenna(struct cfg80211_registered_device *rdev,
                 u32 *tx_ant, u32 *rx_ant)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_antenna(wiphy);
        err = rdev->ops->get_antenna(wiphy, tx_ant, rx_ant);

        if (!err)
                trace_rdev_return_int_tx_rx(wiphy, err, *tx_ant, *rx_ant);
        else
                trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->sched_scan_start().
 *
 * Emits the rdev_sched_scan_start tracepoint (carrying @request->reqid),
 * invokes the op (no NULL check is done here) and records the result
 * through rdev_return_int.  The op's return value is passed through.
 */
static inline int
rdev_sched_scan_start(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct cfg80211_sched_scan_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_sched_scan_start(wiphy, dev, request->reqid);
        err = rdev->ops->sched_scan_start(wiphy, dev, request);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->sched_scan_stop().
 *
 * Emits the rdev_sched_scan_stop tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_sched_scan_stop(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, u64 reqid)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_sched_scan_stop(wiphy, dev, reqid);
        err = rdev->ops->sched_scan_stop(wiphy, dev, reqid);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_rekey_data().
 *
 * Emits the rdev_set_rekey_data tracepoint (which is not passed @data),
 * invokes the op (no NULL check is done here) and records the result
 * through rdev_return_int.  The op's return value is passed through.
 */
static inline int
rdev_set_rekey_data(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    struct cfg80211_gtk_rekey_data *data)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_rekey_data(wiphy, dev);
        err = rdev->ops->set_rekey_data(wiphy, dev, data);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->tdls_mgmt().
 *
 * Emits the rdev_tdls_mgmt tracepoint with the full argument list,
 * invokes the op with the same arguments (no NULL check is done here)
 * and records the result through rdev_return_int.  The op's return value
 * is passed through unchanged.
 */
static inline int
rdev_tdls_mgmt(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 *peer,
               u8 action_code, u8 dialog_token,
               u16 status_code, u32 peer_capability,
               bool initiator, const u8 *buf, size_t len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_mgmt(wiphy, dev, peer, action_code,
                             dialog_token, status_code, peer_capability,
                             initiator, buf, len);
        err = rdev->ops->tdls_mgmt(wiphy, dev, peer, action_code,
                                   dialog_token, status_code,
                                   peer_capability, initiator, buf, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->tdls_oper().
 *
 * Emits the rdev_tdls_oper tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_tdls_oper(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 *peer,
               enum nl80211_tdls_operation oper)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_oper(wiphy, dev, peer, oper);
        err = rdev->ops->tdls_oper(wiphy, dev, peer, oper);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_probe_client - traced wrapper around rdev->ops->probe_client().
 * @rdev: registered device whose op is invoked (the op is not NULL-checked
 *        here)
 * @dev: network device handed through to the op
 * @peer: peer address handed through to the op
 * @cookie: output location handed through to the op
 *
 * Emits the rdev_probe_client tracepoint and invokes the op.  *@cookie is
 * only included in the return trace when the op returned 0: on a non-zero
 * return the op may not have stored a cookie, so only the return code is
 * traced instead of reading a possibly unwritten value.
 *
 * Returns the op's return value unchanged.
 */
static inline int rdev_probe_client(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, const u8 *peer,
                                    u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int ret;

        trace_rdev_probe_client(wiphy, dev, peer);
        ret = rdev->ops->probe_client(wiphy, dev, peer, cookie);
        if (ret)
                trace_rdev_return_int(wiphy, ret);
        else
                trace_rdev_return_int_cookie(wiphy, ret, *cookie);

        return ret;
}

/*
 * Traced wrapper around rdev->ops->set_noack_map().
 *
 * Emits the rdev_set_noack_map tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_set_noack_map(struct cfg80211_registered_device *rdev,
                   struct net_device *dev, u16 noack_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_noack_map(wiphy, dev, noack_map);
        err = rdev->ops->set_noack_map(wiphy, dev, noack_map);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->get_channel().
 *
 * Emits the rdev_get_channel tracepoint, invokes the op (no NULL check
 * is done here) and records the result together with @chandef through
 * rdev_return_chandef.  The op's return value is passed through.
 */
static inline int
rdev_get_channel(struct cfg80211_registered_device *rdev,
                 struct wireless_dev *wdev,
                 struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_channel(wiphy, wdev);
        err = rdev->ops->get_channel(wiphy, wdev, chandef);
        trace_rdev_return_chandef(wiphy, err, chandef);

        return err;
}

/*
 * Traced wrapper around rdev->ops->start_p2p_device().
 *
 * Emits the rdev_start_p2p_device tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_start_p2p_device(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_p2p_device(wiphy, wdev);
        err = rdev->ops->start_p2p_device(wiphy, wdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->stop_p2p_device().
 *
 * Emits the rdev_stop_p2p_device tracepoint, invokes the op (no NULL
 * check is done here) and then emits rdev_return_void.
 */
static inline void
rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
                     struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_p2p_device(wiphy, wdev);
        rdev->ops->stop_p2p_device(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->start_nan().
 *
 * Emits the rdev_start_nan tracepoint, invokes the op (no NULL check is
 * done here) and records the result through rdev_return_int.  The op's
 * return value is passed through unchanged.
 */
static inline int
rdev_start_nan(struct cfg80211_registered_device *rdev,
               struct wireless_dev *wdev,
               struct cfg80211_nan_conf *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_nan(wiphy, wdev, conf);
        err = rdev->ops->start_nan(wiphy, wdev, conf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->stop_nan().
 *
 * Emits the rdev_stop_nan tracepoint, invokes the op (no NULL check is
 * done here) and then emits rdev_return_void.
 */
static inline void
rdev_stop_nan(struct cfg80211_registered_device *rdev,
              struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_nan(wiphy, wdev);
        rdev->ops->stop_nan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->add_nan_func().
 *
 * Emits the rdev_add_nan_func tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_add_nan_func(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev,
                  struct cfg80211_nan_func *nan_func)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_nan_func(wiphy, wdev, nan_func);
        err = rdev->ops->add_nan_func(wiphy, wdev, nan_func);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->del_nan_func().
 *
 * Emits the rdev_del_nan_func tracepoint, invokes the op (no NULL check
 * is done here) and then emits rdev_return_void.
 */
static inline void
rdev_del_nan_func(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_del_nan_func(wiphy, wdev, cookie);
        rdev->ops->del_nan_func(wiphy, wdev, cookie);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around the optional rdev->ops->nan_change_conf() op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -ENOTSUPP.
 *
 * NOTE(review): several sibling wrappers in this file report a missing
 * op as -EOPNOTSUPP rather than -ENOTSUPP - confirm which is intended.
 */
static inline int
rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
                     struct wireless_dev *wdev,
                     struct cfg80211_nan_conf *conf, u32 changes)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err = -ENOTSUPP;

        trace_rdev_nan_change_conf(wiphy, wdev, conf, changes);

        if (rdev->ops->nan_change_conf)
                err = rdev->ops->nan_change_conf(wiphy, wdev, conf, changes);

        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_mac_acl().
 *
 * Emits the rdev_set_mac_acl tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
                 struct net_device *dev,
                 struct cfg80211_acl_data *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_mac_acl(wiphy, dev, params);
        err = rdev->ops->set_mac_acl(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->update_ft_ies().
 *
 * Emits the rdev_update_ft_ies tracepoint, invokes the op (no NULL check
 * is done here) and records the result through rdev_return_int.  The
 * op's return value is passed through unchanged.
 */
static inline int
rdev_update_ft_ies(struct cfg80211_registered_device *rdev,
                   struct net_device *dev,
                   struct cfg80211_update_ft_ies_params *ftie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_ft_ies(wiphy, dev, ftie);
        err = rdev->ops->update_ft_ies(wiphy, dev, ftie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->crit_proto_start().
 *
 * Emits the rdev_crit_proto_start tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_crit_proto_start(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev,
                      enum nl80211_crit_proto_id protocol,
                      u16 duration)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_crit_proto_start(wiphy, wdev, protocol, duration);
        err = rdev->ops->crit_proto_start(wiphy, wdev, protocol, duration);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->crit_proto_stop().
 *
 * Emits the rdev_crit_proto_stop tracepoint, invokes the op (no NULL
 * check is done here) and then emits rdev_return_void.
 */
static inline void
rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
                     struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_crit_proto_stop(wiphy, wdev);
        rdev->ops->crit_proto_stop(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around rdev->ops->channel_switch().
 *
 * Emits the rdev_channel_switch tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_channel_switch(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    struct cfg80211_csa_settings *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_channel_switch(wiphy, dev, params);
        err = rdev->ops->channel_switch(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->set_qos_map() op.
 *
 * When the op is not provided, -EOPNOTSUPP is returned and nothing is
 * traced.  Otherwise the rdev_set_qos_map tracepoint is emitted, the op
 * is invoked and its result is recorded through rdev_return_int and
 * returned.
 */
static inline int
rdev_set_qos_map(struct cfg80211_registered_device *rdev,
                 struct net_device *dev,
                 struct cfg80211_qos_map *qos_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        if (!rdev->ops->set_qos_map)
                return -EOPNOTSUPP;

        trace_rdev_set_qos_map(wiphy, dev, qos_map);
        err = rdev->ops->set_qos_map(wiphy, dev, qos_map);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->set_ap_chanwidth().
 *
 * Emits the rdev_set_ap_chanwidth tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev,
                      struct net_device *dev, struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_ap_chanwidth(wiphy, dev, chandef);
        err = rdev->ops->set_ap_chanwidth(wiphy, dev, chandef);
        trace_rdev_return_int(wiphy, err);
        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->add_tx_ts() op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -EOPNOTSUPP.
 */
static inline int
rdev_add_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer,
               u8 user_prio, u16 admitted_time)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_tx_ts(wiphy, dev, tsid, peer,
                             user_prio, admitted_time);

        if (!rdev->ops->add_tx_ts)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->add_tx_ts(wiphy, dev, tsid, peer,
                                           user_prio, admitted_time);

        trace_rdev_return_int(wiphy, err);
        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->del_tx_ts() op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -EOPNOTSUPP.
 */
static inline int
rdev_del_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_tx_ts(wiphy, dev, tsid, peer);

        if (!rdev->ops->del_tx_ts)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->del_tx_ts(wiphy, dev, tsid, peer);

        trace_rdev_return_int(wiphy, err);
        return err;
}

/*
 * Traced wrapper around rdev->ops->tdls_channel_switch().
 *
 * Emits the rdev_tdls_channel_switch tracepoint, invokes the op (no NULL
 * check is done here) and records the result through rdev_return_int.
 * The op's return value is passed through unchanged.
 */
static inline int
rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *addr,
                         u8 oper_class, struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_channel_switch(wiphy, dev, addr, oper_class,
                                       chandef);
        err = rdev->ops->tdls_channel_switch(wiphy, dev, addr,
                                             oper_class, chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around rdev->ops->tdls_cancel_channel_switch().
 *
 * Emits the rdev_tdls_cancel_channel_switch tracepoint, invokes the op
 * (no NULL check is done here) and then emits rdev_return_void.
 */
static inline void
rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, const u8 *addr)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_tdls_cancel_channel_switch(wiphy, dev, addr);
        rdev->ops->tdls_cancel_channel_switch(wiphy, dev, addr);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around the optional rdev->ops->start_radar_detection()
 * op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -ENOTSUPP.
 *
 * NOTE(review): several sibling wrappers in this file report a missing
 * op as -EOPNOTSUPP rather than -ENOTSUPP - confirm which is intended.
 */
static inline int
rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_chan_def *chandef,
                           u32 cac_time_ms)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_radar_detection(wiphy, dev, chandef, cac_time_ms);

        if (!rdev->ops->start_radar_detection)
                err = -ENOTSUPP;
        else
                err = rdev->ops->start_radar_detection(wiphy, dev, chandef,
                                                       cac_time_ms);

        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->end_cac() op.
 *
 * The rdev_end_cac tracepoint and rdev_return_void are always emitted;
 * the op itself is only invoked when the driver provides it.
 */
static inline void
rdev_end_cac(struct cfg80211_registered_device *rdev,
             struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_end_cac(wiphy, dev);

        if (rdev->ops->end_cac)
                rdev->ops->end_cac(wiphy, dev);

        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper around the optional rdev->ops->set_mcast_rate() op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -ENOTSUPP.
 *
 * NOTE(review): several sibling wrappers in this file report a missing
 * op as -EOPNOTSUPP rather than -ENOTSUPP - confirm which is intended.
 */
static inline int
rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    int mcast_rate[NUM_NL80211_BANDS])
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_mcast_rate(wiphy, dev, mcast_rate);

        if (!rdev->ops->set_mcast_rate)
                err = -ENOTSUPP;
        else
                err = rdev->ops->set_mcast_rate(wiphy, dev, mcast_rate);

        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper around the optional rdev->ops->set_coalesce() op.
 *
 * The entry tracepoint and rdev_return_int are always emitted.  When the
 * driver provides the op its return value is passed through; otherwise
 * the result is -ENOTSUPP.
 *
 * NOTE(review): several sibling wrappers in this file report a missing
 * op as -EOPNOTSUPP rather than -ENOTSUPP - confirm which is intended.
 */
static inline int
rdev_set_coalesce(struct cfg80211_registered_device *rdev,
                  struct cfg80211_coalesce *coalesce)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_coalesce(wiphy, coalesce);

        if (!rdev->ops->set_coalesce)
                err = -ENOTSUPP;
        else
                err = rdev->ops->set_coalesce(wiphy, coalesce);

        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Pass @pmk_conf to the driver's set_pmk op, with entry/return tracing.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct cfg80211_pmk_conf *pmk_conf)
{
        int err;

        trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf);
        err = rdev->ops->set_pmk ?
                rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf) :
                -EOPNOTSUPP;
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Call the driver's del_pmk op for address @aa, with entry/return tracing.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, const u8 *aa)
{
        int err;

        trace_rdev_del_pmk(&rdev->wiphy, dev, aa);
        if (!rdev->ops->del_pmk)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->del_pmk(&rdev->wiphy, dev, aa);
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Pass @params to the driver's external_auth op, with entry/return
 * tracing.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int
rdev_external_auth(struct cfg80211_registered_device *rdev,
                   struct net_device *dev,
                   struct cfg80211_external_auth_params *params)
{
        int err;

        trace_rdev_external_auth(&rdev->wiphy, dev, params);
        err = rdev->ops->external_auth ?
                rdev->ops->external_auth(&rdev->wiphy, dev, params) :
                -EOPNOTSUPP;
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Call the driver's get_ftm_responder_stats op with @ftm_stats, with
 * entry/return tracing.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int
rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_ftm_responder_stats *ftm_stats)
{
        int err;

        trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats);
        if (!rdev->ops->get_ftm_responder_stats)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev,
                                                         ftm_stats);
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Pass @request to the driver's start_pmsr op.  The entry tracepoint
 * records request->cookie, so @request is dereferenced even when the
 * driver has no start_pmsr op.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int
rdev_start_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        int err;

        trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie);
        err = rdev->ops->start_pmsr ?
                rdev->ops->start_pmsr(&rdev->wiphy, wdev, request) :
                -EOPNOTSUPP;
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Call the driver's optional abort_pmsr op for @request.  The entry
 * tracepoint records request->cookie; both tracepoints fire whether or
 * not the driver implements the op.
 */
static inline void
rdev_abort_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie);
        if (!rdev->ops->abort_pmsr) {
                /* Nothing to call; still emit the return tracepoint. */
                trace_rdev_return_void(&rdev->wiphy);
                return;
        }
        rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request);
        trace_rdev_return_void(&rdev->wiphy);
}

/*
 * Pass @oweinfo to the driver's update_owe_info op, with entry/return
 * tracing.
 *
 * Returns the driver's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev,
                                       struct cfg80211_update_owe_info *oweinfo)
{
        int err;

        trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo);
        if (!rdev->ops->update_owe_info)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo);
        trace_rdev_return_int(&rdev->wiphy, err);
        return err;
}

/*
 * Call the driver's probe_mesh_link op with @buf/@len, with entry/return
 * tracing.  @dest is only recorded by the entry tracepoint; it is not
 * forwarded to the driver op.
 *
 * The op pointer is not checked here, so callers must only use this
 * wrapper when the driver provides probe_mesh_link.
 *
 * Returns the driver's result.
 */
static inline int
rdev_probe_mesh_link(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, const u8 *dest,
                     const void *buf, size_t len)
{
        int status;

        trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len);

        status = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len);

        trace_rdev_return_int(&rdev->wiphy, status);
        return status;
}

/*
 * Pass @tid_conf to the driver's set_tid_config op, with entry/return
 * tracing.
 *
 * The op pointer is not checked here, so callers must only use this
 * wrapper when the driver provides set_tid_config.
 *
 * Returns the driver's result.
 */
static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_tid_config *tid_conf)
{
        int status;

        trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf);

        status = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf);

        trace_rdev_return_int(&rdev->wiphy, status);
        return status;
}

/*
 * Call the driver's reset_tid_config op for @peer and the @tids value,
 * with entry/return tracing.
 *
 * The op pointer is not checked here, so callers must only use this
 * wrapper when the driver provides reset_tid_config.
 *
 * Returns the driver's result.
 */
static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev,
                                        struct net_device *dev, const u8 *peer,
                                        u8 tids)
{
        int status;

        trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids);

        status = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids);

        trace_rdev_return_int(&rdev->wiphy, status);
        return status;
}

#endif /* __CFG80211_RDEV_OPS */






































































































































































































    1 


























   12 























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/signal_types.h>
#include <linux/string.h>

struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/* Copy the whole of *@from into *@to, byte for byte. */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        const size_t nbytes = sizeof(*to);

        memcpy(to, from, nbytes);
}

/* Zero every byte of *@info. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        const size_t nbytes = sizeof(*info);

        memset(info, 0, nbytes);
}

#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the external siginfo *@to from the kernel-internal *@from:
 * copy sizeof(*from) bytes, then zero the SI_EXPANSION_SIZE bytes that
 * start sizeof(struct kernel_siginfo) bytes into *@to.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        char *tail = ((char *)to) + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(tail, 0, SI_EXPANSION_SIZE);
}

int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Classification returned by siginfo_layout() for a (sig, si_code) pair.
 * NOTE(review): each value presumably selects which siginfo fields are
 * meaningful for that signal -- confirm against siginfo_layout()'s
 * definition, which is not part of this header.
 */
enum siginfo_layout {
        SIL_KILL,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/* We don't use <linux/bitops.h> for these because there is no need to
   be atomic.  */
/*
 * Set the bit for signal @_sig in @set (non-atomically).  Signal numbers
 * are 1-based: signal 1 is bit 0 of word 0.
 */
static inline void sigaddset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
                return;
        }
        set->sig[0] |= 1UL << bit;
}

/*
 * Clear the bit for signal @_sig in @set (non-atomically).  Signal
 * numbers are 1-based: signal 1 is bit 0 of word 0.
 */
static inline void sigdelset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
                return;
        }
        set->sig[0] &= ~(1UL << bit);
}

/*
 * Return 1 if signal @_sig is in @set, otherwise 0.  Signal numbers are
 * 1-based: signal 1 is bit 0 of word 0.
 */
static inline int sigismember(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        return (_NSIG_WORDS == 1)
                ? 1 & (set->sig[0] >> bit)
                : 1 & (set->sig[bit / _NSIG_BPW] >> (bit % _NSIG_BPW));
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

/*
 * Return 1 if no bit is set in any word of @set, otherwise 0.
 * Only _NSIG_WORDS values of 1, 2 and 4 are handled; anything else is
 * rejected at build time.
 */
static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                return !set->sig[0];
        case 2:
                return !(set->sig[1] | set->sig[0]);
        case 4:
                return !(set->sig[3] | set->sig[2] |
                         set->sig[1] | set->sig[0]);
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * Return 1 if @set1 and @set2 hold identical words, otherwise 0.
 * For a _NSIG_WORDS value other than 1, 2 or 4 the result is 0.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        switch (_NSIG_WORDS) {
        case 1:
                return set1->sig[0] == set2->sig[0];
        case 2:
                if (set1->sig[1] != set2->sig[1])
                        return 0;
                return set1->sig[0] == set2->sig[0];
        case 4:
                if (set1->sig[3] != set2->sig[3] ||
                    set1->sig[2] != set2->sig[2] ||
                    set1->sig[1] != set2->sig[1])
                        return 0;
                return set1->sig[0] == set2->sig[0];
        }
        return 0;
}

/*
 * Bit for signal @sig within a single unsigned long word.  Signal numbers
 * are 1-based, so signal 1 maps to bit 0.
 */
#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS
#include <linux/string.h>

/*
 * _SIG_SET_BINOP(name, op) generates
 *   static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b)
 * which stores op(a word, b word) into each word of @r.  Both operand
 * words are loaded into locals before the result word is written.
 * Only _NSIG_WORDS values of 1, 2 and 4 are handled (higher cases fall
 * through to the lower words); anything else is rejected at build time.
 */
#define _SIG_SET_BINOP(name, op)                                        \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{                                                                        \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3;                        \
                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                a3 = a->sig[3]; a2 = a->sig[2];                                \
                b3 = b->sig[3]; b2 = b->sig[2];                                \
                r->sig[3] = op(a3, b3);                                        \
                r->sig[2] = op(a2, b2);                                        \
                fallthrough;                                                \
        case 2:                                                                \
                a1 = a->sig[1]; b1 = b->sig[1];                                \
                r->sig[1] = op(a1, b1);                                        \
                fallthrough;                                                \
        case 1:                                                                \
                a0 = a->sig[0]; b0 = b->sig[0];                                \
                r->sig[0] = op(a0, b0);                                        \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* sigorsets(r, a, b): r = a | b, word by word. */
#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

/* sigandsets(r, a, b): r = a & b, word by word. */
#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

/* sigandnsets(r, a, b): r = a & ~b, word by word. */
#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

/* The generator and its operator helpers are private to this header. */
#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * _SIG_SET_OP(name, op) generates
 *   static inline void name(sigset_t *set)
 * which replaces every word of @set with op(word), in place.
 * Only _NSIG_WORDS values of 1, 2 and 4 are handled (higher cases fall
 * through to the lower words); anything else is rejected at build time.
 */
#define _SIG_SET_OP(name, op)                                                \
static inline void name(sigset_t *set)                                        \
{                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:        set->sig[3] = op(set->sig[3]);                                \
                set->sig[2] = op(set->sig[2]);                                \
                fallthrough;                                                \
        case 2:        set->sig[1] = op(set->sig[1]);                                \
                fallthrough;                                                \
        case 1:        set->sig[0] = op(set->sig[0]);                                \
                    break;                                                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* signotset(set): complement every word of @set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

/* The generator and its operator helper are private to this header. */
#undef _SIG_SET_OP
#undef _sig_not

/*
 * Clear every signal in @set.  One- and two-word sets are stored to
 * directly; any other size goes through memset().
 */
static inline void sigemptyset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = 0;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                set->sig[0] = 0;
        } else {
                memset(set, 0, sizeof(sigset_t));
        }
}

/*
 * Set every bit in @set.  One- and two-word sets are stored to
 * directly; any other size goes through memset().
 */
static inline void sigfillset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = -1;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                set->sig[0] = -1;
        } else {
                memset(set, -1, sizeof(sigset_t));
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* OR @mask into the first word of @set; other words are untouched. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] | mask;
}

/* Clear the bits of @mask in the first word of @set; other words are untouched. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] & ~mask;
}

/* Return 1 if any bit of @mask is set in the first word of @set, else 0. */
static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        return !!(set->sig[0] & mask);
}

/*
 * Initialise @set so that its first word equals @mask and every
 * remaining word is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        if (_NSIG_WORDS == 1)
                return;
        if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                return;
        }
        memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
}

/*
 * Initialise @set so that its first word equals ~@mask and every
 * remaining word has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        if (_NSIG_WORDS == 1)
                return;
        if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                return;
        }
        memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

/* Initialise @sig: an empty list head and an empty pending-signal set. */
static inline void init_sigpending(struct sigpending *sig)
{
        INIT_LIST_HEAD(&sig->list);
        sigemptyset(&sig->signal);
}

extern void flush_sigqueue(struct sigpending *queue);

/* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
/*
 * Return 1 when @sig is no greater than _NSIG, otherwise 0.  Zero is
 * accepted; @sig is unsigned, so there is no negative case.
 */
static inline int valid_signal(unsigned long sig)
{
        if (sig > _NSIG)
                return 0;
        return 1;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

/*
 * Handler values passed to kernel_sigaction() by allow_signal()
 * (SIG_KTHREAD) and allow_kernel_signal() (SIG_KTHREAD_KERNEL).
 */
#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

/*
 * Kernel threads handle their own signals.  Installing SIG_KTHREAD for
 * @sig tells the signal code the signal will be handled, so it is not
 * converted to SIGKILL or silently dropped.
 */
static inline void allow_signal(int sig)
{
        kernel_sigaction(sig, SIG_KTHREAD);
}

/*
 * Kernel threads handle their own signals.  Installing
 * SIG_KTHREAD_KERNEL for @sig tells the signal code that signals sent
 * by the kernel will be handled, so they are not silently dropped.
 */
static inline void allow_kernel_signal(int sig)
{
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

/* Set the action for @sig to SIG_IGN via kernel_sigaction(). */
static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

/*
 * SIGEMT only exists on some architectures; where it is not defined its
 * mask contribution is 0.  (rt_sigmask is defined just below; this is
 * fine because macros are expanded at the point of use.)
 */
#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

/*
 * rt_sigmask(sig): bit for @sig, computed in unsigned long long when
 * SIGRTMIN exceeds BITS_PER_LONG, otherwise the same as sigmask(sig).
 */
#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

/*
 * siginmask(sig, mask): true when 0 < @sig < SIGRTMIN and @sig's bit is
 * set in @mask.  @sig is evaluated more than once, so it must not have
 * side effects.
 */
#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

#define SIG_KERNEL_ONLY_MASK (\
        rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))

#define SIG_KERNEL_STOP_MASK (\
        rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
        rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )

#define SIG_KERNEL_COREDUMP_MASK (\
        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
        rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
        SIGEMT_MASK                                       )

#define SIG_KERNEL_IGNORE_MASK (\
        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )

#define SIG_SPECIFIC_SICODES_MASK (\
        rt_sigmask(SIGILL)    |  rt_sigmask(SIGFPE)    | \
        rt_sigmask(SIGSEGV)   |  rt_sigmask(SIGBUS)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGPOLL)   |  rt_sigmask(SIGSYS)    | \
        SIGEMT_MASK                                    )

#define sig_kernel_only(sig)                siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)        siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)                siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)                siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig)        siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

/*
 * sig_fatal(t, signr): true when @signr is in neither the ignore nor the
 * stop class and task @t's handler for it is SIG_DFL.  @signr is
 * evaluated more than once, so it must not have side effects.
 */
#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * unsafe_save_altstack - store current's alternate signal stack settings
 * @uss:   user-space stack_t to fill in
 * @sp:    unused by this macro
 * @label: label that unsafe_put_user() is given as its error target
 *
 * Writes current's sas_ss_sp, sas_ss_flags and sas_ss_size to @uss with
 * unsafe_put_user(), then calls sas_ss_reset() on current if
 * SS_AUTODISARM is set in its sas_ss_flags.
 *
 * The expansion deliberately has no trailing semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be
 * used in an unbraced if/else.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = uss; \
        struct task_struct *t = current; \
        unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
        if (t->sas_ss_flags & SS_AUTODISARM) \
                sas_ss_reset(t); \
} while (0)

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#endif /* _LINUX_SIGNAL_H */

































    3 


















    2 



















































































    1 


































    1 
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * timer_class - event class shared by the timer events that record only
 * the timer pointer (printed as "timer=%p").  The pointer is stored but
 * not dereferenced.
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Render timer flag bits as a "|"-separated string of one-letter tags:
 * M (TIMER_MIGRATING), D (TIMER_DEFERRABLE), P (TIMER_PINNED) and
 * I (TIMER_IRQSAFE).
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:        pointer to struct timer_list
 * @expires:        the timers expiry time
 * @flags:        the timers flags
 *
 * Also records timer->function and the current jiffies value.  The
 * output prints the timeout as expires - now, the cpu as
 * flags & TIMER_CPUMASK, the idx as flags >> TIMER_ARRAYSHIFT, and the
 * decoded flag letters.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long expires,
                unsigned int flags),

        TP_ARGS(timer, expires, flags),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = expires;
                __entry->now                = jiffies;
                __entry->flags                = flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        base clock value supplied by the caller, recorded
 *                alongside the current jiffies value
 *
 * Allows to determine the timer latency.
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Print a clock id symbolically.  Names are provided for
 * CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI.
 */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * Map an enum hrtimer_mode value to a short string for TP_printk().
 * The table is matched on the whole value through __print_symbolic(), so
 * every supported ABS/REL x PINNED x SOFT/HARD combination is listed
 * explicitly rather than being decoded flag by flag.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_init - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimer's clock
 * @mode:        the hrtimer's mode
 *
 * The clock and mode are stored as raw values and turned into names by
 * decode_clockid() and decode_hrtimer_mode() when the event is printed.
 */
TRACE_EVENT(hrtimer_init,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        /* Only the pointer value is kept; the hrtimer is not dereferenced. */
        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer: pointer to struct hrtimer
 * @mode: the hrtimer's mode
 *
 * Records the callback plus the expiry and soft expiry times obtained from
 * hrtimer_get_expires() and hrtimer_get_softexpires(). @mode is printed by
 * name through decode_hrtimer_mode().
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = hrtimer->function;
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        /* The s64 expiry fields are cast to unsigned long long to match %llu. */
        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows to determine the timer latency.
 *
 * @now is dereferenced when the event is recorded, so the trace record
 * holds the time value itself rather than the pointer.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                /* Copy the time by value; the pointer is not stored. */
                __entry->now                = *now;
                __entry->function        = hrtimer->function;
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * Event template shared by the hrtimer events that only need to identify
 * the timer: the record holds nothing but the hrtimer pointer value, and
 * the pointer is never dereferenced here.
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * Instantiated from hrtimer_class, so only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * Instantiated from hrtimer_class, so only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimers value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimers expiry time
 *
 * The seconds and nanoseconds of both it_value and it_interval are copied
 * out of @value when the event is recorded. On output the nanosecond parts
 * are divided by NSEC_PER_USEC and printed as a six-digit fraction.
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        /* The timespec members are stored in fields of type long. */
        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * The struct pid is converted to a numeric pid with pid_nr() when the
 * event is recorded; the pointer itself is not stored.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * X-macro list of the tick dependency names. It is expanded twice below
 * with different definitions of tick_dep_mask_name(), tick_dep_name() and
 * tick_dep_name_end(): first to emit TRACE_DEFINE_ENUM() for every bit and
 * mask constant, then to build the { mask, "name" } table consumed by
 * __print_symbolic() in show_tick_dep_name().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/* The MASK will convert to their bits and they need to be processed too */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

/* First expansion: export the enum values. */
TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second set of definitions: each entry becomes a { value, string } pair.
 * The _end variant omits the trailing comma so the list terminates cleanly.
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Translate a TICK_DEP_MASK_* value into its name for TP_printk(). */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/*
 * tick_stop - records a success flag together with a tick dependency value.
 * The dependency is stored as an int and printed by name through
 * show_tick_dep_name(), i.e. it is matched against the TICK_DEP_MASK_*
 * entries of TICK_DEP_NAMES.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        TP_printk("success=%d dependency=%s",  __entry->success, \
                        show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

































































































































































































































































































































    1 
























































































    8 























    8 
    8 






































   14 






























    8 

    8 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_INTERNAL_H
#define _ASM_X86_FPU_INTERNAL_H

#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>

#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>

/*
 * High level FPU state handling functions:
 */
extern void fpu__prepare_read(struct fpu *fpu);
extern void fpu__prepare_write(struct fpu *fpu);
extern void fpu__save(struct fpu *fpu);
extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
extern void fpu__clear_all(struct fpu *fpu);
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);

/*
 * Boot time FPU initialization functions:
 */
extern void fpu__init_cpu(void);
extern void fpu__init_system_xstate(void);
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
extern u64 fpu__get_supported_xfeatures_mask(void);

/*
 * Debugging facility:
 *
 * With CONFIG_X86_DEBUG_FPU enabled WARN_ON_FPU(x) is WARN_ON_ONCE(x).
 * Otherwise the argument is still evaluated (so side effects are kept and
 * the variable counts as used) but the expression always yields 0 and no
 * warning is emitted.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif

/*
 * FPU related CPU feature flag helper routines:
 */
/* True when static_cpu_has() reports X86_FEATURE_XSAVEOPT. */
static __always_inline __pure bool use_xsaveopt(void)
{
        return static_cpu_has(X86_FEATURE_XSAVEOPT);
}

/* True when static_cpu_has() reports X86_FEATURE_XSAVE. */
static __always_inline __pure bool use_xsave(void)
{
        return static_cpu_has(X86_FEATURE_XSAVE);
}

/* True when static_cpu_has() reports X86_FEATURE_FXSR. */
static __always_inline __pure bool use_fxsr(void)
{
        return static_cpu_has(X86_FEATURE_FXSR);
}

/*
 * fpstate handling functions:
 */

extern union fpregs_state init_fpstate;

extern void fpstate_init(union fpregs_state *state);
/*
 * fpstate_init_soft() is only implemented (elsewhere) when
 * CONFIG_MATH_EMULATION is set; otherwise it is an empty inline stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/*
 * Prepare the header of an XSAVE area: xcomp_bv is set to the
 * compacted-format flag ORed with xfeatures_mask_all. No other field of
 * @xsave is written here.
 */
static inline void fpstate_init_xstate(struct xregs_state *xsave)
{
        /*
         * XRSTORS requires these bits set in xcomp_bv, or it will
         * trigger #GP:
         */
        xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
}

/*
 * Seed an FXSAVE image with its initial control values: the x87 control
 * word and MXCSR. Only these two fields of @fx are written here.
 *
 * NOTE(review): 0x37f looks like the x87 control word that FNINIT
 * establishes - confirm against the SDM before relying on that.
 */
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
{
        fx->cwd = 0x37f;
        fx->mxcsr = MXCSR_DEFAULT;
}
extern void fpstate_sanitize_xstate(struct fpu *fpu);

/*
 * Execute a single instruction that accesses user memory.
 *
 * Returns 0 or the negated trap number, which results in -EFAULT for #PF
 *
 * @insn is pasted into the asm template verbatim (stringified), @output and
 * @input are the asm operand lists. The instruction is bracketed by
 * ASM_STAC/ASM_CLAC. The result lives in %eax, which is preloaded with 0
 * through the "0"(0) input; if the instruction at label 1 raises an
 * exception, the fixup at label 3 negates %eax and resumes at label 2, so
 * ASM_CLAC still runs on the error path.
 */
#define user_insn(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
                                                                        \
        might_fault();                                                        \
                                                                        \
        asm volatile(ASM_STAC "\n"                                        \
                     "1: " #insn "\n"                                        \
                     "2: " ASM_CLAC "\n"                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  negl %%eax\n"                                        \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Execute a single instruction on kernel memory and report failure as a
 * value instead of letting the exception propagate.
 *
 * Evaluates to 0 when the instruction completes (err is preloaded with 0
 * through the "0"(0) input) and to -1 when the instruction at label 1
 * raises an exception: the fixup at label 3 stores -1 and resumes at
 * label 2.
 */
#define kernel_insn_err(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  movl $-1,%[err]\n"                                \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE(1b, 3b)                                \
                     : [err] "=r" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Execute a single instruction on kernel memory without producing an error
 * value. An exception at label 1 is routed to ex_handler_fprestore with
 * label 2 as the resume address; what that handler does is defined outside
 * this header.
 */
#define kernel_insn(insn, output, input...)                                \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)        \
                     : output : input)

/*
 * Store the legacy x87 state into the user buffer @fx with FNSAVE followed
 * by FWAIT. Returns the user_insn() result: 0, or the negated trap number
 * if the access faulted.
 */
static inline int copy_fregs_to_user(struct fregs_state __user *fx)
{
        return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Write the FXSAVE-format register image to the user buffer @fx.
 * 32-bit builds issue FXSAVE, all other builds FXSAVEQ. The return value
 * is that of user_insn(): 0, or the negated trap number on an exception.
 */
static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));

        return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Load the FXSAVE-format image at @fx into the FPU registers through
 * kernel_insn(): FXRSTOR on 32-bit builds, FXRSTORQ otherwise.
 */
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
                return;
        }

        kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Same restore as copy_kernel_to_fxregs(), but built on kernel_insn_err():
 * the result is 0 on success and -1 if the restore instruction raised an
 * exception.
 */
static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU registers from the FXSAVE-format image in the user buffer
 * @fx: FXRSTOR on 32-bit builds, FXRSTORQ otherwise. Returns the
 * user_insn() result: 0, or the negated trap number on an exception.
 */
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the legacy x87 state from the kernel buffer @fx with FRSTOR, using
 * kernel_insn() (no error value is produced).
 */
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
{
        kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * FRSTOR from the kernel buffer @fx through kernel_insn_err(): returns 0 on
 * success and -1 if the instruction raised an exception.
 */
static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
{
        return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * FRSTOR from the user buffer @fx through user_insn(): returns 0, or the
 * negated trap number if the access raised an exception.
 */
static inline int copy_user_to_fregs(struct fregs_state __user *fx)
{
        return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Save the live registers into fpu->state.fxsave: FXSAVE on 32-bit builds,
 * FXSAVEQ otherwise. There is no exception fixup on this path.
 */
static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
                return;
        }

        asm volatile("fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
}

/*
 * Save the live registers into the caller-supplied buffer @fx: FXSAVE on
 * 32-bit builds, FXSAVEQ otherwise.
 */
static inline void fxsave(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
                return;
        }

        asm volatile("fxsave %[fx]" : [fx] "=m" (*fx));
}

/*
 * These macros all use (%edi)/(%rdi) as the single memory argument.
 *
 * Each one is an instruction spelled out as raw .byte values, prefixed by
 * REX_PREFIX (defined outside this header), so users must place the buffer
 * address in the "D" register - see the XSTATE_* macros below.
 */
#define XSAVE                ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT        ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVES                ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR                ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"

/*
 * After this @err contains 0 on success or the negated trap number when
 * the operation raises an exception. For faults this results in -EFAULT.
 *
 * @op is one of the opcode strings above, @st the buffer (passed in the
 * "D" register and as an "m" operand), @lmask/@hmask the low and high 32
 * bits of the feature mask in %eax/%edx.
 *
 * %eax serves as both the @lmask input and the @err output: on success the
 * xor after @op clears it, on an exception the fixup at label 3 negates
 * %eax and resumes at label 2, skipping the xor.
 *
 * The template refers to %[err] literally while the operand name is
 * generated from the @err macro argument, so the argument has to be a
 * variable that is actually named err.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n\t"                                                \
                     ".pushsection .fixup,\"ax\"\n\t"                        \
                     "3: negl %%eax\n\t"                                \
                     "jmp 2b\n\t"                                        \
                     ".popsection\n\t"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
 * format and supervisor states in addition to modified optimization in
 * XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * We use XSAVE as a fallback.
 *
 * The 661 label is defined in the ALTERNATIVE* macros as the address of the
 * original instruction which gets replaced. We need to use it here as the
 * address of the instruction where we might get an exception at.
 *
 * On completion @err is 0; if the instruction raises an exception the
 * fixup at label 4 stores -2 in @err and resumes at label 3. As the
 * template uses %[err] literally, the @err argument has to be a variable
 * that is actually named err.
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile(ALTERNATIVE_2(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     ".pushsection .fixup,\"ax\"\n"                        \
                     "4: movl $-2, %[err]\n"                                \
                     "jmp 3b\n"                                                \
                     ".popsection\n"                                        \
                     _ASM_EXTABLE(661b, 4b)                                \
                     : [err] "=r" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 *
 * XRSTOR is the default and is replaced by XRSTORS when X86_FEATURE_XSAVES
 * is set. No error value is produced: an exception at the instruction
 * (label 661 from ALTERNATIVE) is routed to ex_handler_fprestore, with
 * label 3 as the resume address.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile(ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
                     :                                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * This function is called only during boot time when x86 caps are not set
 * up and alternative can not be used yet.
 */
static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
{
        u64 mask = -1;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        WARN_ON(system_state != SYSTEM_BOOTING);

        if (boot_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

        /*
         * We should never fault when copying from a kernel buffer, and the FPU
         * state we set at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * Save processor xstate to xsave area.
 */
static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
{
        u64 mask = xfeatures_mask_all;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        WARN_ON_FPU(!alternatives_patched);

        XSTATE_XSAVE(xstate, lmask, hmask, err);

        /* We should never fault when copying to a kernel buffer: */
        WARN_ON_FPU(err);
}

/*
 * Restore the processor xstate components selected by @mask from the
 * kernel xsave area @xstate.
 */
static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
        const u32 lo = (u32)mask;
        const u32 hi = (u32)(mask >> 32);

        XSTATE_XRESTORE(xstate, lo, hi);
}

/*
 * Save xstate to a user space xsave area.
 *
 * Plain XSAVE is used on purpose:
 *  - no modified optimization, because xrstor/xrstors might track a
 *    different application;
 *  - no compacted format, for backward compatibility with old
 *    applications which don't understand the compacted xsave layout.
 *
 * Returns -EFAULT if the header cannot be cleared, otherwise the
 * XSTATE_OP() result.
 */
static inline int copy_xregs_to_user(struct xregs_state __user *buf)
{
        const u64 user_features = xfeatures_mask_user();
        const u32 lo = (u32)user_features;
        const u32 hi = (u32)(user_features >> 32);
        int err;        /* must be named err: XSTATE_OP() uses %[err] */

        /* Zero the xsave header first so its reserved fields are initialized. */
        err = __clear_user(&buf->header, sizeof(buf->header));
        if (unlikely(err))
                return -EFAULT;

        stac();
        XSTATE_OP(XSAVE, buf, lo, hi, err);
        clac();

        return err;
}

/*
 * Restore the xstate components selected by @mask from a user space xsave
 * area. Returns the XSTATE_OP() result.
 */
static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
{
        /* Strip the __user annotation so the pointer can feed the asm operands. */
        struct xregs_state *xstate = (__force struct xregs_state *)buf;
        const u32 lo = (u32)mask;
        const u32 hi = (u32)(mask >> 32);
        int err;        /* must be named err: XSTATE_OP() uses %[err] */

        stac();
        XSTATE_OP(XRSTOR, xstate, lo, hi, err);
        clac();

        return err;
}

/*
 * Restore the xstate components selected by @mask from a kernel space xsave
 * area, reporting failure through the return value instead of an exception.
 * XRSTORS is used when X86_FEATURE_XSAVES is available, XRSTOR otherwise.
 */
static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
{
        const u32 lo = (u32)mask;
        const u32 hi = (u32)(mask >> 32);
        int err;        /* must be named err: XSTATE_OP() uses %[err] */

        if (!static_cpu_has(X86_FEATURE_XSAVES)) {
                XSTATE_OP(XRSTOR, xstate, lo, hi, err);
        } else {
                XSTATE_OP(XRSTORS, xstate, lo, hi, err);
        }

        return err;
}

extern int copy_fpregs_to_fpstate(struct fpu *fpu);

/*
 * Load the FPU registers from @fpstate using the most capable mechanism
 * the CPU offers: XSAVE-based restore (honouring @mask), else FXRSTOR,
 * else FRSTOR. @mask is only used on the XSAVE path.
 */
static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
        if (use_xsave()) {
                copy_kernel_to_xregs(&fpstate->xsave, mask);
                return;
        }

        if (use_fxsr()) {
                copy_kernel_to_fxregs(&fpstate->fxsave);
                return;
        }

        copy_kernel_to_fregs(&fpstate->fsave);
}

/*
 * Restore the complete register state from @fpstate (mask of -1, i.e. all
 * components), applying the X86_BUG_FXSAVE_LEAK workaround first on CPUs
 * that have the bug.
 */
static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
{
        /*
         * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
         * pending. Clear the x87 state here by setting it to fixed values.
         * "m" is a random variable that should be in L1.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                /* The operand is the local pointer variable itself, not *fpstate. */
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %P[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (fpstate));
        }

        __copy_kernel_to_fpregs(fpstate, -1);
}

extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);

/*
 * FPU context switch related helper methods:
 */

DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Either one of these invalidation functions is enough. Invalidate
 * a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
/*
 * CPU-side invalidation: clear this CPU's fpu_fpregs_owner_ctx so that no
 * FPU context compares equal to it in fpregs_state_valid(). Uses the
 * double-underscore __this_cpu_write() accessor.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

/*
 * FPU-side invalidation: set @fpu->last_cpu to -1, a value intended not to
 * match any CPU number in the last_cpu check of fpregs_state_valid().
 */
static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        fpu->last_cpu = -1;
}

/*
 * Report whether the registers of @cpu still hold the state of @fpu.
 * Both conditions must hold: this CPU's fpu_fpregs_owner_ctx is @fpu, and
 * @fpu->last_cpu is @cpu. Returns 1 if so, 0 otherwise.
 */
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        if (this_cpu_read(fpu_fpregs_owner_ctx) != fpu)
                return 0;

        return fpu->last_cpu == cpu;
}

/*
 * These generally need preemption protection to work,
 * do try to avoid using these on their own:
 */
/*
 * Clear this CPU's fpu_fpregs_owner_ctx and emit the
 * x86_fpu_regs_deactivated trace event for @fpu. @fpu is only used for the
 * trace event; the per-CPU owner is reset unconditionally.
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

/*
 * Record @fpu as the owner of this CPU's FPU registers in
 * fpu_fpregs_owner_ctx and emit the x86_fpu_regs_activated trace event.
 */
static inline void fpregs_activate(struct fpu *fpu)
{
        this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/*
 * Internal helper, do not use directly. Use switch_fpu_return() instead.
 *
 * Makes sure the current task's FPU state is in the registers: if the
 * registers are not already valid for it on this CPU, the state is loaded
 * from memory and ownership is recorded. TIF_NEED_FPU_LOAD is cleared in
 * either case. Kernel threads (PF_KTHREAD) trigger a one-time warning and
 * are left untouched.
 */
static inline void __fpregs_load_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;
        int cpu = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
                return;

        if (!fpregs_state_valid(fpu, cpu)) {
                /* Reload, then update both halves of the validity check. */
                copy_kernel_to_fpregs(&fpu->state);
                fpregs_activate(fpu);
                fpu->last_cpu = cpu;
        }
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * FPU state switching for scheduling.
 *
 * This is a two-stage process:
 *
 *  - switch_fpu_prepare() saves the old state.
 *    This is done within the context of the old process.
 *
 *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
 *    will get loaded on return to userspace, or when the kernel needs it.
 *
 * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
 * are saved in the current thread's FPU register state.
 *
 * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
 * hold current()'s FPU registers. It is required to load the
 * registers before returning to userland or using the content
 * otherwise.
 *
 * The FPU context is only stored/restored for a user task and
 * PF_KTHREAD is used to distinguish between kernel and user threads.
 */
static inline void switch_fpu_prepare(struct task_struct *prev, int cpu)
{
        struct fpu *prev_fpu = &prev->thread.fpu;

        /* Nothing to save without an FPU, and nothing for kernel threads. */
        if (!static_cpu_has(X86_FEATURE_FPU) || (prev->flags & PF_KTHREAD))
                return;

        /*
         * Save the registers into prev's state.  A zero result from the save
         * invalidates ->last_cpu (-1); otherwise @cpu is remembered as the
         * CPU the state was last used on.
         */
        prev_fpu->last_cpu = copy_fpregs_to_fpstate(prev_fpu) ? cpu : -1;

        /* But leave fpu_fpregs_owner_ctx! */
        trace_x86_fpu_regs_deactivated(prev_fpu);
}

/*
 * Misc helper functions:
 */

/*
 * Second half of the FPU context switch, run for the incoming task @next.
 *
 * Only PKRU is loaded here, taken from the FPU context if available.  The
 * complete FPU state is not restored; TIF_NEED_FPU_LOAD is set so that the
 * load is delayed until the return to userland.
 */
static inline void switch_fpu_finish(struct task_struct *next)
{
        struct fpu *fpu = &next->thread.fpu;
        struct pkru_state *saved;
        u32 pkru = init_pkru_value;

        if (!static_cpu_has(X86_FEATURE_FPU))
                return;

        set_thread_flag(TIF_NEED_FPU_LOAD);

        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return;

        /*
         * PKRU is switched eagerly: it has to be valid before we return to
         * userland, e.g. for a copy_to_user() operation.  Kernel threads
         * keep init_pkru_value.
         */
        if (!(next->flags & PF_KTHREAD)) {
                /*
                 * get_xsave_addr() returns NULL when the PKRU bit in
                 * xsave.header.xfeatures is clear, i.e. the component is in
                 * its init state and the value in memory is not valid.
                 * XRSTOR would set PKRU to 0 in that case, so use 0 here as
                 * well and not init_pkru_value.
                 */
                saved = get_xsave_addr(&fpu->state.xsave, XFEATURE_PKRU);
                if (saved)
                        pkru = saved->pkru;
                else
                        pkru = 0;
        }
        __write_pkru(pkru);
}

#endif /* _ASM_X86_FPU_INTERNAL_H */













































    1 
    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * generic net pointers
 */

#ifndef __NET_GENERIC_H__
#define __NET_GENERIC_H__

#include <linux/bug.h>
#include <linux/rcupdate.h>

/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive;
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Per-namespace table of generic pointers; net_generic() reads it as
 * ng->ptr[id].
 *
 * The bookkeeping header @s (length and RCU head) and the pointer array
 * @ptr share storage through the union, so the lowest ptr[] slots alias @s.
 * NOTE(review): presumably valid ids start past the slots covered by @s -
 * confirm against the code that hands out ids.
 */
struct net_generic {
        union {
                struct {
                        unsigned int len;
                        struct rcu_head rcu;
                } s;

                void *ptr[0];
        };
};

/*
 * Fetch the private pointer stored for @id in the generic pointer table of
 * @net.  The table is dereferenced under rcu_read_lock(); the slot value
 * itself is returned after the read-side section has ended.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
        void *priv;

        rcu_read_lock();
        priv = rcu_dereference(net->gen)->ptr[id];
        rcu_read_unlock();

        return priv;
}
#endif


















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
// SPDX-License-Identifier: GPL-2.0
/*
 * Common header file for probe-based Dynamic events.
 *
 * This code was copied from kernel/trace/trace_kprobe.h written by
 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
 *
 * Updates to make this generic:
 * Copyright (C) IBM Corporation, 2010-2011
 * Author:     Srikar Dronamraju
 */

#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ptrace.h>
#include <linux/perf_event.h>
#include <linux/kprobes.h>
#include <linux/stringify.h>
#include <linux/limits.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <asm/bitsperlong.h>

#include "trace.h"
#include "trace_output.h"

#define MAX_TRACE_ARGS                128
#define MAX_ARGSTR_LEN                63
#define MAX_ARRAY_LEN                64
#define MAX_ARG_NAME_LEN        32
#define MAX_STRING_SIZE                PATH_MAX

/* Reserved field names */
#define FIELD_STRING_IP                "__probe_ip"
#define FIELD_STRING_RETIP        "__probe_ret_ip"
#define FIELD_STRING_FUNC        "__probe_func"

/*
 * Register one member of the local 'field' object as an event field via
 * trace_define_field().  The macro relies on 'event_call', 'field' and 'ret'
 * being in scope at the expansion site and returns 'ret' from the
 * *enclosing function* as soon as the registration fails.
 */
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed)                        \
        do {                                                                \
                ret = trace_define_field(event_call, #type, name,        \
                                         offsetof(typeof(field), item),        \
                                         sizeof(field.item), is_signed, \
                                         FILTER_OTHER);                        \
                if (ret)                                                \
                        return ret;                                        \
        } while (0)


/* Flags for trace_probe */
#define TP_FLAG_TRACE                1
#define TP_FLAG_PROFILE                2

/*
 * data_loc: data location, compatible with u32.
 * The length lives in the upper 16 bits and the offset in the lower 16 bits;
 * get_loc_len()/get_loc_offs() extract the two halves again.
 */
#define make_data_loc(len, offs)        \
        (((u32)(len) << 16) | ((u32)(offs) & 0xffff))
#define get_loc_len(dl)                ((u32)(dl) >> 16)
#define get_loc_offs(dl)        ((u32)(dl) & 0xffff)

/* Turn a data_loc into a pointer: @ent plus the byte offset stored in *@dl. */
static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
{
        u8 *base = ent;

        return base + get_loc_offs(*dl);
}

/*
 * Advance a data_loc by @consumed bytes: the length half shrinks and the
 * offset half grows by the same amount.  Returns the re-packed data_loc.
 */
static nokprobe_inline u32 update_data_loc(u32 loc, int consumed)
{
        u32 remaining = get_loc_len(loc) - consumed;
        u32 new_offs = get_loc_offs(loc) + consumed;

        return make_data_loc(remaining, new_offs);
}

/* Printing function type */
typedef int (*print_type_func_t)(struct trace_seq *, void *, void *);

/*
 * Opcodes of the fetch instructions (struct fetch_insn).  They are grouped
 * into stages (load, dereference, store, modify, loop); the per-opcode
 * comment names the fetch_insn member that carries the operand.
 */
enum fetch_op {
        FETCH_OP_NOP = 0,
        // Stage 1 (load) ops
        FETCH_OP_REG,                /* Register : .param = offset */
        FETCH_OP_STACK,                /* Stack : .param = index */
        FETCH_OP_STACKP,        /* Stack pointer */
        FETCH_OP_RETVAL,        /* Return value */
        FETCH_OP_IMM,                /* Immediate : .immediate */
        FETCH_OP_COMM,                /* Current comm */
        FETCH_OP_ARG,                /* Function argument : .param */
        FETCH_OP_FOFFS,                /* File offset: .immediate */
        FETCH_OP_DATA,                /* Allocated data: .data */
        // Stage 2 (dereference) op
        FETCH_OP_DEREF,                /* Dereference: .offset */
        FETCH_OP_UDEREF,        /* User-space Dereference: .offset */
        // Stage 3 (store) ops
        FETCH_OP_ST_RAW,        /* Raw: .size */
        FETCH_OP_ST_MEM,        /* Mem: .offset, .size */
        FETCH_OP_ST_UMEM,        /* User mem: .offset, .size */
        FETCH_OP_ST_STRING,        /* String: .offset, .size */
        FETCH_OP_ST_USTRING,        /* User String: .offset, .size */
        // Stage 4 (modify) op
        FETCH_OP_MOD_BF,        /* Bitfield: .basesize, .lshift, .rshift */
        // Stage 5 (loop) op
        FETCH_OP_LP_ARRAY,        /* Array: .param = loop count */
        FETCH_OP_END,
        FETCH_NOP_SYMBOL,        /* Unresolved Symbol holder */
};

/*
 * One fetch instruction: an opcode plus its operand.  Which union member is
 * meaningful depends on @op; see the comments in enum fetch_op.
 */
struct fetch_insn {
        enum fetch_op op;
        union {
                /* Register offset, stack index, argument number, loop count */
                unsigned int param;
                /* Dereference and store operands */
                struct {
                        unsigned int size;
                        int offset;
                };
                /* Bitfield modification operands */
                struct {
                        unsigned char basesize;
                        unsigned char lshift;
                        unsigned char rshift;
                };
                /* Immediate value or file offset */
                unsigned long immediate;
                /* Allocated data */
                void *data;
        };
};

/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */
#define FETCH_INSN_MAX        16
#define FETCH_TOKEN_COMM        (-ECOMM)

/* Fetch type information table entry: describes one argument type */
struct fetch_type {
        const char                *name;                /* Name of type */
        size_t                        size;                /* Byte size of type */
        int                        is_signed;        /* Signed flag */
        print_type_func_t        print;                /* Print functions */
        const char                *fmt;                /* Format string */
        const char                *fmttype;        /* Name in format file */
};

/* For defining macros, define string/string_size types */
typedef u32 string;
typedef u32 string_size;

#define PRINT_TYPE_FUNC_NAME(type)        print_type_##type
#define PRINT_TYPE_FMT_NAME(type)        print_type_format_##type

/* Template declaring the print function and format string of a basic type */
#define DECLARE_BASIC_PRINT_TYPE_FUNC(type)                                \
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\
extern const char PRINT_TYPE_FMT_NAME(type)[]

DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
DECLARE_BASIC_PRINT_TYPE_FUNC(x8);
DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);

DECLARE_BASIC_PRINT_TYPE_FUNC(string);
DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);

/* Default (unsigned long) fetch type */
#define __DEFAULT_FETCH_TYPE(t) x##t
#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)

#define __ADDR_FETCH_TYPE(t) u##t
#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t)
#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG)

/*
 * Initializers for struct fetch_type table entries.
 *
 * ASSIGN_FETCH_TYPE(ptype, ftype, sign) is the entry point: the type name
 * and the format-file name are both the stringified @ptype, the size is
 * sizeof(@ftype), and the print function / format string are the
 * print_type_<ptype> / print_type_format_<ptype> symbols.  The two
 * underscore-prefixed helpers only exist to stringify @_fmttype in a
 * separate expansion step; @ftype is passed through them but not used by
 * __ASSIGN_FETCH_TYPE itself.
 */
#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)        \
        {.name = _name,                                \
         .size = _size,                                        \
         .is_signed = sign,                                \
         .print = PRINT_TYPE_FUNC_NAME(ptype),                \
         .fmt = PRINT_TYPE_FMT_NAME(ptype),                \
         .fmttype = _fmttype,                                \
        }
#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)        \
        __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype)
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)                        \
        _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype)

/* If ptype is an alias of atype, use this macro (show atype in format) */
#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign)                \
        _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype)

#define ASSIGN_FETCH_TYPE_END {}
#define MAX_ARRAY_LEN        64

/*
 * kprobe-event queries.  With CONFIG_KPROBE_EVENTS the real implementations
 * are provided elsewhere; without it the inline stubs below answer "false"
 * for every event call.
 */
#ifdef CONFIG_KPROBE_EVENTS
bool trace_kprobe_on_func_entry(struct trace_event_call *call);
bool trace_kprobe_error_injectable(struct trace_event_call *call);
#else
/* Stub: kprobe events are not built in. */
static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
        return false;
}

/* Stub: kprobe events are not built in. */
static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
        return false;
}
#endif /* CONFIG_KPROBE_EVENTS */

/* One argument of a probe event and how to fetch and print it */
struct probe_arg {
        struct fetch_insn        *code;        /* Fetch instructions for this argument */
        bool                        dynamic;/* Dynamic array (string) is used */
        unsigned int                offset;        /* Offset from argument entry */
        unsigned int                count;        /* Array count */
        const char                *name;        /* Name of this argument */
        const char                *comm;        /* Command of this argument */
        char                        *fmt;        /* Format string if needed */
        const struct fetch_type        *type;        /* Type of this argument */
};

/*
 * Filter state stored in the trailing filter[] array of
 * struct trace_probe_event.
 * NOTE(review): field meanings below are inferred from the names only -
 * confirm against the uprobe code that uses them.
 */
struct trace_uprobe_filter {
        rwlock_t                rwlock;                /* presumably guards the fields below */
        int                        nr_systemwide;        /* presumably a count of system-wide users */
        struct list_head        perf_events;        /* list head; presumably of perf events */
};

/*
 * Event call and class holder.
 * A struct trace_probe points at one of these through ->event.
 */
struct trace_probe_event {
        unsigned int                        flags;        /* For TP_FLAG_* */
        struct trace_event_class        class;
        struct trace_event_call                call;
        /* Walked by trace_probe_for_each_link() through the entries' ->list */
        struct list_head                 files;
        /* Probes attached to this event, linked via trace_probe.list */
        struct list_head                probes;
        /* Flexible array; NOTE(review): presumably sized at init time - confirm */
        struct trace_uprobe_filter        filter[];
};

/* A single probe; its event state lives in the trace_probe_event at ->event */
struct trace_probe {
        struct list_head                list;        /* Entry in event->probes */
        struct trace_probe_event        *event;
        ssize_t                                size;        /* trace entry size */
        unsigned int                        nr_args;
        struct probe_arg                args[];        /* Flexible array of arguments */
};

/*
 * List node tying a trace_event_file to a probe event; this is the type
 * returned by trace_probe_get_file_link().
 */
struct event_file_link {
        struct trace_event_file                *file;
        struct list_head                list;
};

/*
 * Read the TP_FLAG_* word of @tp's event with acquire semantics; this is
 * the reader side of the smp_store_release() in trace_probe_set_flag().
 */
static inline unsigned int trace_probe_load_flag(struct trace_probe *tp)
{
        return smp_load_acquire(&tp->event->flags);
}

/* Return true when at least one bit of @flag is set in @tp's event flags. */
static inline bool trace_probe_test_flag(struct trace_probe *tp,
                                         unsigned int flag)
{
        unsigned int current_flags = trace_probe_load_flag(tp);

        return (current_flags & flag) != 0;
}

/*
 * Set @flag in @tp's event flags.  The new flag word is published with
 * release semantics, matching the smp_load_acquire() in
 * trace_probe_load_flag().  The read-modify-write itself is not atomic.
 * NOTE(review): callers presumably serialize updates - confirm.
 */
static inline void trace_probe_set_flag(struct trace_probe *tp,
                                        unsigned int flag)
{
        smp_store_release(&tp->event->flags, tp->event->flags | flag);
}

/*
 * Clear @flag in @tp's event flags.  This is a plain, non-atomic update
 * with no barrier.
 */
static inline void trace_probe_clear_flag(struct trace_probe *tp,
                                          unsigned int flag)
{
        tp->event->flags &= ~flag;
}

/* A probe counts as enabled when TP_FLAG_TRACE or TP_FLAG_PROFILE is set. */
static inline bool trace_probe_is_enabled(struct trace_probe *tp)
{
        const unsigned int enabled_mask = TP_FLAG_TRACE | TP_FLAG_PROFILE;

        return trace_probe_test_flag(tp, enabled_mask);
}

/* Name of the event @tp belongs to, as reported by trace_event_name(). */
static inline const char *trace_probe_name(struct trace_probe *tp)
{
        struct trace_event_call *call = &tp->event->call;

        return trace_event_name(call);
}

/* Group name of @tp: the ->system string of its event call's class. */
static inline const char *trace_probe_group_name(struct trace_probe *tp)
{
        return tp->event->call.class->system;
}

/* Return the trace_event_call embedded in @tp's event. */
static inline struct trace_event_call *
        trace_probe_event_call(struct trace_probe *tp)
{
        return &tp->event->call;
}

/*
 * Map an event call back to the trace_probe_event that embeds it.
 * @event_call must be the ->call member of a struct trace_probe_event.
 */
static inline struct trace_probe_event *
trace_probe_event_from_call(struct trace_event_call *event_call)
{
        return container_of(event_call, struct trace_probe_event, call);
}

static inline struct trace_probe *
trace_probe_primary_from_call(struct trace_event_call *call)
{
        struct trace_probe_event *tpe = trace_probe_event_from_call(call);

        return list_first_entry_or_null(&tpe->probes, struct trace_probe, list);
}

/* Return the head of the probe list of the event @tp belongs to. */
static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp)
{
        return &tp->event->probes;
}

static inline bool trace_probe_has_sibling(struct trace_probe *tp)
{
        struct list_head *list = trace_probe_probe_list(tp);

        return !list_empty(list) && !list_is_singular(list);
}

/*
 * Unregister the event call of @tp.
 * Returns whatever trace_remove_event_call() returns.
 */
static inline int trace_probe_unregister_event_call(struct trace_probe *tp)
{
        /* tp->event is unregistered in trace_remove_event_call() */
        return trace_remove_event_call(&tp->event->call);
}

/* True when exactly one entry is linked on the event's ->files list. */
static inline bool trace_probe_has_single_file(struct trace_probe *tp)
{
        struct list_head *files = &tp->event->files;

        return list_is_singular(files) != 0;
}

int trace_probe_init(struct trace_probe *tp, const char *event,
                     const char *group, bool alloc_filter);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
int trace_probe_register_event_call(struct trace_probe *tp);
int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file);
int trace_probe_remove_file(struct trace_probe *tp,
                            struct trace_event_file *file);
struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
                                                struct trace_event_file *file);
int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
                                    int argc, const char **argv);

#define trace_probe_for_each_link(pos, tp)        \
        list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp)        \
        list_for_each_entry_rcu(pos, &(tp)->event->files, list)

#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
#define TPARG_FL_MASK        GENMASK(2, 0)

extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
                                char *arg, unsigned int flags);

extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);

extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
                                char *buf, int offset);

extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);

#ifdef CONFIG_PERF_EVENTS
extern struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
                          bool is_return);
extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);

extern struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs,
                          unsigned long ref_ctr_offset, bool is_return);
extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
#endif
extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
                                        size_t offset, struct trace_probe *tp);

/*
 * X-macro table of probe parse errors.  Every C(name, message) entry is
 * expanded through whatever C() is defined as at the point of use: below,
 * C() keeps only the name to build the TP_ERR_<name> enumerators.
 */
#undef ERRORS
#define ERRORS        \
        C(FILE_NOT_FOUND,        "Failed to find the given file"),        \
        C(NO_REGULAR_FILE,        "Not a regular file"),                        \
        C(BAD_REFCNT,                "Invalid reference counter offset"),        \
        C(REFCNT_OPEN_BRACE,        "Reference counter brace is not closed"), \
        C(BAD_REFCNT_SUFFIX,        "Reference counter has wrong suffix"),        \
        C(BAD_UPROBE_OFFS,        "Invalid uprobe offset"),                \
        C(MAXACT_NO_KPROBE,        "Maxactive is not for kprobe"),                \
        C(BAD_MAXACT,                "Invalid maxactive number"),                \
        C(MAXACT_TOO_BIG,        "Maxactive is too big"),                \
        C(BAD_PROBE_ADDR,        "Invalid probed address or symbol"),        \
        C(NON_UNIQ_SYMBOL,        "The symbol is not unique"),                \
        C(BAD_RETPROBE,                "Retprobe address must be an function entry"), \
        C(BAD_ADDR_SUFFIX,        "Invalid probed address suffix"), \
        C(NO_GROUP_NAME,        "Group name is not specified"),                \
        C(GROUP_TOO_LONG,        "Group name is too long"),                \
        C(BAD_GROUP_NAME,        "Group name must follow the same rules as C identifiers"), \
        C(NO_EVENT_NAME,        "Event name is not specified"),                \
        C(EVENT_TOO_LONG,        "Event name is too long"),                \
        C(BAD_EVENT_NAME,        "Event name must follow the same rules as C identifiers"), \
        C(EVENT_EXIST,                "Given group/event name is already used by another event"), \
        C(RETVAL_ON_PROBE,        "$retval is not available on probe"),        \
        C(BAD_STACK_NUM,        "Invalid stack number"),                \
        C(BAD_ARG_NUM,                "Invalid argument number"),                \
        C(BAD_VAR,                "Invalid $-valiable specified"),        \
        C(BAD_REG_NAME,                "Invalid register name"),                \
        C(BAD_MEM_ADDR,                "Invalid memory address"),                \
        C(BAD_IMM,                "Invalid immediate value"),                \
        C(IMMSTR_NO_CLOSE,        "String is not closed with '\"'"),        \
        C(FILE_ON_KPROBE,        "File offset is not available with kprobe"), \
        C(BAD_FILE_OFFS,        "Invalid file offset value"),                \
        C(SYM_ON_UPROBE,        "Symbol is not available with uprobe"),        \
        C(TOO_MANY_OPS,                "Dereference is too much nested"),         \
        C(DEREF_NEED_BRACE,        "Dereference needs a brace"),                \
        C(BAD_DEREF_OFFS,        "Invalid dereference offset"),                \
        C(DEREF_OPEN_BRACE,        "Dereference brace is not closed"),        \
        C(COMM_CANT_DEREF,        "$comm can not be dereferenced"),        \
        C(BAD_FETCH_ARG,        "Invalid fetch argument"),                \
        C(ARRAY_NO_CLOSE,        "Array is not closed"),                        \
        C(BAD_ARRAY_SUFFIX,        "Array has wrong suffix"),                \
        C(BAD_ARRAY_NUM,        "Invalid array size"),                        \
        C(ARRAY_TOO_BIG,        "Array number is too big"),                \
        C(BAD_TYPE,                "Unknown type is specified"),                \
        C(BAD_STRING,                "String accepts only memory argument"),        \
        C(BAD_BITFIELD,                "Invalid bitfield"),                        \
        C(ARG_NAME_TOO_LONG,        "Argument name is too long"),                \
        C(NO_ARG_NAME,                "Argument name is not specified"),        \
        C(BAD_ARG_NAME,                "Argument name must follow the same rules as C identifiers"), \
        C(USED_ARG_NAME,        "This argument name is already used"),        \
        C(ARG_TOO_LONG,                "Argument expression is too long"),        \
        C(NO_ARG_BODY,                "No argument expression"),                \
        C(BAD_INSN_BNDRY,        "Probe point is not an instruction boundary"),\
        C(FAIL_REG_PROBE,        "Failed to register probe event"),\
        C(DIFF_PROBE_TYPE,        "Probe type is different from existing probe"),\
        C(DIFF_ARG_TYPE,        "Argument type or name is different from existing probe"),\
        C(SAME_PROBE,                "There is already the exact same probe event"),

/* Keep only the identifier of each entry, prefixed with TP_ERR_ */
#undef C
#define C(a, b)                TP_ERR_##a

/* Define TP_ERR_ */
enum { ERRORS };

/* Error text is defined in trace_probe.c */

/*
 * Context for the trace_probe_log_*() error reporting helpers.
 * NOTE(review): field roles are inferred from the helper signatures
 * (trace_probe_log_init() takes subsystem/argc/argv,
 * trace_probe_log_set_index() takes index) - confirm in trace_probe.c.
 */
struct trace_probe_log {
        const char        *subsystem;
        const char        **argv;
        int                argc;
        int                index;
};

void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
void trace_probe_log_set_index(int index);
void trace_probe_log_clear(void);
void __trace_probe_log_err(int offset, int err);

#define trace_probe_log_err(offs, err)        \
        __trace_probe_log_err(offs, TP_ERR_##err)



































    4 
    4 

    4 








    4 





























    4 
    4 


































    1 






    1 




























    1 











































































    4 
    4 

    4 


    4 























    4 









    3 



    3 
    3 
    3 
    3 


    3 










































































    4 


    4 






    4 






























































































































































































    4 




    4 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the extensible bitmap type.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
/*
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support to import/export the NetLabel category bitmap
 *
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
 */
/*
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *      Applied standard bit operations to improve bitmap scanning.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <net/netlabel.h>
#include "ebitmap.h"
#include "policydb.h"

#define BITS_PER_U64        (sizeof(u64) * 8)

static struct kmem_cache *ebitmap_node_cachep;

/*
 * Compare two ebitmaps for equality.
 *
 * Returns 1 when both bitmaps have the same highbit and identical node
 * lists (same start bits, same map contents, same length), 0 otherwise.
 */
int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
{
        struct ebitmap_node *a, *b;

        if (e1->highbit != e2->highbit)
                return 0;

        /* Walk both node lists in lock step; stop at the first difference. */
        for (a = e1->node, b = e2->node; a && b; a = a->next, b = b->next) {
                if (a->startbit != b->startbit)
                        return 0;
                if (memcmp(a->maps, b->maps, EBITMAP_SIZE / 8))
                        return 0;
        }

        /* Equal only if both lists ended at the same time. */
        return !a && !b;
}

/*
 * Make @dst a copy of @src.
 *
 * @dst is (re)initialized first, then every node of @src is duplicated in
 * order.  Returns 0 on success.  If a node allocation fails, the partial
 * copy is destroyed and -ENOMEM is returned.
 */
int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src)
{
        /* Where the next copied node gets linked: list head, then ->next. */
        struct ebitmap_node **link = &dst->node;
        struct ebitmap_node *from, *copy;

        ebitmap_init(dst);
        for (from = src->node; from; from = from->next) {
                copy = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC);
                if (!copy) {
                        ebitmap_destroy(dst);
                        return -ENOMEM;
                }
                copy->startbit = from->startbit;
                memcpy(copy->maps, from->maps, EBITMAP_SIZE / 8);
                copy->next = NULL;
                *link = copy;
                link = &copy->next;
        }

        dst->highbit = src->highbit;
        return 0;
}

/*
 * Compute the intersection of @e1 and @e2 into @dst.
 *
 * @dst is initialized here; every bit set in both inputs is then set in it.
 * Returns 0 on success or the negative value reported by ebitmap_set_bit().
 * On failure @dst is left as built so far; it is not destroyed here.
 */
int ebitmap_and(struct ebitmap *dst, struct ebitmap *e1, struct ebitmap *e2)
{
        struct ebitmap_node *node;
        int pos, err;

        ebitmap_init(dst);

        ebitmap_for_each_positive_bit(e1, node, pos) {
                if (!ebitmap_get_bit(e2, pos))
                        continue;

                err = ebitmap_set_bit(dst, pos, 1);
                if (err < 0)
                        return err;
        }
        return 0;
}


#ifdef CONFIG_NETLABEL
/**
 * ebitmap_netlbl_export - Export an ebitmap into a NetLabel category bitmap
 * @ebmap: the ebitmap to export
 * @catmap: the NetLabel category bitmap
 *
 * Description:
 * Export a SELinux extensible bitmap into a NetLabel category bitmap.
 * If @ebmap is empty, *@catmap is simply set to NULL (a bitmap already
 * stored there is not freed on this path).  Otherwise any bitmap already in
 * *@catmap is freed and a new one is built from the non-zero map words of
 * @ebmap.
 * Returns zero on success, negative values on error.  On error the partly
 * built bitmap is freed and *@catmap is reset to NULL, so the caller is
 * never left holding a pointer to freed memory.
 *
 */
int ebitmap_netlbl_export(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap **catmap)
{
        struct ebitmap_node *e_iter = ebmap->node;
        unsigned long e_map;
        u32 offset;
        unsigned int iter;
        int rc;

        if (e_iter == NULL) {
                *catmap = NULL;
                return 0;
        }

        if (*catmap != NULL)
                netlbl_catmap_free(*catmap);
        *catmap = NULL;

        while (e_iter) {
                offset = e_iter->startbit;
                for (iter = 0; iter < EBITMAP_UNIT_NUMS; iter++) {
                        e_map = e_iter->maps[iter];
                        /* Empty words need no storage in the catmap. */
                        if (e_map != 0) {
                                rc = netlbl_catmap_setlong(catmap,
                                                           offset,
                                                           e_map,
                                                           GFP_ATOMIC);
                                if (rc != 0)
                                        goto netlbl_export_failure;
                        }
                        offset += EBITMAP_UNIT_SIZE;
                }
                e_iter = e_iter->next;
        }

        return 0;

netlbl_export_failure:
        netlbl_catmap_free(*catmap);
        /* Do not hand a dangling pointer back through the out-parameter. */
        *catmap = NULL;
        return -ENOMEM;
}

/**
 * ebitmap_netlbl_import - Import a NetLabel category bitmap into an ebitmap
 * @ebmap: the ebitmap to import
 * @catmap: the NetLabel category bitmap
 *
 * Description:
 * Import a NetLabel category bitmap into a SELinux extensible bitmap.
 * The NetLabel bitmap is read one unsigned long at a time; empty words are
 * skipped and a new ebitmap node is allocated whenever a word falls outside
 * the node currently being filled.
 * Returns zero on success, negative values on error.  On error @ebmap is
 * destroyed and -ENOMEM is returned.
 *
 */
int ebitmap_netlbl_import(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap *catmap)
{
        struct ebitmap_node *cur = NULL;
        struct ebitmap_node *last;
        unsigned long word;
        u32 offset = 0;
        u32 slot;

        while (netlbl_catmap_getlong(catmap, &offset, &word) >= 0) {
                /* An offset of (u32)-1 marks the end of the bitmap. */
                if (offset == (u32)-1)
                        return 0;

                /* don't waste ebitmap space if the netlabel bitmap is empty */
                if (word != 0) {
                        if (cur == NULL ||
                            offset >= cur->startbit + EBITMAP_SIZE) {
                                last = cur;
                                cur = kmem_cache_zalloc(ebitmap_node_cachep,
                                                        GFP_ATOMIC);
                                if (cur == NULL)
                                        break;
                                cur->startbit = offset - (offset % EBITMAP_SIZE);
                                if (last == NULL)
                                        ebmap->node = cur;
                                else
                                        last->next = cur;
                                ebmap->highbit = cur->startbit + EBITMAP_SIZE;
                        }

                        /* offset will always be aligned to an unsigned long */
                        slot = EBITMAP_NODE_INDEX(cur, offset);
                        cur->maps[slot] = word;
                }

                /* next */
                offset += EBITMAP_UNIT_SIZE;
        }

        /* Reached on a read error or a failed node allocation. */
        ebitmap_destroy(ebmap);
        return -ENOMEM;
}
#endif /* CONFIG_NETLABEL */

/*
 * Report whether e1 is a superset of e2: returns 1 when every bit set in
 * e2 is set in e1 as well, 0 otherwise.  When last_e2bit is non-zero the
 * highest set bit in e2 must additionally not exceed last_e2bit.
 */
int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit)
{
        struct ebitmap_node *sup, *sub;
        int idx;

        if (e1->highbit < e2->highbit)
                return 0;

        sup = e1->node;
        sub = e2->node;

        while (sup && sub && sup->startbit <= sub->startbit) {
                if (sup->startbit != sub->startbit) {
                        /* e1 node lies below the e2 node: catch up */
                        sup = sup->next;
                        continue;
                }

                /* locate the highest non-empty word of the e2 node */
                idx = EBITMAP_UNIT_NUMS - 1;
                while (idx >= 0 && !sub->maps[idx])
                        idx--;

                if (last_e2bit && idx >= 0) {
                        u32 top = sub->startbit + idx * EBITMAP_UNIT_SIZE +
                                  __fls(sub->maps[idx]);

                        if (top > last_e2bit)
                                return 0;
                }

                /* any e2 bit missing from e1 means "not contained" */
                for (; idx >= 0; idx--) {
                        if (sub->maps[idx] & ~sup->maps[idx])
                                return 0;
                }

                sup = sup->next;
                sub = sub->next;
        }

        /* leftover e2 nodes have no counterpart in e1 */
        return sub ? 0 : 1;
}

/*
 * Look up a single bit.  Returns the value reported by
 * ebitmap_node_get_bit() for the node covering @bit, or 0 when @bit is
 * above e->highbit or no node covers it.
 */
int ebitmap_get_bit(struct ebitmap *e, unsigned long bit)
{
        struct ebitmap_node *node;

        if (e->highbit < bit)
                return 0;

        for (node = e->node; node && node->startbit <= bit;
             node = node->next) {
                if (bit < node->startbit + EBITMAP_SIZE)
                        return ebitmap_node_get_bit(node, bit);
        }

        return 0;
}

/*
 * Set (@value != 0) or clear (@value == 0) one bit of @e.
 *
 * Clearing the last bit of a node unlinks and frees that node; setting a
 * bit no node covers allocates a new node and links it in list order.
 * Returns 0 on success or -ENOMEM when a node cannot be allocated.
 */
int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value)
{
        struct ebitmap_node *cur, *before = NULL, *fresh;

        for (cur = e->node; cur && cur->startbit <= bit;
             before = cur, cur = cur->next) {
                unsigned int first;

                if (bit >= cur->startbit + EBITMAP_SIZE)
                        continue;

                /* cur is the node covering @bit */
                if (value) {
                        ebitmap_node_set_bit(cur, bit);
                        return 0;
                }

                ebitmap_node_clr_bit(cur, bit);

                first = find_first_bit(cur->maps, EBITMAP_SIZE);
                if (first < EBITMAP_SIZE)
                        return 0;

                /* the node is empty now, so take it out of the list */
                if (!cur->next) {
                        /* it was the highest node: pull highbit back */
                        e->highbit = before ?
                                     before->startbit + EBITMAP_SIZE : 0;
                }
                if (before)
                        before->next = cur->next;
                else
                        e->node = cur->next;
                kmem_cache_free(ebitmap_node_cachep, cur);
                return 0;
        }

        /* no node covers @bit; clearing is therefore a no-op */
        if (!value)
                return 0;

        fresh = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC);
        if (!fresh)
                return -ENOMEM;

        fresh->startbit = bit - (bit % EBITMAP_SIZE);
        ebitmap_node_set_bit(fresh, bit);

        /* nothing follows: the new node becomes the highest one */
        if (!cur)
                e->highbit = fresh->startbit + EBITMAP_SIZE;

        if (before) {
                fresh->next = before->next;
                before->next = fresh;
        } else {
                fresh->next = e->node;
                e->node = fresh;
        }

        return 0;
}

/*
 * Free every node of @e and reset it to the empty state (no nodes,
 * highbit 0).  A NULL @e is accepted and ignored.
 */
void ebitmap_destroy(struct ebitmap *e)
{
        struct ebitmap_node *cur, *next;

        if (!e)
                return;

        for (cur = e->node; cur; cur = next) {
                /* grab the successor before the node goes away */
                next = cur->next;
                kmem_cache_free(ebitmap_node_cachep, cur);
        }

        e->node = NULL;
        e->highbit = 0;
}

/*
 * Read an ebitmap from @fp into @e.
 *
 * The encoding consumed here is a header of three little-endian 32-bit
 * words (map unit size, high bit, number of map entries) followed by that
 * many entries, each a little-endian 32-bit start bit and a little-endian
 * 64-bit map.  The map unit size must equal BITS_PER_U64, every start bit
 * must be a multiple of the map unit and lie below the (rounded up) high
 * bit.
 *
 * Returns 0 on success.  On failure @e is destroyed and a negative value
 * is returned: the error from next_entry(), -ENOMEM on allocation failure
 * or -EINVAL for malformed input.
 */
int ebitmap_read(struct ebitmap *e, void *fp)
{
        struct ebitmap_node *n = NULL;
        /* i is u32 like count so the loop compare/increment stays unsigned */
        u32 mapunit, count, startbit, index, i;
        __le32 ebitmap_start;
        u64 map;
        __le64 mapbits;
        __le32 buf[3];
        int rc;

        ebitmap_init(e);

        rc = next_entry(buf, fp, sizeof buf);
        if (rc < 0)
                goto out;

        mapunit = le32_to_cpu(buf[0]);
        e->highbit = le32_to_cpu(buf[1]);
        count = le32_to_cpu(buf[2]);

        if (mapunit != BITS_PER_U64) {
                pr_err("SELinux: ebitmap: map size %u does not "
                       "match my size %zd (high bit was %u)\n",
                       mapunit, BITS_PER_U64, e->highbit);
                goto bad;
        }

        /* round up e->highbit */
        e->highbit += EBITMAP_SIZE - 1;
        e->highbit -= (e->highbit % EBITMAP_SIZE);

        if (!e->highbit) {
                e->node = NULL;
                goto ok;
        }

        /* a non-empty bitmap must carry at least one map entry */
        if (e->highbit && !count)
                goto bad;

        for (i = 0; i < count; i++) {
                rc = next_entry(&ebitmap_start, fp, sizeof(u32));
                if (rc < 0) {
                        pr_err("SELinux: ebitmap: truncated map\n");
                        goto bad;
                }
                startbit = le32_to_cpu(ebitmap_start);

                if (startbit & (mapunit - 1)) {
                        pr_err("SELinux: ebitmap start bit (%u) is "
                               "not a multiple of the map unit size (%u)\n",
                               startbit, mapunit);
                        goto bad;
                }
                if (startbit > e->highbit - mapunit) {
                        pr_err("SELinux: ebitmap start bit (%u) is "
                               "beyond the end of the bitmap (%u)\n",
                               startbit, (e->highbit - mapunit));
                        goto bad;
                }

                if (!n || startbit >= n->startbit + EBITMAP_SIZE) {
                        /* entry falls past the current node: append one */
                        struct ebitmap_node *tmp;
                        tmp = kmem_cache_zalloc(ebitmap_node_cachep, GFP_KERNEL);
                        if (!tmp) {
                                pr_err("SELinux: ebitmap: out of memory\n");
                                rc = -ENOMEM;
                                goto bad;
                        }
                        /* round down */
                        tmp->startbit = startbit - (startbit % EBITMAP_SIZE);
                        if (n)
                                n->next = tmp;
                        else
                                e->node = tmp;
                        n = tmp;
                } else if (startbit <= n->startbit) {
                        pr_err("SELinux: ebitmap: start bit %u"
                               " comes after start bit %u\n",
                               startbit, n->startbit);
                        goto bad;
                }

                rc = next_entry(&mapbits, fp, sizeof(u64));
                if (rc < 0) {
                        pr_err("SELinux: ebitmap: truncated map\n");
                        goto bad;
                }
                map = le64_to_cpu(mapbits);

                /* spread the 64-bit map over the node's unsigned long words */
                index = (startbit - n->startbit) / EBITMAP_UNIT_SIZE;
                while (map) {
                        n->maps[index++] = map & (-1UL);
                        map = EBITMAP_SHIFT_UNIT_SIZE(map);
                }
        }
ok:
        rc = 0;
out:
        return rc;
bad:
        /* rc is still 0 when the failure was a validation error */
        if (!rc)
                rc = -EINVAL;
        ebitmap_destroy(e);
        goto out;
}

/*
 * Write @e to @fp.
 *
 * Output is a header of three little-endian 32-bit words (BITS_PER_U64,
 * the position just past the highest set bit rounded up to a multiple of
 * BITS_PER_U64, and the number of map entries), followed by one entry per
 * BITS_PER_U64-aligned unit that holds at least one set bit: a
 * little-endian 32-bit start bit and a little-endian 64-bit map.
 *
 * Returns 0 on success or the non-zero value returned by put_entry().
 *
 * NOTE(review): both passes rely on ebitmap_for_each_positive_bit()
 * visiting set bits in ascending order - confirm against the macro.
 */
int ebitmap_write(struct ebitmap *e, void *fp)
{
        struct ebitmap_node *n;
        u32 count;
        __le32 buf[3];
        u64 map;
        int bit, last_bit, last_startbit, rc;

        buf[0] = cpu_to_le32(BITS_PER_U64);

        /* pass 1: count the units to be written and find the high bit */
        count = 0;
        last_bit = 0;
        last_startbit = -1;
        ebitmap_for_each_positive_bit(e, n, bit) {
                if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
                        /* first set bit seen in a new unit */
                        count++;
                        last_startbit = rounddown(bit, BITS_PER_U64);
                }
                last_bit = roundup(bit + 1, BITS_PER_U64);
        }
        buf[1] = cpu_to_le32(last_bit);
        buf[2] = cpu_to_le32(count);

        rc = put_entry(buf, sizeof(u32), 3, fp);
        if (rc)
                return rc;

        /* pass 2: accumulate each unit in 'map' and emit it once complete */
        map = 0;
        last_startbit = INT_MIN;
        ebitmap_for_each_positive_bit(e, n, bit) {
                if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
                        __le64 buf64[1];

                        /* this is the very first bit: nothing to flush yet */
                        if (!map) {
                                last_startbit = rounddown(bit, BITS_PER_U64);
                                map = (u64)1 << (bit - last_startbit);
                                continue;
                        }

                        /* write the previous (now complete) unit */
                        buf[0] = cpu_to_le32(last_startbit);
                        rc = put_entry(buf, sizeof(u32), 1, fp);
                        if (rc)
                                return rc;

                        buf64[0] = cpu_to_le64(map);
                        rc = put_entry(buf64, sizeof(u64), 1, fp);
                        if (rc)
                                return rc;

                        /* set up for the next unit */
                        map = 0;
                        last_startbit = rounddown(bit, BITS_PER_U64);
                }
                map |= (u64)1 << (bit - last_startbit);
        }
        /* flush the unit still being accumulated, if any */
        if (map) {
                __le64 buf64[1];

                /* write the last unit */
                buf[0] = cpu_to_le32(last_startbit);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;

                buf64[0] = cpu_to_le64(map);
                rc = put_entry(buf64, sizeof(u64), 1, fp);
                if (rc)
                        return rc;
        }
        return 0;
}

/*
 * Fold @e into the running hash value @hash and return the result.
 * highbit is mixed in first, then each node's startbit and map words in
 * list order.
 */
u32 ebitmap_hash(const struct ebitmap *e, u32 hash)
{
        struct ebitmap_node *cur = e->node;

        /* mixing in highbit changes the hash even for an empty ebitmap */
        hash = jhash_1word(e->highbit, hash);

        while (cur) {
                hash = jhash_1word(cur->startbit, hash);
                hash = jhash(cur->maps, sizeof(cur->maps), hash);
                cur = cur->next;
        }

        return hash;
}

/*
 * Create the "ebitmap_node" slab cache that all ebitmap nodes are
 * allocated from.  The cache is requested with SLAB_PANIC and the result
 * is stored without a NULL check.
 */
void __init ebitmap_cache_init(void)
{
        ebitmap_node_cachep =
                kmem_cache_create("ebitmap_node", sizeof(struct ebitmap_node),
                                  0, SLAB_PANIC, NULL);
}










    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_VMACACHE_H
#define __LINUX_VMACACHE_H

#include <linux/sched.h>
#include <linux/mm.h>

/* Zero the whole vmas storage of @tsk's vmacache. */
static inline void vmacache_flush(struct task_struct *tsk)
{
        memset(tsk->vmacache.vmas, 0, sizeof tsk->vmacache.vmas);
}

/*
 * VMA cache routines implemented outside this header.
 * NOTE(review): judging by the names, vmacache_update() records @newvma
 * for @addr and vmacache_find() looks @addr up in @mm - confirm against
 * the implementation.
 */
extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
                                                    unsigned long addr);

/* The exact-range lookup is only declared when CONFIG_MMU is not set. */
#ifndef CONFIG_MMU
extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
                                                  unsigned long start,
                                                  unsigned long end);
#endif

/* Advance @mm's vmacache sequence number by one. */
static inline void vmacache_invalidate(struct mm_struct *mm)
{
        ++mm->vmacache_seqnum;
}

#endif /* __LINUX_VMACACHE_H */













































































































































































































































































































































































































































































    1 








    2 









    1 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Block data types and constants.  Directly include this file only to
 * break include dependency loop.
 */
#ifndef __LINUX_BLK_TYPES_H
#define __LINUX_BLK_TYPES_H

#include <linux/types.h>
#include <linux/bvec.h>
#include <linux/ktime.h>

/* Forward declarations, so no defining header has to be included here. */
struct bio_set;
struct bio;
struct bio_integrity_payload;
struct page;
struct io_context;
struct cgroup_subsys_state;
/* Function type of a bio completion callback (see bio::bi_end_io). */
typedef void (bio_end_io_t) (struct bio *);
struct bio_crypt_ctx;

/*
 * State kept for one block device.
 * NOTE(review): the role of fields without a comment is only suggested
 * by their names; confirm against the users before relying on it.
 */
struct block_device {
        dev_t                        bd_dev;
        int                        bd_openers;
        struct inode *                bd_inode;        /* will die */
        struct super_block *        bd_super;
        struct mutex                bd_mutex;        /* open/close mutex */
        void *                        bd_claiming;
        void *                        bd_holder;
        int                        bd_holders;
        bool                        bd_write_holder;
#ifdef CONFIG_SYSFS
        /* only present when sysfs support is configured */
        struct list_head        bd_holder_disks;
#endif
        struct block_device *        bd_contains;
        u8                        bd_partno;
        struct hd_struct *        bd_part;
        /* number of times partitions within this device have been opened. */
        unsigned                bd_part_count;

        spinlock_t                bd_size_lock; /* for bd_inode->i_size updates */
        struct gendisk *        bd_disk;
        struct backing_dev_info *bd_bdi;

        /* The counter of freeze processes */
        int                        bd_fsfreeze_count;
        /* Mutex for freeze */
        struct mutex                bd_fsfreeze_mutex;
} __randomize_layout;

/*
 * Block error status values.  See blk_errors in block/blk-core.c for the
 * details.
 * Alpha cannot write a byte atomically, so we need to use a 32-bit value
 * there.
 */
#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
typedef u32 __bitwise blk_status_t;
#else
typedef u8 __bitwise blk_status_t;
#endif
#define        BLK_STS_OK 0
#define BLK_STS_NOTSUPP                ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT                ((__force blk_status_t)2)
#define BLK_STS_NOSPC                ((__force blk_status_t)3)
#define BLK_STS_TRANSPORT        ((__force blk_status_t)4)
#define BLK_STS_TARGET                ((__force blk_status_t)5)
#define BLK_STS_NEXUS                ((__force blk_status_t)6)
#define BLK_STS_MEDIUM                ((__force blk_status_t)7)
#define BLK_STS_PROTECTION        ((__force blk_status_t)8)
#define BLK_STS_RESOURCE        ((__force blk_status_t)9)
#define BLK_STS_IOERR                ((__force blk_status_t)10)

/* hack for device mapper, don't use elsewhere: */
#define BLK_STS_DM_REQUEUE    ((__force blk_status_t)11)

#define BLK_STS_AGAIN                ((__force blk_status_t)12)

/*
 * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
 * device related resources are unavailable, but the driver can guarantee
 * that the queue will be rerun in the future once resources become
 * available again. This is typically the case for device specific
 * resources that are consumed for IO. If the driver fails allocating these
 * resources, we know that inflight (or pending) IO will free these
 * resources upon completion.
 *
 * This is different from BLK_STS_RESOURCE in that it explicitly references
 * a device specific resource. For resources of wider scope, allocation
 * failure can happen without having pending IO. This means that we can't
 * rely on request completions freeing these resources, as IO may not be in
 * flight. Examples of that are kernel memory allocations, DMA mappings, or
 * any other system wide resources.
 */
#define BLK_STS_DEV_RESOURCE        ((__force blk_status_t)13)

/*
 * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
 * related resources are unavailable, but the driver can guarantee the queue
 * will be rerun in the future once the resources become available again.
 *
 * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
 * a zone specific resource and IO to a different zone on the same device could
 * still be served. Examples of that are zones that are write-locked, but a read
 * to the same zone could be served.
 */
#define BLK_STS_ZONE_RESOURCE        ((__force blk_status_t)14)

/*
 * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
 * path if the device returns a status indicating that too many zone resources
 * are currently open. The same command should be successful if resubmitted
 * after the number of open zones decreases below the device's limits, which is
 * reported in the request_queue's max_open_zones.
 */
#define BLK_STS_ZONE_OPEN_RESOURCE        ((__force blk_status_t)15)

/*
 * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
 * path if the device returns a status indicating that too many zone resources
 * are currently active. The same command should be successful if resubmitted
 * after the number of active zones decreases below the device's limits, which
 * is reported in the request_queue's max_active_zones.
 */
#define BLK_STS_ZONE_ACTIVE_RESOURCE        ((__force blk_status_t)16)

/**
 * blk_path_error - returns true if error may be path related
 * @error: status the request was completed with
 *
 * Description:
 *     Splits block error statuses into those that retrying on a failover
 *     path cannot cure and those for which a retry might succeed.
 *
 * Return:
 *     %false - retrying failover path will not help
 *     %true  - may succeed if retried
 */
static inline bool blk_path_error(blk_status_t error)
{
        /* These statuses are not tied to the path that was used. */
        if (error == BLK_STS_NOTSUPP ||
            error == BLK_STS_NOSPC ||
            error == BLK_STS_TARGET ||
            error == BLK_STS_NEXUS ||
            error == BLK_STS_MEDIUM ||
            error == BLK_STS_PROTECTION)
                return false;

        /* Everything else may be a path failure and is worth a retry. */
        return true;
}

/*
 * Layout of bio_issue::value, from most significant bit:
 * 1 bit: reserved for other usage, see below
 * 12 bits: original size of bio
 * 51 bits: issue time of bio
 */
#define BIO_ISSUE_RES_BITS      1
#define BIO_ISSUE_SIZE_BITS     12
/* bit position where the reserved field starts (63) */
#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
/* bit position where the size field starts (51) */
#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
/* masks selecting the time, size and reserved fields respectively */
#define BIO_ISSUE_TIME_MASK     ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
#define BIO_ISSUE_SIZE_MASK     \
        (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
#define BIO_ISSUE_RES_MASK      (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))

/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)

/* Packed issue record; see the BIO_ISSUE_* layout above. */
struct bio_issue {
        u64 value;
};

/* Keep only the bits of @time that fit the issue-time field. */
static inline u64 __bio_issue_time(u64 time)
{
        const u64 time_bits = BIO_ISSUE_TIME_MASK;

        return time & time_bits;
}

/* Extract the issue-time field from @issue. */
static inline u64 bio_issue_time(struct bio_issue *issue)
{
        u64 packed = issue->value;

        return __bio_issue_time(packed);
}

static inline sector_t bio_issue_size(struct bio_issue *issue)
{
        return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}

/*
 * Stamp @issue with the current ktime_get_ns() value and @size.  @size is
 * truncated to BIO_ISSUE_SIZE_BITS bits; the reserved bits already held in
 * @issue are carried over unchanged.
 */
static inline void bio_issue_init(struct bio_issue *issue,
                                       sector_t size)
{
        u64 reserved, stamp, size_field;

        size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;

        reserved = issue->value & BIO_ISSUE_RES_MASK;
        stamp = ktime_get_ns() & BIO_ISSUE_TIME_MASK;
        size_field = (u64)size << BIO_ISSUE_SIZE_SHIFT;

        issue->value = reserved | stamp | size_field;
}

/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 */
struct bio {
        struct bio                *bi_next;        /* request queue link */
        struct gendisk                *bi_disk;
        unsigned int                bi_opf;                /* bottom bits REQ_OP,
                                                 * top bits req flags
                                                 * (see REQ_OP_MASK). Use
                                                 * accessors.
                                                 */
        unsigned short                bi_flags;        /* status, etc and bvec pool number */
        unsigned short                bi_ioprio;
        unsigned short                bi_write_hint;
        blk_status_t                bi_status;
        u8                        bi_partno;
        atomic_t                __bi_remaining;

        struct bvec_iter        bi_iter;

        bio_end_io_t                *bi_end_io;

        void                        *bi_private;
#ifdef CONFIG_BLK_CGROUP
        /*
         * Represents the association of the css and request_queue for the bio.
         * If a bio goes direct to device, it will not have a blkg as it will
         * not have a request_queue associated with it.  The reference is put
         * on release of the bio.
         */
        struct blkcg_gq                *bi_blkg;
        struct bio_issue        bi_issue;
#ifdef CONFIG_BLK_CGROUP_IOCOST
        u64                        bi_iocost_cost;
#endif
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct bio_crypt_ctx        *bi_crypt_context;
#endif

        /* anonymous union; empty unless CONFIG_BLK_DEV_INTEGRITY is set */
        union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
                struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
        };

        unsigned short                bi_vcnt;        /* how many bio_vec's */

        /*
         * Everything starting with bi_max_vecs will be preserved by bio_reset()
         */

        unsigned short                bi_max_vecs;        /* max bvl_vecs we can hold */

        atomic_t                __bi_cnt;        /* pin count */

        struct bio_vec                *bi_io_vec;        /* the actual vec list */

        struct bio_set                *bi_pool;

        /*
         * We can inline a number of vecs at the end of the bio, to avoid
         * double allocations for a small number of bio_vecs. This member
         * MUST obviously be kept at the very end of the bio.
         */
        struct bio_vec                bi_inline_vecs[];
};

/* Size of the leading part of struct bio that ends just before bi_max_vecs. */
#define BIO_RESET_BYTES                offsetof(struct bio, bi_max_vecs)

/*
 * bio flags
 *
 * The enumerators are consecutive values starting at 0; BIO_FLAG_LAST is
 * the number of flags defined.
 */
enum {
        BIO_NO_PAGE_REF,        /* don't put release vec pages */
        BIO_CLONED,                /* doesn't own data */
        BIO_BOUNCED,                /* bio is a bounce bio */
        BIO_WORKINGSET,                /* contains userspace workingset pages */
        BIO_QUIET,                /* Make BIO Quiet */
        BIO_CHAIN,                /* chained bio, ->bi_remaining in effect */
        BIO_REFFED,                /* bio has elevated ->bi_cnt */
        BIO_THROTTLED,                /* This bio has already been subjected to
                                 * throttling rules. Don't do it again. */
        BIO_TRACE_COMPLETION,        /* bio_endio() should trace the final completion
                                 * of this bio. */
        BIO_CGROUP_ACCT,        /* has been accounted to a cgroup */
        BIO_TRACKED,                /* set if bio goes through the rq_qos path */
        BIO_FLAG_LAST
};

/* See BVEC_POOL_OFFSET below before adding new flags */

/*
 * We support 6 different bvec pools, the last one is magic in that it
 * is backed by a mempool.
 */
#define BVEC_POOL_NR                6
#define BVEC_POOL_MAX                (BVEC_POOL_NR - 1)

/*
 * Top 3 bits of bio flags indicate the pool the bvecs came from.  We add
 * 1 to the actual index so that 0 indicates that there are no bvecs to be
 * freed.
 */
#define BVEC_POOL_BITS                (3)
/* the 16 here is the bit width assumed for bio::bi_flags */
#define BVEC_POOL_OFFSET        (16 - BVEC_POOL_BITS)
#define BVEC_POOL_IDX(bio)        ((bio)->bi_flags >> BVEC_POOL_OFFSET)
/* build-time check that index + 1 (up to BVEC_POOL_NR) fits in the field */
#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
# error "BVEC_POOL_BITS is too small"
#endif

/*
 * Flags starting here get preserved by bio_reset() - this includes
 * only BVEC_POOL_IDX()
 */
#define BIO_RESET_BITS        BVEC_POOL_OFFSET

typedef __u32 __bitwise blk_mq_req_flags_t;

/*
 * Operations and flags common to the bio and request structures.
 * We use 8 bits for encoding the operation, and the remaining 24 for flags.
 *
 * The least significant bit of the operation number indicates the data
 * transfer direction:
 *
 *   - if the least significant bit is set transfers are TO the device
 *   - if the least significant bit is not set transfers are FROM the device
 *
 * If an operation does not transfer data the least significant bit has no
 * meaning.
 */
#define REQ_OP_BITS        8
/* selects the operation (the low REQ_OP_BITS bits) */
#define REQ_OP_MASK        ((1 << REQ_OP_BITS) - 1)
#define REQ_FLAG_BITS        24

/*
 * Operation numbers stored in the low REQ_OP_BITS bits.  REQ_OP_LAST is
 * one past the highest operation defined.
 */
enum req_opf {
        /* read sectors from the device */
        REQ_OP_READ                = 0,
        /* write sectors to the device */
        REQ_OP_WRITE                = 1,
        /* flush the volatile write cache */
        REQ_OP_FLUSH                = 2,
        /* discard sectors */
        REQ_OP_DISCARD                = 3,
        /* securely erase sectors */
        REQ_OP_SECURE_ERASE        = 5,
        /* write the same sector many times */
        REQ_OP_WRITE_SAME        = 7,
        /* write the zero filled sector many times */
        REQ_OP_WRITE_ZEROES        = 9,
        /* Open a zone */
        REQ_OP_ZONE_OPEN        = 11,
        /* Close a zone */
        REQ_OP_ZONE_CLOSE        = 13,
        /* Transition a zone to full */
        REQ_OP_ZONE_FINISH        = 15,
        /* reset a zone write pointer */
        REQ_OP_ZONE_RESET        = 17,
        /* reset all the zones present on the device */
        REQ_OP_ZONE_RESET_ALL        = 19,
        /* write data at the current zone write pointer */
        REQ_OP_ZONE_APPEND        = 21,

        /* SCSI passthrough using struct scsi_request */
        REQ_OP_SCSI_IN                = 32,
        REQ_OP_SCSI_OUT                = 33,
        /* Driver private requests */
        REQ_OP_DRV_IN                = 34,
        REQ_OP_DRV_OUT                = 35,

        REQ_OP_LAST,
};

/*
 * Bit numbers of the request flags.  Numbering starts at REQ_OP_BITS, so
 * the flags sit directly above the operation bits.
 */
enum req_flag_bits {
        __REQ_FAILFAST_DEV =        /* no driver retries of device errors */
                REQ_OP_BITS,
        __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
        __REQ_FAILFAST_DRIVER,        /* no driver retries of driver errors */
        __REQ_SYNC,                /* request is sync (sync write or read) */
        __REQ_META,                /* metadata io request */
        __REQ_PRIO,                /* boost priority in cfq */
        __REQ_NOMERGE,                /* don't touch this for merging */
        __REQ_IDLE,                /* anticipate more IO after this one */
        __REQ_INTEGRITY,        /* I/O includes block integrity payload */
        __REQ_FUA,                /* forced unit access */
        __REQ_PREFLUSH,                /* request for cache flush */
        __REQ_RAHEAD,                /* read ahead, can fail anytime */
        __REQ_BACKGROUND,        /* background IO */
        __REQ_NOWAIT,           /* Don't wait if request will block */
        /*
         * When a shared kthread needs to issue a bio for a cgroup, doing
         * so synchronously can lead to priority inversions as the kthread
         * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
         * submit_bio() punt the actual issuing to a dedicated per-blkcg
         * work item to avoid such priority inversions.
         */
        __REQ_CGROUP_PUNT,

        /* command specific flags for REQ_OP_WRITE_ZEROES: */
        __REQ_NOUNMAP,                /* do not free blocks when zeroing */

        __REQ_HIPRI,

        /* for driver use */
        __REQ_DRV,
        __REQ_SWAP,                /* swapping request. */
        __REQ_NR_BITS,                /* stops here */
};

#define REQ_FAILFAST_DEV        (1ULL << __REQ_FAILFAST_DEV)
#define REQ_FAILFAST_TRANSPORT        (1ULL << __REQ_FAILFAST_TRANSPORT)
#define REQ_FAILFAST_DRIVER        (1ULL << __REQ_FAILFAST_DRIVER)
#define REQ_SYNC                (1ULL << __REQ_SYNC)
#define REQ_META                (1ULL << __REQ_META)
#define REQ_PRIO                (1ULL << __REQ_PRIO)
#define REQ_NOMERGE                (1ULL << __REQ_NOMERGE)
#define REQ_IDLE                (1ULL << __REQ_IDLE)
#define REQ_INTEGRITY                (1ULL << __REQ_INTEGRITY)
#define REQ_FUA                        (1ULL << __REQ_FUA)
#define REQ_PREFLUSH                (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD                (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND                (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT                (1ULL << __REQ_NOWAIT)
#define REQ_CGROUP_PUNT                (1ULL << __REQ_CGROUP_PUNT)

#define REQ_NOUNMAP                (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI                (1ULL << __REQ_HIPRI)

#define REQ_DRV                        (1ULL << __REQ_DRV)
#define REQ_SWAP                (1ULL << __REQ_SWAP)

#define REQ_FAILFAST_MASK \
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)

#define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)

/*
 * Statistics groups.  op_stat_group() returns STAT_DISCARD for discards and
 * otherwise the 0/1 result of op_is_write(), i.e. STAT_READ or STAT_WRITE.
 * NR_STAT_GROUPS is the number of groups, not a group itself.
 */
enum stat_group {
        STAT_READ,
        STAT_WRITE,
        STAT_DISCARD,
        STAT_FLUSH,

        NR_STAT_GROUPS
};

/*
 * Extract the operation code from a bio's bi_opf or a request's cmd_flags
 * by masking with REQ_OP_MASK (the flag bits are dropped).
 */
#define bio_op(bio) \
        ((bio)->bi_opf & REQ_OP_MASK)
#define req_op(req) \
        ((req)->cmd_flags & REQ_OP_MASK)

/*
 * Obsolete helper, do not use in new code.
 *
 * Overwrites bio->bi_opf with the bitwise OR of @op and @op_flags.
 */
static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
                unsigned op_flags)
{
        const unsigned int combined = op | op_flags;

        bio->bi_opf = combined;
}

/*
 * Report whether @op is a write-type operation: true exactly when the
 * lowest bit of @op is set.
 */
static inline bool op_is_write(unsigned int op)
{
        const unsigned int low_bit = op & 1U;

        return low_bit != 0;
}

/*
 * Tell whether a bio or request needs special handling by the flush state
 * machine: true when either REQ_FUA or REQ_PREFLUSH is set in @op.
 */
static inline bool op_is_flush(unsigned int op)
{
        return (op & (REQ_FUA | REQ_PREFLUSH)) != 0;
}

/*
 * Decide whether @op counts as synchronous.  Reads always do; any other
 * operation does when it carries REQ_SYNC, REQ_FUA or REQ_PREFLUSH.
 */
static inline bool op_is_sync(unsigned int op)
{
        /* Reads are synchronous regardless of the flag bits. */
        if ((op & REQ_OP_MASK) == REQ_OP_READ)
                return true;

        return (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)) != 0;
}

/*
 * True when the operation code of @op (the bits under REQ_OP_MASK) is
 * REQ_OP_DISCARD; flag bits are ignored.
 */
static inline bool op_is_discard(unsigned int op)
{
        if ((op & REQ_OP_MASK) != REQ_OP_DISCARD)
                return false;

        return true;
}

/*
 * Check whether a bio or request operation is one of the zone management
 * operations (reset, reset-all, open, close, finish).  Only the bits under
 * REQ_OP_MASK are examined.
 */
static inline bool op_is_zone_mgmt(enum req_opf op)
{
        bool is_zone_op;

        switch (op & REQ_OP_MASK) {
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_RESET_ALL:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                is_zone_op = true;
                break;
        default:
                is_zone_op = false;
                break;
        }

        return is_zone_op;
}

/*
 * Map an operation to its statistics group: STAT_DISCARD for discards,
 * otherwise the 0/1 result of op_is_write(), which lines up with
 * STAT_READ/STAT_WRITE.
 */
static inline int op_stat_group(unsigned int op)
{
        return op_is_discard(op) ? STAT_DISCARD : op_is_write(op);
}

/*
 * blk_qc_t is a 32-bit cookie.  As decoded by the helpers below:
 *   - the all-ones value BLK_QC_T_NONE means "no valid cookie",
 *   - bit 31 (BLK_QC_T_INTERNAL) is the "internal" marker,
 *   - the bits from BLK_QC_T_SHIFT up to (excluding) bit 31 hold the
 *     queue number,
 *   - the low BLK_QC_T_SHIFT bits hold the tag.
 */
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE                -1U
#define BLK_QC_T_SHIFT                16
#define BLK_QC_T_INTERNAL        (1U << 31)

/* A cookie is valid unless it equals BLK_QC_T_NONE. */
static inline bool blk_qc_t_valid(blk_qc_t cookie)
{
        return !(cookie == BLK_QC_T_NONE);
}

/* Queue number: the cookie without the internal bit, shifted down. */
static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
        const blk_qc_t without_internal = cookie & ~BLK_QC_T_INTERNAL;

        return without_internal >> BLK_QC_T_SHIFT;
}

/* Tag: the low BLK_QC_T_SHIFT bits of the cookie. */
static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
        const blk_qc_t tag_mask = (1u << BLK_QC_T_SHIFT) - 1;

        return cookie & tag_mask;
}

/* True when the BLK_QC_T_INTERNAL bit is set in the cookie. */
static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
{
        return !!(cookie & BLK_QC_T_INTERNAL);
}

/*
 * Aggregated statistics record.  The users of this struct are not visible
 * in this header.
 * NOTE(review): field meanings below are read off the names -- confirm
 * against the code that fills this struct.
 */
struct blk_rq_stat {
        u64 mean;        /* presumably the running mean of the samples */
        u64 min;        /* presumably the smallest sample seen */
        u64 max;        /* presumably the largest sample seen */
        u32 nr_samples;        /* presumably the number of samples recorded */
        u64 batch;        /* presumably an accumulator for pending samples */
};

#endif /* __LINUX_BLK_TYPES_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>

#include <asm/mmu.h>

/* Default to zero architecture-specific entries unless already defined. */
#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
/*
 * Number of unsigned long slots in mm_struct's saved_auxv[] array: two
 * slots for each of (arch entries + base entries + 1).
 * NOTE(review): presumably two words per auxv entry plus one terminating
 * entry -- confirm.
 */
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

/* NOTE(review): presumably the initial value for mm_struct.pasid -- confirm */
#define INIT_PASID        0

struct address_space;
struct mem_cgroup;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * If you allocate the page using alloc_pages(), you can use some of the
 * space in struct page for your own purposes.  The five words in the main
 * union are available, except for bit 0 of the first word which must be
 * kept clear.  Many users use this word to store a pointer to an object
 * which is guaranteed to be aligned.  If you use the same storage as
 * page->mapping, you must restore it to NULL before freeing the page.
 *
 * If your page will not be mapped to userspace, you can also use the four
 * bytes in the mapcount union, but you must call page_mapcount_reset()
 * before freeing it.
 *
 * If you want to use the refcount field, it must be used in such a way
 * that other CPUs temporarily incrementing and then decrementing the
 * refcount does not cause problems.  On receiving the page from
 * alloc_pages(), the refcount will be positive.
 *
 * If you allocate pages of order > 0, you can use some of the fields
 * in each subpage, but you may need to restore some of their values
 * afterwards.
 *
 * SLUB uses cmpxchg_double() to atomically update its freelist and
 * counters.  That requires that freelist & counters be adjacent and
 * double-word aligned.  We align all struct pages to double-word
 * boundaries, and ensure that 'freelist' is aligned within the
 * struct.
 */
/*
 * Alignment attribute applied to struct page: double-word alignment when
 * CONFIG_HAVE_ALIGNED_STRUCT_PAGE is set, otherwise nothing.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment        __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment
#endif

/*
 * Per-page descriptor.  Layout: a flags word, a five-word union whose
 * interpretation depends on how the page is used, a 4-byte union
 * (_mapcount / page_type / active / units), the reference count, and a few
 * configuration-dependent trailing fields.
 */
struct page {
        unsigned long flags;                /* Atomic flags, some possibly
                                         * updated asynchronously */
        /*
         * Five words (20/40 bytes) are available in this union.
         * WARNING: bit 0 of the first word is used for PageTail(). That
         * means the other users of this union MUST NOT use the bit to
         * avoid collision and false-positive PageTail().
         */
        union {
                struct {        /* Page cache and anonymous pages */
                        /**
                         * @lru: Pageout list, eg. active_list protected by
                         * pgdat->lru_lock.  Sometimes used as a generic list
                         * by the page owner.
                         */
                        struct list_head lru;
                        /* See page-flags.h for PAGE_MAPPING_FLAGS */
                        struct address_space *mapping;
                        pgoff_t index;                /* Our offset within mapping. */
                        /**
                         * @private: Mapping-private opaque data.
                         * Usually used for buffer_heads if PagePrivate.
                         * Used for swp_entry_t if PageSwapCache.
                         * Indicates order in the buddy system if PageBuddy.
                         */
                        unsigned long private;
                };
                struct {        /* page_pool used by netstack */
                        /**
                         * @dma_addr: might require a 64-bit value on
                         * 32-bit architectures.
                         */
                        unsigned long dma_addr[2];
                };
                struct {        /* slab, slob and slub */
                        union {
                                struct list_head slab_list;
                                struct {        /* Partial pages */
                                        struct page *next;
                                        /* counters are int on 64-bit, short otherwise */
#ifdef CONFIG_64BIT
                                        int pages;        /* Nr of pages left */
                                        int pobjects;        /* Approximate count */
#else
                                        short int pages;
                                        short int pobjects;
#endif
                                };
                        };
                        struct kmem_cache *slab_cache; /* not slob */
                        /* Double-word boundary */
                        void *freelist;                /* first free object */
                        union {
                                void *s_mem;        /* slab: first object */
                                unsigned long counters;                /* SLUB */
                                struct {                        /* SLUB */
                                        unsigned inuse:16;
                                        unsigned objects:15;
                                        unsigned frozen:1;
                                };
                        };
                };
                struct {        /* Tail pages of compound page */
                        unsigned long compound_head;        /* Bit zero is set */

                        /* First tail page only */
                        unsigned char compound_dtor;
                        unsigned char compound_order;
                        /* read through compound_mapcount_ptr(), i.e. page[1] */
                        atomic_t compound_mapcount;
                        unsigned int compound_nr; /* 1 << compound_order */
                };
                struct {        /* Second tail page of compound page */
                        unsigned long _compound_pad_1;        /* compound_head */
                        /* read through compound_pincount_ptr(), i.e. page[2] */
                        atomic_t hpage_pinned_refcount;
                        /* For both global and memcg */
                        struct list_head deferred_list;
                };
                struct {        /* Page table pages */
                        unsigned long _pt_pad_1;        /* compound_head */
                        pgtable_t pmd_huge_pte; /* protected by page->ptl */
                        unsigned long _pt_pad_2;        /* mapping */
                        union {
                                struct mm_struct *pt_mm; /* x86 pgds only */
                                atomic_t pt_frag_refcount; /* powerpc */
                                /* only with CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
                                atomic_t pt_share_count;
#endif
                        };
                        /* ptl is a pointer if ALLOC_SPLIT_PTLOCKS, else embedded */
#if ALLOC_SPLIT_PTLOCKS
                        spinlock_t *ptl;
#else
                        spinlock_t ptl;
#endif
                };
                struct {        /* ZONE_DEVICE pages */
                        /** @pgmap: Points to the hosting device page map. */
                        struct dev_pagemap *pgmap;
                        void *zone_device_data;
                        /*
                         * ZONE_DEVICE private pages are counted as being
                         * mapped so the next 3 words hold the mapping, index,
                         * and private fields from the source anonymous or
                         * page cache page while the page is migrated to device
                         * private memory.
                         * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
                         * use the mapping, index, and private fields when
                         * pmem backed DAX files are mapped.
                         */
                };

                /** @rcu_head: You can use this to free a page by RCU. */
                struct rcu_head rcu_head;
        };

        union {                /* This union is 4 bytes in size. */
                /*
                 * If the page can be mapped to userspace, encodes the number
                 * of times this page is referenced by a page table.
                 */
                atomic_t _mapcount;

                /*
                 * If the page is neither PageSlab nor mappable to userspace,
                 * the value stored here may help determine what this page
                 * is used for.  See page-flags.h for a list of page types
                 * which are currently stored here.
                 */
                unsigned int page_type;

                unsigned int active;                /* SLAB */
                int units;                        /* SLOB */
        };

        /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
        atomic_t _refcount;

        /* Present only with CONFIG_MEMCG; the two pointers share storage. */
#ifdef CONFIG_MEMCG
        union {
                struct mem_cgroup *mem_cgroup;
                struct obj_cgroup **obj_cgroups;
        };
#endif

        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
#if defined(WANT_PAGE_VIRTUAL)
        void *virtual;                        /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

        /* Present only when LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined. */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
#endif
} _struct_page_alignment;

/*
 * Return the address of the compound_mapcount field stored in the struct
 * page that immediately follows @page (i.e. page[1]).
 */
static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
        struct page *first_tail = page + 1;

        return &first_tail->compound_mapcount;
}

/*
 * Return the address of the hpage_pinned_refcount field stored two struct
 * pages after @page (i.e. page[2]).
 */
static inline atomic_t *compound_pincount_ptr(struct page *page)
{
        struct page *second_tail = page + 2;

        return &second_tail->hpage_pinned_refcount;
}

/*
 * Used for sizing the vmemmap region on some architectures
 */
#define STRUCT_PAGE_MAX_SHIFT        (order_base_2(sizeof(struct page)))

/*
 * Upper bound on the size of a page fragment cache, and the allocation
 * order derived from it via get_order().
 * NOTE(review): presumably 32768 rounded up to a whole number of pages --
 * confirm against the __ALIGN_MASK() definition.
 */
#define PAGE_FRAG_CACHE_MAX_SIZE        __ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER        get_order(PAGE_FRAG_CACHE_MAX_SIZE)

/* Read accessor for page->private; set_page_private() is the writer. */
#define page_private(page)                ((page)->private)

/*
 * Store @private into page->private.  Counterpart of the page_private()
 * read accessor.
 */
static inline void set_page_private(struct page *page, unsigned long private)
{
        unsigned long *slot = &page->private;

        *slot = private;
}

/*
 * State of a page fragment cache.  When PAGE_SIZE is smaller than
 * PAGE_FRAG_CACHE_MAX_SIZE the struct keeps a 16-bit offset and a 16-bit
 * size; otherwise only a 32-bit offset is kept.
 */
struct page_frag_cache {
        /* NOTE(review): presumably the virtual address of the backing page(s) -- confirm */
        void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
        __u16 offset;
        __u16 size;
#else
        __u32 offset;
#endif
        /* we maintain a pagecount bias, so that we dont dirty cache line
         * containing page->_refcount every time we allocate a fragment.
         */
        unsigned int                pagecnt_bias;
        /* NOTE(review): presumably set when the page came from pfmemalloc reserves -- confirm */
        bool pfmemalloc;
};

/* Type of a VMA flags word (used for vm_region.vm_flags in this header). */
typedef unsigned long vm_flags_t;

/*
 * A region containing a mapping of a non-memory backed file under NOMMU
 * conditions.  These are held in a global tree and are pinned by the VMAs that
 * map parts of them.
 *
 * vm_area_struct points at its region through ->vm_region, a field that
 * only exists when CONFIG_MMU is not set.
 */
struct vm_region {
        struct rb_node        vm_rb;                /* link in global region tree */
        vm_flags_t        vm_flags;        /* VMA vm_flags */
        unsigned long        vm_start;        /* start address of region */
        unsigned long        vm_end;                /* region initialised to here */
        unsigned long        vm_top;                /* region allocated to here */
        unsigned long        vm_pgoff;        /* the offset in vm_file corresponding to vm_start */
        struct file        *vm_file;        /* the backing file or NULL */

        int                vm_usage;        /* region usage count (access under nommu_region_sem) */
        /* single-bit bitfield */
        bool                vm_icache_flushed : 1; /* true if the icache has been flushed for
                                                * this region */
};

/*
 * Per-VMA userfaultfd context wrapper, embedded unconditionally in
 * vm_area_struct.  With CONFIG_USERFAULTFD it holds a single context
 * pointer; without it the struct has no members.  NULL_VM_UFFD_CTX is the
 * matching "no context" compound literal for either configuration.
 */
#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
        struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */

/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 *
 * The struct is marked __randomize_layout, so code must not rely on the
 * declared field order.
 */
struct vm_area_struct {
        /* The first cache line has the info for VMA tree walking. */

        unsigned long vm_start;                /* Our start address within vm_mm. */
        unsigned long vm_end;                /* The first byte after our end address
                                           within vm_mm. */

        /* linked list of VM areas per task, sorted by address */
        struct vm_area_struct *vm_next, *vm_prev;

        /* NOTE(review): presumably this VMA's node in mm_struct.mm_rb -- confirm */
        struct rb_node vm_rb;

        /*
         * Largest free memory gap in bytes to the left of this VMA.
         * Either between this VMA and vma->vm_prev, or between one of the
         * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
         * get_unmapped_area find a free area of the right size.
         */
        unsigned long rb_subtree_gap;

        /* Second cache line starts here. */

        struct mm_struct *vm_mm;        /* The address space we belong to. */

        /*
         * Access permissions of this VMA.
         * See vmf_insert_mixed_prot() for discussion.
         */
        pgprot_t vm_page_prot;
        unsigned long vm_flags;                /* Flags, see mm.h. */

        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
         */
        struct {
                struct rb_node rb;
                unsigned long rb_subtree_last;
        } shared;

        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
         * list, after a COW of one of the file pages.        A MAP_SHARED vma
         * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
         * or brk vma (with NULL file) can only be in an anon_vma list.
         */
        struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                          * page_table_lock */
        struct anon_vma *anon_vma;        /* Serialized by page_table_lock */

        /* Function pointers to deal with this struct. */
        const struct vm_operations_struct *vm_ops;

        /* Information about our backing store: */
        unsigned long vm_pgoff;                /* Offset (within vm_file) in PAGE_SIZE
                                           units */
        struct file * vm_file;                /* File we map to (can be NULL). */
        void * vm_private_data;                /* was vm_pte (shared mem) */

        /* The next three fields exist only in the matching configurations. */
#ifdef CONFIG_SWAP
        atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
        struct vm_region *vm_region;        /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
        struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
#endif
        /* Always present; has no members when CONFIG_USERFAULTFD is off. */
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;

/*
 * One entry of a singly linked list of tasks (linked through ->next).
 * core_state embeds one of these as its "dumper" member.
 */
struct core_thread {
        struct task_struct *task;
        struct core_thread *next;
};

/*
 * Core dump bookkeeping, reached through mm_struct.core_state
 * ("coredumping support").
 * NOTE(review): field roles below are read off the names -- confirm against
 * the coredump code.
 */
struct core_state {
        atomic_t nr_threads;        /* presumably threads still to be accounted for */
        struct core_thread dumper;        /* head entry of the core_thread list */
        struct completion startup;        /* presumably completed once all threads are ready */
};

struct kioctx_table;
/*
 * Descriptor of one address space.  All fixed-size members live inside a
 * single anonymous struct marked __randomize_layout; the flexible
 * cpu_bitmap[] array sits outside it and must remain the last member.
 */
struct mm_struct {
        struct {
                struct vm_area_struct *mmap;                /* list of VMAs */
                struct rb_root mm_rb;
                u64 vmacache_seqnum;                   /* per-thread vmacache */
#ifdef CONFIG_MMU
                unsigned long (*get_unmapped_area) (struct file *filp,
                                unsigned long addr, unsigned long len,
                                unsigned long pgoff, unsigned long flags);
#endif
                unsigned long mmap_base;        /* base of mmap area */
                unsigned long mmap_legacy_base;        /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
                /* Base addresses for compatible mmap() */
                unsigned long mmap_compat_base;
                unsigned long mmap_compat_legacy_base;
#endif
                unsigned long task_size;        /* size of task vm space */
                unsigned long highest_vm_end;        /* highest vma end address */
                pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
                /**
                 * @membarrier_state: Flags controlling membarrier behavior.
                 *
                 * This field is close to @pgd to hopefully fit in the same
                 * cache-line, which needs to be touched by switch_mm().
                 */
                atomic_t membarrier_state;
#endif

                /**
                 * @mm_users: The number of users including userspace.
                 *
                 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
                 * drops to 0 (i.e. when the task exits and there are no other
                 * temporary reference holders), we also release a reference on
                 * @mm_count (which may then free the &struct mm_struct if
                 * @mm_count also drops to 0).
                 */
                atomic_t mm_users;

                /**
                 * @mm_count: The number of references to &struct mm_struct
                 * (@mm_users count as 1).
                 *
                 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
                 * &struct mm_struct is freed.
                 */
                atomic_t mm_count;

                /**
                 * @has_pinned: Whether this mm has pinned any pages.  This can
                 * be either replaced in the future by @pinned_vm when it
                 * becomes stable, or grow into a counter on its own. We're
                 * aggressive on this bit now - even if the pinned pages were
                 * unpinned later on, we'll still keep this bit set for the
                 * lifecycle of this mm just for simplicity.
                 */
                atomic_t has_pinned;

#ifdef CONFIG_MMU
                atomic_long_t pgtables_bytes;        /* PTE page table pages */
#endif
                int map_count;                        /* number of VMAs */

                spinlock_t page_table_lock; /* Protects page tables and some
                                             * counters
                                             */
                /*
                 * With some kernel config, the current mmap_lock's offset
                 * inside 'mm_struct' is at 0x120, which is very optimal, as
                 * its two hot fields 'count' and 'owner' sit in 2 different
                 * cachelines,  and when mmap_lock is highly contended, both
                 * of the 2 fields will be accessed frequently, current layout
                 * will help to reduce cache bouncing.
                 *
                 * So please be careful with adding new fields before
                 * mmap_lock, which can easily push the 2 fields into one
                 * cacheline.
                 */
                struct rw_semaphore mmap_lock;

                struct list_head mmlist; /* List of maybe swapped mm's.        These
                                          * are globally strung together off
                                          * init_mm.mmlist, and are protected
                                          * by mmlist_lock
                                          */


                unsigned long hiwater_rss; /* High-watermark of RSS usage */
                unsigned long hiwater_vm;  /* High-water virtual memory usage */

                unsigned long total_vm;           /* Total pages mapped */
                unsigned long locked_vm;   /* Pages that have PG_mlocked set */
                atomic64_t    pinned_vm;   /* Refcount permanently increased */
                unsigned long data_vm;           /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
                unsigned long exec_vm;           /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
                unsigned long stack_vm;           /* VM_STACK */
                unsigned long def_flags;

                /**
                 * @write_protect_seq: Locked when any thread is write
                 * protecting pages mapped by this mm to enforce a later COW,
                 * for instance during page table copying for fork().
                 */
                seqcount_t write_protect_seq;

                spinlock_t arg_lock; /* protect the below fields */

                unsigned long start_code, end_code, start_data, end_data;
                unsigned long start_brk, brk, start_stack;
                unsigned long arg_start, arg_end, env_start, env_end;

                unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

                /*
                 * Special counters, in some configurations protected by the
                 * page_table_lock, in other configurations by being atomic.
                 */
                struct mm_rss_stat rss_stat;

                struct linux_binfmt *binfmt;

                /* Architecture-specific MM context */
                mm_context_t context;

                unsigned long flags; /* Must use atomic bitops to access */

                struct core_state *core_state; /* coredumping support */

#ifdef CONFIG_AIO
                spinlock_t                        ioctx_lock;
                struct kioctx_table __rcu        *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
                /*
                 * "owner" points to a task that is regarded as the canonical
                 * user/owner of this mm. All of the following must be true in
                 * order for it to be changed:
                 *
                 * current == mm->owner
                 * current->mm != mm
                 * new_owner->mm == mm
                 * new_owner->alloc_lock is held
                 */
                struct task_struct __rcu *owner;
#endif
                struct user_namespace *user_ns;

                /* store ref to file /proc/<pid>/exe symlink points to */
                struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
                struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
                pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
                /*
                 * numa_next_scan is the next time that the PTEs will be marked
                 * pte_numa. NUMA hinting faults will gather statistics and
                 * migrate pages to new nodes if necessary.
                 */
                unsigned long numa_next_scan;

                /* Restart point for scanning and setting pte_numa */
                unsigned long numa_scan_offset;

                /* numa_scan_seq prevents two threads setting pte_numa */
                int numa_scan_seq;
#endif
                /*
                 * An operation with batched TLB flushing is going on. Anything
                 * that can move process memory needs to flush the TLB when
                 * moving a PROT_NONE or PROT_NUMA mapped page.
                 */
                atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
                /* See flush_tlb_batched_pending() */
                bool tlb_flush_batched;
#endif
                struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
                atomic_long_t hugetlb_usage;
#endif
                struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_SUPPORT
                u32 pasid;
#endif
        } __randomize_layout;

        /*
         * The mm_cpumask needs to be at the end of mm_struct, because it
         * is dynamically sized based on nr_cpu_ids.
         */
        unsigned long cpu_bitmap[];
};

extern struct mm_struct init_mm;

/*
 * Clear the CPU mask kept in the trailing cpu_bitmap[] of @mm.
 *
 * Pointer magic because the dynamic array size confuses some compilers:
 * the mask address is computed by hand as the address of @mm plus the
 * offset of the cpu_bitmap member instead of taking the member's address.
 */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
        unsigned long cpu_bitmap = (unsigned long)mm;

        cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
        cpumask_clear((struct cpumask *)cpu_bitmap);
}

/*
 * Future-safe accessor for struct mm_struct's CPU mask.
 *
 * Returns the trailing cpu_bitmap[] storage of @mm viewed as a
 * struct cpumask; no copy is made.
 */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
        return (struct cpumask *)&mm->cpu_bitmap;
}

struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                                unsigned long start, unsigned long end);
void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long start, unsigned long end);
extern void tlb_finish_mmu(struct mmu_gather *tlb,
                                unsigned long start, unsigned long end);

/* Reset @mm's tlb_flush_pending counter to zero (no flush in flight). */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_set(&mm->tlb_flush_pending, 0);
}

/*
 * Announce that a batched TLB flush is about to start on @mm by bumping
 * mm->tlb_flush_pending.  Paired with dec_tlb_flush_pending().
 */
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_inc(&mm->tlb_flush_pending);
        /*
         * The only time this value is relevant is when there are indeed pages
         * to flush. And we'll only flush pages after changing them, which
         * requires the PTL.
         *
         * So the ordering here is:
         *
         *        atomic_inc(&mm->tlb_flush_pending);
         *        spin_lock(&ptl);
         *        ...
         *        set_pte_at();
         *        spin_unlock(&ptl);
         *
         *                                spin_lock(&ptl)
         *                                mm_tlb_flush_pending();
         *                                ....
         *                                spin_unlock(&ptl);
         *
         *        flush_tlb_range();
         *        atomic_dec(&mm->tlb_flush_pending);
         *
         * Where the increment is constrained by the PTL unlock, it thus
         * ensures that the increment is visible if the PTE modification is
         * visible. After all, if there is no PTE modification, nobody cares
         * about TLB flushes either.
         *
         * This very much relies on users (mm_tlb_flush_pending() and
         * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
         * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
         * locks (PPC) the unlock of one doesn't order against the lock of
         * another PTL.
         *
         * The decrement is ordered by the flush_tlb_range(), such that
         * mm_tlb_flush_pending() will not return false unless all flushes have
         * completed.
         */
}

/*
 * Drop one pending-flush reference on @mm; the counterpart of
 * inc_tlb_flush_pending().
 */
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * See inc_tlb_flush_pending().
         *
         * This cannot be smp_mb__before_atomic() because smp_mb() simply does
         * not order against TLB invalidate completion, which is what we need.
         *
         * Therefore we must rely on tlb_flush_*() to guarantee order.
         */
        atomic_dec(&mm->tlb_flush_pending);
}

/* Return true when @mm's tlb_flush_pending counter is non-zero. */
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * Must be called after having acquired the PTL; orders against that
         * PTL's release and therefore ensures that if we observe the modified
         * PTE we must also observe the increment from inc_tlb_flush_pending().
         *
         * That is, it only guarantees to return true if there is a flush
         * pending for _this_ PTL.
         */
        return atomic_read(&mm->tlb_flush_pending);
}

/*
 * Return true when more than one flush is pending on @mm, i.e. the
 * tlb_flush_pending counter is greater than one.
 */
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
        /*
         * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
         * for which there is a TLB flush pending in order to guarantee
         * we've seen both that PTE modification and the increment.
         *
         * (no requirement on actually still holding the PTL, that is irrelevant)
         */
        return atomic_read(&mm->tlb_flush_pending) > 1;
}

struct vm_fault;

/**
 * typedef vm_fault_t - Return type for page fault handlers.
 *
 * Page fault handlers return a bitmask of %VM_FAULT values (see
 * enum vm_fault_reason).  The type is declared __bitwise; the enum
 * constants are produced from plain integers with an explicit __force cast.
 */
typedef __bitwise unsigned int vm_fault_t;

/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * @VM_FAULT_OOM:               Out Of Memory
 * @VM_FAULT_SIGBUS:            Bad access
 * @VM_FAULT_MAJOR:             Page read from storage
 * @VM_FAULT_WRITE:             Special case for get_user_pages
 * @VM_FAULT_HWPOISON:          Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:    Hit poisoned large page. Index encoded
 *                              in upper bits
 * @VM_FAULT_SIGSEGV:           Segmentation fault
 * @VM_FAULT_NOPAGE:            ->fault installed the pte and did not
 *                              return a page
 * @VM_FAULT_LOCKED:            ->fault locked the returned page
 * @VM_FAULT_RETRY:             ->fault blocked, must retry
 * @VM_FAULT_FALLBACK:          Huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:          ->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:         ->fault did not modify page tables and needs
 *                              fsync() to complete (for synchronous page faults
 *                              in DAX)
 * @VM_FAULT_HINDEX_MASK:       Mask of the bits holding the HINDEX value
 *                              (see VM_FAULT_SET_HINDEX/VM_FAULT_GET_HINDEX)
 *
 * Each flag other than the mask is a distinct single bit; value 0x000080
 * is not assigned to any flag in this enum.
 */
enum vm_fault_reason {
        VM_FAULT_OOM            = (__force vm_fault_t)0x000001,
        VM_FAULT_SIGBUS         = (__force vm_fault_t)0x000002,
        VM_FAULT_MAJOR          = (__force vm_fault_t)0x000004,
        VM_FAULT_WRITE          = (__force vm_fault_t)0x000008,
        VM_FAULT_HWPOISON       = (__force vm_fault_t)0x000010,
        VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
        VM_FAULT_SIGSEGV        = (__force vm_fault_t)0x000040,
        VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
        VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
        VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
        VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
        VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
        VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
        VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
};

/*
 * Encode/decode the hstate index for a hwpoisoned large page.
 *
 * The index lives in bits 16-19 of a vm_fault_t, the bits covered by
 * VM_FAULT_HINDEX_MASK.  SET shifts @x into place without masking it, so
 * @x is expected to fit in four bits; GET shifts back down and masks
 * with 0xf.
 */
#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf)

/*
 * Union of the fault-result bits that are grouped as errors: OOM, SIGBUS,
 * SIGSEGV, both HWPOISON variants and FALLBACK.
 */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |        \
                        VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |        \
                        VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

/*
 * Comma-separated list of { flag, "name" } initializers, one for every
 * vm_fault_reason bit except VM_FAULT_HINDEX_MASK.
 * NOTE(review): presumably consumed by tracepoint flag-printing helpers;
 * confirm against the users of this macro.
 */
#define VM_FAULT_RESULT_TRACE \
        { VM_FAULT_OOM,                 "OOM" },        \
        { VM_FAULT_SIGBUS,              "SIGBUS" },        \
        { VM_FAULT_MAJOR,               "MAJOR" },        \
        { VM_FAULT_WRITE,               "WRITE" },        \
        { VM_FAULT_HWPOISON,            "HWPOISON" },        \
        { VM_FAULT_HWPOISON_LARGE,      "HWPOISON_LARGE" },        \
        { VM_FAULT_SIGSEGV,             "SIGSEGV" },        \
        { VM_FAULT_NOPAGE,              "NOPAGE" },        \
        { VM_FAULT_LOCKED,              "LOCKED" },        \
        { VM_FAULT_RETRY,               "RETRY" },        \
        { VM_FAULT_FALLBACK,            "FALLBACK" },        \
        { VM_FAULT_DONE_COW,            "DONE_COW" },        \
        { VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" }

/*
 * Description of a named special mapping (the example given for the name
 * is "[vdso]"): either a fixed page array or a fault callback backs it.
 */
struct vm_special_mapping {
        const char *name;        /* The name, e.g. "[vdso]". */

        /*
         * If .fault is not provided, this points to a
         * NULL-terminated array of pages that back the special mapping.
         *
         * This must not be NULL unless .fault is provided.
         */
        struct page **pages;

        /*
         * If non-NULL, then this is called to resolve page faults
         * on the special mapping.  If used, .pages is not checked.
         */
        vm_fault_t (*fault)(const struct vm_special_mapping *sm,
                                struct vm_area_struct *vma,
                                struct vm_fault *vmf);

        /*
         * Optional callback receiving the mapping and the new VMA.
         * NOTE(review): presumably invoked when the mapping is moved by
         * mremap() and returns 0 or a negative errno; confirm against
         * the caller.
         */
        int (*mremap)(const struct vm_special_mapping *sm,
                     struct vm_area_struct *new_vma);
};

/*
 * Identifiers for why a TLB flush happened.  NR_TLB_FLUSH_REASONS is the
 * last enumerator and therefore equals the number of reasons before it.
 * NOTE(review): presumably used for flush statistics/tracing; confirm
 * against the users.
 */
enum tlb_flush_reason {
        TLB_FLUSH_ON_TASK_SWITCH,
        TLB_REMOTE_SHOOTDOWN,
        TLB_LOCAL_SHOOTDOWN,
        TLB_LOCAL_MM_SHOOTDOWN,
        TLB_REMOTE_SEND_IPI,
        NR_TLB_FLUSH_REASONS,
};

 /*
  * A swap entry has to fit into an "unsigned long", as the entry is hidden
  * in the "index" field of the swapper address space.  It is wrapped in a
  * struct so that it is a distinct type rather than a bare integer.
  */
typedef struct {
        unsigned long val;
} swp_entry_t;

#endif /* _LINUX_MM_TYPES_H */



































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  SR-IPv6 implementation
 *
 *  Author:
 *  David Lebrun <david.lebrun@uclouvain.be>
 */

#ifndef _NET_SEG6_H
#define _NET_SEG6_H

#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/seg6.h>
#include <linux/rhashtable-types.h>

/*
 * Adjust skb->csum for one 32-bit word changing from @from to @to.
 *
 * The complement of the old word and the new word are summed into the
 * complemented current checksum, and the result is complemented again.
 */
static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
                                     __be32 to)
{
        __be32 delta[2];

        delta[0] = ~from;
        delta[1] = to;

        skb->csum = ~csum_partial((char *)delta, sizeof(delta), ~skb->csum);
}

/*
 * Adjust skb->csum for a 16-byte value (four 32-bit words) changing from
 * @from to @to.
 *
 * The scratch buffer holds the complemented old words followed by the new
 * words; it is summed into the complemented current checksum and the
 * result is complemented again.
 */
static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
                                      __be32 *to)
{
        __be32 delta[8];
        int w;

        for (w = 0; w < 4; w++) {
                delta[w] = ~from[w];
                delta[w + 4] = to[w];
        }

        skb->csum = ~csum_partial((char *)delta, sizeof(delta), ~skb->csum);
}

/*
 * Per-network-namespace SR-IPv6 state; seg6_pernet() returns a pointer to
 * it from net->ipv6.seg6_data.
 */
struct seg6_pernet_data {
        /* NOTE(review): presumably serializes updates to this state; confirm. */
        struct mutex lock;
        /* RCU-annotated pointer; presumably the tunnel source address. */
        struct in6_addr __rcu *tun_src;
#ifdef CONFIG_IPV6_SEG6_HMAC
        /* Hash table of HMAC infos; only present with CONFIG_IPV6_SEG6_HMAC. */
        struct rhashtable hmac_infos;
#endif
};

/*
 * Return the SR-IPv6 per-namespace data attached to @net, or NULL when
 * the kernel is built without CONFIG_IPV6.
 */
static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
{
        struct seg6_pernet_data *sdata = NULL;

#if IS_ENABLED(CONFIG_IPV6)
        sdata = net->ipv6.seg6_data;
#endif
        return sdata;
}

extern int seg6_init(void);
extern void seg6_exit(void);
extern int seg6_iptunnel_init(void);
extern void seg6_iptunnel_exit(void);
extern int seg6_local_init(void);
extern void seg6_local_exit(void);

extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced);
extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
                             int proto);
extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh);
extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
                               u32 tbl_id);
#endif












































    4 





    4 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* "Raw" instruction opcodes */
#define __ASM_CLAC        ".byte 0x0f,0x01,0xca"
#define __ASM_STAC        ".byte 0x0f,0x01,0xcb"

#ifdef __ASSEMBLY__

/*
 * Assembler-source flavour of ASM_CLAC/ASM_STAC.  With CONFIG_X86_SMAP
 * each macro is an ALTERNATIVE whose default is empty and whose
 * replacement, selected by X86_FEATURE_SMAP, is the raw opcode; without
 * CONFIG_X86_SMAP both macros expand to nothing.
 */
#ifdef CONFIG_X86_SMAP

#define ASM_CLAC \
        ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP

#define ASM_STAC \
        ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP

#else /* CONFIG_X86_SMAP */

#define ASM_CLAC
#define ASM_STAC

#endif /* CONFIG_X86_SMAP */

#else /* __ASSEMBLY__ */

#ifdef CONFIG_X86_SMAP

/*
 * Emit the CLAC opcode when the CPU has X86_FEATURE_SMAP; otherwise the
 * (empty) default alternative is used and nothing is emitted.
 */
static __always_inline void clac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_CLAC, X86_FEATURE_SMAP);
}

/*
 * Emit the STAC opcode when the CPU has X86_FEATURE_SMAP; otherwise the
 * (empty) default alternative is used and nothing is emitted.
 */
static __always_inline void stac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_STAC, X86_FEATURE_SMAP);
}

/*
 * Save the flags register and then execute CLAC.
 *
 * With X86_FEATURE_SMAP the leading "jmp 1f" is patched out, so the asm
 * pushes the flags, pops them into @flags and runs CLAC.  Without the
 * feature the jump stays and skips straight to label 1.
 *
 * NOTE(review): in the no-SMAP case the asm never writes the output
 * operand, so the returned value looks unspecified and should only be
 * handed back to smap_restore(); confirm against the callers.
 *
 * Return: the value to pass to smap_restore().
 */
static __always_inline unsigned long smap_save(void)
{
        unsigned long flags;

        asm volatile ("# smap_save\n\t"
                      ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
                      "pushf; pop %0; " __ASM_CLAC "\n\t"
                      "1:"
                      : "=rm" (flags) : : "memory", "cc");

        return flags;
}

/*
 * Restore the flags register from @flags, as returned by smap_save().
 *
 * With X86_FEATURE_SMAP the leading "jmp 1f" is patched out and the asm
 * pushes @flags and pops it into the flags register; without the feature
 * the jump skips the sequence and @flags is ignored.
 */
static __always_inline void smap_restore(unsigned long flags)
{
        asm volatile ("# smap_restore\n\t"
                      ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
                      "push %0; popf\n\t"
                      "1:"
                      : : "g" (flags) : "memory", "cc");
}

/*
 * These macros can be used in asm() statements: each expands to an
 * ALTERNATIVE() whose default is empty and whose replacement, selected by
 * X86_FEATURE_SMAP, is the raw CLAC/STAC opcode.
 */
#define ASM_CLAC \
        ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP)
#define ASM_STAC \
        ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)

#else /* CONFIG_X86_SMAP */

/*
 * !CONFIG_X86_SMAP stubs: clac()/stac()/smap_restore() do nothing,
 * smap_save() returns 0, and the asm() macros expand to nothing.
 */
static inline void clac(void) { }
static inline void stac(void) { }

static inline unsigned long smap_save(void) { return 0; }
static inline void smap_restore(unsigned long flags) { }

#define ASM_CLAC
#define ASM_STAC

#endif /* CONFIG_X86_SMAP */

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SMAP_H */















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
 *
 * Copyright 2003-2004 Red Hat, Inc.
 * Copyright 2005 Hewlett-Packard Development Company, L.P.
 * Copyright 2005 IBM Corporation
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include "audit.h"

/*
 * Locking model:
 *
 * audit_filter_mutex:
 *                Synchronizes writes and blocking reads of audit's filterlist
 *                data.  Rcu is used to traverse the filterlist and access
 *                contents of structs audit_entry, audit_watch and opaque
 *                LSM rules during filtering.  If modified, these structures
 *                must be copied and replace their counterparts in the filterlist.
 *                An audit_parent struct is not accessed during filtering, so may
 *                be written directly provided audit_filter_mutex is held.
 */

/*
 * Audit filter lists, defined in <linux/audit.h>: one list head per
 * filter type, each initialised as an empty list.  The #error below
 * forces this initialiser to be revisited if AUDIT_NR_FILTERS changes.
 */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
        LIST_HEAD_INIT(audit_filter_list[0]),
        LIST_HEAD_INIT(audit_filter_list[1]),
        LIST_HEAD_INIT(audit_filter_list[2]),
        LIST_HEAD_INIT(audit_filter_list[3]),
        LIST_HEAD_INIT(audit_filter_list[4]),
        LIST_HEAD_INIT(audit_filter_list[5]),
        LIST_HEAD_INIT(audit_filter_list[6]),
#if AUDIT_NR_FILTERS != 7
#error Fix audit_filter_list initialiser
#endif
};
/*
 * File-local companion array with one empty list head per filter type.
 * It carries no size guard of its own; the AUDIT_NR_FILTERS != 7 check in
 * the audit_filter_list initialiser covers this one too, so both must be
 * extended together.
 * NOTE(review): presumably holds the rules in the order they are listed
 * back to user space; confirm against the users of this array.
 */
static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
        LIST_HEAD_INIT(audit_rules_list[0]),
        LIST_HEAD_INIT(audit_rules_list[1]),
        LIST_HEAD_INIT(audit_rules_list[2]),
        LIST_HEAD_INIT(audit_rules_list[3]),
        LIST_HEAD_INIT(audit_rules_list[4]),
        LIST_HEAD_INIT(audit_rules_list[5]),
        LIST_HEAD_INIT(audit_rules_list[6]),
};

DEFINE_MUTEX(audit_filter_mutex);

/*
 * Release the LSM string and LSM rule attached to @f, but only when the
 * field is one of the subject/object LSM field types; any other field
 * type is left untouched.
 */
static void audit_free_lsm_field(struct audit_field *f)
{
        switch (f->type) {
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                break;
        default:
                /* Not an LSM field: nothing to free. */
                return;
        }

        kfree(f->lsm_str);
        security_audit_rule_free(f->lsm_rule);
}

/*
 * Free an audit entry together with everything its rule holds: the watch
 * reference (if any), the per-field LSM data, the field array, the filter
 * key and finally the entry itself.
 */
static inline void audit_free_rule(struct audit_entry *e)
{
        struct audit_krule *krule = &e->rule;
        int idx;

        /* some rules don't have associated watches */
        if (krule->watch)
                audit_put_watch(krule->watch);

        if (krule->fields) {
                for (idx = 0; idx < krule->field_count; idx++)
                        audit_free_lsm_field(krule->fields + idx);
        }

        kfree(krule->fields);
        kfree(krule->filterkey);
        kfree(e);
}

/*
 * RCU-head flavoured wrapper: recover the audit_entry that embeds @head
 * as its "rcu" member and free it with audit_free_rule().
 */
void audit_free_rule_rcu(struct rcu_head *head)
{
        audit_free_rule(container_of(head, struct audit_entry, rcu));
}

/*
 * Allocate a zeroed audit filterlist entry together with a zeroed array of
 * @field_count fields, and hook the array up as entry->rule.fields.
 *
 * Returns the new entry, or NULL if either allocation fails (nothing is
 * leaked in that case).
 */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
        struct audit_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

        if (unlikely(!e))
                return NULL;

        e->rule.fields = kcalloc(field_count, sizeof(struct audit_field),
                                 GFP_KERNEL);
        if (unlikely(!e->rule.fields)) {
                kfree(e);
                return NULL;
        }

        return e;
}

/*
 * Unpack a filter field's string representation from the user-space
 * buffer.
 *
 * Copies @len bytes from *@bufp into a freshly allocated, NUL-terminated
 * buffer, then advances *@bufp and reduces *@remain by @len.
 *
 * Returns the new string, or ERR_PTR(-EINVAL) for a NULL buffer, a zero
 * length or a length exceeding *@remain, ERR_PTR(-ENAMETOOLONG) when @len
 * is larger than PATH_MAX, and ERR_PTR(-ENOMEM) on allocation failure.
 * On error neither *@bufp nor *@remain is modified.
 */
char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
{
        char *copy;

        if (!*bufp)
                return ERR_PTR(-EINVAL);
        if (len == 0 || len > *remain)
                return ERR_PTR(-EINVAL);

        /* Of the currently implemented string fields, PATH_MAX
         * defines the longest valid length.
         */
        if (len > PATH_MAX)
                return ERR_PTR(-ENAMETOOLONG);

        copy = kmalloc(len + 1, GFP_KERNEL);
        if (unlikely(!copy))
                return ERR_PTR(-ENOMEM);

        memcpy(copy, *bufp, len);
        copy[len] = '\0';

        /* Consume the bytes just copied. */
        *remain -= len;
        *bufp += len;

        return copy;
}

/*
 * Translate an inode field to kernel representation.
 *
 * The field is accepted only on the exit filter list, only when the rule
 * has no inode field, watch or tree yet, and only with the "equal" or
 * "not equal" operators.  On success @f is recorded as krule->inode_f.
 *
 * Returns 0 on success or -EINVAL when any of the conditions fails.
 */
static inline int audit_to_inode(struct audit_krule *krule,
                                 struct audit_field *f)
{
        if (krule->listnr != AUDIT_FILTER_EXIT)
                return -EINVAL;

        if (krule->inode_f || krule->watch || krule->tree)
                return -EINVAL;

        if (f->op != Audit_equal && f->op != Audit_not_equal)
                return -EINVAL;

        krule->inode_f = f;
        return 0;
}

/*
 * Registered syscall classes, indexed by class number.  Each non-NULL slot
 * points to a bitmask of AUDIT_BITMASK_SIZE words installed by
 * audit_register_class(); unregistered slots stay NULL.
 */
static __u32 *classes[AUDIT_SYSCALL_CLASSES];

/*
 * Register the syscall class @class.
 *
 * @class: slot in classes[] to fill; must be in
 *         [0, AUDIT_SYSCALL_CLASSES) and not registered yet.
 * @list:  syscall numbers belonging to the class, terminated by ~0U.
 *
 * Builds a bitmask of AUDIT_BITMASK_SIZE words with one bit set per listed
 * syscall and installs it in classes[@class].
 *
 * Returns 0 on success, -ENOMEM if the bitmask cannot be allocated, or
 * -EINVAL if a listed syscall number does not fit in the bitmask (the top
 * AUDIT_SYSCALL_CLASSES bits are excluded), or if @class is out of range
 * or already registered.  Nothing is leaked on failure.
 */
int __init audit_register_class(int class, unsigned *list)
{
        __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
        if (!p)
                return -ENOMEM;
        while (*list != ~0U) {
                unsigned n = *list++;
                if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
                        kfree(p);
                        return -EINVAL;
                }
                p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
        }
        /*
         * @class is a signed int, so the upper-bound test alone would let
         * a negative value through and index classes[] out of bounds.
         */
        if (class < 0 || class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
                kfree(p);
                return -EINVAL;
        }
        classes[class] = p;
        return 0;
}

/*
 * Test whether @syscall belongs to the registered syscall class @class.
 *
 * Returns non-zero when the bit for @syscall is set in the class bitmask.
 * Returns 0 when it is not set, when @syscall is beyond the bitmask, or
 * when @class is out of range or has not been registered.
 */
int audit_match_class(int class, unsigned syscall)
{
        if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
                return 0;
        /*
         * @class is a signed int: reject negative values explicitly, since
         * the upper-bound test alone would let them index classes[] out of
         * bounds.
         */
        if (unlikely(class < 0 || class >= AUDIT_SYSCALL_CLASSES ||
                     !classes[class]))
                return 0;
        return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
}

#ifdef CONFIG_AUDITSYSCALL
/*
 * Check @mask against the bitmask registered for @class.
 *
 * Returns 0 as soon as a word of @mask shares a set bit with the class
 * bitmask.  Returns 1 when there is no overlap, or when the class has not
 * been registered.
 */
static inline int audit_match_class_bits(int class, u32 *mask)
{
        int word;

        if (!classes[class])
                return 1;

        for (word = 0; word < AUDIT_BITMASK_SIZE; word++) {
                if (mask[word] & classes[class][word])
                        return 0;
        }

        return 1;
}

static int audit_match_signal(struct audit_entry *entry)
{
        struct audit_field *arch = entry->rule.arch_f;

        if (!arch) {
                /* When arch is unspecified, we must check both masks on biarch
                 * as syscall number alone is ambiguous. */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
                                               entry->rule.mask) &&
                        audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
                                               entry->rule.mask));
        }

        switch(audit_classify_arch(arch->val)) {
        case 0: /* native */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
                                               entry->rule.mask));
        case 1: /* 32bit on biarch */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
                                               entry->rule.mask));
        default:
                return 1;
        }
}
#endif

/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
        unsigned listnr;
        struct audit_entry *entry;
        int i, err;

        err = -EINVAL;
        listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
        switch(listnr) {
        default:
                goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
        case AUDIT_FILTER_ENTRY:
                pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
                goto exit_err;
        case AUDIT_FILTER_EXIT:
        case AUDIT_FILTER_TASK:
#endif
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                ;
        }
        if (unlikely(rule->action == AUDIT_POSSIBLE)) {
                pr_err("AUDIT_POSSIBLE is deprecated\n");
                goto exit_err;
        }
        if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
                goto exit_err;
        if (rule->field_count > AUDIT_MAX_FIELDS)
                goto exit_err;

        err = -ENOMEM;
        entry = audit_init_entry(rule->field_count);
        if (!entry)
                goto exit_err;

        entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
        entry->rule.listnr = listnr;
        entry->rule.action = rule->action;
        entry->rule.field_count = rule->field_count;

        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                entry->rule.mask[i] = rule->mask[i];

        for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
                int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
                __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
                __u32 *class;

                if (!(*p & AUDIT_BIT(bit)))
                        continue;
                *p &= ~AUDIT_BIT(bit);
                class = classes[i];
                if (class) {
                        int j;
                        for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
                                entry->rule.mask[j] |= class[j];
                }
        }

        return entry;

exit_err:
        return ERR_PTR(err);
}

/*
 * Maps each kernel-internal operator index (Audit_equal, ...) to the
 * corresponding AUDIT_* operator constant.  audit_to_op() searches this
 * table to translate in the opposite direction.
 */
static u32 audit_ops[] =
{
        [Audit_equal] = AUDIT_EQUAL,
        [Audit_not_equal] = AUDIT_NOT_EQUAL,
        [Audit_bitmask] = AUDIT_BIT_MASK,
        [Audit_bittest] = AUDIT_BIT_TEST,
        [Audit_lt] = AUDIT_LESS_THAN,
        [Audit_gt] = AUDIT_GREATER_THAN,
        [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
        [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
};

/*
 * Translate the AUDIT_* operator constant @op into the kernel-internal
 * operator index by scanning audit_ops[] from Audit_equal upwards.
 *
 * Returns the matching index, or Audit_bad when @op is not in the table.
 */
static u32 audit_to_op(u32 op)
{
        u32 idx = Audit_equal;

        while (idx < Audit_bad && audit_ops[idx] != op)
                idx++;

        return idx;
}

/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
        switch (f->type) {
        case AUDIT_MSGTYPE:
                if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
                    entry->rule.listnr != AUDIT_FILTER_USER)
                        return -EINVAL;
                break;
        case AUDIT_FSTYPE:
                if (entry->rule.listnr != AUDIT_FILTER_FS)
                        return -EINVAL;
                break;
        }

        switch (entry->rule.listnr) {
        case AUDIT_FILTER_FS:
                switch(f->type) {
                case AUDIT_FSTYPE:
                case AUDIT_FILTERKEY:
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* Check for valid field type and op */
        switch (f->type) {
        case AUDIT_ARG0:
        case AUDIT_ARG1:
        case AUDIT_ARG2:
        case AUDIT_ARG3:
        case AUDIT_PERS: /* <uapi/linux/personality.h> */
        case AUDIT_DEVMINOR:
                /* all ops are valid */
                break;
        case AUDIT_UID:
        case AUDIT_EUID:
        case AUDIT_SUID:
        case AUDIT_FSUID:
        case AUDIT_LOGINUID:
        case AUDIT_OBJ_UID:
        case AUDIT_GID:
        case AUDIT_EGID:
        case AUDIT_SGID:
        case AUDIT_FSGID:
        case AUDIT_OBJ_GID:
        case AUDIT_PID:
        case AUDIT_MSGTYPE:
        case AUDIT_PPID:
        case AUDIT_DEVMAJOR:
        case AUDIT_EXIT:
        case AUDIT_SUCCESS:
        case AUDIT_INODE:
        case AUDIT_SESSIONID:
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
        case AUDIT_SADDR_FAM:
                /* bit ops are only useful on syscall args */
                if (f->op == Audit_bitmask || f->op == Audit_bittest)
                        return -EINVAL;
                break;
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
        case AUDIT_WATCH:
        case AUDIT_DIR:
        case AUDIT_FILTERKEY:
        case AUDIT_LOGINUID_SET:
        case AUDIT_ARCH:
        case AUDIT_FSTYPE:
        case AUDIT_PERM:
        case AUDIT_FILETYPE:
        case AUDIT_FIELD_COMPARE:
        case AUDIT_EXE:
                /* only equal and not equal valid ops */
                if (f->op != Audit_not_equal && f->op != Audit_equal)
                        return -EINVAL;
                break;
        default:
                /* field not recognized */
                return -EINVAL;
        }

        /* Check for select valid field values */
        switch (f->type) {
        case AUDIT_LOGINUID_SET:
                if ((f->val != 0) && (f->val != 1))
                        return -EINVAL;
                break;
        case AUDIT_PERM:
                if (f->val & ~15)
                        return -EINVAL;
                break;
        case AUDIT_FILETYPE:
                if (f->val & ~S_IFMT)
                        return -EINVAL;
                break;
        case AUDIT_FIELD_COMPARE:
                if (f->val > AUDIT_MAX_FIELD_COMPARE)
                        return -EINVAL;
                break;
        case AUDIT_SADDR_FAM:
                if (f->val >= AF_MAX)
                        return -EINVAL;
                break;
        default:
                break;
        }

        return 0;
}

/*
 * Translate struct audit_rule_data to kernel's rule representation.
 *
 * @data:   rule as supplied by the caller
 * @datasz: total size of @data in bytes, including the trailing string
 *          buffer; everything past sizeof(struct audit_rule_data) is
 *          treated as string payload for audit_unpack_string()
 *
 * The common part of the rule is built by audit_to_entry_common(); this
 * function then walks the fields, validates each with audit_field_valid()
 * and stores the value in the representation that fits the field type
 * (kuid/kgid, LSM string + rule, watch, tree, filter key, exe mark or
 * plain value).
 *
 * Returns the new entry on success.  On failure an ERR_PTR() is returned:
 * the one produced by audit_to_entry_common() if that step failed, or
 * ERR_PTR(err) after the partially built entry has been released.
 */
static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                               size_t datasz)
{
        int err = 0;
        struct audit_entry *entry;
        void *bufp;
        size_t remain = datasz - sizeof(struct audit_rule_data);
        int i;
        char *str;
        struct audit_fsnotify_mark *audit_mark;

        entry = audit_to_entry_common(data);
        if (IS_ERR(entry))
                goto exit_nofree;

        bufp = data->buf;
        for (i = 0; i < data->field_count; i++) {
                struct audit_field *f = &entry->rule.fields[i];
                u32 f_val;
                u32 prev_val;

                err = -EINVAL;

                f->op = audit_to_op(data->fieldflags[i]);
                if (f->op == Audit_bad)
                        goto exit_free;

                f->type = data->fields[i];
                f_val = data->values[i];

                /* Support legacy tests for a valid loginuid */
                if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
                        f->type = AUDIT_LOGINUID_SET;
                        f_val = 0;
                        entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
                }

                /*
                 * audit_field_valid() range-checks f->val, but the value
                 * supplied by the caller is still only in f_val: it is
                 * stored in the member matching the field type by the
                 * switch below.  Expose it through f->val just for the
                 * check, then put the previous contents back so that the
                 * error path and the cases that never assign f->val see
                 * the field exactly as they did before.
                 * NOTE(review): f->val presumably shares storage with the
                 * uid/gid/lsm_str members - confirm against the
                 * struct audit_field definition.
                 */
                prev_val = f->val;
                f->val = f_val;
                err = audit_field_valid(entry, f);
                f->val = prev_val;
                if (err)
                        goto exit_free;

                err = -EINVAL;
                switch (f->type) {
                case AUDIT_LOGINUID:
                case AUDIT_UID:
                case AUDIT_EUID:
                case AUDIT_SUID:
                case AUDIT_FSUID:
                case AUDIT_OBJ_UID:
                        /* map the numeric id into the caller's user ns */
                        f->uid = make_kuid(current_user_ns(), f_val);
                        if (!uid_valid(f->uid))
                                goto exit_free;
                        break;
                case AUDIT_GID:
                case AUDIT_EGID:
                case AUDIT_SGID:
                case AUDIT_FSGID:
                case AUDIT_OBJ_GID:
                        f->gid = make_kgid(current_user_ns(), f_val);
                        if (!gid_valid(f->gid))
                                goto exit_free;
                        break;
                case AUDIT_ARCH:
                        f->val = f_val;
                        entry->rule.arch_f = f;
                        break;
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        /* for string fields f_val is the string length */
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        f->lsm_str = str;
                        err = security_audit_rule_init(f->type, f->op, str,
                                                       (void **)&f->lsm_rule,
                                                       GFP_KERNEL);
                        /* Keep currently invalid fields around in case they
                         * become valid after a policy reload. */
                        if (err == -EINVAL) {
                                pr_warn("audit rule for LSM \'%s\' is invalid\n",
                                        str);
                                err = 0;
                        } else if (err)
                                goto exit_free;
                        break;
                case AUDIT_WATCH:
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        err = audit_to_watch(&entry->rule, str, f_val, f->op);
                        if (err) {
                                /* the string is still ours on failure */
                                kfree(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        break;
                case AUDIT_DIR:
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        err = audit_make_tree(&entry->rule, str, f->op);
                        /* the string is released here on success and failure */
                        kfree(str);
                        if (err)
                                goto exit_free;
                        entry->rule.buflen += f_val;
                        break;
                case AUDIT_INODE:
                        f->val = f_val;
                        err = audit_to_inode(&entry->rule, f);
                        if (err)
                                goto exit_free;
                        break;
                case AUDIT_FILTERKEY:
                        /* at most one key per rule, bounded in length */
                        if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        entry->rule.filterkey = str;
                        break;
                case AUDIT_EXE:
                        /* at most one exe per rule, bounded by PATH_MAX */
                        if (entry->rule.exe || f_val > PATH_MAX)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
                        if (IS_ERR(audit_mark)) {
                                kfree(str);
                                err = PTR_ERR(audit_mark);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        entry->rule.exe = audit_mark;
                        break;
                default:
                        f->val = f_val;
                        break;
                }
        }

        /* an inode field compared with "!=" is not kept as inode_f */
        if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
                entry->rule.inode_f = NULL;

exit_nofree:
        return entry;

exit_free:
        if (entry->rule.tree)
                audit_put_tree(entry->rule.tree); /* that's the temporary one */
        if (entry->rule.exe)
                audit_remove_mark(entry->rule.exe); /* that's the template one */
        audit_free_rule(entry);
        return ERR_PTR(err);
}

/*
 * Append the bytes of @str, without its terminating NUL, at the position
 * *@bufp points to and advance *@bufp past them.
 *
 * Returns the number of bytes copied, i.e. strlen(@str).
 */
static inline size_t audit_pack_string(void **bufp, const char *str)
{
        const size_t nbytes = strlen(str);
        char *dst = *bufp;

        memcpy(dst, str, nbytes);
        *bufp = dst + nbytes;

        return nbytes;
}

/*
 * Translate kernel rule representation to struct audit_rule_data.
 *
 * A buffer of sizeof(*data) + krule->buflen bytes is allocated; only the
 * fixed-size header is zeroed, the string area is filled in by
 * audit_pack_string() as the fields are walked.  For string-valued fields
 * data->values[i] receives the packed length and data->buflen the running
 * total; every other field gets its numeric value copied.
 *
 * Returns the new buffer, or NULL if the allocation failed.  The caller
 * owns the result (audit_list_rules() kfree()s it).
 */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
        struct audit_rule_data *data;
        void *bufp;
        int i;

        data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
        if (unlikely(data == NULL))
                return NULL;
        memset(data, 0, sizeof(*data));

        data->flags = krule->flags | krule->listnr;
        data->action = krule->action;
        data->field_count = krule->field_count;

        bufp = data->buf;
        for (i = 0; i < data->field_count; i++) {
                struct audit_field *f = &krule->fields[i];

                data->fields[i] = f->type;
                data->fieldflags[i] = audit_ops[f->op];

                switch (f->type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        data->values[i] = audit_pack_string(&bufp, f->lsm_str);
                        data->buflen += data->values[i];
                        break;
                case AUDIT_WATCH:
                        data->values[i] =
                                audit_pack_string(&bufp,
                                                  audit_watch_path(krule->watch));
                        data->buflen += data->values[i];
                        break;
                case AUDIT_DIR:
                        data->values[i] =
                                audit_pack_string(&bufp,
                                                  audit_tree_path(krule->tree));
                        data->buflen += data->values[i];
                        break;
                case AUDIT_FILTERKEY:
                        data->values[i] =
                                audit_pack_string(&bufp, krule->filterkey);
                        data->buflen += data->values[i];
                        break;
                case AUDIT_EXE:
                        data->values[i] =
                                audit_pack_string(&bufp,
                                                  audit_mark_path(krule->exe));
                        data->buflen += data->values[i];
                        break;
                case AUDIT_LOGINUID_SET:
                        /*
                         * A rule that arrived as the legacy
                         * "loginuid == unset" test is reported back in
                         * that same legacy form.
                         */
                        if ((krule->pflags & AUDIT_LOGINUID_LEGACY) &&
                            !f->val) {
                                data->fields[i] = AUDIT_LOGINUID;
                                data->values[i] = AUDIT_UID_UNSET;
                                break;
                        }
                        fallthrough;        /* if set */
                default:
                        data->values[i] = f->val;
                }
        }

        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                data->mask[i] = krule->mask[i];

        return data;
}

/* Compare two rules in kernel format.  Considered success if rules
 * don't match. */
static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
{
        int i;

        if (a->flags != b->flags ||
            a->pflags != b->pflags ||
            a->listnr != b->listnr ||
            a->action != b->action ||
            a->field_count != b->field_count)
                return 1;

        for (i = 0; i < a->field_count; i++) {
                if (a->fields[i].type != b->fields[i].type ||
                    a->fields[i].op != b->fields[i].op)
                        return 1;

                switch(a->fields[i].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
                                return 1;
                        break;
                case AUDIT_WATCH:
                        if (strcmp(audit_watch_path(a->watch),
                                   audit_watch_path(b->watch)))
                                return 1;
                        break;
                case AUDIT_DIR:
                        if (strcmp(audit_tree_path(a->tree),
                                   audit_tree_path(b->tree)))
                                return 1;
                        break;
                case AUDIT_FILTERKEY:
                        /* both filterkeys exist based on above type compare */
                        if (strcmp(a->filterkey, b->filterkey))
                                return 1;
                        break;
                case AUDIT_EXE:
                        /* both paths exist based on above type compare */
                        if (strcmp(audit_mark_path(a->exe),
                                   audit_mark_path(b->exe)))
                                return 1;
                        break;
                case AUDIT_UID:
                case AUDIT_EUID:
                case AUDIT_SUID:
                case AUDIT_FSUID:
                case AUDIT_LOGINUID:
                case AUDIT_OBJ_UID:
                        if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
                                return 1;
                        break;
                case AUDIT_GID:
                case AUDIT_EGID:
                case AUDIT_SGID:
                case AUDIT_FSGID:
                case AUDIT_OBJ_GID:
                        if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
                                return 1;
                        break;
                default:
                        if (a->fields[i].val != b->fields[i].val)
                                return 1;
                }
        }

        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                if (a->mask[i] != b->mask[i])
                        return 1;

        return 0;
}

/* Duplicate LSM field information.  The lsm_rule is opaque, so must be
 * re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
                                           struct audit_field *sf)
{
        int ret = 0;
        char *lsm_str;

        /* our own copy of lsm_str */
        lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
        if (unlikely(!lsm_str))
                return -ENOMEM;
        df->lsm_str = lsm_str;

        /* our own (refreshed) copy of lsm_rule */
        ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
                                       (void **)&df->lsm_rule, GFP_KERNEL);
        /* Keep currently invalid fields around in case they
         * become valid after a policy reload. */
        if (ret == -EINVAL) {
                pr_warn("audit rule for LSM \'%s\' is invalid\n",
                        df->lsm_str);
                ret = 0;
        }

        return ret;
}

/* Duplicate an audit rule.  This will be a deep copy with the exception
 * of the watch - that pointer is carried over.  The LSM specific fields
 * will be updated in the copy.  The point is to be able to replace the old
 * rule with the new rule in the filterlist, then free the old rule.
 * The rlist element is undefined; list manipulations are handled apart from
 * the initial copy.
 *
 * Returns the new entry, or an ERR_PTR(): -ENOMEM if the entry or a string
 * copy could not be allocated, otherwise the error reported by
 * audit_dupe_lsm_field() or audit_dupe_exe(). */
struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
        u32 fcount = old->field_count;
        struct audit_entry *entry;
        struct audit_krule *new;
        char *fk;
        int i, err = 0;

        entry = audit_init_entry(fcount);
        if (unlikely(!entry))
                return ERR_PTR(-ENOMEM);

        /* plain members are copied as they are */
        new = &entry->rule;
        new->flags = old->flags;
        new->pflags = old->pflags;
        new->listnr = old->listnr;
        new->action = old->action;
        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                new->mask[i] = old->mask[i];
        new->prio = old->prio;
        new->buflen = old->buflen;
        new->inode_f = old->inode_f;
        new->field_count = old->field_count;

        /*
         * note that we are OK with not refcounting here; audit_match_tree()
         * never dereferences tree and we can't get false positives there
         * since we'd have to have rule gone from the list *and* removed
         * before the chunks found by lookup had been allocated, i.e. before
         * the beginning of list scan.
         */
        new->tree = old->tree;
        /* bitwise copy first; pointer-carrying fields are fixed up below */
        memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);

        /* deep copy this information, updating the lsm_rule fields, because
         * the originals will all be freed when the old rule is freed. */
        for (i = 0; i < fcount; i++) {
                switch (new->fields[i].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        err = audit_dupe_lsm_field(&new->fields[i],
                                                       &old->fields[i]);
                        break;
                case AUDIT_FILTERKEY:
                        fk = kstrdup(old->filterkey, GFP_KERNEL);
                        if (unlikely(!fk))
                                err = -ENOMEM;
                        else
                                new->filterkey = fk;
                        break;
                case AUDIT_EXE:
                        err = audit_dupe_exe(new, old);
                        break;
                }
                /* stop at the first failure and release the partial copy */
                if (err) {
                        if (new->exe)
                                audit_remove_mark(new->exe);
                        audit_free_rule(entry);
                        return ERR_PTR(err);
                }
        }

        /* the watch is shared with the old rule: take a reference on it */
        if (old->watch) {
                audit_get_watch(old->watch);
                new->watch = old->watch;
        }

        return entry;
}

/* Find an existing audit rule.
 * Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
                                           struct list_head **p)
{
        struct audit_entry *e, *found = NULL;
        struct list_head *list;
        int h;

        if (entry->rule.inode_f) {
                h = audit_hash_ino(entry->rule.inode_f->val);
                *p = list = &audit_inode_hash[h];
        } else if (entry->rule.watch) {
                /* we don't know the inode number, so must walk entire hash */
                for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
                        list = &audit_inode_hash[h];
                        list_for_each_entry(e, list, list)
                                if (!audit_compare_rule(&entry->rule, &e->rule)) {
                                        found = e;
                                        goto out;
                                }
                }
                goto out;
        } else {
                *p = list = &audit_filter_list[entry->rule.listnr];
        }

        list_for_each_entry(e, list, list)
                if (!audit_compare_rule(&entry->rule, &e->rule)) {
                        found = e;
                        goto out;
                }

out:
        return found;
}

/*
 * Priority counters handed out by audit_add_rule() to rules on the
 * AUDIT_FILTER_EXIT list: a prepended rule takes ++prio_high, any other
 * rule takes --prio_low.  Both start next to the middle of the u64 range.
 */
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;

/* Add rule to given filterlist if not a duplicate.
 *
 * Takes audit_filter_mutex for the whole operation.  Returns 0 on success,
 * -EEXIST if audit_find_rule() finds an equivalent rule, or the error from
 * audit_add_watch() / audit_add_tree_rule(). */
static inline int audit_add_rule(struct audit_entry *entry)
{
        struct audit_entry *e;
        struct audit_watch *watch = entry->rule.watch;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
        int err = 0;
#ifdef CONFIG_AUDITSYSCALL
        int dont_count = 0;

        /* If any of these, don't count towards total */
        switch(entry->rule.listnr) {
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                dont_count = 1;
        }
#endif

        mutex_lock(&audit_filter_mutex);
        e = audit_find_rule(entry, &list);
        if (e) {
                mutex_unlock(&audit_filter_mutex);
                err = -EEXIST;
                /* normally audit_add_tree_rule() will free it on failure */
                if (tree)
                        audit_put_tree(tree);
                return err;
        }

        if (watch) {
                /* audit_filter_mutex is dropped and re-taken during this call */
                err = audit_add_watch(&entry->rule, &list);
                if (err) {
                        mutex_unlock(&audit_filter_mutex);
                        /*
                         * normally audit_add_tree_rule() will free it
                         * on failure
                         */
                        if (tree)
                                audit_put_tree(tree);
                        return err;
                }
        }
        if (tree) {
                err = audit_add_tree_rule(&entry->rule);
                if (err) {
                        mutex_unlock(&audit_filter_mutex);
                        return err;
                }
        }

        /* only rules on the exit list get a priority from the counters */
        entry->rule.prio = ~0ULL;
        if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
                if (entry->rule.flags & AUDIT_FILTER_PREPEND)
                        entry->rule.prio = ++prio_high;
                else
                        entry->rule.prio = --prio_low;
        }

        /*
         * Link the rule into both lists: audit_rules_list[] (walked by
         * audit_list_rules()) and the list chosen above, which is updated
         * with the RCU list primitives.  The PREPEND flag only selects head
         * or tail insertion and is cleared once it has been honoured.
         */
        if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
                list_add(&entry->rule.list,
                         &audit_rules_list[entry->rule.listnr]);
                list_add_rcu(&entry->list, list);
                entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
        } else {
                list_add_tail(&entry->rule.list,
                              &audit_rules_list[entry->rule.listnr]);
                list_add_tail_rcu(&entry->list, list);
        }
#ifdef CONFIG_AUDITSYSCALL
        if (!dont_count)
                audit_n_rules++;

        if (!audit_match_signal(entry))
                audit_signals++;
#endif
        mutex_unlock(&audit_filter_mutex);

        return err;
}

/* Remove an existing rule from filterlist.
 *
 * @entry is only the search template: the rule that gets unlinked and
 * freed (after an RCU grace period, via call_rcu()) is the installed one
 * found by audit_find_rule().  Returns 0 on success or -ENOENT if no
 * equivalent rule is installed. */
int audit_del_rule(struct audit_entry *entry)
{
        struct audit_entry  *e;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
        int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
        int dont_count = 0;

        /* If any of these, don't count towards total */
        switch(entry->rule.listnr) {
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                dont_count = 1;
        }
#endif

        mutex_lock(&audit_filter_mutex);
        e = audit_find_rule(entry, &list);
        if (!e) {
                ret = -ENOENT;
                goto out;
        }

        /* detach whatever the installed rule is hooked up to */
        if (e->rule.watch)
                audit_remove_watch_rule(&e->rule);

        if (e->rule.tree)
                audit_remove_tree_rule(&e->rule);

        if (e->rule.exe)
                audit_remove_mark_rule(&e->rule);

#ifdef CONFIG_AUDITSYSCALL
        /* mirror of the accounting done in audit_add_rule() */
        if (!dont_count)
                audit_n_rules--;

        if (!audit_match_signal(entry))
                audit_signals--;
#endif

        /* unlink from both lists, then free after a grace period */
        list_del_rcu(&e->list);
        list_del(&e->rule.list);
        call_rcu(&e->rcu, audit_free_rule_rcu);

out:
        mutex_unlock(&audit_filter_mutex);

        /* done on success and on -ENOENT alike */
        if (tree)
                audit_put_tree(tree);        /* that's the temporary one */

        return ret;
}

/* List rules using struct audit_rule_data. */
static void audit_list_rules(int seq, struct sk_buff_head *q)
{
        struct sk_buff *skb;
        struct audit_krule *r;
        int i;

        /* This is a blocking read, so use audit_filter_mutex instead of rcu
         * iterator to sync with list writers. */
        for (i=0; i<AUDIT_NR_FILTERS; i++) {
                list_for_each_entry(r, &audit_rules_list[i], list) {
                        struct audit_rule_data *data;

                        data = audit_krule_to_data(r);
                        if (unlikely(!data))
                                break;
                        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
                                               data,
                                               sizeof(*data) + data->buflen);
                        if (skb)
                                skb_queue_tail(q, skb);
                        kfree(data);
                }
        }
        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
        if (skb)
                skb_queue_tail(q, skb);
}

/*
 * Log rule additions and removals.
 *
 * @action: operation name written as "op="
 * @rule:   the rule concerned; its filter key and list number are logged
 * @res:    result written as "res="
 *
 * Nothing is emitted when auditing is disabled or when no audit buffer
 * could be started.
 */
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
        struct audit_buffer *ab;

        if (audit_enabled) {
                ab = audit_log_start(audit_context(), GFP_KERNEL,
                                     AUDIT_CONFIG_CHANGE);
                if (ab) {
                        audit_log_session_info(ab);
                        audit_log_task_context(ab);
                        audit_log_format(ab, " op=%s", action);
                        audit_log_key(ab, rule->filterkey);
                        audit_log_format(ab, " list=%d res=%d",
                                         rule->listnr, res);
                        audit_log_end(ab);
                }
        }
}

/**
 * audit_rule_change - add or remove the rule carried by a netlink message
 * @type: audit message type (AUDIT_ADD_RULE or AUDIT_DEL_RULE)
 * @seq: netlink audit message sequence (serial) number
 * @data: payload data
 * @datasz: size of payload data
 *
 * The payload is translated with audit_data_to_entry() and then handed to
 * audit_add_rule() or audit_del_rule(); the outcome is logged either way.
 * The translated entry is released here unless it was added successfully.
 *
 * Returns 0 on success, -EINVAL (after a WARN) for any other @type, or the
 * error from the translation or from the add/delete step.
 */
int audit_rule_change(int type, int seq, void *data, size_t datasz)
{
        struct audit_entry *entry;
        int err;

        if (type != AUDIT_ADD_RULE && type != AUDIT_DEL_RULE) {
                WARN_ON(1);
                return -EINVAL;
        }

        entry = audit_data_to_entry(data, datasz);
        if (IS_ERR(entry))
                return PTR_ERR(entry);

        if (type == AUDIT_ADD_RULE) {
                err = audit_add_rule(entry);
                audit_log_rule_change("add_rule", &entry->rule, !err);
        } else {
                err = audit_del_rule(entry);
                audit_log_rule_change("remove_rule", &entry->rule, !err);
        }

        /*
         * A deletion only used the entry as a search template, and a failed
         * addition never installed it: in both cases it is released here.
         */
        if (err || type == AUDIT_DEL_RULE) {
                if (entry->rule.exe)
                        audit_remove_mark(entry->rule.exe);
                audit_free_rule(entry);
        }

        return err;
}

/**
 * audit_list_rules_send - list the audit rules
 * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: netlink audit message sequence (serial) number
 */
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
        struct task_struct *tsk;
        struct audit_netlink_list *dest;

        /* We can't just spew out the rules here because we might fill
         * the available socket buffer space and deadlock waiting for
         * auditctl to read from it... which isn't ever going to
         * happen if we're actually running in the context of auditctl
         * trying to _send_ the stuff */

        dest = kmalloc(sizeof(*dest), GFP_KERNEL);
        if (!dest)
                return -ENOMEM;
        dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
        dest->portid = NETLINK_CB(request_skb).portid;
        skb_queue_head_init(&dest->q);

        mutex_lock(&audit_filter_mutex);
        audit_list_rules(seq, &dest->q);
        mutex_unlock(&audit_filter_mutex);

        tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
        if (IS_ERR(tsk)) {
                skb_queue_purge(&dest->q);
                put_net(dest->net);
                kfree(dest);
                return PTR_ERR(tsk);
        }

        return 0;
}

/*
 * Apply the Audit_* operator @op to @left and @right.
 *
 * The relational operators yield 1 or 0.  Audit_bitmask yields the value
 * of (left & right), Audit_bittest yields whether every bit of @right is
 * set in @left.  Any other operator yields 0.
 */
int audit_comparator(u32 left, u32 op, u32 right)
{
        int matched = 0;

        switch (op) {
        case Audit_equal:
                matched = left == right;
                break;
        case Audit_not_equal:
                matched = left != right;
                break;
        case Audit_lt:
                matched = left < right;
                break;
        case Audit_le:
                matched = left <= right;
                break;
        case Audit_gt:
                matched = left > right;
                break;
        case Audit_ge:
                matched = left >= right;
                break;
        case Audit_bitmask:
                matched = left & right;
                break;
        case Audit_bittest:
                matched = (left & right) == right;
                break;
        default:
                break;
        }

        return matched;
}

/**
 * audit_uid_comparator - apply an audit comparison operator to two kuids
 * @left: kuid on the left-hand side of the operator
 * @op: one of the Audit_* operator codes
 * @right: kuid on the right-hand side of the operator
 *
 * Only the equality and ordering operators are meaningful for uids;
 * Audit_bitmask, Audit_bittest and any other code yield 0.
 *
 * Return: non-zero when the comparison holds, 0 otherwise.
 */
int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
        int match = 0;

        switch (op) {
        case Audit_equal:
                match = uid_eq(left, right);
                break;
        case Audit_not_equal:
                match = !uid_eq(left, right);
                break;
        case Audit_lt:
                match = uid_lt(left, right);
                break;
        case Audit_le:
                match = uid_lte(left, right);
                break;
        case Audit_gt:
                match = uid_gt(left, right);
                break;
        case Audit_ge:
                match = uid_gte(left, right);
                break;
        case Audit_bitmask:
        case Audit_bittest:
        default:
                /* bit operators and unknown codes never match */
                break;
        }

        return match;
}

/**
 * audit_gid_comparator - apply an audit comparison operator to two kgids
 * @left: kgid on the left-hand side of the operator
 * @op: one of the Audit_* operator codes
 * @right: kgid on the right-hand side of the operator
 *
 * Only the equality and ordering operators are meaningful for gids;
 * Audit_bitmask, Audit_bittest and any other code yield 0.
 *
 * Return: non-zero when the comparison holds, 0 otherwise.
 */
int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
{
        int match = 0;

        switch (op) {
        case Audit_equal:
                match = gid_eq(left, right);
                break;
        case Audit_not_equal:
                match = !gid_eq(left, right);
                break;
        case Audit_lt:
                match = gid_lt(left, right);
                break;
        case Audit_le:
                match = gid_lte(left, right);
                break;
        case Audit_gt:
                match = gid_gt(left, right);
                break;
        case Audit_ge:
                match = gid_gte(left, right);
                break;
        case Audit_bitmask:
        case Audit_bittest:
        default:
                /* bit operators and unknown codes never match */
                break;
        }

        return match;
}

/**
 * parent_len - find the length of the parent portion of a pathname
 * @path: pathname of which to determine length
 *
 * Trailing slashes are ignored when locating the last component.  The
 * returned length includes the slash that separates the parent from the
 * last component; it is 0 when @path is empty or has no parent portion.
 */
int parent_len(const char *path)
{
        int len = strlen(path);
        int idx;

        if (!len)
                return 0;

        /* step back over any trailing slashes */
        idx = len - 1;
        while (idx > 0 && path[idx] == '/')
                idx--;

        /* step back over the last component itself */
        while (idx > 0 && path[idx] != '/')
                idx--;

        /* stopped on a separator: count it as part of the parent */
        if (path[idx] == '/')
                idx++;

        return idx;
}

/**
 * audit_compare_dname_path - compare given dentry name with last component in
 *                            given path. Return of 0 indicates a match.
 * @dname:     dentry name that we're comparing
 * @path:      full pathname that we're comparing
 * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
 *             here indicates that we must compute this value.
 *
 * Trailing slashes on @path are not counted as part of the last component,
 * in line with parent_len(), which also disregards them when it locates the
 * parent.  Without that, a path such as "dir/" could never match the dentry
 * name "dir".
 *
 * Return: 0 when the last component of @path equals @dname, non-zero
 * otherwise.
 */
int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
        int dlen, pathlen;
        const char *p;

        dlen = dname->len;
        pathlen = strlen(path);
        if (pathlen < dlen)
                return 1;

        if (parentlen == AUDIT_NAME_FULL)
                parentlen = parent_len(path);

        p = path + parentlen;

        /* length of the last component, not counting trailing slashes */
        pathlen -= parentlen;
        while (pathlen > 0 && p[pathlen - 1] == '/')
                pathlen--;

        if (pathlen != dlen)
                return 1;

        return strncmp(p, dname->name, dlen);
}

/**
 * audit_filter - run the current task and a message type through a filter list
 * @msgtype: audit message type, compared by AUDIT_MSGTYPE fields
 * @listtype: index into audit_filter_list[] selecting the list to walk
 *
 * The list is walked under rcu_read_lock().  The fields of each rule are
 * evaluated in order and evaluation of a rule stops at the first field that
 * does not match; the walk stops at the first rule whose evaluation ends
 * with a positive result.
 *
 * Return: 0 when that rule's action is AUDIT_NEVER or @listtype is
 * AUDIT_FILTER_EXCLUDE; 1 in every other case (no rule matched, a field of
 * an unhandled type was met, or a comparison returned a negative value).
 */
int audit_filter(int msgtype, unsigned int listtype)
{
        struct audit_entry *entry;
        int verdict = 1; /* Audit by default */

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &audit_filter_list[listtype], list) {
                int idx;
                int match = 0;

                for (idx = 0; idx < entry->rule.field_count; idx++) {
                        struct audit_field *fld = &entry->rule.fields[idx];

                        switch (fld->type) {
                        case AUDIT_PID: {
                                pid_t self = task_pid_nr(current);

                                match = audit_comparator(self, fld->op, fld->val);
                                break;
                        }
                        case AUDIT_UID:
                                match = audit_uid_comparator(current_uid(),
                                                             fld->op, fld->uid);
                                break;
                        case AUDIT_GID:
                                match = audit_gid_comparator(current_gid(),
                                                             fld->op, fld->gid);
                                break;
                        case AUDIT_LOGINUID:
                                match = audit_uid_comparator(audit_get_loginuid(current),
                                                             fld->op, fld->uid);
                                break;
                        case AUDIT_LOGINUID_SET:
                                match = audit_comparator(audit_loginuid_set(current),
                                                         fld->op, fld->val);
                                break;
                        case AUDIT_MSGTYPE:
                                match = audit_comparator(msgtype, fld->op, fld->val);
                                break;
                        case AUDIT_SUBJ_USER:
                        case AUDIT_SUBJ_ROLE:
                        case AUDIT_SUBJ_TYPE:
                        case AUDIT_SUBJ_SEN:
                        case AUDIT_SUBJ_CLR: {
                                u32 secid;

                                /* without an LSM rule the previous result stands */
                                if (!fld->lsm_rule)
                                        break;
                                security_task_getsecid(current, &secid);
                                match = security_audit_rule_match(secid, fld->type,
                                                                  fld->op,
                                                                  fld->lsm_rule);
                                break;
                        }
                        case AUDIT_EXE:
                                match = audit_exe_compare(current, entry->rule.exe);
                                if (fld->op == Audit_not_equal)
                                        match = !match;
                                break;
                        default:
                                goto out_unlock;
                        }

                        if (match < 0) /* error */
                                goto out_unlock;
                        if (match == 0)
                                break;
                }

                if (match <= 0)
                        continue;

                if (entry->rule.action == AUDIT_NEVER ||
                    listtype == AUDIT_FILTER_EXCLUDE)
                        verdict = 0;
                break;
        }
out_unlock:
        rcu_read_unlock();
        return verdict;
}

/*
 * update_lsm_rule - replace one rule with a freshly duplicated copy
 * @r: rule to refresh; its enclosing struct audit_entry is derived with
 *     container_of()
 *
 * Rules for which security_audit_rule_known() returns zero are left
 * untouched.  Otherwise the rule is duplicated with audit_dupe_rule() and
 * the copy takes the old entry's place on the lists; if duplication fails
 * the old entry is simply unlinked.  Either way the old entry is handed to
 * call_rcu() with audit_free_rule_rcu.
 *
 * Returns 0, or the error encoded in audit_dupe_rule()'s result.
 */
static int update_lsm_rule(struct audit_krule *r)
{
        struct audit_entry *entry = container_of(r, struct audit_entry, rule);
        struct audit_entry *nentry;
        int err = 0;

        if (!security_audit_rule_known(r))
                return 0;

        nentry = audit_dupe_rule(r);
        /* the old entry's exe mark is dropped whether or not the dupe worked */
        if (entry->rule.exe)
                audit_remove_mark(entry->rule.exe);
        if (IS_ERR(nentry)) {
                /* save the first error encountered for the
                 * return value */
                err = PTR_ERR(nentry);
                audit_panic("error updating LSM filters");
                /*
                 * NOTE(review): only r->watch is tested here, while the
                 * success branch below tests r->watch || r->tree - confirm
                 * the difference is intended.
                 */
                if (r->watch)
                        list_del(&r->rlist);
                list_del_rcu(&entry->list);
                list_del(&r->list);
        } else {
                if (r->watch || r->tree)
                        list_replace_init(&r->rlist, &nentry->rule.rlist);
                list_replace_rcu(&entry->list, &nentry->list);
                list_replace(&r->list, &nentry->rule.list);
        }
        /* the old entry is freed through the RCU callback in both cases */
        call_rcu(&entry->rcu, audit_free_rule_rcu);

        return err;
}

/* This function will re-initialize the lsm_rule field of all applicable rules.
 * It will traverse the filter lists searching for rules that contain LSM
 * specific filter fields.  When such a rule is found, it is copied, the
 * LSM field is re-initialized, and the old rule is replaced with the
 * updated rule.
 *
 * Returns 0, or the first non-zero value returned by update_lsm_rule(). */
int audit_update_lsm_rules(void)
{
        struct audit_krule *rule, *next;
        int first_err = 0;
        int idx;

        /* audit_filter_mutex synchronizes the writers */
        mutex_lock(&audit_filter_mutex);

        for (idx = 0; idx < AUDIT_NR_FILTERS; idx++) {
                /* the _safe iterator is used because rules get replaced */
                list_for_each_entry_safe(rule, next, &audit_rules_list[idx], list) {
                        int rc = update_lsm_rule(rule);

                        if (first_err == 0)
                                first_err = rc;
                }
        }

        mutex_unlock(&audit_filter_mutex);

        return first_err;
}








































































































































    5 











    4 


































    1 
    5 





















    5 

    5 




















    5 



    5 



    5 


    5 















    1 

    1 



    1 
    1 



    1 

    1 


    1 





    1 


    1 

    1 



    1 

































    2 
    1 



















    2 


    2 





































































    1 







    6 







































































    1 







    1 
    1 




























































    1 


    1 
    1 
    1 

    1 


















    1 

    1 
    1 

















    1 




    1 

    1 


















































































































































































































































    1 

















    1 

































































    2 





















    1 







    1 


















    4 




    4 






    3 
    1 







    1 




















    1 
























































































































    1 






















    1 

















    1 






























    1 


    1 





    1 






















































































































































    1 






















    1 






    1 

























    1 











































































































































































    2 






    1 
    1 
















    2 






































    1 















    1 






    1 
    1 











    1 












    1 


















    3 

    1 

    1 
    1 































































    1 












    1 

    1 

    1 
    1 
    1 
    1 
    1 
    1 




    1 










    1 

    1 














    1 


    1 
    1 







    1 

    1 




    1 


    1 




















































    1 


    2 



    2 

































    2 



    2 

    2 






























    2 

    2 




    1 

    1 


    1 



    1 



    1 














    1 



    1 


    1 


































































































    1 



    1 
    1 





















    3 
    2 






    3 
    3 












































































    2 
    2 


















    4 




    4 
    4 



    4 







    4 

















    4 






    4 











































































































































    2 


    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

/*
 * Inode hash table state.  Per the locking rules above, inode_hash_lock
 * protects inode_hashtable and inode->i_hash.
 * NOTE(review): i_hash_mask and i_hash_shift are presumably the parameters
 * of the hash function; their users are not visible in this part of the file.
 */
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 *
 * inode_init_always() installs this table as the a_ops of every inode's
 * embedded mapping.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Statistics gathering..
 */
/* Filled in from the per-CPU counters below by proc_nr_inodes(). */
struct inodes_stat_t inodes_stat;

/* Incremented by inode_init_always(), decremented by __destroy_inode(). */
static DEFINE_PER_CPU(unsigned long, nr_inodes);
/* Summed by get_nr_inodes_unused(). */
static DEFINE_PER_CPU(unsigned long, nr_unused);

/*
 * Slab cache that alloc_inode() allocates from when the superblock has no
 * ->alloc_inode, and that free_inode_nonrcu() frees to.
 */
static struct kmem_cache *inode_cachep __read_mostly;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Estimate the number of dirty inodes as "all inodes minus unused inodes".
 * Never returns a negative value.
 */
long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long in_use = get_nr_inodes() - get_nr_inodes_unused();

        if (in_use <= 0)
                return 0;
        return in_use;
}

/*
 * Handle nr_inode sysctl: refresh inodes_stat from the per-CPU counters,
 * then let proc_doulongvec_minmax() service the request.
 */
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
                   void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        inodes_stat.nr_inodes = get_nr_inodes();
        inodes_stat.nr_unused = get_nr_inodes_unused();

        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        return ret;
}
#endif

/*
 * ->open() method that always fails with -ENXIO.  inode_init_always()
 * installs it, via no_open_fops, as the initial i_fop of every inode.
 */
static int no_open(struct inode *inode, struct file *file)
{
        const int err = -ENXIO;

        return err;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 *
 * Return: 0 on success, -ENOMEM if security_inode_alloc() fails.  The
 * per-CPU nr_inodes counter is only incremented on success.
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
        /* shared defaults: no inode operations, and an ->open that fails */
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        atomic64_set(&inode->i_sequence, 0);
        /* the inode starts out with one reference */
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_bdev = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        /* locks take their lockdep class from the filesystem type */
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        /* set up the embedded address_space (inode->i_data) */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        if (sb->s_type->fs_flags & FS_THP_SUPPORT)
                __set_bit(AS_THP_SUPPORT, &mapping->flags);
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->private_data = NULL;
        mapping->writeback_index = 0;
        inode->i_private = NULL;
        /* i_mapping initially points at the inode's own i_data */
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);        /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;

        /* any failure of the security hook is reported as -ENOMEM */
        if (unlikely(security_inode_alloc(inode)))
                return -ENOMEM;
        this_cpu_inc(nr_inodes);

        return 0;
}
EXPORT_SYMBOL(inode_init_always);

/*
 * free_inode_nonrcu - return an inode to inode_cachep
 * @inode: inode to free
 *
 * The object is freed immediately; no RCU callback is scheduled here.
 */
void free_inode_nonrcu(struct inode *inode)
{
        struct kmem_cache *cache = inode_cachep;

        kmem_cache_free(cache, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

/*
 * Free the inode that embeds @head as its i_rcu member: through the
 * inode's own ->free_inode hook when one is set, through
 * free_inode_nonrcu() otherwise.
 */
static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        if (!inode->free_inode) {
                free_inode_nonrcu(inode);
                return;
        }

        inode->free_inode(inode);
}

/*
 * alloc_inode - allocate and initialise an inode for @sb
 *
 * The inode comes from the superblock's ->alloc_inode when it has one and
 * from inode_cachep otherwise, and is then run through inode_init_always().
 *
 * Returns the new inode, or NULL if allocation or initialisation failed.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *sops = sb->s_op;
        struct inode *inode;

        inode = sops->alloc_inode ? sops->alloc_inode(sb)
                                  : kmem_cache_alloc(inode_cachep, GFP_KERNEL);
        if (inode == NULL)
                return NULL;

        if (likely(inode_init_always(sb, inode) == 0))
                return inode;

        /*
         * Initialisation failed: release the inode through the same hooks
         * destroy_inode() uses, but invoke i_callback() directly instead
         * of going through call_rcu().
         */
        if (sops->destroy_inode) {
                sops->destroy_inode(inode);
                if (!sops->free_inode)
                        return NULL;
        }
        inode->free_inode = sops->free_inode;
        i_callback(&inode->i_rcu);

        return NULL;
}

/*
 * __destroy_inode - undo the generic per-inode setup
 * @inode: inode being torn down
 *
 * Detaches writeback, security, fsnotify and file-lock state, settles the
 * superblock's s_remove_count for an inode whose link count is zero, drops
 * cached POSIX ACLs and decrements the per-CPU nr_inodes counter.  The
 * inode memory itself is not freed here.
 */
void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);

        /* a zero link count was added to s_remove_count when it was reached */
        if (inode->i_nlink == 0) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl) {
                if (!is_uncached_acl(inode->i_acl))
                        posix_acl_release(inode->i_acl);
        }
        if (inode->i_default_acl) {
                if (!is_uncached_acl(inode->i_default_acl))
                        posix_acl_release(inode->i_default_acl);
        }
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

static void destroy_inode(struct inode *inode)
{
        const struct super_operations *ops = inode->i_sb->s_op;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                if (!ops->free_inode)
                        return;
        }
        inode->free_inode = ops->free_inode;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 *
 * Warns if the link count is already zero.  When the count
 * reaches zero the superblock's s_remove_count is incremented.
 */
void drop_nlink(struct inode *inode)
{
        WARN_ON(inode->i_nlink == 0);

        inode->__i_nlink -= 1;
        if (inode->i_nlink)
                return;

        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  See
 * drop_nlink() for why we care about i_nlink hitting zero.
 *
 * Does nothing when the link count is already zero, so
 * s_remove_count is incremented at most once per transition.
 */
void clear_nlink(struct inode *inode)
{
        if (!inode->i_nlink)
                return;

        inode->__i_nlink = 0;
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 *
 * A zero @nlink is handed to clear_nlink().  Raising the count
 * from zero decrements the superblock's s_remove_count.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        if (nlink == 0) {
                clear_nlink(inode);
                return;
        }

        /* Yes, some filesystems do change nlink from zero to one */
        if (!inode->i_nlink)
                atomic_long_dec(&inode->i_sb->s_remove_count);

        inode->__i_nlink = nlink;
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * Low-level filesystem helper that replaces open-coded increments of
 * i_nlink; it is the counterpart of drop_nlink().  Raising the count from
 * zero undoes the s_remove_count accounting and warns unless the inode is
 * flagged I_LINKABLE.
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(!inode->i_nlink)) {
                WARN_ON((inode->i_state & I_LINKABLE) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Initialise the members of an address_space that are set up once: the
 * i_pages xarray, the i_mmap tree with its rwsem, and the private list with
 * its spinlock.  The caller is expected to have zeroed the structure.
 */
static void __address_space_init_once(struct address_space *mapping)
{
        /* i_pages xarray */
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);

        /* i_mmap tree and the rwsem that goes with it */
        mapping->i_mmap = RB_ROOT_CACHED;
        init_rwsem(&mapping->i_mmap_rwsem);

        /* private list and its lock */
        spin_lock_init(&mapping->private_lock);
        INIT_LIST_HEAD(&mapping->private_list);
}

/*
 * Zero a whole address_space and then perform the one-time member
 * initialisation on it.
 */
void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(struct address_space));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These initialisations only need to be done once, because the fields
 * are idempotent across uses of the inode, so let the slab allocator
 * know about that.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(struct inode));

        /* hash linkage */
        INIT_HLIST_NODE(&inode->i_hash);

        /* list heads embedded in the inode */
        INIT_LIST_HEAD(&inode->i_lru);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_devices);

        /* embedded address_space and i_size ordering state */
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/*
 * Untyped wrapper around inode_init_once(): @foo is the inode to set up.
 */
static void init_once(void *foo)
{
        inode_init_once(foo);
}

/*
 * Take one more reference on @inode by atomically bumping i_count.
 *
 * inode->i_lock must be held by the caller; this function neither takes
 * nor checks it.
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * Get an additional reference to an inode; the caller must already hold one.
 * A resulting count below two means that was not the case, so warn.
 */
void ihold(struct inode *inode)
{
        int refs = atomic_inc_return(&inode->i_count);

        WARN_ON(refs < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Try to put @inode on its superblock's inode LRU.
 *
 * If list_lru_add() reports success the per-cpu nr_unused counter is
 * bumped.  Otherwise (presumably because the inode is already on the list)
 * the inode is flagged I_REFERENCED instead.
 */
static void inode_lru_list_add(struct inode *inode)
{
        if (!list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
                inode->i_state |= I_REFERENCED;
                return;
        }

        this_cpu_inc(nr_unused);
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * The inode is only added when it is not dirty, not under sync, not being
 * freed, has no references, and its superblock is still SB_ACTIVE.
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
        /* Dirty, syncing or dying inodes do not belong on the LRU */
        if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
                return;

        /* Still referenced */
        if (atomic_read(&inode->i_count))
                return;

        /* Superblock is no longer active */
        if (!(inode->i_sb->s_flags & SB_ACTIVE))
                return;

        inode_lru_list_add(inode);
}


/*
 * Take @inode off its superblock's inode LRU.  The per-cpu nr_unused
 * counter is only decremented when list_lru_del() reports that it actually
 * removed the entry.
 */
static void inode_lru_list_del(struct inode *inode)
{
        if (!list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
                return;

        this_cpu_dec(nr_unused);
}

/*
 * Flag @inode as being worked on by LRU isolation by setting
 * I_LRU_ISOLATING in i_state.
 *
 * The caller must hold inode->i_lock (asserted via lockdep).  A warning is
 * raised if the inode is already marked isolating or is on its way to being
 * freed (I_FREEING / I_WILL_FREE).
 */
static void inode_pin_lru_isolating(struct inode *inode)
{
        lockdep_assert_held(&inode->i_lock);
        WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
        inode->i_state |= I_LRU_ISOLATING;
}

/*
 * Clear I_LRU_ISOLATING on @inode and wake anyone sleeping on that bit
 * (see inode_wait_for_lru_isolating()).
 *
 * Takes and releases inode->i_lock itself, so the caller must not hold it.
 * Warns if the flag was not set.
 */
static void inode_unpin_lru_isolating(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
        inode->i_state &= ~I_LRU_ISOLATING;
        /* Make the cleared bit visible before wake_up_bit() looks for waiters */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
        spin_unlock(&inode->i_lock);
}

/*
 * Block until no LRU isolation is in progress on @inode, i.e. until
 * I_LRU_ISOLATING is clear.
 *
 * Takes inode->i_lock itself, so the caller must not hold it.  The wait is
 * uninterruptible and may sleep.
 */
static void inode_wait_for_lru_isolating(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        if (inode->i_state & I_LRU_ISOLATING) {
                DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
                wait_queue_head_t *wqh;

                wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
                /* Cannot sleep under a spinlock: drop i_lock around the wait */
                spin_unlock(&inode->i_lock);
                __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
                spin_lock(&inode->i_lock);
                /* The flag is not expected to be set again once we were woken */
                WARN_ON(inode->i_state & I_LRU_ISOLATING);
        }
        spin_unlock(&inode->i_lock);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        spin_lock(&inode->i_sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
        spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
        if (!list_empty(&inode->i_sb_list)) {
                spin_lock(&inode->i_sb->s_inode_list_lock);
                list_del_init(&inode->i_sb_list);
                spin_unlock(&inode->i_sb->s_inode_list_lock);
        }
}

/*
 * Map a (superblock, hash value) pair to a bucket index.
 *
 * The superblock pointer is mixed into @hashval, the result is folded by
 * i_hash_shift and finally masked with i_hash_mask.  Note that the division
 * by L1_CACHE_BYTES applies only to (GOLDEN_RATIO_PRIME + hashval), as
 * division binds tighter than xor.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long mixed, scaled, folded;

        mixed = hashval * (unsigned long)sb;
        scaled = (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES;

        folded = mixed ^ scaled;
        folded ^= (folded ^ GOLDEN_RATIO_PRIME) >> i_hash_shift;

        return folded & i_hash_mask;
}

/**
 *        __insert_inode_hash - hash an inode
 *        @inode: unhashed inode
 *        @hashval: unsigned long value used to locate this object in the
 *                inode_hashtable.
 *
 *        Add an inode to the inode hash for this superblock.  The bucket is
 *        chosen from the inode's superblock and @hashval.  The insertion is
 *        done with inode_hash_lock held and inode->i_lock nested inside it.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head_rcu(&inode->i_hash, b);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 *        __remove_inode_hash - remove an inode from the hash
 *        @inode: inode to unhash
 *
 *        Remove an inode from the inode hash table.  The removal is done
 *        with inode_hash_lock held and inode->i_lock nested inside it; the
 *        i_hash node is re-initialised by the removal.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init_rcu(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/*
 * Final consistency checks on an inode that is being freed, followed by
 * marking it I_FREEING | I_CLEAR.
 *
 * BUGs if the mapping still has pages or exceptional entries, if the
 * mapping's private_list or the inode's i_wb_list is non-empty, if
 * I_FREEING is not set, or if I_CLEAR is already set.
 */
void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __delete_from_page_cache())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        BUG_ON(inode->i_data.nrexceptional);
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        /* The caller must have set I_FREEING and taken the inode off the LRU */
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        /* Detach from i_io_list (if queued) and from the superblock list */
        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        /* Let any LRU isolation still working on this inode finish first */
        inode_wait_for_lru_isolating(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode.  We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);

        /*
         * A filesystem providing ->evict_inode() does its own teardown;
         * otherwise truncate the page cache and clear the inode here.
         */
        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        /* Drop block / character device associations, if any */
        if (S_ISBLK(inode->i_mode) && inode->i_bdev)
                bd_forget(inode);
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        /*
         * Wake anybody waiting on the __I_NEW bit.  By now the state must be
         * exactly I_FREEING | I_CLEAR, whichever teardown path ran above.
         */
        spin_lock(&inode->i_lock);
        wake_up_bit(&inode->i_state, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        spin_unlock(&inode->i_lock);

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * The list is private to the caller and holds inodes linked through i_lru,
 * so no locking is needed to walk it.  Each inode is unlinked and evicted
 * in turn, with a reschedule point after every eviction.
 */
static void dispose_list(struct list_head *head)
{
        struct inode *victim;

        while (!list_empty(head)) {
                victim = list_first_entry(head, struct inode, i_lru);
                list_del_init(&victim->i_lru);

                evict(victim);
                cond_resched();
        }
}

/**
 * evict_inodes        - evict all evictable inodes for a superblock
 * @sb:                superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                /* Cheap unlocked peek: skip inodes that are clearly in use */
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                /* Recheck the refcount now that i_lock is held */
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Inodes still being set up or already being freed are skipped */
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* Claim the inode, then queue it on the private list via i_lru */
                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        /* The list lock was dropped, so restart the walk */
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes        - attempt to free all inodes on a superblock
 * @sb:                superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
 */
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
        int busy = 0;
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                /* New or already-dying inodes are skipped and not counted busy */
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Dirty inodes count as busy unless the caller asked to kill them */
                if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }
                /* Referenced inodes are always busy */
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }

                /* Claim the inode, then queue it on the private list via i_lru */
                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);
                /*
                 * Give up the CPU if needed; since the list lock is dropped,
                 * evict what was collected so far and restart the walk.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);

        return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        /* @arg is the caller's private list that collects inodes to free */
        struct list_head *freeable = arg;
        struct inode        *inode = container_of(item, struct inode, i_lru);

        /*
         * we are inverting the lru lock/inode->i_lock here, so use a trylock.
         * If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Inodes that are referenced, or that have any state bit other than
         * I_REFERENCED set (e.g. dirty), are still in use.  We cannot
         * reclaim them now, so just take them off the LRU.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        /*
         * Buffers or page cache are attached: pin the inode with
         * I_LRU_ISOLATING, drop both locks, try to strip the buffers and
         * pages, then ask the walker to retry this entry.
         */
        if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                inode_pin_lru_isolating(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        /* Account the reaped pages to the matching vm event */
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        if (current->reclaim_state)
                                current->reclaim_state->reclaimed_slab += reap;
                }
                inode_unpin_lru_isolating(inode);
                /* Retake the LRU lock that was dropped above before returning */
                spin_lock(lru_lock);
                return LRU_RETRY;
        }

        /* Nothing is holding the inode: claim it and move it to @freeable */
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                     inode_lru_isolate, &freeable);
        dispose_list(&freeable);
        return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Search hash chain @head for an inode that belongs to @sb and is accepted
 * by @test(inode, @data).
 *
 * Called with inode_hash_lock held.
 *
 * Returns the inode with an extra reference taken, NULL if there is no
 * match, or ERR_PTR(-ESTALE) if the matching inode is marked I_CREATING.
 * A match that is being freed is waited for and the search is restarted.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * NOTE(review): i_lock is not released here, so
                         * __wait_on_freeing_inode() presumably drops it
                         * before sleeping - confirm against its definition.
                         */
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 *
 * Instead of calling a test callback it matches on the inode number @ino
 * and the superblock @sb.  Return values are the same as for find_inode():
 * the referenced inode, NULL when nothing matches, or ERR_PTR(-ESTALE) when
 * the match is marked I_CREATING.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * NOTE(review): i_lock is not released here, so
                         * __wait_on_freeing_inode() presumably drops it
                         * before sleeping - confirm against its definition.
                         */
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

/*
 * Return the next inode number from the current CPU's counter.  The result
 * is never 0.
 */
unsigned int get_next_ino(void)
{
        unsigned int *p = &get_cpu_var(last_ino);
        unsigned int res = *p;

#ifdef CONFIG_SMP
        /*
         * The per-cpu counter sits on a batch boundary (this includes the
         * initial value 0): reserve a fresh range of LAST_INO_BATCH numbers
         * from the shared counter and continue from its start.
         */
        if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
                static atomic_t shared_last_ino;
                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

                res = next - LAST_INO_BATCH;
        }
#endif

        res++;
        /* get_next_ino should not provide a 0 inode number */
        if (unlikely(!res))
                res++;
        *p = res;
        put_cpu_var(last_ino);
        return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 *        new_inode_pseudo         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for the given superblock.
 *        The inode won't be chained into the superblock's s_inodes list.
 *        This means:
 *        - the fs can't be unmounted
 *        - quotas, fsnotify and writeback can't work
 *
 *        Returns the new inode, or NULL if the allocation failed.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode = alloc_inode(sb);

        /* Allocation failed: hand the NULL straight back */
        if (!inode)
                return inode;

        spin_lock(&inode->i_lock);
        inode->i_state = 0;
        spin_unlock(&inode->i_lock);

        /* Leave i_sb_list as an empty, self-linked list head */
        INIT_LIST_HEAD(&inode->i_sb_list);
        return inode;
}

/**
 *        new_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for the given superblock and links it into
 *        the superblock's inode list.  The default gfp_mask for allocations
 *        related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.  If HIGHMEM
 *        pages are unsuitable, or it is known that pages allocated for the
 *        page cache are not reclaimable or migratable,
 *        mapping_set_gfp_mask() must be called with suitable flags on the
 *        newly created inode's mapping.
 *
 *        Returns the new inode, or NULL if the allocation failed.
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        /* Warm the list lock's cache line; it is taken right after allocation */
        spin_lock_prefetch(&sb->s_inode_list_lock);

        inode = new_inode_pseudo(sb);
        if (!inode)
                return inode;

        inode_sb_list_add(inode);
        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Give a directory inode's i_rwsem the filesystem type's directory lock
 * class.  Non-directories are left untouched, as are inodes whose i_rwsem
 * no longer carries the filesystem type's default i_mutex_key class.
 */
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        struct file_system_type *fstype;

        if (!S_ISDIR(inode->i_mode))
                return;

        fstype = inode->i_sb->s_type;

        /* Set new key only if filesystem hasn't already changed it */
        if (!lockdep_match_class(&inode->i_rwsem, &fstype->i_mutex_key))
                return;

        /*
         * The lock is re-initialised here, so nobody may actually be
         * holding it at this point.
         */
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &fstype->i_mutex_dir_key);
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:        new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        /* Only inodes still marked I_NEW should be passed in */
        WARN_ON(!(inode->i_state & I_NEW));
        /* Drop both I_NEW and I_CREATING in one go */
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /* Make the cleared bits visible before wake_up_bit() looks for waiters */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/**
 * discard_new_inode - clear I_NEW, wake up any waiters and drop a reference
 * @inode:        new inode to discard
 *
 * Like unlock_new_inode() this clears I_NEW and wakes anyone waiting on it,
 * but it leaves I_CREATING untouched and finishes by dropping a reference
 * with iput().
 */
void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        /* Only inodes still marked I_NEW should be passed in */
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        /* Make the cleared bit visible before wake_up_bit() looks for waiters */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_inodes - lock two inodes (may be regular files but also dirs)
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 * @subclass1: inode lock subclass for the first lock obtained
 * @subclass2: inode lock subclass for the second lock obtained
 *
 * Lock any non-NULL argument. The caller must make sure that if they are
 * passing in two directories, one is not an ancestor of the other.  Zero,
 * one or two objects may be locked by this function.  If both arguments
 * refer to the same inode it is locked only once.
 *
 * Lock order: a directory is locked before a non-directory; two objects of
 * the same kind are locked in order of increasing address.
 */
void lock_two_inodes(struct inode *inode1, struct inode *inode2,
                     unsigned subclass1, unsigned subclass2)
{
        if (!inode1 || !inode2) {
                /*
                 * Make sure @subclass1 will be used for the acquired lock.
                 * This is not strictly necessary (no current caller cares) but
                 * let's keep things consistent.
                 */
                if (!inode1)
                        swap(inode1, inode2);
                goto lock;
        }

        /*
         * If one object is directory and the other is not, we must make sure
         * to lock directory first as the other object may be its child.
         */
        if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
                /* Same kind: lock the lower address first */
                if (inode1 > inode2)
                        swap(inode1, inode2);
        } else if (!S_ISDIR(inode1->i_mode))
                swap(inode1, inode2);
lock:
        if (inode1)
                inode_lock_nested(inode1, subclass1);
        if (inode2 && inode2 != inode1)
                inode_lock_nested(inode2, subclass2);
}

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 *
 * Lock any non-NULL argument that is not a directory.
 * Zero, one or two objects may be locked by this function.  If both
 * arguments refer to the same inode it is locked only once.
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        /* Always lock in order of increasing address */
        if (inode1 > inode2)
                swap(inode1, inode2);

        if (inode1 && !S_ISDIR(inode1->i_mode))
                inode_lock(inode1);
        /* The second lock is taken with the I_MUTEX_NONDIR2 subclass */
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
                inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 *
 * Unlocks every non-NULL argument that is not a directory.  If both
 * arguments refer to the same inode it is unlocked only once.
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1 && !S_ISDIR(inode1->i_mode))
                inode_unlock(inode1);

        /* Nothing more to do for a missing or duplicate second inode */
        if (!inode2 || inode2 == inode1)
                return;

        if (!S_ISDIR(inode2->i_mode))
                inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:        pre-allocated inode to use for insert to cache
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is
 * a variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;
        /* Sampled up front; decides below whether to add to the sb list */
        bool creating = inode->i_state & I_CREATING;

again:
        spin_lock(&inode_hash_lock);
        old = find_inode(inode->i_sb, head, test, data);
        if (unlikely(old)) {
                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                /* find_inode() reported an error: nothing usable, return NULL */
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                /* The old inode got unhashed while we waited: look again */
                if (unlikely(inode_unhashed(old))) {
                        iput(old);
                        goto again;
                }
                return old;
        }

        /* A non-zero return from @set aborts the insert; NULL is returned */
        if (set && unlikely(set(inode, data))) {
                inode = NULL;
                goto unlock;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head_rcu(&inode->i_hash, head);
        spin_unlock(&inode->i_lock);
        /* Inodes that came in marked I_CREATING are not added to the sb list here */
        if (!creating)
                inode_sb_list_add(inode);
unlock:
        spin_unlock(&inode_hash_lock);

        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the allocation fails or the insertion does not yield an
 * inode.
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode, *new;

        /* Fast path: already cached */
        inode = ilookup5(sb, hashval, test, data);
        if (inode)
                return inode;

        new = alloc_inode(sb);
        if (!new)
                return inode;

        new->i_state = 0;
        inode = inode_insert5(new, hashval, test, set, data);

        /* Somebody else won the race, or the insert failed: drop our copy */
        if (unlikely(inode != new))
                destroy_inode(new);

        return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the allocation fails or if the hash lookup yields an
 * error pointer.
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        /* First pass: look for an already-hashed inode. */
        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);
        if (inode) {
                /* An error pointer from the lookup is reported as NULL. */
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                /*
                 * The inode was removed from the hash while we waited:
                 * drop our reference and start over.
                 */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
                return inode;
        }

        /* Not cached: allocate outside the hash lock, then re-check. */
        inode = alloc_inode(sb);
        if (inode) {
                struct inode *old;

                spin_lock(&inode_hash_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        /*
                         * Still nobody there: publish our inode in the hash
                         * with I_NEW set, under both inode_hash_lock and
                         * i_lock.
                         */
                        inode->i_ino = ino;
                        spin_lock(&inode->i_lock);
                        inode->i_state = I_NEW;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        inode_sb_list_add(inode);
                        spin_unlock(&inode_hash_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                spin_unlock(&inode_hash_lock);
                destroy_inode(inode);
                if (IS_ERR(old))
                        return NULL;
                inode = old;
                wait_on_inode(inode);
                /* Same unhashed-while-waiting retry as in the first pass. */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * Walk the hash bucket for (@sb, @ino) looking for an inode that already
 * carries this inode number on this superblock.  A hit means the number
 * is taken and must not be handed out again.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;
                if (cur->i_ino == ino)
                        return 0;
        }

        return 1;
}

/**
 *        iunique - get a unique inode number
 *        @sb: superblock
 *        @max_reserved: highest reserved inode number
 *
 *        Hand out an inode number above @max_reserved that is not currently
 *        present in the inode hash for @sb. This is used by file systems
 *        that have no natural permanent inode numbering system.
 *
 *        BUGS:
 *        With a large number of inodes live on the file system this function
 *        currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t candidate;

        rcu_read_lock();
        spin_lock(&iunique_lock);
        for (;;) {
                /* Never hand out a number in the reserved range. */
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                candidate = counter++;

                /* Keep advancing until the number is unused on @sb. */
                if (test_inode_iunique(sb, candidate))
                        break;
        }
        spin_unlock(&iunique_lock);
        rcu_read_unlock();

        return candidate;
}
EXPORT_SYMBOL(iunique);

/*
 * igrab - grab an inode unless it is being torn down
 *
 * Under i_lock, call __iget() on @inode and return it, provided neither
 * I_FREEING nor I_WILL_FREE is set.  Otherwise return NULL: this handles
 * the case where s_op->clear_inode has not been called yet and somebody
 * calls igrab() while the inode is getting freed.
 */
struct inode *igrab(struct inode *inode)
{
        struct inode *grabbed = NULL;

        spin_lock(&inode->i_lock);
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
                __iget(inode);
                grabbed = inode;
        }
        spin_unlock(&inode->i_lock);

        return grabbed;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Look up the inode identified by @hashval and @data in the inode cache.
 * On a hit the inode is returned with an incremented reference count.
 * NULL is returned on a miss, and also when the lookup produces an error
 * pointer.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *found;

        spin_lock(&inode_hash_lock);
        found = find_inode(sb, bucket, test, data);
        spin_unlock(&inode_hash_lock);

        if (IS_ERR(found))
                return NULL;
        return found;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 * returned with an incremented reference count.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;
again:
        inode = ilookup5_nowait(sb, hashval, test, data);
        if (inode) {
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 * Returns NULL when nothing is found or when the lookup yields an error
 * pointer.  An inode that is found unhashed after waiting on it is dropped
 * and the search is repeated.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        for (;;) {
                spin_lock(&inode_hash_lock);
                inode = find_inode_fast(sb, bucket, ino);
                spin_unlock(&inode_hash_lock);

                if (!inode)
                        return NULL;
                if (IS_ERR(inode))
                        return NULL;

                wait_on_inode(inode);
                if (likely(!inode_unhashed(inode)))
                        return inode;

                /* Unhashed while we waited: drop it and look again. */
                iput(inode);
        }
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @match:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 *
 * Returns the matching inode, or NULL if none matched or the search was
 * stopped by @match.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur, *result = NULL;
        int verdict;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;

                verdict = match(cur, hashval, data);
                if (verdict == 0)
                        continue;

                /* Any non-zero verdict ends the walk; only 1 is a hit. */
                if (verdict == 1)
                        result = cur;
                break;
        }
        spin_unlock(&inode_hash_lock);

        return result;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @hashval:        Key to hash
 * @test:        Function to test match on an inode
 * @data:        Data for test function
 *
 * Walk the hash bucket selected by @sb and @hashval and return the first
 * inode on @sb that has neither I_FREEING nor I_WILL_FREE set and for
 * which @test returns non-zero.  The @test function must be responsible
 * for taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized.
 *
 * Returns the matching inode, or NULL if there is none.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
                             int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                if (test(cur, data))
                        return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @ino:        The inode number to match
 *
 * Walk the hash bucket selected by @sb and @ino and return the first inode
 * whose inode number is @ino, whose super block is @sb, and which has
 * neither I_FREEING nor I_WILL_FREE set in i_state.
 *
 * Returns the matching inode, or NULL if there is none.  This function
 * itself takes no lock and no reference on the inode it returns.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
                                    unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_by_ino_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino)
                        continue;
                if (cur->i_sb != sb)
                        continue;
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/*
 * insert_inode_locked - hash @inode under its own i_ino, marked I_NEW
 *
 * Looks in the hash bucket for another live inode (one without I_FREEING
 * or I_WILL_FREE) with the same inode number and super block.
 *
 * Returns 0 once @inode has been added to the hash with I_NEW and
 * I_CREATING set.  Returns -EBUSY if a conflicting inode is itself marked
 * I_CREATING, or is still hashed after we waited on it.  If the conflicting
 * inode became unhashed while we waited, the search is repeated.
 */
int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        /* Inodes on their way out do not count as conflicts. */
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        /* Conflict found; leave the loop with old->i_lock held. */
                        break;
                }
                if (likely(!old)) {
                        /* No conflict: publish @inode while holding both locks. */
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                /* The other inode is being created by someone else as well. */
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                /* Pin the conflicting inode so we can wait on it unlocked. */
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                /* Still hashed after the wait: the number really is in use. */
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                /* It went away while we waited; drop it and retry. */
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

/*
 * insert_inode_locked4 - insert @inode into the hash via inode_insert5()
 *
 * Sets I_CREATING on @inode and calls inode_insert5() with no @set
 * callback.  Returns 0 if inode_insert5() returned @inode itself.  If it
 * returned a different inode, that inode is released with iput() and
 * -EBUSY is returned.
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *winner;

        inode->i_state |= I_CREATING;
        winner = inode_insert5(inode, hashval, test, NULL, data);
        if (winner == inode)
                return 0;

        iput(winner);
        return -EBUSY;
}
EXPORT_SYMBOL(insert_inode_locked4);


/*
 * generic_delete_inode - unconditionally return 1 for @inode
 *
 * NOTE(review): presumably meant to be used as a ->drop_inode()
 * implementation, where iput_final() treats a non-zero return as "evict
 * this inode now" -- confirm against the users of this symbol.
 */
int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 *
 * Entered with inode->i_lock held (iput() takes it via
 * atomic_dec_and_lock()); the lock is released on every path
 * before returning.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        unsigned long state;
        int drop;

        WARN_ON(inode->i_state & I_NEW);

        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        /*
         * Keep the inode cached: the fs did not ask for eviction, the
         * inode is not marked I_DONTCACHE and the superblock is active.
         */
        if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
                inode_add_lru(inode);
                spin_unlock(&inode->i_lock);
                return;
        }

        state = inode->i_state;
        if (!drop) {
                /*
                 * Not asked to drop, but not cacheable either: write the
                 * inode out first.  I_WILL_FREE is set while i_lock is
                 * released around write_inode_now(), then cleared again.
                 */
                WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
                spin_unlock(&inode->i_lock);

                write_inode_now(inode, 1);

                spin_lock(&inode->i_lock);
                state = inode->i_state;
                WARN_ON(state & I_NEW);
                state &= ~I_WILL_FREE;
        }

        /* Mark the inode as being freed and take it off the LRU. */
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 *        iput        - put an inode
 *        @inode: inode to put, may be NULL (in which case nothing is done)
 *
 *        Puts an inode, dropping its usage count. If the inode use count hits
 *        zero, the inode is then freed and may also be destroyed.
 *
 *        Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        /* Only the final reference drop takes i_lock and proceeds. */
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                /*
                 * A still-linked inode with I_DIRTY_TIME set: re-take the
                 * reference, mark the inode dirty, and retry the drop.
                 */
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                /* iput_final() is entered with i_lock held. */
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 *        bmap        - find a block number in a file
 *        @inode:  inode owning the block number being requested
 *        @block: pointer containing the block to find
 *
 *        Replaces the value in ``*block`` with the block number on the device
 *        that corresponds to the requested block number in the file.
 *        That is, asked for block 4 of inode 1 the function will replace the
 *        4 in ``*block`` with the disk block, relative to the disk start, that
 *        holds that block of the file.
 *
 *        Returns -EINVAL if the mapping's address space operations provide no
 *        ->bmap method, 0 otherwise. If mapping falls into a hole, returns 0
 *        and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
        if (inode->i_mapping->a_ops->bmap) {
                *block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
                                                       *block);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * not later than either the ctime or mtime or if at least a day has
 * passed since the last atime update.
 *
 * Returns 1 if atime should be updated, 0 if the update can be skipped.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
{
        /* Relatime is not in effect on this mount: always update. */
        if (!(mnt->mnt_flags & MNT_RELATIME))
                return 1;

        /* mtime or ctime is at least as recent as atime: update. */
        if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0 ||
            timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
                return 1;

        /* Otherwise update only if the stored atime is a day old or more. */
        return (long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60;
}

/*
 * generic_update_time - store new timestamps and mark the inode dirty
 *
 * Copies *@time into the atime/ctime/mtime fields selected by @flags
 * (S_ATIME, S_CTIME, S_MTIME) and, for S_VERSION, calls
 * inode_maybe_inc_iversion().  The inode is always marked I_DIRTY_TIME;
 * I_DIRTY_SYNC is added when a timestamp was requested on a superblock
 * without SB_LAZYTIME, or when inode_maybe_inc_iversion() returned true
 * and no timestamp was requested.  Always returns 0.
 */
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
        const int stamps = flags & (S_ATIME | S_CTIME | S_MTIME);
        int dirty_flags = I_DIRTY_TIME;
        bool need_sync = false;

        if (flags & S_ATIME)
                inode->i_atime = *time;
        if (flags & S_VERSION)
                need_sync = inode_maybe_inc_iversion(inode, false);
        if (flags & S_CTIME)
                inode->i_ctime = *time;
        if (flags & S_MTIME)
                inode->i_mtime = *time;

        if (stamps && !(inode->i_sb->s_flags & SB_LAZYTIME))
                need_sync = true;
        if (need_sync)
                dirty_flags |= I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty_flags);
        return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * This does the actual work of updating an inode's time or version.  The
 * inode's own ->update_time() method is used when it provides one,
 * generic_update_time() otherwise.  Must have had called mnt_want_write()
 * before calling this.
 */
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
        if (!inode->i_op->update_time)
                return generic_update_time(inode, time, flags);

        return inode->i_op->update_time(inode, time, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 *        touch_atime        -        update the access time
 *        @path: the &struct path to update
 *        @inode: inode to update
 *
 *        Update the accessed time on an inode and mark it for writeback.
 *        This function automatically handles read only file systems and media,
 *        as well as the "noatime" flag and inode specific "noatime" markers.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now;

        if (inode->i_flags & S_NOATIME)
                return false;

        /* Atime updates will likely cause i_uid and i_gid to be written
         * back improprely if their true value is unknown to the vfs.
         */
        if (HAS_UNMAPPED_ID(inode))
                return false;

        if (IS_NOATIME(inode))
                return false;
        if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
                return false;

        if (mnt->mnt_flags & MNT_NOATIME)
                return false;
        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
                return false;

        now = current_time(inode);

        if (!relatime_need_update(mnt, inode, now))
                return false;

        if (timespec64_equal(&inode->i_atime, &now))
                return false;

        return true;
}

void touch_atime(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct inode *inode = d_inode(path->dentry);
        struct timespec64 now;

        if (!atime_needs_update(path, inode))
                return;

        if (!sb_start_write_trylock(inode->i_sb))
                return;

        if (__mnt_want_write(mnt) != 0)
                goto skip_update;
        /*
         * File systems can error out when updating inodes if they need to
         * allocate new space to modify an inode (such is the case for
         * Btrfs), but since we touch atime while walking down the path we
         * really don't care if we failed to update the atime of the file,
         * so just ignore the return value.
         * We may also fail on filesystems that have the ability to make parts
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
        now = current_time(inode);
        inode_update_time(inode, &now, S_ATIME);
        __mnt_drop_write(mnt);
skip_update:
        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * Return mask of changes for notify_change() that need to be done as a
 * response to write or truncate. Return 0 if nothing has to be changed.
 * Negative value on error (change should be denied).
 *
 * The mask is whatever setattr_should_drop_suidgid() reports, with
 * ATTR_KILL_PRIV added when security_inode_need_killpriv() returns a
 * positive value.
 */
int dentry_needs_remove_privs(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int mask;
        int killpriv;

        if (IS_NOSEC(inode))
                return 0;

        mask = setattr_should_drop_suidgid(inode);

        killpriv = security_inode_need_killpriv(dentry);
        if (killpriv < 0)
                return killpriv;

        return killpriv ? (mask | ATTR_KILL_PRIV) : mask;
}

/*
 * Apply the ATTR_KILL_* bits in @kill to @dentry through notify_change(),
 * with ATTR_FORCE set.  Returns whatever notify_change() returns.
 */
static int __remove_privs(struct dentry *dentry, int kill)
{
        /* Only ia_valid is initialised here; the other fields are left unset. */
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(dentry, &newattrs, NULL);
}

/*
 * Remove special file priviledges (suid, capabilities) when file is written
 * to or truncated.
 */
int file_remove_privs(struct file *file)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int kill;
        int error = 0;

        /*
         * Fast path for nothing security related.
         * As well for non-regular files, e.g. blkdev inodes.
         * For example, blkdev_write_iter() might get here
         * trying to remove privs which it is not allowed to.
         */
        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        kill = dentry_needs_remove_privs(dentry);
        if (kill < 0)
                return kill;
        if (kill)
                error = __remove_privs(dentry, kill);
        if (!error)
                inode_has_no_xattr(inode);

        return error;
}
EXPORT_SYMBOL(file_remove_privs);

/**
 *        file_update_time        -        update mtime and ctime time
 *        @file: file accessed
 *
 *        Update the mtime and ctime members of an inode and mark the inode
 *        for writeback.  Note that this function is meant exclusively for
 *        usage in the file write path of filesystems, and filesystems may
 *        choose to explicitly ignore update via this function with the
 *        S_NOCMTIME inode flag, e.g. for network filesystem where these
 *        timestamps are handled by the server.  This can return an error for
 *        file systems who need to allocate space in order to update an inode.
 *
 *        Returns 0 without touching the inode when nothing needs updating or
 *        when write access to the mount cannot be obtained.
 */

int file_update_time(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct timespec64 now;
        int pending = 0;
        int ret;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        now = current_time(inode);

        if (!timespec64_equal(&inode->i_mtime, &now))
                pending |= S_MTIME;
        if (!timespec64_equal(&inode->i_ctime, &now))
                pending |= S_CTIME;
        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                pending |= S_VERSION;

        if (pending == 0)
                return 0;

        /* Finally allowed to write? Takes lock. */
        if (__mnt_want_write_file(file) != 0)
                return 0;

        ret = inode_update_time(inode, &now, pending);
        __mnt_drop_write_file(file);

        return ret;
}
EXPORT_SYMBOL(file_update_time);

/*
 * file_modified - strip privileges and update timestamps before a write
 *
 * Calls file_remove_privs() and, unless the file was opened with
 * FMODE_NOCMTIME, file_update_time().  Returns the first error met, or 0.
 *
 * Caller must hold the file's inode lock.
 */
int file_modified(struct file *file)
{
        /*
         * Clear the security bits first.
         * This keeps people from modifying setuid and setgid binaries.
         */
        int err = file_remove_privs(file);

        if (err)
                return err;

        return unlikely(file->f_mode & FMODE_NOCMTIME) ?
                        0 : file_update_time(file);
}
EXPORT_SYMBOL(file_modified);

/*
 * inode_needs_sync - report whether @inode is marked for synchronous updates
 *
 * Returns 1 if IS_SYNC() holds for the inode, or if it is a directory for
 * which IS_DIRSYNC() holds; 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
        return IS_SYNC(inode) ||
               (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode));
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Entered with both inode->i_lock and inode_hash_lock held.  Both are
 * dropped before sleeping; only inode_hash_lock is re-taken before
 * returning.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        /* Queue ourselves before dropping the locks, then sleep. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
        spin_lock(&inode_hash_lock);
}

/* Inode hash size requested with the "ihash_entries=" boot parameter. */
static __initdata unsigned long ihash_entries;

/*
 * Parse the "ihash_entries=" boot parameter into ihash_entries.
 * Returns 1 when a value string was supplied, 0 when @str is NULL.
 */
static int __init set_ihash_entries(char *str)
{
        if (str) {
                ihash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Early initialization of the inode hash table.
 *
 * If hashes are distributed across NUMA nodes (hashdist), nothing is done
 * here: hash allocation is deferred until vmalloc space is available, and
 * inode_init() performs it instead.
 */
void __init inode_init_early(void)
{
        if (!hashdist) {
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_EARLY | HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
        }
}

/*
 * Create the inode slab cache and, when the hash table was not already
 * set up by inode_init_early() (i.e. hashdist is set), allocate the inode
 * hash table.
 */
void __init inode_init(void)
{
        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache",
                                         sizeof(struct inode),
                                         0,
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                                         init_once);

        /* Hash may have been set up in inode_init_early */
        if (hashdist) {
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
        }
}

/*
 * init_special_inode - set up an inode for a device, FIFO or socket
 *
 * Stores @mode in the inode.  Character and block devices get their
 * default file operations and @rdev; FIFOs get pipefifo_fops; sockets are
 * left untouched.  Any other file type is reported with a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;

        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                inode->i_fop = S_ISCHR(mode) ? &def_chr_fops : &def_blk_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISFIFO(mode)) {
                inode->i_fop = &pipefifo_fops;
                return;
        }

        /* leave it no_open_fops */
        if (S_ISSOCK(mode))
                return;

        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                          " inode %s:%lu\n", mode, inode->i_sb->s_id,
                          inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @inode: New inode
 * @dir: Directory inode, may be NULL
 * @mode: mode of the new inode
 *
 * The owner is the caller's fsuid.  If @dir is given and has S_ISGID set,
 * the group is taken from @dir and a new directory additionally gets
 * S_ISGID in its mode; otherwise the group is the caller's fsgid.
 */
void inode_init_owner(struct inode *inode, const struct inode *dir,
                        umode_t mode)
{
        inode->i_uid = current_fsuid();

        if (!(dir && dir->i_mode & S_ISGID)) {
                inode->i_gid = current_fsgid();
        } else {
                inode->i_gid = dir->i_gid;

                /* Directories are special, and always inherit S_ISGID */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        }

        inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace with the
 * inode owner uid mapped, or owns the file.
 */
bool inode_owner_or_capable(const struct inode *inode)
{
        struct user_namespace *ns;

        /* The owner check comes first and needs no capability. */
        if (uid_eq(current_fsuid(), inode->i_uid))
                return true;

        ns = current_user_ns();
        return kuid_has_mapping(ns, inode->i_uid) &&
               ns_capable(ns, CAP_FOWNER);
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */

/*
 * Sleep uninterruptibly until inode->i_dio_count reads as zero.
 *
 * The task waits on the bit waitqueue associated with the __I_DIO_WAKEUP
 * bit of inode->i_state.  Who issues the wakeup is not visible in this
 * function.
 */
static void __inode_dio_wait(struct inode *inode)
{
        wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
        DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

        do {
                /*
                 * Queue ourselves before re-reading the counter, then only
                 * sleep if it is still non-zero.
                 */
                prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&inode->i_dio_count))
                        schedule();
        } while (atomic_read(&inode->i_dio_count));
        finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Blocks until every pending direct I/O request on @inode has completed,
 * so that a truncate or equivalent operation can proceed.  Returns
 * immediately when i_dio_count is already zero.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
        /* Fast path: nothing in flight. */
        if (!atomic_read(&inode->i_dio_count))
                return;

        __inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 * @inode: inode whose i_flags is updated
 * @flags: new values for the bits selected by @mask
 * @mask:  bits of i_flags that are being changed
 *
 * A one-time warning is issued if @flags contains bits outside @mask.
 * The update itself is delegated to set_mask_bits().
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today so we use cmpxchg() out of an abundance
 * of caution.
 * NOTE(review): no cmpxchg() loop is visible in this function; it is
 * presumably inside set_mask_bits() -- confirm.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
                     unsigned int mask)
{
        WARN_ON_ONCE(flags & ~mask);
        set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

/*
 * inode_nohighmem - set the gfp mask of @inode's mapping to GFP_USER
 * @inode: inode whose i_mapping is adjusted
 *
 * NOTE(review): going by the name, the intent is to keep the mapping's
 * page allocations out of highmem -- confirm against GFP_USER.
 */
void inode_nohighmem(struct inode *inode)
{
        mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Clamp the seconds of @t to the [s_time_min, s_time_max] range of the
 * superblock owning @inode (zeroing the nanoseconds when a limit is hit),
 * then round the nanoseconds down to the superblock's s_time_gran.
 * The granularity must not be 0 nor greater than a second (NSEC_PER_SEC,
 * or 10^9 ns); an invalid value triggers a warning and leaves the
 * nanoseconds as they are.
 *
 * Return: the adjusted timespec.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned int gran = sb->s_time_gran;

        t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
        if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
                t.tv_nsec = 0;

        /* The two common granularities need no division at all. */
        if (gran == 1)
                return t;

        if (gran == NSEC_PER_SEC) {
                t.tv_nsec = 0;
                return t;
        }

        /* Only 0 and values above one second are left as invalid. */
        if (gran == 0 || gran > NSEC_PER_SEC) {
                WARN(1, "invalid file time granularity: %u", gran);
                return t;
        }

        t.tv_nsec -= t.tv_nsec % gran;
        return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs.
 *
 * Note that inode and inode->sb cannot be NULL.
 * Otherwise, the function warns and returns time without truncation.
 */
struct timespec64 current_time(struct inode *inode)
{
        struct timespec64 now;

        ktime_get_coarse_real_ts64(&now);

        if (unlikely(!inode->i_sb)) {
                WARN(1, "current_time() called with uninitialized super_block in the inode");
                return now;
        }

        return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

/*
 * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
 * configurations.
 *
 * Returns -EPERM when the APPEND or IMMUTABLE flag would change and the
 * caller lacks CAP_LINUX_IMMUTABLE; otherwise returns whatever
 * fscrypt_prepare_setflags() returns.
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
                             unsigned int flags)
{
        /* Bits that differ between the old and the requested flags. */
        unsigned int changed = flags ^ oldflags;

        /*
         * The IMMUTABLE and APPEND_ONLY flags can only be changed by
         * the relevant capability.
         */
        if ((changed & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        return fscrypt_prepare_setflags(inode, oldflags, flags);
}
EXPORT_SYMBOL(vfs_ioc_setflags_prepare);

/*
 * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
 * configurations.
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
                             struct fsxattr *fa)
{
        /*
         * Can't modify an immutable/append-only file unless we have
         * appropriate permission.
         */
        if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
                        (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        /*
         * Project Quota ID state is only allowed to change from within the init
         * namespace. Enforce that restriction only if we are trying to change
         * the quota ID state. Everything else is allowed in user namespaces.
         */
        if (current_user_ns() != &init_user_ns) {
                if (old_fa->fsx_projid != fa->fsx_projid)
                        return -EINVAL;
                if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
                                FS_XFLAG_PROJINHERIT)
                        return -EINVAL;
        }

        /* Check extent size hints. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
                return -EINVAL;

        if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
                        !S_ISDIR(inode->i_mode))
                return -EINVAL;

        if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
            !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                return -EINVAL;

        /*
         * It is only valid to set the DAX flag on regular files and
         * directories on filesystems.
         */
        if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
            !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
                return -EINVAL;

        /* Extent size hints of zero turn off the flags. */
        if (fa->fsx_extsize == 0)
                fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
        if (fa->fsx_cowextsize == 0)
                fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

        return 0;
}
EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);

/**
 * inode_set_ctime_current - set the ctime to current_time
 * @inode: inode
 *
 * Obtain the current time via current_time(@inode) and store its seconds
 * and nanoseconds through inode_set_ctime().
 *
 * Return: the timestamp that was passed to inode_set_ctime().
 */
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
        return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);

/**
 * in_group_or_capable - check whether caller is CAP_FSETID privileged
 * @inode:        inode to check
 * @gid:        the new/current gid of @inode
 *
 * Check whether @gid is in the caller's group list or if the caller is
 * privileged with CAP_FSETID over @inode. This can be used to determine
 * whether the setgid bit can be kept or must be dropped.
 *
 * The capability check is only performed when the group check fails.
 *
 * Return: true if the caller is sufficiently privileged, false if not.
 */
bool in_group_or_capable(const struct inode *inode, kgid_t gid)
{
        return in_group_p(gid) ||
               capable_wrt_inode_uidgid(inode, CAP_FSETID);
}

/**
 * mode_strip_sgid - handle the sgid bit for non-directories
 * @dir: parent directory inode (may be NULL)
 * @mode: mode of the file to be created in @dir
 *
 * S_ISGID is stripped from @mode only when all of the following hold:
 * @mode has both S_ISGID and S_IXGRP raised, @mode is not a directory,
 * @dir is non-NULL and has S_ISGID raised, and the caller is neither in
 * the group of @dir nor CAP_FSETID-privileged over @dir.
 * In every other case @mode is returned unchanged.
 *
 * Return: the new mode to use for the file
 */
umode_t mode_strip_sgid(const struct inode *dir, umode_t mode)
{
        const umode_t sgid_exec = S_ISGID | S_IXGRP;

        /* Only setgid + group-executable files are of interest. */
        if ((mode & sgid_exec) != sgid_exec)
                return mode;

        if (S_ISDIR(mode))
                return mode;

        if (!dir || !(dir->i_mode & S_ISGID))
                return mode;

        if (!in_group_or_capable(dir, dir->i_gid))
                mode &= ~S_ISGID;

        return mode;
}
EXPORT_SYMBOL(mode_strip_sgid);






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Checksumming functions for IPv6
 *
 * Authors:        Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Borrows very liberally from tcp.c and ip.c, see those
 *                files for more names.
 */

/*
 *        Fixes:
 *
 *        Ralf Baechle                        :        generic ipv6 checksum
 *        <ralf@waldorf-gmbh.de>
 */

#ifndef _CHECKSUM_IPV6_H
#define _CHECKSUM_IPV6_H

#include <asm/types.h>
#include <asm/byteorder.h>
#include <net/ip.h>
#include <asm/checksum.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/ipv6.h>

/*
 * Prototype of csum_ipv6_magic(), declared here only when
 * _HAVE_ARCH_IPV6_CSUM is not defined.  Every checksum helper in this
 * header is built on it.
 * NOTE(review): presumably architectures defining _HAVE_ARCH_IPV6_CSUM
 * supply their own version -- confirm.
 */
#ifndef _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                        const struct in6_addr *daddr,
                        __u32 len, __u8 proto, __wsum csum);
#endif

static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
{
        return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                            &ipv6_hdr(skb)->daddr,
                                            skb->len, proto, 0));
}

static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto)
{
        const struct ipv6hdr *iph = skb_gro_network_header(skb);

        return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
                                            skb_gro_len(skb), proto, 0));
}

/*
 * tcp_v6_check - csum_ipv6_magic() with the protocol fixed to IPPROTO_TCP
 * @len:   length passed through to csum_ipv6_magic()
 * @saddr: IPv6 source address
 * @daddr: IPv6 destination address
 * @base:  initial checksum value passed through to csum_ipv6_magic()
 */
static __inline__ __sum16 tcp_v6_check(int len,
                                   const struct in6_addr *saddr,
                                   const struct in6_addr *daddr,
                                   __wsum base)
{
        return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

static inline void __tcp_v6_send_check(struct sk_buff *skb,
                                       const struct in6_addr *saddr,
                                       const struct in6_addr *daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v6_check(skb->len, saddr, daddr,
                                         csum_partial(th, th->doff << 2,
                                                      skb->csum));
        }
}

/*
 * Prepare @skb's headers for GSO: zero the IPv6 payload length and store
 * the complemented tcp_v6_check() value computed with length 0 and base 0
 * in the TCP checksum field.
 */
static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb)
{
        struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct tcphdr *tcph = tcp_hdr(skb);

        ip6h->payload_len = 0;
        tcph->check = ~tcp_v6_check(0, &ip6h->saddr, &ip6h->daddr, 0);
}

/*
 * udp_v6_check - csum_ipv6_magic() with the protocol fixed to IPPROTO_UDP
 * @len:   length passed through to csum_ipv6_magic()
 * @saddr: IPv6 source address
 * @daddr: IPv6 destination address
 * @base:  initial checksum value passed through to csum_ipv6_magic()
 */
static inline __sum16 udp_v6_check(int len,
                                   const struct in6_addr *saddr,
                                   const struct in6_addr *daddr,
                                   __wsum base)
{
        return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

/*
 * UDP-over-IPv6 checksum helpers.  Only the prototypes are visible in this
 * header; the implementations live elsewhere.
 */
void udp6_set_csum(bool nocheck, struct sk_buff *skb,
                   const struct in6_addr *saddr,
                   const struct in6_addr *daddr, int len);

int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
#endif






































    1 








    1 



    1 












































    1 



    1 








    1 





    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/bitops.h>

#include <asm/word-at-a-time.h>

/*
 * Do a strnlen, return length of string *with* final '\0'.
 * 'count' is the user-supplied count, while 'max' is the
 * address space maximum.
 *
 * Return 0 for exceptions (which includes hitting the address
 * space maximum), or 'count+1' if hitting the user-supplied
 * maximum count.
 *
 * NOTE! We can sometimes overshoot the user-supplied maximum
 * if it fits in a aligned 'long'. The caller needs to check
 * the return value against "> max".
 *
 * The string is read one aligned 'unsigned long' at a time with
 * unsafe_get_user(); a faulting read jumps to the 'efault' label,
 * which returns 0.
 */
static inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long align, res = 0;
        unsigned long c;

        /*
         * Do everything aligned. But that means that we
         * need to also expand the maximum..
         */
        align = (sizeof(unsigned long) - 1) & (unsigned long)src;
        src -= align;
        max += align;

        unsafe_get_user(c, (unsigned long __user *)src, efault);
        /*
         * NOTE(review): presumably marks the 'align' bytes that precede the
         * real start of the string as non-zero so they cannot be taken for
         * the terminator -- confirm against aligned_byte_mask().
         */
        c |= aligned_byte_mask(align);

        for (;;) {
                unsigned long data;
                if (has_zero(c, &data, &constants)) {
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        /*
                         * Bytes consumed in earlier words, plus the result
                         * of find_zero() in this word, plus one for the NUL,
                         * minus the padding added by aligning 'src' down.
                         */
                        return res + find_zero(data) + 1 - align;
                }
                res += sizeof(unsigned long);
                /* We already handled 'unsigned long' bytes. Did we do it all ? */
                if (unlikely(max <= sizeof(unsigned long)))
                        break;
                max -= sizeof(unsigned long);
                unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
        }
        /* Undo the alignment adjustment so 'res' counts from the real start. */
        res -= align;

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, return the marker for "too long".
         */
        if (res >= count)
                return count+1;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's 0.
         */
efault:
        return 0;
}

/**
 * strnlen_user - Get the size of a user string INCLUDING final NUL.
 * @str: The string to measure.
 * @count: Maximum count (including NUL character)
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Get the size of a NUL-terminated string in user space.
 *
 * Return: the size of the string INCLUDING the terminating NUL.
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * NOTE! You should basically never use this function. There is
 * almost never any valid case for using the length of a user space
 * string, since the string can be changed at any time by other
 * threads. Use "strncpy_from_user()" instead to get a stable copy
 * of the string.
 */
long strnlen_user(const char __user *str, long count)
{
        unsigned long limit, start, span;
        long len;

        if (unlikely(count <= 0))
                return 0;

        limit = user_addr_max();
        start = (unsigned long)untagged_addr(str);
        if (unlikely(start >= limit))
                return 0;

        /*
         * Bytes available up to the address-space limit, truncated to the
         * user-specified limit so the scan loop has a single bound to check.
         */
        span = limit - start;
        if (span > count)
                span = count;

        if (!user_read_access_begin(str, span))
                return 0;

        len = do_strnlen_user(str, count, span);
        user_read_access_end();
        return len;
}
EXPORT_SYMBOL(strnlen_user);







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/* SPDX-License-Identifier: GPL-2.0 */
/**
 * lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
 *
 */
#ifndef MINMAX_H
#define MINMAX_H

#include <linux/types.h>

/*
 * A single data point for our parameterized min-max tracker: a measured
 * value together with the time at which it was taken.
 */
struct minmax_sample {
        u32        t;        /* time measurement was taken */
        u32        v;        /* value measured */
};

/*
 * State for the parameterized min-max tracker: three samples.
 * s[0].v is the value reported by minmax_get(); minmax_reset() sets all
 * three entries to the same sample.
 */
struct minmax {
        struct minmax_sample s[3];
};

/* Return the value of the first tracked sample, m->s[0].v. */
static inline u32 minmax_get(const struct minmax *m)
{
        return m->s[0].v;
}

/*
 * Overwrite all three tracked samples with the measurement @meas taken at
 * time @t, and return the value now stored in the first sample.
 */
static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas)
{
        const struct minmax_sample sample = { .t = t, .v = meas };

        m->s[0] = sample;
        m->s[1] = sample;
        m->s[2] = sample;

        return m->s[0].v;
}

/*
 * Windowed running max/min updates.  Only the prototypes are visible in this
 * header.
 * NOTE(review): presumably @win is the window length, @t the time of the new
 * measurement @meas, and the return value the updated max/min -- confirm
 * against the implementation.
 */
u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas);
u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas);

#endif






























   14 

   14 
   14 








   14 

   14 
   14 



   14 













   14 
   14 


   14 








   14 


   14 


   14 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// SPDX-License-Identifier: GPL-2.0
/*
 * preemptoff and irqoff tracepoints
 *
 * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org>
 */

#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
#include "trace.h"

#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>

#ifdef CONFIG_TRACE_IRQFLAGS
/*
 * Per-cpu variable to prevent redundant calls when IRQs already off.
 * The trace_hardirqs_off*() hooks set it to 1 and emit their events only
 * when it was 0; the trace_hardirqs_on*() hooks emit their events only when
 * it is non-zero and then clear it.
 */
static DEFINE_PER_CPU(int, tracing_irq_cpu);

/*
 * Variant of trace_hardirqs_on() that leaves lockdep alone.  Low level
 * entry code uses it where the ordering vs. RCU is important: there lockdep
 * follows a staged approach that splits its hardirq tracking into an RCU on
 * and an RCU off section.
 *
 * Nothing is done unless this CPU's tracing_irq_cpu flag is set.
 */
void trace_hardirqs_on_prepare(void)
{
        if (!this_cpu_read(tracing_irq_cpu))
                return;

        /* The tracepoint is skipped in NMI context; the tracer hook is not. */
        if (!in_nmi())
                trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
        tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
        this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_prepare);
NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);

/*
 * Hook for hard interrupts being enabled.
 *
 * If this CPU's tracing_irq_cpu flag is set: emit the rcuidle irq_enable
 * tracepoint (skipped in NMI context), call tracer_hardirqs_on() and clear
 * the flag.  The two lockdep hardirqs-on calls are then made
 * unconditionally, using the caller's address.
 */
void trace_hardirqs_on(void)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

/*
 * Variant of trace_hardirqs_off() that leaves lockdep alone.  Low level
 * entry code uses it where the ordering vs. RCU is important: there lockdep
 * follows a staged approach that splits its hardirq tracking into an RCU on
 * and an RCU off section.
 *
 * Nothing is done when this CPU's tracing_irq_cpu flag is already set.
 */
void trace_hardirqs_off_finish(void)
{
        if (this_cpu_read(tracing_irq_cpu))
                return;

        this_cpu_write(tracing_irq_cpu, 1);
        tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
        /* The tracepoint is skipped in NMI context; the tracer hook is not. */
        if (!in_nmi())
                trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL(trace_hardirqs_off_finish);
NOKPROBE_SYMBOL(trace_hardirqs_off_finish);

/*
 * Hook for hard interrupts being disabled.
 *
 * lockdep is informed first, unconditionally.  Then, if this CPU's
 * tracing_irq_cpu flag is clear, the flag is set, tracer_hardirqs_off() is
 * called and the rcuidle irq_disable tracepoint is emitted (skipped in NMI
 * context).
 */
void trace_hardirqs_off(void)
{
        lockdep_hardirqs_off(CALLER_ADDR0);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);

/*
 * Same sequence as trace_hardirqs_on(), except that the explicitly supplied
 * @caller_addr is used in place of CALLER_ADDR1 for the trace events and in
 * place of CALLER_ADDR0 for the lockdep calls.
 */
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
                tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(caller_addr);
        lockdep_hardirqs_on(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
NOKPROBE_SYMBOL(trace_hardirqs_on_caller);

/*
 * Same sequence as trace_hardirqs_off(), except that the explicitly supplied
 * @caller_addr is used in place of CALLER_ADDR1 for the trace events and in
 * place of CALLER_ADDR0 for the lockdep call.
 */
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
        lockdep_hardirqs_off(caller_addr);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */

#ifdef CONFIG_TRACE_PREEMPT_TOGGLE

/*
 * Preemption-enabled hook: calls trace_preempt_enable_rcuidle() unless in
 * NMI context, then always calls tracer_preempt_on().  @a0 and @a1 are
 * passed through unchanged to both.
 */
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_enable_rcuidle(a0, a1);
        tracer_preempt_on(a0, a1);
}

/*
 * Preemption-disabled hook: calls trace_preempt_disable_rcuidle() unless in
 * NMI context, then always calls tracer_preempt_off().  @a0 and @a1 are
 * passed through unchanged to both.
 */
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_disable_rcuidle(a0, a1);
        tracer_preempt_off(a0, a1);
}
#endif









































































































    1 





    1 


    1 
    1 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * bvec iterator
 *
 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
 */
#ifndef __LINUX_BVEC_ITER_H
#define __LINUX_BVEC_ITER_H

#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/limits.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/types.h>

struct page;

/**
 * struct bio_vec - a contiguous range of physical memory addresses
 * @bv_page:   First page associated with the address range.
 * @bv_len:    Number of bytes in the address range.
 * @bv_offset: Start of the address range relative to the start of @bv_page.
 *
 * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
 *
 *   nth_page(@bv_page, n) == @bv_page + n
 *
 * This holds because page_is_mergeable() checks the above property.
 *
 * The single-page helpers in this header cope with a @bv_offset of
 * PAGE_SIZE or more: they split it into a page index (offset / PAGE_SIZE)
 * and an in-page offset (offset % PAGE_SIZE).
 */
struct bio_vec {
        struct page        *bv_page;
        unsigned int        bv_len;
        unsigned int        bv_offset;
};

/*
 * Iterator over an array of struct bio_vec.  bvec_iter_advance() moves it
 * forward by a byte count, updating bi_size, bi_idx and bi_bvec_done.
 */
struct bvec_iter {
        sector_t                bi_sector;        /* device address in 512 byte
                                                   sectors */
        unsigned int                bi_size;        /* residual I/O count */

        unsigned int                bi_idx;                /* current index into bvl_vec */

        unsigned int            bi_bvec_done;        /* number of bytes completed in
                                                   current bvec */
};

/*
 * State used by bvec_advance() to hand out one bvec page-sized piece at a
 * time.
 */
struct bvec_iter_all {
        struct bio_vec        bv;        /* the piece most recently produced */
        int                idx;        /* incremented each time a bvec is finished */
        unsigned        done;        /* bytes of the current bvec already produced;
                                   reset to 0 when the bvec is finished */
};

/*
 * various member access, note that bio_data should of course not be used
 * on highmem page vectors
 *
 * All of these are macros that expand their 'bvec' and 'iter' arguments
 * more than once, so the arguments must be free of side effects.
 */

/* Pointer to the bio_vec the iterator currently points at. */
#define __bvec_iter_bvec(bvec, iter)        (&(bvec)[(iter).bi_idx])

/* multi-page (mp_bvec) helpers */

/* First page of the current bvec. */
#define mp_bvec_iter_page(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_page)

/*
 * Bytes left in the current bvec (bv_len - bi_bvec_done), capped by the
 * iterator's remaining bi_size.
 */
#define mp_bvec_iter_len(bvec, iter)                                \
        min((iter).bi_size,                                        \
            __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)

/* Offset of the current position relative to the start of bv_page. */
#define mp_bvec_iter_offset(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)

/* Index of the page, counted from bv_page, containing the current position. */
#define mp_bvec_iter_page_idx(bvec, iter)                        \
        (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)

/* Build a bio_vec describing the rest of the current (multi-page) bvec. */
#define mp_bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = mp_bvec_iter_page((bvec), (iter)),        \
        .bv_len                = mp_bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = mp_bvec_iter_offset((bvec), (iter)),        \
})

/*
 * For building single-page bvec in flight
 *
 * These macros expand their arguments more than once; pass expressions
 * without side effects.
 */

/* Offset of the current position within its page. */
 #define bvec_iter_offset(bvec, iter)                                \
        (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)

/*
 * Length of the current piece: the multi-page length, capped so that it
 * does not run past the end of the current page.
 */
#define bvec_iter_len(bvec, iter)                                \
        min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),                \
              PAGE_SIZE - bvec_iter_offset((bvec), (iter)))

/* The page containing the current position: bv_page plus the page index. */
#define bvec_iter_page(bvec, iter)                                \
        (mp_bvec_iter_page((bvec), (iter)) +                        \
         mp_bvec_iter_page_idx((bvec), (iter)))

/* Build a bio_vec for the current piece, confined to a single page. */
#define bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = bvec_iter_page((bvec), (iter)),        \
        .bv_len                = bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = bvec_iter_offset((bvec), (iter)),        \
})

/*
 * bvec_iter_advance - move @iter forward by @bytes over the bvec array @bv
 *
 * Returns false, after warning once and zeroing iter->bi_size, when @bytes
 * exceeds the remaining iter->bi_size.  Otherwise bi_size is reduced by
 * @bytes, bi_idx is moved past every bvec that is fully consumed, bi_bvec_done
 * is set to the bytes consumed in the bvec the iterator ends up in, and true
 * is returned.
 */
static inline bool bvec_iter_advance(const struct bio_vec *bv,
                struct bvec_iter *iter, unsigned bytes)
{
        unsigned int cur = iter->bi_idx;
        unsigned int pending;

        if (WARN_ONCE(bytes > iter->bi_size,
                     "Attempted to advance past end of bvec iter\n")) {
                iter->bi_size = 0;
                return false;
        }

        iter->bi_size -= bytes;

        /* Measure from the start of the current bvec, not from mid-bvec. */
        pending = bytes + iter->bi_bvec_done;

        for (; pending && pending >= bv[cur].bv_len; cur++)
                pending -= bv[cur].bv_len;

        iter->bi_idx = cur;
        iter->bi_bvec_done = pending;
        return true;
}

/*
 * Step @iter to the start of the next bvec without touching bi_size.
 * for_each_bvec() uses this when the current piece has zero length.
 */
static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter)
{
        iter->bi_idx++;
        iter->bi_bvec_done = 0;
}

/*
 * for_each_bvec - walk a bvec array in single-page pieces
 * @bvl:     struct bio_vec lvalue that receives each piece
 * @bio_vec: the bvec array
 * @iter:    struct bvec_iter lvalue used as the cursor
 * @start:   initial iterator value, copied into @iter
 *
 * The loop runs while (iter).bi_size is non-zero.  After each pass the
 * iterator is advanced by (bvl).bv_len, or, when that length is zero, moved
 * to the next bvec with bvec_iter_skip_zero_bvec().
 */
#define for_each_bvec(bvl, bio_vec, iter, start)                        \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);        \
             (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter),        \
                     (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter)))

/*
 * for iterating one bio from start to end
 *
 * A struct bvec_iter compound literal positioned at index 0 with nothing
 * done yet and bi_size set to UINT_MAX.
 */
#define BVEC_ITER_ALL_INIT (struct bvec_iter)                                \
{                                                                        \
        .bi_sector        = 0,                                                \
        .bi_size        = UINT_MAX,                                        \
        .bi_idx                = 0,                                                \
        .bi_bvec_done        = 0,                                                \
}

/*
 * Reset @iter_all to the beginning (idx and done both 0) and return a
 * pointer to its embedded bio_vec.
 */
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
        iter_all->idx = 0;
        iter_all->done = 0;

        return &iter_all->bv;
}

/*
 * bvec_advance - produce the next single-page piece of @bvec
 *
 * The piece is written to iter_all->bv.  On the first call for a bvec
 * (iter_all->done == 0) it starts at the page and in-page offset derived
 * from bvec->bv_offset; on later calls it starts at offset 0 of the
 * following page.  Its length is limited to the rest of the page and the
 * rest of @bvec.  Once @bvec is fully consumed, iter_all->idx is bumped and
 * iter_all->done is reset to 0.
 */
static inline void bvec_advance(const struct bio_vec *bvec,
                                struct bvec_iter_all *iter_all)
{
        struct bio_vec *piece = &iter_all->bv;

        if (!iter_all->done) {
                piece->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT);
                piece->bv_offset = bvec->bv_offset & ~PAGE_MASK;
        } else {
                piece->bv_page++;
                piece->bv_offset = 0;
        }

        piece->bv_len = min_t(unsigned int, PAGE_SIZE - piece->bv_offset,
                              bvec->bv_len - iter_all->done);
        iter_all->done += piece->bv_len;

        if (iter_all->done != bvec->bv_len)
                return;

        /* This bvec is exhausted: move on to the next one. */
        iter_all->idx++;
        iter_all->done = 0;
}

#endif /* __LINUX_BVEC_ITER_H */




















































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 






    1 


    1 










    1 

    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







    1 
    1 


































    1 

























    1 





    1 












    1 


    1 




























    1 





    1 




    1 





































    1 















    1 





    1 










































































































































































































































































































































































































































































































































































































































    1 




    1 
    1 
    1 
    1 
    1 
    1 
    1 


    1 

















    1 
    1 

    1 
    1 





    1 














































































































































































































































































































































































































































































































































































































































































    1 




























































































































































































































































































































    1 




















    1 




    1 







    1 














    1 



    1 


    1 








    1 





    1 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/base.c
 *
 *  Copyright (C) 1991, 1992 Linus Torvalds
 *
 *  proc base directory handling functions
 *
 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
 *  Instead of using magical inumbers to determine the kind of object
 *  we allocate and fill in-core inodes upon lookup. They don't even
 *  go into icache. We cache the reference to task_struct upon lookup too.
 *  Eventually it should become a filesystem in its own. We don't use the
 *  rest of procfs anymore.
 *
 *
 *  Changelog:
 *  17-Jan-2005
 *  Allan Bezerra
 *  Bruna Moreira <bruna.moreira@indt.org.br>
 *  Edjard Mota <edjard.mota@indt.org.br>
 *  Ilias Biris <ilias.biris@indt.org.br>
 *  Mauricio Lin <mauricio.lin@indt.org.br>
 *
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 *  A new process specific entry (smaps) included in /proc. It shows the
 *  size of rss for each memory area. The maps entry lacks information
 *  about physical memory size (rss) for each mapped file, i.e.,
 *  rss information for executables and library files.
 *  This additional information is useful for any tools that need to know
 *  about physical memory consumption for a process specific library.
 *
 *  Changelog:
 *  21-Feb-2005
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *  Pud inclusion in the page table walking.
 *
 *  ChangeLog:
 *  10-Mar-2005
 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table, as suggested by Hugh Dickins.
 *
 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
 *  Smaps information related to shared, private, clean and dirty pages.
 *
 *  Paul Mundt <paul.mundt@nokia.com>:
 *  Overall revision about smaps.
 */

#include <linux/uaccess.h>

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"

#include "../../lib/kstrtox.h"

/* NOTE:
 *        Implementing inode permission operations in /proc is almost
 *        certainly an error.  Permission checks need to happen during
 *        each system call not at open time.  The reason is that most of
 *        what we wish to check for permissions in /proc varies at runtime.
 *
 *        The classic example of a problem is opening file descriptors
 *        in /proc for a task before it execs a suid executable.
 */

/*
 * NOTE(review): presumably cached link counts for the per-thread (tid)
 * and per-process (tgid) directories, computed once during init (see
 * pid_entry_nlink()) -- the code that sets and reads them is outside
 * this excerpt, so confirm.
 */
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

/*
 * Policy values for proc_mem_force_override.  They correspond to the
 * "always", "ptrace" and "never" strings in proc_mem_force_table.
 *
 * NOTE(review): how each mode is enforced is decided by code outside this
 * excerpt -- presumably it governs forced access through /proc/<pid>/mem;
 * confirm against the users of proc_mem_force_override.
 */
enum proc_mem_force {
        PROC_MEM_FORCE_ALWAYS = 0,
        PROC_MEM_FORCE_PTRACE = 1,
        PROC_MEM_FORCE_NEVER  = 2,
};

/*
 * Currently selected proc_mem force policy.
 *
 * The build-time default comes from Kconfig: CONFIG_PROC_MEM_NO_FORCE
 * selects PROC_MEM_FORCE_NEVER, otherwise CONFIG_PROC_MEM_FORCE_PTRACE
 * selects PROC_MEM_FORCE_PTRACE, otherwise PROC_MEM_FORCE_ALWAYS is used.
 * The "proc_mem.force_override" early parameter (see
 * early_proc_mem_force_override()) may replace this default at boot.
 */
static enum proc_mem_force proc_mem_force_override __ro_after_init =
        IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
        IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
        PROC_MEM_FORCE_ALWAYS;

/*
 * Name -> enum proc_mem_force mapping handed to lookup_constant() when
 * parsing the "proc_mem.force_override" early parameter.
 * The trailing empty entry presumably acts as the end-of-table marker
 * expected by lookup_constant() -- confirm against its definition.
 */
static const struct constant_table proc_mem_force_table[] __initconst = {
        { "always", PROC_MEM_FORCE_ALWAYS },
        { "ptrace", PROC_MEM_FORCE_PTRACE },
        { "never", PROC_MEM_FORCE_NEVER },
        { }
};

/*
 * Early-parameter handler for "proc_mem.force_override".
 *
 * @buf: the value given on the command line, or NULL when none was given.
 *
 * Returns -EINVAL when @buf is NULL, 0 otherwise.
 */
static int __init early_proc_mem_force_override(char *buf)
{
        /*
         * The current setting doubles as the lookup fallback, so an
         * unrecognised value preserves the initial Kconfig choice.
         */
        enum proc_mem_force fallback = proc_mem_force_override;

        if (!buf)
                return -EINVAL;

        proc_mem_force_override = lookup_constant(proc_mem_force_table,
                                                  buf, fallback);

        return 0;
}
early_param("proc_mem.force_override", early_proc_mem_force_override);

/*
 * One row of a pid_entry table.  Instances are built with the NOD()
 * initializer macro and its DIR/LNK/REG/ONE/ATTR wrappers.
 */
struct pid_entry {
        /* Entry name (a string literal when built through NOD()). */
        const char *name;
        /* Length of @name without the terminating NUL (sizeof(NAME) - 1). */
        unsigned int len;
        /* File type bits (S_IFDIR/S_IFLNK/S_IFREG) or'ed with permissions. */
        umode_t mode;
        /* Inode operations for the entry; NULL for REG/ONE/ATTR entries. */
        const struct inode_operations *iop;
        /* File operations for the entry; NULL for LNK entries. */
        const struct file_operations *fop;
        /* Per-entry callback or data: .proc_get_link, .proc_show or .lsm. */
        union proc_op op;
};

/*
 * Generic struct pid_entry initializer.  NAME must be a string literal
 * (or array) because its length is taken with sizeof(NAME) - 1, i.e.
 * excluding the terminating NUL.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {                        \
        .name = (NAME),                                        \
        .len  = sizeof(NAME) - 1,                        \
        .mode = MODE,                                        \
        .iop  = IOP,                                        \
        .fop  = FOP,                                        \
        .op   = OP,                                        \
}

/*
 * DIR: directory entry (S_IFDIR | MODE).  @iops and @fops are objects,
 * not pointers -- the macro takes their address.  No proc_op payload.
 */
#define DIR(NAME, MODE, iops, fops)        \
        NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/*
 * LNK: symlink entry with mode S_IFLNK | S_IRWXUGO, using
 * proc_pid_link_inode_operations and no file operations; @get_link is
 * stored in op.proc_get_link.
 */
#define LNK(NAME, get_link)                                        \
        NOD(NAME, (S_IFLNK|S_IRWXUGO),                                \
                &proc_pid_link_inode_operations, NULL,                \
                { .proc_get_link = get_link } )
/*
 * REG: regular file entry (S_IFREG | MODE) with caller-supplied file
 * operations (@fops is an object; its address is taken).
 */
#define REG(NAME, MODE, fops)                                \
        NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/*
 * ONE: regular file entry served by proc_single_file_operations; @show
 * is stored in op.proc_show.
 */
#define ONE(NAME, MODE, show)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_single_file_operations,        \
                { .proc_show = show } )
/*
 * ATTR: regular file entry served by proc_pid_attr_operations; @LSM is
 * stored in op.lsm.
 */
#define ATTR(LSM, NAME, MODE)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_pid_attr_operations,        \
                { .lsm = LSM })

/*
 * Compute a hardlink count for a pid_entry table: a base of 2 plus one
 * for every one of the @n entries whose mode marks it as a directory.
 *
 * NOTE(review): the base of 2 presumably stands for the "." and ".."
 * links -- confirm against the callers.
 */
static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
        unsigned int n)
{
        unsigned int nlink = 2;

        /* Visiting order does not matter; walk the table back to front. */
        while (n--) {
                if (S_ISDIR(entries[n].mode))
                        nlink++;
        }

        return nlink;
}

/*
 * Look up the filesystem root of @task and store it in @root.
 *
 * task->fs is checked and used while holding task_lock(@task).
 *
 * Returns 0 when @root was filled in through get_fs_root(), or -ENOENT
 * when the task has no fs_struct (task->fs is NULL).
 */
static int get_task_root(struct task_struct *task, struct path *root)
{
        int err = 0;

        task_lock(task);
        if (task->fs)
                get_fs_root(task->fs, root);
        else
                err = -ENOENT;
        task_unlock(task);

        return err;
}

/*
 * Resolve the current working directory of the task behind @dentry into
 * @path.
 *
 * The task is obtained with get_proc_task() and released with
 * put_task_struct(); task->fs is examined under task_lock().
 *
 * Returns 0 on success, -ENOENT when no task could be obtained or the
 * task has no fs_struct.
 */
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *task = get_proc_task(d_inode(dentry));
        int err = -ENOENT;

        if (!task)
                return err;

        task_lock(task);
        if (task->fs) {
                get_fs_pwd(task->fs, path);
                err = 0;
        }
        task_unlock(task);
        put_task_struct(task);

        return err;
}

/*
 * Resolve the root directory of the task behind @dentry into @path.
 *
 * Returns the result of get_task_root(), or -ENOENT when no task could
 * be obtained from the dentry's inode.
 */
static int proc_root_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *task = get_proc_task(d_inode(dentry));
        int err;

        if (!task)
                return -ENOENT;

        err = get_task_root(task, path);
        put_task_struct(task);

        return err;
}

/*
 * setproctitle() case: the title is read straight from user space at
 * @arg_start and is capped at one page.
 *
 * @mm:        address space to read the title from
 * @buf:       user buffer receiving the data
 * @count:     maximum number of bytes to copy
 * @pos:       offset into the title at which to start copying
 * @arg_start: user address where the title begins
 *
 * Returns the number of bytes copied to @buf, 0 when @pos lies at or
 * beyond the available data (or nothing could be read), -ENOMEM when the
 * bounce page cannot be allocated, or -EFAULT when no byte at all could
 * be copied to @buf.
 */
static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
                                size_t count, unsigned long pos,
                                unsigned long arg_start)
{
        char *kbuf;
        int result = 0;
        int nread, len;

        if (pos >= PAGE_SIZE)
                return 0;

        kbuf = (char *)__get_free_page(GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        nread = access_remote_vm(mm, arg_start, kbuf, PAGE_SIZE, FOLL_ANON);
        if (nread <= 0)
                goto out_free;

        len = strnlen(kbuf, nread);

        /* Include the NUL character if it was found */
        if (len < nread)
                len++;

        /* Nothing to hand out when @pos is at or past the end. */
        if (len <= pos)
                goto out_free;

        len -= pos;
        if (len > count)
                len = count;

        /* copy_to_user() reports the bytes it could NOT copy. */
        len -= copy_to_user(buf, kbuf + pos, len);
        result = len ? len : -EFAULT;

out_free:
        free_page((unsigned long)kbuf);
        return result;
}

/*
 * Copy up to @count bytes of the command line of @mm to the user buffer
 * @buf, starting at file offset *@ppos.
 *
 * *@ppos is only read here; this function never writes it back.
 *
 * Returns the number of bytes copied, 0 when there is nothing (left) to
 * read, -ENOMEM if no bounce page could be allocated, or -EFAULT if the
 * very first copy to @buf failed completely.
 */
static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
                              size_t count, loff_t *ppos)
{
        unsigned long arg_start, arg_end, env_start, env_end;
        unsigned long pos, len;
        char *page, c;

        /* Check if process spawned far enough to have cmdline. */
        if (!mm->env_end)
                return 0;

        /* Take a consistent snapshot of the arg/env boundaries */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        if (arg_start >= arg_end)
                return 0;

        /*
         * We allow setproctitle() to overwrite the argument
         * strings, and overflow past the original end. But
         * only when it overflows into the environment area.
         */
        if (env_start != arg_end || env_end < env_start)
                env_start = env_end = arg_end;
        len = env_end - arg_start;

        /* We're not going to care if "*ppos" has high bits set */
        pos = *ppos;
        if (pos >= len)
                return 0;
        if (count > len - pos)
                count = len - pos;
        if (!count)
                return 0;

        /*
         * Magical special case: if the argv[] end byte is not
         * zero, the user has overwritten it with setproctitle(3).
         *
         * Possible future enhancement: do this only once when
         * pos is 0, and set a flag in the 'struct file'.
         */
        if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
                return get_mm_proctitle(mm, buf, count, pos, arg_start);

        /*
         * For the non-setproctitle() case we limit things strictly
         * to the [arg_start, arg_end[ range.
         */
        /* From here on "pos" is an address in @mm, not a file offset */
        pos += arg_start;
        /* "pos < arg_start" catches wraparound of the addition above */
        if (pos < arg_start || pos >= arg_end)
                return 0;
        if (count > arg_end - pos)
                count = arg_end - pos;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        /* "len" is reused as the running count of bytes handed to @buf */
        len = 0;
        while (count) {
                int got;
                size_t size = min_t(size_t, PAGE_SIZE, count);

                got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
                if (got <= 0)
                        break;
                /* copy_to_user() returns the number of bytes NOT copied */
                got -= copy_to_user(buf, page, got);
                if (unlikely(!got)) {
                        /* Report -EFAULT only if nothing was copied so far */
                        if (!len)
                                len = -EFAULT;
                        break;
                }
                pos += got;
                buf += got;
                len += got;
                count -= got;
        }

        free_page((unsigned long)page);
        return len;
}

/*
 * Read the command line of @tsk through its mm.
 *
 * Takes a temporary reference on the task's mm with get_task_mm() and
 * drops it again with mmput().  Returns 0 if the task has no mm,
 * otherwise the result of get_mm_cmdline().
 */
static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
                                size_t count, loff_t *pos)
{
        struct mm_struct *mm = get_task_mm(tsk);
        ssize_t nread = 0;

        if (mm) {
                nread = get_mm_cmdline(mm, buf, count, pos);
                mmput(mm);
        }
        return nread;
}

/*
 * ->read handler: copy the command line of the task behind @file to @buf
 * and advance *@pos by the number of bytes returned.
 *
 * Returns the byte count (or error) from get_task_cmdline(), or -ESRCH
 * if the task can no longer be looked up.
 */
static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
                                     size_t count, loff_t *pos)
{
        struct task_struct *task;
        ssize_t nread;

        BUG_ON(*pos < 0);

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        nread = get_task_cmdline(task, buf, count, pos);
        put_task_struct(task);

        /* Only successful, non-empty reads move the file position */
        if (nread <= 0)
                return nread;

        *pos += nread;
        return nread;
}

/* File operations backed by proc_pid_cmdline_read(); read and seek only. */
static const struct file_operations proc_pid_cmdline_ops = {
        .read        = proc_pid_cmdline_read,
        .llseek        = generic_file_llseek,
};

#ifdef CONFIG_KALLSYMS
/*
 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 *
 * Prints the symbol name resolved for get_wchan(@task).  Prints "0"
 * instead when the reader fails the PTRACE_MODE_READ_FSCREDS check, when
 * get_wchan() returns 0, or when the address cannot be resolved to a
 * symbol.  Always returns 0.
 */
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        char symname[KSYM_NAME_LEN];
        unsigned long wchan;

        if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                wchan = get_wchan(task);
                if (wchan && lookup_symbol_name(wchan, symname) == 0) {
                        seq_puts(m, symname);
                        return 0;
                }
        }

        seq_putc(m, '0');
        return 0;
}
#endif /* CONFIG_KALLSYMS */

/*
 * Take @task's exec_update_lock for reading and verify that the caller
 * passes the PTRACE_MODE_ATTACH_FSCREDS check.
 *
 * Returns 0 with the lock held (release it with unlock_trace()).  On
 * failure the lock is not held and the return value is either the error
 * from down_read_killable() or -EPERM.
 */
static int lock_trace(struct task_struct *task)
{
        int ret = down_read_killable(&task->signal->exec_update_lock);

        if (!ret && !ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
                up_read(&task->signal->exec_update_lock);
                ret = -EPERM;
        }
        return ret;
}

/* Drop the read side of @task's exec_update_lock taken by lock_trace(). */
static void unlock_trace(struct task_struct *task)
{
        up_read(&task->signal->exec_update_lock);
}

#ifdef CONFIG_STACKTRACE

#define MAX_STACK_TRACE_DEPTH        64

/*
 * Print up to MAX_STACK_TRACE_DEPTH kernel stack entries of @task, one
 * "[<0>] symbol" line per entry.
 *
 * Returns 0 on success, -EACCES if the opener lacks CAP_SYS_ADMIN in the
 * initial user namespace, -ENOMEM if the entry buffer cannot be
 * allocated, or the error from lock_trace().
 */
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long *entries;
        unsigned int nr, idx;
        int err;

        /*
         * Racily running the kernel stack unwinder on a running task and
         * then exposing its output is scary: useful for debugging kernel
         * issues, but it can also let an attacker leak kernel stack
         * contents.  Making this at least race-free would need work to
         * guarantee the remote task cannot be scheduled, and even then the
         * unwinder would remain local attack surface.  Hence the
         * interface is restricted to root.
         */
        if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
                return -EACCES;

        entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
                                GFP_KERNEL);
        if (!entries)
                return -ENOMEM;

        err = lock_trace(task);
        if (err)
                goto out_free;

        nr = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0);
        for (idx = 0; idx < nr; idx++)
                seq_printf(m, "[<0>] %pB\n", (void *)entries[idx]);

        unlock_trace(task);
out_free:
        kfree(entries);
        return err;
}
#endif

#ifdef CONFIG_SCHED_INFO
/*
 * Provides /proc/PID/schedstat
 *
 * Prints three values: se.sum_exec_runtime, sched_info.run_delay and
 * sched_info.pcount of @task, or "0 0 0" when sched_info_on() reports
 * that scheduler info is off.  Always returns 0.
 */
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
                              struct pid *pid, struct task_struct *task)
{
        if (unlikely(!sched_info_on())) {
                seq_puts(m, "0 0 0\n");
                return 0;
        }

        seq_printf(m, "%llu %llu %lu\n",
                   (unsigned long long)task->se.sum_exec_runtime,
                   (unsigned long long)task->sched_info.run_delay,
                   task->sched_info.pcount);
        return 0;
}
#endif

#ifdef CONFIG_LATENCYTOP
/*
 * seq_file show callback: dump the latency records of the task whose
 * inode is stored in @m->private.
 *
 * After a version header, every record with a non-zero first backtrace
 * entry is printed as "count time max" followed by its backtrace symbols
 * up to the first zero entry.  Returns 0, or -ESRCH if the task can no
 * longer be looked up.
 */
static int lstats_show_proc(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *task = get_proc_task(inode);
        int rec, depth;

        if (!task)
                return -ESRCH;

        seq_puts(m, "Latency Top version : v0.1\n");

        for (rec = 0; rec < LT_SAVECOUNT; rec++) {
                struct latency_record *lr = &task->latency_record[rec];

                /* Records whose first backtrace entry is zero are skipped */
                if (!lr->backtrace[0])
                        continue;

                seq_printf(m, "%i %li %li",
                           lr->count, lr->time, lr->max);

                for (depth = 0;
                     depth < LT_BACKTRACEDEPTH && lr->backtrace[depth];
                     depth++)
                        seq_printf(m, " %ps", (void *)lr->backtrace[depth]);

                seq_putc(m, '\n');
        }

        put_task_struct(task);
        return 0;
}

/*
 * ->open handler: set up a single-record seq_file that renders via
 * lstats_show_proc(), passing @inode as its private data.
 */
static int lstats_open(struct inode *inode, struct file *file)
{
        return single_open(file, lstats_show_proc, inode);
}

/*
 * ->write handler: any write clears the latency tracing data of the task
 * behind @file.  The written bytes themselves are never looked at.
 *
 * Returns @count, or -ESRCH if the task can no longer be looked up.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
                            size_t count, loff_t *offs)
{
        struct task_struct *task;

        task = get_proc_task(file_inode(file));
        if (task) {
                clear_tsk_latency_tracing(task);
                put_task_struct(task);
                return count;
        }

        return -ESRCH;
}

/*
 * File operations for the latency statistics file: seq_file based reads
 * set up by lstats_open(), writes handled by lstats_write().
 */
static const struct file_operations proc_lstats_operations = {
        .open                = lstats_open,
        .read                = seq_read,
        .write                = lstats_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif

/*
 * Print the OOM score of @task as a single unsigned decimal line.
 *
 * The score is derived from oom_badness() over totalram_pages() plus
 * total_swap_pages.  Always returns 0.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long totalpages = totalram_pages() + total_swap_pages;
        long badness = oom_badness(task, totalpages);
        unsigned long points;

        /*
         * Special case OOM_SCORE_ADJ_MIN (badness == LONG_MIN reports 0);
         * for all others scale the badness value into the [0, 2000] range
         * which we have been exporting for a long time so userspace might
         * depend on it.
         */
        if (badness == LONG_MIN)
                points = 0;
        else
                points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;

        seq_printf(m, "%lu\n", points);
        return 0;
}

struct limit_names {
        const char *name;
        const char *unit;
};

/*
 * Per-limit display names and units, indexed by RLIMIT_* number.
 * Entries with a NULL unit get an empty "Units" column.
 */
static const struct limit_names lnames[RLIM_NLIMITS] = {
        [RLIMIT_CPU] = {"Max cpu time", "seconds"},
        [RLIMIT_FSIZE] = {"Max file size", "bytes"},
        [RLIMIT_DATA] = {"Max data size", "bytes"},
        [RLIMIT_STACK] = {"Max stack size", "bytes"},
        [RLIMIT_CORE] = {"Max core file size", "bytes"},
        [RLIMIT_RSS] = {"Max resident set", "bytes"},
        [RLIMIT_NPROC] = {"Max processes", "processes"},
        [RLIMIT_NOFILE] = {"Max open files", "files"},
        [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
        [RLIMIT_AS] = {"Max address space", "bytes"},
        [RLIMIT_LOCKS] = {"Max file locks", "locks"},
        [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
        [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
        [RLIMIT_NICE] = {"Max nice priority", NULL},
        [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
        [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};

/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
                           struct pid *pid, struct task_struct *task)
{
        unsigned int i;
        unsigned long flags;

        struct rlimit rlim[RLIM_NLIMITS];

        if (!lock_task_sighand(task, &flags))
                return 0;
        memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
        unlock_task_sighand(task, &flags);

        /*
         * print the file header
         */
        seq_puts(m, "Limit                     "
                "Soft Limit           "
                "Hard Limit           "
                "Units     \n");

        for (i = 0; i < RLIM_NLIMITS; i++) {
                if (rlim[i].rlim_cur == RLIM_INFINITY)
                        seq_printf(m, "%-25s %-20s ",
                                   lnames[i].name, "unlimited");
                else
                        seq_printf(m, "%-25s %-20lu ",
                                   lnames[i].name, rlim[i].rlim_cur);

                if (rlim[i].rlim_max == RLIM_INFINITY)
                        seq_printf(m, "%-20s ", "unlimited");
                else
                        seq_printf(m, "%-20lu ", rlim[i].rlim_max);

                if (lnames[i].unit)
                        seq_printf(m, "%-10s\n", lnames[i].unit);
                else
                        seq_putc(m, '\n');
        }

        return 0;
}

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * Print what task_current_syscall() reports for @task:
 *   - "running" if task_current_syscall() returns non-zero,
 *   - number, stack pointer and instruction pointer if the reported
 *     syscall number is negative,
 *   - otherwise number, six arguments, stack pointer and instruction
 *     pointer.
 *
 * Returns 0, or the error from lock_trace().
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *task)
{
        struct syscall_info info;
        u64 *a = &info.data.args[0];
        int err;

        err = lock_trace(task);
        if (err)
                return err;

        if (task_current_syscall(task, &info)) {
                seq_puts(m, "running\n");
        } else if (info.data.nr < 0) {
                seq_printf(m, "%d 0x%llx 0x%llx\n",
                           info.data.nr, info.sp,
                           info.data.instruction_pointer);
        } else {
                seq_printf(m,
                       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
                       info.data.nr,
                       a[0], a[1], a[2], a[3], a[4], a[5],
                       info.sp, info.data.instruction_pointer);
        }

        unlock_trace(task);
        return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */

/************************************************************************/
/*                       Here the fs part begins                        */
/************************************************************************/

/* permission checks */
/*
 * Decide whether the caller may look at the file descriptors of the task
 * behind @inode.  Access is granted when the PTRACE_MODE_READ_FSCREDS
 * check against that task passes.
 *
 * Returns non-zero if allowed, 0 if denied or the task is gone.
 */
static int proc_fd_access_allowed(struct inode *inode)
{
        struct task_struct *task = get_proc_task(inode);
        int allowed;

        if (!task)
                return 0;

        allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
        put_task_struct(task);
        return allowed;
}

/*
 * ->setattr for proc inodes.  Mode changes are refused outright; other
 * attribute changes are validated by setattr_prepare() and then applied
 * to the in-core inode.
 *
 * Returns 0 on success, -EPERM for ATTR_MODE, or the error from
 * setattr_prepare().
 */
int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int err;

        if (attr->ia_valid & ATTR_MODE)
                return -EPERM;

        err = setattr_prepare(dentry, attr);
        if (!err) {
                setattr_copy(inode, attr);
                mark_inode_dirty(inode);
        }
        return err;
}

/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 *
 * Returns true without a ptrace check when the mount's hide_pid level is
 * below @hide_pid_min or the caller is in the mount's pid_gid group.  In
 * every other case the answer is the PTRACE_MODE_READ_FSCREDS check.
 */
static bool has_pid_permissions(struct proc_fs_info *fs_info,
                                 struct task_struct *task,
                                 enum proc_hidepid hide_pid_min)
{
        /*
         * With the HIDEPID_NOT_PTRACEABLE 'hidepid' mount option the two
         * shortcuts below are skipped and the ptrace check is forced.
         */
        if (fs_info->hide_pid != HIDEPID_NOT_PTRACEABLE) {
                if (fs_info->hide_pid < hide_pid_min ||
                    in_group_p(fs_info->pid_gid))
                        return true;
        }

        /*
         * PTRACE_MODE_READ_FSCREDS indicates that the check is made on
         * behalf of a filesystem syscall.
         */
        return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}


/*
 * ->permission for per-task inodes: apply the mount's hide_pid policy
 * and then fall through to generic_permission().
 *
 * Returns -ESRCH if the task is gone, -ENOENT or -EPERM if the policy
 * denies access, otherwise the result of generic_permission().
 */
static int proc_pid_permission(struct inode *inode, int mask)
{
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct task_struct *task = get_proc_task(inode);
        bool allowed;

        if (!task)
                return -ESRCH;

        allowed = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
        put_task_struct(task);

        if (allowed)
                return generic_permission(inode, mask);

        /*
         * Let's make getdents(), stat(), and open() consistent with each
         * other.  If a process may not stat() a file, it shouldn't be
         * seen in procfs at all, so HIDEPID_INVISIBLE reports -ENOENT.
         */
        return fs_info->hide_pid == HIDEPID_INVISIBLE ? -ENOENT : -EPERM;
}



/* Inode operations that only override ->setattr, using proc_setattr(). */
static const struct inode_operations proc_def_inode_operations = {
        .setattr        = proc_setattr,
};

/*
 * seq_file show callback shared by the single-record per-task files.
 *
 * Looks up the task for the inode stored in @m->private and hands it to
 * the proc_show operation recorded in the proc inode.  Returns that
 * operation's result, or -ESRCH if the task is gone.
 */
static int proc_single_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        struct pid *pid = proc_pid(inode);
        struct task_struct *task = get_pid_task(pid, PIDTYPE_PID);
        int ret = -ESRCH;

        if (task) {
                ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
                put_task_struct(task);
        }
        return ret;
}

/*
 * ->open handler: set up a single-record seq_file that renders via
 * proc_single_show(), passing @inode as its private data.
 */
static int proc_single_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, proc_single_show, inode);
}

/* Read-only seq_file operations for files rendered by proc_single_show(). */
static const struct file_operations proc_single_file_operations = {
        .open                = proc_single_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = single_release,
};


/*
 * Look up the mm of the task behind @inode after an mm_access() check
 * with @mode | PTRACE_MODE_FSCREDS.
 *
 * On success the returned mm is held via mmgrab() only; callers in this
 * file release it with mmdrop().  Returns ERR_PTR(-ESRCH) if the task is
 * gone, or whatever mm_access() returned when that is NULL or an error
 * pointer.
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
        struct task_struct *task = get_proc_task(inode);
        struct mm_struct *mm;

        if (!task)
                return ERR_PTR(-ESRCH);

        mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
        put_task_struct(task);

        if (IS_ERR_OR_NULL(mm))
                return mm;

        /*
         * Ensure this mm_struct can't be freed (mmgrab), but do not pin
         * its memory (mmput).
         */
        mmgrab(mm);
        mmput(mm);

        return mm;
}

/*
 * Common open helper: stash the mm returned by proc_mem_open() in
 * @file->private_data.  A NULL mm is stored as is.
 *
 * Returns 0, or the error encoded in proc_mem_open()'s result.
 */
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
        struct mm_struct *mm = proc_mem_open(inode, mode);

        if (!IS_ERR(mm)) {
                file->private_data = mm;
                return 0;
        }

        return PTR_ERR(mm);
}

/*
 * ->open handler that attaches the task's mm with PTRACE_MODE_ATTACH
 * access and marks the file as using unsigned offsets.
 *
 * Returns the result of __mem_open().  The f_mode flag is set whether or
 * not the open succeeded.
 */
static int mem_open(struct inode *inode, struct file *file)
{
        int err;

        err = __mem_open(inode, file, PTRACE_MODE_ATTACH);

        /* OK to pass negative loff_t, we can catch out-of-range */
        file->f_mode |= FMODE_UNSIGNED_OFFSET;

        return err;
}

/*
 * Decide whether accesses through @file may use FOLL_FORCE, based on
 * proc_mem_force_override:
 *   - PROC_MEM_FORCE_NEVER:  never;
 *   - PROC_MEM_FORCE_PTRACE: only if the task behind @file has ptrace
 *     state set, still uses @mm, and its parent is the current task;
 *   - anything else:         always.
 */
static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
{
        struct task_struct *task;
        bool traced_by_us;

        switch (proc_mem_force_override) {
        case PROC_MEM_FORCE_NEVER:
                return false;
        case PROC_MEM_FORCE_PTRACE:
                break;
        default:
                return true;
        }

        task = get_proc_task(file_inode(file));
        if (!task)
                return false;

        traced_by_us = READ_ONCE(task->ptrace) &&
                       READ_ONCE(task->mm) == mm &&
                       READ_ONCE(task->parent) == current;
        put_task_struct(task);

        return traced_by_us;
}

/*
 * Transfer up to @count bytes between @buf and the mm stored in
 * @file->private_data, at address *@ppos, one page-sized chunk at a time
 * through a bounce page.  @write selects the direction.
 *
 * *@ppos is advanced past the bytes transferred.  Returns the number of
 * bytes transferred; 0 if there is no mm or its users are gone; -ENOMEM
 * if the bounce page cannot be allocated; -EIO if the very first chunk
 * could not be accessed; -EFAULT if a copy to or from @buf faults.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos, int write)
{
        struct mm_struct *mm = file->private_data;
        unsigned long addr = *ppos;
        unsigned int gup_flags;
        ssize_t total = 0;
        char *bounce;

        if (!mm)
                return 0;

        bounce = (char *)__get_free_page(GFP_KERNEL);
        if (!bounce)
                return -ENOMEM;

        if (!mmget_not_zero(mm))
                goto out_free;

        gup_flags = write ? FOLL_WRITE : 0;
        if (proc_mem_foll_force(file, mm))
                gup_flags |= FOLL_FORCE;

        while (count > 0) {
                size_t chunk = min_t(size_t, count, PAGE_SIZE);

                if (write && copy_from_user(bounce, buf, chunk)) {
                        total = -EFAULT;
                        break;
                }

                /* From here on, chunk is what was actually transferred */
                chunk = access_remote_vm(mm, addr, bounce, chunk, gup_flags);
                if (!chunk) {
                        /* Keep a partial count; error only if nothing moved */
                        if (!total)
                                total = -EIO;
                        break;
                }

                if (!write && copy_to_user(buf, bounce, chunk)) {
                        total = -EFAULT;
                        break;
                }

                buf += chunk;
                addr += chunk;
                total += chunk;
                count -= chunk;
        }
        *ppos = addr;

        mmput(mm);
out_free:
        free_page((unsigned long) bounce);
        return total;
}

/* ->read handler: mem_rw() in the read direction (write == 0). */
static ssize_t mem_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        return mem_rw(file, buf, count, ppos, 0);
}

/*
 * ->write handler: mem_rw() in the write direction (write == 1).  The
 * const qualifier is cast away only to match mem_rw()'s shared signature.
 */
static ssize_t mem_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos)
{
        return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

/*
 * ->llseek handler that accepts any offset value.
 *
 * @orig 0 sets the position to @offset, @orig 1 adds @offset to the
 * current position; every other @orig yields -EINVAL.  Returns the new
 * file position after calling force_successful_syscall_return().
 */
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
        if (orig == 0)
                file->f_pos = offset;
        else if (orig == 1)
                file->f_pos += offset;
        else
                return -EINVAL;

        force_successful_syscall_return();
        return file->f_pos;
}

/*
 * ->release handler: drop the mm reference kept in @file->private_data,
 * if there is one.  Always returns 0.
 */
static int mem_release(struct inode *inode, struct file *file)
{
        struct mm_struct *mm = file->private_data;

        if (!mm)
                return 0;

        mmdrop(mm);
        return 0;
}

/*
 * File operations for direct access to a task's memory: opened by
 * mem_open(), read and written through mem_rw(), released by
 * mem_release().
 */
static const struct file_operations proc_mem_operations = {
        .llseek                = mem_lseek,
        .read                = mem_read,
        .write                = mem_write,
        .open                = mem_open,
        .release        = mem_release,
};

/* ->open handler: attach the task's mm, checked with PTRACE_MODE_READ. */
static int environ_open(struct inode *inode, struct file *file)
{
        return __mem_open(inode, file, PTRACE_MODE_READ);
}

/*
 * ->read handler: copy up to @count bytes of the environment area
 * [env_start, env_end) of the mm stored in @file->private_data to @buf,
 * starting *@ppos bytes into that area.
 *
 * *@ppos is advanced past the bytes copied.  Returns the number of bytes
 * copied; 0 at the end of the area, when there is no mm or environment
 * yet, or when the mm's users are gone; -ENOMEM if the bounce page
 * cannot be allocated; -EFAULT if a copy to @buf faults.
 *
 * If the remote access fails after some chunks were already copied, the
 * count of bytes copied so far is returned (a short read) rather than 0,
 * so the caller does not mistake delivered data for end-of-file.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        char *page;
        unsigned long src = *ppos;
        int ret = 0;
        struct mm_struct *mm = file->private_data;
        unsigned long env_start, env_end;

        /* Ensure the process spawned far enough to have an environment. */
        if (!mm || !mm->env_end)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!mmget_not_zero(mm))
                goto free;

        /* Take a consistent snapshot of the environment boundaries */
        spin_lock(&mm->arg_lock);
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        while (count > 0) {
                size_t this_len, max_len;
                int retval;

                if (src >= (env_end - env_start))
                        break;

                this_len = env_end - (env_start + src);

                max_len = min_t(size_t, PAGE_SIZE, count);
                this_len = min(max_len, this_len);

                retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);

                if (retval <= 0) {
                        /* Do not discard bytes already handed to the caller */
                        if (!ret)
                                ret = retval;
                        break;
                }

                if (copy_to_user(buf, page, retval)) {
                        ret = -EFAULT;
                        break;
                }

                ret += retval;
                src += retval;
                buf += retval;
                count -= retval;
        }
        *ppos = src;
        mmput(mm);

free:
        free_page((unsigned long) page);
        return ret;
}

/*
 * Read-only file operations for a task's environment area: opened by
 * environ_open(), read by environ_read(), released by mem_release().
 */
static const struct file_operations proc_environ_operations = {
        .open                = environ_open,
        .read                = environ_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/*
 * ->open handler: attach the task's mm, checked with
 * PTRACE_MODE_READ_FSCREDS.
 */
static int auxv_open(struct inode *inode, struct file *file)
{
        return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
}

/*
 * ->read handler: expose mm->saved_auxv of the mm stored in
 * @file->private_data, up to and including the terminating entry.
 *
 * Returns the result of simple_read_from_buffer(), or 0 if there is no
 * mm.
 */
static ssize_t auxv_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        struct mm_struct *mm = file->private_data;
        unsigned int nwords = 0;

        if (!mm)
                return 0;

        /* Step over two-word entries until one starts with 0 (AT_NULL) */
        while (mm->saved_auxv[nwords] != 0)
                nwords += 2;

        /* The terminating entry itself is part of the output */
        nwords += 2;

        return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
                                       nwords * sizeof(mm->saved_auxv[0]));
}

/*
 * Read-only file operations for a task's saved auxiliary vector: opened
 * by auxv_open(), read by auxv_read(), released by mem_release().
 */
static const struct file_operations proc_auxv_operations = {
        .open                = auxv_open,
        .read                = auxv_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/*
 * ->read handler for the legacy oom_adj value.
 *
 * Converts the task's oom_score_adj back to the oom_adj scale
 * (OOM_SCORE_ADJ_MAX maps to OOM_ADJUST_MAX, everything else is scaled
 * by -OOM_DISABLE / OOM_SCORE_ADJ_MAX and capped at OOM_ADJUST_MAX) and
 * returns it as a decimal line.  Returns -ESRCH if the task is gone.
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        int oom_adj;
        size_t len;

        if (!task)
                return -ESRCH;

        oom_adj = task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX ?
                  OOM_ADJUST_MAX :
                  (task->signal->oom_score_adj * -OOM_DISABLE) /
                  OOM_SCORE_ADJ_MAX;
        put_task_struct(task);

        if (oom_adj > OOM_ADJUST_MAX)
                oom_adj = OOM_ADJUST_MAX;

        len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * Store @oom_adj (already on the oom_score_adj scale) as the
 * oom_score_adj of the task behind @file and, when its mm is flagged
 * MMF_MULTIPROCESS, of every other process sharing that mm.
 *
 * @legacy selects the permission rule: the legacy path refuses to lower
 * the value below the current oom_score_adj without CAP_SYS_RESOURCE,
 * the other path refuses to go below oom_score_adj_min without it.  On
 * the non-legacy path a caller with CAP_SYS_RESOURCE also updates
 * oom_score_adj_min.
 *
 * Everything runs under oom_adj_mutex.  Returns 0 on success, -ESRCH if
 * the task is gone, or -EACCES if the permission rule is violated.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        int err = 0;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        mutex_lock(&oom_adj_mutex);
        if (legacy) {
                if (oom_adj < task->signal->oom_score_adj &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
                /*
                 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
                 * /proc/pid/oom_score_adj instead.
                 */
                pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
                          current->comm, task_pid_nr(current), task_pid_nr(task),
                          task_pid_nr(task));
        } else {
                if ((short)oom_adj < task->signal->oom_score_adj_min &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
        }

        /*
         * Make sure we will check other processes sharing the mm if this is
         * not vfork which wants its own oom_score_adj.
         * pin the mm so it doesn't go away and get reused after task_unlock
         */
        if (!task->vfork_done) {
                struct task_struct *p = find_lock_task_mm(task);

                if (p) {
                        if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
                                mm = p->mm;
                                mmgrab(mm);
                        }
                        task_unlock(p);
                }
        }

        task->signal->oom_score_adj = oom_adj;
        if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = (short)oom_adj;
        trace_oom_score_adj_update(task);

        /* mm is only set when the mm was found to be MMF_MULTIPROCESS */
        if (mm) {
                struct task_struct *p;

                rcu_read_lock();
                for_each_process(p) {
                        /* The target's own thread group was updated above */
                        if (same_thread_group(task, p))
                                continue;

                        /* do not touch kernel threads or the global init */
                        if (p->flags & PF_KTHREAD || is_global_init(p))
                                continue;

                        task_lock(p);
                        if (!p->vfork_done && process_shares_mm(p, mm)) {
                                p->signal->oom_score_adj = oom_adj;
                                if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                                        p->signal->oom_score_adj_min = (short)oom_adj;
                        }
                        task_unlock(p);
                }
                rcu_read_unlock();
                /* Drop the reference taken by mmgrab() above */
                mmdrop(mm);
        }
err_unlock:
        mutex_unlock(&oom_adj_mutex);
        put_task_struct(task);
        return err;
}

/*
 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
 * kernels.  The effective policy is defined by oom_score_adj, which has a
 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
 * Processes that become oom disabled via oom_adj will still be oom disabled
 * with this implementation.
 *
 * oom_adj cannot be removed since existing userspace binaries use it.
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        char buffer[PROC_NUMBUF];
        bool out_of_range;
        int oom_adj;
        int err;

        /* Copy at most PROC_NUMBUF - 1 bytes so the buffer stays terminated */
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(buffer), 0, &oom_adj);
        if (err)
                return err < 0 ? err : count;

        /* OOM_DISABLE is accepted even though it is outside the range */
        out_of_range = oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX;
        if (out_of_range && oom_adj != OOM_DISABLE)
                return -EINVAL;

        /*
         * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
         * value is always attainable.
         */
        if (oom_adj == OOM_ADJUST_MAX)
                oom_adj = OOM_SCORE_ADJ_MAX;
        else
                oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

        err = __set_oom_adj(file, oom_adj, true);
        return err < 0 ? err : count;
}

/* File operations for the legacy oom_adj file (read and write). */
static const struct file_operations proc_oom_adj_operations = {
        .read                = oom_adj_read,
        .write                = oom_adj_write,
        .llseek                = generic_file_llseek,
};

/*
 * ->read handler: return the oom_score_adj of the task behind @file as a
 * decimal line.  Returns -ESRCH if the task is gone.
 */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        short score;
        size_t len;

        if (!task)
                return -ESRCH;

        /* Sample the value, then let go of the task before formatting */
        score = task->signal->oom_score_adj;
        put_task_struct(task);

        len = snprintf(buffer, sizeof(buffer), "%hd\n", score);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * ->write handler: parse an integer from @buf and apply it as the
 * oom_score_adj of the task behind @file via __set_oom_adj().
 *
 * At most PROC_NUMBUF - 1 bytes of input are considered.  Returns the
 * number of bytes consumed, -EFAULT if @buf cannot be read, the
 * kstrtoint() error for malformed input, -EINVAL for values outside
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX], or the error from
 * __set_oom_adj().
 */
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
{
        char buffer[PROC_NUMBUF];
        int oom_score_adj;
        int err;

        /* Zero-fill first so the copied text is always NUL terminated */
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
        if (err)
                return err < 0 ? err : count;

        if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
            oom_score_adj > OOM_SCORE_ADJ_MAX)
                return -EINVAL;

        err = __set_oom_adj(file, oom_score_adj, false);
        return err < 0 ? err : count;
}

/* File operations for the oom_score_adj file (read and write). */
static const struct file_operations proc_oom_score_adj_operations = {
        .read                = oom_score_adj_read,
        .write                = oom_score_adj_write,
        .llseek                = default_llseek,
};

#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
/*
 * Read handler for the loginuid file.
 *
 * Prints the task's audit login uid, mapped through the user namespace
 * of file->f_cred, as an unsigned decimal without a trailing newline.
 * Returns -ESRCH when no task can be obtained for the inode.
 */
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        char text[TMPBUFLEN];
        struct task_struct *tsk;
        ssize_t nbytes;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        nbytes = scnprintf(text, sizeof(text), "%u",
                           from_kuid(file->f_cred->user_ns,
                                     audit_get_loginuid(tsk)));
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, nbytes);
}

/*
 * Write handler for the loginuid file.
 *
 * Only the task the file belongs to may write it, kernel threads are
 * refused, and the write must start at offset 0.  The input is parsed
 * as a base-10 u32.  AUDIT_UID_UNSET requests an unset login uid; any
 * other value must map to a valid kuid in the user namespace of
 * file->f_cred.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *node = file_inode(file);
        kuid_t kloginuid = INVALID_UID;
        uid_t loginuid;
        int is_self;
        int rv;

        /* Kernel threads are not allowed to set a loginuid. */
        if (current->flags & PF_KTHREAD)
                return -EPERM;

        /* The pid -> task lookup must happen under RCU. */
        rcu_read_lock();
        is_self = (current == pid_task(proc_pid(node), PIDTYPE_PID));
        rcu_read_unlock();
        if (!is_self)
                return -EPERM;

        /* Partial writes are not supported. */
        if (*ppos != 0)
                return -EINVAL;

        rv = kstrtou32_from_user(buf, count, 10, &loginuid);
        if (rv < 0)
                return rv;

        /*
         * AUDIT_UID_UNSET is an explicit request to unset the loginuid and
         * keeps INVALID_UID; everything else has to translate cleanly.
         */
        if (loginuid != AUDIT_UID_UNSET) {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
                if (!uid_valid(kloginuid))
                        return -EINVAL;
        }

        rv = audit_set_loginuid(kloginuid);
        if (rv < 0)
                return rv;

        return count;
}

/*
 * File operations for the loginuid file: proc_loginuid_read() and
 * proc_loginuid_write(), with generic_file_llseek() for seeking.
 */
static const struct file_operations proc_loginuid_operations = {
        .read                = proc_loginuid_read,
        .write                = proc_loginuid_write,
        .llseek                = generic_file_llseek,
};

/*
 * Read handler for the sessionid file.
 *
 * Prints the value of audit_get_sessionid() for the task as an unsigned
 * decimal without a trailing newline.  Returns -ESRCH when no task can
 * be obtained for the inode.
 */
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        char text[TMPBUFLEN];
        struct task_struct *tsk;
        ssize_t nbytes;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        nbytes = scnprintf(text, sizeof(text), "%u", audit_get_sessionid(tsk));
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, nbytes);
}

/*
 * File operations for the sessionid file.  No .write handler is
 * installed; only proc_sessionid_read() and generic_file_llseek().
 */
static const struct file_operations proc_sessionid_operations = {
        .read                = proc_sessionid_read,
        .llseek                = generic_file_llseek,
};
#endif

#ifdef CONFIG_FAULT_INJECTION
/*
 * Read handler for the per-task fault-injection switch.
 *
 * Prints task->make_it_fail as a signed decimal followed by a newline.
 * Returns -ESRCH when no task can be obtained for the inode.
 */
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
                                      size_t count, loff_t *ppos)
{
        char text[PROC_NUMBUF];
        struct task_struct *tsk;
        size_t nbytes;
        int flag;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        /* Copy the value out so the reference can be dropped immediately. */
        flag = tsk->make_it_fail;
        put_task_struct(tsk);

        nbytes = snprintf(text, sizeof(text), "%i\n", flag);
        return simple_read_from_buffer(buf, count, ppos, text, nbytes);
}

/*
 * Write handler for the per-task fault-injection switch.
 *
 * Requires CAP_SYS_RESOURCE.  At most PROC_NUMBUF - 1 bytes are parsed
 * as an integer (base 0); only 0 and 1 are accepted.  The value is
 * stored in task->make_it_fail.
 *
 * Returns the (possibly clamped) byte count on success, -EPERM, -EFAULT,
 * -EINVAL, -ESRCH or the kstrtoint() error on failure.
 */
static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
{
        char kbuf[PROC_NUMBUF];
        const size_t limit = sizeof(kbuf) - 1;
        struct task_struct *tsk;
        int value;
        int rv;

        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;

        /* Pre-zero the buffer so the copied text is always terminated. */
        memset(kbuf, 0, sizeof(kbuf));
        if (count > limit)
                count = limit;
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;

        rv = kstrtoint(strstrip(kbuf), 0, &value);
        if (rv < 0)
                return rv;
        if (value != 0 && value != 1)
                return -EINVAL;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;
        tsk->make_it_fail = value;
        put_task_struct(tsk);

        return count;
}

/*
 * File operations for the fault-injection switch file:
 * proc_fault_inject_read()/proc_fault_inject_write(), with
 * generic_file_llseek() for seeking.
 */
static const struct file_operations proc_fault_inject_operations = {
        .read                = proc_fault_inject_read,
        .write                = proc_fault_inject_write,
        .llseek                = generic_file_llseek,
};

/*
 * Write handler for the fail-nth file.
 *
 * Parses the user buffer as an unsigned int (base 0) and stores it in
 * task->fail_nth.  Returns @count on success, the parse error, or
 * -ESRCH when no task can be obtained for the inode.
 */
static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
                                   size_t count, loff_t *ppos)
{
        struct task_struct *tsk;
        unsigned int nth;
        int rc;

        /* Parse before taking a task reference; nothing to undo on error. */
        rc = kstrtouint_from_user(buf, count, 0, &nth);
        if (rc)
                return rc;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;
        tsk->fail_nth = nth;
        put_task_struct(tsk);

        return count;
}

/*
 * Read handler for the fail-nth file.
 *
 * Prints task->fail_nth as an unsigned decimal followed by a newline.
 * Returns -ESRCH when no task can be obtained for the inode.
 */
static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        ssize_t nbytes;

        if (!tsk)
                return -ESRCH;

        /* Format while the task reference is still held. */
        nbytes = snprintf(text, sizeof(text), "%u\n", tsk->fail_nth);
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, nbytes);
}

/*
 * File operations for the fail-nth file: proc_fail_nth_read() and
 * proc_fail_nth_write().  No .llseek handler is set here.
 */
static const struct file_operations proc_fail_nth_operations = {
        .read                = proc_fail_nth_read,
        .write                = proc_fail_nth_write,
};
#endif


#ifdef CONFIG_SCHED_DEBUG
/*
 * seq_file show callback for the per-task sched file.
 *
 * m->private carries the proc inode; the task behind it is handed to
 * proc_sched_show_task() together with the pid namespace of the inode's
 * superblock.  Returns -ESRCH when no task can be obtained.
 */
static int sched_show(struct seq_file *m, void *v)
{
        struct inode *node = m->private;
        struct task_struct *tsk = get_proc_task(node);

        if (!tsk)
                return -ESRCH;

        proc_sched_show_task(tsk, proc_pid_ns(node->i_sb), m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler for the per-task sched file.
 *
 * The written bytes are never inspected: any write simply invokes
 * proc_sched_set_task() on the task and reports @count as consumed.
 * Returns -ESRCH when no task can be obtained for the inode.
 */
static ssize_t
sched_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));

        if (!tsk)
                return -ESRCH;

        proc_sched_set_task(tsk);
        put_task_struct(tsk);

        return count;
}

/*
 * Open handler for the per-task sched file: set up a single_open()
 * seq_file that renders through sched_show(), passing the inode as the
 * private data sched_show() reads back from m->private.
 */
static int sched_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_show, inode);
}

/*
 * File operations for the per-task sched file: a single_open() based
 * seq_file (sched_open/seq_read/seq_lseek/single_release) with
 * sched_write() as the write handler.
 */
static const struct file_operations proc_pid_sched_operations = {
        .open                = sched_open,
        .read                = seq_read,
        .write                = sched_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif

#ifdef CONFIG_SCHED_AUTOGROUP
/*
 * seq_file show callback for the per-task autogroup file.
 *
 * m->private carries the proc inode; the task behind it is passed to
 * proc_sched_autogroup_show_task().  Returns -ESRCH when no task can be
 * obtained.
 */
static int sched_autogroup_show(struct seq_file *m, void *v)
{
        struct inode *node = m->private;
        struct task_struct *tsk = get_proc_task(node);

        if (!tsk)
                return -ESRCH;

        proc_sched_autogroup_show_task(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler for the per-task autogroup file.
 *
 * At most PROC_NUMBUF - 1 bytes are parsed as an integer (base 0) and
 * passed to proc_sched_autogroup_set_nice() for the task.
 *
 * Returns the (possibly clamped) byte count on success; otherwise
 * -EFAULT, the kstrtoint() error, -ESRCH, or whatever non-zero value
 * proc_sched_autogroup_set_nice() returned.
 */
static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        char kbuf[PROC_NUMBUF];
        const size_t limit = sizeof(kbuf) - 1;
        struct task_struct *tsk;
        int nice;
        int err;

        /* Pre-zero the buffer so the copied text is always terminated. */
        memset(kbuf, 0, sizeof(kbuf));
        if (count > limit)
                count = limit;
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(kbuf), 0, &nice);
        if (err < 0)
                return err;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        err = proc_sched_autogroup_set_nice(tsk, nice);
        put_task_struct(tsk);
        if (err)
                return err;

        return count;
}

/*
 * Open handler for the per-task autogroup file.
 *
 * Creates the single_open() seq_file with no private data, then stores
 * the inode in the seq_file's ->private so sched_autogroup_show() can
 * find the task.  Returns the single_open() result.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
        struct seq_file *sf;
        int rc;

        rc = single_open(filp, sched_autogroup_show, NULL);
        if (rc)
                return rc;

        sf = filp->private_data;
        sf->private = inode;

        return rc;
}

/*
 * File operations for the per-task autogroup file: a single_open() based
 * seq_file with sched_autogroup_write() as the write handler.
 */
static const struct file_operations proc_pid_sched_autogroup_operations = {
        .open                = sched_autogroup_open,
        .read                = seq_read,
        .write                = sched_autogroup_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif /* CONFIG_SCHED_AUTOGROUP */

#ifdef CONFIG_TIME_NS
/*
 * seq_file show callback for the timens_offsets file.
 *
 * The task is looked up from the inode of m->file and passed to
 * proc_timens_show_offsets().  Returns -ESRCH when no task can be
 * obtained.
 */
static int timens_offsets_show(struct seq_file *m, void *v)
{
        struct task_struct *tsk = get_proc_task(file_inode(m->file));

        if (!tsk)
                return -ESRCH;

        proc_timens_show_offsets(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler for the timens_offsets file.
 *
 * The input is a series of lines, each "<clock> <seconds> <nanoseconds>".
 * <clock> is either "monotonic"/"boottime" or the stringified value of
 * CLOCK_MONOTONIC/CLOCK_BOOTTIME.  At most ARRAY_SIZE(offsets) entries
 * are parsed; the collected offsets are then handed to
 * proc_timens_set_offset() for the task.
 *
 * Returns the number of bytes consumed on success (which may be less
 * than @count when input remains after the last accepted entry), or a
 * negative errno: -EINVAL for a bad offset/size or malformed line,
 * the memdup_user_nul() error, -ESRCH when the task is gone, or the
 * proc_timens_set_offset() error.
 */
static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
                                    size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct proc_timens_offset offsets[2];
        char *kbuf = NULL, *pos, *next_line;
        struct task_struct *p;
        int ret, noffsets;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* Parse the user data */
        ret = -EINVAL;
        noffsets = 0;
        for (pos = kbuf; pos; pos = next_line) {
                struct proc_timens_offset *off = &offsets[noffsets];
                char clock[10];
                int err;

                /* Find the end of line and ensure we don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline ends the input: stop iterating. */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* All three fields are mandatory; nanoseconds must be < 1s. */
                err = sscanf(pos, "%9s %lld %lu", clock,
                                &off->val.tv_sec, &off->val.tv_nsec);
                if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
                        goto out;

                clock[sizeof(clock) - 1] = 0;
                /* Accept the clock by name or by its numeric id, as text. */
                if (strcmp(clock, "monotonic") == 0 ||
                    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
                        off->clockid = CLOCK_MONOTONIC;
                else if (strcmp(clock, "boottime") == 0 ||
                         strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
                        off->clockid = CLOCK_BOOTTIME;
                else
                        goto out;

                noffsets++;
                /*
                 * The array is full: report only the bytes consumed so far
                 * when more input follows, and stop parsing.
                 */
                if (noffsets == ARRAY_SIZE(offsets)) {
                        if (next_line)
                                count = next_line - kbuf;
                        break;
                }
        }

        ret = -ESRCH;
        p = get_proc_task(inode);
        if (!p)
                goto out;
        ret = proc_timens_set_offset(file, p, offsets, noffsets);
        put_task_struct(p);
        if (ret)
                goto out;

        ret = count;
out:
        /* Single exit so the kernel copy of the input is always freed. */
        kfree(kbuf);
        return ret;
}

/*
 * Open handler for the timens_offsets file: set up a single_open()
 * seq_file that renders through timens_offsets_show().  The inode is
 * passed as private data, although timens_offsets_show() derives the
 * inode from m->file rather than from m->private.
 */
static int timens_offsets_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, timens_offsets_show, inode);
}

/*
 * File operations for the timens_offsets file: a single_open() based
 * seq_file with timens_offsets_write() as the write handler.
 */
static const struct file_operations proc_timens_offsets_operations = {
        .open                = timens_offsets_open,
        .read                = seq_read,
        .write                = timens_offsets_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};
#endif /* CONFIG_TIME_NS */

/*
 * Write handler for the comm file.
 *
 * Copies at most TASK_COMM_LEN - 1 bytes of the user buffer into a
 * zero-filled name buffer and applies it with set_task_comm(), but only
 * when the target task is in the same thread group as the writer.
 *
 * Returns the full @count on success (even when the name was
 * truncated), -EFAULT on a failed copy, -ESRCH when no task can be
 * obtained, or -EINVAL for a task outside the caller's thread group.
 */
static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
{
        char name[TASK_COMM_LEN];
        const size_t maxlen = sizeof(name) - 1;
        size_t ncopy = count;
        struct task_struct *tsk;
        ssize_t ret = count;

        if (ncopy > maxlen)
                ncopy = maxlen;

        /* Zero-fill first so the name is terminated whatever its length. */
        memset(name, 0, sizeof(name));
        if (copy_from_user(name, buf, ncopy))
                return -EFAULT;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        if (same_thread_group(current, tsk))
                set_task_comm(tsk, name);
        else
                ret = -EINVAL;

        put_task_struct(tsk);

        return ret;
}

/*
 * seq_file show callback for the comm file.
 *
 * m->private carries the proc inode; the task name is emitted through
 * proc_task_name() followed by a newline.  Returns -ESRCH when no task
 * can be obtained.
 */
static int comm_show(struct seq_file *m, void *v)
{
        struct inode *node = m->private;
        struct task_struct *tsk = get_proc_task(node);

        if (!tsk)
                return -ESRCH;

        proc_task_name(m, tsk, false);
        seq_putc(m, '\n');
        put_task_struct(tsk);

        return 0;
}

/*
 * Open handler for the comm file: set up a single_open() seq_file that
 * renders through comm_show(), passing the inode as the private data
 * comm_show() reads back from m->private.
 */
static int comm_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, comm_show, inode);
}

/*
 * File operations for the comm file: a single_open() based seq_file with
 * comm_write() as the write handler.
 */
static const struct file_operations proc_pid_set_comm_operations = {
        .open                = comm_open,
        .read                = seq_read,
        .write                = comm_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

/*
 * Resolve the path of a task's executable file.
 *
 * On success *exe_path holds the file's path with an extra reference
 * taken by path_get(), and 0 is returned.  Returns -ENOENT when the
 * task is gone or get_task_exe_file() yields nothing.
 */
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
        struct task_struct *tsk;
        struct file *exe;

        tsk = get_proc_task(d_inode(dentry));
        if (!tsk)
                return -ENOENT;

        exe = get_task_exe_file(tsk);
        put_task_struct(tsk);
        if (!exe)
                return -ENOENT;

        /* Pin the path before releasing the file it was copied from. */
        *exe_path = exe->f_path;
        path_get(&exe->f_path);
        fput(exe);

        return 0;
}

static const char *proc_pid_get_link(struct dentry *dentry,
                                     struct inode *inode,
                                     struct delayed_call *done)
{
        struct path path;
        int error = -EACCES;

        if (!dentry)
                return ERR_PTR(-ECHILD);

        /* Are we allowed to snoop on the tasks file descriptors? */
        if (!proc_fd_access_allowed(inode))
                goto out;

        error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                goto out;

        error = nd_jump_link(&path);
out:
        return ERR_PTR(error);
}

/*
 * Render @path as text with d_path() and copy it to the user buffer.
 *
 * A temporary page is used as scratch space.  At most @buflen bytes are
 * copied; the length excludes the terminating NUL.
 *
 * Returns the number of bytes copied, -ENOMEM when the scratch page
 * cannot be allocated, the d_path() error, or -EFAULT on a failed copy.
 */
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
        char *page = (char *)__get_free_page(GFP_KERNEL);
        char *name;
        int ret;

        if (!page)
                return -ENOMEM;

        name = d_path(path, page, PAGE_SIZE);
        if (IS_ERR(name)) {
                ret = PTR_ERR(name);
        } else {
                /*
                 * Length is measured from the returned pointer up to the
                 * last byte of the page.
                 */
                ret = page + PAGE_SIZE - 1 - name;
                if (ret > buflen)
                        ret = buflen;
                if (copy_to_user(buffer, name, ret))
                        ret = -EFAULT;
        }

        free_page((unsigned long)page);
        return ret;
}

/*
 * ->readlink() for per-task proc symlinks.
 *
 * The caller must pass proc_fd_access_allowed() (-EACCES if not).  The
 * target path comes from the inode's proc_get_link() operation and is
 * copied out by do_proc_readlink(); the path reference is dropped
 * afterwards.
 */
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
        struct inode *node = d_inode(dentry);
        struct path target;
        int ret;

        /* Are we allowed to snoop on the task's file descriptors? */
        if (!proc_fd_access_allowed(node))
                return -EACCES;

        ret = PROC_I(node)->op.proc_get_link(dentry, &target);
        if (ret)
                return ret;

        ret = do_proc_readlink(&target, buffer, buflen);
        path_put(&target);

        return ret;
}

/*
 * Inode operations for per-task proc symlinks: proc_pid_readlink() and
 * proc_pid_get_link() resolve the target, proc_setattr() handles
 * attribute changes.
 */
const struct inode_operations proc_pid_link_inode_operations = {
        .readlink        = proc_pid_readlink,
        .get_link        = proc_pid_get_link,
        .setattr        = proc_setattr,
};


/* building an inode */

/*
 * Compute which uid/gid should own a proc file that belongs to @task and
 * store the result in @ruid/@rgid.  @mode is the mode of the proc entry
 * and selects whether the dumpable state is taken into account.
 */
void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid)
{
        const struct cred *tcred;
        struct mm_struct *mm;
        kuid_t owner_uid;
        kgid_t owner_gid;

        /* Kernel threads are always reported as owned by global root. */
        if (unlikely(task->flags & PF_KTHREAD)) {
                *ruid = GLOBAL_ROOT_UID;
                *rgid = GLOBAL_ROOT_GID;
                return;
        }

        /* Start from the task's effective ownership. */
        rcu_read_lock();
        tcred = __task_cred(task);
        owner_uid = tcred->euid;
        owner_gid = tcred->egid;
        rcu_read_unlock();

        /*
         * Before /proc/pid/status existed, stat()ing /proc/pid was the only
         * way to learn a process's effective uid, and tools such as procps
         * kept doing so because reading the status file is slow.  To keep
         * the rule simple, every world readable and executable per-process
         * directory keeps the effective ids; everything else is subject to
         * the dumpable check below.
         */
        if (mode == (S_IFDIR|S_IRUGO|S_IXUGO))
                goto done;

        task_lock(task);
        mm = task->mm;
        if (!mm) {
                /* No mm: hand the file to global root. */
                owner_uid = GLOBAL_ROOT_UID;
                owner_gid = GLOBAL_ROOT_GID;
        } else if (get_dumpable(mm) != SUID_DUMP_USER) {
                /*
                 * Non-dumpable tasks are owned by root of the mm's user
                 * namespace, falling back to global root when id 0 has
                 * no valid mapping there.
                 */
                struct user_namespace *user_ns = mm->user_ns;

                owner_uid = make_kuid(user_ns, 0);
                if (!uid_valid(owner_uid))
                        owner_uid = GLOBAL_ROOT_UID;

                owner_gid = make_kgid(user_ns, 0);
                if (!gid_valid(owner_gid))
                        owner_gid = GLOBAL_ROOT_GID;
        }
        task_unlock(task);

done:
        *ruid = owner_uid;
        *rgid = owner_gid;
}

/*
 * Tear down the pid linkage of a proc inode that is being evicted.
 *
 * Directory inodes are unlinked from the pid's sibling list under
 * pid->lock; in every case the pid reference held via ei->pid is
 * dropped.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
        struct pid *owner = ei->pid;
        int is_dir = S_ISDIR(ei->vfs_inode.i_mode);

        if (is_dir) {
                spin_lock(&owner->lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
                spin_unlock(&owner->lock);
        }

        put_pid(owner);
}

/*
 * Allocate and initialise a proc inode tied to @task.
 *
 * The inode gets @mode, a fresh inode number, the current time for all
 * three timestamps and the default proc inode operations.  A reference
 * to the task's PIDTYPE_PID pid is stored in the proc_inode, and
 * ownership/security state is derived from the task.
 *
 * Returns the new inode, or NULL when the inode or the pid cannot be
 * obtained.
 */
struct inode *proc_pid_make_inode(struct super_block *sb,
                                  struct task_struct *task, umode_t mode)
{
        struct inode *node;
        struct pid *pid;

        node = new_inode(sb);
        if (!node)
                return NULL;

        /* Common setup */
        node->i_mode = mode;
        node->i_ino = get_next_ino();
        node->i_ctime = current_time(node);
        node->i_atime = node->i_ctime;
        node->i_mtime = node->i_atime;
        node->i_op = &proc_def_inode_operations;

        /* Take the pid reference that links this inode to the task. */
        pid = get_task_pid(task, PIDTYPE_PID);
        if (!pid) {
                iput(node);
                return NULL;
        }

        /* Let the pid remember us for quick removal */
        PROC_I(node)->pid = pid;

        task_dump_owner(task, 0, &node->i_uid, &node->i_gid);
        security_task_to_inode(task, node);

        return node;
}

/*
 * Create an inode with proc_pid_make_inode() and add it to @pid->inodes,
 * so that the task can invalidate the inode's dentry before the task is
 * released.
 *
 * This helper is meant for the directory entries under '/proc' and
 * '/proc/<tgid>/task'.  Other entries under '/proc/<tgid>' (e.g. fd,
 * stat) are released when the '/proc/<tgid>' dentry is invalidated.
 * In theory the dentries under '/proc/<tgid>/task' could be released
 * the same way; they are tracked separately to cover a single thread
 * exiting, where that thread must invalidate its own
 * '/proc/<tgid>/task/<pid>' dentry before it is released.
 *
 * Returns the inode, or NULL when proc_pid_make_inode() fails.
 */
static struct inode *proc_pid_make_base_inode(struct super_block *sb,
                                struct task_struct *task, umode_t mode)
{
        struct inode *node = proc_pid_make_inode(sb, task, mode);
        struct proc_inode *ei;

        if (node) {
                /* Let proc_flush_pid find this directory inode */
                ei = PROC_I(node);
                spin_lock(&ei->pid->lock);
                hlist_add_head_rcu(&ei->sibling_inodes, &ei->pid->inodes);
                spin_unlock(&ei->pid->lock);
        }

        return node;
}

/*
 * ->getattr() for per-task proc entries.
 *
 * Fills @stat from the inode, defaults the owner to global root and,
 * when the task still exists, replaces it with the owner computed by
 * task_dump_owner().  Returns -ENOENT when the task fails the
 * HIDEPID_INVISIBLE permission check, 0 otherwise.
 */
int pid_getattr(const struct path *path, struct kstat *stat,
                u32 request_mask, unsigned int query_flags)
{
        struct inode *node = d_inode(path->dentry);
        struct proc_fs_info *fs_info = proc_sb_info(node->i_sb);
        struct task_struct *tsk;
        int ret = 0;

        generic_fillattr(node, stat);
        stat->uid = GLOBAL_ROOT_UID;
        stat->gid = GLOBAL_ROOT_GID;

        rcu_read_lock();
        tsk = pid_task(proc_pid(node), PIDTYPE_PID);
        if (tsk) {
                /*
                 * Failing here doesn't prevent learning whether the PID
                 * exists; it only keeps getattr() consistent with
                 * readdir().
                 */
                if (has_pid_permissions(fs_info, tsk, HIDEPID_INVISIBLE))
                        task_dump_owner(tsk, node->i_mode,
                                        &stat->uid, &stat->gid);
                else
                        ret = -ENOENT;
        }
        rcu_read_unlock();

        return ret;
}

/* dentry stuff */

/*
 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
 *
 * Refreshes i_uid/i_gid from @task via task_dump_owner(), clears the
 * setuid and setgid bits from i_mode, and calls
 * security_task_to_inode() for the inode.
 */
void pid_update_inode(struct task_struct *task, struct inode *inode)
{
        /* Ownership is computed from i_mode as it is before the bits are cleared. */
        task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);

        inode->i_mode &= ~(S_ISUID | S_ISGID);
        security_task_to_inode(task, inode);
}

/*
 * ->d_revalidate() for per-task proc dentries.
 *
 * The inode's ownership is rewritten here because the owning task may
 * have performed a setuid(), etc.
 *
 * Returns -ECHILD when called with LOOKUP_RCU, 1 when the task still
 * exists (after refreshing the inode), and 0 when it is gone.
 */
static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct task_struct *tsk;
        struct inode *node;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        node = d_inode(dentry);
        tsk = get_proc_task(node);
        if (!tsk)
                return 0;

        pid_update_inode(tsk, node);
        put_task_struct(tsk);

        return 1;
}

/*
 * Returns true when the pid attached to @inode no longer has any task on
 * its PIDTYPE_PID list (tasks[PIDTYPE_PID].first is NULL).
 */
static inline bool proc_inode_is_dead(struct inode *inode)
{
        return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
}

/*
 * ->d_delete() for per-task proc dentries: returns the result of
 * proc_inode_is_dead() for the dentry's inode (non-zero when the task is
 * gone).
 */
int pid_delete_dentry(const struct dentry *dentry)
{
        /* Is the task we represent dead?
         * If so, then don't put the dentry on the lru list,
         * kill it immediately.
         */
        return proc_inode_is_dead(d_inode(dentry));
}

/*
 * Dentry operations for per-task proc entries: pid_revalidate()
 * refreshes/validates the inode, pid_delete_dentry() decides whether the
 * dentry should be dropped once unused.
 */
const struct dentry_operations pid_dentry_operations =
{
        .d_revalidate        = pid_revalidate,
        .d_delete        = pid_delete_dentry,
};

/* Lookups */

/*
 * Fill a directory entry.
 *
 * If possible create the dcache entry and derive our inode number and
 * file type from the dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached.  This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
 *
 * @instantiate is called with (@task, @ptr) to build the inode for a
 * dentry that is not cached yet.  When no dentry can be obtained, the
 * entry is still emitted with inode number 1 and type DT_UNKNOWN.
 *
 * Returns the result of dir_emit().
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
        const char *name, unsigned int len,
        instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
        struct dentry *child, *dir = file->f_path.dentry;
        struct qstr qname = QSTR_INIT(name, len);
        struct inode *inode;
        /* Fallback values used when no dentry/inode is available. */
        unsigned type = DT_UNKNOWN;
        ino_t ino = 1;

        child = d_hash_and_lookup(dir, &qname);
        if (!child) {
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
                child = d_alloc_parallel(dir, &qname, &wq);
                if (IS_ERR(child))
                        goto end_instantiate;
                /* Only instantiate when this caller owns the in-lookup dentry. */
                if (d_in_lookup(child)) {
                        struct dentry *res;
                        res = instantiate(child, task, ptr);
                        d_lookup_done(child);
                        /*
                         * A non-NULL result replaces the dentry we allocated;
                         * it may also be an error pointer.
                         */
                        if (unlikely(res)) {
                                dput(child);
                                child = res;
                                if (IS_ERR(child))
                                        goto end_instantiate;
                        }
                }
        }
        inode = d_inode(child);
        ino = inode->i_ino;
        /* The file-type bits of i_mode, shifted down, give the d_type value. */
        type = inode->i_mode >> 12;
        dput(child);
end_instantiate:
        return dir_emit(ctx, name, len, ino, type);
}

/*
 * dname_to_vma_addr - parse a dentry name of the form "<start>-<end>"
 * into the two unsigned longs that represent a vma's start and end
 * addresses.
 *
 * Both numbers are hexadecimal.  A number beginning with '0' is only
 * accepted when it is the single digit "0", the whole name must be
 * consumed, and each value has to fit in an unsigned long.
 *
 * Returns 0 and fills @start/@end on success, -EINVAL otherwise (in
 * which case @start/@end are left untouched).
 */
static int dname_to_vma_addr(struct dentry *dentry,
                             unsigned long *start, unsigned long *end)
{
        const char *cur = dentry->d_name.name;
        unsigned long long lo, hi;
        unsigned int used;

        /* Start address: "0" must be followed directly by the dash. */
        if (cur[0] == '0' && cur[1] != '-')
                return -EINVAL;
        used = _parse_integer(cur, 16, &lo);
        if ((used & KSTRTOX_OVERFLOW) || lo != (unsigned long)lo)
                return -EINVAL;
        cur += used;

        /* Separator */
        if (*cur != '-')
                return -EINVAL;
        cur++;

        /* End address: "0" must be the last character of the name. */
        if (cur[0] == '0' && cur[1])
                return -EINVAL;
        used = _parse_integer(cur, 16, &hi);
        if ((used & KSTRTOX_OVERFLOW) || hi != (unsigned long)hi)
                return -EINVAL;
        cur += used;

        /* Nothing may follow the end address. */
        if (*cur != '\0')
                return -EINVAL;

        *start = lo;
        *end = hi;

        return 0;
}

/*
 * ->d_revalidate() for map_files entries.
 *
 * The dentry is valid only while the task's mm still has a vma whose
 * range exactly matches the "<start>-<end>" name.  When it does, the
 * inode's ownership and security state are refreshed.
 *
 * Returns -ECHILD under LOOKUP_RCU, 1 when the exact vma exists, the
 * mmap_read_lock_killable() error when taking the lock fails, and 0 in
 * every other case.
 */
static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        unsigned long vm_start, vm_end;
        bool found = false;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct inode *node;
        int status = 0;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        node = d_inode(dentry);
        tsk = get_proc_task(node);
        if (!tsk)
                return 0;

        mm = mm_access(tsk, PTRACE_MODE_READ_FSCREDS);
        if (!IS_ERR_OR_NULL(mm)) {
                if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
                        status = mmap_read_lock_killable(mm);
                        if (!status) {
                                found = !!find_exact_vma(mm, vm_start,
                                                         vm_end);
                                mmap_read_unlock(mm);
                        }
                }
                mmput(mm);

                if (found) {
                        task_dump_owner(tsk, 0, &node->i_uid, &node->i_gid);
                        security_task_to_inode(tsk, node);
                        status = 1;
                }
        }

        put_task_struct(tsk);

        return status;
}

/*
 * Dentry operations for map_files entries: map_files_d_revalidate()
 * checks that the named vma still exists, pid_delete_dentry() is shared
 * with the other per-task dentries.
 */
static const struct dentry_operations tid_map_files_dentry_operations = {
        .d_revalidate        = map_files_d_revalidate,
        .d_delete        = pid_delete_dentry,
};

/*
 * proc_get_link() implementation for map_files entries.
 *
 * Looks up the vma whose range exactly matches the dentry name and, if
 * it is file backed, stores that file's path (with a reference taken by
 * path_get()) in @path.
 *
 * Returns 0 on success, -ENOENT when the task, its mm, or a matching
 * file-backed vma is missing, or the error from dname_to_vma_addr() /
 * mmap_read_lock_killable().
 */
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
        unsigned long vm_start, vm_end;
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        int rc;

        tsk = get_proc_task(d_inode(dentry));
        if (!tsk)
                return -ENOENT;

        /* Only the mm is needed from here on. */
        mm = get_task_mm(tsk);
        put_task_struct(tsk);
        if (!mm)
                return -ENOENT;

        rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
        if (!rc)
                rc = mmap_read_lock_killable(mm);
        if (!rc) {
                rc = -ENOENT;
                vma = find_exact_vma(mm, vm_start, vm_end);
                if (vma && vma->vm_file) {
                        *path = vma->vm_file->f_path;
                        path_get(path);
                        rc = 0;
                }
                mmap_read_unlock(mm);
        }

        mmput(mm);

        return rc;
}

/*
 * Snapshot of one file-backed vma, collected by proc_map_files_readdir()
 * while mmap_lock is held so entries can be instantiated afterwards.
 */
struct map_files_info {
        unsigned long        start;        /* copied from vma->vm_start */
        unsigned long        end;        /* copied from vma->vm_end */
        fmode_t                mode;        /* copied from vma->vm_file->f_mode */
};

/*
 * ->get_link() for map_files symlinks.
 *
 * Only callers with CAP_SYS_ADMIN or CAP_CHECKPOINT_RESTORE may follow
 * these links, due to concerns about how the symlinks may be used to
 * bypass permissions on ancestor directories in the path to the file in
 * question.  Everyone else gets -EPERM; permitted callers go through
 * proc_pid_get_link().
 */
static const char *
proc_map_files_get_link(struct dentry *dentry,
                        struct inode *inode,
                        struct delayed_call *done)
{
        if (checkpoint_restore_ns_capable(&init_user_ns))
                return proc_pid_get_link(dentry, inode, done);

        return ERR_PTR(-EPERM);
}

/*
 * Identical to proc_pid_link_inode_operations except for get_link(),
 * which goes through the capability check in proc_map_files_get_link().
 */
static const struct inode_operations proc_map_files_link_inode_operations = {
        .readlink        = proc_pid_readlink,
        .get_link        = proc_map_files_get_link,
        .setattr        = proc_setattr,
};

static struct dentry *
proc_map_files_instantiate(struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
{
        fmode_t mode = (fmode_t)(unsigned long)ptr;
        struct proc_inode *ei;
        struct inode *inode;

        inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
                                    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
                                    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        ei->op.proc_get_link = map_files_get_link;

        inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;

        d_set_d_op(dentry, &tid_map_files_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * ->lookup() for the map_files directory.
 *
 * The name must parse as "<start>-<end>" and match a file-backed vma of
 * the task exactly; in that case the symlink is instantiated.
 *
 * Returns the instantiate result on success, otherwise an ERR_PTR():
 * -ENOENT (task or mm gone, bad name, no matching file-backed vma),
 * -EACCES (ptrace read access denied) or -EINTR (mmap_lock wait
 * failed).
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
        struct dentry *result = ERR_PTR(-ENOENT);
        unsigned long vm_start, vm_end;
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;

        tsk = get_proc_task(dir);
        if (!tsk)
                return result;

        if (!ptrace_may_access(tsk, PTRACE_MODE_READ_FSCREDS)) {
                result = ERR_PTR(-EACCES);
                goto drop_task;
        }

        /* A malformed name and a missing mm both report -ENOENT. */
        if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
                goto drop_task;

        mm = get_task_mm(tsk);
        if (!mm)
                goto drop_task;

        if (mmap_read_lock_killable(mm)) {
                result = ERR_PTR(-EINTR);
                goto drop_mm;
        }

        vma = find_exact_vma(mm, vm_start, vm_end);
        if (vma && vma->vm_file)
                result = proc_map_files_instantiate(dentry, tsk,
                                (void *)(unsigned long)vma->vm_file->f_mode);

        mmap_read_unlock(mm);
drop_mm:
        mmput(mm);
drop_task:
        put_task_struct(tsk);

        return result;
}

/*
 * Inode operations for the map_files directory: lookups go through
 * proc_map_files_lookup(), permission checks through
 * proc_fd_permission(), attribute changes through proc_setattr().
 */
static const struct inode_operations proc_map_files_inode_operations = {
        .lookup                = proc_map_files_lookup,
        .permission        = proc_fd_permission,
        .setattr        = proc_setattr,
};

/*
 * Directory iterator for map_files: emits one "%lx-%lx" (start-end) entry
 * for every vma of the task that has a backing file.
 *
 * Returns 0 on success (also when the dot entries could not be emitted or
 * the task has no mm), -ENOENT when the task is gone, -EACCES when the
 * ptrace read check fails, -ENOMEM when the snapshot array cannot grow,
 * or the error returned by mmap_read_lock_killable().
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long nr_files, pos, i;
        GENRADIX(struct map_files_info) fa;
        struct map_files_info *p;
        int ret;

        /* Initialised up front so that every exit path can genradix_free() it. */
        genradix_init(&fa);

        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out;

        ret = -EACCES;
        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out_put_task;

        ret = 0;
        if (!dir_emit_dots(file, ctx))
                goto out_put_task;

        mm = get_task_mm(task);
        if (!mm)
                goto out_put_task;

        ret = mmap_read_lock_killable(mm);
        if (ret) {
                mmput(mm);
                goto out_put_task;
        }

        nr_files = 0;

        /*
         * We need two passes here:
         *
         *  1) Collect the vmas of mapped files with mmap_lock taken
         *  2) Release mmap_lock and instantiate the entries
         *
         * Otherwise lockdep complains, since the filldir() routine
         * might need to take mmap_lock in might_fault().
         */

        /*
         * pos starts at 2, past the positions used by dir_emit_dots();
         * file-backed vmas at or below ctx->pos are skipped.
         */
        for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
                if (!vma->vm_file)
                        continue;
                if (++pos <= ctx->pos)
                        continue;

                p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
                if (!p) {
                        ret = -ENOMEM;
                        mmap_read_unlock(mm);
                        mmput(mm);
                        goto out_put_task;
                }

                p->start = vma->vm_start;
                p->end = vma->vm_end;
                p->mode = vma->vm_file->f_mode;
        }
        mmap_read_unlock(mm);
        mmput(mm);

        /* Second pass: emit the snapshot without holding mmap_lock. */
        for (i = 0; i < nr_files; i++) {
                char buf[4 * sizeof(long) + 2];        /* max: %lx-%lx\0 */
                unsigned int len;

                p = genradix_ptr(&fa, i);
                len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
                if (!proc_fill_cache(file, ctx,
                                      buf, len,
                                      proc_map_files_instantiate,
                                      task,
                                      (void *)(unsigned long)p->mode))
                        break;
                ctx->pos++;
        }

out_put_task:
        put_task_struct(task);
out:
        genradix_free(&fa);
        return ret;
}

/* File operations of the "map_files" directory listed in tgid_base_stuff[]. */
static const struct file_operations proc_map_files_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_map_files_readdir,
        .llseek                = generic_file_llseek,
};

#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
        struct pid *pid;
        struct task_struct *task;
        struct sighand_struct *sighand;
        struct pid_namespace *ns;
        unsigned long flags;
};

/*
 * seq_file ->start(): take a reference on the task behind the pid recorded
 * at open time, lock its sighand and position the iterator inside the
 * task's posix_timers list.
 *
 * Returns ERR_PTR(-ESRCH) when the task or its sighand is gone.
 * timers_stop() releases whatever was acquired here.
 */
static void *timers_start(struct seq_file *m, loff_t *pos)
{
        struct timers_private *priv = m->private;
        struct task_struct *tsk;

        tsk = get_pid_task(priv->pid, PIDTYPE_PID);
        priv->task = tsk;
        if (!tsk)
                return ERR_PTR(-ESRCH);

        priv->sighand = lock_task_sighand(tsk, &priv->flags);
        if (!priv->sighand)
                return ERR_PTR(-ESRCH);

        return seq_list_start(&tsk->signal->posix_timers, *pos);
}

/* seq_file ->next(): step to the following entry of the task's posix_timers list. */
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct timers_private *priv = m->private;
        struct list_head *timers = &priv->task->signal->posix_timers;

        return seq_list_next(v, timers, pos);
}

/*
 * seq_file ->stop(): undo timers_start().  Each resource is released only
 * if it is currently held, and its pointer is cleared afterwards.
 */
static void timers_stop(struct seq_file *m, void *v)
{
        struct timers_private *priv = m->private;

        if (priv->sighand != NULL) {
                unlock_task_sighand(priv->task, &priv->flags);
                priv->sighand = NULL;
        }

        if (priv->task == NULL)
                return;

        put_task_struct(priv->task);
        priv->task = NULL;
}

/*
 * seq_file ->show(): print one posix timer as four lines (ID, signal,
 * notify, ClockID).  @v is the list_head embedded in the k_itimer.
 */
static int show_timer(struct seq_file *m, void *v)
{
        /* Names indexed by the notify method with SIGEV_THREAD_ID masked off. */
        static const char * const notify_names[] = {
                [SIGEV_SIGNAL] = "signal",
                [SIGEV_NONE] = "none",
                [SIGEV_THREAD] = "thread",
        };
        struct timers_private *priv = m->private;
        struct k_itimer *tmr = list_entry((struct list_head *)v,
                                          struct k_itimer, list);
        int how = tmr->it_sigev_notify;
        const char *target = (how & SIGEV_THREAD_ID) ? "tid" : "pid";

        seq_printf(m, "ID: %d\n", tmr->it_id);
        seq_printf(m, "signal: %d/%px\n",
                   tmr->sigq->info.si_signo,
                   tmr->sigq->info.si_value.sival_ptr);
        seq_printf(m, "notify: %s/%s.%d\n",
                   notify_names[how & ~SIGEV_THREAD_ID],
                   target,
                   pid_nr_ns(tmr->it_pid, priv->ns));
        seq_printf(m, "ClockID: %d\n", tmr->it_clock);

        return 0;
}

/* seq_file iterator used by proc_timers_open() to list a task's posix timers. */
static const struct seq_operations proc_timers_seq_ops = {
        .start        = timers_start,
        .next        = timers_next,
        .stop        = timers_stop,
        .show        = show_timer,
};

/*
 * ->open() for the "timers" file: allocate the seq_file private area and
 * record the pid and pid namespace the iterator will work with.
 *
 * Returns 0, or -ENOMEM when __seq_open_private() fails.
 */
static int proc_timers_open(struct inode *inode, struct file *file)
{
        struct timers_private *priv;

        priv = __seq_open_private(file, &proc_timers_seq_ops, sizeof(*priv));
        if (!priv)
                return -ENOMEM;

        priv->ns = proc_pid_ns(inode->i_sb);
        priv->pid = proc_pid(inode);
        return 0;
}

/* File operations of the "timers" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_timers_operations = {
        .open                = proc_timers_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = seq_release_private,
};
#endif

/*
 * ->write() for "timerslack_ns": parse a decimal u64 and store it as the
 * task's timer slack.  Writing 0 restores the task's default slack.
 *
 * Changing another task's value requires CAP_SYS_NICE in that task's user
 * namespace and must pass security_task_setscheduler().
 *
 * Returns @count on success, the parse error, -ESRCH when the task is
 * gone, -EPERM without the capability, or the security hook's error.
 */
static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *offset)
{
        struct task_struct *tsk;
        ssize_t ret;
        u64 slack;

        ret = kstrtoull_from_user(buf, count, 10, &slack);
        if (ret < 0)
                return ret;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        if (tsk != current) {
                int allowed;

                rcu_read_lock();
                allowed = ns_capable(__task_cred(tsk)->user_ns, CAP_SYS_NICE);
                rcu_read_unlock();
                if (!allowed) {
                        ret = -EPERM;
                        goto put;
                }

                ret = security_task_setscheduler(tsk);
                if (ret)
                        goto put;
        }

        task_lock(tsk);
        tsk->timer_slack_ns = slack ? slack : tsk->default_timer_slack_ns;
        task_unlock(tsk);
        ret = count;

put:
        put_task_struct(tsk);
        return ret;
}

/*
 * seq_file show routine for "timerslack_ns": print the task's timer slack
 * as "%llu\n".  m->private is the inode handed over by timerslack_ns_open().
 *
 * Reading another task's value requires CAP_SYS_NICE in that task's user
 * namespace and must pass security_task_getscheduler().
 *
 * Returns 0, -ESRCH when the task is gone, -EPERM without the capability,
 * or the security hook's error.
 */
static int timerslack_ns_show(struct seq_file *m, void *v)
{
        struct inode *proc_inode = m->private;
        struct task_struct *tsk;
        int ret = 0;

        tsk = get_proc_task(proc_inode);
        if (!tsk)
                return -ESRCH;

        if (tsk != current) {
                int allowed;

                rcu_read_lock();
                allowed = ns_capable(__task_cred(tsk)->user_ns, CAP_SYS_NICE);
                rcu_read_unlock();
                if (!allowed) {
                        ret = -EPERM;
                        goto put;
                }

                ret = security_task_getscheduler(tsk);
                if (ret)
                        goto put;
        }

        task_lock(tsk);
        seq_printf(m, "%llu\n", tsk->timer_slack_ns);
        task_unlock(tsk);

put:
        put_task_struct(tsk);
        return ret;
}

/* ->open() for "timerslack_ns": the inode becomes the show routine's private data. */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
        void *show_arg = inode;

        return single_open(filp, timerslack_ns_show, show_arg);
}

/* File operations of the "timerslack_ns" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_pid_set_timerslack_ns_operations = {
        .open                = timerslack_ns_open,
        .read                = seq_read,
        .write                = timerslack_ns_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

static struct dentry *proc_pident_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        const struct pid_entry *p = ptr;
        struct inode *inode;
        struct proc_inode *ei;

        inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        if (S_ISDIR(inode->i_mode))
                set_nlink(inode, 2);        /* Use getattr to fix if necessary */
        if (p->iop)
                inode->i_op = p->iop;
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
        pid_update_inode(task, inode);
        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Look @dentry's name up in the pid_entry table [@p, @end) and instantiate
 * the first entry whose length and bytes match.
 *
 * Returns the instantiated dentry, or ERR_PTR(-ENOENT) when the task is
 * gone or no entry matches.
 */
static struct dentry *proc_pident_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         const struct pid_entry *p,
                                         const struct pid_entry *end)
{
        struct dentry *found = ERR_PTR(-ENOENT);
        struct task_struct *tsk = get_proc_task(dir);

        if (!tsk)
                return found;

        /*
         * Yes, it does not scale. And it should not. Don't add
         * new entries into /proc/<tgid>/ without very good reasons.
         */
        while (p < end) {
                if (p->len == dentry->d_name.len &&
                    memcmp(dentry->d_name.name, p->name, p->len) == 0) {
                        found = proc_pident_instantiate(dentry, tsk, p);
                        break;
                }
                p++;
        }

        put_task_struct(tsk);
        return found;
}

/*
 * Emit the dot entries followed by the @nents entries of the table @ents,
 * resuming from ctx->pos (table index = ctx->pos - 2).
 *
 * Returns -ENOENT when the task is gone and 0 otherwise, including when
 * the fill callback stops the iteration early.
 */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
                const struct pid_entry *ents, unsigned int nents)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        const struct pid_entry *ent;

        if (!tsk)
                return -ENOENT;

        if (dir_emit_dots(file, ctx) && ctx->pos < nents + 2) {
                const struct pid_entry *last = ents + nents;

                for (ent = ents + (ctx->pos - 2); ent < last; ent++) {
                        if (!proc_fill_cache(file, ctx, ent->name, ent->len,
                                        proc_pident_instantiate, tsk, ent))
                                break;
                        ctx->pos++;
                }
        }

        put_task_struct(tsk);
        return 0;
}

#ifdef CONFIG_SECURITY
/*
 * ->open() for the attr files.  Always succeeds: the return value of
 * __mem_open() is ignored.
 *
 * NOTE(review): presumably __mem_open() records the opener's mm in
 * file->private_data; proc_pid_attr_write() compares that field against
 * current->mm.  Confirm against __mem_open().
 */
static int proc_pid_attr_open(struct inode *inode, struct file *file)
{
        /* Start from NULL so that private_data is well defined either way. */
        file->private_data = NULL;
        __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
        return 0;
}

/*
 * ->read() for the attr files: fetch the attribute named after the dentry
 * through security_getprocattr() and copy it to user space.
 *
 * Returns the number of bytes copied, the non-positive result of
 * security_getprocattr(), or -ESRCH when the task is gone.  The buffer
 * handed back by the hook is always kfree()d.
 */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *tsk = get_proc_task(inode);
        char *value = NULL;
        ssize_t len;

        if (!tsk)
                return -ESRCH;

        len = security_getprocattr(tsk, PROC_I(inode)->op.lsm,
                                   (char*)file->f_path.dentry->d_name.name,
                                   &value);
        put_task_struct(tsk);

        if (len > 0)
                len = simple_read_from_buffer(buf, count, ppos, value, len);

        kfree(value);
        return len;
}

/*
 * ->write() for the attr files: hand at most PAGE_SIZE bytes of user data
 * to security_setprocattr() for the attribute named after the dentry.
 *
 * Fails with -EPERM when the writer's mm differs from file->private_data,
 * -ESRCH when the task is gone, -EACCES when writing another task's
 * attributes, -EBUSY while credentials are overridden and -EINVAL for a
 * non-zero file position.  Otherwise returns the memdup_user() error, the
 * mutex_lock_interruptible() error or the result of security_setprocattr().
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *target;
        void *data;
        int rv;

        /* A task may only write when it was the opener. */
        if (file->private_data != current->mm)
                return -EPERM;

        rcu_read_lock();
        target = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (!target)
                rv = -ESRCH;
        else if (target != current)
                rv = -EACCES;        /* A task may only write its own attributes. */
        else if (current_cred() != current_real_cred())
                rv = -EBUSY;        /* Prevent changes to overridden credentials. */
        else
                rv = 0;
        rcu_read_unlock();
        if (rv)
                return rv;

        if (count > PAGE_SIZE)
                count = PAGE_SIZE;

        /* No partial writes. */
        if (*ppos != 0)
                return -EINVAL;

        data = memdup_user(buf, count);
        if (IS_ERR(data)) {
                rv = PTR_ERR(data);
                return rv;
        }

        /* Guard against adverse ptrace interaction */
        rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
        if (rv >= 0) {
                rv = security_setprocattr(PROC_I(inode)->op.lsm,
                                          file->f_path.dentry->d_name.name,
                                          data, count);
                mutex_unlock(&current->signal->cred_guard_mutex);
        }

        kfree(data);
        return rv;
}

/* File operations shared by the attribute files under the attr directory. */
static const struct file_operations proc_pid_attr_operations = {
        .open                = proc_pid_attr_open,
        .read                = proc_pid_attr_read,
        .write                = proc_pid_attr_write,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/*
 * LSM_DIR_OPS(LSM) - generate the directory plumbing for an LSM-specific
 * sub-directory of the attr directory.
 *
 * Expects a table named <LSM>_attr_dir_stuff[] to be in scope and emits:
 *   - proc_<LSM>_attr_dir_iterate():   readdir over that table
 *   - proc_<LSM>_attr_dir_ops:         file operations using it
 *   - proc_<LSM>_attr_dir_lookup():    lookup over that table
 *   - proc_<LSM>_attr_dir_inode_ops:   inode operations using it
 *
 * The iterator is installed as ->iterate_shared, like every other
 * proc_pident_readdir() user in this file (proc_attr_dir_operations,
 * proc_tgid_base_operations), so that these directories do not need the
 * exclusive directory lock for readdir.
 *
 * The expansion ends without a semicolon; the invocation supplies it.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
                             struct dir_context *ctx) \
{ \
        return proc_pident_readdir(filp, ctx, \
                                   LSM##_attr_dir_stuff, \
                                   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
        .read                = generic_read_dir, \
        .iterate_shared        = proc_##LSM##_attr_dir_iterate, \
        .llseek                = default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
                                struct dentry *dentry, unsigned int flags) \
{ \
        return proc_pident_lookup(dir, dentry, \
                                  LSM##_attr_dir_stuff, \
                                  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
        .lookup                = proc_##LSM##_attr_dir_lookup, \
        .getattr        = pid_getattr, \
        .setattr        = proc_setattr, \
}

#ifdef CONFIG_SECURITY_SMACK
/* Entries of the "smack" attr sub-directory, plus its generated directory ops. */
static const struct pid_entry smack_attr_dir_stuff[] = {
        ATTR("smack", "current",        0666),
};
LSM_DIR_OPS(smack);
#endif

#ifdef CONFIG_SECURITY_APPARMOR
/* Entries of the "apparmor" attr sub-directory, plus its generated directory ops. */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
        ATTR("apparmor", "current",        0666),
        ATTR("apparmor", "prev",        0444),
        ATTR("apparmor", "exec",        0666),
};
LSM_DIR_OPS(apparmor);
#endif

/*
 * Contents of the "attr" directory: the attribute files registered with a
 * NULL LSM name, followed by one sub-directory per LSM that is configured
 * in (smack, apparmor).
 */
static const struct pid_entry attr_dir_stuff[] = {
        ATTR(NULL, "current",                0666),
        ATTR(NULL, "prev",                0444),
        ATTR(NULL, "exec",                0666),
        ATTR(NULL, "fscreate",                0666),
        ATTR(NULL, "keycreate",                0666),
        ATTR(NULL, "sockcreate",        0666),
#ifdef CONFIG_SECURITY_SMACK
        DIR("smack",                        0555,
            proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
        DIR("apparmor",                        0555,
            proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};

/* Directory iterator of the "attr" directory: walks attr_dir_stuff[]. */
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nr_entries = ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_readdir(file, ctx, attr_dir_stuff, nr_entries);
}

/* File operations of the "attr" directory listed in tgid_base_stuff[]. */
static const struct file_operations proc_attr_dir_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_attr_dir_readdir,
        .llseek                = generic_file_llseek,
};

/* ->lookup() of the "attr" directory: searches attr_dir_stuff[]. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
                                struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = attr_dir_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations of the "attr" directory listed in tgid_base_stuff[]. */
static const struct inode_operations proc_attr_dir_inode_operations = {
        .lookup                = proc_attr_dir_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
};

#endif

#ifdef CONFIG_ELF_CORE
/*
 * ->read() for "coredump_filter": report the dump-filter bits of the
 * task's mm flags as "%08lx\n".
 *
 * Returns the number of bytes copied, 0 when the task has no mm, or
 * -ESRCH when the task is gone.
 */
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
                                         size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        struct mm_struct *mm;
        unsigned long filter;
        size_t len;
        int ret = 0;

        if (!tsk)
                return -ESRCH;

        mm = get_task_mm(tsk);
        if (mm) {
                filter = (mm->flags & MMF_DUMP_FILTER_MASK) >>
                         MMF_DUMP_FILTER_SHIFT;
                mmput(mm);

                len = snprintf(text, sizeof(text), "%08lx\n", filter);
                ret = simple_read_from_buffer(buf, count, ppos, text, len);
        }

        put_task_struct(tsk);
        return ret;
}

/*
 * ->write() for "coredump_filter": parse an unsigned int (base detected
 * by kstrtouint_from_user()) and mirror its low MMF_DUMP_FILTER_BITS bits
 * into the dump-filter bits of the task's mm flags.
 *
 * Returns @count on success, the parse error, or -ESRCH when the task or
 * its mm is gone.
 */
static ssize_t proc_coredump_filter_write(struct file *file,
                                          const char __user *buf,
                                          size_t count,
                                          loff_t *ppos)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        unsigned int val;
        int bit;
        int ret;

        ret = kstrtouint_from_user(buf, count, 0, &val);
        if (ret < 0)
                return ret;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        mm = get_task_mm(tsk);
        if (!mm) {
                put_task_struct(tsk);
                return -ESRCH;
        }

        /* Each filter bit is set or cleared individually. */
        for (bit = 0; bit < MMF_DUMP_FILTER_BITS; bit++) {
                if (val & (1UL << bit))
                        set_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
                else
                        clear_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
        }

        mmput(mm);
        put_task_struct(tsk);
        return count;
}

/* File operations of the "coredump_filter" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_coredump_filter_operations = {
        .read                = proc_coredump_filter_read,
        .write                = proc_coredump_filter_write,
        .llseek                = generic_file_llseek,
};
#endif

#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
        struct task_io_accounting acct = task->ioac;
        unsigned long flags;
        int result;

        result = down_read_killable(&task->signal->exec_update_lock);
        if (result)
                return result;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                result = -EACCES;
                goto out_unlock;
        }

        if (whole && lock_task_sighand(task, &flags)) {
                struct task_struct *t = task;

                task_io_accounting_add(&acct, &task->signal->ioac);
                while_each_thread(task, t)
                        task_io_accounting_add(&acct, &t->ioac);

                unlock_task_sighand(task, &flags);
        }
        seq_printf(m,
                   "rchar: %llu\n"
                   "wchar: %llu\n"
                   "syscr: %llu\n"
                   "syscw: %llu\n"
                   "read_bytes: %llu\n"
                   "write_bytes: %llu\n"
                   "cancelled_write_bytes: %llu\n",
                   (unsigned long long)acct.rchar,
                   (unsigned long long)acct.wchar,
                   (unsigned long long)acct.syscr,
                   (unsigned long long)acct.syscw,
                   (unsigned long long)acct.read_bytes,
                   (unsigned long long)acct.write_bytes,
                   (unsigned long long)acct.cancelled_write_bytes);
        result = 0;

out_unlock:
        up_read(&task->signal->exec_update_lock);
        return result;
}

/* I/O accounting show routine that reports @task's own counters only. */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                  struct pid *pid, struct task_struct *task)
{
        const int whole = 0;

        return do_io_accounting(task, m, whole);
}

/* "io" show routine that asks do_io_accounting() for thread-group totals. */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                   struct pid *pid, struct task_struct *task)
{
        const int whole = 1;

        return do_io_accounting(task, m, whole);
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */

#ifdef CONFIG_USER_NS
/*
 * Common ->open() for the id map files: pin the user namespace of the
 * task's credentials and make it the private data of a seq_file driven by
 * @seq_ops.  proc_id_map_release() drops the namespace reference again.
 *
 * Returns 0, -EINVAL when the task is gone or no namespace was obtained,
 * or the seq_open() error.
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
        const struct seq_operations *seq_ops)
{
        struct task_struct *tsk = get_proc_task(inode);
        struct user_namespace *ns;
        struct seq_file *seq;
        int ret;

        if (!tsk)
                return -EINVAL;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(tsk, user_ns));
        rcu_read_unlock();
        put_task_struct(tsk);

        if (!ns)
                return -EINVAL;

        ret = seq_open(file, seq_ops);
        if (ret) {
                put_user_ns(ns);
                return ret;
        }

        seq = file->private_data;
        seq->private = ns;
        return 0;
}

/* ->release() for the id map files: drop the namespace pinned at open time. */
static int proc_id_map_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *pinned = seq->private;

        put_user_ns(pinned);

        return seq_release(inode, file);
}

/* ->open() for "uid_map": proc_id_map_open() with the uid seq operations. */
static int proc_uid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_uid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* ->open() for "gid_map": proc_id_map_open() with the gid seq operations. */
static int proc_gid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_gid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* ->open() for "projid_map": proc_id_map_open() with the projid seq operations. */
static int proc_projid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_projid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* File operations of the "uid_map" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_uid_map_operations = {
        .open                = proc_uid_map_open,
        .write                = proc_uid_map_write,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = proc_id_map_release,
};

/* File operations of the "gid_map" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_gid_map_operations = {
        .open                = proc_gid_map_open,
        .write                = proc_gid_map_write,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = proc_id_map_release,
};

/* File operations of the "projid_map" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_projid_map_operations = {
        .open                = proc_projid_map_open,
        .write                = proc_projid_map_write,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = proc_id_map_release,
};

/*
 * ->open() for "setgroups": pin the user namespace of the task's
 * credentials and hand it to proc_setgroups_show() through single_open().
 * Opening for write additionally requires CAP_SYS_ADMIN in that namespace.
 *
 * Returns 0, -ESRCH when the task is gone or no namespace was obtained,
 * -EACCES when the capability check fails, or the single_open() error.
 */
static int proc_setgroups_open(struct inode *inode, struct file *file)
{
        struct task_struct *tsk = get_proc_task(inode);
        struct user_namespace *ns;
        int ret;

        if (!tsk)
                return -ESRCH;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(tsk, user_ns));
        rcu_read_unlock();
        put_task_struct(tsk);

        if (!ns)
                return -ESRCH;

        if ((file->f_mode & FMODE_WRITE) && !ns_capable(ns, CAP_SYS_ADMIN)) {
                ret = -EACCES;
                goto put_ns;
        }

        ret = single_open(file, &proc_setgroups_show, ns);
        if (!ret)
                return 0;

put_ns:
        put_user_ns(ns);
        return ret;
}

/*
 * ->release() for "setgroups".  The namespace pointer is fetched from the
 * seq_file before single_release() is called and the reference is dropped
 * afterwards; the seq_file is not touched once single_release() has run.
 */
static int proc_setgroups_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *pinned = seq->private;
        int ret;

        ret = single_release(inode, file);
        put_user_ns(pinned);

        return ret;
}

/* File operations of the "setgroups" entry listed in tgid_base_stuff[]. */
static const struct file_operations proc_setgroups_operations = {
        .open                = proc_setgroups_open,
        .write                = proc_setgroups_write,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = proc_setgroups_release,
};
#endif /* CONFIG_USER_NS */

/*
 * "personality" show routine: print task->personality as "%08x\n" while
 * holding the trace lock.  Returns 0 or the lock_trace() error.
 */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        int err = lock_trace(task);

        if (err)
                return err;

        seq_printf(m, "%08x\n", task->personality);
        unlock_trace(task);
        return 0;
}

#ifdef CONFIG_LIVEPATCH
/* "patch_state" show routine: print task->patch_state as "%d\n". */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        int state = task->patch_state;

        seq_printf(m, "%d\n", state);

        return 0;
}
#endif /* CONFIG_LIVEPATCH */

#ifdef CONFIG_STACKLEAK_METRICS
/*
 * "stack_depth" show routine.  Each depth is THREAD_SIZE minus the offset
 * of the recorded lowest stack address within its THREAD_SIZE-sized block;
 * the previous and the current value are printed on separate lines.
 */
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        const unsigned long offset_mask = THREAD_SIZE - 1;
        unsigned long prev, cur;

        prev = THREAD_SIZE - (task->prev_lowest_stack & offset_mask);
        cur = THREAD_SIZE - (task->lowest_stack & offset_mask);

        seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
                   prev, cur);
        return 0;
}
#endif /* CONFIG_STACKLEAK_METRICS */

/*
 * Thread groups
 */
/*
 * Forward declarations: the "task" entry of tgid_base_stuff[] refers to
 * these before their definitions, which are not part of this section.
 */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;

/*
 * Table of the entries found in a thread-group directory.  It is walked
 * linearly by proc_tgid_base_lookup() and, in this order, by
 * proc_tgid_base_readdir().  Entries guarded by #ifdef exist only when the
 * corresponding kernel option is configured.
 */
static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",          S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",    S_IRUSR, proc_environ_operations),
        REG("auxv",       S_IRUSR, proc_auxv_operations),
        ONE("status",     S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",          S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
        REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
#ifdef CONFIG_TIME_NS
        REG("timens_offsets",  S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
#endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
        REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
        REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",        proc_cwd_link),
        LNK("root",       proc_root_link),
        LNK("exe",        proc_exe_link),
        REG("mounts",     S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
        REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score",  S_IRUGO, proc_oom_score),
        REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        ONE("io",        S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
        REG("timers",          S_IRUGO, proc_timers_operations),
#endif
        REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_STACKLEAK_METRICS
        ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
};

/* Directory iterator of a thread-group directory: walks tgid_base_stuff[]. */
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nr_entries = ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_readdir(file, ctx, tgid_base_stuff, nr_entries);
}

/*
 * File operations of a thread-group directory.  tgid_pidfd_to_pid() also
 * uses the address of this table to recognise such directories.
 */
static const struct file_operations proc_tgid_base_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_tgid_base_readdir,
        .llseek                = generic_file_llseek,
};

/*
 * Map an open thread-group directory to its struct pid.
 *
 * Returns proc_pid() of the file's inode when @file uses
 * proc_tgid_base_operations, and ERR_PTR(-EBADF) for any other file.
 */
struct pid *tgid_pidfd_to_pid(const struct file *file)
{
        if (file->f_op == &proc_tgid_base_operations)
                return proc_pid(file_inode(file));

        return ERR_PTR(-EBADF);
}

/*
 * Lookup callback for a tgid base directory: passes the [first, end) range
 * of tgid_base_stuff to proc_pident_lookup().
 */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tgid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations installed on tgid base directory inodes. */
static const struct inode_operations proc_tgid_base_inode_operations = {
        .permission        = proc_pid_permission,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
        .lookup                = proc_tgid_base_lookup,
};

/**
 * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
 * @pid: pid that should be flushed.
 *
 * This function walks a list of inodes (that belong to any proc
 * filesystem) that are attached to the pid and flushes them from
 * the dentry cache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries when a process is reaped.
 *
 * NOTE: This routine is just an optimization so it does not guarantee
 *       that no dcache entries will exist after a process is reaped
 *       it just makes it very unlikely that any will persist.
 */

void proc_flush_pid(struct pid *pid)
{
        proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}

static struct dentry *proc_pid_instantiate(struct dentry * dentry,
                                   struct task_struct *task, const void *ptr)
{
        struct inode *inode;

        inode = proc_pid_make_base_inode(dentry->d_sb, task,
                                         S_IFDIR | S_IRUGO | S_IXUGO);
        if (!inode)
                return ERR_PTR(-ENOENT);

        inode->i_op = &proc_tgid_base_inode_operations;
        inode->i_fop = &proc_tgid_base_operations;
        inode->i_flags|=S_IMMUTABLE;

        set_nlink(inode, nlink_tgid);
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Resolve a numeric name to a tgid base directory.
 *
 * The dentry name is parsed with name_to_int(); the task is looked up in
 * the pid namespace of this proc mount and pinned for the duration of the
 * call.  Returns ERR_PTR(-ENOENT) when the name is not a number, no such
 * task exists, or the HIDEPID_NOT_PTRACEABLE policy denies access.
 */
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        struct task_struct *tsk;
        struct dentry *res;
        unsigned int tgid = name_to_int(&dentry->d_name);

        if (tgid == ~0U)
                return ERR_PTR(-ENOENT);

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;

        /* Pin the task before leaving the RCU read-side section. */
        rcu_read_lock();
        tsk = find_task_by_pid_ns(tgid, ns);
        if (tsk)
                get_task_struct(tsk);
        rcu_read_unlock();

        if (!tsk)
                return ERR_PTR(-ENOENT);

        /* Limit procfs to only ptraceable tasks */
        if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE &&
            !has_pid_permissions(fs_info, tsk, HIDEPID_NO_ACCESS))
                res = ERR_PTR(-ENOENT);
        else
                res = proc_pid_instantiate(dentry, tsk, NULL);

        put_task_struct(tsk);
        return res;
}

/*
 * Cursor for walking tasks by tgid: @tgid is the number to search from and
 * @task is the task found for it (NULL when the walk is exhausted).
 */
struct tgid_iter {
        unsigned int tgid;
        struct task_struct *task;
};

/*
 * Find the first task with tgid >= iter.tgid.
 *
 * The reference held on the incoming iter.task (if any) is dropped.  The
 * returned iterator carries a new reference on iter.task, or a NULL
 * iter.task when find_ge_pid() finds nothing more.
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
        struct pid *found;

        if (iter.task)
                put_task_struct(iter.task);

        rcu_read_lock();
        for (;;) {
                iter.task = NULL;
                found = find_ge_pid(iter.tgid, ns);
                if (!found)
                        break;

                iter.tgid = pid_nr_ns(found, ns);
                iter.task = pid_task(found, PIDTYPE_TGID);
                if (iter.task) {
                        get_task_struct(iter.task);
                        break;
                }

                /* No task attached as PIDTYPE_TGID: try the next number. */
                iter.tgid += 1;
        }
        rcu_read_unlock();

        return iter;
}

/*
 * Directory position of tgid 0.  The two positions just below it are used
 * by proc_pid_readdir() for the "self" and "thread-self" entries.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)

/*
 * for the /proc/ directory itself, after non-process stuff has been done
 *
 * Emits "self" at position TGID_OFFSET - 2, "thread-self" at position
 * TGID_OFFSET - 1, and then one entry per task returned by next_tgid(),
 * where position TGID_OFFSET + N corresponds to tgid N.  Always returns 0;
 * progress is reported through ctx->pos.
 */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
        struct tgid_iter iter;
        struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
        struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
        loff_t pos = ctx->pos;

        /* Already past the last possible tgid: nothing left to emit. */
        if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
                return 0;

        if (pos == TGID_OFFSET - 2) {
                struct inode *inode = d_inode(fs_info->proc_self);
                if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
                        return 0;
                ctx->pos = pos = pos + 1;
        }
        if (pos == TGID_OFFSET - 1) {
                struct inode *inode = d_inode(fs_info->proc_thread_self);
                if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
                        return 0;
                ctx->pos = pos = pos + 1;
        }
        /* Translate the directory position back into a starting tgid. */
        iter.tgid = pos - TGID_OFFSET;
        iter.task = NULL;
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
                char name[10 + 1];
                unsigned int len;

                cond_resched();
                /* Tasks the hidepid policy makes invisible are not listed. */
                if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
                        continue;

                len = snprintf(name, sizeof(name), "%u", iter.tgid);
                ctx->pos = iter.tgid + TGID_OFFSET;
                if (!proc_fill_cache(file, ctx, name, len,
                                     proc_pid_instantiate, iter.task, NULL)) {
                        /*
                         * Entry not emitted: drop our reference here since
                         * next_tgid() will not run again.  ctx->pos still
                         * points at this tgid.
                         */
                        put_task_struct(iter.task);
                        return 0;
                }
        }
        /* Walk finished: park the position at the end-of-directory value. */
        ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
        return 0;
}

/*
 * proc_tid_comm_permission is the permission hook used only for the node
 * /proc/<pid>/task/<tid>/comm.
 *
 * When the caller belongs to the same thread group as the target task, the
 * generic permission checks are bypassed for everything except MAY_EXEC.
 * The reason is that glibc and bionic use this node for cross-thread
 * naming (pthread_set/getname_np(!self)).  Once PR_SET_DUMPABLE is set to
 * 0 this node, among others, becomes uid=0 gid=0, which would lock that
 * implementation out.  This hook keeps the node accessible to members of
 * the same thread group.
 *
 * Returns -ESRCH when the task is gone, 0 when access is granted by the
 * same-thread-group rule, otherwise the result of generic_permission().
 */
static int proc_tid_comm_permission(struct inode *inode, int mask)
{
        struct task_struct *tsk = get_proc_task(inode);
        bool same_group;

        if (!tsk)
                return -ESRCH;

        same_group = same_thread_group(current, tsk);
        put_task_struct(tsk);

        if (unlikely(!same_group || (mask & MAY_EXEC)))
                return generic_permission(inode, mask);

        /*
         * This file (/proc/<pid>/task/<tid>/comm) can always be read or
         * written by the members of the corresponding thread group.
         */
        return 0;
}

/* Inode operations for the per-thread "comm" node in tid_base_stuff. */
static const struct inode_operations proc_tid_comm_inode_operations = {
        .permission        = proc_tid_comm_permission,
        .setattr        = proc_setattr,
};

/*
 * Tasks
 *
 * Table of the entries found in a per-thread base directory.  It is the
 * table that proc_tid_base_readdir() and proc_tid_base_lookup() pass to
 * the proc_pident_*() helpers, and set_proc_pid_nlink() derives nlink_tid
 * from it.  Entries inside #ifdef blocks exist only when the matching
 * config option is enabled.
 */
static const struct pid_entry tid_base_stuff[] = {
        DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",   S_IRUSR, proc_environ_operations),
        REG("auxv",      S_IRUSR, proc_auxv_operations),
        ONE("status",    S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",         S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
        /* "comm" gets its own inode operations so that
         * proc_tid_comm_permission() is used for it. */
        NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
                         &proc_tid_comm_inode_operations,
                         &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",   S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
        REG("maps",      S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
        REG("children",  S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
        REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",       proc_cwd_link),
        LNK("root",      proc_root_link),
        LNK("exe",       proc_exe_link),
        REG("mounts",    S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",     S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",    S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score", S_IRUGO, proc_oom_score),
        REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        ONE("io",        S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
};

/*
 * Directory iteration callback for a per-thread base directory: hands the
 * tid_base_stuff table and its element count to proc_pident_readdir().
 */
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nr_entries = ARRAY_SIZE(tid_base_stuff);

        return proc_pident_readdir(file, ctx, tid_base_stuff, nr_entries);
}

/*
 * Lookup callback for a per-thread base directory: passes the [first, end)
 * range of tid_base_stuff to proc_pident_lookup().
 */
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* File operations installed on per-thread base directory inodes. */
static const struct file_operations proc_tid_base_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = proc_tid_base_readdir,
};

/* Inode operations installed on per-thread base directory inodes. */
static const struct inode_operations proc_tid_base_inode_operations = {
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
        .lookup                = proc_tid_base_lookup,
};

static struct dentry *proc_task_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        struct inode *inode;
        inode = proc_pid_make_base_inode(dentry->d_sb, task,
                                         S_IFDIR | S_IRUGO | S_IXUGO);
        if (!inode)
                return ERR_PTR(-ENOENT);

        inode->i_op = &proc_tid_base_inode_operations;
        inode->i_fop = &proc_tid_base_operations;
        inode->i_flags |= S_IMMUTABLE;

        set_nlink(inode, nlink_tid);
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Resolve a numeric name inside a task directory to a per-thread base
 * directory.
 *
 * @dir identifies the task that owns the directory.  The looked-up tid must
 * name an existing task in this proc mount's pid namespace that is in the
 * same thread group as that owner; otherwise ERR_PTR(-ENOENT) is returned.
 * All task references taken here are dropped before returning.
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
        struct dentry *res = ERR_PTR(-ENOENT);
        struct task_struct *leader, *thread;
        struct pid_namespace *ns;
        unsigned int tid;

        leader = get_proc_task(dir);
        if (!leader)
                return res;

        tid = name_to_int(&dentry->d_name);
        if (tid == ~0U)
                goto put_leader;

        ns = proc_sb_info(dentry->d_sb)->pid_ns;

        /* Pin the thread before leaving the RCU read-side section. */
        rcu_read_lock();
        thread = find_task_by_pid_ns(tid, ns);
        if (thread)
                get_task_struct(thread);
        rcu_read_unlock();

        if (!thread)
                goto put_leader;

        if (same_thread_group(leader, thread))
                res = proc_task_instantiate(dentry, thread, NULL);

        put_task_struct(thread);
put_leader:
        put_task_struct(leader);
        return res;
}

/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the user's
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read we start with find_task_by_pid_ns().
 *
 * In the case of a seek we start with the leader and walk nr
 * threads past it.
 *
 * @pid:   pid of the task that owns the directory
 * @tid:   tid remembered from a previous short read, or 0
 * @f_pos: number of threads to skip from the start of the group
 * @ns:    pid namespace used to resolve @tid
 *
 * Returns the task to start from with a reference taken via
 * get_task_struct(), or NULL if there is nothing to return.
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
                                        struct pid_namespace *ns)
{
        struct task_struct *pos, *task;
        unsigned long nr = f_pos;

        if (nr != f_pos)        /* 32bit overflow? */
                return NULL;

        rcu_read_lock();
        task = pid_task(pid, PIDTYPE_PID);
        if (!task)
                goto fail;

        /* Attempt to start with the tid of a thread */
        if (tid && nr) {
                pos = find_task_by_pid_ns(tid, ns);
                /* Only accept it if it is in the same thread group as task. */
                if (pos && same_thread_group(pos, task))
                        goto found;
        }

        /* If nr exceeds the number of threads there is nothing to do */
        if (nr >= get_nr_threads(task))
                goto fail;

        /* If we haven't found our starting place yet start
         * with the leader and walk nr threads forward.
         */
        pos = task = task->group_leader;
        do {
                if (!nr--)
                        goto found;
        } while_each_thread(task, pos);
fail:
        pos = NULL;
        goto out;
found:
        /* Take the reference while still inside the RCU section. */
        get_task_struct(pos);
out:
        rcu_read_unlock();
        return pos;
}

/*
 * Advance to the thread that follows @start in its thread list.
 *
 * Returns that thread with a reference taken, or NULL when @start is no
 * longer alive or the next thread is the thread group leader.
 *
 * The reference to the input task_struct is always released.
 */
static struct task_struct *next_tid(struct task_struct *start)
{
        struct task_struct *next = NULL;

        rcu_read_lock();
        if (pid_alive(start)) {
                struct task_struct *candidate = next_thread(start);

                if (!thread_group_leader(candidate)) {
                        get_task_struct(candidate);
                        next = candidate;
                }
        }
        rcu_read_unlock();

        put_task_struct(start);
        return next;
}

/*
 * for the /proc/TGID/task/ directories
 *
 * Emits "." and ".." and then one entry per thread, starting from the
 * thread chosen by first_tid() and advancing with next_tid().  ctx->pos - 2
 * is the number of threads already passed.
 *
 * Returns -ENOENT if the directory inode is dead, 0 otherwise; progress is
 * reported through ctx->pos and file->f_version.
 */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task;
        struct pid_namespace *ns;
        int tid;

        if (proc_inode_is_dead(inode))
                return -ENOENT;

        if (!dir_emit_dots(file, ctx))
                return 0;

        /* f_version caches the tid value that the last readdir call couldn't
         * return. lseek aka telldir automagically resets f_version to 0.
         */
        ns = proc_pid_ns(inode->i_sb);
        tid = (int)file->f_version;
        file->f_version = 0;
        for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
             task;
             task = next_tid(task), ctx->pos++) {
                char name[10 + 1];
                unsigned int len;

                tid = task_pid_nr_ns(task, ns);
                /*
                 * A thread that exits concurrently no longer has a pid
                 * number in @ns, so task_pid_nr_ns() yields 0.  Skip it
                 * rather than list a bogus "0" entry; the loop increment
                 * still runs next_tid(), which drops our reference.
                 */
                if (!tid)
                        continue;
                len = snprintf(name, sizeof(name), "%u", tid);
                if (!proc_fill_cache(file, ctx, name, len,
                                proc_task_instantiate, task, NULL)) {
                        /* returning this tid failed, save it as the first
                         * tid for the next readdir call */
                        file->f_version = (u64)tid;
                        put_task_struct(task);
                        break;
                }
        }

        return 0;
}

/*
 * getattr for a task directory: fills @stat generically, then, while the
 * owning task still exists, adds its current thread count to the link
 * count.  Always returns 0.
 */
static int proc_task_getattr(const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *dir = d_inode(path->dentry);
        struct task_struct *tsk;

        tsk = get_proc_task(dir);
        generic_fillattr(dir, stat);
        if (!tsk)
                return 0;

        stat->nlink += get_nr_threads(tsk);
        put_task_struct(tsk);
        return 0;
}

/* Inode operations for the task directory (see proc_task_lookup()). */
static const struct inode_operations proc_task_inode_operations = {
        .permission        = proc_pid_permission,
        .getattr        = proc_task_getattr,
        .setattr        = proc_setattr,
        .lookup                = proc_task_lookup,
};

/* File operations for the task directory (see proc_task_readdir()). */
static const struct file_operations proc_task_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = proc_task_readdir,
};

/*
 * Compute, once at init time, the link counts that proc_pid_instantiate()
 * and proc_task_instantiate() assign to their directory inodes.
 */
void __init set_proc_pid_nlink(void)
{
        const unsigned int nr_tgid_entries = ARRAY_SIZE(tgid_base_stuff);
        const unsigned int nr_tid_entries = ARRAY_SIZE(tid_base_stuff);

        nlink_tgid = pid_entry_nlink(tgid_base_stuff, nr_tgid_entries);
        nlink_tid = pid_entry_nlink(tid_base_stuff, nr_tid_entries);
}









































































    2 
    2 
    2 



    2 

    2 



    2 





    2 



    2 








































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/dir.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/dir.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 directory handling functions
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *
 * Hash Tree Directory indexing (c) 2001  Daniel Phillips
 *
 */

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "xattr.h"

static int ext4_dx_readdir(struct file *, struct dir_context *);

/**
 * is_dx_dir() - check if a directory is using htree indexing
 * @inode: directory inode
 *
 * A directory counts as a dx dir when the filesystem has the dir_index
 * feature and the inode either carries the INDEX flag, is exactly one
 * block long, or holds inline data (the latter two could still be
 * converted to htree indexing).
 *
 * Return 1 if it is a dx dir, 0 if not
 */
static int is_dx_dir(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (!ext4_has_feature_dir_index(sb))
                return 0;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return 1;

        if ((inode->i_size >> sb->s_blocksize_bits) == 1)
                return 1;

        return ext4_has_inline_data(inode) ? 1 : 0;
}

/*
 * Sanity-check one directory entry.
 *
 * Return 0 if the directory entry is OK, and 1 if there is a problem
 * (the problem is reported through ext4_error_file() when @filp is set,
 * ext4_error_inode() otherwise).
 *
 * Note: this is the opposite of what ext2 and ext3 historically returned...
 *
 * bh passed here can be an inode block or a dir data block, depending
 * on the inode inline data flag.  @buf and @size delimit the area that
 * @de must fit into.
 */
int __ext4_check_dir_entry(const char *function, unsigned int line,
                           struct inode *dir, struct file *filp,
                           struct ext4_dir_entry_2 *de,
                           struct buffer_head *bh, char *buf, int size,
                           unsigned int offset)
{
        const int rec_len = ext4_rec_len_from_disk(de->rec_len,
                                                   dir->i_sb->s_blocksize);
        /* Offset, relative to buf, of the byte just past this entry. */
        const int entry_end = ((char *) de - buf) + rec_len;
        const char *problem;

        if (unlikely(rec_len < EXT4_DIR_REC_LEN(1))) {
                problem = "rec_len is smaller than minimal";
        } else if (unlikely(rec_len % 4 != 0)) {
                problem = "rec_len % 4 != 0";
        } else if (unlikely(rec_len < EXT4_DIR_REC_LEN(de->name_len))) {
                problem = "rec_len is too small for name_len";
        } else if (unlikely(entry_end > size)) {
                problem = "directory entry overrun";
        } else if (unlikely(entry_end > size - EXT4_DIR_REC_LEN(1) &&
                            entry_end != size)) {
                problem = "directory entry too close to block end";
        } else if (unlikely(le32_to_cpu(de->inode) >
                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) {
                problem = "inode out of bounds";
        } else if (unlikely(entry_end == size && de->name_len == 1 &&
                            de->name[0] == '.')) {
                problem = "'.' directory cannot be the last in data block";
        } else {
                return 0;
        }

        if (!filp)
                ext4_error_inode(dir, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u, "
                                "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                 problem, offset, le32_to_cpu(de->inode),
                                 rec_len, de->name_len, size);
        else
                ext4_error_file(filp, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u, "
                                "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                problem, offset, le32_to_cpu(de->inode),
                                rec_len, de->name_len, size);

        return 1;
}

/*
 * Emit directory entries starting at ctx->pos.
 *
 * Directories for which is_dx_dir() is true are handed to
 * ext4_dx_readdir(); only an ERR_BAD_DX_DIR result falls through to the
 * code below.  Inline-data directories are handed to
 * ext4_read_inline_dir().  Everything else is walked linearly, one
 * directory block at a time, with each entry validated by
 * ext4_check_dir_entry() before it is emitted.
 *
 * Returns 0 when the walk completes or stops because dir_emit() declined
 * an entry, and a negative errno on failure.
 */
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
        unsigned int offset;
        int i;
        struct ext4_dir_entry_2 *de;
        int err;
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bh = NULL;
        struct fscrypt_str fstr = FSTR_INIT(NULL, 0);

        if (IS_ENCRYPTED(inode)) {
                err = fscrypt_get_encryption_info(inode);
                if (err)
                        return err;
        }

        if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(file, ctx);
                if (err != ERR_BAD_DX_DIR) {
                        return err;
                }
                /* Can we just clear INDEX flag to ignore htree information? */
                if (!ext4_has_metadata_csum(sb)) {
                        /*
                         * We don't set the inode dirty flag since it's not
                         * critical that it gets flushed back to the disk.
                         */
                        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
                }
        }

        if (ext4_has_inline_data(inode)) {
                int has_inline_data = 1;
                err = ext4_read_inline_dir(file, ctx,
                                           &has_inline_data);
                /* The callee may clear has_inline_data; then fall through. */
                if (has_inline_data)
                        return err;
        }

        /* Scratch buffer that receives the user-visible form of each name. */
        if (IS_ENCRYPTED(inode)) {
                err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
                if (err < 0)
                        return err;
        }

        while (ctx->pos < inode->i_size) {
                struct ext4_map_blocks map;

                if (fatal_signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto errout;
                }
                cond_resched();
                /* Split the position into a block number and a byte offset. */
                offset = ctx->pos & (sb->s_blocksize - 1);
                map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                map.m_len = 1;
                err = ext4_map_blocks(NULL, inode, &map, 0);
                if (err == 0) {
                        /* m_len should never be zero but let's avoid
                         * an infinite loop if it somehow is */
                        if (map.m_len == 0)
                                map.m_len = 1;
                        ctx->pos += map.m_len * sb->s_blocksize;
                        continue;
                }
                if (err > 0) {
                        pgoff_t index = map.m_pblk >>
                                        (PAGE_SHIFT - inode->i_blkbits);
                        if (!ra_has_index(&file->f_ra, index))
                                page_cache_sync_readahead(
                                        sb->s_bdev->bd_inode->i_mapping,
                                        &file->f_ra, file,
                                        index, 1);
                        file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
                        bh = ext4_bread(NULL, inode, map.m_lblk, 0);
                        if (IS_ERR(bh)) {
                                err = PTR_ERR(bh);
                                bh = NULL;
                                goto errout;
                        }
                }

                if (!bh) {
                        /* corrupt size?  Maybe no more blocks to read */
                        if (ctx->pos > inode->i_blocks << 9)
                                break;
                        ctx->pos += sb->s_blocksize - offset;
                        continue;
                }

                /* Check the checksum */
                if (!buffer_verified(bh) &&
                    !ext4_dirblock_csum_verify(inode, bh)) {
                        EXT4_ERROR_FILE(file, 0, "directory fails checksum "
                                        "at offset %llu",
                                        (unsigned long long)ctx->pos);
                        /* Skip the rest of this block and carry on. */
                        ctx->pos += sb->s_blocksize - offset;
                        brelse(bh);
                        bh = NULL;
                        continue;
                }
                set_buffer_verified(bh);

                /* If the dir block has changed since the last call to
                 * readdir(2), then we might be pointing to an invalid
                 * dirent right now.  Scan from the start of the block
                 * to make sure. */
                if (!inode_eq_iversion(inode, file->f_version)) {
                        for (i = 0; i < sb->s_blocksize && i < offset; ) {
                                de = (struct ext4_dir_entry_2 *)
                                        (bh->b_data + i);
                                /* It's too expensive to do a full
                                 * dirent test each time round this
                                 * loop, but we do have to test at
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
                                if (ext4_rec_len_from_disk(de->rec_len,
                                        sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                        break;
                                i += ext4_rec_len_from_disk(de->rec_len,
                                                            sb->s_blocksize);
                        }
                        /* Resume from the entry boundary the scan stopped at. */
                        offset = i;
                        ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
                                | offset;
                        file->f_version = inode_query_iversion(inode);
                }

                while (ctx->pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
                        if (ext4_check_dir_entry(inode, file, de, bh,
                                                 bh->b_data, bh->b_size,
                                                 offset)) {
                                /*
                                 * On error, skip to the next block
                                 */
                                ctx->pos = (ctx->pos |
                                                (sb->s_blocksize - 1)) + 1;
                                break;
                        }
                        offset += ext4_rec_len_from_disk(de->rec_len,
                                        sb->s_blocksize);
                        /* Entries whose inode field is 0 are not emitted. */
                        if (le32_to_cpu(de->inode)) {
                                if (!IS_ENCRYPTED(inode)) {
                                        if (!dir_emit(ctx, de->name,
                                            de->name_len,
                                            le32_to_cpu(de->inode),
                                            get_dtype(sb, de->file_type)))
                                                goto done;
                                } else {
                                        int save_len = fstr.len;
                                        struct fscrypt_str de_name =
                                                        FSTR_INIT(de->name,
                                                                de->name_len);

                                        /* Directory is encrypted */
                                        err = fscrypt_fname_disk_to_usr(inode,
                                                0, 0, &de_name, &fstr);
                                        /*
                                         * Emit from the converted name, and
                                         * put back the buffer length saved
                                         * above for the next entry.
                                         */
                                        de_name = fstr;
                                        fstr.len = save_len;
                                        if (err)
                                                goto errout;
                                        if (!dir_emit(ctx,
                                            de_name.name, de_name.len,
                                            le32_to_cpu(de->inode),
                                            get_dtype(sb, de->file_type)))
                                                goto done;
                                }
                        }
                        ctx->pos += ext4_rec_len_from_disk(de->rec_len,
                                                sb->s_blocksize);
                }
                if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
                        goto done;
                brelse(bh);
                bh = NULL;
                offset = 0;
        }
done:
        err = 0;
errout:
        /* Common exit: release the name buffer and any block still held. */
        fscrypt_fname_free_buffer(&fstr);
        brelse(bh);
        return err;
}

/*
 * Tell whether the current caller should be given the 32-bit flavour of
 * the hash-based directory offsets.
 *
 * With CONFIG_COMPAT the answer is whatever in_compat_syscall() reports
 * for the running call; without it the answer is fixed at build time by
 * comparing BITS_PER_LONG with 32.
 *
 * Returns non-zero for the 32-bit API, zero otherwise.
 */
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
        return in_compat_syscall();
#else
        return (BITS_PER_LONG == 32);
#endif
}

/*
 * These functions convert from the major/minor hash to an f_pos
 * value for dx directories
 *
 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
 * directly on both 32-bit and 64-bit nodes, under such case, neither
 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
 */
static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
{
        /*
         * An explicit FMODE_32BITHASH wins; otherwise, unless the opener
         * explicitly asked for 64-bit hashes, fall back to the caller's ABI.
         */
        const int narrow = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        /* 32-bit position: only the major hash, shifted down by one bit. */
        if (narrow)
                return major >> 1;

        /* 64-bit position: shifted major hash on top, minor hash below. */
        return ((__u64)(major >> 1) << 32) | (__u64)minor;
}

static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
        const int narrow = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        /* In the 64-bit layout the major hash sits in the upper 32 bits. */
        if (!narrow)
                pos >>= 32;

        /* Undo the one-bit right shift applied by hash2pos(). */
        return (pos << 1) & 0xffffffff;
}

static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
        const int narrow = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        /* A 32-bit position carries no minor hash; otherwise it is the low word. */
        return narrow ? 0 : (pos & 0xffffffff);
}

/*
 * Return 32- or 64-bit end-of-file for dx directories
 */
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
        /* Same 32-vs-64-bit decision as the hash <-> position helpers. */
        const int narrow = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        if (narrow)
                return EXT4_HTREE_EOF_32BIT;

        return EXT4_HTREE_EOF_64BIT;
}


/*
 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
 * directories, where the "offset" is in terms of the filename hash
 * value instead of the byte offset.
 *
 * Because we may return a 64-bit hash that is well beyond offset limits,
 * we need to pass the max hash as the maximum allowable offset in
 * the htree directory case.
 *
 * For non-htree, ext4_llseek already chooses the proper max offset.
 */
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t ret;

        if (likely(is_dx_dir(inode))) {
                /* Hash-ordered directory: the largest hash is both max and EOF. */
                loff_t htree_max = ext4_get_htree_eof(file);

                ret = generic_file_llseek_size(file, offset, whence,
                                               htree_max, htree_max);
        } else {
                ret = ext4_llseek(file, offset, whence);
        }

        /*
         * Store a version one less than the inode's current one so that
         * it no longer matches the inode after a seek.
         */
        file->f_version = inode_peek_iversion(inode) - 1;
        return ret;
}

/*
 * This structure holds the nodes of the red-black tree used to store
 * the directory entry in hash order.
 */
struct fname {
        __u32                hash;                /* major hash; primary sort key in the tree */
        __u32                minor_hash;        /* minor hash; secondary sort key */
        struct rb_node        rb_hash;        /* linkage into the per-file rb tree */
        struct fname        *next;                /* chain of entries with identical hash pair */
        __u32                inode;                /* inode number, already in CPU byte order */
        __u8                name_len;        /* number of bytes stored in name[] */
        __u8                file_type;        /* file_type copied from the on-disk dirent */
        char                name[];                /* name bytes, allocated together with the node */
};

/*
 * This function implements a non-recursive way of freeing all of the
 * nodes in the red-black tree.
 */
static void free_rb_tree_fname(struct rb_root *root)
{
        struct fname *fname, *next;

        rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
                while (fname) {
                        struct fname *old = fname;
                        fname = fname->next;
                        kfree(old);
                }

        *root = RB_ROOT;
}


static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
                                                           loff_t pos)
{
        /* Zeroed allocation: every field not set below starts as 0/NULL. */
        struct dir_private_info *info = kzalloc(sizeof(*info), GFP_KERNEL);

        if (info) {
                /* Seed the cursor from the position the caller is at. */
                info->curr_hash = pos2maj_hash(filp, pos);
                info->curr_minor_hash = pos2min_hash(filp, pos);
        }

        /* NULL on allocation failure. */
        return info;
}

/*
 * Release a dir_private_info: first every cached fname node reachable
 * from p->root, then the structure itself.
 *
 * @p is dereferenced unconditionally, so it must not be NULL.
 */
void ext4_htree_free_dir_info(struct dir_private_info *p)
{
        free_rb_tree_fname(&p->root);
        kfree(p);
}

/*
 * Given a directory entry, enter it into the fname rb tree.
 *
 * When filename encryption is enabled, the dirent will hold the
 * encrypted filename, while the htree will hold decrypted filename.
 * The decrypted filename is passed in via the ent_name parameter.
 */
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                             __u32 minor_hash,
                            struct ext4_dir_entry_2 *dirent,
                            struct fscrypt_str *ent_name)
{
        struct dir_private_info *info = dir_file->private_data;
        struct rb_node **link = &info->root.rb_node;
        struct rb_node *parent = NULL;
        struct fname *new_fn;
        int len;

        /*
         * One zeroed allocation holds the node header, the name bytes and
         * one extra byte, so the stored name ends up NUL-terminated.
         */
        len = sizeof(struct fname) + ent_name->len + 1;
        new_fn = kzalloc(len, GFP_KERNEL);
        if (!new_fn)
                return -ENOMEM;

        new_fn->hash = hash;
        new_fn->minor_hash = minor_hash;
        new_fn->inode = le32_to_cpu(dirent->inode);
        new_fn->name_len = ent_name->len;
        new_fn->file_type = dirent->file_type;
        memcpy(new_fn->name, ent_name->name, ent_name->len);

        /* Descend to the insertion point, ordered by (hash, minor_hash). */
        while (*link) {
                struct fname *cur;
                int go_left;

                parent = *link;
                cur = rb_entry(parent, struct fname, rb_hash);

                /*
                 * Exact match on both hashes: no new tree node, just hook
                 * the entry into the existing node's chain.  Rare.
                 */
                if (hash == cur->hash && minor_hash == cur->minor_hash) {
                        new_fn->next = cur->next;
                        cur->next = new_fn;
                        return 0;
                }

                if (hash != cur->hash)
                        go_left = hash < cur->hash;
                else
                        go_left = minor_hash < cur->minor_hash;

                link = go_left ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&new_fn->rb_hash, parent, link);
        rb_insert_color(&new_fn->rb_hash, &info->root);
        return 0;
}



/*
 * This is a helper function for ext4_dx_readdir.  It calls filldir
 * for all entries on the fname linked list.  (Normally there is only
 * one entry on the linked list, unless there are 62 bit hash collisions.)
 */
static int call_filldir(struct file *file, struct dir_context *ctx,
                        struct fname *fname)
{
        struct dir_private_info *info = file->private_data;
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct fname *cur;

        if (!fname) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
                         "called with null fname?!?", __func__, __LINE__,
                         inode->i_ino, current->comm);
                return 0;
        }

        /* Every entry on the chain shares the same hash, hence position. */
        ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);

        for (cur = fname; cur; cur = cur->next) {
                if (dir_emit(ctx, cur->name, cur->name_len, cur->inode,
                             get_dtype(sb, cur->file_type)))
                        continue;

                /* Emit refused: remember where to resume and report it. */
                info->extra_fname = cur;
                return 1;
        }

        return 0;
}

/*
 * Hash-ordered readdir.
 *
 * Directory entries are cached in a per-open-file rb tree (info->root)
 * sorted by hash; this function emits them through call_filldir() and
 * refills the tree with ext4_htree_fill_tree() whenever it runs dry or the
 * directory's version no longer matches file->f_version.
 *
 * Returns 0 on success (including end of directory), -ENOMEM if the
 * per-file state cannot be allocated, or the negative value returned by
 * ext4_htree_fill_tree().
 */
static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
        struct dir_private_info *info = file->private_data;
        struct inode *inode = file_inode(file);
        struct fname *fname;
        int ret = 0;

        /* First call on this open file: set up the per-file cursor state. */
        if (!info) {
                info = ext4_htree_create_dir_info(file, ctx->pos);
                if (!info)
                        return -ENOMEM;
                file->private_data = info;
        }

        if (ctx->pos == ext4_get_htree_eof(file))
                return 0;        /* EOF */

        /* Some one has messed with f_pos; reset the world */
        if (info->last_pos != ctx->pos) {
                free_rb_tree_fname(&info->root);
                info->curr_node = NULL;
                info->extra_fname = NULL;
                info->curr_hash = pos2maj_hash(file, ctx->pos);
                info->curr_minor_hash = pos2min_hash(file, ctx->pos);
        }

        /*
         * If there are any leftover names on the hash collision
         * chain, return them first.
         */
        if (info->extra_fname) {
                if (call_filldir(file, ctx, info->extra_fname))
                        goto finished;
                info->extra_fname = NULL;
                /* Chain drained: jump into the loop to advance past its node. */
                goto next_node;
        } else if (!info->curr_node)
                info->curr_node = rb_first(&info->root);

        while (1) {
                /*
                 * Fill the rbtree if we have no more entries,
                 * or the inode has changed since we last read in the
                 * cached entries.
                 */
                if ((!info->curr_node) ||
                    !inode_eq_iversion(inode, file->f_version)) {
                        info->curr_node = NULL;
                        free_rb_tree_fname(&info->root);
                        file->f_version = inode_query_iversion(inode);
                        ret = ext4_htree_fill_tree(file, info->curr_hash,
                                                   info->curr_minor_hash,
                                                   &info->next_hash);
                        if (ret < 0)
                                goto finished;
                        /* A zero return is treated as end of directory. */
                        if (ret == 0) {
                                ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_node = rb_first(&info->root);
                }

                /* Record the hash pair being emitted, then emit its chain. */
                fname = rb_entry(info->curr_node, struct fname, rb_hash);
                info->curr_hash = fname->hash;
                info->curr_minor_hash = fname->minor_hash;
                if (call_filldir(file, ctx, fname))
                        break;
        next_node:
                info->curr_node = rb_next(info->curr_node);
                if (info->curr_node) {
                        fname = rb_entry(info->curr_node, struct fname,
                                         rb_hash);
                        info->curr_hash = fname->hash;
                        info->curr_minor_hash = fname->minor_hash;
                } else {
                        /* Tree exhausted; an all-ones next_hash means no more. */
                        if (info->next_hash == ~0) {
                                ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_hash = info->next_hash;
                        info->curr_minor_hash = 0;
                }
        }
finished:
        /* Remember where we stopped so an external f_pos change is noticed. */
        info->last_pos = ctx->pos;
        return ret < 0 ? ret : 0;
}

/*
 * Open hook for directories.  Unencrypted directories always open; an
 * encrypted one opens only if its encryption info can be obtained,
 * otherwise the open fails with -EACCES.
 */
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
        if (!IS_ENCRYPTED(inode))
                return 0;

        /* Any failure is reported to the caller as -EACCES. */
        if (fscrypt_get_encryption_info(inode))
                return -EACCES;

        return 0;
}

/*
 * Release hook for directories: drop the per-open-file readdir state, if
 * any was ever attached.  Always returns 0.
 */
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
        void *info = filp->private_data;

        if (info)
                ext4_htree_free_dir_info(info);

        return 0;
}

/*
 * Validate every directory entry packed into buf[0..buf_size).
 *
 * Walks the entries by their record length and returns -EFSCORRUPTED as
 * soon as one fails ext4_check_dir_entry(), or if the walk overshoots the
 * end of the buffer; returns 0 when the entries end exactly at the
 * buffer's end.
 */
int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
                      int buf_size)
{
        struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)buf;
        char *end = (char *)buf + buf_size;
        unsigned int offset = 0;
        int rlen;

        for (; (char *)de < end; offset += rlen) {
                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                         buf, buf_size, offset))
                        return -EFSCORRUPTED;

                /* Step to the next entry using this entry's record length. */
                rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
                de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
        }

        /* The last record must end exactly at the end of the buffer. */
        return ((char *)de > end) ? -EFSCORRUPTED : 0;
}

/*
 * File operations installed on ext4 directories.  The seek, open and
 * release hooks are the directory-specific helpers defined in this file;
 * the compat ioctl entry exists only when CONFIG_COMPAT is set.
 */
const struct file_operations ext4_dir_operations = {
        .llseek                = ext4_dir_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = ext4_readdir,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = ext4_compat_ioctl,
#endif
        .fsync                = ext4_sync_file,
        .open                = ext4_dir_open,
        .release        = ext4_release_dir,
};

#ifdef CONFIG_UNICODE
/*
 * Dentry operations built only with CONFIG_UNICODE; hashing and name
 * comparison are delegated to the generic_ci_* helpers.
 * NOTE(review): "ci" presumably stands for case-insensitive; confirm
 * against the helpers' definitions.
 */
const struct dentry_operations ext4_dentry_ops = {
        .d_hash = generic_ci_d_hash,
        .d_compare = generic_ci_d_compare,
};
#endif
































































































































































































































    2 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H

#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <linux/refcount.h>

/* Buffer handling */

#define RING_BUFFER_WRITABLE                0x01

/*
 * State of one perf ring buffer: the data area (data_pages[], nr_pages),
 * the optional AUX area (aux_* fields), and the bookkeeping around both.
 *
 * The structure is reference counted (refcount) and carries an rcu_head,
 * which rb_free_rcu() uses to find the buffer and pass it to rb_free().
 * data_pages[] is a flexible array, so the structure is allocated together
 * with its page-pointer table.
 */
struct perf_buffer {
        refcount_t                        refcount;
        struct rcu_head                        rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
        struct work_struct                work;
        int                                page_order;        /* allocation order  */
#endif
        int                                nr_pages;        /* nr of data pages  */
        int                                overwrite;        /* can overwrite itself */
        int                                paused;                /* can write into ring buffer */

        atomic_t                        poll;                /* POLL_ for wakeups */

        local_t                                head;                /* write position    */
        unsigned int                        nest;                /* nested writers    */
        local_t                                events;                /* event limit       */
        local_t                                wakeup;                /* wakeup stamp      */
        local_t                                lost;                /* nr records lost   */

        long                                watermark;        /* wakeup watermark  */
        long                                aux_watermark;
        /* poll crap */
        spinlock_t                        event_lock;
        struct list_head                event_list;

        atomic_t                        mmap_count;
        unsigned long                        mmap_locked;
        struct user_struct                *mmap_user;

        /* AUX area */
        struct mutex                        aux_mutex;
        long                                aux_head;
        unsigned int                        aux_nest;
        long                                aux_wakeup;        /* last aux_watermark boundary crossed by aux_head */
        unsigned long                        aux_pgoff;
        int                                aux_nr_pages;        /* non-zero when an AUX area exists, see rb_has_aux() */
        int                                aux_overwrite;
        atomic_t                        aux_mmap_count;
        unsigned long                        aux_mmap_locked;
        void                                (*free_aux)(void *);
        refcount_t                        aux_refcount;
        int                                aux_in_sampling;
        void                                **aux_pages;
        void                                *aux_priv;

        struct perf_event_mmap_page        *user_page;
        void                                *data_pages[];        /* indexed by page number when copying output */
};

extern void rb_free(struct perf_buffer *rb);

/*
 * RCU-style callback: recover the perf_buffer that embeds @rcu_head and
 * hand it to rb_free().
 */
static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
        rb_free(container_of(rcu_head, struct perf_buffer, rcu_head));
}

/*
 * Set or clear the buffer's paused flag.  A buffer without data pages is
 * always left paused, whatever @pause says.
 */
static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause)
{
        rb->paused = (pause || !rb->nr_pages) ? 1 : 0;
}

extern struct perf_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
                        pgoff_t pgoff, int nr_pages, long watermark, int flags);
extern void rb_free_aux(struct perf_buffer *rb);
extern struct perf_buffer *ring_buffer_get(struct perf_event *event);
extern void ring_buffer_put(struct perf_buffer *rb);

/* True when the buffer has an AUX area, i.e. a non-zero AUX page count. */
static inline bool rb_has_aux(struct perf_buffer *rb)
{
        return rb->aux_nr_pages != 0;
}

void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags);

extern struct page *
perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff);

#ifdef CONFIG_PERF_USE_VMALLOC
/*
 * Back perf_mmap() with vmalloc memory.
 *
 * Required for architectures that have d-cache aliasing issues.
 */

/* Allocation order of each data page, as recorded in the buffer. */
static inline int page_order(struct perf_buffer *rb)
{
        return rb->page_order;
}

#else

/* Without CONFIG_PERF_USE_VMALLOC the order is always 0. */
static inline int page_order(struct perf_buffer *rb)
{
        return 0;
}
#endif

/*
 * Number of base pages in the data area: nr_pages entries, each covering
 * 2^page_order(rb) base pages.
 */
static inline int data_page_nr(struct perf_buffer *rb)
{
        const int order = page_order(rb);

        return rb->nr_pages << order;
}

/*
 * Size in bytes of the data area: nr_pages entries of
 * (PAGE_SIZE << page_order(rb)) bytes each.
 *
 * nr_pages is an int, so it is widened to unsigned long before shifting.
 * Otherwise the shift would be evaluated in int and could overflow for
 * large buffers before the result is converted to the return type.
 * perf_aux_size() widens its operand the same way.
 */
static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
        return (unsigned long)rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}

/* Size in bytes of the AUX area; widened first so the shift is not done in int. */
static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
        unsigned long pages = (unsigned long)rb->aux_nr_pages;

        return pages << PAGE_SHIFT;
}

/*
 * Function-body template for the ring-buffer output copy helpers.
 *
 * The expansion expects `handle` (output handle), `buf` and `len` to be in
 * scope.  Each iteration copies up to min(handle->size, len) bytes by
 * calling memcpy_func(__VA_ARGS__), whose return value is taken to be the
 * number of bytes NOT copied.  It then advances handle->addr (and buf,
 * when advance_buf is true) by the amount actually written.  When the
 * current page is used up the handle moves to the next data page,
 * wrapping with a mask of nr_pages - 1.
 * NOTE(review): the mask only wraps correctly if nr_pages is a power of
 * two; confirm against the allocation path.
 *
 * The loop stops when everything was copied or a copy came up short, and
 * the expansion returns the number of bytes left uncopied.
 */
#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)        \
{                                                                        \
        unsigned long size, written;                                        \
                                                                        \
        do {                                                                \
                size    = min(handle->size, len);                        \
                written = memcpy_func(__VA_ARGS__);                        \
                written = size - written;                                \
                                                                        \
                len -= written;                                                \
                handle->addr += written;                                \
                if (advance_buf)                                        \
                        buf += written;                                        \
                handle->size -= written;                                \
                if (!handle->size) {                                        \
                        struct perf_buffer *rb = handle->rb;        \
                                                                        \
                        handle->page++;                                        \
                        handle->page &= rb->nr_pages - 1;                \
                        handle->addr = rb->data_pages[handle->page];        \
                        handle->size = PAGE_SIZE << page_order(rb);        \
                }                                                        \
        } while (len && written == size);                                \
                                                                        \
        return len;                                                        \
}

/*
 * Define a static inline copy helper named func_name with the signature
 * (handle, buf, len), built from __DEFINE_OUTPUT_COPY_BODY.  memcpy_func
 * is called as memcpy_func(handle->addr, buf, size) and buf is advanced
 * after every chunk.
 */
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)                        \
static inline unsigned long                                                \
func_name(struct perf_output_handle *handle,                                \
          const void *buf, unsigned long len)                                \
__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)

/*
 * Copy @len bytes into the ring buffer through a caller-supplied
 * @copy_func.
 *
 * Unlike the DEFINE_OUTPUT_COPY() helpers, @buf is never advanced;
 * instead @copy_func receives (dst, buf, offset, size), where offset is
 * the number of bytes already consumed (orig_len - len).
 *
 * Returns the number of bytes left uncopied.
 */
static inline unsigned long
__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
                const void *buf, unsigned long len)
{
        unsigned long orig_len = len;
        __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
                                  orig_len - len, size)
}

/*
 * Plain in-kernel copy of @n bytes from @src to @dst.
 *
 * Always returns 0, which the output-copy loop interprets as "no bytes
 * left uncopied".
 */
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
        const unsigned long not_copied = 0;

        memcpy(dst, src, n);

        return not_copied;
}

DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)

/*
 * Copy routine that copies nothing and reports success (0 bytes left).
 * Used to advance the output position without writing any data.
 */
static inline unsigned long
memcpy_skip(void *dst, const void *src, unsigned long n)
{
        const unsigned long not_copied = 0;

        return not_copied;
}

DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)

/*
 * Default user-memory copy routine, used unless an architecture has
 * already defined arch_perf_out_copy_user.  The self-referential #define
 * marks it as provided.
 */
#ifndef arch_perf_out_copy_user
#define arch_perf_out_copy_user arch_perf_out_copy_user

/*
 * Copy @n bytes from @src to @dst with page faults disabled around the
 * copy.  Returns whatever __copy_from_user_inatomic() returns.
 * NOTE(review): the output-copy loop treats that value as the number of
 * bytes not copied; confirm against the helper's documentation.
 */
static inline unsigned long
arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
{
        unsigned long ret;

        pagefault_disable();
        ret = __copy_from_user_inatomic(dst, src, n);
        pagefault_enable();

        return ret;
}
#endif

DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)

/*
 * Claim the recursion slot that matches the current execution context.
 *
 * The slot index (0..3) is derived from preempt_count(): it goes up by one
 * for each of three successively wider masks that has a bit set, so an NMI
 * bit yields 3, a hardirq bit 2, a SOFTIRQ_OFFSET bit 1 and none of them 0.
 *
 * Returns the claimed index, or -1 if that slot is already in use.
 */
static inline int get_recursion_context(int *recursion)
{
        unsigned int pc = preempt_count();
        unsigned char rctx = 0;

        if (pc & (NMI_MASK))
                rctx++;
        if (pc & (NMI_MASK | HARDIRQ_MASK))
                rctx++;
        if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
                rctx++;

        /* Slot already taken: we are recursing within the same context. */
        if (recursion[rctx])
                return -1;

        recursion[rctx]++;
        /* Keep the compiler from moving later accesses above the claim. */
        barrier();

        return rctx;
}

/*
 * Release the recursion slot @rctx previously claimed with
 * get_recursion_context().  The compiler barrier comes first so that
 * earlier accesses are not reordered past the release by the compiler.
 */
static inline void put_recursion_context(int *recursion, int rctx)
{
        barrier();
        recursion[rctx]--;
}

/*
 * User stack dump support is a build-time property.  With
 * CONFIG_HAVE_PERF_USER_STACK_DUMP the predicate is true and
 * perf_user_stack_pointer() maps to user_stack_pointer(); without it the
 * predicate is false and the stack pointer macro evaluates to 0.
 */
#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
static inline bool arch_perf_have_user_stack_dump(void)
{
        return true;
}

#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
#else
static inline bool arch_perf_have_user_stack_dump(void)
{
        return false;
}

#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

#endif /* _KERNEL_EVENTS_INTERNAL_H */























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * V9FS definitions.
 *
 *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
 */
#ifndef FS_9P_V9FS_H
#define FS_9P_V9FS_H

#include <linux/backing-dev.h>

/**
 * enum p9_session_flags - option flags for each 9P session
 * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
 * @V9FS_ACCESS_ANY: use a single attach for all users
 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
 *
 * Session flags reflect options selected by users at mount time
 */
#define        V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
                         V9FS_ACCESS_USER |   \
                         V9FS_ACCESS_CLIENT)
#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
#define V9FS_ACL_MASK V9FS_POSIX_ACL

/* Bit values stored in v9fs_session_info.flags; each member is one bit. */
enum p9_session_flags {
        V9FS_PROTO_2000U        = 0x01,        /* use 9P2000.u extensions */
        V9FS_PROTO_2000L        = 0x02,        /* use 9P2000.l extensions */
        V9FS_ACCESS_SINGLE        = 0x04,        /* only the mounting user has access */
        V9FS_ACCESS_USER        = 0x08,        /* new attach per user (default) */
        V9FS_ACCESS_CLIENT        = 0x10,        /* like USER, access checked on client */
        V9FS_POSIX_ACL                = 0x20        /* POSIX ACLs are enforced */
};

/* possible values of ->cache */
/**
 * enum p9_cache_modes - user specified cache preferences
 * @CACHE_NONE: do not cache data, dentries, or directory contents (default)
 * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency
 *
 * eventually support loose, tight, time, session, default always none
 */

/*
 * Cache modes, numbered consecutively from zero.
 * NOTE(review): CACHE_MMAP and CACHE_FSCACHE are not described in the
 * kerneldoc for this enum; their semantics should be added there once
 * confirmed against the code that consumes them.
 */
enum p9_cache_modes {
        CACHE_NONE,                /* no caching (default) */
        CACHE_MMAP,
        CACHE_LOOSE,                /* cache without consistency guarantees */
        CACHE_FSCACHE,
        nr__p9_cache_modes        /* number of modes above; not a mode itself */
};

/**
 * struct v9fs_session_info - per-instance session information
 * @flags: session options of type &p9_session_flags
 * @nodev: set to 1 to disable device mapping
 * @debug: debug level
 * @afid: authentication handle
 * @cache: cache mode of type &p9_cache_modes
 * @cachetag: the tag of the cache associated with this session
 * @fscache: session cookie associated with FS-Cache
 * @uname: string user name to mount hierarchy as
 * @aname: mount specifier for remote hierarchy
 * @maxdata: maximum data to be sent/recvd per protocol message
 * @dfltuid: default numeric userid to mount hierarchy as
 * @dfltgid: default numeric groupid to mount hierarchy as
 * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
 * @clnt: reference to 9P network client instantiated for this session
 * @slist: reference to list of registered 9p sessions
 *
 * This structure holds state for each session instance established during
 * a sys_mount().
 *
 * Bugs: there seems to be a lot of state which could be condensed and/or
 * removed.
 */

struct v9fs_session_info {
        /* options */
        unsigned char flags;        /* bits from enum p9_session_flags */
        unsigned char nodev;        /* set to 1 to disable device mapping */
        unsigned short debug;        /* debug level */
        unsigned int afid;        /* authentication handle */
        unsigned int cache;        /* one of enum p9_cache_modes */
#ifdef CONFIG_9P_FSCACHE
        char *cachetag;                /* tag of the cache tied to this session */
        struct fscache_cookie *fscache;        /* FS-Cache session cookie */
#endif

        char *uname;                /* user name to mount as */
        char *aname;                /* name of remote hierarchy being mounted */
        unsigned int maxdata;        /* max data for client interface */
        kuid_t dfltuid;                /* default uid/muid for legacy support */
        kgid_t dfltgid;                /* default gid for legacy support */
        kuid_t uid;                /* if ACCESS_SINGLE, the uid that has access */
        struct p9_client *clnt;        /* 9p client */
        struct list_head slist; /* list of sessions registered with v9fs */
        /* NOTE(review): presumably serializes renames in this session; confirm */
        struct rw_semaphore rename_sem;
        long session_lock_timeout; /* retry interval for blocking locks */
};

/* cache_validity flags */
#define V9FS_INO_INVALID_ATTR 0x01

/*
 * v9fs per-inode data.  The generic VFS inode is embedded as the last
 * member; V9FS_I() maps a VFS inode pointer back to this structure.
 */
struct v9fs_inode {
#ifdef CONFIG_9P_FSCACHE
        struct mutex fscache_lock;
        struct fscache_cookie *fscache;
#endif
        struct p9_qid qid;
        unsigned int cache_validity;        /* V9FS_INO_INVALID_* flags */
        struct p9_fid *writeback_fid;
        struct mutex v_mutex;
        struct inode vfs_inode;                /* embedded VFS inode */
};

/*
 * Map a VFS inode to the v9fs_inode that embeds it.  Only valid for
 * inodes that really are the vfs_inode member of a struct v9fs_inode.
 */
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
{
        return container_of(inode, struct v9fs_inode, vfs_inode);
}

extern int v9fs_show_options(struct seq_file *m, struct dentry *root);

struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
                                                                        char *);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
                        unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags);
extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
                                         struct p9_fid *fid,
                                         struct super_block *sb, int new);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
                                              struct p9_fid *fid,
                                              struct super_block *sb, int new);

/* other default globals */
#define V9FS_PORT        564
#define V9FS_DEFUSER        "nobody"
#define V9FS_DEFANAME        ""
#define V9FS_DEFUID        KUIDT_INIT(-2)
#define V9FS_DEFGID        KGIDT_INIT(-2)

static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
        return (inode->i_sb->s_fs_info);
}

static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
{
        return dentry->d_sb->s_fs_info;
}

/*
 * Non-zero when the session uses the 9P2000.u extensions.  The result is
 * the masked flag bit itself, not a normalized 0/1.
 */
static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
{
        return v9ses->flags & V9FS_PROTO_2000U;
}

/*
 * Non-zero when the session uses the 9P2000.l extensions.  The result is
 * the masked flag bit itself, not a normalized 0/1.
 */
static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
{
        return v9ses->flags & V9FS_PROTO_2000L;
}

/**
 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
 * issuing an attribute request
 * @v9ses: session information
 * @fid: fid to issue attribute request for
 * @sb: superblock on which to create inode
 *
 */
static inline struct inode *
v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
                        struct super_block *sb)
{
        /* Legacy and .u sessions share one path; .l has its own. */
        if (!v9fs_proto_dotl(v9ses))
                return v9fs_inode_from_fid(v9ses, fid, sb, 0);

        return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
}

/**
 * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
 * issuing an attribute request
 * @v9ses: session information
 * @fid: fid to issue attribute request for
 * @sb: superblock on which to create inode
 *
 */
static inline struct inode *
v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
                            struct super_block *sb)
{
        /* Same dispatch as v9fs_get_inode_from_fid(), with "new" set to 1. */
        if (!v9fs_proto_dotl(v9ses))
                return v9fs_inode_from_fid(v9ses, fid, sb, 1);

        return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
}

#endif
























    1 





    1 




































    1 
    1 

    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/isofs/util.c
 */

#include <linux/time.h>
#include "isofs.h"

/*
 * We have to convert from a MM/DD/YY format to the Unix ctime format.
 * We have to take into account leap years and all of that good stuff.
 * Unfortunately, the kernel does not have the information on hand to
 * take into account daylight savings time, but it shouldn't matter.
 * The time stored should be localtime (with or without DST in effect),
 * and the timezone offset should hold the offset required to get back
 * to GMT.  Thus we should always be correct.
 *
 * @p:    on-disk timestamp; bytes 0..5 are year (offset from 1900), month,
 *        day, hour, minute and second, byte 6 is the timezone offset in
 *        units of 15 minutes.
 * @flag: when non-zero the timezone byte is not read and an offset of 0
 *        is used (High Sierra has no time zone).
 *
 * Returns the converted time in seconds.
 *
 * NOTE(review): the mktime64() result is narrowed to int because that is
 * this function's return type; years far enough in the future presumably
 * do not fit - confirm whether callers need a wider type.
 */

int iso_date(u8 *p, int flag)
{
        int year, month, day, hour, minute, second, tz;
        int crtime;

        year = p[0];
        month = p[1];
        day = p[2];
        hour = p[3];
        minute = p[4];
        second = p[5];
        /* High sierra has no time zone */
        tz = (flag == 0) ? p[6] : 0;

        /*
         * year comes from an unsigned byte and therefore can never be
         * negative, so no special casing is needed before the conversion.
         */
        crtime = mktime64(year + 1900, month, day, hour, minute, second);

        /*
         * Sign extend the 8-bit offset.  Subtracting 0x100 yields the same
         * value as OR-ing in the upper bits, without left-shifting a
         * negative constant (which is undefined behaviour in C).
         */
        if (tz & 0x80)
                tz -= 0x100;

        /*
         * The timezone offset is unreliable on some disks,
         * so we make a sanity check.  In no case is it ever
         * more than 13 hours from GMT, which is 52*15min.
         * The time is always stored in localtime with the
         * timezone offset being what get added to GMT to
         * get to localtime.  Thus we need to subtract the offset
         * to get to true GMT, which is what we store the time
         * as internally.  On the local system, the user may set
         * their timezone any way they wish, of course, so GMT
         * gets converted back to localtime on the receiving
         * system.
         *
         * NOTE: mkisofs in versions prior to mkisofs-1.10 had
         * the sign wrong on the timezone offset.  This has now
         * been corrected there too, but if you are getting screwy
         * results this may be the explanation.  If enough people
         * complain, a user configuration option could be added
         * to add the timezone offset in with the wrong sign
         * for 'compatibility' with older discs, but I cannot see how
         * it will matter that much.
         *
         * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann)
         * for pointing out the sign error.
         */
        if (-52 <= tz && tz <= 52)
                crtime -= tz * 15 * 60;

        return crtime;
}


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM alarmtimer

#if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ALARMTIMER_H

#include <linux/alarmtimer.h>
#include <linux/rtc.h>
#include <linux/tracepoint.h>

TRACE_DEFINE_ENUM(ALARM_REALTIME);
TRACE_DEFINE_ENUM(ALARM_BOOTTIME);
TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER);
TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER);

/*
 * Decode a bitmask of alarm types into a " | " separated list of names
 * using __print_flags().  Each table entry maps the bit (1 << ALARM_*) to
 * its printable label, so callers pass a mask, not the raw type value.
 */
#define show_alarm_type(type)        __print_flags(type, " | ",        \
        { 1 << ALARM_REALTIME, "REALTIME" },                        \
        { 1 << ALARM_BOOTTIME, "BOOTTIME" },                        \
        { 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" },        \
        { 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" })

/*
 * alarmtimer_suspend - trace event recording an expiry time and alarm type
 *
 * Stores @expires as an s64 and @flag as the alarm type.  When printed the
 * stored type is turned into a single-bit mask (1 << type) and decoded by
 * show_alarm_type().  Note that the s64 expiry is printed with %llu.
 */
TRACE_EVENT(alarmtimer_suspend,

        TP_PROTO(ktime_t expires, int flag),

        TP_ARGS(expires, flag),

        TP_STRUCT__entry(
                __field(s64, expires)
                __field(unsigned char, alarm_type)
        ),

        TP_fast_assign(
                __entry->expires = expires;
                __entry->alarm_type = flag;
        ),

        TP_printk("alarmtimer type:%s expires:%llu",
                  show_alarm_type((1 << __entry->alarm_type)),
                  __entry->expires
        )
);

/*
 * alarm_class - shared layout for the per-alarm trace events
 *
 * Records the alarm pointer, its type, its expiry (alarm->node.expires)
 * and the @now value supplied by the caller.  The type is printed through
 * show_alarm_type() after being converted to a single-bit mask; the s64
 * time values are printed with %llu.
 */
DECLARE_EVENT_CLASS(alarm_class,

        TP_PROTO(struct alarm *alarm, ktime_t now),

        TP_ARGS(alarm, now),

        TP_STRUCT__entry(
                __field(void *,        alarm)
                __field(unsigned char, alarm_type)
                __field(s64, expires)
                __field(s64, now)
        ),

        TP_fast_assign(
                __entry->alarm = alarm;
                __entry->alarm_type = alarm->type;
                __entry->expires = alarm->node.expires;
                __entry->now = now;
        ),

        TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu",
                  __entry->alarm,
                  show_alarm_type((1 << __entry->alarm_type)),
                  __entry->expires,
                  __entry->now
        )
);

/* alarmtimer_fired: instance of alarm_class taking (alarm, now). */
DEFINE_EVENT(alarm_class, alarmtimer_fired,

        TP_PROTO(struct alarm *alarm, ktime_t now),

        TP_ARGS(alarm, now)
);

/* alarmtimer_start: instance of alarm_class taking (alarm, now). */
DEFINE_EVENT(alarm_class, alarmtimer_start,

        TP_PROTO(struct alarm *alarm, ktime_t now),

        TP_ARGS(alarm, now)
);

/* alarmtimer_cancel: instance of alarm_class taking (alarm, now). */
DEFINE_EVENT(alarm_class, alarmtimer_cancel,

        TP_PROTO(struct alarm *alarm, ktime_t now),

        TP_ARGS(alarm, now)
);

#endif /* _TRACE_ALARMTIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
// SPDX-License-Identifier: GPL-2.0
/*
 *  hrtimers - High-resolution kernel timers
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#ifndef _LINUX_HRTIMER_H
#define _LINUX_HRTIMER_H

#include <linux/hrtimer_defs.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/seqlock.h>
#include <linux/timer.h>
#include <linux/timerqueue.h>

struct hrtimer_clock_base;
struct hrtimer_cpu_base;

/*
 * Mode arguments of xxx_hrtimer functions:
 *
 * HRTIMER_MODE_ABS                - Time value is absolute
 * HRTIMER_MODE_REL                - Time value is relative to now
 * HRTIMER_MODE_PINNED                - Timer is bound to CPU (is only considered
 *                                  when starting the timer)
 * HRTIMER_MODE_SOFT                - Timer callback function will be executed in
 *                                  soft irq context
 * HRTIMER_MODE_HARD                - Timer callback function will be executed in
 *                                  hard irq context even on PREEMPT_RT.
 *
 * The remaining enumerators are bitwise combinations of the basic bits.
 */
enum hrtimer_mode {
        /* Basic bits; ABS is the absence of the REL bit. */
        HRTIMER_MODE_ABS        = 0,
        HRTIMER_MODE_REL        = 1 << 0,
        HRTIMER_MODE_PINNED        = 1 << 1,
        HRTIMER_MODE_SOFT        = 1 << 2,
        HRTIMER_MODE_HARD        = 1 << 3,

        /* Pinned variants */
        HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_PINNED | HRTIMER_MODE_ABS,
        HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_PINNED | HRTIMER_MODE_REL,

        /* Soft interrupt variants */
        HRTIMER_MODE_ABS_SOFT        = HRTIMER_MODE_SOFT | HRTIMER_MODE_ABS,
        HRTIMER_MODE_REL_SOFT        = HRTIMER_MODE_SOFT | HRTIMER_MODE_REL,

        HRTIMER_MODE_ABS_PINNED_SOFT =
                HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED | HRTIMER_MODE_SOFT,
        HRTIMER_MODE_REL_PINNED_SOFT =
                HRTIMER_MODE_REL | HRTIMER_MODE_PINNED | HRTIMER_MODE_SOFT,

        /* Hard interrupt variants */
        HRTIMER_MODE_ABS_HARD        = HRTIMER_MODE_HARD | HRTIMER_MODE_ABS,
        HRTIMER_MODE_REL_HARD        = HRTIMER_MODE_HARD | HRTIMER_MODE_REL,

        HRTIMER_MODE_ABS_PINNED_HARD =
                HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED | HRTIMER_MODE_HARD,
        HRTIMER_MODE_REL_PINNED_HARD =
                HRTIMER_MODE_REL | HRTIMER_MODE_PINNED | HRTIMER_MODE_HARD,
};

/*
 * Return values for the callback function
 *
 * HRTIMER_NORESTART        - Timer is not restarted
 * HRTIMER_RESTART        - Timer must be restarted
 */
enum hrtimer_restart {
        HRTIMER_NORESTART = 0,
        HRTIMER_RESTART = 1,
};

/*
 * Values to track state of the timer
 *
 * Possible states:
 *
 * 0x00                inactive
 * 0x01                enqueued into rbtree
 *
 * The callback state is not part of the timer->state because clearing it
 * would mean touching the timer after the callback, which would make it
 * impossible to free the timer from the callback function.
 *
 * Therefore we track the callback state in:
 *
 *        timer->base->cpu_base->running == timer
 *
 * On SMP it is possible to have a "callback function running and enqueued"
 * status. It happens for example when a posix timer expired and the callback
 * queued a signal. Between dropping the lock which protects the posix timer
 * and reacquiring the base lock of the hrtimer, another CPU can deliver the
 * signal and rearm the timer.
 *
 * All state transitions are protected by cpu_base->lock.
 */
#define HRTIMER_STATE_INACTIVE        0x00
#define HRTIMER_STATE_ENQUEUED        0x01

/**
 * struct hrtimer - the basic hrtimer structure
 * @node:        timerqueue node, which also manages node.expires,
 *                the absolute expiry time in the hrtimers internal
 *                representation. The time is related to the clock on
 *                which the timer is based. It is set up by adding
 *                slack to the _softexpires value. For non range timers
 *                it is identical to _softexpires.
 * @_softexpires: the absolute earliest expiry time of the hrtimer.
 *                The time which was given as expiry time when the timer
 *                was armed.
 * @function:        timer expiry callback function; its return value is an
 *                enum hrtimer_restart
 * @base:        pointer to the timer base (per cpu and per clock)
 * @state:        state information (see the HRTIMER_STATE_* values)
 * @is_rel:        Set if the timer was armed relative
 * @is_soft:        Set if hrtimer will be expired in soft interrupt context.
 * @is_hard:        Set if hrtimer will be expired in hard interrupt context
 *                even on RT.
 *
 * The hrtimer structure must be initialized by hrtimer_init()
 */
struct hrtimer {
        struct timerqueue_node                node;
        ktime_t                                _softexpires;
        enum hrtimer_restart                (*function)(struct hrtimer *);
        struct hrtimer_clock_base        *base;
        u8                                state;
        u8                                is_rel;
        u8                                is_soft;
        u8                                is_hard;
};

/**
 * struct hrtimer_sleeper - simple sleeper structure
 * @timer:        embedded timer structure
 * @task:        task to wake up
 *
 * @task is set to NULL when the timer expires.
 */
struct hrtimer_sleeper {
        struct hrtimer timer;
        struct task_struct *task;
};

/*
 * Alignment attribute applied to struct hrtimer_clock_base: cacheline
 * aligned on CONFIG_64BIT builds, no extra alignment otherwise.
 */
#ifdef CONFIG_64BIT
# define __hrtimer_clock_base_align        ____cacheline_aligned
#else
# define __hrtimer_clock_base_align
#endif

/**
 * struct hrtimer_clock_base - the timer base for a specific clock
 * @cpu_base:                per cpu clock base
 * @index:                clock type index for per_cpu support when moving a
 *                        timer to a base on another cpu.
 * @clockid:                clock id for per_cpu support
 * @seq:                seqcount around __run_hrtimer
 * @running:                pointer to the currently running hrtimer
 * @active:                red black tree root node for the active timers
 * @get_time:                function to retrieve the current time of the clock
 * @offset:                offset of this clock to the monotonic base
 *
 * The structure carries the __hrtimer_clock_base_align attribute.
 */
struct hrtimer_clock_base {
        struct hrtimer_cpu_base        *cpu_base;
        unsigned int                index;
        clockid_t                clockid;
        seqcount_raw_spinlock_t        seq;
        struct hrtimer                *running;
        struct timerqueue_head        active;
        ktime_t                        (*get_time)(void);
        ktime_t                        offset;
} __hrtimer_clock_base_align;

/*
 * Index of each clock base inside a CPU's clock base array.  The first
 * four entries are followed by their _SOFT counterparts in the same order;
 * HRTIMER_MAX_CLOCK_BASES is the number of bases and must stay last.
 */
enum hrtimer_base_type {
        HRTIMER_BASE_MONOTONIC = 0,
        HRTIMER_BASE_REALTIME,
        HRTIMER_BASE_BOOTTIME,
        HRTIMER_BASE_TAI,

        HRTIMER_BASE_MONOTONIC_SOFT,
        HRTIMER_BASE_REALTIME_SOFT,
        HRTIMER_BASE_BOOTTIME_SOFT,
        HRTIMER_BASE_TAI_SOFT,

        HRTIMER_MAX_CLOCK_BASES,
};

/**
 * struct hrtimer_cpu_base - the per cpu clock bases
 * @lock:                lock protecting the base and associated clock bases
 *                        and timers
 * @cpu:                cpu number
 * @active_bases:        Bitfield to mark bases with active timers
 * @clock_was_set_seq:        Sequence counter of clock was set events
 * @hres_active:        State of high resolution mode
 * @in_hrtirq:                hrtimer_interrupt() is currently executing
 * @hang_detected:        The last hrtimer interrupt detected a hang
 * @softirq_activated:        Indicates whether the softirq is raised - an update
 *                        of softirq related settings is not required then.
 * @online:                CPU is online from an hrtimers point of view
 * @nr_events:                Total number of hrtimer interrupt events
 * @nr_retries:                Total number of hrtimer interrupt retries
 * @nr_hangs:                Total number of hrtimer interrupt hangs
 * @max_hang_time:        Maximum time spent in hrtimer_interrupt
 * @softirq_expiry_lock: Lock which is taken while softirq based hrtimers are
 *                         expired
 * @timer_waiters:        A hrtimer_cancel() invocation waits for the timer
 *                        callback to finish.
 * @expires_next:        absolute time of the next event, is required for remote
 *                        hrtimer enqueue; it is the total first expiry time (hard
 *                        and soft hrtimer are taken into account)
 * @next_timer:                Pointer to the first expiring timer
 * @softirq_expires_next: Time to check, if soft queues needs also to be expired
 * @softirq_next_timer: Pointer to the first expiring softirq based timer
 * @clock_base:                array of clock bases for this cpu
 *
 * The nr_* and max_hang_time members exist only with CONFIG_HIGH_RES_TIMERS;
 * softirq_expiry_lock and timer_waiters exist only with CONFIG_PREEMPT_RT.
 *
 * Note: next_timer is just an optimization for __remove_hrtimer().
 *         Do not dereference the pointer because it is not reliable on
 *         cross cpu removals.
 */
struct hrtimer_cpu_base {
        raw_spinlock_t                        lock;
        unsigned int                        cpu;
        unsigned int                        active_bases;
        unsigned int                        clock_was_set_seq;
        unsigned int                        hres_active                : 1,
                                        in_hrtirq                : 1,
                                        hang_detected                : 1,
                                        softirq_activated       : 1,
                                        online                        : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
        unsigned int                        nr_events;
        unsigned short                        nr_retries;
        unsigned short                        nr_hangs;
        unsigned int                        max_hang_time;
#endif
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                        softirq_expiry_lock;
        atomic_t                        timer_waiters;
#endif
        ktime_t                                expires_next;
        struct hrtimer                        *next_timer;
        ktime_t                                softirq_expires_next;
        struct hrtimer                        *softirq_next_timer;
        struct hrtimer_clock_base        clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;

/* Set both the hard (node.expires) and the soft expiry of @timer to @time. */
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
        struct timerqueue_node *tq = &timer->node;

        tq->expires = time;
        timer->_softexpires = time;
}

/*
 * Set the soft expiry of @timer to @time and the hard expiry to
 * @time + @delta, computed with ktime_add_safe().
 */
static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
{
        ktime_t hard = ktime_add_safe(time, delta);

        timer->_softexpires = time;
        timer->node.expires = hard;
}

/*
 * Like hrtimer_set_expires_range(), with the range @delta given in
 * nanoseconds and converted through ns_to_ktime().
 */
static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta)
{
        ktime_t hard = ktime_add_safe(time, ns_to_ktime(delta));

        timer->_softexpires = time;
        timer->node.expires = hard;
}

/* Set both the hard and the soft expiry of @timer from the raw s64 @tv64. */
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
        struct timerqueue_node *tq = &timer->node;

        tq->expires = tv64;
        timer->_softexpires = tv64;
}

/*
 * Advance both the hard and the soft expiry of @timer by @time, using
 * ktime_add_safe() for each addition.
 */
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
        ktime_t hard = ktime_add_safe(timer->node.expires, time);

        timer->node.expires = hard;
        timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
}

/*
 * Advance both the hard and the soft expiry of @timer by @ns nanoseconds
 * using ktime_add_ns().
 */
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
        ktime_t hard = ktime_add_ns(timer->node.expires, ns);

        timer->node.expires = hard;
        timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
}

static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
        return timer->node.expires;
}

/* Return the soft (earliest) expiry time of @timer. */
static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
{
        ktime_t soft = timer->_softexpires;

        return soft;
}

/* Return the hard expiry time of @timer as a raw s64 value. */
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
        s64 tv64 = timer->node.expires;

        return tv64;
}
/* Return the soft expiry time of @timer as a raw s64 value. */
static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
{
        s64 tv64 = timer->_softexpires;

        return tv64;
}

static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
{
        return ktime_to_ns(timer->node.expires);
}

/*
 * Return the hard expiry of @timer minus the current time of the timer's
 * clock base (timer->base->get_time()).
 */
static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
{
        ktime_t now = timer->base->get_time();

        return ktime_sub(timer->node.expires, now);
}

static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
{
        return timer->base->get_time();
}

/*
 * Return the hres_active bit of the CPU base @timer belongs to, or 0 when
 * CONFIG_HIGH_RES_TIMERS is not enabled.
 */
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return timer->base->cpu_base->hres_active;
}

/*
 * hrtimer_resolution is a real variable when CONFIG_HIGH_RES_TIMERS is
 * enabled; otherwise it is a macro expanding to LOW_RES_NSEC cast to
 * unsigned int, so it cannot be assigned to in that configuration.
 */
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;

extern void hrtimer_interrupt(struct clock_event_device *dev);

extern unsigned int hrtimer_resolution;

#else

#define hrtimer_resolution        (unsigned int)LOW_RES_NSEC

#endif

/*
 * Return the time left between @now and the hard expiry of @timer.  With
 * CONFIG_TIME_LOW_RES and a relatively armed timer, hrtimer_resolution is
 * subtracted from the result.
 */
static inline ktime_t
__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now)
{
        ktime_t remaining = ktime_sub(timer->node.expires, now);

        if (!IS_ENABLED(CONFIG_TIME_LOW_RES) || !timer->is_rel)
                return remaining;

        /*
         * Adjust relative timers for the extra we added in
         * hrtimer_start_range_ns() to prevent short timeouts.
         */
        remaining -= hrtimer_resolution;
        return remaining;
}

/*
 * __hrtimer_expires_remaining_adjusted() evaluated against the current
 * time of the clock base @timer is attached to.
 */
static inline ktime_t
hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
{
        ktime_t now = timer->base->get_time();

        return __hrtimer_expires_remaining_adjusted(timer, now);
}

/*
 * timerfd_clock_was_set() is provided out of line with CONFIG_TIMERFD and
 * is an empty inline stub otherwise, so callers need no #ifdef.
 */
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
#else
static inline void timerfd_clock_was_set(void) { }
#endif
extern void hrtimers_resume(void);

DECLARE_PER_CPU(struct tick_device, tick_cpu_device);

/*
 * hrtimer_cancel_wait_running - wait step used while a timer callback runs
 * @timer:        the timer being waited for
 *
 * With CONFIG_PREEMPT_RT the function is implemented out of line.  In all
 * other configurations it only executes cpu_relax() and does not touch
 * @timer.  Both variants take a const pointer so that the prototype is the
 * same regardless of the configuration.
 */
#ifdef CONFIG_PREEMPT_RT
void hrtimer_cancel_wait_running(const struct hrtimer *timer);
#else
static inline void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        cpu_relax();
}
#endif

/* Exported timer functions: */

/* Initialize timers: */
extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
                         enum hrtimer_mode mode);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
                                 enum hrtimer_mode mode);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
                                  enum hrtimer_mode mode);
extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                          clockid_t clock_id,
                                          enum hrtimer_mode mode);

extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
#else
/*
 * Without CONFIG_DEBUG_OBJECTS_TIMERS an on-stack timer needs no special
 * treatment: simply forward to hrtimer_init().
 */
static inline void hrtimer_init_on_stack(struct hrtimer *timer,
                                         clockid_t which_clock,
                                         enum hrtimer_mode mode)
{
        hrtimer_init(timer, which_clock, mode);
}

/*
 * Without CONFIG_DEBUG_OBJECTS_TIMERS an on-stack sleeper needs no special
 * treatment: simply forward to hrtimer_init_sleeper().
 */
static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                                 clockid_t clock_id,
                                                 enum hrtimer_mode mode)
{
        hrtimer_init_sleeper(sl, clock_id, mode);
}

/* Without CONFIG_DEBUG_OBJECTS_TIMERS there is nothing to tear down. */
static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
#endif

/* Basic timer operations: */
extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                   u64 range_ns, const enum hrtimer_mode mode);

/**
 * hrtimer_start - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * Equivalent to hrtimer_start_range_ns() with a range of 0 nanoseconds.
 */
static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
                                 const enum hrtimer_mode mode)
{
        hrtimer_start_range_ns(timer, tim, 0, mode);
}

extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);

/*
 * Start @timer using the expiry values already stored in it: the soft
 * expiry becomes the start time and the distance between the hard and the
 * soft expiry, in nanoseconds, becomes the range.
 */
static inline void hrtimer_start_expires(struct hrtimer *timer,
                                         enum hrtimer_mode mode)
{
        ktime_t soft = hrtimer_get_softexpires(timer);
        ktime_t hard = hrtimer_get_expires(timer);
        u64 range_ns = ktime_to_ns(ktime_sub(hard, soft));

        hrtimer_start_range_ns(timer, soft, range_ns, mode);
}

void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode);

/*
 * Start @timer again with the expiry values it already holds, interpreted
 * as absolute times (HRTIMER_MODE_ABS).
 */
static inline void hrtimer_restart(struct hrtimer *timer)
{
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

/* Query timers: */
extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);

/* Query the remaining time of @timer with the adjust argument set to false. */
static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
        ktime_t remaining = __hrtimer_get_remaining(timer, false);

        return remaining;
}

extern u64 hrtimer_get_next_event(void);
extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);

extern bool hrtimer_active(const struct hrtimer *timer);

/**
 * hrtimer_is_queued = check, whether the timer is on one of the queues
 * @timer:        Timer to check
 *
 * Returns: True if the timer is queued, false otherwise
 *
 * The function can be used lockless, but it gives only a current snapshot.
 */
static inline bool hrtimer_is_queued(struct hrtimer *timer)
{
        /* The READ_ONCE pairs with the update functions of timer->state */
        return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
}

/*
 * Helper function to check, whether the timer is running the callback
 * function: true when the clock base's running pointer equals @timer.
 */
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
        struct hrtimer *running = timer->base->running;

        return running == timer;
}

/* Forward a hrtimer so it expires after now: */
extern u64
hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);

/**
 * hrtimer_forward_now - forward the timer expiry so it expires after now
 * @timer:        hrtimer to forward
 * @interval:        the interval to forward
 *
 * Forward the timer expiry so it will expire after the current time
 * of the hrtimer clock base. Returns the number of overruns.
 *
 * Can be safely called from the callback function of @timer. If
 * called from other contexts @timer must neither be enqueued nor
 * running the callback and the caller needs to take care of
 * serialization.
 *
 * Note: This only updates the timer expiry value and does not requeue
 * the timer.
 */
static inline u64 hrtimer_forward_now(struct hrtimer *timer,
                                      ktime_t interval)
{
        ktime_t now = timer->base->get_time();

        return hrtimer_forward(timer, now, interval);
}

/* Precise sleep: */

extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                              const clockid_t clockid);

extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                    const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
                                          u64 delta,
                                          const enum hrtimer_mode mode,
                                          clockid_t clock_id);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);

/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);

/* Bootup initialization: */
extern void __init hrtimers_init(void);

/* Show pending timers: */
extern void sysrq_timer_list_show(void);

int hrtimers_prepare_cpu(unsigned int cpu);
int hrtimers_cpu_starting(unsigned int cpu);
/*
 * hrtimers_cpu_dying() only exists with CONFIG_HOTPLUG_CPU; otherwise the
 * name expands to NULL.
 */
#ifdef CONFIG_HOTPLUG_CPU
int hrtimers_cpu_dying(unsigned int cpu);
#else
#define hrtimers_cpu_dying        NULL
#endif

#endif



































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_FIB_RULES_H
#define __NET_FIB_RULES_H

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/fib_rules.h>
#include <linux/refcount.h>
#include <net/flow.h>
#include <net/rtnetlink.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>

/*
 * A range of kernel uids described by its two end points.
 * NOTE(review): presumably both @start and @end are inclusive - confirm
 * against the code that matches against the range.
 */
struct fib_kuid_range {
        kuid_t start;
        kuid_t end;
};

/*
 * struct fib_rule - a single rule
 *
 * The rule is linked through @list, reference counted through @refcnt and
 * carries an rcu_head (@rcu); @ctarget is an __rcu annotated pointer to
 * another rule.  Input and output interfaces are kept both as an index
 * (@iifindex, @oifindex) and as a name of up to IFNAMSIZ bytes (@iifname,
 * @oifname).
 *
 * NOTE(review): the meaning of the selector and action members (mark,
 * table, action, target, port and uid ranges, ...) is defined by the code
 * using this structure and is not visible here.
 */
struct fib_rule {
        struct list_head        list;
        int                        iifindex;
        int                        oifindex;
        u32                        mark;
        u32                        mark_mask;
        u32                        flags;
        u32                        table;
        u8                        action;
        u8                        l3mdev;
        u8                      proto;
        u8                        ip_proto;
        u32                        target;
        __be64                        tun_id;
        struct fib_rule __rcu        *ctarget;
        struct net                *fr_net;

        refcount_t                refcnt;
        u32                        pref;
        int                        suppress_ifgroup;
        int                        suppress_prefixlen;
        char                        iifname[IFNAMSIZ];
        char                        oifname[IFNAMSIZ];
        struct fib_kuid_range        uid_range;
        struct fib_rule_port_range        sport_range;
        struct fib_rule_port_range        dport_range;
        struct rcu_head                rcu;
};

/*
 * struct fib_lookup_arg - argument block for a lookup
 *
 * The FIB_LOOKUP_* constants defined inside the structure are the bit
 * values for @flags.
 * NOTE(review): presumably @result and @rule are filled in by the lookup
 * while the lookup_* members are supplied by the caller - confirm against
 * the lookup implementation.
 */
struct fib_lookup_arg {
        void                        *lookup_ptr;
        const void                *lookup_data;
        void                        *result;
        struct fib_rule                *rule;
        u32                        table;
        int                        flags;
#define FIB_LOOKUP_NOREF                1
#define FIB_LOOKUP_IGNORE_LINKSTATE        2
};

/*
 * Per-family description of a rule set: bookkeeping fields plus the
 * callbacks the generic rules code invokes.  An instance is handed to
 * fib_rules_register() as a template.
 */
struct fib_rules_ops {
        int                        family;
        struct list_head        list;
        int                        rule_size;
        int                        addr_size;
        int                        unresolved_rules;
        int                        nr_goto_rules;
        unsigned int                fib_rules_seq;

        /* Per-family callbacks operating on a single rule. */
        int                        (*action)(struct fib_rule *,
                                          struct flowi *, int,
                                          struct fib_lookup_arg *);
        bool                        (*suppress)(struct fib_rule *, int,
                                            struct fib_lookup_arg *);
        int                        (*match)(struct fib_rule *,
                                         struct flowi *, int);
        int                        (*configure)(struct fib_rule *,
                                             struct sk_buff *,
                                             struct fib_rule_hdr *,
                                             struct nlattr **,
                                             struct netlink_ext_ack *);
        int                        (*delete)(struct fib_rule *);
        int                        (*compare)(struct fib_rule *,
                                           struct fib_rule_hdr *,
                                           struct nlattr **);
        int                        (*fill)(struct fib_rule *, struct sk_buff *,
                                        struct fib_rule_hdr *);
        size_t                        (*nlmsg_payload)(struct fib_rule *);

        /* Called after modifications to the rules set, must flush
         * the route cache if one exists. */
        void                        (*flush_cache)(struct fib_rules_ops *ops);

        int                        nlgroup;
        /* netlink attribute policy; see FRA_GENERIC_POLICY for shared entries */
        const struct nla_policy        *policy;
        struct list_head        rules_list;
        struct module                *owner;
        struct net                *fro_net;
        struct rcu_head                rcu;
};

/*
 * Notifier payload carrying a rule.  The generic fib_notifier_info is kept
 * as the first member (see the marker below) and @rule is the rule the
 * notification refers to.
 */
struct fib_rule_notifier_info {
        struct fib_notifier_info info; /* must be first */
        struct fib_rule *rule;
};

/*
 * Designated-initializer entries for the FRA_* netlink attributes, one
 * entry per attribute giving its type and/or length limit.  The macro
 * expands to a bare initializer list, so it is meant to be placed inside
 * an array initializer indexed by FRA_* (NOTE(review): presumably each
 * family's struct nla_policy table - confirm at the users).
 * The FRA_UNSPEC entry sets strict_start_type to FRA_DPORT_RANGE + 1.
 */
#define FRA_GENERIC_POLICY \
        [FRA_UNSPEC]        = { .strict_start_type = FRA_DPORT_RANGE + 1 }, \
        [FRA_IIFNAME]        = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
        [FRA_OIFNAME]        = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
        [FRA_PRIORITY]        = { .type = NLA_U32 }, \
        [FRA_FWMARK]        = { .type = NLA_U32 }, \
        [FRA_TUN_ID]        = { .type = NLA_U64 }, \
        [FRA_FWMASK]        = { .type = NLA_U32 }, \
        [FRA_TABLE]     = { .type = NLA_U32 }, \
        [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
        [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
        [FRA_GOTO]        = { .type = NLA_U32 }, \
        [FRA_L3MDEV]        = { .type = NLA_U8 }, \
        [FRA_UID_RANGE]        = { .len = sizeof(struct fib_rule_uid_range) }, \
        [FRA_PROTOCOL]  = { .type = NLA_U8 }, \
        [FRA_IP_PROTO]  = { .type = NLA_U8 }, \
        [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, \
        [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }


static inline void fib_rule_get(struct fib_rule *rule)
{
        refcount_inc(&rule->refcnt);
}

/*
 * Drop one reference on @rule.  When the count reaches zero the rule is
 * released with kfree_rcu() through its embedded rcu head.
 */
static inline void fib_rule_put(struct fib_rule *rule)
{
        /* Other holders remain: nothing more to do. */
        if (!refcount_dec_and_test(&rule->refcnt))
                return;

        kfree_rcu(rule, rcu);
}

#ifdef CONFIG_NET_L3_MASTER_DEV
/*
 * Table id to use for @rule: rules flagged l3mdev take the table from the
 * lookup argument, all others use the table stored in the rule.
 */
static inline u32 fib_rule_get_table(struct fib_rule *rule,
                                     struct fib_lookup_arg *arg)
{
        if (rule->l3mdev)
                return arg->table;

        return rule->table;
}
#else
/* Without CONFIG_NET_L3_MASTER_DEV the rule's own table is always used. */
static inline u32 fib_rule_get_table(struct fib_rule *rule,
                                     struct fib_lookup_arg *arg)
{
        u32 table = rule->table;

        return table;
}
#endif

/*
 * Table id requested by a rule message: the FRA_TABLE attribute when it is
 * present, otherwise the table field of the header.
 */
static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
{
        struct nlattr *attr = nla[FRA_TABLE];

        if (!attr)
                return frh->table;

        return nla_get_u32(attr);
}

static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range)
{
        return range->start != 0 && range->end != 0;
}

/*
 * True when @port, converted from network byte order with ntohs(), lies
 * within [a->start, a->end] (both bounds included).
 */
static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a,
                                         __be16 port)
{
        if (ntohs(port) < a->start)
                return false;

        return ntohs(port) <= a->end;
}

static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a)
{
        return a->start != 0 && a->end != 0 && a->end < 0xffff &&
                a->start <= a->end;
}

static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a,
                                               struct fib_rule_port_range *b)
{
        return a->start == b->start &&
                a->end == b->end;
}

static inline bool fib_rule_requires_fldissect(struct fib_rule *rule)
{
        return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto ||
                fib_rule_port_range_set(&rule->sport_range) ||
                fib_rule_port_range_set(&rule->dport_range));
}

/*
 * Registration of a rules-ops instance for a network namespace.
 * fib_rules_register() takes a const template and returns a (non-const)
 * struct fib_rules_ops pointer, which is what fib_rules_unregister() accepts.
 */
struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *,
                                         struct net *);
void fib_rules_unregister(struct fib_rules_ops *);

/* Rule lookup and helpers operating on a registered ops instance. */
int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
                     struct fib_lookup_arg *);
int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
                         u32 flags);
bool fib_rule_matchall(const struct fib_rule *rule);
int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
                   struct netlink_ext_ack *extack);
unsigned int fib_rules_seq_read(struct net *net, int family);

/*
 * Netlink message handlers.
 * NOTE(review): presumably for the new-rule / delete-rule requests, going
 * by the names - confirm at the registration site.
 */
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
                   struct netlink_ext_ack *extack);
int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
                   struct netlink_ext_ack *extack);

/*
 * Declarations of the IPv6/IPv4 implementations of the match, action and
 * suppress callbacks (signatures mirror the corresponding members of
 * struct fib_rules_ops), wrapped in INDIRECT_CALLABLE_DECLARE().
 * NOTE(review): presumably so the generic code can call them directly
 * instead of through the function pointer - confirm against
 * <linux/indirect_call_wrapper.h>.
 */
INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule,
                                            struct flowi *fl, int flags));
INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule,
                                            struct flowi *fl, int flags));

INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule,
                            struct flowi *flp, int flags,
                            struct fib_lookup_arg *arg));
INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule,
                            struct flowi *flp, int flags,
                            struct fib_lookup_arg *arg));

INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule,
                                                int flags,
                                                struct fib_lookup_arg *arg));
INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule,
                                                int flags,
                                                struct fib_lookup_arg *arg));
#endif






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * RT Mutexes: blocking mutual exclusion locks with PI support
 *
 * started by Ingo Molnar and Thomas Gleixner:
 *
 *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * This file contains the private data structure and API definitions.
 */

#ifndef __KERNEL_RTMUTEX_COMMON_H
#define __KERNEL_RTMUTEX_COMMON_H

#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>

/*
 * This is the control structure for tasks blocked on a rt_mutex,
 * which is allocated on the kernel stack of the blocked task.
 *
 * @tree_entry:                pi node to enqueue into the mutex waiters tree
 * @pi_tree_entry:        pi node to enqueue into the mutex owner waiters tree
 * @task:                task reference to the blocked task
 * @lock:                rt_mutex this waiter belongs to; rt_mutex_top_waiter()
 *                        asserts that it matches the lock whose tree holds
 *                        the waiter
 * @ip:                        only present with CONFIG_DEBUG_RT_MUTEXES
 * @deadlock_task_pid:        only present with CONFIG_DEBUG_RT_MUTEXES
 * @deadlock_lock:        only present with CONFIG_DEBUG_RT_MUTEXES
 * @prio:                NOTE(review): presumably the priority used to order
 *                        the waiter in the trees - confirm in rtmutex.c
 * @deadline:                NOTE(review): presumably the deadline used to order
 *                        deadline tasks - confirm in rtmutex.c
 */
struct rt_mutex_waiter {
        struct rb_node          tree_entry;
        struct rb_node          pi_tree_entry;
        struct task_struct        *task;
        struct rt_mutex                *lock;
#ifdef CONFIG_DEBUG_RT_MUTEXES
        unsigned long                ip;
        struct pid                *deadlock_task_pid;
        struct rt_mutex                *deadlock_lock;
#endif
        int prio;
        u64 deadline;
};

/*
 * Various helpers to access the waiters-tree:
 */

#ifdef CONFIG_RT_MUTEXES

/* Returns 1 when the waiters tree of @lock is non-empty, 0 otherwise. */
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
        if (RB_EMPTY_ROOT(&lock->waiters.rb_root))
                return 0;

        return 1;
}

/*
 * Return the waiter stored in the leftmost node of @lock's waiters tree,
 * or NULL when the tree is empty.  The waiter found must point back at
 * @lock; anything else triggers BUG_ON().
 */
static inline struct rt_mutex_waiter *
rt_mutex_top_waiter(struct rt_mutex *lock)
{
        struct rb_node *first = rb_first_cached(&lock->waiters);
        struct rt_mutex_waiter *top;

        if (!first)
                return NULL;

        top = rb_entry(first, struct rt_mutex_waiter, tree_entry);
        BUG_ON(top->lock != lock);

        return top;
}

/* Returns 1 when the pi_waiters tree of task @p is non-empty, 0 otherwise. */
static inline int task_has_pi_waiters(struct task_struct *p)
{
        if (RB_EMPTY_ROOT(&p->pi_waiters.rb_root))
                return 0;

        return 1;
}

/*
 * Return the waiter embedding the leftmost node of @p's pi_waiters tree.
 * No emptiness check is done here: the leftmost pointer is converted
 * unconditionally, so callers are expected to have checked
 * task_has_pi_waiters() first.
 */
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
        struct rb_node *first = p->pi_waiters.rb_leftmost;

        return rb_entry(first, struct rt_mutex_waiter, pi_tree_entry);
}

#else

/* !CONFIG_RT_MUTEXES stub: a lock never has waiters. */
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
        return 0;
}

/* !CONFIG_RT_MUTEXES stub: there is never a top waiter. */
static inline struct rt_mutex_waiter *
rt_mutex_top_waiter(struct rt_mutex *lock)
{
        return NULL;
}

/* !CONFIG_RT_MUTEXES stub: a task never has PI waiters. */
static inline int task_has_pi_waiters(struct task_struct *p)
{
        return 0;
}

/* !CONFIG_RT_MUTEXES stub: there is never a top PI waiter. */
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
        return NULL;
}

#endif

/*
 * lock->owner state tracking:
 *
 * The owner word carries the RT_MUTEX_HAS_WAITERS flag in its lowest bit
 * alongside the owning task pointer.
 */
#define RT_MUTEX_HAS_WAITERS        1UL

/*
 * Read lock->owner once and return it as a task pointer with the
 * RT_MUTEX_HAS_WAITERS bit cleared.
 */
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
{
        unsigned long word = (unsigned long) READ_ONCE(lock->owner);

        word &= ~RT_MUTEX_HAS_WAITERS;

        return (struct task_struct *) word;
}

/*
 * Constants for rt mutex functions which have a selectable deadlock
 * detection.
 *
 * RT_MUTEX_MIN_CHAINWALK:        Stops the lock chain walk when there are
 *                                no further PI adjustments to be made.
 *
 * RT_MUTEX_FULL_CHAINWALK:        Invoke deadlock detection with a full
 *                                walk of the lock chain.
 */
enum rtmutex_chainwalk {
        /* stop walking once no further PI adjustment is needed */
        RT_MUTEX_MIN_CHAINWALK,
        /* walk the whole lock chain to detect deadlocks */
        RT_MUTEX_FULL_CHAINWALK,
};

/*
 * PI-futex support (proxy locking functions, etc.):
 *
 * The *_proxy_* functions take the rt_mutex together with an explicit
 * waiter and/or task argument.
 * NOTE(review): presumably this lets one task set up or finish lock
 * acquisition on behalf of another - confirm in rtmutex.c.
 */
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                       struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                     struct rt_mutex_waiter *waiter,
                                     struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                     struct rt_mutex_waiter *waiter,
                                     struct task_struct *task);
extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
                               struct hrtimer_sleeper *to,
                               struct rt_mutex_waiter *waiter);
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
                                 struct rt_mutex_waiter *waiter);

/* Futex-side trylock entry points. */
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
extern int __rt_mutex_futex_trylock(struct rt_mutex *l);

/*
 * Futex-side unlock entry points; the __ variant additionally takes a
 * wake queue head.
 */
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
                                 struct wake_q_head *wqh);

/*
 * Takes a wake queue head.
 * NOTE(review): presumably run after an unlock to wake the queued
 * tasks - confirm in rtmutex.c.
 */
extern void rt_mutex_postunlock(struct wake_q_head *wake_q);

#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
#else
# include "rtmutex.h"
#endif

#endif
















































































































    1 









































































    1 

    1 



    1 


    1 

































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
// SPDX-License-Identifier: GPL-2.0
/*
 * arch-independent dma-mapping routines
 *
 * Copyright (c) 2006  SUSE Linux Products GmbH
 * Copyright (c) 2006  Tejun Heo <teheo@suse.de>
 */
#include <linux/memblock.h> /* for max_pfn */
#include <linux/acpi.h>
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "debug.h"
#include "direct.h"

/*
 * Managed DMA API
 *
 * struct dma_devres is the devres payload recording one managed
 * allocation: dmam_alloc_attrs() fills it in and dmam_release() passes the
 * recorded values to dma_free_attrs().
 */
struct dma_devres {
        /* size passed to dmam_alloc_attrs() */
        size_t                size;
        /* CPU address returned by dma_alloc_attrs(); key used by dmam_match() */
        void                *vaddr;
        /* DMA handle returned through the dma_handle out argument */
        dma_addr_t        dma_handle;
        /* DMA_ATTR_* flags the allocation was made with */
        unsigned long        attrs;
};

/*
 * devres release callback: free the allocation recorded in @res with the
 * size, address, handle and attributes stored at allocation time.
 */
static void dmam_release(struct device *dev, void *res)
{
        struct dma_devres *dr = res;

        dma_free_attrs(dev, dr->size, dr->vaddr, dr->dma_handle, dr->attrs);
}

/*
 * devres match callback: entries are identified by their CPU address.
 * Returns 1 on a match and 0 otherwise; a match whose size or DMA handle
 * differs from @match_data triggers a warning but still counts as a match.
 */
static int dmam_match(struct device *dev, void *res, void *match_data)
{
        struct dma_devres *entry = res;
        struct dma_devres *wanted = match_data;

        if (entry->vaddr != wanted->vaddr)
                return 0;

        WARN_ON(entry->size != wanted->size ||
                entry->dma_handle != wanted->dma_handle);
        return 1;
}

/**
 * dmam_free_coherent - Managed dma_free_coherent()
 * @dev: Device to free coherent memory for
 * @size: Size of allocation
 * @vaddr: Virtual address of the memory to free
 * @dma_handle: DMA handle of the memory to free
 *
 * Removes the devres entry tracking the allocation and then frees the
 * memory with dma_free_coherent().  A warning is emitted when no matching
 * devres entry exists.
 */
void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
                        dma_addr_t dma_handle)
{
        struct dma_devres match_data = {
                .size = size,
                .vaddr = vaddr,
                .dma_handle = dma_handle,
        };
        int err;

        /* Drop the tracking entry before the memory itself is released. */
        err = devres_destroy(dev, dmam_release, dmam_match, &match_data);
        WARN_ON(err);

        dma_free_coherent(dev, size, vaddr, dma_handle);
}
EXPORT_SYMBOL(dmam_free_coherent);

/**
 * dmam_alloc_attrs - Managed dma_alloc_attrs()
 * @dev: Device to allocate memory for
 * @size: Size of allocation
 * @dma_handle: Out argument for allocated DMA handle
 * @gfp: Allocation flags
 * @attrs: Flags in the DMA_ATTR_* namespace.
 *
 * Managed dma_alloc_attrs().  Memory allocated using this function will be
 * automatically released on driver detach.
 *
 * RETURNS:
 * Pointer to allocated memory on success, NULL on failure.
 */
void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
                gfp_t gfp, unsigned long attrs)
{
        struct dma_devres *dr;
        void *vaddr;

        /* Get the tracking record first so a later failure is easy to undo. */
        dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
        if (!dr)
                return NULL;

        vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs);
        if (vaddr) {
                /* Record everything dmam_release() needs, then publish. */
                dr->size = size;
                dr->vaddr = vaddr;
                dr->dma_handle = *dma_handle;
                dr->attrs = attrs;
                devres_add(dev, dr);
                return vaddr;
        }

        devres_free(dr);
        return NULL;
}
EXPORT_SYMBOL(dmam_alloc_attrs);

/*
 * Decide whether the direct mapping code should be used for @dev.
 *
 * Devices without dma_map_ops always go direct.  With
 * CONFIG_DMA_OPS_BYPASS, a device that has dma_ops_bypass set also goes
 * direct when the smaller non-zero value of @mask and the bus DMA limit
 * covers the mask required for direct mapping.  Otherwise @ops is used.
 */
static bool dma_go_direct(struct device *dev, dma_addr_t mask,
                const struct dma_map_ops *ops)
{
        if (likely(!ops))
                return true;
#ifdef CONFIG_DMA_OPS_BYPASS
        if (!dev->dma_ops_bypass)
                return false;
        return min_not_zero(mask, dev->bus_dma_limit) >=
                    dma_direct_get_required_mask(dev);
#else
        return false;
#endif
}


/*
 * Check if the devices uses a direct mapping for streaming DMA operations.
 * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
 * enough.
 */
static inline bool dma_alloc_direct(struct device *dev,
                const struct dma_map_ops *ops)
{
        return dma_go_direct(dev, dev->coherent_dma_mask, ops);
}

static inline bool dma_map_direct(struct device *dev,
                const struct dma_map_ops *ops)
{
        return dma_go_direct(dev, *dev->dma_mask, ops);
}

/*
 * Map @size bytes at @offset within @page for streaming DMA in direction
 * @dir.  Returns the DMA address produced by the direct or ops-based
 * implementation, or DMA_MAPPING_ERROR when the device has no DMA mask.
 * An invalid @dir is a BUG.
 */
dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
                size_t offset, size_t size, enum dma_data_direction dir,
                unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        dma_addr_t dma;

        BUG_ON(!valid_dma_direction(dir));

        if (WARN_ON_ONCE(!dev->dma_mask))
                return DMA_MAPPING_ERROR;

        if (!dma_map_direct(dev, ops))
                dma = ops->map_page(dev, page, offset, size, dir, attrs);
        else
                dma = dma_direct_map_page(dev, page, offset, size, dir, attrs);

        /* Report the mapping to dma-debug. */
        debug_dma_map_page(dev, page, offset, size, dir, dma);

        return dma;
}
EXPORT_SYMBOL(dma_map_page_attrs);

/*
 * Undo a page mapping.  Uses the direct implementation when the device
 * maps directly, otherwise the optional ops->unmap_page hook, and reports
 * the unmap to dma-debug afterwards.  An invalid @dir is a BUG.
 */
void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
                enum dma_data_direction dir, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->unmap_page)
                        ops->unmap_page(dev, addr, size, dir, attrs);
        } else {
                dma_direct_unmap_page(dev, addr, size, dir, attrs);
        }

        debug_dma_unmap_page(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);

/*
 * dma_map_sg_attrs returns 0 on error and > 0 on success.
 * It should never return a value < 0; a negative count coming back from
 * the implementation is treated as a BUG.
 */
int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
                enum dma_data_direction dir, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        int mapped;

        BUG_ON(!valid_dma_direction(dir));

        if (WARN_ON_ONCE(!dev->dma_mask))
                return 0;

        if (!dma_map_direct(dev, ops))
                mapped = ops->map_sg(dev, sg, nents, dir, attrs);
        else
                mapped = dma_direct_map_sg(dev, sg, nents, dir, attrs);

        BUG_ON(mapped < 0);
        debug_dma_map_sg(dev, sg, nents, mapped, dir);

        return mapped;
}
EXPORT_SYMBOL(dma_map_sg_attrs);

/*
 * Undo a scatterlist mapping.  dma-debug is told first, then the direct
 * implementation or the optional ops->unmap_sg hook performs the unmap.
 * An invalid @dir is a BUG.
 */
void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
                                      int nents, enum dma_data_direction dir,
                                      unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));
        debug_dma_unmap_sg(dev, sg, nents, dir);

        if (!dma_map_direct(dev, ops)) {
                if (ops->unmap_sg)
                        ops->unmap_sg(dev, sg, nents, dir, attrs);
        } else {
                dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
        }
}
EXPORT_SYMBOL(dma_unmap_sg_attrs);

/*
 * Map the physical resource at @phys_addr for DMA.  Returns
 * DMA_MAPPING_ERROR when the device has no DMA mask, when @phys_addr has a
 * valid pfn (i.e. is RAM), or when the device's ops provide no
 * map_resource hook.  An invalid @dir is a BUG.
 */
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
                size_t size, enum dma_data_direction dir, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        dma_addr_t dma = DMA_MAPPING_ERROR;

        BUG_ON(!valid_dma_direction(dir));

        if (WARN_ON_ONCE(!dev->dma_mask))
                return DMA_MAPPING_ERROR;

        /* Don't allow RAM to be mapped */
        if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
                return DMA_MAPPING_ERROR;

        if (!dma_map_direct(dev, ops)) {
                /* Without the hook the error value set above is kept. */
                if (ops->map_resource)
                        dma = ops->map_resource(dev, phys_addr, size, dir,
                                                attrs);
        } else {
                dma = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
        }

        debug_dma_map_resource(dev, phys_addr, size, dir, dma);
        return dma;
}
EXPORT_SYMBOL(dma_map_resource);

/*
 * Undo a resource mapping.  Only the ops-based path has work to do here
 * (through the optional unmap_resource hook); in the direct case nothing
 * is called.  dma-debug is informed in both cases.
 */
void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
                enum dma_data_direction dir, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->unmap_resource)
                        ops->unmap_resource(dev, addr, size, dir, attrs);
        }

        debug_dma_unmap_resource(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_resource);

/*
 * Sync a single mapping for CPU access via the direct implementation or
 * the optional ops->sync_single_for_cpu hook, then notify dma-debug.
 * An invalid @dir is a BUG.
 */
void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
                enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->sync_single_for_cpu)
                        ops->sync_single_for_cpu(dev, addr, size, dir);
        } else {
                dma_direct_sync_single_for_cpu(dev, addr, size, dir);
        }

        debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_sync_single_for_cpu);

/*
 * Sync a single mapping for device access via the direct implementation
 * or the optional ops->sync_single_for_device hook, then notify dma-debug.
 * An invalid @dir is a BUG.
 */
void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
                size_t size, enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->sync_single_for_device)
                        ops->sync_single_for_device(dev, addr, size, dir);
        } else {
                dma_direct_sync_single_for_device(dev, addr, size, dir);
        }

        debug_dma_sync_single_for_device(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_sync_single_for_device);

/*
 * Sync a scatterlist mapping for CPU access via the direct implementation
 * or the optional ops->sync_sg_for_cpu hook, then notify dma-debug.
 * An invalid @dir is a BUG.
 */
void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
                    int nelems, enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->sync_sg_for_cpu)
                        ops->sync_sg_for_cpu(dev, sg, nelems, dir);
        } else {
                dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
        }

        debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_cpu);

/*
 * Sync a scatterlist mapping for device access via the direct
 * implementation or the optional ops->sync_sg_for_device hook, then notify
 * dma-debug.  An invalid @dir is a BUG.
 */
void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
                       int nelems, enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        BUG_ON(!valid_dma_direction(dir));

        if (!dma_map_direct(dev, ops)) {
                if (ops->sync_sg_for_device)
                        ops->sync_sg_for_device(dev, sg, nelems, dir);
        } else {
                dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
        }

        debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_device);

/*
 * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
 * that the intention is to allow exporting memory allocated via the
 * coherent DMA APIs through the dma_buf API, which only accepts a
 * scattertable.  This presents a couple of problems:
 * 1. Not all memory allocated via the coherent DMA APIs is backed by
 *    a struct page
 * 2. Passing coherent DMA memory into the streaming APIs is not allowed
 *    as we will try to flush the memory through a different alias to that
 *    actually being used (and the flushes are redundant.)
 */
/*
 * Fill @sgt for a coherent allocation.  Returns the result of the direct
 * or ops-based implementation, or -ENXIO when the device's ops have no
 * get_sgtable hook.
 */
int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (!dma_alloc_direct(dev, ops)) {
                if (!ops->get_sgtable)
                        return -ENXIO;
                return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
                                        attrs);
        }

        return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
                        attrs);
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);

#ifdef CONFIG_MMU
/*
 * Return the page attributes used for mapping dma_alloc_* memory, either in
 * kernel space if remapping is needed, or to userspace through dma_mmap_*.
 *
 * The protection is first marked decrypted when the device requires
 * unencrypted DMA.  Coherent devices get that value unchanged; otherwise
 * write-combine is used when requested (and supported by the
 * architecture), falling back to the DMA-coherent protection.
 */
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
        pgprot_t out = prot;

        if (force_dma_unencrypted(dev))
                out = pgprot_decrypted(out);

        if (dev_is_dma_coherent(dev))
                return out;

#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
        if (attrs & DMA_ATTR_WRITE_COMBINE)
                return pgprot_writecombine(out);
#endif
        return pgprot_dmacoherent(out);
}
#endif /* CONFIG_MMU */

/**
 * dma_can_mmap - check if a given device supports dma_mmap_*
 * @dev: device to check
 *
 * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to
 * map DMA allocations to userspace.  For devices using dma_map_ops this is
 * the case when the ops provide an mmap hook.
 */
bool dma_can_mmap(struct device *dev)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (!dma_alloc_direct(dev, ops))
                return ops->mmap != NULL;

        return dma_direct_can_mmap(dev);
}
EXPORT_SYMBOL_GPL(dma_can_mmap);

/**
 * dma_mmap_attrs - map a coherent DMA allocation into user space
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @vma: vm_area_struct describing requested user mapping
 * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
 * @dma_addr: device-view address returned from dma_alloc_attrs
 * @size: size of memory originally requested in dma_alloc_attrs
 * @attrs: attributes of mapping properties requested in dma_alloc_attrs
 *
 * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user
 * space.  The coherent DMA buffer must not be freed by the driver until the
 * user space mapping has been released.
 *
 * Returns the result of the direct or ops-based mmap implementation, or
 * -ENXIO when the device's ops have no mmap hook.
 */
int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (!dma_alloc_direct(dev, ops)) {
                if (!ops->mmap)
                        return -ENXIO;
                return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
        }

        return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
                        attrs);
}
EXPORT_SYMBOL(dma_mmap_attrs);

/*
 * Report the DMA mask @dev requires: the direct-mapping value, the value
 * from ops->get_required_mask when that hook exists, or a 32-bit mask.
 */
u64 dma_get_required_mask(struct device *dev)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        u64 mask;

        if (dma_alloc_direct(dev, ops)) {
                mask = dma_direct_get_required_mask(dev);
        } else if (ops->get_required_mask) {
                mask = ops->get_required_mask(dev);
        } else {
                /*
                 * We require every DMA ops implementation to at least
                 * support a 32-bit DMA mask (and use bounce buffering if
                 * that isn't supported in hardware).  As the direct mapping
                 * code has its own routine to actually report an optimal
                 * mask we default to 32-bit here as that is the right thing
                 * for most IOMMUs, and at least not actively harmful in
                 * general.
                 */
                mask = DMA_BIT_MASK(32);
        }

        return mask;
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);

/*
 * Allocate @size bytes of DMA memory for @dev.  The per-device coherent
 * pool is tried first; otherwise the direct allocator or ops->alloc is
 * used.  Returns the CPU address (with the DMA address stored in
 * *@dma_handle), or NULL when the device's ops have no alloc hook or the
 * implementation returned NULL.
 */
void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
                gfp_t flag, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        void *vaddr;

        WARN_ON_ONCE(!dev->coherent_dma_mask);

        if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &vaddr))
                return vaddr;

        /* let the implementation decide on the zone to allocate from: */
        flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);

        if (dma_alloc_direct(dev, ops)) {
                vaddr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
        } else {
                if (!ops->alloc)
                        return NULL;
                vaddr = ops->alloc(dev, size, dma_handle, flag, attrs);
        }

        debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
        return vaddr;
}
EXPORT_SYMBOL(dma_alloc_attrs);

/*
 * Free memory obtained from dma_alloc_attrs().  Memory that came from the
 * per-device coherent pool is returned there; otherwise a NULL @cpu_addr
 * is ignored and the direct implementation or the optional ops->free hook
 * releases the buffer.
 */
void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
                dma_addr_t dma_handle, unsigned long attrs)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        int order = get_order(size);

        if (dma_release_from_dev_coherent(dev, order, cpu_addr))
                return;
        /*
         * On non-coherent platforms which implement DMA-coherent buffers via
         * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
         * this far in IRQ context is a) at risk of a BUG_ON() or trying to
         * sleep on some machines, and b) an indication that the driver is
         * probably misusing the coherent API anyway.
         */
        WARN_ON(irqs_disabled());

        if (!cpu_addr)
                return;

        debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);

        if (!dma_alloc_direct(dev, ops)) {
                if (ops->free)
                        ops->free(dev, size, cpu_addr, dma_handle, attrs);
        } else {
                dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
        }
}
EXPORT_SYMBOL(dma_free_attrs);

/*
 * dma_alloc_pages - allocate pages for DMA on behalf of @dev
 *
 * Warns once and returns NULL when the device has no coherent DMA mask or
 * when @gfp carries a zone modifier (__GFP_DMA, __GFP_DMA32, __GFP_HIGHMEM).
 * @size is rounded up to a page multiple before the request is passed to
 * dma_direct_alloc_pages() or ops->alloc_pages().  NULL is also returned
 * when the device is not directly mapped and its ops lack ->alloc_pages.
 *
 * A successful allocation is registered with dma-debug as a page mapping
 * of the rounded-up size at *@dma_handle.
 */
struct page *dma_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        struct page *page;

        if (WARN_ON_ONCE(!dev->coherent_dma_mask))
                return NULL;
        if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
                return NULL;

        size = PAGE_ALIGN(size);
        if (dma_alloc_direct(dev, ops))
                page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
        else if (ops->alloc_pages)
                page = ops->alloc_pages(dev, size, dma_handle, dir, gfp);
        else
                return NULL;

        /*
         * Only a successful allocation is a mapping worth tracking: on
         * failure there is no page to describe and *dma_handle may never
         * have been written.  dma_alloc_noncoherent() guards its
         * debug_dma_map_page() call the same way.
         */
        if (page)
                debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);

        return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);

/*
 * dma_free_pages - release pages obtained from dma_alloc_pages()
 *
 * @size is rounded up to a page multiple, the mapping is dropped from
 * dma-debug, and the pages go back through dma_direct_free_pages() or,
 * when present, ops->free_pages().
 */
void dma_free_pages(struct device *dev, size_t size, struct page *page,
                dma_addr_t dma_handle, enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        size = PAGE_ALIGN(size);
        debug_dma_unmap_page(dev, dma_handle, size, dir);

        if (dma_alloc_direct(dev, ops)) {
                dma_direct_free_pages(dev, size, page, dma_handle, dir);
                return;
        }

        /* Without a ->free_pages method there is nothing more to do. */
        if (ops->free_pages)
                ops->free_pages(dev, size, page, dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_pages);

/*
 * dma_alloc_noncoherent - allocate a non-coherent DMA buffer for @dev
 *
 * When the device's ops provide ->alloc_noncoherent, @size is rounded up
 * to a page multiple, the method is called, and a non-NULL result is
 * registered with dma-debug.  In every other case the buffer comes from
 * dma_alloc_pages() and the page's kernel address is returned.
 * Returns NULL when the chosen allocator fails.
 */
void *dma_alloc_noncoherent(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);
        struct page *page;
        void *vaddr;

        if (ops && ops->alloc_noncoherent) {
                size = PAGE_ALIGN(size);
                vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
                if (vaddr)
                        debug_dma_map_page(dev, virt_to_page(vaddr), 0, size,
                                           dir, *dma_handle);
                return vaddr;
        }

        /* Generic fallback: page allocation does its own dma-debug tracking. */
        page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
        return page ? page_address(page) : NULL;
}
EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);

/*
 * dma_free_noncoherent - release a buffer from dma_alloc_noncoherent()
 *
 * Mirrors the allocation side: ops->free_noncoherent() is used when the
 * device's ops provide it (after rounding @size up to a page multiple and
 * dropping the dma-debug entry); otherwise the buffer is converted back to
 * its page and handed to dma_free_pages().
 */
void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
                dma_addr_t dma_handle, enum dma_data_direction dir)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (ops && ops->free_noncoherent) {
                size = PAGE_ALIGN(size);
                debug_dma_unmap_page(dev, dma_handle, size, dir);
                ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
                return;
        }

        dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_noncoherent);

/*
 * dma_supported - ask whether @dev can do DMA with addressing mask @mask
 *
 * Devices without dma_map_ops are answered by dma_direct_supported().
 * Devices whose ops have no ->dma_supported method are reported as
 * supported (1); otherwise the method's own answer is returned.
 */
int dma_supported(struct device *dev, u64 mask)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        /*
         * ->dma_supported sets the bypass flag, so we must always call
         * into the method here unless the device is truly direct mapped.
         */
        if (ops == NULL)
                return dma_direct_supported(dev, mask);

        return ops->dma_supported ? ops->dma_supported(dev, mask) : 1;
}
EXPORT_SYMBOL(dma_supported);

/*
 * Hook called by dma_set_mask() before the new mask is stored.  With
 * CONFIG_ARCH_HAS_DMA_SET_MASK it is a real function defined elsewhere;
 * otherwise it expands to an empty statement.
 */
#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
void arch_dma_set_mask(struct device *dev, u64 mask);
#else
#define arch_dma_set_mask(dev, mask)        do { } while (0)
#endif

/*
 * dma_set_mask - set the streaming DMA addressing mask of @dev
 *
 * Returns -EIO when the device has no dma_mask storage or when
 * dma_supported() rejects the (truncated) mask.  On success the
 * architecture hook is called, the mask is stored and 0 is returned.
 */
int dma_set_mask(struct device *dev, u64 mask)
{
        /*
         * Truncate the mask to the actually supported dma_addr_t width to
         * avoid generating unsupportable addresses.
         */
        u64 usable = (dma_addr_t)mask;

        if (!dev->dma_mask)
                return -EIO;
        if (!dma_supported(dev, usable))
                return -EIO;

        arch_dma_set_mask(dev, usable);
        *dev->dma_mask = usable;
        return 0;
}
EXPORT_SYMBOL(dma_set_mask);

#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
/*
 * dma_set_coherent_mask - set the coherent DMA addressing mask of @dev
 *
 * Generic version, compiled only when the architecture does not supply its
 * own.  Returns -EIO when dma_supported() rejects the (truncated) mask,
 * otherwise stores it in dev->coherent_dma_mask and returns 0.
 */
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
        /*
         * Truncate the mask to the actually supported dma_addr_t width to
         * avoid generating unsupportable addresses.
         */
        u64 usable = (dma_addr_t)mask;

        if (dma_supported(dev, usable)) {
                dev->coherent_dma_mask = usable;
                return 0;
        }

        return -EIO;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
#endif

/*
 * dma_max_mapping_size - largest single mapping usable for @dev
 *
 * Directly mapped devices are answered by dma_direct_max_mapping_size();
 * other devices by ops->max_mapping_size() when it exists.  SIZE_MAX is
 * returned when neither source applies.
 */
size_t dma_max_mapping_size(struct device *dev)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (dma_map_direct(dev, ops))
                return dma_direct_max_mapping_size(dev);

        if (ops && ops->max_mapping_size)
                return ops->max_mapping_size(dev);

        return SIZE_MAX;
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);

/*
 * dma_need_sync - tell whether @dma_addr needs explicit sync calls
 *
 * Directly mapped devices defer to dma_direct_need_sync().  For all other
 * devices the answer is true as soon as the ops implement either
 * ->sync_single_for_cpu or ->sync_single_for_device.
 */
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (!dma_map_direct(dev, ops))
                return ops->sync_single_for_cpu || ops->sync_single_for_device;

        return dma_direct_need_sync(dev, dma_addr);
}
EXPORT_SYMBOL_GPL(dma_need_sync);

/*
 * dma_get_merge_boundary - query the merge boundary of @dev
 *
 * Returns whatever ops->get_merge_boundary() reports, or 0 when the device
 * has no ops or the ops lack that method.
 */
unsigned long dma_get_merge_boundary(struct device *dev)
{
        const struct dma_map_ops *ops = get_dma_ops(dev);

        if (ops && ops->get_merge_boundary)
                return ops->get_merge_boundary(dev);

        return 0;        /* can't merge */
}
EXPORT_SYMBOL_GPL(dma_get_merge_boundary);

















































    4 











    4 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H

#include <linux/kernel.h>

/*
 * This is largely generic for little-endian machines, but the
 * optimal byte mask counting is probably going to be something
 * that is architecture-specific. If you have a reliably fast
 * bit count instruction, that might be better than the multiply
 * and shift, for example.
 */
struct word_at_a_time {
        /*
         * Constants consumed by has_zero(): one_bits is subtracted from the
         * word under test and high_bits masks the result.
         */
        const unsigned long one_bits, high_bits;
};

/*
 * Initializer for struct word_at_a_time: one_bits from REPEAT_BYTE(0x01),
 * high_bits from REPEAT_BYTE(0x80).  REPEAT_BYTE is defined elsewhere;
 * presumably it replicates the byte across the whole word.
 */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

#ifdef CONFIG_64BIT

/*
 * Jan Achrenius on G+: microoptimized version of
 * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
 * that works for the bytemasks without having to
 * mask them first.
 */
static inline long count_masked_bytes(unsigned long mask)
{
        /* Multiply, then keep only the top byte of the 64-bit product. */
        const unsigned long spread = 0x0001020304050608ul;

        return (mask * spread) >> 56;
}

#else        /* 32-bit case */

/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline long count_masked_bytes(long mask)
{
        /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
        long count = (mask + 0x0ff0001) >> 23;

        /* Fix the 1 for 00 case */
        return mask & count;
}

#endif

/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
        /*
         * Subtract the per-byte constant, drop bits that were already set
         * in @a, and keep only the positions selected by high_bits.
         */
        unsigned long lowered = a - c->one_bits;
        unsigned long found = lowered & ~a & c->high_bits;

        /* The same mask is both stored for the caller and returned. */
        *bits = found;
        return found;
}

static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
        /* This implementation needs no preparation: @a and @c go unused. */
        (void)a;
        (void)c;

        return bits;
}

static inline unsigned long create_zero_mask(unsigned long bits)
{
        /* Set every bit strictly below the lowest set bit of @bits. */
        unsigned long below = ~bits & (bits - 1);

        /* Dropping the low seven bits leaves whole 0xff bytes. */
        return below >> 7;
}

/*
 * The mask we created is directly usable as a bytemask, so this
 * macro expands to its argument unchanged.
 */
#define zero_bytemask(mask) (mask)

static inline unsigned long find_zero(unsigned long mask)
{
        /* Thin wrapper: the answer is count_masked_bytes() of the mask. */
        long index = count_masked_bytes(mask);

        return index;
}

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 *
 * The fast path is the single load at label 1.  The code at label 3 lives
 * in the .fixup section and is tied to label 1 by _ASM_EXTABLE(1b, 3b)
 * (a macro defined elsewhere; presumably it redirects a fault at 1 to 3).
 * It reloads the word from the address rounded down to a word boundary,
 * shifts the result right by eight times the original byte offset, and
 * jumps back to label 2.
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
        unsigned long ret, dummy;

        asm(
                "1:\tmov %2,%0\n"               /* normal path: ret = *addr */
                "2:\n"
                ".section .fixup,\"ax\"\n"
                "3:\t"
                "lea %2,%1\n\t"                 /* dummy = addr */
                "and %3,%1\n\t"                 /* round down to a word boundary */
                "mov (%1),%0\n\t"               /* ret = aligned word */
                "leal %2,%%ecx\n\t"             /* low 32 bits of addr */
                "andl %4,%%ecx\n\t"             /* byte offset within the word */
                "shll $3,%%ecx\n\t"             /* convert bytes to bits */
                "shr %%cl,%0\n\t"               /* shift; vacated high bits are zero */
                "jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                /* "c" pins dummy to the cx register, which the fixup also uses via %ecx/%cl */
                :"=&r" (ret),"=&c" (dummy)
                :"m" (*(unsigned long *)addr),
                 "i" (-sizeof(unsigned long)),          /* %3: alignment mask */
                 "i" (sizeof(unsigned long)-1));        /* %4: offset mask */
        return ret;
}

#endif /* _ASM_WORD_AT_A_TIME_H */














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
// SPDX-License-Identifier: GPL-2.0
/*
 * device.h - generic, centralized driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 *
 * See Documentation/driver-api/driver-model/ for more information.
 */

#ifndef _DEVICE_H_
#define _DEVICE_H_

#include <linux/dev_printk.h>
#include <linux/energy_model.h>
#include <linux/ioport.h>
#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/device/bus.h>
#include <linux/device/class.h>
#include <linux/device/driver.h>
#include <linux/cleanup.h>
#include <asm/device.h>

struct device;
struct device_private;
struct device_driver;
struct driver_private;
struct module;
struct class;
struct subsys_private;
struct device_node;
struct fwnode_handle;
struct iommu_ops;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;

/**
 * struct subsys_interface - interfaces to device functions
 * @name:       name of the device function
 * @subsys:     subsystem of the devices to attach to
 * @node:       the list of functions registered at the subsystem
 * @add_dev:    device hookup to device function handler
 * @remove_dev: device hookup to device function handler
 *
 * Simple interfaces attached to a subsystem. Multiple interfaces can
 * attach to a subsystem and its devices. Unlike drivers, they do not
 * exclusively claim or control devices. Interfaces usually represent
 * a specific functionality of a subsystem/class of devices.
 */
struct subsys_interface {
        const char *name;
        struct bus_type *subsys;
        struct list_head node;
        int (*add_dev)(struct device *dev, struct subsys_interface *sif);
        void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
};

/*
 * Registration entry points, implemented outside this header.  The first
 * pair takes a struct subsys_interface; the second pair takes a bus_type
 * together with an array of attribute groups.
 */
int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);

int subsys_system_register(struct bus_type *subsys,
                           const struct attribute_group **groups);
int subsys_virtual_register(struct bus_type *subsys,
                            const struct attribute_group **groups);

/*
 * The type of device that "struct device" is embedded in. A class
 * or bus can contain devices of different types
 * like "partitions" and "disks", "mouse" and "event".
 * This identifies the device type and carries type-specific
 * information, equivalent to the kobj_type of a kobject.
 * If "name" is specified, the uevent will contain it in
 * the DEVTYPE variable.
 */
struct device_type {
        const char *name;       /* optional; reported as DEVTYPE in the uevent */
        const struct attribute_group **groups;
        /* Type-specific callbacks; each receives the device it concerns. */
        int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
        char *(*devnode)(struct device *dev, umode_t *mode,
                         kuid_t *uid, kgid_t *gid);
        void (*release)(struct device *dev);

        const struct dev_pm_ops *pm;
};

/*
 * Interface for exporting device attributes: a generic struct attribute
 * plus a show callback that receives an output buffer and a store callback
 * that receives an input buffer and its length.
 */
struct device_attribute {
        struct attribute        attr;
        ssize_t (*show)(struct device *dev, struct device_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
};

/*
 * A device_attribute extended with a pointer to the variable it exposes.
 * The DEVICE_ULONG_ATTR/DEVICE_INT_ATTR/DEVICE_BOOL_ATTR macros fill @var
 * with the address of the variable named by the caller.
 */
struct dev_ext_attribute {
        struct device_attribute attr;
        void *var;
};

/*
 * Show/store helpers for unsigned long, int and bool variables.  They are
 * the callbacks installed by the DEVICE_*_ATTR macros named above; their
 * bodies are not part of this header.
 */
ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
                          char *buf);
ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count);
ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);

/*
 * Attribute definition helpers.  Each macro defines a variable named
 * dev_attr_<_name>:
 *  - DEVICE_ATTR*() define a struct device_attribute initialized through
 *    the matching __ATTR*() helper; the ADMIN variants pass mode 0600
 *    (read-write) or 0400 (read-only).
 *  - DEVICE_ULONG_ATTR/DEVICE_INT_ATTR/DEVICE_BOOL_ATTR define a struct
 *    dev_ext_attribute wired to the device_show_*()/device_store_*()
 *    helpers and to the address of _var.
 */
#define DEVICE_ATTR(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)
#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = \
                __ATTR_PREALLOC(_name, _mode, _show, _store)
#define DEVICE_ATTR_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW(_name)
#define DEVICE_ATTR_ADMIN_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)
#define DEVICE_ATTR_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO(_name)
#define DEVICE_ATTR_ADMIN_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)
#define DEVICE_ATTR_WO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_WO(_name)
#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }
#define DEVICE_INT_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }
#define DEVICE_BOOL_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }
#define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name =                \
                __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)

/*
 * Creation and removal of attribute files for a device, for both
 * device_attribute and bin_attribute.  Only device_create_bin_file() is
 * marked __must_check.  Implemented outside this header.
 */
int device_create_file(struct device *device,
                       const struct device_attribute *entry);
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr);
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr);
int __must_check device_create_bin_file(struct device *dev,
                                        const struct bin_attribute *attr);
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr);

/*
 * device resource management
 *
 * dr_release_t: callback receiving the device and a resource pointer.
 * dr_match_t:   callback that additionally receives caller-supplied
 *               match_data and returns an int.
 */
typedef void (*dr_release_t)(struct device *dev, void *res);
typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);

/*
 * devres_alloc()/devres_alloc_node() come in two flavours.  With
 * CONFIG_DEBUG_DEVRES both are macros around __devres_alloc_node() that
 * also pass the stringified name of the release function.  Without it,
 * devres_alloc_node() is an ordinary function and devres_alloc() is an
 * inline wrapper requesting NUMA_NO_NODE.
 */
#ifdef CONFIG_DEBUG_DEVRES
void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
                          int nid, const char *name) __malloc;
#define devres_alloc(release, size, gfp) \
        __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
#define devres_alloc_node(release, size, gfp, nid) \
        __devres_alloc_node(release, size, gfp, nid, #release)
#else
void *devres_alloc_node(dr_release_t release, size_t size,
                        gfp_t gfp, int nid) __malloc;
static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
{
        /* No node preference. */
        void *res = devres_alloc_node(release, size, gfp, NUMA_NO_NODE);

        return res;
}
#endif

/*
 * Operations on the resources attached to a device.  Most take a release
 * callback plus an optional-looking match callback and match_data to pick
 * the resource(s) concerned -- NOTE(review): whether match may be NULL is
 * not visible from this header.  Implemented elsewhere.
 */
void devres_for_each_res(struct device *dev, dr_release_t release,
                         dr_match_t match, void *match_data,
                         void (*fn)(struct device *, void *, void *),
                         void *data);
void devres_free(void *res);
void devres_add(struct device *dev, void *res);
void *devres_find(struct device *dev, dr_release_t release,
                  dr_match_t match, void *match_data);
void *devres_get(struct device *dev, void *new_res,
                 dr_match_t match, void *match_data);
void *devres_remove(struct device *dev, dr_release_t release,
                    dr_match_t match, void *match_data);
int devres_destroy(struct device *dev, dr_release_t release,
                   dr_match_t match, void *match_data);
int devres_release(struct device *dev, dr_release_t release,
                   dr_match_t match, void *match_data);

/* devres group: groups are identified by the caller-supplied @id pointer */
void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp);
void devres_close_group(struct device *dev, void *id);
void devres_remove_group(struct device *dev, void *id);
int devres_release_group(struct device *dev, void *id);

/*
 * managed devm_k.alloc/kfree for device drivers
 *
 * Each allocator takes the owning device as its first argument.  The
 * printf-style variants are annotated so the compiler checks the format
 * string (argument 3) against the arguments that follow.
 */
void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
void *devm_krealloc(struct device *dev, void *ptr, size_t size,
                    gfp_t gfp) __must_check;
__printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp,
                                     const char *fmt, va_list ap) __malloc;
__printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp,
                                    const char *fmt, ...) __malloc;
/* devm_kmalloc() with __GFP_ZERO added to the caller's flags. */
static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
{
        gfp_t zeroing = gfp | __GFP_ZERO;

        return devm_kmalloc(dev, size, zeroing);
}
/*
 * devm_kmalloc() for an array of @n elements of @size bytes each.
 * Returns NULL without allocating when n * size overflows size_t.
 */
static inline void *devm_kmalloc_array(struct device *dev,
                                       size_t n, size_t size, gfp_t flags)
{
        size_t total;

        /* check_mul_overflow() stores the product and reports overflow. */
        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        return devm_kmalloc(dev, total, flags);
}
/* devm_kmalloc_array() with __GFP_ZERO added to the caller's flags. */
static inline void *devm_kcalloc(struct device *dev,
                                 size_t n, size_t size, gfp_t flags)
{
        gfp_t zeroing = flags | __GFP_ZERO;

        return devm_kmalloc_array(dev, n, size, zeroing);
}
/*
 * Further managed helpers, all taking the owning device first: freeing and
 * duplicating memory, page allocation, and I/O memory mapping.  The mapping
 * functions return __iomem pointers.  Implemented outside this header.
 */
void devm_kfree(struct device *dev, const void *p);
char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp);
void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp);

unsigned long devm_get_free_pages(struct device *dev,
                                  gfp_t gfp_mask, unsigned int order);
void devm_free_pages(struct device *dev, unsigned long addr);

void __iomem *devm_ioremap_resource(struct device *dev,
                                    const struct resource *res);
void __iomem *devm_ioremap_resource_wc(struct device *dev,
                                       const struct resource *res);

void __iomem *devm_of_iomap(struct device *dev,
                            struct device_node *node, int index,
                            resource_size_t *size);

/*
 * Add a custom action to, or remove one from, the devres stack.  An action
 * is a callback taking a single void * argument, paired with the data
 * pointer it will be called with.
 */
int devm_add_action(struct device *dev, void (*action)(void *), void *data);
void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
void devm_release_action(struct device *dev, void (*action)(void *), void *data);

/*
 * devm_add_action() that cleans up after itself: when registering the
 * action fails, @action is called on @data right away.  Returns the result
 * of devm_add_action() in either case.
 */
static inline int devm_add_action_or_reset(struct device *dev,
                                           void (*action)(void *), void *data)
{
        int err = devm_add_action(dev, action, data);

        if (err == 0)
                return 0;

        /* Registration failed: run the action now. */
        action(data);
        return err;
}

/**
 * devm_alloc_percpu - Resource-managed alloc_percpu
 * @dev: Device to allocate per-cpu memory for
 * @type: Type to allocate per-cpu memory for
 *
 * Managed alloc_percpu. Per-cpu memory allocated with this function is
 * automatically freed on driver detach.
 *
 * The macro passes sizeof(@type) and __alignof__(@type) to
 * __devm_alloc_percpu() and casts the result to a __percpu pointer to @type.
 *
 * RETURNS:
 * Pointer to allocated memory on success, NULL on failure.
 */
#define devm_alloc_percpu(dev, type)      \
        ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
                                                      __alignof__(type)))

/* Untyped backend of devm_alloc_percpu() and the matching free routine. */
void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
                                   size_t align);
void devm_free_percpu(struct device *dev, void __percpu *pdata);

struct device_dma_parameters {
        /*
         * a low level driver may set these to teach IOMMU code about
         * sg limitations.
         */
        /* NOTE(review): units and exact meaning are not visible here; the
         * names suggest a per-segment size limit, an alignment mask and a
         * boundary mask -- confirm against the DMA mapping code. */
        unsigned int max_segment_size;
        unsigned int min_align_mask;
        unsigned long segment_boundary_mask;
};

/**
 * enum device_link_state - Device link states.
 * @DL_STATE_NONE: The presence of the drivers is not being tracked.
 * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present.
 * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not.
 * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present).
 * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present.
 * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding.
 */
enum device_link_state {
        /* Values are spelled out; they equal the implicit numbering. */
        DL_STATE_NONE            = -1,
        DL_STATE_DORMANT         = 0,
        DL_STATE_AVAILABLE       = 1,
        DL_STATE_CONSUMER_PROBE  = 2,
        DL_STATE_ACTIVE          = 3,
        DL_STATE_SUPPLIER_UNBIND = 4,
};

/*
 * Device link flags.  Each flag occupies its own bit (BIT(0) .. BIT(7)).
 *
 * STATELESS: The core will not remove this link automatically.
 * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
 * PM_RUNTIME: If set, the runtime PM framework will use this link.
 * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
 * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
 * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
 * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
 * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
 */
#define DL_FLAG_STATELESS                BIT(0)
#define DL_FLAG_AUTOREMOVE_CONSUMER        BIT(1)
#define DL_FLAG_PM_RUNTIME                BIT(2)
#define DL_FLAG_RPM_ACTIVE                BIT(3)
#define DL_FLAG_AUTOREMOVE_SUPPLIER        BIT(4)
#define DL_FLAG_AUTOPROBE_CONSUMER        BIT(5)
#define DL_FLAG_MANAGED                        BIT(6)
#define DL_FLAG_SYNC_STATE_ONLY                BIT(7)

/**
 * enum dl_dev_state - Device driver presence tracking information.
 * @DL_DEV_NO_DRIVER: There is no driver attached to the device.
 * @DL_DEV_PROBING: A driver is probing.
 * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device.
 * @DL_DEV_UNBINDING: The driver is unbinding from the device.
 */
enum dl_dev_state {
        /* Values are spelled out; they equal the implicit numbering. */
        DL_DEV_NO_DRIVER    = 0,
        DL_DEV_PROBING      = 1,
        DL_DEV_DRIVER_BOUND = 2,
        DL_DEV_UNBINDING    = 3,
};

/**
 * enum device_removable - Whether the device is removable. The criteria for a
 * device to be classified as removable is determined by its subsystem or bus.
 * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
 *                                    device (default).
 * @DEVICE_REMOVABLE_UNKNOWN:  Device location is Unknown.
 * @DEVICE_FIXED: Device is not removable by the user.
 * @DEVICE_REMOVABLE: Device is removable by the user.
 */
enum device_removable {
        DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */
        /* The remaining values equal the implicit numbering. */
        DEVICE_REMOVABLE_UNKNOWN       = 1,
        DEVICE_FIXED                   = 2,
        DEVICE_REMOVABLE               = 3,
};

/**
 * struct dev_links_info - Device data related to device links.
 * @suppliers: List of links to supplier devices.
 * @consumers: List of links to consumer devices.
 * @needs_suppliers: Hook to global list of devices waiting for suppliers.
 * @defer_hook: Hook to global list of devices that have deferred sync_state or
 *                deferred fw_devlink.
 * @need_for_probe: If needs_suppliers is on a list, this indicates if the
 *                    suppliers are needed for probe or not.
 * @status: Driver status information, one of the enum dl_dev_state values.
 */
struct dev_links_info {
        struct list_head suppliers;
        struct list_head consumers;
        struct list_head needs_suppliers;
        struct list_head defer_hook;
        bool need_for_probe;
        enum dl_dev_state status;
};

/**
 * struct device - The basic device structure
 * @parent:        The device's "parent" device, the device to which it is attached.
 *                 In most cases, a parent device is some sort of bus or host
 *                 controller. If parent is NULL, the device is a top-level device,
 *                 which is not usually what you want.
 * @p:                Holds the private data of the driver core portions of the device.
 *                 See the comment of the struct device_private for detail.
 * @kobj:        A top-level, abstract class from which other classes are derived.
 * @init_name:        Initial name of the device.
 * @type:        The type of device.
 *                 This identifies the device type and carries type-specific
 *                 information.
 * @mutex:        Mutex to synchronize calls to its driver.
 * @lockdep_mutex: An optional debug lock that a subsystem can use as a
 *                 peer lock to gain localized lockdep coverage of the device_lock.
 *                 Only present with CONFIG_PROVE_LOCKING.
 * @bus:        Type of bus device is on.
 * @driver:        Which driver has allocated this device.
 * @platform_data: Platform data specific to the device.
 *                 Example: For devices on custom boards, as typical of embedded
 *                 and SOC based hardware, Linux often uses platform_data to point
 *                 to board-specific structures describing devices and how they
 *                 are wired.  That can include what ports are available, chip
 *                 variants, which GPIO pins act in what additional roles, and so
 *                 on.  This shrinks the "Board Support Packages" (BSPs) and
 *                 minimizes board-specific #ifdefs in drivers.
 * @driver_data: Private pointer for driver specific info.
 * @links:        Links to suppliers and consumers of this device.
 * @power:        For device power management.
 *                See Documentation/driver-api/pm/devices.rst for details.
 * @pm_domain:        Provide callbacks that are executed during system suspend,
 *                 hibernation, system resume and during runtime PM transitions
 *                 along with subsystem-level and driver-level callbacks.
 * @em_pd:        device's energy model performance domain
 * @pins:        For device pin management.
 *                See Documentation/driver-api/pinctl.rst for details.
 * @msi_lock:        Raw spinlock used by the MSI code; what exactly it guards is
 *                not visible in this header (TODO: confirm against the MSI core).
 * @msi_list:        Hosts MSI descriptors
 * @msi_domain: The generic MSI domain this device is using.
 * @numa_node:        NUMA node this device is close to.
 * @dma_ops:    DMA mapping operations for this device.
 * @dma_mask:        DMA mask (if the device is DMA-capable).
 * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
 *                 hardware supports 64-bit addresses for consistent allocations
 *                 such as descriptors.
 * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
 *                DMA limit than the device itself supports.
 * @dma_range_map: map for DMA memory ranges relative to that of RAM
 * @dma_parms:        A low level driver may set these to teach IOMMU code about
 *                 segment limitations.
 * @dma_pools:        DMA pools (if the device is DMA-capable).
 * @dma_mem:        Internal for coherent mem override.
 * @cma_area:        Contiguous memory area for dma allocations
 * @archdata:        For arch-specific additions.
 * @of_node:        Associated device tree node.
 * @fwnode:        Associated device node supplied by platform firmware.
 * @devt:        For creating the sysfs "dev".
 * @id:                device instance
 * @devres_lock: Spinlock to protect the resource of the device.
 * @devres_head: The resources list of the device.
 * @knode_class: The node used to add the device to the class list.
 *                 NOTE(review): no member of this name is declared in the
 *                 structure below; this entry looks stale - confirm.
 * @class:        The class of the device.
 * @groups:        Optional attribute groups.
 * @release:        Callback to free the device after all references have
 *                 gone away. This should be set by the allocator of the
 *                 device (i.e. the bus driver that discovered the device).
 * @iommu_group: IOMMU group the device belongs to.
 * @iommu:        Per device generic IOMMU runtime data
 * @removable:  Whether the device can be removed from the system. This
 *              should be set by the subsystem / bus driver that discovered
 *              the device.
 *
 * @offline_disabled: If set, the device is permanently online.
 * @offline:        Set after successful invocation of bus type's .offline().
 * @of_node_reused: Set if the device-tree node is shared with an ancestor
 *              device.
 * @state_synced: The hardware state of this device has been synced to match
 *                  the software state of this device by calling the driver/bus
 *                  sync_state() callback.
 * @dma_coherent: this particular device is dma coherent, even if the
 *                architecture supports non-coherent devices.
 * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
 *                streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
 *                and optionally (if the coherent mask is large enough) also
 *                for dma allocations.  This flag is managed by the dma ops
 *                instance from ->dma_supported.
 *
 * At the lowest level, every device in a Linux system is represented by an
 * instance of struct device. The device structure contains the information
 * that the device model core needs to model the system. Most subsystems,
 * however, track additional information about the devices they host. As a
 * result, it is rare for devices to be represented by bare device structures;
 * instead, that structure, like kobject structures, is usually embedded within
 * a higher-level representation of the device.
 */
struct device {
        struct kobject kobj;
        struct device                *parent;

        struct device_private        *p;

        const char                *init_name; /* initial name of the device */
        const struct device_type *type;

        struct bus_type        *bus;                /* type of bus device is on */
        struct device_driver *driver;        /* which driver has allocated this
                                           device */
        void                *platform_data;        /* Platform specific data, device
                                           core doesn't touch it */
        void                *driver_data;        /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
#ifdef CONFIG_PROVE_LOCKING
        struct mutex                lockdep_mutex;
#endif
        struct mutex                mutex;        /* mutex to synchronize calls to
                                         * its driver.
                                         */

        struct dev_links_info        links;
        struct dev_pm_info        power;
        struct dev_pm_domain        *pm_domain;

#ifdef CONFIG_ENERGY_MODEL
        struct em_perf_domain        *em_pd;
#endif

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
        struct irq_domain        *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
        struct dev_pin_info        *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
        raw_spinlock_t                msi_lock;
        struct list_head        msi_list;
#endif
#ifdef CONFIG_DMA_OPS
        const struct dma_map_ops *dma_ops;
#endif
        u64                *dma_mask;        /* dma mask (if dma'able device) */
        u64                coherent_dma_mask;/* Like dma_mask, but for
                                             alloc_coherent mappings as
                                             not all hardware supports
                                             64 bit addresses for consistent
                                             allocations such descriptors. */
        u64                bus_dma_limit;        /* upstream dma constraint */
        const struct bus_dma_region *dma_range_map;

        struct device_dma_parameters *dma_parms;

        struct list_head        dma_pools;        /* dma pools (if dma'ble) */

#ifdef CONFIG_DMA_DECLARE_COHERENT
        struct dma_coherent_mem        *dma_mem; /* internal for coherent mem
                                             override */
#endif
#ifdef CONFIG_DMA_CMA
        struct cma *cma_area;                /* contiguous memory area for dma
                                           allocations */
#endif
        /* arch specific additions */
        struct dev_archdata        archdata;

        struct device_node        *of_node; /* associated device tree node */
        struct fwnode_handle        *fwnode; /* firmware device node */

#ifdef CONFIG_NUMA
        int                numa_node;        /* NUMA node this device is close to */
#endif
        dev_t                        devt;        /* dev_t, creates the sysfs "dev" */
        u32                        id;        /* device instance */

        spinlock_t                devres_lock;
        struct list_head        devres_head;

        struct class                *class;
        const struct attribute_group **groups;        /* optional groups */

        void        (*release)(struct device *dev);
        struct iommu_group        *iommu_group;
        struct dev_iommu        *iommu;

        enum device_removable        removable;

        bool                        offline_disabled:1;
        bool                        offline:1;
        bool                        of_node_reused:1;
        bool                        state_synced:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        bool                        dma_coherent:1;
#endif
#ifdef CONFIG_DMA_OPS_BYPASS
        bool                        dma_ops_bypass : 1;
#endif
};

/**
 * struct device_link - Device link representation.
 * @supplier: The device on the supplier end of the link.
 * @s_node: Hook to the supplier device's list of links to consumers.
 * @consumer: The device on the consumer end of the link.
 * @c_node: Hook to the consumer device's list of links to suppliers.
 * @link_dev: Embedded device used to expose link details in sysfs.
 * @status: The state of the link (with respect to the presence of drivers),
 *          stored as an enum device_link_state.
 * @flags: Link flags (presumably a mask of DL_FLAG_* bits - confirm).
 * @rpm_active: Whether or not the consumer device is runtime-PM-active;
 *              kept as a refcount_t rather than a plain flag.
 * @kref: Count repeated addition of the same link.
 * @rm_work: Work structure used for removing the link.
 * @supplier_preactivated: Supplier has been made active before consumer probe.
 */
struct device_link {
        struct device *supplier;
        struct list_head s_node;
        struct device *consumer;
        struct list_head c_node;
        struct device link_dev;
        enum device_link_state status;
        u32 flags;
        refcount_t rpm_active;
        struct kref kref;
        struct work_struct rm_work;
        bool supplier_preactivated; /* Owned by consumer probe. */
};

/**
 * kobj_to_dev - Map an embedded kobject back to its struct device.
 * @kobj: Pointer to the kobj member of a struct device.
 *
 * Return: the struct device that contains @kobj.
 */
static inline struct device *kobj_to_dev(struct kobject *kobj)
{
        struct device *owner = container_of(kobj, struct device, kobj);

        return owner;
}

/**
 * device_iommu_mapped - Report whether the device DMA is translated by an
 *                       IOMMU
 * @dev: Device to perform the check on
 *
 * Return: true when @dev has a non-NULL iommu_group, false otherwise.
 */
static inline bool device_iommu_mapped(struct device *dev)
{
        return !!dev->iommu_group;
}

/* Get the wakeup routines, which depend on struct device */
#include <linux/pm_wakeup.h>

/**
 * dev_name - Return the name of a device.
 * @dev: Device to name.
 *
 * Return: @dev->init_name while it is set (it is used until the kobject
 * becomes available), otherwise the name of the embedded kobject.
 */
static inline const char *dev_name(const struct device *dev)
{
        return dev->init_name ? dev->init_name : kobject_name(&dev->kobj);
}

/**
 * dev_bus_name - Return a device's bus/class name, if at all possible
 * @dev: struct device to get the bus/class name of
 *
 * The bus name takes precedence over the class name.
 *
 * Return: the name of the bus the device is attached to, else the name of
 * its class, else an empty string when it has neither.
 */
static inline const char *dev_bus_name(const struct device *dev)
{
        if (dev->bus)
                return dev->bus->name;

        if (dev->class)
                return dev->class->name;

        return "";
}

__printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...);

/**
 * dev_to_node - Return the NUMA node recorded for a device.
 * @dev: Device to query.
 *
 * Return: @dev->numa_node with CONFIG_NUMA, NUMA_NO_NODE without it.
 */
static inline int dev_to_node(struct device *dev)
{
#ifdef CONFIG_NUMA
        return dev->numa_node;
#else
        return NUMA_NO_NODE;
#endif
}

/**
 * set_dev_node - Record the NUMA node for a device.
 * @dev: Device to update.
 * @node: NUMA node to store.
 *
 * Does nothing when CONFIG_NUMA is not set.
 */
static inline void set_dev_node(struct device *dev, int node)
{
#ifdef CONFIG_NUMA
        dev->numa_node = node;
#endif
}

/**
 * dev_get_msi_domain - Return the MSI irq domain of a device.
 * @dev: Device to query.
 *
 * Return: @dev->msi_domain with CONFIG_GENERIC_MSI_IRQ_DOMAIN, NULL without.
 */
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
        struct irq_domain *domain = NULL;

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
        domain = dev->msi_domain;
#endif
        return domain;
}

/**
 * dev_set_msi_domain - Set the MSI irq domain of a device.
 * @dev: Device to update.
 * @d: MSI irq domain to store in @dev->msi_domain.
 *
 * Does nothing when CONFIG_GENERIC_MSI_IRQ_DOMAIN is not set.
 */
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
        dev->msi_domain = d;
#endif
}

/**
 * dev_get_drvdata - Return the driver-private pointer of a device.
 * @dev: Device to query.
 *
 * Return: the value of @dev->driver_data.
 */
static inline void *dev_get_drvdata(const struct device *dev)
{
        return dev->driver_data;
}

/**
 * dev_set_drvdata - Store the driver-private pointer of a device.
 * @dev: Device to update.
 * @data: Pointer to store in @dev->driver_data.
 */
static inline void dev_set_drvdata(struct device *dev, void *data)
{
        dev->driver_data = data;
}

/**
 * dev_to_psd - Return the PM subsystem data attached to a device.
 * @dev: Device to query; may be NULL.
 *
 * Return: @dev->power.subsys_data, or NULL when @dev is NULL.
 */
static inline struct pm_subsys_data *dev_to_psd(struct device *dev)
{
        if (!dev)
                return NULL;

        return dev->power.subsys_data;
}

/**
 * dev_get_uevent_suppress - Read the uevent_suppress flag of a device.
 * @dev: Device to query.
 *
 * Return: the value of the uevent_suppress field of the embedded kobject.
 */
static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
{
        return dev->kobj.uevent_suppress;
}

/**
 * dev_set_uevent_suppress - Write the uevent_suppress flag of a device.
 * @dev: Device to update.
 * @val: Value to store in the uevent_suppress field of the embedded kobject.
 */
static inline void dev_set_uevent_suppress(struct device *dev, int val)
{
        dev->kobj.uevent_suppress = val;
}

/**
 * device_is_registered - Report whether a device is present in sysfs.
 * @dev: Device to query.
 *
 * Return: the state_in_sysfs field of the embedded kobject.
 */
static inline int device_is_registered(struct device *dev)
{
        return dev->kobj.state_in_sysfs;
}

/**
 * device_enable_async_suspend - Turn on asynchronous suspend for a device.
 * @dev: Device to update.
 *
 * The request is ignored while @dev->power.is_prepared is set.
 */
static inline void device_enable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = true;
}

/**
 * device_disable_async_suspend - Turn off asynchronous suspend for a device.
 * @dev: Device to update.
 *
 * The request is ignored while @dev->power.is_prepared is set.
 */
static inline void device_disable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = false;
}

/**
 * device_async_suspend_enabled - Report whether asynchronous suspend is on.
 * @dev: Device to query.
 *
 * Return: true when @dev->power.async_suspend is non-zero.
 */
static inline bool device_async_suspend_enabled(struct device *dev)
{
        return dev->power.async_suspend != 0;
}

/**
 * device_pm_not_required - Report whether a device is flagged as needing no PM.
 * @dev: Device to query.
 *
 * Return: the value of @dev->power.no_pm.
 */
static inline bool device_pm_not_required(struct device *dev)
{
        return dev->power.no_pm;
}

/**
 * device_set_pm_not_required - Flag a device as needing no power management.
 * @dev: Device to update.
 *
 * Sets @dev->power.no_pm and, when CONFIG_PM is enabled, also
 * @dev->power.no_callbacks.
 */
static inline void device_set_pm_not_required(struct device *dev)
{
        dev->power.no_pm = true;
#ifdef CONFIG_PM
        dev->power.no_callbacks = true;
#endif
}

/**
 * dev_pm_syscore_device - Set or clear the syscore PM flag of a device.
 * @dev: Device to update.
 * @val: New value for @dev->power.syscore.
 *
 * Does nothing when CONFIG_PM_SLEEP is not set.
 */
static inline void dev_pm_syscore_device(struct device *dev, bool val)
{
#ifdef CONFIG_PM_SLEEP
        dev->power.syscore = val;
#endif
}

/**
 * dev_pm_set_driver_flags - Replace the PM driver flags of a device.
 * @dev: Device to update.
 * @flags: New value for @dev->power.driver_flags; it overwrites the previous
 *         value rather than being OR-ed into it.
 */
static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags)
{
        dev->power.driver_flags = flags;
}

/**
 * dev_pm_test_driver_flags - Test the PM driver flags of a device.
 * @dev: Device to query.
 * @flags: Mask of flag bits to test.
 *
 * Return: true when at least one bit of @flags is set in
 * @dev->power.driver_flags.
 */
static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags)
{
        return (dev->power.driver_flags & flags) != 0;
}

/**
 * device_lock - Take the device mutex.
 * @dev: Device whose @mutex is passed to mutex_lock().
 */
static inline void device_lock(struct device *dev)
{
        mutex_lock(&dev->mutex);
}

/**
 * device_lock_interruptible - Take the device mutex, interruptibly.
 * @dev: Device whose @mutex is passed to mutex_lock_interruptible().
 *
 * Return: whatever mutex_lock_interruptible() returns.
 */
static inline int device_lock_interruptible(struct device *dev)
{
        return mutex_lock_interruptible(&dev->mutex);
}

/**
 * device_trylock - Try to take the device mutex.
 * @dev: Device whose @mutex is passed to mutex_trylock().
 *
 * Return: whatever mutex_trylock() returns.
 */
static inline int device_trylock(struct device *dev)
{
        return mutex_trylock(&dev->mutex);
}

/**
 * device_unlock - Release the device mutex.
 * @dev: Device whose @mutex is passed to mutex_unlock().
 */
static inline void device_unlock(struct device *dev)
{
        mutex_unlock(&dev->mutex);
}

/**
 * device_lock_assert - Lockdep assertion that the device mutex is held.
 * @dev: Device whose @mutex is passed to lockdep_assert_held().
 */
static inline void device_lock_assert(struct device *dev)
{
        lockdep_assert_held(&dev->mutex);
}

/**
 * dev_of_node - Return the device tree node of a device.
 * @dev: Device to query; may be NULL.
 *
 * Return: @dev->of_node when CONFIG_OF is enabled and @dev is non-NULL,
 * NULL otherwise.
 */
static inline struct device_node *dev_of_node(struct device *dev)
{
        if (IS_ENABLED(CONFIG_OF) && dev)
                return dev->of_node;

        return NULL;
}

/**
 * dev_has_sync_state - Report whether a sync_state() callback exists.
 * @dev: Device to query; may be NULL.
 *
 * Return: true when the driver bound to @dev or the bus of @dev provides a
 * sync_state callback, false otherwise (including when @dev is NULL).
 */
static inline bool dev_has_sync_state(struct device *dev)
{
        if (!dev)
                return false;

        return (dev->driver && dev->driver->sync_state) ||
               (dev->bus && dev->bus->sync_state);
}

/**
 * dev_set_removable - Record the removability classification of a device.
 * @dev: Device to update.
 * @removable: Value to store in @dev->removable.
 */
static inline void dev_set_removable(struct device *dev,
                                     enum device_removable removable)
{
        dev->removable = removable;
}

/**
 * dev_is_removable - Report whether a device is classified as removable.
 * @dev: Device to query.
 *
 * Return: true only when @dev->removable is DEVICE_REMOVABLE; unknown,
 * fixed and unsupported all yield false.
 */
static inline bool dev_is_removable(struct device *dev)
{
        return dev->removable == DEVICE_REMOVABLE;
}

/**
 * dev_removable_is_valid - Report whether the removable attribute is supported.
 * @dev: Device to query.
 *
 * Return: true when @dev->removable is anything other than
 * DEVICE_REMOVABLE_NOT_SUPPORTED.
 */
static inline bool dev_removable_is_valid(struct device *dev)
{
        return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
}

/*
 * High level routines for use by the bus drivers
 */
int __must_check device_register(struct device *dev);
void device_unregister(struct device *dev);
void device_initialize(struct device *dev);
int __must_check device_add(struct device *dev);
void device_del(struct device *dev);

/*
 * Cleanup action named "device_del" for struct device pointers: calls
 * device_del() on the pointer when it is non-NULL.  DEFINE_FREE() is defined
 * elsewhere; presumably this is consumed by scope-based cleanup annotations
 * (TODO: confirm against the cleanup helpers).
 */
DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T))

int device_for_each_child(struct device *dev, void *data,
                          int (*fn)(struct device *dev, void *data));
int device_for_each_child_reverse(struct device *dev, void *data,
                                  int (*fn)(struct device *dev, void *data));
struct device *device_find_child(struct device *dev, void *data,
                                 int (*match)(struct device *dev, void *data));
struct device *device_find_child_by_name(struct device *parent,
                                         const char *name);
struct device *device_find_any_child(struct device *parent);

int device_rename(struct device *dev, const char *new_name);
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);
const char *device_get_devnode(struct device *dev, umode_t *mode, kuid_t *uid,
                               kgid_t *gid, const char **tmp);
int device_is_dependent(struct device *dev, void *target);

static inline bool device_supports_offline(struct device *dev)
{
        return dev->bus && dev->bus->offline && dev->bus->online;
}

void lock_device_hotplug(void);
void unlock_device_hotplug(void);
int lock_device_hotplug_sysfs(void);
int device_offline(struct device *dev);
int device_online(struct device *dev);
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode);

/**
 * dev_num_vf - Ask the bus for the number of VFs of a device.
 * @dev: Device to query.
 *
 * Return: the result of the bus num_vf() callback, or 0 when @dev has no bus
 * or the bus does not implement num_vf().
 */
static inline int dev_num_vf(struct device *dev)
{
        if (!dev->bus || !dev->bus->num_vf)
                return 0;

        return dev->bus->num_vf(dev);
}

/*
 * Root device objects for grouping under /sys/devices
 */
struct device *__root_device_register(const char *name, struct module *owner);

/*
 * This is a macro to avoid include problems with THIS_MODULE: being a macro,
 * THIS_MODULE is expanded at the call site.  It forwards @name and
 * THIS_MODULE to __root_device_register().
 */
#define root_device_register(name) \
        __root_device_register(name, THIS_MODULE)

void root_device_unregister(struct device *root);

/**
 * dev_get_platdata - Return the platform data pointer of a device.
 * @dev: Device to query.
 *
 * Return: the value of @dev->platform_data.
 */
static inline void *dev_get_platdata(const struct device *dev)
{
        return dev->platform_data;
}

/*
 * Manual binding of a device to driver. See drivers/base/bus.c
 * for information on use.
 */
int __must_check device_bind_driver(struct device *dev);
void device_release_driver(struct device *dev);
int  __must_check device_attach(struct device *dev);
int __must_check driver_attach(struct device_driver *drv);
void device_initial_probe(struct device *dev);
int __must_check device_reprobe(struct device *dev);

bool device_is_bound(struct device *dev);

/*
 * Easy functions for dynamically creating devices on the fly
 */
__printf(5, 6) struct device *
device_create(struct class *cls, struct device *parent, dev_t devt,
              void *drvdata, const char *fmt, ...);
__printf(6, 7) struct device *
device_create_with_groups(struct class *cls, struct device *parent, dev_t devt,
                          void *drvdata, const struct attribute_group **groups,
                          const char *fmt, ...);
void device_destroy(struct class *cls, dev_t devt);

int __must_check device_add_groups(struct device *dev,
                                   const struct attribute_group **groups);
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups);

/**
 * device_add_group - Add a single attribute group to a device.
 * @dev: Device to add the group to.
 * @grp: Attribute group to add.
 *
 * Wraps @grp in a NULL-terminated one-element list and hands it to
 * device_add_groups().
 *
 * Return: the result of device_add_groups().
 */
static inline int __must_check device_add_group(struct device *dev,
                                        const struct attribute_group *grp)
{
        const struct attribute_group *groups[2];

        groups[0] = grp;
        groups[1] = NULL;

        return device_add_groups(dev, groups);
}

/**
 * device_remove_group - Remove a single attribute group from a device.
 * @dev: Device to remove the group from.
 * @grp: Attribute group to remove.
 *
 * Wraps @grp in a NULL-terminated one-element list and hands it to
 * device_remove_groups().
 */
static inline void device_remove_group(struct device *dev,
                                       const struct attribute_group *grp)
{
        const struct attribute_group *groups[] = { grp, NULL };

        /* Plain call: a void function must not "return" an expression. */
        device_remove_groups(dev, groups);
}

int __must_check devm_device_add_groups(struct device *dev,
                                        const struct attribute_group **groups);
void devm_device_remove_groups(struct device *dev,
                               const struct attribute_group **groups);
int __must_check devm_device_add_group(struct device *dev,
                                       const struct attribute_group *grp);
void devm_device_remove_group(struct device *dev,
                              const struct attribute_group *grp);

/*
 * Platform "fixup" functions - allow the platform to have their say
 * about devices and actions that the general device layer doesn't
 * know about.
 */
/* Notify platform of device discovery */
extern int (*platform_notify)(struct device *dev);

extern int (*platform_notify_remove)(struct device *dev);


/*
 * get_device - atomically increment the reference count for the device.
 *
 */
struct device *get_device(struct device *dev);
void put_device(struct device *dev);

/*
 * Cleanup action named "put_device" for struct device pointers: calls
 * put_device() on the pointer when it is non-NULL.  DEFINE_FREE() is defined
 * elsewhere; presumably this is consumed by scope-based cleanup annotations
 * (TODO: confirm against the cleanup helpers).
 */
DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T))

bool kill_device(struct device *dev);

#ifndef CONFIG_DEVTMPFS
/* Without CONFIG_DEVTMPFS there is nothing to mount: a stub returning 0. */
static inline int devtmpfs_mount(void)
{
        return 0;
}
#else
/* Implemented out of line when CONFIG_DEVTMPFS is set. */
int devtmpfs_mount(void);
#endif

/* drivers/base/power/shutdown.c */
void device_shutdown(void);

/* debugging and troubleshooting/diagnostic helpers. */
const char *dev_driver_string(const struct device *dev);

/* Device links interface. */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags);
void device_link_del(struct device_link *link);
void device_link_remove(void *consumer, struct device *supplier);
void device_links_supplier_sync_state_pause(void);
void device_links_supplier_sync_state_resume(void);
void device_link_wait_removal(void);

extern __printf(3, 4)
int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);

/*
 * Create module aliases so that the module can be autoloaded.
 *
 * MODULE_ALIAS_CHARDEV(major, minor) emits the alias
 * "char-major-<major>-<minor>"; MODULE_ALIAS_CHARDEV_MAJOR(major) emits
 * "char-major-<major>-*", i.e. a literal '*' in place of the minor number.
 * The arguments are stringified, so they must not be parenthesised.
 */
#define MODULE_ALIAS_CHARDEV(major,minor) \
        MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_CHARDEV_MAJOR(major) \
        MODULE_ALIAS("char-major-" __stringify(major) "-*")

/*
 * sysfs_deprecated is a real variable (defined elsewhere) only when
 * CONFIG_SYSFS_DEPRECATED is set; otherwise it is the constant 0, so tests
 * of it can be written without #ifdefs.
 */
#ifdef CONFIG_SYSFS_DEPRECATED
extern long sysfs_deprecated;
#else
#define sysfs_deprecated 0
#endif

#endif /* _DEVICE_H_ */


























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NDISC_H
#define _NDISC_H

#include <net/ipv6_stubs.h>

/*
 *        ICMPv6 message types for neighbour discovery messages
 *        (these are values of the ICMPv6 type field, not of the code field)
 */

#define NDISC_ROUTER_SOLICITATION        133
#define NDISC_ROUTER_ADVERTISEMENT        134
#define NDISC_NEIGHBOUR_SOLICITATION        135
#define NDISC_NEIGHBOUR_ADVERTISEMENT        136
#define NDISC_REDIRECT                        137

/*
 * Router type: cross-layer information from link-layer to
 * IPv6 layer reported by certain link types (e.g., RFC4214).
 */
#define NDISC_NODETYPE_UNSPEC                0        /* unspecified (default) */
#define NDISC_NODETYPE_HOST                1        /* host or unauthorized router */
#define NDISC_NODETYPE_NODEFAULT        2        /* non-default router */
#define NDISC_NODETYPE_DEFAULT                3        /* default router */

/*
 *        ndisc options
 *
 * Option type numbers.  Values below __ND_OPT_ARRAY_MAX are used as indexes
 * into ndisc_options.nd_opt_array[]; slot 0 is reused there for the
 * "prefix info end" pointer.  The two __ND_OPT_*MAX sentinels take their
 * value from the enumerator that precedes them, so their position matters.
 */

enum {
        __ND_OPT_PREFIX_INFO_END        = 0,
        ND_OPT_SOURCE_LL_ADDR           = 1,    /* RFC2461 */
        ND_OPT_TARGET_LL_ADDR           = 2,    /* RFC2461 */
        ND_OPT_PREFIX_INFO              = 3,    /* RFC2461 */
        ND_OPT_REDIRECT_HDR             = 4,    /* RFC2461 */
        ND_OPT_MTU                      = 5,    /* RFC2461 */
        ND_OPT_NONCE                    = 14,   /* RFC7527 */
        __ND_OPT_ARRAY_MAX,                     /* 15: ND_OPT_NONCE + 1 */
        ND_OPT_ROUTE_INFO               = 24,   /* RFC4191 */
        ND_OPT_RDNSS                    = 25,   /* RFC5006 */
        ND_OPT_DNSSL                    = 31,   /* RFC6106 */
        ND_OPT_6CO                      = 34,   /* RFC6775 */
        ND_OPT_CAPTIVE_PORTAL           = 37,   /* RFC7710 */
        ND_OPT_PREF64                   = 38,   /* RFC8781 */
        __ND_OPT_MAX                            /* 39: ND_OPT_PREF64 + 1 */
};

/*
 * Neighbour discovery timing constants, expressed as multiples of HZ
 * (presumably jiffies, i.e. 1 s, 30 s and 1 s respectively - confirm).
 */
#define MAX_RTR_SOLICITATION_DELAY        HZ

#define ND_REACHABLE_TIME                (30*HZ)
#define ND_RETRANS_TIMER                HZ

#include <linux/compiler.h>
#include <linux/icmpv6.h>
#include <linux/in6.h>
#include <linux/types.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/hash.h>

#include <net/neighbour.h>

/*
 * Verbosity threshold used by ND_PRINTK().
 * Set to 3 to get tracing...
 */
#define ND_DEBUG 1

/*
 * ND_PRINTK - rate-limited neighbour discovery log message.
 * @val:   verbosity of the message; it is emitted only when
 *         (val) <= ND_DEBUG.
 * @level: log level token, pasted into net_<level>_ratelimited().
 * @fmt:   printf-style format string, followed by its arguments.
 *
 * @val is parenthesised so that an expression argument (e.g. "a | b") is
 * compared as a whole instead of being split by operator precedence.
 */
#define ND_PRINTK(val, level, fmt, ...)                                \
do {                                                                \
        if ((val) <= ND_DEBUG)                                        \
                net_##level##_ratelimited(fmt, ##__VA_ARGS__);        \
} while (0)

struct ctl_table;
struct inet6_dev;
struct net_device;
struct net_proto_family;
struct sk_buff;
struct prefix_info;

extern struct neigh_table nd_tbl;

/*
 * Neighbour discovery message carrying a target address: an ICMPv6 header,
 * the IPv6 target address, then a variable number of option bytes.
 * Presumably the neighbour solicitation/advertisement layout - confirm.
 */
struct nd_msg {
        struct icmp6hdr        icmph;
        struct in6_addr        target;
        __u8                opt[];
};

/*
 * Message consisting of an ICMPv6 header directly followed by option bytes.
 * Presumably the router solicitation layout - confirm.
 */
struct rs_msg {
        struct icmp6hdr        icmph;
        __u8                opt[];
};

/*
 * Fixed part of a message with an ICMPv6 header followed by two big-endian
 * 32-bit timer fields.  Presumably the router advertisement layout; no
 * option array is declared here, so options (if any) follow the structure -
 * confirm.
 */
struct ra_msg {
        struct icmp6hdr                icmph;
        __be32                        reachable_time;
        __be32                        retrans_timer;
};

/*
 * Message with an ICMPv6 header, a target address, a destination address and
 * trailing option bytes.  Presumably the redirect layout - confirm.
 */
struct rd_msg {
        struct icmp6hdr icmph;
        struct in6_addr        target;
        struct in6_addr        dest;
        __u8                opt[];
};

/*
 * Two-byte header that starts every ND option: the option type followed by
 * the option length.  The unit of nd_opt_len is not visible here
 * (presumably 8-octet units as in the ND option format - confirm).
 */
struct nd_opt_hdr {
        __u8                nd_opt_type;
        __u8                nd_opt_len;
} __packed;

/*
 * ND options
 *
 * Pointers to the options found in a neighbour discovery message.
 *
 * nd_opt_array:        one slot per option type below __ND_OPT_ARRAY_MAX,
 *                      accessed through the nd_opts_* macros; slot 0
 *                      (__ND_OPT_PREFIX_INFO_END) is reused as nd_opts_pi_end.
 * nd_opts_ri, nd_opts_ri_end:
 *                      start/end markers, only with CONFIG_IPV6_ROUTE_INFO
 *                      (presumably for route information options - confirm).
 * nd_useropts, nd_useropts_end:
 *                      start/end markers (presumably for options handed to
 *                      userspace - confirm).
 * nd_802154_opt_array: only with CONFIG_IEEE802154_6LOWPAN; large enough to
 *                      be indexed by ND_OPT_SOURCE_LL_ADDR and
 *                      ND_OPT_TARGET_LL_ADDR.
 */
struct ndisc_options {
        struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
#ifdef CONFIG_IPV6_ROUTE_INFO
        struct nd_opt_hdr *nd_opts_ri;
        struct nd_opt_hdr *nd_opts_ri_end;
#endif
        struct nd_opt_hdr *nd_useropts;
        struct nd_opt_hdr *nd_useropts_end;
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1];
#endif
};

/*
 * Named accessors for the per-type slots of struct ndisc_options.  They
 * expand to a member designator, so they are used as "ndopts->nd_opts_mtu"
 * or "ndopts.nd_opts_mtu".  The nd_802154_* variants are only usable when
 * the nd_802154_opt_array member exists (CONFIG_IEEE802154_6LOWPAN).
 */
#define nd_opts_src_lladdr                nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
#define nd_opts_tgt_lladdr                nd_opt_array[ND_OPT_TARGET_LL_ADDR]
#define nd_opts_pi                        nd_opt_array[ND_OPT_PREFIX_INFO]
#define nd_opts_pi_end                        nd_opt_array[__ND_OPT_PREFIX_INFO_END]
#define nd_opts_rh                        nd_opt_array[ND_OPT_REDIRECT_HDR]
#define nd_opts_mtu                        nd_opt_array[ND_OPT_MTU]
#define nd_opts_nonce                        nd_opt_array[ND_OPT_NONCE]
#define nd_802154_opts_src_lladdr        nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
#define nd_802154_opts_tgt_lladdr        nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]

/*
 * NDISC_OPT_SPACE - space taken by an option with @len bytes of payload:
 * @len plus 2, rounded up to the next multiple of 8.  The extra 2 bytes
 * presumably account for struct nd_opt_hdr (type + length) - confirm.
 */
#define NDISC_OPT_SPACE(len) (((len) + 2 + 7) & ~7)

struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
                                          u8 *opt, int opt_len,
                                          struct ndisc_options *ndopts);

void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
                              int data_len, int pad);

#define NDISC_OPS_REDIRECT_DATA_SPACE        2

/*
 * This structure defines the hooks for IPv6 neighbour discovery.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*is_useropt)(u8 nd_opt_type):
 *     This function is called when IPv6 decide RA userspace options. if
 *     this function returns 1 then the option given by nd_opt_type will
 *     be handled as userspace option additional to the IPv6 options.
 *
 * int (*parse_options)(const struct net_device *dev,
 *                        struct nd_opt_hdr *nd_opt,
 *                        struct ndisc_options *ndopts):
 *     This function is called while parsing ndisc ops and put each position
 *     as pointer into ndopts. If this function return unequal 0, then this
 *     function took care about the ndisc option, if 0 then the IPv6 ndisc
 *     option parser will take care about that option.
 *
 * void (*update)(const struct net_device *dev, struct neighbour *n,
 *                  u32 flags, u8 icmp6_type,
 *                  const struct ndisc_options *ndopts):
 *     This function is called when IPv6 ndisc updates the neighbour cache
 *     entry. Additional options which can be updated may be previously
 *     parsed by parse_opts callback and accessible over ndopts parameter.
 *
 * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
 *                         struct neighbour *neigh, u8 *ha_buf,
 *                         u8 **ha):
 *     This function is called when the necessary option space will be
 *     calculated before allocating a skb. The parameters neigh, ha_buf
 *     abd ha are available on NDISC_REDIRECT messages only.
 *
 * void (*fill_addr_option)(const struct net_device *dev,
 *                            struct sk_buff *skb, u8 icmp6_type,
 *                            const u8 *ha):
 *     This function is called when the skb will finally fill the option
 *     fields inside skb. NOTE: this callback should fill the option
 *     fields to the skb which are previously indicated by opt_space
 *     parameter. That means the decision to add such option should
 *     not lost between these two callbacks, e.g. protected by interface
 *     up state.
 *
 * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
 *                               const struct prefix_info *pinfo,
 *                               struct inet6_dev *in6_dev,
 *                               struct in6_addr *addr,
 *                               int addr_type, u32 addr_flags,
 *                               bool sllao, bool tokenized,
 *                               __u32 valid_lft, u32 prefered_lft,
 *                               bool dev_addr_generated):
 *     This function is called when a RA messages is received with valid
 *     PIO option fields and an IPv6 address will be added to the interface
 *     for autoconfiguration. The parameter dev_addr_generated reports about
 *     if the address was based on dev->dev_addr or not. This can be used
 *     to add a second address if link-layer operates with two link layer
 *     addresses. E.g. 802.15.4 6LoWPAN.
 */
struct ndisc_ops {
        /*
         * Every member is optional: the ndisc_ops_*() inline wrappers test
         * both dev->ndisc_ops and the individual hook before calling it.
         */

        /* Reached via ndisc_ops_is_useropt(); a missing hook yields 0. */
        int        (*is_useropt)(u8 nd_opt_type);
        /* Reached via ndisc_ops_parse_options(); a missing hook yields 0. */
        int        (*parse_options)(const struct net_device *dev,
                                 struct nd_opt_hdr *nd_opt,
                                 struct ndisc_options *ndopts);
        /* Reached via ndisc_ops_update(); a missing hook is a no-op. */
        void        (*update)(const struct net_device *dev, struct neighbour *n,
                          u32 flags, u8 icmp6_type,
                          const struct ndisc_options *ndopts);
        /*
         * Called with neigh/ha_buf/ha all NULL for non-redirect types and
         * with caller-supplied values for NDISC_REDIRECT (see the two
         * *_opt_addr_space() wrappers).
         */
        int        (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
                                  struct neighbour *neigh, u8 *ha_buf,
                                  u8 **ha);
        /* @ha is NULL except when the wrapper passes NDISC_REDIRECT. */
        void        (*fill_addr_option)(const struct net_device *dev,
                                    struct sk_buff *skb, u8 icmp6_type,
                                    const u8 *ha);
        /* Reached via ndisc_ops_prefix_rcv_add_addr(); arguments are forwarded as-is. */
        void        (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
                                       const struct prefix_info *pinfo,
                                       struct inet6_dev *in6_dev,
                                       struct in6_addr *addr,
                                       int addr_type, u32 addr_flags,
                                       bool sllao, bool tokenized,
                                       __u32 valid_lft, u32 prefered_lft,
                                       bool dev_addr_generated);
};

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Forward @nd_opt_type to the device's is_useropt hook.
 *
 * Returns the hook's result, or 0 when @dev has no ndisc_ops table or the
 * table has no is_useropt hook.
 */
static inline int ndisc_ops_is_useropt(const struct net_device *dev,
                                       u8 nd_opt_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->is_useropt)
                return 0;

        return dev->ndisc_ops->is_useropt(nd_opt_type);
}

/*
 * Hand one option header to the device's parse_options hook.
 *
 * Returns the hook's result, or 0 when @dev has no ndisc_ops table or the
 * table has no parse_options hook.
 */
static inline int ndisc_ops_parse_options(const struct net_device *dev,
                                          struct nd_opt_hdr *nd_opt,
                                          struct ndisc_options *ndopts)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->parse_options)
                return 0;

        return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts);
}

/*
 * Invoke the device's update hook with the given neighbour, flags, ICMPv6
 * type and parsed options.  Does nothing when @dev has no ndisc_ops table
 * or the table has no update hook.
 */
static inline void ndisc_ops_update(const struct net_device *dev,
                                    struct neighbour *n, u32 flags,
                                    u8 icmp6_type,
                                    const struct ndisc_options *ndopts)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->update)
                return;

        dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts);
}

/*
 * Query the device's opt_addr_space hook for a non-redirect message.
 *
 * The hook is called with NULL for its neigh/ha_buf/ha arguments.  Returns
 * 0 without calling the hook when it is absent or when @icmp6_type is
 * NDISC_REDIRECT (redirects go through ndisc_ops_redirect_opt_addr_space()).
 */
static inline int ndisc_ops_opt_addr_space(const struct net_device *dev,
                                           u8 icmp6_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->opt_addr_space ||
            icmp6_type == NDISC_REDIRECT)
                return 0;

        return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL,
                                              NULL, NULL);
}

/*
 * Query the device's opt_addr_space hook for an NDISC_REDIRECT message,
 * forwarding @neigh, @ha_buf and @ha unchanged.
 *
 * Returns the hook's result, or 0 when @dev has no ndisc_ops table or the
 * table has no opt_addr_space hook.
 */
static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev,
                                                    struct neighbour *neigh,
                                                    u8 *ha_buf, u8 **ha)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->opt_addr_space)
                return 0;

        return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT,
                                              neigh, ha_buf, ha);
}

/*
 * Let the device's fill_addr_option hook write into @skb for a
 * non-redirect message; the hook's ha argument is NULL.
 *
 * Does nothing when the hook is absent or when @icmp6_type is
 * NDISC_REDIRECT.
 */
static inline void ndisc_ops_fill_addr_option(const struct net_device *dev,
                                              struct sk_buff *skb,
                                              u8 icmp6_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->fill_addr_option ||
            icmp6_type == NDISC_REDIRECT)
                return;

        dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL);
}

/*
 * Let the device's fill_addr_option hook write into @skb for an
 * NDISC_REDIRECT message, passing @ha through.  Does nothing when @dev has
 * no ndisc_ops table or the table has no fill_addr_option hook.
 */
static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev,
                                                       struct sk_buff *skb,
                                                       const u8 *ha)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->fill_addr_option)
                return;

        dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha);
}

/*
 * Forward every argument, in order, to the device's prefix_rcv_add_addr
 * hook.  Does nothing when @dev has no ndisc_ops table or the table has no
 * prefix_rcv_add_addr hook.
 */
static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net,
                                                 struct net_device *dev,
                                                 const struct prefix_info *pinfo,
                                                 struct inet6_dev *in6_dev,
                                                 struct in6_addr *addr,
                                                 int addr_type, u32 addr_flags,
                                                 bool sllao, bool tokenized,
                                                 __u32 valid_lft,
                                                 u32 prefered_lft,
                                                 bool dev_addr_generated)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->prefix_rcv_add_addr)
                return;

        dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, addr,
                                            addr_type, addr_flags, sllao,
                                            tokenized, valid_lft, prefered_lft,
                                            dev_addr_generated);
}
#endif

/*
 * Number of padding bytes that sit between the option length field and
 * the start of the link-layer address for hardware type @type.
 *
 * Only IP-over-InfiniBand needs padding (2 bytes) at present; should
 * RFC 3831 IPv6-over-Fibre Channel ever be implemented it may need a pad
 * of 2 as well.  All other types get 0.
 */
static inline int ndisc_addr_option_pad(unsigned short type)
{
        if (type == ARPHRD_INFINIBAND)
                return 2;

        return 0;
}

/*
 * Space taken by an address option that carries @addr_len address bytes
 * preceded by @pad padding bytes, as computed by NDISC_OPT_SPACE().
 */
static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad)
{
        int payload_len = addr_len + pad;

        return NDISC_OPT_SPACE(payload_len);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Total address-option space for @dev and a non-redirect @icmp6_type:
 * the space derived from the device's address length and type-specific
 * pad, plus whatever the device's opt_addr_space hook reports (0 if none).
 */
static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type)
{
        int pad = ndisc_addr_option_pad(dev->type);
        int space = __ndisc_opt_addr_space(dev->addr_len, pad);

        space += ndisc_ops_opt_addr_space(dev, icmp6_type);
        return space;
}

/*
 * Redirect flavour of ndisc_opt_addr_space(): the device-derived option
 * space plus the result of the device's opt_addr_space hook called for
 * NDISC_REDIRECT with @neigh, @ops_data_buf and @ops_data (0 if no hook).
 */
static inline int ndisc_redirect_opt_addr_space(struct net_device *dev,
                                                struct neighbour *neigh,
                                                u8 *ops_data_buf,
                                                u8 **ops_data)
{
        int pad = ndisc_addr_option_pad(dev->type);
        int space = __ndisc_opt_addr_space(dev->addr_len, pad);

        space += ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf,
                                                   ops_data);
        return space;
}
#endif

/*
 * Locate the link-layer address inside option @p.
 *
 * The option's nd_opt_len is scaled by 8 and must equal exactly the space
 * expected for @addr_len address bytes plus @prepad padding bytes;
 * otherwise NULL is returned.  On success the result points @prepad bytes
 * past the end of the option header.
 */
static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p,
                                        unsigned char addr_len, int prepad)
{
        int opt_bytes = p->nd_opt_len << 3;

        if (opt_bytes == __ndisc_opt_addr_space(addr_len, prepad))
                return (u8 *)(p + 1) + prepad;

        return NULL;
}

/*
 * Locate the link-layer address inside option @p using @dev's address
 * length and type-specific pad.  Returns NULL when the option length does
 * not match what the device expects.
 */
static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
                                      struct net_device *dev)
{
        int prepad = ndisc_addr_option_pad(dev->type);

        return __ndisc_opt_addr_data(p, dev->addr_len, prepad);
}

/*
 * Hash a key made of four 32-bit words, mixed with the device pointer.
 *
 * The first key word is XORed with hash32_ptr(@dev); each of the four
 * words is then multiplied by the matching @hash_rnd entry and the
 * products are summed with 32-bit wrap-around.
 */
static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd)
{
        const u32 *word = pkey;
        u32 hash;

        hash  = (word[0] ^ hash32_ptr(dev)) * hash_rnd[0];
        hash += word[1] * hash_rnd[1];
        hash += word[2] * hash_rnd[2];
        hash += word[3] * hash_rnd[3];

        return hash;
}

/*
 * Look up (@dev, @pkey) in nd_tbl using 128-bit key comparison and
 * ndisc_hashfn().  Returns whatever ___neigh_lookup_noref() returns; no
 * reference is taken here.
 */
static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey)
{
        struct neighbour *neigh;

        neigh = ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn,
                                      pkey, dev);
        return neigh;
}

/*
 * Same lookup as __ipv6_neigh_lookup_noref(), but the table is reached
 * through ipv6_stub->nd_tbl instead of referencing nd_tbl directly.
 */
static inline
struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev,
                                                 const void *pkey)
{
        struct neighbour *neigh;

        neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
                                      ndisc_hashfn, pkey, dev);
        return neigh;
}

/*
 * Look up (@dev, @pkey) and take a reference on the entry.
 *
 * The lookup runs inside rcu_read_lock_bh()/rcu_read_unlock_bh().  Returns
 * the entry with its refcnt incremented, or NULL when nothing was found or
 * the entry's refcnt had already dropped to zero.
 */
static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
{
        struct neighbour *found = NULL;
        struct neighbour *candidate;

        rcu_read_lock_bh();
        candidate = __ipv6_neigh_lookup_noref(dev, pkey);
        if (candidate && refcount_inc_not_zero(&candidate->refcnt))
                found = candidate;
        rcu_read_unlock_bh();

        return found;
}

/*
 * Stamp the neighbour entry for (@dev, @pkey), if one exists, with the
 * current jiffies value in ->confirmed.  Runs under rcu_read_lock_bh().
 */
static inline void __ipv6_confirm_neigh(struct net_device *dev,
                                        const void *pkey)
{
        struct neighbour *neigh;
        unsigned long stamp;

        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(dev, pkey);
        if (!neigh)
                goto unlock;

        stamp = jiffies;
        /* skip the store when the value is already current, to avoid dirtying neighbour */
        if (READ_ONCE(neigh->confirmed) != stamp)
                WRITE_ONCE(neigh->confirmed, stamp);
unlock:
        rcu_read_unlock_bh();
}

/*
 * Variant of __ipv6_confirm_neigh() that finds the entry through the
 * ipv6_stub-based lookup.  If an entry for (@dev, @pkey) exists, its
 * ->confirmed is set to the current jiffies.  Runs under rcu_read_lock_bh().
 */
static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
                                             const void *pkey)
{
        struct neighbour *neigh;
        unsigned long stamp;

        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref_stub(dev, pkey);
        if (!neigh)
                goto unlock;

        stamp = jiffies;
        /* skip the store when the value is already current, to avoid dirtying neighbour */
        if (READ_ONCE(neigh->confirmed) != stamp)
                WRITE_ONCE(neigh->confirmed, stamp);
unlock:
        rcu_read_unlock_bh();
}

/*
 * Find the neighbour entry for gateway @addr on @dev, creating one in
 * ipv6_stub->nd_tbl when the lookup misses.  Uses ipv6_stub and is meant
 * for use outside of IPv6 core.
 *
 * NOTE(review): no locking or reference is taken here; presumably the
 * caller provides the RCU protection the noref lookup needs. Confirm
 * against call sites.
 */
static inline struct neighbour *ip_neigh_gw6(struct net_device *dev,
                                             const void *addr)
{
        struct neighbour *neigh = __ipv6_neigh_lookup_noref_stub(dev, addr);

        if (likely(neigh))
                return neigh;

        return __neigh_create(ipv6_stub->nd_tbl, addr, dev, false);
}

/*
 * Everything below is declaration-only; the definitions are not part of
 * this header.
 */

/* ndisc set-up and tear-down entry points. */
int ndisc_init(void);
int ndisc_late_init(void);

void ndisc_late_cleanup(void);
void ndisc_cleanup(void);

/* Receive-side entry point; takes the incoming skb. */
int ndisc_rcv(struct sk_buff *skb);

/*
 * Senders.  Judging by the names these emit Neighbour Solicitation (ns),
 * Router Solicitation (rs), Neighbour Advertisement (na) and Redirect
 * messages. Confirm against the definitions.
 */
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
                   const struct in6_addr *daddr, const struct in6_addr *saddr,
                   u64 nonce);

void ndisc_send_rs(struct net_device *dev,
                   const struct in6_addr *saddr, const struct in6_addr *daddr);
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
                   const struct in6_addr *solicited_addr,
                   bool router, bool solicited, bool override, bool inc_opt);

void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target);

int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev,
                 int dir);

void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
                  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
                  struct ndisc_options *ndopts);

/*
 *        IGMP
 */
int igmp6_init(void);
int igmp6_late_init(void);

void igmp6_cleanup(void);
void igmp6_late_cleanup(void);

int igmp6_event_query(struct sk_buff *skb);

int igmp6_event_report(struct sk_buff *skb);


/* sysctl handlers; only declared when CONFIG_SYSCTL is set. */
#ifdef CONFIG_SYSCTL
int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write,
                               void *buffer, size_t *lenp, loff_t *ppos);
int ndisc_ifinfo_sysctl_strategy(struct ctl_table *ctl,
                                 void __user *oldval, size_t __user *oldlenp,
                                 void __user *newval, size_t newlen);
#endif

void inet6_ifinfo_notify(int event, struct inet6_dev *idev);

#endif




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_COOKIE_H
#define __LINUX_COOKIE_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <asm/local.h>

/*
 * Per-CPU state used by gen_cookie_next().
 */
struct pcpu_gen_cookie {
        /* incremented on entry to gen_cookie_next() and decremented on exit */
        local_t nesting;
        /* last value handed out by this CPU when nesting was 1 */
        u64 last;
} __aligned(16);

/*
 * A cookie generator: per-CPU state plus two shared 64-bit counters.
 */
struct gen_cookie {
        /* per-CPU nesting counter and last value, see struct pcpu_gen_cookie */
        struct pcpu_gen_cookie __percpu *local;
        /* advanced by COOKIE_LOCAL_BATCH each time a CPU claims a new batch */
        atomic64_t forward_last ____cacheline_aligned_in_smp;
        /* decremented for each value handed out while nesting is not 1 */
        atomic64_t reverse_last;
};

/*
 * Number of values a CPU claims from forward_last at a time.
 * gen_cookie_next() masks with (COOKIE_LOCAL_BATCH - 1), so this is
 * expected to stay a power of two.
 */
#define COOKIE_LOCAL_BATCH        4096

/*
 * Define a static generator called @name together with its per-CPU state
 * (__@name).  Both shared counters start at 0.
 */
#define DEFINE_COOKIE(name)                                                \
        static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name);        \
        static struct gen_cookie name = {                                \
                .local                = &__##name,                                \
                .forward_last        = ATOMIC64_INIT(0),                        \
                .reverse_last        = ATOMIC64_INIT(0),                        \
        }

/*
 * Return the next value from generator @gc.
 *
 * The per-CPU nesting counter is bumped for the duration of the call.
 * When this is the only active invocation on the CPU (nesting == 1) the
 * value comes from the per-CPU sequence: the previous value plus one,
 * after first claiming a fresh batch of COOKIE_LOCAL_BATCH values from
 * gc->forward_last whenever the previous value sits on a batch boundary
 * (SMP builds only).  Otherwise the value is obtained by decrementing
 * gc->reverse_last, leaving the per-CPU sequence untouched.
 *
 * NOTE(review): presumably the caller must keep the task on this CPU for
 * the duration of the call. Confirm at call sites.
 */
static __always_inline u64 gen_cookie_next(struct gen_cookie *gc)
{
        struct pcpu_gen_cookie *pcpu = this_cpu_ptr(gc->local);
        u64 cookie;

        if (unlikely(local_inc_return(&pcpu->nesting) != 1)) {
                /* nested invocation: draw from the shared reverse counter */
                cookie = atomic64_dec_return(&gc->reverse_last);
                goto out;
        }

        cookie = pcpu->last;
        if (__is_defined(CONFIG_SMP) &&
            unlikely(!(cookie & (COOKIE_LOCAL_BATCH - 1)))) {
                /* local batch exhausted (or never claimed): grab a new one */
                s64 batch_end = atomic64_add_return(COOKIE_LOCAL_BATCH,
                                                    &gc->forward_last);

                cookie = batch_end - COOKIE_LOCAL_BATCH;
        }
        cookie++;
        pcpu->last = cookie;
out:
        local_dec(&pcpu->nesting);
        return cookie;
}

#endif /* __LINUX_COOKIE_H */
















































































































































































    2 




























    1 
    1 












    1 

    1 
    1 

    1 













    1 





























    1 
    1 




    1 








    1 































    1 






    1 

















































































































    1 
















































    1 













    1 

    1 



    1 




































    1 

    1 








    1 




















































    1 



























































































































































































    1 



    1 



    1 
    1 













    1 

    1 
    1 



    1 






    1 





    1 






    1 
    1 











    1 

    1 











    1 




    1 







    1 






    1 




    1 

    1 

    1 

    1 





    1 





    1 


    1 


    1 





    1 









    1 
    1 



















    1 
    1 

    1 




    1 




    1 


    1 

    1 



















    3 






    3 




    3 






    1 
    1 

    1 


    1 







    3 

    1 
    3 




    3 













    3 














































































































































































































































































































































    1 


    1 


    1 



    1 



    1 
    1 



    1 

























    1 







    1 














































    1 
































































































































































































































































































































































































































    2 









































































































































































































































































    1 

















    1 

























    1 


    1 


    1 



    1 

    1 





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Hugh Dickins <hughd@google.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion in Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. Reservation space warning), and provide extent-level locking.
 * Delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, and its only goal was to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still
 * called delay extent tree in the first commit.  To better describe
 * what it does, it has been renamed to extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status are tracked by extent status tree.
 * Thus, we can first try to lookup a block mapping in this tree before
 * finding it in extent tree.  Hence, single extent cache can be removed
 * because extent status tree can do a better job.  Extents in status
 * tree are loaded on-demand.  Therefore, the extent status tree may not
 * contain all of the extents in a file.  Meanwhile we define a shrinker
 * to reclaim memory from extent status tree because fragmented extent
 * tree will make status tree cost too much memory.  written/unwritten/-
 * hole extents in the tree will be reclaimed by this shrinker when we
 * are under high memory pressure.  Delayed extents will not be
 * reclaimed because fiemap, bigalloc, and seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why we need to implement extent status tree?
 *
 * Without extent status tree, ext4 identifies a delayed extent by looking
 * up page cache, this has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they do without extent status tree.
 *   --        FIEMAP
 *        FIEMAP looks up page cache to identify delayed allocations from holes.
 *
 *   --        SEEK_HOLE/DATA
 *        SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --        bigalloc
 *        bigalloc looks up page cache to figure out if a block is
 *        already under delayed allocation or not to determine whether
 *        quota reserving is needed for the cluster.
 *
 *   --        writeout
 *        Writeout looks up whole page cache to see if a buffer is
 *        mapped, If there are not very many delayed buffers, then it is
 *        time consuming.
 *
 * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation(belonged to a delayed extent) or
 * not by searching the extent tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *   --        extent
 *        An extent is a range of blocks which are contiguous logically and
 *        physically.  Unlike an extent in the extent tree, this extent is
 *        an in-memory struct; there is no corresponding on-disk data.  There
 *        is no limit on the length of an extent, so an extent can contain as
 *        many blocks as are contiguous logically and physically.
 *
 *   --        extent status tree
 *        Every inode has an extent status tree and all allocation blocks
 *        are added to the tree with different status.  The extent in the
 *        tree are ordered by logical block no.
 *
 *   --        operations on a extent status tree
 *        There are three important operations on a delayed extent tree: find
 *        next extent, adding a extent(a range of blocks) and removing a extent.
 *
 *   --        race on a extent status tree
 *        Extent status tree is protected by inode->i_es_lock.
 *
 *   --        memory consumption
 *      Fragmented extent tree will make extent status tree cost too much
 *      memory.  Hence, we will reclaim written/unwritten/hole extents from
 *      the tree under a heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *   --        overhead
 *        1. There is a cache extent for write access, so if writes are
 *        not very random, adding space operations are in O(1) time.
 *
 *   --        gain
 *        2. Code is much simpler, more readable, more maintainable and
 *        more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */

/*
 * Slab caches: ext4_es_cachep backs struct extent_status objects and
 * ext4_pending_cachep backs struct pending_reservation objects.
 */
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;

/*
 * Forward declarations so these static helpers can be called ahead of
 * their definitions.
 */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei);
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc);

/*
 * Create the slab cache used for struct extent_status objects.
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_init_es(void)
{
        struct kmem_cache *cachep;

        cachep = kmem_cache_create("ext4_extent_status",
                                   sizeof(struct extent_status),
                                   0, SLAB_RECLAIM_ACCOUNT, NULL);
        ext4_es_cachep = cachep;

        return cachep ? 0 : -ENOMEM;
}

/* Destroy the extent_status slab cache set up by ext4_init_es(). */
void ext4_exit_es(void)
{
        kmem_cache_destroy(ext4_es_cachep);
}

/*
 * Reset @tree to the empty state: no cached extent and an empty rb-tree
 * root.
 */
void ext4_es_init_tree(struct ext4_es_tree *tree)
{
        tree->cache_es = NULL;
        tree->root = RB_ROOT;
}

#ifdef ES_DEBUG__
/*
 * Dump every extent in @inode's extent status tree to the kernel log at
 * debug level, in rb_first()/rb_next() order, as " [lblk/len) pblk status"
 * entries following a header naming the inode.
 *
 * The per-extent fragments and the terminating newline are emitted with
 * KERN_CONT so that they continue the header line; giving each fragment
 * its own KERN_DEBUG level would start a new log record per fragment.
 *
 * Compiles to nothing unless ES_DEBUG__ is defined.
 */
static void ext4_es_print_tree(struct inode *inode)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *node;

        printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
        for (node = rb_first(&tree->root); node; node = rb_next(node)) {
                struct extent_status *es;

                es = rb_entry(node, struct extent_status, rb_node);
                printk(KERN_CONT " [%u/%u) %llu %x",
                       es->es_lblk, es->es_len,
                       ext4_es_pblock(es), ext4_es_status(es));
        }
        printk(KERN_CONT "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

/*
 * Return the last logical block covered by @es (es_lblk + es_len - 1).
 * Hits BUG_ON() if es_lblk + es_len wraps around below es_lblk.
 */
static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
        ext4_lblk_t past_end = es->es_lblk + es->es_len;

        BUG_ON(past_end < es->es_lblk);
        return past_end - 1;
}

/*
 * Search the rb-tree rooted at @root for the extent that contains @lblk.
 * If no extent contains it, return the first extent that starts after
 * @lblk instead, or NULL when there is none (including an empty tree).
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
                                              ext4_lblk_t lblk)
{
        struct rb_node *cur = root->rb_node;
        struct extent_status *last = NULL;
        struct rb_node *succ;

        while (cur) {
                last = rb_entry(cur, struct extent_status, rb_node);
                if (lblk < last->es_lblk) {
                        cur = cur->rb_left;
                        continue;
                }
                if (lblk <= ext4_es_end(last))
                        return last;
                cur = cur->rb_right;
        }

        /* Empty tree: nothing to return. */
        if (!last)
                return NULL;

        /* The walk ended on an extent that starts after @lblk. */
        if (lblk < last->es_lblk)
                return last;

        /* The walk ended on an extent before @lblk: take its successor. */
        if (lblk > ext4_es_end(last)) {
                succ = rb_next(&last->rb_node);
                if (!succ)
                        return NULL;
                return rb_entry(succ, struct extent_status, rb_node);
        }

        return NULL;
}

/*
 * __es_find_extent_range - find the first extent accepted by @matching_fn
 *                          at or after @lblk in the extent status tree
 *
 * @inode - file containing the range
 * @matching_fn - predicate selecting extents with the desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - output: copy of the extent found, if any
 *
 * The search starts at the extent containing @lblk (taken from the tree's
 * cached extent when that covers @lblk, otherwise from a tree search) and
 * moves forward until @matching_fn accepts an extent, an extent starting
 * beyond @end is reached, or the tree is exhausted.  The first candidate
 * is not checked against @end, so a matching extent that starts beyond the
 * range may be returned.  On a match, es_lblk, es_len and es_pblk are
 * copied to @es and the extent becomes the tree's cached extent.  If
 * nothing matches, those three fields of @es are left as 0.
 */
static void __es_find_extent_range(struct inode *inode,
                                   int (*matching_fn)(struct extent_status *es),
                                   ext4_lblk_t lblk, ext4_lblk_t end,
                                   struct extent_status *es)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct extent_status *cand;
        struct rb_node *node;

        WARN_ON(es == NULL);
        WARN_ON(end < lblk);

        /* "not found" result until proven otherwise */
        es->es_lblk = es->es_len = es->es_pblk = 0;

        /* prefer the cached extent when it covers @lblk */
        cand = READ_ONCE(tree->cache_es);
        if (cand && in_range(lblk, cand->es_lblk, cand->es_len)) {
                es_debug("%u cached by [%u/%u) %llu %x\n",
                         lblk, cand->es_lblk, cand->es_len,
                         ext4_es_pblock(cand), ext4_es_status(cand));
        } else {
                cand = __es_tree_search(&tree->root, lblk);
        }

        /* step forward until something matches or we leave the range */
        if (cand && !matching_fn(cand)) {
                for (;;) {
                        node = rb_next(&cand->rb_node);
                        if (!node)
                                break;
                        cand = rb_entry(node, struct extent_status, rb_node);
                        if (cand->es_lblk > end) {
                                cand = NULL;
                                break;
                        }
                        if (matching_fn(cand))
                                break;
                }
        }

        if (cand && matching_fn(cand)) {
                WRITE_ONCE(tree->cache_es, cand);
                es->es_lblk = cand->es_lblk;
                es->es_len = cand->es_len;
                es->es_pblk = cand->es_pblk;
        }

}

/*
 * Externally callable wrapper around __es_find_extent_range(): runs the
 * search under the read side of i_es_lock and emits the enter/exit trace
 * events.  @es is zeroed first, and is left zeroed without searching when
 * the filesystem's mount state has EXT4_FC_REPLAY set.
 */
void ext4_es_find_extent_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end,
                               struct extent_status *es)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        es->es_lblk = es->es_len = es->es_pblk = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        trace_ext4_es_find_extent_range_enter(inode, lblk);

        read_lock(&ei->i_es_lock);
        __es_find_extent_range(inode, matching_fn, lblk, end, es);
        read_unlock(&ei->i_es_lock);

        trace_ext4_es_find_extent_range_exit(inode, es);
}

/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - predicate selecting extents with the desired status
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if the first matching extent reported by
 * __es_find_extent_range() either covers @start or begins within
 * [@start, @end], i.e. at least one block in the range has the requested
 * status; returns false otherwise.  Should only be called by code that
 * has taken i_es_lock.
 */
static bool __es_scan_range(struct inode *inode,
                            int (*matching_fn)(struct extent_status *es),
                            ext4_lblk_t start, ext4_lblk_t end)
{
        struct extent_status found;

        __es_find_extent_range(inode, matching_fn, start, end, &found);

        /* no matching extent in the tree */
        if (found.es_len == 0)
                return false;

        /* the matching extent covers @start */
        if (found.es_lblk <= start && start < found.es_lblk + found.es_len)
                return true;

        /* the matching extent begins inside [start, end] */
        if (start <= found.es_lblk && found.es_lblk <= end)
                return true;

        return false;
}
/*
 * Externally callable wrapper around __es_scan_range(): performs the scan
 * under the read side of i_es_lock.  Returns false without scanning when
 * the filesystem's mount state has EXT4_FC_REPLAY set.
 */
bool ext4_es_scan_range(struct inode *inode,
                        int (*matching_fn)(struct extent_status *es),
                        ext4_lblk_t lblk, ext4_lblk_t end)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        bool hit;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return false;

        read_lock(&ei->i_es_lock);
        hit = __es_scan_range(inode, matching_fn, lblk, end);
        read_unlock(&ei->i_es_lock);

        return hit;
}

/*
 * __es_scan_clu - search cluster for block with specified status in
 *                 extents status tree
 *
 * @inode - file containing the cluster
 * @matching_fn - predicate selecting extents with the desired status
 * @lblk - logical block in cluster to be searched
 *
 * Computes the first and last logical block of the cluster containing
 * @lblk (from EXT4_LBLK_CMASK() and s_cluster_ratio) and scans that range
 * with __es_scan_range().  Returns true if at least one block in the
 * cluster has the requested status, false if not.  Should only be called
 * by code that has taken i_es_lock.
 */
static bool __es_scan_clu(struct inode *inode,
                          int (*matching_fn)(struct extent_status *es),
                          ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t clu_first = EXT4_LBLK_CMASK(sbi, lblk);
        ext4_lblk_t clu_last = clu_first + sbi->s_cluster_ratio - 1;

        return __es_scan_range(inode, matching_fn, clu_first, clu_last);
}

/*
 * Externally callable wrapper around __es_scan_clu(): performs the scan
 * under the read side of i_es_lock.  Returns false without scanning when
 * the filesystem's mount state has EXT4_FC_REPLAY set.
 */
bool ext4_es_scan_clu(struct inode *inode,
                      int (*matching_fn)(struct extent_status *es),
                      ext4_lblk_t lblk)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        bool hit;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return false;

        read_lock(&ei->i_es_lock);
        hit = __es_scan_clu(inode, matching_fn, lblk);
        read_unlock(&ei->i_es_lock);

        return hit;
}

/*
 * Put @inode on the superblock's s_es_list (and bump s_es_nr_inode) if it
 * is not already there.  The list membership is tested once without the
 * lock as a quick exit and again under s_es_lock before modifying the
 * list.
 */
static void ext4_es_list_add(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!list_empty(&ei->i_es_list))
                return;

        spin_lock(&sbi->s_es_lock);
        if (!list_empty(&ei->i_es_list))
                goto out_unlock;

        list_add_tail(&ei->i_es_list, &sbi->s_es_list);
        sbi->s_es_nr_inode++;
out_unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Take @inode off the superblock's s_es_list, if it is on it, and drop
 * s_es_nr_inode accordingly; warns once if the counter goes negative.
 * Everything happens under s_es_lock.
 */
static void ext4_es_list_del(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        spin_lock(&sbi->s_es_lock);
        if (list_empty(&ei->i_es_list))
                goto out_unlock;

        list_del_init(&ei->i_es_list);
        sbi->s_es_nr_inode--;
        WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
out_unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Allocate a pending_reservation from ext4_pending_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise it is a plain (non-zeroed)
 * GFP_ATOMIC allocation that may return NULL.
 */
static inline struct pending_reservation *__alloc_pending(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_pending_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
}

/* Return @pr to the pending reservation slab cache (ext4_pending_cachep). */
static inline void __free_pending(struct pending_reservation *pr)
{
        kmem_cache_free(ext4_pending_cachep, pr);
}

/*
 * Returns true if we cannot fail to allocate memory for this extent_status
 * entry and cannot reclaim it until its status changes.  Currently that is
 * exactly the delayed extents, which fiemap, bigalloc, and seek_data/hole
 * need to use.
 */
static inline bool ext4_es_must_keep(struct extent_status *es)
{
        return ext4_es_is_delayed(es) ? true : false;
}

/*
 * Allocate an extent_status from ext4_es_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise it is a plain (non-zeroed)
 * GFP_ATOMIC allocation that may return NULL.
 */
static inline struct extent_status *__es_alloc_extent(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_es_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
}

/*
 * Fill in @es with @lblk/@len/@pblk and account for it in @inode's and
 * the superblock's extent status counters.
 *
 * Every extent is counted in i_es_all_nr / es_stats_all_cnt.  Extents
 * that are not must-keep are additionally counted in i_es_shk_nr /
 * es_stats_shk_cnt, and the inode is put on the s_es_list when its
 * i_es_shk_nr goes from zero to one.
 */
static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
                ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_es_stats *stats = &EXT4_SB(inode->i_sb)->s_es_stats;

        es->es_lblk = lblk;
        es->es_len = len;
        es->es_pblk = pblk;

        /* We never try to reclaim a must kept extent, so we don't count it. */
        if (!ext4_es_must_keep(es)) {
                if (ei->i_es_shk_nr++ == 0)
                        ext4_es_list_add(inode);
                percpu_counter_inc(&stats->es_stats_shk_cnt);
        }

        ei->i_es_all_nr++;
        percpu_counter_inc(&stats->es_stats_all_cnt);
}

/*
 * Return @es to the extent_status slab cache (ext4_es_cachep).  No
 * per-inode or per-superblock counters are touched here.
 */
static inline void __es_free_extent(struct extent_status *es)
{
        kmem_cache_free(ext4_es_cachep, es);
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
        EXT4_I(inode)->i_es_all_nr--;
        percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

        /* Decrease the shrink counter when we can reclaim the extent. */
        if (!ext4_es_must_keep(es)) {
                BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
                if (!--EXT4_I(inode)->i_es_shk_nr)
                        ext4_es_list_del(inode);
                percpu_counter_dec(&EXT4_SB(inode->i_sb)->
                                        s_es_stats.es_stats_shk_cnt);
        }

        __es_free_extent(es);
}

/*
 * Check whether or not two extents can be merged, with @es1 immediately
 * preceding @es2.
 *
 * Condition:
 *  - status (type) is equal
 *  - the combined length does not exceed EXT_MAX_BLOCKS (a violation is
 *    reported with a warning and treated as "cannot merge")
 *  - logical block number is contiguous
 *  - for written/unwritten extents, physical block number is contiguous
 *  - hole extents, and delayed extents without unwritten status, need no
 *    physical contiguity
 *
 * Returns 1 if the extents can be merged, 0 otherwise.
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
                                 struct extent_status *es2)
{
        if (ext4_es_type(es1) != ext4_es_type(es2))
                return 0;

        if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
                /*
                 * Lengths and the limit are unsigned block counts; print
                 * them with %u so large values are not shown as negative.
                 */
                pr_warn("ES assertion failed when merging extents. "
                        "The sum of lengths of es1 (%u) and es2 (%u) "
                        "is bigger than allowed file size (%u)\n",
                        es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
                WARN_ON(1);
                return 0;
        }

        /* widen to 64 bits so es_lblk + es_len cannot wrap */
        if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
                return 0;

        if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
            (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
                return 1;

        if (ext4_es_is_hole(es1))
                return 1;

        /* we need to check delayed extent is without unwritten status */
        if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
                return 1;

        return 0;
}

/*
 * Try to fold @es into its predecessor in @inode's extent status tree.
 *
 * If the previous extent exists and can be merged with @es, the previous
 * extent is extended by es->es_len, inherits the referenced flag if @es
 * had it, and @es is erased from the tree and freed.  Returns the extent
 * that now covers the range: the predecessor after a merge, otherwise
 * @es unchanged.
 */
static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *prev_node = rb_prev(&es->rb_node);
        struct extent_status *prev;

        if (!prev_node)
                return es;

        prev = rb_entry(prev_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(prev, es))
                return es;

        prev->es_len += es->es_len;
        if (ext4_es_is_referenced(es))
                ext4_es_set_referenced(prev);
        rb_erase(&es->rb_node, &tree->root);
        ext4_es_free_extent(inode, es);

        return prev;
}

/*
 * Try to absorb the successor of @es in @inode's extent status tree.
 *
 * If the next extent exists and can be merged with @es, @es is extended
 * by the successor's length, inherits the referenced flag if the
 * successor had it, and the successor is erased from the tree and freed.
 * Always returns @es.
 */
static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *next_node = rb_next(&es->rb_node);
        struct extent_status *next;

        if (!next_node)
                return es;

        next = rb_entry(next_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(es, next))
                return es;

        es->es_len += next->es_len;
        if (ext4_es_is_referenced(next))
                ext4_es_set_referenced(es);
        rb_erase(next_node, &tree->root);
        ext4_es_free_extent(inode, next);

        return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"        /* Needed when ES_AGGRESSIVE_TEST is defined */

/*
 * ES_AGGRESSIVE_TEST consistency check for extent-mapped inodes.
 *
 * Looks up es->es_lblk with ext4_find_extent() and compares @es against
 * the leaf extent found on that path (or against the absence of one).
 * Any disagreement is only reported with pr_warn(); there is no return
 * value.  If the lookup itself fails the check is silently skipped.
 */
static void ext4_es_insert_extent_ext_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        ext4_fsblk_t ee_start;
        unsigned short ee_len;
        int depth, ee_status, es_status;

        path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return;

        /* the extent (if any) sits at the deepest level of the path */
        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        if (ex) {

                ee_block = le32_to_cpu(ex->ee_block);
                ee_start = ext4_ext_pblock(ex);
                ee_len = ext4_ext_get_actual_len(ex);

                /* 1 == unwritten, 0 == written, for both sides */
                ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
                es_status = ext4_es_is_unwritten(es) ? 1 : 0;

                /*
                 * Make sure ex and es are not overlap when we try to insert
                 * a delayed/hole extent.
                 */
                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
                        if (in_range(es->es_lblk, ee_block, ee_len)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%d/%d/%llu/%c], but we "
                                        "want to add a delayed/hole extent "
                                        "[%d/%d/%llu/%x]\n",
                                        inode->i_ino, ee_block, ee_len,
                                        ee_start, ee_status ? 'u' : 'w',
                                        es->es_lblk, es->es_len,
                                        ext4_es_pblock(es), ext4_es_status(es));
                        }
                        goto out;
                }

                /*
                 * We don't check ee_block == es->es_lblk, etc. because es
                 * might be a part of whole extent, vice versa.
                 *
                 * What is checked: es must not start before the extent, and
                 * its physical block must sit at the same offset into the
                 * extent as its logical block does.
                 */
                if (es->es_lblk < ee_block ||
                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                        goto out;
                }

                /* written/unwritten state must agree */
                if (ee_status ^ es_status) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                }
        } else {
                /*
                 * We can't find an extent on disk.  So we need to make sure
                 * that we don't want to add an written/unwritten extent.
                 */
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %d but we want "
                                "to add a written/unwritten extent "
                                "[%d/%d/%llu/%x]\n", inode->i_ino,
                                es->es_lblk, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                }
        }
out:
        /* release the path obtained from ext4_find_extent() */
        ext4_ext_drop_refs(path);
        kfree(path);
}

/*
 * ES_AGGRESSIVE_TEST consistency check for inodes that are not
 * extent-mapped.
 *
 * Maps the range of @es with ext4_ind_map_blocks() (flags 0, NULL handle)
 * and compares the result against @es.  Any disagreement is only reported
 * with pr_warn(); there is no return value.  A positive result together
 * with an @es that is neither delayed, hole nor written is a BUG().
 * Negative return values from the mapping call are ignored.
 */
static void ext4_es_insert_extent_ind_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_map_blocks map;
        int retval;

        /*
         * Here we call ext4_ind_map_blocks to lookup a block mapping because
         * 'Indirect' structure is defined in indirect.c.  So we couldn't
         * access direct/indirect tree from outside.  It is too dirty to define
         * this function in indirect.c file.
         */

        map.m_lblk = es->es_lblk;
        map.m_len = es->es_len;

        retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
        if (retval > 0) {
                /* blocks are mapped */
                if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
                        /*
                         * We want to add a delayed/hole extent but this
                         * block has been allocated.
                         */
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
                                "delayed/hole extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                } else if (ext4_es_is_written(es)) {
                        /* mapped length and start block must match es */
                        if (retval != es->es_len) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu retval %d != es_len %d\n",
                                        inode->i_ino, retval, es->es_len);
                                return;
                        }
                        if (map.m_pblk != ext4_es_pblock(es)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu m_pblk %llu != "
                                        "es_pblk %llu\n",
                                        inode->i_ino, map.m_pblk,
                                        ext4_es_pblock(es));
                                return;
                        }
                } else {
                        /*
                         * We don't need to check unwritten extent because
                         * indirect-based file doesn't have it.
                         */
                        BUG();
                }
        } else if (retval == 0) {
                /* nothing is mapped, so a written extent is inconsistent */
                if (ext4_es_is_written(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
                                "a written extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                }
        }
}

/*
 * ES_AGGRESSIVE_TEST entry point: cross-check @es against the inode's
 * block mapping, using the extent-based check when the inode has
 * EXT4_INODE_EXTENTS set and the indirect-based check otherwise.
 */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
        /*
         * The caller is expected to hold i_data_sem, so there is no race
         * to worry about while comparing; enforce that expectation.
         */
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_es_insert_extent_ind_check(inode, es);
                return;
        }

        ext4_es_insert_extent_ext_check(inode, es);
}
#else
/* ES_AGGRESSIVE_TEST is not defined: the insert-time check is a no-op. */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
}
#endif

/*
 * __es_insert_extent - add @newes to @inode's extent status tree
 *
 * @inode - file whose tree is updated
 * @newes - description of the extent to add; its fields are copied, the
 *          object itself is never linked into the tree
 * @prealloc - optional preallocated entry to use instead of allocating one
 *
 * Descends the rb-tree ordered by logical block.  At each node visited,
 * if @newes can be merged with that node the node is grown in place (and
 * then merged with its own left or right neighbour where possible) and no
 * new entry is consumed.  Otherwise a new entry is initialised from
 * @newes and linked at the leaf position reached.  Finding a node that
 * overlaps the start of @newes is a BUG().  On success the node that now
 * covers @newes is stored in tree->cache_es.
 *
 * Returns 0 on success, or -ENOMEM if no @prealloc was supplied and the
 * allocation failed.
 */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node **p = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct extent_status *es;

        while (*p) {
                parent = *p;
                es = rb_entry(parent, struct extent_status, rb_node);

                if (newes->es_lblk < es->es_lblk) {
                        /* newes lies to the left; try to prepend it to es */
                        if (ext4_es_can_be_merged(newes, es)) {
                                /*
                                 * Here we can modify es_lblk directly
                                 * because it isn't overlapped.
                                 */
                                es->es_lblk = newes->es_lblk;
                                es->es_len += newes->es_len;
                                /* mapped extents now start at newes' pblk */
                                if (ext4_es_is_written(es) ||
                                    ext4_es_is_unwritten(es))
                                        ext4_es_store_pblock(es,
                                                             newes->es_pblk);
                                es = ext4_es_try_to_merge_left(inode, es);
                                goto out;
                        }
                        p = &(*p)->rb_left;
                } else if (newes->es_lblk > ext4_es_end(es)) {
                        /* newes lies to the right; try to append it to es */
                        if (ext4_es_can_be_merged(es, newes)) {
                                es->es_len += newes->es_len;
                                es = ext4_es_try_to_merge_right(inode, es);
                                goto out;
                        }
                        p = &(*p)->rb_right;
                } else {
                        /* newes starts inside an existing extent */
                        BUG();
                        return -EINVAL;
                }
        }

        /* no merge possible: link a fresh node at the leaf slot found */
        if (prealloc)
                es = prealloc;
        else
                es = __es_alloc_extent(false);
        if (!es)
                return -ENOMEM;
        ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
                            newes->es_pblk);

        rb_link_node(&es->rb_node, parent, p);
        rb_insert_color(&es->rb_node, &tree->root);

out:
        tree->cache_es = es;
        return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * @inode - file whose extent status tree is updated
 * @lblk - first logical block of the extent
 * @len - length of the extent in blocks; a zero length is a no-op
 * @pblk - physical block stored together with @status in the new entry
 * @status - EXTENT_STATUS_* bits for the new entry
 *
 * Under the write side of i_es_lock, any existing information for
 * [@lblk, @lblk + @len - 1] is removed and the new extent is inserted.
 * When the cluster ratio is greater than 1, delalloc is enabled and the
 * new extent is written or unwritten, __revise_pending() is called for
 * the range as well.
 *
 * The function does not report failure.  If a step fails, the lock is
 * dropped, the objects that step may need are preallocated with the
 * no-fail allocator, and the whole sequence is retried.  An -ENOMEM from
 * the insert step is ignored when the new extent is not a must-keep one.
 *
 * Nothing is done when the mount state has EXT4_FC_REPLAY set.
 */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                           ext4_lblk_t len, ext4_fsblk_t pblk,
                           unsigned int status)
{
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        int err1 = 0, err2 = 0, err3 = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        struct pending_reservation *pr = NULL;
        bool revise_pending = false;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, inode->i_ino);

        if (!len)
                return;

        /* the range must not wrap around the logical block space */
        BUG_ON(end < lblk);

        if ((status & EXTENT_STATUS_DELAYED) &&
            (status & EXTENT_STATUS_WRITTEN)) {
                ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
                                " delayed and written which can potentially "
                                " cause data loss.", lblk, len);
                WARN_ON(1);
        }

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_insert_extent(inode, &newes);

        ext4_es_insert_extent_check(inode, &newes);

        revise_pending = sbi->s_cluster_ratio > 1 &&
                         test_opt(inode->i_sb, DELALLOC) &&
                         (status & (EXTENT_STATUS_WRITTEN |
                                    EXTENT_STATUS_UNWRITTEN));
retry:
        /*
         * On a retry (some errN is set), preallocate outside the lock every
         * object that the failed step or a later step might need.  All err
         * values are 0 on the first pass, so nothing is preallocated then.
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if ((err1 || err2 || err3) && revise_pending && !pr)
                pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);

        /* step 1: drop whatever the tree currently holds for the range */
        err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                /* a still-zero es_len is taken to mean "not consumed" */
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        /* step 2: insert the new extent */
        err2 = __es_insert_extent(inode, &newes, es2);
        /* losing a reclaimable extent to -ENOMEM is tolerated */
        if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
                err2 = 0;
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        /* step 3: update pending reservations where applicable */
        if (revise_pending) {
                err3 = __revise_pending(inode, lblk, len, &pr);
                if (err3 != 0)
                        goto error;
                /* a non-NULL pr left here is freed as unused */
                if (pr) {
                        __free_pending(pr);
                        pr = NULL;
                }
        }
error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
        if (err1 || err2 || err3)
                goto retry;

        ext4_es_print_tree(inode);
        return;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 *
 * "Already" means that the first extent at or after @lblk starts no
 * later than the end of the range; in that case the tree is left alone.
 * The result of the insert is not checked.  Nothing is done when the
 * mount state has EXT4_FC_REPLAY set or when @len is zero (the trace
 * event is still emitted for a zero length).
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len, ext4_fsblk_t pblk,
                          unsigned int status)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct extent_status newes;
        struct extent_status *first;
        ext4_lblk_t end = lblk + len - 1;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_cache_extent(inode, &newes);

        if (!len)
                return;

        BUG_ON(end < lblk);

        write_lock(&ei->i_es_lock);

        first = __es_tree_search(&ei->i_es_tree.root, lblk);
        if (!first || first->es_lblk > end)
                __es_insert_extent(inode, &newes, NULL);

        write_unlock(&ei->i_es_lock);
}

/*
 * ext4_es_lookup_extent() looks up an extent in extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * @inode - file to look in
 * @lblk - logical block to look up
 * @next_lblk - if non-NULL and an extent is found, receives the starting
 *              block of the following extent in the tree, or 0 if there
 *              is none
 * @es - output: es_lblk/es_len/es_pblk of the extent containing @lblk,
 *       or all zero when nothing is found
 *
 * The tree's cached extent is tried first, then the rb-tree is walked,
 * all under the read side of i_es_lock.  A hit marks the extent as
 * referenced and bumps the cache-hit counter; a miss bumps the cache-miss
 * counter.  Returns 0 immediately when the mount state has
 * EXT4_FC_REPLAY set.
 *
 * Return: 1 on found, 0 on not
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t *next_lblk,
                          struct extent_status *es)
{
        struct ext4_inode_info *ei;
        struct ext4_es_tree *tree;
        struct ext4_es_stats *stats;
        struct extent_status *cur = NULL;
        struct rb_node *node;
        int found = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        trace_ext4_es_lookup_extent_enter(inode, lblk);
        es_debug("lookup extent in block %u\n", lblk);

        ei = EXT4_I(inode);
        tree = &ei->i_es_tree;
        stats = &EXT4_SB(inode->i_sb)->s_es_stats;
        read_lock(&ei->i_es_lock);

        es->es_lblk = es->es_len = es->es_pblk = 0;

        /* find extent in cache firstly, then fall back to the tree */
        cur = READ_ONCE(tree->cache_es);
        if (cur && in_range(lblk, cur->es_lblk, cur->es_len)) {
                es_debug("%u cached by [%u/%u)\n",
                         lblk, cur->es_lblk, cur->es_len);
                found = 1;
        } else {
                node = tree->root.rb_node;
                while (node && !found) {
                        cur = rb_entry(node, struct extent_status, rb_node);
                        if (lblk < cur->es_lblk)
                                node = node->rb_left;
                        else if (lblk > ext4_es_end(cur))
                                node = node->rb_right;
                        else
                                found = 1;
                }
        }

        if (!found) {
                percpu_counter_inc(&stats->es_stats_cache_misses);
                goto unlock;
        }

        BUG_ON(!cur);
        es->es_lblk = cur->es_lblk;
        es->es_len = cur->es_len;
        es->es_pblk = cur->es_pblk;
        if (!ext4_es_is_referenced(cur))
                ext4_es_set_referenced(cur);
        percpu_counter_inc(&stats->es_stats_cache_hits);

        if (next_lblk) {
                node = rb_next(&cur->rb_node);
                if (node) {
                        cur = rb_entry(node, struct extent_status, rb_node);
                        *next_lblk = cur->es_lblk;
                } else {
                        *next_lblk = 0;
                }
        }

unlock:
        read_unlock(&ei->i_es_lock);

        trace_ext4_es_lookup_extent_exit(inode, es, found);
        return found;
}

/*
 * Bookkeeping shared by init_rsvd(), count_rsvd() and get_rsvd() while a
 * block range is being removed from the extent status tree.  Apart from
 * ndelonly, the fields are only initialized and used when the cluster
 * ratio is greater than 1 (bigalloc).
 */
struct rsvd_count {
        /* running tally of delonly blocks (clusters when bigalloc) */
        int ndelonly;
        /* true once first_do_lblk has been recorded */
        bool first_do_lblk_found;
        /* first block of the first delonly extent seen in the range */
        ext4_lblk_t first_do_lblk;
        /* last delonly block seen in the range so far */
        ext4_lblk_t last_do_lblk;
        /* extent containing the block to the left of the range, or NULL */
        struct extent_status *left_es;
        /* true while a partial cluster at the end of an extent is tracked */
        bool partial;
        /* logical cluster number of the tracked partial cluster */
        ext4_lblk_t lclu;
};

/*
 * init_rsvd - prepare the reserved count data ahead of removing a block
 *             range from a file's extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @es - pointer to first extent in range
 * @rc - pointer to reserved count data
 *
 * @es must not be NULL.
 */
static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
                      struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct rb_node *prev;

        rc->ndelonly = 0;

        /* everything below is bigalloc-only state */
        if (sbi->s_cluster_ratio <= 1)
                return;

        /* no delonly block seen yet and no partial cluster being tracked */
        rc->first_do_lblk_found = false;
        rc->partial = false;

        /*
         * remember the extent holding the block just left of the region
         * to be removed: either @es itself when the region starts inside
         * it, or else its predecessor in the tree, if there is one
         */
        if (lblk > es->es_lblk) {
                rc->left_es = es;
                return;
        }

        prev = rb_prev(&es->rb_node);
        if (prev)
                rc->left_es = rb_entry(prev, struct extent_status, rb_node);
        else
                rc->left_es = NULL;
}

/*
 * count_rsvd - add to the running tally in rsvd_count the clusters that
 *              contain delayed and not unwritten (delonly) blocks in a
 *              range within an extent
 *
 * @inode - file containing extent
 * @lblk - first block in range
 * @len - length of range in blocks
 * @es - pointer to extent containing clusters to be counted
 * @rc - pointer to reserved count data
 *
 * Partial clusters at the beginning and end of extents are tracked so that
 * a cluster spanning adjacent extents is not counted more than once.
 */
static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
                       struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t cur, last, whole;

        /* only delonly extents contribute to the tally */
        if (!ext4_es_is_delonly(es))
                return;

        WARN_ON(len <= 0);

        /* without bigalloc the tally is simply a block count */
        if (sbi->s_cluster_ratio == 1) {
                rc->ndelonly += (int) len;
                return;
        }

        /* bigalloc: clamp [lblk, lblk + len - 1] to the extent */
        cur = lblk;
        if (cur < es->es_lblk)
                cur = es->es_lblk;

        last = lblk + (ext4_lblk_t) len - 1;
        if (last > ext4_es_end(es))
                last = ext4_es_end(es);

        /* the first delonly block seen is recorded exactly once */
        if (!rc->first_do_lblk_found) {
                rc->first_do_lblk = cur;
                rc->first_do_lblk_found = true;
        }

        /* the last delonly block seen so far always moves forward */
        rc->last_do_lblk = last;

        /*
         * a partial cluster carried over from an earlier extent that this
         * extent does not continue is complete: count it and let go of it
         */
        if (rc->partial && (rc->lclu != EXT4_B2C(sbi, cur))) {
                rc->ndelonly++;
                rc->partial = false;
        }

        /*
         * the range starts inside a cluster and reaches that cluster's
         * last block: count the cluster and step past it
         */
        if (EXT4_LBLK_COFF(sbi, cur) != 0 &&
            last >= EXT4_LBLK_CFILL(sbi, cur)) {
                rc->ndelonly++;
                rc->partial = false;
                cur = EXT4_LBLK_CFILL(sbi, cur) + 1;
        }

        /* count the whole clusters that fit in what is left of the range */
        if ((cur + sbi->s_cluster_ratio - 1) <= last) {
                whole = (last - cur + 1) >> sbi->s_cluster_bits;
                rc->ndelonly += whole;
                cur += whole << sbi->s_cluster_bits;
        }

        /*
         * blocks left over at the end form a partial cluster; begin
         * tracking it unless one is being tracked already
         */
        if (!rc->partial && cur <= last) {
                rc->partial = true;
                rc->lclu = EXT4_B2C(sbi, cur);
        }
}

/*
 * __pr_tree_search - look up a pending cluster reservation
 *
 * @root - root of pending reservation tree
 * @lclu - logical cluster to search for
 *
 * Returns the pending reservation for cluster @lclu when the tree holds
 * one.  Otherwise returns the reservation for the next higher cluster, or
 * NULL when there is none.
 */
static struct pending_reservation *__pr_tree_search(struct rb_root *root,
                                                    ext4_lblk_t lclu)
{
        struct pending_reservation *last = NULL;
        struct rb_node *cur;

        for (cur = root->rb_node; cur; ) {
                last = rb_entry(cur, struct pending_reservation, rb_node);
                if (lclu == last->lclu)
                        return last;
                cur = (lclu < last->lclu) ? cur->rb_left : cur->rb_right;
        }

        /* empty tree */
        if (!last)
                return NULL;

        /* the descent ended on a reservation above @lclu: that's the next */
        if (lclu < last->lclu)
                return last;

        /* it ended on one below @lclu: the next, if any, is its successor */
        cur = rb_next(&last->rb_node);
        if (!cur)
                return NULL;
        return rb_entry(cur, struct pending_reservation, rb_node);
}

/*
 * get_rsvd - calculates and returns the number of cluster reservations to be
 *              released when removing a block range from the extent status tree
 *              and releases any pending reservations within the range
 *
 * @inode - file containing block range
 * @end - last block in range
 * @right_es - pointer to extent containing next block beyond end or NULL
 * @rc - pointer to reserved count data
 *
 * The number of reservations to be released is equal to the number of
 * clusters containing delayed and not unwritten (delonly) blocks within
 * the range, minus the number of clusters still containing delonly blocks
 * at the ends of the range, and minus the number of pending reservations
 * within the range.
 *
 * When the cluster ratio is 1 none of these adjustments are made and the
 * tally accumulated in @rc->ndelonly is returned unchanged.  Pending
 * reservations found in the searched cluster range are erased from the
 * inode's pending tree and freed as a side effect.
 */
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
                             struct extent_status *right_es,
                             struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct pending_reservation *pr;
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *node;
        ext4_lblk_t first_lclu, last_lclu;
        bool left_delonly, right_delonly, count_pending;
        struct extent_status *es;

        if (sbi->s_cluster_ratio > 1) {
                /* count any remaining partial cluster */
                if (rc->partial)
                        rc->ndelonly++;

                /* no delonly clusters in the range, so nothing to release */
                if (rc->ndelonly == 0)
                        return 0;

                first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
                last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);

                /*
                 * decrease the delonly count by the number of clusters at the
                 * ends of the range that still contain delonly blocks -
                 * these clusters still need to be reserved
                 */
                left_delonly = right_delonly = false;

                /*
                 * walk leftwards from the extent left of the range for as
                 * long as extents end at or beyond the block returned by
                 * EXT4_LBLK_CMASK() for the first delonly block (presumably
                 * the first block of its cluster - confirm against ext4.h),
                 * stopping at the first delonly extent found
                 */
                es = rc->left_es;
                while (es && ext4_es_end(es) >=
                       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
                        if (ext4_es_is_delonly(es)) {
                                rc->ndelonly--;
                                left_delonly = true;
                                break;
                        }
                        node = rb_prev(&es->rb_node);
                        if (!node)
                                break;
                        es = rb_entry(node, struct extent_status, rb_node);
                }
                /*
                 * the right end is skipped when the delonly blocks all lie
                 * in a single cluster that was already discounted on the
                 * left, so that cluster is not subtracted twice
                 */
                if (right_es && (!left_delonly || first_lclu != last_lclu)) {
                        /*
                         * start with right_es when it extends past end,
                         * otherwise with its successor in the tree
                         */
                        if (end < ext4_es_end(right_es)) {
                                es = right_es;
                        } else {
                                node = rb_next(&right_es->rb_node);
                                es = node ? rb_entry(node, struct extent_status,
                                                     rb_node) : NULL;
                        }
                        /*
                         * walk rightwards for as long as extents start at or
                         * before the block returned by EXT4_LBLK_CFILL() for
                         * the last delonly block (presumably the last block
                         * of its cluster - confirm against ext4.h)
                         */
                        while (es && es->es_lblk <=
                               EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
                                if (ext4_es_is_delonly(es)) {
                                        rc->ndelonly--;
                                        right_delonly = true;
                                        break;
                                }
                                node = rb_next(&es->rb_node);
                                if (!node)
                                        break;
                                es = rb_entry(node, struct extent_status,
                                              rb_node);
                        }
                }

                /*
                 * Determine the block range that should be searched for
                 * pending reservations, if any.  Clusters on the ends of the
                 * original removed range containing delonly blocks are
                 * excluded.  They've already been accounted for and it's not
                 * possible to determine if an associated pending reservation
                 * should be released with the information available in the
                 * extents status tree.
                 */
                if (first_lclu == last_lclu) {
                        /* single cluster: search it only if neither end hit */
                        if (left_delonly | right_delonly)
                                count_pending = false;
                        else
                                count_pending = true;
                } else {
                        /* trim the excluded end clusters off the search range */
                        if (left_delonly)
                                first_lclu++;
                        if (right_delonly)
                                last_lclu--;
                        if (first_lclu <= last_lclu)
                                count_pending = true;
                        else
                                count_pending = false;
                }

                /*
                 * a pending reservation found between first_lclu and last_lclu
                 * represents an allocated cluster that contained at least one
                 * delonly block, so the delonly total must be reduced by one
                 * for each pending reservation found and released
                 */
                if (count_pending) {
                        pr = __pr_tree_search(&tree->root, first_lclu);
                        while (pr && pr->lclu <= last_lclu) {
                                rc->ndelonly--;
                                /* fetch the successor before pr is erased */
                                node = rb_next(&pr->rb_node);
                                rb_erase(&pr->rb_node, &tree->root);
                                __free_pending(pr);
                                if (!node)
                                        break;
                                pr = rb_entry(node, struct pending_reservation,
                                              rb_node);
                        }
                }
        }
        return rc->ndelonly;
}


/*
 * __es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @end - last block in range
 * @reserved - number of cluster reservations released
 * @prealloc - pre-allocated es to avoid memory allocation failures
 *
 * If @reserved is not NULL and delayed allocation is enabled, counts
 * block/cluster reservations freed by removing range and if bigalloc
 * enabled cancels pending reservations as needed. Returns 0 on success,
 * error code on failure.
 *
 * @reserved is only written when the range overlapped at least one extent
 * and the removal ran to completion; on the early exits it is left as the
 * caller initialized it.
 */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *node;
        struct extent_status *es;
        struct extent_status orig_es;
        ext4_lblk_t len1, len2;
        ext4_fsblk_t block;
        int err = 0;
        bool count_reserved = true;
        struct rsvd_count rc;

        if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
                count_reserved = false;

        /* bail out when the search finds nothing starting at or before end */
        es = __es_tree_search(&tree->root, lblk);
        if (!es)
                goto out;
        if (es->es_lblk > end)
                goto out;

        /* Simply invalidate cache_es. */
        tree->cache_es = NULL;
        if (count_reserved)
                init_rsvd(inode, lblk, es, &rc);

        /* keep a copy of the first extent; es itself is modified below */
        orig_es.es_lblk = es->es_lblk;
        orig_es.es_len = es->es_len;
        orig_es.es_pblk = es->es_pblk;

        /*
         * len1 - blocks of es in front of the range, which must survive
         * len2 - blocks of es behind the range, which must survive
         */
        len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
        len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
        if (len1 > 0)
                es->es_len = len1;
        if (len2 > 0) {
                if (len1 > 0) {
                        /*
                         * the range sits strictly inside es: es has been
                         * shortened to the head, the tail goes in as a new
                         * extent
                         */
                        struct extent_status newes;

                        newes.es_lblk = end + 1;
                        newes.es_len = len2;
                        /*
                         * placeholder physical block, replaced below when
                         * the original extent is written or unwritten
                         */
                        block = 0x7FDEADBEEFULL;
                        if (ext4_es_is_written(&orig_es) ||
                            ext4_es_is_unwritten(&orig_es))
                                block = ext4_es_pblock(&orig_es) +
                                        orig_es.es_len - len2;
                        ext4_es_store_pblock_status(&newes, block,
                                                    ext4_es_status(&orig_es));
                        err = __es_insert_extent(inode, &newes, prealloc);
                        if (err) {
                                /*
                                 * a tail that need not be kept is simply
                                 * dropped: es stays shortened to the head
                                 * and success is reported
                                 */
                                if (!ext4_es_must_keep(&newes))
                                        return 0;

                                /* otherwise undo the shortening and fail */
                                es->es_lblk = orig_es.es_lblk;
                                es->es_len = orig_es.es_len;
                                goto out;
                        }
                } else {
                        /* the range covers the head of es: keep the tail */
                        es->es_lblk = end + 1;
                        es->es_len = len2;
                        if (ext4_es_is_written(es) ||
                            ext4_es_is_unwritten(es)) {
                                block = orig_es.es_pblk + orig_es.es_len - len2;
                                ext4_es_store_pblock(es, block);
                        }
                }
                /* the range ends inside the first extent, so we are done */
                if (count_reserved)
                        count_rsvd(inode, orig_es.es_lblk + len1,
                                   orig_es.es_len - len1 - len2, &orig_es, &rc);
                goto out_get_reserved;
        }

        if (len1 > 0) {
                /*
                 * the range starts inside es and runs to or past its end:
                 * the head is already kept, continue with the next extent
                 */
                if (count_reserved)
                        count_rsvd(inode, lblk, orig_es.es_len - len1,
                                   &orig_es, &rc);
                node = rb_next(&es->rb_node);
                if (node)
                        es = rb_entry(node, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        /* erase every extent that lies entirely within the range */
        while (es && ext4_es_end(es) <= end) {
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
                /* fetch the successor before es is erased and freed */
                node = rb_next(&es->rb_node);
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
                if (!node) {
                        es = NULL;
                        break;
                }
                es = rb_entry(node, struct extent_status, rb_node);
        }

        /* a last extent straddling end loses its head and keeps its tail */
        if (es && es->es_lblk < end + 1) {
                ext4_lblk_t orig_len = es->es_len;

                len1 = ext4_es_end(es) - end;
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, orig_len - len1,
                                   es, &rc);
                es->es_lblk = end + 1;
                es->es_len = len1;
                if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
                        block = es->es_pblk + orig_len - len1;
                        ext4_es_store_pblock(es, block);
                }
        }

out_get_reserved:
        /* es is passed as the extent right of the range (may be NULL) */
        if (count_reserved)
                *reserved = get_rsvd(inode, end, es, &rc);
out:
        return err;
}

/*
 * ext4_es_remove_extent - drop a block range from the extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @len - number of blocks to remove
 *
 * Reduces block/cluster reservation count and for bigalloc cancels pending
 * reservations as needed.  A failed removal is retried with a preallocated
 * extent until it succeeds, so the function always returns 0.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len)
{
        struct extent_status *prealloc = NULL;
        ext4_lblk_t end;
        int reserved = 0;
        int err = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        trace_ext4_es_remove_extent(inode, lblk, len);
        es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
                 lblk, len, inode->i_ino);

        /* an empty range has nothing to remove */
        if (!len)
                return 0;

        end = lblk + len - 1;
        BUG_ON(end < lblk);

        do {
                /* only a retry after a failure gets a preallocated extent */
                if (err && !prealloc)
                        prealloc = __es_alloc_extent(true);

                /*
                 * ext4_clear_inode() depends on us taking i_es_lock
                 * unconditionally so that we are sure __es_shrink() is done
                 * with the inode before it is reclaimed.
                 */
                write_lock(&EXT4_I(inode)->i_es_lock);
                err = __es_remove_extent(inode, lblk, end, &reserved,
                                         prealloc);
                /* Free preallocated extent if it didn't get used. */
                if (prealloc) {
                        if (!prealloc->es_len)
                                __es_free_extent(prealloc);
                        prealloc = NULL;
                }
                write_unlock(&EXT4_I(inode)->i_es_lock);
        } while (err);

        ext4_es_print_tree(inode);
        ext4_da_release_space(inode, reserved);
        return 0;
}

/*
 * __es_shrink - reclaim extents from the inodes on the sbi's s_es_list
 *
 * @sbi - super block info owning the list of inodes to scan
 * @nr_to_scan - upper bound on the number of extents to scan
 * @locked_ei - inode to leave out of the list walk, or NULL
 *
 * Walks the list at most twice; precached inodes are only considered on
 * the second pass.  If nothing could be reclaimed from the list and
 * @locked_ei is set, extents are reclaimed from @locked_ei itself.
 * Updates the scan time and shrunk statistics and returns the number of
 * extents reclaimed.
 */
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei)
{
        struct ext4_es_stats *es_stats = &sbi->s_es_stats;
        ktime_t start_time = ktime_get();
        struct ext4_inode_info *ei;
        u64 scan_time;
        int nr_to_walk;
        int nr_shrunk = 0;
        int retried = 0, nr_skipped = 0;

retry:
        spin_lock(&sbi->s_es_lock);
        for (nr_to_walk = sbi->s_es_nr_inode; nr_to_walk > 0; nr_to_walk--) {
                if (list_empty(&sbi->s_es_list)) {
                        spin_unlock(&sbi->s_es_lock);
                        goto out;
                }

                /* Take the head of the list and rotate it to the tail */
                ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
                                      i_es_list);
                list_move_tail(&ei->i_es_list, &sbi->s_es_list);

                /*
                 * Skip precached inodes on the first pass (they are only
                 * shrunk as a last resort), the inode the caller asked us
                 * to leave alone, and inodes whose i_es_lock is contended.
                 */
                if ((!retried &&
                     ext4_test_inode_state(&ei->vfs_inode,
                                           EXT4_STATE_EXT_PRECACHED)) ||
                    ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
                        nr_skipped++;
                        continue;
                }

                /*
                 * Holding i_es_lock keeps inode reclaim from freeing the
                 * inode under us, so the list lock can be dropped here.
                 */
                spin_unlock(&sbi->s_es_lock);

                nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
                write_unlock(&ei->i_es_lock);

                if (nr_to_scan <= 0)
                        goto out;
                spin_lock(&sbi->s_es_lock);
        }
        spin_unlock(&sbi->s_es_lock);

        /*
         * Nothing was reclaimed although inodes were skipped: walk the
         * list once more, this time including precached inodes.
         */
        if (!retried && nr_skipped && nr_shrunk == 0) {
                retried++;
                goto retry;
        }

        if (locked_ei && nr_shrunk == 0)
                nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
        scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /* weighted average: three parts history, one part this scan */
        if (likely(es_stats->es_stats_scan_time))
                es_stats->es_stats_scan_time =
                        (scan_time + es_stats->es_stats_scan_time * 3) / 4;
        else
                es_stats->es_stats_scan_time = scan_time;

        if (scan_time > es_stats->es_stats_max_scan_time)
                es_stats->es_stats_max_scan_time = scan_time;

        if (likely(es_stats->es_stats_shrunk))
                es_stats->es_stats_shrunk =
                        (nr_shrunk + es_stats->es_stats_shrunk * 3) / 4;
        else
                es_stats->es_stats_shrunk = nr_shrunk;

        trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
                             nr_skipped, retried);
        return nr_shrunk;
}

/*
 * Shrinker count callback: report the current value of the shk_cnt
 * statistic of the super block that embeds @shrink.
 */
static unsigned long ext4_es_count(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info,
                                                s_es_shrinker);
        unsigned long nr_objs;

        nr_objs = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr_objs);

        return nr_objs;
}

/*
 * Shrinker scan callback: run __es_shrink() for up to sc->nr_to_scan
 * extents on the super block that embeds @shrink and return the number of
 * extents it reclaimed.  The shk_cnt statistic is sampled before and after
 * for the tracepoints only.
 */
static unsigned long ext4_es_scan(struct shrinker *shrink,
                                  struct shrink_control *sc)
{
        struct ext4_sb_info *sbi;
        int nr_to_scan = sc->nr_to_scan;
        int nr_objs, nr_shrunk;

        sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);

        nr_objs = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, nr_objs);

        nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

        nr_objs = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, nr_objs);

        return nr_shrunk;
}

/*
 * ext4_seq_es_shrinker_info_show - seq_file show callback printing the
 *                                  extent status shrinker statistics
 *
 * @seq - seq_file whose private data is the super block
 * @v - iterator token; output is produced for SEQ_START_TOKEN only
 *
 * Always returns 0.
 */
int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
        struct ext4_es_stats *es_stats = &sbi->s_es_stats;
        struct ext4_inode_info *ei, *max = NULL;
        unsigned int inode_cnt = 0;
        unsigned long max_ino = 0;
        unsigned int max_all_nr = 0, max_shk_nr = 0;

        if (v != SEQ_START_TOKEN)
                return 0;

        /* here we just find an inode that has the max nr. of objects */
        spin_lock(&sbi->s_es_lock);
        list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
                inode_cnt++;
                if (!max || max->i_es_all_nr < ei->i_es_all_nr)
                        max = ei;
        }
        /*
         * Copy out what gets reported while s_es_lock is still held.  Once
         * the lock is dropped nothing here keeps the inode on the list, so
         * it may be reclaimed and dereferencing max would touch freed
         * memory.
         */
        if (max) {
                max_ino = max->vfs_inode.i_ino;
                max_all_nr = max->i_es_all_nr;
                max_shk_nr = max->i_es_shk_nr;
        }
        spin_unlock(&sbi->s_es_lock);

        seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
                   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
        seq_printf(seq, "  %lld/%lld cache hits/misses\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
        /* inode_cnt is unsigned, so it is printed with %u */
        if (inode_cnt)
                seq_printf(seq, "  %u inodes on list\n", inode_cnt);

        seq_printf(seq, "average:\n  %llu us scan time\n",
            div_u64(es_stats->es_stats_scan_time, 1000));
        seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
        /* a non-zero inode_cnt implies the snapshot above was taken */
        if (inode_cnt)
                seq_printf(seq,
                    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
                    "  %llu us max scan time\n",
                    max_ino, max_all_nr, max_shk_nr,
                    div_u64(es_stats->es_stats_max_scan_time, 1000));

        return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
        int err;

        /* Make sure we have enough bits for physical block number */
        BUILD_BUG_ON(ES_SHIFT < 48);
        INIT_LIST_HEAD(&sbi->s_es_list);
        sbi->s_es_nr_inode = 0;
        spin_lock_init(&sbi->s_es_lock);
        sbi->s_es_stats.es_stats_shrunk = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
                                  GFP_KERNEL);
        if (err)
                return err;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
                                  GFP_KERNEL);
        if (err)
                goto err1;
        sbi->s_es_stats.es_stats_scan_time = 0;
        sbi->s_es_stats.es_stats_max_scan_time = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
        if (err)
                goto err2;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
        if (err)
                goto err3;

        sbi->s_es_shrinker.scan_objects = ext4_es_scan;
        sbi->s_es_shrinker.count_objects = ext4_es_count;
        sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
        err = register_shrinker(&sbi->s_es_shrinker);
        if (err)
                goto err4;

        return 0;
err4:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err3:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
err2:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
err1:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
        return err;
}

/*
 * ext4_es_unregister_shrinker - tear down the extent status shrinker state
 *
 * @sbi - super block info whose shrinker and statistics are released
 *
 * The shrinker is unregistered first so that its callbacks, which read the
 * percpu counters, can no longer be invoked by the time the counters are
 * destroyed.  The counters are then destroyed in the reverse order of
 * their initialization in ext4_es_register_shrinker().
 */
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
        unregister_shrinker(&sbi->s_es_shrinker);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
}

/*
 * Reclaim extents of the given inode, starting at ei->i_es_shrink_lblk and
 * going no further than @end.  At most *nr_to_scan extents are looked at
 * and *nr_to_scan is decremented for each of them.
 *
 * Returns 0 when the end of the tree or of the interval was reached and 1
 * when *nr_to_scan ran out first.  *nr_shrunk is incremented for every
 * extent reclaimed, and ei->i_es_shrink_lblk is updated to the position
 * the next scan should resume from.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
                                 int *nr_to_scan, int *nr_shrunk)
{
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct inode *inode = &ei->vfs_inode;
        struct extent_status *es;
        struct rb_node *next;

        es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);

        while (es && *nr_to_scan > 0) {
                /* walked past the interval: resume right behind it */
                if (es->es_lblk > end) {
                        ei->i_es_shrink_lblk = end + 1;
                        return 0;
                }

                (*nr_to_scan)--;
                /* look up the successor now, es may be freed below */
                next = rb_next(&es->rb_node);

                if (!ext4_es_must_keep(es)) {
                        if (ext4_es_is_referenced(es)) {
                                /* referenced extents get a second chance */
                                ext4_es_clear_referenced(es);
                        } else {
                                rb_erase(&es->rb_node, &tree->root);
                                ext4_es_free_extent(inode, es);
                                (*nr_shrunk)++;
                        }
                }

                es = next ? rb_entry(next, struct extent_status, rb_node) :
                            NULL;
        }

        /* ran off the end of the tree: wrap around to the beginning */
        if (!es) {
                ei->i_es_shrink_lblk = 0;
                return 0;
        }

        /* scan budget exhausted: continue from this extent next time */
        ei->i_es_shrink_lblk = es->es_lblk;
        return 1;
}

/*
 * Reclaim up to *nr_to_scan extents from @ei.  Scanning starts at
 * ei->i_es_shrink_lblk and, when the end of the tree is hit before the
 * scan budget runs out, wraps around to cover the blocks in front of the
 * starting point.  Invalidates the tree's cached extent and returns the
 * number of extents reclaimed.
 */
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
        struct inode *inode = &ei->vfs_inode;
        ext4_lblk_t start = ei->i_es_shrink_lblk;
        int nr_shrunk = 0;
        int exhausted;

        /* nothing reclaimable on this inode */
        if (ei->i_es_shk_nr == 0)
                return 0;

        if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
            __ratelimit(&_rs))
                ext4_warning(inode->i_sb, "forced shrink of precached extents");

        /* first pass: from the saved position to the end of the tree */
        exhausted = es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan,
                                          &nr_shrunk);
        /* second pass: whatever lies in front of the saved position */
        if (!exhausted && start != 0)
                es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);

        ei->i_es_tree.cache_es = NULL;
        return nr_shrunk;
}

/*
 * Backs EXT4_IOC_CLEAR_ES_CACHE.  Only discretionary entries can be
 * dropped from the extent status cache; entries that must be present for
 * proper operation are left in place.  Also clears the inode's
 * EXT4_STATE_EXT_PRECACHED state.
 */
void ext4_clear_inode_es(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct rb_node *cur, *next;
        struct extent_status *es;

        write_lock(&ei->i_es_lock);
        tree->cache_es = NULL;
        for (cur = rb_first(&tree->root); cur; cur = next) {
                /* grab the successor first, cur may be erased below */
                next = rb_next(cur);
                es = rb_entry(cur, struct extent_status, rb_node);
                if (ext4_es_must_keep(es))
                        continue;
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
        }
        ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
        write_unlock(&ei->i_es_lock);
}

#ifdef ES_DEBUG__
/*
 * Debug helper: print the logical cluster number of every pending
 * reservation of @inode, in tree order.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *cur;

        printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
        for (cur = rb_first(&tree->root); cur; cur = rb_next(cur)) {
                struct pending_reservation *pr =
                        rb_entry(cur, struct pending_reservation, rb_node);

                printk(KERN_DEBUG " %u", pr->lclu);
        }
        printk(KERN_DEBUG "\n");
}
#else
/* compiles away to nothing unless ES_DEBUG__ is defined */
#define ext4_print_pending_tree(inode)
#endif

/*
 * Create the slab cache that pending reservations are allocated from.
 * Returns 0 on success and -ENOMEM when the cache cannot be created.
 */
int __init ext4_init_pending(void)
{
        ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
                                        sizeof(struct pending_reservation),
                                        0, SLAB_RECLAIM_ACCOUNT, NULL);

        return ext4_pending_cachep ? 0 : -ENOMEM;
}

/* Destroy the pending reservation slab cache set up by ext4_init_pending(). */
void ext4_exit_pending(void)
{
        kmem_cache_destroy(ext4_pending_cachep);
}

/* Initialize @tree as an empty pending reservation tree. */
void ext4_init_pending_tree(struct ext4_pending_tree *tree)
{
        tree->root = RB_ROOT;
}

/*
 * __get_pending - look up a pending reservation by logical cluster
 *
 * @inode - file containing the pending cluster reservation
 * @lclu - logical cluster of interest
 *
 * Returns the pending reservation for @lclu when the inode's pending tree
 * contains one, and NULL otherwise.  Must be called holding i_es_lock.
 */
static struct pending_reservation *__get_pending(struct inode *inode,
                                                 ext4_lblk_t lclu)
{
        struct rb_node *cur = EXT4_I(inode)->i_pending_tree.root.rb_node;

        while (cur) {
                struct pending_reservation *pr =
                        rb_entry(cur, struct pending_reservation, rb_node);

                if (lclu == pr->lclu)
                        return pr;
                cur = (lclu < pr->lclu) ? cur->rb_left : cur->rb_right;
        }
        return NULL;
}

/*
 * __insert_pending - add a pending cluster reservation to the set of
 *                    pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 * @prealloc - address of a preallocated pending entry pointer; if *prealloc
 *             is non-NULL that entry is consumed for the insertion and
 *             *prealloc is set to NULL, otherwise a new entry is allocated
 *
 * Returns 0 on successful insertion and -ENOMEM if a new entry had to be
 * allocated and the allocation failed.  If a reservation for the cluster
 * is already in the set, returns 0 without touching *prealloc.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
                            struct pending_reservation **prealloc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node **link = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct pending_reservation *pr;
        ext4_lblk_t lclu = EXT4_B2C(sbi, lblk);

        /* walk down to the empty slot where the new node belongs */
        while (*link) {
                parent = *link;
                pr = rb_entry(parent, struct pending_reservation, rb_node);

                /* the cluster already has a pending reservation */
                if (lclu == pr->lclu)
                        return 0;

                if (lclu < pr->lclu)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        if (likely(*prealloc == NULL)) {
                pr = __alloc_pending(false);
                if (!pr)
                        return -ENOMEM;
        } else {
                /* take ownership of the caller's preallocated entry */
                pr = *prealloc;
                *prealloc = NULL;
        }

        pr->lclu = lclu;
        rb_link_node(&pr->rb_node, parent, link);
        rb_insert_color(&pr->rb_node, &tree->root);

        return 0;
}

/*
 * __remove_pending - remove a pending cluster reservation from the set
 *                    of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Erases the reservation for the cluster containing @lblk from the tree
 * and frees it.  Does nothing if the cluster has no pending reservation.
 */
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct pending_reservation *victim;

        victim = __get_pending(inode, EXT4_B2C(sbi, lblk));
        if (!victim)
                return;

        rb_erase(&victim->rb_node, &EXT4_I(inode)->i_pending_tree.root);
        __free_pending(victim);
}

/*
 * ext4_remove_pending - remove a pending cluster reservation from the set
 *                       of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Wrapper around __remove_pending() for external callers: takes
 * i_es_lock for writing around the removal.
 */
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        write_lock(&EXT4_I(inode)->i_es_lock);
        __remove_pending(inode, lblk);
        write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_is_pending - determine whether a cluster has a pending reservation
 *                   on it
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster
 *
 * Performs the lookup under i_es_lock held for reading.  Returns true if
 * the set of pending reservations contains one for the cluster holding
 * @lblk, and false otherwise.
 */
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t lclu = EXT4_B2C(sbi, lblk);
        bool found;

        read_lock(&EXT4_I(inode)->i_es_lock);
        found = __get_pending(inode, lclu) != NULL;
        read_unlock(&EXT4_I(inode)->i_es_lock);

        return found;
}

/*
 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
 *                                tree, adding a pending reservation where
 *                                needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 * @allocated - indicates whether a physical cluster has been allocated for
 *              the logical cluster that contains the block
 *
 * Always returns 0.  A failure of any internal step is not propagated to
 * the caller; instead the whole remove/insert/pending sequence is retried
 * after preallocating the entries that the failed step, and the steps
 * following it, may need.
 */
int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                 bool allocated)
{
        struct extent_status newes;
        /* results of the remove, insert and pending steps, respectively */
        int err1 = 0, err2 = 0, err3 = 0;
        /* preallocated entries handed to the remove and insert steps */
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        /* preallocated entry handed to the pending reservation step */
        struct pending_reservation *pr = NULL;

        /* nothing is recorded while the mount state has EXT4_FC_REPLAY set */
        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
                 lblk, inode->i_ino);

        /* describe a single block at lblk with delayed status */
        newes.es_lblk = lblk;
        newes.es_len = 1;
        ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
        trace_ext4_es_insert_delayed_block(inode, &newes, allocated);

        ext4_es_insert_extent_check(inode, &newes);

retry:
        /*
         * On a retry, preallocate (before taking i_es_lock) whatever the
         * step that failed, and every step after it, could need.
         * NOTE(review): the "true" argument presumably requests an
         * allocation that cannot fail - confirm against
         * __es_alloc_extent() and __alloc_pending().
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if ((err1 || err2 || err3) && allocated && !pr)
                pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);

        /* step 1: drop whatever is currently recorded for lblk */
        err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        /* step 2: insert the new delayed extent */
        err2 = __es_insert_extent(inode, &newes, es2);
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        /* step 3: record a pending reservation when a cluster was allocated */
        if (allocated) {
                err3 = __insert_pending(inode, lblk, &pr);
                if (err3 != 0)
                        goto error;
                /* __insert_pending() clears pr when it consumes the entry */
                if (pr) {
                        __free_pending(pr);
                        pr = NULL;
                }
        }
error:
        /* reached on success as well; the lock is always dropped here */
        write_unlock(&EXT4_I(inode)->i_es_lock);
        if (err1 || err2 || err3)
                goto retry;

        ext4_es_print_tree(inode);
        ext4_print_pending_tree(inode);
        return 0;
}

/*
 * __es_delayed_clu - count number of clusters containing blocks that
 *                    are delayed only
 *
 * @inode - file containing block range
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns the number of clusters containing only delayed (not delayed
 * and unwritten) blocks in the range specified by @start and @end.  Any
 * cluster or part of a cluster within the range and containing a delayed
 * and not unwritten block within the range is counted as a whole cluster.
 * A cluster shared by two consecutive delayed-only extents is counted
 * once.
 */
static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
                                     ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct extent_status *es;
        struct rb_node *next;
        /* guaranteed to be unequal to any ext4_lblk_t value */
        unsigned long long prev_lclu = ~0ULL;
        unsigned int count = 0;

        for (es = __es_tree_search(&tree->root, start);
             es && es->es_lblk <= end;
             es = next ? rb_entry(next, struct extent_status, rb_node) : NULL) {
                ext4_lblk_t lo_blk, hi_blk, lo_clu, hi_clu;

                next = rb_next(&es->rb_node);
                if (!ext4_es_is_delonly(es))
                        continue;

                /* clamp the extent to the [start, end] window */
                lo_blk = (es->es_lblk <= start) ? start : es->es_lblk;
                hi_blk = (ext4_es_end(es) >= end) ? end : ext4_es_end(es);
                lo_clu = EXT4_B2C(sbi, lo_blk);
                hi_clu = EXT4_B2C(sbi, hi_blk);

                /*
                 * Skip the first cluster if the previous delayed-only
                 * extent already ended in it.
                 */
                count += hi_clu - lo_clu;
                if (lo_clu != prev_lclu)
                        count++;
                prev_lclu = hi_clu;
        }

        return count;
}

/*
 * ext4_es_delayed_clu - count number of clusters in a block range that
 *                       contain delayed-only blocks
 *
 * @inode - file containing block range
 * @lblk - logical block defining start of range
 * @len - number of blocks in range
 *
 * Locking for external use of __es_delayed_clu(): the count is taken with
 * i_es_lock held for reading.  Returns 0 for an empty range; warns if
 * @lblk + @len - 1 wraps around.
 */
unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len)
{
        ext4_lblk_t last;
        unsigned int nclu;

        if (!len)
                return 0;

        last = lblk + len - 1;
        WARN_ON(last < lblk);

        read_lock(&EXT4_I(inode)->i_es_lock);
        nclu = __es_delayed_clu(inode, lblk, last);
        read_unlock(&EXT4_I(inode)->i_es_lock);

        return nclu;
}

/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len  - length of range in blocks
 * @prealloc - preallocated pending entry
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status.  Must be called while holding i_es_lock.
 *
 * Returns 0 for an empty range, otherwise the result of the last
 * __insert_pending() call made (0 if none was made); a negative value
 * from __insert_pending() is returned immediately.
 */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t end = lblk + len - 1;
        ext4_lblk_t first, last;
        bool single_clu;
        bool f_del = false, l_del = false;
        int ret = 0;

        if (len == 0)
                return 0;

        /*
         * Two cases - block range within single cluster and block range
         * spanning two or more clusters.  Note that a cluster belonging
         * to a range starting and/or ending on a cluster boundary is treated
         * as if it does not contain a delayed extent.  The new range may
         * have allocated space for previously delayed blocks out to the
         * cluster boundary, requiring that any pre-existing pending
         * reservation be canceled.  Because this code only looks at blocks
         * outside the range, it should revise pending reservations
         * correctly even if the extent represented by the range can't be
         * inserted in the extents status tree due to ENOSPC.
         */
        single_clu = EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end);

        /* blocks of the first cluster that precede the range */
        first = EXT4_LBLK_CMASK(sbi, lblk);
        if (first != lblk)
                f_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                        first, lblk - 1);
        if (f_del) {
                ret = __insert_pending(inode, first, prealloc);
                /*
                 * In the single cluster case the reservation just made
                 * already covers the cluster; the tail is not examined.
                 */
                if (ret < 0 || single_clu)
                        return ret;
        } else if (!single_clu) {
                __remove_pending(inode, first);
        }

        /* blocks of the last cluster that follow the range */
        last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
        if (last != end)
                l_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                        end + 1, last);
        if (l_del)
                ret = __insert_pending(inode, last, prealloc);
        else
                __remove_pending(inode, last);

        return ret;
}




















































    1 


































































    1 










    1 




    1 


    1 







    1 


















    1 


    1 

    1 


    1 












    1 


    1 

    1 




    1 






    1 
    1 




    1 









































































    1 

    1 

    1 


























    1 

    1 




    1 
    1 








    1 




    1 


    1 


    1 














    1 





    1 
    1 



























    1 



























































































    1 









    1 
    1 














    1 
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

/*
 * Attach an io_cq to @rq for the current task's io_context and rq->q.
 *
 * The io_cq is looked up under q->queue_lock; if none exists one is
 * created with GFP_ATOMIC.  On success get_io_context() is called on the
 * io_cq's io_context and the io_cq is stored in rq->elv.icq.  If the task
 * has no io_context, or creation fails, @rq is left untouched.
 */
void blk_mq_sched_assign_ioc(struct request *rq)
{
        struct request_queue *q = rq->q;
        /* May not have an IO context if it's a passthrough request */
        struct io_context *ioc = current->io_context;
        struct io_cq *icq;

        if (!ioc)
                return;

        spin_lock_irq(&q->queue_lock);
        icq = ioc_lookup_icq(ioc, q);
        spin_unlock_irq(&q->queue_lock);

        if (!icq)
                icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
        if (!icq)
                return;

        get_io_context(icq->ioc);
        rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart.
 *
 * BLK_MQ_S_SCHED_RESTART is tested first and only set when it is not
 * already set in hctx->state.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

/*
 * Restart a hardware queue that was marked as needing one.
 *
 * Does nothing unless BLK_MQ_S_SCHED_RESTART is set in hctx->state;
 * otherwise clears the bit and runs the hardware queue asynchronously.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return;
        clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        /*
         * Order the clearing of SCHED_RESTART against the
         * list_empty_careful(&hctx->dispatch) check in
         * blk_mq_run_hw_queue().  This barrier pairs with the one in
         * blk_mq_dispatch_rq_list().  Without it, the dispatch code could
         * fail to see SCHED_RESTART while, at the same time, a request
         * newly added to hctx->dispatch is missed by the check in
         * blk_mq_run_hw_queue().
         */
        smp_mb();

        blk_mq_run_hw_queue(hctx, true);
}

/*
 * list_sort() comparison callback ordering requests by their mq_hctx
 * pointer.  Returns 1 when @a's hctx pointer is greater than @b's and 0
 * otherwise; @priv is unused.
 */
static int sched_rq_cmp(void *priv, const struct list_head *a,
                        const struct list_head *b)
{
        struct request *lhs = container_of(a, struct request, queuelist);
        struct request *rhs = container_of(b, struct request, queuelist);

        if (lhs->mq_hctx > rhs->mq_hctx)
                return 1;
        return 0;
}

static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
        struct blk_mq_hw_ctx *hctx =
                list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
        struct request *rq;
        LIST_HEAD(hctx_list);
        unsigned int count = 0;

        list_for_each_entry(rq, rq_list, queuelist) {
                if (rq->mq_hctx != hctx) {
                        list_cut_before(&hctx_list, rq_list, &rq->queuelist);
                        goto dispatch;
                }
                count++;
        }
        list_splice_tail_init(rq_list, &hctx_list);

dispatch:
        return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

/* Delay used when re-running hw queues after releasing an unused budget. */
#define BLK_MQ_BUDGET_DELAY        3                /* ms units */

/*
 * Pull a batch of requests from the I/O scheduler and dispatch them.
 *
 * At most one request is pulled when hctx->dispatch_busy is set, otherwise
 * up to the queue's nr_requests.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.  Otherwise
 * returns 1 if a dispatch call reported success and 0 if not.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        bool multi_hctxs = false, run_queue = false;
        bool dispatched = false, busy = false;
        unsigned int max_dispatch;
        LIST_HEAD(rq_list);
        /* number of requests collected on rq_list */
        int count = 0;

        if (hctx->dispatch_busy)
                max_dispatch = 1;
        else
                max_dispatch = hctx->queue->nr_requests;

        do {
                struct request *rq;

                /* has_work is optional; stop when it reports no work */
                if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
                        break;

                /* requests on hctx->dispatch take priority: report -EAGAIN */
                if (!list_empty_careful(&hctx->dispatch)) {
                        busy = true;
                        break;
                }

                if (!blk_mq_get_dispatch_budget(q))
                        break;

                rq = e->type->ops.dispatch_request(hctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(q);
                        /*
                         * We're releasing without dispatching. Holding the
                         * budget could have blocked any "hctx"s with the
                         * same queue and if we didn't dispatch then there's
                         * no guarantee anyone will kick the queue.  Kick it
                         * ourselves.
                         */
                        run_queue = true;
                        break;
                }

                /*
                 * Now this rq owns the budget which has to be released
                 * if this rq won't be queued to driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add_tail(&rq->queuelist, &rq_list);
                if (rq->mq_hctx != hctx)
                        multi_hctxs = true;
        } while (++count < max_dispatch);

        if (!count) {
                /* nothing collected: only kick the queues if a budget was wasted */
                if (run_queue)
                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
        } else if (multi_hctxs) {
                /*
                 * Requests from different hctx may be dequeued from some
                 * schedulers, such as bfq and deadline.
                 *
                 * Sort the requests in the list according to their hctx,
                 * dispatch batching requests from same hctx at a time.
                 */
                list_sort(NULL, &rq_list, sched_rq_cmp);
                do {
                        dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
                } while (!list_empty(&rq_list));
        } else {
                dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
        }

        if (busy)
                return -EAGAIN;
        return !!dispatched;
}

/*
 * Repeatedly dispatch batches from the I/O scheduler.
 *
 * Keeps calling __blk_mq_do_dispatch_sched() for as long as it returns 1.
 * If a reschedule is needed, or more than HZ jiffies have passed since
 * entry, the remaining work is handed to blk_mq_delay_run_hw_queue() with
 * a zero delay and the loop stops.
 *
 * Returns the result of the last __blk_mq_do_dispatch_sched() call.
 */
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        unsigned long deadline = jiffies + HZ;

        for (;;) {
                int rc = __blk_mq_do_dispatch_sched(hctx);

                if (rc != 1)
                        return rc;

                if (need_resched() || time_is_before_jiffies(deadline)) {
                        blk_mq_delay_run_hw_queue(hctx, 0);
                        return rc;
                }
        }
}

/*
 * Return the software queue that follows @ctx in hctx->ctxs, wrapping
 * around to the first entry after the last one (index hctx->nr_ctx - 1).
 */
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
                                          struct blk_mq_ctx *ctx)
{
        unsigned short next = ctx->index_hw[hctx->type] + 1;

        if (next == hctx->nr_ctx)
                next = 0;

        return hctx->ctxs[next];
}

/*
 * Dispatch requests one at a time from the software queues of @hctx,
 * starting at hctx->dispatch_from and advancing round-robin after each
 * dequeued request.  The loop continues for as long as
 * blk_mq_dispatch_rq_list() reports success; the software queue to resume
 * from is written back to hctx->dispatch_from on exit.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.  Returns 0
 * otherwise.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        LIST_HEAD(rq_list);
        struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
        int ret = 0;
        struct request *rq;

        do {
                if (!list_empty_careful(&hctx->dispatch)) {
                        ret = -EAGAIN;
                        break;
                }

                /* stop once no software queue has its bit set in ctx_map */
                if (!sbitmap_any_bit_set(&hctx->ctx_map))
                        break;

                if (!blk_mq_get_dispatch_budget(q))
                        break;

                rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(q);
                        /*
                         * We're releasing without dispatching. Holding the
                         * budget could have blocked any "hctx"s with the
                         * same queue and if we didn't dispatch then there's
                         * no guarantee anyone will kick the queue.  Kick it
                         * ourselves.
                         */
                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
                        break;
                }

                /*
                 * Now this rq owns the budget which has to be released
                 * if this rq won't be queued to driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add(&rq->queuelist, &rq_list);

                /* round robin for fair dispatch */
                ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

        /* rq is always set here: every path that skips the dequeue breaks out */
        } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

        WRITE_ONCE(hctx->dispatch_from, ctx);
        return ret;
}

/*
 * Run one dispatch pass on @hctx.
 *
 * Requests left over on hctx->dispatch are dispatched first; the
 * scheduler (or the software queues) is only consulted if there were no
 * leftovers or the leftovers were dispatched successfully.
 *
 * Returns the result of blk_mq_do_dispatch_sched() or
 * blk_mq_do_dispatch_ctx() when one of them is called, and 0 otherwise.
 */
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;
        const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
        LIST_HEAD(rq_list);

        /*
         * If we have previous entries on our dispatch list, grab them first for
         * more fair dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                /* recheck under the lock before splicing */
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         *
         * We want to dispatch from the scheduler if there was nothing
         * on the dispatch list or we were able to dispatch from the
         * dispatch list.
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
                if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
                        return 0;
                if (has_sched_dispatch)
                        return blk_mq_do_dispatch_sched(hctx);
                return blk_mq_do_dispatch_ctx(hctx);
        }

        if (has_sched_dispatch)
                return blk_mq_do_dispatch_sched(hctx);

        /* dequeue request one by one from sw queue if queue is busy */
        if (hctx->dispatch_busy)
                return blk_mq_do_dispatch_ctx(hctx);

        /* otherwise flush all busy software queues and dispatch in one go */
        blk_mq_flush_busy_ctxs(hctx, &rq_list);
        blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
        return 0;
}

/*
 * Dispatch requests for @hctx unless the hw queue is stopped or the
 * request queue is quiesced.  Increments hctx->run for every dispatch
 * attempt that passes that check.
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) ||
                     blk_queue_quiesced(hctx->queue)))
                return;

        hctx->run++;

        /*
         * A return of -EAGAIN is an indication that hctx->dispatch is not
         * empty and we must run again in order to avoid starving flushes.
         * Try once more inline; if that also returns -EAGAIN, hand over to
         * an asynchronous queue run.
         */
        if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN &&
            __blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
                blk_mq_run_hw_queue(hctx, true);
}

/*
 * Try to merge @bio into an already queued request.
 *
 * If the elevator provides a bio_merge hook, the decision is delegated to
 * it.  Otherwise a merge is attempted against the current software
 * queue's request list for the mapped hctx type, provided the hctx has
 * BLK_MQ_F_SHOULD_MERGE set and that list is not empty.
 *
 * Returns true if the bio was merged, false otherwise.
 */
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        struct elevator_queue *e = q->elevator;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        enum hctx_type type;
        bool merged = false;

        if (e && e->type->ops.bio_merge)
                return e->type->ops.bio_merge(q, bio, nr_segs);

        ctx = blk_mq_get_ctx(q);
        hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
        type = hctx->type;

        if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE))
                return false;
        if (list_empty_careful(&ctx->rq_lists[type]))
                return false;

        /* default per sw-queue merge */
        spin_lock(&ctx->lock);
        /*
         * Reverse check our software queue for entries that we could
         * potentially merge with. Currently includes a hand-wavy stop
         * count of 8, to not spend too much time checking for merges.
         */
        if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
                merged = true;
                ctx->rq_merged++;
        }
        spin_unlock(&ctx->lock);

        return merged;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

/*
 * Emit the block_rq_insert trace event for @rq.
 */
void blk_mq_sched_request_inserted(struct request *rq)
{
        trace_block_rq_insert(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

/*
 * Decide whether @rq must bypass the scheduler/software queues.
 *
 * Returns true for flush-sequence and passthrough requests.  For any
 * other request returns false and, when @has_sched is set, marks the
 * request RQF_SORTED.  @hctx is not used.
 */
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
                                       bool has_sched,
                                       struct request *rq)
{
        /*
         * dispatch flush and passthrough rq directly
         *
         * passthrough request has to be added to hctx->dispatch directly.
         * For some reason, device may be in one situation which can't
         * handle FS request, so STS_RESOURCE is always returned and the
         * FS request will be added to hctx->dispatch. However passthrough
         * request may be required at that time for fixing the problem. If
         * passthrough request is added to scheduler queue, there isn't any
         * chance to dispatch it given we prioritize requests in hctx->dispatch.
         */
        const bool bypass = (rq->rq_flags & RQF_FLUSH_SEQ) ||
                            blk_rq_is_passthrough(rq);

        if (!bypass && has_sched)
                rq->rq_flags |= RQF_SORTED;

        return bypass;
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));

        if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
                /*
                 * Firstly normal IO request is inserted to scheduler queue or
                 * sw queue, meantime we add flush request to dispatch queue(
                 * hctx->dispatch) directly and there is at most one in-flight
                 * flush request for each hw queue, so it doesn't matter to add
                 * flush request to tail or front of the dispatch queue.
                 *
                 * Secondly in case of NCQ, flush request belongs to non-NCQ
                 * command, and queueing it will fail when there is any
                 * in-flight normal IO request(NCQ command). When adding flush
                 * rq to the front of hctx->dispatch, it is easier to introduce
                 * extra time to flush rq's latency because of S_SCHED_RESTART
                 * compared with adding to the tail of dispatch queue, then
                 * chance of flush merge is increased, and less flush requests
                 * will be issued to controller. It is observed that ~10% time
                 * is saved in blktests block/004 on disk attached to AHCI/NCQ
                 * drive when adding flush rq to the front of hctx->dispatch.
                 *
                 * Simply queue flush rq to the front of hctx->dispatch so that
                 * intensive flush workloads can benefit in case of NCQ HW.
                 */
                at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
                blk_mq_request_bypass_insert(rq, at_head, false);
                goto run;
        }

        if (e && e->type->ops.insert_requests) {
                LIST_HEAD(list);

                list_add(&rq->queuelist, &list);
                e->type->ops.insert_requests(hctx, &list, at_head);
        } else {
                spin_lock(&ctx->lock);
                __blk_mq_insert_request(hctx, rq, at_head);
                spin_unlock(&ctx->lock);
        }

run:
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
}

/*
 * Insert a list of requests for @hctx.
 *
 * With an elevator that implements ->insert_requests() the whole list is
 * handed to it.  Otherwise the requests go to the software queue @ctx,
 * after an optional attempt to issue them directly.  The hardware queue is
 * run afterwards unless direct issue emptied the list.
 *
 * @run_queue_async: passed on to blk_mq_run_hw_queue(); when set, direct
 *                   issue is not attempted.
 */
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
                                  struct blk_mq_ctx *ctx,
                                  struct list_head *list, bool run_queue_async)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e;

        /*
         * This function is only called from flush plug context; hold one
         * usage counter so the queue cannot be released underneath us.
         */
        percpu_ref_get(&q->q_usage_counter);

        e = hctx->queue->elevator;
        if (e && e->type->ops.insert_requests) {
                e->type->ops.insert_requests(hctx, list, false);
                goto run_hw_queue;
        }

        /*
         * With the 'none' scheduler, try to issue the requests directly
         * when the hw queue is not busy; this may save one extra enqueue
         * and dequeue through the sw queue.
         */
        if (!hctx->dispatch_busy && !e && !run_queue_async) {
                blk_mq_try_issue_list_directly(hctx, list);
                if (list_empty(list))
                        goto put_ref;
        }
        blk_mq_insert_requests(hctx, ctx, list);

run_hw_queue:
        blk_mq_run_hw_queue(hctx, run_queue_async);
put_ref:
        percpu_ref_put(&q->q_usage_counter);
}

/*
 * Release the scheduler tags of @hctx, if it has any: first the requests,
 * then the tag map itself.  hctx->sched_tags is reset to NULL afterwards.
 */
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        /* BLK_MQ_F_TAG_HCTX_SHARED is masked out of the flags passed on */
        unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;

        if (!hctx->sched_tags)
                return;

        blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
        blk_mq_free_rq_map(hctx->sched_tags, flags);
        hctx->sched_tags = NULL;
}

/*
 * Allocate the scheduler tag map and its requests for @hctx, sized by
 * q->nr_requests.
 *
 * Returns 0 on success, -ENOMEM when the map cannot be allocated, or the
 * error from blk_mq_alloc_rqs(); in the latter case the map is freed again.
 */
static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        struct blk_mq_tag_set *set = q->tag_set;
        /* Clear HCTX_SHARED so tags are init'ed */
        unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
        int err;

        hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                               set->reserved_tags, flags);
        if (!hctx->sched_tags)
                return -ENOMEM;

        err = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
        if (!err)
                return 0;

        blk_mq_sched_free_tags(set, hctx, hctx_idx);
        return err;
}

/*
 * Called in the queue's release handler, when the tagset has gone away:
 * only the per-hctx tag maps are freed here, using the flags kept in each
 * hctx.
 */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                /* Clear HCTX_SHARED so tags are freed */
                unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;

                if (!hctx->sched_tags)
                        continue;

                blk_mq_free_rq_map(hctx->sched_tags, flags);
                hctx->sched_tags = NULL;
        }
}

/*
 * Set up elevator type @e on queue @q, or switch @q to "no scheduler" when
 * @e is NULL.
 *
 * Returns 0 on success, otherwise the error code of the step that failed
 * (tag allocation, ->init_sched() or ->init_hctx()).
 */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
        unsigned int i;
        int ret;

        /* No elevator: nr_requests simply follows the tag set's queue depth. */
        if (!e) {
                q->elevator = NULL;
                q->nr_requests = q->tag_set->queue_depth;
                return 0;
        }

        /*
         * Default to double of smaller one between hw queue_depth and 128,
         * since we don't split into sync/async like the old code did.
         * Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                   BLKDEV_MAX_RQ);

        /* One set of scheduler tags per hardware queue. */
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_sched_alloc_tags(q, hctx, i);
                if (ret)
                        goto err;
        }

        ret = e->ops.init_sched(q, e);
        if (ret)
                goto err;

        blk_mq_debugfs_register_sched(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (e->ops.init_hctx) {
                        ret = e->ops.init_hctx(hctx, i);
                        if (ret) {
                                /*
                                 * ->init_sched() already succeeded, so the
                                 * unwind goes through blk_mq_exit_sched()
                                 * and then drops a reference on the
                                 * elevator's kobject.
                                 * NOTE(review): q->elevator is presumably
                                 * set by ->init_sched() -- confirm.
                                 */
                                eq = q->elevator;
                                blk_mq_sched_free_requests(q);
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
                        }
                }
                blk_mq_debugfs_register_sched_hctx(q, hctx);
        }

        return 0;

err:
        /*
         * Reached when tag allocation or ->init_sched() failed: free the
         * scheduler requests and tag maps and leave the queue without an
         * elevator.
         */
        blk_mq_sched_free_requests(q);
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
        return ret;
}

/*
 * Free the scheduler requests of every hardware queue of @q that has
 * scheduler tags.  Called in either blk_queue_cleanup or elevator_switch;
 * the tagset is required for freeing requests.
 */
void blk_mq_sched_free_requests(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->sched_tags)
                        continue;

                blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
        }
}

/*
 * Detach elevator @e from @q: per-hctx teardown first (debugfs entry, then
 * ->exit_hctx() for hardware queues that carry sched_data), then the
 * queue-wide debugfs entry, ->exit_sched(), the scheduler tag maps and
 * finally q->elevator itself.
 */
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                blk_mq_debugfs_unregister_sched_hctx(hctx);

                /* nothing to undo without a hook or without sched_data */
                if (!e->type->ops.exit_hctx || !hctx->sched_data)
                        continue;

                e->type->ops.exit_hctx(hctx, i);
                hctx->sched_data = NULL;
        }

        blk_mq_debugfs_unregister_sched(q);

        if (e->type->ops.exit_sched)
                e->type->ops.exit_sched(e);

        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
}


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux NET3:        Internet Group Management Protocol  [IGMP]
 *
 *        Authors:
 *                Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 *        Extended to talk the BSD extended IGMP protocol of mrouted 3.6
 */
#ifndef _LINUX_IGMP_H
#define _LINUX_IGMP_H

#include <linux/skbuff.h>
#include <linux/timer.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/refcount.h>
#include <uapi/linux/igmp.h>

/*
 * Return the transport header of @skb viewed as a struct igmphdr.
 * This is a plain cast; no length or validity check is done here.
 */
static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb)
{
        return (struct igmphdr *)skb_transport_header(skb);
}

/*
 * Return the transport header of @skb viewed as a struct igmpv3_report.
 * This is a plain cast; no length or validity check is done here.
 */
static inline struct igmpv3_report *
                        igmpv3_report_hdr(const struct sk_buff *skb)
{
        return (struct igmpv3_report *)skb_transport_header(skb);
}

/*
 * Return the transport header of @skb viewed as a struct igmpv3_query.
 * This is a plain cast; no length or validity check is done here.
 */
static inline struct igmpv3_query *
                        igmpv3_query_hdr(const struct sk_buff *skb)
{
        return (struct igmpv3_query *)skb_transport_header(skb);
}

/*
 * Variable-length array of IPv4 addresses with a small header; the
 * allocation size for a given number of addresses is IP_SFLSIZE().
 * Referenced from struct ip_mc_socklist through its sflist pointer.
 */
struct ip_sf_socklist {
        /* presumably the number of slots allocated in sl_addr[] -- TODO confirm */
        unsigned int                sl_max;
        /* presumably the number of slots currently in use -- TODO confirm */
        unsigned int                sl_count;
        struct rcu_head                rcu;
        /* flexible array member; must stay last */
        __be32                        sl_addr[];
};

/*
 * Size in bytes of a struct ip_sf_socklist followed by @count addresses.
 * The multiplication is not checked for overflow here.
 */
#define IP_SFLSIZE(count)        (sizeof(struct ip_sf_socklist) + \
        (count) * sizeof(__be32))

#define IP_SFBLOCK        10        /* allocate this many at once */

/* ip_mc_socklist is a real list now. Speed is not a concern here;
   this list is never used in fast path code.
 */

/*
 * One entry of a singly linked, RCU-annotated list (next_rcu).
 * NOTE(review): presumably one entry per multicast group joined on a
 * socket -- confirm against the users of this struct.
 */
struct ip_mc_socklist {
        struct ip_mc_socklist __rcu *next_rcu;
        struct ip_mreqn                multi;
        unsigned int                sfmode;                /* MCAST_{INCLUDE,EXCLUDE} */
        /* source address list attached to this entry (RCU-annotated pointer) */
        struct ip_sf_socklist __rcu        *sflist;
        struct rcu_head                rcu;
};

/*
 * One address record in a singly linked list (sf_next); struct ip_mc_list
 * points at such lists through its sources and tomb members.
 */
struct ip_sf_list {
        struct ip_sf_list        *sf_next;
        unsigned long                sf_count[2];        /* include/exclude counts */
        __be32                        sf_inaddr;
        unsigned char                sf_gsresp;        /* include in g & s response? */
        unsigned char                sf_oldin;        /* change state */
        unsigned char                sf_crcount;        /* retrans. left to send */
};

/*
 * State kept for one multicast address (multiaddr) on one in_device
 * (interface).
 * NOTE(review): the meaning of the individual counters and flags below is
 * not visible in this header -- consult the IGMP implementation before
 * relying on any of them.
 */
struct ip_mc_list {
        struct in_device        *interface;
        __be32                        multiaddr;
        unsigned int                sfmode;
        /* heads of two struct ip_sf_list chains */
        struct ip_sf_list        *sources;
        struct ip_sf_list        *tomb;
        unsigned long                sfcount[2];
        /* same link, with and without the __rcu annotation */
        union {
                struct ip_mc_list *next;
                struct ip_mc_list __rcu *next_rcu;
        };
        struct ip_mc_list __rcu *next_hash;
        struct timer_list        timer;
        int                        users;
        refcount_t                refcnt;
        spinlock_t                lock;
        char                        tm_running;
        char                        reporter;
        char                        unsolicit_count;
        char                        loaded;
        unsigned char                gsquery;        /* check source marks? */
        unsigned char                crcount;
        struct rcu_head                rcu;
};

/*
 * V3 exponential field decoding.
 *
 * IGMPV3_MASK() keeps the low @nb bits of @value (all of @value when
 * @nb >= 32).
 *
 * IGMPV3_EXP() passes values below @thresh through unchanged.  Otherwise
 * the low @nbmant bits are a mantissa that gets an implicit leading one
 * bit, the following @nbexp bits are an exponent, and the result is
 *      (mantissa | 1 << nbmant) << (exponent + nbexp)
 *
 * Note that @value is expanded more than once.
 */
#define IGMPV3_MASK(value, nb) \
        ((nb) < 32 ? ((1 << (nb)) - 1) & (value) : (value))
#define IGMPV3_EXP(thresh, nbmant, nbexp, value) \
        ((value) >= (thresh) ? \
         (((1 << (nbmant)) | IGMPV3_MASK(value, nbmant)) << \
          ((nbexp) + IGMPV3_MASK((value) >> (nbmant), nbexp))) : \
         (value))

#define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value)
#define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value)

/*
 * Check that @len bytes may be pulled from @skb.
 *
 * Returns 0 when @len exceeds skb_transport_offset() + ip_transport_len();
 * otherwise returns whatever pskb_may_pull() reports for @len.
 */
static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len)
{
        if (skb_transport_offset(skb) + ip_transport_len(skb) >= len)
                return pskb_may_pull(skb, len);

        return 0;
}

extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto);
extern int igmp_rcv(struct sk_buff *);
extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
                                unsigned int mode);
extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
extern void ip_mc_drop_socket(struct sock *sk);
extern int ip_mc_source(int add, int omode, struct sock *sk,
                struct ip_mreq_source *mreqs, int ifindex);
extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex);
extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
                        sockptr_t optval, sockptr_t optlen);
extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
                        sockptr_t optval, size_t offset);
extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt,
                          int dif, int sdif);
extern void ip_mc_init_dev(struct in_device *);
extern void ip_mc_destroy_dev(struct in_device *);
extern void ip_mc_up(struct in_device *);
extern void ip_mc_down(struct in_device *);
extern void ip_mc_unmap(struct in_device *);
extern void ip_mc_remap(struct in_device *);
extern void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp);
/*
 * Convenience wrapper around __ip_mc_dec_group() that always passes
 * GFP_KERNEL as the gfp argument.
 */
static inline void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
{
        /*
         * Plain call instead of "return f();": ISO C does not permit a
         * return statement with an expression in a function returning void.
         */
        __ip_mc_dec_group(in_dev, addr, GFP_KERNEL);
}
extern void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
                              gfp_t gfp);
extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
int ip_mc_check_igmp(struct sk_buff *skb);

#endif









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
 */
#ifndef __IF_TUN_H
#define __IF_TUN_H

#include <uapi/linux/if_tun.h>
#include <uapi/linux/virtio_net.h>

/*
 * Tag kept in bit 0 of a pointer: set by tun_xdp_to_ptr(), tested by
 * tun_is_xdp_frame() and cleared again by tun_ptr_to_xdp().
 */
#define TUN_XDP_FLAG 0x1UL

/* NOTE(review): presumably the values carried in tun_msg_ctl.type -- confirm */
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR  2
struct tun_msg_ctl {
        unsigned short type;
        /* NOTE(review): presumably an element count that goes with @ptr -- confirm */
        unsigned short num;
        void *ptr;
};

/* A buffer length followed by a virtio-net header. */
struct tun_xdp_hdr {
        /* NOTE(review): units and what exactly is measured are not visible here */
        int buflen;
        struct virtio_net_hdr gso;
};

#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
/* Return true when @ptr carries the TUN_XDP_FLAG tag bit. */
static inline bool tun_is_xdp_frame(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        return (bits & TUN_XDP_FLAG) != 0;
}
/* Return @xdp as an untyped pointer with the TUN_XDP_FLAG tag bit set. */
static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
{
        unsigned long tagged = (unsigned long)xdp | TUN_XDP_FLAG;

        return (void *)tagged;
}
/* Return @ptr with the TUN_XDP_FLAG tag bit cleared, as an xdp_frame pointer. */
static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
{
        unsigned long untagged = (unsigned long)ptr & ~TUN_XDP_FLAG;

        return (void *)untagged;
}
void tun_ptr_free(void *ptr);
#else
#include <linux/err.h>
#include <linux/errno.h>
struct file;
struct socket;
/*
 * Stubs used when neither CONFIG_TUN nor CONFIG_TUN_MODULE is defined.
 * The two lookup helpers fail with ERR_PTR(-EINVAL); the pointer-tagging
 * helpers report "not an XDP frame" / NULL; tun_ptr_free() does nothing.
 */
static inline struct socket *tun_get_socket(struct file *f)
{
        return ERR_PTR(-EINVAL);
}
static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
{
        return ERR_PTR(-EINVAL);
}
static inline bool tun_is_xdp_frame(void *ptr)
{
        return false;
}
static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
{
        return NULL;
}
static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
{
        return NULL;
}
static inline void tun_ptr_free(void *ptr)
{
}
#endif /* CONFIG_TUN */
#endif /* __IF_TUN_H */











































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Descending-priority-sorted double-linked list
 *
 * (C) 2002-2003 Intel Corp
 * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
 *
 * 2001-2005 (c) MontaVista Software, Inc.
 * Daniel Walker <dwalker@mvista.com>
 *
 * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
 *
 * Simplifications of the original code by
 * Oleg Nesterov <oleg@tv-sign.ru>
 *
 * Based on simple lists (include/linux/list.h).
 *
 * This is a priority-sorted list of nodes; each node has a
 * priority from INT_MIN (highest) to INT_MAX (lowest).
 *
 * Addition is O(K), removal is O(1), change of priority of a node is
 * O(K) and K is the number of RT priority levels used in the system.
 * (1 <= K <= 99)
 *
 * This list is really a list of lists:
 *
 *  - The tier 1 list is the prio_list, different priority nodes.
 *
 *  - The tier 2 list is the node_list, serialized nodes.
 *
 * Simple ASCII art explanation:
 *
 * pl:prio_list (only for plist_node)
 * nl:node_list
 *   HEAD|             NODE(S)
 *       |
 *       ||------------------------------------|
 *       ||->|pl|<->|pl|<--------------->|pl|<-|
 *       |   |10|   |21|   |21|   |21|   |40|   (prio)
 *       |   |  |   |  |   |  |   |  |   |  |
 *       |   |  |   |  |   |  |   |  |   |  |
 * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
 * |-------------------------------------------|
 *
 * The nodes on the prio_list list are sorted by priority to simplify
 * the insertion of new nodes. There are no nodes with duplicate
 * priorities on the list.
 *
 * The nodes on the node_list are ordered by priority and can contain
 * entries which have the same priority. Those entries are ordered
 * FIFO
 *
 * Addition means: look for the prio_list node in the prio_list
 * for the priority of the node and insert it before the node_list
 * entry of the next prio_list node. If it is the first node of
 * that priority, add it to the prio_list in the right position and
 * insert it into the serialized node_list list
 *
 * Removal means remove it from the node_list and remove it from
 * the prio_list if the node_list list_head is non empty. In case
 * of removal from the prio_list it must be checked whether other
 * entries of the same priority are on the list or not. If there
 * is another entry of the same priority then this entry has to
 * replace the removed entry on the prio_list. If the entry which
 * is removed is the only entry of this priority then a simple
 * remove from both list is sufficient.
 *
 * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
 * is lowest priority.
 *
 * No locking is done, up to the caller.
 */
#ifndef _LINUX_PLIST_H_
#define _LINUX_PLIST_H_

#include <linux/kernel.h>
#include <linux/list.h>

/* Head of a plist. */
struct plist_head {
        /* head of the list that links every node through plist_node.node_list */
        struct list_head node_list;
};

/* One element of a plist; see the layout description at the top of this file. */
struct plist_node {
        /* priority of this node: INT_MIN is highest, INT_MAX is lowest */
        int                        prio;
        /* tier 1 list: nodes of different priority */
        struct list_head        prio_list;
        /* tier 2 list: all nodes, serialized */
        struct list_head        node_list;
};

/**
 * PLIST_HEAD_INIT - static struct plist_head initializer
 * @head:        struct plist_head variable name
 *
 * Expands to a brace-enclosed initializer, so it can only be used in a
 * declaration.
 */
#define PLIST_HEAD_INIT(head)                                \
{                                                        \
        .node_list = LIST_HEAD_INIT((head).node_list)        \
}

/**
 * PLIST_HEAD - declare and init plist_head
 * @head:        name for struct plist_head variable
 */
#define PLIST_HEAD(head) \
        struct plist_head head = PLIST_HEAD_INIT(head)

/**
 * PLIST_NODE_INIT - static struct plist_node initializer
 * @node:        struct plist_node variable name
 * @__prio:        initial node priority
 *
 * Both embedded list heads are initialized with LIST_HEAD_INIT() on
 * themselves.
 */
#define PLIST_NODE_INIT(node, __prio)                        \
{                                                        \
        .prio  = (__prio),                                \
        .prio_list = LIST_HEAD_INIT((node).prio_list),        \
        .node_list = LIST_HEAD_INIT((node).node_list),        \
}

/**
 * plist_head_init - dynamic struct plist_head initializer
 * @head:        &struct plist_head pointer
 *
 * Run-time counterpart of PLIST_HEAD_INIT(): initializes the embedded
 * node_list head with INIT_LIST_HEAD().
 */
static inline void
plist_head_init(struct plist_head *head)
{
        INIT_LIST_HEAD(&head->node_list);
}

/**
 * plist_node_init - Dynamic struct plist_node initializer
 * @node:        &struct plist_node pointer
 * @prio:        initial node priority
 *
 * Run-time counterpart of PLIST_NODE_INIT(): both embedded list heads are
 * initialized with INIT_LIST_HEAD() and the priority is stored.
 */
static inline void plist_node_init(struct plist_node *node, int prio)
{
        INIT_LIST_HEAD(&node->node_list);
        INIT_LIST_HEAD(&node->prio_list);
        node->prio = prio;
}

extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);

extern void plist_requeue(struct plist_node *node, struct plist_head *head);

/**
 * plist_for_each - iterate over the plist
 * @pos:        the struct plist_node * to use as a loop cursor
 * @head:        the &struct plist_head to iterate over
 */
#define plist_for_each(pos, head)        \
         list_for_each_entry(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_continue - continue iteration over the plist
 * @pos:        the struct plist_node * to use as a loop cursor
 * @head:        the &struct plist_head to iterate over
 *
 * Continue to iterate over plist, continuing after the current position.
 */
#define plist_for_each_continue(pos, head)        \
         list_for_each_entry_continue(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_safe - iterate safely over a plist of given type
 * @pos:        the struct plist_node * to use as a loop cursor
 * @n:        another struct plist_node * to use as temporary storage
 * @head:        the &struct plist_head to iterate over
 *
 * Iterate over a plist of given type, safe against removal of list entry.
 */
#define plist_for_each_safe(pos, n, head)        \
         list_for_each_entry_safe(pos, n, &(head)->node_list, node_list)

/**
 * plist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor
 * @head:        the &struct plist_head to iterate over
 * @mem:        the name of the struct plist_node member within the struct
 */
#define plist_for_each_entry(pos, head, mem)        \
         list_for_each_entry(pos, &(head)->node_list, mem.node_list)

/**
 * plist_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor
 * @head:        the &struct plist_head to iterate over
 * @m:                the name of the struct plist_node member within the struct
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 */
#define plist_for_each_entry_continue(pos, head, m)        \
        list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)

/**
 * plist_for_each_entry_safe - iterate safely over list of given type
 * @pos:        the type * to use as a loop cursor
 * @n:                another type * to use as temporary storage
 * @head:        the &struct plist_head to iterate over
 * @m:                the name of the struct plist_node member within the struct
 *
 * Iterate over list of given type, safe against removal of list entry.
 */
#define plist_for_each_entry_safe(pos, n, head, m)        \
        list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list)

/**
 * plist_head_empty - return !0 if a plist_head is empty
 * @head:        &struct plist_head pointer
 *
 * The result is that of list_empty() on the head's node_list.
 */
static inline int plist_head_empty(const struct plist_head *head)
{
        return list_empty(&head->node_list);
}

/**
 * plist_node_empty - return !0 if plist_node is not on a list
 * @node:        &struct plist_node pointer
 *
 * The result is that of list_empty() on the node's node_list; prio_list
 * is not looked at.
 */
static inline int plist_node_empty(const struct plist_node *node)
{
        return list_empty(&node->node_list);
}

/* All functions below assume the plist_head is not empty. */

/**
 * plist_first_entry - get the struct for the first entry
 * @head:        the &struct plist_head pointer
 * @type:        the type of the struct this is embedded in
 * @member:        the name of the struct plist_node member within the struct
 *
 * With CONFIG_DEBUG_PLIST an empty @head additionally triggers WARN_ON();
 * the container_of() result is computed either way.
 */
#ifdef CONFIG_DEBUG_PLIST
# define plist_first_entry(head, type, member)        \
({ \
        WARN_ON(plist_head_empty(head)); \
        container_of(plist_first(head), type, member); \
})
#else
# define plist_first_entry(head, type, member)        \
        container_of(plist_first(head), type, member)
#endif

/**
 * plist_last_entry - get the struct for the last entry
 * @head:        the &struct plist_head pointer
 * @type:        the type of the struct this is embedded in
 * @member:        the name of the struct plist_node member within the struct
 *
 * With CONFIG_DEBUG_PLIST an empty @head additionally triggers WARN_ON();
 * the container_of() result is computed either way.
 */
#ifdef CONFIG_DEBUG_PLIST
# define plist_last_entry(head, type, member)        \
({ \
        WARN_ON(plist_head_empty(head)); \
        container_of(plist_last(head), type, member); \
})
#else
# define plist_last_entry(head, type, member)        \
        container_of(plist_last(head), type, member)
#endif

/**
 * plist_next - get the next entry in list
 * @pos:        the struct plist_node * cursor
 *
 * Follows the node_list link; no check is made for having reached the
 * list head.
 */
#define plist_next(pos) \
        list_next_entry(pos, node_list)

/**
 * plist_prev - get the prev entry in list
 * @pos:        the struct plist_node * cursor
 *
 * Follows the node_list link; no check is made for having reached the
 * list head.
 */
#define plist_prev(pos) \
        list_prev_entry(pos, node_list)

/**
 * plist_first - return the first node (and thus, highest priority)
 * @head:        the &struct plist_head pointer
 *
 * Assumes the plist is _not_ empty.
 */
static inline struct plist_node *plist_first(const struct plist_head *head)
{
        struct list_head *first = head->node_list.next;

        return list_entry(first, struct plist_node, node_list);
}

/**
 * plist_last - return the last node (and thus, lowest priority)
 * @head:        the &struct plist_head pointer
 *
 * Assumes the plist is _not_ empty.
 */
static inline struct plist_node *plist_last(const struct plist_head *head)
{
        struct list_head *last = head->node_list.prev;

        return list_entry(last, struct plist_node, node_list);
}

#endif









































































    1 




    1 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * A flag that has not been defined by this point is given the value 0, so
 * it adds no bit to LEGACY_MAP_MASK and _calc_vm_trans() evaluates to 0
 * for it.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_SYNC is deliberately not part of this mask.  MAP_UNINITIALIZED,
 * MAP_32BIT, MAP_HUGE_2MB and MAP_HUGE_1GB contribute nothing when they
 * fell back to 0 above.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
                | MAP_PRIVATE \
                | MAP_FIXED \
                | MAP_ANONYMOUS \
                | MAP_DENYWRITE \
                | MAP_EXECUTABLE \
                | MAP_UNINITIALIZED \
                | MAP_GROWSDOWN \
                | MAP_LOCKED \
                | MAP_NORESERVE \
                | MAP_POPULATE \
                | MAP_NONBLOCK \
                | MAP_STACK \
                | MAP_HUGETLB \
                | MAP_32BIT \
                | MAP_HUGE_2MB \
                | MAP_HUGE_1GB)

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;

/*
 * Batch size handed to percpu_counter_add_batch() by vm_acct_memory().
 * On !CONFIG_SMP builds it is the constant 0 and mm_compute_batch() is an
 * empty stub.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

/*
 * Add @pages (which may be negative) to the vm_committed_as per-CPU
 * counter, using vm_committed_as_batch as the batch size.
 */
static inline void vm_acct_memory(long pages)
{
        percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
}

/* Inverse of vm_acct_memory(): accounts -@pages. */
static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * The fallbacks below add nothing: 0 for the two calc hooks and
 * __pgprot(0) for the page protection hook.
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(file, flags) 0
#endif

#ifndef arch_vm_get_page_prot
#define arch_vm_get_page_prot(vm_flags) __pgprot(0)
#endif

#ifndef arch_validate_prot
/*
 * Generic protection check, called from mprotect().  PROT_GROWSDOWN and
 * PROT_GROWSUP have already been masked out.  @addr is not used here.
 *
 * Returns true if @prot holds no bits other than PROT_READ, PROT_WRITE,
 * PROT_EXEC and PROT_SEM.
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        const unsigned long allowed = PROT_READ | PROT_WRITE | PROT_EXEC |
                                      PROT_SEM;

        return !(prot & ~allowed);
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid.  This generic version accepts
 * every value; @flags is not inspected.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro.  It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * but this version is faster.
 * ("bit1" and "bit2" must be single bits)
 *
 * The isolated bit is moved by multiplying or dividing with the ratio of
 * the two bit values.  If either bit is 0 the result is 0.
 */
#define _calc_vm_trans(x, bit1, bit2) \
  (((bit1) && (bit2)) ? \
   ((bit1) > (bit2) ? ((x) & (bit1)) / ((bit1) / (bit2)) \
    : ((x) & (bit1)) * ((bit2) / (bit1))) \
   : 0)

/*
 * Combine the mmap "prot" argument into "vm_flags" used internally:
 * PROT_READ, PROT_WRITE and PROT_EXEC are translated to VM_READ, VM_WRITE
 * and VM_EXEC, and whatever arch_calc_vm_prot_bits() yields for @prot and
 * @pkey is OR-ed in.
 */
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        unsigned long vm_flags = 0;

        vm_flags |= _calc_vm_trans(prot, PROT_READ,  VM_READ);
        vm_flags |= _calc_vm_trans(prot, PROT_WRITE, VM_WRITE);
        vm_flags |= _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC);
        vm_flags |= arch_calc_vm_prot_bits(prot, pkey);

        return vm_flags;
}

/*
 * Combine the mmap "flags" argument into "vm_flags" used internally:
 * MAP_GROWSDOWN, MAP_DENYWRITE, MAP_LOCKED and MAP_SYNC are translated to
 * their VM_* counterparts, and whatever arch_calc_vm_flag_bits() yields
 * for @file and @flags is OR-ed in.
 */
static inline unsigned long
calc_vm_flag_bits(struct file *file, unsigned long flags)
{
        unsigned long vm_flags = 0;

        vm_flags |= _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN);
        vm_flags |= _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE);
        vm_flags |= _calc_vm_trans(flags, MAP_LOCKED,    VM_LOCKED);
        vm_flags |= _calc_vm_trans(flags, MAP_SYNC,      VM_SYNC);
        vm_flags |= arch_calc_vm_flag_bits(file, flags);

        return vm_flags;
}

unsigned long vm_commit_limit(void);
#endif /* _LINUX_MMAN_H */

























































































































































    1 
    1 






    1 

    1 


















































































    1 






    1 





    1 































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2006 IBM Corporation
 *
 *  Author: Serge Hallyn <serue@us.ibm.com>
 *
 *  Jun 2006 - namespaces support
 *             OpenVZ, SWsoft Inc.
 *             Pavel Emelianov <xemul@openvz.org>
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/fs_struct.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>

static struct kmem_cache *nsproxy_cachep;

/*
 * Statically initialized nsproxy pointing at the init_* namespace objects.
 * It starts with a reference count of 1 and no mount namespace (mnt_ns is
 * NULL here).  Members that depend on a config option are only initialized
 * when that option is enabled.
 */
struct nsproxy init_nsproxy = {
        .count                        = ATOMIC_INIT(1),
        .uts_ns                        = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
        .ipc_ns                        = &init_ipc_ns,
#endif
        .mnt_ns                        = NULL,
        .pid_ns_for_children        = &init_pid_ns,
#ifdef CONFIG_NET
        .net_ns                        = &init_net,
#endif
#ifdef CONFIG_CGROUPS
        .cgroup_ns                = &init_cgroup_ns,
#endif
#ifdef CONFIG_TIME_NS
        /* Both time namespace slots start out on the same namespace. */
        .time_ns                = &init_time_ns,
        .time_ns_for_children        = &init_time_ns,
#endif
};

static inline struct nsproxy *create_nsproxy(void)
{
        struct nsproxy *nsproxy;

        nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
        if (nsproxy)
                atomic_set(&nsproxy->count, 1);
        return nsproxy;
}

/*
 * Create a new nsproxy and all of its associated namespaces.
 * Return the newly created nsproxy.  Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to task.
 *
 * @flags:   passed to each copy_*() helper
 * @tsk:     task whose current nsproxy supplies the source namespaces
 * @user_ns: user namespace handed to the copy_*() helpers
 * @new_fs:  fs_struct handed to copy_mnt_ns()
 *
 * Returns the new nsproxy, or an ERR_PTR(): -ENOMEM when the nsproxy itself
 * cannot be allocated, otherwise the error reported by the failing copy_*()
 * helper.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
        struct task_struct *tsk, struct user_namespace *user_ns,
        struct fs_struct *new_fs)
{
        struct nsproxy *new_nsp;
        int err;

        new_nsp = create_nsproxy();
        if (!new_nsp)
                return ERR_PTR(-ENOMEM);

        /*
         * Fill in the namespaces one by one.  Each failure jumps to the
         * label that releases everything obtained before it.
         */
        new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
        if (IS_ERR(new_nsp->mnt_ns)) {
                err = PTR_ERR(new_nsp->mnt_ns);
                goto out_ns;
        }

        new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
        if (IS_ERR(new_nsp->uts_ns)) {
                err = PTR_ERR(new_nsp->uts_ns);
                goto out_uts;
        }

        new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
        if (IS_ERR(new_nsp->ipc_ns)) {
                err = PTR_ERR(new_nsp->ipc_ns);
                goto out_ipc;
        }

        new_nsp->pid_ns_for_children =
                copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
        if (IS_ERR(new_nsp->pid_ns_for_children)) {
                err = PTR_ERR(new_nsp->pid_ns_for_children);
                goto out_pid;
        }

        new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
                                            tsk->nsproxy->cgroup_ns);
        if (IS_ERR(new_nsp->cgroup_ns)) {
                err = PTR_ERR(new_nsp->cgroup_ns);
                goto out_cgroup;
        }

        new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
        if (IS_ERR(new_nsp->net_ns)) {
                err = PTR_ERR(new_nsp->net_ns);
                goto out_net;
        }

        new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
                                        tsk->nsproxy->time_ns_for_children);
        if (IS_ERR(new_nsp->time_ns_for_children)) {
                err = PTR_ERR(new_nsp->time_ns_for_children);
                goto out_time;
        }
        /* time_ns is not copied: take a reference on the task's current one. */
        new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);

        return new_nsp;

        /*
         * Error unwinding: labels are in reverse acquisition order, so each
         * entry point releases only what was obtained before the failing step.
         */
out_time:
        put_net(new_nsp->net_ns);
out_net:
        put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
        if (new_nsp->pid_ns_for_children)
                put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
        if (new_nsp->ipc_ns)
                put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
        if (new_nsp->uts_ns)
                put_uts_ns(new_nsp->uts_ns);
out_uts:
        if (new_nsp->mnt_ns)
                put_mnt_ns(new_nsp->mnt_ns);
out_ns:
        kmem_cache_free(nsproxy_cachep, new_nsp);
        return ERR_PTR(err);
}

/*
 * Called from clone.  Gives @tsk its nsproxy: either another reference on
 * the one it already points at, or a freshly created one.
 *
 * If none of the CLONE_NEW* namespace flags is set and the existing
 * nsproxy's time_ns_for_children equals its time_ns, the existing nsproxy
 * is shared.  Requesting any new namespace needs CAP_SYS_ADMIN in the
 * task's user namespace.
 *
 * Returns 0 on success, -EPERM without the capability, -EINVAL for
 * CLONE_NEWIPC combined with CLONE_SYSVSEM, or the error from creating
 * the namespaces / timens_on_fork().
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
        const unsigned long new_ns_mask = CLONE_NEWNS | CLONE_NEWUTS |
                                          CLONE_NEWIPC | CLONE_NEWPID |
                                          CLONE_NEWNET | CLONE_NEWCGROUP |
                                          CLONE_NEWTIME;
        struct nsproxy *parent_nsp = tsk->nsproxy;
        struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
        struct nsproxy *child_nsp;
        int err;

        if (unlikely(flags & new_ns_mask)) {
                if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
        } else if (likely(parent_nsp->time_ns_for_children ==
                          parent_nsp->time_ns)) {
                /* Nothing to unshare: just share the existing nsproxy. */
                get_nsproxy(parent_nsp);
                return 0;
        }

        /*
         * A new ipc namespace makes the semaphore arrays of the old one
         * unreachable, so the undo list cannot be carried over.  Since
         * CLONE_SYSVSEM asks to share the undo list with the parent, the
         * two flags are mutually exclusive.
         */
        if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM))
                return -EINVAL;

        child_nsp = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
        if (IS_ERR(child_nsp))
                return PTR_ERR(child_nsp);

        err = timens_on_fork(child_nsp, tsk);
        if (err) {
                free_nsproxy(child_nsp);
                return err;
        }

        tsk->nsproxy = child_nsp;
        return 0;
}

/*
 * Drop the references held by @ns on its namespaces and return the nsproxy
 * itself to nsproxy_cachep.  This does not look at ns->count; callers in
 * this file invoke it once the count has reached zero or on a private,
 * never-published nsproxy.
 */
void free_nsproxy(struct nsproxy *ns)
{
        if (ns->mnt_ns)
                put_mnt_ns(ns->mnt_ns);
        if (ns->uts_ns)
                put_uts_ns(ns->uts_ns);
        if (ns->ipc_ns)
                put_ipc_ns(ns->ipc_ns);
        if (ns->pid_ns_for_children)
                put_pid_ns(ns->pid_ns_for_children);
        if (ns->time_ns)
                put_time_ns(ns->time_ns);
        if (ns->time_ns_for_children)
                put_time_ns(ns->time_ns_for_children);
        /*
         * NOTE(review): cgroup_ns and net_ns are put without a NULL check;
         * presumably the helpers cope with that or the pointers are never
         * NULL here - confirm against their definitions.
         */
        put_cgroup_ns(ns->cgroup_ns);
        put_net(ns->net_ns);
        kmem_cache_free(nsproxy_cachep, ns);
}

/*
 * Called from unshare.  Builds a new nsproxy for the current task according
 * to @unshare_flags and stores it in *@new_nsp.
 *
 * When no namespace flag is set, nothing is done and *@new_nsp is left
 * untouched.  Otherwise CAP_SYS_ADMIN is required in @new_cred's user
 * namespace (or the current one when @new_cred is NULL).  @new_fs, when
 * non-NULL, is used instead of current->fs.
 *
 * Returns 0 on success, -EPERM without the capability, or the error from
 * create_new_namespaces(); in the latter case *@new_nsp holds the ERR_PTR.
 */
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
        struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
        const unsigned long ns_mask = CLONE_NEWNS | CLONE_NEWUTS |
                                      CLONE_NEWIPC | CLONE_NEWNET |
                                      CLONE_NEWPID | CLONE_NEWCGROUP |
                                      CLONE_NEWTIME;
        struct user_namespace *user_ns;
        struct nsproxy *nsp;

        if (!(unshare_flags & ns_mask))
                return 0;

        if (new_cred)
                user_ns = new_cred->user_ns;
        else
                user_ns = current_user_ns();
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!new_fs)
                new_fs = current->fs;

        nsp = create_new_namespaces(unshare_flags, current, user_ns, new_fs);
        *new_nsp = nsp;

        return IS_ERR(nsp) ? PTR_ERR(nsp) : 0;
}

/*
 * Install @new as @p's nsproxy (under task_lock) and drop the task's
 * reference on the one it replaces, freeing it when that was the last
 * reference.  @new may be NULL.  May sleep.
 */
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
        struct nsproxy *old;

        might_sleep();

        task_lock(p);
        old = p->nsproxy;
        p->nsproxy = new;
        task_unlock(p);

        if (!old)
                return;

        if (atomic_dec_and_test(&old->count))
                free_nsproxy(old);
}

/*
 * Detach @p from its nsproxy by switching it to NULL; the old nsproxy's
 * reference is dropped by switch_task_namespaces().
 */
void exit_task_namespaces(struct task_struct *p)
{
        switch_task_namespaces(p, NULL);
}

/*
 * Validate the flags of a pidfd-based setns() call.
 *
 * @flags must be non-zero, must consist only of CLONE_NEW* namespace bits,
 * and must not name a namespace type whose config option is disabled.
 *
 * Returns 0 when acceptable, -EINVAL otherwise.
 */
static int check_setns_flags(unsigned long flags)
{
        const unsigned long known = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                                    CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
                                    CLONE_NEWPID | CLONE_NEWCGROUP;
        /* Namespace types compiled out of this kernel. */
        unsigned long disabled = 0;

        if (!flags || (flags & ~known))
                return -EINVAL;

#ifndef CONFIG_USER_NS
        disabled |= CLONE_NEWUSER;
#endif
#ifndef CONFIG_PID_NS
        disabled |= CLONE_NEWPID;
#endif
#ifndef CONFIG_UTS_NS
        disabled |= CLONE_NEWUTS;
#endif
#ifndef CONFIG_IPC_NS
        disabled |= CLONE_NEWIPC;
#endif
#ifndef CONFIG_CGROUPS
        disabled |= CLONE_NEWCGROUP;
#endif
#ifndef CONFIG_NET_NS
        disabled |= CLONE_NEWNET;
#endif
#ifndef CONFIG_TIME_NS
        disabled |= CLONE_NEWTIME;
#endif

        return (flags & disabled) ? -EINVAL : 0;
}

/*
 * Release whatever @nsset still owns, as described by nsset->flags:
 * the credentials when CLONE_NEWUSER is set, the temporary fs_struct when
 * one was created, and the nsproxy when it is still attached.
 */
static void put_nsset(struct nsset *nsset)
{
        const unsigned flags = nsset->flags;
        const unsigned mnt = flags & CLONE_NEWNS;
        const unsigned others = flags & ~CLONE_NEWNS;

        if (flags & CLONE_NEWUSER)
                put_cred(nsset_cred(nsset));

        /*
         * A private fs_struct copy only exists when the mount namespace
         * was requested together with at least one other namespace.
         */
        if (mnt && others && nsset->fs)
                free_fs_struct(nsset->fs);

        if (nsset->nsproxy)
                free_nsproxy(nsset->nsproxy);
}

/*
 * Set up @nsset for a setns() operation on the current task.
 *
 * @flags: CLONE_NEW* bits naming the namespaces to be switched
 * @nsset: caller-provided, zero-initialized set to fill in
 *
 * A private nsproxy is created from the current task's namespaces.  The
 * credentials are a prepared copy when CLONE_NEWUSER is requested, else the
 * current ones.  The fs_struct is the task's own when only the mount
 * namespace is switched, and a temporary copy when the mount namespace is
 * switched together with others.
 *
 * Returns 0 on success, the create_new_namespaces() error if the nsproxy
 * cannot be created, or -ENOMEM if credentials or fs_struct cannot be
 * allocated.  On failure everything acquired here has been released.
 */
static int prepare_nsset(unsigned flags, struct nsset *nsset)
{
        struct task_struct *me = current;

        nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
        if (IS_ERR(nsset->nsproxy))
                return PTR_ERR(nsset->nsproxy);

        if (flags & CLONE_NEWUSER)
                nsset->cred = prepare_creds();
        else
                nsset->cred = current_cred();
        /* nsset->flags is still 0 here, so put_nsset() only frees the nsproxy. */
        if (!nsset->cred)
                goto out;

        /*
         * Record the flags as soon as the credentials are valid.
         * put_nsset() decides what to release from nsset->flags, so leaving
         * them unset until the end would leak the prepared credentials if
         * copy_fs_struct() fails below.  A NULL nsset->fs is skipped by
         * put_nsset(), so publishing the flags before the fs_struct exists
         * is safe.
         */
        nsset->flags = flags;

        /* Only create a temporary copy of fs_struct if we really need to. */
        if (flags == CLONE_NEWNS) {
                nsset->fs = me->fs;
        } else if (flags & CLONE_NEWNS) {
                nsset->fs = copy_fs_struct(me->fs);
                if (!nsset->fs)
                        goto out;
        }

        return 0;

out:
        put_nsset(nsset);
        return -ENOMEM;
}

/*
 * Ask namespace @ns to install itself into @nsset through its
 * ns->ops->install() callback and hand back that callback's return value.
 */
static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
{
        return ns->ops->install(nsset, ns);
}

/*
 * This is the inverse operation to unshare().
 * Ordering is equivalent to the standard ordering used everywhere else
 * during unshare and process creation. The switch to the new set of
 * namespaces occurs at the point of no return after installation of
 * all requested namespaces was successful in commit_nsset().
 *
 * @nsset: prepared namespace set; nsset->flags selects what to install
 * @pid:   struct pid of the target task whose namespaces are joined
 *
 * Returns 0 on success, -ESRCH if the task (or its nsproxy / active pid
 * namespace) is gone, -EPERM if ptrace-style read access is denied, or the
 * first error returned by a namespace's install callback.
 */
static int validate_nsset(struct nsset *nsset, struct pid *pid)
{
        int ret = 0;
        unsigned flags = nsset->flags;
        struct user_namespace *user_ns = NULL;
        struct pid_namespace *pid_ns = NULL;
        struct nsproxy *nsp;
        struct task_struct *tsk;

        /* Take a "snapshot" of the target task's namespaces. */
        rcu_read_lock();
        tsk = pid_task(pid, PIDTYPE_PID);
        if (!tsk) {
                rcu_read_unlock();
                return -ESRCH;
        }

        if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                return -EPERM;
        }

        /* Pin the target's nsproxy while holding its task lock. */
        task_lock(tsk);
        nsp = tsk->nsproxy;
        if (nsp)
                get_nsproxy(nsp);
        task_unlock(tsk);
        if (!nsp) {
                rcu_read_unlock();
                return -ESRCH;
        }

#ifdef CONFIG_PID_NS
        if (flags & CLONE_NEWPID) {
                pid_ns = task_active_pid_ns(tsk);
                if (unlikely(!pid_ns)) {
                        /* "out" does not drop the RCU lock, so do it here. */
                        rcu_read_unlock();
                        ret = -ESRCH;
                        goto out;
                }
                get_pid_ns(pid_ns);
        }
#endif

#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER)
                user_ns = get_user_ns(__task_cred(tsk)->user_ns);
#endif
        rcu_read_unlock();

        /*
         * Install requested namespaces. The caller will have
         * verified earlier that the requested namespaces are
         * supported on this kernel. We don't report errors here
         * if a namespace is requested that isn't supported.
         */
#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER) {
                ret = validate_ns(nsset, &user_ns->ns);
                if (ret)
                        goto out;
        }
#endif

        if (flags & CLONE_NEWNS) {
                ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
                if (ret)
                        goto out;
        }

#ifdef CONFIG_UTS_NS
        if (flags & CLONE_NEWUTS) {
                ret = validate_ns(nsset, &nsp->uts_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_IPC_NS
        if (flags & CLONE_NEWIPC) {
                ret = validate_ns(nsset, &nsp->ipc_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_PID_NS
        if (flags & CLONE_NEWPID) {
                ret = validate_ns(nsset, &pid_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_CGROUPS
        if (flags & CLONE_NEWCGROUP) {
                ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_NET_NS
        if (flags & CLONE_NEWNET) {
                ret = validate_ns(nsset, &nsp->net_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_TIME_NS
        if (flags & CLONE_NEWTIME) {
                ret = validate_ns(nsset, &nsp->time_ns->ns);
                if (ret)
                        goto out;
        }
#endif

        /* Drop the snapshot references taken above, on success and failure. */
out:
        if (pid_ns)
                put_pid_ns(pid_ns);
        if (nsp)
                put_nsproxy(nsp);
        /* user_ns may still be NULL here when CLONE_NEWUSER was not requested. */
        put_user_ns(user_ns);

        return ret;
}

/*
 * Point of no return: make the namespaces collected in @nsset the current
 * task's namespaces.
 *
 * Only a handful of namespace types need extra work at this stage, and it
 * is small enough that a dedicated ns_common operation is not warranted
 * yet (unshare does the same thing).  Should a namespace need to do more
 * here, or should one of the helpers used below stop being exported, a
 * per-namespace commit handler in ns_common is the way to go.
 */
static void commit_nsset(struct nsset *nsset)
{
        const unsigned flags = nsset->flags;
        struct task_struct *self = current;

#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER) {
                /* The prepared credentials now belong to the task. */
                commit_creds(nsset_cred(nsset));
                nsset->cred = NULL;
        }
#endif

        /*
         * A temporary fs_struct was used only when the mount namespace was
         * switched together with other namespaces; copy its root and pwd
         * over to the task's real fs_struct.
         */
        if ((flags & CLONE_NEWNS) && flags != CLONE_NEWNS) {
                set_fs_root(self->fs, &nsset->fs->root);
                set_fs_pwd(self->fs, &nsset->fs->pwd);
        }

#ifdef CONFIG_IPC_NS
        if (flags & CLONE_NEWIPC)
                exit_sem(self);
#endif

#ifdef CONFIG_TIME_NS
        if (flags & CLONE_NEWTIME)
                timens_commit(self, nsset->nsproxy->time_ns);
#endif

        /* The nsproxy now belongs to the task as well. */
        switch_task_namespaces(self, nsset->nsproxy);
        nsset->nsproxy = NULL;
}

/*
 * setns() system call: move the calling task into the namespace(s) that
 * @fd refers to.
 *
 * @fd:    either a proc namespace file or a pidfd
 * @flags: for a proc namespace file, 0 or the type of that namespace;
 *         for a pidfd, the CLONE_NEW* set checked by check_setns_flags()
 *
 * Returns 0 on success, -EBADF if @fd is not an open file, -EINVAL if @fd
 * is neither kind of file or @flags is unacceptable, or the error from
 * preparing / validating the namespace set.
 */
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
        struct file *file;
        struct ns_common *ns = NULL;
        struct nsset nsset = {};
        int err = 0;

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (proc_ns_file(file)) {
                ns = get_proc_ns(file_inode(file));
                if (flags && (ns->ops->type != flags))
                        err = -EINVAL;
                /* From here on flags names exactly this namespace's type. */
                flags = ns->ops->type;
        } else if (!IS_ERR(pidfd_pid(file))) {
                err = check_setns_flags(flags);
        } else {
                err = -EINVAL;
        }
        if (err)
                goto out;

        err = prepare_nsset(flags, &nsset);
        if (err)
                goto out;

        if (proc_ns_file(file))
                err = validate_ns(&nsset, ns);
        else
                err = validate_nsset(&nsset, file->private_data);
        if (!err) {
                commit_nsset(&nsset);
                perf_event_namespaces(current);
        }
        /* Releases whatever commit_nsset() did not take over. */
        put_nsset(&nsset);
out:
        fput(file);
        return err;
}

/*
 * Create the slab cache backing struct nsproxy allocations.  The cache is
 * created with SLAB_PANIC, and the function always returns 0.
 */
int __init nsproxy_cache_init(void)
{
        nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
        return 0;
}












































































    5 
    5 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>

/*
 * When a non-root user runs a setuid-root binary in !secure(SECURE_NOROOT)
 * mode, capabilities are normally raised.  If the file also has fE set,
 * the intent is that only the file capabilities apply: the setuid-root bit
 * is then there either to change the uid (plausible) or to obtain full
 * privilege on a kernel without file capability support, so capabilities
 * are not raised in that case.
 *
 * Log a warning about such a file, at most once per boot.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        static int warned;

        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @ns:  The user namespace in which we need the capability
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h
 *
 * Determine whether the nominated task has the specified capability amongst
 * its effective set, returning 0 if it does, -ve if it does not.
 *
 * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
 * and has_capability() functions.  That is, it has the reverse semantics:
 * cap_has_capability() returns 0 when a task has a capability, but the
 * kernel's capable() and has_capability() returns 1 for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, unsigned int opts)
{
        struct user_namespace *ns = targ_ns;

        /* See if cred has the capability in the target user namespace
         * by examining the target user namespace and all of the target
         * user namespace's parents.
         */
        for (;;) {
                /* Do we have the necessary capabilities? */
                if (ns == cred->user_ns)
                        return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;

                /*
                 * If we're already at a lower level than we're looking for,
                 * we're done searching.
                 */
                if (ns->level <= cred->user_ns->level)
                        return -EPERM;

                /* 
                 * The owner of the user namespace in the parent of the
                 * user namespace has all caps.
                 */
                if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
                        return 0;

                /*
                 * If you have a capability in a parent user ns, then you have
                 * it over all children user namespaces as well.
                 */
                ns = ns->parent;
        }

        /* We never get here */
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * The decision depends only on CAP_SYS_TIME; @ts and @tz are not examined.
 * Returns 0 if permission is granted, -EPERM if it is denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * If we are in the same or an ancestor user_ns and have all the target
 * task's capabilities, then ptrace access is allowed.
 * If we have the ptrace capability to the target user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether a process may access another, returning 0 if permission
 * granted, -ve if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        const struct cred *tracer, *tracee;
        const kernel_cap_t *tracer_caps;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = current_cred();
        tracee = __task_cred(child);

        /* PTRACE_MODE_FSCREDS compares against pE, anything else against pP. */
        tracer_caps = (mode & PTRACE_MODE_FSCREDS) ? &tracer->cap_effective
                                                   : &tracer->cap_permitted;

        /*
         * Same user_ns and the target's pP is covered by the caller's set,
         * or else the caller holds CAP_SYS_PTRACE over the target's user_ns.
         * The capability check only runs when the first test fails.
         */
        if (tracer->user_ns == tracee->user_ns &&
            cap_issubset(tracee->cap_permitted, *tracer_caps))
                rc = 0;
        else if (ns_capable(tracee->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * If parent is in the same or an ancestor user_ns and has all current's
 * capabilities, then ptrace access is allowed.
 * If parent has the ptrace capability to current's user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether the nominated task is permitted to trace the current
 * process, returning 0 if permission is granted, -ve if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        const struct cred *tracer, *self;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = __task_cred(parent);
        self = current_cred();

        /*
         * Allowed when @parent shares our user_ns and its pP covers ours,
         * or, failing that, when @parent holds CAP_SYS_PTRACE over our
         * user_ns.
         */
        if (tracer->user_ns == self->user_ns &&
            cap_issubset(self->cap_permitted, tracer->cap_permitted))
                rc = 0;
        else if (has_ns_capability(parent, self->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * This function retrieves the capabilities of the nominated task and returns
 * them to the caller.
 */
int cap_capget(struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /*
         * Derived from kernel/capability.c:sys_capget.  All three sets are
         * copied out inside a single RCU read-side section.
         */
        rcu_read_lock();
        tcred = __task_cred(target);
        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;
        rcu_read_unlock();

        /* This hook always reports success. */
        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * This function validates and applies a proposed mass change to the current
 * process's capability sets.  The changes are made to the proposed new
 * credentials, and assuming no error, will be committed by the caller of LSM.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t ceiling;

        /*
         * When the inheritable set is capped, the requested pI must fit
         * inside (old pI | old pP); otherwise the caller is incapable of
         * using this inheritable set.
         */
        if (cap_inh_is_capped()) {
                ceiling = cap_combine(old->cap_inheritable,
                                      old->cap_permitted);
                if (!cap_issubset(*inheritable, ceiling))
                        return -EPERM;
        }

        /* No new pI capabilities outside (old pI | bounding set). */
        ceiling = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, ceiling))
                return -EPERM;

        /* The new Permitted set may not exceed the old one. */
        if (!cap_issubset(*permitted, old->cap_permitted))
                return -EPERM;

        /* The new Effective set must be a subset of the new Permitted set. */
        if (!cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /*
         * Drop every ambient bit that is no longer both permitted and
         * inheritable.
         */
        new->cap_ambient = cap_intersect(new->cap_ambient,
                                         cap_intersect(*permitted,
                                                       *inheritable));

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed, with the change marked ATTR_KILL_PRIV
 *
 * Determine whether a change marked ATTR_KILL_PRIV that is being applied to
 * an inode affects the security markings on that inode and, if it does,
 * whether inode_killpriv() should be invoked or the change rejected.
 *
 * Returns 1 if security.capability has a value, meaning inode_killpriv()
 * is required, 0 otherwise, meaning inode_killpriv() is not required.
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        /*
         * Probe security.capability with a NULL buffer of length 0; only a
         * strictly positive result is reported as "killpriv required".
         */
        int len = __vfs_getxattr(dentry, d_backing_inode(dentry),
                                 XATTR_NAME_CAPS, NULL, 0);

        return len > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 * @dentry: The inode/dentry to alter
 *
 * Erase the privilege-enhancing security markings on an inode.
 *
 * Returns 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct dentry *dentry)
{
        int rc = __vfs_removexattr(dentry, XATTR_NAME_CAPS);

        /* -EOPNOTSUPP is reported to the caller as success. */
        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Return true when @kroot maps to uid 0 in the current user namespace or in
 * any namespace reached by following ->parent from it, up to and including
 * init_user_ns.  An invalid @kroot never matches.
 */
static bool rootid_owns_currentns(kuid_t kroot)
{
        struct user_namespace *cursor;

        if (!uid_valid(kroot))
                return false;

        cursor = current_user_ns();
        while (from_kuid(cursor, kroot) != 0) {
                /* init_user_ns is the last namespace examined. */
                if (cursor == &init_user_ns)
                        return false;
                cursor = cursor->parent;
        }

        return true;
}

/* Return @m with the VFS_CAP_FLAGS_EFFECTIVE bit(s) cleared. */
static __u32 sansflags(__u32 m)
{
        const __u32 flag_bits = VFS_CAP_FLAGS_EFFECTIVE;

        return m & ~flag_bits;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_2 and the flag-stripped magic in
 * @cap is VFS_CAP_REVISION_2.  @cap is not read when the size is wrong.
 */
static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_3 and the flag-stripped magic in
 * @cap is VFS_CAP_REVISION_3.  @cap is not read when the size is wrong.
 */
static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 */
int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        /*
         * Returns the size of the (possibly converted) xattr value on
         * success, or a negative errno.  When @alloc is true, *@buffer is
         * set to a buffer holding that value; when @alloc is false only the
         * size is computed and *@buffer is left untouched.
         */
        int size, ret;
        kuid_t kroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        /* Only the "capability" suffix is handled here. */
        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;

        /* Read the stored value into tmpbuf; the dentry is only needed for this. */
        size = sizeof(struct vfs_ns_cap_data);
        ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
                                 &tmpbuf, size, GFP_NOFS);
        dput(dentry);

        /* Propagate the read result; out_free releases tmpbuf if it was set. */
        if (ret < 0 || !tmpbuf) {
                size = ret;
                goto out_free;
        }

        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        if (is_v2header((size_t) ret, cap)) {
                /* v2 carries no rootid: use id 0 in the filesystem's ns */
                root = 0;
        } else if (is_v3header((size_t) ret, cap)) {
                /* v3: nscap aliases tmpbuf from here on */
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), kroot);
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                /* carry the effective flag over to the new magic */
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /* use allocated v3 buffer */
                                /* (clearing tmpbuf keeps out_free from freeing it) */
                                tmpbuf = NULL;
                        }
                        /* report the rootid as seen from the current ns */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        /* rootid_owns_currentns() rejected kroot: fail with -EOVERFLOW */
        if (!rootid_owns_currentns(kroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        /* carry the effective flag over to the new magic */
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /* use unconverted v2 */
                        /* (clearing tmpbuf keeps out_free from freeing it) */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* tmpbuf is NULL here whenever it was handed out through *buffer */
        kfree(tmpbuf);
        return size;
}

/*
 * Build a kuid in @task_ns from the rootid stored in an xattr value.  Only a
 * value of size XATTR_CAPS_SZ_3 has its rootid field read; any other size
 * uses id 0.
 */
static kuid_t rootid_from_xattr(const void *value, size_t size,
                                struct user_namespace *task_ns)
{
        const struct vfs_ns_cap_data *v3 = value;
        uid_t id = (size == XATTR_CAPS_SZ_3) ? le32_to_cpu(v3->rootid) : 0;

        return make_kuid(task_ns, id);
}

/* Accept a value whose size and magic match either the v2 or the v3 layout. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/*
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * If all is ok, we return the new size, on error return < 0.
 */
int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
{
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct user_namespace *task_ns = current_user_ns();
        const struct vfs_cap_data *in = *ivalue;
        const size_t outsize = sizeof(struct vfs_ns_cap_data);
        struct vfs_ns_cap_data *out;
        __u32 out_magic = VFS_CAP_REVISION_3;
        kuid_t rootid;
        uid_t fs_rootid;

        if (!in)
                return -EINVAL;
        if (!validheader(size, in))
                return -EINVAL;
        if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
                return -EPERM;

        /*
         * A v2-sized value from a writer holding CAP_SETFCAP over the
         * filesystem's user namespace is written unchanged.
         */
        if (size == XATTR_CAPS_SZ_2 && ns_capable(fs_ns, CAP_SETFCAP))
                return size;

        /* Interpret the rootid in the writer's ns, then map it into fs_ns. */
        rootid = rootid_from_xattr(*ivalue, size, task_ns);
        if (!uid_valid(rootid))
                return -EINVAL;

        fs_rootid = from_kuid(fs_ns, rootid);
        if (fs_rootid == -1)
                return -EINVAL;

        out = kmalloc(outsize, GFP_ATOMIC);
        if (!out)
                return -ENOMEM;

        /* Build the v3 value, carrying the effective flag across. */
        if (le32_to_cpu(in->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                out_magic |= VFS_CAP_FLAGS_EFFECTIVE;
        out->magic_etc = cpu_to_le32(out_magic);
        out->rootid = cpu_to_le32(fs_rootid);
        memcpy(&out->data, &in->data, sizeof(__le32) * 2 * VFS_CAP_U32);

        /* The caller's buffer is freed and replaced by the new one. */
        kvfree(*ivalue);
        *ivalue = out;
        return outsize;
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *new = bprm->cred;
        bool missing_forced = false;
        unsigned i;

        /* These outputs are only ever raised here, never cleared. */
        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        CAP_FOR_EACH_U32(i) {
                __u32 fP = caps->permitted.cap[i];
                __u32 fI = caps->inheritable.cap[i];
                /*
                 * pP' = (X & fP) | (pI & fI)
                 * The addition of pA' is handled later.
                 */
                __u32 pP = (new->cap_bset.cap[i] & fP) |
                           (new->cap_inheritable.cap[i] & fI);

                new->cap_permitted.cap[i] = pP;

                /* Some file-permitted bit did not make it into pP'. */
                if (fP & ~pP)
                        missing_forced = true;
        }

        /*
         * Missing "forced" (file-permitted) capabilities are an error only
         * for legacy apps, i.e. when the effective flag is set; such apps
         * cannot recognize the shortfall themselves.
         */
        if (!*effective)
                return 0;

        return missing_forced ? -EPERM : 0;
}

/*
 * Extract the on-exec-apply capability sets for an executable file.
 */
int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        unsigned tocopy, i;
        int size;
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        rootkuid = make_kuid(fs_ns, 0);
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_1;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_2;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_3;
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                return -EINVAL;
        }
        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootkuid))
                return -ENODATA;

        CAP_FOR_EACH_U32(i) {
                if (i >= tocopy)
                        break;
                cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
                cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
        }

        cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

        cpu_caps->rootid = rootkuid;

        return 0;
}

/*
 * Attempt to get the on-exec apply capability sets for an executable file from
 * its xattrs and, if present, apply them to the proposed credentials being
 * constructed by execve().
 */
static int get_file_caps(struct linux_binprm *bprm, struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int err;

        cap_clear(bprm->cred->cap_permitted);

        /*
         * No file caps when they are disabled or the mount may not suid.
         * The user-namespace test is redundant with mnt_may_suid() but is
         * kept to make explicit that capability bits are limited to
         * s_user_ns and its descendants.
         */
        if (!file_caps_enabled ||
            !mnt_may_suid(file->f_path.mnt) ||
            !current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        err = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);

        /* -ENODATA is not an error for exec: there is nothing to apply. */
        if (err == -ENODATA)
                return 0;

        if (err == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);

        if (err >= 0)
                err = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);

        /* Any failure leaves the proposed permitted set empty. */
        if (err)
                cap_clear(bprm->cred->cap_permitted);

        return err;
}

/* True unless the SECURE_NOROOT securebit is set. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* True when the real uid in @cred equals @uid. */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* True when the effective uid in @cred equals @uid. */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* True when @uid is the effective uid of @cred but not its real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        if (__is_real(uid, cred))
                return false;

        return __is_eff(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Do we have effective root privilege?
 * @root_uid: This namespace's root UID with respect to the initial user namespace
 *
 * Handle the case where root is privileged and hasn't been neutered by
 * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
 * set UID root and nothing is changed.  If we are root, cap_permitted is
 * updated.  If we have become set UID root, the effective bit is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool eff_root;

        /* Nothing to do when SECURE_NOROOT has neutered root. */
        if (!root_privileged())
                return;

        /*
         * With a legacy file capability present, a setuid-root binary run
         * by a non-root user gets no extra privilege.  A root user still
         * does, to cause least surprise to an admin.
         */
        if (has_fcap && __is_suid(root_uid, new)) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        eff_root = __is_eff(root_uid, new);

        /*
         * Compatibility mode: for root (real or effective) the file's
         * capability sets are overridden so that root permissions and
         * suid-root executables keep working.
         */
        if (eff_root || __is_real(root_uid, new)) {
                /* pP' = (cap_bset & ~0) | (pI & ~0) */
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);
        }

        /* The effective bit is not set when only the real uid is root. */
        if (eff_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers, all built on cap_issubset().  The
 * field/target/source names are pasted onto "cap_" to select the member.
 *
 * __cap_gained(field, target, source): @target's cap_<field> is not a
 *      subset of @source's cap_<field>.
 * __cap_grew(target, source, cred): within @cred, cap_<target> is not a
 *      subset of cap_<source>.
 * __cap_full(field, cred): CAP_FULL_SET is a subset of @cred's cap_<field>.
 *
 * Pointer arguments and the whole expansion are parenthesized so the macros
 * remain correct when given an expression such as &obj and when used inside
 * a larger expression.
 */
#define __cap_gained(field, target, source) \
        (!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
        (!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
        (cap_issubset(CAP_FULL_SET, (cred)->cap_##field))

/* True when the proposed effective uid differs from the old real uid. */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* True when the proposed effective gid differs from the old real gid. */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * 1) Audit candidate if current->cap_effective is set
 *
 * We do not bother to audit if 3 things are true:
 *   1) cap_effective has all caps
 *   2) we became root *OR* were already root
 *   3) root is supposed to have all caps (SECURE_NOROOT is not set)
 * Since this is just a normal root execing a process.
 *
 * Number 1 above might fail if you don't have a full bset, but I think
 * that is interesting information to audit.
 *
 * A number of other conditions require logging:
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        /*
         * pE holds something pA does not, and this is not simply privileged
         * root (real or effective) ending up with a full pE.
         */
        if (__cap_grew(effective, ambient, new) &&
            !(__cap_full(effective, new) &&
              (__is_eff(root, new) || __is_real(root, new)) &&
              root_privileged()))
                return true;

        /* Something prevented setuid root from getting all caps. */
        if (root_privileged() &&
            __is_suid(root, new) &&
            !__cap_full(effective, new))
                return true;

        /* The remaining conditions apply to non-setuid execs only. */
        if (__is_setuid(new, old))
                return false;

        /* Non-setuid exec gained permitted caps from file capabilities. */
        if (has_fcap && __cap_gained(permitted, new, old))
                return true;

        /* Non-setuid exec gained ambient caps. */
        return __cap_gained(ambient, new, old);
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.  Returns 0 if successful, -ve on error.
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        /* Refuse to start from credentials that already break the ambient invariant. */
        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* Seed new->cap_permitted from the file; may raise effective/has_fcap. */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        /* uid 0 as seen from the new credentials' user namespace */
        root_uid = make_kuid(new->user_ns, 0);

        /* Apply the legacy root rules; may override pP' and raise effective. */
        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        /*
         * The downgrade applies when ids changed or pP grew AND either some
         * unsafe flag other than LSM_UNSAFE_PTRACE is set or
         * ptracer_capable() fails for the new user namespace.
         */
        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        /* revert the effective ids to the real ids */
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem ids follow the (possibly reverted) effective ids. */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        /* (is_setid was computed before the downgrade above and is not recomputed) */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Log the transition when nonroot_raised_pE() says so; a logging failure aborts. */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* SECURE_KEEP_CAPS is cleared in the new credentials. */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        /*
         * secureexec is set for any set[ug]id exec, or when the real uid is
         * not root and either the effective flag is set or pP exceeds pA.
         */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Determine whether an xattr may be altered or set on an inode, returning 0 if
 * permission is granted, -ve if denied.
 *
 * This is used to make sure security xattrs don't get updated or set by those
 * who aren't privileged to do so.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;

        /* Only names carrying the security prefix are policed here. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /*
         * XATTR_NAME_CAPS is checked in cap_convert_nscap(), called by
         * setxattr(), so it is let through here.
         */
        if (!strcmp(name, XATTR_NAME_CAPS))
                return 0;

        /* Every other security xattr needs CAP_SYS_ADMIN over s_user_ns. */
        return ns_capable(user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 *
 * Determine whether an xattr may be removed from an inode, returning 0 if
 * permission is granted, -ve if denied.
 *
 * This is used to make sure security xattrs don't get removed by those who
 * aren't privileged to remove them.
 */
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Only names carrying the security prefix are policed here. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /* Anything but security.capability needs CAP_SYS_ADMIN over s_user_ns. */
        if (strcmp(name, XATTR_NAME_CAPS))
                return ns_capable(user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /*
         * security.capability gets namespaced: it is checked with
         * capable_wrt_inode_uidgid() against the backing inode.
         */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(inode, CAP_SETFCAP) ? 0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);

        if ((uid_eq(old->uid, root_uid) ||
             uid_eq(old->euid, root_uid) ||
             uid_eq(old->suid, root_uid)) &&
            (!uid_eq(new->uid, root_uid) &&
             !uid_eq(new->euid, root_uid) &&
             !uid_eq(new->suid, root_uid))) {
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  We should make sure that
                 * this remains the case.
                 */
                cap_clear(new->cap_ambient);
        }
        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
        if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Adjust capabilities to follow a set*uid() style call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Which kind of ID change was made (one of the LSM_SETID_* values)
 *
 * Runs before the credential changes are actually applied.  Unless
 * SECURE_NO_SETUID_FIXUP is set, the capability sets in @new are juggled to
 * follow the UID transition.  Returns 0 to grant the changes, or -EINVAL when
 * @flags is not one of the values handled here.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        kuid_t root_uid;
        int fs_was_root, fs_is_root;

        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /* [RES]UID change: follow it unless fixups are suppressed */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        break;
                cap_emulate_setxuid(new, old);
                break;

        case LSM_SETID_FS:
                /*
                 * FSUID change: follow it unless fixups are suppressed.
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        break;

                root_uid = make_kuid(old->user_ns, 0);
                fs_was_root = uid_eq(old->fsuid, root_uid);
                fs_is_root = uid_eq(new->fsuid, root_uid);

                if (fs_was_root && !fs_is_root) {
                        /* fsuid left root: drop the fs capabilities */
                        new->cap_effective =
                                cap_drop_fs_set(new->cap_effective);
                } else if (!fs_was_root && fs_is_root) {
                        /* fsuid became root: raise those that are permitted */
                        new->cap_effective =
                                cap_raise_fs_set(new->cap_effective,
                                                 new->cap_permitted);
                }
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Common permission check behind the scheduler, I/O priority and nice hooks.
 *
 * Callers of task_setscheduler, task_setioprio and task_setnice assume that
 *   . a task that is capable(cap_sys_nice) may perform the action, and
 *   . a task without it may still act on its own processes.
 * That is no longer sufficient, because code can run without suid and yet
 * with increased caps.  So the target's capabilities are checked as well.
 *
 * Returns 0 when @p's permitted set is a subset of the caller's permitted
 * set, or when the caller has CAP_SYS_NICE in @p's user namespace;
 * -EPERM otherwise.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int ret = -EPERM;

        rcu_read_lock();
        /* ns_capable() is only consulted when the subset test fails. */
        if (cap_issubset(__task_cred(p)->cap_permitted,
                         current_cred()->cap_permitted) ||
            ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                ret = 0;
        rcu_read_unlock();

        return ret;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined by this function)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined by this function)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
 * the current task's bounding set.  Returns 0 on success, -ve on error.
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *new;

        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;
        if (!cap_valid(cap))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        cap_lower(new->cap_bset, cap);
        return commit_creds(new);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2, @arg3, @arg4, @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Every option that modifies state does so on a copy obtained from
 * prepare_creds() and installs it with commit_creds(); @old is only read.
 *
 * Returns 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* arg2 names a capability; report 1 if it is in the bounding set */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                    || (cap_capable(current_cred(),
                                    current_cred()->user_ns,
                                    CAP_SETPCAP,
                                    CAP_OPT_NONE) != 0)                        /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         * [4] doing anything requires privilege (go read about
                         *     the "sendmail capabilities bug")
                         */
                    )
                        /* any one of conditions [1]-[4] refuses the request */
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                /* the raw securebits word is the (non-negative) return value */
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                /* arg2 is a boolean: only 0 and 1 are accepted */
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* arg2 selects the ambient-set sub-operation */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        /* CLEAR_ALL takes no further arguments */
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /*
                 * Every remaining sub-operation names a capability in arg3
                 * and requires arg4 and arg5 to be zero.
                 */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising is refused unless the capability is in both
                         * the permitted and inheritable sets and
                         * SECURE_NO_CAP_AMBIENT_RAISE is clear.  Lowering has
                         * no such precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made
 * @pages: The size of the mapping
 *
 * Reports whether the current task holds CAP_SYS_ADMIN in the initial user
 * namespace.  The check is made with CAP_OPT_NOAUDIT; neither @mm nor @pages
 * is examined here.  Returns 1 if the capability is held, 0 if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        /* cap_capable() returns 0 when the capability is held */
        return cap_capable(current_cred(), &init_user_ns,
                           CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0;
}

/*
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * A mapping at or above dac_mmap_min_addr is always allowed.  Below that
 * limit the process needs CAP_SYS_RAWIO in the initial user namespace.
 * Returns 0 if the mapping should be allowed, otherwise the error returned
 * by cap_capable().
 */
int cap_mmap_addr(unsigned long addr)
{
        int err;

        if (addr >= dac_mmap_min_addr)
                return 0;

        err = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);
        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (!err)
                current->flags |= PF_SUPERPRIV;

        return err;
}

/*
 * cap_mmap_file - mmap_file hook of the capability module
 *
 * The capability module imposes no restriction at this hook: none of the
 * arguments is examined and 0 is always returned.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
                  unsigned long prot, unsigned long flags)
{
        return 0;
}

#ifdef CONFIG_SECURITY

/*
 * Table pairing each LSM hook with the capability module's implementation
 * of it.  capability_init() hands the whole table to security_add_hooks().
 */
static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/*
 * Register every entry of capability_hooks under the module name
 * "capability".  Always returns 0.
 */
static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                                "capability");
        return 0;
}

/*
 * LSM descriptor for the capability module: it requests LSM_ORDER_FIRST
 * ordering and names capability_init() as its initialisation callback.
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */


















































































































































































































    1 
    1 






















    1 
    1 
    1 



    1 































    1 

    1 





    1 



















































































    1 





























































































































    1 
    1 








































































































    1 














    1 













    1 



    1 



    1 





    1 






























































































































































































































































































































































































































































































































































































































































    1 

    1 



    1 










































































































    1 





    1 


    1 








    1 





    1 























    1 








    1 



    1 

















































































































    1 





    1 



    1 








    1 








    1 














    1 
















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>                /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"

/* Forward declaration; the definition is further down in this file. */
static int thaw_super_locked(struct super_block *sb);

/* NOTE(review): presumably the list superblocks join via s_list - confirm. */
static LIST_HEAD(super_blocks);
/* Must be held when calling __put_super(); put_super() takes it. */
static DEFINE_SPINLOCK(sb_lock);

/*
 * Names for the per-freeze-level percpu rwsems in s_writers, indexed by
 * freeze level; alloc_super() passes them to __percpu_init_rwsem().
 */
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
        "sb_pagefaults",
        "sb_internal",
};

/*
 * Scan callback of the per-superblock shrinker.
 *
 * With a per-sb shrinker we must never drop the last active reference to
 * the superblock from inside the shrinker: that could unregister the
 * shrinker from within the shrinker path and deadlock on the
 * shrinker_rwsem.  Hence only a passive reference is taken here.
 *
 * Returns the number of objects freed, or SHRINK_STOP when no scan could
 * be attempted.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        struct super_block *sb = container_of(shrink, struct super_block,
                                              s_shrink);
        long        nr_fs = 0;
        long        nr_dentries;
        long        nr_inodes;
        long        nr_total;
        long        nr_freed;

        /*
         * Deadlock avoidance.  We may hold various FS locks, and we don't
         * want to recurse into the FS that called us in clear_inode() and
         * friends.  The superblock is only tried once that test has passed.
         */
        if (!(sc->gfp_mask & __GFP_FS) || !trylock_super(sb))
                return SHRINK_STOP;

        if (sb->s_op->nr_cached_objects)
                nr_fs = sb->s_op->nr_cached_objects(sb, sc);

        nr_inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
        nr_dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);

        /* the total is used as a divisor below, so it must not be zero */
        nr_total = nr_dentries + nr_inodes + nr_fs + 1;
        if (nr_total == 0)
                nr_total = 1;

        /* split the requested scan count between the caches in proportion */
        nr_dentries = mult_frac(sc->nr_to_scan, nr_dentries, nr_total);
        nr_inodes = mult_frac(sc->nr_to_scan, nr_inodes, nr_total);
        nr_fs = mult_frac(sc->nr_to_scan, nr_fs, nr_total);

        /*
         * The dcache goes first because it pins the icache; the icache
         * follows, then the filesystem specific caches.
         *
         * At least one object is always scanned - memcg kmem accounting
         * relies on this to fully empty the caches.
         */
        sc->nr_to_scan = nr_dentries + 1;
        nr_freed = prune_dcache_sb(sb, sc);

        sc->nr_to_scan = nr_inodes + 1;
        nr_freed += prune_icache_sb(sb, sc);

        if (nr_fs != 0) {
                sc->nr_to_scan = nr_fs + 1;
                nr_freed += sb->s_op->free_cached_objects(sb, sc);
        }

        /*
         * NOTE(review): s_umount is presumably read-locked by the successful
         * trylock_super() above (defined outside this view) - confirm.
         */
        up_read(&sb->s_umount);
        return nr_freed;
}

/*
 * Count callback of the per-superblock shrinker.
 *
 * Returns 0 while the superblock is not yet SB_BORN, SHRINK_EMPTY when
 * there is nothing to reclaim, and otherwise the object count scaled by
 * vfs_pressure_ratio().
 */
static unsigned long super_cache_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct super_block *sb = container_of(shrink, struct super_block,
                                              s_shrink);
        long        nr_objects = 0;

        /*
         * trylock_super() is deliberately not called here: it is a
         * scalability bottleneck.  That exposes us to partial setup state.
         * The shrinker rwsem does not protect the filesystem operations
         * backing list_lru_shrink_count() or s_op->nr_cached_objects(), and
         * the counts can change between super_cache_count and
         * super_cache_scan anyway, so locks are not really needed.
         *
         * A superblock that is still being mounted may be only partially
         * constructed, though, and is dangerous to access.  trylock_super()
         * guards against that with a SB_BORN check, so the same check is
         * made here.  No locks are held, so the memory barrier below is
         * matched with the one in mount_fs().
         */
        if (!(sb->s_flags & SB_BORN))
                return 0;
        smp_rmb();

        if (sb->s_op && sb->s_op->nr_cached_objects)
                nr_objects = sb->s_op->nr_cached_objects(sb, sc);

        nr_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
        nr_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

        if (nr_objects == 0)
                return SHRINK_EMPTY;

        return vfs_pressure_ratio(nr_objects);
}

/*
 * Final stage of superblock destruction: free the percpu rwsem of every
 * freeze level, then the super_block that embeds @work.
 */
static void destroy_super_work(struct work_struct *work)
{
        struct super_block *sb;
        int level;

        sb = container_of(work, struct super_block, destroy_work);

        for (level = 0; level < SB_FREEZE_LEVELS; level++)
                percpu_free_rwsem(&sb->s_writers.rw_sem[level]);

        kfree(sb);
}

/*
 * RCU callback queued by __put_super(): hand the rest of the teardown to
 * destroy_super_work() through a work item.
 */
static void destroy_super_rcu(struct rcu_head *head)
{
        struct super_block *sb;

        sb = container_of(head, struct super_block, rcu);
        INIT_WORK(&sb->destroy_work, destroy_super_work);
        schedule_work(&sb->destroy_work);
}

/*
 * Free a superblock that has never been seen by anyone.
 *
 * @s may be NULL, in which case nothing is done.  Otherwise s_umount is
 * released with up_write() first (alloc_super() write-locks it before any
 * failure path can get here), and the resources alloc_super() set up are
 * torn down.
 */
static void destroy_unused_super(struct super_block *s)
{
        if (!s)
                return;
        up_write(&s->s_umount);
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);
        security_sb_free(s);
        put_user_ns(s->s_user_ns);
        kfree(s->s_subtype);
        free_prealloced_shrinker(&s->s_shrink);
        /* no delays needed */
        /* called directly, not via call_rcu() and a work item as in __put_super() */
        destroy_super_work(&s->destroy_work);
}

/**
 *        alloc_super        -        create new superblock
 *        @type:        filesystem type superblock should belong to
 *        @flags: the mount flags
 *        @user_ns: User namespace for the super_block
 *
 *        Allocates and initializes a new &struct super_block.  alloc_super()
 *        returns a pointer to the new superblock or %NULL if allocation had
 *        failed.  On success the superblock is returned with s_umount held
 *        for writing, s_count set to 1 and s_active set to 1.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
        struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
        /* all-zero operations table installed as the initial s_op */
        static const struct super_operations default_op;
        int i;

        if (!s)
                return NULL;

        INIT_LIST_HEAD(&s->s_mounts);
        s->s_user_ns = get_user_ns(user_ns);
        init_rwsem(&s->s_umount);
        lockdep_set_class(&s->s_umount, &type->s_umount_key);
        /*
         * sget() can have s_umount recursion.
         *
         * When it cannot find a suitable sb, it allocates a new
         * one (this one), and tries again to find a suitable old
         * one.
         *
         * In case that succeeds, it will acquire the s_umount
         * lock of the old one. Since these are clearly distinct
         * locks, and this object isn't exposed yet, there's no
         * risk of deadlocks.
         *
         * Annotate this by putting this lock in a different
         * subclass.
         */
        down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);

        /*
         * From here on every failure goes through the "fail" label, whose
         * destroy_unused_super() call releases s_umount again.
         */
        if (security_sb_alloc(s))
                goto fail;

        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
                if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                                        sb_writers_name[i],
                                        &type->s_writers_key[i]))
                        goto fail;
        }
        init_waitqueue_head(&s->s_writers.wait_unfrozen);
        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
        /* superblocks outside the initial user namespace get SB_I_NODEV */
        if (s->s_user_ns != &init_user_ns)
                s->s_iflags |= SB_I_NODEV;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_roots);
        mutex_init(&s->s_sync_lock);
        INIT_LIST_HEAD(&s->s_inodes);
        spin_lock_init(&s->s_inode_list_lock);
        INIT_LIST_HEAD(&s->s_inodes_wb);
        spin_lock_init(&s->s_inode_wblist_lock);

        s->s_count = 1;
        atomic_set(&s->s_active, 1);
        mutex_init(&s->s_vfs_rename_mutex);
        lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
        init_rwsem(&s->s_dquot.dqio_sem);
        s->s_maxbytes = MAX_NON_LFS;
        s->s_op = &default_op;
        s->s_time_gran = 1000000000;
        s->s_time_min = TIME64_MIN;
        s->s_time_max = TIME64_MAX;
        s->cleancache_poolid = CLEANCACHE_NO_POOL;

        /* per-superblock shrinker; it is only preallocated here */
        s->s_shrink.seeks = DEFAULT_SEEKS;
        s->s_shrink.scan_objects = super_cache_scan;
        s->s_shrink.count_objects = super_cache_count;
        s->s_shrink.batch = 1024;
        s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
        if (prealloc_shrinker(&s->s_shrink))
                goto fail;
        if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
                goto fail;
        if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
                goto fail;
        return s;

fail:
        destroy_unused_super(s);
        return NULL;
}

/* Superblock refcounting  */

/*
 * Drop one reference from a superblock's s_count.  The caller must hold
 * sb_lock.
 *
 * When the count reaches zero the superblock is unlinked from its list and
 * released; the final free is deferred through call_rcu().
 */
static void __put_super(struct super_block *s)
{
        if (--s->s_count)
                return;

        list_del_init(&s->s_list);

        /* warn if either lru or the mount list is still populated */
        WARN_ON(s->s_dentry_lru.node);
        WARN_ON(s->s_inode_lru.node);
        WARN_ON(!list_empty(&s->s_mounts));

        security_sb_free(s);
        fscrypt_destroy_keyring(s);
        put_user_ns(s->s_user_ns);
        kfree(s->s_subtype);

        call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 *        put_super        -        drop a temporary reference to superblock
 *        @sb: superblock in question
 *
 *        Drops a temporary (->s_count) reference and frees the superblock if
 *        there are no references left.  This is the self-locking wrapper
 *        around __put_super(): it takes sb_lock for the duration of the drop.
 */
static void put_super(struct super_block *sb)
{
        spin_lock(&sb_lock);
        __put_super(sb);
        spin_unlock(&sb_lock);
}


/**
 *        deactivate_locked_super        -        drop an active reference to superblock
 *        @s: superblock to deactivate
 *
 *        Drops an active reference to superblock, converting it into a temporary
 *        one if there are no other active references left.  In that case we
 *        tell the fs driver to shut it down and drop the temporary reference we
 *        had just acquired.
 *
 *        Caller holds exclusive lock on superblock; that lock is released.
 */
void deactivate_locked_super(struct super_block *s)
{
        struct file_system_type *fs = s->s_type;
        if (atomic_dec_and_test(&s->s_active)) {
                /* That was the last active reference: shut the fs down. */
                cleancache_invalidate_fs(s);
                unregister_shrinker(&s->s_shrink);
                /*
                 * s_umount is not released in this branch directly; the
                 * ->kill_sb() implementation is expected to do it (the common
                 * helper generic_shutdown_super() calls up_write() on it).
                 */
                fs->kill_sb(s);

                /*
                 * Since list_lru_destroy() may sleep, we cannot call it from
                 * put_super(), where we hold the sb_lock. Therefore we destroy
                 * the lru lists right now.
                 */
                list_lru_destroy(&s->s_dentry_lru);
                list_lru_destroy(&s->s_inode_lru);

                /* fs was sampled before ->kill_sb(); drop type, then sb. */
                put_filesystem(fs);
                put_super(s);
        } else {
                /* Other active references remain: only unlock. */
                up_write(&s->s_umount);
        }
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 *        deactivate_super        -        drop an active reference to superblock
 *        @s: superblock to deactivate
 *
 *        Unlocked counterpart of deactivate_locked_super().  As long as ours
 *        is not the final active reference it is simply dropped.  Otherwise
 *        s_umount is taken exclusively and the final drop is delegated to
 *        deactivate_locked_super().
 */
void deactivate_super(struct super_block *s)
{
        /* Fast path: decrement ->s_active unless it currently equals 1. */
        if (atomic_add_unless(&s->s_active, -1, 1))
                return;

        /* We may hold the last active reference: do the locked drop. */
        down_write(&s->s_umount);
        deactivate_locked_super(s);
}

EXPORT_SYMBOL(deactivate_super);

/**
 *        grab_super - acquire an active reference
 *        @s: reference we are trying to make active
 *
 *        Called with sb_lock held, right after @s has been found on
 *        super_blocks or fs_type->fs_supers; sb_lock is dropped here.  A
 *        temporary ->s_count reference pins @s while we sleep on s_umount.
 *
 *        Returns 1 with s_umount held for write and ->s_active bumped when
 *        the superblock is born and still active.  Returns 0 with nothing
 *        held when the superblock was already dead or dying.  This is only
 *        used for superblocks still on ->fs_supers of their type, so bumping
 *        ->s_count is OK here.
 */
static int grab_super(struct super_block *s) __releases(sb_lock)
{
        int grabbed;

        s->s_count++;
        spin_unlock(&sb_lock);

        down_write(&s->s_umount);
        grabbed = (s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active);
        if (!grabbed)
                up_write(&s->s_umount);

        /* The temporary reference is no longer needed either way. */
        put_super(s);
        return grabbed;
}

/*
 *        trylock_super - try to grab ->s_umount shared
 *        @sb: reference we are trying to grab
 *
 *        Used where an active reference cannot be taken but the filesystem
 *        must not be shut down while the caller works on it.
 *
 *        Returns true with s_umount held for read when the lock could be
 *        taken without blocking and the superblock is still hashed on
 *        ->s_instances, has a root and is SB_BORN.  The caller must then
 *        drop s_umount when done.  Returns false, with nothing held, when
 *        the trylock fails or the filesystem is already in shutdown.
 *
 *        Unlike get_super() et al. this does *not* bump ->s_count.  That is
 *        only safe because a trylock is used instead of down_read(), so this
 *        is very much not a general-purpose interface.
 */
bool trylock_super(struct super_block *sb)
{
        if (!down_read_trylock(&sb->s_umount))
                return false;

        /* Lost the race against shutdown, or never fully set up. */
        if (hlist_unhashed(&sb->s_instances) || !sb->s_root ||
            !(sb->s_flags & SB_BORN)) {
                up_read(&sb->s_umount);
                return false;
        }

        return true;
}

/**
 *        generic_shutdown_super        -        common helper for ->kill_sb()
 *        @sb: superblock to kill
 *
 *        generic_shutdown_super() does all fs-independent work on superblock
 *        shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *        that need destruction out of superblock, call generic_shutdown_super()
 *        and release aforementioned objects.  Note: dentries and inodes _are_
 *        taken care of and do not need specific handling.
 *
 *        Upon calling this function, the filesystem may no longer alter or
 *        rearrange the set of dentries belonging to this super_block, nor may it
 *        change the attachments of dentries to inodes.
 *
 *        s_umount is released (up_write()) by this function, and a backing
 *        device other than noop_backing_dev_info is put and replaced by
 *        noop_backing_dev_info.
 */
void generic_shutdown_super(struct super_block *sb)
{
        const struct super_operations *sop = sb->s_op;

        /* The dentry/inode teardown only applies when a root was set up. */
        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
                sync_filesystem(sb);
                sb->s_flags &= ~SB_ACTIVE;

                cgroup_writeback_umount();

                /* evict all inodes with zero refcount */
                evict_inodes(sb);
                /* only nonzero refcount inodes can have marks */
                fsnotify_sb_delete(sb);
                fscrypt_destroy_keyring(sb);

                if (sb->s_dio_done_wq) {
                        destroy_workqueue(sb->s_dio_done_wq);
                        sb->s_dio_done_wq = NULL;
                }

                if (sop->put_super)
                        sop->put_super(sb);

                /* Anything still on s_inodes here survived evict_inodes(). */
                if (!list_empty(&sb->s_inodes)) {
                        printk("VFS: Busy inodes after unmount of %s. "
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
                }
        }
        spin_lock(&sb_lock);
        /*
         * Use hlist_del_init() so that hlist_unhashed(&sb->s_instances)
         * becomes true afterwards; the superblock list walkers test that to
         * skip superblocks that are being shut down.
         */
        hlist_del_init(&sb->s_instances);
        spin_unlock(&sb_lock);
        up_write(&sb->s_umount);
        if (sb->s_bdi != &noop_backing_dev_info) {
                /* Only a per-superblock bdi is unregistered here. */
                if (sb->s_iflags & SB_I_PERSB_BDI)
                        bdi_unregister(sb->s_bdi);
                bdi_put(sb->s_bdi);
                sb->s_bdi = &noop_backing_dev_info;
        }
}

EXPORT_SYMBOL(generic_shutdown_super);

/*
 * Check whether the current task may mount the filesystem described by @fc.
 *
 * Filesystem types flagged FS_USERNS_MOUNT only need CAP_SYS_ADMIN in the
 * context's user namespace; every other type needs it as checked by
 * capable().
 */
bool mount_capable(struct fs_context *fc)
{
        if (fc->fs_type->fs_flags & FS_USERNS_MOUNT)
                return ns_capable(fc->user_ns, CAP_SYS_ADMIN);

        return capable(CAP_SYS_ADMIN);
}

/**
 * sget_fc - Find or create a superblock
 * @fc:        Filesystem context.
 * @test: Comparison callback
 * @set: Setup callback
 *
 * Find or create a superblock using the parameters stored in the filesystem
 * context and the two callback functions.
 *
 * If an extant superblock is matched, then that will be returned with an
 * elevated reference count that the caller must transfer or discard.
 *
 * If no match is made, a new superblock will be allocated and basic
 * initialisation will be performed (s_type, s_fs_info and s_id will be set and
 * the set() callback will be invoked), the superblock will be published and it
 * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
 * as yet unset.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *))
{
        struct super_block *s = NULL;
        struct super_block *old;
        struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
        int err;

        /*
         * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is
         * not set, as the filesystem is likely unprepared to handle it.
         * This can happen when fsconfig() is called from init_user_ns with
         * an fs_fd opened in another user namespace.
         */
        if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
                errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed");
                return ERR_PTR(-EPERM);
        }

retry:
        /*
         * Look for a match under sb_lock.  A NULL @test means "never share",
         * so the search is skipped entirely.
         */
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
                        if (test(old, fc))
                                goto share_extant_sb;
                }
        }
        /*
         * No match.  Allocation happens with sb_lock dropped, so once we
         * have a candidate the search has to be repeated from the top.
         */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        /* Still under sb_lock: set up the new superblock and publish it. */
        s->s_fs_info = fc->s_fs_info;
        err = set(s, fc);
        if (err) {
                /* Detach s_fs_info again so it stays with the context. */
                s->s_fs_info = NULL;
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        /* s_fs_info now lives in the superblock, not in the context. */
        fc->s_fs_info = NULL;
        s->s_type = fc->fs_type;
        s->s_iflags |= fc->s_iflags;
        strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(s->s_type);
        register_shrinker_prepared(&s->s_shrink);
        return s;

share_extant_sb:
        /*
         * NOTE(review): s may still be NULL here when the match was found on
         * the first pass; destroy_unused_super() is presumably a no-op for
         * NULL -- confirm against its definition.
         */
        if (user_ns != old->s_user_ns) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(-EBUSY);
        }
        /* grab_super() drops sb_lock; on failure start over. */
        if (!grab_super(old))
                goto retry;
        destroy_unused_super(s);
        return old;
}
EXPORT_SYMBOL(sget_fc);

/**
 *        sget        -        find or create a superblock
 *        @type:          filesystem type superblock should belong to
 *        @test:          comparison callback; may be NULL to never share
 *        @set:          setup callback
 *        @flags:          mount flags
 *        @data:          argument passed to both @test and @set
 *
 *        Returns an existing superblock of @type accepted by @test (after
 *        grab_super() succeeded on it), or a newly allocated one that has
 *        been set up by @set and added to super_blocks and @type->fs_supers.
 *        On failure an ERR_PTR() is returned: -EBUSY if a match belongs to a
 *        different user namespace, -ENOMEM if allocation failed, or the
 *        error returned by @set.
 */
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags,
                        void *data)
{
        struct user_namespace *user_ns = current_user_ns();
        struct super_block *s = NULL;
        struct super_block *old;
        int err;

        /* We don't yet pass the user namespace of the parent
         * mount through to here so always use &init_user_ns
         * until that changes.
         */
        if (flags & SB_SUBMOUNT)
                user_ns = &init_user_ns;

retry:
        /* Search for a match under sb_lock. */
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &type->fs_supers, s_instances) {
                        if (!test(old, data))
                                continue;
                        /*
                         * NOTE(review): s may still be NULL on the paths
                         * below; destroy_unused_super() is presumably a
                         * no-op for NULL -- confirm.
                         */
                        if (user_ns != old->s_user_ns) {
                                spin_unlock(&sb_lock);
                                destroy_unused_super(s);
                                return ERR_PTR(-EBUSY);
                        }
                        /* grab_super() drops sb_lock; on failure rescan. */
                        if (!grab_super(old))
                                goto retry;
                        destroy_unused_super(s);
                        return old;
                }
        }
        /*
         * No match.  Allocate with sb_lock dropped and repeat the search.
         * SB_SUBMOUNT is masked out of the flags given to alloc_super().
         */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        /* Still under sb_lock: set up the new superblock and publish it. */
        err = set(s, data);
        if (err) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        s->s_type = type;
        strlcpy(s->s_id, type->name, sizeof(s->s_id));
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(type);
        register_shrinker_prepared(&s->s_shrink);
        return s;
}
EXPORT_SYMBOL(sget);

/*
 * Release a superblock whose s_umount is held for read: unlock s_umount
 * (shared) and drop one temporary ->s_count reference.
 */
void drop_super(struct super_block *sb)
{
        up_read(&sb->s_umount);
        put_super(sb);
}

EXPORT_SYMBOL(drop_super);

/*
 * Exclusive counterpart of drop_super(): unlock s_umount held for write and
 * drop one temporary ->s_count reference.
 */
void drop_super_exclusive(struct super_block *sb)
{
        up_write(&sb->s_umount);
        put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

/*
 * Call @f on every superblock on super_blocks that is still hashed on
 * ->s_instances.  Unlike iterate_supers(), no s_umount locking and no
 * liveness check is done here; @f is called without sb_lock held and has to
 * do its own locking.
 */
static void __iterate_supers(void (*f)(struct super_block *))
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                /* Pin sb so it survives while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                f(sb);

                spin_lock(&sb_lock);
                /*
                 * Drop the previously visited superblock only now; the
                 * reference on the current one is kept until the loop has
                 * moved on from it.
                 */
                if (p)
                        __put_super(p);
                p = sb;
        }
        /* Drop the reference on the last superblock visited. */
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}
/**
 *        iterate_supers - call function for all active superblocks
 *        @f: function to call
 *        @arg: argument to pass to it
 *
 *        Scans the superblock list and calls given function, passing it
 *        the superblock with s_umount held for read and the given argument.
 *        Only superblocks that have a root and are SB_BORN are passed to @f.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                /* Pin sb so it survives while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                down_read(&sb->s_umount);
                /* Recheck liveness now that s_umount is held. */
                if (sb->s_root && (sb->s_flags & SB_BORN))
                        f(sb, arg);
                up_read(&sb->s_umount);

                spin_lock(&sb_lock);
                /* Drop the previous superblock; keep the current one pinned. */
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

/**
 *        iterate_supers_type - call function for superblocks of given type
 *        @type: fs type
 *        @f: function to call
 *        @arg: argument to pass to it
 *
 *        Scans the ->fs_supers list of @type and calls given function, passing
 *        it the superblock with s_umount held for read and the given argument.
 *        Only superblocks that have a root and are SB_BORN are passed to @f.
 */
void iterate_supers_type(struct file_system_type *type,
        void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
                /* Pin sb so it survives while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                down_read(&sb->s_umount);
                /* Recheck liveness now that s_umount is held. */
                if (sb->s_root && (sb->s_flags & SB_BORN))
                        f(sb, arg);
                up_read(&sb->s_umount);

                spin_lock(&sb_lock);
                /* Drop the previous superblock; keep the current one pinned. */
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

/*
 * Find the live superblock whose ->s_bdev is @bdev.
 *
 * On success the superblock is returned with an extra ->s_count reference
 * and with s_umount held for write if @excl is true, for read otherwise.
 * Returns NULL if @bdev is NULL or no live superblock uses it.
 */
static struct super_block *__get_super(struct block_device *bdev, bool excl)
{
        struct super_block *sb;

        if (!bdev)
                return NULL;

        spin_lock(&sb_lock);
rescan:
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances) || sb->s_bdev != bdev)
                        continue;

                /* Pin the candidate, then sleep on s_umount without sb_lock. */
                sb->s_count++;
                spin_unlock(&sb_lock);
                if (excl)
                        down_write(&sb->s_umount);
                else
                        down_read(&sb->s_umount);

                /* Still alive? */
                if (sb->s_root && (sb->s_flags & SB_BORN))
                        return sb;

                /* Nope, it got unmounted: undo everything and start over. */
                if (excl)
                        up_write(&sb->s_umount);
                else
                        up_read(&sb->s_umount);
                spin_lock(&sb_lock);
                __put_super(sb);
                goto rescan;
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/**
 *        get_super - get the superblock of a device
 *        @bdev: device to get the superblock for
 *
 *        Scans the superblock list and finds the superblock of the file system
 *        mounted on the device given. %NULL is returned if no match is found.
 *
 *        A superblock is returned with s_umount held for read and an extra
 *        ->s_count reference; drop_super() releases both.
 */
struct super_block *get_super(struct block_device *bdev)
{
        return __get_super(bdev, false);
}
EXPORT_SYMBOL(get_super);

/*
 * Like __get_super(), but only return a superblock that is not frozen.
 *
 * If the superblock found is frozen, s_umount is dropped and we sleep until
 * ->s_writers.frozen becomes SB_UNFROZEN (the ->s_count reference keeps the
 * superblock around meanwhile), then the lookup is repeated from scratch.
 */
static struct super_block *__get_super_thawed(struct block_device *bdev,
                                              bool excl)
{
        for (;;) {
                struct super_block *s = __get_super(bdev, excl);

                /* Nothing found, or found and already thawed: done. */
                if (!s || s->s_writers.frozen == SB_UNFROZEN)
                        return s;

                if (excl)
                        up_write(&s->s_umount);
                else
                        up_read(&s->s_umount);

                wait_event(s->s_writers.wait_unfrozen,
                           s->s_writers.frozen == SB_UNFROZEN);
                put_super(s);
        }
}

/**
 *        get_super_thawed - get thawed superblock of a device
 *        @bdev: device to get the superblock for
 *
 *        Scans the superblock list and finds the superblock of the file system
 *        mounted on the device. The superblock is returned once it is thawed
 *        (or immediately if it was not frozen). %NULL is returned if no match
 *        is found.
 *
 *        A superblock is returned with s_umount held for read and an extra
 *        ->s_count reference; drop_super() releases both.
 */
struct super_block *get_super_thawed(struct block_device *bdev)
{
        return __get_super_thawed(bdev, false);
}
EXPORT_SYMBOL(get_super_thawed);

/**
 *        get_super_exclusive_thawed - get thawed superblock of a device
 *        @bdev: device to get the superblock for
 *
 *        Scans the superblock list and finds the superblock of the file system
 *        mounted on the device. The superblock is returned once it is thawed
 *        (or immediately if it was not frozen) and s_umount semaphore is held
 *        in exclusive mode. %NULL is returned if no match is found.
 *
 *        The returned superblock also carries an extra ->s_count reference;
 *        drop_super_exclusive() releases both the lock and the reference.
 */
struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
{
        return __get_super_thawed(bdev, true);
}
EXPORT_SYMBOL(get_super_exclusive_thawed);

/**
 * get_active_super - get an active reference to the superblock of a device
 * @bdev: device to get the superblock for
 *
 * Walks the superblock list looking for the file system mounted on @bdev
 * and tries to turn the match into an active reference with grab_super().
 *
 * Return: the superblock with an active reference and s_umount *not* held,
 * or %NULL if @bdev is NULL or no superblock uses it.
 */
struct super_block *get_active_super(struct block_device *bdev)
{
        struct super_block *sb;

        if (!bdev)
                return NULL;

restart:
        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances) || sb->s_bdev != bdev)
                        continue;

                /* grab_super() drops sb_lock whether or not it succeeds. */
                if (!grab_super(sb))
                        goto restart;

                /* Only the active reference is wanted, not the lock. */
                up_write(&sb->s_umount);
                return sb;
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/*
 * Find the live superblock whose ->s_dev equals @dev.
 *
 * On success the superblock is returned with s_umount held for read and an
 * extra ->s_count reference.  Returns NULL when no live superblock matches.
 */
struct super_block *user_get_super(dev_t dev)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
rescan:
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances) || sb->s_dev != dev)
                        continue;

                /* Pin the candidate, then sleep on s_umount without sb_lock. */
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_read(&sb->s_umount);

                /* Still alive? */
                if (sb->s_root && (sb->s_flags & SB_BORN))
                        return sb;

                /* Nope, it got unmounted: undo everything and start over. */
                up_read(&sb->s_umount);
                spin_lock(&sb_lock);
                __put_super(sb);
                goto rescan;
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/**
 * reconfigure_super - asks filesystem to change superblock parameters
 * @fc: The superblock and configuration
 *
 * Alters the configuration parameters of a live superblock.
 *
 * s_umount is dropped and retaken for writing around group_pin_kill(), so
 * the caller is expected to hold it for writing.
 *
 * Return: 0 on success (also when the superblock lost its root while
 * s_umount was dropped), -EINVAL if flags outside MS_RMT_MASK are to be
 * changed, -EBUSY if the superblock is frozen, -EACCES when asked to go
 * read-write on a read-only block device, or the error returned by
 * security_sb_remount(), sb_prepare_remount_readonly() or the
 * ->reconfigure() operation (the latter only for non-forced requests).
 */
int reconfigure_super(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        int retval;
        bool remount_ro = false;
        bool remount_rw = false;
        bool force = fc->sb_flags & SB_FORCE;

        if (fc->sb_flags_mask & ~MS_RMT_MASK)
                return -EINVAL;
        if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;

        retval = security_sb_remount(sb, fc->security);
        if (retval)
                return retval;

        if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
                /*
                 * Superblocks without a backing block device have a NULL
                 * s_bdev, so only consult the device when there is one.
                 */
                if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
                    bdev_read_only(sb->s_bdev))
                        return -EACCES;
#endif
                remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
                remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
        }

        if (remount_ro) {
                if (!hlist_empty(&sb->s_pins)) {
                        /*
                         * s_umount is dropped around group_pin_kill(), so
                         * everything checked so far has to be revalidated
                         * once the lock is held again.
                         */
                        up_write(&sb->s_umount);
                        group_pin_kill(&sb->s_pins);
                        down_write(&sb->s_umount);
                        if (!sb->s_root)
                                return 0;
                        if (sb->s_writers.frozen != SB_UNFROZEN)
                                return -EBUSY;
                        remount_ro = !sb_rdonly(sb);
                }
        }
        shrink_dcache_sb(sb);

        /* If we are reconfiguring to RDONLY and current sb is read/write,
         * make sure there are no files open for writing.
         */
        if (remount_ro) {
                if (force) {
                        sb->s_readonly_remount = 1;
                        smp_wmb();
                } else {
                        retval = sb_prepare_remount_readonly(sb);
                        if (retval)
                                return retval;
                }
        } else if (remount_rw) {
                /*
                 * We set s_readonly_remount here to protect filesystem's
                 * reconfigure code from writes from userspace until
                 * reconfigure finishes.
                 */
                sb->s_readonly_remount = 1;
                smp_wmb();
        }

        if (fc->ops->reconfigure) {
                retval = fc->ops->reconfigure(fc);
                if (retval) {
                        if (!force)
                                goto cancel_readonly;
                        /* If forced remount, go ahead despite any errors */
                        WARN(1, "forced remount of a %s fs returned %i\n",
                             sb->s_type->name, retval);
                }
        }

        /* Replace only the flag bits selected by sb_flags_mask. */
        WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
                                 (fc->sb_flags & fc->sb_flags_mask)));
        /* Needs to be ordered wrt mnt_is_readonly() */
        smp_wmb();
        sb->s_readonly_remount = 0;

        /*
         * Some filesystems modify their metadata via some other path than the
         * bdev buffer cache (eg. use a private mapping, or directories in
         * pagecache, etc). Also file data modifications go via their own
         * mappings. So if we try to mount readonly then copy the filesystem
         * from bdev, we could get stale data, so invalidate it to give a best
         * effort at coherency.
         */
        if (remount_ro && sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
        return 0;

cancel_readonly:
        /* ->reconfigure() failed: lift the transient remount marker. */
        sb->s_readonly_remount = 0;
        return retval;
}

/*
 * Per-superblock step of the emergency remount: under s_umount held for
 * write, force a read-only reconfigure of every live, block-device-backed
 * superblock that is not read-only yet.  Failures are deliberately ignored.
 */
static void do_emergency_remount_callback(struct super_block *sb)
{
        struct fs_context *fc;

        down_write(&sb->s_umount);

        /* Skip dead, bdev-less, unborn or already read-only superblocks. */
        if (!sb->s_root || !sb->s_bdev || !(sb->s_flags & SB_BORN) ||
            sb_rdonly(sb))
                goto out_unlock;

        fc = fs_context_for_reconfigure(sb->s_root,
                                        SB_RDONLY | SB_FORCE, SB_RDONLY);
        if (IS_ERR(fc))
                goto out_unlock;

        if (parse_monolithic_mount_data(fc, NULL) == 0)
                (void)reconfigure_super(fc);
        put_fs_context(fc);

out_unlock:
        up_write(&sb->s_umount);
}

/*
 * Work handler queued by emergency_remount(): run the remount callback on
 * every superblock, then free the work item that carried us here.
 */
static void do_emergency_remount(struct work_struct *work)
{
        __iterate_supers(do_emergency_remount_callback);
        /* @work was kmalloc()ed by emergency_remount(); we own it. */
        kfree(work);
        printk("Emergency Remount complete\n");
}

/*
 * Schedule a forced read-only remount of all filesystems from a work item.
 * The work item is allocated with GFP_ATOMIC; if that allocation fails the
 * request is silently dropped.
 */
void emergency_remount(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        /* do_emergency_remount() frees the work item when it has run. */
        INIT_WORK(work, do_emergency_remount);
        schedule_work(work);
}

/*
 * Per-superblock step of the emergency thaw.  Takes s_umount for write and,
 * for a live superblock, thaws its block device and then the superblock.
 */
static void do_thaw_all_callback(struct super_block *sb)
{
        down_write(&sb->s_umount);

        /* Dead or not yet born: nothing to thaw, just unlock. */
        if (!sb->s_root || !(sb->s_flags & SB_BORN)) {
                up_write(&sb->s_umount);
                return;
        }

        emergency_thaw_bdev(sb);
        /*
         * NOTE(review): s_umount is not released here, so
         * thaw_super_locked() is presumably responsible for dropping it --
         * confirm against its definition.
         */
        thaw_super_locked(sb);
}

/*
 * Work handler queued by emergency_thaw_all(): run the thaw callback on
 * every superblock, then free the work item that carried us here.
 */
static void do_thaw_all(struct work_struct *work)
{
        __iterate_supers(do_thaw_all_callback);
        /* @work was kmalloc()ed by emergency_thaw_all(); we own it. */
        kfree(work);
        printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq.  The actual
 * work is deferred to a work item allocated with GFP_ATOMIC; if that
 * allocation fails the request is silently dropped.
 */
void emergency_thaw_all(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        /* do_thaw_all() frees the work item when it has run. */
        INIT_WORK(work, do_thaw_all);
        schedule_work(work);
}

/*
 * ID allocator for the minor numbers of anonymous (major 0) devices handed
 * out by get_anon_bdev() and returned by free_anon_bdev().
 */
static DEFINE_IDA(unnamed_dev_ida);

/**
 * get_anon_bdev - Allocate a block device for filesystems which don't have one.
 * @p: Pointer to a dev_t.
 *
 * Filesystems that are not backed by a real block device use this to get a
 * virtual device number.  On success *@p is set to a device with major 0
 * and a freshly allocated minor.
 *
 * Context: Any context.  Frequently called while holding sb_lock.
 * Return: 0 on success, -EMFILE if there are no anonymous bdevs left
 * or -ENOMEM if memory allocation failed.
 */
int get_anon_bdev(dev_t *p)
{
        /*
         * Many userspace utilities consider an FSID of 0 invalid, so the
         * allocation range starts at 1.
         */
        int minor = ida_alloc_range(&unnamed_dev_ida, 1,
                                    (1 << MINORBITS) - 1, GFP_ATOMIC);

        /* Running out of IDs is reported to callers as -EMFILE. */
        if (minor < 0)
                return minor == -ENOSPC ? -EMFILE : minor;

        *p = MKDEV(0, minor);
        return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

/*
 * Return the minor number of anonymous device @dev to unnamed_dev_ida.
 * Counterpart of get_anon_bdev().
 */
void free_anon_bdev(dev_t dev)
{
        ida_free(&unnamed_dev_ida, MINOR(dev));
}
EXPORT_SYMBOL(free_anon_bdev);

/*
 * sget()-style "set" callback: give @s an anonymous device number in
 * ->s_dev.  @data is unused.  Returns the result of get_anon_bdev().
 */
int set_anon_super(struct super_block *s, void *data)
{
        return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);

/*
 * Shut down a superblock that uses an anonymous device: run the generic
 * shutdown, then release the anonymous device number.
 */
void kill_anon_super(struct super_block *sb)
{
        /* Sample s_dev before the shutdown; it is freed afterwards. */
        dev_t dev = sb->s_dev;
        generic_shutdown_super(sb);
        free_anon_bdev(dev);
}
EXPORT_SYMBOL(kill_anon_super);

/*
 * Like kill_anon_super(), but first calls d_genocide() on the root dentry
 * if the superblock has one.
 * NOTE(review): d_genocide() presumably drops the references pinning the
 * in-memory dentry tree -- confirm against its definition.
 */
void kill_litter_super(struct super_block *sb)
{
        if (sb->s_root)
                d_genocide(sb->s_root);
        kill_anon_super(sb);
}
EXPORT_SYMBOL(kill_litter_super);

/*
 * sget_fc()-style "set" callback: forwards to set_anon_super().  @fc is
 * unused.
 */
int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
{
        return set_anon_super(sb, NULL);
}
EXPORT_SYMBOL(set_anon_super_fc);

/*
 * sget_fc() "test" callback for keyed superblocks: @sb matches when its
 * ->s_fs_info pointer equals the key stored in fc->s_fs_info.
 */
static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
{
        return sb->s_fs_info == fc->s_fs_info;
}

/*
 * sget_fc() "test" callback for single-instance filesystems: every existing
 * superblock of the type matches.
 */
static int test_single_super(struct super_block *s, struct fs_context *fc)
{
        return 1;
}

/**
 * vfs_get_super - Get a superblock with a search key set in s_fs_info.
 * @fc: The filesystem context holding the parameters
 * @keying: How to distinguish superblocks
 * @fill_super: Helper to initialise a new superblock
 *
 * Search for a superblock and create a new one if not found.  The search
 * criterion is controlled by @keying.  If the search fails, a new superblock
 * is created and @fill_super() is called to initialise it.
 *
 * @keying can take one of a number of values:
 *
 * (1) vfs_get_single_super - Only one superblock of this type may exist on the
 *     system.  This is typically used for special system filesystems.
 *
 * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
 *     distinct keys (where the key is in s_fs_info).  Searching for the same
 *     key again will turn up the superblock for that key.
 *
 * (3) vfs_get_independent_super - Multiple superblocks may exist and are
 *     unkeyed.  Each call will get a new superblock.
 *
 * A permissions check is made by sget_fc() unless we're getting a superblock
 * for a kernel-internal mount or a submount.
 */
int vfs_get_super(struct fs_context *fc,
                  enum vfs_get_super_keying keying,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        int (*match)(struct super_block *, struct fs_context *);
        struct super_block *sb;
        int err;

        /* Pick the comparison callback that implements @keying. */
        switch (keying) {
        case vfs_get_single_super:
        case vfs_get_single_reconf_super:
                match = test_single_super;
                break;
        case vfs_get_keyed_super:
                match = test_keyed_super;
                break;
        case vfs_get_independent_super:
                match = NULL;
                break;
        default:
                BUG();
        }

        sb = sget_fc(fc, match, set_anon_super_fc);
        if (IS_ERR(sb))
                return PTR_ERR(sb);

        if (sb->s_root) {
                /* An already populated superblock was found: share its root. */
                fc->root = dget(sb->s_root);
                if (keying != vfs_get_single_reconf_super)
                        return 0;

                /* This keying asks for the existing sb to be reconfigured. */
                err = reconfigure_super(fc);
                if (err >= 0)
                        return 0;

                dput(fc->root);
                fc->root = NULL;
                goto error;
        }

        /* Fresh superblock: let the filesystem populate it. */
        err = fill_super(sb, fc);
        if (err)
                goto error;

        sb->s_flags |= SB_ACTIVE;
        fc->root = dget(sb->s_root);
        return 0;

error:
        /* Drops the active reference and releases s_umount. */
        deactivate_locked_super(sb);
        return err;
}
EXPORT_SYMBOL(vfs_get_super);

/*
 * Get a new, unshared superblock for @fc: vfs_get_super() with
 * vfs_get_independent_super keying, so every call creates a superblock.
 */
int get_tree_nodev(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        return vfs_get_super(fc, vfs_get_independent_super, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);

/*
 * Thin wrapper: calls vfs_get_super() with vfs_get_single_super keying and
 * hands back whatever it returns.
 */
int get_tree_single(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        int ret;

        ret = vfs_get_super(fc, vfs_get_single_super, fill_super);
        return ret;
}
EXPORT_SYMBOL(get_tree_single);

/*
 * Thin wrapper: calls vfs_get_super() with vfs_get_single_reconf_super
 * keying, which makes vfs_get_super() run reconfigure_super() when the
 * superblock it finds already has a root.
 */
int get_tree_single_reconf(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        int ret;

        ret = vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
        return ret;
}
EXPORT_SYMBOL(get_tree_single_reconf);

/*
 * Record @key in fc->s_fs_info, then call vfs_get_super() with
 * vfs_get_keyed_super keying and return its result.
 */
int get_tree_keyed(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc),
                void *key)
{
        int ret;

        fc->s_fs_info = key;
        ret = vfs_get_super(fc, vfs_get_keyed_super, fill_super);
        return ret;
}
EXPORT_SYMBOL(get_tree_keyed);

#ifdef CONFIG_BLOCK

static int set_bdev_super(struct super_block *s, void *data)
{
        s->s_bdev = data;
        s->s_dev = s->s_bdev->bd_dev;
        s->s_bdi = bdi_get(s->s_bdev->bd_bdi);

        if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
                s->s_iflags |= SB_I_STABLE_WRITES;
        return 0;
}

/* sget_fc() "set" callback: forwards fc->sget_key to set_bdev_super(). */
static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
        void *bdev = fc->sget_key;

        return set_bdev_super(s, bdev);
}

/*
 * sget_fc() "test" callback: non-zero when @s sits on the block device stored
 * in fc->sget_key.
 */
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
        const void *wanted = fc->sget_key;

        return wanted == s->s_bdev;
}

/**
 * get_tree_bdev - Get a superblock based on a single block device
 * @fc: The filesystem context holding the parameters
 * @fill_super: Helper to initialise a new superblock
 *
 * Opens the block device named by @fc->source (exclusively; read-write unless
 * SB_RDONLY is set in @fc->sb_flags), then looks up or creates a superblock
 * keyed on that device via sget_fc().  A superblock that has no root yet is
 * initialised with @fill_super; one that already has a root is reused as long
 * as its read-only state matches the request.
 *
 * Return: 0 with @fc->root set on success.  Otherwise the value of invalf()
 * when no source was given, the error from blkdev_get_by_path() or sget_fc(),
 * -EBUSY if the device is frozen or the RO/RW state would change, or the
 * error returned by @fill_super.
 */
int get_tree_bdev(struct fs_context *fc,
                int (*fill_super)(struct super_block *,
                                  struct fs_context *))
{
        struct block_device *bdev;
        struct super_block *s;
        fmode_t mode = FMODE_READ | FMODE_EXCL;
        int error = 0;

        /* Only ask for write access when the mount is not read-only. */
        if (!(fc->sb_flags & SB_RDONLY))
                mode |= FMODE_WRITE;

        if (!fc->source)
                return invalf(fc, "No source specified");

        bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
        if (IS_ERR(bdev)) {
                errorf(fc, "%s: Can't open blockdev", fc->source);
                return PTR_ERR(bdev);
        }

        /* Once the superblock is inserted into the list by sget_fc(), s_umount
         * will protect the lockfs code from trying to start a snapshot while
         * we are mounting
         */
        mutex_lock(&bdev->bd_fsfreeze_mutex);
        if (bdev->bd_fsfreeze_count > 0) {
                /* Device is frozen: refuse the mount and drop our open. */
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
                blkdev_put(bdev, mode);
                return -EBUSY;
        }

        fc->sb_flags |= SB_NOSEC;
        /* The block device is the key used by the test/set callbacks. */
        fc->sget_key = bdev;
        s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        if (IS_ERR(s)) {
                blkdev_put(bdev, mode);
                return PTR_ERR(s);
        }

        if (s->s_root) {
                /* Don't summarily change the RO/RW state. */
                if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
                        warnf(fc, "%pg: Can't mount, would change RO state", bdev);
                        deactivate_locked_super(s);
                        blkdev_put(bdev, mode);
                        return -EBUSY;
                }

                /*
                 * s_umount nests inside bd_mutex during
                 * __invalidate_device().  blkdev_put() acquires
                 * bd_mutex and can't be called under s_umount.  Drop
                 * s_umount temporarily.  This is safe as we're
                 * holding an active reference.
                 */
                up_write(&s->s_umount);
                blkdev_put(bdev, mode);
                down_write(&s->s_umount);
        } else {
                /* New superblock: record how the device was opened and fill it. */
                s->s_mode = mode;
                snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, fc);
                if (error) {
                        /*
                         * No blkdev_put() on this path; presumably the device
                         * is released as part of superblock teardown (see
                         * kill_block_super()) - confirm.
                         */
                        deactivate_locked_super(s);
                        return error;
                }

                s->s_flags |= SB_ACTIVE;
                bdev->bd_super = s;
        }

        /* fc->root must not have been set before this point. */
        BUG_ON(fc->root);
        fc->root = dget(s->s_root);
        return 0;
}
EXPORT_SYMBOL(get_tree_bdev);

/* sget() "test" callback: non-zero when @s sits on the block device @data. */
static int test_bdev_super(struct super_block *s, void *data)
{
        struct block_device *wanted = data;

        return s->s_bdev == wanted;
}

/*
 * Mount helper for filesystems that live on a single block device.
 *
 * Opens @dev_name exclusively (read-write unless SB_RDONLY is set in @flags)
 * and looks up or creates a superblock of type @fs_type keyed on that device.
 * A superblock without a root is initialised through @fill_super, which gets
 * @data and a "silent" flag derived from SB_SILENT.  A superblock that already
 * has a root is reused provided its read-only state matches @flags.
 *
 * Returns a referenced root dentry on success, or an ERR_PTR(): the error from
 * blkdev_get_by_path() or sget(), -EBUSY if the device is frozen or the RO/RW
 * state would change, or the error returned by @fill_super.
 */
struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct block_device *bdev;
        struct super_block *s;
        fmode_t mode = FMODE_READ | FMODE_EXCL;
        int error = 0;

        /* Only ask for write access when the mount is not read-only. */
        if (!(flags & SB_RDONLY))
                mode |= FMODE_WRITE;

        bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        /*
         * once the super is inserted into the list by sget, s_umount
         * will protect the lockfs code from trying to start a snapshot
         * while we are mounting
         */
        mutex_lock(&bdev->bd_fsfreeze_mutex);
        if (bdev->bd_fsfreeze_count > 0) {
                /* Device is frozen: refuse the mount. */
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                error = -EBUSY;
                goto error_bdev;
        }
        s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
                 bdev);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        if (IS_ERR(s))
                goto error_s;

        if (s->s_root) {
                /* Existing superblock: its RO/RW state must match the request. */
                if ((flags ^ s->s_flags) & SB_RDONLY) {
                        deactivate_locked_super(s);
                        error = -EBUSY;
                        goto error_bdev;
                }

                /*
                 * s_umount nests inside bd_mutex during
                 * __invalidate_device().  blkdev_put() acquires
                 * bd_mutex and can't be called under s_umount.  Drop
                 * s_umount temporarily.  This is safe as we're
                 * holding an active reference.
                 */
                up_write(&s->s_umount);
                blkdev_put(bdev, mode);
                down_write(&s->s_umount);
        } else {
                /* New superblock: record how the device was opened and fill it. */
                s->s_mode = mode;
                snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
                if (error) {
                        /*
                         * Jumps past blkdev_put(); presumably the device is
                         * released as part of superblock teardown (see
                         * kill_block_super()) - confirm.
                         */
                        deactivate_locked_super(s);
                        goto error;
                }

                s->s_flags |= SB_ACTIVE;
                bdev->bd_super = s;
        }

        return dget(s->s_root);

        /* Cleanup labels fall through: each adds to the work of the next. */
error_s:
        error = PTR_ERR(s);
error_bdev:
        blkdev_put(bdev, mode);
error:
        return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);

/*
 * Shut down a block-device backed superblock: detach it from its block
 * device, run generic_shutdown_super(), sync the device and finally drop the
 * open taken at mount time (using the mode saved in sb->s_mode).
 */
void kill_block_super(struct super_block *sb)
{
        /* Capture both values up front, before the superblock is shut down. */
        fmode_t open_mode = sb->s_mode;
        struct block_device *dev = sb->s_bdev;

        dev->bd_super = NULL;
        generic_shutdown_super(sb);
        sync_blockdev(dev);

        /* The saved mode is expected to include FMODE_EXCL. */
        WARN_ON_ONCE(!(open_mode & FMODE_EXCL));
        blkdev_put(dev, open_mode | FMODE_EXCL);
}

EXPORT_SYMBOL(kill_block_super);
#endif

/*
 * Mount helper for filesystems without a backing device.  sget() is called
 * with no test callback and the result is initialised through @fill_super,
 * which receives @data and a "silent" flag derived from SB_SILENT.
 *
 * Returns a referenced root dentry, or an ERR_PTR() carrying the error from
 * sget() or @fill_super.
 */
struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *sb;
        int silent = (flags & SB_SILENT) ? 1 : 0;
        int err;

        sb = sget(fs_type, NULL, set_anon_super, flags, NULL);
        if (IS_ERR(sb))
                return ERR_CAST(sb);

        err = fill_super(sb, data, silent);
        if (!err) {
                sb->s_flags |= SB_ACTIVE;
                return dget(sb->s_root);
        }

        deactivate_locked_super(sb);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(mount_nodev);

/*
 * Reconfigure an existing superblock from legacy mount data: build a
 * reconfiguration context for @s, parse @data into it and call
 * reconfigure_super().  The context is released before returning.
 *
 * Returns the error from fs_context_for_reconfigure(), a negative value from
 * parse_monolithic_mount_data(), or the result of reconfigure_super().
 */
int reconfigure_single(struct super_block *s,
                       int flags, void *data)
{
        struct fs_context *fc;
        int ret;

        /* The caller really needs to be passing fc down into mount_single(),
         * then a chunk of this can be removed.  [Bollocks -- AV]
         * Better yet, reconfiguration shouldn't happen, but rather the second
         * mount should be rejected if the parameters are not compatible.
         */
        fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /* Only reconfigure when the mount data parsed without error. */
        ret = parse_monolithic_mount_data(fc, data);
        if (ret >= 0)
                ret = reconfigure_super(fc);

        put_fs_context(fc);
        return ret;
}

/*
 * sget() "test" callback used by mount_single(): reports a match for every
 * superblock it is asked about, ignoring both arguments.
 */
static int compare_single(struct super_block *s, void *p)
{
        (void)s;
        (void)p;

        return 1;
}

/*
 * Mount helper built on sget() with compare_single() as the test callback.
 * A superblock without a root is initialised through @fill_super; one that
 * already has a root is reconfigured with reconfigure_single() instead.
 *
 * Returns a referenced root dentry, or an ERR_PTR() carrying the error from
 * sget(), @fill_super or reconfigure_single().
 */
struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *sb;
        int err;

        sb = sget(fs_type, compare_single, set_anon_super, flags, NULL);
        if (IS_ERR(sb))
                return ERR_CAST(sb);

        if (sb->s_root) {
                err = reconfigure_single(sb, flags, data);
        } else {
                err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
                if (!err)
                        sb->s_flags |= SB_ACTIVE;
        }

        if (unlikely(err)) {
                deactivate_locked_super(sb);
                return ERR_PTR(err);
        }

        return dget(sb->s_root);
}
EXPORT_SYMBOL(mount_single);

/**
 * vfs_get_tree - Get the mountable root
 * @fc: The superblock configuration context.
 *
 * The filesystem is invoked to get or create a superblock which can then later
 * be used for mounting.  The filesystem places a pointer to the root to be
 * used for mounting in @fc->root.
 *
 * Return: 0 on success; -EBUSY if @fc->root is already set; a negative error
 * from the filesystem's ->get_tree(); or the error returned by
 * security_sb_set_mnt_opts().
 */
int vfs_get_tree(struct fs_context *fc)
{
        struct super_block *sb;
        int error;

        /* A context that already holds a root cannot be used again. */
        if (fc->root)
                return -EBUSY;

        /* Get the mountable root in fc->root, with a ref on the root and a ref
         * on the superblock.
         */
        error = fc->ops->get_tree(fc);
        if (error < 0)
                return error;

        /* Success from ->get_tree() without a root is a filesystem bug. */
        if (!fc->root) {
                pr_err("Filesystem %s get_tree() didn't set fc->root\n",
                       fc->fs_type->name);
                /* We don't know what the locking state of the superblock is -
                 * if there is a superblock.
                 */
                BUG();
        }

        sb = fc->root->d_sb;
        /* Every superblock is expected to have a backing_dev_info by now. */
        WARN_ON(!sb->s_bdi);

        /*
         * Write barrier is for super_cache_count(). We place it before setting
         * SB_BORN as the data dependency between the two functions is the
         * superblock structure contents that we just set up, not the SB_BORN
         * flag.
         */
        smp_wmb();
        sb->s_flags |= SB_BORN;

        error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
        if (unlikely(error)) {
                /* Release what ->get_tree() handed us before failing. */
                fc_drop_locked(fc);
                return error;
        }

        /*
         * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
         * but s_maxbytes was an unsigned long long for many releases. Throw
         * this warning for a little while to try and catch filesystems that
         * violate this rule.
         */
        WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
                "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);

        return 0;
}
EXPORT_SYMBOL(vfs_get_tree);

/*
 * Set up a private BDI for the given superblock, registered under a name
 * built from the printf-style @fmt and its arguments.  It gets automatically
 * cleaned up in generic_shutdown_super().
 *
 * Returns 0 on success, -ENOMEM if the BDI cannot be allocated, or the error
 * from bdi_register_va().
 */
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
        struct backing_dev_info *new_bdi = bdi_alloc(NUMA_NO_NODE);
        va_list ap;
        int ret;

        if (!new_bdi)
                return -ENOMEM;

        va_start(ap, fmt);
        ret = bdi_register_va(new_bdi, fmt, ap);
        va_end(ap);

        if (!ret) {
                /* The superblock should still be on the no-op BDI here. */
                WARN_ON(sb->s_bdi != &noop_backing_dev_info);
                sb->s_bdi = new_bdi;
                sb->s_iflags |= SB_I_PERSB_BDI;
                return 0;
        }

        /* Registration failed: drop the BDI we allocated. */
        bdi_put(new_bdi);
        return ret;
}
EXPORT_SYMBOL(super_setup_bdi_name);

/*
 * Set up a private BDI for the given superblock.  The BDI name is built from
 * the filesystem type name (at most 28 characters) and a sequence number that
 * is incremented on every call.  It gets automatically cleaned up in
 * generic_shutdown_super().
 */
int super_setup_bdi(struct super_block *sb)
{
        static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
        long seq = atomic_long_inc_return(&bdi_seq);

        return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, seq);
}
EXPORT_SYMBOL(super_setup_bdi);

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * Takes the per-level percpu rw-semaphore for writing, which waits until
 * there are no writers of the given type to the given file system.  @level is
 * 1-based: it selects sb->s_writers.rw_sem[@level - 1].
 */
static void sb_wait_write(struct super_block *sb, int level)
{
        percpu_down_write(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 * The levels are released from the highest index down to 0.
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
        int level = SB_FREEZE_LEVELS;

        while (level-- > 0)
                percpu_rwsem_release(&sb->s_writers.rw_sem[level], 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 * The levels are acquired from index 0 upwards.
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
        int level = 0;

        while (level < SB_FREEZE_LEVELS) {
                percpu_rwsem_acquire(&sb->s_writers.rw_sem[level], 0, _THIS_IP_);
                level++;
        }
}

/*
 * Release the write side of the freeze semaphores with indices below @level,
 * going from index @level - 1 down to 0.
 */
static void sb_freeze_unlock(struct super_block *sb, int level)
{
        while (--level >= 0)
                percpu_up_write(&sb->s_writers.rw_sem[level]);
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 *
 * Return: 0 when the filesystem was frozen (or there was nothing to do),
 * -EBUSY if it was not in the SB_UNFROZEN state, or the error returned by
 * sync_filesystem() or ->freeze_fs().
 */
int freeze_super(struct super_block *sb)
{
        int ret;

        /*
         * Take an active reference; on the successful paths below it is still
         * held on return (thaw_super_locked() ends in
         * deactivate_locked_super()).
         */
        atomic_inc(&sb->s_active);
        down_write(&sb->s_umount);
        if (sb->s_writers.frozen != SB_UNFROZEN) {
                deactivate_locked_super(sb);
                return -EBUSY;
        }

        /*
         * NOTE(review): this path only releases s_umount; the s_active
         * reference taken above is not dropped here - confirm this is
         * intended.
         */
        if (!(sb->s_flags & SB_BORN)) {
                up_write(&sb->s_umount);
                return 0;        /* sic - it's "nothing to do" */
        }

        if (sb_rdonly(sb)) {
                /* Nothing to do really... */
                sb->s_writers.frozen = SB_FREEZE_COMPLETE;
                up_write(&sb->s_umount);
                return 0;
        }

        sb->s_writers.frozen = SB_FREEZE_WRITE;
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        up_write(&sb->s_umount);
        sb_wait_write(sb, SB_FREEZE_WRITE);
        down_write(&sb->s_umount);

        /* Now we go and block page faults... */
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

        /* All writers are done so after syncing there won't be dirty data */
        ret = sync_filesystem(sb);
        if (ret) {
                /*
                 * Sync failed: go back to unfrozen, release the freeze levels
                 * taken so far, wake waiters and drop our reference.
                 */
                sb->s_writers.frozen = SB_UNFROZEN;
                sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
                wake_up(&sb->s_writers.wait_unfrozen);
                deactivate_locked_super(sb);
                return ret;
        }

        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
        sb_wait_write(sb, SB_FREEZE_FS);

        /* ->freeze_fs() is optional; a failure undoes all freeze levels. */
        if (sb->s_op->freeze_fs) {
                ret = sb->s_op->freeze_fs(sb);
                if (ret) {
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
                        sb->s_writers.frozen = SB_UNFROZEN;
                        sb_freeze_unlock(sb, SB_FREEZE_FS);
                        wake_up(&sb->s_writers.wait_unfrozen);
                        deactivate_locked_super(sb);
                        return ret;
                }
        }
        /*
         * For debugging purposes so that fs can warn if it sees write activity
         * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
         */
        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        /* The freeze semaphores stay held; only lockdep is told otherwise. */
        lockdep_sb_freeze_release(sb);
        up_write(&sb->s_umount);
        return 0;
}
EXPORT_SYMBOL(freeze_super);

/**
 * thaw_super_locked - unlock filesystem, with s_umount already held
 * @sb: the super to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super().
 * The caller must hold @sb->s_umount for writing (thaw_super() takes it just
 * before calling in).  The error paths release s_umount with up_write(); the
 * success path ends in deactivate_locked_super(), which presumably releases
 * it as well - confirm.
 *
 * Return: 0 on success, -EINVAL if the filesystem is not in the
 * SB_FREEZE_COMPLETE state, or the error returned by ->unfreeze_fs().
 */
static int thaw_super_locked(struct super_block *sb)
{
        int error;

        /* Only a completely frozen filesystem can be thawed. */
        if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
                up_write(&sb->s_umount);
                return -EINVAL;
        }

        /*
         * freeze_super() takes no freeze semaphores for a read-only
         * filesystem, so there is nothing to release beyond the state.
         */
        if (sb_rdonly(sb)) {
                sb->s_writers.frozen = SB_UNFROZEN;
                goto out;
        }

        lockdep_sb_freeze_acquire(sb);

        /* ->unfreeze_fs() is optional; on failure the fs stays frozen. */
        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
                if (error) {
                        printk(KERN_ERR
                                "VFS:Filesystem thaw failed\n");
                        lockdep_sb_freeze_release(sb);
                        up_write(&sb->s_umount);
                        return error;
                }
        }

        sb->s_writers.frozen = SB_UNFROZEN;
        /* Release every freeze level below SB_FREEZE_FS. */
        sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
        wake_up(&sb->s_writers.wait_unfrozen);
        deactivate_locked_super(sb);
        return 0;
}

/*
 * Take sb->s_umount for writing and hand over to thaw_super_locked(), whose
 * result is returned unchanged.
 */
int thaw_super(struct super_block *sb)
{
        int ret;

        down_write(&sb->s_umount);
        ret = thaw_super_locked(sb);
        return ret;
}
EXPORT_SYMBOL(thaw_super);



































































































































































































































































































    1 


    1 
    1 




    1 

    1 


    1 


    1 
    1 



























































































































































































































    1 
































































    1 
    1 
































    1 



















    1 





    1 





    1 
    1 

























































































































































































































































    2 













































    2 



































































































































































































































































































































































































































































































































































































































    2 










    2 





    2 















    2 


















    2 



    2 










    2 












    2 









    2 
















    2 

























    2 





















    2 























    2 














    2 

    2 

    2 
    2 





    2 





    2 







    2 












    2 







    2 







    2 



















    2 









































































































































































































































































































































































    2 















    2 







    2 


    2 


    2 




    2 



























    2 







    2 





















    2 



























    2 





    2 











    2 









    2 


    2 
    2 


    2 

    2 


    2 




















    2 




    2 
    2 





























    2 
    2 


    2 





    2 


    2 


    2 
    2 















































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr.c
 *
 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 *
 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
 * Extended attributes for symlinks and special files added per
 *  suggestion of Luka Renko <luka.renko@hermes.si>.
 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
 *  Red Hat Inc.
 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
 *  and Andreas Gruenbacher <agruen@suse.de>.
 */

/*
 * Extended attributes are stored directly in inodes (on file systems with
 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
 * field contains the block number if an inode uses an additional block. All
 * attributes must fit in the inode and one additional block. Blocks that
 * contain the identical set of attributes may be shared among several inodes.
 * Identical blocks are detected by keeping a cache of blocks that have
 * recently been accessed.
 *
 * The attributes in inodes and on blocks have a different header; the entries
 * are stored in the same format:
 *
 *   +------------------+
 *   | header           |
 *   | entry 1          | |
 *   | entry 2          | | growing downwards
 *   | entry 3          | v
 *   | four null bytes  |
 *   | . . .            |
 *   | value 1          | ^
 *   | value 3          | | growing upwards
 *   | value 2          | |
 *   +------------------+
 *
 * The header is followed by multiple entry descriptors. In disk blocks, the
 * entry descriptors are kept sorted. In inodes, they are unsorted. The
 * attribute values are aligned to the end of the block in no specific order.
 *
 * Locking strategy
 * ----------------
 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
 * EA blocks are only changed if they are exclusive to an inode, so
 * holding xattr_sem also means that nothing but the EA block's reference
 * count can change. Multiple writers to the same block are synchronized
 * by the buffer lock.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"

/*
 * Debug message helpers.
 *
 * With EXT4_XATTR_DEBUG defined, ea_idebug() prints a KERN_DEBUG message
 * prefixed with the inode's superblock id and inode number, and ea_bdebug()
 * prints one prefixed with the buffer's block device and block number.
 * Without it, both expand to no_printk() with the same format arguments.
 */
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, fmt, ...)                                        \
        printk(KERN_DEBUG "inode %s:%lu: " fmt "\n",                        \
               inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)                                        \
        printk(KERN_DEBUG "block %pg:%lu: " fmt "\n",                        \
               bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__)
#else
# define ea_idebug(inode, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

static void ext4_xattr_block_cache_insert(struct mb_cache *,
                                          struct buffer_head *);
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
                            struct mb_cache_entry **);
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
                                    size_t value_count);
static void ext4_xattr_rehash(struct ext4_xattr_header *);

/*
 * Lookup table from an entry's name index (EXT4_XATTR_INDEX_*) to the
 * handler for that namespace.  It is indexed by ext4_xattr_handler();
 * slots without an initializer (including those compiled out by the
 * CONFIG_EXT4_FS_* options below) are NULL.
 */
static const struct xattr_handler * const ext4_xattr_handler_map[] = {
        [EXT4_XATTR_INDEX_USER]                     = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
        [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
        [EXT4_XATTR_INDEX_TRUSTED]             = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
        [EXT4_XATTR_INDEX_SECURITY]             = &ext4_xattr_security_handler,
#endif
        [EXT4_XATTR_INDEX_HURD]                     = &ext4_xattr_hurd_handler,
};

/*
 * NULL-terminated list of every xattr handler built into ext4.
 * NOTE(review): presumably installed as the superblock's handler list by
 * the mount code; that use is not visible in this file section - confirm.
 */
const struct xattr_handler *ext4_xattr_handlers[] = {
        &ext4_xattr_user_handler,
        &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        &posix_acl_access_xattr_handler,
        &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT4_FS_SECURITY
        &ext4_xattr_security_handler,
#endif
        &ext4_xattr_hurd_handler,
        NULL
};

/* mbcache of xattr blocks for the filesystem that @inode lives on. */
#define EA_BLOCK_CACHE(inode)        (((struct ext4_sb_info *) \
                                inode->i_sb->s_fs_info)->s_ea_block_cache)

/* mbcache of EA inodes for the filesystem that @inode lives on. */
#define EA_INODE_CACHE(inode)        (((struct ext4_sb_info *) \
                                inode->i_sb->s_fs_info)->s_ea_inode_cache)

static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
                        struct inode *inode);

#ifdef CONFIG_LOCKDEP
/*
 * Give the locks of an EA inode their own lockdep subclasses: i_rwsem is
 * put in subclass 1 and i_data_sem in subclass I_DATA_SEM_EA.
 * NOTE(review): presumably this lets EA inode locks nest inside the parent
 * inode's locks without lockdep reports - confirm against the lock users.
 */
void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
        struct ext4_inode_info *ei = EXT4_I(ea_inode);

        lockdep_set_subclass(&ea_inode->i_rwsem, 1);
        (void) ei;        /* shut up clang warning if !CONFIG_LOCKDEP */
        lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA);
}
#endif

/*
 * Compute the checksum of an xattr block.
 *
 * The checksum is seeded with the filesystem checksum seed and covers, in
 * order: the little-endian 64-bit block number, the header bytes before
 * h_checksum, four zero bytes standing in for h_checksum itself, and the
 * remainder of the block.  The result is returned in little-endian form.
 */
static __le32 ext4_xattr_block_csum(struct inode *inode,
                                    sector_t block_nr,
                                    struct ext4_xattr_header *hdr)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int csum_off = offsetof(struct ext4_xattr_header, h_checksum);
        int tail_off;
        __le64 blk_le = cpu_to_le64(block_nr);
        __u32 zero = 0;
        __u32 crc;

        /* Bind the checksum to the block's location on disk. */
        crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&blk_le,
                          sizeof(blk_le));

        /* Header up to, but not including, the stored checksum. */
        crc = ext4_chksum(sbi, crc, (__u8 *)hdr, csum_off);

        /* The checksum field itself is treated as zero. */
        crc = ext4_chksum(sbi, crc, (__u8 *)&zero, sizeof(zero));
        tail_off = csum_off + sizeof(zero);

        /* Everything after the checksum field, to the end of the block. */
        crc = ext4_chksum(sbi, crc, (__u8 *)hdr + tail_off,
                          EXT4_BLOCK_SIZE(inode->i_sb) - tail_off);

        return cpu_to_le32(crc);
}

/*
 * Check the stored checksum of an xattr block.
 *
 * Returns 1 when the filesystem has no metadata checksums or when the
 * stored h_checksum matches the computed one, 0 otherwise.  The buffer is
 * locked while the checksum is computed and compared.
 */
static int ext4_xattr_block_csum_verify(struct inode *inode,
                                        struct buffer_head *bh)
{
        struct ext4_xattr_header *hdr = BHDR(bh);
        int matches;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        lock_buffer(bh);
        matches = (hdr->h_checksum ==
                   ext4_xattr_block_csum(inode, bh->b_blocknr, hdr));
        unlock_buffer(bh);

        return matches;
}

static void ext4_xattr_block_csum_set(struct inode *inode,
                                      struct buffer_head *bh)
{
        if (ext4_has_metadata_csum(inode->i_sb))
                BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
                                                bh->b_blocknr, BHDR(bh));
}

/*
 * Translate a name index into its xattr handler.
 *
 * Returns NULL for indexes that are not positive, that lie beyond
 * ext4_xattr_handler_map, or whose slot in the map is empty.
 */
static inline const struct xattr_handler *
ext4_xattr_handler(int name_index)
{
        if (name_index <= 0 ||
            name_index >= ARRAY_SIZE(ext4_xattr_handler_map))
                return NULL;

        return ext4_xattr_handler_map[name_index];
}

/*
 * Validate a list of xattr entries and the values they point to.
 *
 * @entry:       first entry descriptor of the list
 * @end:         first byte past the region holding entries and values
 * @value_start: base address that each entry's e_value_offs is relative to
 *
 * Returns 0 if the list is consistent, -EFSCORRUPTED otherwise.
 */
static int
ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
                         void *value_start)
{
        struct ext4_xattr_entry *e = entry;

        /* Find the end of the names list */
        while (!IS_LAST_ENTRY(e)) {
                struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
                /* The following descriptor must start before 'end'. */
                if ((void *)next >= end)
                        return -EFSCORRUPTED;
                /* Reject names with a NUL byte inside their e_name_len bytes. */
                if (strnlen(e->e_name, e->e_name_len) != e->e_name_len)
                        return -EFSCORRUPTED;
                e = next;
        }
        /* 'e' now points at the terminating entry of the names list. */

        /* Check the values */
        while (!IS_LAST_ENTRY(entry)) {
                u32 size = le32_to_cpu(entry->e_value_size);

                if (size > EXT4_XATTR_SIZE_MAX)
                        return -EFSCORRUPTED;

                /* Only non-empty values with e_value_inum == 0 are checked here. */
                if (size != 0 && entry->e_value_inum == 0) {
                        u16 offs = le16_to_cpu(entry->e_value_offs);
                        void *value;

                        /*
                         * The value cannot overlap the names, and the value
                         * with padding cannot extend beyond 'end'.  Check both
                         * the padded and unpadded sizes, since the size may
                         * overflow to 0 when adding padding.
                         */
                        if (offs > end - value_start)
                                return -EFSCORRUPTED;
                        value = value_start + offs;
                        /* Values must begin after the 4-byte list terminator. */
                        if (value < (void *)e + sizeof(u32) ||
                            size > end - value ||
                            EXT4_XATTR_SIZE(size) > end - value)
                                return -EFSCORRUPTED;
                }
                entry = EXT4_XATTR_NEXT(entry);
        }

        return 0;
}

static inline int
__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
                         const char *function, unsigned int line)
{
        int error = -EFSCORRUPTED;

        if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
            BHDR(bh)->h_blocks != cpu_to_le32(1))
                goto errout;
        if (buffer_verified(bh))
                return 0;

        error = -EFSBADCRC;
        if (!ext4_xattr_block_csum_verify(inode, bh))
                goto errout;
        error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
                                         bh->b_data);
errout:
        if (error)
                __ext4_error_inode(inode, function, line, 0, -error,
                                   "corrupted xattr block %llu",
                                   (unsigned long long) bh->b_blocknr);
        else
                set_buffer_verified(bh);
        return error;
}

#define ext4_xattr_check_block(inode, bh) \
        __ext4_xattr_check_block((inode), (bh),  __func__, __LINE__)


int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
                         void *end, const char *function, unsigned int line)
{
        int error = -EFSCORRUPTED;

        if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
            (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
                goto errout;
        error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
        if (error)
                __ext4_error_inode(inode, function, line, 0, -error,
                                   "corrupted in-inode xattr");
        return error;
}

/*
 * Search an entry list for the attribute (@name_index, @name).
 *
 * @inode:      inode the list belongs to (used for error reporting)
 * @pentry:     in: first entry of the list; out: entry the search stopped at
 * @end:        first byte past the region holding the entries
 * @name_index: name index (namespace) to look for
 * @name:       attribute name without its namespace prefix
 * @sorted:     non-zero if the list is sorted, which allows stopping at the
 *              first entry that does not order before the requested name
 *
 * Entries are compared by name index, then by name length, then by the
 * name bytes.
 *
 * Returns 0 if the attribute was found, -ENODATA if it was not, -EINVAL if
 * @name is NULL and -EFSCORRUPTED if an entry's successor lies at or beyond
 * @end.  *pentry is left untouched on -EINVAL and -EFSCORRUPTED.
 */
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
                 void *end, int name_index, const char *name, int sorted)
{
        struct ext4_xattr_entry *entry, *next;
        size_t name_len;
        /* Non-zero so that an empty list yields -ENODATA. */
        int cmp = 1;

        if (name == NULL)
                return -EINVAL;
        name_len = strlen(name);
        for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) {
                next = EXT4_XATTR_NEXT(entry);
                if ((void *) next >= end) {
                        EXT4_ERROR_INODE(inode, "corrupted xattr entries");
                        return -EFSCORRUPTED;
                }
                cmp = name_index - entry->e_name_index;
                if (!cmp)
                        cmp = name_len - entry->e_name_len;
                if (!cmp)
                        cmp = memcmp(name, entry->e_name, name_len);
                /* Unsorted lists stop only on an exact match. */
                if (cmp <= 0 && (sorted || cmp == 0))
                        break;
        }
        *pentry = entry;
        return cmp ? -ENODATA : 0;
}

/*
 * Hash of an EA inode value: the ext4 checksum of @size bytes at @buffer,
 * seeded with the filesystem checksum seed.
 */
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
        u32 value_hash = ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);

        return value_hash;
}

static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
        return ((u64)ea_inode->i_ctime.tv_sec << 32) |
                (u32) inode_peek_iversion_raw(ea_inode);
}

/*
 * Store a 64-bit reference count in an EA inode: the upper 32 bits go to
 * i_ctime.tv_sec and the lower 32 bits to the raw i_version.
 */
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
        u32 upper = (u32)(ref_count >> 32);
        u64 lower = ref_count & 0xffffffff;

        ea_inode->i_ctime.tv_sec = upper;
        inode_set_iversion_raw(ea_inode, lower);
}

static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
        return (u32)ea_inode->i_atime.tv_sec;
}

/* Record the value hash of an EA inode in i_atime.tv_sec. */
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
        struct inode *target = ea_inode;

        target->i_atime.tv_sec = hash;
}

/*
 * Copy the first @size bytes of an EA inode's data into @buf.
 *
 * All blocks covering the value are read in one batch.  Up to eight buffer
 * heads are kept on the stack; larger values use a temporary array.
 *
 * Returns 0 on success, -ENOMEM if that array cannot be allocated,
 * -EFSCORRUPTED if a block of the value is missing, or the error from
 * ext4_bread_batch().
 */
static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
{
        int blocksize = 1 << ea_inode->i_blkbits;
        int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits;
        /* Bytes used in the final block; a full block if size is aligned. */
        int tail_size = (size % blocksize) ?: blocksize;
        struct buffer_head *bhs_inline[8];
        struct buffer_head **bhs = bhs_inline;
        int idx, ret;

        if (bh_count > ARRAY_SIZE(bhs_inline)) {
                bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS);
                if (!bhs)
                        return -ENOMEM;
        }

        ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count,
                               true /* wait */, bhs);
        if (ret)
                goto free_bhs;

        for (idx = 0; idx < bh_count; idx++) {
                int chunk = blocksize;

                if (idx == bh_count - 1)
                        chunk = tail_size;

                /* There shouldn't be any holes in ea_inode. */
                if (!bhs[idx]) {
                        ret = -EFSCORRUPTED;
                        goto put_bhs;
                }
                memcpy((char *)buf + blocksize * idx, bhs[idx]->b_data, chunk);
        }
        ret = 0;

put_bhs:
        for (idx = 0; idx < bh_count; idx++)
                brelse(bhs[idx]);
free_bhs:
        if (bhs != bhs_inline)
                kfree(bhs);
        return ret;
}

/*
 * Value of i_mtime.tv_sec truncated to 32 bits; compared against the
 * parent's inode number when detecting old Lustre-style EA inodes.
 */
#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)

/*
 * Look up the EA inode that holds an xattr value.
 *
 * @parent:        inode owning the xattr entry
 * @ea_ino:        inode number of the EA inode
 * @ea_inode_hash: hash stored in the xattr entry, used only to recognise
 *                 old Lustre-style EA inodes
 * @ea_inode:      on success, receives the EA inode; ext4_xattr_inode_get()
 *                 releases it with iput()
 *
 * Returns 0 on success, -EFSCORRUPTED if @ea_ino is the parent's own inode
 * number, or the error from ext4_iget().
 */
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
                                 u32 ea_inode_hash, struct inode **ea_inode)
{
        struct inode *inode;
        int err;

        /*
         * We have to check for this corruption early as otherwise
         * iget_locked() could wait indefinitely for the state of our
         * parent inode.
         */
        if (parent->i_ino == ea_ino) {
                ext4_error(parent->i_sb,
                           "Parent and EA inode have the same ino %lu", ea_ino);
                return -EFSCORRUPTED;
        }

        inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ext4_error(parent->i_sb,
                           "error while reading EA inode %lu err=%d", ea_ino,
                           err);
                return err;
        }
        ext4_xattr_inode_set_class(inode);

        /*
         * Check whether this is an old Lustre-style xattr inode. Lustre
         * implementation does not have hash validation, rather it has a
         * backpointer from ea_inode to the parent inode.
         */
        if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) &&
            EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino &&
            inode->i_generation == parent->i_generation) {
                /* Lustre-style: flag the inode and set its ref count to 1. */
                ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
                ext4_xattr_inode_set_ref(inode, 1);
        } else {
                /* Otherwise set S_NOQUOTA on the EA inode under its lock. */
                inode_lock_nested(inode, I_MUTEX_XATTR);
                inode->i_flags |= S_NOQUOTA;
                inode_unlock(inode);
        }

        *ea_inode = inode;
        return 0;
}

/*
 * Drop the mbcache entry of an EA inode that is being evicted.
 *
 * Does nothing if the filesystem has no EA inode cache.  Otherwise the
 * delete is retried for as long as mb_cache_entry_delete_or_get() hands
 * back an entry, waiting for that entry to become unused each time.
 */
void ext4_evict_ea_inode(struct inode *inode)
{
        struct mb_cache *cache = EA_INODE_CACHE(inode);
        struct mb_cache_entry *busy;

        if (!cache)
                return;

        for (;;) {
                busy = mb_cache_entry_delete_or_get(cache,
                                ext4_xattr_inode_get_hash(inode),
                                inode->i_ino);
                if (!busy)
                        break;
                /* Wait for entry to get unused so that we can remove it */
                mb_cache_entry_wait_unused(busy);
                mb_cache_entry_put(cache, busy);
        }
}

/*
 * Check the hashes protecting an EA inode value.
 *
 * The hash of the @size bytes at @buffer must equal the hash stored in
 * @ea_inode.  If @entry is given, the entry hash derived from the entry
 * name and that value hash must also equal entry->e_hash.
 *
 * Returns 0 if everything matches, -EFSCORRUPTED otherwise.
 */
static int
ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
                               struct ext4_xattr_entry *entry, void *buffer,
                               size_t size)
{
        u32 value_hash;
        __le32 value_hash_le;
        __le32 expected;

        /* Verify stored hash matches calculated hash. */
        value_hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer,
                                           size);
        if (value_hash != ext4_xattr_inode_get_hash(ea_inode))
                return -EFSCORRUPTED;

        if (!entry)
                return 0;

        /* Verify entry hash. */
        value_hash_le = cpu_to_le32(value_hash);
        expected = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len,
                                         &value_hash_le, 1);

        return expected == entry->e_hash ? 0 : -EFSCORRUPTED;
}

/*
 * Read an xattr value that is stored in an EA inode.
 *
 * The EA inode named by @entry is looked up, its size is checked against
 * @size, and its data is copied into @buffer.  Unless it is an old
 * Lustre-style EA inode, the value and entry hashes are verified and the
 * inode is added to the EA inode cache when one exists.
 *
 * Returns 0 on success or a negative error code.
 */
static int
ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
                     void *buffer, size_t size)
{
        struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
        struct inode *ea_inode;
        int err;

        err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum),
                                    le32_to_cpu(entry->e_hash), &ea_inode);
        if (err)
                return err;        /* no inode reference was taken */

        if (i_size_read(ea_inode) != size) {
                ext4_warning_inode(ea_inode,
                                   "ea_inode file size=%llu entry size=%zu",
                                   i_size_read(ea_inode), size);
                err = -EFSCORRUPTED;
                goto out_put;
        }

        err = ext4_xattr_inode_read(ea_inode, buffer, size);
        if (err)
                goto out_put;

        /* Lustre-style EA inodes are not hash-checked or cached here. */
        if (ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE))
                goto out_put;

        err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size);
        if (err) {
                ext4_warning_inode(ea_inode,
                                   "EA inode hash validation failed");
                goto out_put;
        }

        if (ea_inode_cache)
                mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
                                ext4_xattr_inode_get_hash(ea_inode),
                                ea_inode->i_ino, true /* reusable */);

out_put:
        iput(ea_inode);
        return err;
}

/*
 * Look up an attribute in the inode's external xattr block.
 *
 * With a NULL @buffer only the value size is reported; otherwise the value
 * is copied into @buffer, which must hold at least that many bytes.
 *
 * Returns the value size on success, -ENODATA if the inode has no xattr
 * block or the attribute is not in it, -ERANGE if the value is larger than
 * EXT4_XATTR_SIZE_MAX, does not fit in @buffer or would extend past the end
 * of the block, or another negative error from reading or validating the
 * block or the EA inode.
 */
static int
ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
                     void *buffer, size_t buffer_size)
{
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        struct ext4_xattr_entry *entry;
        struct buffer_head *bh;
        size_t size;
        void *end;
        int error;

        ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
                  name_index, name, buffer, (long)buffer_size);

        if (!EXT4_I(inode)->i_file_acl)
                return -ENODATA;

        ea_idebug(inode, "reading block %llu",
                  (unsigned long long)EXT4_I(inode)->i_file_acl);
        bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));

        error = ext4_xattr_check_block(inode, bh);
        if (error)
                goto cleanup;
        ext4_xattr_block_cache_insert(ea_block_cache, bh);

        entry = BFIRST(bh);
        end = bh->b_data + bh->b_size;
        error = xattr_find_entry(inode, &entry, end, name_index, name, 1);
        if (error)
                goto cleanup;

        size = le32_to_cpu(entry->e_value_size);
        if (unlikely(size > EXT4_XATTR_SIZE_MAX)) {
                error = -ERANGE;
                goto cleanup;
        }

        /* Size query only: nothing to copy. */
        if (!buffer) {
                error = size;
                goto cleanup;
        }

        if (size > buffer_size) {
                error = -ERANGE;
                goto cleanup;
        }

        if (entry->e_value_inum) {
                /* Value is stored in a separate EA inode. */
                error = ext4_xattr_inode_get(inode, entry, buffer, size);
                if (error)
                        goto cleanup;
        } else {
                /* Value is stored in the block, at an offset from its start. */
                u16 offset = le16_to_cpu(entry->e_value_offs);
                void *p = bh->b_data + offset;

                if (unlikely(p + size > end)) {
                        error = -ERANGE;
                        goto cleanup;
                }
                memcpy(buffer, p, size);
        }
        error = size;

cleanup:
        brelse(bh);
        return error;
}

/*
 * Look up an attribute in the in-inode xattr area.
 *
 * With a NULL @buffer only the value size is reported; otherwise the value
 * is copied into @buffer, which must hold at least that many bytes.
 *
 * Returns the value size on success, -ENODATA if the inode has no in-inode
 * xattrs or the attribute is not among them, -ERANGE if the value is larger
 * than EXT4_XATTR_SIZE_MAX, does not fit in @buffer or would extend past
 * the end of the in-inode area, or another negative error from locating the
 * inode or reading the EA inode.
 */
int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
                     void *buffer, size_t buffer_size)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_xattr_entry *entry;
        struct ext4_inode *raw_inode;
        struct ext4_iloc iloc;
        size_t size;
        void *end;
        int error;

        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return -ENODATA;

        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
                return error;

        raw_inode = ext4_raw_inode(&iloc);
        header = IHDR(inode, raw_inode);
        end = ITAIL(inode, raw_inode);
        entry = IFIRST(header);

        error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
        if (error)
                goto cleanup;

        size = le32_to_cpu(entry->e_value_size);
        if (unlikely(size > EXT4_XATTR_SIZE_MAX)) {
                error = -ERANGE;
                goto cleanup;
        }

        /* Size query only: nothing to copy. */
        if (!buffer) {
                error = size;
                goto cleanup;
        }

        if (size > buffer_size) {
                error = -ERANGE;
                goto cleanup;
        }

        if (entry->e_value_inum) {
                /* Value is stored in a separate EA inode. */
                error = ext4_xattr_inode_get(inode, entry, buffer, size);
                if (error)
                        goto cleanup;
        } else {
                /* In-inode value offsets are relative to the first entry. */
                u16 offset = le16_to_cpu(entry->e_value_offs);
                void *p = (void *)IFIRST(header) + offset;

                if (unlikely(p + size > end)) {
                        error = -ERANGE;
                        goto cleanup;
                }
                memcpy(buffer, p, size);
        }
        error = size;

cleanup:
        brelse(iloc.bh);
        return error;
}

/*
 * ext4_xattr_get()
 *
 * Copy an extended attribute into the buffer
 * provided, or compute the buffer size required.
 * Buffer is NULL to compute the size of the buffer required.
 *
 * The in-inode area is searched first; the external xattr block is only
 * consulted when the attribute is not found there.  Both lookups run with
 * the inode's xattr_sem held for reading.
 *
 * Returns a negative error number on failure, or the number of bytes
 * used / required on success.  Specific errors produced here are -EIO
 * when the filesystem has been shut down, -EINVAL for a NULL name and
 * -ERANGE for a name longer than 255 bytes.
 */
int
ext4_xattr_get(struct inode *inode, int name_index, const char *name,
               void *buffer, size_t buffer_size)
{
        int error;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        /*
         * The strlen() below would dereference a NULL name before the
         * NULL check in xattr_find_entry() could ever be reached, so
         * reject it here with the same error code.
         */
        if (name == NULL)
                return -EINVAL;

        if (strlen(name) > 255)
                return -ERANGE;

        down_read(&EXT4_I(inode)->xattr_sem);
        error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
                                     buffer_size);
        if (error == -ENODATA)
                error = ext4_xattr_block_get(inode, name_index, name, buffer,
                                             buffer_size);
        up_read(&EXT4_I(inode)->xattr_sem);
        return error;
}

/*
 * Emit the full names ("prefix" + name + NUL) of the entries in a list.
 *
 * Entries whose name index has no handler, or whose handler has a ->list
 * callback that returns false for @dentry, are skipped.  With a NULL
 * @buffer nothing is written and only the space needed is counted.
 *
 * Returns the total number of bytes used / required, or -ERANGE if a name
 * does not fit in what is left of @buffer.
 */
static int
ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
                        char *buffer, size_t buffer_size)
{
        size_t used = 0;

        while (!IS_LAST_ENTRY(entry)) {
                const struct xattr_handler *handler =
                        ext4_xattr_handler(entry->e_name_index);
                const char *prefix;
                size_t prefix_len;
                size_t need;

                if (!handler || (handler->list && !handler->list(dentry)))
                        goto next;

                prefix = handler->prefix ?: handler->name;
                prefix_len = strlen(prefix);
                need = prefix_len + entry->e_name_len + 1;

                if (buffer) {
                        if (need > buffer_size - used)
                                return -ERANGE;
                        memcpy(buffer, prefix, prefix_len);
                        buffer += prefix_len;
                        memcpy(buffer, entry->e_name, entry->e_name_len);
                        buffer += entry->e_name_len;
                        *buffer++ = 0;
                }
                used += need;
next:
                entry = EXT4_XATTR_NEXT(entry);
        }
        return used;  /* total size */
}

/*
 * List the attribute names stored in the inode's external xattr block.
 *
 * Returns 0 if the inode has no xattr block, otherwise the result of
 * ext4_xattr_list_entries() on the block's entries, or a negative error
 * from reading or validating the block.
 */
static int
ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct inode *inode = d_inode(dentry);
        struct buffer_head *bh;
        int error;

        ea_idebug(inode, "buffer=%p, buffer_size=%ld",
                  buffer, (long)buffer_size);

        if (!EXT4_I(inode)->i_file_acl)
                return 0;

        ea_idebug(inode, "reading block %llu",
                  (unsigned long long)EXT4_I(inode)->i_file_acl);
        bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));

        error = ext4_xattr_check_block(inode, bh);
        if (!error) {
                /* Only a validated block is cached and listed. */
                ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
                error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
                                                buffer_size);
        }

        brelse(bh);
        return error;
}

/*
 * List the attribute names stored in the inode body.
 *
 * Returns 0 when the inode does not have EXT4_STATE_XATTR set, a negative
 * error number when the on-disk inode cannot be located, and otherwise the
 * result of ext4_xattr_list_entries() for the in-inode entries.
 */
static int
ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct inode *inode = d_inode(dentry);
        struct ext4_xattr_ibody_header *ihdr;
        struct ext4_inode *raw;
        struct ext4_iloc iloc;
        int ret;

        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return 0;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        raw = ext4_raw_inode(&iloc);
        ihdr = IHDR(inode, raw);
        ret = ext4_xattr_list_entries(dentry, IFIRST(ihdr), buffer,
                                      buffer_size);

        /* Drop the buffer obtained by ext4_get_inode_loc(). */
        brelse(iloc.bh);
        return ret;
}

/*
 * Inode operation listxattr()
 *
 * d_inode(dentry)->i_rwsem: don't care
 *
 * Copies the attribute names into @buffer, in-inode names first and the
 * names from the external xattr block after them.  When @buffer is NULL
 * nothing is copied and only the required size is computed.
 *
 * The inode's xattr_sem is held for reading for the duration of the call.
 *
 * Returns a negative error number on failure, or the number of bytes
 * used / required on success.
 */
ssize_t
ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct ext4_inode_info *ei = EXT4_I(d_inode(dentry));
        int ibody_len;
        int ret;

        down_read(&ei->xattr_sem);

        ibody_len = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
        ret = ibody_len;
        if (ret < 0)
                goto unlock;

        /* Advance past the in-inode names before listing the block. */
        if (buffer) {
                buffer += ibody_len;
                buffer_size -= ibody_len;
        }

        ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
        if (ret >= 0)
                ret += ibody_len;

unlock:
        up_read(&ei->xattr_sem);
        return ret;
}

/*
 * Make sure the xattr feature flag is set on this file system.
 *
 * Does nothing when ext4_has_feature_xattr() already reports the feature.
 * Otherwise the superblock buffer is opened for journalled write access
 * and, only if that succeeds, the feature is set and the superblock is
 * marked dirty.  A failure to get write access is silently ignored.
 */
static void ext4_xattr_update_super_block(handle_t *handle,
                                          struct super_block *sb)
{
        struct buffer_head *sbh;

        if (ext4_has_feature_xattr(sb))
                return;

        sbh = EXT4_SB(sb)->s_sbh;
        BUFFER_TRACE(sbh, "get_write_access");
        if (ext4_journal_get_write_access(handle, sbh) != 0)
                return;

        ext4_set_feature_xattr(sb);
        ext4_handle_dirty_super(handle, sb);
}

/*
 * Compute the inode usage figure for @inode and store it in *@usage.
 *
 * The figure is 1 plus the number of xattr entries, in the inode body and
 * in the external xattr block, whose e_value_inum is non-zero.
 *
 * The caller must hold the inode's xattr_sem for reading.
 *
 * Returns 0 on success.  On failure a negative error number is returned
 * and *@usage is left untouched.
 */
int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
{
        struct ext4_iloc iloc = { .bh = NULL };
        struct buffer_head *block_bh = NULL;
        struct ext4_xattr_ibody_header *ihdr;
        struct ext4_inode *raw;
        struct ext4_xattr_entry *e;
        qsize_t nr_ea_inodes = 0;
        int ret;

        lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);

        /* Entries stored in the inode body. */
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                ret = ext4_get_inode_loc(inode, &iloc);
                if (ret)
                        goto out;
                raw = ext4_raw_inode(&iloc);
                ihdr = IHDR(inode, raw);

                e = IFIRST(ihdr);
                while (!IS_LAST_ENTRY(e)) {
                        if (e->e_value_inum)
                                nr_ea_inodes++;
                        e = EXT4_XATTR_NEXT(e);
                }
        }

        /* Entries stored in the external xattr block. */
        if (EXT4_I(inode)->i_file_acl) {
                block_bh = ext4_sb_bread(inode->i_sb,
                                         EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(block_bh)) {
                        ret = PTR_ERR(block_bh);
                        /* Keep the brelse() below from seeing an ERR_PTR. */
                        block_bh = NULL;
                        goto out;
                }

                ret = ext4_xattr_check_block(inode, block_bh);
                if (ret)
                        goto out;

                e = BFIRST(block_bh);
                while (!IS_LAST_ENTRY(e)) {
                        if (e->e_value_inum)
                                nr_ea_inodes++;
                        e = EXT4_XATTR_NEXT(e);
                }
        }

        *usage = nr_ea_inodes + 1;
        ret = 0;
out:
        brelse(iloc.bh);
        brelse(block_bh);
        return ret;
}

/*
 * Round @length up to a multiple of the cluster size, where the cluster
 * size in bytes is 1 << (s_cluster_bits + i_blkbits).
 */
static inline size_t round_up_cluster(struct inode *inode, size_t length)
{
        const size_t cluster_size =
                1 << (EXT4_SB(inode->i_sb)->s_cluster_bits + inode->i_blkbits);

        length += cluster_size - 1;
        return length & ~(cluster_size - 1);
}

/*
 * Charge @inode's quota for one inode plus @len bytes rounded up to a
 * whole cluster.
 *
 * If the space charge fails, the inode charge is undone so that nothing
 * remains charged.  Returns 0 on success or the error from the failing
 * dquot call.
 */
static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
{
        int err = dquot_alloc_inode(inode);

        if (err)
                return err;

        err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
        if (!err)
                return 0;

        dquot_free_inode(inode);
        return err;
}

/*
 * Return to @parent's quota one inode plus @len bytes rounded up to a
 * whole cluster.
 *
 * Nothing is released when @ea_inode is non-NULL and has
 * EXT4_STATE_LUSTRE_EA_INODE set.
 * NOTE(review): presumably such EA inodes were never charged to the
 * parent - confirm against the code that sets this state.
 */
static void ext4_xattr_inode_free_quota(struct inode *parent,
                                        struct inode *ea_inode,
                                        size_t len)
{
        bool lustre_ea = ea_inode &&
                ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE);

        if (!lustre_ea) {
                dquot_free_space_nodirty(parent,
                                         round_up_cluster(parent, len));
                dquot_free_inode(parent);
        }
}

/*
 * Estimate the number of journal credits needed to set an xattr.
 *
 * @sb:        super block of the file system
 * @inode:     inode being modified; may be NULL, in which case the
 *             inline-data allowance is skipped
 * @block_bh:  existing xattr block, or NULL; when given, one credit is
 *             added per entry in it that references an EA inode
 * @value_len: length of the new value in bytes
 * @is_create: false when an existing value may be replaced, which adds
 *             credits for releasing an old EA inode
 *
 * Returns the credit estimate.
 */
int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
                             struct buffer_head *block_bh, size_t value_len,
                             bool is_create)
{
        struct ext4_xattr_entry *entry;
        int nr_blocks;
        int credits;

        /*
         * Base cost:
         * 1) Owner inode update
         * 2) Ref count update on old xattr block
         * 3) new xattr block
         * 4) block bitmap update for new xattr block
         * 5) group descriptor for new xattr block
         * 6) block bitmap update for old xattr block
         * 7) group descriptor for old block
         *
         * 6 & 7 can happen if we have two racing threads T_a and T_b
         * which are each trying to set an xattr on inodes I_a and I_b
         * which were both initially sharing an xattr block.
         */
        credits = 7;

        /* Quota updates. */
        credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);

        /*
         * Inline data may have to be pushed out to a block, so reserve
         * credits for that eventuality.
         */
        if (inode && ext4_has_inline_data(inode))
                credits += ext4_writepage_trans_blocks(inode) + 1;

        /* Without the ea_inode feature there is nothing more to add. */
        if (!ext4_has_feature_ea_inode(sb))
                return credits;

        /* New ea_inode, inode map, block bitmap, group descriptor. */
        credits += 4;

        /* Data blocks for the new value ... */
        nr_blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
        /* ... plus an indirection block or one level of extent tree. */
        nr_blocks++;

        /* Block bitmap and group descriptor updates for each block. */
        credits += 2 * nr_blocks;
        /* Blocks themselves. */
        credits += nr_blocks;

        if (!is_create) {
                /*
                 * Dereference ea_inode holding old xattr value.
                 * Old ea_inode, inode map, block bitmap, group descriptor.
                 */
                credits += 4;

                /* Data blocks for old ea_inode ... */
                nr_blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
                /*
                 * ... plus an indirection block or one level of extent
                 * tree for old ea_inode.
                 */
                nr_blocks++;

                /* Block bitmap and group descriptor updates for each block. */
                credits += 2 * nr_blocks;
        }

        /*
         * We may need to clone the existing xattr block, in which case the
         * ref counts of the ea_inodes it references must be incremented:
         * one credit per such entry.
         */
        if (!block_bh)
                return credits;

        entry = BFIRST(block_bh);
        while (!IS_LAST_ENTRY(entry)) {
                if (entry->e_value_inum)
                        credits += 1;
                entry = EXT4_XATTR_NEXT(entry);
        }
        return credits;
}

/*
 * Adjust the reference count stored in @ea_inode by @ref_change.
 *
 * The whole update runs with @ea_inode locked (I_MUTEX_XATTR nesting
 * class).  A change that would take the count below zero or above U64_MAX
 * is reported as file system corruption and rejected.
 *
 * When an increment brings the count to 1, the link count is set to 1 and
 * the inode is removed from the orphan list.  When a decrement brings the
 * count to 0, the link count is cleared and the inode is added to the
 * orphan list.
 *
 * Returns 0 on success, -EFSCORRUPTED on wraparound, or the error from
 * reserving / dirtying the inode.
 */
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
                                       int ref_change)
{
        struct ext4_iloc iloc;
        u64 ref_count;
        int ret;

        inode_lock_nested(ea_inode, I_MUTEX_XATTR);

        ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
        if (ret)
                goto out;

        ref_count = ext4_xattr_inode_get_ref(ea_inode);
        if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) {
                /*
                 * ref_count is unsigned: print it with %llu so that the
                 * U64_MAX overflow case is not reported as -1.
                 */
                ext4_error_inode(ea_inode, __func__, __LINE__, 0,
                        "EA inode %lu ref wraparound: ref_count=%llu ref_change=%d",
                        ea_inode->i_ino, ref_count, ref_change);
                /* The iloc is not handed to ext4_mark_iloc_dirty(); drop it. */
                brelse(iloc.bh);
                ret = -EFSCORRUPTED;
                goto out;
        }
        ref_count += ref_change;
        ext4_xattr_inode_set_ref(ea_inode, ref_count);

        if (ref_change > 0) {
                /* First reference: the inode becomes live again. */
                if (ref_count == 1) {
                        WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
                                  ea_inode->i_ino, ea_inode->i_nlink);

                        set_nlink(ea_inode, 1);
                        ext4_orphan_del(handle, ea_inode);
                }
        } else {
                /* Last reference gone: queue the inode on the orphan list. */
                if (ref_count == 0) {
                        WARN_ONCE(ea_inode->i_nlink != 1,
                                  "EA inode %lu i_nlink=%u",
                                  ea_inode->i_ino, ea_inode->i_nlink);

                        clear_nlink(ea_inode);
                        ext4_orphan_add(handle, ea_inode);
                }
        }

        ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
        if (ret)
                ext4_warning_inode(ea_inode,
                                   "ext4_mark_iloc_dirty() failed ret=%d", ret);
out:
        inode_unlock(ea_inode);
        return ret;
}

/* Take one reference on @ea_inode; see ext4_xattr_inode_update_ref(). */
static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
{
        const int delta = 1;

        return ext4_xattr_inode_update_ref(handle, ea_inode, delta);
}

/* Drop one reference on @ea_inode; see ext4_xattr_inode_update_ref(). */
static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
{
        const int delta = -1;

        return ext4_xattr_inode_update_ref(handle, ea_inode, delta);
}

/*
 * Take one reference on every EA inode referenced by the entry list that
 * starts at @first.
 *
 * If looking up or referencing any EA inode fails, the references taken
 * for the entries before the failing one are dropped again on a
 * best-effort basis (failures there are only logged) and the original
 * error is returned.
 *
 * Returns 0 on success or a negative error number.
 */
static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
                                        struct ext4_xattr_entry *first)
{
        struct ext4_xattr_entry *entry;
        struct ext4_xattr_entry *stop;
        struct inode *ea_inode;
        unsigned int ea_ino;
        int err, first_err;

        entry = first;
        while (!IS_LAST_ENTRY(entry)) {
                if (entry->e_value_inum) {
                        ea_ino = le32_to_cpu(entry->e_value_inum);
                        err = ext4_xattr_inode_iget(parent, ea_ino,
                                                    le32_to_cpu(entry->e_hash),
                                                    &ea_inode);
                        if (err)
                                goto rollback;

                        err = ext4_xattr_inode_inc_ref(handle, ea_inode);
                        if (err)
                                ext4_warning_inode(ea_inode,
                                                   "inc ref error %d", err);
                        iput(ea_inode);
                        if (err)
                                goto rollback;
                }
                entry = EXT4_XATTR_NEXT(entry);
        }
        return 0;

rollback:
        /* Undo the entries before the one that failed. */
        first_err = err;
        stop = entry;

        for (entry = first; entry != stop; entry = EXT4_XATTR_NEXT(entry)) {
                if (!entry->e_value_inum)
                        continue;

                ea_ino = le32_to_cpu(entry->e_value_inum);
                err = ext4_xattr_inode_iget(parent, ea_ino,
                                            le32_to_cpu(entry->e_hash),
                                            &ea_inode);
                if (err) {
                        ext4_warning(parent->i_sb,
                                     "cleanup ea_ino %u iget error %d", ea_ino,
                                     err);
                        continue;
                }

                err = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (err)
                        ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
                                           err);
                iput(ea_inode);
        }
        return first_err;
}

/*
 * Write out pending changes to @bh; passed to
 * ext4_journal_ensure_credits_fn() by ext4_xattr_inode_dec_ref_all().
 *
 * Does nothing unless @bh is non-NULL and @dirty is set.  Otherwise the
 * block checksum is refreshed first when @block_csum is set, and the
 * buffer is then marked dirty in the journal.
 *
 * Returns 0 on success or the error from ext4_handle_dirty_metadata().
 */
static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
                        struct buffer_head *bh, bool block_csum, bool dirty)
{
        int error;

        if (!bh || !dirty)
                return 0;

        if (block_csum)
                ext4_xattr_block_csum_set(inode, bh);

        error = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!error)
                return 0;

        ext4_warning(inode->i_sb, "Handle metadata (error %d)", error);
        return error;
}

/*
 * Drop one reference from every EA inode referenced by the xattr entries
 * starting at @first, and clear those references from the entries.
 *
 * @handle:         journal handle
 * @parent:         inode owning the xattr entries
 * @bh:             buffer holding the entries; re-dirtied whenever an
 *                  entry was modified
 * @first:          first entry to examine
 * @block_csum:     true when the entries live in an xattr block (the walk
 *                  is bounded by the end of @bh and the block checksum is
 *                  refreshed on journal restart); false when they live in
 *                  the inode body (the walk is bounded by the end of the
 *                  on-disk inode)
 * @ea_inode_array: collects the EA inodes that were looked up; their inode
 *                  references are not dropped here once they are stored
 * @extra_credits:  credits to keep available on top of the two needed per
 *                  EA inode
 * @skip_quota:     when true, the parent's quota is not adjusted
 *
 * This is best effort: a failure on one entry is logged (or, for a failed
 * lookup, silently skipped) and the walk moves on to the next entry.
 */
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
                             struct buffer_head *bh,
                             struct ext4_xattr_entry *first, bool block_csum,
                             struct ext4_xattr_inode_array **ea_inode_array,
                             int extra_credits, bool skip_quota)
{
        struct inode *ea_inode;
        struct ext4_xattr_entry *entry;
        /* .bh stays NULL on the block path so the final brelse() is a no-op. */
        struct ext4_iloc iloc = { .bh = NULL };
        bool dirty = false;
        unsigned int ea_ino;
        int err;
        int credits;
        void *end;

        /* Work out where the entry area ends so the walk cannot overrun it. */
        if (block_csum)
                end = (void *)bh->b_data + bh->b_size;
        else {
                err = ext4_get_inode_loc(parent, &iloc);
                if (err) {
                        EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err);
                        return;
                }
                end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
        }

        /* One credit for dec ref on ea_inode, one for orphan list addition, */
        credits = 2 + extra_credits;

        for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
             entry = EXT4_XATTR_NEXT(entry)) {
                if (!entry->e_value_inum)
                        continue;
                ea_ino = le32_to_cpu(entry->e_value_inum);
                err = ext4_xattr_inode_iget(parent, ea_ino,
                                            le32_to_cpu(entry->e_hash),
                                            &ea_inode);
                if (err)
                        continue;

                err = ext4_expand_inode_array(ea_inode_array, ea_inode);
                if (err) {
                        ext4_warning_inode(ea_inode,
                                           "Expand inode array err=%d", err);
                        /* Not stored in the array, so drop it here. */
                        iput(ea_inode);
                        continue;
                }

                err = ext4_journal_ensure_credits_fn(handle, credits, credits,
                        ext4_free_metadata_revoke_credits(parent->i_sb, 1),
                        ext4_xattr_restart_fn(handle, parent, bh, block_csum,
                                              dirty));
                if (err < 0) {
                        ext4_warning_inode(ea_inode, "Ensure credits err=%d",
                                           err);
                        continue;
                }
                if (err > 0) {
                        /*
                         * A positive return means write access to bh has to
                         * be obtained again before it is modified further.
                         */
                        err = ext4_journal_get_write_access(handle, bh);
                        if (err) {
                                ext4_warning_inode(ea_inode,
                                                "Re-get write access err=%d",
                                                err);
                                continue;
                        }
                }

                err = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (err) {
                        ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
                                           err);
                        continue;
                }

                if (!skip_quota)
                        ext4_xattr_inode_free_quota(parent, ea_inode,
                                              le32_to_cpu(entry->e_value_size));

                /*
                 * Forget about ea_inode within the same transaction that
                 * decrements the ref count. This avoids duplicate decrements in
                 * case the rest of the work spills over to subsequent
                 * transactions.
                 */
                entry->e_value_inum = 0;
                entry->e_value_size = 0;

                dirty = true;
        }

        if (dirty) {
                /*
                 * Note that we are deliberately skipping csum calculation for
                 * the final update because we do not expect any journal
                 * restarts until xattr block is freed.
                 */

                err = ext4_handle_dirty_metadata(handle, NULL, bh);
                if (err)
                        ext4_warning_inode(parent,
                                           "handle dirty metadata err=%d", err);
        }

        /*
         * Release the buffer obtained by ext4_get_inode_loc() on the
         * in-inode path; it was only needed to compute 'end' and was
         * previously leaked.
         */
        brelse(iloc.bh);
}

/*
 * Release the xattr block BH: If the reference count is > 1, decrement it;
 * otherwise free the block.
 *
 * @handle:         journal handle
 * @inode:          inode giving up its reference to the block
 * @bh:             buffer of the xattr block
 * @ea_inode_array: passed through to ext4_xattr_inode_dec_ref_all() when
 *                  the block is freed on an ea_inode file system
 * @extra_credits:  passed through to ext4_xattr_inode_dec_ref_all()
 *
 * Nothing is returned; any error is reported through ext4_std_error().
 */
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh,
                         struct ext4_xattr_inode_array **ea_inode_array,
                         int extra_credits)
{
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        u32 hash, ref;
        int error = 0;

        BUFFER_TRACE(bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, bh);
        if (error)
                goto out;

retry_ref:
        /* The header's hash and refcount are sampled under the buffer lock. */
        lock_buffer(bh);
        hash = le32_to_cpu(BHDR(bh)->h_hash);
        ref = le32_to_cpu(BHDR(bh)->h_refcount);
        if (ref == 1) {
                /* We hold the last reference: the block is to be freed. */
                ea_bdebug(bh, "refcount now=0; freeing");
                /*
                 * This must happen under buffer lock for
                 * ext4_xattr_block_set() to reliably detect freed block
                 */
                if (ea_block_cache) {
                        struct mb_cache_entry *oe;

                        oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
                                                          bh->b_blocknr);
                        if (oe) {
                                /*
                                 * A cache entry was returned instead of
                                 * being deleted: drop the buffer lock, wait
                                 * for the entry to become unused, and
                                 * re-sample the refcount from the top.
                                 */
                                unlock_buffer(bh);
                                mb_cache_entry_wait_unused(oe);
                                mb_cache_entry_put(ea_block_cache, oe);
                                goto retry_ref;
                        }
                }
                /*
                 * Take an extra reference on bh before freeing the block.
                 * NOTE(review): presumably this balances a reference dropped
                 * on the EXT4_FREE_BLOCKS_FORGET path - confirm.
                 */
                get_bh(bh);
                unlock_buffer(bh);

                /* Drop the EA inode references held by this block's entries. */
                if (ext4_has_feature_ea_inode(inode->i_sb))
                        ext4_xattr_inode_dec_ref_all(handle, inode, bh,
                                                     BFIRST(bh),
                                                     true /* block_csum */,
                                                     ea_inode_array,
                                                     extra_credits,
                                                     true /* skip_quota */);
                ext4_free_blocks(handle, inode, bh, 0, 1,
                                 EXT4_FREE_BLOCKS_METADATA |
                                 EXT4_FREE_BLOCKS_FORGET);
        } else {
                /* Other holders remain: just drop our reference. */
                ref--;
                BHDR(bh)->h_refcount = cpu_to_le32(ref);
                if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
                        struct mb_cache_entry *ce;

                        /*
                         * The count has just dropped below
                         * EXT4_XATTR_REFCOUNT_MAX: flag the cache entry,
                         * if there is one, as reusable again.
                         */
                        if (ea_block_cache) {
                                ce = mb_cache_entry_get(ea_block_cache, hash,
                                                        bh->b_blocknr);
                                if (ce) {
                                        set_bit(MBE_REUSABLE_B, &ce->e_flags);
                                        mb_cache_entry_put(ea_block_cache, ce);
                                }
                        }
                }

                /* The header changed, so refresh the block checksum. */
                ext4_xattr_block_csum_set(inode, bh);
                /*
                 * Beware of this ugliness: Releasing of xattr block references
                 * from different inodes can race and so we have to protect
                 * from a race where someone else frees the block (and releases
                 * its journal_head) before we are done dirtying the buffer. In
                 * nojournal mode this race is harmless and we actually cannot
                 * call ext4_handle_dirty_metadata() with locked buffer as
                 * that function can call sync_dirty_buffer() so for that case
                 * we handle the dirtying after unlocking the buffer.
                 */
                if (ext4_handle_valid(handle))
                        error = ext4_handle_dirty_metadata(handle, inode, bh);
                unlock_buffer(bh);
                if (!ext4_handle_valid(handle))
                        error = ext4_handle_dirty_metadata(handle, inode, bh);
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
                /* Give back the quota for one cluster's worth of blocks. */
                dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
                ea_bdebug(bh, "refcount now=%d; releasing",
                          le32_to_cpu(BHDR(bh)->h_refcount));
        }
out:
        ext4_std_error(inode->i_sb, error);
        return;
}

/*
 * Find the available free space for EAs. This also returns the total number of
 * bytes used by EA entries.
 *
 * @last:     first entry to walk from; the walk runs to the terminator
 * @min_offs: in/out; lowered to the smallest e_value_offs found among
 *            entries that have a non-empty value and no e_value_inum
 * @base:     address the entry offsets are measured from
 * @total:    optional; incremented by EXT4_XATTR_LEN() of each entry name
 *
 * Returns *@min_offs minus the offset of the terminating entry from @base,
 * minus sizeof(__u32).
 */
static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
                                    size_t *min_offs, void *base, int *total)
{
        while (!IS_LAST_ENTRY(last)) {
                if (!last->e_value_inum && last->e_value_size) {
                        size_t value_offs = le16_to_cpu(last->e_value_offs);

                        if (value_offs < *min_offs)
                                *min_offs = value_offs;
                }

                if (total)
                        *total += EXT4_XATTR_LEN(last->e_name_len);

                last = EXT4_XATTR_NEXT(last);
        }

        return (*min_offs - ((void *)last - base) - sizeof(__u32));
}

/*
 * Write the value of the EA in an inode.
 *
 * @handle:   journal handle
 * @ea_inode: inode that receives the value
 * @buf:      value bytes
 * @bufsize:  number of bytes in @buf
 *
 * Works in two passes: first the blocks needed for @bufsize bytes are
 * mapped with EXT4_GET_BLOCKS_CREATE, then @buf is copied into them one
 * block at a time and the inode size is set to the number of bytes written.
 *
 * Returns 0 on success or a negative error number.
 */
static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
                                  const void *buf, int bufsize)
{
        struct buffer_head *bh = NULL;
        unsigned long block = 0;
        int blocksize = ea_inode->i_sb->s_blocksize;
        int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
        int csize, wsize = 0;
        int ret = 0, ret2 = 0;
        int retries = 0;

retry:
        /*
         * Pass 1: map blocks.  'ret' is the result of the previous
         * ext4_map_blocks() call (0 on entry and on retry); each round
         * advances the start block by it and shrinks the remaining count
         * by it.
         */
        while (ret >= 0 && ret < max_blocks) {
                struct ext4_map_blocks map;
                map.m_lblk = block += ret;
                map.m_len = max_blocks -= ret;

                ret = ext4_map_blocks(handle, ea_inode, &map,
                                      EXT4_GET_BLOCKS_CREATE);
                if (ret <= 0) {
                        ext4_mark_inode_dirty(handle, ea_inode);
                        /* On ENOSPC, try again if the allocator says so. */
                        if (ret == -ENOSPC &&
                            ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
                                ret = 0;
                                goto retry;
                        }
                        break;
                }
        }

        if (ret < 0)
                return ret;

        /* Pass 2: copy the value into the blocks, starting at block 0. */
        block = 0;
        while (wsize < bufsize) {
                /* Release the previous iteration's buffer (no-op for NULL). */
                brelse(bh);
                /* Chunk size: a full block, or whatever is left of buf. */
                csize = (bufsize - wsize) > blocksize ? blocksize :
                                                                bufsize - wsize;
                bh = ext4_getblk(handle, ea_inode, block, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                if (!bh) {
                        /* A NULL buffer here is treated as corruption. */
                        WARN_ON_ONCE(1);
                        EXT4_ERROR_INODE(ea_inode,
                                         "ext4_getblk() return bh = NULL");
                        return -EFSCORRUPTED;
                }
                ret = ext4_journal_get_write_access(handle, bh);
                if (ret)
                        goto out;

                memcpy(bh->b_data, buf, csize);
                /*
                 * Zero out block tail to avoid writing uninitialized memory
                 * to disk.
                 */
                if (csize < blocksize)
                        memset(bh->b_data + csize, 0, blocksize - csize);
                set_buffer_uptodate(bh);
                /* NOTE(review): the return value is not checked here. */
                ext4_handle_dirty_metadata(handle, ea_inode, bh);

                buf += csize;
                wsize += csize;
                block += 1;
        }

        /* Publish the new size with the EA inode locked. */
        inode_lock(ea_inode);
        i_size_write(ea_inode, wsize);
        ext4_update_i_disksize(ea_inode, wsize);
        inode_unlock(ea_inode);

        /* Report a mark-dirty failure only if nothing failed earlier. */
        ret2 = ext4_mark_inode_dirty(handle, ea_inode);
        if (unlikely(ret2 && !ret))
                ret = ret2;

out:
        brelse(bh);

        return ret;
}

/*
 * Create an inode to store the value of a large EA.
 *
 * @handle: journal handle
 * @inode:  inode the EA belongs to; supplies the owner uid/gid and the
 *          allocation goal
 * @hash:   hash to record in the new EA inode
 *
 * The new inode is a regular file (mode 0600) flagged EXT4_EA_INODE_FL
 * with its reference count set to 1.  Its own quota charge is released
 * and it is marked S_NOQUOTA.
 *
 * Returns the new inode, or an ERR_PTR() on failure.  -EINVAL is returned
 * when the super block has no root dentry.
 */
static struct inode *ext4_xattr_inode_create(handle_t *handle,
                                             struct inode *inode, u32 hash)
{
        struct inode *ea_inode = NULL;
        uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
        int err;

        if (inode->i_sb->s_root == NULL) {
                ext4_warning(inode->i_sb,
                             "refuse to create EA inode when umounting");
                WARN_ON(1);
                return ERR_PTR(-EINVAL);
        }

        /*
         * Let the next inode be the goal, so we try and allocate the EA inode
         * in the same group, or nearby one.
         */
        ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
                                  S_IFREG | 0600, NULL, inode->i_ino + 1, owner,
                                  EXT4_EA_INODE_FL);
        if (!IS_ERR(ea_inode)) {
                ea_inode->i_op = &ext4_file_inode_operations;
                ea_inode->i_fop = &ext4_file_operations;
                ext4_set_aops(ea_inode);
                ext4_xattr_inode_set_class(ea_inode);
                unlock_new_inode(ea_inode);
                ext4_xattr_inode_set_ref(ea_inode, 1);
                ext4_xattr_inode_set_hash(ea_inode, hash);
                err = ext4_mark_inode_dirty(handle, ea_inode);
                if (!err)
                        err = ext4_inode_attach_jinode(ea_inode);
                if (err) {
                        /*
                         * Undo the initial reference.  Log the cleanup's own
                         * error code; the caller still gets the original one.
                         */
                        int cleanup_err = ext4_xattr_inode_dec_ref(handle,
                                                                   ea_inode);

                        if (cleanup_err)
                                ext4_warning_inode(ea_inode,
                                        "cleanup dec ref error %d",
                                        cleanup_err);
                        iput(ea_inode);
                        return ERR_PTR(err);
                }

                /*
                 * Xattr inodes are shared therefore quota charging is performed
                 * at a higher level.
                 */
                dquot_free_inode(ea_inode);
                dquot_drop(ea_inode);
                inode_lock(ea_inode);
                ea_inode->i_flags |= S_NOQUOTA;
                inode_unlock(ea_inode);
        }

        return ea_inode;
}

/*
 * Look in the EA inode cache for an existing EA inode whose contents are
 * exactly @value.
 *
 * Every cache entry with a matching @hash is tried in turn.  A candidate
 * is accepted only if its size equals @value_len, its data can be read,
 * its hashes verify, and its bytes compare equal to @value.
 *
 * Returns the matching inode with a reference held, or NULL when there is
 * no cache, no match, or the comparison buffer cannot be allocated.
 */
static struct inode *
ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
                            size_t value_len, u32 hash)
{
        struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
        struct mb_cache_entry *ce;
        struct inode *candidate;
        void *scratch;

        if (!ea_inode_cache)
                return NULL;

        ce = mb_cache_entry_find_first(ea_inode_cache, hash);
        if (!ce)
                return NULL;

        WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
                     !(current->flags & PF_MEMALLOC_NOFS));

        /* Scratch buffer the candidate's data is read into. */
        scratch = kvmalloc(value_len, GFP_NOFS);
        if (!scratch) {
                mb_cache_entry_put(ea_inode_cache, ce);
                return NULL;
        }

        for (; ce; ce = mb_cache_entry_find_next(ea_inode_cache, ce)) {
                candidate = ext4_iget(inode->i_sb, ce->e_value,
                                      EXT4_IGET_EA_INODE);
                if (IS_ERR(candidate))
                        continue;

                ext4_xattr_inode_set_class(candidate);
                if (i_size_read(candidate) != value_len ||
                    ext4_xattr_inode_read(candidate, scratch, value_len) ||
                    ext4_xattr_inode_verify_hashes(candidate, NULL, scratch,
                                                   value_len) ||
                    memcmp(value, scratch, value_len)) {
                        /* Not a match: drop it and try the next entry. */
                        iput(candidate);
                        continue;
                }

                mb_cache_entry_touch(ea_inode_cache, ce);
                mb_cache_entry_put(ea_inode_cache, ce);
                kvfree(scratch);
                return candidate;
        }

        kvfree(scratch);
        return NULL;
}

/*
 * Add value of the EA in an inode.
 *
 * @handle:    journal handle
 * @inode:     inode the EA belongs to
 * @value:     value bytes
 * @value_len: number of bytes in @value
 * @ret_inode: on success, receives the EA inode holding the value
 *
 * An existing EA inode with identical contents is reused (taking one more
 * reference on it) when the cache has one; otherwise a new EA inode is
 * created, the value is written into it, and it is added to the EA inode
 * cache if there is one.
 *
 * Returns 0 on success or a negative error number, in which case
 * *@ret_inode is not written.
 */
static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
                                          const void *value, size_t value_len,
                                          struct inode **ret_inode)
{
        struct inode *ea_inode;
        u32 hash;
        int err;

        hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
        ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
        if (ea_inode) {
                err = ext4_xattr_inode_inc_ref(handle, ea_inode);
                if (err) {
                        iput(ea_inode);
                        return err;
                }

                *ret_inode = ea_inode;
                return 0;
        }

        /* Create an inode for the EA value */
        ea_inode = ext4_xattr_inode_create(handle, inode, hash);
        if (IS_ERR(ea_inode))
                return PTR_ERR(ea_inode);

        err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
        if (err) {
                /*
                 * Undo the reference the new inode was created with.  This is
                 * best effort, but a failure is logged rather than dropped
                 * silently; the caller still gets the write error.
                 */
                int cleanup_err = ext4_xattr_inode_dec_ref(handle, ea_inode);

                if (cleanup_err)
                        ext4_warning_inode(ea_inode,
                                           "cleanup dec ref error %d",
                                           cleanup_err);
                iput(ea_inode);
                return err;
        }

        /* Adding to the cache is best effort; its result is not checked. */
        if (EA_INODE_CACHE(inode))
                mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
                                      ea_inode->i_ino, true /* reusable */);

        *ret_inode = ea_inode;
        return 0;
}

/*
 * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode
 * feature is enabled.
 *
 * The macro only computes the size from the inode's block size; it does
 * not test the ea_inode feature itself, so that check is up to the caller.
 */
#define EXT4_XATTR_BLOCK_RESERVE(inode)        min(i_blocksize(inode)/8, 1024U)

/*
 * Create, replace or remove a single xattr entry inside the xattr region
 * described by @s.
 *
 * @i:        attribute to set: name_index/name identify it, value/value_len
 *            give the new value (value == NULL removes the attribute,
 *            value == EXT4_ZERO_XATTR_VALUE stores a zero-filled value) and
 *            in_inode selects storing the value in a separate EA inode
 * @s:        search state: base/first/end delimit the region, here points
 *            at the entry (or the insertion point) and not_found is zero
 *            when the entry already exists
 * @handle:   journal handle passed to the EA inode helpers
 * @inode:    inode owning the attribute
 * @is_block: true when the region is an xattr block; this enables the
 *            block reserve check, value hashing and the final rehash
 *
 * All steps that can fail (space checks, getting the old/new EA inodes and
 * adjusting their reference counts and quota) are done before the region is
 * modified.
 *
 * Returns 0 on success, -ENOSPC when the value does not fit,
 * -EFSCORRUPTED when walking the entries runs past s->end, or the error
 * returned by one of the helpers.
 */
static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
                                struct ext4_xattr_search *s,
                                handle_t *handle, struct inode *inode,
                                bool is_block)
{
        struct ext4_xattr_entry *last, *next;
        struct ext4_xattr_entry *here = s->here;
        size_t min_offs = s->end - s->base, name_len = strlen(i->name);
        int in_inode = i->in_inode;
        struct inode *old_ea_inode = NULL;
        struct inode *new_ea_inode = NULL;
        size_t old_size, new_size;
        int ret;

        /* Space used by old and new values. */
        old_size = (!s->not_found && !here->e_value_inum) ?
                        EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
        new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;

        /*
         * Optimization for the simple case when old and new values have the
         * same padded sizes. Not applicable if external inodes are involved.
         */
        if (new_size && new_size == old_size) {
                size_t offs = le16_to_cpu(here->e_value_offs);
                void *val = s->base + offs;

                here->e_value_size = cpu_to_le32(i->value_len);
                if (i->value == EXT4_ZERO_XATTR_VALUE) {
                        memset(val, 0, new_size);
                } else {
                        memcpy(val, i->value, i->value_len);
                        /* Clear padding bytes. */
                        memset(val + i->value_len, 0, new_size - i->value_len);
                }
                goto update_hash;
        }

        /* Compute min_offs and last. */
        last = s->first;
        for (; !IS_LAST_ENTRY(last); last = next) {
                next = EXT4_XATTR_NEXT(last);
                /* An entry chain that reaches s->end is treated as corruption. */
                if ((void *)next >= s->end) {
                        EXT4_ERROR_INODE(inode, "corrupted xattr entries");
                        ret = -EFSCORRUPTED;
                        goto out;
                }
                /* Only values stored in this region contribute to min_offs. */
                if (!last->e_value_inum && last->e_value_size) {
                        size_t offs = le16_to_cpu(last->e_value_offs);
                        if (offs < min_offs)
                                min_offs = offs;
                }
        }

        /* Check whether we have enough space. */
        if (i->value) {
                size_t free;

                /*
                 * Gap between the end of the entry table and the lowest
                 * value offset; sizeof(__u32) presumably accounts for the
                 * terminating zero word of the table - TODO confirm.
                 */
                free = min_offs - ((void *)last - s->base) - sizeof(__u32);
                /* Replacing an entry gives back its name and old value. */
                if (!s->not_found)
                        free += EXT4_XATTR_LEN(name_len) + old_size;

                if (free < EXT4_XATTR_LEN(name_len) + new_size) {
                        ret = -ENOSPC;
                        goto out;
                }

                /*
                 * If storing the value in an external inode is an option,
                 * reserve space for xattr entries/names in the external
                 * attribute block so that a long value does not occupy the
                 * whole space and prevent further entries being added.
                 */
                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                    new_size && is_block &&
                    (min_offs + old_size - new_size) <
                                        EXT4_XATTR_BLOCK_RESERVE(inode)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        /*
         * Getting access to old and new ea inodes is subject to failures.
         * Finish that work before doing any modifications to the xattr data.
         */
        if (!s->not_found && here->e_value_inum) {
                ret = ext4_xattr_inode_iget(inode,
                                            le32_to_cpu(here->e_value_inum),
                                            le32_to_cpu(here->e_hash),
                                            &old_ea_inode);
                if (ret) {
                        old_ea_inode = NULL;
                        goto out;
                }
        }
        if (i->value && in_inode) {
                WARN_ON_ONCE(!i->value_len);

                /* Charge quota first; it is given back if the lookup fails. */
                ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
                if (ret)
                        goto out;

                ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
                                                     i->value_len,
                                                     &new_ea_inode);
                if (ret) {
                        new_ea_inode = NULL;
                        ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
                        goto out;
                }
        }

        if (old_ea_inode) {
                /* We are ready to release ref count on the old_ea_inode. */
                ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
                if (ret) {
                        /* Release newly required ref count on new_ea_inode. */
                        if (new_ea_inode) {
                                int err;

                                err = ext4_xattr_inode_dec_ref(handle,
                                                               new_ea_inode);
                                if (err)
                                        ext4_warning_inode(new_ea_inode,
                                                  "dec ref new_ea_inode err=%d",
                                                  err);
                                ext4_xattr_inode_free_quota(inode, new_ea_inode,
                                                            i->value_len);
                        }
                        goto out;
                }

                ext4_xattr_inode_free_quota(inode, old_ea_inode,
                                            le32_to_cpu(here->e_value_size));
        }

        /* No failures allowed past this point. */

        if (!s->not_found && here->e_value_size && !here->e_value_inum) {
                /* Remove the old value. */
                void *first_val = s->base + min_offs;
                size_t offs = le16_to_cpu(here->e_value_offs);
                void *val = s->base + offs;

                /*
                 * Slide every value stored below the old one up by old_size
                 * bytes and zero the space that opens up at the bottom.
                 */
                memmove(first_val + old_size, first_val, val - first_val);
                memset(first_val, 0, old_size);
                min_offs += old_size;

                /* Adjust all value offsets. */
                last = s->first;
                while (!IS_LAST_ENTRY(last)) {
                        size_t o = le16_to_cpu(last->e_value_offs);

                        if (!last->e_value_inum &&
                            last->e_value_size && o < offs)
                                last->e_value_offs = cpu_to_le16(o + old_size);
                        last = EXT4_XATTR_NEXT(last);
                }
        }

        if (!i->value) {
                /* Remove old name. */
                size_t size = EXT4_XATTR_LEN(name_len);

                last = ENTRY((void *)last - size);
                memmove(here, (void *)here + size,
                        (void *)last - (void *)here + sizeof(__u32));
                memset(last, 0, size);

                /*
                 * Update i_inline_off - moved ibody region might contain
                 * system.data attribute.  Handling a failure here won't
                 * cause other complications for setting an xattr.
                 */
                if (!is_block && ext4_has_inline_data(inode)) {
                        ret = ext4_find_inline_data_nolock(inode);
                        if (ret) {
                                /*
                                 * NOTE(review): the entry has already been
                                 * removed above, yet the error is returned.
                                 */
                                ext4_warning_inode(inode,
                                        "unable to update i_inline_off");
                                goto out;
                        }
                }
        } else if (s->not_found) {
                /* Insert new name. */
                size_t size = EXT4_XATTR_LEN(name_len);
                size_t rest = (void *)last - (void *)here + sizeof(__u32);

                memmove((void *)here + size, here, rest);
                memset(here, 0, size);
                here->e_name_index = i->name_index;
                here->e_name_len = name_len;
                memcpy(here->e_name, i->name, name_len);
        } else {
                /* This is an update, reset value info. */
                here->e_value_inum = 0;
                here->e_value_offs = 0;
                here->e_value_size = 0;
        }

        if (i->value) {
                /* Insert new value. */
                if (in_inode) {
                        here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
                } else if (i->value_len) {
                        /* New values are placed directly below min_offs. */
                        void *val = s->base + min_offs - new_size;

                        here->e_value_offs = cpu_to_le16(min_offs - new_size);
                        if (i->value == EXT4_ZERO_XATTR_VALUE) {
                                memset(val, 0, new_size);
                        } else {
                                memcpy(val, i->value, i->value_len);
                                /* Clear padding bytes. */
                                memset(val + i->value_len, 0,
                                       new_size - i->value_len);
                        }
                }
                here->e_value_size = cpu_to_le32(i->value_len);
        }

update_hash:
        if (i->value) {
                /* Stays 0 for a value stored inline outside an xattr block. */
                __le32 hash = 0;

                /* Entry hash calculation. */
                if (in_inode) {
                        __le32 crc32c_hash;

                        /*
                         * Feed crc32c hash instead of the raw value for entry
                         * hash calculation. This is to avoid walking
                         * potentially long value buffer again.
                         */
                        crc32c_hash = cpu_to_le32(
                                       ext4_xattr_inode_get_hash(new_ea_inode));
                        hash = ext4_xattr_hash_entry(here->e_name,
                                                     here->e_name_len,
                                                     &crc32c_hash, 1);
                } else if (is_block) {
                        __le32 *value = s->base + le16_to_cpu(
                                                        here->e_value_offs);

                        /* The padded value is hashed in 32-bit words. */
                        hash = ext4_xattr_hash_entry(here->e_name,
                                                     here->e_name_len, value,
                                                     new_size >> 2);
                }
                here->e_hash = hash;
        }

        if (is_block)
                ext4_xattr_rehash((struct ext4_xattr_header *)s->base);

        ret = 0;
out:
        /* Either pointer may still be NULL here. */
        iput(old_ea_inode);
        iput(new_ea_inode);
        return ret;
}

/*
 * Result of looking up an attribute in the inode's external xattr block,
 * filled in by ext4_xattr_block_find() and consumed by
 * ext4_xattr_block_set().
 */
struct ext4_xattr_block_find {
        /* Search state (base/first/end/here/not_found) within the block. */
        struct ext4_xattr_search s;
        /* Buffer holding the xattr block; NULL when reading it failed. */
        struct buffer_head *bh;
};

static int
ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                      struct ext4_xattr_block_find *bs)
{
        struct super_block *sb = inode->i_sb;
        int error;

        ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
                  i->name_index, i->name, i->value, (long)i->value_len);

        if (EXT4_I(inode)->i_file_acl) {
                /* The inode already has an extended attribute block. */
                bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bs->bh)) {
                        error = PTR_ERR(bs->bh);
                        bs->bh = NULL;
                        return error;
                }
                ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
                        atomic_read(&(bs->bh->b_count)),
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
                error = ext4_xattr_check_block(inode, bs->bh);
                if (error)
                        return error;
                /* Find the named attribute. */
                bs->s.base = BHDR(bs->bh);
                bs->s.first = BFIRST(bs->bh);
                bs->s.end = bs->bh->b_data + bs->bh->b_size;
                bs->s.here = bs->s.first;
                error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
                                         i->name_index, i->name, 1);
                if (error && error != -ENODATA)
                        return error;
                bs->s.not_found = error;
        }
        return 0;
}

/*
 * Apply the attribute change described by @i to the inode's external xattr
 * block, using the lookup result in @bs.
 *
 * Outline:
 *  - If a block exists and its h_refcount is 1 (and no mbcache entry for it
 *    is in use), the entry is changed in place under the buffer lock.
 *  - If a block exists but cannot be modified in place, its contents are
 *    copied to a private buffer and the change is applied to the copy.
 *  - If no block exists, a zeroed buffer with a fresh header is built.
 *  - Afterwards ("inserted"), an identical block is looked up in the block
 *    cache and shared by bumping its refcount; otherwise the in-place block
 *    is kept, or a new block is allocated and filled from the buffer.
 *  - Finally i_file_acl is updated and the previous block, if different,
 *    is released.
 *
 * Returns 0 on success or a negative error code.
 */
static int
ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                     struct ext4_xattr_info *i,
                     struct ext4_xattr_block_find *bs)
{
        struct super_block *sb = inode->i_sb;
        struct buffer_head *new_bh = NULL;
        /* Work on a copy so the search state stored in bs is not changed. */
        struct ext4_xattr_search s_copy = bs->s;
        struct ext4_xattr_search *s = &s_copy;
        struct mb_cache_entry *ce = NULL;
        int error = 0;
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        struct inode *ea_inode = NULL, *tmp_inode;
        size_t old_ea_inode_quota = 0;
        unsigned int ea_ino;


#define header(x) ((struct ext4_xattr_header *)(x))

        if (s->base) {
                /* Offset of the entry in the block; reused for the clone. */
                int offset = (char *)s->here - bs->bh->b_data;

                BUFFER_TRACE(bs->bh, "get_write_access");
                error = ext4_journal_get_write_access(handle, bs->bh);
                if (error)
                        goto cleanup;
                lock_buffer(bs->bh);

                if (header(s->base)->h_refcount == cpu_to_le32(1)) {
                        __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);

                        /*
                         * This must happen under buffer lock for
                         * ext4_xattr_block_set() to reliably detect modified
                         * block
                         */
                        if (ea_block_cache) {
                                struct mb_cache_entry *oe;

                                oe = mb_cache_entry_delete_or_get(ea_block_cache,
                                        hash, bs->bh->b_blocknr);
                                if (oe) {
                                        /*
                                         * Xattr block is getting reused. Leave
                                         * it alone.
                                         */
                                        mb_cache_entry_put(ea_block_cache, oe);
                                        goto clone_block;
                                }
                        }
                        ea_bdebug(bs->bh, "modifying in-place");
                        error = ext4_xattr_set_entry(i, s, handle, inode,
                                                     true /* is_block */);
                        /* The checksum is refreshed even if set_entry failed. */
                        ext4_xattr_block_csum_set(inode, bs->bh);
                        unlock_buffer(bs->bh);
                        if (error == -EFSCORRUPTED)
                                goto bad_block;
                        if (!error)
                                error = ext4_handle_dirty_metadata(handle,
                                                                   inode,
                                                                   bs->bh);
                        if (error)
                                goto cleanup;
                        goto inserted;
                }
clone_block:
                unlock_buffer(bs->bh);
                ea_bdebug(bs->bh, "cloning");
                s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
                /* Re-point the search state at the private copy. */
                s->first = ENTRY(header(s->base)+1);
                header(s->base)->h_refcount = cpu_to_le32(1);
                s->here = ENTRY(s->base + offset);
                s->end = s->base + bs->bh->b_size;

                /*
                 * If existing entry points to an xattr inode, we need
                 * to prevent ext4_xattr_set_entry() from decrementing
                 * ref count on it because the reference belongs to the
                 * original block. In this case, make the entry look
                 * like it has an empty value.
                 */
                if (!s->not_found && s->here->e_value_inum) {
                        ea_ino = le32_to_cpu(s->here->e_value_inum);
                        error = ext4_xattr_inode_iget(inode, ea_ino,
                                      le32_to_cpu(s->here->e_hash),
                                      &tmp_inode);
                        if (error)
                                goto cleanup;

                        if (!ext4_test_inode_state(tmp_inode,
                                        EXT4_STATE_LUSTRE_EA_INODE)) {
                                /*
                                 * Defer quota free call for previous
                                 * inode until success is guaranteed.
                                 */
                                old_ea_inode_quota = le32_to_cpu(
                                                s->here->e_value_size);
                        }
                        iput(tmp_inode);

                        s->here->e_value_inum = 0;
                        s->here->e_value_size = 0;
                }
        } else {
                /* Allocate a buffer where we construct the new block. */
                s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
                /* assert(header == s->base) */
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
                header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
                header(s->base)->h_blocks = cpu_to_le32(1);
                header(s->base)->h_refcount = cpu_to_le32(1);
                s->first = ENTRY(header(s->base)+1);
                s->here = ENTRY(header(s->base)+1);
                s->end = s->base + sb->s_blocksize;
        }

        /* Apply the change to the private buffer (clone or new block). */
        error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */);
        if (error == -EFSCORRUPTED)
                goto bad_block;
        if (error)
                goto cleanup;

        if (i->value && s->here->e_value_inum) {
                /*
                 * A ref count on ea_inode has been taken as part of the call to
                 * ext4_xattr_set_entry() above. We would like to drop this
                 * extra ref but we have to wait until the xattr block is
                 * initialized and has its own ref count on the ea_inode.
                 */
                ea_ino = le32_to_cpu(s->here->e_value_inum);
                error = ext4_xattr_inode_iget(inode, ea_ino,
                                              le32_to_cpu(s->here->e_hash),
                                              &ea_inode);
                if (error) {
                        ea_inode = NULL;
                        goto cleanup;
                }
        }

inserted:
        /* An empty entry table means no xattr block is needed at all. */
        if (!IS_LAST_ENTRY(s->first)) {
                new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
                                                     &ce);
                if (new_bh) {
                        /* We found an identical block in the cache. */
                        if (new_bh == bs->bh)
                                ea_bdebug(new_bh, "keeping");
                        else {
                                u32 ref;

#ifdef EXT4_XATTR_DEBUG
                                WARN_ON_ONCE(dquot_initialize_needed(inode));
#endif
                                /* The old block is released after updating
                                   the inode. */
                                error = dquot_alloc_block(inode,
                                                EXT4_C2B(EXT4_SB(sb), 1));
                                if (error)
                                        goto cleanup;
                                BUFFER_TRACE(new_bh, "get_write_access");
                                error = ext4_journal_get_write_access(handle,
                                                                      new_bh);
                                if (error)
                                        goto cleanup_dquot;
                                lock_buffer(new_bh);
                                /*
                                 * We have to be careful about races with
                                 * adding references to xattr block. Once we
                                 * hold buffer lock xattr block's state is
                                 * stable so we can check the additional
                                 * reference fits.
                                 */
                                ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
                                if (ref > EXT4_XATTR_REFCOUNT_MAX) {
                                        /*
                                         * Undo everything and check mbcache
                                         * again.
                                         */
                                        unlock_buffer(new_bh);
                                        dquot_free_block(inode,
                                                         EXT4_C2B(EXT4_SB(sb),
                                                                  1));
                                        brelse(new_bh);
                                        mb_cache_entry_put(ea_block_cache, ce);
                                        ce = NULL;
                                        new_bh = NULL;
                                        goto inserted;
                                }
                                BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
                                /* A block at the refcount limit is no longer offered for reuse. */
                                if (ref == EXT4_XATTR_REFCOUNT_MAX)
                                        clear_bit(MBE_REUSABLE_B, &ce->e_flags);
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
                                          ref);
                                ext4_xattr_block_csum_set(inode, new_bh);
                                unlock_buffer(new_bh);
                                error = ext4_handle_dirty_metadata(handle,
                                                                   inode,
                                                                   new_bh);
                                if (error)
                                        goto cleanup_dquot;
                        }
                        mb_cache_entry_touch(ea_block_cache, ce);
                        mb_cache_entry_put(ea_block_cache, ce);
                        ce = NULL;
                } else if (bs->bh && s->base == bs->bh->b_data) {
                        /* We were modifying this block in-place. */
                        ea_bdebug(bs->bh, "keeping this block");
                        ext4_xattr_block_cache_insert(ea_block_cache, bs->bh);
                        new_bh = bs->bh;
                        /* Extra reference balances the brelse(new_bh) in cleanup. */
                        get_bh(new_bh);
                } else {
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal, block;

#ifdef EXT4_XATTR_DEBUG
                        WARN_ON_ONCE(dquot_initialize_needed(inode));
#endif
                        goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
                        block = ext4_new_meta_blocks(handle, inode, goal, 0,
                                                     NULL, &error);
                        if (error)
                                goto cleanup;

                        ea_idebug(inode, "creating block %llu",
                                  (unsigned long long)block);

                        new_bh = sb_getblk(sb, block);
                        if (unlikely(!new_bh)) {
                                error = -ENOMEM;
getblk_failed:
                                /* Shared error exit: give the new block back. */
                                ext4_free_blocks(handle, inode, NULL, block, 1,
                                                 EXT4_FREE_BLOCKS_METADATA);
                                goto cleanup;
                        }
                        error = ext4_xattr_inode_inc_ref_all(handle, inode,
                                                      ENTRY(header(s->base)+1));
                        if (error)
                                goto getblk_failed;
                        if (ea_inode) {
                                /* Drop the extra ref on ea_inode. */
                                error = ext4_xattr_inode_dec_ref(handle,
                                                                 ea_inode);
                                if (error)
                                        ext4_warning_inode(ea_inode,
                                                           "dec ref error=%d",
                                                           error);
                                iput(ea_inode);
                                ea_inode = NULL;
                        }

                        lock_buffer(new_bh);
                        error = ext4_journal_get_create_access(handle, new_bh);
                        if (error) {
                                unlock_buffer(new_bh);
                                /* The original error code is replaced by -EIO. */
                                error = -EIO;
                                goto getblk_failed;
                        }
                        memcpy(new_bh->b_data, s->base, new_bh->b_size);
                        ext4_xattr_block_csum_set(inode, new_bh);
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
                        ext4_xattr_block_cache_insert(ea_block_cache, new_bh);
                        error = ext4_handle_dirty_metadata(handle, inode,
                                                           new_bh);
                        if (error)
                                goto cleanup;
                }
        }

        /* Success is certain now: release the quota deferred while cloning. */
        if (old_ea_inode_quota)
                ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota);

        /* Update the inode. */
        EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;

        /* Drop the previous xattr block. */
        if (bs->bh && bs->bh != new_bh) {
                struct ext4_xattr_inode_array *ea_inode_array = NULL;

                ext4_xattr_release_block(handle, inode, bs->bh,
                                         &ea_inode_array,
                                         0 /* extra_credits */);
                ext4_xattr_inode_array_free(ea_inode_array);
        }
        error = 0;

cleanup:
        /* Still set only if the extra ref was not dropped in the new-block path. */
        if (ea_inode) {
                int error2;

                error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (error2)
                        ext4_warning_inode(ea_inode, "dec ref error=%d",
                                           error2);

                /* If there was an error, revert the quota charge. */
                if (error)
                        ext4_xattr_inode_free_quota(inode, ea_inode,
                                                    i_size_read(ea_inode));
                iput(ea_inode);
        }
        if (ce)
                mb_cache_entry_put(ea_block_cache, ce);
        brelse(new_bh);
        /* s->base is a private allocation unless the block was edited in place. */
        if (!(bs->bh && s->base == bs->bh->b_data))
                kfree(s->base);

        return error;

cleanup_dquot:
        /* Undo the dquot_alloc_block() charge taken for the shared block. */
        dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
        goto cleanup;

bad_block:
        EXT4_ERROR_INODE(inode, "bad block %llu",
                         EXT4_I(inode)->i_file_acl);
        goto cleanup;

#undef header
}

int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
                          struct ext4_xattr_ibody_find *is)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        int error;

        if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
                return 0;

        raw_inode = ext4_raw_inode(&is->iloc);
        header = IHDR(inode, raw_inode);
        is->s.base = is->s.first = IFIRST(header);
        is->s.here = is->s.first;
        is->s.end = ITAIL(inode, raw_inode);
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                /* Find the named attribute. */
                error = xattr_find_entry(inode, &is->s.here, is->s.end,
                                         i->name_index, i->name, 0);
                if (error && error != -ENODATA)
                        return error;
                is->s.not_found = error;
        }
        return 0;
}

/*
 * Apply the attribute change described by @i to the in-inode xattr area
 * located by @is, then bring the area's magic number and the inode's
 * EXT4_STATE_XATTR flag in line with whether any entry remains.
 *
 * Returns -ENOSPC when the inode has no in-inode xattr space, the error
 * from ext4_xattr_set_entry() when the update fails, and 0 otherwise.
 */
int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
                                struct ext4_xattr_info *i,
                                struct ext4_xattr_ibody_find *is)
{
        struct ext4_xattr_ibody_header *header;
        int rc;

        if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
                return -ENOSPC;

        rc = ext4_xattr_set_entry(i, &is->s, handle, inode,
                                  false /* is_block */);
        if (rc)
                return rc;

        header = IHDR(inode, ext4_raw_inode(&is->iloc));
        if (IS_LAST_ENTRY(is->s.first)) {
                /* The area is empty now: clear the magic and the flag. */
                header->h_magic = cpu_to_le32(0);
                ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
        } else {
                /* At least one entry is present: mark the area valid. */
                header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
                ext4_set_inode_state(inode, EXT4_STATE_XATTR);
        }
        return 0;
}

static int ext4_xattr_value_same(struct ext4_xattr_search *s,
                                 struct ext4_xattr_info *i)
{
        void *value;

        /* When e_value_inum is set the value is stored externally. */
        if (s->here->e_value_inum)
                return 0;
        if (le32_to_cpu(s->here->e_value_size) != i->value_len)
                return 0;
        value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
        return !memcmp(value, i->value, i->value_len);
}

/*
 * Read and check the inode's external xattr block.
 *
 * Returns NULL when i_file_acl is zero, an ERR_PTR() when the read or the
 * block check fails, and otherwise the buffer head, which the caller must
 * release with brelse().
 */
static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
{
        struct buffer_head *xattr_bh;
        int err;

        if (EXT4_I(inode)->i_file_acl == 0)
                return NULL;

        xattr_bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl,
                                 REQ_PRIO);
        if (IS_ERR(xattr_bh))
                return xattr_bh;

        err = ext4_xattr_check_block(inode, xattr_bh);
        if (!err)
                return xattr_bh;

        /* The block failed the check: drop our reference, report why. */
        brelse(xattr_bh);
        return ERR_PTR(err);
}

/*
 * ext4_xattr_set_handle()
 *
 * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
 * specify that an extended attribute must exist and must not exist
 * previous to the call, respectively.
 *
 * The work between the argument checks and the return is done between
 * ext4_write_lock_xattr() and ext4_write_unlock_xattr() on @inode.
 *
 * Returns 0, or a negative error number on failure.  Errors produced
 * directly by this function: -EINVAL for a NULL @name, -ERANGE for a name
 * longer than 255 bytes, -ENOSPC when @handle holds fewer buffer credits
 * than computed for the operation, -ENODATA for XATTR_REPLACE on a missing
 * attribute and -EEXIST for XATTR_CREATE on an existing one.  Removing an
 * attribute that does not exist (without XATTR_REPLACE) returns 0.
 */
int
ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                      const char *name, const void *value, size_t value_len,
                      int flags)
{
        struct ext4_xattr_info i = {
                .name_index = name_index,
                .name = name,
                .value = value,
                .value_len = value_len,
                .in_inode = 0,
        };
        /* Both searches start out as "not found" until a lookup says otherwise. */
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_block_find bs = {
                .s = { .not_found = -ENODATA, },
        };
        int no_expand;
        int error;

        if (!name)
                return -EINVAL;
        if (strlen(name) > 255)
                return -ERANGE;

        ext4_write_lock_xattr(inode, &no_expand);

        /* Check journal credits under write lock. */
        if (ext4_handle_valid(handle)) {
                struct buffer_head *bh;
                int credits;

                bh = ext4_xattr_get_block(inode);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        goto cleanup;
                }

                credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh,
                                                   value_len,
                                                   flags & XATTR_CREATE);
                brelse(bh);

                /* Refuse to start if the caller's handle is too small. */
                if (jbd2_handle_buffer_credits(handle) < credits) {
                        error = -ENOSPC;
                        goto cleanup;
                }
                WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
        }

        error = ext4_reserve_inode_write(handle, inode, &is.iloc);
        if (error)
                goto cleanup;

        /* A new inode gets its whole on-disk image zeroed before first use. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
                struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
                ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        }

        /* Look in the inode body first; only search the block on a miss. */
        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto cleanup;
        if (is.s.not_found)
                error = ext4_xattr_block_find(inode, &i, &bs);
        if (error)
                goto cleanup;
        if (is.s.not_found && bs.s.not_found) {
                /* Attribute is absent: REPLACE fails, removal is a no-op. */
                error = -ENODATA;
                if (flags & XATTR_REPLACE)
                        goto cleanup;
                error = 0;
                if (!value)
                        goto cleanup;
        } else {
                /* Attribute is present: CREATE must fail. */
                error = -EEXIST;
                if (flags & XATTR_CREATE)
                        goto cleanup;
        }

        if (!value) {
                /* Removal: delete from whichever location holds the entry. */
                if (!is.s.not_found)
                        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
                else if (!bs.s.not_found)
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
        } else {
                error = 0;
                /* Xattr value did not change? Save us some work and bail out */
                if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
                        goto cleanup;
                if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
                        goto cleanup;

                /* Large values go to a separate EA inode when the feature is on. */
                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                    (EXT4_XATTR_SIZE(i.value_len) >
                        EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
                        i.in_inode = 1;
retry_inode:
                /* Try the inode body first. */
                error = ext4_xattr_ibody_set(handle, inode, &i, &is);
                if (!error && !bs.s.not_found) {
                        /* Stored in the body: remove the old copy in the block. */
                        i.value = NULL;
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
                } else if (error == -ENOSPC) {
                        /*
                         * No room in the body.  If a block exists but was
                         * never searched (bs.s.base unset), search it now so
                         * that bs is usable for the block set below.
                         */
                        if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
                                brelse(bs.bh);
                                bs.bh = NULL;
                                error = ext4_xattr_block_find(inode, &i, &bs);
                                if (error)
                                        goto cleanup;
                        }
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
                        if (!error && !is.s.not_found) {
                                /* Stored in the block: remove the old body copy. */
                                i.value = NULL;
                                error = ext4_xattr_ibody_set(handle, inode, &i,
                                                             &is);
                        } else if (error == -ENOSPC) {
                                /*
                                 * Xattr does not fit in the block, store at
                                 * external inode if possible.
                                 */
                                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                                    i.value_len && !i.in_inode) {
                                        i.in_inode = 1;
                                        goto retry_inode;
                                }
                        }
                }
        }
        if (!error) {
                ext4_xattr_update_super_block(handle, inode->i_sb);
                inode->i_ctime = current_time(inode);
                /* A successful removal clears no_expand before the unlock. */
                if (!value)
                        no_expand = 0;
                error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
                /*
                 * The bh is consumed by ext4_mark_iloc_dirty, even with
                 * error != 0.
                 */
                is.iloc.bh = NULL;
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
        }
        /* Reached on every path that did not jump to cleanup, success or not. */
        ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);

cleanup:
        brelse(is.iloc.bh);
        brelse(bs.bh);
        ext4_write_unlock_xattr(inode, &no_expand);
        return error;
}

/*
 * Compute the journal credits needed to set an xattr of @value_len bytes
 * on @inode and store the result in *@credits.
 *
 * *@credits is 0 when the filesystem has no journal.  Otherwise the xattr
 * block is read under the inode's xattr_sem (held for reading) and handed
 * to __ext4_xattr_set_credits().
 *
 * Returns 0 on success or the error from reading the xattr block.
 */
int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
                           bool is_create, int *credits)
{
        struct buffer_head *xattr_bh;
        int ret = 0;

        *credits = 0;

        if (!EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        down_read(&EXT4_I(inode)->xattr_sem);

        xattr_bh = ext4_xattr_get_block(inode);
        if (!IS_ERR(xattr_bh)) {
                *credits = __ext4_xattr_set_credits(inode->i_sb, inode,
                                                    xattr_bh, value_len,
                                                    is_create);
                brelse(xattr_bh);
        } else {
                ret = PTR_ERR(xattr_bh);
        }

        up_read(&EXT4_I(inode)->xattr_sem);
        return ret;
}

/*
 * ext4_xattr_set()
 *
 * Same operation as ext4_xattr_set_handle(), but starting from just the
 * inode: the required credits are computed, a journal handle is started,
 * the attribute is modified and the handle is stopped again.  When the
 * modification fails with -ENOSPC and ext4_should_retry_alloc() agrees,
 * the whole sequence is repeated.
 *
 * Returns 0, or a negative error number on failure.
 */
int
ext4_xattr_set(struct inode *inode, int name_index, const char *name,
               const void *value, size_t value_len, int flags)
{
        struct super_block *sb = inode->i_sb;
        handle_t *handle;
        int retries = 0;
        int credits;
        int err;

        err = dquot_initialize(inode);
        if (err)
                return err;

        for (;;) {
                int stop_err;

                err = ext4_xattr_set_credits(inode, value_len,
                                             flags & XATTR_CREATE, &credits);
                if (err)
                        return err;

                handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        break;
                }

                err = ext4_xattr_set_handle(handle, inode, name_index, name,
                                            value, value_len, flags);
                stop_err = ext4_journal_stop(handle);

                /* Out of space: start over if the allocator says to retry. */
                if (err == -ENOSPC && ext4_should_retry_alloc(sb, &retries))
                        continue;

                /* A failure to stop the handle only matters if the set worked. */
                if (!err)
                        err = stop_err;
                break;
        }
        ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);

        return err;
}

/*
 * Relocate the in-inode EA entries to make room for a larger
 * i_extra_isize: every inline value offset is adjusted by
 * @value_offs_shift (which must not be positive), then @n bytes are moved
 * from @from to @to.
 */
static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
                                     int value_offs_shift, void *to,
                                     void *from, size_t n)
{
        struct ext4_xattr_entry *cur;
        int shifted;

        /* Headers only ever move further into the inode, so offsets shrink. */
        BUG_ON(value_offs_shift > 0);

        /* Fix up the value offset of each entry that has an inline value. */
        for (cur = entry; !IS_LAST_ENTRY(cur); cur = EXT4_XATTR_NEXT(cur)) {
                if (cur->e_value_inum || !cur->e_value_size)
                        continue;
                shifted = le16_to_cpu(cur->e_value_offs) + value_offs_shift;
                cur->e_value_offs = cpu_to_le16(shifted);
        }

        /* Now move the n bytes themselves. */
        memmove(to, from, n);
}

/*
 * Move xattr pointed to by 'entry' from inode into external xattr block
 *
 * The entry's name and value are captured first, the attribute is then
 * written through ext4_xattr_block_set() and finally removed from the inode
 * body by calling ext4_xattr_ibody_set() with a NULL value.
 *
 * Returns 0 on success, -ENOMEM when a scratch allocation fails, or the
 * error returned by one of the lookup/set helpers.
 */
static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
                                    struct ext4_inode *raw_inode,
                                    struct ext4_xattr_entry *entry)
{
        struct ext4_xattr_ibody_find *is = NULL;
        struct ext4_xattr_block_find *bs = NULL;
        char *buffer = NULL, *b_entry_name = NULL;
        size_t value_size = le32_to_cpu(entry->e_value_size);
        /* in_inode mirrors whether the entry's value already lives in an EA inode. */
        struct ext4_xattr_info i = {
                .value = NULL,
                .value_len = 0,
                .name_index = entry->e_name_index,
                .in_inode = !!entry->e_value_inum,
        };
        struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
        int needs_kvfree = 0;
        int error;

        /* All three allocations are checked together; "out" frees whatever exists. */
        is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
        bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
        b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
        if (!is || !bs || !b_entry_name) {
                error = -ENOMEM;
                goto out;
        }

        is->s.not_found = -ENODATA;
        bs->s.not_found = -ENODATA;
        is->iloc.bh = NULL;
        bs->bh = NULL;

        /* Save the entry name and the entry value */
        if (entry->e_value_inum) {
                /* Value is stored in an EA inode: read it into a private buffer. */
                buffer = kvmalloc(value_size, GFP_NOFS);
                if (!buffer) {
                        error = -ENOMEM;
                        goto out;
                }
                needs_kvfree = 1;
                error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
                if (error)
                        goto out;
        } else {
                /* Inline value: point straight at it, nothing to free later. */
                size_t value_offs = le16_to_cpu(entry->e_value_offs);
                buffer = (void *)IFIRST(header) + value_offs;
        }

        /* e_name is copied with an explicit terminator appended. */
        memcpy(b_entry_name, entry->e_name, entry->e_name_len);
        b_entry_name[entry->e_name_len] = '\0';
        i.name = b_entry_name;

        error = ext4_get_inode_loc(inode, &is->iloc);
        if (error)
                goto out;

        error = ext4_xattr_ibody_find(inode, &i, is);
        if (error)
                goto out;

        i.value = buffer;
        i.value_len = value_size;
        error = ext4_xattr_block_find(inode, &i, bs);
        if (error)
                goto out;

        /* Move ea entry from the inode into the block */
        error = ext4_xattr_block_set(handle, inode, &i, bs);
        if (error)
                goto out;

        /* Remove the chosen entry from the inode */
        i.value = NULL;
        i.value_len = 0;
        error = ext4_xattr_ibody_set(handle, inode, &i, is);

out:
        kfree(b_entry_name);
        /* Only the kvmalloc()ed copy is owned here; the inline pointer is not. */
        if (needs_kvfree && buffer)
                kvfree(buffer);
        if (is)
                brelse(is->iloc.bh);
        if (bs)
                brelse(bs->bh);
        kfree(is);
        kfree(bs);

        return error;
}

/*
 * Move entries out of the in-inode xattr area into the xattr block until
 * at least @isize_diff bytes are free in the inode.
 *
 * @ifree and @bfree are the caller's free-byte counts for the inode area
 * and the xattr block; both are updated locally as entries are moved.
 * *@total_ino is reduced by the entry-header size of every moved entry.
 *
 * Returns 0 on success, -ENOSPC when no eligible entry fits into the block,
 * or the error returned by ext4_xattr_move_to_block().
 */
static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
                                       struct ext4_inode *raw_inode,
                                       int isize_diff, size_t ifree,
                                       size_t bfree, int *total_ino)
{
        struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
        struct ext4_xattr_entry *small_entry;
        struct ext4_xattr_entry *entry;
        struct ext4_xattr_entry *last;
        unsigned int entry_size;        /* EA entry size */
        unsigned int total_size;        /* EA entry size + value size */
        unsigned int min_total_size;
        int error;

        /* Each iteration moves exactly one entry, then re-evaluates. */
        while (isize_diff > ifree) {
                entry = NULL;
                small_entry = NULL;
                min_total_size = ~0U;
                last = IFIRST(header);
                /* Find the entry best suited to be pushed into EA block */
                for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
                        /* never move system.data out of the inode */
                        if ((last->e_name_len == 4) &&
                            (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) &&
                            !memcmp(last->e_name, "data", 4))
                                continue;
                        /* Values held in an EA inode take no space in the body. */
                        total_size = EXT4_XATTR_LEN(last->e_name_len);
                        if (!last->e_value_inum)
                                total_size += EXT4_XATTR_SIZE(
                                               le32_to_cpu(last->e_value_size));
                        /*
                         * Candidates must fit into the block.  "entry" is the
                         * smallest one that frees enough space by itself;
                         * "small_entry" is the most recently seen one that
                         * does not.
                         */
                        if (total_size <= bfree &&
                            total_size < min_total_size) {
                                if (total_size + ifree < isize_diff) {
                                        small_entry = last;
                                } else {
                                        entry = last;
                                        min_total_size = total_size;
                                }
                        }
                }

                /* No single entry suffices: settle for a partial gain, if any. */
                if (entry == NULL) {
                        if (small_entry == NULL)
                                return -ENOSPC;
                        entry = small_entry;
                }

                /* Sizes are taken before the move, while "entry" is still in place. */
                entry_size = EXT4_XATTR_LEN(entry->e_name_len);
                total_size = entry_size;
                if (!entry->e_value_inum)
                        total_size += EXT4_XATTR_SIZE(
                                              le32_to_cpu(entry->e_value_size));
                error = ext4_xattr_move_to_block(handle, inode, raw_inode,
                                                 entry);
                if (error)
                        return error;

                *total_ino -= entry_size;
                ifree += total_size;
                bfree -= total_size;
        }

        return 0;
}

/*
 * Expand an inode by new_extra_isize bytes when EAs are present.
 * Returns 0 on success or negative error number on failure.
 *
 * If the in-inode xattr area lacks the free space, entries are pushed out
 * to the xattr block first.  When that cannot succeed for the requested
 * size, one further attempt is made with the superblock's
 * s_min_extra_isize (if non-zero) as the target.  Returns 0 immediately
 * when i_extra_isize is already at least @new_extra_isize.
 */
int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                               struct ext4_inode *raw_inode, handle_t *handle)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        /* Remembers the s_mnt_count value at which the warning was last printed. */
        static unsigned int mnt_count;
        size_t min_offs;
        size_t ifree, bfree;
        int total_ino;
        void *base, *end;
        int error = 0, tried_min_extra_isize = 0;
        int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize);
        int isize_diff;        /* How much do we need to grow i_extra_isize */

retry:
        isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
                return 0;

        header = IHDR(inode, raw_inode);

        /*
         * Check if enough free space is available in the inode to shift the
         * entries ahead by new_extra_isize.
         */

        base = IFIRST(header);
        end = ITAIL(inode, raw_inode);
        min_offs = end - base;
        total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);

        /*
         * NOTE(review): ext4_xattr_free_space() presumably also accumulates
         * the entry sizes into total_ino - confirm against its definition.
         */
        ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
        if (ifree >= isize_diff)
                goto shift;

        /*
         * Enough free space isn't available in the inode, check if
         * EA block can hold new_extra_isize bytes.
         */
        if (EXT4_I(inode)->i_file_acl) {
                struct buffer_head *bh;

                bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        goto cleanup;
                }
                error = ext4_xattr_check_block(inode, bh);
                if (error) {
                        brelse(bh);
                        goto cleanup;
                }
                base = BHDR(bh);
                end = bh->b_data + bh->b_size;
                min_offs = end - base;
                bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base,
                                              NULL);
                brelse(bh);
                /* Inode and block together are too small: fall back or give up. */
                if (bfree + ifree < isize_diff) {
                        if (!tried_min_extra_isize && s_min_extra_isize) {
                                tried_min_extra_isize++;
                                new_extra_isize = s_min_extra_isize;
                                goto retry;
                        }
                        error = -ENOSPC;
                        goto cleanup;
                }
        } else {
                /* No xattr block yet: a whole block's worth is taken as free. */
                bfree = inode->i_sb->s_blocksize;
        }

        error = ext4_xattr_make_inode_space(handle, inode, raw_inode,
                                            isize_diff, ifree, bfree,
                                            &total_ino);
        if (error) {
                /* Same single fallback to s_min_extra_isize as above. */
                if (error == -ENOSPC && !tried_min_extra_isize &&
                    s_min_extra_isize) {
                        tried_min_extra_isize++;
                        new_extra_isize = s_min_extra_isize;
                        goto retry;
                }
                goto cleanup;
        }
shift:
        /* Adjust the offsets and shift the remaining entries ahead */
        ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize
                        - new_extra_isize, (void *)raw_inode +
                        EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
                        (void *)header, total_ino);
        EXT4_I(inode)->i_extra_isize = new_extra_isize;

        /* The entries moved, so inline-data lookup state is refreshed. */
        if (ext4_has_inline_data(inode))
                error = ext4_find_inline_data_nolock(inode);

cleanup:
        /* Warn only while s_mnt_count differs from the value recorded last time. */
        if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) {
                ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.",
                             inode->i_ino);
                mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count);
        }
        return error;
}

#define EIA_INCR 16 /* must be 2^n */
#define EIA_MASK (EIA_INCR - 1)

/*
 * Append @inode to *@ea_inode_array so that it can be iput() later.
 *
 * The array is allocated on first use with room for EIA_MASK (15) inodes
 * and regrown by EIA_INCR slots each time the count reaches 15 + n * 16;
 * existing contents are copied into the new allocation.
 *
 * Returns 0 on success or -ENOMEM, in which case *@ea_inode_array is left
 * as it was.
 */
static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
                        struct inode *inode)
{
        struct ext4_xattr_inode_array *arr = *ea_inode_array;

        if (!arr) {
                /*
                 * First use: 15 slots, so header plus slots fit a
                 * power-of-two sized allocation.
                 */
                arr = kmalloc(offsetof(struct ext4_xattr_inode_array,
                                       inodes[EIA_MASK]),
                              GFP_NOFS);
                if (!arr)
                        return -ENOMEM;
                arr->count = 0;
                *ea_inode_array = arr;
        } else if ((arr->count & EIA_MASK) == EIA_MASK) {
                /* Every slot is taken: move to an array EIA_INCR larger. */
                struct ext4_xattr_inode_array *bigger;
                int used = arr->count;

                bigger = kmalloc(offsetof(struct ext4_xattr_inode_array,
                                          inodes[used + EIA_INCR]),
                                 GFP_NOFS);
                if (!bigger)
                        return -ENOMEM;
                memcpy(bigger, arr,
                       offsetof(struct ext4_xattr_inode_array, inodes[used]));
                kfree(arr);
                arr = bigger;
                *ea_inode_array = arr;
        }

        arr->inodes[arr->count++] = inode;
        return 0;
}

/*
 * ext4_xattr_delete_inode()
 *
 * Free extended attribute resources associated with this inode. Traverse
 * all entries and decrement reference on any xattr inodes associated with this
 * inode. This is called immediately before an inode is freed. We have exclusive
 * access to the inode. If an orphan inode is deleted it will also release its
 * references on xattr block and xattr inodes.
 *
 * EA inodes whose references are dropped are handed to @ea_inode_array by
 * the helpers called here; @extra_credits is passed through to them and to
 * ext4_journal_ensure_credits().
 *
 * Returns 0 on success or a negative error number.
 */
int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                            struct ext4_xattr_inode_array **ea_inode_array,
                            int extra_credits)
{
        struct buffer_head *bh = NULL;
        struct ext4_xattr_ibody_header *header;
        struct ext4_iloc iloc = { .bh = NULL };
        struct ext4_xattr_entry *entry;
        struct inode *ea_inode;
        int error;

        error = ext4_journal_ensure_credits(handle, extra_credits,
                        ext4_free_metadata_revoke_credits(inode->i_sb, 1));
        if (error < 0) {
                EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
                goto cleanup;
        }

        /* Step 1: drop EA-inode references held by in-inode entries. */
        if (ext4_has_feature_ea_inode(inode->i_sb) &&
            ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {

                error = ext4_get_inode_loc(inode, &iloc);
                if (error) {
                        EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
                        goto cleanup;
                }

                error = ext4_journal_get_write_access(handle, iloc.bh);
                if (error) {
                        EXT4_ERROR_INODE(inode, "write access (error %d)",
                                         error);
                        goto cleanup;
                }

                /* The return value of the dec-ref helper is not checked here. */
                header = IHDR(inode, ext4_raw_inode(&iloc));
                if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
                        ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
                                                     IFIRST(header),
                                                     false /* block_csum */,
                                                     ea_inode_array,
                                                     extra_credits,
                                                     false /* skip_quota */);
        }

        /* Step 2: release the external xattr block, if there is one. */
        if (EXT4_I(inode)->i_file_acl) {
                bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        if (error == -EIO) {
                                EXT4_ERROR_INODE_ERR(inode, EIO,
                                                     "block %llu read error",
                                                     EXT4_I(inode)->i_file_acl);
                        }
                        /* Reset so the brelse() in cleanup sees NULL, not an ERR_PTR. */
                        bh = NULL;
                        goto cleanup;
                }
                error = ext4_xattr_check_block(inode, bh);
                if (error)
                        goto cleanup;

                if (ext4_has_feature_ea_inode(inode->i_sb)) {
                        /* Give back quota for every value stored in an EA inode. */
                        for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
                             entry = EXT4_XATTR_NEXT(entry)) {
                                if (!entry->e_value_inum)
                                        continue;
                                error = ext4_xattr_inode_iget(inode,
                                              le32_to_cpu(entry->e_value_inum),
                                              le32_to_cpu(entry->e_hash),
                                              &ea_inode);
                                /* An EA inode that cannot be loaded is skipped. */
                                if (error)
                                        continue;
                                ext4_xattr_inode_free_quota(inode, ea_inode,
                                              le32_to_cpu(entry->e_value_size));
                                iput(ea_inode);
                        }

                }

                ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
                                         extra_credits);
                /*
                 * Update i_file_acl value in the same transaction that releases
                 * block.
                 */
                EXT4_I(inode)->i_file_acl = 0;
                error = ext4_mark_inode_dirty(handle, inode);
                if (error) {
                        EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
                                         error);
                        goto cleanup;
                }
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
        }
        /* Any error left over from a skipped EA inode above is discarded. */
        error = 0;
cleanup:
        brelse(iloc.bh);
        brelse(bh);
        return error;
}

/*
 * Drop the reference on every inode collected in @ea_inode_array and free
 * the array itself.  A NULL array is accepted and ignored.
 */
void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
{
        int slot = 0;

        if (!ea_inode_array)
                return;

        while (slot < ea_inode_array->count) {
                iput(ea_inode_array->inodes[slot]);
                slot++;
        }
        kfree(ea_inode_array);
}

/*
 * ext4_xattr_block_cache_insert()
 *
 * Create a new entry for @bh in the extended attribute block cache, keyed
 * by the block's header hash.  Nothing is done when @ea_block_cache is
 * NULL.
 *
 * This function returns nothing: the result of the insertion is only
 * reported through debug messages.
 */
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
                              struct buffer_head *bh)
{
        struct ext4_xattr_header *hdr = BHDR(bh);
        __u32 hash = le32_to_cpu(hdr->h_hash);
        int reusable;
        int err;

        /* The entry is reusable while the refcount is below the maximum. */
        reusable = le32_to_cpu(hdr->h_refcount) < EXT4_XATTR_REFCOUNT_MAX;

        if (!ea_block_cache)
                return;

        err = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash,
                                    bh->b_blocknr, reusable);
        if (!err) {
                ea_bdebug(bh, "inserting [%x]", (int)hash);
                return;
        }
        if (err == -EBUSY)
                ea_bdebug(bh, "already in cache");
}

/*
 * ext4_xattr_cmp()
 *
 * Walk the entries of two extended attribute blocks in parallel and
 * decide whether the blocks hold the same attributes.
 *
 * Returns 0 if the blocks are equal and 1 if they differ.
 */
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
               struct ext4_xattr_header *header2)
{
        struct ext4_xattr_entry *lhs = ENTRY(header1+1);
        struct ext4_xattr_entry *rhs = ENTRY(header2+1);

        for (; !IS_LAST_ENTRY(lhs);
             lhs = EXT4_XATTR_NEXT(lhs), rhs = EXT4_XATTR_NEXT(rhs)) {
                /* The second block ran out of entries first. */
                if (IS_LAST_ENTRY(rhs))
                        return 1;

                /* Per-entry metadata and the name must all agree. */
                if (lhs->e_hash != rhs->e_hash)
                        return 1;
                if (lhs->e_name_index != rhs->e_name_index)
                        return 1;
                if (lhs->e_name_len != rhs->e_name_len)
                        return 1;
                if (lhs->e_value_size != rhs->e_value_size)
                        return 1;
                if (lhs->e_value_inum != rhs->e_value_inum)
                        return 1;
                if (memcmp(lhs->e_name, rhs->e_name, lhs->e_name_len))
                        return 1;

                /* Only values stored inside the block are compared bytewise. */
                if (lhs->e_value_inum)
                        continue;
                if (memcmp((char *)header1 + le16_to_cpu(lhs->e_value_offs),
                           (char *)header2 + le16_to_cpu(rhs->e_value_offs),
                           le32_to_cpu(lhs->e_value_size)))
                        return 1;
        }

        /* Equal only if the second block ends here as well. */
        return IS_LAST_ENTRY(rhs) ? 0 : 1;
}

/*
 * ext4_xattr_block_cache_find()
 *
 * Search the block cache for an extended attribute block whose contents
 * are identical to @header.
 *
 * On a match the buffer head is returned and the matching cache entry is
 * stored in *@pce.  NULL is returned when there is no cache, when the
 * header hash is zero, when no candidate matches, or when reading a
 * candidate fails with -ENOMEM.
 */
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
                            struct ext4_xattr_header *header,
                            struct mb_cache_entry **pce)
{
        __u32 hash = le32_to_cpu(header->h_hash);
        struct mb_cache *cache = EA_BLOCK_CACHE(inode);
        struct mb_cache_entry *ce;
        struct buffer_head *bh;

        if (!cache)
                return NULL;
        if (!header->h_hash)
                return NULL;  /* never share */

        ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);

        for (ce = mb_cache_entry_find_first(cache, hash); ce;
             ce = mb_cache_entry_find_next(cache, ce)) {
                bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
                if (IS_ERR(bh)) {
                        /* Out of memory ends the search; the entry is put back. */
                        if (PTR_ERR(bh) == -ENOMEM) {
                                mb_cache_entry_put(cache, ce);
                                return NULL;
                        }
                        /* Any other read error: report it, try the next one. */
                        EXT4_ERROR_INODE(inode, "block %lu read error",
                                         (unsigned long)ce->e_value);
                        continue;
                }
                if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
                        *pce = ce;
                        return bh;
                }
                brelse(bh);
        }
        return NULL;
}

#define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16

/*
 * ext4_xattr_hash_entry()
 *
 * Compute the hash of an extended attribute from its name bytes followed
 * by its value, taken as @value_count little-endian 32-bit words.  Each
 * step rotates the running hash left (by NAME_HASH_SHIFT for name bytes,
 * VALUE_HASH_SHIFT for value words) and XORs in the next element.
 *
 * NOTE(review): name bytes are read through plain "char", so bytes with
 * the top bit set contribute according to the platform's char signedness -
 * confirm this matches what other producers of the hash expect.
 *
 * Returns the hash in little-endian form.
 */
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
                                    size_t value_count)
{
        __u32 hash = 0;
        size_t k;

        for (k = 0; k < name_len; k++) {
                hash = (hash << NAME_HASH_SHIFT) ^
                       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
                       name[k];
        }
        for (k = 0; k < value_count; k++) {
                hash = (hash << VALUE_HASH_SHIFT) ^
                       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
                       le32_to_cpu(value[k]);
        }
        return cpu_to_le32(hash);
}

#undef NAME_HASH_SHIFT
#undef VALUE_HASH_SHIFT

#define BLOCK_HASH_SHIFT 16

/*
 * ext4_xattr_rehash()
 *
 * Recompute the block-level hash in @header from the hashes of the
 * entries that follow it.  If any entry has a zero hash the block hash is
 * set to zero as well.
 */
static void ext4_xattr_rehash(struct ext4_xattr_header *header)
{
        struct ext4_xattr_entry *cur;
        __u32 block_hash = 0;

        for (cur = ENTRY(header+1); !IS_LAST_ENTRY(cur);
             cur = EXT4_XATTR_NEXT(cur)) {
                if (!cur->e_hash) {
                        /* Block is not shared if an entry's hash value == 0 */
                        block_hash = 0;
                        break;
                }
                block_hash = (block_hash << BLOCK_HASH_SHIFT) ^
                             (block_hash >> (8*sizeof(block_hash) - BLOCK_HASH_SHIFT)) ^
                             le32_to_cpu(cur->e_hash);
        }
        header->h_hash = cpu_to_le32(block_hash);
}

#undef BLOCK_HASH_SHIFT

#define        HASH_BUCKET_BITS        10

/*
 * Create an mb_cache with HASH_BUCKET_BITS bucket bits and hand back
 * whatever mb_cache_create() returned.
 */
struct mb_cache *
ext4_xattr_create_cache(void)
{
        struct mb_cache *cache = mb_cache_create(HASH_BUCKET_BITS);

        return cache;
}

/* Destroy @cache via mb_cache_destroy(); a NULL @cache is ignored. */
void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
        if (!cache)
                return;

        mb_cache_destroy(cache);
}






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/eventfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#ifndef _LINUX_EVENTFD_H
#define _LINUX_EVENTFD_H

#include <linux/fcntl.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/percpu-defs.h>
#include <linux/percpu.h>

/*
 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
 * new flags, since they might collide with O_* ones. We want
 * to re-use O_* flags that couldn't possibly have a meaning
 * from eventfd, in order to leave a free define-space for
 * shared O_* flags.
 */
#define EFD_SEMAPHORE (1 << 0)
#define EFD_CLOEXEC O_CLOEXEC
#define EFD_NONBLOCK O_NONBLOCK

#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)

struct eventfd_ctx;
struct file;

#ifdef CONFIG_EVENTFD

void eventfd_ctx_put(struct eventfd_ctx *ctx);
struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

DECLARE_PER_CPU(int, eventfd_wake_count);

/* True when this CPU's eventfd_wake_count is non-zero. */
static inline bool eventfd_signal_count(void)
{
        return this_cpu_read(eventfd_wake_count) != 0;
}

#else /* CONFIG_EVENTFD */

/*
 * Ugly ugly ugly error layer to support modules that use eventfd but
 * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
 */

/* !CONFIG_EVENTFD stub: always fails with ERR_PTR(-ENOSYS); @fd is ignored. */
static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        return ERR_PTR(-ENOSYS);
}

/*
 * !CONFIG_EVENTFD stub: always returns -ENOSYS; both arguments are ignored.
 * NOTE(review): the CONFIG_EVENTFD prototype returns __u64 while this stub
 * returns int -- confirm callers cope with both.
 */
static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        return -ENOSYS;
}

/*
 * !CONFIG_EVENTFD stub: always returns -ENOSYS; all arguments are ignored.
 * NOTE(review): the CONFIG_EVENTFD prototype returns __u64 while this stub
 * returns int -- confirm callers cope with both.
 */
static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
                                      unsigned mask)
{
        return -ENOSYS;
}

/* !CONFIG_EVENTFD stub: does nothing. */
static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
{

}

/*
 * !CONFIG_EVENTFD stub: always returns -ENOSYS.  @wait is not touched and
 * *@cnt is not written.
 */
static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
                                                wait_queue_entry_t *wait, __u64 *cnt)
{
        return -ENOSYS;
}

/* !CONFIG_EVENTFD stub: always reports false. */
static inline bool eventfd_signal_count(void)
{
        return false;
}

/* !CONFIG_EVENTFD stub: does nothing; *@cnt is left unwritten. */
static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{

}

#endif

#endif /* _LINUX_EVENTFD_H */






































































































    1 







    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Internals of the DMA direct mapping implementation.  Only for use by the
 * DMA mapping code and IOMMU drivers.
 */
#ifndef _LINUX_DMA_DIRECT_H
#define _LINUX_DMA_DIRECT_H 1

#include <linux/dma-mapping.h>
#include <linux/dma-map-ops.h>
#include <linux/memblock.h> /* for min_low_pfn */
#include <linux/mem_encrypt.h>
#include <linux/swiotlb.h>

extern unsigned int zone_dma_bits;

/*
 * Record the mapping of CPU physical to DMA addresses for a given region.
 *
 * translate_phys_to_dma() and translate_dma_to_phys() walk an array of
 * these and stop at the first element whose size is zero.
 */
struct bus_dma_region {
        /* first CPU physical address covered by the region */
        phys_addr_t        cpu_start;
        /* first DMA address covered by the region */
        dma_addr_t        dma_start;
        /* length of the region; zero marks the end of the array */
        u64                size;
        /* subtracted from a physical address / added to a DMA address */
        u64                offset;
};

/*
 * Translate a CPU physical address to a DMA address using dev->dma_range_map.
 *
 * The map is scanned up to its zero-size terminator; the first region that
 * contains @paddr supplies the offset.  Returns DMA_MAPPING_ERROR when no
 * region matches.  dev->dma_range_map is dereferenced unconditionally.
 */
static inline dma_addr_t translate_phys_to_dma(struct device *dev,
                phys_addr_t paddr)
{
        const struct bus_dma_region *region = dev->dma_range_map;

        while (region->size) {
                if (paddr >= region->cpu_start &&
                    paddr - region->cpu_start < region->size)
                        return (dma_addr_t)paddr - region->offset;
                region++;
        }

        /* make sure dma_capable fails when no translation is available */
        return DMA_MAPPING_ERROR;
}

/*
 * Translate a DMA address back to a CPU physical address using
 * dev->dma_range_map.
 *
 * The map is scanned up to its zero-size terminator; the first region that
 * contains @dma_addr supplies the offset.  Returns (phys_addr_t)-1 when no
 * region matches.  dev->dma_range_map is dereferenced unconditionally.
 */
static inline phys_addr_t translate_dma_to_phys(struct device *dev,
                dma_addr_t dma_addr)
{
        const struct bus_dma_region *region = dev->dma_range_map;

        while (region->size) {
                if (dma_addr >= region->dma_start &&
                    dma_addr - region->dma_start < region->size)
                        return (phys_addr_t)dma_addr + region->offset;
                region++;
        }

        return (phys_addr_t)-1;
}

#ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA
#include <asm/dma-direct.h>
#ifndef phys_to_dma_unencrypted
#define phys_to_dma_unencrypted                phys_to_dma
#endif
#else
/*
 * Convert a physical address to a DMA address without applying the memory
 * encryption bit.  With no dma_range_map the address is returned unchanged;
 * otherwise it is translated through the map.
 */
static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
                phys_addr_t paddr)
{
        if (!dev->dma_range_map)
                return paddr;

        return translate_phys_to_dma(dev, paddr);
}

/*
 * If memory encryption is supported, phys_to_dma will set the memory
 * encryption bit in the DMA address, and dma_to_phys will clear it.
 * phys_to_dma_unencrypted is for use on special unencrypted memory like
 * swiotlb buffers.
 */
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
{
        dma_addr_t unencrypted = phys_to_dma_unencrypted(dev, paddr);

        return __sme_set(unencrypted);
}

/*
 * Convert a DMA address to a physical address: translate through
 * dev->dma_range_map when one is present (otherwise use the address as is),
 * then strip the memory encryption bit with __sme_clr().
 */
static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr)
{
        phys_addr_t paddr = dma_addr;

        if (dev->dma_range_map)
                paddr = translate_dma_to_phys(dev, dma_addr);

        return __sme_clr(paddr);
}
#endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */

#ifdef CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool force_dma_unencrypted(struct device *dev);
#else
/* !CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED stub: always false; @dev is ignored. */
static inline bool force_dma_unencrypted(struct device *dev)
{
        return false;
}
#endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */

/*
 * Check whether the DMA range [@addr, @addr + @size - 1] is usable by @dev.
 *
 * Returns false when @addr is DMA_MAPPING_ERROR, or when @is_ram is set on a
 * configuration without CONFIG_ARCH_DMA_ADDR_T_64BIT and the lower of the
 * first/last address lies below the DMA address of min_low_pfn.  Otherwise
 * the last address must not exceed the smaller non-zero value of
 * *dev->dma_mask and dev->bus_dma_limit.
 */
static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size,
                bool is_ram)
{
        dma_addr_t last = addr + size - 1;

        if (addr == DMA_MAPPING_ERROR)
                return false;

        if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)) {
                /* min() also covers a "last" that wrapped below "addr" */
                if (min(addr, last) < phys_to_dma(dev, PFN_PHYS(min_low_pfn)))
                        return false;
        }

        return last <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
}

u64 dma_direct_get_required_mask(struct device *dev);
void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
                gfp_t gfp, unsigned long attrs);
void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
                dma_addr_t dma_addr, unsigned long attrs);
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
void dma_direct_free_pages(struct device *dev, size_t size,
                struct page *page, dma_addr_t dma_addr,
                enum dma_data_direction dir);
int dma_direct_supported(struct device *dev, u64 mask);
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
                size_t size, enum dma_data_direction dir, unsigned long attrs);

#endif /* _LINUX_DMA_DIRECT_H */





























    8 











    7 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_

#include <asm/processor-flags.h>

#ifndef __ASSEMBLY__

#include <asm/nospec-branch.h>

/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
#define __cpuidle __section(".cpuidle.text")

/*
 * Interrupt control:
 */

/*
 * native_save_fl - return the current flags register value.
 *
 * Pushes the flags with "pushf" and pops them into a local, which is
 * returned to the caller.
 */
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
extern __always_inline unsigned long native_save_fl(void)
{
        unsigned long flags;

        /*
         * "=rm" is safe here, because "pop" adjusts the stack before
         * it evaluates its effective address -- this is part of the
         * documented behavior of the "pop" instruction.
         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
                     : "=rm" (flags)
                     : /* no input */
                     : "memory");

        return flags;
}

/*
 * native_restore_fl - load @flags into the flags register.
 *
 * Pushes @flags and executes "popf".  The asm declares "memory" and "cc"
 * clobbers.
 */
extern inline void native_restore_fl(unsigned long flags);
extern inline void native_restore_fl(unsigned long flags)
{
        asm volatile("push %0 ; popf"
                     : /* no output */
                     :"g" (flags)
                     :"memory", "cc");
}

/* Execute "cli"; the "memory" clobber keeps the compiler from reordering memory accesses across it. */
static __always_inline void native_irq_disable(void)
{
        asm volatile("cli": : :"memory");
}

/* Execute "sti"; the "memory" clobber keeps the compiler from reordering memory accesses across it. */
static __always_inline void native_irq_enable(void)
{
        asm volatile("sti": : :"memory");
}

/*
 * Call x86_idle_clear_cpu_buffers(), then execute "sti; hlt" as a single
 * asm statement.  Placed in the .cpuidle.text section via __cpuidle.
 */
static inline __cpuidle void native_safe_halt(void)
{
        x86_idle_clear_cpu_buffers();
        asm volatile("sti; hlt": : :"memory");
}

/*
 * Call x86_idle_clear_cpu_buffers(), then execute "hlt" without touching
 * the interrupt flag.  Placed in the .cpuidle.text section via __cpuidle.
 */
static inline __cpuidle void native_halt(void)
{
        x86_idle_clear_cpu_buffers();
        asm volatile("hlt": : :"memory");
}

#endif

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>

/* !CONFIG_PARAVIRT_XXL: return the flags via native_save_fl(). */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return native_save_fl();
}

/* !CONFIG_PARAVIRT_XXL: restore @flags via native_restore_fl(). */
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
        native_restore_fl(flags);
}

/* !CONFIG_PARAVIRT_XXL: forward to native_irq_disable(). */
static __always_inline void arch_local_irq_disable(void)
{
        native_irq_disable();
}

/* !CONFIG_PARAVIRT_XXL: forward to native_irq_enable(). */
static __always_inline void arch_local_irq_enable(void)
{
        native_irq_enable();
}

/*
 * Used in the idle loop; sti takes one instruction cycle
 * to complete:
 *
 * Forwards to native_safe_halt().
 */
static inline __cpuidle void arch_safe_halt(void)
{
        native_safe_halt();
}

/*
 * Used when interrupts are already enabled or to
 * shutdown the processor:
 *
 * Forwards to native_halt().
 */
static inline __cpuidle void halt(void)
{
        native_halt();
}

/*
 * For spinlocks, etc:
 *
 * Sample the flags first, then disable interrupts, and return the sampled
 * value.
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        unsigned long saved;

        saved = arch_local_save_flags();
        arch_local_irq_disable();

        return saved;
}
#else

#define ENABLE_INTERRUPTS(x)        sti
#define DISABLE_INTERRUPTS(x)        cli

#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(x)                pushfq; popq %rax
#endif

#define INTERRUPT_RETURN        jmp native_iret

#else
#define INTERRUPT_RETURN                iret
#endif

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT_XXL */

#ifndef __ASSEMBLY__
/* Return 1 when X86_EFLAGS_IF is clear in @flags, 0 when it is set. */
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/* Apply arch_irqs_disabled_flags() to the flags of the current CPU. */
static __always_inline int arch_irqs_disabled(void)
{
        return arch_irqs_disabled_flags(arch_local_save_flags());
}
#else
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS        ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
#else
#define SWAPGS        swapgs
#endif
#endif
#endif /* !__ASSEMBLY__ */

#endif










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions of the Internet Protocol.
 *
 * Version:        @(#)in.h        1.0.1        04/21/93
 *
 * Authors:        Original taken from the GNU Project <netinet/in.h> file.
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H


#include <linux/errno.h>
#include <uapi/linux/in.h>

/*
 * Map an IP protocol number to an offset: 4 for AH, 0 for TCP, UDP, DCCP,
 * ESP, SCTP and UDP-Lite, and -EINVAL for every other protocol.
 */
static inline int proto_ports_offset(int proto)
{
        if (proto == IPPROTO_AH)        /* SPI */
                return 4;

        if (proto == IPPROTO_TCP ||
            proto == IPPROTO_UDP ||
            proto == IPPROTO_DCCP ||
            proto == IPPROTO_ESP ||        /* SPI */
            proto == IPPROTO_SCTP ||
            proto == IPPROTO_UDPLITE)
                return 0;

        return -EINVAL;
}

/* True when @addr (network byte order) lies in 127.0.0.0/8. */
static inline bool ipv4_is_loopback(__be32 addr)
{
        __be32 prefix = addr & htonl(0xff000000);

        return prefix == htonl(0x7f000000);
}

/* True when @addr (network byte order) lies in 224.0.0.0/4. */
static inline bool ipv4_is_multicast(__be32 addr)
{
        __be32 prefix = addr & htonl(0xf0000000);

        return prefix == htonl(0xe0000000);
}

/* True when @addr (network byte order) lies in 224.0.0.0/24. */
static inline bool ipv4_is_local_multicast(__be32 addr)
{
        __be32 prefix = addr & htonl(0xffffff00);

        return prefix == htonl(0xe0000000);
}

/* True when @addr equals the limited broadcast address INADDR_BROADCAST. */
static inline bool ipv4_is_lbcast(__be32 addr)
{
        /* limited broadcast */
        __be32 lbcast = htonl(INADDR_BROADCAST);

        return addr == lbcast;
}

/* True when @addr equals INADDR_ALLSNOOPERS_GROUP in network byte order. */
static inline bool ipv4_is_all_snoopers(__be32 addr)
{
        __be32 snoopers = htonl(INADDR_ALLSNOOPERS_GROUP);

        return addr == snoopers;
}

/* True only for the all-zero address 0.0.0.0. */
static inline bool ipv4_is_zeronet(__be32 addr)
{
        return !addr;
}

/* Special-Use IPv4 Addresses (RFC3330) */

/* True when @addr (network byte order) lies in 10.0.0.0/8. */
static inline bool ipv4_is_private_10(__be32 addr)
{
        __be32 prefix = addr & htonl(0xff000000);

        return prefix == htonl(0x0a000000);
}

/* True when @addr (network byte order) lies in 172.16.0.0/12. */
static inline bool ipv4_is_private_172(__be32 addr)
{
        __be32 prefix = addr & htonl(0xfff00000);

        return prefix == htonl(0xac100000);
}

/* True when @addr (network byte order) lies in 192.168.0.0/16. */
static inline bool ipv4_is_private_192(__be32 addr)
{
        __be32 prefix = addr & htonl(0xffff0000);

        return prefix == htonl(0xc0a80000);
}

/* True when @addr (network byte order) lies in 169.254.0.0/16. */
static inline bool ipv4_is_linklocal_169(__be32 addr)
{
        __be32 prefix = addr & htonl(0xffff0000);

        return prefix == htonl(0xa9fe0000);
}

/* True when @addr (network byte order) lies in 192.88.99.0/24. */
static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
        __be32 prefix = addr & htonl(0xffffff00);

        return prefix == htonl(0xc0586300);
}

/* True when @addr (network byte order) lies in 192.0.2.0/24. */
static inline bool ipv4_is_test_192(__be32 addr)
{
        __be32 prefix = addr & htonl(0xffffff00);

        return prefix == htonl(0xc0000200);
}

/* True when @addr (network byte order) lies in 198.18.0.0/15. */
static inline bool ipv4_is_test_198(__be32 addr)
{
        __be32 prefix = addr & htonl(0xfffe0000);

        return prefix == htonl(0xc6120000);
}
#endif        /* _LINUX_IN_H */















































































































































































































    4 


    3 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>

extern atomic64_t last_mm_ctx_id;

#ifndef CONFIG_PARAVIRT_XXL
/* !CONFIG_PARAVIRT_XXL stub: does nothing. */
static inline void paravirt_activate_mm(struct mm_struct *prev,
                                        struct mm_struct *next)
{
}
#endif        /* !CONFIG_PARAVIRT_XXL */

#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
        /*
         * Xen requires page-aligned LDTs with special permissions.  This is
         * needed to prevent us from installing evil descriptors such as
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
        struct desc_struct        *entries;
        /* presumably the number of descriptors in @entries -- confirm */
        unsigned int                nr_entries;

        /*
         * If PTI is in use, then the entries array is not mapped while we're
         * in user mode.  The whole array will be aliased at the address
         * given by ldt_slot_va(slot).  We use two slots so that we can allocate
         * and map, and enable a new LDT without invalidating the mapping
         * of an older, still-in-use LDT.
         *
         * slot will be -1 if this LDT doesn't have an alias mapping.
         */
        int                        slot;
};

/*
 * Used for LDT copy/destruction.
 *
 * Start the new mm with no LDT (context.ldt = NULL) and an initialised
 * context.ldt_usr_sem.
 */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
        mm->context.ldt = NULL;
        init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else        /* CONFIG_MODIFY_LDT_SYSCALL */
/*
 * !CONFIG_MODIFY_LDT_SYSCALL stubs: the init/destroy/exit hooks do nothing
 * and ldt_dup_context() reports success (0).
 */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
/* !CONFIG_MODIFY_LDT_SYSCALL: @mm is ignored; just call clear_LDT(). */
static inline void load_mm_ldt(struct mm_struct *mm)
{
        clear_LDT();
}
/*
 * !CONFIG_MODIFY_LDT_SYSCALL: no LDT work is done; the only effect is the
 * DEBUG_LOCKS_WARN_ON() check that the caller is not preemptible.
 */
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif

extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

/*
 * Init a new mm.  Used on mm copies, like at fork()
 * and on mm's that are brand-new, like at execve().
 *
 * @tsk is not used here.  Always returns 0.
 */
static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
{
        mutex_init(&mm->context.lock);

        /* Take the next value of the global last_mm_ctx_id counter. */
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and allocated implicitly */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
#endif
        init_new_context_ldt(mm);
        return 0;
}
/* Tear down the arch context of @mm; here that is only destroy_context_ldt(). */
static inline void destroy_context(struct mm_struct *mm)
{
        destroy_context_ldt(mm);
}

extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                      struct task_struct *tsk);

extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                               struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off

/*
 * activate_mm - run the paravirt hook, then switch from @prev to @next
 * with a NULL task.
 *
 * Expands to one do { } while (0) statement with no trailing semicolon;
 * the caller supplies it.  That keeps the macro usable as the body of an
 * unbraced if/else.
 */
#define activate_mm(prev, next)                        \
do {                                                \
        paravirt_activate_mm((prev), (next));        \
        switch_mm((prev), (next), NULL);        \
} while (0)

#ifdef CONFIG_X86_32
/* 32-bit: call lazy_load_gs(0).  @tsk and @mm are not used. */
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        lazy_load_gs(0);                        \
} while (0)
#else
/* 64-bit: load a zero GS index and a zero FS selector.  @tsk and @mm are not used. */
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        load_gs_index(0);                        \
        loadsegment(fs, 0);                        \
} while (0)
#endif

/*
 * Copy the protection-key state (allocation map and execute-only key) from
 * @oldmm to @mm.  Does nothing unless protection keys are configured in and
 * X86_FEATURE_OSPKE is enabled.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* Duplicate the oldmm pkey state in mm: */
                mm->context.pkey_allocation_map =
                        oldmm->context.pkey_allocation_map;
                mm->context.execute_only_pkey =
                        oldmm->context.execute_only_pkey;
        }
#endif
}

/*
 * Duplicate arch mm state from @oldmm into @mm: pkeys first, then the
 * paravirt hook, then the LDT.  The return value is that of
 * ldt_dup_context().
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        arch_dup_pkeys(oldmm, mm);
        paravirt_arch_dup_mmap(oldmm, mm);
        return ldt_dup_context(oldmm, mm);
}

/* Arch hook for mm teardown: run the paravirt hook, then the LDT hook. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
        paravirt_arch_exit_mmap(mm);
        ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return        !IS_ENABLED(CONFIG_IA32_EMULATION) ||
                !(mm->context.ia32_compat == TIF_IA32);
}
#else
/* !CONFIG_X86_64: always false; @mm is ignored. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return false;
}
#endif

/* Arch unmap hook: does nothing in this header. */
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
}

/*
 * We only want to enforce protection keys on the current process
 * because we effectively have no access to PKRU for other
 * processes or any way to tell *which* PKRU in a threaded
 * process we could use.
 *
 * So do not enforce things if the VMA is not from the current
 * mm, or if we are in a kernel thread.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        /*
         * pkeys never affect instruction fetches, and accesses to a VMA
         * that is not one from this process are always allowed.
         */
        if (execute || foreign || vma_is_foreign(vma))
                return true;

        return __pkru_allows_pkey(vma_pkey(vma), write);
}

unsigned long __get_current_cr3_fast(void);

#endif /* _ASM_X86_MMU_CONTEXT_H */
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#undef TRACE_SYSTEM
#define TRACE_SYSTEM neigh

#if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NEIGH_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>
#include <net/neighbour.h>

/*
 * Render a neighbour NUD_* state value as a lower-case name in trace
 * output, via __print_symbolic().
 */
#define neigh_state_str(state)                                \
        __print_symbolic(state,                                \
                { NUD_INCOMPLETE, "incomplete" },        \
                { NUD_REACHABLE, "reachable" },                \
                { NUD_STALE, "stale" },                        \
                { NUD_DELAY, "delay" },                        \
                { NUD_PROBE, "probe" },                        \
                { NUD_FAILED, "failed" },                \
                { NUD_NOARP, "noarp" },                        \
                { NUD_PERMANENT, "permanent"})

/*
 * neigh_create - trace a neighbour entry creation attempt.
 *
 * Records the table family, device name ("NULL" when @dev is NULL), the
 * table's gc_entries count, whether @n is non-NULL, the gc exemption flag,
 * and the primary key in both IPv4 and IPv6 form.
 *
 * Both key fields are always written, because TP_printk prints both
 * unconditionally:
 *   - primary_key4 holds the key for AF_INET and 0 otherwise;
 *   - primary_key6 holds the key for AF_INET6 and the v4-mapped form of
 *     primary_key4 otherwise, matching the other neigh events.
 */
TRACE_EVENT(neigh_create,

        TP_PROTO(struct neigh_table *tbl, struct net_device *dev,
                 const void *pkey, const struct neighbour *n,
                 bool exempt_from_gc),

        TP_ARGS(tbl, dev, pkey, n, exempt_from_gc),

        TP_STRUCT__entry(
                __field(u32, family)
                __dynamic_array(char,  dev,   IFNAMSIZ )
                __field(int, entries)
                __field(u8, created)
                __field(u8, gc_exempt)
                __array(u8, primary_key4, 4)
                __array(u8, primary_key6, 16)
        ),

        TP_fast_assign(
                struct in6_addr *pin6;
                __be32 *p32;

                __entry->family = tbl->family;
                __assign_str(dev, (dev ? dev->name : "NULL"));
                __entry->entries = atomic_read(&tbl->gc_entries);
                __entry->created = n != NULL;
                __entry->gc_exempt = exempt_from_gc;
                pin6 = (struct in6_addr *)__entry->primary_key6;
                p32 = (__be32 *)__entry->primary_key4;

                if (tbl->family == AF_INET)
                        *p32 = *(__be32 *)pkey;
                else
                        *p32 = 0;

#if IS_ENABLED(CONFIG_IPV6)
                if (tbl->family == AF_INET6) {
                        *pin6 = *(struct in6_addr *)pkey;
                } else
#endif
                {
                        /* never leave primary_key6 unwritten */
                        ipv6_addr_set_v4mapped(*p32, pin6);
                }
        ),

        TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d",
                  __entry->family, __get_str(dev), __entry->entries,
                  __entry->primary_key4, __entry->primary_key6,
                  __entry->created, __entry->gc_exempt)
);

/*
 * neigh_update - trace an update of neighbour @n.
 *
 * Snapshots the neighbour's current state (device name, link-layer address,
 * flags, NUD state, type, dead flag, refcount, primary key, timestamps)
 * together with the requested change (@lladdr, @new state, @flags,
 * @nlmsg_pid).
 *
 * The link-layer address length is n->dev->addr_len, or MAX_ADDR_LEN when
 * the neighbour has no device.
 */
TRACE_EVENT(neigh_update,

        TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new,
                 u32 flags, u32 nlmsg_pid),

        TP_ARGS(n, lladdr, new, flags, nlmsg_pid),

        TP_STRUCT__entry(
                __field(u32, family)
                __string(dev, (n->dev ? n->dev->name : "NULL"))
                __array(u8, lladdr, MAX_ADDR_LEN)
                __field(u8, lladdr_len)
                __field(u8, flags)
                __field(u8, nud_state)
                __field(u8, type)
                __field(u8, dead)
                __field(int, refcnt)
                __array(__u8, primary_key4, 4)
                __array(__u8, primary_key6, 16)
                __field(unsigned long, confirmed)
                __field(unsigned long, updated)
                __field(unsigned long, used)
                __array(u8, new_lladdr, MAX_ADDR_LEN)
                __field(u8, new_state)
                __field(u32, update_flags)
                __field(u32, pid)
        ),

        TP_fast_assign(
                int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN);
                struct in6_addr *pin6;
                __be32 *p32;

                __entry->family = n->tbl->family;
                __assign_str(dev, (n->dev ? n->dev->name : "NULL"));
                __entry->lladdr_len = lladdr_len;
                memcpy(__entry->lladdr, n->ha, lladdr_len);
                __entry->flags = n->flags;
                __entry->nud_state = n->nud_state;
                __entry->type = n->type;
                __entry->dead = n->dead;
                __entry->refcnt = refcount_read(&n->refcnt);
                pin6 = (struct in6_addr *)__entry->primary_key6;
                p32 = (__be32 *)__entry->primary_key4;

                /* primary_key4: the key for AF_INET, zero for any other family */
                if (n->tbl->family == AF_INET)
                        *p32 = *(__be32 *)n->primary_key;
                else
                        *p32 = 0;

                /*
                 * primary_key6: the key for AF_INET6, otherwise the
                 * v4-mapped form of primary_key4.
                 */
#if IS_ENABLED(CONFIG_IPV6)
                if (n->tbl->family == AF_INET6) {
                        pin6 = (struct in6_addr *)__entry->primary_key6;
                        *pin6 = *(struct in6_addr *)n->primary_key;
                } else
#endif
                {
                        ipv6_addr_set_v4mapped(*p32, pin6);
                }
                __entry->confirmed = n->confirmed;
                __entry->updated = n->updated;
                __entry->used = n->used;
                /*
                 * NOTE(review): new_lladdr is only written when lladdr is
                 * non-NULL, but TP_printk prints it unconditionally --
                 * confirm what the field holds in the NULL case.
                 */
                if (lladdr)
                        memcpy(__entry->new_lladdr, lladdr, lladdr_len);
                __entry->new_state = new;
                __entry->update_flags = flags;
                __entry->pid = nlmsg_pid;
        ),

        TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x "
                  "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c "
                  "confirmed %lu updated %lu used %lu new_lladdr %s "
                  "new_state %s update_flags %02x pid %d",
                  __entry->family, __get_str(dev),
                  __print_hex_str(__entry->lladdr, __entry->lladdr_len),
                  __entry->flags, neigh_state_str(__entry->nud_state),
                  __entry->type, __entry->dead, __entry->refcnt,
                  __entry->primary_key4, __entry->primary_key6,
                  __entry->confirmed, __entry->updated, __entry->used,
                  __print_hex_str(__entry->new_lladdr, __entry->lladdr_len),
                  neigh_state_str(__entry->new_state),
                  __entry->update_flags, __entry->pid)
);

/*
 * neigh__update - event class shared by the neighbour events that report a
 * neighbour plus an integer result.
 *
 * Snapshots the state of @n (device name, link-layer address, flags, NUD
 * state, type, dead flag, refcount, primary key, timestamps) and stores
 * @err alongside it.
 */
DECLARE_EVENT_CLASS(neigh__update,
        TP_PROTO(struct neighbour *n, int err),
        TP_ARGS(n, err),
        TP_STRUCT__entry(
                __field(u32, family)
                __string(dev, (n->dev ? n->dev->name : "NULL"))
                __array(u8, lladdr, MAX_ADDR_LEN)
                __field(u8, lladdr_len)
                __field(u8, flags)
                __field(u8, nud_state)
                __field(u8, type)
                __field(u8, dead)
                __field(int, refcnt)
                __array(__u8, primary_key4, 4)
                __array(__u8, primary_key6, 16)
                __field(unsigned long, confirmed)
                __field(unsigned long, updated)
                __field(unsigned long, used)
                /* stored as u32 but printed with %d below */
                __field(u32, err)
        ),

        TP_fast_assign(
                /* no device: fall back to the full MAX_ADDR_LEN bytes */
                int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN);
                struct in6_addr *pin6;
                __be32 *p32;

                __entry->family = n->tbl->family;
                __assign_str(dev, (n->dev ? n->dev->name : "NULL"));
                __entry->lladdr_len = lladdr_len;
                memcpy(__entry->lladdr, n->ha, lladdr_len);
                __entry->flags = n->flags;
                __entry->nud_state = n->nud_state;
                __entry->type = n->type;
                __entry->dead = n->dead;
                __entry->refcnt = refcount_read(&n->refcnt);
                pin6 = (struct in6_addr *)__entry->primary_key6;
                p32 = (__be32 *)__entry->primary_key4;

                /* primary_key4: the key for AF_INET, zero for any other family */
                if (n->tbl->family == AF_INET)
                        *p32 = *(__be32 *)n->primary_key;
                else
                        *p32 = 0;

                /*
                 * primary_key6: the key for AF_INET6, otherwise the
                 * v4-mapped form of primary_key4.
                 */
#if IS_ENABLED(CONFIG_IPV6)
                if (n->tbl->family == AF_INET6) {
                        pin6 = (struct in6_addr *)__entry->primary_key6;
                        *pin6 = *(struct in6_addr *)n->primary_key;
                } else
#endif
                {
                        ipv6_addr_set_v4mapped(*p32, pin6);
                }

                __entry->confirmed = n->confirmed;
                __entry->updated = n->updated;
                __entry->used = n->used;
                __entry->err = err;
        ),

        TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x "
                  "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c "
                  "confirmed %lu updated %lu used %lu err %d",
                  __entry->family, __get_str(dev),
                  __print_hex_str(__entry->lladdr, __entry->lladdr_len),
                  __entry->flags, neigh_state_str(__entry->nud_state),
                  __entry->type, __entry->dead, __entry->refcnt,
                  __entry->primary_key4, __entry->primary_key6,
                  __entry->confirmed, __entry->updated, __entry->used,
                  __entry->err)
);

/* neigh_update_done: event of class neigh__update, taking (neigh, err). */
DEFINE_EVENT(neigh__update, neigh_update_done,
        TP_PROTO(struct neighbour *neigh, int err),
        TP_ARGS(neigh, err)
);

/* neigh_timer_handler: event of class neigh__update, taking (neigh, err). */
DEFINE_EVENT(neigh__update, neigh_timer_handler,
        TP_PROTO(struct neighbour *neigh, int err),
        TP_ARGS(neigh, err)
);

/* neigh_event_send_done: event of class neigh__update, taking (neigh, err). */
DEFINE_EVENT(neigh__update, neigh_event_send_done,
        TP_PROTO(struct neighbour *neigh, int err),
        TP_ARGS(neigh, err)
);

/* neigh_event_send_dead: event of class neigh__update, taking (neigh, err). */
DEFINE_EVENT(neigh__update, neigh_event_send_dead,
        TP_PROTO(struct neighbour *neigh, int err),
        TP_ARGS(neigh, err)
);

/*
 * neigh_cleanup_and_release: event of class neigh__update. The second
 * argument is named rc here and is recorded in the class's err field.
 */
DEFINE_EVENT(neigh__update, neigh_cleanup_and_release,
        TP_PROTO(struct neighbour *neigh, int rc),
        TP_ARGS(neigh, rc)
);

#endif /* _TRACE_NEIGH_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM signal

#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SIGNAL_H

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>

/*
 * TP_STORE_SIGINFO - fill the errno/code members of a trace entry from @info.
 *
 * @__entry: pointer to a record that has 'errno' and 'code' members.
 * @info:    SEND_SIG_NOINFO, SEND_SIG_PRIV, or a pointer whose si_errno and
 *           si_code members can be read.
 *
 * The two sentinel values are only compared, never dereferenced:
 * SEND_SIG_NOINFO is recorded as errno 0 / SI_USER and SEND_SIG_PRIV as
 * errno 0 / SI_KERNEL.  Any other value is dereferenced and its
 * si_errno/si_code are copied into the record.
 *
 * Both parameters are parenthesized in the expansion so that expression
 * arguments (for example "&entry" or "cond ? a : b") bind as the caller
 * intended.  Each argument is still evaluated more than once, so it must
 * be free of side effects.
 */
#define TP_STORE_SIGINFO(__entry, info)                                \
        do {                                                        \
                if ((info) == SEND_SIG_NOINFO) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code        = SI_USER;        \
                } else if ((info) == SEND_SIG_PRIV) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code        = SI_KERNEL;        \
                } else {                                        \
                        (__entry)->errno        = (info)->si_errno;        \
                        (__entry)->code        = (info)->si_code;        \
                }                                                \
        } while (0)

/*
 * Values for the 'result' argument of the signal_generate tracepoint, whose
 * kerneldoc describes it as TRACE_SIGNAL_*.  The enumerators take the
 * implicit values 0..4 in declaration order, and signal_generate prints the
 * recorded value numerically ("res=%d"), so reordering them would change
 * the numbers seen in trace output.
 *
 * The enum is skipped when TRACE_HEADER_MULTI_READ is defined, presumably so
 * that it is declared only once when this header is read again.
 *
 * NOTE(review): the meaning of each value is suggested only by its name
 * here; confirm against the code that emits signal_generate.
 */
#ifndef TRACE_HEADER_MULTI_READ
enum {
        TRACE_SIGNAL_DELIVERED,
        TRACE_SIGNAL_IGNORED,
        TRACE_SIGNAL_ALREADY_PENDING,
        TRACE_SIGNAL_OVERFLOW_FAIL,
        TRACE_SIGNAL_LOSE_INFO,
};
#endif

/**
 * signal_generate - called when a signal is generated
 * @sig: signal number
 * @info: pointer to struct kernel_siginfo, or one of the special values
 *        SEND_SIG_NOINFO / SEND_SIG_PRIV
 * @task: pointer to struct task_struct
 * @group: shared or private
 * @result: TRACE_SIGNAL_*
 *
 * Current process sends a 'sig' signal to 'task' process with
 * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
 * 'info' is not a real pointer and its fields must not be accessed.
 * Instead, SEND_SIG_NOINFO means that si_code is SI_USER, and
 * SEND_SIG_PRIV means that si_code is SI_KERNEL; in both cases errno
 * is recorded as 0.
 *
 * The record stores @sig, the errno/code pair derived from @info, the
 * comm and pid of @task, and the @group and @result values as passed in.
 */
TRACE_EVENT(signal_generate,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
                        int group, int result),

        TP_ARGS(sig, info, task, group, result),

        TP_STRUCT__entry(
                __field(        int,        sig                        )
                __field(        int,        errno                        )
                __field(        int,        code                        )
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        group                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* sets __entry->errno and __entry->code, handling the special info values */
                TP_STORE_SIGINFO(__entry, info);
                /* fixed-size copy of the target task's comm buffer */
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid        = task->pid;
                __entry->group        = group;
                __entry->result        = result;
        ),

        TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->comm, __entry->pid, __entry->group,
                  __entry->result)
);

/**
 * signal_deliver - called when a signal is delivered
 * @sig: signal number
 * @info: pointer to struct kernel_siginfo
 * @ka: pointer to struct k_sigaction
 *
 * A 'sig' signal is delivered to current process with 'info' siginfo,
 * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
 * SIG_DFL.
 * Note that some signals reported by the signal_generate tracepoint can
 * be lost, ignored or modified (by a debugger) before hitting this
 * tracepoint. This means that this tracepoint shows which signals are
 * actually delivered, but matching generated signals against delivered
 * signals may not be accurate.
 *
 * The record stores @sig, the errno/code pair derived from @info, and the
 * handler address and flags taken from ka->sa.
 */
TRACE_EVENT(signal_deliver,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka),

        TP_ARGS(sig, info, ka),

        TP_STRUCT__entry(
                __field(        int,                sig                )
                __field(        int,                errno                )
                __field(        int,                code                )
                __field(        unsigned long,        sa_handler        )
                __field(        unsigned long,        sa_flags        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* sets __entry->errno and __entry->code from info */
                TP_STORE_SIGINFO(__entry, info);
                /* the handler is recorded as a plain integer and printed in hex */
                __entry->sa_handler        = (unsigned long)ka->sa.sa_handler;
                __entry->sa_flags        = ka->sa.sa_flags;
        ),

        TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->sa_handler, __entry->sa_flags)
);

#endif /* _TRACE_SIGNAL_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























    4 

























































    4 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

/*
 * Store @n as the first node of @h using rcu_assign_pointer().
 *
 * The value written is @n with the LIST_BL_LOCKMASK bits OR-ed in, so those
 * bits remain set in h->first after the store.  Two sanity checks run first:
 * @n itself must not carry any of the lock-mask bits, and h->first must
 * currently have all of them set (presumably meaning the bucket lock is
 * held by the caller - confirm against list_bl.h).
 */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        const unsigned long node_bits = (unsigned long)n;

        LIST_BL_BUG_ON(node_bits & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(LIST_BL_LOCKMASK !=
                        ((unsigned long)h->first & LIST_BL_LOCKMASK));

        rcu_assign_pointer(h->first,
                        (struct hlist_bl_node *)(node_bits | LIST_BL_LOCKMASK));
}

/*
 * Return the first node of @h with the LIST_BL_LOCKMASK bits stripped.
 *
 * h->first is loaded through rcu_dereference_check(), with
 * hlist_bl_is_locked(h) passed as its condition argument; the result may
 * be NULL when no node pointer is stored in the head.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
        unsigned long first_bits;

        first_bits = (unsigned long)rcu_dereference_check(h->first,
                                                hlist_bl_is_locked(h));

        return (struct hlist_bl_node *)(first_bits & ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on the entry does not return true after this;
 * the entry is in an undefined state. This is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 * Only the back pointer (@n->pprev) is poisoned.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
        /* unlink first; the helper is handed the node before pprev is poisoned */
        __hlist_bl_del(n);
        /* n->next is deliberately left alone here, see the comment above */
        n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *first;

        /* don't need hlist_bl_first_rcu because we're under lock */
        first = hlist_bl_first(h);

        n->next = first;
        if (first)
                first->pprev = &n->next;
        n->pprev = &h->first;

        /* need _rcu because we can have concurrent lock free readers */
        hlist_bl_set_first_rcu(h, n);
}
/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The walk starts at hlist_bl_first_rcu(@head) and follows pos->next
 * through rcu_dereference_raw() until @pos becomes NULL.  On every
 * iteration @tpos is set to the entry containing @pos before the loop
 * body runs.
 *
 * @tpos and @pos are assigned to and expanded several times without
 * parentheses, so both must be plain variables.  As stated for
 * hlist_bl_add_head_rcu(), the traversal must be guarded by
 * rcu_read_lock().
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)                \
        for (pos = hlist_bl_first_rcu(head);                                \
                pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(pos->next))

#endif





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 



    1 




    1 


    1 













    1 




































    1 

    1 























    1 





















































































    1 























































































































































































































































































    1 















































































































































































































































































































































































































































































































































































































































































































































































































    1 


































































    1 













    1 





    1 








    1 




    1 















































    1 



    1 





    1 



    1 


















    1 







































































































































































































































































    1 




















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic socket support routines. Memory allocators, socket lock/release
 *                handler for protocols to use and generic option handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *                Alan Cox        :         Numerous verify_area() problems
 *                Alan Cox        :        Connecting on a connecting socket
 *                                        now returns an error for tcp.
 *                Alan Cox        :        sock->protocol is set correctly.
 *                                        and is not sometimes left as 0.
 *                Alan Cox        :        connect handles icmp errors on a
 *                                        connect properly. Unfortunately there
 *                                        is a restart syscall nasty there. I
 *                                        can't match BSD without hacking the C
 *                                        library. Ideas urgently sought!
 *                Alan Cox        :        Disallow bind() to addresses that are
 *                                        not ours - especially broadcast ones!!
 *                Alan Cox        :        Socket 1024 _IS_ ok for users. (fencepost)
 *                Alan Cox        :        sock_wfree/sock_rfree don't destroy sockets,
 *                                        instead they leave that for the DESTROY timer.
 *                Alan Cox        :        Clean up error flag in accept
 *                Alan Cox        :        TCP ack handling is buggy, the DESTROY timer
 *                                        was buggy. Put a remove_sock() in the handler
 *                                        for memory when we hit 0. Also altered the timer
 *                                        code. The ACK stuff can wait and needs major
 *                                        TCP layer surgery.
 *                Alan Cox        :        Fixed TCP ack bug, removed remove sock
 *                                        and fixed timer/inet_bh race.
 *                Alan Cox        :        Added zapped flag for TCP
 *                Alan Cox        :        Move kfree_skb into skbuff.c and tidied up surplus code
 *                Alan Cox        :        for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *                Alan Cox        :        kfree_s calls now are kfree_skbmem so we can track skb resources
 *                Alan Cox        :        Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *                Alan Cox        :        Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *                Rick Sladkey        :        Relaxed UDP rules for matching packets.
 *                C.E.Hawkins        :        IFF_PROMISC/SIOCGHWADDR support
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Fixed connect() taking signals I think.
 *                Alan Cox        :        SO_LINGER supported
 *                Alan Cox        :        Error reporting fixes
 *                Anonymous        :        inet_create tidied up (sk->reuse setting)
 *                Alan Cox        :        inet sockets don't set sk->type!
 *                Alan Cox        :        Split socket option code
 *                Alan Cox        :        Callbacks
 *                Alan Cox        :        Nagle flag for Charles & Johannes stuff
 *                Alex                :        Removed restriction on inet fioctl
 *                Alan Cox        :        Splitting INET from NET core
 *                Alan Cox        :        Fixed bogus SO_TYPE handling in getsockopt()
 *                Adam Caldwell        :        Missing return in SO_DONTROUTE/SO_DEBUG code
 *                Alan Cox        :        Split IP from generic code
 *                Alan Cox        :        New kfree_skbmem()
 *                Alan Cox        :        Make SO_DEBUG superuser only.
 *                Alan Cox        :        Allow anyone to clear SO_DEBUG
 *                                        (compatibility fix)
 *                Alan Cox        :        Added optimistic memory grabbing for AF_UNIX throughput.
 *                Alan Cox        :        Allocator for a socket is settable.
 *                Alan Cox        :        SO_ERROR includes soft errors.
 *                Alan Cox        :        Allow NULL arguments on some SO_ opts
 *                Alan Cox        :         Generic socket allocation to make hooks
 *                                        easier (suggested by Craig Metz).
 *                Michael Pall        :        SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *                Jay Schulist        :        Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *                Andi Kleen        :        Add sock_kmalloc()/sock_kfree_s()
 *                Andi Kleen        :        Fix write_space callback
 *                Chris Evans        :        Security fixes - signedness again
 *                Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

/* Mutex and list head for the protocol list. Their users are outside this
 * part of the file (presumably the mutex serialises access to the list -
 * TODO confirm against the users).
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declaration; the definition lives elsewhere in this file. */
static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap)
{
        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Same test as sk_ns_capable(), performed against the initial user
 * namespace (&init_user_ns): both the opener of the socket and the
 * current process must have capability @cap there.
 *
 * Return: true if both checks pass, false otherwise.
 */
bool sk_capable(const struct sock *sk, int cap)
{
        struct user_namespace *ns = &init_user_ns;

        return sk_ns_capable(sk, ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Same test as sk_ns_capable(), performed against the user namespace of
 * the network namespace the socket is a member of: both the opener of the
 * socket and the current process must have capability @cap there.
 *
 * Return: true if both checks pass, false otherwise.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
        struct user_namespace *ns = sock_net(sk)->user_ns;

        return sk_ns_capable(sk, ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array below has AF_MAX entries, i.e. one lock class key per
 * address family. The "_kern_" variants are presumably the ones used for
 * internal (kernel) sockets; the code that picks between them is outside
 * this part of the file.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the literal prefix @x concatenated with an address-family
 * name, ending with an "AF_MAX" entry. It is meant to be used as the
 * initializer of the AF_MAX+1 sized name tables that follow. The entries
 * are presumably listed in AF_* numeric order so that a table can be
 * indexed by family - verify against the AF_* definitions when adding a
 * new family.
 */

#define _sock_locks(x)                                                  \
  x "AF_UNSPEC",        x "AF_UNIX"     ,        x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,        x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,        x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,        x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,        x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,        x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,        x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,        x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,        x "AF_LLC"      , \
  x "27"       ,        x "28"          ,        x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",        x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,        x "AF_PHONET"   , \
  x "AF_IEEE802154",        x "AF_CAIF"        ,        x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,        x "AF_KCM"      , \
  x "AF_QIPCRTR",        x "AF_SMC"        ,        x "AF_XDP"        , \
  x "AF_MAX"

/*
 * Lock-class name tables generated by _sock_locks(): every table holds
 * the same list of address-family names, each prefixed with the string
 * passed to the macro ("sk_lock-", "slock-", "clock-", ...). The tables
 * with a "k-" prefix carry the names for the kernel-socket variants.
 * Each table has AF_MAX+1 slots; the last one is the "AF_MAX" entry.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key. Each array holds
 * AF_MAX keys, one per address family.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */

/* Upper bounds applied to SO_SNDBUF / SO_RCVBUF requests in
 * sock_setsockopt(); they start out as SK_WMEM_MAX / SK_RMEM_MAX.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Default buffer sizes; note they start at the same SK_*MEM_MAX values. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key, initially false; incremented by sk_set_memalloc() and
 * decremented by sk_clear_memalloc().
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        /* Add __GFP_MEMALLOC to the socket's allocation mask. */
        sk->sk_allocation |= __GFP_MEMALLOC;
        /* Account for this socket in the memalloc_socks_key static key. */
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Undo sk_set_memalloc(): clear the %SOCK_MEMALLOC flag, remove
 * __GFP_MEMALLOC from the socket's allocation mask, drop the
 * memalloc_socks_key count and reclaim socket memory.
 */
void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation &= ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
         * progress of swapping. SOCK_MEMALLOC may be cleared while
         * it has rmem allocations due to the last swapfile being deactivated
         * but there is a risk that the socket is unusable due to exceeding
         * the rmem limits. Reclaim the reserves and obey rmem limits again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

/*
 * Run the socket's sk_backlog_rcv handler on @skb with the task's
 * memalloc "noreclaim" state saved around the call and restored afterwards.
 * Only valid for SOCK_MEMALLOC sockets.
 *
 * Returns whatever the handler returns.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        unsigned int saved_flags;
        int rc;

        /* Anything else should have been dropped before queueing. */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        saved_flags = memalloc_noreclaim_save();
        rc = sk->sk_backlog_rcv(sk, skb);
        memalloc_noreclaim_restore(saved_flags);

        return rc;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/*
 * Convert a timeout in jiffies into a timeval and store it in @optval.
 *
 * MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. The layout written
 * depends on the caller: struct __kernel_sock_timeval when @old_timeval is
 * false, otherwise struct old_timeval32 for compat syscalls without 64-bit
 * time and struct __kernel_old_timeval for everything else.
 *
 * Returns the size of the structure that was written.
 */
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
        struct __kernel_sock_timeval tv = { .tv_sec = 0, .tv_usec = 0 };

        if (timeo != MAX_SCHEDULE_TIMEOUT) {
                tv.tv_sec = timeo / HZ;
                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
        }

        if (!old_timeval) {
                *(struct __kernel_sock_timeval *)optval = tv;
                return sizeof(tv);
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };

                *(struct old_timeval32 *)optval = tv32;
                return sizeof(tv32);
        } else {
                struct __kernel_old_timeval old_tv;

                old_tv.tv_sec = tv.tv_sec;
                old_tv.tv_usec = tv.tv_usec;
                *(struct __kernel_old_timeval *)optval = old_tv;
                return sizeof(old_tv);
        }
}

/*
 * Read a timeval from @optval and store the matching timeout, in jiffies,
 * in *@timeo_p.
 *
 * The expected layout mirrors sock_get_timeout(): struct
 * __kernel_sock_timeval when @old_timeval is false, otherwise struct
 * old_timeval32 for compat syscalls without 64-bit time and struct
 * __kernel_old_timeval for everything else.
 *
 * Returns 0 on success, -EINVAL if @optlen is too small for the expected
 * structure, -EFAULT if the copy fails and -EDOM if tv_usec is outside
 * [0, USEC_PER_SEC). A negative tv_sec is accepted and stored as a timeout
 * of 0 (with a rate-limited log message); an all-zero timeval, or a value
 * too large to express in jiffies, is stored as MAX_SCHEDULE_TIMEOUT.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                            bool old_timeval)
{
        struct __kernel_sock_timeval tv;

        if (!old_timeval) {
                if (optlen < sizeof(tv))
                        return -EINVAL;
                if (copy_from_sockptr(&tv, optval, sizeof(tv)))
                        return -EFAULT;
        } else if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 compat_tv;

                if (optlen < sizeof(compat_tv))
                        return -EINVAL;

                if (copy_from_sockptr(&compat_tv, optval, sizeof(compat_tv)))
                        return -EFAULT;
                tv.tv_sec = compat_tv.tv_sec;
                tv.tv_usec = compat_tv.tv_usec;
        } else {
                struct __kernel_old_timeval native_tv;

                if (optlen < sizeof(native_tv))
                        return -EINVAL;
                if (copy_from_sockptr(&native_tv, optval, sizeof(native_tv)))
                        return -EFAULT;
                tv.tv_sec = native_tv.tv_sec;
                tv.tv_usec = native_tv.tv_usec;
        }

        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                *timeo_p = 0;
                /* Complain at most ten times, and only when rate limiting allows. */
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }

        /* Default to "no timeout"; replaced below only by a non-zero value
         * that is small enough to be expressed in jiffies.
         */
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if ((tv.tv_sec != 0 || tv.tv_usec != 0) &&
            tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
                *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);

        return 0;
}

/*
 * Returns false for AF_UNSPEC and AF_UNIX sockets and true for every
 * other address family.
 */
static bool sock_needs_netstamp(const struct sock *sk)
{
        const int family = sk->sk_family;

        return family != AF_UNSPEC && family != AF_UNIX;
}

/*
 * Clear the bits in @flags from sk->sk_flags. If any of them was set, the
 * socket's family needs net timestamps and no SK_FLAGS_TIMESTAMP bit is
 * left afterwards, call net_disable_timestamp().
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        /* None of the requested bits is set: nothing to undo. */
        if (!(sk->sk_flags & flags))
                return;

        sk->sk_flags &= ~flags;

        if (!sock_needs_netstamp(sk))
                return;
        if (sk->sk_flags & SK_FLAGS_TIMESTAMP)
                return;

        net_disable_timestamp();
}


/*
 * Append @skb to the receive queue of @sk and signal the socket.
 *
 * Returns 0 on success, -ENOMEM when the receive memory already allocated
 * has reached sk_rcvbuf, or -ENOBUFS when memory for the skb cannot be
 * scheduled. sk_drops is incremented on both failure paths; the skb itself
 * is not freed here.
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff_head *rcvq = &sk->sk_receive_queue;
        unsigned long irqflags;

        if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                atomic_inc(&sk->sk_drops);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* We are leaving the RCU protected region, so make sure we do not
         * leak a dst that holds no reference.
         */
        skb_dst_force(skb);

        /* Record the drop count and enqueue under the queue lock. */
        spin_lock_irqsave(&rcvq->lock, irqflags);
        sock_skb_set_dropcount(sk, skb);
        __skb_queue_tail(rcvq, skb);
        spin_unlock_irqrestore(&rcvq->lock, irqflags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);
        return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

/*
 * Run the socket filter on @skb and, if it accepts the packet, queue it
 * with __sock_queue_rcv_skb().
 *
 * Returns the filter's non-zero result if it rejects the packet, otherwise
 * the result of __sock_queue_rcv_skb().
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = sk_filter(sk, skb);

        return err ? err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

/*
 * Deliver @skb to @sk: process it immediately through sk_backlog_rcv()
 * when the socket is not owned by a user context, otherwise add it to
 * the socket backlog.
 *
 * @nested:     non-zero selects bh_lock_sock_nested() instead of
 *              bh_lock_sock()
 * @trim_cap:   passed through to sk_filter_trim_cap()
 * @refcounted: when true, a reference on @sk is dropped with sock_put()
 *              before returning, on every path
 *
 * Returns the result of sk_backlog_rcv() when the skb is processed
 * directly. On every other path, including those where the skb is
 * dropped and freed, the return value stays NET_RX_SUCCESS.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        int rc = NET_RX_SUCCESS;

        /* Rejected by the socket filter: free the skb. */
        if (sk_filter_trim_cap(sk, skb, trim_cap))
                goto discard_and_relse;

        skb->dev = NULL;

        /* Receive queues already full: count the drop and free the skb. */
        if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
                /* Backlog refused the skb: unlock here, then drop it. */
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        if (refcounted)
                sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

/*
 * Return the dst obtained from __sk_dst_get(), unless it is marked
 * obsolete and its ->check() callback returns NULL for @cookie. In that
 * case the socket's tx queue mapping, pending-confirm flag and dst cache
 * pointer are cleared, the dst is released and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        /* Nothing cached, or the entry is not marked obsolete. */
        if (!dst || !dst->obsolete)
                return dst;

        /* Obsolete, but the owner still considers it usable. */
        if (dst->ops->check(dst, cookie) != NULL)
                return dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);

/*
 * Return the dst obtained from sk_dst_get(), unless it is marked obsolete
 * and its ->check() callback returns NULL for @cookie. In that case the
 * socket's dst is reset with sk_dst_reset(), the dst obtained here is
 * released and NULL is returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        /* Nothing cached, or the entry is not marked obsolete. */
        if (!dst || !dst->obsolete)
                return dst;

        /* Obsolete, but the owner still considers it usable. */
        if (dst->ops->check(dst, cookie) != NULL)
                return dst;

        sk_dst_reset(sk);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 * Bind @sk to the interface with index @ifindex. Locking is left to the
 * caller; sock_bindtoindex() wraps this in lock_sock()/release_sock() on
 * request.
 *
 * Returns 0 on success, -EPERM when the socket is already bound to a
 * device and the caller lacks CAP_NET_RAW in the socket's network
 * namespace, -EINVAL for a negative @ifindex, and -ENOPROTOOPT when the
 * kernel is built without CONFIG_NETDEVICES.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);

        /* Sorry... changing an existing binding needs CAP_NET_RAW. */
        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;

        if (ifindex < 0)
                return -EINVAL;

        sk->sk_bound_dev_if = ifindex;
        if (sk->sk_prot->rehash)
                sk->sk_prot->rehash(sk);
        sk_dst_reset(sk);

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Bind @sk to interface index @ifindex via sock_bindtoindex_locked().
 * When @lock_sk is true the socket lock is taken around the call;
 * otherwise the call is made without touching the lock.
 *
 * Returns the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
        int err;

        if (!lock_sk)
                return sock_bindtoindex_locked(sk, ifindex);

        lock_sock(sk);
        err = sock_bindtoindex_locked(sk, ifindex);
        release_sock(sk);

        return err;
}
EXPORT_SYMBOL(sock_bindtoindex);

/*
 * SO_BINDTODEVICE setter: bind @sk to the device named in @optval.
 *
 * At most IFNAMSIZ - 1 bytes of the name are used. An empty name, or an
 * option length of zero, binds to index 0, i.e. removes the binding.
 *
 * Returns the result of sock_bindtoindex() on success paths, -EINVAL for
 * a negative @optlen, -EFAULT if the name cannot be copied, -ENODEV if no
 * device with that name exists in the socket's network namespace, and
 * -ENOPROTOOPT when the kernel is built without CONFIG_NETDEVICES.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index = 0;

        if (optlen < 0)
                return -EINVAL;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        /* Zero first so the copied name is always NUL terminated. */
        memset(devname, 0, sizeof(devname));

        if (copy_from_sockptr(devname, optval, optlen))
                return -EFAULT;

        if (devname[0] != '\0') {
                struct net_device *dev;

                /* Only the index is kept once the RCU section ends. */
                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();

                if (!dev)
                        return -ENODEV;
        }

        return sock_bindtoindex(sk, index, true);
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * SO_BINDTODEVICE getter: report the name of the device @sk is bound to.
 *
 * @optval: destination for the NUL terminated device name
 * @optlen: destination for the number of bytes written to @optval
 * @len:    size of the buffer behind @optval
 *
 * An unbound socket reports a length of 0 and leaves @optval untouched.
 *
 * Returns 0 on success, -EINVAL if @len is smaller than IFNAMSIZ, the
 * error from netdev_get_name() if the index cannot be resolved, -EFAULT
 * if a copy to the caller fails, and -ENOPROTOOPT when the kernel is
 * built without CONFIG_NETDEVICES.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
                                sockptr_t optlen, int len)
{
        int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
        /* Read the bound index exactly once. This function takes no lock,
         * so the binding may change underneath us; a single snapshot keeps
         * the "unbound" check and the name lookup consistent instead of
         * letting a concurrent unbind turn into a spurious lookup failure.
         */
        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];

        if (bound_dev_if == 0) {
                len = 0;
                goto zero;
        }

        ret = -EINVAL;
        if (len < IFNAMSIZ)
                goto out;

        ret = netdev_get_name(net, devname, bound_dev_if);
        if (ret)
                goto out;

        /* Report the name including its terminating NUL. */
        len = strlen(devname) + 1;

        ret = -EFAULT;
        if (copy_to_sockptr(optval, devname, len))
                goto out;

zero:
        ret = -EFAULT;
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                goto out;

        ret = 0;

out:
#endif

        return ret;
}

/*
 * Returns false whenever dev_recursion_level() is non-zero. Otherwise
 * returns true for a NULL @sk, and the mc_loop setting of the socket for
 * AF_INET (and, with CONFIG_IPV6, AF_INET6) sockets. Any other family
 * triggers a one-time warning and returns true.
 */
bool sk_mc_loop(struct sock *sk)
{
        int family;

        if (dev_recursion_level())
                return false;
        if (!sk)
                return true;

        /* IPV6_ADDRFORM can change sk->sk_family under us. */
        family = READ_ONCE(sk->sk_family);

        if (family == AF_INET)
                return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                return inet6_sk(sk)->mc_loop;
#endif
        WARN_ON_ONCE(1);
        return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/* Set sk->sk_reuse to SK_CAN_REUSE while holding the socket lock; this is
 * the same value the SO_REUSEADDR case of sock_setsockopt() stores for a
 * non-zero argument.
 */
void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Set sk->sk_reuseport to true while holding the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

/* Enable SOCK_LINGER with a linger time of zero, under the socket lock. */
void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_lingertime = 0;
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

/* Store @priority in sk->sk_priority while holding the socket lock. Unlike
 * the SO_PRIORITY case of sock_setsockopt(), no range or capability check
 * is applied here.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
        lock_sock(sk);
        sk->sk_priority = priority;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

/*
 * Set the send timeout of @sk to @secs seconds, under the socket lock.
 * A value of zero, or one too large to express in jiffies, selects
 * MAX_SCHEDULE_TIMEOUT instead.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
        lock_sock(sk);
        if (!secs || secs >= MAX_SCHEDULE_TIMEOUT / HZ - 1)
                sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
        else
                sk->sk_sndtimeo = secs * HZ;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

/*
 * Enable or disable receive timestamps on @sk.
 *
 * @val: true to enable, false to disable
 * @new: value for SOCK_TSTAMP_NEW (only applied when enabling)
 * @ns:  value for SOCK_RCVTSTAMPNS (only applied when enabling)
 *
 * Disabling clears SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS and leaves
 * SOCK_TSTAMP_NEW as it was.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
        if (!val) {
                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                return;
        }

        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
        sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
        sock_set_flag(sk, SOCK_RCVTSTAMP);
        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}

/* Enable receive timestamps on @sk under the socket lock, using the same
 * arguments as the SO_TIMESTAMPNS_OLD case of sock_setsockopt():
 * SOCK_TSTAMP_NEW off, SOCK_RCVTSTAMPNS on.
 */
void sock_enable_timestamps(struct sock *sk)
{
        lock_sock(sk);
        __sock_set_timestamps(sk, true, false, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

/* Turn keepalive on for @sk under the socket lock: call the protocol's
 * ->keepalive() hook if it provides one, then set SOCK_KEEPOPEN. This
 * mirrors the SO_KEEPALIVE case of sock_setsockopt() with a true value.
 */
void sock_set_keepalive(struct sock *sk)
{
        lock_sock(sk);
        if (sk->sk_prot->keepalive)
                sk->sk_prot->keepalive(sk, true);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

/*
 * Apply a requested receive buffer size of @val bytes to @sk and mark the
 * size as user-set (SOCK_RCVBUF_LOCK). The stored value is twice the
 * request, but never less than SOCK_MIN_RCVBUF. Locking is left to the
 * caller.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
        /* No negative values: val is multiplied by 2 below, and a large
         * negative request would overflow and could end up as a huge
         * positive buffer size instead of the minimum.
         */
        if (val < 0)
                val = 0;

        /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
         * as a negative value.
         */
        val = min_t(int, val, INT_MAX / 2);
        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

        /* We double it on the way in to account for "struct sk_buff" etc.
         * overhead.   Applications assume that the SO_RCVBUF setting they make
         * will allow that much actual data to be received on that socket.
         *
         * Applications are unaware that "struct sk_buff" and other overheads
         * allocate from the receive buffer during socket buffer allocation.
         *
         * And after considering the possible alternatives, returning the value
         * we actually used in getsockopt is the most desirable behavior.
         */
        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

/* Apply a receive buffer size request of @val bytes to @sk via
 * __sock_set_rcvbuf(), holding the socket lock around the call. No
 * sysctl_rmem_max cap is applied here, unlike the SO_RCVBUF case of
 * sock_setsockopt().
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

/*
 * Update sk->sk_mark to @val. The socket's dst is reset only when the
 * mark actually changes. Locking is left to the caller.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
        /* Same mark as before: keep the dst. */
        if (sk->sk_mark == val)
                return;

        sk->sk_mark = val;
        sk_dst_reset(sk);
}

/* Set the mark of @sk to @val via __sock_set_mark(), holding the socket
 * lock around the call.
 */
void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 *        This is meant for all protocols to use and covers goings on
 *        at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
{
        struct sock_txtime sk_txtime;
        struct sock *sk = sock->sk;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *        Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) ||
                    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;        /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff)
                        sock_reset_flag(sk, SOCK_LINGER);
                else {
#if (BITS_PER_LONG == 32)
                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                        else
#endif
                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                break;

        case SO_PASSCRED:
                if (valbool)
                        set_bit(SOCK_PASSCRED, &sock->flags);
                else
                        clear_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_TIMESTAMP_OLD:
                __sock_set_timestamps(sk, valbool, false, false);
                break;
        case SO_TIMESTAMP_NEW:
                __sock_set_timestamps(sk, valbool, true, false);
                break;
        case SO_TIMESTAMPNS_OLD:
                __sock_set_timestamps(sk, valbool, false, true);
                break;
        case SO_TIMESTAMPNS_NEW:
                __sock_set_timestamps(sk, valbool, true, true);
                break;
        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                if (val & ~SOF_TIMESTAMPING_MASK) {
                        ret = -EINVAL;
                        break;
                }

                if (val & SOF_TIMESTAMPING_OPT_ID &&
                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                        if (sk->sk_protocol == IPPROTO_TCP &&
                            sk->sk_type == SOCK_STREAM) {
                                if ((1 << sk->sk_state) &
                                    (TCPF_CLOSE | TCPF_LISTEN)) {
                                        ret = -EINVAL;
                                        break;
                                }
                                sk->sk_tskey = tcp_sk(sk)->snd_una;
                        } else {
                                sk->sk_tskey = 0;
                        }
                }

                if (val & SOF_TIMESTAMPING_OPT_STATS &&
                    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
                        ret = -EINVAL;
                        break;
                }

                sk->sk_tsflags = val;
                sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

                if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                        sock_enable_timestamp(sk,
                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
                else
                        sock_disable_timestamp(sk,
                                               (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
                break;

        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
                if (sock->ops->set_rcvlowat)
                        ret = sock->ops->set_rcvlowat(sk, val);
                else
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_PASSSEC:
                if (valbool)
                        set_bit(SOCK_PASSSEC, &sock->flags);
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_PEEK_OFF:
                if (sock->ops->set_peek_off)
                        ret = sock->ops->set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                /* allow unprivileged users to decrease the value */
                if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else {
                        if (val < 0)
                                ret = -EINVAL;
                        else
                                WRITE_ONCE(sk->sk_ll_usec, val);
                }
                break;
#endif

        case SO_MAX_PACING_RATE:
                {
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        ret = -EFAULT;
                        break;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
                break;
                }
        case SO_INCOMING_CPU:
                WRITE_ONCE(sk->sk_incoming_cpu, val);
                break;

        case SO_CNX_ADVICE:
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!((sk->sk_type == SOCK_STREAM &&
                               sk->sk_protocol == IPPROTO_TCP) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -ENOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -ENOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(sock_setsockopt);

/* Fetch the peer credentials of @sk with a reference taken via get_cred().
 * The pointer is sampled while sk_peer_lock is held; the caller owns the
 * returned reference and is expected to release it with put_cred().
 */
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
        const struct cred *peer;

        spin_lock(&sk->sk_peer_lock);
        peer = get_cred(sk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);
        return peer;
}

/* Fill @ucred from @pid and @cred.
 *
 * The pid is translated with pid_vnr(). When @cred is NULL both uid and gid
 * are reported as -1; otherwise the effective uid/gid are mapped into the
 * current user namespace with the *_munged() helpers.
 */
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                          struct ucred *ucred)
{
        struct user_namespace *ns;

        ucred->pid = pid_vnr(pid);

        if (!cred) {
                ucred->gid = -1;
                ucred->uid = -1;
                return;
        }

        ns = current_user_ns();
        ucred->uid = from_kuid_munged(ns, cred->euid);
        ucred->gid = from_kgid_munged(ns, cred->egid);
}

/* Copy every group id in @src to @dst as an array of gid_t, each one mapped
 * into the current user namespace first.
 *
 * Returns 0 on success or -EFAULT as soon as one element cannot be copied.
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
        struct user_namespace *ns = current_user_ns();
        int idx = 0;

        while (idx < src->ngroups) {
                gid_t mapped = from_kgid_munged(ns, src->gid[idx]);

                /* Element idx lands at byte offset idx * sizeof(gid_t). */
                if (copy_to_sockptr_offset(dst, idx * sizeof(mapped),
                                           &mapped, sizeof(mapped)))
                        return -EFAULT;
                idx++;
        }

        return 0;
}

/*
 * sk_getsockopt - read a socket-level option of @sk
 * @sk: socket being queried
 * @level: option level (not examined by this function)
 * @optname: SO_* option to read
 * @optval: destination buffer for the option value
 * @optlen: in: size of @optval; out: number of bytes reported
 *
 * Most options are staged in the zero-initialised union @v and copied out
 * at the bottom, truncated to the smaller of the caller's length and the
 * option's natural size @lv. Options that produce their own buffers copy
 * the data themselves and jump to "lenout", which only writes back the
 * length. A few options delegate completely and return the helper's result.
 *
 * Returns 0 on success or a negative errno (-EFAULT, -EINVAL, -ENODATA,
 * -ERANGE, -ENOTCONN, -EOPNOTSUPP, -ENOPROTOOPT, or whatever a delegated
 * helper returns).
 */
static int sk_getsockopt(struct sock *sk, int level, int optname,
                         sockptr_t optval, sockptr_t optlen)
{
        struct socket *sock = sk->sk_socket;

        union {
                int val;
                u64 val64;
                unsigned long ulval;
                struct linger ling;
                struct old_timeval32 tm32;
                struct __kernel_old_timeval tm;
                struct  __kernel_sock_timeval stm;
                struct sock_txtime txtime;
        } v;

        /* Natural size of the value being returned; int unless overridden. */
        int lv = sizeof(int);
        int len;

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Start from zero so bytes an option does not set are never leaked. */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = READ_ONCE(sk->sk_sndbuf);
                break;

        case SO_RCVBUF:
                v.val = READ_ONCE(sk->sk_rcvbuf);
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Report the pending error; if there is none, fetch and
                 * clear the soft error instead.
                 */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv = sizeof(v.ling);
                v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
                /* sk_lingertime is divided by HZ to report seconds. */
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Nothing to report: v.val stays 0. */
                break;

        case SO_TIMESTAMP_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMP_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPNS_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                /* The set path accepts SO_TIMESTAMPING_NEW, so it has to be
                 * readable as well instead of failing with -ENOPROTOOPT.
                 * For the _NEW name, only report the flags when
                 * SOCK_TSTAMP_NEW is set; otherwise v.val keeps the zero
                 * from the memset() above. The _OLD name keeps returning
                 * the flags unconditionally, exactly as before.
                 */
                if (optname == SO_TIMESTAMPING_OLD ||
                    sock_flag(sk, SOCK_TSTAMP_NEW))
                        v.val = sk->sk_tsflags;
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                /* sock_get_timeout() fills v and returns the size it used. */
                lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
                break;

        case SO_RCVLOWAT:
                v.val = READ_ONCE(sk->sk_rcvlowat);
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);

                /* Peer pid/cred are sampled under sk_peer_lock. */
                spin_lock(&sk->sk_peer_lock);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                spin_unlock(&sk->sk_peer_lock);

                if (copy_to_sockptr(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERGROUPS:
        {
                const struct cred *cred;
                int ret, n;

                cred = sk_get_peer_cred(sk);
                if (!cred)
                        return -ENODATA;

                n = cred->group_info->ngroups;
                if (len < n * sizeof(gid_t)) {
                        /* Buffer too small: report the required size and
                         * fail with -ERANGE (or -EFAULT if even the size
                         * cannot be written back).
                         */
                        len = n * sizeof(gid_t);
                        put_cred(cred);
                        return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
                }
                len = n * sizeof(gid_t);

                ret = groups_to_user(optval, cred->group_info);
                put_cred(cred);
                if (ret)
                        return ret;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                char address[128];

                lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
                if (lv < 0)
                        return -ENOTCONN;
                /* The caller may not ask for more than getname() produced. */
                if (lv < len)
                        return -EINVAL;
                if (copy_to_sockptr(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                /* Fully delegated, including the length write-back. */
                return security_socket_getpeersec_stream(sock,
                                                         optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Only readable when the protocol provides set_peek_off. */
                if (!sock->ops->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = READ_ONCE(sk->sk_peek_off);
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                /* Fully delegated, including the length write-back. */
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                /* sk_get_filter() returns the length to report, or an error. */
                len = sk_get_filter(sk, optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_BPF_EXTENSIONS:
                v.val = bpf_tell_extensions();
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_ll_usec);
                break;
#endif

        case SO_MAX_PACING_RATE:
                /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
                        /* long is wider than int and the buffer can hold it:
                         * return the full-width rate.
                         */
                        lv = sizeof(v.ulval);
                        v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
                } else {
                        /* 32bit version */
                        v.val = min_t(unsigned long, ~0U,
                                      READ_ONCE(sk->sk_max_pacing_rate));
                }
                break;

        case SO_INCOMING_CPU:
                v.val = READ_ONCE(sk->sk_incoming_cpu);
                break;

        case SO_MEMINFO:
        {
                u32 meminfo[SK_MEMINFO_VARS];

                sk_get_meminfo(sk, meminfo);

                len = min_t(unsigned int, len, sizeof(meminfo));
                if (copy_to_sockptr(optval, &meminfo, len))
                        return -EFAULT;

                goto lenout;
        }

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_INCOMING_NAPI_ID:
                v.val = READ_ONCE(sk->sk_napi_id);

                /* aggregate non-NAPI IDs down to 0 */
                if (v.val < MIN_NAPI_ID)
                        v.val = 0;

                break;
#endif

        case SO_COOKIE:
                /* The cookie is 64 bits wide; shorter buffers are refused. */
                lv = sizeof(u64);
                if (len < lv)
                        return -EINVAL;
                v.val64 = sock_gen_cookie(sk);
                break;

        case SO_ZEROCOPY:
                v.val = sock_flag(sk, SOCK_ZEROCOPY);
                break;

        case SO_TXTIME:
                lv = sizeof(v.txtime);
                v.txtime.clockid = sk->sk_clockid;
                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
                                  SOF_TXTIME_DEADLINE_MODE : 0;
                v.txtime.flags |= sk->sk_txtime_report_errors ?
                                  SOF_TXTIME_REPORT_ERRORS : 0;
                break;

        case SO_BINDTOIFINDEX:
                v.val = sk->sk_bound_dev_if;
                break;

        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
                 */
                return -ENOPROTOOPT;
        }

        /* Common exit: never copy more than the option's natural size. */
        if (len > lv)
                len = lv;
        if (copy_to_sockptr(optval, &v, len))
                return -EFAULT;
lenout:
        /* Tell the caller how many bytes were actually reported. */
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        return 0;
}

/* User-pointer entry point for socket-level getsockopt: wraps @optval and
 * @optlen as user sockptrs and hands the request to sk_getsockopt() for
 * the sock behind @sock. Returns whatever sk_getsockopt() returns.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        sockptr_t user_val = USER_SOCKPTR(optval);
        sockptr_t user_len = USER_SOCKPTR(optlen);

        return sk_getsockopt(sock->sk, level, optname, user_val, user_len);
}

/*
 * Set up sk_lock for a socket.
 *
 * The lock is also registered with the lock validator, using per-family
 * key/name tables. Kernel sockets (sk_kern_sock set) use the separate
 * "kern" tables; all other sockets use the regular ones.
 */
static inline void sock_lock_init(struct sock *sk)
{
        sk_owner_clear(sk);

        if (!sk->sk_kern_sock) {
                sock_lock_init_class_and_name(
                        sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
                return;
        }

        sock_lock_init_class_and_name(
                sk,
                af_family_kern_slock_key_strings[sk->sk_family],
                af_family_kern_slock_keys + sk->sk_family,
                af_family_kern_key_strings[sk->sk_family],
                af_family_kern_keys + sk->sk_family);
}

/*
 * Copy the contents of osk into nsk, skipping the region between
 * sk_dontcopy_begin and sk_dontcopy_end. nsk->sk_refcnt must not change,
 * even temporarily, because of RCU lookups, and sk_node is left as is too.
 *
 * With CONFIG_SECURITY_NETWORK, nsk's own sk_security pointer is saved
 * before the copy and put back afterwards, then security_sk_clone() is
 * called for the pair.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
        const struct proto *prot = READ_ONCE(osk->sk_prot);
        const size_t head_len = offsetof(struct sock, sk_dontcopy_begin);
        const size_t tail_off = offsetof(struct sock, sk_dontcopy_end);
#ifdef CONFIG_SECURITY_NETWORK
        void *saved_security = nsk->sk_security;
#endif

        /* Everything in front of the protected region. */
        memcpy(nsk, osk, head_len);

        /* Everything after it, up to the protocol's full object size. */
        memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
               prot->obj_size - tail_off);

#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = saved_security;
        security_sk_clone(osk, nsk);
#endif
}

/* Allocate the memory for a sock of protocol @prot.
 *
 * The object comes from the protocol's slab cache when it has one, and from
 * kmalloc() otherwise. On the slab path __GFP_ZERO is masked out of the
 * allocation and sk_prot_clear_nulls() is used instead when
 * want_init_on_alloc() asks for initialisation. After allocation the
 * security blob is set up and a reference on the protocol's owner module is
 * taken.
 *
 * Returns the new sock, or NULL if any step fails (everything acquired so
 * far is released again).
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct kmem_cache *cache = prot->slab;
        struct sock *sk;

        if (cache) {
                sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
                if (sk && want_init_on_alloc(priority))
                        sk_prot_clear_nulls(sk, prot->obj_size);
        } else {
                sk = kmalloc(prot->obj_size, priority);
        }

        if (!sk)
                return NULL;

        if (security_sk_alloc(sk, family, priority))
                goto free_object;

        if (!try_module_get(prot->owner))
                goto free_security;

        sk_tx_queue_clear(sk);
        return sk;

free_security:
        security_sk_free(sk);
free_object:
        if (cache)
                kmem_cache_free(cache, sk);
        else
                kfree(sk);
        return NULL;
}

/* Release a sock obtained for protocol @prot.
 *
 * Drops the cgroup, memcg and security state and the owner tracking, frees
 * the object back to the protocol's slab cache (or with kfree() when there
 * is none), and finally puts the reference on the protocol's owner module.
 * The owner and slab pointers are read from @prot up front.
 */
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct module *owner = prot->owner;
        struct kmem_cache *cache = prot->slab;

        cgroup_sk_free(&sk->sk_cgrp_data);
        mem_cgroup_sk_free(sk);
        security_sk_free(sk);

        sk_owner_put(sk);

        if (!cache)
                kfree(sk);
        else
                kmem_cache_free(cache, sk);

        module_put(owner);
}

/**
 *        sk_alloc - allocate and minimally initialise a socket object
 *        @net: the applicable net namespace
 *        @family: protocol family
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *        @prot: struct proto associated with this new sock instance
 *        @kern: is this to be a kernel socket?
 *
 *        The object is requested zeroed. Non-kernel sockets take a
 *        reference on @net and are counted as in use; kernel sockets
 *        do neither.
 *
 *        Return: the new sock, or %NULL if the allocation failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern)
{
        struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

        if (!sk)
                return NULL;

        sk->sk_family = family;

        /*
         * The comment in the struct sock definition explains why
         * sk_prot_creator is kept alongside sk_prot -acme
         */
        sk->sk_prot_creator = prot;
        sk->sk_prot = prot;

        sk->sk_kern_sock = kern;
        sock_lock_init(sk);

        sk->sk_net_refcnt = kern ? 0 : 1;
        if (likely(sk->sk_net_refcnt)) {
                get_net(net);
                sock_inuse_add(net, 1);
        }
        sock_net_set(sk, net);

        refcount_set(&sk->sk_wmem_alloc, 1);

        mem_cgroup_sk_alloc(sk);
        cgroup_sk_alloc(&sk->sk_cgrp_data);
        sock_update_classid(&sk->sk_cgrp_data);
        sock_update_netprioidx(&sk->sk_cgrp_data);
        sk_tx_queue_clear(sk);

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Final teardown of a sock, reached through its embedded sk_rcu head.
 *
 * Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 * sk_destruct() also calls it directly when no deferral is needed.
 */
static void __sk_destruct(struct rcu_head *head)
{
        /* Recover the sock that embeds this rcu_head. */
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct sk_filter *filter;

        /* Run the optional sk_destruct callback first. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /* Detach and uncharge any attached filter. The dereference is
         * checked against sk_wmem_alloc having dropped to zero.
         */
        filter = rcu_dereference_check(sk->sk_filter,
                                       refcount_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
        bpf_sk_storage_free(sk);
#endif

        /* A non-zero sk_omem_alloc at this point is only reported. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the page held in sk_frag, if any. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
        put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);

        /* Release the netns reference if this sock holds one, then free
         * the object through the protocol recorded in sk_prot_creator.
         */
        if (likely(sk->sk_net_refcnt))
                put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
}

/* Destroy @sk, either right away or through call_rcu().
 *
 * The RCU path is taken when SOCK_RCU_FREE is set, or when the socket
 * still has a reuseport callback attached (it is detached here first).
 */
void sk_destruct(struct sock *sk)
{
        bool defer_to_rcu = sock_flag(sk, SOCK_RCU_FREE);

        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
                reuseport_detach_sock(sk);
                defer_to_rcu = true;
        }

        if (!defer_to_rcu) {
                __sk_destruct(&sk->sk_rcu);
                return;
        }

        call_rcu(&sk->sk_rcu, __sk_destruct);
}

/* Called once sk_wmem_alloc has dropped to zero.
 *
 * Sockets holding a netns reference are removed from the in-use count, and
 * if sock_diag reports destroy listeners for them they are handed to
 * sock_diag_broadcast_destroy(). Every other socket goes straight to
 * sk_destruct().
 */
static void __sk_free(struct sock *sk)
{
        if (likely(sk->sk_net_refcnt)) {
                sock_inuse_add(sock_net(sk), -1);

                if (unlikely(sock_diag_has_destroy_listeners(sk))) {
                        sock_diag_broadcast_destroy(sk);
                        return;
                }
        }

        sk_destruct(sk);
}

/* Drop one count from sk_wmem_alloc and free @sk when it reaches zero. */
void sk_free(struct sock *sk)
{
        /*
         * A non-zero result after the decrement means some packets are
         * still sitting in a tx queue; in that case sock_wfree() will
         * call __sk_free(sk) later.
         */
        if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/* Initialise the receive, write and error queues and the callback lock of
 * @sk, and give each of those locks its per-address-family lockdep class
 * and name.
 */
static void sk_init_common(struct sock *sk)
{
        const int family = sk->sk_family;

        skb_queue_head_init(&sk->sk_receive_queue);
        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
                                   af_rlock_keys + family,
                                   af_family_rlock_key_strings[family]);

        skb_queue_head_init(&sk->sk_write_queue);
        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
                                   af_wlock_keys + family,
                                   af_family_wlock_key_strings[family]);

        skb_queue_head_init(&sk->sk_error_queue);
        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
                                   af_elock_keys + family,
                                   af_family_elock_key_strings[family]);

        rwlock_init(&sk->sk_callback_lock);
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                                   af_callback_keys + family,
                                   af_family_clock_key_strings[family]);
}

/**
 *        sk_clone_lock - clone a socket, and lock its clone
 *        @sk: the socket to clone
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *        The clone starts as a raw copy of @sk (sock_copy()); the fields
 *        that must not be shared with the parent (queues, locks, memory
 *        accounting, filter charge, error state, ...) are then reset.
 *
 *        On success the clone is returned with bh_lock_sock() held and
 *        sk_refcnt set to 2.  Caller must unlock socket even in error
 *        path (bh_unlock_sock(newsk)).
 *
 *        Return: the locked clone, or %NULL if allocation, filter
 *        charging, xfrm policy cloning or bpf sk storage cloning failed.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct proto *prot = READ_ONCE(sk->sk_prot);
        struct sk_filter *filter;
        bool is_charged = true;
        struct sock *newsk;

        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
        if (!newsk)
                goto out;

        sock_copy(newsk, sk);

        newsk->sk_prot_creator = prot;

        /* The clone takes its own netns reference and in-use count. */
        if (likely(newsk->sk_net_refcnt)) {
                get_net(sock_net(newsk));
                sock_inuse_add(sock_net(newsk), 1);
        }
        sk_node_init(&newsk->sk_node);
        sock_lock_init(newsk);
        /* Held until the caller (or sk_free_unlock_clone()) unlocks. */
        bh_lock_sock(newsk);
        newsk->sk_backlog.head        = newsk->sk_backlog.tail = NULL;
        newsk->sk_backlog.len = 0;

        atomic_set(&newsk->sk_rmem_alloc, 0);

        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
        refcount_set(&newsk->sk_wmem_alloc, 1);

        atomic_set(&newsk->sk_omem_alloc, 0);
        sk_init_common(newsk);

        newsk->sk_dst_cache        = NULL;
        newsk->sk_dst_pending_confirm = 0;
        newsk->sk_wmem_queued        = 0;
        newsk->sk_forward_alloc = 0;
        atomic_set(&newsk->sk_drops, 0);
        newsk->sk_send_head        = NULL;
        /* Inherit the parent's user locks, except the bind-port lock. */
        newsk->sk_userlocks        = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
        atomic_set(&newsk->sk_zckey, 0);

        sock_reset_flag(newsk, SOCK_DONE);

        /* sk->sk_memcg will be populated at accept() time */
        newsk->sk_memcg = NULL;

        cgroup_sk_clone(&newsk->sk_cgrp_data);

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                /* though it's an empty new sock, the charging may fail
                 * if sysctl_optmem_max was changed between creation of
                 * original socket and cloning
                 */
                is_charged = sk_filter_charge(newsk, filter);
        RCU_INIT_POINTER(newsk->sk_filter, filter);
        rcu_read_unlock();

        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                /* We need to make sure that we don't uncharge the new
                 * socket if we couldn't charge it in the first place
                 * as otherwise we uncharge the parent's filter.
                 */
                if (!is_charged)
                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }
        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

        if (bpf_sk_storage_clone(sk, newsk)) {
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }

        /* Clear sk_user_data if parent had the pointer tagged
         * as not suitable for copying when cloning.
         */
        if (sk_user_data_is_nocopy(newsk))
                newsk->sk_user_data = NULL;

        newsk->sk_err           = 0;
        newsk->sk_err_soft = 0;
        newsk->sk_priority = 0;
        newsk->sk_incoming_cpu = raw_smp_processor_id();

        /* Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&newsk->sk_refcnt, 2);

        /* Increment the counter in the same struct proto as the master
         * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
         * is the same as sk->sk_prot->socks, as this field was copied
         * with memcpy).
         *
         * This _changes_ the previous behaviour, where
         * tcp_create_openreq_child always was incrementing the
         * equivalent to tcp_prot->socks (inet_sock_nr), so this has
         * to be taken into account in all callers. -acme
         */
        sk_refcnt_debug_inc(newsk);
        /* The clone is not attached to any struct socket or wait queue yet. */
        sk_set_socket(newsk, NULL);
        sk_tx_queue_clear(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, NULL);

        if (newsk->sk_prot->sockets_allocated)
                sk_sockets_allocated_inc(newsk);

        /* Timestamp flags were copied from the parent; account for them. */
        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

/*
 * Unlock and free a clone that sk_clone_lock() (or a caller of it) has
 * decided to abandon.  @sk must still hold the bh lock taken while
 * cloning.
 */
void sk_free_unlock_clone(struct sock *sk)
{
        /* It is still a raw copy of the parent, so invalidate the
         * destructor and do a plain sk_free().
         */
        sk->sk_destruct = NULL;
        bh_unlock_sock(sk);
        sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

/*
 * Derive the socket's route capabilities from the output device of @dst
 * and install @dst as the socket's destination.
 *
 * The capability mask is the device features plus the socket's forced
 * caps, extended with software GSO when GSO is present, and with the
 * socket's "nocaps" removed.  When the socket can do GSO, either GSO is
 * masked out (the dst carries an extra header that xfrm cannot offload)
 * or SG/HW checksum are enabled and the GSO size/segment limits are
 * taken from the device.  sk_gso_max_segs is at least 1.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        u32 max_segs = 1;

        sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        sk->sk_route_caps &= ~sk->sk_route_nocaps;

        if (sk_can_gso(sk)) {
                if (!dst->header_len || xfrm_dst_offload_ok(dst)) {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
                        max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
                } else {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                }
        }

        sk->sk_gso_max_segs = max_segs;
        sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *        Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Returns the skb's truesize to sk_wmem_alloc.  Unless the socket uses
 * SOCK_USE_WRITE_QUEUE, sk_write_space() is called in between, with one
 * unit of sk_wmem_alloc kept back so the socket stays alive across the
 * callback.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int pending = skb->truesize;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /*
                 * Release everything but one unit; that last unit is
                 * dropped below, after sk_write_space() has run.
                 */
                WARN_ON(refcount_sub_and_test(pending - 1, &sk->sk_wmem_alloc));
                sk->sk_write_space(sk);
                pending = 1;
        }

        /*
         * If sk_wmem_alloc reaches 0, finish what sk_free() could not
         * do because packets were still in flight.
         */
        if (!refcount_sub_and_test(pending, &sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* Variant of sock_wfree() used by TCP, since it sets
 * SOCK_USE_WRITE_QUEUE: the skb's truesize is returned to
 * sk_wmem_alloc without calling sk_write_space(), and the socket is
 * freed if that was the last reference.
 */
void __sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}

/*
 * Make @sk the owner of @skb on the write path.
 *
 * Any previous owner is dropped first (skb_orphan()).  For a full socket
 * the skb's truesize is charged to sk_wmem_alloc and sock_wfree() is
 * installed as destructor.  With CONFIG_INET, a non-full socket instead
 * gets a plain reference (sock_hold()) released by sock_edemux().
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
#ifdef CONFIG_INET
        if (unlikely(!sk_fullsock(sk))) {
                skb->destructor = sock_edemux;
                sock_hold(sk);
                return;
        }
#endif
        skb->destructor = sock_wfree;
        skb_set_hash_from_sk(skb, sk);
        /*
         * We used to take a refcount on sk, but the following operation
         * is enough to guarantee sk_free() won't free this sock until
         * all in-flight packets are completed
         */
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
        /* Drivers depend on in-order delivery for crypto offload,
         * partial orphan breaks out-of-order-OK logic.
         */
        if (skb->decrypted)
                return false;
#endif
        return (skb->destructor == sock_wfree ||
                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* Helper used by netem, which can hold packets in its delay queue.
 * The owner socket should be allowed to send more packets, as if these
 * had already been TX completed by a typical driver, while skb->sk stays
 * set because some packet schedulers rely on it (sch_fq for example).
 *
 * TCP pure ACKs are left untouched.  Other skbs are re-owned through
 * skb_set_owner_sk_safe() when eligible; if they are not eligible or
 * that call fails, they are fully orphaned.
 */
void skb_orphan_partial(struct sk_buff *skb)
{
        if (skb_is_tcp_pure_ack(skb))
                return;

        if (!can_skb_orphan_partial(skb) ||
            !skb_set_owner_sk_safe(skb, skb->sk))
                skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 *
 * Gives the skb's truesize back to sk_rmem_alloc and uncharges the same
 * amount from the socket's memory accounting.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int bytes = skb->truesize;

        atomic_sub(bytes, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, bytes);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * Only drops the socket reference held through skb->sk; no memory
 * accounting is touched here.
 */
void sock_efree(struct sk_buff *skb)
{
        sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for the prefetch/receive path, where a reference
 * count may not be held (e.g. for listen sockets): the socket is only
 * put when sk_is_refcounted() says a reference was taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        if (!sk_is_refcounted(skb->sk))
                return;

        sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

/*
 * Return the owner uid of the inode behind @sk's struct socket, or
 * GLOBAL_ROOT_UID when the sock is not attached to a socket.  The lookup
 * is done under sk_callback_lock (read side, BH disabled).
 */
kuid_t sock_i_uid(struct sock *sk)
{
        kuid_t uid = GLOBAL_ROOT_UID;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket)
                uid = SOCK_INODE(sk->sk_socket)->i_uid;
        read_unlock_bh(&sk->sk_callback_lock);

        return uid;
}
EXPORT_SYMBOL(sock_i_uid);

/*
 * Return the inode number behind @sk's struct socket, or 0 when the sock
 * is not attached to a socket.  Takes sk_callback_lock for reading
 * without touching BH state; sock_i_ino() is the BH-disabling wrapper.
 */
unsigned long __sock_i_ino(struct sock *sk)
{
        unsigned long ino = 0;

        read_lock(&sk->sk_callback_lock);
        if (sk->sk_socket)
                ino = SOCK_INODE(sk->sk_socket)->i_ino;
        read_unlock(&sk->sk_callback_lock);

        return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

/*
 * Wrapper around __sock_i_ino() that disables bottom halves for the
 * duration of the lookup.  Returns whatever __sock_i_ino() returns.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        local_bh_disable();
        ino = __sock_i_ino(sk);
        local_bh_enable();
        return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 *
 * The allocation is attempted when @force is set or when sk_wmem_alloc
 * is still below sk_sndbuf.  On success the skb is owned by @sk (see
 * skb_set_owner_w()).  Returns NULL when the send buffer is full or
 * alloc_skb() fails.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        struct sk_buff *skb;

        if (!force &&
            refcount_read(&sk->sk_wmem_alloc) >= READ_ONCE(sk->sk_sndbuf))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb)
                skb_set_owner_w(skb, sk);

        return skb;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Destructor for skbs allocated by sock_omalloc(): returns the skb's
 * truesize to the owning socket's sk_omem_alloc.
 */
static void sock_ofree(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_omem_alloc);
}

/*
 * Allocate a skb charged to the socket's option memory (sk_omem_alloc).
 *
 * Fails with NULL when the charge would push sk_omem_alloc above
 * sysctl_optmem_max, or when alloc_skb() fails.  The returned skb has
 * sock_ofree() as destructor, which undoes the charge.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority)
{
        struct sk_buff *skb;

        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
            READ_ONCE(sysctl_optmem_max))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb) {
                atomic_add(skb->truesize, &sk->sk_omem_alloc);
                skb->sk = sk;
                skb->destructor = sock_ofree;
        }

        return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 *
 * The request is refused (NULL) when @size, taken as unsigned, exceeds
 * sysctl_optmem_max, or when sk_omem_alloc plus @size would not stay
 * below that limit.  On success @size has been added to sk_omem_alloc;
 * the charge is rolled back if kmalloc() fails.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        int optmem_max = READ_ONCE(sysctl_optmem_max);
        void *mem;

        if ((unsigned int)size > optmem_max ||
            atomic_read(&sk->sk_omem_alloc) + size >= optmem_max)
                return NULL;

        /* Charge first, to avoid the race if kmalloc might sleep. */
        atomic_add(size, &sk->sk_omem_alloc);
        mem = kmalloc(size, priority);
        if (!mem)
                atomic_sub(size, &sk->sk_omem_alloc);

        return mem;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block and return @size to sk_omem_alloc.
 * A NULL @mem triggers a one-time warning and is otherwise ignored.
 *
 * Note, we actually want the inline here as this allows gcc to detect
 * the constant @nullify and fold away the condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
                                  const bool nullify)
{
        if (WARN_ON_ONCE(!mem))
                return;

        if (!nullify)
                kfree(mem);
        else
                kfree_sensitive(mem);

        atomic_sub(size, &sk->sk_omem_alloc);
}

/*
 * Free an option memory block with kfree() and give @size back to the
 * socket's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

/*
 * Same as sock_kfree_s(), but the block is released with
 * kfree_sensitive() instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* Sleep until send buffer space may be available on @sk.
 *
 * It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * The wait ends when the timeout runs out, a signal is pending,
 * sk_wmem_alloc is below sk_sndbuf, the send side is shut down, or a
 * socket error is set.  Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        while (timeo && !signal_pending(current)) {
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf) ||
                    (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
                    READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);

        return timeo;
}


/*
 *        Generic send/receive buffer handlers
 */

/*
 * Allocate a send skb for @sk, waiting for send buffer space if needed.
 *
 * @header_len, @data_len and @max_page_order are passed through to
 * alloc_skb_with_frags().  @noblock selects the send timeout via
 * sock_sndtimeo().
 *
 * Returns the skb, owned by @sk (skb_set_owner_w()), or NULL.  On the
 * failure paths of the wait loop *@errcode is set to the pending socket
 * error, -EPIPE (send side shut down), -EAGAIN (no space and no time
 * left) or the value of sock_intr_errno() (signal pending).  If
 * alloc_skb_with_frags() itself fails, it is the one given @errcode.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                /* No room: flag it, then fail, bail on signal, or wait. */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

/*
 * Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

/*
 * Apply one control message to the send cookie @sockc.
 *
 * The message type is dispatched on cmsg->cmsg_type; the caller in this
 * file (sock_cmsg_send()) only passes SOL_SOCKET level messages.
 *
 * Returns 0 on success, -EPERM when SO_MARK is used without
 * CAP_NET_ADMIN in the socket's netns user namespace, and -EINVAL for a
 * bad length, bad flags, SCM_TXTIME without SOCK_TXTIME, or an unknown
 * type.
 */
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc)
{
        u32 tsflags;

        switch (cmsg->cmsg_type) {
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
                break;
        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;

                /* Only the TX record flags may be given per message. */
                tsflags = *(u32 *)CMSG_DATA(cmsg);
                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
                        return -EINVAL;

                /* Replace the TX record bits, keep all other flags. */
                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
                sockc->tsflags |= tsflags;
                break;
        case SCM_TXTIME:
                if (!sock_flag(sk, SOCK_TXTIME))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
                        return -EINVAL;
                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
                break;
        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
                break;
        default:
                return -EINVAL;
        }
        return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

/*
 * Walk every control message of @msg and apply the SOL_SOCKET ones to
 * @sockc through __sock_cmsg_send().  Messages of other levels are
 * skipped.
 *
 * Returns 0 on success, -EINVAL on a malformed control message, or the
 * first error reported by __sock_cmsg_send().
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc)
{
        struct cmsghdr *cmsg;
        int err;

        for_each_cmsghdr(cmsg, msg) {
                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;

                if (cmsg->cmsg_level == SOL_SOCKET) {
                        err = __sock_cmsg_send(sk, msg, cmsg, sockc);
                        if (err)
                                return err;
                }
        }

        return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

/*
 * Invoke the protocol's enter_memory_pressure() hook, if it has one.
 */
static void sk_enter_memory_pressure(struct sock *sk)
{
        if (sk->sk_prot->enter_memory_pressure)
                sk->sk_prot->enter_memory_pressure(sk);
}

/*
 * Leave memory pressure: use the protocol's leave_memory_pressure() hook
 * when it exists; otherwise clear the protocol's memory_pressure word,
 * writing only if it is currently non-zero.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
        unsigned long *flag;

        if (sk->sk_prot->leave_memory_pressure) {
                sk->sk_prot->leave_memory_pressure(sk);
                return;
        }

        flag = sk->sk_prot->memory_pressure;
        if (flag && READ_ONCE(*flag))
                WRITE_ONCE(*flag, 0);
}

/* Page order covering 32768 bytes; tried first when refilling a page_frag. */
#define SKB_FRAG_PAGE_ORDER        get_order(32768)
/* Static key, false by default; when enabled the high-order attempt is skipped. */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 *
 * Return: true if @pfrag has a page to allocate from (its current page
 * or a newly allocated one), false if no page could be allocated.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* Ours is the only reference: reuse the page from the start. */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                /* Still enough room left behind the current offset. */
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                /* Drop our reference; a new page is allocated below. */
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page with the caller's original gfp flags. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/*
 * Socket flavour of skb_page_frag_refill(): asks for at least 32 bytes
 * using the socket's allocation flags.  On failure the socket enters
 * memory pressure and its send buffer is moderated before false is
 * returned.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        if (unlikely(!skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) {
                sk_enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
                return false;
        }

        return true;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/*
 * Sleep, uninterruptibly and as an exclusive waiter on sk_lock.wq, until
 * the socket is no longer owned by a user context.
 *
 * Entered and left with sk_lock.slock held; the spinlock is dropped
 * around each schedule().
 */
static void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        do {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
        } while (sock_owned_by_user(sk));

        finish_wait(&sk->sk_lock.wq, &wait);
}

/*
 * Process every skb queued on the socket backlog.
 *
 * Entered and left with sk_lock.slock held.  Each pass detaches the
 * whole backlog list, drops the spinlock, and feeds the skbs one by one
 * to sk_backlog_rcv().  The outer loop repeats for as long as new skbs
 * were queued in the meantime.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;
        int nb = 0;

        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Take the current list private before unlocking. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                while (1) {
                        next = skb->next;
                        prefetch(next);
                        WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        skb = next;
                        if (!skb)
                                break;

                        /* Offer to reschedule whenever nb hits a multiple of 16. */
                        if (!(++nb & 15))
                                cond_resched();
                }

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}

/*
 * Run __release_sock() on @sk, taking sk_lock.slock (BH disabled) around
 * the call as __release_sock() expects.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);
        spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs from
 * @skb.  SOCKWQ_ASYNC_WAITDATA is set for the duration of the wait.
 *
 * Return: the result of sk_wait_event() for that condition.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *        __sk_mem_raise_allocated - increase memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @amt: pages to allocate
 *        @kind: allocation type
 *
 *        Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *        @amt is added to the protocol's memory_allocated up front (and
 *        charged to the socket's memcg when one is set).  If the request
 *        is then refused, both are rolled back before returning.
 *
 *        Return: 1 if the allocation is allowed, 0 if it is suppressed.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        struct proto *prot = sk->sk_prot;
        long allocated = sk_memory_allocated_add(sk, amt);
        /* Result of the memcg charge; only tested in the condition below. */
        bool charged = true;

        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /*
                 * Allow the request if every allocated socket using as
                 * many pages as this one would still fit under the hard
                 * limit.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
                        return 1;
        }

        trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Undo the accounting done at the top of the function. */
        sk_memory_allocated_sub(sk, amt);

        /* NOTE(review): the uncharge does not depend on `charged`;
         * presumably mem_cgroup_charge_skmem() accounts the pages even
         * when it reports failure - confirm against its implementation.
         */
        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);

/**
 *        __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @kind: allocation type
 *
 *        If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *        rmem allocation. This function assumes that protocols which have
 *        memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *        sk_forward_alloc is raised by the page-rounded size before
 *        __sk_mem_raise_allocated() is consulted, and lowered again if
 *        that call refuses the allocation.
 *
 *        Return: the result of __sk_mem_raise_allocated().
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        int pages = sk_mem_pages(size);
        int ret;

        sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;

        ret = __sk_mem_raise_allocated(sk, size, pages, kind);
        if (ret)
                return ret;

        sk->sk_forward_alloc -= pages << SK_MEM_QUANTUM_SHIFT;
        return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *        __sk_mem_reduce_allocated - reclaim memory_allocated
 *        @sk: socket
 *        @amount: number of quanta
 *
 *        Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 *
 *        Also uncharges the socket's memcg when one is set, and leaves
 *        memory pressure once the protocol is under global pressure but
 *        its allocation has dropped below the lowest limit.
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
        sk_memory_allocated_sub(sk, amount);

        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

        if (!sk_under_global_memory_pressure(sk))
                return;

        if (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))
                sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);

/**
 *        __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 *
 *        The byte count is converted to whole quanta; that many quanta
 *        are taken out of sk_forward_alloc and handed to
 *        __sk_mem_reduce_allocated().
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
        int quanta = amount >> SK_MEM_QUANTUM_SHIFT;

        sk->sk_forward_alloc -= quanta << SK_MEM_QUANTUM_SHIFT;
        __sk_mem_reduce_allocated(sk, quanta);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/*
 * Store @val in sk->sk_peek_off with WRITE_ONCE().  Always returns 0.
 */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);
        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* Default bind for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

/* Default connect for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

/* Default socketpair for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

/* Default accept for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

/* Default getname for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

/* Default ioctl for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

/* Default listen for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

/* Default shutdown for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

/* Default sendmsg for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

/* Same as sock_no_sendmsg(), for the variant that takes a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

/* Default recvmsg for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

/*
 * Default mmap for protocols without mmap support.  Unlike the other
 * sock_no_*() stubs this returns -ENODEV, not -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 *
 * Nothing is done when @file is not a socket.
 */
void __receive_sock(struct file *file)
{
        int error;
        struct socket *sock;

        /*
         * "error" is deliberately ignored: action is only needed when
         * the file is a socket, and a NULL check on "sock" tells us that.
         */
        sock = sock_from_file(file, &error);
        if (!sock)
                return;

        sock_update_netprioidx(&sock->sk->sk_cgrp_data);
        sock_update_classid(&sock->sk->sk_cgrp_data);
}

/*
 * Fallback sendpage: map @page with kmap(), send @size bytes starting at
 * @offset through kernel_sendmsg() with @flags as message flags, then
 * unmap the page.  Returns the result of kernel_sendmsg().
 */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        struct msghdr msg = {.msg_flags = flags};
        char *kaddr = kmap(page);
        struct kvec iov = {
                .iov_base = kaddr + offset,
                .iov_len = size,
        };
        ssize_t res;

        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);

        return res;
}
EXPORT_SYMBOL(sock_no_sendpage);

/*
 * Same as sock_no_sendpage(), but takes a struct sock and sends through
 * kernel_sendmsg_locked().  Returns the result of that call.
 */
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
                                int offset, size_t size, int flags)
{
        struct msghdr msg = {.msg_flags = flags};
        char *kaddr = kmap(page);
        struct kvec iov = {
                .iov_base = kaddr + offset,
                .iov_len = size,
        };
        ssize_t res;

        res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
        kunmap(page);

        return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);

/*
 *        Default Socket Callbacks
 */

/*
 * Default wakeup callback: wake every interruptible sleeper on the
 * socket's wait queue, if there is one.  sk_wq is read under RCU.
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();
        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_all(&waitq->wait);
        rcu_read_unlock();
}

/*
 * Default error callback: wake pollers with EPOLLERR when someone is
 * sleeping on the socket's wait queue, then send the async POLL_ERR
 * notification.  sk_wq is read under RCU.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();
        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_poll(&waitq->wait, EPOLLERR);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}

/*
 * Default data-ready callback: wake pollers waiting for readable events
 * when someone is sleeping on the socket's wait queue, then send the
 * async POLL_IN notification.  sk_wq is read under RCU.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();
        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_sync_poll(&waitq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}

/*
 * Default sk_write_space callback: once at most half of the send buffer is
 * in use, wake writers and, if the socket is writeable, send the
 * SOCK_WAKE_SPACE / POLL_OUT async notification.
 */
static void sock_def_write_space(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if ((refcount_read(&sk->sk_wmem_alloc) << 1) > READ_ONCE(sk->sk_sndbuf))
                goto unlock;

        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

        /* Should agree with poll, otherwise some programs break */
        if (sock_writeable(sk))
                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);

unlock:
        rcu_read_unlock();
}

/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}

/*
 * Deliver SIGURG to the owner of the file backing @sk, if there is one, and
 * follow up with a SOCK_WAKE_URG / POLL_PRI async notification when
 * send_sigurg() returns non-zero.
 */
void sk_send_sigurg(struct sock *sk)
{
        /* Nothing to signal without an attached socket and file. */
        if (!sk->sk_socket || !sk->sk_socket->file)
                return;

        if (send_sigurg(&sk->sk_socket->file->f_owner))
                sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

/*
 * Cancel @timer; when del_timer() returns non-zero, drop a socket
 * reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

/*
 * Variant of sk_stop_timer() built on del_timer_sync(); when it returns
 * non-zero, drop a socket reference with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer_sync(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

/*
 * Initialise the generic fields of @sk, optionally attach it to @sock, and
 * record @uid as the socket owner.
 *
 * @sock: socket to bind to @sk, or NULL for a sock without a struct socket
 * @sk:   sock being initialised
 * @uid:  value stored in sk->sk_uid
 *
 * Installs the default callbacks, sets default buffer sizes and timeouts,
 * and finally publishes the object by setting sk_refcnt to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =        NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation        =        GFP_KERNEL;
        sk->sk_rcvbuf                =        READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf                =        READ_ONCE(sysctl_wmem_default);
        sk->sk_state                =        TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* Cross-link sk and sock when a struct socket was supplied. */
        if (sock) {
                sk->sk_type        =        sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =        sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid        =        uid;

        rwlock_init(&sk->sk_callback_lock);
        /* Kernel and user sockets get separate per-family lockdep classes. */
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change        =        sock_def_wakeup;
        sk->sk_data_ready        =        sock_def_readable;
        sk->sk_write_space        =        sock_def_write_space;
        sk->sk_error_report        =        sock_def_error_report;
        sk->sk_destruct                =        sock_def_destruct;

        sk->sk_frag.page        =        NULL;
        sk->sk_frag.offset        =        0;
        sk->sk_peek_off                =        -1;

        sk->sk_peer_pid         =        NULL;
        sk->sk_peer_cred        =        NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending        =        0;
        sk->sk_rcvlowat                =        1;
        sk->sk_rcvtimeo                =        MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo                =        MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id                =        0;
        sk->sk_ll_usec                =        READ_ONCE(sysctl_net_busy_read);
#endif

        /* Pacing starts at the maximum representable rate (~0UL). */
        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

/*
 * Initialise @sk via sock_init_data_uid(), taking the owner uid from the
 * inode of @sock when one is given, otherwise uid 0 mapped through the
 * user namespace of the sock's network namespace.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        kuid_t uid;

        if (sock)
                uid = SOCK_INODE(sock)->i_uid;
        else
                uid = make_kuid(sock_net(sk)->user_ns, 0);

        sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);

/*
 * Take ownership of the socket lock (sk_lock.owned = 1), with @subclass
 * passed to lockdep.  May sleep.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        /* Already owned by someone else: take the contended path. */
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /*
         * Plain spin_unlock(): bottom halves stay disabled until the
         * local_bh_enable() below.
         */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

/*
 * Release ownership of the socket lock: process any backlog, run the
 * protocol's release_cb if it has one, clear ownership and wake waiters on
 * sk_lock.wq.  All of this runs under sk_lock.slock with BH disabled.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL backlog tail means there is backlog to process. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sk->sk_lock.owned)
                /*
                 * Fast path: return with sk_lock.slock still held and
                 * bottom halves still disabled.
                 */
                return false;

        /* Slow path: same sequence as lock_sock_nested() with subclass 0. */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
EXPORT_SYMBOL(lock_sock_fast);

/*
 * Copy the socket's stored timestamp to user space.
 *
 * @sock:      socket to query
 * @userstamp: user buffer receiving the timestamp
 * @timeval:   report microseconds instead of nanoseconds in the sub-second
 *             field
 * @time32:    use put_old_timespec32() for the copy-out (only honoured
 *             with CONFIG_COMPAT_32BIT_TIME)
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect.  If the stored
 * stamp has tv_sec == 0, the current real time is stored and reported.
 *
 * Returns -ENOENT when the stored stamp has tv_sec == -1, -EFAULT when the
 * sparc64 copy-out fails, otherwise the result of the copy-out helper.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32)
{
        struct sock *sk = sock->sk;
        struct timespec64 ts;

        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec64(sock_read_timestamp(sk));
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                ktime_t kt = ktime_get_real();
                sock_write_timestamp(sk, kt);
                ts = ktime_to_timespec64(kt);
        }

        /* Callers asking for a timeval get microseconds in tv_nsec. */
        if (timeval)
                ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
        if (time32)
                return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
        /* beware of padding in sparc64 timeval */
        if (timeval && !in_compat_syscall()) {
                /*
                 * tv lives on the kernel stack and is the source of the
                 * copy, so it must not carry the __user annotation.
                 */
                struct __kernel_old_timeval tv = {
                        .tv_sec = ts.tv_sec,
                        .tv_usec = ts.tv_nsec,
                };
                if (copy_to_user(userstamp, &tv, sizeof(tv)))
                        return -EFAULT;
                return 0;
        }
#endif
        return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

/*
 * Set timestamping @flag on @sk and call net_enable_timestamp() if this is
 * the first timestamping flag on the socket.  No-op when @flag is already
 * set.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
        unsigned long flags_before;

        if (sock_flag(sk, flag))
                return;

        flags_before = sk->sk_flags;
        sock_set_flag(sk, flag);

        /*
         * we just set one of the two flags which require net
         * time stamping, but time stamping might have been on
         * already because of the other one
         */
        if (!sock_needs_netstamp(sk))
                return;
        if (flags_before & SK_FLAGS_TIMESTAMP)
                return;

        net_enable_timestamp();
}

/*
 * Dequeue one skb from the socket's error queue and deliver it to @msg.
 *
 * At most @len bytes of payload are copied (MSG_TRUNC is set when the skb
 * is longer) and the extended error is attached as a control message of
 * the given @level / @type.
 *
 * Returns the number of bytes copied, -EAGAIN when the error queue is
 * empty, or the error from skb_copy_datagram_msg().
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sk_buff *skb = sock_dequeue_err_skb(sk);
        struct sock_extended_err ee;
        int copied, err;

        if (skb == NULL)
                return -EAGAIN;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (!err) {
                sock_recv_timestamp(msg, sk, skb);

                /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
                ee = SKB_EXT_ERR(skb)->ee;
                put_cmsg(msg, level, type, sizeof(ee), &ee);

                msg->msg_flags |= MSG_ERRQUEUE;
                err = copied;
        }

        /* The skb is consumed on both the success and the failure path. */
        kfree_skb(skb);
        return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *        Get a socket option on a socket.
 *
 *        FIX: POSIX 1003.1g is very ambiguous here. It states that
 *        asynchronous errors should be reported by getsockopt. We assume
 *        this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);

        return prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

/*
 * Forward recvmsg to the protocol handler, splitting MSG_DONTWAIT out of
 * @flags into its own argument.  On success the address length reported
 * by the protocol is stored in msg->msg_namelen.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int namelen = 0;
        int ret;

        ret = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &namelen);
        if (ret < 0)
                return ret;

        msg->msg_namelen = namelen;
        return ret;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *        Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);

        return prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

/*
 * Common release path for a sock: run the protocol's destroy hook (if
 * any), unhash the sock, orphan it, free its xfrm policy and drop one
 * reference.
 */
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the network stack still does.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs
         * the receiver and did its hash table lookup before we unhashed
         * the socket. They will reach the receive queue and will be purged
         * by the socket destructor.
         *
         * Also we still have packets pending on the receive queue and
         * probably our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

/*
 * Fill @mem, an array of SK_MEMINFO_VARS u32 slots, with a snapshot of the
 * memory accounting counters of @sk.  Slots not written below stay zero.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
        memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

        /* Receive side. */
        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

        /* Send side. */
        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

        /* Remaining counters. */
        mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
/* Number of per-protocol in-use counter slots. */
#define PROTO_INUSE_NR        64        /* should be enough for the first time */
/* One counter per slot; allocated per cpu for each network namespace. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of counter slots currently assigned to a registered protocol. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

/*
 * Add @val (may be negative) to this cpu's in-use counter for @prot in
 * namespace @net.
 */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

/* Add @val to this cpu's count of sockets in use in namespace @net. */
static void sock_inuse_add(struct net *net, int val)
{
        this_cpu_add(*net->core.sock_inuse, val);
}

int sock_inuse_get(struct net *net)
{
        int cpu, res = 0;

        for_each_possible_cpu(cpu)
                res += *per_cpu_ptr(net->core.sock_inuse, cpu);

        return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

/*
 * Per-namespace init: allocate the per-cpu protocol and socket in-use
 * counters.  Returns 0 on success or -ENOMEM, leaving nothing allocated by
 * this function on failure.
 */
static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.prot_inuse = alloc_percpu(struct prot_inuse);
        if (!net->core.prot_inuse)
                return -ENOMEM;

        net->core.sock_inuse = alloc_percpu(int);
        if (!net->core.sock_inuse) {
                /* Undo the first allocation before bailing out. */
                free_percpu(net->core.prot_inuse);
                return -ENOMEM;
        }

        return 0;
}

/* Per-namespace exit: free both per-cpu in-use counter areas. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
        free_percpu(net->core.sock_inuse);
}

/* Per-namespace hooks that set up and tear down the in-use counters. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

/*
 * Register the in-use counter pernet operations; panics if registration
 * fails.  Always returns 0 otherwise.
 */
static __init int net_inuse_init(void)
{
        int err = register_pernet_subsys(&net_inuse_ops);

        if (err)
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);

/*
 * Reserve the first free in-use counter slot for @prot and store it in
 * prot->inuse_idx.  Returns 0 on success, or -ENOSPC when all
 * PROTO_INUSE_NR slots are taken (inuse_idx is then PROTO_INUSE_NR).
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (likely(prot->inuse_idx != PROTO_INUSE_NR)) {
                set_bit(prot->inuse_idx, proto_inuse_idx);
                return 0;
        }

        pr_err("PROTO_INUSE_NR exhausted\n");
        return -ENOSPC;
}

/*
 * Give back the in-use counter slot held by @prot.  An inuse_idx of
 * PROTO_INUSE_NR means no slot was assigned, so nothing is cleared.
 */
static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx == PROTO_INUSE_NR)
                return;

        clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
/* !CONFIG_PROC_FS stub: no counter slot to assign; always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS stub: no counter slot to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS stub: socket in-use accounting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
#endif

/*
 * Free the timewait slab name and destroy the timewait slab cache of
 * @twsk_prot, resetting both pointers to NULL.  NULL @twsk_prot is a no-op.
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (twsk_prot) {
                kfree(twsk_prot->twsk_slab_name);
                twsk_prot->twsk_slab_name = NULL;

                kmem_cache_destroy(twsk_prot->twsk_slab);
                twsk_prot->twsk_slab = NULL;
        }
}

/*
 * Free the request-sock slab name and destroy the request-sock slab cache
 * of @rsk_prot, resetting both pointers to NULL.  NULL @rsk_prot is a no-op.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (rsk_prot) {
                kfree(rsk_prot->slab_name);
                rsk_prot->slab_name = NULL;

                kmem_cache_destroy(rsk_prot->slab);
                rsk_prot->slab = NULL;
        }
}

/*
 * Create the "request_sock_<name>" slab cache for @prot, if the protocol
 * has request sock operations.
 *
 * Returns 0 on success or when there is nothing to do, -ENOMEM otherwise.
 * On failure the name may be left allocated in rsk_prot->slab_name.
 */
static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *ops = prot->rsk_prot;

        if (ops == NULL)
                return 0;

        ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                   prot->name);
        if (ops->slab_name == NULL)
                return -ENOMEM;

        ops->slab = kmem_cache_create(ops->slab_name,
                                      ops->obj_size, 0,
                                      SLAB_ACCOUNT | prot->slab_flags,
                                      NULL);
        if (ops->slab != NULL)
                return 0;

        pr_crit("%s: Can't create request sock SLAB cache!\n",
                prot->name);
        return -ENOMEM;
}

/*
 * Register protocol @prot on the global proto_list.
 *
 * @prot:       protocol to register
 * @alloc_slab: when non-zero, create the sock slab cache plus the
 *              request-sock and timewait-sock caches the protocol declares
 *
 * Returns 0 on success.  Slab/name allocation failures return -ENOBUFS
 * (the initial value of ret); a failure of assign_proto_idx() returns its
 * error code.  On failure, caches created here are destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (prot->twsk_prot != NULL) {
                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

                        if (prot->twsk_prot->twsk_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0,
                                                  SLAB_ACCOUNT |
                                                  prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab;
                }
        }

        /* Slot assignment and list insertion happen under proto_list_mutex. */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /*
         * Unwind labels; each cleanup step is guarded by alloc_slab so
         * that only caches created by this call are torn down.
         */
out_free_timewait_sock_slab:
        if (alloc_slab && prot->twsk_prot)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

/*
 * Unregister @prot: release its in-use counter slot and remove it from
 * proto_list under proto_list_mutex, then destroy its slab caches.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        /* Both helpers accept a NULL ops pointer. */
        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

/*
 * Request the sock_diag module for @family (when @protocol is 0) or for
 * the @family / @protocol pair.
 *
 * Returns -ENOENT when the family is not registered or, with CONFIG_INET,
 * when an AF_INET protocol other than IPPROTO_RAW below MAX_INET_PROTOS
 * has no handler in inet_protos[]; otherwise the result of
 * request_module().
 */
int sock_load_diag_module(int family, int protocol)
{
        if (protocol) {
#ifdef CONFIG_INET
                if (family == AF_INET &&
                    protocol != IPPROTO_RAW &&
                    protocol < MAX_INET_PROTOS &&
                    !rcu_access_pointer(inet_protos[protocol]))
                        return -ENOENT;
#endif

                return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
                                      NETLINK_SOCK_DIAG, family, protocol);
        }

        /* Family-level diag module. */
        if (!sock_is_registered(family))
                return -ENOENT;

        return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos within proto_list (list head included).  The mutex is released in
 * proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

/* seq_file ->next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

/* seq_file ->stop: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';
        return 'n';
}
/*
 * Memory allocated by @proto, or -1 when the protocol has no
 * memory_allocated counter.
 */
static long sock_prot_memory_allocated(struct proto *proto)
{
        if (proto->memory_allocated == NULL)
                return -1L;

        return proto_memory_allocated(proto);
}

/*
 * Memory pressure state of @proto as a string: "yes" / "no", or "NI" when
 * the protocol has no memory_pressure flag.
 */
static const char *sock_prot_memory_pressure(struct proto *proto)
{
        if (proto->memory_pressure == NULL)
                return "NI";

        if (proto_memory_pressure(proto))
                return "yes";
        return "no";
}

/*
 * Emit one line describing @proto: name, object size, sockets in use in
 * the seq_file's namespace, memory state, max header, slab presence, owning
 * module, then one y/n column per method.  The method columns are printed
 * in the order they are passed below.
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

/*
 * seq_file ->show: print the column header when @v is the list head,
 * otherwise print the protocol entry @v refers to.  Always returns 0.
 */
static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v != &proto_list) {
                proto_seq_printf(seq, list_entry(v, struct proto, node));
                return 0;
        }

        seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                   "protocol",
                   "size",
                   "sockets",
                   "memory",
                   "press",
                   "maxhdr",
                   "slab",
                   "module",
                   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        return 0;
}

/* seq_file iterator over proto_list, used for the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

/*
 * Per-namespace init: create the read-only "protocols" entry under the
 * namespace's proc_net directory.  Returns 0, or -ENOMEM if creation fails.
 */
static __net_init int proto_init_net(struct net *net)
{
        return proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                        sizeof(struct seq_net_private)) ? 0 : -ENOMEM;
}

/* Per-namespace exit: remove the "protocols" proc entry. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}


/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

/* Register proto_net_ops; returns the result of register_pernet_subsys(). */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
               sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Forward to the protocol's bind_add handler.  Returns -EOPNOTSUPP when the
 * protocol does not provide one, otherwise the handler's return value.
 */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (sk->sk_prot->bind_add)
                return sk->sk_prot->bind_add(sk, addr, addr_len);

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_bind_add);















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005, Devicescape Software, Inc.
 * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
 */

#ifndef IEEE80211_RATE_H
#define IEEE80211_RATE_H

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "sta_info.h"
#include "driver-ops.h"

/* Handle on a rate control algorithm instance. */
struct rate_control_ref {
        /* Operations table of the algorithm. */
        const struct rate_control_ops *ops;
        /* Opaque algorithm data, passed back as first argument to ops. */
        void *priv;
};

void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
                           struct sta_info *sta,
                           struct ieee80211_tx_rate_control *txrc);

void rate_control_tx_status(struct ieee80211_local *local,
                            struct ieee80211_supported_band *sband,
                            struct ieee80211_tx_status *st);

void rate_control_rate_init(struct sta_info *sta);
void rate_control_rate_update(struct ieee80211_local *local,
                                    struct ieee80211_supported_band *sband,
                                    struct sta_info *sta, u32 changed);

/*
 * Initialise the station's rate_ctrl_lock and ask the rate control
 * algorithm behind @ref to allocate its per-station data.  Returns whatever
 * ops->alloc_sta() returns.
 */
static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
                                           struct sta_info *sta, gfp_t gfp)
{
        spin_lock_init(&sta->rate_ctrl_lock);
        return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp);
}

/*
 * Hand the station's rate control private data back to the algorithm's
 * free_sta() operation.  sta->rate_ctrl is dereferenced unconditionally.
 */
static inline void rate_control_free_sta(struct sta_info *sta)
{
        struct rate_control_ref *ref = sta->rate_ctrl;

        ref->ops->free_sta(ref->priv, &sta->sta, sta->rate_ctrl_priv);
}

/*
 * With CONFIG_MAC80211_DEBUGFS, let the rate control algorithm add its
 * per-station debugfs entries.  No-op when the station has no rate
 * control reference, no debugfs directory, or the algorithm lacks the
 * add_sta_debugfs operation.
 */
static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_DEBUGFS
        struct rate_control_ref *ref = sta->rate_ctrl;

        if (!ref || !sta->debugfs_dir || !ref->ops->add_sta_debugfs)
                return;

        ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv,
                                  sta->debugfs_dir);
#endif
}

extern const struct file_operations rcname_ops;

/*
 * With CONFIG_MAC80211_DEBUGFS, create the "rc" debugfs directory under
 * the wiphy's debugfs directory, add a "name" file to it and let the rate
 * control algorithm add its own entries.  No-op when there is no rate
 * control reference or the algorithm lacks the add_debugfs operation.
 */
static inline void rate_control_add_debugfs(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_DEBUGFS
        struct dentry *rcdir;

        if (!local->rate_ctrl || !local->rate_ctrl->ops->add_debugfs)
                return;

        rcdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir);
        local->debugfs.rcdir = rcdir;
        debugfs_create_file("name", 0400, rcdir,
                            local->rate_ctrl, &rcname_ops);

        local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv,
                                           rcdir);
#endif
}

void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata);

/* Get a reference to the rate control algorithm. If `name' is NULL, get the
 * first available algorithm. */
int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
                                 const char *name);
void rate_control_deinitialize(struct ieee80211_local *local);


/* Rate control algorithms */
#ifdef CONFIG_MAC80211_RC_MINSTREL
int rc80211_minstrel_init(void);
void rc80211_minstrel_exit(void);
#else
/* !CONFIG_MAC80211_RC_MINSTREL stub: nothing to initialise; returns 0. */
static inline int rc80211_minstrel_init(void)
{
        return 0;
}
/* !CONFIG_MAC80211_RC_MINSTREL stub: nothing to tear down. */
static inline void rc80211_minstrel_exit(void)
{
}
#endif


#endif /* IEEE80211_RATE_H */




































































































































































































































    1 









































    5 
    5 









    1 

















    1 




    1 






































































































































































    1 





























    1 




    1 











    1 

































































































    1 



















    1 

    1 


    1 



    1 

    1 


    1 
    1 
    1 
























































































































































































































































    1 





    1 



    1 






    1 


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic helpers for smp ipi calls
 *
 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>

#include "smpboot.h"
#include "sched/smp.h"

/* Extract the CSD_TYPE_* bits from the ->flags word of @_csd. */
#define CSD_TYPE(_csd)        ((_csd)->flags & CSD_FLAG_TYPE_MASK)

/*
 * Sender-side state for multi-CPU calls; one instance per CPU (cfd_data),
 * allocated in smpcfd_prepare_cpu() and used by
 * smp_call_function_many_cond().
 */
struct call_function_data {
        /* Per-CPU csd array, indexed by destination CPU. */
        call_single_data_t        __percpu *csd;
        /* Destinations of the current request: @mask & online, minus self. */
        cpumask_var_t                cpumask;
        /* Destinations for which llist_add() returned true; these get the IPI. */
        cpumask_var_t                cpumask_ipi;
};

/* Per-CPU sender-side state for multi-CPU calls; see smpcfd_prepare_cpu(). */
static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

/* Per-CPU lockless list of entries queued for that CPU to process. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

/* Forward declaration: used by smpcfd_dying_cpu() before its definition. */
static void flush_smp_call_function_queue(bool warn_cpu_offline);

/*
 * Allocate the call_function_data resources of @cpu: both cpumasks (on the
 * memory node of @cpu) and the per-CPU csd array.
 *
 * Returns 0 on success, or -ENOMEM after releasing whatever had already
 * been allocated.
 */
int smpcfd_prepare_cpu(unsigned int cpu)
{
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
        int node = cpu_to_node(cpu);

        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, node))
                goto fail;

        if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, node))
                goto fail_free_mask;

        cfd->csd = alloc_percpu(call_single_data_t);
        if (!cfd->csd)
                goto fail_free_ipi_mask;

        return 0;

fail_free_ipi_mask:
        free_cpumask_var(cfd->cpumask_ipi);
fail_free_mask:
        free_cpumask_var(cfd->cpumask);
fail:
        return -ENOMEM;
}

/*
 * Release the cpumasks and the per-CPU csd array that
 * smpcfd_prepare_cpu() allocated for @cpu. Always returns 0.
 */
int smpcfd_dead_cpu(unsigned int cpu)
{
        struct call_function_data *dead_cfd = &per_cpu(cfd_data, cpu);

        free_cpumask_var(dead_cfd->cpumask);
        free_cpumask_var(dead_cfd->cpumask_ipi);
        free_percpu(dead_cfd->csd);

        return 0;
}

/*
 * Drain the call-function queue and pending irq_work on the calling CPU.
 * @cpu is not used by the body. Always returns 0.
 */
int smpcfd_dying_cpu(unsigned int cpu)
{
        /*
         * The IPIs for the smp-call-function callbacks queued by other
         * CPUs might arrive late, either due to hardware latencies or
         * because this CPU disabled interrupts (inside stop-machine)
         * before the IPIs were sent. So flush out any pending callbacks
         * explicitly (without waiting for the IPIs to arrive), to
         * ensure that the outgoing CPU doesn't go offline with work
         * still pending.
         */
        /* 'false': do not emit the "IPI on offline CPU" warning here. */
        flush_smp_call_function_queue(false);
        irq_work_run();
        return 0;
}

/*
 * Boot-time setup: initialise the call_single_queue of every possible CPU
 * and allocate the call_function_data of the CPU executing this code.
 */
void __init call_function_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct llist_head *queue = &per_cpu(call_single_queue, cpu);

                init_llist_head(queue);
        }

        smpcfd_prepare_cpu(smp_processor_id());
}

#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG

/* csd this CPU is currently handling, or NULL; written by csd_lock_record(). */
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
/* Copy of ->func of the csd last recorded by csd_lock_record(). */
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
/* Copy of ->info of the csd last recorded by csd_lock_record(). */
static DEFINE_PER_CPU(void *, cur_csd_info);

/* Waiting time after which csd_lock_wait_toolong() emits a report. */
#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC)
/* Source of the "(#%d)" identifiers printed in stuck-lock reports. */
static atomic_t csd_bug_count = ATOMIC_INIT(0);

/*
 * Record current CSD work for current CPU, NULL to erase.
 *
 * For a non-NULL @csd, its ->func and ->info are copied into
 * cur_csd_func/cur_csd_info first and @csd is stored into cur_csd last, so
 * that csd_lock_wait_toolong() on another CPU, which reads cur_csd with
 * smp_load_acquire(), can report what this CPU is handling.
 */
static void csd_lock_record(struct __call_single_data *csd)
{
        if (!csd) {
                smp_mb(); /* NULL cur_csd after unlock. */
                __this_cpu_write(cur_csd, NULL);
                return;
        }
        __this_cpu_write(cur_csd_func, csd->func);
        __this_cpu_write(cur_csd_info, csd->info);
        smp_wmb(); /* func and info before csd. */
        __this_cpu_write(cur_csd, csd);
        smp_mb(); /* Update cur_csd before function call. */
                  /* Or before unlock, as the case may be. */
}

/*
 * Return the destination CPU stored in @csd, or -1 when the type of @csd
 * is neither CSD_TYPE_ASYNC nor CSD_TYPE_SYNC.
 */
static __always_inline int csd_lock_wait_getcpu(struct __call_single_data *csd)
{
        switch (CSD_TYPE(csd)) {
        case CSD_TYPE_ASYNC:
        case CSD_TYPE_SYNC:
                /* Other CSD_TYPE_ values might not have ->dst. */
                return csd->dst;
        default:
                return -1;
        }
}

/*
 * Complain if too much time spent waiting.  Note that only
 * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 * so waiting on other types gets much less information.
 *
 * @csd:    csd whose CSD_FLAG_LOCK bit is being waited on.
 * @ts0:    sched_clock() value taken when the wait began; only used to
 *          print the total waiting time.
 * @ts1:    in/out; start of the current timeout window. Reset to the
 *          current time after each report so that reports repeat every
 *          CSD_LOCK_TIMEOUT.
 * @bug_id: in/out; 0 until the first report for this wait, afterwards the
 *          identifier taken from csd_bug_count.
 *
 * Returns true once the lock bit is clear (the caller stops waiting),
 * false if the caller must keep polling.
 */
static __always_inline bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
{
        int cpu = -1;
        int cpux;
        bool firsttime;
        u64 ts2, ts_delta;
        call_single_data_t *cpu_cur_csd;
        unsigned int flags = READ_ONCE(csd->flags);

        if (!(flags & CSD_FLAG_LOCK)) {
                /* Released without any report having been printed: stay quiet. */
                if (!unlikely(*bug_id))
                        return true;
                /* A report was printed earlier; announce that the lock got released. */
                cpu = csd_lock_wait_getcpu(csd);
                pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
                         *bug_id, raw_smp_processor_id(), cpu);
                return true;
        }

        ts2 = sched_clock();
        ts_delta = ts2 - *ts1;
        if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
                return false;

        /* Timeout window exceeded: allocate a report id on the first report. */
        firsttime = !*bug_id;
        if (firsttime)
                *bug_id = atomic_inc_return(&csd_bug_count);
        cpu = csd_lock_wait_getcpu(csd);
        /* cpux is a valid index for the per_cpu() reads even when cpu is not. */
        if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
                cpux = 0;
        else
                cpux = cpu;
        cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
        pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
                 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
                 cpu, csd->func, csd->info);
        /* Say whether the target is busy with another csd, with this one, or with none. */
        if (cpu_cur_csd && csd != cpu_cur_csd) {
                pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
                         *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
                         READ_ONCE(per_cpu(cur_csd_info, cpux)));
        } else {
                pr_alert("\tcsd: CSD lock (#%d) %s.\n",
                         *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
        }
        if (cpu >= 0) {
                /* Fall back to dump_cpu_task() if no backtrace could be triggered. */
                if (!trigger_single_cpu_backtrace(cpu))
                        dump_cpu_task(cpu);
                /* Target has no csd recorded at all: send the IPI again. */
                if (!cpu_cur_csd) {
                        pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
                        arch_send_call_function_single_ipi(cpu);
                }
        }
        dump_stack();
        *ts1 = ts2;

        return false;
}

/*
 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 *
 * For non-synchronous ipi calls the csd can still be in use by the
 * previous function call. For multi-cpu calls its even more interesting
 * as we'll have to ensure no other cpu is observing our csd.
 *
 * Debug variant of csd_lock_wait(): spin until csd_lock_wait_toolong()
 * reports that CSD_FLAG_LOCK is clear; that helper also prints a report
 * whenever the wait exceeds CSD_LOCK_TIMEOUT.
 */
static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
        /* 0 means "no report printed yet" for this wait. */
        int bug_id = 0;
        u64 ts0, ts1;

        ts1 = ts0 = sched_clock();
        for (;;) {
                if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
                        break;
                cpu_relax();
        }
        smp_acquire__after_ctrl_dep();
}

#else
/* No-op stand-in used when CONFIG_CSD_LOCK_WAIT_DEBUG is not set. */
static void csd_lock_record(struct __call_single_data *csd)
{
}

/*
 * Non-debug variant: wait, with acquire semantics, until CSD_FLAG_LOCK is
 * clear in @csd->flags.
 */
static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
        smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
#endif

/*
 * Take ownership of @csd: wait for any previous use to be unlocked, then
 * set CSD_FLAG_LOCK. The flag update itself is a plain (non-atomic)
 * read-modify-write.
 */
static __always_inline void csd_lock(struct __call_single_data *csd)
{
        csd_lock_wait(csd);
        csd->flags |= CSD_FLAG_LOCK;

        /*
         * prevent CPU from reordering the above assignment
         * to ->flags with any subsequent assignments to other
         * fields of the specified call_single_data_t structure:
         */
        smp_wmb();
}

/*
 * Release @csd by storing 0 to ->flags with release semantics; this clears
 * the type bits together with CSD_FLAG_LOCK. Warns if @csd was not locked.
 */
static __always_inline void csd_unlock(struct __call_single_data *csd)
{
        WARN_ON(!(csd->flags & CSD_FLAG_LOCK));

        /*
         * ensure we're all done before releasing data:
         */
        smp_store_release(&csd->flags, 0);
}

/* Per-CPU csd used by smp_call_function_single() for !wait requests. */
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);

/*
 * Add @node to the call_single_queue of @cpu and send that CPU the
 * call-function IPI when llist_add() returns true.
 */
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
        /*
         * The list addition should be visible before sending the IPI
         * handler locks the list to pull the entry off it because of
         * normal cache coherency rules implied by spinlocks.
         *
         * If IPIs can go out of order to the cache coherency protocol
         * in an architecture, sufficient synchronisation should be added
         * to arch code to make it appear to obey cache coherency WRT
         * locking and barrier primitives. Generic code isn't really
         * equipped to do the right thing...
         */
        /* llist_add() returns true if the list was empty before the add. */
        if (llist_add(node, &per_cpu(call_single_queue, cpu)))
                send_call_function_single_ipi(cpu);
}

/*
 * Insert a previously allocated call_single_data_t element
 * for execution on the given CPU. data must already have
 * ->func, ->info, and ->flags set.
 *
 * If @cpu is the calling CPU the function is invoked directly, with
 * interrupts disabled, after @csd has been unlocked. Otherwise @csd is
 * queued on @cpu.
 *
 * Returns 0 on success, or -ENXIO (with @csd unlocked) if @cpu is out of
 * range or not online.
 */
static int generic_exec_single(int cpu, struct __call_single_data *csd)
{
        if (cpu == smp_processor_id()) {
                /* Copy out before csd_unlock(); the csd is not touched again below. */
                smp_call_func_t func = csd->func;
                void *info = csd->info;
                unsigned long flags;

                /*
                 * We can unlock early even for the synchronous on-stack case,
                 * since we're doing this from the same CPU..
                 */
                csd_lock_record(csd);
                csd_unlock(csd);
                local_irq_save(flags);
                func(info);
                csd_lock_record(NULL);
                local_irq_restore(flags);
                return 0;
        }

        /* The unsigned cast makes a negative @cpu fail the range check too. */
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }

        __smp_call_single_queue(cpu, &csd->llist);

        return 0;
}

/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
        /* 'true': warn if entries were queued while this CPU is offline. */
        flush_smp_call_function_queue(true);
}

/**
 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *                      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 *
 * The queue is processed in three passes over one private list: SYNC
 * entries first, then ASYNC and IRQ_WORK entries, and finally whatever is
 * left (TTWU entries) is handed to sched_ttwu_pending(). Each pass unlinks
 * the entries it handles so that the next pass only sees the remainder.
 */
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
        call_single_data_t *csd, *csd_next;
        struct llist_node *entry, *prev;
        struct llist_head *head;
        /* Limits the offline-CPU warning below to a single occurrence. */
        static bool warned;

        lockdep_assert_irqs_disabled();

        /* Detach the whole queue, then reverse the detached list. */
        head = this_cpu_ptr(&call_single_queue);
        entry = llist_del_all(head);
        entry = llist_reverse_order(entry);

        /* There shouldn't be any pending callbacks on an offline CPU. */
        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
                     !warned && entry != NULL)) {
                warned = true;
                WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

                /*
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
                llist_for_each_entry(csd, entry, llist) {
                        switch (CSD_TYPE(csd)) {
                        case CSD_TYPE_ASYNC:
                        case CSD_TYPE_SYNC:
                        case CSD_TYPE_IRQ_WORK:
                                pr_warn("IPI callback %pS sent to offline CPU\n",
                                        csd->func);
                                break;

                        case CSD_TYPE_TTWU:
                                pr_warn("IPI task-wakeup sent to offline CPU\n");
                                break;

                        default:
                                pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
                                        CSD_TYPE(csd));
                                break;
                        }
                }
        }

        /*
         * First; run all SYNC callbacks, people are waiting for us.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
                /* Do we wait until *after* callback? */
                if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
                        smp_call_func_t func = csd->func;
                        void *info = csd->info;

                        /* Unlink this entry: from its predecessor, or from the list head. */
                        if (prev) {
                                prev->next = &csd_next->llist;
                        } else {
                                entry = &csd_next->llist;
                        }

                        /* SYNC: unlock only after the callback has run. */
                        csd_lock_record(csd);
                        func(info);
                        csd_unlock(csd);
                        csd_lock_record(NULL);
                } else {
                        /* Entry stays on the list; remember it as predecessor. */
                        prev = &csd->llist;
                }
        }

        if (!entry)
                return;

        /*
         * Second; run all !SYNC callbacks.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
                int type = CSD_TYPE(csd);

                if (type != CSD_TYPE_TTWU) {
                        /* Unlink before handling, same scheme as in the first pass. */
                        if (prev) {
                                prev->next = &csd_next->llist;
                        } else {
                                entry = &csd_next->llist;
                        }

                        if (type == CSD_TYPE_ASYNC) {
                                smp_call_func_t func = csd->func;
                                void *info = csd->info;

                                /* ASYNC: unlock before the callback runs. */
                                csd_lock_record(csd);
                                csd_unlock(csd);
                                func(info);
                                csd_lock_record(NULL);
                        } else if (type == CSD_TYPE_IRQ_WORK) {
                                irq_work_single(csd);
                        }

                } else {
                        prev = &csd->llist;
                }
        }

        /*
         * Third; only CSD_TYPE_TTWU is left, issue those.
         */
        if (entry)
                sched_ttwu_pending(entry);
}

/*
 * If this CPU has queued call-function work, run it with interrupts
 * disabled and then call do_softirq() if softirqs are pending.
 */
void flush_smp_call_function_from_idle(void)
{
        unsigned long irq_state;

        if (!llist_empty(this_cpu_ptr(&call_single_queue))) {
                local_irq_save(irq_state);

                flush_smp_call_function_queue(true);
                if (local_softirq_pending())
                        do_softirq();

                local_irq_restore(irq_state);
        }
}

/**
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run @func on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on other CPUs.
 *
 * Returns 0 on success, else a negative status code (-ENXIO from
 * generic_exec_single() when @cpu is out of range or not online).
 */
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                             int wait)
{
        call_single_data_t *csd;
        /* On-stack csd, created already locked; only used when @wait is set. */
        call_single_data_t csd_stack = {
                .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
        };
        int this_cpu;
        int err;

        /*
         * prevent preemption and reschedule on another processor,
         * as well as CPU removal
         */
        this_cpu = get_cpu();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress);

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() on because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        csd = &csd_stack;
        if (!wait) {
                /*
                 * Without @wait this function does not wait for the
                 * callback, so use this CPU's csd_data instead of the
                 * on-stack csd.
                 */
                csd = this_cpu_ptr(&csd_data);
                csd_lock(csd);
        }

        csd->func = func;
        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
        csd->src = smp_processor_id();
        csd->dst = cpu;
#endif

        err = generic_exec_single(cpu, csd);

        /* Returns immediately if generic_exec_single() already unlocked the csd. */
        if (wait)
                csd_lock_wait(csd);

        put_cpu();

        return err;
}
EXPORT_SYMBOL(smp_call_function_single);

/**
 * smp_call_function_single_async(): Run an asynchronous function on a
 *                                  specific CPU.
 * @cpu: The CPU to run on.
 * @csd: Pre-allocated and setup data structure
 *
 * Like smp_call_function_single(), but the call is asynchronous and
 * can thus be done from contexts with disabled interrupts.
 *
 * The caller passes his own pre-allocated data structure
 * (ie: embedded in an object) and is responsible for synchronizing it
 * such that the IPIs performed on the @csd are strictly serialized.
 *
 * If the function is called with one csd which has not yet been
 * processed by previous call to smp_call_function_single_async(), the
 * function will return immediately with -EBUSY showing that the csd
 * object is still in progress.
 *
 * NOTE: Be careful, there is unfortunately no current debugging facility to
 * validate the correctness of this serialization.
 *
 * Returns 0 on success, -EBUSY if @csd is still locked, or the error from
 * generic_exec_single() (-ENXIO for an out-of-range or offline @cpu).
 */
int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
        int err = 0;

        preempt_disable();

        if (csd->flags & CSD_FLAG_LOCK) {
                err = -EBUSY;
                goto out;
        }

        /* Plain store: ->flags becomes exactly CSD_FLAG_LOCK, no type bits. */
        csd->flags = CSD_FLAG_LOCK;
        smp_wmb();

        err = generic_exec_single(cpu, csd);

out:
        preempt_enable();

        return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);

/*
 * smp_call_function_any - Run a function on any of the given cpus
 * @mask: The mask of cpus it can run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed.
 *
 * Returns 0 on success, else a negative status code (if no cpus were online).
 *
 * The target is chosen in this order of preference:
 *        1) the current cpu, if it is in @mask
 *        2) an online cpu of the current node that is in @mask
 *        3) any other online cpu in @mask
 */
int smp_call_function_any(const struct cpumask *mask,
                          smp_call_func_t func, void *info, int wait)
{
        const struct cpumask *node_cpus;
        unsigned int cpu;
        int ret;

        /* Cheapest option first: the CPU we are running on. */
        cpu = get_cpu();
        if (cpumask_test_cpu(cpu, mask))
                goto call;

        /* Next best: walk the CPUs of our own node that are in @mask. */
        node_cpus = cpumask_of_node(cpu_to_node(cpu));
        cpu = cpumask_first_and(node_cpus, mask);
        while (cpu < nr_cpu_ids) {
                if (cpu_online(cpu))
                        goto call;
                cpu = cpumask_next_and(cpu, node_cpus, mask);
        }

        /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
        cpu = cpumask_any_and(mask, cpu_online_mask);
call:
        ret = smp_call_function_single(cpu, func, info, wait);
        put_cpu();
        return ret;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);

/*
 * Queue @func(@info) on every online CPU in @mask except the calling CPU,
 * skipping CPUs for which a non-NULL @cond_func returns false. A NULL
 * @cond_func selects every such CPU. With @wait, return only after the csd
 * of every CPU in the request mask has been unlocked.
 *
 * When exactly one other CPU qualifies, the request is handed to
 * smp_call_function_single() instead of using the per-CPU csd array.
 */
static void smp_call_function_many_cond(const struct cpumask *mask,
                                        smp_call_func_t func, void *info,
                                        bool wait, smp_cond_func_t cond_func)
{
        struct call_function_data *cfd;
        int cpu, next_cpu, this_cpu = smp_processor_id();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress && !early_boot_irqs_disabled);

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() on because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        /* Try to fastpath.  So, what's a CPU they want? Ignoring this one. */
        cpu = cpumask_first_and(mask, cpu_online_mask);
        if (cpu == this_cpu)
                cpu = cpumask_next_and(cpu, mask, cpu_online_mask);

        /* No online cpus?  We're done. */
        if (cpu >= nr_cpu_ids)
                return;

        /* Do we have another CPU which isn't us? */
        next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
        if (next_cpu == this_cpu)
                next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);

        /* Fastpath: do that cpu by itself. */
        if (next_cpu >= nr_cpu_ids) {
                if (!cond_func || cond_func(cpu, info))
                        smp_call_function_single(cpu, func, info, wait);
                return;
        }

        cfd = this_cpu_ptr(&cfd_data);

        /* Request mask: online CPUs of @mask, without ourselves. */
        cpumask_and(cfd->cpumask, mask, cpu_online_mask);
        __cpumask_clear_cpu(this_cpu, cfd->cpumask);

        /* Some callers race with other cpus changing the passed mask */
        if (unlikely(!cpumask_weight(cfd->cpumask)))
                return;

        cpumask_clear(cfd->cpumask_ipi);
        for_each_cpu(cpu, cfd->cpumask) {
                call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

                if (cond_func && !cond_func(cpu, info))
                        continue;

                csd_lock(csd);
                if (wait)
                        csd->flags |= CSD_TYPE_SYNC;
                csd->func = func;
                csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
                csd->src = smp_processor_id();
                csd->dst = cpu;
#endif
                /* Only CPUs for which llist_add() returns true get an IPI. */
                if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
                        __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
        }

        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi_mask(cfd->cpumask_ipi);

        if (wait) {
                /* Iterates the whole request mask, including CPUs @cond_func skipped. */
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}

/**
 * smp_call_function_many(): Run a function on a set of other CPUs.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * The calling CPU is never targeted, even if it is set in @mask.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
 * must be disabled when calling this function.
 */
void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        /* NULL cond_func: no per-CPU filtering beyond @mask. */
        smp_call_function_many_cond(mask, func, info, wait, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);

/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * This function does not return a value.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
        /* smp_call_function_many() requires preemption to be disabled. */
        preempt_disable();
        smp_call_function_many(cpu_online_mask, func, info, wait);
        preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);

/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);


/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

/* Weak empty default; called by the "nosmp" and "maxcpus=0" handlers. */
void __weak arch_disable_smp_support(void) { }

/*
 * "nosmp" early parameter: set setup_max_cpus to 0 and call the arch hook
 * that disables SMP support. @str is not used. Always returns 0.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);

/*
 * "nr_cpus=" early parameter; this is the hard limit. nr_cpu_ids is only
 * ever lowered: the value must parse, be positive and be smaller than the
 * current nr_cpu_ids, otherwise it is ignored. Always returns 0.
 */
static int __init nrcpus(char *str)
{
        int requested;

        if (!get_option(&str, &requested))
                return 0;

        if (requested <= 0 || requested >= nr_cpu_ids)
                return 0;

        nr_cpu_ids = requested;
        return 0;
}

early_param("nr_cpus", nrcpus);

/*
 * "maxcpus=" early parameter: parse the value straight into
 * setup_max_cpus; a resulting value of 0 also disables SMP support via the
 * arch hook. The result of get_option() is not checked. Always returns 0.
 */
static int __init maxcpus(char *str)
{
        get_option(&str, &setup_max_cpus);
        if (setup_max_cpus == 0)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);

/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);

/*
 * Set nr_cpu_ids to one more than the highest bit set in
 * cpu_possible_mask. An arch may set nr_cpu_ids earlier if needed, so
 * this would be redundant.
 */
void __init setup_nr_cpu_ids(void)
{
        unsigned long highest_possible;

        highest_possible = find_last_bit(cpumask_bits(cpu_possible_mask),
                                         NR_CPUS);
        nr_cpu_ids = highest_possible + 1;
}

/*
 * Called by boot processor to activate the rest: initialise the idle and
 * hotplug threads, bring up the non-boot CPUs (bounded by setup_max_cpus),
 * log how many nodes and CPUs are online, then let the arch finish.
 */
void __init smp_init(void)
{
        const char *node_suffix, *cpu_suffix;
        int num_nodes, num_cpus;

        idle_threads_init();
        cpuhp_threads_init();

        pr_info("Bringing up secondary CPUs ...\n");

        bringup_nonboot_cpus(setup_max_cpus);

        num_nodes = num_online_nodes();
        num_cpus = num_online_cpus();

        /* Plural "s" only when there is more than one. */
        node_suffix = (num_nodes > 1) ? "s" : "";
        cpu_suffix = (num_cpus > 1) ? "s" : "";

        pr_info("Brought up %d node%s, %d CPU%s\n",
                num_nodes, node_suffix, num_cpus, cpu_suffix);

        /* Any cleanup work */
        smp_cpus_done(setup_max_cpus);
}

/*
 * Call a function on all processors.  May be used during early boot while
 * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
 * of local_irq_disable/enable().
 *
 * @func: The function to run on every CPU.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: Passed on to smp_call_function() for the other CPUs.
 *
 * The other CPUs are handled through smp_call_function(); afterwards @func
 * is called directly on the calling CPU with interrupts disabled.
 */
void on_each_cpu(smp_call_func_t func, void *info, int wait)
{
        unsigned long flags;

        /* Keep the remote request and the local call on the same CPU. */
        preempt_disable();
        smp_call_function(func, info, wait);
        local_irq_save(flags);
        func(info);
        local_irq_restore(flags);
        preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu);

/**
 * on_each_cpu_mask(): Run a function on processors specified by
 * cpumask, which may include the local processor.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * The other CPUs are handled by smp_call_function_many(); if the calling
 * CPU is in @mask, @func is then run on it directly with interrupts
 * disabled.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.  The
 * exception is that it may be used during early boot while
 * early_boot_irqs_disabled is set.
 */
void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
                        void *info, bool wait)
{
        unsigned long flags;
        int cpu = get_cpu();

        smp_call_function_many(mask, func, info, wait);

        /* Nothing to do locally unless the calling CPU is in @mask. */
        if (!cpumask_test_cpu(cpu, mask))
                goto out;

        local_irq_save(flags);
        func(info);
        local_irq_restore(flags);
out:
        put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_mask);

/**
 * on_each_cpu_cond_mask(): Call a function on each processor in @mask for
 * which the supplied function cond_func returns true, optionally waiting
 * for all the required CPUs to finish. This may include the local
 * processor.
 * @cond_func:        A callback function that is passed a cpu id and
 *                the info parameter. The function is called
 *                with preemption disabled. The function should
 *                return a boolean value indicating whether to IPI
 *                the specified CPU. May be NULL, in which case every
 *                CPU in @mask is selected.
 * @func:        The function to run on all applicable CPUs.
 *                This must be fast and non-blocking.
 * @info:        An arbitrary pointer to pass to both functions.
 * @wait:        If true, wait (atomically) until function has
 *                completed on other CPUs.
 * @mask:        The set of cpus to consider. Remote CPUs are taken from
 *                the online subset of @mask.
 *
 * Preemption is disabled to protect against CPUs going offline but not online.
 * CPUs going online during the call will not be seen or sent an IPI.
 *
 * You must not call this function with disabled interrupts or
 * from a hardware interrupt handler or from a bottom half handler.
 */
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
{
        int cpu = get_cpu();

        smp_call_function_many_cond(mask, func, info, wait, cond_func);

        /*
         * A NULL @cond_func means "unconditionally" for the remote CPUs in
         * smp_call_function_many_cond(); apply the same rule to the local
         * CPU instead of calling through a NULL pointer.
         */
        if (cpumask_test_cpu(cpu, mask) &&
            (!cond_func || cond_func(cpu, info))) {
                unsigned long flags;

                local_irq_save(flags);
                func(info);
                local_irq_restore(flags);
        }
        put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

/*
 * on_each_cpu_cond(): Convenience wrapper around on_each_cpu_cond_mask()
 * that passes cpu_online_mask as the CPU mask.  @cond_func, @func, @info
 * and @wait are forwarded unchanged.
 */
void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
                      void *info, bool wait)
{
        on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);

/*
 * Deliberately empty callback.  kick_all_cpus_sync() passes it to
 * smp_call_function(); @unused is ignored.
 */
static void do_nothing(void *unused)
{
}

/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 *
 * Implemented as a full memory barrier followed by a waiting
 * smp_call_function() of the empty do_nothing() callback.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* Last argument is the wait flag: return only once do_nothing() ran. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);

/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Try to break every online cpu that is in the idle state out of it,
 * including cpus that are polling in idle.  Nothing is done for cpus
 * that are not idle, and the calling cpu is skipped.
 */
void wake_up_all_idle_cpus(void)
{
        int target;

        /* Keep smp_processor_id() stable for the duration of the walk. */
        preempt_disable();
        for_each_online_cpu(target) {
                if (target != smp_processor_id())
                        wake_up_if_idle(target);
        }
        preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);

/**
 * smp_call_on_cpu - Call a function on a specific cpu
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
/*
 * Request descriptor shared between smp_call_on_cpu() (which builds it on
 * its stack and waits) and smp_call_on_cpu_callback() (which executes it).
 */
struct smp_call_on_cpu_struct {
        /* Work item queued with queue_work_on(); embeds this descriptor. */
        struct work_struct        work;
        /* Completed by the callback once @func has returned. */
        struct completion        done;
        /* Function to invoke, called as func(data). */
        int                        (*func)(void *);
        /* Opaque argument handed to @func. */
        void                        *data;
        /* Return value of @func, filled in by the callback. */
        int                        ret;
        /* CPU to pin the vcpu to around the call, or negative for no pinning. */
        int                        cpu;
};

/*
 * Work handler for smp_call_on_cpu(): runs the requested function, stores
 * its return value and signals the waiter.  When the request carries a
 * non-negative cpu, the vcpu is pinned to it for the duration of the call
 * and unpinned (-1) afterwards.
 */
static void smp_call_on_cpu_callback(struct work_struct *work)
{
        struct smp_call_on_cpu_struct *req =
                container_of(work, struct smp_call_on_cpu_struct, work);

        if (req->cpu >= 0)
                hypervisor_pin_vcpu(req->cpu);

        req->ret = req->func(req->data);

        if (req->cpu >= 0)
                hypervisor_pin_vcpu(-1);

        /* Wake the caller blocked in wait_for_completion(). */
        complete(&req->done);
}

/*
 * smp_call_on_cpu(): run @func(@par) on @cpu and wait for it to return.
 * @cpu:  Target cpu; must be below nr_cpu_ids and online.
 * @func: Function to run; its return value is passed back to the caller.
 * @par:  Opaque argument handed to @func.
 * @phys: If true, @cpu is recorded in the request so that the work handler
 *        pins the vcpu to it (hypervisor_pin_vcpu()) around the call.
 *
 * The call is executed from a work item queued on system_wq for @cpu.
 *
 * Returns -ENXIO if @cpu is out of range or offline, otherwise whatever
 * @func returned.
 */
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
        struct smp_call_on_cpu_struct sscs = {
                .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
                .func = func,
                .data = par,
                .cpu  = phys ? cpu : -1,
        };

        /*
         * Validate the target before INIT_WORK_ONSTACK() so that the error
         * path never leaves an initialised on-stack work item without the
         * matching destroy_work_on_stack().
         */
        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                return -ENXIO;

        INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);

        queue_work_on(cpu, system_wq, &sscs.work);
        /* The handler completes sscs.done after @func has returned. */
        wait_for_completion(&sscs.done);
        destroy_work_on_stack(&sscs.work);

        return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);













    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/*
 * REP NOP (PAUSE) is a good thing to insert into busy-wait loops.
 *
 * Emits the "rep; nop" instruction sequence.  The "memory" clobber tells
 * the compiler that memory may have changed across the statement, so values
 * polled in the surrounding loop are reloaded rather than cached.
 */
static __always_inline void rep_nop(void)
{
        asm volatile("rep; nop" ::: "memory");
}

/* Busy-wait hint for spin loops; on this architecture simply rep_nop(). */
static __always_inline void cpu_relax(void)
{
        rep_nop();
}

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM tlb

#if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TLB_H

#include <linux/mm_types.h>
#include <linux/tracepoint.h>

/*
 * X-macro list of TLB flush reasons: each entry pairs an enum constant with
 * the string printed for it.  The list is expanded twice below with
 * different definitions of EM()/EMe(); EMe() marks the last entry so that
 * the second expansion can omit the trailing comma.
 */
#define TLB_FLUSH_REASON                                                \
        EM(  TLB_FLUSH_ON_TASK_SWITCH,        "flush on task switch" )        \
        EM(  TLB_REMOTE_SHOOTDOWN,        "remote shootdown" )                \
        EM(  TLB_LOCAL_SHOOTDOWN,        "local shootdown" )                \
        EM(  TLB_LOCAL_MM_SHOOTDOWN,        "local mm shootdown" )                \
        EMe( TLB_REMOTE_SEND_IPI,        "remote ipi send" )

/*
 * First define the enums in TLB_FLUSH_REASON to be exported to userspace
 * via TRACE_DEFINE_ENUM().
 *
 * In this pass both EM() and EMe() expand to a TRACE_DEFINE_ENUM()
 * statement for the enum constant; the string argument is ignored.
 */
#undef EM
#undef EMe
#define EM(a,b)                TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

TLB_FLUSH_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 *
 * In this pass each entry becomes a { value, "string" } initializer; EM()
 * appends a comma, EMe() (last entry) does not.  The expansion is consumed
 * by __print_symbolic() in the event definition below.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

/*
 * tlb_flush trace event.
 *
 * Records two values per event: @reason (an int, decoded to text through
 * the TLB_FLUSH_REASON table and also printed numerically) and @pages
 * (an unsigned long).
 *
 * NOTE(review): @pages is unsigned long but is printed with %ld.  This is
 * presumably intentional so that an all-ones "flush everything" sentinel
 * shows up as -1; confirm against the callers before changing the format.
 */
TRACE_EVENT(tlb_flush,

        TP_PROTO(int reason, unsigned long pages),
        TP_ARGS(reason, pages),

        TP_STRUCT__entry(
                __field(          int, reason)
                __field(unsigned long,  pages)
        ),

        TP_fast_assign(
                __entry->reason = reason;
                __entry->pages  = pages;
        ),

        TP_printk("pages:%ld reason:%s (%d)",
                __entry->pages,
                __print_symbolic(__entry->reason, TLB_FLUSH_REASON),
                __entry->reason)
);

#endif /* _TRACE_TLB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































    3 




    2 







    2 




















































    2 





















































    1 















    2 












    3 

















































































































    1 




















    1 





































    2 



    3 













































































    1 













































































    2 


    1 
























    2 










































































































































































    3 



















    3 



    3 
    2 
    3 
    3 









































    2 






    2 
    2 



    2 



























    2 



















    2 








    2 






    2 






































    2 







    1 























































    2 
    2 























































































    2 

    2 
    2 










    1 























    2 
















































































































































































































































    1 











    1 




































    1 






    1 



    1 


    1 

    1 






































    1 









    2 







    2 


    2 






    2 
    1 

    1 
    2 

















    1 

    1 

    1 






    1 

    2 


    2 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>                /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

/*
 * Radix tree node cache.
 *
 * Slab cache that struct radix_tree_node objects are allocated from
 * (kmem_cache_alloc()) and returned to (kmem_cache_free()) in this file.
 */
struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence the preload size is two full paths minus the one shared node:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 *
 * IDR_INDEX_BITS:   number of value bits in an int (all bits but the sign).
 * IDR_MAX_PATH:     levels needed to cover IDR_INDEX_BITS at
 *                   RADIX_TREE_MAP_SHIFT bits per level, rounded up.
 * IDR_PRELOAD_SIZE: same "two paths minus the shared node" formula as
 *                   RADIX_TREE_PRELOAD_SIZE, using IDR_MAX_PATH.
 */
#define IDR_INDEX_BITS                (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH                (DIV_ROUND_UP(IDR_INDEX_BITS, \
                                                RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)

/*
 * Per-cpu pool of preloaded nodes
 *
 * The pool is a singly linked list headed by ->nodes and chained through
 * each node's ->parent pointer, with ->nr counting the entries.  It is
 * filled by __radix_tree_preload() under ->lock and consumed by
 * radix_tree_node_alloc().
 */
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
        .lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);

/* Clear the RADIX_TREE_INTERNAL_NODE bit(s) of @ptr to get the node pointer. */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        raw &= ~RADIX_TREE_INTERNAL_NODE;
        return (void *)raw;
}

/* Set the RADIX_TREE_INTERNAL_NODE bit(s) on @ptr to form a tree entry. */
static inline void *node_to_entry(void *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        raw |= RADIX_TREE_INTERNAL_NODE;
        return (void *)raw;
}

/* The radix tree's retry marker is an alias for the XArray's retry entry. */
#define RADIX_TREE_RETRY        XA_RETRY_ENTRY

/*
 * Index of @slot within @parent's slots[] array, or 0 when there is no
 * parent node.
 */
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
        if (!parent)
                return 0;

        return slot - parent->slots;
}

/*
 * Step one level down from @parent towards @index.
 *
 * The slot number is taken from the bits of @index selected by
 * @parent->shift.  The entry read from that slot is stored in *@nodep and
 * the slot number is returned.
 */
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
{
        unsigned int slot_nr;

        slot_nr = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
        *nodep = (void *)rcu_dereference_raw(parent->slots[slot_nr]);

        return slot_nr;
}

/*
 * Extract the allocation flags stored in @root->xa_flags: the bits inside
 * __GFP_BITS_MASK, excluding the GFP_ZONEMASK bits.
 */
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
        const gfp_t keep = __GFP_BITS_MASK & ~GFP_ZONEMASK;

        return root->xa_flags & keep;
}

/* Set bit @offset in @node's bitmap for @tag. */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __set_bit(offset, bitmap);
}

/* Clear bit @offset in @node's bitmap for @tag. */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __clear_bit(offset, bitmap);
}

/* Test bit @offset in @node's bitmap for @tag; non-zero when it is set. */
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        const unsigned long *bitmap = node->tags[tag];

        return test_bit(offset, bitmap);
}

/* Set the root-level bit for @tag, stored in xa_flags above ROOT_TAG_SHIFT. */
static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
        const unsigned int bit = tag + ROOT_TAG_SHIFT;

        root->xa_flags |= (__force gfp_t)(1 << bit);
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}

/* True when @root has the ROOT_IS_IDR flag set in xa_flags. */
static inline bool is_idr(const struct radix_tree_root *root)
{
        return (root->xa_flags & ROOT_IS_IDR) != 0;
}

/*
 * Scan the RADIX_TREE_TAG_LONGS words of @node's bitmap for @tag.
 * Returns 1 as soon as a non-zero word is found (some slot has the tag
 * set), 0 if every word is zero.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
                                                        unsigned int tag)
{
        const unsigned long *words = node->tags[tag];
        unsigned int i = 0;

        while (i < RADIX_TREE_TAG_LONGS) {
                if (words[i++])
                        return 1;
        }

        return 0;
}

/* Set @tag for every one of the RADIX_TREE_MAP_SIZE slots of @node. */
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
        unsigned long *bitmap = node->tags[tag];

        bitmap_fill(bitmap, RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next set tag bit in a node
 *
 * @node: The node whose tag bitmap is searched
 * @tag: Which of the node's tag bitmaps to search
 * @offset: The bitnumber to start searching at
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from RADIX_TREE_MAP_SIZE to
 * roundup(RADIX_TREE_MAP_SIZE, BITS_PER_LONG) must be zero.
 * Returns next bit offset, or RADIX_TREE_MAP_SIZE if nothing found.
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
                         unsigned long offset)
{
        const unsigned long *word;
        unsigned long bits;

        if (offset >= RADIX_TREE_MAP_SIZE)
                return RADIX_TREE_MAP_SIZE;

        /* First word: discard the bits below the starting offset. */
        word = &node->tags[tag][offset / BITS_PER_LONG];
        bits = *word >> (offset % BITS_PER_LONG);
        if (bits)
                return __ffs(bits) + offset;

        /* Remaining words, starting at the next word boundary. */
        for (offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
             offset < RADIX_TREE_MAP_SIZE;
             offset += BITS_PER_LONG) {
                bits = *++word;
                if (bits)
                        return __ffs(bits) + offset;
        }

        return RADIX_TREE_MAP_SIZE;
}

/* Slot number of @iter's current index: its low RADIX_TREE_MAP_MASK bits. */
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
        unsigned long idx = iter->index;

        return idx & RADIX_TREE_MAP_MASK;
}

/*
 * The maximum index which can be stored in a radix tree whose top node
 * has the given @shift: RADIX_TREE_MAP_SIZE slots, each spanning
 * 1 << @shift indices, minus one.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
        unsigned long span = RADIX_TREE_MAP_SIZE << shift;

        return span - 1;
}

/* Largest index offset covered by @node, i.e. shift_maxindex(@node->shift). */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
        return shift_maxindex(node->shift);
}

/*
 * Compute the index of slot @offset in @node, given any @index that lies
 * within @node: the bits of @index that @node covers are cleared and
 * @offset, scaled by @node->shift, is added.
 */
static unsigned long next_index(unsigned long index,
                                const struct radix_tree_node *node,
                                unsigned long offset)
{
        const unsigned long base = index & ~node_maxindex(node);
        const unsigned long delta = offset << node->shift;

        return base + delta;
}

/*
 * Allocate a radix tree node and initialise its bookkeeping fields.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 *
 * @gfp_mask:  allocation flags; also selects which allocation path is used
 * @parent:    stored in the new node's ->parent
 * @root:      stored in the new node's ->array
 * @shift:     stored in ->shift
 * @offset:    stored in ->offset
 * @count:     stored in ->count
 * @nr_values: stored in ->nr_values
 *
 * Returns the new node, or NULL if neither the slab cache nor (on the
 * non-blocking path) the per-cpu preload pool could supply one.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
                        struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int nr_values)
{
        struct radix_tree_node *ret = NULL;

        /*
         * Preload code isn't irq safe and it doesn't make sense to use
         * preloading during an interrupt anyway as all the allocations have
         * to be atomic. So just do normal allocation when in interrupt.
         */
        if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
                struct radix_tree_preload *rtp;

                /*
                 * Even if the caller has preloaded, try to allocate from the
                 * cache first for the new node to get accounted to the memory
                 * cgroup.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep,
                                       gfp_mask | __GFP_NOWARN);
                if (ret)
                        goto out;

                /*
                 * Provided the caller has preloaded here, we will always
                 * succeed in getting a node here (and never reach
                 * kmem_cache_alloc)
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr) {
                        /* Pop the head of the pool; it is chained via ->parent. */
                        ret = rtp->nodes;
                        rtp->nodes = ret->parent;
                        rtp->nr--;
                }
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 *
                 * NOTE(review): ret is still NULL here when the pool was
                 * empty; presumably kmemleak_update_trace() tolerates NULL.
                 */
                kmemleak_update_trace(ret);
                goto out;
        }
        /* Blocking allowed, or in interrupt: plain slab allocation. */
        ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
        BUG_ON(radix_tree_is_internal_node(ret));
        if (ret) {
                ret->shift = shift;
                ret->offset = offset;
                ret->count = count;
                ret->nr_values = nr_values;
                ret->parent = parent;
                ret->array = root;
        }
        return ret;
}

/*
 * RCU callback that returns a radix tree node to the slab cache.
 * @head: the rcu_head embedded in the node being freed.
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *dead;

        dead = container_of(head, struct radix_tree_node, rcu_head);

        /*
         * Only zeroed nodes may go back into the slab.
         * radix_tree_free_nodes can leave non-NULL entries behind, so wipe
         * the slots and the tags before freeing.
         */
        memset(dead->slots, 0, sizeof(dead->slots));
        memset(dead->tags, 0, sizeof(dead->tags));
        INIT_LIST_HEAD(&dead->private_list);

        kmem_cache_free(radix_tree_node_cachep, dead);
}

/*
 * Free @node after an RCU grace period: the actual release is deferred to
 * radix_tree_node_rcu_free() via call_rcu() on the node's embedded rcu_head.
 */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * In terms of this code: success returns with radix_tree_preloads.lock
 * held (taken with local_lock()); failure returns with it released.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 *
 * @gfp_mask: allocation flags for the nodes (__GFP_ACCOUNT is stripped)
 * @nr:       number of nodes the per-cpu pool must hold on return
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;
        int ret = -ENOMEM;

        /*
         * Nodes preloaded by one cgroup can be used by another cgroup, so
         * they should never be accounted to any particular memory cgroup.
         */
        gfp_mask &= ~__GFP_ACCOUNT;

        local_lock(&radix_tree_preloads.lock);
        rtp = this_cpu_ptr(&radix_tree_preloads);
        while (rtp->nr < nr) {
                /*
                 * The lock is dropped around the allocation (presumably
                 * because the allocation may sleep - confirm).
                 */
                local_unlock(&radix_tree_preloads.lock);
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                if (node == NULL)
                        goto out;
                local_lock(&radix_tree_preloads.lock);
                /*
                 * Re-read the per-cpu pointer and re-check the fill level:
                 * the pool seen now may differ from the one seen before
                 * the lock was dropped.
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr < nr) {
                        /* Push onto the pool; nodes are chained via ->parent. */
                        node->parent = rtp->nodes;
                        rtp->nodes = node;
                        rtp->nr++;
                } else {
                        /* Pool is already full; the new node is surplus. */
                        kmem_cache_free(radix_tree_node_cachep, node);
                }
        }
        ret = 0;
out:
        return ret;
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 *
 * This is __radix_tree_preload() with a pool target of
 * RADIX_TREE_PRELOAD_SIZE nodes.  A @gfp_mask that does not allow blocking
 * triggers a one-time warning but the preload is still attempted.
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        /* Warn on non-sensical use... */
        WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * Like radix_tree_preload(), except that preloading is not guaranteed to
 * happen: it is only done when @gfp_mask allows blocking.  On success,
 * return zero with preemption disabled (radix_tree_preloads.lock is held
 * in both the preloading and the non-preloading case).  On error, return
 * -ENOMEM with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
        if (!gfpflags_allow_blocking(gfp_mask)) {
                /*
                 * Preloading cannot help with this gfp mask; just take the
                 * lock so the caller sees the same state as after a preload.
                 */
                local_lock(&radix_tree_preloads.lock);
                return 0;
        }

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Read the head entry of @root.
 *
 * *@nodep receives the raw head entry as stored in the tree.  If the head
 * is an internal node, *@maxindex is set to the largest index that node
 * covers and the return value is its shift plus RADIX_TREE_MAP_SHIFT.
 * Otherwise *@maxindex is 0 and 0 is returned.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
{
        struct radix_tree_node *head = rcu_dereference_raw(root->xa_head);

        *nodep = head;

        if (unlikely(!radix_tree_is_internal_node(head))) {
                *maxindex = 0;
                return 0;
        }

        head = entry_to_node(head);
        *maxindex = node_maxindex(head);
        return head->shift + RADIX_TREE_MAP_SHIFT;
}

/*
 *        Extend a radix tree so it can store key @index.
 *
 *        @root:  tree to grow
 *        @gfp:   allocation flags for the new nodes
 *        @index: key that must become addressable
 *        @shift: shift to give the first newly allocated top node
 *                (presumably the current top-level shift as returned by
 *                radix_tree_load_root() - confirm against the caller)
 *
 *        New top nodes are stacked above the current head, one per
 *        RADIX_TREE_MAP_SHIFT step, until the tree covers @index; each
 *        takes the previous head in slot 0.
 *
 *        Returns the required shift plus RADIX_TREE_MAP_SHIFT on success,
 *        or -ENOMEM if a node allocation fails (nodes already linked in
 *        stay in the tree).
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
{
        void *entry;
        unsigned int maxshift;
        int tag;

        /* Figure out what the shift should be.  */
        maxshift = shift;
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;

        entry = rcu_dereference_raw(root->xa_head);
        /*
         * Empty tree: nothing to move under a new node, so no allocation
         * is needed - unless this is an IDR whose root lacks IDR_FREE.
         */
        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;

        do {
                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
                                                        root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;

                if (is_idr(root)) {
                        /*
                         * Every slot of the new node starts out free; slot 0
                         * is free only if the root was tagged free.  The
                         * root is tagged IDR_FREE afterwards in either case.
                         */
                        all_tag_set(node, IDR_FREE);
                        if (!root_tag_get(root, IDR_FREE)) {
                                tag_clear(node, IDR_FREE, 0);
                                root_tag_set(root, IDR_FREE);
                        }
                } else {
                        /* Propagate the aggregated tag info to the new child */
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                                if (root_tag_get(root, tag))
                                        tag_set(node, tag, 0);
                        }
                }

                BUG_ON(shift > BITS_PER_LONG);
                if (radix_tree_is_internal_node(entry)) {
                        /* Old head becomes a child of the new node. */
                        entry_to_node(entry)->parent = node;
                } else if (xa_is_value(entry)) {
                        /* Moving a value entry root->xa_head to a node */
                        node->nr_values = 1;
                }
                /*
                 * entry was already in the radix tree, so we do not need
                 * rcu_assign_pointer here
                 */
                node->slots[0] = (void __rcu *)entry;
                entry = node_to_entry(node);
                /* Publish the new head only after slot 0 has been filled. */
                rcu_assign_pointer(root->xa_head, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
out:
        return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 *        radix_tree_shrink    -    shrink radix tree to minimum height
 *        @root:                radix tree root
 *
 *        As long as the root is an internal node whose only entry sits in
 *        slot 0, that entry is moved up into @root->xa_head and the former
 *        root node is freed.
 *
 *        Returns true if at least one node was freed.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
        bool shrunk = false;

        for (;;) {
                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
                struct radix_tree_node *child;

                /* The root is not an internal node, so there is no level to remove. */
                if (!radix_tree_is_internal_node(node))
                        break;
                node = entry_to_node(node);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (node->count != 1)
                        break;
                child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;

                /*
                 * For an IDR, we must not shrink entry 0 into the root in
                 * case somebody calls idr_replace() with a pointer that
                 * appears to be an internal entry
                 */
                if (!node->shift && is_idr(root))
                        break;

                /* The child node is about to become the root node. */
                if (radix_tree_is_internal_node(child))
                        entry_to_node(child)->parent = NULL;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->xa_head) as far as dependent read barriers go.
                 */
                root->xa_head = (void __rcu *)child;
                /* Slot 0 was not free, so the IDR root must not claim to be free. */
                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
                        root_tag_clear(root, IDR_FREE);

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page has 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                /* One level removed; loop again in case the new root also qualifies. */
                shrunk = true;
        }

        return shrunk;
}

/*
 * Free @node if it holds no entries, then walk towards the root freeing
 * every ancestor that becomes empty as a result.  If the walk stops at a
 * node that still has entries and that node is the one referenced by
 * @root->xa_head, the tree is additionally passed to radix_tree_shrink().
 *
 * Returns true if any node was freed, here or by radix_tree_shrink().
 */
static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node)
{
        bool freed = false;

        /* Climb for as long as the current node is empty. */
        while (!node->count) {
                struct radix_tree_node *up = node->parent;

                if (up) {
                        /* Unhook the empty node from its parent. */
                        up->slots[node->offset] = NULL;
                        up->count--;
                } else {
                        /*
                         * The topmost node is empty.  Shouldn't the tags
                         * already have all been cleared by the caller?
                         */
                        if (!is_idr(root))
                                root_tag_clear_all(root);
                        root->xa_head = NULL;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                freed = true;

                /* Nothing above the topmost node is left to inspect. */
                if (!up)
                        return freed;
                node = up;
        }

        /* Stopped at a populated node: shrink only if it is the root node. */
        if (node_to_entry(node) == rcu_dereference_raw(root->xa_head))
                freed |= radix_tree_shrink(root);

        return freed;
}

/**
 *        __radix_tree_create        -        create a slot in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @nodep:                returns node (may be NULL if the caller does not need it)
 *        @slotp:                returns slot (may be NULL if the caller does not need it)
 *
 *        Create, if necessary, and return the node and slot for an item
 *        at position @index in the radix tree @root.
 *
 *        Until there is more than one item in the tree, no nodes are
 *        allocated and @root->xa_head is used as a direct slot instead of
 *        pointing to a node, in which case *@nodep will be NULL.
 *
 *        Returns -ENOMEM, or 0 for success.
 */
static int __radix_tree_create(struct radix_tree_root *root,
                unsigned long index, struct radix_tree_node **nodep,
                void __rcu ***slotp)
{
        struct radix_tree_node *node = NULL, *child;
        /* Start at the root's own slot; it is replaced as we descend. */
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex;
        unsigned int shift, offset = 0;
        unsigned long max = index;
        gfp_t gfp = root_gfp_mask(root);

        shift = radix_tree_load_root(root, &child, &maxindex);

        /* Make sure the tree is high enough.  */
        if (max > maxindex) {
                int error = radix_tree_extend(root, gfp, max, shift);
                if (error < 0)
                        return error;
                /* A non-negative result is the shift to start descending from. */
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }

        while (shift > 0) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return -ENOMEM;
                        /* Publish the new node in the slot we arrived through. */
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        /* A non-node entry occupies this position: stop here. */
                        break;

                /* Go a level down */
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);
                slot = &node->slots[offset];
        }

        /* Both output pointers are optional. */
        if (nodep)
                *nodep = node;
        if (slotp)
                *slotp = slot;
        return 0;
}

/*
 * Free any nodes below this node.  The tree is presumed to not need
 * shrinking, and any user data in the tree is presumed to not need a
 * destructor called on it.  If we need to add a destructor, we can
 * add that functionality later.  Note that we may not clear tags or
 * slots from the tree as an RCU walker may still have a pointer into
 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 *
 * @node is passed in its encoded entry form; it is decoded with
 * entry_to_node() before use.  The node it refers to is freed as well,
 * and freeing it ends the walk.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
        unsigned offset = 0;
        struct radix_tree_node *child = entry_to_node(node);

        for (;;) {
                void *entry = rcu_dereference_raw(child->slots[offset]);
                /* Descend into child nodes, but only from non-bottom levels. */
                if (xa_is_node(entry) && child->shift) {
                        child = entry_to_node(entry);
                        offset = 0;
                        continue;
                }
                offset++;
                /* All slots of this node visited: free it and resume in the parent. */
                while (offset == RADIX_TREE_MAP_SIZE) {
                        struct radix_tree_node *old = child;
                        /* Read the parent link before the node is freed. */
                        offset = child->offset + 1;
                        child = child->parent;
                        WARN_ON_ONCE(!list_empty(&old->private_list));
                        radix_tree_node_free(old);
                        /* The starting node itself has been freed: done. */
                        if (old == entry_to_node(node))
                                return;
                }
        }
}

/*
 * Store @item in the empty slot @slot and account for it in @node, if
 * there is one.  @replace is accepted but not consulted.
 *
 * Returns 1 when the item was stored, or -EEXIST if the slot is occupied.
 */
static inline int insert_entries(struct radix_tree_node *node,
                void __rcu **slot, void *item, bool replace)
{
        /* Never overwrite an entry that is already present. */
        if (*slot)
                return -EEXIST;

        rcu_assign_pointer(*slot, item);

        /* Without a node there are no counters to update. */
        if (!node)
                return 1;

        node->count++;
        if (xa_is_value(item))
                node->nr_values++;

        return 1;
}

/**
 *        radix_tree_insert    -    insert into a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @item:                item to insert
 *
 *        Insert an item into the radix tree at position @index.
 *
 *        Returns 0 on success, the error reported by __radix_tree_create()
 *        if the slot could not be set up, or the negative error reported by
 *        insert_entries() if the item could not be stored.
 */
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        void *item)
{
        struct radix_tree_node *node;
        void __rcu **slot;
        unsigned offset;
        int ret;

        /* Internal node entries must never be stored as items. */
        BUG_ON(radix_tree_is_internal_node(item));

        ret = __radix_tree_create(root, index, &node, &slot);
        if (ret)
                return ret;

        ret = insert_entries(node, slot, item, false);
        if (ret < 0)
                return ret;

        /* The slot just filled must not carry any tags. */
        if (!node) {
                BUG_ON(root_tags_get(root));
                return 0;
        }

        offset = get_slot_offset(node, slot);
        BUG_ON(tag_get(node, 0, offset));
        BUG_ON(tag_get(node, 1, offset));
        BUG_ON(tag_get(node, 2, offset));

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/**
 *        __radix_tree_lookup        -        lookup an item in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @nodep:                returns node (may be NULL if the caller does not need it)
 *        @slotp:                returns slot (may be NULL if the caller does not need it)
 *
 *        Lookup and return the item at position @index in the radix
 *        tree @root.
 *
 *        Until there is more than one item in the tree, no nodes are
 *        allocated and @root->xa_head is used as a direct slot instead of
 *        pointing to a node, in which case *@nodep will be NULL.
 *
 *        Returns the entry found at @index.  NULL is returned when @index
 *        lies beyond the tree's current maximum index; in that case
 *        *@nodep and *@slotp are left untouched.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index, struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
        struct radix_tree_node *node, *parent;
        unsigned long maxindex;
        void __rcu **slot;

 restart:
        /* Every attempt starts again from the root slot. */
        parent = NULL;
        slot = (void __rcu **)&root->xa_head;
        radix_tree_load_root(root, &node, &maxindex);
        if (index > maxindex)
                return NULL;

        while (radix_tree_is_internal_node(node)) {
                unsigned offset;

                parent = entry_to_node(node);
                offset = radix_tree_descend(parent, &node, index);
                slot = parent->slots + offset;
                /* A retry marker was found in the slot: start over from the root. */
                if (node == RADIX_TREE_RETRY)
                        goto restart;
                /* Bottom level reached; whatever is in the slot is the result. */
                if (parent->shift == 0)
                        break;
        }

        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = slot;
        return node;
}

/**
 *        radix_tree_lookup_slot    -    lookup a slot in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *
 *        Returns:  the slot corresponding to the position @index in the
 *        radix tree @root, or NULL if the lookup finds no entry there.
 *        This is useful for update-if-exists operations.
 *
 *        This function can be called under rcu_read_lock iff the slot is not
 *        modified by radix_tree_replace_slot, otherwise it must be called
 *        exclusive from other writers. Any dereference of the slot must be done
 *        using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
                                unsigned long index)
{
        void __rcu **slot;
        void *entry = __radix_tree_lookup(root, index, NULL, &slot);

        /* Only hand out the slot when something is stored in it. */
        if (entry)
                return slot;
        return NULL;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);

/**
 *        radix_tree_lookup    -    perform lookup operation on a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *
 *        Lookup the item at the position @index in the radix tree @root and
 *        return it; the result is whatever __radix_tree_lookup() reports.
 *
 *        This function can be called under rcu_read_lock, however the caller
 *        must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 *        them safely). No RCU barriers are required to access or modify the
 *        returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
        /* The containing node and slot are of no interest here. */
        void *item = __radix_tree_lookup(root, index, NULL, NULL);

        return item;
}
EXPORT_SYMBOL(radix_tree_lookup);

/*
 * Apply the @count and @values deltas to @node (when there is a node and
 * at least one delta is non-zero), then publish @item in @slot.
 */
static void replace_slot(void __rcu **slot, void *item,
                struct radix_tree_node *node, int count, int values)
{
        const bool adjust = node && (count || values);

        if (adjust) {
                node->count += count;
                node->nr_values += values;
        }

        rcu_assign_pointer(*slot, item);
}

/*
 * Report whether @tag is set for slot @offset of @node, or for the root
 * itself when @node is NULL.
 */
static bool node_tag_get(const struct radix_tree_root *root,
                                const struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        return node ? tag_get(node, tag, offset) : root_tag_get(root, tag);
}

/*
 * Work out by how much the entry count changes when @old is replaced
 * by @item in @slot.
 *
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot,
                                void *item, void *old)
{
        const int delta = !!item - !!old;
        unsigned offset;

        /* Outside the IDR, presence is simply NULL versus non-NULL. */
        if (!is_idr(root))
                return delta;

        offset = get_slot_offset(node, slot);
        if (!node_tag_get(root, node, IDR_FREE, offset))
                return 0;

        /* A free slot that held NULL always gains one entry. */
        return old ? delta : 1;
}

/**
 * __radix_tree_replace                - replace item in a slot
 * @root:                radix tree root
 * @node:                pointer to tree node
 * @slot:                pointer to slot in @node
 * @item:                new item to store in the slot.
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * When @node is given, it is passed to delete_node() after the store so
 * that a node left without entries can be freed.
 */
void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
                          void __rcu **slot, void *item)
{
        void *prev = rcu_dereference_raw(*slot);
        int value_delta = !!xa_is_value(item) - !!xa_is_value(prev);
        int count_delta = calculate_count(root, node, slot, item, prev);
        bool is_root_slot = (slot == (void __rcu **)&root->xa_head);

        /*
         * This function supports replacing value entries and
         * deleting entries, but that needs accounting against the
         * node unless the slot is root->xa_head.
         */
        WARN_ON_ONCE(!node && !is_root_slot &&
                        (count_delta || value_delta));
        replace_slot(slot, item, node, count_delta, value_delta);

        if (node)
                delete_node(root, node);
}

/**
 * radix_tree_replace_slot        - replace item in a slot
 * @root:        radix tree root
 * @slot:        pointer to slot
 * @item:        new item to store in the slot.
 *
 * For use with radix_tree_lookup_slot() and
 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * NOTE: This cannot be used to switch between non-entries (empty slots),
 * regular entries, and value entries, as that requires accounting
 * inside the radix tree node. When switching from one type of entry or
 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
 * radix_tree_iter_replace().
 */
void radix_tree_replace_slot(struct radix_tree_root *root,
                             void __rcu **slot, void *item)
{
        /* No node is known here, so no per-node accounting can be done. */
        struct radix_tree_node *no_node = NULL;

        __radix_tree_replace(root, no_node, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);

/**
 * radix_tree_iter_replace - replace item in a slot
 * @root:        radix tree root
 * @iter:        iterator state; supplies the node containing @slot
 * @slot:        pointer to slot
 * @item:        new item to store in the slot.
 *
 * For use with radix_tree_for_each_slot().
 * Caller must hold tree write locked.
 */
void radix_tree_iter_replace(struct radix_tree_root *root,
                                const struct radix_tree_iter *iter,
                                void __rcu **slot, void *item)
{
        struct radix_tree_node *owner = iter->node;

        __radix_tree_replace(root, owner, slot, item);
}

/*
 * Set @tag on slot @offset of @node and on the corresponding slot of each
 * ancestor, stopping early at the first level where it is already set.
 * If the walk runs past the topmost node, the root's tag is set as well.
 */
static void node_tag_set(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Already tagged here, so nothing more to do. */
                if (tag_get(node, tag, offset))
                        return;
                tag_set(node, tag, offset);
        }

        if (root_tag_get(root, tag))
                return;
        root_tag_set(root, tag);
}

/**
 *        radix_tree_tag_set - set a tag on a radix tree node
 *        @root:                radix tree root
 *        @index:                index key
 *        @tag:                tag index
 *
 *        Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *        corresponding to @index in the radix tree.  From
 *        the root all the way down to the leaf node.
 *
 *        Returns the address of the tagged item.  Setting a tag on a not-present
 *        item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        radix_tree_load_root(root, &entry, &maxindex);
        BUG_ON(index > maxindex);

        /* Tag every slot on the path from the top node down to the item. */
        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *cur = entry_to_node(entry);
                unsigned pos = radix_tree_descend(cur, &entry, index);

                BUG_ON(!entry);

                if (tag_get(cur, tag, pos))
                        continue;
                tag_set(cur, tag, pos);
        }

        /* set the root's tag bit */
        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);

        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/*
 * Clear @tag on slot @offset of @node, then keep clearing it on the
 * corresponding slot of each ancestor for as long as the node just
 * handled has no slot left with @tag set.  If the walk runs past the
 * topmost node, the root's tag is cleared as well.
 */
static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Not tagged at this level, so nothing above changes. */
                if (!tag_get(node, tag, offset))
                        return;
                tag_clear(node, tag, offset);
                /* Another slot still carries the tag; the parent keeps it. */
                if (any_tag_set(node, tag))
                        return;
        }

        /* clear the root's tag bit */
        if (root_tag_get(root, tag))
                root_tag_clear(root, tag);
}

/**
 *        radix_tree_tag_clear - clear a tag on a radix tree node
 *        @root:                radix tree root
 *        @index:                index key
 *        @tag:                tag index
 *
 *        Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *        corresponding to @index in the radix tree.  If this causes
 *        the leaf node to have no tags set then clear the tag in the
 *        next-to-leaf node, etc.
 *
 *        Returns the address of the tagged item on success, else NULL.  ie:
 *        has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *node, *parent;
        unsigned long maxindex;
        /*
         * Must be initialized: when the root holds a direct entry the
         * descent loop below never runs, yet offset is still passed to
         * node_tag_clear() (together with a NULL parent).
         */
        int offset = 0;

        radix_tree_load_root(root, &node, &maxindex);
        if (index > maxindex)
                return NULL;

        parent = NULL;

        /* Walk down to the item, remembering the last node and slot used. */
        while (radix_tree_is_internal_node(node)) {
                parent = entry_to_node(node);
                offset = radix_tree_descend(parent, &node, index);
        }

        /* Only a present item can have its tag cleared. */
        if (node)
                node_tag_clear(root, parent, tag, offset);

        return node;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
 * @root: radix tree root
 * @iter: iterator state
 * @tag: tag to clear
 *
 * The node and slot offset are taken from @iter and handed to
 * node_tag_clear().
 */
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
                        const struct radix_tree_iter *iter, unsigned int tag)
{
        struct radix_tree_node *owner = iter->node;

        node_tag_clear(root, owner, tag, iter_offset(iter));
}

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:                radix tree root
 * @index:                index key
 * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        /* If the root does not carry the tag, the answer is 0 right away. */
        if (!root_tag_get(root, tag))
                return 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return 0;

        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *cur = entry_to_node(entry);
                unsigned pos = radix_tree_descend(cur, &entry, index);

                /* The tag must be set at every level of the path. */
                if (!tag_get(cur, tag, pos))
                        return 0;
                if (entry == RADIX_TREE_RETRY)
                        break;
        }

        return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/*
 * Construct iter->tags bit-mask from node->tags[tag] array, starting at
 * slot @offset.  Without a node the mask is simply 1.
 */
static void set_iter_tags(struct radix_tree_iter *iter,
                                struct radix_tree_node *node, unsigned offset,
                                unsigned tag)
{
        unsigned word, bit;

        if (!node) {
                iter->tags = 1;
                return;
        }

        word = offset / BITS_PER_LONG;
        bit = offset % BITS_PER_LONG;
        iter->tags = node->tags[tag][word] >> bit;

        /*
         * Nothing follows the last word.  This is always the case if
         * RADIX_TREE_TAG_LONGS == 1.
         */
        if (word >= RADIX_TREE_TAG_LONGS - 1)
                return;

        /* Pick tags from next element */
        if (bit)
                iter->tags |= node->tags[tag][word + 1] <<
                                        (BITS_PER_LONG - bit);

        /* Clip chunk size, here only BITS_PER_LONG tags */
        iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}

/*
 * Move the iterator one index past its current position and drop its
 * cached tag bits.  @slot is not consulted and NULL is always returned.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter)
{
        unsigned long resume_at = __radix_tree_iter_add(iter, 1);

        iter->index = resume_at;
        iter->next_index = resume_at;
        iter->tags = 0;

        return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if iteration is over
 *
 * On success @iter->index, @iter->next_index and @iter->node are updated
 * to describe the chunk; for tagged iteration @iter->tags is filled in too.
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
{
        /* The tag index is carried in the low bits of @flags. */
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node, *child;
        unsigned long index, offset, maxindex;

        /* Tagged iteration over a tree whose root lacks the tag finds nothing. */
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;

        /*
         * Catch next_index overflow after ~0UL. iter->index never overflows
         * during iterating; it can be zero only at the beginning.
         * And we cannot overflow iter->next_index in a single step,
         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
         *
         * This condition also used by radix_tree_next_slot() to stop
         * contiguous iterating, and forbid switching to the next chunk.
         */
        index = iter->next_index;
        if (!index && iter->index)
                return NULL;

 restart:
        /* Each attempt reloads the root and descends again towards @index. */
        radix_tree_load_root(root, &child, &maxindex);
        if (index > maxindex)
                return NULL;
        if (!child)
                return NULL;

        if (!radix_tree_is_internal_node(child)) {
                /* Single-slot tree */
                iter->index = index;
                iter->next_index = maxindex + 1;
                iter->tags = 1;
                iter->node = NULL;
                return (void __rcu **)&root->xa_head;
        }

        do {
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);

                /* A hole is an untagged slot (tagged mode) or an empty slot. */
                if ((flags & RADIX_TREE_ITER_TAGGED) ?
                                !tag_get(node, tag, offset) : !child) {
                        /* Hole detected */
                        if (flags & RADIX_TREE_ITER_CONTIG)
                                return NULL;

                        /* Scan forward within this node for the next candidate slot. */
                        if (flags & RADIX_TREE_ITER_TAGGED)
                                offset = radix_tree_find_next_bit(node, tag,
                                                offset + 1);
                        else
                                while (++offset        < RADIX_TREE_MAP_SIZE) {
                                        void *slot = rcu_dereference_raw(
                                                        node->slots[offset]);
                                        if (slot)
                                                break;
                                }
                        /* Rebuild the index from this node's base and the new offset. */
                        index &= ~node_maxindex(node);
                        index += offset << node->shift;
                        /* Overflow after ~0UL */
                        if (!index)
                                return NULL;
                        /* Nothing left in this node: retry from the root at the new index. */
                        if (offset == RADIX_TREE_MAP_SIZE)
                                goto restart;
                        child = rcu_dereference_raw(node->slots[offset]);
                }

                /* The chosen slot turned out to be empty: start over. */
                if (!child)
                        goto restart;
                if (child == RADIX_TREE_RETRY)
                        break;
        } while (node->shift && radix_tree_is_internal_node(child));

        /* Update the iterator state */
        iter->index = (index &~ node_maxindex(node)) | offset;
        iter->next_index = (index | node_maxindex(node)) + 1;
        iter->node = node;

        if (flags & RADIX_TREE_ITER_TAGGED)
                set_iter_tags(iter, node, offset, tag);

        return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 *        radix_tree_gang_lookup - perform multiple lookup on a radix tree
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *
 *        Performs an index-ascending scan of the tree for present items.  Places
 *        them at *@results and returns the number of items which were placed at
 *        *@results.
 *
 *        The implementation is naive.
 *
 *        Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 *        rcu_read_lock. In this case, rather than the returned results being
 *        an atomic snapshot of the tree at a single point in time, the
 *        semantics of an RCU protected gang lookup are as though multiple
 *        radix_tree_lookups have been issued in individual locks, and results
 *        stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_slot(slot, root, &iter, first_index) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; only kept if the entry passes the checks below. */
                results[found] = entry;
                if (!entry)
                        continue;
                if (radix_tree_is_internal_node(entry)) {
                        /* Not a user item: have the iterator retry. */
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                found++;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 *        radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 *                                     based on a tag
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *        @tag:                the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *        Performs an index-ascending scan of the tree for present items which
 *        have the tag indexed by @tag set.  Places the items at *@results and
 *        returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; only kept if the entry passes the checks below. */
                results[found] = entry;
                if (!entry)
                        continue;
                if (radix_tree_is_internal_node(entry)) {
                        /* Not a user item: have the iterator retry. */
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                found++;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 *        radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 *                                          radix tree based on a tag
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *        @tag:                the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *        Performs an index-ascending scan of the tree for present items which
 *        have the tag indexed by @tag set.  Places the slots at *@results and
 *        returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        /* Slots are recorded as-is; their contents are not inspected here. */
        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                results[found++] = slot;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

/*
 * Clear the entry held in @slot of @node and fix up the tags for it.
 *
 * For an IDR the slot is marked IDR_FREE; otherwise every tag is cleared
 * for the slot.  Returns the result of delete_node() for @node, or false
 * when @node is NULL.
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot)
{
        void *old = rcu_dereference_raw(*slot);
        unsigned offset = get_slot_offset(node, slot);
        int values = 0;
        int tag;

        /* A value entry is going away: pass -1 as the values delta. */
        if (xa_is_value(old))
                values = -1;

        if (!is_idr(root)) {
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        node_tag_clear(root, node, tag, offset);
        } else {
                node_tag_set(root, node, IDR_FREE, offset);
        }

        replace_slot(slot, NULL, node, -1, values);
        if (!node)
                return false;
        return delete_node(root, node);
}

/**
 * radix_tree_iter_delete - delete the entry at this iterator position
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 *
 * Removes the entry the iterator currently points at.  Deleting it may
 * free the current node; when that happens the iterator's index is moved
 * to its next_index so the freed memory is not referenced again.  No
 * locking is needed as long as no other thread can access this tree.
 */
void radix_tree_iter_delete(struct radix_tree_root *root,
                                struct radix_tree_iter *iter, void __rcu **slot)
{
        bool advance = __radix_tree_delete(root, iter->node, slot);

        if (advance)
                iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item, or %NULL to delete whatever is stored at @index
 *
 * Remove @item at @index from the radix tree rooted at @root.
 *
 * Return: the deleted entry, or %NULL if nothing was present at @index
 * or the entry found there was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node = NULL;
        void __rcu **slot = NULL;
        void *entry = __radix_tree_lookup(root, index, &node, &slot);

        if (slot == NULL)
                return NULL;

        if (entry == NULL) {
                /*
                 * A NULL entry is only deleted from an IDR, and only
                 * when its slot is not already tagged IDR_FREE.
                 */
                if (!is_idr(root))
                        return NULL;
                if (node_tag_get(root, node, IDR_FREE,
                                 get_slot_offset(node, slot)))
                        return NULL;
        }

        if (item != NULL && item != entry)
                return NULL;

        __radix_tree_delete(root, node, slot);
        return entry;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Removes whatever entry is stored at @index in the tree rooted at @root.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        /* No expected item: delete unconditionally. */
        void *deleted = radix_tree_delete_item(root, index, NULL);

        return deleted;
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 * radix_tree_tagged - test whether any items in the tree are tagged
 * @root: radix tree root
 * @tag:  tag to test
 *
 * Return: the value reported by root_tag_get() for @tag on @root.
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
        int tagged = root_tag_get(root, tag);

        return tagged;
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocates memory for the next idr_alloc() call.  On return preemption
 * is disabled; idr_preload_end() enables it again.
 */
void idr_preload(gfp_t gfp_mask)
{
        /*
         * NOTE(review): a non-zero return presumably means the preload
         * helper bailed out without holding the lock, so it is taken here
         * to keep the documented exit state - confirm against
         * __radix_tree_preload().
         */
        if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE) != 0)
                local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - locate a slot tagged IDR_FREE, adding nodes as needed
 * @root: tree to search
 * @iter: iterator; @iter->next_index is the first index considered, and on
 *        success its index, next_index, node and tags are filled in
 * @gfp:  allocation flags handed to radix_tree_extend() and
 *        radix_tree_node_alloc()
 * @max:  highest index that may be returned
 *
 * Return: the slot pointer on success; ERR_PTR(-ENOSPC) when the candidate
 * index exceeds @max (or becomes 0 after advancing); ERR_PTR(-ENOMEM) when
 * a node cannot be allocated; or ERR_PTR() of the negative value returned
 * by radix_tree_extend().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max)
{
        struct radix_tree_node *node = NULL, *child;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex, start = iter->next_index;
        unsigned int shift, offset = 0;

        /* (Re)load the root; also the restart point after climbing past the top. */
 grow:
        shift = radix_tree_load_root(root, &child, &maxindex);
        /* Root not tagged IDR_FREE: skip everything the tree currently covers. */
        if (!radix_tree_tagged(root, IDR_FREE))
                start = max(start, maxindex + 1);
        if (start > max)
                return ERR_PTR(-ENOSPC);

        /* Tree does not reach @start yet: extend it (non-negative result is the new shift). */
        if (start > maxindex) {
                int error = radix_tree_extend(root, gfp, start, shift);
                if (error < 0)
                        return ERR_PTR(error);
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }
        /*
         * NOTE(review): forces one pass through the loop below for index 0
         * in a shift-0 tree, presumably so a node is set up instead of
         * using the root slot directly - confirm.
         */
        if (start == 0 && shift == 0)
                shift = RADIX_TREE_MAP_SHIFT;

        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return ERR_PTR(-ENOMEM);
                        /* A freshly added node has every slot marked free. */
                        all_tag_set(child, IDR_FREE);
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        break;

                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, start);
                if (!tag_get(node, IDR_FREE, offset)) {
                        /* Slot not free: jump to the next IDR_FREE bit in this node. */
                        offset = radix_tree_find_next_bit(node, IDR_FREE,
                                                        offset + 1);
                        start = next_index(start, node, offset);
                        /* start == 0 presumably signals index wrap-around (TODO confirm). */
                        if (start > max || start == 0)
                                return ERR_PTR(-ENOSPC);
                        /*
                         * No free bit left in this node: climb to the parent
                         * and continue after this node's slot; with no parent
                         * left, start over from the root.
                         */
                        while (offset == RADIX_TREE_MAP_SIZE) {
                                offset = node->offset + 1;
                                node = node->parent;
                                if (!node)
                                        goto grow;
                                shift = node->shift;
                        }
                        child = rcu_dereference_raw(node->slots[offset]);
                }
                slot = &node->slots[offset];
        }

        /* Publish the position that was found through the iterator. */
        iter->index = start;
        if (node)
                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
        else
                iter->next_index = 1;
        iter->node = node;
        set_iter_tags(iter, node, offset, IDR_FREE);

        return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
 */
void idr_destroy(struct idr *idr)
{
        struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
        if (radix_tree_is_internal_node(node))
                radix_tree_free_nodes(node);
        idr->idr_rt.xa_head = NULL;
        root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);

/* Slab constructor: zero the node and initialise its private list head. */
static void radix_tree_node_ctor(void *arg)
{
        struct radix_tree_node *new_node = (struct radix_tree_node *)arg;

        memset(arg, 0, sizeof(struct radix_tree_node));
        INIT_LIST_HEAD(&new_node->private_list);
}

/*
 * CPU hotplug callback (registered for CPUHP_RADIX_DEAD): return every
 * node in @cpu's preload pool to the slab cache.  Always returns 0.
 */
static int radix_tree_cpu_dead(unsigned int cpu)
{
        struct radix_tree_preload *rtp = &per_cpu(radix_tree_preloads, cpu);

        /* The pool is a list chained through ->parent; pop until empty. */
        for (; rtp->nr; rtp->nr--) {
                struct radix_tree_node *node = rtp->nodes;

                rtp->nodes = node->parent;
                kmem_cache_free(radix_tree_node_cachep, node);
        }
        return 0;
}

/*
 * Boot-time setup: create the slab cache for radix tree nodes and register
 * the CPU-dead callback that drains per-cpu preload pools.
 */
void __init radix_tree_init(void)
{
        int ret;

        /* Compile-time checks on constants this file relies on. */
        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);

        radix_tree_node_cachep =
                kmem_cache_create("radix_tree_node",
                                  sizeof(struct radix_tree_node),
                                  0,
                                  SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
                                  radix_tree_node_ctor);

        ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
                                        NULL, radix_tree_cpu_dead);
        WARN_ON(ret < 0);
}







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MIN_HEAP_H
#define _LINUX_MIN_HEAP_H

#include <linux/bug.h>
#include <linux/string.h>
#include <linux/types.h>

/**
 * struct min_heap - Data structure to hold a min-heap.
 * @data: Start of array holding the heap elements.
 * @nr: Number of elements currently in the heap.
 * @size: Maximum number of elements that can be held in current storage.
 *
 * Element @i lives at @data + i * elem_size, where elem_size comes from the
 * struct min_heap_callbacks passed to each operation; the children of
 * element @i are at indices 2 * i + 1 and 2 * i + 2.
 */
struct min_heap {
        void *data;
        int nr;
        int size;
};

/**
 * struct min_heap_callbacks - Data/functions to customise the min_heap.
 * @elem_size: The size of each element in bytes.
 * @less: Partial order function for this heap; returns true when @lhs
 *        orders before @rhs.
 * @swp: Swap elements function; exchanges the two elements in place.
 */
struct min_heap_callbacks {
        int elem_size;
        bool (*less)(const void *lhs, const void *rhs);
        void (*swp)(void *lhs, void *rhs);
};

/*
 * Sift the element at pos down the heap: while it has a child that orders
 * before it (per func->less), swap it with the smallest of the two.
 */
static __always_inline
void min_heapify(struct min_heap *heap, int pos,
                const struct min_heap_callbacks *func)
{
        void *base = heap->data;

        for (;;) {
                int lchild = 2 * pos + 1;
                int rchild;
                void *cur, *lptr, *best;

                /* No left child means pos is a leaf. */
                if (lchild >= heap->nr)
                        return;

                cur = base + (pos * func->elem_size);
                lptr = base + (lchild * func->elem_size);
                best = func->less(lptr, cur) ? lptr : cur;

                rchild = lchild + 1;
                if (rchild < heap->nr) {
                        void *rptr = base + (rchild * func->elem_size);

                        if (func->less(rptr, best))
                                best = rptr;
                }

                /* Already no larger than both children: done. */
                if (best == cur)
                        return;

                func->swp(best, cur);
                pos = (best == lptr) ? lchild : rchild;
        }
}

/*
 * Floyd's approach to heapification that is O(nr).
 *
 * Sift down every node that has at least one child, from the last such
 * node back to the root.  Indices nr / 2 and above are always leaves
 * (2 * (nr / 2) + 1 >= nr), so the scan starts at nr / 2 - 1; an empty or
 * single-element heap needs no sifting at all.
 */
static __always_inline
void min_heapify_all(struct min_heap *heap,
                const struct min_heap_callbacks *func)
{
        int i;

        for (i = heap->nr / 2 - 1; i >= 0; i--)
                min_heapify(heap, i, func);
}

/*
 * Remove minimum element from the heap, O(log2(nr)).
 *
 * If the heap is already empty a warning is raised via WARN_ONCE() and
 * the heap is left untouched.
 */
static __always_inline
void min_heap_pop(struct min_heap *heap,
                const struct min_heap_callbacks *func)
{
        void *data = heap->data;

        if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
                return;

        heap->nr--;

        /*
         * The only element was removed: there is nothing to move or to
         * re-order.  Returning here also avoids a memcpy() whose source
         * and destination would be the same object.
         */
        if (heap->nr == 0)
                return;

        /* Place last element at the root (position 0) and then sift down. */
        memcpy(data, data + (heap->nr * func->elem_size), func->elem_size);
        min_heapify(heap, 0, func);
}

/*
 * Replace the minimum element with @element in one step.  Only a single
 * sift-down (O(log2(nr))) is performed, which makes this cheaper than a
 * pop followed by a push, which would sift twice.
 */
static __always_inline
void min_heap_pop_push(struct min_heap *heap,
                const void *element,
                const struct min_heap_callbacks *func)
{
        void *root = heap->data;

        /* Overwrite the root, then restore heap order from the top. */
        memcpy(root, element, func->elem_size);
        min_heapify(heap, 0, func);
}

/*
 * Push an element on to the heap, O(log2(nr)).
 *
 * If the heap already holds heap->size elements, WARN_ONCE() is raised
 * and nothing is stored.
 */
static __always_inline
void min_heap_push(struct min_heap *heap, const void *element,
                const struct min_heap_callbacks *func)
{
        void *base = heap->data;
        int idx;

        if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
                return;

        /* Append the new element after the current last one. */
        idx = heap->nr;
        memcpy(base + (idx * func->elem_size), element, func->elem_size);
        heap->nr++;

        /* Bubble it up until its parent orders before it. */
        while (idx > 0) {
                int up = (idx - 1) / 2;
                void *child = base + (idx * func->elem_size);
                void *parent = base + up * func->elem_size;

                if (func->less(parent, child))
                        break;
                func->swp(parent, child);
                idx = up;
        }
}

#endif /* _LINUX_MIN_HEAP_H */


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _INPUT_MT_H
#define _INPUT_MT_H

/*
 * Input Multitouch Library
 *
 * Copyright (c) 2010 Henrik Rydberg
 */

#include <linux/input.h>

/* Mask applied to the tracking-ID counter by input_mt_new_trkid(). */
#define TRKID_MAX        0xffff

#define INPUT_MT_POINTER        0x0001        /* pointer device, e.g. trackpad */
#define INPUT_MT_DIRECT                0x0002        /* direct device, e.g. touchscreen */
#define INPUT_MT_DROP_UNUSED        0x0004        /* drop contacts not seen in frame */
#define INPUT_MT_TRACK                0x0008        /* use in-kernel tracking */
#define INPUT_MT_SEMI_MT        0x0010        /* semi-mt device, finger count handled manually */

/**
 * struct input_mt_slot - represents the state of an input MT slot
 * @abs: holds current values of ABS_MT axes for this slot, indexed by
 *       (axis code - ABS_MT_FIRST)
 * @frame: last frame at which input_mt_report_slot_state() was called
 * @key: optional driver designation of this slot
 */
struct input_mt_slot {
        int abs[ABS_MT_LAST - ABS_MT_FIRST + 1];
        unsigned int frame;
        unsigned int key;
};

/**
 * struct input_mt - state of tracked contacts
 * @trkid: stores MT tracking ID for the next contact
 * @num_slots: number of MT slots the device uses
 * @slot: MT slot currently being transmitted
 * @flags: input_mt operation flags (INPUT_MT_* bits)
 * @frame: increases every time input_mt_sync_frame() is called
 * @red: reduced cost matrix for in-kernel tracking
 * @slots: array of slots holding current values of tracked contacts
 *         (flexible array member, so it must stay the last field)
 */
struct input_mt {
        int trkid;
        int num_slots;
        int slot;
        unsigned int flags;
        unsigned int frame;
        int *red;
        struct input_mt_slot slots[];
};

/* Store @value for ABS_MT axis @code in @slot; @code is not range-checked. */
static inline void input_mt_set_value(struct input_mt_slot *slot,
                                      unsigned code, int value)
{
        unsigned idx = code - ABS_MT_FIRST;

        slot->abs[idx] = value;
}

/* Read the stored value of ABS_MT axis @code from @slot; @code is not range-checked. */
static inline int input_mt_get_value(const struct input_mt_slot *slot,
                                     unsigned code)
{
        unsigned idx = code - ABS_MT_FIRST;

        return slot->abs[idx];
}

/* A slot counts as active while its tracking ID is non-negative. */
static inline bool input_mt_is_active(const struct input_mt_slot *slot)
{
        int tracking_id = input_mt_get_value(slot, ABS_MT_TRACKING_ID);

        return tracking_id >= 0;
}

/* True when @slot's frame stamp equals @mt's current frame counter. */
static inline bool input_mt_is_used(const struct input_mt *mt,
                                    const struct input_mt_slot *slot)
{
        return mt->frame == slot->frame;
}

int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots,
                        unsigned int flags);
void input_mt_destroy_slots(struct input_dev *dev);

/*
 * Hand out the next tracking ID: returns the current counter masked with
 * TRKID_MAX and advances the counter by one.
 */
static inline int input_mt_new_trkid(struct input_mt *mt)
{
        int id = mt->trkid;

        mt->trkid = id + 1;
        return id & TRKID_MAX;
}

/* Emit an EV_ABS/ABS_MT_SLOT event on @dev carrying @slot as its value. */
static inline void input_mt_slot(struct input_dev *dev, int slot)
{
        input_event(dev, EV_ABS, ABS_MT_SLOT, slot);
}

/* True when @axis lies in the inclusive range ABS_MT_FIRST..ABS_MT_LAST. */
static inline bool input_is_mt_value(int axis)
{
        if (axis < ABS_MT_FIRST)
                return false;
        return axis <= ABS_MT_LAST;
}

/* True for ABS_MT_SLOT itself and for every axis input_is_mt_value() accepts. */
static inline bool input_is_mt_axis(int axis)
{
        if (axis == ABS_MT_SLOT)
                return true;
        return input_is_mt_value(axis);
}

bool input_mt_report_slot_state(struct input_dev *dev,
                                unsigned int tool_type, bool active);

/*
 * Shorthand for input_mt_report_slot_state() with tool type 0 and
 * active == false; the return value of that call is discarded.
 */
static inline void input_mt_report_slot_inactive(struct input_dev *dev)
{
        input_mt_report_slot_state(dev, 0, false);
}

void input_mt_report_finger_count(struct input_dev *dev, int count);
void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count);
void input_mt_drop_unused(struct input_dev *dev);

void input_mt_sync_frame(struct input_dev *dev);

/**
 * struct input_mt_pos - contact position
 * @x: horizontal coordinate
 * @y: vertical coordinate
 *
 * Both coordinates are signed 16-bit values.
 */
struct input_mt_pos {
        s16 x, y;
};

int input_mt_assign_slots(struct input_dev *dev, int *slots,
                          const struct input_mt_pos *pos, int num_pos,
                          int dmax);

int input_mt_get_slot_by_key(struct input_dev *dev, int key);

#endif








































































































































































































































































    1 









































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h        Protocol independent destination cache definitions.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>

struct sk_buff;

/*
 * Protocol independent destination cache entry.
 *
 * Field order matters here: dst_hold() contains a BUILD_BUG_ON() that
 * requires __refcnt to sit at an offset that is a multiple of 64, which is
 * why __refcnt is declared in a different place for 64-bit and 32-bit
 * builds.
 */
struct dst_entry {
        struct net_device       *dev;
        struct  dst_ops                *ops;
        /* Metrics pointer with DST_METRICS_* flags in its two low bits. */
        unsigned long                _metrics;
        unsigned long           expires;
#ifdef CONFIG_XFRM
        struct xfrm_state        *xfrm;
#else
        void                        *__pad1;
#endif
        int                        (*input)(struct sk_buff *);
        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

        unsigned short                flags;
#define DST_NOXFRM                0x0002
#define DST_NOPOLICY                0x0004
#define DST_NOCOUNT                0x0008
#define DST_FAKE_RTABLE                0x0010
#define DST_XFRM_TUNNEL                0x0020
#define DST_XFRM_QUEUE                0x0040
#define DST_METADATA                0x0080

        /* A non-zero value of dst->obsolete forces by-hand validation
         * of the route entry.  Positive values are set by the generic
         * dst layer to indicate that the entry has been forcefully
         * destroyed.
         *
         * Negative values are used by the implementation layer code to
         * force invocation of the dst_ops->check() method.
         */
        short                        obsolete;
#define DST_OBSOLETE_NONE        0
#define DST_OBSOLETE_DEAD        2
#define DST_OBSOLETE_FORCE_CHK        -1
#define DST_OBSOLETE_KILL        -2
        unsigned short                header_len;        /* more space at head required */
        unsigned short                trailer_len;        /* space to reserve at tail */

        /*
         * __refcnt wants to be on a different cache line from
         * input/output/ops or performance tanks badly
         */
#ifdef CONFIG_64BIT
        atomic_t                __refcnt;        /* 64-bit offset 64 */
#endif
        int                        __use;
        unsigned long                lastuse;
        struct lwtunnel_state   *lwtstate;
        struct rcu_head                rcu_head;
        short                        error;
        short                        __pad;
        __u32                        tclassid;
#ifndef CONFIG_64BIT
        atomic_t                __refcnt;        /* 32-bit offset 64 */
#endif
};

/*
 * Metrics table referenced through dst_entry::_metrics.  Metric number m is
 * stored at metrics[m - 1].  The 4-byte alignment keeps the two low bits of
 * the table's address clear so they can carry the DST_METRICS_* flags.
 */
struct dst_metrics {
        u32                metrics[RTAX_MAX];
        refcount_t        refcnt;
} __aligned(4);                /* Low pointer bits contain DST_METRICS_FLAGS */
extern const struct dst_metrics dst_default_metrics;

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/* Flag bits kept in the two low bits of dst_entry::_metrics. */
#define DST_METRICS_READ_ONLY                0x1UL
#define DST_METRICS_REFCOUNTED                0x2UL
/* Mask covering both flag bits above. */
#define DST_METRICS_FLAGS                0x3UL
/* Strip the flag bits from a raw _metrics value to get the u32 array. */
#define __DST_METRICS_PTR(Y)        \
        ((u32 *)((Y) & ~DST_METRICS_FLAGS))
/* Metrics array of dst entry X, with the flag bits masked off. */
#define DST_METRICS_PTR(X)        __DST_METRICS_PTR((X)->_metrics)

/* True when the DST_METRICS_READ_ONLY flag is set in @dst's metrics word. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
        return (dst->_metrics & DST_METRICS_READ_ONLY) != 0;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Hand @dst's metrics to __dst_destroy_metrics_generic(), unless they are
 * flagged read-only, in which case nothing is done.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
        unsigned long metrics = dst->_metrics;

        if (metrics & DST_METRICS_READ_ONLY)
                return;
        __dst_destroy_metrics_generic(dst, metrics);
}

/*
 * Return a metrics array for @dst that may be written to.  Read-only
 * metrics go through the ops->cow_metrics() callback, whose result is
 * returned as is; otherwise the current array is returned.  BUGs if the
 * metrics word is zero.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
        unsigned long raw = dst->_metrics;

        BUG_ON(!raw);

        return (raw & DST_METRICS_READ_ONLY) ?
                dst->ops->cow_metrics(dst, raw) : __DST_METRICS_PTR(raw);
}

/* Point @dst at @src_metrics, optionally flagging them read-only.
 * Only legal while the entry has not yet become globally visible.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
                                    const u32 *src_metrics,
                                    bool read_only)
{
        unsigned long val = (unsigned long)src_metrics;

        if (read_only)
                val |= DST_METRICS_READ_ONLY;
        dst->_metrics = val;
}

/*
 * Copy all RTAX_MAX metrics from @src into @dest's writable metrics array.
 * Nothing is copied when no writable array could be obtained.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
        u32 *to = dst_metrics_write_ptr(dest);

        if (!to)
                return;
        memcpy(to, DST_METRICS_PTR(src), RTAX_MAX * sizeof(u32));
}

/* Return @dst's metrics array with the flag bits masked off the pointer. */
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
        u32 *metrics = DST_METRICS_PTR(dst);

        return metrics;
}

static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
        u32 *p = DST_METRICS_PTR(dst);

        return p[metric-1];
}

/*
 * Fetch metric number @metric.  Warns once if asked for RTAX_HOPLIMIT,
 * RTAX_ADVMSS or RTAX_MTU, but still returns the raw stored value.
 */
static inline u32
dst_metric(const struct dst_entry *dst, const int metric)
{
        bool special = metric == RTAX_HOPLIMIT ||
                       metric == RTAX_ADVMSS ||
                       metric == RTAX_MTU;

        WARN_ON_ONCE(special);
        return dst_metric_raw(dst, metric);
}

/*
 * Return the stored RTAX_ADVMSS metric, or the value of the
 * ops->default_advmss() callback when the stored metric is zero.
 */
static inline u32
dst_metric_advmss(const struct dst_entry *dst)
{
        u32 stored = dst_metric_raw(dst, RTAX_ADVMSS);

        return stored ? stored : dst->ops->default_advmss(dst);
}

/*
 * Store @val as metric number @metric in @dst's writable metrics array.
 * The store is skipped when no writable array could be obtained.
 */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
        u32 *metrics = dst_metrics_write_ptr(dst);

        if (!metrics)
                return;
        metrics[metric - 1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA        (1U << 31)

/* All kernel-internal feature bits (currently only DST_FEATURE_ECN_CA). */
#define DST_FEATURE_MASK        (DST_FEATURE_ECN_CA)
/* The internal ECN_CA bit combined with RTAX_FEATURE_ECN. */
#define DST_FEATURE_ECN_MASK        (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* Return the bits of @feature that are set in @dst's RTAX_FEATURES metric. */
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
        u32 features = dst_metric(dst, RTAX_FEATURES);

        return features & feature;
}

/* Return the MTU reported by the ops->mtu() callback for @dst. */
static inline u32 dst_mtu(const struct dst_entry *dst)
{
        u32 mtu = dst->ops->mtu(dst);

        return mtu;
}

/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
        return msecs_to_jiffies(dst_metric(dst, metric));
}

/* Non-zero when RTAX_FEATURE_ALLFRAG is set in @dst's feature metric. */
static inline u32
dst_allfrag(const struct dst_entry *dst)
{
        return dst_feature(dst, RTAX_FEATURE_ALLFRAG);
}

static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
        return dst_metric(dst, RTAX_LOCK) & (1 << metric);
}

/* Take a reference on @dst; warns if the refcount was already zero. */
static inline void dst_hold(struct dst_entry *dst)
{
        /*
         * A build failure here means __refcnt has moved: it must sit at
         * an offset in struct dst_entry that is a multiple of 64.
         */
        BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
        WARN_ON(!atomic_inc_not_zero(&dst->__refcnt));
}

/*
 * Record a use of @dst at @time without taking a reference.  The use
 * counter and timestamp are only touched when @time differs from the
 * stored lastuse value.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
        if (likely(time == dst->lastuse))
                return;

        dst->__use++;
        dst->lastuse = time;
}

/* Take a reference on @dst if it is non-NULL and hand the pointer back. */
static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
        if (!dst)
                return NULL;

        dst_hold(dst);
        return dst;
}

void dst_release(struct dst_entry *dst);

void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst, unless the SKB_DST_NOREF bit says no
 * reference is held.
 */
static inline void refdst_drop(unsigned long refdst)
{
        struct dst_entry *dst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
        dst_release(dst);
}

/**
 * skb_dst_drop - drops skb dst
 * @skb: buffer
 *
 * Detaches the dst from @skb, dropping the reference count if a reference
 * had been taken.  Does nothing when no dst is attached.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
        if (!skb->_skb_refdst)
                return;

        refdst_drop(skb->_skb_refdst);
        skb->_skb_refdst = 0UL;
}

/*
 * Install @refdst in @nskb and take a reference on the dst, unless the
 * SKB_DST_NOREF bit is set in @refdst.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
        nskb->_skb_refdst = refdst;
        if (refdst & SKB_DST_NOREF)
                return;

        dst_clone(skb_dst(nskb));
}

/* Give @nskb the same dst as @oskb by passing its _skb_refdst to __skb_dst_copy(). */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
        __skb_dst_copy(nskb, oskb->_skb_refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * The reference is only taken when the refcount is not already zero.
 *
 * Return: true if a reference was taken, false if it could not be taken
 * safely.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
        bool got_ref = atomic_inc_not_zero(&dst->__refcnt);

        return got_ref;
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If the dst is attached without a reference and has not been destroyed,
 * take a reference on it.  If no reference can be taken the dst is
 * detached from @skb.
 *
 * Return: true if @skb ends up with a refcounted dst.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
        struct dst_entry *dst;

        /* Already refcounted (or no dst at all): just report the state. */
        if (!skb_dst_is_noref(skb))
                return skb->_skb_refdst != 0UL;

        dst = skb_dst(skb);
        WARN_ON(!rcu_read_lock_held());
        if (!dst_hold_safe(dst))
                dst = NULL;

        /* Storing the bare pointer also clears the noref marker. */
        skb->_skb_refdst = (unsigned long)dst;

        return dst != NULL;
}


/**
 * __skb_tunnel_rx - prepare skb for rx reinsert
 * @skb: buffer
 * @dev: tunnel device
 * @net: netns for packet i/o
 *
 * After decapsulation the packet is going to re-enter (netif_rx()) our
 * stack, so make some cleanups.  No accounting is done here.
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                   struct net *net)
{
        /* Scrub more thoroughly when the packet crosses network namespaces. */
        bool xnet = !net_eq(net, dev_net(dev));

        skb->dev = dev;

        /*
         * Clear the hash so that it can be recalculated for the
         * encapsulated packet, unless the hash has already been
         * determined over the L4 4-tuple.
         */
        skb_clear_hash_if_not_l4(skb);
        skb_set_queue_mapping(skb, 0);
        skb_scrub_packet(skb, xnet);
}

/**
 *        skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups, and perform accounting: @dev's rx_packets is
 *        bumped by one and rx_bytes by @skb->len before __skb_tunnel_rx() runs.
 *        Note: this accounting is not SMP safe.
 *        NOTE(review): the SMP remark may predate the DEV_STATS_*() helpers
 *        used below - confirm against their definition.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                 struct net *net)
{
        DEV_STATS_INC(dev, rx_packets);
        DEV_STATS_ADD(dev, rx_bytes, skb->len);
        __skb_tunnel_rx(skb, dev, net);
}

/*
 * Return the tclassid of the dst attached to @skb.  Yields 0 when no dst is
 * attached or when CONFIG_IP_ROUTE_CLASSID is not enabled.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        const struct dst_entry *dst = skb_dst(skb);

        if (dst)
                return dst->tclassid;
#endif
        return 0;
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
/*
 * Single-argument form of dst_discard_out(): calls it with &init_net and
 * @skb's own socket, and returns its result.
 */
static inline int dst_discard(struct sk_buff *skb)
{
        return dst_discard_out(&init_net, skb->sk, skb);
}
void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
                int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_ref, int initial_obsolete,
              unsigned short flags);
struct dst_entry *dst_destroy(struct dst_entry *dst);
void dst_dev_put(struct dst_entry *dst);

/* Intentionally empty: @dst is accepted and nothing is done with it. */
static inline void dst_confirm(struct dst_entry *dst)
{
}

/*
 * Look up the neighbour entry for @daddr through @dst's neigh_lookup
 * operation (called with a NULL skb).
 *
 * Returns the neighbour, or NULL when the operation returned an error
 * pointer. Unlike dst_neigh_lookup_skb(), the neigh_lookup pointer is
 * not tested before the call, so @dst must provide it.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        struct neighbour *neigh;

        neigh = dst->ops->neigh_lookup(dst, NULL, daddr);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/*
 * Look up the neighbour entry for @skb through @dst's neigh_lookup
 * operation (called with a NULL daddr).
 *
 * Returns NULL when @dst has no neigh_lookup operation or when the
 * operation returned an error pointer.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
                                                     struct sk_buff *skb)
{
        struct neighbour *neigh;

        /* Packets from tunnel devices (e.g. bareudp) may carry only
         * metadata in the skb's dst pointer, so neigh_lookup has to be
         * checked before it is called.
         */
        if (!dst->ops->neigh_lookup)
                return NULL;

        neigh = dst->ops->neigh_lookup(dst, skb, NULL);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/* Invoke @dst's confirm_neigh operation for @daddr, if one is provided. */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
                                     const void *daddr)
{
        if (!dst->ops->confirm_neigh)
                return;

        dst->ops->confirm_neigh(dst, daddr);
}

/*
 * Report a link failure for @skb through the link_failure operation of
 * its dst. Does nothing when the skb has no dst, the dst has no ops, or
 * the ops do not implement link_failure.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops || !dst->ops->link_failure)
                return;

        dst->ops->link_failure(skb);
}

/*
 * Set @dst to expire @timeout jiffies from now, but only if that is
 * sooner than the expiry already recorded (or if none is recorded).
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
        unsigned long when = jiffies + timeout;

        /* dst->expires == 0 is treated below as "not set"; never store it. */
        if (!when)
                when = 1;

        /* Keep an existing expiry that is not later than the new one. */
        if (dst->expires != 0 && !time_before(when, dst->expires))
                return;

        dst->expires = when;
}

/*
 * Link-layer overhead for @skb: LL_RESERVED_SPACE() of @dst's device when
 * a dst is given (the expected case), otherwise the skb's own mac_len.
 */
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
                                            struct sk_buff *skb)
{
        return likely(dst) ? LL_RESERVED_SPACE(dst->dev) : skb->mac_len;
}

INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
                                         struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
                                         struct sk_buff *));
/* Output packet to network from transport.
 *
 * Calls the ->output handler of the dst attached to @skb with
 * (@net, @sk, @skb) and returns its result. ip6_output and ip_output are
 * passed to INDIRECT_CALL_INET as the candidate targets. skb_dst(skb) is
 * dereferenced unconditionally, so @skb must have a dst attached.
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->output,
                                  ip6_output, ip_output,
                                  net, sk, skb);
}

INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport.
 *
 * Calls the ->input handler of the dst attached to @skb and returns its
 * result. ip6_input and ip_local_deliver are passed to INDIRECT_CALL_INET
 * as the candidate targets. skb_dst(skb) is dereferenced unconditionally,
 * so @skb must have a dst attached.
 */
static inline int dst_input(struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->input,
                                  ip6_input, ip_local_deliver, skb);
}

/*
 * Validate @dst against @cookie.
 *
 * A dst that is not marked obsolete is returned unchanged; otherwise the
 * result of its ->check operation is returned.
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!dst->obsolete)
                return dst;

        return dst->ops->check(dst, cookie);
}

/* Bit flags accepted by the xfrm_lookup*() 'flags' argument. */
enum {
        XFRM_LOOKUP_ICMP         = 0x1,
        XFRM_LOOKUP_QUEUE        = 0x2,
        XFRM_LOOKUP_KEEP_DST_REF = 0x4,
};

struct flowi;
#ifndef CONFIG_XFRM
/*
 * Without CONFIG_XFRM the lookup helpers below are pass-through stubs:
 * each returns @dst_orig unchanged and ignores every other argument.
 */
static inline struct dst_entry *xfrm_lookup(struct net *net,
                                            struct dst_entry *dst_orig,
                                            const struct flowi *fl,
                                            const struct sock *sk,
                                            int flags)
{
        return dst_orig;
}

static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
                      const struct flowi *fl, const struct sock *sk,
                      int flags, u32 if_id)
{
        return dst_orig;
}

static inline struct dst_entry *xfrm_lookup_route(struct net *net,
                                                  struct dst_entry *dst_orig,
                                                  const struct flowi *fl,
                                                  const struct sock *sk,
                                                  int flags)
{
        return dst_orig;
}

/* Without CONFIG_XFRM no dst has an xfrm state: always NULL. */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return NULL;
}

#else
/* With CONFIG_XFRM the lookups are real functions implemented elsewhere. */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk, int flags,
                                        u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl, const struct sock *sk,
                                    int flags);

/* skb attached with this dst needs transformation if dst->xfrm is valid */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return dst->xfrm;
}
#endif

/*
 * Pass @mtu to the update_pmtu operation of the dst attached to @skb,
 * with the final (confirm) argument set to true. Does nothing when the
 * skb has no dst or the dst does not implement update_pmtu.
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/*
 * Update the dst pmtu without doing neighbour confirmation: same as
 * skb_dst_update_pmtu() except that the final argument handed to the
 * update_pmtu operation is false.
 */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */














































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM xdp

#if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_XDP_H

#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/tracepoint.h>
#include <linux/bpf.h>

/*
 * X-macro list of the XDP actions known to this header.
 *
 * __XDP_ACT_MAP(FN) applies FN to every action name. It is used twice:
 * with __XDP_ACT_TP_FN to emit one TRACE_DEFINE_ENUM(XDP_<name>) per
 * action, and with __XDP_ACT_SYM_FN to build __XDP_ACT_SYM_TAB, the
 * { value, "name" } list terminated by { -1, NULL } that the events
 * below hand to __print_symbolic().
 */
#define __XDP_ACT_MAP(FN)        \
        FN(ABORTED)                \
        FN(DROP)                \
        FN(PASS)                \
        FN(TX)                        \
        FN(REDIRECT)

#define __XDP_ACT_TP_FN(x)        \
        TRACE_DEFINE_ENUM(XDP_##x);
#define __XDP_ACT_SYM_FN(x)        \
        { XDP_##x, #x },
#define __XDP_ACT_SYM_TAB        \
        __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL }
__XDP_ACT_MAP(__XDP_ACT_TP_FN)

/*
 * xdp_exception: records the id of the program (xdp->aux->id), the action
 * value passed by the caller and the ifindex of @dev. The action is
 * printed symbolically via __XDP_ACT_SYM_TAB.
 */
TRACE_EVENT(xdp_exception,

        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp, u32 act),

        TP_ARGS(dev, xdp, act),

        TP_STRUCT__entry(
                __field(int, prog_id)
                __field(u32, act)
                __field(int, ifindex)
        ),

        TP_fast_assign(
                __entry->prog_id        = xdp->aux->id;
                __entry->act                = act;
                __entry->ifindex        = dev->ifindex;
        ),

        TP_printk("prog_id=%d action=%s ifindex=%d",
                  __entry->prog_id,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->ifindex)
);

/*
 * xdp_bulk_tx: records the ifindex of @dev together with the caller's
 * sent/drops/err counters. The action field is not an argument; it is
 * always stored as XDP_TX.
 */
TRACE_EVENT(xdp_bulk_tx,

        TP_PROTO(const struct net_device *dev,
                 int sent, int drops, int err),

        TP_ARGS(dev, sent, drops, err),

        TP_STRUCT__entry(
                __field(int, ifindex)
                __field(u32, act)
                __field(int, drops)
                __field(int, sent)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->ifindex        = dev->ifindex;
                __entry->act                = XDP_TX;
                __entry->drops                = drops;
                __entry->sent                = sent;
                __entry->err                = err;
        ),

        TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d",
                  __entry->ifindex,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->sent, __entry->drops, __entry->err)
);

/*
 * Local view of a devmap entry used by devmap_ifindex(): only the leading
 * 'dev' pointer is declared. The __DEVMAP_OBJ_TYPE guard keeps the struct
 * from being defined twice.
 * NOTE(review): presumably this must match the start of the real devmap
 * object defined elsewhere - confirm against the devmap implementation.
 */
#ifndef __DEVMAP_OBJ_TYPE
#define __DEVMAP_OBJ_TYPE
struct _bpf_dtab_netdev {
        struct net_device *dev;
};
#endif /* __DEVMAP_OBJ_TYPE */

/*
 * devmap_ifindex - ifindex of the device behind a devmap redirect target
 * @tgt: redirect target; read as a struct _bpf_dtab_netdev * only when
 *       @map is one of the devmap types
 * @map: map the target came from (dereferenced, so must not be NULL)
 *
 * Evaluates to tgt->dev->ifindex for BPF_MAP_TYPE_DEVMAP and
 * BPF_MAP_TYPE_DEVMAP_HASH maps, and to 0 for every other map type.
 *
 * Both arguments are parenthesised so that arbitrary expressions may be
 * passed. @map can still be evaluated twice, so it must be free of side
 * effects.
 */
#define devmap_ifindex(tgt, map)                                \
        ((((map)->map_type == BPF_MAP_TYPE_DEVMAP ||                \
           (map)->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ?        \
          ((struct _bpf_dtab_netdev *)(tgt))->dev->ifindex : 0)

/*
 * Event class shared by the xdp_redirect* events.
 *
 * Records the program id, the source ifindex, @err and the redirect
 * destination. The action field is always stored as XDP_REDIRECT.
 * When @map is NULL, @index is stored as to_ifindex and map_id/map_index
 * are 0. Otherwise to_ifindex comes from devmap_ifindex(tgt, map), map_id
 * is map->id and map_index is @index.
 */
DECLARE_EVENT_CLASS(xdp_redirect_template,

        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
                 const struct bpf_map *map, u32 index),

        TP_ARGS(dev, xdp, tgt, err, map, index),

        TP_STRUCT__entry(
                __field(int, prog_id)
                __field(u32, act)
                __field(int, ifindex)
                __field(int, err)
                __field(int, to_ifindex)
                __field(u32, map_id)
                __field(int, map_index)
        ),

        TP_fast_assign(
                __entry->prog_id        = xdp->aux->id;
                __entry->act                = XDP_REDIRECT;
                __entry->ifindex        = dev->ifindex;
                __entry->err                = err;
                __entry->to_ifindex        = map ? devmap_ifindex(tgt, map) :
                                                index;
                __entry->map_id                = map ? map->id : 0;
                __entry->map_index        = map ? index : 0;
        ),

        TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
                  " map_id=%d map_index=%d",
                  __entry->prog_id,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->ifindex, __entry->to_ifindex,
                  __entry->err, __entry->map_id, __entry->map_index)
);

/* xdp_redirect: instance of xdp_redirect_template with the same arguments. */
DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
                 const struct bpf_map *map, u32 index),
        TP_ARGS(dev, xdp, tgt, err, map, index)
);

/* xdp_redirect_err: instance of xdp_redirect_template with the same arguments. */
DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
                 const struct bpf_map *map, u32 index),
        TP_ARGS(dev, xdp, tgt, err, map, index)
);

/*
 * Convenience wrappers around the xdp_redirect / xdp_redirect_err
 * tracepoints.
 *
 * The non-map variants pass NULL for both the target and the map, so the
 * event class records @to as the destination ifindex. The _map variants
 * forward @to (the target), @map and @index unchanged. The non-error
 * variants pass 0 as the error value.
 *
 * The expansions carry no trailing semicolon; callers supply their own.
 * That keeps each wrapper usable as the only statement of an if/else arm,
 * where a semicolon inside the macro would produce an empty extra
 * statement and detach the else.
 */
#define _trace_xdp_redirect(dev, xdp, to)                \
         trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to)

#define _trace_xdp_redirect_err(dev, xdp, to, err)        \
         trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to)

#define _trace_xdp_redirect_map(dev, xdp, to, map, index)                \
         trace_xdp_redirect(dev, xdp, to, 0, map, index)

#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err)        \
         trace_xdp_redirect_err(dev, xdp, to, err, map, index)

/* not used anymore, but kept around so as not to break old programs */
/* xdp_redirect_map: instance of xdp_redirect_template with the same arguments. */
DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
                 const struct bpf_map *map, u32 index),
        TP_ARGS(dev, xdp, tgt, err, map, index)
);

/* xdp_redirect_map_err: instance of xdp_redirect_template with the same arguments. */
DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp,
                 const void *tgt, int err,
                 const struct bpf_map *map, u32 index),
        TP_ARGS(dev, xdp, tgt, err, map, index)
);

/*
 * xdp_cpumap_kthread: records the caller's map_id, processed, drops and
 * sched values, the pass/drop/redirect counters read from @xdp_stats, and
 * the CPU the event fired on (smp_processor_id()). The action field is
 * always stored as XDP_REDIRECT. @xdp_stats is dereferenced
 * unconditionally, so it must not be NULL.
 */
TRACE_EVENT(xdp_cpumap_kthread,

        TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
                 int sched, struct xdp_cpumap_stats *xdp_stats),

        TP_ARGS(map_id, processed, drops, sched, xdp_stats),

        TP_STRUCT__entry(
                __field(int, map_id)
                __field(u32, act)
                __field(int, cpu)
                __field(unsigned int, drops)
                __field(unsigned int, processed)
                __field(int, sched)
                __field(unsigned int, xdp_pass)
                __field(unsigned int, xdp_drop)
                __field(unsigned int, xdp_redirect)
        ),

        TP_fast_assign(
                __entry->map_id                = map_id;
                __entry->act                = XDP_REDIRECT;
                __entry->cpu                = smp_processor_id();
                __entry->drops                = drops;
                __entry->processed        = processed;
                __entry->sched        = sched;
                __entry->xdp_pass        = xdp_stats->pass;
                __entry->xdp_drop        = xdp_stats->drop;
                __entry->xdp_redirect        = xdp_stats->redirect;
        ),

        TP_printk("kthread"
                  " cpu=%d map_id=%d action=%s"
                  " processed=%u drops=%u"
                  " sched=%d"
                  " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
                  __entry->cpu, __entry->map_id,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->processed, __entry->drops,
                  __entry->sched,
                  __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
);

/*
 * xdp_cpumap_enqueue: records the caller's map_id, processed, drops and
 * to_cpu values plus the CPU the event fired on (smp_processor_id()).
 * The action field is always stored as XDP_REDIRECT.
 */
TRACE_EVENT(xdp_cpumap_enqueue,

        TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
                 int to_cpu),

        TP_ARGS(map_id, processed, drops, to_cpu),

        TP_STRUCT__entry(
                __field(int, map_id)
                __field(u32, act)
                __field(int, cpu)
                __field(unsigned int, drops)
                __field(unsigned int, processed)
                __field(int, to_cpu)
        ),

        TP_fast_assign(
                __entry->map_id                = map_id;
                __entry->act                = XDP_REDIRECT;
                __entry->cpu                = smp_processor_id();
                __entry->drops                = drops;
                __entry->processed        = processed;
                __entry->to_cpu                = to_cpu;
        ),

        TP_printk("enqueue"
                  " cpu=%d map_id=%d action=%s"
                  " processed=%u drops=%u"
                  " to_cpu=%d",
                  __entry->cpu, __entry->map_id,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->processed, __entry->drops,
                  __entry->to_cpu)
);

/*
 * xdp_devmap_xmit: records the ifindex of @from_dev and @to_dev together
 * with the caller's sent/drops/err values. The action field is always
 * stored as XDP_REDIRECT.
 */
TRACE_EVENT(xdp_devmap_xmit,

        TP_PROTO(const struct net_device *from_dev,
                 const struct net_device *to_dev,
                 int sent, int drops, int err),

        TP_ARGS(from_dev, to_dev, sent, drops, err),

        TP_STRUCT__entry(
                __field(int, from_ifindex)
                __field(u32, act)
                __field(int, to_ifindex)
                __field(int, drops)
                __field(int, sent)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->from_ifindex        = from_dev->ifindex;
                __entry->act                = XDP_REDIRECT;
                __entry->to_ifindex        = to_dev->ifindex;
                __entry->drops                = drops;
                __entry->sent                = sent;
                __entry->err                = err;
        ),

        TP_printk("ndo_xdp_xmit"
                  " from_ifindex=%d to_ifindex=%d action=%s"
                  " sent=%d drops=%d"
                  " err=%d",
                  __entry->from_ifindex, __entry->to_ifindex,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->sent, __entry->drops,
                  __entry->err)
);

/* Expect users already include <net/xdp.h>, but not xdp_priv.h */
#include <net/xdp_priv.h>

/*
 * X-macro list of the XDP memory types known to this header.
 *
 * __MEM_TYPE_MAP(FN) applies FN to every type name. __MEM_TYPE_TP_FN
 * turns a name into TRACE_DEFINE_ENUM(MEM_TYPE_<name>), and
 * __MEM_TYPE_SYM_FN turns it into a { MEM_TYPE_<name>, "<name>" } entry.
 * __MEM_TYPE_SYM_TAB is the resulting list for __print_symbolic(),
 * terminated by { -1, NULL }; the terminator's name slot is a pointer,
 * hence NULL rather than 0, matching __XDP_ACT_SYM_TAB.
 */
#define __MEM_TYPE_MAP(FN)        \
        FN(PAGE_SHARED)                \
        FN(PAGE_ORDER0)                \
        FN(PAGE_POOL)                \
        FN(XSK_BUFF_POOL)

#define __MEM_TYPE_TP_FN(x)        \
        TRACE_DEFINE_ENUM(MEM_TYPE_##x);
#define __MEM_TYPE_SYM_FN(x)        \
        { MEM_TYPE_##x, #x },
#define __MEM_TYPE_SYM_TAB        \
        __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, NULL }
/* Emit one TRACE_DEFINE_ENUM() per memory type listed in __MEM_TYPE_MAP. */
__MEM_TYPE_MAP(__MEM_TYPE_TP_FN)

/*
 * mem_disconnect: records the @xa pointer itself plus xa->mem.id,
 * xa->mem.type and xa->allocator. The stored xa pointer is not part of
 * the printed line; the memory type is printed symbolically via
 * __MEM_TYPE_SYM_TAB.
 */
TRACE_EVENT(mem_disconnect,

        TP_PROTO(const struct xdp_mem_allocator *xa),

        TP_ARGS(xa),

        TP_STRUCT__entry(
                __field(const struct xdp_mem_allocator *,        xa)
                __field(u32,                mem_id)
                __field(u32,                mem_type)
                __field(const void *,        allocator)
        ),

        TP_fast_assign(
                __entry->xa                = xa;
                __entry->mem_id                = xa->mem.id;
                __entry->mem_type        = xa->mem.type;
                __entry->allocator        = xa->allocator;
        ),

        TP_printk("mem_id=%d mem_type=%s allocator=%p",
                  __entry->mem_id,
                  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
                  __entry->allocator
        )
);

/*
 * mem_connect: records the @xa and @rxq pointers plus xa->mem.id,
 * xa->mem.type, xa->allocator and rxq->dev->ifindex. The two stored
 * pointers are not part of the printed line. @rxq and rxq->dev are
 * dereferenced unconditionally.
 */
TRACE_EVENT(mem_connect,

        TP_PROTO(const struct xdp_mem_allocator *xa,
                 const struct xdp_rxq_info *rxq),

        TP_ARGS(xa, rxq),

        TP_STRUCT__entry(
                __field(const struct xdp_mem_allocator *,        xa)
                __field(u32,                mem_id)
                __field(u32,                mem_type)
                __field(const void *,        allocator)
                __field(const struct xdp_rxq_info *,                rxq)
                __field(int,                ifindex)
        ),

        TP_fast_assign(
                __entry->xa                = xa;
                __entry->mem_id                = xa->mem.id;
                __entry->mem_type        = xa->mem.type;
                __entry->allocator        = xa->allocator;
                __entry->rxq                = rxq;
                __entry->ifindex        = rxq->dev->ifindex;
        ),

        TP_printk("mem_id=%d mem_type=%s allocator=%p"
                  " ifindex=%d",
                  __entry->mem_id,
                  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
                  __entry->allocator,
                  __entry->ifindex
        )
);

/*
 * mem_return_failed: records the @page pointer together with mem->id and
 * mem->type. The memory type is printed symbolically via
 * __MEM_TYPE_SYM_TAB.
 */
TRACE_EVENT(mem_return_failed,

        TP_PROTO(const struct xdp_mem_info *mem,
                 const struct page *page),

        TP_ARGS(mem, page),

        TP_STRUCT__entry(
                __field(const struct page *,        page)
                __field(u32,                mem_id)
                __field(u32,                mem_type)
        ),

        TP_fast_assign(
                __entry->page                = page;
                __entry->mem_id                = mem->id;
                __entry->mem_type        = mem->type;
        ),

        TP_printk("mem_id=%d mem_type=%s page=%p",
                  __entry->mem_id,
                  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
                  __entry->page
        )
);

#endif /* _TRACE_XDP_H */

#include <trace/define_trace.h>






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the ICMP module.
 *
 * Version:        @(#)icmp.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _ICMP_H
#define        _ICMP_H

#include <linux/icmp.h>

#include <net/inet_sock.h>
#include <net/snmp.h>
#include <net/ip.h>

/*
 * Element type of the icmp_err_convert[] table declared below: an errno
 * value paired with a one-bit "fatal" flag.
 * NOTE(review): presumably indexed by ICMP unreachable code - confirm
 * against the table's definition.
 */
struct icmp_err {
  int                errno;
  unsigned int        fatal:1;
};

extern const struct icmp_err icmp_err_convert[];
/*
 * ICMP statistics helpers.
 *
 * ICMP_INC_STATS / __ICMP_INC_STATS bump @field in the per-netns
 * icmp_statistics MIB. ICMPMSGIN_INC_STATS and ICMPMSGOUT_INC_STATS bump
 * the per-netns icmpmsg_statistics MIB: the "in" counter uses @field
 * directly as the index, the "out" counter uses @field + 256.
 *
 * @field is parenthesised before the offset is added so that an
 * expression argument (e.g. a conditional) is offset as a whole.
 */
#define ICMP_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.icmp_statistics, field)
#define __ICMP_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.icmp_statistics, field)
#define ICMPMSGOUT_INC_STATS(net, field)        SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, (field) + 256)
#define ICMPMSGIN_INC_STATS(net, field)                SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field)

struct dst_entry;
struct net_proto_family;
struct sk_buff;
struct net;

void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
                 const struct ip_options *opt);

/*
 * Wrapper around __icmp_send() that takes the IP options from
 * IPCB(skb_in)->opt instead of requiring the caller to pass them.
 */
static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
        const struct ip_options *opt = &IPCB(skb_in)->opt;

        __icmp_send(skb_in, type, code, info, opt);
}

#if IS_ENABLED(CONFIG_NF_NAT)
void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
#else
/*
 * Without CONFIG_NF_NAT this is __icmp_send() called with a
 * zero-initialised on-stack option block, rather than with
 * &IPCB(skb_in)->opt as icmp_send() does.
 */
static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
        struct ip_options no_opts = { 0 };

        __icmp_send(skb_in, type, code, info, &no_opts);
}
#endif

int icmp_rcv(struct sk_buff *skb);
int icmp_err(struct sk_buff *skb, u32 info);
int icmp_init(void);
void icmp_out_count(struct net *net, unsigned char type);

#endif        /* _ICMP_H */



































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/ipc/util.h
 * Copyright (C) 1999 Christoph Rohland
 *
 * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * namespaces support.      2006 OpenVZ, SWsoft Inc.
 *                               Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _IPC_UTIL_H
#define _IPC_UTIL_H

#include <linux/unistd.h>
#include <linux/err.h>
#include <linux/ipc_namespace.h>

/*
 * The IPC ID contains 2 separate numbers - index and sequence number.
 * By default,
 *   bits  0-14: index (32k, 15 bits)
 *   bits 15-30: sequence number (64k, 16 bits)
 *
 * When IPCMNI extension mode is turned on, the composition changes:
 *   bits  0-23: index (16M, 24 bits)
 *   bits 24-30: sequence number (128, 7 bits)
 *
 * IPCMNI_SHIFT / IPCMNI_EXTEND_SHIFT are the index widths of the two
 * layouts; IPCMNI / IPCMNI_EXTEND are the matching index counts
 * (1 << shift). IPCMNI_EXTEND_MIN_CYCLE is RADIX_TREE_MAP_SIZE squared.
 * NOTE(review): presumably the ipc_min_cycle value used in extended
 * mode - confirm where it is assigned.
 */
#define IPCMNI_SHIFT                15
#define IPCMNI_EXTEND_SHIFT        24
#define IPCMNI_EXTEND_MIN_CYCLE        (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
#define IPCMNI                        (1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND                (1 << IPCMNI_EXTEND_SHIFT)

/*
 * With CONFIG_SYSVIPC_SYSCTL the id limit (ipc_mni), the index width
 * (ipc_mni_shift) and the minimum cycle (ipc_min_cycle) are variables
 * defined elsewhere. Without it they collapse to compile-time constants:
 * IPCMNI, IPCMNI_SHIFT and RADIX_TREE_MAP_SIZE respectively.
 *
 * ipcmni_seq_shift() is the number of low bits occupied by the index,
 * and IPCMNI_IDX_MASK is the mask selecting those bits.
 */
#ifdef CONFIG_SYSVIPC_SYSCTL
extern int ipc_mni;
extern int ipc_mni_shift;
extern int ipc_min_cycle;

#define ipcmni_seq_shift()        ipc_mni_shift
#define IPCMNI_IDX_MASK                ((1 << ipc_mni_shift) - 1)

#else /* CONFIG_SYSVIPC_SYSCTL */

#define ipc_mni                        IPCMNI
#define ipc_min_cycle                ((int)RADIX_TREE_MAP_SIZE)
#define ipcmni_seq_shift()        IPCMNI_SHIFT
#define IPCMNI_IDX_MASK                ((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */

void sem_init(void);
void msg_init(void);
void shm_init(void);

struct ipc_namespace;
struct pid_namespace;

/*
 * POSIX message queue hooks: real functions (defined elsewhere) when
 * CONFIG_POSIX_MQUEUE is set, empty inline stubs otherwise.
 */
#ifdef CONFIG_POSIX_MQUEUE
extern void mq_clear_sbinfo(struct ipc_namespace *ns);
extern void mq_put_mnt(struct ipc_namespace *ns);
#else
static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
static inline void mq_put_mnt(struct ipc_namespace *ns) { }
#endif

/*
 * Per-namespace init/exit hooks for semaphores, message queues and shared
 * memory: real functions (defined elsewhere) when CONFIG_SYSVIPC is set,
 * empty inline stubs otherwise.
 */
#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
void msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
static inline void msg_init_ns(struct ipc_namespace *ns) { }
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
static inline void shm_exit_ns(struct ipc_namespace *ns) { }
#endif

/*
 * Structure that holds the parameters needed by the ipc operations
 * (see after)
 *
 * It is the second argument of the getnew() and more_checks() callbacks
 * in struct ipc_ops.
 */
struct ipc_params {
        key_t key;        /* NOTE(review): presumably the user-supplied IPC key - confirm */
        int flg;        /* NOTE(review): presumably the *get() flags argument - confirm */
        union {
                size_t size;        /* for shared memories */
                int nsems;        /* for semaphores */
        } u;                        /* holds the getnew() specific param */
};

/*
 * Structure that holds some ipc operations. This structure is used to unify
 * the calls to sys_msgget(), sys_semget(), sys_shmget()
 *      . routine to call to create a new ipc object. Can be one of newque,
 *        newary, newseg
 *      . routine to call to check permissions for a new ipc object.
 *        Can be one of security_msg_associate, security_sem_associate,
 *        security_shm_associate
 *      . routine to call for an extra check if needed
 *
 * The three bullets above describe the members below in declaration order.
 */
struct ipc_ops {
        int (*getnew)(struct ipc_namespace *, struct ipc_params *);        /* create a new object */
        int (*associate)(struct kern_ipc_perm *, int);        /* permission check */
        int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);        /* optional extra check */
};

struct seq_file;
struct ipc_ids;

void ipc_init_ids(struct ipc_ids *ids);
/*
 * /proc support: with CONFIG_PROC_FS these are real functions (defined
 * elsewhere); without it ipc_init_proc_interface() expands to an empty
 * statement. No fallback is provided here for ipc_seq_pid_ns().
 */
#ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface(const char *path, const char *header,
                int ids, int (*show)(struct seq_file *, void *));
struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#else
#define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
#endif

/*
 * Identifier-set selectors.
 * NOTE(review): presumably the values passed as the 'ids' argument of
 * ipc_init_proc_interface() - confirm against its callers.
 */
#define IPC_SEM_IDS        0
#define IPC_MSG_IDS        1
#define IPC_SHM_IDS        2

/*
 * Split an IPC id into its two parts: ipcid_to_idx() keeps the low index
 * bits, ipcid_to_seqx() shifts them away to leave the sequence number.
 * ipcid_seq_max() is INT_MAX shifted the same way, i.e. the largest
 * sequence value that fits above the index bits.
 */
#define ipcid_to_idx(id)  ((id) & IPCMNI_IDX_MASK)
#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift())
#define ipcid_seq_max()          (INT_MAX >> ipcmni_seq_shift())

/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);

/* must be called with both locks acquired. */
void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);

/* must be called with both locks acquired. */
void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);

/* must be called with ipcp locked */
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);

/**
 * ipc_get_maxidx - get the highest assigned index
 * @ids: ipc identifier set
 *
 * Returns -1 when no identifier is in use, ipc_mni - 1 when every slot is
 * in use, and the recorded ids->max_idx otherwise.
 *
 * Called with ipc_ids.rwsem held for reading.
 */
static inline int ipc_get_maxidx(struct ipc_ids *ids)
{
        if (!ids->in_use)
                return -1;

        return (ids->in_use == ipc_mni) ? ipc_mni - 1 : ids->max_idx;
}

/*
 * For allocations that need to be freed by RCU.
 * Objects are reference counted, they start with reference count 1.
 * getref increases the refcount, the putref call that reduces the refcount
 * to 0 schedules the rcu destruction. Caller must guarantee locking.
 *
 * refcount is initialized by ipc_addid(), before that point call_rcu()
 * must be used.
 */
bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
void ipc_rcu_putref(struct kern_ipc_perm *ptr,
                        void (*func)(struct rcu_head *head));

struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id);

void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
                                             struct ipc_ids *ids, int id, int cmd,
                                             struct ipc64_perm *perm, int extra_perm);

/*
 * ipc_update_pid() - make *@pos refer to @pid.
 *
 * Nothing happens when *@pos already equals @pid. Otherwise the result of
 * get_pid(@pid) is stored in *@pos first, and only then is put_pid() called
 * on the value that was stored there before.
 */
static inline void ipc_update_pid(struct pid **pos, struct pid *pid)
{
        struct pid *prev = *pos;

        if (prev == pid)
                return;

        *pos = get_pid(pid);
        put_pid(prev);
}

/* Only declared when CONFIG_ARCH_WANT_IPC_PARSE_VERSION is set. */
#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
int ipc_parse_version(int *cmd);
#endif

/*
 * struct msg_msg helpers. load_msg() and store_msg() take a __user pointer
 * plus a length; copy_msg() and free_msg() work on msg_msg objects only.
 */
extern void free_msg(struct msg_msg *msg);
extern struct msg_msg *load_msg(const void __user *src, size_t len);
extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);

/*
 * ipc_checkid() - compare the sequence part of @id with @ipcp->seq.
 *
 * Return: 0 when ipcid_to_seqx(@id) equals @ipcp->seq, 1 when it differs.
 */
static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id)
{
        if (ipcid_to_seqx(id) == ipcp->seq)
                return 0;

        return 1;
}

/* Acquire the spinlock embedded in @perm (perm->lock). */
static inline void ipc_lock_object(struct kern_ipc_perm *perm)
{
        spin_lock(&perm->lock);
}

/* Release the spinlock embedded in @perm (perm->lock). */
static inline void ipc_unlock_object(struct kern_ipc_perm *perm)
{
        spin_unlock(&perm->lock);
}

/* Debug aid: runs assert_spin_locked() on @perm's lock. */
static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
{
        assert_spin_locked(&perm->lock);
}

/*
 * Release @perm's lock and then call rcu_read_unlock(), in that order.
 * NOTE(review): this implies the caller entered with the object locked and
 * inside an RCU read-side section -- confirm at the call sites.
 */
static inline void ipc_unlock(struct kern_ipc_perm *perm)
{
        ipc_unlock_object(perm);
        rcu_read_unlock();
}

/*
 * ipc_valid_object() - report whether an ipc object is still usable.
 *
 * Intended for code paths that do not hold the corresponding ipc_ids.rwsem
 * and therefore can race with IPC_RMID: ipc_rmid() may already have freed
 * the ID while the caller was still spinning on the ipc lock.
 *
 * Must be called with kern_ipc_perm.lock held; the single exception is a
 * checkpoint in sys_semtimedop(), documented in the code there.
 *
 * Return: false once @perm->deleted is set, true otherwise.
 */
static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
{
        if (perm->deleted)
                return false;

        return true;
}

/*
 * Declarations only; the definitions are not in the visible part of this
 * header. free_ipcs() takes a callback @free that is handed the namespace
 * and one kern_ipc_perm.
 */
struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
                        const struct ipc_ops *ops, struct ipc_params *params);
void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
                void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));

/*
 * sem_check_semmni() - range-check the semmni setting of a namespace.
 *
 * semmni is the last element (index 3) of the four-entry sem_ctls[] array
 * and must lie within [0, ipc_mni].
 *
 * Return: 0 when the value is in range, -ERANGE otherwise.
 */
static inline int sem_check_semmni(struct ipc_namespace *ns)
{
        if (ns->sem_ctls[3] >= 0 && ns->sem_ctls[3] <= ipc_mni)
                return 0;

        return -ERANGE;
}

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
/*
 * Permission record built from compat (__compat_*) field types. Used by
 * to_compat_ipc_perm() and get_compat_ipc_perm() declared below.
 * NOTE(review): field order and sizes presumably mirror a userspace ABI
 * structure -- do not reorder without checking.
 */
struct compat_ipc_perm {
        key_t key;
        __compat_uid_t uid;
        __compat_gid_t gid;
        __compat_uid_t cuid;
        __compat_gid_t cgid;
        compat_mode_t mode;
        unsigned short seq;
};

/*
 * Converters between struct ipc64_perm and the compat layouts. The get_*()
 * variants take a __user pointer to the compat structure and return int.
 * Parameters are unnamed here; going by the function names, the first
 * argument is presumably the destination -- confirm against the definitions.
 */
void to_compat_ipc_perm(struct compat_ipc_perm *, struct ipc64_perm *);
void to_compat_ipc64_perm(struct compat_ipc64_perm *, struct ipc64_perm *);
int get_compat_ipc_perm(struct ipc64_perm *, struct compat_ipc_perm __user *);
int get_compat_ipc64_perm(struct ipc64_perm *,
                          struct compat_ipc64_perm __user *);

/*
 * compat_ipc_parse_version() - split the IPC_64 flag off a command word.
 * @cmd: in/out; on return the IPC_64 bit has been cleared in *@cmd.
 *
 * Return: the IPC_64 bit as it was set in *@cmd on entry (IPC_64 or 0).
 */
static inline int compat_ipc_parse_version(int *cmd)
{
        int requested = *cmd;

        *cmd = requested & ~IPC_64;
        return requested & IPC_64;
}

/*
 * compat_ksys_*(): compat-typed entry points for the semctl, msgctl, msgrcv,
 * msgsnd and shmctl operations. Declared only inside the surrounding
 * CONFIG_COMPAT section.
 */
long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg);
long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr);
long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz,
                        compat_long_t msgtyp, int msgflg);
long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp,
                       compat_ssize_t msgsz, int msgflg);
long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr);

#endif

#endif








































































































    1 



















































































    1 




    1 












    1 




    1 
    1 
    1 















    1 












    1 












    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */

#ifndef _EXT4_EXTENTS
#define _EXT4_EXTENTS

#include "ext4.h"

/*
 * Debug switches.
 *
 * NOTE: each #define below carries one or two trailing underscores, so the
 * switch named in its comment (AGGRESSIVE_TEST, EXTENTS_STATS,
 * CHECK_BINSEARCH, EXT_STATS) is NOT defined by these lines as written.
 */

/*
 * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
 * becomes very small, so index split, in-depth growing and
 * other hard changes happen much more often.
 * This is for debug purposes only.
 */
#define AGGRESSIVE_TEST_

/*
 * With EXTENTS_STATS defined, the number of blocks and extents
 * are collected in the truncate path. They'll be shown at
 * umount time.
 */
#define EXTENTS_STATS__

/*
 * If CHECK_BINSEARCH is defined, then the results of the binary search
 * will also be checked by linear search.
 */
#define CHECK_BINSEARCH__

/*
 * If EXT_STATS is defined then stats numbers are collected.
 * These numbers will be displayed at umount time.
 */
#define EXT_STATS_


/*
 * ext4_inode has i_block array (60 bytes total).
 * The first 12 bytes store ext4_extent_header;
 * the remainder stores an array of ext4_extent.
 * For non-inode extent blocks, ext4_extent_tail
 * follows the array.
 */

/*
 * This is the extent tail on-disk structure.
 * All other extent structures are 12 bytes long.  It turns out that
 * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
 * covers all valid ext4 block sizes.  Therefore, this tail structure can be
 * crammed into the end of the block without having to rebalance the tree.
 *
 * The tail is a single little-endian 32-bit checksum (4 bytes), which is
 * why a remainder of at least 4 bytes is sufficient.
 */
struct ext4_extent_tail {
        __le32        et_checksum;        /* crc32c(uuid+inum+extent_block) */
};

/*
 * This is the extent on-disk structure.
 * It's used at the bottom of the tree.
 *
 * Size: 4 + 2 + 2 + 4 = 12 bytes; all fields are little-endian. The
 * physical start block is split into a 16-bit high part and a 32-bit low
 * part (see ext4_ext_pblock() / ext4_ext_store_pblock()).
 */
struct ext4_extent {
        __le32        ee_block;        /* first logical block extent covers */
        __le16        ee_len;                /* number of blocks covered by extent */
        __le16        ee_start_hi;        /* high 16 bits of physical block */
        __le32        ee_start_lo;        /* low 32 bits of physical block */
};

/*
 * This is the index on-disk structure.
 * It's used at all the levels except the bottom.
 *
 * Size: 4 + 4 + 2 + 2 = 12 bytes. The physical block of the next level is
 * split into ei_leaf_lo (32 bits) and ei_leaf_hi (16 bits); see
 * ext4_idx_pblock() / ext4_idx_store_pblock().
 */
struct ext4_extent_idx {
        __le32        ei_block;        /* index covers logical blocks from 'block' */
        __le32        ei_leaf_lo;        /* pointer to the physical block of the next *
                                 * level. leaf or next index could be there */
        __le16        ei_leaf_hi;        /* high 16 bits of physical block */
        __u16        ei_unused;
};

/*
 * Each block (leaves and indexes), even inode-stored has header.
 *
 * Size: 4 * 2 + 4 = 12 bytes; all fields are little-endian. eh_entries and
 * eh_max are what the EXT_LAST_*() / EXT_MAX_*() macros below read to find
 * the last used and last possible entry after the header.
 */
struct ext4_extent_header {
        __le16        eh_magic;        /* probably will support different formats */
        __le16        eh_entries;        /* number of valid entries */
        __le16        eh_max;                /* capacity of store in entries */
        __le16        eh_depth;        /* has tree real underlying blocks? */
        __le32        eh_generation;        /* generation of the tree */
};

/*
 * 0xf30a converted to little-endian; presumably compared directly against
 * the on-disk eh_magic field -- confirm at the users.
 */
#define EXT4_EXT_MAGIC                cpu_to_le16(0xf30a)
#define EXT4_MAX_EXTENT_DEPTH 5

/*
 * Byte offset of the extent tail from the start of the header @hdr:
 * the header itself plus eh_max entries of sizeof(struct ext4_extent).
 */
#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
        (sizeof(struct ext4_extent_header) + \
         (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))

/*
 * find_ext4_extent_tail() - locate the tail that follows an extent block.
 * @eh: header at the start of the block
 *
 * Return: pointer EXT4_EXTENT_TAIL_OFFSET(@eh) bytes past @eh.
 */
static inline struct ext4_extent_tail *
find_ext4_extent_tail(struct ext4_extent_header *eh)
{
        void *base = eh;

        /* byte-granular arithmetic on void * (GNU C), as the file already relies on */
        return (struct ext4_extent_tail *)(base + EXT4_EXTENT_TAIL_OFFSET(eh));
}

/*
 * Array of ext4_ext_path contains path to some extent.
 * Creation/lookup routines use it for traversal/splitting/etc.
 * Truncate uses it to simulate recursive walking.
 *
 * Each element carries a physical block number (p_block), two depth
 * values, and pointers to an extent, an index entry, a header and a
 * buffer_head.
 * NOTE(review): presumably one element per tree level, with p_ext set at
 * the leaf and p_idx at index levels -- confirm against the users.
 */
struct ext4_ext_path {
        ext4_fsblk_t                        p_block;
        __u16                                p_depth;
        __u16                                p_maxdepth;
        struct ext4_extent                *p_ext;
        struct ext4_extent_idx                *p_idx;
        struct ext4_extent_header        *p_hdr;
        struct buffer_head                *p_bh;
};

/*
 * Used to record a portion of a cluster found at the beginning or end
 * of an extent while traversing the extent tree during space removal.
 * A partial cluster may be removed if it does not contain blocks shared
 * with extents that aren't being deleted (tofree state).  Otherwise,
 * it cannot be removed (nofree state).
 *
 * The third state, "initial", is not described above; presumably it means
 * no partial cluster has been recorded yet -- confirm against the users.
 */
struct partial_cluster {
        ext4_fsblk_t pclu;  /* physical cluster number */
        ext4_lblk_t lblk;   /* logical block number within logical cluster */
        enum {initial, tofree, nofree} state;
};

/*
 * structure for external API
 */

/*
 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
 * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
 * MSB of ee_len field in the extent datastructure to signify if this
 * particular extent is an initialized extent or an unwritten (i.e.
 * preallocated).
 * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
 * unwritten extent.
 * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
 * unwritten one. In other words, if MSB of ee_len is set, it is an
 * unwritten extent with only one special scenario when ee_len = 0x8000.
 * In this case we can not have an unwritten extent of zero length and
 * thus we make it as a special case of initialized extent with 0x8000 length.
 * This way we get better extent-to-group alignment for initialized extents.
 * Hence, the maximum number of blocks we can have in an *initialized*
 * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
 */
#define EXT_INIT_MAX_LEN        (1UL << 15)
#define EXT_UNWRITTEN_MAX_LEN        (EXT_INIT_MAX_LEN - 1)


/*
 * Entry-array accessors for a block that starts with ext4_extent_header.
 *
 *   EXT_FIRST_EXTENT / EXT_FIRST_INDEX - address right after the header,
 *                                        typed as extent / index entry
 *   EXT_HAS_FREE_INDEX                 - true while eh_entries < eh_max for
 *                                        the header in (__path__)->p_hdr
 *   EXT_LAST_EXTENT / EXT_LAST_INDEX   - first entry + eh_entries - 1
 *   EXT_MAX_EXTENT / EXT_MAX_INDEX     - first entry + eh_max - 1, or 0
 *                                        (a null pointer) when eh_max is 0
 *
 * These are macros: the argument is expanded more than once in most of
 * them, so do not pass expressions with side effects.
 */
#define EXT_FIRST_EXTENT(__hdr__) \
        ((struct ext4_extent *) (((char *) (__hdr__)) +                \
                                 sizeof(struct ext4_extent_header)))
#define EXT_FIRST_INDEX(__hdr__) \
        ((struct ext4_extent_idx *) (((char *) (__hdr__)) +        \
                                     sizeof(struct ext4_extent_header)))
#define EXT_HAS_FREE_INDEX(__path__) \
        (le16_to_cpu((__path__)->p_hdr->eh_entries) \
                                     < le16_to_cpu((__path__)->p_hdr->eh_max))
#define EXT_LAST_EXTENT(__hdr__) \
        (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_LAST_INDEX(__hdr__) \
        (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_MAX_EXTENT(__hdr__)        \
        ((le16_to_cpu((__hdr__)->eh_max)) ? \
        ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
                                        : 0)
#define EXT_MAX_INDEX(__hdr__) \
        ((le16_to_cpu((__hdr__)->eh_max)) ? \
        ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0)

/* Reinterpret the i_data area of @inode's ext4 info as an extent header. */
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
        struct ext4_extent_header *hdr;

        hdr = (struct ext4_extent_header *) EXT4_I(inode)->i_data;
        return hdr;
}

/* Reinterpret the data of buffer @bh (bh->b_data) as an extent header. */
static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
{
        struct ext4_extent_header *hdr;

        hdr = (struct ext4_extent_header *) bh->b_data;
        return hdr;
}

static inline unsigned short ext_depth(struct inode *inode)
{
        return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}

/*
 * Flag @ext as unwritten by setting the EXT_INIT_MAX_LEN bit in ee_len.
 *
 * BUGs if ee_len has no bits set apart from that flag bit, because the
 * result would be an unwritten extent of zero length.
 */
static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
        BUG_ON(!(le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN));
        ext->ee_len = ext->ee_len | cpu_to_le16(EXT_INIT_MAX_LEN);
}

/*
 * Return non-zero when @ext is unwritten, i.e. when ee_len is strictly
 * greater than EXT_INIT_MAX_LEN. An ee_len of exactly 0x8000 counts as an
 * initialized extent.
 */
static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
        unsigned short raw_len = le16_to_cpu(ext->ee_len);

        return raw_len > EXT_INIT_MAX_LEN;
}

/*
 * Return the number of blocks @ext covers, with the unwritten flag removed:
 * ee_len itself when it is <= EXT_INIT_MAX_LEN, otherwise
 * ee_len - EXT_INIT_MAX_LEN.
 */
static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
{
        unsigned short raw_len = le16_to_cpu(ext->ee_len);

        if (raw_len <= EXT_INIT_MAX_LEN)
                return raw_len;

        return raw_len - EXT_INIT_MAX_LEN;
}

/*
 * Mark @ext as initialized by rewriting ee_len with the length reported by
 * ext4_ext_get_actual_len().
 */
static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
{
        int actual_len = ext4_ext_get_actual_len(ext);

        ext->ee_len = cpu_to_le16(actual_len);
}

/*
 * ext4_ext_pblock:
 * build the physical block number of extent @ex from ee_start_lo (bits
 * 0..31) and ee_start_hi (bits 32..47).
 *
 * The high part is shifted by 31 and then by 1 rather than by 32 in one
 * step; presumably so the shift stays defined if ext4_fsblk_t is only
 * 32 bits wide.
 */
static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
{
        ext4_fsblk_t lo = le32_to_cpu(ex->ee_start_lo);
        ext4_fsblk_t hi = le16_to_cpu(ex->ee_start_hi);

        return lo | ((hi << 31) << 1);
}

/*
 * ext4_idx_pblock:
 * build the physical block number index entry @ix points to from
 * ei_leaf_lo (bits 0..31) and ei_leaf_hi (bits 32..47).
 *
 * The high part is shifted by 31 and then by 1 rather than by 32 in one
 * step; presumably so the shift stays defined if ext4_fsblk_t is only
 * 32 bits wide.
 */
static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
{
        ext4_fsblk_t lo = le32_to_cpu(ix->ei_leaf_lo);
        ext4_fsblk_t hi = le16_to_cpu(ix->ei_leaf_hi);

        return lo | ((hi << 31) << 1);
}

/*
 * ext4_ext_store_pblock:
 * split physical block number @pb and store it in extent @ex: the low
 * 32 bits go to ee_start_lo, bits 32..47 go to ee_start_hi. Both are
 * written in little-endian form.
 */
static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
                                         ext4_fsblk_t pb)
{
        unsigned long lo = (unsigned long) (pb & 0xffffffff);
        unsigned long hi = (unsigned long) ((pb >> 31) >> 1) & 0xffff;

        ex->ee_start_lo = cpu_to_le32(lo);
        ex->ee_start_hi = cpu_to_le16(hi);
}

/*
 * ext4_idx_store_pblock:
 * split physical block number @pb and store it in index entry @ix: the
 * low 32 bits go to ei_leaf_lo, bits 32..47 go to ei_leaf_hi. Both are
 * written in little-endian form.
 */
static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
                                         ext4_fsblk_t pb)
{
        unsigned long lo = (unsigned long) (pb & 0xffffffff);
        unsigned long hi = (unsigned long) ((pb >> 31) >> 1) & 0xffff;

        ix->ei_leaf_lo = cpu_to_le32(lo);
        ix->ei_leaf_hi = cpu_to_le16(hi);
}

#endif /* _EXT4_EXTENTS */
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DELAY_H
#define _LINUX_DELAY_H

/*
 * Copyright (C) 1993 Linus Torvalds
 *
 * Delay routines, using a pre-computed "loops_per_jiffy" value.
 *
 * Please note that ndelay(), udelay() and mdelay() may return early for
 * several reasons:
 *  1. computed loops_per_jiffy too low (due to the time taken to
 *     execute the timer interrupt.)
 *  2. cache behaviour affecting the time it takes to execute the
 *     loop function.
 *  3. CPU clock rate changes.
 *
 * Please see this thread:
 *   https://lists.openwall.net/linux-kernel/2011/01/09/56
 */

#include <linux/kernel.h>

extern unsigned long loops_per_jiffy;

#include <asm/delay.h>

/*
 * Using udelay() for intervals greater than a few milliseconds can
 * risk overflow for high loops_per_jiffy (high bogomips) machines. The
 * mdelay() provides a wrapper to prevent this.  For delays greater
 * than MAX_UDELAY_MS milliseconds, the wrapper is used.  Architecture
 * specific values can be defined in asm/delay.h as an override.
 * The constant-argument branch of mdelay() ensures GCC will optimize away
 * the while loop for the common cases where n <= MAX_UDELAY_MS  --  Paul G.
 */

/* Default threshold in milliseconds; used only if asm/delay.h did not set one. */
#ifndef MAX_UDELAY_MS
#define MAX_UDELAY_MS        5
#endif

/*
 * mdelay(n) - busy-wait for n milliseconds (generic fallback).
 *
 * When n is a compile-time constant no larger than MAX_UDELAY_MS, this is
 * a single udelay(n * 1000). Otherwise n is copied into a local counter and
 * udelay(1000) is called once per millisecond.
 * n is expanded more than once, so avoid arguments with side effects.
 */
#ifndef mdelay
#define mdelay(n) (\
        (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
        ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

/*
 * Generic ndelay() fallback, used only when no ndelay macro was defined
 * earlier. @x nanoseconds are rounded UP to whole microseconds and handed
 * to udelay(), so the actual delay is at microsecond granularity.
 *
 * The self-referential #define afterwards makes "ndelay" visible to later
 * #ifndef ndelay checks.
 */
#ifndef ndelay
static inline void ndelay(unsigned long x)
{
        udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

/*
 * Calibration state and sleeping primitives, declared here and defined
 * elsewhere.
 */
extern unsigned long lpj_fine;
void calibrate_delay(void);
/* Declared weak: callers must tolerate there being no strong definition. */
void __attribute__((weak)) calibration_delay_done(void);
void msleep(unsigned int msecs);
/* Unlike msleep(), returns a value (unsigned long); presumably the time left if interrupted -- confirm. */
unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range(unsigned long min, unsigned long max);

/*
 * ssleep() - sleep for @seconds seconds by calling msleep().
 *
 * The conversion to milliseconds is done in unsigned int arithmetic, so it
 * wraps for @seconds greater than UINT_MAX / 1000.
 */
static inline void ssleep(unsigned int seconds)
{
        unsigned int msecs = seconds * 1000;

        msleep(msecs);
}

/*
 * fsleep() - delay for @usecs microseconds using the mechanism suited to
 * the duration; see Documentation/timers/timers-howto.rst for the
 * thresholds.
 *
 *   @usecs <= 10      : udelay(@usecs)
 *   @usecs <= 20000   : usleep_range(@usecs, 2 * @usecs)
 *   anything longer   : msleep() with @usecs rounded up to milliseconds
 */
static inline void fsleep(unsigned long usecs)
{
        if (usecs > 20000) {
                msleep(DIV_ROUND_UP(usecs, 1000));
                return;
        }

        if (usecs > 10) {
                usleep_range(usecs, 2 * usecs);
                return;
        }

        udelay(usecs);
}

#endif /* defined(_LINUX_DELAY_H) */

































































































































































































    2 
    2 





    2 


















    2 



































    2 


    2 
























































































































































































































































































































































    1 


    1 












    1 


    1 













    1 

    1 


























    1 

    1 










    1 

    1 














    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 


































    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                        Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *        Fixes:
 *                Alan Cox        :        Fixed the worst of the load
 *                                        balancer bugs.
 *                Dave Platt        :        Interrupt stacking fix.
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        Changed buffer format.
 *                Alan Cox        :        destructor hook for AF_UNIX etc.
 *                Linus Torvalds        :        Better skb_clone.
 *                Alan Cox        :        Added skb_copy.
 *                Alan Cox        :        Added all the changed routines Linus
 *                                        only put in the headers
 *                Ray VanTassle        :        Fixed --skb->lock in free
 *                Alan Cox        :        skb_copy copy arp field
 *                Andi Kleen        :        slabified it.
 *                Robert Olsson        :        Removed skb_head_pool
 *
 *        NOTE:
 *                The __skb_ routines should be called with interrupts
 *        disabled, or you better be *real* sure that the operation is atomic
 *        with respect to whatever list is being frobbed (e.g. via lock_sock()
 *        or via disabling bottom half handlers, etc).
 */

/*
 *        The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "datagram.h"
#include "sock_destructor.h"

/*
 * Slab cache for plain sk_buff heads: __alloc_skb() picks it when
 * SKB_ALLOC_FCLONE is not requested, and __build_skb() allocates from it.
 * Not static, so it is visible to other translation units.
 */
struct kmem_cache *skbuff_head_cache __ro_after_init;
/*
 * Slab cache __alloc_skb() picks when SKB_ALLOC_FCLONE is requested; the
 * returned head is then treated as the skb1 member of a
 * struct sk_buff_fclones.
 */
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
/*
 * Only built with CONFIG_SKB_EXTENSIONS.
 * NOTE(review): presumably backs skb extension objects; its users are
 * outside this part of the file - confirm.
 */
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
/*
 * Initialised to MAX_SKB_FRAGS and exported to modules.
 * NOTE(review): the name suggests a sysctl-tunable limit on the number of
 * skb fragments; the sysctl wiring is not visible here - confirm.
 */
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 *        skb_panic - private function for out-of-line support
 *        @skb:        buffer whose state is dumped
 *        @sz:        size, reported as "put:" in the message
 *        @addr:        address, reported as "text:" in the message
 *        @msg:        skb_over_panic or skb_under_panic
 *
 *        Out-of-line support for skb_put() and skb_push().
 *        Called via the wrapper skb_over_panic() or skb_under_panic().
 *        Keep out of line to prevent kernel bloat.
 *        __builtin_return_address is not used because it is not always reliable.
 *
 *        Logs the buffer geometry (len, head, data, tail, end) and the name of
 *        the attached device, or "<NULL>" when @skb has no device, at
 *        emergency level and then calls BUG().
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        /*
         * @sz and skb->len are unsigned quantities, so they are printed with
         * %u: this path is reached on corrupted or out-of-range lengths, and
         * %d would render anything above INT_MAX as a negative number.
         */
        pr_emerg("%s: text:%px len:%u put:%u head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data,
                 (unsigned long)skb->tail, (unsigned long)skb->end,
                 skb->dev ? skb->dev->name : "<NULL>");
        BUG();
}

/*
 * Forward to skb_panic() using this function's own name (__func__) as the
 * message tag; skb_panic() logs the skb state and then calls BUG().
 * NOTE(review): presumably the overrun (skb_put()) case - the call sites
 * are outside this part of the file.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/*
 * Forward to skb_panic() using this function's own name (__func__) as the
 * message tag; skb_panic() logs the skb state and then calls BUG().
 * NOTE(review): presumably the underrun (skb_push()) case - the call sites
 * are outside this part of the file.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve() is a wrapper around kmalloc_node_track_caller() that
 * tells the caller whether emergency pfmemalloc reserves are being used.
 * If they are and the socket is later found to be SOCK_MEMALLOC, then
 * PFMEMALLOC reserves may be used. Otherwise, the packet data may be
 * discarded until enough memory is free.
 *
 * The macro forwards to __kmalloc_reserve(), supplying _RET_IP_ as its
 * @ip argument.
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
         __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

/*
 * Allocate @size bytes on @node, touching the pfmemalloc reserves only as a
 * fallback.
 *
 * The first attempt adds __GFP_NOMEMALLOC | __GFP_NOWARN to @flags. Only if
 * that attempt fails and gfp_pfmemalloc_allowed(@flags) holds is the
 * allocation retried with the caller's unmodified @flags.
 *
 * When @pfmemalloc is non-NULL it is set to true if the retry was attempted
 * and to false otherwise. @ip is accepted but not referenced in the body.
 *
 * Returns the allocated memory, or NULL if no attempt succeeded.
 */
static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
                               unsigned long ip, bool *pfmemalloc)
{
        bool used_reserves = false;
        void *mem;

        /* Regular allocation first, with the reserves explicitly off limits. */
        mem = kmalloc_node_track_caller(size,
                                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                        node);

        /* Fall back to the reserves only when we are entitled to them. */
        if (!mem && gfp_pfmemalloc_allowed(flags)) {
                used_reserves = true;
                mem = kmalloc_node_track_caller(size, flags, node);
        }

        if (pfmemalloc)
                *pfmemalloc = used_reserves;

        return mem;
}

/*         Allocate a new skbuff. We do this ourselves so we can fill in a few
 *        'private' fields and also do memory statistics to find all the
 *        [BEEP] leaks.
 *
 */

/**
 *        __alloc_skb        -        allocate a network buffer
 *        @size: size to allocate
 *        @gfp_mask: allocation mask
 *        @flags: If SKB_ALLOC_FCLONE is set, the head is taken from the fclone
 *                cache instead of the head cache, so a cloned (child) skb is
 *                allocated along with it.
 *                If SKB_ALLOC_RX is set and sk_memalloc_socks() is true,
 *                __GFP_MEMALLOC is added to @gfp_mask in case the data is
 *                required for writeback.
 *        @node: numa node to allocate memory on
 *
 *        Allocate a new &sk_buff together with its data area. The returned
 *        buffer has no headroom and a tail room of at least @size bytes, and
 *        its reference count is one. Returns the buffer, or %NULL when either
 *        the head or the data area cannot be allocated.
 *
 *        Buffers may only be allocated from interrupts using a @gfp_mask of
 *        %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        const bool want_fclone = flags & SKB_ALLOC_FCLONE;
        struct skb_shared_info *shinfo;
        struct kmem_cache *cache;
        struct sk_buff *skb;
        bool pfmemalloc;
        u8 *data;

        if (want_fclone)
                cache = skbuff_fclone_cache;
        else
                cache = skbuff_head_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* The head comes first; if it fails there is nothing to undo. */
        skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
        if (!skb)
                return NULL;
        prefetchw(skb);

        /* Best effort at giving skb_shared_info its own cache line: this
         * usually works because kmalloc(X > SMP_CACHE_BYTES) hands out
         * aligned blocks, unless SLUB/SLAB debug is enabled. Both skb->head
         * and skb_shared_info are cache line aligned.
         */
        size = SKB_DATA_ALIGN(size);
        size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
        if (!data) {
                /* No data area: hand the head back to its cache. */
                kmem_cache_free(cache, skb);
                return NULL;
        }

        /* kmalloc(size) may have returned more room than requested. Place
         * skb_shared_info at the very end of the allocated zone so the
         * buffer can be filled as far as possible before a reallocation.
         */
        size = SKB_WITH_OVERHEAD(ksize(data));
        prefetchw(data + size);

        /*
         * Zero only the fields that are not explicitly initialised below.
         * For that reason no further fields may be placed after the tail
         * pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));

        /* Buffer geometry: empty buffer, data starting at head. */
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb->end = skb->tail + size;
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;

        /* Accounting covers the allocated memory: skb + skb->head. */
        skb->truesize = SKB_TRUESIZE(size);
        skb->pfmemalloc = pfmemalloc;
        refcount_set(&skb->users, 1);

        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        if (want_fclone) {
                struct sk_buff_fclones *fclones =
                        container_of(skb, struct sk_buff_fclones, skb1);

                /* Tag both halves of the pair and take the first reference. */
                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);

                fclones->skb2.fclone = SKB_FCLONE_CLONE;
        }

        skb_set_kcov_handle(skb, kcov_common_handle());

        return skb;
}
EXPORT_SYMBOL(__alloc_skb);

/*
 * Initialise @skb around the caller-provided buffer @data and return @skb.
 *
 * Caller must provide an SKB that is memset cleared: only the fields
 * assigned here are written.
 *
 * The buffer size is @frag_size when that is non-zero, otherwise
 * ksize(@data). In both cases the aligned size of struct skb_shared_info is
 * subtracted to give the usable data size.
 */
static struct sk_buff *__build_skb_around(struct sk_buff *skb,
                                          void *data, unsigned int frag_size)
{
        struct skb_shared_info *shinfo;
        unsigned int size;

        if (frag_size)
                size = frag_size;
        else
                size = ksize(data);
        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Buffer geometry: empty buffer, data starting at head. */
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb->end = skb->tail + size;
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;

        /* Relies on the caller having cleared every other field. */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);

        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());

        return skb;
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *head = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);

        if (likely(head)) {
                /* clear everything that precedes ->tail, as required by
                 * __build_skb_around()
                 */
                memset(head, 0, offsetof(struct sk_buff, tail));
                return __build_skb_around(head, data, frag_size);
        }

        /* allocation failed; @data is left untouched */
        return NULL;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *nskb;

        nskb = __build_skb(data, frag_size);

        /* nothing more to do on failure or when frag_size is 0 */
        if (!nskb || !frag_size)
                return nskb;

        nskb->head_frag = 1;
        if (page_is_pfmemalloc(virt_to_head_page(data)))
                nskb->pfmemalloc = 1;

        return nskb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        struct sk_buff *built;

        if (unlikely(!skb))
                return NULL;

        built = __build_skb_around(skb, data, frag_size);

        /* frag_size == 0: head was kmalloced, no page-fragment marking */
        if (!built || !frag_size)
                return built;

        built->head_frag = 1;
        if (page_is_pfmemalloc(virt_to_head_page(data)))
                built->pfmemalloc = 1;

        return built;
}
EXPORT_SYMBOL(build_skb_around);

/* Number of slots in the per-CPU skb_cache[] array below */
#define NAPI_SKB_CACHE_SIZE        64

/*
 * Per-CPU allocation state.
 * @page:      page fragment cache handed to page_frag_alloc() by
 *             __napi_alloc_frag(), __napi_alloc_skb() and the non-irq branch
 *             of __netdev_alloc_skb()
 * @skb_count: number of entries currently stored in @skb_cache
 * @skb_cache: sk_buff heads recorded by _kfree_skb_defer(); they are passed
 *             to kmem_cache_free_bulk() once the array is full or when
 *             __kfree_skb_flush() runs
 */
struct napi_alloc_cache {
        struct page_frag_cache page;
        unsigned int skb_count;
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

/* Fragment cache used by netdev_alloc_frag() and __netdev_alloc_skb() when
 * in_irq() or irqs_disabled() is true.
 */
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
/* Fragment cache plus deferred-free array; accessed via this_cpu_ptr() */
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Allocate @fragsz bytes from this CPU's napi_alloc_cache page fragments. */
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
        struct page_frag_cache *frags = this_cpu_ptr(&napi_alloc_cache.page);

        return page_frag_alloc(frags, fragsz, gfp_mask);
}

/* Allocate a fragment of SKB_DATA_ALIGN(@fragsz) bytes with GFP_ATOMIC. */
void *napi_alloc_frag(unsigned int fragsz)
{
        return __napi_alloc_frag(SKB_DATA_ALIGN(fragsz), GFP_ATOMIC);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
        void *frag;

        fragsz = SKB_DATA_ALIGN(fragsz);

        /* Outside hard irq context and with irqs enabled, use the NAPI
         * cache with bottom halves disabled around the allocation.
         */
        if (!in_irq() && !irqs_disabled()) {
                local_bh_disable();
                frag = __napi_alloc_frag(fragsz, GFP_ATOMIC);
                local_bh_enable();
                return frag;
        }

        /* otherwise allocate from the dedicated netdev_alloc_cache */
        return page_frag_alloc(this_cpu_ptr(&netdev_alloc_cache), fragsz,
                               GFP_ATOMIC);
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 *        __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @len: length to allocate
 *        @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has NET_SKB_PAD headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *frags;
        struct sk_buff *skb;
        bool reserve_page;
        void *head;

        len += NET_SKB_PAD;

        /* A length that is too small or too big for a page fragment, or a
         * mask carrying __GFP_DIRECT_RECLAIM or GFP_DMA bits, gets its
         * skb->head from kmalloc() through __alloc_skb().
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (skb)
                        goto reserve;
                return NULL;
        }

        /* page fragment head: add room for the shared info, then align */
        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        if (!in_irq() && !irqs_disabled()) {
                /* NAPI cache, with bottom halves disabled while it is used */
                local_bh_disable();
                frags = this_cpu_ptr(&napi_alloc_cache.page);
                head = page_frag_alloc(frags, len, gfp_mask);
                reserve_page = frags->pfmemalloc;
                local_bh_enable();
        } else {
                frags = this_cpu_ptr(&netdev_alloc_cache);
                head = page_frag_alloc(frags, len, gfp_mask);
                reserve_page = frags->pfmemalloc;
        }

        if (unlikely(!head))
                return NULL;

        skb = __build_skb(head, len);
        if (unlikely(!skb)) {
                /* no sk_buff to own the fragment: give it back */
                skb_free_frag(head);
                return NULL;
        }

        skb->head_frag = 1;
        if (reserve_page)
                skb->pfmemalloc = 1;

reserve:
        /* both allocation paths end with the built-in headroom and device */
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;

        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 *        __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *        @napi: napi instance this buffer was allocated for
 *        @len: length to allocate
 *        @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 *        Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *        attempt to allocate the head from a special reserved region used
 *        only for NAPI Rx allocation.  By doing this we can save several
 *        CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
                                 gfp_t gfp_mask)
{
        struct napi_alloc_cache *cache;
        struct sk_buff *skb;
        void *head;

        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* A length that is too small or too big for a page fragment, or a
         * mask carrying __GFP_DIRECT_RECLAIM or GFP_DMA bits, gets its
         * skb->head from kmalloc() through __alloc_skb().
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (skb)
                        goto reserve;
                return NULL;
        }

        cache = this_cpu_ptr(&napi_alloc_cache);

        /* page fragment head: add room for the shared info, then align */
        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        head = page_frag_alloc(&cache->page, len, gfp_mask);
        if (unlikely(!head))
                return NULL;

        skb = __build_skb(head, len);
        if (unlikely(!skb)) {
                /* no sk_buff to own the fragment: give it back */
                skb_free_frag(head);
                return NULL;
        }

        skb->head_frag = 1;
        if (cache->page.pfmemalloc)
                skb->pfmemalloc = 1;

reserve:
        /* both allocation paths end with the headroom and the NAPI device */
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;

        return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

/* Attach @page as fragment @i of @skb and account for @size bytes of data
 * and @truesize bytes of memory.
 */
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                     int size, unsigned int truesize)
{
        /* describe the fragment in slot @i first */
        skb_fill_page_desc(skb, i, page, off, size);

        /* then grow the memory charge and both length counters */
        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_add_rx_frag);

/* Grow the existing fragment @i of @skb by @size bytes and account for
 * @truesize additional bytes of memory.
 */
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        skb_frag_size_add(&skb_shinfo(skb)->frags[i], size);

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

/* Free the skb chain that *@listp points to and reset *@listp to NULL. */
static void skb_drop_list(struct sk_buff **listp)
{
        struct sk_buff *chain = *listp;

        kfree_skb_list(chain);
        *listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

/* Release skb->head with the routine that matches skb->head_frag. */
static void skb_free_head(struct sk_buff *skb)
{
        /* heads not flagged head_frag are released with kfree() */
        if (!skb->head_frag) {
                kfree(skb->head);
                return;
        }

        skb_free_frag(skb->head);
}

/* Release the data area of @skb: page fragments, frag list, zerocopy state
 * and the head itself. For a cloned skb this only happens once
 * atomic_sub_return() on dataref yields zero.
 */
static void skb_release_data(struct sk_buff *skb)
{
        struct skb_shared_info *info = skb_shinfo(skb);
        int frag;

        if (skb->cloned) {
                /* a nohdr skb also gives up its count in the upper half */
                int drop = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;

                if (atomic_sub_return(drop, &info->dataref))
                        return;
        }

        for (frag = 0; frag < info->nr_frags; frag++)
                __skb_frag_unref(&info->frags[frag]);

        if (info->frag_list)
                kfree_skb_list(info->frag_list);

        skb_zcopy_clear(skb, true);
        skb_free_head(skb);
}

/*
 *        Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        struct sk_buff_fclones *fclones;

        switch (skb->fclone) {
        case SKB_FCLONE_UNAVAILABLE:
                /* not part of an fclone pair: hand the head straight back
                 * to skbuff_head_cache
                 */
                kmem_cache_free(skbuff_head_cache, skb);
                return;

        case SKB_FCLONE_ORIG:
                /* @skb is the skb1 member of a struct sk_buff_fclones */
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before original skb
                 * This test would have no chance to be true for the clone,
                 * while here, branch prediction will be good.
                 */
                if (refcount_read(&fclones->fclone_ref) == 1)
                        goto fastpath;
                break;

        default: /* SKB_FCLONE_CLONE */
                /* @skb is the skb2 member of a struct sk_buff_fclones */
                fclones = container_of(skb, struct sk_buff_fclones, skb2);
                break;
        }
        /* drop one fclone_ref; the pair is freed only when it reaches zero */
        if (!refcount_dec_and_test(&fclones->fclone_ref))
                return;
fastpath:
        /* the whole pair goes back to skbuff_fclone_cache in one piece */
        kmem_cache_free(skbuff_fclone_cache, fclones);
}

/*
 * Release the state attached to the sk_buff head, in this order:
 * skb_dst_drop(), the destructor callback if one is set, nf_conntrack_put()
 * when CONFIG_NF_CONNTRACK is enabled, and finally skb_ext_put().
 */
void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);
        if (skb->destructor) {
                /* warn if a destructor is about to run while in_irq() */
                WARN_ON(in_irq());
                skb->destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
        skb_release_head_state(skb);

        /* without a head there is no data area to release */
        if (unlikely(!skb->head))
                return;

        skb_release_data(skb);
}

/**
 *        __kfree_skb - private function
 *        @skb: buffer
 *
 *        Free an sk_buff. Release anything attached to the buffer.
 *        Clean the state. This is an internal helper function. Users should
 *        always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
        /* first everything attached to the skb (head state and data) ... */
        skb_release_all(skb);
        /* ... then the sk_buff structure itself */
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *        kfree_skb - free an sk_buff
 *        @skb: buffer to free
 *
 *        Drop a reference to the buffer and free it if the usage count has
 *        hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
        /* free only when skb_unref() reports that the buffer must go */
        if (skb_unref(skb)) {
                trace_kfree_skb(skb, __builtin_return_address(0));
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(kfree_skb);

/* Call kfree_skb() on every skb of the ->next linked chain starting at @segs. */
void kfree_skb_list(struct sk_buff *segs)
{
        struct sk_buff *following;

        for (; segs; segs = following) {
                /* read the link before the skb may be freed */
                following = segs->next;
                kfree_skb(segs);
        }
}
EXPORT_SYMBOL(kfree_skb_list);

/* Dump skb information and contents.
 *
 * @level:    printk log level string; it prefixes every printk() of the dump
 *            and is passed on to print_hex_dump()
 * @skb:      buffer to describe
 * @full_pkt: dump whole packets (including headroom, tailroom and the skbs
 *            on the frag list) if true, only headers otherwise
 *
 * Must only be called from net_ratelimit()-ed paths.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        struct net_device *dev = skb->dev;
        struct sock *sk = skb->sk;
        struct sk_buff *list_skb;
        bool has_mac, has_trans;
        int headroom, tailroom;
        int i, len, seg_len;

        /* number of payload bytes that will be hex dumped */
        if (full_pkt)
                len = skb->len;
        else
                len = min_t(int, skb->len, MAX_HEADER + 128);

        headroom = skb_headroom(skb);
        tailroom = skb_tailroom(skb);

        has_mac = skb_mac_header_was_set(skb);
        has_trans = skb_transport_header_was_set(skb);

        /* one-shot summary; header offsets that were never set print as -1 */
        printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
               "mac=(%d,%d) net=(%d,%d) trans=%d\n"
               "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
               "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
               "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
               level, skb->len, headroom, skb_headlen(skb), tailroom,
               has_mac ? skb->mac_header : -1,
               has_mac ? skb_mac_header_len(skb) : -1,
               skb->network_header,
               has_trans ? skb_network_header_len(skb) : -1,
               has_trans ? skb->transport_header : -1,
               sh->tx_flags, sh->nr_frags,
               sh->gso_size, sh->gso_type, sh->gso_segs,
               skb->csum, skb->ip_summed, skb->csum_complete_sw,
               skb->csum_valid, skb->csum_level,
               skb->hash, skb->sw_hash, skb->l4_hash,
               ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);

        if (dev)
                printk("%sdev name=%s feat=%pNF\n",
                       level, dev->name, &dev->features);
        if (sk)
                printk("%ssk family=%hu type=%u proto=%u\n",
                       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

        if (full_pkt && headroom)
                print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->head, headroom, false);

        /* linear part, capped at the remaining dump budget */
        seg_len = min_t(int, skb_headlen(skb), len);
        if (seg_len)
                print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->data, seg_len, false);
        len -= seg_len;

        if (full_pkt && tailroom)
                print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb_tail_pointer(skb), tailroom, false);

        /* page fragments, mapped one page at a time until len is used up */
        for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(frag, skb_frag_off(frag),
                                      skb_frag_size(frag), p, p_off, p_len,
                                      copied) {
                        seg_len = min_t(int, p_len, len);
                        vaddr = kmap_atomic(p);
                        print_hex_dump(level, "skb frag:     ",
                                       DUMP_PREFIX_OFFSET,
                                       16, 1, vaddr + p_off, seg_len, false);
                        kunmap_atomic(vaddr);
                        len -= seg_len;
                        if (!len)
                                break;
                }
        }

        if (full_pkt && skb_has_frag_list(skb)) {
                /* carry the caller's log level like every other line here */
                printk("%sskb fraglist:\n", level);
                skb_walk_frags(skb, list_skb)
                        skb_dump(level, list_skb, true);
        }
}
EXPORT_SYMBOL(skb_dump);

/**
 *        skb_tx_error - report an sk_buff xmit error
 *        @skb: buffer that triggered an error
 *
 *        Report xmit error if a device callback is tracking this skb.
 *        skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
        /* the whole report consists of clearing the zerocopy state of @skb;
         * the second argument to skb_zcopy_clear() is passed as true
         */
        skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 *        consume_skb - free an skbuff
 *        @skb: buffer to free
 *
 *        Drop a ref to the buffer and free it if the usage count has hit zero
 *        Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *        is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
        /* free only when skb_unref() reports that the buffer must go */
        if (skb_unref(skb)) {
                trace_consume_skb(skb);
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 *        __consume_stateless_skb - free an skbuff, assuming it is stateless
 *        @skb: buffer to free
 *
 *        Like consume_skb(), but this variant assumes that this is the last
 *        skb reference and that all the head states have already been dropped.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
        trace_consume_skb(skb);
        /* no skb_unref() and no skb_release_head_state() here: only the
         * data area and the sk_buff memory are released
         */
        skb_release_data(skb);
        kfree_skbmem(skb);
}

/* Bulk free every sk_buff head recorded in this CPU's skb_cache[]. */
void __kfree_skb_flush(void)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);

        /* nothing has been recorded on this CPU */
        if (!cache->skb_count)
                return;

        kmem_cache_free_bulk(skbuff_head_cache, cache->skb_count,
                             cache->skb_cache);
        cache->skb_count = 0;
}

/*
 * Release everything attached to @skb right away, but only record the
 * sk_buff head in this CPU's skb_cache[]; the heads are returned to
 * skbuff_head_cache in bulk once NAPI_SKB_CACHE_SIZE of them are stored.
 */
static inline void _kfree_skb_defer(struct sk_buff *skb)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

        /* drop skb->head and call any destructors for packet */
        skb_release_all(skb);

        /* record skb to CPU local list */
        nc->skb_cache[nc->skb_count++] = skb;

#ifdef CONFIG_SLUB
        /* SLUB writes into objects when freeing */
        prefetchw(skb);
#endif

        /* flush skb_cache if it is filled */
        if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
                kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
                                     nc->skb_cache);
                nc->skb_count = 0;
        }
}
/* Non-static entry point that forwards to the inline _kfree_skb_defer(). */
void __kfree_skb_defer(struct sk_buff *skb)
{
        _kfree_skb_defer(skb);
}

/* Consume @skb on behalf of a caller that passes its NAPI @budget. */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
        /* A zero budget means a non-NAPI context called us, like netpoll */
        if (unlikely(!budget)) {
                dev_consume_skb_any(skb);
                return;
        }

        if (!skb_unref(skb))
                return;

        /* from this point on the skb is going to be freed */
        trace_consume_skb(skb);

        /* only skbs with fclone == SKB_FCLONE_UNAVAILABLE take the deferred
         * path; anything else is freed immediately
         */
        if (skb->fclone == SKB_FCLONE_UNAVAILABLE)
                _kfree_skb_defer(skb);
        else
                __kfree_skb(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Build-time check that @field lies inside the headers_start/headers_end
 * section of struct sk_buff. Wrapped in do { } while (0) so that the two
 * checks expand to a single statement, e.g. as the body of an unbraced if.
 */
#define CHECK_SKB_FIELD(field)                                          \
        do {                                                            \
                BUILD_BUG_ON(offsetof(struct sk_buff, field) <          \
                             offsetof(struct sk_buff, headers_start));  \
                BUILD_BUG_ON(offsetof(struct sk_buff, field) >          \
                             offsetof(struct sk_buff, headers_end));    \
        } while (0)

/*
 * Copy the header state of @old into @new: timestamp, device, control
 * block, dst, extensions, netfilter state, queue mapping, and the whole
 * headers_start..headers_end range of struct sk_buff in a single memcpy().
 */
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
        new->tstamp                = old->tstamp;
        /* We do not copy old->sk */
        new->dev                = old->dev;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        skb_dst_copy(new, old);
        __skb_ext_copy(new, old);
        __nf_copy(new, old, false);

        /* Note : this field could be in headers_start/headers_end section
         * It is not yet because we do not want to have a 16 bit hole
         */
        new->queue_mapping = old->queue_mapping;

        /* bulk copy of everything between the two markers */
        memcpy(&new->headers_start, &old->headers_start,
               offsetof(struct sk_buff, headers_end) -
               offsetof(struct sk_buff, headers_start));
        /* build-time proof that each field below is covered by that copy */
        CHECK_SKB_FIELD(protocol);
        CHECK_SKB_FIELD(csum);
        CHECK_SKB_FIELD(hash);
        CHECK_SKB_FIELD(priority);
        CHECK_SKB_FIELD(skb_iif);
        CHECK_SKB_FIELD(vlan_proto);
        CHECK_SKB_FIELD(vlan_tci);
        CHECK_SKB_FIELD(transport_header);
        CHECK_SKB_FIELD(network_header);
        CHECK_SKB_FIELD(mac_header);
        CHECK_SKB_FIELD(inner_protocol);
        CHECK_SKB_FIELD(inner_transport_header);
        CHECK_SKB_FIELD(inner_network_header);
        CHECK_SKB_FIELD(inner_mac_header);
        CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
        CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
        CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
        CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
        CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
/* C(x): copy member x verbatim from @skb into @n */
#define C(x) n->x = skb->x

        /* the clone starts unlinked and without an owning socket */
        n->next = n->prev = NULL;
        n->sk = NULL;
        __copy_skb_header(n, skb);

        C(len);
        C(data_len);
        C(mac_len);
        /* for a nohdr source the clone's hdr_len is the source's headroom */
        n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
        n->cloned = 1;
        n->nohdr = 0;
        n->peeked = 0;
        C(pfmemalloc);
        n->destructor = NULL;
        /* both skbs point at the same head/data area from here on */
        C(tail);
        C(end);
        C(head);
        C(head_frag);
        C(data);
        C(truesize);
        refcount_set(&n->users, 1);

        /* one more user of the shared data; the source is now cloned too */
        atomic_inc(&(skb_shinfo(skb)->dataref));
        skb->cloned = 1;

        return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
        struct sk_buff *wrap = alloc_skb(0, GFP_ATOMIC);

        if (!wrap)
                return NULL;

        /* len and data_len both mirror first->len; @first becomes the
         * frag list of the wrapper
         */
        wrap->truesize = first->truesize;
        wrap->data_len = first->len;
        wrap->len = first->len;
        skb_shinfo(wrap)->frag_list = first;

        __copy_skb_header(wrap, first);
        wrap->destructor = NULL;

        return wrap;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 *        skb_morph        -        morph one skb into another
 *        @dst: the skb to receive the contents
 *        @src: the skb to supply the contents
 *
 *        This is identical to skb_clone except that the target skb is
 *        supplied by the user.
 *
 *        The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
        /* release whatever @dst currently holds before it is overwritten */
        skb_release_all(dst);
        return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/*
 * Charge the pages needed for @size bytes to the locked_vm counter of a
 * user and remember the charge in @mmp.
 *
 * Returns 0 on success, also when nothing is charged because the caller has
 * CAP_IPC_LOCK or @size is 0, and -ENOBUFS when the charge would exceed
 * the RLIMIT_MEMLOCK limit.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
        unsigned long limit, want, cur, updated;
        struct user_struct *owner;

        if (capable(CAP_IPC_LOCK) || !size)
                return 0;

        want = (size >> PAGE_SHIFT) + 2;        /* worst case */
        limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        /* keep charging the user already recorded in @mmp, if any */
        if (mmp->user)
                owner = mmp->user;
        else
                owner = current_user();

        /* retry until the cmpxchg sees the value that was read */
        for (;;) {
                cur = atomic_long_read(&owner->locked_vm);
                updated = cur + want;
                if (updated > limit)
                        return -ENOBUFS;
                if (atomic_long_cmpxchg(&owner->locked_vm, cur, updated) ==
                    cur)
                        break;
        }

        if (mmp->user) {
                mmp->num_pg += want;
        } else {
                /* first charge: take a reference on the user */
                mmp->user = get_uid(owner);
                mmp->num_pg = want;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
        if (mmp->user) {
                atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
                free_uid(mmp->user);
        }
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
        struct ubuf_info *uarg;
        struct sk_buff *skb;

        WARN_ON_ONCE(!in_task());

        skb = sock_omalloc(sk, 0, GFP_KERNEL);
        if (!skb)
                return NULL;

        BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
        uarg = (void *)skb->cb;
        uarg->mmp.user = NULL;

        if (mm_account_pinned_pages(&uarg->mmp, size)) {
                kfree_skb(skb);
                return NULL;
        }

        uarg->callback = sock_zerocopy_callback;
        uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
        uarg->len = 1;
        uarg->bytelen = size;
        uarg->zerocopy = 1;
        refcount_set(&uarg->refcnt, 1);
        sock_hold(sk);

        return uarg;
}
EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);

/* Map a ubuf_info that is stored in skb->cb (see sock_zerocopy_alloc())
 * back to the sk_buff that contains it.
 */
static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
{
        return container_of((void *)uarg, struct sk_buff, cb);
}

/*
 * Extend @uarg by one more send of @size bytes when that is possible,
 * otherwise fall back to sock_zerocopy_alloc().
 *
 * Returns the ubuf_info to use, or NULL on failure. With a NULL @uarg this
 * is the same as calling sock_zerocopy_alloc(@sk, @size).
 */
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
                                        struct ubuf_info *uarg)
{
        if (uarg) {
                const u32 byte_limit = 1 << 19;                /* limit to a few TSO */
                u32 bytelen, next;

                /* realloc only when socket is locked (TCP, UDP cork),
                 * so uarg->len and sk_zckey access is serialized
                 */
                if (!sock_owned_by_user(sk)) {
                        WARN_ON_ONCE(1);
                        return NULL;
                }

                bytelen = uarg->bytelen + size;
                /* @uarg is full: either its len or its byte count hit the cap */
                if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
                        /* TCP can create new skb to attach new uarg */
                        if (sk->sk_type == SOCK_STREAM)
                                goto new_alloc;
                        return NULL;
                }

                next = (u32)atomic_read(&sk->sk_zckey);
                /* extend only if @uarg covers the ids right up to sk_zckey */
                if ((u32)(uarg->id + uarg->len) == next) {
                        if (mm_account_pinned_pages(&uarg->mmp, size))
                                return NULL;
                        uarg->len++;
                        uarg->bytelen = bytelen;
                        atomic_set(&sk->sk_zckey, ++next);

                        /* no extra ref when appending to datagram (MSG_MORE) */
                        if (sk->sk_type == SOCK_STREAM)
                                sock_zerocopy_get(uarg);

                        return uarg;
                }
        }

new_alloc:
        return sock_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);

/*
 * Try to append the id range that starts at @lo and spans @len ids to the
 * notification already stored in @skb (ee_info..ee_data). Returns true and
 * advances ee_data on success, false when the ranges cannot be merged.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        u32 first = serr->ee.ee_info;
        u32 last = serr->ee.ee_data;
        u64 merged = last - first + 1ULL + len;

        /* refuse when the merged length no longer fits in 32 bits, or when
         * the new range does not start right after the stored one
         */
        if (merged >= (1ULL << 32) || lo != last + 1)
                return false;

        serr->ee.ee_data += len;
        return true;
}

/*
 * Completion handler installed by sock_zerocopy_alloc(). Undoes the
 * pinned-page accounting of @uarg and, unless the notification is skipped,
 * reports the id range [uarg->id, uarg->id + uarg->len - 1] through
 * sk->sk_error_queue. A false @success sets SO_EE_CODE_ZEROCOPY_COPIED in
 * the report. Ends by consuming the skb that holds @uarg (unless it was
 * queued) and dropping a socket reference with sock_put().
 */
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
        struct sk_buff *tail, *skb = skb_from_uarg(uarg);
        struct sock_exterr_skb *serr;
        struct sock *sk = skb->sk;
        struct sk_buff_head *q;
        unsigned long flags;
        u32 lo, hi;
        u16 len;

        mm_unaccount_pinned_pages(&uarg->mmp);

        /* if !len, there was only 1 call, and it was aborted
         * so do not queue a completion notification
         */
        if (!uarg->len || sock_flag(sk, SOCK_DEAD))
                goto release;

        len = uarg->len;
        lo = uarg->id;
        hi = uarg->id + len - 1;

        /* build the notification in the control block of the skb; this
         * memset is applied to the SKB_EXT_ERR() view of the skb, so uarg
         * fields are not read again below
         */
        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = 0;
        serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
        serr->ee.ee_data = hi;
        serr->ee.ee_info = lo;
        if (!success)
                serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

        q = &sk->sk_error_queue;
        spin_lock_irqsave(&q->lock, flags);
        tail = skb_peek_tail(q);
        /* queue a new entry unless the tail entry is a zerocopy
         * notification that could be extended to cover this range
         */
        if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
            !skb_zerocopy_notify_extend(tail, lo, len)) {
                __skb_queue_tail(q, skb);
                /* the queue owns the skb now; nothing left to consume */
                skb = NULL;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        sk->sk_error_report(sk);

release:
        /* skb is NULL here when it was queued above */
        consume_skb(skb);
        sock_put(sk);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_callback);

/*
 * Drop one reference on @uarg (NULL is accepted). When the last reference
 * goes, run the completion callback, or consume the backing skb if no
 * callback is set.
 */
void sock_zerocopy_put(struct ubuf_info *uarg)
{
        if (!uarg || !refcount_dec_and_test(&uarg->refcnt))
                return;

        if (!uarg->callback) {
                consume_skb(skb_from_uarg(uarg));
                return;
        }

        uarg->callback(uarg, uarg->zerocopy);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put);

/*
 * Abort the most recent send accounted on @uarg (NULL is accepted):
 * decrement sk_zckey and uarg->len, then drop a reference when the caller
 * holds one (@have_uref).
 */
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        struct sock *sk;

        if (!uarg)
                return;

        sk = skb_from_uarg(uarg)->sk;

        atomic_dec(&sk->sk_zckey);
        uarg->len--;

        if (have_uref)
                sock_zerocopy_put(uarg);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);

/* Datagram variant: forwards to __zerocopy_sg_from_iter() with the socket
 * taken from skb->sk and returns its result unchanged.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
        return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);

/*
 * Stream variant of the zerocopy append: adds up to @len bytes from @msg to
 * @skb and attaches @uarg to it.
 *
 * Returns the number of bytes by which skb->len grew, -EEXIST when @skb
 * already carries a different uarg, or the error from
 * __zerocopy_sg_from_iter() after @skb and the iterator have been put back
 * to their previous state.
 */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg)
{
        struct ubuf_info *orig_uarg = skb_zcopy(skb);
        /* snapshots used to roll back on failure */
        struct iov_iter orig_iter = msg->msg_iter;
        int err, orig_len = skb->len;

        /* An skb can only point to one uarg. This edge case happens when
         * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
         */
        if (orig_uarg && uarg != orig_uarg)
                return -EEXIST;

        err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
        /* -EMSGSIZE only counts as a failure when nothing was appended */
        if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
                struct sock *save_sk = skb->sk;

                /* Streams do not free skb on error. Reset to prev state. */
                msg->msg_iter = orig_iter;
                /* skb->sk is set to @sk only for the duration of the trim */
                skb->sk = sk;
                ___pskb_trim(skb, orig_len);
                skb->sk = save_sk;
                return err;
        }

        skb_zcopy_set(skb, uarg, NULL);
        return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

/*
 * Make @nskb reference the same zerocopy uarg as @orig.
 *
 * Nothing is done when @orig has no uarg.  When @nskb already has one:
 * a zero @gfp_mask triggers a one-time warning and -ENOMEM; an identical
 * uarg needs no work; otherwise @nskb's user frags are first copied with
 * skb_copy_ubufs() (-EIO if that fails).  Returns 0 on success.
 */
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
                              gfp_t gfp_mask)
{
        if (!skb_zcopy(orig))
                return 0;

        if (skb_zcopy(nskb)) {
                /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
                if (!gfp_mask) {
                        WARN_ON_ONCE(1);
                        return -ENOMEM;
                }

                /* Already sharing the very same uarg: nothing to set. */
                if (skb_uarg(orig) == skb_uarg(nskb))
                        return 0;

                if (skb_copy_ubufs(nskb, GFP_ATOMIC))
                        return -EIO;
        }

        skb_zcopy_set(nskb, skb_uarg(orig), NULL);
        return 0;
}

/**
 *        skb_copy_ubufs        -        copy userspace skb frags buffers to kernel
 *        @skb: the skb to modify
 *        @gfp_mask: allocation priority
 *
 *        This must be called on SKBTX_DEV_ZEROCOPY skb.
 *        It will copy all frags into kernel and drop the reference
 *        to userspace pages.
 *
 *        If this function is called from an interrupt @gfp_mask must be
 *        %GFP_ATOMIC.
 *
 *        Returns 0 on success, -EINVAL if @skb is shared or cannot be
 *        uncloned, or -ENOMEM on failure to allocate kernel memory to
 *        copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
        int num_frags = skb_shinfo(skb)->nr_frags;
        struct page *page, *head = NULL;
        int i, new_frags;
        u32 d_off;

        if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
                return -EINVAL;

        /* No page frags: nothing to copy, only the zerocopy state is cleared. */
        if (!num_frags)
                goto release;

        /* Number of whole pages needed to hold __skb_pagelen(skb) bytes. */
        new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;

        /* Allocate all destination pages up front, chaining them into a
         * singly linked list through page_private; @head is the list head.
         */
        for (i = 0; i < new_frags; i++) {
                page = alloc_page(gfp_mask);
                if (!page) {
                        /* Unwind: release every page allocated so far. */
                        while (head) {
                                struct page *next = (struct page *)page_private(head);
                                put_page(head);
                                head = next;
                        }
                        return -ENOMEM;
                }
                set_page_private(page, (unsigned long)head);
                head = page;
        }

        /* Copy the frag contents back to back into the page chain.
         * @page is the current destination page, @d_off the write offset
         * inside it.
         */
        page = head;
        d_off = 0;
        for (i = 0; i < num_frags; i++) {
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
                                      p, p_off, p_len, copied) {
                        u32 copy, done = 0;
                        vaddr = kmap_atomic(p);

                        while (done < p_len) {
                                /* Destination page full: move to the next one. */
                                if (d_off == PAGE_SIZE) {
                                        d_off = 0;
                                        page = (struct page *)page_private(page);
                                }
                                copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
                                memcpy(page_address(page) + d_off,
                                       vaddr + p_off + done, copy);
                                done += copy;
                                d_off += copy;
                        }
                        kunmap_atomic(vaddr);
                }
        }

        /* skb frags release userspace buffers */
        for (i = 0; i < num_frags; i++)
                skb_frag_unref(skb, i);

        /* skb frags point to kernel buffers */
        for (i = 0; i < new_frags - 1; i++) {
                __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
                head = (struct page *)page_private(head);
        }
        /* The last page holds only the @d_off bytes written into it. */
        __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
        skb_shinfo(skb)->nr_frags = new_frags;

release:
        /* NOTE(review): the 'false' argument presumably reports that the
         * data was copied rather than sent zerocopy - confirm against
         * skb_zcopy_clear().
         */
        skb_zcopy_clear(skb, false);
        return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *        skb_clone        -        duplicate an sk_buff
 *        @skb: buffer to clone
 *        @gfp_mask: allocation priority
 *
 *        Create a second &sk_buff head for the same packet data. The clone
 *        shares the data with @skb but has its own structure, is not owned
 *        by a socket and starts with a reference count of 1. %NULL is
 *        returned if orphaning the frags or allocating the head fails.
 *
 *        If this function is called from an interrupt @gfp_mask must be
 *        %GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff_fclones *pair;
        struct sk_buff *clone;

        if (skb_orphan_frags(skb, gfp_mask))
                return NULL;

        pair = container_of(skb, struct sk_buff_fclones, skb1);

        if (skb->fclone != SKB_FCLONE_ORIG ||
            refcount_read(&pair->fclone_ref) != 1) {
                /* No free companion skb: take a head from the slab cache. */
                if (skb_pfmemalloc(skb))
                        gfp_mask |= __GFP_MEMALLOC;

                clone = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
                if (!clone)
                        return NULL;

                clone->fclone = SKB_FCLONE_UNAVAILABLE;
        } else {
                /* Use the companion skb that lives next to @skb. */
                clone = &pair->skb2;
                refcount_set(&pair->fclone_ref, 2);
        }

        return __skb_clone(clone, skb);
}
EXPORT_SYMBOL(skb_clone);

/*
 * Shift every header offset stored in @skb by @off bytes.  The MAC header
 * is only touched when it has been set, and csum_start only when the skb
 * is CHECKSUM_PARTIAL.
 */
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
        /* Outer headers. */
        if (skb_mac_header_was_set(skb))
                skb->mac_header += off;
        skb->network_header += off;
        skb->transport_header += off;

        /* Inner (encapsulated) headers. */
        skb->inner_mac_header += off;
        skb->inner_network_header += off;
        skb->inner_transport_header += off;

        /* Only adjust this if it actually is csum_start rather than csum */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                skb->csum_start += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

/*
 * Copy header state from @old to @new: everything __copy_skb_header()
 * handles, followed by the three GSO fields kept in the shared info
 * (gso_type, gso_segs and gso_size).
 */
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
        __copy_skb_header(new, old);

        /* GSO parameters live in skb_shared_info and are copied by hand. */
        skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
        skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
        skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
}
EXPORT_SYMBOL(skb_copy_header);

/*
 * Allocation flag to use for a copy of @skb: SKB_ALLOC_RX when
 * skb_pfmemalloc() reports true for it, 0 otherwise.
 */
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
        return skb_pfmemalloc(skb) ? SKB_ALLOC_RX : 0;
}

/**
 *        skb_copy        -        create private copy of an sk_buff
 *        @skb: buffer to copy
 *        @gfp_mask: allocation priority
 *
 *        Duplicate both the &sk_buff and all of its data, for callers that
 *        want to modify the data and therefore need a private copy.
 *        Returns the new buffer, which has a reference count of 1, or
 *        %NULL on failure (including, with a one-time warning, when @skb
 *        is a SKB_GSO_FRAGLIST buffer).
 *
 *        As a by-product the copy of a non-linear &sk_buff is linear, so
 *        the caller may modify every byte of the returned buffer. For the
 *        same reason this function is not recommended when only the header
 *        is going to be modified; use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
        unsigned int alloc_size;
        struct sk_buff *copy;
        int headroom;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        headroom = skb_headroom(skb);
        alloc_size = skb_end_offset(skb) + skb->data_len;

        copy = __alloc_skb(alloc_size, gfp_mask,
                           skb_alloc_rx_flag(skb), NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Same headroom as the original, then room for all the data. */
        skb_reserve(copy, headroom);
        skb_put(copy, skb->len);

        /* Copy headroom and data in one go, starting at the old head. */
        BUG_ON(skb_copy_bits(skb, -headroom, copy->head, headroom + skb->len));

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(skb_copy);

/**
 *        __pskb_copy_fclone        -  create copy of an sk_buff with private head.
 *        @skb: buffer to copy
 *        @headroom: headroom of new skb
 *        @gfp_mask: allocation priority
 *        @fclone: if true allocate the copy of the skb from the fclone
 *        cache instead of the head cache; it is recommended to set this
 *        to true for the cases where the copy will likely be cloned
 *
 *        Duplicate the &sk_buff and the linear (header) part of its data;
 *        page frags and the frag list stay shared with @skb, each with an
 *        extra reference. Intended for callers that only modify the header
 *        and need a private copy of it. Returns %NULL on failure or the
 *        pointer to the new buffer, which has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone)
{
        int flags = skb_alloc_rx_flag(skb);
        struct sk_buff *n;
        int i;

        if (fclone)
                flags |= SKB_ALLOC_FCLONE;

        n = __alloc_skb(skb_headlen(skb) + headroom, gfp_mask, flags,
                        NUMA_NO_NODE);
        if (!n)
                return NULL;

        /* Lay out the linear area and copy the header bytes into it. */
        skb_reserve(n, headroom);
        skb_put(n, skb_headlen(skb));
        skb_copy_from_linear_data(skb, n->data, n->len);

        /* From here on the copy also accounts for the shared paged data. */
        n->len = skb->len;
        n->data_len = skb->data_len;
        n->truesize += skb->data_len;

        if (skb_shinfo(skb)->nr_frags) {
                if (skb_orphan_frags(skb, gfp_mask) ||
                    skb_zerocopy_clone(n, skb, gfp_mask)) {
                        kfree_skb(n);
                        return NULL;
                }

                /* Share every page frag, taking one reference per frag. */
                i = 0;
                while (i < skb_shinfo(skb)->nr_frags) {
                        skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
                        skb_frag_ref(skb, i);
                        i++;
                }
                skb_shinfo(n)->nr_frags = i;
                skb_shinfo(n)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
        }

        if (skb_has_frag_list(skb)) {
                skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
                skb_clone_fraglist(n);
        }

        skb_copy_header(n, skb);
        return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 *        pskb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @nhead: room to add at head
 *        @ntail: room to add at tail
 *        @gfp_mask: allocation priority
 *
 *        Expands (or creates identical copy, if @nhead and @ntail are zero)
 *        header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 *        reference count of 1. Returns zero on success or -ENOMEM if the
 *        expansion failed. In the latter case, &sk_buff is not changed.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                     gfp_t gfp_mask)
{
        int i, osize = skb_end_offset(skb);
        int size = osize + nhead + ntail;
        long off;
        u8 *data;

        BUG_ON(nhead < 0);

        BUG_ON(skb_shared(skb));

        size = SKB_DATA_ALIGN(size);

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;
        /* The new buffer holds the data area followed by the shared info. */
        data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
                               gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                goto nodata;
        /* Recompute the usable data size from what was actually allocated. */
        size = SKB_WITH_OVERHEAD(ksize(data));

        /* Copy only real data... and, alas, header. This should be
         * optimized for the cases when header is void.
         */
        memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

        /* Copy the shared info up to and including the frags in use. */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

        /*
         * if shinfo is shared we must drop the old head gracefully, but if it
         * is not we can just drop the old head and let the existing refcount
         * be since all we did is relocate the values
         */
        if (skb_cloned(skb)) {
                if (skb_orphan_frags(skb, gfp_mask))
                        goto nofrags;
                /* The new shinfo copy needs its own references on the uarg,
                 * the page frags and the frag list before the old data is
                 * released.
                 */
                if (skb_zcopy(skb))
                        refcount_inc(&skb_uarg(skb)->refcnt);
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);

                skb_release_data(skb);
        } else {
                skb_free_head(skb);
        }
        /* Distance by which pointers into the old head have to move. */
        off = (data + nhead) - skb->head;

        skb->head     = data;
        skb->head_frag = 0;
        skb->data    += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        /* end/tail are offsets from skb->head here, so tail shifts by nhead. */
        skb->end      = size;
        off           = nhead;
#else
        skb->end      = skb->head + size;
#endif
        skb->tail              += off;
        skb_headers_offset_update(skb, nhead);
        /* The skb now owns a private, unshared head. */
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        skb_metadata_clear(skb);

        /* It is not generally safe to change skb->truesize.
         * For the moment, we really care of rx path, or
         * when skb is orphaned (not attached to a socket).
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb->truesize += size - osize;

        return 0;

nofrags:
        kfree(data);
nodata:
        return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/*
 * Return a private copy of @skb with a writable head and at least
 * @headroom bytes of headroom, or NULL on failure.  If @skb already has
 * enough headroom a pskb_copy() is returned; otherwise @skb is cloned and
 * the clone's head is expanded by the (aligned) shortfall.
 */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
        int shortfall = headroom - skb_headroom(skb);
        struct sk_buff *copy;

        if (shortfall <= 0)
                return pskb_copy(skb, GFP_ATOMIC);

        copy = skb_clone(skb, GFP_ATOMIC);
        if (!copy)
                return NULL;

        if (pskb_expand_head(copy, SKB_DATA_ALIGN(shortfall), 0,
                             GFP_ATOMIC)) {
                kfree_skb(copy);
                return NULL;
        }

        return copy;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/**
 *        skb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @headroom: needed headroom
 *
 *        Unlike skb_realloc_headroom(), this function avoids allocating a
 *        new skb where possible. When a clone is needed, skb->sk is carried
 *        over to it and the original is consumed. On failure the skb is
 *        freed and %NULL is returned.
 *
 *        It expects an increase in headroom and warns (returning @skb
 *        unchanged) otherwise.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
        struct sock *sk = skb->sk;
        int old_end = skb_end_offset(skb);
        int grow = headroom - skb_headroom(skb);

        if (WARN_ONCE(grow <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;

        grow = SKB_DATA_ALIGN(grow);

        /* pskb_expand_head() might crash, if skb is shared. */
        if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

                if (unlikely(!nskb))
                        goto drop;

                if (sk)
                        skb_set_owner_w(nskb, sk);
                consume_skb(skb);
                skb = nskb;
        }

        if (pskb_expand_head(skb, grow, 0, GFP_ATOMIC))
                goto drop;

        /* Charge the socket for however much the head actually grew. */
        if (sk && is_skb_wmem(skb)) {
                int added = skb_end_offset(skb) - old_end;

                refcount_add(added, &sk->sk_wmem_alloc);
                skb->truesize += added;
        }
        return skb;

drop:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 *        skb_copy_expand        -        copy and expand sk_buff
 *        @skb: buffer to copy
 *        @newheadroom: new free bytes at head
 *        @newtailroom: new free bytes at tail
 *        @gfp_mask: allocation priority
 *
 *        Duplicate both the &sk_buff and its data into a buffer that has
 *        @newheadroom bytes free in front of the data and @newtailroom
 *        bytes free behind it.
 *
 *        Intended for callers that want to modify the data and also need
 *        more space for new fields. Returns %NULL on failure (including,
 *        with a one-time warning, for SKB_GSO_FRAGLIST buffers) or the
 *        pointer to the new buffer, which has a reference count of 1.
 *
 *        You must pass %GFP_ATOMIC as the allocation priority if this function
 *        is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                                int newheadroom, int newtailroom,
                                gfp_t gfp_mask)
{
        int head_copy_len, head_copy_off;
        struct sk_buff *n;
        int oldheadroom;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        oldheadroom = skb_headroom(skb);

        /* Allocate the copy buffer. */
        n = __alloc_skb(newheadroom + skb->len + newtailroom,
                        gfp_mask, skb_alloc_rx_flag(skb),
                        NUMA_NO_NODE);
        if (!n)
                return NULL;

        skb_reserve(n, newheadroom);

        /* Set the tail pointer and length */
        skb_put(n, skb->len);

        /* Decide how much of the old headroom is carried over and where
         * it lands in the new head.
         */
        if (newheadroom <= oldheadroom) {
                /* Only the part of the old headroom that still fits. */
                head_copy_len = newheadroom;
                head_copy_off = 0;
        } else {
                /* All of it, placed right before the data. */
                head_copy_len = oldheadroom;
                head_copy_off = newheadroom - oldheadroom;
        }

        /* Copy the linear header and data. */
        BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
                             skb->len + head_copy_len));

        skb_copy_header(n, skb);

        skb_headers_offset_update(n, newheadroom - oldheadroom);

        return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *        __skb_pad                -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *        @free_on_error: free buffer on error
 *
 *        Make sure the buffer is followed by @pad bytes of zero-filled
 *        padding. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        Returns 0 on success. May return an error in out of memory cases;
 *        the skb is freed on error if @free_on_error is true.
 */

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
        int extra_tail;
        int err = 0;

        /* If the skbuff is non linear tailroom is always zero.. */
        if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
                memset(skb->data + skb->len, 0, pad);
                return 0;
        }

        extra_tail = skb->data_len + pad - (skb->end - skb->tail);
        if (likely(skb_cloned(skb) || extra_tail > 0))
                err = pskb_expand_head(skb, 0, extra_tail, GFP_ATOMIC);

        /* FIXME: The use of this function with non-linear skb's really needs
         * to be audited.
         */
        if (likely(!err))
                err = skb_linearize(skb);

        if (unlikely(err)) {
                if (free_on_error)
                        kfree_skb(skb);
                return err;
        }

        memset(skb->data + skb->len, 0, pad);
        return 0;
}
EXPORT_SYMBOL(__skb_pad);

/**
 *        pskb_put - add data to the tail of a potentially fragmented buffer
 *        @skb: start of the buffer to use
 *        @tail: tail fragment of the buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of a potentially fragmented buffer by
 *        @len bytes. @tail must be the last fragment of @skb -- or @skb
 *        itself. When they differ, @skb's len and data_len are grown as
 *        well. The actual extension is done by skb_put() on @tail, so
 *        exceeding the buffer size makes the kernel panic. A pointer to
 *        the first byte of the extra data is returned.
 */

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
        /* The head skb accounts for bytes that live in a separate tail. */
        if (skb != tail) {
                skb->len += len;
                skb->data_len += len;
        }

        return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 *        skb_put - add data to a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of the buffer by @len bytes at the tail.
 *        If the new tail would lie beyond the end of the buffer the kernel
 *        will panic. A pointer to the first byte of the extra data (the old
 *        tail) is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
        void *old_tail;

        SKB_LINEAR_ASSERT(skb);

        old_tail = skb_tail_pointer(skb);
        skb->len += len;
        skb->tail += len;

        if (unlikely(skb->tail > skb->end))
                skb_over_panic(skb, len, __builtin_return_address(0));

        return old_tail;
}
EXPORT_SYMBOL(skb_put);

/**
 *        skb_push - add data to the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of the buffer by @len bytes at the
 *        start, taking the space from the headroom. If the data pointer
 *        would move in front of the buffer head the kernel will panic.
 *        A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
        skb->len += len;
        skb->data -= len;

        if (unlikely(skb->data < skb->head))
                skb_under_panic(skb, len, __builtin_return_address(0));

        return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *        skb_pull - remove data from the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        Remove @len bytes from the start of the buffer and give that
 *        memory back to the headroom. A pointer to the next data in the
 *        buffer is returned. Once data has been pulled, later pushes will
 *        overwrite it. Out-of-line wrapper around skb_pull_inline().
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
        void *data = skb_pull_inline(skb, len);

        return data;
}
EXPORT_SYMBOL(skb_pull);

/**
 *        skb_trim - remove end from a buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        Shorten the buffer to @len bytes by removing data from the tail.
 *        A buffer that is already no longer than @len is left untouched.
 *        The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
        /* Never grows the buffer: nothing to do unless it is longer. */
        if (skb->len <= len)
                return;

        __skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 *
 * Page frags and frag_list members that lie entirely beyond @len are
 * released; the one containing the cut point is shortened.
 *
 * Returns 0 on success or a negative errno: the error reported by
 * pskb_expand_head() or pskb_trim(), or -ENOMEM when a shared frag_list
 * member cannot be cloned.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
        struct sk_buff **fragp;
        struct sk_buff *frag;
        int offset = skb_headlen(skb);
        int nfrags = skb_shinfo(skb)->nr_frags;
        int i;
        int err;

        /* A cloned skb gets a private head/shinfo before it is modified. */
        if (skb_cloned(skb) &&
            unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
                return err;

        /* The linear area alone covers @len: with i == 0 the jump into the
         * loop body below drops every page frag and the frag list.
         */
        i = 0;
        if (offset >= len)
                goto drop_pages;

        /* Find the page frag that contains the cut point. */
        for (; i < nfrags; i++) {
                int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Frag lies entirely before the cut: keep it as is. */
                if (end < len) {
                        offset = end;
                        continue;
                }

                /* Shorten this frag so that it ends exactly at @len. */
                skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
                skb_shinfo(skb)->nr_frags = i;

                /* Release the references on all frags behind the cut. */
                for (; i < nfrags; i++)
                        skb_frag_unref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_drop_fraglist(skb);
                goto done;
        }

        /* All page frags are kept; the cut point is in the frag list. */
        for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
             fragp = &frag->next) {
                int end = offset + frag->len;

                /* Replace a shared list member by a clone of it. */
                if (skb_shared(frag)) {
                        struct sk_buff *nfrag;

                        nfrag = skb_clone(frag, GFP_ATOMIC);
                        if (unlikely(!nfrag))
                                return -ENOMEM;

                        nfrag->next = frag->next;
                        consume_skb(frag);
                        frag = nfrag;
                        *fragp = frag;
                }

                if (end < len) {
                        offset = end;
                        continue;
                }

                /* Trim the member that straddles @len ... */
                if (end > len &&
                    unlikely((err = pskb_trim(frag, len - offset))))
                        return err;

                /* ... and drop everything that follows it. */
                if (frag->next)
                        skb_drop_list(&frag->next);
                break;
        }

done:
        if (len > skb_headlen(skb)) {
                /* Paged data remains: shrink data_len by the bytes removed. */
                skb->data_len -= skb->len - len;
                skb->len       = len;
        } else {
                /* Everything left is linear: reposition the tail. */
                skb->len       = len;
                skb->data_len  = 0;
                skb_set_tail_pointer(skb, len);
        }

        /* NOTE(review): same ownership test as the truesize update in
         * pskb_expand_head(); presumably skb_condense() is skipped for
         * socket-owned skbs for the same reason - confirm.
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb_condense(skb);
        return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/* Note : use pskb_trim_rcsum() instead of calling this directly
 *
 * Trim @skb to @len bytes while keeping its checksum state usable:
 *  - CHECKSUM_COMPLETE: the checksum of the removed tail is subtracted
 *    from skb->csum;
 *  - CHECKSUM_PARTIAL: -EINVAL is returned when the checksum field would
 *    no longer lie within the linear data that remains after the trim.
 * Otherwise returns the result of __pskb_trim().
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
                int hdlen = len;

                /* Linear length after trimming: min(len, headlen). */
                if (len > skb_headlen(skb))
                        hdlen = skb_headlen(skb);

                if (offset + sizeof(__sum16) > hdlen)
                        return -EINVAL;
        } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
                int delta = skb->len - len;

                skb->csum = csum_block_sub(skb->csum,
                                           skb_checksum(skb, len, delta, 0),
                                           len);
        }

        return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);

/**
 *        __pskb_pull_tail - advance tail of skb header
 *        @skb: buffer to reallocate
 *        @delta: number of bytes to advance tail
 *
 *        The function makes sense only on a fragmented &sk_buff,
 *        it expands header moving its tail forward and copying necessary
 *        data from fragmented part.
 *
 *        &sk_buff MUST have reference count of 1.
 *
 *        Returns %NULL (and &sk_buff does not change) if pull failed
 *        or value of new tail of skb in the case of success.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
        /* If skb has not enough free space at tail, get new one
         * plus 128 bytes for future expansions. If we have enough
         * room at tail, reallocate without expansion only if skb is cloned.
         */
        int i, k, eat = (skb->tail + delta) - skb->end;

        if (eat > 0 || skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                                     GFP_ATOMIC))
                        return NULL;
        }

        /* Copy the first @delta bytes of non-linear data to the current
         * tail; the bookkeeping below then removes them from the frags.
         */
        BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
                             skb_tail_pointer(skb), delta));

        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
         */
        if (!skb_has_frag_list(skb))
                goto pull_pages;

        /* Estimate size of pulled pages. */
        eat = delta;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size >= eat)
                        goto pull_pages;
                eat -= size;
        }

        /* If we need to update the frag list, we are in trouble.
         * Certainly, it is possible to add an offset to skb data,
         * but taking into account that pulling is expected to
         * be very rare operation, it is worth to fight against
         * further bloating skb head and crucify ourselves here instead.
         * Pure masochism, indeed. 8)8)
         */
        if (eat) {
                struct sk_buff *list = skb_shinfo(skb)->frag_list;
                struct sk_buff *clone = NULL;
                struct sk_buff *insp = NULL;

                /* Walk the frag list until @eat bytes are accounted for.
                 * @insp ends up as the first list member that is kept.
                 */
                do {
                        if (list->len <= eat) {
                                /* Eaten as whole. */
                                eat -= list->len;
                                list = list->next;
                                insp = list;
                        } else {
                                /* Eaten partially. */
                                if (skb_is_gso(skb) && !list->head_frag &&
                                    skb_headlen(list))
                                        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;

                                if (skb_shared(list)) {
                                        /* Sucks! We need to fork list. :-( */
                                        clone = skb_clone(list, GFP_ATOMIC);
                                        if (!clone)
                                                return NULL;
                                        insp = list->next;
                                        list = clone;
                                } else {
                                        /* This may be pulled without
                                         * problems. */
                                        insp = list;
                                }
                                if (!pskb_pull(list, eat)) {
                                        kfree_skb(clone);
                                        return NULL;
                                }
                                break;
                        }
                } while (eat);

                /* Free pulled out fragments. */
                while ((list = skb_shinfo(skb)->frag_list) != insp) {
                        skb_shinfo(skb)->frag_list = list->next;
                        consume_skb(list);
                }
                /* And insert new clone at head. */
                if (clone) {
                        clone->next = list;
                        skb_shinfo(skb)->frag_list = clone;
                }
        }
        /* Success! Now we may commit changes to skb data. */

pull_pages:
        /* Compact the page frag array: @i scans the old frags, @k is the
         * next slot to keep.
         */
        eat = delta;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size <= eat) {
                        /* Frag fully consumed: drop its reference. */
                        skb_frag_unref(skb, i);
                        eat -= size;
                } else {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[k];

                        *frag = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                /* Frag partially consumed: skip the eaten bytes. */
                                skb_frag_off_add(frag, eat);
                                skb_frag_size_sub(frag, eat);
                                /* Only the first frag was touched, so no frag
                                 * moved and nr_frags stays as it is.
                                 */
                                if (!i)
                                        goto end;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

end:
        skb->tail     += delta;
        skb->data_len -= delta;

        /* No non-linear data left: clear the zerocopy state. */
        if (!skb->data_len)
                skb_zcopy_clear(skb, false);

        return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *        skb_copy_bits - copy bits from skb to kernel buffer
 *        @skb: source skb
 *        @offset: offset in source
 *        @to: destination buffer
 *        @len: number of bytes to copy
 *
 *        Gather @len bytes, beginning @offset bytes into @skb, and write
 *        them contiguously to @to.  The linear area is consumed first, then
 *        the page fragments, then every skb chained on the frag list (by
 *        recursing into this function).
 *
 *        Returns 0 once all @len bytes have been copied, or -EFAULT if the
 *        requested range is not fully contained in @skb.
 *
 *        CAUTION ! :
 *                If its prototype is ever changed,
 *                check arch/{*}/net/{*}.S files,
 *                since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
        int start = skb_headlen(skb);
        struct sk_buff *list_skb;
        int idx, chunk;

        if (offset > (int)skb->len - len)
                return -EFAULT;

        /* Linear area: bytes [offset, start) if offset lies inside it. */
        chunk = start - offset;
        if (chunk > 0) {
                if (chunk > len)
                        chunk = len;
                skb_copy_from_linear_data_offset(skb, offset, to, chunk);
                len -= chunk;
                if (!len)
                        return 0;
                offset += chunk;
                to += chunk;
        }

        /* Page fragments: the current one covers bytes [start, end). */
        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                chunk = end - offset;
                if (chunk > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (chunk > len)
                                chunk = len;

                        /* Map and copy one page-sized piece at a time. */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              chunk, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }

                        len -= chunk;
                        if (!len)
                                return 0;
                        offset += chunk;
                        to += chunk;
                }
                start = end;
        }

        /* Chained skbs: each one covers bytes [start, end) of the parent. */
        skb_walk_frags(skb, list_skb) {
                int end;

                WARN_ON(start > offset + len);

                end = start + list_skb->len;
                chunk = end - offset;
                if (chunk > 0) {
                        if (chunk > len)
                                chunk = len;
                        if (skb_copy_bits(list_skb, offset - start, to, chunk))
                                return -EFAULT;
                        len -= chunk;
                        if (!len)
                                return 0;
                        offset += chunk;
                        to += chunk;
                }
                start = end;
        }

        /* Anything still outstanding means the skb ran out of data. */
        return len ? -EFAULT : 0;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * spd_release callback used by skb_splice_bits(): splice_to_pipe() invokes
 * it for entries of the spd that could not be placed in the pipe, so that
 * the page reference held for slot @i is dropped.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        struct page *unused_page = spd->pages[i];

        put_page(unused_page);
}

/*
 * Copy up to *@len bytes found at *@offset inside @page into the page
 * fragment obtained from sk_page_frag(@sk).
 *
 * On success *@len is clamped to the room left in that fragment, *@offset
 * is rewritten to where the copy starts inside the returned page, and the
 * fragment's offset is advanced past the copied bytes.  No page reference
 * is taken here.
 *
 * Returns the fragment's page, or NULL when sk_page_frag_refill() fails.
 */
static struct page *linear_to_page(struct page *page, unsigned int *len,
                                   unsigned int *offset,
                                   struct sock *sk)
{
        struct page_frag *pfrag = sk_page_frag(sk);
        unsigned int room, nbytes;
        void *dst, *src;

        if (!sk_page_frag_refill(sk, pfrag))
                return NULL;

        room = pfrag->size - pfrag->offset;
        nbytes = min_t(unsigned int, *len, room);
        *len = nbytes;

        dst = page_address(pfrag->page) + pfrag->offset;
        src = page_address(page) + *offset;
        memcpy(dst, src, nbytes);

        *offset = pfrag->offset;
        pfrag->offset += nbytes;

        return pfrag->page;
}

/*
 * Report whether (@page, @offset) directly continues the last entry already
 * recorded in @spd, i.e. it is the same page and @offset equals the end of
 * that entry.  Always false for an empty spd.
 */
static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
                             struct page *page,
                             unsigned int offset)
{
        int last;

        if (!spd->nr_pages)
                return false;

        last = spd->nr_pages - 1;
        if (spd->pages[last] != page)
                return false;

        return spd->partial[last].offset + spd->partial[last].len == offset;
}

/*
 * Record page/offset/length in @spd, if it can hold more pages.
 *
 * When @linear is set the data is first copied into the socket's page
 * fragment via linear_to_page(), which may shrink *@len and replaces both
 * @page and @offset.  A chunk that directly follows the previous entry is
 * merged into it; otherwise a new slot is filled and a page reference is
 * taken for it.
 *
 * Returns true when nothing could be added (spd already holds MAX_SKB_FRAGS
 * entries, or the linear copy failed), false when the chunk was recorded.
 */
static bool spd_fill_page(struct splice_pipe_desc *spd,
                          struct pipe_inode_info *pipe, struct page *page,
                          unsigned int *len, unsigned int offset,
                          bool linear,
                          struct sock *sk)
{
        int slot;

        if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
                return true;

        if (linear) {
                page = linear_to_page(page, len, &offset, sk);
                if (!page)
                        return true;
        }

        if (!spd_can_coalesce(spd, page, offset)) {
                /* Start a new entry; it owns its own page reference. */
                slot = spd->nr_pages;
                get_page(page);
                spd->pages[slot] = page;
                spd->partial[slot].len = *len;
                spd->partial[slot].offset = offset;
                spd->nr_pages = slot + 1;
        } else {
                /* Contiguous with the previous entry: just extend it. */
                spd->partial[spd->nr_pages - 1].len += *len;
        }

        return false;
}

/*
 * Feed one contiguous segment (@plen bytes at @poff inside @page) to @spd.
 *
 * *@off is the number of bytes still to be skipped before splicing starts
 * and *@len the number of bytes still wanted; both are updated in place.
 *
 * Returns true when the caller should stop (nothing left to splice, or
 * spd_fill_page() could not take more data), false to continue with the
 * next segment.
 */
static bool __splice_segment(struct page *page, unsigned int poff,
                             unsigned int plen, unsigned int *off,
                             unsigned int *len,
                             struct splice_pipe_desc *spd, bool linear,
                             struct sock *sk,
                             struct pipe_inode_info *pipe)
{
        unsigned int skip;

        if (*len == 0)
                return true;

        /* The whole segment lies before the requested start: skip it. */
        skip = *off;
        if (skip >= plen) {
                *off = skip - plen;
                return false;
        }

        /* Drop the leading part of the segment that is to be skipped. */
        poff += skip;
        plen -= skip;
        *off = 0;

        for (;;) {
                unsigned int flen = min(*len, plen);

                /* spd_fill_page() may reduce flen for linear data. */
                if (spd_fill_page(spd, pipe, page, &flen, poff, linear, sk))
                        return true;

                poff += flen;
                plen -= flen;
                *len -= flen;

                if (!*len || !plen)
                        break;
        }

        return false;
}

/*
 * Map linear and fragment data from the skb to spd. It reports true if the
 * pipe is full or if we already spliced the requested length.
 *
 * *@offset (bytes to skip) and *@len (bytes still wanted) are shared with,
 * and updated by, every __splice_segment() call and every recursive call
 * made for skbs on the frag list.
 */
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
                              unsigned int *offset, unsigned int *len,
                              struct splice_pipe_desc *spd, struct sock *sk)
{
        unsigned int head_off = (unsigned long) skb->data & (PAGE_SIZE - 1);
        bool copy_head = skb_head_is_locked(skb);
        struct sk_buff *list_skb;
        int idx;

        /* Linear part first.
         * If skb->head_frag is set, this 'linear' part is backed by a
         * fragment, and if the head is not shared with any clones then
         * we can avoid a copy since we own the head portion of this page.
         */
        if (__splice_segment(virt_to_page(skb->data), head_off,
                             skb_headlen(skb), offset, len, spd,
                             copy_head, sk, pipe))
                return true;

        /* Then the page fragments, which are never copied. */
        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
                bool stop;

                stop = __splice_segment(skb_frag_page(frag),
                                        skb_frag_off(frag),
                                        skb_frag_size(frag),
                                        offset, len, spd, false, sk, pipe);
                if (stop)
                        return true;
        }

        /* Finally every skb chained on the frag list. */
        skb_walk_frags(skb, list_skb) {
                if (*offset < list_skb->len) {
                        /* __skb_splice_bits() only fails if the output has
                         * no room left, so no point in going over the
                         * frag_list for the error case.
                         */
                        if (__skb_splice_bits(list_skb, pipe, offset, len,
                                              spd, sk))
                                return true;
                } else {
                        /* Entirely before the start position: skip it. */
                        *offset -= list_skb->len;
                }
        }

        return false;
}

/*
 * Map data from the skb to a pipe. Should handle both the linear part,
 * the fragments, and the frag list.
 *
 * Up to @tlen bytes starting at @offset are described in an on-stack
 * splice_pipe_desc of at most MAX_SKB_FRAGS entries and handed to
 * splice_to_pipe().  @flags is not used here.
 *
 * Returns 0 when nothing was mapped, otherwise the value returned by
 * splice_to_pipe().
 */
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int tlen,
                    unsigned int flags)
{
        struct page *pages[MAX_SKB_FRAGS];
        struct partial_page partial[MAX_SKB_FRAGS];
        struct splice_pipe_desc spd = {
                .pages = pages,
                .partial = partial,
                .nr_pages_max = MAX_SKB_FRAGS,
                .ops = &nosteal_pipe_buf_ops,
                .spd_release = sock_spd_release,
        };

        /* The result is judged by spd.nr_pages, not by the return value. */
        __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);

        if (!spd.nr_pages)
                return 0;

        return splice_to_pipe(pipe, &spd);
}
EXPORT_SYMBOL_GPL(skb_splice_bits);

/*
 * Send skb data on a socket. Socket must be locked.
 *
 * Transmits up to @len bytes of @skb, starting @offset bytes into it, on
 * @sk.  The linear area goes out through kernel_sendmsg_locked(), page
 * fragments through kernel_sendpage_locked(); both are issued with
 * MSG_DONTWAIT.  Once the current skb is exhausted and data is still wanted,
 * the walk continues with the frag list of the first skb, then along
 * ->next.
 *
 * Returns the number of bytes sent.  If a send call returns <= 0, the bytes
 * sent so far are returned when that count is non-zero; otherwise the
 * value returned by the failing call is passed through.
 */
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len)
{
        unsigned int orig_len = len;
        struct sk_buff *head = skb;
        unsigned short fragidx;
        int slen, ret;

        /* Re-entered for every skb on the frag list; by then offset has
         * been reduced by everything that lies before that skb.
         */
do_frag_list:

        /* Deal with head data */
        while (offset < skb_headlen(skb) && len) {
                struct kvec kv;
                struct msghdr msg;

                slen = min_t(int, len, skb_headlen(skb) - offset);
                kv.iov_base = skb->data + offset;
                kv.iov_len = slen;
                memset(&msg, 0, sizeof(msg));
                msg.msg_flags = MSG_DONTWAIT;

                ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
                if (ret <= 0)
                        goto error;

                /* A short send is retried from the new position. */
                offset += ret;
                len -= ret;
        }

        /* All the data was skb head? */
        if (!len)
                goto out;

        /* Make offset relative to start of frags */
        offset -= skb_headlen(skb);

        /* Find the page fragment that contains offset; if offset lies past
         * all of them, it ends up relative to the end of the fragments.
         */
        for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                if (offset < skb_frag_size(frag))
                        break;

                offset -= skb_frag_size(frag);
        }

        /* Send from that fragment onwards. */
        for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                slen = min_t(size_t, len, skb_frag_size(frag) - offset);

                /* Loop until this fragment's share has been sent in full. */
                while (slen) {
                        ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
                                                     skb_frag_off(frag) + offset,
                                                     slen, MSG_DONTWAIT);
                        if (ret <= 0)
                                goto error;

                        len -= ret;
                        offset += ret;
                        slen -= ret;
                }

                /* Every following fragment is sent from its beginning. */
                offset = 0;
        }

        if (len) {
                /* Process any frag lists */

                if (skb == head) {
                        /* Still on the first skb: descend into its list. */
                        if (skb_has_frag_list(skb)) {
                                skb = skb_shinfo(skb)->frag_list;
                                goto do_frag_list;
                        }
                } else if (skb->next) {
                        /* Already on the list: advance to the next entry. */
                        skb = skb->next;
                        goto do_frag_list;
                }
        }

out:
        return orig_len - len;

error:
        /* Report partial progress if there was any, else the error itself. */
        return orig_len == len ? ret : orig_len - len;
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);

/**
 *        skb_store_bits - store bits from kernel buffer to skb
 *        @skb: destination buffer
 *        @offset: offset in destination
 *        @from: source buffer
 *        @len: number of bytes to copy
 *
 *        Scatter @len bytes from @from into @skb, beginning @offset bytes
 *        into it.  The linear area is filled first, then the page
 *        fragments, then every skb chained on the frag list (by recursing
 *        into this function).
 *
 *        Returns 0 once all @len bytes have been stored, or -EFAULT if the
 *        target range is not fully contained in @skb.
 */
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
        int start = skb_headlen(skb);
        struct sk_buff *list_skb;
        int idx, chunk;

        if (offset > (int)skb->len - len)
                return -EFAULT;

        /* Linear area: bytes [offset, start) if offset lies inside it. */
        chunk = start - offset;
        if (chunk > 0) {
                if (chunk > len)
                        chunk = len;
                skb_copy_to_linear_data_offset(skb, offset, from, chunk);
                len -= chunk;
                if (!len)
                        return 0;
                offset += chunk;
                from += chunk;
        }

        /* Page fragments: the current one covers bytes [start, end). */
        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                chunk = end - offset;
                if (chunk > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (chunk > len)
                                chunk = len;

                        /* Map and fill one page-sized piece at a time. */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              chunk, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(vaddr + p_off, from + copied, p_len);
                                kunmap_atomic(vaddr);
                        }

                        len -= chunk;
                        if (!len)
                                return 0;
                        offset += chunk;
                        from += chunk;
                }
                start = end;
        }

        /* Chained skbs: each one covers bytes [start, end) of the parent. */
        skb_walk_frags(skb, list_skb) {
                int end;

                WARN_ON(start > offset + len);

                end = start + list_skb->len;
                chunk = end - offset;
                if (chunk > 0) {
                        if (chunk > len)
                                chunk = len;
                        if (skb_store_bits(list_skb, offset - start,
                                           from, chunk))
                                return -EFAULT;
                        len -= chunk;
                        if (!len)
                                return 0;
                        offset += chunk;
                        from += chunk;
                }
                start = end;
        }

        /* Anything still outstanding means the skb ran out of room. */
        return len ? -EFAULT : 0;
}
EXPORT_SYMBOL(skb_store_bits);

/*
 * Checksum skb data.
 *
 * Folds @len bytes of @skb, starting at @offset, into the running checksum
 * @csum using the callbacks in @ops: ops->update checksums one contiguous
 * buffer, ops->combine merges a partial checksum into the running one given
 * the position at which that part starts.  The linear area, the page
 * fragments and the skbs on the frag list are visited in that order.
 *
 * Returns the updated checksum.  There is no error return: if fewer than
 * @len bytes are available the function hits BUG_ON().
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* Number of bytes checksummed so far; passed to ops->combine. */
        int pos = 0;

        /* Checksum header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                /* The caller's csum is the seed, so no combine step here. */
                csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
                                       skb->data + offset, copy, csum);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                pos        = copy;
        }

        /* Page fragments: the current one covers bytes [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Each mapped piece is summed on its own (seed 0)
                         * and then merged at its position.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = INDIRECT_CALL_1(ops->update,
                                                        csum_partial_ext,
                                                        vaddr + p_off, p_len, 0);
                                kunmap_atomic(vaddr);
                                csum = INDIRECT_CALL_1(ops->combine,
                                                       csum_block_add_ext, csum,
                                                       csum2, pos, p_len);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                }
                start = end;
        }

        /* Chained skbs: recurse with a zero seed, then merge at pos. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        if (copy > len)
                                copy = len;
                        csum2 = __skb_checksum(frag_iter, offset - start,
                                               copy, 0, ops);
                        csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
                                               csum, csum2, pos, copy);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* The requested range extended past the end of the skb. */
        BUG_ON(len);

        return csum;
}
EXPORT_SYMBOL(__skb_checksum);

/*
 * Checksum @len bytes of @skb starting at @offset, seeded with @csum.
 * Thin front end for __skb_checksum() that supplies csum_partial_ext() and
 * csum_block_add_ext() as the update and combine callbacks.
 */
__wsum skb_checksum(const struct sk_buff *skb, int offset,
                    int len, __wsum csum)
{
        const struct skb_checksum_ops csum_ops = {
                .combine = csum_block_add_ext,
                .update  = csum_partial_ext,
        };
        __wsum result;

        result = __skb_checksum(skb, offset, len, csum, &csum_ops);

        return result;
}
EXPORT_SYMBOL(skb_checksum);

/*
 * Copy and checksum in a single pass.
 *
 * Copies @len bytes of @skb, starting at @offset, to @to and returns the
 * checksum of the copied data, computed with csum_partial_copy_nocheck()
 * and csum_block_add().  The linear area, the page fragments and the skbs
 * on the frag list are visited in that order.
 *
 * There is no error return: if fewer than @len bytes are available the
 * function hits BUG_ON().
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
                                    u8 *to, int len)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* Number of bytes handled so far; passed to csum_block_add(). */
        int pos = 0;
        __wsum csum = 0;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = csum_partial_copy_nocheck(skb->data + offset, to,
                                                 copy);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                to     += copy;
                pos        = copy;
        }

        /* Page fragments: the current one covers bytes [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Each mapped piece is copied and summed on its
                         * own, then merged at its position.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = csum_partial_copy_nocheck(vaddr + p_off,
                                                                  to + copied,
                                                                  p_len);
                                kunmap_atomic(vaddr);
                                csum = csum_block_add(csum, csum2, pos);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Chained skbs: recurse, then merge the result at pos. */
        skb_walk_frags(skb, frag_iter) {
                __wsum csum2;
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        csum2 = skb_copy_and_csum_bits(frag_iter,
                                                       offset - start,
                                                       to, copy);
                        csum = csum_block_add(csum, csum2, pos);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        to     += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* The requested range extended past the end of the skb. */
        BUG_ON(len);
        return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

/*
 * Verify the checksum over the first @len bytes of @skb, seeded with
 * skb->csum.  Returns the folded sum, which is zero when the checksum is
 * valid.  Unless the skb is shared, skb->csum_valid is updated to match.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
        __sum16 folded = csum_fold(skb_checksum(skb, 0, len, skb->csum));

        /* See comments in __skb_checksum_complete(). */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        if (!skb_shared(skb))
                skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

/* This function assumes skb->csum already holds pseudo header's checksum,
 * which has been changed from the hardware checksum, for example, by
 * __skb_checksum_validate_complete(). And, the original skb->csum must
 * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
 *
 * The whole packet is checksummed in software and added to skb->csum.
 * Returns the folded result: zero if the recomputed checksum is valid,
 * non-zero if it is still invalid. Unless the skb is shared, the software
 * checksum is stored back into skb->csum and the skb is marked
 * CHECKSUM_COMPLETE with csum_complete_sw set.
 */
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
        __wsum sw_csum = skb_checksum(skb, 0, skb->len, 0);
        __sum16 folded = csum_fold(csum_add(skb->csum, sw_csum));

        /* This check is inverted, because we already knew the hardware
         * checksum is invalid before calling this function. So, if the
         * re-computed checksum is valid instead, then we have a mismatch
         * between the original skb->csum and skb_checksum(). This means either
         * the original hardware checksum is incorrect or we screw up skb->csum
         * when moving skb->data around.
         */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        /* A shared skb must not be modified. */
        if (skb_shared(skb))
                return folded;

        /* Save full packet checksum */
        skb->csum = sw_csum;
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum_complete_sw = 1;
        skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete);

/*
 * Placeholder ->update callback: ignores its arguments, emits a
 * rate-limited warning and yields a checksum of 0.
 */
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
        net_warn_ratelimited("%s: attempt to compute crc32c without libcrc32c.ko\n",
                             __func__);

        return 0;
}

/*
 * Placeholder ->combine callback: ignores its arguments, emits a
 * rate-limited warning and yields a checksum of 0.
 */
static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
                                       int offset, int len)
{
        net_warn_ratelimited("%s: attempt to compute crc32c without libcrc32c.ko\n",
                             __func__);

        return 0;
}

/*
 * Fallback CRC32c ops: both callbacks only log a rate-limited warning and
 * return 0, so no real checksum is produced through them.
 */
static const struct skb_checksum_ops default_crc32c_ops = {
        .update  = warn_crc32c_csum_update,
        .combine = warn_crc32c_csum_combine,
};

/*
 * Exported pointer to the CRC32c checksum ops, initialised to the
 * warning-only fallback defined just above.
 * NOTE(review): presumably repointed to a real implementation when
 * libcrc32c is loaded (the warning text names libcrc32c.ko) - confirm
 * against that module's init code.
 */
const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
        &default_crc32c_ops;
EXPORT_SYMBOL(crc32c_csum_stub);

 /**
 *        skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
 *        @from: source buffer
 *
 *        Calculates the amount of linear headroom needed in the 'to' skb passed
 *        into skb_zerocopy():
 *
 *        - the full length of @from when it carries a frag list;
 *        - 0 when the head of @from is a page fragment of at least
 *          L1_CACHE_BYTES and @from has fewer than MAX_SKB_FRAGS fragments;
 *        - otherwise the head length of @from, or its full length if the
 *          head is empty.
 */
unsigned int
skb_zerocopy_headlen(const struct sk_buff *from)
{
        unsigned int headlen;

        /* A frag list overrides every other consideration. */
        if (skb_has_frag_list(from))
                return from->len;

        /* The head can be referenced as a fragment: no headroom needed. */
        if (from->head_frag &&
            skb_headlen(from) >= L1_CACHE_BYTES &&
            skb_shinfo(from)->nr_frags < MAX_SKB_FRAGS)
                return 0;

        headlen = skb_headlen(from);

        return headlen ? headlen : from->len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);

/**
 *        skb_zerocopy - Zero copy skb to skb
 *        @to: destination buffer
 *        @from: source buffer
 *        @len: number of bytes to copy from source buffer
 *        @hlen: size of linear headroom in destination buffer
 *
 *        Copies up to `len` bytes from `from` to `to` by creating references
 *        to the frags in the source buffer.
 *
 *        The `hlen` as calculated by skb_zerocopy_headlen() specifies the
 *        headroom in the `to` buffer.
 *
 *        If `len` fits in the tailroom of `to` the data is simply copied.
 *        Otherwise `hlen` bytes are copied into the linear area of `to` (or,
 *        when `hlen` is 0, the head of `from` is attached as fragment 0) and
 *        the fragments of `from` are referenced for the rest.  The fragment
 *        array of `to` is written from index 0, so `to` is expected to have
 *        no fragments of its own on entry.
 *
 *        Return value:
 *        0: everything is OK
 *        -ENOMEM: couldn't orphan frags of @from due to lack of memory
 *        -EFAULT: skb_copy_bits() found some problem with skb geometry
 */
int
skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
        int i, j = 0;
        int plen = 0; /* length of skb->head fragment */
        int ret;
        struct page *page;
        unsigned int offset;

        /* Without headroom the head of @from must be attachable as a frag. */
        BUG_ON(!from->head_frag && !hlen);

        /* dont bother with small payloads */
        if (len <= skb_tailroom(to))
                return skb_copy_bits(from, 0, skb_put(to, len), len);

        if (hlen) {
                ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
                if (unlikely(ret))
                        return ret;
                len -= hlen;
        } else {
                plen = min_t(int, skb_headlen(from), len);
                if (plen) {
                        /* Reference the head of @from as fragment 0. */
                        page = virt_to_head_page(from->head);
                        offset = from->data - (unsigned char *)page_address(page);
                        __skb_fill_page_desc(to, 0, page, offset, plen);
                        get_page(page);
                        j = 1;
                        len -= plen;
                }
        }

        to->truesize += len + plen;
        to->len += len + plen;
        to->data_len += len + plen;

        if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
                /* frags[0] of @to may already hold a page reference taken
                 * above (j == 1).  Publish it through nr_frags so that the
                 * reference is dropped when the caller frees @to instead of
                 * being leaked.
                 */
                skb_shinfo(to)->nr_frags = j;
                skb_tx_error(from);
                return -ENOMEM;
        }
        skb_zerocopy_clone(to, from, GFP_ATOMIC);

        /* Share the fragments of @from, trimming the last one to fit. */
        for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
                int size;

                if (!len)
                        break;
                skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
                size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
                                        len);
                skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
                len -= size;
                skb_frag_ref(to, j);
                j++;
        }
        skb_shinfo(to)->nr_frags = j;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);

/*
 * Copy the whole of @skb to @to, checksumming everything from the checksum
 * start onwards on the way.
 *
 * For CHECKSUM_PARTIAL skbs the checksum start is
 * skb_checksum_start_offset() and the folded checksum is written into the
 * copy at that offset plus skb->csum_offset.  For any other ip_summed value
 * the checksum start is the end of the linear area and nothing is written
 * back.  The checksum start must lie within the linear area (BUG_ON
 * otherwise).
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
        const bool partial = (skb->ip_summed == CHECKSUM_PARTIAL);
        __wsum csum = 0;
        long csstart;

        if (partial)
                csstart = skb_checksum_start_offset(skb);
        else
                csstart = skb_headlen(skb);

        BUG_ON(csstart > skb_headlen(skb));

        /* Bytes before the checksum start are copied verbatim. */
        skb_copy_from_linear_data(skb, to, csstart);

        /* The remainder, if any, is copied and summed together. */
        if (csstart != skb->len)
                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
                                              skb->len - csstart);

        if (partial) {
                long csstuff = csstart + skb->csum_offset;
                __sum16 *slot = (__sum16 *)(to + csstuff);

                *slot = csum_fold(csum);
        }
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *        skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Detach the first buffer of @list while holding the list lock with
 *        interrupts disabled, so the function may be used safely with other
 *        locking list functions. Returns the detached buffer, or %NULL if
 *        the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        first = __skb_dequeue(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return first;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *        skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Detach the last buffer of @list while holding the list lock with
 *        interrupts disabled, so the function may be used safely with other
 *        locking list functions. Returns the detached buffer, or %NULL if
 *        the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        last = __skb_dequeue_tail(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return last;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *        skb_queue_purge - empty a list
 *        @list: list to empty
 *
 *        Delete all buffers on an &sk_buff list. Buffers are taken off the
 *        list one at a time with skb_dequeue(), which takes the list lock
 *        for each removal, and one reference is dropped on each of them.
 */
void skb_queue_purge(struct sk_buff_head *list)
{
        struct sk_buff *skb;

        for (skb = skb_dequeue(list); skb != NULL; skb = skb_dequeue(list))
                kfree_skb(skb);
}
EXPORT_SYMBOL(skb_queue_purge);

/**
 *        skb_rbtree_purge - empty a skb rbtree
 *        @root: root of the rbtree to empty
 *
 *        Delete all buffers on an &sk_buff rbtree. Each buffer is erased from
 *        the tree and one reference dropped. This function does not take
 *        any lock. Synchronization should be handled by the caller (e.g., TCP
 *        out-of-order queue is protected by the socket lock).
 *
 *        Return value: the sum of truesizes of all purged skbs.
 */
unsigned int skb_rbtree_purge(struct rb_root *root)
{
        struct rb_node *node, *next;
        unsigned int freed = 0;

        for (node = rb_first(root); node; node = next) {
                struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

                /* Fetch the successor before the node is erased. */
                next = rb_next(node);
                rb_erase(&skb->rbnode, root);
                freed += skb->truesize;
                kfree_skb(skb);
        }

        return freed;
}

/**
 *        skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of the list. This function takes the
 *        list lock with interrupts disabled and can be used safely with
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_head(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *        skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the end of the list. This function takes the
 *        list lock with interrupts disabled and can be used safely with
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_tail(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *        skb_unlink        -        remove a buffer from a list
 *        @skb: buffer to remove
 *        @list: list to use
 *
 *        Remove @skb from @list. The list lock is taken around the removal,
 *        so this function is atomic with respect to other list locked
 *        calls.
 *
 *        You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_unlink(skb, list);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *        skb_append        -        append a buffer
 *        @old: buffer to insert after
 *        @newsk: buffer to insert
 *        @list: list to use
 *
 *        Insert @newsk directly after @old in @list. The list lock is taken
 *        around the insertion, so this function is atomic with respect to
 *        other list locked calls.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_after(list, old, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_append);

static inline void skb_split_inside_header(struct sk_buff *skb,
                                           struct sk_buff* skb1,
                                           const u32 len, const int pos)
{
        int i;

        skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
                                         pos - len);
        /* And move data appendix as is. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

        skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
        skb_shinfo(skb)->nr_frags  = 0;
        skb1->data_len                   = skb->data_len;
        skb1->len                   += skb1->data_len;
        skb->data_len                   = 0;
        skb->len                   = len;
        skb_set_tail_pointer(skb, len);
}

static inline void skb_split_no_header(struct sk_buff *skb,
                                       struct sk_buff* skb1,
                                       const u32 len, int pos)
{
        int i, k = 0;
        const int nfrags = skb_shinfo(skb)->nr_frags;

        skb_shinfo(skb)->nr_frags = 0;
        skb1->len                  = skb1->data_len = skb->len - len;
        skb->len                  = len;
        skb->data_len                  = len - pos;

        for (i = 0; i < nfrags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (pos + size > len) {
                        skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < len) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split is accurately. We make this.
                                 */
                                skb_frag_ref(skb, i);
                                skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
                                skb_shinfo(skb)->nr_frags++;
                        }
                        k++;
                } else
                        skb_shinfo(skb)->nr_frags++;
                pos += size;
        }
        skb_shinfo(skb1)->nr_frags = k;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 *
 * The SKBTX_SHARED_FRAG flag and the zerocopy state of @skb are propagated
 * to @skb1 before the data is divided.
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
        const int headlen = skb_headlen(skb);

        skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
                                      SKBTX_SHARED_FRAG;
        skb_zerocopy_clone(skb1, skb, 0);

        if (len >= headlen) {
                /* Second chunk has no header, nothing to copy. */
                skb_split_no_header(skb, skb1, len, headlen);
                return;
        }

        /* Split line is inside header. */
        skb_split_inside_header(skb, skb1, len, headlen);
}
EXPORT_SYMBOL(skb_split);

/* Shifting from/to a cloned skb is a no-go, so a cloned skb gets its head
 * reallocated first. Returns 0 on success (or when @skb is not cloned) and
 * the pskb_expand_head() error otherwise.
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
        unsigned int truesize_before;
        int err;

        if (!skb_cloned(skb))
                return 0;

        /* pskb_expand_head() may reallocate memory where
         * ksize(kmalloc(S)) != ksize(kmalloc(S)), but truesize must not
         * change at this point: remember it and put it back afterwards.
         */
        truesize_before = skb->truesize;
        err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
        skb->truesize = truesize_before;

        return err;
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted: @shiftlen on success, 0 if nothing was shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well. Nothing is shifted either when one of
 * the two buffers is a zerocopy skb or when un-cloning one of them fails.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
        int from, to, merge, todo;
        skb_frag_t *fragfrom, *fragto;

        BUG_ON(shiftlen > skb->len);

        /* The source must consist of paged data only */
        if (skb_headlen(skb))
                return 0;
        if (skb_zcopy(tgt) || skb_zcopy(skb))
                return 0;

        /* todo: bytes still to move; from/to: next frag index in skb/tgt */
        todo = shiftlen;
        from = 0;
        to = skb_shinfo(tgt)->nr_frags;
        fragfrom = &skb_shinfo(skb)->frags[from];

        /* Actual merge is delayed until the point when we know we can
         * commit all, so that we don't have to undo partial changes
         */
        if (!to ||
            !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
                              skb_frag_off(fragfrom))) {
                merge = -1;
        } else {
                /* First source frag continues the last frag of tgt */
                merge = to - 1;

                todo -= skb_frag_size(fragfrom);
                if (todo < 0) {
                        /* The shift ends inside the first source frag: only
                         * the sizes of the two adjoining frags and the
                         * offset of the source frag have to be adjusted.
                         */
                        if (skb_prepare_for_shift(skb) ||
                            skb_prepare_for_shift(tgt))
                                return 0;

                        /* All previous frag pointers might be stale! */
                        fragfrom = &skb_shinfo(skb)->frags[from];
                        fragto = &skb_shinfo(tgt)->frags[merge];

                        skb_frag_size_add(fragto, shiftlen);
                        skb_frag_size_sub(fragfrom, shiftlen);
                        skb_frag_off_add(fragfrom, shiftlen);

                        goto onlymerged;
                }

                from++;
        }

        /* Skip full, not-fitting skb to avoid expensive operations */
        if ((shiftlen == skb->len) &&
            (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
                return 0;

        if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
                return 0;

        /* Fill frag slots of tgt beyond its current nr_frags; nr_frags itself
         * is only updated once the loop has completed.
         */
        while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
                if (to == MAX_SKB_FRAGS)
                        return 0;

                fragfrom = &skb_shinfo(skb)->frags[from];
                fragto = &skb_shinfo(tgt)->frags[to];

                if (todo >= skb_frag_size(fragfrom)) {
                        /* Whole frag moves over */
                        *fragto = *fragfrom;
                        todo -= skb_frag_size(fragfrom);
                        from++;
                        to++;

                } else {
                        /* Partial frag: both skbs reference the page, tgt
                         * describes the first todo bytes and skb the rest.
                         */
                        __skb_frag_ref(fragfrom);
                        skb_frag_page_copy(fragto, fragfrom);
                        skb_frag_off_copy(fragto, fragfrom);
                        skb_frag_size_set(fragto, todo);

                        skb_frag_off_add(fragfrom, todo);
                        skb_frag_size_sub(fragfrom, todo);
                        todo = 0;

                        to++;
                        break;
                }
        }

        /* Ready to "commit" this state change to tgt */
        skb_shinfo(tgt)->nr_frags = to;

        if (merge >= 0) {
                /* Perform the delayed merge: the first source frag is
                 * absorbed into the last pre-existing frag of tgt and the
                 * source's reference on it is dropped.
                 */
                fragfrom = &skb_shinfo(skb)->frags[0];
                fragto = &skb_shinfo(tgt)->frags[merge];

                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
                __skb_frag_unref(fragfrom);
        }

        /* Reposition in the original skb */
        to = 0;
        while (from < skb_shinfo(skb)->nr_frags)
                skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
        skb_shinfo(skb)->nr_frags = to;

        BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
        /* Most likely the tgt won't ever need its checksum anymore, skb on
         * the other hand might need it if it needs to be resent
         */
        tgt->ip_summed = CHECKSUM_PARTIAL;
        skb->ip_summed = CHECKSUM_PARTIAL;

        skb_shinfo(tgt)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;

        /* Yak, is it really working this way? Some helper please? */
        skb->len -= shiftlen;
        skb->data_len -= shiftlen;
        skb->truesize -= shiftlen;
        tgt->len += shiftlen;
        tgt->data_len += shiftlen;
        tgt->truesize += shiftlen;

        return shiftlen;
}

/**
 * skb_prepare_seq_read - Prepare a sequential read of skb data
 * @skb: the buffer to read
 * @from: lower offset of data to be read
 * @to: upper offset of data to be read
 * @st: state variable
 *
 * Resets @st so that reading starts at @skb with no fragment consumed
 * and no fragment page mapped. Must be called before invoking
 * skb_seq_read() for the first time.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st)
{
        /* Requested window */
        st->lower_offset = from;
        st->upper_offset = to;

        /* Read position: start of the root skb */
        st->cur_skb = skb;
        st->root_skb = skb;
        st->stepped_offset = 0;
        st->frag_idx = 0;

        st->frag_data = NULL;
}
EXPORT_SYMBOL(skb_prepare_seq_read);

/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 *
 * Note 3: While a block inside a page fragment is being handed out,
 *       the fragment page stays mapped via kmap_atomic() between
 *       calls. It is unmapped when the read moves past that
 *       fragment or when 0 is returned.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st)
{
        unsigned int block_limit, abs_offset = consumed + st->lower_offset;
        skb_frag_t *frag;

        /* Upper bound reached: release a still mapped fragment page */
        if (unlikely(abs_offset >= st->upper_offset)) {
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }
                return 0;
        }

next_skb:
        /* End of the linear area of the current skb, as absolute offset */
        block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

        /* Requested offset lies in the linear area */
        if (abs_offset < block_limit && !st->frag_data) {
                *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
                return block_limit - abs_offset;
        }

        /* Step over the linear area once, before the first fragment of the
         * current skb is looked at.
         */
        if (st->frag_idx == 0 && !st->frag_data)
                st->stepped_offset += skb_headlen(st->cur_skb);

        /* Walk the page fragments, resuming at the one reached last time */
        while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
                frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
                block_limit = skb_frag_size(frag) + st->stepped_offset;

                if (abs_offset < block_limit) {
                        /* Map the page unless an earlier call already did */
                        if (!st->frag_data)
                                st->frag_data = kmap_atomic(skb_frag_page(frag));

                        *data = (u8 *) st->frag_data + skb_frag_off(frag) +
                                (abs_offset - st->stepped_offset);

                        return block_limit - abs_offset;
                }

                /* This fragment is fully consumed: unmap it and move on */
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }

                st->frag_idx++;
                st->stepped_offset += skb_frag_size(frag);
        }

        if (st->frag_data) {
                kunmap_atomic(st->frag_data);
                st->frag_data = NULL;
        }

        /* Current skb exhausted: continue with the frag_list of the root
         * skb, or with the next skb when the current one has a successor.
         */
        if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
                st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
                st->frag_idx = 0;
                goto next_skb;
        } else if (st->cur_skb->next) {
                st->cur_skb = st->cur_skb->next;
                st->frag_idx = 0;
                goto next_skb;
        }

        return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Unmaps the fragment page recorded in @st, if there is one. Must be
 * called if skb_seq_read() was not called until it returned 0.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
        /* No fragment page mapped, nothing to undo. */
        if (!st->frag_data)
                return;

        kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);

/* Access the cb area of a textsearch state as a struct skb_seq_state. */
#define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))

/* get_next_block callback installed by skb_find_text(): hands out the next
 * block of skb data through skb_seq_read(), using the read state kept in
 * the cb area of @state. @conf is not used.
 */
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
                                          struct ts_config *conf,
                                          struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        return skb_seq_read(offset, text, seq);
}

/* finish callback installed by skb_find_text(): aborts the sequential read
 * whose state is kept in the cb area of @state. @conf is not used.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        skb_abort_seq_read(seq);
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 *
 * Finds a pattern in the skb data according to the specified
 * textsearch configuration. Use textsearch_next() to retrieve
 * subsequent occurrences of the pattern. Returns the offset
 * to the first occurrence or UINT_MAX if no match was found.
 *
 * @to is treated as an upper boundary for the whole pattern: a match
 * that starts before @to but would extend past it is not reported.
 *
 * The get_next_block and finish callbacks of @config are overwritten.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config)
{
        unsigned int patlen = config->ops->get_pattern_len(config);
        unsigned int span = to - from;
        struct ts_state state;
        unsigned int ret;

        config->get_next_block = skb_ts_get_next_block;
        config->finish = skb_ts_finish;

        skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));

        ret = textsearch_find(config, &state);
        if (ret == UINT_MAX)
                return UINT_MAX;

        /* The blocks handed to the search algorithm are not clipped at @to,
         * so a reported match may reach beyond it. Accept the match only if
         * ret + patlen <= to - from, written so that the sum cannot wrap.
         */
        if (patlen > span || ret > span - patlen)
                return UINT_MAX;

        return ret;
}
EXPORT_SYMBOL(skb_find_text);

/* Append @size bytes at @offset in @page to the paged data of @skb.
 *
 * If the data can be coalesced with the last fragment, that fragment is
 * simply grown. Otherwise a new fragment slot is filled and a reference on
 * @page is taken. Only the fragment array is touched here; the length
 * fields of @skb are not updated.
 *
 * Returns 0 on success, -EMSGSIZE when all MAX_SKB_FRAGS slots are in use.
 */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size)
{
        const int nr = skb_shinfo(skb)->nr_frags;

        if (skb_can_coalesce(skb, nr, page, offset)) {
                skb_frag_size_add(&skb_shinfo(skb)->frags[nr - 1], size);
                return 0;
        }

        if (nr >= MAX_SKB_FRAGS)
                return -EMSGSIZE;

        get_page(page);
        skb_fill_page_desc(skb, nr, page, offset, size);

        return 0;
}
EXPORT_SYMBOL_GPL(skb_append_pagefrags);

/**
 *        skb_pull_rcsum - pull skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pulled
 *
 *        This function performs an skb_pull on the packet and updates
 *        the CHECKSUM_COMPLETE checksum.  It should be used on
 *        receive path processing instead of skb_pull unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        @len must not exceed the length of @skb (BUG otherwise).
 *
 *        Return: the new start of the data of @skb.
 */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
        /* Start of the bytes that are about to be pulled. */
        unsigned char *pulled = skb->data;

        BUG_ON(len > skb->len);

        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, pulled, len);

        return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);

/* Build a fragment descriptor that covers the linear data of @frag_skb:
 * the head page backing frag_skb->head, the offset of frag_skb->data inside
 * that page and the linear length. No page reference is taken here.
 */
static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
{
        struct page *head_page = virt_to_head_page(frag_skb->head);
        unsigned char *page_base = page_address(head_page);
        skb_frag_t desc;

        __skb_frag_set_page(&desc, head_page);
        skb_frag_off_set(&desc, frag_skb->data - page_base);
        skb_frag_size_set(&desc, skb_headlen(frag_skb));

        return desc;
}

/* Turn an skb carrying a frag_list into a chain of skbs linked through
 * ->next, with @skb itself as the first element.
 *
 * @skb: buffer whose frag_list members become separate segments
 * @features: output path features, used to decide on linearization
 * @offset: added to the negated network offset when the headers are pushed
 *          back and when the leading header bytes of @skb are copied into
 *          each member
 *
 * Every member gets the skb header state and the leading header bytes of
 * @skb; the length and truesize of the members are subtracted from @skb.
 *
 * Returns @skb, with a reference taken through skb_get(), on success. On
 * any failure the skbs chained behind @skb are freed and ERR_PTR(-ENOMEM)
 * is returned.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
                                 netdev_features_t features,
                                 unsigned int offset)
{
        struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
        unsigned int tnl_hlen = skb_tnl_header_len(skb);
        unsigned int delta_truesize = 0;
        unsigned int delta_len = 0;
        struct sk_buff *tail = NULL;
        struct sk_buff *nskb, *tmp;
        int err;

        skb_push(skb, -skb_network_offset(skb) + offset);

        /* Ensure the head is writeable before touching the shared info */
        err = skb_unclone(skb, GFP_ATOMIC);
        if (err)
                goto err_linearize;

        skb_shinfo(skb)->frag_list = NULL;

        while (list_skb) {
                nskb = list_skb;
                list_skb = list_skb->next;

                err = 0;
                delta_truesize += nskb->truesize;
                /* A shared member is replaced by a clone with its own head
                 * before it gets modified below.
                 */
                if (skb_shared(nskb)) {
                        tmp = skb_clone(nskb, GFP_ATOMIC);
                        if (tmp) {
                                consume_skb(nskb);
                                nskb = tmp;
                                err = skb_unclone(nskb, GFP_ATOMIC);
                        } else {
                                err = -ENOMEM;
                        }
                }

                /* Chain the member behind the previous one (or behind skb) */
                if (!tail)
                        skb->next = nskb;
                else
                        tail->next = nskb;

                if (unlikely(err)) {
                        /* Re-attach the unprocessed remainder so that the
                         * error path frees the complete chain.
                         */
                        nskb->next = list_skb;
                        goto err_linearize;
                }

                tail = nskb;

                delta_len += nskb->len;

                skb_push(nskb, -skb_network_offset(nskb) + offset);

                skb_release_head_state(nskb);
                 __copy_skb_header(nskb, skb);

                /* Header offsets were copied from skb: shift them by the
                 * difference in headroom, then copy the header bytes.
                 */
                skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
                skb_copy_from_linear_data_offset(skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 offset + tnl_hlen);

                if (skb_needs_linearize(nskb, features) &&
                    __skb_linearize(nskb))
                        goto err_linearize;
        }

        /* The members no longer count towards the head skb */
        skb->truesize = skb->truesize - delta_truesize;
        skb->data_len = skb->data_len - delta_len;
        skb->len = skb->len - delta_len;

        skb_gso_reset(skb);

        /* ->prev of the head records the last segment of the chain */
        skb->prev = tail;

        if (skb_needs_linearize(skb, features) &&
            __skb_linearize(skb))
                goto err_linearize;

        skb_get(skb);

        return skb;

err_linearize:
        kfree_skb_list(skb->next);
        skb->next = NULL;
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);

int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
{
        if (unlikely(p->len + skb->len >= 65536))
                return -E2BIG;

        if (NAPI_GRO_CB(p)->last == p)
                skb_shinfo(p)->frag_list = skb;
        else
                NAPI_GRO_CB(p)->last->next = skb;

        skb_pull(skb, skb_gro_offset(skb));

        NAPI_GRO_CB(p)->last = skb;
        NAPI_GRO_CB(p)->count++;
        p->data_len += skb->len;
        p->truesize += skb->truesize;
        p->len += skb->len;

        skb_shinfo(p)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;

        NAPI_GRO_CB(skb)->same_flow = 1;

        return 0;
}

/**
 *        skb_segment - Perform protocol segmentation on skb.
 *        @head_skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *
 *        This function performs segmentation on the given skb.  It returns
 *        a pointer to the first in a list of new skbs for the segments.
 *        In case of error it returns ERR_PTR(err).
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
                            netdev_features_t features)
{
        struct sk_buff *segs = NULL;
        struct sk_buff *tail = NULL;
        struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
        unsigned int mss = skb_shinfo(head_skb)->gso_size;
        unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
        unsigned int offset = doffset;
        unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
        unsigned int partial_segs = 0;
        unsigned int headroom;
        unsigned int len = head_skb->len;
        struct sk_buff *frag_skb;
        skb_frag_t *frag;
        __be16 proto;
        bool csum, sg;
        int err = -ENOMEM;
        int i = 0;
        int nfrags, pos;

        if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
            mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
                struct sk_buff *check_skb;

                for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
                        if (skb_headlen(check_skb) && !check_skb->head_frag) {
                                /* gso_size is untrusted, and we have a frag_list with
                                 * a linear non head_frag item.
                                 *
                                 * If head_skb's headlen does not fit requested gso_size,
                                 * it means that the frag_list members do NOT terminate
                                 * on exact gso_size boundaries. Hence we cannot perform
                                 * skb_frag_t page sharing. Therefore we must fallback to
                                 * copying the frag_list skbs; we do so by disabling SG.
                                 */
                                features &= ~NETIF_F_SG;
                                break;
                        }
                }
        }

        __skb_push(head_skb, doffset);
        proto = skb_network_protocol(head_skb, NULL);
        if (unlikely(!proto))
                return ERR_PTR(-EINVAL);

        sg = !!(features & NETIF_F_SG);
        csum = !!can_checksum_protocol(features, proto);

        if (sg && csum && (mss != GSO_BY_FRAGS))  {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
                        unsigned int frag_len;

                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;

                        /* If we get here then all the required
                         * GSO features except frag_list are supported.
                         * Try to split the SKB to multiple GSO SKBs
                         * with no frag_list.
                         * Currently we can do that only when the buffers don't
                         * have a linear part and all the buffers except
                         * the last are of the same length.
                         */
                        frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
                                if (frag_len != iter->len && iter->next)
                                        goto normal;
                                if (skb_headlen(iter) && !iter->head_frag)
                                        goto normal;

                                len -= iter->len;
                        }

                        if (len != frag_len)
                                goto normal;
                }

                /* GSO partial only requires that we trim off any excess that
                 * doesn't fit into an MSS sized block, so take care of that
                 * now.
                 * Cap len to not accidentally hit GSO_BY_FRAGS.
                 */
                partial_segs = min(len, GSO_BY_FRAGS - 1U) / mss;
                if (partial_segs > 1)
                        mss *= partial_segs;
                else
                        partial_segs = 0;
        }

normal:
        headroom = skb_headroom(head_skb);
        pos = skb_headlen(head_skb);

        if (skb_orphan_frags(head_skb, GFP_ATOMIC))
                return ERR_PTR(-ENOMEM);

        nfrags = skb_shinfo(head_skb)->nr_frags;
        frag = skb_shinfo(head_skb)->frags;
        frag_skb = head_skb;

        do {
                struct sk_buff *nskb;
                skb_frag_t *nskb_frag;
                int hsize;
                int size;

                if (unlikely(mss == GSO_BY_FRAGS)) {
                        len = list_skb->len;
                } else {
                        len = head_skb->len - offset;
                        if (len > mss)
                                len = mss;
                }

                hsize = skb_headlen(head_skb) - offset;
                if (hsize < 0)
                        hsize = 0;
                if (hsize > len || !sg)
                        hsize = len;

                if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
                    (skb_headlen(list_skb) == len || sg)) {
                        BUG_ON(skb_headlen(list_skb) > len);

                        nskb = skb_clone(list_skb, GFP_ATOMIC);
                        if (unlikely(!nskb))
                                goto err;

                        i = 0;
                        nfrags = skb_shinfo(list_skb)->nr_frags;
                        frag = skb_shinfo(list_skb)->frags;
                        frag_skb = list_skb;
                        pos += skb_headlen(list_skb);

                        while (pos < offset + len) {
                                BUG_ON(i >= nfrags);

                                size = skb_frag_size(frag);
                                if (pos + size > offset + len)
                                        break;

                                i++;
                                pos += size;
                                frag++;
                        }

                        list_skb = list_skb->next;

                        if (unlikely(pskb_trim(nskb, len))) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        hsize = skb_end_offset(nskb);
                        if (skb_cow_head(nskb, doffset + headroom)) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        nskb->truesize += skb_end_offset(nskb) - hsize;
                        skb_release_head_state(nskb);
                        __skb_push(nskb, doffset);
                } else {
                        nskb = __alloc_skb(hsize + doffset + headroom,
                                           GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                                           NUMA_NO_NODE);

                        if (unlikely(!nskb))
                                goto err;

                        skb_reserve(nskb, headroom);
                        __skb_put(nskb, doffset);
                }

                if (segs)
                        tail->next = nskb;
                else
                        segs = nskb;
                tail = nskb;

                __copy_skb_header(nskb, head_skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
                skb_reset_mac_len(nskb);

                skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 doffset + tnl_hlen);

                if (nskb->len == len + doffset)
                        goto perform_csum_check;

                if (!sg) {
                        if (!csum) {
                                if (!nskb->remcsum_offload)
                                        nskb->ip_summed = CHECKSUM_NONE;
                                SKB_GSO_CB(nskb)->csum =
                                        skb_copy_and_csum_bits(head_skb, offset,
                                                               skb_put(nskb,
                                                                       len),
                                                               len);
                                SKB_GSO_CB(nskb)->csum_start =
                                        skb_headroom(nskb) + doffset;
                        } else {
                                if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
                                        goto err;
                        }
                        continue;
                }

                nskb_frag = skb_shinfo(nskb)->frags;

                skb_copy_from_linear_data_offset(head_skb, offset,
                                                 skb_put(nskb, hsize), hsize);

                skb_shinfo(nskb)->tx_flags |= (skb_shinfo(head_skb)->tx_flags |
                                               skb_shinfo(frag_skb)->tx_flags) &
                                              SKBTX_SHARED_FRAG;

                if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
                        goto err;

                while (pos < offset + len) {
                        if (i >= nfrags) {
                                if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
                                    skb_zerocopy_clone(nskb, list_skb,
                                                       GFP_ATOMIC))
                                        goto err;

                                i = 0;
                                nfrags = skb_shinfo(list_skb)->nr_frags;
                                frag = skb_shinfo(list_skb)->frags;
                                frag_skb = list_skb;

                                skb_shinfo(nskb)->tx_flags |= skb_shinfo(frag_skb)->tx_flags & SKBTX_SHARED_FRAG;

                                if (!skb_headlen(list_skb)) {
                                        BUG_ON(!nfrags);
                                } else {
                                        BUG_ON(!list_skb->head_frag);

                                        /* to make room for head_frag. */
                                        i--;
                                        frag--;
                                }

                                list_skb = list_skb->next;
                        }

                        if (unlikely(skb_shinfo(nskb)->nr_frags >=
                                     MAX_SKB_FRAGS)) {
                                net_warn_ratelimited(
                                        "skb_segment: too many frags: %u %u\n",
                                        pos, mss);
                                err = -EINVAL;
                                goto err;
                        }

                        *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
                        __skb_frag_ref(nskb_frag);
                        size = skb_frag_size(nskb_frag);

                        if (pos < offset) {
                                skb_frag_off_add(nskb_frag, offset - pos);
                                skb_frag_size_sub(nskb_frag, offset - pos);
                        }

                        skb_shinfo(nskb)->nr_frags++;

                        if (pos + size <= offset + len) {
                                i++;
                                frag++;
                                pos += size;
                        } else {
                                skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
                                goto skip_fraglist;
                        }

                        nskb_frag++;
                }

skip_fraglist:
                nskb->data_len = len - hsize;
                nskb->len += nskb->data_len;
                nskb->truesize += nskb->data_len;

perform_csum_check:
                if (!csum) {
                        if (skb_has_shared_frag(nskb) &&
                            __skb_linearize(nskb))
                                goto err;

                        if (!nskb->remcsum_offload)
                                nskb->ip_summed = CHECKSUM_NONE;
                        SKB_GSO_CB(nskb)->csum =
                                skb_checksum(nskb, doffset,
                                             nskb->len - doffset, 0);
                        SKB_GSO_CB(nskb)->csum_start =
                                skb_headroom(nskb) + doffset;
                }
        } while ((offset += len) < head_skb->len);

        /* Some callers want to get the end of the list.
         * Put it in segs->prev to avoid walking the list.
         * (see validate_xmit_skb_list() for example)
         */
        segs->prev = tail;

        if (partial_segs) {
                struct sk_buff *iter;
                int type = skb_shinfo(head_skb)->gso_type;
                unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

                /* Update type to add partial and then remove dodgy if set */
                type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
                type &= ~SKB_GSO_DODGY;

                /* Update GSO info and prepare to start updating headers on
                 * our way back down the stack of protocols.
                 */
                for (iter = segs; iter; iter = iter->next) {
                        skb_shinfo(iter)->gso_size = gso_size;
                        skb_shinfo(iter)->gso_segs = partial_segs;
                        skb_shinfo(iter)->gso_type = type;
                        SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
                }

                if (tail->len - doffset <= gso_size)
                        skb_shinfo(tail)->gso_size = 0;
                else if (tail != segs)
                        skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
        }

        /* Following permits correct backpressure, for protocols
         * using skb_set_owner_w().
         * Idea is to transfer ownership from head_skb to last segment.
         */
        if (head_skb->destructor == sock_wfree) {
                swap(tail->truesize, head_skb->truesize);
                swap(tail->destructor, head_skb->destructor);
                swap(tail->sk, head_skb->sk);
        }
        return segs;

err:
        kfree_skb_list(segs);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);

/*
 * skb_gro_receive - append the GRO payload of @skb to the aggregate @p
 * @p: skb that accumulates the merged data (NAPI_GRO_CB(p)->last tracks
 *     the skb currently at the end of the aggregate)
 * @skb: skb whose payload, starting at its GRO offset, is merged
 *
 * Three strategies are tried in order:
 *  1. headlen <= offset: nothing useful is left in the linear area, so the
 *     page frags of @skb are moved into the frag array of the last skb.
 *  2. skb->head_frag is set: the rest of the linear area is described as
 *     one extra frag, followed by a copy of @skb's own frags.
 *  3. otherwise, or when 1/2 would exceed MAX_SKB_FRAGS: @skb itself is
 *     linked behind the aggregate (frag_list of @p or ->next of the last).
 *
 * Returns 0 on success, or -E2BIG when the merged length would reach
 * 65536 bytes or @skb is marked for flush.
 */
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
        struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
        unsigned int offset = skb_gro_offset(skb);
        unsigned int headlen = skb_headlen(skb);
        unsigned int len = skb_gro_len(skb);
        unsigned int delta_truesize;
        struct sk_buff *lp;

        /* Refuse to grow past the size limit or to merge a flush-marked skb. */
        if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
                return -E2BIG;

        /* Frags are appended to the last skb of the aggregate, not to p itself. */
        lp = NAPI_GRO_CB(p)->last;
        pinfo = skb_shinfo(lp);

        if (headlen <= offset) {
                /* Case 1: payload lives only in page frags; take them over. */
                skb_frag_t *frag;
                skb_frag_t *frag2;
                int i = skbinfo->nr_frags;
                int nr_frags = pinfo->nr_frags + i;

                if (nr_frags > MAX_SKB_FRAGS)
                        goto merge;

                /* offset now counts bytes to skip inside the first frag. */
                offset -= headlen;
                pinfo->nr_frags = nr_frags;
                skbinfo->nr_frags = 0;

                /*
                 * Copy the frag descriptors back to front; when the loop
                 * ends, frag points at the first descriptor that was copied.
                 * NOTE(review): the do/while form assumes skbinfo->nr_frags
                 * was at least 1 on entry - confirm callers guarantee this.
                 */
                frag = pinfo->frags + nr_frags;
                frag2 = skbinfo->frags + i;
                do {
                        *--frag = *--frag2;
                } while (--i);

                /* Trim the bytes in front of the GRO offset from the first copied frag. */
                skb_frag_off_add(frag, offset);
                skb_frag_size_sub(frag, offset);

                /* all fragments truesize : remove (head size + sk_buff) */
                delta_truesize = skb->truesize -
                                 SKB_TRUESIZE(skb_end_offset(skb));

                /* @skb no longer accounts for any paged data. */
                skb->truesize -= skb->data_len;
                skb->len -= skb->data_len;
                skb->data_len = 0;

                NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
                goto done;
        } else if (skb->head_frag) {
                /* Case 2: describe the remaining linear bytes as a page frag. */
                int nr_frags = pinfo->nr_frags;
                skb_frag_t *frag = pinfo->frags + nr_frags;
                struct page *page = virt_to_head_page(skb->head);
                unsigned int first_size = headlen - offset;
                unsigned int first_offset;

                /* One slot for the head plus one per existing frag of @skb. */
                if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
                        goto merge;

                /* Offset of the payload start within the head page. */
                first_offset = skb->data -
                               (unsigned char *)page_address(page) +
                               offset;

                pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

                __skb_frag_set_page(frag, page);
                skb_frag_off_set(frag, first_offset);
                skb_frag_size_set(frag, first_size);

                memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
                /* We dont need to clear skbinfo->nr_frags here */

                delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
                NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
                goto done;
        }

merge:
        /* Case 3: keep @skb whole and chain it behind the aggregate. */
        delta_truesize = skb->truesize;
        if (offset > headlen) {
                /* Part of the offset reaches into frags[0]; trim it there. */
                unsigned int eat = offset - headlen;

                skb_frag_off_add(&skbinfo->frags[0], eat);
                skb_frag_size_sub(&skbinfo->frags[0], eat);
                skb->data_len -= eat;
                skb->len -= eat;
                offset = headlen;
        }

        /* Drop the remaining header bytes from the linear area. */
        __skb_pull(skb, offset);

        /* First chained skb goes on p's frag_list, later ones follow ->next. */
        if (NAPI_GRO_CB(p)->last == p)
                skb_shinfo(p)->frag_list = skb;
        else
                NAPI_GRO_CB(p)->last->next = skb;
        NAPI_GRO_CB(p)->last = skb;
        __skb_header_release(skb);
        /* Make the "lp != p" accounting below a no-op for the chained case. */
        lp = p;

done:
        /* Account the merged bytes on the aggregate head ... */
        NAPI_GRO_CB(p)->count++;
        p->data_len += len;
        p->truesize += delta_truesize;
        p->len += len;
        skb_shinfo(p)->tx_flags |= skbinfo->tx_flags & SKBTX_SHARED_FRAG;
        /* ... and on the skb that actually received the frags, if different. */
        if (lp != p) {
                lp->data_len += len;
                lp->truesize += delta_truesize;
                lp->len += len;
                skb_shinfo(lp)->tx_flags |= skbinfo->tx_flags & SKBTX_SHARED_FRAG;
        }
        NAPI_GRO_CB(skb)->same_flow = 1;
        return 0;
}

#ifdef CONFIG_SKB_EXTENSIONS
/* Extension sizes are expressed in chunks of SKB_EXT_ALIGN_VALUE bytes. */
#define SKB_EXT_ALIGN_VALUE        8
/* Size of @x rounded up to the chunk size, expressed as a number of chunks. */
#define SKB_EXT_CHUNKSIZEOF(x)        (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/*
 * Size, in chunks, of each extension type.  Only the types whose config
 * option is enabled get an entry.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
        [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
};

/*
 * Total size, in SKB_EXT_ALIGN_VALUE-byte chunks, of struct skb_ext plus
 * one instance of every extension type enabled in this build.
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
        return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
#endif
#ifdef CONFIG_XFRM
                skb_ext_type_len[SKB_EXT_SEC_PATH] +
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
                skb_ext_type_len[TC_SKB_EXT] +
#endif
#if IS_ENABLED(CONFIG_MPTCP)
                skb_ext_type_len[SKB_EXT_MPTCP] +
#endif
                /* terminates the chain of '+' whatever the configuration */
                0;
}

/*
 * Create the slab cache for skb extension areas.  Each object is large
 * enough to hold the header and every enabled extension type at once.
 */
static void skb_extensions_init(void)
{
        /*
         * Compile-time limits on the total length (in chunks) and on the
         * number of extension types.
         * NOTE(review): presumably both values are stored in 8-bit
         * fields elsewhere - confirm against struct skb_ext.
         */
        BUILD_BUG_ON(skb_ext_total_length() > 255);
        BUILD_BUG_ON(SKB_EXT_NUM >= 8);

        skbuff_ext_cache =
                kmem_cache_create("skbuff_ext_cache",
                                  skb_ext_total_length() * SKB_EXT_ALIGN_VALUE,
                                  0,
                                  SLAB_PANIC | SLAB_HWCACHE_ALIGN,
                                  NULL);
}
#else
/* CONFIG_SKB_EXTENSIONS is disabled: there is no extension cache to create. */
static void skb_extensions_init(void) {}
#endif

/*
 * Boot-time setup of the slab caches backing struct sk_buff and
 * struct sk_buff_fclones, followed by the skb extension cache.
 */
void __init skb_init(void)
{
        /* Only the cb[] member is declared as the usercopy window. */
        skbuff_head_cache =
                kmem_cache_create_usercopy("skbuff_head_cache",
                                           sizeof(struct sk_buff), 0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                           offsetof(struct sk_buff, cb),
                                           sizeof_field(struct sk_buff, cb),
                                           NULL);

        skbuff_fclone_cache =
                kmem_cache_create("skbuff_fclone_cache",
                                  sizeof(struct sk_buff_fclones), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL);

        skb_extensions_init();
}

/*
 * Worker for skb_to_sgvec() and skb_to_sgvec_nomark().
 *
 * Describes @len bytes of @skb, starting @offset bytes into its data, as
 * scatterlist entries written from @sg onwards.  The linear area, the page
 * frags and the frag list are visited in that order; frag-list members are
 * handled by recursion, @recursion_level being the current depth.
 *
 * Returns the number of entries written, or -EMSGSIZE when the recursion
 * depth reaches 24 or the table's end marker is hit before all data has
 * been described.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
               unsigned int recursion_level)
{
        struct sk_buff *child;
        int seg_start, seg_end;
        int nents = 0;
        int chunk;
        int i;

        if (unlikely(recursion_level >= 24))
                return -EMSGSIZE;

        /* Linear area: covers [0, headlen) of the skb data. */
        seg_start = skb_headlen(skb);
        chunk = seg_start - offset;
        if (chunk > 0) {
                if (chunk > len)
                        chunk = len;
                sg_set_buf(sg, skb->data + offset, chunk);
                nents++;
                len -= chunk;
                if (len == 0)
                        return nents;
                offset += chunk;
        }

        /* Page frags: each one covers [seg_start, seg_end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(seg_start > offset + len);

                seg_end = seg_start + skb_frag_size(frag);
                chunk = seg_end - offset;
                if (chunk > 0) {
                        /* The previous entry was the last slot of the table. */
                        if (unlikely(nents && sg_is_last(&sg[nents - 1])))
                                return -EMSGSIZE;

                        if (chunk > len)
                                chunk = len;
                        sg_set_page(&sg[nents], skb_frag_page(frag), chunk,
                                    skb_frag_off(frag) + offset - seg_start);
                        nents++;
                        len -= chunk;
                        if (len == 0)
                                return nents;
                        offset += chunk;
                }
                seg_start = seg_end;
        }

        /* Frag list: recurse into every chained skb that overlaps the range. */
        skb_walk_frags(skb, child) {
                int ret;

                WARN_ON(seg_start > offset + len);

                seg_end = seg_start + child->len;
                chunk = seg_end - offset;
                if (chunk > 0) {
                        if (unlikely(nents && sg_is_last(&sg[nents - 1])))
                                return -EMSGSIZE;

                        if (chunk > len)
                                chunk = len;
                        ret = __skb_to_sgvec(child, sg + nents,
                                             offset - seg_start, chunk,
                                             recursion_level + 1);
                        if (unlikely(ret < 0))
                                return ret;
                        nents += ret;
                        len -= chunk;
                        if (len == 0)
                                return nents;
                        offset += chunk;
                }
                seg_start = seg_end;
        }

        /* The caller asked for more bytes than the skb holds. */
        BUG_ON(len);
        return nents;
}

/**
 *        skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 *        @skb: Socket buffer containing the buffers to be mapped
 *        @sg: The scatter-gather list to map into
 *        @offset: The offset into the buffer's contents to start mapping
 *        @len: Length of buffer space to be mapped
 *
 *        Map @len bytes of @skb, starting at @offset, into @sg and mark the
 *        last entry that was written as the end of the list.
 *
 *        Returns the number of scatterlist items used, or -EMSGSIZE if the
 *        contents could not fit.  No end mark is placed when the result is
 *        zero or negative.
 */
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
        int used = __skb_to_sgvec(skb, sg, offset, len, 0);

        /* Terminate the list only when at least one entry was filled in. */
        if (used > 0)
                sg_mark_end(&sg[used - 1]);

        return used;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);

/* Unlike skb_to_sgvec(), skb_to_sgvec_nomark() only maps the skb into the
 * given scatterlist; it does not mark the entry holding the last skb data
 * as the end of the list.  The caller can therefore keep appending entries
 * after the call without first calling sg_unmark_end().
 *
 * Typical use of skb_to_sgvec_nomark():
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * which is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When several payloads are mapped conditionally, skb_to_sgvec_nomark()
 * is the preferable interface.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                        int offset, int len)
{
        /* A top-level call starts at recursion depth zero. */
        const unsigned int top_level = 0;

        return __skb_to_sgvec(skb, sg, offset, len, top_level);
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);



/**
 *        skb_cow_data - Check that a socket buffer's data buffers are writable
 *        @skb: The socket buffer to check.
 *        @tailbits: Amount of trailing space to be added
 *        @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 *        Make sure that the data buffers attached to a socket buffer are
 *        writable. If they are not, private copies are made of the data buffers
 *        and the socket buffer is set to use these instead.
 *
 *        If @tailbits is given, make sure that there is space to write @tailbits
 *        bytes of data beyond current end of socket buffer.  @trailer will be
 *        set to point to the skb in which this space begins.
 *
 *        The number of scatterlist elements required to completely map the
 *        COW'd and extended socket buffer will be returned, or -ENOMEM if a
 *        required reallocation or copy fails.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
        int copyflag;
        int elt;
        struct sk_buff *skb1, **skb_p;

        /* If skb is cloned or its head is paged, reallocate
         * head pulling out all the pages (pages are considered not writable
         * at the moment even if they are anonymous).
         */
        if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
            !__pskb_pull_tail(skb, __skb_pagelen(skb)))
                return -ENOMEM;

        /* Easy case. Most of packets will go this way. */
        if (!skb_has_frag_list(skb)) {
                /* Not enough tailroom for the trailer.  This should not
                 * happen when the stack is tuned to generate good frames.
                 * On a miss, reallocate the head and reserve 128 bytes more
                 * than strictly needed.
                 */

                if (skb_tailroom(skb) < tailbits &&
                    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
                        return -ENOMEM;

                /* Single linear skb: it is its own trailer, one sg element. */
                *trailer = skb;
                return 1;
        }

        /* Hard case: walk the frag list and privatize members as needed. */

        /* elt starts at 1 to account for the head skb itself. */
        elt = 1;
        /* skb_p always points at the link that references the current member. */
        skb_p = &skb_shinfo(skb)->frag_list;
        copyflag = 0;

        while ((skb1 = *skb_p) != NULL) {
                int ntail = 0;

                /* The fragment is partially pulled by someone,
                 * this can happen on input. Copy it and everything
                 * after it. */

                if (skb_shared(skb1))
                        copyflag = 1;

                /* If the skb is the last, worry about trailer. */

                if (skb1->next == NULL && tailbits) {
                        if (skb_shinfo(skb1)->nr_frags ||
                            skb_has_frag_list(skb1) ||
                            skb_tailroom(skb1) < tailbits)
                                ntail = tailbits + 128;
                }

                if (copyflag ||
                    skb_cloned(skb1) ||
                    ntail ||
                    skb_shinfo(skb1)->nr_frags ||
                    skb_has_frag_list(skb1)) {
                        struct sk_buff *skb2;

                        /* This member must be replaced by a private copy,
                         * with extra tailroom when it has to hold the trailer.
                         */
                        if (ntail == 0)
                                skb2 = skb_copy(skb1, GFP_ATOMIC);
                        else
                                skb2 = skb_copy_expand(skb1,
                                                       skb_headroom(skb1),
                                                       ntail,
                                                       GFP_ATOMIC);
                        if (unlikely(skb2 == NULL))
                                return -ENOMEM;

                        /* Keep the copy charged to the same socket, if any. */
                        if (skb1->sk)
                                skb_set_owner_w(skb2, skb1->sk);

                        /* Splice the copy into the list in place of the
                         * original, then release the original.
                         */

                        skb2->next = skb1->next;
                        *skb_p = skb2;
                        kfree_skb(skb1);
                        skb1 = skb2;
                }
                elt++;
                /* After the loop, *trailer is the last member of the list. */
                *trailer = skb1;
                skb_p = &skb1->next;
        }

        return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);

/* skb destructor: give the skb's truesize back to its socket's sk_rmem_alloc. */
static void sock_rmem_free(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
}

/* Tag @skb as belonging to a socket error queue. */
static void skb_set_err_queue(struct sk_buff *skb)
{
        /* Compile-time guard: the marker value must be non-zero. */
        BUILD_BUG_ON(PACKET_OUTGOING == 0);

        /* skbs received on local sockets never carry PACKET_OUTGOING in
         * pkt_type, so the value can safely be (mis)used as the mark for
         * skbs sitting on the error queue.
         */
        skb->pkt_type = PACKET_OUTGOING;
}

/*
 * Queue @skb on the error queue of @sk.
 *
 * Error packets are accounted in sk_rmem_alloc only; they are not memory
 * charged (sk_forward_alloc is left alone).
 *
 * Returns 0 on success, or -ENOMEM when adding the skb's truesize would
 * reach the receive buffer limit; in that case @skb is left untouched.
 */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
        unsigned int charged = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
        unsigned int limit = (unsigned int)READ_ONCE(sk->sk_rcvbuf);

        if (charged >= limit)
                return -ENOMEM;

        /* Detach from the previous owner, then attach to @sk. */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rmem_free;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        skb_set_err_queue(skb);

        /* before exiting rcu section, make sure dst is refcounted */
        skb_dst_force(skb);

        skb_queue_tail(&sk->sk_error_queue, skb);

        if (sock_flag(sk, SOCK_DEAD))
                return 0;

        sk->sk_error_report(sk);
        return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);

static bool is_icmp_err_skb(const struct sk_buff *skb)
{
        return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
                       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
}

/*
 * Remove and return the first skb of @sk's error queue, or NULL when the
 * queue is empty.
 *
 * sk->sk_err is kept in step with the queue: it takes the errno of the
 * next queued skb when that one is an ICMP error, and is cleared when an
 * ICMP error was dequeued and no ICMP error follows it.  If more skbs
 * remain queued, the socket's error report callback is invoked.
 */
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
        struct sk_buff_head *q = &sk->sk_error_queue;
        struct sk_buff *skb_next = NULL;
        struct sk_buff *skb;
        bool icmp_next = false;
        unsigned long flags;

        spin_lock_irqsave(&q->lock, flags);
        skb = __skb_dequeue(q);
        if (skb) {
                skb_next = skb_peek(q);
                if (skb_next) {
                        icmp_next = is_icmp_err_skb(skb_next);
                        if (icmp_next)
                                sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
                }
        }
        spin_unlock_irqrestore(&q->lock, flags);

        if (!icmp_next && is_icmp_err_skb(skb))
                sk->sk_err = 0;

        if (skb_next)
                sk->sk_error_report(sk);

        return skb;
}
EXPORT_SYMBOL(sock_dequeue_err_skb);

/**
 * skb_clone_sk - create clone of skb, and take reference to socket
 * @skb: the skb to clone
 *
 * The returned clone holds a reference on sk_refcnt of the socket that
 * owns @skb.  Such buffers are meant to be handed back through
 * sock_queue_err_skb, or released with kfree_skb.
 *
 * A caller passing the clone to sock_queue_err_skb must wrap that call in
 * sock_hold/sock_put so the socket cannot be released before the buffer
 * has been enqueued on the sk_error_queue.
 *
 * Returns NULL when @skb has no socket, when the socket's refcount is
 * already zero, or when the clone cannot be allocated.
 */
struct sk_buff *skb_clone_sk(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct sk_buff *clone;

        if (!sk)
                return NULL;
        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        clone = skb_clone(skb, GFP_ATOMIC);
        if (clone) {
                clone->sk = sk;
                clone->destructor = sock_efree;
        } else {
                /* Nothing will own the reference taken above; drop it. */
                sock_put(sk);
        }

        return clone;
}
EXPORT_SYMBOL(skb_clone_sk);

/*
 * Fill the extended-error record in @skb's control block for a transmit
 * timestamp of type @tstype and queue @skb on @sk's error queue.  @skb is
 * freed when queueing fails.
 */
static void __skb_complete_tx_timestamp(struct sk_buff *skb,
                                        struct sock *sk,
                                        int tstype,
                                        bool opt_stats)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);

        BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));

        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
        serr->opt_stats = opt_stats;

        /* Without a device, iif keeps the 0 written by the memset above. */
        if (skb->dev)
                serr->header.h4.iif = skb->dev->ifindex;

        if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
                const bool tcp_stream = sk->sk_protocol == IPPROTO_TCP &&
                                        sk->sk_type == SOCK_STREAM;

                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                /* For TCP stream sockets the key is reported relative to sk_tskey. */
                if (tcp_stream)
                        serr->ee.ee_data -= sk->sk_tskey;
        }

        if (sock_queue_err_skb(sk, skb))
                kfree_skb(skb);
}

/*
 * Decide whether a tx timestamp may be delivered to @sk.
 *
 * Always allowed when sysctl_tstamp_allow_data is set or when only the
 * timestamp is reported (@tsonly).  Otherwise the file behind the socket
 * must have CAP_NET_RAW in the initial user namespace.
 */
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
        bool allowed = false;

        if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
                return true;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket && sk->sk_socket->file)
                allowed = file_ns_capable(sk->sk_socket->file, &init_user_ns,
                                          CAP_NET_RAW);
        read_unlock_bh(&sk->sk_callback_lock);

        return allowed;
}

/*
 * Store @hwtstamps in @skb and deliver it to the error queue of the
 * socket that owns it.  @skb is consumed in every case: it is freed here
 * when timestamping is not permitted or the socket's refcount is already
 * zero.
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps)
{
        struct sock *sk = skb->sk;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (skb_may_tx_timestamp(sk, false) &&
            likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                *skb_hwtstamps(skb) = *hwtstamps;
                __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
                sock_put(sk);
                return;
        }

        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);

/*
 * __skb_tstamp_tx - queue a transmit timestamp for @orig_skb on @sk
 * @orig_skb: the skb that was (or is being) transmitted; not consumed
 * @hwtstamps: hardware timestamps to report, or NULL to report the
 *             current software time
 * @sk: socket that receives the timestamp; nothing is done when NULL
 * @tstype: timestamp type stored in the extended error (ee_info)
 *
 * A new skb is built for the error queue: an empty skb (or a TCP opt-stats
 * skb) when SOF_TIMESTAMPING_OPT_TSONLY is set, a clone of @orig_skb
 * otherwise.  The function returns silently when timestamping is not
 * permitted or an allocation fails.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype)
{
        struct sk_buff *skb;
        bool tsonly, opt_stats = false;

        if (!sk)
                return;

        /* A software timestamp is skipped while a hardware one is still in
         * progress, unless the socket asked for both.
         */
        if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
            skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
                return;

        tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
        if (!skb_may_tx_timestamp(sk, tsonly))
                return;

        if (tsonly) {
#ifdef CONFIG_INET
                if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
                    sk->sk_protocol == IPPROTO_TCP &&
                    sk->sk_type == SOCK_STREAM) {
                        skb = tcp_get_timestamping_opt_stats(sk, orig_skb);
                        opt_stats = true;
                } else
#endif
                        skb = alloc_skb(0, GFP_ATOMIC);
        } else {
                skb = skb_clone(orig_skb, GFP_ATOMIC);
                /* Check the clone before handing it to any helper, so this
                 * path does not depend on the helpers tolerating NULL.
                 */
                if (!skb)
                        return;

                if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
                        kfree_skb(skb);
                        return;
                }
        }
        /* Covers allocation failure on the timestamp-only paths. */
        if (!skb)
                return;

        if (tsonly) {
                /* A fresh skb has no timestamp state; carry it over. */
                skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
                                             SKBTX_ANY_TSTAMP;
                skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
        }

        if (hwtstamps)
                *skb_hwtstamps(skb) = *hwtstamps;
        else
                skb->tstamp = ktime_get_real();

        __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);

/*
 * skb_tstamp_tx - queue a SCM_TSTAMP_SND timestamp for @orig_skb
 * @orig_skb: the transmitted skb; its owning socket receives the timestamp
 * @hwtstamps: hardware timestamps, or NULL for a software timestamp
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps)
{
        /* Plain call: ISO C does not allow "return expr;" in a void function. */
        __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
                        SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);

/*
 * Record the wifi ack status @acked in @skb and queue it on the error
 * queue of the socket that owns it.  @skb is freed when the socket's
 * refcount is already zero or when queueing fails.
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        struct sock *sk = skb->sk;
        bool queued = false;

        skb->wifi_acked_valid = 1;
        skb->wifi_acked = acked;

        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                queued = !sock_queue_err_skb(sk, skb);
                sock_put(sk);
        }

        if (!queued)
                kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);

/**
 * skb_partial_csum_set - set up and verify partial csum values for packet
 * @skb: the skb to set
 * @start: the number of bytes after skb->data to start checksumming.
 * @off: the offset from start to place the checksum.
 *
 * Partially-checksummed packets from an untrusted source may carry bogus
 * offsets.  The values are validated first: csum_start must fit in 16 bits
 * and the checksum field must lie inside the linear area.  Only then are
 * skb->csum_start, skb->csum_offset, skb->ip_summed and the transport
 * header set.
 *
 * Returns true on success; on false the packet should be dropped.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
        const u32 headroom = skb_headroom(skb);
        const u32 headlen = skb_headlen(skb);
        const u32 csum_start = headroom + (u32)start;
        const u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);

        if (likely(csum_start <= U16_MAX && csum_end <= headlen)) {
                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum_start = csum_start;
                skb->csum_offset = off;
                skb_set_transport_header(skb, start);
                return true;
        }

        net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
                             start, off, headroom, headlen);
        return false;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);

/*
 * Make sure at least @len bytes of @skb are in the linear area.
 *
 * When a pull is needed, up to @max bytes (capped at skb->len) are pulled
 * at once so that later calls need not pull again.
 *
 * Returns 0 on success, -ENOMEM if the pull fails, or -EPROTO if the skb
 * still has fewer than @len linear bytes afterwards.
 */
static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
                               unsigned int max)
{
        unsigned int headlen = skb_headlen(skb);

        if (headlen >= len)
                return 0;

        max = (max > skb->len) ? skb->len : max;

        if (!__pskb_pull_tail(skb, max - headlen))
                return -ENOMEM;

        /* The linear length changed; read it again. */
        return (skb_headlen(skb) < len) ? -EPROTO : 0;
}

#define MAX_TCP_HDR_LEN (15 * 4)

/*
 * Set up partial checksumming for the TCP or UDP header that starts @off
 * bytes into @skb.
 *
 * Returns a pointer to the transport header's checksum field, or an
 * ERR_PTR: the error from pulling the header into the linear area, or
 * -EPROTO for bad checksum offsets or any other protocol.
 */
static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
                                      typeof(IPPROTO_IP) proto,
                                      unsigned int off)
{
        int err;

        if (proto == IPPROTO_TCP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
                                          off + MAX_TCP_HDR_LEN);
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct tcphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &tcp_hdr(skb)->check;
        }

        if (proto == IPPROTO_UDP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
                                          off + sizeof(struct udphdr));
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct udphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &udp_hdr(skb)->check;
        }

        return ERR_PTR(-EPROTO);
}

/* Upper bound used when pulling headers: meant to cover a tagged ethernet
 * header plus maximally sized IP and TCP or UDP headers.
 */
#define MAX_IP_HDR_LEN 128

/*
 * Set up partial checksumming for an IPv4 packet.
 *
 * IP fragments are rejected with -EPROTO.  With @recalculate set, the
 * transport checksum field is seeded with the complemented pseudo-header
 * sum.  Returns 0 on success or a negative errno.
 */
static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
        unsigned int off;
        __sum16 *csum;
        int err;

        err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN);
        if (err < 0)
                return err;

        if (ip_is_fragment(ip_hdr(skb)))
                return -EPROTO;

        /* The transport header starts right after the IP header. */
        off = ip_hdrlen(skb);

        csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (recalculate)
                *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
                                           ip_hdr(skb)->daddr,
                                           skb->len - off,
                                           ip_hdr(skb)->protocol, 0);

        return 0;
}

/* This value should be large enough to cover a tagged ethernet header plus
 * an IPv6 header, all options, and a maximal TCP or UDP header.
 */
#define MAX_IPV6_HDR_LEN 256

/*
 * Typed pointer to the header located @off bytes past the network header
 * of @skb.  The whole expansion is parenthesized so that postfix operators
 * applied to the macro (e.g. OPT_HDR(...)->field) act on the cast result
 * rather than binding before the cast.
 */
#define OPT_HDR(type, skb, off) \
        ((type *)(skb_network_header(skb) + (off)))

/* Set up the partial-checksum offsets for the transport header of an IPv6
 * skb.
 *
 * Walks the chain of extension headers that follows the fixed IPv6 header to
 * find the upper-layer protocol number and its offset, then hands over to
 * skb_checksum_setup_ip().  A packet carrying a fragment header with a
 * non-zero offset or the more-fragments bit, or whose header chain runs past
 * the advertised payload length, yields -EPROTO.  When @recalculate is true
 * the checksum field is overwritten with the complemented pseudo-header sum.
 * Returns 0 on success or a negative errno.
 */
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
        int err;
        u8 nexthdr;
        unsigned int off;
        unsigned int len;
        bool fragment;
        bool done;
        __sum16 *csum;

        fragment = false;
        done = false;

        /* Offset (from the network header) of the header being examined. */
        off = sizeof(struct ipv6hdr);

        err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
        if (err < 0)
                goto out;

        nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Total packet length according to the IPv6 header. */
        len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
        while (off <= len && !done) {
                switch (nexthdr) {
                case IPPROTO_DSTOPTS:
                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING: {
                        struct ipv6_opt_hdr *hp;

                        /* Pull the generic option header before reading it. */
                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ipv6_opt_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_optlen(hp);
                        break;
                }
                case IPPROTO_AH: {
                        struct ip_auth_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ip_auth_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ip_auth_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_authlen(hp);
                        break;
                }
                case IPPROTO_FRAGMENT: {
                        struct frag_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct frag_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct frag_hdr, skb, off);

                        /* A non-zero offset or the more-fragments bit marks
                         * the packet as a fragment; parsing continues but the
                         * packet is rejected after the loop.
                         */
                        if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
                                fragment = true;

                        nexthdr = hp->nexthdr;
                        off += sizeof(struct frag_hdr);
                        break;
                }
                default:
                        /* Not an extension header handled above: nexthdr is
                         * the upper-layer protocol and off is its offset.
                         */
                        done = true;
                        break;
                }
        }

        err = -EPROTO;

        /* !done means the walk ran past len without reaching a transport
         * header.
         */
        if (!done || fragment)
                goto out;

        csum = skb_checksum_setup_ip(skb, nexthdr, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (recalculate)
                *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                         &ipv6_hdr(skb)->daddr,
                                         skb->len - off, nexthdr, 0);
        err = 0;

out:
        return err;
}

/**
 * skb_checksum_setup - set up partial checksum offset
 * @skb: the skb to set up
 * @recalculate: if true the pseudo-header checksum will be recalculated
 *
 * Dispatches on @skb->protocol to the IPv4 or IPv6 specific helper.
 *
 * Return: 0 on success, the negative errno reported by the helper on
 * failure, or -EPROTO when the protocol is neither IPv4 nor IPv6.
 */
int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
{
        const __be16 proto = skb->protocol;

        if (proto == htons(ETH_P_IP))
                return skb_checksum_setup_ipv4(skb, recalculate);

        if (proto == htons(ETH_P_IPV6))
                return skb_checksum_setup_ipv6(skb, recalculate);

        return -EPROTO;
}
EXPORT_SYMBOL(skb_checksum_setup);

/**
 * skb_checksum_maybe_trim - maybe trims the given skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 *
 * Checks whether the given skb has data beyond the given transport length.
 * If so, returns a cloned skb trimmed to this transport length.
 * Otherwise returns the provided skb. Returns NULL in error cases
 * (e.g. transport_len exceeds skb length or out-of-memory).
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
                                               unsigned int transport_len)
{
        /* Length the skb must have once any trailing bytes are removed. */
        unsigned int wanted = skb_transport_offset(skb) + transport_len;
        struct sk_buff *clone;

        /* Shorter than the advertised length: nothing valid to return. */
        if (skb->len < wanted)
                return NULL;

        /* Exact fit: hand back the caller's own skb. */
        if (skb->len == wanted)
                return skb;

        /* Extra bytes present: trim a clone rather than @skb itself. */
        clone = skb_clone(skb, GFP_ATOMIC);
        if (!clone)
                return NULL;

        if (pskb_trim_rcsum(clone, wanted)) {
                kfree_skb(clone);
                return NULL;
        }

        return clone;
}

/**
 * skb_checksum_trimmed - validate checksum of an skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 * @skb_chkf: checksum function to use
 *
 * Applies the given checksum function skb_chkf to the provided skb.
 * Returns a checked and maybe trimmed skb. Returns NULL on error.
 *
 * If the skb has data beyond the given transport length, then a
 * trimmed & cloned skb is checked and returned.
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb))
{
        unsigned int offset = skb_transport_offset(skb);
        struct sk_buff *candidate;
        __sum16 csum_err;

        candidate = skb_checksum_maybe_trim(skb, transport_len);
        if (!candidate)
                return NULL;

        if (!pskb_may_pull(candidate, offset))
                goto drop;

        /* Run the checker with the first @offset bytes pulled, then push
         * them back so the returned skb starts where it did before.
         */
        skb_pull_rcsum(candidate, offset);
        csum_err = skb_chkf(candidate);
        skb_push_rcsum(candidate, offset);

        if (csum_err)
                goto drop;

        return candidate;

drop:
        /* Only a clone produced by skb_checksum_maybe_trim() is freed here;
         * the caller's own skb is never released by this function.
         */
        if (candidate != skb)
                kfree_skb(candidate);

        return NULL;
}
EXPORT_SYMBOL(skb_checksum_trimmed);

/* Emit a rate-limited warning, tagged with the name of the device attached
 * to @skb, that received packets cannot be forwarded while LRO is enabled.
 */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
        const char *dev_name = skb->dev->name;

        net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
                             dev_name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);

/* Free @skb.  When @head_stolen is true only the head state is released and
 * the sk_buff structure itself is returned to skbuff_head_cache; otherwise
 * the skb goes through __kfree_skb().
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
        if (!head_stolen) {
                __kfree_skb(skb);
                return;
        }

        skb_release_head_state(skb);
        kmem_cache_free(skbuff_head_cache, skb);
}
EXPORT_SYMBOL(kfree_skb_partial);

/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean
 * @delta_truesize: how much more was allocated than was requested
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize)
{
        struct skb_shared_info *to_shinfo, *from_shinfo;
        int i, delta, len = from->len;

        *fragstolen = false;

        /* A cloned @to is never extended. */
        if (skb_cloned(to))
                return false;

        /* Everything in @from fits in @to's tailroom: copy the bytes into
         * the linear area and report no truesize change.
         */
        if (len <= skb_tailroom(to)) {
                if (len)
                        BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
                *delta_truesize = 0;
                return true;
        }

        /* Otherwise the payload is merged as page fragments; frag lists and
         * zerocopy skbs are not handled.
         */
        to_shinfo = skb_shinfo(to);
        from_shinfo = skb_shinfo(from);
        if (to_shinfo->frag_list || from_shinfo->frag_list)
                return false;
        if (skb_zcopy(to) || skb_zcopy(from))
                return false;

        if (skb_headlen(from) != 0) {
                struct page *page;
                unsigned int offset;

                /* @from's linear data is attached to @to as one additional
                 * fragment, so one more slot is needed than for @from's own
                 * frags (hence >= rather than >).
                 */
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
                        return false;

                if (skb_head_is_locked(from))
                        return false;

                /* Everything but the sk_buff structure moves to @to. */
                delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));

                /* Describe @from's linear data as (page, offset, length). */
                page = virt_to_head_page(from->head);
                offset = from->data - (unsigned char *)page_address(page);

                skb_fill_page_desc(to, to_shinfo->nr_frags,
                                   page, offset, skb_headlen(from));
                *fragstolen = true;
        } else {
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags > MAX_SKB_FRAGS)
                        return false;

                /* @from keeps its head, so only the remainder is moved. */
                delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
        }

        WARN_ON_ONCE(delta < len);

        /* Append @from's fragment descriptors after those of @to. */
        memcpy(to_shinfo->frags + to_shinfo->nr_frags,
               from_shinfo->frags,
               from_shinfo->nr_frags * sizeof(skb_frag_t));
        to_shinfo->nr_frags += from_shinfo->nr_frags;
        /* Carry over the shared-frag marker when frags were appended. */
        if (from_shinfo->nr_frags)
                to_shinfo->tx_flags |= from_shinfo->tx_flags & SKBTX_SHARED_FRAG;

        /* An uncloned @from gives its frags away to @to. */
        if (!skb_cloned(from))
                from_shinfo->nr_frags = 0;

        /* if the skb is not cloned this does nothing
         * since we set nr_frags to 0.
         */
        for (i = 0; i < from_shinfo->nr_frags; i++)
                __skb_frag_ref(&from_shinfo->frags[i]);

        to->truesize += delta;
        to->len += len;
        to->data_len += len;

        *delta_truesize = delta;
        return true;
}
EXPORT_SYMBOL(skb_try_coalesce);

/**
 * skb_scrub_packet - scrub an skb
 *
 * @skb: buffer to clean
 * @xnet: packet is crossing netns
 *
 * skb_scrub_packet can be used after encapsulating or decapsulating a packet
 * into/from a tunnel. Some information has to be cleared during these
 * operations.
 * skb_scrub_packet can also be used to clean a skb before injecting it in
 * another namespace (@xnet == true). We have to clear all information in the
 * skb that could impact namespace isolation.
 */
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
        /* State that is reset on every call. */
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
        skb_dst_drop(skb);
        skb_ext_reset(skb);
        nf_reset_ct(skb);
        nf_reset_trace(skb);

#ifdef CONFIG_NET_SWITCHDEV
        skb->offload_fwd_mark = 0;
        skb->offload_l3_fwd_mark = 0;
#endif
        ipvs_reset(skb);

        /* State that is only reset when the packet changes namespace. */
        if (xnet) {
                skb->mark = 0;
                skb->tstamp = 0;
        }
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);

/**
 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_transport_seglen is used to determine the real size of the
 * individual segments, including Layer4 headers (TCP/UDP).
 *
 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
 */
static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        unsigned int thlen = 0;

        if (skb->encapsulation) {
                thlen = skb_inner_transport_header(skb) -
                        skb_transport_header(skb);

                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
                        thlen += inner_tcp_hdrlen(skb);
        } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
                thlen = tcp_hdrlen(skb);
        } else if (unlikely(skb_is_gso_sctp(skb))) {
                thlen = sizeof(struct sctphdr);
        } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
                thlen = sizeof(struct udphdr);
        }
        /* UFO sets gso_size to the size of the fragmentation
         * payload, i.e. the size of the L4 (UDP) header is already
         * accounted for.
         */
        return thlen + shinfo->gso_size;
}

/**
 * skb_gso_network_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_network_seglen is used to determine the real size of the
 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
 *
 * The MAC/L2 header is not accounted for.
 */
static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
{
        /* Bytes between the network header and the transport header. */
        unsigned int l3_len = skb_transport_header(skb) -
                              skb_network_header(skb);

        return skb_gso_transport_seglen(skb) + l3_len;
}

/**
 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_mac_seglen is used to determine the real size of the
 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
 * headers (TCP/UDP).
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
        /* Bytes between the MAC header and the transport header. */
        unsigned int l2_l3_len = skb_transport_header(skb) -
                                 skb_mac_header(skb);

        return skb_gso_transport_seglen(skb) + l2_l3_len;
}

/**
 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
 *
 * There are a couple of instances where we have a GSO skb, and we
 * want to determine what size it would be after it is segmented.
 *
 * We might want to check:
 * -    L3+L4+payload size (e.g. IP forwarding)
 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
 *
 * This is a helper to do that correctly considering GSO_BY_FRAGS.
 *
 * @skb: GSO skb
 *
 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
 *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
 *
 * @max_len: The maximum permissible length.
 *
 * Returns true if the segmented length <= max length.
 */
static inline bool skb_gso_size_check(const struct sk_buff *skb,
                                      unsigned int seg_len,
                                      unsigned int max_len)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        const struct sk_buff *frag;
        unsigned int hdr_len;

        /* Ordinary GSO: compare the supplied segment length directly. */
        if (shinfo->gso_size != GSO_BY_FRAGS)
                return seg_len <= max_len;

        /* @seg_len was built as [header sizes + GSO_BY_FRAGS]; strip the
         * marker again to recover the header sizes.
         */
        hdr_len = seg_len - GSO_BY_FRAGS;

        /* Each skb on the frag list is checked with the headers added to
         * its own linear length.
         */
        skb_walk_frags(skb, frag) {
                if (hdr_len + skb_headlen(frag) > max_len)
                        return false;
        }

        return true;
}

/**
 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
 *
 * @skb: GSO skb
 * @mtu: MTU to validate against
 *
 * skb_gso_validate_network_len validates if a given skb will fit a
 * wanted MTU once split. It considers L3 headers, L4 headers, and the
 * payload.
 */
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
{
        /* Per-segment length counted from the network header onwards. */
        const unsigned int seg_len = skb_gso_network_seglen(skb);

        return skb_gso_size_check(skb, seg_len, mtu);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);

/**
 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
 *
 * @skb: GSO skb
 * @len: length to validate against
 *
 * skb_gso_validate_mac_len validates if a given skb will fit a wanted
 * length once split, including L2, L3 and L4 headers and the payload.
 */
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
{
        /* Per-segment length counted from the MAC header onwards. */
        const unsigned int seg_len = skb_gso_mac_seglen(skb);

        return skb_gso_size_check(skb, seg_len, len);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);

/* Shift the leading part of the MAC header, and any metadata stored in front
 * of it, forward by VLAN_HLEN bytes and advance skb->mac_header accordingly.
 *
 * Returns @skb on success.  If skb_cow() fails the skb is freed and NULL is
 * returned, so the caller must not touch it afterwards.
 */
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
        int mac_len, meta_len;
        void *meta;

        /* NOTE(review): presumably makes the header area private before it
         * is rewritten below; confirm against skb_cow().
         */
        if (skb_cow(skb, skb_headroom(skb)) < 0) {
                kfree_skb(skb);
                return NULL;
        }

        /* Number of bytes between the MAC header and skb->data. */
        mac_len = skb->data - skb_mac_header(skb);
        if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
                /* Move everything except the last VLAN_HLEN + ETH_TLEN
                 * bytes of that area forward by VLAN_HLEN.
                 */
                memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
                        mac_len - VLAN_HLEN - ETH_TLEN);
        }

        /* Metadata, if any, is moved forward by the same amount. */
        meta_len = skb_metadata_len(skb);
        if (meta_len) {
                meta = skb_metadata_end(skb) - meta_len;
                memmove(meta + VLAN_HLEN, meta, meta_len);
        }

        skb->mac_header += VLAN_HLEN;
        return skb;
}

/* Move a VLAN header found at skb->data into the skb's hwaccel tag.
 *
 * If a hwaccel tag is already present the skb is returned unchanged.
 * Otherwise the TCI is read from the header at skb->data, recorded together
 * with skb->protocol through __vlan_hwaccel_put_tag(), VLAN_HLEN bytes are
 * pulled and the header offsets are reset.
 *
 * Returns the resulting skb (which may differ from the one passed in after
 * skb_share_check()), or NULL on failure, in which case the skb has been
 * freed.
 */
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
        struct vlan_hdr *vhdr;
        u16 vlan_tci;

        if (unlikely(skb_vlan_tag_present(skb))) {
                /* vlan_tci is already set-up so leave this for another time */
                return skb;
        }

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                goto err_free;
        /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
        if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
                goto err_free;

        /* skb->protocol still holds the VLAN ethertype at this point and is
         * stored as the tag's protocol.
         */
        vhdr = (struct vlan_hdr *)skb->data;
        vlan_tci = ntohs(vhdr->h_vlan_TCI);
        __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);

        skb_pull_rcsum(skb, VLAN_HLEN);
        vlan_set_encap_proto(skb, vhdr);

        /* On failure the helper has already freed the skb and returned NULL,
         * so the kfree_skb() under err_free is then called with NULL.
         */
        skb = skb_reorder_vlan_header(skb);
        if (unlikely(!skb))
                goto err_free;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        return skb;

err_free:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);

/* Make sure the first @write_len bytes of @skb may be written to.
 *
 * Returns -ENOMEM if that many bytes cannot be pulled, 0 if the skb is not
 * cloned or its clone is already writable over that length, and otherwise
 * the result of pskb_expand_head().
 */
int skb_ensure_writable(struct sk_buff *skb, int write_len)
{
        if (!pskb_may_pull(skb, write_len))
                return -ENOMEM;

        /* Only a cloned skb that is not writable over @write_len needs a
         * fresh head.
         */
        if (skb_cloned(skb) && !skb_clone_writable(skb, write_len))
                return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);

        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable);

/* Remove the VLAN header from the packet and update csum accordingly.
 * Expects a non skb_vlan_tag_present skb with a vlan tag payload and
 * skb->data at the mac header.
 *
 * On success the popped TCI is stored in host byte order in *@vlan_tci and
 * 0 is returned.  Returns -EINVAL if skb->data is not at the mac header, or
 * the error from skb_ensure_writable().
 */
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
        struct vlan_hdr *vhdr;
        int offset = skb->data - skb_mac_header(skb);
        int err;

        if (WARN_ONCE(offset,
                      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
                      offset)) {
                return -EINVAL;
        }

        err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
        if (unlikely(err))
                return err;

        /* Account in the checksum for the VLAN_HLEN bytes that follow the
         * two MAC addresses and are about to be removed.
         */
        skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

        vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
        *vlan_tci = ntohs(vhdr->h_vlan_TCI);

        /* Slide the two MAC addresses forward by VLAN_HLEN, then drop the
         * now unused leading bytes.
         */
        memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
        __skb_pull(skb, VLAN_HLEN);

        vlan_set_encap_proto(skb, vhdr);
        skb->mac_header += VLAN_HLEN;

        /* Keep the network header at least ETH_HLEN past skb->data. */
        if (skb_network_offset(skb) < ETH_HLEN)
                skb_set_network_header(skb, ETH_HLEN);

        skb_reset_mac_len(skb);

        /* err is 0 here: a non-zero value returned early above. */
        return err;
}
EXPORT_SYMBOL(__skb_vlan_pop);

/* Pop one vlan tag, taken from the hwaccel slot when one is present there
 * and from the payload otherwise.  If the packet still carries a vlan
 * ethertype afterwards, that next tag is moved from the payload into the
 * hwaccel slot.
 * Expects skb->data at mac header.
 *
 * Returns 0 on success (including when there was nothing to pop) or the
 * error reported by __skb_vlan_pop().
 */
int skb_vlan_pop(struct sk_buff *skb)
{
        __be16 proto;
        u16 tci;
        int rc;

        if (unlikely(!skb_vlan_tag_present(skb))) {
                /* No hwaccel tag: pop from the payload, if it is tagged. */
                if (unlikely(!eth_type_vlan(skb->protocol)))
                        return 0;

                rc = __skb_vlan_pop(skb, &tci);
                if (rc)
                        return rc;
        } else {
                __vlan_hwaccel_clear_tag(skb);
        }

        /* move next vlan tag to hw accel tag */
        if (likely(!eth_type_vlan(skb->protocol)))
                return 0;

        /* Remember the ethertype before the pop rewrites skb->protocol. */
        proto = skb->protocol;
        rc = __skb_vlan_pop(skb, &tci);
        if (unlikely(rc))
                return rc;

        __vlan_hwaccel_put_tag(skb, proto, tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_pop);

/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
 * Expects skb->data at mac header.
 *
 * When a hwaccel tag is already present it is first written into the packet
 * data, and the new (@vlan_proto, @vlan_tci) pair then takes its place in the
 * hwaccel slot.
 *
 * Returns 0 on success, -EINVAL if skb->data is not at the mac header while
 * a hwaccel tag is present, or the error from __vlan_insert_tag().
 */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
        if (skb_vlan_tag_present(skb)) {
                int offset = skb->data - skb_mac_header(skb);
                int err;

                if (WARN_ONCE(offset,
                              "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
                              offset)) {
                        return -EINVAL;
                }

                /* Materialize the current hwaccel tag in the packet data. */
                err = __vlan_insert_tag(skb, skb->vlan_proto,
                                        skb_vlan_tag_get(skb));
                if (err)
                        return err;

                skb->protocol = skb->vlan_proto;
                skb->mac_len += VLAN_HLEN;

                /* Add the VLAN_HLEN bytes that now follow the two MAC
                 * addresses to the checksum.
                 */
                skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
        }
        __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_push);

/**
 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 *
 * Drop the Ethernet header of @skb.
 *
 * Expects that skb->data points to the mac header and that no VLAN tags are
 * present.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_eth_pop(struct sk_buff *skb)
{
        /* Each precondition is checked in turn; all fail with -EPROTO. */
        if (!pskb_may_pull(skb, ETH_HLEN))
                return -EPROTO;

        if (skb_vlan_tagged(skb))
                return -EPROTO;

        if (skb_network_offset(skb) < ETH_HLEN)
                return -EPROTO;

        /* Drop the header and make the MAC header start at the new data. */
        skb_pull_rcsum(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(skb_eth_pop);

/**
 * skb_eth_push() - Add a new Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 * @dst: Destination MAC address of the new header
 * @src: Source MAC address of the new header
 *
 * Prepend @skb with a new Ethernet header.
 *
 * Expects that skb->data points to the mac header, which must be empty.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src)
{
        struct ethhdr *hdr;
        int rc;

        /* The network header must sit at skb->data and no hwaccel vlan tag
         * may be pending.
         */
        if (skb_network_offset(skb))
                return -EPROTO;

        if (skb_vlan_tag_present(skb))
                return -EPROTO;

        rc = skb_cow_head(skb, sizeof(struct ethhdr));
        if (rc < 0)
                return rc;

        skb_push(skb, sizeof(struct ethhdr));
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        /* Fill in the new header; the ethertype is the skb's protocol. */
        hdr = eth_hdr(skb);
        ether_addr_copy(hdr->h_dest, dst);
        ether_addr_copy(hdr->h_source, src);
        hdr->h_proto = skb->protocol;

        skb_postpush_rcsum(skb, hdr, sizeof(struct ethhdr));

        return 0;
}
EXPORT_SYMBOL(skb_eth_push);

/* Store @ethertype in @hdr->h_proto.  For CHECKSUM_COMPLETE skbs, skb->csum
 * is adjusted first using the complement of the old value and the new value.
 */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
                             __be16 ethertype)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                /* { ~old, new }: summed into the running checksum. */
                __be16 delta[2] = { ~hdr->h_proto, ethertype };

                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        hdr->h_proto = ethertype;
}

/**
 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
 *                   the packet
 *
 * @skb: buffer
 * @mpls_lse: MPLS label stack entry to push
 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
 *            ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet)
{
        struct mpls_shim_hdr *lse;
        int err;

        /* Only MPLS ethertypes are accepted for the new header. */
        if (unlikely(!eth_p_mpls(mpls_proto)))
                return -EINVAL;

        /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
        if (skb->encapsulation)
                return -EINVAL;

        err = skb_cow_head(skb, MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* First push on this skb: remember where the original network
         * header was and which protocol it carried.
         */
        if (!skb->inner_protocol) {
                skb_set_inner_network_header(skb, skb_network_offset(skb));
                skb_set_inner_protocol(skb, skb->protocol);
        }

        /* Open MPLS_HLEN bytes at the front, slide the MAC header into
         * them, and place the network header right after it, which is
         * where the new label stack entry goes.
         */
        skb_push(skb, MPLS_HLEN);
        memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
                mac_len);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);
        skb_reset_mac_len(skb);

        lse = mpls_hdr(skb);
        lse->label_stack_entry = mpls_lse;
        skb_postpush_rcsum(skb, lse, MPLS_HLEN);

        /* The ethertype is rewritten only when there is a full Ethernet
         * header to rewrite it in.
         */
        if (ethernet && mac_len >= ETH_HLEN)
                skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
        skb->protocol = mpls_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_push);

/**
 * skb_mpls_pop() - pop the outermost MPLS header
 *
 * @skb: buffer
 * @next_proto: ethertype of header after popped MPLS header
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the packet is ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet)
{
        int err;

        /* Not an MPLS packet: nothing to pop, reported as success. */
        if (unlikely(!eth_p_mpls(skb->protocol)))
                return 0;

        err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Remove the label stack entry from the checksum, then slide the
         * MAC header forward over it and drop the leading MPLS_HLEN bytes.
         */
        skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
        memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
                mac_len);

        __skb_pull(skb, MPLS_HLEN);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);

        if (ethernet && mac_len >= ETH_HLEN) {
                struct ethhdr *hdr;

                /* use mpls_hdr() to get ethertype to account for VLANs. */
                hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
                skb_mod_eth_type(skb, hdr, next_proto);
        }
        skb->protocol = next_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_pop);

/**
 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
 *
 * @skb: buffer
 * @mpls_lse: new MPLS label stack entry to update to
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
{
        struct mpls_shim_hdr *top;
        int rc;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        rc = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
        if (unlikely(rc))
                return rc;

        /* Looked up only after skb_ensure_writable() has run. */
        top = mpls_hdr(skb);

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                /* { ~old, new }: summed into the running checksum. */
                __be32 delta[2] = { ~top->label_stack_entry, mpls_lse };

                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        top->label_stack_entry = mpls_lse;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_update_lse);

/**
 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
 *
 * @skb: buffer
 *
 * Expects skb->data at mac header.
 *
 * A packet whose TTL is already 0, or would reach 0 through the decrement,
 * is rejected with -EINVAL and its label stack entry is left unmodified.
 * -EINVAL is also returned when @skb is not an MPLS packet, and -ENOMEM when
 * the label stack entry cannot be pulled.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_mpls_dec_ttl(struct sk_buff *skb)
{
        u32 lse;
        u8 ttl;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
                return -ENOMEM;

        lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
        ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;

        /* Test before decrementing: ttl is a u8, so decrementing an incoming
         * TTL of 0 would wrap to 255 and the packet would be accepted with a
         * fresh maximal TTL instead of being rejected.
         */
        if (ttl <= 1)
                return -EINVAL;
        ttl--;

        lse &= ~MPLS_LS_TTL_MASK;
        lse |= ttl << MPLS_LS_TTL_SHIFT;

        return skb_mpls_update_lse(skb, cpu_to_be32(lse));
}
EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
/**
 * alloc_skb_with_frags - allocate skb with page frags
 *
 * @header_len: size of linear part
 * @data_len: needed length in frags
 * @max_page_order: max page order desired.
 * @errcode: pointer to error code if any
 * @gfp_mask: allocation mask
 *
 * This can be used to allocate a paged skb, given a maximal order for frags.
 */
/* Returns the new skb, or NULL on failure with *@errcode set to -EMSGSIZE
 * (too many pages would be needed) or -ENOBUFS (allocation failure).
 * *@errcode is written on every call, so on success it still holds -ENOBUFS;
 * callers have to test the returned pointer, not *@errcode.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask)
{
        /* Pages needed to hold data_len, rounded up. */
        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
        unsigned long chunk;
        struct sk_buff *skb;
        struct page *page;
        int i;

        *errcode = -EMSGSIZE;
        /* Note this test could be relaxed, if we succeed to allocate
         * high order pages...
         */
        if (npages > MAX_SKB_FRAGS)
                return NULL;

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_mask);
        if (!skb)
                return NULL;

        /* Account for all pages up front. */
        skb->truesize += npages << PAGE_SHIFT;

        /* One fragment per iteration until all npages pages are covered. */
        for (i = 0; npages > 0; i++) {
                int order = max_page_order;

                /* Try the largest order that does not exceed the pages
                 * still needed; this attempt is made without direct reclaim
                 * and without allocation-failure warnings.
                 */
                while (order) {
                        if (npages >= 1 << order) {
                                page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                                   __GFP_COMP |
                                                   __GFP_NOWARN,
                                                   order);
                                if (page)
                                        goto fill_page;
                                /* Do not retry other high order allocations */
                                order = 1;
                                max_page_order = 0;
                        }
                        order--;
                }
                /* Order-0 fallback with the caller's original gfp_mask;
                 * order is 0 when this point is reached.
                 */
                page = alloc_page(gfp_mask);
                if (!page)
                        goto failure;
fill_page:
                /* The last fragment may be shorter than the allocation. */
                chunk = min_t(unsigned long, data_len,
                              PAGE_SIZE << order);
                skb_fill_page_desc(skb, i, page, 0, chunk);
                data_len -= chunk;
                npages -= 1 << order;
        }
        return skb;

failure:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);

/* Carve out the first @off bytes from @skb when @off < @headlen.
 *
 * A new head buffer is allocated, the remaining @headlen - @off linear bytes
 * and the shared info (up to the last used frag) are copied into it, and the
 * skb is re-pointed at the new buffer.  Returns 0 on success or -ENOMEM.
 *
 * NOTE(review): skb->len is reduced before skb_orphan_frags() can still fail,
 * so on that error path the skb is left with the shortened length; confirm
 * that callers discard the skb on error.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
                                    const int headlen, gfp_t gfp_mask)
{
        int i;
        int size = skb_end_offset(skb);
        int new_hlen = headlen - off;
        u8 *data;

        size = SKB_DATA_ALIGN(size);

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;
        data = kmalloc_reserve(size +
                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
                               gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;

        /* Recompute size from what the allocator actually provided. */
        size = SKB_WITH_OVERHEAD(ksize(data));

        /* Copy real data, and all frags */
        skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
        skb->len -= off;

        /* The shared info lives at data + size in the new buffer; only the
         * part up to the last used frag is copied.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info,
                        frags[skb_shinfo(skb)->nr_frags]));
        if (skb_cloned(skb)) {
                /* drop the old head gracefully */
                if (skb_orphan_frags(skb, gfp_mask)) {
                        kfree(data);
                        return -ENOMEM;
                }
                /* The copied shared info refers to the same frags and frag
                 * list as the old one, so take references before the old
                 * data is released.
                 */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);
                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);
                skb_release_data(skb);
        } else {
                /* we can reuse existing refcount- all we did was
                 * relocate values
                 */
                skb_free_head(skb);
        }

        /* Switch the skb over to the new buffer; data starts at its head. */
        skb->head = data;
        skb->data = data;
        skb->head_frag = 0;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        skb->end = size;
#else
        skb->end = skb->head + size;
#endif
        skb_set_tail_pointer(skb, skb_headlen(skb));
        skb_headers_offset_update(skb, 0);
        skb->cloned = 0;
        skb->hdr_len = 0;
        skb->nohdr = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        return 0;
}

/* Forward declaration: pskb_carve_frag_list() below calls pskb_carve(). */
static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* Carve out the first @eat bytes from the frag list hanging off @shinfo.
 * May recurse into pskb_carve() for the list member that is only partially
 * eaten.
 *
 * @skb:      owning buffer (not referenced in the body)
 * @shinfo:   shared info whose frag_list is trimmed in place
 * @eat:      number of bytes to remove from the front of the list
 * @gfp_mask: allocation flags for skb_clone()/pskb_carve()
 *
 * Returns 0 on success, -EFAULT if the list holds fewer than @eat bytes, or
 * -ENOMEM if cloning or carving the partially eaten member fails.
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
                                struct skb_shared_info *shinfo, int eat,
                                gfp_t gfp_mask)
{
        struct sk_buff *list = shinfo->frag_list;
        struct sk_buff *clone = NULL;
        /* First list member that survives; everything before it is freed. */
        struct sk_buff *insp = NULL;

        do {
                if (!list) {
                        pr_err("Not enough bytes to eat. Want %d\n", eat);
                        return -EFAULT;
                }
                if (list->len <= eat) {
                        /* Eaten as whole. */
                        eat -= list->len;
                        list = list->next;
                        insp = list;
                } else {
                        /* Eaten partially. */
                        if (skb_shared(list)) {
                                /* Carve a private clone and later splice it
                                 * in front of the member's successor.
                                 */
                                clone = skb_clone(list, gfp_mask);
                                if (!clone)
                                        return -ENOMEM;
                                insp = list->next;
                                list = clone;
                        } else {
                                /* This may be pulled without problems. */
                                insp = list;
                        }
                        if (pskb_carve(list, eat, gfp_mask) < 0) {
                                /* clone is NULL when the member was carved
                                 * in place.
                                 */
                                kfree_skb(clone);
                                return -ENOMEM;
                        }
                        break;
                }
        } while (eat);

        /* Free pulled out fragments. */
        while ((list = shinfo->frag_list) != insp) {
                shinfo->frag_list = list->next;
                consume_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
                /* list == insp here, i.e. the successor of the cloned member. */
                clone->next = list;
                shinfo->frag_list = clone;
        }
        return 0;
}

/* Carve off the first @off bytes from @skb when the split line (@off) lies
 * in the non-linear part of the skb (page frags or frag list).
 *
 * A fresh, empty head is allocated; the frags at or after the split line are
 * carried over into the new shared info, and the old data is released.
 *
 * @skb:      buffer to trim in place
 * @off:      number of leading bytes to drop
 * @pos:      running byte position; pskb_carve() passes the linear length
 * @gfp_mask: allocation flags; __GFP_MEMALLOC is added for pfmemalloc skbs
 *
 * Returns 0 on success or -ENOMEM on any failure (including a failure of
 * pskb_carve_frag_list(), whatever error code it reported).
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
                                       int pos, gfp_t gfp_mask)
{
        /* k counts the frags kept in the new shared info. */
        int i, k = 0;
        int size = skb_end_offset(skb);
        u8 *data;
        const int nfrags = skb_shinfo(skb)->nr_frags;
        struct skb_shared_info *shinfo;

        size = SKB_DATA_ALIGN(size);

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;
        data = kmalloc_reserve(size +
                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
                               gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;

        size = SKB_WITH_OVERHEAD(ksize(data));

        /* Copy only the fixed part of the shared info; the frag descriptors
         * are filled in selectively by the loop below.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
        if (skb_orphan_frags(skb, gfp_mask)) {
                kfree(data);
                return -ENOMEM;
        }
        shinfo = (struct skb_shared_info *)(data + size);
        for (i = 0; i < nfrags; i++) {
                int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Keep every frag that extends past the split line. */
                if (pos + fsize > off) {
                        shinfo->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < off) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split it accurately. This is what
                                 *    we do.
                                 *
                                 * k is still 0 here: every earlier frag
                                 * ended at or before pos, which is < off,
                                 * so none of them was kept.
                                 */
                                skb_frag_off_add(&shinfo->frags[0], off - pos);
                                skb_frag_size_sub(&shinfo->frags[0], off - pos);
                        }
                        skb_frag_ref(skb, i);
                        k++;
                }
                pos += fsize;
        }
        shinfo->nr_frags = k;
        if (skb_has_frag_list(skb))
                skb_clone_fraglist(skb);

        /* split line is in frag list */
        if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
                /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
                if (skb_has_frag_list(skb))
                        kfree_skb_list(skb_shinfo(skb)->frag_list);
                kfree(data);
                return -ENOMEM;
        }
        skb_release_data(skb);

        /* Switch to the new head; it carries no linear data at all. */
        skb->head = data;
        skb->head_frag = 0;
        skb->data = data;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        skb->end = size;
#else
        skb->end = skb->head + size;
#endif
        skb_reset_tail_pointer(skb);
        skb_headers_offset_update(skb, 0);
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        skb->len -= off;
        /* Everything that is left lives in frags / frag list. */
        skb->data_len = skb->len;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
        return 0;
}

/* Drop the first @len bytes of @skb. Dispatches on whether the cut falls
 * inside the linear header or in the non-linear part; returns whatever the
 * selected helper returns.
 */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
        const int hlen = skb_headlen(skb);

        if (len >= hlen)
                return pskb_carve_inside_nonlinear(skb, len, hlen, gfp);

        return pskb_carve_inside_header(skb, len, hlen, gfp);
}

/* Build a new skb holding @to_copy bytes of @skb starting at offset @off.
 *
 * @skb is cloned with @gfp; the clone then has its first @off bytes carved
 * away and is trimmed to @to_copy bytes. Returns the clone, or NULL if the
 * clone, the carve or the trim fails (the clone is freed in that case).
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
                             int to_copy, gfp_t gfp)
{
        struct sk_buff *clone;

        clone = skb_clone(skb, gfp);
        if (!clone)
                return NULL;

        if (pskb_carve(clone, off, gfp) < 0)
                goto drop;
        if (pskb_trim(clone, to_copy))
                goto drop;

        return clone;

drop:
        kfree_skb(clone);
        return NULL;
}
EXPORT_SYMBOL(pskb_extract);

/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Can be used to save memory before skb is added to a busy queue.
 * If packet has bytes in frags and enough tail room in skb->head,
 * pull all of them, so that we can free the frags right now and adjust
 * truesize.
 * Notes:
 *        We do not reallocate skb->head thus can not fail.
 *        Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
        if (skb->data_len) {
                if (skb->data_len > skb->end - skb->tail ||
                    skb_cloned(skb))
                        return;

                /* Nice, we can free page frag(s) right now */
                __pskb_pull_tail(skb, skb->data_len);
        }
        /* At this point, skb->truesize might be over estimated,
         * because skb had a fragment, and fragments do not tell
         * their truesize.
         * When we pulled its content into skb->head, fragment
         * was freed, but __pskb_pull_tail() could not possibly
         * adjust skb->truesize, not knowing the frag truesize.
         */
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}

#ifdef CONFIG_SKB_EXTENSIONS
/* Return the address of extension @id inside @ext. ext->offset[] stores the
 * position in units of SKB_EXT_ALIGN_VALUE bytes from the start of @ext.
 */
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
        void *base = ext;

        return base + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * The storage comes from skbuff_ext_cache with all offsets cleared and a
 * reference count of one.
 *
 * Returns the newly allocated pointer, or NULL if the allocation fails.
 * The pointer can later be attached to a skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as an opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
        struct skb_ext *ext;

        ext = kmem_cache_alloc(skbuff_ext_cache, flags);
        if (!ext)
                return NULL;

        memset(ext->offset, 0, sizeof(ext->offset));
        refcount_set(&ext->refcnt, 1);

        return ext;
}

/* Return an extension area whose reference count is one.
 *
 * If @old already has a reference count of one it is returned unchanged.
 * Otherwise a new area is allocated with GFP_ATOMIC, the first
 * old->chunks * SKB_EXT_ALIGN_VALUE bytes of @old are copied into it, and
 * one reference on @old is dropped. With CONFIG_XFRM, when @old_active has
 * the SKB_EXT_SEC_PATH bit set, every xfrm state listed in the sec_path is
 * held once more on behalf of the copy.
 *
 * Returns the usable area, or NULL if the allocation fails (@old is left
 * untouched in that case).
 */
static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
                                         unsigned int old_active)
{
        struct skb_ext *copy;

        if (refcount_read(&old->refcnt) == 1)
                return old;

        copy = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
        if (copy == NULL)
                return NULL;

        memcpy(copy, old, old->chunks * SKB_EXT_ALIGN_VALUE);
        refcount_set(&copy->refcnt, 1);

#ifdef CONFIG_XFRM
        if (old_active & (1 << SKB_EXT_SEC_PATH)) {
                struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
                unsigned int idx = 0;

                while (idx < sp->len) {
                        xfrm_state_hold(sp->xvec[idx]);
                        idx++;
                }
        }
#endif
        __skb_ext_put(old);
        return copy;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared; afterwards @id is the only
 * active extension of @skb and is placed right after the skb_ext header.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext)
{
        const unsigned int hdr_chunks = SKB_EXT_CHUNKSIZEOF(*ext);

        /* Release whatever extension area the skb held before. */
        skb_ext_put(skb);

        ext->offset[id] = hdr_chunks;
        ext->chunks = hdr_chunks + skb_ext_type_len[id];

        skb->extensions = ext;
        skb->active_extensions = 1 << id;

        return skb_ext_get_ptr(ext, id);
}

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Makes room for extension @id in the extension area of @skb. When @skb
 * has no active extension a fresh area is allocated; otherwise the existing
 * area is used, copied first if it is shared (COW), and the new extension
 * is appended after the chunks already in use. If the area already contains
 * @id, a pointer to that existing space is returned.
 *
 * Because of the COW step the returned memory can be modified without
 * changing the extension space of cloned buffers.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;
        unsigned int where;

        if (!skb->active_extensions) {
                ext = __skb_ext_alloc(GFP_ATOMIC);
                if (!ext)
                        return NULL;

                /* First extension goes right behind the header. */
                where = SKB_EXT_CHUNKSIZEOF(*ext);
        } else {
                ext = skb_ext_maybe_cow(skb->extensions,
                                        skb->active_extensions);
                if (!ext)
                        return NULL;

                /* Space already reserved: only (re)activate it. */
                if (__skb_ext_exist(ext, id))
                        goto set_active;

                where = ext->chunks;
        }

        ext->chunks = where + skb_ext_type_len[id];
        ext->offset[id] = where;
set_active:
        skb->extensions = ext;
        skb->active_extensions |= 1 << id;
        return skb_ext_get_ptr(ext, id);
}
EXPORT_SYMBOL(skb_ext_add);

#ifdef CONFIG_XFRM
/* Drop one reference on each of the sp->len xfrm states listed in @sp. */
static void skb_ext_put_sp(struct sec_path *sp)
{
        unsigned int idx = 0;

        while (idx < sp->len) {
                xfrm_state_put(sp->xvec[idx]);
                idx++;
        }
}
#endif

/* Deactivate extension @id on @skb.
 *
 * The bit for @id is cleared from skb->active_extensions. If no extension
 * remains active, the skb gives up its extension area altogether. Otherwise
 * (CONFIG_XFRM only) when the sec_path extension is removed from an area
 * with a reference count of one, the xfrm state references it holds are
 * dropped right away.
 */
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext = skb->extensions;

        skb->active_extensions &= ~(1 << id);
        if (skb->active_extensions == 0) {
                /* Last active extension gone: detach and drop our reference. */
                skb->extensions = NULL;
                __skb_ext_put(ext);
#ifdef CONFIG_XFRM
        } else if (id == SKB_EXT_SEC_PATH &&
                   refcount_read(&ext->refcnt) == 1) {
                struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

                skb_ext_put_sp(sp);
                /* NOTE(review): presumably zeroed so that a later
                 * skb_ext_put_sp() on this area (e.g. from __skb_ext_put())
                 * does not put the same states again - confirm.
                 */
                sp->len = 0;
#endif
        }
}
EXPORT_SYMBOL(__skb_ext_del);

/* Drop one reference on @ext and free it once the last reference is gone.
 *
 * With CONFIG_XFRM, the xfrm states of a contained sec_path extension are
 * put before the area is returned to skbuff_ext_cache.
 */
void __skb_ext_put(struct skb_ext *ext)
{
        /* When the count reads one this is the last clone and nothing can
         * increment it afterwards, so the atomic decrement is skipped and
         * the area is freed directly.
         */
        if (refcount_read(&ext->refcnt) != 1 &&
            !refcount_dec_and_test(&ext->refcnt))
                return;

#ifdef CONFIG_XFRM
        if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
                skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif

        kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UNALIGNED_ACCESS_OK_H
#define _LINUX_UNALIGNED_ACCESS_OK_H

#include <linux/kernel.h>
#include <asm/byteorder.h>

/* Load a little-endian 16-bit value from @p and return it in CPU byte
 * order. The load is a direct typed access through @p; the cast keeps the
 * const qualifier of @p instead of discarding it.
 */
static __always_inline u16 get_unaligned_le16(const void *p)
{
        return le16_to_cpup((const __le16 *)p);
}

/* Load a little-endian 32-bit value from @p and return it in CPU byte
 * order. The load is a direct typed access through @p; the cast keeps the
 * const qualifier of @p instead of discarding it.
 */
static __always_inline u32 get_unaligned_le32(const void *p)
{
        return le32_to_cpup((const __le32 *)p);
}

/* Load a little-endian 64-bit value from @p and return it in CPU byte
 * order. The load is a direct typed access through @p; the cast keeps the
 * const qualifier of @p instead of discarding it.
 */
static __always_inline u64 get_unaligned_le64(const void *p)
{
        return le64_to_cpup((const __le64 *)p);
}

/* Load a big-endian 16-bit value from @p and return it in CPU byte order.
 * The load is a direct typed access through @p; the cast keeps the const
 * qualifier of @p instead of discarding it.
 */
static __always_inline u16 get_unaligned_be16(const void *p)
{
        return be16_to_cpup((const __be16 *)p);
}

/* Load a big-endian 32-bit value from @p and return it in CPU byte order.
 * The load is a direct typed access through @p; the cast keeps the const
 * qualifier of @p instead of discarding it.
 */
static __always_inline u32 get_unaligned_be32(const void *p)
{
        return be32_to_cpup((const __be32 *)p);
}

/* Load a big-endian 64-bit value from @p and return it in CPU byte order.
 * The load is a direct typed access through @p; the cast keeps the const
 * qualifier of @p instead of discarding it.
 */
static __always_inline u64 get_unaligned_be64(const void *p)
{
        return be64_to_cpup((const __be64 *)p);
}

/* Store @val at @p as a little-endian 16-bit value (direct typed store). */
static __always_inline void put_unaligned_le16(u16 val, void *p)
{
        __le16 *dst = p;

        *dst = cpu_to_le16(val);
}

/* Store @val at @p as a little-endian 32-bit value (direct typed store). */
static __always_inline void put_unaligned_le32(u32 val, void *p)
{
        __le32 *dst = p;

        *dst = cpu_to_le32(val);
}

/* Store @val at @p as a little-endian 64-bit value (direct typed store). */
static __always_inline void put_unaligned_le64(u64 val, void *p)
{
        __le64 *dst = p;

        *dst = cpu_to_le64(val);
}

/* Store @val at @p as a big-endian 16-bit value (direct typed store). */
static __always_inline void put_unaligned_be16(u16 val, void *p)
{
        __be16 *dst = p;

        *dst = cpu_to_be16(val);
}

/* Store @val at @p as a big-endian 32-bit value (direct typed store). */
static __always_inline void put_unaligned_be32(u32 val, void *p)
{
        __be32 *dst = p;

        *dst = cpu_to_be32(val);
}

/* Store @val at @p as a big-endian 64-bit value (direct typed store). */
static __always_inline void put_unaligned_be64(u64 val, void *p)
{
        __be64 *dst = p;

        *dst = cpu_to_be64(val);
}

#endif /* _LINUX_UNALIGNED_ACCESS_OK_H */







































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  linux/drivers/char/serial_core.h
 *
 *  Copyright (C) 2000 Deep Blue Solutions Ltd.
 */
#ifndef LINUX_SERIAL_CORE_H
#define LINUX_SERIAL_CORE_H

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/console.h>
#include <linux/interrupt.h>
#include <linux/circ_buf.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/tty.h>
#include <linux/mutex.h>
#include <linux/sysrq.h>
#include <uapi/linux/serial_core.h>

/*
 * uart_console(port) - true when @port has a console attached whose index
 * equals the port's line number. Without CONFIG_SERIAL_CORE_CONSOLE it
 * always evaluates to 0; the argument is still evaluated once (and
 * parenthesized, so any expression may be passed).
 */
#ifdef CONFIG_SERIAL_CORE_CONSOLE
#define uart_console(port) \
        ((port)->cons && (port)->cons->index == (port)->line)
#else
#define uart_console(port)      ({ (void)(port); 0; })
#endif

struct uart_port;
struct serial_struct;
struct device;
struct gpio_desc;

/*
 * This structure describes all the operations that can be done on the
 * physical hardware.  See Documentation/driver-api/serial/driver.rst for details.
 *
 * It is a table of function pointers; every hook takes the struct uart_port
 * it operates on as its first argument.  The poll_*() hooks are only part
 * of the structure when CONFIG_CONSOLE_POLL is enabled.
 */
struct uart_ops {
        unsigned int        (*tx_empty)(struct uart_port *);
        void                (*set_mctrl)(struct uart_port *, unsigned int mctrl);
        unsigned int        (*get_mctrl)(struct uart_port *);
        void                (*stop_tx)(struct uart_port *);
        void                (*start_tx)(struct uart_port *);
        void                (*throttle)(struct uart_port *);
        void                (*unthrottle)(struct uart_port *);
        void                (*send_xchar)(struct uart_port *, char ch);
        void                (*stop_rx)(struct uart_port *);
        void                (*enable_ms)(struct uart_port *);
        void                (*break_ctl)(struct uart_port *, int ctl);
        int                (*startup)(struct uart_port *);
        void                (*shutdown)(struct uart_port *);
        void                (*flush_buffer)(struct uart_port *);
        /* Receives both the new and the previous termios settings. */
        void                (*set_termios)(struct uart_port *, struct ktermios *new,
                                       struct ktermios *old);
        void                (*set_ldisc)(struct uart_port *, struct ktermios *);
        /* Receives both the requested and the previous power state. */
        void                (*pm)(struct uart_port *, unsigned int state,
                              unsigned int oldstate);

        /*
         * Return a string describing the type of the port
         */
        const char        *(*type)(struct uart_port *);

        /*
         * Release IO and memory resources used by the port.
         * This includes iounmap if necessary.
         */
        void                (*release_port)(struct uart_port *);

        /*
         * Request IO and memory resources used by the port.
         * This includes iomapping the port if necessary.
         */
        int                (*request_port)(struct uart_port *);
        void                (*config_port)(struct uart_port *, int);
        int                (*verify_port)(struct uart_port *, struct serial_struct *);
        int                (*ioctl)(struct uart_port *, unsigned int, unsigned long);
#ifdef CONFIG_CONSOLE_POLL
        /* Polled console hooks, compiled in only with CONFIG_CONSOLE_POLL. */
        int                (*poll_init)(struct uart_port *);
        void                (*poll_put_char)(struct uart_port *, unsigned char);
        int                (*poll_get_char)(struct uart_port *);
#endif
};

/*
 * NOTE(review): presumably the "no character available" sentinel for the
 * poll_get_char() hook (the value lies outside the 0..255 range of a
 * received byte) - confirm against the callers.
 */
#define NO_POLL_CHAR                0x00ff0000
/*
 * Single-bit flags.  NOTE(review): presumably combined into the int
 * argument of uart_ops::config_port() - confirm against the callers.
 */
#define UART_CONFIG_TYPE        (1 << 0)
#define UART_CONFIG_IRQ                (1 << 1)

/*
 * Set of 32-bit counters kept per port; struct uart_port embeds one as its
 * "icount" (statistics) member.
 * NOTE(review): the per-field meanings below are inferred from the field
 * names only - confirm against the code that increments them.
 */
struct uart_icount {
        __u32        cts;
        __u32        dsr;
        __u32        rng;
        __u32        dcd;
        __u32        rx;                /* presumably received characters */
        __u32        tx;                /* presumably transmitted characters */
        __u32        frame;
        __u32        overrun;
        __u32        parity;
        __u32        brk;
        __u32        buf_overrun;
};

/*
 * upf_t:    type of uart_port::flags and of the UPF_* constants; 64 bits
 *           wide, which UPF_FULL_PROBE (bit 32) requires.
 * upstat_t: type of uart_port::status and of the UPSTAT_* constants.
 * Both are __bitwise, so the constants are built with __force casts.
 */
typedef u64 __bitwise upf_t;
typedef unsigned int __bitwise upstat_t;

/*
 * Per-port state of a UART.
 *
 * Holds the port lock, the I/O addressing information (iobase, membase,
 * mapbase/mapsize, iotype, regshift), a set of per-port function pointers,
 * IRQ/clock/FIFO parameters, the UPF_* flags and UPSTAT_* status bits,
 * statistics, sysrq bookkeeping and the RS485 / ISO7816 configuration.
 * The UPIO_*, UPQ_*, UPF_* and UPSTAT_* constants are defined next to the
 * member they apply to.
 */
struct uart_port {
        spinlock_t                lock;                        /* port lock */
        unsigned long                iobase;                        /* in/out[bwl] */
        unsigned char __iomem        *membase;                /* read/write[bwl] */
        unsigned int                (*serial_in)(struct uart_port *, int);
        void                        (*serial_out)(struct uart_port *, int, int);
        void                        (*set_termios)(struct uart_port *,
                                               struct ktermios *new,
                                               struct ktermios *old);
        void                        (*set_ldisc)(struct uart_port *,
                                             struct ktermios *);
        unsigned int                (*get_mctrl)(struct uart_port *);
        void                        (*set_mctrl)(struct uart_port *, unsigned int);
        unsigned int                (*get_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int *frac);
        void                        (*set_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int quot,
                                               unsigned int quot_frac);
        int                        (*startup)(struct uart_port *port);
        void                        (*shutdown)(struct uart_port *port);
        void                        (*throttle)(struct uart_port *port);
        void                        (*unthrottle)(struct uart_port *port);
        int                        (*handle_irq)(struct uart_port *);
        void                        (*pm)(struct uart_port *, unsigned int state,
                                      unsigned int old);
        void                        (*handle_break)(struct uart_port *);
        int                        (*rs485_config)(struct uart_port *,
                                                struct serial_rs485 *rs485);
        int                        (*iso7816_config)(struct uart_port *,
                                                  struct serial_iso7816 *iso7816);
        unsigned int                irq;                        /* irq number */
        unsigned long                irqflags;                /* irq flags  */
        unsigned int                uartclk;                /* base uart clock */
        unsigned int                fifosize;                /* tx fifo size */
        unsigned char                x_char;                        /* xon/xoff char */
        unsigned char                regshift;                /* reg offset shift */
        unsigned char                iotype;                        /* io access style */
        unsigned char                quirks;                        /* internal quirks */

#define UPIO_PORT                (SERIAL_IO_PORT)        /* 8b I/O port access */
#define UPIO_HUB6                (SERIAL_IO_HUB6)        /* Hub6 ISA card */
#define UPIO_MEM                (SERIAL_IO_MEM)                /* driver-specific */
#define UPIO_MEM32                (SERIAL_IO_MEM32)        /* 32b little endian */
#define UPIO_AU                        (SERIAL_IO_AU)                /* Au1x00 and RT288x type IO */
#define UPIO_TSI                (SERIAL_IO_TSI)                /* Tsi108/109 type IO */
#define UPIO_MEM32BE                (SERIAL_IO_MEM32BE)        /* 32b big endian */
#define UPIO_MEM16                (SERIAL_IO_MEM16)        /* 16b little endian */

        /* quirks must be updated while holding port mutex */
#define UPQ_NO_TXEN_TEST        BIT(0)

        unsigned int                read_status_mask;        /* driver specific */
        unsigned int                ignore_status_mask;        /* driver specific */
        struct uart_state        *state;                        /* pointer to parent state */
        struct uart_icount        icount;                        /* statistics */

        struct console                *cons;                        /* struct console, if any */
        /* flags must be updated while holding port mutex */
        upf_t                        flags;

        /*
         * These flags must be equivalent to the flags defined in
         * include/uapi/linux/tty_flags.h which are the userspace definitions
         * assigned from the serial_struct flags in uart_set_info()
         * [for bit definitions in the UPF_CHANGE_MASK]
         *
         * Bits [0..UPF_LAST_USER] are userspace defined/visible/changeable
         * The remaining bits are serial-core specific and not modifiable by
         * userspace.
         */
#define UPF_FOURPORT                ((__force upf_t) ASYNC_FOURPORT       /* 1  */ )
#define UPF_SAK                        ((__force upf_t) ASYNC_SAK            /* 2  */ )
#define UPF_SPD_HI                ((__force upf_t) ASYNC_SPD_HI         /* 4  */ )
#define UPF_SPD_VHI                ((__force upf_t) ASYNC_SPD_VHI        /* 5  */ )
#define UPF_SPD_CUST                ((__force upf_t) ASYNC_SPD_CUST   /* 0x0030 */ )
#define UPF_SPD_WARP                ((__force upf_t) ASYNC_SPD_WARP   /* 0x1010 */ )
#define UPF_SPD_MASK                ((__force upf_t) ASYNC_SPD_MASK   /* 0x1030 */ )
#define UPF_SKIP_TEST                ((__force upf_t) ASYNC_SKIP_TEST      /* 6  */ )
#define UPF_AUTO_IRQ                ((__force upf_t) ASYNC_AUTO_IRQ       /* 7  */ )
#define UPF_HARDPPS_CD                ((__force upf_t) ASYNC_HARDPPS_CD     /* 11 */ )
#define UPF_SPD_SHI                ((__force upf_t) ASYNC_SPD_SHI        /* 12 */ )
#define UPF_LOW_LATENCY                ((__force upf_t) ASYNC_LOW_LATENCY    /* 13 */ )
#define UPF_BUGGY_UART                ((__force upf_t) ASYNC_BUGGY_UART     /* 14 */ )
#define UPF_MAGIC_MULTIPLIER        ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ )

#define UPF_NO_THRE_TEST        ((__force upf_t) (1 << 19))
/* Port has hardware-assisted h/w flow control */
#define UPF_AUTO_CTS                ((__force upf_t) (1 << 20))
#define UPF_AUTO_RTS                ((__force upf_t) (1 << 21))
#define UPF_HARD_FLOW                ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS))
/* Port has hardware-assisted s/w flow control */
#define UPF_SOFT_FLOW                ((__force upf_t) (1 << 22))
#define UPF_CONS_FLOW                ((__force upf_t) (1 << 23))
#define UPF_SHARE_IRQ                ((__force upf_t) (1 << 24))
#define UPF_EXAR_EFR                ((__force upf_t) (1 << 25))
#define UPF_BUG_THRE                ((__force upf_t) (1 << 26))
/* The exact UART type is known and should not be probed.  */
#define UPF_FIXED_TYPE                ((__force upf_t) (1 << 27))
#define UPF_BOOT_AUTOCONF        ((__force upf_t) (1 << 28))
#define UPF_FIXED_PORT                ((__force upf_t) (1 << 29))
#define UPF_DEAD                ((__force upf_t) (1 << 30))
/*
 * Bit 31 must be built from an unsigned constant: a signed (1 << 31) is
 * negative and would sign-extend into bits 32..63 when converted to the
 * 64-bit upf_t, overlapping UPF_FULL_PROBE.
 */
#define UPF_IOREMAP                ((__force upf_t) (1UL << 31))
#define UPF_FULL_PROBE                ((__force upf_t) (1ULL << 32))

#define __UPF_CHANGE_MASK        0x17fff
#define UPF_CHANGE_MASK                ((__force upf_t) __UPF_CHANGE_MASK)
#define UPF_USR_MASK                ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY))

#if __UPF_CHANGE_MASK > ASYNC_FLAGS
#error Change mask not equivalent to userspace-visible bit defines
#endif

        /*
         * Must hold termios_rwsem, port mutex and port lock to change;
         * can hold any one lock to read.
         */
        upstat_t                status;

#define UPSTAT_CTS_ENABLE        ((__force upstat_t) (1 << 0))
#define UPSTAT_DCD_ENABLE        ((__force upstat_t) (1 << 1))
#define UPSTAT_AUTORTS                ((__force upstat_t) (1 << 2))
#define UPSTAT_AUTOCTS                ((__force upstat_t) (1 << 3))
#define UPSTAT_AUTOXOFF                ((__force upstat_t) (1 << 4))
#define UPSTAT_SYNC_FIFO        ((__force upstat_t) (1 << 5))

        int                        hw_stopped;                /* sw-assisted CTS flow state */
        unsigned int                mctrl;                        /* current modem ctrl settings */
        unsigned int                timeout;                /* character-based timeout */
        unsigned int                type;                        /* port type */
        const struct uart_ops        *ops;
        unsigned int                custom_divisor;
        unsigned int                line;                        /* port index */
        unsigned int                minor;
        resource_size_t                mapbase;                /* for ioremap */
        resource_size_t                mapsize;
        struct device                *dev;                        /* parent device */

        unsigned long                sysrq;                        /* sysrq timeout */
        unsigned int                sysrq_ch;                /* char for sysrq */
        unsigned char                has_sysrq;
        unsigned char                sysrq_seq;                /* index in sysrq_toggle_seq */

        unsigned char                hub6;                        /* this should be in the 8250 driver */
        unsigned char                suspended;
        unsigned char                console_reinit;
        const char                *name;                        /* port name */
        struct attribute_group        *attr_group;                /* port specific attributes */
        const struct attribute_group **tty_groups;        /* all attributes (serial core use only) */
        struct serial_rs485     rs485;
        const struct serial_rs485        *rs485_supported;        /* Supported mask for serial_rs485 */
        struct gpio_desc        *rs485_term_gpio;        /* enable RS485 bus termination */
        struct serial_iso7816   iso7816;
        void                        *private_data;                /* generic platform data pointer */
};

/**
 * uart_port_lock - Lock the UART port
 * @up:                Pointer to UART port structure
 */
static inline void uart_port_lock(struct uart_port *up)
{
        spin_lock(&up->lock);
}

/**
 * uart_port_lock_irq - Lock the UART port and disable interrupts
 * @up:                Pointer to UART port structure
 */
static inline void uart_port_lock_irq(struct uart_port *up)
{
        spin_lock_irq(&up->lock);
}

/**
 * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Acquires @up->lock with spin_lock_irqsave(), storing the saved interrupt
 * state in *@flags.  Pass that value (not the pointer) to
 * uart_port_unlock_irqrestore() to release the lock.
 */
static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
{
        spin_lock_irqsave(&up->lock, *flags);
}

/**
 * uart_port_trylock - Try to lock the UART port
 * @up:                Pointer to UART port structure
 *
 * Forwards to spin_trylock() on @up->lock.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock(struct uart_port *up)
{
        return spin_trylock(&up->lock);
}

/**
 * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Forwards to spin_trylock_irqsave() on @up->lock, using *@flags as the
 * interrupt flags storage.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
{
        return spin_trylock_irqsave(&up->lock, *flags);
}

/**
 * uart_port_unlock - Unlock the UART port
 * @up:                Pointer to UART port structure
 *
 * Releases @up->lock with spin_unlock(); counterpart of uart_port_lock().
 */
static inline void uart_port_unlock(struct uart_port *up)
{
        spin_unlock(&up->lock);
}

/**
 * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts
 * @up:                Pointer to UART port structure
 *
 * Releases @up->lock with spin_unlock_irq(); counterpart of
 * uart_port_lock_irq().
 */
static inline void uart_port_unlock_irq(struct uart_port *up)
{
        spin_unlock_irq(&up->lock);
}

/**
 * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts
 * @up:                Pointer to UART port structure
 * @flags:        The saved interrupt flags for restore
 *
 * Releases @up->lock with spin_unlock_irqrestore().  Unlike the locking
 * variants, @flags is passed by value here.
 */
static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
{
        spin_unlock_irqrestore(&up->lock, flags);
}

/*
 * Read from the port at @offset by calling the port's ->serial_in() callback
 * and return whatever that callback returns.
 */
static inline int serial_port_in(struct uart_port *up, int offset)
{
        return up->serial_in(up, offset);
}

/*
 * Write @value to the port at @offset by calling the port's ->serial_out()
 * callback.
 */
static inline void serial_port_out(struct uart_port *up, int offset, int value)
{
        up->serial_out(up, offset, value);
}

/**
 * enum uart_pm_state - power states for UARTs
 * @UART_PM_STATE_ON: UART is powered, up and operational
 * @UART_PM_STATE_OFF: UART is powered off
 * @UART_PM_STATE_UNDEFINED: sentinel
 *
 * Only ON (0) and OFF (3) carry explicit values; UNDEFINED follows OFF and
 * therefore evaluates to 4.
 */
enum uart_pm_state {
        UART_PM_STATE_ON = 0,
        UART_PM_STATE_OFF = 3, /* number taken from ACPI */
        UART_PM_STATE_UNDEFINED,
};

/*
 * This is the state information which is persistent across opens.
 */
struct uart_state {
        /* embedded tty_port; its ->tty is read by uart_tx_stopped() and uart_handle_break() */
        struct tty_port                port;

        /* one of enum uart_pm_state */
        enum uart_pm_state        pm_state;
        /* circular transmit buffer; its tail is advanced by uart_xmit_advance() */
        struct circ_buf                xmit;

        /* NOTE(review): refcount/remove_wait usage is not visible here -- see serial core */
        atomic_t                refcount;
        wait_queue_head_t        remove_wait;
        /* NOTE(review): presumably the low-level port bound to this state -- confirm */
        struct uart_port        *uart_port;
};

#define UART_XMIT_SIZE        PAGE_SIZE


/* number of characters left in xmit buffer before we ask for more */
#define WAKEUP_CHARS                256

/**
 * uart_xmit_advance - Consume transmitted chars from the xmit ring
 * @up: uart_port structure describing the port
 * @chars: number of characters sent
 *
 * Adds @chars to @up's icount.tx and moves the tail of the circular xmit
 * buffer forward by the same amount, wrapping at UART_XMIT_SIZE.
 */
static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars)
{
        struct circ_buf *ring = &up->state->xmit;

        /* Account the bytes that went out on the wire. */
        up->icount.tx += chars;

        /* UART_XMIT_SIZE - 1 is used as a wrap mask for the ring index. */
        ring->tail = (ring->tail + chars) & (UART_XMIT_SIZE - 1);
}

struct module;
struct tty_driver;

/*
 * Driver-level description handed to uart_register_driver() /
 * uart_unregister_driver() and to the port add/remove and suspend/resume
 * helpers declared in this header.
 */
struct uart_driver {
        struct module                *owner;
        const char                *driver_name;
        const char                *dev_name;
        int                         major;
        int                         minor;
        /* NOTE(review): presumably the number of ports this driver supports -- confirm */
        int                         nr;
        struct console                *cons;

        /*
         * these are private; the low level driver should not
         * touch these; they should be initialised to NULL
         */
        struct uart_state        *state;
        struct tty_driver        *tty_driver;
};

void uart_write_wakeup(struct uart_port *port);

/*
 * Baud rate helpers.
 */
void uart_update_timeout(struct uart_port *port, unsigned int cflag,
                         unsigned int baud);
unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
                                struct ktermios *old, unsigned int min,
                                unsigned int max);
unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud);

/*
 * Base timer interval for polling: half of the port timeout minus 2, with a
 * floor of 1 for port timeouts of 6 or less.
 */
static inline int uart_poll_timeout(struct uart_port *port)
{
        const int full = port->timeout;

        if (full <= 6)
                return 1;

        return full / 2 - 2;
}

/*
 * Console helpers.
 */
/*
 * Per-earlycon instance data: the console it backs, an embedded uart_port,
 * the option string and the baud rate.  A pointer to this structure is what
 * an earlycon_id ->setup() callback receives.
 */
struct earlycon_device {
        struct console *con;
        struct uart_port port;
        char options[16];                /* e.g., 115200n8 */
        unsigned int baud;
};

/*
 * Match entry for an early console, as emitted by OF_EARLYCON_DECLARE() /
 * EARLYCON_DECLARE(): a short name, a compatible string (empty for
 * EARLYCON_DECLARE()) and the setup callback to run for a match.
 */
struct earlycon_id {
        char        name[15];
        char        name_term;        /* In case compiler didn't '\0' term name */
        char        compatible[128];
        int        (*setup)(struct earlycon_device *, const char *options);
};

extern const struct earlycon_id *__earlycon_table[];
extern const struct earlycon_id *__earlycon_table_end[];

#if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE)
#define EARLYCON_USED_OR_UNUSED        __used
#else
#define EARLYCON_USED_OR_UNUSED        __maybe_unused
#endif

/*
 * Emit two objects for one early console:
 *  - a const struct earlycon_id named @unique_id, filled with the stringified
 *    @_name, the @compat string and the @fn setup callback, and
 *  - a const pointer to it, named __p<unique_id>, placed in the
 *    "__earlycon_table" section.
 * Both are tagged EARLYCON_USED_OR_UNUSED.
 */
#define _OF_EARLYCON_DECLARE(_name, compat, fn, unique_id)                \
        static const struct earlycon_id unique_id                        \
             EARLYCON_USED_OR_UNUSED __initconst                        \
                = { .name = __stringify(_name),                                \
                    .compatible = compat,                                \
                    .setup = fn  };                                        \
        static const struct earlycon_id EARLYCON_USED_OR_UNUSED                \
                __section("__earlycon_table")                                \
                * const __PASTE(__p, unique_id) = &unique_id

#define OF_EARLYCON_DECLARE(_name, compat, fn)                                \
        _OF_EARLYCON_DECLARE(_name, compat, fn,                                \
                             __UNIQUE_ID(__earlycon_##_name))

#define EARLYCON_DECLARE(_name, fn)        OF_EARLYCON_DECLARE(_name, "", fn)

extern int of_setup_earlycon(const struct earlycon_id *match,
                             unsigned long node,
                             const char *options);

#ifdef CONFIG_SERIAL_EARLYCON
extern bool earlycon_acpi_spcr_enable __initdata;
int setup_earlycon(char *buf);
#else
static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED;
static inline int setup_earlycon(char *buf) { return 0; }
#endif

/*
 * True only when @port is a console port (per uart_console()) and that
 * console has CON_ENABLED set in its flags.
 */
static inline bool uart_console_enabled(struct uart_port *port)
{
        if (!uart_console(port))
                return false;

        return (port->cons->flags & CON_ENABLED) != 0;
}

struct uart_port *uart_get_console(struct uart_port *ports, int nr,
                                   struct console *c);
int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr,
                        char **options);
void uart_parse_options(const char *options, int *baud, int *parity, int *bits,
                        int *flow);
int uart_set_options(struct uart_port *port, struct console *co, int baud,
                     int parity, int bits, int flow);
struct tty_driver *uart_console_device(struct console *co, int *index);
void uart_console_write(struct uart_port *port, const char *s,
                        unsigned int count,
                        void (*putchar)(struct uart_port *, int));

/*
 * Port/driver registration/removal
 */
int uart_register_driver(struct uart_driver *uart);
void uart_unregister_driver(struct uart_driver *uart);
int uart_add_one_port(struct uart_driver *reg, struct uart_port *port);
int uart_remove_one_port(struct uart_driver *reg, struct uart_port *port);
int uart_match_port(struct uart_port *port1, struct uart_port *port2);

/*
 * Power Management
 */
int uart_suspend_port(struct uart_driver *reg, struct uart_port *port);
int uart_resume_port(struct uart_driver *reg, struct uart_port *port);

/*
 * Helpers for a circ_buf used as a UART transmit ring of UART_XMIT_SIZE
 * bytes.  Note that every macro evaluates @circ more than once.
 */

/* the ring is empty when head and tail coincide */
#define uart_circ_empty(circ)                ((circ)->head == (circ)->tail)
/* reset both indices to 0 */
#define uart_circ_clear(circ)                ((circ)->head = (circ)->tail = 0)

/* CIRC_CNT() of the ring */
#define uart_circ_chars_pending(circ)        \
        (CIRC_CNT((circ)->head, (circ)->tail, UART_XMIT_SIZE))

/* CIRC_SPACE() of the ring */
#define uart_circ_chars_free(circ)        \
        (CIRC_SPACE((circ)->head, (circ)->tail, UART_XMIT_SIZE))

/*
 * Returns 1 when transmission is currently stopped, either because the
 * attached tty (if any) is stopped or because the port has hw_stopped set;
 * 0 otherwise.
 */
static inline int uart_tx_stopped(struct uart_port *port)
{
        struct tty_struct *tty = port->state->port.tty;
        bool tty_stopped = tty && tty->stopped;

        return tty_stopped || port->hw_stopped;
}

/* True when UPSTAT_CTS_ENABLE is set in the port status. */
static inline bool uart_cts_enabled(struct uart_port *uport)
{
        return (uport->status & UPSTAT_CTS_ENABLE) != 0;
}

/*
 * True when CTS flow control is enabled on the port but UPSTAT_AUTOCTS is
 * not, i.e. of the two status bits only UPSTAT_CTS_ENABLE is set.
 */
static inline bool uart_softcts_mode(struct uart_port *uport)
{
        const upstat_t cts_bits =
                uport->status & (UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS);

        return cts_bits == UPSTAT_CTS_ENABLE;
}

/*
 * The following are helper functions for the low level drivers.
 */

extern void uart_handle_dcd_change(struct uart_port *uport,
                unsigned int status);
extern void uart_handle_cts_change(struct uart_port *uport,
                unsigned int status);

extern void uart_insert_char(struct uart_port *port, unsigned int status,
                 unsigned int overrun, unsigned int ch, unsigned int flag);

void uart_xchar_out(struct uart_port *uport, int offset);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
#define SYSRQ_TIMEOUT        (HZ * 5)

bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch);

/*
 * Handle a received character while a SysRq window is open on @port.
 *
 * Returns 1 when @ch was consumed: either handed to handle_sysrq() (when
 * sysrq_mask() is non-zero) or accepted by uart_try_toggle_sysrq().
 * Returns 0 when no window is open, or when the window is closed here
 * without consuming @ch.
 */
static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
{
        /* No SysRq window armed: nothing to do. */
        if (!port->sysrq)
                return 0;

        /* A zero character or an expired window simply closes the window. */
        if (!ch || !time_before(jiffies, port->sysrq))
                goto close_window;

        if (sysrq_mask()) {
                handle_sysrq(ch);
                port->sysrq = 0;
                return 1;
        }

        /* The window is left open while the toggle sequence is matching. */
        if (uart_try_toggle_sysrq(port, ch))
                return 1;

close_window:
        port->sysrq = 0;

        return 0;
}

/*
 * Like uart_handle_sysrq_char(), but instead of calling handle_sysrq()
 * directly the character is stashed in @port->sysrq_ch, where
 * uart_unlock_and_check_sysrq() picks it up.
 *
 * Returns 1 when @ch was consumed (stashed, or accepted by
 * uart_try_toggle_sysrq()), 0 otherwise.
 */
static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch)
{
        /* No SysRq window armed: nothing to do. */
        if (!port->sysrq)
                return 0;

        /* A zero character or an expired window simply closes the window. */
        if (!ch || !time_before(jiffies, port->sysrq))
                goto close_window;

        if (sysrq_mask()) {
                port->sysrq_ch = ch;
                port->sysrq = 0;
                return 1;
        }

        /* The window is left open while the toggle sequence is matching. */
        if (uart_try_toggle_sysrq(port, ch))
                return 1;

close_window:
        port->sysrq = 0;

        return 0;
}

/**
 * uart_unlock_and_check_sysrq - Unlock the port and run a deferred SysRq
 * @port: UART port whose lock is currently held
 * @irqflags: interrupt flags to restore when unlocking
 *
 * Fetches and clears the character stashed in @port->sysrq_ch (see
 * uart_prepare_sysrq_char()) before the port lock is dropped, and calls
 * handle_sysrq() for it only after the unlock.  Ports without has_sysrq
 * are just unlocked.
 *
 * The lock is released through uart_port_unlock_irqrestore() so that this
 * helper uses the same port lock wrappers as the rest of the header.
 */
static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
{
        int sysrq_ch;

        if (!port->has_sysrq) {
                uart_port_unlock_irqrestore(port, irqflags);
                return;
        }

        /* Take the pending character before the lock is dropped. */
        sysrq_ch = port->sysrq_ch;
        port->sysrq_ch = 0;

        uart_port_unlock_irqrestore(port, irqflags);

        if (sysrq_ch)
                handle_sysrq(sysrq_ch);
}
#else        /* CONFIG_MAGIC_SYSRQ_SERIAL */
/* CONFIG_MAGIC_SYSRQ_SERIAL is off: no character is ever consumed as SysRq. */
static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
{
        return 0;
}
/* CONFIG_MAGIC_SYSRQ_SERIAL is off: nothing is stashed, @ch is never consumed. */
static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch)
{
        return 0;
}
/*
 * CONFIG_MAGIC_SYSRQ_SERIAL is off: there is never a deferred SysRq
 * character, so only drop the port lock (via the port lock wrapper, for
 * consistency with the rest of this header).
 */
static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
{
        uart_port_unlock_irqrestore(port, irqflags);
}
#endif        /* CONFIG_MAGIC_SYSRQ_SERIAL */

/*
 * Break handling: SysRq arming and SAK.
 *
 * Calls the port's optional ->handle_break() hook first.  With
 * CONFIG_MAGIC_SYSRQ_SERIAL, a break on a SysRq-capable console port with no
 * window open arms the SysRq window (SYSRQ_TIMEOUT from now) and returns 1;
 * a break while a window is already open closes it.  Otherwise, if UPF_SAK
 * is set in the port flags, do_SAK() is called on the port's tty.
 * Returns 0 in every case except the "window armed" one.
 */
static inline int uart_handle_break(struct uart_port *port)
{
        struct uart_state *st = port->state;

        if (port->handle_break)
                port->handle_break(port);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
        if (port->has_sysrq && uart_console(port)) {
                if (port->sysrq) {
                        /* Second break inside the window: cancel it. */
                        port->sysrq = 0;
                } else {
                        port->sysrq = jiffies + SYSRQ_TIMEOUT;
                        return 1;
                }
        }
#endif
        if (port->flags & UPF_SAK)
                do_SAK(st->port.tty);

        return 0;
}

/*
 * UART_ENABLE_MS - determine if port should enable modem status irqs
 *
 * Evaluates to non-zero when UPF_HARDPPS_CD is set in the port flags, when
 * CRTSCTS is set in @cflag, or when CLOCAL is clear in @cflag.
 */
#define UART_ENABLE_MS(port, cflag)                                \
        ((((port)->flags & UPF_HARDPPS_CD) != 0) ||                \
         (((cflag) & CRTSCTS) != 0) ||                                \
         (((cflag) & CLOCAL) == 0))

int uart_get_rs485_mode(struct uart_port *port);
#endif /* LINUX_SERIAL_CORE_H */





















    1 

    1 
    1 







































    1 





















    1 

    1 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <asm/unaligned.h>
#include <crypto/chacha.h>

/*
 * One ChaCha quarter-round applied in place to the state words
 * x[a], x[b], x[c] and x[d].
 */
static inline void chacha_quarter_round(u32 *x, int a, int b, int c, int d)
{
        x[a] += x[b];
        x[d] = rol32(x[d] ^ x[a], 16);

        x[c] += x[d];
        x[b] = rol32(x[b] ^ x[c], 12);

        x[a] += x[b];
        x[d] = rol32(x[d] ^ x[a], 8);

        x[c] += x[d];
        x[b] = rol32(x[b] ^ x[c], 7);
}

/*
 * Run @nrounds ChaCha rounds in place over the 16-word state @x.  Rounds are
 * processed in pairs: four quarter-rounds over the columns, then four over
 * the diagonals.  Within each group the quarter-rounds touch disjoint words.
 */
static void chacha_permute(u32 *x, int nrounds)
{
        int round;

        /* whitelist the allowed round counts */
        WARN_ON_ONCE(nrounds != 20 && nrounds != 12);

        for (round = 0; round < nrounds; round += 2) {
                /* column round */
                chacha_quarter_round(x, 0, 4,  8, 12);
                chacha_quarter_round(x, 1, 5,  9, 13);
                chacha_quarter_round(x, 2, 6, 10, 14);
                chacha_quarter_round(x, 3, 7, 11, 15);

                /* diagonal round */
                chacha_quarter_round(x, 0, 5, 10, 15);
                chacha_quarter_round(x, 1, 6, 11, 12);
                chacha_quarter_round(x, 2, 7,  8, 13);
                chacha_quarter_round(x, 3, 4,  9, 14);
        }
}

/**
 * chacha_block_generic - generate one keystream block and increment block counter
 * @state: input state matrix (16 32-bit words)
 * @stream: output keystream block (64 bytes)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
 * The caller has already converted the endianness of the input.  A private
 * copy of @state is permuted, added word by word to the original state and
 * written to @stream as little-endian 32-bit words.  The block counter in
 * @state[12] is then incremented and the private copy is wiped.
 */
void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
{
        u32 work[16];
        u8 *out = stream;
        int word;

        memcpy(work, state, sizeof(work));

        chacha_permute(work, nrounds);

        for (word = 0; word < ARRAY_SIZE(work); word++) {
                put_unaligned_le32(work[word] + state[word], out);
                out += sizeof(u32);
        }

        state[12]++;

        memzero_explicit(work, sizeof(work));
}
EXPORT_SYMBOL(chacha_block_generic);

/**
 * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
 * @state: input state matrix (16 32-bit words)
 * @stream: output (8 32-bit words)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
 * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  Unlike
 * the full ChaCha block function, the initial state is not added back after
 * the permutation, and only words 0-3 and 12-15 of the permuted state are
 * output.  It should not be used for streaming directly.
 */
void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
{
        u32 work[16];

        memcpy(work, state, sizeof(work));

        chacha_permute(work, nrounds);

        /* First and last rows of the permuted state form the output. */
        memcpy(stream, work, 4 * sizeof(u32));
        memcpy(stream + 4, work + 12, 4 * sizeof(u32));

        memzero_explicit(work, sizeof(work));
}
EXPORT_SYMBOL(hchacha_block_generic);
























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPECIAL_INSNS_H
#define _ASM_X86_SPECIAL_INSNS_H


#ifdef __KERNEL__

#include <asm/nops.h>
#include <asm/processor-flags.h>
#include <linux/irqflags.h>
#include <linux/jump_label.h>

/*
 * The compiler should not reorder volatile asm statements with respect to each
 * other: they should execute in program order. However GCC 4.9.x and 5.x have
 * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder
 * volatile asm. The write functions are not affected since they have memory
 * clobbers preventing reordering. To prevent reads from being reordered with
 * respect to writes, use a dummy memory operand.
 */

#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL)

void native_write_cr0(unsigned long val);

/*
 * Read CR0.  The dummy __FORCE_ORDER memory input keeps this read from being
 * reordered with respect to the control register writes.
 */
static inline unsigned long native_read_cr0(void)
{
        unsigned long val;
        asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/*
 * Read CR2.  Uses the dummy __FORCE_ORDER memory input, like the other
 * control register reads.
 */
static __always_inline unsigned long native_read_cr2(void)
{
        unsigned long val;
        asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Write @val to CR2; the "memory" clobber prevents reordering of the write. */
static __always_inline void native_write_cr2(unsigned long val)
{
        asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}

/*
 * Read the raw CR3 value.  Uses the dummy __FORCE_ORDER memory input, like
 * the other control register reads.
 */
static inline unsigned long __native_read_cr3(void)
{
        unsigned long val;
        asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Write @val to CR3; the "memory" clobber prevents reordering of the write. */
static inline void native_write_cr3(unsigned long val)
{
        asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}

/*
 * Read CR4.  On 32-bit builds the read is covered by an exception table
 * entry and reports 0 when it faults; on 64-bit it is a plain read.
 */
static inline unsigned long native_read_cr4(void)
{
        unsigned long val;
#ifdef CONFIG_X86_32
        /*
         * This could fault if CR4 does not exist.  Non-existent CR4
         * is functionally equivalent to CR4 == 0.  Keep it simple and pretend
         * that CR4 == 0 on CPUs that don't have CR4.
         *
         * The "0" (0) input preloads the output register with 0, and the
         * exception table entry resumes at label 2 after a fault at label 1,
         * so a faulting read leaves val == 0.
         */
        asm volatile("1: mov %%cr4, %0\n"
                     "2:\n"
                     _ASM_EXTABLE(1b, 2b)
                     : "=r" (val) : "0" (0), __FORCE_ORDER);
#else
        /* CR4 always exists on x86_64. */
        asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER);
#endif
        return val;
}

void native_write_cr4(unsigned long val);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* Return the current PKRU value, read with the raw-encoded RDPKRU opcode. */
static inline u32 rdpkru(void)
{
        u32 ecx = 0;
        u32 edx, pkru;

        /*
         * "rdpkru" instruction.  Places PKRU contents into EAX,
         * clears EDX and requires that ecx=0.
         *
         * edx is listed as an output only because the instruction writes
         * EDX; its value is not used.
         */
        asm volatile(".byte 0x0f,0x01,0xee\n\t"
                     : "=a" (pkru), "=d" (edx)
                     : "c" (ecx));
        return pkru;
}

/* Load @pkru into PKRU unconditionally, using the raw-encoded WRPKRU opcode. */
static inline void wrpkru(u32 pkru)
{
        u32 ecx = 0, edx = 0;

        /*
         * "wrpkru" instruction.  Loads contents in EAX to PKRU,
         * requires that ecx = edx = 0.
         */
        asm volatile(".byte 0x0f,0x01,0xef\n\t"
                     : : "a" (pkru), "c"(ecx), "d"(edx));
}

/* Set PKRU to @pkru, skipping the write when the value is already current. */
static inline void __write_pkru(u32 pkru)
{
        /*
         * WRPKRU is relatively expensive compared to RDPKRU, so read
         * first and only write when the value would actually change.
         */
        if (rdpkru() != pkru)
                wrpkru(pkru);
}

#else
/* Protection keys not configured: PKRU always reads as 0. */
static inline u32 rdpkru(void)
{
        return 0;
}

/* Protection keys not configured: writing PKRU is a no-op. */
static inline void __write_pkru(u32 pkru)
{
}
#endif

/* Execute WBINVD; the "memory" clobber acts as a compiler barrier around it. */
static inline void native_wbinvd(void)
{
        asm volatile("wbinvd": : :"memory");
}

extern asmlinkage void asm_load_gs_index(unsigned int selector);

/*
 * Call asm_load_gs_index() for @selector with local interrupts disabled,
 * restoring the previous interrupt state afterwards.
 */
static inline void native_load_gs_index(unsigned int selector)
{
        unsigned long flags;

        local_irq_save(flags);
        asm_load_gs_index(selector);
        local_irq_restore(flags);
}

/*
 * Read CR4.  Defined outside the CONFIG_PARAVIRT_XXL conditional, so it
 * always maps to native_read_cr4().
 */
static inline unsigned long __read_cr4(void)
{
        return native_read_cr4();
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else

/* Non-paravirt build: read CR0 directly via native_read_cr0(). */
static inline unsigned long read_cr0(void)
{
        return native_read_cr0();
}

/* Non-paravirt build: write CR0 via native_write_cr0(). */
static inline void write_cr0(unsigned long x)
{
        native_write_cr0(x);
}

/* Non-paravirt build: read CR2 directly via native_read_cr2(). */
static __always_inline unsigned long read_cr2(void)
{
        return native_read_cr2();
}

/* Non-paravirt build: write CR2 via native_write_cr2(). */
static __always_inline void write_cr2(unsigned long x)
{
        native_write_cr2(x);
}

/*
 * Careful!  CR3 contains more than just an address.  You probably want
 * read_cr3_pa() instead.
 *
 * Non-paravirt build: returns the raw register value from
 * __native_read_cr3().
 */
static inline unsigned long __read_cr3(void)
{
        return __native_read_cr3();
}

/* Non-paravirt build: write CR3 via native_write_cr3(). */
static inline void write_cr3(unsigned long x)
{
        native_write_cr3(x);
}

/* Non-paravirt build: write CR4 via native_write_cr4(). */
static inline void __write_cr4(unsigned long x)
{
        native_write_cr4(x);
}

/* Non-paravirt build: execute WBINVD via native_wbinvd(). */
static inline void wbinvd(void)
{
        native_wbinvd();
}

#ifdef CONFIG_X86_64

/* Non-paravirt 64-bit build: load the GS index via native_load_gs_index(). */
static inline void load_gs_index(unsigned int selector)
{
        native_load_gs_index(selector);
}

#endif

#endif /* CONFIG_PARAVIRT_XXL */

/*
 * Issue CLFLUSH on the byte at @__p.  The "+m" operand tells the compiler
 * that the pointed-to memory is both read and written by the instruction.
 */
static inline void clflush(volatile void *__p)
{
        asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}

/*
 * Flush via an alternative: by default a CLFLUSH preceded by the
 * NOP_DS_PREFIX byte is emitted; when X86_FEATURE_CLFLUSHOPT is set it is
 * replaced by the 0x66-prefixed encoding.
 */
static inline void clflushopt(volatile void *__p)
{
        alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
                       ".byte 0x66; clflush %P0",
                       X86_FEATURE_CLFLUSHOPT,
                       "+m" (*(volatile char __force *)__p));
}

/*
 * Issue the instruction selected by ALTERNATIVE_2 on the address in @__p:
 * CLFLUSH (with the NOP_DS_PREFIX byte) by default, the 0x66-prefixed
 * CLFLUSHOPT encoding with X86_FEATURE_CLFLUSHOPT, or the raw CLWB encoding
 * with X86_FEATURE_CLWB.
 */
static inline void clwb(volatile void *__p)
{
        /* 64-byte object type, so the "+m" operand below covers 64 bytes at @__p */
        volatile struct { char x[64]; } *p = __p;

        asm volatile(ALTERNATIVE_2(
                ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
                ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
                X86_FEATURE_CLFLUSHOPT,
                ".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
                X86_FEATURE_CLWB)
                : [p] "+m" (*p)
                : [pax] "a" (p));
}

#define nop() asm volatile ("nop")

/*
 * Execute the SERIALIZE instruction, emitted as raw opcode bytes.  The
 * "memory" clobber also makes it a compiler barrier.
 */
static __always_inline void serialize(void)
{
        /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
        asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
}

/*
 * Issue MOVDIR64B (emitted as raw opcode bytes) with @src in RDX and @dst
 * in RAX.  Both are described to the compiler as 64-byte objects.
 *
 * The dst parameter must be 64-bytes aligned
 */
static inline void movdir64b(void *dst, const void *src)
{
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } *__dst = dst;

        /*
         * MOVDIR64B %(rdx), rax.
         *
         * Both __src and __dst must be memory constraints in order to tell the
         * compiler that no other memory accesses should be reordered around
         * this one.
         *
         * Also, both must be supplied as lvalues because this tells
         * the compiler what the object is (its size) the instruction accesses.
         * I.e., not the pointers but what they point to, thus the deref'ing '*'.
         */
        asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
                     : "+m" (*__dst)
                     :  "m" (*__src), "a" (__dst), "d" (__src));
}

/**
 * enqcmds - Enqueue a command in supervisor (CPL0) mode
 * @dst: destination, in MMIO space (must be 512-bit aligned)
 * @src: 512 bits memory operand
 *
 * The ENQCMDS instruction allows software to write a 512-bit command to
 * a 512-bit-aligned special MMIO region that supports the instruction.
 * A return status is loaded into the ZF flag in the RFLAGS register.
 * ZF = 0 equates to success, and ZF = 1 indicates retry or error.
 *
 * This function issues the ENQCMDS instruction to submit data from
 * kernel space to MMIO space, in a unit of 512 bits. Order of data access
 * is not guaranteed, nor is a memory barrier performed afterwards. It
 * returns 0 on success and -EAGAIN on failure.
 *
 * Warning: Do not use this helper unless your driver has checked that the
 * ENQCMDS instruction is supported on the platform and the device accepts
 * ENQCMDS.
 */
static inline int enqcmds(void __iomem *dst, const void *src)
{
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } __iomem *__dst = dst;
        /* receives ZF after the instruction (via CC_SET/CC_OUT) */
        bool zf;

        /*
         * ENQCMDS %(rdx), rax
         *
         * See movdir64b()'s comment on operand specification.
         *
         * NOTE(review): the trailing 0x66, 0x90 bytes look like a two-byte
         * NOP appended after the instruction -- confirm why it is needed.
         */
        asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90"
                     CC_SET(z)
                     : CC_OUT(z) (zf), "+m" (*__dst)
                     : "m" (*__src), "a" (__dst), "d" (__src));

        /* Submission failure is indicated via EFLAGS.ZF=1 */
        if (zf)
                return -EAGAIN;

        return 0;
}

#endif /* __KERNEL__ */

#endif /* _ASM_X86_SPECIAL_INSNS_H */































    1 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_COREDUMP_H
#define _LINUX_SCHED_COREDUMP_H

#include <linux/mm_types.h>

#define SUID_DUMP_DISABLE        0        /* No setuid dumping */
#define SUID_DUMP_USER                1        /* Dump as user of process */
#define SUID_DUMP_ROOT                2        /* Dump as root */

/* mm flags */

/* for SUID_DUMP_* above */
#define MMF_DUMPABLE_BITS 2
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)

extern void set_dumpable(struct mm_struct *mm, int value);
/*
 * Extract the suid_dumpable setting (the low MMF_DUMPABLE_BITS bits) from a
 * raw mm flags word.  The result is the actual SUID_DUMP_* value, not a
 * boolean: code checking for privilege transitions must compare it against
 * SUID_DUMP_USER.
 */
static inline int __get_dumpable(unsigned long mm_flags)
{
        const unsigned long dumpable = mm_flags & MMF_DUMPABLE_MASK;

        return (int)dumpable;
}

/*
 * Return the suid_dumpable value stored in @mm->flags; see __get_dumpable()
 * for how the result must be interpreted.
 */
static inline int get_dumpable(struct mm_struct *mm)
{
        return __get_dumpable(mm->flags);
}

/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE        2
#define MMF_DUMP_ANON_SHARED        3
#define MMF_DUMP_MAPPED_PRIVATE        4
#define MMF_DUMP_MAPPED_SHARED        5
#define MMF_DUMP_ELF_HEADERS        6
#define MMF_DUMP_HUGETLB_PRIVATE 7
#define MMF_DUMP_HUGETLB_SHARED  8
#define MMF_DUMP_DAX_PRIVATE        9
#define MMF_DUMP_DAX_SHARED        10

/*
 * The coredump filter occupies MMF_DUMP_FILTER_BITS (9) bits starting right
 * above the dumpable bits, i.e. bit positions 2..10, matching the
 * MMF_DUMP_* bit numbers MMF_DUMP_ANON_PRIVATE..MMF_DUMP_DAX_SHARED.
 */
#define MMF_DUMP_FILTER_SHIFT        MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS        9
#define MMF_DUMP_FILTER_MASK \
        (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
/*
 * Default filter: anonymous private and shared mappings, private hugetlb
 * mappings and, depending on configuration, ELF headers.
 * MMF_DUMP_MASK_DEFAULT_ELF is defined just below; that is fine because
 * macros are only expanded at the point of use.
 */
#define MMF_DUMP_FILTER_DEFAULT \
        ((1 << MMF_DUMP_ANON_PRIVATE) |        (1 << MMF_DUMP_ANON_SHARED) |\
         (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)

#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
# define MMF_DUMP_MASK_DEFAULT_ELF        (1 << MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF        0
#endif
                                        /* leave room for more dump flags */
#define MMF_VM_MERGEABLE        16        /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE                17        /* set when VM_HUGEPAGE is set on vma */
/*
 * This one-shot flag is dropped due to necessity of changing exe once again
 * on NFS restore
 */
//#define MMF_EXE_FILE_CHANGED        18        /* see prctl_set_mm_exe_file() */

#define MMF_HAS_UPROBES                19        /* has uprobes */
#define MMF_RECALC_UPROBES        20        /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP                21        /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE                22        /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE        23      /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP                24        /* disable THP for all VMAs */
#define MMF_OOM_VICTIM                25        /* mm is the oom victim */
#define MMF_OOM_REAP_QUEUED        26        /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS        27        /* mm is shared between processes */
#define MMF_DISABLE_THP_MASK        (1 << MMF_DISABLE_THP)

#define MMF_INIT_MASK                (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
                                 MMF_DISABLE_THP_MASK)

#endif /* _LINUX_SCHED_COREDUMP_H */











































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMER_H
#define _LINUX_TIMER_H

#include <linux/list.h>
#include <linux/ktime.h>
#include <linux/stddef.h>
#include <linux/debugobjects.h>
#include <linux/stringify.h>

struct timer_list {
        /*
         * All fields that change during normal runtime grouped to the
         * same cacheline
         */
        /* list linkage; timer_pending() checks whether it is hashed */
        struct hlist_node        entry;
        /* expiry time (presumably in jiffies -- confirm against timer core) */
        unsigned long                expires;
        /* callback; it is handed a struct timer_list pointer */
        void                        (*function)(struct timer_list *);
        /* TIMER_* bits */
        u32                        flags;

#ifdef CONFIG_LOCKDEP
        /* lockdep state, only present with CONFIG_LOCKDEP */
        struct lockdep_map        lockdep_map;
#endif
};

#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting the lockdep_map key
 * (second argument) here is required, otherwise it could be initialised to
 * the copy of the lockdep_map later! The string "<file>:<line>" is used as
 * the name of the lockdep_map and the pointer to that string as its key.
 *
 * Without CONFIG_LOCKDEP the macro expands to nothing.
 */
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)                                \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
#else
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
#endif

/**
 * @TIMER_DEFERRABLE: A deferrable timer will work normally when the
 * system is busy, but will not cause a CPU to come out of idle just
 * to service it; instead, the timer will be serviced when the CPU
 * eventually wakes up with a subsequent non-deferrable timer.
 *
 * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and
 * it's safe to wait for the completion of the running instance from
 * IRQ handlers, for example, by calling del_timer_sync().
 *
 * Note: The irq disabled callback execution is a special case for
 * workqueue locking issues. It's not meant for executing random crap
 * with interrupts disabled. Abuse is monitored!
 *
 * @TIMER_PINNED: A pinned timer will not be affected by any timer
 * placement heuristics (like, NOHZ) and will always expire on the CPU
 * on which the timer was enqueued.
 *
 * Note: Because enqueuing of timers can migrate the timer from one
 * CPU to another, pinned timers are not guaranteed to stay on the
 * initially selected CPU.  They move to the CPU on which the enqueue
 * function is invoked via mod_timer() or add_timer().  If the timer
 * should be placed on a particular CPU, then add_timer_on() has to be
 * used.
 */
/*
 * Bit layout implied by the values below:
 *   bits  0-17  TIMER_CPUMASK
 *   bit   18    TIMER_MIGRATING
 *   bit   19    TIMER_DEFERRABLE
 *   bit   20    TIMER_PINNED
 *   bit   21    TIMER_IRQSAFE
 *   bits 22-31  TIMER_ARRAYMASK (field starts at TIMER_ARRAYSHIFT)
 *
 * NOTE(review): the CPU and ARRAY fields are presumably maintained by the
 * timer core rather than by callers -- confirm in kernel/time/timer.c.
 */
#define TIMER_CPUMASK                0x0003FFFF
#define TIMER_MIGRATING                0x00040000
#define TIMER_BASEMASK                (TIMER_CPUMASK | TIMER_MIGRATING)
#define TIMER_DEFERRABLE        0x00080000
#define TIMER_PINNED                0x00100000
#define TIMER_IRQSAFE                0x00200000
/* The three behaviour flags documented above, combined. */
#define TIMER_INIT_FLAGS        (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
#define TIMER_ARRAYSHIFT        22
#define TIMER_ARRAYMASK                0xFFC00000

/* TIMER_INIT_FLAGS plus TIMER_MIGRATING; presumably the bits shown by tracing. */
#define TIMER_TRACE_FLAGMASK        (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)

/*
 * Static initializer for a struct timer_list: sets entry.next to
 * TIMER_ENTRY_STATIC, stores the callback and the flags and, with
 * CONFIG_LOCKDEP, sets up the lockdep map using "<file>:<line>" of the
 * place where the macro is expanded.
 */
#define __TIMER_INITIALIZER(_function, _flags) {                \
                .entry = { .next = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .flags = (_flags),                                \
                __TIMER_LOCKDEP_MAP_INITIALIZER(                \
                        __FILE__ ":" __stringify(__LINE__))        \
        }

/*
 * Define a struct timer_list called @_name that is statically initialized
 * with @_function as its callback and with no flags set.
 */
#define DEFINE_TIMER(_name, _function)                                \
        struct timer_list _name =                                \
                __TIMER_INITIALIZER(_function, 0)

/*
 * LOCKDEP and DEBUG timer interfaces.
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key);

/*
 * Initialize a timer that lives on the stack.  With
 * CONFIG_DEBUG_OBJECTS_TIMERS an out-of-line implementation is used;
 * otherwise this simply forwards all arguments to init_timer_key().
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void init_timer_on_stack_key(struct timer_list *timer,
                                    void (*func)(struct timer_list *),
                                    unsigned int flags, const char *name,
                                    struct lock_class_key *key);
#else
static inline void init_timer_on_stack_key(struct timer_list *timer,
                                           void (*func)(struct timer_list *),
                                           unsigned int flags,
                                           const char *name,
                                           struct lock_class_key *key)
{
        init_timer_key(timer, func, flags, name, key);
}
#endif

/*
 * Runtime initialization helpers.  With CONFIG_LOCKDEP every expansion
 * declares its own static lock_class_key and passes the stringified timer
 * expression as the name; without it both name and key are NULL.
 */
#ifdef CONFIG_LOCKDEP
#define __init_timer(_timer, _fn, _flags)                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\
        } while (0)

#define __init_timer_on_stack(_timer, _fn, _flags)                        \
        do {                                                                \
                static struct lock_class_key __key;                        \
                init_timer_on_stack_key((_timer), (_fn), (_flags),        \
                                        #_timer, &__key);                 \
        } while (0)
#else
#define __init_timer(_timer, _fn, _flags)                                \
        init_timer_key((_timer), (_fn), (_flags), NULL, NULL)
#define __init_timer_on_stack(_timer, _fn, _flags)                        \
        init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL)
#endif

/**
 * timer_setup - prepare a timer for first use
 * @timer: the timer in question
 * @callback: the function to call when timer expires
 * @flags: any TIMER_* flags
 *
 * Regular timer initialization should use either DEFINE_TIMER() above,
 * or timer_setup(). For timers on the stack, timer_setup_on_stack() must
 * be used and must be balanced with a call to destroy_timer_on_stack().
 */
#define timer_setup(timer, callback, flags)                        \
        __init_timer((timer), (callback), (flags))

/*
 * timer_setup_on_stack - prepare an on-stack timer for first use
 * Same arguments as timer_setup(); expands to __init_timer_on_stack().
 */
#define timer_setup_on_stack(timer, callback, flags)                \
        __init_timer_on_stack((timer), (callback), (flags))

/*
 * Counterpart of timer_setup_on_stack().  Only does work when
 * CONFIG_DEBUG_OBJECTS_TIMERS is enabled; otherwise it is an empty inline.
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void destroy_timer_on_stack(struct timer_list *timer);
#else
static inline void destroy_timer_on_stack(struct timer_list *timer) { }
#endif

/*
 * from_timer - get the structure that embeds a timer
 * @var: pointer variable of the containing type (only its type is used)
 * @callback_timer: pointer to the embedded struct timer_list
 * @timer_fieldname: name of the timer_list member in the containing type
 *
 * Evaluates to container_of() on @callback_timer.
 */
#define from_timer(var, callback_timer, timer_fieldname) \
        container_of(callback_timer, typeof(*var), timer_fieldname)

/**
 * timer_pending - is a timer pending?
 * @timer: the timer in question
 *
 * Reports whether @timer is currently queued by taking a lockless look at
 * its list entry.  Callers have to serialize against other operations on
 * this timer themselves (interrupt contexts, other CPUs on SMP, ...).
 *
 * Return: 1 if the timer is pending, 0 if not.
 */
static inline int timer_pending(const struct timer_list * timer)
{
        if (hlist_unhashed_lockless(&timer->entry))
                return 0;

        return 1;
}

extern void add_timer_on(struct timer_list *timer, int cpu);
extern int del_timer(struct timer_list * timer);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
extern int timer_reduce(struct timer_list *timer, unsigned long expires);

/*
 * The jiffies value which is added to now, when there is no timer
 * in the timer wheel:
 */
#define NEXT_TIMER_MAX_DELTA        ((1UL << 30) - 1)

extern void add_timer(struct timer_list *timer);

extern int try_to_del_timer_sync(struct timer_list *timer);
extern int timer_delete_sync(struct timer_list *timer);

/**
 * del_timer_sync - Delete a pending timer and wait for a running callback
 * @timer:        The timer to be deleted
 *
 * Legacy name kept for existing callers; it hands @timer straight to
 * timer_delete_sync() and passes its result back.  See timer_delete_sync()
 * for the detailed explanation.
 *
 * Do not use in new code. Use timer_delete_sync() instead.
 */
static inline int del_timer_sync(struct timer_list *timer)
{
        int ret = timer_delete_sync(timer);

        return ret;
}

/* Alias that expands to del_timer_sync(). */
#define del_singleshot_timer_sync(t) del_timer_sync(t)

extern void init_timers(void);
extern void run_local_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
struct ctl_table;

extern unsigned int sysctl_timer_migration;
int timer_migration_handler(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos);
#endif

unsigned long __round_jiffies(unsigned long j, int cpu);
unsigned long __round_jiffies_relative(unsigned long j, int cpu);
unsigned long round_jiffies(unsigned long j);
unsigned long round_jiffies_relative(unsigned long j);

unsigned long __round_jiffies_up(unsigned long j, int cpu);
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
unsigned long round_jiffies_up(unsigned long j);
unsigned long round_jiffies_up_relative(unsigned long j);

/*
 * Per-CPU timer hooks for CPU hotplug.  Without CONFIG_HOTPLUG_CPU both
 * names are defined as NULL.
 * NOTE(review): presumably so they can be used as optional callback
 * pointers in the hotplug state table -- confirm against the users.
 */
#ifdef CONFIG_HOTPLUG_CPU
int timers_prepare_cpu(unsigned int cpu);
int timers_dead_cpu(unsigned int cpu);
#else
#define timers_prepare_cpu        NULL
#define timers_dead_cpu                NULL
#endif

#endif

































    1 


    1 





























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
// SPDX-License-Identifier: GPL-2.0-or-later
/* In-software asymmetric public-key crypto subtype
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) "PKEY: "fmt
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/scatterlist.h>
#include <keys/asymmetric-subtype.h>
#include <crypto/public_key.h>
#include <crypto/akcipher.h>
#include <crypto/sm2.h>
#include <crypto/sm3_base.h>

MODULE_DESCRIPTION("In-software asymmetric public-key subtype");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");

/*
 * Provide a part of a description of the key for /proc/keys.
 */
static void public_key_describe(const struct key *asymmetric_key,
                                struct seq_file *m)
{
        struct public_key *key = asymmetric_key->payload.data[asym_crypto];

        if (key)
                seq_printf(m, "%s.%s", key->id_type, key->pkey_algo);
}

/*
 * Release a public_key together with the key material and parameter
 * buffers it owns.  A NULL @key is accepted and ignored.
 */
void public_key_free(struct public_key *key)
{
        if (!key)
                return;

        kfree(key->key);
        kfree(key->params);
        kfree(key);
}
EXPORT_SYMBOL_GPL(public_key_free);

/*
 * Destroy the payload of a public key subtype key: @payload0 is released
 * as a public key via public_key_free() and @payload3 as a signature via
 * public_key_signature_free().
 */
static void public_key_destroy(void *payload0, void *payload3)
{
        public_key_free(payload0);
        public_key_signature_free(payload3);
}

/*
 * Determine the crypto algorithm name.
 *
 * @encoding:  data encoding, either "pkcs1" or "raw"
 * @hash_algo: hash name to embed in the pkcs1pad() template, or NULL
 * @pkey:      key whose pkey_algo names the underlying algorithm
 * @alg_name:  output buffer of CRYPTO_MAX_ALG_NAME bytes
 *
 * Returns 0 on success, -EINVAL if the resulting name does not fit into
 * @alg_name and -ENOPKG if @encoding is not recognised.
 */
static
int software_key_determine_akcipher(const char *encoding,
                                    const char *hash_algo,
                                    const struct public_key *pkey,
                                    char alg_name[CRYPTO_MAX_ALG_NAME])
{
        int n;

        if (strcmp(encoding, "pkcs1") == 0) {
                /* The data handled by the RSA algorithm is typically padded
                 * and encoded in some manner, such as EMSA-PKCS1-1_5 [RFC3447
                 * sec 8.2].
                 */
                if (!hash_algo)
                        n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
                                     "pkcs1pad(%s)",
                                     pkey->pkey_algo);
                else
                        n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
                                     "pkcs1pad(%s,%s)",
                                     pkey->pkey_algo, hash_algo);
        } else if (strcmp(encoding, "raw") == 0) {
                /* Bounded copy: an over-long algorithm name must be
                 * rejected rather than overrun the caller's buffer.
                 */
                n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "%s",
                             pkey->pkey_algo);
        } else {
                return -ENOPKG;
        }

        /* snprintf() reports the untruncated length, so this detects
         * truncation in every branch above.
         */
        return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0;
}

/*
 * Store @val at @dst in its in-memory (native) representation and return
 * the position immediately after the bytes written.
 */
static u8 *pkey_pack_u32(u8 *dst, u32 val)
{
        u8 *next = dst + sizeof(val);

        memcpy(dst, &val, sizeof(val));
        return next;
}

/*
 * Query information about a key.
 *
 * Resolves the akcipher algorithm name for the requested encoding/hash,
 * loads the key into a freshly allocated transform and fills @info from
 * crypto_akcipher_maxsize().  Encrypt and verify are always reported as
 * supported; decrypt and sign only for private keys.
 *
 * Returns 0 on success or a negative error code.
 */
static int software_key_query(const struct kernel_pkey_params *params,
                              struct kernel_pkey_query *info)
{
        struct crypto_akcipher *tfm;
        struct public_key *pkey = params->key->payload.data[asym_crypto];
        char alg_name[CRYPTO_MAX_ALG_NAME];
        u8 *key, *ptr;
        int ret, len;

        ret = software_key_determine_akcipher(params->encoding,
                                              params->hash_algo,
                                              pkey, alg_name);
        if (ret < 0)
                return ret;

        tfm = crypto_alloc_akcipher(alg_name, 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        ret = -ENOMEM;
        /* Blob layout: key bytes | u32 algo | u32 paramlen | params */
        key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen,
                      GFP_KERNEL);
        if (!key)
                goto error_free_tfm;
        memcpy(key, pkey->key, pkey->keylen);
        ptr = key + pkey->keylen;
        ptr = pkey_pack_u32(ptr, pkey->algo);
        ptr = pkey_pack_u32(ptr, pkey->paramlen);
        memcpy(ptr, pkey->params, pkey->paramlen);

        /* Only keylen is passed as the length; the algo/params trailer
         * sits directly behind the key bytes in the same buffer.
         */
        if (pkey->key_is_private)
                ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen);
        else
                ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen);
        if (ret < 0)
                goto error_free_key;

        /* Every size limit is derived from the transform's maximum size. */
        len = crypto_akcipher_maxsize(tfm);
        info->key_size = len * 8;
        info->max_data_size = len;
        info->max_sig_size = len;
        info->max_enc_size = len;
        info->max_dec_size = len;
        info->supported_ops = (KEYCTL_SUPPORTS_ENCRYPT |
                               KEYCTL_SUPPORTS_VERIFY);
        if (pkey->key_is_private)
                info->supported_ops |= (KEYCTL_SUPPORTS_DECRYPT |
                                        KEYCTL_SUPPORTS_SIGN);
        ret = 0;

        /* The success path falls through the cleanup labels as well. */
error_free_key:
        kfree(key);
error_free_tfm:
        crypto_free_akcipher(tfm);
        pr_devel("<==%s() = %d\n", __func__, ret);
        return ret;
}

/*
 * Do encryption, decryption and signing ops.
 *
 * @params: operation selector (params->op), key, encoding, hash and the
 *          in_len/out_len buffer sizes
 * @in:     input buffer of params->in_len bytes
 * @out:    output buffer of params->out_len bytes
 *
 * Returns the request's dst_len on success or a negative error code.
 * Any params->op other than encrypt, decrypt or sign hits BUG().
 */
static int software_key_eds_op(struct kernel_pkey_params *params,
                               const void *in, void *out)
{
        const struct public_key *pkey = params->key->payload.data[asym_crypto];
        struct akcipher_request *req;
        struct crypto_akcipher *tfm;
        struct crypto_wait cwait;
        struct scatterlist in_sg, out_sg;
        char alg_name[CRYPTO_MAX_ALG_NAME];
        char *key, *ptr;
        int ret;

        pr_devel("==>%s()\n", __func__);

        ret = software_key_determine_akcipher(params->encoding,
                                              params->hash_algo,
                                              pkey, alg_name);
        if (ret < 0)
                return ret;

        tfm = crypto_alloc_akcipher(alg_name, 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* Preset for the two allocation failures below. */
        ret = -ENOMEM;
        req = akcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req)
                goto error_free_tfm;

        /* Blob layout: key bytes | u32 algo | u32 paramlen | params */
        key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen,
                      GFP_KERNEL);
        if (!key)
                goto error_free_req;

        memcpy(key, pkey->key, pkey->keylen);
        ptr = key + pkey->keylen;
        ptr = pkey_pack_u32(ptr, pkey->algo);
        ptr = pkey_pack_u32(ptr, pkey->paramlen);
        memcpy(ptr, pkey->params, pkey->paramlen);

        if (pkey->key_is_private)
                ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen);
        else
                ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen);
        if (ret)
                goto error_free_key;

        sg_init_one(&in_sg, in, params->in_len);
        sg_init_one(&out_sg, out, params->out_len);
        akcipher_request_set_crypt(req, &in_sg, &out_sg, params->in_len,
                                   params->out_len);
        crypto_init_wait(&cwait);
        akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
                                      CRYPTO_TFM_REQ_MAY_SLEEP,
                                      crypto_req_done, &cwait);

        /* Perform the encryption calculation. */
        switch (params->op) {
        case kernel_pkey_encrypt:
                ret = crypto_akcipher_encrypt(req);
                break;
        case kernel_pkey_decrypt:
                ret = crypto_akcipher_decrypt(req);
                break;
        case kernel_pkey_sign:
                ret = crypto_akcipher_sign(req);
                break;
        default:
                BUG();
        }

        /* Obtain the final status through the wait helper, then report
         * the produced length on success.
         */
        ret = crypto_wait_req(ret, &cwait);
        if (ret == 0)
                ret = req->dst_len;

error_free_key:
        kfree(key);
error_free_req:
        akcipher_request_free(req);
error_free_tfm:
        crypto_free_akcipher(tfm);
        pr_devel("<==%s() = %d\n", __func__, ret);
        return ret;
}

/*
 * Compute the digest for an SM2 signature and store it in sig->digest.
 *
 * The hash covers the Z digest obtained from sm2_compute_z_digest() (using
 * the default user ID) followed by sig->data.  sig->hash_algo must be
 * "sm3", otherwise -EINVAL is returned.  When CONFIG_CRYPTO_SM2 is not
 * reachable the stub below returns -ENOTSUPP.
 */
#if IS_REACHABLE(CONFIG_CRYPTO_SM2)
static int cert_sig_digest_update(const struct public_key_signature *sig,
                                  struct crypto_akcipher *tfm_pkey)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        size_t desc_size;
        unsigned char dgst[SM3_DIGEST_SIZE];
        int ret;

        BUG_ON(!sig->data);

        /* SM2 signatures always use the SM3 hash algorithm */
        if (!sig->hash_algo || strcmp(sig->hash_algo, "sm3") != 0)
                return -EINVAL;

        ret = sm2_compute_z_digest(tfm_pkey, SM2_DEFAULT_USERID,
                                        SM2_DEFAULT_USERID_LEN, dgst);
        if (ret)
                return ret;

        tfm = crypto_alloc_shash(sig->hash_algo, 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* The descriptor is followed by the algorithm's private state. */
        desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
        desc = kzalloc(desc_size, GFP_KERNEL);
        if (!desc) {
                ret = -ENOMEM;
                goto error_free_tfm;
        }

        desc->tfm = tfm;

        ret = crypto_shash_init(desc);
        if (ret < 0)
                goto error_free_desc;

        /* Feed the Z digest first ... */
        ret = crypto_shash_update(desc, dgst, SM3_DIGEST_SIZE);
        if (ret < 0)
                goto error_free_desc;

        /* ... then the signed data, writing the result to sig->digest. */
        ret = crypto_shash_finup(desc, sig->data, sig->data_size, sig->digest);

error_free_desc:
        kfree(desc);
error_free_tfm:
        crypto_free_shash(tfm);
        return ret;
}
#else
static inline int cert_sig_digest_update(
        const struct public_key_signature *sig,
        struct crypto_akcipher *tfm_pkey)
{
        return -ENOTSUPP;
}
#endif /* ! IS_REACHABLE(CONFIG_CRYPTO_SM2) */

/*
 * Verify a signature using a public key.
 *
 * @pkey: the key to verify with (must not be NULL)
 * @sig:  the signature; sig->s must not be NULL
 *
 * Returns the result of the verify request, i.e. 0 or a negative error
 * code.  A positive result is not expected: it triggers a one-time warning
 * and is converted to -EINVAL.
 */
int public_key_verify_signature(const struct public_key *pkey,
                                const struct public_key_signature *sig)
{
        struct crypto_wait cwait;
        struct crypto_akcipher *tfm;
        struct akcipher_request *req;
        struct scatterlist src_sg;
        char alg_name[CRYPTO_MAX_ALG_NAME];
        char *buf, *ptr;
        size_t buf_len;
        int ret;

        pr_devel("==>%s()\n", __func__);

        BUG_ON(!pkey);
        BUG_ON(!sig);
        BUG_ON(!sig->s);

        ret = software_key_determine_akcipher(sig->encoding,
                                              sig->hash_algo,
                                              pkey, alg_name);
        if (ret < 0)
                return ret;

        tfm = crypto_alloc_akcipher(alg_name, 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        ret = -ENOMEM;
        req = akcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req)
                goto error_free_tfm;

        /* One buffer serves two purposes in turn: first the key blob,
         * later signature + digest.  Size it for the larger of the two.
         */
        buf_len = max_t(size_t, pkey->keylen + sizeof(u32) * 2 + pkey->paramlen,
                        sig->s_size + sig->digest_size);

        buf = kmalloc(buf_len, GFP_KERNEL);
        if (!buf)
                goto error_free_req;

        /* Blob layout: key bytes | u32 algo | u32 paramlen | params */
        memcpy(buf, pkey->key, pkey->keylen);
        ptr = buf + pkey->keylen;
        ptr = pkey_pack_u32(ptr, pkey->algo);
        ptr = pkey_pack_u32(ptr, pkey->paramlen);
        memcpy(ptr, pkey->params, pkey->paramlen);

        if (pkey->key_is_private)
                ret = crypto_akcipher_set_priv_key(tfm, buf, pkey->keylen);
        else
                ret = crypto_akcipher_set_pub_key(tfm, buf, pkey->keylen);
        if (ret)
                goto error_free_buf;

        /* For SM2 with signed data present, the digest is (re)computed
         * into sig->digest before it is copied below.
         */
        if (strcmp(pkey->pkey_algo, "sm2") == 0 && sig->data_size) {
                ret = cert_sig_digest_update(sig, tfm);
                if (ret)
                        goto error_free_buf;
        }

        /* The key has been loaded; reuse the buffer for signature
         * followed by digest.
         */
        memcpy(buf, sig->s, sig->s_size);
        memcpy(buf + sig->s_size, sig->digest, sig->digest_size);

        sg_init_one(&src_sg, buf, sig->s_size + sig->digest_size);
        akcipher_request_set_crypt(req, &src_sg, NULL, sig->s_size,
                                   sig->digest_size);
        crypto_init_wait(&cwait);
        akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
                                      CRYPTO_TFM_REQ_MAY_SLEEP,
                                      crypto_req_done, &cwait);
        ret = crypto_wait_req(crypto_akcipher_verify(req), &cwait);

error_free_buf:
        kfree(buf);
error_free_req:
        akcipher_request_free(req);
error_free_tfm:
        crypto_free_akcipher(tfm);
        pr_devel("<==%s() = %d\n", __func__, ret);
        if (WARN_ON_ONCE(ret > 0))
                ret = -EINVAL;
        return ret;
}
EXPORT_SYMBOL_GPL(public_key_verify_signature);

/*
 * Adapter with a struct key argument: pulls the public_key out of the key
 * payload and hands it to public_key_verify_signature().
 */
static int public_key_verify_signature_2(const struct key *key,
                                         const struct public_key_signature *sig)
{
        return public_key_verify_signature(key->payload.data[asym_crypto],
                                           sig);
}

/*
 * Public key algorithm asymmetric key subtype
 *
 * Operations table wiring the functions in this file into the asymmetric
 * key subtype interface.  name_len is the length of the name without its
 * terminating NUL.
 */
struct asymmetric_key_subtype public_key_subtype = {
        .owner                        = THIS_MODULE,
        .name                        = "public_key",
        .name_len                = sizeof("public_key") - 1,
        .describe                = public_key_describe,
        .destroy                = public_key_destroy,
        .query                        = software_key_query,
        .eds_op                        = software_key_eds_op,
        .verify_signature        = public_key_verify_signature_2,
};
EXPORT_SYMBOL_GPL(public_key_subtype);






































































































    6 
    2 













































































































































































    2 














































































































































































    5 




































    1 






































































    1 


















































































































































































































































































    5 




































































































































































































































































































    1 














    1 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * Sequence counters (seqcount_t)
 *
 * This is the raw counting mechanism, without any writer protection.
 *
 * Write side critical sections must be serialized and non-preemptible.
 *
 * If readers can be invoked from hardirq or softirq contexts,
 * interrupts or bottom halves must also be respectively disabled before
 * entering the write section.
 *
 * This mechanism can't be used if the protected data contains pointers,
 * as the writer can invalidate a pointer that a reader is following.
 *
 * If the write serialization mechanism is one of the common kernel
 * locking primitives, use a sequence counter with associated lock
 * (seqcount_LOCKNAME_t) instead.
 *
 * If it's desired to automatically handle the sequence counter writer
 * serialization and non-preemptibility requirements, use a sequential
 * lock (seqlock_t) instead.
 *
 * See Documentation/locking/seqlock.rst
 */
typedef struct seqcount {
        /*
         * The counter itself. The write-side helpers in this file increment
         * it once on entry and once on exit; __read_seqcount_begin() spins
         * while its low bit is set.
         */
        unsigned sequence;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep state; only present when CONFIG_DEBUG_LOCK_ALLOC is set */
        struct lockdep_map dep_map;
#endif
} seqcount_t;

/*
 * __seqcount_init() - common runtime initializer behind seqcount_init()
 * @s:    Pointer to the seqcount_t instance
 * @name: Name handed to lockdep_init_map() for this counter
 * @key:  Lock class key handed to lockdep_init_map()
 *
 * Sets up the lockdep map and resets the counter to zero.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
                                          struct lock_class_key *key)
{
        /*
         * Make sure we are not reinitializing a held lock:
         */
        lockdep_init_map(&s->dep_map, name, key, 0);
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Designated-initializer fragment for the lockdep map of a statically
 * initialized seqcount_t: only the map's name is set, to the stringified
 * @lockname.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Each expansion declares its own static lock_class_key and passes the
 * stringified @s as the name.
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * Report a read-side access of @s to lockdep: the map is acquired for read
 * and released again immediately, with local interrupts disabled around the
 * pair. The const qualifier of @s is cast away to reach the lockdep map.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        struct lockdep_map *map = &((seqcount_t *)s)->dep_map;
        unsigned long irqflags;

        local_irq_save(irqflags);
        seqcount_acquire_read(map, 0, 0, _RET_IP_);
        seqcount_release(map, _RET_IP_);
        local_irq_restore(irqflags);
}

#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no dep_map member:
 * SEQCOUNT_DEP_MAP_INIT() and seqcount_lockdep_reader_access() expand to
 * nothing, and seqcount_init() passes neither a name nor a key.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * Expands to a braced initializer that sets the counter to zero and appends
 * SEQCOUNT_DEP_MAP_INIT(@name).
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
 * disable preemption. It can lead to higher latencies, and the write side
 * sections will not be able to acquire locks which become sleeping locks
 * (e.g. spinlock_t).
 *
 * To remain preemptible while avoiding a possible livelock caused by the
 * reader preempting the writer, use a different technique: let the reader
 * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
 * case, acquire then release the associated LOCKNAME writer serialization
 * lock. This will allow any possibly-preempted writer to make progress
 * until the end of its writer serialization lock critical section.
 *
 * This lock-unlock technique must be implemented for all of PREEMPT_RT
 * sleeping locks.  See Documentation/locking/locktypes.rst
 */
#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
/* Keep the wrapped declaration or statement that refers to the associated lock. */
#define __SEQ_LOCK(expr)        expr
#else
/* Drop it: seqcount_LOCKNAME_t then carries no lock pointer at all. */
#define __SEQ_LOCK(expr)
#endif

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock, mutex, or ww_mutex.
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated lock
 *
 * Every seqcount_<lockname>_init() wrapper expands to one do { } while (0)
 * statement with no trailing semicolon of its own, so a call followed by
 * ';' is exactly one statement and is safe as the body of an unbraced
 * if/else.
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)
#define seqcount_ww_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, ww_mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate seqcount_LOCKNAME_t and helpers
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type
 * @preemptible:        preemptibility of above locktype
 * @lockmember:                argument for lockdep_assert_held()
 * @lockbase:                associated lock release function (prefix only)
 * @lock_acquire:        associated lock acquisition function (full call)
 *
 * Each instantiation emits the seqcount_LOCKNAME_t type plus four accessors:
 *
 * __seqprop_LOCKNAME_ptr():                the embedded seqcount_t
 * __seqprop_LOCKNAME_sequence():        the current counter value. On
 *                                        PREEMPT_RT, if @preemptible is true
 *                                        and the value is odd, @lock_acquire
 *                                        and LOCKBASE_unlock() are run back
 *                                        to back and the counter is re-read.
 * __seqprop_LOCKNAME_preemptible():        @preemptible, or false on PREEMPT_RT
 * __seqprop_LOCKNAME_assert():                lockdep_assert_held(@lockmember),
 *                                        when __SEQ_LOCK() keeps its argument
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
typedef struct seqcount_##lockname {                                        \
        seqcount_t                seqcount;                                \
        __SEQ_LOCK(locktype        *lock);                                        \
} seqcount_##lockname##_t;                                                \
                                                                        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = READ_ONCE(s->seqcount.sequence);                        \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lock_acquire);                                \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = READ_ONCE(s->seqcount.sequence);                        \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(lockmember));                        \
}

/*
 * __seqprop() for seqcount_t
 *
 * Plain seqcount_t counterparts of the __seqprop_LOCKNAME_*() accessors
 * generated by SEQCOUNT_LOCKNAME().
 */

/* A plain seqcount_t is its own sequence counter. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* One READ_ONCE() load of the counter; there is no lock/unlock step here. */
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        return READ_ONCE(s->sequence);
}

/*
 * Always false, so the write-side macros never call preempt_disable() on
 * behalf of a plain seqcount_t.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/* No associated lock to check: assert that preemption is disabled instead. */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/* "preemptible" value for the lock types below that pass it: true only on PREEMPT_RT */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/*
 * Argument order: lockname, locktype, preemptible, lockmember, lockbase,
 * lock_acquire.
 */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    s->lock,        raw_spin, raw_spin_lock(s->lock))
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, s->lock,        spin,     spin_lock(s->lock))
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, s->lock,        read,     read_lock(s->lock))
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex,    mutex_lock(s->lock))
SEQCOUNT_LOCKNAME(ww_mutex,     struct ww_mutex, true,     &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL))

/*
 * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @name:        Name of the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated LOCKNAME
 *
 * All SEQCNT_<LOCKNAME>_ZERO() variants below expand to the shared
 * SEQCOUNT_LOCKNAME_ZERO() helper. The lock pointer is only stored when
 * __SEQ_LOCK() keeps its argument.
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/* One _Generic association: seqcount_LOCKNAME_t -> __seqprop_LOCKNAME_<prop>() */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))

/*
 * Pick the __seqprop_*_<prop>() accessor that matches the pointed-to type of
 * @s: plain seqcount_t or one of the five seqcount_LOCKNAME_t variants. @s is
 * handed to the accessor through a (void *) cast.
 */
#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop((void *)(s)),                \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop),                        \
        __seqprop_case((s),        ww_mutex,        prop))

/* Type-generic property accessors used by the read and write side macros */
#define seqprop_ptr(s)                        __seqprop(s, ptr)
#define seqprop_sequence(s)                __seqprop(s, sequence)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)
#define seqprop_assert(s)                __seqprop(s, assert)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        /* Re-sample the counter until its low bit is clear. */                \
        for (;;) {                                                        \
                __seq = seqprop_sequence(s);                                \
                if (!(__seq & 1))                                        \
                        break;                                                \
                cpu_relax();                                                \
        }                                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned _seq = __read_seqcount_begin(s);                        \
                                                                        \
        smp_rmb();                                                        \
        _seq;                                                                \
})

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_ptr(s));                        \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        smp_rmb();                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        /*                                                                \
         * If the counter is odd, clear its low bit: the value handed        \
         * back is then one less than the counter that was read, so        \
         * the comparison in read_seqcount_retry() reports a retry.        \
         */                                                                \
        raw_read_seqcount(s) & ~1;                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided after loading the variables that are protected in this critical
 * section and before calling it.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_ptr(s), start)

/*
 * Typed backend of __read_seqcount_retry(): resets the KCSAN "next accesses
 * are atomic" budget to zero, then reports whether the counter differs from
 * @start.
 */
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        unsigned latest;

        kcsan_atomic_next(0);
        latest = READ_ONCE(s->sequence);

        return unlikely(latest != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_ptr(s), start)

/*
 * Typed backend of read_seqcount_retry(): the barrier-less check preceded by
 * an smp_rmb().
 */
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        /* Issued before the counter is read again by the helper below. */
        smp_rmb();
        return do___read_seqcount_retry(s, start);
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        /* Only for types whose preemptible property is true. */        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/*
 * Typed backend of raw_write_seqcount_begin(): opens a KCSAN nestable atomic
 * region, increments the counter, then issues smp_wmb().
 */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        /* Same condition as the preempt_disable() on the begin side. */ \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Typed backend of raw_write_seqcount_end(): the reverse of
 * do_raw_write_seqcount_begin() -- smp_wmb(), counter increment, then the
 * KCSAN nestable atomic region is closed.
 */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * See Documentation/locking/lockdep-design.rst
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        /* Type-specific check, done before preemption is touched. */        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/*
 * Typed backend of write_seqcount_begin_nested(): lockdep acquire with the
 * given @subclass, followed by the raw write-section begin.
 */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * write_seqcount_begin opens a write side critical section of the given
 * seqcount_t.
 *
 * Context: seqcount_t write side critical sections must be serialized and
 * non-preemptible. If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        /* Type-specific check, done before preemption is touched. */        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Typed backend of write_seqcount_begin(): the nested variant with subclass 0. */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * The write section must've been opened with write_seqcount_begin().
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        /* Same condition as the preempt_disable() on the begin side. */ \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Typed backend of write_seqcount_end(): lockdep release, followed by the
 * raw write-section end.
 */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither the writes before nor after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *
 *                do {
 *                        int s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *      }
 *
 *      void write(void)
 *      {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *      }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Typed backend of raw_write_seqcount_barrier(): two counter increments
 * around a single smp_wmb(), inside one KCSAN nestable atomic region. The
 * counter ends up advanced by two.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/*
 * Typed backend of write_seqcount_invalidate(): after an smp_wmb(), advance
 * the counter by two in a single step, so its low bit is left as it was.
 */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        smp_wmb();
        kcsan_nestable_atomic_begin();
        s->sequence+=2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 */
typedef struct {
        /* The wrapped counter; readers use its lowest bit to pick a data copy. */
        seqcount_t seqcount;
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 *
 * Initializes the embedded seqcount_t through SEQCNT_ZERO().
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Initializes the embedded seqcount_t through seqcount_init().
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with read_seqcount_latch_retry().
 */
static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        unsigned seq;

        /*
         * This load pairs with the first smp_wmb() in
         * raw_write_seqcount_latch(). The data access that follows depends
         * on the loaded value, so a full smp_rmb() is not needed.
         */
        seq = READ_ONCE(s->seqcount.sequence);

        return seq;
}

/**
 * read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        return read_seqcount_retry(&s->seqcount, start);
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state the
 * latch allows the same for non-atomic updates. The trade-off is doubling the
 * cost of storage; we have to maintain two copies of the entire data
 * structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                smp_wmb();        // Ensure that the last data[1] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[0], ...);
 *
 *                smp_wmb();        // Ensure that the data[0] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[1], ...);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = raw_read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        /*
         * A single increment per call, which flips the low bit of the
         * counter. No lock is taken here.
         */
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/*
 * Sequential locks (seqlock_t)
 *
 * Sequence counters with an embedded spinlock for writer serialization
 * and non-preemptibility.
 *
 * For more info, see:
 *    - Comments on top of seqcount_t
 *    - Documentation/locking/seqlock.rst
 */
typedef struct {
        /*
         * Make sure that readers don't starve writers on PREEMPT_RT: use
         * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
         */
        seqcount_spinlock_t seqcount;
        /*
         * The embedded spinlock; seqlock_init() and __SEQLOCK_UNLOCKED()
         * associate it with @seqcount.
         */
        spinlock_t lock;
} seqlock_t;

/*
 * Static initializer for a seqlock_t variable named @lockname. The counter
 * initializer is given the address of the variable's own embedded lock, so
 * @lockname must be the name of the object being initialized (this is how
 * DEFINE_SEQLOCK() uses it).
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 *
 * Initializes the embedded spinlock first, then initializes the sequence
 * counter with a pointer to that spinlock. Note that @sl is expanded more
 * than once by this macro.
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * Expands to a definition of @sl that is initialized with
 * __SEQLOCK_UNLOCKED(). The expansion carries no storage class specifier
 * and no trailing semicolon.
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        /* Count obtained from the embedded counter; goes to read_seqretry(). */
        unsigned start = read_seqcount_begin(&sl->seqcount);

        return start;
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * Return: true if a read section retry is required, else false
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        /* Non-zero when the read section opened at @start must be retried. */
        unsigned retry = read_seqcount_retry(&sl->seqcount, start);

        return retry;
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the embedded lock first, then open the write section. */
        spin_lock(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t.
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Reverse order of write_seqlock(): close the section, then unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* Lock with the _bh spinlock variant, then open the write section. */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* Close the write section before dropping the lock (_bh variant). */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* Lock with the _irq spinlock variant, then open the write section. */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* Close the write section before dropping the lock (_irq variant). */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the write_seqlock_irqsave() macro: takes the embedded lock
 * with spin_lock_irqsave(), opens the write section, and returns the saved
 * flags by value so that the macro can assign them to the caller's variable.
 */
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
        return flags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore().
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 *
 * This is a macro so that @flags can be named directly (no pointer is
 * taken): it is assigned the value returned by __write_seqlock_irqsave().
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Close the write section, then unlock and hand @flags back. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken; the counter is not touched. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        /* Releases the embedded spinlock taken by read_seqlock_excl(). */
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken, using the _bh lock variant. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        /* Releases the embedded spinlock using the _bh unlock variant. */
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken, using the _irq lock variant. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        /* Releases the embedded spinlock using the _irq unlock variant. */
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the read_seqlock_excl_irqsave() macro: takes the embedded
 * lock with spin_lock_irqsave() and returns the saved flags by value so
 * that the macro can assign them to the caller's variable.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore().
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * This is a macro so that @flags can be named directly (no pointer is
 * taken): it is assigned the value returned by
 * __read_seqlock_excl_irqsave().
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Releases the embedded spinlock and hands @flags back with it. */
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq : Marker and return parameter. If the passed value is even, the
 * reader will become a *lockless* seqlock_t reader as in read_seqbegin().
 * If the passed value is odd, the reader will become a *locking* reader
 * as in read_seqlock_excl().  In the first call to this function, the
 * caller *must* initialize and pass an even value to @seq; this way, a
 * lockless read can be optimistically tried first.
 *
 * read_seqbegin_or_lock is an API designed to optimistically try a normal
 * lockless seqlock_t read section first.  If an odd counter is found, the
 * lockless read trial has failed, and the next read iteration transforms
 * itself into a full seqlock_t locking reader.
 *
 * This is typically used to avoid seqlock_t lockless readers starvation
 * (too many retry loops) in the case of a sharp spike in write side
 * activity.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: the encountered sequence counter value, through the @seq
 * parameter, which is overloaded as a return parameter. This returned
 * value must be checked with need_seqretry(). If the read section needs to
 * be retried, this returned value must also be passed as the @seq
 * parameter of the next read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        if (*seq & 1) {
                /* Odd marker: become a locking reader. */
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: lockless reader; report the count through @seq. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* An odd @seq marks a locking reader, which is never retried. */
        if (seq & 1)
                return 0;

        /* Lockless reader: retry exactly when read_seqretry() says so. */
        return read_seqretry(lock, seq) != 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * done_seqretry finishes the seqlock_t read side critical section started
 * with read_seqbegin_or_lock() and validated by need_seqretry().
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        /* An even @seq marks a lockless reader: nothing to release. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
 * the seqlock_t write section, *or other read sections*, can be invoked
 * from hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore().
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        unsigned long flags = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader; the irqsave state is returned. */
                read_seqlock_excl_irqsave(lock, flags);
                return flags;
        }

        /* Even marker: lockless reader; flags keeps its initial value of 0. */
        *seq = read_seqbegin(lock);
        return flags;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * This is the _irqrestore variant of done_seqretry(). The read section
 * must've been opened with read_seqbegin_or_lock_irqsave(), and validated
 * by need_seqretry().
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        /* An even @seq marks a lockless reader: nothing to release. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 










































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

/*
 * Table of callbacks; every member is a pointer to a function that takes
 * no arguments.
 * NOTE(review): presumably filled in by virtualization code so that perf
 * can query guest state; no user of this table is visible here -- confirm.
 */
struct perf_guest_info_callbacks {
        int                                (*is_in_guest)(void);
        int                                (*is_user_mode)(void);
        unsigned long                        (*get_guest_ip)(void);
        void                                (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <asm/local.h>

/*
 * Variable-length call chain record: a 64-bit count followed by a flexible
 * array of 64-bit entries.
 */
struct perf_callchain_entry {
        /* NOTE(review): presumably the number of used ip[] slots -- confirm */
        __u64                                nr;
        __u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Context that pairs a struct perf_callchain_entry pointer with counters
 * and limits.
 * NOTE(review): the code that maintains these members is not visible
 * here; going by the names, this is the bookkeeping used while a call
 * chain is being recorded (stack depth limit, entries stored so far,
 * context markers) -- confirm.
 */
struct perf_callchain_entry_ctx {
        struct perf_callchain_entry *entry;
        u32                            max_stack;
        u32                            nr;
        short                            contexts;
        bool                            contexts_maxed;
};

/*
 * Copy callback type: receives a destination and a source pointer plus an
 * offset and a length, and returns an unsigned long. struct perf_raw_frag
 * stores one such callback in its "copy" member.
 * NOTE(review): the meaning of the return value is not visible here --
 * confirm against the implementations.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw data: a link, a copy callback, a data pointer and a
 * size. The structure is declared __packed.
 */
struct perf_raw_frag {
        /* The link to another fragment shares its storage with "pad". */
        union {
                struct perf_raw_frag        *next;
                unsigned long                pad;
        };
        perf_copy_f                        copy;
        void                                *data;
        /* NOTE(review): presumably the byte length of *data -- confirm */
        u32                                size;
} __packed;

/*
 * Raw record: an embedded struct perf_raw_frag (not a pointer) followed by
 * a size.
 * NOTE(review): "size" is presumably the total size of the record across
 * all fragments reachable from "frag" -- confirm.
 */
struct perf_raw_record {
        struct perf_raw_frag                frag;
        u32                                size;
};

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample
 * branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
        /* number of taken branches stored in entries[] */
        __u64                                nr;
        /* low level index of the most recent raw record; -1ULL if unknown */
        __u64                                hw_idx;
        /* flexible array; entries[0] is the most recent branch */
        struct perf_branch_entry        entries[];
};

struct task_struct;

/*
 * extra PMU register associated with an event
 *
 * struct hw_perf_event embeds two of these in its hardware-event state:
 * "extra_reg" and "branch_reg".
 */
struct hw_perf_event_extra {
        u64                config;        /* register value */
        unsigned int        reg;        /* register address or index */
        int                alloc;        /* extra register already allocated */
        int                idx;        /* index in shared_regs->regs[] */
};

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * Every member is compiled in only under CONFIG_PERF_EVENTS; without it
 * the structure has no members. The leading anonymous union overlays the
 * state of the different event flavours (hardware, software, tracepoint,
 * amd_power, breakpoint, amd_iommu); the members that follow the union
 * are present for every flavour.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        union {
                struct { /* hardware */
                        u64                config;
                        u64                last_tag;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct list_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED        0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE        0x02 /* event->count up-to-date */
#define PERF_HES_ARCH                0x04

        /* Holds the PERF_HES_* flags defined just above. */
        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        /* The sampling state and the topdown state share storage. */
        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif
};

struct perf_event;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 *
 * The two transaction kinds are encoded as distinct bits (0x1 and 0x2).
 */
#define PERF_PMU_TXN_ADD  0x1                /* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ 0x2                /* txn to read event group from PMU */

/*
 * pmu::capabilities flags
 *
 * Each flag is a distinct single bit, so several of them can be OR-ed
 * together in the "capabilities" member of struct pmu.
 */
#define PERF_PMU_CAP_NO_INTERRUPT                0x01
#define PERF_PMU_CAP_NO_NMI                        0x02
#define PERF_PMU_CAP_AUX_NO_SG                        0x04
#define PERF_PMU_CAP_EXTENDED_REGS                0x08
#define PERF_PMU_CAP_EXCLUSIVE                        0x10
#define PERF_PMU_CAP_ITRACE                        0x20
#define PERF_PMU_CAP_HETEROGENEOUS_CPUS                0x40
#define PERF_PMU_CAP_NO_EXCLUDE                        0x80
#define PERF_PMU_CAP_AUX_OUTPUT                        0x100

struct perf_output_handle;

/**
 * struct pmu - generic performance monitoring unit
 *
 * Data members come first, followed by the callbacks; callbacks annotated
 * "optional" in the comments below need not be provided.
 */
struct pmu {
        struct list_head                entry;

        struct module                        *module;
        struct device                        *dev;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags (PERF_PMU_CAP_*)
         */
        int                                capabilities;

        int __percpu                        *pmu_disable_count;
        struct perf_cpu_context __percpu *pmu_cpu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */
#define PERF_EF_START        0x01                /* start the counter when adding    */
#define PERF_EF_RELOAD        0x02                /* reload the counter when starting */
#define PERF_EF_UPDATE        0x04                /* update the counter when stopping */

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group.
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * @txn_flags takes the PERF_PMU_TXN_* values.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to: event->hw.idx + 1.
         */
        int (*event_idx)                (struct perf_event *event); /* optional */

        /*
         * Context-switch callback; @sched_in presumably tells sched-in from
         * sched-out -- confirm against the callers.
         */
        void (*sched_task)                (struct perf_event_context *ctx,
                                        bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
         * can be synchronized using this function. See Intel LBR callstack support
         * implementation and Perf core context switch handling callbacks for usage
         * examples.
         */
        void (*swap_task_ctx)                (struct perf_event_context *prev,
                                         struct perf_event_context *next);
                                        /* optional */

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Filter events for PMU-specific reasons.
         */
        int (*filter_match)                (struct perf_event *event); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/* Action attached to an address range filter, see perf_addr_filter::action. */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP        = 0,
        PERF_ADDR_FILTER_ACTION_START        = 1,
        PERF_ADDR_FILTER_ACTION_FILTER        = 2,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        one of enum perf_addr_filter_action_t (filter/start/stop)
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
        struct list_head        entry;
        struct path                path;
        unsigned long                offset;
        unsigned long                size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event
 * @lock:        spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use its parent's @list (and therefore @lock), which is
 * why the two are bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
        struct list_head        list;
        raw_spinlock_t                lock;
        unsigned int                nr_file_filters;
};

/*
 * An address range given as a start address plus a size.  struct perf_event
 * keeps an array of these in addr_filter_ranges for its file-based filters.
 */
struct perf_addr_filter_range {
        unsigned long                start;
        unsigned long                size;
};

/**
 * enum perf_event_state - the states of an event:
 *
 * The values are ordered: every state below PERF_EVENT_STATE_INACTIVE is
 * negative and PERF_EVENT_STATE_ACTIVE is the only positive one.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -4,
        PERF_EVENT_STATE_EXIT                = -3,
        PERF_EVENT_STATE_ERROR                = -2,
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

struct file;
struct perf_sample_data;

/*
 * Signature of an event overflow handler, as stored in
 * perf_event::overflow_handler.  It receives the event, the sample data and
 * a register set.
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities. For event_caps and groups caps.
 *
 * PERF_EV_CAP_SOFTWARE:
 *        Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG:
 *        A CPU event (or cgroup event) that can be read from any CPU in the
 *        package where it is active.
 * PERF_EV_CAP_SIBLING:
 *        An event with this flag must be a group sibling and cannot be a
 *        group leader. If an event with this flag is detached from the group
 *        it is scheduled out and moved into an unrecoverable ERROR state.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)

/* The software event hash list has 2^SWEVENT_HLIST_BITS (256) buckets. */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

/*
 * Array of SWEVENT_HLIST_SIZE hlist heads.
 * NOTE(review): rcu_head is presumably used to free the table after an RCU
 * grace period -- confirm against the code that releases it.
 */
struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * PERF_ATTACH_* bit flags.
 * NOTE(review): presumably the bits kept in perf_event::attach_state --
 * confirm against the users.
 */
#define PERF_ATTACH_CONTEXT        (1 << 0)
#define PERF_ATTACH_GROUP          (1 << 1)
#define PERF_ATTACH_TASK           (1 << 2)
#define PERF_ATTACH_TASK_DATA      (1 << 3)
#define PERF_ATTACH_ITRACE         (1 << 4)
#define PERF_ATTACH_SCHED_CB       (1 << 5)

struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 * NOTE(review): @lock presumably serializes access to @list -- confirm
 * against the users.
 */
struct pmu_event_list {
        raw_spinlock_t                lock;
        struct list_head        list;
};

/*
 * Iterate @sibling over the sibling_list of @event.  The loop body only runs
 * when @event is its own group leader; for any other event nothing is
 * visited.
 *
 * The leader test is written as "if (!leader) {} else for (...)" so that the
 * macro already owns its 'else' branch: an 'else' written after a use of
 * this macro binds to the caller's enclosing 'if' instead of silently
 * attaching to the hidden leader test.  'break' and 'continue' in the loop
 * body keep working as usual.
 */
#define for_each_sibling_event(sibling, event)                        \
        if ((event)->group_leader != (event)) {                        \
        } else                                                        \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /* Not serialized. Only written during event initialization. */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        atomic_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        int                                pending_wakeup;
        int                                pending_kill;
        int                                pending_disable;
        struct irq_work                        pending;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
#ifdef CONFIG_BPF_SYSCALL
        /* second handler slot; consulted by uses_default_overflow_handler() */
        perf_overflow_handler_t                orig_overflow_handler;
        struct bpf_prog                        *prog;
#endif

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;
#endif /* CONFIG_PERF_EVENTS */
};


/*
 * An rb-tree of events; struct perf_event_context embeds one for its pinned
 * groups and one for its flexible groups, and events link in through
 * perf_event::group_node.
 * NOTE(review): @index is presumably the source of perf_event::group_index
 * values -- confirm against the insertion code.
 */
struct perf_event_groups {
        struct rb_root        tree;
        u64                index;
};

/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
        struct pmu                        *pmu;
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        struct list_head                active_ctx_list;
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        struct list_head                event_list;

        struct list_head                pinned_active;
        struct list_head                flexible_active;

        int                                nr_events;
        int                                nr_active;
        int                                is_active;
        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;
        /*
         * Set when nr_events != nr_active, except tolerant to events not
         * necessary to be active due to scheduling constraints, such as cgroups.
         */
        int                                rotate_necessary;
        refcount_t                        refcount;
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* cgroup events */
#endif
        void                                *task_ctx_data; /* pmu specific data */
        struct rcu_head                        rcu_head;
};

/*
 * Number of contexts in which an event can trigger:
 *        task, softirq, hardirq and NMI.
 */
#define PERF_NR_CONTEXTS        4

/**
 * struct perf_cpu_context - per cpu event context structure
 *
 * Embeds a struct perf_event_context (@ctx) and additionally points at a
 * task context (@task_ctx).
 */
struct perf_cpu_context {
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                active_oncpu;
        int                                exclusive;

        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
        struct list_head                cgrp_cpuctx_entry;
#endif

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                online;
        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * Handle passed to the output helpers declared in this header, such as
 * perf_aux_output_begin()/perf_aux_output_end() and perf_output_sample().
 * It names the event and the buffer being written.
 *
 * @addr and @head share storage (anonymous union); only one of them is
 * meaningful at a time.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        u64                                aux_flags;
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundles pointers to a register set, the sample data and the event.
 * NOTE(review): presumably the kernel-side context handed to BPF programs
 * attached to perf events -- confirm against the users.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu, dynamically allocated data structure (see the
 * __percpu pointer in struct perf_cgroup).
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's cgroup state: the embedded @css is what perf_cgroup_from_task()
 * maps back to the containing perf_cgroup via container_of(); @info points
 * at the per-cpu struct perf_cgroup_info.
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Map @task to the perf_cgroup it belongs to.
 *
 * Must ensure cgroup is pinned (css_get) before calling
 * this function. In other words, we cannot call this function
 * if there is no cgroup event for the current CPU context.
 *
 * When @ctx is given, the check handed to task_css_check() is that
 * ctx->lock is held; with a NULL @ctx the check is simply "true".
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *state;

        /* keep the lockdep expression inside the task_css_check() argument */
        state = task_css_check(task, perf_event_cgrp_id,
                               ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(state, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern int perf_num_counters(void);
extern const char *perf_pmu_name(void);
extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                int cpu,
                                struct task_struct *task,
                                perf_overflow_handler_t callback,
                                void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);


/*
 * Data collected for one sample.  The first group of members is set by
 * perf_sample_data_init(); the rest is set or used on demand by
 * perf_prepare_sample() and perf_output_sample().
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init(), grouped so as to
         * minimize the cachelines touched.
         */
        u64                                addr;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        u64                                period;
        u64                                weight;
        u64                                txn;
        union  perf_mem_data_src        data_src;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                type;
        u64                                ip;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        u64                                stream_id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;
        struct perf_callchain_entry        *callchain;
        u64                                aux_size;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                phys_addr;
        u64                                cgroup;
} ____cacheline_aligned;

/*
 * Default value for data source: the OP, LVL, SNOOP, LOCK and TLB fields
 * are all set to their NA encoding.  perf_sample_data_init() stores this
 * in perf_sample_data::data_src.val.
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA))

/*
 * Initialize the leading members of @data for a new sample: @addr and
 * @period come from the caller, everything else set here gets its neutral
 * default.  The remaining struct members are initialized in
 * perf_prepare_sample().
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        /* caller-supplied values */
        data->addr = addr;
        data->period = period;

        /* no raw record and no branch stack attached */
        data->raw = NULL;
        data->br_stack = NULL;

        /* neutral defaults */
        data->weight = 0;
        data->txn = 0;
        data->data_src.val = PERF_MEM_NA;
}

extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * True when @overflow_handler is one of the two stock output handlers:
 * perf_event_output_forward() (hinted as the likely case) or
 * perf_event_output_backward().
 */
static inline bool
__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
{
        return likely(overflow_handler == perf_event_output_forward) ||
               unlikely(overflow_handler == perf_event_output_backward);
}

/*
 * True when @event's current overflow_handler is one of the default output
 * handlers, see __is_default_overflow_handler().
 */
#define is_default_overflow_handler(event) \
        __is_default_overflow_handler((event)->overflow_handler)

#ifdef CONFIG_BPF_SYSCALL
/*
 * With CONFIG_BPF_SYSCALL an event also carries orig_overflow_handler; the
 * event counts as using a default handler when either its current handler
 * or that original handler is one of the defaults.
 */
static inline bool uses_default_overflow_handler(struct perf_event *event)
{
        return likely(is_default_overflow_handler(event)) ||
               __is_default_overflow_handler(event->orig_overflow_handler);
}
#else
/* Without CONFIG_BPF_SYSCALL only the current handler is checked. */
#define uses_default_overflow_handler(event) \
        is_default_overflow_handler(event)
#endif

/*
 * Helpers for the id-sample part of a record and for reporting lost samples.
 * Prototypes only; the implementations are not visible in this section.
 */
extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

/* True when @event has a non-zero attr.sample_period. */
static inline bool is_sampling_event(struct perf_event *event)
{
        return !!event->attr.sample_period;
}

/*
 * Software vs. hardware event test: the result is the PERF_EV_CAP_SOFTWARE
 * bit of event_caps, i.e. non-zero for a software event and 0 otherwise.
 */
static inline int is_software_event(struct perf_event *event)
{
        int sw_cap = event->event_caps & PERF_EV_CAP_SOFTWARE;

        return sw_cap;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->ctx->pmu->task_ctx_nr == perf_sw_context;
}

/*
 * Non-zero when @pmu advertises PERF_PMU_CAP_EXCLUSIVE; the value returned
 * is the masked capability bit itself, not a normalised 0/1.
 */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        int exclusive_cap = pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;

        return exclusive_cap;
}

/*
 * One static key per software event id (array sized by PERF_COUNT_SW_MAX).
 * The inline wrappers in this header test the key for an event id before
 * calling the out-of-line __perf_sw_event() / ___perf_sw_event().
 */
extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

#ifndef perf_arch_fetch_caller_regs
/*
 * Fallback used when no perf_arch_fetch_caller_regs macro has been defined:
 * does nothing and leaves @regs untouched.
 */
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip)
{
}
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 *
 * The work is delegated to perf_arch_fetch_caller_regs(), with CALLER_ADDR0
 * passed as the instruction pointer argument.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Report software event @event_id. The per-event static key is tested first
 * so that nothing else happens while the event is disabled; otherwise the
 * call is forwarded to __perf_sw_event() with the caller-supplied @regs.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        __perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Scheduler flavour of perf_sw_event(). It hard assumes that no recursion
 * can happen, which holds because we never schedule inside other swevents:
 * those run with preemption disabled. It therefore always uses slot 0 of the
 * per-CPU __perf_regs array and calls ___perf_sw_event() directly.
 */
static __always_inline void
perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        struct pt_regs *caller_regs;

        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        caller_regs = this_cpu_ptr(&__perf_regs[0]);
        perf_fetch_caller_regs(caller_regs);
        ___perf_sw_event(event_id, nr, caller_regs, addr);
}

extern struct static_key_false perf_sched_events;

/* True while the PERF_COUNT_SW_CPU_MIGRATIONS software event key is enabled. */
static __always_inline bool
perf_sw_migrate_enabled(void)
{
        return static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]) ?
               true : false;
}

/*
 * Flag @task as migrated (task->sched_migrated = 1) when the CPU-migrations
 * software event is enabled; perf_event_task_sched_in() consumes the flag.
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (!perf_sw_migrate_enabled())
                return;

        task->sched_migrated = 1;
}

/*
 * Schedule-in hook. Runs __perf_event_task_sched_in() when perf_sched_events
 * is enabled, then emits one PERF_COUNT_SW_CPU_MIGRATIONS event and clears
 * task->sched_migrated if the task was flagged as migrated.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        struct pt_regs *caller_regs;

        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (!perf_sw_migrate_enabled() || !task->sched_migrated)
                return;

        caller_regs = this_cpu_ptr(&__perf_regs[0]);
        perf_fetch_caller_regs(caller_regs);
        ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, caller_regs, 0);
        task->sched_migrated = 0;
}

/*
 * Schedule-out hook. The context-switch software event is reported first;
 * __perf_event_task_sched_out() only runs while perf_sched_events is enabled.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

        if (!static_branch_unlikely(&perf_sched_events))
                return;

        __perf_event_task_sched_out(prev, next);
}

/*
 * Notification hooks for mmap, kernel-symbol and BPF program events.
 * Prototypes only; empty stubs are provided in the !CONFIG_PERF_EVENTS
 * branch further down.
 */
extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
/*
 * Fetch the currently registered guest-info callbacks through
 * rcu_dereference(). The result may be NULL (the comment below refers to a
 * !NULL check by callers).
 */
static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
{
        /*
         * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading
         * the callbacks between a !NULL check and dereferences, to ensure
         * pending stores/changes to the callback pointers are visible before a
         * non-NULL perf_guest_cbs is visible to readers, and to prevent a
         * module from unloading callbacks while readers are active.
         */
        return rcu_dereference(perf_guest_cbs);
}
/* Register / unregister the guest-info callbacks; prototypes only. */
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);

/*
 * Task life-cycle and text-poke notification hooks; prototypes only. Empty
 * stubs exist in the !CONFIG_PERF_EVENTS branch further down.
 */
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/*
 * Callchains
 *
 * Per-CPU callchain entry, the user/kernel stack walkers, callchain buffer
 * management and the two sysctl limits consulted by perf_callchain_store()
 * and perf_callchain_store_context() below (through ctx->max_stack and
 * sysctl_perf_event_max_contexts_per_stack respectively).
 */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append a context marker @ip to the callchain.
 *
 * Returns 0 on success. Once ctx->contexts has reached
 * sysctl_perf_event_max_contexts_per_stack, nothing is stored,
 * ctx->contexts_maxed is set and -1 is returned so that the caller stops
 * walking the stack.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (!(ctx->contexts < sysctl_perf_event_max_contexts_per_stack)) {
                ctx->contexts_maxed = true;
                return -1; /* no more room, stop walking the stack */
        }

        chain = ctx->entry;
        chain->ip[chain->nr++] = ip;
        ++ctx->contexts;
        return 0;
}

/*
 * Append instruction pointer @ip to the callchain.
 *
 * Returns 0 on success, or -1 without storing anything when ctx->nr has
 * reached ctx->max_stack or the context-marker limit was already hit
 * (ctx->contexts_maxed); the caller should then stop walking the stack.
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (!(ctx->nr < ctx->max_stack) || ctx->contexts_maxed)
                return -1; /* no more room, stop walking the stack */

        chain = ctx->entry;
        chain->ip[chain->nr++] = ip;
        ++ctx->nr;
        return 0;
}

/*
 * perf sysctl knobs. sysctl_perf_event_paranoid is the level compared by
 * perf_is_paranoid() and the perf_allow_*() helpers in this header.
 */
extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
extern int sysctl_perf_cpu_time_max_percent;

extern void perf_sample_event_took(u64 sample_len_ns);

/* ctl_table handlers; implementations are outside this section. */
int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/*
 * Finer grained perf_event_open(2) access control. These are the type values
 * that perf_allow_cpu(), perf_allow_kernel() and perf_allow_tracepoint()
 * pass to security_perf_event_open().
 */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/* Returns 1 when sysctl_perf_event_paranoid is greater than -1, else 0. */
static inline int perf_is_paranoid(void)
{
        const int level = sysctl_perf_event_paranoid;

        return level > -1;
}

/*
 * Access check using PERF_SECURITY_KERNEL.
 *
 * Returns -EACCES when the paranoid level is above 1 and the caller is not
 * perfmon_capable(); otherwise returns whatever security_perf_event_open()
 * decides for PERF_SECURITY_KERNEL.
 */
static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
        if (sysctl_perf_event_paranoid > 1) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}

/*
 * Access check using PERF_SECURITY_CPU.
 *
 * Returns -EACCES when the paranoid level is above 0 and the caller is not
 * perfmon_capable(); otherwise returns whatever security_perf_event_open()
 * decides for PERF_SECURITY_CPU.
 */
static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
        if (sysctl_perf_event_paranoid > 0) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        return security_perf_event_open(attr, PERF_SECURITY_CPU);
}

/*
 * Access check using PERF_SECURITY_TRACEPOINT.
 *
 * Returns -EPERM (not -EACCES as the CPU/kernel variants do) when the
 * paranoid level is above -1 and the caller is not perfmon_capable();
 * otherwise returns whatever security_perf_event_open() decides for
 * PERF_SECURITY_TRACEPOINT.
 */
static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
        if (sysctl_perf_event_paranoid > -1) {
                if (!perfmon_capable())
                        return -EPERM;
        }

        return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}

/*
 * Subsystem init plus the tracepoint and breakpoint event entry points.
 * Prototypes only; perf_event_init() and perf_bp_event() have empty stubs in
 * the !CONFIG_PERF_EVENTS branch further down.
 */
extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

/*
 * Generic fallbacks, used only when perf_misc_flags has not already been
 * defined: classify the sample as user or kernel from user_mode(regs) and
 * take the sample ip from instruction_pointer(regs).
 */
#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Fallback: hand @regs through unchanged. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

/* True when PERF_SAMPLE_BRANCH_STACK is set in the event's attr.sample_type. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) != 0;
}

/* True when the event's attr.branch_sample_type is non-zero. */
static inline bool needs_branch_stack(struct perf_event *event)
{
        return !!event->attr.branch_sample_type;
}

/* True when the event's PMU has a non-zero setup_aux member. */
static inline bool has_aux(struct perf_event *event)
{
        return !!event->pmu->setup_aux;
}

/* True when the event was opened with attr.write_backward set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when the event's PMU reports a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Return the address-filter list head that applies to @event. An inherited
 * event (one with a parent) uses its parent's filters; any other event uses
 * its own.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->addr_filters;
}

/*
 * Remaining CONFIG_PERF_EVENTS prototypes: address-filter sync, ring-buffer
 * output (begin / copy / skip / end), software-event recursion context, and
 * event enable / disable / period / pause control. Implementations are not
 * visible in this section; several of these have stubs in the
 * !CONFIG_PERF_EVENTS branch that follows.
 */
extern void perf_event_addr_filters_sync(struct perf_event *event);

extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
/*
 * !CONFIG_PERF_EVENTS stubs (part 1): AUX buffer, scheduler hooks, task
 * life-cycle and event lookup / control. void hooks are empty; the others
 * return a fixed value (NULL, 0, -EINVAL or ERR_PTR(-EINVAL)) as written on
 * each line.
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
static inline int perf_event_init_task(struct task_struct *child)        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/*
 * !CONFIG_PERF_EVENTS stubs (part 2): software / breakpoint events, guest
 * callbacks, notification hooks and event control. void hooks are empty;
 * guest callback (un)registration and perf_event_release_kernel() return 0,
 * the recursion-context getter and __perf_event_disable() return -1,
 * perf_event_period() returns -EINVAL, and the u64 helpers return 0.
 */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)                        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

static inline int perf_register_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)                                { return 0; }
static inline int perf_unregister_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)                                { return 0; }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

/* Function type taking a name buffer, its length and an opaque data pointer. */
typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
static inline void perf_event_init(void)                                { }
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
        return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
        return 0;
}
#endif

/*
 * perf_restore_debug_store() is only a real function when both
 * CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are set; otherwise it is an
 * empty inline stub.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/* True when @frag's pad field is smaller than sizeof(u64). */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        return sizeof(u64) > frag->pad;
}

/*
 * Copy the object @x (by address, sizeof(x) bytes) through perf_output_copy().
 * @x must be an lvalue because its address is taken.
 */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * Device attribute describing a PMU event.
 *
 * @attr:      embedded device attribute.
 * @id:        event id; set from _id by PMU_EVENT_ATTR, 0 in the string variant.
 * @event_str: event string; set by PMU_EVENT_ATTR_STRING only.
 */
struct perf_pmu_events_attr {
        struct device_attribute attr;
        u64 id;
        const char *event_str;
};

/*
 * Variant of perf_pmu_events_attr carrying two event strings.
 * NOTE(review): presumably event_str_ht / event_str_noht are selected by
 * whether hyper-threading is active; confirm against the users of this type.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str_ht;
        const char                                *event_str_noht;
};

/* Show callback installed by PMU_EVENT_ATTR_STRING; defined elsewhere. */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named @_var: a read-only (0444)
 * attribute called @_name, shown by @_show, with .id set to @_id.
 * The expansion already ends in "};".
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named @_var: a read-only (0444)
 * attribute called @_name, shown by perf_event_sysfs_show(), with .id 0 and
 * .event_str set to @_str. The expansion already ends in "};".
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Define <_name>_show(), which prints the string literal @_format followed by
 * a newline into @page, plus a read-only device attribute
 * format_attr_<_name> bound to it. BUILD_BUG_ON rejects a @_format whose
 * sizeof is PAGE_SIZE or more. @_format must be a string literal because it
 * is concatenated with "\n". The user supplies the trailing semicolon.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/*
 * Performance counter hotplug functions.
 * Without CONFIG_PERF_EVENTS both names expand to NULL instead of being
 * declared as functions.
 */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu        NULL
#define perf_event_exit_cpu        NULL
#endif

/*
 * Declared __weak. Takes the event, its perf_event_mmap_page and a timestamp
 * @now; the definition is not visible in this section.
 */
extern void __weak arch_perf_update_userpage(struct perf_event *event,
                                             struct perf_event_mmap_page *userpg,
                                             u64 now);

#endif /* _LINUX_PERF_EVENT_H */
























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_alloc - event class for allocations without a node argument.
 *
 * Records the call site, the returned pointer, the requested and allocated
 * byte counts and the gfp flags. The call site is printed as a symbol (%pS)
 * and the flags through show_gfp_flags().
 */
DECLARE_EVENT_CLASS(kmem_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        gfp_t,                gfp_flags        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = gfp_flags;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags))
);

/* kmalloc event: an instance of the kmem_alloc class. */
DEFINE_EVENT(kmem_alloc, kmalloc,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
);

/* kmem_cache_alloc event: an instance of the kmem_alloc class. */
DEFINE_EVENT(kmem_alloc, kmem_cache_alloc,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
);

/*
 * kmem_alloc_node - event class for node-aware allocations.
 *
 * Same fields as the kmem_alloc class plus an int node, which is printed
 * last as "node=%d".
 */
DECLARE_EVENT_CLASS(kmem_alloc_node,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        gfp_t,                gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node)
);

/* kmalloc_node event: an instance of the kmem_alloc_node class. */
DEFINE_EVENT(kmem_alloc_node, kmalloc_node,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc,
                 gfp_t gfp_flags, int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
);

/* kmem_cache_alloc_node event: an instance of the kmem_alloc_node class. */
DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc,
                 gfp_t gfp_flags, int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
);

/*
 * kmem_free - event class for frees.
 *
 * Records only the call site (printed as a symbol via %pS) and the pointer
 * being freed.
 */
DECLARE_EVENT_CLASS(kmem_free,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/* kfree event: an instance of the kmem_free class. */
DEFINE_EVENT(kmem_free, kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr)
);

/* kmem_cache_free event: an instance of the kmem_free class. */
DEFINE_EVENT(kmem_free, kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr)
);

/*
 * mm_page_free - records the pfn of @page and the order being freed.
 *
 * Only the pfn is stored; the page pointer printed by TP_printk is
 * recomputed from it with pfn_to_page(). @page is dereferenced
 * unconditionally via page_to_pfn(), so it must not be NULL.
 * NOTE(review): order is an unsigned int but is printed with %d.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=%lu order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched - records the pfn of @page.
 *
 * No order is stored; the output always prints the literal "order=0".
 * @page is passed to page_to_pfn() unconditionally, so it must not be NULL.
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=%lu order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc - records pfn, order, gfp flags and migratetype.
 *
 * A NULL @page is stored as the sentinel pfn -1UL. TP_printk recognises the
 * sentinel and prints a NULL page pointer and pfn 0 instead of converting it
 * with pfn_to_page().
 * NOTE(review): order is an unsigned int but is printed with %d.
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        gfp_t,                gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page - event class recording pfn, order and migratetype.
 *
 * A NULL @page is stored as the sentinel pfn -1UL and printed as a NULL page
 * pointer with pfn 0. The "percpu_refill" value in the output is not stored;
 * it is derived at print time as (order == 0).
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->order == 0)
);

/* mm_page_alloc_zone_locked event: an instance of the mm_page class. */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype)
);

/*
 * mm_page_pcpu_drain - records pfn, order and migratetype of @page.
 *
 * A NULL @page is stored as the sentinel pfn -1UL. TP_printk checks for the
 * sentinel before converting the pfn back to a page pointer, and prints a
 * NULL page pointer and pfn 0 in that case. This matches how mm_page_alloc
 * and the mm_page class treat the same sentinel, and avoids handing -1UL to
 * pfn_to_page().
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag - records a fallback allocation.
 *
 * Stored: the pfn of @page, the requested and fallback orders, the requested
 * and fallback migratetypes, and change_ownership, which is computed at
 * record time as (alloc_migratetype == get_pageblock_migratetype(page)).
 *
 * Derived at print time: pageblock_order is printed as-is and "fragmenting"
 * is (fallback_order < pageblock_order).
 *
 * @page is used unconditionally (page_to_pfn, get_pageblock_migratetype), so
 * it must not be NULL.
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * Required for uniquely and securely identifying mm in rss_stat tracepoint.
 */
#ifndef __PTR_TO_HASHVAL
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hash;

        /* A non-zero status from ptr_to_hashval() means no hash is available. */
        if (ptr_to_hashval(ptr, &hash) != 0)
                return 0;

        /* Only the low 32 bits carry the hash, so narrowing is intentional. */
        return (unsigned int)hash;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * Trace event rss_stat.
 *
 * Records, for the caller-supplied @mm, @member and @count:
 *   mm_id  - mm_ptr_to_hash(mm), i.e. a hashed form of the mm pointer
 *            (0 when hashing fails) rather than the raw address;
 *   curr   - 1 when @mm is current->mm and current is not a kthread;
 *   member - @member, stored unchanged;
 *   size   - @count shifted left by PAGE_SHIFT and printed with a "B"
 *            suffix (NOTE(review): presumably a page count converted to
 *            bytes - confirm against callers).
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member,
                long count),

        TP_ARGS(mm, member, count),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                /*
                 * curr is true if the mm matches the current task's mm_struct.
                 * Since kthreads (PF_KTHREAD) have no mm_struct of their own
                 * but can borrow one via kthread_use_mm(), we must filter them
                 * out to avoid incorrectly attributing the RSS update to them.
                 */
                __entry->curr = current->mm == mm && !(current->flags & PF_KTHREAD);
                __entry->member = member;
                __entry->size = (count << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __entry->member,
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the TCP module.
 *
 * Version:        @(#)tcp.h        1.0.5        05/23/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _TCP_H
#define _TCP_H

#define FASTRETRANS_DEBUG 1

#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/indirect_call_wrapper.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/inet_ecn.h>
#include <net/dst.h>
#include <net/mptcp.h>

#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>

extern struct inet_hashinfo tcp_hashinfo;

DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);

void tcp_time_wait(struct sock *sk, int state, int timeo);

/*
 * Header and option sizing.
 *
 * MAX_TCP_HEADER is 128 + MAX_HEADER rounded up by L1_CACHE_ALIGN().
 * MAX_TCP_OPTION_SPACE is 40 bytes of options, and TCP_MIN_SND_MSS is 48.
 * TCP_MIN_GSO_SIZE is their difference: 48 - 40 = 8.
 */
#define MAX_TCP_HEADER        L1_CACHE_ALIGN(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS                48
#define TCP_MIN_GSO_SIZE        (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)

/*
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths!
 */
#define MAX_TCP_WINDOW                32767U

/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS                88U

/* The initial MTU to use for probing */
#define TCP_BASE_MSS                1024

/* probing interval, default to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL        600

/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD        8

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS        16U

/* Maximal window scale shift count according to RFC1323 */
#define TCP_MAX_WSCALE                14U

/* urg_data states */
#define TCP_URG_VALID        0x0100
#define TCP_URG_NOTYET        0x0200
#define TCP_URG_READ        0x0400

#define TCP_RETR1        3        /*
                                 * This is how many retries it does before it
                                 * tries to figure out if the gateway is
                                 * down. Minimal RFC value is 3; it corresponds
                                 * to ~3sec-8min depending on RTO.
                                 */

#define TCP_RETR2        15        /*
                                 * This should take at least
                                 * 90 minutes to time out.
                                 * RFC1122 says that the limit is 100 sec.
                                 * 15 is ~13-30min depending on RTO.
                                 */

#define TCP_SYN_RETRIES         6        /* This is how many retries are done
                                 * when active opening a connection.
                                 * RFC1122 says the minimum retry MUST
                                 * be at least 180secs.  Nevertheless
                                 * this value corresponds to
                                 * 63secs of retransmission with the
                                 * current initial RTO
                                 * (1+2+4+8+16+32 with a 1sec initial
                                 * RTO doubling on each retry).
                                 */

#define TCP_SYNACK_RETRIES 5        /* This is how many retries are done
                                 * when passive opening a connection.
                                 * This corresponds to 31secs of
                                 * retransmission with the current
                                 * initial RTO (1+2+4+8+16 with a 1sec
                                 * initial RTO doubling on each retry).
                                 */

#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
                                  * state, about 60 seconds        */
#define TCP_FIN_TIMEOUT        TCP_TIMEWAIT_LEN
                                 /* BSD style FIN_WAIT2 deadlock breaker.
                                  * It used to be 3min, new value is 60sec,
                                  * to combine FIN-WAIT-2 timeout with
                                  * TIME-WAIT timer.
                                  */
#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */

/*
 * Delayed-ACK and retransmission timer bounds, expressed in jiffies
 * (HZ jiffies make up one second, so HZ/5 is a fifth of a second and
 * HZ/25 is one twenty-fifth).
 */
#define TCP_DELACK_MAX        ((unsigned)(HZ/5))        /* maximal time to delay before sending an ACK */
/*
 * With HZ below 100, HZ/25 would come out at fewer than 4 jiffies, so a
 * fixed 4-jiffy value is used for both minimums instead.
 */
#if HZ >= 100
#define TCP_DELACK_MIN        ((unsigned)(HZ/25))        /* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN        ((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN        4U
#define TCP_ATO_MIN        4U
#endif
/* Retransmission timeout is bounded to [HZ/5, 120*HZ] jiffies. */
#define TCP_RTO_MAX        ((unsigned)(120*HZ))
#define TCP_RTO_MIN        ((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN        (2U) /* Min timeout for TCP timers in jiffies */

#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */

#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))        /* RFC6298 2.1 initial RTO value        */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))        /* RFC 1122 initial RTO value, now
                                                 * used as a fallback RTO for the
                                                 * initial data transmission if no
                                                 * valid RTT sample has been acquired,
                                                 * most likely due to retrans in 3WHS.
                                                 */

#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
                                                         * for local resources.
                                                         */
#define TCP_KEEPALIVE_TIME        (120*60*HZ)        /* two hours */
#define TCP_KEEPALIVE_PROBES        9                /* Max of 9 keepalive probes        */
#define TCP_KEEPALIVE_INTVL        (75*HZ)

#define MAX_TCP_KEEPIDLE        32767
#define MAX_TCP_KEEPINTVL        32767
#define MAX_TCP_KEEPCNT                127
#define MAX_TCP_SYNCNT                127

#define TCP_SYNQ_INTERVAL        (HZ/5)        /* Period of SYNACK timer */

#define TCP_PAWS_24DAYS        (60 * 60 * 24 * 24)
#define TCP_PAWS_MSL        60                /* Per-host timestamps are invalidated
                                         * after this time. It should be equal
                                         * (or greater than) TCP_TIMEWAIT_LEN
                                         * to provide reliability equal to one
                                         * provided by timewait state.
                                         */
#define TCP_PAWS_WINDOW        1                /* Replay window for per-host
                                         * timestamps. It must be less than
                                         * minimal timewait lifetime.
                                         */
/*
 *        TCP option
 */

#define TCPOPT_NOP                1        /* Padding */
#define TCPOPT_EOL                0        /* End of options */
#define TCPOPT_MSS                2        /* Segment size negotiating */
#define TCPOPT_WINDOW                3        /* Window scaling */
#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
#define TCPOPT_SACK             5       /* SACK Block */
#define TCPOPT_TIMESTAMP        8        /* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG                19        /* MD5 Signature (RFC2385) */
#define TCPOPT_MPTCP                30        /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN                34        /* Fast open (RFC7413) */
#define TCPOPT_EXP                254        /* Experimental */
/* Magic number to be after the option value for sharing TCP
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
 */
#define TCPOPT_FASTOPEN_MAGIC        0xF989
#define TCPOPT_SMC_MAGIC        0xE2D4C3D9

/*
 *     TCP option lengths
 */

#define TCPOLEN_MSS            4
#define TCPOLEN_WINDOW         3
#define TCPOLEN_SACK_PERM      2
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED                12
#define TCPOLEN_WSCALE_ALIGNED                4
#define TCPOLEN_SACKPERM_ALIGNED        4
#define TCPOLEN_SACK_BASE                2
#define TCPOLEN_SACK_BASE_ALIGNED        4
#define TCPOLEN_SACK_PERBLOCK                8
#define TCPOLEN_MD5SIG_ALIGNED                20
#define TCPOLEN_MSS_ALIGNED                4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED        8

/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF                1        /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK                2        /* Socket is corked            */
#define TCP_NAGLE_PUSH                4        /* Cork is overridden for already queued data */

/* TCP thin-stream limits */
#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */

/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND                10

/* Bit Flags for sysctl_tcp_fastopen */
#define        TFO_CLIENT_ENABLE        1
#define        TFO_SERVER_ENABLE        2
#define        TFO_CLIENT_NO_COOKIE        4        /* Data in SYN w/o cookie option */

/* Accept SYN data w/o any cookie option */
#define        TFO_SERVER_COOKIE_NOT_REQD        0x200

/* Force enable TFO on all listeners, i.e., not requiring the
 * TCP_FASTOPEN socket option.
 */
#define        TFO_SERVER_WO_SOCKOPT1        0x400


/* sysctl variables for tcp */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];

#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;

/*
 * TCP-specific, optimized counterpart of sk_under_memory_pressure().
 *
 * Reports pressure when socket memcg accounting is enabled and the socket's
 * memcg is under socket pressure; otherwise reports the current value of the
 * global tcp_memory_pressure indicator.
 */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
        const bool memcg_pressure = mem_cgroup_sockets_enabled &&
                                    sk->sk_memcg &&
                                    mem_cgroup_under_socket_pressure(sk->sk_memcg);

        if (memcg_pressure)
                return true;

        return READ_ONCE(tcp_memory_pressure);
}
/*
 * The next routines deal with comparing 32 bit unsigned ints
 * and worry about wraparound (automatic with unsigned arithmetic).
 */

static inline bool before(__u32 seq1, __u32 seq2)
{
        /*
         * The difference is taken modulo 2^32 and then viewed as signed,
         * so its sign gives the ordering even across a wraparound.
         */
        const __s32 delta = (__s32)(seq1 - seq2);

        return delta < 0;
}
/* after(a, b) is before(b, a): the same test with the operands exchanged. */
#define after(seq2, seq1)         before(seq1, seq2)

/*
 * Is seq2 <= seq1 <= seq3, with all arithmetic done modulo 2^32?
 * Both distances are measured forward from seq2.
 */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
        const __u32 window = seq3 - seq2;
        const __u32 offset = seq1 - seq2;

        return offset <= window;
}

/*
 * True when the socket has more than SOCK_MIN_SNDBUF queued for write and
 * the protocol's allocated memory exceeds limit index 2 of its memory limits.
 */
static inline bool tcp_out_of_memory(struct sock *sk)
{
        return sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
               sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2);
}

void sk_forced_mem_schedule(struct sock *sk, int size);

bool tcp_check_oom(struct sock *sk, int shift);


extern struct proto tcp_prot;

/*
 * TCP MIB counter helpers.  Each one forwards to the SNMP_* macro of the
 * same suffix, operating on the tcp_statistics member of @net's mib.
 * __TCP_INC_STATS maps to the double-underscore __SNMP_INC_STATS variant.
 */
#define TCP_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define __TCP_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_DEC_STATS(net, field)        SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val)        SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)

void tcp_tasklet_init(void);

int tcp_v4_err(struct sk_buff *skb, u32);

void tcp_shutdown(struct sock *sk, int how);

int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
                 int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
                        size_t size, int flags);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                 size_t size, int flags);
int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
              int size_goal);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags);

/*
 * Consume quick-ACK budget for @sk.
 *
 * Does nothing when the budget (icsk_ack.quick) is already zero.  Otherwise
 * one unit is charged if an ACK is currently scheduled; when that charge
 * meets or exceeds the remaining budget, quickack mode is left and the ATO
 * is reset to TCP_ATO_MIN.
 */
static inline void tcp_dec_quickack_mode(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned int charge;

        if (!icsk->icsk_ack.quick)
                return;

        /* How many ACKs S/ACKing new data have we sent? */
        charge = inet_csk_ack_scheduled(sk) ? 1 : 0;

        if (charge < icsk->icsk_ack.quick) {
                icsk->icsk_ack.quick -= charge;
                return;
        }

        /* Leaving quickack mode we deflate ATO. */
        icsk->icsk_ack.quick = 0;
        icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/*
 * ECN flag values.  Each constant is a distinct power of two (1, 2, 4, 8).
 * NOTE(review): presumably OR-ed together in a per-socket ECN flags field -
 * confirm against the users of these constants.
 */
#define        TCP_ECN_OK                1
#define        TCP_ECN_QUEUE_CWR        2
#define        TCP_ECN_DEMAND_CWR        4
#define        TCP_ECN_SEEN                8

/* Result codes returned by tcp_timewait_state_process(). */
enum tcp_tw_status {
        TCP_TW_SUCCESS,                /* 0 */
        TCP_TW_RST,                /* 1 */
        TCP_TW_ACK,                /* 2 */
        TCP_TW_SYN,                /* 3 */
};


enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
                                              struct sk_buff *skb,
                                              const struct tcphdr *th);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req, bool fastopen,
                           bool *lost_race);
int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void __tcp_close(struct sock *sk, long timeout);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
__poll_t tcp_poll(struct file *file, struct socket *sock,
                      struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
                   char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
#ifdef CONFIG_MMU
int tcp_mmap(struct file *file, struct socket *sock,
             struct vm_area_struct *vma);
#endif
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
                       struct tcp_options_received *opt_rx,
                       int estab, struct tcp_fastopen_cookie *foc);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);

/*
 *        BPF SKB-less helpers
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
                         struct tcphdr *th, u32 *cookie);
u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
                         struct tcphdr *th, u32 *cookie);
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
                          const struct tcp_request_sock_ops *af_ops,
                          struct sock *sk, struct tcphdr *th);
/*
 *        TCP v4 functions exported for the inet6 API
 */

void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
                                      struct request_sock *req,
                                      struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
/* Selector handed to tcp_make_synack() through its synack_type argument. */
enum tcp_synack_type {
        TCP_SYNACK_NORMAL,
        TCP_SYNACK_FASTOPEN,
        TCP_SYNACK_COOKIE,
};
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
                                enum tcp_synack_type synack_type,
                                struct sk_buff *syn_skb);
int tcp_disconnect(struct sock *sk, int flags);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);

/* From syncookies.c */
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
                                 struct request_sock *req,
                                 struct dst_entry *dst, u32 tsoff);
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
                      u32 cookie);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
                                            const struct tcp_request_sock_ops *af_ops,
                                            struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES

/* Syncookies use a monotonic timer which increments every 60 seconds.
 * This counter is used both as a hash input and partially encoded into
 * the cookie value.  A cookie is only validated further if the delta
 * between the current counter value and the encoded one is less than this,
 * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
 * the counter advances immediately after a cookie is generated).
 */
/* Maximum accepted cookie age, counted in TCP_SYNCOOKIE_PERIOD ticks. */
#define MAX_SYNCOOKIE_AGE        2
/* Length of one syncookie counter tick: 60 * HZ jiffies. */
#define TCP_SYNCOOKIE_PERIOD        (60 * HZ)
/* Cookie validity window in jiffies: age limit times tick length. */
#define TCP_SYNCOOKIE_VALID        (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)

/* syncookies: remember time of last synqueue overflow
 * But do not dirty this field too often (once per second is enough)
 * It is racy as we do not hold a lock, but race is very minor.
 */
static inline void tcp_synq_overflow(const struct sock *sk)
{
        unsigned int now = jiffies;
        unsigned int stamp;

        if (sk->sk_reuseport) {
                struct sock_reuseport *reuse =
                        rcu_dereference(sk->sk_reuseport_cb);

                /* Use the stamp kept in the sk_reuseport_cb structure. */
                if (likely(reuse)) {
                        stamp = READ_ONCE(reuse->synq_overflow_ts);
                        /* Stamp refreshed within the last HZ jiffies:
                         * leave it alone.
                         */
                        if (time_between32(now, stamp, stamp + HZ))
                                return;
                        WRITE_ONCE(reuse->synq_overflow_ts, now);
                        return;
                }
        }

        /* Otherwise the stamp is kept in rx_opt.ts_recent_stamp. */
        stamp = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
        if (time_between32(now, stamp, stamp + HZ))
                return;
        WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
}

/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
        unsigned int now = jiffies;
        unsigned int stamp;

        if (sk->sk_reuseport) {
                struct sock_reuseport *reuse =
                        rcu_dereference(sk->sk_reuseport_cb);

                if (likely(reuse)) {
                        stamp = READ_ONCE(reuse->synq_overflow_ts);
                        return !time_between32(now, stamp - HZ,
                                               stamp + TCP_SYNCOOKIE_VALID);
                }
        }

        stamp = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);

        /* stamp <= jiffies <= stamp + TCP_SYNCOOKIE_VALID means a synflood
         * is in progress.  The lower bound is relaxed to 'stamp - HZ'
         * because a concurrent tcp_synq_overflow() may update
         * .ts_recent_stamp after jiffies was sampled into 'now' but before
         * the stamp was read above; without the slack a valid syncookie
         * could be rejected.
         */
        return !time_between32(now, stamp - HZ, stamp + TCP_SYNCOOKIE_VALID);
}

/* Current syncookie counter: 64-bit jiffies divided by TCP_SYNCOOKIE_PERIOD,
 * truncated to 32 bits by the return type.
 */
static inline u32 tcp_cookie_time(void)
{
        u64 ticks = get_jiffies_64();

        /* the quotient is left in 'ticks' */
        do_div(ticks, TCP_SYNCOOKIE_PERIOD);
        return ticks;
}

u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
                              u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
u64 cookie_init_timestamp(struct request_sock *req, u64 now);
bool cookie_timestamp_decode(const struct net *net,
                             struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
                   const struct net *net, const struct dst_entry *dst);

/* From net/ipv6/syncookies.c */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
                      u32 cookie);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);

u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
                              const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */

void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
                               int nonagle);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
/* Queue selector handed to tcp_fragment() through its tcp_queue argument. */
enum tcp_queue {
        TCP_FRAG_IN_WRITE_QUEUE,
        TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
                 struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                             const struct sk_buff *next_skb);

/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk);

/* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
        if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
                __sock_put(sk);

        if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
                __sock_put(sk);

        inet_csk_clear_xmit_timers(sk);
}

unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);

/* Bound MSS / TSO packet size with the half of the window */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
        /* A peer window of at most TCP_MSS_DEFAULT is used whole: there is
         * no use in cutting packets below MSS for the sake of SWS or to
         * keep enough packets in the pipe for fast recovery.  Larger
         * windows are halved, which also makes sense for devices with an
         * extremely large MSS.
         */
        int cutoff = tp->max_window > TCP_MSS_DEFAULT ?
                     tp->max_window >> 1 : tp->max_window;

        /* Zero cutoff, or the packet already fits: nothing to clamp. */
        if (!cutoff || pktsize <= cutoff)
                return pktsize;

        /* Never return less than 68 minus the TCP header length. */
        return max_t(int, cutoff, 68U - tp->tcp_header_len);
}

/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);

/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                  sk_read_actor_t recv_actor);

void tcp_initialize_rcv_mss(struct sock *sk);

int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);

static inline void tcp_bound_rto(const struct sock *sk)
{
        if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
                inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

/* Convert rttvar_us + (srtt_us >> 3) from microseconds to jiffies. */
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
        return usecs_to_jiffies(tp->rttvar_us + (tp->srtt_us >> 3));
}

/* Store (tcp_header_len << 26) | ACK flag | snd_wnd, converted to network
 * byte order, in tp->pred_flags.  Does nothing for MPTCP sockets.
 */
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
        /* mptcp hooks live on the slow path only, so leave pred_flags as is */
        if (!sk_is_mptcp((struct sock *)tp))
                tp->pred_flags = htonl((tp->tcp_header_len << 26) |
                                       ntohl(TCP_FLAG_ACK) |
                                       snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
        __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

/* Call tcp_fast_path_on() only when all of the following hold: the
 * out-of-order queue is empty, rcv_wnd is non-zero, sk_rmem_alloc is below
 * sk_rcvbuf, and no urgent data is flagged.
 */
static inline void tcp_fast_path_check(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
                return;
        if (!tp->rcv_wnd)
                return;
        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                return;
        if (tp->urg_data)
                return;

        tcp_fast_path_on(tp);
}

/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
        const struct dst_entry *dst = __sk_dst_get(sk);

        /* A locked RTAX_RTO_MIN route metric overrides the socket value. */
        if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
                return dst_metric_rtt(dst, RTAX_RTO_MIN);

        return inet_csk(sk)->icsk_rto_min;
}

/* tcp_rto_min() converted to microseconds with jiffies_to_usecs(). */
static inline u32 tcp_rto_min_us(struct sock *sk)
{
        u32 rto_min = tcp_rto_min(sk);

        return jiffies_to_usecs(rto_min);
}

/* True when the RTAX_CC_ALGO metric of 'dst' is locked. */
static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
        return dst_metric_locked(dst, RTAX_CC_ALGO) != 0;
}

/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
        /* value currently reported by minmax_get() for tp->rtt_min */
        u32 min_rtt_us = minmax_get(&tp->rtt_min);

        return min_rtt_us;
}

/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer pushes more data
 * than the offered window.
 */
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
        /* Signed on purpose: rcv_nxt may lie beyond rcv_wup + rcv_wnd,
         * in which case the difference is negative and reported as 0.
         */
        s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

        return win < 0 ? 0 : (u32) win;
}

/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 */
u32 __tcp_select_window(struct sock *sk);

void tcp_send_window_probe(struct sock *sk);

/* TCP uses 32bit jiffies to save some space.
 * Note that this is different from tcp_time_stamp, which
 * historically has been the same until linux-4.13.
 */
#define tcp_jiffies32 ((u32)jiffies)

/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to 1 ms clock.
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ        1000

/* TCP clock in nanoseconds, as returned by ktime_get_ns(). */
static inline u64 tcp_clock_ns(void)
{
        u64 now_ns = ktime_get_ns();

        return now_ns;
}

/* TCP clock in microseconds: tcp_clock_ns() divided by NSEC_PER_USEC. */
static inline u64 tcp_clock_us(void)
{
        u64 now_ns = tcp_clock_ns();

        return div_u64(now_ns, NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
{
        /* microseconds per TCP_TS_HZ tick */
        const u32 usec_per_tick = USEC_PER_SEC / TCP_TS_HZ;

        return div_u64(tp->tcp_mstamp, usec_per_tick);
}

/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
static inline u64 tcp_ns_to_ts(u64 ns)
{
        /* nanoseconds per TCP_TS_HZ tick */
        const u32 ns_per_tick = NSEC_PER_SEC / TCP_TS_HZ;

        return div_u64(ns, ns_per_tick);
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
{
        return tcp_ns_to_ts(tcp_clock_ns());
}

void tcp_mstamp_refresh(struct tcp_sock *tp);

/* t1 - t0 evaluated as a signed 64-bit value and clamped below at 0; the
 * result is then truncated to 32 bits by the return type.
 */
static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
        s64 delta = t1 - t0;

        return delta > 0 ? delta : 0;
}

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
        return tcp_ns_to_ts(skb->skb_mstamp_ns);
}

/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
        u64 stamp_ns = skb->skb_mstamp_ns;

        /* nanoseconds -> microseconds */
        return div_u64(stamp_ns, NSEC_PER_USEC);
}


/* Byte at offset 13 of the TCP header pointed to by 'th'. */
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

/* Single-bit masks, one per TCP header flag. */
#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

/* SYN, ECE and CWR combined. */
#define TCPHDR_SYN_ECN        (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)

/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here too.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
        __u32                seq;                /* Starting sequence number        */
        __u32                end_seq;        /* SEQ + FIN + SYN + datalen        */
        union {
                /* Note : tcp_tw_isn is used in input path only
                 *          (isn chosen by tcp_timewait_state_process())
                 *
                 *           tcp_gso_segs/size are used in write queue only,
                 *          cf tcp_skb_pcount()/tcp_skb_mss()
                 */
                __u32                tcp_tw_isn;
                struct {
                        u16        tcp_gso_segs;
                        u16        tcp_gso_size;
                };
        };
        __u8                tcp_flags;        /* TCP header flags. (tcp[13])        */

        __u8                sacked;                /* State flags for SACK.        */
        /* Bit values for the 'sacked' field above */
#define TCPCB_SACKED_ACKED        0x01        /* SKB ACK'd by a SACK block        */
#define TCPCB_SACKED_RETRANS        0x02        /* SKB retransmitted                */
#define TCPCB_LOST                0x04        /* SKB is lost                        */
#define TCPCB_TAGBITS                0x07        /* All tag bits                        */
#define TCPCB_REPAIRED                0x10        /* SKB repaired (no skb_mstamp_ns)        */
#define TCPCB_EVER_RETRANS        0x80        /* Ever retransmitted frame        */
#define TCPCB_RETRANS                (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
                                TCPCB_REPAIRED)

        __u8                ip_dsfield;        /* IPv4 tos or IPv6 dsfield        */
        __u8                txstamp_ack:1,        /* Record TX timestamp for ack? */
                        eor:1,                /* Is skb MSG_EOR marked? */
                        has_rxtstamp:1,        /* SKB has a RX timestamp        */
                        unused:5;
        __u32                ack_seq;        /* Sequence number ACK'd        */
        /* The three members of this union share the same storage. */
        union {
                struct {
                        /* There is space for up to 24 bytes */
                        __u32 in_flight:30,/* Bytes in flight at transmit */
                              is_app_limited:1, /* cwnd not fully used? */
                              unused:1;
                        /* pkts S/ACKed so far upon tx of skb, incl retrans: */
                        __u32 delivered;
                        /* start of send pipeline phase */
                        u64 first_tx_mstamp;
                        /* when we reached the "delivered" count */
                        u64 delivered_mstamp;
                } tx;   /* only used for outgoing skbs */
                union {
                        struct inet_skb_parm        h4;
#if IS_ENABLED(CONFIG_IPV6)
                        struct inet6_skb_parm        h6;
#endif
                } header;        /* For incoming skbs */
                /* Accessed by bpf_compute_data_end_sk_skb() and the
                 * tcp_skb_bpf_*() helpers.
                 */
                struct {
                        __u32 flags;
                        struct sock *sk_redir;
                        void *data_end;
                } bpf;
        };
};

/* View the start of __skb->cb[] as a struct tcp_skb_cb. */
#define TCP_SKB_CB(__skb)        ((struct tcp_skb_cb *)&((__skb)->cb[0]))

static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
}

static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
}

static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->bpf.sk_redir;
}

static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
}

extern const struct inet_connection_sock_af_ops ipv4_specific;

#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
 * as TCP moves IP6CB into a different location in skb->cb[]
 */
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->header.h6.iif;
}

static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
{
        bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);

        return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v6_sdif(const struct sk_buff *skb)
{
        /* 0 unless CONFIG_NET_L3_MASTER_DEV is enabled and the check
         * below succeeds
         */
        int sdif = 0;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
                sdif = TCP_SKB_CB(skb)->header.h6.iif;
#endif
        return sdif;
}

extern const struct inet_connection_sock_af_ops ipv6_specific;

INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
void tcp_v6_early_demux(struct sk_buff *skb);

#endif

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v4_sdif(const struct sk_buff *skb)
{
        /* The skb is only read here, so it is taken as const, matching
         * tcp_v6_sdif().  A NULL skb is tolerated and yields 0.
         */
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        /* h4.iif is returned only when ipv4_l3mdev_skb() accepts h4.flags */
        if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
                return TCP_SKB_CB(skb)->header.h4.iif;
#endif
        return 0;
}

/* Due to TSO, an SKB can be composed of multiple actual
 * packets.  To keep these tracked properly, we use this.
 */
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->tcp_gso_segs;
}

static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
        TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}

static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
        TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}

/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->tcp_gso_size;
}

static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
        return likely(!TCP_SKB_CB(skb)->eor);
}

/* True when 'to' passes tcp_skb_can_collapse_to() and
 * mptcp_skb_can_collapse() accepts the (to, from) pair.  The MPTCP check
 * is only evaluated if the first test succeeds.
 */
static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
                                        const struct sk_buff *from)
{
        bool ok = tcp_skb_can_collapse_to(to) &&
                  mptcp_skb_can_collapse(to, from);

        return likely(ok);
}

/* Events passed to congestion control interface.  tcp_ca_event() hands
 * them to the algorithm's optional cwnd_event() hook.
 */
enum tcp_ca_event {
        CA_EVENT_TX_START,        /* first transmit when no packets in flight */
        CA_EVENT_CWND_RESTART,        /* congestion window restart */
        CA_EVENT_COMPLETE_CWR,        /* end of congestion recovery */
        CA_EVENT_LOSS,                /* loss timeout */
        CA_EVENT_ECN_NO_CE,        /* ECT set, but not CE marked */
        CA_EVENT_ECN_IS_CE,        /* received CE marked IP packet */
};

/* Information about inbound ACK, passed to cong_ops->in_ack_event().
 * Each value occupies its own bit.
 */
enum tcp_ca_ack_event_flags {
        CA_ACK_SLOWPATH                = (1 << 0),        /* In slow path processing */
        CA_ACK_WIN_UPDATE        = (1 << 1),        /* ACK updated window */
        CA_ACK_ECE                = (1 << 2),        /* ECE bit is set on ack */
};

/*
 * Interface for adding new TCP congestion control handlers
 */
/* Size of the name[] array in struct tcp_congestion_ops */
#define TCP_CA_NAME_MAX        16
#define TCP_CA_MAX        128
/* TCP_CA_NAME_MAX bytes for each of TCP_CA_MAX entries */
#define TCP_CA_BUF_MAX        (TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC        0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN        0x2
/* Both TCP_CONG_* flag bits above, OR-ed together */
#define TCP_CONG_MASK        (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)

union tcp_cc_info;

/* Argument of the tcp_congestion_ops->pkts_acked() hook. */
struct ack_sample {
        u32 pkts_acked;
        s32 rtt_us;        /* signed; presumably microseconds, going by the name */
        u32 in_flight;
};

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally choose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
        u64  prior_mstamp; /* starting timestamp for interval */
        u32  prior_delivered;        /* tp->delivered at "prior_mstamp" */
        s32  delivered;                /* number of packets delivered over interval */
        long interval_us;        /* time for tp->delivered to incr "delivered" */
        u32 snd_interval_us;        /* snd interval for delivered packets */
        u32 rcv_interval_us;        /* rcv interval for delivered packets */
        long rtt_us;                /* RTT of last (S)ACKed packet (or -1) */
        int  losses;                /* number of packets marked lost upon ACK */
        u32  acked_sacked;        /* number of packets newly (S)ACKed upon ACK */
        u32  prior_in_flight;        /* in flight before this ACK */
        u32  last_end_seq;        /* end_seq of most recently ACKed packet */
        bool is_app_limited;        /* is sample from packet with bubble in pipe? */
        bool is_retrans;        /* is sample from retransmission? */
        bool is_ack_delayed;        /* is this (likely) a delayed ACK? */
};

/* Operations table describing one congestion control algorithm; this is
 * the type taken by tcp_register_congestion_control() and
 * tcp_unregister_congestion_control().
 */
struct tcp_congestion_ops {
        struct list_head        list;
        /* same type as the key taken by tcp_ca_find_key() */
        u32 key;
        /* TCP_CONG_* bits; tcp_ca_needs_ecn() tests TCP_CONG_NEEDS_ECN */
        u32 flags;

        /* initialize private data (optional) */
        void (*init)(struct sock *sk);
        /* cleanup private data  (optional) */
        void (*release)(struct sock *sk);

        /* return slow start threshold (required) */
        u32 (*ssthresh)(struct sock *sk);
        /* do new cwnd calculation (required) */
        void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
        /* call before changing ca_state (optional) */
        void (*set_state)(struct sock *sk, u8 new_state);
        /* call when cwnd event occurs (optional) */
        void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
        /* call when ack arrives (optional) */
        void (*in_ack_event)(struct sock *sk, u32 flags);
        /* new value of cwnd after loss (required) */
        u32  (*undo_cwnd)(struct sock *sk);
        /* hook for packet ack accounting (optional) */
        void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
        /* override sysctl_tcp_min_tso_segs */
        u32 (*min_tso_segs)(struct sock *sk);
        /* returns the multiplier used in tcp_sndbuf_expand (optional) */
        u32 (*sndbuf_expand)(struct sock *sk);
        /* call when packets are delivered to update cwnd and pacing rate,
         * after all the ca_state processing. (optional)
         */
        void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
        /* get info for inet_diag (optional) */
        size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
                           union tcp_cc_info *info);

        /* algorithm name, at most TCP_CA_NAME_MAX bytes */
        char                 name[TCP_CA_NAME_MAX];
        struct module         *owner;
};

int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);

void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(struct net *net, const char *name);
void tcp_get_default_congestion_control(struct net *net, char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
                               bool cap_net_admin);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

u32 tcp_reno_ssthresh(struct sock *sk);
u32 tcp_reno_undo_cwnd(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;

struct tcp_congestion_ops *tcp_ca_find(const char *name);
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
/* Stub used when CONFIG_INET is not set: ignores both arguments and
 * always returns NULL.
 */
static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
        return NULL;
}
#endif

static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
}

static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->set_state)
                icsk->icsk_ca_ops->set_state(sk, ca_state);
        icsk->icsk_ca_state = ca_state;
}

static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->cwnd_event)
                icsk->icsk_ca_ops->cwnd_event(sk, event);
}

/* From tcp_rate.c */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
                            struct rate_sample *rs);
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
                  bool is_sack_reneg, struct rate_sample *rs);
void tcp_rate_check_app_limited(struct sock *sk);

/* Order two transmissions by timestamp; equal timestamps are ordered by
 * sequence number using after(seq1, seq2).
 */
static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
        if (t1 != t2)
                return t1 > t2;

        return after(seq1, seq2);
}

/* These functions determine how the current flow behaves in respect of SACK
 * handling. SACK is negotiated with the peer, and therefore it can vary
 * between different flows.
 *
 * tcp_is_sack - SACK enabled
 * tcp_is_reno - No SACK
 */
static inline int tcp_is_sack(const struct tcp_sock *tp)
{
        /* any non-zero rx_opt.sack_ok is reported as 1 */
        if (likely(tp->rx_opt.sack_ok))
                return 1;

        return 0;
}

/* Exact negation of tcp_is_sack(). */
static inline bool tcp_is_reno(const struct tcp_sock *tp)
{
        return tcp_is_sack(tp) == 0;
}

/* Sum of the lost_out and sacked_out counters. */
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
        return tp->lost_out + tp->sacked_out;
}

/* This determines how many packets are "in the network" to the best
 * of our knowledge.  In many cases it is conservative, but where
 * detailed information is available from the receiver (via SACK
 * blocks etc.) we can make more aggressive calculations.
 *
 * Use this for decisions involving congestion control, use just
 * tp->packets_out to determine if the send queue is empty or not.
 *
 * Read this equation as:
 *
 *        "Packets sent once on transmission queue" MINUS
 *        "Packets left network, but not honestly ACKed yet" PLUS
 *        "Packets fast retransmitted"
 */
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
        /* sacked_out + lost_out */
        unsigned int left_out = tcp_left_out(tp);

        return tp->packets_out - left_out + tp->retrans_out;
}

/* tcp_in_initial_slowstart() is true while snd_ssthresh is at or above this. */
#define TCP_INFINITE_SSTHRESH        0x7fffffff

/* True while snd_cwnd is strictly below snd_ssthresh. */
static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
{
        return tp->snd_ssthresh > tp->snd_cwnd;
}

/* True while snd_ssthresh is not below TCP_INFINITE_SSTHRESH. */
static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
{
        return !(tp->snd_ssthresh < TCP_INFINITE_SSTHRESH);
}

/* True when the bit for the current icsk_ca_state is one of TCPF_CA_CWR
 * or TCPF_CA_Recovery.
 */
static inline bool tcp_in_cwnd_reduction(const struct sock *sk)
{
        const int state_bit = 1 << inet_csk(sk)->icsk_ca_state;

        return (state_bit & (TCPF_CA_CWR | TCPF_CA_Recovery)) != 0;
}

/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
 * The exception is cwnd reduction phase, when cwnd is decreasing towards
 * ssthresh.
 */
static inline __u32 tcp_current_ssthresh(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* During cwnd reduction snd_ssthresh is reported unchanged. */
        if (tcp_in_cwnd_reduction(sk))
                return tp->snd_ssthresh;

        /* Otherwise the larger of snd_ssthresh and
         * (snd_cwnd >> 1) + (snd_cwnd >> 2), about 3/4 of snd_cwnd.
         */
        return max(tp->snd_ssthresh,
                   ((tp->snd_cwnd >> 1) +
                    (tp->snd_cwnd >> 2)));
}

/* Use define here intentionally to get WARN_ON location shown at the caller */
#define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > tp->packets_out)

void tcp_enter_cwr(struct sock *sk);
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);

/* The maximum number of MSS of available cwnd for which TSO defers
 * sending if not using sysctl_tcp_tso_win_divisor.
 */
static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp)
{
        /* fixed limit; 'tp' is not consulted */
        const __u32 max_deferred_mss = 3;

        return max_deferred_mss;
}

/* End sequence number of the receiver's advertised window:
 * first unacknowledged sequence plus the send window.
 */
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
        return tp->snd_wnd + tp->snd_una;
}

/* cwnd validation in the spirit of RFC2861, but more flexible.
 * The RFC suggests cwnd should not be raised unless it was fully used
 * previously; outside slow start only tp->is_cwnd_limited permits growth.
 * In slow start cwnd may also grow while it is below twice max_packets_out.
 * Example :
 *    cwnd is 10 (IW10), but application sends 9 frames.
 *    We allow cwnd to reach 18 when all frames are ACKed.
 * This is as aggressive as slow start itself, which already risks 100%
 * overshoot. It discourages applications from sending filler packets or
 * data just to inflate cwnd usage, and lets application-limited flows
 * probe for bandwidth more aggressively.
 */
static inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* Second operand: in slow start, let cwnd grow to twice what was sent. */
        return tp->is_cwnd_limited ||
               (tcp_in_slow_start(tp) &&
                tp->snd_cwnd < 2 * tp->max_packets_out);
}

/* Tell whether the TCP stack has to pace packets itself.
 * BBR congestion control needs pacing, and so does SO_MAX_PACING_RATE.
 * The sch_fq packet scheduler handles pacing efficiently, but it is not
 * always installed/used.
 * Returns true when sk_pacing_status is SK_PACING_NEEDED.
 */
static inline bool tcp_needs_internal_pacing(const struct sock *sk)
{
        return SK_PACING_NEEDED == smp_load_acquire(&sk->sk_pacing_status);
}

/* Estimates in how many jiffies next packet for this flow can be sent.
 * Scheduling a retransmit timer too early would be silly.
 */
static inline unsigned long tcp_pacing_delay(const struct sock *sk)
{
        s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;

        return delay > 0 ? nsecs_to_jiffies(delay) : 0;
}

/* Arm transmit timer @what on @sk, pushing the requested expiry @when out by
 * the current pacing delay. @max_when is passed through unchanged.
 */
static inline void tcp_reset_xmit_timer(struct sock *sk,
                                        const int what,
                                        unsigned long when,
                                        const unsigned long max_when)
{
        const unsigned long paced_when = when + tcp_pacing_delay(sk);

        inet_csk_reset_xmit_timer(sk, what, paced_when, max_when);
}

/* Base interval for zero window probes.
 * We get here when something is really bad: an additional packet could not
 * be queued because the qdisc is full, the receiver sent a 0 window, or we
 * are paced. To avoid adding fuel to the fire or aborting too early, the
 * timer armed now is never shorter than TCP_RTO_MIN (200ms per the original
 * note), regardless of the current icsk_rto value (which could be ~2ms).
 */
static inline unsigned long tcp_probe0_base(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        return max_t(unsigned long, icsk->icsk_rto, TCP_RTO_MIN);
}

/* Variant of inet_csk_rto_backoff() used for zero window probes */
static inline unsigned long tcp_probe0_when(const struct sock *sk,
                                            unsigned long max_when)
{
        u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;

        return (unsigned long)min_t(u64, when, max_when);
}

/* Arm the zero window probe timer, but only when nothing is in flight
 * (packets_out == 0) and no other timer is pending (icsk_pending == 0).
 */
static inline void tcp_check_probe_timer(struct sock *sk)
{
        if (tcp_sk(sk)->packets_out || inet_csk(sk)->icsk_pending)
                return;

        tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
                             tcp_probe0_base(sk), TCP_RTO_MAX);
}

/* Set tp->snd_wl1 to @seq. Same body as tcp_update_wl(); presumably the two
 * names mark connection setup vs. later updates - confirm against callers.
 */
static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
{
        tp->snd_wl1 = seq;
}

/* Record @seq in tp->snd_wl1. Same body as tcp_init_wl(). */
static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
{
        tp->snd_wl1 = seq;
}

/*
 * Calculate(/check) the TCP checksum over the IPv4 pseudo header
 * (@saddr, @daddr, @len, IPPROTO_TCP) folded together with @base.
 */
static inline __sum16 tcp_v4_check(int len, __be32 saddr,
                                   __be32 daddr, __wsum base)
{
        __sum16 folded = csum_tcpudp_magic(saddr, daddr, len,
                                           IPPROTO_TCP, base);

        return folded;
}

/* Returns true only when @skb still needs checksum verification and
 * __skb_checksum_complete() returns non-zero for it. The full check is
 * skipped entirely when skb_csum_unnecessary() says so.
 */
static inline bool tcp_checksum_complete(struct sk_buff *skb)
{
        return !(skb_csum_unnecessary(skb) ||
                 !__skb_checksum_complete(skb));
}

/* Prototypes only; these are defined outside this part of the file. */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
int tcp_abort(struct sock *sk, int err);

/* Forget any received SACK state: clear the SACK block count and the
 * D-SACK flag in @rx_opt.
 */
static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
        rx_opt->num_sacks = 0;
        rx_opt->dsack = 0;
}

/* Prototype only. tcp_slow_start_after_idle_check() calls this with the idle
 * time in jiffies as @delta.
 */
void tcp_cwnd_restart(struct sock *sk, s32 delta);

static inline void tcp_slow_start_after_idle_check(struct sock *sk)
{
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        struct tcp_sock *tp = tcp_sk(sk);
        s32 delta;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) ||
            tp->packets_out || ca_ops->cong_control)
                return;
        delta = tcp_jiffies32 - tp->lsndtime;
        if (delta > inet_csk(sk)->icsk_rto)
                tcp_cwnd_restart(sk, delta);
}

/* Determine a window scaling and initial window to offer.
 * Prototype only. The non-const pointer parameters (@rcv_wnd, @window_clamp,
 * @rcv_wscale) are presumably where results are written - confirm against
 * the definition.
 */
void tcp_select_initial_window(const struct sock *sk, int __space,
                               __u32 mss, __u32 *rcv_wnd,
                               __u32 *window_clamp, int wscale_ok,
                               __u8 *rcv_wscale, __u32 init_rcv_wnd);

/* Convert buffer @space into window size using sysctl_tcp_adv_win_scale:
 *   scale >  0: space - space / 2^scale
 *   scale <= 0: space / 2^(-scale)
 */
static inline int tcp_win_from_space(const struct sock *sk, int space)
{
        int scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);

        if (scale > 0)
                return space - (space >> scale);

        return space >> (-scale);
}

/* Window equivalent of the receive buffer space still free: sk_rcvbuf minus
 * the backlog length and minus the receive memory already allocated.
 * Note: caller must be prepared to deal with negative returns.
 */
static inline int tcp_space(const struct sock *sk)
{
        int free_space = READ_ONCE(sk->sk_rcvbuf) -
                         READ_ONCE(sk->sk_backlog.len) -
                         atomic_read(&sk->sk_rmem_alloc);

        return tcp_win_from_space(sk, free_space);
}

/* Window equivalent of the whole receive buffer (sk_rcvbuf). */
static inline int tcp_full_space(const struct sock *sk)
{
        int rcvbuf = READ_ONCE(sk->sk_rcvbuf);

        return tcp_win_from_space(sk, rcvbuf);
}

/* Prototype only; defined outside this part of the file. */
void tcp_cleanup_rbuf(struct sock *sk, int copied);

/* Tell whether the receive side is short on memory.
 * sk_rcvbuf is provisioned at around 200% of sk_rcvlowat. Once 87.5 % (7/8)
 * of it has been consumed we want to override the SO_RCVLOWAT constraint,
 * since we are receiving skbs with too small a len/truesize ratio.
 * Also true whenever tcp_under_memory_pressure() reports pressure.
 */
static inline bool tcp_rmem_pressure(const struct sock *sk)
{
        int rcvbuf;

        if (tcp_under_memory_pressure(sk))
                return true;

        rcvbuf = READ_ONCE(sk->sk_rcvbuf);

        /* rcvbuf - rcvbuf/8 is the 7/8 mark. */
        return atomic_read(&sk->sk_rmem_alloc) > rcvbuf - (rcvbuf >> 3);
}

static inline bool tcp_epollin_ready(const struct sock *sk, int target)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);

        if (avail <= 0)
                return false;

        return (avail >= target) || tcp_rmem_pressure(sk) ||
               (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
}

/* Prototype only; defined outside this part of the file. */
extern void tcp_openreq_init_rwin(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  const struct dst_entry *dst);

/* Memory pressure enter/leave hooks; prototypes only. */
void tcp_enter_memory_pressure(struct sock *sk);
void tcp_leave_memory_pressure(struct sock *sk);

static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int val;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl()
         * and do_tcp_setsockopt().
         */
        val = READ_ONCE(tp->keepalive_intvl);

        return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl);
}

static inline int keepalive_time_when(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int val;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */
        val = READ_ONCE(tp->keepalive_time);

        return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
}

static inline int keepalive_probes(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int val;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt()
         * and do_tcp_setsockopt().
         */
        val = READ_ONCE(tp->keepalive_probes);

        return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes);
}

/* Jiffies elapsed since the more recent of icsk_ack.lrcvtime and
 * rcv_tstamp, i.e. the smaller of the two ages.
 */
static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
{
        u32 since_lrcvtime = tcp_jiffies32 - tp->inet_conn.icsk_ack.lrcvtime;
        u32 since_rcv_tstamp = tcp_jiffies32 - tp->rcv_tstamp;

        return min_t(u32, since_lrcvtime, since_rcv_tstamp);
}

/* FIN timeout for @sk: linger2 when set, otherwise sysctl_tcp_fin_timeout,
 * but never less than 3.5 * RTO (4*rto - rto/2).
 */
static inline int tcp_fin_time(const struct sock *sk)
{
        const int rto = inet_csk(sk)->icsk_rto;
        const int min_timeout = (rto << 2) - (rto >> 1);
        int fin_timeout = tcp_sk(sk)->linger2;

        if (!fin_timeout)
                fin_timeout = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout);

        return fin_timeout < min_timeout ? min_timeout : fin_timeout;
}

/* PAWS acceptance test. Returns true (segment acceptable) when any of the
 * following holds, checked in this order:
 *  1. ts_recent is at most @paws_win ahead of the received tsval
 *     (signed 32-bit difference);
 *  2. ts_recent_stamp is at least TCP_PAWS_24DAYS old;
 *  3. ts_recent is 0. Some OSes send SYN and SYNACK messages with
 *     tsval=0 tsecr=0 and only later segments carry valid values, so 0 is
 *     ignored, or else a 'negative' tsval might forbid accepting their
 *     packets.
 */
static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
                                  int paws_win)
{
        return (s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win ||
               unlikely(!time_before32(ktime_get_seconds(),
                                       rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) ||
               !rx_opt->ts_recent;
}

/* Returns true when the segment described by @rx_opt must be rejected by
 * PAWS. @rst is non-zero for RST segments, which get a relaxed time bound.
 */
static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
                                   int rst)
{
        if (tcp_paws_check(rx_opt, 0))
                return false;

        /* RST segments are not recommended to carry timestamp,
         * and, if they do, it is recommended to ignore PAWS because
         * "their cleanup function should take precedence over timestamps."
         * Certainly, it is a mistake. It is necessary to understand the
         * reasons for this constraint in order to relax it: if the peer
         * reboots, its clock may go out-of-sync and half-open connections
         * will not be reset.
         * Actually, the problem would not exist if all implementations
         * followed the draft about maintaining the clock across reboots.
         * Linux-2.2 DOES NOT!
         *
         * However, we can relax time bounds for RST segments to MSL:
         * a RST is rejected only while ts_recent_stamp is younger than
         * TCP_PAWS_MSL.
         */
        return !rst || time_before32(ktime_get_seconds(),
                                     rx_opt->ts_recent_stamp + TCP_PAWS_MSL);
}

/* Prototype only; defined outside this part of the file. */
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
                          int mib_idx, u32 *last_oow_ack_time);

/* Seed the constant TCP MIB entries of @net: RTO algorithm, minimum and
 * maximum RTO, and the connection limit.
 */
static inline void tcp_mib_init(struct net *net)
{
        /* See RFC 2012 */
        /* NOTE(review): 1 and -1 presumably are the RFC's "other" algorithm
         * and "dynamic limit" values - confirm against the RFC.
         */
        TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1);
        /* RTO bounds are converted from jiffies to milliseconds. */
        TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
        TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
        TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1);
}

/* from STCP */
/* Drop only the cached lost_skb_hint; retransmit_skb_hint is left alone. */
static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
{
        tp->lost_skb_hint = NULL;
}

/* Drop every cached retransmit hint: retransmit_skb_hint plus whatever
 * tcp_clear_retrans_hints_partial() clears.
 */
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
        tp->retransmit_skb_hint = NULL;
        tcp_clear_retrans_hints_partial(tp);
}

/* Address an MD5 key is bound to: IPv4 always, IPv6 only when CONFIG_IPV6
 * is enabled. Which member is valid is tracked outside the union (see the
 * family field of struct tcp_md5sig_key).
 */
union tcp_md5_addr {
        struct in_addr  a4;
#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr        a6;
#endif
};

/* - key database
 * One TCP MD5 signature key. Entries are linked through @node and carry an
 * rcu_head, so they are presumably freed via RCU - confirm in the .c file.
 */
struct tcp_md5sig_key {
        struct hlist_node        node;
        u8                        keylen;
        u8                        family; /* AF_INET or AF_INET6 */
        u8                        prefixlen;
        union tcp_md5_addr        addr;
        int                        l3index; /* set if key added with L3 scope */
        u8                        key[TCP_MD5SIG_MAXKEYLEN];
        struct rcu_head                rcu;
};

/* - sock block
 * Per-socket container: @head is the list head for key entries (struct
 * tcp_md5sig_key has a matching hlist_node).
 */
struct tcp_md5sig_info {
        struct hlist_head        head;
        struct rcu_head                rcu;
};

/* - pseudo header
 * IPv4 pseudo header layout; multi-byte fields are big-endian typed.
 */
struct tcp4_pseudohdr {
        __be32                saddr;
        __be32                daddr;
        __u8                pad;
        __u8                protocol;
        __be16                len;
};

/* IPv6 pseudo header layout. Unlike the IPv4 variant, len comes before
 * protocol and both are 32 bits wide.
 */
struct tcp6_pseudohdr {
        struct in6_addr        saddr;
        struct in6_addr daddr;
        __be32                len;
        __be32                protocol;        /* including padding */
};

/* Storage large enough for either pseudo header; the IPv6 member exists
 * only when CONFIG_IPV6 is enabled.
 */
union tcp_md5sum_block {
        struct tcp4_pseudohdr ip4;
#if IS_ENABLED(CONFIG_IPV6)
        struct tcp6_pseudohdr ip6;
#endif
};

/* - pool: digest algorithm, hash description and scratch buffer
 * @md5_req is the hash request, @scratch an untyped work buffer.
 */
struct tcp_md5sig_pool {
        struct ahash_request        *md5_req;
        void                        *scratch;
};

/* - functions
 * MD5 key management and hashing entry points; prototypes only.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk, const struct sk_buff *skb);
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, int l3index,
                   const u8 *newkey, u8 newkeylen, gfp_t gfp);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, int l3index);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk);

#ifdef CONFIG_TCP_MD5SIG
#include <linux/jump_label.h>
/* Static key consulted by tcp_md5_do_lookup() before doing a real lookup. */
extern struct static_key_false tcp_md5_needed;
/* Out-of-line lookup; prototype only. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
                                           const union tcp_md5_addr *addr,
                                           int family);
/* Look up the MD5 key matching (@l3index, @addr, @family) on @sk.
 * While the tcp_md5_needed static key is off, the lookup is skipped and
 * NULL is returned right away.
 */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
                  const union tcp_md5_addr *addr, int family)
{
        if (static_branch_unlikely(&tcp_md5_needed))
                return __tcp_md5_do_lookup(sk, l3index, addr, family);

        return NULL;
}

/* MD5 key stored in a timewait socket (CONFIG_TCP_MD5SIG variant). */
#define tcp_twsk_md5_key(twsk)        ((twsk)->tw_md5_key)
#else
/* Stub for builds without CONFIG_TCP_MD5SIG: every lookup yields NULL. */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
                  const union tcp_md5_addr *addr, int family)
{
        return NULL;
}
/* Without CONFIG_TCP_MD5SIG the timewait key accessor is a constant NULL. */
#define tcp_twsk_md5_key(twsk)        NULL
#endif

/* MD5 pool allocation and acquisition; prototypes only. */
bool tcp_alloc_md5sig_pool(void);

struct tcp_md5sig_pool *tcp_get_md5sig_pool(void);
/* Release the MD5 pool by re-enabling bottom halves.
 * NOTE(review): presumably pairs with tcp_get_md5sig_pool() disabling them -
 * confirm against its definition.
 */
static inline void tcp_put_md5sig_pool(void)
{
        local_bh_enable();
}

/* Helpers feeding skb data / a key into an MD5 pool; prototypes only. */
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
                          unsigned int header_len);
int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
                     const struct tcp_md5sig_key *key);

/* From tcp_fastopen.c */
/* Fast Open cache accessors: the getter takes pointers for @mss, the
 * setter takes it by value. Prototypes only.
 */
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie);
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost,
                            u16 try_exp);
/* State of one Fast Open request: cookie, the user data to send with it,
 * and how much of that data has been queued so far.
 */
struct tcp_fastopen_request {
        /* Fast Open cookie. Size 0 means a cookie request */
        struct tcp_fastopen_cookie        cookie;
        struct msghdr                        *data;  /* data in MSG_FASTOPEN */
        size_t                                size;
        int                                copied;        /* queued in tcp_connect() */
        struct ubuf_info                *uarg;
};
/* Fast Open request, cipher/key and server-side entry points.
 * Prototypes only; defined outside this part of the file.
 */
void tcp_free_fastopen_req(struct tcp_sock *tp);
void tcp_fastopen_destroy_cipher(struct sock *sk);
void tcp_fastopen_ctx_destroy(struct net *net);
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
                              void *primary_key, void *backup_key);
int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
                            u64 *key);
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              const struct dst_entry *dst);
void tcp_fastopen_init_key_once(struct net *net);
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
                             struct tcp_fastopen_cookie *cookie);
bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
/* Size of one Fast Open key (a siphash key). */
#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
/* Maximum number of keys held at once. */
#define TCP_FASTOPEN_KEY_MAX 2
/* Bytes needed to hold TCP_FASTOPEN_KEY_MAX keys back to back. */
#define TCP_FASTOPEN_KEY_BUF_LENGTH \
        (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)

/* Fastopen key context: up to TCP_FASTOPEN_KEY_MAX keys.
 * @num is what tcp_fastopen_context_len() reports; presumably the number of
 * valid entries in @key - confirm against the code that fills it.
 */
struct tcp_fastopen_context {
        siphash_key_t        key[TCP_FASTOPEN_KEY_MAX];
        int                num;
        struct rcu_head        rcu;
};

/* Active Fast Open disable / blackhole detection interface.
 * Declarations only; defined outside this part of the file.
 */
extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
void tcp_fastopen_active_disable(struct sock *sk);
bool tcp_fastopen_active_should_disable(struct sock *sk);
void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);

/* Fast Open key context for @sk: the one attached to the socket's
 * fastopenq if present, otherwise the per-netns tcp_fastopen_ctx
 * (which may itself be NULL).
 * Caller needs to wrap with rcu_read_(un)lock().
 */
static inline
struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
{
        struct tcp_fastopen_context *sk_ctx =
                rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);

        if (sk_ctx)
                return sk_ctx;

        return rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
}

/* Compare cookie @foc against the reference cookie @orig.
 * They match only when @orig has exactly TCP_FASTOPEN_COOKIE_SIZE bytes,
 * both lengths agree and the cookie bytes are identical.
 */
static inline
bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
                               const struct tcp_fastopen_cookie *orig)
{
        if (orig->len != TCP_FASTOPEN_COOKIE_SIZE)
                return false;
        if (orig->len != foc->len)
                return false;

        return memcmp(orig->val, foc->val, foc->len) == 0;
}

/* Returns the num field of @ctx. */
static inline
int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
{
        const int num = ctx->num;

        return num;
}

/* Latencies incurred by various limits for a sender. They are
 * chronograph-like stats that are mutually exclusive.
 * __TCP_CHRONO_MAX is a terminator, not a real chrono type.
 */
enum tcp_chrono {
        TCP_CHRONO_UNSPEC,
        TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
        TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
        TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
        __TCP_CHRONO_MAX,
};

/* Start/stop the chronograph of the given type on @sk; prototypes only. */
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);

/* This helper is needed because skb->tcp_tsorted_anchor uses
 * the same memory storage as skb->destructor/_skb_refdst.
 * It clears both of those fields.
 */
static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
{
        skb->_skb_refdst = 0UL;
        skb->destructor = NULL;
}

/* Temporarily zero skb->_skb_refdst and put it back afterwards.
 * The two macros must be used as a pair in the same scope:
 * tcp_skb_tsorted_save() opens a block and declares _save in it,
 * tcp_skb_tsorted_restore() writes _save back and closes that block.
 * The argument is parenthesized so expression arguments work; it is still
 * evaluated more than once, so it must not have side effects.
 */
#define tcp_skb_tsorted_save(skb) {                \
        unsigned long _save = (skb)->_skb_refdst;        \
        (skb)->_skb_refdst = 0UL;

#define tcp_skb_tsorted_restore(skb)                \
        (skb)->_skb_refdst = _save;                \
}

/* Prototype only; defined outside this part of the file. */
void tcp_write_queue_purge(struct sock *sk);

/* First skb of the retransmit rbtree of @sk, as given by skb_rb_first(). */
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
        struct sk_buff *first = skb_rb_first(&sk->tcp_rtx_queue);

        return first;
}

/* Last skb of the retransmit rbtree of @sk, as given by skb_rb_last(). */
static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
{
        struct sk_buff *last = skb_rb_last(&sk->tcp_rtx_queue);

        return last;
}

/* Peek at the first skb of sk_write_queue without unlinking it. */
static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
{
        struct sk_buff *head = skb_peek(&sk->sk_write_queue);

        return head;
}

/* Peek at the last skb of sk_write_queue without unlinking it. */
static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
{
        struct sk_buff *tail = skb_peek_tail(&sk->sk_write_queue);

        return tail;
}

/* Walk sk_write_queue of @sk starting at @skb, using @tmp as the lookahead
 * cursor of skb_queue_walk_from_safe().
 */
#define tcp_for_write_queue_from_safe(skb, tmp, sk)                        \
        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)

/* Next skb to send: the head of sk_write_queue
 * (same lookup as tcp_write_queue_head()).
 */
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
        struct sk_buff *next_to_send = skb_peek(&sk->sk_write_queue);

        return next_to_send;
}

/* True when @skb is the last entry of the write queue of @sk. */
static inline bool tcp_skb_is_last(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_head *write_queue = &sk->sk_write_queue;

        return skb_queue_is_last(write_queue, skb);
}

/**
 * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
 * @sk: socket
 *
 * Since the write queue can have a temporary empty skb in it,
 * we must not use "return skb_queue_empty(&sk->sk_write_queue)".
 * Sequence numbers are compared instead.
 *
 * Return: true when snd_nxt has caught up with write_seq.
 */
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return tp->snd_nxt == tp->write_seq;
}

/* True when the retransmit rbtree of @sk has no nodes. */
static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
        const struct rb_root *rtx_root = &sk->tcp_rtx_queue;

        return RB_EMPTY_ROOT(rtx_root);
}

static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
{
        return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
}

static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
        __skb_queue_tail(&sk->sk_write_queue, skb);

        /* Queue it, remembering where we must start sending. */
        if (sk->sk_write_queue.next == skb)
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
}

/* Insert new before skb on the write queue of sk.
 * Note the argument order: @new comes first here but last in the underlying
 * __skb_queue_before() call.
 */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
{
        __skb_queue_before(&sk->sk_write_queue, skb, new);
}

/* Remove @skb from the write queue of @sk, after clearing the
 * destructor/_skb_refdst fields via tcp_skb_tsorted_anchor_cleanup().
 */
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
        tcp_skb_tsorted_anchor_cleanup(skb);
        __skb_unlink(skb, &sk->sk_write_queue);
}

/* Prototype only; defined outside this part of the file. */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);

/* Erase @skb from the retransmit rbtree of @sk, after clearing the
 * destructor/_skb_refdst fields via tcp_skb_tsorted_anchor_cleanup().
 * The skb itself is not freed here.
 */
static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
        tcp_skb_tsorted_anchor_cleanup(skb);
        rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}

/* Fully retire @skb from the retransmit queue: take it off the
 * tcp_tsorted_anchor list, erase it from the rbtree and free it.
 */
static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
{
        /* Must come first: tcp_rtx_queue_unlink() clears the fields that
         * share storage with tcp_tsorted_anchor.
         */
        list_del(&skb->tcp_tsorted_anchor);
        tcp_rtx_queue_unlink(skb, sk);
        sk_wmem_free_skb(sk, skb);
}

static inline void tcp_push_pending_frames(struct sock *sk)
{
        if (tcp_send_head(sk)) {
                struct tcp_sock *tp = tcp_sk(sk);

                __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
        }
}

/* Start sequence of the skb just after the highest skb with SACKed
 * bit, valid only if sacked_out > 0 or when the caller has ensured
 * validity by itself.
 */
static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
{
        if (!tp->sacked_out)
                return tp->snd_una;

        if (tp->highest_sack == NULL)
                return tp->snd_nxt;

        return TCP_SKB_CB(tp->highest_sack)->seq;
}

static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
        tcp_sk(sk)->highest_sack = skb_rb_next(skb);
}

static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
{
        return tcp_sk(sk)->highest_sack;
}

static inline void tcp_highest_sack_reset(struct sock *sk)
{
        tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk);
}

/* Called when old skb is about to be deleted and replaced by new skb */
static inline void tcp_highest_sack_replace(struct sock *sk,
                                            struct sk_buff *old,
                                            struct sk_buff *new)
{
        if (old == tcp_highest_sack(sk))
                tcp_sk(sk)->highest_sack = new;
}

/* This helper checks if socket has IP_TRANSPARENT set.
 * The flag lives in a different structure depending on the socket state:
 * timewait sockets and request sockets (TCP_NEW_SYN_RECV) are not full
 * sockets and keep their own copy.
 */
static inline bool inet_sk_transparent(const struct sock *sk)
{
        const int state = sk->sk_state;

        if (state == TCP_TIME_WAIT)
                return inet_twsk(sk)->tw_transparent;
        if (state == TCP_NEW_SYN_RECV)
                return inet_rsk(inet_reqsk(sk))->no_srccheck;

        return inet_sk(sk)->transparent;
}

/* Determines whether this is a thin stream (which may suffer from
 * increased latency). Used to trigger latency-reducing mechanisms.
 */
static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
{
        return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
}

/* /proc */
/* Iteration phase stored in struct tcp_iter_state.state. */
enum tcp_seq_states {
        TCP_SEQ_STATE_LISTENING,
        TCP_SEQ_STATE_ESTABLISHED,
};

/* seq_file start/next/stop callbacks for the TCP /proc iterator;
 * prototypes only.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void tcp_seq_stop(struct seq_file *seq, void *v);

/* Per address family parameters of the /proc iterator; only the family. */
struct tcp_seq_afinfo {
        sa_family_t                        family;
};

/* Cursor of the TCP /proc (seq_file) iterator.
 * @p must stay first if callers cast the seq private data to
 * struct seq_net_private - TODO confirm.
 */
struct tcp_iter_state {
        struct seq_net_private        p;
        enum tcp_seq_states        state;
        struct sock                *syn_wait_sk;
        struct tcp_seq_afinfo        *bpf_seq_afinfo;
        int                        bucket, offset, sbucket, num;
        loff_t                        last_pos;
};

/* Request sock operation tables for IPv4 and IPv6; defined elsewhere. */
extern struct request_sock_ops tcp_request_sock_ops;
extern struct request_sock_ops tcp6_request_sock_ops;

void tcp_v4_destroy_sock(struct sock *sk);

/* GSO/GRO entry points. The per-family variants are declared through
 * INDIRECT_CALLABLE_DECLARE(). Prototypes only.
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
                                netdev_features_t features);
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
int tcp_gro_complete(struct sk_buff *skb);

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);

static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        u32 val;

        val = READ_ONCE(tp->notsent_lowat);

        return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
}

/* Tell whether more data may be queued on @sk with respect to the
 * not-sent low-water mark.
 * @wake is one when sk_stream_write_space() calls us; the not-sent byte
 * count is then doubled before the comparison, so EPOLLOUT is only sent
 * once notsent_bytes is below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 unsent = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);

        unsent <<= wake;

        return unsent < tcp_notsent_lowat(tp);
}

/* /proc registration hooks exist only with CONFIG_PROC_FS. */
#ifdef CONFIG_PROC_FS
int tcp4_proc_init(void);
void tcp4_proc_exit(void);
#endif

/* Connection request handling; prototypes only. */
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req);
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb);

/* TCP af-specific functions.
 * All members are MD5 hooks, so the structure is empty without
 * CONFIG_TCP_MD5SIG.
 */
struct tcp_sock_af_ops {
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key        *(*md5_lookup) (const struct sock *sk,
                                                const struct sock *addr_sk);
        int                (*calc_md5_hash)(char *location,
                                         const struct tcp_md5sig_key *md5,
                                         const struct sock *sk,
                                         const struct sk_buff *skb);
        int                (*md5_parse)(struct sock *sk,
                                     int optname,
                                     sockptr_t optval,
                                     int optlen);
#endif
};

/* Address family specific hooks used while handling a connection request.
 * The MD5 members exist only with CONFIG_TCP_MD5SIG and cookie_init_seq only
 * with CONFIG_SYN_COOKIES; cookie_init_sequence() is the wrapper that calls
 * the latter.
 */
struct tcp_request_sock_ops {
        u16 mss_clamp;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
                                                 const struct sock *addr_sk);
        int                (*calc_md5_hash) (char *location,
                                          const struct tcp_md5sig_key *md5,
                                          const struct sock *sk,
                                          const struct sk_buff *skb);
#endif
        void (*init_req)(struct request_sock *req,
                         const struct sock *sk_listener,
                         struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES
        __u32 (*cookie_init_seq)(const struct sk_buff *skb,
                                 __u16 *mss);
#endif
        struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
                                       const struct request_sock *req);
        u32 (*init_seq)(const struct sk_buff *skb);
        u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
        int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
                           struct flowi *fl, struct request_sock *req,
                           struct tcp_fastopen_cookie *foc,
                           enum tcp_synack_type synack_type,
                           struct sk_buff *syn_skb);
};

/* Per-family tcp_request_sock_ops instances; the IPv6 one is declared only
 * when CONFIG_IPV6 is enabled.
 */
extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
#if IS_ENABLED(CONFIG_IPV6)
extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
#endif

#ifdef CONFIG_SYN_COOKIES
/* Produce the initial sequence number for a SYN cookie reply.
 * Records the SYN queue overflow on @sk, bumps LINUX_MIB_SYNCOOKIESSENT and
 * then delegates to the family specific cookie_init_seq() hook of @ops.
 */
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
{
        __u32 cookie_seq;

        tcp_synq_overflow(sk);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);

        cookie_seq = ops->cookie_init_seq(skb, mss);

        return cookie_seq;
}
#else
/* Stub for builds without CONFIG_SYN_COOKIES: always returns 0 and touches
 * none of its arguments.
 */
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
{
        return 0;
}
#endif

/* Initialisation entry points; prototypes only. */
int tcpv4_offload_init(void);

void tcp_v4_init(void);
void tcp_init(void);

/* tcp_recovery.c */
/* Loss marking and RACK interface; prototypes only. */
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
                                u32 reo_wnd);
extern bool tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
                             u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);

/* At how many usecs into the future should the RTO fire?
 * Computed as (timestamp of the rtx queue head + RTO) - tcp_mstamp; the
 * result is signed, so it can be negative.
 * An empty rtx queue is unexpected here: it triggers a one-time warning
 * with a dump of the socket counters, and the full RTO is returned.
 */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
        const struct sk_buff *skb = tcp_rtx_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us;

        if (unlikely(!skb)) {
                WARN_ONCE(1,
                        "rtx queue emtpy: "
                        "out:%u sacked:%u lost:%u retrans:%u "
                        "tlp_high_seq:%u sk_state:%u ca_state:%u "
                        "advmss:%u mss_cache:%u pmtu:%u\n",
                        tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out,
                        tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out,
                        tcp_sk(sk)->tlp_high_seq, sk->sk_state,
                        inet_csk(sk)->icsk_ca_state,
                        tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache,
                        inet_csk(sk)->icsk_pmtu_cookie);
                return jiffies_to_usecs(rto);
        }

        rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);

        return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
}

/*
 * Save and compile IPv4 options, return a pointer to it.
 *
 * Returns NULL when @skb carries no options, when the atomic allocation
 * fails, or when __ip_options_echo() reports an error (the buffer is freed
 * in that case). On success the caller owns the returned allocation.
 */
static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
                                                         struct sk_buff *skb)
{
        const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
        struct ip_options_rcu *dopt;
        int opt_size;

        if (!opt->optlen)
                return NULL;

        opt_size = sizeof(*dopt) + opt->optlen;
        dopt = kmalloc(opt_size, GFP_ATOMIC);
        if (!dopt)
                return NULL;

        if (__ip_options_echo(net, &dopt->opt, skb, opt)) {
                kfree(dopt);
                return NULL;
        }

        return dopt;
}

/* Locally generated TCP pure ACKs are tagged with skb->truesize == 2
 * (see tcp_send_ack() in net/ipv4/tcp_output.c).
 * Testing the tag is much faster than dissecting the packet to find out
 * (think of GRE encapsulations, IPv4, IPv6, ...).
 *
 * Returns true when @skb carries that tag.
 */
static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
{
        return skb->truesize == 2;
}

/* Tag @skb as a pure ACK by storing the marker value 2 in skb->truesize;
 * skb_is_tcp_pure_ack() tests for exactly this value.
 */
static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
{
        skb->truesize = 2;
}

/*
 * tcp_inq - size of the receive backlog, as sequence-number distance
 * @sk: TCP socket
 *
 * Returns 0 while the socket is in SYN_SENT or SYN_RECV.  Otherwise:
 *  - if urgent data is pending (urg_data set, SOCK_URGINLINE clear) and
 *    urg_seq lies in [copied_seq, rcv_nxt), the distance from copied_seq
 *    up to urg_seq;
 *  - else the distance from copied_seq to rcv_nxt, minus one when that
 *    distance is non-zero and SOCK_DONE is set (a FIN was received).
 */
static inline int tcp_inq(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int pending;

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                return 0;

        /* Stop at the urgent mark when it sits inside the unread window. */
        if (!sock_flag(sk, SOCK_URGINLINE) &&
            tp->urg_data &&
            !before(tp->urg_seq, tp->copied_seq) &&
            before(tp->urg_seq, tp->rcv_nxt))
                return tp->urg_seq - tp->copied_seq;

        pending = tp->rcv_nxt - tp->copied_seq;

        /* Subtract 1, if FIN was received */
        if (pending && sock_flag(sk, SOCK_DONE))
                pending--;

        return pending;
}

int tcp_peek_len(struct socket *sock);

/*
 * Account an incoming skb in the per-socket segment counters.
 * The skb counts as max(1, gso_segs) segments; that amount is added to
 * tp->segs_in and, when skb->len is larger than the TCP header length,
 * to tp->data_segs_in as well.
 */
static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
{
        u16 nsegs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        tp->segs_in += nsegs;

        if (skb->len > tcp_hdrlen(skb))
                tp->data_segs_in += nsegs;
}

/*
 * The TCP listen path runs lockless, so "struct sock" is handed around
 * const qualified to make sure none of its fields is modified by mistake.
 * sk_drops is an atomic_t, so it is safe to cast the const away here just
 * to bump the drop counter.  The LINUX_MIB_LISTENDROPS statistic of the
 * socket's network namespace is incremented as well.
 */
static inline void tcp_listendrop(const struct sock *sk)
{
        struct sock *writable_sk = (struct sock *)sk;

        atomic_inc(&writable_sk->sk_drops);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
}

enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);

/*
 * Interface for adding Upper Level Protocols over TCP
 */

#define TCP_ULP_NAME_MAX        16
#define TCP_ULP_MAX                128
#define TCP_ULP_BUF_MAX                (TCP_ULP_NAME_MAX*TCP_ULP_MAX)

/*
 * Operations table describing one TCP upper level protocol (ULP).
 * Instances are handed to tcp_register_ulp() / tcp_unregister_ulp().
 */
struct tcp_ulp_ops {
        /* list linkage; presumably the registry of known ULPs - confirm in tcp_ulp.c */
        struct list_head        list;

        /* initialize ulp */
        int (*init)(struct sock *sk);
        /* update ulp; takes the same arguments as tcp_update_ulp() */
        void (*update)(struct sock *sk, struct proto *p,
                       void (*write_space)(struct sock *sk));
        /* cleanup ulp */
        void (*release)(struct sock *sk);
        /* diagnostic: emit ULP info into @skb, and report its size */
        int (*get_info)(struct sock *sk, struct sk_buff *skb);
        size_t (*get_info_size)(const struct sock *sk);
        /* clone ulp */
        void (*clone)(const struct request_sock *req, struct sock *newsk,
                      const gfp_t priority);

        /* ULP name; the buffer holds TCP_ULP_NAME_MAX bytes */
        char                name[TCP_ULP_NAME_MAX];
        /* module providing these ops */
        struct module        *owner;
};
int tcp_register_ulp(struct tcp_ulp_ops *type);
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
void tcp_update_ulp(struct sock *sk, struct proto *p,
                    void (*write_space)(struct sock *sk));

#define MODULE_ALIAS_TCP_ULP(name)                                \
        __MODULE_INFO(alias, alias_userspace, name);                \
        __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)

#ifdef CONFIG_NET_SOCK_MSG
struct sk_msg;
struct sk_psock;

#ifdef CONFIG_BPF_SYSCALL
struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
#endif /* CONFIG_BPF_SYSCALL */

int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
                          int flags);
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
                      struct msghdr *msg, int len, int flags);
#endif /* CONFIG_NET_SOCK_MSG */

#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
/* No-op variant, compiled when CONFIG_BPF_SYSCALL or CONFIG_NET_SOCK_MSG is not set. */
static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
{
}
#endif

#ifdef CONFIG_CGROUP_BPF
/*
 * Attach @skb to the sock_ops context and record the end of the data the
 * context may look at: skb->data + @end_offset.
 */
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
{
        skops->skb = skb;
        skops->skb_data_end = skb->data + end_offset;
}
#else
/* Without CONFIG_CGROUP_BPF this is a no-op. */
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
{
}
#endif

/* Call BPF_SOCK_OPS program that returns an int. If the return value
 * is < 0, then the BPF op failed (for example if the loaded BPF
 * program does not support the chosen operation or there is no BPF
 * program loaded).
 */
#ifdef CONFIG_BPF
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
        struct bpf_sock_ops_kern sock_ops;

        /* Only the part of sock_ops in front of ->temp is cleared. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        if (sk_fullsock(sk)) {
                sock_ops.is_fullsock = 1;
                sock_owned_by_me(sk);
        }

        sock_ops.sk = sk;
        sock_ops.op = op;
        if (nargs > 0)
                memcpy(sock_ops.args, args, nargs * sizeof(*args));

        /* Any non-zero result of the program run is reported as -1. */
        if (BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) != 0)
                return -1;

        return sock_ops.reply;
}

/* Convenience wrapper: run tcp_call_bpf() for @op with two u32 arguments. */
static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
        u32 argv[2];

        argv[0] = arg1;
        argv[1] = arg2;

        return tcp_call_bpf(sk, op, 2, argv);
}

/* Convenience wrapper: run tcp_call_bpf() for @op with three u32 arguments. */
static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
                                    u32 arg3)
{
        u32 argv[3];

        argv[0] = arg1;
        argv[1] = arg2;
        argv[2] = arg3;

        return tcp_call_bpf(sk, op, 3, argv);
}

#else
/* Variant for kernels built without CONFIG_BPF: always returns -EPERM. */
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
        return -EPERM;
}

/* Variant for kernels built without CONFIG_BPF: always returns -EPERM. */
static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
        return -EPERM;
}

/* Variant for kernels built without CONFIG_BPF: always returns -EPERM. */
static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
                                    u32 arg3)
{
        return -EPERM;
}

#endif

/*
 * Initial timeout for @sk: the value returned for the
 * BPF_SOCK_OPS_TIMEOUT_INIT op when it is positive, TCP_TIMEOUT_INIT
 * otherwise.
 */
static inline u32 tcp_timeout_init(struct sock *sk)
{
        int bpf_timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);

        if (bpf_timeout > 0)
                return bpf_timeout;

        return TCP_TIMEOUT_INIT;
}

/*
 * Initial receive window requested through the BPF_SOCK_OPS_RWND_INIT op.
 * A negative result from tcp_call_bpf() is mapped to 0.
 */
static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
{
        int bpf_rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);

        return bpf_rwnd < 0 ? 0 : bpf_rwnd;
}

/* True only when the BPF_SOCK_OPS_NEEDS_ECN op returns exactly 1. */
static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
        int verdict = tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL);

        return verdict == 1;
}

/*
 * Run the BPF_SOCK_OPS_RTT_CB op, but only for sockets that have
 * BPF_SOCK_OPS_RTT_CB_FLAG set.  The op's return value is ignored.
 */
static inline void tcp_bpf_rtt(struct sock *sk)
{
        if (!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
                return;

        tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
}

#if IS_ENABLED(CONFIG_SMC)
extern struct static_key_false tcp_have_smc;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
void clean_acked_data_enable(struct inet_connection_sock *icsk,
                             void (*cad)(struct sock *sk, u32 ack_seq));
void clean_acked_data_disable(struct inet_connection_sock *icsk);
void clean_acked_data_flush(void);
#endif

DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
static inline void tcp_add_tx_delay(struct sk_buff *skb,
                                    const struct tcp_sock *tp)
{
        if (static_branch_unlikely(&tcp_tx_delay_enabled))
                skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
}

/* Compute Earliest Departure Time for some control packets
 * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
 */
static inline u64 tcp_transmit_time(const struct sock *sk)
{
        if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
                u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
                        tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;

                return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
        }
        return 0;
}

#endif        /* _TCP_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * acpi.h - ACPI Interface
 *
 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 */

#ifndef _LINUX_ACPI_H
#define _LINUX_ACPI_H

#include <linux/errno.h>
#include <linux/ioport.h>        /* for struct resource */
#include <linux/irqdomain.h>
#include <linux/resource_ext.h>
#include <linux/device.h>
#include <linux/property.h>
#include <linux/uuid.h>

#ifndef _LINUX
#define _LINUX
#endif
#include <acpi/acpi.h>

#ifdef        CONFIG_ACPI

#include <linux/list.h>
#include <linux/mod_devicetable.h>
#include <linux/dynamic_debug.h>
#include <linux/module.h>
#include <linux/mutex.h>

#include <acpi/acpi_bus.h>
#include <acpi/acpi_drivers.h>
#include <acpi/acpi_numa.h>
#include <acpi/acpi_io.h>
#include <asm/acpi.h>

/* Return the ACPI handle stored in @adev, or NULL when @adev is NULL. */
static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
{
        if (!adev)
                return NULL;

        return adev->handle;
}

/*
 * Helpers for moving between a struct device (or a bare fwnode), its ACPI
 * companion device and the companion's ACPI handle.
 *
 * NOTE: ACPI_COMPANION_SET() expands @adev twice, so pass a plain
 * variable rather than a function call or an expression with side effects.
 */
#define ACPI_COMPANION(dev)                to_acpi_device_node((dev)->fwnode)
#define ACPI_COMPANION_SET(dev, adev)        set_primary_fwnode(dev, (adev) ? \
        acpi_fwnode_handle(adev) : NULL)
#define ACPI_HANDLE(dev)                acpi_device_handle(ACPI_COMPANION(dev))
#define ACPI_HANDLE_FWNODE(fwnode)        \
                                acpi_device_handle(to_acpi_device_node(fwnode))

/*
 * Allocate a zeroed fwnode_handle (GFP_KERNEL) whose ops point at
 * acpi_static_fwnode_ops.  Returns NULL if the allocation fails.
 */
static inline struct fwnode_handle *acpi_alloc_fwnode_static(void)
{
        struct fwnode_handle *node = kzalloc(sizeof(*node), GFP_KERNEL);

        if (node)
                node->ops = &acpi_static_fwnode_ops;

        return node;
}

/*
 * Free @fwnode with kfree().  If @fwnode does not pass
 * is_acpi_static_node(), a warning is raised and nothing is freed.
 */
static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode)
{
        if (!WARN_ON(!is_acpi_static_node(fwnode)))
                kfree(fwnode);
}

/**
 * ACPI_DEVICE_CLASS - macro used to describe an ACPI device with
 * the PCI-defined class-code information
 *
 * @_cls : the class, subclass, prog-if triple for this device
 * @_msk : the class mask for this device
 *
 * This macro is used to create a struct acpi_device_id that matches a
 * specific PCI class. The .id and .driver_data fields will be left
 * initialized with the default value.
 */
#define ACPI_DEVICE_CLASS(_cls, _msk)        .cls = (_cls), .cls_msk = (_msk),

/* True when @dev's fwnode is an ACPI device node (per is_acpi_device_node()). */
static inline bool has_acpi_companion(struct device *dev)
{
        return is_acpi_device_node(dev->fwnode);
}

/*
 * acpi_preset_companion - look up a child ACPI device and make it the
 *                         ACPI companion of @dev
 * @dev: device whose primary fwnode is set
 * @parent: ACPI device whose children are searched
 * @addr: address handed to acpi_find_child_device()
 *
 * The lookup result is kept in a local first: ACPI_COMPANION_SET()
 * expands its second argument twice, so passing the call expression
 * directly would run acpi_find_child_device() a second time whenever the
 * first lookup succeeded.  When no child is found, the primary fwnode is
 * set to NULL.
 */
static inline void acpi_preset_companion(struct device *dev,
                                         struct acpi_device *parent, u64 addr)
{
        struct acpi_device *adev = acpi_find_child_device(parent, addr, false);

        ACPI_COMPANION_SET(dev, adev);
}

/* Name of the struct device embedded in @adev, as returned by dev_name(). */
static inline const char *acpi_dev_name(struct acpi_device *adev)
{
        return dev_name(&adev->dev);
}

struct device *acpi_get_first_physical_node(struct acpi_device *adev);

/*
 * Interrupt model identifiers; this is the type of the acpi_irq_model
 * variable and of the @model argument of acpi_set_irq_model().
 */
enum acpi_irq_model_id {
        ACPI_IRQ_MODEL_PIC = 0,
        ACPI_IRQ_MODEL_IOAPIC,
        ACPI_IRQ_MODEL_IOSAPIC,
        ACPI_IRQ_MODEL_PLATFORM,
        ACPI_IRQ_MODEL_GIC,
        ACPI_IRQ_MODEL_COUNT        /* number of models above; keep last */
};

extern enum acpi_irq_model_id        acpi_irq_model;

/*
 * Interrupt identifiers.  Numbering starts at 1, so ACPI_INTERRUPT_COUNT
 * is one past the last valid id (4), not the number of ids.
 */
enum acpi_interrupt_id {
        ACPI_INTERRUPT_PMI        = 1,
        ACPI_INTERRUPT_INIT,
        ACPI_INTERRUPT_CPEI,
        ACPI_INTERRUPT_COUNT
};

#define        ACPI_SPACE_MEM                0

/*
 * Address range type identifiers.  Numbering starts at 1, so
 * ACPI_ADDRESS_RANGE_COUNT is one past the last valid id (5).
 */
enum acpi_address_range_id {
        ACPI_ADDRESS_RANGE_MEMORY = 1,
        ACPI_ADDRESS_RANGE_RESERVED = 2,
        ACPI_ADDRESS_RANGE_ACPI = 3,
        ACPI_ADDRESS_RANGE_NVS        = 4,
        ACPI_ADDRESS_RANGE_COUNT
};


/* Table Handlers */
/*
 * Overlay of the subtable header layouts; acpi_tbl_entry_handler callbacks
 * receive a pointer to this union.
 */
union acpi_subtable_headers {
        struct acpi_subtable_header common;        /* generic subtable header */
        struct acpi_hmat_structure hmat;        /* HMAT structure header */
};

typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);

typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
                                      const unsigned long end);

/* Debugger support */

/*
 * Callbacks supplied by a debugger implementation via
 * acpi_register_debugger().  The acpi_debugger_*() functions declared
 * further down have matching names; presumably they dispatch to these
 * callbacks - confirm in the debugger core.
 */
struct acpi_debugger_ops {
        int (*create_thread)(acpi_osd_exec_callback function, void *context);
        ssize_t (*write_log)(const char *msg);
        ssize_t (*read_cmd)(char *buffer, size_t length);
        int (*wait_command_ready)(bool single_step, char *buffer, size_t length);
        int (*notify_command_complete)(void);
};

/*
 * State for a debugger: its ops table, the module that provides it, and a
 * mutex (presumably guarding ops/owner updates - confirm against users).
 */
struct acpi_debugger {
        const struct acpi_debugger_ops *ops;
        struct module *owner;
        struct mutex lock;
};

#ifdef CONFIG_ACPI_DEBUGGER
int __init acpi_debugger_init(void);
int acpi_register_debugger(struct module *owner,
                           const struct acpi_debugger_ops *ops);
void acpi_unregister_debugger(const struct acpi_debugger_ops *ops);
int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context);
ssize_t acpi_debugger_write_log(const char *msg);
ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length);
int acpi_debugger_wait_command_ready(void);
int acpi_debugger_notify_command_complete(void);
#else
/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: always returns -ENODEV. */
static inline int acpi_debugger_init(void)
{
        return -ENODEV;
}

/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: registration always fails with -ENODEV. */
static inline int acpi_register_debugger(struct module *owner,
                                         const struct acpi_debugger_ops *ops)
{
        return -ENODEV;
}

/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: nothing to unregister. */
static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops)
{
}

/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: always returns -ENODEV. */
static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function,
                                              void *context)
{
        return -ENODEV;
}

/*
 * Stub for kernels built without CONFIG_ACPI_DEBUGGER: nothing is logged
 * and -ENODEV is returned.  The return type is ssize_t so that the stub
 * has the same signature as the prototype used when the debugger is
 * enabled.
 */
static inline ssize_t acpi_debugger_write_log(const char *msg)
{
        return -ENODEV;
}

/*
 * Stub for kernels built without CONFIG_ACPI_DEBUGGER: @buffer is left
 * untouched and -ENODEV is returned.  Return and length types (ssize_t,
 * size_t) match the prototype used when the debugger is enabled.
 */
static inline ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length)
{
        return -ENODEV;
}

/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: always returns -ENODEV. */
static inline int acpi_debugger_wait_command_ready(void)
{
        return -ENODEV;
}

/* Stub for kernels built without CONFIG_ACPI_DEBUGGER: always returns -ENODEV. */
static inline int acpi_debugger_notify_command_complete(void)
{
        return -ENODEV;
}
#endif

/*
 * BAD_MADT_ENTRY - sanity-check a typed subtable entry against the table end
 * @entry: pointer to the typed subtable entry
 * @end: address (unsigned long) just past the last byte the entry may occupy
 *
 * Evaluates to true when @entry is NULL, when a full sizeof(*entry)
 * structure starting at @entry would extend past @end, or when the length
 * stored in the entry's struct acpi_subtable_header is smaller than
 * sizeof(*entry).
 *
 * Both arguments are parenthesized in the expansion so that expression
 * arguments such as "base + 1" expand correctly.  @entry is still
 * evaluated more than once, so it must not have side effects.
 */
#define BAD_MADT_ENTRY(entry, end) (                                            \
                (!(entry)) || (unsigned long)(entry) + sizeof(*(entry)) > (end) ||  \
                ((struct acpi_subtable_header *)(entry))->length < sizeof(*(entry)))

/*
 * One element of the array passed to acpi_table_parse_entries_array().
 * Presumably @id selects the subtable type @handler is called for and
 * @count tallies the matching entries - confirm against the parser.
 */
struct acpi_subtable_proc {
        int id;
        acpi_tbl_entry_handler handler;
        int count;
};

void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
void __acpi_unmap_table(void __iomem *map, unsigned long size);
int early_acpi_boot_init(void);
int acpi_boot_init (void);
void acpi_boot_table_prepare (void);
void acpi_boot_table_init (void);
int acpi_mps_check (void);
int acpi_numa_init (void);

int acpi_locate_initial_tables (void);
void acpi_reserve_initial_tables (void);
void acpi_table_init_complete (void);
int acpi_table_init (void);
int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
int __init acpi_table_parse_entries(char *id, unsigned long table_size,
                              int entry_id,
                              acpi_tbl_entry_handler handler,
                              unsigned int max_entries);
int __init acpi_table_parse_entries_array(char *id, unsigned long table_size,
                              struct acpi_subtable_proc *proc, int proc_num,
                              unsigned int max_entries);
int acpi_table_parse_madt(enum acpi_madt_type id,
                          acpi_tbl_entry_handler handler,
                          unsigned int max_entries);
int acpi_parse_mcfg (struct acpi_table_header *header);
void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);

/* the following numa functions are architecture-dependent */
void acpi_numa_slit_init (struct acpi_table_slit *slit);

#if defined(CONFIG_X86) || defined(CONFIG_IA64)
void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
#else
static inline void
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { }
#endif

void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);

#ifdef CONFIG_ARM64
void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
#else
static inline void
acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
#endif

int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);

#ifndef PHYS_CPUID_INVALID
/*
 * Fallback used when nothing included earlier defined PHYS_CPUID_INVALID:
 * physical CPU ids are u32 and the all-ones value marks an invalid id.
 * The expansion is fully parenthesized so it stays a single operand in
 * any expression it is used in.
 */
typedef u32 phys_cpuid_t;
#define PHYS_CPUID_INVALID ((phys_cpuid_t)(-1))
#endif

/* A logical CPU id is invalid when it is negative once reinterpreted as int. */
static inline bool invalid_logical_cpuid(u32 cpuid)
{
        return (int)cpuid < 0;
}

/* A physical CPU id is invalid when it equals PHYS_CPUID_INVALID. */
static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
{
        return phys_id == PHYS_CPUID_INVALID;
}

/* Validate the processor object's proc_id */
bool acpi_duplicate_processor_id(int proc_id);
/* Processor _CST control */
struct acpi_processor_power;

#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
bool acpi_processor_claim_cst_control(void);
int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu,
                                struct acpi_processor_power *info);
#else
/* Stub for kernels built without CONFIG_ACPI_PROCESSOR_CSTATE: always returns false. */
static inline bool acpi_processor_claim_cst_control(void) { return false; }
/* Stub for kernels built without CONFIG_ACPI_PROCESSOR_CSTATE: always returns -ENODEV. */
static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu,
                                              struct acpi_processor_power *info)
{
        return -ENODEV;
}
#endif

#ifdef CONFIG_ACPI_HOTPLUG_CPU
/* Arch dependent functions for cpu hotplug support */
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
                 int *pcpu);
int acpi_unmap_cpu(int cpu);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */

#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr);
#endif

int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base);
int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base);
int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base);
void acpi_irq_stats_init(void);
extern u32 acpi_irq_handled;
extern u32 acpi_irq_not_handled;
extern unsigned int acpi_sci_irq;
extern bool acpi_no_s5;
#define INVALID_ACPI_IRQ        ((unsigned)-1)
/* True when acpi_sci_irq holds something other than INVALID_ACPI_IRQ. */
static inline bool acpi_sci_irq_valid(void)
{
        return acpi_sci_irq != INVALID_ACPI_IRQ;
}

extern int sbf_port;
extern unsigned long acpi_realmode_flags;

int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);

void acpi_set_irq_model(enum acpi_irq_model_id model,
                        struct fwnode_handle *fwnode);

struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
                                             unsigned int size,
                                             struct fwnode_handle *fwnode,
                                             const struct irq_domain_ops *ops,
                                             void *host_data);

#ifdef CONFIG_X86_IO_APIC
extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
#else
/*
 * Stub for kernels built without CONFIG_X86_IO_APIC: always returns -1
 * and leaves @trigger and @polarity untouched.
 */
static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
{
        return -1;
}
#endif
/*
 * This function undoes the effect of one call to acpi_register_gsi().
 * If this matches the last registration, any IRQ resources for gsi
 * are freed.
 */
void acpi_unregister_gsi (u32 gsi);

struct pci_dev;

int acpi_pci_irq_enable (struct pci_dev *dev);
void acpi_penalize_isa_irq(int irq, int active);
bool acpi_isa_irq_available(int irq);
#ifdef CONFIG_PCI
void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
#else
/* Stub for kernels built without CONFIG_PCI: does nothing. */
static inline void acpi_penalize_sci_irq(int irq, int trigger,
                                        int polarity)
{
}
#endif
void acpi_pci_irq_disable (struct pci_dev *dev);

extern int ec_read(u8 addr, u8 *val);
extern int ec_write(u8 addr, u8 val);
extern int ec_transaction(u8 command,
                          const u8 *wdata, unsigned wdata_len,
                          u8 *rdata, unsigned rdata_len);
extern acpi_handle ec_get_handle(void);

extern bool acpi_is_pnp_device(struct acpi_device *);

#if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)

typedef void (*wmi_notify_handler) (u32 value, void *context);

extern acpi_status wmi_evaluate_method(const char *guid, u8 instance,
                                        u32 method_id,
                                        const struct acpi_buffer *in,
                                        struct acpi_buffer *out);
extern acpi_status wmi_query_block(const char *guid, u8 instance,
                                        struct acpi_buffer *out);
extern acpi_status wmi_set_block(const char *guid, u8 instance,
                                        const struct acpi_buffer *in);
extern acpi_status wmi_install_notify_handler(const char *guid,
                                        wmi_notify_handler handler, void *data);
extern acpi_status wmi_remove_notify_handler(const char *guid);
extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out);
extern bool wmi_has_guid(const char *guid);
extern char *wmi_get_acpi_device_uid(const char *guid);

#endif        /* CONFIG_ACPI_WMI */

#define ACPI_VIDEO_OUTPUT_SWITCHING                        0x0001
#define ACPI_VIDEO_DEVICE_POSTING                        0x0002
#define ACPI_VIDEO_ROM_AVAILABLE                        0x0004
#define ACPI_VIDEO_BACKLIGHT                                0x0008
#define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR                0x0010
#define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO                0x0020
#define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR        0x0040
#define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO                0x0080
#define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR                        0x0100
#define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO                        0x0200
#define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR                0x0400
#define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO                0x0800

extern char acpi_video_backlight_string[];
extern long acpi_is_video_device(acpi_handle handle);
extern int acpi_blacklisted(void);
extern void acpi_osi_setup(char *str);
extern bool acpi_osi_is_win8(void);

#ifdef CONFIG_ACPI_NUMA
int acpi_map_pxm_to_node(int pxm);
int acpi_get_node(acpi_handle handle);

/**
 * pxm_to_online_node - Map proximity ID to online node
 * @pxm: ACPI proximity ID
 *
 * Like pxm_to_node(), except that the result is always an online node:
 * if the node mapped from @pxm is offline, the node distance table is
 * consulted and the nearest online node is returned instead.
 *
 * Meant for ACPI device drivers that run after the kernel's NUMA
 * initialization has completed and want their device NUMA topology from
 * the ACPI tables without having to deal with offline nodes.  A node may
 * be offline when its SRAT memory entry does not exist, or when NUMA is
 * disabled, e.g. "numa=off" on x86.
 */
static inline int pxm_to_online_node(int pxm)
{
        return numa_map_to_online_node(pxm_to_node(pxm));
}
#else
/* Stub for kernels built without CONFIG_ACPI_NUMA: every proximity ID maps to node 0. */
static inline int pxm_to_online_node(int pxm)
{
        return 0;
}
/* Stub for kernels built without CONFIG_ACPI_NUMA: always returns node 0. */
static inline int acpi_map_pxm_to_node(int pxm)
{
        return 0;
}
/* Stub for kernels built without CONFIG_ACPI_NUMA: always returns node 0. */
static inline int acpi_get_node(acpi_handle handle)
{
        return 0;
}
#endif
extern int acpi_paddr_to_node(u64 start_addr, u64 size);

extern int pnpacpi_disabled;

#define PXM_INVAL        (-1)

bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res);
bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res);
bool acpi_dev_resource_address_space(struct acpi_resource *ares,
                                     struct resource_win *win);
bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares,
                                         struct resource_win *win);
unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable);
unsigned int acpi_dev_get_irq_type(int triggering, int polarity);
bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
                                 struct resource *res);

void acpi_dev_free_resource_list(struct list_head *list);
int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list,
                           int (*preproc)(struct acpi_resource *, void *),
                           void *preproc_data);
int acpi_dev_get_dma_resources(struct acpi_device *adev,
                               struct list_head *list);
int acpi_dev_filter_resource_type(struct acpi_resource *ares,
                                  unsigned long types);

/*
 * Adapter with the (struct acpi_resource *, void *) callback shape: the
 * resource-type mask arrives in the opaque pointer argument, is cast back
 * to unsigned long and forwarded to acpi_dev_filter_resource_type().
 */
static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares,
                                                   void *arg)
{
        unsigned long types = (unsigned long)arg;

        return acpi_dev_filter_resource_type(ares, types);
}

struct acpi_device *acpi_resource_consumer(struct resource *res);

int acpi_check_resource_conflict(const struct resource *res);

int acpi_check_region(resource_size_t start, resource_size_t n,
                      const char *name);

acpi_status acpi_release_memory(acpi_handle handle, struct resource *res,
                                u32 level);

int acpi_resources_are_enforced(void);

#ifdef CONFIG_HIBERNATION
void __init acpi_no_s4_hw_signature(void);
#endif

#ifdef CONFIG_PM_SLEEP
void __init acpi_old_suspend_ordering(void);
void __init acpi_nvs_nosave(void);
void __init acpi_nvs_nosave_s3(void);
void __init acpi_sleep_no_blacklist(void);
#endif /* CONFIG_PM_SLEEP */

int acpi_register_wakeup_handler(
        int wake_irq, bool (*wakeup)(void *context), void *context);
void acpi_unregister_wakeup_handler(
        bool (*wakeup)(void *context), void *context);

/* Argument/result bundle passed to acpi_run_osc(). */
struct acpi_osc_context {
        char *uuid_str;                        /* UUID string */
        int rev;                        /* NOTE(review): presumably the _OSC revision - confirm */
        struct acpi_buffer cap;                /* list of DWORD capabilities */
        struct acpi_buffer ret;                /* result buffer; the caller frees it on success */
};

acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);

/* Indexes into _OSC Capabilities Buffer (DWORDs 2 & 3 are device-specific) */
#define OSC_QUERY_DWORD                                0        /* DWORD 1 */
#define OSC_SUPPORT_DWORD                        1        /* DWORD 2 */
#define OSC_CONTROL_DWORD                        2        /* DWORD 3 */

/* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */
#define OSC_QUERY_ENABLE                        0x00000001  /* input */
#define OSC_REQUEST_ERROR                        0x00000002  /* return */
#define OSC_INVALID_UUID_ERROR                        0x00000004  /* return */
#define OSC_INVALID_REVISION_ERROR                0x00000008  /* return */
#define OSC_CAPABILITIES_MASK_ERROR                0x00000010  /* return */

/* Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field */
#define OSC_SB_PAD_SUPPORT                        0x00000001
#define OSC_SB_PPC_OST_SUPPORT                        0x00000002
#define OSC_SB_PR3_SUPPORT                        0x00000004
#define OSC_SB_HOTPLUG_OST_SUPPORT                0x00000008
#define OSC_SB_APEI_SUPPORT                        0x00000010
#define OSC_SB_CPC_SUPPORT                        0x00000020
#define OSC_SB_CPCV2_SUPPORT                        0x00000040
#define OSC_SB_PCLPI_SUPPORT                        0x00000080
#define OSC_SB_OSLPI_SUPPORT                        0x00000100
#define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT                0x00001000
#define OSC_SB_GENERIC_INITIATOR_SUPPORT        0x00002000

extern bool osc_sb_apei_support_acked;
extern bool osc_pc_lpi_support_confirmed;

/*
 * PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field
 *
 * OSC_PCI_SUPPORT_MASKS is the union of all support bits defined below
 * (0x0000019f).
 */
#define OSC_PCI_EXT_CONFIG_SUPPORT                0x00000001
#define OSC_PCI_ASPM_SUPPORT                        0x00000002
#define OSC_PCI_CLOCK_PM_SUPPORT                0x00000004
#define OSC_PCI_SEGMENT_GROUPS_SUPPORT                0x00000008
#define OSC_PCI_MSI_SUPPORT                        0x00000010
#define OSC_PCI_EDR_SUPPORT                        0x00000080
#define OSC_PCI_HPX_TYPE_3_SUPPORT                0x00000100
#define OSC_PCI_SUPPORT_MASKS                                        \
        (OSC_PCI_EXT_CONFIG_SUPPORT |                                \
         OSC_PCI_ASPM_SUPPORT |                                        \
         OSC_PCI_CLOCK_PM_SUPPORT |                                \
         OSC_PCI_SEGMENT_GROUPS_SUPPORT |                        \
         OSC_PCI_MSI_SUPPORT |                                        \
         OSC_PCI_EDR_SUPPORT |                                        \
         OSC_PCI_HPX_TYPE_3_SUPPORT)

/*
 * PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field
 *
 * OSC_PCI_CONTROL_MASKS is the union of all control bits defined below
 * (0x000000bf).
 */
#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL        0x00000001
#define OSC_PCI_SHPC_NATIVE_HP_CONTROL                0x00000002
#define OSC_PCI_EXPRESS_PME_CONTROL                0x00000004
#define OSC_PCI_EXPRESS_AER_CONTROL                0x00000008
#define OSC_PCI_EXPRESS_CAPABILITY_CONTROL        0x00000010
#define OSC_PCI_EXPRESS_LTR_CONTROL                0x00000020
#define OSC_PCI_EXPRESS_DPC_CONTROL                0x00000080
#define OSC_PCI_CONTROL_MASKS                                        \
        (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL |                        \
         OSC_PCI_SHPC_NATIVE_HP_CONTROL |                        \
         OSC_PCI_EXPRESS_PME_CONTROL |                                \
         OSC_PCI_EXPRESS_AER_CONTROL |                                \
         OSC_PCI_EXPRESS_CAPABILITY_CONTROL |                        \
         OSC_PCI_EXPRESS_LTR_CONTROL |                                \
         OSC_PCI_EXPRESS_DPC_CONTROL)

#define ACPI_GSB_ACCESS_ATTRIB_QUICK                0x00000002
#define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV         0x00000004
#define ACPI_GSB_ACCESS_ATTRIB_BYTE                0x00000006
#define ACPI_GSB_ACCESS_ATTRIB_WORD                0x00000008
#define ACPI_GSB_ACCESS_ATTRIB_BLOCK                0x0000000A
#define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE        0x0000000B
#define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL        0x0000000C
#define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL        0x0000000D
#define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES        0x0000000E
#define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS        0x0000000F

extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
                                             u32 *mask, u32 req);

/* Enable _OST when all relevant hotplug operations are enabled */
#if defined(CONFIG_ACPI_HOTPLUG_CPU) &&                        \
        defined(CONFIG_ACPI_HOTPLUG_MEMORY) &&                \
        defined(CONFIG_ACPI_CONTAINER)
#define ACPI_HOTPLUG_OST
#endif

/* _OST Source Event Code (OSPM Action) */
#define ACPI_OST_EC_OSPM_SHUTDOWN                0x100
#define ACPI_OST_EC_OSPM_EJECT                        0x103
#define ACPI_OST_EC_OSPM_INSERTION                0x200

/* _OST General Processing Status Code */
#define ACPI_OST_SC_SUCCESS                        0x0
#define ACPI_OST_SC_NON_SPECIFIC_FAILURE        0x1
#define ACPI_OST_SC_UNRECOGNIZED_NOTIFY                0x2

/* _OST OS Shutdown Processing (0x100) Status Code */
#define ACPI_OST_SC_OS_SHUTDOWN_DENIED                0x80
#define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS        0x81
#define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED        0x82
#define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED        0x83

/* _OST Ejection Request (0x3, 0x103) Status Code */
#define ACPI_OST_SC_EJECT_NOT_SUPPORTED                0x80
#define ACPI_OST_SC_DEVICE_IN_USE                0x81
#define ACPI_OST_SC_DEVICE_BUSY                        0x82
#define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY        0x83
#define ACPI_OST_SC_EJECT_IN_PROGRESS                0x84

/* _OST Insertion Request (0x200) Status Code */
#define ACPI_OST_SC_INSERT_IN_PROGRESS                0x80
#define ACPI_OST_SC_DRIVER_LOAD_FAILURE                0x81
#define ACPI_OST_SC_INSERT_NOT_SUPPORTED        0x82

/*
 * Comparison selector; used as the type of the "pred" member of
 * struct acpi_platform_list.
 * NOTE(review): presumably selects how an entry's oem_revision is compared
 * with the revision found in firmware - confirm against
 * acpi_match_platform_list().
 */
enum acpi_predicate {
        all_versions,
        less_than_or_equal,
        equal,
        greater_than_or_equal,
};

/* Table must be terminated by a NULL entry */
/*
 * One entry of a table passed to acpi_match_platform_list().
 *
 * oem_id and oem_table_id are sized one byte larger than ACPI_OEM_ID_SIZE
 * and ACPI_OEM_TABLE_ID_SIZE respectively (presumably room for a NUL
 * terminator - confirm).
 * NOTE(review): the meaning of table, reason and data is not visible in
 * this header; check the implementation before relying on them.
 */
struct acpi_platform_list {
        char        oem_id[ACPI_OEM_ID_SIZE+1];
        char        oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1];
        u32        oem_revision;
        char        *table;
        enum acpi_predicate pred;
        char        *reason;
        u32        data;
};
int acpi_match_platform_list(const struct acpi_platform_list *plat);

extern void acpi_early_init(void);
extern void acpi_subsystem_init(void);
extern void arch_post_acpi_subsys_init(void);

extern int acpi_nvs_register(__u64 start, __u64 size);

extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
                                    void *data);

const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
                                               const struct device *dev);

const void *acpi_device_get_match_data(const struct device *dev);
extern bool acpi_driver_match_device(struct device *dev,
                                     const struct device_driver *drv);
int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
int acpi_device_modalias(struct device *, char *, int);
void acpi_walk_dep_device_list(acpi_handle handle);

struct platform_device *acpi_create_platform_device(struct acpi_device *,
                                                    struct property_entry *);
#define ACPI_PTR(_ptr)        (_ptr)

/* Mark @adev as enumerated by setting its flags.visited bit. */
static inline void acpi_device_set_enumerated(struct acpi_device *adev)
{
        adev->flags.visited = true;
}

/* Undo acpi_device_set_enumerated(): clear the flags.visited bit of @adev. */
static inline void acpi_device_clear_enumerated(struct acpi_device *adev)
{
        adev->flags.visited = false;
}

/*
 * Event codes for ACPI reconfiguration notifications.
 * NOTE(review): presumably delivered to notifier blocks registered with
 * acpi_reconfig_notifier_register() - confirm against the notifier code.
 */
enum acpi_reconfig_event  {
        ACPI_RECONFIG_DEVICE_ADD = 0,
        ACPI_RECONFIG_DEVICE_REMOVE,
};

int acpi_reconfig_notifier_register(struct notifier_block *nb);
int acpi_reconfig_notifier_unregister(struct notifier_block *nb);

#ifdef CONFIG_ACPI_GTDT
int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
int acpi_gtdt_map_ppi(int type);
bool acpi_gtdt_c3stop(int type);
int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
#endif

/*
 * Default used when ACPI_HAVE_ARCH_SET_ROOT_POINTER is not defined:
 * the address is ignored and nothing is stored.
 */
#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
static inline void acpi_arch_set_root_pointer(u64 addr)
{
}
#endif

/*
 * Default used when ACPI_HAVE_ARCH_GET_ROOT_POINTER is not defined:
 * always reports 0.
 */
#ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
static inline u64 acpi_arch_get_root_pointer(void)
{
        return 0;
}
#endif

#else        /* !CONFIG_ACPI */

/* With ACPI compiled out, acpi_disabled is the constant 1. */
#define acpi_disabled 1

/*
 * Companion/handle accessors for the !CONFIG_ACPI build: lookups expand to
 * NULL, ACPI_COMPANION_SET() expands to an empty statement, and
 * ACPI_DEVICE_CLASS() expands to initializers that zero .cls and .cls_msk
 * (its arguments are discarded).
 */
#define ACPI_COMPANION(dev)                (NULL)
#define ACPI_COMPANION_SET(dev, adev)        do { } while (0)
#define ACPI_HANDLE(dev)                (NULL)
#define ACPI_HANDLE_FWNODE(fwnode)        (NULL)
#define ACPI_DEVICE_CLASS(_cls, _msk)        .cls = (0), .cls_msk = (0),

#include <acpi/acpi_numa.h>

struct fwnode_handle;

/*
 * !CONFIG_ACPI stubs for the device lookup helpers: the predicates always
 * return false and the lookup always returns NULL.
 */
static inline bool acpi_dev_found(const char *hid)
{
        return false;
}

static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
{
        return false;
}

struct acpi_device;

static inline bool
acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
{
        return false;
}

static inline struct acpi_device *
acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
{
        return NULL;
}

/* No-op: the lookup stub above never hands out a device. */
static inline void acpi_dev_put(struct acpi_device *adev) {}

/*
 * !CONFIG_ACPI stubs for the fwnode classification/conversion helpers:
 * every is_*() / *_match() test returns false and every conversion
 * returns NULL, regardless of the fwnode passed in.
 */
static inline bool is_acpi_node(struct fwnode_handle *fwnode)
{
        return false;
}

static inline bool is_acpi_device_node(struct fwnode_handle *fwnode)
{
        return false;
}

static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwnode)
{
        return NULL;
}

static inline bool is_acpi_data_node(struct fwnode_handle *fwnode)
{
        return false;
}

static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwnode)
{
        return NULL;
}

static inline bool acpi_data_node_match(struct fwnode_handle *fwnode,
                                        const char *name)
{
        return false;
}

static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always reports false. */
static inline bool has_acpi_companion(struct device *dev)
{
        return false;
}

/* !CONFIG_ACPI stub: does nothing with @dev, @parent or @addr. */
static inline void acpi_preset_companion(struct device *dev,
                                         struct acpi_device *parent, u64 addr)
{
}

/* !CONFIG_ACPI stub: always returns NULL, so callers must not assume a valid string. */
static inline const char *acpi_dev_name(struct acpi_device *adev)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always returns NULL. */
static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev)
{
        return NULL;
}

/*
 * !CONFIG_ACPI stubs for the init/boot entry points: the void hooks do
 * nothing and the int-returning ones return 0.
 */
static inline void acpi_early_init(void) { }
static inline void acpi_subsystem_init(void) { }

static inline int early_acpi_boot_init(void) { return 0; }
static inline int acpi_boot_init(void) { return 0; }

static inline void acpi_boot_table_prepare(void) { }
static inline void acpi_boot_table_init(void) { }

static inline int acpi_mps_check(void) { return 0; }

/*
 * !CONFIG_ACPI stub: always returns 0.
 *
 * @res is const-qualified to match the prototype used when CONFIG_ACPI is
 * enabled, so a caller holding a "const struct resource *" builds without
 * a qualifier warning in both configurations.
 */
static inline int acpi_check_resource_conflict(const struct resource *res)
{
        return 0;
}

/* !CONFIG_ACPI stub: always returns 0. */
static inline int acpi_check_region(resource_size_t start, resource_size_t n,
                                    const char *name)
{
        return 0;
}

struct acpi_table_header;
/* !CONFIG_ACPI stub: @handler is never called; always returns -ENODEV. */
static inline int acpi_table_parse(char *id,
                                int (*handler)(struct acpi_table_header *))
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: nothing is recorded; always returns 0. */
static inline int acpi_nvs_register(__u64 start, __u64 size)
{
        return 0;
}

/* !CONFIG_ACPI stub: @func is never called; always returns 0. */
static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
                                           void *data)
{
        return 0;
}

struct acpi_device_id;

/* !CONFIG_ACPI stub: never matches; always returns NULL. */
static inline const struct acpi_device_id *acpi_match_device(
        const struct acpi_device_id *ids, const struct device *dev)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always returns NULL. */
static inline const void *acpi_device_get_match_data(const struct device *dev)
{
        return NULL;
}

/* !CONFIG_ACPI stub: never matches; always returns false. */
static inline bool acpi_driver_match_device(struct device *dev,
                                            const struct device_driver *drv)
{
        return false;
}

/* !CONFIG_ACPI stub: nothing is evaluated; always returns NULL. */
static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle,
                                                   const guid_t *guid,
                                                   u64 rev, u64 func,
                                                   union acpi_object *argv4)
{
        return NULL;
}

/* !CONFIG_ACPI stub: @env is left untouched; always returns -ENODEV. */
static inline int acpi_device_uevent_modalias(struct device *dev,
                                struct kobj_uevent_env *env)
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: @buf is left untouched; always returns -ENODEV. */
static inline int acpi_device_modalias(struct device *dev,
                                char *buf, int size)
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: no platform device is created; always returns NULL. */
static inline struct platform_device *
acpi_create_platform_device(struct acpi_device *adev,
                            struct property_entry *properties)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always returns false. */
static inline bool acpi_dma_supported(struct acpi_device *adev)
{
        return false;
}

/* !CONFIG_ACPI stub: always returns DEV_DMA_NOT_SUPPORTED. */
static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev)
{
        return DEV_DMA_NOT_SUPPORTED;
}

/* !CONFIG_ACPI stub: the output pointers are not written; always returns -ENODEV. */
static inline int acpi_dma_get_range(struct device *dev, u64 *dma_addr,
                                     u64 *offset, u64 *size)
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: does nothing and returns 0. */
static inline int acpi_dma_configure(struct device *dev,
                                     enum dev_dma_attr attr)
{
        return 0;
}

/* !CONFIG_ACPI stub: does nothing and returns 0. */
static inline int acpi_dma_configure_id(struct device *dev,
                                        enum dev_dma_attr attr,
                                        const u32 *input_id)
{
        return 0;
}

#define ACPI_PTR(_ptr)        (NULL)

/* !CONFIG_ACPI stub: no-op; @adev is not touched. */
static inline void acpi_device_set_enumerated(struct acpi_device *adev)
{
}

/* !CONFIG_ACPI stub: no-op; @adev is not touched. */
static inline void acpi_device_clear_enumerated(struct acpi_device *adev)
{
}

/*
 * !CONFIG_ACPI stubs: the notifier block is neither registered nor
 * unregistered; both calls fail with -EINVAL.
 */
static inline int
acpi_reconfig_notifier_register(struct notifier_block *nb) { return -EINVAL; }

static inline int
acpi_reconfig_notifier_unregister(struct notifier_block *nb) { return -EINVAL; }

/* !CONFIG_ACPI stub: no consumer is ever found; always returns NULL. */
static inline struct acpi_device *
acpi_resource_consumer(struct resource *res)
{
        return NULL;
}

/* !CONFIG_ACPI stub: the handler is not registered; always returns -ENXIO. */
static inline int
acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context),
                             void *context)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: no-op. */
static inline void
acpi_unregister_wakeup_handler(bool (*wakeup)(void *context), void *context)
{
}

#endif        /* !CONFIG_ACPI */

/*
 * acpi_ioapic_add() is only implemented with CONFIG_ACPI_HOTPLUG_IOAPIC;
 * otherwise the inline stub ignores @root and returns 0.
 */
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
int acpi_ioapic_add(acpi_handle root);
#else
static inline int acpi_ioapic_add(acpi_handle root) { return 0; }
#endif

#ifdef CONFIG_ACPI
/*
 * Sleep-preparation hooks.  The acpi_os_set_*() functions each take one
 * argument: a callback that receives the sleep state and two 32-bit
 * values.  The acpi_os_prepare_*() functions take those three values
 * directly and return an acpi_status.
 */
void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
                               u32 pm1a_ctrl,  u32 pm1b_ctrl));

acpi_status acpi_os_prepare_sleep(u8 sleep_state,
                                  u32 pm1a_control, u32 pm1b_control);

void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state,
                                        u32 val_a,  u32 val_b));

acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state,
                                           u32 val_a, u32 val_b);

/* Every configuration except CONFIG_IA64 provides arch_reserve_mem_area(). */
#ifndef CONFIG_IA64
void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
#else
static inline void arch_reserve_mem_area(acpi_physical_address addr,
                                          size_t size)
{
}
#endif /* !CONFIG_IA64 */
#else        /* !CONFIG_ACPI */
/*
 * No-op replacement for acpi_os_set_prepare_sleep().  The real function
 * takes a single callback argument, so the stub must accept a
 * one-argument call; it is variadic so that the three-argument form the
 * previous stub required keeps compiling as well.  All arguments are
 * discarded without being evaluated.
 */
#define acpi_os_set_prepare_sleep(...) do { } while (0)
#endif        /* CONFIG_ACPI */

#if defined(CONFIG_ACPI) && defined(CONFIG_PM)
int acpi_dev_suspend(struct device *dev, bool wakeup);
int acpi_dev_resume(struct device *dev);
int acpi_subsys_runtime_suspend(struct device *dev);
int acpi_subsys_runtime_resume(struct device *dev);
int acpi_dev_pm_attach(struct device *dev, bool power_on);
bool acpi_storage_d3(struct device *dev);
#else
/*
 * Stubs used unless both CONFIG_ACPI and CONFIG_PM are set.  The int
 * functions return 0 and acpi_storage_d3() returns false.  Note that
 * acpi_dev_suspend() and acpi_dev_resume() have no stub here.
 */
static inline int acpi_subsys_runtime_suspend(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_runtime_resume(struct device *dev)
{
        return 0;
}

static inline int acpi_dev_pm_attach(struct device *dev, bool power_on)
{
        return 0;
}

static inline bool acpi_storage_d3(struct device *dev)
{
        return false;
}
#endif

/*
 * System-sleep callbacks and EC GPE wake helpers.  They are only
 * implemented when both CONFIG_ACPI and CONFIG_PM_SLEEP are set.
 */
#if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP)
int acpi_subsys_prepare(struct device *dev);
void acpi_subsys_complete(struct device *dev);
int acpi_subsys_suspend_late(struct device *dev);
int acpi_subsys_suspend_noirq(struct device *dev);
int acpi_subsys_suspend(struct device *dev);
int acpi_subsys_freeze(struct device *dev);
int acpi_subsys_poweroff(struct device *dev);
void acpi_ec_mark_gpe_for_wake(void);
void acpi_ec_set_gpe_wake_mask(u8 action);
#else
/* Stubs: the int callbacks return 0, the void ones do nothing. */
static inline int acpi_subsys_prepare(struct device *dev) { return 0; }
static inline void acpi_subsys_complete(struct device *dev) {}
static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; }
static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; }
static inline int acpi_subsys_suspend(struct device *dev) { return 0; }
static inline int acpi_subsys_freeze(struct device *dev) { return 0; }
static inline int acpi_subsys_poweroff(struct device *dev) { return 0; }
static inline void acpi_ec_mark_gpe_for_wake(void) {}
static inline void acpi_ec_set_gpe_wake_mask(u8 action) {}
#endif

/*
 * acpi_handle_printk() - printf-style logger taking a level string and an
 * ACPI handle; the format string is argument 3 and the variadic arguments
 * start at argument 4 (see the __printf annotation).
 */
#ifdef CONFIG_ACPI
__printf(3, 4)
void acpi_handle_printk(const char *level, acpi_handle handle,
                        const char *fmt, ...);
#else        /* !CONFIG_ACPI */
/* Stub: prints nothing.  It takes a plain void * in place of acpi_handle. */
static inline __printf(3, 4) void
acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {}
#endif        /* !CONFIG_ACPI */

#if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG)
__printf(3, 4)
void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...);
#endif

/*
 * acpi_handle_<level>: Print message with ACPI prefix and object path
 *
 * These interfaces acquire the global namespace mutex to obtain an object
 * path.  In interrupt context, it shows the object path as <n/a>.
 */
#define acpi_handle_emerg(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_alert(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_crit(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_err(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_warn(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_notice(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_info(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__)

/*
 * acpi_handle_debug() has three build-time variants:
 *  - DEBUG defined: an unconditional KERN_DEBUG acpi_handle_printk();
 *  - else CONFIG_DYNAMIC_DEBUG: routed through _dynamic_func_call() to
 *    __acpi_handle_debug(), with the format wrapped in pr_fmt();
 *  - otherwise: a statement expression that evaluates to 0.  The printk
 *    call sits under "if (0)", so it is still compiled but never executed.
 */
#if defined(DEBUG)
#define acpi_handle_debug(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__)
#else
#if defined(CONFIG_DYNAMIC_DEBUG)
#define acpi_handle_debug(handle, fmt, ...)                                \
        _dynamic_func_call(fmt, __acpi_handle_debug,                        \
                           handle, pr_fmt(fmt), ##__VA_ARGS__)
#else
#define acpi_handle_debug(handle, fmt, ...)                                \
({                                                                        \
        if (0)                                                                \
                acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \
        0;                                                                \
})
#endif
#endif

#if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB)
bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
                                struct acpi_resource_gpio **agpio);
int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index);
#else
/* Stub: never recognises a GPIO interrupt resource; *agpio is not written. */
static inline bool
acpi_gpio_get_irq_resource(struct acpi_resource *ares,
                           struct acpi_resource_gpio **agpio)
{
        return false;
}

/* Stub: no IRQ can be looked up; always returns -ENXIO. */
static inline int
acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index)
{
        return -ENXIO;
}
#endif

/*
 * Index-only form of acpi_dev_gpio_irq_get_by(): forwards @adev and @index
 * with a NULL name and returns whatever that call returns.
 */
static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
{
        const char *name = NULL;

        return acpi_dev_gpio_irq_get_by(adev, name, index);
}

/* Device properties */

#ifdef CONFIG_ACPI
int acpi_dev_get_property(const struct acpi_device *adev, const char *name,
                          acpi_object_type type, const union acpi_object **obj);
int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
                                const char *name, size_t index, size_t num_args,
                                struct fwnode_reference_args *args);

/*
 * Convenience wrapper around __acpi_node_get_property_reference() that
 * always passes NR_FWNODE_REFERENCE_ARGS as the num_args argument.
 */
static inline int
acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
                                 const char *name, size_t index,
                                 struct fwnode_reference_args *args)
{
        const size_t num_args = NR_FWNODE_REFERENCE_ARGS;

        return __acpi_node_get_property_reference(fwnode, name, index,
                                                  num_args, args);
}

/* True when the properties list in @adev->data holds at least one entry. */
static inline bool acpi_dev_has_props(const struct acpi_device *adev)
{
        if (list_empty(&adev->data.properties))
                return false;

        return true;
}

struct acpi_device_properties *
acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
                    const union acpi_object *properties);

int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
                       void **valptr);
int acpi_dev_prop_read_single(struct acpi_device *adev,
                              const char *propname, enum dev_prop_type proptype,
                              void *val);
int acpi_node_prop_read(const struct fwnode_handle *fwnode,
                        const char *propname, enum dev_prop_type proptype,
                        void *val, size_t nval);
int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname,
                       enum dev_prop_type proptype, void *val, size_t nval);

struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
                                            struct fwnode_handle *child);
struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode);

struct acpi_probe_entry;
typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
                                                 struct acpi_probe_entry *);

#define ACPI_TABLE_ID_LEN        5

/**
 * struct acpi_probe_entry - boot-time probing entry
 * @id:                        ACPI table name
 * @type:                Optional subtable type to match
 *                        (if @id contains subtables)
 * @subtable_valid:        Optional callback to check the validity of
 *                        the subtable
 * @probe_table:        Callback to the driver being probed when table
 *                        match is successful
 * @probe_subtbl:        Callback to the driver being probed when table and
 *                        subtable match (and optional callback is successful)
 * @driver_data:        Sideband data provided back to the driver
 */
struct acpi_probe_entry {
        __u8 id[ACPI_TABLE_ID_LEN];
        __u8 type;
        acpi_probe_entry_validate_subtbl subtable_valid;
        union {
                acpi_tbl_table_handler probe_table;
                acpi_tbl_entry_handler probe_subtbl;
        };
        kernel_ulong_t driver_data;
};

#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable,        \
                                 valid, data, fn)                        \
        static const struct acpi_probe_entry __acpi_probe_##name        \
                __used __section("__" #table "_acpi_probe_table") = {        \
                        .id = table_id,                                        \
                        .type = subtable,                                \
                        .subtable_valid = valid,                        \
                        .probe_table = fn,                                \
                        .driver_data = data,                                \
                }

#define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id,        \
                                          subtable, valid, data, fn)        \
        static const struct acpi_probe_entry __acpi_probe_##name        \
                __used __section("__" #table "_acpi_probe_table") = {        \
                        .id = table_id,                                        \
                        .type = subtable,                                \
                        .subtable_valid = valid,                        \
                        .probe_subtbl = fn,                                \
                        .driver_data = data,                                \
                }

#define ACPI_PROBE_TABLE(name)                __##name##_acpi_probe_table
#define ACPI_PROBE_TABLE_END(name)        __##name##_acpi_probe_table_end

int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr);

#define acpi_probe_device_table(t)                                        \
        ({                                                                 \
                extern struct acpi_probe_entry ACPI_PROBE_TABLE(t),        \
                                               ACPI_PROBE_TABLE_END(t);        \
                __acpi_probe_device_table(&ACPI_PROBE_TABLE(t),                \
                                          (&ACPI_PROBE_TABLE_END(t) -        \
                                           &ACPI_PROBE_TABLE(t)));        \
        })
#else
/*
 * !CONFIG_ACPI stub: no property can be found, *obj is not written, and
 * the call always returns -ENXIO.
 *
 * @adev is const-qualified to match the prototype used when CONFIG_ACPI
 * is enabled, so a caller holding a "const struct acpi_device *" builds
 * without a qualifier warning in both configurations.
 */
static inline int acpi_dev_get_property(const struct acpi_device *adev,
                                        const char *name, acpi_object_type type,
                                        const union acpi_object **obj)
{
        return -ENXIO;
}

/*
 * !CONFIG_ACPI stubs for the property-reference lookups: @args is not
 * written and both calls return -ENXIO.
 */
static inline int __acpi_node_get_property_reference(
        const struct fwnode_handle *fwnode, const char *name,
        size_t index, size_t num_args, struct fwnode_reference_args *args)
{
        return -ENXIO;
}

static inline int acpi_node_get_property_reference(
        const struct fwnode_handle *fwnode, const char *name,
        size_t index, struct fwnode_reference_args *args)
{
        return -ENXIO;
}

/*
 * !CONFIG_ACPI stubs for the property accessors.  None of them writes to
 * its output pointer; all return -ENXIO.
 */
static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode,
                                     const char *propname,
                                     void **valptr)
{
        return -ENXIO;
}

static inline int acpi_dev_prop_read_single(const struct acpi_device *adev,
                                            const char *propname,
                                            enum dev_prop_type proptype,
                                            void *val)
{
        return -ENXIO;
}

static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode,
                                      const char *propname,
                                      enum dev_prop_type proptype,
                                      void *val, size_t nval)
{
        return -ENXIO;
}

static inline int acpi_dev_prop_read(const struct acpi_device *adev,
                                     const char *propname,
                                     enum dev_prop_type proptype,
                                     void *val, size_t nval)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: there are no subnodes; always returns NULL. */
static inline struct fwnode_handle *
acpi_get_next_subnode(const struct fwnode_handle *fwnode,
                      struct fwnode_handle *child)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always returns NULL. */
static inline struct fwnode_handle *
acpi_node_get_parent(const struct fwnode_handle *fwnode)
{
        return NULL;
}

/*
 * !CONFIG_ACPI stub: unlike the two NULL-returning stubs above, this one
 * reports failure as ERR_PTR(-ENXIO), so callers need an IS_ERR()-style
 * check rather than a NULL test.
 */
static inline struct fwnode_handle *
acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
                             struct fwnode_handle *prev)
{
        return ERR_PTR(-ENXIO);
}

/* !CONFIG_ACPI stub: the three output pointers are not written; returns -ENXIO. */
static inline int
acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode,
                               struct fwnode_handle **remote,
                               struct fwnode_handle **port,
                               struct fwnode_handle **endpoint)
{
        return -ENXIO;
}

/*
 * !CONFIG_ACPI replacement for ACPI_DECLARE_PROBE_ENTRY(): no probe-table
 * section entry is emitted.  The arguments are stored in an unused static
 * array of void pointers named __acpi_table_<name>, which keeps them
 * referenced; @table is ignored.
 */
#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \
        static const void * __acpi_table_##name[]                        \
                __attribute__((unused))                                        \
                 = { (void *) table_id,                                        \
                     (void *) subtable,                                        \
                     (void *) valid,                                        \
                     (void *) fn,                                        \
                     (void *) data }

/*
 * The CONFIG_ACPI branch also provides ACPI_DECLARE_SUBTABLE_PROBE_ENTRY()
 * with the same parameter list; mirror it here so that users of either
 * macro build with ACPI compiled out.
 */
#define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id,        \
                                          subtable, valid, data, fn)        \
        ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable,        \
                                 valid, data, fn)

/* !CONFIG_ACPI: nothing to probe; the expression evaluates to 0. */
#define acpi_probe_device_table(t)        ({ int __r = 0; __r;})
#endif

#ifdef CONFIG_ACPI_TABLE_UPGRADE
void acpi_table_upgrade(void);
#else
/* Stub used without CONFIG_ACPI_TABLE_UPGRADE: does nothing. */
static inline void acpi_table_upgrade(void)
{
}
#endif

#if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG)
extern bool acpi_has_watchdog(void);
#else
/* Stub used unless CONFIG_ACPI and CONFIG_ACPI_WATCHDOG are both set: reports false. */
static inline bool acpi_has_watchdog(void)
{
        return false;
}
#endif

#ifdef CONFIG_ACPI_SPCR_TABLE
extern bool qdf2400_e44_present;
int acpi_parse_spcr(bool enable_earlycon, bool enable_console);
#else
/* Stub used without CONFIG_ACPI_SPCR_TABLE: ignores both flags and returns 0. */
static inline int
acpi_parse_spcr(bool enable_earlycon, bool enable_console) { return 0; }
#endif

/*
 * acpi_irq_get() is only implemented when CONFIG_ACPI_GENERIC_GSI is
 * enabled; otherwise the stub leaves @res untouched and returns -EINVAL.
 */
#if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res);
#else
static inline
int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
{
        return -EINVAL;
}
#endif

/*
 * lpit_read_residency_count_address() is only implemented with
 * CONFIG_ACPI_LPIT; otherwise the stub leaves *address untouched and
 * returns -EINVAL.
 */
#ifdef CONFIG_ACPI_LPIT
int lpit_read_residency_count_address(u64 *address);
#else
static inline int lpit_read_residency_count_address(u64 *address)
{
        return -EINVAL;
}
#endif

#ifdef CONFIG_ACPI_PPTT
int acpi_pptt_cpu_is_thread(unsigned int cpu);
int find_acpi_cpu_topology(unsigned int cpu, int level);
int find_acpi_cpu_topology_package(unsigned int cpu);
int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
#else
/*
 * Stubs used without CONFIG_ACPI_PPTT: every query fails with -EINVAL,
 * whatever CPU or level is asked for.
 */
static inline int
acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; }

static inline int
find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; }

static inline int
find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; }

static inline int
find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; }

static inline int
find_acpi_cpu_cache_topology(unsigned int cpu, int level) { return -EINVAL; }
#endif

/*
 * acpi_platform_notify() is only implemented with CONFIG_ACPI; otherwise
 * the stub ignores @dev and @action and returns 0.
 */
#ifdef CONFIG_ACPI
extern int acpi_platform_notify(struct device *dev, enum kobject_action action);
#else
static inline int
acpi_platform_notify(struct device *dev, enum kobject_action action)
{
        return 0;
}
#endif

#endif        /*_LINUX_ACPI_H*/








































































































































    4 































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTRACE_H
#define _ASM_X86_PTRACE_H

#include <asm/segment.h>
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>

#ifndef __ASSEMBLY__
#ifdef __i386__

/* Register layout used when building for 32-bit x86 (__i386__). */
struct pt_regs {
        /*
         * NB: 32-bit x86 CPUs are inconsistent as to what happens in the
         * following cases (where %seg represents a segment register):
         *
         * - pushl %seg: some do a 16-bit write and leave the high
         *   bits alone
         * - movl %seg, [mem]: some do a 16-bit write despite the movl
         * - IDT entry: some (e.g. 486) will leave the high bits of CS
         *   and (if applicable) SS undefined.
         *
         * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
         * so we can just treat all of the segment registers as 16-bit
         * values.
         */
        /* General-purpose registers. */
        unsigned long bx;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
        unsigned long bp;
        unsigned long ax;
        /*
         * Segment selectors.  Each 16-bit selector is followed by a 16-bit
         * __<seg>h member covering the other half of the 32-bit slot.
         */
        unsigned short ds;
        unsigned short __dsh;
        unsigned short es;
        unsigned short __esh;
        unsigned short fs;
        unsigned short __fsh;
        /*
         * On interrupt, gs and __gsh store the vector number.  They never
         * store gs any more.
         */
        unsigned short gs;
        unsigned short __gsh;
        /* On interrupt, this is the error code. */
        unsigned long orig_ax;
        unsigned long ip;
        unsigned short cs;
        unsigned short __csh;
        unsigned long flags;
        unsigned long sp;
        /* ss is the last member; MAX_REG_OFFSET is defined as its offset. */
        unsigned short ss;
        unsigned short __ssh;
};

#else /* __i386__ */

/* Register layout used when not building for __i386__ (the 64-bit case). */
struct pt_regs {
/*
 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 * unless syscall needs a complete, fully filled "struct pt_regs".
 */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
        unsigned long r8;
        unsigned long ax;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
        unsigned long orig_ax;
/* Return frame for iretq */
        unsigned long ip;
        unsigned long cs;
        unsigned long flags;
        unsigned long sp;
        /* ss is the last member; MAX_REG_OFFSET is defined as its offset. */
        unsigned long ss;
/* top of stack page */
};

#endif /* !__i386__ */

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt_types.h>
#endif

#include <asm/proto.h>

struct cpuinfo_x86;
struct task_struct;

extern unsigned long profile_pc(struct pt_regs *regs);

extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);


/* Read the return value of @regs, which is kept in the ax slot. */
static __always_inline unsigned long
regs_return_value(struct pt_regs *regs)
{
        const unsigned long rc = regs->ax;

        return rc;
}

/* Overwrite the return value of @regs by storing @rc into the ax slot. */
static __always_inline void
regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
        regs->ax = rc;
}

/*
 * user_mode(regs) reports whether a register set was captured from user
 * mode.
 *
 * x86_32: user mode means either that V8086 mode was enabled or that the
 * register set came from protected mode with an RPL-3 CS value.  OR-ing
 * the RPL bits of CS with the VM bit of flags lets one ">= USER_RPL"
 * comparison cover both cases.
 *
 * x86_64: vm86 mode is mercifully nonexistent, so only the low two bits
 * of CS are examined.
 */
static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
        const unsigned long rpl_bits = regs->cs & SEGMENT_RPL_MASK;
        const unsigned long vm_bit = regs->flags & X86_VM_MASK;

        return (rpl_bits | vm_bit) >= USER_RPL;
#else
        return (regs->cs & 3) != 0;
#endif
}

/*
 * Non-zero when the VM bit (X86_VM_MASK) is set in the saved flags.
 * Only 32-bit kernels can see that; elsewhere the answer is always 0.
 */
static inline int v8086_mode(struct pt_regs *regs)
{
#ifndef CONFIG_X86_32
        /* No V86 mode support in long mode */
        return 0;
#else
        return regs->flags & X86_VM_MASK;
#endif
}

/*
 * Report whether the saved CS is a 64-bit user code selector.  Always
 * false on kernels built without CONFIG_X86_64.
 */
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#ifndef CONFIG_X86_64
        return false;
#elif defined(CONFIG_PARAVIRT_XXL)
        /*
         * Besides __USER_CS, accept pv_info.extra_user_64bit_cs.
         * Headers are too twisted for this to go in paravirt.h.
         */
        if (regs->cs == __USER_CS)
                return true;

        return regs->cs == pv_info.extra_user_64bit_cs;
#else
        /*
         * On non-paravirt systems, __USER_CS is the only long mode CPL 3
         * selector.  We do not allow long mode selectors in the LDT.
         */
        return regs->cs == __USER_CS;
#endif
}

/*
 * Report whether the register set came from any context running in 64-bit
 * mode: on CONFIG_X86_64 that is every non-user context plus user contexts
 * accepted by user_64bit_mode(); otherwise it is never the case.
 */
static inline bool any_64bit_mode(struct pt_regs *regs)
{
#ifndef CONFIG_X86_64
        return false;
#else
        if (!user_mode(regs))
                return true;

        return user_64bit_mode(regs);
#endif
}

#ifdef CONFIG_X86_64
/* Both expand to the sp member of the register set from current_pt_regs(). */
#define current_user_stack_pointer()        current_pt_regs()->sp
#define compat_user_stack_pointer()        current_pt_regs()->sp

static inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
        bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
                    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack);

#ifdef CONFIG_IA32_EMULATION
        ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
                      regs->ip <  (unsigned long)entry_SYSCALL_compat_safe_stack);
#endif

        return ret;
}
#endif

/* Return the stack pointer value saved in @regs (regs->sp). */
static __always_inline unsigned long
kernel_stack_pointer(struct pt_regs *regs)
{
        const unsigned long sp = regs->sp;

        return sp;
}

/* Return the instruction pointer value saved in @regs (regs->ip). */
static __always_inline unsigned long
instruction_pointer(struct pt_regs *regs)
{
        const unsigned long ip = regs->ip;

        return ip;
}

/* Replace the instruction pointer saved in @regs (regs->ip) with @val. */
static __always_inline void instruction_pointer_set(struct pt_regs *regs,
                                                    unsigned long val)
{
        regs->ip = val;
}

/* Return the frame pointer value saved in @regs (regs->bp). */
static __always_inline unsigned long
frame_pointer(struct pt_regs *regs)
{
        const unsigned long bp = regs->bp;

        return bp;
}

/* Return the stack pointer value saved in @regs (regs->sp). */
static __always_inline unsigned long
user_stack_pointer(struct pt_regs *regs)
{
        const unsigned long sp = regs->sp;

        return sp;
}

/* Replace the stack pointer saved in @regs (regs->sp) with @val. */
static __always_inline void user_stack_pointer_set(struct pt_regs *regs,
                                                   unsigned long val)
{
        regs->sp = val;
}

/* True when the X86_EFLAGS_IF bit is clear in the flags saved in @regs. */
static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
{
        return (regs->flags & X86_EFLAGS_IF) == 0;
}

/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
extern const char *regs_query_register_name(unsigned int offset);
/* Offset of ss, the last register member of struct pt_regs. */
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))

/**
 * regs_get_register() - get register value from its offset
 * @regs:        pt_regs from which register value is gotten.
 * @offset:        byte offset of the register inside struct pt_regs.
 *
 * Returns the value stored @offset bytes into the struct pt_regs that
 * @regs points to.  On CONFIG_X86_32 the segment selector members
 * (cs, ss, ds, es, fs, gs) are read as 16-bit values; everything else is
 * read as an unsigned long.
 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
 */
static inline unsigned long regs_get_register(struct pt_regs *regs,
                                              unsigned int offset)
{
        unsigned long field;

        if (unlikely(offset > MAX_REG_OFFSET))
                return 0;

        field = (unsigned long)regs + offset;
#ifdef CONFIG_X86_32
        /* The selector fields are 16-bit. */
        switch (offset) {
        case offsetof(struct pt_regs, cs):
        case offsetof(struct pt_regs, ss):
        case offsetof(struct pt_regs, ds):
        case offsetof(struct pt_regs, es):
        case offsetof(struct pt_regs, fs):
        case offsetof(struct pt_regs, gs):
                return *(u16 *)field;
        default:
                break;
        }
#endif
        return *(unsigned long *)field;
}

/**
 * regs_within_kernel_stack() - check the address in the stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @addr:        address which is checked.
 *
 * Compares @addr and regs->sp after rounding both down to a multiple of
 * THREAD_SIZE.  Returns non-zero (true) when they land in the same
 * THREAD_SIZE-aligned region, i.e. @addr is within the kernel stack
 * page(s), and 0 (false) otherwise.
 */
static inline int regs_within_kernel_stack(struct pt_regs *regs,
                                           unsigned long addr)
{
        const unsigned long stack_mask = ~(THREAD_SIZE - 1);
        const unsigned long addr_base = addr & stack_mask;
        const unsigned long sp_base = regs->sp & stack_mask;

        return addr_base == sp_base;
}

/**
 * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth_addr() returns the address of the @n th
 * unsigned long entry above regs->sp.  If that address is NOT within the
 * kernel stack according to regs_within_kernel_stack(), this returns NULL.
 */
static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
{
        unsigned long *slot = (unsigned long *)regs->sp + n;

        return regs_within_kernel_stack(regs, (unsigned long)slot) ? slot : NULL;
}

/* To avoid include hell, we can't include uaccess.h */
extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);

/**
 * regs_get_kernel_stack_nth() - get Nth entry of the stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth() returns the @n th entry of the kernel stack
 * which is specified by @regs.  The entry is read with
 * copy_from_kernel_nofault(); if the entry is NOT in the kernel stack, or
 * that read reports an error, this returns 0.
 */
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                                      unsigned int n)
{
        unsigned long *slot = regs_get_kernel_stack_nth_addr(regs, n);
        unsigned long entry;

        if (!slot)
                return 0;

        if (copy_from_kernel_nofault(&entry, slot, sizeof(entry)))
                return 0;

        return entry;
}

/**
 * regs_get_kernel_argument() - get Nth function argument in kernel
 * @regs:        pt_regs of that context
 * @n:                function argument number (start from 0)
 *
 * regs_get_kernel_argument() returns the @n th argument of the function
 * call.  The first NR_REG_ARGUMENTS arguments are taken from registers
 * (ax, dx, cx on __i386__; di, si, dx, cx, r8, r9 otherwise); later ones
 * are read from the stack.
 * Note that this chooses the most probable assignment; in some cases
 * it can be incorrect.
 * This is expected to be called from kprobes or ftrace with regs
 * where the top of stack is the return address.
 */
static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
                                                     unsigned int n)
{
        static const unsigned int argument_offs[] = {
#ifdef __i386__
                offsetof(struct pt_regs, ax),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
#define NR_REG_ARGUMENTS 3
#else
                offsetof(struct pt_regs, di),
                offsetof(struct pt_regs, si),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
                offsetof(struct pt_regs, r8),
                offsetof(struct pt_regs, r9),
#define NR_REG_ARGUMENTS 6
#endif
        };

        if (n < NR_REG_ARGUMENTS)
                return regs_get_register(regs, argument_offs[n]);

        /* Stack entry 0 is the return address, so stack arguments start at 1. */
        n -= NR_REG_ARGUMENTS - 1;
        return regs_get_kernel_stack_nth(regs, n);
}

/*
 * arch_has_single_step() is unconditionally 1.  arch_has_block_step() is 1
 * when CONFIG_X86_DEBUGCTLMSR is set; otherwise it is decided at run time
 * by checking boot_cpu_data.x86 >= 6.
 */
#define arch_has_single_step()        (1)
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step()        (1)
#else
#define arch_has_block_step()        (boot_cpu_data.x86 >= 6)
#endif

#define ARCH_HAS_USER_SINGLE_STEP_REPORT

struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info);
extern int do_set_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info, int can_allocate);

/*
 * do_set_thread_area_64() forwards to do_arch_prctl_64() on CONFIG_X86_64
 * and evaluates to 0 (without using its arguments) otherwise.
 */
#ifdef CONFIG_X86_64
# define do_set_thread_area_64(p, s, t)        do_arch_prctl_64(p, s, t)
#else
# define do_set_thread_area_64(p, s, t)        (0)
#endif

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PTRACE_H */

















































    1 





    1 

    1 

    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
#include <linux/export.h>

/*
 * Table sizing constants.  NOTE(review): presumably the default and the
 * smallest permitted number of buckets; their users are further down the
 * file -- confirm there.
 */
#define HASH_DEFAULT_SIZE        64UL
#define HASH_MIN_SIZE                4U

/*
 * One slot of a nested bucket table page.  A slot either points at a
 * further page of slots (@table) or, on a leaf page, holds a bucket
 * head (@bucket).
 */
union nested_table {
        union nested_table __rcu *table;
        struct rhash_lock_head __rcu *bucket;
};

/* Hash the entry @he for table @tbl using the parameters stored in @ht. */
static u32 head_hashfn(struct rhashtable *ht,
                       const struct bucket_table *tbl,
                       const struct rhash_head *he)
{
        const u32 hash = rht_head_hashfn(ht, tbl, he, ht->p);

        return hash;
}

#ifdef CONFIG_PROVE_LOCKING
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))

/*
 * Ask lockdep whether ht->mutex is held.  Reports 1 unconditionally once
 * debug_locks is zero.
 */
int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        if (!debug_locks)
                return 1;

        return lockdep_is_held(&ht->mutex);
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

/*
 * Report whether the bit spinlock (bit 0) of bucket @hash in @tbl is
 * locked.  Reports 1 unconditionally when debug_locks is zero or when
 * @tbl is a nested table.
 */
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
        if (!debug_locks || unlikely(tbl->nest))
                return 1;

        return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#endif

/* Return the top-level page of a nested table, stored in tbl->buckets[0]. */
static inline union nested_table *
nested_table_top(const struct bucket_table *tbl)
{
        union nested_table *top;

        /* The top-level bucket entry does not need RCU protection
         * because it's set at the same time as tbl->nest.
         */
        top = (void *)rcu_dereference_protected(tbl->buckets[0], 1);

        return top;
}

/*
 * Free the page that the slot @ntbl points to, first recursing into each
 * of its slots while @size (the number of buckets covered by the slot)
 * is larger than the number of slots one page can hold.  A slot with no
 * page attached is ignored.
 */
static void nested_table_free(union nested_table *ntbl, unsigned int size)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        const unsigned int nslots = 1 << shift;
        union nested_table *page;
        unsigned int idx;

        page = rcu_dereference_protected(ntbl->table, 1);
        if (!page)
                return;

        if (size > nslots) {
                const unsigned int sub_size = size >> shift;

                for (idx = 0; idx < nslots; idx++)
                        nested_table_free(&page[idx], sub_size);
        }

        kfree(page);
}

/*
 * Free every page hanging off a nested bucket table: each of the
 * 1 << tbl->nest top-level slots is released with nested_table_free(),
 * then the top-level page itself.  @tbl is not freed here.
 */
static void nested_bucket_table_free(const struct bucket_table *tbl)
{
        union nested_table *top = nested_table_top(tbl);
        const unsigned int per_slot = tbl->size >> tbl->nest;
        const unsigned int nslots = 1 << tbl->nest;
        unsigned int idx = 0;

        while (idx < nslots) {
                nested_table_free(&top[idx], per_slot);
                idx++;
        }

        kfree(top);
}

static void bucket_table_free(const struct bucket_table *tbl)
{
        if (tbl->nest)
                nested_bucket_table_free(tbl);

        kvfree(tbl);
}

/* RCU callback: free the bucket table that embeds @head as its rcu member. */
static void bucket_table_free_rcu(struct rcu_head *head)
{
        struct bucket_table *tbl;

        tbl = container_of(head, struct bucket_table, rcu);
        bucket_table_free(tbl);
}

/*
 * Return the page attached to the slot @prev, allocating and attaching a
 * new zeroed page (GFP_ATOMIC) if the slot is still empty.
 *
 * @leaf selects whether the entries of a freshly allocated page are
 * initialised as bucket heads.  The new page is published with cmpxchg()
 * so that concurrent callers agree on a single page: the loser frees its
 * own allocation and returns whatever the slot holds.
 *
 * Returns NULL when the slot is empty and the allocation fails.
 */
static union nested_table *nested_table_alloc(struct rhashtable *ht,
                                              union nested_table __rcu **prev,
                                              bool leaf)
{
        union nested_table *ntbl;
        int i;

        /* Fast path: somebody already attached a page to this slot. */
        ntbl = rcu_dereference(*prev);
        if (ntbl)
                return ntbl;

        ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);

        if (ntbl && leaf) {
                for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
                        INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
        }

        /*
         * Attach only if the slot is still NULL.  If the allocation failed,
         * ntbl is NULL and this leaves an empty slot empty, returning NULL.
         */
        if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
                return ntbl;
        /* Raced with another thread. */
        kfree(ntbl);
        return rcu_dereference(*prev);
}

/*
 * Allocate a nested bucket table for @nbuckets buckets.
 *
 * Only the table header plus a single bucket slot (holding the top-level
 * page) is allocated here, together with the top-level page itself.
 * Requests for fewer than 1 << (shift + 1) buckets are refused.
 *
 * Returns the new table with tbl->nest set, or NULL on refusal or
 * allocation failure.
 */
static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
                                                      size_t nbuckets,
                                                      gfp_t gfp)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        union nested_table __rcu **top_slot;
        struct bucket_table *tbl;

        if (nbuckets < (1 << (shift + 1)))
                return NULL;

        tbl = kzalloc(sizeof(*tbl) + sizeof(tbl->buckets[0]), gfp);
        if (!tbl)
                return NULL;

        top_slot = (union nested_table __rcu **)tbl->buckets;
        if (nested_table_alloc(ht, top_slot, false)) {
                tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
                return tbl;
        }

        kfree(tbl);
        return NULL;
}

/*
 * Allocate and initialise a bucket table for @nbuckets buckets.
 *
 * A flat table is tried first with kvzalloc().  If that fails and @gfp
 * (ignoring __GFP_NOFAIL) is not GFP_KERNEL, a nested table is attempted
 * instead.  A nested table has no flat bucket array to initialise, so the
 * count used by the initialisation loop is reset to 0 while tbl->size
 * still records the requested @nbuckets.
 *
 * Returns the new table, or NULL if no table could be allocated.
 */
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
                                               size_t nbuckets,
                                               gfp_t gfp)
{
        struct bucket_table *tbl = NULL;
        size_t size;
        size_t i;        /* same type as the bound: no signed/unsigned mixing */
        static struct lock_class_key __key;

        tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);

        /* Remember the requested size; nbuckets may be zeroed below. */
        size = nbuckets;

        if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
                tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
                nbuckets = 0;
        }

        if (tbl == NULL)
                return NULL;

        lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);

        tbl->size = size;

        rcu_head_init(&tbl->rcu);
        INIT_LIST_HEAD(&tbl->walkers);

        tbl->hash_rnd = get_random_u32();

        for (i = 0; i < nbuckets; i++)
                INIT_RHT_NULLS_HEAD(tbl->buckets[i]);

        return tbl;
}

static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
                                                  struct bucket_table *tbl)
{
        struct bucket_table *new_tbl;

        do {
                new_tbl = tbl;
                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        return new_tbl;
}

/*
 * Move one entry from bucket @old_hash of the current table (ht->tbl) to
 * the last table in its future_tbl chain.
 *
 * The entry moved is the last one in the old chain.  It is linked at the
 * head of its bucket in the new table (under that bucket's lock) and then
 * unlinked from the old chain.  @bkt is the old bucket; the caller in this
 * file takes its lock before calling.
 *
 * Returns 0 after moving an entry, -ENOENT if the old bucket is empty, or
 * -EAGAIN if the destination table is a nested table.
 */
static int rhashtable_rehash_one(struct rhashtable *ht,
                                 struct rhash_lock_head __rcu **bkt,
                                 unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
        int err = -EAGAIN;
        struct rhash_head *head, *next, *entry;
        struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;

        if (new_tbl->nest)
                goto out;

        err = -ENOENT;

        /*
         * Walk to the last entry of the old chain.  pprev ends up pointing
         * at the next pointer of its predecessor, or stays NULL when the
         * last entry is also the first one.
         */
        rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
                          old_tbl, old_hash) {
                err = 0;
                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

                if (rht_is_a_nulls(next))
                        break;

                pprev = &entry->next;
        }

        /* err is still -ENOENT if the loop body never ran (empty bucket). */
        if (err)
                goto out;

        new_hash = head_hashfn(ht, new_tbl, entry);

        rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING);

        head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);

        /* Insert at the head of the new bucket, then drop the new bucket's lock. */
        RCU_INIT_POINTER(entry->next, head);

        rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);

        /* Unlink the entry from the old chain. */
        if (pprev)
                rcu_assign_pointer(*pprev, next);
        else
                /* Need to preserve the bit lock. */
                rht_assign_locked(bkt, next);

out:
        return err;
}

/*
 * Move every entry of bucket @old_hash in the current table to the newest
 * table, holding the old bucket's lock throughout.
 *
 * Returns 0 once the bucket has been emptied (or if it does not exist),
 * otherwise the error reported by rhashtable_rehash_one().
 */
static int rhashtable_rehash_chain(struct rhashtable *ht,
                                    unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
        int err;

        if (!bkt)
                return 0;

        rht_lock(old_tbl, bkt);

        /* Keep moving entries until one attempt reports something non-zero. */
        do {
                err = rhashtable_rehash_one(ht, bkt, old_hash);
        } while (!err);

        /* An empty bucket is the normal way for the loop to finish. */
        if (err == -ENOENT)
                err = 0;

        rht_unlock(old_tbl, bkt);

        return err;
}

/*
 * Attach @new_tbl as the future table of @old_tbl.
 *
 * Returns 0 on success, or -EEXIST if @old_tbl already had a future table.
 */
static int rhashtable_rehash_attach(struct rhashtable *ht,
                                    struct bucket_table *old_tbl,
                                    struct bucket_table *new_tbl)
{
        struct bucket_table *prev;

        /* Make insertions go into the new, empty table right away. Deletions
         * and lookups will be attempted in both tables until we synchronize.
         * As cmpxchg() provides strong barriers, we do not need
         * rcu_assign_pointer().
         */
        prev = cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
                       new_tbl);

        return prev ? -EEXIST : 0;
}

/*
 * Migrate all entries of the current table (ht->tbl) into its future table
 * and make the future table current.
 *
 * Returns 0 if there was no future table or the migration completed and no
 * further table is pending; -EAGAIN if the new table already has a future
 * table of its own; or the error from rhashtable_rehash_chain() if a bucket
 * could not be migrated.
 */
static int rhashtable_rehash_table(struct rhashtable *ht)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl;
        struct rhashtable_walker *walker;
        unsigned int old_hash;
        int err;

        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
        if (!new_tbl)
                return 0;

        /* Empty the old table bucket by bucket, yielding the CPU in between. */
        for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
                err = rhashtable_rehash_chain(ht, old_hash);
                if (err)
                        return err;
                cond_resched();
        }

        /* Publish the new table pointer. */
        rcu_assign_pointer(ht->tbl, new_tbl);

        spin_lock(&ht->lock);
        /* Detach every walker that was registered on the old table. */
        list_for_each_entry(walker, &old_tbl->walkers, list)
                walker->tbl = NULL;

        /* Wait for readers. All new readers will see the new
         * table, and thus no references to the old table will
         * remain.
         * We do this inside the locked region so that
         * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
         * to check if it should not re-link the table.
         */
        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
        spin_unlock(&ht->lock);

        /* Another table was attached meanwhile: ask the caller to go again. */
        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

/*
 * Allocate a table with @size buckets (GFP_KERNEL) and attach it as the
 * future table of @old_tbl.  The new table is freed again if attaching
 * fails.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * from rhashtable_rehash_attach().
 */
static int rhashtable_rehash_alloc(struct rhashtable *ht,
                                   struct bucket_table *old_tbl,
                                   unsigned int size)
{
        struct bucket_table *new_tbl;
        int err;

        ASSERT_RHT_MUTEX(ht);

        new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (!new_tbl)
                return -ENOMEM;

        err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
        if (!err)
                return 0;

        bucket_table_free(new_tbl);
        return err;
}

/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht:                the hash table to shrink
 *
 * This function shrinks the hash table to fit, i.e., the smallest
 * size that would not cause it to expand right away automatically:
 * nelems * 3 / 2 rounded up to a power of two, but never less than
 * ht->p.min_size.
 *
 * Returns 0 when the current table is already that small (nothing is
 * done), -EEXIST when a future table is already attached, and otherwise
 * the result of rhashtable_rehash_alloc().
 *
 * The caller must ensure that no concurrent resizing occurs by holding
 * ht->mutex.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
static int rhashtable_shrink(struct rhashtable *ht)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        unsigned int nelems = atomic_read(&ht->nelems);
        unsigned int target;

        target = nelems ? roundup_pow_of_two(nelems * 3 / 2) : 0;
        if (target < ht->p.min_size)
                target = ht->p.min_size;

        /* Already at or below the target: nothing to shrink. */
        if (old_tbl->size <= target)
                return 0;

        /* A resize is already pending on this table. */
        if (rht_dereference(old_tbl->future_tbl, ht))
                return -EEXIST;

        return rhashtable_rehash_alloc(ht, old_tbl, target);
}

/*
 * Deferred resize worker.
 *
 * Under ht->mutex, looks at the last table in the chain and either
 * doubles it (above 75% utilisation), shrinks the table (when automatic
 * shrinking is enabled and utilisation is below 30%), or attaches a
 * same-size replacement for a nested table.  If that step succeeded or
 * reported -EEXIST, rhashtable_rehash_table() is run.  Any resulting
 * error causes the work item to be rescheduled.
 */
static void rht_deferred_worker(struct work_struct *work)
{
        struct rhashtable *ht = container_of(work, struct rhashtable, run_work);
        struct bucket_table *last;
        int err = 0;

        mutex_lock(&ht->mutex);

        last = rhashtable_last_table(ht, rht_dereference(ht->tbl, ht));

        if (rht_grow_above_75(ht, last)) {
                err = rhashtable_rehash_alloc(ht, last, last->size * 2);
        } else if (ht->p.automatic_shrinking &&
                   rht_shrink_below_30(ht, last)) {
                err = rhashtable_shrink(ht);
        } else if (last->nest) {
                err = rhashtable_rehash_alloc(ht, last, last->size);
        }

        if (err == 0 || err == -EEXIST) {
                int rehash_err = rhashtable_rehash_table(ht);

                /* An earlier -EEXIST takes precedence over the rehash result. */
                if (!err)
                        err = rehash_err;
        }

        mutex_unlock(&ht->mutex);

        if (err)
                schedule_work(&ht->run_work);
}

/*
 * Attach a new bucket table on behalf of the insertion slow path.
 *
 * @tbl is the table the caller last tried to insert into.  If @tbl is
 * above 75% utilisation the new table has twice its size.  Otherwise a
 * same-size table is attached, but only if @tbl is still the table
 * published in ht->tbl.  The allocation uses GFP_ATOMIC | __GFP_NOWARN.
 *
 * Returns 0 if a new table was attached (the deferred worker is then
 * scheduled), if rhashtable_rehash_attach() reported -EEXIST, or if @tbl
 * already has a future table.  Otherwise returns -EBUSY, -ENOMEM, or the
 * error from rhashtable_rehash_attach().  On -ENOMEM the deferred worker
 * is scheduled as well.
 */
static int rhashtable_insert_rehash(struct rhashtable *ht,
                                    struct bucket_table *tbl)
{
        struct bucket_table *old_tbl;
        struct bucket_table *new_tbl;
        unsigned int size;
        int err;

        old_tbl = rht_dereference_rcu(ht->tbl, ht);

        size = tbl->size;

        /* Error reported if we bail out before trying to allocate. */
        err = -EBUSY;

        if (rht_grow_above_75(ht, tbl))
                size *= 2;
        /* Do not schedule more than one rehash */
        else if (old_tbl != tbl)
                goto fail;

        err = -ENOMEM;

        new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
        if (new_tbl == NULL)
                goto fail;

        err = rhashtable_rehash_attach(ht, tbl, new_tbl);
        if (err) {
                /* The table was not attached, so it is still private to us. */
                bucket_table_free(new_tbl);
                if (err == -EEXIST)
                        err = 0;
        } else
                schedule_work(&ht->run_work);

        return err;

fail:
        /* Do not fail the insert if someone else did a rehash. */
        if (likely(rcu_access_pointer(tbl->future_tbl)))
                return 0;

        /* Schedule async rehash to retry allocation in process context. */
        if (err == -ENOMEM)
                schedule_work(&ht->run_work);

        return err;
}

/*
 * Search the chain of one bucket for @key on behalf of an insertion.
 *
 * The only caller in this file, rhashtable_try_insert(), takes the bucket
 * lock with rht_lock() before calling.
 *
 * Entries are compared with ht->p.obj_cmpfn when it is set and with
 * rhashtable_compare() otherwise; a non-zero comparison result means
 * "no match".  A NULL @key never matches any entry.
 *
 * Returns:
 *  - the matching object, if the table is not an rhlist;
 *  - NULL, if the table is an rhlist and a match was found: @obj has then
 *    taken the matching entry's place in the bucket chain and the matching
 *    entry has become the next element of @obj's list;
 *  - ERR_PTR(-EAGAIN), if nothing matched and at least RHT_ELASTICITY
 *    entries were walked;
 *  - ERR_PTR(-ENOENT), if nothing matched otherwise.
 */
static void *rhashtable_lookup_one(struct rhashtable *ht,
                                   struct rhash_lock_head __rcu **bkt,
                                   struct bucket_table *tbl, unsigned int hash,
                                   const void *key, struct rhash_head *obj)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        /* Link that points at the current entry; NULL while at the bucket head. */
        struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *head;
        int elasticity;

        elasticity = RHT_ELASTICITY;
        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;
                struct rhlist_head *plist;

                elasticity--;
                /* Skip entries that do not match (or every entry if no key). */
                if (!key ||
                    (ht->p.obj_cmpfn ?
                     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                if (!ht->rhlist)
                        return rht_obj(ht, head);

                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                /*
                 * Fully initialise @obj (list successor and chain
                 * successor) before publishing it in place of @head.
                 */
                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev)
                        rcu_assign_pointer(*pprev, obj);
                else
                        /* Need to preserve the bit lock */
                        rht_assign_locked(bkt, obj);

                return NULL;
        }

        /* Chain is too long: ask the caller to trigger a rehash. */
        if (elasticity <= 0)
                return ERR_PTR(-EAGAIN);

        return ERR_PTR(-ENOENT);
}

/*
 * Finish an insertion attempt into one bucket.
 *
 * @data is the result of rhashtable_lookup_one() for the same bucket.
 * The only caller in this file, rhashtable_try_insert(), holds the bucket
 * lock across both calls.
 *
 * Returns:
 *  - ERR_PTR(-EEXIST) if @data is an object pointer (key already present);
 *  - @data cast to the return type if it is NULL or an error other than
 *    -EAGAIN / -ENOENT (so a NULL @data yields NULL);
 *  - tbl->future_tbl if one is attached, so the caller can retry there;
 *  - ERR_PTR(-EAGAIN) if @data is -EAGAIN and there is no future table;
 *  - ERR_PTR(-E2BIG) if rht_grow_above_max() reports true;
 *  - ERR_PTR(-EAGAIN) if rht_grow_above_100() reports true;
 *  - NULL after @obj has been inserted at the head of the bucket and
 *    ht->nelems has been incremented.
 */
static struct bucket_table *rhashtable_insert_one(
        struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
        struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
        void *data)
{
        struct bucket_table *new_tbl;
        struct rhash_head *head;

        if (!IS_ERR_OR_NULL(data))
                return ERR_PTR(-EEXIST);

        if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
                return ERR_CAST(data);

        /* A newer table exists: the insertion has to go there instead. */
        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (new_tbl)
                return new_tbl;

        /* Only -EAGAIN is left here besides -ENOENT; pass it through. */
        if (PTR_ERR(data) != -ENOENT)
                return ERR_CAST(data);

        if (unlikely(rht_grow_above_max(ht, tbl)))
                return ERR_PTR(-E2BIG);

        if (unlikely(rht_grow_above_100(ht, tbl)))
                return ERR_PTR(-EAGAIN);

        head = rht_ptr(bkt, tbl, hash);

        /* Initialise @obj completely before it becomes visible. */
        RCU_INIT_POINTER(obj->next, head);
        if (ht->rhlist) {
                struct rhlist_head *list;

                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        /* bkt is always the head of the list, so it holds
         * the lock, which we need to preserve
         */
        rht_assign_locked(bkt, obj);

        atomic_inc(&ht->nelems);
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        return NULL;
}

/*
 * Make one attempt at inserting @obj, following the chain of tables.
 *
 * Starts at ht->tbl and, for each table, locks the target bucket, looks
 * up @key with rhashtable_lookup_one() and tries to insert with
 * rhashtable_insert_one().  As long as that returns a table pointer
 * (the future table) the attempt is repeated on that table.
 *
 * The only caller in this file, rhashtable_insert_slow(), invokes this
 * inside an RCU read-side critical section.
 *
 * Returns NULL on successful insertion, the existing object if the key
 * was already present, or an ERR_PTR().  If the attempt ended with
 * -EAGAIN, rhashtable_insert_rehash() is called and its error is
 * returned, or ERR_PTR(-EAGAIN) if it returned 0.
 */
static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                                   struct rhash_head *obj)
{
        struct bucket_table *new_tbl;
        struct bucket_table *tbl;
        struct rhash_lock_head __rcu **bkt;
        unsigned int hash;
        void *data;

        new_tbl = rcu_dereference(ht->tbl);

        do {
                tbl = new_tbl;
                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
                if (rcu_access_pointer(tbl->future_tbl))
                        /* Failure is OK */
                        bkt = rht_bucket_var(tbl, hash);
                else
                        bkt = rht_bucket_insert(ht, tbl, hash);
                if (bkt == NULL) {
                        /* No bucket available here; move on to the next table. */
                        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                        data = ERR_PTR(-EAGAIN);
                } else {
                        rht_lock(tbl, bkt);
                        data = rhashtable_lookup_one(ht, bkt, tbl,
                                                     hash, key, obj);
                        new_tbl = rhashtable_insert_one(ht, bkt, tbl,
                                                        hash, obj, data);
                        /*
                         * On -EEXIST keep the object found by the lookup;
                         * in every other case report the insert result.
                         */
                        if (PTR_ERR(new_tbl) != -EEXIST)
                                data = ERR_CAST(new_tbl);

                        rht_unlock(tbl, bkt);
                }
        } while (!IS_ERR_OR_NULL(new_tbl));

        if (PTR_ERR(data) == -EAGAIN)
                data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
                               -EAGAIN);

        return data;
}

/*
 * Slow-path insertion: keep calling rhashtable_try_insert(), each time
 * inside its own RCU read-side critical section, until it returns
 * something other than ERR_PTR(-EAGAIN), and hand that result back.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj)
{
        void *result;

        for (;;) {
                rcu_read_lock();
                result = rhashtable_try_insert(ht, key, obj);
                rcu_read_unlock();

                if (PTR_ERR(result) != -EAGAIN)
                        return result;
        }
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);

/**
 * rhashtable_walk_enter - Initialise an iterator
 * @ht:                Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
        iter->ht = ht;
        iter->p = NULL;
        iter->slot = 0;
        iter->skip = 0;
        iter->end_of_table = 0;

        spin_lock(&ht->lock);
        iter->walker.tbl =
                rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
        list_add(&iter->walker.list, &iter->walker.tbl->walkers);
        spin_unlock(&ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_enter);

/**
 * rhashtable_walk_exit - Free an iterator
 * @iter:        Hash table Iterator
 *
 * This function frees resources allocated by rhashtable_walk_enter.
 */
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
        spin_lock(&iter->ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&iter->ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);

/**
 * rhashtable_walk_start_check - Start a hash table walk
 * @iter:        Hash table iterator
 *
 * Start a hash table walk at the current iterator position.  Note that we take
 * the RCU lock in all cases including when we return an error.  So you must
 * always call rhashtable_walk_stop to clean up.
 *
 * Returns zero if successful.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may use it immediately
 * by calling rhashtable_walk_next.
 *
 * rhashtable_walk_start is defined as an inline variant that returns
 * void. This is preferred in cases where the caller would ignore
 * resize events and always continue.
 */
int rhashtable_walk_start_check(struct rhashtable_iter *iter)
        __acquires(RCU)
{
        struct rhashtable *ht = iter->ht;
        bool rhlist = ht->rhlist;

        rcu_read_lock();

        /*
         * Take the walker off the table's walker list while the walk is
         * running; rhashtable_walk_stop() puts it back.
         */
        spin_lock(&ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&ht->lock);

        /* A previous walk already reached the end; nothing to resume. */
        if (iter->end_of_table)
                return 0;
        if (!iter->walker.tbl) {
                /* The walker lost its table: restart on the current one. */
                iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
                iter->slot = 0;
                iter->skip = 0;
                return -EAGAIN;
        }

        if (iter->p && !rhlist) {
                /*
                 * We need to validate that 'p' is still in the table, and
                 * if so, update 'skip'
                 */
                struct rhash_head *p;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        skip++;
                        if (p == iter->p) {
                                iter->skip = skip;
                                goto found;
                        }
                }
                /* 'p' is gone; fall back to the slot/skip position. */
                iter->p = NULL;
        } else if (iter->p && rhlist) {
                /* Need to validate that 'list' is still in the table, and
                 * if so, update 'skip' and 'p'.
                 */
                struct rhash_head *p;
                struct rhlist_head *list;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        for (list = container_of(p, struct rhlist_head, rhead);
                             list;
                             list = rcu_dereference(list->next)) {
                                skip++;
                                if (list == iter->list) {
                                        iter->p = p;
                                        iter->skip = skip;
                                        goto found;
                                }
                        }
                }
                /* 'list' is gone; fall back to the slot/skip position. */
                iter->p = NULL;
        }
found:
        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);

/**
 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 * one in case of a new walk).
 *
 * @iter:        Hash table iterator
 *
 * Starting at iter->slot, skips iter->skip entries in that slot and
 * returns the entry that follows, moving on to later slots as needed.
 *
 * Returns the found object or NULL when the end of the table is reached.
 * NULL is also returned when the iterator has no table.
 *
 * Returns -EAGAIN if resize event occurred.
 */
static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
{
        struct bucket_table *tbl = iter->walker.tbl;
        struct rhlist_head *list = iter->list;
        struct rhashtable *ht = iter->ht;
        struct rhash_head *p = iter->p;
        bool rhlist = ht->rhlist;

        if (!tbl)
                return NULL;

        for (; iter->slot < tbl->size; iter->slot++) {
                /* Number of entries in this slot that were already returned. */
                int skip = iter->skip;

                rht_for_each_rcu(p, tbl, iter->slot) {
                        if (rhlist) {
                                /* Each list member counts as one entry. */
                                list = container_of(p, struct rhlist_head,
                                                    rhead);
                                do {
                                        if (!skip)
                                                goto next;
                                        skip--;
                                        list = rcu_dereference(list->next);
                                } while (list);

                                continue;
                        }
                        if (!skip)
                                break;
                        skip--;
                }

next:
                /*
                 * Reached either with 'p' on the entry to return or with
                 * 'p' on the nulls marker that ends the chain.
                 */
                if (!rht_is_a_nulls(p)) {
                        iter->skip++;
                        iter->p = p;
                        iter->list = list;
                        return rht_obj(ht, rhlist ? &list->rhead : p);
                }

                /* Slot exhausted: start the next one from its first entry. */
                iter->skip = 0;
        }

        iter->p = NULL;

        /* Ensure we see any new tables. */
        smp_rmb();

        /* Continue on the future table if there is one, from the start. */
        iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (iter->walker.tbl) {
                iter->slot = 0;
                iter->skip = 0;
                return ERR_PTR(-EAGAIN);
        } else {
                iter->end_of_table = true;
        }

        return NULL;
}

/**
 * rhashtable_walk_next - Return the next object and advance the iterator
 * @iter:        Hash table iterator
 *
 * Note that you must call rhashtable_walk_stop when you are finished
 * with the walk.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
        struct rhlist_head *list = iter->list;
        struct rhashtable *ht = iter->ht;
        struct rhash_head *p = iter->p;
        bool rhlist = ht->rhlist;

        if (p) {
                /*
                 * Fast path: step from the last returned entry.  For an
                 * rhlist, advance within the list first and only move to
                 * the next chain entry once the list is exhausted.
                 */
                if (!rhlist || !(list = rcu_dereference(list->next))) {
                        p = rcu_dereference(p->next);
                        list = container_of(p, struct rhlist_head, rhead);
                }
                if (!rht_is_a_nulls(p)) {
                        iter->skip++;
                        iter->p = p;
                        iter->list = list;
                        return rht_obj(ht, rhlist ? &list->rhead : p);
                }

                /* At the end of this slot, switch to next one and then find
                 * next entry from that point.
                 */
                iter->skip = 0;
                iter->slot++;
        }

        return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);

/**
 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 * @iter:        Hash table iterator
 *
 * If the iterator currently holds an entry, that entry's object is
 * returned.  Otherwise the table is searched from the iterator position.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_peek(struct rhashtable_iter *iter)
{
        struct rhashtable *ht = iter->ht;
        struct rhash_head *cur = iter->p;

        if (cur) {
                /* For an rhlist the current entry is the list member. */
                if (ht->rhlist)
                        cur = &iter->list->rhead;
                return rht_obj(ht, cur);
        }

        /*
         * Nothing cached in the iterator, so look the entry up.  A nonzero
         * skip refers to the entry after the last one found; step it back
         * by one so the search lands on the current entry.
         * __rhashtable_walk_find_next will restore the original value of
         * skip assuming that the table hasn't changed.
         */
        if (iter->skip)
                iter->skip--;

        return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_peek);

/**
 * rhashtable_walk_stop - Finish a hash table walk
 * @iter:        Hash table iterator
 *
 * Finish a hash table walk.  Does not reset the iterator to the start of the
 * hash table.
 *
 * If the iterator still has a table, the walker is put back on that
 * table's walker list, unless the table has already been handed to
 * call_rcu() for freeing, in which case the iterator drops its table.
 * The RCU read lock is released in every case.
 */
void rhashtable_walk_stop(struct rhashtable_iter *iter)
        __releases(RCU)
{
        struct bucket_table *tbl = iter->walker.tbl;

        if (tbl) {
                struct rhashtable *ht = iter->ht;

                spin_lock(&ht->lock);
                if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) {
                        /* This bucket table is being freed, don't re-link it. */
                        iter->walker.tbl = NULL;
                } else {
                        list_add(&iter->walker.list, &tbl->walkers);
                }
                spin_unlock(&ht->lock);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_walk_stop);

/*
 * Pick the initial number of buckets: 4/3 of the element hint rounded up
 * to a power of two, or HASH_DEFAULT_SIZE when no hint was given, and in
 * either case at least params->min_size.
 */
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
        unsigned long lower_bound = (unsigned long)params->min_size;

        if (!params->nelem_hint)
                return max(HASH_DEFAULT_SIZE, lower_bound);

        return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
                   lower_bound);
}

/*
 * Adapter that gives jhash2() the (const void *, u32, u32) signature
 * stored in ht->p.hashfn.  rhashtable_init() selects it only after
 * dividing the key length by sizeof(u32).
 */
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
        const u32 *words = key;

        return jhash2(words, length, seed);
}

/**
 * rhashtable_init - initialize a new hash table
 * @ht:                hash table to be initialized
 * @params:        configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *        int                        key;
 *        void *                        my_member;
 *        struct rhash_head        node;
 * };
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .key_offset = offsetof(struct test_obj, key),
 *        .key_len = sizeof(int),
 *        .hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *        [...]
 *        struct rhash_head        node;
 * };
 *
 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 * {
 *        struct test_obj *obj = data;
 *
 *        return [... hash ...];
 * }
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .hashfn = jhash,
 *        .obj_hashfn = my_hash_fn,
 * };
 *
 * Note that the parameter check below also requires .obj_cmpfn to be set
 * whenever .obj_hashfn is set; Example 2 omits it for brevity and would
 * be rejected as written.
 *
 * Returns -EINVAL if neither key_len nor obj_hashfn is set, or if
 * obj_hashfn is set without obj_cmpfn.  Returns zero otherwise.
 */
int rhashtable_init(struct rhashtable *ht,
                    const struct rhashtable_params *params)
{
        struct bucket_table *tbl;
        size_t size;

        if ((!params->key_len && !params->obj_hashfn) ||
            (params->obj_hashfn && !params->obj_cmpfn))
                return -EINVAL;

        memset(ht, 0, sizeof(*ht));
        mutex_init(&ht->mutex);
        spin_lock_init(&ht->lock);
        /* Work on a private copy of the parameters from here on. */
        memcpy(&ht->p, params, sizeof(*params));

        if (params->min_size)
                ht->p.min_size = roundup_pow_of_two(params->min_size);

        /* Cap total entries at 2^31 to avoid nelems overflow. */
        ht->max_elems = 1u << 31;

        if (params->max_size) {
                ht->p.max_size = rounddown_pow_of_two(params->max_size);
                /* With a size limit, allow at most two elements per bucket. */
                if (ht->p.max_size < ht->max_elems / 2)
                        ht->max_elems = ht->p.max_size * 2;
        }

        ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);

        size = rounded_hashtable_size(&ht->p);

        ht->key_len = ht->p.key_len;
        if (!params->hashfn) {
                /*
                 * No hash function supplied: default to jhash, or to
                 * jhash2 (with the key length counted in u32 words) when
                 * the key length is a multiple of sizeof(u32).
                 */
                ht->p.hashfn = jhash;

                if (!(ht->key_len & (sizeof(u32) - 1))) {
                        ht->key_len /= sizeof(u32);
                        ht->p.hashfn = rhashtable_jhash2;
                }
        }

        /*
         * This is api initialization and thus we need to guarantee the
         * initial rhashtable allocation. Upon failure, retry with the
         * smallest possible size with __GFP_NOFAIL semantics.
         */
        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (unlikely(tbl == NULL)) {
                size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
                tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
        }

        atomic_set(&ht->nelems, 0);

        RCU_INIT_POINTER(ht->tbl, tbl);

        INIT_WORK(&ht->run_work, rht_deferred_worker);

        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init);

/**
 * rhltable_init - initialize a new hash list table
 * @hlt:        hash list table to be initialized
 * @params:        configuration parameters
 *
 * Initializes the embedded hash table with rhashtable_init and marks it
 * as a hash list table.
 *
 * See documentation for rhashtable_init.
 *
 * Returns the result of rhashtable_init.
 */
int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
        int ret = rhashtable_init(&hlt->ht, params);

        hlt->ht.rhlist = true;

        return ret;
}
EXPORT_SYMBOL_GPL(rhltable_init);

/*
 * Invoke @free_fn on the object behind one chain entry.  For a hash list
 * table the entry heads a list of objects, and @free_fn is invoked on
 * every member of that list.
 */
static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
                                void (*free_fn)(void *ptr, void *arg),
                                void *arg)
{
        struct rhlist_head *node;

        if (!ht->rhlist) {
                free_fn(rht_obj(ht, obj), arg);
                return;
        }

        node = container_of(obj, struct rhlist_head, rhead);
        do {
                /* Read the successor first; free_fn may release the node. */
                struct rhlist_head *following = rht_dereference(node->next, ht);

                free_fn(rht_obj(ht, &node->rhead), arg);
                node = following;
        } while (node);
}

/**
 * rhashtable_free_and_destroy - free elements and destroy hash table
 * @ht:                the hash table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * Stops an eventual async resize. If defined, invokes free_fn for each
 * element to release resources. Please note that RCU protected
 * readers may still be accessing the elements. Releasing of resources
 * must occur in a compatible manner. Then frees the bucket array.
 *
 * This function will eventually sleep to wait for an async resize
 * to complete. The caller is responsible that no further write operations
 * occurs in parallel.
 */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg)
{
        struct bucket_table *tbl, *next_tbl;
        unsigned int i;

        cancel_work_sync(&ht->run_work);

        mutex_lock(&ht->mutex);
        tbl = rht_dereference(ht->tbl, ht);
restart:
        if (free_fn) {
                for (i = 0; i < tbl->size; i++) {
                        struct rhash_head *pos, *next;

                        cond_resched();
                        /*
                         * 'next' is read before the entry is handed to
                         * rhashtable_free_one(), so the walk never touches
                         * an entry after free_fn has seen it.
                         */
                        for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL;
                             !rht_is_a_nulls(pos);
                             pos = next,
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL)
                                rhashtable_free_one(ht, pos, free_fn, arg);
                }
        }

        /* Free this table, then repeat for any future table chained to it. */
        next_tbl = rht_dereference(tbl->future_tbl, ht);
        bucket_table_free(tbl);
        if (next_tbl) {
                tbl = next_tbl;
                goto restart;
        }
        mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);

/**
 * rhashtable_destroy - destroy hash table without a per-element callback
 * @ht:                the hash table to destroy
 *
 * Calls rhashtable_free_and_destroy with no free function, so the bucket
 * tables are released but no callback is invoked for the elements.
 */
void rhashtable_destroy(struct rhashtable *ht)
{
        rhashtable_free_and_destroy(ht, NULL, NULL);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

/*
 * Look up the bucket for @hash in a nested bucket table.
 *
 * The low tbl->nest bits of @hash index the top-level table.  The
 * remaining bits are consumed 'shift' bits at a time for each further
 * level, where 'shift' is PAGE_SHIFT minus log2 of the pointer size.
 *
 * Returns a pointer to the bucket, or NULL if a table on the path to it
 * is not present.
 */
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        unsigned int index = hash & ((1 << tbl->nest) - 1);
        unsigned int size = tbl->size >> tbl->nest;
        unsigned int subhash = hash;
        union nested_table *ntbl;

        ntbl = nested_table_top(tbl);
        ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
        subhash >>= tbl->nest;

        /* Descend while more than one level's worth of buckets remains. */
        while (ntbl && size > (1 << shift)) {
                index = subhash & ((1 << shift) - 1);
                ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
                                                  tbl, hash);
                size >>= shift;
                subhash >>= shift;
        }

        if (!ntbl)
                return NULL;

        return &ntbl[subhash].bucket;

}
EXPORT_SYMBOL_GPL(__rht_bucket_nested);

/*
 * Like __rht_bucket_nested(), but never returns NULL: when the bucket
 * does not exist, a pointer to a function-local static bucket is
 * returned instead.  That bucket is initialised with
 * INIT_RHT_NULLS_HEAD() on first use.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        static struct rhash_lock_head __rcu *rhnull;
        struct rhash_lock_head __rcu **bkt;

        if (!rhnull)
                INIT_RHT_NULLS_HEAD(rhnull);

        bkt = __rht_bucket_nested(tbl, hash);
        if (bkt)
                return bkt;

        return &rhnull;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested);

/*
 * Find the bucket for @hash in a nested bucket table for an insertion,
 * calling nested_table_alloc() for every table on the path to it.
 *
 * The hash is split across the levels the same way as in
 * __rht_bucket_nested().
 *
 * Returns a pointer to the bucket, or NULL if nested_table_alloc()
 * returned NULL for any level.
 */
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        unsigned int index = hash & ((1 << tbl->nest) - 1);
        unsigned int size = tbl->size >> tbl->nest;
        union nested_table *ntbl;

        ntbl = nested_table_top(tbl);
        hash >>= tbl->nest;
        /*
         * NOTE(review): the last argument is true once the remaining size
         * fits a single level, presumably marking a leaf table - confirm
         * against nested_table_alloc().
         */
        ntbl = nested_table_alloc(ht, &ntbl[index].table,
                                  size <= (1 << shift));

        while (ntbl && size > (1 << shift)) {
                index = hash & ((1 << shift) - 1);
                size >>= shift;
                hash >>= shift;
                ntbl = nested_table_alloc(ht, &ntbl[index].table,
                                          size <= (1 << shift));
        }

        if (!ntbl)
                return NULL;

        return &ntbl[hash].bucket;

}
EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#ifndef _LRU_LIST_H
#define _LRU_LIST_H

#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/shrinker.h>

struct mem_cgroup;

/*
 * Result codes for LRU walk callbacks: a list_lru_walk_cb always has to
 * return one of these.
 */
enum lru_status {
        LRU_REMOVED,                /* item removed from list */
        LRU_REMOVED_RETRY,        /* item removed, but lock has been
                                   dropped and reacquired */
        LRU_ROTATE,                /* item referenced, give another pass */
        LRU_SKIP,                /* item cannot be locked, skip */
        LRU_RETRY,                /* item not freeable. May drop the lock
                                   internally, but has to return locked. */
};

/* A single LRU list together with its item count. */
struct list_lru_one {
        struct list_head        list;
        /* may become negative during memcg reparenting */
        long                        nr_items;
};

/* Per-cgroup LRU lists of one node, stored as a flexible array of pointers. */
struct list_lru_memcg {
        /* NOTE(review): presumably used to free this structure via RCU - confirm */
        struct rcu_head                rcu;
        /* array of per cgroup lists, indexed by memcg_cache_id */
        struct list_lru_one        *lru[];
};

/* LRU state of one node; aligned to a cache line on SMP builds. */
struct list_lru_node {
        /* protects all lists on the node, including per cgroup */
        spinlock_t                lock;
        /* global list, used for the root cgroup in cgroup aware lrus */
        struct list_lru_one        lru;
#ifdef CONFIG_MEMCG_KMEM
        /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
        struct list_lru_memcg        __rcu *memcg_lrus;
#endif
        /* NOTE(review): presumably the item total over all lists of this node - confirm */
        long nr_items;
} ____cacheline_aligned_in_smp;

/* Top-level LRU object handed to the list_lru_*() functions. */
struct list_lru {
        /* NOTE(review): presumably an array with one entry per node - confirm */
        struct list_lru_node        *node;
#ifdef CONFIG_MEMCG_KMEM
        /* Fields below exist only when CONFIG_MEMCG_KMEM is enabled. */
        struct list_head        list;
        int                        shrinker_id;
        bool                        memcg_aware;
#endif
};

void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
                    struct lock_class_key *key, struct shrinker *shrinker);

/*
 * Convenience wrappers around __list_lru_init():
 *  - list_lru_init():       not memcg aware, no lock class key, no shrinker
 *  - list_lru_init_key():   not memcg aware, with lock class key @key
 *  - list_lru_init_memcg(): memcg aware, no lock class key, with @shrinker
 *
 * Every macro argument is parenthesized in the expansion.
 */
#define list_lru_init(lru)                                \
        __list_lru_init((lru), false, NULL, NULL)
#define list_lru_init_key(lru, key)                        \
        __list_lru_init((lru), false, (key), NULL)
#define list_lru_init_memcg(lru, shrinker)                \
        __list_lru_init((lru), true, NULL, (shrinker))

int memcg_update_all_list_lrus(int num_memcgs);
void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);

/**
 * list_lru_add: add an element to the lru list's tail
 * @lru: the lru pointer
 * @item: the item to be added.
 *
 * If the element is already part of a list, this function returns doing
 * nothing. Therefore the caller does not need to keep state about whether or
 * not the element already belongs in the list and is allowed to lazy update
 * it. Note however that this is valid for *a* list, not *this* list. If
 * the caller organizes itself in a way that elements can be in more than
 * one type of list, it is up to the caller to fully remove the item from
 * the previous list (with list_lru_del() for instance) before moving it
 * to @lru.
 *
 * Return value: true if the list was updated, false otherwise
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item);

/**
 * list_lru_del: delete an element from the lru list
 * @lru: the lru pointer
 * @item: the item to be deleted.
 *
 * This function works analogously to list_lru_add in terms of list
 * manipulation. The comments about an element already pertaining to
 * a list are also valid for list_lru_del.
 *
 * Return value: true if the list was updated, false otherwise
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item);

/**
 * list_lru_count_one: return the number of objects currently held by @lru
 * @lru: the lru pointer.
 * @nid: the node id to count from.
 * @memcg: the cgroup to count from.
 *
 * Always returns a non-negative number, 0 for empty lists. There is no
 * guarantee that the list is not updated while the count is being computed.
 * Callers that want such a guarantee need to provide an outer lock.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
                                 int nid, struct mem_cgroup *memcg);
/* Per-node count taking only @nid; list_lru_count() sums it over nodes. */
unsigned long list_lru_count_node(struct list_lru *lru, int nid);

/*
 * Count the objects on @lru for the node and memcg selected by @sc
 * (sc->nid and sc->memcg), via list_lru_count_one().
 */
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
                                                  struct shrink_control *sc)
{
        unsigned long nr_objects;

        nr_objects = list_lru_count_one(lru, sc->nid, sc->memcg);
        return nr_objects;
}

/*
 * list_lru_count: total number of objects on @lru.
 *
 * Sums list_lru_count_node() over every node in the N_NORMAL_MEMORY state.
 * The accumulator is unsigned long, matching both the per-node counts and
 * the return type, so no value round-trips through a signed type.
 */
static inline unsigned long list_lru_count(struct list_lru *lru)
{
        unsigned long count = 0;
        int nid;

        for_each_node_state(nid, N_NORMAL_MEMORY)
                count += list_lru_count_node(lru, nid);

        return count;
}

/* NOTE(review): presumably meant to be called from a walk callback - confirm. */
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head);

/*
 * Callback type used by the list_lru_walk*() functions. It is invoked for
 * each scanned @item together with the list it is on, that list's spinlock
 * and the caller-provided @cb_arg, and returns an enum lru_status telling
 * the walker what to do with the item.
 */
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
                struct list_lru_one *list, spinlock_t *lock, void *cb_arg);

/**
 * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
 * @lru: the lru pointer.
 * @nid: the node id to scan from.
 * @memcg: the cgroup to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
 *  the item currently being scanned
 * @cb_arg: opaque type that will be passed to @isolate
 * @nr_to_walk: how many items to scan.
 *
 * This function will scan all elements in a particular list_lru, calling the
 * @isolate callback for each of those items, along with the current list
 * spinlock and a caller-provided opaque. The @isolate callback can choose to
 * drop the lock internally, but *must* return with the lock held. The callback
 * will return an enum lru_status telling the list_lru infrastructure what to
 * do with the object being scanned.
 *
 * Please note that nr_to_walk does not mean how many objects will be freed,
 * just how many objects will be scanned.
 *
 * Return value: the number of objects effectively removed from the LRU.
 */
unsigned long list_lru_walk_one(struct list_lru *lru,
                                int nid, struct mem_cgroup *memcg,
                                list_lru_walk_cb isolate, void *cb_arg,
                                unsigned long *nr_to_walk);
/**
 * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
 * @lru: the lru pointer.
 * @nid: the node id to scan from.
 * @memcg: the cgroup to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
 *  the item currently being scanned
 * @cb_arg: opaque type that will be passed to @isolate
 * @nr_to_walk: how many items to scan.
 *
 * Same as list_lru_walk_one() except that the spinlock is acquired with
 * spin_lock_irq().
 */
unsigned long list_lru_walk_one_irq(struct list_lru *lru,
                                    int nid, struct mem_cgroup *memcg,
                                    list_lru_walk_cb isolate, void *cb_arg,
                                    unsigned long *nr_to_walk);
/* Per-node walk taking only @nid; used by list_lru_walk() for each node. */
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk);

/*
 * Walk the part of @lru selected by @sc (sc->nid, sc->memcg) with
 * list_lru_walk_one(), using sc->nr_to_scan as the scan budget.
 */
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
                     list_lru_walk_cb isolate, void *cb_arg)
{
        unsigned long nr_isolated;

        nr_isolated = list_lru_walk_one(lru, sc->nid, sc->memcg,
                                        isolate, cb_arg, &sc->nr_to_scan);
        return nr_isolated;
}

/*
 * Same as list_lru_shrink_walk(), but goes through list_lru_walk_one_irq()
 * for the node/memcg selected by @sc.
 */
static inline unsigned long
list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc,
                         list_lru_walk_cb isolate, void *cb_arg)
{
        unsigned long nr_isolated;

        nr_isolated = list_lru_walk_one_irq(lru, sc->nid, sc->memcg,
                                            isolate, cb_arg, &sc->nr_to_scan);
        return nr_isolated;
}

/*
 * list_lru_walk: walk @lru across all nodes in the N_NORMAL_MEMORY state.
 * @lru: the lru pointer.
 * @isolate: callback invoked for each scanned item.
 * @cb_arg: opaque argument passed to @isolate.
 * @nr_to_walk: scan budget shared by all nodes; list_lru_walk_node() is
 *  handed a pointer to it.
 *
 * Stops as soon as the budget reaches zero.
 *
 * Return value: the sum of the per-node list_lru_walk_node() results.
 */
static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
              void *cb_arg, unsigned long nr_to_walk)
{
        unsigned long isolated = 0;
        int nid;

        for_each_node_state(nid, N_NORMAL_MEMORY) {
                isolated += list_lru_walk_node(lru, nid, isolate,
                                               cb_arg, &nr_to_walk);
                /* nr_to_walk is unsigned, so "exhausted" means exactly 0 */
                if (!nr_to_walk)
                        break;
        }
        return isolated;
}
#endif /* _LRU_LIST_H */




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETLINK_H
#define __LINUX_NETLINK_H


#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <net/scm.h>
#include <uapi/linux/netlink.h>

struct net;

/* Interpret the start of @skb's data area as a netlink message header. */
static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
{
        struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;

        return nlh;
}

/* NOTE(review): presumably bits for netlink_skb_parms.flags - confirm. */
enum netlink_skb_flags {
        NETLINK_SKB_DST                = 0x8,        /* Dst set in sendto or sendmsg */
};

/*
 * Per-skb netlink parameters. The structure is overlaid on the skb control
 * buffer and is reached through NETLINK_CB().
 */
struct netlink_skb_parms {
        struct scm_creds        creds;                /* Skb credentials        */
        __u32                        portid;
        __u32                        dst_group;
        __u32                        flags;
        struct sock                *sk;
        bool                        nsid_is_set;
        int                        nsid;
};

/* NETLINK_CB(): view (skb)->cb as a struct netlink_skb_parms lvalue. */
#define NETLINK_CB(skb)                (*(struct netlink_skb_parms*)&((skb)->cb))
/* NETLINK_CREDS(): pointer to the creds member of NETLINK_CB(skb). */
#define NETLINK_CREDS(skb)        (&NETLINK_CB((skb)).creds)


/* NOTE(review): grab/ungrab look like a paired lock/unlock API - confirm. */
void netlink_table_grab(void);
void netlink_table_ungrab(void);

/* NOTE(review): presumably bits for netlink_kernel_cfg.flags - confirm. */
#define NL_CFG_F_NONROOT_RECV        (1 << 0)
#define NL_CFG_F_NONROOT_SEND        (1 << 1)

/*
 * optional Netlink kernel configuration parameters
 *
 * Handed to netlink_kernel_create() / __netlink_kernel_create().
 */
struct netlink_kernel_cfg {
        unsigned int        groups;
        unsigned int        flags;
        void                (*input)(struct sk_buff *skb);
        struct mutex        *cb_mutex;
        int                (*bind)(struct net *net, int group);
        void                (*unbind)(struct net *net, int group);
        bool                (*compare)(struct net *net, struct sock *sk);
};

struct sock *__netlink_kernel_create(struct net *net, int unit,
                                     struct module *module,
                                     struct netlink_kernel_cfg *cfg);

/*
 * Wrapper around __netlink_kernel_create() that supplies THIS_MODULE as the
 * module argument; being inline, THIS_MODULE is resolved in the includer.
 */
static inline struct sock *
netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg)
{
        struct sock *sk;

        sk = __netlink_kernel_create(net, unit, THIS_MODULE, cfg);
        return sk;
}

/*
 * this can be increased when necessary - don't expose to userland
 *
 * Sizes the cookie[] array of struct netlink_ext_ack.
 */
#define NETLINK_MAX_COOKIE_LEN        20

/**
 * struct netlink_ext_ack - netlink extended ACK report struct
 * @_msg: message string to report - don't access directly, use
 *        %NL_SET_ERR_MSG
 * @bad_attr: attribute with error
 * @policy: policy for a bad attribute
 * @cookie: cookie data to return to userspace (for success)
 * @cookie_len: actual cookie data length
 */
struct netlink_ext_ack {
        const char *_msg;
        const struct nlattr *bad_attr;
        const struct nla_policy *policy;
        u8 cookie[NETLINK_MAX_COOKIE_LEN];
        u8 cookie_len;
};

/* Always use this macro, this allows later putting the
 * message into a separate section or such for things
 * like translation or listing all possible messages.
 * Currently string formatting is not supported (due
 * to the lack of an output buffer.)
 */
#define NL_SET_ERR_MSG(extack, msg) do {                \
        static const char __msg[] = msg;                \
        struct netlink_ext_ack *__extack = (extack);        \
                                                        \
        if (__extack)                                        \
                __extack->_msg = __msg;                        \
} while (0)

#define NL_SET_ERR_MSG_MOD(extack, msg)                        \
        NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg)

/*
 * Record the offending attribute @attr and its policy @pol in @extack.
 *
 * @extack may be NULL, in which case nothing is stored. It is evaluated
 * exactly once, as in NL_SET_ERR_MSG() and NL_SET_ERR_MSG_ATTR_POL(), so
 * an argument with side effects is safe. The local uses its own name so
 * the macro can be invoked from code that has a variable named __extack.
 */
#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do {        \
        struct netlink_ext_ack *__nl_extack = (extack);        \
                                                        \
        if (__nl_extack) {                                \
                __nl_extack->bad_attr = (attr);                \
                __nl_extack->policy = (pol);                \
        }                                                \
} while (0)

/* Record the offending attribute @attr with no policy (policy set to NULL). */
#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL)

/*
 * Set the error message, the offending attribute and its policy in one go.
 * @extack is evaluated once and may be NULL; @msg must be a string literal
 * because it initializes a static const char array.
 */
#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do {        \
        static const char __msg[] = msg;                        \
        struct netlink_ext_ack *__extack = (extack);                \
                                                                \
        if (__extack) {                                                \
                __extack->_msg = __msg;                                \
                __extack->bad_attr = (attr);                        \
                __extack->policy = (pol);                        \
        }                                                        \
} while (0)

/* Same as NL_SET_ERR_MSG_ATTR_POL() with a NULL policy. */
#define NL_SET_ERR_MSG_ATTR(extack, attr, msg)                \
        NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg)

/*
 * Store a 64-bit cookie in @extack: copies the raw bytes of @cookie into
 * extack->cookie and sets cookie_len to its size. No-op if @extack is NULL.
 */
static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
                                            u64 cookie)
{
        if (extack) {
                memcpy(extack->cookie, &cookie, sizeof(cookie));
                extack->cookie_len = sizeof(cookie);
        }
}

/*
 * Store a 32-bit cookie in @extack: copies the raw bytes of @cookie into
 * extack->cookie and sets cookie_len to its size. No-op if @extack is NULL.
 */
static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack,
                                            u32 cookie)
{
        if (extack) {
                memcpy(extack->cookie, &cookie, sizeof(cookie));
                extack->cookie_len = sizeof(cookie);
        }
}

/* Kernel-side netlink socket management and messaging entry points. */
void netlink_kernel_release(struct sock *sk);
int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
int netlink_change_ngroups(struct sock *sk, unsigned int groups);
void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack);
int netlink_has_listeners(struct sock *sk, unsigned int group);
bool netlink_strict_get_check(struct sk_buff *skb);

int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
                      __u32 group, gfp_t allocation);
/* NOTE(review): @filter_data is presumably passed as @filter's data argument - confirm. */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
                               __u32 portid, __u32 group, gfp_t allocation,
                               int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
                               void *filter_data);
int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
int netlink_register_notifier(struct notifier_block *nb);
int netlink_unregister_notifier(struct notifier_block *nb);

/* finegrained unicast helpers: */
struct sock *netlink_getsockbyfilp(struct file *filp);
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk);
void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
int netlink_sendskb(struct sock *sk, struct sk_buff *skb);

/*
 * Clone @skb with skb_clone(). When the original's head is a vmalloc
 * address, the clone also inherits the original's destructor.
 *
 * Returns the clone, or NULL if skb_clone() failed.
 */
static inline struct sk_buff *
netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff *clone = skb_clone(skb, gfp_mask);

        if (clone && is_vmalloc_addr(skb->head)) {
                /* This is a large skb, set destructor callback to release head */
                clone->destructor = skb->destructor;
        }

        return clone;
}

/*
 *        skb should fit one page. This choice is good for headerless malloc.
 *        But we should limit to 8K so that userspace does not have to
 *        use enormous buffer sizes on recvmsg() calls just to avoid
 *        MSG_TRUNC when PAGE_SIZE is very large.
 *
 *        Hence NLMSG_GOODSIZE is derived from PAGE_SIZE when that is below
 *        8192 bytes, and from 8192 otherwise.
 */
#if PAGE_SIZE < 8192UL
#define NLMSG_GOODSIZE        SKB_WITH_OVERHEAD(PAGE_SIZE)
#else
#define NLMSG_GOODSIZE        SKB_WITH_OVERHEAD(8192UL)
#endif

/* NLMSG_GOODSIZE minus the netlink message header length. */
#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)


/*
 * State handed to the dump() and done() callbacks of a netlink dump.
 * ctx[] and the deprecated args[] share the same storage (anonymous union).
 */
struct netlink_callback {
        struct sk_buff                *skb;
        const struct nlmsghdr        *nlh;
        int                        (*dump)(struct sk_buff * skb,
                                        struct netlink_callback *cb);
        int                        (*done)(struct netlink_callback *cb);
        void                        *data;
        /* the module that the dump function belongs to */
        struct module                *module;
        struct netlink_ext_ack        *extack;
        u16                        family;
        u16                        answer_flags;
        u32                        min_dump_alloc;
        unsigned int                prev_seq, seq;
        bool                        strict_check;
        union {
                u8                ctx[48];

                /* args is deprecated. Cast a struct over ctx instead
                 * for proper type safety.
                 */
                long                args[6];
        };
};

/* NOTE(review): presumably the payload of the netlink notifier chain - confirm. */
struct netlink_notify {
        struct net *net;
        u32 portid;
        int protocol;
};

struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags);

/*
 * Parameters for netlink_dump_start(). A NULL @module is replaced with
 * THIS_MODULE by netlink_dump_start() before the dump is started.
 */
struct netlink_dump_control {
        int (*start)(struct netlink_callback *);
        int (*dump)(struct sk_buff *skb, struct netlink_callback *);
        int (*done)(struct netlink_callback *);
        void *data;
        struct module *module;
        u32 min_dump_alloc;
};

int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         struct netlink_dump_control *control);

/*
 * Start a dump through __netlink_dump_start(). If the caller left
 * control->module unset, it is filled in with THIS_MODULE first (note that
 * this writes to the caller's @control).
 */
static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                                     const struct nlmsghdr *nlh,
                                     struct netlink_dump_control *control)
{
        int err;

        if (control->module == NULL)
                control->module = THIS_MODULE;

        err = __netlink_dump_start(ssk, skb, nlh, control);
        return err;
}

/* Descriptor registered/unregistered with netlink_add_tap()/netlink_remove_tap(). */
struct netlink_tap {
        struct net_device *dev;
        struct module *module;
        struct list_head list;
};

int netlink_add_tap(struct netlink_tap *nt);
int netlink_remove_tap(struct netlink_tap *nt);

/* Capability checks for the sender of a netlink skb; @cap is a capability number. */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                          struct user_namespace *ns, int cap);
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *ns, int cap);
bool netlink_capable(const struct sk_buff *skb, int cap);
bool netlink_net_capable(const struct sk_buff *skb, int cap);

#endif        /* __LINUX_NETLINK_H */

































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * An interface between IEEE802.15.4 device and rest of the kernel.
 *
 * Copyright (C) 2007-2012 Siemens AG
 *
 * Written by:
 * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
 * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
 * Maxim Osipov <maxim.osipov@siemens.com>
 * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
 */

#ifndef IEEE802154_NETDEVICE_H
#define IEEE802154_NETDEVICE_H

/*
 * Number of bytes of @struct_type that must be present for @member to be
 * fully contained: the member's offset plus its size.
 */
#define IEEE802154_REQUIRED_SIZE(struct_type, member) \
        (offsetof(typeof(struct_type), member) + \
        sizeof(((typeof(struct_type) *)(NULL))->member))

/* Offset of the addr member inside struct sockaddr_ieee802154. */
#define IEEE802154_ADDR_OFFSET \
        offsetof(typeof(struct sockaddr_ieee802154), addr)

/*
 * Minimum sockaddr lengths, as checked by ieee802154_sockaddr_check_size():
 * up to and including addr_type, short_addr and hwaddr respectively.
 */
#define IEEE802154_MIN_NAMELEN (IEEE802154_ADDR_OFFSET + \
        IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, addr_type))

#define IEEE802154_NAMELEN_SHORT (IEEE802154_ADDR_OFFSET + \
        IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, short_addr))

#define IEEE802154_NAMELEN_LONG (IEEE802154_ADDR_OFFSET + \
        IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, hwaddr))

#include <net/af_ieee802154.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/ieee802154.h>

#include <net/cfg802154.h>

/*
 * 802.15.4 security header. The first byte is a bitfield whose member
 * order is selected by the bitfield endianness of the build; short_src
 * and extended_src share storage.
 */
struct ieee802154_sechdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
        u8 level:3,
           key_id_mode:2,
           reserved:3;
#elif defined(__BIG_ENDIAN_BITFIELD)
        u8 reserved:3,
           key_id_mode:2,
           level:3;
#else
#error        "Please fix <asm/byteorder.h>"
#endif
        u8 key_id;
        __le32 frame_counter;
        union {
                __le32 short_src;
                __le64 extended_src;
        };
};

/*
 * 802.15.4 frame control field as a 16-bit bitfield. Member order is
 * selected by the bitfield endianness of the build; in the big-endian
 * layout the three reserved bits are split into reserved (1 bit) and
 * reserved2 (2 bits).
 */
struct ieee802154_hdr_fc {
#if defined(__LITTLE_ENDIAN_BITFIELD)
        u16 type:3,
            security_enabled:1,
            frame_pending:1,
            ack_request:1,
            intra_pan:1,
            reserved:3,
            dest_addr_mode:2,
            version:2,
            source_addr_mode:2;
#elif defined(__BIG_ENDIAN_BITFIELD)
        u16 reserved:1,
            intra_pan:1,
            ack_request:1,
            frame_pending:1,
            security_enabled:1,
            type:3,
            source_addr_mode:2,
            version:2,
            dest_addr_mode:2,
            reserved2:2;
#else
#error        "Please fix <asm/byteorder.h>"
#endif
};

/* Parsed 802.15.4 MAC header, as filled in or consumed by the helpers below. */
struct ieee802154_hdr {
        struct ieee802154_hdr_fc fc;
        u8 seq;
        struct ieee802154_addr source;
        struct ieee802154_addr dest;
        struct ieee802154_sechdr sec;
};

/* pushes hdr onto the skb. fields of hdr->fc that can be calculated from
 * the contents of hdr will be, and the actual value of those bits in
 * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame
 * version, if SECEN is set.
 */
int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr);

/* pulls the entire 802.15.4 header off of the skb, including the security
 * header, and performs pan id decompression
 */
int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr);

/* parses the frame control, sequence number and address fields in a given skb
 * and stores them into hdr, performing pan id decompression and length checks
 * to be suitable for use in header_ops.parse
 */
int ieee802154_hdr_peek_addrs(const struct sk_buff *skb,
                              struct ieee802154_hdr *hdr);

/* parses the full 802.15.4 header of a given skb and stores it into hdr,
 * performing pan id decompression and length checks to be suitable for use in
 * header_ops.parse
 */
int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr);

int ieee802154_max_payload(const struct ieee802154_hdr *hdr);

/*
 * Map the security level in @sec to a length: 4 for the MIC32 levels,
 * 8 for MIC64, 16 for MIC128, and 0 for every other level.
 */
static inline int
ieee802154_sechdr_authtag_len(const struct ieee802154_sechdr *sec)
{
        int taglen = 0;

        switch (sec->level) {
        case IEEE802154_SCF_SECLEVEL_ENC_MIC32:
        case IEEE802154_SCF_SECLEVEL_MIC32:
                taglen = 4;
                break;
        case IEEE802154_SCF_SECLEVEL_ENC_MIC64:
        case IEEE802154_SCF_SECLEVEL_MIC64:
                taglen = 8;
                break;
        case IEEE802154_SCF_SECLEVEL_ENC_MIC128:
        case IEEE802154_SCF_SECLEVEL_MIC128:
                taglen = 16;
                break;
        default:
                /* covers SECLEVEL_NONE and SECLEVEL_ENC as well */
                break;
        }

        return taglen;
}

static inline int ieee802154_hdr_length(struct sk_buff *skb)
{
        struct ieee802154_hdr hdr;
        int len = ieee802154_hdr_pull(skb, &hdr);

        if (len > 0)
                skb_push(skb, len);

        return len;
}

/*
 * Compare two addresses. They are equal when PAN id and mode match and,
 * for long or short mode, the corresponding address field matches too.
 * For any other mode, matching PAN id and mode is sufficient.
 */
static inline bool ieee802154_addr_equal(const struct ieee802154_addr *a1,
                                         const struct ieee802154_addr *a2)
{
        if (a1->pan_id != a2->pan_id)
                return false;
        if (a1->mode != a2->mode)
                return false;

        switch (a1->mode) {
        case IEEE802154_ADDR_LONG:
                return a1->extended_addr == a2->extended_addr;
        case IEEE802154_ADDR_SHORT:
                return a1->short_addr == a2->short_addr;
        default:
                return true;
        }
}

static inline __le64 ieee802154_devaddr_from_raw(const void *raw)
{
        u64 temp;

        memcpy(&temp, raw, IEEE802154_ADDR_LEN);
        return (__force __le64)swab64(temp);
}

/*
 * Inverse of ieee802154_devaddr_from_raw(): byte-swap @addr with swab64()
 * and copy IEEE802154_ADDR_LEN bytes of the result to @raw.
 */
static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr)
{
        u64 swapped;

        swapped = swab64((__force u64)addr);
        memcpy(raw, &swapped, IEEE802154_ADDR_LEN);
}

/*
 * Check that a socket address of @len bytes is long enough for the address
 * type it declares.
 * @daddr: the socket address to validate.
 * @len: the number of valid bytes at @daddr.
 *
 * @len must first cover everything up to and including addr_type; it must
 * then also cover short_addr or hwaddr for the SHORT and LONG address types.
 *
 * Return value: 0 if the size is sufficient, -EINVAL if @len is negative,
 * too small, or the address type is unknown.
 */
static inline int
ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len)
{
        struct ieee802154_addr_sa *sa;
        int ret = 0;

        sa = &daddr->addr;
        /*
         * The *_NAMELEN limits are built from offsetof/sizeof and are thus
         * unsigned: a negative len would be converted to a huge unsigned
         * value and slip through the comparisons, so reject it explicitly.
         */
        if (len < 0 || len < IEEE802154_MIN_NAMELEN)
                return -EINVAL;
        switch (sa->addr_type) {
        case IEEE802154_ADDR_NONE:
                break;
        case IEEE802154_ADDR_SHORT:
                if (len < IEEE802154_NAMELEN_SHORT)
                        ret = -EINVAL;
                break;
        case IEEE802154_ADDR_LONG:
                if (len < IEEE802154_NAMELEN_LONG)
                        ret = -EINVAL;
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret;
}

/*
 * Fill @a from the socket-address form @sa. Mode and PAN id are always
 * copied (PAN id via cpu_to_le16()); the short or extended address is
 * filled in only for the SHORT and LONG modes.
 */
static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a,
                                           const struct ieee802154_addr_sa *sa)
{
        a->mode = sa->addr_type;
        a->pan_id = cpu_to_le16(sa->pan_id);

        if (a->mode == IEEE802154_ADDR_SHORT)
                a->short_addr = cpu_to_le16(sa->short_addr);
        else if (a->mode == IEEE802154_ADDR_LONG)
                a->extended_addr = ieee802154_devaddr_from_raw(sa->hwaddr);
}

/*
 * Fill the socket-address form @sa from @a. Address type and PAN id are
 * always written (PAN id via le16_to_cpu()); short_addr or hwaddr is
 * written only for the SHORT and LONG modes.
 */
static inline void ieee802154_addr_to_sa(struct ieee802154_addr_sa *sa,
                                         const struct ieee802154_addr *a)
{
        sa->addr_type = a->mode;
        sa->pan_id = le16_to_cpu(a->pan_id);

        if (a->mode == IEEE802154_ADDR_SHORT)
                sa->short_addr = le16_to_cpu(a->short_addr);
        else if (a->mode == IEEE802154_ADDR_LONG)
                ieee802154_devaddr_to_raw(sa->hwaddr, a->extended_addr);
}

/*
 * A control block of skb passed between the ARPHRD_IEEE802154 device
 * and other stack parts.
 *
 * It lives in skb->cb and is accessed through mac_cb() / mac_cb_init().
 */
struct ieee802154_mac_cb {
        u8 lqi;
        u8 type;
        bool ackreq;
        bool secen;
        bool secen_override;
        u8 seclevel;
        bool seclevel_override;
        struct ieee802154_addr source;
        struct ieee802154_addr dest;
};

/* View @skb's control buffer (skb->cb) as a struct ieee802154_mac_cb. */
static inline struct ieee802154_mac_cb *mac_cb(struct sk_buff *skb)
{
        struct ieee802154_mac_cb *cb = (struct ieee802154_mac_cb *)skb->cb;

        return cb;
}

/*
 * Zero the ieee802154_mac_cb area of @skb's control buffer and return a
 * pointer to it. The build fails if the structure would not fit in skb->cb.
 */
static inline struct ieee802154_mac_cb *mac_cb_init(struct sk_buff *skb)
{
        struct ieee802154_mac_cb *cb = mac_cb(skb);

        BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb));

        memset(cb, 0, sizeof(*cb));
        return cb;
}

/* Link-layer security device key modes; the last entry is a count sentinel. */
enum {
        IEEE802154_LLSEC_DEVKEY_IGNORE,
        IEEE802154_LLSEC_DEVKEY_RESTRICT,
        IEEE802154_LLSEC_DEVKEY_RECORD,

        __IEEE802154_LLSEC_DEVKEY_MAX,
};

/* NOTE(review): presumably values for the type argument of scan_req() - confirm. */
#define IEEE802154_MAC_SCAN_ED                0
#define IEEE802154_MAC_SCAN_ACTIVE        1
#define IEEE802154_MAC_SCAN_PASSIVE        2
#define IEEE802154_MAC_SCAN_ORPHAN        3

/* MAC parameters exchanged through the set_mac_params()/get_mac_params() ops. */
struct ieee802154_mac_params {
        s8 transmit_power;
        u8 min_be;
        u8 max_be;
        u8 csma_retries;
        s8 frame_retries;

        bool lbt;
        struct wpan_phy_cca cca;
        s32 cca_ed_level;
};

struct wpan_phy;

/*
 * One bit per link-layer security parameter.
 * NOTE(review): presumably the bits of the "changed" argument of
 * ieee802154_llsec_ops.set_params() - confirm.
 */
enum {
        IEEE802154_LLSEC_PARAM_ENABLED                = BIT(0),
        IEEE802154_LLSEC_PARAM_FRAME_COUNTER        = BIT(1),
        IEEE802154_LLSEC_PARAM_OUT_LEVEL        = BIT(2),
        IEEE802154_LLSEC_PARAM_OUT_KEY                = BIT(3),
        IEEE802154_LLSEC_PARAM_KEY_SOURCE        = BIT(4),
        IEEE802154_LLSEC_PARAM_PAN_ID                = BIT(5),
        IEEE802154_LLSEC_PARAM_HWADDR                = BIT(6),
        IEEE802154_LLSEC_PARAM_COORD_HWADDR        = BIT(7),
        IEEE802154_LLSEC_PARAM_COORD_SHORTADDR        = BIT(8),
};

/*
 * Link-layer security operations of a device, referenced from the llsec
 * member of struct ieee802154_mlme_ops. The callbacks come in groups:
 * parameters (get/set), keys, devices, device keys, security levels, and
 * table access (lock_table / get_table / unlock_table).
 */
struct ieee802154_llsec_ops {
        int (*get_params)(struct net_device *dev,
                          struct ieee802154_llsec_params *params);
        int (*set_params)(struct net_device *dev,
                          const struct ieee802154_llsec_params *params,
                          int changed);

        int (*add_key)(struct net_device *dev,
                       const struct ieee802154_llsec_key_id *id,
                       const struct ieee802154_llsec_key *key);
        int (*del_key)(struct net_device *dev,
                       const struct ieee802154_llsec_key_id *id);

        int (*add_dev)(struct net_device *dev,
                       const struct ieee802154_llsec_device *llsec_dev);
        int (*del_dev)(struct net_device *dev, __le64 dev_addr);

        int (*add_devkey)(struct net_device *dev,
                          __le64 device_addr,
                          const struct ieee802154_llsec_device_key *key);
        int (*del_devkey)(struct net_device *dev,
                          __le64 device_addr,
                          const struct ieee802154_llsec_device_key *key);

        int (*add_seclevel)(struct net_device *dev,
                            const struct ieee802154_llsec_seclevel *sl);
        int (*del_seclevel)(struct net_device *dev,
                            const struct ieee802154_llsec_seclevel *sl);

        void (*lock_table)(struct net_device *dev);
        void (*get_table)(struct net_device *dev,
                          struct ieee802154_llsec_table **t);
        void (*unlock_table)(struct net_device *dev);
};
/*
 * This should be located at net_device->ml_priv
 *
 * get_phy should increment the reference counting on returned phy.
 * Use wpan_phy_put to put that reference.
 *
 * NOTE(review): this structure has no get_phy member; the two lines above
 * look like a leftover from an earlier version - confirm and drop if stale.
 */
struct ieee802154_mlme_ops {
        /* The following fields are optional (can be NULL). */

        int (*assoc_req)(struct net_device *dev,
                        struct ieee802154_addr *addr,
                        u8 channel, u8 page, u8 cap);
        int (*assoc_resp)(struct net_device *dev,
                        struct ieee802154_addr *addr,
                        __le16 short_addr, u8 status);
        int (*disassoc_req)(struct net_device *dev,
                        struct ieee802154_addr *addr,
                        u8 reason);
        int (*start_req)(struct net_device *dev,
                        struct ieee802154_addr *addr,
                        u8 channel, u8 page, u8 bcn_ord, u8 sf_ord,
                        u8 pan_coord, u8 blx, u8 coord_realign);
        int (*scan_req)(struct net_device *dev,
                        u8 type, u32 channels, u8 page, u8 duration);

        int (*set_mac_params)(struct net_device *dev,
                              const struct ieee802154_mac_params *params);
        void (*get_mac_params)(struct net_device *dev,
                               struct ieee802154_mac_params *params);

        /* link-layer security operations */
        const struct ieee802154_llsec_ops *llsec;
};

/* Return the MLME operations stored in @dev's ml_priv pointer. */
static inline struct ieee802154_mlme_ops *
ieee802154_mlme_ops(const struct net_device *dev)
{
        struct ieee802154_mlme_ops *ops = dev->ml_priv;

        return ops;
}

#endif























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

#include <linux/sched/coredump.h>
#include <linux/mm_types.h>

#include <linux/fs.h> /* only for vma_is_dax() */

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
#else
/*
 * Fallback used when CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is not set:
 * intentionally does nothing, so callers need no #ifdef of their own.
 */
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
}
#endif

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmd,
                                   unsigned int flags);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                           pmd_t *pmd, unsigned long addr, unsigned long next);
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr);
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
                 unsigned long addr);
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
                    pgprot_t newprot, unsigned long cp_flags);
vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
                                   pgprot_t pgprot, bool write);

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn using the page protection of the faulting vma
 * (vmf->vma->vm_page_prot).  Call vmf_insert_pfn_pmd_prot() directly to
 * supply a different protection.  See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
                                            bool write)
{
        return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
}
vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
                                   pgprot_t pgprot, bool write);

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn using the page protection of the faulting vma
 * (vmf->vma->vm_page_prot).  Call vmf_insert_pfn_pud_prot() directly to
 * supply a different protection.  See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
                                            bool write)
{
        return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
}

/*
 * Bit numbers within transparent_hugepage_flags.  Each enumerator is a bit
 * position, not a mask: test it as
 * (transparent_hugepage_flags & (1 << FLAG)).
 */
enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_NEVER_DAX,
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
#ifdef CONFIG_DEBUG_VM
        /* Only exists in CONFIG_DEBUG_VM builds. */
        TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
#endif
};

struct kobject;
struct kobj_attribute;

ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count,
                                   enum transparent_hugepage_flag flag);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag);
extern struct kobj_attribute shmem_enabled_attr;

/* Page order of a PMD-sized huge page, and how many base pages it spans. */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT - PAGE_SHIFT)
#define HPAGE_PMD_NR (1 << HPAGE_PMD_ORDER)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Shift, byte size and address mask of a PMD-mapped huge page. */
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE        (1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK        (~(HPAGE_PMD_SIZE - 1))

/* Shift, byte size and address mask of a PUD-mapped huge page. */
#define HPAGE_PUD_SHIFT PUD_SHIFT
#define HPAGE_PUD_SIZE        (1UL << HPAGE_PUD_SHIFT)
#define HPAGE_PUD_MASK        (~(HPAGE_PUD_SIZE - 1))

extern unsigned long transparent_hugepage_flags;

/*
 * Check whether the huge-page-sized range starting at @haddr can be mapped
 * by @vma: for non-anonymous vmas the start page number minus vm_pgoff must
 * be HPAGE_PMD_NR aligned, and the range [haddr, haddr + HPAGE_PMD_SIZE)
 * must lie entirely inside the vma.
 */
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
                unsigned long haddr)
{
        /* The pgoff alignment check is not needed for anonymous vmas. */
        if (!vma_is_anonymous(vma) &&
            !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                        HPAGE_PMD_NR))
                return false;

        return haddr >= vma->vm_start &&
               haddr + HPAGE_PMD_SIZE <= vma->vm_end;
}

/*
 * THP is allowed for @vma unless @vm_flags carries VM_NOHUGEPAGE
 * (explicitly disabled through madvise) or the owning mm has
 * MMF_DISABLE_THP set.
 */
static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
                                          unsigned long vm_flags)
{
        return !((vm_flags & VM_NOHUGEPAGE) ||
                 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags));
}

/*
 * Decide whether THP may be used for @vma.  Only meant for vmas that are
 * already known to support THP; use transparent_hugepage_active()
 * otherwise.
 */
static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
{
        /* The hardware/firmware marked hugepage support disabled. */
        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
                return false;

        /* Disabled for this vma or its mm, or the vma is a temporary stack. */
        if (!transhuge_vma_enabled(vma, vma->vm_flags) ||
            vma_is_temporary_stack(vma))
                return false;

        /* The global enable bit is set, or the vma is a DAX vma. */
        if ((transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) ||
            vma_is_dax(vma))
                return true;

        /* Otherwise only when REQ_MADV is set and the vma has VM_HUGEPAGE. */
        return (transparent_hugepage_flags &
                (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) &&
               (vma->vm_flags & VM_HUGEPAGE);
}

bool transparent_hugepage_active(struct vm_area_struct *vma);

/* Non-zero when the USE_ZERO_PAGE bit is set in transparent_hugepage_flags. */
#define transparent_hugepage_use_zero_page()                                \
        (transparent_hugepage_flags &                                        \
         (1 << TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);

void prep_transhuge_page(struct page *page);
void free_transhuge_page(struct page *page);
bool is_transparent_hugepage(struct page *page);

bool can_split_huge_page(struct page *page, int *pextra_pins);
int split_huge_page_to_list(struct page *page, struct list_head *list);
/*
 * Convenience wrapper: split @page via split_huge_page_to_list() with a
 * NULL list argument and return its result unchanged.
 */
static inline int split_huge_page(struct page *page)
{
        return split_huge_page_to_list(page, NULL);
}
void deferred_split_huge_page(struct page *page);

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct page *page);

/*
 * split_huge_pmd - split the mapping at @__pmd if it is a swap pmd, a
 * transparent huge pmd or a devmap pmd; otherwise do nothing.
 *
 * @__pmd is evaluated exactly once: the pointer is cached in ____pmd and
 * that cached copy is used both for the checks and for the call, so an
 * argument with side effects (e.g. "pmd++") behaves as the caller expects.
 * __split_huge_pmd() is called with freeze == false and page == NULL.
 */
#define split_huge_pmd(__vma, __pmd, __address)                                \
        do {                                                                \
                pmd_t *____pmd = (__pmd);                                \
                if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)        \
                                        || pmd_devmap(*____pmd))        \
                        __split_huge_pmd(__vma, ____pmd, __address,        \
                                                false, NULL);                \
        }  while (0)


void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct page *page);

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address);

/*
 * split_huge_pud - split the mapping at @__pud if it is a transparent huge
 * pud or a devmap pud; otherwise do nothing.
 *
 * @__pud is evaluated exactly once: the pointer is cached in ____pud and
 * that cached copy is used both for the checks and for the call.
 */
#define split_huge_pud(__vma, __pud, __address)                                \
        do {                                                                \
                pud_t *____pud = (__pud);                                \
                if (pud_trans_huge(*____pud)                                \
                                        || pud_devmap(*____pud))        \
                        __split_huge_pud(__vma, ____pud, __address);        \
        }  while (0)

int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
                     int advice);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);

/* A pmd counts as a swap pmd when it is neither none nor present. */
static inline int is_swap_pmd(pmd_t pmd)
{
        return !(pmd_none(pmd) || pmd_present(pmd));
}

/*
 * Return the result of __pmd_trans_huge_lock() when *@pmd is a swap,
 * transparent huge or devmap pmd, and NULL for any other pmd.
 *
 * mmap_lock must be held on entry.
 */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))
                return NULL;

        return __pmd_trans_huge_lock(pmd, vma);
}
/*
 * Return the result of __pud_trans_huge_lock() when *@pud is a transparent
 * huge or devmap pud, and NULL for any other pud.
 */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        if (!pud_trans_huge(*pud) && !pud_devmap(*pud))
                return NULL;

        return __pud_trans_huge_lock(pud, vma);
}

/**
 * thp_head - Head page of a transparent huge page.
 * @page: Any page (tail, head or regular) found in the page cache.
 *
 * Return: The result of compound_head() for @page.
 */
static inline struct page *thp_head(struct page *page)
{
        return compound_head(page);
}

/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Passing a tail page trips VM_BUG_ON_PGFLAGS().
 *
 * Return: HPAGE_PMD_ORDER when @page is a head page, 0 otherwise.
 */
static inline unsigned int thp_order(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return PageHead(page) ? HPAGE_PMD_ORDER : 0;
}

/**
 * thp_nr_pages - The number of regular pages in this huge page.
 * @page: The head page of a huge page.
 *
 * Passing a tail page trips VM_BUG_ON_PGFLAGS().
 *
 * Return: HPAGE_PMD_NR when @page is a head page, 1 otherwise.
 */
static inline int thp_nr_pages(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return PageHead(page) ? HPAGE_PMD_NR : 1;
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, int flags, struct dev_pagemap **pgmap);

vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);

extern struct page *huge_zero_page;
extern unsigned long huge_zero_pfn;

/* True when @page is the page currently stored in huge_zero_page. */
static inline bool is_huge_zero_page(struct page *page)
{
        return page == READ_ONCE(huge_zero_page);
}

/*
 * True when @pmd's pfn equals huge_zero_pfn and the pmd is present.  The
 * pfn comparison is made first; pmd_present() is consulted only on a match.
 */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        if (READ_ONCE(huge_zero_pfn) != pmd_pfn(pmd))
                return false;

        return pmd_present(pmd);
}

/* Always false: this header never treats a pud as a huge zero mapping. */
static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm);
void mm_put_huge_zero_page(struct mm_struct *mm);

#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))

/* True when the kernel is built with CONFIG_ARCH_ENABLE_THP_MIGRATION. */
static inline bool thp_migration_supported(void)
{
        return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

/*
 * Return the deferred list head of the compound page whose head is @page.
 */
static inline struct list_head *page_deferred_list(struct page *page)
{
        /*
         * The global or memcg deferred list head is kept in the second
         * tail page (page[2]).
         * NOTE(review): the original comment indicates an earlier slot is
         * unavailable because it is occupied by compound_head - confirm
         * against the struct page layout.
         */
        return &page[2].deferred_list;
}

#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE the huge page geometry macros expand
 * to a statement expression that invokes BUILD_BUG() and evaluates to 0.
 * Presumably any use the compiler cannot eliminate breaks the build - see
 * BUILD_BUG() for the exact semantics.
 */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })

#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE variants: every page is treated as its own
 * head, of order 0 and made of one page.  Each helper still trips
 * VM_BUG_ON_PGFLAGS() when handed a tail page.
 */
static inline struct page *thp_head(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return page;
}

/* Order is always 0 when THP is compiled out. */
static inline unsigned int thp_order(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return 0;
}

/* Always a single page when THP is compiled out. */
static inline int thp_nr_pages(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return 1;
}

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stubs: every "is THP usable here?" query
 * answers false so that callers need no #ifdef of their own.
 */
static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
{
        return false;
}

static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
{
        return false;
}

static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
                unsigned long haddr)
{
        return false;
}

static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
                                          unsigned long vm_flags)
{
        return false;
}

/* Nothing to prepare when THP is compiled out. */
static inline void prep_transhuge_page(struct page *page) {}

static inline bool is_transparent_hugepage(struct page *page)
{
        return false;
}

/* With THP compiled out the flags word is the constant 0 (no bit set). */
#define transparent_hugepage_flags 0UL

/* No THP-aware get_unmapped_area implementation in this configuration. */
#define thp_get_unmapped_area        NULL

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub.  The body contains BUILD_BUG(), so
 * presumably a call that is not compiled out breaks the build.
 */
static inline bool
can_split_huge_page(struct page *page, int *pextra_pins)
{
        BUILD_BUG();
        return false;
}
/* The split helpers below do nothing and return 0 in this configuration. */
static inline int
split_huge_page_to_list(struct page *page, struct list_head *list)
{
        return 0;
}
static inline int split_huge_page(struct page *page)
{
        return 0;
}
static inline void deferred_split_huge_page(struct page *page) {}
/* Expands to an empty statement; the arguments are not evaluated. */
#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)

static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct page *page) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
                unsigned long address, bool freeze, struct page *page) {}

/*
 * Expands to an empty statement; the arguments are not evaluated.  The
 * second parameter is spelled __pmd here although it stands for the pud.
 */
#define split_huge_pud(__vma, __pmd, __address)        \
        do { } while (0)

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: calls BUG() before returning 0.
 * Presumably callers are expected never to reach this without THP.
 */
static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
{
        BUG();
        return 0;
}
/* No-op when THP is compiled out. */
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
                                         long adjust_next)
{
}
/* Never reports a swap pmd when THP is compiled out. */
static inline int is_swap_pmd(pmd_t pmd)
{
        return 0;
}
/* The two *_trans_huge_lock() stubs return NULL without taking a lock. */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        return NULL;
}
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        return NULL;
}

/* Stub: returns 0 without handling anything. */
static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
                pmd_t orig_pmd)
{
        return 0;
}

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stubs: nothing is ever reported as a huge
 * zero page, pmd or pud.
 */
static inline bool is_huge_zero_page(struct page *page)
{
        return false;
}

static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        return false;
}

static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

/* No-op when THP is compiled out. */
static inline void mm_put_huge_zero_page(struct mm_struct *mm)
{
        return;
}

/* The devmap lookup stubs return NULL and leave @pgmap untouched. */
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
        unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        return NULL;
}

static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
        unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
        return NULL;
}

/* THP migration is never supported when THP itself is compiled out. */
static inline bool thp_migration_supported(void)
{
        return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * The size is PAGE_SIZE shifted left by thp_order(@page).
 *
 * Return: Number of bytes in this page.
 */
static inline unsigned long thp_size(struct page *page)
{
        unsigned int order = thp_order(page);

        return PAGE_SIZE << order;
}

#endif /* _LINUX_HUGE_MM_H */












1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NS_HASH_H__
#define __NET_NS_HASH_H__

#include <net/net_namespace.h>

/* Return the hash_mix value stored in the network namespace @net. */
static inline u32 net_hash_mix(const struct net *net)
{
        const u32 mix = net->hash_mix;

        return mix;
}
#endif












































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
/*
 * linux/include/video/vga.h -- standard VGA chipset interaction
 *
 * Copyright 1999 Jeff Garzik <jgarzik@pobox.com>
 * 
 * Copyright history from vga16fb.c:
 *        Copyright 1999 Ben Pfaff and Petr Vandrovec
 *        Based on VGA info at http://www.osdever.net/FreeVGA/home.htm 
 *        Based on VESA framebuffer (c) 1998 Gerd Knorr
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.  
 *
 */

#ifndef __linux_video_vga_h__
#define __linux_video_vga_h__

#include <linux/types.h>
#include <linux/io.h>
#include <asm/vga.h>
#include <asm/byteorder.h>


/* Some of the code below is taken from SVGAlib.  The original,
   unmodified copyright notice for that code is below. */
/* VGAlib version 1.2 - (c) 1993 Tommy Frandsen                    */
/*                                                                 */
/* This library is free software; you can redistribute it and/or   */
/* modify it without any restrictions. This library is distributed */
/* in the hope that it will be useful, but without any warranty.   */

/* Multi-chipset support Copyright 1993 Harm Hanemaayer */
/* partially copyrighted (C) 1993 by Hartmut Schirmer */

/* VGA data register ports */
#define VGA_CRT_DC          0x3D5        /* CRT Controller Data Register - color emulation */
#define VGA_CRT_DM          0x3B5        /* CRT Controller Data Register - mono emulation */
#define VGA_ATT_R           0x3C1        /* Attribute Controller Data Read Register */
#define VGA_ATT_W           0x3C0        /* Attribute Controller Data Write Register */
#define VGA_GFX_D           0x3CF        /* Graphics Controller Data Register */
#define VGA_SEQ_D           0x3C5        /* Sequencer Data Register */
#define VGA_MIS_R           0x3CC        /* Misc Output Read Register */
#define VGA_MIS_W           0x3C2        /* Misc Output Write Register */
#define VGA_FTC_R        0x3CA        /* Feature Control Read Register */
#define VGA_IS1_RC          0x3DA        /* Input Status Register 1 - color emulation */
#define VGA_IS1_RM          0x3BA        /* Input Status Register 1 - mono emulation */
#define VGA_PEL_D           0x3C9        /* PEL Data Register */
#define VGA_PEL_MSK         0x3C6        /* PEL mask register */

/* EGA-specific registers */
#define EGA_GFX_E0        0x3CC        /* Graphics enable processor 0 */
#define EGA_GFX_E1        0x3CA        /* Graphics enable processor 1 */

/* VGA index register ports */
#define VGA_CRT_IC          0x3D4        /* CRT Controller Index - color emulation */
#define VGA_CRT_IM          0x3B4        /* CRT Controller Index - mono emulation */
#define VGA_ATT_IW          0x3C0        /* Attribute Controller Index & Data Write Register */
#define VGA_GFX_I           0x3CE        /* Graphics Controller Index */
#define VGA_SEQ_I           0x3C4        /* Sequencer Index */
#define VGA_PEL_IW          0x3C8        /* PEL Write Index */
#define VGA_PEL_IR          0x3C7        /* PEL Read Index */

/* standard VGA indexes max counts */
#define VGA_CRT_C           0x19        /* Number of CRT Controller Registers */
#define VGA_ATT_C           0x15        /* Number of Attribute Controller Registers */
#define VGA_GFX_C           0x09        /* Number of Graphics Controller Registers */
#define VGA_SEQ_C           0x05        /* Number of Sequencer Registers */
#define VGA_MIS_C           0x01        /* Number of Misc Output Register */

/* VGA misc register bit masks */
#define VGA_MIS_COLOR                0x01
#define VGA_MIS_ENB_MEM_ACCESS        0x02
#define VGA_MIS_DCLK_28322_720        0x04
#define VGA_MIS_ENB_PLL_LOAD        (0x04 | 0x08)
#define VGA_MIS_SEL_HIGH_PAGE        0x20

/* VGA CRT controller register indices */
#define VGA_CRTC_H_TOTAL        0
#define VGA_CRTC_H_DISP                1
#define VGA_CRTC_H_BLANK_START        2
#define VGA_CRTC_H_BLANK_END        3
#define VGA_CRTC_H_SYNC_START        4
#define VGA_CRTC_H_SYNC_END        5
#define VGA_CRTC_V_TOTAL        6
#define VGA_CRTC_OVERFLOW        7
#define VGA_CRTC_PRESET_ROW        8
#define VGA_CRTC_MAX_SCAN        9
#define VGA_CRTC_CURSOR_START        0x0A
#define VGA_CRTC_CURSOR_END        0x0B
#define VGA_CRTC_START_HI        0x0C
#define VGA_CRTC_START_LO        0x0D
#define VGA_CRTC_CURSOR_HI        0x0E
#define VGA_CRTC_CURSOR_LO        0x0F
#define VGA_CRTC_V_SYNC_START        0x10
#define VGA_CRTC_V_SYNC_END        0x11
#define VGA_CRTC_V_DISP_END        0x12
#define VGA_CRTC_OFFSET                0x13
#define VGA_CRTC_UNDERLINE        0x14
#define VGA_CRTC_V_BLANK_START        0x15
#define VGA_CRTC_V_BLANK_END        0x16
#define VGA_CRTC_MODE                0x17
#define VGA_CRTC_LINE_COMPARE        0x18
#define VGA_CRTC_REGS                VGA_CRT_C

/* VGA CRT controller bit masks */
#define VGA_CR11_LOCK_CR0_CR7        0x80 /* lock writes to CR0 - CR7 */
#define VGA_CR17_H_V_SIGNALS_ENABLED 0x80

/* VGA attribute controller register indices */
#define VGA_ATC_PALETTE0        0x00
#define VGA_ATC_PALETTE1        0x01
#define VGA_ATC_PALETTE2        0x02
#define VGA_ATC_PALETTE3        0x03
#define VGA_ATC_PALETTE4        0x04
#define VGA_ATC_PALETTE5        0x05
#define VGA_ATC_PALETTE6        0x06
#define VGA_ATC_PALETTE7        0x07
#define VGA_ATC_PALETTE8        0x08
#define VGA_ATC_PALETTE9        0x09
#define VGA_ATC_PALETTEA        0x0A
#define VGA_ATC_PALETTEB        0x0B
#define VGA_ATC_PALETTEC        0x0C
#define VGA_ATC_PALETTED        0x0D
#define VGA_ATC_PALETTEE        0x0E
#define VGA_ATC_PALETTEF        0x0F
#define VGA_ATC_MODE                0x10
#define VGA_ATC_OVERSCAN        0x11
#define VGA_ATC_PLANE_ENABLE        0x12
#define VGA_ATC_PEL                0x13
#define VGA_ATC_COLOR_PAGE        0x14

#define VGA_AR_ENABLE_DISPLAY        0x20

/* VGA sequencer register indices */
#define VGA_SEQ_RESET                0x00
#define VGA_SEQ_CLOCK_MODE        0x01
#define VGA_SEQ_PLANE_WRITE        0x02
#define VGA_SEQ_CHARACTER_MAP        0x03
#define VGA_SEQ_MEMORY_MODE        0x04

/* VGA sequencer register bit masks */
#define VGA_SR01_CHAR_CLK_8DOTS        0x01 /* bit 0: character clocks 8 dots wide are generated */
#define VGA_SR01_SCREEN_OFF        0x20 /* bit 5: Screen is off */
#define VGA_SR02_ALL_PLANES        0x0F /* bits 3-0: enable access to all planes */
#define VGA_SR04_EXT_MEM        0x02 /* bit 1: allows complete mem access to 256K */
#define VGA_SR04_SEQ_MODE        0x04 /* bit 2: directs system to use a sequential addressing mode */
#define VGA_SR04_CHN_4M                0x08 /* bit 3: selects modulo 4 addressing for CPU access to display memory */

/* VGA graphics controller register indices */
#define VGA_GFX_SR_VALUE        0x00
#define VGA_GFX_SR_ENABLE        0x01
#define VGA_GFX_COMPARE_VALUE        0x02
#define VGA_GFX_DATA_ROTATE        0x03
#define VGA_GFX_PLANE_READ        0x04
#define VGA_GFX_MODE                0x05
#define VGA_GFX_MISC                0x06
#define VGA_GFX_COMPARE_MASK        0x07
#define VGA_GFX_BIT_MASK        0x08

/* VGA graphics controller bit masks */
#define VGA_GR06_GRAPHICS_MODE        0x01

/*
 * Pack an 8-bit VGA register index (r) and an 8-bit value (v) into one
 * 16-bit quantity: the value goes in the high byte, the index in the low.
 */
#define VGA_OUT16VAL(v, r)       ((r) | ((v) << 8))

/* decide whether we should enable the faster 16-bit VGA register writes */
#ifdef __LITTLE_ENDIAN
#define VGA_OUTW_WRITE
#endif

/* VGA State Save and Restore */
#define VGA_SAVE_FONT0 1  /* save/restore plane 2 fonts          */
#define VGA_SAVE_FONT1 2  /* save/restore plane 3 fonts   */
#define VGA_SAVE_TEXT  4  /* save/restore plane 0/1 fonts */
#define VGA_SAVE_FONTS 7  /* save/restore all fonts          */
#define VGA_SAVE_MODE  8  /* save/restore video mode           */
#define VGA_SAVE_CMAP  16 /* save/restore color map/DAC   */

/*
 * Describes what VGA state to save or restore and where the hardware
 * lives; passed to save_vga() and restore_vga().
 */
struct vgastate {
        void __iomem *vgabase;        /* mmio base, if supported                    */
        unsigned long membase;        /* VGA window base, 0 for default - 0xA000 */
        __u32 memsize;                /* VGA window size, 0 for default 64K           */
        __u32 flags;                /* what state[s] to save (see VGA_SAVE_*)  */
        __u32 depth;                /* current fb depth, not important           */
        __u32 num_attr;                /* number of att registers, 0 for default  */
        __u32 num_crtc;                /* number of crt registers, 0 for default  */
        __u32 num_gfx;                /* number of gfx registers, 0 for default  */
        __u32 num_seq;                /* number of seq registers, 0 for default  */
        void *vidstate;                /* NOTE(review): opaque; presumably the saved-state buffer - confirm */
};        

extern int save_vga(struct vgastate *state);
extern int restore_vga(struct vgastate *state);

/*
 * generic VGA port read/write
 *
 * The vga_io_*() helpers use port I/O; the vga_mm_*() helpers access the
 * same register offsets relative to a memory-mapped register base.
 */
 
/* Read one byte from I/O port @port using inb_p(). */
static inline unsigned char vga_io_r (unsigned short port)
{
        return inb_p(port);
}

/* Write the byte @val to I/O port @port using outb_p(). */
static inline void vga_io_w (unsigned short port, unsigned char val)
{
        outb_p(val, port);
}

/*
 * Write register index @reg and value @val with a single 16-bit outw() to
 * @port; VGA_OUT16VAL() puts @val in the high byte and @reg in the low byte.
 */
static inline void vga_io_w_fast (unsigned short port, unsigned char reg,
                                  unsigned char val)
{
        outw(VGA_OUT16VAL (val, reg), port);
}

/* Read one byte from the MMIO address @regbase + @port. */
static inline unsigned char vga_mm_r (void __iomem *regbase, unsigned short port)
{
        return readb (regbase + port);
}

/* Write the byte @val to the MMIO address @regbase + @port. */
static inline void vga_mm_w (void __iomem *regbase, unsigned short port, unsigned char val)
{
        writeb (val, regbase + port);
}

/*
 * MMIO counterpart of vga_io_w_fast(): one 16-bit writew() of
 * VGA_OUT16VAL(@val, @reg) to @regbase + @port.
 */
static inline void vga_mm_w_fast (void __iomem *regbase, unsigned short port,
                                  unsigned char reg, unsigned char val)
{
        writew (VGA_OUT16VAL (val, reg), regbase + port);
}

/*
 * Read one byte from VGA register @port: through MMIO when @regbase is
 * non-NULL, through port I/O otherwise.
 */
static inline unsigned char vga_r (void __iomem *regbase, unsigned short port)
{
        return regbase ? vga_mm_r (regbase, port) : vga_io_r (port);
}

/*
 * Write the byte @val to VGA register @port: through MMIO when @regbase is
 * non-NULL, through port I/O otherwise.
 */
static inline void vga_w (void __iomem *regbase, unsigned short port, unsigned char val)
{
        if (!regbase) {
                vga_io_w (port, val);
                return;
        }

        vga_mm_w (regbase, port, val);
}


/*
 * Combined 16-bit index/value write of @reg and @val to @port: through
 * MMIO when @regbase is non-NULL, through port I/O otherwise.
 */
static inline void vga_w_fast (void __iomem *regbase, unsigned short port,
                               unsigned char reg, unsigned char val)
{
        if (!regbase) {
                vga_io_w_fast (port, reg, val);
                return;
        }

        vga_mm_w_fast (regbase, port, reg, val);
}


/*
 * VGA CRTC register read/write
 *
 * All accessors use the colour-emulation ports VGA_CRT_IC (index) and
 * VGA_CRT_DC (data).  Reads select the register by writing @reg to the
 * index port and then read the data port.  Writes use one 16-bit
 * index+value access when VGA_OUTW_WRITE is defined (little-endian builds),
 * and otherwise an index write followed by a data write.
 */
 
/* Read CRTC register @reg via MMIO or port I/O, depending on @regbase. */
static inline unsigned char vga_rcrt (void __iomem *regbase, unsigned char reg)
{
        vga_w (regbase, VGA_CRT_IC, reg);
        return vga_r (regbase, VGA_CRT_DC);
}

/* Write @val to CRTC register @reg via MMIO or port I/O, depending on @regbase. */
static inline void vga_wcrt (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_w_fast (regbase, VGA_CRT_IC, reg, val);
#else
        vga_w (regbase, VGA_CRT_IC, reg);
        vga_w (regbase, VGA_CRT_DC, val);
#endif /* VGA_OUTW_WRITE */
}

/* Port I/O only: read CRTC register @reg. */
static inline unsigned char vga_io_rcrt (unsigned char reg)
{
        vga_io_w (VGA_CRT_IC, reg);
        return vga_io_r (VGA_CRT_DC);
}

/* Port I/O only: write @val to CRTC register @reg. */
static inline void vga_io_wcrt (unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_io_w_fast (VGA_CRT_IC, reg, val);
#else
        vga_io_w (VGA_CRT_IC, reg);
        vga_io_w (VGA_CRT_DC, val);
#endif /* VGA_OUTW_WRITE */
}

/* MMIO only: read CRTC register @reg relative to @regbase. */
static inline unsigned char vga_mm_rcrt (void __iomem *regbase, unsigned char reg)
{
        vga_mm_w (regbase, VGA_CRT_IC, reg);
        return vga_mm_r (regbase, VGA_CRT_DC);
}

/* MMIO only: write @val to CRTC register @reg relative to @regbase. */
static inline void vga_mm_wcrt (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_mm_w_fast (regbase, VGA_CRT_IC, reg, val);
#else
        vga_mm_w (regbase, VGA_CRT_IC, reg);
        vga_mm_w (regbase, VGA_CRT_DC, val);
#endif /* VGA_OUTW_WRITE */
}


/*
 * VGA sequencer register read/write
 *
 * Index port is VGA_SEQ_I, data port is VGA_SEQ_D.  Reads write @reg to
 * the index port and then read the data port.  Writes use one 16-bit
 * index+value access when VGA_OUTW_WRITE is defined, and otherwise an
 * index write followed by a data write.
 */
 
/* Read sequencer register @reg via MMIO or port I/O, depending on @regbase. */
static inline unsigned char vga_rseq (void __iomem *regbase, unsigned char reg)
{
        vga_w (regbase, VGA_SEQ_I, reg);
        return vga_r (regbase, VGA_SEQ_D);
}

/* Write @val to sequencer register @reg via MMIO or port I/O, depending on @regbase. */
static inline void vga_wseq (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_w_fast (regbase, VGA_SEQ_I, reg, val);
#else
        vga_w (regbase, VGA_SEQ_I, reg);
        vga_w (regbase, VGA_SEQ_D, val);
#endif /* VGA_OUTW_WRITE */
}

/* Port I/O only: read sequencer register @reg. */
static inline unsigned char vga_io_rseq (unsigned char reg)
{
        vga_io_w (VGA_SEQ_I, reg);
        return vga_io_r (VGA_SEQ_D);
}

/* Port I/O only: write @val to sequencer register @reg. */
static inline void vga_io_wseq (unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_io_w_fast (VGA_SEQ_I, reg, val);
#else
        vga_io_w (VGA_SEQ_I, reg);
        vga_io_w (VGA_SEQ_D, val);
#endif /* VGA_OUTW_WRITE */
}

/* MMIO only: read sequencer register @reg relative to @regbase. */
static inline unsigned char vga_mm_rseq (void __iomem *regbase, unsigned char reg)
{
        vga_mm_w (regbase, VGA_SEQ_I, reg);
        return vga_mm_r (regbase, VGA_SEQ_D);
}

/* MMIO only: write @val to sequencer register @reg relative to @regbase. */
static inline void vga_mm_wseq (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifdef VGA_OUTW_WRITE
        vga_mm_w_fast (regbase, VGA_SEQ_I, reg, val);
#else
        vga_mm_w (regbase, VGA_SEQ_I, reg);
        vga_mm_w (regbase, VGA_SEQ_D, val);
#endif /* VGA_OUTW_WRITE */
}

/*
 * VGA graphics controller register read/write
 */
 
static inline unsigned char vga_rgfx (void __iomem *regbase, unsigned char reg)
{
        vga_w (regbase, VGA_GFX_I, reg);
        return vga_r (regbase, VGA_GFX_D);
}

/*
 * Write @val to graphics controller register @reg: two vga_w() calls
 * (VGA_GFX_I, then VGA_GFX_D), or one vga_w_fast() call when
 * VGA_OUTW_WRITE is defined.
 */
static inline void vga_wgfx (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifndef VGA_OUTW_WRITE
        vga_w (regbase, VGA_GFX_I, reg);
        vga_w (regbase, VGA_GFX_D, val);
#else
        vga_w_fast (regbase, VGA_GFX_I, reg, val);
#endif /* VGA_OUTW_WRITE */
}

/* Write @reg to VGA_GFX_I via vga_io_w(), then return the byte vga_io_r() reads from VGA_GFX_D. */
static inline unsigned char vga_io_rgfx (unsigned char reg)
{
        unsigned char data;

        vga_io_w (VGA_GFX_I, reg);
        data = vga_io_r (VGA_GFX_D);
        return data;
}

/*
 * Write @val to graphics controller register @reg: two vga_io_w() calls
 * (VGA_GFX_I, then VGA_GFX_D), or one vga_io_w_fast() call when
 * VGA_OUTW_WRITE is defined.
 */
static inline void vga_io_wgfx (unsigned char reg, unsigned char val)
{
#ifndef VGA_OUTW_WRITE
        vga_io_w (VGA_GFX_I, reg);
        vga_io_w (VGA_GFX_D, val);
#else
        vga_io_w_fast (VGA_GFX_I, reg, val);
#endif /* VGA_OUTW_WRITE */
}

/* Write @reg to VGA_GFX_I via vga_mm_w(), then return the byte vga_mm_r() reads from VGA_GFX_D. */
static inline unsigned char vga_mm_rgfx (void __iomem *regbase, unsigned char reg)
{
        unsigned char data;

        vga_mm_w (regbase, VGA_GFX_I, reg);
        data = vga_mm_r (regbase, VGA_GFX_D);
        return data;
}

/*
 * Write @val to graphics controller register @reg: two vga_mm_w() calls
 * (VGA_GFX_I, then VGA_GFX_D), or one vga_mm_w_fast() call when
 * VGA_OUTW_WRITE is defined.
 */
static inline void vga_mm_wgfx (void __iomem *regbase, unsigned char reg, unsigned char val)
{
#ifndef VGA_OUTW_WRITE
        vga_mm_w (regbase, VGA_GFX_I, reg);
        vga_mm_w (regbase, VGA_GFX_D, val);
#else
        vga_mm_w_fast (regbase, VGA_GFX_I, reg, val);
#endif /* VGA_OUTW_WRITE */
}


/*
 * VGA attribute controller register read/write
 */
 
/* Write @reg to VGA_ATT_IW via vga_w(), then return the byte vga_r() reads from VGA_ATT_R. */
static inline unsigned char vga_rattr (void __iomem *regbase, unsigned char reg)
{
        unsigned char data;

        vga_w (regbase, VGA_ATT_IW, reg);
        data = vga_r (regbase, VGA_ATT_R);
        return data;
}

/*
 * Write @val to attribute controller register @reg: @reg goes to
 * VGA_ATT_IW first, then @val to VGA_ATT_W, both through vga_w().
 */
static inline void vga_wattr (void __iomem *regbase, unsigned char reg, unsigned char val)
{
        /* select the register ... */
        vga_w (regbase, VGA_ATT_IW, reg);
        /* ... then store its new value */
        vga_w (regbase, VGA_ATT_W, val);
}

/* Write @reg to VGA_ATT_IW via vga_io_w(), then return the byte vga_io_r() reads from VGA_ATT_R. */
static inline unsigned char vga_io_rattr (unsigned char reg)
{
        unsigned char data;

        vga_io_w (VGA_ATT_IW, reg);
        data = vga_io_r (VGA_ATT_R);
        return data;
}

/*
 * Write @val to attribute controller register @reg: @reg goes to
 * VGA_ATT_IW first, then @val to VGA_ATT_W, both through vga_io_w().
 */
static inline void vga_io_wattr (unsigned char reg, unsigned char val)
{
        /* select the register ... */
        vga_io_w (VGA_ATT_IW, reg);
        /* ... then store its new value */
        vga_io_w (VGA_ATT_W, val);
}

/* Write @reg to VGA_ATT_IW via vga_mm_w(), then return the byte vga_mm_r() reads from VGA_ATT_R. */
static inline unsigned char vga_mm_rattr (void __iomem *regbase, unsigned char reg)
{
        unsigned char data;

        vga_mm_w (regbase, VGA_ATT_IW, reg);
        data = vga_mm_r (regbase, VGA_ATT_R);
        return data;
}

/*
 * Write @val to attribute controller register @reg: @reg goes to
 * VGA_ATT_IW first, then @val to VGA_ATT_W, both through vga_mm_w().
 */
static inline void vga_mm_wattr (void __iomem *regbase, unsigned char reg, unsigned char val)
{
        /* select the register ... */
        vga_mm_w (regbase, VGA_ATT_IW, reg);
        /* ... then store its new value */
        vga_mm_w (regbase, VGA_ATT_W, val);
}

#endif /* __linux_video_vga_h__ */























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Traceprobe fetch helper inlines
 */

/*
 * Store @val into @buf using the width given by code->size.
 * Widths 1, 2, 4 and 8 store a u8/u16/u32/u64 respectively; any other
 * width stores a whole unsigned long.
 */
static nokprobe_inline void
fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf)
{
        if (code->size == 1) {
                *(u8 *)buf = (u8)val;
        } else if (code->size == 2) {
                *(u16 *)buf = (u16)val;
        } else if (code->size == 4) {
                *(u32 *)buf = (u32)val;
        } else if (code->size == 8) {
                /* TBD: 32bit signed */
                *(u64 *)buf = (u64)val;
        } else {
                *(unsigned long *)buf = val;
        }
}

/*
 * Shift the value stored in @buf left by code->lshift and then right by
 * code->rshift, in place, treating it as an unsigned integer of
 * code->basesize bytes.  Base sizes other than 1, 2, 4 or 8 leave @buf
 * untouched.
 */
static nokprobe_inline void
fetch_apply_bitfield(struct fetch_insn *code, void *buf)
{
        if (code->basesize == 1) {
                *(u8 *)buf <<= code->lshift;
                *(u8 *)buf >>= code->rshift;
        } else if (code->basesize == 2) {
                *(u16 *)buf <<= code->lshift;
                *(u16 *)buf >>= code->rshift;
        } else if (code->basesize == 4) {
                *(u32 *)buf <<= code->lshift;
                *(u32 *)buf >>= code->rshift;
        } else if (code->basesize == 8) {
                *(u64 *)buf <<= code->lshift;
                *(u64 *)buf >>= code->rshift;
        }
}

/*
 * These functions must be defined for each callsite.
 * Return consumed dynamic data size (>= 0), or error (< 0).
 * If dest is NULL, don't store result and return required dynamic data size.
 */
/*
 * Runs one argument's fetch program.  In this file it is called with
 * dest == NULL to size dynamic data and with a real dest to store it.
 */
static int
process_fetch_insn(struct fetch_insn *code, void *rec,
                   void *dest, void *base);
/* String helpers used on the FETCH_OP_ST_STRING paths. */
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base);
/* _user variants, used on the FETCH_OP_ST_USTRING paths. */
static nokprobe_inline int fetch_store_strlen_user(unsigned long addr);
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base);
/* Memory reader used for FETCH_OP_DEREF and FETCH_OP_ST_MEM; non-zero return aborts a dereference. */
static nokprobe_inline int
probe_mem_read(void *dest, void *src, size_t size);
/* Memory reader used for FETCH_OP_UDEREF and FETCH_OP_ST_UMEM. */
static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size);

/*
 * From the 2nd stage, routine is same.
 *
 * Runs the remainder of a fetch program starting at @code on the value
 * @val produced by the first stage:
 *   stage 2 - follow FETCH_OP_DEREF / FETCH_OP_UDEREF instructions,
 *   stage 3 - store the value (or string) to @dest,
 *   stage 4 - optionally apply a FETCH_OP_MOD_BF bitfield modification,
 *   last    - loop for FETCH_OP_LP_ARRAY, code->param times in total.
 *
 * If @dest is NULL nothing is stored: only FETCH_OP_ST_STRING and
 * FETCH_OP_ST_USTRING are accepted and their strlen helpers are called
 * to compute the size that would be needed.
 *
 * Returns the store stage result (for arrays: the sum over all elements,
 * with a negative per-element result counted as 0) when the program ends
 * with FETCH_OP_END, the non-zero result of a failed dereference, or
 * -EILSEQ on an unexpected instruction.
 */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
                           void *dest, void *base)
{
        struct fetch_insn *s3 = NULL;
        int total = 0, ret = 0, i = 0;
        u32 loc = 0;
        unsigned long lval = val;

stage2:
        /* 2nd stage: dereference memory if needed */
        do {
                if (code->op == FETCH_OP_DEREF) {
                        /* remember the pointer location for string arrays */
                        lval = val;
                        ret = probe_mem_read(&val, (void *)val + code->offset,
                                             sizeof(val));
                } else if (code->op == FETCH_OP_UDEREF) {
                        lval = val;
                        ret = probe_mem_read_user(&val,
                                 (void *)val + code->offset, sizeof(val));
                } else
                        break;
                if (ret)
                        return ret;
                code++;
        } while (1);

        /* the array loop restarts from here for fixed-size elements */
        s3 = code;
stage3:
        /* 3rd stage: store value to buffer */
        if (unlikely(!dest)) {
                if (code->op == FETCH_OP_ST_STRING) {
                        ret = fetch_store_strlen(val + code->offset);
                        code++;
                        goto array;
                } else if (code->op == FETCH_OP_ST_USTRING) {
                        /*
                         * Assign rather than accumulate, like the
                         * FETCH_OP_ST_STRING branch: when the array loop
                         * comes back through stage2 without executing a
                         * dereference, ret still holds the previous
                         * element's length and "+=" would count it twice.
                         */
                        ret = fetch_store_strlen_user(val + code->offset);
                        code++;
                        goto array;
                } else
                        return -EILSEQ;
        }

        switch (code->op) {
        case FETCH_OP_ST_RAW:
                fetch_store_raw(val, code, dest);
                break;
        case FETCH_OP_ST_MEM:
                probe_mem_read(dest, (void *)val + code->offset, code->size);
                break;
        case FETCH_OP_ST_UMEM:
                probe_mem_read_user(dest, (void *)val + code->offset, code->size);
                break;
        case FETCH_OP_ST_STRING:
                /* keep the data location so the next array slot can be derived from it */
                loc = *(u32 *)dest;
                ret = fetch_store_string(val + code->offset, dest, base);
                break;
        case FETCH_OP_ST_USTRING:
                loc = *(u32 *)dest;
                ret = fetch_store_string_user(val + code->offset, dest, base);
                break;
        default:
                return -EILSEQ;
        }
        code++;

        /* 4th stage: modify stored value if needed */
        if (code->op == FETCH_OP_MOD_BF) {
                fetch_apply_bitfield(code, dest);
                code++;
        }

array:
        /* the last stage: Loop on array */
        if (code->op == FETCH_OP_LP_ARRAY) {
                /* a failed element contributes nothing to the total */
                if (ret < 0)
                        ret = 0;
                total += ret;
                if (++i < code->param) {
                        code = s3;
                        if (s3->op != FETCH_OP_ST_STRING &&
                            s3->op != FETCH_OP_ST_USTRING) {
                                /* fixed-size element: advance both cursors */
                                dest += s3->size;
                                val += s3->size;
                                goto stage3;
                        }
                        /*
                         * String element: step back one instruction and
                         * redo stage 2 on the next pointer slot.
                         */
                        code--;
                        val = lval + sizeof(char *);
                        if (dest) {
                                dest += sizeof(u32);
                                *(u32 *)dest = update_data_loc(loc, ret);
                        }
                        goto stage2;
                }
                code++;
                ret = total;
        }

        return code->op == FETCH_OP_END ? ret : -EILSEQ;
}

/*
 * Sum up the total data length needed by the dynamic arguments of @tp.
 * Each dynamic argument is sized by running process_fetch_insn() with a
 * NULL destination; only positive results are added.
 */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
        int idx, total = 0;

        for (idx = 0; idx < tp->nr_args; idx++) {
                struct probe_arg *parg = tp->args + idx;
                int len;

                if (likely(!parg->dynamic))
                        continue;

                len = process_fetch_insn(parg->code, regs, NULL, NULL);
                if (len > 0)
                        total += len;
        }

        return total;
}

/*
 * Store the value of each argument of @tp into the record at @data.
 *
 * Fixed-size values land at data + arg->offset.  Dynamic data is appended
 * starting at data + tp->size; for a dynamic argument the slot at
 * arg->offset is pre-set with make_data_loc() before fetching, and reset
 * to a zero-length location if the fetch fails.
 */
static nokprobe_inline void
store_trace_args(void *data, struct trace_probe *tp, void *rec,
                 int header_size, int maxlen)
{
        void *base = data - header_size;
        void *dyndata = data + tp->size;
        int idx;

        for (idx = 0; idx < tp->nr_args; idx++) {
                struct probe_arg *parg = tp->args + idx;
                u32 *dl = data + parg->offset;        /* Data location */
                int ret;

                /* Point the dynamic data area if needed */
                if (unlikely(parg->dynamic))
                        *dl = make_data_loc(maxlen, dyndata - base);

                ret = process_fetch_insn(parg->code, rec, dl, base);
                if (!parg->dynamic)
                        continue;

                if (unlikely(ret < 0)) {
                        *dl = make_data_loc(0, dyndata - base);
                        continue;
                }

                /* consume the space this argument used */
                dyndata += ret;
                maxlen -= ret;
        }
}

/*
 * Print " name=value" for each of the @nr_args arguments into @s.
 * An argument with a non-zero count is printed as "{v0,v1,...}", stepping
 * through the data by the type's size.
 *
 * Returns 0 on success, or -ENOMEM as soon as a type's print callback
 * returns 0.
 */
static inline int
print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
                 u8 *data, void *field)
{
        int i;

        for (i = 0; i < nr_args; i++) {
                struct probe_arg *a = args + i;
                void *p;
                int j;

                trace_seq_printf(s, " %s=", a->name);

                if (likely(!a->count)) {
                        /* scalar argument */
                        if (!a->type->print(s, data + a->offset, field))
                                return -ENOMEM;
                        continue;
                }

                /* array argument */
                trace_seq_putc(s, '{');
                p = data + a->offset;
                for (j = 0; j < a->count; j++) {
                        if (!a->type->print(s, p, field))
                                return -ENOMEM;
                        if (j == a->count - 1)
                                trace_seq_putc(s, '}');
                        else
                                trace_seq_putc(s, ',');
                        p += a->type->size;
                }
        }
        return 0;
}



















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_UDP_TUNNEL_H
#define __NET_UDP_TUNNEL_H

#include <net/ip_tunnels.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ipv6_stubs.h>
#endif

/*
 * Socket configuration handed to udp_sock_create() and, through it, to
 * udp_sock_create4() / udp_sock_create6().
 */
struct udp_port_cfg {
        /* AF_INET or AF_INET6; udp_sock_create() dispatches on this */
        u8                        family;

        /* Used only for kernel-created sockets */
        union {
                struct in_addr                local_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                local_ip6;
#endif
        };

        /* IPv4 or IPv6 peer address, sharing storage like the local one */
        union {
                struct in_addr                peer_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                peer_ip6;
#endif
        };

        /* UDP ports, typed __be16 */
        __be16                        local_udp_port;
        __be16                        peer_udp_port;
        /* NOTE(review): presumably the ifindex to bind the socket to - confirm */
        int                        bind_ifindex;
        /* single-bit option flags */
        unsigned int                use_udp_checksums:1,
                                use_udp6_tx_checksums:1,
                                use_udp6_rx_checksums:1,
                                ipv6_v6only:1;
};

/* Called by udp_sock_create() when cfg->family is AF_INET. */
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);

/*
 * Called by udp_sock_create() when cfg->family is AF_INET6.  Without
 * CONFIG_IPV6 this is a stub that always fails with -EPFNOSUPPORT.
 */
#if IS_ENABLED(CONFIG_IPV6)
int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);
#else
static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                                   struct socket **sockp)
{
        return -EPFNOSUPPORT;
}
#endif

/*
 * Dispatch on cfg->family to udp_sock_create4() (AF_INET) or
 * udp_sock_create6() (AF_INET6) and return its result.  Any other
 * family yields -EPFNOSUPPORT.
 */
static inline int udp_sock_create(struct net *net,
                                  struct udp_port_cfg *cfg,
                                  struct socket **sockp)
{
        switch (cfg->family) {
        case AF_INET:
                return udp_sock_create4(net, cfg, sockp);
        case AF_INET6:
                return udp_sock_create6(net, cfg, sockp);
        default:
                return -EPFNOSUPPORT;
        }
}

/* Callback types for the handler members of struct udp_tunnel_sock_cfg. */
typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
                                             struct sk_buff *skb);
typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
                                                    struct list_head *head,
                                                    struct sk_buff *skb);
typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
                                         int nhoff);

/* Tunnel socket settings passed to setup_udp_tunnel_sock(). */
struct udp_tunnel_sock_cfg {
        void *sk_user_data;     /* user data used by encap_rcv call back */
        /* Used for setting up udp_sock fields, see udp.h for details */
        __u8  encap_type;
        udp_tunnel_encap_rcv_t encap_rcv;
        udp_tunnel_encap_err_lookup_t encap_err_lookup;
        udp_tunnel_encap_destroy_t encap_destroy;
        udp_tunnel_gro_receive_t gro_receive;
        udp_tunnel_gro_complete_t gro_complete;
};

/* Setup the given (UDP) sock to receive UDP encapsulated packets */
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
                           struct udp_tunnel_sock_cfg *sock_cfg);

/* -- List of parsable UDP tunnel types --
 *
 * Adding to this list will result in serious debate.  The main issue is
 * that this list is essentially a list of workarounds for either poorly
 * designed tunnels, or poorly designed device offloads.
 *
 * The parsing supported via these types should really be used for Rx
 * traffic only as the network stack will have already inserted offsets for
 * the location of the headers in the skb.  In addition any ports that are
 * pushed should be kept within the namespace without leaking to other
 * devices such as VFs or other ports on the same device.
 *
 * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the
 * need to use this for Rx checksum offload.  It should not be necessary to
 * call this function to perform Tx offloads on outgoing traffic.
 */
enum udp_parsable_tunnel_type {
        /* each type is a distinct bit */
        UDP_TUNNEL_TYPE_VXLAN          = BIT(0), /* RFC 7348 */
        UDP_TUNNEL_TYPE_GENEVE          = BIT(1), /* draft-ietf-nvo3-geneve */
        UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */
};

/*
 * Description of one UDP tunnel port, as passed to the NIC offload
 * callbacks (.set_port / .unset_port) and filled in by
 * udp_tunnel_nic_get_port().
 */
struct udp_tunnel_info {
        /* NOTE(review): presumably an enum udp_parsable_tunnel_type value - confirm */
        unsigned short type;
        sa_family_t sa_family;
        __be16 port;
        /* NOTE(review): presumably the value stored via udp_tunnel_nic_set_port_priv() - confirm */
        u8 hw_priv;
};

/*
 * Notify network devices of offloadable types.
 * NOTE(review): @type is presumably an enum udp_parsable_tunnel_type
 * value - confirm against the implementation.
 */
void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);

/*
 * Raise NETDEV_UDP_TUNNEL_PUSH_INFO on the netdevice notifier chain for
 * @dev.  Asserts that the RTNL lock is held.
 */
static inline void udp_tunnel_get_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();

        call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
}

/*
 * Raise NETDEV_UDP_TUNNEL_DROP_INFO on the netdevice notifier chain for
 * @dev.  Asserts that the RTNL lock is held.
 */
static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();

        call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
}

/* Transmit the skb using UDP encapsulation (IPv4 source/destination addresses). */
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
                         __be32 src, __be32 dst, __u8 tos, __u8 ttl,
                         __be16 df, __be16 src_port, __be16 dst_port,
                         bool xnet, bool nocheck);

/* Variant taking IPv6 addresses (struct in6_addr) and a flow label. */
int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
                         struct sk_buff *skb,
                         struct net_device *dev, struct in6_addr *saddr,
                         struct in6_addr *daddr,
                         __u8 prio, __u8 ttl, __be32 label,
                         __be16 src_port, __be16 dst_port, bool nocheck);

/* NOTE(review): presumably tears down a socket set up as a UDP tunnel socket - confirm */
void udp_tunnel_sock_release(struct socket *sock);

/* NOTE(review): presumably builds the receive-side metadata dst for @skb - confirm */
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
                                    __be16 flags, __be64 tunnel_id,
                                    int md_size);

#ifdef CONFIG_INET
/*
 * Call iptunnel_handle_offloads() on @skb with SKB_GSO_UDP_TUNNEL_CSUM
 * when @udp_csum is set and SKB_GSO_UDP_TUNNEL otherwise, returning its
 * result.
 */
static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
{
        int type;

        if (udp_csum)
                type = SKB_GSO_UDP_TUNNEL_CSUM;
        else
                type = SKB_GSO_UDP_TUNNEL;

        return iptunnel_handle_offloads(skb, type);
}
#endif

/*
 * Mark the UDP socket behind @sock as encapsulation-enabled.  Does
 * nothing if the flag is already set; otherwise sets it, calls the IPv6
 * stub's udpv6_encap_enable() for PF_INET6 sockets (when IPv6 is
 * enabled), and then calls udp_encap_enable().
 */
static inline void udp_tunnel_encap_enable(struct socket *sock)
{
        struct udp_sock *up = udp_sk(sock->sk);

        if (!up->encap_enabled) {
                up->encap_enabled = 1;
#if IS_ENABLED(CONFIG_IPV6)
                if (sock->sk->sk_family == PF_INET6)
                        ipv6_stub->udpv6_encap_enable();
#endif
                udp_encap_enable();
        }
}

#define UDP_TUNNEL_NIC_MAX_TABLES        4

/* Bit flags for the flags member of struct udp_tunnel_nic_info. */
enum udp_tunnel_nic_info_flags {
        /* Device callbacks may sleep */
        UDP_TUNNEL_NIC_INFO_MAY_SLEEP        = BIT(0),
        /* Device only supports offloads when it's open, all ports
         * will be removed before close and re-added after open.
         */
        UDP_TUNNEL_NIC_INFO_OPEN_ONLY        = BIT(1),
        /* Device supports only IPv4 tunnels */
        UDP_TUNNEL_NIC_INFO_IPV4_ONLY        = BIT(2),
        /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
         * This port must not be counted towards n_entries of any table.
         * Driver will not receive any callback associated with port 4789.
         */
        UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN        = BIT(3),
};

struct udp_tunnel_nic;

#define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES        (U16_MAX / 2)

/*
 * State shared by several netdevs that use one UDP tunnel port table;
 * struct udp_tunnel_nic_info points at it through its shared member.
 */
struct udp_tunnel_nic_shared {
        struct udp_tunnel_nic *udp_tunnel_nic_info;

        /* NOTE(review): presumably a list of struct udp_tunnel_nic_shared_node - confirm */
        struct list_head devices;
};

/*
 * List element pairing a net_device with a list_head.
 * NOTE(review): presumably linked on udp_tunnel_nic_shared.devices - confirm.
 */
struct udp_tunnel_nic_shared_node {
        struct net_device *dev;
        struct list_head list;
};

/**
 * struct udp_tunnel_nic_info - driver UDP tunnel offload information
 * @set_port:        callback for adding a new port
 * @unset_port:        callback for removing a port
 * @sync_table:        callback for syncing the entire port table at once
 * @shared:        reference to device global state (optional)
 * @flags:        device flags from enum udp_tunnel_nic_info_flags
 * @tables:        UDP port tables this device has (array of
 *                %UDP_TUNNEL_NIC_MAX_TABLES entries)
 * @tables.n_entries:                number of entries in this table
 * @tables.tunnel_types:        types of tunnels this table accepts
 *
 * Drivers are expected to provide either @set_port and @unset_port callbacks
 * or the @sync_table callback. Callbacks are invoked with rtnl lock held.
 *
 * Devices which (misguidedly) share the UDP tunnel port table across multiple
 * netdevs should allocate an instance of struct udp_tunnel_nic_shared and
 * point @shared at it.
 * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices
 * sharing a table.
 *
 * Known limitations:
 *  - UDP tunnel port notifications are fundamentally best-effort -
 *    it is likely the driver will both see skbs which use a UDP tunnel port,
 *    while not being a tunneled skb, and tunnel skbs from other ports -
 *    drivers should only use these ports for non-critical RX-side offloads,
 *    e.g. the checksum offload;
 *  - none of the devices care about the socket family at present, so we don't
 *    track it. Please extend this code if you care.
 */
struct udp_tunnel_nic_info {
        /* one-by-one */
        int (*set_port)(struct net_device *dev,
                        unsigned int table, unsigned int entry,
                        struct udp_tunnel_info *ti);
        int (*unset_port)(struct net_device *dev,
                          unsigned int table, unsigned int entry,
                          struct udp_tunnel_info *ti);

        /* all at once */
        int (*sync_table)(struct net_device *dev, unsigned int table);

        struct udp_tunnel_nic_shared *shared;

        unsigned int flags;

        struct udp_tunnel_nic_table_info {
                unsigned int n_entries;
                unsigned int tunnel_types;
        } tables[UDP_TUNNEL_NIC_MAX_TABLES];
};

/* UDP tunnel module dependencies
 *
 * Tunnel drivers are expected to have a hard dependency on the udp_tunnel
 * module. NIC drivers are not, they just attach their
 * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come.
 * Loading a tunnel driver will cause the udp_tunnel module to be loaded
 * and only then will all the required state structures be allocated.
 * Since we want a weak dependency from the drivers and the core to udp_tunnel
 * we call things through the following stubs.
 */
/*
 * Operations table reached through the udp_tunnel_nic_ops pointer; each
 * member is invoked by the inline udp_tunnel_nic_<member>() wrapper of
 * the same name in this header.
 */
struct udp_tunnel_nic_ops {
        void (*get_port)(struct net_device *dev, unsigned int table,
                         unsigned int idx, struct udp_tunnel_info *ti);
        void (*set_port_priv)(struct net_device *dev, unsigned int table,
                              unsigned int idx, u8 priv);
        void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*reset_ntf)(struct net_device *dev);

        size_t (*dump_size)(struct net_device *dev, unsigned int table);
        int (*dump_write)(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb);
};

/*
 * Without CONFIG_INET the ops pointer is a constant NULL, so the NULL
 * checks in the inline wrappers always take the "no ops" path.
 */
#ifdef CONFIG_INET
extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
#else
#define udp_tunnel_nic_ops        ((struct udp_tunnel_nic_ops *)NULL)
#endif

/*
 * Fill @ti with the port stored at (@table, @idx) for @dev.
 *
 * Used from .sync_table; an empty entry is reported as an all-zero @ti.
 * Drivers that need the details of a port at deletion time should use
 * the .set_port / .unset_port callbacks instead.
 */
static inline void
udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
                        unsigned int idx, struct udp_tunnel_info *ti)
{
        /*
         * Always zero @ti first: it is the "empty" answer when no ops are
         * registered, and it avoids uninitialized-variable warnings in
         * !CONFIG_INET builds.
         */
        memset(ti, 0, sizeof(*ti));

        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->get_port(dev, table, idx, ti);
}

/* Forward to the set_port_priv op if an ops table is registered; otherwise do nothing. */
static inline void
udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
                             unsigned int idx, u8 priv)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
}

/* Forward to the add_port op if an ops table is registered; otherwise do nothing. */
static inline void
udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->add_port(dev, ti);
}

/* Forward to the del_port op if an ops table is registered; otherwise do nothing. */
static inline void
udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->del_port(dev, ti);
}

/**
 * udp_tunnel_nic_reset_ntf() - device-originating reset notification
 * @dev: network interface device structure
 *
 * Called by the driver to inform the core that the entire UDP tunnel port
 * state has been lost, usually due to device reset. Core will assume device
 * forgot all the ports and issue .set_port and .sync_table callbacks as
 * necessary.
 *
 * This function must be called with rtnl lock held, and will issue all
 * the callbacks before returning.
 */
static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
{
        /* nothing to notify when no ops table is registered */
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->reset_ntf(dev);
}

/* Return the dump_size op's result for (@dev, @table), or 0 when no ops table is registered. */
static inline size_t
udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
        if (udp_tunnel_nic_ops)
                return udp_tunnel_nic_ops->dump_size(dev, table);

        return 0;
}

/* Return the dump_write op's result for (@dev, @table, @skb), or 0 when no ops table is registered. */
static inline int
udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb)
{
        if (udp_tunnel_nic_ops)
                return udp_tunnel_nic_ops->dump_write(dev, table, skb);

        return 0;
}
#endif






































































































































































































































































   12 










    6 








    1 

    6 


    5 



























   14 








   14 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/signalfd.c
 *
 *  Copyright (C) 2003  Linus Torvalds
 *
 *  Mon Mar 5, 2007: Davide Libenzi <davidel@xmailserver.org>
 *      Changed ->read() to return a siginfo structure instead of signal number.
 *      Fixed locking in ->poll().
 *      Added sighand-detach notification.
 *      Added fd re-use in sys_signalfd() syscall.
 *      Now using anonymous inode source.
 *      Thanks to Oleg Nesterov for useful code review and suggestions.
 *      More comments and suggestions from Arnd Bergmann.
 *  Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
 *      Retrieve multiple signals with one read() call
 *  Sun Jul 15, 2007: Davide Libenzi <davidel@xmailserver.org>
 *      Attach to the sighand only during read() and poll().
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/list.h>
#include <linux/anon_inodes.h>
#include <linux/signalfd.h>
#include <linux/syscalls.h>
#include <linux/proc_fs.h>
#include <linux/compat.h>

void signalfd_cleanup(struct sighand_struct *sighand)
{
        wake_up_pollfree(&sighand->signalfd_wqh);
}

/* Per-file signalfd state, kept in file->private_data. */
struct signalfd_ctx {
        /* mask handed to next_signal() / dequeue_signal() for this fd */
        sigset_t sigmask;
};

/* Free the signalfd context stored in file->private_data; always returns 0. */
static int signalfd_release(struct inode *inode, struct file *file)
{
        struct signalfd_ctx *ctx = file->private_data;

        kfree(ctx);
        return 0;
}

/*
 * Poll handler: register on the current task's signalfd wait queue and
 * report EPOLLIN if next_signal() finds a signal for this fd's mask in
 * either the task-private or the shared pending set.  The pending sets
 * are examined under siglock.
 */
static __poll_t signalfd_poll(struct file *file, poll_table *wait)
{
        struct signalfd_ctx *ctx = file->private_data;
        __poll_t mask = 0;
        int has_signal;

        poll_wait(file, &current->sighand->signalfd_wqh, wait);

        spin_lock_irq(&current->sighand->siglock);
        has_signal = next_signal(&current->pending, &ctx->sigmask) ||
                     next_signal(&current->signal->shared_pending,
                                 &ctx->sigmask);
        spin_unlock_irq(&current->sighand->siglock);

        if (has_signal)
                mask |= EPOLLIN;

        return mask;
}

/*
 * Convert the kernel siginfo @kinfo into a struct signalfd_siginfo and
 * copy it to the user buffer @uinfo.  Modelled on copy_siginfo_to_user()
 * in kernel/signal.c.
 *
 * Returns sizeof(struct signalfd_siginfo) on success, or -EFAULT if the
 * copy to user space fails.
 */
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
                             kernel_siginfo_t const *kinfo)
{
        struct signalfd_siginfo ssi;

        BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);

        /* Members that are not filled in below must read as zero. */
        memset(&ssi, 0, sizeof(ssi));

        /*
         * Keep this translation in sync with siginfo_t whenever that
         * structure changes.
         */
        ssi.ssi_signo = kinfo->si_signo;
        ssi.ssi_errno = kinfo->si_errno;
        ssi.ssi_code  = kinfo->si_code;

        switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) {
        case SIL_KILL:
                ssi.ssi_pid = kinfo->si_pid;
                ssi.ssi_uid = kinfo->si_uid;
                break;

        case SIL_TIMER:
                ssi.ssi_tid = kinfo->si_tid;
                ssi.ssi_overrun = kinfo->si_overrun;
                ssi.ssi_ptr = (long) kinfo->si_ptr;
                ssi.ssi_int = kinfo->si_int;
                break;

        case SIL_POLL:
                ssi.ssi_band = kinfo->si_band;
                ssi.ssi_fd   = kinfo->si_fd;
                break;

        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
                /*
                 * These two layouts are only produced by faults delivered
                 * synchronously to userspace.  If one is injected and ends
                 * up here anyway, handle it like a plain SIL_FAULT (falls
                 * through).
                 */
        case SIL_FAULT:
                ssi.ssi_addr = (long) kinfo->si_addr;
#ifdef __ARCH_SI_TRAPNO
                ssi.ssi_trapno = kinfo->si_trapno;
#endif
                break;

        case SIL_FAULT_MCEERR:
                ssi.ssi_addr = (long) kinfo->si_addr;
#ifdef __ARCH_SI_TRAPNO
                ssi.ssi_trapno = kinfo->si_trapno;
#endif
                ssi.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
                break;

        case SIL_CHLD:
                ssi.ssi_pid    = kinfo->si_pid;
                ssi.ssi_uid    = kinfo->si_uid;
                ssi.ssi_status = kinfo->si_status;
                ssi.ssi_utime  = kinfo->si_utime;
                ssi.ssi_stime  = kinfo->si_stime;
                break;

        case SIL_RT:
                /* Signals queued by sigqueue() are handled here as well. */
                ssi.ssi_pid = kinfo->si_pid;
                ssi.ssi_uid = kinfo->si_uid;
                ssi.ssi_ptr = (long) kinfo->si_ptr;
                ssi.ssi_int = kinfo->si_int;
                break;

        case SIL_SYS:
                ssi.ssi_call_addr = (long) kinfo->si_call_addr;
                ssi.ssi_syscall   = kinfo->si_syscall;
                ssi.ssi_arch      = kinfo->si_arch;
                break;
        }

        if (copy_to_user(uinfo, &ssi, sizeof(struct signalfd_siginfo)))
                return -EFAULT;

        return sizeof(*uinfo);
}

/*
 * Dequeue one signal selected by ctx->sigmask for the current task,
 * optionally sleeping until one is available.
 *
 * @ctx:      signalfd context supplying the mask passed to dequeue_signal()
 * @info:     filled in by dequeue_signal() for the dequeued signal
 * @nonblock: when non-zero, never sleep
 *
 * Returns the non-zero result of dequeue_signal() on success, -EAGAIN when
 * nothing is queued and @nonblock is set, or -ERESTARTSYS when
 * signal_pending() becomes true while waiting.
 */
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info,
                                int nonblock)
{
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);

        /* Fast path: try once under siglock before setting up a wait. */
        spin_lock_irq(&current->sighand->siglock);
        ret = dequeue_signal(current, &ctx->sigmask, info);
        switch (ret) {
        case 0:
                /* Nothing queued: only a blocking caller goes on to wait. */
                if (!nonblock)
                        break;
                ret = -EAGAIN;
                fallthrough;
        default:
                spin_unlock_irq(&current->sighand->siglock);
                return ret;
        }

        /* Slow path: siglock is still held here. */
        add_wait_queue(&current->sighand->signalfd_wqh, &wait);
        for (;;) {
                /*
                 * Mark ourselves sleeping before re-checking the queue, so
                 * a wakeup arriving after the check is not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                ret = dequeue_signal(current, &ctx->sigmask, info);
                if (ret != 0)
                        break;
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                /* Drop siglock across the sleep and retake it to retry. */
                spin_unlock_irq(&current->sighand->siglock);
                schedule();
                spin_lock_irq(&current->sighand->siglock);
        }
        spin_unlock_irq(&current->sighand->siglock);

        remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
        __set_current_state(TASK_RUNNING);

        return ret;
}

/*
 * read() handler for a signalfd.
 *
 * "count" must be large enough for at least one "struct signalfd_siginfo",
 * otherwise -EINVAL is returned.  Up to count / sizeof(struct
 * signalfd_siginfo) signals are dequeued and copied out one record at a
 * time.  Only the first dequeue may sleep (and only without O_NONBLOCK);
 * once a record has been stored the remaining dequeues are non-blocking.
 *
 * Returns the number of bytes stored, which is a multiple of the record
 * size, or the result of the failing step when nothing was stored.
 */
static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct signalfd_siginfo __user *dst =
                (struct signalfd_siginfo __user *) buf;
        struct signalfd_ctx *ctx = file->private_data;
        size_t slots = count / sizeof(struct signalfd_siginfo);
        int nonblock = file->f_flags & O_NONBLOCK;
        ssize_t copied = 0;
        kernel_siginfo_t info;
        ssize_t ret;

        if (slots == 0)
                return -EINVAL;

        for (;;) {
                ret = signalfd_dequeue(ctx, &info, nonblock);
                if (unlikely(ret <= 0))
                        break;

                ret = signalfd_copyinfo(dst, &info);
                if (ret < 0)
                        break;

                dst++;
                copied += ret;
                /* Never sleep again once something has been delivered. */
                nonblock = 1;

                if (--slots == 0)
                        break;
        }

        return copied ? copied : ret;
}

#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo hook: print the descriptor's signal mask as a "sigmask:" line.
 * do_signalfd4() stores the mask inverted, so it is inverted back on a
 * private copy before being rendered.
 */
static void signalfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct signalfd_ctx *ctx = f->private_data;
        sigset_t shown = ctx->sigmask;

        signotset(&shown);
        render_sigset_t(m, "sigmask:\t", &shown);
}
#endif

/*
 * File operations installed on every signalfd.  do_signalfd4() also compares
 * a file's f_op against this table to recognise an existing signalfd.
 */
static const struct file_operations signalfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = signalfd_show_fdinfo,
#endif
        .release        = signalfd_release,
        .poll                = signalfd_poll,
        .read                = signalfd_read,
        .llseek                = noop_llseek,
        /* NOTE(review): semantics of may_pollfree are defined outside this file -- confirm. */
        .may_pollfree        = true,
};

/*
 * Common implementation of signalfd()/signalfd4().
 *
 * @ufd:   -1 to create a new signalfd, otherwise an existing signalfd
 *         whose mask is to be replaced
 * @mask:  kernel copy of the requested signal set; modified in place
 *         (SIGKILL and SIGSTOP are removed, then the set is inverted)
 * @flags: SFD_CLOEXEC and/or SFD_NONBLOCK
 *
 * Returns the file descriptor on success, or -EINVAL (unknown flag, or
 * @ufd is not a signalfd), -EBADF (@ufd is not an open descriptor),
 * -ENOMEM, or the error reported by anon_inode_getfd().
 */
static int do_signalfd4(int ufd, sigset_t *mask, int flags)
{
        struct signalfd_ctx *ctx;

        /* The SFD_* flags are passed straight through as O_* flags. */
        BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
                return -EINVAL;

        sigdelsetmask(mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
        signotset(mask);

        if (ufd != -1) {
                /* Replace the mask of an already existing signalfd. */
                struct fd f = fdget(ufd);

                if (!f.file)
                        return -EBADF;
                if (f.file->f_op != &signalfd_fops) {
                        fdput(f);
                        return -EINVAL;
                }
                ctx = f.file->private_data;

                spin_lock_irq(&current->sighand->siglock);
                ctx->sigmask = *mask;
                spin_unlock_irq(&current->sighand->siglock);

                /* Waiters must re-evaluate against the new mask. */
                wake_up(&current->sighand->signalfd_wqh);
                fdput(f);
                return ufd;
        }

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->sigmask = *mask;

        /*
         * anon_inode_getfd() installs the descriptor, so the context has
         * to be completely initialised before this call.
         */
        ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
                               O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
        if (ufd < 0)
                kfree(ctx);

        return ufd;
}

/*
 * signalfd4 system call entry: validate the mask size, copy the mask in
 * from userspace and hand over to do_signalfd4().
 *
 * Returns -EINVAL if sizemask differs from sizeof(sigset_t), -EFAULT if
 * user_mask cannot be read, otherwise the result of do_signalfd4().
 */
SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                size_t, sizemask, int, flags)
{
        sigset_t mask;

        if (sizemask != sizeof(sigset_t))
                return -EINVAL;
        if (copy_from_user(&mask, user_mask, sizeof(mask)))
                return -EFAULT;
        return do_signalfd4(ufd, &mask, flags);
}

/*
 * signalfd system call entry: same as signalfd4 but without a flags
 * argument; do_signalfd4() is called with flags == 0.
 */
SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
                size_t, sizemask)
{
        sigset_t mask;

        if (sizemask != sizeof(sigset_t))
                return -EINVAL;
        if (copy_from_user(&mask, user_mask, sizeof(mask)))
                return -EFAULT;
        return do_signalfd4(ufd, &mask, 0);
}

#ifdef CONFIG_COMPAT
/*
 * Shared body of the compat signalfd/signalfd4 entry points.
 *
 * Returns -EINVAL if sigsetsize differs from sizeof(compat_sigset_t),
 * -EFAULT if get_compat_sigset() fails to fetch the mask into a native
 * sigset_t, otherwise the result of do_signalfd4().
 */
static long do_compat_signalfd4(int ufd,
                        const compat_sigset_t __user *user_mask,
                        compat_size_t sigsetsize, int flags)
{
        sigset_t mask;

        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;
        if (get_compat_sigset(&mask, user_mask))
                return -EFAULT;
        return do_signalfd4(ufd, &mask, flags);
}

/* Compat signalfd4 entry: forwards all arguments to do_compat_signalfd4(). */
COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
                     const compat_sigset_t __user *, user_mask,
                     compat_size_t, sigsetsize,
                     int, flags)
{
        return do_compat_signalfd4(ufd, user_mask, sigsetsize, flags);
}

/* Compat signalfd entry: like compat signalfd4 with flags fixed to 0. */
COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd,
                     const compat_sigset_t __user *, user_mask,
                     compat_size_t, sigsetsize)
{
        return do_compat_signalfd4(ufd, user_mask, sigsetsize, 0);
}
#endif



















































































































































































































































































































    3 









    3 
    3 


    3 
    3 

    3 
    3 
    3 
    3 




















    3 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/block_validity.c
 *
 * Copyright (C) 2009
 * Theodore Ts'o (tytso@mit.edu)
 *
 * Track which blocks in the filesystem are metadata blocks that
 * should never be used as data blocks by files or directories.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "ext4.h"

/* One contiguous run of system-zone blocks, stored as a node of an rbtree. */
struct ext4_system_zone {
        struct rb_node        node;        /* linkage in ext4_system_blocks.root */
        ext4_fsblk_t        start_blk;        /* first block of the run */
        unsigned int        count;        /* number of blocks in the run */
        u32                ino;        /* inode the run was added for; 0 for group metadata */
};

/* Slab cache from which every struct ext4_system_zone is allocated. */
static struct kmem_cache *ext4_system_zone_cachep;

/*
 * Create the slab cache for system zone entries.
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ext4_init_system_zone(void)
{
        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);

        return ext4_system_zone_cachep ? 0 : -ENOMEM;
}

/*
 * Destroy the system zone slab cache.  ext4_release_system_zone() frees
 * entries from a call_rcu() callback, so wait for all pending callbacks
 * with rcu_barrier() before the cache goes away.
 */
void ext4_exit_system_zone(void)
{
        rcu_barrier();
        kmem_cache_destroy(ext4_system_zone_cachep);
}

/*
 * Returns 1 when entry2 starts exactly where entry1 ends and both belong
 * to the same inode, i.e. the two zones can be folded into one; else 0.
 */
static inline int can_merge(struct ext4_system_zone *entry1,
                     struct ext4_system_zone *entry2)
{
        return entry1->ino == entry2->ino &&
               (entry1->start_blk + entry1->count) == entry2->start_blk;
}

static void release_system_zone(struct ext4_system_blocks *system_blks)
{
        struct ext4_system_zone        *entry, *n;

        rbtree_postorder_for_each_entry_safe(entry, n,
                                &system_blks->root, node)
                kmem_cache_free(ext4_system_zone_cachep, entry);
}

/*
 * Mark a range of blocks as belonging to the "system zone" --- that
 * is, filesystem metadata blocks which should never be used by
 * inodes.
 *
 * @system_blks: tree being built
 * @start_blk:   first block of the range
 * @count:       number of blocks in the range
 * @ino:         inode the range belongs to (callers pass 0 for group metadata)
 *
 * Returns 0 on success, -EFSCORRUPTED if @start_blk lies inside a zone
 * that is already in the tree, or -ENOMEM.
 */
static int add_system_zone(struct ext4_system_blocks *system_blks,
                           ext4_fsblk_t start_blk,
                           unsigned int count, u32 ino)
{
        struct ext4_system_zone *new_entry, *entry;
        struct rb_node **n = &system_blks->root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node = NULL;

        /*
         * Find the insertion point.  Only the position of start_blk is
         * compared against existing zones here.
         */
        while (*n) {
                parent = *n;
                entry = rb_entry(parent, struct ext4_system_zone, node);
                if (start_blk < entry->start_blk)
                        n = &(*n)->rb_left;
                else if (start_blk >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else        /* Unexpected overlap of system zones. */
                        return -EFSCORRUPTED;
        }

        new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
                                     GFP_KERNEL);
        if (!new_entry)
                return -ENOMEM;
        new_entry->start_blk = start_blk;
        new_entry->count = count;
        new_entry->ino = ino;
        new_node = &new_entry->node;

        rb_link_node(new_node, parent, n);
        rb_insert_color(new_node, &system_blks->root);

        /* Can we merge to the left? */
        node = rb_prev(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_system_zone, node);
                if (can_merge(entry, new_entry)) {
                        /* Absorb the predecessor into the new entry. */
                        new_entry->start_blk = entry->start_blk;
                        new_entry->count += entry->count;
                        rb_erase(node, &system_blks->root);
                        kmem_cache_free(ext4_system_zone_cachep, entry);
                }
        }

        /* Can we merge to the right? */
        node = rb_next(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_system_zone, node);
                if (can_merge(new_entry, entry)) {
                        /* Absorb the successor into the new entry. */
                        new_entry->count += entry->count;
                        rb_erase(node, &system_blks->root);
                        kmem_cache_free(ext4_system_zone_cachep, entry);
                }
        }
        return 0;
}

/*
 * Log every system zone of @sbi on a single line as a comma separated
 * list of "first-last" block ranges, walking the tree in order under
 * rcu_read_lock().
 */
static void debug_print_tree(struct ext4_sb_info *sbi)
{
        struct ext4_system_blocks *system_blks;
        struct ext4_system_zone *zone;
        struct rb_node *pos;
        const char *sep = "";

        printk(KERN_INFO "System zones: ");
        rcu_read_lock();
        system_blks = rcu_dereference(sbi->s_system_blks);
        for (pos = rb_first(&system_blks->root); pos; pos = rb_next(pos)) {
                zone = rb_entry(pos, struct ext4_system_zone, node);
                printk(KERN_CONT "%s%llu-%llu", sep,
                       zone->start_blk, zone->start_blk + zone->count - 1);
                /* Every range after the first is preceded by a separator. */
                sep = ", ";
        }
        rcu_read_unlock();
        printk(KERN_CONT "\n");
}

/*
 * Add every mapped extent of reserved inode @ino to @system_blks, tagged
 * with @ino.
 *
 * Returns 0 on success, -EINVAL if @ino is below EXT4_ROOT_INO or above
 * the superblock's inode count, or the error from ext4_iget(),
 * ext4_map_blocks() or add_system_zone().  An -EFSCORRUPTED overlap is
 * additionally reported through __ext4_error().
 */
static int ext4_protect_reserved_inode(struct super_block *sb,
                                       struct ext4_system_blocks *system_blks,
                                       u32 ino)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_map_blocks map;
        struct inode *inode;
        u32 lblk = 0, nblocks;
        int err = 0, mapped;

        if ((ino < EXT4_ROOT_INO) ||
            (ino > le32_to_cpu(sbi->s_es->s_inodes_count)))
                return -EINVAL;

        inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Size of the inode in blocks, rounded up. */
        nblocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
        while (lblk < nblocks) {
                cond_resched();
                map.m_lblk = lblk;
                map.m_len = nblocks - lblk;
                mapped = ext4_map_blocks(NULL, inode, &map, 0);
                if (mapped < 0) {
                        err = mapped;
                        break;
                }
                if (mapped == 0) {
                        /* Nothing mapped at this logical block; try the next. */
                        lblk++;
                        continue;
                }

                err = add_system_zone(system_blks, map.m_pblk, mapped, ino);
                if (err < 0) {
                        if (err == -EFSCORRUPTED) {
                                __ext4_error(sb, __func__, __LINE__,
                                             -err, map.m_pblk,
                                             "blocks %llu-%llu from inode %u overlap system zone",
                                             map.m_pblk,
                                             map.m_pblk + map.m_len - 1,
                                             ino);
                        }
                        break;
                }
                lblk += mapped;
        }

        iput(inode);
        return err;
}

/*
 * RCU callback queued by ext4_release_system_zone(): free all zone
 * entries and then the ext4_system_blocks container embedding @rcu.
 */
static void ext4_destroy_system_zone(struct rcu_head *rcu)
{
        struct ext4_system_blocks *system_blks =
                container_of(rcu, struct ext4_system_blocks, rcu);

        release_system_zone(system_blks);
        kfree(system_blks);
}

/*
 * Build the system zone rbtree used for block validity checking.
 *
 * Updates of the system_blks pointer are serialised by the sb->s_umount
 * semaphore.  Readers in ext4_data_block_valid() are protected by RCU
 * only, so the tree is built completely in private and published with a
 * single pointer assignment at the end.
 *
 * Returns 0 on success; on failure the partially built tree is freed and
 * a negative error is returned.
 */
int ext4_setup_system_zone(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_system_blocks *system_blks;
        struct ext4_group_desc *gdp;
        ext4_group_t grp;
        int ret;

        system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
        if (!system_blks)
                return -ENOMEM;

        for (grp = 0; grp < ngroups; grp++) {
                unsigned int meta_blks = ext4_num_base_meta_blocks(sb, grp);

                cond_resched();

                /* Base metadata at the start of the group, if any. */
                if (meta_blks) {
                        ret = add_system_zone(system_blks,
                                        ext4_group_first_block_no(sb, grp),
                                        meta_blks, 0);
                        if (ret)
                                goto out_free;
                }

                /* Block bitmap, inode bitmap and inode table of the group. */
                gdp = ext4_get_group_desc(sb, grp, NULL);
                ret = add_system_zone(system_blks,
                                ext4_block_bitmap(sb, gdp), 1, 0);
                if (ret)
                        goto out_free;

                ret = add_system_zone(system_blks,
                                ext4_inode_bitmap(sb, gdp), 1, 0);
                if (ret)
                        goto out_free;

                ret = add_system_zone(system_blks,
                                ext4_inode_table(sb, gdp),
                                sbi->s_itb_per_group, 0);
                if (ret)
                        goto out_free;
        }

        /* Blocks of an internal journal inode are protected as well. */
        if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
                ret = ext4_protect_reserved_inode(sb, system_blks,
                                le32_to_cpu(sbi->s_es->s_journal_inum));
                if (ret)
                        goto out_free;
        }

        /*
         * The tree is complete: publish it in one step so that
         * ext4_data_block_valid() never observes it half built.
         */
        rcu_assign_pointer(sbi->s_system_blks, system_blks);

        if (test_opt(sb, DEBUG))
                debug_print_tree(sbi);
        return 0;

out_free:
        release_system_zone(system_blks);
        kfree(system_blks);
        return ret;
}

/*
 * Called when the filesystem is unmounted or when remounting it with
 * noblock_validity specified.
 *
 * The update of system_blks pointer in this function is protected by
 * sb->s_umount semaphore. However we have to be careful as we can be
 * racing with ext4_data_block_valid() calls reading system_blks rbtree
 * protected only by RCU. So we first clear the system_blks pointer and
 * then free the rbtree only after RCU grace period expires.
 */
void ext4_release_system_zone(struct super_block *sb)
{
        struct ext4_system_blocks *system_blks;

        system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
                                        lockdep_is_held(&sb->s_umount));
        rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);

        if (system_blks)
                call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}

/*
 * Check whether @count blocks starting at @start_blk may be used.
 *
 * Returns 0 if the range starts at or before the first data block, wraps
 * around, or extends past the end of the filesystem.  Otherwise the range
 * is looked up in the system zone tree: with no tree or no overlapping
 * zone the result is 1; on an overlap the result is 1 only if @inode is
 * non-NULL and the zone belongs to that inode, else 0.
 */
int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t end_blk = start_blk + count;
        struct ext4_system_blocks *system_blks;
        struct ext4_system_zone *zone;
        struct rb_node *pos;
        int valid = 1;

        if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (end_blk < start_blk) ||
            (end_blk > ext4_blocks_count(sbi->s_es)))
                return 0;

        /*
         * Hold the RCU read lock so the tree cannot be freed under us by
         * a remount that flips the "[no]block_validity" mount option.
         */
        rcu_read_lock();
        system_blks = rcu_dereference(sbi->s_system_blks);
        pos = system_blks ? system_blks->root.rb_node : NULL;
        while (pos) {
                zone = rb_entry(pos, struct ext4_system_zone, node);
                if (end_blk - 1 < zone->start_blk) {
                        pos = pos->rb_left;
                } else if (start_blk >= (zone->start_blk + zone->count)) {
                        pos = pos->rb_right;
                } else {
                        /* Overlap: fine only for the zone's own inode. */
                        valid = inode ? (zone->ino == inode->i_ino) : 0;
                        break;
                }
        }
        rcu_read_unlock();

        return valid;
}

/*
 * Returns 1 if the passed-in block region [start_blk,
 * start_blk+count) is valid for @inode; 0 if the region is out of range
 * or overlaps filesystem metadata blocks that do not belong to @inode.
 * Thin wrapper around ext4_sb_block_valid() using the inode's superblock.
 */
int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
                          unsigned int count)
{
        return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
}

/*
 * Validate an array of @max little-endian 32-bit block references at @p
 * on behalf of @inode.  Zero entries are skipped, and the journal inode is
 * exempt from the check altogether.
 *
 * Returns 0 if every non-zero reference passes ext4_inode_block_valid();
 * otherwise the first bad block is reported through ext4_error_inode()
 * using @function and @line, and -EFSCORRUPTED is returned.
 */
int ext4_check_blockref(const char *function, unsigned int line,
                        struct inode *inode, __le32 *p, unsigned int max)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        unsigned int idx;
        unsigned int blk;

        if (journal && inode == journal->j_inode)
                return 0;

        for (idx = 0; idx < max; idx++) {
                blk = le32_to_cpu(p[idx]);
                if (!blk)
                        continue;
                if (unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
                        ext4_error_inode(inode, function, line, blk,
                                         "invalid block");
                        return -EFSCORRUPTED;
                }
        }
        return 0;
}












































































    1 




    1 




















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

/* NOTE(review): defined outside this header; presumably the vmstat update interval -- confirm. */
extern int sysctl_stat_interval;

#ifdef CONFIG_NUMA
/* Symbolic on/off values; presumably the legal settings of sysctl_vm_numa_stat -- confirm. */
#define ENABLE_NUMA_STAT   1
#define DISABLE_NUMA_STAT   0
extern int sysctl_vm_numa_stat;
/* Static key declared as defaulting to true. */
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
/* sysctl handler prototype; implemented outside this header. */
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos);
#endif

/*
 * Plain counters describing the outcome of a reclaim pass.
 * NOTE(review): the exact meaning of each field is set by the code that
 * fills the structure in, outside this header -- confirm there.
 */
struct reclaim_stat {
        unsigned int nr_dirty;
        unsigned int nr_unqueued_dirty;
        unsigned int nr_congested;
        unsigned int nr_writeback;
        unsigned int nr_immediate;
        unsigned int nr_pageout;
        unsigned int nr_activate[ANON_AND_FILE];
        unsigned int nr_ref_keep;
        unsigned int nr_unmap_fail;
        unsigned int nr_lazyfree_fail;
};

/* Indices of the writeback threshold statistics. */
enum writeback_stat_item {
        NR_DIRTY_THRESHOLD,
        NR_DIRTY_BG_THRESHOLD,
        /* Must stay last: its value is the number of items above. */
        NR_VM_WRITEBACK_STAT_ITEMS,
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* One counter per vm_event_item; an instance exists for every CPU. */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

/* Per-cpu storage updated by the count_vm_event*() helpers below. */
DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 *
 * Increment this CPU's counter for @item by one using raw_cpu_inc().
 */
static inline void __count_vm_event(enum vm_event_item item)
{
        raw_cpu_inc(vm_event_states.event[item]);
}

/* Increment this CPU's counter for @item by one using this_cpu_inc(). */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/* Add @delta to this CPU's counter for @item using raw_cpu_add(). */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to this CPU's counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

/* Implemented outside this header; presumably sums the per-cpu event counters into the array -- confirm. */
extern void all_vm_events(unsigned long *);

/* Implemented outside this header; presumably folds @cpu's event counters away -- confirm. */
extern void vm_events_fold_cpu(int cpu);

#else

/*
 * Disable counters: without CONFIG_VM_EVENT_COUNTERS every helper is an
 * empty inline function, so callers need no #ifdefs of their own.
 */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
/* Note: the stub leaves the caller's array untouched. */
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * NUMA event counters: real counters with CONFIG_NUMA_BALANCING, no-ops
 * otherwise.  The no-op count_vm_numa_events() still evaluates its second
 * argument once via the (void) cast.
 */
#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

/*
 * TLB event counters: real counters with CONFIG_DEBUG_TLBFLUSH, no-ops
 * otherwise.  The no-op count_vm_tlb_events() still evaluates its second
 * argument once via the (void) cast.
 */
#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

/* vmacache event counter: real with CONFIG_DEBUG_VM_VMACACHE, a no-op otherwise. */
#ifdef CONFIG_DEBUG_VM_VMACACHE
#define count_vm_vmacache_event(x) count_vm_event(x)
#else
#define count_vm_vmacache_event(x) do {} while (0)
#endif

/*
 * Add @delta to the per-zone variant of event @item for zone index @zid.
 * The target item is computed as item##_NORMAL offset by
 * (zid - ZONE_NORMAL), which relies on the per-zone event items being
 * declared in zone order.
 *
 * The arguments are parenthesised so that an expression argument (for
 * example a "cond ? a : b" zone selection) is not re-associated with the
 * surrounding arithmetic by operator precedence.
 */
#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + (zid), (delta))

/*
 * Zone and node-based page accounting with per cpu differentials.
 *
 * Global totals, one atomic counter per zone, NUMA and node stat item.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];

#ifdef CONFIG_NUMA
/* Add @x to both @zone's NUMA counter for @item and the global total. */
static inline void zone_numa_state_add(long x, struct zone *zone,
                                 enum numa_stat_item item)
{
        atomic_long_add(x, &zone->vm_numa_stat[item]);
        atomic_long_add(x, &vm_numa_stat[item]);
}

/* Read the global NUMA counter for @item (no clamping is applied). */
static inline unsigned long global_numa_state(enum numa_stat_item item)
{
        long total = atomic_long_read(&vm_numa_stat[item]);

        return total;
}

static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
                                        enum numa_stat_item item)
{
        long x = atomic_long_read(&zone->vm_numa_stat[item]);
        int cpu;

        for_each_online_cpu(cpu)
                x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];

        return x;
}
#endif /* CONFIG_NUMA */

/* Add @x to both @zone's counter for @item and the global zone total. */
static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
{
        atomic_long_add(x, &zone->vm_stat[item]);
        atomic_long_add(x, &vm_zone_stat[item]);
}

/* Add @x to both @pgdat's counter for @item and the global node total. */
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                 enum node_stat_item item)
{
        atomic_long_add(x, &pgdat->vm_stat[item]);
        atomic_long_add(x, &vm_node_stat[item]);
}

/*
 * Read the global zone counter for @item.  On SMP a negative reading is
 * reported as 0 (presumably a transient caused by per-cpu differentials
 * that have not been folded in yet).
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long total = atomic_long_read(&vm_zone_stat[item]);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Read the global node counter for @item.  On SMP a negative reading is
 * reported as 0.
 */
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
        long total = atomic_long_read(&vm_node_stat[item]);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Like global_node_page_state_pages(), but warns once when called for an
 * item for which vmstat_item_in_bytes() is true.
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

        return global_node_page_state_pages(item);
}

/*
 * Read @zone's counter for @item without looking at per-cpu
 * differentials.  On SMP a negative reading is reported as 0.
 */
static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * More precise variant of zone_page_state(): on SMP the pending per-cpu
 * deltas of all online CPUs are added to the zone counter, and a negative
 * sum is reported as 0.  Nothing synchronises against concurrent updates,
 * so the result is still not exact.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        int cpu;

        for_each_online_cpu(cpu) {
                total += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
        }

        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Per-node accessors.  With CONFIG_NUMA they are real functions
 * implemented outside this header; without it the node argument is
 * ignored and they map onto the global accessors defined above.
 */
#ifdef CONFIG_NUMA
extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
/*
 * SMP: the counter update helpers are out-of-line functions declared here.
 * The !CONFIG_SMP branch below provides inline functions and macro aliases
 * with the same names instead.
 *
 * NOTE(review): presumably the double-underscore variants leave interrupt /
 * preemption handling to the caller - confirm against the implementation.
 */

/* Zone counters: add a signed delta, or +1 / -1 for the zone of a page. */
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

/* Node counters: add a signed delta, or +1 / -1 for the node of a page. */
void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

/* Variants that take the zone / node directly rather than a page. */
extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

/* These three have empty inline stubs in the !CONFIG_SMP branch. */
void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

/* Forward declaration: only a pointer to the table is needed below. */
struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
                loff_t *ppos);

void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);

int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
/* The !CONFIG_SMP branch replaces this with an empty macro. */
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */

/* UP: apply @delta to counter @item of @zone via zone_page_state_add(). */
static inline void __mod_zone_page_state(struct zone *zone,
                        enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * UP: apply @delta to counter @item of node @pgdat via
 * node_page_state_add().
 *
 * For items that vmstat_item_in_bytes() reports as byte-based, @delta is
 * given in bytes: it is expected to be a whole number of pages (a one-time
 * warning fires otherwise) and is converted to pages before being applied.
 *
 * @delta is a long, matching the SMP prototype of this function and
 * __mod_zone_page_state(), so that large byte deltas are not truncated on
 * 64-bit builds.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

/* UP: increment counter @item in both @zone and the global vm_zone_stat[]. */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(&zone->vm_stat[item]);
        atomic_long_inc(&vm_zone_stat[item]);
}

/* UP: increment counter @item in both @pgdat and the global vm_node_stat[]. */
static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(&pgdat->vm_stat[item]);
        atomic_long_inc(&vm_node_stat[item]);
}

/* UP: decrement counter @item in both @zone and the global vm_zone_stat[]. */
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(&zone->vm_stat[item]);
        atomic_long_dec(&vm_zone_stat[item]);
}

/* UP: decrement counter @item in both @pgdat and the global vm_node_stat[]. */
static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(&pgdat->vm_stat[item]);
        atomic_long_dec(&vm_node_stat[item]);
}

/* UP: increment counter @item for the zone that @page belongs to. */
static inline void __inc_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}

/* UP: increment counter @item for the node (pgdat) that @page belongs to. */
static inline void __inc_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        __inc_node_state(page_pgdat(page), item);
}


/* UP: decrement counter @item for the zone that @page belongs to. */
static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}

/* UP: decrement counter @item for the node (pgdat) that @page belongs to. */
static inline void __dec_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        __dec_node_state(page_pgdat(page), item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 *
 * Consequently, on UP the plain names below are simple aliases of the
 * double-underscore helpers defined above.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/*
 * UP: no per-cpu differentials are maintained, so this is a no-op.
 * do { } while (0) keeps the expansion a single statement, so that
 * "if (x) set_pgdat_percpu_threshold(...); else ..." still parses.
 */
#define set_pgdat_percpu_threshold(pgdat, callback) do { } while (0)

/*
 * UP: empty stand-ins for the functions that CONFIG_SMP builds declare as
 * out-of-line above, so callers need no #ifdef.
 */
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_pageset *pset) { }
#endif                /* CONFIG_SMP */

/*
 * Adjust the free page accounting of @zone by @nr_pages.  NR_FREE_PAGES is
 * always updated; NR_FREE_CMA_PAGES is updated as well when
 * is_migrate_cma() holds for @migratetype.
 */
static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
                                             int migratetype)
{
        __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);

        if (!is_migrate_cma(migratetype))
                return;

        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}

extern const char * const vmstat_text[];

/* Name of zone stat @item; zone stats are the first section of vmstat_text[]. */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        const char * const *zone_names = vmstat_text;

        return zone_names[item];
}

#ifdef CONFIG_NUMA
/* Name of NUMA stat @item; the NUMA section follows the zone stats. */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        const char * const *numa_names = vmstat_text + NR_VM_ZONE_STAT_ITEMS;

        return numa_names[item];
}
#endif /* CONFIG_NUMA */

/* Name of node stat @item; the node section follows the zone and NUMA stats. */
static inline const char *node_stat_name(enum node_stat_item item)
{
        const char * const *node_names = vmstat_text +
                                         NR_VM_ZONE_STAT_ITEMS +
                                         NR_VM_NUMA_STAT_ITEMS;

        return node_names[item];
}

/*
 * Name of LRU list @lru: the name of the matching node stat
 * (NR_LRU_BASE + @lru) with its first three characters dropped.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        const char *stat_name;

        stat_name = node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru);

        /* skip "nr_" */
        return stat_name + 3;
}

/*
 * Name of writeback stat @item; the writeback section follows the zone,
 * NUMA and node stats in vmstat_text[].
 */
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
        const char * const *writeback_names = vmstat_text +
                                              NR_VM_ZONE_STAT_ITEMS +
                                              NR_VM_NUMA_STAT_ITEMS +
                                              NR_VM_NODE_STAT_ITEMS;

        return writeback_names[item];
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/*
 * Name of VM event @item; the event section follows the zone, NUMA, node
 * and writeback stats in vmstat_text[].
 */
static inline const char *vm_event_name(enum vm_event_item item)
{
        const char * const *event_names = vmstat_text +
                                          NR_VM_ZONE_STAT_ITEMS +
                                          NR_VM_NUMA_STAT_ITEMS +
                                          NR_VM_NODE_STAT_ITEMS +
                                          NR_VM_WRITEBACK_STAT_ITEMS;

        return event_names[item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#endif /* _LINUX_VMSTAT_H */



















































































    4 
    4 







































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 





    1 
    1 

    1 







    1 



    1 


    1 




















    1 

    1 


    1 























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/* Protects kthread_create_list. */
static DEFINE_SPINLOCK(kthread_create_lock);
/* Pending struct kthread_create_info requests, queued in FIFO order. */
static LIST_HEAD(kthread_create_list);
/* The kthreadd task; woken whenever a request has been queued. */
struct task_struct *kthreadd_task;

/*
 * One kthread creation request.  It is allocated by the requester, queued
 * on kthread_create_list and handed to the new thread as its argument.
 * ->done doubles as the ownership token: requester and creator side both
 * xchg() it to NULL, and the outcome decides who frees the structure.
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        int (*threadfn)(void *data);
        void *data;
        int node;

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
        struct completion *done;

        /* Link in kthread_create_list. */
        struct list_head list;
};

/*
 * Per-kthread state, allocated in kthread() and reachable from the task
 * through to_kthread().
 */
struct kthread {
        /* Bit field indexed by enum KTHREAD_BITS. */
        unsigned long flags;
        /* CPU used by kthread_unpark() to re-bind a per-cpu thread. */
        unsigned int cpu;
        /* Thread function and its argument, as given at creation. */
        int (*threadfn)(void *);
        void *data;
        mm_segment_t oldfs;
        /* Completed by the thread each time it parks itself. */
        struct completion parked;
        /* Installed as the task's ->vfork_done in kthread(). */
        struct completion exited;
#ifdef CONFIG_BLK_CGROUP
        /* Expected to be NULL again by the time the struct is freed. */
        struct cgroup_subsys_state *blkcg_css;
#endif
};

/* Bit numbers within struct kthread::flags. */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0, /* re-bind to ->cpu in kthread_unpark() */
        KTHREAD_SHOULD_STOP,    /* makes kthread_should_stop() return true */
        KTHREAD_SHOULD_PARK,    /* makes kthread_should_park() return true */
};

/*
 * Record @kthread (a struct kthread pointer, possibly NULL) as the per-thread
 * state of the current task.
 */
static inline void set_kthread_struct(void *kthread)
{
        /*
         * We abuse ->set_child_tid to avoid the new member and because it
         * can't be wrongly copied by copy_process(). We also rely on fact
         * that the caller can't exec, so PF_KTHREAD can't be cleared.
         */
        current->set_child_tid = (__force void __user *)kthread;
}

/*
 * Return the struct kthread stored by set_kthread_struct() for task @k.
 * Warns if @k does not have PF_KTHREAD set, but returns the stored pointer
 * regardless; the result may be NULL.
 */
static inline struct kthread *to_kthread(struct task_struct *k)
{
        WARN_ON(!(k->flags & PF_KTHREAD));
        return (__force void *)k->set_child_tid;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Per construction; when:
 *
 *   (p->flags & PF_KTHREAD) && p->set_child_tid
 *
 * the task is both a kthread and struct kthread is persistent. However
 * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
 * begin_new_exec()).
 *
 * Returns NULL unless both conditions hold.
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        void *kthread = (__force void *)p->set_child_tid;

        if (!kthread)
                return NULL;

        return (p->flags & PF_KTHREAD) ? kthread : NULL;
}

/*
 * Release the struct kthread attached to task @k, if there is one.
 */
void free_kthread_struct(struct task_struct *k)
{
        /*
         * The pointer is NULL when this kthread was created by
         * kernel_thread() or when the kmalloc() in kthread() failed;
         * kfree() copes with that.
         */
        struct kthread *kthread = to_kthread(k);

#ifdef CONFIG_BLK_CGROUP
        WARN_ON_ONCE(kthread && kthread->blkcg_css);
#endif
        kfree(kthread);
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
EXPORT_SYMBOL_GPL(__kthread_should_park);

/**
 * kthread_should_park - should this kthread park now?
 *
 * When someone calls kthread_park() on your kthread, it will be woken
 * and this will return true.  You should then do the necessary
 * cleanup and call kthread_parkme()
 *
 * Similar to kthread_should_stop(), but this keeps the thread alive
 * and in a park position. kthread_unpark() "restarts" the thread and
 * calls the thread function again.
 *
 * Return: true if KTHREAD_SHOULD_PARK is set for the calling kthread.
 */
bool kthread_should_park(void)
{
        return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads: if a freeze of %current is
 * in progress the refrigerator is entered first, and only then is the stop
 * request evaluated.  This function is safe from kthread_stop() / freezer
 * deadlock and freezable kthreads should use this function instead of
 * calling try_to_freeze() directly.  May sleep.
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool did_freeze;

        might_sleep();

        did_freeze = unlikely(freezing(current)) ? __refrigerator(true) : false;

        if (was_frozen != NULL)
                *was_frozen = did_freeze;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Returns the thread function recorded for @task, or NULL if the task is
 * not a kthread (as judged by __to_kthread()).
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        return kthread->threadfn;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task could be a kthread task.  If it is, its data pointer is fetched
 * with a fault-tolerant copy and returned.  If @task isn't a kthread task
 * or its data is inaccessible for any reason, %NULL is returned.  This
 * function requires that @task itself is safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        void *data = NULL;
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return data;

        /* On a failed copy, data keeps its NULL initializer. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK is set in
 * @self->flags.  Each pass signals @self->parked and then schedules away in
 * TASK_PARKED; on return the task state is TASK_RUNNING again.  Returns
 * immediately (apart from the state changes) if the bit is not set.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                /* The state is set before the flag test, not after it. */
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/* Park the calling kthread if a park request is pending for it. */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/**
 * kthread_exit - Cause the current kthread return @result to kthread_stop().
 * @result: The integer value to return to kthread_stop().
 *
 * While kthread_exit can be called directly, it exists so that
 * functions which do some additional work in non-modular code such as
 * module_put_and_kthread_exit can be implemented.
 *
 * Does not return.
 */
void __noreturn kthread_exit(long result)
{
        /* @result is handed to do_exit() unchanged as the exit code. */
        do_exit(result);
}

/*
 * Entry point of every thread spawned by create_kthread(); @_create is the
 * struct kthread_create_info describing the request.
 *
 * Allocates and installs the per-thread struct kthread, reports the outcome
 * to the requester through create->done, then sleeps until woken.  Once
 * woken it runs threadfn(data) unless KTHREAD_SHOULD_STOP is already set,
 * and leaves through kthread_exit() with threadfn's return value (-EINTR
 * when threadfn was never run).
 */
static int kthread(void *_create)
{
        /* Copy data: it's on kthread's stack */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        /* Installed even when NULL; the failure is reported further down. */
        self = kzalloc(sizeof(*self), GFP_KERNEL);
        set_kthread_struct(self);

        /* If user was SIGKILLed, I release the structure. */
        /*
         * A NULL ->done means the requester already took it and left (see
         * __kthread_create_on_node()), so @create is ours to free.
         */
        done = xchg(&create->done, NULL);
        if (!done) {
                kfree(create);
                kthread_exit(-EINTR);
        }

        if (!self) {
                create->result = ERR_PTR(-ENOMEM);
                complete(done);
                kthread_exit(-ENOMEM);
        }

        self->threadfn = threadfn;
        self->data = data;
        init_completion(&self->exited);
        init_completion(&self->parked);
        current->vfork_done = &self->exited;

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        /* Must be written before complete(): the requester frees @create. */
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         */
        preempt_disable();
        complete(done);
        schedule_preempt_disabled();
        preempt_enable();

        /* Woken up: either asked to stop before ever running, or to start. */
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                /* Honour a park request made before the first run. */
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        kthread_exit(ret);
}

/*
 * called from do_fork() to get node information for about to be created task
 *
 * Only kthreadd carries a preferred node (and only with CONFIG_NUMA); every
 * other task gets NUMA_NO_NODE.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the thread described by @create, with kthread() as its entry point.
 *
 * On success nothing more happens here; the new thread reports back through
 * create->done itself.  If kernel_thread() fails, the error is handed to the
 * requester, or @create is freed when the requester has already gone away.
 */
static void create_kthread(struct kthread_create_info *create)
{
        int pid;

#ifdef CONFIG_NUMA
        /* Picked up again through tsk_fork_get_node(). */
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
                /* If user was SIGKILLed, I release the structure. */
                struct completion *done = xchg(&create->done, NULL);

                if (!done) {
                        kfree(create);
                        return;
                }
                /* Requester is still waiting: pass the error and wake it. */
                create->result = ERR_PTR(pid);
                complete(done);
        }
}

/*
 * Queue a creation request for kthreadd and wait for its outcome.
 *
 * @threadfn, @data, @node: forwarded to the new thread via the request.
 * @namefmt, @args: printf-style thread name, truncated to TASK_COMM_LEN.
 *
 * Returns the new task, ERR_PTR(-ENOMEM) if the request could not be
 * allocated, ERR_PTR(-EINTR) if the caller was killed while waiting, or
 * whatever error the creating side stored in the request.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;

        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was SIGKILLed before kthreadd (or new kernel thread)
                 * calls complete(), leave the cleanup of this structure to
                 * that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        /* From here on the request is owned (and freed) by us. */
        task = create->result;
        if (!IS_ERR(task)) {
                static const struct sched_param param = { .sched_priority = 0 };
                char name[TASK_COMM_LEN];

                /*
                 * task is already visible to other tasks, so updating
                 * COMM must be protected.
                 */
                vsnprintf(name, sizeof(name), namefmt, args);
                set_task_comm(task, name);
                /*
                 * root may have changed our (kthreadd's) priority or CPU mask.
                 * The kernel thread should not inherit these properties.
                 */
                sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
                set_cpus_allowed_ptr(task,
                                     housekeeping_cpumask(HK_FLAG_KTHREAD));
        }
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
 * is affine to all CPUs.
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called).  The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *task;
        va_list args;

        /* Variadic front end: all the work is in __kthread_create_on_node(). */
        va_start(args, namefmt);
        task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
        va_end(args);

        return task;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict task @p to the CPUs in @mask and set PF_NO_SETAFFINITY on it.
 *
 * @state is passed to wait_task_inactive(); if that call fails, a warning
 * is emitted and @p is left untouched.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
        unsigned long flags;

        if (!wait_task_inactive(p, state)) {
                WARN_ON(1);
                return;
        }

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        do_set_cpus_allowed(p, mask);
        p->flags |= PF_NO_SETAFFINITY;
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

/* Single-CPU form of __kthread_bind_mask(): bind @p to @cpu only. */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
        __kthread_bind_mask(p, cpumask_of(cpu), state);
}

/*
 * Bind kthread @p to the CPUs in @mask.  @p is waited for in
 * TASK_UNINTERRUPTIBLE, the state in which kthread() sleeps after creation.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound.
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread,
 * binds it to @cpu and records @cpu in its struct kthread.
 *
 * Return: the new task, or the ERR_PTR() from kthread_create_on_node().
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *p;

        /* @cpu is the single format argument consumed by @namefmt. */
        p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
                                   cpu);
        if (IS_ERR(p))
                return p;
        kthread_bind(p, cpu);
        /* CPU hotplug need to bind once again when unparking the thread. */
        to_kthread(p)->cpu = cpu;
        return p;
}

/*
 * Mark kthread @k as per-cpu on @cpu, or clear the marking when @cpu is
 * negative.  Does nothing if @k has no struct kthread.  Warns once if @k
 * does not have PF_NO_SETAFFINITY set.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kthread = to_kthread(k);

        if (!kthread)
                return;

        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kthread->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        }
}

/*
 * Report whether @p carries the KTHREAD_IS_PER_CPU flag.
 *
 * Returns false when __to_kthread() yields NULL for @p.
 */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kt = __to_kthread(p);

        return kt && test_bit(KTHREAD_IS_PER_CPU, &kt->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false and wakes @k if it
 * is in TASK_PARKED state. This function itself does not wait for @k to
 * run. If the thread is marked per-cpu then it is bound to its cpu again
 * before it is woken.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits for it to park. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will park without
 * calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread is exiting
 * (PF_EXITING is set), and -EBUSY if a park request is already pending
 * for @k. Both error cases also trigger a warning.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        /* A second park request while one is outstanding is a caller bug. */
        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call kthread_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a reference on @k until its exit code has been read below. */
        get_task_struct(k);
        kthread = to_kthread(k);
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* Clear any pending park request and wake a parked thread. */
        kthread_unpark(k);
        wake_up_process(k);
        wait_for_completion(&kthread->exited);
        ret = k->exit_code;
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/*
 * Thread function of the "kthreadd" task.
 *
 * After setting up its own context, it loops forever: it sleeps while
 * kthread_create_list is empty and otherwise removes each queued
 * kthread_create_info from the list and passes it to create_kthread().
 * @unused is ignored.
 */
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * The state is set before the list is checked, presumably
                 * so that a wakeup arriving between the check and
                 * schedule() is not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* The lock is dropped around create_kthread(). */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        /* Not reached: the loop above has no exit. */
        return 0;
}

/*
 * Initialize @worker: zero the whole structure, initialize its lock and
 * assign the lockdep class @key and @name to it, and initialize the
 * work_list and delayed_work_list as empty lists.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(struct kthread_worker));
        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);
        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks, disable preemption or interrupts
 * when they finish. There is defined a safe point for freezing when one work
 * finishes and before a new one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 *
 * Return: 0, once kthread_should_stop() reports true. worker->task is
 * cleared under worker->lock before returning.
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Take the first pending work, if any, and publish it as current. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                /* Saved up front: @work must not be read after it has run. */
                kthread_work_func_t func = work->func;
                __set_current_state(TASK_RUNNING);
                trace_sched_kthread_work_execute_start(work);
                work->func(work);
                /*
                 * Avoid dereferencing work after this point.  The trace
                 * event only cares about the address.
                 */
                trace_sched_kthread_work_execute_end(work, func);
        } else if (!freezing(current)) {
                /* Nothing queued and not freezing: sleep until woken. */
                schedule();
        } else {
                /*
                 * Handle the case where the current remains
                 * TASK_INTERRUPTIBLE. try_to_freeze() expects
                 * the current to be TASK_RUNNING.
                 */
                __set_current_state(TASK_RUNNING);
        }

        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Common implementation behind kthread_create_worker() and
 * kthread_create_worker_on_cpu().
 *
 * Allocates and initializes a kthread_worker, creates a task running
 * kthread_worker_fn() for it and wakes that task. When @cpu >= 0 the task
 * is created on the NUMA node of @cpu and bound to @cpu; otherwise
 * NUMA_NO_NODE is used and no binding is done.
 *
 * Returns the new worker, ERR_PTR(-ENOMEM) when the worker could not be
 * allocated, or the error from __kthread_create_on_node() (the worker is
 * freed in that case).
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
                        const char namefmt[], va_list args)
{
        struct kthread_worker *worker;
        struct task_struct *task;
        int node = NUMA_NO_NODE;

        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(worker);

        if (cpu >= 0)
                node = cpu_to_node(cpu);

        task = __kthread_create_on_node(kthread_worker_fn, worker,
                                                node, namefmt, args);
        if (IS_ERR(task))
                goto fail_task;

        if (cpu >= 0)
                kthread_bind(task, cpu);

        /* Fill in the worker before the task is woken for the first time. */
        worker->flags = flags;
        worker->task = task;
        wake_up_process(task);
        return worker;

fail_task:
        kfree(worker);
        return ERR_CAST(task);
}

/**
 * kthread_create_worker - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * The worker is not bound to any CPU: __kthread_create_worker() is called
 * with a cpu argument of -1.
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the worker was SIGKILLed.
 */
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
{
        struct kthread_worker *worker;
        va_list args;

        va_start(args, namefmt);
        worker = __kthread_create_worker(-1, flags, namefmt, args);
        va_end(args);

        return worker;
}
EXPORT_SYMBOL(kthread_create_worker);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Use a valid CPU number if you want to bind the kthread worker
 * to the given CPU and the associated NUMA node.
 *
 * A good practice is to add the cpu number also into the worker name.
 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu).
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the worker was SIGKILLed.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[], ...)
{
        struct kthread_worker *worker;
        va_list args;

        va_start(args, namefmt);
        worker = __kthread_create_worker(cpu, flags, namefmt, args);
        va_end(args);

        return worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        return !list_empty(&work->node) || work->canceling;
}

/*
 * Debug checks performed before @work is put on a list of @worker:
 * worker->lock must be held, @work must not already be on a list, and
 * @work must not be associated with a different worker. Violations only
 * produce warnings; nothing is modified.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);
        WARN_ON_ONCE(!list_empty(&work->node));
        /* Do not use a work with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker && work->worker != worker);
}

/*
 * insert @work before @pos in @worker
 *
 * Also records @worker in @work and, when the worker is not currently
 * executing a work and has a task, wakes that task. The sanity check
 * asserts that worker->lock is held.
 */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);

        trace_sched_kthread_work_queue_work(worker, work);

        list_add_tail(&work->node, pos);
        work->worker = worker;
        if (!worker->current_work && likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work on @worker for async execution, unless queuing is
 * currently blocked for @work (it is already pending or it is being
 * cancelled). Returns %true if @work was successfully queued, %false
 * otherwise.
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It should have been called from irqsafe timer with irq already off.
 *
 * The work is removed from the list it is on and, unless it is being
 * cancelled, inserted into the worker's work_list.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that it is used a wrong way.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* A work that is being cancelled is dropped instead of queued. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker after @delay jiffies.
 *
 * With a zero @delay the work goes straight onto worker->work_list.
 * Otherwise it is put on worker->delayed_work_list and the timer of
 * @dwork is armed to expire at jiffies + @delay. Warns once if the timer
 * callback is not kthread_delayed_work_timer_fn(). Both callers in this
 * file invoke it with worker->lock held.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct timer_list *timer = &dwork->timer;
        struct kthread_work *work = &dwork->work;

        WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        if (!delay) {
                kthread_insert_work(worker, work, &worker->work_list);
                return;
        }

        /* Be paranoid and try to detect possible races already now. */
        kthread_insert_work_sanity_check(worker, work);

        list_add(&work->node, &worker->delayed_work_list);
        work->worker = worker;
        timer->expires = jiffies + delay;
        add_timer(timer);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * Unless queuing is currently blocked for the work, start a timer that
 * will queue it after the given @delay. If @delay is zero, the work is
 * queued immediately.
 *
 * Return: %false if queuing was blocked, i.e. the work was already
 * pending (timer running or work queued) or is being cancelled.
 * %true otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);

        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);

        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Helper used by the flush functions: a work item whose callback,
 * kthread_flush_work_fn(), completes @done when it is executed.
 */
struct kthread_flush_work {
        struct kthread_work        work;	/* work queued on the worker */
        struct completion        done;	/* completed by kthread_flush_work_fn() */
};

/*
 * Work callback of struct kthread_flush_work: signal the completion
 * embedded in the flush work that contains @work.
 */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        complete(&container_of(work, struct kthread_flush_work, work)->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 * Returns immediately when @work has no worker, or when it is neither
 * on a list nor the worker's current work.
 */
void kthread_flush_work(struct kthread_work *work)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Pending: put the flush work directly behind @work.
         * Executing: put the flush work at the head of the work list.
         */
        if (!list_empty(&work->node))
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and could
 * not manipulate the work list_head any longer.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released but the timer can't be set again in the meantime.
 *
 * @work must be embedded in a struct kthread_delayed_work. @flags points
 * to the caller's saved irq flags; it is used to drop and retake the lock.
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * del_timer_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporary released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        del_timer_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * Take @work off whatever worker list it is on.
 *
 * Called under worker->lock. The caller must make sure that the timer
 * used by delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer().
 *
 * The work might still be in use when this function finishes, see the
 * current_work processed by the worker.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not on worker->work_list nor worker->delayed_work_list. */
        if (list_empty(&work->node))
                return false;

        list_del_init(&work->node);
        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @dwork is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        /* bool, matching the return type and the other queueing helpers. */
        bool ret;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker) {
                ret = false;
                goto fast_queue;
        }

        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Temporary cancel the work but do not fight with another command
         * that is canceling the work as well.
         *
         * It is a bit tricky because of possible races with another
         * mod_delayed_work() and cancel_delayed_work() callers.
         *
         * The timer must be canceled first because worker->lock is released
         * when doing so. But the work can be removed from the queue (list)
         * only when it can be queued again so that the return value can
         * be used for reference counting.
         */
        kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling) {
                /* The number of works in the queue does not change. */
                ret = true;
                goto out;
        }
        ret = __kthread_cancel_work(work);

fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of kthread_cancel_work_sync() and
 * kthread_cancel_delayed_work_sync().
 *
 * Removes @work from the worker list it is on and, when the worker is
 * currently executing @work, waits for that execution to finish. For a
 * delayed work (@is_dwork true) the timer is cancelled first.
 *
 * Return: %true if @work was pending on a list, %false otherwise
 * (including when @work has never been associated with a worker).
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        /* bool, matching the return type and __kthread_cancel_work(). */
        bool ret = false;

        if (!worker)
                goto out;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        ret = __kthread_cancel_work(work);

        if (worker->current_work != work)
                goto out_fast;

        /*
         * The work is in progress and we need to wait with the lock released.
         * In the meantime, block any queuing by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        kthread_flush_work(work);
        raw_spin_lock_irqsave(&worker->lock, flags);
        work->canceling--;

out_fast:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
        return ret;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself. On return from this
 * function, @work is guaranteed to be not pending or executing on any CPU.
 *
 * kthread_cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
 *
 * The caller must ensure that the worker on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        /* false: @work is a plain work, so there is no timer to cancel. */
        return __kthread_cancel_work_sync(work, false);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * This is kthread_cancel_work_sync() for delayed works.
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        /* true: the timer of @dwork is cancelled before the work itself. */
        return __kthread_cancel_work_sync(&dwork->work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 *
 * Implemented by queueing an on-stack flush work at the tail of the
 * worker's work_list and waiting for it to be executed.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        kthread_queue_work(worker, &fwork.work);
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush @worker, stop its task and free it. A plain flush is sufficient
 * because the kthread worker API is used only in trivial scenarios; no
 * multi-step state machines are needed.
 *
 * Warns and returns without freeing anything if @worker has no task.
 * Warns if works are still queued after the task has been stopped.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *worker_task = worker->task;

        if (WARN_ON(!worker_task))
                return;

        kthread_flush_worker(worker);
        kthread_stop(worker_task);

        WARN_ON(!list_empty(&worker->work_list));
        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 *
 * Warns once if the caller is not a kthread (PF_KTHREAD) or already has
 * an mm. The value returned by force_uaccess_begin() is saved in the
 * kthread's ->oldfs so that kthread_unuse_mm() can undo it.
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        active_mm = tsk->active_mm;
        /* Take a reference on @mm when it becomes the new active_mm. */
        if (active_mm != mm) {
                mmgrab(mm);
                tsk->active_mm = mm;
        }
        tsk->mm = mm;
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /* Drop the reference on the previous active_mm if it was replaced. */
        if (active_mm != mm)
                mmdrop(active_mm);

        to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 *
 * Warns once if the caller is not a kthread (PF_KTHREAD) or has no mm.
 * Restores the uaccess state saved in the kthread's ->oldfs, then clears
 * tsk->mm while leaving active_mm untouched.
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        force_uaccess_end(to_kthread(tsk)->oldfs);

        task_lock(tsk);
        sync_mm_rss(mm);
        local_irq_disable();
        tsk->mm = NULL;
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. The thread is running jobs on behalf of
 * other threads. In some cases, we expect the jobs to attach the cgroup info
 * of the original threads instead of that of the current thread. This
 * function stores the original thread's cgroup info in the current kthread
 * context for later retrieval.
 *
 * Any previously stored css is released first. A NULL @css only clears
 * the association. Nothing is done when current is not a kthread.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *old_css;
        struct kthread *kt;

        if (!(current->flags & PF_KTHREAD))
                return;

        kt = to_kthread(current);
        if (!kt)
                return;

        /* Drop the reference held on the previous association, if any. */
        old_css = kt->blkcg_css;
        if (old_css) {
                css_put(old_css);
                kt->blkcg_css = NULL;
        }

        if (!css)
                return;

        css_get(css);
        kt->blkcg_css = css;
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 *
 * Return: the css stored by kthread_associate_blkcg(), or NULL when
 * current is not a kthread or has no struct kthread.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *kt;

        if (!(current->flags & PF_KTHREAD))
                return NULL;

        kt = to_kthread(current);

        return kt ? kt->blkcg_css : NULL;
}
EXPORT_SYMBOL(kthread_blkcg);
#endif
































































    4 
    4 


    4 



















































































































    1 






    1 
    1 


    1 















    1 






    1 










    1 




    5 





    5 


    5 
    1 


    5 








    5 




    2 






    4 





    4 





























































































































































































































































































    3 




    3 



    3 


































    3 




    3 
    3 


    3 

    3 








    3 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the SID table type.
 *
 * Original author: Stephen Smalley, <sds@tycho.nsa.gov>
 * Author: Ondrej Mosnacek, <omosnacek@gmail.com>
 *
 * Copyright (C) 2018 Red Hat, Inc.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <asm/barrier.h>
#include "flask.h"
#include "security.h"
#include "sidtab.h"

/*
 * One cached string form of a sidtab entry's context.  The string bytes
 * live inline in the trailing flexible array, so the object is allocated
 * as sizeof(struct sidtab_str_cache) + len.
 */
struct sidtab_str_cache {
        /* used by kfree_rcu() when the object is evicted from the cache */
        struct rcu_head rcu_member;
        /* link in the owning sidtab's cache_lru_list */
        struct list_head lru_member;
        /* entry whose ->cache pointer refers back to this object */
        struct sidtab_entry *parent;
        /* number of bytes stored in str[] */
        u32 len;
        /* the cached bytes, copied in with memcpy() (exactly len bytes) */
        char str[];
};

/*
 * Convert between a zero-based index into the sidtab tree and the SID value
 * handed out for it.  SIDs 1..SECINITSID_NUM are taken by the initial SIDs,
 * so index 0 corresponds to SID SECINITSID_NUM + 1.
 *
 * The arguments are parenthesized so that expression arguments (for example
 * a conditional expression) expand with the intended precedence.
 */
#define index_to_sid(index) ((index) + SECINITSID_NUM + 1)
#define sid_to_index(sid) ((sid) - (SECINITSID_NUM + 1))

/*
 * Bring a sidtab into its empty starting state: no tree roots, no initial
 * SIDs marked as set, zero entries, not frozen, no conversion in progress,
 * an empty context-to-SID hashtable and freshly initialized locks.  When the
 * sid2str cache is compiled in, its LRU list, lock and free-slot counter are
 * set up as well.
 *
 * Always returns 0.
 */
int sidtab_init(struct sidtab *s)
{
        u32 isid;

        s->count = 0;
        s->convert = NULL;
        s->frozen = false;

        memset(s->roots, 0, sizeof(s->roots));
        for (isid = 0; isid < SECINITSID_NUM; ++isid)
                s->isids[isid].set = 0;

        hash_init(s->context_to_sid);
        spin_lock_init(&s->lock);

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        spin_lock_init(&s->cache_lock);
        INIT_LIST_HEAD(&s->cache_lru_list);
        s->cache_free_slots = CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE;
#endif

        return 0;
}

/*
 * Look up @context in the context-to-SID hashtable under rcu_read_lock().
 * @hash selects the bucket and is compared against each candidate's stored
 * hash before the full context comparison is made.
 *
 * Returns the SID of the first matching entry, or 0 when nothing matches.
 */
static u32 context_to_sid(struct sidtab *s, struct context *context, u32 hash)
{
        struct sidtab_entry *candidate;
        u32 found = 0;

        rcu_read_lock();
        hash_for_each_possible_rcu(s->context_to_sid, candidate, list, hash) {
                if (candidate->hash == hash &&
                    context_cmp(&candidate->context, context)) {
                        found = candidate->sid;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * Install @context as the context of initial SID @sid.
 *
 * Returns -EINVAL when @sid is outside 1..SECINITSID_NUM, the error from
 * context_cpy() when copying the context fails, and 0 otherwise.
 */
int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context)
{
        struct sidtab_isid_entry *slot;
        u32 ctx_hash;
        int err;

        if (sid < 1 || sid > SECINITSID_NUM)
                return -EINVAL;

        slot = &s->isids[sid - 1];

        err = context_cpy(&slot->entry.context, context);
        if (err != 0)
                return err;

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        slot->entry.cache = NULL;
#endif
        slot->set = 1;

        ctx_hash = context_compute_hash(context);

        /*
         * Several initial SIDs may share one context.  Only the first one
         * is added to the context_to_sid hashtable; adding the others would
         * create duplicate entries and lengthen the collision chains.
         */
        if (context_to_sid(s, context, ctx_hash) != 0)
                return 0;

        slot->entry.sid = sid;
        slot->entry.hash = ctx_hash;
        hash_add(s->context_to_sid, &slot->entry.list, ctx_hash);

        return 0;
}

/*
 * Format usage statistics of the context_to_sid hashtable into @page:
 * total number of entries, number of non-empty buckets and the length of
 * the longest bucket chain.  At most PAGE_SIZE bytes are written.
 *
 * The table is walked under rcu_read_lock().  Every entry is counted in the
 * chain of the bucket it was found in, and every bucket that yields at least
 * one entry is counted as used exactly once.
 *
 * Returns the value returned by scnprintf().
 */
int sidtab_hash_stats(struct sidtab *sidtab, char *page)
{
        int i;
        int chain_len = 0;
        int slots_used = 0;
        int entries = 0;
        int max_chain_len = 0;
        int cur_bucket = -1;    /* no bucket visited yet */
        struct sidtab_entry *entry;

        rcu_read_lock();
        hash_for_each_rcu(sidtab->context_to_sid, i, entry, list) {
                entries++;
                if (i != cur_bucket) {
                        /*
                         * First entry of a new non-empty bucket: close the
                         * previous chain and start counting this one.
                         */
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                        cur_bucket = i;
                        chain_len = 0;
                        slots_used++;
                }
                /* the current entry always belongs to the current chain */
                chain_len++;
        }
        rcu_read_unlock();

        /* account for the chain of the last non-empty bucket */
        if (chain_len > max_chain_len)
                max_chain_len = chain_len;

        return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n", entries,
                         slots_used, SIDTAB_HASH_BUCKETS, max_chain_len);
}

/*
 * Return the smallest tree level whose capacity is at least @count entries.
 * Level 0 holds SIDTAB_LEAF_ENTRIES entries; each further level multiplies
 * the capacity by 1 << SIDTAB_INNER_SHIFT.
 */
static u32 sidtab_level_from_count(u32 count)
{
        u32 depth = 0;
        u32 reach;

        for (reach = SIDTAB_LEAF_ENTRIES; reach < count;
             reach <<= SIDTAB_INNER_SHIFT)
                depth++;

        return depth;
}

/*
 * Make sure the root nodes for levels 0..@level exist, allocating missing
 * ones with GFP_ATOMIC.  A newly created inner root at level N gets the
 * level N-1 root as its first child.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails.
 */
static int sidtab_alloc_roots(struct sidtab *s, u32 level)
{
        union sidtab_entry_inner *root = &s->roots[0];
        u32 depth;

        if (!root->ptr_leaf) {
                root->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
                if (!root->ptr_leaf)
                        return -ENOMEM;
        }

        for (depth = 1; depth <= level; depth++) {
                root = &s->roots[depth];
                if (root->ptr_inner)
                        continue;

                root->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
                if (!root->ptr_inner)
                        return -ENOMEM;
                /* hook the next-lower root in as the first subtree */
                root->ptr_inner->entries[0] = s->roots[depth - 1];
        }

        return 0;
}

/*
 * Walk the tree to the slot for @index and return a pointer to it.
 *
 * With @alloc non-zero, missing roots, inner nodes and the leaf are
 * allocated on the way down (GFP_ATOMIC).  With @alloc zero nothing is
 * allocated and a missing node below the root ends the walk.
 *
 * Returns NULL when a needed node is missing and was not (or could not be)
 * allocated.
 */
static struct sidtab_entry *sidtab_do_lookup(struct sidtab *s, u32 index,
                                             int alloc)
{
        union sidtab_entry_inner *node;
        u32 leaf_no = index / SIDTAB_LEAF_ENTRIES;
        /* level of the smallest subtree that contains @index */
        u32 depth = sidtab_level_from_count(index + 1);
        u32 shift = depth * SIDTAB_INNER_SHIFT;

        if (alloc && sidtab_alloc_roots(s, depth) != 0)
                return NULL;

        /* descend one inner level per iteration */
        for (node = &s->roots[depth]; depth != 0; depth--) {
                shift -= SIDTAB_INNER_SHIFT;

                node = &node->ptr_inner->entries[leaf_no >> shift];
                leaf_no &= ((u32)1 << shift) - 1;

                if (node->ptr_inner)
                        continue;
                if (alloc)
                        node->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE,
                                                  GFP_ATOMIC);
                if (!node->ptr_inner)
                        return NULL;
        }

        if (!node->ptr_leaf && alloc)
                node->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
        if (!node->ptr_leaf)
                return NULL;

        return &node->ptr_leaf->entries[index % SIDTAB_LEAF_ENTRIES];
}

/*
 * Read-only lookup of the entry at @index.  Returns NULL when @index is not
 * below the current entry count; never allocates.
 */
static struct sidtab_entry *sidtab_lookup(struct sidtab *s, u32 index)
{
        /* count must be read (with acquire) before any entry is read */
        u32 limit = smp_load_acquire(&s->count);

        return index < limit ? sidtab_do_lookup(s, index, 0) : NULL;
}

/*
 * Return the entry of initial SID @sid, or NULL when that initial SID has
 * not been set.  @sid is used as a 1-based index and is not range checked.
 */
static struct sidtab_entry *sidtab_lookup_initial(struct sidtab *s, u32 sid)
{
        struct sidtab_isid_entry *slot = &s->isids[sid - 1];

        if (!slot->set)
                return NULL;

        return &slot->entry;
}

/*
 * Resolve @sid to its entry.  SIDs above SECINITSID_NUM are looked up in
 * the tree, SIDs 1..SECINITSID_NUM in the initial SID array.
 *
 * The entry is returned if it exists and either its context.len is zero or
 * @force is non-zero.  In every other case (including @sid == 0) the result
 * is whatever the SECINITSID_UNLABELED initial SID lookup yields.
 */
static struct sidtab_entry *sidtab_search_core(struct sidtab *s, u32 sid,
                                               int force)
{
        struct sidtab_entry *hit = NULL;

        if (sid > SECINITSID_NUM)
                hit = sidtab_lookup(s, sid_to_index(sid));
        else if (sid != 0)
                hit = sidtab_lookup_initial(s, sid);

        if (hit && (force || !hit->context.len))
                return hit;

        return sidtab_lookup_initial(s, SECINITSID_UNLABELED);
}

/*
 * Look up the entry for @sid without forcing: calls sidtab_search_core()
 * with force == 0, so an entry whose context.len is non-zero is not
 * returned and the SECINITSID_UNLABELED lookup result is returned instead.
 */
struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid)
{
        return sidtab_search_core(s, sid, 0);
}

/*
 * Look up the entry for @sid with force == 1: sidtab_search_core() returns
 * the entry whenever it exists, regardless of its context.len.
 */
struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid)
{
        return sidtab_search_core(s, sid, 1);
}

/*
 * Map @context to a SID, creating a new sidtab entry when the context is
 * not in the table yet.  The SID is stored in *@sid.
 *
 * A lock-free hashtable search is tried first.  On a miss the search is
 * repeated under s->lock and, if it still misses, a new entry is appended
 * at index s->count.  While a conversion is in progress (s->convert set)
 * the context is also converted and inserted into the target sidtab.
 *
 * Returns 0 on success, -ESTALE when the table is frozen, -EOVERFLOW when
 * SIDTAB_MAX entries already exist, -ENOMEM when a tree node cannot be
 * allocated, or the error returned by context_cpy() / convert->func().
 * Note that *@sid has been written with 0 on the error paths.
 */
int sidtab_context_to_sid(struct sidtab *s, struct context *context,
                          u32 *sid)
{
        unsigned long flags;
        u32 count, hash = context_compute_hash(context);
        struct sidtab_convert_params *convert;
        struct sidtab_entry *dst, *dst_convert;
        int rc;

        *sid = context_to_sid(s, context, hash);
        if (*sid)
                return 0;

        /* lock-free search failed: lock, re-search, and insert if not found */
        spin_lock_irqsave(&s->lock, flags);

        rc = 0;
        *sid = context_to_sid(s, context, hash);
        if (*sid)
                goto out_unlock;

        if (unlikely(s->frozen)) {
                /*
                 * This sidtab is now frozen - tell the caller to abort and
                 * get the new one.
                 */
                rc = -ESTALE;
                goto out_unlock;
        }

        /* snapshot under the lock; count is the index of the new entry */
        count = s->count;
        convert = s->convert;

        /* bail out if we already reached max entries */
        rc = -EOVERFLOW;
        if (count >= SIDTAB_MAX)
                goto out_unlock;

        /* insert context into new entry */
        rc = -ENOMEM;
        dst = sidtab_do_lookup(s, count, 1);
        if (!dst)
                goto out_unlock;

        dst->sid = index_to_sid(count);
        dst->hash = hash;

        rc = context_cpy(&dst->context, context);
        if (rc)
                goto out_unlock;

        /*
         * if we are building a new sidtab, we need to convert the context
         * and insert it there as well
         */
        if (convert) {
                rc = -ENOMEM;
                dst_convert = sidtab_do_lookup(convert->target, count, 1);
                if (!dst_convert) {
                        /* undo the copy made above; count was not bumped */
                        context_destroy(&dst->context);
                        goto out_unlock;
                }

                rc = convert->func(context, &dst_convert->context,
                                   convert->args, GFP_ATOMIC);
                if (rc) {
                        /* undo the copy made above; count was not bumped */
                        context_destroy(&dst->context);
                        goto out_unlock;
                }
                dst_convert->sid = index_to_sid(count);
                dst_convert->hash = context_compute_hash(&dst_convert->context);
                convert->target->count = count + 1;

                hash_add_rcu(convert->target->context_to_sid,
                             &dst_convert->list, dst_convert->hash);
        }

        /* a non-zero context->len is reported as an invalid context */
        if (context->len)
                pr_info("SELinux:  Context %s is not valid (left unmapped).\n",
                        context->str);

        *sid = index_to_sid(count);

        /* write entries before updating count */
        smp_store_release(&s->count, count + 1);
        hash_add_rcu(s->context_to_sid, &dst->list, dst->hash);

        rc = 0;
out_unlock:
        spin_unlock_irqrestore(&s->lock, flags);
        return rc;
}

/*
 * Fill in the sid and hash fields of the first @count tree entries of @s
 * and add each of them to the context_to_sid hashtable.  Every index below
 * @count is expected to resolve to an existing entry; the lookup result is
 * used without a NULL check.
 */
static void sidtab_convert_hashtable(struct sidtab *s, u32 count)
{
        u32 idx = 0;

        while (idx < count) {
                struct sidtab_entry *e = sidtab_do_lookup(s, idx, 0);

                e->sid = index_to_sid(idx);
                e->hash = context_compute_hash(&e->context);
                hash_add_rcu(s->context_to_sid, &e->list, e->hash);

                idx++;
        }
}

/*
 * Recursively convert the subtree rooted at @esrc into @edst.
 *
 * @pos counts the entries converted so far and is advanced once per
 * converted leaf entry; conversion stops as soon as *@pos reaches @count.
 * Missing destination nodes are allocated with GFP_KERNEL.  After each
 * leaf, cond_resched() is called.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the first
 * non-zero value returned by convert->func().
 */
static int sidtab_convert_tree(union sidtab_entry_inner *edst,
                               union sidtab_entry_inner *esrc,
                               u32 *pos, u32 count, u32 level,
                               struct sidtab_convert_params *convert)
{
        u32 slot;
        int err;

        if (level == 0) {
                /* leaf level: convert the contexts themselves */
                if (!edst->ptr_leaf) {
                        edst->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE,
                                                 GFP_KERNEL);
                        if (!edst->ptr_leaf)
                                return -ENOMEM;
                }

                for (slot = 0;
                     slot < SIDTAB_LEAF_ENTRIES && *pos < count;
                     slot++) {
                        err = convert->func(&esrc->ptr_leaf->entries[slot].context,
                                            &edst->ptr_leaf->entries[slot].context,
                                            convert->args, GFP_KERNEL);
                        if (err)
                                return err;
                        (*pos)++;
                }

                cond_resched();
                return 0;
        }

        /* inner level: recurse into each child until count is reached */
        if (!edst->ptr_inner) {
                edst->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL);
                if (!edst->ptr_inner)
                        return -ENOMEM;
        }

        for (slot = 0; slot < SIDTAB_INNER_ENTRIES && *pos < count; slot++) {
                err = sidtab_convert_tree(&edst->ptr_inner->entries[slot],
                                          &esrc->ptr_inner->entries[slot],
                                          pos, count, level - 1, convert);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Convert all entries of @s into params->target using params->func.
 *
 * Under s->lock the current entry count is sampled, the last leaf of the
 * target is pre-allocated, the target count is set and live conversion is
 * enabled by pointing s->convert at @params.  The existing entries are then
 * converted with the lock dropped, and finally the target's hashtable is
 * populated with the lock held again.
 *
 * Returns 0 on success, -EBUSY when a conversion is already in progress
 * (s->convert is set), -ENOMEM when the target's last leaf cannot be
 * allocated, or the error from sidtab_convert_tree().  When the tree
 * conversion fails, s->convert is reset to NULL before returning.
 */
int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params)
{
        unsigned long flags;
        u32 count, level, pos;
        int rc;

        spin_lock_irqsave(&s->lock, flags);

        /* concurrent policy loads are not allowed */
        if (s->convert) {
                spin_unlock_irqrestore(&s->lock, flags);
                return -EBUSY;
        }

        count = s->count;
        level = sidtab_level_from_count(count);

        /* allocate last leaf in the new sidtab (to avoid race with
         * live convert)
         */
        /*
         * NOTE(review): count - 1 wraps around when count is 0 - confirm
         * that callers only convert a table with at least one entry.
         */
        rc = sidtab_do_lookup(params->target, count - 1, 1) ? 0 : -ENOMEM;
        if (rc) {
                spin_unlock_irqrestore(&s->lock, flags);
                return rc;
        }

        /* set count in case no new entries are added during conversion */
        params->target->count = count;

        /* enable live convert of new entries */
        s->convert = params;

        /* we can safely convert the tree outside the lock */
        spin_unlock_irqrestore(&s->lock, flags);

        pr_info("SELinux:  Converting %u SID table entries...\n", count);

        /* convert all entries not covered by live convert */
        pos = 0;
        rc = sidtab_convert_tree(&params->target->roots[level],
                                 &s->roots[level], &pos, count, level, params);
        if (rc) {
                /* we need to keep the old table - disable live convert */
                spin_lock_irqsave(&s->lock, flags);
                s->convert = NULL;
                spin_unlock_irqrestore(&s->lock, flags);
                return rc;
        }
        /*
         * The hashtable can also be modified in sidtab_context_to_sid()
         * so we must re-acquire the lock here.
         */
        spin_lock_irqsave(&s->lock, flags);
        sidtab_convert_hashtable(params->target, count);
        spin_unlock_irqrestore(&s->lock, flags);

        return 0;
}

/*
 * Stop live conversion of new entries by clearing s->convert under
 * s->lock.  Nothing else in the table is touched.
 */
void sidtab_cancel_convert(struct sidtab *s)
{
        unsigned long flags;

        /* cancelling policy load - disable live convert of sidtab */
        spin_lock_irqsave(&s->lock, flags);
        s->convert = NULL;
        spin_unlock_irqrestore(&s->lock, flags);
}

/*
 * Take s->lock (saving the interrupt state in *@flags), mark the sidtab as
 * frozen and disable live conversion.  Returns with s->lock still held;
 * the lock is dropped by sidtab_freeze_end().  Once frozen,
 * sidtab_context_to_sid() fails with -ESTALE instead of adding entries.
 */
void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags) __acquires(&s->lock)
{
        spin_lock_irqsave(&s->lock, *flags);
        s->frozen = true;
        s->convert = NULL;
}
/*
 * Release s->lock and restore the interrupt state from *@flags.  Does not
 * clear the frozen flag.
 */
void sidtab_freeze_end(struct sidtab *s, unsigned long *flags) __releases(&s->lock)
{
        spin_unlock_irqrestore(&s->lock, *flags);
}

/*
 * Release what a single entry owns: its context and, when the sid2str
 * cache is compiled in, the cache object that entry->cache points at.
 * The entry's own storage is not freed here.
 */
static void sidtab_destroy_entry(struct sidtab_entry *entry)
{
        context_destroy(&entry->context);
#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        kfree(rcu_dereference_raw(entry->cache));
#endif
}

/*
 * Free the subtree referenced by @entry, which sits at tree level @level.
 * At level 0 every entry of the leaf is destroyed and the leaf is freed;
 * at higher levels all children are destroyed first, then the inner node.
 * A NULL node pointer is ignored.
 */
static void sidtab_destroy_tree(union sidtab_entry_inner entry, u32 level)
{
        u32 slot;

        if (level == 0) {
                struct sidtab_node_leaf *leaf = entry.ptr_leaf;

                if (leaf) {
                        for (slot = 0; slot < SIDTAB_LEAF_ENTRIES; slot++)
                                sidtab_destroy_entry(&leaf->entries[slot]);
                        kfree(leaf);
                }
                return;
        }

        if (entry.ptr_inner) {
                struct sidtab_node_inner *inner = entry.ptr_inner;

                for (slot = 0; slot < SIDTAB_INNER_ENTRIES; slot++)
                        sidtab_destroy_tree(inner->entries[slot], level - 1);
                kfree(inner);
        }
}

/*
 * Release everything owned by @s: the contexts (and cached strings) of all
 * initial SIDs that are set, and the whole entry tree starting from the
 * highest root level that is populated.
 */
void sidtab_destroy(struct sidtab *s)
{
        u32 isid, top;

        for (isid = 0; isid < SECINITSID_NUM; isid++) {
                if (!s->isids[isid].set)
                        continue;
                sidtab_destroy_entry(&s->isids[isid].entry);
        }

        /* find the highest populated root, falling back to level 0 */
        for (top = SIDTAB_MAX_LEVEL; top != 0; top--)
                if (s->roots[top].ptr_inner)
                        break;

        sidtab_destroy_tree(s->roots[top], top);
        /*
         * Nothing to do for the context_to_sid hashtable: every object
         * linked into it lives in the isids array or the entry tree, both
         * of which were handled above.
         */
}

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0

/*
 * Store @str (@str_len bytes) as the cached string of @entry.
 *
 * Entries with a non-zero context.len are never cached.  If @entry already
 * has a cache object, that object is only moved to the head of the LRU list
 * and @str is ignored.  Otherwise a new cache object is allocated with
 * GFP_ATOMIC; if that fails the function returns without caching.  When no
 * free slot is left, the object at the tail of the LRU list is unlinked,
 * detached from its parent entry and freed via kfree_rcu().
 *
 * All list and pointer updates happen under s->cache_lock.
 */
void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry,
                        const char *str, u32 str_len)
{
        struct sidtab_str_cache *cache, *victim = NULL;
        unsigned long flags;

        /* do not cache invalid contexts */
        if (entry->context.len)
                return;

        spin_lock_irqsave(&s->cache_lock, flags);

        cache = rcu_dereference_protected(entry->cache,
                                          lockdep_is_held(&s->cache_lock));
        if (cache) {
                /* entry in cache - just bump to the head of LRU list */
                list_move(&cache->lru_member, &s->cache_lru_list);
                goto out_unlock;
        }

        /* header plus str_len bytes for the flexible str[] array */
        cache = kmalloc(sizeof(struct sidtab_str_cache) + str_len, GFP_ATOMIC);
        if (!cache)
                goto out_unlock;

        if (s->cache_free_slots == 0) {
                /* pop a cache entry from the tail and free it */
                victim = container_of(s->cache_lru_list.prev,
                                      struct sidtab_str_cache, lru_member);
                list_del(&victim->lru_member);
                rcu_assign_pointer(victim->parent->cache, NULL);
        } else {
                s->cache_free_slots--;
        }
        cache->parent = entry;
        cache->len = str_len;
        memcpy(cache->str, str, str_len);
        list_add(&cache->lru_member, &s->cache_lru_list);

        /* publish only after the object has been fully initialized */
        rcu_assign_pointer(entry->cache, cache);

out_unlock:
        spin_unlock_irqrestore(&s->cache_lock, flags);
        /* victim is still NULL here unless an object was evicted above */
        kfree_rcu(victim, rcu_member);
}

/*
 * Fetch the cached string of @entry.
 *
 * On a hit, *@out_len receives the cached length and, when @out is not
 * NULL, *@out receives a kmemdup()'ed (GFP_ATOMIC) copy of the string; the
 * entry is then handed to sidtab_sid2str_put() again, which moves it to the
 * head of the LRU list.
 *
 * Returns 0 on a hit, -ENOENT when nothing is cached for @entry or its
 * context.len is non-zero, and -ENOMEM when the copy cannot be allocated.
 */
int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry,
                       char **out, u32 *out_len)
{
        struct sidtab_str_cache *cached;
        int rc = 0;

        /* do not cache invalid contexts */
        if (entry->context.len)
                return -ENOENT;

        rcu_read_lock();

        cached = rcu_dereference(entry->cache);
        if (cached) {
                *out_len = cached->len;
                if (out) {
                        *out = kmemdup(cached->str, cached->len, GFP_ATOMIC);
                        if (!*out)
                                rc = -ENOMEM;
                }
        } else {
                rc = -ENOENT;
        }

        rcu_read_unlock();

        if (rc || !out)
                return rc;

        sidtab_sid2str_put(s, entry, *out, *out_len);
        return 0;
}

#endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */













































































































































    1 
    1 



    1 
    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cmpxchg.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>

/*
 * Keep the struct bpf_fib_lookup small so that it fits into a cacheline.
 * The assertion below pins its size at exactly 64 bytes, so any layout
 * change that grows or shrinks the struct fails the build.
 */
static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");

/*
 * Forward declaration: maps a BPF helper id to its function prototype.
 * The definition is not in this part of the file.
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);

/**
 *        copy_bpf_fprog_from_user - read a struct sock_fprog from a sockptr
 *        @dst: kernel copy of the program descriptor to fill in
 *        @src: where the caller-supplied descriptor lives
 *        @len: length the caller supplied for the descriptor
 *
 * For a compat syscall the source is read as a struct compat_sock_fprog
 * and converted field by field into @dst, which is zeroed first. Otherwise
 * the source is copied into @dst as is. In both cases @len has to match
 * the size of the layout being read exactly.
 *
 * Returns 0 on success, -EINVAL if @len does not match, or -EFAULT if
 * copy_from_sockptr() reports a failure.
 */
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
        struct compat_sock_fprog compat;

        /* Native layout: a straight copy is all that is needed. */
        if (!in_compat_syscall()) {
                if (len != sizeof(*dst))
                        return -EINVAL;
                return copy_from_sockptr(dst, src, sizeof(*dst)) ? -EFAULT : 0;
        }

        /* Compat layout: fetch the compat struct and convert it. */
        if (len != sizeof(compat))
                return -EINVAL;
        if (copy_from_sockptr(&compat, src, sizeof(compat)))
                return -EFAULT;

        memset(dst, 0, sizeof(*dst));
        dst->len = compat.len;
        dst->filter = compat_ptr(compat.filter);

        return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 *        sk_filter_trim_cap - run a packet through a socket filter
 *        @sk: sock associated with &sk_buff
 *        @skb: buffer to filter
 *        @cap: limit on how short the eBPF program may trim the packet
 *
 * Socket level wrapper around the filter program attached to @sk. The
 * value returned by the program is the packet length to keep: 0 means
 * the packet is tossed, anything else trims the skb to the larger of
 * that length and @cap. If skb->len is already smaller than that, the
 * whole skb->data is kept.
 *
 * Before the filter is consulted, an skb that came from the pfmemalloc
 * reserves is refused unless @sk is a SOCK_MEMALLOC socket, and the skb
 * is passed through the cgroup inet ingress hook and
 * security_sock_rcv_skb(). A socket without a filter accepts the packet
 * once those checks pass.
 *
 * Returns 0 if the packet should be accepted, -EPERM if the filter
 * tossed it, -ENOMEM for the pfmemalloc refusal, or the error reported
 * by the cgroup hook, the security hook or pskb_trim().
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
        struct sk_filter *filter;
        int err;

        /*
         * pfmemalloc reserves are meant for sockets that help free
         * memory, so only SOCK_MEMALLOC sockets may consume such an skb.
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
                return -ENOMEM;
        }

        /* The security hook only runs if the cgroup hook let the skb pass. */
        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
        if (!err)
                err = security_sock_rcv_skb(sk, skb);
        if (err)
                return err;

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                struct sock *orig_sk = skb->sk;
                unsigned int keep;

                /* Run the program with skb->sk pointing at @sk, then restore it. */
                skb->sk = sk;
                keep = bpf_prog_run_save_cb(filter->prog, skb);
                skb->sk = orig_sk;

                if (keep)
                        err = pskb_trim(skb, max(cap, keep));
                else
                        err = -EPERM;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

/* BPF helper: hand back whatever skb_get_poff() reports for @skb. */
BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
        return skb_get_poff(skb);
}

/*
 * BPF helper: search the linear skb data, starting @a bytes in, for a
 * netlink attribute of type @x.
 *
 * Returns the offset of the matching attribute relative to skb->data.
 * Returns 0 when the skb is nonlinear, when @a leaves no room for an
 * attribute header, or when nla_find() comes back empty.
 */
BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
        const size_t hdr_size = sizeof(struct nlattr);
        struct nlattr *match;

        /*
         * The length test must come before the subtraction so that
         * skb->len - hdr_size cannot wrap.
         */
        if (skb_is_nonlinear(skb) || skb->len < hdr_size ||
            a > skb->len - hdr_size)
                return 0;

        match = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
        if (!match)
                return 0;

        return (void *) match - (void *) skb->data;
}

/*
 * BPF helper: treat the bytes @a into the linear skb data as a netlink
 * attribute and search the attributes nested inside it for type @x.
 *
 * Returns the offset of the matching nested attribute relative to
 * skb->data. Returns 0 when the skb is nonlinear, when @a leaves no room
 * for an attribute header, when the outer attribute's length field is
 * not plausible, or when nothing matches.
 */
BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *nla;

        if (skb_is_nonlinear(skb))
                return 0;

        if (skb->len < sizeof(struct nlattr))
                return 0;

        /* Safe from wrap-around: skb->len >= sizeof(struct nlattr) here. */
        if (a > skb->len - sizeof(struct nlattr))
                return 0;

        nla = (struct nlattr *) &skb->data[a];
        /*
         * nla_len comes straight from packet data.  Checking only the
         * upper bound would let a length shorter than the attribute
         * header through to nla_find_nested(), leaving it to derive a
         * payload length from a bogus value.  nla_ok() checks both
         * bounds: at least a header, at most the bytes that remain.
         */
        if (!nla_ok(nla, skb->len - a))
                return 0;

        nla = nla_find_nested(nla, x);
        if (nla)
                return (void *) nla - (void *) skb->data;

        return 0;
}

/*
 * Translate a load offset into one the load helpers can use directly.
 *
 * Non-negative offsets pass through untouched. A negative offset at or
 * above SKF_NET_OFF is rebased onto skb_network_offset(). A negative
 * offset at or above SKF_LL_OFF is rebased onto skb_mac_offset(), but only
 * if the mac header was set. Anything else yields INT_MIN, which callers
 * treat as "invalid offset".
 */
static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
{
        if (unlikely(offset < 0)) {
                if (offset >= SKF_NET_OFF)
                        offset = offset - SKF_NET_OFF + skb_network_offset(skb);
                else if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
                        offset = offset - SKF_LL_OFF + skb_mac_offset(skb);
                else
                        offset = INT_MIN;
        }

        return offset;
}

/*
 * BPF helper: load one byte from @skb at @offset.
 *
 * @data and @headlen describe the directly addressable bytes. If the
 * converted offset falls inside them the byte is read in place, otherwise
 * it is fetched with skb_copy_bits().
 *
 * Returns the byte value, or -EFAULT if the offset cannot be converted or
 * the copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        u8 byte;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: the byte sits in the directly addressable area. */
        if (headlen - offset >= (int)sizeof(byte))
                return *(u8 *)(data + offset);

        if (skb_copy_bits(skb, offset, &byte, sizeof(byte)))
                return -EFAULT;

        return byte;
}

/*
 * BPF helper: one-byte load for callers that have no data/headlen pair
 * at hand. Both are taken from @skb itself before delegating to the
 * 8-bit load helper.
 */
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *head = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_8(skb, head, headlen, offset);
}

/*
 * BPF helper: load a big-endian 16-bit value from @skb at @offset and
 * return it in CPU byte order.
 *
 * @data and @headlen describe the directly addressable bytes. If the
 * converted offset leaves at least two bytes inside them the value is
 * read in place with an unaligned access, otherwise it is fetched with
 * skb_copy_bits().
 *
 * Returns the value, or -EFAULT if the offset cannot be converted or the
 * copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be16 raw;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: both bytes sit in the directly addressable area. */
        if (headlen - offset >= (int)sizeof(raw))
                return get_unaligned_be16(data + offset);

        if (skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return -EFAULT;

        return be16_to_cpu(raw);
}

/*
 * BPF helper: 16-bit load for callers that have no data/headlen pair at
 * hand. Both are taken from @skb itself before delegating to the 16-bit
 * load helper.
 */
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *head = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_16(skb, head, headlen, offset);
}

/*
 * BPF helper: load a big-endian 32-bit value from @skb at @offset and
 * return it in CPU byte order.
 *
 * @data and @headlen describe the directly addressable bytes. If the
 * converted offset leaves at least four bytes inside them the value is
 * read in place with an unaligned access, otherwise it is fetched with
 * skb_copy_bits().
 *
 * Returns the value, or -EFAULT if the offset cannot be converted or the
 * copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be32 raw;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: all four bytes sit in the directly addressable area. */
        if (headlen - offset >= (int)sizeof(raw))
                return get_unaligned_be32(data + offset);

        if (skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return -EFAULT;

        return be32_to_cpu(raw);
}

/*
 * BPF helper: 32-bit load for callers that have no data/headlen pair at
 * hand. Both are taken from @skb itself before delegating to the 32-bit
 * load helper.
 */
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *head = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_32(skb, head, headlen, offset);
}

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
                              struct bpf_insn *insn_buf)
{
        struct bpf_insn *insn = insn_buf;

        switch (skb_field) {
        case SKF_AD_MARK:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);

                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                      offsetof(struct sk_buff, mark));
                break;

        case SKF_AD_PKTTYPE:
                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
                *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
                break;

        case SKF_AD_QUEUE:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);

                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                      offsetof(struct sk_buff, queue_mapping));
                break;

        case SKF_AD_VLAN_TAG:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);

                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                      offsetof(struct sk_buff, vlan_tci));
                break;
        case SKF_AD_VLAN_TAG_PRESENT:
                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
                if (PKT_VLAN_PRESENT_BIT)
                        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
                if (PKT_VLAN_PRESENT_BIT < 7)
                        *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
                break;
        }

        return insn - insn_buf;
}

static bool convert_bpf_extensions(struct sock_filter *fp,
                                   struct bpf_insn **insnp)
{
        struct bpf_insn *insn = *insnp;
        u32 cnt;

        switch (fp->k) {
        case SKF_AD_OFF + SKF_AD_PROTOCOL:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);

                /* A = *(u16 *) (CTX + offsetof(protocol)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, protocol));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PKTTYPE:
                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_IFINDEX:
        case SKF_AD_OFF + SKF_AD_HATYPE:
                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      BPF_REG_TMP, BPF_REG_CTX,
                                      offsetof(struct sk_buff, dev));
                /* if (tmp != 0) goto pc + 1 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
                *insn++ = BPF_EXIT_INSN();
                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, ifindex));
                else
                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, type));
                break;

        case SKF_AD_OFF + SKF_AD_MARK:
                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_RXHASH:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);

                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
                                    offsetof(struct sk_buff, hash));
                break;

        case SKF_AD_OFF + SKF_AD_QUEUE:
                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);

                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, vlan_proto));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
        case SKF_AD_OFF + SKF_AD_NLATTR:
        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
        case SKF_AD_OFF + SKF_AD_CPU:
        case SKF_AD_OFF + SKF_AD_RANDOM:
                /* arg1 = CTX */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
                /* arg2 = A */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
                /* arg3 = X */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
                switch (fp->k) {
                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
                        break;
                case SKF_AD_OFF + SKF_AD_CPU:
                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
                        break;
                case SKF_AD_OFF + SKF_AD_RANDOM:
                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
                        bpf_user_rnd_init_once();
                        break;
                }
                break;

        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
                /* A ^= X */
                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
                break;

        default:
                /* This is just a dummy call to avoid letting the compiler
                 * evict __bpf_call_base() as an optimization. Placed here
                 * where no-one bothers.
                 */
                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
                return false;
        }

        *insnp = insn;
        return true;
}

/* Expand one classic BPF packet load (BPF_ABS or BPF_IND mode, size B/H/W)
 * into eBPF instructions.
 *
 * @fp:    classic instruction; the size and mode bits of fp->code and the
 *         immediate fp->k are used
 * @insnp: in: first free slot of the output buffer;
 *         out (on success only): the LAST instruction written, so the
 *         caller is expected to step past it
 *
 * The emitted code reads BPF_REG_D and BPF_REG_H, which the prologue in
 * bpf_convert_filter() fills with skb->data and skb->len - skb->data_len.
 *
 * Layout of the emitted sequence:
 *   - optional fast path (absolute loads only): bounds check against
 *     BPF_REG_H, direct load relative to BPF_REG_D, byte swap for H/W,
 *     then a jump over the slow path;
 *   - slow path: call bpf_skb_load_helper_{8,16,32}(CTX, D, H, offset) and
 *     zero A and exit if A is negative afterwards.
 *
 * Returns false if the size is not B/H/W (*insnp is then left untouched),
 * true otherwise.
 */
static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
        /* Only 16- and 32-bit loads get a BPF_FROM_BE conversion. */
        bool endian = BPF_SIZE(fp->code) == BPF_H ||
                      BPF_SIZE(fp->code) == BPF_W;
        bool indirect = BPF_MODE(fp->code) == BPF_IND;
        const int ip_align = NET_IP_ALIGN;
        struct bpf_insn *insn = *insnp;
        /* fp->k reinterpreted as signed, so large k values become negative. */
        int offset = fp->k;

        /* Fast path: only for absolute loads with a non-negative offset.
         * Without efficient unaligned access the last term must hold too.
         * By C operator precedence it is evaluated as
         * offset + (ip_align % size) == 0.
         * NOTE(review): possibly (offset + ip_align) % size == 0 was meant;
         * confirm before changing, since that would enable the direct load
         * for more offsets on such architectures.
         */
        if (!indirect &&
            ((unaligned_ok && offset >= 0) ||
             (!unaligned_ok && offset >= 0 &&
              offset + ip_align >= 0 &&
              offset + ip_align % size == 0))) {
                /* Offset fits the 16-bit signed off field of the load insn. */
                bool ldx_off_ok = offset <= S16_MAX;

                /* tmp = headlen - offset */
                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
                if (offset)
                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
                /* if (tmp < size) goto slow path. The distance skips the
                 * load (1 insn, or 3 when the offset needs a register),
                 * the optional byte swap and the trailing BPF_JMP_A.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
                                      size, 2 + endian + (!ldx_off_ok * 2));
                if (ldx_off_ok) {
                        /* A = *(size *) (D + offset) */
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_D, offset);
                } else {
                        /* tmp = D + offset; A = *(size *) (tmp + 0) */
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_TMP, 0);
                }
                if (endian)
                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
                /* Skip the slow path. As this branch is only taken for
                 * !indirect, the slow path below is exactly 8 insns:
                 * 4 argument moves, the call, and the 3-insn error check.
                 */
                *insn++ = BPF_JMP_A(8);
        }

        /* Slow path: arg1 = CTX, arg2 = D, arg3 = H, arg4 = offset. */
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
        } else {
                /* Indirect: arg4 = X, plus k when k is non-zero. */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
                if (fp->k)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
        }

        switch (BPF_SIZE(fp->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
                break;
        default:
                /* Unsupported size: *insnp has not been updated. */
                return false;
        }

        /* if (A >= 0) continue after the exit; otherwise A = 0 and exit.
         * Presumably the helpers report a failed load as a negative value;
         * verify against their definitions.
         */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
        *insn   = BPF_EXIT_INSN();

        *insnp = insn;
        return true;
}

/**
 *        bpf_convert_filter - convert filter program
 *        @prog: the user passed filter program
 *        @len: the length of the user passed filter program
 *        @new_prog: allocated 'struct bpf_prog' or NULL
 *        @new_len: pointer to the length of the converted program; written
 *                  when @new_prog is NULL, otherwise only compared against
 *                  the emitted length
 *        @seen_ld_abs: bool whether we've seen ld_abs/ind; set to true here
 *                      when a packet load is converted, never cleared here
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First call, with @new_prog == NULL, only calculates the new program
 *    length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) Second call emits into @new_prog. It iterates internally: one pass
 *    finds the new jump offsets, the following pass(es) remap with them:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 *
 * Returns 0 on success; -EINVAL if @len is out of range, an instruction is
 * unknown, a jump target or offset is out of range, or the emitted length
 * does not settle; -ENOMEM if the jump address table cannot be allocated.
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
                              struct bpf_prog *new_prog, int *new_len,
                              bool *seen_ld_abs)
{
        int new_flen = 0, pass = 0, target, i, stack_off;
        struct bpf_insn *new_insn, *first_insn = NULL;
        struct sock_filter *fp;
        int *addrs = NULL;
        u8 bpf_src;

        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

        if (len <= 0 || len > BPF_MAXINSNS)
                return -EINVAL;

        if (new_prog) {
                first_insn = new_prog->insnsi;
                /* addrs[i] will hold the index in the new program at which
                 * the expansion of old instruction i starts.
                 */
                addrs = kcalloc(len, sizeof(*addrs),
                                GFP_KERNEL | __GFP_NOWARN);
                if (!addrs)
                        return -ENOMEM;
        }

do_pass:
        new_insn = first_insn;
        fp = prog;

        /* Classic BPF related prologue emission. */
        if (new_prog) {
                /* Classic BPF expects A and X to be reset first. These need
                 * to be guaranteed to be the first two instructions.
                 */
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

                /* All programs must keep CTX in callee saved BPF_REG_CTX.
                 * In eBPF case it's done by the compiler, here we need to
                 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
                 */
                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
                if (*seen_ld_abs) {
                        /* For packet access in classic BPF, cache skb->data
                         * in callee-saved BPF R8 and skb->len - skb->data_len
                         * (headlen) in BPF R9. Since classic BPF is read-only
                         * on CTX, we only need to cache it once.
                         */
                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                                  BPF_REG_D, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, len));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data_len));
                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
                }
        } else {
                /* Length-only call: account for the three unconditional
                 * prologue insns. The four packet-access prologue insns are
                 * added to *new_len after the loop if they turn out to be
                 * needed.
                 */
                new_insn += 3;
        }

        for (i = 0; i < len; fp++, i++) {
                /* Scratch space for the expansion of one classic insn.
                 * It is zeroed, so fields that a case does not set
                 * explicitly (e.g. in the jump cases) are 0.
                 */
                struct bpf_insn tmp_insns[32] = { };
                struct bpf_insn *insn = tmp_insns;

                if (addrs)
                        addrs[i] = new_insn - first_insn;

                /* Each case must leave 'insn' pointing at the LAST insn it
                 * wrote; the count is derived from it after the switch.
                 */
                switch (fp->code) {
                /* All arithmetic insns and skb loads map as-is. */
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_X:
                case BPF_ALU | BPF_MOD | BPF_K:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_ABS | BPF_W:
                case BPF_LD | BPF_ABS | BPF_H:
                case BPF_LD | BPF_ABS | BPF_B:
                case BPF_LD | BPF_IND | BPF_W:
                case BPF_LD | BPF_IND | BPF_H:
                case BPF_LD | BPF_IND | BPF_B:
                        /* Check for overloaded BPF extension and
                         * directly convert it if found, otherwise
                         * just move on with mapping.
                         */
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            BPF_MODE(fp->code) == BPF_ABS &&
                            convert_bpf_extensions(fp, &insn))
                                break;
                        /* Remaining BPF_LD insns are real packet loads. */
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            convert_bpf_ld_abs(fp, &insn)) {
                                *seen_ld_abs = true;
                                break;
                        }

                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
                                /* 32-bit move of X onto itself, emitted
                                 * ahead of the zero test on X below.
                                 */
                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
                                /* Error with exception code on div/mod by 0.
                                 * For cBPF programs, this was always return 0.
                                 */
                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                                *insn++ = BPF_EXIT_INSN();
                        }

                        /* Same opcode, operating on A (dst) and X (src). */
                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
                        break;

                /* Jump transformation cannot use BPF block macros
                 * everywhere as offset calculation and target updates
                 * require a bit more work than the rest, i.e. jump
                 * opcodes map as-is, but offsets need adjustment.
                 *
                 * BPF_EMIT_JMP reads 'target' (index of the old insn to
                 * jump to) and fills in insn->off. Without addrs[] (the
                 * length-only call) the offset starts from 0. It bails
                 * out via 'err' on a bad target or an offset that does
                 * not fit in 16 bits.
                 */

#define BPF_EMIT_JMP                                                        \
        do {                                                                \
                const s32 off_min = S16_MIN, off_max = S16_MAX;                \
                s32 off;                                                \
                                                                        \
                if (target >= len || target < 0)                        \
                        goto err;                                        \
                off = addrs ? addrs[target] - addrs[i] - 1 : 0;                \
                /* Adjust pc relative offset for 2nd or 3rd insn. */        \
                off -= insn - tmp_insns;                                \
                /* Reject anything not fitting into insn->off. */        \
                if (off < off_min || off > off_max)                        \
                        goto err;                                        \
                insn->off = off;                                        \
        } while (0)

                case BPF_JMP | BPF_JA:
                        target = i + fp->k + 1;
                        insn->code = fp->code;
                        BPF_EMIT_JMP;
                        break;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
                                /* BPF immediates are signed, zero extend
                                 * immediate into tmp register and use it
                                 * in compare insn.
                                 */
                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

                                insn->dst_reg = BPF_REG_A;
                                insn->src_reg = BPF_REG_TMP;
                                bpf_src = BPF_X;
                        } else {
                                insn->dst_reg = BPF_REG_A;
                                insn->imm = fp->k;
                                bpf_src = BPF_SRC(fp->code);
                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
                        }

                        /* Common case where 'jump_false' is next insn. */
                        if (fp->jf == 0) {
                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                                target = i + fp->jt + 1;
                                BPF_EMIT_JMP;
                                break;
                        }

                        /* Convert some jumps when 'jump_true' is next insn:
                         * the condition is inverted and the jump goes to
                         * the 'jump_false' target instead. BPF_JSET is not
                         * inverted here and takes the two-insn form below.
                         */
                        if (fp->jt == 0) {
                                switch (BPF_OP(fp->code)) {
                                case BPF_JEQ:
                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
                                        break;
                                case BPF_JGT:
                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
                                        break;
                                case BPF_JGE:
                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
                                        break;
                                default:
                                        goto jmp_rest;
                                }

                                target = i + fp->jf + 1;
                                BPF_EMIT_JMP;
                                break;
                        }
jmp_rest:
                        /* Other jumps are mapped into two insns: Jxx and JA. */
                        target = i + fp->jt + 1;
                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                        BPF_EMIT_JMP;
                        insn++;

                        insn->code = BPF_JMP | BPF_JA;
                        target = i + fp->jf + 1;
                        BPF_EMIT_JMP;
                        break;

                /* ldxb 4 * ([14] & 0xf) is remapped into a byte load (see
                 * convert_bpf_ld_abs()) surrounded by 6 register insns.
                 */
                case BPF_LDX | BPF_MSH | BPF_B: {
                        struct sock_filter tmp = {
                                .code        = BPF_LD | BPF_ABS | BPF_B,
                                .k        = fp->k,
                        };

                        *seen_ld_abs = true;

                        /* X = A (park the current A while A is used for the load) */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
                        convert_bpf_ld_abs(&tmp, &insn);
                        /* Step past the last insn of the load sequence. */
                        insn++;
                        /* A &= 0xf */
                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
                        /* A <<= 2 */
                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
                        /* tmp = X */
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = tmp (the original A is restored) */
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
                        break;
                }
                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
                 */
                case BPF_RET | BPF_A:
                case BPF_RET | BPF_K:
                        if (BPF_RVAL(fp->code) == BPF_K)
                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
                                                        0, fp->k);
                        *insn = BPF_EXIT_INSN();
                        break;

                /* Store to stack: cell k lives at FP - (k * 4 + 4). */
                case BPF_ST:
                case BPF_STX:
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
                                            -stack_off);
                        /* check_load_and_stores() verifies that classic BPF can
                         * load from stack only after write, so tracking
                         * stack_depth for ST|STX insns is enough
                         */
                        if (new_prog && new_prog->aux->stack_depth < stack_off)
                                new_prog->aux->stack_depth = stack_off;
                        break;

                /* Load from stack, using the same cell layout as the stores. */
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
                                            -stack_off);
                        break;

                /* A = K or X = K */
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
                                              BPF_REG_A : BPF_REG_X, fp->k);
                        break;

                /* X = A */
                case BPF_MISC | BPF_TAX:
                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        break;

                /* A = X */
                case BPF_MISC | BPF_TXA:
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
                        break;

                /* A = skb->len or X = skb->len */
                case BPF_LD | BPF_W | BPF_LEN:
                case BPF_LDX | BPF_W | BPF_LEN:
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
                                            offsetof(struct sk_buff, len));
                        break;

                /* Access seccomp_data fields. */
                case BPF_LDX | BPF_ABS | BPF_W:
                        /* A = *(u32 *) (ctx + K) */
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
                        break;

                /* Unknown instruction. */
                default:
                        goto err;
                }

                /* Step past the last insn written, so that insn - tmp_insns
                 * is the number of insns produced for this classic insn.
                 */
                insn++;
                if (new_prog)
                        memcpy(new_insn, tmp_insns,
                               sizeof(*insn) * (insn - tmp_insns));
                new_insn += insn - tmp_insns;
        }

        if (!new_prog) {
                /* Only calculating new length. */
                *new_len = new_insn - first_insn;
                if (*seen_ld_abs)
                        *new_len += 4; /* Prologue bits. */
                return 0;
        }

        /* Jump offsets are computed from the addrs[] values known at the
         * time, so repeat the conversion while the emitted length differs
         * from the previous pass. Give up if it still has not settled
         * after the third pass.
         */
        pass++;
        if (new_flen != new_insn - first_insn) {
                new_flen = new_insn - first_insn;
                if (pass > 2)
                        goto err;
                goto do_pass;
        }

        kfree(addrs);
        /* The emitted length must equal the length passed in by the caller. */
        BUG_ON(*new_len != new_flen);
        return 0;
err:
        kfree(addrs);
        return -EINVAL;
}

/* Security:
 *
 * The mem[] array is not cleared for each packet going through
 * __bpf_prog_run(), so a filter loaded by the user must never read a cell
 * that was not written beforehand. This is checked along all branches, so
 * that a malicious user cannot abuse us.
 *
 * memvalid tracks, one bit per cell, which cells are known to be written
 * at the current instruction; masks[] carries that knowledge forward to
 * jump targets.
 *
 * Returns 0 if every load is preceded by a store on all paths, -EINVAL if
 * not, and -ENOMEM if the per-instruction mask array cannot be allocated.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
        u16 *masks;
        u16 memvalid = 0; /* One bit per cell, 16 cells */
        int ret = 0;
        int pc;

        BUILD_BUG_ON(BPF_MEMWORDS > 16);

        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                return -ENOMEM;

        /* Start out with all bits set in every mask. */
        memset(masks, 0xff, flen * sizeof(*masks));

        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *cur = &filter[pc];

                /* Keep only the cells valid on every path seen so far. */
                memvalid &= masks[pc];

                switch (cur->code) {
                case BPF_ST:
                case BPF_STX:
                        /* The stored-to cell becomes valid. */
                        memvalid |= (1 << cur->k);
                        break;

                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        /* Reading a cell that is not valid is an error. */
                        if (memvalid & (1 << cur->k))
                                break;
                        ret = -EINVAL;
                        goto out;

                case BPF_JMP | BPF_JA:
                        /* A jump must set masks on target */
                        masks[pc + 1 + cur->k] &= memvalid;
                        memvalid = ~0;
                        break;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* A jump must set masks on targets */
                        masks[pc + 1 + cur->jt] &= memvalid;
                        masks[pc + 1 + cur->jf] &= memvalid;
                        memvalid = ~0;
                        break;
                }
        }
out:
        kfree(masks);
        return ret;
}

/* Tell whether @code_to_probe is an opcode listed in the table below.
 *
 * The table is indexed directly by opcode; slots that are not listed are
 * false, and opcodes beyond the end of the table are rejected as well.
 */
static bool chk_code_allowed(u16 code_to_probe)
{
        static const bool allowed[] = {
                /* 32 bit ALU operations, immediate operand */
                [BPF_ALU | BPF_ADD | BPF_K] = true,
                [BPF_ALU | BPF_SUB | BPF_K] = true,
                [BPF_ALU | BPF_MUL | BPF_K] = true,
                [BPF_ALU | BPF_DIV | BPF_K] = true,
                [BPF_ALU | BPF_MOD | BPF_K] = true,
                [BPF_ALU | BPF_AND | BPF_K] = true,
                [BPF_ALU | BPF_OR | BPF_K] = true,
                [BPF_ALU | BPF_XOR | BPF_K] = true,
                [BPF_ALU | BPF_LSH | BPF_K] = true,
                [BPF_ALU | BPF_RSH | BPF_K] = true,
                /* 32 bit ALU operations, X operand */
                [BPF_ALU | BPF_ADD | BPF_X] = true,
                [BPF_ALU | BPF_SUB | BPF_X] = true,
                [BPF_ALU | BPF_MUL | BPF_X] = true,
                [BPF_ALU | BPF_DIV | BPF_X] = true,
                [BPF_ALU | BPF_MOD | BPF_X] = true,
                [BPF_ALU | BPF_AND | BPF_X] = true,
                [BPF_ALU | BPF_OR | BPF_X] = true,
                [BPF_ALU | BPF_XOR | BPF_X] = true,
                [BPF_ALU | BPF_LSH | BPF_X] = true,
                [BPF_ALU | BPF_RSH | BPF_X] = true,
                /* 32 bit ALU operations, no operand */
                [BPF_ALU | BPF_NEG] = true,
                /* Load instructions targeting A */
                [BPF_LD | BPF_W | BPF_ABS] = true,
                [BPF_LD | BPF_H | BPF_ABS] = true,
                [BPF_LD | BPF_B | BPF_ABS] = true,
                [BPF_LD | BPF_W | BPF_IND] = true,
                [BPF_LD | BPF_H | BPF_IND] = true,
                [BPF_LD | BPF_B | BPF_IND] = true,
                [BPF_LD | BPF_W | BPF_LEN] = true,
                [BPF_LD | BPF_IMM] = true,
                [BPF_LD | BPF_MEM] = true,
                /* Load instructions targeting X */
                [BPF_LDX | BPF_W | BPF_LEN] = true,
                [BPF_LDX | BPF_B | BPF_MSH] = true,
                [BPF_LDX | BPF_IMM] = true,
                [BPF_LDX | BPF_MEM] = true,
                /* Store instructions */
                [BPF_ST] = true,
                [BPF_STX] = true,
                /* Misc instructions */
                [BPF_MISC | BPF_TAX] = true,
                [BPF_MISC | BPF_TXA] = true,
                /* Return instructions */
                [BPF_RET | BPF_K] = true,
                [BPF_RET | BPF_A] = true,
                /* Jump instructions, unconditional */
                [BPF_JMP | BPF_JA] = true,
                /* Jump instructions, immediate operand */
                [BPF_JMP | BPF_JEQ | BPF_K] = true,
                [BPF_JMP | BPF_JGE | BPF_K] = true,
                [BPF_JMP | BPF_JGT | BPF_K] = true,
                [BPF_JMP | BPF_JSET | BPF_K] = true,
                /* Jump instructions, X operand */
                [BPF_JMP | BPF_JEQ | BPF_X] = true,
                [BPF_JMP | BPF_JGE | BPF_X] = true,
                [BPF_JMP | BPF_JGT | BPF_X] = true,
                [BPF_JMP | BPF_JSET | BPF_X] = true,
        };

        /* The bounds test comes first so the lookup never runs off the table. */
        return code_to_probe < ARRAY_SIZE(allowed) && allowed[code_to_probe];
}

/* Basic sanity check of a classic filter: the program pointer must be
 * non-NULL and the instruction count must lie in 1..BPF_MAXINSNS.
 *
 * Returns true when both conditions hold, false otherwise.
 */
static bool bpf_check_basics_ok(const struct sock_filter *filter,
                                unsigned int flen)
{
        return filter != NULL && flen != 0 && flen <= BPF_MAXINSNS;
}

/**
 *        bpf_check_classic - verify socket filter code
 *        @filter: filter to verify
 *        @flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
                             unsigned int flen)
{
        bool anc_found;
        int pc;

        /* Check the filter code now */
        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *ftest = &filter[pc];

                /* May we actually operate on this code? */
                if (!chk_code_allowed(ftest->code))
                        return -EINVAL;

                /* Some instructions need special checks */
                switch (ftest->code) {
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_K:
                        /* Check for division by zero */
                        if (ftest->k == 0)
                                return -EINVAL;
                        break;
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_K:
                        if (ftest->k >= 32)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                        /* Check for invalid memory addresses */
                        if (ftest->k >= BPF_MEMWORDS)
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JA:
                        /* Note, the large ftest->k might cause loops.
                         * Compare this with conditional jumps below,
                         * where offsets are limited. --ANK (981016)
                         */
                        if (ftest->k >= (unsigned int)(flen - pc - 1))
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Both conditionals must be safe */
                        if (pc + ftest->jt + 1 >= flen ||
                            pc + ftest->jf + 1 >= flen)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_W | BPF_ABS:
                case BPF_LD | BPF_H | BPF_ABS:
                case BPF_LD | BPF_B | BPF_ABS:
                        anc_found = false;
                        if (bpf_anc_helper(ftest) & BPF_ANC)
                                anc_found = true;
                        /* Ancillary operation unknown or unsupported */
                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
                                return -EINVAL;
                }
        }

        /* Last instruction must be a RET code */
        switch (filter[flen - 1].code) {
        case BPF_RET | BPF_K:
        case BPF_RET | BPF_A:
                return check_load_and_stores(filter, flen);
        }

        return -EINVAL;
}

static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
                                      const struct sock_fprog *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct sock_fprog_kern *fkprog;

        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
        if (!fp->orig_prog)
                return -ENOMEM;

        fkprog = fp->orig_prog;
        fkprog->len = fprog->len;

        fkprog->filter = kmemdup(fp->insns, fsize,
                                 GFP_KERNEL | __GFP_NOWARN);
        if (!fkprog->filter) {
                kfree(fp->orig_prog);
                return -ENOMEM;
        }

        return 0;
}

/* Free the saved classic program attached to @fp, if there is one:
 * first the duplicated instruction array, then the descriptor itself.
 */
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
        struct sock_fprog_kern *saved = fp->orig_prog;

        if (!saved)
                return;

        kfree(saved->filter);
        kfree(saved);
}

/* Release @prog according to its type: BPF_PROG_TYPE_SOCKET_FILTER
 * programs are dropped with bpf_prog_put(); any other program has its
 * saved original filter released and is then freed with
 * bpf_prog_free().
 */
static void __bpf_prog_release(struct bpf_prog *prog)
{
        if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                bpf_release_orig_filter(prog);
                bpf_prog_free(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Release the program referenced by @fp and free the sk_filter. */
static void __sk_filter_release(struct sk_filter *fp)
{
        struct bpf_prog *prog = fp->prog;

        __bpf_prog_release(prog);
        kfree(fp);
}

/**
 *        sk_filter_release_rcu - Release a socket filter by rcu_head
 *        @rcu: rcu_head that contains the sk_filter to free
 *
 *        Callback handed to call_rcu() by sk_filter_release(); maps the
 *        rcu_head back to its enclosing sk_filter and releases it.
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
        __sk_filter_release(container_of(rcu, struct sk_filter, rcu));
}

/**
 *        sk_filter_release - release a socket filter
 *        @fp: filter to remove
 *
 *        Drop one reference on @fp. When that was the last reference,
 *        the filter is queued with call_rcu() so that
 *        sk_filter_release_rcu() releases its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
        if (!refcount_dec_and_test(&fp->refcnt))
                return;

        call_rcu(&fp->rcu, sk_filter_release_rcu);
}

/* Subtract the size of the filter's program from the socket's
 * sk_omem_alloc accounting and drop a reference on the filter. The
 * size is read before the reference is dropped.
 */
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
        atomic_sub(bpf_prog_size(fp->prog->len), &sk->sk_omem_alloc);
        sk_filter_release(fp);
}

/* Try to account the filter's program size against the socket's
 * option memory (sk_omem_alloc), bounded by sysctl_optmem_max.
 * Returns true when the charge was applied, false when it would not
 * fit.
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        int optmem_max = READ_ONCE(sysctl_optmem_max);
        u32 filter_size = bpf_prog_size(fp->prog->len);

        /* same check as in sock_kmalloc() */
        if (filter_size > optmem_max)
                return false;
        if (atomic_read(&sk->sk_omem_alloc) + filter_size >= optmem_max)
                return false;

        atomic_add(filter_size, &sk->sk_omem_alloc);
        return true;
}

/* Take a reference on @fp and charge its size to @sk. Returns false
 * if the reference count had already reached zero or if the charge
 * does not fit; in the latter case the reference just taken is
 * dropped again.
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        if (!refcount_inc_not_zero(&fp->refcnt))
                return false;

        if (__sk_filter_charge(sk, fp))
                return true;

        sk_filter_release(fp);
        return false;
}

/* Convert the classic BPF program held in @fp->insns into the
 * bpf_insn representation and select a runtime for it.
 *
 * The conversion is done in two passes over a temporary copy of the
 * classic instructions: the first pass only computes the new length,
 * then @fp is reallocated to that size and the second pass writes
 * the converted instructions into it.
 *
 * Returns the (possibly reallocated) program on success. On any
 * failure the program is released through __bpf_prog_release() and
 * an ERR_PTR() carrying the error code is returned, so the caller
 * must not use @fp afterwards.
 */
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
        struct sock_filter *old_prog;
        struct bpf_prog *old_fp;
        int err, new_len, old_len = fp->len;
        bool seen_ld_abs = false;

        /* We are free to overwrite insns et al right here as it
         * won't be used at this point in time anymore internally
         * after the migration to the internal BPF instruction
         * representation.
         */
        BUILD_BUG_ON(sizeof(struct sock_filter) !=
                     sizeof(struct bpf_insn));

        /* Conversion cannot happen on overlapping memory areas,
         * so we need to keep the user BPF around until the 2nd
         * pass. At this time, the user BPF is stored in fp->insns.
         */
        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
                           GFP_KERNEL | __GFP_NOWARN);
        if (!old_prog) {
                err = -ENOMEM;
                goto out_err;
        }

        /* 1st pass: calculate the new program length. */
        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
                                 &seen_ld_abs);
        if (err)
                goto out_err_free;

        /* Expand fp for appending the new filter representation. */
        old_fp = fp;
        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
        if (!fp) {
                /* The old_fp is still around in case we couldn't
                 * allocate new memory, so uncharge on that one.
                 */
                fp = old_fp;
                err = -ENOMEM;
                goto out_err_free;
        }

        fp->len = new_len;

        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
                                 &seen_ld_abs);
        if (err)
                /* 2nd bpf_convert_filter() can fail only if it fails
                 * to allocate memory, remapping must succeed. Note,
                 * that at this time old_fp has already been released
                 * by krealloc().
                 */
                goto out_err_free;

        fp = bpf_prog_select_runtime(fp, &err);
        if (err)
                goto out_err_free;

        /* Success: the temporary classic copy is no longer needed. */
        kfree(old_prog);
        return fp;

out_err_free:
        /* Error after the temporary copy was made: drop it first. */
        kfree(old_prog);
out_err:
        /* Common error exit: release whichever program fp refers to. */
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/* Validate a freshly loaded classic program and make it runnable.
 *
 * The program is checked with bpf_check_classic() and, when @trans is
 * given, with that additional check/transformation handler (needed
 * f.e. for seccomp). It is then offered to the JIT; if the JIT did
 * not take it, it is migrated to the internal BPF representation.
 *
 * On a failed check the program is released and an ERR_PTR() is
 * returned. Otherwise the return value is either @fp itself (JITed)
 * or whatever bpf_migrate_filter() returns.
 */
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
                                           bpf_aux_classic_check_t trans)
{
        int err;

        fp->bpf_func = NULL;
        fp->jited = 0;

        err = bpf_check_classic(fp->insns, fp->len);
        if (!err && trans)
                err = trans(fp->insns, fp->len);
        if (err) {
                __bpf_prog_release(fp);
                return ERR_PTR(err);
        }

        /* Let the JIT compiler have a go at the filter first. */
        bpf_jit_compile(fp);
        if (fp->jited)
                return fp;

        /* Not JITed: translate to internal BPF for the interpreter. */
        return bpf_migrate_filter(fp);
}

/**
 *        bpf_prog_create - create an unattached filter
 *        @pfp: the unattached filter that is created
 *        @fprog: the filter program
 *
 * Build a filter that is not bound to any socket from a kernel
 * resident classic program. The program is sanity checked before it
 * is accepted. Returns zero and stores the new program in @pfp on
 * success; returns a negative errno code if the program is invalid
 * or memory is insufficient.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        /* Reject a missing filter or one of unacceptable length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        memcpy(prog->insns, fprog->filter, fsize);
        prog->len = fprog->len;

        /* Unattached filters are never copied back to user space
         * through sk_get_filter(), so no original copy is kept.
         */
        prog->orig_prog = NULL;

        /* On failure bpf_prepare_filter() has already freed prog. */
        prog = bpf_prepare_filter(prog, NULL);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *        bpf_prog_create_from_user - create an unattached filter from user buffer
 *        @pfp: the unattached filter that is created
 *        @fprog: the filter program
 *        @trans: post-classic verifier transformation handler
 *        @save_orig: save classic BPF program
 *
 * Same job as bpf_prog_create(), except that the instructions are
 * copied in from a user space buffer. A bpf_aux_classic_check_t
 * handler may be supplied, and the classic program can optionally be
 * preserved alongside the prepared one.
 *
 * Returns zero on success, -EINVAL for an unacceptable filter,
 * -EFAULT if the user buffer cannot be copied, -ENOMEM on allocation
 * failure, or the error reported by bpf_prepare_filter().
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        /* Reject a missing filter or one of unacceptable length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        if (copy_from_user(prog->insns, fprog->filter, fsize)) {
                err = -EFAULT;
                goto free_prog;
        }

        prog->len = fprog->len;
        prog->orig_prog = NULL;

        if (save_orig && bpf_prog_store_orig_filter(prog, fprog)) {
                err = -ENOMEM;
                goto free_prog;
        }

        /* On failure bpf_prepare_filter() has already freed prog. */
        prog = bpf_prepare_filter(prog, trans);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;

free_prog:
        __bpf_prog_free(prog);
        return err;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Exported entry point for releasing a program; simply forwards @fp
 * to __bpf_prog_release().
 */
void bpf_prog_destroy(struct bpf_prog *fp)
{
        __bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/* Wrap @prog in a new sk_filter, charge it to @sk and install it as
 * the socket's filter. A previously installed filter, if any, is
 * uncharged afterwards.
 *
 * Returns 0 on success or -ENOMEM if the sk_filter cannot be
 * allocated or charged. @prog itself is not released on failure.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
        struct sk_filter *new_filter, *prev_filter;

        new_filter = kmalloc(sizeof(*new_filter), GFP_KERNEL);
        if (!new_filter)
                return -ENOMEM;

        new_filter->prog = prog;

        if (!__sk_filter_charge(sk, new_filter)) {
                kfree(new_filter);
                return -ENOMEM;
        }
        refcount_set(&new_filter->refcnt, 1);

        /* Swap the socket's filter pointer, remembering the old one. */
        prev_filter = rcu_dereference_protected(sk->sk_filter,
                                                lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_filter, new_filter);

        if (prev_filter)
                sk_filter_uncharge(sk, prev_filter);

        return 0;
}

/* Build a prepared program from a user supplied classic filter on
 * behalf of socket @sk.
 *
 * Returns the prepared program, or an ERR_PTR(): -EPERM when the
 * socket has SOCK_FILTER_LOCKED set, -EINVAL for an unacceptable
 * filter, -EFAULT if the user buffer cannot be copied, -ENOMEM on
 * allocation failure, or the error from bpf_prepare_filter().
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return ERR_PTR(-EPERM);

        /* Reject a missing filter or one of unacceptable length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return ERR_PTR(-EINVAL);

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return ERR_PTR(-ENOMEM);

        err = -EFAULT;
        if (copy_from_user(prog->insns, fprog->filter, fsize))
                goto free_prog;

        prog->len = fprog->len;

        err = -ENOMEM;
        if (bpf_prog_store_orig_filter(prog, fprog))
                goto free_prog;

        /* On failure bpf_prepare_filter() has already freed prog. */
        return bpf_prepare_filter(prog, NULL);

free_prog:
        __bpf_prog_free(prog);
        return ERR_PTR(err);
}

/**
 *        sk_attach_filter - attach a socket filter
 *        @fprog: the filter program
 *        @sk: the socket to use
 *
 * Prepare the user's classic filter and attach it to @sk. The filter
 * is sanity checked first. Returns zero on success or a negative
 * errno code if the filter is rejected or memory is insufficient;
 * the prepared program is released when attaching fails.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        __bpf_prog_release(prog);
        return err;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/* Prepare the user's classic filter and hand it to
 * reuseport_attach_prog() for @sk. Programs larger than
 * sysctl_optmem_max are refused with -ENOMEM. The prepared program is
 * released whenever a non-zero error is returned.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (bpf_prog_size(prog->len) <= READ_ONCE(sysctl_optmem_max))
                err = reuseport_attach_prog(sk, prog);
        else
                err = -ENOMEM;

        if (err)
                __bpf_prog_release(prog);

        return err;
}

/* Look up the BPF_PROG_TYPE_SOCKET_FILTER program behind @ufd.
 * Returns ERR_PTR(-EPERM) without any lookup when @sk has
 * SOCK_FILTER_LOCKED set.
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
        if (!sock_flag(sk, SOCK_FILTER_LOCKED))
                return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);

        return ERR_PTR(-EPERM);
}

/* Attach the socket filter program referred to by @ufd to @sk.
 * Returns zero on success or a negative errno code; the program
 * reference obtained from __get_bpf() is dropped with bpf_prog_put()
 * when attaching fails.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog = __get_bpf(ufd, sk);
        int err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        bpf_prog_put(prog);
        return err;
}

/* Attach the program referred to by @ufd to @sk through
 * reuseport_attach_prog().
 *
 * The descriptor is first looked up as BPF_PROG_TYPE_SOCKET_FILTER;
 * if that lookup yields -EINVAL it is retried as
 * BPF_PROG_TYPE_SK_REUSEPORT.
 *
 * Returns zero on success, -EPERM when the socket has
 * SOCK_FILTER_LOCKED set, the lookup error, -ENOTSUPP for an
 * SK_REUSEPORT program on an unsupported socket, -ENOMEM for an
 * oversized socket filter program, or the error from
 * reuseport_attach_prog(). The program reference is dropped on every
 * error that occurs after the lookup.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int err;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (PTR_ERR(prog) == -EINVAL)
                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
                /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
                 * bpf prog (e.g. sockmap).  It depends on the
                 * limitation imposed by bpf_prog_load().
                 * Hence, sysctl_optmem_max is not checked.
                 */
                bool type_ok = sk->sk_type == SOCK_STREAM ||
                               sk->sk_type == SOCK_DGRAM;
                bool proto_ok = sk->sk_protocol == IPPROTO_UDP ||
                                sk->sk_protocol == IPPROTO_TCP;
                bool family_ok = sk->sk_family == AF_INET ||
                                 sk->sk_family == AF_INET6;

                if (!type_ok || !proto_ok || !family_ok) {
                        err = -ENOTSUPP;
                        goto err_prog_put;
                }
        } else if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
                /* BPF_PROG_TYPE_SOCKET_FILTER */
                err = -ENOMEM;
                goto err_prog_put;
        }

        err = reuseport_attach_prog(sk, prog);
        if (!err)
                return 0;

err_prog_put:
        bpf_prog_put(prog);
        return err;
}

/* Release a program that was attached for reuseport. A NULL @prog is
 * ignored. BPF_PROG_TYPE_SK_REUSEPORT programs are dropped with
 * bpf_prog_put(), all others go through bpf_prog_destroy().
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
        if (!prog)
                return;

        if (prog->type != BPF_PROG_TYPE_SK_REUSEPORT) {
                bpf_prog_destroy(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Scratch buffer of MAX_BPF_STACK bytes that can be viewed either as
 * an array of __be32 words (diff) or as raw bytes (buff); both views
 * share the same storage.
 */
struct bpf_scratchpad {
        union {
                __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
                u8     buff[MAX_BPF_STACK];
        };
};

/* One scratchpad instance per CPU. */
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);

/* Thin wrapper: forwards to skb_ensure_writable() for the first
 * @write_len bytes of @skb and returns its result unchanged.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
                                          unsigned int write_len)
{
        return skb_ensure_writable(skb, write_len);
}

/* Try to make the first @write_len bytes of @skb writable, then
 * recompute the BPF data pointers for the skb. The pointers are
 * recomputed whether or not the attempt succeeded; the result of the
 * attempt is returned.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        int ret;

        ret = __bpf_try_make_writable(skb, write_len);
        bpf_compute_data_pointers(skb);

        return ret;
}

/* Try to make skb_headlen() bytes of @skb writable. */
static int bpf_try_make_head_writable(struct sk_buff *skb)
{
        unsigned int head_len = skb_headlen(skb);

        return bpf_try_make_writable(skb, head_len);
}

/* For an skb at tc ingress, apply skb_postpush_rcsum() over the
 * mac_len bytes starting at the MAC header. Does nothing otherwise.
 */
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* For an skb at tc ingress, apply skb_postpull_rcsum() over the
 * mac_len bytes starting at the MAC header. Does nothing otherwise.
 */
static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* Helper: copy @len bytes from @from into @skb at @offset.
 *
 * Accepted @flags are BPF_F_RECOMPUTE_CSUM (run the postpull and
 * postpush checksum updates around the copy) and
 * BPF_F_INVALIDATE_HASH (clear the skb hash afterwards).
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, or -EFAULT
 * when @offset exceeds INT_MAX or the target range cannot be made
 * writable.
 */
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len, u64, flags)
{
        void *dst;

        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
                return -EINVAL;
        if (unlikely(offset > INT_MAX))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;

        dst = skb->data + offset;

        if (flags & BPF_F_RECOMPUTE_CSUM) {
                /* Pull the old bytes out of the checksum, overwrite
                 * them, then push the new bytes back in.
                 */
                __skb_postpull_rcsum(skb, dst, len, offset);
                memcpy(dst, from, len);
                __skb_postpush_rcsum(skb, dst, len, offset);
        } else {
                memcpy(dst, from, len);
        }

        if (flags & BPF_F_INVALIDATE_HASH)
                skb_clear_hash(skb);

        return 0;
}

/* Function prototype descriptor for bpf_skb_store_bytes(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), offset, from (memory
 * pointer), len (constant size), flags.
 */
static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
        .func                = bpf_skb_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Helper: read @len bytes of @skb starting at @offset into @to.
 *
 * Returns 0 on success. If @offset exceeds INT_MAX or
 * skb_header_pointer() cannot provide the bytes, the destination
 * buffer is zeroed and -EFAULT is returned.
 */
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
           void *, to, u32, len)
{
        void *src;

        if (likely(offset <= INT_MAX)) {
                src = skb_header_pointer(skb, offset, len, to);
                if (likely(src)) {
                        /* Copy only when the data was not already
                         * placed in the caller's buffer.
                         */
                        if (src != to)
                                memcpy(to, src, len);
                        return 0;
                }
        }

        memset(to, 0, len);
        return -EFAULT;
}

/* Function prototype descriptor for bpf_skb_load_bytes(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), offset, to (pointer to
 * uninitialized memory), len (constant size).
 */
static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
        .func                = bpf_skb_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Helper: read @len bytes from the skb of a flow dissector context,
 * starting at @offset, into @to.
 *
 * Returns 0 on success. If @offset exceeds 0xffff, the context has
 * no skb, or skb_header_pointer() cannot provide the bytes, the
 * destination buffer is zeroed and -EFAULT is returned.
 */
BPF_CALL_4(bpf_flow_dissector_load_bytes,
           const struct bpf_flow_dissector *, ctx, u32, offset,
           void *, to, u32, len)
{
        void *src;

        if (unlikely(offset > 0xffff) || unlikely(!ctx->skb))
                goto err_clear;

        src = skb_header_pointer(ctx->skb, offset, len, to);
        if (unlikely(!src))
                goto err_clear;

        /* Copy only when the data is not already in the buffer. */
        if (src != to)
                memcpy(to, src, len);

        return 0;

err_clear:
        memset(to, 0, len);
        return -EFAULT;
}

/* Function prototype descriptor for bpf_flow_dissector_load_bytes():
 * integer return, gpl_only is false. Argument types follow the
 * helper's parameter order: ctx (context pointer), offset, to
 * (pointer to uninitialized memory), len (constant size).
 */
static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
        .func                = bpf_flow_dissector_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Helper: read @len bytes into @to, where @offset is counted from
 * the header selected by @start_header (BPF_HDR_START_MAC or
 * BPF_HDR_START_NET).
 *
 * Returns 0 on success. The destination buffer is zeroed and -EFAULT
 * returned when @offset exceeds 0xffff, @start_header is unknown, the
 * MAC header was requested but never set, or the requested range
 * extends beyond the skb tail pointer.
 */
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
           u32, offset, void *, to, u32, len, u32, start_header)
{
        u8 *end = skb_tail_pointer(skb);
        u8 *base, *src;

        if (unlikely(offset > 0xffff))
                goto err_clear;

        if (start_header == BPF_HDR_START_MAC) {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        goto err_clear;
                base = skb_mac_header(skb);
        } else if (start_header == BPF_HDR_START_NET) {
                base = skb_network_header(skb);
        } else {
                goto err_clear;
        }

        src = base + offset;

        /* The whole range must end at or before the tail pointer. */
        if (unlikely(src + len > end))
                goto err_clear;

        memcpy(to, src, len);
        return 0;

err_clear:
        memset(to, 0, len);
        return -EFAULT;
}

/* Function prototype descriptor for bpf_skb_load_bytes_relative():
 * integer return, gpl_only is false. Argument types follow the
 * helper's parameter order: skb (context pointer), offset, to
 * (pointer to uninitialized memory), len (constant size),
 * start_header.
 */
static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
        .func                = bpf_skb_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Helper: try to make @len bytes of @skb writable; a @len of zero
 * means skb_headlen() bytes. Returns the result of
 * bpf_try_make_writable().
 */
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        u32 pull_len = len ? len : skb_headlen(skb);

        /* Two ways to use this: when a direct read/write test fails
         * at runtime, the program can pull in more data and retry,
         * because the previous checks are implicitly invalidated
         * here.
         *
         * Alternatively, when the amount that must be readable or
         * writable is known up front, this can be called once at the
         * start of the program for the direct access case. Either
         * way it lifts the restriction of only the current headroom
         * being accessible.
         */
        return bpf_try_make_writable(skb, pull_len);
}

/* Function prototype descriptor for bpf_skb_pull_data(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), len.
 */
static const struct bpf_func_proto bpf_skb_pull_data_proto = {
        .func                = bpf_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Helper: return @sk when sk_fullsock() accepts it, NULL otherwise. */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Function prototype descriptor for bpf_sk_fullsock(): returns a
 * socket pointer or NULL, gpl_only is false. The single argument is
 * typed ARG_PTR_TO_SOCK_COMMON.
 */
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
        .func                = bpf_sk_fullsock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* sk_skb flavour of bpf_try_make_writable(): try to make the first
 * @write_len bytes of @skb writable, then call
 * bpf_compute_data_end_sk_skb() regardless of the outcome. The
 * result of the attempt is returned.
 */
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
{
        int ret;

        ret = __bpf_try_make_writable(skb, write_len);
        bpf_compute_data_end_sk_skb(skb);

        return ret;
}

/* Helper: try to make @len bytes of @skb writable; a @len of zero
 * means skb_headlen() bytes. Returns the result of
 * sk_skb_try_make_writable().
 */
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        u32 pull_len = len ? len : skb_headlen(skb);

        /* Two ways to use this: when a direct read/write test fails
         * at runtime, the program can pull in more data and retry,
         * because the previous checks are implicitly invalidated
         * here.
         *
         * Alternatively, when the amount that must be readable or
         * writable is known up front, this can be called once at the
         * start of the program for the direct access case. Either
         * way it lifts the restriction of only the current headroom
         * being accessible.
         */
        return sk_skb_try_make_writable(skb, pull_len);
}

/* Function prototype descriptor for sk_skb_pull_data(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), len.
 */
static const struct bpf_func_proto sk_skb_pull_data_proto = {
        .func                = sk_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Helper: update the 16-bit checksum stored at @offset in @skb.
 *
 * The low bits of @flags (BPF_F_HDR_FIELD_MASK) select the mode:
 *   0 - @to is applied through csum_replace_by_diff(); @from must be 0
 *   2 - csum_replace2() with @from and @to
 *   4 - csum_replace4() with @from and @to
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, an unsupported
 * mode or a non-zero @from in mode 0, and -EFAULT when @offset is
 * above 0xffff, is odd, or the checksum field cannot be made
 * writable.
 */
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        if (unlikely(offset > 0xffff || offset & 1))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        switch (flags & BPF_F_HDR_FIELD_MASK) {
        case 4:
                csum_replace4(csum, from, to);
                return 0;
        case 2:
                csum_replace2(csum, from, to);
                return 0;
        case 0:
                if (unlikely(from != 0))
                        return -EINVAL;

                csum_replace_by_diff(csum, to);
                return 0;
        default:
                return -EINVAL;
        }
}

/* Function prototype descriptor for bpf_l3_csum_replace(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), offset, from, to, flags.
 */
static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
        .func                = bpf_l3_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Helper: update the 16-bit checksum stored at @offset in @skb using
 * the inet_proto_csum_replace*() family.
 *
 * The low bits of @flags (BPF_F_HDR_FIELD_MASK) select the mode:
 *   0 - @to is applied through inet_proto_csum_replace_by_diff();
 *       @from must be 0
 *   2 - inet_proto_csum_replace2() with @from and @to
 *   4 - inet_proto_csum_replace4() with @from and @to
 *
 * Further flags: BPF_F_PSEUDO_HDR and BPF_F_IPV6 are passed on to the
 * replace routines. With BPF_F_MARK_MANGLED_0, a stored checksum of
 * zero is left untouched unless BPF_F_MARK_ENFORCE is also set, and a
 * result of zero is written as CSUM_MANGLED_0.
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, an unsupported
 * mode or a non-zero @from in mode 0, and -EFAULT when @offset is
 * above 0xffff, is odd, or the checksum field cannot be made
 * writable.
 */
BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        bool pseudo_hdr  = flags & BPF_F_PSEUDO_HDR;
        bool mangle_zero = flags & BPF_F_MARK_MANGLED_0;
        bool force_mark  = flags & BPF_F_MARK_ENFORCE;
        bool ipv6        = flags & BPF_F_IPV6;
        u64 field_size   = flags & BPF_F_HDR_FIELD_MASK;
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
                return -EINVAL;
        if (unlikely(offset > 0xffff || offset & 1))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        /* A zero checksum is left alone unless enforcement is asked for */
        if (mangle_zero && !force_mark && !*csum)
                return 0;

        if (field_size == 0) {
                if (unlikely(from != 0))
                        return -EINVAL;

                inet_proto_csum_replace_by_diff(csum, skb, to, pseudo_hdr, ipv6);
        } else if (field_size == 2) {
                inet_proto_csum_replace2(csum, skb, from, to, pseudo_hdr);
        } else if (field_size == 4) {
                inet_proto_csum_replace4(csum, skb, from, to, pseudo_hdr);
        } else {
                return -EINVAL;
        }

        /* Never store a plain zero when mangled-zero marking is on */
        if (mangle_zero && !*csum)
                *csum = CSUM_MANGLED_0;

        return 0;
}

/* Function prototype descriptor for bpf_l4_csum_replace(): integer
 * return, gpl_only is false. Argument types follow the helper's
 * parameter order: skb (context pointer), offset, from, to, flags.
 */
static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
        .func                = bpf_l4_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Helper: compute a checksum over the bitwise complement of @from
 * followed by @to, starting from @seed.
 *
 * Both sizes must be multiples of sizeof(__be32) and their sum must
 * fit in the per-CPU scratchpad; otherwise -EINVAL is returned.
 * On success the csum_partial() result is returned.
 */
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
           __be32 *, to, u32, to_size, __wsum, seed)
{
        struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
        u32 diff_size = from_size + to_size;
        u32 n_from = from_size / sizeof(__be32);
        u32 n_to = to_size / sizeof(__be32);
        __be32 *out = sp->diff;
        u32 i;

        /* This is quite flexible, some examples:
         *
         * from_size == 0, to_size > 0,  seed := csum --> pushing data
         * from_size > 0,  to_size == 0, seed := csum --> pulling data
         * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
         *
         * Even for diffing, from_size and to_size don't need to be equal.
         */
        if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
                     diff_size > sizeof(sp->diff)))
                return -EINVAL;

        /* Inverted "from" words first, "to" words right after them. */
        for (i = 0; i < n_from; i++)
                *out++ = ~from[i];
        for (i = 0; i < n_to; i++)
                *out++ = to[i];

        return csum_partial(sp->diff, diff_size, seed);
}

/* Function prototype descriptor for bpf_csum_diff(): integer return,
 * gpl_only is false, pkt_access is true. Argument types follow the
 * helper's parameter order: from (memory pointer or NULL), from_size
 * (constant size or zero), to (memory pointer or NULL), to_size
 * (constant size or zero), seed.
 */
static const struct bpf_func_proto bpf_csum_diff_proto = {
        .func                = bpf_csum_diff,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM_OR_NULL,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_PTR_TO_MEM_OR_NULL,
        .arg4_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg5_type        = ARG_ANYTHING,
};

/* Helper: add @csum to skb->csum and return the new value. Only
 * supported when skb->ip_summed is CHECKSUM_COMPLETE; -ENOTSUPP is
 * returned otherwise.
 */
BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
        /* Intended to be paired with bpf_csum_diff() for direct
         * packet writes. Rotating the csum for alignment and
         * emulating csum_sub() are left to the eBPF program.
         */
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return -ENOTSUPP;

        skb->csum = csum_add(skb->csum, csum);
        return skb->csum;
}

/* Prototype descriptor for bpf_csum_update(): context pointer plus a csum. */
static const struct bpf_func_proto bpf_csum_update_proto = {
        .func      = bpf_csum_update,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
        /* Intended companion of bpf_skb_adjust_room() for header
         * encap/decap, e.g. when BPF_F_ADJ_ROOM_NO_CSUM_RESET is passed
         * in its flags.
         *
         * The three modifying operations return 0; the query returns the
         * current csum_level, or -EACCES when the skb is not in
         * CHECKSUM_UNNECESSARY state. Unknown operations yield -EINVAL.
         */
        switch (level) {
        case BPF_CSUM_LEVEL_QUERY:
                if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                        return -EACCES;
                return skb->csum_level;
        case BPF_CSUM_LEVEL_INC:
                __skb_incr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_DEC:
                __skb_decr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_RESET:
                __skb_reset_checksum_unnecessary(skb);
                return 0;
        default:
                return -EINVAL;
        }
}

/* Prototype descriptor for bpf_csum_level(): context pointer plus an op code. */
static const struct bpf_func_proto bpf_csum_level_proto = {
        .func      = bpf_csum_level,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Receive-side redirect for devices with a MAC header: simply delegates to
 * dev_forward_skb() and passes its result through.
 */
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int ret = dev_forward_skb(dev, skb);

        return ret;
}

/* Receive-side redirect for the no-MAC-header path: runs
 * ____dev_forward_skb() and, only if that succeeded, re-targets the skb at
 * @dev and queues it with netif_rx(). Returns the first non-zero result, or
 * the netif_rx() result.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
{
        int err = ____dev_forward_skb(dev, skb);

        if (unlikely(err))
                return err;

        skb->dev = dev;
        return netif_rx(skb);
}

/* Transmit-side redirect: queue @skb for transmission on @dev.
 *
 * When dev_xmit_recursion() reports that the recursion limit is hit, the skb
 * is freed and -ENETDOWN returned. Otherwise the result of dev_queue_xmit()
 * is returned; the call is bracketed by the recursion counter inc/dec.
 */
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int err;

        if (!dev_xmit_recursion()) {
                skb->dev = dev;
                skb->tstamp = 0;

                dev_xmit_recursion_inc();
                err = dev_queue_xmit(skb);
                dev_xmit_recursion_dec();

                return err;
        }

        net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
        kfree_skb(skb);
        return -ENETDOWN;
}

/* Redirect @skb to a device on the no-MAC-header path (chosen by
 * __bpf_redirect() when dev_is_mac_header_xmit() is false).
 *
 * Everything in front of the network header is pulled off before the skb is
 * passed to the rx or tx path depending on BPF_F_INGRESS in @flags. An skb
 * with no payload beyond that header is freed and -ERANGE is returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        unsigned int hdr_len = skb_network_offset(skb);

        if (unlikely(skb->len <= hdr_len))
                goto drop;

        if (hdr_len) {
                __skb_pull(skb, hdr_len);
                if (unlikely(!skb->len))
                        goto drop;

                /* At ingress, the mac header has already been pulled once.
                 * At egress, skb_postpull_rcsum has to be done in case that
                 * the skb is originated from ingress (i.e. a forwarded skb)
                 * to ensure that rcsum starts at net header.
                 */
                if (!skb_at_tc_ingress(skb))
                        skb_postpull_rcsum(skb, skb_mac_header(skb), hdr_len);
        }

        skb_pop_mac_header(skb);
        skb_reset_mac_len(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb_no_mac(dev, skb);
        return __bpf_tx_skb(dev, skb);

drop:
        kfree_skb(skb);
        return -ERANGE;
}

/* Redirect @skb to a device on the MAC-header path (chosen by
 * __bpf_redirect() when dev_is_mac_header_xmit() is true).
 *
 * The skb is freed and -ERANGE returned when it is empty or its mac_header
 * offset is not strictly below its network_header offset. Otherwise it goes
 * to the rx or tx path depending on BPF_F_INGRESS in @flags.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        /* Verify that a link layer header is carried */
        bool no_ll_hdr = skb->mac_header >= skb->network_header ||
                         skb->len == 0;

        if (unlikely(no_ll_hdr)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        bpf_push_mac_rcsum(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb(dev, skb);
        return __bpf_tx_skb(dev, skb);
}

/* Pick the redirect flavour for @dev based on dev_is_mac_header_xmit(). */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
                          u32 flags)
{
        if (!dev_is_mac_header_xmit(dev))
                return __bpf_redirect_no_mac(skb, dev, flags);

        return __bpf_redirect_common(skb, dev, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Resolve the IPv6 neighbour for @skb on @dev and transmit through it.
 *
 * @net: not referenced in this function body.
 * @skb: packet to send; it is either handed to neigh_output() or freed here.
 * @dev: egress device.
 * @nh:  optional explicit next hop; when NULL the next hop is derived from
 *       the skb's dst entry.
 *
 * Returns the neigh_output() result on the transmit path, -ENOMEM when the
 * headroom reallocation fails, or -ENETDOWN when the recursion limit is hit
 * or the neighbour lookup returns an error pointer.
 */
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *nexthop;
        struct dst_entry *dst = NULL;
        struct neighbour *neigh;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb->tstamp = 0;

        /* Not enough headroom for this device's link-layer header: move
         * the packet into a reallocated skb and carry the socket owner
         * over before releasing the old one.
         */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, hh_len);
                if (unlikely(!skb2)) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        /* Neighbour lookup and output both happen inside the RCU-bh
         * read-side section.
         */
        rcu_read_lock_bh();
        if (!nh) {
                dst = skb_dst(skb);
                nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
                                      &ipv6_hdr(skb)->daddr);
        } else {
                nexthop = &nh->ipv6_nh;
        }
        neigh = ip_neigh_gw6(dev, nexthop);
        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, false);
                dev_xmit_recursion_dec();
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();
        /* dst is only set when the next hop came from the skb's route. */
        if (dst)
                IP6_INC_STATS(dev_net(dst->dev),
                              ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

/* IPv6 leg of the neighbour redirect.
 *
 * Without an explicit next hop (@nh == NULL) a route lookup is done from the
 * packet's IPv6 header fields and the resulting dst replaces the one on the
 * skb. With @nh, only nh_family == AF_INET6 is accepted.
 *
 * Returns NET_XMIT_SUCCESS when bpf_out_neigh_v6() reports success per
 * net_xmit_eval(), NET_XMIT_DROP otherwise. Every failure path bumps
 * dev->stats.tx_errors. The skb is freed here only on the out_drop path;
 * after bpf_out_neigh_v6() has been called it is not touched again.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;

        if (!nh) {
                struct dst_entry *dst;
                /* Flow key built from the packet header, bound to @dev. */
                struct flowi6 fl6 = {
                        .flowi6_flags = FLOWI_FLAG_ANYSRC,
                        .flowi6_mark  = skb->mark,
                        .flowlabel    = ip6_flowinfo(ip6h),
                        .flowi6_oif   = dev->ifindex,
                        .flowi6_proto = ip6h->nexthdr,
                        .daddr              = ip6h->daddr,
                        .saddr              = ip6h->saddr,
                };

                dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
                if (IS_ERR(dst))
                        goto out_drop;

                /* Swap the skb's current dst for the freshly looked-up one. */
                skb_dst_drop(skb);
                skb_dst_set(skb, dst);
        } else if (nh->nh_family != AF_INET6) {
                goto out_drop;
        }

        err = bpf_out_neigh_v6(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                dev->stats.tx_errors++;
        else
                ret = NET_XMIT_SUCCESS;
        goto out_xmit;
out_drop:
        dev->stats.tx_errors++;
        kfree_skb(skb);
out_xmit:
        return ret;
}
#else
/* Fallback used when CONFIG_IPV6 is not enabled: the packet cannot be
 * forwarded, so it is freed and reported as dropped.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);

        return NET_XMIT_DROP;
}
#endif /* CONFIG_IPV6 */

#if IS_ENABLED(CONFIG_INET)
/* Resolve the neighbour for an IPv4 @skb on @dev and transmit through it.
 *
 * @net: not referenced in this function body.
 * @skb: packet to send; it is either handed to neigh_output() or freed here.
 * @dev: egress device.
 * @nh:  optional explicit next hop. When NULL the neighbour comes from the
 *       skb's route; otherwise nh_family selects an IPv6 or IPv4 gateway
 *       lookup, and any other family drops the packet.
 *
 * Returns the neigh_output() result on the transmit path, -ENOMEM when the
 * headroom reallocation fails, or -ENETDOWN when the recursion limit is hit,
 * the next-hop family is unsupported, or the neighbour lookup returns an
 * error pointer.
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb->tstamp = 0;

        /* Not enough headroom for this device's link-layer header: move
         * the packet into a reallocated skb and carry the socket owner
         * over before releasing the old one.
         */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, hh_len);
                if (unlikely(!skb2)) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        rcu_read_lock_bh();
        if (!nh) {
                struct dst_entry *dst = skb_dst(skb);
                struct rtable *rt = container_of(dst, struct rtable, dst);

                neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
        } else if (nh->nh_family == AF_INET6) {
                neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
                is_v6gw = true;
        } else if (nh->nh_family == AF_INET) {
                neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
        } else {
                /* Unsupported family: leave the RCU section before dropping. */
                rcu_read_unlock_bh();
                goto out_drop;
        }

        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, is_v6gw);
                dev_xmit_recursion_dec();
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

/* IPv4 leg of the neighbour redirect.
 *
 * Without an explicit next hop (@nh == NULL) an output route lookup is done
 * from the packet's IPv4 header fields; only RTN_UNICAST and RTN_LOCAL
 * routes are accepted, and the resulting dst replaces the one on the skb.
 * With @nh the lookup is skipped and family validation is left to
 * bpf_out_neigh_v4().
 *
 * Returns NET_XMIT_SUCCESS when bpf_out_neigh_v4() reports success per
 * net_xmit_eval(), NET_XMIT_DROP otherwise. Every failure path bumps
 * dev->stats.tx_errors. The skb is freed here only on the out_drop path.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;

        if (!nh) {
                /* Flow key built from the packet header, bound to @dev. */
                struct flowi4 fl4 = {
                        .flowi4_flags = FLOWI_FLAG_ANYSRC,
                        .flowi4_mark  = skb->mark,
                        .flowi4_tos   = RT_TOS(ip4h->tos),
                        .flowi4_oif   = dev->ifindex,
                        .flowi4_proto = ip4h->protocol,
                        .daddr              = ip4h->daddr,
                        .saddr              = ip4h->saddr,
                };
                struct rtable *rt;

                rt = ip_route_output_flow(net, &fl4, NULL);
                if (IS_ERR(rt))
                        goto out_drop;
                /* Release the route again if its type is not acceptable. */
                if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
                        ip_rt_put(rt);
                        goto out_drop;
                }

                /* Swap the skb's current dst for the freshly looked-up one. */
                skb_dst_drop(skb);
                skb_dst_set(skb, &rt->dst);
        }

        err = bpf_out_neigh_v4(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                dev->stats.tx_errors++;
        else
                ret = NET_XMIT_SUCCESS;
        goto out_xmit;
out_drop:
        dev->stats.tx_errors++;
        kfree_skb(skb);
out_xmit:
        return ret;
}
#else
/* Fallback used when CONFIG_INET is not enabled: the packet cannot be
 * forwarded, so it is freed and reported as dropped.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);

        return NET_XMIT_DROP;
}
#endif /* CONFIG_INET */

/* Entry point for neighbour-based redirects.
 *
 * Strips the Ethernet header from @skb and dispatches on skb->protocol to
 * the IPv4 or IPv6 handler. The skb is freed and -ENOTSUPP returned when its
 * mac_header offset is not below its network_header offset, when the
 * destination MAC is multicast, or when the protocol is neither IPv4 nor
 * IPv6.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
                                struct bpf_nh_params *nh)
{
        struct ethhdr *eth = eth_hdr(skb);

        if (unlikely(skb->mac_header >= skb->network_header))
                goto unsupported;

        bpf_push_mac_rcsum(skb);
        if (is_multicast_ether_addr(eth->h_dest))
                goto unsupported;

        /* Drop the Ethernet header; the network header now starts at data. */
        skb_pull(skb, sizeof(*eth));
        skb_unset_mac_header(skb);
        skb_reset_network_header(skb);

        if (skb->protocol == htons(ETH_P_IP))
                return __bpf_redirect_neigh_v4(skb, dev, nh);
        if (skb->protocol == htons(ETH_P_IPV6))
                return __bpf_redirect_neigh_v6(skb, dev, nh);

unsupported:
        kfree_skb(skb);
        return -ENOTSUPP;
}

/* Internal, non-exposed redirect flags. */
enum {
        /* Set by bpf_redirect_neigh(): redirect via neighbour resolution. */
        BPF_F_NEIGH        = (1ULL << 1),
        /* Set by bpf_redirect_peer(): redirect to the device's peer. */
        BPF_F_PEER        = (1ULL << 2),
        /* Set by bpf_redirect_neigh() when next-hop parameters were given. */
        BPF_F_NEXTHOP        = (1ULL << 3),
        /* Mask of all internal bits; rejected in caller-supplied flags by
         * bpf_redirect() and bpf_clone_redirect().
         */
#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};

/* Clone @skb and redirect the clone to the device with index @ifindex.
 *
 * Returns -EINVAL for unsupported flag bits or an unknown ifindex, -EBADMSG
 * for a GSO skb without a gso_type, -ENOMEM when cloning or un-cloning the
 * original fails, otherwise the result of __bpf_redirect() on the clone.
 */
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
        struct net_device *target;
        struct sk_buff *copy;

        /* Only BPF_F_INGRESS may be set by the caller. */
        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return -EINVAL;

        /* BPF test infra's convert___skb_to_skb() can create type-less
         * GSO packets. gso_features_check() will detect this as a bad
         * offload. However, lets not leak them out in the first place.
         */
        if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type))
                return -EBADMSG;

        target = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
        if (unlikely(!target))
                return -EINVAL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (unlikely(!copy))
                return -ENOMEM;

        /* Direct packet writes rely on the skb the program keeps working
         * with being uncloned. If restoring that invariant on the original
         * fails, the clone just created is released again.
         */
        if (unlikely(bpf_try_make_head_writable(skb))) {
                kfree_skb(copy);
                return -ENOMEM;
        }

        return __bpf_redirect(copy, target, flags);
}

/* Prototype descriptor for bpf_clone_redirect(): context, ifindex, flags. */
static const struct bpf_func_proto bpf_clone_redirect_proto = {
        .func      = bpf_clone_redirect,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};

/* Per-CPU redirect state: filled in by the bpf_redirect*() helpers below and
 * read back (then cleared) by skb_do_redirect(). Exported as a GPL-only
 * per-CPU symbol.
 */
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);

/* Carry out the redirect recorded in this CPU's bpf_redirect_info.
 *
 * The recorded target index and flags are consumed (reset to 0) before any
 * further checks. Returns:
 *   -EINVAL  target device missing or peer redirect not possible; skb freed
 *   -EAGAIN  peer redirect set up: skb->dev now points at the peer device
 *   otherwise the result of the neighbour or plain redirect path
 */
int skb_do_redirect(struct sk_buff *skb)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        struct net *net = dev_net(skb->dev);
        u32 flags = ri->flags;
        struct net_device *dev;

        dev = dev_get_by_index_rcu(net, ri->tgt_index);
        ri->tgt_index = 0;
        ri->flags = 0;
        if (unlikely(!dev))
                goto out_drop;

        if (flags & BPF_F_PEER) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /* Needs a device that can report a peer, and an skb that is
                 * at tc ingress.
                 */
                if (unlikely(!ops->ndo_get_peer_dev ||
                             !skb_at_tc_ingress(skb)))
                        goto out_drop;

                /* The peer must exist, accept this skb, and live in a
                 * different network namespace than the current one.
                 */
                dev = ops->ndo_get_peer_dev(dev);
                if (unlikely(!dev ||
                             !is_skb_forwardable(dev, skb) ||
                             net_eq(net, dev_net(dev))))
                        goto out_drop;

                skb->dev = dev;
                return -EAGAIN;
        }

        if (flags & BPF_F_NEIGH) {
                struct bpf_nh_params *nh = NULL;

                if (flags & BPF_F_NEXTHOP)
                        nh = &ri->nh;
                return __bpf_redirect_neigh(skb, dev, nh);
        }

        return __bpf_redirect(skb, dev, flags);

out_drop:
        kfree_skb(skb);
        return -EINVAL;
}

/* Record a redirect to @ifindex in this CPU's bpf_redirect_info.
 *
 * Returns TC_ACT_REDIRECT on success, or TC_ACT_SHOT when @flags carries
 * anything other than BPF_F_INGRESS.
 */
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri;

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return TC_ACT_SHOT;

        ri = this_cpu_ptr(&bpf_redirect_info);
        ri->flags = flags;
        ri->tgt_index = ifindex;

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect(): ifindex and flags. */
static const struct bpf_func_proto bpf_redirect_proto = {
        .func      = bpf_redirect,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_ANYTHING,
};

/* Record a peer redirect to @ifindex in this CPU's bpf_redirect_info.
 *
 * No flags are supported: any non-zero @flags yields TC_ACT_SHOT. On success
 * the internal BPF_F_PEER flag is stored and TC_ACT_REDIRECT returned.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri;

        if (unlikely(flags))
                return TC_ACT_SHOT;

        ri = this_cpu_ptr(&bpf_redirect_info);
        ri->flags = BPF_F_PEER;
        ri->tgt_index = ifindex;

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect_peer(): ifindex and flags. */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
        .func      = bpf_redirect_peer,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_ANYTHING,
};

/* Record a neighbour-based redirect to @ifindex in this CPU's
 * bpf_redirect_info.
 *
 * @params/@plen optionally supply next-hop parameters. A non-zero @plen
 * shorter than *params, or any non-zero @flags, yields TC_ACT_SHOT. With a
 * non-zero @plen the parameters are copied into ri->nh and BPF_F_NEXTHOP is
 * set in addition to BPF_F_NEIGH. Returns TC_ACT_REDIRECT on success.
 */
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
           int, plen, u64, flags)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

        /* The memcpy() below relies on both layouts having the same size. */
        BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));

        if (unlikely(flags || (plen && plen < sizeof(*params))))
                return TC_ACT_SHOT;

        ri->tgt_index = ifindex;
        if (plen) {
                ri->flags = BPF_F_NEIGH | BPF_F_NEXTHOP;
                memcpy(&ri->nh, params, sizeof(ri->nh));
        } else {
                ri->flags = BPF_F_NEIGH;
        }

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect_neigh(): ifindex, an optional
 * (pointer, size) pair typed as *_OR_NULL / *_OR_ZERO, and flags.
 */
static const struct bpf_func_proto bpf_redirect_neigh_proto = {
        .func      = bpf_redirect_neigh,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type = ARG_ANYTHING,
};

/* Store @bytes in msg->apply_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->apply_bytes = bytes;

        return 0;
}

/* Prototype descriptor for bpf_msg_apply_bytes(): context and byte count. */
static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
        .func      = bpf_msg_apply_bytes,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Store @bytes in msg->cork_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->cork_bytes = bytes;

        return 0;
}

/* Re-derive msg->sg.curr and msg->sg.copybreak after the scatterlist ring
 * has been modified.
 *
 * For an empty message curr is set to the ring start with copybreak 0.
 * Otherwise curr is set to the element just before sg.end and copybreak to
 * that element's length.
 */
static void sk_msg_reset_curr(struct sk_msg *msg)
{
        u32 last;

        if (msg->sg.size) {
                last = msg->sg.end;
                sk_msg_iter_var_prev(last);

                msg->sg.curr = last;
                msg->sg.copybreak = msg->sg.data[last].length;
                return;
        }

        msg->sg.curr = msg->sg.start;
        msg->sg.copybreak = 0;
}

/* Prototype descriptor for bpf_msg_cork_bytes(): context and byte count. */
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
        .func      = bpf_msg_cork_bytes,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Make message bytes [@start, @end) available through msg->data /
 * msg->data_end as one contiguous range.
 *
 * If the range already lies inside a single scatterlist element whose bit is
 * not set in msg->sg.copy, only the data pointers are updated. Otherwise
 * every element touched by the range is copied whole into a newly allocated
 * page buffer that replaces them in the ring, and the ring is compacted.
 *
 * Returns 0 on success, -EINVAL for non-zero @flags, an empty or inverted
 * range, or a range reaching outside the message, and -ENOMEM when the page
 * allocation fails.
 */
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
           u32, end, u64, flags)
{
        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
        u32 first_sge, last_sge, i, shift, bytes_sg_total;
        struct scatterlist *sge;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags || end <= start))
                return -EINVAL;

        /* First find the starting scatterlist element */
        /* On exit, offset is the message offset at which element i begins
         * and len is that element's length.
         */
        i = msg->sg.start;
        do {
                offset += len;
                len = sk_msg_elem(msg, i)->length;
                if (start < offset + len)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        if (unlikely(start >= offset + len))
                return -EINVAL;

        first_sge = i;
        /* The start may point into the sg element so we need to also
         * account for the headroom.
         */
        bytes_sg_total = start - offset + bytes;
        if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
                goto out;

        /* At this point we need to linearize multiple scatterlist
         * elements or a single shared page. Either way we need to
         * copy into a linear buffer exclusively owned by BPF. Then
         * place the buffer in the scatterlist and fixup the original
         * entries by removing the entries now in the linear buffer
         * and shifting the remaining entries. For now we do not try
         * to copy partial entries to avoid complexity of running out
         * of sg_entry slots. The downside is reading a single byte
         * will copy the entire sg entry.
         */
        /* Sum whole-element lengths until the requested range is covered;
         * last_sge ends up one past the final element to copy.
         */
        do {
                copy += sk_msg_elem(msg, i)->length;
                sk_msg_iter_var_next(i);
                if (bytes_sg_total <= copy)
                        break;
        } while (i != msg->sg.end);
        last_sge = i;

        if (unlikely(bytes_sg_total > copy))
                return -EINVAL;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy));
        if (unlikely(!page))
                return -ENOMEM;

        /* Copy each covered element into the new buffer, zero its length
         * and drop the reference on its old page.
         */
        raw = page_address(page);
        i = first_sge;
        do {
                sge = sk_msg_elem(msg, i);
                from = sg_virt(sge);
                len = sge->length;
                to = raw + poffset;

                memcpy(to, from, len);
                poffset += len;
                sge->length = 0;
                put_page(sg_page(sge));

                sk_msg_iter_var_next(i);
        } while (i != last_sge);

        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);

        /* To repair sg ring we need to shift entries. If we only
         * had a single entry though we can just replace it and
         * be done. Otherwise walk the ring and shift the entries.
         */
        WARN_ON_ONCE(last_sge == first_sge);
        /* Number of now-empty slots between first_sge and last_sge, taking
         * ring wrap-around into account.
         */
        shift = last_sge > first_sge ?
                last_sge - first_sge - 1 :
                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
        if (!shift)
                goto out;

        /* Move every entry after the merged range down by 'shift' slots,
         * clearing each source slot, until the source reaches sg.end.
         */
        i = first_sge;
        sk_msg_iter_var_next(i);
        do {
                u32 move_from;

                if (i + shift >= NR_MSG_FRAG_IDS)
                        move_from = i + shift - NR_MSG_FRAG_IDS;
                else
                        move_from = i + shift;
                if (move_from == msg->sg.end)
                        break;

                msg->sg.data[i] = msg->sg.data[move_from];
                msg->sg.data[move_from].length = 0;
                msg->sg.data[move_from].page_link = 0;
                msg->sg.data[move_from].offset = 0;
                sk_msg_iter_var_next(i);
        } while (1);

        /* Pull sg.end back by 'shift'; the unsigned comparison detects the
         * subtraction wrapping below zero and corrects for the ring size.
         */
        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
                      msg->sg.end - shift;
out:
        sk_msg_reset_curr(msg);
        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
        msg->data_end = msg->data + bytes;
        return 0;
}

/* Prototype descriptor for bpf_msg_pull_data(): context, start, end, flags. */
static const struct bpf_func_proto bpf_msg_pull_data_proto = {
        .func      = bpf_msg_pull_data,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
};

/* Insert @len bytes of newly allocated space into @msg at byte offset
 * @start.
 *
 * When the ring has enough free slots, the new buffer is placed in a slot of
 * its own; an element that @start falls inside is split into a front part
 * and a rear part (rsge) around it. When there are not enough free slots,
 * the element at @start is instead copied into the new buffer together with
 * the inserted gap.
 *
 * Returns 0 on success (also for @len == 0, which is a no-op), -EINVAL for
 * non-zero @flags or a @start beyond the end of the message, and -ENOMEM
 * when the page allocation fails.
 */
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        /* On exit, offset is the message offset at which the last visited
         * element begins and l is that element's length. i may equal
         * sg.end when start is exactly at the end of the message.
         */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        if (start > offset + l)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        /* If no space available will fallback to copy, we need at
         * least one scatterlist elem available to push data into
         * when start aligns to the beginning of an element or two
         * when it falls inside an element. We handle the start equals
         * offset case because its the common case for inserting a
         * header.
         */
        if (!space || (space == 1 && start != offset))
                copy = msg->sg.data[i].length;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy + len));
        if (unlikely(!page))
                return -ENOMEM;

        if (copy) {
                int front, back;

                raw = page_address(page);

                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                front = start - offset;
                back = psge->length - front;
                from = sg_virt(psge);

                /* Layout in the new buffer: front | len-byte gap | back */
                if (front)
                        memcpy(raw, from, front);

                if (back) {
                        from += front;
                        to = raw + front + len;

                        memcpy(to, from, back);
                }

                put_page(sg_page(psge));
                new = i;
                goto place_new;
        }

        if (start - offset) {
                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                rsge = sk_msg_elem_cpy(msg, i);

                psge->length = start - offset;
                rsge.length -= psge->length;
                /* The rear part begins right after the bytes kept in the
                 * front part. 'start' is an offset into the whole message,
                 * so the in-page offset must advance by the front length
                 * (start - offset), not by 'start' itself.
                 */
                rsge.offset += psge->length;

                sk_msg_iter_var_next(i);
                sg_unmark_end(psge);
                sg_unmark_end(&rsge);
        }

        /* Slot(s) to place newly allocated data */
        sk_msg_iter_next(msg, end);
        new = i;
        sk_msg_iter_var_next(i);

        if (i == msg->sg.end) {
                if (!rsge.length)
                        goto place_new;
                /* One more slot is needed for the rear part. */
                sk_msg_iter_next(msg, end);
                goto place_new;
        }

        /* Shift one or two slots as needed */
        sge = sk_msg_elem_cpy(msg, new);
        sg_unmark_end(&sge);

        nsge = sk_msg_elem_cpy(msg, i);
        if (rsge.length) {
                sk_msg_iter_var_next(i);
                nnsge = sk_msg_elem_cpy(msg, i);
                sk_msg_iter_next(msg, end);
        }

        /* Ripple the saved entries towards sg.end; with a rear part pending
         * the entries move by two slots, otherwise by one.
         */
        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sge = nsge;
                sk_msg_iter_var_next(i);
                if (rsge.length) {
                        nsge = nnsge;
                        nnsge = sk_msg_elem_cpy(msg, i);
                } else {
                        nsge = sk_msg_elem_cpy(msg, i);
                }
        }

place_new:
        /* Place newly allocated data buffer */
        sk_mem_charge(msg->sk, len);
        msg->sg.size += len;
        __clear_bit(new, &msg->sg.copy);
        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
        if (rsge.length) {
                /* The rear part shares its page with the front part, so it
                 * takes its own page reference.
                 */
                get_page(sg_page(&rsge));
                sk_msg_iter_var_next(new);
                msg->sg.data[new] = rsge;
        }

        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Prototype descriptor for bpf_msg_push_data(): context, start, len, flags. */
static const struct bpf_func_proto bpf_msg_push_data_proto = {
        .func      = bpf_msg_push_data,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
};

/* Remove ring element @i from @msg: drop the reference on its page, move
 * every following entry (up to and including the slot at sg.end) one slot
 * towards the start, then step sg.end back by one.
 */
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
        int dst;

        put_page(sg_page(sk_msg_elem(msg, i)));

        for (;;) {
                dst = i;
                sk_msg_iter_var_next(i);
                msg->sg.data[dst] = msg->sg.data[i];
                if (i == msg->sg.end)
                        break;
        }

        sk_msg_iter_prev(msg, end);
}

static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
        struct scatterlist tmp, sge;

        sk_msg_iter_next(msg, end);
        sge = sk_msg_elem_cpy(msg, i);
        sk_msg_iter_var_next(i);
        tmp = sk_msg_elem_cpy(msg, i);

        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sk_msg_iter_var_next(i);
                sge = tmp;
                tmp = sk_msg_elem_cpy(msg, i);
        }
}

/* Remove @len bytes from @msg starting at byte offset @start.
 *
 * Returns 0 on success (also for @len == 0, which is a no-op), -EINVAL for
 * non-zero @flags or when [@start, @start + @len) is not fully inside the
 * message, and -ENOMEM when a page needed to compact a split element cannot
 * be allocated.
 */
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        u32 i = 0, l = 0, space, offset = 0;
        /* Widen before adding: both operands are u32, so without the cast
         * the sum would wrap in 32 bits and could slip past the bounds
         * check against msg->sg.size below.
         */
        u64 last = (u64)start + len;
        int pop;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        /* On exit, offset is the message offset at which the last visited
         * element begins and l is that element's length.
         */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* Bounds checks: start and pop must be inside message */
        if (start >= offset + l || last > msg->sg.size)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        pop = len;
        /* --------------| offset
         * -| start      |-------- len -------|
         *
         *  |----- a ----|-------- pop -------|----- b ----|
         *  |______________________________________________| length
         *
         *
         * a:   region at front of scatter element to save
         * b:   region at back of scatter element to save when length > A + pop
         * pop: region to pop from element, same as input 'pop' here will be
         *      decremented below per iteration.
         *
         * Two top-level cases to handle when start != offset, first B is non
         * zero and second B is zero corresponding to when a pop includes more
         * than one element.
         *
         * Then if B is non-zero AND there is no space allocate space and
         * compact A, B regions into page. If there is space shift ring to
         * the right free'ing the next element in ring to place B, leaving
         * A untouched except to reduce length.
         */
        if (start != offset) {
                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
                int a = start - offset;
                int b = sge->length - pop - a;

                sk_msg_iter_var_next(i);

                if (b > 0) {
                        if (space) {
                                /* Keep A in place and describe B with a new
                                 * element on the same page, which therefore
                                 * takes an extra page reference.
                                 */
                                sge->length = a;
                                sk_msg_shift_right(msg, i);
                                nsge = sk_msg_elem(msg, i);
                                get_page(sg_page(sge));
                                sg_set_page(nsge,
                                            sg_page(sge),
                                            b, sge->offset + pop + a);
                        } else {
                                struct page *page, *orig;
                                u8 *to, *from;

                                page = alloc_pages(__GFP_NOWARN |
                                                   __GFP_COMP   | GFP_ATOMIC,
                                                   get_order(a + b));
                                if (unlikely(!page))
                                        return -ENOMEM;

                                /* No free slot: copy A and B back to back
                                 * into a new page and release the old one.
                                 */
                                orig = sg_page(sge);
                                from = sg_virt(sge);
                                to = page_address(page);
                                memcpy(to, from, a);
                                memcpy(to + a, from + a + pop, b);
                                sg_set_page(sge, page, a + b, 0);
                                put_page(orig);
                        }
                        pop = 0;
                } else {
                        /* The pop consumes the rest of this element. */
                        pop -= (sge->length - a);
                        sge->length = a;
                }
        }

        /* From above the current layout _must_ be as follows,
         *
         * -| offset
         * -| start
         *
         *  |---- pop ---|---------------- b ------------|
         *  |____________________________________________| length
         *
         * Offset and start of the current msg elem are equal because in the
         * previous case we handled offset != start and either consumed the
         * entire element and advanced to the next element OR pop == 0.
         *
         * Two cases to handle here are first pop is less than the length
         * leaving some remainder b above. Simply adjust the element's layout
         * in this case. Or pop >= length of the element so that b = 0. In this
         * case advance to next element decrementing pop.
         */
        while (pop) {
                struct scatterlist *sge = sk_msg_elem(msg, i);

                if (pop < sge->length) {
                        sge->length -= pop;
                        sge->offset += pop;
                        pop = 0;
                } else {
                        pop -= sge->length;
                        sk_msg_shift_left(msg, i);
                }
        }

        sk_mem_uncharge(msg->sk, len - pop);
        msg->sg.size -= (len - pop);
        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_pop_data(): available to non-GPL programs,
 * returns an integer, and takes the program context followed by three
 * ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_msg_pop_data_proto = {
        .func                = bpf_msg_pop_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

#ifdef CONFIG_CGROUP_NET_CLASSID
/* Argument-less helper: return the cgroup classid that
 * __task_get_classid() reports for the task that is 'current' when the
 * helper is invoked.
 */
BPF_CALL_0(bpf_get_cgroup_classid_curr)
{
        return __task_get_classid(current);
}

/* Helper descriptor for bpf_get_cgroup_classid_curr(): non-GPL, integer
 * return, no arguments declared.
 */
static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
        .func                = bpf_get_cgroup_classid_curr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
};

/* Return the cgroup classid stored in the full socket associated with
 * @skb. Yields 0 when the skb has no such socket or the socket found is
 * not a full socket.
 */
BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
{
        struct sock *owner = skb_to_full_sk(skb);

        if (!owner)
                return 0;
        if (!sk_fullsock(owner))
                return 0;

        return sock_cgroup_classid(&owner->sk_cgrp_data);
}

/* Helper descriptor for bpf_skb_cgroup_classid(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
        .func                = bpf_skb_cgroup_classid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};
#endif

/* Return the classid that task_get_classid() derives for @skb. Unlike the
 * helpers above, this one is not guarded by CONFIG_CGROUP_NET_CLASSID.
 */
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
        return task_get_classid(skb);
}

/* Helper descriptor for bpf_get_cgroup_classid(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
        .func           = bpf_get_cgroup_classid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return the value dst_tclassid() yields for @skb (the route realm, going
 * by the helper's name).
 */
BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
        return dst_tclassid(skb);
}

/* Helper descriptor for bpf_get_route_realm(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_get_route_realm_proto = {
        .func           = bpf_get_route_realm,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return skb_get_hash() for @skb. */
BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
        /* If skb_clear_hash() was called due to mangling, we can
         * trigger SW recalculation here. Later access to hash
         * can then use the inline skb->hash via context directly
         * instead of calling this helper again.
         */
        return skb_get_hash(skb);
}

/* Helper descriptor for bpf_get_hash_recalc(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
        .func                = bpf_get_hash_recalc,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Clear the hash stored on @skb via skb_clear_hash(). Always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
        /* After all direct packet write, this can be used once for
         * triggering a lazy recalc on next skb_get_hash() invocation.
         */
        skb_clear_hash(skb);
        return 0;
}

/* Helper descriptor for bpf_set_hash_invalid(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
        .func                = bpf_set_hash_invalid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Store the caller-supplied @hash on @skb through __skb_set_sw_hash(),
 * passing 'true' as its last argument. Always returns 0.
 */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
        /* Set user specified hash as L4(+), so that it gets returned
         * on skb_get_hash() call unless BPF prog later on triggers a
         * skb_clear_hash().
         */
        __skb_set_sw_hash(skb, hash, true);
        return 0;
}

/* Helper descriptor for bpf_set_hash(): non-GPL, integer return; takes the
 * program context and one ARG_ANYTHING argument (the hash value).
 */
static const struct bpf_func_proto bpf_set_hash_proto = {
        .func                = bpf_set_hash,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Push a VLAN tag (@vlan_proto, @vlan_tci) onto @skb.
 *
 * Any @vlan_proto other than 802.1Q or 802.1AD is silently replaced by
 * 802.1Q. The skb_vlan_push() call is bracketed by bpf_push_mac_rcsum() /
 * bpf_pull_mac_rcsum(), and the data pointers are recomputed afterwards
 * whether or not the push succeeded.
 *
 * Returns the result of skb_vlan_push().
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
           u16, vlan_tci)
{
        const bool is_8021q = vlan_proto == htons(ETH_P_8021Q);
        const bool is_8021ad = vlan_proto == htons(ETH_P_8021AD);
        int err;

        /* Fall back to 802.1Q for anything that is not a known tag type. */
        if (unlikely(!is_8021q && !is_8021ad))
                vlan_proto = htons(ETH_P_8021Q);

        bpf_push_mac_rcsum(skb);
        err = skb_vlan_push(skb, vlan_proto, vlan_tci);
        bpf_pull_mac_rcsum(skb);

        bpf_compute_data_pointers(skb);
        return err;
}

/* Helper descriptor for bpf_skb_vlan_push(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (VLAN protocol
 * and TCI).
 */
static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
        .func           = bpf_skb_vlan_push,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Pop a VLAN tag from @skb via skb_vlan_pop(), bracketed by
 * bpf_push_mac_rcsum() / bpf_pull_mac_rcsum(). The data pointers are
 * recomputed regardless of the outcome. Returns skb_vlan_pop()'s result.
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
        int ret;

        bpf_push_mac_rcsum(skb);
        ret = skb_vlan_pop(skb);
        bpf_pull_mac_rcsum(skb);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_vlan_pop(): non-GPL, integer return,
 * single argument is the program context.
 */
static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
        .func           = bpf_skb_vlan_pop,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Open a zero-filled gap of @len bytes at offset @off from the start of
 * the packet data.
 *
 * The data start is moved back by @len, the first @off bytes are slid to
 * the new start, and the hole left behind them is zeroed. The caller must
 * already have reserved @len bytes of headroom with skb_cow().
 *
 * Always returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
        void *base;

        /* Headroom was guaranteed by the caller, so no skb_cow() here. */
        skb_push(skb, len);
        base = skb->data;

        /* Slide the leading @off bytes down, then blank the opened hole. */
        memmove(base, base + len, off);
        memset(base + off, 0, len);

        /* skb_postpush_rcsum(skb, skb->data + off, len) is deliberately
         * omitted: summing over a zeroed block leaves skb->csum unchanged
         * for checksum complete.
         */
        return 0;
}

/* Remove @len bytes located at offset @off from the start of the packet
 * data: the data start is advanced by @len and the first @off bytes are
 * moved up to the new start, overwriting the removed region.
 *
 * Returns 0 on success, or -ENOMEM when pskb_may_pull() cannot make
 * @off + @len bytes available.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
        void *old_data;

        /* skb_ensure_writable() is not needed here, as we're
         * already working on an uncloned skb.
         */
        if (unlikely(!pskb_may_pull(skb, off + len)))
                return -ENOMEM;

        /* NOTE(review): old_data is sampled only after pskb_may_pull(),
         * presumably because that call may change skb->data - confirm.
         */
        old_data = skb->data;
        __skb_pull(skb, len);
        /* Account for the removed bytes in the receive checksum. */
        skb_postpull_rcsum(skb, old_data + off, len);
        memmove(skb->data, old_data, off);

        return 0;
}

/* Insert @len zeroed bytes at offset @off using bpf_skb_generic_push()
 * and shift the mac and network header offsets down by @len to match.
 *
 * If the transport header offset was equal to the network header offset
 * on entry, it is kept equal afterwards; otherwise it is left untouched.
 *
 * Returns 0 on success or the error from bpf_skb_generic_push().
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_tracks_l3 =
                skb->transport_header == skb->network_header;
        int err;

        /* No __skb_push()/__skb_pull() pair is needed to reach the start
         * of the mac header: under eBPF we always start from there.
         */
        err = bpf_skb_generic_push(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header -= len;
        skb->network_header -= len;
        if (l4_tracks_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Remove @len bytes at offset @off using bpf_skb_generic_pop() and shift
 * the mac and network header offsets up by @len to match.
 *
 * If the transport header offset was equal to the network header offset
 * on entry, it is kept equal afterwards; otherwise it is left untouched.
 *
 * Returns 0 on success or the error from bpf_skb_generic_pop().
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_tracks_l3 =
                skb->transport_header == skb->network_header;
        int err;

        /* As for the push variant, no __skb_push()/__skb_pull() pair is
         * required here.
         */
        err = bpf_skb_generic_pop(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header += len;
        skb->network_header += len;
        if (l4_tracks_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Prepare @skb for IPv4 -> IPv6 translation.
 *
 * Opens a zeroed gap of sizeof(struct ipv6hdr) - sizeof(struct iphdr)
 * bytes right after the MAC header, switches skb->protocol to ETH_P_IPV6
 * and clears the skb hash. The header contents themselves are not written
 * here.
 *
 * Returns 0 on success, -ENOTSUPP for GSO packets that are not TCP GSO,
 * or the error from skb_cow() / bpf_skb_net_hdr_push().
 */
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 off = skb_mac_header_len(skb);
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
                return -ENOTSUPP;

        /* Reserve the headroom that bpf_skb_net_hdr_push() relies on. */
        ret = skb_cow(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* SKB_GSO_TCPV4 needs to be changed into
                 * SKB_GSO_TCPV6.
                 */
                if (shinfo->gso_type & SKB_GSO_TCPV4) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
                        shinfo->gso_type |=  SKB_GSO_TCPV6;
                }

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= SKB_GSO_DODGY;
                shinfo->gso_segs = 0;
        }

        skb->protocol = htons(ETH_P_IPV6);
        skb_clear_hash(skb);

        return 0;
}

/* Prepare @skb for IPv6 -> IPv4 translation.
 *
 * Removes sizeof(struct ipv6hdr) - sizeof(struct iphdr) bytes right after
 * the MAC header, switches skb->protocol to ETH_P_IP and clears the skb
 * hash. The header contents themselves are not written here.
 *
 * Returns 0 on success, -ENOTSUPP for GSO packets that are not TCP GSO,
 * or the error from skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 off = skb_mac_header_len(skb);
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
                return -ENOTSUPP;

        /* bpf_skb_generic_pop() expects to work on an uncloned skb. */
        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;

        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* SKB_GSO_TCPV6 needs to be changed into
                 * SKB_GSO_TCPV4.
                 */
                if (shinfo->gso_type & SKB_GSO_TCPV6) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
                        shinfo->gso_type |=  SKB_GSO_TCPV4;
                }

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= SKB_GSO_DODGY;
                shinfo->gso_segs = 0;
        }

        skb->protocol = htons(ETH_P_IP);
        skb_clear_hash(skb);

        return 0;
}

/* Dispatch a protocol change on @skb to the matching translation routine.
 *
 * Only IPv4 -> IPv6 and IPv6 -> IPv4 are handled; every other combination
 * of current skb->protocol and @to_proto yields -ENOTSUPP. Otherwise the
 * result of the chosen translation routine is returned.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
        const __be16 ipv4 = htons(ETH_P_IP);
        const __be16 ipv6 = htons(ETH_P_IPV6);
        const __be16 cur_proto = skb->protocol;

        if (cur_proto == ipv4 && to_proto == ipv6)
                return bpf_skb_proto_4_to_6(skb);

        if (cur_proto == ipv6 && to_proto == ipv4)
                return bpf_skb_proto_6_to_4(skb);

        return -ENOTSUPP;
}

/* Change the L3 protocol of @skb to @proto via bpf_skb_proto_xlat().
 *
 * @flags must be zero, otherwise -EINVAL is returned before the skb is
 * touched. The data pointers are recomputed after the translation attempt
 * whether it succeeded or not. Returns bpf_skb_proto_xlat()'s result.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
           u64, flags)
{
        int ret;

        if (unlikely(flags))
                return -EINVAL;

        /* General idea is that this helper does the basic groundwork
         * needed for changing the protocol, and eBPF program fills the
         * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
         * and other helpers, rather than passing a raw buffer here.
         *
         * The rationale is to keep this minimal and without a need to
         * deal with raw packet data. F.e. even if we would pass buffers
         * here, the program still needs to call the bpf_lX_csum_replace()
         * helpers anyway. Plus, this way we keep also separation of
         * concerns, since f.e. bpf_skb_store_bytes() should only take
         * care of stores.
         *
         * Currently, additional options and extension header space are
         * not supported, but flags register is reserved so we can adapt
         * that. For offloads, we mark packet as dodgy, so that headers
         * need to be verified first.
         */
        ret = bpf_skb_proto_xlat(skb, proto);
        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_change_proto(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (protocol and
 * flags).
 */
static const struct bpf_func_proto bpf_skb_change_proto_proto = {
        .func                = bpf_skb_change_proto,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Overwrite skb->pkt_type with @pkt_type.
 *
 * Only a restricted subset of packet types may be changed for now: both
 * the skb's current type and the requested one must pass
 * skb_pkt_type_ok(), otherwise -EINVAL is returned and the skb is left
 * unchanged. Returns 0 on success.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
        /* The type we are moving away from must be in the allowed set... */
        if (unlikely(!skb_pkt_type_ok(skb->pkt_type)))
                return -EINVAL;
        /* ...and so must the type we are moving to. */
        if (unlikely(!skb_pkt_type_ok(pkt_type)))
                return -EINVAL;

        skb->pkt_type = pkt_type;
        return 0;
}

/* Helper descriptor for bpf_skb_change_type(): non-GPL, integer return;
 * takes the program context and one ARG_ANYTHING argument (packet type).
 */
static const struct bpf_func_proto bpf_skb_change_type_proto = {
        .func                = bpf_skb_change_type,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return sizeof(struct iphdr);
        case htons(ETH_P_IPV6):
                return sizeof(struct ipv6hdr);
        default:
                return ~0U;
        }
}

/* Union of the two L3 encapsulation flags. bpf_skb_net_grow() treats any
 * of these bits being set as a request for encapsulation.
 */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK        (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

/* Flag bits accepted by bpf_skb_adjust_room(), which additionally allows
 * BPF_F_ADJ_ROOM_NO_CSUM_RESET on top of this mask: fixed GSO, the L3 and
 * L4 encapsulation selectors, and the bit-field produced by
 * BPF_F_ADJ_ROOM_ENCAP_L2() applied to BPF_ADJ_ROOM_ENCAP_L2_MASK.
 */
#define BPF_F_ADJ_ROOM_MASK                (BPF_F_ADJ_ROOM_FIXED_GSO | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))

/* Grow @skb by inserting @len_diff zeroed bytes at offset @off, optionally
 * marking the packet as encapsulated.
 *
 * @flags: BPF_F_ADJ_ROOM_* bits. Any BPF_F_ADJ_ROOM_ENCAP_L3_* bit turns
 *         on encapsulation handling; the bits above
 *         BPF_ADJ_ROOM_ENCAP_L2_SHIFT carry the inner MAC header length.
 *
 * Returns 0 on success, or:
 *   -ENOTSUPP  non-TCP GSO packet that is not UDP_L4 with FIXED_GSO set,
 *              or encapsulation requested on a non-IPv4/IPv6 skb;
 *   -EINVAL    both L3 or both L4 encap flags set, or the inner MAC
 *              length exceeds @len_diff;
 *   -EALREADY  skb is already marked as encapsulated;
 *   or the error from skb_cow_head(), bpf_skb_net_hdr_push() or
 *   skb_linearize().
 *
 * Note that skb_cow_head() runs before the encap flag validation, so the
 * -EINVAL/-EALREADY/-ENOTSUPP returns in the first encap block happen
 * after that call.
 */
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                            u64 flags)
{
        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
        unsigned int gso_type = SKB_GSO_DODGY;
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                /* udp gso_size delineates datagrams, only allow if fixed */
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        return -ENOTSUPP;
        }

        ret = skb_cow_head(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                if (skb->protocol != htons(ETH_P_IP) &&
                    skb->protocol != htons(ETH_P_IPV6))
                        return -ENOTSUPP;

                /* The two L3 selectors are mutually exclusive. */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        return -EINVAL;

                /* So are the two L4 selectors. */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;

                if (skb->encapsulation)
                        return -EALREADY;

                /* Snapshot the current header offsets; after the push they
                 * describe the inner packet.
                 */
                mac_len = skb->network_header - skb->mac_header;
                inner_net = skb->network_header;
                if (inner_mac_len > len_diff)
                        return -EINVAL;
                inner_trans = skb->transport_header;
        }

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;
                skb_set_inner_protocol(skb, skb->protocol);

                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);

                /* Pick the tunnel GSO type; an L4 selector takes precedence
                 * over the plain IP-in-IP variants.
                 */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        gso_type |= SKB_GSO_UDP_TUNNEL;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
                        gso_type |= SKB_GSO_GRE;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        gso_type |= SKB_GSO_IPXIP6;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        gso_type |= SKB_GSO_IPXIP4;

                /* With an L4 encap header, place the outer transport header
                 * right behind the outer network header.
                 */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
                                        sizeof(struct ipv6hdr) :
                                        sizeof(struct iphdr);

                        skb_set_transport_header(skb, mac_len + nh_len);
                }

                /* Match skb->protocol to new outer l3 protocol */
                if (skb->protocol == htons(ETH_P_IP) &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        skb->protocol = htons(ETH_P_IPV6);
                else if (skb->protocol == htons(ETH_P_IPV6) &&
                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        skb->protocol = htons(ETH_P_IP);
        }

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= gso_type;
                shinfo->gso_segs = 0;

                /* Due to header growth, MSS needs to be downgraded.
                 * There is a BUG_ON() when segmenting the frag_list with
                 * head_frag true, so linearize the skb after downgrading
                 * the MSS.
                 */
                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
                        skb_decrease_gso_size(shinfo, len_diff);
                        if (shinfo->frag_list)
                                return skb_linearize(skb);
                }
        }

        return 0;
}

/* Shrink @skb by removing @len_diff bytes at offset @off.
 *
 * @flags: only BPF_F_ADJ_ROOM_FIXED_GSO and BPF_F_ADJ_ROOM_NO_CSUM_RESET
 *         are accepted here; any other bit yields -EINVAL.
 *
 * Returns 0 on success, -EINVAL for unsupported flags, -ENOTSUPP for a
 * non-TCP GSO packet that is not UDP_L4 with FIXED_GSO set, or the error
 * from skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
                              u64 flags)
{
        int ret;

        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
                return -EINVAL;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                /* udp gso_size delineates datagrams, only allow if fixed */
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        return -ENOTSUPP;
        }

        /* bpf_skb_generic_pop() expects to work on an uncloned skb. */
        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;

        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* Due to header shrink, MSS can be upgraded. */
                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        skb_increase_gso_size(shinfo, len_diff);

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= SKB_GSO_DODGY;
                shinfo->gso_segs = 0;
        }

        return 0;
}

/* Upper bound on the resulting skb length enforced by the room/tail/head
 * adjusting helpers below (the grow paths waive it for GSO skbs).
 */
#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC

/* Grow or shrink @skb at the start of its data by |@len_diff| bytes.
 *
 * A non-negative @len_diff pushes that many zero-filled bytes in front of
 * the data (after skb_cow()); a negative @len_diff pulls that many bytes
 * off the front. @mode and @flags must both be zero.
 *
 * When the owning socket has a TLS software RX context, the strparser
 * message's full_len is adjusted by @len_diff as well.
 *
 * Returns -EINVAL for non-zero @mode/@flags, -EFAULT when |@len_diff|
 * exceeds 0xfff, -ENOMEM when the bytes to pull are not available, a
 * negative skb_cow() error, or otherwise the value left in ret (0 on the
 * shrink path, skb_cow()'s result on the grow path).
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const u32 room = abs(len_diff);
        int ret = 0;

        if (unlikely(flags || mode))
                return -EINVAL;
        if (unlikely(room > 0xfffU))
                return -EFAULT;

        if (len_diff < 0) {
                /* Shrink: drop @room bytes from the front. */
                if (unlikely(!pskb_may_pull(skb, room)))
                        return -ENOMEM;
                __skb_pull(skb, room);
        } else {
                /* Grow: make headroom, then expose it zero-filled. */
                ret = skb_cow(skb, len_diff);
                if (unlikely(ret < 0))
                        return ret;
                __skb_push(skb, room);
                memset(skb->data, 0, room);
        }

        bpf_compute_data_end_sk_skb(skb);

        if (tls_sw_has_ctx_rx(skb->sk)) {
                struct strp_msg *msg_info = strp_msg(skb);

                msg_info->full_len += len_diff;
        }

        return ret;
}

/* Helper descriptor for sk_skb_adjust_room(): non-GPL, integer return;
 * takes the program context and three ARG_ANYTHING arguments (length
 * delta, mode, flags).
 */
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
        .func                = sk_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Grow (@len_diff > 0) or shrink (@len_diff < 0) @skb by |@len_diff| bytes
 * at a position selected by @mode.
 *
 * @mode:  BPF_ADJ_ROOM_MAC adjusts right after the MAC header;
 *         BPF_ADJ_ROOM_NET adjusts after the MAC header plus the base
 *         network header length for skb->protocol.
 * @flags: BPF_F_ADJ_ROOM_MASK bits plus BPF_F_ADJ_ROOM_NO_CSUM_RESET.
 *
 * Returns 0 on success, or:
 *   -EINVAL    unknown flag bits;
 *   -EFAULT    |@len_diff| larger than 0xfff;
 *   -ENOTSUPP  skb is neither IPv4 nor IPv6, unknown @mode, a shrink that
 *              would consume the whole packet or leave less than the base
 *              network header, or a grow of a non-GSO skb beyond
 *              BPF_SKB_MAX_LEN;
 *   or the error from bpf_skb_net_shrink() / bpf_skb_net_grow().
 *
 * The data pointers are recomputed once the shrink/grow routine has been
 * called, whether it succeeded or not.
 */
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        u32 len_cur, len_diff_abs = abs(len_diff);
        u32 len_min = bpf_skb_net_base_len(skb);
        u32 len_max = BPF_SKB_MAX_LEN;
        __be16 proto = skb->protocol;
        bool shrink = len_diff < 0;
        u32 off;
        int ret;

        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
                return -EINVAL;
        if (unlikely(len_diff_abs > 0xfffU))
                return -EFAULT;
        if (unlikely(proto != htons(ETH_P_IP) &&
                     proto != htons(ETH_P_IPV6)))
                return -ENOTSUPP;

        /* Work out the offset at which bytes are inserted or removed. */
        off = skb_mac_header_len(skb);
        switch (mode) {
        case BPF_ADJ_ROOM_NET:
                off += bpf_skb_net_base_len(skb);
                break;
        case BPF_ADJ_ROOM_MAC:
                break;
        default:
                return -ENOTSUPP;
        }

        /* len_cur is the packet length from the network header onwards. */
        len_cur = skb->len - skb_network_offset(skb);
        if ((shrink && (len_diff_abs >= len_cur ||
                        len_cur - len_diff_abs < len_min)) ||
            (!shrink && (skb->len + len_diff_abs > len_max &&
                         !skb_is_gso(skb))))
                return -ENOTSUPP;

        ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
                       bpf_skb_net_grow(skb, off, len_diff_abs, flags);
        /* Reset the checksum-unnecessary state on success unless the
         * caller opted out.
         */
        if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
                __skb_reset_checksum_unnecessary(skb);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_adjust_room(): non-GPL, integer return;
 * takes the program context and three ARG_ANYTHING arguments (length
 * delta, mode, flags).
 */
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
        .func                = bpf_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Compute the minimum length __bpf_skb_change_tail() will accept for @skb.
 *
 * Three candidates are examined in order and each positive one replaces
 * the previous result (it is "last positive wins", not a maximum):
 *   1. the network header offset;
 *   2. the transport header offset, if a transport header was set;
 *   3. for CHECKSUM_PARTIAL skbs, the end of the checksum field
 *      (checksum start + csum_offset + sizeof(__sum16)).
 * Non-positive offsets are ignored, so the result is 0 when none apply.
 */
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
        int offset = skb_network_offset(skb);
        u32 min_len = 0;

        if (offset > 0)
                min_len = offset;
        if (skb_transport_header_was_set(skb)) {
                offset = skb_transport_offset(skb);
                if (offset > 0)
                        min_len = offset;
        }
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                offset = skb_checksum_start_offset(skb) +
                         skb->csum_offset + sizeof(__sum16);
                if (offset > 0)
                        min_len = offset;
        }
        return min_len;
}

/* Extend @skb to @new_len via __skb_grow_rcsum() and zero-fill the bytes
 * between the previous length and @new_len.
 *
 * Returns 0 on success or the error from __skb_grow_rcsum(), in which
 * case nothing is zeroed.
 */
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        /* Remember the length before growing; it marks where the newly
         * added region begins.
         */
        const unsigned int prev_len = skb->len;
        int err;

        err = __skb_grow_rcsum(skb, new_len);
        if (err)
                return err;

        memset(skb->data + prev_len, 0, new_len - prev_len);
        return 0;
}

/* Trim @skb to @new_len; thin wrapper that returns the result of
 * __skb_trim_rcsum().
 */
static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        return __skb_trim_rcsum(skb, new_len);
}

/* Resize @skb to exactly @new_len by growing or trimming its tail.
 *
 * @flags must be zero and @new_len must lie within
 * [__bpf_skb_min_len(skb), BPF_SKB_MAX_LEN]; otherwise -EINVAL.
 * Encapsulated skbs are rejected with -ENOTSUPP.
 *
 * Returns 0 on success or the error from __bpf_try_make_writable(),
 * bpf_skb_grow_rcsum() or bpf_skb_trim_rcsum().
 */
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
{
        const u32 upper = BPF_SKB_MAX_LEN;
        const u32 lower = __bpf_skb_min_len(skb);
        int err;

        if (unlikely(flags))
                return -EINVAL;
        if (unlikely(new_len > upper))
                return -EINVAL;
        if (unlikely(new_len < lower))
                return -EINVAL;
        if (skb->encapsulation)
                return -ENOTSUPP;

        /* This helper only does the work needed to grow or trim the skb;
         * the eBPF program is expected to rewrite the contents afterwards
         * with helpers such as bpf_skb_store_bytes() and
         * bpf_lX_csum_replace() instead of handing in a raw buffer.
         *
         * As with bpf_skb_change_proto(), it is kept minimal and free of
         * protocol specifics so that writing buffers stays the sole
         * responsibility of bpf_skb_store_bytes().
         *
         * It is a slow-path operation intended for control message
         * replies, so the skb is implicitly linearized and uncloned, and
         * its offloads are dropped.
         */
        err = __bpf_try_make_writable(skb, skb->len);
        if (err)
                return err;

        if (new_len > skb->len)
                err = bpf_skb_grow_rcsum(skb, new_len);
        else if (new_len < skb->len)
                err = bpf_skb_trim_rcsum(skb, new_len);

        if (!err && skb_is_gso(skb))
                skb_gso_reset(skb);

        return err;
}

/* Resize @skb to @new_len through __bpf_skb_change_tail(), then refresh
 * the data pointers with bpf_compute_data_pointers() whether or not the
 * resize succeeded. Returns __bpf_skb_change_tail()'s result.
 */
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        int ret = __bpf_skb_change_tail(skb, new_len, flags);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_change_tail(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (new length
 * and flags).
 */
static const struct bpf_func_proto bpf_skb_change_tail_proto = {
        .func                = bpf_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Same resize as bpf_skb_change_tail(), but the data end is refreshed
 * with bpf_compute_data_end_sk_skb() instead. Returns
 * __bpf_skb_change_tail()'s result.
 */
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        int ret = __bpf_skb_change_tail(skb, new_len, flags);

        bpf_compute_data_end_sk_skb(skb);
        return ret;
}

/* Helper descriptor for sk_skb_change_tail(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (new length
 * and flags).
 */
static const struct bpf_func_proto sk_skb_change_tail_proto = {
        .func                = sk_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Prepend @head_room zero-filled bytes to @skb and make the new start the
 * mac header.
 *
 * @flags must be zero. The grown length must not wrap around u32 and, for
 * non-GSO skbs, must not exceed BPF_SKB_MAX_LEN; otherwise -EINVAL.
 *
 * Returns 0 on success or the error from skb_cow().
 */
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
{
        const u32 limit = BPF_SKB_MAX_LEN;
        const u32 grown_len = skb->len + head_room;
        int err;

        if (unlikely(flags))
                return -EINVAL;
        /* The length cap is only applied to non-GSO skbs. */
        if (unlikely(!skb_is_gso(skb) && grown_len > limit))
                return -EINVAL;
        /* A sum smaller than the current length means u32 wrap-around. */
        if (unlikely(grown_len < skb->len))
                return -EINVAL;

        err = skb_cow(skb, head_room);
        if (unlikely(err))
                return err;

        /* Only expansion in front of the mac header is allowed for now,
         * so skb->protocol, the network header etc. stay as they are.
         * Unlike bpf_skb_change_tail(), there is no need to linearize or
         * to reset GSO. The intended user is an L3 skb that has to push a
         * mac header before being redirected into an L2 device.
         */
        __skb_push(skb, head_room);
        memset(skb->data, 0, head_room);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}

/* Prepend @head_room bytes through __bpf_skb_change_head(), then refresh
 * the data pointers with bpf_compute_data_pointers() whether or not the
 * operation succeeded. Returns __bpf_skb_change_head()'s result.
 */
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        int ret = __bpf_skb_change_head(skb, head_room, flags);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_change_head(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (head room and
 * flags).
 */
static const struct bpf_func_proto bpf_skb_change_head_proto = {
        .func                = bpf_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Same operation as bpf_skb_change_head(), but the data end is refreshed
 * with bpf_compute_data_end_sk_skb() instead. Returns
 * __bpf_skb_change_head()'s result.
 */
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        int ret = __bpf_skb_change_head(skb, head_room, flags);

        bpf_compute_data_end_sk_skb(skb);
        return ret;
}

/* Helper descriptor for sk_skb_change_head(): non-GPL, integer return;
 * takes the program context and two ARG_ANYTHING arguments (head room and
 * flags).
 */
static const struct bpf_func_proto sk_skb_change_head_proto = {
        .func                = sk_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};
/* Return the size in bytes of the metadata area in front of the packet
 * data (data - data_meta), or 0 when xdp_data_meta_unsupported() reports
 * that @xdp carries no metadata support.
 */
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
        if (xdp_data_meta_unsupported(xdp))
                return 0;

        return xdp->data - xdp->data_meta;
}

/* Move the start of the XDP packet data by @offset bytes (negative grows
 * the packet towards the headroom, positive shrinks it).
 *
 * The new data start must stay above the space reserved at the hard start
 * for struct xdp_frame plus the current metadata, and must leave at least
 * ETH_HLEN bytes before data_end; otherwise -EINVAL is returned and
 * nothing is modified. Any metadata is moved along so that it remains
 * directly in front of the data.
 *
 * Returns 0 on success.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
        unsigned long meta_bytes = xdp_get_metalen(xdp);
        /* Lowest admissible data start: reserved frame area + metadata. */
        void *lowest = xdp->data_hard_start + sizeof(struct xdp_frame) +
                       meta_bytes;
        void *new_data = xdp->data + offset;

        if (unlikely(new_data < lowest))
                return -EINVAL;
        if (unlikely(new_data > xdp->data_end - ETH_HLEN))
                return -EINVAL;

        /* Source and destination may overlap, hence memmove(). */
        if (meta_bytes)
                memmove(xdp->data_meta + offset,
                        xdp->data_meta, meta_bytes);
        xdp->data_meta += offset;
        xdp->data = new_data;

        return 0;
}

/* Helper descriptor for bpf_xdp_adjust_head(): non-GPL, integer return;
 * takes the program context and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
        .func                = bpf_xdp_adjust_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Move the end of the XDP packet data by @offset bytes (positive grows
 * the packet, negative shrinks it).
 *
 * Returns 0 on success, or -EINVAL when the new end would pass
 * xdp_data_hard_end(), when xdp->frame_sz exceeds PAGE_SIZE (a one-time
 * warning is also emitted), or when fewer than ETH_HLEN bytes of data
 * would remain. On growth the newly exposed bytes are zeroed.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
        void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
        void *data_end = xdp->data_end + offset;

        /* Notice that xdp_data_hard_end have reserved some tailroom */
        if (unlikely(data_end > data_hard_end))
                return -EINVAL;

        /* ALL drivers MUST init xdp->frame_sz, chicken check below */
        if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
                WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
                return -EINVAL;
        }

        if (unlikely(data_end < xdp->data + ETH_HLEN))
                return -EINVAL;

        /* Clear memory area on grow, can contain uninit kernel memory */
        if (offset > 0)
                memset(xdp->data_end, 0, offset);

        xdp->data_end = data_end;

        return 0;
}

/* Helper descriptor for bpf_xdp_adjust_tail(): non-GPL, integer return;
 * takes the program context and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
        .func                = bpf_xdp_adjust_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Move the start of the XDP metadata area by @offset bytes.
 *
 * Returns 0 on success, or:
 *   -ENOTSUPP  xdp_data_meta_unsupported() reports no metadata support;
 *   -EINVAL    the new start would fall into the space reserved for
 *              struct xdp_frame or lie beyond xdp->data;
 *   -EACCES    the resulting metadata length is not a multiple of
 *              sizeof(__u32) or is larger than 32 bytes.
 * On any error xdp->data_meta is left unchanged.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
        void *lower_bound = xdp->data_hard_start + sizeof(struct xdp_frame);
        void *new_meta = xdp->data_meta + offset;
        unsigned long new_metalen = xdp->data - new_meta;

        if (xdp_data_meta_unsupported(xdp))
                return -ENOTSUPP;

        if (unlikely(new_meta < lower_bound))
                return -EINVAL;
        if (unlikely(new_meta > xdp->data))
                return -EINVAL;

        /* Length must be 4-byte granular... */
        if (unlikely(new_metalen & (sizeof(__u32) - 1)))
                return -EACCES;
        /* ...and capped at 32 bytes. */
        if (unlikely(new_metalen > 32))
                return -EACCES;

        xdp->data_meta = new_meta;

        return 0;
}

/* Helper descriptor for bpf_xdp_adjust_meta(): non-GPL, integer return;
 * takes the program context and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .func                = bpf_xdp_adjust_meta,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Hand @xdp to the redirect target @fwd, choosing the enqueue routine
 * from the type of @map: devmap and devmap-hash use dev_map_enqueue(),
 * cpumap uses cpu_map_enqueue(), xskmap uses __xsk_map_redirect().
 *
 * Returns the result of the selected routine, or -EBADRQC for any other
 * map type.
 */
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
                            struct bpf_map *map, struct xdp_buff *xdp)
{
        int err;

        switch (map->map_type) {
        case BPF_MAP_TYPE_DEVMAP:
        case BPF_MAP_TYPE_DEVMAP_HASH:
                err = dev_map_enqueue(fwd, xdp, dev_rx);
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdp, dev_rx);
                break;
        case BPF_MAP_TYPE_XSKMAP:
                err = __xsk_map_redirect(fwd, xdp);
                break;
        default:
                err = -EBADRQC;
                break;
        }

        return err;
}

/* Flush pending XDP redirect work by invoking the devmap, cpumap and
 * xskmap flush routines, in that order. Exported for GPL modules.
 */
void xdp_do_flush(void)
{
        __dev_flush();
        __cpu_map_flush();
        __xsk_map_flush();
}
EXPORT_SYMBOL_GPL(xdp_do_flush);

/* Look up entry @index in a redirect-capable @map, using the lookup
 * routine that matches the map type (devmap, devmap-hash, cpumap or
 * xskmap).
 *
 * Returns whatever the type-specific lookup returns, or NULL for any
 * other map type.
 */
static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
        void *elem;

        switch (map->map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                elem = __dev_map_lookup_elem(map, index);
                break;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                elem = __dev_map_hash_lookup_elem(map, index);
                break;
        case BPF_MAP_TYPE_CPUMAP:
                elem = __cpu_map_lookup_elem(map, index);
                break;
        case BPF_MAP_TYPE_XSKMAP:
                elem = __xsk_map_lookup_elem(map, index);
                break;
        default:
                elem = NULL;
                break;
        }

        return elem;
}

/* Drop every per-CPU reference to @map held in bpf_redirect_info, so that
 * no CPU's redirect state keeps pointing at it.
 */
void bpf_clear_redirect_map(struct bpf_map *map)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct bpf_redirect_info *ri;

                ri = per_cpu_ptr(&bpf_redirect_info, cpu);

                /* Read first so that a remote cacheline is only written
                 * when it actually references @map.
                 */
                if (likely(READ_ONCE(ri->map) != map))
                        continue;

                /* The owning CPU may have changed the pointer since the
                 * read above; cmpxchg() only clears it if it still
                 * equals @map.
                 */
                cmpxchg(&ri->map, map, NULL);
        }
}

/* Carry out the redirect recorded in this CPU's bpf_redirect_info.
 *
 * @dev:      device the buffer was received on
 * @xdp:      buffer to redirect
 * @xdp_prog: program that requested the redirect (used for tracing)
 *
 * The recorded target is consumed (index, value and map are reset) before
 * the buffer is enqueued. Without a map the target is the device with the
 * recorded ifindex in @dev's namespace. Returns 0 on success, -EINVAL if
 * that device does not exist, or the error of the enqueue routine.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        struct bpf_map *map = READ_ONCE(ri->map);
        void *fwd = ri->tgt_value;
        u32 index = ri->tgt_index;
        int err;

        /* Consume the recorded target. */
        ri->tgt_index = 0;
        ri->tgt_value = NULL;
        WRITE_ONCE(ri->map, NULL);

        if (likely(map)) {
                err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
        } else {
                fwd = dev_get_by_index_rcu(dev_net(dev), index);
                err = likely(fwd) ? dev_xdp_enqueue(fwd, xdp, dev) : -EINVAL;
        }

        if (unlikely(err)) {
                _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index,
                                            err);
                return err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
        return 0;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

/* Map-based redirect for the generic (skb) XDP path.
 *
 * Consumes the target recorded in this CPU's bpf_redirect_info and passes
 * the packet to the handler for @map's type. Devmap and devmap-hash
 * targets go through dev_map_generic_redirect(); xskmap targets go through
 * xsk_generic_rcv(), after which the skb is consumed. Any other map type
 * yields -EBADRQC. Returns 0 on success or the handler's error.
 */
static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       struct bpf_prog *xdp_prog,
                                       struct bpf_map *map)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        void *fwd = ri->tgt_value;
        u32 index = ri->tgt_index;
        int err;

        ri->tgt_index = 0;
        ri->tgt_value = NULL;
        WRITE_ONCE(ri->map, NULL);

        switch (map->map_type) {
        case BPF_MAP_TYPE_DEVMAP:
        case BPF_MAP_TYPE_DEVMAP_HASH:
                err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                break;
        case BPF_MAP_TYPE_XSKMAP:
                err = xsk_generic_rcv(fwd, xdp);
                if (!err)
                        consume_skb(skb);
                break;
        default:
                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
                err = -EBADRQC;
                break;
        }

        if (unlikely(err)) {
                _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index,
                                            err);
                return err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
        return 0;
}

/* Redirect entry point for the generic (skb) XDP path.
 *
 * When a map is recorded in this CPU's bpf_redirect_info the work is
 * delegated to xdp_do_generic_redirect_map(). Otherwise the recorded
 * ifindex is resolved in @dev's namespace, checked with xdp_ok_fwd_dev()
 * and the skb is transmitted on that device via generic_xdp_tx().
 *
 * Returns 0 on success, -EINVAL if the device does not exist, or the error
 * from xdp_ok_fwd_dev() / the map path.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        struct bpf_map *map = READ_ONCE(ri->map);
        u32 index = ri->tgt_index;
        struct net_device *fwd;
        int err;

        if (map)
                return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
                                                   map);

        ri->tgt_index = 0;
        fwd = dev_get_by_index_rcu(dev_net(dev), index);
        err = likely(fwd) ? xdp_ok_fwd_dev(fwd, skb->len) : -EINVAL;
        if (unlikely(err)) {
                _trace_xdp_redirect_err(dev, xdp_prog, index, err);
                return err;
        }

        skb->dev = fwd;
        _trace_xdp_redirect(dev, xdp_prog, index);
        generic_xdp_tx(skb, xdp_prog);
        return 0;
}

/* Record an ifindex-based redirect target in this CPU's bpf_redirect_info.
 * Any non-zero @flags value is rejected with XDP_ABORTED; otherwise the
 * map and target value are cleared, the ifindex is stored and XDP_REDIRECT
 * is returned.
 */
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri;

        if (unlikely(flags != 0))
                return XDP_ABORTED;

        ri = this_cpu_ptr(&bpf_redirect_info);
        ri->flags = flags;
        ri->tgt_index = ifindex;
        ri->tgt_value = NULL;
        WRITE_ONCE(ri->map, NULL);

        return XDP_REDIRECT;
}

/* Prototype descriptor for bpf_xdp_redirect(): two unconstrained scalar
 * arguments, integer return value, gpl_only not set.
 */
static const struct bpf_func_proto bpf_xdp_redirect_proto = {
        .func           = bpf_xdp_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* Record a map-based redirect target in this CPU's bpf_redirect_info.
 *
 * @flags above XDP_TX are rejected with XDP_ABORTED. The element at
 * @ifindex is looked up in @map; on a miss the recorded map is cleared and
 * @flags itself is returned as the action code. On a hit the flags, index
 * and map are stored and XDP_REDIRECT is returned.
 */
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
           u64, flags)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
        void *target;

        /* Lower bits of the flags are used as return code on lookup failure */
        if (unlikely(flags > XDP_TX))
                return XDP_ABORTED;

        target = __xdp_map_lookup_elem(map, ifindex);
        ri->tgt_value = target;
        if (unlikely(!target)) {
                /* A failed lookup wipes the recorded map so that, when a
                 * program performs several lookups, the most recent one
                 * always decides the outcome.
                 */
                WRITE_ONCE(ri->map, NULL);
                return flags;
        }

        ri->flags = flags;
        ri->tgt_index = ifindex;
        WRITE_ONCE(ri->map, map);

        return XDP_REDIRECT;
}

/* Prototype descriptor for bpf_xdp_redirect_map(): arg1 is a constant map
 * pointer, arg2 and arg3 are unconstrained scalars, the return value is an
 * integer, and gpl_only is not set.
 */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
        .func           = bpf_xdp_redirect_map,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Copy callback handed to bpf_event_output() for skb contexts.
 *
 * Obtains @len bytes at offset @off of @skb through skb_header_pointer(),
 * using @dst_buff as its scratch buffer. Returns @len when the bytes are
 * not available and 0 once @dst_buff holds the data.
 */
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *src = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(src == NULL))
                return len;

        /* skb_header_pointer() may already have filled dst_buff itself. */
        if (src == dst_buff)
                return 0;

        memcpy(dst_buff, src, len);
        return 0;
}

/* Emit an event made of @meta followed by leading skb bytes.
 *
 * The number of skb bytes to append is encoded in the BPF_F_CTXLEN_MASK
 * bits of @flags. Returns -EINVAL when @flags has bits outside
 * BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK, -EFAULT when @skb is NULL or the
 * requested length exceeds skb->len, otherwise the result of
 * bpf_event_output().
 */
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        const u64 allowed = BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK;
        u64 skb_size;

        if (unlikely(flags & ~allowed))
                return -EINVAL;

        skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
        if (unlikely(!skb))
                return -EFAULT;
        if (unlikely(skb_size > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
                                bpf_skb_copy);
}

/* Prototype descriptor for bpf_skb_event_output() when called with the
 * program context as arg1: arg2 is a constant map pointer, arg3 an
 * unconstrained scalar, arg4/arg5 a memory pointer and its size (zero
 * allowed). Integer return; gpl_only is set.
 */
static const struct bpf_func_proto bpf_skb_event_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BTF id list holding the single id of struct sk_buff; referenced by
 * bpf_skb_output_proto below as the type of arg1.
 */
BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)

/* Second prototype descriptor for bpf_skb_event_output(): identical to
 * bpf_skb_event_output_proto except that arg1 is a BTF-typed pointer
 * (struct sk_buff) instead of the program context. Not static, so it is
 * visible outside this file.
 */
const struct bpf_func_proto bpf_skb_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_skb_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Map helper flags to an address family: AF_INET6 when BPF_F_TUNINFO_IPV6
 * is set in @flags, AF_INET otherwise.
 */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
        if (flags & BPF_F_TUNINFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/* Fill @to with the tunnel key taken from the skb's tunnel info.
 *
 * @skb:   packet whose tunnel info (skb_tunnel_info()) is read
 * @to:    caller buffer receiving the key
 * @size:  size of @to; sizeof(struct bpf_tunnel_key) or one of the three
 *         shorter layouts accepted in the switch below
 * @flags: only BPF_F_TUNINFO_IPV6 is accepted
 *
 * Returns 0 on success. Returns -EINVAL when there is no tunnel info, when
 * @flags has unknown bits or when @size is not an accepted layout, and
 * -EPROTO when the tunnel's address family differs from the one selected
 * by @flags. On every error path the first @size bytes of @to are zeroed.
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
           u32, size, u64, flags)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        /* Full-size scratch key used when the caller passed a shorter layout. */
        u8 compat[sizeof(struct bpf_tunnel_key)];
        /* Remember the caller's buffer; 'to' may be redirected to compat. */
        void *to_orig = to;
        int err;

        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
                err = -EINVAL;
                goto err_clear;
        }
        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
                err = -EPROTO;
                goto err_clear;
        }
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                /* Preset the error used by the 'goto err_clear' exits below. */
                err = -EINVAL;
                switch (size) {
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                        goto set_compat;
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* Fixup deprecated structure layouts here, so we have
                         * a common path later on.
                         */
                        if (ip_tunnel_info_af(info) != AF_INET)
                                goto err_clear;
set_compat:
                        /* Build the key in the scratch buffer; only 'size'
                         * bytes are copied back to the caller at the end.
                         */
                        to = (struct bpf_tunnel_key *)compat;
                        break;
                default:
                        goto err_clear;
                }
        }

        to->tunnel_id = be64_to_cpu(info->key.tun_id);
        to->tunnel_tos = info->key.tos;
        to->tunnel_ttl = info->key.ttl;
        to->tunnel_ext = 0;

        if (flags & BPF_F_TUNINFO_IPV6) {
                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
                       sizeof(to->remote_ipv6));
                to->tunnel_label = be32_to_cpu(info->key.label);
        } else {
                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
                /* Zero the three remote_ipv6 words following the first one. */
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
                to->tunnel_label = 0;
        }

        /* Shorter layout: hand back only the part the caller asked for. */
        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
                memcpy(to_orig, to, size);

        return 0;
err_clear:
        memset(to_orig, 0, size);
        return err;
}

/* Prototype descriptor for bpf_skb_get_tunnel_key(): arg1 is the program
 * context, arg2/arg3 an uninitialized memory buffer and its size, arg4 an
 * unconstrained scalar. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
        .func                = bpf_skb_get_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Copy the tunnel options of @skb into @to.
 *
 * Returns the option length on success, with any bytes of @to beyond that
 * length zeroed. Returns -ENOENT when the skb has no tunnel info or the
 * TUNNEL_OPTIONS_PRESENT flag is not set, and -ENOMEM when @size is
 * smaller than the option length; on both errors all @size bytes of @to
 * are zeroed.
 */
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        int err = 0;

        if (unlikely(!info ||
                     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
                err = -ENOENT;
        else if (unlikely(size < info->options_len))
                err = -ENOMEM;

        if (err) {
                memset(to, 0, size);
                return err;
        }

        ip_tunnel_info_opts_get(to, info);
        if (size > info->options_len)
                memset(to + info->options_len, 0, size - info->options_len);

        return info->options_len;
}

/* Prototype descriptor for bpf_skb_get_tunnel_opt(): arg1 is the program
 * context, arg2/arg3 an uninitialized memory buffer and its size. Integer
 * return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
        .func                = bpf_skb_get_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/* Per-CPU metadata dst shared by the set_tunnel helpers. It starts out NULL
 * and is allocated on first use by bpf_get_skb_set_tunnel_proto().
 */
static struct metadata_dst __percpu *md_dst;

/* Attach this CPU's metadata dst to @skb and fill its tunnel info from
 * @from.
 *
 * @skb:   packet whose dst is replaced
 * @from:  tunnel key supplied by the program
 * @size:  size of @from; sizeof(struct bpf_tunnel_key) or one of three
 *         shorter layouts, which are zero-extended into a scratch copy
 * @flags: any combination of BPF_F_TUNINFO_IPV6, BPF_F_ZERO_CSUM_TX,
 *         BPF_F_DONT_FRAGMENT and BPF_F_SEQ_NUMBER
 *
 * Returns 0 on success, -EINVAL for unknown flags, an unsupported @size, a
 * non-zero tunnel_ext, or a non-zero tunnel_label without
 * BPF_F_TUNINFO_IPV6.
 */
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
        const u64 known_flags = BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
                                BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER;
        struct metadata_dst *md = this_cpu_ptr(md_dst);
        const bool ipv6 = flags & BPF_F_TUNINFO_IPV6;
        u8 compat[sizeof(struct bpf_tunnel_key)];
        struct ip_tunnel_info *info;

        if (unlikely(flags & ~known_flags))
                return -EINVAL;

        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                if (size != offsetof(struct bpf_tunnel_key, tunnel_label) &&
                    size != offsetof(struct bpf_tunnel_key, tunnel_ext) &&
                    size != offsetof(struct bpf_tunnel_key, remote_ipv6[1]))
                        return -EINVAL;

                /* Deprecated shorter layout: zero-extend it into a
                 * full-size copy so the code below sees one format only.
                 */
                memcpy(compat, from, size);
                memset(compat + size, 0, sizeof(compat) - size);
                from = (const struct bpf_tunnel_key *) compat;
        }

        if (unlikely(from->tunnel_ext || (!ipv6 && from->tunnel_label)))
                return -EINVAL;

        skb_dst_drop(skb);
        dst_hold((struct dst_entry *) md);
        skb_dst_set(skb, (struct dst_entry *) md);

        info = &md->u.tun_info;
        memset(info, 0, sizeof(*info));
        info->mode = IP_TUNNEL_INFO_TX;

        info->key.tun_flags = TUNNEL_KEY | TUNNEL_NOCACHE;
        if (!(flags & BPF_F_ZERO_CSUM_TX))
                info->key.tun_flags |= TUNNEL_CSUM;
        if (flags & BPF_F_DONT_FRAGMENT)
                info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
        if (flags & BPF_F_SEQ_NUMBER)
                info->key.tun_flags |= TUNNEL_SEQ;

        info->key.tun_id = cpu_to_be64(from->tunnel_id);
        info->key.tos = from->tunnel_tos;
        info->key.ttl = from->tunnel_ttl;

        if (!ipv6) {
                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
                return 0;
        }

        info->mode |= IP_TUNNEL_INFO_IPV6;
        memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
               sizeof(from->remote_ipv6));
        info->key.label = cpu_to_be32(from->tunnel_label) &
                          IPV6_FLOWLABEL_MASK;

        return 0;
}

/* Prototype descriptor for bpf_skb_set_tunnel_key(): arg1 is the program
 * context, arg2/arg3 a memory buffer and its size, arg4 an unconstrained
 * scalar. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
        .func                = bpf_skb_set_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Store @size bytes of tunnel options from @from in the skb's tunnel info.
 *
 * Only accepted when the skb's tunnel info is the one embedded in this
 * CPU's md_dst. Returns -EINVAL when it is not, or when @size is not a
 * multiple of sizeof(u32); -ENOMEM when @size exceeds IP_TUNNEL_OPTS_MAX;
 * 0 on success.
 */
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
           const u8 *, from, u32, size)
{
        const struct metadata_dst *md = this_cpu_ptr(md_dst);
        struct ip_tunnel_info *info = skb_tunnel_info(skb);

        if (unlikely(info != &md->u.tun_info))
                return -EINVAL;
        if (unlikely(size & (sizeof(u32) - 1)))
                return -EINVAL;
        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
                return -ENOMEM;

        ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);

        return 0;
}

/* Prototype descriptor for bpf_skb_set_tunnel_opt(): arg1 is the program
 * context, arg2/arg3 a memory buffer and its size. Integer return;
 * gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
        .func                = bpf_skb_set_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
        if (!md_dst) {
                struct metadata_dst __percpu *tmp;

                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
                                                METADATA_IP_TUNNEL,
                                                GFP_KERNEL);
                if (!tmp)
                        return NULL;
                if (cmpxchg(&md_dst, NULL, tmp))
                        metadata_dst_free_percpu(tmp);
        }

        switch (which) {
        case BPF_FUNC_skb_set_tunnel_key:
                return &bpf_skb_set_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return &bpf_skb_set_tunnel_opt_proto;
        default:
                return NULL;
        }
}

/* Test whether the socket owning @skb lies under the cgroup stored at
 * slot @idx of the array map @map.
 *
 * Returns the result of sk_under_cgroup_hierarchy() on success, -ENOENT
 * when the skb has no full socket, -E2BIG when @idx is outside the map and
 * -EAGAIN when the slot is empty.
 */
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
           u32, idx)
{
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct sock *sk = skb_to_full_sk(skb);
        struct cgroup *cgrp;

        if (!sk || !sk_fullsock(sk))
                return -ENOENT;

        if (unlikely(idx >= array->map.max_entries))
                return -E2BIG;

        cgrp = READ_ONCE(array->ptrs[idx]);
        if (unlikely(cgrp == NULL))
                return -EAGAIN;

        return sk_under_cgroup_hierarchy(sk, cgrp);
}

/* Prototype descriptor for bpf_skb_under_cgroup(): arg1 is the program
 * context, arg2 a constant map pointer, arg3 an unconstrained scalar.
 * Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
        .func                = bpf_skb_under_cgroup,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

#ifdef CONFIG_SOCK_CGROUP_DATA
/* Return the id of the cgroup recorded in the full socket behind @sk, or 0
 * when there is no full socket.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (!full || !sk_fullsock(full))
                return 0;

        return cgroup_id(sock_cgroup_ptr(&full->sk_cgrp_data));
}

/* Helper: cgroup id of the socket attached to @skb (see
 * __bpf_sk_cgroup_id() for the 0 return).
 */
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
        struct sock *sk = skb->sk;

        return __bpf_sk_cgroup_id(sk);
}

/* Prototype descriptor for bpf_skb_cgroup_id(): arg1 is the program
 * context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
        .func           = bpf_skb_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return the id of the ancestor at @ancestor_level of the cgroup recorded
 * in the full socket behind @sk. Returns 0 when there is no full socket or
 * cgroup_ancestor() finds no ancestor at that level.
 */
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
                                              int ancestor_level)
{
        struct cgroup *ancestor;

        sk = sk_to_full_sk(sk);
        if (!sk || !sk_fullsock(sk))
                return 0;

        ancestor = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data),
                                   ancestor_level);

        return ancestor ? cgroup_id(ancestor) : 0;
}

/* Helper: id of the ancestor cgroup at @ancestor_level for the socket
 * attached to @skb; forwards to __bpf_sk_ancestor_cgroup_id().
 */
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
           ancestor_level)
{
        struct sock *sk = skb->sk;

        return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}

/* Prototype descriptor for bpf_skb_ancestor_cgroup_id(): arg1 is the
 * program context, arg2 an unconstrained scalar. Integer return; gpl_only
 * is not set.
 */
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
        .func           = bpf_skb_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Helper: cgroup id for a socket passed directly; forwards to
 * __bpf_sk_cgroup_id().
 */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
        u64 id = __bpf_sk_cgroup_id(sk);

        return id;
}

/* Prototype descriptor for bpf_sk_cgroup_id(): arg1 is of type
 * ARG_PTR_TO_BTF_ID_SOCK_COMMON. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
        .func           = bpf_sk_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* Helper: ancestor cgroup id for a socket passed directly; forwards to
 * __bpf_sk_ancestor_cgroup_id().
 */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
        u64 id = __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);

        return id;
}

/* Prototype descriptor for bpf_sk_ancestor_cgroup_id(): arg1 is of type
 * ARG_PTR_TO_BTF_ID_SOCK_COMMON, arg2 an unconstrained scalar. Integer
 * return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
        .func           = bpf_sk_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type      = ARG_ANYTHING,
};
#endif

/* Copy callback handed to bpf_event_output() for XDP contexts: copies @len
 * bytes starting @off bytes into @src_buff to @dst_buff. Always returns 0.
 */
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
                                  unsigned long off, unsigned long len)
{
        const char *from = (const char *)src_buff + off;

        memcpy(dst_buff, from, len);

        return 0;
}

/* Emit an event made of @meta followed by leading packet bytes.
 *
 * The number of packet bytes to append is encoded in the
 * BPF_F_CTXLEN_MASK bits of @flags. Returns -EINVAL when @flags has bits
 * outside BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK, -EFAULT when @xdp is NULL
 * or the requested length exceeds data_end - data, otherwise the result of
 * bpf_event_output().
 */
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        const u64 allowed = BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK;
        u64 xdp_size;

        if (unlikely(flags & ~allowed))
                return -EINVAL;

        xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
        if (unlikely(!xdp))
                return -EFAULT;
        if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, xdp->data,
                                xdp_size, bpf_xdp_copy);
}

/* Prototype descriptor for bpf_xdp_event_output() when called with the
 * program context as arg1: arg2 is a constant map pointer, arg3 an
 * unconstrained scalar, arg4/arg5 a memory pointer and its size (zero
 * allowed). Integer return; gpl_only is set.
 */
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BTF id list holding the single id of struct xdp_buff; referenced by
 * bpf_xdp_output_proto below as the type of arg1.
 */
BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)

/* Second prototype descriptor for bpf_xdp_event_output(): identical to
 * bpf_xdp_event_output_proto except that arg1 is a BTF-typed pointer
 * (struct xdp_buff) instead of the program context. Not static, so it is
 * visible outside this file.
 */
const struct bpf_func_proto bpf_xdp_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Helper: cookie of the socket attached to @skb, or 0 when the skb has no
 * socket.
 */
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
        struct sock *sk = skb->sk;

        if (!sk)
                return 0;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie(): arg1 is the program
 * context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
        .func           = bpf_get_socket_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Helper: cookie of the socket referenced by a sock_addr context. */
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock_addr(): arg1 is the
 * program context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
        .func                = bpf_get_socket_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Helper: cookie of the socket that is itself the program context. */
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
        struct sock *sk = ctx;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock(): arg1 is the
 * program context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
        .func                = bpf_get_socket_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Helper: cookie of the socket referenced by a sock_ops context. */
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock_ops(): arg1 is the
 * program context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
        .func                = bpf_get_socket_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Return the cookie of the network namespace of @sk, or of init_net when
 * @sk is NULL. Without CONFIG_NET_NS the result is always 0.
 */
static u64 __bpf_get_netns_cookie(struct sock *sk)
{
#ifdef CONFIG_NET_NS
        struct net *net = sk ? sk->sk_net.net : &init_net;

        return __net_gen_cookie(net);
#else
        return 0;
#endif
}

/* Helper: netns cookie for a socket context; @ctx may be NULL, in which
 * case __bpf_get_netns_cookie() falls back to init_net.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
        struct sock *sk = ctx;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sock(): arg1 is the
 * program context or NULL. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
        .func                = bpf_get_netns_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Helper: netns cookie for a sock_addr context; a NULL @ctx is treated
 * like a NULL socket.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sock_addr(): arg1 is the
 * program context or NULL. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .func                = bpf_get_netns_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Helper: uid of the full socket behind @skb, translated through the user
 * namespace of the socket's network namespace. Returns overflowuid when
 * the skb has no full socket.
 */
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
        struct sock *sk = sk_to_full_sk(skb->sk);
        struct net *net;
        kuid_t kuid;

        if (!sk || !sk_fullsock(sk))
                return overflowuid;

        net = sock_net(sk);
        kuid = sock_net_uid(net, sk);

        return from_kuid_munged(net->user_ns, kuid);
}

/* Prototype descriptor for bpf_get_socket_uid(): arg1 is the program
 * context. Integer return; gpl_only is not set.
 */
static const struct bpf_func_proto bpf_get_socket_uid_proto = {
        .func           = bpf_get_socket_uid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        char devname[IFNAMSIZ];
        int val, valbool;
        struct net *net;
        int ifindex;
        int ret = 0;

        if (!sk_fullsock(sk))
                return -EINVAL;

        sock_owned_by_me(sk);

        if (level == SOL_SOCKET) {
                if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
                        return -EINVAL;
                val = *((int *)optval);
                valbool = val ? 1 : 0;

                /* Only some socketops are supported */
                switch (optname) {
                case SO_RCVBUF:
                        val = min_t(u32, val, READ_ONCE(sysctl_rmem_max));
                        val = min_t(int, val, INT_MAX / 2);
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        WRITE_ONCE(sk->sk_rcvbuf,
                                   max_t(int, val * 2, SOCK_MIN_RCVBUF));
                        break;
                case SO_SNDBUF:
                        val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
                        val = min_t(int, val, INT_MAX / 2);
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        WRITE_ONCE(sk->sk_sndbuf,
                                   max_t(int, val * 2, SOCK_MIN_SNDBUF));
                        break;
                case SO_MAX_PACING_RATE: /* 32bit version */
                        if (val != ~0U)
                                cmpxchg(&sk->sk_pacing_status,
                                        SK_PACING_NONE,
                                        SK_PACING_NEEDED);
                        sk->sk_max_pacing_rate = (val == ~0U) ?
                                                 ~0UL : (unsigned int)val;
                        sk->sk_pacing_rate = min(sk->sk_pacing_rate,
                                                 sk->sk_max_pacing_rate);
                        break;
                case SO_PRIORITY:
                        sk->sk_priority = val;
                        break;
                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                        break;
                case SO_MARK:
                        if (sk->sk_mark != val) {
                                sk->sk_mark = val;
                                sk_dst_reset(sk);
                        }
                        break;
                case SO_BINDTODEVICE:
                        optlen = min_t(long, optlen, IFNAMSIZ - 1);
                        strncpy(devname, optval, optlen);
                        devname[optlen] = 0;

                        ifindex = 0;
                        if (devname[0] != '\0') {
                                struct net_device *dev;

                                ret = -ENODEV;

                                net = sock_net(sk);
                                dev = dev_get_by_name(net, devname);
                                if (!dev)
                                        break;
                                ifindex = dev->ifindex;
                                dev_put(dev);
                        }
                        ret = sock_bindtoindex(sk, ifindex, false);
                        break;
                case SO_KEEPALIVE:
                        if (sk->sk_prot->keepalive)
                                sk->sk_prot->keepalive(sk, valbool);
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;
                default:
                        ret = -EINVAL;
                }
#ifdef CONFIG_INET
        } else if (level == SOL_IP) {
                if (optlen != sizeof(int) || sk->sk_family != AF_INET)
                        return -EINVAL;

                val = *((int *)optval);
                /* Only some options are supported */
                switch (optname) {
                case IP_TOS:
                        if (val < -1 || val > 0xff) {
                                ret = -EINVAL;
                        } else {
                                struct inet_sock *inet = inet_sk(sk);

                                if (val == -1)
                                        val = 0;
                                inet->tos = val;
                        }
                        break;
                default:
                        ret = -EINVAL;
                }
#if IS_ENABLED(CONFIG_IPV6)
        } else if (level == SOL_IPV6) {
                if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
                        return -EINVAL;

                val = *((int *)optval);
                /* Only some options are supported */
                switch (optname) {
                case IPV6_TCLASS:
                        if (val < -1 || val > 0xff) {
                                ret = -EINVAL;
                        } else {
                                struct ipv6_pinfo *np = inet6_sk(sk);

                                if (val == -1)
                                        val = 0;
                                np->tclass = val;
                        }
                        break;
                default:
                        ret = -EINVAL;
                }
#endif
        } else if (level == SOL_TCP &&
                   sk->sk_prot->setsockopt == tcp_setsockopt) {
                if (optname == TCP_CONGESTION) {
                        char name[TCP_CA_NAME_MAX];

                        strncpy(name, optval, min_t(long, optlen,
                                                    TCP_CA_NAME_MAX-1));
                        name[TCP_CA_NAME_MAX-1] = 0;
                        ret = tcp_set_congestion_control(sk, name, false, true);
                } else {
                        struct inet_connection_sock *icsk = inet_csk(sk);
                        struct tcp_sock *tp = tcp_sk(sk);
                        unsigned long timeout;

                        if (optlen != sizeof(int))
                                return -EINVAL;

                        val = *((int *)optval);
                        /* Only some options are supported */
                        switch (optname) {
                        case TCP_BPF_IW:
                                if (val <= 0 || tp->data_segs_out > tp->syn_data)
                                        ret = -EINVAL;
                                else
                                        tp->snd_cwnd = val;
                                break;
                        case TCP_BPF_SNDCWND_CLAMP:
                                if (val <= 0) {
                                        ret = -EINVAL;
                                } else {
                                        tp->snd_cwnd_clamp = val;
                                        tp->snd_ssthresh = val;
                                }
                                break;
                        case TCP_BPF_DELACK_MAX:
                                timeout = usecs_to_jiffies(val);
                                if (timeout > TCP_DELACK_MAX ||
                                    timeout < TCP_TIMEOUT_MIN)
                                        return -EINVAL;
                                inet_csk(sk)->icsk_delack_max = timeout;
                                break;
                        case TCP_BPF_RTO_MIN:
                                timeout = usecs_to_jiffies(val);
                                if (timeout > TCP_RTO_MIN ||
                                    timeout < TCP_TIMEOUT_MIN)
                                        return -EINVAL;
                                inet_csk(sk)->icsk_rto_min = timeout;
                                break;
                        case TCP_SAVE_SYN:
                                if (val < 0 || val > 1)
                                        ret = -EINVAL;
                                else
                                        tp->save_syn = val;
                                break;
                        case TCP_KEEPIDLE:
                                ret = tcp_sock_set_keepidle_locked(sk, val);
                                break;
                        case TCP_KEEPINTVL:
                                if (val < 1 || val > MAX_TCP_KEEPINTVL)
                                        ret = -EINVAL;
                                else
                                        tp->keepalive_intvl = val * HZ;
                                break;
                        case TCP_KEEPCNT:
                                if (val < 1 || val > MAX_TCP_KEEPCNT)
                                        ret = -EINVAL;
                                else
                                        tp->keepalive_probes = val;
                                break;
                        case TCP_SYNCNT:
                                if (val < 1 || val > MAX_TCP_SYNCNT)
                                        ret = -EINVAL;
                                else
                                        icsk->icsk_syn_retries = val;
                                break;
                        case TCP_USER_TIMEOUT:
                                if (val < 0)
                                        ret = -EINVAL;
                                else
                                        icsk->icsk_user_timeout = val;
                                break;
                        case TCP_NOTSENT_LOWAT:
                                tp->notsent_lowat = val;
                                sk->sk_write_space(sk);
                                break;
                        default:
                                ret = -EINVAL;
                        }
                }
#endif
        } else {
                ret = -EINVAL;
        }
        return ret;
}

/* Shared implementation behind the bpf_*_getsockopt() helpers.
 *
 * Copies one socket option of @sk into @optval, which is @optlen bytes
 * long.  Handled options: SOL_TCP TCP_CONGESTION and TCP_SAVED_SYN,
 * SOL_IP IP_TOS and, when IPv6 is enabled, SOL_IPV6 IPV6_TCLASS.  Nothing
 * is handled when CONFIG_INET is off.
 *
 * Returns 0 on success.  On every failure the whole buffer is zeroed and
 * -EINVAL is returned.
 */
static int _bpf_getsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (!sk_fullsock(sk))
                goto err_clear;

        sock_owned_by_me(sk);

#ifdef CONFIG_INET
        if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
                if (optname == TCP_CONGESTION) {
                        struct inet_connection_sock *icsk = inet_csk(sk);

                        /* need room for at least one char plus the NUL */
                        if (optlen <= 1 || !icsk->icsk_ca_ops)
                                goto err_clear;
                        strncpy(optval, icsk->icsk_ca_ops->name, optlen);
                        /* terminate explicitly in case the name was cut */
                        optval[optlen - 1] = '\0';
                } else if (optname == TCP_SAVED_SYN) {
                        struct tcp_sock *tp = tcp_sk(sk);

                        if (optlen <= 0 || !tp->saved_syn ||
                            optlen > tcp_saved_syn_len(tp->saved_syn))
                                goto err_clear;
                        memcpy(optval, tp->saved_syn->data, optlen);
                } else {
                        goto err_clear;
                }
        } else if (level == SOL_IP) {
                struct inet_sock *inet = inet_sk(sk);

                if (sk->sk_family != AF_INET || optlen != sizeof(int))
                        goto err_clear;

                /* IP_TOS is the only option handled at this level */
                if (optname != IP_TOS)
                        goto err_clear;

                *((int *)optval) = (int)inet->tos;
#if IS_ENABLED(CONFIG_IPV6)
        } else if (level == SOL_IPV6) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                if (sk->sk_family != AF_INET6 || optlen != sizeof(int))
                        goto err_clear;

                /* IPV6_TCLASS is the only option handled at this level */
                if (optname != IPV6_TCLASS)
                        goto err_clear;

                *((int *)optval) = (int)np->tclass;
#endif
        } else {
                goto err_clear;
        }
        return 0;
#endif
err_clear:
        memset(optval, 0, optlen);
        return -EINVAL;
}

/* setsockopt helper for sock_addr programs: applies the option to the
 * socket attached to @ctx through the shared _bpf_setsockopt() backend
 * and returns its result.
 */
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_addr_setsockopt(): arg1 is the program
 * context, arg2/arg3 (level, optname) are declared ARG_ANYTHING, and
 * arg4/arg5 (optval, optlen) are a memory pointer with its constant size.
 * The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
        .func                = bpf_sock_addr_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* getsockopt helper for sock_addr programs: reads the option from the
 * socket attached to @ctx through the shared _bpf_getsockopt() backend
 * and returns its result.
 */
BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_addr_getsockopt(): arg1 is the program
 * context, arg2/arg3 (level, optname) are declared ARG_ANYTHING, and
 * arg4/arg5 (optval, optlen) are declared ARG_PTR_TO_UNINIT_MEM with its
 * constant size.  The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
        .func                = bpf_sock_addr_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* setsockopt helper for sock_ops programs: applies the option to the
 * socket attached to @bpf_sock through the shared _bpf_setsockopt()
 * backend and returns its result.
 */
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = bpf_sock->sk;

        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_ops_setsockopt(): arg1 is the program
 * context, arg2/arg3 (level, optname) are declared ARG_ANYTHING, and
 * arg4/arg5 (optval, optlen) are a memory pointer with its constant size.
 * The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
        .func                = bpf_sock_ops_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Locate the SYN headers requested by @optname (TCP_BPF_SYN,
 * TCP_BPF_SYN_IP or TCP_BPF_SYN_MAC; any other value is treated as
 * TCP_BPF_SYN_MAC).
 *
 * The headers are taken from bpf_sock->syn_skb when it is set, otherwise
 * from the SYN saved on the socket.  On success *@start points at the first
 * requested header and the total length of the requested headers is
 * returned.  Returns -ENOENT, leaving *@start untouched, when there is no
 * saved SYN or when the MAC header is requested but was not saved.
 */
static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
                                int optname, const u8 **start)
{
        struct sk_buff *syn_skb = bpf_sock->syn_skb;
        struct saved_syn *saved_syn;
        struct sock *sk;

        if (syn_skb) {
                /* sk is a request_sock here */
                switch (optname) {
                case TCP_BPF_SYN:
                        *start = syn_skb->data;
                        return tcp_hdrlen(syn_skb);
                case TCP_BPF_SYN_IP:
                        *start = skb_network_header(syn_skb);
                        return skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                default:
                        /* optname == TCP_BPF_SYN_MAC */
                        *start = skb_mac_header(syn_skb);
                        return skb_mac_header_len(syn_skb) +
                                skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                }
        }

        sk = bpf_sock->sk;

        /* On a synack retransmit bpf_sock->syn_skb is not available, so
         * the only source left is the SYN saved on the request sock (if
         * it was saved at all).
         */
        saved_syn = sk->sk_state == TCP_NEW_SYN_RECV ?
                        inet_reqsk(sk)->saved_syn : tcp_sk(sk)->saved_syn;
        if (!saved_syn)
                return -ENOENT;

        switch (optname) {
        case TCP_BPF_SYN:
                *start = saved_syn->data + saved_syn->mac_hdrlen +
                        saved_syn->network_hdrlen;
                return saved_syn->tcp_hdrlen;
        case TCP_BPF_SYN_IP:
                *start = saved_syn->data + saved_syn->mac_hdrlen;
                return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen;
        default:
                /* optname == TCP_BPF_SYN_MAC */

                /* TCP_SAVE_SYN may not have saved the mac hdr */
                if (!saved_syn->mac_hdrlen)
                        return -ENOENT;

                *start = saved_syn->data;
                return saved_syn->mac_hdrlen + saved_syn->network_hdrlen +
                        saved_syn->tcp_hdrlen;
        }
}

/* getsockopt helper for sock_ops programs.
 *
 * SOL_TCP options in the range TCP_BPF_SYN..TCP_BPF_SYN_MAC are served from
 * the SYN headers found by bpf_sock_ops_get_syn(): as many bytes as fit are
 * copied into @optval and the rest of the buffer is zeroed.  The return
 * value is then the header length, -ENOSPC if the headers were truncated,
 * or the error from bpf_sock_ops_get_syn().  Every other request is handed
 * to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        int ret, copy_len = 0;
        const u8 *start;

        if (!IS_ENABLED(CONFIG_INET) || level != SOL_TCP ||
            optname < TCP_BPF_SYN || optname > TCP_BPF_SYN_MAC)
                return _bpf_getsockopt(bpf_sock->sk, level, optname,
                                       optval, optlen);

        ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
        if (ret > 0) {
                if (optlen < ret) {
                        /* buffer too small: copy what fits, report it */
                        copy_len = optlen;
                        ret = -ENOSPC;
                } else {
                        copy_len = ret;
                }

                memcpy(optval, start, copy_len);
        }

        /* Zero out unused buffer at the end */
        memset(optval + copy_len, 0, optlen - copy_len);

        return ret;
}

/* Prototype descriptor for bpf_sock_ops_getsockopt(): arg1 is the program
 * context, arg2/arg3 (level, optname) are declared ARG_ANYTHING, and
 * arg4/arg5 (optval, optlen) are declared ARG_PTR_TO_UNINIT_MEM with its
 * constant size.  The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
        .func                = bpf_sock_ops_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Store the sock_ops callback flags requested in @argval on the TCP socket
 * attached to @bpf_sock.  Only bits in BPF_SOCK_OPS_ALL_CB_FLAGS are stored.
 *
 * Returns the bits of @argval that fall outside BPF_SOCK_OPS_ALL_CB_FLAGS
 * (0 when all requested bits were accepted), or -EINVAL when CONFIG_INET is
 * off or the socket is not a full socket.
 */
BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
           int, argval)
{
        struct sock *sk = bpf_sock->sk;

        if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
                return -EINVAL;

        tcp_sk(sk)->bpf_sock_ops_cb_flags = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

        return argval & ~BPF_SOCK_OPS_ALL_CB_FLAGS;
}

/* Prototype descriptor for bpf_sock_ops_cb_flags_set(): arg1 is the program
 * context and arg2 (argval) is declared ARG_ANYTHING.  The helper returns an
 * integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
        .func                = bpf_sock_ops_cb_flags_set,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Table of IPv6 entry points that BPF helpers in this file call indirectly
 * (for example ->inet6_bind() from bpf_bind()).  It has no initializer here,
 * so it starts out NULL; presumably the IPv6 code installs it when it is
 * loaded -- confirm against the IPv6 side.
 */
const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
EXPORT_SYMBOL_GPL(ipv6_bpf_stub);

/* Bind the socket attached to @ctx to @addr on behalf of a BPF program.
 *
 * AF_INET addresses go to __inet_bind(); AF_INET6 addresses (when IPv6 is
 * enabled) go to ipv6_bpf_stub->inet6_bind().  A zero port additionally
 * sets BIND_FORCE_ADDRESS_NO_PORT.
 *
 * Returns the result of the bind call, -EINVAL when @addr_len is too short
 * for the address family, or -EAFNOSUPPORT for any other family (and always
 * when CONFIG_INET is off).
 */
BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
           int, addr_len)
{
#ifdef CONFIG_INET
        struct sock *sk = ctx->sk;
        u32 flags = BIND_FROM_BPF;

        /* sa_family must be readable before it can be inspected */
        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        switch (addr->sa_family) {
        case AF_INET:
                if (addr_len < sizeof(struct sockaddr_in))
                        return -EINVAL;
                if (((struct sockaddr_in *)addr)->sin_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (addr_len < SIN6_LEN_RFC2133)
                        return -EINVAL;
                if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                /* ipv6_bpf_stub cannot be NULL, since it's called from
                 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
                 */
                return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
        default:
                break;
        }
#endif /* CONFIG_INET */

        return -EAFNOSUPPORT;
}

/* Prototype descriptor for bpf_bind(): arg1 is the program context and
 * arg2/arg3 (addr, addr_len) are a memory pointer with its constant size.
 * The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_bind_proto = {
        .func                = bpf_bind,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

#ifdef CONFIG_XFRM
/* Copy selected fields of the xfrm state at position @index of the skb's
 * security path into @to: reqid, spi, family and the remote address (taken
 * from the state's props.saddr).  @size must equal
 * sizeof(struct bpf_xfrm_state) and @flags must be zero.
 *
 * Returns 0 on success.  On failure (no security path, @index out of range,
 * non-zero @flags or wrong @size) @to is zeroed and -EINVAL is returned.
 */
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
           struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
        const struct sec_path *sp = skb_sec_path(skb);
        const struct xfrm_state *x;

        if (!sp)
                goto err_clear;
        if (unlikely(flags || index >= sp->len))
                goto err_clear;
        if (unlikely(size != sizeof(*to)))
                goto err_clear;

        x = sp->xvec[index];

        to->reqid = x->props.reqid;
        to->spi = x->id.spi;
        to->family = x->props.family;
        to->ext = 0;

        if (to->family != AF_INET6) {
                to->remote_ipv4 = x->props.saddr.a4;
                /* clear the three remaining words of the address field */
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
        } else {
                memcpy(to->remote_ipv6, x->props.saddr.a6,
                       sizeof(to->remote_ipv6));
        }

        return 0;

err_clear:
        memset(to, 0, size);
        return -EINVAL;
}

/* Prototype descriptor for bpf_skb_get_xfrm_state(): arg1 is the program
 * context, arg2 (index) and arg5 (flags) are declared ARG_ANYTHING, and
 * arg3/arg4 (to, size) are declared ARG_PTR_TO_UNINIT_MEM with its constant
 * size.  The helper returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
        .func                = bpf_skb_get_xfrm_state,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};
#endif

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Fill in the link-layer part of a successful FIB lookup: the destination
 * MAC comes from @neigh, the source MAC from @dev, and both VLAN fields are
 * cleared.  Always returns 0.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
                                  const struct neighbour *neigh,
                                  const struct net_device *dev)
{
        params->h_vlan_proto = 0;
        params->h_vlan_TCI = 0;

        memcpy(params->smac, dev->dev_addr, ETH_ALEN);
        memcpy(params->dmac, neigh->ha, ETH_ALEN);

        return 0;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 backend for the bpf_*_fib_lookup() helpers.
 *
 * Builds a flowi4 from @params and looks it up: in a single table when
 * BPF_FIB_LOOKUP_DIRECT is set in @flags, through fib_lookup() otherwise.
 * For a unicast route whose next hop has a valid neighbour entry, @params
 * is rewritten with the egress ifindex, the route metric, the next-hop
 * address and the source/destination MAC addresses.
 *
 * When @check_mtu is true, params->tot_len is compared against the MTU of
 * the route found.
 *
 * Returns -ENODEV if params->ifindex names no device in @net, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the value
 * of bpf_fib_set_fwd_params() on success.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct fib_nh_common *nhc;
        struct in_device *in_dev;
        struct neighbour *neigh;
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
        int err;
        u32 mtu;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* verify forwarding is enabled on this interface */
        in_dev = __in_dev_get_rcu(dev);
        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for an output lookup and
         * the ingress device otherwise
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl4.flowi4_iif = 1;
                fl4.flowi4_oif = params->ifindex;
        } else {
                fl4.flowi4_iif = params->ifindex;
                fl4.flowi4_oif = 0;
        }
        fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;

        fl4.flowi4_proto = params->l4_protocol;
        fl4.daddr = params->ipv4_dst;
        fl4.saddr = params->ipv4_src;
        fl4.fl4_sport = params->sport;
        fl4.fl4_dport = params->dport;
        fl4.flowi4_multipath_hash = 0;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* use the device's l3mdev table if it has one, else main */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib_table *tb;

                tb = fib_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
        } else {
                fl4.flowi4_mark = 0;
                fl4.flowi4_secid = 0;
                fl4.flowi4_tun_key.tun_id = 0;
                fl4.flowi4_uid = sock_net_uid(net, NULL);

                err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
        }

        if (err) {
                /* map fib lookup errors to RTN_ type */
                if (err == -EINVAL)
                        return BPF_FIB_LKUP_RET_BLACKHOLE;
                if (err == -EHOSTUNREACH)
                        return BPF_FIB_LKUP_RET_UNREACHABLE;
                if (err == -EACCES)
                        return BPF_FIB_LKUP_RET_PROHIBIT;

                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        if (res.type != RTN_UNICAST)
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        if (fib_info_num_path(res.fi) > 1)
                fib_select_path(net, &res, &fl4, NULL);

        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
                if (params->tot_len > mtu)
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
        }

        nhc = res.nhc;

        /* do not handle lwt encaps right now */
        if (nhc->nhc_lwtstate)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        dev = nhc->nhc_dev;

        /* from here on @params carries results rather than inputs */
        params->rt_metric = res.fi->fib_priority;
        params->ifindex = dev->ifindex;

        /* xdp and cls_bpf programs are run in RCU-bh so
         * rcu_read_lock_bh is not needed here
         */
        if (likely(nhc->nhc_gw_family != AF_INET6)) {
                if (nhc->nhc_gw_family)
                        params->ipv4_dst = nhc->nhc_gw.ipv4;

                neigh = __ipv4_neigh_lookup_noref(dev,
                                                 (__force u32)params->ipv4_dst);
        } else {
                /* IPv6 gateway on an IPv4 route: report the next hop as an
                 * IPv6 address and switch params->family to match
                 */
                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;

                params->family = AF_INET6;
                *dst = nhc->nhc_gw.ipv6;
                neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        }

        if (!neigh || !(neigh->nud_state & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;

        return bpf_fib_set_fwd_params(params, neigh, dev);
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 backend for the bpf_*_fib_lookup() helpers.
 *
 * Builds a flowi6 from @params and looks it up through ipv6_stub: in a
 * single table when BPF_FIB_LOOKUP_DIRECT is set in @flags, through
 * fib6_lookup() otherwise.  For a unicast route whose next hop has a valid
 * neighbour entry, @params is rewritten with the egress ifindex, the route
 * metric, the gateway address (when the route has one) and the
 * source/destination MAC addresses.
 *
 * When @check_mtu is true, params->tot_len is compared against the MTU of
 * the route found.
 *
 * Returns -ENODEV if params->ifindex names no device in @net, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the value
 * of bpf_fib_set_fwd_params() on success.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
        struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
        struct fib6_result res = {};
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
        u32 mtu;

        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* forwarding must be enabled on this interface */
        idev = __in6_dev_get_safely(dev);
        if (unlikely(!idev || !idev->cnf.forwarding))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for an output lookup and
         * the ingress device otherwise; either way it is passed as oif to
         * the lookup calls below
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl6.flowi6_iif = 1;
                oif = fl6.flowi6_oif = params->ifindex;
        } else {
                oif = fl6.flowi6_iif = params->ifindex;
                fl6.flowi6_oif = 0;
                strict = RT6_LOOKUP_F_HAS_SADDR;
        }
        fl6.flowlabel = params->flowinfo;
        fl6.flowi6_scope = 0;
        fl6.flowi6_flags = 0;
        fl6.mp_hash = 0;

        fl6.flowi6_proto = params->l4_protocol;
        fl6.daddr = *dst;
        fl6.saddr = *src;
        fl6.fl6_sport = params->sport;
        fl6.fl6_dport = params->dport;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* use the device's l3mdev table if it has one, else main */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib6_table *tb;

                tb = ipv6_stub->fib6_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
                                                   strict);
        } else {
                fl6.flowi6_mark = 0;
                fl6.flowi6_secid = 0;
                fl6.flowi6_tun_key.tun_id = 0;
                fl6.flowi6_uid = sock_net_uid(net, NULL);

                err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
        }

        if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
                     res.f6i == net->ipv6.fib6_null_entry))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        switch (res.fib6_type) {
        /* only unicast is forwarded */
        case RTN_UNICAST:
                break;
        case RTN_BLACKHOLE:
                return BPF_FIB_LKUP_RET_BLACKHOLE;
        case RTN_UNREACHABLE:
                return BPF_FIB_LKUP_RET_UNREACHABLE;
        case RTN_PROHIBIT:
                return BPF_FIB_LKUP_RET_PROHIBIT;
        default:
                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
                                    fl6.flowi6_oif != 0, NULL, strict);

        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
                if (params->tot_len > mtu)
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
        }

        /* lwt encaps are not handled */
        if (res.nh->fib_nh_lws)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        /* dst aliases params->ipv6_dst, so this reports the gateway */
        if (res.nh->fib_nh_gw_family)
                *dst = res.nh->fib_nh_gw6;

        dev = res.nh->fib_nh_dev;
        params->rt_metric = res.f6i->fib6_metric;
        params->ifindex = dev->ifindex;

        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
         */
        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        if (!neigh || !(neigh->nud_state & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;

        return bpf_fib_set_fwd_params(params, neigh, dev);
}
#endif

/* FIB lookup helper for XDP programs.
 *
 * Dispatches on params->family to the IPv4 or IPv6 lookup in the namespace
 * of the receiving device, always with the MTU check enabled.
 *
 * Returns -EINVAL if @plen is smaller than struct bpf_fib_lookup or @flags
 * has bits other than BPF_FIB_LOOKUP_DIRECT / BPF_FIB_LOOKUP_OUTPUT,
 * -EAFNOSUPPORT for a family that is not handled, otherwise the result of
 * the per-family lookup.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        const u32 known_flags = BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT;

        if (plen < sizeof(*params) || (flags & ~known_flags))
                return -EINVAL;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
        default:
                break;
        }

        return -EAFNOSUPPORT;
}

/* Prototype descriptor for bpf_xdp_fib_lookup(): arg1 is the program
 * context, arg2/arg3 (params, plen) are a memory pointer with its constant
 * size, and arg4 (flags) is declared ARG_ANYTHING.  The helper returns an
 * integer and is GPL-only.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
        .func                = bpf_xdp_fib_lookup,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* FIB lookup helper for skb-based programs.
 *
 * Dispatches on params->family to the IPv4 or IPv6 lookup in the namespace
 * of skb->dev.  When params->tot_len is non-zero the per-family lookup
 * checks it against the route MTU; when it is zero and the lookup succeeds,
 * the skb itself is checked against the resulting egress device instead.
 *
 * Returns -EINVAL if @plen is smaller than struct bpf_fib_lookup or @flags
 * has bits other than BPF_FIB_LOOKUP_DIRECT / BPF_FIB_LOOKUP_OUTPUT,
 * -EAFNOSUPPORT for a family that is not handled, -ENODEV if the egress
 * device can no longer be found by index, otherwise the result of the
 * per-family lookup (downgraded to BPF_FIB_LKUP_RET_FRAG_NEEDED when the
 * skb is not forwardable on the egress device).
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        struct net *net = dev_net(skb->dev);
        int rc = -EAFNOSUPPORT;
        bool check_mtu = false;

        if (plen < sizeof(*params))
                return -EINVAL;

        if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
                return -EINVAL;

        if (params->tot_len)
                check_mtu = true;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
        }

        if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
                struct net_device *dev;

                /* When tot_len isn't provided by user, check skb
                 * against MTU of FIB lookup resulting net_device
                 */
                dev = dev_get_by_index_rcu(net, params->ifindex);
                /* The lookup by index can fail; never hand a NULL device
                 * to is_skb_forwardable().
                 */
                if (unlikely(!dev))
                        return -ENODEV;
                if (!is_skb_forwardable(dev, skb))
                        rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
        }

        return rc;
}

/* Prototype descriptor for bpf_skb_fib_lookup(): arg1 is the program
 * context, arg2/arg3 (params, plen) are a memory pointer with its constant
 * size, and arg4 (flags) is declared ARG_ANYTHING.  The helper returns an
 * integer and is GPL-only.
 */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
        .func                = bpf_skb_fib_lookup,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Apply the segment routing header @hdr (@len bytes) to @skb.
 *
 * BPF_LWT_ENCAP_SEG6_INLINE inserts the SRH into an existing IPv6 packet
 * via seg6_do_srh_inline(); BPF_LWT_ENCAP_SEG6 encapsulates the packet via
 * seg6_do_srh_encap().  On success the transport header offset is set just
 * past the IPv6 header and the next hop is looked up again.
 *
 * Returns -EINVAL for an SRH that fails validation or an unknown @type,
 * -EBADMSG for inline mode on a non-IPv6 skb, the error from the seg6 call,
 * or the result of seg6_lookup_nexthop().
 */
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
        struct ipv6_sr_hdr *srh = hdr;
        int err;

        if (!seg6_validate_srh(srh, len, false))
                return -EINVAL;

        if (type == BPF_LWT_ENCAP_SEG6_INLINE) {
                if (skb->protocol != htons(ETH_P_IPV6))
                        return -EBADMSG;

                err = seg6_do_srh_inline(skb, srh);
        } else if (type == BPF_LWT_ENCAP_SEG6) {
                skb_reset_inner_headers(skb);
                skb->encapsulation = 1;
                err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
        } else {
                return -EINVAL;
        }

        /* refresh the data pointers even when the seg6 call failed */
        bpf_compute_data_pointers(skb);
        if (err)
                return err;

        skb_set_transport_header(skb, sizeof(struct ipv6hdr));

        return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */

#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Forward an IP encapsulation request to bpf_lwt_push_ip_encap() and
 * return its result unchanged.
 */
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
                             bool ingress)
{
        int ret = bpf_lwt_push_ip_encap(skb, hdr, len, ingress);

        return ret;
}
#endif

/* Encapsulation helper for the LWT input hook.  Supports the two seg6
 * modes (when CONFIG_IPV6_SEG6_BPF is enabled) and IP encapsulation in the
 * ingress direction (when CONFIG_LWTUNNEL_BPF is enabled).  Any other
 * @type yields -EINVAL.
 */
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
           u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        case BPF_LWT_ENCAP_SEG6:
        case BPF_LWT_ENCAP_SEG6_INLINE:
                return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
        default:
                break;
        }

        return -EINVAL;
}

/* Encapsulation helper for the LWT xmit hook.  Only IP encapsulation in
 * the egress direction is supported, and only when CONFIG_LWTUNNEL_BPF is
 * enabled.  Any other @type yields -EINVAL.
 */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
           void *, hdr, u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
        default:
                break;
        }

        return -EINVAL;
}

/* Prototype descriptor for bpf_lwt_in_push_encap(): arg1 is the program
 * context, arg2 (type) is declared ARG_ANYTHING, and arg3/arg4 (hdr, len)
 * are a memory pointer with its constant size.  The helper returns an
 * integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
        .func                = bpf_lwt_in_push_encap,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM,
        .arg4_type        = ARG_CONST_SIZE
};

/* Prototype descriptor for bpf_lwt_xmit_push_encap(): arg1 is the program
 * context, arg2 (type) is declared ARG_ANYTHING, and arg3/arg4 (hdr, len)
 * are a memory pointer with its constant size.  The helper returns an
 * integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
        .func                = bpf_lwt_xmit_push_encap,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM,
        .arg4_type        = ARG_CONST_SIZE
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Copy @len bytes from @from into the packet at @offset (relative to
 * skb->data), restricted to the writable parts of the cached SRH.
 *
 * Two regions may be written:
 *  - the TLV area, which starts after the fixed SRH header and the
 *    (first_segment + 1) 16-byte segment entries and runs to the end of the
 *    SRH; a write there clears srh_state->valid;
 *  - the span from &srh->flags up to &srh->segments.
 * The segment list itself is not writable through this helper.
 *
 * Returns 0 on success, -EINVAL if no SRH is cached or none can be found
 * after the skb was made writable, -EFAULT if the range falls outside the
 * writable regions or the skb cannot be made writable.
 */
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_tlvs, *srh_end, *ptr;
        int srhoff = 0;

        if (srh == NULL)
                return -EINVAL;

        /* The fixed header must be counted as well, otherwise the TLV
         * window would start sizeof(*srh) bytes early and overlap the
         * tail of the last segment.
         */
        srh_tlvs = (void *)((char *)srh + sizeof(*srh) +
                            ((srh->first_segment + 1) << 4));
        srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);

        ptr = skb->data + offset;
        if (ptr >= srh_tlvs && ptr + len <= srh_end)
                srh_state->valid = false;
        else if (ptr < (void *)&srh->flags ||
                 ptr + len > (void *)&srh->segments)
                return -EFAULT;

        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;
        /* look the SRH up again and refresh the cached pointer before
         * writing through skb->data
         */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);

        memcpy(skb->data + offset, from, len);
        return 0;
}

/* Prototype descriptor for bpf_lwt_seg6_store_bytes(): arg1 is the program
 * context, arg2 (offset) is declared ARG_ANYTHING, and arg3/arg4 (from, len)
 * are a memory pointer with its constant size.  The helper returns an
 * integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
        .func                = bpf_lwt_seg6_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM,
        .arg4_type        = ARG_CONST_SIZE
};

static void bpf_update_srh_state(struct sk_buff *skb)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int srhoff = 0;

        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
                srh_state->srh = NULL;
        } else {
                srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
                srh_state->hdrlen = srh_state->srh->hdrlen << 3;
                srh_state->valid = true;
        }
}

/* Apply a seg6local-style @action to @skb on behalf of a BPF program.
 *
 * END_X takes an IPv6 next-hop address in @param; END_T and END_DT6 take an
 * int that is passed to seg6_lookup_nexthop(); END_B6 and END_B6_ENCAP take
 * an SRH to push (inline and encap mode respectively).  END_DT6 also strips
 * the outer headers up to the inner IPv6 header first.
 *
 * Returns -EBADMSG when the SRH is not valid or the packet cannot be
 * decapsulated, -EINVAL for a wrong @param_len or an unknown @action,
 * otherwise the result of the underlying seg6 operation.
 */
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
           u32, action, void *, param, u32, param_len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int hdroff = 0;
        int err;

        switch (action) {
        case SEG6_LOCAL_ACTION_END_X: {
                struct in6_addr *nhaddr = param;

                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(struct in6_addr))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, nhaddr, 0);
        }
        case SEG6_LOCAL_ACTION_END_T:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_DT6:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;

                /* locate the inner IPv6 header and drop what precedes it */
                if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
                        return -EBADMSG;
                if (!pskb_pull(skb, hdroff))
                        return -EBADMSG;

                skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                skb->encapsulation = 0;

                bpf_compute_data_pointers(skb);
                bpf_update_srh_state(skb);
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_B6:
        case SEG6_LOCAL_ACTION_END_B6_ENCAP: {
                u32 encap_type = action == SEG6_LOCAL_ACTION_END_B6 ?
                                 BPF_LWT_ENCAP_SEG6_INLINE :
                                 BPF_LWT_ENCAP_SEG6;

                /* a packet without a cached SRH is accepted here */
                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;

                err = bpf_push_seg6_encap(skb, encap_type, param, param_len);
                if (!err)
                        bpf_update_srh_state(skb);

                return err;
        }
        default:
                return -EINVAL;
        }
}

/* Prototype descriptor for bpf_lwt_seg6_action(): arg1 is the program
 * context, arg2 (action) is declared ARG_ANYTHING, and arg3/arg4 (param,
 * param_len) are a memory pointer with its constant size.  The helper
 * returns an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
        .func                = bpf_lwt_seg6_action,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM,
        .arg4_type        = ARG_CONST_SIZE
};

/* Grow (@len > 0) or shrink (@len < 0) the cached SRH of @skb by |@len|
 * bytes at @offset (relative to skb->data).  @offset must lie between the
 * start of the TLV area and the end of the SRH, and a shrink must not
 * reach past the end of the SRH.
 *
 * On success the IPv6 payload length is recomputed from skb->len, the
 * cached SRH pointer and length are updated, and srh_state->valid is
 * cleared.
 *
 * Returns 0 on success, -EINVAL when no SRH is cached or none can be found
 * afterwards, -EFAULT for an out-of-range @offset/@len, or the error from
 * skb_cow_head(), bpf_skb_net_hdr_push() or bpf_skb_net_hdr_pop().
 */
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
           s32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_end, *srh_tlvs, *ptr;
        struct ipv6hdr *hdr;
        int srhoff = 0;
        int ret;

        if (unlikely(srh == NULL))
                return -EINVAL;

        /* TLVs follow the fixed header and the 16-byte segment entries */
        srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
                        ((srh->first_segment + 1) << 4));
        srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
                        srh_state->hdrlen);
        ptr = skb->data + offset;

        if (unlikely(ptr < srh_tlvs || ptr > srh_end))
                return -EFAULT;
        /* len is negative here, so ptr - len is the end of the removed span */
        if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
                return -EFAULT;

        if (len > 0) {
                ret = skb_cow_head(skb, len);
                if (unlikely(ret < 0))
                        return ret;

                ret = bpf_skb_net_hdr_push(skb, offset, len);
        } else {
                ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
        }

        /* refresh the data pointers even when the push/pop failed */
        bpf_compute_data_pointers(skb);
        if (unlikely(ret < 0))
                return ret;

        hdr = (struct ipv6hdr *)skb->data;
        hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));

        /* look the SRH up again and refresh the cached pointer */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
        srh_state->hdrlen += len;
        srh_state->valid = false;
        return 0;
}

/* Prototype for bpf_lwt_seg6_adjust_srh(): program context plus two
 * ARG_ANYTHING values (the helper's offset and len).  Returns an integer.
 */
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
        .func                = bpf_lwt_seg6_adjust_srh,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#ifdef CONFIG_INET
/* Look up a TCP or UDP socket matching @tuple in @net.
 *
 * @family selects which member of @tuple is read: AF_INET uses tuple->ipv4;
 * any other value uses tuple->ipv6, and only when IPv6 support is enabled
 * in the build.  @proto == IPPROTO_TCP uses the tcp_hashinfo lookups; every
 * other value takes the UDP path.
 *
 * Returns the socket found, or NULL when nothing matched or when the result
 * is neither reported as refcounted by the lookup nor flagged SOCK_RCU_FREE.
 */
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
                              int dif, int sdif, u8 family, u8 proto)
{
        /* Only the TCP lookups receive &refcounted; it stays false for UDP. */
        bool refcounted = false;
        struct sock *sk = NULL;

        if (family == AF_INET) {
                __be32 src4 = tuple->ipv4.saddr;
                __be32 dst4 = tuple->ipv4.daddr;

                if (proto == IPPROTO_TCP)
                        sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
                                           src4, tuple->ipv4.sport,
                                           dst4, tuple->ipv4.dport,
                                           dif, sdif, &refcounted);
                else
                        sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
                                               dst4, tuple->ipv4.dport,
                                               dif, sdif, &udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
                struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;

                /* Unlike the other lookups here, the IPv6 TCP lookup is
                 * handed the destination port after ntohs().
                 */
                if (proto == IPPROTO_TCP)
                        sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
                                            src6, tuple->ipv6.sport,
                                            dst6, ntohs(tuple->ipv6.dport),
                                            dif, sdif, &refcounted);
                /* The UDPv6 lookup is reached through ipv6_bpf_stub and is
                 * skipped when that pointer is NULL.
                 */
                else if (likely(ipv6_bpf_stub))
                        sk = ipv6_bpf_stub->udp6_lib_lookup(net,
                                                            src6, tuple->ipv6.sport,
                                                            dst6, tuple->ipv6.dport,
                                                            dif, sdif,
                                                            &udp_table, NULL);
#endif
        }

        /* A socket that is neither refcounted nor SOCK_RCU_FREE is not
         * returned to the caller; warn once and report no match.
         */
        if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                sk = NULL;
        }
        return sk;
}

/* Core socket lookup behind the bpf_sk*_lookup helpers.
 *
 * The address family is inferred from @len, which must equal the size of
 * either tuple->ipv4 or tuple->ipv6.  @flags must be zero.  @netns_id
 * selects the namespace: a value that is negative when viewed as s32 means
 * "use @caller_net"; otherwise it must not exceed S32_MAX and is resolved
 * with get_net_ns_by_id(), the result being released with put_net() once
 * the lookup is done.
 *
 * Returns whatever sk_lookup() found (a struct sock pointer), or NULL on
 * invalid arguments, an unknown namespace id, or no match.
 */
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                 u64 flags)
{
        const bool use_caller_net = (s32)netns_id < 0;
        struct sock *found;
        struct net *ns;
        u8 family;
        int sdif;

        if (len == sizeof(tuple->ipv4))
                family = AF_INET;
        else if (len == sizeof(tuple->ipv6))
                family = AF_INET6;
        else
                return NULL;

        if (unlikely(flags || (!use_caller_net && netns_id > S32_MAX)))
                return NULL;

        sdif = (family == AF_INET) ? inet_sdif(skb) : inet6_sdif(skb);

        if (use_caller_net)
                return sk_lookup(caller_net, tuple, ifindex, sdif, family,
                                 proto);

        ns = get_net_ns_by_id(caller_net, netns_id);
        if (unlikely(!ns))
                return NULL;

        found = sk_lookup(ns, tuple, ifindex, sdif, family, proto);
        put_net(ns);

        return found;
}

/* Like __bpf_skc_lookup(), but only ever hands back a full socket.
 *
 * The looked-up socket is mapped through sk_to_full_sk().  If that yields a
 * different socket (or nothing usable), the original one is released with
 * sock_gen_put(), and the replacement is returned only if it is flagged
 * SOCK_RCU_FREE.
 */
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                u64 flags)
{
        struct sock *skc, *full;

        skc = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                               netns_id, flags);
        if (!skc)
                return NULL;

        full = sk_to_full_sk(skc);
        if (!sk_fullsock(full))
                full = NULL;

        if (full == skc)
                return skc;

        /* sk_to_full_sk() may have returned skc->rsk_listener; drop the
         * original socket so that a request_sock is not leaked.
         */
        sock_gen_put(skc);

        /* The replacement is returned without bumping its refcount, which
         * is only acceptable for SOCK_RCU_FREE sockets.
         */
        if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                return NULL;
        }

        return full;
}

/* skb front end for __bpf_skc_lookup(): the calling namespace and interface
 * index come from skb->dev when one is attached, otherwise the namespace
 * comes from skb->sk and the interface index is 0.
 */
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
               u8 proto, u64 netns_id, u64 flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev ? dev_net(dev) : sock_net(skb->sk);
        int ifindex = dev ? dev->ifindex : 0;

        return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                                netns_id, flags);
}

/* skb front end for __bpf_sk_lookup(): returns a full socket or NULL.
 *
 * The calling namespace and interface index come from skb->dev when one is
 * attached; otherwise the namespace comes from skb->sk and the interface
 * index is 0.  The conversion to a full socket, including the release of a
 * non-full original, is left to __bpf_sk_lookup() so that this logic lives
 * in one place only.
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
              u8 proto, u64 netns_id, u64 flags)
{
        struct net *caller_net;
        int ifindex;

        if (skb->dev) {
                caller_net = dev_net(skb->dev);
                ifindex = skb->dev->ifindex;
        } else {
                caller_net = sock_net(skb->sk);
                ifindex = 0;
        }

        return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, proto,
                               netns_id, flags);
}

/* TCP lookup for skb programs via bpf_skc_lookup(); the resulting socket
 * pointer (possibly NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_skc_lookup_tcp(): context, tuple buffer with its size,
 * then two ARG_ANYTHING values (netns_id and flags).  Returns a sock_common
 * pointer or NULL.
 */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
        .func                = bpf_skc_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* TCP lookup for skb programs via bpf_sk_lookup(); the resulting socket
 * pointer (possibly NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_sk_lookup_tcp(): context, tuple buffer with its size,
 * then two ARG_ANYTHING values (netns_id and flags).  Returns a socket
 * pointer or NULL.
 */
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
        .func                = bpf_sk_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* UDP lookup for skb programs via bpf_sk_lookup(); the resulting socket
 * pointer (possibly NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_sk_lookup_udp(): context, tuple buffer with its size,
 * then two ARG_ANYTHING values (netns_id and flags).  Returns a socket
 * pointer or NULL.
 */
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
        .func                = bpf_sk_lookup_udp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Release @sk with sock_gen_put() when it is non-NULL and
 * sk_is_refcounted() reports true for it.  Always returns 0.
 */
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
        if (!sk)
                return 0;

        if (sk_is_refcounted(sk))
                sock_gen_put(sk);

        return 0;
}

/* Prototype for bpf_sk_release(): a single BTF-identified sock_common
 * pointer argument; returns an integer.
 */
static const struct bpf_func_proto bpf_sk_release_proto = {
        .func                = bpf_sk_release,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* UDP full-socket lookup for XDP programs.  The namespace and interface
 * index are taken from the receive queue's device; no skb is passed down.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_UDP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_xdp_sk_lookup_udp(): context, tuple buffer with its
 * size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
        .func           = bpf_xdp_sk_lookup_udp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* TCP sock_common lookup for XDP programs.  The namespace and interface
 * index are taken from the receive queue's device; no skb is passed down.
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                              IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_xdp_skc_lookup_tcp(): context, tuple buffer with its
 * size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * sock_common pointer or NULL.
 */
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
        .func           = bpf_xdp_skc_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* TCP full-socket lookup for XDP programs.  The namespace and interface
 * index are taken from the receive queue's device; no skb is passed down.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_xdp_sk_lookup_tcp(): context, tuple buffer with its
 * size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
        .func           = bpf_xdp_sk_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* TCP sock_common lookup for sock_addr programs.  The namespace is that of
 * ctx->sk, the interface index is 0 and no skb is passed down.
 */
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                              netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_sock_addr_skc_lookup_tcp(): context, tuple buffer with
 * its size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * sock_common pointer or NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
        .func                = bpf_sock_addr_skc_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* TCP full-socket lookup for sock_addr programs.  The namespace is that of
 * ctx->sk, the interface index is 0 and no skb is passed down.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                             netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_sock_addr_sk_lookup_tcp(): context, tuple buffer with
 * its size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
        .func                = bpf_sock_addr_sk_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* UDP full-socket lookup for sock_addr programs.  The namespace is that of
 * ctx->sk, the interface index is 0 and no skb is passed down.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_UDP,
                             netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype for bpf_sock_addr_sk_lookup_udp(): context, tuple buffer with
 * its size, then two ARG_ANYTHING values (netns_id and flags).  Returns a
 * socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
        .func                = bpf_sock_addr_sk_lookup_udp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Decide whether an access of @size bytes at offset @off into struct
 * bpf_tcp_sock is acceptable.
 *
 * The offset must be non-negative, lie before the end of icsk_retransmits
 * and be a multiple of @size.  bytes_received and bytes_acked must be
 * accessed as __u64; every other offset as __u32.  @type and @info are not
 * consulted.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;

        if (off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits))
                return false;

        if (off % size != 0)
                return false;

        if (off == offsetof(struct bpf_tcp_sock, bytes_received) ||
            off == offsetof(struct bpf_tcp_sock, bytes_acked))
                return size == sizeof(__u64);

        return size == sizeof(__u32);
}

/* Emit into @insn_buf the instruction that replaces a program's read of a
 * struct bpf_tcp_sock field, identified by si->off, with a load from the
 * kernel-side structure.
 *
 * Every handled offset produces one BPF_LDX_MEM using si->dst_reg and
 * si->src_reg; an offset without a matching case emits nothing.  @type,
 * @prog and @target_size are not used in this body.
 *
 * Returns the number of instructions written.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Load FIELD from struct tcp_sock, using that field's own size.  The build
 * fails if the kernel field is wider than its struct bpf_tcp_sock mirror.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct tcp_sock, FIELD)); \
        } while (0)

/* Same as above, but FIELD is read from struct inet_connection_sock. */
#define BPF_INET_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,        \
                                          FIELD) >                        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                        \
                                        struct inet_connection_sock,        \
                                        FIELD),                                \
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(                                \
                                        struct inet_connection_sock,        \
                                        FIELD));                        \
        } while (0)

        /* insn was initialised to insn_buf just above, so this early return
         * is not taken on entry.
         */
        if (insn > insn_buf)
                return insn - insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_tcp_sock, rtt_min):
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                /* rtt_min is a struct minmax; load the 32-bit 'v' member
                 * of the minmax_sample located at its start.
                 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      offsetof(struct minmax_sample, v));
                break;
        case offsetof(struct bpf_tcp_sock, snd_cwnd):
                BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
                break;
        case offsetof(struct bpf_tcp_sock, srtt_us):
                BPF_TCP_SOCK_GET_COMMON(srtt_us);
                break;
        case offsetof(struct bpf_tcp_sock, snd_ssthresh):
                BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
                break;
        case offsetof(struct bpf_tcp_sock, rcv_nxt):
                BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_nxt):
                BPF_TCP_SOCK_GET_COMMON(snd_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_una):
                BPF_TCP_SOCK_GET_COMMON(snd_una);
                break;
        case offsetof(struct bpf_tcp_sock, mss_cache):
                BPF_TCP_SOCK_GET_COMMON(mss_cache);
                break;
        case offsetof(struct bpf_tcp_sock, ecn_flags):
                BPF_TCP_SOCK_GET_COMMON(ecn_flags);
                break;
        case offsetof(struct bpf_tcp_sock, rate_delivered):
                BPF_TCP_SOCK_GET_COMMON(rate_delivered);
                break;
        case offsetof(struct bpf_tcp_sock, rate_interval_us):
                BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
                break;
        case offsetof(struct bpf_tcp_sock, packets_out):
                BPF_TCP_SOCK_GET_COMMON(packets_out);
                break;
        case offsetof(struct bpf_tcp_sock, retrans_out):
                BPF_TCP_SOCK_GET_COMMON(retrans_out);
                break;
        case offsetof(struct bpf_tcp_sock, total_retrans):
                BPF_TCP_SOCK_GET_COMMON(total_retrans);
                break;
        case offsetof(struct bpf_tcp_sock, segs_in):
                BPF_TCP_SOCK_GET_COMMON(segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_in):
                BPF_TCP_SOCK_GET_COMMON(data_segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, segs_out):
                BPF_TCP_SOCK_GET_COMMON(segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_out):
                BPF_TCP_SOCK_GET_COMMON(data_segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, lost_out):
                BPF_TCP_SOCK_GET_COMMON(lost_out);
                break;
        case offsetof(struct bpf_tcp_sock, sacked_out):
                BPF_TCP_SOCK_GET_COMMON(sacked_out);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_received):
                BPF_TCP_SOCK_GET_COMMON(bytes_received);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_acked):
                BPF_TCP_SOCK_GET_COMMON(bytes_acked);
                break;
        case offsetof(struct bpf_tcp_sock, dsack_dups):
                BPF_TCP_SOCK_GET_COMMON(dsack_dups);
                break;
        case offsetof(struct bpf_tcp_sock, delivered):
                BPF_TCP_SOCK_GET_COMMON(delivered);
                break;
        case offsetof(struct bpf_tcp_sock, delivered_ce):
                BPF_TCP_SOCK_GET_COMMON(delivered_ce);
                break;
        case offsetof(struct bpf_tcp_sock, icsk_retransmits):
                BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
                break;
        }

        return insn - insn_buf;
}

/* Return @sk (as an unsigned long) when it is a full socket whose protocol
 * is IPPROTO_TCP; otherwise return NULL.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype for bpf_tcp_sock(): takes a sock_common pointer and returns a
 * tcp_sock pointer or NULL.  Declared without 'static', unlike the
 * neighbouring prototypes.
 */
const struct bpf_func_proto bpf_tcp_sock_proto = {
        .func                = bpf_tcp_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_TCP_SOCK_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* Map @sk through sk_to_full_sk() and return the result (as an unsigned
 * long) only when it is in TCP_LISTEN state and flagged SOCK_RCU_FREE;
 * otherwise return NULL.
 */
BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (full->sk_state != TCP_LISTEN || !sock_flag(full, SOCK_RCU_FREE))
                return (unsigned long)NULL;

        return (unsigned long)full;
}

/* Prototype for bpf_get_listener_sock(): takes a sock_common pointer and
 * returns a socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_get_listener_sock_proto = {
        .func                = bpf_get_listener_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* Call INET_ECN_set_ce() on an IPv4 or IPv6 packet and return its result.
 *
 * Returns 0 without touching the packet when the protocol is neither
 * ETH_P_IP nor ETH_P_IPV6, when the linear area is shorter than the IP
 * header, or when the skb is cloned and its header is not writable.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
        __be16 proto = skb_protocol(skb, true);
        unsigned int hdr_len;

        if (proto == cpu_to_be16(ETH_P_IP))
                hdr_len = sizeof(struct iphdr);
        else if (proto == cpu_to_be16(ETH_P_IPV6))
                hdr_len = sizeof(struct ipv6hdr);
        else
                return 0;

        if (skb_headlen(skb) < hdr_len)
                return 0;

        if (skb_cloned(skb) && !skb_clone_writable(skb, hdr_len))
                return 0;

        return INET_ECN_set_ce(skb);
}

/* Decide whether an access of @size bytes at offset @off into struct
 * bpf_xdp_sock is acceptable: the offset must be non-negative, lie before
 * the end of queue_id and be a multiple of @size, and the access must be
 * __u32 wide.  @type and @info are not consulted.
 */
bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;

        if (off >= offsetofend(struct bpf_xdp_sock, queue_id))
                return false;

        if (off % size != 0)
                return false;

        return size == sizeof(__u32);
}

/* Emit into @insn_buf the instruction that replaces a program's read of a
 * struct bpf_xdp_sock field, identified by si->off, with a load from
 * struct xdp_sock.  Only queue_id is handled; any other offset emits
 * nothing.  @type, @prog and @target_size are not used in this body.
 *
 * Returns the number of instructions written.
 */
u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Load FIELD from struct xdp_sock, using that field's own size.  The build
 * fails if the kernel field is wider than its struct bpf_xdp_sock mirror.
 */
#define BPF_XDP_SOCK_GET(FIELD)                                                \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) >        \
                             sizeof_field(struct bpf_xdp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct xdp_sock, FIELD)); \
        } while (0)

        switch (si->off) {
        case offsetof(struct bpf_xdp_sock, queue_id):
                BPF_XDP_SOCK_GET(queue_id);
                break;
        }

        return insn - insn_buf;
}

/* Prototype for bpf_skb_ecn_set_ce(): takes only the program context and
 * returns an integer.
 */
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
        .func           = bpf_skb_ecn_set_ce,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Check whether the TCP segment described by @iph and @th acknowledges a
 * valid SYN cookie for the listening TCP socket @sk.
 *
 * Returns:
 *   0                 the cookie check returned a positive value
 *   -EINVAL           @sk is NULL, @th_len or @iph_len is too small, @sk is
 *                     not a TCP socket in TCP_LISTEN, the tcp_syncookies
 *                     sysctl is off, or the IP version does not fit the
 *                     socket family
 *   -ENOENT           the segment is not an ACK or has RST/SYN set,
 *                     tcp_synq_no_recent_overflow() reports true, or the
 *                     cookie check did not return a positive value
 *   -EPROTONOSUPPORT  the IP version is not handled in this build
 *   -ENOTSUPP         built without CONFIG_SYN_COOKIES
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        u32 cookie;
        int ret;

        if (unlikely(!sk || th_len < sizeof(*th)))
                return -EINVAL;

        /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -EINVAL;

        if (!th->ack || th->rst || th->syn)
                return -ENOENT;

        /* The version field read below needs at least a struct iphdr. */
        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        if (tcp_synq_no_recent_overflow(sk))
                return -ENOENT;

        /* The value handed to the cookie check is the ACK number minus 1. */
        cookie = ntohl(th->ack_seq) - 1;

        /* Both struct iphdr and struct ipv6hdr have the version field at the
         * same offset so we can cast to the shorter header (struct iphdr).
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                /* An IPv4 packet is refused on an IPv6-only socket. */
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        if (ret > 0)
                return 0;

        return -ENOENT;
#else
        return -ENOTSUPP;
#endif
}

/* Prototype for bpf_tcp_check_syncookie(): a BTF-identified sock_common
 * pointer followed by two (buffer, size) pairs for the IP and TCP headers.
 * Returns an integer; restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
        .func                = bpf_tcp_check_syncookie,
        .gpl_only        = true,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Generate a SYN cookie for the SYN described by @iph and @th, addressed
 * to the listening TCP socket @sk.
 *
 * On success the return value holds the cookie in the low 32 bits and the
 * MSS in the bits above them.  Error returns:
 *   -EINVAL           @sk is NULL, @th_len is too small or differs from
 *                     th->doff * 4, @sk is not a TCP socket in TCP_LISTEN,
 *                     the segment is not a SYN or has ACK/FIN/RST set,
 *                     @iph_len is too small, or the IP version does not
 *                     fit the socket family
 *   -ENOENT           the tcp_syncookies sysctl is off, or the generator
 *                     returned an MSS of 0
 *   -EPROTONOSUPPORT  the IP version is not handled in this build
 *   -EOPNOTSUPP       built without CONFIG_SYN_COOKIES
 */
BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        u32 cookie;
        u16 mss;

        /* th_len is validated against sizeof(*th) before th->doff is read. */
        if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -ENOENT;

        if (!th->syn || th->ack || th->fin || th->rst)
                return -EINVAL;

        /* The version field read below needs at least a struct iphdr. */
        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        /* Both struct iphdr and struct ipv6hdr have the version field at the
         * same offset so we can cast to the shorter header (struct iphdr).
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                /* An IPv4 packet is refused on an IPv6-only socket. */
                if (sk->sk_family == AF_INET6 && sk->sk_ipv6only)
                        return -EINVAL;

                mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }
        if (mss == 0)
                return -ENOENT;

        /* Pack both results into one value: cookie low, MSS above bit 31. */
        return cookie | ((u64)mss << 32);
#else
        return -EOPNOTSUPP;
#endif /* CONFIG_SYN_COOKIES */
}

/* Prototype for bpf_tcp_gen_syncookie(): a BTF-identified sock_common
 * pointer followed by two (buffer, size) pairs for the IP and TCP headers.
 * Returns an integer; restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
        .func                = bpf_tcp_gen_syncookie,
        .gpl_only        = true, /* __cookie_v*_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Attach @sk to @skb, replacing any previous owner.
 *
 * Returns 0 on success, or:
 *   -EINVAL           @sk is NULL or @flags is non-zero
 *   -EOPNOTSUPP       @skb is not at TC ingress, or @sk is unhashed
 *   -ENETUNREACH      @sk and skb->dev are in different network namespaces
 *   -ESOCKTNOSUPPORT  @sk is a full socket with sk_reuseport set
 *   -ENOENT           @sk is refcounted and its refcount was already zero
 */
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
        if (flags != 0 || !sk)
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EOPNOTSUPP;

        if (unlikely(sock_net(sk) != dev_net(skb->dev)))
                return -ENETUNREACH;

        if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
                return -ESOCKTNOSUPPORT;

        if (sk_unhashed(sk))
                return -EOPNOTSUPP;

        /* A refcounted socket gets an extra reference here; if its refcount
         * already hit zero, the assignment is refused.
         */
        if (sk_is_refcounted(sk)) {
                if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                        return -ENOENT;
        }

        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_pfree;

        return 0;
}

/* Prototype for bpf_sk_assign(): program context, a BTF-identified
 * sock_common pointer, and an ARG_ANYTHING flags value.  Returns an
 * integer.
 */
static const struct bpf_func_proto bpf_sk_assign_proto = {
        .func                = bpf_sk_assign,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg3_type        = ARG_ANYTHING,
};

/* Scan the TCP options in [@op, @opend) for the first option of kind
 * @search_kind.  When @magic_len is non-zero, the @magic_len bytes that
 * follow the kind and length bytes must also equal @magic.
 *
 * Returns a pointer to the matching option's kind byte, or:
 *   ERR_PTR(-ENOMSG)  no match before @opend, TCPOPT_EOL was reached (with
 *                     *@eol set to true), or an option of the requested
 *                     kind is too short to hold the magic
 *   ERR_PTR(-EFAULT)  an option's length byte is missing, below 2, or runs
 *                     past @opend
 */
static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
                                    u8 search_kind, const u8 *magic,
                                    u8 magic_len, bool *eol)
{
        const u8 *cur = op;

        *eol = false;

        while (cur < opend) {
                u8 kind = cur[0];
                u8 kind_len;

                /* NOP is a single byte without a length field. */
                if (kind == TCPOPT_NOP) {
                        cur++;
                        continue;
                }

                if (kind == TCPOPT_EOL) {
                        *eol = true;
                        return ERR_PTR(-ENOMSG);
                }

                /* Something is wrong in the received header.  Follow the
                 * TCP stack's tcp_parse_options() and just bail here.
                 */
                if (opend - cur < 2 || opend - cur < cur[1] || cur[1] < 2)
                        return ERR_PTR(-EFAULT);

                kind_len = cur[1];

                if (kind == search_kind) {
                        if (!magic_len)
                                return cur;

                        if (magic_len > kind_len - 2)
                                return ERR_PTR(-ENOMSG);

                        if (!memcmp(&cur[2], magic, magic_len))
                                return cur;
                }

                cur += kind_len;
        }

        return ERR_PTR(-ENOMSG);
}

/* Find a TCP header option and copy it into @search_res.
 *
 * On entry @search_res holds the search key: byte 0 is the option kind and
 * byte 1 a length.  For TCPOPT_EXP and kind 253 the length must be 4 or 6
 * and the bytes after it are the magic to match; for any other kind the
 * length must be 0.  With BPF_LOAD_HDR_OPT_TCP_SYN in @flags the options
 * come from bpf_sock_ops_get_syn(); otherwise from bpf_sock->skb.
 *
 * Returns the length byte of the option found (the whole option is
 * copied), -ENOSPC when the option is longer than @len (the first @len
 * bytes are still copied), -EINVAL for a malformed key or unknown flags,
 * -EPERM when no skb is available or the op is
 * BPF_SOCK_OPS_HDR_OPT_LEN_CB, or the error from the SYN fetch or the
 * option search.
 */
BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           void *, search_res, u32, len, u64, flags)
{
        const bool load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
        const u8 *search = search_res;
        const u8 *magic = NULL;
        const u8 *op, *opend;
        u8 search_kind, search_len, copy_len;
        u8 magic_len = 0;
        bool eol;
        int ret;

        /* 2 bytes is the minimal option length except for TCPOPT_NOP and
         * TCPOPT_EOL, which are useless for the bpf prog to learn and
         * which this helper refuses to load.
         */
        if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
                return -EINVAL;

        search_kind = search[0];
        search_len = search[1];

        if (search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL ||
            search_len > len)
                return -EINVAL;

        if (search_kind == TCPOPT_EXP || search_kind == 253) {
                /* 16 or 32 bit magic.  +2 for kind and kind length */
                if (search_len != 4 && search_len != 6)
                        return -EINVAL;
                magic = &search[2];
                magic_len = search_len - 2;
        } else if (search_len) {
                return -EINVAL;
        }

        if (load_syn) {
                ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
                if (ret < 0)
                        return ret;

                opend = op + ret;
                op += sizeof(struct tcphdr);
        } else {
                /* This bpf_sock->op cannot call this helper */
                if (!bpf_sock->skb ||
                    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                        return -EPERM;

                opend = bpf_sock->skb_data_end;
                op = bpf_sock->skb->data + sizeof(struct tcphdr);
        }

        op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
                                &eol);
        if (IS_ERR(op))
                return PTR_ERR(op);

        copy_len = op[1];
        if (copy_len > len) {
                /* Option does not fit: copy what fits, then report it. */
                memcpy(search_res, op, len);
                return -ENOSPC;
        }

        memcpy(search_res, op, copy_len);
        return copy_len;
}

/* Helper prototype for bpf_sock_ops_load_hdr_opt(): the arguments are the
 * program context, a memory buffer together with its size, and one
 * unconstrained scalar; the helper returns an integer.  gpl_only is false.
 */
static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
        .func                = bpf_sock_ops_load_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Append the TCP header option found in @from to the end of the option
 * space of bpf_sock->skb.
 *
 * @from starts with the option kind byte followed by the option length
 * byte.  @len is the size of the @from buffer and @flags must be 0.
 *
 * Return: 0 on success, otherwise
 *   -EPERM   bpf_sock->op is not BPF_SOCK_OPS_WRITE_HDR_OPT_CB
 *   -EINVAL  @len < 2, @flags != 0, kind is TCPOPT_NOP/TCPOPT_EOL, the
 *            option length exceeds @len, or an experimental option is
 *            shorter than 4 bytes
 *   -ENOSPC  the option does not fit in remaining_opt_len, or the
 *            existing options have already been ended
 *   -EEXIST  the search found an option of the same kind (and magic)
 * Any other error from bpf_search_tcp_opt() is returned unchanged.
 *
 * NOTE(review): an option length byte of 0 or 1 is not rejected here for
 * non-experimental kinds -- confirm this is intended.
 */
BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           const void *, from, u32, len, u64, flags)
{
        const u8 *opt = from, *magic = NULL, *dup;
        u8 kind, kind_len, magic_len = 0, *tail;
        bool experimental, eol;
        long err;

        if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
                return -EPERM;

        /* The kind and length bytes must both be readable. */
        if (flags || len < 2)
                return -EINVAL;

        kind = opt[0];
        kind_len = opt[1];

        if (kind == TCPOPT_NOP || kind == TCPOPT_EOL || kind_len > len)
                return -EINVAL;

        if (bpf_sock->remaining_opt_len < kind_len)
                return -ENOSPC;

        /* 253 is another experimental kind */
        experimental = kind == TCPOPT_EXP || kind == 253;
        if (experimental) {
                if (kind_len < 4)
                        return -EINVAL;
                /* Per RFC 6994 the magic is either 2 or 4 bytes long.
                 * Comparing only the first 2 bytes is the conservative
                 * choice, which is what a duplicate search wants.
                 */
                magic = opt + 2;
                magic_len = 2;
        }

        /* Look for an already present option of the same kind. */
        tail = bpf_sock->skb_data_end;
        dup = bpf_search_tcp_opt(bpf_sock->skb->data + sizeof(struct tcphdr),
                                 tail, kind, magic, magic_len, &eol);
        if (!IS_ERR(dup))
                return -EEXIST;

        err = PTR_ERR(dup);
        if (err != -ENOMSG)
                return err;

        /* The options have been ended: nothing more can be written. */
        if (eol)
                return -ENOSPC;

        /* No duplicate: copy the option in and account for its size. */
        memcpy(tail, from, kind_len);
        bpf_sock->skb_data_end += kind_len;
        bpf_sock->remaining_opt_len -= kind_len;

        return 0;
}

/* Helper prototype for bpf_sock_ops_store_hdr_opt(): context pointer,
 * source memory buffer with its size, and the flags scalar; returns an
 * integer.  gpl_only is false.
 */
static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
        .func                = bpf_sock_ops_store_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Deduct @len bytes from bpf_sock->remaining_opt_len.
 *
 * Return: 0 on success, otherwise
 *   -EPERM   bpf_sock->op is not BPF_SOCK_OPS_HDR_OPT_LEN_CB
 *   -EINVAL  @flags != 0 or @len < 2
 *   -ENOSPC  @len is larger than remaining_opt_len
 */
BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           u32, len, u64, flags)
{
        int err = 0;

        if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                err = -EPERM;
        else if (len < 2 || flags)
                err = -EINVAL;
        else if (bpf_sock->remaining_opt_len < len)
                err = -ENOSPC;
        else
                bpf_sock->remaining_opt_len -= len;

        return err;
}

/* Helper prototype for bpf_sock_ops_reserve_hdr_opt(): context pointer
 * followed by two unconstrained scalars (len and flags); returns an
 * integer.  gpl_only is false.
 */
static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
        .func                = bpf_sock_ops_reserve_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

#endif /* CONFIG_INET */

/* Tell whether @func is one of the helpers enumerated below, i.e. the set
 * this file treats as changing packet data.
 *
 * Return: true when @func matches one of the listed helpers, false
 * otherwise.  Some entries are only compiled in under their config option.
 */
bool bpf_helper_changes_pkt_data(void *func)
{
        return func == bpf_skb_vlan_push ||
               func == bpf_skb_vlan_pop ||
               func == bpf_skb_store_bytes ||
               func == bpf_skb_change_proto ||
               func == bpf_skb_change_head ||
               func == sk_skb_change_head ||
               func == bpf_skb_change_tail ||
               func == sk_skb_change_tail ||
               func == bpf_skb_adjust_room ||
               func == sk_skb_adjust_room ||
               func == bpf_skb_pull_data ||
               func == sk_skb_pull_data ||
               func == bpf_clone_redirect ||
               func == bpf_l3_csum_replace ||
               func == bpf_l4_csum_replace ||
               func == bpf_xdp_adjust_head ||
               func == bpf_xdp_adjust_meta ||
               func == bpf_msg_pull_data ||
               func == bpf_msg_push_data ||
               func == bpf_msg_pop_data ||
               func == bpf_xdp_adjust_tail ||
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
               func == bpf_lwt_seg6_store_bytes ||
               func == bpf_lwt_seg6_adjust_srh ||
               func == bpf_lwt_seg6_action ||
#endif
#ifdef CONFIG_INET
               func == bpf_sock_ops_store_hdr_opt ||
#endif
               func == bpf_lwt_in_push_encap ||
               func == bpf_lwt_xmit_push_encap;
}

/* Weak definitions of helper prototypes referenced by the lookup tables in
 * this file.  NOTE(review): presumably overridden by strong definitions
 * where these helpers are implemented -- confirm.
 */
const struct bpf_func_proto bpf_event_output_data_proto __weak;
const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The cgroup id and classid helpers are only available when the matching
 * config option is enabled.  Ids not listed here are delegated to
 * bpf_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        /* inet and inet6 sockets are created in a process
         * context so there is always a valid uid/gid
         */
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_get_current_pid_tgid:
                return &bpf_get_current_pid_tgid_proto;
        case BPF_FUNC_get_current_comm:
                return &bpf_get_current_comm_proto;
#ifdef CONFIG_CGROUPS
        case BPF_FUNC_get_current_cgroup_id:
                return &bpf_get_current_cgroup_id_proto;
        case BPF_FUNC_get_current_ancestor_cgroup_id:
                return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_curr_proto;
#endif
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_cg_sock_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
}

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * bind, setsockopt and getsockopt are only handed out when
 * prog->expected_attach_type is BPF_CGROUP_INET4_CONNECT or
 * BPF_CGROUP_INET6_CONNECT; for any other attach type NULL is returned
 * for those ids.  Several entries depend on config options.  Ids not
 * listed here are delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        /* inet and inet6 sockets are created in a process
         * context so there is always a valid uid/gid
         */
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_bind:
                /* Only offered to the connect attach types. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_bind_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_addr_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_addr_proto;
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_get_current_pid_tgid:
                return &bpf_get_current_pid_tgid_proto;
        case BPF_FUNC_get_current_comm:
                return &bpf_get_current_comm_proto;
#ifdef CONFIG_CGROUPS
        case BPF_FUNC_get_current_cgroup_id:
                return &bpf_get_current_cgroup_id_proto;
        case BPF_FUNC_get_current_ancestor_cgroup_id:
                return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_curr_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sock_addr_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sock_addr_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                /* Only offered to the connect attach types. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_getsockopt:
                /* Only offered to the connect attach types. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
                }
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Resolve @func_id to the helper prototype exposed through this table.
 * Ids not listed here are delegated to bpf_sk_base_func_proto().
 * @prog is not used.
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes_relative:
                proto = &bpf_skb_load_bytes_relative_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_proto;
                break;
        case BPF_FUNC_get_socket_uid:
                proto = &bpf_get_socket_uid_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id);
                break;
        }

        return proto;
}

/* Weak definitions of the sk_storage helper prototypes referenced by the
 * lookup tables in this file.  NOTE(review): presumably overridden by
 * strong definitions where these helpers are implemented -- confirm.
 */
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The cgroup id helpers require CONFIG_SOCK_CGROUP_DATA and the socket
 * lookup / TCP helpers require CONFIG_INET.  Ids not listed here fall
 * back to sk_filter_func_proto().
 */
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
        case BPF_FUNC_sk_cgroup_id:
                return &bpf_sk_cgroup_id_proto;
        case BPF_FUNC_sk_ancestor_cgroup_id:
                return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
#endif
        default:
                return sk_filter_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The two tunnel "set" helpers are resolved through
 * bpf_get_skb_set_tunnel_proto() instead of a fixed prototype.  Entries
 * guarded by CONFIG_XFRM, CONFIG_CGROUP_NET_CLASSID,
 * CONFIG_SOCK_CGROUP_DATA and CONFIG_INET exist only when that option is
 * enabled.  Ids not listed here are delegated to
 * bpf_sk_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_pull_data:
                return &bpf_skb_pull_data_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_csum_update:
                return &bpf_csum_update_proto;
        case BPF_FUNC_csum_level:
                return &bpf_csum_level_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace:
                return &bpf_l4_csum_replace_proto;
        case BPF_FUNC_clone_redirect:
                return &bpf_clone_redirect_proto;
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_proto;
        case BPF_FUNC_skb_vlan_push:
                return &bpf_skb_vlan_push_proto;
        case BPF_FUNC_skb_vlan_pop:
                return &bpf_skb_vlan_pop_proto;
        case BPF_FUNC_skb_change_proto:
                return &bpf_skb_change_proto_proto;
        case BPF_FUNC_skb_change_type:
                return &bpf_skb_change_type_proto;
        case BPF_FUNC_skb_adjust_room:
                return &bpf_skb_adjust_room_proto;
        case BPF_FUNC_skb_change_tail:
                return &bpf_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &bpf_skb_change_head_proto;
        case BPF_FUNC_skb_get_tunnel_key:
                return &bpf_skb_get_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_key:
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_skb_get_tunnel_opt:
                return &bpf_skb_get_tunnel_opt_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_redirect:
                return &bpf_redirect_proto;
        case BPF_FUNC_redirect_neigh:
                return &bpf_redirect_neigh_proto;
        case BPF_FUNC_redirect_peer:
                return &bpf_redirect_peer_proto;
        case BPF_FUNC_get_route_realm:
                return &bpf_get_route_realm_proto;
        case BPF_FUNC_get_hash_recalc:
                return &bpf_get_hash_recalc_proto;
        case BPF_FUNC_set_hash_invalid:
                return &bpf_set_hash_invalid_proto;
        case BPF_FUNC_set_hash:
                return &bpf_set_hash_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_skb_under_cgroup:
                return &bpf_skb_under_cgroup_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
        case BPF_FUNC_skb_get_xfrm_state:
                return &bpf_skb_get_xfrm_state_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_skb_cgroup_classid:
                return &bpf_skb_cgroup_classid_proto;
#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
        case BPF_FUNC_sk_assign:
                return &bpf_sk_assign_proto;
#endif
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The socket lookup and syncookie helpers require CONFIG_INET.  Ids not
 * listed here are delegated to bpf_sk_base_func_proto().  @prog is not
 * used.
 */
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_xdp_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_xdp_adjust_head:
                return &bpf_xdp_adjust_head_proto;
        case BPF_FUNC_xdp_adjust_meta:
                return &bpf_xdp_adjust_meta_proto;
        case BPF_FUNC_redirect:
                return &bpf_xdp_redirect_proto;
        case BPF_FUNC_redirect_map:
                return &bpf_xdp_redirect_map_proto;
        case BPF_FUNC_xdp_adjust_tail:
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_xdp_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_xdp_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
#endif
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Weak definitions of the sock map/hash update helper prototypes used by
 * sock_ops_func_proto().  NOTE(review): presumably overridden by strong
 * definitions where these helpers are implemented -- confirm.
 */
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The TCP header option helpers (load/store/reserve) and tcp_sock are
 * only available with CONFIG_INET.  Ids not listed here are delegated to
 * bpf_sk_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_setsockopt:
                return &bpf_sock_ops_setsockopt_proto;
        case BPF_FUNC_getsockopt:
                return &bpf_sock_ops_getsockopt_proto;
        case BPF_FUNC_sock_ops_cb_flags_set:
                return &bpf_sock_ops_cb_flags_set_proto;
        case BPF_FUNC_sock_map_update:
                return &bpf_sock_map_update_proto;
        case BPF_FUNC_sock_hash_update:
                return &bpf_sock_hash_update_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_ops_proto;
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_load_hdr_opt:
                return &bpf_sock_ops_load_hdr_opt_proto;
        case BPF_FUNC_store_hdr_opt:
                return &bpf_sock_ops_store_hdr_opt_proto;
        case BPF_FUNC_reserve_hdr_opt:
                return &bpf_sock_ops_reserve_hdr_opt_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Weak definitions of the msg redirect helper prototypes used by
 * sk_msg_func_proto().  NOTE(review): presumably overridden by strong
 * definitions where these helpers are implemented -- confirm.
 */
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The cgroup id and classid helpers are only available when the matching
 * config option is enabled.  Ids not listed here are delegated to
 * bpf_sk_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_msg_redirect_map:
                return &bpf_msg_redirect_map_proto;
        case BPF_FUNC_msg_redirect_hash:
                return &bpf_msg_redirect_hash_proto;
        case BPF_FUNC_msg_apply_bytes:
                return &bpf_msg_apply_bytes_proto;
        case BPF_FUNC_msg_cork_bytes:
                return &bpf_msg_cork_bytes_proto;
        case BPF_FUNC_msg_pull_data:
                return &bpf_msg_pull_data_proto;
        case BPF_FUNC_msg_push_data:
                return &bpf_msg_push_data_proto;
        case BPF_FUNC_msg_pop_data:
                return &bpf_msg_pop_data_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_get_current_pid_tgid:
                return &bpf_get_current_pid_tgid_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_CGROUPS
        case BPF_FUNC_get_current_cgroup_id:
                return &bpf_get_current_cgroup_id_proto;
        case BPF_FUNC_get_current_ancestor_cgroup_id:
                return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_curr_proto;
#endif
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Weak definitions of the sk redirect helper prototypes used by
 * sk_skb_func_proto().  NOTE(review): presumably overridden by strong
 * definitions where these helpers are implemented -- confirm.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * pull_data, change_tail, change_head and adjust_room map to the sk_skb_*
 * prototypes here rather than the bpf_skb_* ones.  The socket lookup
 * helpers require CONFIG_INET.  Ids not listed here are delegated to
 * bpf_sk_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_pull_data:
                return &sk_skb_pull_data_proto;
        case BPF_FUNC_skb_change_tail:
                return &sk_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &sk_skb_change_head_proto;
        case BPF_FUNC_skb_adjust_room:
                return &sk_skb_adjust_room_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_sk_redirect_map:
                return &bpf_sk_redirect_map_proto;
        case BPF_FUNC_sk_redirect_hash:
                return &bpf_sk_redirect_hash_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
#endif
        default:
                return bpf_sk_base_func_proto(func_id);
        }
}

/* Resolve @func_id to a helper prototype: skb_load_bytes gets the
 * flow-dissector specific prototype, every other id is delegated to
 * bpf_sk_base_func_proto().  @prog is not used.
 */
static const struct bpf_func_proto *
flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_skb_load_bytes)
                return &bpf_flow_dissector_load_bytes_proto;

        return bpf_sk_base_func_proto(func_id);
}

/* Resolve @func_id to the helper prototype exposed through this table.
 * Ids not listed here are delegated to bpf_sk_base_func_proto().
 * @prog is not used.
 */
static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &bpf_skb_pull_data_proto;
                break;
        case BPF_FUNC_csum_diff:
                proto = &bpf_csum_diff_proto;
                break;
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_proto;
                break;
        case BPF_FUNC_get_route_realm:
                proto = &bpf_get_route_realm_proto;
                break;
        case BPF_FUNC_get_hash_recalc:
                proto = &bpf_get_hash_recalc_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
        case BPF_FUNC_get_smp_processor_id:
                proto = &bpf_get_smp_processor_id_proto;
                break;
        case BPF_FUNC_skb_under_cgroup:
                proto = &bpf_skb_under_cgroup_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id);
                break;
        }

        return proto;
}

/* Resolve @func_id to a helper prototype: lwt_push_encap gets the "in"
 * variant of the prototype, every other id is delegated to
 * lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_lwt_push_encap)
                return &bpf_lwt_in_push_encap_proto;

        return lwt_out_func_proto(func_id, prog);
}

/* Resolve @func_id to the helper prototype exposed through this table.
 *
 * The two tunnel "set" helpers are resolved through
 * bpf_get_skb_set_tunnel_proto(), and lwt_push_encap gets the "xmit"
 * variant of the prototype.  Ids not listed here fall back to
 * lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_get_tunnel_key:
                return &bpf_skb_get_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_key:
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_skb_get_tunnel_opt:
                return &bpf_skb_get_tunnel_opt_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_redirect:
                return &bpf_redirect_proto;
        case BPF_FUNC_clone_redirect:
                return &bpf_clone_redirect_proto;
        case BPF_FUNC_skb_change_tail:
                return &bpf_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &bpf_skb_change_head_proto;
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_csum_update:
                return &bpf_csum_update_proto;
        case BPF_FUNC_csum_level:
                return &bpf_csum_level_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace:
                return &bpf_l4_csum_replace_proto;
        case BPF_FUNC_set_hash_invalid:
                return &bpf_set_hash_invalid_proto;
        case BPF_FUNC_lwt_push_encap:
                return &bpf_lwt_xmit_push_encap_proto;
        default:
                return lwt_out_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to a helper prototype.  The three lwt_seg6 helpers are
 * only offered when CONFIG_IPV6_SEG6_BPF is enabled; every other id (and
 * all ids when it is disabled) is delegated to lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        switch (func_id) {
        case BPF_FUNC_lwt_seg6_store_bytes:
                return &bpf_lwt_seg6_store_bytes_proto;
        case BPF_FUNC_lwt_seg6_action:
                return &bpf_lwt_seg6_action_proto;
        case BPF_FUNC_lwt_seg6_adjust_srh:
                return &bpf_lwt_seg6_adjust_srh_proto;
        default:
                break;
        }
#endif

        return lwt_out_func_proto(func_id, prog);
}

/* Common validity check for an access of @size bytes at offset @off into
 * struct __sk_buff.
 *
 * The access must start inside the structure and @off must be a multiple
 * of @size.  Beyond that:
 *   - cb[0]..cb[4]: any size that does not run past the end of cb[4];
 *   - the ip4/ip6 address fields, data, data_meta, data_end: exactly
 *     sizeof(__u32);
 *   - flow_keys: never allowed here;
 *   - tstamp: exactly sizeof(__u64);
 *   - sk: reads of sizeof(__u64) only, and info->reg_type is set to
 *     PTR_TO_SOCK_COMMON_OR_NULL;
 *   - anything else: writes must be sizeof(__u32); reads record the
 *     default field size in @info and are checked with
 *     bpf_ctx_narrow_access_ok().
 *
 * Return: true if the access is allowed, false otherwise.  @prog is not
 * used.
 */
static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        const int word = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        /* The verifier guarantees that size > 0, so the modulo is safe. */
        if (off % size)
                return false;

        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                return off + size <= offsetofend(struct __sk_buff, cb[4]);
        case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
        case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
                return size == word;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                return false;
        case bpf_ctx_range(struct __sk_buff, tstamp):
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct __sk_buff, sk):
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                return true;
        default:
                break;
        }

        /* Narrow access is only allowed for reads for now. */
        if (type == BPF_WRITE)
                return size == word;

        bpf_ctx_record_field_size(info, word);
        return bpf_ctx_narrow_access_ok(off, size, word);
}

static bool sk_filter_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                return false;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* cg_skb flavour of the __sk_buff context access check.
 *
 * tc_classid, data_meta and wire_len are never accessible, and data /
 * data_end are only accessible when bpf_capable() returns true.  Stores
 * are limited to mark, priority and cb[0]..cb[4], plus tstamp when
 * bpf_capable() returns true.  Accesses that survive are finally vetted
 * by bpf_skb_is_valid_access().
 */
static bool cg_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        const bool is_store = (type == BPF_WRITE);

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (!bpf_capable())
                        return false;
                break;
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                return false;
        default:
                break;
        }

        if (is_store) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, tstamp):
                        if (!bpf_capable())
                                return false;
                        break;
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                        break;
                default:
                        return false;
                }
        }

        /* Record the register type for the two packet pointers. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* lwt flavour of the __sk_buff context access check.
 *
 * tc_classid, family..local_port, data_meta, tstamp and wire_len are
 * rejected outright; stores are accepted only for mark, priority and
 * cb[0]..cb[4].  The remaining accesses are decided by
 * bpf_skb_is_valid_access().
 */
static bool lwt_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        const bool is_store = (type == BPF_WRITE);

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        default:
                break;
        }

        /* Anything outside the writable set is load-only. */
        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
        case bpf_ctx_range(struct __sk_buff, mark):
        case bpf_ctx_range(struct __sk_buff, priority):
                break;
        default:
                if (is_store)
                        return false;
                break;
        }

        /* Record the register type for the two packet pointers. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Attach type specific accesses.
 *
 * bound_dev_if, mark and priority get full (load and store) access for the
 * INET_SOCK_CREATE / INET_SOCK_RELEASE attach types and no access for any
 * other attach type.  src_ip4, src_ip6[] and src_port are load-only and
 * only for the matching POST_BIND attach types.  Every other offset is
 * load-only regardless of the attach type.
 */
static bool __sock_filter_check_attach_type(int off,
                                            enum bpf_access_type access_type,
                                            enum bpf_attach_type attach_type)
{
        switch (off) {
        case offsetof(struct bpf_sock, bound_dev_if):
        case offsetof(struct bpf_sock, mark):
        case offsetof(struct bpf_sock, priority):
                /* Full access or nothing at all. */
                return attach_type == BPF_CGROUP_INET_SOCK_CREATE ||
                       attach_type == BPF_CGROUP_INET_SOCK_RELEASE;
        case bpf_ctx_range(struct bpf_sock, src_ip4):
                if (attach_type != BPF_CGROUP_INET4_POST_BIND)
                        return false;
                break;
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
                if (attach_type != BPF_CGROUP_INET6_POST_BIND)
                        return false;
                break;
        case bpf_ctx_range(struct bpf_sock, src_port):
                if (attach_type != BPF_CGROUP_INET4_POST_BIND &&
                    attach_type != BPF_CGROUP_INET6_POST_BIND)
                        return false;
                break;
        default:
                break;
        }

        /* Whatever got this far is load-only. */
        return access_type == BPF_READ;
}

bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock, type, priority):
                return false;
        default:
                return bpf_sock_is_valid_access(off, size, type, info);
        }
}

/* Validate an access of @size bytes at offset @off into struct bpf_sock.
 *
 * The offset has to lie inside the structure and be a multiple of @size.
 * The fields listed in the first case group allow narrow access relative
 * to a 4-byte field.  dst_port is treated as a 4-byte field for a 4-byte
 * access and as its declared size otherwise.  The bytes between the end of
 * dst_port and dst_ip4 are not accessible.  Any other offset requires
 * exactly a 4-byte access.
 */
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);
        int field_size;

        if (off < 0)
                return false;
        if (off >= sizeof(struct bpf_sock))
                return false;
        if (off % size != 0)
                return false;

        switch (off) {
        case offsetof(struct bpf_sock, state):
        case offsetof(struct bpf_sock, family):
        case offsetof(struct bpf_sock, type):
        case offsetof(struct bpf_sock, protocol):
        case offsetof(struct bpf_sock, src_port):
        case offsetof(struct bpf_sock, rx_queue_mapping):
        case bpf_ctx_range(struct bpf_sock, src_ip4):
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock, dst_ip4):
        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
                field_size = size_default;
                break;
        case bpf_ctx_range(struct bpf_sock, dst_port):
                if (size == size_default)
                        field_size = size_default;
                else
                        field_size = sizeof_field(struct bpf_sock, dst_port);
                break;
        case offsetofend(struct bpf_sock, dst_port) ...
             offsetof(struct bpf_sock, dst_ip4) - 1:
                return false;
        default:
                return size == size_default;
        }

        /* Narrow-capable field: record its size, then check the access. */
        bpf_ctx_record_field_size(info, field_size);
        return bpf_ctx_narrow_access_ok(off, size, field_size);
}

/* An access is valid when bpf_sock_is_valid_access() accepts it and the
 * program's expected attach type permits it.  The attach type check is
 * only evaluated when the first check passed.
 */
static bool sock_filter_is_valid_access(int off, int size,
                                        enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        return bpf_sock_is_valid_access(off, size, type, info) &&
               __sock_filter_check_attach_type(off, type,
                                               prog->expected_attach_type);
}

/* Prologue generator that emits nothing: neither direct read nor direct
 * write requires any preliminary action, so zero instructions are
 * reported and @insn_buf is left untouched.
 */
static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
                             const struct bpf_prog *prog)
{
        const int nr_insns = 0;

        return nr_insns;
}

/* Emit a program prologue into @insn_buf that deals with a possibly cloned
 * skb before the program body runs.
 *
 * Nothing is emitted (and 0 is returned) unless @direct_write is set.
 * Otherwise the generated code tests the cloned bit of the skb in R1; when
 * it is set, bpf_skb_pull_data(skb, 0) is called and, should that return
 * non-zero, the program exits with @drop_verdict in R0.  The last emitted
 * instruction is a copy of the program's first instruction
 * (prog->insnsi[0]).
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
                                const struct bpf_prog *prog, int drop_verdict)
{
        struct bpf_insn *insn = insn_buf;

        if (!direct_write)
                return 0;

        /* if (!skb->cloned)
         *       goto start;
         *
         * (Fast-path, otherwise approximation that we might be
         *  a clone, do the rest in helper.)
         *
         * The jump offset of 7 skips the seven instructions between this
         * test and the "start" label below.
         */
        *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

        /* ret = bpf_skb_pull_data(skb, 0);
         *
         * R1 is copied to R6 before the call and copied back at
         * "restore"; R2 is zeroed to pass 0 as the second argument.
         */
        *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
        *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
        *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
                               BPF_FUNC_skb_pull_data);
        /* if (!ret)
         *      goto restore;
         * return drop_verdict;
         *
         * The jump offset of 2 skips the two-instruction exit path.
         */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
        *insn++ = BPF_EXIT_INSN();

        /* restore: */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
        /* start: */
        *insn++ = prog->insnsi[0];

        return insn - insn_buf;
}

/* Expand the load instruction @orig (BPF_IND mode or otherwise) into a
 * helper-call sequence written to @insn_buf.
 *
 * The sequence sets R2 to the offset (orig->imm, or orig->src_reg plus
 * orig->imm in BPF_IND mode), sets R1 to BPF_REG_CTX, calls the
 * bpf_skb_load_helper_{8,16,32}_no_cache helper selected by the access
 * size, and exits the program with R0 = 0 when the helper's result in R0
 * is negative.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_gen_ld_abs(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf)
{
        bool indirect = BPF_MODE(orig->code) == BPF_IND;
        struct bpf_insn *insn = insn_buf;

        /* R2 = offset to load from. */
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
        } else {
                *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
                /* The add is only emitted for a non-zero immediate. */
                if (orig->imm)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
        }
        /* We're guaranteed here that CTX is in R6. */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);

        /* No call is emitted for a size other than BPF_B/BPF_H/BPF_W. */
        switch (BPF_SIZE(orig->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
                break;
        }

        /* if (R0 >= 0)		(signed compare)
         *      goto next;	(skips the two instructions below)
         * R0 = 0;
         * exit;
         */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
        *insn++ = BPF_EXIT_INSN();

        return insn - insn_buf;
}

/* tc_cls_act prologue: the shared unclone prologue with TC_ACT_SHOT as
 * the drop verdict.
 */
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
                               const struct bpf_prog *prog)
{
        const int drop_verdict = TC_ACT_SHOT;

        return bpf_unclone_prologue(insn_buf, direct_write, prog,
                                    drop_verdict);
}

/* tc_cls_act flavour of the __sk_buff context access check.
 *
 * family..local_port is never accessible.  Stores are accepted only for
 * mark, tc_index, priority, tc_classid, cb[0]..cb[4], tstamp and
 * queue_mapping.  data, data_meta and data_end get their packet register
 * types recorded before bpf_skb_is_valid_access() makes the final call.
 */
static bool tc_cls_act_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const bool is_store = (type == BPF_WRITE);

        /* Rejected for loads and stores alike. */
        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        default:
                break;
        }

        /* Anything outside the writable set is load-only. */
        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
        case bpf_ctx_range(struct __sk_buff, mark):
        case bpf_ctx_range(struct __sk_buff, priority):
        case bpf_ctx_range(struct __sk_buff, queue_mapping):
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, tc_index):
        case bpf_ctx_range(struct __sk_buff, tstamp):
                break;
        default:
                if (is_store)
                        return false;
                break;
        }

        /* Record the register type for the three packet pointers. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_meta):
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Basic shape check for an xdp_md access: the offset must lie inside the
 * structure and be a multiple of @size, and @size must be exactly 4 bytes.
 */
static bool __is_valid_xdp_access(int off, int size)
{
        return off >= 0 &&
               off < sizeof(struct xdp_md) &&
               off % size == 0 &&
               size == sizeof(__u32);
}

/* Context access check for struct xdp_md.
 *
 * egress_ifindex is only accessible when the expected attach type is
 * BPF_XDP_DEVMAP.  The only store that can be accepted is one to
 * rx_queue_index from a device-bound program.  For loads, data, data_meta
 * and data_end get their packet register types recorded.  The size and
 * alignment rules come from __is_valid_xdp_access().
 */
static bool xdp_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        if (prog->expected_attach_type != BPF_XDP_DEVMAP &&
            off == offsetof(struct xdp_md, egress_ifindex))
                return false;

        if (type == BPF_WRITE) {
                if (bpf_prog_is_dev_bound(prog->aux) &&
                    off == offsetof(struct xdp_md, rx_queue_index))
                        return __is_valid_xdp_access(off, size);

                return false;
        }

        switch (off) {
        case offsetof(struct xdp_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case offsetof(struct xdp_md, data_meta):
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case offsetof(struct xdp_md, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return __is_valid_xdp_access(off, size);
}

/* Emit a one-time warning about the XDP return value @act.  Values above
 * XDP_REDIRECT are reported as "Illegal", all others as "Driver
 * unsupported".
 */
void bpf_warn_invalid_xdp_action(u32 act)
{
        const u32 act_max = XDP_REDIRECT;
        const char *reason;

        if (act > act_max)
                reason = "Illegal";
        else
                reason = "Driver unsupported";

        pr_warn_once("%s XDP return value %u, expect packet loss!\n",
                     reason, act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);

/* Context access check for struct bpf_sock_addr.
 *
 * The offset must lie inside the structure and be a multiple of @size.
 * The address fields are additionally tied to the program's expected
 * attach type.  Address and port fields accept 8-byte wide accesses into
 * user_ip6 / msg_src_ip6, narrow loads and 4-byte stores.  sk is an
 * 8-byte load-only pointer, and user_family, family, type and protocol
 * are 4-byte load-only.  Any other offset is rejected.
 */
static bool sock_addr_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);
        const bool is_load = (type == BPF_READ);

        if (off < 0)
                return false;
        if (off >= sizeof(struct bpf_sock_addr))
                return false;
        if (off % size != 0)
                return false;

        /* Disallow access to IPv6 fields from an IPv4 context and vice
         * versa.
         */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
                if (prog->expected_attach_type != BPF_CGROUP_UDP4_SENDMSG)
                        return false;
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                if (prog->expected_attach_type != BPF_CGROUP_UDP6_SENDMSG)
                        return false;
                break;
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        default:
                break;
        }

        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, user_port):
                /* The field size is recorded for loads only, and before
                 * the wide-access shortcuts are evaluated.
                 */
                if (is_load)
                        bpf_ctx_record_field_size(info, size_default);

                if (bpf_ctx_wide_access_ok(off, size,
                                           struct bpf_sock_addr,
                                           user_ip6))
                        return true;

                if (bpf_ctx_wide_access_ok(off, size,
                                           struct bpf_sock_addr,
                                           msg_src_ip6))
                        return true;

                /* Loads may be narrow, stores must be exactly 4 bytes. */
                if (is_load)
                        return bpf_ctx_narrow_access_ok(off, size,
                                                        size_default);

                return size == size_default;
        case bpf_ctx_range_ptr(struct bpf_sock_addr, sk):
                if (!is_load || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case bpf_ctx_range(struct bpf_sock_addr, user_family):
        case bpf_ctx_range(struct bpf_sock_addr, family):
        case bpf_ctx_range(struct bpf_sock_addr, type):
        case bpf_ctx_range(struct bpf_sock_addr, protocol):
                return is_load && size == size_default;
        default:
                return false;
        }
}

/* Context access check for struct bpf_sock_ops.
 *
 * The offset must lie inside the structure and be a multiple of @size.
 * Stores are accepted only as 4-byte writes to reply and sk_txhash.  For
 * other access types, bytes_received..bytes_acked and the sk, skb_data and
 * skb_data_end pointers require 8-byte accesses (the pointers also get a
 * register type recorded), skb_tcp_flags allows narrow access, and every
 * other offset requires a 4-byte access.
 */
static bool sock_ops_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     const struct bpf_prog *prog,
                                     struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0)
                return false;
        if (off >= sizeof(struct bpf_sock_ops))
                return false;

        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sock_ops, reply):
                case offsetof(struct bpf_sock_ops, sk_txhash):
                        return size == size_default;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
                                bytes_acked):
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct bpf_sock_ops, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return true;
        case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                return true;
        case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                return true;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size,
                                                size_default);
        default:
                return size == size_default;
        }
}

/* sk_skb prologue: the shared unclone prologue with SK_DROP as the drop
 * verdict.
 */
static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
                           const struct bpf_prog *prog)
{
        const int drop_verdict = SK_DROP;

        return bpf_unclone_prologue(insn_buf, direct_write, prog,
                                    drop_verdict);
}

/* sk_skb flavour of the __sk_buff context access check.
 *
 * tc_classid, data_meta, tstamp, wire_len and mark are never accessible.
 * Stores are accepted only for tc_index and priority.  data and data_end
 * get their packet register types recorded before
 * bpf_skb_is_valid_access() makes the final call.
 */
static bool sk_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        const bool is_store = (type == BPF_WRITE);

        /* Rejected for loads and stores alike. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, mark):
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                return false;
        default:
                break;
        }

        /* Anything outside the writable set is load-only. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, priority):
        case bpf_ctx_range(struct __sk_buff, tc_index):
                break;
        default:
                if (is_store)
                        return false;
                break;
        }

        /* Record the register type for the two packet pointers. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Context access check for struct sk_msg_md.
 *
 * Stores are never allowed and the offset must be a multiple of @size.
 * data, data_end and sk are 8-byte pointer loads with a register type
 * recorded; the listed scalar fields are 4-byte loads.  Any other offset
 * is rejected.
 */
static bool sk_msg_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE)
                return false;

        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range_ptr(struct sk_msg_md, data):
                /* The register type is recorded before the size check. */
                info->reg_type = PTR_TO_PACKET;
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct sk_msg_md, data_end):
                /* Likewise recorded before the size check. */
                info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct sk_msg_md, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case bpf_ctx_range(struct sk_msg_md, family):
        case bpf_ctx_range(struct sk_msg_md, remote_ip4):
        case bpf_ctx_range(struct sk_msg_md, local_ip4):
        case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct sk_msg_md, remote_port):
        case bpf_ctx_range(struct sk_msg_md, local_port):
        case bpf_ctx_range(struct sk_msg_md, size):
                return size == sizeof(__u32);
        default:
                return false;
        }
}

/* flow_dissector flavour of the __sk_buff context access check.
 *
 * Only loads are allowed, and only of three fields: data and data_end as
 * 4-byte accesses and flow_keys as an 8-byte access.  Each accepted load
 * gets its register type recorded.  The offset must also lie inside
 * struct __sk_buff and be a multiple of @size.
 */
static bool flow_dissector_is_valid_access(int off, int size,
                                           enum bpf_access_type type,
                                           const struct bpf_prog *prog,
                                           struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0)
                return false;
        if (off >= sizeof(struct __sk_buff))
                return false;

        if (off % size != 0)
                return false;

        if (type == BPF_WRITE)
                return false;

        switch (off) {
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_FLOW_KEYS;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (size != size_default)
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                if (size != size_default)
                        return false;
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                return false;
        }

        return true;
}

/* Rewrite a load of __sk_buff data, data_end or flow_keys into a load of
 * the same-named member of struct bpf_flow_dissector, using that member's
 * size and offset.  No instruction is emitted for any other offset.
 *
 * Returns the number of instructions written to @insn_buf (0 or 1).
 */
static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
                                             const struct bpf_insn *si,
                                             struct bpf_insn *insn_buf,
                                             struct bpf_prog *prog,
                                             u32 *target_size)
{
        u32 emitted = 0;

        switch (si->off) {
        case offsetof(struct __sk_buff, flow_keys):
                insn_buf[emitted++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
                                    si->dst_reg, si->src_reg,
                                    offsetof(struct bpf_flow_dissector, flow_keys));
                break;

        case offsetof(struct __sk_buff, data_end):
                insn_buf[emitted++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
                                    si->dst_reg, si->src_reg,
                                    offsetof(struct bpf_flow_dissector, data_end));
                break;

        case offsetof(struct __sk_buff, data):
                insn_buf[emitted++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
                                    si->dst_reg, si->src_reg,
                                    offsetof(struct bpf_flow_dissector, data));
                break;

        default:
                break;
        }

        return emitted;
}

/* Emit instructions at @insn that compute si->dst_reg = skb_shinfo(SKB),
 * with the skb pointer taken from si->src_reg.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
                                                  struct bpf_insn *insn)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        /* AX = skb->end; dst_reg = skb->head; dst_reg += AX */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, end));
        insn[1] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
                              si->dst_reg, si->src_reg,
                              offsetof(struct sk_buff, head));
        insn[2] = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);

        return insn + 3;
#else
        /* dst_reg = skb->end */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              si->dst_reg, si->src_reg,
                              offsetof(struct sk_buff, end));

        return insn + 1;
#endif
}

static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct __sk_buff, len):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, len, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, protocol, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, vlan_proto):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_proto, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, priority):
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, priority, 4,
                                                             target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, priority, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, skb_iif, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, ifindex):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, dev));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct net_device, ifindex, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, hash):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, hash, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, mark):
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, mark, 4,
                                                             target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, mark, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, pkt_type):
                *target_size = 1;
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                                      PKT_TYPE_OFFSET());
                *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
                break;

        case offsetof(struct __sk_buff, queue_mapping):
                if (type == BPF_WRITE) {
                        *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             queue_mapping,
                                                             2, target_size));
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             queue_mapping,
                                                             2, target_size));
                }
                break;

        case offsetof(struct __sk_buff, vlan_present):
                *target_size = 1;
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                                      PKT_VLAN_PRESENT_OFFSET());
                if (PKT_VLAN_PRESENT_BIT)
                        *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
                if (PKT_VLAN_PRESENT_BIT < 7)
                        *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
                break;

        case offsetof(struct __sk_buff, vlan_tci):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_tci, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct qdisc_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;
                off  = si->off;
                off -= offsetof(struct __sk_buff, cb[0]);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, data);
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_classid):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);

                off  = si->off;
                off -= offsetof(struct __sk_buff, tc_classid);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, tc_classid);
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
                                              si->src_reg, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, data));
                break;

        case offsetof(struct __sk_buff, data_meta):
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_meta);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_meta);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data_end):
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_end);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_end);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, tc_index, 2,
                                                             target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, tc_index, 2,
                                                             target_size));
#else
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
                else
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, napi_id, 4,
                                                     target_size));
                *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
                *target_size = 4;
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_family,
                                                     2, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_daddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_rcv_saddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip6[0]) ...
             offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, remote_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, local_ip6[0]) ...
             offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, local_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_dport,
                                                     2, target_size));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct __sk_buff, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_num, 2, target_size));
                break;

        case offsetof(struct __sk_buff, tstamp):
                BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);

                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_DW,
                                              si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             tstamp, 8,
                                                             target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_DW,
                                              si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             tstamp, 8,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, gso_segs):
                insn = bpf_convert_shinfo_access(si, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_segs, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, gso_size):
                insn = bpf_convert_shinfo_access(si, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_size, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, wire_len):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, wire_len);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, pkt_len);
                *target_size = 4;
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                break;
        }

        return insn - insn_buf;
}

/* Rewrite a BPF program's access to a struct bpf_sock field into
 * instruction(s) that operate on the kernel socket structure instead
 * (struct sock / struct sock_common offsets).
 *
 * @type:        BPF_WRITE emits a store for the fields that handle writes
 *               here (bound_dev_if, mark, priority); any other value emits
 *               a load. All remaining fields only ever emit loads.
 * @si:          the original instruction. si->off selects the bpf_sock
 *               field; si->dst_reg, si->src_reg and (for the address
 *               fields) the access size in si->code are reused as-is.
 * @insn_buf:    output buffer for the rewritten instruction(s).
 * @prog:        not used by this function.
 * @target_size: handed to bpf_target_off() for most fields (presumably
 *               filled in with the kernel field size there - confirm
 *               against its definition) and set by hand in the fallback
 *               branches that only emit a constant. Left untouched for
 *               bound_dev_if, mark and priority.
 *
 * Returns the number of instructions written to @insn_buf, which is 0 when
 * si->off matches none of the fields handled below.
 */
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct bpf_sock, bound_dev_if):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                        offsetof(struct sock, sk_bound_dev_if));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_bound_dev_if));
                break;

        case offsetof(struct bpf_sock, mark):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                        offsetof(struct sock, sk_mark));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_mark));
                break;

        case offsetof(struct bpf_sock, priority):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                        offsetof(struct sock, sk_priority));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_priority));
                break;

        case offsetof(struct bpf_sock, family):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_family),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_family,
                                       sizeof_field(struct sock_common,
                                                    skc_family),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, type):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_type),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_type,
                                       sizeof_field(struct sock, sk_type),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, protocol):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_protocol),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_protocol,
                                       sizeof_field(struct sock, sk_protocol),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, src_ip4):
                /* Address loads keep the access size the program asked for. */
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_rcv_saddr,
                                       sizeof_field(struct sock_common,
                                                    skc_rcv_saddr),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_daddr,
                                       sizeof_field(struct sock_common,
                                                    skc_daddr),
                                       target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                /* Byte offset of the access relative to src_ip6[0]. */
                off = si->off;
                off -= offsetof(struct bpf_sock, src_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(
                                struct sock_common,
                                skc_v6_rcv_saddr.s6_addr32[0],
                                sizeof_field(struct sock_common,
                                             skc_v6_rcv_saddr.s6_addr32[0]),
                                target_size) + off);
#else
                /* Keeps 'off' referenced when no IPv6 branch uses it. */
                (void)off;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                /* Report a 4-byte result, exactly as the dst_ip6 fallback
                 * below does, so both !CONFIG_IPV6 paths hand the caller
                 * the same target size instead of leaving it unset here.
                 */
                *target_size = 4;
#endif
                break;

        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                /* Byte offset of the access relative to dst_ip6[0]. */
                off = si->off;
                off -= offsetof(struct bpf_sock, dst_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_v6_daddr.s6_addr32[0],
                                       sizeof_field(struct sock_common,
                                                    skc_v6_daddr.s6_addr32[0]),
                                       target_size) + off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                *target_size = 4;
#endif
                break;

        case offsetof(struct bpf_sock, src_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_num),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_num,
                                       sizeof_field(struct sock_common,
                                                    skc_num),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_dport,
                                       sizeof_field(struct sock_common,
                                                    skc_dport),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, state):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_state),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_state,
                                       sizeof_field(struct sock_common,
                                                    skc_state),
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
#ifdef CONFIG_XPS
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_rx_queue_mapping,
                                       sizeof_field(struct sock,
                                                    sk_rx_queue_mapping),
                                       target_size));
                /* Skip the next instruction unless the loaded value is
                 * NO_QUEUE_MAPPING; in that case the program sees -1.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
                                      1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
#else
                /* Without CONFIG_XPS the program always reads -1. */
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
                *target_size = 2;
#endif
                break;
        }

        return insn - insn_buf;
}

/* Context-access rewriter that handles __sk_buff::ifindex itself and hands
 * every other offset to bpf_convert_ctx_access() unchanged.
 *
 * For ifindex two loads are emitted: the skb's dev pointer into si->dst_reg,
 * then the net_device's ifindex through that pointer. No NULL test is placed
 * between the two loads.
 * NOTE(review): presumably skb->dev is always set for the programs using
 * this rewriter - confirm against the callers.
 *
 * Returns the number of instructions written to @insn_buf (or whatever
 * bpf_convert_ctx_access() returns for the delegated offsets).
 */
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;

        /* Only ifindex is special-cased; the rest is generic. */
        if (si->off != offsetof(struct __sk_buff, ifindex))
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);

        /* dst = skb->dev */
        *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                             si->dst_reg, si->src_reg,
                             offsetof(struct sk_buff, dev));
        /* dst = dev->ifindex */
        *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                             bpf_target_off(struct net_device, ifindex, 4,
                                            target_size));

        return out - insn_buf;
}

/* Translate an access to a struct xdp_md field into load instruction(s)
 * against the kernel's struct xdp_buff and the structures it points to.
 *
 * Only loads are generated; @type, @prog and @target_size are not consulted.
 * si->off picks the xdp_md field, si->src_reg is the register the access
 * goes through and si->dst_reg receives the result (and carries the
 * intermediate pointers of the multi-step lookups).
 *
 * Returns the number of instructions written to @insn_buf, i.e. 0 when
 * si->off is none of the fields handled below.
 */
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        const int dst = si->dst_reg;
        const int ctx = si->src_reg;
        struct bpf_insn *out = insn_buf;

        switch (si->off) {
        /* Fields that map 1:1 onto an xdp_buff member. */
        case offsetof(struct xdp_md, data):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data));
                break;
        case offsetof(struct xdp_md, data_meta):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data_meta));
                break;
        case offsetof(struct xdp_md, data_end):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data_end));
                break;

        /* dst = xdp->rxq; dst = rxq->dev; dst = dev->ifindex */
        case offsetof(struct xdp_md, ingress_ifindex):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, rxq));
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
                                     dst, dst,
                                     offsetof(struct xdp_rxq_info, dev));
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct net_device, ifindex));
                break;

        /* dst = xdp->rxq; dst = rxq->queue_index */
        case offsetof(struct xdp_md, rx_queue_index):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, rxq));
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct xdp_rxq_info,
                                              queue_index));
                break;

        /* dst = xdp->txq; dst = txq->dev; dst = dev->ifindex */
        case offsetof(struct xdp_md, egress_ifindex):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, txq));
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
                                     dst, dst,
                                     offsetof(struct xdp_txq_info, dev));
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct net_device, ifindex));
                break;
        }

        return out - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() loads the Nested Field S.F->NF,
 * where S is the type of the context Structure, F is the Field in the
 * context structure that holds a pointer to a Nested Structure of type NS,
 * and NF is the field of NS to read.
 *
 * Two instructions are appended at 'insn': the first loads the pointer S.F
 * into si->dst_reg, the second loads NF through that pointer back into
 * si->dst_reg.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
 * make sure that SIZE is not greater than the actual size of NF.
 *
 * OFF is added to the offset of NF, so the load happens OFF bytes past the
 * start of NF; pass 0 to load from the start of the field. OFF is expanded
 * without parentheses, so pass a plain constant or identifier.
 *
 * The macro relies on 'insn', 'si' and 'target_size' being in scope at the
 * point of expansion.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)               \
        do {                                                                       \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
                                      si->src_reg, offsetof(S, F));               \
                *insn++ = BPF_LDX_MEM(                                               \
                        SIZE, si->dst_reg, si->dst_reg,                               \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                + OFF);                                               \
        } while (0)

/* SOCK_ADDR_LOAD_NESTED_FIELD() is the common case of
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(): the load size is taken from the
 * nested field itself (BPF_FIELD_SIZEOF(NS, NF)) and the extra offset is 0.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)                               \
        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,                       \
                                             BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(), but for a store operation.
 *
 * The two registers available in convert_ctx_access are not enough here: we
 * can overwrite neither SRC, since it contains the value to store, nor DST,
 * since it contains the pointer to the context that may be used by later
 * instructions. A third register is needed to hold the pointer to the nested
 * structure whose field we want to store to.
 *
 * That scratch register is picked starting from BPF_REG_9 and stepping down
 * (at most twice) until it differs from both si->src_reg and si->dst_reg.
 * Its current value is preserved across the sequence by spilling it into the
 * Temporary Field TF (a member of struct S) and reloading it afterwards.
 *
 * Four instructions are appended at 'insn':
 *   1. save the scratch register into S.TF,
 *   2. load the pointer S.F into the scratch register,
 *   3. store si->src_reg (SIZE wide) to NF + OFF through that pointer,
 *   4. restore the scratch register from S.TF.
 *
 * OFF is expanded without parentheses, so pass a plain constant or
 * identifier. The macro relies on 'insn', 'si' and 'target_size' being in
 * scope at the point of expansion.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)               \
        do {                                                                       \
                int tmp_reg = BPF_REG_9;                                       \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,               \
                                      offsetof(S, TF));                               \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,               \
                                      si->dst_reg, offsetof(S, F));               \
                *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg,               \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                + OFF);                                               \
                *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,               \
                                      offsetof(S, TF));                               \
        } while (0)

/* SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF() picks the store or the
 * load sequence for the nested field S.F->NF depending on 'type', which must
 * be in scope at the point of expansion: BPF_WRITE expands to
 * SOCK_ADDR_STORE_NESTED_FIELD_OFF(), anything else to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(). TF is only used by the store path.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
                                                      TF)                       \
        do {                                                                       \
                if (type == BPF_WRITE) {                                       \
                        SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
                                                         OFF, TF);               \
                } else {                                                       \
                        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(                       \
                                S, NS, F, NF, SIZE, OFF);  \
                }                                                               \
        } while (0)

/* SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD() is the common case of
 * SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(): the access size is taken
 * from the nested field itself (BPF_FIELD_SIZEOF(NS, NF)) and the extra
 * offset is 0.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF)                       \
        SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(                               \
                S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)

/*
 * Rewrite one program access to struct bpf_sock_addr into instructions that
 * operate on struct bpf_sock_addr_kern.
 *
 * @type:        BPF_WRITE for stores; any other value is emitted as a load
 * @si:          the instruction being rewritten (offset, size, registers)
 * @insn_buf:    output buffer that receives the replacement instructions
 * @prog:        not referenced directly in this function body
 * @target_size: consumed by the nested-field helper macros
 *
 * Returns the number of instructions written to @insn_buf; an offset that
 * matches no case below yields 0.
 *
 * The helper macros expand against the local names 'type', 'si', 'insn' and
 * 'target_size', so those names must not change.
 */
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int port_bytes = sizeof_field(struct sockaddr_in6, sin6_port);
        int rel_off;

        switch (si->off) {
        case offsetof(struct bpf_sock_addr, user_family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sockaddr, uaddr, sa_family);
                break;

        case offsetof(struct bpf_sock_addr, user_ip4):
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
                        sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                /* Byte distance of the access from the start of user_ip6. */
                rel_off = si->off - offsetof(struct bpf_sock_addr, user_ip6[0]);
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_addr.s6_addr32[0], BPF_SIZE(si->code), rel_off,
                        tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, user_port):
                /* The port would normally require checking sa_family and
                 * picking sockaddr_in or sockaddr_in6 accordingly. Both
                 * structures keep the port at the same offset with the same
                 * size, which the two checks below enforce at build time, so
                 * sockaddr_in6 alone is used.
                 */
                BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
                             offsetof(struct sockaddr_in6, sin6_port));
                BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
                             sizeof_field(struct sockaddr_in6, sin6_port));
                /* sin6_port is narrower than user_port: clamp the access. */
                port_bytes = min(port_bytes, BPF_LDST_BYTES(si));
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_port, bytes_to_bpf_size(port_bytes), 0, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_family);
                break;

        case offsetof(struct bpf_sock_addr, type):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_type);
                break;

        case offsetof(struct bpf_sock_addr, protocol):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_protocol);
                break;

        case offsetof(struct bpf_sock_addr, msg_src_ip4):
                /* t_ctx is interpreted as a struct in_addr here. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in_addr, t_ctx,
                        s_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                /* Byte distance of the access from the start of msg_src_ip6. */
                rel_off = si->off -
                          offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
                /* t_ctx is interpreted as a struct in6_addr here. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
                        s6_addr32[0], BPF_SIZE(si->code), rel_off, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, sk):
                /* Plain pointer load of bpf_sock_addr_kern::sk. */
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
                        si->dst_reg, si->src_reg,
                        offsetof(struct bpf_sock_addr_kern, sk));
                break;
        }

        return insn - insn_buf;
}

/*
 * Rewrite one program access to struct bpf_sock_ops into instructions that
 * operate on struct bpf_sock_ops_kern and on the objects it points to.
 *
 * @type:        BPF_WRITE for stores; any other value is emitted as a load
 * @si:          the instruction being rewritten (offset, size, registers)
 * @insn_buf:    output buffer that receives the replacement instructions
 * @prog:        not referenced in this function body
 * @target_size: written only for skb_tcp_flags (size of tcp_skb_cb::tcp_flags)
 *
 * Returns the number of instructions written to @insn_buf; an offset that
 * matches no case below yields 0.
 *
 * The helper macros defined inside the function expand against the local
 * names 'si' and 'insn'.
 */
static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
                                       const struct bpf_insn *si,
                                       struct bpf_insn *insn_buf,
                                       struct bpf_prog *prog,
                                       u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

/* Helper macro for adding read access to tcp_sock or sock fields.
 *
 * The field is dereferenced only when is_fullsock is non-zero; otherwise
 * dst_reg must end up as 0.
 *
 * dst_reg != src_reg: is_fullsock itself is loaded into dst_reg, so the zero
 * result falls out of the test.
 *
 *   [0] dst = ctx->is_fullsock
 *   [1] if dst == 0 goto [4]
 *   [2] dst = ctx->sk
 *   [3] dst = dst->OBJ_FIELD
 *
 * dst_reg == src_reg: the ctx pointer has to survive the test, so a scratch
 * register (BPF_REG_9, or the next lower register not used by the
 * instruction) is spilled to bpf_sock_ops_kern::temp and used for the test.
 * On the !is_fullsock path dst_reg still holds the ctx pointer, so it is
 * cleared explicitly once the scratch register has been restored.
 *
 *   [0] ctx->temp = reg
 *   [1] reg = ctx->is_fullsock
 *   [2] if reg == 0 goto [7]
 *   [3] reg = ctx->temp
 *   [4] dst = ctx->sk
 *   [5] dst = dst->OBJ_FIELD
 *   [6] goto [9]
 *   [7] reg = ctx->temp
 *   [8] dst = 0
 */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                         \
        do {                                                                  \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                   \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                if (si->dst_reg == si->src_reg) {                             \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,       \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                             \
                        fullsock_reg = reg;                                   \
                        jmp += 2;                                             \
                }                                                             \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                 \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                 \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);         \
                if (si->dst_reg == si->src_reg)                               \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                 \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,               \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,                   \
                                                       OBJ_FIELD),            \
                                      si->dst_reg, si->dst_reg,               \
                                      offsetof(OBJ, OBJ_FIELD));              \
                if (si->dst_reg == si->src_reg) {                             \
                        *insn++ = BPF_JMP_A(2);                               \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                 \
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);              \
                }                                                             \
        } while (0)

/* Helper macro for reading bpf_sock_ops_kern::sk.
 *
 * Same scheme as SOCK_OPS_GET_FIELD without the final field dereference:
 * dst_reg receives the sk pointer when is_fullsock is non-zero and 0
 * otherwise. With dst_reg == src_reg the emitted sequence is
 *
 *   [0] ctx->temp = reg
 *   [1] reg = ctx->is_fullsock
 *   [2] if reg == 0 goto [6]
 *   [3] reg = ctx->temp
 *   [4] dst = ctx->sk
 *   [5] goto [8]
 *   [6] reg = ctx->temp
 *   [7] dst = 0
 */
#define SOCK_OPS_GET_SK()                                                     \
        do {                                                                  \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1;     \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                if (si->dst_reg == si->src_reg) {                             \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,       \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                             \
                        fullsock_reg = reg;                                   \
                        jmp += 2;                                             \
                }                                                             \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                 \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                 \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);         \
                if (si->dst_reg == si->src_reg)                               \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                 \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,               \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                if (si->dst_reg == si->src_reg) {                             \
                        *insn++ = BPF_JMP_A(2);                               \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                 \
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);              \
                }                                                             \
        } while (0)

/* Read a tcp_sock field whose name is the same in struct bpf_sock_ops. */
#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
                SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)

/* Helper macro for adding write access to tcp_sock or sock fields.
 * The macro is called with two registers, dst_reg which contains a pointer
 * to ctx (context) and src_reg which contains the value that should be
 * stored. However, we need an additional register since we cannot overwrite
 * dst_reg because it may be used later in the program.
 * Instead we "borrow" one of the other registers. We first save its value
 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
 * it at the end of the macro.
 *
 * The store is skipped when is_fullsock is 0: the JEQ jumps over the sk load
 * and the store, straight to the restore of the borrowed register.
 */
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                         \
        do {                                                                  \
                int reg = BPF_REG_9;                                          \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                   \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                if (si->dst_reg == reg || si->src_reg == reg)                 \
                        reg--;                                                \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,               \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                 \
                                      reg, si->dst_reg,                       \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                 \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);                    \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
                                                struct bpf_sock_ops_kern, sk),\
                                      reg, si->dst_reg,                       \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),       \
                                      reg, si->src_reg,                       \
                                      offsetof(OBJ, OBJ_FIELD));              \
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,               \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                        \
        } while (0)

/* Dispatch to the store or load helper depending on TYPE. */
#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)            \
        do {                                                                  \
                if (TYPE == BPF_WRITE)                                        \
                        SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);        \
                else                                                          \
                        SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);        \
        } while (0)

        /* NOTE: insn still equals insn_buf at this point (nothing above
         * emits an instruction), so this test is never true.
         */
        if (insn > insn_buf)
                return insn - insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sock_ops, op):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       op),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, op));
                break;

        case offsetof(struct bpf_sock_ops, replylong[0]) ...
             offsetof(struct bpf_sock_ops, replylong[3]):
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
                             sizeof_field(struct bpf_sock_ops_kern, reply));
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
                             sizeof_field(struct bpf_sock_ops_kern, replylong));
                /* Translate the offset from the uapi layout to the kernel
                 * layout; the access itself is a direct 32-bit load/store.
                 */
                off = si->off;
                off -= offsetof(struct bpf_sock_ops, replylong[0]);
                off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
                if (type == BPF_WRITE)
                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              off);
                break;

        case offsetof(struct bpf_sock_ops, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct bpf_sock_ops, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
             offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                /* Without IPv6 support the field reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
             offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                /* Without IPv6 support the field reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
                /* Unless __BIG_ENDIAN_BITFIELD is defined, move the 16-bit
                 * port into the upper half of the 32-bit result.
                 */
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct bpf_sock_ops, is_fullsock):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern,
                                                is_fullsock),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               is_fullsock));
                break;

        case offsetof(struct bpf_sock_ops, state):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_state));
                break;

        case offsetof(struct bpf_sock_ops, rtt_min):
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                /* Load the 32-bit word located sizeof(minmax_sample::t)
                 * bytes into tcp_sock::rtt_min.
                 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      sizeof_field(struct minmax_sample, t));
                break;

        case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
                SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
                                   struct tcp_sock);
                break;

        case offsetof(struct bpf_sock_ops, sk_txhash):
                /* The only field in this switch routed through the
                 * load-or-store helper.
                 */
                SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
                                          struct sock, type);
                break;
        case offsetof(struct bpf_sock_ops, snd_cwnd):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
                break;
        case offsetof(struct bpf_sock_ops, srtt_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
                break;
        case offsetof(struct bpf_sock_ops, snd_ssthresh):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
                break;
        case offsetof(struct bpf_sock_ops, rcv_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_una):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
                break;
        case offsetof(struct bpf_sock_ops, mss_cache):
                SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
                break;
        case offsetof(struct bpf_sock_ops, ecn_flags):
                SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
                break;
        case offsetof(struct bpf_sock_ops, rate_delivered):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
                break;
        case offsetof(struct bpf_sock_ops, rate_interval_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
                break;
        case offsetof(struct bpf_sock_ops, packets_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
                break;
        case offsetof(struct bpf_sock_ops, retrans_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
                break;
        case offsetof(struct bpf_sock_ops, total_retrans):
                SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
                break;
        case offsetof(struct bpf_sock_ops, segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
                break;
        case offsetof(struct bpf_sock_ops, segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
                break;
        case offsetof(struct bpf_sock_ops, lost_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
                break;
        case offsetof(struct bpf_sock_ops, sacked_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
                break;
        case offsetof(struct bpf_sock_ops, bytes_received):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
                break;
        case offsetof(struct bpf_sock_ops, bytes_acked):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
                break;
        case offsetof(struct bpf_sock_ops, sk):
                SOCK_OPS_GET_SK();
                break;
        case offsetof(struct bpf_sock_ops, skb_data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb_data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb_data_end));
                break;
        case offsetof(struct bpf_sock_ops, skb_data):
                /* dst = ctx->skb; when that is 0 the dereference is skipped
                 * and dst stays 0.
                 */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, data));
                break;
        case offsetof(struct bpf_sock_ops, skb_len):
                /* Same skb-is-0 guard as skb_data. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, len));
                break;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                /* tcp_flags is read from the tcp_skb_cb overlaid on skb->cb;
                 * same skb-is-0 guard as skb_data.
                 */
                off = offsetof(struct sk_buff, cb);
                off += offsetof(struct tcp_skb_cb, tcp_flags);
                *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
                                                       tcp_flags),
                                      si->dst_reg, si->dst_reg, off);
                break;
        }
        return insn - insn_buf;
}

/*
 * Rewrite one program access to struct __sk_buff for this program type.
 *
 * Only data_end and the cb[0]..cb[4] range are handled here; every other
 * offset is delegated to bpf_convert_ctx_access().
 *
 * @type:        BPF_WRITE for stores; any other value is emitted as a load
 * @si:          the instruction being rewritten
 * @insn_buf:    output buffer that receives the replacement instructions
 * @prog:        program being rewritten; cb_access is set on a cb[] access
 * @target_size: passed through to bpf_convert_ctx_access()
 *
 * Returns the number of instructions written to @insn_buf, or whatever
 * bpf_convert_ctx_access() returns for delegated offsets.
 */
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int kern_off;

        switch (si->off) {
        case offsetof(struct __sk_buff, data_end):
                /* data_end is read from tcp_skb_cb::bpf inside skb->cb. */
                kern_off = si->off - offsetof(struct __sk_buff, data_end) +
                           offsetof(struct sk_buff, cb) +
                           offsetof(struct tcp_skb_cb, bpf.data_end);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, kern_off);
                break;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                /* sk_skb_cb::data must cover the five 32-bit cb[] words and
                 * start on a __u64-aligned offset within struct sk_buff.
                 */
                BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct sk_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;
                /* Map the __sk_buff cb[] offset onto sk_skb_cb::data. */
                kern_off = si->off - offsetof(struct __sk_buff, cb[0]) +
                           offsetof(struct sk_buff, cb) +
                           offsetof(struct sk_skb_cb, data);
                if (type != BPF_WRITE)
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, kern_off);
                else
                        *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, kern_off);
                break;

        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);
        }

        return insn - insn_buf;
}

/* Rewrite a program's load from struct sk_msg_md (the context layout the
 * program sees) into loads from the kernel-side struct sk_msg and, for the
 * socket-derived fields, from struct sock_common.
 *
 * @type:        access type; not consulted in this function
 * @si:          the instruction being rewritten; si->off selects the field,
 *               si->dst_reg / si->src_reg are reused in the emitted code
 * @insn_buf:    output buffer receiving the replacement instructions
 * @prog:        not used in this function
 * @target_size: not used in this function
 *
 * Returns the number of instructions written to @insn_buf.  The switch has
 * no default case, so an offset matching none of the cases writes nothing
 * and returns 0.
 */
static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
        /* Only the IPv6 address cases compute an offset. */
        int off;
#endif

        /* convert ctx uses the fact sg element is first in struct */
        BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

        switch (si->off) {
        case offsetof(struct sk_msg_md, data):
                /* Direct load of sk_msg->data. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data));
                break;
        case offsetof(struct sk_msg_md, data_end):
                /* Direct load of sk_msg->data_end. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data_end));
                break;
        case offsetof(struct sk_msg_md, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                /* Two-step pattern shared by all socket-derived fields:
                 * load sk_msg->sk into dst_reg, then load the field
                 * through that pointer (here a 16-bit skc_family).
                 */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct sk_msg_md, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                /* sk pointer, then 32-bit skc_daddr. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct sk_msg_md, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                /* sk pointer, then 32-bit skc_rcv_saddr. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct sk_msg_md, remote_ip6[0]) ...
             offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                /* off becomes the byte offset of the accessed word inside
                 * remote_ip6[]; it is added to the offset of
                 * skc_v6_daddr.s6_addr32[0] in the second load.
                 */
                off = si->off;
                off -= offsetof(struct sk_msg_md, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                /* Without CONFIG_IPV6 the field reads as constant 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, local_ip6[0]) ...
             offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                /* Same scheme as remote_ip6, against skc_v6_rcv_saddr. */
                off = si->off;
                off -= offsetof(struct sk_msg_md, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                /* Without CONFIG_IPV6 the field reads as constant 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                /* sk pointer, then 16-bit skc_dport. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                /* Unless __BIG_ENDIAN_BITFIELD is defined, the 16-bit value
                 * is moved into the upper half of the 32-bit result.
                 * NOTE(review): presumably this matches how the 32-bit
                 * remote_port context field is documented to be laid out -
                 * confirm against the uapi header.
                 */
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct sk_msg_md, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                /* sk pointer, then 16-bit skc_num; no shift is applied. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct sk_msg_md, size):
                /* The offset is taken within struct sk_msg_sg but applied
                 * to the sk_msg pointer in src_reg; this works because sg
                 * sits at offset 0 of struct sk_msg (asserted above).
                 */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg_sg, size));
                break;

        case offsetof(struct sk_msg_md, sk):
                /* Hand back the raw sk pointer stored in sk_msg. */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                break;
        }

        return insn - insn_buf;
}

/* Verifier callbacks for the sk_filter ops table: helper lookup, context
 * access validation, context access rewriting (the generic
 * bpf_convert_ctx_access) and a gen_ld_abs hook.
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
        .get_func_proto                = sk_filter_func_proto,
        .is_valid_access        = sk_filter_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
        .gen_ld_abs                = bpf_gen_ld_abs,
};

/* Program ops for sk_filter: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops sk_filter_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the tc_cls_act ops table.  Unlike sk_filter it has
 * its own convert_ctx_access and installs a prologue generator in addition
 * to the gen_ld_abs hook.
 */
const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
        .get_func_proto                = tc_cls_act_func_proto,
        .is_valid_access        = tc_cls_act_is_valid_access,
        .convert_ctx_access        = tc_cls_act_convert_ctx_access,
        .gen_prologue                = tc_cls_act_prologue,
        .gen_ld_abs                = bpf_gen_ld_abs,
};

/* Program ops for tc_cls_act: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops tc_cls_act_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the xdp ops table; the prologue hook is wired to
 * bpf_noop_prologue.
 */
const struct bpf_verifier_ops xdp_verifier_ops = {
        .get_func_proto                = xdp_func_proto,
        .is_valid_access        = xdp_is_valid_access,
        .convert_ctx_access        = xdp_convert_ctx_access,
        .gen_prologue                = bpf_noop_prologue,
};

/* Program ops for xdp: test runs go through bpf_prog_test_run_xdp. */
const struct bpf_prog_ops xdp_prog_ops = {
        .test_run                = bpf_prog_test_run_xdp,
};

/* Verifier callbacks for the cg_skb ops table; context rewriting uses the
 * generic bpf_convert_ctx_access.
 */
const struct bpf_verifier_ops cg_skb_verifier_ops = {
        .get_func_proto                = cg_skb_func_proto,
        .is_valid_access        = cg_skb_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for cg_skb: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops cg_skb_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the lwt_in ops table.  All lwt_* tables in this
 * file share lwt_is_valid_access and the generic bpf_convert_ctx_access;
 * they differ in their helper lookup function.
 */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
        .get_func_proto                = lwt_in_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for lwt_in: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_in_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the lwt_out ops table; only the helper lookup
 * function is specific to it.
 */
const struct bpf_verifier_ops lwt_out_verifier_ops = {
        .get_func_proto                = lwt_out_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for lwt_out: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_out_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the lwt_xmit ops table.  It is the only lwt_*
 * table here with a prologue generator, and it reuses tc_cls_act_prologue
 * for that.
 */
const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
        .get_func_proto                = lwt_xmit_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
        .gen_prologue                = tc_cls_act_prologue,
};

/* Program ops for lwt_xmit: test runs go through bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_xmit_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the lwt_seg6local ops table; only the helper
 * lookup function is specific to it.
 */
const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
        .get_func_proto                = lwt_seg6local_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for lwt_seg6local: test runs go through
 * bpf_prog_test_run_skb.
 */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/* Verifier callbacks for the cg_sock ops table.  Helper lookup and access
 * validation use the sock_filter_* functions; context rewriting uses
 * bpf_sock_convert_ctx_access.
 */
const struct bpf_verifier_ops cg_sock_verifier_ops = {
        .get_func_proto                = sock_filter_func_proto,
        .is_valid_access        = sock_filter_is_valid_access,
        .convert_ctx_access        = bpf_sock_convert_ctx_access,
};

/* Program ops for cg_sock: intentionally empty, no test_run handler. */
const struct bpf_prog_ops cg_sock_prog_ops = {
};

/* Verifier callbacks for the cg_sock_addr ops table, all provided by the
 * sock_addr_* functions.
 */
const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
        .get_func_proto                = sock_addr_func_proto,
        .is_valid_access        = sock_addr_is_valid_access,
        .convert_ctx_access        = sock_addr_convert_ctx_access,
};

/* Program ops for cg_sock_addr: intentionally empty, no test_run handler. */
const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

/* Verifier callbacks for the sock_ops ops table, all provided by the
 * sock_ops_* functions.
 */
const struct bpf_verifier_ops sock_ops_verifier_ops = {
        .get_func_proto                = sock_ops_func_proto,
        .is_valid_access        = sock_ops_is_valid_access,
        .convert_ctx_access        = sock_ops_convert_ctx_access,
};

/* Program ops for sock_ops: intentionally empty, no test_run handler. */
const struct bpf_prog_ops sock_ops_prog_ops = {
};

/* Verifier callbacks for the sk_skb ops table, including its own prologue
 * generator (sk_skb_prologue).
 */
const struct bpf_verifier_ops sk_skb_verifier_ops = {
        .get_func_proto                = sk_skb_func_proto,
        .is_valid_access        = sk_skb_is_valid_access,
        .convert_ctx_access        = sk_skb_convert_ctx_access,
        .gen_prologue                = sk_skb_prologue,
};

/* Program ops for sk_skb: intentionally empty, no test_run handler. */
const struct bpf_prog_ops sk_skb_prog_ops = {
};

/* Verifier callbacks for the sk_msg ops table; context rewriting is done by
 * sk_msg_convert_ctx_access and the prologue hook is bpf_noop_prologue.
 */
const struct bpf_verifier_ops sk_msg_verifier_ops = {
        .get_func_proto                = sk_msg_func_proto,
        .is_valid_access        = sk_msg_is_valid_access,
        .convert_ctx_access        = sk_msg_convert_ctx_access,
        .gen_prologue                = bpf_noop_prologue,
};

/* Program ops for sk_msg: intentionally empty, no test_run handler. */
const struct bpf_prog_ops sk_msg_prog_ops = {
};

/* Verifier callbacks for the flow_dissector ops table, all provided by the
 * flow_dissector_* functions.
 */
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
        .get_func_proto                = flow_dissector_func_proto,
        .is_valid_access        = flow_dissector_is_valid_access,
        .convert_ctx_access        = flow_dissector_convert_ctx_access,
};

/* Program ops for flow_dissector: test runs go through the dedicated
 * bpf_prog_test_run_flow_dissector handler.
 */
const struct bpf_prog_ops flow_dissector_prog_ops = {
        .test_run                = bpf_prog_test_run_flow_dissector,
};

/* Remove the filter currently attached to @sk, if any.
 *
 * The filter pointer is read with rcu_dereference_protected() under the
 * condition lockdep_sock_is_held(sk).
 *
 * Returns 0 when a filter was detached, -ENOENT when none was attached and
 * -EPERM when the socket has SOCK_FILTER_LOCKED set.
 */
int sk_detach_filter(struct sock *sk)
{
        struct sk_filter *old;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        old = rcu_dereference_protected(sk->sk_filter,
                                        lockdep_sock_is_held(sk));
        if (!old)
                return -ENOENT;

        /* Unpublish first, then drop the socket's charge for the filter. */
        RCU_INIT_POINTER(sk->sk_filter, NULL);
        sk_filter_uncharge(sk, old);

        return 0;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

/* Copy the originally attached classic filter of @sk to @optval.
 *
 * @len is the capacity of @optval counted in filter blocks; 0 means the
 * caller only wants to know how many blocks there are.
 *
 * Returns, with the socket lock taken and released internally:
 *   0        - no filter is attached
 *   -EACCES  - the attached program has no original program to dump
 *   -EINVAL  - @len is non-zero but smaller than the filter length
 *   -EFAULT  - copying to @optval failed
 *   otherwise the number of filter blocks (not bytes).
 */
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
        struct sock_fprog_kern *orig;
        struct sk_filter *filter;
        int ret;

        lock_sock(sk);

        filter = rcu_dereference_protected(sk->sk_filter,
                                           lockdep_sock_is_held(sk));
        if (!filter) {
                ret = 0;
                goto unlock;
        }

        /* Only the program as it was originally attached can be dumped, so
         * no conversion/decoding is needed.  eBPF programs without an
         * original program cannot be returned through this interface.
         */
        orig = filter->prog->orig_prog;
        if (!orig) {
                ret = -EACCES;
                goto unlock;
        }

        if (!len) {
                /* Pure size query: report the number of filter blocks. */
                ret = orig->len;
                goto unlock;
        }

        if (len < orig->len) {
                ret = -EINVAL;
                goto unlock;
        }

        /* The API reports filter blocks rather than bytes on success. */
        if (copy_to_sockptr(optval, orig->filter, bpf_classic_proglen(orig)))
                ret = -EFAULT;
        else
                ret = orig->len;

unlock:
        release_sock(sk);
        return ret;
}

#ifdef CONFIG_INET
/* Fill in the fields of @reuse_kern that this function owns before a
 * reuseport program is run.  No socket is selected initially, and data_end
 * is set to the end of the skb's linear area (skb->data + skb_headlen()).
 */
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
                                    u32 hash)
{
        /* Values taken from the reuseport group. */
        reuse_kern->reuseport_id = reuse->reuseport_id;
        reuse_kern->bind_inany = reuse->bind_inany;

        /* Packet being steered and the bound of its directly readable data. */
        reuse_kern->skb = skb;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;

        /* Socket the lookup landed on, and the (still empty) selection. */
        reuse_kern->sk = sk;
        reuse_kern->selected_sk = NULL;
}

/* Run reuseport program @prog for @skb and return its selection.
 *
 * Returns reuse_kern.selected_sk when the program returns SK_PASS (this is
 * NULL if the program never selected a socket), and
 * ERR_PTR(-ECONNREFUSED) for any other verdict.
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  u32 hash)
{
        struct sk_reuseport_kern ctx;
        enum sk_action verdict;

        bpf_init_reuseport_kern(&ctx, reuse, sk, skb, hash);

        verdict = BPF_PROG_RUN(prog, &ctx);
        if (verdict != SK_PASS)
                return ERR_PTR(-ECONNREFUSED);

        return ctx.selected_sk;
}

/* Helper body: look up @key in @map and record the resulting socket as the
 * program's selection in @reuse_kern.  @flags is not examined.
 *
 * Returns 0 on success, or:
 *   -ENOENT       - no entry for @key, or a reuseport sockarray entry whose
 *                   socket no longer has a reuseport group
 *   -EINVAL       - entry from another map type without a reuseport group
 *   -EPROTOTYPE   - group mismatch and the protocols differ
 *   -EAFNOSUPPORT - group mismatch and the address families differ
 *   -EBADFD       - any other group mismatch
 * On every error after a successful lookup, a refcounted socket is put.
 */
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
           struct bpf_map *, map, void *, key, u32, flags)
{
        struct sock_reuseport *group;
        struct sock *candidate;
        struct sock *bound;
        int err;

        candidate = map->ops->map_lookup_elem(map, key);
        if (!candidate)
                return -ENOENT;

        group = rcu_dereference(candidate->sk_reuseport_cb);
        if (!group) {
                /* A reuseport_array only holds sockets with a non-NULL
                 * sk_reuseport_cb, so finding none there means the socket
                 * was already unhashed (e.g. by close()): report -ENOENT.
                 *
                 * Other maps (e.g. sock_map) give no such guarantee; the
                 * socket may never have been in a reuseport group at all.
                 */
                if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
                        err = -ENOENT;
                else
                        err = -EINVAL;
                goto put_sk;
        }

        if (unlikely(group->reuseport_id != reuse_kern->reuseport_id))
                goto mismatch;

        reuse_kern->selected_sk = candidate;
        return 0;

mismatch:
        /* Classify why the candidate belongs to a different group. */
        bound = reuse_kern->sk;
        if (bound->sk_protocol != candidate->sk_protocol)
                err = -EPROTOTYPE;
        else if (bound->sk_family != candidate->sk_family)
                err = -EAFNOSUPPORT;
        else
                err = -EBADFD;  /* likely bound to a different sockaddr */

put_sk:
        /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
        if (sk_is_refcounted(candidate))
                sock_put(candidate);

        return err;
}

/* Helper descriptor for sk_select_reuseport(): not GPL-only, integer
 * return.  Arguments: program context, constant map pointer, pointer to a
 * map key, and an unconstrained value (the flags).
 */
static const struct bpf_func_proto sk_select_reuseport_proto = {
        .func           = sk_select_reuseport,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_PTR_TO_MAP_KEY,
        .arg4_type        = ARG_ANYTHING,
};

/* Helper body: forward to the generic skb load-bytes implementation using
 * the skb stored in the reuseport context.
 */
BPF_CALL_4(sk_reuseport_load_bytes,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/* Helper descriptor for sk_reuseport_load_bytes(): not GPL-only, integer
 * return.  Arguments: program context, unconstrained offset, pointer to
 * uninitialized destination memory, and that memory's constant size.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
        .func                = sk_reuseport_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Helper body: forward to the generic header-relative skb load-bytes
 * implementation using the skb stored in the reuseport context.
 */
BPF_CALL_5(sk_reuseport_load_bytes_relative,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len, u32, start_header)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes_relative(skb, offset, to, len,
                                               start_header);
}

/* Helper descriptor for sk_reuseport_load_bytes_relative(): same shape as
 * sk_reuseport_load_bytes_proto plus a fifth unconstrained argument (the
 * start_header selector).
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
        .func                = sk_reuseport_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Map a helper id to its descriptor for reuseport programs.  The three
 * reuseport-specific helpers are resolved here; every other id is handed
 * to bpf_base_func_proto().  @prog is not consulted.
 */
static const struct bpf_func_proto *
sk_reuseport_func_proto(enum bpf_func_id func_id,
                        const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_sk_select_reuseport:
                proto = &sk_select_reuseport_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &sk_reuseport_load_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes_relative:
                proto = &sk_reuseport_load_bytes_relative_proto;
                break;
        default:
                proto = bpf_base_func_proto(func_id);
                break;
        }

        return proto;
}

/* Decide whether a context access at byte offset @off of width @size is
 * permitted for reuseport programs.
 *
 * Only aligned reads inside struct sk_reuseport_md are accepted.  data and
 * data_end must be read as 64-bit values and are typed as packet pointers
 * in @info; hash must be read at the full 32-bit width; the remaining
 * fields may be read narrower than 32 bits (eth_protocol no narrower than
 * sk_buff's protocol field).  @prog is not consulted.
 */
static bool
sk_reuseport_is_valid_access(int off, int size,
                             enum bpf_access_type type,
                             const struct bpf_prog *prog,
                             struct bpf_insn_access_aux *info)
{
        const u32 word = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct sk_reuseport_md))
                return false;
        if (off % size)
                return false;
        if (type != BPF_READ)
                return false;

        switch (off) {
        case offsetof(struct sk_reuseport_md, data):
        case offsetof(struct sk_reuseport_md, data_end):
                /* The register type is recorded before the width check. */
                if (off == offsetof(struct sk_reuseport_md, data))
                        info->reg_type = PTR_TO_PACKET;
                else
                        info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);

        case offsetof(struct sk_reuseport_md, hash):
                return size == word;

        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                if (size < sizeof_field(struct sk_buff, protocol))
                        return false;
                fallthrough;
        case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
        case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
        case bpf_ctx_range(struct sk_reuseport_md, len):
                bpf_ctx_record_field_size(info, word);
                return bpf_ctx_narrow_access_ok(off, size, word);

        default:
                return false;
        }
}

/* Emit one load of field F of struct sk_reuseport_kern from si->src_reg
 * into si->dst_reg, sized by the field's own width.
 *
 * The expansion uses the identifiers `insn`, `si` and `target_size`
 * directly, so all three must be in scope where the macro is used.
 */
#define SK_REUSEPORT_LOAD_FIELD(F) ({                                        \
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
                              si->dst_reg, si->src_reg,                        \
                              bpf_target_off(struct sk_reuseport_kern, F, \
                                             sizeof_field(struct sk_reuseport_kern, F), \
                                             target_size));                \
        })

/* Shorthand for SOCK_ADDR_LOAD_NESTED_FIELD() with the outer struct fixed
 * to sk_reuseport_kern, the nested struct to sk_buff and the linking
 * member to `skb`.
 * NOTE(review): presumably this loads SKB_FIELD through the ->skb pointer;
 * confirm at the definition of SOCK_ADDR_LOAD_NESTED_FIELD.
 */
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sk_buff,                        \
                                    skb,                                \
                                    SKB_FIELD)

/* Shorthand for SOCK_ADDR_LOAD_NESTED_FIELD() with the outer struct fixed
 * to sk_reuseport_kern, the nested struct to sock and the linking member
 * to `sk`.
 * NOTE(review): presumably this loads SK_FIELD through the ->sk pointer;
 * confirm at the definition of SOCK_ADDR_LOAD_NESTED_FIELD.
 */
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sock,                        \
                                    sk,                                        \
                                    SK_FIELD)

/* Rewrite a program's load from struct sk_reuseport_md into loads from the
 * kernel-side struct sk_reuseport_kern (directly, or through its skb / sk
 * members via the SK_REUSEPORT_LOAD_* macros).
 *
 * @si selects the field through si->off; @insn_buf receives the emitted
 * instructions.  @type and @prog are not referenced directly in this body.
 * The local name `insn` and the parameters `si` / `target_size` must keep
 * these exact names because SK_REUSEPORT_LOAD_FIELD expands to code that
 * uses them.
 *
 * Returns the number of instructions written to @insn_buf; an offset
 * matching no case writes nothing and returns 0.
 */
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
                                           const struct bpf_insn *si,
                                           struct bpf_insn *insn_buf,
                                           struct bpf_prog *prog,
                                           u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        /* Fields sourced from the skb member. */
        case offsetof(struct sk_reuseport_md, data):
                SK_REUSEPORT_LOAD_SKB_FIELD(data);
                break;

        case offsetof(struct sk_reuseport_md, len):
                SK_REUSEPORT_LOAD_SKB_FIELD(len);
                break;

        case offsetof(struct sk_reuseport_md, eth_protocol):
                SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
                break;

        /* Field sourced from the sk member. */
        case offsetof(struct sk_reuseport_md, ip_protocol):
                SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
                break;

        /* Fields stored directly in struct sk_reuseport_kern. */
        case offsetof(struct sk_reuseport_md, data_end):
                SK_REUSEPORT_LOAD_FIELD(data_end);
                break;

        case offsetof(struct sk_reuseport_md, hash):
                SK_REUSEPORT_LOAD_FIELD(hash);
                break;

        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;
        }

        return insn - insn_buf;
}

/* Verifier callbacks for the sk_reuseport ops table, all provided by the
 * sk_reuseport_* functions defined above.
 */
const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
        .get_func_proto                = sk_reuseport_func_proto,
        .is_valid_access        = sk_reuseport_is_valid_access,
        .convert_ctx_access        = sk_reuseport_convert_ctx_access,
};

/* Program ops for sk_reuseport: intentionally empty, no test_run handler. */
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};

/* Static key, initially false, exported for use outside this file.
 * NOTE(review): presumably switched on while sk_lookup programs are
 * attached; the code that toggles it is not in this section.
 */
DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
EXPORT_SYMBOL(bpf_sk_lookup_enabled);

/* Helper body: record @sk (which may be NULL) as the lookup result in @ctx.
 *
 * Returns 0 on success, or:
 *   -EINVAL          - @flags contains bits other than
 *                      BPF_SK_LOOKUP_F_REPLACE / BPF_SK_LOOKUP_F_NO_REUSEPORT
 *   -ESOCKTNOSUPPORT - @sk is refcounted or in TCP_ESTABLISHED state
 *   -EPROTOTYPE      - @sk's protocol differs from the packet's
 *   -EAFNOSUPPORT    - @sk's family differs from the packet's and @sk is
 *                      AF_INET or an IPv6-only socket
 *   -EEXIST          - a socket is already selected and
 *                      BPF_SK_LOOKUP_F_REPLACE was not given
 */
BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
           struct sock *, sk, u64, flags)
{
        if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
                               BPF_SK_LOOKUP_F_NO_REUSEPORT)))
                return -EINVAL;

        /* All socket suitability checks are skipped for a NULL socket. */
        if (sk) {
                /* reject non-RCU freed sockets */
                if (unlikely(sk_is_refcounted(sk)))
                        return -ESOCKTNOSUPPORT;
                /* reject connected sockets */
                if (unlikely(sk->sk_state == TCP_ESTABLISHED))
                        return -ESOCKTNOSUPPORT;

                /* The socket must fit the packet's L3/L4 protocol. */
                if (sk->sk_protocol != ctx->protocol)
                        return -EPROTOTYPE;
                if (sk->sk_family != ctx->family &&
                    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
                        return -EAFNOSUPPORT;
        }

        if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
                return -EEXIST;

        /* Commit the selection together with the reuseport preference. */
        ctx->selected_sk = sk;
        ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
        return 0;
}

/* Helper descriptor for bpf_sk_lookup_assign(): not GPL-only, integer
 * return.  Arguments: program context, a socket pointer that may be NULL,
 * and an unconstrained value (the flags).
 */
static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
        .func                = bpf_sk_lookup_assign,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_SOCKET_OR_NULL,
        .arg3_type        = ARG_ANYTHING,
};

/* Map a helper id to its descriptor for sk_lookup programs.  Three ids are
 * resolved here (perf_event_output, sk_assign, sk_release); everything
 * else is handed to bpf_sk_base_func_proto().  @prog is not consulted.
 */
static const struct bpf_func_proto *
sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_assign:
                proto = &bpf_sk_lookup_assign_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id);
                break;
        }

        return proto;
}

static bool sk_lookup_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
                return false;
        if (off % size != 0)
                return false;
        if (type != BPF_READ)
                return false;

        switch (off) {
        case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk):
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return size == sizeof(__u64);

        case bpf_ctx_range(struct bpf_sk_lookup, family):
        case bpf_ctx_range(struct bpf_sk_lookup, protocol):
        case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
                bpf_ctx_record_field_size(info, sizeof(__u32));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));

        default:
                return false;
        }
}

/* Rewrite a program's load from struct bpf_sk_lookup into loads from the
 * kernel-side struct bpf_sk_lookup_kern.
 *
 * Field mapping visible below: sk -> selected_sk; remote_* fields read the
 * source side (v4.saddr, v6.saddr, sport) and local_* fields read the
 * destination side (v4.daddr, v6.daddr, dport).
 *
 * @si selects the field through si->off; @insn_buf receives the emitted
 * instructions.  @type and @prog are not used.  @target_size is only
 * passed on to bpf_target_off().
 * NOTE(review): presumably bpf_target_off() records the real width of the
 * kernel field there so narrow loads can be fixed up - confirm at its
 * definition.
 *
 * Returns the number of instructions written to @insn_buf; an offset
 * matching no case writes nothing and returns 0.
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog,
                                        u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sk_lookup, sk):
                /* Pointer-sized load; plain offsetof, target_size untouched. */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, selected_sk));
                break;

        case offsetof(struct bpf_sk_lookup, family):
                /* 16-bit kernel field. */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     family, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, protocol):
                /* 16-bit kernel field. */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     protocol, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, remote_ip4):
                /* 32-bit load of the IPv4 source address. */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.saddr, 4, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, local_ip4):
                /* 32-bit load of the IPv4 destination address. */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.daddr, 4, target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* off: byte offset of the accessed word within remote_ip6[],
                 * rebased onto s6_addr32[0] of struct in6_addr.
                 */
                off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                /* v6.saddr is loaded as a pointer.  If it is NULL the jump
                 * skips the dereference, leaving 0 in dst_reg; otherwise
                 * the selected 32-bit word is loaded through it.
                 */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without CONFIG_IPV6 the field reads as constant 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Same scheme as remote_ip6, through the v6.daddr pointer. */
                off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without CONFIG_IPV6 the field reads as constant 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case offsetof(struct bpf_sk_lookup, remote_port):
                /* 16-bit load of the source port. */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     sport, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, local_port):
                /* 16-bit load of the destination port. */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     dport, 2, target_size));
                break;
        }

        return insn - insn_buf;
}

/* Program ops for sk_lookup: test runs go through the dedicated
 * bpf_prog_test_run_sk_lookup handler.
 */
const struct bpf_prog_ops sk_lookup_prog_ops = {
        .test_run = bpf_prog_test_run_sk_lookup,
};

/* Verifier callbacks for the sk_lookup ops table, all provided by the
 * sk_lookup_* functions defined above.
 */
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
        .get_func_proto                = sk_lookup_func_proto,
        .is_valid_access        = sk_lookup_is_valid_access,
        .convert_ctx_access        = sk_lookup_convert_ctx_access,
};

#endif /* CONFIG_INET */

/* Instantiate the BPF dispatcher named "xdp"; it is referenced below via
 * BPF_DISPATCHER_PTR(xdp).
 */
DEFINE_BPF_DISPATCHER(xdp)

/* Tell the xdp dispatcher that @prev_prog is being replaced by @prog.
 * This is a thin forwarder to bpf_dispatcher_change_prog().
 */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
        bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}

/* Table of BTF ids for socket types, indexed by BTF_SOCK_TYPE_* (see the
 * ret_btf_id users in this file).
 *
 * With CONFIG_DEBUG_INFO_BTF the list is generated by expanding
 * BTF_SOCK_TYPE_xxx with BTF_SOCK_TYPE() temporarily defined to emit
 * BTF_ID(struct, type) per entry.  Without it, a plain u32 array of
 * MAX_BTF_SOCK_TYPE entries is defined instead.
 */
#ifdef CONFIG_DEBUG_INFO_BTF
BTF_ID_LIST_GLOBAL(btf_sock_ids)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE
#else
u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
#endif

/* Helper body: return @sk unchanged when it is a full socket with protocol
 * IPPROTO_TCP and family AF_INET6, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
        /* tcp6_sock does not show up in dwarf, and therefore not in BTF,
         * unless its type generation is triggered explicitly here.
         */
        BTF_TYPE_EMIT(struct tcp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Helper descriptor for bpf_skc_to_tcp6_sock(): not GPL-only; takes a
 * BTF-typed sock_common pointer and returns a nullable BTF-typed pointer
 * whose type id is btf_sock_ids[BTF_SOCK_TYPE_TCP6].
 */
const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
        .func                        = bpf_skc_to_tcp6_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};

/* Helper body: return @sk unchanged when it is a full socket with protocol
 * IPPROTO_TCP, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Prototype of bpf_skc_to_tcp_sock(): not GPL-only, one sock_common BTF
 * pointer argument, returns a BTF pointer typed by
 * btf_sock_ids[BTF_SOCK_TYPE_TCP] or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
        .func                        = bpf_skc_to_tcp_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};

/*
 * Helper: hand @sk back when it belongs to tcp_prot (CONFIG_INET) or
 * tcpv6_prot (built-in IPv6) and is in the TCP_TIME_WAIT state, otherwise
 * return NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
{
        /* With CONFIG_INET=n no BTF is generated for tcp_timewait_sock and
         * inet_timewait_sock, so request both types explicitly.
         */
        BTF_TYPE_EMIT(struct inet_timewait_sock);
        BTF_TYPE_EMIT(struct tcp_timewait_sock);

        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/*
 * Prototype of bpf_skc_to_tcp_timewait_sock(): not GPL-only, one
 * sock_common BTF pointer argument, returns a BTF pointer typed by
 * btf_sock_ids[BTF_SOCK_TYPE_TCP_TW] or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
        .func                        = bpf_skc_to_tcp_timewait_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};

/*
 * Helper: hand @sk back when it belongs to tcp_prot (CONFIG_INET) or
 * tcpv6_prot (built-in IPv6) and is in the TCP_NEW_SYN_RECV state,
 * otherwise return NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
{
        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/*
 * Prototype of bpf_skc_to_tcp_request_sock(): not GPL-only, one
 * sock_common BTF pointer argument, returns a BTF pointer typed by
 * btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ] or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
        .func                        = bpf_skc_to_tcp_request_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};

/*
 * Helper: hand @sk back when it is a full IPv6 datagram socket whose
 * protocol is UDP, otherwise return NULL.
 */
BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
{
        /* struct udp6_sock does not end up in dwarf (and therefore btf)
         * by itself, so force the type to be emitted here.
         */
        BTF_TYPE_EMIT(struct udp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM ||
            sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Prototype of bpf_skc_to_udp6_sock(): not GPL-only, one sock_common BTF
 * pointer argument, returns a BTF pointer typed by
 * btf_sock_ids[BTF_SOCK_TYPE_UDP6] or NULL.
 */
const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
        .func                        = bpf_skc_to_udp6_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};

/*
 * Resolve @func_id to a helper prototype.
 *
 * The skc_to_*() socket cast helpers are handled here and are only handed
 * out to callers for which perfmon_capable() is true (NULL otherwise).
 * Every other id is delegated to bpf_base_func_proto().
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skc_to_tcp6_sock:
                proto = &bpf_skc_to_tcp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_sock:
                proto = &bpf_skc_to_tcp_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_timewait_sock:
                proto = &bpf_skc_to_tcp_timewait_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_request_sock:
                proto = &bpf_skc_to_tcp_request_sock_proto;
                break;
        case BPF_FUNC_skc_to_udp6_sock:
                proto = &bpf_skc_to_udp6_sock_proto;
                break;
        default:
                return bpf_base_func_proto(func_id);
        }

        return perfmon_capable() ? proto : NULL;
}
























































































































































































































































































































































































































    2 




    2 



    2 
    2 
    2 



    2 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

/*
 * Per-CPU scratch buffers, one per recursion context.  Allocated by
 * perf_trace_event_reg() when the first event registers and freed again
 * once total_ref_count drops back to zero.
 */
static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Type of one trace buffer: PERF_MAX_TRACE_SIZE bytes expressed as an array
 * of unsigned long, which forces the buffer to be aligned to unsigned long
 * and avoids misaligned-access surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;

/*
 * Count the events in use (per event id, not per instance).  The shared
 * perf_trace_buf buffers exist only while this count is non-zero.
 */
static int        total_ref_count;

/*
 * Decide whether the current task may attach @p_event to @tp_event.
 *
 * Returns 0 when access is granted, -EINVAL for unsupported sampling
 * options on the function trace event, or whatever the event's own
 * ->perf_perm() hook / perf_allow_tracepoint() report.
 */
static int perf_trace_event_perm(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int err;

        /* Let the event veto first, if it installed a hook. */
        if (tp_event->perf_perm) {
                err = tp_event->perf_perm(tp_event, p_event);
                if (err)
                        return err;
        }

        /*
         * The parent was checked when it was created, so children are
         * allowed without looking any further.
         */
        if (p_event->parent)
                return 0;

        /*
         * Checking the current process (the owner) is fine from here on:
         * the code below only runs via the perf_event_open syscall.
         */

        /* The ftrace function trace is allowed only for root. */
        if (ftrace_event_is_function(tp_event)) {
                err = perf_allow_tracepoint(&p_event->attr);
                if (err)
                        return err;

                if (!is_sampling_event(p_event))
                        return 0;

                /*
                 * Neither user space callchains nor user stack dumps are
                 * permitted for the function trace event: taking page
                 * faults while tracing the page fault handler is too
                 * tricky to get right.
                 */
                if (!p_event->attr.exclude_callchain_user ||
                    (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER))
                        return -EINVAL;
        }

        /* No tracing, just counting, so no obvious leak */
        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
                return 0;

        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK &&
            (tp_event->flags & TRACE_EVENT_FL_CAP_ANY))
                return 0;

        /*
         * ...otherwise raw tracepoint data can be a severe data leak,
         * only allow root to have these.
         */
        return perf_allow_tracepoint(&p_event->attr);
}

/*
 * Register @tp_event with perf on behalf of @p_event.
 *
 * Only the first user of an event (perf_refcount going from 0 to 1) does
 * real work: it allocates the per-CPU hlist heads, allocates the shared
 * perf_trace_buf buffers when no event at all is registered yet, and calls
 * the class ->reg() callback with TRACE_REG_PERF_REGISTER.  Later users
 * merely take a reference.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or the error
 * reported by ->reg().
 */
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        /* Somebody already registered this event: just count the new user. */
        if (tp_event->perf_refcount++ > 0)
                return 0;

        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;

        for_each_possible_cpu(cpu)
                INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

        tp_event->perf_events = list;

        /* First registered event overall: set up the shared trace buffers. */
        if (!total_ref_count) {
                char __percpu *buf;
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        buf = (char __percpu *)alloc_percpu(perf_trace_t);
                        if (!buf)
                                goto fail;

                        perf_trace_buf[i] = buf;
                }
        }

        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count++;
        return 0;

fail:
        /* Release the shared buffers only when no other event uses them. */
        if (!total_ref_count) {
                int i;

                for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                        free_percpu(perf_trace_buf[i]);
                        perf_trace_buf[i] = NULL;
                }
        }

        /* Undo the reference taken above and drop the per-CPU lists. */
        if (!--tp_event->perf_refcount) {
                free_percpu(tp_event->perf_events);
                tp_event->perf_events = NULL;
        }

        return ret;
}

/*
 * Drop @p_event's reference on its trace event.
 *
 * When the last reference goes away the class is told to unregister from
 * perf, running callbacks are waited for and the per-CPU lists are freed;
 * the shared trace buffers are released too once no event at all remains
 * registered.  The module reference is put in every case.
 */
static void perf_trace_event_unreg(struct perf_event *p_event)
{
        struct trace_event_call *call = p_event->tp_event;

        if (--call->perf_refcount <= 0) {
                int ctx;

                call->class->reg(call, TRACE_REG_PERF_UNREGISTER, NULL);

                /*
                 * Make sure our callback cannot be invoked any more before
                 * the buffers are freed below.
                 */
                tracepoint_synchronize_unregister();

                free_percpu(call->perf_events);
                call->perf_events = NULL;

                if (!--total_ref_count) {
                        for (ctx = 0; ctx < PERF_NR_CONTEXTS; ctx++) {
                                free_percpu(perf_trace_buf[ctx]);
                                perf_trace_buf[ctx] = NULL;
                        }
                }
        }

        module_put(call->mod);
}

/*
 * Notify the event class that @p_event is being opened
 * (TRACE_REG_PERF_OPEN) and return the class callback's result.
 */
static int perf_trace_event_open(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

/*
 * Notify the event class that @p_event is being closed
 * (TRACE_REG_PERF_CLOSE); the callback's return value is ignored.
 */
static void perf_trace_event_close(struct perf_event *p_event)
{
        struct trace_event_call *tp_event = p_event->tp_event;
        tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

/*
 * Attach @p_event to @tp_event: permission check, registration, then the
 * class open callback.  A failing open undoes the registration.
 *
 * Returns 0 on success or the first error encountered.
 */
static int perf_trace_event_init(struct trace_event_call *tp_event,
                                 struct perf_event *p_event)
{
        int err;

        err = perf_trace_event_perm(tp_event, p_event);
        if (!err)
                err = perf_trace_event_reg(tp_event, p_event);
        if (err)
                return err;

        err = perf_trace_event_open(p_event);
        if (err)
                perf_trace_event_unreg(p_event);

        return err;
}

int perf_trace_init(struct perf_event *p_event)
{
        struct trace_event_call *tp_event;
        u64 event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}

/*
 * Tear down what perf_trace_init() set up for @p_event: close, then
 * unregister, both under event_mutex.
 */
void perf_trace_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
/*
 * Create a local kprobe (or kretprobe when @is_retprobe) trace event from
 * @p_event's attributes and attach @p_event to it.
 *
 * attr.kprobe_func, when set, is a user pointer to the symbol name; an
 * empty name is handed on as NULL.  The name, attr.kprobe_addr and
 * attr.probe_offset are passed to create_local_trace_kprobe().
 *
 * Returns 0 on success, -ENOMEM, -E2BIG when the name fills all of
 * KSYM_NAME_LEN, or the error from the user copy, the probe creation or
 * perf_trace_event_init().
 */
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
        struct trace_event_call *call;
        char *symbol = NULL;
        int err;

        if (p_event->attr.kprobe_func) {
                symbol = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
                if (!symbol)
                        return -ENOMEM;

                err = strncpy_from_user(symbol,
                                        u64_to_user_ptr(p_event->attr.kprobe_func),
                                        KSYM_NAME_LEN);
                /* The whole buffer was filled: the name is too long. */
                if (err == KSYM_NAME_LEN)
                        err = -E2BIG;
                if (err < 0)
                        goto free_symbol;

                /* An empty name is treated as "no name given". */
                if (!symbol[0]) {
                        kfree(symbol);
                        symbol = NULL;
                }
        }

        call = create_local_trace_kprobe(symbol,
                                         (void *)(unsigned long)(p_event->attr.kprobe_addr),
                                         p_event->attr.probe_offset,
                                         is_retprobe);
        if (IS_ERR(call)) {
                err = PTR_ERR(call);
                goto free_symbol;
        }

        mutex_lock(&event_mutex);
        err = perf_trace_event_init(call, p_event);
        if (err)
                destroy_local_trace_kprobe(call);
        mutex_unlock(&event_mutex);

free_symbol:
        kfree(symbol);
        return err;
}

/*
 * Undo perf_kprobe_init(): close and unregister @p_event under
 * event_mutex, then destroy the local kprobe trace event it was using.
 */
void perf_kprobe_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);

        /* The local event is destroyed outside of event_mutex. */
        destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/*
 * Create a local uprobe (or uretprobe when @is_retprobe) trace event from
 * @p_event's attributes and attach @p_event to it.
 *
 * attr.uprobe_path is a user pointer to the path of the probed file and is
 * mandatory; the path, attr.probe_offset and @ref_ctr_offset are passed to
 * create_local_trace_uprobe().
 *
 * Returns 0 on success, -EINVAL for a missing or empty path, -E2BIG when
 * strndup_user() rejects the path with -EINVAL, or the error from the user
 * copy, the probe creation or perf_trace_event_init().
 */
int perf_uprobe_init(struct perf_event *p_event,
                     unsigned long ref_ctr_offset, bool is_retprobe)
{
        struct trace_event_call *call;
        char *filename = NULL;
        int err;

        if (!p_event->attr.uprobe_path)
                return -EINVAL;

        filename = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
                                PATH_MAX);
        if (IS_ERR(filename)) {
                err = PTR_ERR(filename);
                if (err == -EINVAL)
                        err = -E2BIG;
                return err;
        }

        if (!filename[0]) {
                err = -EINVAL;
                goto free_filename;
        }

        call = create_local_trace_uprobe(filename, p_event->attr.probe_offset,
                                         ref_ctr_offset, is_retprobe);
        if (IS_ERR(call)) {
                err = PTR_ERR(call);
                goto free_filename;
        }

        /*
         * A local trace_uprobe must hold event_mutex to call
         * uprobe_buffer_enable() and uprobe_buffer_disable().
         * Local trace_kprobes do not need event_mutex for this.
         */
        mutex_lock(&event_mutex);
        err = perf_trace_event_init(call, p_event);
        if (err)
                destroy_local_trace_uprobe(call);
        mutex_unlock(&event_mutex);

free_filename:
        kfree(filename);
        return err;
}

/*
 * Undo perf_uprobe_init(): close and unregister @p_event under
 * event_mutex, then destroy the local uprobe trace event it was using.
 */
void perf_uprobe_destroy(struct perf_event *p_event)
{
        mutex_lock(&event_mutex);
        perf_trace_event_close(p_event);
        perf_trace_event_unreg(p_event);
        mutex_unlock(&event_mutex);
        /* The local event is destroyed outside of event_mutex. */
        destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Add @p_event on the current CPU.
 *
 * Without PERF_EF_START in @flags the event starts out stopped.  Sampling
 * events get their period re-armed.  Unless the class handles
 * TRACE_REG_PERF_ADD itself, the event is linked into this CPU's hlist of
 * the trace event.
 *
 * Returns 0 on success, -EINVAL if the trace event has no per-CPU lists.
 */
int perf_trace_add(struct perf_event *p_event, int flags)
{
        struct trace_event_call *call = p_event->tp_event;
        struct hw_perf_event *hwc = &p_event->hw;
        struct hlist_head __percpu *pcpu_list;

        if (!(flags & PERF_EF_START))
                hwc->state = PERF_HES_STOPPED;

        if (is_sampling_event(p_event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(p_event);
        }

        /*
         * A non-zero return from TRACE_REG_PERF_ADD means the class did
         * something custom and there is nothing left to do here.
         */
        if (call->class->reg(call, TRACE_REG_PERF_ADD, p_event))
                return 0;

        /* Default action: enqueue the event on the right per-cpu hlist. */
        pcpu_list = call->perf_events;
        if (WARN_ON_ONCE(!pcpu_list))
                return -EINVAL;

        hlist_add_head_rcu(&p_event->hlist_entry, this_cpu_ptr(pcpu_list));
        return 0;
}

/*
 * Remove @p_event again.  Unless the class handles TRACE_REG_PERF_DEL
 * itself (non-zero return), the event is unlinked from the per-cpu hlist it
 * was enqueued on.  @flags is unused.
 */
void perf_trace_del(struct perf_event *p_event, int flags)
{
        struct trace_event_call *call = p_event->tp_event;

        /* A custom action was performed: no default dequeueing needed. */
        if (call->class->reg(call, TRACE_REG_PERF_DEL, p_event))
                return;

        hlist_del_rcu(&p_event->hlist_entry);
}

/*
 * Hand out this CPU's trace buffer for the current recursion context.
 *
 * @size:  number of bytes the caller wants to record
 * @regs:  if non-NULL, receives this CPU's __perf_regs slot for the context
 * @rctxp: receives the recursion context (negative when none is available)
 *
 * Returns the buffer, or NULL when @size exceeds PERF_MAX_TRACE_SIZE or no
 * recursion context could be obtained.
 *
 * NOTE(review): @size is assumed to be at least sizeof(u64); that is not
 * checked here - confirm against callers.
 */
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
        char *buf;
        int ctx;

        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                      "perf buffer not large enough, wanted %d, have %d",
                      size, PERF_MAX_TRACE_SIZE))
                return NULL;

        ctx = perf_swevent_get_recursion_context();
        *rctxp = ctx;
        if (ctx < 0)
                return NULL;

        if (regs)
                *regs = this_cpu_ptr(&__perf_regs[ctx]);

        buf = this_cpu_ptr(perf_trace_buf[ctx]);

        /*
         * Clear the last u64 of the record so that alignment padding does
         * not leak stale bytes to user space.
         */
        memset(&buf[size - sizeof(u64)], 0, sizeof(u64));
        return buf;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

/*
 * Fill in the generic trace_entry header at the start of @record for an
 * event of @type, using the current irq flags and preempt count.
 */
void perf_trace_buf_update(void *record, u16 type)
{
        struct trace_entry *entry = record;
        int pc = preempt_count();
        unsigned long flags;

        local_save_flags(flags);
        tracing_generic_entry_update(entry, type, flags, pc);
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
/*
 * ftrace callback installed by perf_ftrace_function_register().
 *
 * Records one TRACE_FN entry (ip and parent_ip) for the perf event that
 * embeds @ops.  Calls are ignored unless ops->private equals the current
 * CPU id, which TRACE_REG_PERF_ADD stores there.  @pt_regs is unused; the
 * caller's registers are fetched locally instead.
 */
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
        struct ftrace_entry *entry;
        struct perf_event *event;
        struct hlist_head head;
        struct pt_regs regs;
        int rctx;

        /* Only fire on the CPU the event is currently added on. */
        if ((unsigned long)ops->private != smp_processor_id())
                return;

        event = container_of(ops, struct perf_event, ftrace_ops);

        /*
         * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
         * the perf code does is hlist_for_each_entry_rcu(), so we can
         * get away with simply setting the @head.first pointer in order
         * to create a singular list.
         */
        head.first = &event->hlist_entry;

/* Record size: entry plus a u32, rounded up to u64, minus that u32 again. */
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
                    sizeof(u64)) - sizeof(u32))

        BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

        memset(&regs, 0, sizeof(regs));
        perf_fetch_caller_regs(&regs);

        entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
                              1, &regs, &head, NULL);

#undef ENTRY_SIZE
}

/*
 * Set up @event's embedded ftrace_ops and register it with ftrace.
 * Returns the result of register_ftrace_function().
 */
static int perf_ftrace_function_register(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;

        ops->flags   = FTRACE_OPS_FL_RCU;
        ops->func    = perf_ftrace_function_call;
        /*
         * The callback compares ->private with smp_processor_id();
         * nr_cpu_ids presumably never matches a valid CPU, i.e. the event
         * stays quiet until TRACE_REG_PERF_ADD stores a real CPU id.
         */
        ops->private = (void *)(unsigned long)nr_cpu_ids;

        return register_ftrace_function(ops);
}

/*
 * Unregister @event's ftrace_ops and free its filter.  The filter is freed
 * regardless of the unregister result, which is what gets returned.
 */
static int perf_ftrace_function_unregister(struct perf_event *event)
{
        struct ftrace_ops *ops = &event->ftrace_ops;
        int ret = unregister_ftrace_function(ops);
        ftrace_free_filter(ops);
        return ret;
}

/*
 * ->reg() callback of the ftrace function event.
 *
 * @call: the trace event (unused here)
 * @type: requested operation
 * @data: the perf event for the TRACE_REG_PERF_{OPEN,CLOSE,ADD,DEL} types
 *
 * PERF_REGISTER/PERF_UNREGISTER need no work and return 0.  PERF_OPEN and
 * PERF_CLOSE (un)register the event's ftrace_ops.  PERF_ADD/PERF_DEL store
 * the current CPU id, respectively nr_cpu_ids, in ftrace_ops.private and
 * return 1 to tell the caller that a custom action was taken.  Every other
 * type yields -EINVAL.
 */
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data)
{
        struct perf_event *event = data;

        switch (type) {
        case TRACE_REG_PERF_REGISTER:
        case TRACE_REG_PERF_UNREGISTER:
                return 0;
        case TRACE_REG_PERF_OPEN:
                return perf_ftrace_function_register(event);
        case TRACE_REG_PERF_CLOSE:
                return perf_ftrace_function_unregister(event);
        case TRACE_REG_PERF_ADD:
                event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
                return 1;
        case TRACE_REG_PERF_DEL:
                event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
                return 1;
        case TRACE_REG_REGISTER:
        case TRACE_REG_UNREGISTER:
                break;
        }

        return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */
































































































































































    1 


    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// SPDX-License-Identifier: GPL-2.0-or-later
/* user_defined.c: user defined key type
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/err.h>
#include <keys/user-type.h>
#include <linux/uaccess.h>
#include "internal.h"

/* Defined at the end of this file; referenced by key_type_logon below. */
static int logon_vet_description(const char *desc);

/*
 * The "user" key type: user defined keys take an arbitrary string as the
 * description and an arbitrary blob of data as the payload.  The payload
 * can be read back through the .read operation.
 */
struct key_type key_type_user = {
        .name                        = "user",
        .preparse                = user_preparse,
        .free_preparse                = user_free_preparse,
        .instantiate                = generic_key_instantiate,
        .update                        = user_update,
        .revoke                        = user_revoke,
        .destroy                = user_destroy,
        .describe                = user_describe,
        .read                        = user_read,
};

EXPORT_SYMBOL_GPL(key_type_user);

/*
 * The "logon" key type is essentially the same as key_type_user, but it
 * does not define a .read op.  This is suitable for storing username and
 * password pairs in the keyring that you do not want to be readable from
 * userspace.  Descriptions are additionally vetted by
 * logon_vet_description().
 */
struct key_type key_type_logon = {
        .name                        = "logon",
        .preparse                = user_preparse,
        .free_preparse                = user_free_preparse,
        .instantiate                = generic_key_instantiate,
        .update                        = user_update,
        .revoke                        = user_revoke,
        .destroy                = user_destroy,
        .describe                = user_describe,
        .vet_description        = logon_vet_description,
};
EXPORT_SYMBOL_GPL(key_type_logon);

/*
 * Preparse a user defined key payload.
 *
 * Copies prep->data (1..32767 bytes) into a freshly allocated
 * user_key_payload, stores it in prep->payload.data[0] and sets
 * prep->quotalen to the data length.
 *
 * Returns 0 on success, -EINVAL for missing, empty or oversized data, or
 * -ENOMEM when the allocation fails.
 */
int user_preparse(struct key_preparsed_payload *prep)
{
        size_t len = prep->datalen;
        struct user_key_payload *payload;

        if (!prep->data || len == 0 || len > 32767)
                return -EINVAL;

        payload = kmalloc(sizeof(*payload) + len, GFP_KERNEL);
        if (!payload)
                return -ENOMEM;

        /* fill in the payload and hand it over to the preparse record */
        payload->datalen = len;
        memcpy(payload->data, prep->data, len);
        prep->payload.data[0] = payload;
        prep->quotalen = len;
        return 0;
}
EXPORT_SYMBOL_GPL(user_preparse);

/*
 * Free a preparse of a user defined key payload.  The payload stored in
 * prep->payload.data[0] is released with kfree_sensitive().
 */
void user_free_preparse(struct key_preparsed_payload *prep)
{
        kfree_sensitive(prep->payload.data[0]);
}
EXPORT_SYMBOL_GPL(user_free_preparse);

/*
 * RCU callback: free the user_key_payload that embeds @head, using
 * kfree_sensitive().
 */
static void user_free_payload_rcu(struct rcu_head *head)
{
        kfree_sensitive(container_of(head, struct user_key_payload, rcu));
}

/*
 * Update a user defined key with the preparsed payload in @prep.
 * - the key's semaphore is write-locked
 *
 * Ownership of prep->payload.data[0] moves to the key; a previous payload
 * of a positively instantiated key is freed after an RCU grace period.
 *
 * Returns the result of key_payload_reserve(); nothing is changed when
 * that is negative.
 */
int user_update(struct key *key, struct key_preparsed_payload *prep)
{
        struct user_key_payload *old = NULL;
        int err;

        /* check the quota for the new data first */
        err = key_payload_reserve(key, prep->datalen);
        if (err < 0)
                return err;

        /* attach the new data, displacing the old */
        key->expiry = prep->expiry;
        if (key_is_positive(key))
                old = dereference_key_locked(key);

        rcu_assign_keypointer(key, prep->payload.data[0]);
        prep->payload.data[0] = NULL;

        if (old)
                call_rcu(&old->rcu, user_free_payload_rcu);

        return err;
}
EXPORT_SYMBOL_GPL(user_update);

/*
 * Dispose of the payload of a revoked user key.
 * - called with the key sem write-locked
 *
 * The quota reservation is dropped to zero; a payload, if present, is
 * detached from the key and freed after an RCU grace period.
 */
void user_revoke(struct key *key)
{
        struct user_key_payload *payload = user_key_payload_locked(key);

        /* clear the quota */
        key_payload_reserve(key, 0);

        if (!payload)
                return;

        rcu_assign_keypointer(key, NULL);
        call_rcu(&payload->rcu, user_free_payload_rcu);
}

EXPORT_SYMBOL(user_revoke);

/*
 * Dispose of the data dangling from the corpse of a user key: the payload
 * in key->payload.data[0] is released with kfree_sensitive().
 */
void user_destroy(struct key *key)
{
        kfree_sensitive(key->payload.data[0]);
}

EXPORT_SYMBOL_GPL(user_destroy);

/*
 * Describe the user key: print its description to @m, followed by
 * ": <datalen>" when the key is positively instantiated.
 */
void user_describe(const struct key *key, struct seq_file *m)
{
        seq_puts(m, key->description);

        if (!key_is_positive(key))
                return;

        seq_printf(m, ": %u", key->datalen);
}

EXPORT_SYMBOL_GPL(user_describe);

/*
 * Read the key data.
 * - the key's semaphore is read-locked
 *
 * Copies at most @buflen bytes of the payload into @buffer when a
 * non-empty buffer is supplied.  Always returns the full payload length,
 * so a caller can detect a buffer that was too small.
 */
long user_read(const struct key *key, char *buffer, size_t buflen)
{
        const struct user_key_payload *payload = user_key_payload_locked(key);
        long len = payload->datalen;

        if (!buffer || buflen == 0)
                return len;

        /* the data can be returned as is, clamped to what is stored */
        if (buflen > payload->datalen)
                buflen = payload->datalen;
        memcpy(buffer, payload->data, buflen);

        return len;
}

EXPORT_SYMBOL_GPL(user_read);

/*
 * Vet the description for a "logon" key.
 *
 * The description must be "qualified": it has to contain a ':' and the
 * first ':' must not be the first character.
 *
 * Returns 0 if the description is acceptable, -EINVAL otherwise.
 */
static int logon_vet_description(const char *desc)
{
        const char *colon = strchr(desc, ':');

        if (!colon || colon == desc)
                return -EINVAL;

        return 0;
}



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/leds.h>
#include "ieee80211_i.h"

/* Delay passed (as both delay arguments) to the rx/tx oneshot blinks below. */
#define MAC80211_BLINK_DELAY 50 /* ms */

/*
 * Blink the RX LED trigger once, provided the RX LED is active.
 * Compiles to nothing without CONFIG_MAC80211_LEDS.
 */
static inline void ieee80211_led_rx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
        unsigned long led_delay = MAC80211_BLINK_DELAY;

        if (atomic_read(&local->rx_led_active))
                led_trigger_blink_oneshot(&local->rx_led, &led_delay,
                                          &led_delay, 0);
#endif
}

/*
 * Blink the TX LED trigger once, provided the TX LED is active.
 * Compiles to nothing without CONFIG_MAC80211_LEDS.
 */
static inline void ieee80211_led_tx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
        unsigned long led_delay = MAC80211_BLINK_DELAY;

        if (atomic_read(&local->tx_led_active))
                led_trigger_blink_oneshot(&local->tx_led, &led_delay,
                                          &led_delay, 0);
#endif
}

/*
 * LED management interface.  With CONFIG_MAC80211_LEDS the functions are
 * only declared here and implemented elsewhere; without it every function
 * is an empty inline stub so that callers need no #ifdefs of their own.
 */
#ifdef CONFIG_MAC80211_LEDS
void ieee80211_led_assoc(struct ieee80211_local *local,
                         bool associated);
void ieee80211_led_radio(struct ieee80211_local *local,
                         bool enabled);
void ieee80211_alloc_led_names(struct ieee80211_local *local);
void ieee80211_free_led_names(struct ieee80211_local *local);
void ieee80211_led_init(struct ieee80211_local *local);
void ieee80211_led_exit(struct ieee80211_local *local);
void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
                                unsigned int types_on, unsigned int types_off);
#else
/* LED support compiled out: no-op stubs with the same signatures. */
static inline void ieee80211_led_assoc(struct ieee80211_local *local,
                                       bool associated)
{
}
static inline void ieee80211_led_radio(struct ieee80211_local *local,
                                       bool enabled)
{
}
static inline void ieee80211_alloc_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_free_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_init(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_exit(struct ieee80211_local *local)
{
}
static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
                                              unsigned int types_on,
                                              unsigned int types_off)
{
}
#endif

/*
 * Account @bytes of transmitted data to the throughput LED trigger.  Only
 * data frames (judged by @fc) are counted, and only while the throughput
 * LED is active.  Compiles to nothing without CONFIG_MAC80211_LEDS.
 */
static inline void
ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
        if (!ieee80211_is_data(fc) || !atomic_read(&local->tpt_led_active))
                return;

        local->tpt_led_trigger->tx_bytes += bytes;
#endif
}

/*
 * Add @bytes to the rx_bytes counter of the tpt LED trigger.  Counted only
 * when ieee80211_is_data(@fc) is true and local->tpt_led_active reads
 * non-zero.  Empty without CONFIG_MAC80211_LEDS.
 */
static inline void
ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
        if (!ieee80211_is_data(fc))
                return;
        if (!atomic_read(&local->tpt_led_active))
                return;

        local->tpt_led_trigger->rx_bytes += bytes;
#endif
}

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014 Felix Fietkau <nbd@nbd.name>
 * Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com>
 */

#ifndef _LINUX_BITFIELD_H
#define _LINUX_BITFIELD_H

#include <linux/build_bug.h>
#include <asm/byteorder.h>

/*
 * Bitfield access macros
 *
 * FIELD_{GET,PREP} macros take as first parameter shifted mask
 * from which they extract the base mask and shift amount.
 * Mask must be a compilation time constant.
 *
 * Example:
 *
 *  #define REG_FIELD_A  GENMASK(6, 0)
 *  #define REG_FIELD_B  BIT(7)
 *  #define REG_FIELD_C  GENMASK(15, 8)
 *  #define REG_FIELD_D  GENMASK(31, 16)
 *
 * Get:
 *  a = FIELD_GET(REG_FIELD_A, reg);
 *  b = FIELD_GET(REG_FIELD_B, reg);
 *
 * Set:
 *  reg = FIELD_PREP(REG_FIELD_A, 1) |
 *          FIELD_PREP(REG_FIELD_B, 0) |
 *          FIELD_PREP(REG_FIELD_C, c) |
 *          FIELD_PREP(REG_FIELD_D, 0x40);
 *
 * Modify:
 *  reg &= ~REG_FIELD_C;
 *  reg |= FIELD_PREP(REG_FIELD_C, c);
 */

/* Shift of a field: zero-based index of the lowest set bit of @x (-1 if @x is 0). */
#define __bf_shf(x) (__builtin_ffsll(x) - 1)

/*
 * _Generic association list mapping both the signed and the unsigned
 * variant of @type to a zero of the unsigned variant.
 */
#define __scalar_type_to_unsigned_cases(type)                                \
                unsigned type:        (unsigned type)0,                        \
                signed type:        (unsigned type)0

/*
 * Type of the unsigned counterpart of expression @x when @x is a char,
 * short, int, long or long long (plain, signed or unsigned).  For any other
 * type the _Generic default selects (x) itself, so the type is unchanged.
 */
#define __unsigned_scalar_typeof(x) typeof(                                \
                _Generic((x),                                                \
                        char:        (unsigned char)0,                        \
                        __scalar_type_to_unsigned_cases(char),                \
                        __scalar_type_to_unsigned_cases(short),                \
                        __scalar_type_to_unsigned_cases(int),                \
                        __scalar_type_to_unsigned_cases(long),                \
                        __scalar_type_to_unsigned_cases(long long),        \
                        default: (x)))

/*
 * Cast @x to the unsigned counterpart of the type of @type.  Note that
 * @type is an expression (it is fed to _Generic), not a type name.
 */
#define __bf_cast_unsigned(type, x)        ((__unsigned_scalar_typeof(type))(x))

/*
 * Compile-time sanity checks shared by the FIELD_*() macros.  @_pfx is a
 * string literal prepended to each diagnostic.  The build fails when:
 *  - @_mask is not a compile-time constant;
 *  - @_mask is zero;
 *  - @_val is a compile-time constant with bits set above the field width
 *    (non-constant values are not checked here);
 *  - @_mask, cast to unsigned, exceeds the all-ones value of the unsigned
 *    type of @_reg, i.e. the mask does not fit in the register type;
 *  - @_mask plus its lowest set bit is rejected by
 *    __BUILD_BUG_ON_NOT_POWER_OF_2() (defined elsewhere) -- presumably this
 *    is what catches masks whose set bits are not contiguous.
 */
#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx)                        \
        ({                                                                \
                BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask),                \
                                 _pfx "mask is not constant");                \
                BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero");        \
                BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ?                \
                                 ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
                                 _pfx "value too large for the field"); \
                BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) >        \
                                 __bf_cast_unsigned(_reg, ~0ull),        \
                                 _pfx "type of reg too small for mask"); \
                __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) +                        \
                                              (1ULL << __bf_shf(_mask))); \
        })

/**
 * FIELD_MAX() - produce the maximum value representable by a field
 * @_mask: shifted mask defining the field's length and position
 *
 * FIELD_MAX() returns the maximum value that can be held in the field
 * specified by @_mask, i.e. @_mask shifted down so that its lowest set bit
 * lands at bit 0.  The result has the type of @_mask.  For example,
 * FIELD_MAX(0xf0) is 0xf.
 */
#define FIELD_MAX(_mask)                                                \
        ({                                                                \
                __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: ");        \
                (typeof(_mask))((_mask) >> __bf_shf(_mask));                \
        })

/**
 * FIELD_FIT() - check if value fits in the field
 * @_mask: shifted mask defining the field's length and position
 * @_val:  value to test against the field
 *
 * Only @_mask is validated at compile time; @_val is not handed to
 * __BF_FIELD_CHECK(), so an oversized constant yields false instead of a
 * build failure.  @_val is converted to the type of @_mask before the
 * test, so it is evaluated as one complete expression first.
 *
 * Return: true if @_val can fit inside @_mask, false if @_val is too big.
 */
#define FIELD_FIT(_mask, _val)                                                \
        ({                                                                \
                __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: ");        \
                !((((typeof(_mask))(_val)) << __bf_shf(_mask)) & ~(_mask)); \
        })

/**
 * FIELD_PREP() - prepare a bitfield element
 * @_mask: shifted mask defining the field's length and position
 * @_val:  value to put in the field
 *
 * FIELD_PREP() masks and shifts up the value.  The result should
 * be combined with other fields of the bitfield using bitwise OR.
 *
 * A compile-time constant @_val that does not fit the field fails the
 * build; for a non-constant @_val any bits that land outside @_mask after
 * the shift are silently masked off.  The result has the type of @_mask.
 */
#define FIELD_PREP(_mask, _val)                                                \
        ({                                                                \
                __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: ");        \
                ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask);        \
        })

/**
 * FIELD_GET() - extract a bitfield element
 * @_mask: shifted mask defining the field's length and position
 * @_reg:  value of entire bitfield
 *
 * FIELD_GET() extracts the field specified by @_mask from the
 * bitfield passed in as @_reg by masking and shifting it down.
 * The result is cast to the type of @_mask.  The build fails if @_mask
 * does not fit in the type of @_reg.
 */
#define FIELD_GET(_mask, _reg)                                                \
        ({                                                                \
                __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: ");        \
                (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));        \
        })

/*
 * Declared with __compiletime_error() and only ever called on paths that
 * indicate misuse: __field_overflow() when a constant value has bits outside
 * its field, __bad_mask() when a field mask is not one contiguous run of
 * bits.  Presumably a call that survives optimisation fails the build with
 * the given message -- confirm against the __compiletime_error definition.
 */
extern void __compiletime_error("value doesn't fit into mask")
__field_overflow(void);
extern void __compiletime_error("bad bitfield mask")
__bad_mask(void);
/*
 * Return the lowest set bit of @field; multiplying by it shifts a value up
 * into the field.  Calls __bad_mask() when filling in every bit below the
 * mask does not give a value of the form 2^n - 1, i.e. when the set bits of
 * @field are not contiguous.
 */
static __always_inline u64 field_multiplier(u64 field)
{
        u64 filled = field | (field - 1);

        if (filled & (filled + 1))
                __bad_mask();
        return field & -field;
}
/* Return @field shifted down so that its lowest set bit sits at bit 0. */
static __always_inline u64 field_mask(u64 field)
{
        u64 lowest_bit = field_multiplier(field);

        return field / lowest_bit;
}
/* Largest value that fits in @field, cast back to the type of @field. */
#define field_max(field)        ((typeof(field))field_mask(field))
/*
 * Generate <type>_encode_bits(), <type>_replace_bits(),
 * <type>p_replace_bits() and <type>_get_bits() for one storage type.
 * @type: name stem; the stored type is __<type>
 * @base: integer type used for plain values and for field masks
 * @to:   conversion applied when going from @base to the stored type
 * @from: conversion applied when going from the stored type to @base
 *
 * _encode_bits() moves @v into @field by multiplying with the field's
 * lowest set bit.  It calls __field_overflow() when @v is a compile-time
 * constant with bits outside the field; a non-constant @v is masked.
 * _replace_bits() returns @old with @field replaced by @val, and
 * p_replace_bits() does the same in place through @p.
 * _get_bits() extracts @field from @v as a @base value.
 */
#define ____MAKE_OP(type,base,to,from)                                        \
static __always_inline __##type type##_encode_bits(base v, base field)        \
{                                                                        \
        if (__builtin_constant_p(v) && (v & ~field_mask(field)))        \
                __field_overflow();                                        \
        return to((v & field_mask(field)) * field_multiplier(field));        \
}                                                                        \
static __always_inline __##type type##_replace_bits(__##type old,        \
                                        base val, base field)                \
{                                                                        \
        return (old & ~to(field)) | type##_encode_bits(val, field);        \
}                                                                        \
static __always_inline void type##p_replace_bits(__##type *p,                \
                                        base val, base field)                \
{                                                                        \
        *p = (*p & ~to(field)) | type##_encode_bits(val, field);        \
}                                                                        \
static __always_inline base type##_get_bits(__##type v, base field)        \
{                                                                        \
        return (from(v) & field)/field_multiplier(field);                \
}
/*
 * Instantiate the helpers for the le<size>, be<size> and u<size> variants of
 * one bit width; the u<size> variant passes empty conversions.  u8 is
 * instantiated directly, then widths 16, 32 and 64 through __MAKE_OP().
 * Both generator macros are undefined afterwards.
 */
#define __MAKE_OP(size)                                                        \
        ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu)        \
        ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu)        \
        ____MAKE_OP(u##size,u##size,,)
____MAKE_OP(u8,u8,,)
__MAKE_OP(16)
__MAKE_OP(32)
__MAKE_OP(64)
#undef __MAKE_OP
#undef ____MAKE_OP

#endif
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RATELIMIT_H
#define _LINUX_RATELIMIT_H

#include <linux/ratelimit_types.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Bring @rs into a fresh state: the whole structure is zeroed, its lock is
 * initialised, and the given @interval and @burst are stored.
 */
static inline void ratelimit_state_init(struct ratelimit_state *rs,
                                        int interval, int burst)
{
        memset(rs, 0, sizeof(struct ratelimit_state));
        raw_spin_lock_init(&rs->lock);

        rs->burst = burst;
        rs->interval = interval;
}

/*
 * Initialise @rs via ratelimit_state_init() using the default settings
 * DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST.
 */
static inline void ratelimit_default_init(struct ratelimit_state *rs)
{
        /* Plain call: a void function must not return an expression. */
        ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
                             DEFAULT_RATELIMIT_BURST);
}

/*
 * Report and clear the count of suppressed messages of @rs.  Nothing
 * happens unless RATELIMIT_MSG_ON_RELEASE is set in rs->flags and
 * rs->missed is non-zero.
 */
static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
        if ((rs->flags & RATELIMIT_MSG_ON_RELEASE) && rs->missed) {
                pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
                        current->comm, rs->missed);
                rs->missed = 0;
        }
}

/* Replace (not OR into) the flags of @rs with @flags. */
static inline void
ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
{
        rs->flags = flags;
}

extern struct ratelimit_state printk_ratelimit_state;

#ifdef CONFIG_PRINTK

/*
 * Rate-limited WARN_ON(): @condition is evaluated once; WARN_ON() sees a
 * true value only if @condition holds and __ratelimit(@state) is non-zero.
 * Evaluates to the truth value of @condition, rate-limited or not.
 */
#define WARN_ON_RATELIMIT(condition, state)        ({                \
        bool __rtn_cond = !!(condition);                        \
        WARN_ON(__rtn_cond && __ratelimit(state));                \
        __rtn_cond;                                                \
})

/*
 * Rate-limited WARN(): each expansion site gets its own static ratelimit
 * state with the default interval and burst.  WARN() is invoked only when
 * @condition holds and __ratelimit() on that state is non-zero.
 * Evaluates to !!(@condition), rate-limited or not.
 */
#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        static DEFINE_RATELIMIT_STATE(_rs,                        \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);        \
        int rtn = !!(condition);                                \
                                                                \
        if (unlikely(rtn && __ratelimit(&_rs)))                        \
                WARN(rtn, format, ##__VA_ARGS__);                \
                                                                \
        rtn;                                                        \
})

#else

/* Without CONFIG_PRINTK: plain WARN_ON(); @state is ignored and never evaluated. */
#define WARN_ON_RATELIMIT(condition, state)                        \
        WARN_ON(condition)

/* Without CONFIG_PRINTK: plain WARN() with no rate limiting; evaluates to its result. */
#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        int rtn = WARN(condition, format, ##__VA_ARGS__);        \
        rtn;                                                        \
})

#endif

#endif /* _LINUX_RATELIMIT_H */






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NVRAM_H
#define _LINUX_NVRAM_H

#include <linux/errno.h>
#include <uapi/linux/nvram.h>

#ifdef CONFIG_PPC
#include <asm/machdep.h>
#endif

/**
 * struct nvram_ops - NVRAM functionality made available to drivers
 * @read: validate checksum (if any) then load a range of bytes from NVRAM
 * @write: store a range of bytes to NVRAM then update checksum (if any)
 * @read_byte: load a single byte from NVRAM
 * @write_byte: store a single byte to NVRAM
 * @get_size: return the fixed number of bytes in the NVRAM
 *
 * Architectures which provide an nvram ops struct need not implement all
 * of these methods. If the NVRAM hardware can be accessed only one byte
 * at a time then it may be sufficient to provide .read_byte and .write_byte.
 * If the NVRAM has a checksum (and it is to be checked) the .read and
 * .write methods can be used to implement that efficiently.
 *
 * Portable drivers may use the wrapper functions defined here.
 * The nvram_read() and nvram_write() functions call the .read and .write
 * methods when available and fall back on the .read_byte and .write_byte
 * methods otherwise.
 */

struct nvram_ops {
        ssize_t         (*get_size)(void);
        unsigned char   (*read_byte)(int);      /* argument: address */
        void            (*write_byte)(unsigned char, int); /* (value, address) */
        ssize_t         (*read)(char *, size_t, loff_t *);      /* (buf, count, ppos) */
        ssize_t         (*write)(char *, size_t, loff_t *);     /* (buf, count, ppos) */
#if defined(CONFIG_X86) || defined(CONFIG_M68K)
        /* Only present on x86 and m68k; not used by the wrappers in this header. */
        long            (*initialize)(void);
        long            (*set_checksum)(void);
#endif
};

extern const struct nvram_ops arch_nvram_ops;

/*
 * Return the NVRAM size reported by the platform hook (ppc_md.nvram_size on
 * PPC, arch_nvram_ops.get_size otherwise), or -ENODEV if no hook is set.
 */
static inline ssize_t nvram_get_size(void)
{
#ifdef CONFIG_PPC
        return ppc_md.nvram_size ? ppc_md.nvram_size() : -ENODEV;
#else
        return arch_nvram_ops.get_size ? arch_nvram_ops.get_size() : -ENODEV;
#endif
}

/*
 * Read one byte at @addr through the platform hook; returns 0xFF when no
 * byte-read hook is installed.
 */
static inline unsigned char nvram_read_byte(int addr)
{
#ifdef CONFIG_PPC
        return ppc_md.nvram_read_val ? ppc_md.nvram_read_val(addr) : 0xFF;
#else
        return arch_nvram_ops.read_byte ? arch_nvram_ops.read_byte(addr) : 0xFF;
#endif
}

/*
 * Write @val at @addr through the platform hook; silently does nothing when
 * no byte-write hook is installed.  Note the differing argument order: the
 * PPC hook takes (addr, val), the arch_nvram_ops hook takes (val, addr).
 */
static inline void nvram_write_byte(unsigned char val, int addr)
{
#ifdef CONFIG_PPC
        if (!ppc_md.nvram_write_val)
                return;
        ppc_md.nvram_write_val(addr, val);
#else
        if (!arch_nvram_ops.write_byte)
                return;
        arch_nvram_ops.write_byte(val, addr);
#endif
}

/*
 * Copy up to @count bytes from NVRAM into @buf one byte at a time, starting
 * at offset *@ppos and stopping at the size reported by nvram_get_size().
 * *@ppos is advanced past the bytes read.
 *
 * Returns the number of bytes copied, or the negative value from
 * nvram_get_size() (in which case *@ppos is left untouched).
 */
static inline ssize_t nvram_read_bytes(char *buf, size_t count, loff_t *ppos)
{
        ssize_t size = nvram_get_size();
        loff_t pos;
        char *out = buf;

        if (size < 0)
                return size;

        pos = *ppos;
        while (count > 0 && pos < size) {
                *out++ = nvram_read_byte(pos++);
                --count;
        }
        *ppos = pos;
        return out - buf;
}

/*
 * Store up to @count bytes from @buf into NVRAM one byte at a time, starting
 * at offset *@ppos and stopping at the size reported by nvram_get_size().
 * *@ppos is advanced past the bytes written.
 *
 * Returns the number of bytes consumed from @buf, or the negative value
 * from nvram_get_size() (in which case *@ppos is left untouched).
 */
static inline ssize_t nvram_write_bytes(char *buf, size_t count, loff_t *ppos)
{
        ssize_t size = nvram_get_size();
        loff_t pos;
        char *in = buf;

        if (size < 0)
                return size;

        pos = *ppos;
        while (count > 0 && pos < size) {
                nvram_write_byte(*in++, pos++);
                --count;
        }
        *ppos = pos;
        return in - buf;
}

/*
 * Read @count bytes at *@ppos into @buf using the platform's bulk read hook
 * when one is installed, otherwise fall back to nvram_read_bytes().
 */
static inline ssize_t nvram_read(char *buf, size_t count, loff_t *ppos)
{
#ifdef CONFIG_PPC
        if (!ppc_md.nvram_read)
                return nvram_read_bytes(buf, count, ppos);
        return ppc_md.nvram_read(buf, count, ppos);
#else
        if (!arch_nvram_ops.read)
                return nvram_read_bytes(buf, count, ppos);
        return arch_nvram_ops.read(buf, count, ppos);
#endif
}

/*
 * Write @count bytes from @buf at *@ppos using the platform's bulk write
 * hook when one is installed, otherwise fall back to nvram_write_bytes().
 */
static inline ssize_t nvram_write(char *buf, size_t count, loff_t *ppos)
{
#ifdef CONFIG_PPC
        if (!ppc_md.nvram_write)
                return nvram_write_bytes(buf, count, ppos);
        return ppc_md.nvram_write(buf, count, ppos);
#else
        if (!arch_nvram_ops.write)
                return nvram_write_bytes(buf, count, ppos);
        return arch_nvram_ops.write(buf, count, ppos);
#endif
}

#endif  /* _LINUX_NVRAM_H */












































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AEAD: Authenticated Encryption with Associated Data
 * 
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_AEAD_H
#define _CRYPTO_AEAD_H

#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/slab.h>

/**
 * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API
 *
 * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD
 * (listed as type "aead" in /proc/crypto)
 *
 * The most prominent examples of this type of encryption are GCM and CCM.
 * However, the kernel supports other types of AEAD ciphers which are defined
 * with the following cipher string:
 *
 *        authenc(keyed message digest, block cipher)
 *
 * For example: authenc(hmac(sha256), cbc(aes))
 *
 * The example code provided for the symmetric key cipher operation
 * applies here as well. Naturally all *skcipher* symbols must be exchanged
 * for the *aead* counterparts discussed in the following. In addition, for the AEAD
 * operation, the aead_request_set_ad function must be used to set the
 * pointer to the associated data memory location before performing the
 * encryption or decryption operation. In case of an encryption, the associated
 * data memory is filled during the encryption operation. For decryption, the
 * associated data memory must contain data that is used to verify the integrity
 * of the decrypted data. Another deviation from the asynchronous block cipher
 * operation is that the caller should explicitly check for -EBADMSG of the
 * crypto_aead_decrypt. That error indicates an authentication error, i.e.
 * a breach in the integrity of the message. In essence, that -EBADMSG error
 * code is the key bonus an AEAD cipher has over "standard" block chaining
 * modes.
 *
 * Memory Structure:
 *
 * The source scatterlist must contain the concatenation of
 * associated data || plaintext or ciphertext.
 *
 * The destination scatterlist has the same layout, except that the plaintext
 * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size
 * during encryption (resp. decryption).
 *
 * In-place encryption/decryption is enabled by using the same scatterlist
 * pointer for both the source and destination.
 *
 * Even in the out-of-place case, space must be reserved in the destination for
 * the associated data, even though it won't be written to.  This makes the
 * in-place and out-of-place cases more consistent.  It is permissible for the
 * "destination" associated data to alias the "source" associated data.
 *
 * As with the other scatterlist crypto APIs, zero-length scatterlist elements
 * are not allowed in the used part of the scatterlist.  Thus, if there is no
 * associated data, the first element must point to the plaintext/ciphertext.
 *
 * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309,
 * rfc4543, and rfc7539esp ciphers.  For these ciphers, the final 'ivsize' bytes
 * of the associated data buffer must contain a second copy of the IV.  This is
 * in addition to the copy passed to aead_request_set_crypt().  These two IV
 * copies must not differ; different implementations of the same algorithm may
 * behave differently in that case.  Note that the algorithm might not actually
 * treat the IV as associated data; nevertheless the length passed to
 * aead_request_set_ad() must include it.
 */

struct crypto_aead;

/**
 * struct aead_request - AEAD request
 * @base: Common attributes for async crypto requests
 * @assoclen: Length in bytes of associated data for authentication
 * @cryptlen: Length of data to be encrypted or decrypted
 * @iv: Initialisation vector
 * @src: Source data (scatterlist)
 * @dst: Destination data (scatterlist)
 * @__ctx: Start of private context data; a flexible array member declared
 *         with CRYPTO_MINALIGN_ATTR
 */
struct aead_request {
        struct crypto_async_request base;

        unsigned int assoclen;
        unsigned int cryptlen;

        u8 *iv;

        struct scatterlist *src;
        struct scatterlist *dst;

        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/**
 * struct aead_alg - AEAD cipher definition
 * @maxauthsize: Set the maximum authentication tag size supported by the
 *                 transformation. A transformation may support smaller tag sizes.
 *                 As the authentication tag is a message digest to ensure the
 *                 integrity of the encrypted data, a consumer typically wants the
 *                 largest authentication tag possible as defined by this
 *                 variable.
 * @setauthsize: Set authentication size for the AEAD transformation. This
 *                 function is used to specify the consumer requested size of the
 *                 authentication tag to be either generated by the transformation
 *                 during encryption or the size of the authentication tag to be
 *                 supplied during the decryption operation. This function is also
 *                 responsible for checking the authentication tag size for
 *                 validity.
 * @setkey: see struct skcipher_alg
 * @encrypt: see struct skcipher_alg
 * @decrypt: see struct skcipher_alg
 * @ivsize: see struct skcipher_alg
 * @chunksize: see struct skcipher_alg
 * @init: Initialize the cryptographic transformation object. This function
 *          is used to initialize the cryptographic transformation object.
 *          This function is called only once at the instantiation time, right
 *          after the transformation context was allocated. In case the
 *          cryptographic hardware has some special requirements which need to
 *          be handled by software, this function shall check for the precise
 *          requirement of the transformation and put any software fallbacks
 *          in place.
 * @exit: Deinitialize the cryptographic transformation object. This is a
 *          counterpart to @init, used to remove various changes set in
 *          @init.
 * @base: Definition of a generic crypto cipher algorithm.
 *
 * All fields except @ivsize are mandatory and must be filled.
 */
struct aead_alg {
        int (*setkey)(struct crypto_aead *tfm, const u8 *key,
                      unsigned int keylen);
        int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize);
        int (*encrypt)(struct aead_request *req);
        int (*decrypt)(struct aead_request *req);
        int (*init)(struct crypto_aead *tfm);
        void (*exit)(struct crypto_aead *tfm);

        unsigned int ivsize;
        unsigned int maxauthsize;
        unsigned int chunksize;

        struct crypto_alg base;
};

/*
 * AEAD cipher handle.
 * authsize: value reported by crypto_aead_authsize()
 * reqsize:  value reported by crypto_aead_reqsize()
 * base:     embedded generic transform; __crypto_aead_cast() recovers the
 *           handle from a pointer to this member
 */
struct crypto_aead {
        unsigned int authsize;
        unsigned int reqsize;

        struct crypto_tfm base;
};

/* Map a generic transform back to the crypto_aead that embeds it as ->base. */
static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead = container_of(tfm, struct crypto_aead, base);

        return aead;
}

/**
 * crypto_alloc_aead() - allocate AEAD cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             AEAD cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an AEAD. The returned struct
 * crypto_aead is the cipher handle that is required for any subsequent
 * API invocation for that AEAD.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask);

/* Return the generic transform embedded in the AEAD handle @tfm. */
static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm)
{
        return &tfm->base;
}

/**
 * crypto_free_aead() - zeroize and free aead handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_aead(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        /* Hand both the handle memory and its generic transform to the core. */
        crypto_destroy_tfm(tfm, base);
}

/*
 * Return the aead_alg behind the handle @tfm: the algorithm pointer stored
 * in the generic transform is the ->base member of that aead_alg.
 */
static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
{
        struct crypto_alg *alg = crypto_aead_tfm(tfm)->__crt_alg;

        return container_of(alg, struct aead_alg, base);
}

/* Return the ivsize field of the algorithm definition @alg. */
static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg)
{
        return alg->ivsize;
}

/**
 * crypto_aead_ivsize() - obtain IV size
 * @tfm: cipher handle
 *
 * The size of the IV for the aead referenced by the cipher handle is
 * returned. This IV size may be zero if the cipher does not need an IV.
 *
 * Return: IV size in bytes
 */
static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm)
{
        struct aead_alg *alg = crypto_aead_alg(tfm);

        /* The IV size is a property of the algorithm, not of the handle. */
        return crypto_aead_alg_ivsize(alg);
}

/**
 * crypto_aead_authsize() - obtain the authentication data size of a handle
 * @tfm: cipher handle
 *
 * The authsize value stored in the AEAD cipher handle is returned.  This is
 * presumably the size configured with crypto_aead_setauthsize(); the maximum
 * supported by the algorithm is reported separately by
 * crypto_aead_maxauthsize().  The authentication data size may be
 * zero if the cipher implements a hard-coded maximum.
 *
 * The authentication data may also be known as "tag value".
 *
 * Return: authentication data size / tag size in bytes
 */
static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm)
{
        return tfm->authsize;
}

/* Return the maxauthsize field of the algorithm definition @alg. */
static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg)
{
        return alg->maxauthsize;
}

/* Return the maxauthsize of the algorithm behind the handle @aead. */
static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead)
{
        struct aead_alg *alg = crypto_aead_alg(aead);

        return crypto_aead_alg_maxauthsize(alg);
}

/**
 * crypto_aead_blocksize() - obtain block size of cipher
 * @tfm: cipher handle
 *
 * The block size for the AEAD referenced with the cipher handle is returned.
 * The caller may use that information to allocate appropriate memory for the
 * data returned by the encryption or decryption operation
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        /* Delegates to the generic transform's block size query. */
        return crypto_tfm_alg_blocksize(base);
}

/* Return crypto_tfm_alg_alignmask() of the transform embedded in @tfm. */
static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/* Return crypto_tfm_get_flags() of the transform embedded in @tfm. */
static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Forward @flags to crypto_tfm_set_flags() on the transform embedded in @tfm. */
static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Forward @flags to crypto_tfm_clear_flags() on the transform embedded in @tfm. */
static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_aead_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the AEAD referenced by the cipher
 * handle.
 *
 * Note, the key length determines the cipher type. Many block ciphers implement
 * different cipher modes depending on the key size, such as AES-128 vs AES-192
 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
 * is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_aead_setkey(struct crypto_aead *tfm,
                       const u8 *key, unsigned int keylen);

/**
 * crypto_aead_setauthsize() - set authentication data size
 * @tfm: cipher handle
 * @authsize: size of the authentication data / tag in bytes
 *
 * Set the authentication data size / tag size. AEAD requires an authentication
 * tag (or MAC) in addition to the associated data.
 *
 * Return: 0 if the authentication size was set successfully; < 0 if an error occurred
 */
int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize);

/* Return the AEAD handle that the request @req is bound to (via req->base.tfm). */
static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
{
        struct crypto_tfm *base = req->base.tfm;

        return __crypto_aead_cast(base);
}

/**
 * crypto_aead_encrypt() - encrypt plaintext
 * @req: reference to the aead_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Encrypt plaintext data using the aead_request handle. That data structure
 * and how it is filled with data is discussed with the aead_request_*
 * functions.
 *
 * IMPORTANT NOTE The encryption operation creates the authentication data /
 *                  tag. That data is concatenated with the created ciphertext.
 *                  The ciphertext memory size is therefore the given number of
 *                  block cipher blocks + the size defined by the
 *                  crypto_aead_setauthsize invocation. The caller must ensure
 *                  that sufficient memory is available for the ciphertext and
 *                  the authentication tag.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_aead_encrypt(struct aead_request *req);

/**
 * crypto_aead_decrypt() - decrypt ciphertext
 * @req: reference to the aead_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Decrypt ciphertext data using the aead_request handle. That data structure
 * and how it is filled with data is discussed with the aead_request_*
 * functions.
 *
 * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the
 *                  authentication data / tag. That authentication data / tag
 *                  must have the size defined by the crypto_aead_setauthsize
 *                  invocation.
 *
 *
 * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD
 *           cipher operation performs the authentication of the data during the
 *           decryption operation. Therefore, the function returns this error if
 *           the authentication of the ciphertext was unsuccessful (i.e. the
 *           integrity of the ciphertext or the associated data was violated);
 *           < 0 if an error occurred.
 */
int crypto_aead_decrypt(struct aead_request *req);

/**
 * DOC: Asynchronous AEAD Request Handle
 *
 * The aead_request data structure contains all pointers to data required for
 * the AEAD cipher operation. This includes the cipher handle (which can be
 * used by multiple aead_request instances), pointer to plaintext and
 * ciphertext, asynchronous callback function, etc. It acts as a handle to the
 * aead_request_* API calls in a similar way as AEAD handle to the
 * crypto_aead_* API calls.
 */

/**
 * crypto_aead_reqsize() - obtain size of the request data structure
 * @tfm: cipher handle
 *
 * Reads the reqsize field of the cipher handle. aead_request_alloc() adds
 * this value to sizeof(struct aead_request) when sizing a request.
 *
 * Return: number of bytes
 */
static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
{
        return tfm->reqsize;
}

/**
 * aead_request_set_tfm() - update cipher handle reference in request
 * @req: request handle to be modified
 * @tfm: cipher handle that shall be added to the request handle
 *
 * Allow the caller to replace the existing AEAD handle in the request
 * data structure with a different one. The request stores the value returned
 * by crypto_aead_tfm() for @tfm in req->base.tfm; any previous reference is
 * simply overwritten.
 */
static inline void aead_request_set_tfm(struct aead_request *req,
                                        struct crypto_aead *tfm)
{
        req->base.tfm = crypto_aead_tfm(tfm);
}

/**
 * aead_request_alloc() - allocate request data structure
 * @tfm: cipher handle to be registered with the request
 * @gfp: memory allocation flag that is handed to kmalloc by the API call.
 *
 * Allocate a request data structure for use with the AEAD encrypt and
 * decrypt API calls. The allocation covers struct aead_request plus the
 * number of bytes reported by crypto_aead_reqsize() for @tfm. When the
 * allocation succeeds, @tfm is registered in the new request.
 *
 * Return: allocated request handle in case of success, or NULL if out of memory
 */
static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
                                                      gfp_t gfp)
{
        size_t alloc_len = sizeof(struct aead_request) +
                           crypto_aead_reqsize(tfm);
        struct aead_request *new_req = kmalloc(alloc_len, gfp);

        /* only a successfully allocated request can be bound to the handle */
        if (likely(new_req))
                aead_request_set_tfm(new_req, tfm);

        return new_req;
}

/**
 * aead_request_free() - zeroize and free request data structure
 * @req: request data structure to be freed
 *
 * The request is released through kfree_sensitive().
 */
static inline void aead_request_free(struct aead_request *req)
{
        kfree_sensitive(req);
}

/**
 * aead_request_set_callback() - set asynchronous callback function
 * @req: request handle
 * @flags: specify zero or an ORing of the flags
 *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
 *           increase the wait queue beyond the initial maximum size;
 *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
 * @compl: callback function pointer to be registered with the request handle
 * @data: pointer to memory that the kernel crypto API does not use itself but
 *          hands to the callback function. Because the callback runs
 *          asynchronously to the code that issued the request, it may need to
 *          reach data structures of that code; this pointer is the way to
 *          reference them. The callback finds it in the "data" field of the
 *          crypto_async_request data structure it receives.
 *
 * Register the callback function that is triggered once the cipher operation
 * completes. The callback, the data pointer and the flags are stored in the
 * base request embedded in the aead_request handle.
 *
 * The callback function must comply with the following template::
 *
 *        void callback_function(struct crypto_async_request *req, int error)
 */
static inline void aead_request_set_callback(struct aead_request *req,
                                             u32 flags,
                                             crypto_completion_t compl,
                                             void *data)
{
        /* three independent stores into the embedded base request */
        req->base.flags = flags;
        req->base.data = data;
        req->base.complete = compl;
}

/**
 * aead_request_set_crypt - set data buffers
 * @req: request handle
 * @src: source scatter / gather list
 * @dst: destination scatter / gather list
 * @cryptlen: number of bytes to process from @src
 * @iv: IV for the cipher operation which must comply with the IV size defined
 *      by crypto_aead_ivsize()
 *
 * Set the source and destination scatter / gather lists which hold the
 * associated data concatenated with the plaintext or ciphertext. See below
 * for the authentication tag.
 *
 * For encryption, the source is treated as the plaintext and the
 * destination is the ciphertext. For a decryption operation, the use is
 * reversed - the source is the ciphertext and the destination is the plaintext.
 *
 * The memory for the cipher operation has the following structure:
 *
 * - AEAD encryption input:  assoc data || plaintext
 * - AEAD encryption output: assoc data || ciphertext || auth tag
 * - AEAD decryption input:  assoc data || ciphertext || auth tag
 * - AEAD decryption output: assoc data || plaintext
 *
 * Although the kernel requires the AAD buffer to be present, it does not
 * fill the AAD buffer in the output case. If the caller wants that buffer
 * filled, the caller must use an in-place cipher operation (i.e. the same
 * memory location for input and output).
 */
static inline void aead_request_set_crypt(struct aead_request *req,
                                          struct scatterlist *src,
                                          struct scatterlist *dst,
                                          unsigned int cryptlen, u8 *iv)
{
        /* independent stores; the request only records the caller's values */
        req->iv = iv;
        req->cryptlen = cryptlen;
        req->dst = dst;
        req->src = src;
}

/**
 * aead_request_set_ad - set associated data information
 * @req: request handle
 * @assoclen: number of bytes in associated data
 *
 * Setting the AD information.  This function only records the length of
 * the associated data in the request (req->assoclen); it does not touch any
 * data buffer.
 */
static inline void aead_request_set_ad(struct aead_request *req,
                                       unsigned int assoclen)
{
        req->assoclen = assoclen;
}

#endif        /* _CRYPTO_AEAD_H */


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 

    1 


























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NetLabel Kernel API
 *
 * This file defines the kernel API for the NetLabel system.  The NetLabel
 * system manages static and dynamic label mappings for network protocols such
 * as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <net/calipso.h>
#include <asm/bug.h>
#include <linux/atomic.h>

#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
#include "netlabel_calipso.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"

/*
 * Configuration Functions
 */

/**
 * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
 * @domain: the domain mapping to remove
 * @family: address family
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes a NetLabel/LSM domain mapping.  A @domain value of NULL causes the
 * default domain mapping to be removed.  When both @addr and @mask are NULL
 * the whole mapping for @domain/@family is removed; when both are given only
 * the matching address selector is removed.  Supplying just one of them
 * yields -EINVAL, and an address with an unsupported @family yields
 * -EPFNOSUPPORT.  Returns zero on success, negative values on failure.
 *
 */
int netlbl_cfg_map_del(const char *domain,
                       u16 family,
                       const void *addr,
                       const void *mask,
                       struct netlbl_audit *audit_info)
{
        /* an address and a mask must be supplied together or not at all */
        if ((addr == NULL) != (mask == NULL))
                return -EINVAL;

        if (addr == NULL)
                return netlbl_domhsh_remove(domain, family, audit_info);

        if (family == AF_INET)
                return netlbl_domhsh_remove_af4(domain, addr, mask,
                                                audit_info);
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                return netlbl_domhsh_remove_af6(domain, addr, mask,
                                                audit_info);
#endif /* IPv6 */

        return -EPFNOSUPPORT;
}

/**
 * netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping
 * @domain: the domain mapping to add
 * @family: address family
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new unlabeled NetLabel/LSM domain mapping.  A @domain value of NULL
 * causes a new default domain mapping to be added.  If both @addr and @mask
 * are NULL the mapping applies to the whole domain; if both are given an
 * address selector for @addr masked by @mask is created instead.  All
 * allocations use GFP_ATOMIC.
 *
 * Returns zero on success, negative values on failure: -ENOMEM if an
 * allocation fails, -EINVAL if only one of @addr and @mask is supplied,
 * -EPFNOSUPPORT if an address is supplied for an unsupported @family, or the
 * error reported by the address list / domain hash insertion.
 *
 */
int netlbl_cfg_unlbl_map_add(const char *domain,
                             u16 family,
                             const void *addr,
                             const void *mask,
                             struct netlbl_audit *audit_info)
{
        /* default error for the allocation failures below */
        int ret_val = -ENOMEM;
        struct netlbl_dom_map *entry;
        struct netlbl_domaddr_map *addrmap = NULL;
        struct netlbl_domaddr4_map *map4 = NULL;
        struct netlbl_domaddr6_map *map6 = NULL;

        entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
        if (entry == NULL)
                return -ENOMEM;
        if (domain != NULL) {
                entry->domain = kstrdup(domain, GFP_ATOMIC);
                if (entry->domain == NULL)
                        goto cfg_unlbl_map_add_failure;
        }
        entry->family = family;

        if (addr == NULL && mask == NULL)
                entry->def.type = NETLBL_NLTYPE_UNLABELED;
        else if (addr != NULL && mask != NULL) {
                addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
                if (addrmap == NULL)
                        goto cfg_unlbl_map_add_failure;
                INIT_LIST_HEAD(&addrmap->list4);
                INIT_LIST_HEAD(&addrmap->list6);

                switch (family) {
                case AF_INET: {
                        const struct in_addr *addr4 = addr;
                        const struct in_addr *mask4 = mask;
                        map4 = kzalloc(sizeof(*map4), GFP_ATOMIC);
                        if (map4 == NULL)
                                goto cfg_unlbl_map_add_failure;
                        map4->def.type = NETLBL_NLTYPE_UNLABELED;
                        /* store the network part of the address only */
                        map4->list.addr = addr4->s_addr & mask4->s_addr;
                        map4->list.mask = mask4->s_addr;
                        map4->list.valid = 1;
                        ret_val = netlbl_af4list_add(&map4->list,
                                                     &addrmap->list4);
                        if (ret_val != 0)
                                goto cfg_unlbl_map_add_failure;
                        break;
                        }
#if IS_ENABLED(CONFIG_IPV6)
                case AF_INET6: {
                        const struct in6_addr *addr6 = addr;
                        const struct in6_addr *mask6 = mask;
                        map6 = kzalloc(sizeof(*map6), GFP_ATOMIC);
                        if (map6 == NULL)
                                goto cfg_unlbl_map_add_failure;
                        map6->def.type = NETLBL_NLTYPE_UNLABELED;
                        /* store the network part of the address only */
                        map6->list.addr = *addr6;
                        map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0];
                        map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1];
                        map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2];
                        map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3];
                        map6->list.mask = *mask6;
                        map6->list.valid = 1;
                        ret_val = netlbl_af6list_add(&map6->list,
                                                     &addrmap->list6);
                        if (ret_val != 0)
                                goto cfg_unlbl_map_add_failure;
                        break;
                        }
#endif /* IPv6 */
                default:
                        /*
                         * Unsupported address family: report it as such
                         * rather than leaking the -ENOMEM default, matching
                         * the other configuration functions in this file.
                         */
                        ret_val = -EPFNOSUPPORT;
                        goto cfg_unlbl_map_add_failure;
                }

                entry->def.addrsel = addrmap;
                entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
        } else {
                /* an address without a mask (or vice versa) is invalid */
                ret_val = -EINVAL;
                goto cfg_unlbl_map_add_failure;
        }

        ret_val = netlbl_domhsh_add(entry, audit_info);
        if (ret_val != 0)
                goto cfg_unlbl_map_add_failure;

        /* success: nothing allocated above is freed by this function */
        return 0;

cfg_unlbl_map_add_failure:
        /* kfree(NULL) is a no-op, so unallocated pieces are safe to pass */
        kfree(entry->domain);
        kfree(entry);
        kfree(addrmap);
        kfree(map4);
        kfree(map6);
        return ret_val;
}


/**
 * netlbl_cfg_unlbl_static_add - Adds a new static label
 * @net: network namespace
 * @dev_name: interface name
 * @addr: IP address in network byte order (struct in[6]_addr)
 * @mask: address mask in network byte order (struct in[6]_addr)
 * @family: address family
 * @secid: LSM secid value for the entry
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new NetLabel static label to be used when protocol provided labels
 * are not present on incoming traffic.  If @dev_name is NULL then the default
 * interface will be used.  The address length handed to netlbl_unlhsh_add()
 * is derived from @family; an unsupported family yields -EPFNOSUPPORT.
 * Returns zero on success, negative values on failure.
 *
 */
int netlbl_cfg_unlbl_static_add(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                u32 secid,
                                struct netlbl_audit *audit_info)
{
        u32 addr_len;

        /* translate the address family into the size of its address type */
        if (family == AF_INET)
                addr_len = sizeof(struct in_addr);
#if IS_ENABLED(CONFIG_IPV6)
        else if (family == AF_INET6)
                addr_len = sizeof(struct in6_addr);
#endif /* IPv6 */
        else
                return -EPFNOSUPPORT;

        return netlbl_unlhsh_add(net, dev_name, addr, mask, addr_len,
                                 secid, audit_info);
}

/**
 * netlbl_cfg_unlbl_static_del - Removes an existing static label
 * @net: network namespace
 * @dev_name: interface name
 * @addr: IP address in network byte order (struct in[6]_addr)
 * @mask: address mask in network byte order (struct in[6]_addr)
 * @family: address family
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an existing NetLabel static label used when protocol provided labels
 * are not present on incoming traffic.  If @dev_name is NULL then the default
 * interface will be used.  The address length handed to
 * netlbl_unlhsh_remove() is derived from @family; an unsupported family
 * yields -EPFNOSUPPORT.  Returns zero on success, negative values on failure.
 *
 */
int netlbl_cfg_unlbl_static_del(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                struct netlbl_audit *audit_info)
{
        u32 addr_len;

        /* translate the address family into the size of its address type */
        if (family == AF_INET)
                addr_len = sizeof(struct in_addr);
#if IS_ENABLED(CONFIG_IPV6)
        else if (family == AF_INET6)
                addr_len = sizeof(struct in6_addr);
#endif /* IPv6 */
        else
                return -EPFNOSUPPORT;

        return netlbl_unlhsh_remove(net, dev_name, addr, mask, addr_len,
                                    audit_info);
}

/**
 * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
 * @doi_def: CIPSO DOI definition
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Add a new CIPSO DOI definition as defined by @doi_def.  This is a thin
 * wrapper which passes both arguments to cipso_v4_doi_add() and returns its
 * result.  Returns zero on success and negative values on failure.
 *
 */
int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
                           struct netlbl_audit *audit_info)
{
        return cipso_v4_doi_add(doi_def, audit_info);
}

/**
 * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
 * @doi: CIPSO DOI
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Remove an existing CIPSO DOI definition matching @doi by calling
 * cipso_v4_doi_remove().  This function returns nothing; the result of the
 * removal is not reported to the caller.
 *
 */
void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
{
        cipso_v4_doi_remove(doi, audit_info);
}

/**
 * netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping
 * @doi: the CIPSO DOI
 * @domain: the domain mapping to add
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel
 * subsystem.  A @domain value of NULL adds a new default domain mapping.  If
 * both @addr and @mask are NULL the mapping covers the whole domain; if both
 * are given an IPv4 address selector is created instead.  Returns zero on
 * success, negative values on failure: -ENOENT if @doi is unknown, -ENOMEM on
 * allocation failure, -EINVAL if only one of @addr and @mask is supplied, or
 * the error from the address list / domain hash insertion.  On every failure
 * path the DOI definition obtained here is released again.
 *
 */
int netlbl_cfg_cipsov4_map_add(u32 doi,
                               const char *domain,
                               const struct in_addr *addr,
                               const struct in_addr *mask,
                               struct netlbl_audit *audit_info)
{
        int rc = -ENOMEM;
        struct cipso_v4_doi *def;
        struct netlbl_dom_map *map;
        struct netlbl_domaddr_map *sel = NULL;
        struct netlbl_domaddr4_map *sel4 = NULL;

        def = cipso_v4_doi_getdef(doi);
        if (!def)
                return -ENOENT;

        map = kzalloc(sizeof(*map), GFP_ATOMIC);
        if (!map)
                goto put_def;
        map->family = AF_INET;
        if (domain) {
                map->domain = kstrdup(domain, GFP_ATOMIC);
                if (!map->domain)
                        goto free_map;
        }

        /* an address and a mask must be supplied together or not at all */
        if ((addr == NULL) != (mask == NULL)) {
                rc = -EINVAL;
                goto free_domain;
        }

        if (!addr) {
                /* domain-wide mapping straight to the DOI definition */
                map->def.cipso = def;
                map->def.type = NETLBL_NLTYPE_CIPSOV4;
        } else {
                /* per-address mapping: build a one-entry selector list */
                sel = kzalloc(sizeof(*sel), GFP_ATOMIC);
                if (!sel)
                        goto free_domain;
                INIT_LIST_HEAD(&sel->list4);
                INIT_LIST_HEAD(&sel->list6);

                sel4 = kzalloc(sizeof(*sel4), GFP_ATOMIC);
                if (!sel4)
                        goto free_sel;
                sel4->def.cipso = def;
                sel4->def.type = NETLBL_NLTYPE_CIPSOV4;
                sel4->list.mask = mask->s_addr;
                sel4->list.addr = addr->s_addr & mask->s_addr;
                sel4->list.valid = 1;
                rc = netlbl_af4list_add(&sel4->list, &sel->list4);
                if (rc != 0)
                        goto free_sel4;

                map->def.addrsel = sel;
                map->def.type = NETLBL_NLTYPE_ADDRSELECT;
        }

        rc = netlbl_domhsh_add(map, audit_info);
        if (rc != 0)
                goto free_sel4;

        return 0;

        /* unwind in reverse order of acquisition */
free_sel4:
        kfree(sel4);
free_sel:
        kfree(sel);
free_domain:
        kfree(map->domain);
free_map:
        kfree(map);
put_def:
        cipso_v4_doi_putdef(def);
        return rc;
}

/**
 * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition
 * @doi_def: CALIPSO DOI definition
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Add a new CALIPSO DOI definition as defined by @doi_def.  With IPv6 enabled
 * this returns the result of calipso_doi_add(); without IPv6 support it
 * always returns -ENOSYS.  Returns zero on success and negative values on
 * failure.
 *
 */
int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                           struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
        return calipso_doi_add(doi_def, audit_info);
#else /* IPv6 */
        return -ENOSYS;
#endif /* IPv6 */
}

/**
 * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition
 * @doi: CALIPSO DOI
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Remove an existing CALIPSO DOI definition matching @doi by calling
 * calipso_doi_remove().  Without IPv6 support this function does nothing.
 * This function returns nothing; the result of the removal is not reported
 * to the caller.
 *
 */
void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
        calipso_doi_remove(doi, audit_info);
#endif /* IPv6 */
}

/**
 * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping
 * @doi: the CALIPSO DOI
 * @domain: the domain mapping to add
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the
 * NetLabel subsystem.  A @domain value of NULL adds a new default domain
 * mapping.  If both @addr and @mask are NULL the mapping covers the whole
 * domain; if both are given an IPv6 address selector is created instead.
 * Returns zero on success, negative values on failure: -ENOENT if @doi is
 * unknown, -ENOMEM on allocation failure, -EINVAL if only one of @addr and
 * @mask is supplied, or the error from the address list / domain hash
 * insertion.  On every failure path the DOI definition obtained here is
 * released again.  Without IPv6 support the function always returns -ENOSYS.
 *
 */
int netlbl_cfg_calipso_map_add(u32 doi,
                               const char *domain,
                               const struct in6_addr *addr,
                               const struct in6_addr *mask,
                               struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
        int rc = -ENOMEM;
        unsigned int word;
        struct calipso_doi *def;
        struct netlbl_dom_map *map;
        struct netlbl_domaddr_map *sel = NULL;
        struct netlbl_domaddr6_map *sel6 = NULL;

        def = calipso_doi_getdef(doi);
        if (!def)
                return -ENOENT;

        map = kzalloc(sizeof(*map), GFP_ATOMIC);
        if (!map)
                goto put_def;
        map->family = AF_INET6;
        if (domain) {
                map->domain = kstrdup(domain, GFP_ATOMIC);
                if (!map->domain)
                        goto free_map;
        }

        /* an address and a mask must be supplied together or not at all */
        if ((addr == NULL) != (mask == NULL)) {
                rc = -EINVAL;
                goto free_domain;
        }

        if (!addr) {
                /* domain-wide mapping straight to the DOI definition */
                map->def.calipso = def;
                map->def.type = NETLBL_NLTYPE_CALIPSO;
        } else {
                /* per-address mapping: build a one-entry selector list */
                sel = kzalloc(sizeof(*sel), GFP_ATOMIC);
                if (!sel)
                        goto free_domain;
                INIT_LIST_HEAD(&sel->list4);
                INIT_LIST_HEAD(&sel->list6);

                sel6 = kzalloc(sizeof(*sel6), GFP_ATOMIC);
                if (!sel6)
                        goto free_sel;
                sel6->def.calipso = def;
                sel6->def.type = NETLBL_NLTYPE_CALIPSO;
                sel6->list.addr = *addr;
                /* mask each of the four 32-bit words of the address */
                for (word = 0; word < 4; word++)
                        sel6->list.addr.s6_addr32[word] &=
                                mask->s6_addr32[word];
                sel6->list.mask = *mask;
                sel6->list.valid = 1;
                rc = netlbl_af6list_add(&sel6->list, &sel->list6);
                if (rc != 0)
                        goto free_sel6;

                map->def.addrsel = sel;
                map->def.type = NETLBL_NLTYPE_ADDRSELECT;
        }

        rc = netlbl_domhsh_add(map, audit_info);
        if (rc != 0)
                goto free_sel6;

        return 0;

        /* unwind in reverse order of acquisition */
free_sel6:
        kfree(sel6);
free_sel:
        kfree(sel);
free_domain:
        kfree(map->domain);
free_map:
        kfree(map);
put_def:
        calipso_doi_putdef(def);
        return rc;
#else /* IPv6 */
        return -ENOSYS;
#endif /* IPv6 */
}

/*
 * Security Attribute Functions
 */

/* cm_flags values understood by _netlbl_catmap_getnode() */
/* no special handling: a lookup miss simply yields NULL */
#define _CM_F_NONE        0x00000000
/* on a miss, allocate a node covering the offset and link it into the catmap */
#define _CM_F_ALLOC        0x00000001
/* on a miss, hand back the next node after the offset (NULL if none) */
#define _CM_F_WALK        0x00000002

/**
 * _netlbl_catmap_getnode - Look up (and optionally create) a catmap node
 * @catmap: pointer to the head pointer of the category bitmap list
 * @offset: the bit offset whose node is wanted
 * @cm_flags: catmap flags, see _CM_F_*
 * @gfp_flags: memory allocation flags
 *
 * Description:
 * Scan the node list for the node whose bit range covers @offset.  When no
 * such node exists the outcome depends on @cm_flags: with _CM_F_WALK the next
 * node past @offset is returned (which is NULL when the list ends first); with
 * _CM_F_ALLOC a new node is allocated, given a start bit aligned down to
 * NETLBL_CATMAP_SIZE, and linked into the list at the position the scan
 * stopped.  An empty catmap never takes the _CM_F_WALK path.  Returns the
 * node on success, NULL on failure.
 *
 */
static struct netlbl_lsm_catmap *_netlbl_catmap_getnode(
                                             struct netlbl_lsm_catmap **catmap,
                                             u32 offset,
                                             unsigned int cm_flags,
                                             gfp_t gfp_flags)
{
        struct netlbl_lsm_catmap *node = *catmap;
        struct netlbl_lsm_catmap *before = NULL;
        struct netlbl_lsm_catmap *fresh;

        if (node != NULL) {
                if (offset >= node->startbit) {
                        /* skip every node that ends at or before @offset */
                        while (node != NULL &&
                               offset >= (node->startbit + NETLBL_CATMAP_SIZE)) {
                                before = node;
                                node = node->next;
                        }
                        /* exact hit: the node's range contains @offset */
                        if (node != NULL && offset >= node->startbit)
                                return node;
                }
                /* miss: a walker gets whatever follows (possibly NULL) */
                if (cm_flags & _CM_F_WALK)
                        return node;
        }

        if ((cm_flags & _CM_F_ALLOC) == 0)
                return NULL;

        fresh = netlbl_catmap_alloc(gfp_flags);
        if (fresh == NULL)
                return NULL;
        fresh->startbit = offset & ~(NETLBL_CATMAP_SIZE - 1);

        /* link after the last node skipped, or at the head if none was */
        if (before != NULL) {
                fresh->next = before->next;
                before->next = fresh;
        } else {
                fresh->next = *catmap;
                *catmap = fresh;
        }

        return fresh;
}

/**
 * netlbl_catmap_walk - Find the next set bit in a LSM secattr catmap
 * @catmap: the category bitmap
 * @offset: the offset to start searching at, in bits
 *
 * Description:
 * Search the category bitmap for the first set bit located at or after
 * @offset.  Returns the position of that bit, or -ENOENT when no set bit
 * remains.
 *
 */
int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
{
        struct netlbl_lsm_catmap *node;
        NETLBL_CATMAP_MAPTYPE word;
        u32 idx = 0;
        u32 bit = 0;

        node = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
        if (node == NULL)
                return -ENOENT;

        /* translate @offset into a word index and bit within the node */
        if (offset > node->startbit) {
                u32 rel = offset - node->startbit;

                idx = rel / NETLBL_CATMAP_MAPSIZE;
                bit = rel % NETLBL_CATMAP_MAPSIZE;
        }
        word = node->bitmap[idx] >> bit;

        /* advance word by word, then node by node, until something is set */
        while (word == 0) {
                idx++;
                if (idx >= NETLBL_CATMAP_MAPCNT) {
                        if (node->next == NULL)
                                return -ENOENT;
                        node = node->next;
                        idx = 0;
                }
                word = node->bitmap[idx];
                bit = 0;
        }

        /* locate the lowest set bit in the word that was found */
        while ((word & NETLBL_CATMAP_BIT) == 0) {
                word >>= 1;
                bit++;
        }

        return node->startbit + (NETLBL_CATMAP_MAPSIZE * idx) + bit;
}
EXPORT_SYMBOL(netlbl_catmap_walk);

/**
 * netlbl_catmap_walkrng - Find the end of a string of set bits
 * @catmap: the category bitmap
 * @offset: the offset to start searching at, in bits
 *
 * Description:
 * This function walks a LSM secattr category bitmap starting at @offset and
 * follows the set bits until a cleared bit is found.  The value returned is
 * one less than the position of that cleared bit, i.e. the last bit of the
 * run; if the set bits continue to the end of the last node, that node's
 * final bit position is returned.  Returns -ENOENT if there is no node at or
 * after @offset.
 *
 */
int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset)
{
        struct netlbl_lsm_catmap *iter;
        struct netlbl_lsm_catmap *prev = NULL;
        u32 idx;
        u32 bit;
        NETLBL_CATMAP_MAPTYPE bitmask;
        NETLBL_CATMAP_MAPTYPE bitmap;

        /* find the node holding @offset, or failing that the next one */
        iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
        if (iter == NULL)
                return -ENOENT;
        /* convert @offset into a word index and a bit within that word */
        if (offset > iter->startbit) {
                offset -= iter->startbit;
                idx = offset / NETLBL_CATMAP_MAPSIZE;
                bit = offset % NETLBL_CATMAP_MAPSIZE;
        } else {
                idx = 0;
                bit = 0;
        }
        /* single-bit mask positioned on the starting bit */
        bitmask = NETLBL_CATMAP_BIT << bit;

        for (;;) {
                bitmap = iter->bitmap[idx];
                /* step over set bits; bitmask reaches zero once it has been
                 * shifted past the top of the word */
                while (bitmask != 0 && (bitmap & bitmask) != 0) {
                        bitmask <<= 1;
                        bit++;
                }

                /* first bit of a newly entered node is clear: the run ended
                 * on the last bit of the previous node */
                if (prev && idx == 0 && bit == 0)
                        return prev->startbit + NETLBL_CATMAP_SIZE - 1;
                /* stopped on a cleared bit inside the current word */
                else if (bitmask != 0)
                        return iter->startbit +
                                (NETLBL_CATMAP_MAPSIZE * idx) + bit - 1;
                /* whole word was set: move to the next word, and to the next
                 * node once this one is exhausted */
                else if (++idx >= NETLBL_CATMAP_MAPCNT) {
                        if (iter->next == NULL)
                                return iter->startbit + NETLBL_CATMAP_SIZE - 1;
                        prev = iter;
                        iter = iter->next;
                        idx = 0;
                }
                /* restart scanning at the lowest bit of the next word */
                bitmask = NETLBL_CATMAP_BIT;
                bit = 0;
        }

        /* not reached: the loop above only exits via return */
        return -ENOENT;
}

/**
 * netlbl_catmap_getlong - Export an unsigned long bitmap
 * @catmap: pointer to the category bitmap
 * @offset: pointer to the requested offset
 * @bitmap: the exported bitmap
 *
 * Description:
 * Copy out one unsigned long worth of category bits located at or after
 * @offset and store it in @bitmap.  The @offset must be aligned to an unsigned
 * long; it is rewritten on return when the exported data starts somewhere
 * other than where requested, and is set to (u32)-1 when the catmap holds
 * nothing at or beyond the requested offset (in which case @bitmap is left
 * untouched).  Returns zero on success, negative values on failure.
 *
 */
int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
                          u32 *offset,
                          unsigned long *bitmap)
{
        struct netlbl_lsm_catmap *node;
        u32 pos = *offset;
        u32 rel;

        /* unaligned requests are rejected outright */
        if (pos & (BITS_PER_LONG - 1))
                return -EINVAL;

        /* no catmap at all is treated the same as an empty one */
        if (catmap == NULL)
                goto nothing_left;

        /* a request below the first node is moved up to that node's start */
        if (pos < catmap->startbit) {
                pos = catmap->startbit;
                *offset = pos;
        }

        node = _netlbl_catmap_getnode(&catmap, pos, _CM_F_WALK, 0);
        if (node == NULL)
                goto nothing_left;

        /* landing on a later node means the export begins at its first bit */
        if (pos >= node->startbit) {
                rel = pos - node->startbit;
        } else {
                *offset = node->startbit;
                rel = 0;
        }
        *bitmap = node->bitmap[rel / NETLBL_CATMAP_MAPSIZE] >>
                  (rel % NETLBL_CATMAP_MAPSIZE);

        return 0;

nothing_left:
        *offset = (u32)-1;
        return 0;
}

/**
 * netlbl_catmap_setbit - Set a bit in a LSM secattr catmap
 * @catmap: pointer to the category bitmap
 * @bit: the bit to set
 * @flags: memory allocation flags
 *
 * Description:
 * Turn on bit number @bit in @catmap, allocating the node that covers it when
 * one does not exist yet.  Returns zero on success, -ENOMEM if the node could
 * not be obtained.
 *
 */
int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
                         u32 bit,
                         gfp_t flags)
{
        struct netlbl_lsm_catmap *node;
        u32 rel;

        node = _netlbl_catmap_getnode(catmap, bit, _CM_F_ALLOC, flags);
        if (!node)
                return -ENOMEM;

        /* position of the bit relative to the start of its node */
        rel = bit - node->startbit;
        node->bitmap[rel / NETLBL_CATMAP_MAPSIZE] |=
                NETLBL_CATMAP_BIT << (rel % NETLBL_CATMAP_MAPSIZE);

        return 0;
}
EXPORT_SYMBOL(netlbl_catmap_setbit);

/**
 * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
 * @catmap: pointer to the category bitmap
 * @start: the starting bit
 * @end: the last bit in the string
 * @flags: memory allocation flags
 *
 * Description:
 * Set a range of bits, starting at @start and ending with @end (inclusive).
 * Nothing is set when @start is greater than @end.  Whole unsigned long words
 * are written with netlbl_catmap_setlong() when the current position is word
 * aligned and more than BITS_PER_LONG bits remain; all other bits are set one
 * at a time.  Returns zero on success, negative values on failure.
 *
 */
int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
                         u32 start,
                         u32 end,
                         gfp_t flags)
{
        int rc = 0;
        u32 spot = start;

        while (rc == 0 && spot <= end) {
                if (((spot & (BITS_PER_LONG - 1)) == 0) &&
                    ((end - spot) > BITS_PER_LONG)) {
                        /* more than a full word is left before @end, so
                         * advancing by BITS_PER_LONG cannot wrap around */
                        rc = netlbl_catmap_setlong(catmap,
                                                   spot,
                                                   (unsigned long)-1,
                                                   flags);
                        spot += BITS_PER_LONG;
                } else {
                        rc = netlbl_catmap_setbit(catmap, spot, flags);
                        /* stop once @end itself has been handled instead of
                         * stepping past it: when @end is the largest u32
                         * value the increment would wrap to zero and the
                         * loop condition could never become false */
                        if (spot == end)
                                break;
                        spot++;
                }
        }

        return rc;
}

/**
 * netlbl_catmap_setlong - Import an unsigned long bitmap
 * @catmap: pointer to the category bitmap
 * @offset: offset to the start of the imported bitmap
 * @bitmap: the bitmap to import
 * @flags: memory allocation flags
 *
 * Description:
 * OR the bits in @bitmap into @catmap beginning at bit @offset, allocating
 * the covering node if required.  The offset must be aligned to an unsigned
 * long.  Returns zero on success, -EINVAL for an unaligned offset and -ENOMEM
 * if the node could not be obtained.
 *
 */
int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
                          u32 offset,
                          unsigned long bitmap,
                          gfp_t flags)
{
        struct netlbl_lsm_catmap *node;
        /* widen to the map word type before shifting so no bits are lost */
        NETLBL_CATMAP_MAPTYPE word = (NETLBL_CATMAP_MAPTYPE)bitmap;
        u32 rel;

        /* unaligned offsets are not supported */
        if (offset & (BITS_PER_LONG - 1))
                return -EINVAL;

        node = _netlbl_catmap_getnode(catmap, offset, _CM_F_ALLOC, flags);
        if (!node)
                return -ENOMEM;

        rel = offset - node->startbit;
        node->bitmap[rel / NETLBL_CATMAP_MAPSIZE] |=
                word << (rel % NETLBL_CATMAP_MAPSIZE);

        return 0;
}

/* Bitmap functions
 */

/**
 * netlbl_bitmap_walk - Walk a bitmap looking for a bit
 * @bitmap: the bitmap
 * @bitmap_len: length in bits
 * @offset: starting offset
 * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
 *
 * Description:
 * Scan @bitmap from @offset towards @bitmap_len, treating the most
 * significant bit of each byte as the lowest numbered bit of that byte, and
 * stop at the first bit matching @state.  Returns the offset of that bit, or
 * -1 when @offset lies outside the bitmap or no matching bit exists.
 */
int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
                       u32 offset, u8 state)
{
        const int want_set = (state != 0);
        u32 pos = offset;
        u32 byte_idx;
        unsigned char cur;
        unsigned char mask;

        if (offset >= bitmap_len)
                return -1;

        byte_idx = offset / 8;
        cur = bitmap[byte_idx];
        mask = 0x80 >> (offset % 8);

        for (;;) {
                const int is_set = ((cur & mask) != 0);

                if (is_set == want_set)
                        return pos;

                /* check the bound before touching the next byte */
                pos++;
                if (pos >= bitmap_len)
                        break;

                mask >>= 1;
                if (mask == 0) {
                        /* current byte exhausted, load the following one */
                        byte_idx++;
                        cur = bitmap[byte_idx];
                        mask = 0x80;
                }
        }

        return -1;
}
EXPORT_SYMBOL(netlbl_bitmap_walk);

/**
 * netlbl_bitmap_setbit - Sets a single bit in a bitmap
 * @bitmap: the bitmap
 * @bit: the bit
 * @state: if non-zero, set the bit (1) else clear the bit (0)
 *
 * Description:
 * Set or clear a single bit in @bitmap.  Bits are numbered so that the most
 * significant bit of each byte is the lowest numbered bit of that byte.  No
 * bounds checking is performed and nothing is returned.
 */
void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state)
{
        /* integer division truncates, selecting the byte holding @bit */
        unsigned char *target = &bitmap[bit / 8];
        const u8 mask = 0x80 >> (bit % 8);

        if (state == 0) {
                *target &= ~mask;
                return;
        }
        *target |= mask;
}
EXPORT_SYMBOL(netlbl_bitmap_setbit);

/*
 * LSM Functions
 */

/**
 * netlbl_enabled - Determine if the NetLabel subsystem is enabled
 *
 * Description:
 * The LSM can use this function to determine if it should use NetLabel
 * security attributes in its enforcement mechanism.  Currently, NetLabel is
 * considered to be enabled when its configuration contains a valid setup for
 * at least one labeled protocol (i.e. NetLabel can understand incoming
 * labeled packets of at least one type); otherwise NetLabel is considered to
 * be disabled.  Returns non-zero when the netlabel_mgmt_protocount counter is
 * greater than zero, zero otherwise.
 *
 */
int netlbl_enabled(void)
{
        /* At some point we probably want to expose this mechanism to the user
         * as well so that admins can toggle NetLabel regardless of the
         * configuration */
        return (atomic_read(&netlabel_mgmt_protocount) > 0);
}

/**
 * netlbl_sock_setattr - Label a socket using the correct protocol
 * @sk: the socket to label
 * @family: protocol family
 * @secattr: the security attributes
 *
 * Description:
 * Look up the domain mapping for @secattr->domain and @family and apply the
 * matching labeling protocol to @sk.  The caller must have exclusive access
 * to @sk, meaning the socket is either still being created or is locked.
 * Returns zero on success, -EDESTADDRREQ if the domain is configured to use
 * network address selectors (so the socket cannot be labeled without a
 * destination), -ENOENT if no usable mapping exists, -EPROTONOSUPPORT for an
 * unsupported @family, and negative values on all other failures.
 *
 */
int netlbl_sock_setattr(struct sock *sk,
                        u16 family,
                        const struct netlbl_lsm_secattr *secattr)
{
        struct netlbl_dom_map *map;
        /* assume "no usable mapping" until a known mapping type is matched */
        int rc = -ENOENT;

        rcu_read_lock();
        map = netlbl_domhsh_getentry(secattr->domain, family);
        if (!map)
                goto sock_setattr_unlock;

        switch (family) {
        case AF_INET:
                switch (map->def.type) {
                case NETLBL_NLTYPE_CIPSOV4:
                        rc = cipso_v4_sock_setattr(sk,
                                                   map->def.cipso,
                                                   secattr);
                        break;
                case NETLBL_NLTYPE_UNLABELED:
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_ADDRSELECT:
                        rc = -EDESTADDRREQ;
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                switch (map->def.type) {
                case NETLBL_NLTYPE_CALIPSO:
                        rc = calipso_sock_setattr(sk,
                                                  map->def.calipso,
                                                  secattr);
                        break;
                case NETLBL_NLTYPE_UNLABELED:
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_ADDRSELECT:
                        rc = -EDESTADDRREQ;
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#endif /* IPv6 */
        default:
                rc = -EPROTONOSUPPORT;
                break;
        }

sock_setattr_unlock:
        rcu_read_unlock();
        return rc;
}

/**
 * netlbl_sock_delattr - Delete all the NetLabel labels on a socket
 * @sk: the socket
 *
 * Description:
 * Strip the NetLabel labeling from @sk using the protocol that matches the
 * socket's family; sockets of any other family are left alone.  The caller is
 * responsible for ensuring that @sk is locked.
 *
 */
void netlbl_sock_delattr(struct sock *sk)
{
        const unsigned int family = sk->sk_family;

        if (family == AF_INET) {
                cipso_v4_sock_delattr(sk);
                return;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                calipso_sock_delattr(sk);
#endif /* IPv6 */
}

/**
 * netlbl_sock_getattr - Determine the security attributes of a sock
 * @sk: the sock
 * @secattr: the security attributes
 *
 * Description:
 * Hand @sk to the labeling protocol that matches its family so that any
 * NetLabel style label on the sock is parsed into @secattr.  Returns the
 * protocol's result (zero on success, negative values on failure), or
 * -EPROTONOSUPPORT when the sock's family is not handled.
 *
 */
int netlbl_sock_getattr(struct sock *sk,
                        struct netlbl_lsm_secattr *secattr)
{
        switch (sk->sk_family) {
        case AF_INET:
                return cipso_v4_sock_getattr(sk, secattr);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return calipso_sock_getattr(sk, secattr);
#endif /* IPv6 */
        default:
                return -EPROTONOSUPPORT;
        }
}

/**
 * netlbl_conn_setattr - Label a connected socket using the correct protocol
 * @sk: the socket to label
 * @addr: the destination address
 * @secattr: the security attributes
 *
 * Description:
 * Select the domain mapping that applies to @secattr->domain and the
 * destination in @addr, then label @sk accordingly; an unlabeled mapping
 * removes any existing NetLabel label from the socket.  The caller is
 * responsible for ensuring that @sk is locked.  Returns zero on success,
 * -ENOENT if no usable mapping exists, -EAFNOSUPPORT for an IPv6 destination
 * on a non-IPv6 socket, -EPROTONOSUPPORT for an unsupported address family,
 * and negative values on other failures.
 *
 */
int netlbl_conn_setattr(struct sock *sk,
                        struct sockaddr *addr,
                        const struct netlbl_lsm_secattr *secattr)
{
        struct netlbl_dommap_def *def;
        struct sockaddr_in *sin;
#if IS_ENABLED(CONFIG_IPV6)
        struct sockaddr_in6 *sin6;
#endif
        /* assume "no usable mapping" until a known mapping type is matched */
        int rc = -ENOENT;

        rcu_read_lock();
        switch (addr->sa_family) {
        case AF_INET:
                sin = (struct sockaddr_in *)addr;
                def = netlbl_domhsh_getentry_af4(secattr->domain,
                                                 sin->sin_addr.s_addr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        /* only the protocols we support are removed for now,
                         * others could be added here if needed */
                        netlbl_sock_delattr(sk);
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_CIPSOV4:
                        rc = cipso_v4_sock_setattr(sk, def->cipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (sk->sk_family != AF_INET6) {
                        rc = -EAFNOSUPPORT;
                        break;
                }

                sin6 = (struct sockaddr_in6 *)addr;
                def = netlbl_domhsh_getentry_af6(secattr->domain,
                                                 &sin6->sin6_addr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        /* only the protocols we support are removed for now,
                         * others could be added here if needed */
                        netlbl_sock_delattr(sk);
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_CALIPSO:
                        rc = calipso_sock_setattr(sk, def->calipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#endif /* IPv6 */
        default:
                rc = -EPROTONOSUPPORT;
                break;
        }

        rcu_read_unlock();
        return rc;
}

/**
 * netlbl_req_setattr - Label a request socket using the correct protocol
 * @req: the request socket to label
 * @secattr: the security attributes
 *
 * Description:
 * Select the domain mapping that applies to @secattr->domain and the remote
 * address recorded in @req, then label the request socket accordingly; an
 * unlabeled mapping removes any existing NetLabel label.  Returns zero on
 * success, -ENOENT if no usable mapping exists, -EPROTONOSUPPORT for an
 * unsupported family, and negative values on other failures.
 *
 */
int netlbl_req_setattr(struct request_sock *req,
                       const struct netlbl_lsm_secattr *secattr)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct netlbl_dommap_def *def;
        /* assume "no usable mapping" until a known mapping type is matched */
        int rc = -ENOENT;

        rcu_read_lock();
        switch (req->rsk_ops->family) {
        case AF_INET:
                def = netlbl_domhsh_getentry_af4(secattr->domain,
                                                 ireq->ir_rmt_addr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        netlbl_req_delattr(req);
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_CIPSOV4:
                        rc = cipso_v4_req_setattr(req, def->cipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                def = netlbl_domhsh_getentry_af6(secattr->domain,
                                                 &ireq->ir_v6_rmt_addr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        netlbl_req_delattr(req);
                        rc = 0;
                        break;
                case NETLBL_NLTYPE_CALIPSO:
                        rc = calipso_req_setattr(req, def->calipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#endif /* IPv6 */
        default:
                rc = -EPROTONOSUPPORT;
                break;
        }

        rcu_read_unlock();
        return rc;
}

/**
 * netlbl_req_delattr - Delete all the NetLabel labels on a request socket
 * @req: the request socket
 *
 * Description:
 * Strip the NetLabel labeling from @req using the protocol that matches the
 * request socket's family; any other family is left alone.
 *
 */
void netlbl_req_delattr(struct request_sock *req)
{
        const unsigned int family = req->rsk_ops->family;

        if (family == AF_INET) {
                cipso_v4_req_delattr(req);
                return;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                calipso_req_delattr(req);
#endif /* IPv6 */
}

/**
 * netlbl_skbuff_setattr - Label a packet using the correct protocol
 * @skb: the packet
 * @family: protocol family
 * @secattr: the security attributes
 *
 * Description:
 * Select the domain mapping that applies to @secattr->domain and the packet's
 * destination address, then label the packet accordingly; an unlabeled
 * mapping removes the label of the supported protocol from the packet.
 * Returns zero on success, -ENOENT if no usable mapping exists,
 * -EPROTONOSUPPORT for an unsupported @family, and negative values on other
 * failures.
 *
 */
int netlbl_skbuff_setattr(struct sk_buff *skb,
                          u16 family,
                          const struct netlbl_lsm_secattr *secattr)
{
        struct netlbl_dommap_def *def;
        /* assume "no usable mapping" until a known mapping type is matched */
        int rc = -ENOENT;

        rcu_read_lock();
        switch (family) {
        case AF_INET:
                def = netlbl_domhsh_getentry_af4(secattr->domain,
                                                 ip_hdr(skb)->daddr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        /* only the protocols we support are removed for now,
                         * others could be added here if needed */
                        rc = cipso_v4_skbuff_delattr(skb);
                        break;
                case NETLBL_NLTYPE_CIPSOV4:
                        rc = cipso_v4_skbuff_setattr(skb, def->cipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                def = netlbl_domhsh_getentry_af6(secattr->domain,
                                                 &ipv6_hdr(skb)->daddr);
                if (!def)
                        break;
                switch (def->type) {
                case NETLBL_NLTYPE_UNLABELED:
                        /* only the protocols we support are removed for now,
                         * others could be added here if needed */
                        rc = calipso_skbuff_delattr(skb);
                        break;
                case NETLBL_NLTYPE_CALIPSO:
                        rc = calipso_skbuff_setattr(skb, def->calipso, secattr);
                        break;
                default:
                        /* unknown mapping type, rc stays -ENOENT */
                        break;
                }
                break;
#endif /* IPv6 */
        default:
                rc = -EPROTONOSUPPORT;
                break;
        }

        rcu_read_unlock();
        return rc;
}

/**
 * netlbl_skbuff_getattr - Determine the security attributes of a packet
 * @skb: the packet
 * @family: protocol family
 * @secattr: the security attributes
 *
 * Description:
 * Look for a label option of the protocol matching @family in the packet and,
 * if one is present and parses cleanly, return its security attributes in
 * @secattr.  In every other case the result of netlbl_unlabel_getattr() is
 * used instead.  Returns zero on success, negative values on failure.
 *
 */
int netlbl_skbuff_getattr(const struct sk_buff *skb,
                          u16 family,
                          struct netlbl_lsm_secattr *secattr)
{
        unsigned char *opt;

        if (family == AF_INET) {
                opt = cipso_v4_optptr(skb);
                if (opt != NULL && cipso_v4_getattr(opt, secattr) == 0)
                        return 0;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6) {
                opt = calipso_optptr(skb);
                if (opt != NULL && calipso_getattr(opt, secattr) == 0)
                        return 0;
        }
#endif /* IPv6 */

        /* no label found or it could not be parsed */
        return netlbl_unlabel_getattr(skb, family, secattr);
}

/**
 * netlbl_skbuff_err - Handle a LSM error on a sk_buff
 * @skb: the packet
 * @family: the family
 * @error: the error code
 * @gateway: true if host is acting as a gateway, false otherwise
 *
 * Description:
 * React to a LSM problem hit while handling the packet in @skb, typically a
 * permission denied problem (-EACCES).  Only AF_INET packets that carry a
 * CIPSO option are acted upon, by passing them to cipso_v4_error(); all other
 * packets are ignored.
 *
 */
void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
        if (family != AF_INET)
                return;

        if (cipso_v4_optptr(skb))
                cipso_v4_error(skb, error, gateway);
}

/**
 * netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches
 *
 * Description:
 * For all of the NetLabel protocols that support some form of label mapping
 * cache, invalidate the cache.  The CIPSOv4 cache is always invalidated; the
 * CALIPSO cache only when IPv6 support is enabled.  This function does not
 * return a value.
 *
 */
void netlbl_cache_invalidate(void)
{
        cipso_v4_cache_invalidate();
#if IS_ENABLED(CONFIG_IPV6)
        calipso_cache_invalidate();
#endif /* IPv6 */
}

/**
 * netlbl_cache_add - Add an entry to a NetLabel protocol cache
 * @skb: the packet
 * @family: the family
 * @secattr: the packet's security attributes
 *
 * Description:
 * Store the LSM security attributes for the given packet in the label mapping
 * cache of the protocol matching @family.  Returns the protocol's result
 * (zero on success, negative values on error), or -ENOMSG when @secattr does
 * not have NETLBL_SECATTR_CACHE set, the packet carries no label option, or
 * @family is not handled.
 *
 */
int netlbl_cache_add(const struct sk_buff *skb, u16 family,
                     const struct netlbl_lsm_secattr *secattr)
{
        unsigned char *opt;

        /* nothing to do unless the attributes carry cache data */
        if (!(secattr->flags & NETLBL_SECATTR_CACHE))
                return -ENOMSG;

        if (family == AF_INET) {
                opt = cipso_v4_optptr(skb);
                if (opt != NULL)
                        return cipso_v4_cache_add(opt, secattr);
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6) {
                opt = calipso_optptr(skb);
                if (opt != NULL)
                        return calipso_cache_add(opt, secattr);
        }
#endif /* IPv6 */

        return -ENOMSG;
}

/*
 * Protocol Engine Functions
 */

/**
 * netlbl_audit_start - Start an audit message
 * @type: audit message type
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Start an audit message using the type specified in @type and fill the audit
 * message with some fields common to all NetLabel audit messages.  This
 * function should only be used by protocol engines, not LSMs.  It is an
 * exported wrapper that passes both arguments unchanged to
 * netlbl_audit_start_common() and returns its result: a pointer to the audit
 * buffer on success, NULL on failure.
 *
 */
struct audit_buffer *netlbl_audit_start(int type,
                                        struct netlbl_audit *audit_info)
{
        return netlbl_audit_start_common(type, audit_info);
}
EXPORT_SYMBOL(netlbl_audit_start);

/*
 * Setup Functions
 */

/**
 * netlbl_init - Initialize NetLabel
 *
 * Description:
 * Bring up NetLabel before first use: the domain hash, the unlabeled
 * support, the netlink interface and finally the default unlabeled
 * configuration, in that order.  The first step that fails stops the sequence
 * and the kernel panics with the error code.  Returns zero on success.
 *
 */
static int __init netlbl_init(void)
{
        int ret_val;

        printk(KERN_INFO "NetLabel: Initializing\n");
        printk(KERN_INFO "NetLabel:  domain hash size = %u\n",
               (1 << NETLBL_DOMHSH_BITSIZE));
        printk(KERN_INFO "NetLabel:  protocols = UNLABELED CIPSOv4 CALIPSO\n");

        /* each step runs only if every step before it succeeded */
        ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
        if (ret_val == 0)
                ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
        if (ret_val == 0)
                ret_val = netlbl_netlink_init();
        if (ret_val == 0)
                ret_val = netlbl_unlabel_defconf();

        if (ret_val != 0)
                panic("NetLabel: failed to initialize properly (%d)\n",
                      ret_val);

        printk(KERN_INFO "NetLabel:  unlabeled traffic allowed by default\n");

        return 0;
}

subsys_initcall(netlbl_init);





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PROCFS_FD_H__
#define __PROCFS_FD_H__

#include <linux/fs.h>

extern const struct file_operations proc_fd_operations;
extern const struct inode_operations proc_fd_inode_operations;

extern const struct file_operations proc_fdinfo_operations;
extern const struct inode_operations proc_fdinfo_inode_operations;

extern int proc_fd_permission(struct inode *inode, int mask);

/*
 * proc_fd - return the descriptor number recorded for a proc inode
 * @inode: inode to query
 *
 * Returns the fd member of the object PROC_I() yields for @inode.
 * NOTE(review): presumably only meaningful for inodes under the fd/fdinfo
 * directories declared above -- confirm against the callers.
 */
static inline unsigned int proc_fd(struct inode *inode)
{
        return PROC_I(inode)->fd;
}

#endif /* __PROCFS_FD_H__ */
























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.h - generic kernel object infrastructure.
 *
 * Copyright (c) 2002-2003 Patrick Mochel
 * Copyright (c) 2002-2003 Open Source Development Labs
 * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2008 Novell Inc.
 *
 * Please read Documentation/core-api/kobject.rst before using the kobject
 * interface, ESPECIALLY the parts about reference counts and object
 * destructors.
 */

#ifndef _KOBJECT_H_
#define _KOBJECT_H_

#include <linux/types.h>
#include <linux/list.h>
#include <linux/sysfs.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/kref.h>
#include <linux/kobject_ns.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/uidgid.h>

/*
 * Size limits for uevent handling.  UEVENT_NUM_ENVP and UEVENT_BUFFER_SIZE
 * dimension the envp[] and buf[] arrays of struct kobj_uevent_env.
 * NOTE(review): UEVENT_HELPER_PATH_LEN presumably bounds uevent_helper[] --
 * confirm against its definition.
 */
#define UEVENT_HELPER_PATH_LEN                256
#define UEVENT_NUM_ENVP                        64        /* number of env pointers */
#define UEVENT_BUFFER_SIZE                2048        /* buffer for the variables */

#ifdef CONFIG_UEVENT_HELPER
/* path to the userspace helper executed on an event */
extern char uevent_helper[];
#endif

/* counter to tag the uevent, read only except for the kobject core */
extern u64 uevent_seqnum;

/*
 * The actions here must match the index to the string array
 * in lib/kobject_uevent.c
 *
 * Do not add new actions here without checking with the driver-core
 * maintainers. Action strings are not meant to express subsystem
 * or device specific properties. In most cases you want to send a
 * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event
 * specific variables added to the event environment.
 */
enum kobject_action {
        /*
         * Enumerators take the implicit values 0, 1, 2, ... in declaration
         * order; do not reorder or insert entries in the middle.
         */
        KOBJ_ADD,
        KOBJ_REMOVE,
        KOBJ_CHANGE,
        KOBJ_MOVE,
        KOBJ_ONLINE,
        KOBJ_OFFLINE,
        KOBJ_BIND,
        KOBJ_UNBIND,
};

/*
 * struct kobject - the basic kernel object.
 *
 * NOTE(review): the field notes below are limited to what this header
 * shows; semantics not visible here are marked as assumptions to confirm
 * against lib/kobject.c.
 */
struct kobject {
        /* name string, returned as-is by kobject_name() */
        const char                *name;
        /* list linkage -- presumably into the owning kset's list; confirm */
        struct list_head        entry;
        struct kobject                *parent;
        struct kset                *kset;
        /* type descriptor, returned as-is by get_ktype() */
        struct kobj_type        *ktype;
        struct kernfs_node        *sd; /* sysfs directory entry */
        /* reference count; kobject_has_children() warns if it reads zero */
        struct kref                kref;
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        /* only present when CONFIG_DEBUG_KOBJECT_RELEASE is set */
        struct delayed_work        release;
#endif
        /* single-bit state flags */
        unsigned int state_initialized:1;
        unsigned int state_in_sysfs:1;
        unsigned int state_add_uevent_sent:1;
        unsigned int state_remove_uevent_sent:1;
        unsigned int uevent_suppress:1;
};

extern __printf(2, 3)
int kobject_set_name(struct kobject *kobj, const char *name, ...);
extern __printf(2, 0)
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                           va_list vargs);

/* Return the name pointer stored in @kobj; the string is not copied. */
static inline const char *kobject_name(const struct kobject *kobj)
{
        const char *name = kobj->name;

        return name;
}

extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
extern __printf(3, 4) __must_check
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...);
extern __printf(4, 5) __must_check
int kobject_init_and_add(struct kobject *kobj,
                         struct kobj_type *ktype, struct kobject *parent,
                         const char *fmt, ...);

extern void kobject_del(struct kobject *kobj);

extern struct kobject * __must_check kobject_create(void);
extern struct kobject * __must_check kobject_create_and_add(const char *name,
                                                struct kobject *parent);

extern int __must_check kobject_rename(struct kobject *, const char *new_name);
extern int __must_check kobject_move(struct kobject *, struct kobject *);

extern struct kobject *kobject_get(struct kobject *kobj);
extern struct kobject * __must_check kobject_get_unless_zero(
                                                struct kobject *kobj);
extern void kobject_put(struct kobject *kobj);

extern const void *kobject_namespace(struct kobject *kobj);
extern void kobject_get_ownership(struct kobject *kobj,
                                  kuid_t *uid, kgid_t *gid);
extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);

/**
 * kobject_has_children - Returns whether a kobject has children.
 * @kobj: the object to test
 *
 * Reports whether @kobj has other kobjects below it, i.e. whether its
 * sysfs directory entry exists and counts at least one subdirectory.
 *
 * Attribute files are NOT taken into account, only sub directories.
 * The check assumes children are neither added nor removed concurrently,
 * so callers must provide their own locking.  A warning is emitted once
 * if @kobj's reference count reads as zero.
 */
static inline bool kobject_has_children(struct kobject *kobj)
{
        WARN_ON_ONCE(!kref_read(&kobj->kref));

        /* No sysfs directory entry means there can be no sub directories. */
        if (!kobj->sd)
                return false;

        return kobj->sd->dir.subdirs != 0;
}

/*
 * struct kobj_type - per-type callbacks and defaults for kobjects.
 *
 * A kobject refers to one of these through its ktype member.  Each callback
 * receives the kobject it is invoked for.
 * NOTE(review): when each callback is invoked is not visible in this
 * header -- see Documentation/core-api/kobject.rst.
 */
struct kobj_type {
        /* presumably called when the last reference is dropped; confirm */
        void (*release)(struct kobject *kobj);
        const struct sysfs_ops *sysfs_ops;
        struct attribute **default_attrs;        /* use default_groups instead */
        const struct attribute_group **default_groups;
        const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
        const void *(*namespace)(struct kobject *kobj);
        /* fills in *uid and *gid for @kobj */
        void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid);
};

/*
 * struct kobj_uevent_env - storage for the environment of one uevent.
 *
 * envp[] holds up to UEVENT_NUM_ENVP pointers and buf[] is
 * UEVENT_BUFFER_SIZE bytes.
 * NOTE(review): presumably envp_idx is the number of envp[] slots in use,
 * buflen the number of bytes used in buf[], and the envp[] entries point
 * into buf[] -- confirm in lib/kobject_uevent.c.
 */
struct kobj_uevent_env {
        char *argv[3];
        char *envp[UEVENT_NUM_ENVP];
        int envp_idx;
        char buf[UEVENT_BUFFER_SIZE];
        int buflen;
};

/*
 * struct kset_uevent_ops - uevent callbacks supplied by a kset.
 *
 * A kset points to one of these through its uevent_ops member.  Every
 * callback receives the kset and the kobject concerned; uevent() also gets
 * the event environment.  The function pointers are const, so they must be
 * set when the structure is initialised.
 * NOTE(review): the meaning of the return values is not visible here --
 * confirm in lib/kobject_uevent.c.
 */
struct kset_uevent_ops {
        int (* const filter)(struct kset *kset, struct kobject *kobj);
        const char *(* const name)(struct kset *kset, struct kobject *kobj);
        int (* const uevent)(struct kset *kset, struct kobject *kobj,
                      struct kobj_uevent_env *env);
};

/*
 * struct kobj_attribute - an attribute with kobject-level show/store hooks.
 *
 * Wraps a struct attribute together with two callbacks that receive the
 * kobject and this kobj_attribute; store() additionally gets the length of
 * the input in @count.
 * NOTE(review): presumably dispatched through kobj_sysfs_ops, with show()
 * returning the number of bytes written to @buf -- confirm in lib/kobject.c.
 */
struct kobj_attribute {
        struct attribute attr;
        ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count);
};

extern const struct sysfs_ops kobj_sysfs_ops;

struct sock;

/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 */
struct kset {
        struct list_head list;
        spinlock_t list_lock;
        /* to_kset() maps a pointer to this member back to the kset */
        struct kobject kobj;
        const struct kset_uevent_ops *uevent_ops;
} __randomize_layout;        /* NOTE(review): member order may be randomized at build time; do not rely on it */

extern void kset_init(struct kset *kset);
extern int __must_check kset_register(struct kset *kset);
extern void kset_unregister(struct kset *kset);
extern struct kset * __must_check kset_create_and_add(const char *name,
                                                const struct kset_uevent_ops *u,
                                                struct kobject *parent_kobj);

/*
 * Convert a pointer to a kset's embedded kobject back into the kset.
 * A NULL @kobj yields NULL.
 */
static inline struct kset *to_kset(struct kobject *kobj)
{
        if (!kobj)
                return NULL;

        return container_of(kobj, struct kset, kobj);
}

/*
 * Call kobject_get() on the kset's embedded kobject and return the kset
 * that the returned kobject belongs to.  A NULL @k yields NULL.
 */
static inline struct kset *kset_get(struct kset *k)
{
        if (!k)
                return NULL;

        return to_kset(kobject_get(&k->kobj));
}

/* Call kobject_put() on the kobject embedded in @k; @k is not checked for NULL. */
static inline void kset_put(struct kset *k)
{
        struct kobject *embedded = &k->kobj;

        kobject_put(embedded);
}

/* Return the kobj_type pointer stored in @kobj. */
static inline struct kobj_type *get_ktype(struct kobject *kobj)
{
        struct kobj_type *ktype = kobj->ktype;

        return ktype;
}

extern struct kobject *kset_find_obj(struct kset *, const char *);

/* The global /sys/kernel/ kobject for people to chain off of */
extern struct kobject *kernel_kobj;
/* The global /sys/kernel/mm/ kobject for people to chain off of */
extern struct kobject *mm_kobj;
/* The global /sys/hypervisor/ kobject for people to chain off of */
extern struct kobject *hypervisor_kobj;
/* The global /sys/power/ kobject for people to chain off of */
extern struct kobject *power_kobj;
/* The global /sys/firmware/ kobject for people to chain off of */
extern struct kobject *firmware_kobj;

int kobject_uevent(struct kobject *kobj, enum kobject_action action);
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
                        char *envp[]);
int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);

__printf(2, 3)
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);

#endif /* _KOBJECT_H_ */





































    3 

























    4 





    4 

    3 







    4 




    3 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2014 Davidlohr Bueso.
 */
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>

/*
 * Hash based on the pmd of addr if configured with MMU, which provides a good
 * hit rate for workloads with spatial locality.  Otherwise, use pages.
 */
#ifdef CONFIG_MMU
#define VMACACHE_SHIFT        PMD_SHIFT
#else
#define VMACACHE_SHIFT        PAGE_SHIFT
#endif
/*
 * Map an address to a slot index in the per-task cache.  The argument is
 * parenthesized so that an expression such as "a | b" is hashed as a whole
 * instead of being split up by operator precedence.
 */
#define VMACACHE_HASH(addr) (((addr) >> VMACACHE_SHIFT) & VMACACHE_MASK)

/*
 * This task may be accessing a foreign mm via (for example)
 * get_user_pages()->find_vma().  The vmacache is task-local and this
 * task's vmacache pertains to a different mm (ie, its own).  There is
 * nothing we can do here.
 *
 * Also handle the case where a kernel thread has adopted this mm via
 * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
 */
static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
        /* A kernel thread's cache never applies, whatever mm it runs on. */
        if (current->flags & PF_KTHREAD)
                return false;

        /* Otherwise the cache is usable only for the task's own mm. */
        return current->mm == mm;
}

/*
 * Store @newvma in the current task's cache slot selected by hashing @addr.
 * Nothing is stored when the vma's mm fails vmacache_valid_mm().
 */
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
        if (!vmacache_valid_mm(newvma->vm_mm))
                return;

        current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
}

static bool vmacache_valid(struct mm_struct *mm)
{
        struct task_struct *curr;

        if (!vmacache_valid_mm(mm))
                return false;

        curr = current;
        if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
                /*
                 * First attempt will always be invalid, initialize
                 * the new cache for this task here.
                 */
                curr->vmacache.seqnum = mm->vmacache_seqnum;
                vmacache_flush(curr);
                return false;
        }
        return true;
}

/*
 * Look up the cached vma that contains @addr.
 *
 * Every slot of the current task's cache is probed once, starting at the
 * slot @addr hashes to and wrapping around.  Returns the first cached vma
 * with vm_start <= @addr < vm_end, or NULL if the cache is not valid for
 * @mm or no slot matches.
 */
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
        int slot = VMACACHE_HASH(addr);
        int probes;

        count_vm_vmacache_event(VMACACHE_FIND_CALLS);

        if (!vmacache_valid(mm))
                return NULL;

        for (probes = VMACACHE_SIZE; probes > 0; probes--) {
                struct vm_area_struct *cand = current->vmacache.vmas[slot];

                if (cand) {
#ifdef CONFIG_DEBUG_VM_VMACACHE
                        /* A cached vma of another mm ends the search. */
                        if (WARN_ON_ONCE(cand->vm_mm != mm))
                                break;
#endif
                        if (addr >= cand->vm_start && addr < cand->vm_end) {
                                count_vm_vmacache_event(VMACACHE_FIND_HITS);
                                return cand;
                        }
                }

                /* Advance to the next slot, wrapping at the end. */
                slot++;
                if (slot == VMACACHE_SIZE)
                        slot = 0;
        }

        return NULL;
}

#ifndef CONFIG_MMU
/*
 * Look up the cached vma that spans exactly [@start, @end).
 *
 * Only built when CONFIG_MMU is not set.  Probes every slot once, starting
 * at the slot @start hashes to, and returns the first cached vma whose
 * vm_start and vm_end equal @start and @end; NULL if the cache is not valid
 * for @mm or nothing matches.
 */
struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        int slot = VMACACHE_HASH(start);
        int probes;

        count_vm_vmacache_event(VMACACHE_FIND_CALLS);

        if (!vmacache_valid(mm))
                return NULL;

        for (probes = VMACACHE_SIZE; probes > 0; probes--) {
                struct vm_area_struct *cand = current->vmacache.vmas[slot];

                if (cand && cand->vm_start == start && cand->vm_end == end) {
                        count_vm_vmacache_event(VMACACHE_FIND_HITS);
                        return cand;
                }

                /* Advance to the next slot, wrapping at the end. */
                slot++;
                if (slot == VMACACHE_SIZE)
                        slot = 0;
        }

        return NULL;
}
#endif

































































































































































































































































































































































    1 
    1 










    1 








    1 
    1 










    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>

#include <asm/page.h>

#include "kstrtox.h"

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/*
 * Compare the first @bits bits of two bitmaps.
 *
 * Whole words are compared directly; a trailing partial word is compared
 * under BITMAP_LAST_WORD_MASK(@bits).  Returns 1 if equal, 0 otherwise.
 */
int __bitmap_equal(const unsigned long *bitmap1,
                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if (bitmap1[i] != bitmap2[i])
                        return 0;
        }

        /* i == full_words here: index of the partial last word, if any. */
        if ((bits % BITS_PER_LONG) &&
            ((bitmap1[i] ^ bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits)))
                return 0;

        return 1;
}
EXPORT_SYMBOL(__bitmap_equal);

/*
 * Test whether (@bitmap1 | @bitmap2) equals @bitmap3 over the first @bits
 * bits.  A trailing partial word is compared under
 * BITMAP_LAST_WORD_MASK(@bits).
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long diff;
        unsigned int i;

        for (i = 0; i < full_words; i++)
                if ((bitmap1[i] | bitmap2[i]) != bitmap3[i])
                        return false;

        if (bits % BITS_PER_LONG) {
                /* Bits that differ in the partial last word. */
                diff = (bitmap1[i] | bitmap2[i]) ^ bitmap3[i];
                return !(diff & BITMAP_LAST_WORD_MASK(bits));
        }

        return true;
}

/*
 * Store the bitwise complement of @src in @dst, for every word that
 * holds any of the first @bits bits (the last word is not masked).
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = ~src[i];
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned nwords = BITS_TO_LONGS(nbits);
        unsigned word_off = shift / BITS_PER_LONG;
        unsigned bit_off = shift % BITS_PER_LONG;
        unsigned long last_mask = BITMAP_LAST_WORD_MASK(nbits);
        unsigned k;

        for (k = 0; word_off + k < nwords; ++k) {
                unsigned from = word_off + k;
                unsigned long lower = src[from];
                unsigned long upper = 0;

                /* Bits beyond nbits in the last source word must not leak in. */
                if (from == nwords - 1)
                        lower &= last_mask;
                lower >>= bit_off;

                /*
                 * For a shift that is not word aligned, the low bit_off
                 * bits of the next word up become the top bits of the
                 * result word.
                 */
                if (bit_off && from + 1 < nwords) {
                        upper = src[from + 1];
                        if (from + 1 == nwords - 1)
                                upper &= last_mask;
                        upper <<= (BITS_PER_LONG - bit_off);
                }

                dst[k] = lower | upper;
        }

        /* Whole words vacated at the top are cleared. */
        if (word_off)
                memset(&dst[nwords - word_off], 0, word_off * sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        unsigned int nwords = BITS_TO_LONGS(nbits);
        unsigned int word_off = shift / BITS_PER_LONG;
        unsigned int bit_off = shift % BITS_PER_LONG;
        int k;

        /* Walk from the top word down. */
        for (k = nwords - word_off - 1; k >= 0; --k) {
                unsigned long carry_in = 0;

                /*
                 * For a shift that is not word aligned, the top bit_off
                 * bits of the word below become the bottom bits of the
                 * result word.
                 */
                if (bit_off && k > 0)
                        carry_in = src[k - 1] >> (BITS_PER_LONG - bit_off);

                dst[k + word_off] = (src[k] << bit_off) | carry_in;
        }

        /* Whole words vacated at the bottom are cleared. */
        if (word_off)
                memset(dst, 0, word_off * sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        unsigned int len = BITS_TO_LONGS(nbits);
        unsigned int first_word = first / BITS_PER_LONG;
        unsigned int first_bit = first % BITS_PER_LONG;
        unsigned long keep = 0, carry;
        int i;

        /* Remember the bits below @first in its word; they must not move. */
        if (first_bit)
                keep = src[first_word] & (~0UL >> (BITS_PER_LONG - first_bit));

        memmove(dst, src, len * sizeof(*dst));

        /* One single-bit right shift of words first_word..len-1 per cut bit. */
        for (; cut; cut--) {
                for (i = first_word; i < len; i++) {
                        /* The lowest bit of the next word moves into our top bit. */
                        carry = (i < len - 1) ? dst[i + 1] & 1UL : 0;

                        dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
                }
        }

        /* Restore the preserved low bits of the first affected word. */
        dst[first_word] &= ~0UL << first_bit;
        dst[first_word] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * @dst = @bitmap1 & @bitmap2 over the first @bits bits.
 *
 * A trailing partial word is additionally masked with
 * BITMAP_LAST_WORD_MASK(@bits) before it is stored.  Returns 1 if any
 * stored word is non-zero, 0 otherwise.
 */
int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned long word;
        unsigned int k;

        for (k = 0; k < full_words; k++) {
                word = bitmap1[k] & bitmap2[k];
                dst[k] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[k] & bitmap2[k] & BITMAP_LAST_WORD_MASK(bits);
                dst[k] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/*
 * @dst = @bitmap1 | @bitmap2, for every word that holds any of the
 * first @bits bits (the last word is not masked).
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = bitmap1[i] | bitmap2[i];
        }
}
EXPORT_SYMBOL(__bitmap_or);

/*
 * @dst = @bitmap1 ^ @bitmap2, for every word that holds any of the
 * first @bits bits (the last word is not masked).
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = bitmap1[i] ^ bitmap2[i];
        }
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * __bitmap_andnot - store (@bitmap1 & ~@bitmap2) into @dst.
 *
 * Only the first @bits bits are considered: every full word is combined
 * as-is, and a trailing partial word (if any) is additionally ANDed with
 * BITMAP_LAST_WORD_MASK(@bits).
 *
 * Returns 1 if any word written to @dst is non-zero, 0 otherwise.
 */
int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long any = 0;        /* OR of every word stored so far */
        unsigned int idx = 0;

        while (idx < full_words) {
                dst[idx] = bitmap1[idx] & ~bitmap2[idx];
                any |= dst[idx];
                idx++;
        }

        /* idx now indexes the partial last word, when there is one. */
        if (bits % BITS_PER_LONG) {
                dst[idx] = bitmap1[idx] & ~bitmap2[idx] &
                           BITMAP_LAST_WORD_MASK(bits);
                any |= dst[idx];
        }

        return any != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * __bitmap_replace - merge @old and @new into @dst under control of @mask.
 *
 * For every bit position, @dst receives the bit from @new where @mask is
 * set and the bit from @old where @mask is clear.  All
 * BITS_TO_LONGS(@nbits) words are processed in full.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int nwords = BITS_TO_LONGS(nbits);
        unsigned int idx;

        for (idx = 0; idx < nwords; idx++) {
                const unsigned long sel = mask[idx];
                const unsigned long kept = old[idx] & ~sel;
                const unsigned long taken = new[idx] & sel;

                dst[idx] = kept | taken;
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * __bitmap_intersects - test whether two bitmaps share any set bit.
 *
 * Only the first @bits bits are examined; a trailing partial word is
 * filtered through BITMAP_LAST_WORD_MASK(@bits).
 *
 * Returns 1 if some bit is set in both bitmaps, 0 otherwise.
 */
int __bitmap_intersects(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                if ((bitmap1[idx] & bitmap2[idx]) != 0)
                        return 1;
        }

        /* No partial word left to look at: nothing in common. */
        if (bits % BITS_PER_LONG == 0)
                return 0;

        if ((bitmap1[idx] & bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits))
                return 1;

        return 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * __bitmap_subset - test whether @bitmap1 is a subset of @bitmap2.
 *
 * Only the first @bits bits are examined; a trailing partial word is
 * filtered through BITMAP_LAST_WORD_MASK(@bits).
 *
 * Returns 1 if every bit set in @bitmap1 is also set in @bitmap2,
 * 0 otherwise.
 */
int __bitmap_subset(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                /* Any bit in @bitmap1 that @bitmap2 lacks breaks the subset. */
                if ((bitmap1[idx] & ~bitmap2[idx]) != 0)
                        return 0;
        }

        if (bits % BITS_PER_LONG == 0)
                return 1;

        if ((bitmap1[idx] & ~bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits))
                return 0;

        return 1;
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * __bitmap_weight - sum hweight_long() over the first @bits bits of @bitmap.
 *
 * Full words are counted as-is; a trailing partial word is first ANDed
 * with BITMAP_LAST_WORD_MASK(@bits).
 */
int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        const unsigned int tail_bits = bits % BITS_PER_LONG;
        unsigned int idx = 0;
        int total = 0;

        while (idx < full_words) {
                total += hweight_long(bitmap[idx]);
                idx++;
        }

        if (tail_bits)
                total += hweight_long(bitmap[idx] & BITMAP_LAST_WORD_MASK(bits));

        return total;
}
EXPORT_SYMBOL(__bitmap_weight);

/*
 * __bitmap_set - OR masks into @map covering @len bits from bit @start.
 *
 * The first word touched uses BITMAP_FIRST_WORD_MASK(@start); every
 * further word that is completely covered uses ~0UL.  Whatever is left
 * over afterwards is handled by restricting the current mask with
 * BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end_bit = start + len;
        unsigned long *word = &map[BIT_WORD(start)];
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Number of bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Consume words for as long as the remaining length fills them. */
        for (; len - chunk >= 0; word++) {
                *word |= mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Remainder that stops short of the end of the current word. */
        if (len != 0)
                *word |= mask & BITMAP_LAST_WORD_MASK(end_bit);
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * __bitmap_clear - clear masks in @map covering @len bits from bit @start.
 *
 * The first word touched uses BITMAP_FIRST_WORD_MASK(@start); every
 * further word that is completely covered uses ~0UL.  Whatever is left
 * over afterwards is handled by restricting the current mask with
 * BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end_bit = start + len;
        unsigned long *word = &map[BIT_WORD(start)];
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Number of bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Consume words for as long as the remaining length fills them. */
        for (; len - chunk >= 0; word++) {
                *word &= ~mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Remainder that stops short of the end of the current word. */
        if (len != 0)
                *word &= ~(mask & BITMAP_LAST_WORD_MASK(end_bit));
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds plus @align_offset
 * is multiple of that power of 2.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long candidate;        /* first bit of the area being tried */
        unsigned long limit;                /* one past its last bit */
        unsigned long busy;                /* first set bit found inside it */

        for (;;) {
                candidate = find_next_zero_bit(map, size, start);

                /* Round up so that candidate + align_offset is aligned. */
                candidate = __ALIGN_MASK(candidate + align_offset, align_mask) -
                            align_offset;

                limit = candidate + nr;
                /* Area would run past the bitmap: report a value > size. */
                if (limit > size)
                        return limit;

                busy = find_next_bit(map, limit, candidate);
                if (busy >= limit)
                        return candidate;

                /* A set bit is in the way; retry just past it. */
                start = busy + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/*
 * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers,
 * second version by Paul Jackson, third by Joe Korty.
 */

/**
 * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap
 *
 * @ubuf: pointer to user buffer containing string.
 * @ulen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 */
int bitmap_parse_user(const char __user *ubuf,
                        unsigned int ulen, unsigned long *maskp,
                        int nmaskbits)
{
        /* Kernel copy of the user buffer, obtained from memdup_user_nul(). */
        char *kbuf = memdup_user_nul(ubuf, ulen);
        int err;

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* UINT_MAX length: bitmap_parse() stops at the terminator instead. */
        err = bitmap_parse(kbuf, UINT_MAX, maskp, nmaskbits);
        kfree(kbuf);

        return err;
}
EXPORT_SYMBOL(bitmap_parse_user);

/**
 * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
 * @list: indicates whether the bitmap must be list
 * @buf: page aligned buffer into which string is placed
 * @maskp: pointer to bitmap to convert
 * @nmaskbits: size of bitmap, in bits
 *
 * Output format is a comma-separated list of decimal numbers and
 * ranges if list is specified or hex digits grouped into comma-separated
 * sets of 8 digits/set. Returns the number of characters written to buf.
 *
 * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
 * area and that sufficient storage remains at @buf to accommodate the
 * bitmap_print_to_pagebuf() output. Returns the number of characters
 * actually printed to @buf, excluding terminating '\0'.
 */
int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
                            int nmaskbits)
{
        /* Space left between @buf and the end of the page it points into. */
        ptrdiff_t room = PAGE_SIZE - offset_in_page(buf);

        if (list)
                return scnprintf(buf, room, "%*pbl\n", nmaskbits, maskp);

        return scnprintf(buf, room, "%*pb\n", nmaskbits, maskp);
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);

/*
 * Region 9-38:4/10 describes the following bitmap structure:
 * 0        9  12    18                  38
 * .........****......****......****......
 *          ^  ^     ^                   ^
 *      start  off   group_len         end
 */
/*
 * One parsed list element, "start[-end[:off/group_len]]".
 * bitmap_set_region() walks from @start to @end (inclusive) in steps of
 * @group_len and sets at most @off bits at the beginning of each step.
 */
struct region {
        unsigned int start;        /* first bit of the region */
        unsigned int off;        /* bits to set at the start of each group */
        unsigned int group_len;        /* distance between group starts */
        unsigned int end;        /* last bit of the region (inclusive) */
};

/*
 * Set the bits described by @r in @bitmap, which holds @nbits bits.
 *
 * Returns 0 on success or -ERANGE if the region's last bit does not fit
 * into the bitmap.
 */
static int bitmap_set_region(const struct region *r,
                                unsigned long *bitmap, int nbits)
{
        unsigned int pos = r->start;

        if (r->end >= nbits)
                return -ERANGE;

        while (pos <= r->end) {
                /* Never set beyond r->end, even if the group is cut short. */
                unsigned int remaining = r->end - pos + 1;

                bitmap_set(bitmap, pos, min(remaining, r->off));
                pos += r->group_len;
        }

        return 0;
}

/*
 * Sanity-check a parsed region.
 *
 * Returns -EINVAL if the range is reversed, the group length is zero, or
 * more bits are requested per group than a group holds; 0 otherwise.
 */
static int bitmap_check_region(const struct region *r)
{
        if (r->start > r->end)
                return -EINVAL;

        if (r->group_len == 0)
                return -EINVAL;

        if (r->off > r->group_len)
                return -EINVAL;

        return 0;
}

/*
 * Parse a decimal number at @str with _parse_integer() and store it
 * in @num.
 *
 * Returns a pointer just past the parsed text, ERR_PTR(-EINVAL) if nothing
 * was parsed, or ERR_PTR(-EOVERFLOW) if the parser flagged overflow or the
 * value does not fit in an unsigned int.
 */
static const char *bitmap_getnum(const char *str, unsigned int *num)
{
        unsigned long long value;
        unsigned int consumed = _parse_integer(str, 10, &value);

        if (consumed == 0)
                return ERR_PTR(-EINVAL);

        if (consumed & KSTRTOX_OVERFLOW)
                return ERR_PTR(-EOVERFLOW);

        /* Reject values that would be truncated by the store below. */
        if ((unsigned int)value != value)
                return ERR_PTR(-EOVERFLOW);

        *num = value;

        return str + consumed;
}

/* True if @c terminates the input string: a NUL byte or a newline. */
static inline bool end_of_str(char c)
{
        switch (c) {
        case '\0':
        case '\n':
                return true;
        default:
                return false;
        }
}

/* True if @c separates regions: a comma or any isspace() character. */
static inline bool __end_of_region(char c)
{
        if (c == ',')
                return true;

        return isspace(c) != 0;
}

/* True if @c ends a region: either a separator or the end of the string. */
static inline bool end_of_region(char c)
{
        if (end_of_str(c))
                return true;

        return __end_of_region(c);
}

/*
 * The format allows commas and whitespaces at the beginning
 * of the region.
 */
static const char *bitmap_find_region(const char *str)
{
        const char *cur;

        /* Step over any leading separators. */
        for (cur = str; __end_of_region(*cur); cur++)
                ;

        /* Nothing but separators before the terminator: no region left. */
        if (end_of_str(*cur))
                return NULL;

        return cur;
}

/*
 * Walk @end backwards over separator characters, never examining anything
 * before @start.  Returns the new @end; it is smaller than @start when
 * only separators were found.
 */
static const char *bitmap_find_region_reverse(const char *start, const char *end)
{
        for (; start <= end; end--) {
                if (!__end_of_region(*end))
                        break;
        }

        return end;
}

/*
 * Parse one region of the form "N", "N-M" or "N-M:off/group_len" at @str
 * into @r.
 *
 * For the two short forms the missing fields are filled in so that the
 * whole range [start, end] is selected (off == group_len == end + 1).
 *
 * Returns a pointer to the text following the region, NULL if the region
 * was immediately followed by the end of the string, or an ERR_PTR()
 * on malformed input or numeric overflow.
 */
static const char *bitmap_parse_region(const char *str, struct region *r)
{
        str = bitmap_getnum(str, &r->start);
        if (IS_ERR(str))
                return str;

        if (end_of_region(*str)) {
                /* Single number: the region is just that one bit. */
                r->end = r->start;
        } else {
                if (*str != '-')
                        return ERR_PTR(-EINVAL);

                str = bitmap_getnum(str + 1, &r->end);
                if (IS_ERR(str))
                        return str;

                if (!end_of_region(*str)) {
                        /* Full form: expect ":off/group_len" next. */
                        if (*str != ':')
                                return ERR_PTR(-EINVAL);

                        str = bitmap_getnum(str + 1, &r->off);
                        if (IS_ERR(str))
                                return str;

                        if (*str != '/')
                                return ERR_PTR(-EINVAL);

                        return bitmap_getnum(str + 1, &r->group_len);
                }
        }

        /* No pattern given: one group spanning the whole range. */
        r->off = r->end + 1;
        r->group_len = r->end + 1;

        if (end_of_str(*str))
                return NULL;

        return str;
}

/**
 * bitmap_parselist - convert list format ASCII string to bitmap
 * @buf: read user string from this buffer; must be terminated
 *    with a \0 or \n.
 * @maskp: write resulting mask here
 * @nmaskbits: number of bits in mask to be written
 *
 * Input format is a comma-separated list of decimal numbers and
 * ranges.  Consecutively set bits are shown as two hyphen-separated
 * decimal numbers, the smallest and largest bit numbers set in
 * the range.
 * Optionally each range can be postfixed to denote that only parts of it
 * should be set. The range will be divided into groups of a specific size.
 * Only the given number of bits from each group will be used.
 * Syntax: range:used_size/group_size
 * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
 *
 * Returns: 0 on success, -errno on invalid input strings. Error values:
 *
 *   - ``-EINVAL``: wrong region format
 *   - ``-EINVAL``: invalid character in string
 *   - ``-ERANGE``: bit number specified too large for mask
 *   - ``-EOVERFLOW``: integer overflow in the input parameters
 */
int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
{
        struct region r;
        long ret;

        bitmap_zero(maskp, nmaskbits);

        while (buf) {
                buf = bitmap_find_region(buf);
                if (buf == NULL)
                        return 0;

                buf = bitmap_parse_region(buf, &r);
                if (IS_ERR(buf))
                        return PTR_ERR(buf);

                ret = bitmap_check_region(&r);
                if (ret)
                        return ret;

                ret = bitmap_set_region(&r, maskp, nmaskbits);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL(bitmap_parselist);


/**
 * bitmap_parselist_user()
 *
 * @ubuf: pointer to user buffer containing string.
 * @ulen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 *
 * Wrapper for bitmap_parselist(), providing it with user buffer.
 */
int bitmap_parselist_user(const char __user *ubuf,
                        unsigned int ulen, unsigned long *maskp,
                        int nmaskbits)
{
        /* Kernel copy of the user buffer, obtained from memdup_user_nul(). */
        char *kbuf = memdup_user_nul(ubuf, ulen);
        int err;

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        err = bitmap_parselist(kbuf, maskp, nmaskbits);
        kfree(kbuf);

        return err;
}
EXPORT_SYMBOL(bitmap_parselist_user);

/*
 * Parse one chunk of up to eight hex digits, reading backwards from @end
 * and never examining anything before @start.  The last character of the
 * chunk supplies the least significant nibble.
 *
 * On success the value is stored in @num and the returned pointer
 * designates the character preceding the consumed text.
 *
 * Returns ERR_PTR(-EINVAL) if one of the (up to eight) characters read is
 * not a hex digit, and ERR_PTR(-EOVERFLOW) if a ninth hex digit follows.
 */
static const char *bitmap_get_x32_reverse(const char *start,
                                        const char *end, u32 *num)
{
        u32 ret = 0;
        int c, i;

        for (i = 0; i < 32; i += 4) {
                c = hex_to_bin(*end--);
                if (c < 0)
                        return ERR_PTR(-EINVAL);

                /*
                 * Shift as u32, not int: for the top nibble (i == 28) a
                 * digit >= 8 would otherwise be shifted into the sign bit
                 * of a signed int, which is undefined behaviour.
                 */
                ret |= (u32)c << i;

                if (start > end || __end_of_region(*end))
                        goto out;
        }

        /*
         * Eight digits consumed and more text precedes them.
         * NOTE(review): a preceding character that is neither a hex digit
         * nor a separator is skipped here without an error - confirm that
         * this leniency is intended.
         */
        if (hex_to_bin(*end--) >= 0)
                return ERR_PTR(-EOVERFLOW);
out:
        *num = ret;
        return end;
}

/**
 * bitmap_parse - convert an ASCII hex string into a bitmap.
 * @start: pointer to buffer containing string.
 * @buflen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0 or \n. In that case,
 *    UINT_MAX may be provided instead of string length.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 *
 * Commas group hex digits into chunks.  Each chunk defines exactly 32
 * bits of the resultant bitmask.  No chunk may specify a value larger
 * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value
 * then leading 0-bits are prepended.  %-EINVAL is returned for illegal
 * characters. Grouping such as "1,,5", ",44", "," or "" is allowed.
 * Leading, embedded and trailing whitespace accepted.
 */
int bitmap_parse(const char *start, unsigned int buflen,
                unsigned long *maskp, int nmaskbits)
{
        const char *end = strnchrnul(start, buflen, '\n') - 1;
        int chunks = BITS_TO_U32(nmaskbits);
        u32 *bitmap = (u32 *)maskp;
        int unset_bit;
        int chunk;

        for (chunk = 0; ; chunk++) {
                end = bitmap_find_region_reverse(start, end);
                if (start > end)
                        break;

                if (!chunks--)
                        return -EOVERFLOW;

#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
                end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]);
#else
                end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]);
#endif
                if (IS_ERR(end))
                        return PTR_ERR(end);
        }

        unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32;
        if (unset_bit < nmaskbits) {
                bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit);
                return 0;
        }

        if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit)
                return -EOVERFLOW;

        return 0;
}
EXPORT_SYMBOL(bitmap_parse);


#ifdef CONFIG_NUMA
/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits-1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /* Out-of-range positions have no ordinal. */
        if (pos >= nbits)
                return -1;

        /* Neither do positions whose bit is clear. */
        if (!test_bit(pos, buf))
                return -1;

        /* The ordinal is the weight of the first @pos bits of @buf. */
        return __bitmap_weight(buf, pos);
}

/**
 * bitmap_ord_to_pos - find position of n-th set bit in bitmap
 *        @buf: pointer to bitmap
 *        @ord: ordinal bit position (n-th set bit, n >= 0)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the ordinal offset of bit @ord in @buf to its position in @buf.
 * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
 * >= weight(buf), returns @nbits.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @ord
 * values 0 through 3 will get mapped to 4 through 7, respectively,
 * and all other @ord values returns @nbits.  When @ord value 3
 * gets mapped to (returns) @pos value 7 in this example, that means
 * that the 3rd set bit (starting with 0th) is at position 7 in @buf.
 *
 * The bit positions 0 through @nbits-1 are valid positions in @buf.
 */
unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
{
        unsigned int pos = find_first_bit(buf, nbits);

        /* Hop from set bit to set bit, @ord times or until none are left. */
        while (pos < nbits && ord) {
                ord--;
                pos = find_next_bit(buf, nbits, pos + 1);
        }

        return pos;
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int bit, new_weight;

        /* In-place remapping is not handled: leave @dst untouched. */
        if (dst == src)
                return;

        bitmap_zero(dst, nbits);
        new_weight = bitmap_weight(new, nbits);

        for_each_set_bit(bit, src, nbits) {
                unsigned int target = bit;        /* identity map by default */
                int ord = bitmap_pos_to_ord(old, bit, nbits);

                /* Remap only bits that are set in @old, and only if @new is non-empty. */
                if (ord >= 0 && new_weight != 0)
                        target = bitmap_ord_to_pos(new, ord % new_weight, nbits);

                set_bit(target, dst);
        }
}

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        int new_weight = bitmap_weight(new, bits);
        int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /* Remap only if @oldbit is set in @old and @new is non-empty. */
        if (ord >= 0 && new_weight != 0)
                return bitmap_ord_to_pos(new, ord % new_weight, bits);

        /* Otherwise the bit maps to itself. */
        return oldbit;
}

/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that bit
 *  11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int pos;        /* 'n' above: position of a set bit in @relmap */
        unsigned int ord = 0;        /* 'm' above: ordinal of that bit in @relmap */

        /* In-place mapping is not handled: leave @dst untouched. */
        if (dst == orig)
                return;

        bitmap_zero(dst, bits);

        /*
         * Visit the set bits of @relmap in ascending order while counting
         * them, so that @ord is always the ordinal of @pos.  This avoids
         * looking up the position of the m-th set bit of @relmap from
         * scratch for every m.
         */
        for (pos = find_first_bit(relmap, bits);
             pos < bits;
             pos = find_next_bit(relmap, bits, pos + 1)) {
                if (test_bit(ord, orig))
                        set_bit(pos, dst);
                ord++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, nbits);

        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/*
 * Common code for bitmap_*_region() routines.
 *        bitmap: array of unsigned longs corresponding to the bitmap
 *        pos: the beginning of the region
 *        order: region size (log base 2 of number of bits)
 *        reg_op: operation(s) to perform on that region of bitmap
 *
 * Can verify, set or release a region of bits in a bitmap,
 * depending on which single REG_OP_* value is passed in reg_op.
 *
 * A region of a bitmap is a sequence of bits in the bitmap, of
 * some size '1 << order' (a power of two), aligned to that same
 * '1 << order' power of two.
 *
 * Returns 1 if REG_OP_ISFREE succeeds (region is all zero bits).
 * Returns 0 in all other cases and reg_ops.
 */

/* Operation selector for __reg_op(); exactly one value is passed per call. */
enum {
        REG_OP_ISFREE,                /* report whether the region is all zero bits */
        REG_OP_ALLOC,                /* set all bits in region */
        REG_OP_RELEASE,                /* clear all bits in region */
};

static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op)
{
        int nbits_reg = 1 << order;                /* bits in the region */
        int index = pos / BITS_PER_LONG;        /* first long of the region */
        int offset = pos - (index * BITS_PER_LONG); /* bit offset inside it */
        int nlongs_reg = BITS_TO_LONGS(nbits_reg); /* longs the region spans */
        int nbitsinlong = min(nbits_reg,  BITS_PER_LONG); /* region bits per long */
        unsigned long mask;                        /* region bits within one long */
        int i;

        /*
         * Two shapes are possible: a small region living inside a single
         * long (nlongs_reg == 1), or a multiword region that starts on a
         * long boundary and fills every long (offset == 0, mask == ~0UL).
         *
         * Build the mask from its top bit downwards, because
         * "(1UL << nbitsinlong) - 1" would overflow when
         * nbitsinlong == BITS_PER_LONG.
         */
        mask = 1UL << (nbitsinlong - 1);
        mask = (mask + (mask - 1)) << offset;

        if (reg_op == REG_OP_ISFREE) {
                for (i = 0; i < nlongs_reg; i++) {
                        if (bitmap[index + i] & mask)
                                return 0;        /* at least one bit in use */
                }
                return 1;                        /* all bits in region are zero */
        }

        if (reg_op == REG_OP_ALLOC) {
                for (i = 0; i < nlongs_reg; i++)
                        bitmap[index + i] |= mask;
        } else if (reg_op == REG_OP_RELEASE) {
                for (i = 0; i < nlongs_reg; i++)
                        bitmap[index + i] &= ~mask;
        }

        return 0;
}

/**
 * bitmap_find_free_region - find a contiguous aligned mem region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @bits: number of bits in the bitmap
 *        @order: region size (log base 2 of number of bits) to find
 *
 * Find a region of free (zero) bits in a @bitmap of @bits bits and
 * allocate them (set them to one).  Only consider regions of length
 * a power (@order) of two, aligned to that power of two, which
 * makes the search algorithm much faster.
 *
 * Return the bit offset in bitmap of the allocated region,
 * or -errno on failure.
 */
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
        const unsigned int step = 1U << order;        /* region size in bits */
        unsigned int pos = 0;                        /* start of the candidate region */

        /* Try every aligned region that fits entirely inside the bitmap. */
        while (pos + step <= bits) {
                if (__reg_op(bitmap, pos, order, REG_OP_ISFREE)) {
                        __reg_op(bitmap, pos, order, REG_OP_ALLOC);
                        return pos;
                }
                pos += step;
        }

        return -ENOMEM;
}
EXPORT_SYMBOL(bitmap_find_free_region);

/**
 * bitmap_release_region - release allocated bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to release
 *        @order: region size (log base 2 of number of bits) to release
 *
 * This is the complement to bitmap_find_free_region() and releases
 * the found region (by clearing it in the bitmap).
 *
 * No return value.
 */
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
{
        __reg_op(bitmap, pos, order, REG_OP_RELEASE);
}
EXPORT_SYMBOL(bitmap_release_region);

/**
 * bitmap_allocate_region - claim a specific region of a bitmap
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to allocate
 *        @order: region size (log base 2 of number of bits) to allocate
 *
 * Sets the bits of the region starting at @pos, provided every bit in
 * it is currently free (zero).  The check and the allocation are two
 * separate __reg_op() calls.
 *
 * Return 0 on success, or %-EBUSY if specified region wasn't
 * free (not all bits were zero).
 */
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
{
        if (__reg_op(bitmap, pos, order, REG_OP_ISFREE))
                return __reg_op(bitmap, pos, order, REG_OP_ALLOC);

        /* At least one bit of the region is already set. */
        return -EBUSY;
}
EXPORT_SYMBOL(bitmap_allocate_region);

/**
 * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
 * @dst:   destination buffer
 * @src:   bitmap to copy
 * @nbits: number of bits in the bitmap
 *
 * Require nbits % BITS_PER_LONG == 0.
 */
#ifdef __BIG_ENDIAN
void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        /* Only whole words are copied; nbits is required to be a multiple
         * of BITS_PER_LONG, so nothing is left over.
         */
        const unsigned int nwords = nbits / BITS_PER_LONG;
        unsigned int w = 0;

        while (w < nwords) {
                /* Pick the byte-swap helper matching the word size. */
                if (BITS_PER_LONG == 64)
                        dst[w] = cpu_to_le64(src[w]);
                else
                        dst[w] = cpu_to_le32(src[w]);
                w++;
        }
}
EXPORT_SYMBOL(bitmap_copy_le);
#endif

/*
 * Allocate storage for a bitmap of @nbits bits using kmalloc_array()
 * with the given @flags.  The result of kmalloc_array() is returned
 * unchanged.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        const size_t nlongs = BITS_TO_LONGS(nbits);

        return kmalloc_array(nlongs, sizeof(unsigned long), flags);
}
EXPORT_SYMBOL(bitmap_alloc);

/* Same as bitmap_alloc(), with __GFP_ZERO added to the allocation flags. */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        flags |= __GFP_ZERO;

        return bitmap_alloc(nbits, flags);
}
EXPORT_SYMBOL(bitmap_zalloc);

/* Release a bitmap by passing it to kfree(). */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Release callback registered by devm_bitmap_alloc(); @data is the
 * bitmap pointer that was passed at registration time.
 */
static void devm_bitmap_free(void *data)
{
        bitmap_free(data);
}

/*
 * Allocate a bitmap of @nbits bits and register devm_bitmap_free() as a
 * managed action on @dev for it.
 *
 * Returns the bitmap, or NULL if either the allocation or the action
 * registration fails.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *map = bitmap_alloc(nbits, flags);

        if (!map)
                return NULL;

        /*
         * NOTE(review): the bitmap is not freed here on failure; this
         * relies on devm_add_action_or_reset() having already invoked the
         * action in that case - confirm against its documentation.
         */
        if (devm_add_action_or_reset(dev, devm_bitmap_free, map))
                return NULL;

        return map;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/* Same as devm_bitmap_alloc(), with __GFP_ZERO added to the flags. */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        flags |= __GFP_ZERO;

        return devm_bitmap_alloc(dev, nbits, flags);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - build a bitmap from an array of 32-bit chunks
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Each destination word takes one u32 in its low half and, when one is
 * available, the following u32 in its high half.  Bits of the last
 * word at or above @nbits are cleared afterwards.
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        const unsigned int nhalves = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;                /* index of the low half of the current word */

        for (lo = 0; lo < nhalves; lo += 2) {
                bitmap[lo / 2] = (unsigned long) buf[lo];
                /* An odd number of halves leaves the last high half at zero. */
                if (lo + 1 < nhalves)
                        bitmap[lo / 2] |= ((unsigned long) buf[lo + 1]) << 32;
        }

        /* Clear tail bits in last word beyond nbits. */
        if (nbits % BITS_PER_LONG)
                bitmap[(nhalves - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - split a bitmap into an array of 32-bit chunks
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Each source word yields its low half first and then, if still within
 * DIV_ROUND_UP(@nbits, 32) elements, its high half.  Bits of the last
 * written element at or above @nbits are cleared afterwards.
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int nhalves = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;                /* index of the low half of the current word */

        for (lo = 0; lo < nhalves; lo += 2) {
                buf[lo] = (u32) (bitmap[lo / 2] & UINT_MAX);
                if (lo + 1 < nhalves)
                        buf[lo + 1] = (u32) (bitmap[lo / 2] >> 32);
        }

        /* Clear tail bits in last element of array beyond nbits. */
        if (nbits % BITS_PER_LONG)
                buf[nhalves - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);

#endif


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Helper functions for BLAKE2s implementations.
 * Keep this in sync with the corresponding BLAKE2b header.
 */

#ifndef _CRYPTO_INTERNAL_BLAKE2S_H
#define _CRYPTO_INTERNAL_BLAKE2S_H

#include <crypto/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/string.h>

void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
                              size_t nblocks, const u32 inc);

void blake2s_compress(struct blake2s_state *state, const u8 *block,
                      size_t nblocks, const u32 inc);

bool blake2s_selftest(void);

/*
 * Flag the state for last-block processing by storing -1 in f[0].
 * __blake2s_final() calls this before compressing the final block.
 */
static inline void blake2s_set_lastblock(struct blake2s_state *state)
{
        state->f[0] = -1;
}

/* Helper functions for BLAKE2s shared by the library and shash APIs */

/*
 * Absorb @inlen bytes from @in into @state.
 *
 * Input is staged in state->buf.  Whenever more data arrives than the
 * buffer can hold, the buffer is topped up and compressed, then as many
 * whole blocks as possible are compressed straight from @in - except
 * that the final (possibly full) block is always kept back in the
 * buffer, so between 1 and BLAKE2S_BLOCK_SIZE bytes remain staged for
 * __blake2s_final().
 *
 * @force_generic selects blake2s_compress_generic() over
 * blake2s_compress().  A zero @inlen is a no-op.
 */
static __always_inline void
__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen,
                 bool force_generic)
{
        /* Free space left in the staging buffer. */
        const size_t room = BLAKE2S_BLOCK_SIZE - state->buflen;

        if (unlikely(inlen == 0))
                return;

        if (inlen > room) {
                /* Complete the staged block and compress it. */
                memcpy(state->buf + state->buflen, in, room);
                if (!force_generic)
                        blake2s_compress(state, state->buf, 1,
                                         BLAKE2S_BLOCK_SIZE);
                else
                        blake2s_compress_generic(state, state->buf, 1,
                                                 BLAKE2S_BLOCK_SIZE);
                state->buflen = 0;
                in += room;
                inlen -= room;
        }

        if (inlen > BLAKE2S_BLOCK_SIZE) {
                /* Hash one less (full) block than strictly possible */
                const size_t direct = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE) - 1;
                const size_t consumed = BLAKE2S_BLOCK_SIZE * direct;

                if (!force_generic)
                        blake2s_compress(state, in, direct,
                                         BLAKE2S_BLOCK_SIZE);
                else
                        blake2s_compress_generic(state, in, direct,
                                                 BLAKE2S_BLOCK_SIZE);
                in += consumed;
                inlen -= consumed;
        }

        /* Stage whatever is left for the next update or the final call. */
        memcpy(state->buf + state->buflen, in, inlen);
        state->buflen += inlen;
}

/*
 * Finish the hash: mark the last block, zero-pad the staged data to a
 * full block, compress it with the staged byte count as the increment,
 * convert state->h to little-endian in place and copy state->outlen
 * bytes of it to @out.
 *
 * @force_generic selects blake2s_compress_generic() over
 * blake2s_compress().
 */
static __always_inline void
__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic)
{
        /* Number of padding bytes needed to fill the staging buffer. */
        const size_t pad = BLAKE2S_BLOCK_SIZE - state->buflen;

        blake2s_set_lastblock(state);
        memset(state->buf + state->buflen, 0, pad); /* Padding */

        if (!force_generic)
                blake2s_compress(state, state->buf, 1, state->buflen);
        else
                blake2s_compress_generic(state, state->buf, 1, state->buflen);

        cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
        memcpy(out, state->h, state->outlen);
}

/* Helper functions for shash implementations of BLAKE2s */

/*
 * Per-transform context for the shash wrappers: the key installed by
 * crypto_blake2s_setkey() and its length in bytes.
 */
struct blake2s_tfm_ctx {
        u8 key[BLAKE2S_KEY_SIZE];        /* key bytes; only the first keylen are set */
        unsigned int keylen;                /* number of valid bytes in key[] */
};

/*
 * Store @key in the transform context of @tfm.
 *
 * Returns 0 on success, or -EINVAL if @keylen is zero or larger than
 * BLAKE2S_KEY_SIZE (in which case the context is left untouched).
 */
static inline int crypto_blake2s_setkey(struct crypto_shash *tfm,
                                        const u8 *key, unsigned int keylen)
{
        struct blake2s_tfm_ctx *ctx = crypto_shash_ctx(tfm);

        if (keylen < 1 || keylen > BLAKE2S_KEY_SIZE)
                return -EINVAL;

        ctx->keylen = keylen;
        memcpy(ctx->key, key, keylen);

        return 0;
}

/*
 * Initialise the per-request hash state from the transform's digest
 * size and the key stored in its context.  Always returns 0.
 */
static inline int crypto_blake2s_init(struct shash_desc *desc)
{
        const struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
        struct blake2s_state *state = shash_desc_ctx(desc);

        __blake2s_init(state, crypto_shash_digestsize(desc->tfm),
                       tctx->key, tctx->keylen);

        return 0;
}

/*
 * shash wrapper around __blake2s_update() operating on the state kept
 * in the descriptor context.  Always returns 0.
 */
static inline int crypto_blake2s_update(struct shash_desc *desc,
                                        const u8 *in, unsigned int inlen,
                                        bool force_generic)
{
        __blake2s_update(shash_desc_ctx(desc), in, inlen, force_generic);

        return 0;
}

/*
 * shash wrapper around __blake2s_final() operating on the state kept
 * in the descriptor context.  Always returns 0.
 */
static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out,
                                       bool force_generic)
{
        __blake2s_final(shash_desc_ctx(desc), out, force_generic);

        return 0;
}

#endif /* _CRYPTO_INTERNAL_BLAKE2S_H */



















































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_IP_TUNNELS_H
#define __NET_IP_TUNNELS_H 1

#include <linux/if_tunnel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/u64_stats_sync.h>
#include <linux/bitops.h>

#include <net/dsfield.h>
#include <net/gro_cells.h>
#include <net/inet_ecn.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/lwtunnel.h>
#include <net/dst_cache.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/* Keep error state on tunnel for 30 sec */
#define IPTUNNEL_ERR_TIMEO        (30*HZ)

/* Offset just past the last member (tp_dst) of struct ip_tunnel_key.
 * Used to memset ip_tunnel padding: anything between this offset and
 * sizeof(struct ip_tunnel_key) is trailing padding.
 */
#define IP_TUNNEL_KEY_SIZE        offsetofend(struct ip_tunnel_key, tp_dst)

/* Used to memset ipv4 address padding: the offset just past u.ipv4.dst,
 * and the number of bytes of the address union not covered by the IPv4
 * src/dst pair.
 */
#define IP_TUNNEL_KEY_IPV4_PAD        offsetofend(struct ip_tunnel_key, u.ipv4.dst)
#define IP_TUNNEL_KEY_IPV4_PAD_LEN                                \
        (sizeof_field(struct ip_tunnel_key, u) -                \
         sizeof_field(struct ip_tunnel_key, u.ipv4))

/* Tunnel key: tunnel id, outer addresses (IPv4 or IPv6), flags, TOS/TTL,
 * flow label and transport ports.  ip_tunnel_key_init() fills in the
 * IPv4 form and zeroes the unused bytes.
 */
struct ip_tunnel_key {
        __be64                        tun_id;
        /* Outer addresses; the ipv4 member is smaller than the ipv6 one. */
        union {
                struct {
                        __be32        src;
                        __be32        dst;
                } ipv4;
                struct {
                        struct in6_addr src;
                        struct in6_addr dst;
                } ipv6;
        } u;
        __be16                        tun_flags;
        u8                        tos;                /* TOS for IPv4, TC for IPv6 */
        u8                        ttl;                /* TTL for IPv4, HL for IPv6 */
        __be32                        label;                /* Flow Label for IPv6 */
        __be16                        tp_src;
        __be16                        tp_dst;
};

/* Flags for ip_tunnel_info mode. */
#define IP_TUNNEL_INFO_TX        0x01        /* represents tx tunnel parameters */
#define IP_TUNNEL_INFO_IPV6        0x02        /* key contains IPv6 addresses */
#define IP_TUNNEL_INFO_BRIDGE        0x04        /* represents a bridged tunnel id */

/* Maximum tunnel options length: the largest value that fits in the
 * options_len field of struct ip_tunnel_info (a mask covering every bit
 * of that field).
 */
#define IP_TUNNEL_OPTS_MAX                                        \
        GENMASK((sizeof_field(struct ip_tunnel_info,                \
                              options_len) * BITS_PER_BYTE) - 1, 0)

/* Tunnel metadata: the key plus option bookkeeping.  The option bytes
 * themselves are not a member; ip_tunnel_info_opts() locates them
 * directly after the structure (info + 1).
 */
struct ip_tunnel_info {
        struct ip_tunnel_key        key;
#ifdef CONFIG_DST_CACHE
        struct dst_cache        dst_cache;
#endif
        u8                        options_len;        /* bytes of options following the struct */
        u8                        mode;                /* IP_TUNNEL_INFO_* flags */
};

/* 6rd prefix/relay information */
#ifdef CONFIG_IPV6_SIT_6RD
/* 6rd parameters: an IPv6 prefix and an IPv4 relay prefix, each with
 * its length.  Embedded in struct ip_tunnel as ip6rd.
 */
struct ip_tunnel_6rd_parm {
        struct in6_addr                prefix;
        __be32                        relay_prefix;
        u16                        prefixlen;
        u16                        relay_prefixlen;
};
#endif

/* Encapsulation settings of a tunnel.  @type selects the entry of
 * iptun_encaps[] used by ip_encap_hlen() and ip_tunnel_encap();
 * TUNNEL_ENCAP_NONE means no extra encapsulation.
 */
struct ip_tunnel_encap {
        u16                        type;
        u16                        flags;
        __be16                        sport;
        __be16                        dport;
};

/* One entry of a tunnel's potential router list (the prl member of
 * struct ip_tunnel): an RCU-protected singly linked list node.
 */
struct ip_tunnel_prl_entry {
        struct ip_tunnel_prl_entry __rcu *next;
        __be32                                addr;
        u16                                flags;
        struct rcu_head                        rcu_head;
};

struct metadata_dst;

/* Per-device state of an IP tunnel. */
struct ip_tunnel {
        struct ip_tunnel __rcu        *next;
        struct hlist_node hash_node;
        struct net_device        *dev;
        struct net                *net;        /* netns for packet i/o */

        unsigned long        err_time;        /* Time when the last ICMP error
                                         * arrived */
        int                err_count;        /* Number of arrived ICMP errors */

        /* These three fields used only by GRE */
        u32                i_seqno;        /* The last seen seqno        */
        atomic_t        o_seqno;        /* The last output seqno */
        int                tun_hlen;        /* Precalculated header length */

        /* These four fields used only by ERSPAN */
        u32                index;                /* ERSPAN type II index */
        u8                erspan_ver;        /* ERSPAN version */
        u8                dir;                /* ERSPAN direction */
        u16                hwid;                /* ERSPAN hardware ID */

        struct dst_cache dst_cache;

        struct ip_tunnel_parm parms;

        int                mlink;
        int                encap_hlen;        /* Encap header length (FOU,GUE) */
        int                hlen;                /* tun_hlen + encap_hlen */
        struct ip_tunnel_encap encap;

        /* for SIT */
#ifdef CONFIG_IPV6_SIT_6RD
        struct ip_tunnel_6rd_parm ip6rd;
#endif
        struct ip_tunnel_prl_entry __rcu *prl;        /* potential router list */
        unsigned int                prl_count;        /* # of entries in PRL */
        unsigned int                ip_tnl_net_id;
        struct gro_cells        gro_cells;
        __u32                        fwmark;
        bool                        collect_md;
        bool                        ignore_df;
};

/* Parsed tunnel header information; passed to ip_tunnel_rcv(). */
struct tnl_ptk_info {
        __be16 flags;
        __be16 proto;
        __be32 key;
        __be32 seq;
        int hdr_len;
};

/* Packet disposition codes. */
#define PACKET_RCVD        0
#define PACKET_REJECT        1
#define PACKET_NEXT        2

/* The per-netns tunnel table (struct ip_tunnel_net) has
 * 2^IP_TNL_HASH_BITS = 128 hash buckets.
 */
#define IP_TNL_HASH_BITS   7
#define IP_TNL_HASH_SIZE   (1 << IP_TNL_HASH_BITS)

/* Per-network-namespace tunnel state: the fallback device, the link ops
 * and the hash table of tunnels.
 */
struct ip_tunnel_net {
        struct net_device *fb_tunnel_dev;
        struct rtnl_link_ops *rtnl_link_ops;
        struct hlist_head tunnels[IP_TNL_HASH_SIZE];        /* hash buckets */
        struct ip_tunnel __rcu *collect_md_tun;
        int type;
};

/* Fill @key for an IPv4 tunnel.
 *
 * Every member is assigned from the corresponding argument.  The part
 * of the address union not used by the IPv4 addresses and any padding
 * after the last member are zeroed, so no stale bytes remain in *key.
 */
static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
                                      __be32 saddr, __be32 daddr,
                                      u8 tos, u8 ttl, __be32 label,
                                      __be16 tp_src, __be16 tp_dst,
                                      __be64 tun_id, __be16 tun_flags)
{
        unsigned char *raw = (unsigned char *)key;

        key->tun_id = tun_id;
        key->tun_flags = tun_flags;

        /* IPv4 addresses, then wipe the rest of the address union. */
        key->u.ipv4.src = saddr;
        key->u.ipv4.dst = daddr;
        memset(raw + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN);

        key->tos = tos;
        key->ttl = ttl;
        key->label = label;

        /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
         * the upper tunnel are used.
         * E.g: GRE over IPSEC, the tp_src and tp_dst are zero.
         */
        key->tp_src = tp_src;
        key->tp_dst = tp_dst;

        /* Clear struct padding. */
        if (IP_TUNNEL_KEY_SIZE != sizeof(*key))
                memset(raw + IP_TUNNEL_KEY_SIZE, 0,
                       sizeof(*key) - IP_TUNNEL_KEY_SIZE);
}

/* Decide whether the dst cache may be used for @skb.
 *
 * Returns false when the skb carries a mark, or when @info is given and
 * its key has TUNNEL_NOCACHE set; true otherwise (including when @info
 * is NULL).
 */
static inline bool
ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
                           const struct ip_tunnel_info *info)
{
        if (skb->mark)
                return false;

        return !info || !(info->key.tun_flags & TUNNEL_NOCACHE);
}

/* Address family of the tunnel key: AF_INET6 when the mode has
 * IP_TUNNEL_INFO_IPV6 set, AF_INET otherwise.
 */
static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
                                               *tun_info)
{
        if (tun_info->mode & IP_TUNNEL_INFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/* Widen a 32-bit big-endian key to a 64-bit big-endian tunnel id, with
 * the key occupying the least-significant 32 bits of the id.
 */
static inline __be64 key32_to_tunnel_id(__be32 key)
{
#ifdef __BIG_ENDIAN
        /* Host order equals network order: a plain widening is enough. */
        return (__force __be64)key;
#else
        /* On little-endian hosts the last four bytes in memory are the
         * upper half of the raw 64-bit value, hence the shift.
         */
        return (__force __be64)((__force u64)key << 32);
#endif
}

/* Returns the least-significant 32 bits of a __be64.
 * This is the inverse of key32_to_tunnel_id().
 */
static inline __be32 tunnel_id_to_key32(__be64 tun_id)
{
#ifdef __BIG_ENDIAN
        /* Truncation keeps the low 32 bits of the value. */
        return (__force __be32)tun_id;
#else
        /* On little-endian hosts those bytes sit in the upper half of
         * the raw 64-bit value, hence the shift.
         */
        return (__force __be32)((__force u64)tun_id >> 32);
#endif
}

#ifdef CONFIG_INET

/* Initialise @fl4 for a tunnel route lookup.
 *
 * The flow is zeroed first.  When @oif is non-zero its l3mdev master
 * ifindex is looked up; if one is found the lookup is bound to the
 * l3mdev only, otherwise to @oif itself.  The remaining arguments are
 * copied into the matching flow fields.
 */
static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
                                       int proto,
                                       __be32 daddr, __be32 saddr,
                                       __be32 key, __u8 tos,
                                       struct net *net, int oif,
                                       __u32 mark, __u32 tun_inner_hash)
{
        memset(fl4, 0, sizeof(*fl4));

        if (oif) {
                fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif);
                /* Legacy VRF/l3mdev use case */
                if (fl4->flowi4_l3mdev)
                        fl4->flowi4_oif = 0;
                else
                        fl4->flowi4_oif = oif;
        }

        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->flowi4_tos = tos;
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
        fl4->flowi4_mark = mark;
        fl4->flowi4_multipath_hash = tun_inner_hash;
}

int ip_tunnel_init(struct net_device *dev);
void ip_tunnel_uninit(struct net_device *dev);
void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
struct net *ip_tunnel_get_link_net(const struct net_device *dev);
int ip_tunnel_get_iflink(const struct net_device *dev);
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname);

void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id,
                           struct rtnl_link_ops *ops);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, const u8 protocol);
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       const u8 proto, int tunnel_hlen);
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);

void ip_tunnel_get_stats64(struct net_device *dev,
                           struct rtnl_link_stats64 *tot);
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key);

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark);
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);

extern const struct header_ops ip_tunnel_header_ops;
__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);

/* Callbacks supplied by an encapsulation type registered in
 * iptun_encaps[].
 */
struct ip_tunnel_encap_ops {
        /* Invoked by ip_encap_hlen(). */
        size_t (*encap_hlen)(struct ip_tunnel_encap *e);
        /* Invoked by ip_tunnel_encap(). */
        int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
                            u8 *protocol, struct flowi4 *fl4);
        int (*err_handler)(struct sk_buff *skb, u32 info);
};

/* Number of slots in iptun_encaps[]; encap types at or above this value
 * are rejected with -EINVAL by ip_encap_hlen() and ip_tunnel_encap().
 */
#define MAX_IPTUN_ENCAP_OPS 8

extern const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS];

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op,
                            unsigned int num);
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
                            unsigned int num);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap);

/* Check that the base network header for skb->protocol can be pulled.
 *
 * The length passed to pskb_network_may_pull() is that of an IPv4
 * header for ETH_P_IP, of an IPv6 header for ETH_P_IPV6 (only when IPv6
 * is enabled), and 0 for any other protocol.
 */
static inline bool pskb_inet_may_pull(struct sk_buff *skb)
{
        int nhlen = 0;

        if (skb->protocol == htons(ETH_P_IP))
                nhlen = sizeof(struct iphdr);
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6))
                nhlen = sizeof(struct ipv6hdr);
#endif

        return pskb_network_may_pull(skb, nhlen);
}

/* Variant of pskb_inet_may_pull() that accounts for a MAC header and
 * VLAN tags in front of the network header.
 *
 * The MAC length starts at 0 when @inner_proto_inherit is set and at
 * ETH_HLEN otherwise.  On success the network header offset is set to
 * the MAC length and true is returned; false means the headers could
 * not be pulled.
 */
static inline bool skb_vlan_inet_prepare(struct sk_buff *skb,
                                         bool inner_proto_inherit)
{
        int maclen = inner_proto_inherit ? 0 : ETH_HLEN;
        __be16 type = skb->protocol;
        int nhlen = 0;

        /* Essentially this is skb_protocol(skb, true)
         * And we get MAC len.
         */
        if (eth_type_vlan(type))
                type = __vlan_get_protocol(skb, type, &maclen);

        if (type == htons(ETH_P_IP))
                nhlen = sizeof(struct iphdr);
#if IS_ENABLED(CONFIG_IPV6)
        else if (type == htons(ETH_P_IPV6))
                nhlen = sizeof(struct ipv6hdr);
#endif

        /* For ETH_P_IPV6/ETH_P_IP we make sure to pull
         * a base network header in skb->head.
         */
        if (pskb_may_pull(skb, maclen + nhlen)) {
                skb_set_network_header(skb, maclen);
                return true;
        }

        return false;
}

/* Header length added by the encapsulation described by @e.
 *
 * Returns 0 for TUNNEL_ENCAP_NONE, the value reported by the registered
 * encap_hlen() callback otherwise, or -EINVAL when the type is out of
 * range or no usable callback is registered for it.
 */
static inline int ip_encap_hlen(struct ip_tunnel_encap *e)
{
        const struct ip_tunnel_encap_ops *ops;
        int hlen;

        if (e->type == TUNNEL_ENCAP_NONE)
                return 0;

        if (e->type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        /* The ops table entry is only dereferenced under rcu_read_lock(). */
        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[e->type]);
        if (likely(ops && ops->encap_hlen))
                hlen = ops->encap_hlen(e);
        else
                hlen = -EINVAL;
        rcu_read_unlock();

        return hlen;
}

/* Build the encapsulation header configured on tunnel @t.
 *
 * Returns 0 for TUNNEL_ENCAP_NONE, the result of the registered
 * build_header() callback otherwise, or -EINVAL when the type is out of
 * range or no usable callback is registered for it.
 */
static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
                                  u8 *protocol, struct flowi4 *fl4)
{
        const struct ip_tunnel_encap_ops *ops;
        int ret;

        if (t->encap.type == TUNNEL_ENCAP_NONE)
                return 0;

        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        /* The ops table entry is only dereferenced under rcu_read_lock(). */
        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[t->encap.type]);
        if (likely(ops && ops->build_header))
                ret = ops->build_header(skb, &t->encap, protocol, fl4);
        else
                ret = -EINVAL;
        rcu_read_unlock();

        return ret;
}

/* Extract dsfield from inner protocol: the TOS byte for IPv4, the
 * result of ipv6_get_dsfield() for IPv6, and 0 for anything else.
 * @iph is reinterpreted as an IPv6 header in the IPv6 case.
 */
static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
                                       const struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return iph->tos;
        case htons(ETH_P_IPV6):
                return ipv6_get_dsfield((const struct ipv6hdr *)iph);
        default:
                return 0;
        }
}

/* Extract the TTL from the inner protocol: ttl for IPv4, hop_limit for
 * IPv6, and 0 for anything else.  @iph is reinterpreted as an IPv6
 * header in the IPv6 case.
 */
static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
                                       const struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return iph->ttl;
        case htons(ETH_P_IPV6):
                return ((const struct ipv6hdr *)iph)->hop_limit;
        default:
                return 0;
        }
}

/* Propagate ECN bits out: combine the outer @tos with the dsfield of
 * the inner header via INET_ECN_encapsulate().
 */
static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
                                     const struct sk_buff *skb)
{
        return INET_ECN_encapsulate(tos, ip_tunnel_get_dsfield(iph, skb));
}

int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
                           __be16 inner_proto, bool raw_proto, bool xnet);

/* Convenience wrapper: __iptunnel_pull_header() with raw_proto fixed to
 * false.  The return value is passed through unchanged.
 */
static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
                                       __be16 inner_proto, bool xnet)
{
        return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet);
}

void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
                   __be32 src, __be32 dst, u8 proto,
                   u8 tos, u8 ttl, __be16 df, bool xnet);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
                                             gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
                          int headroom, bool reply);

/* Raise dev->needed_headroom to @headroom (capped at 512) if that is
 * larger than the current value; the value is never lowered.
 */
static inline void ip_tunnel_adj_headroom(struct net_device *dev,
                                          unsigned int headroom)
{
        /* we must cap headroom to some upperlimit, else pskb_expand_head
         * will overflow header offsets in skb_headers_offset_update().
         */
        const unsigned int max_allowed = 512;
        const unsigned int wanted = headroom > max_allowed ? max_allowed
                                                           : headroom;

        if (wanted > READ_ONCE(dev->needed_headroom))
                WRITE_ONCE(dev->needed_headroom, wanted);
}

int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);

/* Drop tunnel offload state from @skb.
 *
 * For GSO packets the skb is uncloned first and the encapsulation GSO
 * type bits are cleared; skb->encapsulation is then reset in all cases.
 *
 * Returns 0 on success or the error from skb_unclone(), in which case
 * the skb's offload state is left as it was.
 */
static inline int iptunnel_pull_offloads(struct sk_buff *skb)
{
        int err;

        if (skb_is_gso(skb)) {
                /* gso_type is about to be modified: need a private copy. */
                err = skb_unclone(skb, GFP_ATOMIC);
                if (unlikely(err))
                        return err;

                skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >>
                                               NETIF_F_GSO_SHIFT);
        }

        skb->encapsulation = 0;

        return 0;
}

/* Account the outcome of a tunnel transmit on @dev.
 *
 * @pkt_len > 0: add the packet and its bytes to the per-cpu tx counters.
 * @pkt_len < 0: bump tx_errors and tx_aborted_errors.
 * @pkt_len == 0: bump tx_dropped.
 */
static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
{
        if (pkt_len > 0) {
                struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats);

                u64_stats_update_begin(&tstats->syncp);
                tstats->tx_bytes += pkt_len;
                tstats->tx_packets++;
                u64_stats_update_end(&tstats->syncp);
                put_cpu_ptr(tstats);
        } else if (pkt_len < 0) {
                DEV_STATS_INC(dev, tx_errors);
                DEV_STATS_INC(dev, tx_aborted_errors);
        } else {
                DEV_STATS_INC(dev, tx_dropped);
        }
}

/* Address of the option bytes, which start directly after *info. */
static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
{
        return &info[1];
}

/* Copy info->options_len bytes of options (stored directly after *info)
 * into @to.
 */
static inline void ip_tunnel_info_opts_get(void *to,
                                           const struct ip_tunnel_info *info)
{
        const void *opts = info + 1;

        memcpy(to, opts, info->options_len);
}

/* Record @len as the option length of @info and, when @len is positive,
 * copy @len bytes from @from into the option area after *info and OR
 * @flags into the key's tun_flags.  With @len <= 0 only options_len is
 * written.
 */
static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
                                           const void *from, int len,
                                           __be16 flags)
{
        info->options_len = len;
        if (len <= 0)
                return;

        memcpy(ip_tunnel_info_opts(info), from, len);
        info->key.tun_flags |= flags;
}

/* View the data area of a lightweight tunnel state as tunnel info. */
static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
{
        return (struct ip_tunnel_info *)lwtstate->data;
}

DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);

/* Returns > 0 if metadata should be collected, i.e. when the
 * ip_tunnel_metadata_cnt static key is enabled; 0 otherwise.
 */
static inline int ip_tunnel_collect_metadata(void)
{
        if (static_branch_unlikely(&ip_tunnel_metadata_cnt))
                return 1;

        return 0;
}

void __init ip_tunnel_core_init(void);

void ip_tunnel_need_metadata(void);
void ip_tunnel_unneed_metadata(void);

#else /* CONFIG_INET */

/* Stub for builds without CONFIG_INET: there is never any tunnel info. */
static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
{
        return NULL;
}

/* Stub for builds without CONFIG_INET: does nothing. */
static inline void ip_tunnel_need_metadata(void)
{
}

/* Stub for builds without CONFIG_INET: does nothing. */
static inline void ip_tunnel_unneed_metadata(void)
{
}

/* Stub for builds without CONFIG_INET: copies nothing, @to is left
 * untouched.
 */
static inline void ip_tunnel_info_opts_get(void *to,
                                           const struct ip_tunnel_info *info)
{
}

/* Stub for builds without CONFIG_INET: ignores @from, @len and @flags
 * and records an option length of zero.
 */
static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
                                           const void *from, int len,
                                           __be16 flags)
{
        info->options_len = 0;
}

#endif /* CONFIG_INET */

#endif /* __NET_IP_TUNNELS_H */








































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 * Authors:        Lotsa people, from code originally in tcp
 */

#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H


#include <linux/interrupt.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>

#include <linux/refcount.h>
#include <asm/byteorder.h>

/* This is for all connections with a full identity, no wildcards.
 * The 'e' prefix stands for Established, but in practice the table
 * holds every socket except those in the LISTEN state.
 */
struct inet_ehash_bucket {
        /* Head of this bucket's hlist_nulls chain. */
        struct hlist_nulls_head chain;
};

/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *        1) Sockets bound to different interfaces may share a local port.
 *           Failing that, goto test 2.
 *        2) If all sockets have sk->sk_reuse set, and none of them are in
 *           TCP_LISTEN state, the port may be shared.
 *           Failing that, goto test 3.
 *        3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
 *           address, and none of them are the same, the port may be
 *           shared.
 *           Failing this, the port cannot be shared.
 *
 * The interesting point, is test #2.  This is what an FTP server does
 * all day.  To optimize this case we use a specific flag bit defined
 * below.  As we add sockets to a bind bucket list, we perform a
 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
 * As long as all sockets added to a bind bucket pass this test,
 * the flag bit will be set.
 * The resulting situation is that tcp_v[46]_verify_bind() can just check
 * for this flag bit, if it is set and the socket trying to bind has
 * sk->sk_reuse set, we don't even have to walk the owners list at all,
 * we return that it is ok to bind this socket to the requested local port.
 *
 * Sounds like a lot of work, but it is worth it.  In a more naive
 * implementation (ie. current FreeBSD etc.) the entire list of ports
 * must be walked for each data port opened by an ftp server.  Needless
 * to say, this does not scale at all.  With a couple thousand FTP
 * users logged onto your box, isn't it nice to know that new data
 * ports are created in O(1) time?  I thought so. ;-)        -DaveM
 */
#define FASTREUSEPORT_ANY        1
#define FASTREUSEPORT_STRICT        2

/*
 * One entry of the bind hash; entries are linked into a bucket chain
 * through @node (see inet_bind_bucket_for_each()).
 * NOTE(review): the fast* members look like cached properties used by the
 * fastreuse/fastreuseport shortcuts described above - confirm against the
 * bind conflict code.
 */
struct inet_bind_bucket {
        /* Network namespace, read through ib_net(). */
        possible_net_t                ib_net;
        int                        l3mdev;
        unsigned short                port;
        signed char                fastreuse;
        /* NOTE(review): presumably 0 or one of FASTREUSEPORT_* - confirm. */
        signed char                fastreuseport;
        kuid_t                        fastuid;
#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                fast_v6_rcv_saddr;
#endif
        __be32                        fast_rcv_saddr;
        unsigned short                fast_sk_family;
        bool                        fast_ipv6_only;
        /* Linkage used by inet_bind_bucket_for_each(). */
        struct hlist_node        node;
        /* The "owners list" referred to in the comment above. */
        struct hlist_head        owners;
};

static inline struct net *ib_net(struct inet_bind_bucket *ib)
{
        return read_pnet(&ib->ib_net);
}

/*
 * Iterate @tb over every entry of the hlist @head, following the 'node'
 * member of each entry.
 */
#define inet_bind_bucket_for_each(tb, head) \
        hlist_for_each_entry(tb, head, node)

/* One bucket of the bind hash: a chain head plus a spinlock. */
struct inet_bind_hashbucket {
        /* NOTE(review): presumably protects @chain - confirm with users. */
        spinlock_t                lock;
        struct hlist_head        chain;
};

/* Sockets can be hashed in established or listening table.
 * We must use different 'nulls' end-of-chain value for all hash buckets :
 * A socket might transition from ESTABLISH to LISTEN state without
 * RCU grace period. A lookup in ehash table needs to handle this case.
 */
#define LISTENING_NULLS_BASE (1U << 29)
/*
 * One bucket of a listening hash table.  The chain head is stored in a
 * union, so the same storage can be accessed either as a plain hlist head
 * or as an hlist_nulls head.
 */
struct inet_listen_hashbucket {
        spinlock_t                lock;
        /* NOTE(review): presumably the number of sockets on the chain - confirm. */
        unsigned int                count;
        union {
                struct hlist_head        head;
                struct hlist_nulls_head        nulls_head;
        };
};

/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE        32        /* Yes, really, this is all you need. */

/*
 * Groups the socket hash tables of one protocol: the established hash
 * (ehash) with its lock array, the bind hash (bhash), the port+address
 * listener hash (lhash2) and the fixed-size listening_hash[] array.
 */
struct inet_hashinfo {
        /* This is for sockets with full identity only.  Sockets here will
         * always be without wildcards and will have the following invariant:
         *
         *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
         *
         */
        struct inet_ehash_bucket        *ehash;
        /* Lock array indexed by (hash & ehash_locks_mask), see inet_ehash_lockp(). */
        spinlock_t                        *ehash_locks;
        /* Index mask applied to the hash in inet_ehash_bucket(). */
        unsigned int                        ehash_mask;
        unsigned int                        ehash_locks_mask;

        /* Ok, let's try this, I give up, we do need a local binding
         * TCP hash as well as the others for fast bind/connect.
         */
        struct kmem_cache                *bind_bucket_cachep;
        struct inet_bind_hashbucket        *bhash;
        unsigned int                        bhash_size;

        /* The 2nd listener table hashed by local port and address */
        unsigned int                        lhash2_mask;
        struct inet_listen_hashbucket        *lhash2;

        /* All the above members are written once at bootup and
         * never written again _or_ are predominantly read-access.
         *
         * Now align to a new cache line as all the following members
         * might be often dirty.
         */
        /* All sockets in TCP_LISTEN state will be in listening_hash.
         * This is the only table where wildcard'd TCP sockets can
         * exist.  listening_hash is only hashed by local port number.
         * If lhash2 is initialized, the same socket will also be hashed
         * to lhash2 by port and address.
         */
        struct inet_listen_hashbucket        listening_hash[INET_LHTABLE_SIZE]
                                        ____cacheline_aligned_in_smp;
};

/*
 * RCU iteration of @__icsk over the hlist @list, following the
 * icsk_listen_portaddr_node member of each entry.
 */
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
        hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)

/* Select the lhash2 bucket of @h that @hash maps to (hash & lhash2_mask). */
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
        const u32 slot = hash & h->lhash2_mask;

        return h->lhash2 + slot;
}

/* Select the ehash bucket of @hashinfo that @hash maps to (hash & ehash_mask). */
static inline struct inet_ehash_bucket *inet_ehash_bucket(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
{
        const unsigned int slot = hash & hashinfo->ehash_mask;

        return hashinfo->ehash + slot;
}

/* Select the ehash lock that @hash maps to (hash & ehash_locks_mask). */
static inline spinlock_t *inet_ehash_lockp(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
{
        const unsigned int slot = hash & hashinfo->ehash_locks_mask;

        return hashinfo->ehash_locks + slot;
}

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);

/*
 * Release the lhash2 table of @h with kfree() and clear the pointer so a
 * stale value is not left behind.
 */
static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
{
        kfree(h->lhash2);
        h->lhash2 = NULL;
}

/*
 * Release the ehash lock array of @hashinfo with kvfree() and clear the
 * pointer so a stale value is not left behind.
 */
static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
        kvfree(hashinfo->ehash_locks);
        hashinfo->ehash_locks = NULL;
}

struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
                        struct inet_bind_hashbucket *head,
                        const unsigned short snum, int l3mdev);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
                              struct inet_bind_bucket *tb);

/*
 * Bind hash function: @lport plus the per-netns hash mix, masked with
 * (@bhash_size - 1).
 */
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
                               const u32 bhash_size)
{
        const u32 mask = bhash_size - 1;

        return (lport + net_hash_mix(net)) & mask;
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    const unsigned short snum);

/* These can have wildcards, don't try too hard. */
static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
{
        /* Port plus per-netns mix, reduced to an index into listening_hash[]. */
        const u32 mixed = num + net_hash_mix(net);

        return mixed & (INET_LHTABLE_SIZE - 1);
}

/* inet_lhashfn() applied to the netns and local port number of @sk. */
static inline int inet_sk_listen_hashfn(const struct sock *sk)
{
        const struct net *net = sock_net(sk);
        const unsigned short lport = inet_sk(sk)->inet_num;

        return inet_lhashfn(net, lport);
}

/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);

void inet_put_port(struct sock *sk);

void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
                         unsigned long numentries, int scale,
                         unsigned long low_limit,
                         unsigned long high_limit);
int inet_hashinfo2_init_mod(struct inet_hashinfo *h);

bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
                         bool *found_dup_sk);
int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);

struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
                                    const __be32 saddr, const __be16 sport,
                                    const __be32 daddr,
                                    const unsigned short hnum,
                                    const int dif, const int sdif);

/*
 * Wrapper around __inet_lookup_listener() that takes the destination port
 * in network byte order and converts it with ntohs() first.
 */
static inline struct sock *inet_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                struct sk_buff *skb, int doff,
                __be32 saddr, __be16 sport,
                __be32 daddr, __be16 dport, int dif, int sdif)
{
        const unsigned short hnum = ntohs(dport);

        return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
                                      daddr, hnum, dif, sdif);
}

/* Socket demux engine toys. */
/* What happens here is ugly; there's a pair of adjacent fields in
   struct inet_sock; __be16 dport followed by __u16 num.  We want to
   search by pair, so we combine the keys into a single 32bit value
   and compare with 32bit value read from &...->dport.  Let's at least
   make sure that it's not mixed with anything else...
   On 64bit targets we combine comparisons with pair of adjacent __be32
   fields in the same way.
*/
/*
 * Build a single 32-bit __portpair from @__sport (cast to __be16) and
 * @__dport.  Which of the two lands in the upper 16 bits depends on the
 * host endianness.
 */
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
        ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
        ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
#endif

/*
 * Declare a const __addrpair named @__name that packs @__saddr and
 * @__daddr (both cast to __be32) into one 64-bit value.  Which address
 * occupies the upper 32 bits depends on the host endianness.
 */
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
        const __addrpair __name = (__force __addrpair) ( \
                                   (((__force __u64)(__be32)(__saddr)) << 32) | \
                                   ((__force __u64)(__be32)(__daddr)))
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
        const __addrpair __name = (__force __addrpair) ( \
                                   (((__force __u64)(__be32)(__daddr)) << 32) | \
                                   ((__force __u64)(__be32)(__saddr)))
#endif /* __BIG_ENDIAN */

/*
 * Return true when @sk lives in @net, its port pair and address pair equal
 * @ports and @cookie, and its bound device is accepted by
 * inet_sk_bound_dev_eq() for @dif/@sdif.
 */
static inline bool INET_MATCH(struct net *net, const struct sock *sk,
                              const __addrpair cookie, const __portpair ports,
                              int dif, int sdif)
{
        int bound_dev_if;

        if (!net_eq(sock_net(sk), net))
                return false;
        if (sk->sk_portpair != ports)
                return false;
        if (sk->sk_addrpair != cookie)
                return false;

        /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
        bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);

        return inet_sk_bound_dev_eq(net, bound_dev_if, dif, sdif);
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
 * not check it for lookups anymore, thanks Alexey. -DaveM
 */
struct sock *__inet_lookup_established(struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const u16 hnum,
                                       const int dif, const int sdif);

typedef u32 (inet_ehashfn_t)(const struct net *net,
                              const __be32 laddr, const __u16 lport,
                              const __be32 faddr, const __be16 fport);

inet_ehashfn_t inet_ehashfn;

INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn);

struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
                                   struct sk_buff *skb, int doff,
                                   __be32 saddr, __be16 sport,
                                   __be32 daddr, unsigned short hnum,
                                   inet_ehashfn_t *ehashfn);

static inline struct sock *
        inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
                                const __be32 saddr, const __be16 sport,
                                const __be32 daddr, const __be16 dport,
                                const int dif)
{
        return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
                                         ntohs(dport), dif, 0);
}

/*
 * Look for a socket in the established table first and fall back to the
 * listener lookup.  *@refcounted is set to true when the result comes from
 * the established lookup and to false otherwise.
 */
static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         struct sk_buff *skb, int doff,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif, const int sdif,
                                         bool *refcounted)
{
        const u16 hnum = ntohs(dport);
        struct sock *established;

        established = __inet_lookup_established(net, hashinfo, saddr, sport,
                                                daddr, hnum, dif, sdif);
        if (established) {
                *refcounted = true;
                return established;
        }

        *refcounted = false;
        return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
                                      sport, daddr, hnum, dif, sdif);
}

/*
 * __inet_lookup() with sdif 0.  If the socket found was reported as not
 * refcounted, try to take a reference and return NULL when that fails.
 */
static inline struct sock *inet_lookup(struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       struct sk_buff *skb, int doff,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const __be16 dport,
                                       const int dif)
{
        bool refcounted;
        struct sock *sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport,
                                        daddr, dport, dif, 0, &refcounted);

        if (!sk || refcounted)
                return sk;

        /* The refcount already dropped to zero: treat as not found. */
        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        return sk;
}

/*
 * Return the socket taken from @skb by skb_steal_sock() when there is one;
 * otherwise run __inet_lookup() with the addresses from the IP header of
 * @skb, the netns of its dst device and inet_iif(@skb) as dif.
 */
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                                             struct sk_buff *skb,
                                             int doff,
                                             const __be16 sport,
                                             const __be16 dport,
                                             const int sdif,
                                             bool *refcounted)
{
        struct sock *stolen = skb_steal_sock(skb, refcounted);
        const struct iphdr *iph;
        struct net *net;

        if (stolen)
                return stolen;

        iph = ip_hdr(skb);
        net = dev_net(skb_dst(skb)->dev);

        return __inet_lookup(net, hashinfo, skb, doff,
                             iph->saddr, sport, iph->daddr, dport,
                             inet_iif(skb), sdif, refcounted);
}

/*
 * Store @addr as the IPv4 destination address of @sk.  With IPv6 enabled
 * the IPv6 destination address is set to the v4-mapped form of @addr too.
 */
static inline void sk_daddr_set(struct sock *sk, __be32 addr)
{
        sk->sk_daddr = addr; /* alias of inet_daddr */
#if IS_ENABLED(CONFIG_IPV6)
        ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr);
#endif
}

/*
 * Store @addr as the IPv4 receive source address of @sk.  With IPv6
 * enabled the IPv6 counterpart is set to the v4-mapped form of @addr too.
 */
static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
{
        sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */
#if IS_ENABLED(CONFIG_IPV6)
        ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr);
#endif
}

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                        struct sock *sk, u64 port_offset,
                        int (*check_established)(struct inet_timewait_death_row *,
                                                 struct sock *, __u16,
                                                 struct inet_timewait_sock **));

int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk);
#endif /* _INET_HASHTABLES_H */

































    2 








    2 




    1 

















    2 








    3 


    3 
















    3 







    4 
    4 

    4 








    4 


    4 


    3 





    2 








































































    1 





































































































































    3 
































    1 

    1 

    1 
    1 


    1 

    1 

    1 
    1 

    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
 *
 * Unlike fsnotify_parent(), the event will be reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
 * the child is interested and not the parent.
 */
static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
                                struct inode *dir, const struct qstr *name,
                                u32 cookie)
{
        struct super_block *sb = dir->i_sb;

        /* Connector count of this superblock is zero: skip fsnotify(). */
        if (!atomic_long_read(&sb->s_fsnotify_connectors))
                return 0;

        return fsnotify(mask, data, data_type, dir, name, NULL, cookie);
}

/*
 * Report @mask on @dir for the entry @dentry, using the dentry itself as
 * event data and its d_name as the entry name.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        const struct qstr *entry_name = &dentry->d_name;

        fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, entry_name, 0);
}

/*
 * Report @mask on @inode itself.  FS_ISDIR is added to the mask when the
 * inode is a directory.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        struct super_block *sb = inode->i_sb;

        /* Connector count of this superblock is zero: skip fsnotify(). */
        if (!atomic_long_read(&sb->s_fsnotify_connectors))
                return;

        fsnotify(S_ISDIR(inode->i_mode) ? (mask | FS_ISDIR) : mask,
                 inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0);
}

/* Notify this dentry's parent about a child's events. */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);
        bool via_parent = true;

        if (!atomic_long_read(&inode->i_sb->s_fsnotify_connectors))
                return 0;

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                        via_parent = false;
        }

        /* disconnected dentry cannot notify parent */
        if (via_parent && IS_ROOT(dentry))
                via_parent = false;

        if (via_parent)
                return __fsnotify_parent(dentry, mask, data, data_type);

        /* Report on the child inode only. */
        return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);
}

/*
 * Simple wrappers to consolidate calls to fsnotify_parent() when an event
 * is on a file/dentry.
 */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        /* The dentry is both the event target and the event data; the
         * return value of fsnotify_parent() is ignored.
         */
        fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}

/*
 * Report @mask for @file, passing its f_path as event data.  Returns 0
 * without reporting for FMODE_NONOTIFY and FMODE_PATH files, otherwise the
 * result of fsnotify_parent().
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        const struct path *path;

        /*
         * FMODE_NONOTIFY are fds generated by fanotify itself which should not
         * generate new events. We also don't want to generate events for
         * FMODE_PATH fds (involves open & close events) as they are just
         * handle creation / destruction events and not "real" file events.
         */
        if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH))
                return 0;

        path = &file->f_path;
        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

/* Simple call site for access decisions */
static inline int fsnotify_perm(struct file *file, int mask)
{
        if (mask & MAY_OPEN) {
                /* An exec open is reported as FS_OPEN_EXEC_PERM first; a
                 * non-zero result ends the check right there.
                 */
                if (file->f_flags & __FMODE_EXEC) {
                        int err = fsnotify_file(file, FS_OPEN_EXEC_PERM);

                        if (err)
                                return err;
                }
                return fsnotify_file(file, FS_OPEN_PERM);
        }

        if (mask & MAY_READ)
                return fsnotify_file(file, FS_ACCESS_PERM);

        /* Neither MAY_OPEN nor MAY_READ requested: nothing to report. */
        return 0;
}

/*
 * fsnotify_link_count - inode's link count changed
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        /* A link count change is reported as FS_ATTRIB on the inode. */
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        /* Added to every directory-entry event when a directory is moved. */
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;
        struct inode *source = moved->d_inode;
        const struct qstr *new_name = &moved->d_name;
        /* The same cookie goes into both the MOVED_FROM and MOVED_TO event. */
        u32 fs_cookie = fsnotify_get_cookie();

        /* Event with information about both old and new parent+name */
        fsnotify_name(FS_RENAME | dir_flag, moved, FSNOTIFY_EVENT_DENTRY,
                      old_dir, old_name, 0);

        fsnotify_name(FS_MOVED_FROM | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      old_dir, old_name, fs_cookie);
        fsnotify_name(FS_MOVED_TO | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      new_dir, new_name, fs_cookie);

        /* A non-NULL @target gets a link count event. */
        if (target)
                fsnotify_link_count(target);

        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        /* Thin wrapper: all the work is done by __fsnotify_vfsmount_delete(). */
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_inoderemove - an inode is going away
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        /* Report FS_DELETE_SELF first, then run the inode clean up. */
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE);
}

/*
 * fsnotify_link - new hardlink in 'inode' directory
 *
 * Caller must make sure that new_dentry->d_name is stable.
 * Note: We have to pass also the linked inode ptr as some filesystems leave
 *   new_dentry->d_inode NULL and instantiate inode pointer later
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        const struct qstr *link_name = &new_dentry->d_name;

        /* The linked inode gets a link count event ... */
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        /* ... and the directory an FS_CREATE event for the new name. */
        fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, dir, link_name,
                      0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
 * as this may be called after d_delete() and old_dentry may be negative.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        /* FS_ISDIR is added when the unlinked inode is a directory. */
        const __u32 mask = S_ISDIR(inode->i_mode) ? (FS_DELETE | FS_ISDIR)
                                                  : FS_DELETE;
        const struct qstr *entry_name = &dentry->d_name;

        fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, entry_name, 0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory passed on to fsnotify_delete()
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /* Hold a reference so @inode can still be passed to
         * fsnotify_delete() after d_delete(); it is dropped right after.
         */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - 'name' was unlinked
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        /* A negative dentry has no inode to report: warn once and bail. */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - directory 'name' was removed
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        /* A negative dentry has no inode to report: warn once and bail. */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 */
static inline void fsnotify_access(struct file *file)
{
        /* Reported as FS_ACCESS; the result of fsnotify_file() is ignored. */
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 */
static inline void fsnotify_modify(struct file *file)
{
        /* Reported as FS_MODIFY; the result of fsnotify_file() is ignored. */
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - file was opened
 */
static inline void fsnotify_open(struct file *file)
{
        /* Opens flagged __FMODE_EXEC additionally carry FS_OPEN_EXEC. */
        fsnotify_file(file, (file->f_flags & __FMODE_EXEC) ?
                            (FS_OPEN | FS_OPEN_EXEC) : FS_OPEN);
}

/*
 * fsnotify_close - file was closed
 */
static inline void fsnotify_close(struct file *file)
{
        __u32 mask;

        /* The event type depends on whether the file was open for writing. */
        if (file->f_mode & FMODE_WRITE)
                mask = FS_CLOSE_WRITE;
        else
                mask = FS_CLOSE_NOWRITE;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_xattr - extended attributes were changed
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        /* An xattr change is reported as FS_ATTRIB on the dentry. */
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event: translate the ATTR_* bits in
 * @ia_valid into FS_* events for @dentry.
 *
 * UID, GID and mode changes map to FS_ATTRIB and a size change to
 * FS_MODIFY.  Setting both atime and mtime maps to FS_ATTRIB; atime alone
 * maps to FS_ACCESS and mtime alone to FS_MODIFY.  Nothing is sent when no
 * bit of interest is set.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        const unsigned int times = ia_valid & (ATTR_ATIME | ATTR_MTIME);
        __u32 mask = 0;

        /* Ownership and mode changes are all attribute changes. */
        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                mask |= FS_ATTRIB;
        if (ia_valid & ATTR_SIZE)
                mask |= FS_MODIFY;

        /* both times implies a utime(s) call */
        if (times == (ATTR_ATIME | ATTR_MTIME))
                mask |= FS_ATTRIB;
        else if (times & ATTR_ATIME)
                mask |= FS_ACCESS;
        else if (times & ATTR_MTIME)
                mask |= FS_MODIFY;

        if (!mask)
                return;

        fsnotify_dentry(dentry, mask);
}

static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
                                    int error)
{
        struct fs_error_report report = {
                .error = error,
                .inode = inode,
                .sb = sb,
        };

        return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
                        NULL, NULL, NULL, 0);
}

#endif        /* _LINUX_FS_NOTIFY_H */


































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Authentication token and access key management internal defs
 *
 * Copyright (C) 2003-5, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _INTERNAL_H
#define _INTERNAL_H

#include <linux/sched.h>
#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/key-type.h>
#include <linux/task_work.h>
#include <linux/keyctl.h>
#include <linux/refcount.h>
#include <linux/watch_queue.h>
#include <linux/compat.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

struct iovec;

#ifdef __KDEBUG
#define kenter(FMT, ...) \
        printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
        printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
        printk(KERN_DEBUG "   "FMT"\n", ##__VA_ARGS__)
#else
#define kenter(FMT, ...) \
        no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
        no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
        no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
#endif

extern struct key_type key_type_dead;
extern struct key_type key_type_user;
extern struct key_type key_type_logon;

/*****************************************************************************/
/*
 * Keep track of keys for a user.
 *
 * This needs to be separate from user_struct to avoid a refcount loop
 * (user_struct pins some keyrings which pin this struct).
 *
 * We also keep track of keys under request from userspace for this UID here.
 */
struct key_user {
        struct rb_node                node;                /* rb-tree linkage */
        struct mutex                cons_lock;        /* construction initiation lock */
        spinlock_t                lock;                /* NOTE(review): presumably guards qnkeys & qnbytes - confirm */
        refcount_t                usage;                /* reference count; the original note here read "for accessing qnkeys & qnbytes" */
        atomic_t                nkeys;                /* number of keys */
        atomic_t                nikeys;                /* number of instantiated keys */
        kuid_t                        uid;                /* user this record is for */
        int                        qnkeys;                /* number of keys allocated to this user */
        int                        qnbytes;        /* number of bytes allocated to this user */
};

extern struct rb_root        key_user_tree;
extern spinlock_t        key_user_lock;
extern struct key_user        root_key_user;

extern struct key_user *key_user_lookup(kuid_t uid);
extern void key_user_put(struct key_user *user);

/*
 * Key quota limits.
 * - root has its own separate limits to everyone else
 */
extern unsigned key_quota_root_maxkeys;
extern unsigned key_quota_root_maxbytes;
extern unsigned key_quota_maxkeys;
extern unsigned key_quota_maxbytes;

#define KEYQUOTA_LINK_BYTES        4                /* a link in a keyring is worth 4 bytes */


extern struct kmem_cache *key_jar;
extern struct rb_root key_serial_tree;
extern spinlock_t key_serial_lock;
extern struct mutex key_construction_mutex;
extern wait_queue_head_t request_key_conswq;

extern void key_set_index_key(struct keyring_index_key *index_key);
extern struct key_type *key_type_lookup(const char *type);
extern void key_type_put(struct key_type *ktype);

extern int __key_link_lock(struct key *keyring,
                           const struct keyring_index_key *index_key);
extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
                           const struct keyring_index_key *index_key);
extern int __key_link_begin(struct key *keyring,
                            const struct keyring_index_key *index_key,
                            struct assoc_array_edit **_edit);
extern int __key_link_check_live_key(struct key *keyring, struct key *key);
extern void __key_link(struct key *keyring, struct key *key,
                       struct assoc_array_edit **_edit);
extern void __key_link_end(struct key *keyring,
                           const struct keyring_index_key *index_key,
                           struct assoc_array_edit *edit);

extern key_ref_t find_key_to_update(key_ref_t keyring_ref,
                                    const struct keyring_index_key *index_key);

extern struct key *keyring_search_instkey(struct key *keyring,
                                          key_serial_t target_id);

extern int iterate_over_keyring(const struct key *keyring,
                                int (*func)(const struct key *key, void *data),
                                void *data);

/*
 * Parameters and working state for a keyring search, as passed to
 * keyring_search_rcu() and the search_*_keyrings_rcu() helpers.
 */
struct keyring_search_context {
        struct keyring_index_key index_key;        /* index key being searched for */
        const struct cred        *cred;                /* credentials the search runs with */
        struct key_match_data        match_data;        /* match criteria */
        unsigned                flags;                /* KEYRING_SEARCH_* bits below */
#define KEYRING_SEARCH_NO_STATE_CHECK        0x0001        /* Skip state checks */
#define KEYRING_SEARCH_DO_STATE_CHECK        0x0002        /* Override NO_STATE_CHECK */
#define KEYRING_SEARCH_NO_UPDATE_TIME        0x0004        /* Don't update times */
#define KEYRING_SEARCH_NO_CHECK_PERM        0x0008        /* Don't check permissions */
#define KEYRING_SEARCH_DETECT_TOO_DEEP        0x0010        /* Give an error on excessive depth */
#define KEYRING_SEARCH_SKIP_EXPIRED        0x0020        /* Ignore expired keys (intention to replace) */
#define KEYRING_SEARCH_RECURSE                0x0040        /* Search child keyrings also */

        /* Per-object callback; receives the object and an opaque data pointer */
        int (*iterator)(const void *object, void *iterator_data);

        /* Internal stuff */
        int                        skipped_ret;
        bool                        possessed;
        key_ref_t                result;
        time64_t                now;
};

extern bool key_default_cmp(const struct key *key,
                            const struct key_match_data *match_data);
extern key_ref_t keyring_search_rcu(key_ref_t keyring_ref,
                                    struct keyring_search_context *ctx);

extern key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx);
extern key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx);

extern struct key *find_keyring_by_name(const char *name, bool uid_keyring);

extern int look_up_user_keyrings(struct key **, struct key **);
extern struct key *get_user_session_keyring_rcu(const struct cred *);
extern int install_thread_keyring_to_cred(struct cred *);
extern int install_process_keyring_to_cred(struct cred *);
extern int install_session_keyring_to_cred(struct cred *, struct key *);

extern struct key *request_key_and_link(struct key_type *type,
                                        const char *description,
                                        struct key_tag *domain_tag,
                                        const void *callout_info,
                                        size_t callout_len,
                                        void *aux,
                                        struct key *dest_keyring,
                                        unsigned long flags);

extern bool lookup_user_key_possessed(const struct key *key,
                                      const struct key_match_data *match_data);
#define KEY_LOOKUP_CREATE        0x01
#define KEY_LOOKUP_PARTIAL        0x02

extern long join_session_keyring(const char *name);
extern void key_change_session_keyring(struct callback_head *twork);

extern struct work_struct key_gc_work;
extern unsigned key_gc_delay;
extern void keyring_gc(struct key *keyring, time64_t limit);
extern void keyring_restriction_gc(struct key *keyring,
                                   struct key_type *dead_type);
void key_set_expiry(struct key *key, time64_t expiry);
extern void key_schedule_gc(time64_t gc_at);
extern void key_schedule_gc_links(void);
extern void key_gc_keytype(struct key_type *ktype);

extern int key_task_permission(const key_ref_t key_ref,
                               const struct cred *cred,
                               enum key_need_perm need_perm);

/*
 * Post a key notification of the given @subtype for @key, carrying @aux,
 * to the key's watchers.  Compiles to nothing unless
 * CONFIG_KEY_NOTIFICATIONS is enabled.
 */
static inline void notify_key(struct key *key,
                              enum key_notification_subtype subtype, u32 aux)
{
#ifdef CONFIG_KEY_NOTIFICATIONS
        struct key_notification note = {
                .watch.type        = WATCH_TYPE_KEY_NOTIFY,
                .watch.subtype        = subtype,
                .watch.info        = watch_sizeof(note),
                .key_id                = key_serial(key),
                .aux                = aux,
        };

        /* The key's serial number is also passed as the final argument. */
        post_watch_notification(key->watchers, &note.watch, current_cred(),
                                note.key_id);
#endif
}

/*
 * Check whether the current credentials are permitted to use a key in the
 * way described by @need_perm.  Returns the result of
 * key_task_permission().
 */
static inline int key_permission(const key_ref_t key_ref,
                                 enum key_need_perm need_perm)
{
        const struct cred *cred = current_cred();

        return key_task_permission(key_ref, cred, need_perm);
}

extern struct key_type key_type_request_key_auth;
extern struct key *request_key_auth_new(struct key *target,
                                        const char *op,
                                        const void *callout_info,
                                        size_t callout_len,
                                        struct key *dest_keyring);

extern struct key *key_get_instantiation_authkey(key_serial_t target_id);

/*
 * Determine whether a key is dead.
 *
 * A key is dead when any of the following holds:
 *  - it has an expiry other than TIME64_MAX and that expiry (extended by
 *    key_gc_delay unless the key type sets KEY_TYPE_INSTANT_REAP) is at or
 *    before @limit;
 *  - KEY_FLAG_DEAD or KEY_FLAG_INVALIDATED is set in its flags;
 *  - its domain tag is marked as removed.
 */
static inline bool key_is_dead(const struct key *key, time64_t limit)
{
        time64_t reap_at = key->expiry;

        if (reap_at != TIME64_MAX) {
                const bool instant_reap =
                        key->type->flags & KEY_TYPE_INSTANT_REAP;

                /* Keys not reaped instantly get a grace period first. */
                if (!instant_reap)
                        reap_at += key_gc_delay;
                if (reap_at <= limit)
                        return true;
        }

        if (key->flags & ((1 << KEY_FLAG_DEAD) |
                          (1 << KEY_FLAG_INVALIDATED)))
                return true;

        return key->domain_tag->removed;
}

/*
 * keyctl() functions
 */
extern long keyctl_get_keyring_ID(key_serial_t, int);
extern long keyctl_join_session_keyring(const char __user *);
extern long keyctl_update_key(key_serial_t, const void __user *, size_t);
extern long keyctl_revoke_key(key_serial_t);
extern long keyctl_keyring_clear(key_serial_t);
extern long keyctl_keyring_link(key_serial_t, key_serial_t);
extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int);
extern long keyctl_keyring_unlink(key_serial_t, key_serial_t);
extern long keyctl_describe_key(key_serial_t, char __user *, size_t);
extern long keyctl_keyring_search(key_serial_t, const char __user *,
                                  const char __user *, key_serial_t);
extern long keyctl_read_key(key_serial_t, char __user *, size_t);
extern long keyctl_chown_key(key_serial_t, uid_t, gid_t);
extern long keyctl_setperm_key(key_serial_t, key_perm_t);
extern long keyctl_instantiate_key(key_serial_t, const void __user *,
                                   size_t, key_serial_t);
extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
extern long keyctl_set_reqkey_keyring(int);
extern long keyctl_set_timeout(key_serial_t, unsigned);
extern long keyctl_assume_authority(key_serial_t);
extern long keyctl_get_security(key_serial_t keyid, char __user *buffer,
                                size_t buflen);
extern long keyctl_session_to_parent(void);
extern long keyctl_reject_key(key_serial_t, unsigned, unsigned, key_serial_t);
extern long keyctl_instantiate_key_iov(key_serial_t,
                                       const struct iovec __user *,
                                       unsigned, key_serial_t);
extern long keyctl_invalidate_key(key_serial_t);
extern long keyctl_restrict_keyring(key_serial_t id,
                                    const char __user *_type,
                                    const char __user *_restriction);
#ifdef CONFIG_PERSISTENT_KEYRINGS
extern long keyctl_get_persistent(uid_t, key_serial_t);
extern unsigned persistent_keyring_expiry;
#else
/* Stub for builds without CONFIG_PERSISTENT_KEYRINGS: always -EOPNOTSUPP. */
static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring)
{
        return -EOPNOTSUPP;
}
#endif

#ifdef CONFIG_KEY_DH_OPERATIONS
extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *,
                              size_t, struct keyctl_kdf_params __user *);
extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *,
                                size_t, struct keyctl_kdf_params *);
#ifdef CONFIG_COMPAT
extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params,
                                char __user *buffer, size_t buflen,
                                struct compat_keyctl_kdf_params __user *kdf);
#endif
#define KEYCTL_KDF_MAX_OUTPUT_LEN        1024        /* max length of KDF output */
#define KEYCTL_KDF_MAX_OI_LEN                64        /* max length of otherinfo */
#else
/* Stub for builds without CONFIG_KEY_DH_OPERATIONS: always -EOPNOTSUPP. */
static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params,
                                     char __user *buffer, size_t buflen,
                                     struct keyctl_kdf_params __user *kdf)
{
        return -EOPNOTSUPP;
}

#ifdef CONFIG_COMPAT
/*
 * Compat stub for builds without CONFIG_KEY_DH_OPERATIONS: always
 * -EOPNOTSUPP.
 * NOTE(review): @kdf is typed struct keyctl_kdf_params here, while the
 * extern prototype in the CONFIG_KEY_DH_OPERATIONS branch uses
 * struct compat_keyctl_kdf_params - confirm whether that is intended.
 */
static inline long compat_keyctl_dh_compute(
                                struct keyctl_dh_params __user *params,
                                char __user *buffer, size_t buflen,
                                struct keyctl_kdf_params __user *kdf)
{
        return -EOPNOTSUPP;
}
#endif
#endif

#ifdef CONFIG_ASYMMETRIC_KEY_TYPE
extern long keyctl_pkey_query(key_serial_t,
                              const char __user *,
                              struct keyctl_pkey_query __user *);

extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *,
                               const char __user *,
                               const void __user *, const void __user *);

extern long keyctl_pkey_e_d_s(int,
                              const struct keyctl_pkey_params __user *,
                              const char __user *,
                              const void __user *, void __user *);
#else
/* Stub for builds without CONFIG_ASYMMETRIC_KEY_TYPE: always -EOPNOTSUPP. */
static inline long keyctl_pkey_query(key_serial_t id,
                                     const char __user *_info,
                                     struct keyctl_pkey_query __user *_res)
{
        return -EOPNOTSUPP;
}

/* Stub for builds without CONFIG_ASYMMETRIC_KEY_TYPE: always -EOPNOTSUPP. */
static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params,
                                      const char __user *_info,
                                      const void __user *_in,
                                      const void __user *_in2)
{
        return -EOPNOTSUPP;
}

/* Stub for builds without CONFIG_ASYMMETRIC_KEY_TYPE: always -EOPNOTSUPP. */
static inline long keyctl_pkey_e_d_s(int op,
                                     const struct keyctl_pkey_params __user *params,
                                     const char __user *_info,
                                     const void __user *_in,
                                     void __user *_out)
{
        return -EOPNOTSUPP;
}
#endif

extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen);

#ifdef CONFIG_KEY_NOTIFICATIONS
extern long keyctl_watch_key(key_serial_t, int, int);
#else
/* Stub for builds without CONFIG_KEY_NOTIFICATIONS: always -EOPNOTSUPP. */
static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id)
{
        return -EOPNOTSUPP;
}
#endif

/*
 * Debugging key validation
 */
#ifdef KEY_DEBUGGING
extern void __key_check(const struct key *);

/*
 * Validate a key pointer: a NULL key is accepted silently, while an error
 * pointer or a key whose magic number is not KEY_DEBUG_MAGIC is passed to
 * __key_check().
 */
static inline void key_check(const struct key *key)
{
        if (!key)
                return;

        if (IS_ERR(key) || key->magic != KEY_DEBUG_MAGIC)
                __key_check(key);
}

#else

#define key_check(key) do {} while(0)

#endif
#endif /* _INTERNAL_H */





























































































































































































    1 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PIPE_FS_I_H
#define _LINUX_PIPE_FS_I_H

#define PIPE_DEF_BUFFERS        16

#define PIPE_BUF_FLAG_LRU        0x01        /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC        0x02        /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT        0x04        /* page is a gift */
#define PIPE_BUF_FLAG_PACKET        0x08        /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE        0x10        /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE        0x20        /* read() must return entire buffer or error */
#ifdef CONFIG_WATCH_QUEUE
#define PIPE_BUF_FLAG_LOSS        0x40        /* Message loss happened after this buffer */
#endif

/**
 *        struct pipe_buffer - a Linux kernel pipe buffer
 *        @page: the page containing the data for the pipe buffer
 *        @offset: offset of data inside the @page
 *        @len: length of data inside the @page
 *        @ops: operations associated with this buffer.
 *                See &struct pipe_buf_operations.
 *        @flags: pipe buffer flags (the PIPE_BUF_FLAG_* values defined above).
 *        @private: private data owned by the ops.
 **/
struct pipe_buffer {
        struct page *page;
        unsigned int offset, len;
        const struct pipe_buf_operations *ops;
        unsigned int flags;
        unsigned long private;
};

/**
 *        struct pipe_inode_info - a Linux kernel pipe
 *        @mutex: mutex protecting the whole thing
 *        @rd_wait: reader wait point in case of empty pipe
 *        @wr_wait: writer wait point in case of full pipe
 *        @head: The point of buffer production
 *        @tail: The point of buffer consumption
 *        @note_loss: The next read() should insert a data-lost message
 *                (only present with CONFIG_WATCH_QUEUE)
 *        @max_usage: The maximum number of slots that may be used in the ring
 *        @ring_size: total number of buffers (should be a power of 2)
 *        @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 *        @tmp_page: cached released page
 *        @readers: number of current readers of this pipe
 *        @writers: number of current writers of this pipe
 *        @files: number of struct file referring to this pipe
 *                (protected by ->i_lock)
 *        @r_counter: reader counter
 *        @w_counter: writer counter
 *        @poll_usage: is this pipe used for epoll, which has crazy wakeups?
 *        @fasync_readers: reader side fasync
 *        @fasync_writers: writer side fasync
 *        @bufs: the circular array of pipe buffers
 *        @user: the user who created this pipe
 *        @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 *                (only present with CONFIG_WATCH_QUEUE)
 **/
struct pipe_inode_info {
        struct mutex mutex;
        wait_queue_head_t rd_wait, wr_wait;
        unsigned int head;
        unsigned int tail;
        unsigned int max_usage;
        unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
        bool note_loss;
#endif
        unsigned int nr_accounted;
        unsigned int readers;
        unsigned int writers;
        unsigned int files;
        unsigned int r_counter;
        unsigned int w_counter;
        bool poll_usage;
        struct page *tmp_page;
        struct fasync_struct *fasync_readers;
        struct fasync_struct *fasync_writers;
        struct pipe_buffer *bufs;
        struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
        struct watch_queue *watch_queue;
#endif
};

/*
 * Note on the nesting of these functions:
 *
 * ->confirm()
 *        ->try_steal()
 *
 * That is, ->try_steal() must be called on a confirmed buffer.  See below for
 * the meaning of each operation.  Also see the kerneldoc in fs/pipe.c for the
 * pipe and generic variants of these hooks.
 */
struct pipe_buf_operations {
        /*
         * ->confirm() verifies that the data in the pipe buffer is there
         * and that the contents are good. If the pages in the pipe belong
         * to a file system, we may need to wait for IO completion in this
         * hook. Returns 0 for good, or a negative error value in case of
         * error.  If not present, all pages are considered good.
         */
        int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * When the contents of this pipe buffer have been completely
         * consumed by a reader, ->release() is called.
         */
        void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Attempt to take ownership of the pipe buffer and its contents.
         * ->try_steal() returns %true for success, in which case the contents
         * of the pipe (the buf->page) are locked and now completely owned by
         * the caller. The page may then be transferred to a different mapping;
         * the most common case is insertion into a different file's address
         * space cache.  If not present, pipe_buf_try_steal() reports failure.
         */
        bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Get a reference to the pipe buffer.  Returns %true if the
         * reference was successfully obtained (see pipe_buf_get()).
         */
        bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/**
 * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
 * i.e. it was created with O_NOTIFICATION_PIPE
 * @pipe: The pipe to check
 *
 * Without CONFIG_WATCH_QUEUE no pipe can be a watch queue.
 *
 * Return: true if pipe is a watch queue, false otherwise.
 */
static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                return true;
#endif
        return false;
}

/**
 * pipe_empty - Return true if the pipe is empty
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 *
 * The ring holds nothing exactly when both pointers are equal.
 */
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
        const bool nothing_queued = (tail == head);

        return nothing_queued;
}

/**
 * pipe_occupancy - Return number of slots used in the pipe
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 *
 * The difference is computed in unsigned arithmetic, so it stays correct
 * when @head has wrapped around past @tail.
 */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
        unsigned int used = head - tail;

        return used;
}

/**
 * pipe_full - Return true if the pipe is full
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 * @limit: The maximum amount of slots available.
 *
 * The pipe is full once the number of used slots reaches @limit.
 */
static inline bool pipe_full(unsigned int head, unsigned int tail,
                             unsigned int limit)
{
        unsigned int used = pipe_occupancy(head, tail);

        return used >= limit;
}

/**
 * pipe_space_for_user - Return number of slots available to userspace
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 * @pipe: The pipe info structure
 *
 * Return: 0 once the occupancy has reached pipe->max_usage; otherwise the
 * free part of the ring, capped at pipe->max_usage.
 */
static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail,
                                               struct pipe_inode_info *pipe)
{
        unsigned int used = pipe_occupancy(head, tail);
        unsigned int room;

        if (used >= pipe->max_usage)
                return 0;

        room = pipe->ring_size - used;
        return (room > pipe->max_usage) ? pipe->max_usage : room;
}

/**
 * pipe_buf_get - get a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Return: %true if the reference was successfully obtained.
 */
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
                                struct pipe_buffer *buf)
{
        return buf->ops->get(pipe, buf);
}

/**
 * pipe_buf_release - put a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Clears @buf->ops and then invokes the buffer's ->release() operation, so
 * the hook runs on a buffer whose ops pointer is already NULL.
 */
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        /* Keep a copy of the ops table: buf->ops is cleared before the call. */
        const struct pipe_buf_operations *ops = buf->ops;

        buf->ops = NULL;
        ops->release(pipe, buf);
}

/**
 * pipe_buf_confirm - verify contents of the pipe buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to confirm
 */
static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
                                   struct pipe_buffer *buf)
{
        if (!buf->ops->confirm)
                return 0;
        return buf->ops->confirm(pipe, buf);
}

/**
 * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 */
static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (!buf->ops->try_steal)
                return false;
        return buf->ops->try_steal(pipe, buf);
}

/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
   memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
#define PIPE_SIZE                PAGE_SIZE

/* Pipe lock and unlock operations */
void pipe_lock(struct pipe_inode_info *);
void pipe_unlock(struct pipe_inode_info *);
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);

extern unsigned int pipe_max_size;
extern unsigned long pipe_user_pages_hard;
extern unsigned long pipe_user_pages_soft;

/* Wait for a pipe to be readable/writable while dropping the pipe lock */
void pipe_wait_readable(struct pipe_inode_info *);
void pipe_wait_writable(struct pipe_inode_info *);

struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *);

/* Generic pipe buffer ops functions */
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);

extern const struct pipe_buf_operations nosteal_pipe_buf_ops;

unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new);
bool too_many_pipe_buffers_soft(unsigned long user_bufs);
bool too_many_pipe_buffers_hard(unsigned long user_bufs);
bool pipe_is_unprivileged_user(void);

/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);

int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned long size);

#endif




































































































































































    3 













































    3 






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM jbd2

#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_JBD2_H

#include <linux/jbd2.h>
#include <linux/tracepoint.h>

struct transaction_chp_stats_s;
struct transaction_run_stats_s;

/*
 * jbd2_checkpoint - records the device number of the journal's
 * j_fs_dev block device together with the caller-supplied @result.
 * Output format: "dev MAJOR,MINOR result N".
 */
TRACE_EVENT(jbd2_checkpoint,

        TP_PROTO(journal_t *journal, int result),

        TP_ARGS(journal, result),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->result                = result;
        ),

        TP_printk("dev %d,%d result %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result)
);

/*
 * jbd2_commit - event class shared by the commit-phase tracepoints.
 *
 * Records the journal's device number, the transaction's
 * t_synchronous_commit flag (stored in a char) and its t_tid.
 * Output format: "dev MAJOR,MINOR transaction TID sync FLAG".
 */
DECLARE_EVENT_CLASS(jbd2_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
        ),

        TP_printk("dev %d,%d transaction %u sync %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit)
);

/* jbd2_start_commit - instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_start_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_locking - instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_locking,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_flushing - instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_logging - instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_logging,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_drop_transaction - instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/*
 * jbd2_end_commit - same fields as the jbd2_commit class plus "head".
 *
 * NOTE(review): the "head" field is filled from journal->j_tail_sequence,
 * so the value printed as "head" is the journal's tail sequence number.
 * Output format: "dev MAJOR,MINOR transaction TID sync FLAG head SEQ".
 */
TRACE_EVENT(jbd2_end_commit,
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
                __field(        tid_t,        head                            )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
                __entry->head                = journal->j_tail_sequence;
        ),

        TP_printk("dev %d,%d transaction %u sync %d head %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit, __entry->head)
);

/*
 * jbd2_submit_inode_data - records the superblock device number
 * (inode->i_sb->s_dev) and inode number (inode->i_ino) of @inode.
 * The inode number is cast to unsigned long for printing with %lu.
 */
TRACE_EVENT(jbd2_submit_inode_data,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * jbd2_handle_start_class - event class for handle start/restart events.
 *
 * All five arguments (@dev, @tid, @type, @line_no, @requested_blocks) are
 * copied verbatim into the trace entry and printed in that order.
 */
DECLARE_EVENT_CLASS(jbd2_handle_start_class,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->requested_blocks)
);

/* jbd2_handle_start - instance of the jbd2_handle_start_class event class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/* jbd2_handle_restart - instance of the jbd2_handle_start_class event class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/*
 * jbd2_handle_extend - like the handle start events, with one extra
 * field: @buffer_credits, recorded and printed before @requested_blocks.
 * All arguments are copied verbatim into the trace entry.
 */
TRACE_EVENT(jbd2_handle_extend,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int buffer_credits,
                 int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        buffer_credits  )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->buffer_credits   = buffer_credits;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "buffer_credits %d requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->buffer_credits,
                  __entry->requested_blocks)
);

/*
 * jbd2_handle_stats - per-handle statistics event.
 *
 * Records @dev, @tid, @type, @line_no, @interval, @sync,
 * @requested_blocks and @dirtied_blocks exactly as passed in.
 * NOTE(review): the unit of @interval is not visible here - confirm
 * against the caller.
 */
TRACE_EVENT(jbd2_handle_stats,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int interval, int sync,
                 int requested_blocks, int dirtied_blocks),

        TP_ARGS(dev, tid, type, line_no, interval, sync,
                requested_blocks, dirtied_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        interval        )
                __field(                  int,        sync                )
                __field(                  int,        requested_blocks)
                __field(                  int,        dirtied_blocks        )
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->interval          = interval;
                __entry->sync                  = sync;
                __entry->requested_blocks = requested_blocks;
                __entry->dirtied_blocks          = dirtied_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u interval %d "
                  "sync %d requested_blocks %d dirtied_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->interval,
                  __entry->sync, __entry->requested_blocks,
                  __entry->dirtied_blocks)
);

/*
 * jbd2_run_stats - snapshot of a transaction_run_stats_s for @tid.
 *
 * The six time fields (wait, request_delay, running, locked, flushing,
 * logging) are stored as raw unsigned long values copied from @stats and
 * passed through jiffies_to_msecs() only when the event is printed.
 * The three counters (handle_count, blocks, blocks_logged) are stored
 * as __u32 and printed as-is.
 */
TRACE_EVENT(jbd2_run_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_run_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        wait                )
                __field(        unsigned long,        request_delay        )
                __field(        unsigned long,        running                )
                __field(        unsigned long,        locked                )
                __field(        unsigned long,        flushing        )
                __field(        unsigned long,        logging                )
                __field(                __u32,        handle_count        )
                __field(                __u32,        blocks                )
                __field(                __u32,        blocks_logged        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->wait                = stats->rs_wait;
                __entry->request_delay        = stats->rs_request_delay;
                __entry->running        = stats->rs_running;
                __entry->locked                = stats->rs_locked;
                __entry->flushing        = stats->rs_flushing;
                __entry->logging        = stats->rs_logging;
                __entry->handle_count        = stats->rs_handle_count;
                __entry->blocks                = stats->rs_blocks;
                __entry->blocks_logged        = stats->rs_blocks_logged;
        ),

        TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u "
                  "locked %u flushing %u logging %u handle_count %u "
                  "blocks %u blocks_logged %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->wait),
                  jiffies_to_msecs(__entry->request_delay),
                  jiffies_to_msecs(__entry->running),
                  jiffies_to_msecs(__entry->locked),
                  jiffies_to_msecs(__entry->flushing),
                  jiffies_to_msecs(__entry->logging),
                  __entry->handle_count, __entry->blocks,
                  __entry->blocks_logged)
);

/*
 * jbd2_checkpoint_stats - snapshot of a transaction_chp_stats_s for @tid.
 *
 * chp_time is stored as the raw cs_chp_time value and converted with
 * jiffies_to_msecs() at print time; forced_to_close, written and dropped
 * are copied from the corresponding cs_* members and printed unchanged.
 */
TRACE_EVENT(jbd2_checkpoint_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_chp_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        chp_time        )
                __field(                __u32,        forced_to_close        )
                __field(                __u32,        written                )
                __field(                __u32,        dropped                )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->chp_time        = stats->cs_chp_time;
                __entry->forced_to_close= stats->cs_forced_to_close;
                __entry->written        = stats->cs_written;
                __entry->dropped        = stats->cs_dropped;
        ),

        TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u "
                  "written %u dropped %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->chp_time),
                  __entry->forced_to_close, __entry->written, __entry->dropped)
);

/*
 * jbd2_update_log_tail - records a log tail update.
 *
 * tail_sequence is read from journal->j_tail_sequence at trace time;
 * @first_tid, @block_nr and @freed are stored as passed.
 * Output format: "dev MAJOR,MINOR from <tail_sequence> to <first_tid>
 * offset <block_nr> freed <freed>".
 */
TRACE_EVENT(jbd2_update_log_tail,

        TP_PROTO(journal_t *journal, tid_t first_tid,
                 unsigned long block_nr, unsigned long freed),

        TP_ARGS(journal, first_tid, block_nr, freed),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        tid_t,        tail_sequence                )
                __field(        tid_t,        first_tid                )
                __field(unsigned long,        block_nr                )
                __field(unsigned long,        freed                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->tail_sequence        = journal->j_tail_sequence;
                __entry->first_tid        = first_tid;
                __entry->block_nr        = block_nr;
                __entry->freed                = freed;
        ),

        TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tail_sequence, __entry->first_tid,
                  __entry->block_nr, __entry->freed)
);

/*
 * jbd2_write_superblock - records the journal's device number and the
 * caller-supplied @write_op, which is printed in hexadecimal (%x).
 */
TRACE_EVENT(jbd2_write_superblock,

        TP_PROTO(journal_t *journal, int write_op),

        TP_ARGS(journal, write_op),

        TP_STRUCT__entry(
                __field(        dev_t,  dev                        )
                __field(          int,  write_op                )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->write_op        = write_op;
        ),

        TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->write_op)
);

/*
 * jbd2_lock_buffer_stall - records @dev and the caller-supplied
 * @stall_ms value. Output format: "dev MAJOR,MINOR stall_ms N".
 */
TRACE_EVENT(jbd2_lock_buffer_stall,

        TP_PROTO(dev_t dev, unsigned long stall_ms),

        TP_ARGS(dev, stall_ms),

        TP_STRUCT__entry(
                __field(        dev_t, dev        )
                __field(unsigned long, stall_ms        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->stall_ms        = stall_ms;
        ),

        TP_printk("dev %d,%d stall_ms %lu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                __entry->stall_ms)
);

#endif /* _TRACE_JBD2_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_LOCAL_H
#define _ASM_X86_LOCAL_H

#include <linux/percpu.h>

#include <linux/atomic.h>
#include <asm/asm.h>

/* local_t wraps a single atomic_long_t; every local_*() op acts on ->a. */
typedef struct {
        atomic_long_t a;
} local_t;

/* Static initializer for a local_t holding the value @i. */
#define LOCAL_INIT(i)        { ATOMIC_LONG_INIT(i) }

/* Read / write the counter via the atomic_long accessors on ->a. */
#define local_read(l)        atomic_long_read(&(l)->a)
#define local_set(l, i)        atomic_long_set(&(l)->a, (i))

/*
 * local_inc - increment @l in place.
 * Emits one _ASM_INC on the counter as a read-write memory operand;
 * no lock prefix is written in this asm statement.
 */
static inline void local_inc(local_t *l)
{
        asm volatile(_ASM_INC "%0"
                     : "+m" (l->a.counter));
}

/*
 * local_dec - decrement @l in place.
 * Emits one _ASM_DEC on the counter as a read-write memory operand;
 * no lock prefix is written in this asm statement.
 */
static inline void local_dec(local_t *l)
{
        asm volatile(_ASM_DEC "%0"
                     : "+m" (l->a.counter));
}

/*
 * local_add - add @i to @l in place.
 * @i is passed as an immediate or register operand ("ir"); the counter is
 * the read-write memory operand of a single _ASM_ADD.
 */
static inline void local_add(long i, local_t *l)
{
        asm volatile(_ASM_ADD "%1,%0"
                     : "+m" (l->a.counter)
                     : "ir" (i));
}

/*
 * local_sub - subtract @i from @l in place.
 * @i is passed as an immediate or register operand ("ir"); the counter is
 * the read-write memory operand of a single _ASM_SUB.
 */
static inline void local_sub(long i, local_t *l)
{
        asm volatile(_ASM_SUB "%1,%0"
                     : "+m" (l->a.counter)
                     : "ir" (i));
}

/**
 * local_sub_and_test - subtract value from variable and test result
 * @i: integer value to subtract
 * @l: pointer to type local_t
 *
 * Atomically subtracts @i from @l and returns
 * true if the result is zero, or false for all
 * other cases.
 *
 * NOTE(review): implemented through GEN_BINARY_RMWcc with _ASM_SUB; the
 * "e" argument presumably selects the "equal/zero" condition - confirm
 * against the macro definition.
 */
static inline bool local_sub_and_test(long i, local_t *l)
{
        return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i);
}

/**
 * local_dec_and_test - decrement and test
 * @l: pointer to type local_t
 *
 * Atomically decrements @l by 1 and
 * returns true if the result is 0, or false for all other
 * cases.
 *
 * NOTE(review): implemented through GEN_UNARY_RMWcc with _ASM_DEC; the
 * "e" argument presumably selects the "equal/zero" condition - confirm
 * against the macro definition.
 */
static inline bool local_dec_and_test(local_t *l)
{
        return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e);
}

/**
 * local_inc_and_test - increment and test
 * @l: pointer to type local_t
 *
 * Atomically increments @l by 1
 * and returns true if the result is zero, or false for all
 * other cases.
 *
 * NOTE(review): implemented through GEN_UNARY_RMWcc with _ASM_INC; the
 * "e" argument presumably selects the "equal/zero" condition - confirm
 * against the macro definition.
 */
static inline bool local_inc_and_test(local_t *l)
{
        return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e);
}

/**
 * local_add_negative - add and test if negative
 * @i: integer value to add
 * @l: pointer to type local_t
 *
 * Atomically adds @i to @l and returns true
 * if the result is negative, or false when
 * result is greater than or equal to zero.
 *
 * NOTE(review): implemented through GEN_BINARY_RMWcc with _ASM_ADD; the
 * "s" argument presumably selects the "sign" condition - confirm against
 * the macro definition.
 */
static inline bool local_add_negative(long i, local_t *l)
{
        return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i);
}

/**
 * local_add_return - add and return
 * @i: integer value to add
 * @l: pointer to type local_t
 *
 * Atomically adds @i to @l and returns @i + @l
 */
static inline long local_add_return(long i, local_t *l)
{
        /* Keep the addend: i is a read-write ("+r") operand of the XADD. */
        long __i = i;
        asm volatile(_ASM_XADD "%0, %1;"
                     : "+r" (i), "+m" (l->a.counter)
                     : : "memory");
        /*
         * After the XADD, i holds the counter's previous value (x86 XADD
         * semantics), so previous + addend is the new counter value.
         */
        return i + __i;
}

/*
 * local_sub_return - subtract and return
 * @i: integer value to subtract
 * @l: pointer to type local_t
 *
 * Implemented as local_add_return() with the negated amount, so the
 * value returned is whatever local_add_return(-@i, @l) returns.
 */
static inline long local_sub_return(long i, local_t *l)
{
        long negated = -i;

        return local_add_return(negated, l);
}

/* Add / subtract 1 and yield the result of local_{add,sub}_return(). */
#define local_inc_return(l)  (local_add_return(1, l))
#define local_dec_return(l)  (local_sub_return(1, l))

/*
 * local_cmpxchg - compare-and-exchange on the counter of @l, forwarded to
 * cmpxchg_local() with old value @o and new value @n.
 */
#define local_cmpxchg(l, o, n) \
        (cmpxchg_local(&((l)->a.counter), (o), (n)))
/* Always has a lock prefix */
#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))

/**
 * local_add_unless - add unless the number is a given value
 * @l: pointer of type local_t
 * @a: the amount to add to l...
 * @u: ...unless l is equal to u.
 *
 * Atomically adds @a to @l, so long as it was not @u.
 * Returns non-zero if @l was not @u, and zero otherwise.
 *
 * Every argument is evaluated exactly once, and the temporaries carry a
 * __lau_ prefix so that arguments mentioning identifiers such as "c" or
 * "old" refer to the caller's variables rather than to the macro's own.
 */
#define local_add_unless(l, a, u)                                \
({                                                                \
        local_t *__lau_l = (l);                                        \
        long __lau_u = (u);                                        \
        long __lau_a = (a);                                        \
        long __lau_c, __lau_old;                                \
        __lau_c = local_read(__lau_l);                                \
        for (;;) {                                                \
                /* Already at the forbidden value: do nothing. */        \
                if (unlikely(__lau_c == __lau_u))                \
                        break;                                        \
                __lau_old = local_cmpxchg(__lau_l, __lau_c,        \
                                          __lau_c + __lau_a);        \
                /* Exchange succeeded when the old value matched. */        \
                if (likely(__lau_old == __lau_c))                \
                        break;                                        \
                /* Counter changed under us: retry with new value. */        \
                __lau_c = __lau_old;                                \
        }                                                        \
        __lau_c != __lau_u;                                        \
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)

/*
 * The __local_*() variants are plain aliases for the local_*() operations.
 * On x86_32, these are no better than the atomic variants.
 * On x86-64 these are better than the atomic variants on SMP kernels
 * because they don't use a lock prefix.
 */
#define __local_inc(l)                local_inc(l)
#define __local_dec(l)                local_dec(l)
#define __local_add(i, l)        local_add((i), (l))
#define __local_sub(i, l)        local_sub((i), (l))

#endif /* _ASM_X86_LOCAL_H */


























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_ZONES_H
#define _NF_CONNTRACK_ZONES_H

#include <linux/netfilter/nf_conntrack_zones_common.h>
#include <net/netfilter/nf_conntrack.h>

/*
 * nf_ct_zone - zone associated with a conntrack entry
 * @ct: conntrack entry
 *
 * With CONFIG_NF_CONNTRACK_ZONES the zone embedded in @ct is returned;
 * without it, @ct is not dereferenced and &nf_ct_zone_dflt is returned.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
        const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        zone = &ct->zone;
#endif
        return zone;
}

/*
 * nf_ct_zone_init - fill in a caller-provided zone
 * @zone: zone to initialize
 * @id: zone id
 * @dir: direction bits
 * @flags: zone flags
 *
 * Stores the three values into @zone and returns @zone itself.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
        zone->dir = dir;
        zone->flags = flags;
        zone->id = id;

        return zone;
}

/*
 * nf_ct_zone_tmpl - pick the zone to use given an optional template
 * @tmpl: template conntrack, may be NULL when zones are enabled
 * @skb: packet whose mark may supply the zone id
 * @tmp: scratch zone filled in when the id comes from the mark
 *
 * With CONFIG_NF_CONNTRACK_ZONES: a NULL @tmpl yields &nf_ct_zone_dflt;
 * a template flagged NF_CT_FLAG_MARK yields @tmp initialized with
 * skb->mark as id, the template's direction and flags 0; otherwise the
 * template's own zone is returned. Without zones the result is whatever
 * nf_ct_zone() returns.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
                struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        if (tmpl) {
                if (tmpl->zone.flags & NF_CT_FLAG_MARK)
                        return nf_ct_zone_init(tmp, skb->mark,
                                               tmpl->zone.dir, 0);
                return nf_ct_zone(tmpl);
        }
        return &nf_ct_zone_dflt;
#else
        return nf_ct_zone(tmpl);
#endif
}

/*
 * nf_ct_zone_add - copy @zone into the conntrack entry @ct
 *
 * Compiles to nothing unless CONFIG_NF_CONNTRACK_ZONES is defined.
 */
static inline void nf_ct_zone_add(struct nf_conn *ct,
                                  const struct nf_conntrack_zone *zone)
{
#if defined(CONFIG_NF_CONNTRACK_ZONES)
        ct->zone = zone[0];
#endif
}

/*
 * nf_ct_zone_matches_dir - test whether @zone has the bit for @dir set
 *
 * Returns true when bit number @dir is set in zone->dir.
 */
static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
                                          enum ip_conntrack_dir dir)
{
        return (zone->dir & (1 << dir)) != 0;
}

/*
 * nf_ct_zone_id - zone id that applies in direction @dir
 *
 * With CONFIG_NF_CONNTRACK_ZONES, returns zone->id when @zone matches
 * @dir; in every other case returns NF_CT_DEFAULT_ZONE_ID.
 */
static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
                                enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        if (nf_ct_zone_matches_dir(zone, dir))
                return zone->id;
#endif
        return NF_CT_DEFAULT_ZONE_ID;
}

/*
 * nf_ct_zone_equal - compare the zone of @a with zone @b for direction @dir
 *
 * With CONFIG_NF_CONNTRACK_ZONES, returns true when nf_ct_zone_id() gives
 * the same id for both; without zones it is always true.
 */
static inline bool nf_ct_zone_equal(const struct nf_conn *a,
                                    const struct nf_conntrack_zone *b,
                                    enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        const u16 id_a = nf_ct_zone_id(nf_ct_zone(a), dir);
        const u16 id_b = nf_ct_zone_id(b, dir);

        return id_a == id_b;
#else
        return true;
#endif
}

/*
 * nf_ct_zone_equal_any - compare raw zone ids, ignoring direction
 *
 * With CONFIG_NF_CONNTRACK_ZONES, returns true when the id of @a's zone
 * equals b->id; without zones it is always true.
 */
static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
                                        const struct nf_conntrack_zone *b)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        const struct nf_conntrack_zone *zone_a = nf_ct_zone(a);

        return zone_a->id == b->id;
#else
        return true;
#endif
}

#endif /* _NF_CONNTRACK_ZONES_H */






























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __MAC802154_DRIVER_OPS
#define __MAC802154_DRIVER_OPS

#include <linux/types.h>
#include <linux/rtnetlink.h>

#include <net/mac802154.h>

#include "ieee802154_i.h"
#include "trace.h"

/* Forward @skb to the driver's xmit_async callback and return its result. */
static inline int
drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb)
{
        int rc;

        rc = local->ops->xmit_async(&local->hw, skb);
        return rc;
}

/*
 * Forward @skb to the driver's xmit_sync callback and return its result.
 * Annotated with might_sleep() before the callback is invoked.
 */
static inline int
drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb)
{
        int rc;

        might_sleep();

        rc = local->ops->xmit_sync(&local->hw, skb);
        return rc;
}

/*
 * Start the device through the driver's start callback.
 *
 * local->started is set to true, followed by a full memory barrier,
 * before the callback runs. The callback's return value is traced and
 * handed back to the caller.
 */
static inline int drv_start(struct ieee802154_local *local)
{
        int err;

        might_sleep();

        trace_802154_drv_start(local);

        /* Publish "started" before the driver's start callback executes. */
        local->started = true;
        smp_mb();

        err = local->ops->start(&local->hw);
        trace_802154_drv_return_int(local, err);

        return err;
}

/*
 * Stop the device through the driver's stop callback.
 *
 * After the callback returns, the tasklet is disabled and re-enabled, a
 * compiler barrier is issued, and only then is local->started cleared.
 */
static inline void drv_stop(struct ieee802154_local *local)
{
        might_sleep();

        trace_802154_drv_stop(local);
        local->ops->stop(&local->hw);
        trace_802154_drv_return_void(local);

        /* sync away all work on the tasklet before clearing started */
        tasklet_disable(&local->tasklet);
        tasklet_enable(&local->tasklet);

        /* Keep the compiler from moving the store below above this point. */
        barrier();

        local->started = false;
}

/*
 * Ask the driver to switch to @channel on @page; the call and its return
 * value are traced, and the callback's result is returned.
 */
static inline int
drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
{
        int err;

        might_sleep();

        trace_802154_drv_set_channel(local, page, channel);

        err = local->ops->set_channel(&local->hw, page, channel);
        trace_802154_drv_return_int(local, err);

        return err;
}

/*
 * Set the transmit power to @mbm via the driver's set_txpower callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_txpower) {
                trace_802154_drv_set_tx_power(local, mbm);
                rc = local->ops->set_txpower(&local->hw, mbm);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Apply the CCA settings in @cca via the driver's set_cca_mode callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int drv_set_cca_mode(struct ieee802154_local *local,
                                   const struct wpan_phy_cca *cca)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_cca_mode) {
                trace_802154_drv_set_cca_mode(local, cca);
                rc = local->ops->set_cca_mode(&local->hw, cca);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Pass @mode to the driver's set_lbt callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_lbt) {
                trace_802154_drv_set_lbt_mode(local, mode);
                rc = local->ops->set_lbt(&local->hw, mode);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Pass the CCA energy-detection level @mbm to the driver's
 * set_cca_ed_level callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int
drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_cca_ed_level) {
                trace_802154_drv_set_cca_ed_level(local, mbm);
                rc = local->ops->set_cca_ed_level(&local->hw, mbm);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Update the PAN id in the hardware address filter.
 *
 * Only filt.pan_id is filled in; the callback is told which member is
 * valid through IEEE802154_AFILT_PANID_CHANGED. Returns the callback's
 * result, or -EOPNOTSUPP (after a WARN_ON) when set_hw_addr_filt is absent.
 */
static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
{
        struct ieee802154_hw_addr_filt filt;
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_hw_addr_filt) {
                filt.pan_id = pan_id;

                trace_802154_drv_set_pan_id(local, pan_id);
                rc = local->ops->set_hw_addr_filt(&local->hw, &filt,
                                                  IEEE802154_AFILT_PANID_CHANGED);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Update the extended (IEEE) address in the hardware address filter.
 *
 * Only filt.ieee_addr is filled in; the callback is told which member is
 * valid through IEEE802154_AFILT_IEEEADDR_CHANGED. Returns the callback's
 * result, or -EOPNOTSUPP (after a WARN_ON) when set_hw_addr_filt is absent.
 */
static inline int
drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
{
        struct ieee802154_hw_addr_filt filt;
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_hw_addr_filt) {
                filt.ieee_addr = extended_addr;

                trace_802154_drv_set_extended_addr(local, extended_addr);
                rc = local->ops->set_hw_addr_filt(&local->hw, &filt,
                                                  IEEE802154_AFILT_IEEEADDR_CHANGED);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Update the short address in the hardware address filter.
 *
 * Only filt.short_addr is filled in; the callback is told which member is
 * valid through IEEE802154_AFILT_SADDR_CHANGED. Returns the callback's
 * result, or -EOPNOTSUPP (after a WARN_ON) when set_hw_addr_filt is absent.
 */
static inline int
drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
{
        struct ieee802154_hw_addr_filt filt;
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_hw_addr_filt) {
                filt.short_addr = short_addr;

                trace_802154_drv_set_short_addr(local, short_addr);
                rc = local->ops->set_hw_addr_filt(&local->hw, &filt,
                                                  IEEE802154_AFILT_SADDR_CHANGED);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Update the PAN-coordinator flag in the hardware address filter.
 *
 * Only filt.pan_coord is filled in; the callback is told which member is
 * valid through IEEE802154_AFILT_PANC_CHANGED. Returns the callback's
 * result, or -EOPNOTSUPP (after a WARN_ON) when set_hw_addr_filt is absent.
 */
static inline int
drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
{
        struct ieee802154_hw_addr_filt filt;
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_hw_addr_filt) {
                filt.pan_coord = is_coord;

                trace_802154_drv_set_pan_coord(local, is_coord);
                rc = local->ops->set_hw_addr_filt(&local->hw, &filt,
                                                  IEEE802154_AFILT_PANC_CHANGED);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Pass @min_be, @max_be and @max_csma_backoffs to the driver's
 * set_csma_params callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int
drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be,
                    u8 max_csma_backoffs)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_csma_params) {
                trace_802154_drv_set_csma_params(local, min_be, max_be,
                                                 max_csma_backoffs);
                rc = local->ops->set_csma_params(&local->hw, min_be, max_be,
                                                 max_csma_backoffs);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Pass @max_frame_retries to the driver's set_frame_retries callback.
 *
 * Returns the callback's result, or -EOPNOTSUPP (after a WARN_ON) when
 * the driver does not provide the callback.
 */
static inline int
drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries)
{
        int rc = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_frame_retries) {
                trace_802154_drv_set_max_frame_retries(local, max_frame_retries);
                rc = local->ops->set_frame_retries(&local->hw, max_frame_retries);
                trace_802154_drv_return_int(local, rc);
        } else {
                WARN_ON(1);
        }

        return rc;
}

/*
 * Switch promiscuous mode @on or off through the driver's
 * set_promiscuous_mode() callback, tracing the request and its result.
 *
 * A driver without that callback triggers a warning and -EOPNOTSUPP;
 * otherwise the callback's result is returned.  Annotated with might_sleep().
 */
static inline int
drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
{
        int ret = -EOPNOTSUPP;

        might_sleep();

        if (local->ops->set_promiscuous_mode) {
                trace_802154_drv_set_promiscuous_mode(local, on);
                ret = local->ops->set_promiscuous_mode(&local->hw, on);
                trace_802154_drv_return_int(local, ret);
        } else {
                WARN_ON(1);
        }

        return ret;
}

#endif /* __MAC802154_DRIVER_OPS */








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This file is provided under a dual BSD/GPLv2 license.
 *
 * SipHash: a fast short-input PRF
 * https://131002.net/siphash/
 *
 * This implementation is specifically for SipHash2-4 for a secure PRF
 * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
 * hashtables.
 */

#ifndef _LINUX_SIPHASH_H
#define _LINUX_SIPHASH_H

#include <linux/types.h>
#include <linux/kernel.h>

/* Alignment siphash() checks @data against before taking the aligned path. */
#define SIPHASH_ALIGNMENT __alignof__(u64)
/* SipHash key: two 64-bit words. */
typedef struct {
        u64 key[2];
} siphash_key_t;

/* Report whether both 64-bit words of @key are zero. */
static inline bool siphash_key_is_zero(const siphash_key_t *key)
{
        return key->key[0] == 0 && key->key[1] == 0;
}

u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);

u64 siphash_1u64(const u64 a, const siphash_key_t *key);
u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
                 const siphash_key_t *key);
u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
                 const siphash_key_t *key);
u64 siphash_1u32(const u32 a, const siphash_key_t *key);
u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
                 const siphash_key_t *key);

/*
 * Hash two 32-bit words by packing them into one u64 (@a in the low half,
 * @b in the high half) and hashing that with siphash_1u64().
 */
static inline u64 siphash_2u32(const u32 a, const u32 b,
                               const siphash_key_t *key)
{
        const u64 packed = ((u64)b << 32) | a;

        return siphash_1u64(packed, key);
}
static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
                               const u32 d, const siphash_key_t *key)
{
        return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
}


/*
 * Aligned-input dispatcher.  When @len is a compile-time constant of 4, 8,
 * 16, 24 or 32 bytes, the little-endian words are converted to CPU order and
 * passed to the matching fixed-width helper; every other case goes to the
 * generic __siphash_aligned().
 */
static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
                                     const siphash_key_t *key)
{
        if (__builtin_constant_p(len)) {
                switch (len) {
                case 4:
                        return siphash_1u32(le32_to_cpup((const __le32 *)data),
                                            key);
                case 8:
                        return siphash_1u64(le64_to_cpu(data[0]), key);
                case 16:
                        return siphash_2u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]), key);
                case 24:
                        return siphash_3u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]),
                                            le64_to_cpu(data[2]), key);
                case 32:
                        return siphash_4u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]),
                                            le64_to_cpu(data[2]),
                                            le64_to_cpu(data[3]), key);
                default:
                        break;
                }
        }

        return __siphash_aligned(data, len, key);
}

/**
 * siphash - compute 64-bit siphash PRF value
 * @data: buffer to hash
 * @len: size of @data
 * @key: the siphash key
 *
 * Uses __siphash_unaligned() when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * enabled or when @data is not aligned to SIPHASH_ALIGNMENT; otherwise takes
 * the aligned path through ___siphash_aligned().
 *
 * Return: the 64-bit value produced by the selected implementation.
 */
static inline u64 siphash(const void *data, size_t len,
                          const siphash_key_t *key)
{
        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
                return __siphash_unaligned(data, len, key);
        return ___siphash_aligned(data, len, key);
}

/* Alignment hsiphash() checks @data against before taking the aligned path. */
#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
/* HalfSipHash key: two unsigned long words. */
typedef struct {
        unsigned long key[2];
} hsiphash_key_t;

u32 __hsiphash_aligned(const void *data, size_t len,
                       const hsiphash_key_t *key);
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key);

u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
                  const hsiphash_key_t *key);
u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
                  const hsiphash_key_t *key);

/*
 * Aligned-input dispatcher.  When @len is a compile-time constant of 4, 8,
 * 12 or 16 bytes, the little-endian words are converted to CPU order and
 * passed to the matching fixed-width helper; every other case goes to the
 * generic __hsiphash_aligned().
 */
static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
                                      const hsiphash_key_t *key)
{
        if (__builtin_constant_p(len)) {
                switch (len) {
                case 4:
                        return hsiphash_1u32(le32_to_cpu(data[0]), key);
                case 8:
                        return hsiphash_2u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]), key);
                case 12:
                        return hsiphash_3u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]),
                                             le32_to_cpu(data[2]), key);
                case 16:
                        return hsiphash_4u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]),
                                             le32_to_cpu(data[2]),
                                             le32_to_cpu(data[3]), key);
                default:
                        break;
                }
        }

        return __hsiphash_aligned(data, len, key);
}

/**
 * hsiphash - compute 32-bit hsiphash PRF value
 * @data: buffer to hash
 * @len: size of @data
 * @key: the hsiphash key
 *
 * Uses __hsiphash_unaligned() when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * enabled or when @data is not aligned to HSIPHASH_ALIGNMENT; otherwise takes
 * the aligned path through ___hsiphash_aligned().
 *
 * Return: the 32-bit value produced by the selected implementation.
 */
static inline u32 hsiphash(const void *data, size_t len,
                           const hsiphash_key_t *key)
{
        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
                return __hsiphash_unaligned(data, len, key);
        return ___hsiphash_aligned(data, len, key);
}

/*
 * These macros expose the raw SipHash and HalfSipHash permutations.
 * Do not use them directly! If you think you have a use for them,
 * be sure to CC the maintainer of this file explaining why.
 */

/*
 * One pass of add / rotate-left (rol64) / xor steps over the four 64-bit
 * words @a, @b, @c, @d, written as a single comma expression that updates the
 * arguments in place.  Each argument is expanded several times, so callers
 * must pass plain lvalues without side effects.
 */
#define SIPHASH_PERMUTATION(a, b, c, d) ( \
        (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \
        (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \
        (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \
        (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32))

/*
 * Read byte by byte from the most significant end, these four values are the
 * ASCII strings "somepseu", "dorandom", "lygenera" and "tedbytes".
 * NOTE(review): presumably the initial state constants of SipHash - confirm
 * against the implementation that uses them.
 */
#define SIPHASH_CONST_0 0x736f6d6570736575ULL
#define SIPHASH_CONST_1 0x646f72616e646f6dULL
#define SIPHASH_CONST_2 0x6c7967656e657261ULL
#define SIPHASH_CONST_3 0x7465646279746573ULL

/*
 * 32-bit counterpart of SIPHASH_PERMUTATION: the same sequence of add /
 * rotate-left (rol32) / xor steps with different rotation counts, again as one
 * comma expression that updates @a, @b, @c, @d in place.  Each argument is
 * expanded several times, so callers must pass plain lvalues without side
 * effects.
 */
#define HSIPHASH_PERMUTATION(a, b, c, d) ( \
        (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \
        (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \
        (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \
        (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16))

/*
 * Constants 0 and 1 are zero; constants 2 and 3 read as the ASCII strings
 * "lyge" and "tedb" from the most significant byte down.
 */
#define HSIPHASH_CONST_0 0U
#define HSIPHASH_CONST_1 0U
#define HSIPHASH_CONST_2 0x6c796765U
#define HSIPHASH_CONST_3 0x74656462U

#endif /* _LINUX_SIPHASH_H */










































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef __SOUND_CORE_H
#define __SOUND_CORE_H

/*
 *  Main header file for the ALSA driver
 *  Copyright (c) 1994-2001 by Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/device.h>
#include <linux/sched.h>                /* wake_up() */
#include <linux/mutex.h>                /* struct mutex */
#include <linux/rwsem.h>                /* struct rw_semaphore */
#include <linux/pm.h>                        /* pm_message_t */
#include <linux/stringify.h>
#include <linux/printk.h>

/* number of supported soundcards */
#ifdef CONFIG_SND_DYNAMIC_MINORS
#define SNDRV_CARDS CONFIG_SND_MAX_CARDS
#else
#define SNDRV_CARDS 8                /* don't change - minor numbers */
#endif

#define CONFIG_SND_MAJOR        116        /* standard configuration */

/* forward declarations */
struct pci_dev;
struct module;
struct completion;

/* device allocation stuff */

/*
 * Type of the object used in snd_device_*().
 * The order of the enumerators also defines the calling order, which is why
 * SNDRV_DEV_CONTROL has to stay the last entry.
 */
enum snd_device_type {
        SNDRV_DEV_LOWLEVEL,
        SNDRV_DEV_INFO,
        SNDRV_DEV_BUS,
        SNDRV_DEV_CODEC,
        SNDRV_DEV_PCM,
        SNDRV_DEV_COMPRESS,
        SNDRV_DEV_RAWMIDI,
        SNDRV_DEV_TIMER,
        SNDRV_DEV_SEQUENCER,
        SNDRV_DEV_HWDEP,
        SNDRV_DEV_JACK,
        SNDRV_DEV_CONTROL,        /* NOTE: this must be the last one */
};

/* State value kept in the state member of struct snd_device. */
enum snd_device_state {
        SNDRV_DEV_BUILD,
        SNDRV_DEV_REGISTERED,
        SNDRV_DEV_DISCONNECTED,
};

struct snd_device;

/*
 * Callback table attached to a struct snd_device (its ops member) and passed
 * to snd_device_new().  Each callback receives the device and returns an int.
 */
struct snd_device_ops {
        int (*dev_free)(struct snd_device *dev);
        int (*dev_register)(struct snd_device *dev);
        int (*dev_disconnect)(struct snd_device *dev);
};

/* One device object belonging to a sound card. */
struct snd_device {
        struct list_head list;                /* list of registered devices */
        struct snd_card *card;                /* card which holds this device */
        enum snd_device_state state;        /* state of the device */
        enum snd_device_type type;        /* device type */
        void *device_data;                /* device structure */
        const struct snd_device_ops *ops;        /* operations */
};

/* Map list node @n back to the struct snd_device that embeds it as 'list'. */
#define snd_device(n) list_entry(n, struct snd_device, list)

/*
 * Main structure for a soundcard.
 *
 * Embeds two struct device objects (ctl_dev and card_dev); dev_to_snd_card()
 * maps a card_dev pointer back to this structure.  The power_* members exist
 * only with CONFIG_PM and the mixer_oss* members only when
 * CONFIG_SND_MIXER_OSS is enabled.
 */

struct snd_card {
        int number;                        /* number of soundcard (index to
                                                                snd_cards) */

        char id[16];                        /* id string of this card */
        char driver[16];                /* driver name */
        char shortname[32];                /* short name of this soundcard */
        char longname[80];                /* name of this soundcard */
        char irq_descr[32];                /* Interrupt description */
        char mixername[80];                /* mixer name */
        char components[128];                /* card components delimited with
                                                                space */
        struct module *module;                /* top-level module */

        void *private_data;                /* private data for soundcard */
        void (*private_free) (struct snd_card *card); /* callback for freeing of
                                                                private data */
        struct list_head devices;        /* devices */

        struct device ctl_dev;                /* control device */
        unsigned int last_numid;        /* last used numeric ID */
        struct rw_semaphore controls_rwsem;        /* controls list lock */
        rwlock_t ctl_files_rwlock;        /* ctl_files list lock */
        int controls_count;                /* count of all controls */
        int user_ctl_count;                /* count of all user controls */
        struct list_head controls;        /* all controls for this card */
        struct list_head ctl_files;        /* active control files */

        struct snd_info_entry *proc_root;        /* root for soundcard specific files */
        struct proc_dir_entry *proc_root_link;        /* number link to real id */

        struct list_head files_list;        /* all files associated to this card */
        struct snd_shutdown_f_ops *s_f_ops; /* file operations in the shutdown
                                                                state */
        spinlock_t files_lock;                /* lock the files for this card */
        int shutdown;                        /* this card is going down */
        struct completion *release_completion;
        struct device *dev;                /* device assigned to this card */
        struct device card_dev;                /* cardX object for sysfs */
        const struct attribute_group *dev_groups[4]; /* assigned sysfs attr */
        bool registered;                /* card_dev is registered? */
        int sync_irq;                        /* assigned irq, used for PCM sync */
        wait_queue_head_t remove_sleep;

        size_t total_pcm_alloc_bytes;        /* total amount of allocated buffers */
        struct mutex memory_mutex;        /* protection for the above */

#ifdef CONFIG_PM
        unsigned int power_state;        /* power state */
        wait_queue_head_t power_sleep;        /* woken by snd_power_change_state() */
#endif

#if IS_ENABLED(CONFIG_SND_MIXER_OSS)
        struct snd_mixer_oss *mixer_oss;
        int mixer_oss_change_count;
#endif
};

/* Map a pointer to the embedded card_dev back to its struct snd_card. */
#define dev_to_snd_card(p)        container_of(p, struct snd_card, card_dev)

#ifdef CONFIG_PM
/* Return the power state currently stored in @card. */
static inline unsigned int snd_power_get_state(struct snd_card *card)
{
        return card->power_state;
}

/* Store @state as the card's power state, then wake waiters on power_sleep. */
static inline void snd_power_change_state(struct snd_card *card, unsigned int state)
{
        card->power_state = state;
        wake_up(&card->power_sleep);
}

/* init.c */
int snd_power_wait(struct snd_card *card, unsigned int power_state);

#else /* ! CONFIG_PM */

/*
 * Without CONFIG_PM: snd_power_wait() returns 0 immediately, the state always
 * reads as SNDRV_CTL_POWER_D0, and state changes are discarded (the state
 * argument is not evaluated).
 */
static inline int snd_power_wait(struct snd_card *card, unsigned int state) { return 0; }
#define snd_power_get_state(card)        ({ (void)(card); SNDRV_CTL_POWER_D0; })
#define snd_power_change_state(card, state)        do { (void)(card); } while (0)

#endif /* CONFIG_PM */

/* Bookkeeping record for one minor: identity, file operations and owners. */
struct snd_minor {
        int type;                        /* SNDRV_DEVICE_TYPE_XXX */
        int card;                        /* card number */
        int device;                        /* device number */
        const struct file_operations *f_ops;        /* file operations */
        void *private_data;                /* private data for f_ops->open */
        struct device *dev;                /* device for sysfs */
        struct snd_card *card_ptr;        /* assigned card instance */
};

/*
 * Return the device that each sound device of @card links to as its parent:
 * the card's embedded card_dev, or NULL when @card is NULL.
 */
static inline struct device *snd_card_get_device_link(struct snd_card *card)
{
        if (!card)
                return NULL;

        return &card->card_dev;
}

/* sound.c */

extern int snd_major;
extern int snd_ecards_limit;
extern struct class *sound_class;

void snd_request_card(int card);

void snd_device_initialize(struct device *dev, struct snd_card *card);

int snd_register_device(int type, struct snd_card *card, int dev,
                        const struct file_operations *f_ops,
                        void *private_data, struct device *device);
int snd_unregister_device(struct device *dev);
void *snd_lookup_minor_data(unsigned int minor, int type);

#ifdef CONFIG_SND_OSSEMUL
int snd_register_oss_device(int type, struct snd_card *card, int dev,
                            const struct file_operations *f_ops, void *private_data);
int snd_unregister_oss_device(int type, struct snd_card *card, int dev);
void *snd_lookup_oss_minor_data(unsigned int minor, int type);
#endif

int snd_minor_info_init(void);

/* sound_oss.c */

#ifdef CONFIG_SND_OSSEMUL
int snd_minor_info_oss_init(void);
#else
static inline int snd_minor_info_oss_init(void) { return 0; }
#endif

/* memory.c */

int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count);
int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count);

/* init.c */

int snd_card_locked(int card);
#if IS_ENABLED(CONFIG_SND_MIXER_OSS)
#define SND_MIXER_OSS_NOTIFY_REGISTER        0
#define SND_MIXER_OSS_NOTIFY_DISCONNECT        1
#define SND_MIXER_OSS_NOTIFY_FREE        2
extern int (*snd_mixer_oss_notify_callback)(struct snd_card *card, int cmd);
#endif

int snd_card_new(struct device *parent, int idx, const char *xid,
                 struct module *module, int extra_size,
                 struct snd_card **card_ret);

int snd_card_disconnect(struct snd_card *card);
void snd_card_disconnect_sync(struct snd_card *card);
int snd_card_free(struct snd_card *card);
int snd_card_free_when_closed(struct snd_card *card);
void snd_card_set_id(struct snd_card *card, const char *id);
int snd_card_register(struct snd_card *card);
int snd_card_info_init(void);
int snd_card_add_dev_attr(struct snd_card *card,
                          const struct attribute_group *group);
int snd_component_add(struct snd_card *card, const char *component);
int snd_card_file_add(struct snd_card *card, struct file *file);
int snd_card_file_remove(struct snd_card *card, struct file *file);

struct snd_card *snd_card_ref(int card);

/**
 * snd_card_unref - Unreference the card object
 * @card: the card object to unreference
 *
 * Call this function for the card object that was obtained via snd_card_ref()
 * or snd_lookup_minor_data().
 *
 * The reference is dropped with put_device() on the card's card_dev.
 * @card is dereferenced unconditionally, so it must not be NULL.
 */
static inline void snd_card_unref(struct snd_card *card)
{
        put_device(&card->card_dev);
}

#define snd_card_set_dev(card, devptr) ((card)->dev = (devptr))

/* device.c */

int snd_device_new(struct snd_card *card, enum snd_device_type type,
                   void *device_data, const struct snd_device_ops *ops);
int snd_device_register(struct snd_card *card, void *device_data);
int snd_device_register_all(struct snd_card *card);
void snd_device_disconnect(struct snd_card *card, void *device_data);
void snd_device_disconnect_all(struct snd_card *card);
void snd_device_free(struct snd_card *card, void *device_data);
void snd_device_free_all(struct snd_card *card);
int snd_device_get_state(struct snd_card *card, void *device_data);

/* isadma.c */

#ifdef CONFIG_ISA_DMA_API
#define DMA_MODE_NO_ENABLE        0x0100

void snd_dma_program(unsigned long dma, unsigned long addr, unsigned int size, unsigned short mode);
void snd_dma_disable(unsigned long dma);
unsigned int snd_dma_pointer(unsigned long dma, unsigned int size);
#endif

/* misc.c */
struct resource;
void release_and_free_resource(struct resource *res);

/* --- */

/*
 * Sound printk debug levels, given as the first argument of __snd_printk().
 * The values are 0, 1 and 2 in declaration order.
 */
enum {
        SND_PR_ALWAYS,
        SND_PR_DEBUG,
        SND_PR_VERBOSE,
};

/*
 * With CONFIG_SND_DEBUG or CONFIG_SND_VERBOSE_PRINTK, __snd_printk() is a real
 * function taking the level and the caller's file/line.  Otherwise it is a
 * macro that drops level, file and line and expands to a plain printk().
 */
#if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK)
__printf(4, 5)
void __snd_printk(unsigned int level, const char *file, int line,
                  const char *format, ...);
#else
#define __snd_printk(level, file, line, format, ...) \
        printk(format, ##__VA_ARGS__)
#endif

/**
 * snd_printk - printk wrapper
 * @fmt: format string
 *
 * Works like printk() but prints the file and the line of the caller
 * when configured with CONFIG_SND_VERBOSE_PRINTK.
 *
 * The message is issued at level SND_PR_ALWAYS.
 */
#define snd_printk(fmt, ...) \
        __snd_printk(SND_PR_ALWAYS, __FILE__, __LINE__, fmt, ##__VA_ARGS__)

#ifdef CONFIG_SND_DEBUG
/**
 * snd_printd - debug printk
 * @fmt: format string
 *
 * Works like snd_printk() for debugging purposes.
 * Ignored when CONFIG_SND_DEBUG is not set.
 *
 * The message is issued at level SND_PR_DEBUG.
 */
#define snd_printd(fmt, ...) \
        __snd_printk(SND_PR_DEBUG, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
/* Like snd_printd(), but with the level chosen by the caller. */
#define _snd_printd(level, fmt, ...) \
        __snd_printk(level, __FILE__, __LINE__, fmt, ##__VA_ARGS__)

/**
 * snd_BUG - give a BUG warning message and stack trace
 *
 * Calls WARN() if CONFIG_SND_DEBUG is set.
 * Ignored when CONFIG_SND_DEBUG is not set.
 */
#define snd_BUG()                WARN(1, "BUG?\n")

/**
 * snd_printd_ratelimit - Suppress high rates of output when
 *                           CONFIG_SND_DEBUG is enabled.
 *
 * Expands to printk_ratelimit() in this configuration.
 */
#define snd_printd_ratelimit() printk_ratelimit()

/**
 * snd_BUG_ON - debugging check macro
 * @cond: condition to evaluate
 *
 * Has the same behavior as WARN_ON when CONFIG_SND_DEBUG is set,
 * otherwise just evaluates the conditional and returns the value.
 */
#define snd_BUG_ON(cond)        WARN_ON((cond))

#else /* !CONFIG_SND_DEBUG */

/*
 * !CONFIG_SND_DEBUG variants: the print helpers are empty inline functions
 * (still format-checked through __printf), snd_BUG() does nothing,
 * snd_BUG_ON() only evaluates its condition and yields its truth value, and
 * snd_printd_ratelimit() always returns false.
 */
__printf(1, 2)
static inline void snd_printd(const char *format, ...) {}
__printf(2, 3)
static inline void _snd_printd(int level, const char *format, ...) {}

#define snd_BUG()                        do { } while (0)

/* Evaluates @condition exactly once; the result is wrapped in unlikely(). */
#define snd_BUG_ON(condition) ({ \
        int __ret_warn_on = !!(condition); \
        unlikely(__ret_warn_on); \
})

static inline bool snd_printd_ratelimit(void) { return false; }

#endif /* CONFIG_SND_DEBUG */

#ifdef CONFIG_SND_DEBUG_VERBOSE
/**
 * snd_printdd - debug printk
 * @format: format string
 *
 * Works like snd_printk() for debugging purposes.
 * Ignored when CONFIG_SND_DEBUG_VERBOSE is not set.
 *
 * The message is issued at level SND_PR_VERBOSE.
 */
#define snd_printdd(format, ...) \
        __snd_printk(SND_PR_VERBOSE, __FILE__, __LINE__, format, ##__VA_ARGS__)
#else
__printf(1, 2)
static inline void snd_printdd(const char *format, ...) {}
#endif


#define SNDRV_OSS_VERSION         ((3<<16)|(8<<8)|(1<<4)|(0))        /* 3.8.1a */

/* for easier backward-porting */
#if IS_ENABLED(CONFIG_GAMEPORT)
/*
 * Small accessors for a gameport object @gp: set the parent of its embedded
 * dev, and set / read its port_data member.  Every expansion is fully
 * parenthesized so it can be used inside any larger expression.
 */
#define gameport_set_dev_parent(gp,xdev) ((gp)->dev.parent = (xdev))
#define gameport_set_port_data(gp,r) ((gp)->port_data = (r))
#define gameport_get_port_data(gp) ((gp)->port_data)
#endif

/*
 * PCI quirk list helper: one table entry that ties a subvendor/subdevice ID
 * (with a match bitmask) to a driver-defined value.  The optional name is
 * stored only when CONFIG_SND_DEBUG_VERBOSE is set.
 */
struct snd_pci_quirk {
        unsigned short subvendor;        /* PCI subvendor ID */
        unsigned short subdevice;        /* PCI subdevice ID */
        unsigned short subdevice_mask;        /* bitmask to match */
        int value;                        /* value */
#ifdef CONFIG_SND_DEBUG_VERBOSE
        const char *name;                /* name of the device (optional) */
#endif
};

/*
 * Designated-initializer helpers for struct snd_pci_quirk entries.
 *
 * _SND_PCI_QUIRK_ID_MASK() fills the three ID fields; _SND_PCI_QUIRK_ID() is
 * the same with a full 0xffff subdevice mask; SND_PCI_QUIRK_ID() wraps that in
 * braces to form a complete entry without a value.
 */
#define _SND_PCI_QUIRK_ID_MASK(vend, mask, dev)        \
        .subvendor = (vend), .subdevice = (dev), .subdevice_mask = (mask)
#define _SND_PCI_QUIRK_ID(vend, dev) \
        _SND_PCI_QUIRK_ID_MASK(vend, 0xffff, dev)
#define SND_PCI_QUIRK_ID(vend,dev) {_SND_PCI_QUIRK_ID(vend, dev)}
/*
 * Full entries.  SND_PCI_QUIRK_VENDOR() uses subdevice 0 with mask 0.  The
 * xname argument is stored only with CONFIG_SND_DEBUG_VERBOSE; otherwise it is
 * dropped and snd_pci_quirk_name() yields an empty string.
 */
#ifdef CONFIG_SND_DEBUG_VERBOSE
#define SND_PCI_QUIRK(vend,dev,xname,val) \
        {_SND_PCI_QUIRK_ID(vend, dev), .value = (val), .name = (xname)}
#define SND_PCI_QUIRK_VENDOR(vend, xname, val)                        \
        {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val), .name = (xname)}
#define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val)                        \
        {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev),                        \
                        .value = (val), .name = (xname)}
#define snd_pci_quirk_name(q)        ((q)->name)
#else
#define SND_PCI_QUIRK(vend,dev,xname,val) \
        {_SND_PCI_QUIRK_ID(vend, dev), .value = (val)}
#define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val)                        \
        {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev), .value = (val)}
#define SND_PCI_QUIRK_VENDOR(vend, xname, val)                        \
        {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val)}
#define snd_pci_quirk_name(q)        ""
#endif

/*
 * Quirk table lookups.  With CONFIG_PCI these are out-of-line functions;
 * without it both are inline stubs that always return NULL.
 */
#ifdef CONFIG_PCI
const struct snd_pci_quirk *
snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list);

const struct snd_pci_quirk *
snd_pci_quirk_lookup_id(u16 vendor, u16 device,
                        const struct snd_pci_quirk *list);
#else
static inline const struct snd_pci_quirk *
snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list)
{
        return NULL;
}

static inline const struct snd_pci_quirk *
snd_pci_quirk_lookup_id(u16 vendor, u16 device,
                        const struct snd_pci_quirk *list)
{
        return NULL;
}
#endif

/* async signal helpers */
struct snd_fasync;

int snd_fasync_helper(int fd, struct file *file, int on,
                      struct snd_fasync **fasyncp);
void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll);
void snd_fasync_free(struct snd_fasync *fasync);

#endif /* __SOUND_CORE_H */














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_DST_CACHE_H
#define _NET_DST_CACHE_H

#include <linux/jiffies.h>
#include <net/dst.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_fib.h>
#endif

/* Cache handle: a per-CPU storage pointer plus the time of the last reset. */
struct dst_cache {
        struct dst_cache_pcpu __percpu *cache;
        unsigned long reset_ts;        /* set to jiffies by dst_cache_reset() */
};

/**
 *        dst_cache_get - perform cache lookup
 *        @dst_cache: the cache
 *
 *        The caller should use dst_cache_get_ip4() if it needs to retrieve the
 *        source address to be used when xmitting to the cached dst.
 *        local BH must be disabled.
 */
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache);

/**
 *        dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address
 *        @dst_cache: the cache
 *        @saddr: return value for the retrieved source address
 *
 *        local BH must be disabled.
 */
struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr);

/**
 *        dst_cache_set_ip4 - store the ipv4 dst into the cache
 *        @dst_cache: the cache
 *        @dst: the entry to be cached
 *        @saddr: the source address to be stored inside the cache
 *
 *        local BH must be disabled.
 */
void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
                       __be32 saddr);

#if IS_ENABLED(CONFIG_IPV6)

/**
 *        dst_cache_set_ip6 - store the ipv6 dst into the cache
 *        @dst_cache: the cache
 *        @dst: the entry to be cached
 *        @saddr: the source address to be stored inside the cache
 *
 *        local BH must be disabled.
 */
void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
                       const struct in6_addr *saddr);

/**
 *        dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address
 *        @dst_cache: the cache
 *        @saddr: return value for the retrieved source address
 *
 *        local BH must be disabled.
 */
struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
                                    struct in6_addr *saddr);
#endif

/**
 *        dst_cache_reset - invalidate the cache contents
 *        @dst_cache: the cache
 *
 *        This does not free the cached dst, to avoid races and contention;
 *        the dst will be freed on a later cache lookup.  Only the reset
 *        timestamp is updated here, to the current jiffies value.
 */
static inline void dst_cache_reset(struct dst_cache *dst_cache)
{
        dst_cache->reset_ts = jiffies;
}

/**
 *        dst_cache_reset_now - invalidate the cache contents immediately
 *        @dst_cache: the cache
 *
 *        The caller must be sure there are no concurrent users, as this frees
 *        all dst_cache users immediately, rather than waiting for the next
 *        per-cpu usage like dst_cache_reset does. Most callers should use the
 *        higher speed lazily-freed dst_cache_reset function instead.
 */
void dst_cache_reset_now(struct dst_cache *dst_cache);

/**
 *        dst_cache_init - initialize the cache, allocating the required storage
 *        @dst_cache: the cache
 *        @gfp: allocation flags
 */
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);

/**
 *        dst_cache_destroy - empty the cache and free the allocated storage
 *        @dst_cache: the cache
 *
 *        No synchronization is enforced: it must be called only when the cache
 *        is unused.
 */
void dst_cache_destroy(struct dst_cache *dst_cache);

#endif




























































































































    3 






    3 
















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Synchronous Cryptographic Hash operations.
 *
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/scatterwalk.h>
#include <crypto/internal/hash.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cryptouser.h>
#include <net/netlink.h>
#include <linux/compiler.h>

#include "internal.h"

/*
 * Forward declaration: the shash frontend type is referenced by functions in
 * this file before its definition further down.
 */
static const struct crypto_type crypto_shash_type;

/*
 * Placeholder ->setkey() for algorithms that do not take a key.
 *
 * shash_prepare_alg() installs this when an algorithm leaves ->setkey unset,
 * and crypto_shash_alg_has_setkey() compares against its address.  All
 * arguments are ignored; the call always fails with -ENOSYS.
 */
static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
                           unsigned int keylen)
{
        return -ENOSYS;
}

/*
 * Check whether an shash algorithm has a setkey function.
 *
 * For CFI compatibility, this must not be an inline function.  This is because
 * when CFI is enabled, modules won't get the same address for shash_no_setkey
 * (if it were exported, which inlining would require) as the core kernel will.
 */
bool crypto_shash_alg_has_setkey(struct shash_alg *alg)
{
        return alg->setkey != shash_no_setkey;
}
EXPORT_SYMBOL_GPL(crypto_shash_alg_has_setkey);

/*
 * Set the key through a temporary, suitably aligned copy.
 *
 * A bounce buffer big enough for @keylen bytes plus alignment slack is
 * allocated with GFP_ATOMIC, the key is copied to the first address inside it
 * that satisfies the transform's alignmask, and the algorithm's ->setkey() is
 * called on that copy.  The buffer held a copy of the key, so it is released
 * with kfree_sensitive().
 *
 * Returns -ENOMEM if the bounce buffer cannot be allocated, otherwise the
 * result of ->setkey().
 */
static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
                                  unsigned int keylen)
{
        struct shash_alg *alg = crypto_shash_alg(tfm);
        unsigned long mask = crypto_shash_alignmask(tfm);
        unsigned long slack = mask & ~(crypto_tfm_ctx_alignment() - 1);
        u8 *bounce;
        u8 *aligned_key;
        int ret;

        bounce = kmalloc(keylen + slack, GFP_ATOMIC);
        if (!bounce)
                return -ENOMEM;

        aligned_key = (u8 *)ALIGN((unsigned long)bounce, mask + 1);
        memcpy(aligned_key, key, keylen);
        ret = alg->setkey(tfm, aligned_key, keylen);

        kfree_sensitive(bounce);
        return ret;
}

/*
 * Flag @tfm with CRYPTO_TFM_NEED_KEY when crypto_shash_alg_needs_key() says
 * @alg requires a key; otherwise leave the flags untouched.
 */
static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg)
{
        if (!crypto_shash_alg_needs_key(alg))
                return;

        crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen)
{
        struct shash_alg *shash = crypto_shash_alg(tfm);
        unsigned long alignmask = crypto_shash_alignmask(tfm);
        int err;

        if ((unsigned long)key & alignmask)
                err = shash_setkey_unaligned(tfm, key, keylen);
        else
                err = shash->setkey(tfm, key, keylen);

        if (unlikely(err)) {
                shash_set_needkey(tfm, shash);
                return err;
        }

        crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_shash_setkey);

/*
 * Feed @len bytes starting at a misaligned @data pointer to ->update().
 *
 * The bytes in front of the next alignment boundary (clamped to @len) are
 * copied into an aligned on-stack bounce buffer and hashed from there; the
 * remainder starts on an aligned address and is passed to ->update()
 * directly.  When @len does not reach past the boundary the second call is
 * still made, with a length of zero.
 *
 * Returns -EINVAL if the bounce area would not fit inside the stack buffer,
 * the error of the first ->update() call if it fails, and otherwise the
 * result of the second ->update() call.
 */
static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
                                  unsigned int len)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *shash = crypto_shash_alg(tfm);
        unsigned long alignmask = crypto_shash_alignmask(tfm);
        /* Distance from @data to the next address that satisfies alignmask. */
        unsigned int unaligned_len = alignmask + 1 -
                                     ((unsigned long)data & alignmask);
        /*
         * We cannot count on __aligned() working for large values:
         * https://patchwork.kernel.org/patch/9507697/
         */
        u8 ubuf[MAX_ALGAPI_ALIGNMASK * 2];
        u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
        int err;

        if (WARN_ON(buf + unaligned_len > ubuf + sizeof(ubuf)))
                return -EINVAL;

        if (unaligned_len > len)
                unaligned_len = len;

        memcpy(buf, data, unaligned_len);
        err = shash->update(desc, buf, unaligned_len);
        /*
         * Wipe the copied input.  memzero_explicit() is used instead of
         * memset() because the buffer is a local that is never read again
         * when we bail out with an error, so a plain memset() is a dead
         * store the compiler is allowed to discard.
         */
        memzero_explicit(buf, unaligned_len);

        return err ?:
               shash->update(desc, data + unaligned_len, len - unaligned_len);
}

int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                        unsigned int len)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *shash = crypto_shash_alg(tfm);
        unsigned long alignmask = crypto_shash_alignmask(tfm);

        if ((unsigned long)data & alignmask)
                return shash_update_unaligned(desc, data, len);

        return shash->update(desc, data, len);
}
EXPORT_SYMBOL_GPL(crypto_shash_update);

/*
 * Produce the final digest when @out violates the transform's alignmask.
 *
 * ->final() writes into an aligned on-stack buffer; on success the
 * digestsize bytes are then copied to @out.  The stack copy is wiped before
 * returning in either case.
 *
 * Returns -EINVAL if the digest would not fit in the stack buffer, otherwise
 * the result of ->final().
 */
static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned long alignmask = crypto_shash_alignmask(tfm);
        struct shash_alg *shash = crypto_shash_alg(tfm);
        unsigned int ds = crypto_shash_digestsize(tfm);
        /*
         * We cannot count on __aligned() working for large values:
         * https://patchwork.kernel.org/patch/9507697/
         */
        u8 ubuf[MAX_ALGAPI_ALIGNMASK + HASH_MAX_DIGESTSIZE];
        u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
        int err;

        if (WARN_ON(buf + ds > ubuf + sizeof(ubuf)))
                return -EINVAL;

        err = shash->final(desc, buf);
        if (!err)
                memcpy(out, buf, ds);

        /*
         * The buffer goes out of scope right after this, so a plain memset()
         * would be a dead store the compiler may remove; memzero_explicit()
         * guarantees the digest copy is really cleared from the stack.
         */
        memzero_explicit(buf, ds);
        return err;
}

/*
 * Finish the hash in @desc and write the digest to @out.
 *
 * An @out pointer that satisfies the transform's alignmask is passed to the
 * algorithm's ->final() directly; otherwise shash_final_unaligned() is used.
 */
int crypto_shash_final(struct shash_desc *desc, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned long mask = crypto_shash_alignmask(tfm);

        if (!((unsigned long)out & mask))
                return crypto_shash_alg(tfm)->final(desc, out);

        return shash_final_unaligned(desc, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_final);

/*
 * Generic finup: crypto_shash_update() followed by crypto_shash_final().
 *
 * Used for misaligned buffers and installed by shash_prepare_alg() as the
 * ->finup of algorithms that do not supply one.  The final step is skipped
 * when the update fails, and that error is returned.
 */
static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data,
                                 unsigned int len, u8 *out)
{
        int ret = crypto_shash_update(desc, data, len);

        if (ret)
                return ret;

        return crypto_shash_final(desc, out);
}

/*
 * Hash the last @len bytes at @data and write the digest to @out.
 *
 * If either @data or @out violates the transform's alignmask the generic
 * shash_finup_unaligned() path is taken; otherwise the algorithm's ->finup()
 * is called directly.
 */
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned long ptrs = (unsigned long)data | (unsigned long)out;

        if (!(ptrs & crypto_shash_alignmask(tfm)))
                return crypto_shash_alg(tfm)->finup(desc, data, len, out);

        return shash_finup_unaligned(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_finup);

/*
 * Generic one-shot digest: crypto_shash_init() followed by
 * crypto_shash_finup().
 *
 * Used for misaligned buffers and installed by shash_prepare_alg() as the
 * ->digest of algorithms that do not supply one.  A failing init is returned
 * without calling finup.
 */
static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data,
                                  unsigned int len, u8 *out)
{
        int ret = crypto_shash_init(desc);

        if (ret)
                return ret;

        return crypto_shash_finup(desc, data, len, out);
}

/*
 * Compute the digest of @len bytes at @data in one call and store it in @out.
 *
 * Fails with -ENOKEY while the transform still carries CRYPTO_TFM_NEED_KEY.
 * Otherwise the algorithm's ->digest() is used when both @data and @out
 * satisfy the alignmask, and shash_digest_unaligned() when either does not.
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned long ptrs = (unsigned long)data | (unsigned long)out;

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (ptrs & crypto_shash_alignmask(tfm))
                return shash_digest_unaligned(desc, data, len, out);

        return crypto_shash_alg(tfm)->digest(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_digest);

/*
 * One-shot digest for callers that only hold a transform.
 *
 * A descriptor is set up on the stack, bound to @tfm, passed to
 * crypto_shash_digest() and cleared with shash_desc_zero() before the
 * digest result code is returned.
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out)
{
        SHASH_DESC_ON_STACK(desc, tfm);
        int ret;

        desc->tfm = tfm;
        ret = crypto_shash_digest(desc, data, len, out);

        /* Clear the on-stack descriptor whether or not the digest succeeded. */
        shash_desc_zero(desc);
        return ret;
}
EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest);

/*
 * Default ->export(): copy the raw descriptor context, descsize bytes long,
 * to @out.  Installed by shash_prepare_alg() when the algorithm supplies no
 * export/import pair.  Always returns 0.
 */
static int shash_default_export(struct shash_desc *desc, void *out)
{
        unsigned int size = crypto_shash_descsize(desc->tfm);

        memcpy(out, shash_desc_ctx(desc), size);
        return 0;
}

/*
 * Default ->import(): overwrite the descriptor context with descsize bytes
 * read from @in.  Counterpart of shash_default_export().  Always returns 0.
 */
static int shash_default_import(struct shash_desc *desc, const void *in)
{
        unsigned int size = crypto_shash_descsize(desc->tfm);

        memcpy(shash_desc_ctx(desc), in, size);
        return 0;
}

/*
 * ahash ->setkey() of the shash wrapper: forward the key to the shash
 * transform whose pointer is stored in the ahash context.
 */
static int shash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
                              unsigned int keylen)
{
        struct crypto_shash **ctx = crypto_ahash_ctx(tfm);
        struct crypto_shash *shash = *ctx;

        return crypto_shash_setkey(shash, key, keylen);
}

/*
 * ahash ->init() of the shash wrapper: bind the request's descriptor to the
 * wrapped shash transform and initialise it.
 */
static int shash_async_init(struct ahash_request *req)
{
        struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
        struct crypto_shash **ctx = crypto_ahash_ctx(ahash);
        struct shash_desc *desc = ahash_request_ctx(req);

        desc->tfm = *ctx;
        return crypto_shash_init(desc);
}

/*
 * Hash all data described by @req into @desc, one walk chunk at a time.
 *
 * Each chunk is passed to crypto_shash_update(); that call's result is given
 * to crypto_hash_walk_done(), whose return value is the size of the next
 * chunk.  The loop stops on the first non-positive value, which is returned.
 */
int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc)
{
        struct crypto_hash_walk walk;
        int nbytes = crypto_hash_walk_first(req, &walk);

        while (nbytes > 0) {
                nbytes = crypto_shash_update(desc, walk.data, nbytes);
                nbytes = crypto_hash_walk_done(&walk, nbytes);
        }

        return nbytes;
}
EXPORT_SYMBOL_GPL(shash_ahash_update);

/*
 * ahash ->update() of the shash wrapper: run shash_ahash_update() on the
 * descriptor stored in the request context.
 */
static int shash_async_update(struct ahash_request *req)
{
        struct shash_desc *desc = ahash_request_ctx(req);

        return shash_ahash_update(req, desc);
}

/*
 * ahash ->final() of the shash wrapper: finalise the descriptor stored in the
 * request context and write the digest to req->result.
 */
static int shash_async_final(struct ahash_request *req)
{
        struct shash_desc *desc = ahash_request_ctx(req);

        return crypto_shash_final(desc, req->result);
}

/*
 * Hash the data described by @req into @desc and produce the digest in
 * req->result.
 *
 * If the walk reports zero bytes up front, only crypto_shash_final() is
 * called.  Otherwise every chunk is fed through crypto_shash_update() except
 * the one crypto_hash_walk_last() identifies as last, which goes through
 * crypto_shash_finup().  Each step's result is passed to
 * crypto_hash_walk_done(); the loop ends on its first non-positive return
 * value, which is returned.
 */
int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc)
{
        struct crypto_hash_walk walk;
        int nbytes = crypto_hash_walk_first(req, &walk);

        if (!nbytes)
                return crypto_shash_final(desc, req->result);

        do {
                int ret;

                if (crypto_hash_walk_last(&walk))
                        ret = crypto_shash_finup(desc, walk.data, nbytes,
                                                 req->result);
                else
                        ret = crypto_shash_update(desc, walk.data, nbytes);

                nbytes = crypto_hash_walk_done(&walk, ret);
        } while (nbytes > 0);

        return nbytes;
}
EXPORT_SYMBOL_GPL(shash_ahash_finup);

/*
 * ahash ->finup() of the shash wrapper: bind the request's descriptor to the
 * wrapped shash transform, then run shash_ahash_finup().
 */
static int shash_async_finup(struct ahash_request *req)
{
        struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
        struct crypto_shash **ctx = crypto_ahash_ctx(ahash);
        struct shash_desc *desc = ahash_request_ctx(req);

        desc->tfm = *ctx;
        return shash_ahash_finup(req, desc);
}

/*
 * One-shot digest of the data described by @req, using @desc.
 *
 * Fast path: when the request is non-empty and all of it lies within the
 * first scatterlist entry without running past the end of that entry's page,
 * the page is mapped with kmap_atomic() and hashed with a single
 * crypto_shash_digest() call.
 *
 * Otherwise (empty request, or data that does not fit that window) the
 * descriptor is initialised and shash_ahash_finup() walks the request.
 */
int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc)
{
        unsigned int nbytes = req->nbytes;

        if (nbytes) {
                struct scatterlist *sg = req->src;
                unsigned int offset = sg->offset;
                unsigned int limit = min(sg->length,
                                         ((unsigned int)(PAGE_SIZE)) - offset);

                if (nbytes <= limit) {
                        void *data = kmap_atomic(sg_page(sg));
                        int err;

                        err = crypto_shash_digest(desc, data + offset, nbytes,
                                                  req->result);
                        kunmap_atomic(data);
                        return err;
                }
        }

        return crypto_shash_init(desc) ?: shash_ahash_finup(req, desc);
}
EXPORT_SYMBOL_GPL(shash_ahash_digest);

/*
 * ahash ->digest() of the shash wrapper: bind the request's descriptor to the
 * wrapped shash transform, then run shash_ahash_digest().
 */
static int shash_async_digest(struct ahash_request *req)
{
        struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
        struct crypto_shash **ctx = crypto_ahash_ctx(ahash);
        struct shash_desc *desc = ahash_request_ctx(req);

        desc->tfm = *ctx;
        return shash_ahash_digest(req, desc);
}

/*
 * ahash ->export() of the shash wrapper: export the state of the descriptor
 * stored in the request context to @out.
 */
static int shash_async_export(struct ahash_request *req, void *out)
{
        struct shash_desc *desc = ahash_request_ctx(req);

        return crypto_shash_export(desc, out);
}

/*
 * ahash ->import() of the shash wrapper: bind the request's descriptor to the
 * wrapped shash transform, then load the state from @in.
 */
static int shash_async_import(struct ahash_request *req, const void *in)
{
        struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
        struct crypto_shash **ctx = crypto_ahash_ctx(ahash);
        struct shash_desc *desc = ahash_request_ctx(req);

        desc->tfm = *ctx;
        return crypto_shash_import(desc, in);
}

/*
 * tfm->exit hook set by crypto_init_shash_ops_async(): free the shash
 * transform whose pointer is stored in the tfm context.
 */
static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm)
{
        struct crypto_shash **ctx = crypto_tfm_ctx(tfm);
        struct crypto_shash *shash = *ctx;

        crypto_free_shash(shash);
}

/*
 * Set up @tfm as an ahash front end on top of a shash algorithm.
 *
 * Takes a module reference on the algorithm, creates a shash transform for
 * it, stores that transform's pointer in the tfm context and points the ahash
 * operations at the shash_async_*() adapters.  ->setkey is only set when the
 * algorithm has a real setkey.  The shash transform's CRYPTO_TFM_NEED_KEY
 * state is copied to the ahash, and the request size is set to a shash_desc
 * plus the shash descriptor context.
 *
 * Returns 0 on success, -EAGAIN if the module reference cannot be taken, or
 * the error from crypto_create_tfm() (the module reference is dropped again
 * in that case).
 */
int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
{
        struct crypto_alg *calg = tfm->__crt_alg;
        struct crypto_ahash *ahash = __crypto_ahash_cast(tfm);
        struct crypto_shash **slot = crypto_tfm_ctx(tfm);
        struct crypto_shash *child;

        if (!crypto_mod_get(calg))
                return -EAGAIN;

        child = crypto_create_tfm(calg, &crypto_shash_type);
        if (IS_ERR(child)) {
                crypto_mod_put(calg);
                return PTR_ERR(child);
        }

        *slot = child;
        tfm->exit = crypto_exit_shash_ops_async;

        ahash->init = shash_async_init;
        ahash->update = shash_async_update;
        ahash->final = shash_async_final;
        ahash->finup = shash_async_finup;
        ahash->digest = shash_async_digest;
        ahash->export = shash_async_export;
        ahash->import = shash_async_import;
        if (crypto_shash_alg_has_setkey(__crypto_shash_alg(calg)))
                ahash->setkey = shash_async_setkey;

        ahash->reqsize = sizeof(struct shash_desc) +
                         crypto_shash_descsize(child);

        crypto_ahash_set_flags(ahash, crypto_shash_get_flags(child) &
                                      CRYPTO_TFM_NEED_KEY);

        return 0;
}

static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);

        alg->exit_tfm(hash);
}

/*
 * Initialise a freshly created shash transform.
 *
 * Copies the algorithm's descsize into the transform, sets the need-key flag
 * where applicable, installs the exit hook if the algorithm has ->exit_tfm,
 * and runs the algorithm's optional ->init_tfm().
 *
 * Returns 0 on success, the error from ->init_tfm(), or -EINVAL (after
 * calling ->exit_tfm(), if any) when ->init_tfm() left descsize above
 * HASH_MAX_DESCSIZE.
 */
static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);

        hash->descsize = alg->descsize;
        shash_set_needkey(hash, alg);

        if (alg->exit_tfm)
                tfm->exit = crypto_shash_exit_tfm;

        if (alg->init_tfm) {
                int ret = alg->init_tfm(hash);

                if (ret)
                        return ret;

                /* ->init_tfm() may have increased the descsize. */
                if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) {
                        if (alg->exit_tfm)
                                alg->exit_tfm(hash);
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * crypto_type ->free hook: convert the generic instance to its
 * shash_instance and call that instance's own ->free().
 */
static void crypto_shash_free_instance(struct crypto_instance *inst)
{
        struct shash_instance *si = shash_instance(inst);

        si->free(si);
}

#ifdef CONFIG_NET
/*
 * crypto_type ->report hook: emit a CRYPTOCFGA_REPORT_HASH netlink attribute
 * with the type name "shash", the block size and the digest size of @alg.
 */
static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
{
        struct shash_alg *hash_alg = __crypto_shash_alg(alg);
        struct crypto_report_hash report;

        /* Start from an all-zero struct so every byte sent is initialised. */
        memset(&report, 0, sizeof(report));

        report.digestsize = hash_alg->digestsize;
        report.blocksize = alg->cra_blocksize;
        strscpy(report.type, "shash", sizeof(report.type));

        return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(report), &report);
}
#else
/* Without CONFIG_NET there is nothing to report to; always -ENOSYS. */
static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
{
        return -ENOSYS;
}
#endif

/*
 * crypto_type ->show hook: print the type, block size and digest size of @alg
 * to the seq_file.  Declared __maybe_unused because the hook is only
 * referenced when CONFIG_PROC_FS is enabled.
 */
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct shash_alg *hash_alg = __crypto_shash_alg(alg);
        unsigned int blocksize = alg->cra_blocksize;
        unsigned int digestsize = hash_alg->digestsize;

        seq_printf(m, "type         : shash\n");
        seq_printf(m, "blocksize    : %u\n", blocksize);
        seq_printf(m, "digestsize   : %u\n", digestsize);
}

/*
 * Frontend type descriptor for synchronous hashes.  It ties the shash
 * specific hooks defined in this file to the generic crypto_type interface;
 * shash_prepare_alg() stores its address in cra_type of every shash algorithm.
 */
static const struct crypto_type crypto_shash_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_shash_init_tfm,
        .free = crypto_shash_free_instance,
#ifdef CONFIG_PROC_FS
        /* The show hook is only wired up when procfs support is built in. */
        .show = crypto_shash_show,
#endif
        .report = crypto_shash_report,
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_SHASH,
        /* Offset of the embedded base tfm inside struct crypto_shash. */
        .tfmsize = offsetof(struct crypto_shash, base),
};

/*
 * Grab a shash algorithm by @name for use by instance @inst.
 *
 * Marks the spawn's frontend as the shash type and defers to
 * crypto_grab_spawn(), whose result is returned.
 */
int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        struct crypto_spawn *base = &spawn->base;

        base->frontend = &crypto_shash_type;

        return crypto_grab_spawn(base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_shash);

/*
 * Allocate a synchronous hash transform for @alg_name.
 *
 * Thin wrapper around crypto_alloc_tfm() with the shash frontend type; the
 * return value of that call is passed through unchanged.
 */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask)
{
        struct crypto_shash *tfm;

        tfm = crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);

/*
 * Validate @alg and fill in everything the core needs before registration.
 *
 * Rejects (-EINVAL) algorithms whose digestsize, descsize or statesize exceed
 * the HASH_MAX_* limits, and algorithms that provide only one of
 * ->export/->import.  Then marks the base algorithm as a shash and installs
 * defaults for the optional operations:
 *   ->finup   shash_finup_unaligned
 *   ->digest  shash_digest_unaligned
 *   ->export/->import  the raw-context copies, with statesize = descsize
 *   ->setkey  shash_no_setkey
 *
 * Returns 0 on success.
 */
static int shash_prepare_alg(struct shash_alg *alg)
{
        struct crypto_alg *base = &alg->base;

        if (alg->digestsize > HASH_MAX_DIGESTSIZE)
                return -EINVAL;
        if (alg->descsize > HASH_MAX_DESCSIZE)
                return -EINVAL;
        if (alg->statesize > HASH_MAX_STATESIZE)
                return -EINVAL;

        /* ->export and ->import must be provided together or not at all. */
        if (!alg->export != !alg->import)
                return -EINVAL;

        base->cra_type = &crypto_shash_type;
        base->cra_flags = (base->cra_flags & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_SHASH;

        if (!alg->setkey)
                alg->setkey = shash_no_setkey;
        if (!alg->finup)
                alg->finup = shash_finup_unaligned;
        if (!alg->digest)
                alg->digest = shash_digest_unaligned;
        if (!alg->export) {
                alg->statesize = alg->descsize;
                alg->export = shash_default_export;
                alg->import = shash_default_import;
        }

        return 0;
}

/*
 * Register a single shash algorithm.
 *
 * Runs shash_prepare_alg() first and returns its error if it fails; otherwise
 * returns the result of crypto_register_alg() on the base algorithm.
 */
int crypto_register_shash(struct shash_alg *alg)
{
        int err = shash_prepare_alg(alg);

        if (!err)
                err = crypto_register_alg(&alg->base);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_shash);

/* Unregister a single shash algorithm via its embedded base algorithm. */
void crypto_unregister_shash(struct shash_alg *alg)
{
        struct crypto_alg *base = &alg->base;

        crypto_unregister_alg(base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shash);

/*
 * Register @count shash algorithms from the array @algs, all or nothing.
 *
 * If one registration fails, the algorithms registered before it are
 * unregistered again in reverse order and the error is returned.
 * Returns 0 when every algorithm was registered.
 */
int crypto_register_shashes(struct shash_alg *algs, int count)
{
        int done;

        for (done = 0; done < count; done++) {
                int ret = crypto_register_shash(&algs[done]);

                if (!ret)
                        continue;

                /* Roll back entries done-1 .. 0. */
                while (done-- > 0)
                        crypto_unregister_shash(&algs[done]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_shashes);

/*
 * Unregister @count shash algorithms from the array @algs, walking from the
 * last entry down to the first.
 */
void crypto_unregister_shashes(struct shash_alg *algs, int count)
{
        int remaining = count;

        while (remaining-- > 0)
                crypto_unregister_shash(&algs[remaining]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shashes);

/*
 * Register a shash template instance.
 *
 * The instance must have a ->free callback; a missing one triggers a warning
 * and -EINVAL.  The embedded algorithm is prepared with shash_prepare_alg()
 * and, if that succeeds, the instance is passed to
 * crypto_register_instance().
 */
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = shash_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               shash_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(shash_register_instance);

/*
 * Free an instance whose context is a single spawn: drop the spawn found at
 * shash_instance_ctx(), then free the instance memory.
 */
void shash_free_singlespawn_instance(struct shash_instance *inst)
{
        struct crypto_spawn *spawn = shash_instance_ctx(inst);

        crypto_drop_spawn(spawn);
        kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");





















































































































    4 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/* This file is automatically generated.  Do not edit. */
#ifndef _SELINUX_FLASK_H_
#define _SELINUX_FLASK_H_

/*
 * Security class identifiers.  Values are assigned consecutively, starting at
 * 1 (SECCLASS_SECURITY) and ending at 94 (SECCLASS_LOCKDOWN).
 */
#define SECCLASS_SECURITY                                 1
#define SECCLASS_PROCESS                                  2
#define SECCLASS_PROCESS2                                 3
#define SECCLASS_SYSTEM                                   4
#define SECCLASS_CAPABILITY                               5
#define SECCLASS_FILESYSTEM                               6
#define SECCLASS_FILE                                     7
#define SECCLASS_DIR                                      8
#define SECCLASS_FD                                       9
#define SECCLASS_LNK_FILE                                10
#define SECCLASS_CHR_FILE                                11
#define SECCLASS_BLK_FILE                                12
#define SECCLASS_SOCK_FILE                               13
#define SECCLASS_FIFO_FILE                               14
#define SECCLASS_SOCKET                                  15
#define SECCLASS_TCP_SOCKET                              16
#define SECCLASS_UDP_SOCKET                              17
#define SECCLASS_RAWIP_SOCKET                            18
#define SECCLASS_NODE                                    19
#define SECCLASS_NETIF                                   20
#define SECCLASS_NETLINK_SOCKET                          21
#define SECCLASS_PACKET_SOCKET                           22
#define SECCLASS_KEY_SOCKET                              23
#define SECCLASS_UNIX_STREAM_SOCKET                      24
#define SECCLASS_UNIX_DGRAM_SOCKET                       25
#define SECCLASS_SEM                                     26
#define SECCLASS_MSG                                     27
#define SECCLASS_MSGQ                                    28
#define SECCLASS_SHM                                     29
#define SECCLASS_IPC                                     30
#define SECCLASS_NETLINK_ROUTE_SOCKET                    31
#define SECCLASS_NETLINK_TCPDIAG_SOCKET                  32
#define SECCLASS_NETLINK_NFLOG_SOCKET                    33
#define SECCLASS_NETLINK_XFRM_SOCKET                     34
#define SECCLASS_NETLINK_SELINUX_SOCKET                  35
#define SECCLASS_NETLINK_ISCSI_SOCKET                    36
#define SECCLASS_NETLINK_AUDIT_SOCKET                    37
#define SECCLASS_NETLINK_FIB_LOOKUP_SOCKET               38
#define SECCLASS_NETLINK_CONNECTOR_SOCKET                39
#define SECCLASS_NETLINK_NETFILTER_SOCKET                40
#define SECCLASS_NETLINK_DNRT_SOCKET                     41
#define SECCLASS_ASSOCIATION                             42
#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET           43
#define SECCLASS_NETLINK_GENERIC_SOCKET                  44
#define SECCLASS_NETLINK_SCSITRANSPORT_SOCKET            45
#define SECCLASS_NETLINK_RDMA_SOCKET                     46
#define SECCLASS_NETLINK_CRYPTO_SOCKET                   47
#define SECCLASS_APPLETALK_SOCKET                        48
#define SECCLASS_PACKET                                  49
#define SECCLASS_KEY                                     50
#define SECCLASS_DCCP_SOCKET                             51
#define SECCLASS_MEMPROTECT                              52
#define SECCLASS_PEER                                    53
#define SECCLASS_CAPABILITY2                             54
#define SECCLASS_KERNEL_SERVICE                          55
#define SECCLASS_TUN_SOCKET                              56
#define SECCLASS_BINDER                                  57
#define SECCLASS_CAP_USERNS                              58
#define SECCLASS_CAP2_USERNS                             59
#define SECCLASS_SCTP_SOCKET                             60
#define SECCLASS_ICMP_SOCKET                             61
#define SECCLASS_AX25_SOCKET                             62
#define SECCLASS_IPX_SOCKET                              63
#define SECCLASS_NETROM_SOCKET                           64
#define SECCLASS_ATMPVC_SOCKET                           65
#define SECCLASS_X25_SOCKET                              66
#define SECCLASS_ROSE_SOCKET                             67
#define SECCLASS_DECNET_SOCKET                           68
#define SECCLASS_ATMSVC_SOCKET                           69
#define SECCLASS_RDS_SOCKET                              70
#define SECCLASS_IRDA_SOCKET                             71
#define SECCLASS_PPPOX_SOCKET                            72
#define SECCLASS_LLC_SOCKET                              73
#define SECCLASS_CAN_SOCKET                              74
#define SECCLASS_TIPC_SOCKET                             75
#define SECCLASS_BLUETOOTH_SOCKET                        76
#define SECCLASS_IUCV_SOCKET                             77
#define SECCLASS_RXRPC_SOCKET                            78
#define SECCLASS_ISDN_SOCKET                             79
#define SECCLASS_PHONET_SOCKET                           80
#define SECCLASS_IEEE802154_SOCKET                       81
#define SECCLASS_CAIF_SOCKET                             82
#define SECCLASS_ALG_SOCKET                              83
#define SECCLASS_NFC_SOCKET                              84
#define SECCLASS_VSOCK_SOCKET                            85
#define SECCLASS_KCM_SOCKET                              86
#define SECCLASS_QIPCRTR_SOCKET                          87
#define SECCLASS_SMC_SOCKET                              88
#define SECCLASS_INFINIBAND_PKEY                         89
#define SECCLASS_INFINIBAND_ENDPORT                      90
#define SECCLASS_BPF                                     91
#define SECCLASS_XDP_SOCKET                              92
#define SECCLASS_PERF_EVENT                              93
#define SECCLASS_LOCKDOWN                                94

/*
 * Initial SID identifiers.  The numbering is sparse: several values between
 * 1 and 27 (for example 4, 6 and 7) have no name in this list.
 */
#define SECINITSID_KERNEL                                   1
#define SECINITSID_SECURITY                                 2
#define SECINITSID_UNLABELED                                3
#define SECINITSID_FILE                                     5
#define SECINITSID_ANY_SOCKET                               8
#define SECINITSID_PORT                                     9
#define SECINITSID_NETIF                                   10
#define SECINITSID_NETMSG                                  11
#define SECINITSID_NODE                                    12
#define SECINITSID_DEVNULL                                 27

/* Equals the highest initial SID value defined above (SECINITSID_DEVNULL). */
#define SECINITSID_NUM 27

/*
 * Tell whether @kern_tclass is one of the socket security classes.
 *
 * Returns true for every SECCLASS_*_SOCKET value listed below and false for
 * any other class value.
 */
static inline bool security_is_socket_class(u16 kern_tclass)
{
        switch (kern_tclass) {
        case SECCLASS_SOCKET:
        case SECCLASS_TCP_SOCKET:
        case SECCLASS_UDP_SOCKET:
        case SECCLASS_RAWIP_SOCKET:
        case SECCLASS_NETLINK_SOCKET:
        case SECCLASS_PACKET_SOCKET:
        case SECCLASS_KEY_SOCKET:
        case SECCLASS_UNIX_STREAM_SOCKET:
        case SECCLASS_UNIX_DGRAM_SOCKET:
        case SECCLASS_NETLINK_ROUTE_SOCKET:
        case SECCLASS_NETLINK_TCPDIAG_SOCKET:
        case SECCLASS_NETLINK_NFLOG_SOCKET:
        case SECCLASS_NETLINK_XFRM_SOCKET:
        case SECCLASS_NETLINK_SELINUX_SOCKET:
        case SECCLASS_NETLINK_ISCSI_SOCKET:
        case SECCLASS_NETLINK_AUDIT_SOCKET:
        case SECCLASS_NETLINK_FIB_LOOKUP_SOCKET:
        case SECCLASS_NETLINK_CONNECTOR_SOCKET:
        case SECCLASS_NETLINK_NETFILTER_SOCKET:
        case SECCLASS_NETLINK_DNRT_SOCKET:
        case SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET:
        case SECCLASS_NETLINK_GENERIC_SOCKET:
        case SECCLASS_NETLINK_SCSITRANSPORT_SOCKET:
        case SECCLASS_NETLINK_RDMA_SOCKET:
        case SECCLASS_NETLINK_CRYPTO_SOCKET:
        case SECCLASS_APPLETALK_SOCKET:
        case SECCLASS_DCCP_SOCKET:
        case SECCLASS_TUN_SOCKET:
        case SECCLASS_SCTP_SOCKET:
        case SECCLASS_ICMP_SOCKET:
        case SECCLASS_AX25_SOCKET:
        case SECCLASS_IPX_SOCKET:
        case SECCLASS_NETROM_SOCKET:
        case SECCLASS_ATMPVC_SOCKET:
        case SECCLASS_X25_SOCKET:
        case SECCLASS_ROSE_SOCKET:
        case SECCLASS_DECNET_SOCKET:
        case SECCLASS_ATMSVC_SOCKET:
        case SECCLASS_RDS_SOCKET:
        case SECCLASS_IRDA_SOCKET:
        case SECCLASS_PPPOX_SOCKET:
        case SECCLASS_LLC_SOCKET:
        case SECCLASS_CAN_SOCKET:
        case SECCLASS_TIPC_SOCKET:
        case SECCLASS_BLUETOOTH_SOCKET:
        case SECCLASS_IUCV_SOCKET:
        case SECCLASS_RXRPC_SOCKET:
        case SECCLASS_ISDN_SOCKET:
        case SECCLASS_PHONET_SOCKET:
        case SECCLASS_IEEE802154_SOCKET:
        case SECCLASS_CAIF_SOCKET:
        case SECCLASS_ALG_SOCKET:
        case SECCLASS_NFC_SOCKET:
        case SECCLASS_VSOCK_SOCKET:
        case SECCLASS_KCM_SOCKET:
        case SECCLASS_QIPCRTR_SOCKET:
        case SECCLASS_SMC_SOCKET:
        case SECCLASS_XDP_SOCKET:
                return true;
        default:
                return false;
        }
}

#endif




























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
#ifndef __LINUX_OVERFLOW_H
#define __LINUX_OVERFLOW_H

#include <linux/compiler.h>
#include <linux/limits.h>

/*
 * We need to compute the minimum and maximum values representable in a given
 * type. These macros may also be useful elsewhere. It would seem more obvious
 * to do something like:
 *
 * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
 * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
 *
 * Unfortunately, the middle expressions, strictly speaking, have
 * undefined behaviour, and at least some versions of gcc warn about
 * the type_max expression (but not if -fsanitize=undefined is in
 * effect; in that case, the warning is deferred to runtime...).
 *
 * The slightly excessive casting in type_min is to make sure the
 * macros also produce sensible values for the exotic type _Bool. [The
 * overflow checkers only almost work for _Bool, but that's
 * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
 * _Bools. Besides, the gcc builtins don't allow _Bool* as third
 * argument.]
 *
 * Idea stolen from
 * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
 * credit to Christian Biere.
 */
/*
 * __type_half_max(type): a single 1 bit in the highest non-sign bit position
 * of @type (is_signed_type() lowers the shift by one for signed types).
 */
#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
/* type_max(T): largest value of T, built as (half - 1) + half. */
#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
/* type_min(T): smallest value of T, built as -type_max(T) - 1 with casts to T. */
#define type_min(T) ((T)((T)-type_max(T)-(T)1))

/*
 * Sign tests that can be applied to unsigned operands without triggering a
 * -Wtype-limits compilation warning, which a plain "a < 0" or "a >= 0" on an
 * unsigned type would cause.  Note that both macros evaluate @a more than
 * once.
 */
#define is_non_negative(a) ((a) > 0 || (a) == 0)
#define is_negative(a) (!(is_non_negative(a)))

/*
 * Allows for effectively applying __must_check to a macro so we can have
 * both the type-agnostic benefits of the macros while also being able to
 * enforce that the return value is, in fact, checked.
 *
 * @overflow: the overflow indication produced by the wrapped expression.
 *
 * Returns @overflow unchanged, annotated as unlikely() for branch layout.
 */
static inline bool __must_check __must_check_overflow(bool overflow)
{
        return unlikely(overflow);
}

/**
 * check_add_overflow() - Calculate addition with overflow checking
 *
 * @a: first addend
 * @b: second addend
 * @d: pointer to store sum
 *
 * Returns 0 (false) on success and non-zero (true) if the sum overflowed or
 * was truncated; the result must be checked (see __must_check_overflow()).
 *
 * *@d holds the results of the attempted addition, but is not considered
 * "safe for use" on a non-zero return value, which indicates that the
 * sum has overflowed or been truncated.
 */
#define check_add_overflow(a, b, d)        \
        __must_check_overflow(__builtin_add_overflow(a, b, d))

/**
 * check_sub_overflow() - Calculate subtraction with overflow checking
 *
 * @a: minuend; value to subtract from
 * @b: subtrahend; value to subtract from @a
 * @d: pointer to store difference
 *
 * Returns: false (0) on success, true if the difference underflowed or
 * was truncated when stored in *@d. The result must be checked by the
 * caller (it is passed through __must_check_overflow()).
 *
 * *@d holds the results of the attempted subtraction, but is not considered
 * "safe for use" on a non-zero return value, which indicates that the
 * difference has underflowed or been truncated.
 */
#define check_sub_overflow(a, b, d)        \
        __must_check_overflow(__builtin_sub_overflow(a, b, d))

/**
 * check_mul_overflow() - Calculate multiplication with overflow checking
 *
 * @a: first factor
 * @b: second factor
 * @d: pointer to store product
 *
 * Returns: false (0) on success, true if the product overflowed or was
 * truncated when stored in *@d. The result must be checked by the caller
 * (it is passed through __must_check_overflow()).
 *
 * *@d holds the results of the attempted multiplication, but is not
 * considered "safe for use" on a non-zero return value, which indicates
 * that the product has overflowed or been truncated.
 */
#define check_mul_overflow(a, b, d)        \
        __must_check_overflow(__builtin_mul_overflow(a, b, d))

/**
 * check_shl_overflow() - Calculate a left-shifted value and check overflow
 *
 * @a: Value to be shifted
 * @s: How many bits left to shift
 * @d: Pointer to where to store the result
 *
 * Computes *@d = (@a << @s)
 *
 * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
 * make sense. Example conditions:
 * - 'a << s' causes bits to be lost when stored in *d.
 * - 's' is garbage (e.g. negative) or so large that the result of
 *   'a << s' is guaranteed to be 0.
 * - 'a' is negative.
 * - 'a << s' sets the sign bit, if any, in '*d'.
 *
 * '*d' will hold the results of the attempted shift, but is not
 * considered "safe for use" if true is returned.
 *
 * Each argument is evaluated exactly once, into a local copy. The width
 * limit for @s is taken from the local copy of @d so that a compound
 * pointer expression such as 'p + 1' is measured as '*(p + 1)' rather
 * than as the integer-promoted expression '*p + 1'.
 */
#define check_shl_overflow(a, s, d) __must_check_overflow(({                \
        typeof(a) _a = a;                                                \
        typeof(s) _s = s;                                                \
        typeof(d) _d = d;                                                \
        u64 _a_full = _a;                                                \
        /* An out-of-range shift count is replaced by 0 and flagged below. */ \
        unsigned int _to_shift =                                        \
                is_non_negative(_s) && _s < 8 * sizeof(*_d) ? _s : 0;        \
        *_d = (_a_full << _to_shift);                                        \
        (_to_shift != _s || is_negative(*_d) || is_negative(_a) ||        \
        (*_d >> _to_shift) != _a);                                        \
}))

/**
 * size_mul() - Multiply two size_t values, saturating at SIZE_MAX
 *
 * @factor1: first factor
 * @factor2: second factor
 *
 * Returns: @factor1 * @factor2, with both arguments converted to size_t
 * by the prototype, or SIZE_MAX if the product overflows. Store the
 * result in a size_t lvalue so that no implicit type conversion undoes
 * the saturation.
 */
static inline size_t __must_check size_mul(size_t factor1, size_t factor2)
{
        size_t product;

        return check_mul_overflow(factor1, factor2, &product) ?
                SIZE_MAX : product;
}

/**
 * size_add() - Add two size_t values, saturating at SIZE_MAX
 *
 * @addend1: first addend
 * @addend2: second addend
 *
 * Returns: @addend1 + @addend2, with both arguments converted to size_t
 * by the prototype, or SIZE_MAX if the sum overflows. Store the result
 * in a size_t lvalue so that no implicit type conversion undoes the
 * saturation.
 */
static inline size_t __must_check size_add(size_t addend1, size_t addend2)
{
        size_t sum;

        return check_add_overflow(addend1, addend2, &sum) ?
                SIZE_MAX : sum;
}

/**
 * size_sub() - Subtract two size_t values, saturating at SIZE_MAX
 *
 * @minuend: value to subtract from
 * @subtrahend: value to subtract from @minuend
 *
 * Returns: @minuend - @subtrahend, with both arguments converted to
 * size_t by the prototype, or SIZE_MAX if the subtraction would
 * underflow. So that the helper composes with size_add() and size_mul(),
 * an argument that is already SIZE_MAX (i.e. an earlier saturated
 * result) also forces the result to SIZE_MAX. Store the result in a
 * size_t lvalue to avoid implicit type conversion.
 */
static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
{
        size_t difference;

        /* A saturated input stays saturated. */
        if (minuend == SIZE_MAX)
                return SIZE_MAX;
        if (subtrahend == SIZE_MAX)
                return SIZE_MAX;

        if (check_sub_overflow(minuend, subtrahend, &difference))
                return SIZE_MAX;

        return difference;
}

/**
 * array_size() - Calculate size of 2-dimensional array.
 *
 * @a: dimension one
 * @b: dimension two
 *
 * Calculates size of 2-dimensional array: @a * @b. This expands to a
 * single size_mul() call, so both arguments are converted to size_t and
 * the multiplication saturates instead of wrapping.
 *
 * Returns: number of bytes needed to represent the array or SIZE_MAX on
 * overflow.
 */
#define array_size(a, b)        size_mul(a, b)

/**
 * array3_size() - Calculate size of 3-dimensional array.
 *
 * @a: dimension one
 * @b: dimension two
 * @c: dimension three
 *
 * Calculates size of 3-dimensional array: @a * @b * @c, as two nested
 * size_mul() calls. If @a * @b saturates to SIZE_MAX, the outer
 * multiplication keeps the result at SIZE_MAX for any non-zero @c.
 *
 * Returns: number of bytes needed to represent the array or SIZE_MAX on
 * overflow.
 */
#define array3_size(a, b, c)        size_mul(size_mul(a, b), c)

/**
 * flex_array_size() - Calculate size of a flexible array member
 *                     within an enclosing structure.
 *
 * @p: Pointer to the structure.
 * @member: Name of the flexible array member.
 * @count: Number of elements in the array.
 *
 * Calculates size of a flexible array of @count number of @member
 * elements, at the end of structure @p. The element size is taken from
 * sizeof(*(p)->member); the multiplication is done by size_mul() and
 * therefore saturates.
 *
 * NOTE(review): __must_be_array() is defined outside this excerpt; it is
 * presumably a compile-time check that contributes 0 to the sum - confirm.
 *
 * Return: number of bytes needed or SIZE_MAX on overflow.
 */
#define flex_array_size(p, member, count)                                \
        size_mul(count,                                                        \
                 sizeof(*(p)->member) + __must_be_array((p)->member))

/**
 * struct_size() - Calculate size of structure with trailing flexible array.
 *
 * @p: Pointer to the structure.
 * @member: Name of the array member.
 * @count: Number of elements in the array.
 *
 * Calculates size of memory needed for structure @p followed by an
 * array of @count number of @member elements: sizeof(*(p)) plus
 * flex_array_size(p, member, count), combined with size_add() so that
 * both the multiplication and the addition saturate.
 *
 * Return: number of bytes needed or SIZE_MAX on overflow.
 */
#define struct_size(p, member, count)                                        \
        size_add(sizeof(*(p)), flex_array_size(p, member, count))

#endif /* __LINUX_OVERFLOW_H */























































































    1 




































    1 



    1 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 


    1 
    1 



    1 
    1 






    1 

    1 






    1 
    1 



    1 
    1 
    1 














    1 



    1 













    1 



    1 



    1 
    1 



    1 




































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002        Andrew Morton
 *                Split out of fs/inode.c
 *                Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size, expressed in pages: 4096 is the size in
 * KB, and shifting right by (PAGE_SHIFT - 10) divides it by the page
 * size in KB.
 */
#define MIN_WRITEBACK_PAGES        (4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 *
 * A work item is queued on a bdi_writeback's work_list by wb_queue_work()
 * and completed through finish_writeback_work(), which frees it when
 * @auto_free is set and drops the count of @done when that is non-NULL.
 */
struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
        enum writeback_sync_modes sync_mode;
        unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;        /* kfree() the work on completion */
        enum wb_reason reason;                /* why was writeback initiated? */

        struct list_head list;                /* link in wb->work_list (pending work) */
        struct wb_completion *done;        /* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated on disk for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

/* Map an i_io_list link back to the inode that embeds it. */
static inline struct inode *wb_inode(struct list_head *head)
{
        struct inode *inode = list_entry(head, struct inode, i_io_list);

        return inode;
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

/*
 * Note that @wb now has dirty IO.
 *
 * If wb_has_dirty_io() already reports true nothing is changed and false
 * is returned.  Otherwise WB_has_dirty_io is set in @wb->state, @wb's
 * avg_write_bandwidth is added to its bdi's tot_write_bandwidth, and
 * true is returned.
 */
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
        if (wb_has_dirty_io(wb))
                return false;

        set_bit(WB_has_dirty_io, &wb->state);
        /* a zero average bandwidth is not expected at this point */
        WARN_ON_ONCE(!wb->avg_write_bandwidth);
        atomic_long_add(wb->avg_write_bandwidth,
                        &wb->bdi->tot_write_bandwidth);

        return true;
}

/*
 * Counterpart of wb_io_lists_populated().
 *
 * When @wb is marked as having dirty IO but its b_dirty, b_io and
 * b_more_io lists are all empty, clear WB_has_dirty_io and subtract
 * @wb's avg_write_bandwidth from its bdi's tot_write_bandwidth, warning
 * once if the total drops below zero.  Otherwise do nothing.
 */
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
        long total;

        if (!wb_has_dirty_io(wb))
                return;

        if (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
            !list_empty(&wb->b_more_io))
                return;

        clear_bit(WB_has_dirty_io, &wb->state);
        total = atomic_long_sub_return(wb->avg_write_bandwidth,
                                       &wb->bdi->tot_write_bandwidth);
        WARN_ON_ONCE(total < 0);
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb.  The caller must hold
 * @wb->list_lock.  Unless @head is the dirty_time list, %WB_has_dirty_io
 * is set on @wb.
 *
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
                                      struct bdi_writeback *wb,
                                      struct list_head *head)
{
        assert_spin_locked(&wb->list_lock);

        list_move(&inode->i_io_list, head);

        if (head == &wb->b_dirty_time) {
                /* dirty_time doesn't count as dirty_io until expiration */
                wb_io_lists_depopulated(wb);
                return false;
        }

        return wb_io_lists_populated(wb);
}

/**
 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 * @inode: inode to be removed
 * @wb: bdi_writeback @inode is being removed from
 *
 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 * clear %WB_has_dirty_io if all are empty afterwards.  %I_SYNC_QUEUED is
 * cleared from @inode->i_state as well.
 *
 * The caller must hold both @wb->list_lock and @inode->i_lock.
 */
static void inode_io_list_del_locked(struct inode *inode,
                                     struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);

        inode->i_state &= ~I_SYNC_QUEUED;
        /* leaves i_io_list as an empty, self-linked list head */
        list_del_init(&inode->i_io_list);
        wb_io_lists_depopulated(wb);
}

/*
 * Schedule @wb's delayed work (@wb->dwork) on bdi_wq with a delay of 0,
 * but only while @wb has WB_registered set.  The test and the scheduling
 * are both done under @wb->work_lock.
 */
static void wb_wakeup(struct bdi_writeback *wb)
{
        spin_lock_bh(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        spin_unlock_bh(&wb->work_lock);
}

/*
 * Complete @work: free it if it is auto_free, then drop one count from
 * its wb_completion (if any) and wake all waiters when that count
 * reaches zero.  @wb is not used.
 */
static void finish_writeback_work(struct bdi_writeback *wb,
                                  struct wb_writeback_work *work)
{
        /* fetch ->done up front, @work may be freed right below */
        struct wb_completion *done = work->done;
        wait_queue_head_t *waitq;

        if (work->auto_free)
                kfree(work);

        if (!done)
                return;

        /* @done can't be accessed after the following dec */
        waitq = done->waitq;
        if (atomic_dec_and_test(&done->cnt))
                wake_up_all(waitq);
}

/*
 * Queue @work on @wb and kick @wb's delayed work to run immediately.
 *
 * If @work has a completion attached, its count is raised first.  When
 * @wb does not have WB_registered set, nothing is queued and the work is
 * completed on the spot through finish_writeback_work().  The check and
 * the queueing are done under @wb->work_lock.
 */
static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
{
        trace_writeback_queue(wb, work);

        if (work->done)
                atomic_inc(&work->done->cnt);

        spin_lock_bh(&wb->work_lock);

        if (!test_bit(WB_registered, &wb->state)) {
                finish_writeback_work(wb, work);
        } else {
                list_add_tail(&work->list, &wb->work_list);
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        }

        spin_unlock_bh(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
        atomic_dec(&done->cnt);                /* put down the initial count */
        /* sleep on @done's wait queue until the count has dropped to zero */
        wait_event(*done->waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from the
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT        13        /* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT        3        /* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV        8        /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD        (2 * (1 << WB_FRN_TIME_SHIFT))        /* 2s */

#define WB_FRN_HIST_SLOTS        16        /* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
                                        /* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS        (WB_FRN_HIST_SLOTS / 2)
                                        /* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS        (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT        1024        /* don't queue too many concurrently */

/*
 * NOTE(review): the users of these two objects are outside this excerpt.
 * Going by the names and by WB_FRN_MAX_IN_FLIGHT, isw_nr_in_flight
 * presumably counts inode wb switches that are currently in flight and
 * isw_wq is presumably the workqueue that runs them - confirm against
 * the inode_switch_wbs_*() code.
 */
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct page *page)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;

        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *memcg_css;

                if (page) {
                        memcg_css = mem_cgroup_css_from_page(page);
                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                } else {
                        /* must pin memcg_css, see wb_get_create() */
                        memcg_css = task_get_css(current, memory_cgrp_id);
                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                        css_put(memcg_css);
                }
        }

        if (!wb)
                wb = &bdi->wb;

        /*
         * There may be multiple instances of this function racing to
         * update the same inode.  Use cmpxchg() to tell the winner.
         */
        if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
                wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 *
 * The lookup is retried until the wb that was observed under i_lock is
 * still @inode->i_wb after that wb's list_lock has been acquired.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        while (true) {
                struct bdi_writeback *wb = inode_to_wb(inode);

                /*
                 * inode_to_wb() association is protected by both
                 * @inode->i_lock and @wb->list_lock but list_lock nests
                 * outside i_lock.  Drop i_lock and verify that the
                 * association hasn't changed after acquiring list_lock.
                 */
                wb_get(wb);        /* temporary ref, dropped by a wb_put() below */
                spin_unlock(&inode->i_lock);
                spin_lock(&wb->list_lock);

                /* i_wb may have changed inbetween, can't use inode_to_wb() */
                if (likely(wb == inode->i_wb)) {
                        wb_put(wb);        /* @inode already has ref */
                        return wb;
                }

                /* association changed: undo everything and start over */
                spin_unlock(&wb->list_lock);
                wb_put(wb);
                cpu_relax();
                spin_lock(&inode->i_lock);
        }
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Variant of locked_inode_to_wb_and_lock_list() for callers which do not
 * hold @inode->i_lock.  The i_lock is taken here and released by the
 * locked variant, so on return only the wb's list_lock is held.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *locked_wb;

        spin_lock(&inode->i_lock);
        locked_wb = locked_inode_to_wb_and_lock_list(inode);

        return locked_wb;
}

/*
 * State of one asynchronous wb switch.  Allocated in inode_switch_wbs(),
 * passed through an RCU callback to a work item and freed at the end of
 * inode_switch_wbs_work_fn().
 */
struct inode_switch_wbs_context {
        /* inode being switched; pinned with __iget() when the switch starts */
        struct inode                *inode;
        /* destination wb obtained from wb_get_create() */
        struct bdi_writeback        *new_wb;

        /* used by call_rcu() to run inode_switch_wbs_rcu_fn() */
        struct rcu_head                rcu_head;
        /* work item queued on isw_wq from the RCU callback */
        struct work_struct        work;
};

/*
 * Take @bdi->wb_switch_rwsem for writing.  inode_switch_wbs_work_fn() holds
 * the same rwsem for reading while it switches an inode, so holding it for
 * writing excludes wb switches on @bdi.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        down_write(&bdi->wb_switch_rwsem);
}

/* Release the write side of @bdi->wb_switch_rwsem. */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        up_write(&bdi->wb_switch_rwsem);
}

/*
 * Work item performing the actual wb switch described by the enclosing
 * inode_switch_wbs_context.
 *
 * Moves the WB_RECLAIMABLE / WB_WRITEBACK stat contributions of the inode's
 * pages and the inode's IO list membership from its current wb to
 * isw->new_wb, resets the foreign-detection state and clears I_WB_SWITCH.
 * If the inode is already I_FREEING the switch is skipped and only
 * I_WB_SWITCH is cleared.  In both cases the inode reference, the context
 * and the in-flight count taken by inode_switch_wbs() are released.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
        struct inode_switch_wbs_context *isw =
                container_of(work, struct inode_switch_wbs_context, work);
        struct inode *inode = isw->inode;
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct address_space *mapping = inode->i_mapping;
        struct bdi_writeback *old_wb = inode->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
        XA_STATE(xas, &mapping->i_pages, 0);
        struct page *page;
        bool switched = false;

        /*
         * If @inode switches cgwb membership while sync_inodes_sb() is
         * being issued, sync_inodes_sb() might miss it.  Synchronize.
         */
        down_read(&bdi->wb_switch_rwsem);

        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
         * synchronizing against the i_pages lock.
         *
         * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
         * gives us exclusion against all wb related operations on @inode
         * including IO list manipulations and stat updates.
         *
         * The two list_locks are always taken in ascending address order
         * so that concurrent switches in opposite directions agree on the
         * locking order.
         */
        if (old_wb < new_wb) {
                spin_lock(&old_wb->list_lock);
                spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(&new_wb->list_lock);
                spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
        }
        spin_lock(&inode->i_lock);
        xa_lock_irq(&mapping->i_pages);

        /*
         * Once I_FREEING is visible under i_lock, the eviction path owns
         * the inode and we shouldn't modify ->i_io_list.
         */
        if (unlikely(inode->i_state & I_FREEING))
                goto skip_switch;

        trace_inode_switch_wbs(inode, old_wb, new_wb);

        /*
         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
         * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
         * pages actually under writeback.
         */
        xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
                if (PageDirty(page)) {
                        dec_wb_stat(old_wb, WB_RECLAIMABLE);
                        inc_wb_stat(new_wb, WB_RECLAIMABLE);
                }
        }

        /* rewind the cursor for the second pass over the mapping */
        xas_set(&xas, 0);
        xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
                WARN_ON_ONCE(!PageWriteback(page));
                dec_wb_stat(old_wb, WB_WRITEBACK);
                inc_wb_stat(new_wb, WB_WRITEBACK);
        }

        /* this reference is the one that goes with the new inode->i_wb */
        wb_get(new_wb);

        /*
         * Transfer to @new_wb's IO list if necessary.  The specific list
         * @inode was on is ignored and the inode is put on ->b_dirty which
         * is always correct including from ->b_dirty_time.  The transfer
         * preserves @inode->dirtied_when ordering.
         */
        if (!list_empty(&inode->i_io_list)) {
                struct inode *pos;

                inode_io_list_del_locked(inode, old_wb);
                inode->i_wb = new_wb;
                list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
                        if (time_after_eq(inode->dirtied_when,
                                          pos->dirtied_when))
                                break;
                inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
        } else {
                inode->i_wb = new_wb;
        }

        /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
        switched = true;
skip_switch:
        /*
         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
         * ensures that the new wb is visible if they see !I_WB_SWITCH.
         */
        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&inode->i_lock);
        spin_unlock(&new_wb->list_lock);
        spin_unlock(&old_wb->list_lock);

        up_read(&bdi->wb_switch_rwsem);

        /*
         * On a successful switch, kick the new wb and drop the reference
         * that went with the old inode->i_wb.  The unconditional put of
         * @new_wb balances the reference obtained when the switch was set
         * up in inode_switch_wbs().
         */
        if (switched) {
                wb_wakeup(new_wb);
                wb_put(old_wb);
        }
        wb_put(new_wb);

        /* pairs with __iget() in inode_switch_wbs() */
        iput(inode);
        kfree(isw);

        /* pairs with atomic_inc() in inode_switch_wbs() */
        atomic_dec(&isw_nr_in_flight);
}

/*
 * RCU callback of a wb switch, invoked via call_rcu() from
 * inode_switch_wbs().  It only forwards the context to isw_wq; the switch
 * itself runs in inode_switch_wbs_work_fn().
 */
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
{
        struct inode_switch_wbs_context *ctx;

        ctx = container_of(rcu_head, struct inode_switch_wbs_context,
                           rcu_head);

        /* the switch takes bh-unsafe locks, so it has to run from a work item */
        INIT_WORK(&ctx->work, inode_switch_wbs_work_fn);
        queue_work(isw_wq, &ctx->work);
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 *
 * Nothing is done if a switch already seems to be in progress, if more
 * than WB_FRN_MAX_IN_FLIGHT switches are in flight, if an allocation or
 * the css / wb lookup fails, if the superblock is not SB_ACTIVE, if the
 * inode is being freed, or if @inode already belongs to the target wb.
 * Otherwise I_WB_SWITCH is set, the inode is pinned and the rest of the
 * switch is deferred to an RCU callback followed by a work item.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct cgroup_subsys_state *memcg_css;
        struct inode_switch_wbs_context *isw;

        /* noop if seems to be already in progress */
        if (inode->i_state & I_WB_SWITCH)
                return;

        /* avoid queueing a new switch if too many are already in flight */
        if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
                return;

        /* zeroed so that ->new_wb is NULL if we bail out before setting it */
        isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
        if (!isw)
                return;

        /* undone at out_free or at the end of inode_switch_wbs_work_fn() */
        atomic_inc(&isw_nr_in_flight);

        /* find and pin the new wb */
        rcu_read_lock();
        memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
        if (memcg_css && !css_tryget(memcg_css))
                memcg_css = NULL;
        rcu_read_unlock();
        if (!memcg_css)
                goto out_free;

        isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
        css_put(memcg_css);
        if (!isw->new_wb)
                goto out_free;

        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
        if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
            inode->i_state & (I_WB_SWITCH | I_FREEING) ||
            inode_to_wb(inode) == isw->new_wb) {
                spin_unlock(&inode->i_lock);
                goto out_free;
        }
        inode->i_state |= I_WB_SWITCH;
        /* released with iput() once the work item is done with the inode */
        __iget(inode);
        spin_unlock(&inode->i_lock);

        isw->inode = inode;

        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the i_page
         * lock so that stat transfer can synchronize against them.
         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
         */
        call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
        return;

out_free:
        /* abort: undo the in-flight accounting and drop whatever was acquired */
        atomic_dec(&isw_nr_in_flight);
        if (isw->new_wb)
                wb_put(isw->new_wb);
        kfree(isw);
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
{
        if (!inode_cgwb_enabled(inode)) {
                spin_unlock(&inode->i_lock);
                return;
        }

        wbc->wb = inode_to_wb(inode);
        wbc->inode = inode;

        wbc->wb_id = wbc->wb->memcg_css->id;
        wbc->wb_lcand_id = inode->i_wb_frn_winner;
        wbc->wb_tcand_id = 0;
        wbc->wb_bytes = 0;
        wbc->wb_lcand_bytes = 0;
        wbc->wb_tcand_bytes = 0;

        wb_get(wbc->wb);
        spin_unlock(&inode->i_lock);

        /*
         * A dying wb indicates that either the blkcg associated with the
         * memcg changed or the associated memcg is dying.  In the first
         * case, a replacement wb should already be available and we should
         * refresh the wb immediately.  In the second case, trying to
         * refresh will keep failing.
         */
        if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
                inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 *
 * Does nothing if @wbc has no wb attached.  On return @wbc->wb is NULL and
 * the wb reference taken at attach time has been dropped.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
        unsigned long avg_time, max_bytes, max_time;
        u16 history;
        int max_id;

        if (!wb)
                return;

        /* work on local copies; they are written back once at the end */
        history = inode->i_wb_frn_history;
        avg_time = inode->i_wb_frn_avg_time;

        /* pick the winner of this round; ties favor the current wb, then lcand */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_id;
                max_bytes = wbc->wb_bytes;
        } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_lcand_id;
                max_bytes = wbc->wb_lcand_bytes;
        } else {
                max_id = wbc->wb_tcand_id;
                max_bytes = wbc->wb_tcand_bytes;
        }

        /*
         * Calculate the amount of IO time the winner consumed and fold it
         * into the running average kept per inode.  If the consumed IO
         * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
         * deciding whether to switch or not.  This is to prevent one-off
         * small dirtiers from skewing the verdict.
         */
        max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
                                wb->avg_write_bandwidth);
        if (avg_time)
                avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
                            (avg_time >> WB_FRN_TIME_AVG_SHIFT);
        else
                avg_time = max_time;        /* immediate catch up on first run */

        if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
                int slots;

                /*
                 * The switch verdict is reached if foreign wb's consume
                 * more than a certain proportion of IO time in a
                 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
                 * history mask where each bit represents one sixteenth of
                 * the period.  Determine the number of slots to shift into
                 * history from @max_time.
                 *
                 * Shifted-in slots are set only when this round's winner
                 * is foreign; @history is a u16, so older slots fall off
                 * the top.
                 */
                slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                            (unsigned long)WB_FRN_HIST_MAX_SLOTS);
                history <<= slots;
                if (wbc->wb_id != max_id)
                        history |= (1U << slots) - 1;

                if (history)
                        trace_inode_foreign_history(inode, wbc, history);

                /*
                 * Switch if the current wb isn't the consistent winner.
                 * If there are multiple closely competing dirtiers, the
                 * inode may switch across them repeatedly over time, which
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
                if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }

        /*
         * Multiple instances of this function may race to update the
         * following fields but we don't mind occasional inaccuracies.
         * The average is clamped to U16_MAX before being stored.
         */
        inode->i_wb_frn_winner = max_id;
        inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
        inode->i_wb_frn_history = history;

        /* pairs with wb_get() in wbc_attach_and_unlock_inode() */
        wb_put(wbc->wb);
        wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Credit them to the memcg owning @page so that
 * wbc_detach_inode() can later pick this round's winner.
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes)
{
        struct cgroup_subsys_state *owner;
        int owner_id;

        /*
         * The pageout() path deliberately leaves @wbc unattached so that
         * it never blocks behind a slow cgroup; the long-term intent is
         * for pageout() to kick off regular writeback rather than write
         * pages out itself.  Nothing to account in that case, nor when
         * the caller opted out via ->no_cgroup_owner.
         */
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;

        owner = mem_cgroup_css_from_page(page);

        /* cgroups which are no longer online don't take part in the arbitration */
        if (!(owner->flags & CSS_ONLINE))
                return;

        owner_id = owner->id;

        /* bytes dirtied by the wb the inode currently belongs to */
        if (owner_id == wbc->wb_id) {
                wbc->wb_bytes += bytes;
                return;
        }

        /* bytes dirtied by the last round's winner */
        if (owner_id == wbc->wb_lcand_id)
                wbc->wb_lcand_bytes += bytes;

        /*
         * Boyer-Moore majority vote: adopt a new candidate whenever the
         * counter is at zero, add matching votes and subtract mismatching
         * ones without letting the counter go below zero.
         */
        if (!wbc->wb_tcand_bytes)
                wbc->wb_tcand_id = owner_id;

        if (owner_id == wbc->wb_tcand_id)
                wbc->wb_tcand_bytes += bytes;
        else if (bytes < wbc->wb_tcand_bytes)
                wbc->wb_tcand_bytes -= bytes;
        else
                wbc->wb_tcand_bytes = 0;
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * inode_congested - test whether an inode is congested
 * @inode: inode to test for congestion (may be NULL)
 * @cong_bits: mask of WB_[a]sync_congested bits to test
 *
 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 * bits to test and the return value is the mask of set bits.
 *
 * If cgroup writeback is enabled for @inode, the congestion state is
 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 * associated with @inode is congested; otherwise, the root wb's congestion
 * state is used.
 *
 * @inode is allowed to be NULL as this function is often called on
 * mapping->host which is NULL for the swapper space.
 *
 * Return: whatever wb_congested() reports for the selected wb, on both the
 * cgwb and the root-wb path.
 */
int inode_congested(struct inode *inode, int cong_bits)
{
        /*
         * Once set, ->i_wb never becomes NULL while the inode is alive.
         * Start transaction iff ->i_wb is visible.
         */
        if (inode && inode_to_wb_is_valid(inode)) {
                struct bdi_writeback *wb;
                struct wb_lock_cookie lock_cookie = {};
                /*
                 * Keep the result as an int: storing it in a bool would
                 * collapse the mask to 0/1 and make this path disagree
                 * with the root-wb path below and with the documented
                 * return value.
                 */
                int congested;

                wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
                congested = wb_congested(wb, cong_bits);
                unlocked_inode_to_wb_end(inode, &lock_cookie);
                return congested;
        }

        /* no inode or no wb attached yet: fall back to the bdi's root wb */
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
EXPORT_SYMBOL_GPL(inode_congested);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Returns @wb's share of @nr_pages, proportional to @wb's average write
 * bandwidth relative to the total write bandwidth recorded on @wb->bdi and
 * rounded up.  %LONG_MAX is passed through unchanged.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        unsigned long wb_bw = wb->avg_write_bandwidth;
        unsigned long bdi_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

        /*
         * %LONG_MAX is never scaled.  Beyond that, this may be called on
         * clean wb's where a proportional split makes no sense (no total
         * bandwidth, or a share of at least the whole); hand back the
         * full amount then, as we'd rather err on the side of writing
         * more.
         */
        if (nr_pages == LONG_MAX || !bdi_bw || wb_bw >= bdi_bw)
                return nr_pages;

        return DIV_ROUND_UP_ULL((u64)nr_pages * wb_bw, bdi_bw);
}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 *
 * Each wb normally gets a heap-allocated, auto-freed copy of @base_work.
 * If that allocation fails, an on-stack copy is queued instead and waited
 * for synchronously before the walk continues.  May sleep.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        struct bdi_writeback *last_wb = NULL;
        /*
         * Cursor for the _continue iterator below.  It starts out as the
         * pseudo-entry containing the list head so that the first
         * iteration lands on the first real wb.
         */
        struct bdi_writeback *wb = list_entry(&bdi->wb_list,
                                              struct bdi_writeback, bdi_node);

        might_sleep();
restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;

                /* drop the pin taken for the previous synchronous round */
                if (last_wb) {
                        wb_put(last_wb);
                        last_wb = NULL;
                }

                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
                     list_empty(&wb->b_dirty_time)))
                        continue;
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;

                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

                /* common case: queue an auto-freed private copy and move on */
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
                if (work) {
                        *work = *base_work;
                        work->nr_pages = nr_pages;
                        work->auto_free = 1;
                        wb_queue_work(wb, work);
                        continue;
                }

                /*
                 * If wb_tryget fails, the wb has been shutdown, skip it.
                 *
                 * Pin @wb so that it stays on @bdi->wb_list.  This allows
                 * continuing iteration from @wb after dropping and
                 * regrabbing rcu read lock.
                 */
                if (!wb_tryget(wb))
                        continue;

                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
                work->nr_pages = nr_pages;
                work->auto_free = 0;
                work->done = &fallback_work_done;

                wb_queue_work(wb, work);
                last_wb = wb;

                /* can't sleep under RCU; wait outside and resume after @wb */
                rcu_read_unlock();
                wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();

        /* the walk ended right after a synchronous round */
        if (last_wb)
                wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @nr: number of pages to write, 0 for best-effort dirty flushing
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Queue a %WB_SYNC_NONE, range-cyclic writeback work on the bdi_writeback
 * identified by @bdi_id and @memcg_id.
 *
 * Returns 0 once the work has been queued, -ENOENT if the bdi, the memcg
 * or the wb can't be found, and -ENOMEM if the work item can't be
 * allocated.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
                           enum wb_reason reason, struct wb_completion *done)
{
        struct cgroup_subsys_state *css;
        struct wb_writeback_work *work;
        struct backing_dev_info *bdi;
        struct bdi_writeback *wb;
        int ret = -ENOENT;

        /* resolve the bdi first, then the memcg css */
        bdi = bdi_get_by_id(bdi_id);
        if (!bdi)
                return -ENOENT;

        rcu_read_lock();
        css = css_from_id(memcg_id, &memory_cgrp_subsys);
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();
        if (!css)
                goto out_bdi_put;

        /*
         * Only look the wb up, never create it: without an existing wb
         * there can't be anything to flush.
         */
        wb = wb_get_lookup(bdi, css);
        if (!wb)
                goto out_css_put;

        /*
         * A zero @nr means "write out most of what is dirty right now".
         * Use the current dirty page count inflated by 25%, which should
         * cover most dirty pages without being livelocked by concurrent
         * dirtiers.
         */
        if (!nr) {
                unsigned long filepages, headroom, dirty, writeback;

                mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
                                      &writeback);
                nr = dirty * 10 / 8;
        }

        /* issue the writeback work; it is auto-freed after being queued */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (!work) {
                ret = -ENOMEM;
                goto out_wb_put;
        }

        work->nr_pages = nr;
        work->sync_mode = WB_SYNC_NONE;
        work->range_cyclic = 1;
        work->reason = reason;
        work->done = done;
        work->auto_free = 1;
        wb_queue_work(wb, work);
        ret = 0;

out_wb_put:
        wb_put(wb);
out_css_put:
        css_put(css);
out_bdi_put:
        bdi_put(bdi);
        return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * an RCU callback and then a work item on isw_wq, so both stages need to
 * be flushed, in that order, to ensure that all previously scheduled
 * switches are finished.  As wb switches are rare occurrences and
 * rcu_barrier() can take a while, the flush is performed iff wb switches
 * are in flight.
 */
void cgroup_writeback_umount(void)
{
        /* nothing in flight, nothing to wait for */
        if (!atomic_read(&isw_nr_in_flight))
                return;

        /*
         * Wait for all pending RCU callbacks first so that every in-flight
         * wb switch has reached the workqueue, then drain the workqueue.
         */
        rcu_barrier();
        flush_workqueue(isw_wq);
}

/*
 * Allocate the workqueue that inode wb switches run on.
 * Returns 0 on success or -ENOMEM if the workqueue can't be allocated.
 */
static int __init cgroup_writeback_init(void)
{
        isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);

        return isw_wq ? 0 : -ENOMEM;
}
fs_initcall(cgroup_writeback_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK: empty stub, there is nothing to lock. */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
/* !CONFIG_CGROUP_WRITEBACK: empty stub, there is nothing to unlock. */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

/*
 * !CONFIG_CGROUP_WRITEBACK variant: look up @inode's wb, release
 * @inode->i_lock (held on entry) and return the wb with its list_lock
 * held.  Unlike the cgroup writeback variant, no recheck or retry is done
 * after the locks are swapped.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_unlock(&inode->i_lock);
        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: return @inode's wb with its list_lock
 * held.  @inode->i_lock is neither required nor taken here.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: no splitting is done, @nr_pages is
 * returned unchanged and @wb is unused.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        return nr_pages;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: only @bdi's embedded wb is considered,
 * so @base_work itself is queued on it with auto_free cleared.  Nothing is
 * queued when @skip_if_busy is set and that wb already has writeback in
 * progress.  May sleep.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        might_sleep();

        if (skip_if_busy && writeback_in_progress(&bdi->wb))
                return;

        /* the caller's work item is queued directly, so it must not be auto-freed */
        base_work->auto_free = 0;
        wb_queue_work(&bdi->wb, base_work);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Global count of dirty file pages, plus the number of potentially dirty
 * inodes: writing an inode can itself dirty pagecache in the underlying
 * blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
        unsigned long nr_dirty = global_node_page_state(NR_FILE_DIRTY);

        nr_dirty += get_nr_dirty_inodes();

        return nr_dirty;
}

/*
 * Ask @wb to start writing back all of its dirty pages for @reason.
 * Nothing happens if @wb has no dirty IO or if such a request is already
 * pending.
 */
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
        if (!wb_has_dirty_io(wb))
                return;

        /*
         * Every caller wants writeback of all dirty pages.  Callers such
         * as vmscan may get here at a very high rate, which would mean
         * pointless allocation of tons of work items and flusher threads
         * kept busy retrieving them.  WB_start_all therefore allows only
         * one such request to be pending and in flight at a time.
         */
        if (test_bit(WB_start_all, &wb->state))
                return;         /* cheap non-atomic check first */
        if (test_and_set_bit(WB_start_all, &wb->state))
                return;         /* lost the race to another caller */

        wb->start_all_reason = reason;
        wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
        /*
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(wb);
        wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 *
 * Looks up and locks the list_lock of @inode's wb, then takes
 * @inode->i_lock around the actual removal.  Neither lock may be held by
 * the caller, as both are acquired here.
 */
void inode_io_list_del(struct inode *inode)
{
        struct bdi_writeback *wb;

        /* list_lock nests outside i_lock, so it is taken first */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        inode_io_list_del_locked(inode, wb);
        spin_unlock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * Mark an inode as under writeback on the sb: put it on the superblock's
 * s_inodes_wb list unless it is on a list already.
 */
void sb_mark_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long irq_flags;

        /* unlocked fast path: already listed, nothing to do */
        if (!list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, irq_flags);
        /* recheck under the lock, someone may have listed it meanwhile */
        if (list_empty(&inode->i_wb_list)) {
                list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
                trace_sb_mark_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, irq_flags);
}

/*
 * Clear an inode as under writeback on the sb: take it off the
 * superblock's writeback inode list if it is on one.
 */
void sb_clear_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long irq_flags;

        /* unlocked fast path: not listed, nothing to do */
        if (list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, irq_flags);
        /* recheck under the lock, someone may have removed it meanwhile */
        if (!list_empty(&inode->i_wb_list)) {
                list_del_init(&inode->i_wb_list);
                trace_sb_clear_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, irq_flags);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&inode->i_lock);

        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;

                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
        inode->i_state &= ~I_SYNC_QUEUED;
}

/*
 * Wrapper around redirty_tail_locked() for callers that do not already hold
 * inode->i_lock: takes the lock, redirties @inode on @wb and drops the lock
 * again.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
        spin_lock(&inode->i_lock);
        redirty_tail_locked(inode, wb);
        spin_unlock(&inode->i_lock);
}

/*
 * Requeue @inode for re-scanning after the b_io list is exhausted, by moving
 * it onto @wb's b_more_io list.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
        inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

/*
 * Finish a writeback pass on @inode: clear I_SYNC, offer the inode to the
 * LRU and wake up anybody waiting on the __I_SYNC bit.
 */
static void inode_sync_complete(struct inode *inode)
{
        inode->i_state &= ~I_SYNC;
        /* If inode is clean and unused, put it into LRU now... */
        inode_add_lru(inode);
        /* Waiters must see I_SYNC cleared before being woken up */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
        bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
        /*
         * For inodes being constantly redirtied, dirtied_when can get stuck.
         * It _appears_ to be in the future, but is actually in distant past.
         * This test is necessary to prevent such wrapped-around relative times
         * from permanently stopping the whole bdi writeback.
         */
        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
        return ret;
}

/*
 * NOTE(review): no user of this flag is visible in the surrounding code;
 * confirm whether it is still referenced elsewhere before relying on it.
 */
#define EXPIRE_DIRTY_ATIME 0x0001

/*
 * Move expired (dirtied before dirtied_before) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
                               unsigned long dirtied_before)
{
        LIST_HEAD(tmp);
        struct list_head *pos, *node;
        struct super_block *sb = NULL;
        struct inode *inode;
        int do_sb_sort = 0;
        int moved = 0;

        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                if (inode_dirtied_after(inode, dirtied_before))
                        break;
                list_move(&inode->i_io_list, &tmp);
                moved++;
                spin_lock(&inode->i_lock);
                inode->i_state |= I_SYNC_QUEUED;
                spin_unlock(&inode->i_lock);
                if (sb_is_blkdev_sb(inode->i_sb))
                        continue;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
        }

        /* just one sb in list, splice to dispatch_queue and we're done */
        if (!do_sb_sort) {
                list_splice(&tmp, dispatch_queue);
                goto out;
        }

        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
                sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
                                list_move(&inode->i_io_list, dispatch_queue);
                }
        }
out:
        return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 *
 * Must be called with wb->list_lock held.  b_more_io is spliced back into
 * b_io first, then expired inodes are pulled in from b_dirty and from
 * b_dirty_time.  For b_dirty_time the cutoff is @dirtied_before for sync
 * work, and "dirtytime_expire_interval seconds ago" otherwise.
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
                     unsigned long dirtied_before)
{
        unsigned long dirtytime_cutoff;
        int nr_queued;

        assert_spin_locked(&wb->list_lock);

        list_splice_init(&wb->b_more_io, &wb->b_io);
        nr_queued = move_expired_inodes(&wb->b_dirty, &wb->b_io,
                                        dirtied_before);

        if (work->for_sync)
                dirtytime_cutoff = dirtied_before;
        else
                dirtytime_cutoff = jiffies - dirtytime_expire_interval * HZ;
        nr_queued += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                         dirtytime_cutoff);

        if (nr_queued)
                wb_io_lists_populated(wb);
        trace_writeback_queue_io(wb, work, dirtied_before, nr_queued);
}

/*
 * Invoke the superblock's ->write_inode() method for @inode, bracketed by
 * the start/finish tracepoints.
 *
 * Returns the method's result, or 0 without calling anything when the
 * superblock has no ->write_inode() or @inode is a bad inode.
 */
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err;

        if (!inode->i_sb->s_op->write_inode || is_bad_inode(inode))
                return 0;

        trace_writeback_write_inode_start(inode, wbc);
        err = inode->i_sb->s_op->write_inode(inode, wbc);
        trace_writeback_write_inode(inode, wbc);
        return err;
}

/*
 * Block until I_SYNC is no longer set on @inode.
 *
 * Called with i_lock held and returns with it held, but the lock is dropped
 * around every sleep.  The caller must make sure the inode cannot go away
 * while i_lock is released.
 */
static void __inode_wait_for_writeback(struct inode *inode)
        __releases(inode->i_lock)
        __acquires(inode->i_lock)
{
        DEFINE_WAIT_BIT(sync_wait, &inode->i_state, __I_SYNC);
        wait_queue_head_t *head = bit_waitqueue(&inode->i_state, __I_SYNC);

        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
                __wait_on_bit(head, &sync_wait, bit_wait,
                              TASK_UNINTERRUPTIBLE);
                spin_lock(&inode->i_lock);
        }
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 *
 * Takes inode->i_lock, waits via __inode_wait_for_writeback() until I_SYNC
 * is clear, and drops the lock again before returning.
 */
void inode_wait_for_writeback(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        __inode_wait_for_writeback(inode);
        spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed at callers not holding any inode reference,
 * so once i_lock is dropped, the inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
        __releases(inode->i_lock)
{
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        int sleep;

        /*
         * Get on the waitqueue before sampling I_SYNC, while i_lock is still
         * held, so a wakeup issued after the sample is not missed.
         */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        sleep = inode->i_state & I_SYNC;
        spin_unlock(&inode->i_lock);
        /* @inode is not dereferenced past this point; only wqh is used. */
        if (sleep)
                schedule();
        finish_wait(wqh, &wait);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by flusher thread - no one else
 * processes all inodes in writeback lists and requeueing inodes behind flusher
 * thread's back can have unexpected consequences.
 *
 * The outcome is one of: leave the inode alone (I_FREEING), put it back on
 * b_dirty, queue it on b_more_io, park it on b_dirty_time, or drop it from
 * the writeback lists altogether.  inode->i_lock must be held, since
 * redirty_tail_locked() asserts it.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                          struct writeback_control *wbc)
{
        if (inode->i_state & I_FREEING)
                return;

        /*
         * Sync livelock prevention. Each inode is tagged and synced in one
         * shot. If still dirty, it will be redirty_tail()'ed below.  Update
         * the dirty time to prevent enqueue and sync it again.
         */
        if ((inode->i_state & I_DIRTY) &&
            (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
                inode->dirtied_when = jiffies;

        if (wbc->pages_skipped) {
                /*
                 * writeback is not making progress due to locked
                 * buffers. Skip this inode for now.
                 */
                redirty_tail_locked(inode, wb);
                return;
        }

        if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                /*
                 * We didn't write back all the pages.  nfs_writepages()
                 * sometimes bails out without doing anything.
                 */
                if (wbc->nr_to_write <= 0) {
                        /* Slice used up. Queue for next turn. */
                        requeue_io(inode, wb);
                } else {
                        /*
                         * Writeback blocked by something other than
                         * congestion. Delay the inode for some time to
                         * avoid spinning on the CPU (100% iowait)
                         * retrying writeback of the dirty page/inode
                         * that cannot be performed immediately.
                         */
                        redirty_tail_locked(inode, wb);
                }
        } else if (inode->i_state & I_DIRTY) {
                /*
                 * Filesystems can dirty the inode during writeback operations,
                 * such as delayed allocation during submission or metadata
                 * updates after data IO completion.
                 */
                redirty_tail_locked(inode, wb);
        } else if (inode->i_state & I_DIRTY_TIME) {
                /* Only timestamps are dirty: restamp and park on b_dirty_time. */
                inode->dirtied_when = jiffies;
                inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
                inode->i_state &= ~I_SYNC_QUEUED;
        } else {
                /* The inode is clean. Remove from writeback lists. */
                inode_io_list_del_locked(inode, wb);
        }
}

/*
 * Write out an inode and its dirty pages. Do not update the writeback list
 * linkage. That is left to the caller. The caller is also responsible for
 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
 *
 * Returns the first non-zero result among do_writepages(),
 * filemap_fdatawait() (only waited for in non-sync(2) WB_SYNC_ALL mode) and
 * write_inode(), or 0 if none of them reported an error.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        /* Snapshot of the budget on entry, used only by the tracepoints. */
        long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;

        WARN_ON(!(inode->i_state & I_SYNC));

        trace_writeback_single_inode_start(inode, wbc, nr_to_write);

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion. We don't do it for sync(2) writeback because it has a
         * separate, external IO completion path and ->sync_fs for guaranteeing
         * inode metadata is written back correctly.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        /*
         * If the inode has dirty timestamps and we need to write them, call
         * mark_inode_dirty_sync() to notify the filesystem about it and to
         * change I_DIRTY_TIME into I_DIRTY_SYNC.
         */
        if ((inode->i_state & I_DIRTY_TIME) &&
            (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
             time_after(jiffies, inode->dirtied_time_when +
                        dirtytime_expire_interval * HZ))) {
                trace_writeback_lazytime(inode);
                mark_inode_dirty_sync(inode);
        }

        /*
         * Some filesystems may redirty the inode during the writeback
         * due to delalloc, clear dirty metadata flags right before
         * write_inode()
         */
        spin_lock(&inode->i_lock);
        /* Remember which I_DIRTY bits were set, then clear exactly those. */
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~dirty;

        /*
         * Paired with smp_mb() in __mark_inode_dirty().  This allows
         * __mark_inode_dirty() to test i_state without grabbing i_lock -
         * either they see the I_DIRTY bits cleared or we see the dirtied
         * inode.
         *
         * I_DIRTY_PAGES is always cleared together above even if @mapping
         * still has dirty pages.  The flag is reinstated after smp_mb() if
         * necessary.  This guarantees that either __mark_inode_dirty()
         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
         */
        smp_mb();

        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;

        spin_unlock(&inode->i_lock);

        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & ~I_DIRTY_PAGES) {
                int err = write_inode(inode, wbc);
                if (ret == 0)
                        ret = err;
        }
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}

/*
 * Write out an inode's dirty pages. Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is designed to be called for writing back one inode which
 * we got e.g. from a filesystem. Flusher thread uses __writeback_single_inode()
 * and does more profound writeback list handling in writeback_sb_inodes().
 *
 * Returns 0 when the inode is skipped (already under I_SYNC in a
 * non-WB_SYNC_ALL request, or nothing to do), otherwise the result of
 * __writeback_single_inode().
 */
static int writeback_single_inode(struct inode *inode,
                                  struct writeback_control *wbc)
{
        struct bdi_writeback *wb;
        int ret = 0;

        spin_lock(&inode->i_lock);
        /* Sanity-check the reference/I_WILL_FREE contract described above. */
        if (!atomic_read(&inode->i_count))
                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
        else
                WARN_ON(inode->i_state & I_WILL_FREE);

        if (inode->i_state & I_SYNC) {
                if (wbc->sync_mode != WB_SYNC_ALL)
                        goto out;
                /*
                 * It's a data-integrity sync. We must wait. Since callers hold
                 * inode reference or inode has I_WILL_FREE set, it cannot go
                 * away under us.
                 */
                __inode_wait_for_writeback(inode);
        }
        WARN_ON(inode->i_state & I_SYNC);
        /*
         * Skip inode if it is clean and we have no outstanding writeback in
         * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
         * function since flusher thread may be doing for example sync in
         * parallel and if we move the inode, it could get skipped. So here we
         * make sure inode is on some writeback list and leave it there unless
         * we have completely cleaned the inode.
         */
        if (!(inode->i_state & I_DIRTY_ALL) &&
            (wbc->sync_mode != WB_SYNC_ALL ||
             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                goto out;
        inode->i_state |= I_SYNC;
        /* i_lock is not held past this call; it is retaken further down. */
        wbc_attach_and_unlock_inode(wbc, inode);

        ret = __writeback_single_inode(inode, wbc);

        wbc_detach_inode(wbc);

        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
         * If inode is clean, remove it from writeback lists. Otherwise don't
         * touch it. See comment above for explanation.
         */
        if (!(inode->i_state & I_DIRTY_ALL))
                inode_io_list_del_locked(inode, wb);
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
out:
        spin_unlock(&inode->i_lock);
        return ret;
}

/*
 * Work out how many pages a single inode may write in one go for @work.
 *
 * Returns LONG_MAX for WB_SYNC_ALL or tagged_writepages work.  Otherwise
 * returns the smallest of half the wb's average write bandwidth, the global
 * dirty limit divided by DIRTY_SCOPE and the pages left in @work, bumped by
 * MIN_WRITEBACK_PAGES and rounded down to a multiple of it.
 */
static long writeback_chunk_size(struct bdi_writeback *wb,
                                 struct wb_writeback_work *work)
{
        long pages;

        /*
         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
         * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
         * here avoids calling into writeback_inodes_wb() more than once.
         *
         * The intended call sequence for WB_SYNC_ALL writeback is:
         *
         *      wb_writeback()
         *          writeback_sb_inodes()       <== called only once
         *              write_cache_pages()     <== called once for each inode
         *                   (quickly) tag currently dirty pages
         *                   (maybe slowly) sync all tagged pages
         */
        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                return LONG_MAX;

        pages = min(wb->avg_write_bandwidth / 2,
                    global_wb_domain.dirty_limit / DIRTY_SCOPE);
        pages = min(pages, work->nr_pages);

        return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 *
 * The loop stops when b_io is empty, when an inode of a different superblock
 * is met while @work is not bound to one superblock, or - once something has
 * been written - when about HZ/10 jiffies have elapsed or work->nr_pages has
 * been used up.
 */
static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
                                struct wb_writeback_work *work)
{
        /* Per-call control block; the whole file range is eligible. */
        struct writeback_control wbc = {
                .sync_mode                = work->sync_mode,
                .tagged_writepages        = work->tagged_writepages,
                .for_kupdate                = work->for_kupdate,
                .for_background                = work->for_background,
                .for_sync                = work->for_sync,
                .range_cyclic                = work->range_cyclic,
                .range_start                = 0,
                .range_end                = LLONG_MAX,
        };
        unsigned long start_time = jiffies;
        long write_chunk;
        long total_wrote = 0;  /* count both pages and inodes */

        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct bdi_writeback *tmp_wb;
                long wrote;

                if (inode->i_sb != sb) {
                        if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
                                redirty_tail(inode, wb);
                                continue;
                        }

                        /*
                         * The inode belongs to a different superblock.
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
                        break;
                }

                /*
                 * Don't bother with new inodes or inodes being freed, first
                 * kind does not need periodic writeout yet, and for the latter
                 * kind writeout is handled by the freer.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        redirty_tail_locked(inode, wb);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
                        /*
                         * If this inode is locked for writeback and we are not
                         * doing writeback-for-data-integrity, move it to
                         * b_more_io so that writeback can proceed with the
                         * other inodes on b_io.
                         *
                         * We'll have another go at writing back this inode
                         * when we completed a full scan of b_io.
                         */
                        spin_unlock(&inode->i_lock);
                        requeue_io(inode, wb);
                        trace_writeback_sb_inodes_requeue(inode);
                        continue;
                }
                /* i_lock stays held here; only the list lock is dropped. */
                spin_unlock(&wb->list_lock);

                /*
                 * We already requeued the inode if it had I_SYNC set and we
                 * are doing WB_SYNC_NONE writeback. So this catches only the
                 * WB_SYNC_ALL case.
                 */
                if (inode->i_state & I_SYNC) {
                        /* Wait for I_SYNC. This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        /* Inode may be gone, start again */
                        spin_lock(&wb->list_lock);
                        continue;
                }
                inode->i_state |= I_SYNC;
                wbc_attach_and_unlock_inode(&wbc, inode);

                /* Fresh budget and skip counter for this inode. */
                write_chunk = writeback_chunk_size(wb, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;

                /*
                 * We use I_SYNC to pin the inode in memory. While it is set
                 * evict_inode() will wait so the inode cannot be freed.
                 */
                __writeback_single_inode(inode, &wbc);

                wbc_detach_inode(&wbc);
                /*
                 * The work budget is charged with everything consumed from
                 * nr_to_write; the progress count additionally leaves out
                 * skipped pages and is clamped at zero.
                 */
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
                wrote = wrote < 0 ? 0 : wrote;
                total_wrote += wrote;

                if (need_resched()) {
                        /*
                         * We're trying to balance between building up a nice
                         * long list of IOs to improve our merge rate, and
                         * getting those IOs out quickly for anyone throttling
                         * in balance_dirty_pages().  cond_resched() doesn't
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
                        blk_flush_plug(current);
                        cond_resched();
                }

                /*
                 * Requeue @inode if still dirty.  Be careful as @inode may
                 * have been switched to another wb in the meantime.
                 */
                tmp_wb = inode_to_wb_and_lock_list(inode);
                spin_lock(&inode->i_lock);
                /* A fully cleaned inode counts as one unit of progress. */
                if (!(inode->i_state & I_DIRTY_ALL))
                        total_wrote++;
                requeue_inode(inode, tmp_wb, &wbc);
                inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);

                /* Swap back to the list lock our caller expects us to hold. */
                if (unlikely(tmp_wb != wb)) {
                        spin_unlock(&tmp_wb->list_lock);
                        spin_lock(&wb->list_lock);
                }

                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
                 */
                if (total_wrote) {
                        if (time_is_before_jiffies(start_time + HZ / 10UL))
                                break;
                        if (work->nr_pages <= 0)
                                break;
                }
        }
        return total_wrote;
}

/*
 * Walk @wb's b_io list from the tail and write back inodes one superblock at
 * a time through writeback_sb_inodes().
 *
 * Once some progress has been made, the walk ends when roughly HZ/10 jiffies
 * have passed since entry or when work->nr_pages is used up.  Returns the
 * accumulated writeback_sb_inodes() results.
 */
static long __writeback_inodes_wb(struct bdi_writeback *wb,
                                  struct wb_writeback_work *work)
{
        unsigned long deadline = jiffies + HZ / 10UL;
        long total = 0;

        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;

                if (!trylock_super(sb)) {
                        /*
                         * trylock_super() may fail consistently due to
                         * s_umount being grabbed by someone else. Don't use
                         * requeue_io() to avoid busy retrying the inode/sb.
                         */
                        redirty_tail(inode, wb);
                        continue;
                }
                total += writeback_sb_inodes(sb, wb, work);
                up_read(&sb->s_umount);

                /* refer to the same tests at the end of writeback_sb_inodes */
                if (!total)
                        continue;
                if (time_is_before_jiffies(deadline) || work->nr_pages <= 0)
                        break;
        }
        /* Leave any unwritten inodes on b_io */
        return total;
}

static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                                enum wb_reason reason)
{
        struct wb_writeback_work work = {
                .nr_pages        = nr_pages,
                .sync_mode        = WB_SYNC_NONE,
                .range_cyclic        = 1,
                .reason                = reason,
        };
        struct blk_plug plug;

        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        if (list_empty(&wb->b_io))
                queue_io(wb, &work, jiffies);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);

        return nr_pages - work.nr_pages;
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 *
 * Returns how much of work->nr_pages was consumed by this call.
 */
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        unsigned long wb_start = jiffies;
        /* Budget on entry; the return value is measured against it. */
        long nr_pages = work->nr_pages;
        unsigned long dirtied_before = jiffies;
        struct inode *inode;
        long progress;
        struct blk_plug plug;

        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;

                /*
                 * Kupdate and background works are special and we want to
                 * include all inodes that need writing. Livelock avoidance is
                 * handled by these works yielding to any other work so we are
                 * safe.
                 */
                if (work->for_kupdate) {
                        dirtied_before = jiffies -
                                msecs_to_jiffies(dirty_expire_interval * 10);
                } else if (work->for_background)
                        dirtied_before = jiffies;

                trace_writeback_start(wb, work);
                /* Refill b_io only once the previous batch is used up. */
                if (list_empty(&wb->b_io))
                        queue_io(wb, work, dirtied_before);
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb, work);

                wb_update_bandwidth(wb, wb_start);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as made some progress on cleaning pages or inodes.
                 */
                if (progress)
                        continue;
                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io))
                        break;
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                trace_writeback_wait(wb, work);
                inode = wb_inode(wb->b_more_io.prev);
                /* Take i_lock before the list lock is released. */
                spin_lock(&inode->i_lock);
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
                spin_lock(&wb->list_lock);
        }
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);

        return nr_pages - work->nr_pages;
}

/*
 * Dequeue the first pending wb_writeback_work from @wb's work_list.
 *
 * The list is inspected and modified under wb->work_lock.  Returns the
 * dequeued item, or NULL when the list is empty.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
        struct wb_writeback_work *next = NULL;

        spin_lock_bh(&wb->work_lock);
        if (!list_empty(&wb->work_list)) {
                next = list_entry(wb->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&next->list);
        }
        spin_unlock_bh(&wb->work_lock);

        return next;
}

/*
 * Start background writeback on @wb if wb_over_bg_thresh() says it is over
 * the background threshold.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_background_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .nr_pages        = LONG_MAX,
                .sync_mode        = WB_SYNC_NONE,
                .for_background        = 1,
                .range_cyclic        = 1,
                .reason                = WB_REASON_BACKGROUND,
        };

        if (!wb_over_bg_thresh(wb))
                return 0;

        return wb_writeback(wb, &work);
}

/*
 * Kick off periodic (kupdate-style) writeback on @wb when it is due.
 *
 * Nothing happens when dirty_writeback_interval is zero or when the interval
 * since wb->last_old_flush has not yet elapsed.  Otherwise last_old_flush is
 * restamped and, if get_nr_dirty_pages() reports any pages, a writeback pass
 * for that many pages is run.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .sync_mode        = WB_SYNC_NONE,
                .for_kupdate        = 1,
                .range_cyclic        = 1,
                .reason                = WB_REASON_PERIODIC,
        };
        unsigned long next_flush;
        long nr_pages;

        /*
         * When set to zero, disable periodic writeback
         */
        if (!dirty_writeback_interval)
                return 0;

        next_flush = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, next_flush))
                return 0;

        wb->last_old_flush = jiffies;
        nr_pages = get_nr_dirty_pages();
        if (!nr_pages)
                return 0;

        work.nr_pages = nr_pages;
        return wb_writeback(wb, &work);
}

/*
 * Service a pending WB_start_all request on @wb.
 *
 * When the bit is set and get_nr_dirty_pages() reports dirty pages, one
 * WB_SYNC_NONE pass is run for this wb's share of them (as computed by
 * wb_split_bdi_pages()), using wb->start_all_reason.  The bit is cleared
 * afterwards whether or not anything was written.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_start_all(struct bdi_writeback *wb)
{
        long nr_dirty;
        long written = 0;

        if (!test_bit(WB_start_all, &wb->state))
                return 0;

        nr_dirty = get_nr_dirty_pages();
        if (nr_dirty) {
                struct wb_writeback_work work = {
                        .nr_pages        = wb_split_bdi_pages(wb, nr_dirty),
                        .sync_mode        = WB_SYNC_NONE,
                        .range_cyclic        = 1,
                        .reason                = wb->start_all_reason,
                };

                written = wb_writeback(wb, &work);
        }

        clear_bit(WB_start_all, &wb->state);
        return written;
}


/*
 * Drain @wb's queued work items and then run the built-in checks, all with
 * WB_writeback_running set in wb->state.
 *
 * Returns the sum of what every wb_writeback() pass reported.
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
        struct wb_writeback_work *work;
        long wrote = 0;

        set_bit(WB_writeback_running, &wb->state);

        /* Explicitly queued work items come first. */
        for (work = get_next_work_item(wb); work;
             work = get_next_work_item(wb)) {
                trace_writeback_exec(wb, work);
                wrote += wb_writeback(wb, work);
                finish_writeback_work(wb, work);
        }

        /* Flush-everything request, if one is pending. */
        wrote += wb_check_start_all(wb);

        /* Periodic writeback, kupdated() style, then background writeback. */
        wrote += wb_check_old_data_flush(wb);
        wrote += wb_check_background_flush(wb);

        clear_bit(WB_writeback_running, &wb->state);

        return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
void wb_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(to_delayed_work(work),
                                                struct bdi_writeback, dwork);
        long pages_written;

        set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
        current->flags |= PF_SWAPWRITE;

        if (likely(!current_is_workqueue_rescuer() ||
                   !test_bit(WB_registered, &wb->state))) {
                /*
                 * The normal path.  Keep writing back @wb until its
                 * work_list is empty.  Note that this path is also taken
                 * if @wb is shutting down even when we're running off the
                 * rescuer as work_list needs to be drained.
                 */
                do {
                        pages_written = wb_do_writeback(wb);
                        trace_writeback_pages_written(pages_written);
                } while (!list_empty(&wb->work_list));
        } else {
                /*
                 * bdi_wq can't get enough workers and we're running off
                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
                 * enough for efficient IO.
                 */
                pages_written = writeback_inodes_wb(wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
        }

        if (!list_empty(&wb->work_list))
                wb_wakeup(wb);
        else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                wb_wakeup_delayed(wb);

        current->flags &= ~PF_SWAPWRITE;
}

/*
 * Start writeback, for @reason, on every bdi_writeback attached to @bdi.
 * Nothing is done when the bdi has no dirty IO.
 *
 * bdi->wb_list is walked with list_for_each_entry_rcu(), so the caller
 * must be inside an RCU read-side critical section.
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                         enum wb_reason reason)
{
        struct bdi_writeback *wb;

        if (bdi_has_dirty_io(bdi)) {
                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, reason);
        }
}

/*
 * Start writeback, for @reason, on the bdi_writeback structures of @bdi.
 *
 * Wraps __wakeup_flusher_threads_bdi() in an RCU read-side critical
 * section, which that helper's RCU list walk of bdi->wb_list relies on.
 */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason)
{
        rcu_read_lock();
        __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
        struct backing_dev_info *bdi;

        /*
         * If we are expecting writeback progress we must submit plugged IO.
         */
        if (blk_needs_flush_plug(current))
                blk_schedule_flush_plug(current);

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes get
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

/*
 * Delayed-work handler: wake every bdi_writeback that has inodes on its
 * b_dirty_time list, then re-arm dirtytime_work unless
 * dirtytime_expire_interval has been set to zero.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w)
{
        struct backing_dev_info *bdi;
        struct bdi_writeback *wb;

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                        if (list_empty(&wb->b_dirty_time))
                                continue;
                        wb_wakeup(wb);
                }
        }
        rcu_read_unlock();

        if (!dirtytime_expire_interval)
                return;

        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}

/*
 * Init-time hook (registered through __initcall() below): schedule the
 * first dirtytime_work run dirtytime_expire_interval * HZ jiffies from
 * now, unless dirtytime_expire_interval is zero.  Always returns 0.
 */
static int __init start_dirtytime_writeback(void)
{
        if (dirtytime_expire_interval)
                schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
        return 0;
}
__initcall(start_dirtytime_writeback);

/*
 * sysctl handler for the dirtytime expire interval.
 *
 * The value itself is parsed by proc_dointvec_minmax().  After a
 * successful write, dirtytime_work is either kicked to run right away
 * (non-zero interval) or cancelled synchronously (interval set to zero).
 *
 * Returns the result of proc_dointvec_minmax().
 */
int dirtytime_interval_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Reads and failed writes leave the delayed work untouched. */
        if (ret != 0 || !write)
                return ret;

        if (dirtytime_expire_interval)
                mod_delayed_work(system_wq, &dirtytime_work, 0);
        else
                cancel_delayed_work_sync(&dirtytime_work);

        return ret;
}

/**
 * __mark_inode_dirty -        internal function
 *
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        int dirtytime;

        trace_writeback_mark_inode_dirty(inode, flags);

        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
                trace_writeback_dirty_inode_start(inode, flags);

                /* Give the filesystem a chance to react, if it cares. */
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode, flags);

                trace_writeback_dirty_inode(inode, flags);
        }
        /*
         * A request carrying any I_DIRTY_INODE bit supersedes I_DIRTY_TIME,
         * so @dirtytime ends up non-zero only for a timestamp-only request.
         */
        if (flags & I_DIRTY_INODE)
                flags &= ~I_DIRTY_TIME;
        dirtytime = flags & I_DIRTY_TIME;

        /*
         * Paired with smp_mb() in __writeback_single_inode() for the
         * following lockless i_state test.  See there for details.
         */
        smp_mb();

        /*
         * Lockless fast path: nothing to do if every requested bit is
         * already set, or if this is a timestamp-only request and the
         * inode already has an I_DIRTY_INODE bit set.
         */
        if (((inode->i_state & flags) == flags) ||
            (dirtytime && (inode->i_state & I_DIRTY_INODE)))
                return;

        /* Repeat both checks under i_lock before changing anything. */
        spin_lock(&inode->i_lock);
        if (dirtytime && (inode->i_state & I_DIRTY_INODE))
                goto out_unlock_inode;
        if ((inode->i_state & flags) != flags) {
                /* Sampled before the new bits are merged in below. */
                const int was_dirty = inode->i_state & I_DIRTY;

                inode_attach_wb(inode, NULL);

                if (flags & I_DIRTY_INODE)
                        inode->i_state &= ~I_DIRTY_TIME;
                inode->i_state |= flags;

                /*
                 * If the inode is queued for writeback by flush worker, just
                 * update its dirty state. Once the flush worker is done with
                 * the inode it will place it on the appropriate superblock
                 * list, based upon its state.
                 */
                if (inode->i_state & I_SYNC_QUEUED)
                        goto out_unlock_inode;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock_inode;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock_inode;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        struct bdi_writeback *wb;
                        struct list_head *dirty_list;
                        bool wakeup_bdi = false;

                        /*
                         * Returns with wb->list_lock held (it is dropped
                         * below).  NOTE(review): i_lock is not unlocked on
                         * this path, so the helper presumably releases it -
                         * confirm against its definition.
                         */
                        wb = locked_inode_to_wb_and_lock_list(inode);

                        WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
                             !test_bit(WB_registered, &wb->state),
                             "bdi-%s not registered\n", bdi_dev_name(wb->bdi));

                        inode->dirtied_when = jiffies;
                        if (dirtytime)
                                inode->dirtied_time_when = jiffies;

                        /*
                         * Any I_DIRTY bit puts the inode on b_dirty; an
                         * inode without one goes to b_dirty_time instead.
                         */
                        if (inode->i_state & I_DIRTY)
                                dirty_list = &wb->b_dirty;
                        else
                                dirty_list = &wb->b_dirty_time;

                        wakeup_bdi = inode_io_list_move_locked(inode, wb,
                                                               dirty_list);

                        /*
                         * If this is the first dirty inode for this bdi,
                         * we have to wake-up the corresponding bdi thread
                         * to make sure background write-back happens
                         * later.
                         */
                        if (wakeup_bdi &&
                            (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
                                wb_wakeup_delayed(wb);

                        spin_unlock(&wb->list_lock);
                        trace_writeback_dirty_inode_enqueue(inode);

                        /* Skips out_unlock_inode: only list_lock was left to drop. */
                        return;
                }
        }
out_unlock_inode:
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing contending
 * walks. The queueing maintains sync(2) required behaviour as all the IO that
 * has been issued up to the time this function is entered is guaranteed to be
 * completed by the time we have gained the lock and waited for all IO that is
 * in progress regardless of the order callers are granted the lock.
 */
static void wait_sb_inodes(struct super_block *sb)
{
        /* Private list of the inodes that were on sb->s_inodes_wb at entry. */
        LIST_HEAD(sync_list);

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        mutex_lock(&sb->s_sync_lock);

        /*
         * Splice the writeback list onto a temporary list to avoid waiting on
         * inodes that have started writeback after this point.
         *
         * Use rcu_read_lock() to keep the inodes around until we have a
         * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
         * the local list because inodes can be dropped from either by writeback
         * completion.
         */
        rcu_read_lock();
        spin_lock_irq(&sb->s_inode_wblist_lock);
        list_splice_init(&sb->s_inodes_wb, &sync_list);

        /*
         * Data integrity sync. Must wait for all pages under writeback, because
         * there may have been pages dirtied before our sync call, but which had
         * writeout started before we write it out.  In which case, the inode
         * may not be on the dirty list, but we still have to wait for that
         * writeout.
         */
        while (!list_empty(&sync_list)) {
                struct inode *inode = list_first_entry(&sync_list, struct inode,
                                                       i_wb_list);
                struct address_space *mapping = inode->i_mapping;

                /*
                 * Move each inode back to the wb list before we drop the lock
                 * to preserve consistency between i_wb_list and the mapping
                 * writeback tag. Writeback completion is responsible to remove
                 * the inode from either list once the writeback tag is cleared.
                 */
                list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

                /*
                 * The mapping can appear untagged while still on-list since we
                 * do not have the mapping lock. Skip it here, wb completion
                 * will remove it.
                 */
                if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                        continue;

                spin_unlock_irq(&sb->s_inode_wblist_lock);

                /*
                 * Skip inodes flagged I_FREEING, I_WILL_FREE or I_NEW without
                 * taking a reference; the list lock is retaken first because
                 * the loop condition expects it to be held.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);

                        spin_lock_irq(&sb->s_inode_wblist_lock);
                        continue;
                }
                /* We hold a reference from here on, so RCU can be dropped. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();

                /*
                 * We keep the error status of individual mapping so that
                 * applications can catch the writeback error using fsync(2).
                 * See filemap_fdatawait_keep_errors() for details.
                 */
                filemap_fdatawait_keep_errors(mapping);

                cond_resched();

                iput(inode);

                /* Restore the locking state the loop head expects. */
                rcu_read_lock();
                spin_lock_irq(&sb->s_inode_wblist_lock);
        }
        spin_unlock_irq(&sb->s_inode_wblist_lock);
        rcu_read_unlock();
        mutex_unlock(&sb->s_sync_lock);
}

/*
 * Queue WB_SYNC_NONE, tagged-writepages writeback of up to @nr pages of
 * @sb across the writeback structures of its bdi, then wait on the work's
 * completion.  @skip_if_busy is handed through to bdi_split_work_to_wbs().
 *
 * Nothing is queued when the bdi has no dirty IO or is
 * noop_backing_dev_info.  sb->s_umount is expected to be held; a WARN_ON
 * fires otherwise.
 */
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(completion, bdi);
        struct wb_writeback_work req = {
                .sb                        = sb,
                .sync_mode                = WB_SYNC_NONE,
                .tagged_writepages        = 1,
                .done                        = &completion,
                .nr_pages                = nr,
                .reason                        = reason,
        };

        if (!bdi_has_dirty_io(bdi))
                return;
        if (bdi == &noop_backing_dev_info)
                return;

        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        bdi_split_work_to_wbs(sb->s_bdi, &req, skip_if_busy);
        wb_wait_for_completion(&completion);
}

/**
 * writeback_inodes_sb_nr -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 *
 * Thin wrapper around __writeback_inodes_sb_nr() with skip_if_busy = false.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
                            unsigned long nr,
                            enum wb_reason reason)
{
        __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb        -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 *
 * The page budget handed to writeback_inodes_sb_nr() is the current value
 * of get_nr_dirty_pages().
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        /*
         * Plain call: a void function must not "return" an expression,
         * not even one of type void.
         */
        writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Takes sb->s_umount for reading without blocking; if that fails the call
 * is a no-op.  Otherwise invokes __writeback_inodes_sb_nr() for the current
 * number of dirty pages with skip_if_busy set, and drops the lock again.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        if (down_read_trylock(&sb->s_umount)) {
                __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason,
                                         true);
                up_read(&sb->s_umount);
        }
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb        -        sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb                = sb,
                .sync_mode        = WB_SYNC_ALL,
                .nr_pages        = LONG_MAX,
                .range_cyclic        = 0,
                .done                = &done,
                .reason                = WB_REASON_SYNC,
                .for_sync        = 1,
        };

        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
         * inodes under writeback and I_DIRTY_TIME inodes ignored by
         * bdi_has_dirty() need to be written out too.
         */
        if (bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);

        wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now        -        write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The whole file range is covered; the page budget is LONG_MAX, or 0 when
 * the inode's mapping cannot do writeback.  May sleep.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 *
 * Returns the result of writeback_single_inode().
 */
int write_inode_now(struct inode *inode, int sync)
{
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .range_start = 0,
                .range_end = LLONG_MAX,
        };

        if (sync)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        /* No data pages to push if the mapping can't do writeback. */
        if (!mapping_can_writeback(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();
        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 *
 * Thin wrapper: forwards to writeback_single_inode() and returns its result.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
        return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 * Implemented as sync_inode() with a zero page budget.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 *
 * Returns the result of sync_inode().
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
        struct writeback_control wbc = {
                .nr_to_write = 0, /* metadata-only */
        };

        if (wait)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);




















































    1 

































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SEQ_FILE_H
#define _LINUX_SEQ_FILE_H

#include <linux/types.h>
#include <linux/string.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/fs.h>
#include <linux/cred.h>

struct seq_operations;

/*
 * Per-open state of a sequential file.  The field notes are limited to
 * what the inline helpers and macros in this header show; anything beyond
 * that is marked as an assumption.
 */
struct seq_file {
        char *buf;                /* output buffer; seq_get_buf() hands out buf + count */
        size_t size;                /* capacity of buf; count == size means "overflowed" */
        size_t from;                /* NOTE(review): presumably the read offset into buf - confirm */
        size_t count;                /* bytes committed to buf so far (see seq_commit()) */
        size_t pad_until;        /* set by seq_setwidth() to count + requested width */
        loff_t index;                /* NOTE(review): presumably the iterator position - confirm */
        loff_t read_pos;        /* NOTE(review): presumably the file offset already read - confirm */
        struct mutex lock;
        const struct seq_operations *op;        /* iterator callbacks */
        int poll_event;
        const struct file *file;        /* seq_user_ns() reads file->f_cred */
        void *private;                /* opaque user data, e.g. inode->i_private in DEFINE_SEQ_ATTRIBUTE */
};

/*
 * Iterator callbacks of a seq_file, as passed to seq_open().
 * NOTE(review): the calling sequence (presumably start, then show/next
 * repeatedly, then stop) is implemented outside this header - confirm
 * there before relying on it.
 */
struct seq_operations {
        void * (*start) (struct seq_file *m, loff_t *pos);
        void (*stop) (struct seq_file *m, void *v);
        void * (*next) (struct seq_file *m, void *v, loff_t *pos);
        int (*show) (struct seq_file *m, void *v);
};

#define SEQ_SKIP 1

/**
 * seq_has_overflowed - check if the buffer has overflowed
 * @m: the seq_file handle
 *
 * seq_files have a buffer which may overflow. When this happens a larger
 * buffer is reallocated and all the data will be printed again.
 * The overflow state is true when m->count == m->size.
 *
 * Returns true if the buffer received more than it can hold.
 */
static inline bool seq_has_overflowed(struct seq_file *m)
{
        return m->count == m->size;
}

/**
 * seq_get_buf - get buffer to write arbitrary data to
 * @m: the seq_file handle
 * @bufp: the beginning of the free part of the buffer is stored here,
 *        or NULL when the buffer is full
 *
 * It is a BUG for m->count to exceed m->size.
 *
 * Return the number of bytes available in the buffer, or zero if
 * there's no space.
 */
static inline size_t seq_get_buf(struct seq_file *m, char **bufp)
{
        BUG_ON(m->count > m->size);

        *bufp = (m->count < m->size) ? m->buf + m->count : NULL;

        return m->size - m->count;
}

/**
 * seq_commit - commit data to the buffer
 * @m: the seq_file handle
 * @num: the number of bytes to commit
 *
 * Commit @num bytes of data written to a buffer previously acquired
 * by seq_get_buf().  To signal an error condition, or that the data
 * didn't fit in the available space, pass a negative @num value; the
 * buffer is then marked as overflowed (count set to size).
 *
 * It is a BUG to commit more bytes than the buffer has room for.
 */
static inline void seq_commit(struct seq_file *m, int num)
{
        if (num < 0) {
                /* Error or "did not fit": flag the buffer as overflowed. */
                m->count = m->size;
                return;
        }

        BUG_ON(m->count + num > m->size);
        m->count += num;
}

/**
 * seq_setwidth - set padding width
 * @m: the seq_file handle
 * @size: the max number of bytes to pad.
 *
 * Records the padding target as the current count plus @size.  Call
 * seq_setwidth() first, then seq_printf() etc., and finally seq_pad() to
 * pad the remaining bytes.
 */
static inline void seq_setwidth(struct seq_file *m, size_t size)
{
        const size_t pad_target = m->count + size;

        m->pad_until = pad_target;
}
void seq_pad(struct seq_file *m, char c);

char *mangle_path(char *s, const char *p, const char *esc);
int seq_open(struct file *, const struct seq_operations *);
ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter);
loff_t seq_lseek(struct file *, loff_t, int);
int seq_release(struct inode *, struct file *);
int seq_write(struct seq_file *seq, const void *data, size_t len);

__printf(2, 0)
void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
__printf(2, 3)
void seq_printf(struct seq_file *m, const char *fmt, ...);
void seq_putc(struct seq_file *m, char c);
void seq_puts(struct seq_file *m, const char *s);
void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
                               unsigned long long num, unsigned int width);
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num);
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num);
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
                    unsigned long long v, unsigned int width);

void seq_escape(struct seq_file *m, const char *s, const char *esc);
void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);

void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                  int rowsize, int groupsize, const void *buf, size_t len,
                  bool ascii);

int seq_path(struct seq_file *, const struct path *, const char *);
int seq_file_path(struct seq_file *, struct file *, const char *);
int seq_dentry(struct seq_file *, struct dentry *, const char *);
int seq_path_root(struct seq_file *m, const struct path *path,
                  const struct path *root, const char *esc);

int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t);
int single_release(struct inode *, struct file *);
void *__seq_open_private(struct file *, const struct seq_operations *, int);
int seq_open_private(struct file *, const struct seq_operations *, int);
int seq_release_private(struct inode *, struct file *);

/*
 * DEFINE_SEQ_ATTRIBUTE(name) - boilerplate for a seq_operations based file.
 *
 * Expects a struct seq_operations called <name>_sops to be in scope and
 * generates:
 *   - <name>_open(): calls seq_open() with <name>_sops and, on success,
 *     copies a non-NULL inode->i_private into the seq_file's ->private;
 *   - <name>_fops: file_operations using <name>_open, seq_read, seq_lseek
 *     and seq_release.
 *
 * The expansion has no trailing semicolon; the user supplies it.
 */
#define DEFINE_SEQ_ATTRIBUTE(__name)                                        \
static int __name ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        int ret = seq_open(file, &__name ## _sops);                        \
        if (!ret && inode->i_private) {                                        \
                struct seq_file *seq_f = file->private_data;                \
                seq_f->private = inode->i_private;                        \
        }                                                                \
        return ret;                                                        \
}                                                                        \
                                                                        \
static const struct file_operations __name ## _fops = {                        \
        .owner                = THIS_MODULE,                                        \
        .open                = __name ## _open,                                \
        .read                = seq_read,                                        \
        .llseek                = seq_lseek,                                        \
        .release        = seq_release,                                        \
}

/*
 * DEFINE_SHOW_ATTRIBUTE(name) - boilerplate for a single_open() based file.
 *
 * Expects a show function called <name>_show to be in scope and generates:
 *   - <name>_open(): calls single_open() with <name>_show and
 *     inode->i_private as its data argument;
 *   - <name>_fops: file_operations using <name>_open, seq_read, seq_lseek
 *     and single_release.
 *
 * The expansion has no trailing semicolon; the user supplies it.
 */
#define DEFINE_SHOW_ATTRIBUTE(__name)                                        \
static int __name ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        return single_open(file, __name ## _show, inode->i_private);        \
}                                                                        \
                                                                        \
static const struct file_operations __name ## _fops = {                        \
        .owner                = THIS_MODULE,                                        \
        .open                = __name ## _open,                                \
        .read                = seq_read,                                        \
        .llseek                = seq_lseek,                                        \
        .release        = single_release,                                \
}

/*
 * DEFINE_PROC_SHOW_ATTRIBUTE(name) - proc_ops flavour of
 * DEFINE_SHOW_ATTRIBUTE().
 *
 * Expects a show function called <name>_show to be in scope and generates:
 *   - <name>_open(): calls single_open() with <name>_show and
 *     PDE_DATA(inode) as its data argument;
 *   - <name>_proc_ops: proc_ops using <name>_open, seq_read, seq_lseek
 *     and single_release.
 *
 * The expansion has no trailing semicolon; the user supplies it.
 */
#define DEFINE_PROC_SHOW_ATTRIBUTE(__name)                                \
static int __name ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        return single_open(file, __name ## _show, PDE_DATA(inode));        \
}                                                                        \
                                                                        \
static const struct proc_ops __name ## _proc_ops = {                        \
        .proc_open        = __name ## _open,                                \
        .proc_read        = seq_read,                                        \
        .proc_lseek        = seq_lseek,                                        \
        .proc_release        = single_release,                                \
}

/*
 * Return the user namespace to use for @seq: the one recorded in the
 * credentials of the backing file when CONFIG_USER_NS is enabled,
 * &init_user_ns otherwise.
 */
static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
{
#ifdef CONFIG_USER_NS
        const struct file *filp = seq->file;

        return filp->f_cred->user_ns;
#else
        extern struct user_namespace init_user_ns;

        return &init_user_ns;
#endif
}

/**
 * seq_show_option - display mount options with appropriate escapes.
 * @m: the seq_file handle
 * @name: the mount option name
 * @value: the mount option name's value, can be NULL
 *
 * Emits ",name" or, when @value is not NULL, ",name=value".  Both strings
 * go through seq_escape(); '=' is in the escape set for the name only.
 */
static inline void seq_show_option(struct seq_file *m, const char *name,
                                   const char *value)
{
        seq_putc(m, ',');
        seq_escape(m, name, ",= \t\n\\");

        /* A valueless option ends here. */
        if (!value)
                return;

        seq_putc(m, '=');
        seq_escape(m, value, ", \t\n\\");
}

/**
 * seq_show_option_n - display mount options with appropriate escapes
 *                       where @value must be a specific length.
 * @m: the seq_file handle
 * @name: the mount option name
 * @value: the mount option name's value, cannot be NULL
 * @length: the length of @value to display
 *
 * This is a macro since this uses "length" to define the size of the
 * stack buffer.  At most @length bytes of @value are copied into that
 * buffer, which is always NUL-terminated before being handed to
 * seq_show_option().
 *
 * Every argument is parenthesized in the expansion, so an expression such
 * as "a ? b : c" is safe as @length.  @length is evaluated more than once
 * and must therefore be free of side effects.
 */
#define seq_show_option_n(m, name, value, length) {        \
        char val_buf[(length) + 1];                        \
        strncpy(val_buf, (value), (length));                \
        val_buf[(length)] = '\0';                        \
        seq_show_option((m), (name), val_buf);                \
}

/*
 * Sentinel pointer value: the integer 1 cast to void *.
 * NOTE(review): presumably handed out by iterator start callbacks to mark a
 * header position instead of a real element - confirm against the users.
 */
#define SEQ_START_TOKEN ((void *)1)
/*
 * Helpers for iteration over list_head-s in seq_files
 */

extern struct list_head *seq_list_start(struct list_head *head,
                loff_t pos);
extern struct list_head *seq_list_start_head(struct list_head *head,
                loff_t pos);
extern struct list_head *seq_list_next(void *v, struct list_head *head,
                loff_t *ppos);

/*
 * Helpers for iteration over hlist_head-s in seq_files
 */

extern struct hlist_node *seq_hlist_start(struct hlist_head *head,
                                          loff_t pos);
extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head,
                                               loff_t pos);
extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
                                         loff_t *ppos);

extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
                                              loff_t pos);
extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
                                                   loff_t pos);
extern struct hlist_node *seq_hlist_next_rcu(void *v,
                                                   struct hlist_head *head,
                                                   loff_t *ppos);

/* Helpers for iterating over per-cpu hlist_head-s in seq_files */
extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos);

extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos);

void seq_file_init(void);
#endif










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_DST_OPS_H
#define _NET_DST_OPS_H
#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/cache.h>

/* Forward declarations: struct dst_ops below only uses pointers to these. */
struct dst_entry;
struct kmem_cache;
struct net_device;
struct sk_buff;
struct sock;
struct net;

/*
 * struct dst_ops - operations table and bookkeeping for dst entries.
 *
 * Groups an address-family identifier and a gc threshold with a set of
 * function-pointer hooks, a slab cache pointer and a per-CPU entry counter.
 * The dst_entries_*() helpers below are the accessors for pcpuc_entries.
 *
 * NOTE(review): what each hook must do is defined by its implementers and
 * callers, which are not visible in this header.
 */
struct dst_ops {
        unsigned short                family;
        unsigned int                gc_thresh;

        /* function-pointer hooks */
        void                        (*gc)(struct dst_ops *ops);
        struct dst_entry *        (*check)(struct dst_entry *, __u32 cookie);
        unsigned int                (*default_advmss)(const struct dst_entry *);
        unsigned int                (*mtu)(const struct dst_entry *);
        u32 *                        (*cow_metrics)(struct dst_entry *, unsigned long);
        void                        (*destroy)(struct dst_entry *);
        void                        (*ifdown)(struct dst_entry *,
                                          struct net_device *dev, int how);
        void                        (*negative_advice)(struct sock *sk, struct dst_entry *);
        void                        (*link_failure)(struct sk_buff *);
        void                        (*update_pmtu)(struct dst_entry *dst, struct sock *sk,
                                               struct sk_buff *skb, u32 mtu,
                                               bool confirm_neigh);
        void                        (*redirect)(struct dst_entry *dst, struct sock *sk,
                                            struct sk_buff *skb);
        int                        (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb);
        struct neighbour *        (*neigh_lookup)(const struct dst_entry *dst,
                                                struct sk_buff *skb,
                                                const void *daddr);
        void                        (*confirm_neigh)(const struct dst_entry *dst,
                                                 const void *daddr);

        /* slab cache pointer; presumably the cache dst entries come from - confirm */
        struct kmem_cache        *kmem_cachep;

        /* entry counter, placed on its own cacheline on SMP builds */
        struct percpu_counter        pcpuc_entries ____cacheline_aligned_in_smp;
};

/* Read the entry counter of @dst via percpu_counter_read_positive(). */
static inline int dst_entries_get_fast(struct dst_ops *dst)
{
        struct percpu_counter *entries = &dst->pcpuc_entries;

        return percpu_counter_read_positive(entries);
}

/* Read the entry counter of @dst via percpu_counter_sum_positive(). */
static inline int dst_entries_get_slow(struct dst_ops *dst)
{
        struct percpu_counter *entries = &dst->pcpuc_entries;

        return percpu_counter_sum_positive(entries);
}

/* Batch size passed to percpu_counter_add_batch() for the entry counter. */
#define DST_PERCPU_COUNTER_BATCH 32

/* Add @val (may be negative) to the entry counter of @dst. */
static inline void dst_entries_add(struct dst_ops *dst, int val)
{
        struct percpu_counter *entries = &dst->pcpuc_entries;

        percpu_counter_add_batch(entries, val, DST_PERCPU_COUNTER_BATCH);
}

/*
 * Initialise the entry counter of @dst to 0 using GFP_KERNEL.
 * Returns the result of percpu_counter_init().
 */
static inline int dst_entries_init(struct dst_ops *dst)
{
        struct percpu_counter *entries = &dst->pcpuc_entries;

        return percpu_counter_init(entries, 0, GFP_KERNEL);
}

/* Tear down the entry counter of @dst with percpu_counter_destroy(). */
static inline void dst_entries_destroy(struct dst_ops *dst)
{
        struct percpu_counter *entries = &dst->pcpuc_entries;

        percpu_counter_destroy(entries);
}

#endif
















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NET                Generic infrastructure for Network protocols.
 *
 *                Definitions for request_sock
 *
 * Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *                 From code originally in include/net/tcp.h
 */
#ifndef _REQUEST_SOCK_H
#define _REQUEST_SOCK_H

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/refcount.h>

#include <net/sock.h>

struct request_sock;
struct sk_buff;
struct dst_entry;
struct proto;

/*
 * struct request_sock_ops - per-protocol description of a request_sock.
 *
 * @slab is the cache reqsk_alloc() allocates from and __reqsk_free() frees
 * to; @destructor is invoked by __reqsk_free() before the object is freed.
 * NOTE(review): the remaining callbacks are invoked from code outside this
 * header; their contracts are not visible here.
 */
struct request_sock_ops {
        int                family;
        unsigned int        obj_size;
        struct kmem_cache        *slab;
        char                *slab_name;
        int                (*rtx_syn_ack)(const struct sock *sk,
                                       struct request_sock *req);
        void                (*send_ack)(const struct sock *sk, struct sk_buff *skb,
                                    struct request_sock *req);
        void                (*send_reset)(const struct sock *sk,
                                      struct sk_buff *skb);
        void                (*destructor)(struct request_sock *req);
        void                (*syn_ack_timeout)(const struct request_sock *req);
};

int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);

/*
 * struct saved_syn - three header-length fields followed by a flexible
 * byte array.  request_sock.saved_syn points at one of these and
 * __reqsk_free() releases it with kfree().
 * NOTE(review): presumably @data holds the saved headers whose lengths are
 * given by the three fields - confirm against the code that fills it.
 */
struct saved_syn {
        u32 mac_hdrlen;
        u32 network_hdrlen;
        u32 tcp_hdrlen;
        u8 data[];
};

/* struct request_sock - mini sock to represent a connection request
 *
 * The object starts with a struct sock_common, which is what allows
 * inet_reqsk()/req_to_sk() to cast between request_sock and sock pointers.
 * The rsk_* macros are shorthands for fields inside that common part.
 */
struct request_sock {
        struct sock_common                __req_common;
#define rsk_refcnt                        __req_common.skc_refcnt
#define rsk_hash                        __req_common.skc_hash
#define rsk_listener                        __req_common.skc_listener
#define rsk_window_clamp                __req_common.skc_window_clamp
#define rsk_rcv_wnd                        __req_common.skc_rcv_wnd

        /* next request in the accept FIFO (see reqsk_queue_remove()) */
        struct request_sock                *dl_next;
        u16                                mss;
        u8                                num_retrans; /* number of retransmits */
        u8                                syncookie:1; /* syncookie: encode tcpopts in timestamp */
        u8                                num_timeout:7; /* number of timeouts */
        u32                                ts_recent;
        struct timer_list                rsk_timer;
        /* ops table set by reqsk_alloc(); supplies the slab and destructor */
        const struct request_sock_ops        *rsk_ops;
        struct sock                        *sk;
        /* NULL after reqsk_alloc(); kfree()d by __reqsk_free() */
        struct saved_syn                *saved_syn;
        u32                                secid;
        u32                                peer_secid;
};

/* Reinterpret a sock pointer as a request_sock pointer (drops the const). */
static inline struct request_sock *inet_reqsk(const struct sock *sk)
{
        struct request_sock *req = (struct request_sock *)sk;

        return req;
}

/* Reinterpret a request_sock pointer as a sock pointer. */
static inline struct sock *req_to_sk(struct request_sock *req)
{
        struct sock *sk = (struct sock *)req;

        return sk;
}

/*
 * Allocate and initialise a request_sock from @ops->slab.
 *
 * The allocation uses GFP_ATOMIC | __GFP_NOWARN.  When @attach_listener is
 * true a reference on @sk_listener is taken and stored in rsk_listener;
 * otherwise rsk_listener stays NULL.  The new request inherits sk_prot from
 * @sk_listener and is returned with its refcount set to 0.
 *
 * Returns NULL if the allocation fails, or if @attach_listener is set and
 * the listener's refcount has already reached zero.
 */
static inline struct request_sock *
reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
            bool attach_listener)
{
        struct request_sock *req;
        struct sock *req_sk;

        req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
        if (!req)
                return NULL;

        req->rsk_listener = NULL;
        if (attach_listener) {
                /* no reference can be taken on a listener whose count is zero */
                if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt)))
                        goto drop;
                req->rsk_listener = sk_listener;
        }

        req->rsk_ops = ops;

        req_sk = req_to_sk(req);
        req_sk->sk_prot = sk_listener->sk_prot;
        sk_node_init(&req_sk->sk_node);
        sk_tx_queue_clear(req_sk);

        req->saved_syn = NULL;
        req->num_timeout = 0;
        req->num_retrans = 0;
        req->sk = NULL;
        refcount_set(&req->rsk_refcnt, 0);

        return req;

drop:
        kmem_cache_free(ops->slab, req);
        return NULL;
}

/*
 * Release @req without looking at its refcount: run the ops destructor,
 * drop the listener reference if one is attached, free saved_syn and hand
 * the object back to its slab.
 */
static inline void __reqsk_free(struct request_sock *req)
{
        struct sock *listener;

        req->rsk_ops->destructor(req);

        listener = req->rsk_listener;
        if (listener)
                sock_put(listener);

        kfree(req->saved_syn);
        kmem_cache_free(req->rsk_ops->slab, req);
}

/* Free @req, warning (once) if its refcount is not zero at this point. */
static inline void reqsk_free(struct request_sock *req)
{
        WARN_ON_ONCE(refcount_read(&req->rsk_refcnt));
        __reqsk_free(req);
}

/* Drop one reference on @req and free it when that was the last one. */
static inline void reqsk_put(struct request_sock *req)
{
        if (!refcount_dec_and_test(&req->rsk_refcnt))
                return;

        reqsk_free(req);
}

/*
 * For a TCP Fast Open listener -
 *        lock - protects the access to all the reqsk, which is co-owned by
 *                the listener and the child socket.
 *        qlen - pending TFO requests (still in TCP_SYN_RECV).
 *        max_qlen - max TFO reqs allowed before TFO is disabled.
 *
 *        XXX (TFO) - ideally these fields can be made as part of "listen_sock"
 *        structure above. But there is some implementation difficulty due to
 *        listen_sock being part of request_sock_queue hence will be freed when
 *        a listener is stopped. But TFO related fields may continue to be
 *        accessed even after a listener is closed, until its sk_refcnt drops
 *        to 0 implying no more outstanding TFO reqs. One solution is to keep
 *        listen_opt around until        sk_refcnt drops to 0. But there is some other
 *        complexity that needs to be resolved. E.g., a listener can be disabled
 *        temporarily through shutdown()->tcp_disconnect(), and re-enabled later.
 */
/* TCP Fast Open state embedded in struct request_sock_queue (fastopenq). */
struct fastopen_queue {
        struct request_sock        *rskq_rst_head; /* Keep track of past TFO */
        struct request_sock        *rskq_rst_tail; /* requests that caused RST.
                                                 * This is part of the defense
                                                 * against spoofing attack.
                                                 */
        /* see the "lock" entry in the comment preceding this struct */
        spinlock_t        lock;
        int                qlen;                /* # of pending (TCP_SYN_RECV) reqs */
        int                max_qlen;        /* != 0 iff TFO is currently enabled */

        struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */
};

/**
 * struct request_sock_queue - queue of request_socks
 *
 * @rskq_lock: taken (BH-disabling) by reqsk_queue_remove() around updates of
 *             the accept FIFO
 * @rskq_defer_accept: User waits for some data after accept()
 * @synflood_warned: NOTE(review): not used in this header; presumably a
 *                   "warning already issued" marker - confirm
 * @qlen: counter adjusted by reqsk_queue_added()/reqsk_queue_removed()
 * @young: counter of queued requests with num_timeout == 0
 * @rskq_accept_head: FIFO head of established children
 * @rskq_accept_tail: FIFO tail of established children
 * @fastopenq: TCP Fast Open state
 */
struct request_sock_queue {
        spinlock_t                rskq_lock;
        u8                        rskq_defer_accept;

        u32                        synflood_warned;
        atomic_t                qlen;
        atomic_t                young;

        struct request_sock        *rskq_accept_head;
        struct request_sock        *rskq_accept_tail;
        struct fastopen_queue        fastopenq;  /* Check max_qlen != 0 to determine
                                             * if TFO is enabled.
                                             */
};

void reqsk_queue_alloc(struct request_sock_queue *queue);

void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
                           bool reset);

/*
 * True when the accept FIFO of @queue has no head.  The head is sampled
 * with READ_ONCE() and rskq_lock is not taken here.
 */
static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
{
        return !READ_ONCE(queue->rskq_accept_head);
}

/*
 * Detach and return the request at the head of the accept FIFO of @queue,
 * or NULL when the FIFO is empty.  Runs under rskq_lock with bottom halves
 * disabled; on a successful removal @parent's accept backlog accounting is
 * updated through sk_acceptq_removed().
 */
static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue,
                                                      struct sock *parent)
{
        struct request_sock *head;

        spin_lock_bh(&queue->rskq_lock);

        head = queue->rskq_accept_head;
        if (head) {
                struct request_sock *next;

                sk_acceptq_removed(parent);

                next = head->dl_next;
                WRITE_ONCE(queue->rskq_accept_head, next);
                /* removed the last element: the tail must not dangle */
                if (!next)
                        queue->rskq_accept_tail = NULL;
        }

        spin_unlock_bh(&queue->rskq_lock);

        return head;
}

/*
 * Account for @req leaving @queue: qlen always drops by one, young only
 * when the request never timed out (num_timeout == 0).
 */
static inline void reqsk_queue_removed(struct request_sock_queue *queue,
                                       const struct request_sock *req)
{
        if (!req->num_timeout)
                atomic_dec(&queue->young);

        atomic_dec(&queue->qlen);
}

/* Account for one request entering @queue: bump both young and qlen. */
static inline void reqsk_queue_added(struct request_sock_queue *queue)
{
        /* a newly added request is counted as young as well as queued */
        atomic_inc(&queue->young);

        atomic_inc(&queue->qlen);
}

/* Current value of the qlen counter of @queue. */
static inline int reqsk_queue_len(const struct request_sock_queue *queue)
{
        int queued = atomic_read(&queue->qlen);

        return queued;
}

/* Current value of the young counter of @queue. */
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
{
        int young = atomic_read(&queue->young);

        return young;
}

#endif /* _REQUEST_SOCK_H */





































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for atomic bit
 * operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H

#include <linux/instrumented.h>

/**
 * set_bit - Atomically set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Reports an atomic write of the word containing bit @nr to the
 * instrumentation layer, then defers to arch_set_bit().
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static inline void set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(long));
        arch_set_bit(nr, addr);
}

/**
 * clear_bit - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * Reports an atomic write of the word containing bit @nr to the
 * instrumentation layer, then defers to arch_clear_bit().
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 */
static inline void clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(long));
        arch_clear_bit(nr, addr);
}

/**
 * change_bit - Toggle a bit in memory
 * @nr: Bit to change
 * @addr: Address to start counting from
 *
 * Reports an atomic write of the word containing bit @nr to the
 * instrumentation layer, then defers to arch_change_bit().
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static inline void change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(long));
        arch_change_bit(nr, addr);
}

/**
 * test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * Reports an atomic read-write of the word containing bit @nr to the
 * instrumentation layer, then returns the result of arch_test_and_set_bit().
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_read_write(word, sizeof(long));
        return arch_test_and_set_bit(nr, addr);
}

/**
 * test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * Reports an atomic read-write of the word containing bit @nr to the
 * instrumentation layer, then returns the result of
 * arch_test_and_clear_bit().
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_read_write(word, sizeof(long));
        return arch_test_and_clear_bit(nr, addr);
}

/**
 * test_and_change_bit - Change a bit and return its old value
 * @nr: Bit to change
 * @addr: Address to count from
 *
 * Reports an atomic read-write of the word containing bit @nr to the
 * instrumentation layer, then returns the result of
 * arch_test_and_change_bit().
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_read_write(word, sizeof(long));
        return arch_test_and_change_bit(nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */





































   14 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * Fault-injection attributes for usercopy failures.  Configured from the
 * "fail_usercopy=" boot parameter, exposed through debugfs when
 * CONFIG_FAULT_INJECTION_DEBUG_FS is set, and consulted by
 * should_fail_usercopy().
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/*
 * Handler for the "fail_usercopy=" boot parameter: passes the option string
 * to setup_fault_attr() for fail_usercopy.attr and returns its result.
 */
static int __init setup_fail_usercopy(char *str)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return setup_fault_attr(attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Late initcall: create the "fail_usercopy" debugfs attribute directory.
 * Returns 0 on success or the error encoded in the returned pointer.
 */
static int __init fail_usercopy_debugfs(void)
{
        struct dentry *dir = fault_create_debugfs_attr("fail_usercopy", NULL,
                                                       &fail_usercopy.attr);

        return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/*
 * Report whether a usercopy should be made to fail now, as decided by
 * should_fail() on fail_usercopy.attr (called with 1 as second argument).
 */
bool should_fail_usercopy(void)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return should_fail(attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_TERMIOS_H
#define _ASM_GENERIC_TERMIOS_H


#include <linux/string.h>
#include <linux/uaccess.h>
#include <uapi/asm-generic/termios.h>

/*        intr=^C                quit=^\                erase=del        kill=^U
        eof=^D                vtime=\0        vmin=\1                sxtc=\0
        start=^Q        stop=^S                susp=^Z                eol=\0
        reprint=^R        discard=^O        werase=^W        lnext=^V
        eol2=\0
*/
#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"

/*
 * Translate a "termio" structure into a "termios". Ugh.
 *
 * The user structure is fetched with a single copy_from_user() into a
 * kernel-side temporary and only then merged into @termios, so @termios is
 * left completely untouched when the user access faults.
 *
 * For each of the four flag words the bits selected by 0xffff0000 in
 * @termios are kept and the value from the termio is OR-ed in.  c_line is
 * taken over as is and NCC bytes of control characters are copied.
 *
 * Returns 0 on success, -EFAULT if @termio cannot be read.
 */
static inline int user_termio_to_kernel_termios(struct ktermios *termios,
                                                const struct termio __user *termio)
{
        struct termio v;

        if (copy_from_user(&v, termio, sizeof(v)))
                return -EFAULT;

        termios->c_iflag = (0xffff0000 & termios->c_iflag) | v.c_iflag;
        termios->c_oflag = (0xffff0000 & termios->c_oflag) | v.c_oflag;
        termios->c_cflag = (0xffff0000 & termios->c_cflag) | v.c_cflag;
        termios->c_lflag = (0xffff0000 & termios->c_lflag) | v.c_lflag;
        termios->c_line = v.c_line;
        memcpy(termios->c_cc, v.c_cc, NCC);

        return 0;
}

/*
 * Translate a "termios" structure into a "termio". Ugh.
 *
 * The fields are written to user space one at a time, in declaration order,
 * stopping at the first access that fails.  Returns 0 on success and
 * -EFAULT on the first failure.
 */
static inline int kernel_termios_to_user_termio(struct termio __user *termio,
                                                struct ktermios *termios)
{
        if (put_user(termios->c_iflag, &termio->c_iflag) < 0)
                return -EFAULT;
        if (put_user(termios->c_oflag, &termio->c_oflag) < 0)
                return -EFAULT;
        if (put_user(termios->c_cflag, &termio->c_cflag) < 0)
                return -EFAULT;
        if (put_user(termios->c_lflag, &termio->c_lflag) < 0)
                return -EFAULT;
        if (put_user(termios->c_line,  &termio->c_line) < 0)
                return -EFAULT;
        if (copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0)
                return -EFAULT;

        return 0;
}

#ifdef TCGETS2
/*
 * Copy sizeof(struct termios2) bytes from user @u into @k.  Returns the
 * copy_from_user() result converted to int: 0 on success, nonzero on
 * failure (not -EFAULT).
 */
static inline int user_termios_to_kernel_termios(struct ktermios *k,
                                                 struct termios2 __user *u)
{
        unsigned long uncopied = copy_from_user(k, u, sizeof(struct termios2));

        return uncopied;
}

/*
 * Copy sizeof(struct termios2) bytes from @k to user @u.  Returns the
 * copy_to_user() result converted to int: 0 on success, nonzero on
 * failure (not -EFAULT).
 */
static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
                                                 struct ktermios *k)
{
        unsigned long uncopied = copy_to_user(u, k, sizeof(struct termios2));

        return uncopied;
}

/*
 * Copy sizeof(struct termios) bytes from user @u into @k.  Returns the
 * copy_from_user() result converted to int: 0 on success, nonzero on
 * failure (not -EFAULT).
 */
static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
                                                   struct termios __user *u)
{
        unsigned long uncopied = copy_from_user(k, u, sizeof(struct termios));

        return uncopied;
}

/*
 * Copy sizeof(struct termios) bytes from @k to user @u.  Returns the
 * copy_to_user() result converted to int: 0 on success, nonzero on
 * failure (not -EFAULT).
 */
static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
                                                   struct ktermios *k)
{
        unsigned long uncopied = copy_to_user(u, k, sizeof(struct termios));

        return uncopied;
}
#else /* TCGETS2 */
/*
 * Variant used when TCGETS2 is not defined: copy sizeof(struct termios)
 * bytes from user @u into @k.  Returns the copy_from_user() result
 * converted to int: 0 on success, nonzero on failure (not -EFAULT).
 */
static inline int user_termios_to_kernel_termios(struct ktermios *k,
                                                 struct termios __user *u)
{
        unsigned long uncopied = copy_from_user(k, u, sizeof(struct termios));

        return uncopied;
}

/*
 * Variant used when TCGETS2 is not defined: copy sizeof(struct termios)
 * bytes from @k to user @u.  Returns the copy_to_user() result converted
 * to int: 0 on success, nonzero on failure (not -EFAULT).
 */
static inline int kernel_termios_to_user_termios(struct termios __user *u,
                                                 struct ktermios *k)
{
        unsigned long uncopied = copy_to_user(u, k, sizeof(struct termios));

        return uncopied;
}
#endif /* TCGETS2 */

#endif /* _ASM_GENERIC_TERMIOS_H */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




































































































































































































































































































































































































































































































































































































































































    1 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>

#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
#include <linux/irqflags.h>
#include <linux/seccomp.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers.h>
#include <linux/rseq.h>
#include <linux/seqlock.h>
#include <linux/kcsan.h>

/*
 * task_struct member predeclarations (sorted alphabetically).
 *
 * These are incomplete-type declarations only: they let this header hold
 * pointers to the types without including the headers that define them.
 * Keep the list in alphabetical order when adding an entry.
 */
struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->state: */
#define TASK_RUNNING                        0x0000
#define TASK_INTERRUPTIBLE                0x0001
#define TASK_UNINTERRUPTIBLE                0x0002
#define __TASK_STOPPED                        0x0004
#define __TASK_TRACED                        0x0008
/* Used in tsk->exit_state: */
#define EXIT_DEAD                        0x0010
#define EXIT_ZOMBIE                        0x0020
#define EXIT_TRACE                        (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED                        0x0040
#define TASK_DEAD                        0x0080
#define TASK_WAKEKILL                        0x0100
#define TASK_WAKING                        0x0200
#define TASK_NOLOAD                        0x0400
#define TASK_NEW                        0x0800
#define TASK_STATE_MAX                        0x1000

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE                        (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED                        (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED                        (TASK_WAKEKILL | __TASK_TRACED)

#define TASK_IDLE                        (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL                        (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
#define TASK_REPORT                        (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                         TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
                                         __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                                         TASK_PARKED)

/*
 * State predicates. @task is a pointer to a task; each macro evaluates to
 * 1 when the corresponding bit(s) are set in task->state and to 0 otherwise.
 *
 * The argument is parenthesised so that any pointer-valued expression
 * (e.g. "&t" or "cond ? a : b") can be passed, not only a plain identifier.
 */
#define task_is_traced(task)                (((task)->state & __TASK_TRACED) != 0)

#define task_is_stopped(task)                (((task)->state & __TASK_STOPPED) != 0)

#define task_is_stopped_or_traced(task)        (((task)->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
 */
#define is_special_task_state(state)                                \
        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))

/*
 * Debug variants: besides storing the new state they record _THIS_IP_ in
 * current->task_state_change and warn once when a special state is set
 * through the regular setters, or a regular state through
 * set_special_state().
 */
#define __set_current_state(state_value)                        \
        do {                                                        \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;                \
                current->state = (state_value);                        \
        } while (0)

#define set_current_state(state_value)                                \
        do {                                                        \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;                \
                smp_store_mb(current->state, (state_value));        \
        } while (0)

#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                WARN_ON_ONCE(!is_special_task_state(state_value));        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->task_state_change = _THIS_IP_;                        \
                current->state = (state_value);                                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)
#else
/*
 * set_current_state() includes a barrier so that the write of current->state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *   for (;;) {
 *        set_current_state(TASK_UNINTERRUPTIBLE);
 *        if (CONDITION)
 *           break;
 *
 *        schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *   CONDITION = 1;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->state.
 *
 * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our @cond test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 *
 * Both setters are wrapped in do { } while (0) so that they are statements
 * in every configuration, exactly like the CONFIG_DEBUG_ATOMIC_SLEEP variants
 * above; code that uses them as expressions would otherwise build only
 * without that option.
 */
#define __set_current_state(state_value)                                \
        do {                                                                \
                current->state = (state_value);                                \
        } while (0)

#define set_current_state(state_value)                                        \
        do {                                                                \
                smp_store_mb(current->state, (state_value));                \
        } while (0)

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
 * will not collide with our state change.
 *
 * The store is done with current->pi_lock held and interrupts disabled.
 */
#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->state = (state_value);                                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)

#endif

/* Length of a task's command name: */
#define TASK_COMM_LEN 16

/* Declared here; the definition lives outside this header. */
void scheduler_tick(void);

/* Largest timeout value that can be expressed: LONG_MAX. */
#define MAX_SCHEDULE_TIMEOUT LONG_MAX

/*
 * Timeout-taking scheduling helpers; each takes and returns a long.
 * NOTE(review): the suffix presumably names the task state used while
 * waiting -- confirm against the definitions.
 */
long schedule_timeout(long timeout);
long schedule_timeout_interruptible(long timeout);
long schedule_timeout_killable(long timeout);
long schedule_timeout_uninterruptible(long timeout);
long schedule_timeout_idle(long timeout);

/* Scheduler entry points: */
asmlinkage void schedule(void);
void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);

/*
 * io_schedule*() family. io_schedule_prepare() returns an int token whose
 * value must not be ignored (__must_check); io_schedule_finish() takes a
 * token argument.
 */
int __must_check io_schedule_prepare(void);
void io_schedule_finish(int token);
long io_schedule_timeout(long timeout);
void io_schedule(void);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * All three members are compiled out when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 * is set, which leaves an empty struct.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                                utime;
        u64                                stime;
        raw_spinlock_t                        lock;
#endif
};

/*
 * vtime accounting state of a task. The numeric values are spelled out
 * explicitly; they match the implicit numbering of the declaration order.
 */
enum vtime_state {
        VTIME_INACTIVE        = 0,        /* Sleeping, or running in a CPU with VTIME inactive */
        VTIME_IDLE        = 1,        /* Idle */
        VTIME_SYS        = 2,        /* In kernelspace in a CPU with VTIME active */
        VTIME_USER        = 3,        /* In userspace in a CPU with VTIME active */
        VTIME_GUEST        = 4,        /* Running as guest in a CPU with VTIME active */
};

/*
 * vtime bookkeeping record.
 *
 * @state holds one of the enum vtime_state values. The remaining members
 * are not described in this header.
 * NOTE(review): presumably @seqcount lets readers detect concurrent updates
 * of the other members, and @utime/@stime/@gtime accumulate user/system/guest
 * time since @starttime on @cpu -- confirm against the vtime accounting code.
 */
struct vtime {
        seqcount_t                seqcount;
        unsigned long long        starttime;
        enum vtime_state        state;
        unsigned int                cpu;
        u64                        utime;
        u64                        stime;
        u64                        gtime;
};

/*
 * Identifiers of the utilization clamp constraints.
 *
 * UCLAMP_CNT is not a constraint itself: it is the number of constraints
 * and is usable as an array size.
 */
enum uclamp_id {
        UCLAMP_MIN = 0,        /* Minimum utilization */
        UCLAMP_MAX = 1,        /* Maximum utilization */
        UCLAMP_CNT = 2        /* Utilization clamp constraints count */
};

#ifdef CONFIG_SMP
/* Declared for CONFIG_SMP builds only; both objects are defined outside this header. */
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif

/*
 * Run/wait bookkeeping embedded in struct task_struct as ->sched_info.
 *
 * All members exist only with CONFIG_SCHED_INFO; without it the struct is
 * empty.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */

        /* # of times we have run on this CPU: */
        unsigned long                        pcount;

        /* Time spent waiting on a runqueue: */
        unsigned long long                run_delay;

        /* Timestamps: */

        /* When did we last run on a CPU? */
        unsigned long long                last_arrival;

        /* When were we last queued to run? */
        unsigned long long                last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Fixed-point base for the scheduler's integer metrics (sched/fair has a
 * few: load, load_avg, util_avg, freq, and capacity). One basic range is
 * defined here and every such metric is formalized in terms of it.
 *
 * The SCALE values are of type long.
 */
#define SCHED_FIXEDPOINT_SHIFT                10
#define SCHED_FIXEDPOINT_SCALE                (1L << SCHED_FIXEDPOINT_SHIFT)

/* cpu_capacity calculations use the same, increased resolution: */
#define SCHED_CAPACITY_SHIFT                SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE                (1L << SCHED_CAPACITY_SHIFT)

/*
 * A load weight together with a 32-bit companion value.
 * NOTE(review): inv_weight is presumably a cached inverse of weight used to
 * avoid divisions -- confirm against the users in kernel/sched/.
 */
struct load_weight {
        unsigned long                        weight;
        u32                                inv_weight;
};

/**
 * struct util_est - Estimated utilization of FAIR tasks
 * @enqueued: instantaneous estimated utilization of a task/cpu
 * @ewma:     the Exponential Weighted Moving Average (EWMA)
 *            utilization of a task
 *
 * Support data structure to track an Exponential Weighted Moving Average
 * (EWMA) of a FAIR task's utilization. New samples are added to the moving
 * average each time a task completes an activation. The sample weight is
 * chosen so that the EWMA will be relatively insensitive to transient changes
 * to the task's workload.
 *
 * The enqueued attribute has a slightly different meaning for tasks and cpus:
 * - task:   the task's util_avg at last task dequeue time
 * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
 * Thus, the util_est.enqueued of a task represents the contribution on the
 * estimated utilization of the CPU where that task is currently enqueued.
 *
 * Only for tasks do we track a moving average of the past instantaneous
 * estimated utilization. This makes it possible to absorb sporadic drops in
 * utilization of an otherwise almost periodic task.
 *
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB of util_est.enqueued at dequeue
 * time. Since the max value of util_est.enqueued for a task is 1024 (PELT
 * util_avg for a task) it is safe to use the MSB.
 *
 * The struct is aligned to sizeof(u64).
 */
struct util_est {
        unsigned int                        enqueued;
        unsigned int                        ewma;
/* NOTE(review): presumably the shift that sets the weight of a new EWMA sample -- confirm. */
#define UTIL_EST_WEIGHT_SHIFT                2
/* MSB flag kept in ->enqueued, see the description above. */
#define UTIL_AVG_UNCHANGED                0x80000000
} __attribute__((__aligned__(sizeof(u64))));

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
/*
 * Load-tracking state. load_avg, runnable_avg and util_avg are the signals
 * defined in the comment above; load_sum, runnable_sum and util_sum are the
 * correspondingly named sums.
 * NOTE(review): last_update_time and period_contrib are presumably PELT
 * bookkeeping -- confirm in kernel/sched/pelt.c.
 *
 * The struct is cacheline aligned.
 */
struct sched_avg {
        u64                                last_update_time;
        u64                                load_sum;
        u64                                runnable_sum;
        u32                                util_sum;
        u32                                period_contrib;
        unsigned long                        load_avg;
        unsigned long                        runnable_avg;
        unsigned long                        util_avg;
        struct util_est                        util_est;
} ____cacheline_aligned;

/*
 * Scheduler statistics embedded in struct sched_entity as ->statistics.
 *
 * Every member exists only with CONFIG_SCHEDSTATS; without it the struct is
 * empty. The members are grouped below into wait, sleep/block, migration and
 * wakeup counters.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
        /* Wait and iowait accounting: */
        u64                                wait_start;
        u64                                wait_max;
        u64                                wait_count;
        u64                                wait_sum;
        u64                                iowait_count;
        u64                                iowait_sum;

        /* Sleep accounting (sum_sleep_runtime is signed): */
        u64                                sleep_start;
        u64                                sleep_max;
        s64                                sum_sleep_runtime;

        /* Block, execution and slice maxima: */
        u64                                block_start;
        u64                                block_max;
        u64                                exec_max;
        u64                                slice_max;

        /* Migration counters: */
        u64                                nr_migrations_cold;
        u64                                nr_failed_migrations_affine;
        u64                                nr_failed_migrations_running;
        u64                                nr_failed_migrations_hot;
        u64                                nr_forced_migrations;

        /* Wakeup counters: */
        u64                                nr_wakeups;
        u64                                nr_wakeups_sync;
        u64                                nr_wakeups_migrate;
        u64                                nr_wakeups_local;
        u64                                nr_wakeups_remote;
        u64                                nr_wakeups_affine;
        u64                                nr_wakeups_affine_attempts;
        u64                                nr_wakeups_passive;
        u64                                nr_wakeups_idle;
#endif
};

/*
 * Scheduling entity; struct task_struct embeds one as ->se.
 *
 * The group-scheduling links (->depth, ->parent, ->cfs_rq, ->my_q,
 * ->runnable_weight) exist only with CONFIG_FAIR_GROUP_SCHED, and the
 * load-tracking state (->avg) only with CONFIG_SMP.
 */
struct sched_entity {
        /* For load-balancing: */
        struct load_weight                load;
        struct rb_node                        run_node;
        struct list_head                group_node;
        unsigned int                        on_rq;

        /* Runtime accounting: */
        u64                                exec_start;
        u64                                sum_exec_runtime;
        u64                                vruntime;
        u64                                prev_sum_exec_runtime;

        u64                                nr_migrations;

        /* Empty unless CONFIG_SCHEDSTATS is set: */
        struct sched_statistics                statistics;

#ifdef CONFIG_FAIR_GROUP_SCHED
        int                                depth;
        struct sched_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct cfs_rq                        *cfs_rq;
        /* rq "owned" by this entity/group: */
        struct cfs_rq                        *my_q;
        /* cached value of my_q->h_nr_running */
        unsigned long                        runnable_weight;
#endif

#ifdef CONFIG_SMP
        /*
         * Per entity load average tracking.
         *
         * Put into separate cache line so it does not
         * collide with read-mostly values above.
         */
        struct sched_avg                avg;
#endif
};

/*
 * Scheduling entity embedded in struct task_struct as ->rt.
 *
 * The group links (->parent, ->rt_rq, ->my_q) exist only with
 * CONFIG_RT_GROUP_SCHED. The struct is marked __randomize_layout, so code
 * must not depend on the member order written here.
 */
struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                        timeout;
        unsigned long                        watchdog_stamp;
        unsigned int                        time_slice;
        unsigned short                        on_rq;
        unsigned short                        on_list;

        struct sched_rt_entity                *back;
#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                        *rt_rq;
        /* rq "owned" by this entity/group: */
        struct rt_rq                        *my_q;
#endif
} __randomize_layout;

/*
 * Scheduling entity for -deadline tasks; struct task_struct embeds one as
 * ->dl.
 *
 * It holds the parameters copied from sched_attr, the values that are
 * updated while the task runs, four one-bit status flags and two hrtimers.
 * ->pi_se exists only with CONFIG_RT_MUTEXES.
 */
struct sched_dl_entity {
        struct rb_node                        rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setattr(), they will remain the same until
         * the next sched_setattr().
         */
        u64                                dl_runtime;        /* Maximum runtime for each instance        */
        u64                                dl_deadline;        /* Relative deadline of each instance        */
        u64                                dl_period;        /* Separation of two instances (period) */
        u64                                dl_bw;                /* dl_runtime / dl_period                */
        u64                                dl_density;        /* dl_runtime / dl_deadline                */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continuously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64                                runtime;        /* Remaining runtime for this instance        */
        u64                                deadline;        /* Absolute deadline for this instance        */
        unsigned int                        flags;                /* Specifying the scheduler behaviour        */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_yielded tells if task gave up the CPU before consuming
         * all its available runtime during the last job.
         *
         * @dl_non_contending tells if the task is inactive while still
         * contributing to the active utilization. In other words, it
         * indicates if the inactive timer has been armed and its handler
         * has not been executed yet. This flag is useful to avoid race
         * conditions between the inactive timer handler and the wakeup
         * code.
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
         */
        unsigned int                        dl_throttled      : 1;
        unsigned int                        dl_yielded        : 1;
        unsigned int                        dl_non_contending : 1;
        unsigned int                        dl_overrun          : 1;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                        dl_timer;

        /*
         * Inactive timer, responsible for decreasing the active utilization
         * at the "0-lag time". When a -deadline task blocks, it contributes
         * to GRUB's active utilization until the "0-lag time", hence a
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
        struct hrtimer inactive_timer;

#ifdef CONFIG_RT_MUTEXES
        /*
         * Priority Inheritance. When a DEADLINE scheduling entity is boosted
         * pi_se points to the donor, otherwise points to the dl_se it belongs
         * to (the original one/itself).
         */
        struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:                clamp value "assigned" to a se
 * @bucket_id:                bucket index corresponding to the "assigned" value
 * @active:                the se is currently refcounted in a rq's bucket
 * @user_defined:        the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This makes it possible to know a task is refcounted in the rq's bucket
 * corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This makes it possible to relax default clamps when
 * a less restrictive task-specific value has been requested, thus allowing to
 * implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * All four members are bit-fields; the widths of @value and @bucket_id are
 * computed with bits_per() from SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS.
 * The whole definition exists only with CONFIG_UCLAMP_TASK.
 */
struct uclamp_se {
        unsigned int value                : bits_per(SCHED_CAPACITY_SCALE);
        unsigned int bucket_id                : bits_per(UCLAMP_BUCKETS);
        unsigned int active                : 1;
        unsigned int user_defined        : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * Four one-byte flags (->b) overlaid with a single 32-bit word (->s), so the
 * flags can be accessed individually or all together.
 */
union rcu_special {
        struct {
                u8                        blocked;
                u8                        need_qs;
                u8                        exp_hint; /* Hint for performance. */
                u8                        need_mb; /* Readers need smp_mb(). */
        } b; /* Bits. */
        u32 s; /* Set of bits. */
};

/*
 * Per-task perf context identifiers. The values are spelled out explicitly
 * and match the implicit numbering of the declaration order;
 * perf_nr_task_contexts is the number of valid (non-negative) contexts.
 */
enum perf_event_task_context {
        perf_invalid_context        = -1,
        perf_hw_context                =  0,
        perf_sw_context                =  1,
        perf_nr_task_contexts        =  2,
};

/* Singly linked list link: holds only a pointer to the next node. */
struct wake_q_node {
        struct wake_q_node *next;
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info                thread_info;
#endif
        /* -1 unrunnable, 0 runnable, >0 stopped: */
        volatile long                        state;

        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
         */
        randomized_struct_fields_start

        void                                *stack;
        refcount_t                        usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                        flags;
        unsigned int                        ptrace;

#ifdef CONFIG_SMP
        int                                on_cpu;
        struct __call_single_node        wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
        unsigned int                        cpu;
#endif
        unsigned int                        wakee_flips;
        unsigned long                        wakee_flip_decay_ts;
        struct task_struct                *last_wakee;

        /*
         * recent_used_cpu is initially set as the last CPU used by a task
         * that wakes affine another task. Waker/wakee relationships can
         * push tasks around a CPU where each wakeup moves to the next one.
         * Tracking a recently used CPU allows a quick search for a recently
         * used CPU that may be idle.
         */
        int                                recent_used_cpu;
        int                                wake_cpu;
#endif
        int                                on_rq;

        int                                prio;
        int                                static_prio;
        int                                normal_prio;
        unsigned int                        rt_priority;

        const struct sched_class        *sched_class;
        struct sched_entity                se;
        struct sched_rt_entity                rt;
#ifdef CONFIG_CGROUP_SCHED
        struct task_group                *sched_task_group;
#endif
        struct sched_dl_entity                dl;

#ifdef CONFIG_UCLAMP_TASK
        /*
         * Clamp values requested for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
        /*
         * Effective clamp values used for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp[UCLAMP_CNT];
#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
        /* List of struct preempt_notifier: */
        struct hlist_head                preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int                        btrace_seq;
#endif

        unsigned int                        policy;
        int                                nr_cpus_allowed;
        const cpumask_t                        *cpus_ptr;
        cpumask_t                        cpus_mask;

#ifdef CONFIG_PREEMPT_RCU
        int                                rcu_read_lock_nesting;
        union rcu_special                rcu_read_unlock_special;
        struct list_head                rcu_node_entry;
        struct rcu_node                        *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
        unsigned long                        rcu_tasks_nvcsw;
        u8                                rcu_tasks_holdout;
        u8                                rcu_tasks_idx;
        int                                rcu_tasks_idle_cpu;
        struct list_head                rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
        int                                trc_reader_nesting;
        int                                trc_ipi_to_cpu;
        union rcu_special                trc_reader_special;
        bool                                trc_reader_checked;
        struct list_head                trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

        struct sched_info                sched_info;

        struct list_head                tasks;
#ifdef CONFIG_SMP
        struct plist_node                pushable_tasks;
        struct rb_node                        pushable_dl_tasks;
#endif

        struct mm_struct                *mm;
        struct mm_struct                *active_mm;

        /* Per-thread vma caching: */
        struct vmacache                        vmacache;

#ifdef SPLIT_RSS_COUNTING
        struct task_rss_stat                rss_stat;
#endif
        int                                exit_state;
        int                                exit_code;
        int                                exit_signal;
        /* The signal sent when the parent dies: */
        int                                pdeath_signal;
        /* JOBCTL_*, siglock protected: */
        unsigned long                        jobctl;

        /* Used for emulating ABI behavior of previous Linux versions: */
        unsigned int                        personality;

        /* Scheduler bits, serialized by scheduler locks: */
        unsigned                        sched_reset_on_fork:1;
        unsigned                        sched_contributes_to_load:1;
        unsigned                        sched_migrated:1;
#ifdef CONFIG_PSI
        unsigned                        sched_psi_wake_requeue:1;
#endif

        /* Force alignment to the next boundary: */
        unsigned                        :0;

        /* Unserialized, strictly 'current' */

        /*
         * This field must not be in the scheduler word above due to wakelist
         * queueing no longer being serialized by p->on_cpu. However:
         *
         * p->XXX = X;                        ttwu()
         * schedule()                          if (p->on_rq && ..) // false
         *   smp_mb__after_spinlock();          if (smp_load_acquire(&p->on_cpu) && //true
         *   deactivate_task()                      ttwu_queue_wakelist())
         *     p->on_rq = 0;                        p->sched_remote_wakeup = Y;
         *
         * guarantees all stores of 'current' are visible before
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;

        /* Save user-dumpable when mm goes away */
        unsigned                        user_dumpable:1;

        /* Bit to tell LSMs we're in execve(): */
        unsigned                        in_execve:1;
        unsigned                        in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
        unsigned                        restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
        unsigned                        in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
        /* task is frozen/stopped (used by the cgroup freezer) */
        unsigned                        frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
        unsigned                        use_memdelay:1;
#endif
#ifdef CONFIG_PSI
        /* Stalled due to lack of memory */
        unsigned                        in_memstall:1;
#endif

        unsigned long                        atomic_flags; /* Flags requiring atomic access. */

        struct restart_block                restart_block;

        pid_t                                pid;
        pid_t                                tgid;

#ifdef CONFIG_STACKPROTECTOR
        /* Canary value for the -fstack-protector GCC feature: */
        unsigned long                        stack_canary;
#endif
        /*
         * Pointers to the (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->real_parent->pid)
         */

        /* Real parent process: */
        struct task_struct __rcu        *real_parent;

        /* Recipient of SIGCHLD, wait4() reports: */
        struct task_struct __rcu        *parent;

        /*
         * Children/sibling form the list of natural children:
         */
        struct list_head                children;
        struct list_head                sibling;
        struct task_struct                *group_leader;

        /*
         * 'ptraced' is the list of tasks this task is using ptrace() on.
         *
         * This includes both natural children and PTRACE_ATTACH targets.
         * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
         */
        struct list_head                ptraced;
        struct list_head                ptrace_entry;

        /* PID/PID hash table linkage. */
        struct pid                        *thread_pid;
        struct hlist_node                pid_links[PIDTYPE_MAX];
        struct list_head                thread_group;
        struct list_head                thread_node;

        struct completion                *vfork_done;

        /* CLONE_CHILD_SETTID: */
        int __user                        *set_child_tid;

        /* CLONE_CHILD_CLEARTID: */
        int __user                        *clear_child_tid;

        /* PF_IO_WORKER */
        void                                *pf_io_worker;

        u64                                utime;
        u64                                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        u64                                utimescaled;
        u64                                stimescaled;
#endif
        u64                                gtime;
        struct prev_cputime                prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        struct vtime                        vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
        atomic_t                        tick_dep_mask;
#endif
        /* Context switch counts: */
        unsigned long                        nvcsw;
        unsigned long                        nivcsw;

        /* Monotonic time in nsecs: */
        u64                                start_time;

        /* Boot based time in nsecs: */
        u64                                start_boottime;

        /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
        unsigned long                        min_flt;
        unsigned long                        maj_flt;

        /* Empty if CONFIG_POSIX_CPUTIMERS=n */
        struct posix_cputimers                posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
        struct posix_cputimers_work        posix_cputimers_work;
#endif

        /* Process credentials: */

        /* Tracer's credentials at attach: */
        const struct cred __rcu                *ptracer_cred;

        /* Objective and real subjective task credentials (COW): */
        const struct cred __rcu                *real_cred;

        /* Effective (overridable) subjective task credentials (COW): */
        const struct cred __rcu                *cred;

#ifdef CONFIG_KEYS
        /* Cached requested key. */
        struct key                        *cached_requested_key;
#endif

        /*
         * executable name, excluding path.
         *
         * - normally initialized setup_new_exec()
         * - access it with [gs]et_task_comm()
         * - lock it with task_lock()
         */
        char                                comm[TASK_COMM_LEN];

        struct nameidata                *nameidata;

#ifdef CONFIG_SYSVIPC
        struct sysv_sem                        sysvsem;
        struct sysv_shm                        sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
        unsigned long                        last_switch_count;
        unsigned long                        last_switch_time;
#endif
        /* Filesystem information: */
        struct fs_struct                *fs;

        /* Open file information: */
        struct files_struct                *files;

#ifdef CONFIG_IO_URING
        struct io_uring_task                *io_uring;
#endif

        /* Namespaces: */
        struct nsproxy                        *nsproxy;

        /* Signal handlers: */
        struct signal_struct                *signal;
        struct sighand_struct __rcu                *sighand;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
        sigset_t                        saved_sigmask;
        struct sigpending                pending;
        unsigned long                        sas_ss_sp;
        size_t                                sas_ss_size;
        unsigned int                        sas_ss_flags;

        struct callback_head                *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
        struct audit_context                *audit_context;
#endif
        kuid_t                                loginuid;
        unsigned int                        sessionid;
#endif
        struct seccomp                        seccomp;

        /* Thread group tracking: */
        u64                                parent_exec_id;
        u64                                self_exec_id;

        /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
        spinlock_t                        alloc_lock;

        /* Protection of the PI data structures: */
        raw_spinlock_t                        pi_lock;

        struct wake_q_node                wake_q;

#ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root_cached                pi_waiters;
        /* Updated under owner's pi_lock and rq lock */
        struct task_struct                *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter                *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        /* Mutex deadlock detection: */
        struct mutex_waiter                *blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        int                                non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                irqtrace;
        unsigned int                        hardirq_threaded;
        u64                                hardirq_chain_key;
        int                                softirqs_enabled;
        int                                softirq_context;
        int                                irq_config;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH                        48UL
        u64                                curr_chain_key;
        int                                lockdep_depth;
        unsigned int                        lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
        unsigned int                        in_ubsan;
#endif

        /* Journalling filesystem info: */
        void                                *journal_info;

        /* Stacked block device info: */
        struct bio_list                        *bio_list;

#ifdef CONFIG_BLOCK
        /* Stack plugging: */
        struct blk_plug                        *plug;
#endif

        /* VM state: */
        struct reclaim_state                *reclaim_state;

        struct backing_dev_info                *backing_dev_info;

        struct io_context                *io_context;

#ifdef CONFIG_COMPACTION
        struct capture_control                *capture_control;
#endif
        /* Ptrace state: */
        unsigned long                        ptrace_message;
        kernel_siginfo_t                *last_siginfo;

        struct task_io_accounting        ioac;
#ifdef CONFIG_PSI
        /* Pressure stall state */
        unsigned int                        psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
        /* Accumulated RSS usage: */
        u64                                acct_rss_mem1;
        /* Accumulated virtual memory usage: */
        u64                                acct_vm_mem1;
        /* stime + utime since last update: */
        u64                                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
        /* Protected by ->alloc_lock: */
        nodemask_t                        mems_allowed;
        /* Sequence number to catch updates: */
        seqcount_spinlock_t                mems_allowed_seq;
        int                                cpuset_mem_spread_rotor;
        int                                cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
        struct css_set __rcu                *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
        u32                                closid;
        u32                                rmid;
#endif
#ifdef CONFIG_FUTEX
        struct robust_list_head __user        *robust_list;
#ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
#endif
        struct list_head                pi_state_list;
        struct futex_pi_state                *pi_state_cache;
        struct mutex                        futex_exit_mutex;
        unsigned int                        futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
        struct perf_event_context        *perf_event_ctxp[perf_nr_task_contexts];
        struct mutex                        perf_event_mutex;
        struct list_head                perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
        unsigned long                        preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
        /* Protected by alloc_lock: */
        struct mempolicy                *mempolicy;
        short                                il_prev;
        short                                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
        int                                numa_scan_seq;
        unsigned int                        numa_scan_period;
        unsigned int                        numa_scan_period_max;
        int                                numa_preferred_nid;
        unsigned long                        numa_migrate_retry;
        /* Migration stamp: */
        u64                                node_stamp;
        u64                                last_task_numa_placement;
        u64                                last_sum_exec_runtime;
        struct callback_head                numa_work;

        /*
         * This pointer is only modified for current in syscall and
         * pagefault context (and for tasks being destroyed), so it can be read
         * from any of the following contexts:
         *  - RCU read-side critical section
         *  - current->numa_group from everywhere
         *  - task's runqueue locked, task not running
         */
        struct numa_group __rcu                *numa_group;

        /*
         * numa_faults is an array split into four regions:
         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
         * in this precise order.
         *
         * faults_memory: Exponential decaying average of faults on a per-node
         * basis. Scheduling placement decisions are made based on these
         * counts. The values remain static for the duration of a PTE scan.
         * faults_cpu: Track the nodes the process was running on when a NUMA
         * hinting fault was incurred.
         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
         * during the current scan window. When the scan completes, the counts
         * in faults_memory and faults_cpu decay and these values are copied.
         */
        unsigned long                        *numa_faults;
        unsigned long                        total_numa_faults;

        /*
         * numa_faults_locality tracks if faults recorded during the last
         * scan window were remote/local or failed to migrate. The task scan
         * period is adapted based on the locality of the faults with different
         * weights depending on whether they were shared or private faults
         */
        unsigned long                        numa_faults_locality[3];

        unsigned long                        numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_sig;
        /*
         * RmW on rseq_event_mask must be performed atomically
         * with respect to preemption.
         */
        unsigned long rseq_event_mask;
#endif

        struct tlbflush_unmap_batch        tlb_ubc;

        union {
                refcount_t                rcu_users;
                struct rcu_head                rcu;
        };

        /* Cache last used pipe for splice(): */
        struct pipe_inode_info                *splice_pipe;

        struct page_frag                task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
        struct task_delay_info                *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
        int                                make_it_fail;
        unsigned int                        fail_nth;
#endif
        /*
         * When (nr_dirtied >= nr_dirtied_pause), it's time to call
         * balance_dirty_pages() for a dirty throttling pause:
         */
        int                                nr_dirtied;
        int                                nr_dirtied_pause;
        /* Start of a write-and-pause period: */
        unsigned long                        dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
        int                                latency_record_count;
        struct latency_record                latency_record[LT_SAVECOUNT];
#endif
        /*
         * Time slack values; these are used to round up poll() and
         * select() etc timeout values. These are in nanoseconds.
         */
        u64                                timer_slack_ns;
        u64                                default_timer_slack_ns;

#ifdef CONFIG_KASAN
        unsigned int                        kasan_depth;
#endif

#ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                kcsan_save_irqtrace;
#endif
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        struct kunit                        *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack: */
        int                                curr_ret_stack;
        int                                curr_ret_depth;

        /* Stack of return addresses for return function tracing: */
        struct ftrace_ret_stack                *ret_stack;

        /* Timestamp for last schedule: */
        unsigned long long                ftrace_timestamp;

        /*
         * Number of functions that haven't been traced
         * because of depth overrun:
         */
        atomic_t                        trace_overrun;

        /* Pause tracing: */
        atomic_t                        tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
        /* State flags for use by tracers: */
        unsigned long                        trace;

        /* Bitmask and counter of trace recursion: */
        unsigned long                        trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
        /* See kernel/kcov.c for more details. */

        /* Coverage collection mode enabled for this task (0 if disabled): */
        unsigned int                        kcov_mode;

        /* Size of the kcov_area: */
        unsigned int                        kcov_size;

        /* Buffer for coverage collection: */
        void                                *kcov_area;

        /* KCOV descriptor wired with this task or NULL: */
        struct kcov                        *kcov;

        /* KCOV common handle for remote coverage collection: */
        u64                                kcov_handle;

        /* KCOV sequence number: */
        int                                kcov_sequence;

        /* Collect coverage from softirq context: */
        unsigned int                        kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
        struct mem_cgroup                *memcg_in_oom;
        gfp_t                                memcg_oom_gfp_mask;
        int                                memcg_oom_order;

        /* Number of pages to reclaim on returning to userland: */
        unsigned int                        memcg_nr_pages_over_high;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup                *active_memcg;
#endif

#ifdef CONFIG_BLK_CGROUP
        struct request_queue                *throttle_queue;
#endif

#ifdef CONFIG_UPROBES
        struct uprobe_task                *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
        unsigned int                        sequential_io;
        unsigned int                        sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                        task_state_change;
#endif
        int                                pagefault_disabled;
#ifdef CONFIG_MMU
        struct task_struct                *oom_reaper_list;
        struct timer_list                oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        refcount_t                        stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
#endif
#ifdef CONFIG_SECURITY
        /* Used by LSM modules for access restriction: */
        void                                *security;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
        unsigned long                        lowest_stack;
        unsigned long                        prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
        void __user                        *mce_vaddr;
        __u64                                mce_kflags;
        u64                                mce_addr;
        __u64                                mce_ripv : 1,
                                        mce_whole_page : 1,
                                        __mce_reserved : 62;
        struct callback_head                mce_kill_me;
        int                                mce_count;
#endif

        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
         */
        randomized_struct_fields_end

        /* CPU-specific state of this task: */
        struct thread_struct                thread;

        /*
         * WARNING: on x86, 'thread_struct' contains a variable-sized
         * structure.  It *MUST* be at the end of 'task_struct'.
         *
         * Do not put anything below here!
         */
};

/* Return the struct pid pointer stored in @task->thread_pid. */
static inline struct pid *task_pid(struct task_struct *task)
{
        struct pid *pid = task->thread_pid;

        return pid;
}

/*
 * The helpers to get the task's different pids as they are seen
 * from various namespaces:
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * "xid" is a placeholder for the id kind in the helper names that follow
 * (pid, tgid, pgrp, session, ...).
 *
 * see also pid_nr() etc in include/linux/pid.h
 */

/*
 * Common lookup used by the task_*_nr_ns() and task_*_vnr() inline helpers
 * in this header; only declared here, the body lives elsewhere.  The
 * *_vnr() helpers call it with a NULL @ns.
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Global pid number of @tsk, read straight from tsk->pid. */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        pid_t nr = tsk->pid;

        return nr;
}

/* PIDTYPE_PID id of @tsk as seen from the pid namespace @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t nr;

        nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
        return nr;
}

/*
 * Virtual PIDTYPE_PID id of @tsk; the lookup is performed with a NULL
 * namespace argument.
 */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        pid_t vnr;

        vnr = __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
        return vnr;
}


/* Global thread-group id of @tsk, read straight from tsk->tgid. */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        pid_t tgid = tsk->tgid;

        return tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * A task counts as alive here while it is not yet dead (at most in zombie
 * state); the check itself is whether @p->thread_pid is non-NULL.  Once
 * this test fails, pointers inside the task structure may be stale and
 * must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        if (!p->thread_pid)
                return 0;

        return 1;
}

/* PIDTYPE_PGID id of @tsk as seen from the pid namespace @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t pgrp;

        pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
        return pgrp;
}

/*
 * Virtual PIDTYPE_PGID id of @tsk; the lookup is performed with a NULL
 * namespace argument.
 */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        pid_t pgrp;

        pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
        return pgrp;
}


/* PIDTYPE_SID id of @tsk as seen from the pid namespace @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t sid;

        sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
        return sid;
}

/*
 * Virtual PIDTYPE_SID id of @tsk; the lookup is performed with a NULL
 * namespace argument.
 */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        pid_t sid;

        sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
        return sid;
}

/* PIDTYPE_TGID id of @tsk as seen from the pid namespace @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t tgid;

        tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
        return tgid;
}

/*
 * Virtual PIDTYPE_TGID id of @tsk; the lookup is performed with a NULL
 * namespace argument.
 */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        pid_t tgid;

        tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
        return tgid;
}

/*
 * Thread-group id of @tsk's real parent as seen from @ns.
 *
 * ->real_parent is only dereferenced, under rcu_read_lock(), while
 * pid_alive(@tsk) holds; otherwise 0 is returned.
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* Parent thread-group id of @tsk, looked up against init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        pid_t ppid = task_ppid_nr_ns(tsk, &init_pid_ns);

        return ppid;
}

/*
 * Obsolete, do not use.
 *
 * Shorthand for task_pgrp_nr_ns() against init_pid_ns.
 */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        pid_t pgrp = task_pgrp_nr_ns(tsk, &init_pid_ns);

        return pgrp;
}

/*
 * TASK_REPORT_IDLE is the extra value task_state_index() substitutes when a
 * task's state equals TASK_IDLE.  TASK_REPORT_MAX is the next shift above
 * it; task_state_index() build-asserts that it is a power of two and
 * task_index_to_char() sizes its letter table from it.
 */
#define TASK_REPORT_IDLE        (TASK_REPORT + 1)
#define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)

/*
 * Compress the state of @tsk into a small index.
 *
 * tsk->state is sampled once with READ_ONCE(), OR-ed with tsk->exit_state
 * and masked with TASK_REPORT.  A task whose sampled state is exactly
 * TASK_IDLE is reported as TASK_REPORT_IDLE instead.  The result is fls()
 * of that value.
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
        const unsigned int snapshot = READ_ONCE(tsk->state);
        const unsigned int reported = (snapshot | tsk->exit_state) & TASK_REPORT;

        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

        return fls(snapshot == TASK_IDLE ? TASK_REPORT_IDLE : reported);
}

/*
 * Map a state index to its one-letter code.
 *
 * The build-time check ties the number of letters in the table to
 * 1 + ilog2(TASK_REPORT_MAX).  @state is used directly as the table index;
 * no range check is done here.
 */
static inline char task_index_to_char(unsigned int state)
{
        static const char letters[] = "RSDTtXZPI";

        BUILD_BUG_ON(sizeof(letters) - 1 != 1 + ilog2(TASK_REPORT_MAX));

        return letters[state];
}

/* One-letter state code for @tsk: task_state_index() fed to task_index_to_char(). */
static inline char task_state_to_char(struct task_struct *tsk)
{
        unsigned int idx = task_state_index(tsk);

        return task_index_to_char(idx);
}

/**
 * is_global_init - check if a task structure is init.
 * @tsk: Task structure to be checked.
 *
 * Tells whether @tsk is the first user space task the kernel created.
 * The thread-group id, not the pid, is compared against 1 because init
 * is free to have sub-threads.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        if (task_tgid_nr(tsk) != 1)
                return 0;

        return 1;
}

extern struct pid *cad_pid;

/*
 * Per process flags
 *
 * Bit masks tested against and stored in a task's ->flags word (see e.g.
 * tsk_used_math() and is_percpu_thread() in this header).  The values
 * 0x00000008, 0x00004000, 0x01000000, 0x02000000 and 0x20000000 are not
 * assigned in this list.
 */
#define PF_VCPU                        0x00000001        /* I'm a virtual CPU */
#define PF_IDLE                        0x00000002        /* I am an IDLE thread */
#define PF_EXITING                0x00000004        /* Getting shut down */
#define PF_IO_WORKER                0x00000010        /* Task is an IO worker */
#define PF_WQ_WORKER                0x00000020        /* I'm a workqueue worker */
#define PF_FORKNOEXEC                0x00000040        /* Forked but didn't exec */
#define PF_MCE_PROCESS                0x00000080      /* Process policy on mce errors */
#define PF_SUPERPRIV                0x00000100        /* Used super-user privileges */
#define PF_DUMPCORE                0x00000200        /* Dumped core */
#define PF_SIGNALED                0x00000400        /* Killed by a signal */
#define PF_MEMALLOC                0x00000800        /* Allocating memory */
#define PF_NPROC_EXCEEDED        0x00001000        /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH                0x00002000        /* If unset the fpu must be initialized before use */
#define PF_NOFREEZE                0x00008000        /* This thread should not be frozen */
#define PF_FROZEN                0x00010000        /* Frozen for system suspend */
#define PF_KSWAPD                0x00020000        /* I am kswapd */
#define PF_MEMALLOC_NOFS        0x00040000        /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO        0x00080000        /* All allocation requests will inherit GFP_NOIO */
#define PF_LOCAL_THROTTLE        0x00100000        /* Throttle writes only against the bdi I write to,
                                                 * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD                0x00200000        /* I am a kernel thread */
#define PF_RANDOMIZE                0x00400000        /* Randomize virtual address space */
#define PF_SWAPWRITE                0x00800000        /* Allowed to write to swap */
#define PF_NO_SETAFFINITY        0x04000000        /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY                0x08000000      /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA        0x10000000        /* All allocation requests will have _GFP_MOVABLE cleared */
#define PF_FREEZER_SKIP                0x40000000        /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */

/*
 * As a rule tsk->flags is read and written only by the task it belongs
 * to, although other tasks may still read it (tsk_used_math() during a
 * threaded core dump is one such reader).  ptrace and fork are the
 * exceptions for writers: the tracer, or the forking parent, may update
 * child->flags because the child is guaranteed not to be running and so
 * cannot be changing its own flags at the same time.
 *
 * Note that every *_stopped_child_* macro that both clears and sets the
 * bit evaluates its @child argument twice.
 */
#define clear_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
        } while (0)

#define set_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags |= PF_USED_MATH;                                \
        } while (0)

#define clear_used_math()                clear_stopped_child_used_math(current)
#define set_used_math()                        set_stopped_child_used_math(current)

/* Clear PF_USED_MATH on @child, then set it again iff @condition is true. */
#define conditional_stopped_child_used_math(condition, child)                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
                (child)->flags |= (condition) ? PF_USED_MATH : 0;        \
        } while (0)

#define conditional_used_math(condition)                                \
        conditional_stopped_child_used_math(condition, current)

/* Make @child's PF_USED_MATH bit equal to that of current. */
#define copy_to_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
                (child)->flags |= current->flags & PF_USED_MATH;        \
        } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)                ((p)->flags & PF_USED_MATH)
#define used_math()                        tsk_used_math(current)

/*
 * Report whether current counts as a per-CPU thread.
 *
 * With CONFIG_SMP this requires both PF_NO_SETAFFINITY set in
 * current->flags and current->nr_cpus_allowed equal to 1; without
 * CONFIG_SMP the answer is always true.
 */
static __always_inline bool is_percpu_thread(void)
{
#ifndef CONFIG_SMP
        return true;
#else
        if (!(current->flags & PF_NO_SETAFFINITY))
                return false;

        return current->nr_cpus_allowed == 1;
#endif
}

/*
 * Per-process atomic flags.
 *
 * These are bit numbers (not masks) within a task's ->atomic_flags word;
 * the TASK_PFA_TEST/SET/CLEAR generators in this header pass them to
 * test_bit(), set_bit() and clear_bit().
 */
#define PFA_NO_NEW_PRIVS                0        /* May not gain new privileges. */
#define PFA_SPREAD_PAGE                        1        /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB                        2        /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE                3        /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE        4        /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE                5        /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE        6        /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC                7        /* Speculative Store Bypass clear on execve() */

/*
 * Generators for the per-flag accessors.  Each expands to one static inline
 * that takes a task pointer and operates on bit PFA_<name> of
 * p->atomic_flags:
 *
 *   TASK_PFA_TEST(name, func)  -> bool task_<func>(p)        uses test_bit()
 *   TASK_PFA_SET(name, func)   -> void task_set_<func>(p)    uses set_bit()
 *   TASK_PFA_CLEAR(name, func) -> void task_clear_<func>(p)  uses clear_bit()
 */
#define TASK_PFA_TEST(name, func)                                        \
        static inline bool task_##func(struct task_struct *p)                \
        { return test_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_SET(name, func)                                        \
        static inline void task_set_##func(struct task_struct *p)        \
        { set_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_CLEAR(name, func)                                        \
        static inline void task_clear_##func(struct task_struct *p)        \
        { clear_bit(PFA_##name, &p->atomic_flags); }

/* NO_NEW_PRIVS: test and set helpers only; no clear helper is generated. */
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

/* SPREAD_PAGE: test, set and clear helpers. */
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

/* SPREAD_SLAB: test, set and clear helpers. */
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

/* SPEC_SSB_DISABLE: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

/* SPEC_SSB_NOEXEC: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

/* SPEC_SSB_FORCE_DISABLE: test and set only; no clear helper is generated. */
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

/* SPEC_IB_DISABLE: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

/* SPEC_IB_FORCE_DISABLE: test and set only; no clear helper is generated. */
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

/*
 * Put the bits selected by @flags in current->flags back to the values they
 * have in @orig_flags.  Bits of current->flags outside @flags are not
 * changed.
 */
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
        const unsigned long restored = orig_flags & flags;

        current->flags &= ~flags;
        current->flags |= restored;
}

extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
#else
/* !CONFIG_SMP: nothing to update, so this is an empty stub. */
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}

/*
 * !CONFIG_SMP: nothing is stored; the mask is only validated.
 * Returns 0 when @new_mask contains CPU 0, -EINVAL otherwise.
 */
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        return cpumask_test_cpu(0, new_mask) ? 0 : -EINVAL;
}
#endif

extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * The value is derived from @p->static_prio via PRIO_TO_NICE().
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
        const int prio = p->static_prio;

        return PRIO_TO_NICE(prio);
}

extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - is the specified task an idle task?
 * @p: the task in question.
 *
 * Tests the PF_IDLE bit of @p->flags.
 *
 * Return: true if @p is an idle task, false otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
        return (p->flags & PF_IDLE) != 0;
}

extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

/*
 * Overlay of a thread's stack with its bookkeeping structures.  stack[]
 * covers THREAD_SIZE bytes (assuming THREAD_SIZE is a multiple of
 * sizeof(long)); the task_struct and thread_info members are present only
 * when the corresponding config option is NOT set.
 */
union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
        struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
        struct thread_info thread_info;
#endif
        unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

/*
 * task_thread_info(): the struct thread_info belonging to @task.
 *
 * With CONFIG_THREAD_INFO_IN_TASK it is the member embedded in task_struct.
 * Otherwise, unless __HAVE_THREAD_FUNCTIONS is defined (presumably by an
 * architecture that supplies its own version -- verify), task->stack is
 * cast to a struct thread_info pointer.
 */
#ifdef CONFIG_THREAD_INFO_IN_TASK
static inline struct thread_info *task_thread_info(struct task_struct *task)
{
        return &task->thread_info;
}
#elif !defined(__HAVE_THREAD_FUNCTIONS)
# define task_thread_info(task)        ((struct thread_info *)(task)->stack)
#endif

/*
 * find a task by one of its numerical ids
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * see also find_vpid() etc in include/linux/pid.h
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * find a task by its virtual pid and get the task struct
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);

#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
/* !CONFIG_SMP: kick_process() is an empty stub. */
static inline void kick_process(struct task_struct *tsk) { }
#endif

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);

/* Convenience wrapper: calls __set_task_comm() with @exec == false. */
static inline void set_task_comm(struct task_struct *tsk, const char *from)
{
        __set_task_comm(tsk, from, false);
}

extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
/*
 * get_task_comm(buf, tsk): calls __get_task_comm() with sizeof(buf) as the
 * length and evaluates to its return value.  BUILD_BUG_ON() rejects at
 * compile time any @buf whose sizeof differs from TASK_COMM_LEN, so @buf is
 * expected to be an array of exactly that size, not a pointer.
 */
#define get_task_comm(buf, tsk) ({                        \
        BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);        \
        __get_task_comm(buf, sizeof(buf), tsk);                \
})

#ifdef CONFIG_SMP
/* SMP: the only work done here is preempt_fold_need_resched(). */
static __always_inline void scheduler_ipi(void)
{
        /*
         * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
         * TIF_NEED_RESCHED remotely (for the first time) will also send
         * this IPI.
         */
        preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
/* !CONFIG_SMP: scheduler_ipi() is an empty stub. */
static inline void scheduler_ipi(void) { }
/* !CONFIG_SMP: no waiting is done; the stub always returns 1. */
static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
        return 1;
}
#endif

/*
 * Set thread flags in other task's structures.
 * See asm/thread_info.h for TIF_xxxx flags available:
 *
 * Every helper below looks up @tsk's thread_info with task_thread_info()
 * and forwards to the *_ti_thread_flag() primitive of the same name.
 */

/* Set @flag in @tsk's thread_info. */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        set_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Clear @flag in @tsk's thread_info. */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        clear_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Forward @flag and @value to update_ti_thread_flag() for @tsk. */
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
                                          bool value)
{
        update_ti_thread_flag(task_thread_info(tsk), flag, value);
}

/* Returns the result of test_and_set_ti_thread_flag() for @tsk/@flag. */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Returns the result of test_and_clear_ti_thread_flag() for @tsk/@flag. */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Returns the result of test_ti_thread_flag() for @tsk/@flag. */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Set TIF_NEED_RESCHED on @tsk. */
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
        set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}

/* Clear TIF_NEED_RESCHED on @tsk. */
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
        clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}

/*
 * Test TIF_NEED_RESCHED on @tsk.  The result is wrapped in unlikely(), i.e.
 * the flag being set is hinted as the uncommon case.
 */
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
        return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was in fact done.
 * cond_resched_lock() will drop the spinlock before scheduling.
 */
/*
 * _cond_resched(): an out-of-line function when CONFIG_PREEMPTION is not
 * set; with CONFIG_PREEMPTION it is an inline stub that always returns 0.
 */
#ifndef CONFIG_PREEMPTION
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif

/*
 * cond_resched(): calls ___might_sleep() with the caller's file/line and a
 * preempt offset of 0, then evaluates to the result of _cond_resched().
 */
#define cond_resched() ({                        \
        ___might_sleep(__FILE__, __LINE__, 0);        \
        _cond_resched();                        \
})

extern int __cond_resched_lock(spinlock_t *lock);

/*
 * cond_resched_lock(lock): same shape as cond_resched(), but passes
 * PREEMPT_LOCK_OFFSET to ___might_sleep() and evaluates to the result of
 * __cond_resched_lock(lock).
 */
#define cond_resched_lock(lock) ({                                \
        ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
        __cond_resched_lock(lock);                                \
})

/*
 * Leave the RCU read-side critical section, call cond_resched(), and
 * re-enter it.  The body is compiled in only when CONFIG_DEBUG_ATOMIC_SLEEP
 * is set or CONFIG_PREEMPT_RCU is not set; otherwise this is a no-op.
 */
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
        rcu_read_unlock();
        cond_resched();
        rcu_read_lock();
#endif
}

/*
 * Does a critical section need to be broken due to another
 * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
 * but a general need for low latency)
 *
 * With CONFIG_PREEMPTION this returns spin_is_contended(lock); without it
 * the answer is always 0.
 */
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
        return spin_is_contended(lock);
#else
        return 0;
#endif
}

/*
 * Returns the result of tif_need_resched(), hinted as unlikely to be true.
 */
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#ifdef CONFIG_SMP

/*
 * Read @p's CPU number with READ_ONCE(): from p->cpu when
 * CONFIG_THREAD_INFO_IN_TASK is set, otherwise from the cpu field of @p's
 * thread_info.
 */
static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifdef CONFIG_THREAD_INFO_IN_TASK
        return READ_ONCE(p->cpu);
#else
        return READ_ONCE(task_thread_info(p)->cpu);
#endif
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

/* !CONFIG_SMP: the CPU number is always reported as 0. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return 0;
}

/* !CONFIG_SMP: empty stub. */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 *
 * This fallback is used only when vcpu_is_preempted is not already defined
 * as a macro; it ignores @cpu and always returns false.
 */
#ifndef vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
        return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

/* Default when no earlier definition exists: @tsk is ignored, TASK_SIZE is used. */
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)        TASK_SIZE
#endif

#ifdef CONFIG_RSEQ

/*
 * Map the event mask on the user-space ABI enum rseq_cs_flags
 * for direct mask checks.
 *
 * rseq_event_mask_bits holds bit numbers (each equal to the matching
 * RSEQ_CS_FLAG_NO_RESTART_ON_*_BIT); rseq_event_mask holds the
 * corresponding single-bit masks (1U << bit).
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/*
 * Set TIF_NOTIFY_RESUME on @t, but only when @t has a non-NULL rseq
 * pointer; otherwise do nothing.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Call __rseq_handle_notify_resume(@ksig, @regs) when current has a
 * non-NULL rseq pointer; otherwise do nothing.
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a signal event for current, then run the notify-resume handling.
 *
 * The bit is set with the non-atomic __set_bit(), so the update of
 * current->rseq_event_mask is bracketed by preempt_disable()/
 * preempt_enable().
 */
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        preempt_disable();
        __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        preempt_enable();
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * rseq_preempt() requires preemption to be disabled.
 *
 * Sets RSEQ_EVENT_PREEMPT_BIT in @t->rseq_event_mask (non-atomic
 * __set_bit()) and then calls rseq_set_notify_resume(@t).
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * rseq_migrate() requires preemption to be disabled.
 *
 * Sets RSEQ_EVENT_MIGRATE_BIT in @t->rseq_event_mask (non-atomic
 * __set_bit()) and then calls rseq_set_notify_resume(@t).
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Initialise the rseq state of the new task @t.
 *
 * Without CLONE_VM in @clone_flags the child inherits rseq, rseq_sig and
 * rseq_event_mask from current.  With CLONE_VM the three fields are reset
 * (NULL / 0 / 0), i.e. the child starts with rseq unregistered.
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                t->rseq = current->rseq;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        t->rseq = NULL;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/* Reset @t's rseq state: rseq pointer to NULL, rseq_sig and event mask to 0. */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq = NULL;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

#else

/*
 * !CONFIG_RSEQ: every rseq hook below is an empty inline stub with the same
 * signature as its CONFIG_RSEQ counterpart.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

/*
 * rseq_syscall(): an out-of-line function with CONFIG_DEBUG_RSEQ, an empty
 * inline stub otherwise.
 */
#ifdef CONFIG_DEBUG_RSEQ

void rseq_syscall(struct pt_regs *regs);

#else

static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);

int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

#endif
































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  acpi_bus.h - ACPI Bus Driver ($Revision: 22 $)
 *
 *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
 *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 */

#ifndef __ACPI_BUS_H__
#define __ACPI_BUS_H__

#include <linux/device.h>
#include <linux/property.h>

/* TBD: Make dynamic */
#define ACPI_MAX_HANDLES        10
/*
 * Fixed-capacity list of ACPI handles (at most ACPI_MAX_HANDLES entries).
 * @count presumably holds the number of valid entries in @handles --
 * confirm against acpi_evaluate_reference().
 */
struct acpi_handle_list {
        u32 count;
        acpi_handle handles[ACPI_MAX_HANDLES];
};

/* acpi_utils.h */
acpi_status
acpi_extract_package(union acpi_object *package,
                     struct acpi_buffer *format, struct acpi_buffer *buffer);
acpi_status
acpi_evaluate_integer(acpi_handle handle,
                      acpi_string pathname,
                      struct acpi_object_list *arguments, unsigned long long *data);
acpi_status
acpi_evaluate_reference(acpi_handle handle,
                        acpi_string pathname,
                        struct acpi_object_list *arguments,
                        struct acpi_handle_list *list);
acpi_status
acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code,
                  struct acpi_buffer *status_buf);

acpi_status
acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld);

bool acpi_has_method(acpi_handle handle, char *name);
acpi_status acpi_execute_simple_method(acpi_handle handle, char *method,
                                       u64 arg);
acpi_status acpi_evaluate_ej0(acpi_handle handle);
acpi_status acpi_evaluate_lck(acpi_handle handle, int lock);
acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function);
bool acpi_ata_match(acpi_handle handle);
bool acpi_bay_match(acpi_handle handle);
bool acpi_dock_match(acpi_handle handle);

bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs);
union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid,
                        u64 rev, u64 func, union acpi_object *argv4);

/*
 * Evaluate a _DSM via acpi_evaluate_dsm() and accept the result only if its
 * object type equals @type.
 *
 * Returns the object from acpi_evaluate_dsm() when it is non-NULL and of
 * type @type.  A non-NULL object of any other type is released with
 * ACPI_FREE() and NULL is returned; a NULL result is passed through.
 */
static inline union acpi_object *
acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev,
                        u64 func, union acpi_object *argv4,
                        acpi_object_type type)
{
        union acpi_object *result;

        result = acpi_evaluate_dsm(handle, guid, rev, func, argv4);
        if (!result || result->type == type)
                return result;

        ACPI_FREE(result);
        return NULL;
}

/*
 * Designated initializer for a union acpi_object of package type:
 * .package.type is ACPI_TYPE_PACKAGE, .package.count is @cnt and
 * .package.elements is @eles.
 */
#define        ACPI_INIT_DSM_ARGV4(cnt, eles)                        \
        {                                                \
          .package.type = ACPI_TYPE_PACKAGE,                \
          .package.count = (cnt),                        \
          .package.elements = (eles)                        \
        }

bool acpi_dev_found(const char *hid);
bool acpi_dev_present(const char *hid, const char *uid, s64 hrv);

#ifdef CONFIG_ACPI

struct proc_dir_entry;

#define ACPI_BUS_FILE_ROOT        "acpi"
extern struct proc_dir_entry *acpi_root_dir;

/*
 * Device type identifiers, numbered from 0.  ACPI_BUS_DEVICE_TYPE_COUNT is
 * the last enumerator and therefore equals the number of types above it (7).
 */
enum acpi_bus_device_type {
        ACPI_BUS_TYPE_DEVICE = 0,
        ACPI_BUS_TYPE_POWER,
        ACPI_BUS_TYPE_PROCESSOR,
        ACPI_BUS_TYPE_THERMAL,
        ACPI_BUS_TYPE_POWER_BUTTON,
        ACPI_BUS_TYPE_SLEEP_BUTTON,
        ACPI_BUS_TYPE_ECDT_EC,
        ACPI_BUS_DEVICE_TYPE_COUNT
};

struct acpi_driver;
struct acpi_device;

/*
 * ACPI Scan Handler
 * -----------------
 */

/*
 * Hotplug settings embedded in struct acpi_scan_handler.  Contains a
 * kobject, two optional callbacks taking the affected acpi_device, and two
 * one-bit flags.
 */
struct acpi_hotplug_profile {
        struct kobject kobj;        /* embedded; see to_acpi_hotplug_profile() */
        int (*scan_dependent)(struct acpi_device *adev);
        void (*notify_online)(struct acpi_device *adev);
        bool enabled:1;
        bool demand_offline:1;
};

/*
 * Map a pointer to the kobj member back to its enclosing
 * struct acpi_hotplug_profile (container_of()).  @kobj must point at the
 * kobj field of such a structure.
 */
static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile(
                                                struct kobject *kobj)
{
        return container_of(kobj, struct acpi_hotplug_profile, kobj);
}

/*
 * A scan handler: a table of device IDs plus the callbacks associated with
 * them and an embedded hotplug profile.
 */
struct acpi_scan_handler {
        const struct acpi_device_id *ids;        /* ID table */
        struct list_head list_node;                /* list linkage */
        /* optional custom matcher; may report the matching entry via @matchid */
        bool (*match)(const char *idstr, const struct acpi_device_id **matchid);
        int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id);
        void (*detach)(struct acpi_device *dev);
        void (*bind)(struct device *phys_dev);
        void (*unbind)(struct device *phys_dev);
        struct acpi_hotplug_profile hotplug;
};

/*
 * ACPI Hotplug Context
 * --------------------
 *
 * Per-device hotplug callbacks.  @self is the owning device; it is set by
 * acpi_set_hp_context().
 */

struct acpi_hotplug_context {
        struct acpi_device *self;
        int (*notify)(struct acpi_device *, u32);
        void (*uevent)(struct acpi_device *, u32);
        void (*fixup)(struct acpi_device *);
};

/*
 * ACPI Driver
 * -----------
 */

/* Callback types used by struct acpi_device_ops. */
typedef int (*acpi_op_add) (struct acpi_device * device);
typedef int (*acpi_op_remove) (struct acpi_device * device);
typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event);

/* The add/remove/notify callbacks of an ACPI driver. */
struct acpi_device_ops {
        acpi_op_add add;
        acpi_op_remove remove;
        acpi_op_notify notify;
};

/* Value for acpi_driver.flags (presumably -- confirm against users). */
#define ACPI_DRIVER_ALL_NOTIFY_EVENTS        0x1        /* system AND device events */

/*
 * An ACPI driver: fixed-size name and class strings, an ID table, flags,
 * the driver callbacks, the embedded generic device_driver (see
 * to_acpi_driver()) and the owning module.
 */
struct acpi_driver {
        char name[80];
        char class[80];
        const struct acpi_device_id *ids; /* Supported Hardware IDs */
        unsigned int flags;
        struct acpi_device_ops ops;
        struct device_driver drv;
        struct module *owner;
};

/*
 * ACPI Device
 * -----------
 */

/* Status (_STA) */

/*
 * Five one-bit status flags padded with a reserved field to 32 bits in
 * total.  acpi_set_device_status() overwrites the whole structure with a
 * raw u32.
 */
struct acpi_device_status {
        u32 present:1;
        u32 enabled:1;
        u32 show_in_ui:1;
        u32 functional:1;
        u32 battery_present:1;
        u32 reserved:27;
};

/* Flags */

/*
 * Thirteen one-bit per-device flags padded with a reserved field to 32 bits
 * in total.
 */
struct acpi_device_flags {
        u32 dynamic_status:1;
        u32 removable:1;
        u32 ejectable:1;
        u32 power_manageable:1;
        u32 match_driver:1;
        u32 initialized:1;
        u32 visited:1;
        u32 hotplug_notify:1;
        u32 is_dock_station:1;
        u32 of_compatible_ok:1;
        u32 coherent_dma:1;
        u32 cca_seen:1;
        u32 enumeration_by_parent:1;
        u32 reserved:19;
};

/* File System */

/* Holds the device's proc_dir_entry pointer. */
struct acpi_device_dir {
        struct proc_dir_entry *entry;
};

/* Accessor: the proc_dir_entry of device @d (d->dir.entry). */
#define acpi_device_dir(d)        ((d)->dir.entry)

/* Plug and Play */

/* Fixed-size identifier types used by struct acpi_device_pnp. */
typedef char acpi_bus_id[8];
typedef u64 acpi_bus_address;
typedef char acpi_device_name[40];
typedef char acpi_device_class[20];

/* One ID string, linked into acpi_device_pnp.ids. */
struct acpi_hardware_id {
        struct list_head list;
        const char *id;
};

/* Three one-bit ID-type flags padded to 32 bits. */
struct acpi_pnp_type {
        u32 hardware_id:1;
        u32 bus_address:1;
        u32 platform_id:1;
        u32 reserved:29;
};

/* Identification data of an ACPI device. */
struct acpi_device_pnp {
        acpi_bus_id bus_id;                /* Object name */
        int instance_no;                /* Instance number of this object */
        struct acpi_pnp_type type;        /* ID type */
        acpi_bus_address bus_address;        /* _ADR */
        char *unique_id;                /* _UID */
        struct list_head ids;                /* _HID and _CIDs */
        acpi_device_name device_name;        /* Driver-determined */
        acpi_device_class device_class;        /*        "          */
        union acpi_object *str_obj;        /* unicode string for _STR method */
};

/*
 * Accessors for the pnp member of a struct acpi_device pointer @d.
 * acpi_device_hid() is the exception: it is an out-of-line function.
 */
#define acpi_device_bid(d)        ((d)->pnp.bus_id)
#define acpi_device_adr(d)        ((d)->pnp.bus_address)
const char *acpi_device_hid(struct acpi_device *device);
#define acpi_device_uid(d)        ((d)->pnp.unique_id)
#define acpi_device_name(d)        ((d)->pnp.device_name)
#define acpi_device_class(d)        ((d)->pnp.device_class)

/* Power Management */

/* Six one-bit power-management flags padded to 32 bits. */
struct acpi_device_power_flags {
        u32 explicit_get:1;        /* _PSC present? */
        u32 power_resources:1;        /* Power resources */
        u32 inrush_current:1;        /* Serialize Dx->D0 */
        u32 power_removed:1;        /* Optimize Dx->D0 */
        u32 ignore_parent:1;        /* Power is independent of parent power state */
        u32 dsw_present:1;        /* _DSW present? */
        u32 reserved:26;
};

/* Description of one device power state. */
struct acpi_device_power_state {
        struct {
                u8 valid:1;
                u8 explicit_set:1;        /* _PSx present? */
                u8 reserved:6;
        } flags;
        int power;                /* % Power (compared to D0) */
        int latency;                /* Dx->D0 time (microseconds) */
        struct list_head resources;        /* Power resources referenced */
};

/* Power information of a device: current state, flags, per-state table. */
struct acpi_device_power {
        int state;                /* Current state */
        struct acpi_device_power_flags flags;
        struct acpi_device_power_state states[ACPI_D_STATE_COUNT];        /* Power states (D0-D3Cold) */
};

/* Performance Management */

/* No flags are defined; all eight bits are reserved. */
struct acpi_device_perf_flags {
        u8 reserved:8;
};

/* Description of one performance state. */
struct acpi_device_perf_state {
        struct {
                u8 valid:1;
                u8 reserved:7;
        } flags;
        u8 power;                /* % Power (compared to P0) */
        u8 performance;                /* % Performance (    "   ) */
        int latency;                /* Px->P0 time (microseconds) */
};

/*
 * Performance information of a device.  @states points to a table that
 * presumably has @state_count entries -- confirm against the allocator.
 */
struct acpi_device_perf {
        int state;
        struct acpi_device_perf_flags flags;
        int state_count;
        struct acpi_device_perf_state *states;
};

/* Wakeup Management */
struct acpi_device_wakeup_flags {
        u8 valid:1;                /* Can successfully enable wakeup? */
        u8 notifier_present:1;  /* Wake-up notify handler has been installed */
};

/* A callback taking this context, plus the struct device it relates to. */
struct acpi_device_wakeup_context {
        void (*func)(struct acpi_device_wakeup_context *context);
        struct device *dev;
};

/*
 * Wakeup state of a device: the GPE device/number, a sleep state, a
 * resource list, the flags and context above, a wakeup source and two
 * counters.
 */
struct acpi_device_wakeup {
        acpi_handle gpe_device;
        u64 gpe_number;
        u64 sleep_state;
        struct list_head resources;
        struct acpi_device_wakeup_flags flags;
        struct acpi_device_wakeup_context context;
        struct wakeup_source *ws;
        int prepare_count;
        int enable_count;
};

/*
 * One struct device associated with an ACPI device.  Presumably linked via
 * @node into acpi_device.physical_node_list -- confirm against users.
 */
struct acpi_device_physical_node {
        unsigned int node_id;
        struct list_head node;
        struct device *dev;
        bool put_online:1;
};

/* One set of properties identified by a GUID; @list is its list linkage. */
struct acpi_device_properties {
        const guid_t *guid;
        const union acpi_object *properties;
        struct list_head list;
};

/* ACPI Device Specific Data (_DSD) */
struct acpi_device_data {
        const union acpi_object *pointer;
        struct list_head properties;        /* presumably of acpi_device_properties */
        const union acpi_object *of_compatible;
        struct list_head subnodes;
};

struct acpi_gpio_mapping;

/* Device */
/*
 * Central per-device structure.  It embeds a fwnode_handle (see
 * to_acpi_device_node()) and a struct device (see to_acpi_device()), and
 * aggregates the status, flags, pnp, power, wakeup, performance, proc
 * directory and _DSD data sub-structures declared above.
 */
struct acpi_device {
        int device_type;
        acpi_handle handle;                /* no handle for fixed hardware */
        struct fwnode_handle fwnode;
        struct acpi_device *parent;
        struct list_head children;
        struct list_head node;
        struct list_head wakeup_list;
        struct list_head del_list;
        struct acpi_device_status status;
        struct acpi_device_flags flags;
        struct acpi_device_pnp pnp;
        struct acpi_device_power power;
        struct acpi_device_wakeup wakeup;
        struct acpi_device_perf performance;
        struct acpi_device_dir dir;
        struct acpi_device_data data;
        struct acpi_scan_handler *handler;
        struct acpi_hotplug_context *hp;        /* set by acpi_set_hp_context() */
        struct acpi_driver *driver;
        const struct acpi_gpio_mapping *driver_gpios;
        void *driver_data;                /* returned by acpi_driver_data() */
        struct device dev;
        unsigned int physical_node_count;
        unsigned int dep_unmet;
        struct list_head physical_node_list;
        struct mutex physical_node_lock;
        void (*remove)(struct acpi_device *);
};

/* Non-device subnode */
/*
 * A named data-only node.  It embeds a fwnode_handle (see
 * to_acpi_data_node()); @name is what acpi_data_node_match() compares
 * against.
 */
struct acpi_data_node {
        const char *name;
        acpi_handle handle;
        struct fwnode_handle fwnode;
        struct fwnode_handle *parent;
        struct acpi_device_data data;
        struct list_head sibling;
        struct kobject kobj;
        struct completion kobj_done;
};

extern const struct fwnode_operations acpi_device_fwnode_ops;
extern const struct fwnode_operations acpi_data_fwnode_ops;
extern const struct fwnode_operations acpi_static_fwnode_ops;

bool is_acpi_device_node(const struct fwnode_handle *fwnode);
bool is_acpi_data_node(const struct fwnode_handle *fwnode);

/*
 * True when @fwnode is either an ACPI device node or an ACPI data node.
 * The data-node check is made only if the device-node check fails.
 */
static inline bool is_acpi_node(const struct fwnode_handle *fwnode)
{
        if (is_acpi_device_node(fwnode))
                return true;

        return is_acpi_data_node(fwnode);
}

/*
 * Convert a fwnode_handle pointer to its enclosing struct acpi_device.
 * Evaluates to NULL when is_acpi_device_node() rejects the handle.  The
 * argument is copied into a local first, so it is evaluated only once.
 */
#define to_acpi_device_node(__fwnode)                                        \
        ({                                                                \
                typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \
                                                                        \
                is_acpi_device_node(__to_acpi_device_node_fwnode) ?        \
                        container_of(__to_acpi_device_node_fwnode,        \
                                     struct acpi_device, fwnode) :        \
                        NULL;                                                \
        })

/*
 * Convert a fwnode_handle pointer to its enclosing struct acpi_data_node.
 * Evaluates to NULL when is_acpi_data_node() rejects the handle.  The
 * argument is copied into a local first, so it is evaluated only once.
 */
#define to_acpi_data_node(__fwnode)                                        \
        ({                                                                \
                typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode;        \
                                                                        \
                is_acpi_data_node(__to_acpi_data_node_fwnode) ?                \
                        container_of(__to_acpi_data_node_fwnode,        \
                                     struct acpi_data_node, fwnode) :        \
                        NULL;                                                \
        })

/*
 * True when @fwnode is a valid pointer (neither NULL nor an error pointer)
 * whose ops table is acpi_static_fwnode_ops.
 */
static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode)
{
        if (IS_ERR_OR_NULL(fwnode))
                return false;

        return fwnode->ops == &acpi_static_fwnode_ops;
}

/*
 * True when @fwnode is an ACPI data node whose name is exactly @name
 * (strcmp() equality).  Any handle that is not a data node yields false.
 */
static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode,
                                        const char *name)
{
        if (!is_acpi_data_node(fwnode))
                return false;

        return strcmp(to_acpi_data_node(fwnode)->name, name) == 0;
}

/*
 * Return the address of the fwnode handle embedded in @adev.
 * @adev is dereferenced unconditionally, so it must not be NULL.
 */
static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev)
{
        return &adev->fwnode;
}

/* Return the driver_data pointer stored in @d (which must not be NULL). */
static inline void *acpi_driver_data(struct acpi_device *d)
{
        return d->driver_data;
}

/*
 * Map a pointer to an embedded member back to its container:
 * to_acpi_device() takes a pointer to the 'dev' member of a
 * struct acpi_device, to_acpi_driver() a pointer to the 'drv' member
 * of a struct acpi_driver.
 */
#define to_acpi_device(d)        container_of(d, struct acpi_device, dev)
#define to_acpi_driver(d)        container_of(d, struct acpi_driver, drv)

/*
 * Overwrite @adev's status member with the raw value @sta by storing
 * through a u32 pointer to that member.
 * NOTE(review): this assumes the status member is exactly 32 bits wide;
 * confirm against the definition of struct acpi_device.
 */
static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta)
{
        *((u32 *)&adev->status) = sta;
}

/*
 * Link @adev and the hotplug context @hp to each other: @hp->self is
 * pointed at @adev and @adev->hp at @hp.  Both pointers are
 * dereferenced unconditionally.
 */
static inline void acpi_set_hp_context(struct acpi_device *adev,
                                       struct acpi_hotplug_context *hp)
{
        hp->self = adev;
        adev->hp = hp;
}

void acpi_initialize_hp_context(struct acpi_device *adev,
                                struct acpi_hotplug_context *hp,
                                int (*notify)(struct acpi_device *, u32),
                                void (*uevent)(struct acpi_device *, u32));

/* acpi_device.dev.bus == &acpi_bus_type */
extern struct bus_type acpi_bus_type;

/*
 * Events
 * ------
 */

/*
 * Record describing one ACPI bus event: list linkage, the device class
 * and bus id it refers to, and two u32 values.
 * NOTE(review): the meaning of type/data is set by the event producers,
 * which are not visible in this header.
 */
struct acpi_bus_event {
        struct list_head node;
        acpi_device_class device_class;
        acpi_bus_id bus_id;
        u32 type;
        u32 data;
};

extern struct kobject *acpi_kobj;
extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int);
void acpi_bus_private_data_handler(acpi_handle, void *);
int acpi_bus_get_private_data(acpi_handle, void **);
int acpi_bus_attach_private_data(acpi_handle, void *);
void acpi_bus_detach_private_data(acpi_handle);
extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32);
extern int register_acpi_notifier(struct notifier_block *);
extern int unregister_acpi_notifier(struct notifier_block *);

/*
 * External Functions
 */

int acpi_bus_get_device(acpi_handle handle, struct acpi_device **device);
struct acpi_device *acpi_bus_get_acpi_device(acpi_handle handle);
void acpi_bus_put_acpi_device(struct acpi_device *adev);
acpi_status acpi_bus_get_status_handle(acpi_handle handle,
                                       unsigned long long *sta);
int acpi_bus_get_status(struct acpi_device *device);

int acpi_bus_set_power(acpi_handle handle, int state);
const char *acpi_power_state_string(int state);
int acpi_device_set_power(struct acpi_device *device, int state);
int acpi_bus_init_power(struct acpi_device *device);
int acpi_device_fix_up_power(struct acpi_device *device);
int acpi_bus_update_power(acpi_handle handle, int *state_p);
int acpi_device_update_power(struct acpi_device *device, int *state_p);
bool acpi_bus_power_manageable(acpi_handle handle);
int acpi_device_power_add_dependent(struct acpi_device *adev,
                                    struct device *dev);
void acpi_device_power_remove_dependent(struct acpi_device *adev,
                                        struct device *dev);

/*
 * acpi_bus_can_wakeup() is only implemented when CONFIG_PM is set;
 * otherwise an inline stub reports false for every handle.
 */
#ifdef CONFIG_PM
bool acpi_bus_can_wakeup(acpi_handle handle);
#else
static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; }
#endif

void acpi_scan_lock_acquire(void);
void acpi_scan_lock_release(void);
void acpi_lock_hp_context(void);
void acpi_unlock_hp_context(void);
int acpi_scan_add_handler(struct acpi_scan_handler *handler);
int acpi_bus_register_driver(struct acpi_driver *driver);
void acpi_bus_unregister_driver(struct acpi_driver *driver);
int acpi_bus_scan(acpi_handle handle);
void acpi_bus_trim(struct acpi_device *start);
acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd);
int acpi_match_device_ids(struct acpi_device *device,
                          const struct acpi_device_id *ids);
void acpi_set_modalias(struct acpi_device *adev, const char *default_id,
                       char *modalias, size_t len);
int acpi_create_dir(struct acpi_device *);
void acpi_remove_dir(struct acpi_device *);

/*
 * Report whether @adev is non-NULL and has both its initialized and
 * visited flags set.
 */
static inline bool acpi_device_enumerated(struct acpi_device *adev)
{
        if (!adev)
                return false;

        return adev->flags.initialized && adev->flags.visited;
}

/**
 * module_acpi_driver(__acpi_driver) - Helper macro for registering an ACPI driver
 * @__acpi_driver: acpi_driver struct
 *
 * Helper macro for ACPI drivers which do not do anything special in module
 * init/exit. This eliminates a lot of boilerplate. Each module may only
 * use this macro once, and calling it replaces module_init() and module_exit().
 *
 * Expands to module_driver() with acpi_bus_register_driver() and
 * acpi_bus_unregister_driver() as the register/unregister functions.
 */
#define module_acpi_driver(__acpi_driver) \
        module_driver(__acpi_driver, acpi_bus_register_driver, \
                      acpi_bus_unregister_driver)

/*
 * Bind physical devices with ACPI devices
 */
/*
 * Descriptor registered with register_acpi_bus_type(): a name plus a set
 * of callbacks operating on a struct device.
 * NOTE(review): when each callback is invoked is decided by the binding
 * code, which is not visible in this header.
 */
struct acpi_bus_type {
        struct list_head list;        /* list linkage */
        const char *name;
        bool (*match)(struct device *dev);        /* predicate on a device */
        struct acpi_device * (*find_companion)(struct device *);        /* device -> ACPI device */
        void (*setup)(struct device *);
        void (*cleanup)(struct device *);
};
int register_acpi_bus_type(struct acpi_bus_type *);
int unregister_acpi_bus_type(struct acpi_bus_type *);
int acpi_bind_one(struct device *dev, struct acpi_device *adev);
int acpi_unbind_one(struct device *dev);

/*
 * State kept for one ACPI PCI root: the ACPI device and PCI bus it
 * refers to, a segment number, the downstream bus range and the _OSC
 * support/control bits.
 */
struct acpi_pci_root {
        struct acpi_device * device;
        struct pci_bus *bus;
        u16 segment;
        struct resource secondary;        /* downstream bus range */

        u32 osc_support_set;        /* _OSC state of support bits */
        u32 osc_control_set;        /* _OSC state of control bits */
        phys_addr_t mcfg_addr;        /* presumably the MCFG base address - TODO confirm */
};

/* helper */

bool acpi_dma_supported(struct acpi_device *adev);
enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
                       u64 *size);
int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
                           const u32 *input_id);
/*
 * Convenience wrapper around acpi_dma_configure_id() that passes a NULL
 * input_id; returns whatever that function returns.
 */
static inline int acpi_dma_configure(struct device *dev,
                                     enum dev_dma_attr attr)
{
        return acpi_dma_configure_id(dev, attr, NULL);
}
struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
                                           u64 address, bool check_children);
int acpi_is_root_bridge(acpi_handle);
struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle);

int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state);
int acpi_disable_wakeup_device_power(struct acpi_device *dev);

/*
 * acpi_device_override_status() has a real implementation only when
 * CONFIG_X86 is set; elsewhere the inline stub returns false and leaves
 * *status untouched.
 */
#ifdef CONFIG_X86
bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status);
#else
static inline bool acpi_device_override_status(struct acpi_device *adev,
                                               unsigned long long *status)
{
        return false;
}
#endif

#ifdef CONFIG_PM
void acpi_pm_wakeup_event(struct device *dev);
acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev,
                        void (*func)(struct acpi_device_wakeup_context *context));
acpi_status acpi_remove_pm_notifier(struct acpi_device *adev);
bool acpi_pm_device_can_wakeup(struct device *dev);
int acpi_pm_device_sleep_state(struct device *, int *, int);
int acpi_pm_set_device_wakeup(struct device *dev, bool enable);
#else
/* !CONFIG_PM stub: does nothing. */
static inline void acpi_pm_wakeup_event(struct device *dev)
{
}
/* !CONFIG_PM stub: installs nothing and always returns AE_SUPPORT. */
static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev,
                                               struct device *dev,
                                               void (*func)(struct acpi_device_wakeup_context *context))
{
        return AE_SUPPORT;
}
/* !CONFIG_PM stub: always returns AE_SUPPORT. */
static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev)
{
        return AE_SUPPORT;
}
/* !CONFIG_PM stub: no device is reported as wakeup-capable. */
static inline bool acpi_pm_device_can_wakeup(struct device *dev)
{
        return false;
}
/*
 * !CONFIG_PM stub.  Stores ACPI_STATE_D0 through @p when @p is non-NULL.
 * Returns @m when it lies within [ACPI_STATE_D0, ACPI_STATE_D3_COLD],
 * and ACPI_STATE_D0 otherwise.  @d is not used.
 */
static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m)
{
        if (p)
                *p = ACPI_STATE_D0;

        if (m < ACPI_STATE_D0 || m > ACPI_STATE_D3_COLD)
                return ACPI_STATE_D0;

        return m;
}
/* !CONFIG_PM stub: always fails with -ENODEV. */
static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable)
{
        return -ENODEV;
}
#endif

/*
 * Without CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT the inline stub
 * reports every sleep state as unsupported.
 */
#ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT
bool acpi_sleep_state_supported(u8 sleep_state);
#else
static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; }
#endif

/* Without CONFIG_ACPI_SLEEP the inline stub always returns ACPI_STATE_S0. */
#ifdef CONFIG_ACPI_SLEEP
u32 acpi_target_system_state(void);
#else
static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; }
#endif

/* Return @adev's power_manageable flag (@adev must not be NULL). */
static inline bool acpi_device_power_manageable(struct acpi_device *adev)
{
        return adev->flags.power_manageable;
}

/* Return the valid flag of @adev's wakeup data (@adev must not be NULL). */
static inline bool acpi_device_can_wakeup(struct acpi_device *adev)
{
        return adev->wakeup.flags.valid;
}

/*
 * Report whether @adev can be powered off: true when its D3cold power
 * state is flagged valid, or - only when the FADT header revision is
 * below 6 - when its D3hot state has the explicit_set flag.
 */
static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
{
        if (adev->power.states[ACPI_STATE_D3_COLD].flags.valid)
                return true;

        if (acpi_gbl_FADT.header.revision >= 6)
                return false;

        return adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set;
}

bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);

struct acpi_device *
acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv);

/*
 * Call put_device() on the struct device embedded in @adev.
 * A NULL @adev is accepted and ignored.
 */
static inline void acpi_dev_put(struct acpi_device *adev)
{
        if (!adev)
                return;

        put_device(&adev->dev);
}
#else        /* CONFIG_ACPI */

/* !CONFIG_ACPI stubs: accept any pointer, do nothing and report success (0). */
static inline int register_acpi_bus_type(void *bus) { return 0; }
static inline int unregister_acpi_bus_type(void *bus) { return 0; }

#endif                                /* CONFIG_ACPI */

#endif /*__ACPI_BUS_H__*/















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Functions used by both the SCSI initiator code and the SCSI target code.
 */

#ifndef _SCSI_COMMON_H_
#define _SCSI_COMMON_H_

#include <linux/types.h>
#include <scsi/scsi_proto.h>

static inline unsigned
scsi_varlen_cdb_length(const void *hdr)
{
        return ((struct scsi_varlen_cdb_hdr *)hdr)->additional_cdb_length + 8;
}

/*
 * COMMAND_SIZE() looks up an entry of the eight-element
 * scsi_command_size_tbl[], indexed by bits 7..5 of @opcode.
 * NOTE(review): presumably those bits are the SCSI group code and the
 * table holds the CDB length per group - confirm against the table's
 * definition.
 */
extern const unsigned char scsi_command_size_tbl[8];
#define COMMAND_SIZE(opcode) scsi_command_size_tbl[((opcode) >> 5) & 7]

/*
 * Return the size of the CDB starting at @cmnd.  A first byte equal to
 * VARIABLE_LENGTH_CMD is sized via scsi_varlen_cdb_length(); any other
 * opcode is looked up with COMMAND_SIZE().
 */
static inline unsigned
scsi_command_size(const unsigned char *cmnd)
{
        if (cmnd[0] == VARIABLE_LENGTH_CMD)
                return scsi_varlen_cdb_length(cmnd);

        return COMMAND_SIZE(cmnd[0]);
}

/*
 * Return the control byte of the CDB starting at @cmnd: byte 1 when the
 * first byte is VARIABLE_LENGTH_CMD, otherwise the last byte of the CDB
 * as sized by COMMAND_SIZE().
 */
static inline unsigned char
scsi_command_control(const unsigned char *cmnd)
{
        if (cmnd[0] == VARIABLE_LENGTH_CMD)
                return cmnd[1];

        return cmnd[COMMAND_SIZE(cmnd[0]) - 1];
}

/* Returns a human-readable name for the device */
extern const char *scsi_device_type(unsigned type);

extern void int_to_scsilun(u64, struct scsi_lun *);
extern u64 scsilun_to_int(struct scsi_lun *);

/*
 * This is a slightly modified SCSI sense "descriptor" format header.
 * The addition is to allow the 0x70 and 0x71 response codes. The idea
 * is to place the salient data from either "fixed" or "descriptor" sense
 * format into one structure to ease application processing.
 *
 * The original sense buffer should be kept around for those cases
 * in which more information is required (e.g. the LBA of a MEDIUM ERROR).
 */
/* Eight single-byte fields holding the salient sense data. */
struct scsi_sense_hdr {                /* See SPC-3 section 4.5 */
        u8 response_code;        /* permit: 0x0, 0x70, 0x71, 0x72, 0x73 */
        u8 sense_key;
        u8 asc;                        /* presumably the additional sense code - TODO confirm */
        u8 ascq;                /* presumably the additional sense code qualifier - TODO confirm */
        u8 byte4;
        u8 byte5;
        u8 byte6;
        u8 additional_length;        /* always 0 for fixed sense format */
};

/*
 * Report whether @sshdr is non-NULL and its response_code has all bits
 * of 0x70 set.
 */
static inline bool scsi_sense_valid(const struct scsi_sense_hdr *sshdr)
{
        return sshdr && (sshdr->response_code & 0x70) == 0x70;
}

extern bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len,
                                 struct scsi_sense_hdr *sshdr);

extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq);
int scsi_set_sense_information(u8 *buf, int buf_len, u64 info);
int scsi_set_sense_field_pointer(u8 *buf, int buf_len, u16 fp, u8 bp, bool cd);
extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
                                       int desc_type);

#endif /* _SCSI_COMMON_H_ */
























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H

#include "msr-index.h"

#ifndef __ASSEMBLY__

#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>

/*
 * One 64-bit MSR value, accessible either as a whole (q) or as two
 * 32-bit halves (l and h) that overlay the same storage.
 */
struct msr {
        union {
                struct {
                        u32 l;
                        u32 h;
                };
                u64 q;
        };
};

/*
 * Bundle of an MSR number, one MSR value, a pointer to further MSR
 * values and an error code.
 * NOTE(review): presumably the argument block handed to the *_on_cpu()
 * helpers - confirm against their implementation.
 */
struct msr_info {
        u32 msr_no;
        struct msr reg;
        struct msr *msrs;
        int err;
};

/*
 * Pointer to a u32 register array plus an error code.
 * NOTE(review): presumably used by the *_safe_regs_on_cpu() helpers,
 * which take u32 regs[8] - confirm against their implementation.
 */
struct msr_regs_info {
        u32 *regs;
        int err;
};

/* One saved MSR: a struct msr_info together with a validity flag. */
struct saved_msr {
        bool valid;
        struct msr_info info;
};

/*
 * A pointer to struct saved_msr entries plus a count.
 * NOTE(review): presumably num is the number of entries in array -
 * confirm against the code that fills it.
 */
struct saved_msrs {
        unsigned int num;
        struct saved_msr *array;
};

/*
 * Both i386 and x86_64 return a 64-bit value in edx:eax, but gcc's "A"
 * constraint has different meanings. For i386, "A" means exactly
 * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
 * it means rax *or* rdx.
 */
/*
 * Helpers for instructions that return a 64-bit result in edx:eax:
 *   DECLARE_ARGS() declares the local(s) that receive the result,
 *   EAX_EDX_RET()  expands to the matching asm output operands,
 *   EAX_EDX_VAL()  yields the combined 64-bit value.
 * On 64-bit, two unsigned long locals bound to "a" and "d" are combined
 * as low | high << 32; on 32-bit, a single unsigned long long bound to
 * "A" is used directly.
 */
#ifdef CONFIG_X86_64
/* Using 64-bit values saves one instruction clearing the high half of low */
#define DECLARE_ARGS(val, low, high)        unsigned long low, high
#define EAX_EDX_VAL(val, low, high)        ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high)        "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high)        unsigned long long val
#define EAX_EDX_VAL(val, low, high)        (val)
#define EAX_EDX_RET(val, low, high)        "=A" (val)
#endif

/*
 * Be very careful with includes. This header is prone to include loops.
 */
#include <asm/atomic.h>
#include <linux/tracepoint-defs.h>

/*
 * Trace hooks for MSR reads, MSR writes and RDPMC.  With
 * CONFIG_TRACEPOINTS they are declared here and defined elsewhere;
 * without it they are empty inline stubs.
 */
#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(read_msr);
DECLARE_TRACEPOINT(write_msr);
DECLARE_TRACEPOINT(rdpmc);
extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
#else
static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
#endif

/*
 * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
 * accessors and should not have any tracing or other functionality piggybacking
 * on them - those are *purely* for accessing MSRs and nothing more. So don't even
 * think of extending them - you will be slapped with a stinking trout or a frozen
 * shark will reach you, wherever you are! You've been warned.
 */
/* Execute RDMSR for @msr and return the 64-bit result; no tracing. */
static __always_inline unsigned long long __rdmsr(unsigned int msr)
{
        DECLARE_ARGS(val, low, high);

        /*
         * The MSR index is passed in ecx ("c"); the result comes back via
         * the EAX_EDX_RET() operands.  The instruction at label 1 has an
         * exception-table entry that names ex_handler_rdmsr_unsafe and
         * label 2 (just past the instruction) as the fixup address.
         */
        asm volatile("1: rdmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
                     : EAX_EDX_RET(val, low, high) : "c" (msr));

        return EAX_EDX_VAL(val, low, high);
}

/* Execute WRMSR, writing @high:@low to @msr; no tracing. */
static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
        /*
         * Inputs: ecx = msr, eax = low, edx = high.  The "memory" clobber
         * keeps the compiler from caching memory values across the write.
         * The instruction at label 1 has an exception-table entry that
         * names ex_handler_wrmsr_unsafe and label 2 as the fixup address.
         */
        asm volatile("1: wrmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
                     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

/*
 * Read @msr with __rdmsr() and assign the low 32 bits to @val1 and the
 * high 32 bits to @val2.  Both must be assignable lvalues.
 */
#define native_rdmsr(msr, val1, val2)                        \
do {                                                        \
        u64 __val = __rdmsr((msr));                        \
        (void)((val1) = (u32)__val);                        \
        (void)((val2) = (u32)(__val >> 32));                \
} while (0)

/* Write @high:@low to @msr; a plain alias for __wrmsr(). */
#define native_wrmsr(msr, low, high)                        \
        __wrmsr(msr, low, high)

/*
 * Write the 64-bit @val to @msr by splitting it into low and high
 * 32-bit halves for __wrmsr().  @val is expanded twice.
 */
#define native_wrmsrl(msr, val)                                \
        __wrmsr((msr), (u32)((u64)(val)),                \
                       (u32)((u64)(val) >> 32))

/*
 * Read @msr via __rdmsr() and return its value.  When the read_msr
 * tracepoint is enabled the value is reported to it, with a failure
 * code of 0.
 */
static inline unsigned long long native_read_msr(unsigned int msr)
{
        unsigned long long value = __rdmsr(msr);

        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, value, 0);

        return value;
}

/*
 * Read @msr with fault handling.  On success *@err is set to 0 and the
 * MSR value is returned; if RDMSR faults, *@err is set to -EIO and 0 is
 * returned.  The result is reported to the read_msr tracepoint when it
 * is enabled.
 */
static inline unsigned long long native_read_msr_safe(unsigned int msr,
                                                      int *err)
{
        DECLARE_ARGS(val, low, high);

        /*
         * Label 2 is the RDMSR itself, followed by an xor that zeroes err
         * on the non-faulting path.  The exception table redirects a fault
         * at 2 to the fixup code at 3, which loads -EIO into err, clears
         * eax and edx, and jumps back to label 1.
         */
        asm volatile("2: rdmsr ; xor %[err],%[err]\n"
                     "1:\n\t"
                     ".section .fixup,\"ax\"\n\t"
                     "3: mov %[fault],%[err]\n\t"
                     "xorl %%eax, %%eax\n\t"
                     "xorl %%edx, %%edx\n\t"
                     "jmp 1b\n\t"
                     ".previous\n\t"
                     _ASM_EXTABLE(2b, 3b)
                     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
                     : "c" (msr), [fault] "i" (-EIO));
        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
        return EAX_EDX_VAL(val, low, high);
}

/*
 * Write @high:@low to @msr via __wrmsr() and, when the write_msr
 * tracepoint is enabled, report the combined 64-bit value to it with a
 * failure code of 0.
 *
 * Can be uninlined because referenced by paravirt.
 */
static inline void notrace
native_write_msr(unsigned int msr, u32 low, u32 high)
{
        __wrmsr(msr, low, high);

        if (!tracepoint_enabled(write_msr))
                return;

        do_trace_write_msr(msr, ((u64)high << 32) | low, 0);
}

/*
 * Write @high:@low to @msr with fault handling.  Returns 0 on success
 * and -EIO if WRMSR faults.  The result is reported to the write_msr
 * tracepoint when it is enabled.
 *
 * Can be uninlined because referenced by paravirt.
 */
static inline int notrace
native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
        int err;

        /*
         * err is the "=a" output and low is tied to it through the "0"
         * constraint, so eax carries low into WRMSR and the error code
         * out.  The xor after WRMSR zeroes err on the non-faulting path;
         * a fault at label 2 is redirected to the fixup at 3, which loads
         * -EIO into err and jumps back to label 1.
         */
        asm volatile("2: wrmsr ; xor %[err],%[err]\n"
                     "1:\n\t"
                     ".section .fixup,\"ax\"\n\t"
                     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
                     ".previous\n\t"
                     _ASM_EXTABLE(2b, 3b)
                     : [err] "=a" (err)
                     : "c" (msr), "0" (low), "d" (high),
                       [fault] "i" (-EIO)
                     : "memory");
        if (tracepoint_enabled(write_msr))
                do_trace_write_msr(msr, ((u64)high << 32 | low), err);
        return err;
}

extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);

/**
 * rdtsc() - returns the current TSC without ordering constraints
 *
 * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
 * only ordering constraint it supplies is the ordering implied by
 * "asm volatile": it will put the RDTSC in the place you expect.  The
 * CPU can and will speculatively execute that RDTSC, though, so the
 * results can be non-monotonic if compared on different CPUs.
 */
static __always_inline unsigned long long rdtsc(void)
{
        DECLARE_ARGS(val, low, high);

        /* No inputs and no clobbers: only the edx:eax result is declared. */
        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

/**
 * rdtsc_ordered() - read the current TSC in program order
 *
 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
 * It is ordered like a load to a global in-memory counter.  It should
 * be impossible to observe non-monotonic rdtsc_ordered() behavior
 * across multiple CPUs as long as the TSC is synced.
 */
static __always_inline unsigned long long rdtsc_ordered(void)
{
        DECLARE_ARGS(val, low, high);

        /*
         * The RDTSC instruction is not ordered relative to memory
         * access.  The Intel SDM and the AMD APM are both vague on this
         * point, but empirically an RDTSC instruction can be
         * speculatively executed before prior loads.  An RDTSC
         * immediately after an appropriate barrier appears to be
         * ordered as a normal load, that is, it provides the same
         * ordering guarantees as reading from a global memory location
         * that some other imaginary CPU is updating continuously with a
         * time stamp.
         *
         * Thus, use the preferred barrier on the respective CPU, aiming for
         * RDTSCP as the default.
         *
         * The alternatives are: plain "rdtsc" as the base instruction,
         * "lfence; rdtsc" for X86_FEATURE_LFENCE_RDTSC, and "rdtscp" for
         * X86_FEATURE_RDTSCP.
         */
        asm volatile(ALTERNATIVE_2("rdtsc",
                                   "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
                                   "rdtscp", X86_FEATURE_RDTSCP)
                        : EAX_EDX_RET(val, low, high)
                        /* RDTSCP clobbers ECX with MSR_TSC_AUX. */
                        :: "ecx");

        return EAX_EDX_VAL(val, low, high);
}

/*
 * Execute RDPMC with @counter in ecx and return the 64-bit result.  When
 * the rdpmc tracepoint is enabled the value is reported to it, with a
 * failure code of 0.
 */
static inline unsigned long long native_read_pmc(int counter)
{
        unsigned long long count;
        DECLARE_ARGS(val, low, high);

        asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
        count = EAX_EDX_VAL(val, low, high);

        if (tracepoint_enabled(rdpmc))
                do_trace_rdpmc(counter, count, 0);

        return count;
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
/*
 * Access to machine-specific registers (available on 586 and better only)
 * Note: the rd* operations modify the parameters directly (without using
 * pointer indirection), this allows gcc to optimize better
 */

/*
 * Read @msr with native_read_msr() and assign the low 32 bits to @low
 * and the high 32 bits to @high.  Both are lvalues, not pointers.
 */
#define rdmsr(msr, low, high)                                        \
do {                                                                \
        u64 __val = native_read_msr((msr));                        \
        (void)((low) = (u32)__val);                                \
        (void)((high) = (u32)(__val >> 32));                        \
} while (0)

/* Write @high:@low to @msr through native_write_msr(). */
static inline void wrmsr(unsigned int msr, u32 low, u32 high)
{
        native_write_msr(msr, low, high);
}

/* Read @msr with native_read_msr() and assign the value to the lvalue @val. */
#define rdmsrl(msr, val)                        \
        ((val) = native_read_msr((msr)))

/*
 * Write the 64-bit @val to @msr: the value is split into its low and
 * high 32-bit halves and handed to native_write_msr().
 */
static inline void wrmsrl(unsigned int msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        native_write_msr(msr, lo, hi);
}

/*
 * wrmsr with exception handling: forwards to native_write_msr_safe()
 * and returns its result.
 */
static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
{
        return native_write_msr_safe(msr, low, high);
}

/*
 * rdmsr with exception handling.
 *
 * Reads @msr through native_read_msr_safe(), stores the low 32 bits of
 * the value through the pointer @low and the high 32 bits through the
 * pointer @high, and evaluates to the error code that
 * native_read_msr_safe() reported.
 *
 * @low and @high are parenthesized before being dereferenced so that
 * pointer expressions such as "buf + 1" work as arguments.
 */
#define rdmsr_safe(msr, low, high)                                \
({                                                                \
        int __err;                                                \
        u64 __val = native_read_msr_safe((msr), &__err);        \
        (*(low)) = (u32)__val;                                        \
        (*(high)) = (u32)(__val >> 32);                                \
        __err;                                                        \
})

/*
 * Read @msr with exception handling: the 64-bit value from
 * native_read_msr_safe() is stored in *@p and its error code returned.
 */
static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
{
        int err;

        *p = native_read_msr_safe(msr, &err);
        return err;
}

/*
 * Read @counter with native_read_pmc() and assign the low 32 bits to
 * @low and the high 32 bits to @high.  Both are lvalues, not pointers.
 */
#define rdpmc(counter, low, high)                        \
do {                                                        \
        u64 _l = native_read_pmc((counter));                \
        (low)  = (u32)_l;                                \
        (high) = (u32)(_l >> 32);                        \
} while (0)

/* Read @counter with native_read_pmc() and assign the value to the lvalue @val. */
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))

#endif        /* !CONFIG_PARAVIRT_XXL */

/*
 * 64-bit version of wrmsr_safe(): splits @val into its low and high
 * 32-bit halves and returns the result of wrmsr_safe().
 */
static inline int wrmsrl_safe(u32 msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        return wrmsr_safe(msr, lo, hi);
}

/* Write @high:@low to MSR_IA32_TSC via wrmsr(). */
#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))

/* Write @val as the low half (high half 0) of MSR_TSC_AUX via wrmsr(). */
#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)

struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);

#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else  /*  CONFIG_SMP  */
/*
 * !CONFIG_SMP version: @cpu is ignored; reads @msr_no locally into *@l
 * and *@h and always returns 0.
 */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
        rdmsr(msr_no, *l, *h);
        return 0;
}
/*
 * !CONFIG_SMP version: @cpu is ignored; writes @h:@l to @msr_no locally
 * and always returns 0.
 */
static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        wrmsr(msr_no, l, h);
        return 0;
}
/*
 * !CONFIG_SMP version: @cpu is ignored; reads @msr_no locally into *@q
 * and always returns 0.
 */
static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        rdmsrl(msr_no, *q);
        return 0;
}
/*
 * !CONFIG_SMP version: @cpu is ignored; writes @q to @msr_no locally
 * and always returns 0.
 */
static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        wrmsrl(msr_no, q);
        return 0;
}
/*
 * !CONFIG_SMP version: the mask @m is ignored; @msr_no is read once and
 * stored in @msrs[0].
 */
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr *msrs)
{
        rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
}
/*
 * !CONFIG_SMP version: the mask @m is ignored; the value in @msrs[0] is
 * written once to @msr_no.
 */
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr *msrs)
{
        wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
}
/*
 * !CONFIG_SMP version: @cpu is ignored; returns the result of
 * rdmsr_safe(), which stores the value through @l and @h.
 */
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
                                    u32 *l, u32 *h)
{
        return rdmsr_safe(msr_no, l, h);
}
/* !CONFIG_SMP version: @cpu is ignored; returns the result of wrmsr_safe(). */
static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        return wrmsr_safe(msr_no, l, h);
}
/* !CONFIG_SMP version: @cpu is ignored; returns the result of rdmsrl_safe(). */
static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        return rdmsrl_safe(msr_no, q);
}
/* !CONFIG_SMP version: @cpu is ignored; returns the result of wrmsrl_safe(). */
static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        return wrmsrl_safe(msr_no, q);
}
/* !CONFIG_SMP version: @cpu is ignored; returns the result of rdmsr_safe_regs(). */
static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return rdmsr_safe_regs(regs);
}
/* !CONFIG_SMP version: @cpu is ignored; returns the result of wrmsr_safe_regs(). */
static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return wrmsr_safe_regs(regs);
}
#endif  /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_MSR_H */





















































































































































































































































































    1 




































    1 
    1 























































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 










    1 
    1 








    1 
    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



























    1 
    1 







    1 

    1 











    1 
























    1 




































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/locks.c
 *
 *  Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
 *  Doug Evans (dje@spiff.uucp), August 07, 1992
 *
 *  Deadlock detection added.
 *  FIXME: one thing isn't handled yet:
 *        - mandatory locks (requires lots of changes elsewhere)
 *  Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
 *
 *  Miscellaneous edits, and a total rewrite of posix_lock_file() code.
 *  Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
 *
 *  Converted file_lock_table to a linked list from an array, which eliminates
 *  the limits on how many active file locks are open.
 *  Chad Page (pageone@netcom.com), November 27, 1994
 *
 *  Removed dependency on file descriptors. dup()'ed file descriptors now
 *  get the same locks as the original file descriptors, and a close() on
 *  any file descriptor removes ALL the locks on the file for the current
 *  process. Since locks still depend on the process id, locks are inherited
 *  after an exec() but not after a fork(). This agrees with POSIX, and both
 *  BSD and SVR4 practice.
 *  Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
 *
 *  Scrapped free list which is redundant now that we allocate locks
 *  dynamically with kmalloc()/kfree().
 *  Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
 *
 *  Implemented two lock personalities - FL_FLOCK and FL_POSIX.
 *
 *  FL_POSIX locks are created with calls to fcntl() and lockf() through the
 *  fcntl() system call. They have the semantics described above.
 *
 *  FL_FLOCK locks are created with calls to flock(), through the flock()
 *  system call, which is new. Old C libraries implement flock() via fcntl()
 *  and will continue to use the old, broken implementation.
 *
 *  FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
 *  with a file pointer (filp). As a result they can be shared by a parent
 *  process and its children after a fork(). They are removed when the last
 *  file descriptor referring to the file pointer is closed (unless explicitly
 *  unlocked).
 *
 *  FL_FLOCK locks never deadlock, an existing lock is always removed before
 *  upgrading from shared to exclusive (or vice versa). When this happens
 *  any processes blocked by the current lock are woken up and allowed to
 *  run before the new lock is applied.
 *  Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
 *
 *  Removed some race conditions in flock_lock_file(), marked other possible
 *  races. Just grep for FIXME to see them.
 *  Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
 *
 *  Addressed Dmitry's concerns. Deadlock checking no longer recursive.
 *  Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
 *  once we've checked for blocking and deadlocking.
 *  Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
 *
 *  Initial implementation of mandatory locks. SunOS turned out to be
 *  a rotten model, so I implemented the "obvious" semantics.
 *  See 'Documentation/filesystems/mandatory-locking.rst' for details.
 *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
 *
 *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
 *  check if a file has mandatory locks, used by mmap(), open() and creat() to
 *  see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
 *  Manual, Section 2.
 *  Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
 *
 *  Tidied up block list handling. Added '/proc/locks' interface.
 *  Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
 *
 *  Fixed deadlock condition for pathological code that mixes calls to
 *  flock() and fcntl().
 *  Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
 *
 *  Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
 *  for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
 *  guarantee sensible behaviour in the case where file system modules might
 *  be compiled with different options than the kernel itself.
 *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
 *
 *  Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
 *  (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
 *  Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
 *
 *  Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
 *  locks. Changed process synchronisation to avoid dereferencing locks that
 *  have already been freed.
 *  Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
 *
 *  Made the block list a circular list to minimise searching in the list.
 *  Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
 *
 *  Made mandatory locking a mount option. Default is not to allow mandatory
 *  locking.
 *  Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
 *
 *  Some adaptations for NFS support.
 *  Olaf Kirch (okir@monad.swb.de), Dec 1996,
 *
 *  Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
 *  Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
 *
 *  Use slab allocator instead of kmalloc/kfree.
 *  Use generic list implementation from <linux/list.h>.
 *  Sped up posix_locks_deadlock by only considering blocked locks.
 *  Matthew Wilcox <willy@debian.org>, March, 2000.
 *
 *  Leases and LOCK_MAND
 *  Matthew Wilcox <willy@debian.org>, June, 2000.
 *  Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
 *
 * Locking conflicts and dependencies:
 * If multiple threads attempt to lock the same byte (or flock the same file)
 * only one can be granted the lock, and the others must wait their turn.
 * The first lock has been "applied" or "granted", the others are "waiting"
 * and are "blocked" by the "applied" lock.
 *
 * Waiting and applied locks are all kept in trees whose properties are:
 *
 *        - the root of a tree may be an applied or waiting lock.
 *        - every other node in the tree is a waiting lock that
 *          conflicts with every ancestor of that node.
 *
 * Every such tree begins life as a waiting singleton which obviously
 * satisfies the above properties.
 *
 * The only ways we modify trees preserve these properties:
 *
 *        1. We may add a new leaf node, but only after first verifying that it
 *           conflicts with all of its ancestors.
 *        2. We may remove the root of a tree, creating a new singleton
 *           tree from the root and N new trees rooted in the immediate
 *           children.
 *        3. If the root of a tree is not currently an applied lock, we may
 *           apply it (if possible).
 *        4. We may upgrade the root of the tree (either extend its range,
 *           or upgrade its entire range from read to write).
 *
 * When an applied lock is modified in a way that reduces or downgrades any
 * part of its range, we remove all its children (2 above).  This particularly
 * happens when a lock is unlocked.
 *
 * For each of those child trees we "wake up" the thread which is
 * waiting for the lock so it can continue handling as follows: if the
 * root of the tree can be applied, we apply it (3).  If it can't, it must
 * conflict with some applied lock.  We remove (wake up) all of its children
 * (2), and add it as a new leaf to the tree rooted in the applied
 * lock (1).  We then repeat the process recursively with those
 * children.
 *
 */

#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>

#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>

#include <linux/uaccess.h>

/*
 * Lock-class predicates.  Each takes a pointer expression whose target has
 * fl_flags (or fl_pid) and evaluates to non-zero when the test holds.  The
 * argument is parenthesized so that callers may pass any pointer expression
 * (e.g. "fl + 1" or "cond ? a : b") and still get the intended precedence.
 */
#define IS_POSIX(fl)        ((fl)->fl_flags & FL_POSIX)
#define IS_FLOCK(fl)        ((fl)->fl_flags & FL_FLOCK)
/* Any of the lease-like classes: lease, delegation or layout. */
#define IS_LEASE(fl)        ((fl)->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl)        ((fl)->fl_flags & FL_OFDLCK)
/* True when fl_pid is zero or negative. */
#define IS_REMOTELCK(fl)        ((fl)->fl_pid <= 0)

static bool lease_breaking(struct file_lock *fl)
{
        return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
}

/*
 * Return the lock type @fl will have once any pending change completes:
 * F_UNLCK if an unlock is pending, F_RDLCK if a downgrade is pending,
 * otherwise its current fl_type.  A pending unlock takes precedence over
 * a pending downgrade.
 */
static int target_leasetype(struct file_lock *fl)
{
        int type;

        if (fl->fl_flags & FL_UNLOCK_PENDING)
                type = F_UNLCK;
        else if (fl->fl_flags & FL_DOWNGRADE_PENDING)
                type = F_RDLCK;
        else
                type = fl->fl_type;

        return type;
}

/*
 * Lease tunables with their default values.
 * NOTE(review): both are non-static and are not referenced in this part of
 * the file; presumably they are exposed as sysctl knobs and lease_break_time
 * is expressed in seconds -- confirm against their users.
 */
int leases_enable = 1;
int lease_break_time = 45;

/*
 * The global file_lock_list is only used for displaying /proc/locks, so we
 * keep a list on each CPU, with each list protected by its own spinlock.
 * Global serialization is done using file_rwsem.
 *
 * Note that alterations to the list also require that the relevant flc_lock is
 * held.
 */
struct file_lock_list_struct {
        spinlock_t                lock;        /* protects hlist */
        struct hlist_head        hlist;        /* this CPU's share of the global lock list */
};
/* One instance per CPU; file_rwsem provides the global serialization. */
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);


/*
 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
 * It is protected by blocked_lock_lock.
 *
 * We hash locks by lockowner in order to optimize searching for the lock a
 * particular lockowner is waiting on.
 *
 * FIXME: make this value scale via some heuristic? We generally will want more
 * buckets when we have more lockowners holding locks, but that's a little
 * difficult to determine without knowing what the workload will look like.
 */
/* log2 of the bucket count: DEFINE_HASHTABLE() creates 1 << 7 = 128 buckets. */
#define BLOCKED_HASH_BITS        7
static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);

/*
 * This lock protects the blocked_hash. Generally, if you're accessing it, you
 * want to be holding this lock.
 *
 * In addition, it also protects the fl->fl_blocked_requests list, and the
 * fl->fl_blocker pointer for file_lock structures that are acting as lock
 * requests (in contrast to those that are acting as records of acquired locks).
 *
 * Note that when we acquire this lock in order to change the above fields,
 * we often hold the flc_lock as well. In certain cases, when reading the fields
 * protected by this lock, we can skip acquiring it iff we already hold the
 * flc_lock.
 */
static DEFINE_SPINLOCK(blocked_lock_lock);

/*
 * Slab caches: flctx_cache backs struct file_lock_context objects and
 * filelock_cache backs struct file_lock objects.
 */
static struct kmem_cache *flctx_cache __read_mostly;
static struct kmem_cache *filelock_cache __read_mostly;

/*
 * Return the file_lock_context attached to @inode, allocating and installing
 * a fresh one when the inode has none.  No allocation is attempted when
 * @type is F_UNLCK, so the result is NULL in that case if no context exists,
 * and also when the allocation fails.
 */
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
        /* paired with cmpxchg() below */
        struct file_lock_context *ctx = locks_inode_context(inode);

        if (unlikely(!ctx) && type != F_UNLCK) {
                ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
                if (ctx) {
                        spin_lock_init(&ctx->flc_lock);
                        INIT_LIST_HEAD(&ctx->flc_flock);
                        INIT_LIST_HEAD(&ctx->flc_posix);
                        INIT_LIST_HEAD(&ctx->flc_lease);

                        /*
                         * Install our context only if the inode still has
                         * none.  If one is already assigned, discard ours
                         * and use the one that is there.
                         */
                        if (cmpxchg(&inode->i_flctx, NULL, ctx) != NULL) {
                                kmem_cache_free(flctx_cache, ctx);
                                ctx = locks_inode_context(inode);
                        }
                }
        }

        trace_locks_get_lock_context(inode, type, ctx);
        return ctx;
}

static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
        struct file_lock *fl;

        list_for_each_entry(fl, list, fl_list) {
                pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
        }
}

/*
 * Warn about, and dump, any locks still present on the flock, posix or
 * lease lists of @inode's lock context.  Does nothing when all three lists
 * are empty.  The context pointer is dereferenced unconditionally, so
 * @inode must have one.
 */
static void
locks_check_ctx_lists(struct inode *inode)
{
        struct file_lock_context *ctx = inode->i_flctx;

        if (likely(list_empty(&ctx->flc_flock) &&
                   list_empty(&ctx->flc_posix) &&
                   list_empty(&ctx->flc_lease)))
                return;

        pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
                MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
                inode->i_ino);
        locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
        locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
        locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
}

static void
locks_check_ctx_file_list(struct file *filp, struct list_head *list,
                                char *list_type)
{
        struct file_lock *fl;
        struct inode *inode = locks_inode(filp);

        list_for_each_entry(fl, list, fl_list)
                if (fl->fl_file == filp)
                        pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
                                " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                                list_type, MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino,
                                fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
}

/*
 * Free @inode's lock context, if it has one.  Before the context is returned
 * to flctx_cache, any locks still on its lists are reported as leaked.
 */
void
locks_free_lock_context(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);

        if (likely(!ctx))
                return;

        locks_check_ctx_lists(inode);
        kmem_cache_free(flctx_cache, ctx);
}

/*
 * Initialize the linkage members of @fl: its hash node, its three list
 * heads and its wait queue.  No other field is touched.
 */
static void locks_init_lock_heads(struct file_lock *fl)
{
        /* list linkage */
        INIT_LIST_HEAD(&fl->fl_list);
        INIT_LIST_HEAD(&fl->fl_blocked_requests);
        INIT_LIST_HEAD(&fl->fl_blocked_member);

        /* hash linkage */
        INIT_HLIST_NODE(&fl->fl_link);

        init_waitqueue_head(&fl->fl_wait);
}

/* Allocate an empty lock structure. */
struct file_lock *locks_alloc_lock(void)
{
        struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(fl);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);

/*
 * Drop the private state hanging off @fl: run the fl_ops release hook and
 * the lock-manager put_owner hook where they exist, then clear fl_ops and
 * fl_lmops.  fl_owner is cleared only when a put_owner hook was called.
 *
 * The lock must be idle: no waiters on fl_wait, not linked on any list and
 * not hashed.  Any violation triggers BUG().
 */
void locks_release_private(struct file_lock *fl)
{
        BUG_ON(waitqueue_active(&fl->fl_wait));
        BUG_ON(!list_empty(&fl->fl_list));
        BUG_ON(!list_empty(&fl->fl_blocked_requests));
        BUG_ON(!list_empty(&fl->fl_blocked_member));
        BUG_ON(!hlist_unhashed(&fl->fl_link));

        /* Filesystem-side operations. */
        if (fl->fl_ops != NULL) {
                if (fl->fl_ops->fl_release_private != NULL)
                        fl->fl_ops->fl_release_private(fl);
                fl->fl_ops = NULL;
        }

        /* Lock-manager operations. */
        if (fl->fl_lmops != NULL) {
                if (fl->fl_lmops->lm_put_owner != NULL) {
                        fl->fl_lmops->lm_put_owner(fl->fl_owner);
                        fl->fl_owner = NULL;
                }
                fl->fl_lmops = NULL;
        }
}
EXPORT_SYMBOL_GPL(locks_release_private);

/**
 * locks_owner_has_blockers - Check for blocking lock requests
 * @flctx: file lock context
 * @owner: lock owner
 *
 * Scans the POSIX lock list of @flctx under flc_lock, looking for a lock
 * owned by @owner that has at least one request queued on its
 * fl_blocked_requests list.
 *
 * Return values:
 *   %true: some POSIX lock of @owner has requests blocked on it
 *   %false: no POSIX lock of @owner has requests blocked on it
 */
bool locks_owner_has_blockers(struct file_lock_context *flctx,
                fl_owner_t owner)
{
        struct file_lock *fl;
        bool found = false;

        spin_lock(&flctx->flc_lock);
        list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
                if (fl->fl_owner == owner &&
                    !list_empty(&fl->fl_blocked_requests)) {
                        found = true;
                        break;
                }
        }
        spin_unlock(&flctx->flc_lock);

        return found;
}
EXPORT_SYMBOL_GPL(locks_owner_has_blockers);

/*
 * Free a lock which is not in use: release its private state, then return
 * it to filelock_cache.  The lock must already be unlinked from every list,
 * because locks_release_private() BUG()s on a lock that is still linked.
 */
void locks_free_lock(struct file_lock *fl)
{
        locks_release_private(fl);

        kmem_cache_free(filelock_cache, fl);
}
EXPORT_SYMBOL(locks_free_lock);

/*
 * Free every lock queued on @dispose (linked through fl_list), leaving the
 * list empty.
 */
static void
locks_dispose_list(struct list_head *dispose)
{
        while (!list_empty(dispose)) {
                struct file_lock *head;

                head = list_first_entry(dispose, struct file_lock, fl_list);
                /* Unlink first: locks_free_lock() insists on an empty fl_list. */
                list_del_init(&head->fl_list);
                locks_free_lock(head);
        }
}

/*
 * Reset a caller-provided lock to the empty state: zero the whole structure,
 * then initialize its list heads and wait queue.
 */
void locks_init_lock(struct file_lock *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(fl);
}
EXPORT_SYMBOL(locks_init_lock);

/*
 * Initialize @new as a description of the existing lock @fl.
 *
 * Owner, pid, flags, type, byte range and lock-manager ops are copied.
 * fl_file and fl_ops are deliberately left NULL in @new.  If the lock
 * manager provides lm_get_owner, it is called on the owner.
 */
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
        /* Who holds the lock. */
        new->fl_owner = fl->fl_owner;
        new->fl_pid = fl->fl_pid;
        new->fl_file = NULL;

        /* What kind of lock it is and which range it covers. */
        new->fl_flags = fl->fl_flags;
        new->fl_type = fl->fl_type;
        new->fl_start = fl->fl_start;
        new->fl_end = fl->fl_end;

        new->fl_lmops = fl->fl_lmops;
        new->fl_ops = NULL;

        if (fl->fl_lmops && fl->fl_lmops->lm_get_owner)
                fl->fl_lmops->lm_get_owner(fl->fl_owner);
}
EXPORT_SYMBOL(locks_copy_conflock);

/*
 * Make @new a full copy of @fl: everything locks_copy_conflock() copies,
 * plus fl_file and fl_ops.  The fl_ops->fl_copy_lock hook, when present,
 * is then called with both locks.
 */
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
        /* "new" must be a freshly-initialized lock */
        WARN_ON_ONCE(new->fl_ops);

        locks_copy_conflock(new, fl);

        /* The conflock copy leaves these two NULL; fill them in here. */
        new->fl_file = fl->fl_file;
        new->fl_ops = fl->fl_ops;

        if (fl->fl_ops && fl->fl_ops->fl_copy_lock)
                fl->fl_ops->fl_copy_lock(new, fl);
}
EXPORT_SYMBOL(locks_copy_lock);

static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
{
        struct file_lock *f;

        /*
         * As ctx->flc_lock is held, new requests cannot be added to
         * ->fl_blocked_requests, so we don't need a lock to check if it
         * is empty.
         */
        if (list_empty(&fl->fl_blocked_requests))
                return;
        spin_lock(&blocked_lock_lock);
        list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
        list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
                f->fl_blocker = new;
        spin_unlock(&blocked_lock_lock);
}

/*
 * Translate a flock() command into a lock type.
 *
 * If LOCK_MAND is set, the LOCK_MAND and LOCK_RW bits of @cmd are returned
 * as they are.  Otherwise LOCK_SH, LOCK_EX and LOCK_UN map to F_RDLCK,
 * F_WRLCK and F_UNLCK respectively, and any other value yields -EINVAL.
 * The match is exact, so a command with extra bits set is rejected too.
 */
static inline int flock_translate_cmd(int cmd)
{
        int type = -EINVAL;

        if (cmd & LOCK_MAND)
                return cmd & (LOCK_MAND | LOCK_RW);

        if (cmd == LOCK_SH)
                type = F_RDLCK;
        else if (cmd == LOCK_EX)
                type = F_WRLCK;
        else if (cmd == LOCK_UN)
                type = F_UNLCK;

        return type;
}

/*
 * Fill in a file_lock structure with an appropriate FLOCK lock.
 *
 * @cmd is translated with flock_translate_cmd(); a negative result is
 * returned as an ERR_PTR without touching @fl.  When @fl is NULL a new lock
 * is allocated (ERR_PTR(-ENOMEM) on failure); otherwise @fl is reinitialized
 * and reused.  The resulting lock is owned by @filp and ends at OFFSET_MAX.
 */
static struct file_lock *
flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
{
        int type = flock_translate_cmd(cmd);

        if (type < 0)
                return ERR_PTR(type);

        if (fl != NULL) {
                locks_init_lock(fl);
        } else {
                fl = locks_alloc_lock();
                if (fl == NULL)
                        return ERR_PTR(-ENOMEM);
        }

        fl->fl_type = type;
        fl->fl_flags = FL_FLOCK;
        fl->fl_end = OFFSET_MAX;

        /* The open file is both the lock's file and its owner. */
        fl->fl_file = filp;
        fl->fl_owner = filp;
        fl->fl_pid = current->tgid;

        return fl;
}

/*
 * Store @type in @fl->fl_type when it is F_RDLCK, F_WRLCK or F_UNLCK.
 * Returns 0 on success; any other type leaves @fl untouched and returns
 * -EINVAL.
 */
static int assign_type(struct file_lock *fl, long type)
{
        if (type != F_RDLCK && type != F_WRLCK && type != F_UNLCK)
                return -EINVAL;

        fl->fl_type = type;
        return 0;
}

/*
 * Verify a "struct flock64" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 *
 * Returns -EINVAL for an unknown l_whence or a range that would start below
 * zero, -EOVERFLOW when the range cannot be represented within OFFSET_MAX,
 * and otherwise the result of assign_type() on l->l_type.
 */
static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
                                 struct flock64 *l)
{
        /* Resolve the base offset that l_start is relative to. */
        if (l->l_whence == SEEK_SET)
                fl->fl_start = 0;
        else if (l->l_whence == SEEK_CUR)
                fl->fl_start = filp->f_pos;
        else if (l->l_whence == SEEK_END)
                fl->fl_start = i_size_read(file_inode(filp));
        else
                return -EINVAL;

        /* Check before adding so the sum cannot exceed OFFSET_MAX. */
        if (l->l_start > OFFSET_MAX - fl->fl_start)
                return -EOVERFLOW;
        fl->fl_start += l->l_start;
        if (fl->fl_start < 0)
                return -EINVAL;

        /*
         * POSIX-1996 leaves the case l->l_len < 0 undefined;
         * POSIX-2001 defines it.
         */
        if (l->l_len == 0) {
                /* A zero length locks through to the maximum offset. */
                fl->fl_end = OFFSET_MAX;
        } else if (l->l_len > 0) {
                if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
                        return -EOVERFLOW;
                fl->fl_end = fl->fl_start + l->l_len - 1;
        } else {
                /* Negative length: the range ends just before the start. */
                if (fl->fl_start + l->l_len < 0)
                        return -EINVAL;
                fl->fl_end = fl->fl_start - 1;
                fl->fl_start += l->l_len;
        }

        fl->fl_file = filp;
        fl->fl_owner = current->files;
        fl->fl_pid = current->tgid;
        fl->fl_flags = FL_POSIX;
        fl->fl_lmops = NULL;
        fl->fl_ops = NULL;

        return assign_type(fl, l->l_type);
}

/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 */
static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
                               struct flock *l)
{
        struct flock64 ll = {
                .l_type = l->l_type,
                .l_whence = l->l_whence,
                .l_start = l->l_start,
                .l_len = l->l_len,
        };

        return flock64_to_posix_lock(filp, fl, &ll);
}

/* default lease lock manager operations */

/*
 * ->lm_break for the default lease manager: signal SIGIO with POLL_MSG to
 * the lease's fasync list.  Always returns false.
 */
static bool lease_break_callback(struct file_lock *fl)
{
        kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);

        return false;
}

/*
 * ->lm_setup for the default lease manager: attach the fasync entry passed
 * in *@priv to the lease and make the current task the owner of the file.
 */
static void lease_setup(struct file_lock *fl, void **priv)
{
        struct fasync_struct *entry = *priv;
        struct file *file = fl->fl_file;
        struct fasync_struct *old;

        /*
         * fasync_insert_entry() returns the old entry if any. If there was
         * no old entry, then it used "priv" and inserted it into the fasync
         * list. Clear the pointer to indicate that it shouldn't be freed.
         */
        old = fasync_insert_entry(entry->fa_fd, file, &fl->fl_fasync, entry);
        if (old == NULL)
                *priv = NULL;

        __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
}

/* Lock manager operations that lease_init() installs on a lease. */
static const struct lock_manager_operations lease_manager_ops = {
        .lm_setup = lease_setup,
        .lm_change = lease_modify,
        .lm_break = lease_break_callback,
};

/*
 * Initialize a lease, use the default lock manager operations.
 *
 * Returns -EINVAL when @type is rejected by assign_type(), 0 otherwise.
 * The lease covers offsets 0 through OFFSET_MAX and is owned by @filp.
 */
static int lease_init(struct file *filp, long type, struct file_lock *fl)
{
        int err = assign_type(fl, type);

        if (err)
                return -EINVAL;

        fl->fl_file = filp;
        fl->fl_owner = filp;
        fl->fl_pid = current->tgid;

        fl->fl_flags = FL_LEASE;
        fl->fl_start = 0;
        fl->fl_end = OFFSET_MAX;
        fl->fl_lmops = &lease_manager_ops;
        fl->fl_ops = NULL;

        return 0;
}

/*
 * Allocate a file_lock initialised to this type of lease.
 *
 * Returns the new lock, ERR_PTR(-ENOMEM) when allocation fails, or the
 * lease_init() error wrapped in ERR_PTR() after freeing the lock.
 */
static struct file_lock *lease_alloc(struct file *filp, long type)
{
        struct file_lock *lease;
        int err;

        lease = locks_alloc_lock();
        if (!lease)
                return ERR_PTR(-ENOMEM);

        err = lease_init(filp, type, lease);
        if (!err)
                return lease;

        locks_free_lock(lease);
        return ERR_PTR(err);
}

/*
 * Check if two locks overlap each other.  Returns 1 when the inclusive
 * ranges [fl_start, fl_end] of both locks share at least one offset, and
 * 0 otherwise.
 */
static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
        if (fl1->fl_end < fl2->fl_start)
                return 0;
        if (fl2->fl_end < fl1->fl_start)
                return 0;

        return 1;
}

/*
 * Check whether two locks have the same owner, i.e. identical fl_owner
 * pointers.  Returns 1 if so, 0 if not.
 */
static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
{
        if (fl1->fl_owner != fl2->fl_owner)
                return 0;

        return 1;
}

/*
 * Add @fl to this CPU's file_lock_list and record the CPU in fl_link_cpu so
 * the lock can later be removed from the same list.
 *
 * Must be called with the flc_lock held!
 */
static void locks_insert_global_locks(struct file_lock *fl)
{
        struct file_lock_list_struct *pcpu_list = this_cpu_ptr(&file_lock_list);

        percpu_rwsem_assert_held(&file_rwsem);

        spin_lock(&pcpu_list->lock);
        fl->fl_link_cpu = smp_processor_id();
        hlist_add_head(&fl->fl_link, &pcpu_list->hlist);
        spin_unlock(&pcpu_list->lock);
}

/*
 * Remove @fl from the per-CPU file_lock_list recorded in fl_link_cpu.
 *
 * Must be called with the flc_lock held!
 */
static void locks_delete_global_locks(struct file_lock *fl)
{
        struct file_lock_list_struct *pcpu_list;

        percpu_rwsem_assert_held(&file_rwsem);

        /*
         * Skip the per-CPU lock when the entry is already unhashed. The
         * check is safe because it is made under the flc_lock, which new
         * insertions into the list must also hold.
         */
        if (!hlist_unhashed(&fl->fl_link)) {
                pcpu_list = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
                spin_lock(&pcpu_list->lock);
                hlist_del_init(&fl->fl_link);
                spin_unlock(&pcpu_list->lock);
        }
}

/* Hash key for blocked_hash: the lock's fl_owner pointer as an integer. */
static unsigned long posix_owner_key(struct file_lock *fl)
{
        unsigned long key = (unsigned long)fl->fl_owner;

        return key;
}

/*
 * Hash @waiter into blocked_hash under its owner key.  The caller must hold
 * blocked_lock_lock.
 */
static void locks_insert_global_blocked(struct file_lock *waiter)
{
        unsigned long key;

        lockdep_assert_held(&blocked_lock_lock);

        key = posix_owner_key(waiter);
        hash_add(blocked_hash, &waiter->fl_link, key);
}

/*
 * Drop @waiter from the hash it was linked into through fl_link.  The caller
 * must hold blocked_lock_lock.
 */
static void
locks_delete_global_blocked(struct file_lock *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);
        hash_del(&waiter->fl_link);
}

/*
 * Remove waiter from blocker's block list: unhash it from the global
 * blocked table and unlink it from its blocker's fl_blocked_requests list.
 * When blocker ends up pointing to itself then the list is empty.
 *
 * Must be called with blocked_lock_lock held.
 */
static void
__locks_delete_block(struct file_lock *waiter)
{
        locks_delete_global_blocked(waiter);
        list_del_init(&waiter->fl_blocked_member);
}

/*
 * Detach every request queued on @blocker->fl_blocked_requests and notify
 * it: via ->lm_notify when the waiter's lock manager provides one,
 * otherwise by waking the waiter's fl_wait queue.
 *
 * NOTE(review): all callers visible in this file hold blocked_lock_lock;
 * confirm that this is the required locking rule.
 */
static void __locks_wake_up_blocks(struct file_lock *blocker)
{
        while (!list_empty(&blocker->fl_blocked_requests)) {
                struct file_lock *waiter;

                /* Always take the head, so waiters are handled in list order. */
                waiter = list_first_entry(&blocker->fl_blocked_requests,
                                          struct file_lock, fl_blocked_member);
                __locks_delete_block(waiter);
                if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
                        waiter->fl_lmops->lm_notify(waiter);
                else
                        wake_up(&waiter->fl_wait);

                /*
                 * The setting of fl_blocker to NULL marks the "done"
                 * point in deleting a block. Paired with acquire at the top
                 * of locks_delete_block().
                 */
                smp_store_release(&waiter->fl_blocker, NULL);
        }
}

/**
 *        locks_delete_block - stop waiting for a file lock
 *        @waiter: the lock which was waiting
 *
 *        lockd/nfsd need to disconnect the lock while working on it.
 *
 *        Any requests blocked on @waiter are woken before @waiter itself is
 *        unlinked from its blocker.
 *
 *        Return: 0 if @waiter still had a blocker when it was examined under
 *        blocked_lock_lock, -ENOENT otherwise.
 */
int locks_delete_block(struct file_lock *waiter)
{
        int status = -ENOENT;

        /*
         * If fl_blocker is NULL, it won't be set again as this thread "owns"
         * the lock and is the only one that might try to claim the lock.
         *
         * We use acquire/release to manage fl_blocker so that we can
         * optimize away taking the blocked_lock_lock in many cases.
         *
         * The smp_load_acquire guarantees two things:
         *
         * 1/ that fl_blocked_requests can be tested locklessly. If something
         * was recently added to that list it must have been in a locked region
         * *before* the locked region when fl_blocker was set to NULL.
         *
         * 2/ that no other thread is accessing 'waiter', so it is safe to free
         * it.  __locks_wake_up_blocks is careful not to touch waiter after
         * fl_blocker is released.
         *
         * If a lockless check of fl_blocker shows it to be NULL, we know that
         * no new locks can be inserted into its fl_blocked_requests list, and
         * can avoid doing anything further if the list is empty.
         */
        if (!smp_load_acquire(&waiter->fl_blocker) &&
            list_empty(&waiter->fl_blocked_requests))
                return status;

        spin_lock(&blocked_lock_lock);
        /* Report success only if the waiter was actually still blocked. */
        if (waiter->fl_blocker)
                status = 0;
        /* Release anything queued behind waiter, then unlink waiter itself. */
        __locks_wake_up_blocks(waiter);
        __locks_delete_block(waiter);

        /*
         * The setting of fl_blocker to NULL marks the "done" point in deleting
         * a block. Paired with acquire at the top of this function.
         */
        smp_store_release(&waiter->fl_blocker, NULL);
        spin_unlock(&blocked_lock_lock);
        return status;
}
EXPORT_SYMBOL(locks_delete_block);

/* Insert waiter into blocker's block list.
 * We use a circular list so that processes can be easily woken up in
 * the order they blocked. The documentation doesn't require this but
 * it seems like the reasonable thing to do.
 *
 * Must be called with both the flc_lock and blocked_lock_lock held. The
 * fl_blocked_requests list itself is protected by the blocked_lock_lock,
 * but by ensuring that the flc_lock is also held on insertions we can avoid
 * taking the blocked_lock_lock in some cases when we see that the
 * fl_blocked_requests list is empty.
 *
 * Rather than just adding to the list, we check for conflicts with any existing
 * waiters, and add beneath any waiter that blocks the new waiter.
 * Thus wakeups don't happen until needed.
 *
 * @conflict is the predicate used to decide whether an existing waiter
 * blocks @waiter; it is called as conflict(existing_waiter, waiter).
 */
static void __locks_insert_block(struct file_lock *blocker,
                                 struct file_lock *waiter,
                                 bool conflict(struct file_lock *,
                                               struct file_lock *))
{
        struct file_lock *fl;
        /* The waiter must not already be queued on some blocker. */
        BUG_ON(!list_empty(&waiter->fl_blocked_member));

        /*
         * Descend: whenever one of the current blocker's waiters conflicts
         * with the new waiter, make that waiter the blocker and rescan its
         * own list from the start.
         */
new_blocker:
        list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
                if (conflict(fl, waiter)) {
                        blocker =  fl;
                        goto new_blocker;
                }
        waiter->fl_blocker = blocker;
        list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
        /* Only POSIX, non-OFD blockers get their waiters into blocked_hash. */
        if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
                locks_insert_global_blocked(waiter);

        /* The requests in waiter->fl_blocked_requests are known to conflict
         * with waiter, but might not conflict with blocker, or the requests
         * and lock which block it.  So they all need to be woken.
         */
        __locks_wake_up_blocks(waiter);
}

/*
 * Locked wrapper around __locks_insert_block(): takes blocked_lock_lock for
 * the duration of the insertion.
 *
 * Must be called with flc_lock held.
 */
static void
locks_insert_block(struct file_lock *blocker, struct file_lock *waiter,
                   bool conflict(struct file_lock *, struct file_lock *))
{
        spin_lock(&blocked_lock_lock);
        __locks_insert_block(blocker, waiter, conflict);
        spin_unlock(&blocked_lock_lock);
}

/*
 * Wake up processes blocked waiting for blocker.
 *
 * Must be called with the inode->flc_lock held!
 */
static void locks_wake_up_blocks(struct file_lock *blocker)
{
        /*
         * An empty list lets us skip the global lock entirely. That is safe
         * because blocked requests are only ever added under the flc_lock,
         * which is held here. Removal from fl_blocked_requests does not
         * need the flc_lock, so list_empty() has to be rechecked once the
         * blocked_lock_lock has been taken.
         */
        if (!list_empty(&blocker->fl_blocked_requests)) {
                spin_lock(&blocked_lock_lock);
                __locks_wake_up_blocks(blocker);
                spin_unlock(&blocked_lock_lock);
        }
}

/*
 * Link @fl into a lock-context list immediately ahead of @before and add it
 * to the global lock list.
 */
static void locks_insert_lock_ctx(struct file_lock *fl,
                                  struct list_head *before)
{
        list_add_tail(&fl->fl_list, before);
        locks_insert_global_locks(fl);
}

/*
 * Take @fl off the global lock list and its lock-context list, then wake
 * whatever was blocked on it.
 */
static void locks_unlink_lock_ctx(struct file_lock *fl)
{
        locks_delete_global_locks(fl);
        list_del_init(&fl->fl_list);

        locks_wake_up_blocks(fl);
}

/*
 * Unlink @fl and get rid of it: with a @dispose list the lock is queued
 * there for the caller to dispose of, without one it is freed right away.
 */
static void locks_delete_lock_ctx(struct file_lock *fl,
                                  struct list_head *dispose)
{
        locks_unlink_lock_ctx(fl);

        if (!dispose) {
                locks_free_lock(fl);
                return;
        }
        list_add(&fl->fl_list, dispose);
}

/*
 * Determine if lock sys_fl blocks lock caller_fl. Common functionality:
 * two locks conflict as soon as either of them is a write lock.
 */
static bool locks_conflict(struct file_lock *caller_fl,
                           struct file_lock *sys_fl)
{
        return sys_fl->fl_type == F_WRLCK || caller_fl->fl_type == F_WRLCK;
}

/*
 * Determine if lock sys_fl blocks lock caller_fl. POSIX specific checks
 * run before locks_conflict(): locks with the same owner never conflict,
 * and neither do locks whose ranges do not overlap.
 */
static bool posix_locks_conflict(struct file_lock *caller_fl,
                                 struct file_lock *sys_fl)
{
        return !posix_same_owner(caller_fl, sys_fl) &&
               locks_overlap(caller_fl, sys_fl) &&
               locks_conflict(caller_fl, sys_fl);
}

/*
 * Determine if lock sys_fl blocks lock caller_fl. FLOCK specific checks
 * run before locks_conflict().
 */
static bool flock_locks_conflict(struct file_lock *caller_fl,
                                 struct file_lock *sys_fl)
{
        /* FLOCK locks referring to the same filp never conflict. */
        if (caller_fl->fl_file == sys_fl->fl_file)
                return false;

        /* Nor does any pair in which either lock carries LOCK_MAND. */
        if ((caller_fl->fl_type | sys_fl->fl_type) & LOCK_MAND)
                return false;

        return locks_conflict(caller_fl, sys_fl);
}

/*
 * posix_test_lock - look for a POSIX lock that conflicts with @fl
 * @filp: file whose inode's POSIX lock list is searched
 * @fl: describes the lock to test; overwritten with the result
 *
 * On return @fl either holds a copy (made with locks_copy_conflock()) of
 * the first conflicting lock found, or has fl_type set to F_UNLCK when
 * nothing conflicts.
 */
void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
        struct file_lock *cfl;
        struct file_lock_context *ctx;
        struct inode *inode = locks_inode(filp);
        void *owner;
        void (*func)(void);

        /* Lockless fast path: no context or no POSIX locks at all. */
        ctx = locks_inode_context(inode);
        if (!ctx || list_empty_careful(&ctx->flc_posix)) {
                fl->fl_type = F_UNLCK;
                return;
        }

retry:
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
                if (!posix_locks_conflict(fl, cfl))
                        continue;
                if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
                        && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
                        /*
                         * The conflicting lock is expirable: take a module
                         * reference on its lock manager, drop flc_lock,
                         * call lm_expire_lock, then rescan the list from
                         * the start.
                         */
                        owner = cfl->fl_lmops->lm_mod_owner;
                        func = cfl->fl_lmops->lm_expire_lock;
                        __module_get(owner);
                        spin_unlock(&ctx->flc_lock);
                        (*func)();
                        module_put(owner);
                        goto retry;
                }
                locks_copy_conflock(fl, cfl);
                goto out;
        }
        /* The whole list was walked without finding a conflict. */
        fl->fl_type = F_UNLCK;
out:
        spin_unlock(&ctx->flc_lock);
        return;
}
EXPORT_SYMBOL(posix_test_lock);

/*
 * Deadlock detection:
 *
 * We attempt to detect deadlocks that are due purely to posix file
 * locks.
 *
 * We assume that a task can be waiting for at most one lock at a time.
 * So for any acquired lock, the process holding that lock may be
 * waiting on at most one other lock.  That lock in turn may be held by
 * someone waiting for at most one other lock.  Given a requested lock
 * caller_fl which is about to wait for a conflicting lock block_fl, we
 * follow this chain of waiters to ensure we are not about to create a
 * cycle.
 *
 * Since we do this before we ever put a process to sleep on a lock, we
 * are ensured that there is never a cycle; that is what guarantees that
 * the while() loop in posix_locks_deadlock() eventually completes.
 *
 * Note: the above assumption may not be true when handling lock
 * requests from a broken NFS client. It may also fail in the presence
 * of tasks (such as posix threads) sharing the same open file table.
 * To handle those cases, we just bail out after a few iterations.
 *
 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
 * Because the owner is not even nominally tied to a thread of
 * execution, the deadlock detection below can't reasonably work well. Just
 * skip it for those.
 *
 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
 * locks that just checks for the case where two tasks are attempting to
 * upgrade from read to write locks on the same inode.
 */

/*
 * Bound on how far posix_locks_deadlock() follows a chain of waiters before
 * it gives up and reports "no deadlock".
 */
#define MAX_DEADLK_ITERATIONS 10

/*
 * Find a lock that the owner of the given block_fl is blocking on.
 *
 * Looks in blocked_hash for a waiter with the same owner as @block_fl and,
 * if one exists, follows its fl_blocker pointers up to the lock at the top
 * of the chain.  Returns NULL when that owner is not waiting on anything.
 */
static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
{
        struct file_lock *waiter;

        hash_for_each_possible(blocked_hash, waiter, fl_link,
                               posix_owner_key(block_fl)) {
                if (!posix_same_owner(waiter, block_fl))
                        continue;

                while (waiter->fl_blocker)
                        waiter = waiter->fl_blocker;
                return waiter;
        }

        return NULL;
}

/*
 * Decide whether letting @caller_fl wait on @block_fl would close a cycle
 * of waiters.  Returns 1 if a deadlock is detected, 0 otherwise (including
 * when the search is abandoned after too many steps).
 *
 * Must be called with the blocked_lock_lock held!
 */
static int posix_locks_deadlock(struct file_lock *caller_fl,
                                struct file_lock *block_fl)
{
        int hops = 0;

        lockdep_assert_held(&blocked_lock_lock);

        /*
         * FL_OFDLCK locks aren't owned by a process, per-se, so this
         * detector can't reasonably find deadlocks involving them.
         */
        if (IS_OFDLCK(caller_fl))
                return 0;

        for (;;) {
                block_fl = what_owner_is_waiting_for(block_fl);
                if (!block_fl)
                        break;
                if (hops++ > MAX_DEADLK_ITERATIONS)
                        return 0;
                if (posix_same_owner(caller_fl, block_fl))
                        return 1;
        }

        return 0;
}

/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
 * after any leases, but before any posix locks.
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Returns 0 on success, -ENOMEM when the lock context or the new lock
 * cannot be allocated, -EAGAIN on a conflict without FL_SLEEP, and
 * FILE_LOCK_DEFERRED when the request was queued behind a conflicting lock.
 */
static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
        struct file_lock *new_fl = NULL;
        struct file_lock *fl;
        struct file_lock_context *ctx;
        int error = 0;
        bool found = false;
        LIST_HEAD(dispose);

        ctx = locks_get_lock_context(inode, request->fl_type);
        if (!ctx) {
                /* Without a context an unlock has nothing to remove. */
                if (request->fl_type != F_UNLCK)
                        return -ENOMEM;
                return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
        }

        /*
         * Allocate before taking the spinlock; a new lock is only needed
         * when the request will actually insert one.
         */
        if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
                new_fl = locks_alloc_lock();
                if (!new_fl)
                        return -ENOMEM;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        if (request->fl_flags & FL_ACCESS)
                goto find_conflict;

        /*
         * Look for a flock already held through this file: nothing to do
         * if it has the requested type, otherwise remove it.
         */
        list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
                if (request->fl_file != fl->fl_file)
                        continue;
                if (request->fl_type == fl->fl_type)
                        goto out;
                found = true;
                locks_delete_lock_ctx(fl, &dispose);
                break;
        }

        if (request->fl_type == F_UNLCK) {
                if ((request->fl_flags & FL_EXISTS) && !found)
                        error = -ENOENT;
                goto out;
        }

find_conflict:
        list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
                if (!flock_locks_conflict(request, fl))
                        continue;
                error = -EAGAIN;
                if (!(request->fl_flags & FL_SLEEP))
                        goto out;
                /* The caller may sleep: queue the request on the blocker. */
                error = FILE_LOCK_DEFERRED;
                locks_insert_block(fl, request, flock_locks_conflict);
                goto out;
        }
        if (request->fl_flags & FL_ACCESS)
                goto out;
        /* No conflict: install the copy; NULL new_fl so it is not freed. */
        locks_copy_lock(new_fl, request);
        locks_move_blocks(new_fl, request);
        locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
        new_fl = NULL;
        error = 0;

out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        if (new_fl)
                locks_free_lock(new_fl);
        locks_dispose_list(&dispose);
        trace_flock_lock_inode(inode, request, error);
        return error;
}

/*
 * Apply the POSIX lock request @request to @inode.
 *
 * @conflock, when non-NULL, receives a copy of the first conflicting lock
 * that is found.
 *
 * Returns 0 on success, -EAGAIN on a conflict without FL_SLEEP,
 * FILE_LOCK_DEFERRED when the request was queued behind a blocker,
 * -EDEADLK when queuing it would deadlock, -ENOLCK when a preallocated lock
 * that turned out to be needed is missing, -ENOENT for an FL_EXISTS unlock
 * that found nothing to unlock, and -ENOMEM when no lock context could be
 * obtained for a non-unlock request.
 */
static int posix_lock_inode(struct inode *inode, struct file_lock *request,
                            struct file_lock *conflock)
{
        struct file_lock *fl, *tmp;
        struct file_lock *new_fl = NULL;
        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
        struct file_lock_context *ctx;
        int error;
        bool added = false;
        LIST_HEAD(dispose);
        void *owner;
        void (*func)(void);

        ctx = locks_get_lock_context(inode, request->fl_type);
        if (!ctx)
                return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;

        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
         *
         * In some cases we can be sure, that no new locks will be needed
         *
         * Allocation failure is not checked here: a NULL new_fl/new_fl2 is
         * only reported (as -ENOLCK) if that lock turns out to be needed.
         */
        if (!(request->fl_flags & FL_ACCESS) &&
            (request->fl_type != F_UNLCK ||
             request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
                new_fl = locks_alloc_lock();
                new_fl2 = locks_alloc_lock();
        }

retry:
        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /*
         * New lock request. Walk all POSIX locks and look for conflicts. If
         * there are any, either return error or put the request on the
         * blocker's list of waiters and the global blocked_hash.
         */
        if (request->fl_type != F_UNLCK) {
                list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
                        if (!posix_locks_conflict(request, fl))
                                continue;
                        if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
                                && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
                                /*
                                 * Expirable conflict: take a module
                                 * reference on the lock manager, drop both
                                 * locks, call lm_expire_lock and start over.
                                 */
                                owner = fl->fl_lmops->lm_mod_owner;
                                func = fl->fl_lmops->lm_expire_lock;
                                __module_get(owner);
                                spin_unlock(&ctx->flc_lock);
                                percpu_up_read(&file_rwsem);
                                (*func)();
                                module_put(owner);
                                goto retry;
                        }
                        if (conflock)
                                locks_copy_conflock(conflock, fl);
                        error = -EAGAIN;
                        if (!(request->fl_flags & FL_SLEEP))
                                goto out;
                        /*
                         * Deadlock detection and insertion into the blocked
                         * locks list must be done while holding the same lock!
                         */
                        error = -EDEADLK;
                        spin_lock(&blocked_lock_lock);
                        /*
                         * Ensure that we don't find any locks blocked on this
                         * request during deadlock detection.
                         */
                        __locks_wake_up_blocks(request);
                        if (likely(!posix_locks_deadlock(request, fl))) {
                                error = FILE_LOCK_DEFERRED;
                                __locks_insert_block(fl, request,
                                                     posix_locks_conflict);
                        }
                        spin_unlock(&blocked_lock_lock);
                        goto out;
                }
        }

        /* If we're just looking for a conflict, we're done. */
        error = 0;
        if (request->fl_flags & FL_ACCESS)
                goto out;

        /* Find the first old lock with the same owner as the new lock */
        list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
                if (posix_same_owner(request, fl))
                        break;
        }

        /* Process locks with this owner. */
        list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
                if (!posix_same_owner(request, fl))
                        break;

                /* Detect adjacent or overlapping regions (if same lock type) */
                if (request->fl_type == fl->fl_type) {
                        /* In all comparisons of start vs end, use
                         * "start - 1" rather than "end + 1". If end
                         * is OFFSET_MAX, end + 1 will become negative.
                         */
                        if (fl->fl_end < request->fl_start - 1)
                                continue;
                        /* If the next lock in the list has entirely bigger
                         * addresses than the new one, insert the lock here.
                         */
                        if (fl->fl_start - 1 > request->fl_end)
                                break;

                        /* If we come here, the new and old lock are of the
                         * same type and adjacent or overlapping. Make one
                         * lock yielding from the lower start address of both
                         * locks to the higher end address.
                         */
                        if (fl->fl_start > request->fl_start)
                                fl->fl_start = request->fl_start;
                        else
                                request->fl_start = fl->fl_start;
                        if (fl->fl_end < request->fl_end)
                                fl->fl_end = request->fl_end;
                        else
                                request->fl_end = fl->fl_end;
                        if (added) {
                                locks_delete_lock_ctx(fl, &dispose);
                                continue;
                        }
                        /*
                         * The existing lock now covers the merged range, so
                         * it stands in for the request from here on.
                         */
                        request = fl;
                        added = true;
                } else {
                        /* Processing for different lock types is a bit
                         * more complex.
                         */
                        if (fl->fl_end < request->fl_start)
                                continue;
                        if (fl->fl_start > request->fl_end)
                                break;
                        if (request->fl_type == F_UNLCK)
                                added = true;
                        if (fl->fl_start < request->fl_start)
                                left = fl;
                        /* If the next lock in the list has a higher end
                         * address than the new one, insert the new one here.
                         */
                        if (fl->fl_end > request->fl_end) {
                                right = fl;
                                break;
                        }
                        if (fl->fl_start >= request->fl_start) {
                                /* The new lock completely replaces an old
                                 * one (This may happen several times).
                                 */
                                if (added) {
                                        locks_delete_lock_ctx(fl, &dispose);
                                        continue;
                                }
                                /*
                                 * Replace the old lock with new_fl, and
                                 * remove the old one. It's safe to do the
                                 * insert here since we know that we won't be
                                 * using new_fl later, and that the lock is
                                 * just replacing an existing lock.
                                 */
                                error = -ENOLCK;
                                if (!new_fl)
                                        goto out;
                                locks_copy_lock(new_fl, request);
                                locks_move_blocks(new_fl, request);
                                request = new_fl;
                                new_fl = NULL;
                                locks_insert_lock_ctx(request, &fl->fl_list);
                                locks_delete_lock_ctx(fl, &dispose);
                                added = true;
                        }
                }
        }

        /*
         * The above code only modifies existing locks in case of merging or
         * replacing. If new lock(s) need to be inserted all modifications are
         * done below this, so it's safe yet to bail out.
         */
        error = -ENOLCK; /* "no luck" */
        /* Splitting one lock in two needs the second preallocated lock. */
        if (right && left == right && !new_fl2)
                goto out;

        error = 0;
        if (!added) {
                if (request->fl_type == F_UNLCK) {
                        if (request->fl_flags & FL_EXISTS)
                                error = -ENOENT;
                        goto out;
                }

                if (!new_fl) {
                        error = -ENOLCK;
                        goto out;
                }
                locks_copy_lock(new_fl, request);
                locks_move_blocks(new_fl, request);
                locks_insert_lock_ctx(new_fl, &fl->fl_list);
                fl = new_fl;
                new_fl = NULL;
        }
        if (right) {
                if (left == right) {
                        /* The new lock breaks the old one in two pieces,
                         * so we have to use the second new lock.
                         */
                        left = new_fl2;
                        new_fl2 = NULL;
                        locks_copy_lock(left, right);
                        locks_insert_lock_ctx(left, &fl->fl_list);
                }
                /* Trim the right-hand lock to start just past the request. */
                right->fl_start = request->fl_end + 1;
                locks_wake_up_blocks(right);
        }
        if (left) {
                /* Trim the left-hand lock to end just before the request. */
                left->fl_end = request->fl_start - 1;
                locks_wake_up_blocks(left);
        }
 out:
        trace_posix_lock_inode(inode, request, error);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        /*
         * Free any unused locks.
         */
        if (new_fl)
                locks_free_lock(new_fl);
        if (new_fl2)
                locks_free_lock(new_fl2);
        locks_dispose_list(&dispose);

        return error;
}

/**
 * posix_lock_file - Apply a POSIX-style lock to a file
 * @filp: The file to apply the lock to
 * @fl: The lock to be applied
 * @conflock: Place to return a copy of the conflicting lock, if found.
 *
 * Add a POSIX style lock to a file.
 * We merge adjacent & overlapping locks whenever possible.
 * POSIX locks are sorted by owner task, then by starting address
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Return: the result of posix_lock_inode() on the inode behind @filp.
 */
int posix_lock_file(struct file *filp, struct file_lock *fl,
                        struct file_lock *conflock)
{
        struct inode *inode = locks_inode(filp);

        return posix_lock_inode(inode, fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);

/**
 * posix_lock_inode_wait - Apply a POSIX-style lock to a file
 * @inode: inode of file to which lock request should be applied
 * @fl: The lock to be applied
 *
 * Apply a POSIX style lock request to an inode.  While posix_lock_inode()
 * answers FILE_LOCK_DEFERRED, sleep interruptibly until @fl is no longer
 * queued on a blocked list and then try again.
 *
 * Returns the last posix_lock_inode() result, or the non-zero value of
 * wait_event_interruptible() if the sleep did not complete normally.
 */
static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;

        might_sleep();
        do {
                error = posix_lock_inode(inode, fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                error = wait_event_interruptible(fl->fl_wait,
                                        list_empty(&fl->fl_blocked_member));
        } while (!error);

        /* Always executed, whichever way the loop was left. */
        locks_delete_block(fl);
        return error;
}

#ifdef CONFIG_MANDATORY_FILE_LOCKING
/**
 * locks_mandatory_locked - Check for an active lock
 * @file: the file to check
 *
 * Searches the inode's list of locks to find any POSIX locks which conflict.
 * This function is called from locks_verify_locked() only.
 *
 * Returns -EAGAIN if a POSIX lock is found whose owner is neither
 * current->files nor @file, and 0 otherwise.
 */
int locks_mandatory_locked(struct file *file)
{
        struct inode *inode = locks_inode(file);
        struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
        struct file_lock *fl;
        int ret = 0;

        /* No lock context, or no POSIX locks recorded: nothing to find. */
        if (!ctx || list_empty_careful(&ctx->flc_posix))
                return 0;

        /*
         * Walk the POSIX lock list under flc_lock and stop at the first
         * lock that belongs to somebody else.
         */
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
                if (fl->fl_owner == current->files || fl->fl_owner == file)
                        continue;
                ret = -EAGAIN;
                break;
        }
        spin_unlock(&ctx->flc_lock);

        return ret;
}

/**
 * locks_mandatory_area - Check for a conflicting lock
 * @inode:        the file to check
 * @filp:       how the file was opened (if it was)
 * @start:        first byte in the file to check
 * @end:        last byte in the file to check
 * @type:        %F_WRLCK for a write lock, else %F_RDLCK
 *
 * Searches the inode's list of locks to find any POSIX locks which conflict.
 *
 * The probe is an on-stack FL_POSIX | FL_ACCESS request.  When @filp is
 * given it is tried first with @filp as owner and without FL_SLEEP; the
 * request is then repeated with current->files as owner, with FL_SLEEP set
 * if @filp was opened without O_NONBLOCK.
 */
int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
                         loff_t end, unsigned char type)
{
        struct file_lock fl;
        int error;
        bool sleep = filp && !(filp->f_flags & O_NONBLOCK);

        locks_init_lock(&fl);
        fl.fl_pid = current->tgid;
        fl.fl_file = filp;
        fl.fl_flags = FL_POSIX | FL_ACCESS;
        fl.fl_type = type;
        fl.fl_start = start;
        fl.fl_end = end;

        for (;;) {
                /* First attempt: non-sleeping, owned by the open file. */
                if (filp) {
                        fl.fl_owner = filp;
                        fl.fl_flags &= ~FL_SLEEP;
                        error = posix_lock_inode(inode, &fl, NULL);
                        if (!error)
                                break;
                }

                /* Second attempt: owned by the caller's files_struct. */
                if (sleep)
                        fl.fl_flags |= FL_SLEEP;
                fl.fl_owner = current->files;
                error = posix_lock_inode(inode, &fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;

                error = wait_event_interruptible(fl.fl_wait,
                                        list_empty(&fl.fl_blocked_member));
                /*
                 * If we've been sleeping someone might have changed the
                 * permissions behind our back, so only go round again
                 * after a clean wakeup with mandatory locking still on.
                 */
                if (error || !__mandatory_lock(inode))
                        break;
        }
        locks_delete_block(&fl);

        return error;
}
EXPORT_SYMBOL(locks_mandatory_area);
#endif /* CONFIG_MANDATORY_FILE_LOCKING */

/*
 * Drop the "pending" markers that a change to lease type @arg satisfies:
 * F_UNLCK clears both the unlock and the downgrade marker, F_RDLCK clears
 * only the downgrade marker, anything else leaves the flags untouched.
 */
static void lease_clear_pending(struct file_lock *fl, int arg)
{
        if (arg == F_UNLCK)
                fl->fl_flags &= ~(FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
        else if (arg == F_RDLCK)
                fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
}

/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
{
        int error = assign_type(fl, arg);

        if (error)
                return error;
        lease_clear_pending(fl, arg);
        locks_wake_up_blocks(fl);
        if (arg == F_UNLCK) {
                struct file *filp = fl->fl_file;

                f_delown(filp);
                filp->f_owner.signum = 0;
                fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
                if (fl->fl_fasync != NULL) {
                        printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
                        fl->fl_fasync = NULL;
                }
                locks_delete_lock_ctx(fl, dispose);
        }
        return 0;
}
EXPORT_SYMBOL(lease_modify);

/*
 * Report whether the jiffies timestamp @then lies in the past.
 * 0 is a special value meaning "this never expires".
 */
static bool past_time(unsigned long then)
{
        return then != 0 && time_after(jiffies, then);
}

/*
 * Apply every lease deadline on @inode that has already passed: an expired
 * downgrade time turns the lease into F_RDLCK, an expired break time unlocks
 * it.  Both deadlines are checked for each lease, in that order.
 *
 * Must be called with the context's flc_lock held.  @dispose is forwarded
 * to lease_modify().
 */
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock *lease, *next;

        lockdep_assert_held(&ctx->flc_lock);

        /* The _safe iterator is used because lease_modify() may unlink. */
        list_for_each_entry_safe(lease, next, &ctx->flc_lease, fl_list) {
                trace_time_out_leases(inode, lease);
                if (past_time(lease->fl_downgrade_time))
                        lease_modify(lease, F_RDLCK, dispose);
                if (past_time(lease->fl_break_time))
                        lease_modify(lease, F_UNLCK, dispose);
        }
}

/*
 * Decide whether @breaker conflicts with the existing @lease.
 *
 * There is no conflict when the lease manager says the breaker owns the
 * lease (this case returns without tracing), when exactly one of the two has
 * FL_LAYOUT set, or when an FL_DELEG breaker meets an FL_LEASE lease.
 * Otherwise the answer comes from locks_conflict().
 */
static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
{
        bool rc;

        if (lease->fl_lmops->lm_breaker_owns_lease
                        && lease->fl_lmops->lm_breaker_owns_lease(lease))
                return false;

        if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
                rc = false;
        else if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
                rc = false;
        else
                rc = locks_conflict(breaker, lease);

        trace_leases_conflict(rc, lease, breaker);
        return rc;
}

static bool
any_leases_conflict(struct inode *inode, struct file_lock *breaker)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock *fl;

        lockdep_assert_held(&ctx->flc_lock);

        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                if (leases_conflict(fl, breaker))
                        return true;
        }
        return false;
}

/**
 *        __break_lease        -        revoke all outstanding leases on file
 *        @inode: the inode of the file to return
 *        @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
 *            break all leases
 *        @type: FL_LEASE: break leases and delegations; FL_DELEG: break
 *            only delegations
 *
 *        break_lease (inlined for speed) has checked there already is at least
 *        some kind of lock (maybe a lease) on this file.  Leases are broken on
 *        a call to open() or truncate().  This function can sleep unless you
 *        specified %O_NONBLOCK to your open().
 */
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        int error = 0;
        struct file_lock_context *ctx;
        struct file_lock *new_fl, *fl, *tmp;
        unsigned long break_time;
        /* Any access mode other than O_RDONLY is treated as a write open. */
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        /* Filled in while flc_lock is held, emptied by locks_dispose_list(). */
        LIST_HEAD(dispose);

        /*
         * new_fl describes the breaker.  In this function it is only used
         * for conflict checks and as the waiter passed to
         * locks_insert_block(); it is freed again before returning.
         */
        new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
        if (IS_ERR(new_fl))
                return PTR_ERR(new_fl);
        new_fl->fl_flags = type;

        /* typically we will check that ctx is non-NULL before calling */
        ctx = locks_inode_context(inode);
        if (!ctx) {
                /* error is still 0 here, so this path returns 0. */
                WARN_ON_ONCE(1);
                goto free_lock;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        /* Apply already-expired deadlines before looking for conflicts. */
        time_out_leases(inode, &dispose);

        if (!any_leases_conflict(inode, new_fl))
                goto out;

        /*
         * Deadline handed to the leases marked below: now plus
         * lease_break_time seconds, or 0 when lease_break_time is not
         * positive.
         */
        break_time = 0;
        if (lease_break_time > 0) {
                break_time = jiffies + lease_break_time * HZ;
                if (break_time == 0)
                        break_time++;        /* so that 0 means no break time */
        }

        /*
         * Mark every conflicting lease as pending (unlock for a write open,
         * downgrade otherwise) unless it is already marked, then notify its
         * manager through lm_break().  A non-zero lm_break() result causes
         * the lease to be deleted straight away.
         */
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
                if (!leases_conflict(fl, new_fl))
                        continue;
                if (want_write) {
                        if (fl->fl_flags & FL_UNLOCK_PENDING)
                                continue;
                        fl->fl_flags |= FL_UNLOCK_PENDING;
                        fl->fl_break_time = break_time;
                } else {
                        if (lease_breaking(fl))
                                continue;
                        fl->fl_flags |= FL_DOWNGRADE_PENDING;
                        fl->fl_downgrade_time = break_time;
                }
                if (fl->fl_lmops->lm_break(fl))
                        locks_delete_lock_ctx(fl, &dispose);
        }

        /* Every lease went away in the loop above: nothing to wait for. */
        if (list_empty(&ctx->flc_lease))
                goto out;

        /* Non-blocking callers are told to retry instead of sleeping. */
        if (mode & O_NONBLOCK) {
                trace_break_lease_noblock(inode, new_fl);
                error = -EWOULDBLOCK;
                goto out;
        }

restart:
        /*
         * Block on the first lease in the list.  Its absolute break time is
         * converted into a relative timeout; a resulting 0 is bumped to 1.
         */
        fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
        break_time = fl->fl_break_time;
        if (break_time != 0)
                break_time -= jiffies;
        if (break_time == 0)
                break_time++;
        locks_insert_block(fl, new_fl, leases_conflict);
        trace_break_lease_block(inode, new_fl);
        /* Both locks are dropped for the duration of the sleep. */
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
        error = wait_event_interruptible_timeout(new_fl->fl_wait,
                                        list_empty(&new_fl->fl_blocked_member),
                                        break_time);

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        trace_break_lease_unblock(inode, new_fl);
        locks_delete_block(new_fl);
        /* A negative result from the wait is returned to the caller as is. */
        if (error >= 0) {
                /*
                 * Wait for the next conflicting lease that has not been
                 * broken yet
                 */
                if (error == 0)
                        time_out_leases(inode, &dispose);
                if (any_leases_conflict(inode, new_fl))
                        goto restart;
                error = 0;
        }
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
free_lock:
        locks_free_lock(new_fl);
        return error;
}
EXPORT_SYMBOL(__break_lease);

/**
 *        lease_get_mtime - update modified time of an inode with exclusive lease
 *        @inode: the inode
 *      @time:  pointer to a timespec which contains the last modified time
 *
 * This is to force NFS clients to flush their caches for files with
 * exclusive leases.  The justification is that if someone has an
 * exclusive lease, then they could be modifying it.
 *
 * @time is overwritten with current_time(@inode) only when the first lease
 * on the inode's lease list is of type F_WRLCK; otherwise it is untouched.
 */
void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock *first;
        bool has_lease;

        /* Lockless fast path: no context or no leases at all. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return;

        spin_lock(&ctx->flc_lock);
        first = list_first_entry_or_null(&ctx->flc_lease,
                                         struct file_lock, fl_list);
        has_lease = first && first->fl_type == F_WRLCK;
        spin_unlock(&ctx->flc_lock);

        if (has_lease)
                *time = current_time(inode);
}
EXPORT_SYMBOL(lease_get_mtime);

/**
 *        fcntl_getlease - Enquire what lease is currently active
 *        @filp: the file
 *
 *        The value returned by this function will be one of
 *        (if no lease break is pending):
 *
 *        %F_RDLCK to indicate a shared lease is held.
 *
 *        %F_WRLCK to indicate an exclusive lease is held.
 *
 *        %F_UNLCK to indicate no lease is held.
 *
 *        (if a lease break is pending):
 *
 *        %F_RDLCK to indicate an exclusive lease needs to be
 *                changed to a shared lease (or removed).
 *
 *        %F_UNLCK to indicate the lease needs to be removed.
 *
 *        XXX: sfr & willy disagree over whether F_INPROGRESS
 *        should be returned to userspace.
 */
int fcntl_getlease(struct file *filp)
{
        struct inode *inode = locks_inode(filp);
        struct file_lock_context *ctx;
        struct file_lock *lease;
        int type = F_UNLCK;
        LIST_HEAD(dispose);

        ctx = locks_inode_context(inode);
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return type;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* Expire overdue leases first so they are not reported. */
        time_out_leases(inode, &dispose);
        /* Report the first lease that was taken through this struct file. */
        list_for_each_entry(lease, &ctx->flc_lease, fl_list) {
                if (lease->fl_file == filp) {
                        type = target_leasetype(lease);
                        break;
                }
        }
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
        return type;
}

/**
 * check_conflicting_open - see if the given file points to an inode that has
 *                            an existing open that would conflict with the
 *                            desired lease.
 * @filp:        file to check
 * @arg:        type of lease that we're trying to acquire
 * @flags:        current lock flags
 *
 * Check to see if there's an existing open fd on this file that would
 * conflict with the lease we're trying to set.
 *
 * Returns 0 when there is no conflict (or nothing is checked here) and
 * -EAGAIN when a conflicting open exists.
 */
static int
check_conflicting_open(struct file *filp, const long arg, int flags)
{
        struct inode *inode = locks_inode(filp);
        int self_wcount = 0, self_rcount = 0;

        /*
         * Nothing is checked for layouts; for delegations we leave these
         * checks to the caller.
         */
        if (flags & (FL_LAYOUT | FL_DELEG))
                return 0;

        switch (arg) {
        case F_RDLCK:
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
        case F_WRLCK:
                break;
        default:
                return 0;
        }

        /*
         * Make sure that only read/write count is from lease requestor.
         * Note that this will result in denying write leases when i_writecount
         * is negative, which is what we want.  (We shouldn't grant write leases
         * on files open for execution.)
         */
        if (filp->f_mode & FMODE_WRITE)
                self_wcount = 1;
        else if (filp->f_mode & FMODE_READ)
                self_rcount = 1;

        if (atomic_read(&inode->i_writecount) == self_wcount &&
            atomic_read(&inode->i_readcount) == self_rcount)
                return 0;

        return -EAGAIN;
}

/*
 * Add the lease *@flp to @filp's inode, or change the type of a lease that
 * the same file/owner pair already holds.
 *
 * Returns 0 on success, -ENOMEM if no lock context could be obtained,
 * -EAGAIN on a conflicting open or lease (or a failed inode trylock for
 * delegations), and -EINVAL for write delegations or when leases_enable is
 * clear.  An lm_change() failure is returned as is.  On success with a newly
 * inserted lease, *@flp is set to NULL.
 */
static int
generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
{
        struct file_lock *fl, *my_fl = NULL, *lease;
        struct inode *inode = locks_inode(filp);
        struct file_lock_context *ctx;
        bool is_deleg = (*flp)->fl_flags & FL_DELEG;
        int error;
        /* Collected under flc_lock, released by locks_dispose_list() below. */
        LIST_HEAD(dispose);

        lease = *flp;
        trace_generic_add_lease(inode, lease);

        /* Note that arg is never F_UNLCK here */
        ctx = locks_get_lock_context(inode, arg);
        if (!ctx)
                return -ENOMEM;

        /*
         * In the delegation case we need mutual exclusion with
         * a number of operations that take the i_mutex.  We trylock
         * because delegations are an optional optimization, and if
         * there's some chance of a conflict--we'd rather not
         * bother, maybe that's a sign this just isn't a good file to
         * hand out a delegation on.
         */
        if (is_deleg && !inode_trylock(inode))
                return -EAGAIN;

        if (is_deleg && arg == F_WRLCK) {
                /* Write delegations are not currently supported: */
                inode_unlock(inode);
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* Expire overdue leases before checking for conflicts. */
        time_out_leases(inode, &dispose);
        error = check_conflicting_open(filp, arg, lease->fl_flags);
        if (error)
                goto out;

        /*
         * At this point, we know that if there is an exclusive
         * lease on this file, then we hold it on this filp
         * (otherwise our open of this file would have blocked).
         * And if we are trying to acquire an exclusive lease,
         * then the file is not open by anyone (including us)
         * except for this filp.
         */
        error = -EAGAIN;
        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                /* Remember a lease this file/owner pair already holds. */
                if (fl->fl_file == filp &&
                    fl->fl_owner == lease->fl_owner) {
                        my_fl = fl;
                        continue;
                }

                /*
                 * No exclusive leases if someone else has a lease on
                 * this file:
                 */
                if (arg == F_WRLCK)
                        goto out;
                /*
                 * Modifying our existing lease is OK, but no getting a
                 * new lease if someone else is opening for write:
                 */
                if (fl->fl_flags & FL_UNLOCK_PENDING)
                        goto out;
        }

        /* Existing lease found: change its type instead of inserting. */
        if (my_fl != NULL) {
                lease = my_fl;
                error = lease->fl_lmops->lm_change(lease, arg, &dispose);
                if (error)
                        goto out;
                goto out_setup;
        }

        error = -EINVAL;
        if (!leases_enable)
                goto out;

        locks_insert_lock_ctx(lease, &ctx->flc_lease);
        /*
         * The check in break_lease() is lockless. It's possible for another
         * open to race in after we did the earlier check for a conflicting
         * open but before the lease was inserted. Check again for a
         * conflicting open and cancel the lease if there is one.
         *
         * We also add a barrier here to ensure that the insertion of the lock
         * precedes these checks.
         */
        smp_mb();
        error = check_conflicting_open(filp, arg, lease->fl_flags);
        if (error) {
                locks_unlink_lock_ctx(lease);
                goto out;
        }

out_setup:
        /* lm_setup is optional; it receives @priv untouched. */
        if (lease->fl_lmops->lm_setup)
                lease->fl_lmops->lm_setup(lease, priv);
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        if (is_deleg)
                inode_unlock(inode);
        /*
         * Success without a pre-existing lease means the caller's lock was
         * inserted above; clear *flp to signal that to the caller.
         */
        if (!error && !my_fl)
                *flp = NULL;
        return error;
}

static int generic_delete_lease(struct file *filp, void *owner)
{
        int error = -EAGAIN;
        struct file_lock *fl, *victim = NULL;
        struct inode *inode = locks_inode(filp);
        struct file_lock_context *ctx;
        LIST_HEAD(dispose);

        ctx = locks_inode_context(inode);
        if (!ctx) {
                trace_generic_delete_lease(inode, NULL);
                return error;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                if (fl->fl_file == filp &&
                    fl->fl_owner == owner) {
                        victim = fl;
                        break;
                }
        }
        trace_generic_delete_lease(inode, victim);
        if (victim)
                error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        return error;
}

/**
 *        generic_setlease        -        sets a lease on an open file
 *        @filp:        file pointer
 *        @arg:        type of lease to obtain
 *        @flp:        input - file_lock to use, output - file_lock inserted
 *        @priv:        private data for lm_setup (may be NULL if lm_setup
 *                doesn't require it)
 *
 *        The (input) flp->fl_lmops->lm_break function is required
 *        by break_lease().
 *
 *        Returns -EACCES if the caller neither owns the inode nor has
 *        CAP_LEASE, -EINVAL for a non-regular file or an unknown @arg, and
 *        -ENOLCK if lm_break is missing.  For %F_UNLCK, *@priv is passed
 *        to generic_delete_lease() as the lease owner.
 */
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
                        void **priv)
{
        struct inode *inode = locks_inode(filp);
        int error;

        if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
                return -EACCES;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        error = security_file_lock(filp, arg);
        if (error)
                return error;

        if (arg == F_UNLCK)
                return generic_delete_lease(filp, *priv);

        if (arg != F_RDLCK && arg != F_WRLCK)
                return -EINVAL;

        /* Adding or changing a lease requires a break callback. */
        if (!(*flp)->fl_lmops->lm_break) {
                WARN_ON_ONCE(1);
                return -ENOLCK;
        }

        return generic_add_lease(filp, arg, flp, priv);
}
EXPORT_SYMBOL(generic_setlease);

#if IS_ENABLED(CONFIG_SRCU)
/*
 * Kernel subsystems can register to be notified on any attempt to set
 * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
 * to close files that it may have cached when there is an attempt to set a
 * conflicting lease.
 */
static struct srcu_notifier_head lease_notifier_chain;

/* Initialise the SRCU notifier head backing lease_notifier_chain. */
static inline void lease_notifier_chain_init(void)
{
        srcu_init_notifier_head(&lease_notifier_chain);
}

/* Run the notifier chain for every lease request except F_UNLCK. */
static inline void setlease_notifier(long arg, struct file_lock *lease)
{
        if (arg == F_UNLCK)
                return;
        srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
}

/* Add @nb to the lease notifier chain; returns the registration result. */
int lease_register_notifier(struct notifier_block *nb)
{
        return srcu_notifier_chain_register(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_register_notifier);

/* Remove @nb from the lease notifier chain. */
void lease_unregister_notifier(struct notifier_block *nb)
{
        srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);

#else /* !IS_ENABLED(CONFIG_SRCU) */

/* Without SRCU there is no notifier chain: the helpers below are no-ops. */
static inline void lease_notifier_chain_init(void)
{
}

static inline void setlease_notifier(long arg, struct file_lock *lease)
{
}

/* Registration trivially succeeds; @nb is not recorded anywhere. */
int lease_register_notifier(struct notifier_block *nb)
{
        return 0;
}
EXPORT_SYMBOL_GPL(lease_register_notifier);

void lease_unregister_notifier(struct notifier_block *nb)
{
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);

#endif /* IS_ENABLED(CONFIG_SRCU) */

/**
 * vfs_setlease        -       sets a lease on an open file
 * @filp:        file pointer
 * @arg:        type of lease to obtain
 * @lease:        file_lock to use when adding a lease
 * @priv:        private info for lm_setup when adding a lease (may be
 *                NULL if lm_setup doesn't require it)
 *
 * Call this to establish a lease on the file. The "lease" argument is not
 * used for F_UNLCK requests and may be NULL. For commands that set or alter
 * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
 * set; if not, this function will return -ENOLCK (and generate a scary-looking
 * stack trace).
 *
 * The "priv" pointer is passed directly to the lm_setup function as-is. It
 * may be NULL if the lm_setup operation doesn't require it.
 *
 * The request goes to the file's own ->setlease() operation when it has one
 * and to generic_setlease() otherwise.
 */
int
vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
{
        /* Let registered notifiers see the request before it is applied. */
        if (lease)
                setlease_notifier(arg, *lease);

        if (!filp->f_op->setlease)
                return generic_setlease(filp, arg, lease, priv);

        return filp->f_op->setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);

static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
{
        struct file_lock *fl;
        struct fasync_struct *new;
        int error;

        fl = lease_alloc(filp, arg);
        if (IS_ERR(fl))
                return PTR_ERR(fl);

        new = fasync_alloc();
        if (!new) {
                locks_free_lock(fl);
                return -ENOMEM;
        }
        new->fa_fd = fd;

        error = vfs_setlease(filp, arg, &fl, (void **)&new);
        if (fl)
                locks_free_lock(fl);
        if (new)
                fasync_free(new);
        return error;
}

/**
 *        fcntl_setlease        -        sets a lease on an open file
 *        @fd: open file descriptor
 *        @filp: file pointer
 *        @arg: type of lease to obtain
 *
 *        Call this fcntl to establish a lease on the file.
 *        Note that you also need to call %F_SETSIG to
 *        receive a signal when the lease is broken.
 *
 *        %F_UNLCK goes straight to vfs_setlease() with no lease and with
 *        a pointer to @filp as the private data; any other @arg is handled
 *        by do_fcntl_add_lease().
 */
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
        if (arg != F_UNLCK)
                return do_fcntl_add_lease(fd, filp, arg);

        return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
}

/**
 * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a FLOCK style lock request to an inode.  While flock_lock_inode()
 * answers FILE_LOCK_DEFERRED, sleep interruptibly until @fl is no longer
 * queued on a blocked list and then try again.
 *
 * Returns the last flock_lock_inode() result, or the non-zero value of
 * wait_event_interruptible() if the sleep did not complete normally.
 */
static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;

        might_sleep();
        do {
                error = flock_lock_inode(inode, fl);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                error = wait_event_interruptible(fl->fl_wait,
                                list_empty(&fl->fl_blocked_member));
        } while (!error);

        /* Always executed, whichever way the loop was left. */
        locks_delete_block(fl);
        return error;
}

/**
 * locks_lock_inode_wait - Apply a lock to an inode
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a POSIX or FLOCK style lock request to an inode.  Exactly one of
 * FL_POSIX and FL_FLOCK must be set in @fl->fl_flags; any other
 * combination hits BUG().
 */
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        unsigned int style = fl->fl_flags & (FL_POSIX|FL_FLOCK);

        if (style == FL_POSIX)
                return posix_lock_inode_wait(inode, fl);
        if (style == FL_FLOCK)
                return flock_lock_inode_wait(inode, fl);

        BUG();
        return 0;
}
EXPORT_SYMBOL(locks_lock_inode_wait);

/**
 *        sys_flock: - flock() system call.
 *        @fd: the file descriptor to lock.
 *        @cmd: the type of lock to apply.
 *
 *        Apply a %FL_FLOCK style lock to an open file descriptor.
 *        The @cmd can be one of:
 *
 *        - %LOCK_SH -- a shared lock.
 *        - %LOCK_EX -- an exclusive lock.
 *        - %LOCK_UN -- remove an existing lock.
 *        - %LOCK_MAND -- a 'mandatory' flock.
 *          This exists to emulate Windows Share Modes.
 *
 *        %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
 *        processes read and write access respectively.
 *
 *        %LOCK_NB may be or'ed into @cmd; without it the request is made
 *        with %FL_SLEEP set.
 */
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
        struct fd f = fdget(fd);
        struct file_lock *lock;
        int can_sleep, unlock;
        int error = -EBADF;

        if (!f.file)
                return error;

        /* Strip LOCK_NB so that the remaining bits name the operation. */
        can_sleep = !(cmd & LOCK_NB);
        cmd &= ~LOCK_NB;
        unlock = (cmd == LOCK_UN);

        /*
         * Apart from unlock and LOCK_MAND requests, the file must be open
         * for reading or writing; otherwise fail with -EBADF.
         */
        if (!unlock && !(cmd & LOCK_MAND) &&
            !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
                goto out_putf;

        lock = flock_make_lock(f.file, cmd, NULL);
        if (IS_ERR(lock)) {
                error = PTR_ERR(lock);
                goto out_putf;
        }

        if (can_sleep)
                lock->fl_flags |= FL_SLEEP;

        error = security_file_lock(f.file, lock->fl_type);
        if (error)
                goto out_free;

        /* Prefer the file's own ->flock() operation when it has one. */
        if (!f.file->f_op->flock)
                error = locks_lock_file_wait(f.file, lock);
        else
                error = f.file->f_op->flock(f.file,
                                          can_sleep ? F_SETLKW : F_SETLK,
                                          lock);

 out_free:
        locks_free_lock(lock);

 out_putf:
        fdput(f);
        return error;
}

/**
 * vfs_test_lock - test file byte range lock
 * @filp: The file to test lock for
 * @fl: The byte-range in the file to test; also used to hold result
 *
 * On entry, @fl does not contain a lock, but identifies a range (fl_start, fl_end)
 * in the file (fl_file), and an owner (fl_owner) for whom existing locks
 * should be ignored.  fl_type and fl_flags are ignored.
 * Both fl_lmops and fl_ops in @fl must be NULL.
 * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
 * setting fl->fl_type to something other than F_UNLCK.
 *
 * If vfs_test_lock() does find a lock and return it, the caller must
 * use locks_free_lock() or locks_release_private() on the returned lock.
 */
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        /* Warn (once each) about callers that break the entry contract. */
        WARN_ON_ONCE(fl->fl_ops || fl->fl_lmops);
        WARN_ON_ONCE(filp != fl->fl_file);

        /* No ->lock() operation on this file: use the generic test. */
        if (!filp->f_op->lock) {
                posix_test_lock(filp, fl);
                return 0;
        }

        return filp->f_op->lock(filp, F_GETLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_test_lock);

/**
 * locks_translate_pid - translate a file_lock's fl_pid number into a namespace
 * @fl: The file_lock whose fl_pid should be translated
 * @ns: The namespace into which the pid should be translated
 *
 * Used to translate a fl_pid into a namespace virtual pid number.
 * Returns -1 for OFD locks and the untranslated fl_pid for remote locks.
 */
static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
{
        struct pid *pid;
        pid_t vnr;

        if (IS_OFDLCK(fl))
                return -1;

        /*
         * Remote locks are reported untranslated.  The same goes for the
         * init pidns: if the flock owner process is dead and its pid has
         * been already freed, the translation below won't work, but we
         * still want to show flock owner pid number in init pidns.
         */
        if (IS_REMOTELCK(fl) || ns == &init_pid_ns)
                return (pid_t)fl->fl_pid;

        rcu_read_lock();
        pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
        vnr = pid_nr_ns(pid, ns);
        rcu_read_unlock();

        return vnr;
}

/*
 * Fill the user-visible struct flock @flock from the kernel lock @fl.
 * An fl_end of OFFSET_MAX is reported as l_len == 0; l_whence is always 0.
 *
 * Returns 0, or -EOVERFLOW on 32-bit builds when the range does not fit.
 */
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
        /*
         * Make sure we can represent the posix lock via
         * legacy 32bit flock.
         */
        if (fl->fl_start > OFFT_OFFSET_MAX ||
            (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX))
                return -EOVERFLOW;
#endif
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
        flock->l_whence = 0;
        flock->l_type = fl->fl_type;

        return 0;
}

#if BITS_PER_LONG == 32
/*
 * Fill the user-visible struct flock64 @flock from the kernel lock @fl.
 * An fl_end of OFFSET_MAX is reported as l_len == 0; l_whence is always 0.
 */
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
        flock->l_whence = 0;
        flock->l_type = fl->fl_type;
}
#endif

/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;
        error = -EINVAL;
        if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
                goto out;

        error = flock_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_GETLK;
                fl->fl_flags |= FL_OFDLCK;
                fl->fl_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->fl_type;
        if (fl->fl_type != F_UNLCK) {
                error = posix_lock_to_flock(flock, fl);
                if (error)
                        goto out;
        }
out:
        locks_free_lock(fl);
        return error;
}

/**
 * vfs_lock_file - file byte range lock
 * @filp: The file to apply the lock to
 * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
 * @fl: The lock to be applied
 * @conf: Place to return a copy of the conflicting lock, if found.
 *
 * A caller that doesn't care about the conflicting lock may pass NULL
 * as the final argument.
 *
 * If the filesystem defines a private ->lock() method, then @conf will
 * be left unchanged; so a caller that cares should initialize it to
 * some acceptable default.
 *
 * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
 * locks, the ->lock() interface may return asynchronously, before the lock has
 * been granted or denied by the underlying filesystem, if (and only if)
 * lm_grant is set. Callers expecting ->lock() to return asynchronously
 * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
 * the request is for a blocking lock. When ->lock() does return asynchronously,
 * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
 * request completes.
 * If the request is for non-blocking lock the file system should return
 * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
 * with the result. If the request timed out the callback routine will return a
 * nonzero return code and the file system should release the lock. The file
 * system is also responsible to keep a corresponding posix lock when it
 * grants a lock so the VFS can find out which locks are locally held and do
 * the correct lock cleanup when required.
 * The underlying filesystem must not drop the kernel lock or call
 * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
 * return code.
 */
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
        /* No private ->lock() method: use the generic POSIX implementation. */
        if (!filp->f_op->lock)
                return posix_lock_file(filp, fl, conf);

        /* The filesystem's own method is not handed @conf. */
        return filp->f_op->lock(filp, cmd, fl);
}
EXPORT_SYMBOL_GPL(vfs_lock_file);

/*
 * Run the security hook, then keep submitting @fl through vfs_lock_file()
 * for as long as the request comes back FILE_LOCK_DEFERRED, sleeping
 * interruptibly between attempts until @fl is off the blocked list.
 *
 * Returns the security hook's error, the final vfs_lock_file() result, or the
 * non-zero value from an interrupted wait.
 */
static int do_lock_file_wait(struct file *filp, unsigned int cmd,
                             struct file_lock *fl)
{
        int rc = security_file_lock(filp, fl->fl_type);

        if (rc)
                return rc;

        do {
                rc = vfs_lock_file(filp, cmd, fl, NULL);
                if (rc != FILE_LOCK_DEFERRED)
                        break;

                rc = wait_event_interruptible(fl->fl_wait,
                                        list_empty(&fl->fl_blocked_member));
        } while (!rc);

        /* Done on every exit from the loop, successful or not. */
        locks_delete_block(fl);

        return rc;
}

/*
 * Ensure that fl->fl_file has a compatible f_mode for F_SETLK calls: a read
 * lock needs FMODE_READ and a write lock needs FMODE_WRITE.  Any other lock
 * type passes unchecked.
 *
 * Returns 0 if the mode is acceptable, -EBADF otherwise.
 */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
        if (fl->fl_type == F_RDLCK && !(fl->fl_file->f_mode & FMODE_READ))
                return -EBADF;

        if (fl->fl_type == F_WRLCK && !(fl->fl_file->f_mode & FMODE_WRITE))
                return -EBADF;

        return 0;
}

/* Apply the lock described by @flock to the open file @filp.
 * This implements both the F_SETLK and F_SETLKW commands of fcntl(), and
 * their OFD counterparts F_OFD_SETLK and F_OFD_SETLKW.
 *
 * @fd is the descriptor the request came in on; it is only used after the
 * lock has been set, to detect a concurrent close().
 *
 * Returns 0 on success or a negative errno: -ENOLCK if no file_lock could be
 * allocated, -EAGAIN for a mandatory-lock inode that is writably mapped,
 * -EINVAL for an OFD command with a non-zero l_pid, -EBADF if @fd no longer
 * refers to @filp once the lock is in place, or whatever the helpers called
 * along the way return.
 */
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct inode *inode = locks_inode(filp);
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        /* Don't allow mandatory locks on files that may be memory mapped
         * and shared.
         */
        if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
                error = -EAGAIN;
                goto out;
        }

        /* Convert the userspace description into file_lock. */
        error = flock_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        /* The file must be open in a mode matching the lock type. */
        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         *
         * Both OFD commands are rewritten to their classic equivalents, and
         * both blocking variants end up with FL_SLEEP set.  Plain F_SETLK
         * matches no case and is passed on unchanged.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK;
                file_lock->fl_flags |= FL_OFDLCK;
                file_lock->fl_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW;
                file_lock->fl_flags |= FL_OFDLCK;
                file_lock->fl_owner = filp;
                fallthrough;
        case F_SETLKW:
                file_lock->fl_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Detect close/fcntl races and recover by zapping all POSIX locks
         * associated with this file and our files_struct, just like on
         * filp_flush(). There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->fl_type != F_UNLCK &&
            !(file_lock->fl_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /* fd no longer maps to filp: undo the lock and fail. */
                if (f != filp) {
                        locks_remove_posix(filp, files);
                        error = -EBADF;
                }
        }
out:
        /* Every exit past the allocation check is traced and frees file_lock. */
        trace_fcntl_setlk(inode, file_lock, error);
        locks_free_lock(file_lock);
        return error;
}

#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;

        error = -EINVAL;
        if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
                goto out;

        error = flock64_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_GETLK64;
                fl->fl_flags |= FL_OFDLCK;
                fl->fl_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->fl_type;
        if (fl->fl_type != F_UNLCK)
                posix_lock_to_flock64(flock, fl);

out:
        locks_free_lock(fl);
        return error;
}

/* Apply the lock described by @flock to the open file @filp.
 * This is the struct flock64 counterpart of fcntl_setlk(), built only when
 * BITS_PER_LONG == 32.  It implements F_SETLK64 and F_SETLKW64 as well as
 * F_OFD_SETLK and F_OFD_SETLKW.
 *
 * @fd is the descriptor the request came in on; it is only used after the
 * lock has been set, to detect a concurrent close().
 *
 * Returns 0 on success or a negative errno: -ENOLCK if no file_lock could be
 * allocated, -EAGAIN for a mandatory-lock inode that is writably mapped,
 * -EINVAL for an OFD command with a non-zero l_pid, -EBADF if @fd no longer
 * refers to @filp once the lock is in place, or whatever the helpers called
 * along the way return.
 */
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock64 *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct inode *inode = locks_inode(filp);
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        /* Don't allow mandatory locks on files that may be memory mapped
         * and shared.
         */
        if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
                error = -EAGAIN;
                goto out;
        }

        /* Convert the userspace description into file_lock. */
        error = flock64_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        /* The file must be open in a mode matching the lock type. */
        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         *
         * Both OFD commands are rewritten to their 64-bit classic
         * equivalents, and both blocking variants end up with FL_SLEEP set.
         * Plain F_SETLK64 matches no case and is passed on unchanged.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK64;
                file_lock->fl_flags |= FL_OFDLCK;
                file_lock->fl_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW64;
                file_lock->fl_flags |= FL_OFDLCK;
                file_lock->fl_owner = filp;
                fallthrough;
        case F_SETLKW64:
                file_lock->fl_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Detect close/fcntl races and recover by zapping all POSIX locks
         * associated with this file and our files_struct, just like on
         * filp_flush(). There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->fl_type != F_UNLCK &&
            !(file_lock->fl_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /* fd no longer maps to filp: undo the lock and fail. */
                if (f != filp) {
                        locks_remove_posix(filp, files);
                        error = -EBADF;
                }
        }
out:
        /*
         * NOTE(review): fcntl_setlk() calls trace_fcntl_setlk() at this
         * point; this variant does not - confirm whether that is intended.
         */
        locks_free_lock(file_lock);
        return error;
}
#endif /* BITS_PER_LONG == 32 */

/*
 * This function is called when the file is being removed from the task's fd
 * array.  POSIX locks belonging to this task are deleted at this time, by
 * submitting a single F_UNLCK request for @owner that covers the whole file.
 */
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
        struct inode *inode = locks_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock unlock;
        int error;

        /*
         * With no locks held on this file there is no need to issue the
         * unlock at all.  Another process could be setting a lock on this
         * file at the same time, but that lock would not be removed anyway.
         */
        if (!ctx)
                return;
        if (list_empty(&ctx->flc_posix))
                return;

        locks_init_lock(&unlock);
        unlock.fl_owner = owner;
        unlock.fl_pid = current->tgid;
        unlock.fl_file = filp;
        unlock.fl_flags = FL_POSIX | FL_CLOSE;
        unlock.fl_type = F_UNLCK;
        unlock.fl_start = 0;
        unlock.fl_end = OFFSET_MAX;
        unlock.fl_ops = NULL;
        unlock.fl_lmops = NULL;

        error = vfs_lock_file(filp, F_SETLK, &unlock, NULL);

        /* fl_ops was cleared above; release private data if the call set it. */
        if (unlock.fl_ops && unlock.fl_ops->fl_release_private)
                unlock.fl_ops->fl_release_private(&unlock);
        trace_locks_remove_posix(inode, &unlock, error);
}
EXPORT_SYMBOL(locks_remove_posix);

/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
        struct file_lock fl;
        struct inode *inode = locks_inode(filp);

        if (list_empty(&flctx->flc_flock))
                return;

        flock_make_lock(filp, LOCK_UN, &fl);
        fl.fl_flags |= FL_CLOSE;

        if (filp->f_op->flock)
                filp->f_op->flock(filp, F_SETLKW, &fl);
        else
                flock_lock_inode(inode, &fl);

        if (fl.fl_ops && fl.fl_ops->fl_release_private)
                fl.fl_ops->fl_release_private(&fl);
}

/* The i_flctx must be valid when calling into here */
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
        struct file_lock *fl, *tmp;
        LIST_HEAD(dispose);

        if (list_empty(&ctx->flc_lease))
                return;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
                if (filp == fl->fl_file)
                        lease_modify(fl, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
}

/*
 * This function is called on the last close of an open file.
 */
void locks_remove_file(struct file *filp)
{
        struct file_lock_context *ctx;

        ctx = locks_inode_context(locks_inode(filp));
        if (!ctx)
                return;

        /* remove any OFD locks */
        locks_remove_posix(filp, filp);

        /* remove flock locks */
        locks_remove_flock(filp, ctx);

        /* remove any leases */
        locks_remove_lease(filp, ctx);

        spin_lock(&ctx->flc_lock);
        locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX");
        locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK");
        locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE");
        spin_unlock(&ctx->flc_lock);
}

/**
 * vfs_cancel_lock - file byte range unblock lock
 * @filp: The file to apply the unblock to
 * @fl: The lock to be unblocked
 *
 * Used by lock managers to cancel blocked requests
 */
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        /* Without a private ->lock() method there is nothing to call. */
        if (!filp->f_op->lock)
                return 0;

        return filp->f_op->lock(filp, F_CANCELLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);

/**
 * vfs_inode_has_locks - are any file locks held on @inode?
 * @inode: inode to check for locks
 *
 * Return true if there are any FL_POSIX or FL_FLOCK locks currently
 * set on @inode.
 */
bool vfs_inode_has_locks(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        bool ret = false;

        /* An inode without a lock context holds no locks. */
        if (ctx) {
                spin_lock(&ctx->flc_lock);
                ret = !(list_empty(&ctx->flc_posix) &&
                        list_empty(&ctx->flc_flock));
                spin_unlock(&ctx->flc_lock);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(vfs_inode_has_locks);

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Private seq_file state used while walking the lock list for /proc/locks. */
struct locks_iterator {
        /* CPU cursor handed to seq_hlist_start_percpu()/seq_hlist_next_percpu() */
        int        li_cpu;
        /* Set to *pos + 1 by locks_start(), bumped by locks_next(); used as the printed id */
        loff_t        li_pos;
};

/*
 * Print one line describing @fl to the seq_file @f.
 *
 * @f:   seq_file to print into; its superblock selects the pid namespace
 *       that the owner pid is translated into
 * @fl:  lock to describe
 * @id:  number printed at the start of the line
 * @pfx: text printed right after "<id>:"
 *
 * The line consists of the id and prefix, the lock class and flavour, the
 * access type, the owner pid, the device and inode (or "<none>:0" when the
 * lock has no file), and finally the byte range.
 */
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                            loff_t id, char *pfx)
{
        struct inode *inode = NULL;
        /*
         * pid_t matches both the return type of locks_translate_pid(), which
         * can be -1, and the %d conversions used to print it below.
         */
        pid_t fl_pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);

        /*
         * If lock owner is dead (and pid is freed) or not visible in current
         * pidns, zero is shown as a pid value. Check lock info from
         * init_pid_ns to get saved lock pid value.
         */
        fl_pid = locks_translate_pid(fl, proc_pidns);

        if (fl->fl_file != NULL)
                inode = locks_inode(fl->fl_file);

        seq_printf(f, "%lld:%s ", id, pfx);

        /* Lock class and flavour. */
        if (IS_POSIX(fl)) {
                if (fl->fl_flags & FL_ACCESS)
                        seq_puts(f, "ACCESS");
                else if (IS_OFDLCK(fl))
                        seq_puts(f, "OFDLCK");
                else
                        seq_puts(f, "POSIX ");

                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" :
                             mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
        } else if (IS_FLOCK(fl)) {
                if (fl->fl_type & LOCK_MAND) {
                        seq_puts(f, "FLOCK  MSNFS     ");
                } else {
                        seq_puts(f, "FLOCK  ADVISORY  ");
                }
        } else if (IS_LEASE(fl)) {
                if (fl->fl_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
                else
                        seq_puts(f, "LEASE  ");

                if (lease_breaking(fl))
                        seq_puts(f, "BREAKING  ");
                else if (fl->fl_file)
                        seq_puts(f, "ACTIVE    ");
                else
                        seq_puts(f, "BREAKER   ");
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }

        /* Access type. */
        if (fl->fl_type & LOCK_MAND) {
                seq_printf(f, "%s ",
                               (fl->fl_type & LOCK_READ)
                               ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
                               : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
        } else {
                /* Leases report target_leasetype() rather than fl_type. */
                int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;

                seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
                                     (type == F_RDLCK) ? "READ" : "UNLCK");
        }

        /* Owner pid, then device and inode number. */
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
                                MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino);
        } else {
                seq_printf(f, "%d <none>:0 ", fl_pid);
        }

        /* Byte range; only POSIX locks print their real start and end. */
        if (IS_POSIX(fl)) {
                if (fl->fl_end == OFFSET_MAX)
                        seq_printf(f, "%Ld EOF\n", fl->fl_start);
                else
                        seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
        } else {
                seq_puts(f, "0 EOF\n");
        }
}

/*
 * seq_file ->show() callback: print the lock behind list node @v, followed by
 * one " ->" prefixed line per request on its fl_blocked_requests list.
 * Nothing is printed when the owner pid translates to 0 in the pid namespace
 * of the proc mount being read.  Always returns 0.
 */
static int locks_show(struct seq_file *f, void *v)
{
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        struct file_lock *held = hlist_entry(v, struct file_lock, fl_link);
        struct locks_iterator *iter = f->private;
        struct file_lock *waiter;

        if (locks_translate_pid(held, proc_pidns) != 0) {
                lock_get_status(f, held, iter->li_pos, "");

                list_for_each_entry(waiter, &held->fl_blocked_requests,
                                    fl_blocked_member)
                        lock_get_status(f, waiter, iter->li_pos, " ->");
        }

        return 0;
}

/*
 * Print a "lock:\t" line for every entry on @head that belongs to @filp and
 * whose owner is either @files or @filp itself.  *@id is incremented before
 * each printed line and passed on as that line's id.
 */
static void __show_fd_locks(struct seq_file *f,
                        struct list_head *head, int *id,
                        struct file *filp, struct files_struct *files)
{
        struct file_lock *cur;

        list_for_each_entry(cur, head, fl_list) {
                if (cur->fl_file != filp)
                        continue;
                if (!(cur->fl_owner == files || cur->fl_owner == filp))
                        continue;

                *id += 1;
                seq_puts(f, "lock:\t");
                lock_get_status(f, cur, *id, "");
        }
}

void show_fd_locks(struct seq_file *f,
                  struct file *filp, struct files_struct *files)
{
        struct inode *inode = locks_inode(filp);
        struct file_lock_context *ctx;
        int id = 0;

        ctx = locks_inode_context(inode);
        if (!ctx)
                return;

        spin_lock(&ctx->flc_lock);
        __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
        spin_unlock(&ctx->flc_lock);
}

/*
 * seq_file ->start() callback: record the id for the first entry, take
 * file_rwsem for writing and blocked_lock_lock, and return the list node at
 * offset *@pos.  Both locks are dropped again in locks_stop().
 */
static void *locks_start(struct seq_file *f, loff_t *pos)
        __acquires(&blocked_lock_lock)
{
        struct locks_iterator *cursor = f->private;
        loff_t offset = *pos;

        cursor->li_pos = offset + 1;

        percpu_down_write(&file_rwsem);
        spin_lock(&blocked_lock_lock);

        return seq_hlist_start_percpu(&file_lock_list.hlist, &cursor->li_cpu,
                                      offset);
}

/* seq_file ->next() callback: bump the printed id and step to the next node. */
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
        struct locks_iterator *cursor = f->private;

        cursor->li_pos += 1;

        return seq_hlist_next_percpu(v, &file_lock_list.hlist,
                                     &cursor->li_cpu, pos);
}

/*
 * seq_file ->stop() callback: release the locks taken in locks_start(), in
 * the reverse order of their acquisition.
 */
static void locks_stop(struct seq_file *f, void *v)
        __releases(&blocked_lock_lock)
{
        spin_unlock(&blocked_lock_lock);
        percpu_up_write(&file_rwsem);
}

/* Iterator callbacks behind the "locks" proc entry created in proc_locks_init(). */
static const struct seq_operations locks_seq_operations = {
        .start        = locks_start,
        .next        = locks_next,
        .stop        = locks_stop,
        .show        = locks_show,
};

/*
 * Create the "locks" proc entry, backed by locks_seq_operations with a
 * struct locks_iterator as per-open private data.  The result of the
 * creation call is not checked; this always returns 0.
 */
static int __init proc_locks_init(void)
{
        proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
                        sizeof(struct locks_iterator), NULL);
        return 0;
}
fs_initcall(proc_locks_init);
#endif

/*
 * Boot-time setup for the file locking code: create the slab caches for
 * struct file_lock_context and struct file_lock, initialise the per-CPU
 * file_lock_list entry of every possible CPU, and set up the lease notifier
 * chain.  Both caches are created with SLAB_PANIC.  Always returns 0.
 */
static int __init filelock_init(void)
{
        int cpu;

        flctx_cache = kmem_cache_create("file_lock_ctx",
                                        sizeof(struct file_lock_context),
                                        0, SLAB_PANIC, NULL);

        filelock_cache = kmem_cache_create("file_lock_cache",
                                           sizeof(struct file_lock),
                                           0, SLAB_PANIC, NULL);

        for_each_possible_cpu(cpu) {
                struct file_lock_list_struct *percpu_list;

                percpu_list = per_cpu_ptr(&file_lock_list, cpu);
                spin_lock_init(&percpu_list->lock);
                INIT_HLIST_HEAD(&percpu_list->hlist);
        }

        lease_notifier_chain_init();
        return 0;
}
core_initcall(filelock_init);















































    1 

    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>

#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

bool is_vmalloc_addr(const void *x)
{
        unsigned long addr = (unsigned long)x;

        return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

/*
 * Per-CPU state for deferred frees: a lock-less list of queued nodes and
 * the work item whose handler, free_work(), drains that list.
 */
struct vfree_deferred {
        struct llist_head list;
        struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/* Forward declaration: free_work() below calls this. */
static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
        struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
        struct llist_node *t, *llnode;

        llist_for_each_safe(llnode, t, llist_del_all(&p->list))
                __vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/

/*
 * Clear every PTE covering [addr, end) beneath @pmd, one page at a time,
 * and note in @mask that PTE-level entries were modified.
 *
 * A cleared entry that was neither empty nor present triggers a warning.
 * The range must not be empty: the first entry is cleared before @addr
 * is compared against @end.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);

        for (;;) {
                pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);

                WARN_ON(!pte_none(ptent) && !pte_present(ptent));

                pte++;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
        }

        *mask |= PGTBL_PTE_MODIFIED;
}

/*
 * Unmap [addr, end) at the PMD level beneath @pud.
 *
 * Each PMD entry covering the range is first offered to pmd_clear_huge();
 * if that reports success the entry is done.  Otherwise empty/bad entries
 * are skipped (bad ones are cleared by pmd_none_or_clear_bad()) and the
 * remaining ones are descended into with vunmap_pte_range().
 *
 * @mask accumulates which page-table levels were modified.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int cleared;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                cleared = pmd_clear_huge(pmd);
                /* Sample pmd_bad() now: a bad entry is cleared further down. */
                if (cleared || pmd_bad(*pmd))
                        *mask |= PGTBL_PMD_MODIFIED;

                /*
                 * "continue" in a do/while jumps to the loop condition, so
                 * pmd and addr are still advanced for skipped entries.
                 */
                if (cleared)
                        continue;
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                vunmap_pte_range(pmd, addr, next, mask);

                /* Only reached after a PTE-level walk, not for skipped entries. */
                cond_resched();
        } while (pmd++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the PUD level beneath @p4d.
 *
 * Each PUD entry covering the range is first offered to pud_clear_huge();
 * if that reports success the entry is done.  Otherwise empty/bad entries
 * are skipped (bad ones are cleared by pud_none_or_clear_bad()) and the
 * remaining ones are descended into with vunmap_pmd_range().
 *
 * @mask accumulates which page-table levels were modified.
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int cleared;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                cleared = pud_clear_huge(pud);
                /* Sample pud_bad() now: a bad entry is cleared further down. */
                if (cleared || pud_bad(*pud))
                        *mask |= PGTBL_PUD_MODIFIED;

                /*
                 * "continue" in a do/while jumps to the loop condition, so
                 * pud and addr are still advanced for skipped entries.
                 */
                if (cleared)
                        continue;
                if (pud_none_or_clear_bad(pud))
                        continue;
                vunmap_pmd_range(pud, addr, next, mask);
        } while (pud++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the P4D level beneath @pgd.
 *
 * Each P4D entry covering the range is first offered to p4d_clear_huge();
 * if that reports success the entry is done.  Otherwise empty/bad entries
 * are skipped (bad ones are cleared by p4d_none_or_clear_bad()) and the
 * remaining ones are descended into with vunmap_pud_range().
 *
 * @mask accumulates which page-table levels were modified.
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;
        int cleared;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                cleared = p4d_clear_huge(p4d);
                /* Sample p4d_bad() now: a bad entry is cleared further down. */
                if (cleared || p4d_bad(*p4d))
                        *mask |= PGTBL_P4D_MODIFIED;

                /*
                 * "continue" in a do/while jumps to the loop condition, so
                 * p4d and addr are still advanced for skipped entries.
                 */
                if (cleared)
                        continue;
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                vunmap_pud_range(p4d, addr, next, mask);
        } while (p4d++, addr = next, addr != end);
}

/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @start: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @start.  The VM area that @start and @size
 * specify should have been allocated using get_vm_area() and its friends.
 * The range must not be empty (@start + @size must be above @start),
 * otherwise this function BUG()s.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is responsible
 * for calling flush_cache_vunmap() on to-be-unmapped areas before calling
 * this function and flush_tlb_kernel_range() after.
 */
void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
        unsigned long end = start + size;
        unsigned long next;
        pgd_t *pgd;
        unsigned long addr = start;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Sample pgd_bad() now: a bad entry is cleared just below. */
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                /* "continue" still runs the loop condition, advancing pgd/addr. */
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                vunmap_p4d_range(pgd, addr, next, &mask);
        } while (pgd++, addr = next, addr != end);

        /* Let the architecture sync if a level it cares about was modified. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);
}

/*
 * Install PTEs for [addr, end) beneath @pmd, taking the backing pages
 * from @pages starting at index *@nr.
 *
 * *@nr is a running index shared with the higher-level walkers; it is
 * advanced once per page successfully mapped so they know how far the
 * walk has got.
 *
 * Returns 0 on success, -ENOMEM if the PTE table cannot be allocated or
 * a page pointer is NULL, and -EBUSY if a PTE is already populated.  The
 * last two cases also warn.  Once the PTE table has been obtained, @mask
 * is marked PTE-modified whether or not the walk completed.
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pte_t *pte = pte_alloc_kernel_track(pmd, addr, mask);
        int ret = 0;

        if (!pte)
                return -ENOMEM;

        for (;;) {
                struct page *page = pages[*nr];

                if (WARN_ON(!pte_none(*pte))) {
                        ret = -EBUSY;
                        break;
                }
                if (WARN_ON(!page)) {
                        ret = -ENOMEM;
                        break;
                }

                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
                *nr += 1;

                pte++;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
        }

        *mask |= PGTBL_PTE_MODIFIED;
        return ret;
}

/*
 * Map [addr, end) at the PMD level beneath @pud, allocating the PMD table
 * if needed and delegating each PMD-sized piece to vmap_pte_range().
 *
 * Returns 0 on success.  Allocation failure and any error reported by
 * vmap_pte_range() are both returned as -ENOMEM.
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmd)
                return -ENOMEM;

        for (;;) {
                unsigned long next = pmd_addr_end(addr, end);
                int rc = vmap_pte_range(pmd, addr, next, prot, pages, nr, mask);

                if (rc != 0)
                        return -ENOMEM;

                addr = next;
                if (addr == end)
                        return 0;
                pmd++;
        }
}

/*
 * Map [addr, end) at the PUD level beneath @p4d, allocating the PUD table
 * if needed and delegating each PUD-sized piece to vmap_pmd_range().
 *
 * Returns 0 on success.  Allocation failure and any error reported by
 * vmap_pmd_range() are both returned as -ENOMEM.
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pud)
                return -ENOMEM;

        for (;;) {
                unsigned long next = pud_addr_end(addr, end);
                int rc = vmap_pmd_range(pud, addr, next, prot, pages, nr, mask);

                if (rc != 0)
                        return -ENOMEM;

                addr = next;
                if (addr == end)
                        return 0;
                pud++;
        }
}

/*
 * Map [addr, end) at the P4D level beneath @pgd, allocating the P4D table
 * if needed and delegating each P4D-sized piece to vmap_pud_range().
 *
 * Returns 0 on success.  Allocation failure and any error reported by
 * vmap_pud_range() are both returned as -ENOMEM.
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4d)
                return -ENOMEM;

        for (;;) {
                unsigned long next = p4d_addr_end(addr, end);
                int rc = vmap_pud_range(p4d, addr, next, prot, pages, nr, mask);

                if (rc != 0)
                        return -ENOMEM;

                addr = next;
                if (addr == end)
                        return 0;
                p4d++;
        }
}

/**
 * map_kernel_range_noflush - map kernel VM area with the specified pages
 * @addr: start of the VM area to map
 * @size: size of the VM area to map
 * @prot: page protection flags to use
 * @pages: pages to map
 *
 * Map PFN_UP(@size) pages at @addr.  The VM area that @addr and @size
 * specify should have been allocated using get_vm_area() and its friends.
 * The range must not be empty (@addr + @size must be above @addr),
 * otherwise this function BUG()s.
 *
 * On failure the range may be left partially populated.  The architecture
 * sync hook is still invoked for whatever page-table levels were modified
 * before the error was hit.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is responsible for
 * calling flush_cache_vmap() on to-be-mapped areas before calling this
 * function.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
                             pgprot_t prot, struct page **pages)
{
        unsigned long start = addr;
        unsigned long end = addr + size;
        unsigned long next;
        pgd_t *pgd;
        int err = 0;
        int nr = 0;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
                /*
                 * Do not return straight away: page tables touched so far
                 * (recorded in mask) still have to be synced below.
                 */
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * Map @pages at [start, start + size) via map_kernel_range_noflush() and
 * then call flush_cache_vmap() on the range.  The flush is issued whether
 * or not the mapping succeeded.
 *
 * Returns whatever map_kernel_range_noflush() returned.
 */
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
                struct page **pages)
{
        const int err = map_kernel_range_noflush(start, size, prot, pages);

        flush_cache_vmap(start, start + size);

        return err;
}

/*
 * Return non-zero if @x is a vmalloc address or, where the architecture
 * has a dedicated module area, an address inside that area.
 *
 * ARM, x86-64 and sparc64 keep modules in a special region and only fall
 * back on vmalloc() if that fails; other architectures simply place
 * modules in the vmalloc space.
 */
int is_vmalloc_or_module_addr(const void *x)
{
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
        const unsigned long vaddr = (unsigned long)x;

        if (MODULES_VADDR <= vaddr && vaddr < MODULES_END)
                return 1;
#endif
        return is_vmalloc_addr(x);
}

/*
 * Walk the kernel page tables for a vmap address and return the struct
 * page it maps.
 *
 * Returns NULL when any level of the walk is empty, when the PUD or PMD
 * entry is bad, or when the final PTE is not present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
        unsigned long addr = (unsigned long) vmalloc_addr;
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
        struct page *page;

        /*
         * XXX: this may need revisiting if VIRTUAL_BUG_ON is ever added for
         * architectures that do not vmalloc module space.
         */
        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

        if (pgd_none(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return NULL;

        /*
         * Never dereference a bad PUD or PMD entry.  The bad checks also
         * catch huge mappings, which can occur on architectures that set
         * CONFIG_HAVE_ARCH_HUGE_VMAP=y.  Such regions pass
         * is_vmalloc_addr() but are not [unambiguously] tied to a single
         * struct page, so there is no correct value to return for them.
         */
        pud = pud_offset(p4d, addr);
        WARN_ON_ONCE(pud_bad(*pud));
        if (pud_none(*pud))
                return NULL;
        if (pud_bad(*pud))
                return NULL;

        pmd = pmd_offset(pud, addr);
        WARN_ON_ONCE(pmd_bad(*pmd));
        if (pmd_none(*pmd))
                return NULL;
        if (pmd_bad(*pmd))
                return NULL;

        ptep = pte_offset_map(pmd, addr);
        pte = *ptep;
        page = pte_present(pte) ? pte_page(pte) : NULL;
        pte_unmap(ptep);

        return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Translate a vmalloc()-space virtual address into the page frame number
 * of the page that backs it.
 *
 * NOTE: the result of vmalloc_to_page() is passed to page_to_pfn()
 * without a NULL check, so the address is expected to be mapped.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
        struct page *page = vmalloc_to_page(vmalloc_addr);

        return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/*
 * Compile-time switches for extra self-checks; set to 1 to enable.
 * DEBUG_AUGMENT_PROPAGATE_CHECK makes augment_tree_propagate_from()
 * verify the free tree after every propagation.
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


/*
 * NOTE(review): presumably vmap_area_lock guards vmap_area_root and
 * vmap_area_list, and free_vmap_area_lock guards the free tree and list
 * below -- confirm against the users of these locks.
 */
static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
/* Tree searched by address in __find_vmap_area(). */
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from the generic slab we reuse an object from this
 * cache to make things faster, especially in the "no edge"
 * splitting of a free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used together with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by their va->va_start
 * address. It is used for allocation and for merging when a vmap
 * object is released.
 *
 * Each vmap_area node records the maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find the lowest matching free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from atomic context, and thus
 * to be able to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * Size of the address range described by @va, i.e. va_end - va_start.
 */
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
        unsigned long first = va->va_start;
        unsigned long last = va->va_end;

        return last - first;
}

/*
 * Return the cached subtree_max_size of the vmap_area embedding @node,
 * or 0 when @node is NULL (no child on that side).
 */
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
        struct vmap_area *va = rb_entry_safe(node, struct vmap_area, rb_node);

        if (!va)
                return 0;

        return va->subtree_max_size;
}

/*
 * Recompute the largest free block reachable from @va: the maximum of its
 * own size and the cached maxima of its left and right children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
        unsigned long own = va_size(va);
        unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
        unsigned long right = get_subtree_max_size(va->rb_node.rb_right);

        return max3(own, left, right);
}

/*
 * Generate the static augmented rb-tree callbacks named
 * free_vmap_area_rb_augment_cb for struct vmap_area.  They keep the
 * unsigned long field subtree_max_size up to date, using va_size() as
 * the per-node value.
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
        struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

/* Forward declarations and the vmap notifier chain head. */
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

/* Counter reported by vmalloc_nr_pages(). */
static atomic_long_t nr_vmalloc_pages;

/*
 * Return the current value of the nr_vmalloc_pages counter.
 */
unsigned long vmalloc_nr_pages(void)
{
        long count = atomic_long_read(&nr_vmalloc_pages);

        return count;
}

/*
 * Look up the vmap_area in vmap_area_root whose [va_start, va_end) range
 * contains @addr.  Returns NULL when no area covers the address.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
        struct rb_node *cur;

        for (cur = vmap_area_root.rb_node; cur; ) {
                struct vmap_area *area;

                area = rb_entry(cur, struct vmap_area, rb_node);
                if (addr >= area->va_start && addr < area->va_end)
                        return area;

                /* Not inside this area: descend towards the address. */
                cur = (addr < area->va_start) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Find where @va has to be attached in an address-sorted tree.
 *
 * The search starts at the root of @root when @root is given, otherwise
 * at the node @from.  On success the address of the empty child link to
 * use is returned and *@parent is set to the node owning that link
 * (NULL when @root is an empty tree).
 *
 * NULL is returned when @va overlaps a node met on the way down.  The
 * insertion of such a conflicting range must then be abandoned; it is
 * considered a bug and a warning is emitted.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
        struct rb_root *root, struct rb_node *from,
        struct rb_node **parent)
{
        struct rb_node **link = root ? &root->rb_node : &from;
        struct vmap_area *cur;

        /* Empty tree: the new node becomes the root. */
        if (root && unlikely(!*link)) {
                *parent = NULL;
                return link;
        }

        /*
         * Descend to the bottom of the tree.  When the walk stops, cur is
         * the future parent and link is the slot, left or right, that the
         * new va->rb_node will be attached to.
         */
        for (;;) {
                cur = rb_entry(*link, struct vmap_area, rb_node);

                /*
                 * Sanity check while traversing: anything that is neither
                 * strictly below nor strictly above the current node is a
                 * partial or full overlap.
                 */
                if (va->va_start < cur->va_end &&
                                va->va_end <= cur->va_start) {
                        link = &(*link)->rb_left;
                } else if (va->va_end > cur->va_start &&
                                va->va_start >= cur->va_end) {
                        link = &(*link)->rb_right;
                } else {
                        WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                                va->va_start, va->va_end, cur->va_start, cur->va_end);

                        return NULL;
                }

                if (!*link)
                        break;
        }

        *parent = &cur->rb_node;
        return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
        struct list_head *list;

        if (unlikely(!parent))
                /*
                 * The red-black tree where we try to find VA neighbors
                 * before merging or inserting is empty, i.e. it means
                 * there is no free vmap space. Normally it does not
                 * happen but we handle this case anyway.
                 */
                return NULL;

        list = &rb_entry(parent, struct vmap_area, rb_node)->list;
        return (&parent->rb_right == link ? list->next : list);
}

/*
 * Attach @va to @root at the position described by @parent / @link and
 * add it to the matching address-sorted list.
 *
 * @head is only used as the list insertion point when @parent is NULL;
 * otherwise the insertion point is derived from the parent node.
 */
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
        /*
         * @va is not on the list yet, but the list node that will precede
         * it can already be worked out: the parent itself when attaching
         * to its right, the parent's predecessor otherwise.
         */
        if (likely(parent)) {
                struct vmap_area *parent_va;

                parent_va = rb_entry(parent, struct vmap_area, rb_node);
                head = (link == &parent->rb_right) ?
                        &parent_va->list : parent_va->list.prev;
        }

        /* Hook the node into the rb-tree. */
        rb_link_node(&va->rb_node, parent, link);
        if (root != &free_vmap_area_root) {
                rb_insert_color(&va->rb_node, root);
        } else {
                /*
                 * Only a plain insertion is done here.  va->subtree_max_size
                 * is not set to the node's size before the call to
                 * rb_insert_augmented(), because the tree is populated from
                 * the bottom up once the node _is_ in the tree.
                 *
                 * subtree_max_size is therefore zeroed after the insertion
                 * and augment_tree_propagate_from() is left to put
                 * everything in the correct order later on.
                 */
                rb_insert_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);
                va->subtree_max_size = 0;
        }

        /* Keep the list sorted by address. */
        list_add(&va->list, head);
}

/*
 * Detach @va from @root and from the list it is on, then mark its
 * rb_node as empty.  Warns and does nothing if the node is already
 * marked empty.
 */
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
        struct rb_node *node = &va->rb_node;

        if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
                return;

        /* Only the free tree carries augmented data that must be fixed up. */
        if (root != &free_vmap_area_root)
                rb_erase(node, root);
        else
                rb_erase_augmented(node,
                        root, &free_vmap_area_rb_augment_cb);

        list_del(&va->list);
        RB_CLEAR_NODE(node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Debug helper: walk free_vmap_area_list and complain about every area
 * whose cached subtree_max_size differs from a freshly computed value.
 */
static void
augment_tree_propagate_check(void)
{
        struct vmap_area *va;

        list_for_each_entry(va, &free_vmap_area_list, list) {
                unsigned long expected = compute_subtree_max_size(va);

                if (expected == va->subtree_max_size)
                        continue;

                pr_emerg("tree is corrupted: %lu, %lu\n",
                        va_size(va), va->subtree_max_size);
        }
}
#endif

/*
 * Propagate subtree_max_size from the given VA upwards through its
 * parents. The propagation must be done whenever the size of a VA
 * changes because its va_start/va_end was modified, or when a VA
 * is newly inserted into the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that this does not mean that the parent nodes and
 * their subtree_max_size are always recalculated all the way up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example, if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1,
 * only its own subtree_max_size is updated and set to 1. If we
 * shrink the node 8 to 6, then its subtree_max_size is set to 6
 * and the parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
        /*
         * Populate the tree from bottom towards the root until
         * the calculated maximum available size of checked node
         * is equal to its current one.
         */
        free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
        /* Debug builds re-verify every node on the free list. */
        augment_tree_propagate_check();
#endif
}

/*
 * Insert @va into the rb-tree @root and the list @head. Nothing is
 * linked when find_va_links() does not return an insertion point.
 */
static void
insert_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link = find_va_links(va, root, NULL, &parent);

        if (!link)
                return;

        link_va(va, root, parent, link, head);
}

/*
 * Insert @va into the tree @root and the list @head, then propagate
 * subtree_max_size upwards from it. When @from is given, the search for
 * the insertion point starts at that node instead of at the root.
 * Nothing happens if no insertion point is found.
 */
static void
insert_vmap_area_augment(struct vmap_area *va,
        struct rb_node *from, struct rb_root *root,
        struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        link = from ? find_va_links(va, NULL, from, &parent) :
                      find_va_links(va, root, NULL, &parent);
        if (!link)
                return;

        link_va(va, root, parent, link, head);
        augment_tree_propagate_from(va);
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If no coalescing is possible, @va is
 * inserted as a new free area. If @va has been merged into a
 * neighbour, the @va object itself is freed.
 *
 * Returns the area that now covers the freed range: either @va
 * (newly linked) or the neighbour it was merged into. Callers
 * must not touch the @va they passed in afterwards, only the
 * returned pointer.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Even though that is
 * buggy behaviour, the system can stay alive and keep
 * going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct vmap_area *sibling;
        struct list_head *next;
        struct rb_node **link;
        struct rb_node *parent;
        bool merged = false;

        /*
         * Find a place in the tree where VA potentially will be
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return NULL;

        /*
         * Get next node of VA to check if merging can be done.
         */
        next = get_va_next_sibling(parent, link);
        if (unlikely(next == NULL))
                goto insert;

        /*
         * Merge with the next area if VA ends exactly where it starts.
         *
         * start            end
         * |                |
         * |<------VA------>|<-----Next----->|
         *                  |                |
         *                  start            end
         */
        if (next != head) {
                sibling = list_entry(next, struct vmap_area, list);
                if (sibling->va_start == va->va_end) {
                        sibling->va_start = va->va_start;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

        /*
         * Merge with the previous area if it ends exactly where VA
         * starts. At this point VA may already be the enlarged "next".
         *
         * start            end
         * |                |
         * |<-----Prev----->|<------VA------>|
         *                  |                |
         *                  start            end
         */
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
                        /*
                         * If both neighbors are coalesced, it is important
                         * to unlink the "next" node first, followed by merging
                         * with "previous" one. Otherwise the tree might not be
                         * fully populated if a sibling's augmented value is
                         * "normalized" because of rotation operations.
                         */
                        if (merged)
                                unlink_va(va, root);

                        sibling->va_end = va->va_end;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

insert:
        /* No neighbour absorbed VA, so link it as a new free area. */
        if (!merged)
                link_va(va, root, parent, link, head);

        /*
         * Last step is to check and update the tree.
         */
        augment_tree_propagate_from(va);
        return va;
}

/*
 * Check whether a request of @size bytes, aligned to @align and placed
 * no lower than @vstart, fits inside the free area @va.
 */
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        unsigned long base, start, end;

        /* The candidate begins at whichever is higher, aligned up. */
        base = (va->va_start > vstart) ? va->va_start : vstart;
        start = ALIGN(base, align);
        end = start + size;

        /* A big size or alignment can make the address wrap around. */
        if (end < start || start < vstart)
                return false;

        return end <= va->va_end;
}

/*
 * Find the first free block (lowest start address) in the free tree
 * that can satisfy a request described by the passed parameters:
 * @size bytes, aligned to @align, placed no lower than @vstart.
 *
 * Sub-trees are pruned using subtree_max_size against a pessimistic
 * length of size + align - 1, while each visited node is checked
 * exactly with is_within_this_va().
 *
 * Returns the matching free area, or NULL if there is none.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = free_vmap_area_root.rb_node;

        /* Adjust the search size for alignment overhead. */
        length = size + align - 1;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Prefer the left sub-tree (lower addresses) as long as
                 * it holds a big enough block and this node still starts
                 * above vstart.
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * only once due to "vstart" restriction.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Debug reference for find_vmap_lowest_match(): scan the free list from
 * its head and return the first area the request fits into, or NULL if
 * no area does.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;

        list_for_each_entry(va, &free_vmap_area_list, list) {
                if (is_within_this_va(va, size, align, vstart))
                        return va;
        }

        return NULL;
}

/*
 * Debug check: pick a random vstart above VMALLOC_START and verify that
 * the tree search and the linear list search agree on the lowest match
 * for @size with an alignment of 1. A mismatch is reported.
 */
static void
find_vmap_lowest_match_check(unsigned long size)
{
        struct vmap_area *tree_va, *list_va;
        unsigned long vstart;
        unsigned int rnd;

        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;

        tree_va = find_vmap_lowest_match(size, 1, vstart);
        list_va = find_vmap_lowest_linear_match(size, 1, vstart);
        if (tree_va == list_va)
                return;

        pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
                tree_va, list_va, vstart);
}
#endif

/* How a requested range sits inside the free area chosen for it. */
enum fit_type {
        /* The range is not contained in the area. */
        NOTHING_FIT = 0,
        /* Full fit: the range covers the area completely. */
        FL_FIT_TYPE = 1,
        /* Left edge fit: the range starts where the area starts. */
        LE_FIT_TYPE = 2,
        /* Right edge fit: the range ends where the area ends. */
        RE_FIT_TYPE = 3,
        /* No edge fit: the range touches neither end of the area. */
        NE_FIT_TYPE = 4,
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size)
{
        enum fit_type type;

        /* Check if it is within VA. */
        if (nva_start_addr < va->va_start ||
                        nva_start_addr + size > va->va_end)
                return NOTHING_FIT;

        /* Now classify. */
        if (va->va_start == nva_start_addr) {
                if (va->va_end == nva_start_addr + size)
                        type = FL_FIT_TYPE;
                else
                        type = LE_FIT_TYPE;
        } else if (va->va_end == nva_start_addr + size) {
                type = RE_FIT_TYPE;
        } else {
                type = NE_FIT_TYPE;
        }

        return type;
}

/*
 * Carve [nva_start_addr, nva_start_addr + size) out of the free area @va
 * according to the already classified fit @type, and keep the free tree
 * and its augmented values up to date.
 *
 * Returns 0 on success, or -1 if @type is not one of the four fit types
 * or the extra vmap_area needed for an NE_FIT_TYPE split cannot be
 * obtained.
 */
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size,
        enum fit_type type)
{
        struct vmap_area *lva = NULL;

        switch (type) {
        case FL_FIT_TYPE:
                /*
                 * The request consumes the whole VA, nothing is left
                 * to keep in the free tree.
                 *
                 * |               |
                 * V      NVA      V
                 * |---------------|
                 */
                unlink_va(va, &free_vmap_area_root);
                kmem_cache_free(vmap_area_cachep, va);
                return 0;

        case LE_FIT_TYPE:
                /*
                 * The request takes the left part, the right remainder
                 * stays in VA.
                 *
                 * |       |
                 * V  NVA  V   R
                 * |-------|-------|
                 */
                va->va_start += size;
                break;

        case RE_FIT_TYPE:
                /*
                 * The request takes the right part, the left remainder
                 * stays in VA.
                 *
                 *         |       |
                 *     L   V  NVA  V
                 * |-------|-------|
                 */
                va->va_end = nva_start_addr;
                break;

        case NE_FIT_TYPE:
                /*
                 * The request sits in the middle, so VA is split in two
                 * and one more vmap_area object is needed.
                 *
                 *     |       |
                 *   L V  NVA  V R
                 * |---|-------|---|
                 */
                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
                if (unlikely(!lva)) {
                        /*
                         * Nothing was preloaded on this CPU.
                         *
                         * The percpu allocator never preloads, because it
                         * is very unlikely to need an NE_FIT_TYPE split:
                         * its offsets and sizes are aligned to a fixed
                         * align request, so RE_FIT_TYPE and FL_FIT_TYPE
                         * are its main cases. One exception is the very
                         * first allocation (early boot up), when a single
                         * big free space has to be split.
                         *
                         * Regular "vmap" allocations can end up here too
                         * if the current CPU was not preloaded; see the
                         * comment in alloc_vmap_area() for the reason.
                         * That is rare.
                         *
                         * Either way, try to get the extra object with
                         * GFP_NOWAIT. If that fails, the caller's
                         * "overflow" path purges lazily freed areas to
                         * free some memory and then retries once more;
                         * see alloc_vmap_area() for details.
                         */
                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!lva)
                                return -1;
                }

                /* The new object describes the left remainder. */
                lva->va_start = va->va_start;
                lva->va_end = nva_start_addr;

                /* VA itself shrinks to the right remainder. */
                va->va_start = nva_start_addr + size;
                break;

        default:
                return -1;
        }

        /* VA changed its size but is still in the tree: fix it up. */
        augment_tree_propagate_from(va);

        /* Only set for NE_FIT_TYPE: link the left remainder near VA. */
        if (lva)
                insert_vmap_area_augment(lva, &va->rb_node,
                        &free_vmap_area_root, &free_vmap_area_list);

        return 0;
}

/*
 * Take a range of @size bytes, aligned to @align, out of the free tree
 * so that it lies within [vstart, vend).
 *
 * Returns the start address of the newly allocated range on success.
 * On any failure @vend is returned instead.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend)
{
        unsigned long base, nva_start_addr;
        struct vmap_area *va;
        enum fit_type type;

        va = find_vmap_lowest_match(size, align, vstart);
        if (unlikely(!va))
                return vend;

        /* Begin at the higher of the area start and vstart, aligned up. */
        base = (va->va_start > vstart) ? va->va_start : vstart;
        nva_start_addr = ALIGN(base, align);

        /* The range must not go past "vend". */
        if (nva_start_addr + size > vend)
                return vend;

        /* Work out how the range sits inside the area we found. */
        type = classify_va_fit_type(va, nva_start_addr, size);
        if (WARN_ON_ONCE(type == NOTHING_FIT))
                return vend;

        /* Shrink, split or remove the free vmap_area accordingly. */
        if (adjust_va_to_fit_type(va, nva_start_addr, size, type))
                return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
        find_vmap_lowest_match_check(size);
#endif

        return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area.
 *
 * The area is moved straight from the busy tree/list to the free
 * tree/list, without going through the lazy purge list. The two steps
 * are done under their own locks, one after the other.
 */
static void free_vmap_area(struct vmap_area *va)
{
        /*
         * Remove from the busy tree/list.
         */
        spin_lock(&vmap_area_lock);
        unlink_va(va, &vmap_area_root);
        spin_unlock(&vmap_area_lock);

        /*
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and page aligned, and @align must be a power of
 * two; violations hit BUG_ON(). The function may sleep. Only the
 * GFP_RECLAIM_MASK bits of @gfp_mask are used.
 *
 * Returns the new vmap_area, already inserted into the busy tree/list, on
 * success. On failure it returns ERR_PTR(-EBUSY) if vmap is not
 * initialized yet or no suitable range was found even after purging,
 * ERR_PTR(-ENOMEM) if the vmap_area object cannot be allocated, or the
 * error reported by kasan_populate_vmalloc().
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask)
{
        struct vmap_area *va, *pva;
        unsigned long addr;
        int purged = 0;
        int ret;

        BUG_ON(!size);
        BUG_ON(offset_in_page(size));
        BUG_ON(!is_power_of_2(align));

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        might_sleep();
        gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

        va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
        if (unlikely(!va))
                return ERR_PTR(-ENOMEM);

        /*
         * Only scan the relevant parts containing pointers to other objects
         * to avoid false negatives.
         */
        kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
        /*
         * Preload this CPU with one extra vmap_area object. It is used
         * when fit type of free area is NE_FIT_TYPE. Please note, it
         * does not guarantee that an allocation occurs on a CPU that
         * is preloaded, instead we minimize the case when it is not.
         * It can happen because of cpu migration, because there is a
         * race until the below spinlock is taken.
         *
         * The preload is done in non-atomic context, thus it allows us
         * to use more permissive allocation masks to be more stable under
         * low memory condition and high memory pressure. In rare case,
         * if not preloaded, GFP_NOWAIT is used.
         *
         * Set "pva" to NULL here, because of "retry" path.
         */
        pva = NULL;

        if (!this_cpu_read(ne_fit_preload_node))
                /*
                 * Even if it fails we do not really care about that.
                 * Just proceed as it is. If needed "overflow" path
                 * will refill the cache we allocate from.
                 */
                pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(&free_vmap_area_lock);

        /*
         * Install the preload object unless this CPU's slot got filled in
         * the meantime; in that case the spare object is not needed.
         */
        if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
                kmem_cache_free(vmap_area_cachep, pva);

        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        addr = __alloc_vmap_area(size, align, vstart, vend);
        spin_unlock(&free_vmap_area_lock);

        if (unlikely(addr == vend))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;


        /* Publish the area in the busy tree/list. */
        spin_lock(&vmap_area_lock);
        insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
        spin_unlock(&vmap_area_lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size);
        if (ret) {
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First failure: purge the lazily freed areas and try again. */
        if (!purged) {
                purge_vmap_area_lazy();
                purged = 1;
                goto retry;
        }

        /*
         * Still no room. If the caller may block, ask the registered
         * purge notifiers to release something; if any of them reports
         * freed memory, start over with a fresh purge allowed.
         */
        if (gfpflags_allow_blocking(gfp_mask)) {
                unsigned long freed = 0;
                blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
                if (freed > 0) {
                        purged = 0;
                        goto retry;
                }
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
                        size);

        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}

/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() calls when it cannot find address space and the
 * caller is allowed to block.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/*
 * Remove @nb from the vmap_notify_list notifier chain.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather
 * up before attempting to purge with a TLB flush.
 *
 * This is a tradeoff. A larger number covers more kernel page tables and
 * takes slightly longer to purge, but it linearly reduces the number of
 * global TLB flushes that have to be performed. Scaling the number
 * linearly with the CPU count would seem natural, since vmapping activity
 * could scale that way too. In practice, however, workloads are likely to
 * be constrained in other ways, so vmap activity will not scale linearly
 * with CPUs. To stay conservative and avoid introducing a big latency on
 * huge systems, a less aggressive log scale is used: 32MB worth of pages
 * multiplied by fls() of the number of online CPUs. This is still an
 * improvement over the old code, and the scale factor is simple to change
 * should it become a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
        /* Number of pages that make up 32MB of address space. */
        const unsigned long pages_per_step = 32UL * 1024 * 1024 / PAGE_SIZE;
        unsigned int steps = fls(num_online_cpus());

        return steps * pages_per_step;
}

/*
 * Number of pages in areas that were queued for lazy freeing and have not
 * been given back to the free tree yet. It is compared against
 * lazy_max_pages() to decide when a purge should be attempted.
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/*
 * Forward declaration: part of the per-CPU block allocator defined
 * further down, but already needed by purge_vmap_area_lazy().
 */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 *
 * This works by setting vmap_lazy_nr to just above lazy_max_pages(), so
 * that the next free_vmap_area_noflush() sees the limit exceeded and
 * tries to purge right away instead of letting areas accumulate.
 */
void set_iounmap_nonlazy(void)
{
        atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 *
 * Everything queued on vmap_purge_list is taken off in one go. The TLB is
 * flushed for a range covering all of those areas, and each area is then
 * inserted into (or merged with) the free tree/list.
 *
 * @start and @end seed the flush range, which is widened as needed to
 * cover every queued area. Passing (ULONG_MAX, 0) therefore flushes just
 * the span of the queued areas.
 *
 * Must be called with vmap_purge_lock held. Returns false if the purge
 * list was empty and nothing was done, true otherwise.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
        unsigned long resched_threshold;
        struct llist_node *valist;
        struct vmap_area *va;
        struct vmap_area *n_va;

        lockdep_assert_held(&vmap_purge_lock);

        valist = llist_del_all(&vmap_purge_list);
        if (unlikely(valist == NULL))
                return false;

        /*
         * TODO: to calculate a flush range without looping.
         * The list can be up to lazy_max_pages() elements.
         */
        llist_for_each_entry(va, valist, purge_list) {
                if (va->va_start < start)
                        start = va->va_start;
                if (va->va_end > end)
                        end = va->va_end;
        }

        flush_tlb_kernel_range(start, end);
        resched_threshold = lazy_max_pages() << 1;

        spin_lock(&free_vmap_area_lock);
        llist_for_each_entry_safe(va, n_va, valist, purge_list) {
                /*
                 * Remember the size and range up front: merging may free
                 * this object and hand back a bigger neighbour instead.
                 */
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;

                /*
                 * Finally insert or merge lazily-freed area. It is
                 * detached and there is no need to "unlink" it from
                 * anything.
                 */
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
                                            &free_vmap_area_list);

                /* Nothing was inserted; skip the accounting below. */
                if (!va)
                        continue;

                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);

                atomic_long_sub(nr, &vmap_lazy_nr);

                /*
                 * Once the backlog is below the threshold, give others a
                 * chance to run or to take the lock.
                 */
                if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
                        cond_resched_lock(&free_vmap_area_lock);
        }
        spin_unlock(&free_vmap_area_lock);
        return true;
}

/*
 * Start a purge of the outstanding lazy areas, unless somebody else is
 * purging already, in which case there is nothing to do here.
 */
static void try_purge_vmap_area_lazy(void)
{
        /* The purge lock is taken: another purge is in progress. */
        if (!mutex_trylock(&vmap_purge_lock))
                return;

        __purge_vmap_area_lazy(ULONG_MAX, 0);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Kick off a purge of the outstanding lazy areas.
 *
 * Unlike try_purge_vmap_area_lazy(), this waits for vmap_purge_lock and
 * also purges fragmented per-CPU vmap blocks on all CPUs first.
 */
static void purge_vmap_area_lazy(void)
{
        mutex_lock(&vmap_purge_lock);
        purge_fragmented_blocks_allcpus();
        __purge_vmap_area_lazy(ULONG_MAX, 0);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Queue a vmap area for lazy freeing. The caller has to make sure that
 * the area has been unmapped and that flush_cache_vunmap() has already
 * been called for the correct range.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned long nr_pages;
        unsigned long nr_lazy;

        /* Take the area off the busy tree/list. */
        spin_lock(&vmap_area_lock);
        unlink_va(va, &vmap_area_root);
        spin_unlock(&vmap_area_lock);

        /* Account its pages before the area is handed over. */
        nr_pages = (va->va_end - va->va_start) >> PAGE_SHIFT;
        nr_lazy = atomic_long_add_return(nr_pages, &vmap_lazy_nr);

        /* After this point, we may free va at any time */
        llist_add(&va->purge_list, &vmap_purge_list);

        /* Too much has piled up: try to purge now. */
        if (unlikely(nr_lazy > lazy_max_pages()))
                try_purge_vmap_area_lazy();
}

/*
 * Unmap a vmap area and queue it for lazy freeing. The TLB is flushed
 * here only when debug_pagealloc is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        flush_cache_vunmap(start, end);
        unmap_kernel_range_noflush(start, end - start);
        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(start, end);

        free_vmap_area_noflush(va);
}

/*
 * Locked wrapper around __find_vmap_area(): look up @addr while holding
 * vmap_area_lock and return whatever the lookup returned. The lock is
 * dropped again before returning.
 */
static struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_area *va;

        spin_lock(&vmap_area_lock);
        va = __find_vmap_area(addr);
        spin_unlock(&vmap_area_lock);

        return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited, especially on 32-bit architectures. The block
 * size computed below is chosen so that there is room for at least 16
 * per-CPU vmap blocks per CPU.
 *
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE as (VMALLOC_END-VMALLOC_START). Guess instead
 * (we just need a rough idea).
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE                (128UL*1024*1024)
#else
#define VMALLOC_SPACE                (128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES                (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC                BITS_PER_LONG        /* 256K with 4K pages on 64-bit */
#define VMAP_BBMAP_BITS_MAX        1024        /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN        (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)                ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)                ((x) > (y) ? (x) : (y)) /* can't use max() */
/*
 * Pages per vmap block: the guessed vmalloc space divided among the CPUs
 * (rounded up to a power of two) and by 16, clamped to the range
 * [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
 */
#define VMAP_BBMAP_BITS                \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,        \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,        \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)

/* Per-CPU queue of vmap blocks that allocations can be served from. */
struct vmap_block_queue {
        /* Taken around additions to and removals from @free. */
        spinlock_t lock;
        /* Blocks on this queue, linked through vmap_block.free_list. */
        struct list_head free;
};

/* One block of VMAP_BBMAP_BITS pages that small mappings are carved from. */
struct vmap_block {
        /* Taken while the counters below are examined or changed. */
        spinlock_t lock;
        /* The vmap area that provides this block's address range. */
        struct vmap_area *va;
        /*
         * free: pages not handed out yet. dirty: pages counted as dirty
         * (presumably freed but not yet flushed -- the code updating it
         * is outside this view, confirm there).
         */
        unsigned long free, dirty;
        unsigned long dirty_min, dirty_max; /*< dirty range */
        /* Link on the owning vmap_block_queue's free list. */
        struct list_head free_list;
        /* Used to free the block with kfree_rcu(). */
        struct rcu_head rcu_head;
        /* Link on the local list built by purge_fragmented_blocks(). */
        struct list_head purge;
};

/*
 * Queue of free and dirty vmap blocks, for allocation and flushing
 * purposes. There is one queue per CPU.
 */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by the block index that
 * addr_to_vb_idx() derives from an address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

/*
 * Convert an address into the index of the vmap block containing it.
 * Blocks are counted from VMALLOC_START rounded down to a multiple of
 * VMAP_BLOCK_SIZE.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
        unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);

        return (addr - base) / VMAP_BLOCK_SIZE;
}

/*
 * Return the virtual address that lies @pages_off pages into the block
 * starting at @va_start. It is a BUG if that address belongs to another
 * block.
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
        unsigned long addr = va_start + (pages_off << PAGE_SHIFT);

        /* The result must stay inside the block that va_start is in. */
        BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));

        return (void *)addr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The occupied pages are the first ones of the block. The new block is
 * registered in the vmap_blocks XArray and then queued on the current
 * CPU's block queue.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        /* A block is VMAP_BLOCK_SIZE bytes, aligned to its own size. */
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        /* The caller's pages start at the very beginning of the block. */
        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        /* min above max: the dirty range starts out empty. */
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        INIT_LIST_HEAD(&vb->free_list);

        /* Make the block findable by address. */
        vb_idx = addr_to_vb_idx(va->va_start);
        err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
        if (err) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }

        /* Queue it on this CPU so later allocations can use the rest. */
        vbq = &get_cpu_var(vmap_block_queue);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_var(vmap_block_queue);

        return vaddr;
}

/*
 * Tear down a vmap block: remove it from the vmap_blocks XArray, queue
 * its vmap area for lazy freeing and free the block itself after an RCU
 * grace period. It is a BUG if the XArray slot did not hold @vb.
 */
static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_block *tmp;

        tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}

/*
 * Free the fragmented blocks on @cpu's block queue.
 *
 * A block qualifies when each of its pages is either free or dirty
 * (free + dirty == VMAP_BBMAP_BITS) and it is not entirely dirty. Such
 * blocks are taken off the queue while walking it under RCU, collected
 * on a local list and freed once the walk is over.
 */
static void purge_fragmented_blocks(int cpu)
{
        LIST_HEAD(purge);
        struct vmap_block *vb;
        struct vmap_block *n_vb;
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {

                /* Cheap check without the lock; repeated under it below. */
                if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
                        continue;

                spin_lock(&vb->lock);
                if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
                        vb->free = 0; /* prevent further allocs after releasing lock */
                        vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
                        /* Mark the whole block as the dirty range. */
                        vb->dirty_min = 0;
                        vb->dirty_max = VMAP_BBMAP_BITS;
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                        spin_unlock(&vb->lock);
                        list_add_tail(&vb->purge, &purge);
                } else
                        spin_unlock(&vb->lock);
        }
        rcu_read_unlock();

        /* Free what was collected, outside the RCU read section. */
        list_for_each_entry_safe(vb, n_vb, &purge, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}

static void purge_fragmented_blocks_allcpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                purge_fragmented_blocks(cpu);
}

/*
 * Allocate @size bytes of virtual address space from a per-CPU vmap block.
 *
 * @size must be a multiple of the page size and at most
 * PAGE_SIZE * VMAP_MAX_ALLOC (both enforced with BUG_ON).  A zero @size
 * triggers a warning and returns NULL.
 *
 * The free list of the current CPU's vmap_block_queue is searched for a
 * block with at least (1 << order) free pages; the allocation is carved out
 * at page offset VMAP_BBMAP_BITS - vb->free.  A block whose free count drops
 * to zero is removed from the free list.  If no block fits, the result of
 * new_vmap_block() is returned instead; note that vm_map_ram() checks the
 * returned pointer with IS_ERR().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        void *vaddr = NULL;
        unsigned int order;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what caller wants since
                 * get_order(0) returns funny result. Just warn and terminate
                 * early.
                 */
                return NULL;
        }
        order = get_order(size);

        rcu_read_lock();
        vbq = &get_cpu_var(vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                spin_lock(&vb->lock);
                /* Not enough free pages left in this block: try the next. */
                if (vb->free < (1UL << order)) {
                        spin_unlock(&vb->lock);
                        continue;
                }

                /* Hand out the pages that follow the already-used part. */
                pages_off = VMAP_BBMAP_BITS - vb->free;
                vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                vb->free -= 1UL << order;
                if (vb->free == 0) {
                        /* Block is exhausted; take it off the free list. */
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                }

                spin_unlock(&vb->lock);
                break;
        }

        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}

/*
 * Release @size bytes at @addr that were handed out from a vmap block.
 *
 * @size must be a multiple of the page size and at most
 * PAGE_SIZE * VMAP_MAX_ALLOC (both enforced with BUG_ON).  The range is
 * unmapped without a TLB flush (unless debug_pagealloc is enabled, in which
 * case the range is flushed right away), and the pages are accounted as
 * dirty in the owning block.  Once every page of the block is dirty the
 * block is freed with free_vmap_block().
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        unsigned long offset;
        unsigned int order;
        struct vmap_block *vb;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);
        /* Page index of @addr inside its VMAP_BLOCK_SIZE-sized block. */
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
        /*
         * NOTE(review): the lookup result is dereferenced below without a
         * NULL check - presumably @addr always comes from vb_alloc(); confirm.
         */
        vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

        unmap_kernel_range_noflush(addr, size);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, addr + size);

        spin_lock(&vb->lock);

        /* Expand dirty range */
        vb->dirty_min = min(vb->dirty_min, offset);
        vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

        vb->dirty += 1UL << order;
        if (vb->dirty == VMAP_BBMAP_BITS) {
                /* A fully dirty block must not have any free pages left. */
                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}

/*
 * Flush lazily-unmapped vmap aliases.
 *
 * @start, @end: initial range that has to be covered by the flush; it is
 *               widened to include the dirty range of every vmap block
 *               found on any CPU's free list.
 * @flush:       non-zero if a TLB flush is required even when no dirty
 *               block is found; set to 1 as soon as one is found.
 *
 * Does nothing before the vmap layer is initialised.  May sleep.
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
        int cpu;

        if (unlikely(!vmap_initialized))
                return;

        might_sleep();

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;

                rcu_read_lock();
                list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                        spin_lock(&vb->lock);
                        if (vb->dirty) {
                                unsigned long va_start = vb->va->va_start;
                                unsigned long s, e;

                                /* Convert the dirty page indices to addresses. */
                                s = va_start + (vb->dirty_min << PAGE_SHIFT);
                                e = va_start + (vb->dirty_max << PAGE_SHIFT);

                                start = min(s, start);
                                end   = max(e, end);

                                flush = 1;
                        }
                        spin_unlock(&vb->lock);
                }
                rcu_read_unlock();
        }

        mutex_lock(&vmap_purge_lock);
        purge_fragmented_blocks_allcpus();
        /*
         * Flush explicitly only when the lazy purge returned zero;
         * presumably a non-zero return means it already flushed the
         * range - confirm against __purge_vmap_area_lazy().
         */
        if (!__purge_vmap_area_lazy(start, end) && flush)
                flush_tlb_kernel_range(start, end);
        mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * To amortize the cost of TLB flushes, the vmap/vmalloc layer tears down
 * kernel virtual mappings lazily.  As a consequence, any page you hold now
 * may previously have been mapped at a kernel virtual address by the vmap
 * layer, and some CPUs may still have TLB entries referencing it (in
 * addition to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases() flushes all such lazy mappings.  Once it has returned,
 * none of the pages the caller controls has an alias left over from the
 * vmap layer.
 */
void vm_unmap_aliases(void)
{
        /* Start from an empty range with no flush requested up front. */
        _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr = (unsigned long)mem;
        struct vmap_area *va;

        might_sleep();
        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
        BUG_ON(!PAGE_ALIGNED(addr));

        kasan_poison_vmalloc(mem, size);

        if (likely(count <= VMAP_MAX_ALLOC)) {
                debug_check_no_locks_freed(mem, size);
                vb_free(addr, size);
                return;
        }

        va = find_vmap_area(addr);
        BUG_ON(!va);
        debug_check_no_locks_freed((void *)va->va_start,
                                    (va->va_end - va->va_start));
        free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * For requests of at most VMAP_MAX_ALLOC pages this can be faster than
 * vmap().  Mixing long-lived and short-lived objects through vm_map_ram()
 * can however waste a lot of address space through fragmentation
 * (especially on a 32bit machine) and eventually lead to failures, so use
 * this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
        const unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;

        if (unlikely(count > VMAP_MAX_ALLOC)) {
                /* Too big for a vmap block: reserve a dedicated area. */
                struct vmap_area *va;

                va = alloc_vmap_area(size, PAGE_SIZE,
                                     VMALLOC_START, VMALLOC_END,
                                     node, GFP_KERNEL);
                if (IS_ERR(va))
                        return NULL;

                addr = va->va_start;
                mem = (void *)addr;
        } else {
                mem = vb_alloc(size, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;
                addr = (unsigned long)mem;
        }

        kasan_unpoison_vmalloc(mem, size);

        if (map_kernel_range(addr, size, PAGE_KERNEL, pages) >= 0)
                return mem;

        /* Mapping failed: give the address range back. */
        vm_unmap_ram(mem, count);
        return NULL;
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Head of the list of vm_structs registered through vm_area_add_early().
 * Entries are linked via ->next and kept in ascending ->addr order;
 * vmalloc_init() walks the list to import them.
 */
static struct vm_struct *vmlist __initdata;

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * Adds a fixed kernel vm area to vmlist before vmalloc_init() has run.
 * @vm->addr, @vm->size and @vm->flags must hold proper values and all other
 * fields must be zero.  The list is kept sorted by address; overlapping an
 * already registered area is a BUG.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        while ((cur = *link) != NULL) {
                if (cur->addr >= vm->addr) {
                        /* First area at or above @vm: must start past its end. */
                        BUG_ON(cur->addr < vm->addr + vm->size);
                        break;
                }

                /* Area below @vm: must end at or before @vm starts. */
                BUG_ON(cur->addr + cur->size > vm->addr);
                link = &cur->next;
        }

        /* Splice @vm in front of whatever *link currently points at. */
        vm->next = *link;
        *link = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * Registers a kernel vm area before vmalloc_init() has run.  On entry
 * @vm->size and @vm->flags must hold proper values and all other fields
 * must be zero.  On return, vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
        /* Offset from VMALLOC_START of the first byte not yet handed out. */
        static size_t vm_init_off __initdata;
        unsigned long area_start, area_end;

        area_start = ALIGN(VMALLOC_START + vm_init_off, align);
        area_end = PFN_ALIGN(area_start + vm->size);

        /* The next registration continues after this area. */
        vm_init_off = area_end - VMALLOC_START;

        vm->addr = (void *)area_start;
        vm_area_add_early(vm);
}

/*
 * Populate the free vmap area tree/list with every gap between the busy
 * areas on vmap_area_list, covering the address range [1, ULONG_MAX).
 */
static void vmap_init_free_space(void)
{
        unsigned long cursor = 1;
        struct vmap_area *busy;
        struct vmap_area *gap;

        /*
         *     B     F     B     B     B     F
         * -|-----|.....|-----|-----|-----|.....|-
         *  |           The KVA space           |
         *  |<--------------------------------->|
         */
        list_for_each_entry(busy, &vmap_area_list, list) {
                /* Gap between the previous busy area and this one? */
                if (busy->va_start - cursor > 0) {
                        gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!WARN_ON_ONCE(!gap)) {
                                gap->va_start = cursor;
                                gap->va_end = busy->va_start;

                                insert_vmap_area_augment(gap, NULL,
                                                         &free_vmap_area_root,
                                                         &free_vmap_area_list);
                        }
                }

                cursor = busy->va_end;
        }

        /* Whatever follows the last busy area is free as well. */
        if (ULONG_MAX - cursor > 0) {
                gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (!WARN_ON_ONCE(!gap)) {
                        gap->va_start = cursor;
                        gap->va_end = ULONG_MAX;

                        insert_vmap_area_augment(gap, NULL,
                                                 &free_vmap_area_root,
                                                 &free_vmap_area_list);
                }
        }
}

/*
 * Boot-time initialisation of the vmap layer: creates the vmap_area slab
 * cache, sets up the per-CPU block queues and deferred-vfree state, imports
 * the areas registered early on vmlist and finally builds the free space.
 */
void __init vmalloc_init(void)
{
        struct vm_struct *early;
        int cpu;

        /* The slab cache every struct vmap_area is allocated from. */
        vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vfree_deferred *p = &per_cpu(vfree_deferred, cpu);

                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);

                init_llist_head(&p->list);
                INIT_WORK(&p->wq, free_work);
        }

        /* Turn each early-registered vm_struct into a busy vmap_area. */
        for (early = vmlist; early != NULL; early = early->next) {
                struct vmap_area *va;

                va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (WARN_ON_ONCE(va == NULL))
                        continue;

                va->va_start = (unsigned long)early->addr;
                va->va_end = va->va_start + early->size;
                va->vm = early;
                insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
        }

        /* With the busy areas known, the free space can be set up. */
        vmap_init_free_space();
        vmap_initialized = true;
}

/**
 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Like unmap_kernel_range_noflush(), except that the vcache is flushed
 * before the unmapping and the TLB afterwards.
 */
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
        flush_cache_vunmap(addr, addr + size);
        unmap_kernel_range_noflush(addr, size);
        flush_tlb_kernel_range(addr, addr + size);
}

/*
 * Fill in @vm from @va and link the two together.  setup_vmalloc_vm()
 * calls this with vmap_area_lock held.
 */
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
        struct vmap_area *va, unsigned long flags, const void *caller)
{
        /* Address range is taken over from the vmap area. */
        vm->addr = (void *)va->va_start;
        vm->size = va->va_end - va->va_start;

        vm->flags = flags;
        vm->caller = caller;

        va->vm = vm;
}

/*
 * Locking wrapper around setup_vmalloc_vm_locked(): initialises @vm from
 * @va (with @flags and @caller) while holding vmap_area_lock.
 */
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
                              unsigned long flags, const void *caller)
{
        spin_lock(&vmap_area_lock);
        setup_vmalloc_vm_locked(vm, va, flags, caller);
        spin_unlock(&vmap_area_lock);
}

/*
 * Mark @vm as fully initialised by clearing VM_UNINITIALIZED in its flags.
 * A write barrier is issued first so that the other fields of @vm are
 * visible before the flag is seen as cleared.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
        /*
         * Before removing VM_UNINITIALIZED,
         * we should make sure that vm has proper values.
         * Pair with smp_rmb() in show_numa_info().
         */
        smp_wmb();
        vm->flags &= ~VM_UNINITIALIZED;
}

/*
 * Allocate a vm_struct and reserve a vmap area for it inside [@start, @end).
 *
 * @size is rounded up to a page multiple; unless VM_NO_GUARD is set in
 * @flags one extra page is added to the reservation.  For VM_IOREMAP areas
 * @align is replaced by a power of two derived from the size, clamped
 * between PAGE_SHIFT and IOREMAP_MAX_ORDER.  Must not be called from
 * interrupt context.
 *
 * Returns the new area, or NULL if the size is zero or an allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long flags, unsigned long start,
                unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
        const unsigned long unpadded_size = size;
        struct vm_struct *area;
        struct vmap_area *va;

        BUG_ON(in_interrupt());

        size = PAGE_ALIGN(size);
        if (unlikely(size == 0))
                return NULL;

        if (flags & VM_IOREMAP) {
                int shift = clamp_t(int, get_count_order_long(size),
                                    PAGE_SHIFT, IOREMAP_MAX_ORDER);

                align = 1ul << shift;
        }

        area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(area == NULL))
                return NULL;

        /* Reserve one more page unless the caller opted out of it. */
        if ((flags & VM_NO_GUARD) == 0)
                size += PAGE_SIZE;

        va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
        if (IS_ERR(va)) {
                kfree(area);
                return NULL;
        }

        /* Only the size as originally passed in is unpoisoned. */
        kasan_unpoison_vmalloc((void *)va->va_start, unpadded_size);

        setup_vmalloc_vm(area, va, flags, caller);

        return area;
}

/*
 * Reserve a kernel virtual area of @size bytes inside [@start, @end) on
 * behalf of @caller.  Forwards to __get_vm_area_node() with an alignment
 * of 1, NUMA_NO_NODE and GFP_KERNEL, and returns its result (the area
 * descriptor, or NULL on failure).
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
{
        return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
                                  GFP_KERNEL, caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:         size of the area
 * @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search for an area of @size in the kernel virtual mapping area
 * (between VMALLOC_START and VMALLOC_END) and reserve it for our purposes.
 * The return address of this function is recorded as the caller.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        const void *caller = __builtin_return_address(0);

        return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/*
 * Same as get_vm_area(), except that the caller address recorded in the
 * area is supplied explicitly through @caller.  Returns the area
 * descriptor, or NULL on failure.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
{
        return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:          base address
 *
 * Look up the kernel VM area starting at @addr and return it.  Keeping the
 * returned pointer valid is entirely the caller's responsibility; any
 * locking required for that has to be done by the caller.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va = find_vmap_area((unsigned long)addr);

        return va ? va->vm : NULL;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:            base address
 *
 * Look up the kernel VM area starting at @addr and remove it.  The found
 * VM area is returned, but on SMP machines it is NOT safe to use anything
 * from it other than its size or flags.  May sleep.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vmap_area *va;
        struct vm_struct *vm;

        might_sleep();

        spin_lock(&vmap_area_lock);
        va = __find_vmap_area((unsigned long)addr);
        if (va == NULL || va->vm == NULL) {
                spin_unlock(&vmap_area_lock);
                return NULL;
        }

        /* Detach the vm_struct from its vmap area while still locked. */
        vm = va->vm;
        va->vm = NULL;
        spin_unlock(&vmap_area_lock);

        kasan_free_shadow(vm);
        free_unmap_vmap_area(va);

        return vm;
}

/*
 * Call @set_direct_map on every page of @area for which page_address()
 * returns a non-NULL address.  The callback's return value is ignored.
 */
static inline void set_area_direct_map(const struct vm_struct *area,
                                       int (*set_direct_map)(struct page *page))
{
        int i;

        for (i = 0; i < area->nr_pages; i++) {
                struct page *page = area->pages[i];

                if (!page_address(page))
                        continue;
                set_direct_map(page);
        }
}

/*
 * Remove the vm area behind @area and, for VM_FLUSH_RESET_PERMS memory,
 * flush stale mappings and (when @deallocate_pages is set) reset the direct
 * map permissions of its pages.
 */
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
        /* Sampled before the area is removed below. */
        int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
        unsigned long dmap_start = ULONG_MAX;
        unsigned long dmap_end = 0;
        int need_dmap_flush = 0;
        int i;

        remove_vm_area(area->addr);

        /* Nothing more to do unless this is VM_FLUSH_RESET_PERMS memory. */
        if (!flush_reset)
                return;

        /* Pages are kept: flushing the VM area is all that is needed. */
        if (!deallocate_pages) {
                vm_unmap_aliases();
                return;
        }

        /*
         * From here on the vm mapping is flushed and the direct map reset.
         * Work out the range spanned by the direct mappings so that the
         * alias flush below also covers the direct map.
         */
        for (i = 0; i < area->nr_pages; i++) {
                unsigned long addr = (unsigned long)page_address(area->pages[i]);

                if (!addr)
                        continue;

                dmap_start = min(addr, dmap_start);
                dmap_end = max(addr + PAGE_SIZE, dmap_end);
                need_dmap_flush = 1;
        }

        /*
         * Make the direct map invalid first so nothing gets cached by
         * accesses happening after the TLB flush, then flush the TLB and
         * restore the default direct map permissions.
         */
        set_area_direct_map(area, set_direct_map_invalid_noflush);
        _vm_unmap_aliases(dmap_start, dmap_end, need_dmap_flush);
        set_area_direct_map(area, set_direct_map_default_noflush);
}

/*
 * Tear down the vm area that starts at @addr.
 *
 * @addr:             base address of the area; NULL is silently ignored,
 *                    an unaligned or unknown address only triggers a
 *                    warning.
 * @deallocate_pages: when non-zero, the pages recorded in the area and the
 *                    page array itself are freed as well.
 *
 * The vm_struct describing the area is always freed.
 */
static void __vunmap(const void *addr, int deallocate_pages)
{
        struct vm_struct *area;

        if (!addr)
                return;

        if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
                        addr))
                return;

        area = find_vm_area(addr);
        if (unlikely(!area)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
        debug_check_no_obj_freed(area->addr, get_vm_area_size(area));

        kasan_poison_vmalloc(area->addr, get_vm_area_size(area));

        vm_remove_mappings(area, deallocate_pages);

        if (deallocate_pages) {
                int i;

                for (i = 0; i < area->nr_pages; i++) {
                        struct page *page = area->pages[i];

                        BUG_ON(!page);
                        __free_pages(page, 0);
                }
                /* nr_vmalloc_pages is not adjusted for VM_MAP_PUT_PAGES areas. */
                if (!(area->flags & VM_MAP_PUT_PAGES))
                        atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);

                kvfree(area->pages);
        }

        kfree(area);
        return;
}

/*
 * Queue @addr on a per-CPU deferred-free list and kick the associated work
 * item when llist_add() returns non-zero.  The memory at @addr itself is
 * used as the list node.
 */
static inline void __vfree_deferred(const void *addr)
{
        /*
         * raw_cpu_ptr() is used on purpose: this may run in preemptible
         * context.  Being preempted is harmless because llist_add() is
         * lockless and therefore works even when the node ends up on
         * another cpu's list.  schedule_work() should cope with that too.
         */
        struct vfree_deferred *queue = raw_cpu_ptr(&vfree_deferred);
        struct llist_node *node = (struct llist_node *)addr;

        if (!llist_add(node, &queue->list))
                return;

        schedule_work(&queue->wq);
}

/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:          memory base address
 *
 * Behaves like vfree(), but the actual freeing is always deferred, so it
 * may be called from any atomic context other than NMI.  A NULL @addr is
 * ignored.
 */
void vfree_atomic(const void *addr)
{
        BUG_ON(in_nmi());

        kmemleak_free(addr);

        if (addr)
                __vfree_deferred(addr);
}

/*
 * Free @addr right away when possible; from interrupt context the free is
 * handed over to __vfree_deferred() instead.
 */
static void __vfree(const void *addr)
{
        if (likely(!in_interrupt())) {
                __vunmap(addr, 1);
                return;
        }

        __vfree_deferred(addr);
}

/**
 * vfree - Release memory allocated by vmalloc()
 * @addr:  Memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as obtained
 * from one of the vmalloc() family of APIs.  This will usually also free the
 * physical memory underlying the virtual allocation, but that memory is
 * reference counted, so it will not be freed until the last user goes away.
 *
 * If @addr is NULL, no operation is performed.
 *
 * Context:
 * May sleep if called *not* from interrupt context.
 * Must not be called in NMI context (strictly speaking, it could be
 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
 */
void vfree(const void *addr)
{
        BUG_ON(in_nmi());

        kmemleak_free(addr);

        might_sleep_if(!in_interrupt());

        if (addr)
                __vfree(addr);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr, which was
 * created from the page array passed to vmap().  A NULL @addr is ignored.
 *
 * May sleep.  Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        BUG_ON(in_interrupt());
        might_sleep();

        if (!addr)
                return;

        __vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual space.
 * If @flags contains %VM_MAP_PUT_PAGES, ownership of the pages array itself
 * (which must be kmalloc or vmalloc memory) and one reference per page in it
 * are transferred from the caller to vmap(); they will be freed / dropped
 * when vfree() is called on the return value.
 *
 * May sleep.
 *
 * Return: the address of the area or %NULL on failure
 */
void *vmap(struct page **pages, unsigned int count,
           unsigned long flags, pgprot_t prot)
{
        struct vm_struct *area;
        unsigned long nbytes;
        void *vaddr;

        might_sleep();

        /* More pages than the system has RAM cannot be a sane request. */
        if (count > totalram_pages())
                return NULL;

        nbytes = (unsigned long)count << PAGE_SHIFT;
        area = get_vm_area_caller(nbytes, flags, __builtin_return_address(0));
        if (area == NULL)
                return NULL;

        vaddr = area->addr;
        if (map_kernel_range((unsigned long)vaddr, nbytes, pgprot_nx(prot),
                             pages) < 0) {
                vunmap(vaddr);
                return NULL;
        }

        /* Remember the page array so that vfree() can release it. */
        if (flags & VM_MAP_PUT_PAGES) {
                area->pages = pages;
                area->nr_pages = count;
        }

        return vaddr;
}
EXPORT_SYMBOL(vmap);

#ifdef CONFIG_VMAP_PFN
/* State passed by vmap_pfn() to vmap_pfn_apply() for each PTE it sets up. */
struct vmap_pfn_data {
        unsigned long        *pfns;        /* array of PFNs being mapped */
        pgprot_t        prot;        /* protection applied to every PTE */
        unsigned int        idx;        /* index into @pfns of the next PFN to map */
};

/*
 * apply_to_page_range() callback used by vmap_pfn(): install the next PFN
 * from the vmap_pfn_data behind @private into @pte as a special PTE.
 *
 * Returns 0 on success, or -EINVAL (after a one-time warning) when
 * pfn_valid() reports true for the PFN; the cursor is not advanced then.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
        struct vmap_pfn_data *ctx = private;
        unsigned long pfn = ctx->pfns[ctx->idx];

        if (WARN_ON_ONCE(pfn_valid(pfn)))
                return -EINVAL;

        *pte = pte_mkspecial(pfn_pte(pfn, ctx->prot));
        ctx->idx++;

        return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
 * the start address of the mapping, or %NULL if the area could not be
 * reserved or populated.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
        struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
        const unsigned long nbytes = count * PAGE_SIZE;
        struct vm_struct *area;
        unsigned long base;

        area = get_vm_area_caller(nbytes, VM_IOREMAP,
                                  __builtin_return_address(0));
        if (area == NULL)
                return NULL;

        base = (unsigned long)area->addr;
        if (apply_to_page_range(&init_mm, base, nbytes,
                                vmap_pfn_apply, &data)) {
                free_vm_area(area);
                return NULL;
        }

        flush_cache_vmap(base, base + nbytes);

        return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */

/*
 * Allocate the backing pages for @area and map them into it.
 *
 * @area:     vm area to populate; its size determines the number of pages
 * @gfp_mask: flags for the page allocator (__GFP_NOWARN is always added,
 *            __GFP_HIGHMEM is added unless a DMA zone was requested)
 * @prot:     protection used when mapping the pages
 * @node:     node to allocate from, or NUMA_NO_NODE
 *
 * The array of page pointers is allocated with kmalloc_node() when it fits
 * in one page and with __vmalloc_node() otherwise.
 *
 * Returns the address of the area on success.  On failure NULL is returned
 * and @area has been released - the caller must not touch it again.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, int node)
{
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
        unsigned long array_size;
        unsigned int i;
        struct page **pages;

        /*
         * Keep the byte size of the page array in an unsigned long: with
         * 8-byte pointers the product no longer fits in 32 bits once
         * nr_pages reaches 2^29, and a truncated size would make the
         * array too small for the pages stored into it below.
         */
        array_size = (unsigned long)nr_pages * sizeof(struct page *);

        gfp_mask |= __GFP_NOWARN;
        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
                gfp_mask |= __GFP_HIGHMEM;

        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
                pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                                        area->caller);
        } else {
                pages = kmalloc_node(array_size, nested_gfp, node);
        }

        if (!pages) {
                /* No pages recorded yet, so release just the area itself. */
                remove_vm_area(area->addr);
                kfree(area);
                return NULL;
        }

        area->pages = pages;
        area->nr_pages = nr_pages;

        for (i = 0; i < area->nr_pages; i++) {
                struct page *page;

                if (node == NUMA_NO_NODE)
                        page = alloc_page(gfp_mask);
                else
                        page = alloc_pages_node(node, gfp_mask, 0);

                if (unlikely(!page)) {
                        /* Successfully allocated i pages, free them in __vfree() */
                        area->nr_pages = i;
                        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
                        goto fail;
                }
                area->pages[i] = page;
                if (gfpflags_allow_blocking(gfp_mask))
                        cond_resched();
        }
        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

        if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
                        prot, pages) < 0)
                goto fail;

        return area->addr;

fail:
        warn_alloc(gfp_mask, NULL,
                          "vmalloc: allocation failure, allocated %ld of %ld bytes",
                          (area->nr_pages*PAGE_SIZE), area->size);
        /* Frees the pages recorded so far, the page array and the area. */
        __vfree(area->addr);
        return NULL;
}

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:                  allocation size
 * @align:                  desired alignment
 * @start:                  vm area range start
 * @end:                  vm area range end
 * @gfp_mask:                  flags for the page level allocator
 * @prot:                  protection mask for the allocated pages
 * @vm_flags:                  additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:                  node to use for allocation or NUMA_NO_NODE
 * @caller:                  caller's return address
 *
 * Allocate enough pages from the page level allocator, using @gfp_mask, to
 * cover @size, and map them into contiguous kernel virtual space with a
 * pagetable protection of @prot.  Requests of zero size, or for more pages
 * than the system has RAM, fail.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller)
{
        const unsigned long real_size = size;
        struct vm_struct *area = NULL;
        void *addr;

        size = PAGE_ALIGN(size);

        /* Only try to reserve an area for a sane, non-empty request. */
        if (size && (size >> PAGE_SHIFT) <= totalram_pages())
                area = __get_vm_area_node(real_size, align,
                                          VM_ALLOC | VM_UNINITIALIZED | vm_flags,
                                          start, end, node, gfp_mask, caller);

        if (area == NULL) {
                warn_alloc(gfp_mask, NULL,
                           "vmalloc: allocation failure: %lu bytes", real_size);
                return NULL;
        }

        addr = __vmalloc_area_node(area, gfp_mask, prot, node);
        if (addr == NULL)
                return NULL;

        /*
         * The vm_struct was created with VM_UNINITIALIZED set, marking it
         * as not fully set up.  It is complete at this point, so the flag
         * is dropped.
         */
        clear_vm_uninitialized_flag(area);

        kmemleak_vmalloc(area, size, gfp_mask);

        return addr;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:            allocation size
 * @align:            desired alignment
 * @gfp_mask:            flags for the page level allocator
 * @node:            node to use for allocation or NUMA_NO_NODE
 * @caller:            caller's return address
 *
 * Allocate enough pages to cover @size from the page level allocator with
 * @gfp_mask flags.  Map them into contiguous kernel virtual space.
 *
 * This is __vmalloc_node_range() restricted to the VMALLOC_START -
 * VMALLOC_END range, with PAGE_KERNEL protection and no additional vm
 * area flags.
 *
 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
 * and __GFP_NOFAIL are not supported
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted
 * with mm people.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *__vmalloc_node(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, int node, const void *caller)
{
        return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
                                gfp_mask, PAGE_KERNEL, 0, node, caller);
}
/*
 * This export exists only for performance analysis and stress testing of
 * vmalloc. It is required by the vmalloc test module; do not use it for
 * anything else.
 */
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node);
#endif

/*
 * __vmalloc - vmalloc with caller-supplied page allocator flags
 *
 * Forwards to __vmalloc_node() with alignment 1 and NUMA_NO_NODE, and
 * records this function's return address as the allocation's caller.
 */
void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc(unsigned long size)
{
        /* Recorded as the allocation's caller by __vmalloc_node(). */
        const void *caller = __builtin_return_address(0);
        void *mem;

        /* Alignment 1, GFP_KERNEL, NUMA_NO_NODE. */
        mem = __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
        return mem;
}
EXPORT_SYMBOL(vmalloc);

/**
 * vzalloc - allocate virtually contiguous memory with zero fill
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc(unsigned long size)
{
        return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The resulting memory area is zeroed so it can be mapped to userspace
 * without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user(unsigned long size)
{
        return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:          allocation size
 * @node:          numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_node(unsigned long size, int node)
{
        const void *caller = __builtin_return_address(0);
        void *mem;

        /* Like vmalloc(), but @node is handed to __vmalloc_node(). */
        mem = __vmalloc_node(size, 1, GFP_KERNEL, node, caller);
        return mem;
}
EXPORT_SYMBOL(vmalloc_node);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:        allocation size
 * @node:        numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_node(unsigned long size, int node)
{
        return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);

/*
 * GFP_VMALLOC32 - page allocator flags used by vmalloc_32() and
 * vmalloc_32_user().
 *
 * Every variant is fully parenthesized so that the macro behaves as a
 * single operand wherever it is expanded (e.g. next to '&', '~' or '==').
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
 * 64b systems should always have either DMA or DMA32 zones. For others
 * GFP_DMA32 should do the right thing and use the normal zone.
 */
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size:        allocation size
 *
 * Allocate enough 32bit PA addressable pages to cover @size from the
 * page level allocator and map them into contiguous kernel virtual space.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32(unsigned long size)
{
        const void *caller = __builtin_return_address(0);
        void *mem;

        /* GFP_VMALLOC32 selects the zone flags chosen for this configuration. */
        mem = __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, caller);
        return mem;
}
EXPORT_SYMBOL(vmalloc_32);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:             allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user(unsigned long size)
{
        return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);

/*
 * Small helper routine: copy @count bytes from @addr into @buf, one page
 * at a time. Where no page is mapped at @addr, the corresponding part of
 * @buf is zero-filled instead.
 */

static int aligned_vread(char *buf, char *addr, unsigned long count)
{
        int done = 0;

        while (count != 0) {
                /* Never cross a page boundary within a single step. */
                unsigned long in_page = offset_in_page(addr);
                unsigned long chunk = PAGE_SIZE - in_page;
                struct page *pg;

                if (chunk > count)
                        chunk = count;

                /*
                 * Accessing this _mapped_ area safely would need a lock,
                 * which would add overhead to vmalloc()/vfree() for the
                 * sake of a rarely used _debug_ interface.  Instead the
                 * page is looked up and temporarily mapped with
                 * kmap_atomic(), accepting a small cost in this function.
                 */
                pg = vmalloc_to_page(addr);
                if (!pg) {
                        /* No page behind this address: report zeroes. */
                        memset(buf, 0, chunk);
                } else {
                        void *kva = kmap_atomic(pg);

                        memcpy(buf, kva + in_page, chunk);
                        kunmap_atomic(kva);
                }

                buf += chunk;
                addr += chunk;
                done += chunk;
                count -= chunk;
        }

        return done;
}

/*
 * Counterpart of aligned_vread(): copy @count bytes from @buf to @addr,
 * one page at a time.  Ranges with no page mapped are skipped, but still
 * counted in the return value.
 */
static int aligned_vwrite(char *buf, char *addr, unsigned long count)
{
        int done = 0;

        while (count != 0) {
                /* Never cross a page boundary within a single step. */
                unsigned long in_page = offset_in_page(addr);
                unsigned long chunk = PAGE_SIZE - in_page;
                struct page *pg;

                if (chunk > count)
                        chunk = count;

                /*
                 * Accessing this _mapped_ area safely would need a lock,
                 * which would add overhead to vmalloc()/vfree() for the
                 * sake of a rarely used _debug_ interface.  Instead the
                 * page is looked up and temporarily mapped with
                 * kmap_atomic(), accepting a small cost in this function.
                 */
                pg = vmalloc_to_page(addr);
                if (pg) {
                        void *kva = kmap_atomic(pg);

                        memcpy(kva + in_page, buf, chunk);
                        kunmap_atomic(kva);
                }

                buf += chunk;
                addr += chunk;
                done += chunk;
                count -= chunk;
        }

        return done;
}

/**
 * vread() - read vmalloc area in a safe way.
 * @buf:     buffer for reading data
 * @addr:    vm address.
 * @count:   number of bytes to be read.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copy data from that area to a given buffer. If the given memory range
 * of [addr...addr+count) includes some valid address, data is copied to
 * proper area of @buf. If there are memory holes, they'll be zero-filled.
 * IOREMAP area is treated as memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't intersect with any live vm_struct
 * area, returns 0. @buf should be kernel's buffer.
 *
 * Note: In usual ops, vread() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /dev/kmem.
 *
 * Return: number of bytes for which addr and buf should be increased
 * (same number as @count) or %0 if [addr...addr+count) doesn't
 * include any intersection with valid vmalloc area
 */
long vread(char *buf, char *addr, unsigned long count)
{
        char *const out_begin = buf;
        const unsigned long requested = count;
        struct vmap_area *va;

        /* Clamp @count so that addr + count cannot wrap around. */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;

        spin_lock(&vmap_area_lock);
        list_for_each_entry(va, &vmap_area_list, list) {
                struct vm_struct *vm;
                char *area_begin;
                unsigned long chunk;

                if (count == 0)
                        break;

                /* Entries without a vm_struct are skipped. */
                vm = va->vm;
                if (vm == NULL)
                        continue;

                /* This area ends at or before @addr: nothing to read here. */
                area_begin = (char *) vm->addr;
                if (addr >= area_begin + get_vm_area_size(vm))
                        continue;

                /* Hole in front of the area: emit zero bytes up to its start. */
                for (; addr < area_begin; buf++, addr++, count--) {
                        if (count == 0)
                                goto finished;
                        *buf = '\0';
                }

                /* Part of the request that falls inside this area. */
                chunk = area_begin + get_vm_area_size(vm) - addr;
                if (chunk > count)
                        chunk = count;

                if (vm->flags & VM_IOREMAP)
                        /* IOREMAP area is treated as memory hole */
                        memset(buf, 0, chunk);
                else
                        aligned_vread(buf, addr, chunk);

                buf += chunk;
                addr += chunk;
                count -= chunk;
        }
finished:
        spin_unlock(&vmap_area_lock);

        /* Nothing was produced at all: report no intersection. */
        if (buf == out_begin)
                return 0;

        /* Zero-fill whatever is left of the caller's buffer. */
        if (buf != out_begin + requested)
                memset(buf, 0, requested - (buf - out_begin));

        return requested;
}

/**
 * vwrite() - write vmalloc area in a safe way.
 * @buf:      buffer for source data
 * @addr:     vm address.
 * @count:    number of bytes to be written.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copy data from a buffer to the given addr. If specified range of
 * [addr...addr+count) includes some valid address, data is copied from
 * proper area of @buf. If there are memory holes, no copy to hole.
 * IOREMAP area is treated as memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't intersect with any live vm_struct
 * area, returns 0. @buf should be kernel's buffer.
 *
 * Note: In usual ops, vwrite() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /dev/kmem.
 *
 * Return: number of bytes for which addr and buf should be
 * increased (same number as @count) or %0 if [addr...addr+count)
 * doesn't include any intersection with valid vmalloc area
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
        struct vmap_area *va;
        unsigned long requested;
        int areas_written = 0;

        /* Clamp @count so that addr + count cannot wrap around. */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;
        requested = count;

        spin_lock(&vmap_area_lock);
        list_for_each_entry(va, &vmap_area_list, list) {
                struct vm_struct *vm;
                char *area_begin;
                unsigned long chunk;

                if (count == 0)
                        break;

                /* Entries without a vm_struct are skipped. */
                vm = va->vm;
                if (vm == NULL)
                        continue;

                /* This area ends at or before @addr: nothing to write here. */
                area_begin = (char *) vm->addr;
                if (addr >= area_begin + get_vm_area_size(vm))
                        continue;

                /* Hole in front of the area: skip the source bytes for it. */
                for (; addr < area_begin; buf++, addr++, count--) {
                        if (count == 0)
                                goto finished;
                }

                /* Part of the request that falls inside this area. */
                chunk = area_begin + get_vm_area_size(vm) - addr;
                if (chunk > count)
                        chunk = count;

                /* IOREMAP areas are treated as holes: nothing is copied. */
                if (!(vm->flags & VM_IOREMAP)) {
                        aligned_vwrite(buf, addr, chunk);
                        areas_written++;
                }

                buf += chunk;
                addr += chunk;
                count -= chunk;
        }
finished:
        spin_unlock(&vmap_area_lock);

        /* No non-IOREMAP area was touched: report no intersection. */
        return areas_written ? requested : 0;
}

/**
 * remap_vmalloc_range_partial - map vmalloc pages to userspace
 * @vma:                vma to cover
 * @uaddr:                target user address to start at
 * @kaddr:                virtual address of vmalloc kernel memory
 * @pgoff:                offset from @kaddr to start at
 * @size:                size of map area
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that @kaddr is a valid vmalloc'ed area,
 * and that it is big enough to cover the range starting at
 * @uaddr in @vma. Will return failure if that criteria isn't
 * met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
                                void *kaddr, unsigned long pgoff,
                                unsigned long size)
{
        struct vm_struct *area;
        unsigned long off;
        unsigned long end_index;

        if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
                return -EINVAL;

        size = PAGE_ALIGN(size);

        if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
                return -EINVAL;

        area = find_vm_area(kaddr);
        if (!area)
                return -EINVAL;

        if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
                return -EINVAL;

        if (check_add_overflow(size, off, &end_index) ||
            end_index > get_vm_area_size(area))
                return -EINVAL;
        kaddr += off;

        do {
                struct page *page = vmalloc_to_page(kaddr);
                int ret;

                ret = vm_insert_page(vma, uaddr, page);
                if (ret)
                        return ret;

                uaddr += PAGE_SIZE;
                kaddr += PAGE_SIZE;
                size -= PAGE_SIZE;
        } while (size > 0);

        vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;

        return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range_partial);

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma:                vma to cover (map full range of vma)
 * @addr:                vmalloc memory
 * @pgoff:                number of pages into addr before first page to map
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * that it is big enough to cover the vma. Will return failure if
 * that criteria isn't met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                        unsigned long pgoff)
{
        /* Map the whole vma: start at vm_start, cover vm_end - vm_start bytes. */
        const unsigned long uaddr = vma->vm_start;
        const unsigned long length = vma->vm_end - vma->vm_start;

        return remap_vmalloc_range_partial(vma, uaddr, addr, pgoff, length);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * free_vm_area - remove a vm area and free its descriptor
 * @area: the vm_struct to release
 *
 * The area found at @area->addr must be @area itself, otherwise BUG().
 */
void free_vm_area(struct vm_struct *area)
{
        struct vm_struct *removed = remove_vm_area(area->addr);

        BUG_ON(removed != area);
        kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
/* Map an rb-tree node to the vmap_area embedding it, via rb_entry_safe(). */
static struct vmap_area *node_to_va(struct rb_node *n)
{
        struct vmap_area *va = rb_entry_safe(n, struct vmap_area, rb_node);

        return va;
}

/**
 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
 * @addr: target address
 *
 * Returns: the vmap_area enclosing @addr if there is one. Otherwise
 *   the closest vmap_area below @addr (searching in reverse order),
 *   i.e. one with va->va_start < addr && va->va_end < addr, or NULL
 *   if there are no areas before @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
        struct rb_node *node = free_vmap_area_root.rb_node;
        struct vmap_area *best = NULL;

        while (node != NULL) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);

                /* Starts above @addr: only the left subtree can qualify. */
                if (cur->va_start > addr) {
                        node = node->rb_left;
                        continue;
                }

                /* Starts at or below @addr: remember it as the best so far. */
                best = cur;
                if (cur->va_end >= addr)
                        return best;

                /* Ends below @addr: a closer area may sit to the right. */
                node = node->rb_right;
        }

        return best;
}

/**
 * pvm_determine_end_from_reverse - find the highest aligned address
 * of free block below VMALLOC_END
 * @va:
 *   in - the VA we start the search(reverse order);
 *   out - the VA with the highest aligned end address.
 *
 * Returns: determined end address within vmap_area
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
        const unsigned long align_mask = ~(align - 1);
        const unsigned long limit = VMALLOC_END & align_mask;

        /* No starting area: there is nothing to walk. */
        if (!likely(*va))
                return 0;

        /*
         * Walk the free list backwards from *va.  The first area whose
         * aligned-down end (capped at the aligned VMALLOC_END) still lies
         * above its start wins; *va is left pointing at that area.
         */
        list_for_each_entry_from_reverse((*va),
                        &free_vmap_area_list, list) {
                unsigned long aligned_end = (*va)->va_end & align_mask;
                unsigned long candidate = min(aligned_end, limit);

                if (candidate > (*va)->va_start)
                        return candidate;
        }

        return 0;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *            vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align)
{
        /* The vmalloc window, shrunk inwards to @align boundaries. */
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
        struct vmap_area **vas, *va;
        struct vm_struct **vms;
        int area, area2, last_area, term_area;
        unsigned long base, start, size, end, last_end, orig_start, orig_end;
        bool purged = false;    /* set once the single purge-and-retry was used */
        enum fit_type type;

        /* verify parameters and allocate data structures */
        BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];

                /* is everything aligned properly? */
                BUG_ON(!IS_ALIGNED(offsets[area], align));
                BUG_ON(!IS_ALIGNED(sizes[area], align));

                /* detect the area with the highest address */
                if (start > offsets[last_area])
                        last_area = area;

                /* no two requested areas may overlap each other */
                for (area2 = area + 1; area2 < nr_vms; area2++) {
                        unsigned long start2 = offsets[area2];
                        unsigned long end2 = start2 + sizes[area2];

                        BUG_ON(start2 < end && start < end2);
                }
        }
        /* end offset of the highest area, relative to the common base */
        last_end = offsets[last_area] + sizes[last_area];

        /* the aligned vmalloc window cannot hold the highest area's end */
        if (vmalloc_end - vmalloc_start < last_end) {
                WARN_ON(true);
                return NULL;
        }

        /*
         * vms is the array handed back to the caller on success; vas is
         * scratch space that is freed on every exit path.
         */
        vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
        vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
        if (!vas || !vms)
                goto err_free2;

        for (area = 0; area < nr_vms; area++) {
                vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
                vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
retry:
        /* the search and the carving below run under free_vmap_area_lock */
        spin_lock(&free_vmap_area_lock);

        /* start scanning - we scan from the top, begin with the last area */
        area = term_area = last_area;
        start = offsets[area];
        end = start + sizes[area];

        va = pvm_find_va_enclose_addr(vmalloc_end);
        base = pvm_determine_end_from_reverse(&va, align) - end;

        while (true) {
                /*
                 * base might have underflowed, add last_end before
                 * comparing.
                 */
                if (base + last_end < vmalloc_start + last_end)
                        goto overflow;

                /*
                 * Fitting base has not been found.
                 */
                if (va == NULL)
                        goto overflow;

                /*
                 * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * If this VA does not fit, move base downwards and recheck.
                 */
                if (base + start < va->va_start) {
                        va = node_to_va(rb_prev(&va->rb_node));
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * This area fits, move on to the previous one.  If
                 * the previous one is the terminal one, we're done.
                 */
                /* step to the previous index, wrapping from 0 to nr_vms - 1 */
                area = (area + nr_vms - 1) % nr_vms;
                if (area == term_area)
                        break;

                start = offsets[area];
                end = start + sizes[area];
                va = pvm_find_va_enclose_addr(base + end);
        }

        /* we've found a fitting base, insert all va's */
        for (area = 0; area < nr_vms; area++) {
                int ret;

                start = base + offsets[area];
                size = sizes[area];

                va = pvm_find_va_enclose_addr(start);
                if (WARN_ON_ONCE(va == NULL))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                type = classify_va_fit_type(va, start, size);
                if (WARN_ON_ONCE(type == NOTHING_FIT))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                ret = adjust_va_to_fit_type(va, start, size, type);
                if (unlikely(ret))
                        goto recovery;

                /* Allocated area. */
                va = vas[area];
                va->va_start = start;
                va->va_end = start + size;
        }

        spin_unlock(&free_vmap_area_lock);

        /* populate the kasan shadow space */
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
                        goto err_free_shadow;

                kasan_unpoison_vmalloc((void *)vas[area]->va_start,
                                       sizes[area]);
        }

        /* insert all vm's */
        spin_lock(&vmap_area_lock);
        for (area = 0; area < nr_vms; area++) {
                insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

                setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
                                 pcpu_get_vm_areas);
        }
        spin_unlock(&vmap_area_lock);

        /* success: only the scratch array is freed, vms goes to the caller */
        kfree(vas);
        return vms;

recovery:
        /*
         * Remove previously allocated areas. There is no
         * need in removing these areas from the busy tree,
         * because they are inserted only on the final step
         * and when pcpu_get_vm_areas() is success.
         */
        /*
         * Reached with free_vmap_area_lock held and @area being the index
         * that failed, so this walks indices area - 1 down to 0.
         */
        while (area--) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                /* handed over to the free tree; must be re-allocated before a retry */
                vas[area] = NULL;
        }

overflow:
        spin_unlock(&free_vmap_area_lock);
        /* one purge-and-retry attempt only; a second failure falls into err_free */
        if (!purged) {
                purge_vmap_area_lazy();
                purged = true;

                /* Before "retry", check if we recover. */
                for (area = 0; area < nr_vms; area++) {
                        if (vas[area])
                                continue;

                        vas[area] = kmem_cache_zalloc(
                                vmap_area_cachep, GFP_KERNEL);
                        if (!vas[area])
                                goto err_free;
                }

                goto retry;
        }

err_free:
        /*
         * Slots that were never filled (or were cleared in recovery) are
         * NULL: both arrays were allocated with kcalloc().
         */
        for (area = 0; area < nr_vms; area++) {
                if (vas[area])
                        kmem_cache_free(vmap_area_cachep, vas[area]);

                kfree(vms[area]);
        }
err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;

err_free_shadow:
        /* entered without free_vmap_area_lock held, so take it again here */
        spin_lock(&free_vmap_area_lock);
        /*
         * We release all the vmalloc shadows, even the ones for regions that
         * hadn't been successfully added. This relies on kasan_release_vmalloc
         * being able to tolerate this case.
         */
        for (area = 0; area < nr_vms; area++) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }
        spin_unlock(&free_vmap_area_lock);
        kfree(vas);
        kfree(vms);
        return NULL;
}

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int idx = 0;

        /* Release each area first, then the pointer array itself. */
        while (idx < nr_vms) {
                free_vm_area(vms[idx]);
                idx++;
        }

        kfree(vms);
}
#endif        /* CONFIG_SMP */

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take vmap_purge_lock, then vmap_area_lock, and position
 * the iterator at *pos within vmap_area_list.  s_stop() drops both locks.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
        __acquires(&vmap_purge_lock)
        __acquires(&vmap_area_lock)
{
        void *first;

        mutex_lock(&vmap_purge_lock);
        spin_lock(&vmap_area_lock);

        first = seq_list_start(&vmap_area_list, *pos);
        return first;
}

/* seq_file ->next: advance from @p to the following vmap_area_list entry. */
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
        void *following = seq_list_next(p, &vmap_area_list, pos);

        return following;
}

/*
 * seq_file ->stop: release the two locks taken in s_start().  @m and @p
 * are not used.
 */
static void s_stop(struct seq_file *m, void *p)
        __releases(&vmap_area_lock)
        __releases(&vmap_purge_lock)
{
        /* Unlock in the reverse order of acquisition in s_start(). */
        spin_unlock(&vmap_area_lock);
        mutex_unlock(&vmap_purge_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
        if (IS_ENABLED(CONFIG_NUMA)) {
                unsigned int nr, *counters = m->private;

                if (!counters)
                        return;

                if (v->flags & VM_UNINITIALIZED)
                        return;
                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                memset(counters, 0, nr_node_ids * sizeof(unsigned int));

                for (nr = 0; nr < v->nr_pages; nr++)
                        counters[page_to_nid(v->pages[nr])]++;

                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
        }
}

/* Emit one "unpurged vm_area" line per entry currently on vmap_purge_list. */
static void show_purge_info(struct seq_file *m)
{
        struct llist_node *first = READ_ONCE(vmap_purge_list.first);
        struct vmap_area *va;

        if (!first)
                return;

        llist_for_each_entry(va, first, purge_list) {
                void *from = (void *)va->va_start;
                void *to = (void *)va->va_end;

                seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
                        from, to, va->va_end - va->va_start);
        }
}

static int s_show(struct seq_file *m, void *p)
{
        struct vmap_area *va;
        struct vm_struct *v;

        va = list_entry(p, struct vmap_area, list);

        /*
         * s_show can encounter race with remove_vm_area, !vm on behalf
         * of vmap area is being tear down or vm_map_ram allocation.
         */
        if (!va->vm) {
                seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
                        (void *)va->va_start, (void *)va->va_end,
                        va->va_end - va->va_start);

                return 0;
        }

        v = va->vm;

        seq_printf(m, "0x%pK-0x%pK %7ld",
                v->addr, v->addr + v->size, v->size);

        if (v->caller)
                seq_printf(m, " %pS", v->caller);

        if (v->nr_pages)
                seq_printf(m, " pages=%d", v->nr_pages);

        if (v->phys_addr)
                seq_printf(m, " phys=%pa", &v->phys_addr);

        if (v->flags & VM_IOREMAP)
                seq_puts(m, " ioremap");

        if (v->flags & VM_ALLOC)
                seq_puts(m, " vmalloc");

        if (v->flags & VM_MAP)
                seq_puts(m, " vmap");

        if (v->flags & VM_USERMAP)
                seq_puts(m, " user");

        if (v->flags & VM_DMA_COHERENT)
                seq_puts(m, " dma-coherent");

        if (is_vmalloc_addr(v->pages))
                seq_puts(m, " vpages");

        show_numa_info(m, v);
        seq_putc(m, '\n');

        /*
         * As a final step, dump "unpurged" areas. Note,
         * that entire "/proc/vmallocinfo" output will not
         * be address sorted, because the purge list is not
         * sorted.
         */
        if (list_is_last(&va->list, &vmap_area_list))
                show_purge_info(m);

        return 0;
}

/* seq_file iterator callbacks behind /proc/vmallocinfo. */
static const struct seq_operations vmalloc_op = {
        .start  = s_start,
        .stop   = s_stop,
        .next   = s_next,
        .show   = s_show,
};

/*
 * Create the "vmallocinfo" procfs entry (mode 0400, owner read-only),
 * backed by vmalloc_op.
 *
 * With CONFIG_NUMA enabled, each open additionally gets a private buffer
 * of nr_node_ids unsigned ints (presumably the per-node counters used by
 * show_numa_info() -- confirm against that function).
 *
 * The result of the proc_create_*() call is not checked; this always
 * returns 0.
 */
static int __init proc_vmalloc_init(void)
{
        if (!IS_ENABLED(CONFIG_NUMA)) {
                proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
                return 0;
        }

        proc_create_seq_private("vmallocinfo", 0400, NULL, &vmalloc_op,
                                nr_node_ids * sizeof(unsigned int), NULL);
        return 0;
}
module_init(proc_vmalloc_init);

#endif




































































































































































































































































































































































































































































































































































































    1 






    1 




    1 



    1 

    1 
    1 














































































    1 





    1 





























    1 
    1 



    1 
















    1 


    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 













































    1 








    1 














































    1 







    1 





    1 
    1 





















































































































































































































































    1 







    1 








    1 

    1 



    1 






    1 








    1 






    1 



    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  libata-core.c - helper library for ATA
 *
 *  Copyright 2003-2004 Red Hat, Inc.  All rights reserved.
 *  Copyright 2003-2004 Jeff Garzik
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/driver-api/libata.rst
 *
 *  Hardware documentation available from http://www.t13.org/ and
 *  http://www.sata-io.org/
 *
 *  Standards documents from:
 *        http://www.t13.org (ATA standards, PCI DMA IDE spec)
 *        http://www.t10.org (SCSI MMC - for ATAPI MMC)
 *        http://www.sata-io.org (SATA)
 *        http://www.compactflash.org (CF)
 *        http://www.qic.org (QIC157 - Tape and DSC)
 *        http://www.ce-ata.org (CE-ATA: not supported)
 *
 * libata is essentially a library of internal helper functions for
 * low-level ATA host controller drivers.  As such, the API/ABI is
 * likely to change as new drivers are added and updated.
 * Do not depend on ABI/API stability.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/io.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/glob.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/leds.h>
#include <linux/pm_runtime.h>
#include <linux/platform_device.h>
#include <asm/setup.h>

#define CREATE_TRACE_POINTS
#include <trace/events/libata.h>

#include "libata.h"
#include "libata-transport.h"

/*
 * Baseline port operations table.  It fills in only the reset and
 * error-handling related callbacks, each with its ata_std_*()
 * implementation.  sata_port_ops below builds on it through .inherits.
 */
const struct ata_port_operations ata_base_port_ops = {
        .prereset                = ata_std_prereset,
        .postreset                = ata_std_postreset,
        .error_handler                = ata_std_error_handler,
        .sched_eh                = ata_std_sched_eh,
        .end_eh                        = ata_std_end_eh,
};

/*
 * SATA port operations table: names ata_base_port_ops as its parent via
 * .inherits and adds the standard command-deferral (qc_defer) and
 * hardreset callbacks.  Exported GPL-only.
 */
const struct ata_port_operations sata_port_ops = {
        .inherits                = &ata_base_port_ops,

        .qc_defer                = ata_std_qc_defer,
        .hardreset                = sata_std_hardreset,
};
EXPORT_SYMBOL_GPL(sata_port_ops);

/* Forward declarations of file-local (static) helpers. */
static unsigned int ata_dev_init_params(struct ata_device *dev,
                                        u16 heads, u16 sectors);
static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
static void ata_dev_xfermask(struct ata_device *dev);
static unsigned long ata_dev_blacklisted(const struct ata_device *dev);

/*
 * Global atomic counter, initialised to zero.
 * NOTE(review): presumably the source of the ap->print_id values that the
 * libata.force matching code compares against -- confirm at the allocation
 * site.
 */
atomic_t ata_print_id = ATOMIC_INIT(0);

#ifdef CONFIG_ATA_FORCE
struct ata_force_param {
        const char        *name;
        u8                cbl;
        u8                spd_limit;
        unsigned long        xfer_mask;
        unsigned int        horkage_on;
        unsigned int        horkage_off;
        u16                lflags;
};

struct ata_force_ent {
        int                        port;
        int                        device;
        struct ata_force_param        param;
};

static struct ata_force_ent *ata_force_tbl;
static int ata_force_tbl_size;

static char ata_force_param_buf[COMMAND_LINE_SIZE] __initdata;
/* param_buf is thrown away after initialization, disallow read */
module_param_string(force, ata_force_param_buf, sizeof(ata_force_param_buf), 0);
MODULE_PARM_DESC(force, "Force ATA configurations including cable type, link speed and transfer mode (see Documentation/admin-guide/kernel-parameters.rst for details)");
#endif

/*
 * Module parameters.
 *
 * Every parameter in this group is registered with mode 0444 except
 * ignore_hpa, which uses 0644.  The variables that are not declared static
 * (atapi_passthru16, libata_fua, libata_noacpi, libata_allow_tpm) have
 * external linkage.
 */
static int atapi_enabled = 1;
module_param(atapi_enabled, int, 0444);
MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on [default])");

static int atapi_dmadir = 0;
module_param(atapi_dmadir, int, 0444);
MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off [default], 1=on)");

/* Consulted by atapi_cmd_type() when classifying ATA_16/ATA_12 opcodes. */
int atapi_passthru16 = 1;
module_param(atapi_passthru16, int, 0444);
MODULE_PARM_DESC(atapi_passthru16, "Enable ATA_16 passthru for ATAPI devices (0=off, 1=on [default])");

int libata_fua = 0;
module_param_named(fua, libata_fua, int, 0444);
MODULE_PARM_DESC(fua, "FUA support (0=off [default], 1=on)");

/* The only parameter in this group registered with mode 0644. */
static int ata_ignore_hpa;
module_param_named(ignore_hpa, ata_ignore_hpa, int, 0644);
MODULE_PARM_DESC(ignore_hpa, "Ignore HPA limit (0=keep BIOS limits, 1=ignore limits, using full disk)");

/* Bitmask; by default all three of the ATA, ATAPI and CFA bits are set. */
static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CFA;
module_param_named(dma, libata_dma_mask, int, 0444);
MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)");

static int ata_probe_timeout;
module_param(ata_probe_timeout, int, 0444);
MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)");

int libata_noacpi = 0;
module_param_named(noacpi, libata_noacpi, int, 0444);
MODULE_PARM_DESC(noacpi, "Disable the use of ACPI in probe/suspend/resume (0=off [default], 1=on)");

int libata_allow_tpm = 0;
module_param_named(allow_tpm, libata_allow_tpm, int, 0444);
MODULE_PARM_DESC(allow_tpm, "Permit the use of TPM commands (0=off [default], 1=on)");

/*
 * ATAPI AN (media presence notification) switch.  Zero-initialised, so the
 * feature is off unless the parameter is given; registered with mode 0444.
 */
static int atapi_an;
module_param(atapi_an, int, 0444);
MODULE_PARM_DESC(atapi_an, "Enable ATAPI AN media presence notification (0=off [default], 1=on)");

/* Module metadata. */
MODULE_AUTHOR("Jeff Garzik");
MODULE_DESCRIPTION("Library module for ATA devices");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);


/**
 *        ata_sstatus_online - test an SStatus value for an online link
 *        @sstatus: SStatus register value to examine
 *
 *        Only the low four bits of @sstatus are looked at (the DET
 *        field in the SATA SStatus register layout).
 *
 *        RETURNS:
 *        true if those four bits equal 0x3, false otherwise.
 */
static bool ata_sstatus_online(u32 sstatus)
{
        const u32 det = sstatus & 0xf;

        return det == 0x3;
}

/**
 *        ata_link_next - link iteration helper
 *        @link: link returned by the previous call, NULL to start
 *        @ap: ATA port whose links are being iterated
 *        @mode: iteration mode, one of ATA_LITER_*
 *
 *        Visiting order per mode:
 *
 *        ATA_LITER_EDGE - the PMP links if a PMP is attached,
 *        otherwise just the host link.
 *
 *        ATA_LITER_PMP_FIRST - the PMP links (if a PMP is attached),
 *        then the host link, then the slave link if there is one.
 *
 *        ATA_LITER_HOST_FIRST - the host link, then either the PMP
 *        links (if a PMP is attached) or the slave link (if any).
 *
 *        LOCKING:
 *        Host lock or EH context.
 *
 *        RETURNS:
 *        Pointer to the next link, NULL when the iteration is over.
 */
struct ata_link *ata_link_next(struct ata_link *link, struct ata_port *ap,
                               enum ata_link_iter_mode mode)
{
        BUG_ON(mode != ATA_LITER_EDGE &&
               mode != ATA_LITER_PMP_FIRST && mode != ATA_LITER_HOST_FIRST);

        /* first call: choose where the walk begins */
        if (!link) {
                if (mode == ATA_LITER_HOST_FIRST)
                        return &ap->link;
                if (mode == ATA_LITER_EDGE || mode == ATA_LITER_PMP_FIRST)
                        return sata_pmp_attached(ap) ? ap->pmp_link : &ap->link;
        }

        /* the host link was the previous stop */
        if (link == &ap->link) {
                if (mode == ATA_LITER_HOST_FIRST && sata_pmp_attached(ap))
                        return ap->pmp_link;
                if ((mode == ATA_LITER_HOST_FIRST ||
                     mode == ATA_LITER_PMP_FIRST) && unlikely(ap->slave_link))
                        return ap->slave_link;
                if (mode == ATA_LITER_HOST_FIRST ||
                    mode == ATA_LITER_PMP_FIRST || mode == ATA_LITER_EDGE)
                        return NULL;
        }

        /* nothing follows the slave link; it never coexists with a PMP */
        if (unlikely(link == ap->slave_link))
                return NULL;

        /* the previous stop was a PMP link: advance within the PMP array */
        link++;
        if (link < ap->pmp_link + ap->nr_pmp_links)
                return link;

        /* PMP links exhausted; only PMP_FIRST still owes the host link */
        return mode == ATA_LITER_PMP_FIRST ? &ap->link : NULL;
}
EXPORT_SYMBOL_GPL(ata_link_next);

/**
 *        ata_dev_next - device iteration helper
 *        @dev: device returned by the previous call, NULL to start
 *        @link: ATA link whose devices are being iterated
 *        @mode: iteration mode, one of ATA_DITER_*
 *
 *        The *_REVERSE modes walk @link->device from its last slot
 *        down to the first; the other modes walk upwards.  The
 *        ENABLED modes skip every device for which ata_dev_enabled()
 *        is false; the ALL modes return each slot.
 *
 *        LOCKING:
 *        Host lock or EH context.
 *
 *        RETURNS:
 *        Pointer to the next device, NULL when the iteration is over.
 */
struct ata_device *ata_dev_next(struct ata_device *dev, struct ata_link *link,
                                enum ata_dev_iter_mode mode)
{
        bool backwards, skip_disabled, first_call;

        BUG_ON(mode != ATA_DITER_ENABLED && mode != ATA_DITER_ENABLED_REVERSE &&
               mode != ATA_DITER_ALL && mode != ATA_DITER_ALL_REVERSE);

        backwards = mode == ATA_DITER_ENABLED_REVERSE ||
                    mode == ATA_DITER_ALL_REVERSE;
        skip_disabled = mode == ATA_DITER_ENABLED ||
                        mode == ATA_DITER_ENABLED_REVERSE;
        first_call = !dev;

        for (;;) {
                if (first_call) {
                        /* start at whichever end the direction dictates */
                        first_call = false;
                        if (backwards)
                                dev = link->device +
                                      ata_link_max_devices(link) - 1;
                        else
                                dev = link->device;
                } else if (backwards) {
                        if (--dev < link->device)
                                return NULL;
                } else {
                        if (++dev >= link->device + ata_link_max_devices(link))
                                return NULL;
                }

                /* candidate found unless it must be skipped as disabled */
                if (!skip_disabled || ata_dev_enabled(dev))
                        return dev;
        }
}
EXPORT_SYMBOL_GPL(ata_dev_next);

/**
 *        ata_dev_phys_link - find physical link for a device
 *        @dev: ATA device to look up physical link for
 *
 *        On a port without a slave link the physical link is simply
 *        @dev->link.  On a port with a slave link, device number 0
 *        maps to the port's own link and any other device number
 *        maps to the slave link.
 *
 *        LOCKING:
 *        Don't care.
 *
 *        RETURNS:
 *        Pointer to the found physical link.
 */
struct ata_link *ata_dev_phys_link(struct ata_device *dev)
{
        struct ata_port *port = dev->link->ap;

        if (port->slave_link)
                return dev->devno ? port->slave_link : &port->link;

        return dev->link;
}

#ifdef CONFIG_ATA_FORCE
/**
 *        ata_force_cbl - force cable type according to libata.force
 *        @ap: ATA port of interest
 *
 *        Apply the cable type requested through libata.force, if any,
 *        and log a notice.  The table is scanned from the end, so the
 *        last entry whose port matches (or is a wildcard) and which
 *        carries a cable type wins.  The device part of an entry is
 *        not looked at, which is why "a:40c,1.00:udma4" and
 *        "1.00:40c,udma4" have the same effect.
 *
 *        LOCKING:
 *        EH context.
 */
void ata_force_cbl(struct ata_port *ap)
{
        int idx = ata_force_tbl_size;

        while (--idx >= 0) {
                const struct ata_force_ent *ent = ata_force_tbl + idx;

                /* skip entries aimed at another port or lacking a cable type */
                if ((ent->port != -1 && ent->port != ap->print_id) ||
                    ent->param.cbl == ATA_CBL_NONE)
                        continue;

                ap->cbl = ent->param.cbl;
                ata_port_notice(ap, "FORCE: cable set to %s\n", ent->param.name);
                return;
        }
}

/**
 *        ata_force_link_limits - force link limits according to libata.force
 *        @link: ATA link of interest
 *
 *        Force link flags and SATA spd limit according to libata.force
 *        and whine about it.  When only the port part is specified
 *        (e.g. 1:), the limit applies to all links connected to both
 *        the host link and all fan-out ports connected via PMP.  If the
 *        device part is specified as 0 (e.g. 1.00:), it specifies the
 *        first fan-out link not the host link.  Device number 15 always
 *        points to the host link whether PMP is attached or not.  If the
 *        controller has slave link, device number 16 points to it.
 *
 *        LOCKING:
 *        EH context.
 */
static void ata_force_link_limits(struct ata_link *link)
{
        bool did_spd = false;
        int linkno = link->pmp;
        int i;

        if (ata_is_host_link(link))
                linkno += 15;

        for (i = ata_force_tbl_size - 1; i >= 0; i--) {
                const struct ata_force_ent *fe = &ata_force_tbl[i];

                if (fe->port != -1 && fe->port != link->ap->print_id)
                        continue;

                if (fe->device != -1 && fe->device != linkno)
                        continue;

                /* only honor the first spd limit */
                if (!did_spd && fe->param.spd_limit) {
                        link->hw_sata_spd_limit = (1 << fe->param.spd_limit) - 1;
                        ata_link_notice(link, "FORCE: PHY spd limit set to %s\n",
                                        fe->param.name);
                        did_spd = true;
                }

                /* let lflags stack */
                if (fe->param.lflags) {
                        link->flags |= fe->param.lflags;
                        ata_link_notice(link,
                                        "FORCE: link flag 0x%x forced -> 0x%x\n",
                                        fe->param.lflags, link->flags);
                }
        }
}

/**
 *        ata_force_xfermask - force xfermask according to libata.force
 *        @dev: ATA device of interest
 *
 *        Apply the transfer mode limit requested through libata.force
 *        to @dev and log a notice.  The table is scanned from the end
 *        and the first matching entry that carries a transfer mask is
 *        used.  For consistency with link selection, device number 15
 *        selects the first device connected to the host link.
 *
 *        A forced UDMA mask replaces only the UDMA mask.  A forced
 *        MWDMA mask also clears UDMA.  A forced PIO mask clears both
 *        DMA masks.
 *
 *        LOCKING:
 *        EH context.
 */
static void ata_force_xfermask(struct ata_device *dev)
{
        const int devno = dev->link->pmp + dev->devno;
        /* devices on the host link may also be addressed as n.15/16 */
        const int alt_devno = ata_is_host_link(dev->link) ? devno + 15 : devno;
        int idx;

        for (idx = ata_force_tbl_size; idx-- > 0; ) {
                const struct ata_force_ent *ent = ata_force_tbl + idx;
                unsigned long pio, mwdma, udma;

                if (!(ent->port == -1 || ent->port == dev->link->ap->print_id))
                        continue;
                if (!(ent->device == -1 || ent->device == devno ||
                      ent->device == alt_devno))
                        continue;
                if (ent->param.xfer_mask == 0)
                        continue;

                ata_unpack_xfermask(ent->param.xfer_mask, &pio, &mwdma, &udma);

                if (udma) {
                        dev->udma_mask = udma;
                } else {
                        dev->udma_mask = 0;
                        if (mwdma) {
                                dev->mwdma_mask = mwdma;
                        } else {
                                dev->mwdma_mask = 0;
                                dev->pio_mask = pio;
                        }
                }

                ata_dev_notice(dev, "FORCE: xfer_mask set to %s\n",
                               ent->param.name);
                return;
        }
}

/**
 *        ata_force_horkage - force horkage according to libata.force
 *        @dev: ATA device of interest
 *
 *        Force horkage according to libata.force and whine about it.
 *        For consistency with link selection, device number 15 selects
 *        the first device connected to the host link.
 *
 *        LOCKING:
 *        EH context.
 */
static void ata_force_horkage(struct ata_device *dev)
{
        int devno = dev->link->pmp + dev->devno;
        int alt_devno = devno;
        int i;

        /* allow n.15/16 for devices attached to host port */
        if (ata_is_host_link(dev->link))
                alt_devno += 15;

        for (i = 0; i < ata_force_tbl_size; i++) {
                const struct ata_force_ent *fe = &ata_force_tbl[i];

                if (fe->port != -1 && fe->port != dev->link->ap->print_id)
                        continue;

                if (fe->device != -1 && fe->device != devno &&
                    fe->device != alt_devno)
                        continue;

                if (!(~dev->horkage & fe->param.horkage_on) &&
                    !(dev->horkage & fe->param.horkage_off))
                        continue;

                dev->horkage |= fe->param.horkage_on;
                dev->horkage &= ~fe->param.horkage_off;

                ata_dev_notice(dev, "FORCE: horkage modified (%s)\n",
                               fe->param.name);
        }
}
#else
/*
 * Without CONFIG_ATA_FORCE the static force helpers become empty inlines.
 * ata_force_cbl() is not static and has no stub in this file.
 * NOTE(review): presumably a header supplies its stub -- confirm.
 */
static inline void ata_force_link_limits(struct ata_link *link) { }
static inline void ata_force_xfermask(struct ata_device *dev) { }
static inline void ata_force_horkage(struct ata_device *dev) { }
#endif

/**
 *        atapi_cmd_type - Determine ATAPI command type from SCSI opcode
 *        @opcode: SCSI opcode
 *
 *        Classify @opcode.  ATA_16 and ATA_12 count as pass-through
 *        only while the atapi_passthru16 module parameter is non-zero;
 *        otherwise they are classified like any unlisted opcode.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        ATAPI_{READ|WRITE|READ_CD|PASS_THRU|MISC}
 */
int atapi_cmd_type(u8 opcode)
{
        if (opcode == GPCMD_READ_10 || opcode == GPCMD_READ_12)
                return ATAPI_READ;

        if (opcode == GPCMD_WRITE_10 || opcode == GPCMD_WRITE_12 ||
            opcode == GPCMD_WRITE_AND_VERIFY_10)
                return ATAPI_WRITE;

        if (opcode == GPCMD_READ_CD || opcode == GPCMD_READ_CD_MSF)
                return ATAPI_READ_CD;

        if ((opcode == ATA_16 || opcode == ATA_12) && atapi_passthru16)
                return ATAPI_PASS_THRU;

        return ATAPI_MISC;
}
EXPORT_SYMBOL_GPL(atapi_cmd_type);

/*
 * Read/write opcode lookup table used by ata_rwcmd_protocol().
 *
 * The index is the sum of a group base (0 = PIO multi, 8 = PIO, 16 = DMA)
 * and three flag weights: 4 for FUA, 2 for LBA48, 1 for write.  A zero
 * entry means there is no opcode for that combination, which
 * ata_rwcmd_protocol() reports as an error.
 */
static const u8 ata_rw_cmds[] = {
        /* pio multi */
        ATA_CMD_READ_MULTI,
        ATA_CMD_WRITE_MULTI,
        ATA_CMD_READ_MULTI_EXT,
        ATA_CMD_WRITE_MULTI_EXT,
        0,
        0,
        0,
        ATA_CMD_WRITE_MULTI_FUA_EXT,
        /* pio */
        ATA_CMD_PIO_READ,
        ATA_CMD_PIO_WRITE,
        ATA_CMD_PIO_READ_EXT,
        ATA_CMD_PIO_WRITE_EXT,
        0,
        0,
        0,
        0,
        /* dma */
        ATA_CMD_READ,
        ATA_CMD_WRITE,
        ATA_CMD_READ_EXT,
        ATA_CMD_WRITE_EXT,
        0,
        0,
        0,
        ATA_CMD_WRITE_FUA_EXT
};

/**
 *        ata_rwcmd_protocol - set taskfile r/w commands and protocol
 *        @tf: command to examine and configure
 *        @dev: device tf belongs to
 *
 *        Pick the protocol and the read/write opcode from the device
 *        configuration and the FUA, LBA48 and WRITE bits of tf->flags.
 *        PIO is used when the device is flagged PIO-only, or when the
 *        command is LBA48 and the port is flagged ATA_FLAG_PIO_LBA48;
 *        DMA is used otherwise.  tf->protocol is always written;
 *        tf->command is written only when an opcode exists.
 *
 *        LOCKING:
 *        caller.
 *
 *        RETURNS:
 *        0 on success, -1 if ata_rw_cmds has no opcode for the
 *        requested combination.
 */
static int ata_rwcmd_protocol(struct ata_taskfile *tf, struct ata_device *dev)
{
        const bool is_lba48 = tf->flags & ATA_TFLAG_LBA48;
        int slot = 0;
        u8 opcode;

        /* flag weights within one group of ata_rw_cmds */
        if (tf->flags & ATA_TFLAG_FUA)
                slot += 4;
        if (is_lba48)
                slot += 2;
        if (tf->flags & ATA_TFLAG_WRITE)
                slot += 1;

        if ((dev->flags & ATA_DFLAG_PIO) ||
            (is_lba48 && (dev->link->ap->flags & ATA_FLAG_PIO_LBA48))) {
                /* PIO-only device, or host cannot do DMA for LBA48 */
                tf->protocol = ATA_PROT_PIO;
                if (!dev->multi_count)
                        slot += 8;
        } else {
                tf->protocol = ATA_PROT_DMA;
                slot += 16;
        }

        opcode = ata_rw_cmds[slot];
        if (!opcode)
                return -1;

        tf->command = opcode;
        return 0;
}

/**
 *        ata_tf_read_block - Read block address from ATA taskfile
 *        @tf: ATA taskfile of interest
 *        @dev: ATA device @tf belongs to
 *
 *        LOCKING:
 *        None.
 *
 *        Decode the block address held in @tf.  All three address
 *        formats are handled: LBA48 and LBA28 when ATA_TFLAG_LBA is
 *        set in tf->flags (ATA_TFLAG_LBA48 choosing between them),
 *        CHS otherwise.  CHS decoding uses @dev->heads and
 *        @dev->sectors.
 *
 *        RETURNS:
 *        Block address read from @tf, or U64_MAX (after a warning) if
 *        a CHS taskfile carries sector number 0.
 */
u64 ata_tf_read_block(const struct ata_taskfile *tf, struct ata_device *dev)
{
        u32 cyl, head, sect;
        u64 lba;

        if (!(tf->flags & ATA_TFLAG_LBA)) {
                /* CHS: sectors are numbered from 1, so 0 is invalid */
                sect = tf->lbal;
                if (!sect) {
                        ata_dev_warn(dev,
                                     "device reported invalid CHS sector 0\n");
                        return U64_MAX;
                }

                cyl = tf->lbam | (tf->lbah << 8);
                head = tf->device & 0xf;

                return (cyl * dev->heads + head) * dev->sectors + sect - 1;
        }

        /* low 24 address bits are common to LBA28 and LBA48 */
        lba = tf->lbal;
        lba |= tf->lbam << 8;
        lba |= tf->lbah << 16;

        if (tf->flags & ATA_TFLAG_LBA48) {
                lba |= (u64)tf->hob_lbal << 24;
                lba |= (u64)tf->hob_lbam << 32;
                lba |= (u64)tf->hob_lbah << 40;
        } else {
                /* LBA28 keeps bits 27:24 in the low nibble of device */
                lba |= (tf->device & 0xf) << 24;
        }

        return lba;
}

/**
 *        ata_build_rw_tf - Build ATA taskfile for given read/write request
 *        @tf: Target ATA taskfile
 *        @dev: ATA device @tf belongs to
 *        @block: Block address
 *        @n_block: Number of blocks
 *        @tf_flags: RW/FUA etc...
 *        @tag: tag
 *        @class: IO priority class
 *
 *        LOCKING:
 *        None.
 *
 *        Build ATA taskfile @tf for read/write request described by
 *        @block, @n_block, @tf_flags and @tag on @dev.
 *
 *        One of three encodings is used: NCQ when ata_ncq_enabled()
 *        holds for @dev and @tag is not the internal tag; otherwise
 *        LBA28/LBA48 when @dev has ATA_DFLAG_LBA; otherwise CHS.  In
 *        the two non-NCQ cases the opcode and protocol come from
 *        ata_rwcmd_protocol().
 *
 *        RETURNS:
 *
 *        0 on success, -ERANGE if the request is too large for @dev,
 *        -EINVAL if the request is invalid.
 */
int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
                    u64 block, u32 n_block, unsigned int tf_flags,
                    unsigned int tag, int class)
{
        /* flags are ORed in; whatever the caller already set is kept */
        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        tf->flags |= tf_flags;

        if (ata_ncq_enabled(dev) && !ata_tag_internal(tag)) {
                /* yay, NCQ */
                if (!lba_48_ok(block, n_block))
                        return -ERANGE;

                tf->protocol = ATA_PROT_NCQ;
                tf->flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48;

                if (tf->flags & ATA_TFLAG_WRITE)
                        tf->command = ATA_CMD_FPDMA_WRITE;
                else
                        tf->command = ATA_CMD_FPDMA_READ;

                /*
                 * For NCQ the tag goes into nsect, shifted up by three
                 * bits, and the block count goes into the feature
                 * registers instead.
                 */
                tf->nsect = tag << 3;
                tf->hob_feature = (n_block >> 8) & 0xff;
                tf->feature = n_block & 0xff;

                tf->hob_lbah = (block >> 40) & 0xff;
                tf->hob_lbam = (block >> 32) & 0xff;
                tf->hob_lbal = (block >> 24) & 0xff;
                tf->lbah = (block >> 16) & 0xff;
                tf->lbam = (block >> 8) & 0xff;
                tf->lbal = block & 0xff;

                /* device is assigned here (not ORed); FUA is bit 7 */
                tf->device = ATA_LBA;
                if (tf->flags & ATA_TFLAG_FUA)
                        tf->device |= 1 << 7;

                /*
                 * High priority is requested only for the RT class and
                 * only when the device has ATA_DFLAG_NCQ_PRIO set.
                 * hob_nsect is ORed into, so this relies on the value
                 * the caller left there.
                 */
                if (dev->flags & ATA_DFLAG_NCQ_PRIO) {
                        if (class == IOPRIO_CLASS_RT)
                                tf->hob_nsect |= ATA_PRIO_HIGH <<
                                                 ATA_SHIFT_PRIO;
                }
        } else if (dev->flags & ATA_DFLAG_LBA) {
                tf->flags |= ATA_TFLAG_LBA;

                if (lba_28_ok(block, n_block)) {
                        /* use LBA28: address bits 27:24 live in device */
                        tf->device |= (block >> 24) & 0xf;
                } else if (lba_48_ok(block, n_block)) {
                        if (!(dev->flags & ATA_DFLAG_LBA48))
                                return -ERANGE;

                        /* use LBA48 */
                        tf->flags |= ATA_TFLAG_LBA48;

                        tf->hob_nsect = (n_block >> 8) & 0xff;

                        tf->hob_lbah = (block >> 40) & 0xff;
                        tf->hob_lbam = (block >> 32) & 0xff;
                        tf->hob_lbal = (block >> 24) & 0xff;
                } else
                        /* request too large even for LBA48 */
                        return -ERANGE;

                /* must follow the LBA48 decision: the opcode depends on it */
                if (unlikely(ata_rwcmd_protocol(tf, dev) < 0))
                        return -EINVAL;

                tf->nsect = n_block & 0xff;

                tf->lbah = (block >> 16) & 0xff;
                tf->lbam = (block >> 8) & 0xff;
                tf->lbal = block & 0xff;

                tf->device |= ATA_LBA;
        } else {
                /* CHS */
                u32 sect, head, cyl, track;

                /* The request -may- be too large for CHS addressing. */
                if (!lba_28_ok(block, n_block))
                        return -ERANGE;

                if (unlikely(ata_rwcmd_protocol(tf, dev) < 0))
                        return -EINVAL;

                /*
                 * Convert LBA to CHS.
                 * NOTE(review): dev->sectors and dev->heads are used as
                 * divisors without a zero check in this function --
                 * presumably validated when the device was configured;
                 * confirm.
                 */
                track = (u32)block / dev->sectors;
                cyl   = track / dev->heads;
                head  = track % dev->heads;
                sect  = (u32)block % dev->sectors + 1;

                DPRINTK("block %u track %u cyl %u head %u sect %u\n",
                        (u32)block, track, cyl, head, sect);

                /* Check whether the converted CHS can fit.
                   Cylinder: 0-65535
                   Head: 0-15
                   Sector: 1-255*/
                if ((cyl >> 16) || (head >> 4) || (sect >> 8) || (!sect))
                        return -ERANGE;

                tf->nsect = n_block & 0xff; /* Sector count 0 means 256 sectors */
                tf->lbal = sect;
                tf->lbam = cyl;
                tf->lbah = cyl >> 8;
                tf->device |= head;
        }

        return 0;
}

/**
 *        ata_pack_xfermask - Pack pio, mwdma and udma masks into xfer_mask
 *        @pio_mask: pio_mask
 *        @mwdma_mask: mwdma_mask
 *        @udma_mask: udma_mask
 *
 *        Shift each of @pio_mask, @mwdma_mask and @udma_mask to its
 *        field position, trim it to that field's mask and combine
 *        the three into a single unsigned long xfer_mask.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Packed xfer_mask.
 */
unsigned long ata_pack_xfermask(unsigned long pio_mask,
                                unsigned long mwdma_mask,
                                unsigned long udma_mask)
{
        unsigned long packed;

        packed  = (pio_mask << ATA_SHIFT_PIO) & ATA_MASK_PIO;
        packed |= (mwdma_mask << ATA_SHIFT_MWDMA) & ATA_MASK_MWDMA;
        packed |= (udma_mask << ATA_SHIFT_UDMA) & ATA_MASK_UDMA;

        return packed;
}
EXPORT_SYMBOL_GPL(ata_pack_xfermask);

/**
 *        ata_unpack_xfermask - Unpack xfer_mask into pio, mwdma and udma masks
 *        @xfer_mask: xfer_mask to unpack
 *        @pio_mask: resulting pio_mask
 *        @mwdma_mask: resulting mwdma_mask
 *        @udma_mask: resulting udma_mask
 *
 *        Unpack @xfer_mask into @pio_mask, @mwdma_mask and @udma_mask.
 *        Any NULL destination masks will be ignored.
 */
void ata_unpack_xfermask(unsigned long xfer_mask, unsigned long *pio_mask,
                         unsigned long *mwdma_mask, unsigned long *udma_mask)
{
        if (pio_mask)
                *pio_mask = (xfer_mask & ATA_MASK_PIO) >> ATA_SHIFT_PIO;
        if (mwdma_mask)
                *mwdma_mask = (xfer_mask & ATA_MASK_MWDMA) >> ATA_SHIFT_MWDMA;
        if (udma_mask)
                *udma_mask = (xfer_mask & ATA_MASK_UDMA) >> ATA_SHIFT_UDMA;
}

/*
 * One entry per transfer class: the bit position of the class inside a
 * packed xfer_mask, the number of modes in the class, and the XFER_*
 * value of its slowest mode.  The table is terminated by an entry with
 * a negative shift.
 */
static const struct ata_xfer_ent {
        int shift, bits;
        u8 base;
} ata_xfer_tbl[] = {
        {
                .shift = ATA_SHIFT_PIO,
                .bits  = ATA_NR_PIO_MODES,
                .base  = XFER_PIO_0,
        },
        {
                .shift = ATA_SHIFT_MWDMA,
                .bits  = ATA_NR_MWDMA_MODES,
                .base  = XFER_MW_DMA_0,
        },
        {
                .shift = ATA_SHIFT_UDMA,
                .bits  = ATA_NR_UDMA_MODES,
                .base  = XFER_UDMA_0,
        },
        { .shift = -1 },        /* sentinel */
};

/**
 *        ata_xfer_mask2mode - Translate an xfer_mask into its XFER_* value
 *        @xfer_mask: xfer_mask of interest
 *
 *        Look only at the most significant set bit of @xfer_mask and
 *        map it to the XFER_* value of the transfer class it falls in.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Matching XFER_* value, 0xff if no match found.
 */
u8 ata_xfer_mask2mode(unsigned long xfer_mask)
{
        const struct ata_xfer_ent *e = ata_xfer_tbl;
        int top = fls(xfer_mask) - 1;

        while (e->shift >= 0) {
                /* position of the top bit relative to this class */
                int rel = top - e->shift;

                if (rel >= 0 && rel < e->bits)
                        return e->base + rel;
                e++;
        }

        return 0xff;
}
EXPORT_SYMBOL_GPL(ata_xfer_mask2mode);

/**
 *        ata_xfer_mode2mask - Translate an XFER_* value into an xfer_mask
 *        @xfer_mode: XFER_* of interest
 *
 *        Build the mask that has the bit for @xfer_mode set together
 *        with the bits of every slower mode of the same transfer class.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Matching xfer_mask, 0 if no match found.
 */
unsigned long ata_xfer_mode2mask(u8 xfer_mode)
{
        const struct ata_xfer_ent *e;

        for (e = ata_xfer_tbl; e->shift >= 0; e++) {
                int idx = xfer_mode - e->base;
                int upto, below;

                if (idx < 0 || idx >= e->bits)
                        continue;

                /* all bits up to and including the mode's own bit ... */
                upto = (2 << (e->shift + idx)) - 1;
                /* ... minus everything below the start of this class */
                below = (1 << e->shift) - 1;

                return upto & ~below;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ata_xfer_mode2mask);

/**
 *        ata_xfer_mode2shift - Find the xfer_shift of the class of an XFER_*
 *        @xfer_mode: XFER_* of interest
 *
 *        Locate the transfer class @xfer_mode belongs to and report the
 *        bit position of that class inside a packed xfer_mask.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Matching xfer_shift, -1 if no match found.
 */
int ata_xfer_mode2shift(unsigned long xfer_mode)
{
        const struct ata_xfer_ent *e;
        int shift = -1;

        for (e = ata_xfer_tbl; e->shift >= 0; e++) {
                if (xfer_mode < e->base)
                        continue;
                if (xfer_mode >= e->base + e->bits)
                        continue;

                shift = e->shift;
                break;
        }

        return shift;
}
EXPORT_SYMBOL_GPL(ata_xfer_mode2shift);

/**
 *        ata_mode_string - convert xfer_mask to string
 *        @xfer_mask: mask of bits supported; only highest bit counts.
 *
 *        Determine the string which represents the highest speed
 *        (highest bit set in @xfer_mask).
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Constant C string representing highest speed listed in
 *        @xfer_mask, or the constant C string "<n/a>".
 */
const char *ata_mode_string(unsigned long xfer_mask)
{
        /* indexed by bit position within a packed xfer_mask */
        static const char * const names[] = {
                "PIO0",
                "PIO1",
                "PIO2",
                "PIO3",
                "PIO4",
                "PIO5",
                "PIO6",
                "MWDMA0",
                "MWDMA1",
                "MWDMA2",
                "MWDMA3",
                "MWDMA4",
                "UDMA/16",
                "UDMA/25",
                "UDMA/33",
                "UDMA/44",
                "UDMA/66",
                "UDMA/100",
                "UDMA/133",
                "UDMA7",
        };
        int top = fls(xfer_mask) - 1;

        if (top < 0 || top >= ARRAY_SIZE(names))
                return "<n/a>";

        return names[top];
}
EXPORT_SYMBOL_GPL(ata_mode_string);

/*
 * Map a 1-based SATA speed number to a printable link speed.  Zero and
 * any value beyond the end of the table yield "<unknown>".
 */
const char *sata_spd_string(unsigned int spd)
{
        static const char * const spd_str[] = {
                "1.5 Gbps",
                "3.0 Gbps",
                "6.0 Gbps",
        };
        unsigned int idx = spd - 1;

        if (spd != 0 && idx < ARRAY_SIZE(spd_str))
                return spd_str[idx];

        return "<unknown>";
}

/**
 *        ata_dev_classify - determine device type based on ATA-spec signature
 *        @tf: ATA taskfile register set for device to be identified
 *
 *        Determine from taskfile register contents whether a device is
 *        ATA or ATAPI, as per "Signature and persistence" section
 *        of ATA/PI spec (volume 1, sect 5.14).
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Device type, %ATA_DEV_ATA, %ATA_DEV_ATAPI, %ATA_DEV_PMP,
 *        %ATA_DEV_SEMB, %ATA_DEV_ZAC, or %ATA_DEV_UNKNOWN in the event
 *        of failure.
 */
unsigned int ata_dev_classify(const struct ata_taskfile *tf)
{
        unsigned int class = ATA_DEV_UNKNOWN;

        /* Apple's open source Darwin code hints that some devices only
         * put a proper signature into the LBA mid/high registers,
         * so we only check those.  It's sufficient for uniqueness.
         *
         * ATA/ATAPI-7 (d1532v1r1: Feb. 19, 2003) specified separate
         * signatures for ATA and ATAPI devices attached on SerialATA,
         * 0x3c/0xc3 and 0x69/0x96 respectively.  However, the SerialATA
         * spec has never mentioned using different signatures for
         * ATA/ATAPI devices.  Then, Serial ATA II: Port Multiplier
         * specification began to use 0x69/0x96 to identify port
         * multipliers and 0x3c/0xc3 to identify SEMB devices.
         * ATA/ATAPI-7 dropped descriptions about 0x3c/0xc3 and
         * 0x69/0x96 shortly and described them as reserved for
         * SerialATA.
         *
         * We follow the current spec and consider that 0x69/0x96
         * identifies a port multiplier and 0x3c/0xc3 a SEMB device.
         * Unfortunately, WDC WD1600JS-62MHB5 (a hard drive) reports
         * SEMB signature.  This is worked around in
         * ata_dev_read_id().
         */
        if (tf->lbam == 0 && tf->lbah == 0) {
                DPRINTK("found ATA device by sig\n");
                class = ATA_DEV_ATA;
        } else if (tf->lbam == 0x14 && tf->lbah == 0xeb) {
                DPRINTK("found ATAPI device by sig\n");
                class = ATA_DEV_ATAPI;
        } else if (tf->lbam == 0x69 && tf->lbah == 0x96) {
                DPRINTK("found PMP device by sig\n");
                class = ATA_DEV_PMP;
        } else if (tf->lbam == 0x3c && tf->lbah == 0xc3) {
                DPRINTK("found SEMB device by sig (could be ATA device)\n");
                class = ATA_DEV_SEMB;
        } else if (tf->lbam == 0xcd && tf->lbah == 0xab) {
                DPRINTK("found ZAC device by sig\n");
                class = ATA_DEV_ZAC;
        } else {
                DPRINTK("unknown device\n");
        }

        return class;
}
EXPORT_SYMBOL_GPL(ata_dev_classify);

/**
 *        ata_id_string - Convert IDENTIFY DEVICE page into string
 *        @id: IDENTIFY DEVICE results we will examine
 *        @s: string into which data is output
 *        @ofs: offset (in 16-bit words) into identify device page
 *        @len: length of string to return. must be an even number.
 *
 *        IDENTIFY DEVICE strings are stored as a sequence of 16-bit
 *        words.  Walk the words starting at @ofs and emit the high
 *        byte of each word followed by its low byte, so the output
 *        byte order does not depend on the platform.  No terminator
 *        is written.
 *
 *        LOCKING:
 *        caller.
 */

void ata_id_string(const u16 *id, unsigned char *s,
                   unsigned int ofs, unsigned int len)
{
        BUG_ON(len & 1);

        /* two output bytes per identify word */
        for (; len > 0; len -= 2, ofs++) {
                *s++ = id[ofs] >> 8;
                *s++ = id[ofs] & 0xff;
        }
}
EXPORT_SYMBOL_GPL(ata_id_string);

/**
 *        ata_id_c_string - Convert IDENTIFY DEVICE page into C string
 *        @id: IDENTIFY DEVICE results we will examine
 *        @s: string into which data is output
 *        @ofs: offset into identify device page
 *        @len: length of string to return. must be an odd number.
 *
 *        Same conversion as ata_id_string(), but the result is turned
 *        into a C string: trailing spaces are stripped and a null
 *        terminator is appended.  @len must be the actual maximum
 *        length (an even number) + 1 to leave room for the terminator.
 *
 *        LOCKING:
 *        caller.
 */
void ata_id_c_string(const u16 *id, unsigned char *s,
                     unsigned int ofs, unsigned int len)
{
        size_t end;

        ata_id_string(id, s, ofs, len - 1);

        /* back up over trailing blanks, then terminate */
        end = strnlen(s, len - 1);
        while (end > 0 && s[end - 1] == ' ')
                end--;
        s[end] = '\0';
}
EXPORT_SYMBOL_GPL(ata_id_c_string);

static u64 ata_id_n_sectors(const u16 *id)
{
        if (ata_id_has_lba(id)) {
                if (ata_id_has_lba48(id))
                        return ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
                else
                        return ata_id_u32(id, ATA_ID_LBA_CAPACITY);
        } else {
                if (ata_id_current_chs_valid(id))
                        return id[ATA_ID_CUR_CYLS] * id[ATA_ID_CUR_HEADS] *
                               id[ATA_ID_CUR_SECTORS];
                else
                        return id[ATA_ID_CYLS] * id[ATA_ID_HEADS] *
                               id[ATA_ID_SECTORS];
        }
}

/*
 * Assemble a 48-bit LBA from a taskfile: lbal/lbam/lbah supply bits
 * 0-23 and hob_lbal/hob_lbam/hob_lbah supply bits 24-47.
 */
u64 ata_tf_to_lba48(const struct ata_taskfile *tf)
{
        u64 lba;

        lba  = tf->lbal & 0xff;
        lba |= (tf->lbam & 0xff) << 8;
        lba |= (tf->lbah & 0xff) << 16;
        lba |= (u64)(tf->hob_lbal & 0xff) << 24;
        lba |= (u64)(tf->hob_lbam & 0xff) << 32;
        lba |= (u64)(tf->hob_lbah & 0xff) << 40;

        return lba;
}

/*
 * Assemble a 28-bit LBA from a taskfile: lbal/lbam/lbah supply bits
 * 0-23 and the low nibble of the device register supplies bits 24-27.
 */
u64 ata_tf_to_lba(const struct ata_taskfile *tf)
{
        u64 lba = tf->lbal & 0xff;

        lba |= (tf->lbam & 0xff) << 8;
        lba |= (tf->lbah & 0xff) << 16;
        lba |= (tf->device & 0x0f) << 24;

        return lba;
}

/**
 *        ata_read_native_max_address - Read native max address
 *        @dev: target device
 *        @max_sectors: out parameter for the resulting native sector count
 *
 *        Issue READ NATIVE MAX (EXT) to @dev, using the LBA48 variant
 *        when the identify data reports LBA48 support, and convert the
 *        returned address into a sector count (address + 1).  Devices
 *        flagged with ATA_HORKAGE_HPA_SIZE have the count reduced by
 *        one again.  @max_sectors is only written on success.
 *
 *        RETURNS:
 *        0 on success, -EACCES if command is aborted by the drive.
 *        -EIO on other errors.
 */
static int ata_read_native_max_address(struct ata_device *dev, u64 *max_sectors)
{
        const int use_ext = ata_id_has_lba48(dev->id);
        struct ata_taskfile tf;
        unsigned int err_mask;
        u64 native;

        ata_tf_init(dev, &tf);

        tf.protocol = ATA_PROT_NODATA;
        tf.device |= ATA_LBA;

        /* always clear all address registers */
        tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;

        if (use_ext) {
                tf.flags |= ATA_TFLAG_LBA48;
                tf.command = ATA_CMD_READ_NATIVE_MAX_EXT;
        } else {
                tf.command = ATA_CMD_READ_NATIVE_MAX;
        }

        err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
        if (err_mask) {
                bool aborted = err_mask == AC_ERR_DEV &&
                               (tf.feature & ATA_ABORTED);

                ata_dev_warn(dev,
                             "failed to read native max address (err_mask=0x%x)\n",
                             err_mask);
                return aborted ? -EACCES : -EIO;
        }

        /* the device reports the last address; the count is one more */
        native = use_ext ? ata_tf_to_lba48(&tf) : ata_tf_to_lba(&tf);
        native++;
        if (dev->horkage & ATA_HORKAGE_HPA_SIZE)
                native--;

        *max_sectors = native;
        return 0;
}

/**
 *        ata_set_max_sectors - Set max sectors
 *        @dev: target device
 *        @new_sectors: new max sectors value to set for the device
 *
 *        Issue SET MAX (EXT) so that @dev exposes @new_sectors sectors.
 *        The command takes the last addressable sector, so the value
 *        programmed into the taskfile is @new_sectors - 1.
 *
 *        RETURNS:
 *        0 on success, -EACCES if command is aborted or denied (due to
 *        previous non-volatile SET_MAX) by the drive.  -EIO on other
 *        errors.
 */
static int ata_set_max_sectors(struct ata_device *dev, u64 new_sectors)
{
        const int use_ext = ata_id_has_lba48(dev->id);
        const u64 max_lba = new_sectors - 1;
        struct ata_taskfile tf;
        unsigned int err_mask;

        ata_tf_init(dev, &tf);

        tf.protocol = ATA_PROT_NODATA;
        tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
        tf.device |= ATA_LBA;

        /* low 24 address bits are common to both command variants */
        tf.lbal = max_lba & 0xff;
        tf.lbam = (max_lba >> 8) & 0xff;
        tf.lbah = (max_lba >> 16) & 0xff;

        if (use_ext) {
                tf.command = ATA_CMD_SET_MAX_EXT;
                tf.flags |= ATA_TFLAG_LBA48;

                tf.hob_lbal = (max_lba >> 24) & 0xff;
                tf.hob_lbam = (max_lba >> 32) & 0xff;
                tf.hob_lbah = (max_lba >> 40) & 0xff;
        } else {
                tf.command = ATA_CMD_SET_MAX;

                /* bits 24-27 travel in the device register */
                tf.device |= (max_lba >> 24) & 0xf;
        }

        err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
        if (!err_mask)
                return 0;

        ata_dev_warn(dev,
                     "failed to set max address (err_mask=0x%x)\n",
                     err_mask);

        if (err_mask == AC_ERR_DEV &&
            (tf.feature & (ATA_ABORTED | ATA_IDNF)))
                return -EACCES;

        return -EIO;
}

/**
 *        ata_hpa_resize                -        Resize a device with an HPA set
 *        @dev: Device to resize
 *
 *        Read the native size of an LBA28 or LBA48 disk with HPA
 *        features, record it in @dev->n_native_sectors and, when
 *        unlocking is requested (ata_ignore_hpa or
 *        ATA_DFLAG_UNLOCK_HPA) and the native size is larger than the
 *        current one, resize the device to its native size and re-read
 *        the IDENTIFY data.  The caller must check the drive has the
 *        HPA feature set enabled.
 *
 *        RETURNS:
 *        0 on success, -errno on failure.
 */
static int ata_hpa_resize(struct ata_device *dev)
{
        struct ata_eh_context *ehc = &dev->link->eh_context;
        int print_info = ehc->i.flags & ATA_EHI_PRINTINFO;
        bool unlock_hpa = ata_ignore_hpa || dev->flags & ATA_DFLAG_UNLOCK_HPA;
        u64 cur_sectors = ata_id_n_sectors(dev->id);
        u64 native;
        int rc;

        /* do we need to do it? */
        if (dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC)
                return 0;
        if (!ata_id_has_lba(dev->id) || !ata_id_hpa_enabled(dev->id))
                return 0;
        if (dev->horkage & ATA_HORKAGE_BROKEN_HPA)
                return 0;

        /* read native max address */
        rc = ata_read_native_max_address(dev, &native);
        if (rc) {
                bool aborted = rc == -EACCES;

                /* If device aborted the command or HPA isn't going to
                 * be unlocked, skip HPA resizing.
                 */
                if (aborted || !unlock_hpa) {
                        ata_dev_warn(dev,
                                     "HPA support seems broken, skipping HPA handling\n");
                        dev->horkage |= ATA_HORKAGE_BROKEN_HPA;
                }

                /* we can continue if device aborted the command */
                return aborted ? 0 : rc;
        }
        dev->n_native_sectors = native;

        /* nothing to do? */
        if (native <= cur_sectors || !unlock_hpa) {
                if (print_info && native != cur_sectors) {
                        if (native > cur_sectors)
                                ata_dev_info(dev,
                                        "HPA detected: current %llu, native %llu\n",
                                        (unsigned long long)cur_sectors,
                                        (unsigned long long)native);
                        else
                                ata_dev_warn(dev,
                                        "native sectors (%llu) is smaller than sectors (%llu)\n",
                                        (unsigned long long)native,
                                        (unsigned long long)cur_sectors);
                }
                return 0;
        }

        /* let's unlock HPA */
        rc = ata_set_max_sectors(dev, native);
        if (rc == -EACCES) {
                /* if device aborted the command, skip HPA resizing */
                ata_dev_warn(dev,
                             "device aborted resize (%llu -> %llu), skipping HPA handling\n",
                             (unsigned long long)cur_sectors,
                             (unsigned long long)native);
                dev->horkage |= ATA_HORKAGE_BROKEN_HPA;
                return 0;
        }
        if (rc)
                return rc;

        /* re-read IDENTIFY data */
        rc = ata_dev_reread_id(dev, 0);
        if (rc) {
                ata_dev_err(dev,
                            "failed to re-read IDENTIFY data after HPA resizing\n");
                return rc;
        }

        if (print_info)
                ata_dev_info(dev,
                        "HPA unlocked: %llu -> %llu, native %llu\n",
                        (unsigned long long)cur_sectors,
                        (unsigned long long)ata_id_n_sectors(dev->id),
                        (unsigned long long)native);

        return 0;
}

/**
 *        ata_dump_id - IDENTIFY DEVICE info debugging output
 *        @id: IDENTIFY DEVICE page to dump
 *
 *        Emit, through DPRINTK, the values of identify words 49, 53,
 *        63, 64, 75, 80-84, 88 and 93 of the given IDENTIFY DEVICE
 *        page.
 *
 *        LOCKING:
 *        caller.
 */

static inline void ata_dump_id(const u16 *id)
{
        DPRINTK("49==0x%04x  53==0x%04x  63==0x%04x  64==0x%04x  75==0x%04x  \n",
                id[49], id[53], id[63], id[64], id[75]);
        DPRINTK("80==0x%04x  81==0x%04x  82==0x%04x  83==0x%04x  84==0x%04x  \n",
                id[80], id[81], id[82], id[83], id[84]);
        DPRINTK("88==0x%04x  93==0x%04x\n",
                id[88], id[93]);
}

/**
 *        ata_id_xfermask - Compute xfermask from the given IDENTIFY data
 *        @id: IDENTIFY data to compute xfer mask from
 *
 *        Derive the PIO, MWDMA and UDMA mode masks from @id and pack
 *        them.  This is not as trivial as it seems if we must consider
 *        early devices correctly.
 *
 *        FIXME: pre IDE drive timing (do we care ?).
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Computed xfermask
 */
unsigned long ata_id_xfermask(const u16 *id)
{
        unsigned long pio_mask, mwdma_mask, udma_mask;

        if (id[ATA_ID_FIELD_VALID] & (1 << 1)) {
                /* Usual case. Word 53 indicates word 64 is valid:
                 * its two low bits sit above the three base PIO modes,
                 * which are always set here.
                 */
                pio_mask = ((id[ATA_ID_PIO_MODES] & 0x03) << 3) | 0x7;
        } else {
                /* If word 64 isn't valid then Word 51 high byte holds
                 * the PIO timing number for the maximum. Turn it into
                 * a mask.
                 */
                u8 max_mode = (id[ATA_ID_OLD_PIO_MODES] >> 8) & 0xFF;

                /* values outside the valid PIO range fall back to PIO0 */
                pio_mask = (max_mode < 5) ? (2 << max_mode) - 1 : 1;

                /* But wait.. there's more. Design your standards by
                 * committee and you too can get a free iordy field to
                 * process. However it's the speeds not the modes that
                 * are supported... Note drivers using the timing API
                 * will get this right anyway
                 */
        }

        mwdma_mask = id[ATA_ID_MWDMA_MODES] & 0x07;

        if (ata_id_is_cfa(id)) {
                /*
                 *        Process compact flash extended modes
                 */
                int cfa_pio = (id[ATA_ID_CFA_MODES] >> 0) & 0x7;
                int cfa_dma = (id[ATA_ID_CFA_MODES] >> 3) & 0x7;

                if (cfa_pio) {
                        pio_mask |= (1 << 5);
                        if (cfa_pio > 1)
                                pio_mask |= (1 << 6);
                }
                if (cfa_dma) {
                        mwdma_mask |= (1 << 3);
                        if (cfa_dma > 1)
                                mwdma_mask |= (1 << 4);
                }
        }

        /* the UDMA word only counts when word 53 bit 2 marks it valid */
        if (id[ATA_ID_FIELD_VALID] & (1 << 2))
                udma_mask = id[ATA_ID_UDMA_MODES] & 0xff;
        else
                udma_mask = 0;

        return ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask);
}
EXPORT_SYMBOL_GPL(ata_id_xfermask);

/*
 * Completion callback for internal commands: signal the completion
 * object whose address the issuer stored in qc->private_data.
 */
static void ata_qc_complete_internal(struct ata_queued_cmd *qc)
{
        struct completion *done = qc->private_data;

        complete(done);
}

/**
 *        ata_exec_internal_sg - execute libata internal command
 *        @dev: Device to which the command is sent
 *        @tf: Taskfile registers for the command and the result
 *        @cdb: CDB for packet command
 *        @dma_dir: Data transfer direction of the command
 *        @sgl: sg list for the data buffer of the command
 *        @n_elem: Number of sg entries
 *        @timeout: Timeout in msecs (0 for default)
 *
 *        Executes libata internal command with timeout.  @tf contains
 *        command on entry and result on return.  Timeout and error
 *        conditions are reported via return value.  No recovery action
 *        is taken after a command times out.  It's caller's duty to
 *        clean up after timeout.
 *
 *        LOCKING:
 *        None.  Should be called with kernel context, might sleep.
 *
 *        RETURNS:
 *        Zero on success, AC_ERR_* mask on failure
 */
unsigned ata_exec_internal_sg(struct ata_device *dev,
                              struct ata_taskfile *tf, const u8 *cdb,
                              int dma_dir, struct scatterlist *sgl,
                              unsigned int n_elem, unsigned long timeout)
{
        struct ata_link *link = dev->link;
        struct ata_port *ap = link->ap;
        /* remember the opcode now: *tf is overwritten with the result below */
        u8 command = tf->command;
        int auto_timeout = 0;
        struct ata_queued_cmd *qc;
        unsigned int preempted_tag;
        u32 preempted_sactive;
        u64 preempted_qc_active;
        int preempted_nr_active_links;
        DECLARE_COMPLETION_ONSTACK(wait);
        unsigned long flags;
        unsigned int err_mask;
        int rc;

        spin_lock_irqsave(ap->lock, flags);

        /* no internal command while frozen */
        if (ap->pflags & ATA_PFLAG_FROZEN) {
                spin_unlock_irqrestore(ap->lock, flags);
                return AC_ERR_SYSTEM;
        }

        /* initialize internal qc */
        qc = __ata_qc_from_tag(ap, ATA_TAG_INTERNAL);

        qc->tag = ATA_TAG_INTERNAL;
        qc->hw_tag = 0;
        qc->scsicmd = NULL;
        qc->ap = ap;
        qc->dev = dev;
        ata_qc_reinit(qc);

        /* Save the link's and port's active-command bookkeeping and
         * clear it for the duration of the internal command; the saved
         * values are put back in the "finish up" section below.
         */
        preempted_tag = link->active_tag;
        preempted_sactive = link->sactive;
        preempted_qc_active = ap->qc_active;
        preempted_nr_active_links = ap->nr_active_links;
        link->active_tag = ATA_TAG_POISON;
        link->sactive = 0;
        ap->qc_active = 0;
        ap->nr_active_links = 0;

        /* prepare & issue qc */
        qc->tf = *tf;
        /* NOTE(review): always copies ATAPI_CDB_LEN bytes, so @cdb is
         * presumably expected to be at least that long - confirm with
         * callers.
         */
        if (cdb)
                memcpy(qc->cdb, cdb, ATAPI_CDB_LEN);

        /* some SATA bridges need us to indicate data xfer direction */
        if (tf->protocol == ATAPI_PROT_DMA && (dev->flags & ATA_DFLAG_DMADIR) &&
            dma_dir == DMA_FROM_DEVICE)
                qc->tf.feature |= ATAPI_DMADIR;

        /* ask for the result taskfile so it can be handed back via @tf */
        qc->flags |= ATA_QCFLAG_RESULT_TF;
        qc->dma_dir = dma_dir;
        if (dma_dir != DMA_NONE) {
                unsigned int i, buflen = 0;
                struct scatterlist *sg;

                /* total transfer size is the sum of all sg entry lengths */
                for_each_sg(sgl, sg, n_elem, i)
                        buflen += sg->length;

                ata_sg_init(qc, sgl, n_elem);
                qc->nbytes = buflen;
        }

        /* ata_qc_complete_internal() completes &wait when the qc finishes */
        qc->private_data = &wait;
        qc->complete_fn = ata_qc_complete_internal;

        ata_qc_issue(qc);

        spin_unlock_irqrestore(ap->lock, flags);

        /* Caller asked for the default timeout: a non-zero
         * ata_probe_timeout (scaled by 1000 to msecs, so presumably in
         * seconds) takes precedence, otherwise a per-command value is
         * looked up.  auto_timeout marks the latter case so that a
         * timeout can be reported to ata_internal_cmd_timed_out().
         */
        if (!timeout) {
                if (ata_probe_timeout)
                        timeout = ata_probe_timeout * 1000;
                else {
                        timeout = ata_internal_cmd_timeout(dev, command);
                        auto_timeout = 1;
                }
        }

        /* ata_eh_release()/ata_eh_acquire() bracket the sleep, but only
         * for ports that provide an ->error_handler.
         */
        if (ap->ops->error_handler)
                ata_eh_release(ap);

        rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout));

        if (ap->ops->error_handler)
                ata_eh_acquire(ap);

        ata_sff_flush_pio_task(ap);

        /* rc == 0: the wait expired without the completion being signalled */
        if (!rc) {
                spin_lock_irqsave(ap->lock, flags);

                /* We're racing with irq here.  If we lose, the
                 * following test prevents us from completing the qc
                 * twice.  If we win, the port is frozen and will be
                 * cleaned up by ->post_internal_cmd().
                 */
                if (qc->flags & ATA_QCFLAG_ACTIVE) {
                        qc->err_mask |= AC_ERR_TIMEOUT;

                        if (ap->ops->error_handler)
                                ata_port_freeze(ap);
                        else
                                ata_qc_complete(qc);

                        if (ata_msg_warn(ap))
                                ata_dev_warn(dev, "qc timeout (cmd 0x%x)\n",
                                             command);
                }

                spin_unlock_irqrestore(ap->lock, flags);
        }

        /* do post_internal_cmd */
        if (ap->ops->post_internal_cmd)
                ap->ops->post_internal_cmd(qc);

        /* perform minimal error analysis */
        if (qc->flags & ATA_QCFLAG_FAILED) {
                /* error/device-fault status bits mean the device failed it */
                if (qc->result_tf.command & (ATA_ERR | ATA_DF))
                        qc->err_mask |= AC_ERR_DEV;

                /* a failed qc must report at least one error bit */
                if (!qc->err_mask)
                        qc->err_mask |= AC_ERR_OTHER;

                /* drop the generic bit when a more specific one is set */
                if (qc->err_mask & ~AC_ERR_OTHER)
                        qc->err_mask &= ~AC_ERR_OTHER;
        } else if (qc->tf.command == ATA_CMD_REQ_SENSE_DATA) {
                qc->result_tf.command |= ATA_SENSE;
        }

        /* finish up */
        spin_lock_irqsave(ap->lock, flags);

        /* hand the result taskfile back to the caller through @tf */
        *tf = qc->result_tf;
        err_mask = qc->err_mask;

        /* free the qc and restore the bookkeeping saved before issue */
        ata_qc_free(qc);
        link->active_tag = preempted_tag;
        link->sactive = preempted_sactive;
        ap->qc_active = preempted_qc_active;
        ap->nr_active_links = preempted_nr_active_links;

        spin_unlock_irqrestore(ap->lock, flags);

        /* only timeouts of automatically chosen durations are reported */
        if ((err_mask & AC_ERR_TIMEOUT) && auto_timeout)
                ata_internal_cmd_timed_out(dev, command);

        return err_mask;
}

/**
 *        ata_exec_internal - execute libata internal command
 *        @dev: Device to which the command is sent
 *        @tf: Taskfile registers for the command and the result
 *        @cdb: CDB for packet command
 *        @dma_dir: Data transfer direction of the command
 *        @buf: Data buffer of the command
 *        @buflen: Length of data buffer
 *        @timeout: Timeout in msecs (0 for default)
 *
 *        Convenience front end for ata_exec_internal_sg() for callers
 *        that have a single linear buffer.  When @dma_dir is DMA_NONE
 *        no sg list is passed; otherwise @buf is described by a
 *        one-entry sg list.
 *
 *        LOCKING:
 *        None.  Should be called with kernel context, might sleep.
 *
 *        RETURNS:
 *        Zero on success, AC_ERR_* mask on failure
 */
unsigned ata_exec_internal(struct ata_device *dev,
                           struct ata_taskfile *tf, const u8 *cdb,
                           int dma_dir, void *buf, unsigned int buflen,
                           unsigned long timeout)
{
        struct scatterlist sg;

        /* no data phase: nothing to map */
        if (dma_dir == DMA_NONE)
                return ata_exec_internal_sg(dev, tf, cdb, dma_dir, NULL, 0,
                                            timeout);

        WARN_ON(!buf);
        sg_init_one(&sg, buf, buflen);

        return ata_exec_internal_sg(dev, tf, cdb, dma_dir, &sg, 1,
                                    timeout);
}

/**
 *        ata_pio_need_iordy        -        check if iordy needed
 *        @adev: ATA device
 *
 *        Check if the current speed of the device requires IORDY. Used
 *        by various controllers for chip configuration.
 */
unsigned int ata_pio_need_iordy(const struct ata_device *adev)
{
        /* Don't set IORDY if we're preparing for reset.  IORDY may
         * lead to controller lock up on certain controllers if the
         * port is not occupied.  See bko#11703 for details.
         */
        if (adev->link->ap->pflags & ATA_PFLAG_RESETTING)
                return 0;
        /* Controller doesn't support IORDY.  Probably a pointless
         * check as the caller should know this.
         */
        if (adev->link->ap->flags & ATA_FLAG_NO_IORDY)
                return 0;
        /* CF spec. r4.1 Table 22 says no iordy on PIO5 and PIO6.  */
        if (ata_id_is_cfa(adev->id)
            && (adev->pio_mode == XFER_PIO_5 || adev->pio_mode == XFER_PIO_6))
                return 0;
        /* PIO3 and higher it is mandatory */
        if (adev->pio_mode > XFER_PIO_2)
                return 1;
        /* We turn it on when possible */
        if (ata_id_has_iordy(adev->id))
                return 1;
        return 0;
}
EXPORT_SYMBOL_GPL(ata_pio_need_iordy);

/**
 *        ata_pio_mask_no_iordy        -        Return the non IORDY mask
 *        @adev: ATA device
 *
 *        Compute the mask of PIO modes usable without IORDY.
 *
 *        RETURNS:
 *        7 << ATA_SHIFT_PIO when the identify data is marked valid
 *        (word 53 bit 1) and reports a non-zero ATA_ID_EIDE_PIO cycle
 *        time of at most 240; 3 << ATA_SHIFT_PIO in every other case.
 */
static u32 ata_pio_mask_no_iordy(const struct ata_device *adev)
{
        u16 cycle;

        /* If we have no drive specific rule, then PIO 2 is non IORDY */
        if (!(adev->id[ATA_ID_FIELD_VALID] & 2))        /* not EIDE */
                return 3 << ATA_SHIFT_PIO;

        cycle = adev->id[ATA_ID_EIDE_PIO];

        /* This is cycle times not frequency - watch the logic!
         * PIO2 is 240nS per cycle; an unreported (zero) or slower
         * cycle time gets the same mask as the no-rule case.
         */
        if (cycle == 0 || cycle > 240)
                return 3 << ATA_SHIFT_PIO;

        return 7 << ATA_SHIFT_PIO;
}

/**
 *        ata_do_dev_read_id                -        default ID read method
 *        @dev: device
 *        @tf: proposed taskfile
 *        @id: data buffer
 *
 *        Execute the identify taskfile @tf as an internal command that
 *        transfers ATA_ID_WORDS 16-bit words from the device into @id.
 *        For some RAID controllers and for pre ATA devices this
 *        function is wrapped or replaced by the driver.
 *
 *        RETURNS:
 *        The value returned by ata_exec_internal().
 */
unsigned int ata_do_dev_read_id(struct ata_device *dev,
                                        struct ata_taskfile *tf, u16 *id)
{
        const unsigned int id_bytes = sizeof(id[0]) * ATA_ID_WORDS;

        return ata_exec_internal(dev, tf, NULL, DMA_FROM_DEVICE,
                                 id, id_bytes, 0);
}
EXPORT_SYMBOL_GPL(ata_do_dev_read_id);

/**
 *        ata_dev_read_id - Read ID data from the specified device
 *        @dev: target device
 *        @p_class: pointer to class of the target device (may be changed)
 *        @flags: ATA_READID_* flags
 *        @id: buffer to read IDENTIFY data into
 *
 *        Read ID data from the specified device.  ATA_CMD_ID_ATA is
 *        performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI
 *        devices.  This function also issues ATA_CMD_INIT_DEV_PARAMS
 *        for pre-ATA4 drives.
 *
 *        If the device aborts the first IDENTIFY, the other flavor of
 *        IDENTIFY is tried once.  If the ID data indicates that the drive
 *        needs to be spun up, SETFEATURES_SPINUP is issued once and the
 *        IDENTIFY may be repeated.
 *
 *        On success *@p_class is updated with the class that was finally
 *        used.  If IDENTIFY fails on a device which was passed in as
 *        ATA_DEV_SEMB, *@p_class is set to ATA_DEV_SEMB_UNSUP and 0 is
 *        returned.
 *
 *        FIXME: ATA_CMD_ID_ATA is optional for early drives and right
 *        now we abort if we hit that case.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, -errno otherwise: -ENODEV for an unsupported
 *        class, -ENOENT when no device is assumed to be present (or the
 *        host asks to ignore ATA devices), -EIO on command failure and
 *        -EINVAL when the ID data does not match the device class.
 */
int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
                    unsigned int flags, u16 *id)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int class = *p_class;
        struct ata_taskfile tf;
        unsigned int err_mask = 0;
        const char *reason;
        /* remember the SEMB signature; class itself is rewritten below */
        bool is_semb = class == ATA_DEV_SEMB;
        /* each of the two recovery actions is attempted at most once */
        int may_fallback = 1, tried_spinup = 0;
        int rc;

        if (ata_msg_ctl(ap))
                ata_dev_dbg(dev, "%s: ENTER\n", __func__);

        /* re-entered for class fallback, after spin-up and after INIT_DEV_PARAMS */
retry:
        ata_tf_init(dev, &tf);

        /* pick the IDENTIFY flavor matching the current class */
        switch (class) {
        case ATA_DEV_SEMB:
                class = ATA_DEV_ATA;        /* some hard drives report SEMB sig */
                fallthrough;
        case ATA_DEV_ATA:
        case ATA_DEV_ZAC:
                tf.command = ATA_CMD_ID_ATA;
                break;
        case ATA_DEV_ATAPI:
                tf.command = ATA_CMD_ID_ATAPI;
                break;
        default:
                rc = -ENODEV;
                reason = "unsupported class";
                goto err_out;
        }

        tf.protocol = ATA_PROT_PIO;

        /* Some devices choke if TF registers contain garbage.  Make
         * sure those are properly initialized.
         */
        tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;

        /* Device presence detection is unreliable on some
         * controllers.  Always poll IDENTIFY if available.
         */
        tf.flags |= ATA_TFLAG_POLLING;

        /* a driver supplied ->read_id() overrides the default method */
        if (ap->ops->read_id)
                err_mask = ap->ops->read_id(dev, &tf, id);
        else
                err_mask = ata_do_dev_read_id(dev, &tf, id);

        if (err_mask) {
                if (err_mask & AC_ERR_NODEV_HINT) {
                        ata_dev_dbg(dev, "NODEV after polling detection\n");
                        return -ENOENT;
                }

                if (is_semb) {
                        ata_dev_info(dev,
                     "IDENTIFY failed on device w/ SEMB sig, disabled\n");
                        /* SEMB is not supported yet */
                        *p_class = ATA_DEV_SEMB_UNSUP;
                        return 0;
                }

                if ((err_mask == AC_ERR_DEV) && (tf.feature & ATA_ABORTED)) {
                        /* Device or controller might have reported
                         * the wrong device class.  Give a shot at the
                         * other IDENTIFY if the current one is
                         * aborted by the device.
                         */
                        if (may_fallback) {
                                may_fallback = 0;

                                if (class == ATA_DEV_ATA)
                                        class = ATA_DEV_ATAPI;
                                else
                                        class = ATA_DEV_ATA;
                                goto retry;
                        }

                        /* Control reaches here iff the device aborted
                         * both flavors of IDENTIFYs which happens
                         * sometimes with phantom devices.
                         */
                        ata_dev_dbg(dev,
                                    "both IDENTIFYs aborted, assuming NODEV\n");
                        return -ENOENT;
                }

                rc = -EIO;
                reason = "I/O error";
                goto err_out;
        }

        /* optional raw dump of the ID buffer, before it is byte swapped below */
        if (dev->horkage & ATA_HORKAGE_DUMP_ID) {
                ata_dev_dbg(dev, "dumping IDENTIFY data, "
                            "class=%d may_fallback=%d tried_spinup=%d\n",
                            class, may_fallback, tried_spinup);
                print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET,
                               16, 2, id, ATA_ID_WORDS * sizeof(*id), true);
        }

        /* Falling back doesn't make sense if ID data was read
         * successfully at least once.
         */
        may_fallback = 0;

        /* NOTE(review): presumably converts the ID words from little endian
         * to CPU byte order - confirm against swap_buf_le16().
         */
        swap_buf_le16(id, ATA_ID_WORDS);

        /* sanity check */
        /* rc/reason are preset so the checks below can simply goto err_out */
        rc = -EINVAL;
        reason = "device reports invalid type";

        if (class == ATA_DEV_ATA || class == ATA_DEV_ZAC) {
                if (!ata_id_is_ata(id) && !ata_id_is_cfa(id))
                        goto err_out;
                if (ap->host->flags & ATA_HOST_IGNORE_ATA &&
                                                        ata_id_is_ata(id)) {
                        ata_dev_dbg(dev,
                                "host indicates ignore ATA devices, ignored\n");
                        return -ENOENT;
                }
        } else {
                /* ATAPI class but the ID data claims to be ATA */
                if (ata_id_is_ata(id))
                        goto err_out;
        }

        /*
         * id[2] values 0x37c8 and 0x738c both trigger the spin-up below.
         * As the code that follows shows, 0x37c8 additionally means the
         * IDENTIFY data must be re-read, while with 0x738c a failed
         * SPINUP is tolerated.
         */
        if (!tried_spinup && (id[2] == 0x37c8 || id[2] == 0x738c)) {
                tried_spinup = 1;
                /*
                 * Drive powered-up in standby mode, and requires a specific
                 * SET_FEATURES spin-up subcommand before it will accept
                 * anything other than the original IDENTIFY command.
                 */
                err_mask = ata_dev_set_feature(dev, SETFEATURES_SPINUP, 0);
                if (err_mask && id[2] != 0x738c) {
                        rc = -EIO;
                        reason = "SPINUP failed";
                        goto err_out;
                }
                /*
                 * If the drive initially returned incomplete IDENTIFY info,
                 * we now must reissue the IDENTIFY command.
                 */
                if (id[2] == 0x37c8)
                        goto retry;
        }

        if ((flags & ATA_READID_POSTRESET) &&
            (class == ATA_DEV_ATA || class == ATA_DEV_ZAC)) {
                /*
                 * The exact sequence expected by certain pre-ATA4 drives is:
                 * SRST RESET
                 * IDENTIFY (optional in early ATA)
                 * INITIALIZE DEVICE PARAMETERS (later IDE and ATA)
                 * anything else..
                 * Some drives were very specific about that exact sequence.
                 *
                 * Note that ATA4 says lba is mandatory so the second check
                 * should never trigger.
                 */
                if (ata_id_major_version(id) < 4 || !ata_id_has_lba(id)) {
                        err_mask = ata_dev_init_params(dev, id[3], id[6]);
                        if (err_mask) {
                                rc = -EIO;
                                reason = "INIT_DEV_PARAMS failed";
                                goto err_out;
                        }

                        /* current CHS translation info (id[53-58]) might be
                         * changed. reread the identify device info.
                         */
                        /* clearing POSTRESET keeps the retry from looping here */
                        flags &= ~ATA_READID_POSTRESET;
                        goto retry;
                }
        }

        /* report back the class that IDENTIFY actually succeeded with */
        *p_class = class;

        return 0;

        /* common failure exit: rc, reason and err_mask are set by the caller site */
 err_out:
        if (ata_msg_warn(ap))
                ata_dev_warn(dev, "failed to IDENTIFY (%s, err_mask=0x%x)\n",
                             reason, err_mask);
        return rc;
}

/**
 *        ata_read_log_page - read a specific log page
 *        @dev: target device
 *        @log: log to read
 *        @page: page to read
 *        @buf: buffer to store read page
 *        @sectors: number of sectors to read
 *
 *        Read log page using READ_LOG_EXT command, or READ_LOG_DMA_EXT
 *        when DMA is enabled, the device advertises the DMA variant and
 *        ATA_HORKAGE_NO_DMA_LOG is not set.  If the DMA variant fails,
 *        ATA_HORKAGE_NO_DMA_LOG is set on @dev and the read is repeated
 *        with PIO.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 on success, AC_ERR_* mask otherwise.
 */
unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
                               u8 page, void *buf, unsigned int sectors)
{
        unsigned long ap_flags = dev->link->ap->flags;
        struct ata_taskfile tf;
        unsigned int err_mask;
        bool dma;

        DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page);

        /*
         * Some controllers e.g. lock up on a read log page, so fail
         * right away without issuing anything to them.
         */
        if (ap_flags & ATA_FLAG_NO_LOG_PAGE)
                return AC_ERR_DEV;

        /* At most two passes: an optional DMA attempt, then PIO */
        for (;;) {
                ata_tf_init(dev, &tf);

                dma = ata_dma_enabled(dev) &&
                      ata_id_has_read_log_dma_ext(dev->id) &&
                      !(dev->horkage & ATA_HORKAGE_NO_DMA_LOG);

                if (dma) {
                        tf.command = ATA_CMD_READ_LOG_DMA_EXT;
                        tf.protocol = ATA_PROT_DMA;
                } else {
                        tf.command = ATA_CMD_READ_LOG_EXT;
                        tf.protocol = ATA_PROT_PIO;
                }

                tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_LBA48 |
                            ATA_TFLAG_DEVICE;
                tf.lbal = log;
                tf.lbam = page;
                /* sector count is split into its low and high bytes */
                tf.nsect = sectors;
                tf.hob_nsect = sectors >> 8;

                err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
                                             buf, sectors * ATA_SECT_SIZE, 0);

                /* done on success, or when PIO was already what failed */
                if (!err_mask || !dma)
                        break;

                /* the horkage bit forces the PIO path on the next pass */
                dev->horkage |= ATA_HORKAGE_NO_DMA_LOG;
                ata_dev_warn(dev, "READ LOG DMA EXT failed, trying PIO\n");
        }

        DPRINTK("EXIT, err_mask=%x\n", err_mask);
        return err_mask;
}

/*
 * Check whether @dev lists log address @log in its log directory.
 *
 * Reads one sector of ATA_LOG_DIRECTORY into the port's sector_buf
 * (clobbering it) and tests the 16-bit little endian entry at byte
 * offset @log * 2.  Returns true if that entry is non-zero, false if it
 * is zero or the directory could not be read.
 */
static bool ata_log_supported(struct ata_device *dev, u8 log)
{
        u8 *dir = dev->link->ap->sector_buf;

        if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, dir, 1) != 0)
                return false;

        return get_unaligned_le16(&dir[log * 2]) != 0;
}

/*
 * Check whether @page is listed in page 0 of the IDENTIFY DEVICE data
 * log of @dev.
 *
 * Uses the port's sector_buf as scratch space.  Returns true if @page is
 * found in the list, false if the log is unsupported, cannot be read, or
 * does not list @page.
 */
static bool ata_identify_page_supported(struct ata_device *dev, u8 page)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int err, nr_pages, i;

        if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE)) {
                ata_dev_warn(dev, "ATA Identify Device Log not supported\n");
                return false;
        }

        /* Page 0 of the IDENTIFY DEVICE data log lists the available pages */
        err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, 0,
                                ap->sector_buf, 1);
        if (err) {
                ata_dev_info(dev,
                             "failed to get Device Identify Log Emask 0x%x\n",
                             err);
                return false;
        }

        /* byte 8 is the entry count, the entries follow from byte 9 on */
        nr_pages = ap->sector_buf[8];
        for (i = 0; i < nr_pages; i++)
                if (ap->sector_buf[9 + i] == page)
                        return true;

        return false;
}

/*
 * Apply the ATA_HORKAGE_1_5_GBPS link speed restriction of @dev to its
 * physical link.
 *
 * Returns -EAGAIN if the speed limit was tightened while the link is
 * currently running faster than the target, 0 in all other cases.
 */
static int ata_do_link_spd_horkage(struct ata_device *dev)
{
        struct ata_link *plink = ata_dev_phys_link(dev);
        u32 target, target_limit;

        if (!sata_scr_valid(plink))
                return 0;

        /* the 1.5 Gbps horkage is the only one handled here */
        if (!(dev->horkage & ATA_HORKAGE_1_5_GBPS))
                return 0;

        target = 1;
        target_limit = (1 << target) - 1;

        /* nothing to do if the link is already limited at least this much */
        if (plink->sata_spd_limit <= target_limit)
                return 0;

        plink->sata_spd_limit = target_limit;

        /*
         * The limit stored above guarantees forward progress, so it is
         * safe to ask for another EH round with -EAGAIN - but only if the
         * link is actually going faster than the target speed right now.
         */
        if (plink->sata_spd <= target)
                return 0;

        ata_dev_info(dev, "applying link speed limit horkage to %s\n",
                     sata_spd_string(target));
        return -EAGAIN;
}

/*
 * Returns 1 if the port uses a SATA cable while the ID data of @dev does
 * not identify it as SATA (presumably a device behind a bridge - confirm
 * against the callers), unless the device is blacklisted as
 * ATA_HORKAGE_BRIDGE_OK.  Returns 0 otherwise.
 */
static inline u8 ata_dev_knobble(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;

        if (ata_dev_blacklisted(dev) & ATA_HORKAGE_BRIDGE_OK)
                return 0;

        if (ap->cbl != ATA_CBL_SATA)
                return 0;

        return !ata_id_is_sata(dev->id);
}

/*
 * Read the NCQ Send/Recv log of @dev and cache it in
 * dev->ncq_send_recv_cmds, setting ATA_DFLAG_NCQ_SEND_RECV on success.
 * With ATA_HORKAGE_NO_NCQ_TRIM the DSM TRIM bit is masked out of the
 * cached copy.  Uses the port's sector_buf as scratch space.
 */
static void ata_dev_config_ncq_send_recv(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        u8 *cmds = dev->ncq_send_recv_cmds;
        unsigned int err_mask;

        if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) {
                ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n");
                return;
        }

        err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV,
                                     0, ap->sector_buf, 1);
        if (err_mask) {
                ata_dev_dbg(dev,
                            "failed to get NCQ Send/Recv Log Emask 0x%x\n",
                            err_mask);
                return;
        }

        dev->flags |= ATA_DFLAG_NCQ_SEND_RECV;
        memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_SEND_RECV_SIZE);

        if (!(dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM))
                return;

        /* hide queued TRIM from users of the cached log */
        ata_dev_dbg(dev, "disabling queued TRIM support\n");
        cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] &=
                ~ATA_LOG_NCQ_SEND_RECV_DSM_TRIM;
}

/*
 * Read the NCQ Non-Data log of @dev and cache the first
 * ATA_LOG_NCQ_NON_DATA_SIZE bytes of it in dev->ncq_non_data_cmds.
 * Nothing is cached if the log is not supported or cannot be read.
 * Uses the port's sector_buf as scratch space.
 */
static void ata_dev_config_ncq_non_data(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int err_mask;

        if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) {
                /* name the log that was actually probed, not Send/Recv */
                ata_dev_warn(dev,
                             "NCQ Non-Data Log not supported\n");
                return;
        }

        err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA,
                                     0, ap->sector_buf, 1);
        if (err_mask) {
                ata_dev_dbg(dev,
                            "failed to get NCQ Non-Data Log Emask 0x%x\n",
                            err_mask);
                return;
        }

        memcpy(dev->ncq_non_data_cmds, ap->sector_buf,
               ATA_LOG_NCQ_NON_DATA_SIZE);
}

/*
 * Update ATA_DFLAG_NCQ_PRIO on @dev.
 *
 * The flag is cleared if ATA_DFLAG_NCQ_PRIO_ENABLE is not set.
 * Otherwise the SATA settings page of the IDENTIFY DEVICE data log is
 * read into the port's sector_buf and the flag follows bit 3 of the byte
 * at ATA_LOG_NCQ_PRIO_OFFSET.  If the page cannot be read the flag is
 * left untouched.
 */
static void ata_dev_config_ncq_prio(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int err_mask;

        if (!(dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE)) {
                dev->flags &= ~ATA_DFLAG_NCQ_PRIO;
                return;
        }

        err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
                                     ATA_LOG_SATA_SETTINGS,
                                     ap->sector_buf, 1);
        if (err_mask) {
                ata_dev_dbg(dev,
                            "failed to get Identify Device data, Emask 0x%x\n",
                            err_mask);
                return;
        }

        if (!(ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3))) {
                dev->flags &= ~ATA_DFLAG_NCQ_PRIO;
                ata_dev_dbg(dev, "SATA page does not support priority\n");
                return;
        }

        dev->flags |= ATA_DFLAG_NCQ_PRIO;
}

static bool ata_dev_check_adapter(struct ata_device *dev,
                                  unsigned short vendor_id)
{
        struct pci_dev *pcidev = NULL;
        struct device *parent_dev = NULL;

        for (parent_dev = dev->tdev.parent; parent_dev != NULL;
             parent_dev = parent_dev->parent) {
                if (dev_is_pci(parent_dev)) {
                        pcidev = to_pci_dev(parent_dev);
                        if (pcidev->vendor == vendor_id)
                                return true;
                        break;
                }
        }

        return false;
}

/*
 * Configure NCQ for @dev and describe the outcome in @desc.
 *
 * @dev: device to configure
 * @desc: buffer receiving a short human readable NCQ description
 * @desc_sz: size of @desc in bytes
 *
 * Sets ATA_DFLAG_NCQ if the port supports NCQ, tries to enable FPDMA
 * auto-activation where host and device both support it, and reads the
 * auxiliary NCQ logs (send/recv, non-data, priority) when the port has
 * ATA_FLAG_FPDMA_AUX.
 *
 * @desc is always left as a valid string when 0 is returned; it is empty
 * if the device has no NCQ or SATA host support is not built in.
 *
 * Returns 0 on success, -EIO if enabling auto-activation failed with
 * anything other than a device error.
 */
static int ata_dev_config_ncq(struct ata_device *dev,
                               char *desc, size_t desc_sz)
{
        struct ata_port *ap = dev->link->ap;
        int hdepth = 0, ddepth = ata_id_queue_depth(dev->id);
        unsigned int err_mask;
        char *aa_desc = "";

        /*
         * The caller prints @desc whenever we return 0, so terminate it
         * up front.  Otherwise the !CONFIG_SATA_HOST early return below
         * would hand back an uninitialized buffer.
         */
        if (desc_sz)
                desc[0] = '\0';

        if (!ata_id_has_ncq(dev->id))
                return 0;
        if (!IS_ENABLED(CONFIG_SATA_HOST))
                return 0;
        if (dev->horkage & ATA_HORKAGE_NONCQ) {
                snprintf(desc, desc_sz, "NCQ (not used)");
                return 0;
        }

        /* NCQ is blacklisted for this device only behind ATI adapters */
        if (dev->horkage & ATA_HORKAGE_NO_NCQ_ON_ATI &&
            ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) {
                snprintf(desc, desc_sz, "NCQ (not used)");
                return 0;
        }

        /* hdepth stays 0 when the port itself does not do NCQ */
        if (ap->flags & ATA_FLAG_NCQ) {
                hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE);
                dev->flags |= ATA_DFLAG_NCQ;
        }

        if (!(dev->horkage & ATA_HORKAGE_BROKEN_FPDMA_AA) &&
                (ap->flags & ATA_FLAG_FPDMA_AA) &&
                ata_id_has_fpdma_aa(dev->id)) {
                err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE,
                        SATA_FPDMA_AA);
                if (err_mask) {
                        ata_dev_err(dev,
                                    "failed to enable AA (error_mask=0x%x)\n",
                                    err_mask);
                        /*
                         * A plain device error is tolerated; anything else
                         * marks AA as broken and fails the configuration.
                         */
                        if (err_mask != AC_ERR_DEV) {
                                dev->horkage |= ATA_HORKAGE_BROKEN_FPDMA_AA;
                                return -EIO;
                        }
                } else
                        aa_desc = ", AA";
        }

        /* show both depths only when the host is the limiting side */
        if (hdepth >= ddepth)
                snprintf(desc, desc_sz, "NCQ (depth %d)%s", ddepth, aa_desc);
        else
                snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth,
                        ddepth, aa_desc);

        if ((ap->flags & ATA_FLAG_FPDMA_AUX)) {
                if (ata_id_has_ncq_send_and_recv(dev->id))
                        ata_dev_config_ncq_send_recv(dev);
                if (ata_id_has_ncq_non_data(dev->id))
                        ata_dev_config_ncq_non_data(dev);
                if (ata_id_has_ncq_prio(dev->id))
                        ata_dev_config_ncq_prio(dev);
        }

        return 0;
}

/*
 * Turn on Sense Data Reporting on @dev if the ID data says the feature
 * is supported but not yet enabled.  A failure is only logged at debug
 * level.
 */
static void ata_dev_config_sense_reporting(struct ata_device *dev)
{
        unsigned int err_mask;

        /* nothing to do if unsupported or already switched on */
        if (!ata_id_has_sense_reporting(dev->id) ||
            ata_id_sense_reporting_enabled(dev->id))
                return;

        err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1);
        if (!err_mask)
                return;

        ata_dev_dbg(dev,
                    "failed to enable Sense Data Reporting, Emask 0x%x\n",
                    err_mask);
}

static void ata_dev_config_zac(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int err_mask;
        u8 *identify_buf = ap->sector_buf;

        dev->zac_zones_optimal_open = U32_MAX;
        dev->zac_zones_optimal_nonseq = U32_MAX;
        dev->zac_zones_max_open = U32_MAX;

        /*
         * Always set the 'ZAC' flag for Host-managed devices.
         */
        if (dev->class == ATA_DEV_ZAC)
                dev->flags |= ATA_DFLAG_ZAC;
        else if (ata_id_zoned_cap(dev->id) == 0x01)
                /*
                 * Check for host-aware devices.
                 */
                dev->flags |= ATA_DFLAG_ZAC;

        if (!(dev->flags & ATA_DFLAG_ZAC))
                return;

        if (!ata_identify_page_supported(dev, ATA_LOG_ZONED_INFORMATION)) {
                ata_dev_warn(dev,
                             "ATA Zoned Information Log not supported\n");
                return;
        }

        /*
         * Read IDENTIFY DEVICE data log, page 9 (Zoned-device information)
         */
        err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
                                     ATA_LOG_ZONED_INFORMATION,
                                     identify_buf, 1);
        if (!err_mask) {
                u64 zoned_cap, opt_open, opt_nonseq, max_open;

                zoned_cap = get_unaligned_le64(&identify_buf[8]);
                if ((zoned_cap >> 63))
                        dev->zac_zoned_cap = (zoned_cap & 1);
                opt_open = get_unaligned_le64(&identify_buf[24]);
                if ((opt_open >> 63))
                        dev->zac_zones_optimal_open = (u32)opt_open;
                opt_nonseq = get_unaligned_le64(&identify_buf[32]);
                if ((opt_nonseq >> 63))
                        dev->zac_zones_optimal_nonseq = (u32)opt_nonseq;
                max_open = get_unaligned_le64(&identify_buf[40]);
                if ((max_open >> 63))
                        dev->zac_zones_max_open = (u32)max_open;
        }
}

/*
 * Set ATA_DFLAG_TRUSTED on @dev if the Security page of its IDENTIFY
 * DEVICE data log carries a valid Trusted Computing capability qword
 * (bit 63 set) with bit 0 set.  Does nothing if the ID data does not
 * report trusted support or the page is unavailable.  Uses the port's
 * sector_buf as scratch space.
 */
static void ata_dev_config_trusted(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        unsigned int err;
        u64 trusted_cap;

        if (!ata_id_has_trusted(dev->id))
                return;

        if (!ata_identify_page_supported(dev, ATA_LOG_SECURITY)) {
                ata_dev_warn(dev,
                             "Security Log not supported\n");
                return;
        }

        err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
                                ATA_LOG_SECURITY, ap->sector_buf, 1);
        if (err) {
                ata_dev_dbg(dev,
                            "failed to read Security Log, Emask 0x%x\n", err);
                return;
        }

        /* the capability qword sits at byte offset 40 of the page */
        trusted_cap = get_unaligned_le64(&ap->sector_buf[40]);
        if (!(trusted_cap >> 63)) {
                ata_dev_dbg(dev,
                            "Trusted Computing capability qword not valid!\n");
                return;
        }

        if (trusted_cap & 1)
                dev->flags |= ATA_DFLAG_TRUSTED;
}

/**
 *        ata_dev_configure - Configure the specified ATA/ATAPI device
 *        @dev: Target device to configure
 *
 *        Configure @dev according to @dev->id.  Generic and low-level
 *        driver specific fixups are also applied.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, -errno otherwise
 */
int ata_dev_configure(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_eh_context *ehc = &dev->link->eh_context;
        int print_info = ehc->i.flags & ATA_EHI_PRINTINFO;
        const u16 *id = dev->id;
        unsigned long xfer_mask;
        unsigned int err_mask;
        char revbuf[7];                /* XYZ-99\0 */
        char fwrevbuf[ATA_ID_FW_REV_LEN+1];
        char modelbuf[ATA_ID_PROD_LEN+1];
        int rc;

        if (!ata_dev_enabled(dev) && ata_msg_info(ap)) {
                ata_dev_info(dev, "%s: ENTER/EXIT -- nodev\n", __func__);
                return 0;
        }

        if (ata_msg_probe(ap))
                ata_dev_dbg(dev, "%s: ENTER\n", __func__);

        /* set horkage */
        dev->horkage |= ata_dev_blacklisted(dev);
        ata_force_horkage(dev);

        if (dev->horkage & ATA_HORKAGE_DISABLE) {
                ata_dev_info(dev, "unsupported device, disabling\n");
                ata_dev_disable(dev);
                return 0;
        }

        if ((!atapi_enabled || (ap->flags & ATA_FLAG_NO_ATAPI)) &&
            dev->class == ATA_DEV_ATAPI) {
                ata_dev_warn(dev, "WARNING: ATAPI is %s, device ignored\n",
                             atapi_enabled ? "not supported with this driver"
                             : "disabled");
                ata_dev_disable(dev);
                return 0;
        }

        rc = ata_do_link_spd_horkage(dev);
        if (rc)
                return rc;

        /* some WD SATA-1 drives have issues with LPM, turn on NOLPM for them */
        if ((dev->horkage & ATA_HORKAGE_WD_BROKEN_LPM) &&
            (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2)
                dev->horkage |= ATA_HORKAGE_NOLPM;

        if (ap->flags & ATA_FLAG_NO_LPM)
                dev->horkage |= ATA_HORKAGE_NOLPM;

        if (dev->horkage & ATA_HORKAGE_NOLPM) {
                ata_dev_warn(dev, "LPM support broken, forcing max_power\n");
                dev->link->ap->target_lpm_policy = ATA_LPM_MAX_POWER;
        }

        /* let ACPI work its magic */
        rc = ata_acpi_on_devcfg(dev);
        if (rc)
                return rc;

        /* massage HPA, do it early as it might change IDENTIFY data */
        rc = ata_hpa_resize(dev);
        if (rc)
                return rc;

        /* print device capabilities */
        if (ata_msg_probe(ap))
                ata_dev_dbg(dev,
                            "%s: cfg 49:%04x 82:%04x 83:%04x 84:%04x "
                            "85:%04x 86:%04x 87:%04x 88:%04x\n",
                            __func__,
                            id[49], id[82], id[83], id[84],
                            id[85], id[86], id[87], id[88]);

        /* initialize to-be-configured parameters */
        dev->flags &= ~ATA_DFLAG_CFG_MASK;
        dev->max_sectors = 0;
        dev->cdb_len = 0;
        dev->n_sectors = 0;
        dev->cylinders = 0;
        dev->heads = 0;
        dev->sectors = 0;
        dev->multi_count = 0;

        /*
         * common ATA, ATAPI feature tests
         */

        /* find max transfer mode; for printk only */
        xfer_mask = ata_id_xfermask(id);

        if (ata_msg_probe(ap))
                ata_dump_id(id);

        /* SCSI only uses 4-char revisions, dump full 8 chars from ATA */
        ata_id_c_string(dev->id, fwrevbuf, ATA_ID_FW_REV,
                        sizeof(fwrevbuf));

        ata_id_c_string(dev->id, modelbuf, ATA_ID_PROD,
                        sizeof(modelbuf));

        /* ATA-specific feature tests */
        if (dev->class == ATA_DEV_ATA || dev->class == ATA_DEV_ZAC) {
                if (ata_id_is_cfa(id)) {
                        /* CPRM may make this media unusable */
                        if (id[ATA_ID_CFA_KEY_MGMT] & 1)
                                ata_dev_warn(dev,
        "supports DRM functions and may not be fully accessible\n");
                        snprintf(revbuf, 7, "CFA");
                } else {
                        snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id));
                        /* Warn the user if the device has TPM extensions */
                        if (ata_id_has_tpm(id))
                                ata_dev_warn(dev,
        "supports DRM functions and may not be fully accessible\n");
                }

                dev->n_sectors = ata_id_n_sectors(id);

                /* get current R/W Multiple count setting */
                if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) {
                        unsigned int max = dev->id[47] & 0xff;
                        unsigned int cnt = dev->id[59] & 0xff;
                        /* only recognize/allow powers of two here */
                        if (is_power_of_2(max) && is_power_of_2(cnt))
                                if (cnt <= max)
                                        dev->multi_count = cnt;
                }

                if (ata_id_has_lba(id)) {
                        const char *lba_desc;
                        char ncq_desc[24];

                        lba_desc = "LBA";
                        dev->flags |= ATA_DFLAG_LBA;
                        if (ata_id_has_lba48(id)) {
                                dev->flags |= ATA_DFLAG_LBA48;
                                lba_desc = "LBA48";

                                if (dev->n_sectors >= (1UL << 28) &&
                                    ata_id_has_flush_ext(id))
                                        dev->flags |= ATA_DFLAG_FLUSH_EXT;
                        }

                        /* config NCQ */
                        rc = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
                        if (rc)
                                return rc;

                        /* print device info to dmesg */
                        if (ata_msg_drv(ap) && print_info) {
                                ata_dev_info(dev, "%s: %s, %s, max %s\n",
                                             revbuf, modelbuf, fwrevbuf,
                                             ata_mode_string(xfer_mask));
                                ata_dev_info(dev,
                                             "%llu sectors, multi %u: %s %s\n",
                                        (unsigned long long)dev->n_sectors,
                                        dev->multi_count, lba_desc, ncq_desc);
                        }
                } else {
                        /* CHS */

                        /* Default translation */
                        dev->cylinders        = id[1];
                        dev->heads        = id[3];
                        dev->sectors        = id[6];

                        if (ata_id_current_chs_valid(id)) {
                                /* Current CHS translation is valid. */
                                dev->cylinders = id[54];
                                dev->heads     = id[55];
                                dev->sectors   = id[56];
                        }

                        /* print device info to dmesg */
                        if (ata_msg_drv(ap) && print_info) {
                                ata_dev_info(dev, "%s: %s, %s, max %s\n",
                                             revbuf,        modelbuf, fwrevbuf,
                                             ata_mode_string(xfer_mask));
                                ata_dev_info(dev,
                                             "%llu sectors, multi %u, CHS %u/%u/%u\n",
                                             (unsigned long long)dev->n_sectors,
                                             dev->multi_count, dev->cylinders,
                                             dev->heads, dev->sectors);
                        }
                }

                /* Check and mark DevSlp capability. Get DevSlp timing variables
                 * from SATA Settings page of Identify Device Data Log.
                 */
                if (ata_id_has_devslp(dev->id)) {
                        u8 *sata_setting = ap->sector_buf;
                        int i, j;

                        dev->flags |= ATA_DFLAG_DEVSLP;
                        err_mask = ata_read_log_page(dev,
                                                     ATA_LOG_IDENTIFY_DEVICE,
                                                     ATA_LOG_SATA_SETTINGS,
                                                     sata_setting,
                                                     1);
                        if (err_mask)
                                ata_dev_dbg(dev,
                                            "failed to get Identify Device Data, Emask 0x%x\n",
                                            err_mask);
                        else
                                for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) {
                                        j = ATA_LOG_DEVSLP_OFFSET + i;
                                        dev->devslp_timing[i] = sata_setting[j];
                                }
                }
                ata_dev_config_sense_reporting(dev);
                ata_dev_config_zac(dev);
                ata_dev_config_trusted(dev);
                dev->cdb_len = 32;
        }

        /* ATAPI-specific feature tests */
        else if (dev->class == ATA_DEV_ATAPI) {
                const char *cdb_intr_string = "";
                const char *atapi_an_string = "";
                const char *dma_dir_string = "";
                u32 sntf;

                rc = atapi_cdb_len(id);
                if ((rc < 12) || (rc > ATAPI_CDB_LEN)) {
                        if (ata_msg_warn(ap))
                                ata_dev_warn(dev, "unsupported CDB len\n");
                        rc = -EINVAL;
                        goto err_out_nosup;
                }
                dev->cdb_len = (unsigned int) rc;

                /* Enable ATAPI AN if both the host and device have
                 * the support.  If PMP is attached, SNTF is required
                 * to enable ATAPI AN to discern between PHY status
                 * changed notifications and ATAPI ANs.
                 */
                if (atapi_an &&
                    (ap->flags & ATA_FLAG_AN) && ata_id_has_atapi_AN(id) &&
                    (!sata_pmp_attached(ap) ||
                     sata_scr_read(&ap->link, SCR_NOTIFICATION, &sntf) == 0)) {
                        /* issue SET feature command to turn this on */
                        err_mask = ata_dev_set_feature(dev,
                                        SETFEATURES_SATA_ENABLE, SATA_AN);
                        if (err_mask)
                                ata_dev_err(dev,
                                            "failed to enable ATAPI AN (err_mask=0x%x)\n",
                                            err_mask);
                        else {
                                dev->flags |= ATA_DFLAG_AN;
                                atapi_an_string = ", ATAPI AN";
                        }
                }

                if (ata_id_cdb_intr(dev->id)) {
                        dev->flags |= ATA_DFLAG_CDB_INTR;
                        cdb_intr_string = ", CDB intr";
                }

                if (atapi_dmadir || (dev->horkage & ATA_HORKAGE_ATAPI_DMADIR) || atapi_id_dmadir(dev->id)) {
                        dev->flags |= ATA_DFLAG_DMADIR;
                        dma_dir_string = ", DMADIR";
                }

                if (ata_id_has_da(dev->id)) {
                        dev->flags |= ATA_DFLAG_DA;
                        zpodd_init(dev);
                }

                /* print device info to dmesg */
                if (ata_msg_drv(ap) && print_info)
                        ata_dev_info(dev,
                                     "ATAPI: %s, %s, max %s%s%s%s\n",
                                     modelbuf, fwrevbuf,
                                     ata_mode_string(xfer_mask),
                                     cdb_intr_string, atapi_an_string,
                                     dma_dir_string);
        }

        /* determine max_sectors */
        dev->max_sectors = ATA_MAX_SECTORS;
        if (dev->flags & ATA_DFLAG_LBA48)
                dev->max_sectors = ATA_MAX_SECTORS_LBA48;

        /* Limit PATA drive on SATA cable bridge transfers to udma5,
           200 sectors */
        if (ata_dev_knobble(dev)) {
                if (ata_msg_drv(ap) && print_info)
                        ata_dev_info(dev, "applying bridge limits\n");
                dev->udma_mask &= ATA_UDMA5;
                dev->max_sectors = ATA_MAX_SECTORS;
        }

        if ((dev->class == ATA_DEV_ATAPI) &&
            (atapi_command_packet_set(id) == TYPE_TAPE)) {
                dev->max_sectors = ATA_MAX_SECTORS_TAPE;
                dev->horkage |= ATA_HORKAGE_STUCK_ERR;
        }

        if (dev->horkage & ATA_HORKAGE_MAX_SEC_128)
                dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128,
                                         dev->max_sectors);

        if (dev->horkage & ATA_HORKAGE_MAX_SEC_1024)
                dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024,
                                         dev->max_sectors);

        if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48)
                dev->max_sectors = ATA_MAX_SECTORS_LBA48;

        if (ap->ops->dev_config)
                ap->ops->dev_config(dev);

        if (dev->horkage & ATA_HORKAGE_DIAGNOSTIC) {
                /* Let the user know. We don't want to disallow opens for
                   rescue purposes, or in case the vendor is just a blithering
                   idiot. Do this after the dev_config call as some controllers
                   with buggy firmware may want to avoid reporting false device
                   bugs */

                if (print_info) {
                        ata_dev_warn(dev,
"Drive reports diagnostics failure. This may indicate a drive\n");
                        ata_dev_warn(dev,
"fault or invalid emulation. Contact drive vendor for information.\n");
                }
        }

        if ((dev->horkage & ATA_HORKAGE_FIRMWARE_WARN) && print_info) {
                ata_dev_warn(dev, "WARNING: device requires firmware update to be fully functional\n");
                ata_dev_warn(dev, "         contact the vendor or visit http://ata.wiki.kernel.org\n");
        }

        return 0;

err_out_nosup:
        if (ata_msg_probe(ap))
                ata_dev_dbg(dev, "%s: EXIT, err\n", __func__);
        return rc;
}

/**
 *        ata_cable_40wire        -        return 40 wire cable type
 *        @ap: port (not examined by this helper)
 *
 *        Helper method for drivers which want to hardwire 40 wire cable
 *        detection.  The result does not depend on @ap.
 *
 *        RETURNS:
 *        Always ATA_CBL_PATA40.
 */

int ata_cable_40wire(struct ata_port *ap)
{
        return ATA_CBL_PATA40;
}
EXPORT_SYMBOL_GPL(ata_cable_40wire);

/**
 *        ata_cable_80wire        -        return 80 wire cable type
 *        @ap: port (not examined by this helper)
 *
 *        Helper method for drivers which want to hardwire 80 wire cable
 *        detection.  The result does not depend on @ap.
 *
 *        RETURNS:
 *        Always ATA_CBL_PATA80.
 */

int ata_cable_80wire(struct ata_port *ap)
{
        return ATA_CBL_PATA80;
}
EXPORT_SYMBOL_GPL(ata_cable_80wire);

/**
 *        ata_cable_unknown        -        return unknown PATA cable.
 *        @ap: port (not examined by this helper)
 *
 *        Helper method for drivers which have no PATA cable detection.
 *
 *        RETURNS:
 *        Always ATA_CBL_PATA_UNK.
 */

int ata_cable_unknown(struct ata_port *ap)
{
        return ATA_CBL_PATA_UNK;
}
EXPORT_SYMBOL_GPL(ata_cable_unknown);

/**
 *        ata_cable_ignore        -        return ignored PATA cable.
 *        @ap: port (not examined by this helper)
 *
 *        Helper method for drivers which don't use cable type to limit
 *        transfer mode.
 *
 *        RETURNS:
 *        Always ATA_CBL_PATA_IGN.
 */
int ata_cable_ignore(struct ata_port *ap)
{
        return ATA_CBL_PATA_IGN;
}
EXPORT_SYMBOL_GPL(ata_cable_ignore);

/**
 *        ata_cable_sata        -        return SATA cable type
 *        @ap: port (not examined by this helper)
 *
 *        Helper method for drivers which have SATA cables
 *
 *        RETURNS:
 *        Always ATA_CBL_SATA.
 */

int ata_cable_sata(struct ata_port *ap)
{
        return ATA_CBL_SATA;
}
EXPORT_SYMBOL_GPL(ata_cable_sata);

/**
 *        ata_bus_probe - Reset and probe ATA bus
 *        @ap: Bus to probe
 *
 *        Master ATA bus probing function.  Initiates a hardware-dependent
 *        bus reset, then attempts to identify any devices found on
 *        the bus.  When identification, configuration or mode setting
 *        fails, the failing device's retry budget is reduced (the
 *        device is disabled once the budget reaches zero) and the
 *        whole sequence is restarted from the reset.
 *
 *        LOCKING:
 *        PCI/etc. bus probe sem.
 *
 *        RETURNS:
 *        Zero if at least one device is enabled after probing,
 *        -ENODEV otherwise.
 */

int ata_bus_probe(struct ata_port *ap)
{
        unsigned int classes[ATA_MAX_DEVICES];
        int tries[ATA_MAX_DEVICES];
        int rc;
        struct ata_device *dev;

        /* every device slot starts with the full retry budget */
        ata_for_each_dev(dev, &ap->link, ALL)
                tries[dev->devno] = ATA_PROBE_MAX_TRIES;

 retry:
        ata_for_each_dev(dev, &ap->link, ALL) {
                /* If we issue an SRST then an ATA drive (not ATAPI)
                 * may change configuration and be in PIO0 timing. If
                 * we do a hard reset (or are coming from power on)
                 * this is true for ATA or ATAPI. Until we've set a
                 * suitable controller mode we should not touch the
                 * bus as we may be talking too fast.
                 */
                dev->pio_mode = XFER_PIO_0;
                dev->dma_mode = 0xff;

                /* If the controller has a pio mode setup function
                 * then use it to set the chipset to rights. Don't
                 * touch the DMA setup as that will be dealt with when
                 * configuring devices.
                 */
                if (ap->ops->set_piomode)
                        ap->ops->set_piomode(ap, dev);
        }

        /* reset and determine device classes */
        ap->ops->phy_reset(ap);

        /* Save the class each device has after the reset (unknown is
         * recorded as ATA_DEV_NONE) and clear dev->class; it is only
         * restored below for devices that still have tries left.
         */
        ata_for_each_dev(dev, &ap->link, ALL) {
                if (dev->class != ATA_DEV_UNKNOWN)
                        classes[dev->devno] = dev->class;
                else
                        classes[dev->devno] = ATA_DEV_NONE;

                dev->class = ATA_DEV_UNKNOWN;
        }

        /* read IDENTIFY page and configure devices. We have to do the identify
           specific sequence bass-ackwards so that PDIAG- is released by
           the slave device */

        ata_for_each_dev(dev, &ap->link, ALL_REVERSE) {
                /* devices that ran out of tries keep ATA_DEV_UNKNOWN */
                if (tries[dev->devno])
                        dev->class = classes[dev->devno];

                if (!ata_dev_enabled(dev))
                        continue;

                rc = ata_dev_read_id(dev, &dev->class, ATA_READID_POSTRESET,
                                     dev->id);
                if (rc)
                        goto fail;
        }

        /* Now ask for the cable type as PDIAG- should have been released */
        if (ap->ops->cable_detect)
                ap->cbl = ap->ops->cable_detect(ap);

        /* We may have SATA bridge glue hiding here irrespective of
         * the reported cable types and sensed types.  When SATA
         * drives indicate we have a bridge, we don't know which end
         * of the link the bridge is which is a problem.
         */
        ata_for_each_dev(dev, &ap->link, ENABLED)
                if (ata_id_is_sata(dev->id))
                        ap->cbl = ATA_CBL_SATA;

        /* After the identify sequence we can now set up the devices. We do
           this in the normal order so that the user doesn't get confused */

        ata_for_each_dev(dev, &ap->link, ENABLED) {
                /* ATA_EHI_PRINTINFO is set only for the duration of the call */
                ap->link.eh_context.i.flags |= ATA_EHI_PRINTINFO;
                rc = ata_dev_configure(dev);
                ap->link.eh_context.i.flags &= ~ATA_EHI_PRINTINFO;
                if (rc)
                        goto fail;
        }

        /* configure transfer mode */
        rc = ata_set_mode(&ap->link, &dev);
        if (rc)
                goto fail;

        /* succeed as soon as one enabled device is found */
        ata_for_each_dev(dev, &ap->link, ENABLED)
                return 0;

        return -ENODEV;

 fail:
        /* dev is the device being processed when the failure occurred:
         * the loop cursor of the loop that jumped here, or whatever
         * ata_set_mode() stored through &dev (NOTE(review): presumably
         * the failing device, as ata_do_set_mode() does - confirm).
         */
        tries[dev->devno]--;

        /* error codes not listed below only cost the one try taken above */
        switch (rc) {
        case -EINVAL:
                /* eeek, something went very wrong, give up */
                tries[dev->devno] = 0;
                break;

        case -ENODEV:
                /* give it just one more chance */
                tries[dev->devno] = min(tries[dev->devno], 1);
                fallthrough;
        case -EIO:
                if (tries[dev->devno] == 1) {
                        /* This is the last chance, better to slow
                         * down than lose it.
                         */
                        sata_down_spd_limit(&ap->link, 0);
                        ata_down_xfermask_limit(dev, ATA_DNXFER_PIO);
                }
        }

        if (!tries[dev->devno])
                ata_dev_disable(dev);

        goto retry;
}

/**
 *        sata_print_link_status - Print SATA link status
 *        @link: SATA link to printk link status about
 *
 *        This function prints link speed and status of a SATA link.
 *        Nothing is printed if either SStatus or SControl cannot be
 *        read.
 *
 *        LOCKING:
 *        None.
 */
static void sata_print_link_status(struct ata_link *link)
{
        u32 sstatus, scontrol, tmp;

        if (sata_scr_read(link, SCR_STATUS, &sstatus))
                return;
        /* scontrol is printed below; never print it if the read failed */
        if (sata_scr_read(link, SCR_CONTROL, &scontrol))
                return;

        if (ata_phys_link_online(link)) {
                /* bits 7:4 of SStatus are used as the speed index */
                tmp = (sstatus >> 4) & 0xf;
                ata_link_info(link, "SATA link up %s (SStatus %X SControl %X)\n",
                              sata_spd_string(tmp), sstatus, scontrol);
        } else {
                ata_link_info(link, "SATA link down (SStatus %X SControl %X)\n",
                              sstatus, scontrol);
        }
}

/**
 *        ata_dev_pair                -        return other device on cable
 *        @adev: device
 *
 *        Look up the device that shares the link with @adev.
 *
 *        RETURNS:
 *        The other device if it is enabled, NULL otherwise.
 */

struct ata_device *ata_dev_pair(struct ata_device *adev)
{
        /* assumes devno is 0 or 1, so 1 - devno picks the sibling slot */
        struct ata_device *other = adev->link->device + (1 - adev->devno);

        return ata_dev_enabled(other) ? other : NULL;
}
EXPORT_SYMBOL_GPL(ata_dev_pair);

/**
 *        sata_down_spd_limit - adjust SATA spd limit downward
 *        @link: Link to adjust SATA spd limit for
 *        @spd_limit: Additional limit
 *
 *        Adjust SATA spd limit of @link downward.  Note that this
 *        function only adjusts the limit.  The change must be applied
 *        using sata_set_spd().
 *
 *        If @spd_limit is non-zero, the speed is limited to equal to or
 *        lower than @spd_limit if such speed is supported.  If
 *        @spd_limit is slower than any supported speed, only the lowest
 *        supported speed is allowed.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        0 on success, negative errno on failure: -EOPNOTSUPP if
 *        sata_scr_valid() rejects @link, -EINVAL if the limit cannot
 *        be lowered any further.
 */
int sata_down_spd_limit(struct ata_link *link, u32 spd_limit)
{
        u32 sstatus, spd, mask;
        int rc, bit;

        if (!sata_scr_valid(link))
                return -EOPNOTSUPP;

        /* If SCR can be read, use it to determine the current SPD.
         * If not, use cached value in link->sata_spd.
         */
        rc = sata_scr_read(link, SCR_STATUS, &sstatus);
        if (rc == 0 && ata_sstatus_online(sstatus))
                spd = (sstatus >> 4) & 0xf;
        else
                spd = link->sata_spd;

        /* mask is a bitmap of allowed speeds; 0 or only bit 0 set
         * leaves nothing to remove
         */
        mask = link->sata_spd_limit;
        if (mask <= 1)
                return -EINVAL;

        /* unconditionally mask off the highest bit */
        bit = fls(mask) - 1;
        mask &= ~(1 << bit);

        /*
         * Mask off all speeds higher than or equal to the current one.  At
         * this point, if current SPD is not available and we previously
         * recorded the link speed from SStatus, the driver has already
         * masked off the highest bit so mask should already be 1 or 0.
         * Otherwise, we should not force 1.5Gbps on a link where we have
         * not previously recorded speed from SStatus.  Just return in this
         * case.
         */
        if (spd > 1)
                mask &= (1 << (spd - 1)) - 1;
        else if (link->sata_spd)
                return -EINVAL;

        /* were we already at the bottom? */
        if (!mask)
                return -EINVAL;

        if (spd_limit) {
                /* keep only speeds at or below spd_limit when any
                 * remain; otherwise keep just the lowest speed left
                 */
                if (mask & ((1 << spd_limit) - 1))
                        mask &= (1 << spd_limit) - 1;
                else {
                        bit = ffs(mask) - 1;
                        mask = 1 << bit;
                }
        }

        link->sata_spd_limit = mask;

        /* fls(mask) is the highest speed still allowed */
        ata_link_warn(link, "limiting SATA link speed to %s\n",
                      sata_spd_string(fls(mask)));

        return 0;
}

#ifdef CONFIG_ATA_ACPI
/**
 *        ata_timing_cycle2mode - find xfer mode for the specified cycle duration
 *        @xfer_shift: ATA_SHIFT_* value for transfer type to examine.
 *        @cycle: cycle duration in ns
 *
 *        Walk the timing entries of the transfer type selected by
 *        @xfer_shift and pick the last one whose cycle time is not
 *        shorter than @cycle.  If @cycle is too slow for @xfer_shift,
 *        0xff is returned.  If @cycle is faster than the fastest known
 *        mode, the fastest mode is returned.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Matching xfer_mode, 0xff if no match found.
 */
u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle)
{
        const struct ata_xfer_ent *entry = ata_xfer_tbl;
        const struct ata_timing *timing;
        u8 first_mode = 0xff;
        u8 best_mode = 0xff;

        /* the last table entry matching @xfer_shift supplies the base mode */
        while (entry->shift >= 0) {
                if (entry->shift == xfer_shift)
                        first_mode = entry->base;
                entry++;
        }

        /* scan timing entries for as long as they belong to @xfer_shift */
        timing = ata_timing_find_mode(first_mode);
        while (timing && ata_xfer_mode2shift(timing->mode) == xfer_shift) {
                unsigned short mode_cycle;

                if (xfer_shift == ATA_SHIFT_PIO ||
                    xfer_shift == ATA_SHIFT_MWDMA)
                        mode_cycle = timing->cycle;
                else if (xfer_shift == ATA_SHIFT_UDMA)
                        mode_cycle = timing->udma;
                else
                        return 0xff;

                /* this entry is faster than requested, stop here */
                if (cycle > mode_cycle)
                        break;

                best_mode = timing->mode;
                timing++;
        }

        return best_mode;
}
#endif

/**
 *        ata_down_xfermask_limit - adjust dev xfer masks downward
 *        @dev: Device to adjust xfer masks
 *        @sel: ATA_DNXFER_* selector, optionally ORed with ATA_DNXFER_QUIET
 *
 *        Adjust xfer masks of @dev downward.  Note that this function
 *        does not apply the change.  Invoking ata_set_mode() afterwards
 *        will apply the limit.
 *
 *        An unknown selector triggers BUG().
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        0 on success, -ENOENT if the masks cannot be lowered (no PIO
 *        mode would remain, the only remaining UDMA or MWDMA mode
 *        would be removed by ATA_DNXFER_DMA, or nothing would change).
 */
int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel)
{
        char buf[32];
        unsigned long orig_mask, xfer_mask;
        unsigned long pio_mask, mwdma_mask, udma_mask;
        int quiet, highbit;

        /* split the QUIET modifier from the actual selector */
        quiet = !!(sel & ATA_DNXFER_QUIET);
        sel &= ~ATA_DNXFER_QUIET;

        xfer_mask = orig_mask = ata_pack_xfermask(dev->pio_mask,
                                                  dev->mwdma_mask,
                                                  dev->udma_mask);
        ata_unpack_xfermask(xfer_mask, &pio_mask, &mwdma_mask, &udma_mask);

        switch (sel) {
        case ATA_DNXFER_PIO:
                /*
                 * fls() returns 0 for an empty mask, which would make
                 * the shift count negative (undefined behavior).  With
                 * no PIO bit set, the PIO check after the switch
                 * returns -ENOENT.
                 */
                if (pio_mask) {
                        highbit = fls(pio_mask) - 1;
                        pio_mask &= ~(1 << highbit);
                }
                break;

        case ATA_DNXFER_DMA:
                /* drop the fastest UDMA mode, or the fastest MWDMA
                 * mode when no UDMA mode is set; refuse to remove the
                 * only mode of that class
                 */
                if (udma_mask) {
                        highbit = fls(udma_mask) - 1;
                        udma_mask &= ~(1 << highbit);
                        if (!udma_mask)
                                return -ENOENT;
                } else if (mwdma_mask) {
                        highbit = fls(mwdma_mask) - 1;
                        mwdma_mask &= ~(1 << highbit);
                        if (!mwdma_mask)
                                return -ENOENT;
                }
                break;

        case ATA_DNXFER_40C:
                udma_mask &= ATA_UDMA_MASK_40C;
                break;

        case ATA_DNXFER_FORCE_PIO0:
                /* keep only the lowest PIO bit, then drop all DMA below */
                pio_mask &= 1;
                fallthrough;
        case ATA_DNXFER_FORCE_PIO:
                mwdma_mask = 0;
                udma_mask = 0;
                break;

        default:
                BUG();
        }

        xfer_mask &= ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask);

        /* fail if no PIO mode is left or the limit changed nothing */
        if (!(xfer_mask & ATA_MASK_PIO) || xfer_mask == orig_mask)
                return -ENOENT;

        if (!quiet) {
                /* report "<DMA mode>:<PIO mode>" when DMA remains, else PIO only */
                if (xfer_mask & (ATA_MASK_MWDMA | ATA_MASK_UDMA))
                        snprintf(buf, sizeof(buf), "%s:%s",
                                 ata_mode_string(xfer_mask),
                                 ata_mode_string(xfer_mask & ATA_MASK_PIO));
                else
                        snprintf(buf, sizeof(buf), "%s",
                                 ata_mode_string(xfer_mask));

                ata_dev_warn(dev, "limiting speed to %s\n", buf);
        }

        /* store the reduced masks back into the device */
        ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask,
                            &dev->udma_mask);

        return 0;
}

/*
 * ata_dev_set_mode - set the transfer mode on @dev and revalidate it
 *
 * Calls ata_dev_set_xfermode() (unless the NOSETXFER horkage applies to
 * a SATA device on a SATA port), revalidates the device and decides
 * whether a device error (AC_ERR_DEV) from the mode setting may be
 * ignored.
 *
 * Returns 0 on success, the error from ata_dev_revalidate() if
 * revalidation fails, or -EIO if setting the transfer mode failed with
 * an error that is not ignored.
 */
static int ata_dev_set_mode(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_eh_context *ehc = &dev->link->eh_context;
        const bool nosetxfer = dev->horkage & ATA_HORKAGE_NOSETXFER;
        const char *dev_err_whine = "";
        int ign_dev_err = 0;
        unsigned int err_mask = 0;
        int rc;

        /* ATA_DFLAG_PIO mirrors whether the selected mode is a PIO mode */
        dev->flags &= ~ATA_DFLAG_PIO;
        if (dev->xfer_shift == ATA_SHIFT_PIO)
                dev->flags |= ATA_DFLAG_PIO;

        if (nosetxfer && ap->flags & ATA_FLAG_SATA && ata_id_is_sata(dev->id))
                dev_err_whine = " (SET_XFERMODE skipped)";
        else {
                if (nosetxfer)
                        ata_dev_warn(dev,
                                     "NOSETXFER but PATA detected - can't "
                                     "skip SETXFER, might malfunction\n");
                err_mask = ata_dev_set_xfermode(dev);
        }

        /* any error other than a device error is fatal right away */
        if (err_mask & ~AC_ERR_DEV)
                goto fail;

        /* revalidate */
        ehc->i.flags |= ATA_EHI_POST_SETMODE;
        rc = ata_dev_revalidate(dev, ATA_DEV_UNKNOWN, 0);
        ehc->i.flags &= ~ATA_EHI_POST_SETMODE;
        if (rc)
                return rc;

        if (dev->xfer_shift == ATA_SHIFT_PIO) {
                /* Old CFA may refuse this command, which is just fine */
                if (ata_id_is_cfa(dev->id))
                        ign_dev_err = 1;
                /* Catch several broken garbage emulations plus some pre
                   ATA devices */
                if (ata_id_major_version(dev->id) == 0 &&
                                        dev->pio_mode <= XFER_PIO_2)
                        ign_dev_err = 1;
                /* Some very old devices and some bad newer ones fail
                   any kind of SET_XFERMODE request but support PIO0-2
                   timings and no IORDY */
                if (!ata_id_has_iordy(dev->id) && dev->pio_mode <= XFER_PIO_2)
                        ign_dev_err = 1;
        }
        /* Early MWDMA devices do DMA but don't allow DMA mode setting.
           Don't fail an MWDMA0 set IFF the device indicates it is in MWDMA0 */
        if (dev->xfer_shift == ATA_SHIFT_MWDMA &&
            dev->dma_mode == XFER_MW_DMA_0 &&
            (dev->id[63] >> 8) & 1)
                ign_dev_err = 1;

        /* if the device is actually configured correctly, ignore dev err */
        if (dev->xfer_mode == ata_xfer_mask2mode(ata_id_xfermask(dev->id)))
                ign_dev_err = 1;

        /* a device error only fails the call when none of the
           exemptions above applied */
        if (err_mask & AC_ERR_DEV) {
                if (!ign_dev_err)
                        goto fail;
                else
                        dev_err_whine = " (device error ignored)";
        }

        DPRINTK("xfer_shift=%u, xfer_mode=0x%x\n",
                dev->xfer_shift, (int)dev->xfer_mode);

        /* stay silent only when EH asked for quiet and no hardreset
           was done */
        if (!(ehc->i.flags & ATA_EHI_QUIET) ||
            ehc->i.flags & ATA_EHI_DID_HARDRESET)
                ata_dev_info(dev, "configured for %s%s\n",
                             ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)),
                             dev_err_whine);

        return 0;

 fail:
        ata_dev_err(dev, "failed to set xfermode (err_mask=0x%x)\n", err_mask);
        return -EIO;
}

/**
 *        ata_do_set_mode - Program timings and issue SET FEATURES - XFER
 *        @link: link on which timings will be programmed
 *        @r_failed_dev: out parameter for failed device
 *
 *        Standard implementation of the function used to tune and set
 *        ATA device disk transfer mode (PIO3, UDMA6, etc.).  If
 *        ata_dev_set_mode() fails, pointer to the failing device is
 *        returned in @r_failed_dev.  @r_failed_dev is also written when
 *        a device reports no PIO support; it is left untouched on
 *        success.
 *
 *        LOCKING:
 *        PCI/etc. bus probe sem.
 *
 *        RETURNS:
 *        0 on success (including when no device is enabled on @link),
 *        negative errno otherwise
 */

int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
{
        struct ata_port *ap = link->ap;
        struct ata_device *dev;
        int rc = 0, used_dma = 0, found = 0;

        /* step 1: calculate xfer_mask */
        ata_for_each_dev(dev, link, ENABLED) {
                unsigned long pio_mask, dma_mask;
                unsigned int mode_mask;

                /* pick the libata_dma_mask bit that governs this device type */
                mode_mask = ATA_DMA_MASK_ATA;
                if (dev->class == ATA_DEV_ATAPI)
                        mode_mask = ATA_DMA_MASK_ATAPI;
                else if (ata_id_is_cfa(dev->id))
                        mode_mask = ATA_DMA_MASK_CFA;

                ata_dev_xfermask(dev);
                ata_force_xfermask(dev);

                pio_mask = ata_pack_xfermask(dev->pio_mask, 0, 0);

                /* DMA modes are considered only if enabled for this type */
                if (libata_dma_mask & mode_mask)
                        dma_mask = ata_pack_xfermask(0, dev->mwdma_mask,
                                                     dev->udma_mask);
                else
                        dma_mask = 0;

                dev->pio_mode = ata_xfer_mask2mode(pio_mask);
                dev->dma_mode = ata_xfer_mask2mode(dma_mask);

                found = 1;
                if (ata_dma_enabled(dev))
                        used_dma = 1;
        }
        /* nothing enabled on this link: rc is still 0 */
        if (!found)
                goto out;

        /* step 2: always set host PIO timings */
        ata_for_each_dev(dev, link, ENABLED) {
                /* 0xff means no PIO mode could be derived from the mask */
                if (dev->pio_mode == 0xff) {
                        ata_dev_warn(dev, "no PIO support\n");
                        rc = -EINVAL;
                        goto out;
                }

                dev->xfer_mode = dev->pio_mode;
                dev->xfer_shift = ATA_SHIFT_PIO;
                if (ap->ops->set_piomode)
                        ap->ops->set_piomode(ap, dev);
        }

        /* step 3: set host DMA timings */
        ata_for_each_dev(dev, link, ENABLED) {
                if (!ata_dma_enabled(dev))
                        continue;

                /* DMA-capable devices switch from the PIO mode set in step 2 */
                dev->xfer_mode = dev->dma_mode;
                dev->xfer_shift = ata_xfer_mode2shift(dev->dma_mode);
                if (ap->ops->set_dmamode)
                        ap->ops->set_dmamode(ap, dev);
        }

        /* step 4: update devices' xfer mode */
        ata_for_each_dev(dev, link, ENABLED) {
                rc = ata_dev_set_mode(dev);
                if (rc)
                        goto out;
        }

        /* Record simplex status. If we selected DMA then the other
         * host channels are not permitted to do so.
         */
        if (used_dma && (ap->host->flags & ATA_HOST_SIMPLEX))
                ap->host->simplex_claimed = ap;

 out:
        /* on failure dev is the loop cursor of the step that bailed out */
        if (rc)
                *r_failed_dev = dev;
        return rc;
}
EXPORT_SYMBOL_GPL(ata_do_set_mode);

/**
 *        ata_wait_ready - wait for link to become ready
 *        @link: link to be waited on
 *        @deadline: deadline jiffies for the operation
 *        @check_ready: callback to check link readiness
 *
 *        Wait for @link to become ready.  @check_ready should return
 *        positive number if @link is ready, 0 if it isn't, -ENODEV if
 *        link doesn't seem to be occupied, other errno for other error
 *        conditions.
 *
 *        Transient -ENODEV conditions are allowed for
 *        ATA_TMOUT_FF_WAIT.
 *
 *        @check_ready is polled with a 50 msec sleep between attempts.
 *
 *        LOCKING:
 *        EH context.
 *
 *        RETURNS:
 *        0 if @link is ready before @deadline; otherwise, -errno:
 *        -EBUSY once @deadline has passed, or the non-transient error
 *        returned by @check_ready.
 */
int ata_wait_ready(struct ata_link *link, unsigned long deadline,
                   int (*check_ready)(struct ata_link *link))
{
        unsigned long start = jiffies;
        unsigned long nodev_deadline;
        int warned = 0;

        /* choose which 0xff timeout to use, read comment in libata.h */
        if (link->ap->host->flags & ATA_HOST_PARALLEL_SCAN)
                nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT_LONG);
        else
                nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT);

        /* Slave readiness can't be tested separately from master.  On
         * M/S emulation configuration, this function should be called
         * only on the master and it will handle both master and slave.
         */
        WARN_ON(link == link->ap->slave_link);

        /* the -ENODEV grace period never extends past the overall deadline */
        if (time_after(nodev_deadline, deadline))
                nodev_deadline = deadline;

        while (1) {
                unsigned long now = jiffies;
                int ready, tmp;

                /* tmp keeps the raw callback result for the warning below */
                ready = tmp = check_ready(link);
                if (ready > 0)
                        return 0;

                /*
                 * -ENODEV could be transient.  Ignore -ENODEV if link
                 * is online.  Also, some SATA devices take a long
                 * time to clear 0xff after reset.  Wait for
                 * ATA_TMOUT_FF_WAIT[_LONG] on -ENODEV if link isn't
                 * offline.
                 *
                 * Note that some PATA controllers (pata_ali) explode
                 * if status register is read more than once when
                 * there's no device attached.
                 */
                if (ready == -ENODEV) {
                        if (ata_link_online(link))
                                ready = 0;
                        else if ((link->ap->flags & ATA_FLAG_SATA) &&
                                 !ata_link_offline(link) &&
                                 time_before(now, nodev_deadline))
                                ready = 0;
                }

                /* ready is zero or negative here; nonzero is a hard error */
                if (ready)
                        return ready;
                if (time_after(now, deadline))
                        return -EBUSY;

                /* warn once, after 5 seconds of waiting, if more than
                 * 3 seconds remain before the deadline
                 */
                if (!warned && time_after(now, start + 5 * HZ) &&
                    (deadline - now > 3 * HZ)) {
                        ata_link_warn(link,
                                "link is slow to respond, please be patient "
                                "(ready=%d)\n", tmp);
                        warned = 1;
                }

                ata_msleep(link->ap, 50);
        }
}

/**
 *        ata_wait_after_reset - wait for link to become ready after reset
 *        @link: link to be waited on
 *        @deadline: deadline jiffies for the operation
 *        @check_ready: callback to check link readiness
 *
 *        Sleep for ATA_WAIT_AFTER_RESET, then poll @link for readiness
 *        through ata_wait_ready().
 *
 *        LOCKING:
 *        EH context.
 *
 *        RETURNS:
 *        0 if @link is ready before @deadline; otherwise, -errno.
 */
int ata_wait_after_reset(struct ata_link *link, unsigned long deadline,
                                int (*check_ready)(struct ata_link *link))
{
        int rc;

        /* fixed delay before the first readiness check */
        ata_msleep(link->ap, ATA_WAIT_AFTER_RESET);

        rc = ata_wait_ready(link, deadline, check_ready);
        return rc;
}
EXPORT_SYMBOL_GPL(ata_wait_after_reset);

/**
 *        ata_std_prereset - prepare for reset
 *        @link: ATA link to be reset
 *        @deadline: deadline jiffies for the operation
 *
 *        @link is about to be reset.  Initialize it.  Failure from
 *        prereset makes libata abort whole reset sequence and give up
 *        that port, so prereset is best-effort: when resuming the link
 *        fails it only whines and carries on.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        Always 0.
 */
int ata_std_prereset(struct ata_link *link, unsigned long deadline)
{
        struct ata_eh_context *ehc = &link->eh_context;
        const unsigned long *timing = sata_ehc_deb_timing(ehc);

        /* a pending hardreset needs no preparation at all */
        if (!(ehc->i.action & ATA_EH_HARDRESET)) {
                /* SATA links are resumed first */
                if (link->ap->flags & ATA_FLAG_SATA) {
                        int err = sata_link_resume(link, timing, deadline);

                        /* complain about a failed resume, but keep going */
                        if (err != 0 && err != -EOPNOTSUPP)
                                ata_link_warn(link,
                                              "failed to resume link for reset (errno=%d)\n",
                                              err);
                }

                /* softreset is pointless while the link is offline */
                if (ata_phys_link_offline(link))
                        ehc->i.action &= ~ATA_EH_SOFTRESET;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ata_std_prereset);

/**
 *        sata_std_hardreset - COMRESET w/o waiting or classification
 *        @link: link to reset
 *        @class: resulting class of attached device (not written by
 *                this implementation)
 *        @deadline: deadline jiffies for the operation
 *
 *        Standard SATA COMRESET w/o waiting or classification,
 *        performed through sata_link_hardreset().
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 if link offline, -EAGAIN if link online, -errno on errors.
 */
int sata_std_hardreset(struct ata_link *link, unsigned int *class,
                       unsigned long deadline)
{
        bool link_online;
        int err;

        /* do hardreset; no readiness callback is passed */
        err = sata_link_hardreset(link, sata_ehc_deb_timing(&link->eh_context),
                                  deadline, &link_online, NULL);

        /* an online link takes precedence over the reset result */
        if (link_online)
                return -EAGAIN;
        return err;
}
EXPORT_SYMBOL_GPL(sata_std_hardreset);

/**
 *        ata_std_postreset - standard postreset callback
 *        @link: the target ata_link
 *        @classes: classes of attached devices (unused by this callback)
 *
 *        Invoked after a successful reset.  The device may have gone
 *        through several resets, possibly with different methods,
 *        before this runs.  Clears whatever is latched in SError by
 *        writing the value just read back to it, then prints the link
 *        status.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_std_postreset(struct ata_link *link, unsigned int *classes)
{
        u32 latched_err;
        int read_rc;

        DPRINTK("ENTER\n");

        /* reset is done: write SError back to itself to clear it */
        read_rc = sata_scr_read(link, SCR_ERROR, &latched_err);
        if (read_rc == 0)
                sata_scr_write(link, SCR_ERROR, latched_err);

        /* report the resulting link state */
        sata_print_link_status(link);

        DPRINTK("EXIT\n");
}
EXPORT_SYMBOL_GPL(ata_std_postreset);

/**
 *        ata_dev_same_device - Determine whether new ID matches configured device
 *        @dev: device to compare against
 *        @new_class: class of the new device
 *        @new_id: IDENTIFY page of the new device
 *
 *        Decide whether the device described by @new_class and @new_id
 *        is the one already configured as @dev.  The class is compared
 *        first, then the model number and the serial number strings
 *        extracted from @dev's stored ID and from @new_id.  The first
 *        difference found is logged and ends the comparison.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        1 if @dev matches @new_class and @new_id, 0 otherwise.
 */
static int ata_dev_same_device(struct ata_device *dev, unsigned int new_class,
                               const u16 *new_id)
{
        const u16 *cur_id = dev->id;
        unsigned char cur_model[ATA_ID_PROD_LEN + 1];
        unsigned char new_model[ATA_ID_PROD_LEN + 1];
        unsigned char cur_serial[ATA_ID_SERNO_LEN + 1];
        unsigned char new_serial[ATA_ID_SERNO_LEN + 1];

        /* a different device class can never be the same device */
        if (new_class != dev->class) {
                ata_dev_info(dev, "class mismatch %d != %d\n",
                             dev->class, new_class);
                return 0;
        }

        /* pull model and serial strings out of both IDENTIFY pages */
        ata_id_c_string(cur_id, cur_model, ATA_ID_PROD, sizeof(cur_model));
        ata_id_c_string(new_id, new_model, ATA_ID_PROD, sizeof(new_model));
        ata_id_c_string(cur_id, cur_serial, ATA_ID_SERNO, sizeof(cur_serial));
        ata_id_c_string(new_id, new_serial, ATA_ID_SERNO, sizeof(new_serial));

        if (strcmp(cur_model, new_model) != 0) {
                ata_dev_info(dev, "model number mismatch '%s' != '%s'\n",
                             cur_model, new_model);
                return 0;
        }

        if (strcmp(cur_serial, new_serial) != 0) {
                ata_dev_info(dev, "serial number mismatch '%s' != '%s'\n",
                             cur_serial, new_serial);
                return 0;
        }

        /* class, model and serial all agree */
        return 1;
}

/**
 *        ata_dev_reread_id - Re-read IDENTIFY data
 *        @dev: target ATA device
 *        @readid_flags: read ID flags
 *
 *        Read a fresh IDENTIFY page into the port's sector buffer and
 *        check that it still describes @dev.  Only when it does is the
 *        new page copied over @dev's stored ID.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, the error from ata_dev_read_id() if reading
 *        fails, -ENODEV if the page belongs to a different device.
 */
int ata_dev_reread_id(struct ata_device *dev, unsigned int readid_flags)
{
        u16 *fresh_id = (void *)dev->link->ap->sector_buf;
        unsigned int fresh_class = dev->class;
        int err;

        /* fetch the current IDENTIFY page into the scratch buffer */
        err = ata_dev_read_id(dev, &fresh_class, readid_flags, fresh_id);
        if (err != 0)
                return err;

        /* refuse to continue if something else answered */
        if (ata_dev_same_device(dev, fresh_class, fresh_id) == 0)
                return -ENODEV;

        /* same device: adopt the new page as the stored ID */
        memcpy(dev->id, fresh_id, sizeof(fresh_id[0]) * ATA_ID_WORDS);
        return 0;
}

/**
 *        ata_dev_revalidate - Revalidate ATA device
 *        @dev: device to revalidate
 *        @new_class: new class code
 *        @readid_flags: read ID flags
 *
 *        Re-read the IDENTIFY page, verify that @dev is still the
 *        device attached to the port and reconfigure it from the new
 *        page.  For ATA devices whose size was known beforehand, the
 *        sector count is additionally compared against the old value;
 *        a change is tolerated only when it looks like a late HPA
 *        unlock.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, negative errno otherwise.  -EIO is returned
 *        (with ATA_DFLAG_UNLOCK_HPA set) when the size shrank in a way
 *        that looks like a late HPA lock.
 */
int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class,
                       unsigned int readid_flags)
{
        /* sizes as they were before the device gets reconfigured */
        const u64 old_sectors = dev->n_sectors;
        const u64 old_native = dev->n_native_sectors;
        bool native_same;
        int rc;

        /* disabled devices are rejected silently */
        if (!ata_dev_enabled(dev))
                return -ENODEV;

        /* fail early if !ATA && !ATAPI to avoid issuing [P]IDENTIFY to PMP */
        if (ata_class_enabled(new_class) &&
            !(new_class == ATA_DEV_ATA || new_class == ATA_DEV_ATAPI ||
              new_class == ATA_DEV_ZAC || new_class == ATA_DEV_SEMB)) {
                ata_dev_info(dev, "class mismatch %u != %u\n",
                             dev->class, new_class);
                rc = -ENODEV;
                goto fail;
        }

        /* fetch a fresh ID and make sure it is the same device */
        rc = ata_dev_reread_id(dev, readid_flags);
        if (rc)
                goto fail;

        /* apply the new ID to the device configuration */
        rc = ata_dev_configure(dev);
        if (rc)
                goto fail;

        /* the size check applies only to ATA devices with a known old size */
        if (dev->class != ATA_DEV_ATA || !old_sectors)
                return 0;

        /* size unchanged: done */
        if (dev->n_sectors == old_sectors)
                return 0;

        ata_dev_warn(dev, "n_sectors mismatch %llu != %llu\n",
                     (unsigned long long)old_sectors,
                     (unsigned long long)dev->n_sectors);

        native_same = dev->n_native_sectors == old_native;

        /*
         * HPA may have been unlocked behind our back.  When the native
         * size is unchanged and the device grew exactly to it, accept
         * the larger size and keep the device.
         */
        if (native_same && dev->n_sectors > old_sectors &&
            dev->n_sectors == old_native) {
                ata_dev_warn(dev,
                             "new n_sectors matches native, probably "
                             "late HPA unlock, n_sectors updated\n");
                return 0;
        }

        /*
         * Some BIOSes boot w/o HPA but resume w/ HPA locked.  If the
         * device shrank from its native size, request an HPA unlock
         * unless HPA handling is known to be broken on it.
         *
         * https://bugzilla.kernel.org/show_bug.cgi?id=15396
         */
        if (native_same && dev->n_sectors < old_sectors &&
            old_sectors == old_native &&
            !(dev->horkage & ATA_HORKAGE_BROKEN_HPA)) {
                ata_dev_warn(dev,
                             "old n_sectors matches native, probably "
                             "late HPA lock, will try to unlock HPA\n");
                dev->flags |= ATA_DFLAG_UNLOCK_HPA;
                rc = -EIO;
        } else {
                rc = -ENODEV;
        }

        /* put the pre-revalidation sizes back before failing */
        dev->n_native_sectors = old_native;
        dev->n_sectors = old_sectors;
 fail:
        ata_dev_err(dev, "revalidation failed (errno=%d)\n", rc);
        return rc;
}

/*
 * One entry of the device quirk table (ata_device_blacklist), as
 * consumed by ata_dev_blacklisted().
 */
struct ata_blacklist_entry {
        /*
         * Pattern passed to glob_match() against the device's model
         * string.  A NULL model_num terminates the table.
         */
        const char *model_num;
        /*
         * Pattern passed to glob_match() against the firmware revision
         * string; NULL means the entry applies to every revision.
         */
        const char *model_rev;
        /* ATA_HORKAGE_* bits reported for a device matching this entry */
        unsigned long horkage;
};

/*
 * Table of per-device quirks.
 *
 * Each entry is { model pattern, firmware revision pattern or NULL,
 * ATA_HORKAGE_* flags }.  ata_dev_blacklisted() walks the table from
 * the top and returns the flags of the FIRST entry whose patterns
 * match, so the order of entries is significant: a more specific
 * entry must come before a more general one that would also match
 * (e.g. "Crucial_CT512MX100*" with "MU01" before the same model with
 * NULL, or "Samsung SSD 840 EVO*" before "Samsung SSD 840*").  Flags
 * of later matching entries are NOT merged in.
 *
 * The table ends with an empty entry (NULL model pattern).
 */
static const struct ata_blacklist_entry ata_device_blacklist [] = {
        /* Devices with DMA related problems under Linux */
        { "WDC AC11000H",        NULL,                ATA_HORKAGE_NODMA },
        { "WDC AC22100H",        NULL,                ATA_HORKAGE_NODMA },
        { "WDC AC32500H",        NULL,                ATA_HORKAGE_NODMA },
        { "WDC AC33100H",        NULL,                ATA_HORKAGE_NODMA },
        { "WDC AC31600H",        NULL,                ATA_HORKAGE_NODMA },
        { "WDC AC32100H",        "24.09P07",        ATA_HORKAGE_NODMA },
        { "WDC AC23200L",        "21.10N21",        ATA_HORKAGE_NODMA },
        { "Compaq CRD-8241B",         NULL,                ATA_HORKAGE_NODMA },
        { "CRD-8400B",                NULL,                 ATA_HORKAGE_NODMA },
        { "CRD-848[02]B",        NULL,                ATA_HORKAGE_NODMA },
        { "CRD-84",                NULL,                ATA_HORKAGE_NODMA },
        { "SanDisk SDP3B",        NULL,                ATA_HORKAGE_NODMA },
        { "SanDisk SDP3B-64",        NULL,                ATA_HORKAGE_NODMA },
        { "SANYO CD-ROM CRD",        NULL,                ATA_HORKAGE_NODMA },
        { "HITACHI CDR-8",        NULL,                ATA_HORKAGE_NODMA },
        { "HITACHI CDR-8[34]35",NULL,                ATA_HORKAGE_NODMA },
        { "Toshiba CD-ROM XM-6202B", NULL,        ATA_HORKAGE_NODMA },
        { "TOSHIBA CD-ROM XM-1702BC", NULL,        ATA_HORKAGE_NODMA },
        { "CD-532E-A",                 NULL,                ATA_HORKAGE_NODMA },
        { "E-IDE CD-ROM CR-840",NULL,                ATA_HORKAGE_NODMA },
        { "CD-ROM Drive/F5A",        NULL,                ATA_HORKAGE_NODMA },
        { "WPI CDD-820",         NULL,                ATA_HORKAGE_NODMA },
        { "SAMSUNG CD-ROM SC-148C", NULL,        ATA_HORKAGE_NODMA },
        { "SAMSUNG CD-ROM SC",        NULL,                ATA_HORKAGE_NODMA },
        { "ATAPI CD-ROM DRIVE 40X MAXIMUM",NULL,ATA_HORKAGE_NODMA },
        { "_NEC DV5800A",         NULL,                ATA_HORKAGE_NODMA },
        { "SAMSUNG CD-ROM SN-124", "N001",        ATA_HORKAGE_NODMA },
        { "Seagate STT20000A", NULL,                ATA_HORKAGE_NODMA },
        { " 2GB ATA Flash Disk", "ADMA428M",        ATA_HORKAGE_NODMA },
        { "VRFDFC22048UCHC-TE*", NULL,                ATA_HORKAGE_NODMA },
        /* Odd clown on sil3726/4726 PMPs */
        { "Config  Disk",        NULL,                ATA_HORKAGE_DISABLE },
        /* Similar story with ASMedia 1092 */
        { "ASMT109x- Config",        NULL,                ATA_HORKAGE_DISABLE },

        /* Weird ATAPI devices */
        { "TORiSAN DVD-ROM DRD-N216", NULL,        ATA_HORKAGE_MAX_SEC_128 },
        { "QUANTUM DAT    DAT72-000", NULL,        ATA_HORKAGE_ATAPI_MOD16_DMA },
        { "Slimtype DVD A  DS8A8SH", NULL,        ATA_HORKAGE_MAX_SEC_LBA48 },
        { "Slimtype DVD A  DS8A9SH", NULL,        ATA_HORKAGE_MAX_SEC_LBA48 },

        /*
         * Causes silent data corruption with higher max sects.
         * http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com
         */
        { "ST380013AS",                "3.20",                ATA_HORKAGE_MAX_SEC_1024 },

        /*
         * These devices time out with higher max sects.
         * https://bugzilla.kernel.org/show_bug.cgi?id=121671
         */
        { "LITEON CX1-JB*-HP",        NULL,                ATA_HORKAGE_MAX_SEC_1024 },
        { "LITEON EP1-*",        NULL,                ATA_HORKAGE_MAX_SEC_1024 },

        /* Devices we expect to fail diagnostics */

        /* Devices where NCQ should be avoided */
        /* NCQ is slow */
        { "WDC WD740ADFD-00",        NULL,                ATA_HORKAGE_NONCQ },
        { "WDC WD740ADFD-00NLR1", NULL,                ATA_HORKAGE_NONCQ, },
        /* http://thread.gmane.org/gmane.linux.ide/14907 */
        { "FUJITSU MHT2060BH",        NULL,                ATA_HORKAGE_NONCQ },
        /* NCQ is broken */
        { "Maxtor *",                "BANC*",        ATA_HORKAGE_NONCQ },
        { "Maxtor 7V300F0",        "VA111630",        ATA_HORKAGE_NONCQ },
        { "ST380817AS",                "3.42",                ATA_HORKAGE_NONCQ },
        { "ST3160023AS",        "3.42",                ATA_HORKAGE_NONCQ },
        { "OCZ CORE_SSD",        "02.10104",        ATA_HORKAGE_NONCQ },

        /* Seagate NCQ + FLUSH CACHE firmware bug */
        { "ST31500341AS",        "SD1[5-9]",        ATA_HORKAGE_NONCQ |
                                                ATA_HORKAGE_FIRMWARE_WARN },

        { "ST31000333AS",        "SD1[5-9]",        ATA_HORKAGE_NONCQ |
                                                ATA_HORKAGE_FIRMWARE_WARN },

        { "ST3640[36]23AS",        "SD1[5-9]",        ATA_HORKAGE_NONCQ |
                                                ATA_HORKAGE_FIRMWARE_WARN },

        { "ST3320[68]13AS",        "SD1[5-9]",        ATA_HORKAGE_NONCQ |
                                                ATA_HORKAGE_FIRMWARE_WARN },

        /* drives which fail FPDMA_AA activation (some may freeze afterwards)
           the ST disks also have LPM issues */
        { "ST1000LM024 HN-M101MBB", NULL,        ATA_HORKAGE_BROKEN_FPDMA_AA |
                                                ATA_HORKAGE_NOLPM, },
        { "VB0250EAVER",        "HPG7",                ATA_HORKAGE_BROKEN_FPDMA_AA },

        /* Blacklist entries taken from Silicon Image 3124/3132
           Windows driver .inf file - also several Linux problem reports */
        { "HTS541060G9SA00",    "MB3OC60D",     ATA_HORKAGE_NONCQ, },
        { "HTS541080G9SA00",    "MB4OC60D",     ATA_HORKAGE_NONCQ, },
        { "HTS541010G9SA00",    "MBZOC60D",     ATA_HORKAGE_NONCQ, },

        /* https://bugzilla.kernel.org/show_bug.cgi?id=15573 */
        { "C300-CTFDDAC128MAG",        "0001",                ATA_HORKAGE_NONCQ, },

        /* Sandisk SD7/8/9s lock up hard on large trims */
        { "SanDisk SD[789]*",        NULL,                ATA_HORKAGE_MAX_TRIM_128M, },

        /* devices which puke on READ_NATIVE_MAX */
        { "HDS724040KLSA80",        "KFAOA20N",        ATA_HORKAGE_BROKEN_HPA, },
        { "WDC WD3200JD-00KLB0", "WD-WCAMR1130137", ATA_HORKAGE_BROKEN_HPA },
        { "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_HORKAGE_BROKEN_HPA },
        { "MAXTOR 6L080L4",        "A93.0500",        ATA_HORKAGE_BROKEN_HPA },

        /* this one allows HPA unlocking but fails IOs on the area */
        { "OCZ-VERTEX",                    "1.30",        ATA_HORKAGE_BROKEN_HPA },

        /* Devices which report 1 sector over size HPA */
        { "ST340823A",                NULL,                ATA_HORKAGE_HPA_SIZE, },
        { "ST320413A",                NULL,                ATA_HORKAGE_HPA_SIZE, },
        { "ST310211A",                NULL,                ATA_HORKAGE_HPA_SIZE, },

        /* Devices which get the IVB wrong */
        { "QUANTUM FIREBALLlct10 05", "A03.0900", ATA_HORKAGE_IVB, },
        /* Maybe we should just blacklist TSSTcorp... */
        { "TSSTcorp CDDVDW SH-S202[HJN]", "SB0[01]",  ATA_HORKAGE_IVB, },

        /* Devices that do not need bridging limits applied */
        { "MTRON MSP-SATA*",                NULL,        ATA_HORKAGE_BRIDGE_OK, },
        { "BUFFALO HD-QSU2/R5",                NULL,        ATA_HORKAGE_BRIDGE_OK, },

        /* Devices which aren't very happy with higher link speeds */
        { "WD My Book",                        NULL,        ATA_HORKAGE_1_5_GBPS, },
        { "Seagate FreeAgent GoFlex",        NULL,        ATA_HORKAGE_1_5_GBPS, },

        /*
         * Devices which choke on SETXFER.  Applies only if both the
         * device and controller are SATA.
         */
        { "PIONEER DVD-RW  DVRTD08",        NULL,        ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVRTD08A",        NULL,        ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVR-215",        NULL,        ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVR-212D",        NULL,        ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVR-216D",        NULL,        ATA_HORKAGE_NOSETXFER },

        /* These specific Pioneer models have LPM issues */
        { "PIONEER BD-RW   BDR-207M",        NULL,        ATA_HORKAGE_NOLPM },
        { "PIONEER BD-RW   BDR-205",        NULL,        ATA_HORKAGE_NOLPM },

        /* Crucial BX100 SSD 500GB has broken LPM support */
        { "CT500BX100SSD1",                NULL,        ATA_HORKAGE_NOLPM },

        /* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */
        { "Crucial_CT512MX100*",        "MU01",        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NOLPM, },
        /* 512GB MX100 with newer firmware has only LPM issues */
        { "Crucial_CT512MX100*",        NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NOLPM, },

        /* 480GB+ M500 SSDs have both queued TRIM and LPM issues */
        { "Crucial_CT480M500*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NOLPM, },
        { "Crucial_CT960M500*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NOLPM, },

        /* These specific Samsung models/firmware-revs do not handle LPM well */
        { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM, },
        { "SAMSUNG SSD PM830 mSATA *",  "CXM13D1Q", ATA_HORKAGE_NOLPM, },
        { "SAMSUNG MZ7TD256HAFV-000L9", NULL,       ATA_HORKAGE_NOLPM, },
        { "SAMSUNG MZ7TE512HMHP-000L1", "EXT06L0Q", ATA_HORKAGE_NOLPM, },

        /* devices that don't properly handle queued TRIM commands */
        { "Micron_M500IT_*",                "MU01",        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Micron_M500_*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Crucial_CT*M500*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Micron_M5[15]0_*",                "MU01",        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Crucial_CT*M550*",                "MU01",        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Crucial_CT*MX100*",                "MU01",        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung SSD 840 EVO*",        NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_NO_DMA_LOG |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung SSD 840*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung SSD 850*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung SSD 860*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NO_NCQ_ON_ATI, },
        { "Samsung SSD 870*",                NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM |
                                                ATA_HORKAGE_NO_NCQ_ON_ATI, },
        { "FCCT*M500*",                        NULL,        ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },

        /* devices that don't properly handle TRIM commands */
        { "SuperSSpeed S238*",                NULL,        ATA_HORKAGE_NOTRIM, },
        { "M88V29*",                        NULL,        ATA_HORKAGE_NOTRIM, },

        /*
         * As defined, the DRAT (Deterministic Read After Trim) and RZAT
         * (Return Zero After Trim) flags in the ATA Command Set are
         * unreliable in the sense that they only define what happens if
         * the device successfully executed the DSM TRIM command. TRIM
         * is only advisory, however, and the device is free to silently
         * ignore all or parts of the request.
         *
         * Whitelist drives that are known to reliably return zeroes
         * after TRIM.
         */

        /*
         * The intel 510 drive has buggy DRAT/RZAT. Explicitly exclude
         * that model before whitelisting all other intel SSDs.
         */
        { "INTEL*SSDSC2MH*",                NULL,        0, },

        { "Micron*",                        NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Crucial*",                        NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "INTEL*SSD*",                 NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "SSD*INTEL*",                        NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung*SSD*",                NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "SAMSUNG*SSD*",                NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "SAMSUNG*MZ7KM*",                NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "ST[1248][0248]0[FH]*",        NULL,        ATA_HORKAGE_ZERO_AFTER_TRIM, },

        /*
         * Some WD SATA-I drives spin up and down erratically when the link
         * is put into the slumber mode.  We don't have full list of the
         * affected devices.  Disable LPM if the device matches one of the
         * known prefixes and is SATA-1.  As a side effect LPM partial is
         * lost too.
         *
         * https://bugzilla.kernel.org/show_bug.cgi?id=57211
         */
        { "WDC WD800JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD1200JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD1600JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD2000JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD2500JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD3000JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },
        { "WDC WD3200JD-*",                NULL,        ATA_HORKAGE_WD_BROKEN_LPM },

        /* End Marker */
        { }
};

/*
 * Look @dev up in ata_device_blacklist and return the horkage flags of
 * the first entry whose model pattern matches and whose revision
 * pattern is either absent or matches too.  Returns 0 when no entry
 * applies.
 */
static unsigned long ata_dev_blacklisted(const struct ata_device *dev)
{
        unsigned char product[ATA_ID_PROD_LEN + 1];
        unsigned char fw_rev[ATA_ID_FW_REV_LEN + 1];
        const struct ata_blacklist_entry *entry;

        /* extract model and firmware revision strings from the ID page */
        ata_id_c_string(dev->id, product, ATA_ID_PROD, sizeof(product));
        ata_id_c_string(dev->id, fw_rev, ATA_ID_FW_REV, sizeof(fw_rev));

        /* the table is terminated by an entry without a model pattern */
        for (entry = ata_device_blacklist; entry->model_num; entry++) {
                if (!glob_match(entry->model_num, product))
                        continue;

                /* no revision pattern means every revision is affected */
                if (!entry->model_rev || glob_match(entry->model_rev, fw_rev))
                        return entry->horkage;
        }

        return 0;
}

static int ata_dma_blacklisted(const struct ata_device *dev)
{
        /* We don't support polling DMA.
         * DMA blacklist those ATAPI devices with CDB-intr (and use PIO)
         * if the LLDD handles only interrupts in the HSM_ST_LAST state.
         */
        if ((dev->link->ap->flags & ATA_FLAG_PIO_POLLING) &&
            (dev->flags & ATA_DFLAG_CDB_INTR))
                return 1;
        return (dev->horkage & ATA_HORKAGE_NODMA) ? 1 : 0;
}

/**
 *        ata_is_40wire                -        check drive side detection
 *        @dev: device
 *
 *        Decode the drive side cable detection from @dev's ID page.
 *        Devices flagged with ATA_HORKAGE_IVB are decoded with the
 *        relaxed variant, allowing for device vendors who can't follow
 *        the documentation.
 */

static int ata_is_40wire(struct ata_device *dev)
{
        const int relaxed = (dev->horkage & ATA_HORKAGE_IVB) != 0;

        if (!relaxed)
                return ata_drive_40wire(dev->id);

        return ata_drive_40wire_relaxed(dev->id);
}

/**
 *        cable_is_40wire                -        40/80/SATA decider
 *        @ap: port to consider
 *
 *        Single place holding the policy for cable based speed
 *        management.  The result is currently not cached, although
 *        there is a good case for storing it in ap->cbl when called
 *        with unknown cables (after figuring out whether that impacts
 *        hotplug at all).
 *
 *        Return 1 if the cable appears to be 40 wire.
 */

static int cable_is_40wire(struct ata_port *ap)
{
        struct ata_link *link;
        struct ata_device *dev;

        switch (ap->cbl) {
        case ATA_CBL_PATA40:
                /* If the controller thinks we are 40 wire, we are. */
                return 1;
        case ATA_CBL_PATA80:
        case ATA_CBL_SATA:
                /* If the controller thinks we are 80 wire, we are. */
                return 0;
        case ATA_CBL_PATA40_SHORT:
                /* If the system is known to be 40 wire short cable (eg
                 * laptop), then we allow 80 wire modes even if the drive
                 * isn't sure.
                 */
                return 0;
        default:
                break;
        }

        /* If the controller doesn't know, we scan.
         *
         * Note: We look for all 40 wire detects at this point.  Any
         *       80 wire detect is taken to be 80 wire cable because
         * - in many setups only the one drive (slave if present) will
         *   give a valid detect
         * - if you have a non detect capable drive you don't want it
         *   to colour the choice
         */
        ata_for_each_link(link, ap, EDGE) {
                ata_for_each_dev(dev, link, ENABLED) {
                        if (ata_is_40wire(dev))
                                continue;
                        /* one drive not reporting 40 wire is enough */
                        return 0;
                }
        }

        return 1;
}

/**
 *        ata_dev_xfermask - Compute supported xfermask of the given device
 *        @dev: Device to compute xfermask for
 *
 *        Work out which transfer modes @dev may use and store the
 *        result in dev->*_mask.  All known limits are applied here:
 *        controller and drive capabilities, shared-cable restrictions,
 *        the DMA blacklist, simplex DMA ownership, missing IORDY, the
 *        driver's mode filter and finally the cable type.
 *
 *        LOCKING:
 *        None.
 */
static void ata_dev_xfermask(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_host *host = ap->host;
        unsigned long mask;

        /* start with the modes the controller offers */
        mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask, ap->udma_mask);

        /* keep only those the drive offers as well */
        mask &= ata_pack_xfermask(dev->pio_mask, dev->mwdma_mask,
                                  dev->udma_mask);
        mask &= ata_id_xfermask(dev->id);

        /*
         *        CFA Advanced TrueIDE timings are not allowed on a shared
         *        cable
         */
        if (ata_dev_pair(dev)) {
                /* No PIO5 or PIO6 */
                mask &= ~(0x03 << (ATA_SHIFT_PIO + 5));
                /* No MWDMA3 or MWDMA 4 */
                mask &= ~(0x03 << (ATA_SHIFT_MWDMA + 3));
        }

        /* blacklisted devices lose every DMA mode */
        if (ata_dma_blacklisted(dev)) {
                mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
                ata_dev_warn(dev,
                             "device is on DMA blacklist, disabling DMA\n");
        }

        /* on a simplex host, DMA belongs to the port that claimed it */
        if ((host->flags & ATA_HOST_SIMPLEX) &&
            host->simplex_claimed && host->simplex_claimed != ap) {
                mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
                ata_dev_warn(dev,
                             "simplex DMA is claimed by other device, disabling DMA\n");
        }

        if (ap->flags & ATA_FLAG_NO_IORDY)
                mask &= ata_pio_mask_no_iordy(dev);

        /* let the low level driver have its say */
        if (ap->ops->mode_filter)
                mask = ap->ops->mode_filter(dev, mask);

        /* Apply cable rule here.  Don't apply it early because when
         * we handle hot plug the cable type can itself change.
         * Check this last so that we know if the transfer rate was
         * solely limited by the cable.
         * Unknown or 80 wire cables reported host side are checked
         * drive side as well. Cases where we know a 40wire cable
         * is used safely for 80 are not checked here.
         *
         * The cable is only examined when UDMA/44 or higher would
         * otherwise be available.
         */
        if ((mask & (0xF8 << ATA_SHIFT_UDMA)) && cable_is_40wire(ap)) {
                ata_dev_warn(dev,
                             "limited to UDMA/33 due to 40-wire cable\n");
                mask &= ~(0xF8 << ATA_SHIFT_UDMA);
        }

        /* publish the result in the per-type device masks */
        ata_unpack_xfermask(mask, &dev->pio_mask,
                            &dev->mwdma_mask, &dev->udma_mask);
}

/**
 *        ata_dev_set_xfermode - Issue SET FEATURES - XFER MODE command
 *        @dev: Device to which command will be sent
 *
 *        Issue SET FEATURES - XFER MODE command to device @dev.  The
 *        command is skipped altogether for devices that neither need
 *        nor have IORDY.
 *
 *        LOCKING:
 *        PCI/etc. bus probe sem.
 *
 *        RETURNS:
 *        0 on success, AC_ERR_* mask otherwise.
 */

static unsigned int ata_dev_set_xfermode(struct ata_device *dev)
{
        struct ata_taskfile tf;
        unsigned int result;

        /* set up set-features taskfile */
        DPRINTK("set features - xfer mode\n");

        ata_tf_init(dev, &tf);
        tf.protocol = ATA_PROT_NODATA;
        tf.command = ATA_CMD_SET_FEATURES;
        tf.feature = SETFEATURES_XFER;
        /* Some controllers and ATAPI devices show flaky interrupt
         * behavior after setting xfer mode.  Use polling instead.
         */
        tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_POLLING;

        if (ata_pio_need_iordy(dev)) {
                /* If we are using IORDY we must send the mode setting command */
                tf.nsect = dev->xfer_mode;
        } else if (ata_id_has_iordy(dev->id)) {
                /* If the device has IORDY and the controller does not - turn it off */
                tf.nsect = 0x01;
        } else {
                /* In the ancient relic department - skip all of this */
                return 0;
        }

        /* On some disks, this command causes spin-up, so we need longer timeout */
        result = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 15000);

        DPRINTK("EXIT, err_mask=%x\n", result);
        return result;
}

/**
 *        ata_dev_set_feature - Issue SET FEATURES - SATA FEATURES
 *        @dev: Device to which command will be sent
 *        @enable: Value placed in the taskfile feature register
 *                 (whether to enable or disable the feature)
 *        @feature: Value placed in the sector count register, which
 *                  represents the feature to set
 *
 *        Issue SET FEATURES - SATA FEATURES command to device @dev.
 *        When @enable is SETFEATURES_SPINUP a longer timeout is used:
 *        ata_probe_timeout (scaled by 1000) if set, otherwise
 *        SETFEATURES_SPINUP_TIMEOUT.  For any other subcommand the
 *        timeout passed on is 0.
 *
 *        LOCKING:
 *        PCI/etc. bus probe sem.
 *
 *        RETURNS:
 *        0 on success, AC_ERR_* mask otherwise.
 */
unsigned int ata_dev_set_feature(struct ata_device *dev, u8 enable, u8 feature)
{
        struct ata_taskfile tf;
        unsigned long cmd_timeout = 0;
        unsigned int result;

        /* set up set-features taskfile */
        DPRINTK("set features - SATA features\n");

        ata_tf_init(dev, &tf);
        tf.protocol = ATA_PROT_NODATA;
        tf.command = ATA_CMD_SET_FEATURES;
        tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        tf.feature = enable;
        tf.nsect = feature;

        /* spin-up gets its own, longer timeout */
        if (enable == SETFEATURES_SPINUP) {
                cmd_timeout = ata_probe_timeout ?
                              ata_probe_timeout * 1000 : SETFEATURES_SPINUP_TIMEOUT;
        }

        result = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0,
                                   cmd_timeout);

        DPRINTK("EXIT, err_mask=%x\n", result);
        return result;
}
EXPORT_SYMBOL_GPL(ata_dev_set_feature);

/**
 *        ata_dev_init_params - Issue INIT DEV PARAMS command
 *        @dev: Device to which command will be sent
 *        @heads: Number of heads (taskfile parameter)
 *        @sectors: Number of sectors (taskfile parameter)
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, AC_ERR_* mask otherwise.
 */
static unsigned int ata_dev_init_params(struct ata_device *dev,
                                        u16 heads, u16 sectors)
{
        struct ata_taskfile tf;
        unsigned int err_mask;

        /* Accept only 1-255 sectors per track and 1-16 heads */
        if (!(sectors >= 1 && sectors <= 255 && heads >= 1 && heads <= 16))
                return AC_ERR_INVALID;

        /* set up init dev params taskfile */
        DPRINTK("init dev params \n");

        ata_tf_init(dev, &tf);
        tf.protocol = ATA_PROT_NODATA;
        tf.command = ATA_CMD_INIT_DEV_PARAMS;
        tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        tf.nsect = sectors;
        /* low nibble of the device register carries max head = heads - 1 */
        tf.device |= (heads - 1) & 0x0f;

        err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);

        /*
         * A clean abort indicates an original or just out of spec drive;
         * carry on, since the setup was issued from the geometry the
         * drive itself reported as working.
         */
        if (err_mask == AC_ERR_DEV && (tf.feature & ATA_ABORTED))
                err_mask = 0;

        DPRINTK("EXIT, err_mask=%x\n", err_mask);
        return err_mask;
}

/**
 *        atapi_check_dma - Check whether ATAPI DMA can be supported
 *        @qc: Metadata associated with taskfile to check
 *
 *        Allow low-level driver to filter ATA PACKET commands, returning
 *        a status indicating whether or not it is OK to use DMA for the
 *        supplied PACKET command.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS: 0 when ATAPI DMA can be used
 *               nonzero otherwise
 */
int atapi_check_dma(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;

        /*
         * Refuse DMA for transfers that are not a multiple of 16 bytes
         * unless the device carries ATA_HORKAGE_ATAPI_MOD16_DMA.  Quite
         * a few ATAPI devices choke on such DMA requests.
         */
        if (unlikely(qc->nbytes & 15) &&
            !(qc->dev->horkage & ATA_HORKAGE_ATAPI_MOD16_DMA))
                return 1;

        /* Let the low-level driver have the final say, if it cares. */
        return ap->ops->check_atapi_dma ? ap->ops->check_atapi_dma(qc) : 0;
}

/**
 *        ata_std_qc_defer - Check whether a qc needs to be deferred
 *        @qc: ATA command in question
 *
 *        Non-NCQ commands cannot run with any other command, NCQ or
 *        not.  As upper layer only knows the queue depth, we are
 *        responsible for maintaining exclusion.  This function checks
 *        whether a new command @qc can be issued.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        ATA_DEFER_* if deferring is needed, 0 otherwise.
 */
int ata_std_qc_defer(struct ata_queued_cmd *qc)
{
        struct ata_link *link = qc->dev->link;

        /* A valid active_tag blocks every new command, NCQ or not. */
        if (ata_tag_valid(link->active_tag))
                return ATA_DEFER_LINK;

        /*
         * No active_tag: an NCQ command may go ahead, a non-NCQ
         * command additionally needs sactive to be empty.
         */
        if (ata_is_ncq(qc->tf.protocol) || !link->sactive)
                return 0;

        return ATA_DEFER_LINK;
}
EXPORT_SYMBOL_GPL(ata_std_qc_defer);

/* Prep callback that does no work on @qc and always returns AC_ERR_OK. */
enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc)
{
        return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_noop_qc_prep);

/**
 *        ata_sg_init - Associate command with scatter-gather table.
 *        @qc: Command to be associated
 *        @sg: Scatter-gather table.
 *        @n_elem: Number of elements in s/g table.
 *
 *        Initialize the data-related elements of queued_cmd @qc
 *        to point to a scatter-gather table @sg, containing @n_elem
 *        elements.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
                 unsigned int n_elem)
{
        /* table head and current-entry cursor both start at @sg */
        qc->sg = sg;
        qc->cursg = sg;
        qc->n_elem = n_elem;
}

#ifdef CONFIG_HAS_DMA

/**
 *        ata_sg_clean - Unmap DMA memory associated with command
 *        @qc: Command containing DMA memory to be released
 *
 *        Unmap all mapped DMA memory associated with this command.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static void ata_sg_clean(struct ata_queued_cmd *qc)
{
        struct scatterlist *sg = qc->sg;

        WARN_ON_ONCE(sg == NULL);

        /* unmap using the element count saved before mapping */
        if (qc->n_elem)
                dma_unmap_sg(qc->ap->dev, sg, qc->orig_n_elem, qc->dma_dir);

        /* the command no longer owns a mapped table */
        qc->sg = NULL;
        qc->flags &= ~ATA_QCFLAG_DMAMAP;
}

/**
 *        ata_sg_setup - DMA-map the scatter-gather table associated with a command.
 *        @qc: Command with scatter-gather table to be mapped.
 *
 *        DMA-map the scatter-gather table associated with queued_cmd @qc.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, negative on error.
 *
 */
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        unsigned int mapped;

        VPRINTK("ENTER, ata%u\n", ap->print_id);

        mapped = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir);
        if (mapped == 0)
                return -1;

        /* keep the original count for unmapping, use the mapped count from now on */
        qc->orig_n_elem = qc->n_elem;
        qc->n_elem = mapped;
        qc->flags |= ATA_QCFLAG_DMAMAP;

        return 0;
}

#else /* !CONFIG_HAS_DMA */

/* Without CONFIG_HAS_DMA nothing is ever mapped, so there is nothing to undo. */
static inline void ata_sg_clean(struct ata_queued_cmd *qc)
{
}

/* Without CONFIG_HAS_DMA mapping is impossible; always report failure. */
static inline int ata_sg_setup(struct ata_queued_cmd *qc)
{
        return -1;
}

#endif /* !CONFIG_HAS_DMA */

/**
 *        swap_buf_le16 - swap halves of 16-bit words in place
 *        @buf:  Buffer to swap
 *        @buf_words:  Number of 16-bit words in buffer.
 *
 *        Swap halves of 16-bit words if needed to convert from
 *        little-endian byte order to native cpu byte order, or
 *        vice-versa.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void swap_buf_le16(u16 *buf, unsigned int buf_words)
{
#ifdef __BIG_ENDIAN
        u16 *end = buf + buf_words;

        /* convert each word in place; compiled out on little-endian builds */
        for (; buf < end; buf++)
                *buf = le16_to_cpu(*buf);
#endif /* __BIG_ENDIAN */
}

/**
 *        ata_qc_new_init - Request an available ATA command, and initialize it
 *        @dev: Device from whom we request an available command structure
 *        @tag: tag
 *
 *        LOCKING:
 *        None.
 */

struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_queued_cmd *qc;

        /* a frozen port hands out no commands */
        if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
                return NULL;

        /* libsas hosts ignore the caller's tag and allocate their own */
        if (ap->flags & ATA_FLAG_SAS_HOST) {
                tag = ata_sas_allocate_tag(ap);
                if (tag < 0)
                        return NULL;
        }

        qc = __ata_qc_from_tag(ap, tag);
        qc->hw_tag = tag;
        qc->tag = qc->hw_tag;
        qc->ap = ap;
        qc->dev = dev;
        qc->scsicmd = NULL;

        ata_qc_reinit(qc);

        return qc;
}

/**
 *        ata_qc_free - free unused ata_queued_cmd
 *        @qc: Command to free
 *
 *        Designed to free unused ata_queued_cmd object
 *        in case something prevents using it.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_qc_free(struct ata_queued_cmd *qc)
{
        struct ata_port *ap;
        unsigned int tag;

        WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
        ap = qc->ap;

        qc->flags = 0;
        tag = qc->tag;

        /* an invalid tag means there is nothing left to release */
        if (!ata_tag_valid(tag))
                return;

        qc->tag = ATA_TAG_POISON;
        /* libsas hosts allocated the tag themselves; give it back */
        if (ap->flags & ATA_FLAG_SAS_HOST)
                ata_sas_free_tag(tag, ap);
}

/*
 * Low-level completion of @qc: undo its DMA mapping if one is flagged,
 * clear the link and port bookkeeping that marks the command active,
 * then invoke the command's completion callback.
 */
void __ata_qc_complete(struct ata_queued_cmd *qc)
{
        struct ata_port *ap;
        struct ata_link *link;

        WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
        WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE));
        ap = qc->ap;
        link = qc->dev->link;

        if (likely(qc->flags & ATA_QCFLAG_DMAMAP))
                ata_sg_clean(qc);

        /* command should be marked inactive atomically with qc completion */
        if (ata_is_ncq(qc->tf.protocol)) {
                /* NCQ: drop this tag; the link stops counting once sactive is empty */
                link->sactive &= ~(1 << qc->hw_tag);
                if (!link->sactive)
                        ap->nr_active_links--;
        } else {
                /* non-NCQ: the single active command on the link is gone */
                link->active_tag = ATA_TAG_POISON;
                ap->nr_active_links--;
        }

        /* clear exclusive status */
        if (unlikely(qc->flags & ATA_QCFLAG_CLEAR_EXCL &&
                     ap->excl_link == link))
                ap->excl_link = NULL;

        /* atapi: mark qc as inactive to prevent the interrupt handler
         * from completing the command twice later, before the error handler
         * is called. (when rc != 0 and atapi request sense is needed)
         */
        qc->flags &= ~ATA_QCFLAG_ACTIVE;
        ap->qc_active &= ~(1ULL << qc->tag);

        /* call completion callback */
        qc->complete_fn(qc);
}

static void fill_result_tf(struct ata_queued_cmd *qc)
{
        /* seed the result TF with the request's flags, then let the port ops fill it */
        qc->result_tf.flags = qc->tf.flags;
        qc->ap->ops->qc_fill_rtf(qc);
}

static void ata_verify_xfer(struct ata_queued_cmd *qc)
{
        struct ata_device *dev = qc->dev;

        /*
         * Leave the flag alone for non-data commands, and for PIO
         * commands on a device that has an MWDMA or UDMA mask set.
         */
        if (!ata_is_data(qc->tf.protocol) ||
            (ata_is_pio(qc->tf.protocol) &&
             (dev->mwdma_mask || dev->udma_mask)))
                return;

        dev->flags &= ~ATA_DFLAG_DUBIOUS_XFER;
}

/**
 *        ata_qc_complete - Complete an active ATA command
 *        @qc: Command to complete
 *
 *        Indicate to the mid and upper layers that an ATA command has
 *        completed, with either an ok or not-ok status.
 *
 *        Refrain from calling this function multiple times when
 *        successfully completing multiple NCQ commands.
 *        ata_qc_complete_multiple() should be used instead, which will
 *        properly update IRQ expect state.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_qc_complete(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;

        /* Trigger the LED (if available) */
        ledtrig_disk_activity(!!(qc->tf.flags & ATA_TFLAG_WRITE));

        /* XXX: New EH and old EH use different mechanisms to
         * synchronize EH with regular execution path.
         *
         * In new EH, a failed qc is marked with ATA_QCFLAG_FAILED.
         * Normal execution path is responsible for not accessing a
         * failed qc.  libata core enforces the rule by returning NULL
         * from ata_qc_from_tag() for failed qcs.
         *
         * Old EH depends on ata_qc_complete() nullifying completion
         * requests if ATA_QCFLAG_EH_SCHEDULED is set.  Old EH does
         * not synchronize with interrupt handler.  Only PIO task is
         * taken care of.
         */
        /* Ports that provide ->error_handler take the new-EH branch. */
        if (ap->ops->error_handler) {
                struct ata_device *dev = qc->dev;
                struct ata_eh_info *ehi = &dev->link->eh_info;

                /* any error bit turns the qc into a failed qc */
                if (unlikely(qc->err_mask))
                        qc->flags |= ATA_QCFLAG_FAILED;

                /*
                 * Finish internal commands without any further processing
                 * and always with the result TF filled.
                 */
                if (unlikely(ata_tag_internal(qc->tag))) {
                        fill_result_tf(qc);
                        trace_ata_qc_complete_internal(qc);
                        __ata_qc_complete(qc);
                        return;
                }

                /*
                 * Non-internal qc has failed.  Fill the result TF and
                 * summon EH.
                 */
                if (unlikely(qc->flags & ATA_QCFLAG_FAILED)) {
                        fill_result_tf(qc);
                        trace_ata_qc_complete_failed(qc);
                        ata_qc_schedule_eh(qc);
                        return;
                }

                /* from here on the qc completed successfully */
                WARN_ON_ONCE(ap->pflags & ATA_PFLAG_FROZEN);

                /* read result TF if requested */
                if (qc->flags & ATA_QCFLAG_RESULT_TF)
                        fill_result_tf(qc);

                trace_ata_qc_complete_done(qc);
                /* Some commands need post-processing after successful
                 * completion.
                 */
                switch (qc->tf.command) {
                case ATA_CMD_SET_FEATURES:
                        /*
                         * Only the WC_ON/WC_OFF/RA_ON/RA_OFF subcommands
                         * continue to the revalidation below; any other
                         * subcommand leaves the switch here.
                         */
                        if (qc->tf.feature != SETFEATURES_WC_ON &&
                            qc->tf.feature != SETFEATURES_WC_OFF &&
                            qc->tf.feature != SETFEATURES_RA_ON &&
                            qc->tf.feature != SETFEATURES_RA_OFF)
                                break;
                        fallthrough;
                case ATA_CMD_INIT_DEV_PARAMS: /* CHS translation changed */
                case ATA_CMD_SET_MULTI: /* multi_count changed */
                        /* revalidate device */
                        ehi->dev_action[dev->devno] |= ATA_EH_REVALIDATE;
                        ata_port_schedule_eh(ap);
                        break;

                case ATA_CMD_SLEEP:
                        /* remember the device was put to sleep */
                        dev->flags |= ATA_DFLAG_SLEEPING;
                        break;
                }

                if (unlikely(dev->flags & ATA_DFLAG_DUBIOUS_XFER))
                        ata_verify_xfer(qc);

                __ata_qc_complete(qc);
        } else {
                /* old EH: drop the completion if EH has already been scheduled */
                if (qc->flags & ATA_QCFLAG_EH_SCHEDULED)
                        return;

                /* read result TF if failed or requested */
                if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
                        fill_result_tf(qc);

                __ata_qc_complete(qc);
        }
}
EXPORT_SYMBOL_GPL(ata_qc_complete);

/**
 *        ata_qc_get_active - get bitmask of active qcs
 *        @ap: port in question
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Bitmask of active qcs
 */
u64 ata_qc_get_active(struct ata_port *ap)
{
        const u64 internal_bit = 1ULL << ATA_TAG_INTERNAL;
        u64 qc_active = ap->qc_active;

        if (!(qc_active & internal_bit))
                return qc_active;

        /* ATA_TAG_INTERNAL is sent to hw as tag 0: report it as bit 0 */
        return (qc_active | (1 << 0)) & ~internal_bit;
}
EXPORT_SYMBOL_GPL(ata_qc_get_active);

/**
 *        ata_qc_issue - issue taskfile to device
 *        @qc: command to issue to device
 *
 *        Prepare an ATA command for submission to device.
 *        This includes mapping the data into a DMA-able
 *        area, filling in the S/G table, and finally
 *        writing the taskfile to hardware, starting the command.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_qc_issue(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct ata_link *link = qc->dev->link;
        u8 prot = qc->tf.protocol;

        /* Make sure only one non-NCQ command is outstanding.  The
         * check is skipped for old EH because it reuses active qc to
         * request ATAPI sense.
         */
        WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag));

        if (ata_is_ncq(prot)) {
                /* NCQ: the tag must not already be in flight on this link */
                WARN_ON_ONCE(link->sactive & (1 << qc->hw_tag));

                /* first NCQ command on the link makes the link active */
                if (!link->sactive)
                        ap->nr_active_links++;
                link->sactive |= 1 << qc->hw_tag;
        } else {
                /* non-NCQ: no NCQ command may be outstanding */
                WARN_ON_ONCE(link->sactive);

                ap->nr_active_links++;
                link->active_tag = qc->tag;
        }

        /* mark the command active on both the qc and the port */
        qc->flags |= ATA_QCFLAG_ACTIVE;
        ap->qc_active |= 1ULL << qc->tag;

        /*
         * We guarantee to LLDs that they will have at least one
         * non-zero sg if the command is a data command.
         */
        if (ata_is_data(prot) && (!qc->sg || !qc->n_elem || !qc->nbytes))
                goto sys_err;

        /* DMA commands, and PIO commands on ATA_FLAG_PIO_DMA ports, need the sg table mapped */
        if (ata_is_dma(prot) || (ata_is_pio(prot) &&
                                 (ap->flags & ATA_FLAG_PIO_DMA)))
                if (ata_sg_setup(qc))
                        goto sys_err;

        /* if device is sleeping, schedule reset and abort the link */
        if (unlikely(qc->dev->flags & ATA_DFLAG_SLEEPING)) {
                link->eh_info.action |= ATA_EH_RESET;
                ata_ehi_push_desc(&link->eh_info, "waking up from sleep");
                ata_link_abort(link);
                return;
        }

        /* prep, then issue; any error bit from either step completes the qc as failed */
        qc->err_mask |= ap->ops->qc_prep(qc);
        if (unlikely(qc->err_mask))
                goto err;
        trace_ata_qc_issue(qc);
        qc->err_mask |= ap->ops->qc_issue(qc);
        if (unlikely(qc->err_mask))
                goto err;
        return;

        /* setup failure inside this function: flag AC_ERR_SYSTEM, then complete */
sys_err:
        qc->err_mask |= AC_ERR_SYSTEM;
err:
        ata_qc_complete(qc);
}

/**
 *        ata_phys_link_online - test whether the given link is online
 *        @link: ATA link to test
 *
 *        Test whether @link is online.  Note that this function returns
 *        0 if online status of @link cannot be obtained, so
 *        ata_link_online(link) != !ata_link_offline(link).
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        True if the port online status is available and online.
 */
bool ata_phys_link_online(struct ata_link *link)
{
        u32 sstatus;

        /* an unreadable SStatus counts as "not known to be online" */
        if (sata_scr_read(link, SCR_STATUS, &sstatus) != 0)
                return false;

        return ata_sstatus_online(sstatus);
}

/**
 *        ata_phys_link_offline - test whether the given link is offline
 *        @link: ATA link to test
 *
 *        Test whether @link is offline.  Note that this function
 *        returns 0 if offline status of @link cannot be obtained, so
 *        ata_link_online(link) != !ata_link_offline(link).
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        True if the port offline status is available and offline.
 */
bool ata_phys_link_offline(struct ata_link *link)
{
        u32 sstatus;

        /* an unreadable SStatus counts as "not known to be offline" */
        if (sata_scr_read(link, SCR_STATUS, &sstatus) != 0)
                return false;

        return !ata_sstatus_online(sstatus);
}

/**
 *        ata_link_online - test whether the given link is online
 *        @link: ATA link to test
 *
 *        Test whether @link is online.  This is identical to
 *        ata_phys_link_online() when there's no slave link.  When
 *        there's a slave link, this function should only be called on
 *        the master link and will return true if any of M/S links is
 *        online.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        True if the port online status is available and online.
 */
bool ata_link_online(struct ata_link *link)
{
        struct ata_link *slave = link->ap->slave_link;

        WARN_ON(link == slave);        /* shouldn't be called on slave link */

        return ata_phys_link_online(link) ||
                (slave && ata_phys_link_online(slave));
}
EXPORT_SYMBOL_GPL(ata_link_online);

/**
 *        ata_link_offline - test whether the given link is offline
 *        @link: ATA link to test
 *
 *        Test whether @link is offline.  This is identical to
 *        ata_phys_link_offline() when there's no slave link.  When
 *        there's a slave link, this function should only be called on
 *        the master link and will return true if both M/S links are
 *        offline.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        True if the port offline status is available and offline.
 */
bool ata_link_offline(struct ata_link *link)
{
        struct ata_link *slave = link->ap->slave_link;

        WARN_ON(link == slave);        /* shouldn't be called on slave link */

        return ata_phys_link_offline(link) &&
                (!slave || ata_phys_link_offline(slave));
}
EXPORT_SYMBOL_GPL(ata_link_offline);

#ifdef CONFIG_PM
/*
 * Hand a PM request for @ap over to EH.
 *
 * Records @mesg in ap->pm_mesg, sets ATA_PFLAG_PM_PENDING, ORs @action
 * and @ehi_flags into the eh_info of every link on the port and
 * schedules EH.  Unless @async is true, waits for EH before returning.
 */
static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg,
                                unsigned int action, unsigned int ehi_flags,
                                bool async)
{
        struct ata_link *link;
        unsigned long flags;

        spin_lock_irqsave(ap->lock, flags);

        /*
         * A previous PM operation might still be in progress. Wait for
         * ATA_PFLAG_PM_PENDING to clear.
         */
        if (ap->pflags & ATA_PFLAG_PM_PENDING) {
                /* the lock is released around the wait and re-taken afterwards */
                spin_unlock_irqrestore(ap->lock, flags);
                ata_port_wait_eh(ap);
                spin_lock_irqsave(ap->lock, flags);
        }

        /* Request PM operation to EH */
        ap->pm_mesg = mesg;
        ap->pflags |= ATA_PFLAG_PM_PENDING;
        ata_for_each_link(link, ap, HOST_FIRST) {
                link->eh_info.action |= action;
                link->eh_info.flags |= ehi_flags;
        }

        ata_port_schedule_eh(ap);

        spin_unlock_irqrestore(ap->lock, flags);

        /* synchronous callers block here until EH is done */
        if (!async)
                ata_port_wait_eh(ap);
}

/*
 * On some hardware, device fails to respond after spun down for suspend.  As
 * the device won't be used before being resumed, we don't need to touch the
 * device.  Ask EH to skip the usual stuff and proceed directly to suspend.
 *
 * http://thread.gmane.org/gmane.linux.ide/46764
 */
/* EH info flags passed along with every suspend request */
static const unsigned int ata_port_suspend_ehi = ATA_EHI_QUIET
                                                 | ATA_EHI_NO_AUTOPSY
                                                 | ATA_EHI_NO_RECOVERY;

/* Synchronous suspend: request PM with no EH action and async == false. */
static void ata_port_suspend(struct ata_port *ap, pm_message_t mesg)
{
        ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, false);
}

/* Same request as ata_port_suspend(), but issued with async == true. */
static void ata_port_suspend_async(struct ata_port *ap, pm_message_t mesg)
{
        ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, true);
}

static int ata_port_pm_suspend(struct device *dev)
{
        /* a port that is already runtime-suspended is left alone */
        if (!pm_runtime_suspended(dev))
                ata_port_suspend(to_ata_port(dev), PMSG_SUSPEND);

        return 0;
}

static int ata_port_pm_freeze(struct device *dev)
{
        /* a port that is already runtime-suspended is left alone */
        if (!pm_runtime_suspended(dev))
                ata_port_suspend(to_ata_port(dev), PMSG_FREEZE);

        return 0;
}

static int ata_port_pm_poweroff(struct device *dev)
{
        struct ata_port *ap = to_ata_port(dev);

        /* unlike suspend/freeze, no runtime-suspended check is made here */
        ata_port_suspend(ap, PMSG_HIBERNATE);
        return 0;
}

/* EH info flags passed along with every resume request */
static const unsigned int ata_port_resume_ehi = ATA_EHI_NO_AUTOPSY
                                                | ATA_EHI_QUIET;

/* Synchronous resume: request PM with an ATA_EH_RESET action and async == false. */
static void ata_port_resume(struct ata_port *ap, pm_message_t mesg)
{
        ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, false);
}

/* Same request as ata_port_resume(), but issued with async == true. */
static void ata_port_resume_async(struct ata_port *ap, pm_message_t mesg)
{
        ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, true);
}

static int ata_port_pm_resume(struct device *dev)
{
        struct ata_port *ap = to_ata_port(dev);

        /* kick off the resume without waiting for it */
        ata_port_resume_async(ap, PMSG_RESUME);

        /* mark the device runtime-active, with runtime PM disabled meanwhile */
        pm_runtime_disable(dev);
        pm_runtime_set_active(dev);
        pm_runtime_enable(dev);

        return 0;
}

/*
 * For ODDs, the upper layer will poll for media change every few seconds,
 * which will make it enter and leave suspend state every few seconds. And
 * as each suspend will cause a hard/soft reset, the gain of runtime suspend
 * is very little and the ODD may malfunction after constantly being reset.
 * So the idle callback here will not proceed to suspend if a non-ZPODD capable
 * ODD is attached to the port.
 */
static int ata_port_runtime_idle(struct device *dev)
{
        struct ata_port *ap = to_ata_port(dev);
        struct ata_device *adev;
        struct ata_link *link;

        ata_for_each_link(link, ap, HOST_FIRST) {
                ata_for_each_dev(adev, link, ENABLED) {
                        /* only ATAPI devices can veto runtime suspend */
                        if (adev->class != ATA_DEV_ATAPI)
                                continue;

                        /* ... and only when ZPODD is not enabled for them */
                        if (!zpodd_dev_enabled(adev))
                                return -EBUSY;
                }
        }

        return 0;
}

static int ata_port_runtime_suspend(struct device *dev)
{
        struct ata_port *ap = to_ata_port(dev);

        /* synchronous suspend carrying the auto-suspend PM message */
        ata_port_suspend(ap, PMSG_AUTO_SUSPEND);
        return 0;
}

static int ata_port_runtime_resume(struct device *dev)
{
        struct ata_port *ap = to_ata_port(dev);

        /* synchronous resume carrying the auto-resume PM message */
        ata_port_resume(ap, PMSG_AUTO_RESUME);
        return 0;
}

/*
 * PM callbacks for ATA port devices.  Resume, thaw and restore all share
 * ata_port_pm_resume(); suspend, freeze and poweroff each have their own
 * handler.
 */
static const struct dev_pm_ops ata_port_pm_ops = {
        .suspend = ata_port_pm_suspend,
        .resume = ata_port_pm_resume,
        .freeze = ata_port_pm_freeze,
        .thaw = ata_port_pm_resume,
        .poweroff = ata_port_pm_poweroff,
        .restore = ata_port_pm_resume,

        /* runtime PM callbacks */
        .runtime_suspend = ata_port_runtime_suspend,
        .runtime_resume = ata_port_runtime_resume,
        .runtime_idle = ata_port_runtime_idle,
};

/* sas ports don't participate in pm runtime management of ata_ports,
 * and need to resume ata devices at the domain level, not the per-port
 * level. sas suspend/resume is async to allow parallel port recovery
 * since sas has multiple ata_port instances per Scsi_Host.
 */
/* Request an asynchronous suspend of @ap with PMSG_SUSPEND. */
void ata_sas_port_suspend(struct ata_port *ap)
{
        ata_port_suspend_async(ap, PMSG_SUSPEND);
}
EXPORT_SYMBOL_GPL(ata_sas_port_suspend);

/* Request an asynchronous resume of @ap with PMSG_RESUME. */
void ata_sas_port_resume(struct ata_port *ap)
{
        ata_port_resume_async(ap, PMSG_RESUME);
}
EXPORT_SYMBOL_GPL(ata_sas_port_resume);

/**
 *        ata_host_suspend - suspend host
 *        @host: host to suspend
 *        @mesg: PM message
 *
 *        Suspend @host.  Actual operation is performed by port suspend.
 */
int ata_host_suspend(struct ata_host *host, pm_message_t mesg)
{
        /* only record the requested PM state on the host's device; always succeeds */
        host->dev->power.power_state = mesg;
        return 0;
}
EXPORT_SYMBOL_GPL(ata_host_suspend);

/**
 *        ata_host_resume - resume host
 *        @host: host to resume
 *
 *        Resume @host.  Actual operation is performed by port resume.
 */
void ata_host_resume(struct ata_host *host)
{
        /* only record PMSG_ON as the power state of the host's device */
        host->dev->power.power_state = PMSG_ON;
}
EXPORT_SYMBOL_GPL(ata_host_resume);
#endif

/* Device type for ATA ports; the PM ops are attached only when CONFIG_PM is set. */
const struct device_type ata_port_type = {
        .name = ATA_PORT_TYPE_NAME,
#ifdef CONFIG_PM
        .pm = &ata_port_pm_ops,
#endif
};

/**
 *        ata_dev_init - Initialize an ata_device structure
 *        @dev: Device structure to initialize
 *
 *        Initialize @dev in preparation for probing.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_dev_init(struct ata_device *dev)
{
        struct ata_link *link = ata_dev_phys_link(dev);
        struct ata_port *ap = link->ap;
        const size_t clear_len = ATA_DEVICE_CLEAR_END - ATA_DEVICE_CLEAR_BEGIN;
        unsigned long irq_flags;

        /* SATA spd limit is bound to the attached device, reset together */
        link->sata_spd = 0;
        link->sata_spd_limit = link->hw_sata_spd_limit;

        /*
         * High bits of dev->flags are used to record warm plug
         * requests which occur asynchronously, so the update is
         * done under the host lock.
         */
        spin_lock_irqsave(ap->lock, irq_flags);
        dev->flags &= ~ATA_DFLAG_INIT_MASK;
        dev->horkage = 0;
        spin_unlock_irqrestore(ap->lock, irq_flags);

        /* wipe the clearable region, then open up all transfer-mode masks */
        memset((void *)dev + ATA_DEVICE_CLEAR_BEGIN, 0, clear_len);
        dev->pio_mask = UINT_MAX;
        dev->mwdma_mask = UINT_MAX;
        dev->udma_mask = UINT_MAX;
}

/**
 *        ata_link_init - Initialize an ata_link structure
 *        @ap: ATA port link is attached to
 *        @link: Link structure to initialize
 *        @pmp: Port multiplier port number
 *
 *        Initialize @link.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp)
{
        int devno;

        /* clear everything except for devices */
        memset((void *)link + ATA_LINK_CLEAR_BEGIN, 0,
               ATA_LINK_CLEAR_END - ATA_LINK_CLEAR_BEGIN);

        link->ap = ap;
        link->pmp = pmp;
        link->hw_sata_spd_limit = UINT_MAX;
        link->active_tag = ATA_TAG_POISON;

        /* can't use iterator, ap isn't initialized yet */
        for (devno = 0; devno < ATA_MAX_DEVICES; devno++) {
                struct ata_device *dev = &link->device[devno];

                /* back-pointer and slot number within the link */
                dev->link = link;
                dev->devno = devno;
#ifdef CONFIG_ATA_ACPI
                dev->gtf_filter = ata_acpi_gtf_filter;
#endif
                ata_dev_init(dev);
        }
}

/**
 *        sata_link_init_spd - Initialize link->sata_spd_limit
 *        @link: Link to configure sata_spd_limit for
 *
 *        Initialize ``link->[hw_]sata_spd_limit`` to the currently
 *        configured value.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno on failure.
 */
int sata_link_init_spd(struct ata_link *link)
{
        int rc = sata_scr_read(link, SCR_CONTROL, &link->saved_scontrol);
        u8 spd;

        if (rc != 0)
                return rc;

        /* bits 7:4 of the saved SControl value; non-zero caps the hw limit */
        spd = (link->saved_scontrol >> 4) & 0xf;
        if (spd != 0)
                link->hw_sata_spd_limit &= (1 << spd) - 1;

        ata_force_link_limits(link);

        /* the working limit starts out equal to the hardware limit */
        link->sata_spd_limit = link->hw_sata_spd_limit;

        return 0;
}

/**
 *        ata_port_alloc - allocate and initialize basic ATA port resources
 *        @host: ATA host this allocated port belongs to
 *
 *        Allocate and initialize basic ATA port resources.
 *
 *        RETURNS:
 *        Allocated ATA port on success, NULL on failure.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 */
struct ata_port *ata_port_alloc(struct ata_host *host)
{
        struct ata_port *ap;

        DPRINTK("ENTER\n");

        ap = kzalloc(sizeof(*ap), GFP_KERNEL);
        if (ap == NULL)
                return NULL;

        /* tie the port to its host; ids stay at -1 for now */
        ap->host = host;
        ap->dev = host->dev;
        ap->lock = &host->lock;
        ap->print_id = -1;
        ap->local_port_no = -1;

        /* a new port starts out flagged as initializing and frozen */
        ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN;
        ap->cbl = ATA_CBL_NONE;

#if defined(ATA_VERBOSE_DEBUG)
        /* turn on all debugging levels */
        ap->msg_enable = 0x00FF;
#elif defined(ATA_DEBUG)
        ap->msg_enable = ATA_MSG_DRV | ATA_MSG_INFO | ATA_MSG_CTL | ATA_MSG_WARN | ATA_MSG_ERR;
#else
        ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN;
#endif

        /* locks, work items, wait queues and timers owned by the port */
        mutex_init(&ap->scsi_scan_mutex);
        INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug);
        INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan);
        INIT_LIST_HEAD(&ap->eh_done_q);
        init_waitqueue_head(&ap->eh_wait_q);
        init_completion(&ap->park_req_pending);
        timer_setup(&ap->fastdrain_timer, ata_eh_fastdrain_timerfn,
                    TIMER_DEFERRABLE);

        /* the embedded host link is initialized with pmp 0 */
        ata_link_init(ap, &ap->link, 0);

#ifdef ATA_IRQ_TRAP
        ap->stats.unhandled_irq = 1;
        ap->stats.idle_irq = 1;
#endif
        ata_sff_port_init(ap);

        return ap;
}

static void ata_devres_release(struct device *gendev, void *res)
{
        struct ata_host *host = dev_get_drvdata(gendev);
        int i;

        /* drop the SCSI host reference of every port that has one */
        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                if (ap && ap->scsi_host)
                        scsi_host_put(ap->scsi_host);
        }

        /* detach the host from the device, then drop a host reference */
        dev_set_drvdata(gendev, NULL);
        ata_host_put(host);
}

static void ata_host_release(struct kref *kref)
{
        struct ata_host *host = container_of(kref, struct ata_host, kref);
        int i;

        /* free every allocated port along with its PMP and slave links */
        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                if (ap) {
                        kfree(ap->pmp_link);
                        kfree(ap->slave_link);
                        kfree(ap);
                        host->ports[i] = NULL;
                }
        }

        kfree(host);
}

/* Take a reference on @host's kref. */
void ata_host_get(struct ata_host *host)
{
        kref_get(&host->kref);
}

/*
 * Drop a reference on @host; ata_host_release() runs when the last
 * reference goes away.
 */
void ata_host_put(struct ata_host *host)
{
        struct kref *ref = &host->kref;

        kref_put(ref, ata_host_release);
}
EXPORT_SYMBOL_GPL(ata_host_put);

/**
 *        ata_host_alloc - allocate and init basic ATA host resources
 *        @dev: generic device this host is associated with
 *        @max_ports: maximum number of ATA ports associated with this host
 *
 *        Allocate and initialize basic ATA host resources.  The LLD
 *        calls this function to allocate a host, initializes it fully
 *        and attaches it using ata_host_register().
 *
 *        @max_ports ports are allocated and host->n_ports is
 *        initialized to @max_ports.  The caller is allowed to decrease
 *        host->n_ports before calling ata_host_register().  The unused
 *        ports will be automatically freed on registration.
 *
 *        RETURNS:
 *        Allocated ATA host on success, NULL on failure.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 */
struct ata_host *ata_host_alloc(struct device *dev, int max_ports)
{
        struct ata_host *host;
        struct ata_port *port;
        void *release_res;
        size_t alloc_sz;
        int port_idx;

        DPRINTK("ENTER\n");

        /*
         * The host is followed by its port pointer array: @max_ports
         * slots plus one extra slot that stays NULL (kzalloc zeroes it).
         */
        alloc_sz = sizeof(struct ata_host) + (max_ports + 1) * sizeof(void *);
        host = kzalloc(alloc_sz, GFP_KERNEL);
        if (host == NULL)
                return NULL;

        if (devres_open_group(dev, NULL, GFP_KERNEL) == NULL) {
                kfree(host);
                return NULL;
        }

        release_res = devres_alloc(ata_devres_release, 0, GFP_KERNEL);
        if (release_res == NULL) {
                /*
                 * The release action is not registered yet, so the
                 * host has to be freed by hand before the group is
                 * released.
                 */
                kfree(host);
                goto err_out;
        }

        /* from here on the host is torn down via ata_devres_release() */
        devres_add(dev, release_res);
        dev_set_drvdata(dev, host);

        spin_lock_init(&host->lock);
        mutex_init(&host->eh_mutex);
        host->dev = dev;
        host->n_ports = max_ports;
        kref_init(&host->kref);

        /* allocate ports bound to this host */
        for (port_idx = 0; port_idx < max_ports; port_idx++) {
                port = ata_port_alloc(host);
                if (port == NULL)
                        goto err_out;

                port->port_no = port_idx;
                host->ports[port_idx] = port;
        }

        devres_remove_group(dev, NULL);
        return host;

 err_out:
        devres_release_group(dev, NULL);
        return NULL;
}
EXPORT_SYMBOL_GPL(ata_host_alloc);

/**
 *        ata_host_alloc_pinfo - alloc host and init with port_info array
 *        @dev: generic device this host is associated with
 *        @ppi: array of ATA port_info to initialize host with
 *        @n_ports: number of ATA ports attached to this host
 *
 *        Allocate ATA host and initialize with info from @ppi.  If NULL
 *        terminated, @ppi may contain fewer entries than @n_ports.  The
 *        last entry will be used for the remaining ports.  If @ppi
 *        starts with NULL, the dummy port info is used for every port.
 *
 *        RETURNS:
 *        Allocated ATA host on success, NULL on failure.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 */
struct ata_host *ata_host_alloc_pinfo(struct device *dev,
                                      const struct ata_port_info * const * ppi,
                                      int n_ports)
{
        const struct ata_port_info * const *cursor = ppi;
        const struct ata_port_info *info = &ata_dummy_port_info;
        struct ata_host *host;
        int idx;

        host = ata_host_alloc(dev, n_ports);
        if (host == NULL)
                return NULL;

        for (idx = 0; idx < host->n_ports; idx++) {
                struct ata_port *port = host->ports[idx];

                /* advance only while entries remain; else reuse the last */
                if (*cursor)
                        info = *cursor++;

                port->pio_mask = info->pio_mask;
                port->mwdma_mask = info->mwdma_mask;
                port->udma_mask = info->udma_mask;
                port->flags |= info->flags;
                port->link.flags |= info->link_flags;
                port->ops = info->port_ops;

                /* host ops default to the first non-dummy port ops */
                if (host->ops || info->port_ops == &ata_dummy_port_ops)
                        continue;
                host->ops = info->port_ops;
        }

        return host;
}
EXPORT_SYMBOL_GPL(ata_host_alloc_pinfo);

static void ata_host_stop(struct device *gendev, void *res)
{
        struct ata_host *host = dev_get_drvdata(gendev);
        int i;

        WARN_ON(!(host->flags & ATA_HOST_STARTED));

        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                if (ap->ops->port_stop)
                        ap->ops->port_stop(ap);
        }

        if (host->ops->host_stop)
                host->ops->host_stop(host);
}

/**
 *        ata_finalize_port_ops - finalize ata_port_operations
 *        @ops: ata_port_operations to finalize
 *
 *        An ata_port_operations can inherit from another ops and that
 *        ops can again inherit from another.  This can go on as many
 *        times as necessary as long as there is no loop in the
 *        inheritance chain.
 *
 *        Ops tables are finalized when the host is started.  NULL or
 *        unspecified entries are inherited from the closest ancestor
 *        which has the method and the entry is populated with it.
 *        After finalization, the ops table directly points to all the
 *        methods and ->inherits is no longer necessary and cleared.
 *
 *        Using ATA_OP_NULL, inheriting ops can force a method to NULL.
 *
 *        A NULL @ops, or one without ->inherits, is left untouched.
 *
 *        LOCKING:
 *        None.
 */
static void ata_finalize_port_ops(struct ata_port_operations *ops)
{
        static DEFINE_SPINLOCK(lock);
        const struct ata_port_operations *ancestor;
        /* the table is treated as an array of pointers up to ->inherits */
        void **first = (void **)ops;
        void **last = (void **)&ops->inherits;
        void **slot;

        if (!ops || !ops->inherits)
                return;

        spin_lock(&lock);

        /* walk up the chain, nearest ancestor first, filling empty slots */
        ancestor = ops->inherits;
        while (ancestor) {
                void **src = (void **)ancestor;

                for (slot = first; slot < last; slot++, src++) {
                        if (*slot == NULL)
                                *slot = *src;
                }
                ancestor = ancestor->inherits;
        }

        /*
         * Entries holding an error pointer (presumably ATA_OP_NULL)
         * are reset to NULL.
         */
        for (slot = first; slot < last; slot++) {
                if (IS_ERR(*slot))
                        *slot = NULL;
        }

        ops->inherits = NULL;

        spin_unlock(&lock);
}

/**
 *        ata_host_start - start and freeze ports of an ATA host
 *        @host: ATA host to start ports for
 *
 *        Start and then freeze ports of @host.  Started status is
 *        recorded in host->flags, so this function can be called
 *        multiple times.  Ports are guaranteed to get started only
 *        once.  If host->ops isn't initialized yet, its set to the
 *        first non-dummy port ops.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 if all ports are started successfully, -errno otherwise.
 */
int ata_host_start(struct ata_host *host)
{
        int have_stop = 0;
        void *start_dr = NULL;
        int i, rc;

        if (host->flags & ATA_HOST_STARTED)
                return 0;

        ata_finalize_port_ops(host->ops);

        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                ata_finalize_port_ops(ap->ops);

                if (!host->ops && !ata_port_is_dummy(ap))
                        host->ops = ap->ops;

                if (ap->ops->port_stop)
                        have_stop = 1;
        }

        if (host->ops && host->ops->host_stop)
                have_stop = 1;

        if (have_stop) {
                start_dr = devres_alloc(ata_host_stop, 0, GFP_KERNEL);
                if (!start_dr)
                        return -ENOMEM;
        }

        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                if (ap->ops->port_start) {
                        rc = ap->ops->port_start(ap);
                        if (rc) {
                                if (rc != -ENODEV)
                                        dev_err(host->dev,
                                                "failed to start port %d (errno=%d)\n",
                                                i, rc);
                                goto err_out;
                        }
                }
                ata_eh_freeze_port(ap);
        }

        if (start_dr)
                devres_add(host->dev, start_dr);
        host->flags |= ATA_HOST_STARTED;
        return 0;

 err_out:
        while (--i >= 0) {
                struct ata_port *ap = host->ports[i];

                if (ap->ops->port_stop)
                        ap->ops->port_stop(ap);
        }
        devres_free(start_dr);
        return rc;
}
EXPORT_SYMBOL_GPL(ata_host_start);

/**
 *        ata_host_init - Initialize a host struct for sas (ipr, libsas)
 *        @host:        host to initialize
 *        @dev:        device host is attached to
 *        @ops:        port_ops
 *
 *        Sets the device, ops and tag count (ATA_MAX_QUEUE) of @host
 *        and initializes its lock, EH mutex and reference count.
 */
void ata_host_init(struct ata_host *host, struct device *dev,
                   struct ata_port_operations *ops)
{
        /* plain field setup */
        host->dev = dev;
        host->ops = ops;
        host->n_tags = ATA_MAX_QUEUE;

        /* synchronization primitives and refcount */
        spin_lock_init(&host->lock);
        mutex_init(&host->eh_mutex);
        kref_init(&host->kref);
}
EXPORT_SYMBOL_GPL(ata_host_init);

/*
 * Kick EH to perform the initial probe of @ap.
 *
 * Under ap->lock: requests a reset probing all devices with autopsy
 * skipped and reporting quiet, moves the port from INITIALIZING to
 * LOADING state and schedules EH.  Does not wait for EH to finish.
 */
void __ata_port_probe(struct ata_port *ap)
{
        struct ata_eh_info *eh_info = &ap->link.eh_info;
        unsigned long irq_flags;

        spin_lock_irqsave(ap->lock, irq_flags);

        /* describe the work EH has to do */
        eh_info->probe_mask |= ATA_ALL_DEVICES;
        eh_info->action |= ATA_EH_RESET;
        eh_info->flags |= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET;

        /* INITIALIZING -> LOADING, then hand over to EH */
        ap->pflags = (ap->pflags & ~ATA_PFLAG_INITIALIZING) |
                     ATA_PFLAG_LOADING;
        ata_port_schedule_eh(ap);

        spin_unlock_irqrestore(ap->lock, irq_flags);
}

/*
 * Probe @ap.
 *
 * Ports with an ->error_handler are probed through EH and this waits
 * for EH to finish; 0 is returned in that case.  Other ports are
 * probed via ata_bus_probe(), whose result is returned.
 */
int ata_port_probe(struct ata_port *ap)
{
        int ret;

        if (ap->ops->error_handler) {
                __ata_port_probe(ap);
                ata_port_wait_eh(ap);
                return 0;
        }

        DPRINTK("ata%u: bus probe begin\n", ap->print_id);
        ret = ata_bus_probe(ap);
        DPRINTK("ata%u: bus probe end\n", ap->print_id);

        return ret;
}


/*
 * Async worker scheduled by ata_host_register(): probes the port
 * passed in @data and then scans it for SCSI devices.
 */
static void async_port_probe(void *data, async_cookie_t cookie)
{
        struct ata_port *port = data;
        bool serialize;

        /*
         * Hosts that do not allow parallel scanning must wait for all
         * earlier probes before starting this one.  Port 0 never
         * waits here; only later ports do.
         */
        serialize = !(port->host->flags & ATA_HOST_PARALLEL_SCAN) &&
                    port->port_no != 0;
        if (serialize)
                async_synchronize_cookie(cookie);

        /* the probe result is deliberately ignored */
        (void)ata_port_probe(port);

        /* synchronize again so that device order is kept */
        async_synchronize_cookie(cookie);

        ata_scsi_scan_host(port, 1);
}

/**
 *        ata_host_register - register initialized ATA host
 *        @host: ATA host to register
 *        @sht: template for SCSI host
 *
 *        Register initialized ATA host.  @host is allocated using
 *        ata_host_alloc() and fully initialized by LLD.  This function
 *        starts ports, registers @host with ATA and SCSI layers and
 *        probe registered devices.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
{
        int i, rc;

        host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE);

        /* host must have been started */
        if (!(host->flags & ATA_HOST_STARTED)) {
                dev_err(host->dev, "BUG: trying to register unstarted host\n");
                WARN_ON(1);
                return -EINVAL;
        }

        /* Blow away unused ports.  This happens when LLD can't
         * determine the exact number of ports to allocate at
         * allocation time.
         */
        for (i = host->n_ports; host->ports[i]; i++)
                kfree(host->ports[i]);

        /* give ports names and add SCSI hosts */
        for (i = 0; i < host->n_ports; i++) {
                host->ports[i]->print_id = atomic_inc_return(&ata_print_id);
                host->ports[i]->local_port_no = i + 1;
        }

        /* Create associated sysfs transport objects  */
        for (i = 0; i < host->n_ports; i++) {
                rc = ata_tport_add(host->dev,host->ports[i]);
                if (rc) {
                        goto err_tadd;
                }
        }

        rc = ata_scsi_add_hosts(host, sht);
        if (rc)
                goto err_tadd;

        /* set cable, sata_spd_limit and report */
        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];
                unsigned long xfer_mask;

                /* set SATA cable type if still unset */
                if (ap->cbl == ATA_CBL_NONE && (ap->flags & ATA_FLAG_SATA))
                        ap->cbl = ATA_CBL_SATA;

                /* init sata_spd_limit to the current value */
                sata_link_init_spd(&ap->link);
                if (ap->slave_link)
                        sata_link_init_spd(ap->slave_link);

                /* print per-port info to dmesg */
                xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask,
                                              ap->udma_mask);

                if (!ata_port_is_dummy(ap)) {
                        ata_port_info(ap, "%cATA max %s %s\n",
                                      (ap->flags & ATA_FLAG_SATA) ? 'S' : 'P',
                                      ata_mode_string(xfer_mask),
                                      ap->link.eh_info.desc);
                        ata_ehi_clear_desc(&ap->link.eh_info);
                } else
                        ata_port_info(ap, "DUMMY\n");
        }

        /* perform each probe asynchronously */
        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];
                ap->cookie = async_schedule(async_port_probe, ap);
        }

        return 0;

 err_tadd:
        while (--i >= 0) {
                ata_tport_delete(host->ports[i]);
        }
        return rc;

}
EXPORT_SYMBOL_GPL(ata_host_register);

/**
 *        ata_host_activate - start host, request IRQ and register it
 *        @host: target ATA host
 *        @irq: IRQ to request
 *        @irq_handler: irq_handler used when requesting IRQ
 *        @irq_flags: irq_flags used when requesting IRQ
 *        @sht: scsi_host_template to use when registering the host
 *
 *        After allocating an ATA host and initializing it, most libata
 *        LLDs perform three steps to activate the host - start host,
 *        request IRQ and register it.  This helper takes necessary
 *        arguments and performs the three steps in one go.
 *
 *        An invalid IRQ (0) skips the IRQ registration and expects the
 *        host to have set polling mode on the port.  In this case,
 *        @irq_handler should be NULL.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_host_activate(struct ata_host *host, int irq,
                      irq_handler_t irq_handler, unsigned long irq_flags,
                      struct scsi_host_template *sht)
{
        char *irq_name;
        int port_idx, ret;

        ret = ata_host_start(host);
        if (ret != 0)
                return ret;

        if (irq == 0) {
                /* polling mode: no handler expected, nothing to request */
                WARN_ON(irq_handler);
                return ata_host_register(host, sht);
        }

        /* the IRQ is named "<driver>[<device>]" */
        irq_name = devm_kasprintf(host->dev, GFP_KERNEL, "%s[%s]",
                                  dev_driver_string(host->dev),
                                  dev_name(host->dev));
        if (irq_name == NULL)
                return -ENOMEM;

        ret = devm_request_irq(host->dev, irq, irq_handler, irq_flags,
                               irq_name, host);
        if (ret != 0)
                return ret;

        port_idx = 0;
        while (port_idx < host->n_ports) {
                ata_port_desc(host->ports[port_idx], "irq %d", irq);
                port_idx++;
        }

        ret = ata_host_register(host, sht);
        if (ret != 0) {
                /* registration failed: release the IRQ, leave ports alone */
                devm_free_irq(host->dev, irq, host);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(ata_host_activate);

/**
 *        ata_port_detach - Detach ATA port in preparation of device removal
 *        @ap: ATA port to be detached
 *
 *        Detach all ATA devices and the associated SCSI devices of @ap;
 *        then, remove the associated SCSI host.  @ap is guaranteed to
 *        be quiescent on return from this function.
 *
 *        Ports without an ->error_handler skip the EH based teardown
 *        and only go through the final cleanup (zpodd, PMP transport
 *        links, SCSI host and transport port removal).
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 */
static void ata_port_detach(struct ata_port *ap)
{
        unsigned long flags;
        struct ata_link *link;
        struct ata_device *dev;

        /* no EH on this port: nothing to wait for or to unload via EH */
        if (!ap->ops->error_handler)
                goto skip_eh;

        /* Wait for any ongoing EH */
        ata_port_wait_eh(ap);

        /* scsi_scan_mutex is taken outside ap->lock */
        mutex_lock(&ap->scsi_scan_mutex);
        spin_lock_irqsave(ap->lock, flags);

        /* Remove scsi devices */
        ata_for_each_link(link, ap, HOST_FIRST) {
                ata_for_each_dev(dev, link, ALL) {
                        if (dev->sdev) {
                                /*
                                 * ap->lock is dropped around
                                 * scsi_remove_device() (presumably
                                 * because it may sleep) and retaken
                                 * before dev->sdev is cleared.
                                 */
                                spin_unlock_irqrestore(ap->lock, flags);
                                scsi_remove_device(dev->sdev);
                                spin_lock_irqsave(ap->lock, flags);
                                dev->sdev = NULL;
                        }
                }
        }

        /* Tell EH to disable all devices */
        ap->pflags |= ATA_PFLAG_UNLOADING;
        ata_port_schedule_eh(ap);

        spin_unlock_irqrestore(ap->lock, flags);
        mutex_unlock(&ap->scsi_scan_mutex);

        /* wait till EH commits suicide */
        ata_port_wait_eh(ap);

        /* it better be dead now */
        WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED));

        /* make sure no hotplug work is still queued or running */
        cancel_delayed_work_sync(&ap->hotplug_task);

 skip_eh:
        /* clean up zpodd on port removal */
        ata_for_each_link(link, ap, HOST_FIRST) {
                ata_for_each_dev(dev, link, ALL) {
                        if (zpodd_dev_enabled(dev))
                                zpodd_exit(dev);
                }
        }
        /* delete the transport link of every PMP link slot, if allocated */
        if (ap->pmp_link) {
                int i;
                for (i = 0; i < SATA_PMP_MAX_PORTS; i++)
                        ata_tlink_delete(&ap->pmp_link[i]);
        }
        /* remove the associated SCSI host */
        scsi_remove_host(ap->scsi_host);
        ata_tport_delete(ap);
}

/**
 *        ata_host_detach - Detach all ports of an ATA host
 *        @host: Host to detach
 *
 *        Detach all ports of @host.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 */
void ata_host_detach(struct ata_host *host)
{
        int i;

        for (i = 0; i < host->n_ports; i++) {
                /* Ensure ata_port probe has completed */
                async_synchronize_cookie(host->ports[i]->cookie + 1);
                ata_port_detach(host->ports[i]);
        }

        /* the host is dead now, dissociate ACPI */
        ata_acpi_dissociate(host);
}
EXPORT_SYMBOL_GPL(ata_host_detach);

#ifdef CONFIG_PCI

/**
 *        ata_pci_remove_one - PCI layer callback for device removal
 *        @pdev: PCI device that was removed
 *
 *        PCI layer indicates to libata via this hook that hot-unplug or
 *        module unload event has occurred.  Detach all ports of the
 *        ATA host stored as driver data of @pdev.  Resource release is
 *        handled via devres.
 *
 *        LOCKING:
 *        Inherited from PCI layer (may sleep).
 */
void ata_pci_remove_one(struct pci_dev *pdev)
{
        ata_host_detach(pci_get_drvdata(pdev));
}
EXPORT_SYMBOL_GPL(ata_pci_remove_one);

void ata_pci_shutdown_one(struct pci_dev *pdev)
{
        struct ata_host *host = pci_get_drvdata(pdev);
        int i;

        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];

                ap->pflags |= ATA_PFLAG_FROZEN;

                /* Disable port interrupts */
                if (ap->ops->freeze)
                        ap->ops->freeze(ap);

                /* Stop the port DMA engines */
                if (ap->ops->port_stop)
                        ap->ops->port_stop(ap);
        }
}
EXPORT_SYMBOL_GPL(ata_pci_shutdown_one);

/*
 * Read the PCI config register described by @bits (1, 2 or 4 bytes
 * wide), mask it with bits->mask and compare against bits->val.
 *
 * Returns 1 on match, 0 on mismatch and -EINVAL for an unsupported
 * width.  The result of the config read itself is not checked.
 *
 * (move to PCI subsystem)
 */
int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
{
        unsigned long value = 0;
        u8 byte = 0;
        u16 word = 0;
        u32 dword = 0;

        if (bits->width == 1) {
                pci_read_config_byte(pdev, bits->reg, &byte);
                value = byte;
        } else if (bits->width == 2) {
                pci_read_config_word(pdev, bits->reg, &word);
                value = word;
        } else if (bits->width == 4) {
                pci_read_config_dword(pdev, bits->reg, &dword);
                value = dword;
        } else {
                return -EINVAL;
        }

        value &= bits->mask;

        return value == bits->val;
}
EXPORT_SYMBOL_GPL(pci_test_config_bits);

#ifdef CONFIG_PM
/*
 * PCI half of suspend: save config state and disable @pdev.  The
 * device is additionally put into D3hot when @mesg is a sleep event.
 */
void ata_pci_device_do_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
        pci_save_state(pdev);
        pci_disable_device(pdev);

        if (!(mesg.event & PM_EVENT_SLEEP))
                return;

        pci_set_power_state(pdev, PCI_D3hot);
}
EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend);

/*
 * PCI half of resume: bring @pdev back to D0, restore its config
 * state, re-enable it and turn bus mastering back on.
 *
 * Returns 0 on success or the error from pcim_enable_device().
 */
int ata_pci_device_do_resume(struct pci_dev *pdev)
{
        int err;

        pci_set_power_state(pdev, PCI_D0);
        pci_restore_state(pdev);

        err = pcim_enable_device(pdev);
        if (!err) {
                pci_set_master(pdev);
                return 0;
        }

        dev_err(&pdev->dev,
                "failed to enable device after resume (%d)\n", err);
        return err;
}
EXPORT_SYMBOL_GPL(ata_pci_device_do_resume);

/*
 * Suspend the ATA host stored as driver data of @pdev and, if that
 * succeeded, the PCI device itself.
 *
 * Returns 0 on success or the error from ata_host_suspend(), in which
 * case the PCI device is left untouched.
 */
int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
        struct ata_host *ata_hst = pci_get_drvdata(pdev);
        int err = ata_host_suspend(ata_hst, mesg);

        if (!err)
                ata_pci_device_do_suspend(pdev, mesg);

        return err;
}
EXPORT_SYMBOL_GPL(ata_pci_device_suspend);

/*
 * Resume the PCI device and, if that succeeded, the ATA host stored
 * as its driver data.
 *
 * Returns 0 on success or the error from ata_pci_device_do_resume().
 */
int ata_pci_device_resume(struct pci_dev *pdev)
{
        struct ata_host *ata_hst = pci_get_drvdata(pdev);
        int err = ata_pci_device_do_resume(pdev);

        if (err)
                return err;

        ata_host_resume(ata_hst);
        return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_device_resume);
#endif /* CONFIG_PM */
#endif /* CONFIG_PCI */

/**
 *        ata_platform_remove_one - Platform layer callback for device removal
 *        @pdev: Platform device that was removed
 *
 *        Platform layer indicates to libata via this hook that hot-unplug or
 *        module unload event has occurred.  Detach all ports of the ATA
 *        host stored as driver data of @pdev.  Resource release is
 *        handled via devres.
 *
 *        RETURNS:
 *        Always 0.
 *
 *        LOCKING:
 *        Inherited from platform layer (may sleep).
 */
int ata_platform_remove_one(struct platform_device *pdev)
{
        ata_host_detach(platform_get_drvdata(pdev));

        return 0;
}
EXPORT_SYMBOL_GPL(ata_platform_remove_one);

#ifdef CONFIG_ATA_FORCE
/*
 * Parse a single libata.force parameter.
 *
 * @cur:       in: start of the parameter; out: start of the next one
 *             (just past the ',' separator, or the terminating NUL)
 * @force_ent: entry to fill in; ->port and ->device are only written
 *             when the parameter carries an "ID:" prefix
 * @reason:    set to a short description when parsing fails
 *
 * A parameter has the form "[ID:]VAL" where ID is "PORT[.DEVICE]"
 * (both decimal).  VAL is matched case-insensitively against the
 * names in force_tbl; a prefix is accepted as long as it is not
 * ambiguous, and an exact match always wins.
 *
 * The buffer is modified in place: the ',' separator as well as the
 * ':' and '.' delimiters are overwritten with NUL.
 *
 * Returns 0 on success, -EINVAL on failure.
 */
static int __init ata_parse_force_one(char **cur,
                                      struct ata_force_ent *force_ent,
                                      const char **reason)
{
        /* recognized values; several names may map to the same setting */
        static const struct ata_force_param force_tbl[] __initconst = {
                { "40c",        .cbl                = ATA_CBL_PATA40 },
                { "80c",        .cbl                = ATA_CBL_PATA80 },
                { "short40c",        .cbl                = ATA_CBL_PATA40_SHORT },
                { "unk",        .cbl                = ATA_CBL_PATA_UNK },
                { "ign",        .cbl                = ATA_CBL_PATA_IGN },
                { "sata",        .cbl                = ATA_CBL_SATA },
                { "1.5Gbps",        .spd_limit        = 1 },
                { "3.0Gbps",        .spd_limit        = 2 },
                { "noncq",        .horkage_on        = ATA_HORKAGE_NONCQ },
                { "ncq",        .horkage_off        = ATA_HORKAGE_NONCQ },
                { "noncqtrim",        .horkage_on        = ATA_HORKAGE_NO_NCQ_TRIM },
                { "ncqtrim",        .horkage_off        = ATA_HORKAGE_NO_NCQ_TRIM },
                { "noncqati",        .horkage_on        = ATA_HORKAGE_NO_NCQ_ON_ATI },
                { "ncqati",        .horkage_off        = ATA_HORKAGE_NO_NCQ_ON_ATI },
                { "dump_id",        .horkage_on        = ATA_HORKAGE_DUMP_ID },
                { "pio0",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 0) },
                { "pio1",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 1) },
                { "pio2",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 2) },
                { "pio3",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 3) },
                { "pio4",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 4) },
                { "pio5",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 5) },
                { "pio6",        .xfer_mask        = 1 << (ATA_SHIFT_PIO + 6) },
                { "mwdma0",        .xfer_mask        = 1 << (ATA_SHIFT_MWDMA + 0) },
                { "mwdma1",        .xfer_mask        = 1 << (ATA_SHIFT_MWDMA + 1) },
                { "mwdma2",        .xfer_mask        = 1 << (ATA_SHIFT_MWDMA + 2) },
                { "mwdma3",        .xfer_mask        = 1 << (ATA_SHIFT_MWDMA + 3) },
                { "mwdma4",        .xfer_mask        = 1 << (ATA_SHIFT_MWDMA + 4) },
                { "udma0",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 0) },
                { "udma16",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 0) },
                { "udma/16",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 0) },
                { "udma1",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 1) },
                { "udma25",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 1) },
                { "udma/25",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 1) },
                { "udma2",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 2) },
                { "udma33",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 2) },
                { "udma/33",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 2) },
                { "udma3",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 3) },
                { "udma44",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 3) },
                { "udma/44",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 3) },
                { "udma4",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 4) },
                { "udma66",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 4) },
                { "udma/66",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 4) },
                { "udma5",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 5) },
                { "udma100",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 5) },
                { "udma/100",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 5) },
                { "udma6",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 6) },
                { "udma133",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 6) },
                { "udma/133",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 6) },
                { "udma7",        .xfer_mask        = 1 << (ATA_SHIFT_UDMA + 7) },
                { "nohrst",        .lflags                = ATA_LFLAG_NO_HRST },
                { "nosrst",        .lflags                = ATA_LFLAG_NO_SRST },
                { "norst",        .lflags                = ATA_LFLAG_NO_HRST | ATA_LFLAG_NO_SRST },
                { "rstonce",        .lflags                = ATA_LFLAG_RST_ONCE },
                { "atapi_dmadir", .horkage_on        = ATA_HORKAGE_ATAPI_DMADIR },
                { "disable",        .horkage_on        = ATA_HORKAGE_DISABLE },
        };
        char *start = *cur, *p = *cur;
        char *id, *val, *endp;
        const struct ata_force_param *match_fp = NULL;
        int nr_matches = 0, i;

        /* find where this param ends and update *cur */
        while (*p != '\0' && *p != ',')
                p++;

        /* *cur is advanced before any error return, so callers can go on */
        if (*p == '\0')
                *cur = p;
        else
                *cur = p + 1;

        /* terminate this parameter in place */
        *p = '\0';

        /* parse */
        p = strchr(start, ':');
        if (!p) {
                /* no "ID:" prefix: the whole parameter is the value */
                val = strstrip(start);
                goto parse_val;
        }
        *p = '\0';

        id = strstrip(start);
        val = strstrip(p + 1);

        /* parse id */
        p = strchr(id, '.');
        if (p) {
                /* optional ".DEVICE" suffix: must be a complete number */
                *p++ = '\0';
                force_ent->device = simple_strtoul(p, &endp, 10);
                if (p == endp || *endp != '\0') {
                        *reason = "invalid device";
                        return -EINVAL;
                }
        }

        force_ent->port = simple_strtoul(id, &endp, 10);
        if (id == endp || *endp != '\0') {
                *reason = "invalid port/link";
                return -EINVAL;
        }

 parse_val:
        /* parse val, allow shortcuts so that both 1.5 and 1.5Gbps work */
        for (i = 0; i < ARRAY_SIZE(force_tbl); i++) {
                const struct ata_force_param *fp = &force_tbl[i];

                /* skip entries that do not start with val */
                if (strncasecmp(val, fp->name, strlen(val)))
                        continue;

                nr_matches++;
                match_fp = fp;

                /* an exact match overrides any earlier prefix matches */
                if (strcasecmp(val, fp->name) == 0) {
                        nr_matches = 1;
                        break;
                }
        }

        if (!nr_matches) {
                *reason = "unknown value";
                return -EINVAL;
        }
        if (nr_matches > 1) {
                *reason = "ambiguous value";
                return -EINVAL;
        }

        force_ent->param = *match_fp;

        return 0;
}

/*
 * Parse the whole libata.force parameter buffer into ata_force_tbl.
 *
 * The table is sized for one entry per comma-separated parameter.
 * Parameters that fail to parse are reported and skipped.  An entry
 * without an explicit port inherits port and device from the last
 * successfully parsed entry.  ata_force_tbl_size is set to the number
 * of entries actually stored.
 */
static void __init ata_parse_force_param(void)
{
        int nr_filled = 0, nr_slots = 1;
        int prev_port = -1, prev_device = -1;
        char *scan, *cur, *next;

        /* calculate maximum number of params and allocate force_tbl */
        for (scan = ata_force_param_buf; *scan != '\0'; scan++) {
                if (*scan == ',')
                        nr_slots++;
        }

        ata_force_tbl = kcalloc(nr_slots, sizeof(ata_force_tbl[0]), GFP_KERNEL);
        if (ata_force_tbl == NULL) {
                printk(KERN_WARNING "ata: failed to extend force table, "
                       "libata.force ignored\n");
                return;
        }

        /* parse and populate the table */
        cur = ata_force_param_buf;
        while (*cur != '\0') {
                const char *reason = "";
                struct ata_force_ent te = { .port = -1, .device = -1 };

                next = cur;
                if (ata_parse_force_one(&next, &te, &reason)) {
                        printk(KERN_WARNING "ata: failed to parse force "
                               "parameter \"%s\" (%s)\n",
                               cur, reason);
                } else {
                        /* no explicit port: reuse the previous target */
                        if (te.port == -1) {
                                te.port = prev_port;
                                te.device = prev_device;
                        }

                        ata_force_tbl[nr_filled++] = te;

                        prev_port = te.port;
                        prev_device = te.device;
                }

                /* next was advanced by the parser even on failure */
                cur = next;
        }

        ata_force_tbl_size = nr_filled;
}

/* Free the table built by ata_parse_force_param(); a NULL table is fine. */
static void ata_free_force_param(void)
{
        void *tbl = ata_force_tbl;

        kfree(tbl);
}
#else
/* CONFIG_ATA_FORCE disabled: there is no force table to parse. */
static inline void ata_parse_force_param(void)
{
}

/* CONFIG_ATA_FORCE disabled: there is no force table to free. */
static inline void ata_free_force_param(void)
{
}
#endif

/*
 * libata subsystem initialization.
 *
 * Parses the libata.force parameters, initializes SFF support and the
 * transport layer, and attaches the ATA SCSI transport template.
 *
 * Returns 0 on success.  On failure returns the error from
 * ata_sff_init(), or -ENOMEM when the transport template cannot be
 * attached; in both cases the force table is freed again.
 */
static int __init ata_init(void)
{
        int rc;

        ata_parse_force_param();

        rc = ata_sff_init();
        if (rc)
                goto err_free_force;

        /*
         * NOTE(review): the result of libata_transport_init() is not
         * checked and it is not unwound on the error path below.
         */
        libata_transport_init();
        ata_scsi_transport_template = ata_attach_transport();
        if (!ata_scsi_transport_template) {
                rc = -ENOMEM;
                goto err_sff_exit;
        }

        printk(KERN_DEBUG "libata version " DRV_VERSION " loaded.\n");
        return 0;

err_sff_exit:
        ata_sff_exit();
err_free_force:
        /* the force table must not outlive a failed init */
        ata_free_force_param();
        return rc;
}

/*
 * Module teardown.  Undoes ata_init() in reverse order: release the
 * transport template, shut down the transport layer and the SFF layer,
 * then free the force parameter table.
 */
static void __exit ata_exit(void)
{
        ata_release_transport(ata_scsi_transport_template);
        libata_transport_exit();
        ata_sff_exit();
        ata_free_force_param();
}

/* ata_init() runs as a subsys-level initcall; ata_exit() is the module exit handler. */
subsys_initcall(ata_init);
module_exit(ata_exit);

/* File-local ratelimit state: interval of HZ / 5 jiffies, burst of 1. */
static DEFINE_RATELIMIT_STATE(ratelimit, HZ / 5, 1);

/**
 *        ata_ratelimit - check libata's shared rate limit
 *
 *        RETURNS:
 *        The result of __ratelimit() on the file-local ratelimit state.
 */
int ata_ratelimit(void)
{
        return __ratelimit(&ratelimit);
}
EXPORT_SYMBOL_GPL(ata_ratelimit);

/**
 *        ata_msleep - ATA EH owner aware msleep
 *        @ap: ATA port to attribute the sleep to
 *        @msecs: duration to sleep in milliseconds
 *
 *        Sleeps @msecs.  If the current task is owner of @ap's EH, the
 *        ownership is released before going to sleep and reacquired
 *        after the sleep is complete.  IOW, other ports sharing the
 *        @ap->host will be allowed to own the EH while this task is
 *        sleeping.
 *
 *        LOCKING:
 *        Might sleep.
 */
void ata_msleep(struct ata_port *ap, unsigned int msecs)
{
        bool owns_eh = ap && ap->host->eh_owner == current;

        if (owns_eh)
                ata_eh_release(ap);

        if (msecs < 20) {
                unsigned long usecs = msecs * USEC_PER_MSEC;
                usleep_range(usecs, usecs + 50);
        } else {
                msleep(msecs);
        }

        if (owns_eh)
                ata_eh_acquire(ap);
}
EXPORT_SYMBOL_GPL(ata_msleep);

/**
 *        ata_wait_register - poll a register until its masked value changes
 *        @ap: ATA port the wait is done for, can be NULL
 *        @reg: IO-mapped register
 *        @mask: mask applied to the value read from @reg
 *        @val: masked value that keeps the wait going
 *        @interval: polling interval in milliseconds
 *        @timeout: timeout in milliseconds
 *
 *        Reads the 32bit IO-mapped register @reg and tests
 *
 *        (*@reg & @mask) != @val
 *
 *        The function returns as soon as that condition holds.
 *        Otherwise it sleeps for @interval and reads the register
 *        again, until @timeout has expired.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        The last value read from the register.
 */
u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask, u32 val,
                      unsigned long interval, unsigned long timeout)
{
        u32 cur = ioread32(reg);
        unsigned long deadline;

        /*
         * The deadline is computed only after the initial read, so that
         * earlier writes have reached the controller before the timeout
         * starts counting.
         */
        deadline = ata_deadline(jiffies, timeout);

        for (;;) {
                if ((cur & mask) != val)
                        break;
                if (!time_before(jiffies, deadline))
                        break;

                ata_msleep(ap, interval);
                cur = ioread32(reg);
        }

        return cur;
}
EXPORT_SYMBOL_GPL(ata_wait_register);

/*
 * Dummy port_ops
 */
/* qc_issue for the dummy port: never issues @qc, always reports AC_ERR_SYSTEM. */
static unsigned int ata_dummy_qc_issue(struct ata_queued_cmd *qc)
{
        return AC_ERR_SYSTEM;
}

/* error_handler for the dummy port: intentionally does nothing with @ap. */
static void ata_dummy_error_handler(struct ata_port *ap)
{
        /* truly dummy */
}

/*
 * Port operations for a dummy port: command issue and error handling
 * use the dummy helpers above, the remaining hooks use the no-op/standard
 * libata implementations.
 */
struct ata_port_operations ata_dummy_port_ops = {
        .qc_prep                = ata_noop_qc_prep,
        .qc_issue                = ata_dummy_qc_issue,
        .error_handler                = ata_dummy_error_handler,
        .sched_eh                = ata_std_sched_eh,
        .end_eh                        = ata_std_end_eh,
};
EXPORT_SYMBOL_GPL(ata_dummy_port_ops);

/* Port info whose only populated field points at ata_dummy_port_ops. */
const struct ata_port_info ata_dummy_port_info = {
        .port_ops                = &ata_dummy_port_ops,
};
EXPORT_SYMBOL_GPL(ata_dummy_port_info);

/*
 * Utility print functions
 */
void ata_port_printk(const struct ata_port *ap, const char *level,
                     const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        printk("%sata%u: %pV", level, ap->print_id, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(ata_port_printk);

void ata_link_printk(const struct ata_link *link, const char *level,
                     const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        if (sata_pmp_attached(link->ap) || link->ap->slave_link)
                printk("%sata%u.%02u: %pV",
                       level, link->ap->print_id, link->pmp, &vaf);
        else
                printk("%sata%u: %pV",
                       level, link->ap->print_id, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(ata_link_printk);

void ata_dev_printk(const struct ata_device *dev, const char *level,
                    const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        printk("%sata%u.%02u: %pV",
               level, dev->link->ap->print_id, dev->link->pmp + dev->devno,
               &vaf);

        va_end(args);
}
EXPORT_SYMBOL(ata_dev_printk);

/* Log "version <version>" for @dev at KERN_DEBUG level via dev_printk(). */
void ata_print_version(const struct device *dev, const char *version)
{
        dev_printk(KERN_DEBUG, dev, "version %s\n", version);
}
EXPORT_SYMBOL(ata_print_version);

















































































































































































































































    1 
    1 










































































































































































































    1 
























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 *                Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list.  Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail.  Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *                Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.  This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * which can be further simplified to
 *
 *   (R - E) <= NR_active
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *                Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) active pages, the refaulting page is activated
 * optimistically in the hope that (R - E) active pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current active list.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *                Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *                Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

/*
 * EVICTION_SHIFT is the number of bits of an unsigned long that are NOT
 * available to the eviction timestamp in a shadow entry: the bits lost
 * to the xarray value encoding, one bit for the workingset flag, and the
 * node and memcg identifiers.  EVICTION_MASK covers the remaining
 * timestamp bits.
 */
#define EVICTION_SHIFT        ((BITS_PER_LONG - BITS_PER_XA_VALUE) +        \
                         1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK        (~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

/*
 * Encode a shadow entry.  From the least significant bit upwards the
 * value holds: the workingset flag (1 bit), the node id (NODES_SHIFT
 * bits), the memcg id (MEM_CGROUP_ID_SHIFT bits) and the eviction
 * timestamp, coarsened by bucket_order and trimmed to EVICTION_MASK.
 * The result is wrapped as an xarray value entry.
 */
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
                         bool workingset)
{
        unsigned long entry;

        /* Drop the low timestamp bits, then keep only what fits. */
        entry = (eviction >> bucket_order) & EVICTION_MASK;

        entry <<= MEM_CGROUP_ID_SHIFT;
        entry |= memcgid;

        entry <<= NODES_SHIFT;
        entry |= pgdat->node_id;

        entry <<= 1;
        entry |= workingset;

        return xa_mk_value(entry);
}

/*
 * Decode a shadow entry produced by pack_shadow().  The fields are
 * peeled off from the least significant bit: workingset flag, node id,
 * memcg id; what is left is the bucketed timestamp, which is scaled
 * back up by bucket_order before being returned through @evictionp.
 */
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
                          unsigned long *evictionp, bool *workingsetp)
{
        const unsigned long node_mask = (1UL << NODES_SHIFT) - 1;
        const unsigned long memcg_mask = (1UL << MEM_CGROUP_ID_SHIFT) - 1;
        unsigned long bits = xa_to_value(shadow);
        bool was_workingset;
        int node, id;

        was_workingset = bits & 1;
        bits >>= 1;

        node = bits & node_mask;
        bits >>= NODES_SHIFT;

        id = bits & memcg_mask;
        bits >>= MEM_CGROUP_ID_SHIFT;

        *memcgidp = id;
        *pgdat = NODE_DATA(node);
        *evictionp = bits << bucket_order;
        *workingsetp = was_workingset;
}

/**
 * workingset_age_nonresident - advance the non-resident age of an lruvec
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to account
 *
 * Non-resident pages have to age together with the in-memory pages so
 * that refault distances stay comparable to in-memory sizes.  Reclaim
 * and LRU operations call this to move the non-resident clock forward.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
        struct lruvec *level = lruvec;

        /*
         * A cgroup is reclaimed by reclaiming its children round-robin,
         * so its LRU order is made up of the LRU orders of its children
         * and every page has a position in each ancestor's order too.
         *
         * Hence aging the inactive list of a leaf cgroup also ages the
         * virtual inactive lists of all its ancestors, up to and
         * including the root: walk up and bump every level.
         */
        for (;;) {
                atomic_long_add(nr_pages, &level->nonresident_age);

                level = parent_lruvec(level);
                if (!level)
                        break;
        }
}

/**
 * workingset_eviction - note the eviction of a page from memory
 * @page: the page being evicted
 * @target_memcg: the cgroup that is causing the reclaim
 *
 * Ages the non-resident clock of the reclaiming lruvec by the size of
 * @page and snapshots the resulting counter value.
 *
 * Returns a shadow entry to be stored in @page->mapping->i_pages in place
 * of the evicted @page so that a later refault can be detected.
 */
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
{
        struct pglist_data *pgdat = page_pgdat(page);
        unsigned long eviction;
        struct lruvec *lruvec;
        int memcgid;

        /* Page is fully exclusive and pins page->mem_cgroup */
        VM_BUG_ON_PAGE(PageLRU(page), page);
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);

        lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
        /* Age first: the snapshot below then includes this eviction. */
        workingset_age_nonresident(lruvec, thp_nr_pages(page));
        /* XXX: target_memcg can be NULL, go through lruvec */
        memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
        eviction = atomic_long_read(&lruvec->nonresident_age);
        return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}

/**
 * workingset_refault - evaluate the refault of a previously evicted page
 * @page: the freshly allocated replacement page
 * @shadow: shadow entry of the evicted page
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted page in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 *
 * If the distance does not exceed the computed workingset size, @page is
 * marked active; if the shadow entry additionally carries the workingset
 * flag, @page is marked workingset and its cost is noted on the LRU.
 */
void workingset_refault(struct page *page, void *shadow)
{
        bool file = page_is_file_lru(page);
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        unsigned long refault_distance;
        unsigned long workingset_size;
        struct pglist_data *pgdat;
        struct mem_cgroup *memcg;
        unsigned long eviction;
        struct lruvec *lruvec;
        unsigned long refault;
        bool workingset;
        int memcgid;

        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);

        rcu_read_lock();
        /*
         * Look up the memcg associated with the stored ID. It might
         * have been deleted since the page's eviction.
         *
         * Note that in rare events the ID could have been recycled
         * for a new cgroup that refaults a shared page. This is
         * impossible to tell from the available data. However, this
         * should be a rare and limited disturbance, and activations
         * are always speculative anyway. Ultimately, it's the aging
         * algorithm's job to shake out the minimum access frequency
         * for the active cache.
         *
         * XXX: On !CONFIG_MEMCG, this will always return NULL; it
         * would be better if the root_mem_cgroup existed in all
         * configurations instead.
         */
        eviction_memcg = mem_cgroup_from_id(memcgid);
        if (!mem_cgroup_disabled() && !eviction_memcg)
                goto out;
        eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
        /* Current reading (R) of the counter that was snapshotted at eviction. */
        refault = atomic_long_read(&eviction_lruvec->nonresident_age);

        /*
         * Calculate the refault distance
         *
         * The unsigned subtraction here gives an accurate distance
         * across nonresident_age overflows in most cases. There is a
         * special case: usually, shadow entries have a short lifetime
         * and are either refaulted or reclaimed along with the inode
         * before they get too old.  But it is not impossible for the
         * nonresident_age to lap a shadow entry in the field, which
         * can then result in a false small refault distance, leading
         * to a false activation should this old entry actually
         * refault again.  However, earlier kernels used to deactivate
         * unconditionally with *every* reclaim invocation for the
         * longest time, so the occasional inappropriate activation
         * leading to pressure on the active list is not a problem.
         */
        refault_distance = (refault - eviction) & EVICTION_MASK;

        /*
         * The activation decision for this page is made at the level
         * where the eviction occurred, as that is where the LRU order
         * during page reclaim is being determined.
         *
         * However, the cgroup that will own the page is the one that
         * is actually experiencing the refault event.
         */
        memcg = page_memcg(page);
        lruvec = mem_cgroup_lruvec(memcg, pgdat);

        inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);

        /*
         * Compare the distance to the existing workingset size. We
         * don't activate pages that couldn't stay resident even if
         * all the memory was available to the workingset. Whether
         * workingset competition needs to consider anon or not depends
         * on having swap.
         *
         * NOTE(review): swap availability is checked on the refaulting
         * memcg while the list sizes come from eviction_lruvec - confirm
         * that this mix is intended.
         */
        workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
        if (!file) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_FILE);
        }
        if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_ACTIVE_ANON);
                if (file) {
                        workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_ANON);
                }
        }
        if (refault_distance > workingset_size)
                goto out;

        /* An activation ages the non-resident clock just like an eviction. */
        SetPageActive(page);
        workingset_age_nonresident(lruvec, thp_nr_pages(page));
        inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);

        /* Page was active prior to eviction */
        if (workingset) {
                SetPageWorkingset(page);
                /* XXX: Move to lru_cache_add() when it supports new vs putback */
                spin_lock_irq(&page_pgdat(page)->lru_lock);
                lru_note_cost_page(page);
                spin_unlock_irq(&page_pgdat(page)->lru_lock);
                inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
        }
out:
        rcu_read_unlock();
}

/**
 * workingset_activation - note a page activation
 * @page: page that is being activated
 *
 * Ages the non-resident clock of the lruvec @page belongs to by the
 * number of pages in @page.
 */
void workingset_activation(struct page *page)
{
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;

        rcu_read_lock();
        /*
         * Pages without a memcg are filtered out here, e.g. unmap can
         * call mark_page_accessed() on VDSO pages.
         *
         * XXX: See workingset_refault() - this should return
         * root_mem_cgroup even for !CONFIG_MEMCG.
         */
        memcg = page_memcg_rcu(page);
        if (mem_cgroup_disabled() || memcg) {
                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
                workingset_age_nonresident(lruvec, thp_nr_pages(page));
        }
        rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

static struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
        /*
         * Track non-empty nodes that contain only shadow entries;
         * unlink those that contain pages or are being freed.
         *
         * Avoid acquiring the list_lru lock when the nodes are
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
        VM_WARN_ON_ONCE(!irqs_disabled());  /* For __inc_lruvec_page_state */

        if (node->count && node->count == node->nr_values) {
                if (list_empty(&node->private_list)) {
                        list_lru_add(&shadow_nodes, &node->private_list);
                        __inc_lruvec_slab_state(node, WORKINGSET_NODES);
                }
        } else {
                if (!list_empty(&node->private_list)) {
                        list_lru_del(&shadow_nodes, &node->private_list);
                        __dec_lruvec_slab_state(node, WORKINGSET_NODES);
                }
        }
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                                        struct shrink_control *sc)
{
        unsigned long max_nodes;
        unsigned long nodes;
        unsigned long pages;

        nodes = list_lru_shrink_count(&shadow_nodes, sc);

        /*
         * Approximate a reasonable limit for the nodes
         * containing shadow entries. We don't need to keep more
         * shadow entries than possible pages on the active list,
         * since refault distances bigger than that are dismissed.
         *
         * The size of the active list converges toward 100% of
         * overall page cache as memory grows, with only a tiny
         * inactive list. Assume the total cache size for that.
         *
         * Nodes might be sparsely populated, with only one shadow
         * entry in the extreme case. Obviously, we cannot keep one
         * node for every eligible shadow entry, so compromise on a
         * worst-case density of 1/8th. Below that, not all eligible
         * refaults can be detected anymore.
         *
         * On 64-bit with 7 xa_nodes per page and 64 slots
         * each, this will reclaim shadow entries when they consume
         * ~1.8% of available memory:
         *
         * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
         */
#ifdef CONFIG_MEMCG
        if (sc->memcg) {
                struct lruvec *lruvec;
                int i;

                lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
                for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
                        pages += lruvec_page_state_local(lruvec,
                                                         NR_LRU_BASE + i);
                pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
                pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
        } else
#endif
                pages = node_present_pages(sc->nid);

        max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

        if (!nodes)
                return SHRINK_EMPTY;

        if (nodes <= max_nodes)
                return 0;
        return nodes - max_nodes;
}

/*
 * list_lru walk callback that reclaims one shadow node.
 *
 * Called with @lru_lock held; the lock is dropped inside and re-taken
 * before returning.  Returns LRU_RETRY when the i_pages lock of the
 * node's mapping could not be taken, LRU_REMOVED_RETRY once the node
 * has been taken off the list (whether or not it was then deleted).
 */
static enum lru_status shadow_lru_isolate(struct list_head *item,
                                          struct list_lru_one *lru,
                                          spinlock_t *lru_lock,
                                          void *arg) __must_hold(lru_lock)
{
        struct xa_node *node = container_of(item, struct xa_node, private_list);
        struct address_space *mapping;
        int ret;

        /*
         * Page cache insertions and deletions synchronously maintain
         * the shadow node LRU under the i_pages lock and the
         * lru_lock.  Because the page cache tree is emptied before
         * the inode can be destroyed, holding the lru_lock pins any
         * address_space that has nodes on the LRU.
         *
         * We can then safely transition to the i_pages lock to
         * pin only the address_space of the particular node we want
         * to reclaim, take the node off-LRU, and drop the lru_lock.
         */

        mapping = container_of(node->array, struct address_space, i_pages);

        /* Coming from the list, invert the lock order */
        if (!xa_trylock(&mapping->i_pages)) {
                spin_unlock_irq(lru_lock);
                ret = LRU_RETRY;
                goto out;
        }

        list_lru_isolate(lru, item);
        __dec_lruvec_slab_state(node, WORKINGSET_NODES);

        /* Plain unlock: IRQs stay off until xa_unlock_irq() below. */
        spin_unlock(lru_lock);

        /*
         * The nodes should only contain one or more shadow entries,
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
         */
        if (WARN_ON_ONCE(!node->nr_values))
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
        mapping->nrexceptional -= node->nr_values;
        xa_delete_node(node, workingset_update_node);
        __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
        xa_unlock_irq(&mapping->i_pages);
        ret = LRU_REMOVED_RETRY;
out:
        /* Both paths arrive here without lru_lock; re-take it for the caller. */
        cond_resched();
        spin_lock_irq(lru_lock);
        return ret;
}

/*
 * Shrinker ->scan_objects callback: walk the shadow_nodes list with
 * shadow_lru_isolate() and return the walker's result.
 */
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
                                       struct shrink_control *sc)
{
        /* list_lru lock nests inside the IRQ-safe i_pages lock */
        return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
                                        NULL);
}

/* Shrinker for shadow nodes; it is NUMA- and memcg-aware. */
static struct shrinker workingset_shadow_shrinker = {
        .count_objects = count_shadow_nodes,
        .scan_objects = scan_shadow_nodes,
        .seeks = 0, /* ->count reports only fully expendable nodes */
        .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
static struct lock_class_key shadow_nodes_key;

/*
 * Boot-time setup for workingset detection.
 *
 * Derives bucket_order from the amount of RAM present at this point,
 * then sets up the shadow_nodes list_lru and the shrinker that walks
 * it.  The shrinker is preallocated first, passed to __list_lru_init(),
 * and only registered once the list_lru is ready.
 *
 * Returns 0 on success, or the error reported by prealloc_shrinker()
 * or __list_lru_init().
 */
static int __init workingset_init(void)
{
        unsigned int timestamp_bits;
        unsigned int max_order;
        int ret;

        BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
        /*
         * Calculate the eviction bucket size to cover the longest
         * actionable refault distance, which is currently half of
         * memory (totalram_pages/2). However, memory hotplug may add
         * some more pages at runtime, so keep working with up to
         * double the initial memory by using totalram_pages as-is.
         */
        timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        max_order = fls_long(totalram_pages() - 1);
        /* bucket_order is left untouched when the timestamp is wide enough */
        if (max_order > timestamp_bits)
                bucket_order = max_order - timestamp_bits;
        /* timestamp_bits and max_order are unsigned int: print with %u */
        pr_info("workingset: timestamp_bits=%u max_order=%u bucket_order=%u\n",
               timestamp_bits, max_order, bucket_order);

        ret = prealloc_shrinker(&workingset_shadow_shrinker);
        if (ret)
                goto err;
        ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
                              &workingset_shadow_shrinker);
        if (ret)
                goto err_list_lru;
        register_shrinker_prepared(&workingset_shadow_shrinker);
        return 0;
err_list_lru:
        /* undo prealloc_shrinker(); the shrinker was never registered */
        free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
        return ret;
}
module_init(workingset_init);























































    1 


    1 










































    9 



    9 






    9 








    9 





























    9 









































    7 



    7 





    7 
    7 
    7 
    7 
    2 
    7 

    7 











    7 






    7 
    1 
    7 


    7 






















    1 







    1 







    1 






    1 
    1 
    1 



    1 
    1 


    1 



    1 


    1 
















    1 






















    7 


    1 

















    7 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>

#include <linux/atomic.h>

#include "internal.h"

/*
 * sysctl tunables...
 * max_files starts out as NR_FILE; files_maxfiles_init() may raise it.
 */
struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/* SLAB cache for file structures; created in files_init() */
static struct kmem_cache *filp_cachep __read_mostly;

/*
 * Count of accounted struct files: incremented in alloc_empty_file(),
 * decremented in file_free() unless the file carries FMODE_NOACCOUNT.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * Final release of a struct file, given its embedded f_u.fu_rcuhead:
 * drop the credential reference held in f_cred and return the object
 * to filp_cachep.  Queued through call_rcu() by file_free() and also
 * called directly by __alloc_file() to undo a failed allocation.
 */
static void file_free_rcu(struct rcu_head *head)
{
        struct file *filp;

        filp = container_of(head, struct file, f_u.fu_rcuhead);
        put_cred(filp->f_cred);
        kmem_cache_free(filp_cachep, filp);
}

/*
 * Release a struct file: run the security hook, give back its slot in
 * nr_files if it was accounted, and queue the actual freeing through
 * call_rcu().
 */
static inline void file_free(struct file *f)
{
        security_file_free(f);

        /* FMODE_NOACCOUNT files were never added to nr_files */
        if ((f->f_mode & FMODE_NOACCOUNT) == 0)
                percpu_counter_dec(&nr_files);

        call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}

/*
 * Number of open files in the system, as given by
 * percpu_counter_read_positive() on nr_files.
 */
static long get_nr_files(void)
{
        long count = percpu_counter_read_positive(&nr_files);

        return count;
}

/*
 * System-wide limit on open files, i.e. the current value of
 * files_stat.max_files.
 */
unsigned long get_max_files(void)
{
        unsigned long limit = files_stat.max_files;

        return limit;
}
EXPORT_SYMBOL_GPL(get_max_files);

/*
 * Handle nr_files sysctl
 *
 * With both CONFIG_SYSCTL and CONFIG_PROC_FS enabled, refresh
 * files_stat.nr_files from the live counter and let
 * proc_doulongvec_minmax() service the request.  Otherwise the handler
 * only reports -ENOSYS.
 */
int proc_nr_files(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos)
{
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
        files_stat.nr_files = get_nr_files();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
#else
        return -ENOSYS;
#endif
}

/*
 * Allocate a zeroed struct file from filp_cachep and initialise the
 * common parts: a reference on @cred, the security state, a reference
 * count of one, the locks, the eventpoll state, and f_flags / f_mode
 * derived from @flags.
 *
 * Returns the new file or an ERR_PTR(): -ENOMEM when the slab
 * allocation fails, otherwise the error from security_file_alloc().
 * Does not touch nr_files; accounting is left to the callers.
 */
static struct file *__alloc_file(int flags, const struct cred *cred)
{
        struct file *filp;
        int err;

        filp = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!filp))
                return ERR_PTR(-ENOMEM);

        /* file_free_rcu() below does put_cred(), so take the ref first */
        filp->f_cred = get_cred(cred);
        err = security_file_alloc(filp);
        if (unlikely(err)) {
                /* direct call: drops the cred ref and frees the object */
                file_free_rcu(&filp->f_u.fu_rcuhead);
                return ERR_PTR(err);
        }

        filp->f_flags = flags;
        filp->f_mode = OPEN_FMODE(flags);
        atomic_long_set(&filp->f_count, 1);
        rwlock_init(&filp->f_owner.lock);
        spin_lock_init(&filp->f_lock);
        mutex_init(&filp->f_pos_lock);
        eventpoll_init_file(filp);
        /* f_version is already 0 thanks to the zeroing allocation */

        return filp;
}

/*
 * Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened, e.g. we are over
 * the file structures limit (-ENFILE), ran out of memory, or the
 * operation is not permitted.
 *
 * Be very careful using this.  You are responsible for getting write
 * access to any mount that you might assign to this filp, if it is
 * opened for write.  If this is not done, you will imbalance the
 * mount's writer count and get a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        static long old_max;
        struct file *f;

        /*
         * Privileged users can go above max_files.  percpu_counters are
         * inaccurate, so the expensive exact sum is consulted last,
         * just before we would go and fail.
         */
        if (get_nr_files() >= files_stat.max_files &&
            !capable(CAP_SYS_ADMIN) &&
            percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) {
                /* Ran out of filps - report that */
                if (get_nr_files() > old_max) {
                        pr_info("VFS: file-max limit %lu reached\n",
                                get_max_files());
                        old_max = get_nr_files();
                }
                return ERR_PTR(-ENFILE);
        }

        f = __alloc_file(flags, cred);
        if (IS_ERR(f))
                return f;

        percpu_counter_inc(&nr_files);
        return f;
}

/*
 * Like alloc_empty_file(), but neither checks the file limit nor
 * counts the file in nr_files.  The result is tagged FMODE_NOACCOUNT
 * so that file_free() skips the matching decrement.
 *
 * Should not be used unless there's a very good reason to do so.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *filp;

        filp = __alloc_file(flags, cred);
        if (IS_ERR(filp))
                return filp;

        filp->f_mode |= FMODE_NOACCOUNT;
        return filp;
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file;

        file = alloc_empty_file(flags, current_cred());
        if (IS_ERR(file))
                return file;

        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
        return file;
}

/*
 * Create a struct file for @inode on the pseudo filesystem mounted at
 * @mnt.  A fresh pseudo dentry named @name is allocated on mnt->mnt_sb,
 * instantiated with @inode, paired with a new reference on @mnt, and
 * passed to alloc_file() together with @flags and @fops.
 *
 * Returns the new file, ERR_PTR(-ENOMEM) if the dentry cannot be
 * allocated, or the ERR_PTR() from alloc_file().
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                                const char *name, int flags,
                                const struct file_operations *fops)
{
        /* fallback dentry ops, used when the superblock supplies none */
        static const struct dentry_operations anon_ops = {
                .d_dname = simple_dname
        };
        struct qstr this = QSTR_INIT(name, strlen(name));
        struct path path;
        struct file *file;

        path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
        if (!path.dentry)
                return ERR_PTR(-ENOMEM);
        if (!mnt->mnt_sb->s_d_op)
                d_set_d_op(path.dentry, &anon_ops);
        path.mnt = mntget(mnt);
        d_instantiate(path.dentry, inode);
        file = alloc_file(&path, flags, fops);
        if (IS_ERR(file)) {
                /*
                 * NOTE(review): presumably the extra ihold() offsets the
                 * inode reference that path_put() releases through the
                 * instantiated dentry, so the caller still owns its
                 * inode reference on failure - confirm against callers.
                 */
                ihold(inode);
                path_put(&path);
        }
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Allocate a new file on the same path as @base, opened with @flags and
 * using @fops.  On success the clone takes its own reference on the
 * path and shares @base's f_mapping.  Returns the clone or the
 * ERR_PTR() from alloc_file().
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *clone;

        clone = alloc_file(&base->f_path, flags, fops);
        if (IS_ERR(clone))
                return clone;

        /* alloc_file() copied the path without taking references */
        path_get(&clone->f_path);
        clone->f_mapping = base->f_mapping;
        return clone;
}

/*
 * The real guts of fput() - releasing the last reference to file.
 *
 * Runs the teardown steps for an opened file in a fixed order and then
 * hands the struct file to file_free().  A file that never got
 * FMODE_OPENED skips the teardown and is only freed.
 */
static void __fput(struct file *file)
{
        /* cached up front; the code after ->release() uses these copies */
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        ima_file_free(file);
        /* FASYNC set: call the driver's ->fasync() hook, if it has one */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        /* character device with a cdev attached, not opened FMODE_PATH */
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
        /* same read-only test that guards i_readcount_inc() in alloc_file() */
        if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_dec(inode);
        if (mode & FMODE_WRITER) {
                put_write_access(inode);
                __mnt_drop_write(mnt);
        }
        /* finally drop the path: dentry first, then the mount */
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

/* Files whose final __fput() has been handed off to delayed_fput() */
static LLIST_HEAD(delayed_fput_list);

/*
 * Detach everything currently queued on delayed_fput_list and run
 * __fput() on each file.  The work argument is not used.
 */
static void delayed_fput(struct work_struct *unused)
{
        struct llist_node *batch = llist_del_all(&delayed_fput_list);
        struct file *cur, *next;

        /* __fput() frees the file, so use the _safe iterator */
        llist_for_each_entry_safe(cur, next, batch, f_u.fu_llist)
                __fput(cur);
}

/*
 * task_work callback set up by fput_many(): recover the file from its
 * embedded f_u.fu_rcuhead and run __fput() on it.
 */
static void ____fput(struct callback_head *work)
{
        struct file *filp = container_of(work, struct file, f_u.fu_rcuhead);

        __fput(filp);
}

/*
 * Run delayed_fput() directly in the caller's context, so every file
 * currently on delayed_fput_list gets its __fput() before this returns.
 *
 * If a kernel thread really needs the final fput() it has done to
 * have completed, call this.  The only user right now is the boot
 * code - we *do* need to make sure our writes to binaries on initramfs
 * have not left us with an opened struct file waiting for __fput();
 * execve() won't work without that.  Please don't add more callers
 * without very good reasons; in particular, never call this with locks
 * held and never call it from a thread that might need to do some
 * work on any kind of umount.
 */
void flush_delayed_fput(void)
{
        delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

/*
 * Delayed work item whose handler is delayed_fput(); fput_many()
 * schedules it after queueing a file on delayed_fput_list.
 */
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Drop @refs references to @file.  Nothing else happens unless that
 * brings f_count to zero.  In that case the final __fput() is deferred:
 * through task_work on the current task when we are neither in
 * interrupt context nor a kernel thread, otherwise through
 * delayed_fput_list and delayed_fput_work.
 */
void fput_many(struct file *file, unsigned int refs)
{
        struct task_struct *task;

        if (!atomic_long_sub_and_test(refs, &file->f_count))
                return;

        task = current;
        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                init_task_work(&file->f_u.fu_rcuhead, ____fput);
                if (task_work_add(task, &file->f_u.fu_rcuhead,
                                  TWA_RESUME) == 0)
                        return;
                /*
                 * After this task has run exit_task_work(),
                 * task_work_add() will fail.  Fall through to delayed
                 * fput to avoid leaking *file.
                 */
        }

        if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
                schedule_delayed_work(&delayed_fput_work, 1);
}

/*
 * Drop a single reference to @file; shorthand for fput_many(file, 1).
 */
void fput(struct file *file)
{
        fput_many(file, 1);
}

/*
 * Synchronous analog of fput(): when the dropped reference is the last
 * one, __fput() runs right here instead of being deferred.
 *
 * Meant for kernel threads that might be needed in some umount() (and
 * thus can't use flush_delayed_fput() without risking deadlocks), need
 * to wait for completion of __fput(), and know that for this specific
 * struct file it won't involve anything that would need them.  Use
 * only if you really need it - at the very least, don't blindly
 * convert fput() by a kernel thread to this.
 */
void __fput_sync(struct file *file)
{
        if (!atomic_long_dec_and_test(&file->f_count))
                return;

        /* the final put on this path must come from a kernel thread */
        BUG_ON(!(current->flags & PF_KTHREAD));
        __fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

/*
 * Boot-time setup: create the "filp" slab cache for struct file and
 * initialise the nr_files counter to zero.
 */
void __init files_init(void)
{
        /*
         * NOTE(review): the result is not checked for NULL; presumably
         * SLAB_PANIC makes a failed creation panic instead - confirm.
         */
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
        /* the return value of percpu_counter_init() is not checked */
        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * Pick the default files_stat.max_files from the memory available at
 * boot.
 *
 * One file with associated inode and dcache is very roughly 1K, and by
 * default files should not use more than 10% of memory.  One and a half
 * times the pages already in use is set aside first (capped so that at
 * least one page remains), and the result never drops below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long total = totalram_pages();
        unsigned long in_use = total - nr_free_pages();
        unsigned long reserve = in_use * 3 / 2;
        unsigned long usable_kb;
        unsigned long limit;

        reserve = min(reserve, total - 1);
        usable_kb = (total - reserve) * (PAGE_SIZE / 1024);
        limit = usable_kb / 10;

        files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}









































































    1 
    1 

























































































































































































































    2 






    2 
    2 
    2 











    2 














    2 



    2 


    1 
    1 


    1 




    2 





    2 











    2 






    1 















    2 












    2 























































































































































































































































































































    2 








    2 

    2 















    2 



    2 








    2 
















































































































































































































































































































































    1 





    1 







    1 



    1 





    2 




    2 




    2 
    2 





































    1 
    1 



















































































    2 




    1 


















































    1 




    1 












    1 














    1 










    1 
















    1 



    1 


    1 








    1 









    1 
    1 
    1 


    1 














    1 










    1 




    1 
    1 








    1 
    1 













































    1 


    1 







    1 





























    1 













    1 

    1 










































































    1 


    1 







    1 












    1 

























    1 


    1 

    1 
























































































































































































































































    1 


    1 


    1 


    1 



    1 














    1 
    1 












    2 







    2 






    1 














    1 



    1 












    1 







    1 









    1 





    1 





    1 











    1 



















    1 

    1 




    1 


    1 






















































    1 








































    2 



















    1 


    2 


    2 

    2 








    1 

    2 

    2 
















    2 









    1 













    1 
    1 

























    1 
    1 




    1 





    2 


    2 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/suspend.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

/*
 * Per-CPU list of requests waiting to be completed from BLOCK_SOFTIRQ.
 * Requests are linked in through rq->ipi_list by blk_mq_trigger_softirq()
 * and drained by blk_done_softirq().
 */
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);

/* Forward declarations of static helpers used before their definitions. */
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

/*
 * Map a request to its poll statistics bucket.
 *
 * Buckets are interleaved by data direction: the index is the data
 * direction plus twice the log2 of the request's recorded sector count.
 * Returns -1 when the computed index is negative, and clamps to the last
 * bucket of the matching direction when it exceeds the table.
 */
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
        const int dir = rq_data_dir(rq);
        const int nr_sectors = blk_rq_stats_sectors(rq);
        const int idx = dir + 2 * ilog2(nr_sectors);

        if (idx < 0)
                return -1;
        if (idx < BLK_MQ_POLL_STATS_BKTS)
                return idx;

        /* too large: fall back to the final bucket pair, same direction */
        return dir + BLK_MQ_POLL_STATS_BKTS - 2;
}

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
        return !list_empty_careful(&hctx->dispatch) ||
                sbitmap_any_bit_set(&hctx->ctx_map) ||
                        blk_mq_sched_has_work(hctx);
}

/*
 * Flag software queue @ctx as having pending work in hardware queue @hctx.
 * The bit position is the ctx's index within this hctx type.
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                     struct blk_mq_ctx *ctx)
{
        const int ctx_bit = ctx->index_hw[hctx->type];

        /* Only write the bit when it is not already set. */
        if (sbitmap_test_bit(&hctx->ctx_map, ctx_bit))
                return;

        sbitmap_set_bit(&hctx->ctx_map, ctx_bit);
}

/*
 * Clear the pending bit of software queue @ctx in hardware queue @hctx.
 * Unlike the mark side, the bit is cleared unconditionally.
 */
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
                                      struct blk_mq_ctx *ctx)
{
        const int ctx_bit = ctx->index_hw[hctx->type];

        sbitmap_clear_bit(&hctx->ctx_map, ctx_bit);
}

/*
 * Accumulator handed to blk_mq_check_inflight() while iterating over the
 * busy tags of a queue.
 */
struct mq_inflight {
        /* partition to count for; a partno of 0 matches every request */
        struct hd_struct *part;
        /* in-flight request counts, indexed by rq_data_dir() */
        unsigned int inflight[2];
};

/*
 * Busy-tag iterator callback: count @rq in the struct mq_inflight passed
 * via @priv when it belongs to the partition of interest and is in the
 * MQ_RQ_IN_FLIGHT state.  @hctx and @reserved are unused.
 *
 * Always returns true.
 */
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
                                  struct request *rq, void *priv,
                                  bool reserved)
{
        struct mq_inflight *counts = priv;

        /* a non-zero partno restricts counting to that exact partition */
        if (counts->part->partno && rq->part != counts->part)
                return true;

        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
                return true;

        counts->inflight[rq_data_dir(rq)]++;
        return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

        return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
                         unsigned int inflight[2])
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
        inflight[0] = mi.inflight[0];
        inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
        mutex_lock(&q->mq_freeze_lock);
        if (++q->mq_freeze_depth == 1) {
                percpu_ref_kill(&q->q_usage_counter);
                mutex_unlock(&q->mq_freeze_lock);
                if (queue_is_mq(q))
                        blk_mq_run_hw_queues(q, false);
        } else {
                mutex_unlock(&q->mq_freeze_lock);
        }
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

/*
 * Sleep on q->mq_freeze_wq until q->q_usage_counter has dropped to zero.
 * There is no timeout; see blk_mq_freeze_queue_wait_timeout() for that.
 */
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

/*
 * Sleep on q->mq_freeze_wq until q->q_usage_counter has dropped to zero
 * or @timeout has elapsed.
 *
 * @timeout is handed straight to wait_event_timeout() (so presumably in
 * jiffies) and that macro's result is returned unchanged, truncated to int.
 */
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout)
{
        return wait_event_timeout(q->mq_freeze_wq,
                                        percpu_ref_is_zero(&q->q_usage_counter),
                                        timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 *
 * Starts a freeze and then blocks, without timeout, until the queue's
 * usage counter has reached zero.
 */
void blk_freeze_queue(struct request_queue *q)
{
        /*
         * In the !blk_mq case we are only calling this to kill the
         * q_usage_counter, otherwise this increases the freeze depth
         * and waits for it to return to zero.  For this reason there is
         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
}

/*
 * Freeze @q and wait until it is idle.  Exported counterpart of
 * blk_mq_unfreeze_queue(); every call must be paired with one unfreeze.
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
        /*
         * ...just an alias to keep freeze and unfreeze actions balanced
         * in the blk_mq_* namespace
         */
        blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

/*
 * Undo one level of freezing on @q.  When the freeze depth returns to
 * zero the usage counter is resurrected and everybody sleeping on
 * mq_freeze_wq is woken.  Warns once if the depth goes negative, i.e. on
 * an unbalanced unfreeze.
 */
void blk_mq_unfreeze_queue(struct request_queue *q)
{
        mutex_lock(&q->mq_freeze_lock);

        --q->mq_freeze_depth;
        WARN_ON_ONCE(q->mq_freeze_depth < 0);
        if (q->mq_freeze_depth != 0)
                goto out_unlock;

        /* last unfreeze: let new users in again */
        percpu_ref_resurrect(&q->q_usage_counter);
        wake_up_all(&q->mq_freeze_wq);

out_unlock:
        mutex_unlock(&q->mq_freeze_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

/*
 * Mark @q as quiesced by setting QUEUE_FLAG_QUIESCED, without waiting for
 * dispatches that are already running; blk_mq_quiesce_queue() is the
 * variant that also waits.
 *
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
        blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Marks the queue quiesced and then waits for a grace period on every
 * hardware queue: SRCU for queues flagged BLK_MQ_F_BLOCKING, and a single
 * RCU grace period covering all the others.
 *
 * Note: this does not stop the struct request end_io() callback from
 * being invoked.  After this function returns, no dispatch can happen
 * until the queue is unquiesced via blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int idx;
        bool need_rcu_sync = false;

        blk_mq_quiesce_queue_nowait(q);

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                        /* one synchronize_rcu() below serves all of these */
                        need_rcu_sync = true;
                        continue;
                }
                synchronize_srcu(hctx->srcu);
        }

        if (need_rcu_sync)
                synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/**
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers queue into the state before quiescing
 * which is done by blk_mq_quiesce_queue.  It clears QUEUE_FLAG_QUIESCED
 * and then kicks the hardware queues.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
        blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

        /* dispatch requests which are inserted during quiescing */
        blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

void blk_mq_wake_waiters(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;

        queue_for_each_hw_ctx(q, hctx, i)
                if (blk_mq_hw_queue_mapped(hctx))
                        blk_mq_tag_wakeup_all(hctx->tags, true);
}

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
        return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}

/*
 * blk_mq_rq_ctx_init - initialise the request backing an allocated tag
 * @data: allocation context (queue, ctx, hctx, flags, cmd_flags)
 * @tag: tag obtained from the tag allocator; indexes tags->static_rqs[]
 * @alloc_time_ns: allocation start time stamp, stored only when
 *                 CONFIG_BLK_RQ_ALLOC_TIME is enabled
 *
 * Resets the per-request state of the request found at @tag, bumps the
 * ctx->rq_dispatched[] and hctx->queued counters and, for non-flush
 * requests on a queue whose elevator provides ->prepare_request(), lets
 * the elevator prepare the request and marks it RQF_ELVPRIV.
 *
 * Returns the initialised request with its reference count set to one.
 */
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                unsigned int tag, u64 alloc_time_ns)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];

        /*
         * With an elevator the allocated tag is recorded as ->internal_tag
         * and ->tag stays unset; without one it is used as ->tag directly.
         */
        if (data->q->elevator) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
        }

        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
        rq->rq_flags = 0;
        rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PM)
                rq->rq_flags |= RQF_PM;
        if (blk_queue_io_stat(data->q))
                rq->rq_flags |= RQF_IO_STAT;
        INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        rq->alloc_time_ns = alloc_time_ns;
#endif
        /* rq_flags must be final here: the check below reads RQF_IO_STAT */
        if (blk_mq_need_time_stamp(rq))
                rq->start_time_ns = ktime_get_ns();
        else
                rq->start_time_ns = 0;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
#endif
        blk_crypto_rq_set_defaults(rq);
        /* tag was already set */
        WRITE_ONCE(rq->deadline, 0);

        rq->timeout = 0;

        rq->end_io = NULL;
        rq->end_io_data = NULL;

        /* per-ctx dispatch accounting, split into async [0] and sync [1] */
        data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
        refcount_set(&rq->ref, 1);

        /* flush requests never get elevator-private data */
        if (!op_is_flush(data->cmd_flags)) {
                struct elevator_queue *e = data->q->elevator;

                rq->elv.icq = NULL;
                if (e && e->type->ops.prepare_request) {
                        if (e->type->icq_cache)
                                blk_mq_sched_assign_ioc(rq);

                        e->type->ops.prepare_request(rq);
                        rq->rq_flags |= RQF_ELVPRIV;
                }
        }

        data->hctx->queued++;
        return rq;
}

/*
 * __blk_mq_alloc_request - allocate a tag and set up its request
 * @data: allocation context; ->q, ->flags and ->cmd_flags are read here,
 *        ->ctx and ->hctx are (re)assigned on every attempt
 *
 * Returns the initialised request, or NULL when no tag is available and
 * the allocation is BLK_MQ_REQ_NOWAIT.  Any other tag allocation failure
 * sleeps briefly and retries, starting again from the ctx/hctx lookup.
 */
static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
{
        struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
        u64 alloc_time_ns = 0;
        unsigned int tag;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = ktime_get_ns();

        /* REQ_NOWAIT on the command makes the allocation non-blocking too */
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;

        if (e) {
                /*
                 * Flush requests are special and go directly to the
                 * dispatch list. Don't include reserved tags in the
                 * limiting, as it isn't useful.
                 */
                if (!op_is_flush(data->cmd_flags) &&
                    e->type->ops.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
                        e->type->ops.limit_depth(data->cmd_flags, data);
        }

retry:
        data->ctx = blk_mq_get_ctx(q);
        data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
        if (!e)
                blk_mq_tag_busy(data->hctx);

        /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
         * should have migrated us to an online CPU by now.
         */
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;

                /*
                 * Give up the CPU and sleep for a short time to ensure
                 * that threads using a realtime scheduling class are migrated
                 * off the CPU, and thus off the hctx that is going away.
                 */
                msleep(3);
                goto retry;
        }
        return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
}

/*
 * blk_mq_alloc_request - allocate a request on @q
 * @q: request queue to allocate from
 * @op: operation and flags, used as the request's cmd_flags
 * @flags: BLK_MQ_REQ_* allocation flags
 *
 * Enters the queue, allocates a request and clears its data length,
 * sector and bio pointers.  On success the queue reference taken here
 * stays with the request.
 *
 * Returns the request, the blk_queue_enter() error wrapped in ERR_PTR()
 * if the queue cannot be entered, or ERR_PTR(-EWOULDBLOCK) if no request
 * could be allocated.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                blk_mq_req_flags_t flags)
{
        struct blk_mq_alloc_data data = {
                .q = q,
                .flags = flags,
                .cmd_flags = op,
        };
        struct request *rq;
        int ret;

        ret = blk_queue_enter(q, flags);
        if (ret)
                return ERR_PTR(ret);

        rq = __blk_mq_alloc_request(&data);
        if (!rq) {
                /* nothing allocated: hand the queue reference back */
                blk_queue_exit(q);
                return ERR_PTR(-EWOULDBLOCK);
        }

        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

/*
 * blk_mq_alloc_request_hctx - allocate a request on a given hardware queue
 * @q: request queue to allocate from
 * @op: operation and flags, used as the request's cmd_flags
 * @flags: allocation flags; must contain both BLK_MQ_REQ_NOWAIT and
 *         BLK_MQ_REQ_RESERVED
 * @hctx_idx: index into q->queue_hw_ctx[] of the hardware queue to use
 *
 * Returns the initialised request, or an ERR_PTR():
 *   -EINVAL       @flags lacks NOWAIT or RESERVED (a warning is printed once)
 *   -EIO          @hctx_idx is not below q->nr_hw_queues
 *   (enter error) whatever blk_queue_enter() returned
 *   -EXDEV        the hardware queue is unmapped or has no online CPU
 *   -EWOULDBLOCK  no tag was available
 */
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .flags                = flags,
                .cmd_flags        = op,
        };
        u64 alloc_time_ns = 0;
        unsigned int cpu;
        unsigned int tag;
        int ret;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = ktime_get_ns();

        /*
         * If the tag allocator sleeps we could get an allocation for a
         * different hardware context.  No need to complicate the low level
         * allocator for this for the rare use case of a command tied to
         * a specific queue.
         */
        if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
            WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
                return ERR_PTR(-EINVAL);

        if (hctx_idx >= q->nr_hw_queues)
                return ERR_PTR(-EIO);

        ret = blk_queue_enter(q, flags);
        if (ret)
                return ERR_PTR(ret);

        /*
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
        ret = -EXDEV;
        data.hctx = q->queue_hw_ctx[hctx_idx];
        if (!blk_mq_hw_queue_mapped(data.hctx))
                goto out_queue_exit;
        /* use the software queue of the first online CPU mapped to this hctx */
        cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
        if (cpu >= nr_cpu_ids)
                goto out_queue_exit;
        data.ctx = __blk_mq_get_ctx(q, cpu);

        if (!q->elevator)
                blk_mq_tag_busy(data.hctx);

        ret = -EWOULDBLOCK;
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
        return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);

out_queue_exit:
        /* every failure after blk_queue_enter() must drop the queue again */
        blk_queue_exit(q);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

/*
 * Release the resources held by @rq.  blk_mq_free_request() calls this
 * once the request's reference count has dropped to zero.
 *
 * Puts back the driver tag and the internal (scheduler) tag, whichever
 * are set, calls blk_mq_sched_restart() on the hardware queue and finally
 * blk_queue_exit() on the queue.
 */
static void __blk_mq_free_request(struct request *rq)
{
        /* sample everything needed below before the request is torn down */
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;

        blk_crypto_free_request(rq);
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;
        if (rq->tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
        if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
}

/*
 * blk_mq_free_request - drop the caller's reference on a request
 * @rq: request to free
 *
 * Runs the elevator's ->finish_request() hook and puts the io_context for
 * requests marked RQF_ELVPRIV, updates the completion accounting, calls
 * rq_qos_done(), moves the request to MQ_RQ_IDLE and drops one reference.
 * The request's resources are only released when that was the last
 * reference.
 */
void blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        /* undo what the elevator set up when the request was initialised */
        if (rq->rq_flags & RQF_ELVPRIV) {
                if (e && e->type->ops.finish_request)
                        e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
                        rq->elv.icq = NULL;
                }
        }

        /* per-ctx completion accounting, split into async [0] and sync [1] */
        ctx->rq_completed[rq_is_sync(rq)]++;
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                __blk_mq_dec_active_requests(hctx);

        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->backing_dev_info);

        rq_qos_done(q, rq);

        /* mark the request idle before the reference is dropped */
        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

/*
 * __blk_mq_end_request - finish a request
 * @rq: the request being ended
 * @error: completion status, passed on to ->end_io() if one is set
 *
 * Records statistics and accounting using a single time stamp (zero when
 * no time stamping is needed), then either hands the request to its
 * ->end_io() callback or frees it.
 */
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
        u64 now = blk_mq_need_time_stamp(rq) ? ktime_get_ns() : 0;

        if (rq->rq_flags & RQF_STATS) {
                blk_mq_poll_stats_start(rq->q);
                blk_stat_add(rq, now);
        }

        blk_mq_sched_completed_request(rq, now);

        blk_account_io_done(rq, now);

        if (!rq->end_io) {
                /* nobody owns the completion: release the request here */
                blk_mq_free_request(rq);
                return;
        }

        /* the freeing path is skipped, so signal rq_qos completion here */
        rq_qos_done(rq->q, rq);
        rq->end_io(rq, error);
}
EXPORT_SYMBOL(__blk_mq_end_request);

/*
 * blk_mq_end_request - complete all bytes of a request and end it
 * @rq: the request being ended
 * @error: completion status
 *
 * Updates the request for its full blk_rq_bytes() length and then ends
 * it.  A non-zero return from blk_update_request() is treated as a fatal
 * inconsistency and triggers BUG().
 */
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
        if (blk_update_request(rq, error, blk_rq_bytes(rq)))
                BUG();
        __blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

/*
 * Softirq action handler - move entries to local list and loop over them
 * while passing them to the queue registered handler.
 */
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
        struct list_head *cpu_list, local_list;

        local_irq_disable();
        cpu_list = this_cpu_ptr(&blk_cpu_done);
        list_replace_init(cpu_list, &local_list);
        local_irq_enable();

        while (!list_empty(&local_list)) {
                struct request *rq;

                rq = list_entry(local_list.next, struct request, ipi_list);
                list_del_init(&rq->ipi_list);
                rq->q->mq_ops->complete(rq);
        }
}

/*
 * Queue @rq at the tail of the current CPU's blk_cpu_done list so that
 * blk_done_softirq() completes it.  Interrupts are disabled around the
 * list update and restored to their previous state afterwards.
 */
static void blk_mq_trigger_softirq(struct request *rq)
{
        struct list_head *list;
        unsigned long flags;

        local_irq_save(flags);
        list = this_cpu_ptr(&blk_cpu_done);
        list_add_tail(&rq->ipi_list, list);

        /*
         * If the list only contains our just added request, signal a raise of
         * the softirq.  If there are already entries there, someone already
         * raised the irq but it hasn't run yet.
         */
        if (list->next == &rq->ipi_list)
                raise_softirq_irqoff(BLOCK_SOFTIRQ);
        local_irq_restore(flags);
}

/*
 * Take over the pending completions of @cpu.
 *
 * NOTE(review): presumably registered as the CPU hotplug "dead" callback;
 * the registration is not visible here.
 *
 * Always returns 0.
 */
static int blk_softirq_cpu_dead(unsigned int cpu)
{
        /*
         * If a CPU goes away, splice its entries to the current CPU
         * and trigger a run of the softirq
         */
        local_irq_disable();
        list_splice_init(&per_cpu(blk_cpu_done, cpu),
                         this_cpu_ptr(&blk_cpu_done));
        raise_softirq_irqoff(BLOCK_SOFTIRQ);
        local_irq_enable();

        return 0;
}


/*
 * Cross-CPU call target installed by blk_mq_complete_request_remote();
 * @data is the request to complete.
 */
static void __blk_mq_complete_request_remote(void *data)
{
        struct request *rq = data;

        /*
         * Single queue controllers mostly have just one irq vector for I/O
         * completion, with its affinity set to all possible CPUs.  On most
         * architectures that means the irq ends up being handled on one
         * specific CPU.
         *
         * Defer the completion to softirq context for such devices so that
         * irqs-off latency does not degrade I/O performance.
         */
        if (rq->q->nr_hw_queues == 1) {
                blk_mq_trigger_softirq(rq);
                return;
        }

        rq->q->mq_ops->complete(rq);
}

/*
 * Decide whether completing @rq should be redirected to the CPU that
 * submitted it (rq->mq_ctx->cpu) rather than done on the current CPU.
 */
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
        const int this_cpu = raw_smp_processor_id();

        /* redirection needs SMP and has to be enabled on the queue */
        if (!IS_ENABLED(CONFIG_SMP))
                return false;
        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
                return false;

        /* already on the submitting CPU: complete locally */
        if (this_cpu == rq->mq_ctx->cpu)
                return false;

        /* sharing a cache is close enough unless SAME_FORCE is set */
        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
            cpus_share_cache(this_cpu, rq->mq_ctx->cpu))
                return false;

        /* don't try to IPI to an offline CPU */
        return cpu_online(rq->mq_ctx->cpu);
}

/*
 * blk_mq_complete_request_remote - try to hand off completion of a request
 * @rq: the request being completed
 *
 * Marks @rq as MQ_RQ_COMPLETE and then either sends its completion to the
 * submitting CPU, defers it to softirq on this CPU (single hardware queue
 * only), or leaves it to the caller.
 *
 * Returns true when the completion has been handed off, false when the
 * caller has to invoke the ->complete() handler itself.
 */
bool blk_mq_complete_request_remote(struct request *rq)
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

        /*
         * For a polled request, always complete locally, it's pointless
         * to redirect the completion.
         */
        if (rq->cmd_flags & REQ_HIPRI)
                return false;

        if (blk_mq_complete_need_ipi(rq)) {
                rq->csd.func = __blk_mq_complete_request_remote;
                rq->csd.info = rq;
                rq->csd.flags = 0;
                smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
                return true;
        }

        /* multiple hardware queues: the caller completes right here */
        if (rq->q->nr_hw_queues > 1)
                return false;

        blk_mq_trigger_softirq(rq);
        return true;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:                the request being processed
 *
 * Description:
 *        Complete a request by scheduling the ->complete_rq operation.
 *        If the completion cannot be handed off to another context, the
 *        queue's ->complete() handler is called directly.
 **/
void blk_mq_complete_request(struct request *rq)
{
        if (blk_mq_complete_request_remote(rq))
                return;

        rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

/*
 * Leave the read-side section entered by hctx_lock(): SRCU (using the
 * index hctx_lock() returned) for BLK_MQ_F_BLOCKING hardware queues,
 * plain RCU for all others.
 */
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
        __releases(hctx->srcu)
{
        if (hctx->flags & BLK_MQ_F_BLOCKING)
                srcu_read_unlock(hctx->srcu, srcu_idx);
        else
                rcu_read_unlock();
}

/*
 * Enter the read-side section that protects dispatch on @hctx: SRCU for
 * BLK_MQ_F_BLOCKING hardware queues, plain RCU for all others.
 * *@srcu_idx receives the SRCU index to pass to hctx_unlock(), or 0 in
 * the RCU case.
 */
static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
        __acquires(hctx->srcu)
{
        if (hctx->flags & BLK_MQ_F_BLOCKING) {
                *srcu_idx = srcu_read_lock(hctx->srcu);
                return;
        }

        /* shut up gcc false positive */
        *srcu_idx = 0;
        rcu_read_lock();
}

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 *
 * The request is expected to be in the MQ_RQ_IDLE state (a warning is
 * printed once otherwise) and is moved to MQ_RQ_IN_FLIGHT.
 */
void blk_mq_start_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        trace_block_rq_issue(rq);

        /* stats enabled on the queue: record issue time and request size */
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                rq->io_start_time_ns = ktime_get_ns();
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
        }

        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

        /* the timer is added before the request is marked in flight */
        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

#ifdef CONFIG_BLK_DEV_INTEGRITY
        /* writes carrying integrity data go through the profile's prepare hook */
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
#endif
}
EXPORT_SYMBOL(blk_mq_start_request);

/*
 * Prepare @rq for being requeued: give up its driver tag, emit the
 * requeue trace event and rq_qos notification and, if the request had
 * already been started, put it back to MQ_RQ_IDLE and clear
 * RQF_TIMED_OUT.
 */
static void __blk_mq_requeue_request(struct request *rq)
{
        struct request_queue *queue = rq->q;

        blk_mq_put_driver_tag(rq);

        trace_block_rq_requeue(rq);
        rq_qos_requeue(queue, rq);

        /* a request that was never started has no state to roll back */
        if (!blk_mq_request_started(rq))
                return;

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        rq->rq_flags &= ~RQF_TIMED_OUT;
}

/*
 * blk_mq_requeue_request - put a request back for later re-dispatch
 * @rq: request to requeue
 * @kick_requeue_list: passed through to blk_mq_add_to_requeue_list()
 *
 * Resets the request's started state, notifies the I/O scheduler and adds
 * the request to the head of the queue's requeue list.
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
        __blk_mq_requeue_request(rq);

        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);

        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, requeue_work.work);
        LIST_HEAD(rq_list);
        struct request *rq, *next;

        spin_lock_irq(&q->requeue_lock);
        list_splice_init(&q->requeue_list, &rq_list);
        spin_unlock_irq(&q->requeue_lock);

        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
                if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
                        continue;

                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
                /*
                 * If RQF_DONTPREP, rq has contained some driver specific
                 * data, so insert it to hctx dispatch list to avoid any
                 * merge.
                 */
                if (rq->rq_flags & RQF_DONTPREP)
                        blk_mq_request_bypass_insert(rq, false, false);
                else
                        blk_mq_sched_insert_request(rq, true, false, false);
        }

        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                blk_mq_sched_insert_request(rq, false, false, false);
        }

        blk_mq_run_hw_queues(q, false);
}

/*
 * Put @rq on its request queue's requeue list, at the head when @at_head is
 * true and at the tail otherwise, then kick requeue processing if
 * @kick_requeue_list is set.
 */
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
                                bool kick_requeue_list)
{
        struct request_queue *q = rq->q;
        unsigned long irq_flags;

        /*
         * RQF_SOFTBARRIER is otherwise an I/O scheduler flag; it is borrowed
         * here to tell the requeue work that head insertion is wanted, so it
         * must not already be set on entry.
         */
        BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

        spin_lock_irqsave(&q->requeue_lock, irq_flags);
        if (!at_head) {
                list_add_tail(&rq->queuelist, &q->requeue_list);
        } else {
                rq->rq_flags |= RQF_SOFTBARRIER;
                list_add(&rq->queuelist, &q->requeue_list);
        }
        spin_unlock_irqrestore(&q->requeue_lock, irq_flags);

        if (kick_requeue_list)
                blk_mq_kick_requeue_list(q);
}

/*
 * Schedule @q's requeue work with zero delay, not bound to a particular CPU
 * (WORK_CPU_UNBOUND).
 */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

/*
 * Schedule @q's requeue work to run after @msecs milliseconds, not bound to
 * a particular CPU (WORK_CPU_UNBOUND).
 */
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
                                    unsigned long msecs)
{
        unsigned long delay = msecs_to_jiffies(msecs);

        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, delay);
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

/*
 * Look up the request recorded for @tag in @tags.
 *
 * Returns NULL when @tag is not below tags->nr_tags; otherwise returns the
 * tags->rqs[] entry after issuing a prefetch for it.
 */
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
        if (tag >= tags->nr_tags)
                return NULL;

        prefetch(tags->rqs[tag]);
        return tags->rqs[tag];
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

/*
 * Tag iterator callback used by blk_mq_queue_inflight(); @priv points to the
 * caller's bool result.
 *
 * A started request that belongs to @hctx's queue proves the queue is busy:
 * record that and return false to stop the iteration. In every other case
 * return true so the iteration carries on.
 */
static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
                               void *priv, bool reserved)
{
        bool *busy = priv;

        if (!blk_mq_request_started(rq) || rq->q != hctx->queue)
                return true;

        *busy = true;
        return false;
}

/*
 * Report whether @q has at least one started request: walks the queue's
 * busy tags with blk_mq_rq_inflight() and returns the flag that callback
 * sets (false if it never fires).
 */
bool blk_mq_queue_inflight(struct request_queue *q)
{
        bool busy = false;

        blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
        return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

/*
 * Handle a request that was found to be expired.
 *
 * Marks @req with RQF_TIMED_OUT and, if the driver provides a ->timeout
 * handler, calls it with @reserved. A BLK_EH_DONE result ends processing
 * here; otherwise the timer is re-armed with blk_add_timer(). The timer is
 * also re-armed when the driver has no ->timeout handler.
 */
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
        req->rq_flags |= RQF_TIMED_OUT;
        if (req->q->mq_ops->timeout) {
                enum blk_eh_timer_return ret;

                ret = req->q->mq_ops->timeout(req, reserved);
                if (ret == BLK_EH_DONE)
                        return;
                /* The only other result expected is BLK_EH_RESET_TIMER. */
                WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
        }

        blk_add_timer(req);
}

/*
 * Decide whether @rq has timed out.
 *
 * Only in-flight requests not already marked RQF_TIMED_OUT are considered.
 * Returns true once jiffies has reached the request's deadline. Otherwise
 * returns false and, for such a still-pending request, sets *@next to this
 * deadline when *@next is 0 (treated as "unset") or lies after it.
 */
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
        unsigned long expiry;

        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT ||
            (rq->rq_flags & RQF_TIMED_OUT))
                return false;

        expiry = READ_ONCE(rq->deadline);
        if (time_after_eq(jiffies, expiry))
                return true;

        /* Track the earliest pending deadline seen so far. */
        if (*next == 0 || time_after(*next, expiry))
                *next = expiry;
        return false;
}

/*
 * Drop a reference on @rq.
 *
 * Requests for which is_flush_rq() is true are handed to their ->end_io
 * callback with status 0. For all others the refcount is decremented and the
 * request is freed once it reaches zero.
 */
void blk_mq_put_rq_ref(struct request *rq)
{
        if (is_flush_rq(rq)) {
                rq->end_io(rq, 0);
                return;
        }

        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
}

/*
 * Tag iterator callback used by the timeout work; @priv points to the
 * unsigned long that tracks the next deadline to wait for. Runs timeout
 * handling on @rq if it has expired and always returns true.
 */
static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                struct request *rq, void *priv, bool reserved)
{
        unsigned long *next = priv;

        /*
         * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
         * be reallocated underneath the timeout handler's processing, then
         * the expire check is reliable. If the request is not expired, then
         * it was completed and reallocated as a new request after returning
         * from blk_mq_check_expired().
         */
        if (blk_mq_req_expired(rq, next))
                blk_mq_rq_timed_out(rq, reserved);
        return true;
}

/*
 * Work handler for request timeouts on a queue.
 *
 * Checks every busy request for expiry. If some request is still pending,
 * the queue's timeout timer is moved to the earliest remaining deadline;
 * otherwise every mapped hardware context is marked idle. Does nothing if a
 * queue usage reference cannot be obtained.
 */
static void blk_mq_timeout_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, timeout_work);
        unsigned long next = 0;
        struct blk_mq_hw_ctx *hctx;
        int i;

        /* A deadlock might occur if a request is stuck requiring a
         * timeout at the same time a queue freeze is waiting
         * completion, since the timeout code would not be able to
         * acquire the queue reference here.
         *
         * That's why we don't use blk_queue_enter here; instead, we use
         * percpu_ref_tryget directly, because we need to be able to
         * obtain a reference even in the short window between the queue
         * starting to freeze, by dropping the first reference in
         * blk_freeze_queue_start, and the moment the last request is
         * consumed, marked by the instant q_usage_counter reaches
         * zero.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        /* 'next' stays 0 unless a not-yet-expired in-flight request is seen. */
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);

        if (next != 0) {
                mod_timer(&q->timeout, next);
        } else {
                /*
                 * Request timeouts are handled as a forward rolling timer. If
                 * we end up here it means that no requests are pending and
                 * also that no request has been pending for a while. Mark
                 * each hctx as idle.
                 */
                queue_for_each_hw_ctx(q, hctx, i) {
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx))
                                blk_mq_tag_idle(hctx);
                }
        }
        /* Drop the usage reference taken by percpu_ref_tryget() above. */
        blk_queue_exit(q);
}

/* Argument bundle passed to flush_busy_ctx() via sbitmap_for_each_set(). */
struct flush_busy_ctx_data {
        /* hardware context whose software queues are being flushed */
        struct blk_mq_hw_ctx *hctx;
        /* destination list that receives the spliced requests */
        struct list_head *list;
};

/*
 * sbitmap_for_each_set() callback: move every request queued on the software
 * context at bit @bitnr (for this hctx's type) to the tail of the caller's
 * list and clear the bit, all under ctx->lock. Always returns true.
 */
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_busy_ctx_data *args = data;
        struct blk_mq_ctx *ctx = args->hctx->ctxs[bitnr];
        struct list_head *pending = &ctx->rq_lists[args->hctx->type];

        spin_lock(&ctx->lock);
        list_splice_tail_init(pending, args->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&ctx->lock);
        return true;
}

/*
 * Process software queues that have been marked busy in @hctx->ctx_map,
 * splicing their pending requests onto the tail of the caller-supplied
 * @list for dispatch.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
                .list = list,
        };

        /* flush_busy_ctx() runs once for every set bit in the map. */
        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

/* Argument bundle passed to dispatch_rq_from_ctx() by the sbitmap walk. */
struct dispatch_rq_data {
        /* hardware context whose software queues are searched */
        struct blk_mq_hw_ctx *hctx;
        /* request that was dequeued, or NULL while none has been found */
        struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
                void *data)
{
        struct dispatch_rq_data *dispatch_data = data;
        struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
        enum hctx_type type = hctx->type;

        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
                list_del_init(&dispatch_data->rq->queuelist);
                if (list_empty(&ctx->rq_lists[type]))
                        sbitmap_clear_bit(sb, bitnr);
        }
        spin_unlock(&ctx->lock);

        return !dispatch_data->rq;
}

/*
 * Dequeue one request from the software queues mapped to @hctx.
 *
 * The walk over @hctx->ctx_map starts at the bit given by @start's index for
 * this hctx type, or at bit 0 when @start is NULL. Returns the dequeued
 * request, or NULL if none was found.
 */
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start)
{
        struct dispatch_rq_data data = {
                .hctx = hctx,
                .rq   = NULL,
        };
        unsigned int first_bit = 0;

        if (start)
                first_bit = start->index_hw[hctx->type];

        __sbitmap_for_each_set(&hctx->ctx_map, first_bit,
                               dispatch_rq_from_ctx, &data);

        return data.rq;
}

/*
 * Map a count of queued requests to an index: 0 stays 0, and any other
 * value maps to ilog2(queued) + 1, capped at BLK_MQ_MAX_DISPATCH_ORDER - 1.
 */
static inline unsigned int queued_to_index(unsigned int queued)
{
        return queued ?
                min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1) : 0;
}

static bool __blk_mq_get_driver_tag(struct request *rq)
{
        struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
        int tag;

        blk_mq_tag_busy(rq->mq_hctx);

        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                bt = rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                if (!hctx_may_queue(rq->mq_hctx, bt))
                        return false;
        }

        tag = __sbitmap_queue_get(bt);
        if (tag == BLK_MQ_NO_TAG)
                return false;

        rq->tag = tag + tag_offset;
        return true;
}

/*
 * Make sure @rq owns a driver tag and is recorded in its hctx's tag table.
 *
 * A tag is allocated only if the request does not have one yet. On hardware
 * contexts with BLK_MQ_F_TAG_QUEUE_SHARED, the request is additionally
 * marked RQF_MQ_INFLIGHT and counted as active, once.
 *
 * Returns false if a tag was needed and could not be allocated.
 */
static bool blk_mq_get_driver_tag(struct request *rq)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (rq->tag == BLK_MQ_NO_TAG) {
                if (!__blk_mq_get_driver_tag(rq))
                        return false;
        }

        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) {
                if (!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
                        rq->rq_flags |= RQF_MQ_INFLIGHT;
                        __blk_mq_inc_active_requests(hctx);
                }
        }

        hctx->tags->rqs[rq->tag] = rq;
        return true;
}

/*
 * Wait-queue callback for a hardware context's dispatch_wait entry.
 *
 * If the entry is still queued, it is removed and the tag bitmap's ws_active
 * count is dropped, all under dispatch_wait_lock. The hardware queue is then
 * run asynchronously. Always returns 1.
 */
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                int flags, void *key)
{
        struct blk_mq_hw_ctx *hctx =
                container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

        spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
                list_del_init(&wait->entry);
                atomic_dec(&hctx->tags->bitmap_tags->ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);

        blk_mq_run_hw_queue(hctx, true);
        return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 *
 * Returns true if a driver tag was obtained for @rq after all, false if the
 * hardware context is (or already was) left waiting for one.
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
{
        struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;

        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
                blk_mq_sched_mark_restart_hctx(hctx);

                /*
                 * It's possible that a tag was freed in the window between the
                 * allocation failure and adding the hardware queue to the wait
                 * queue.
                 *
                 * Don't clear RESTART here, someone else could have set it.
                 * At most this will cost an extra queue run.
                 */
                return blk_mq_get_driver_tag(rq);
        }

        /* Lockless early exit: the entry is already on a wait queue. */
        wait = &hctx->dispatch_wait;
        if (!list_empty_careful(&wait->entry))
                return false;

        wq = &bt_wait_ptr(sbq, hctx)->wait;

        /* Lock order: wait-queue lock first, then dispatch_wait_lock. */
        spin_lock_irq(&wq->lock);
        spin_lock(&hctx->dispatch_wait_lock);
        /* Re-check under the locks in case someone queued it meanwhile. */
        if (!list_empty(&wait->entry)) {
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        atomic_inc(&sbq->ws_active);
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq, wait);

        /*
         * Add one explicit barrier since blk_mq_get_driver_tag() may
         * not imply barrier in case of failure.
         *
         * Order adding us to wait queue and allocating driver tag.
         *
         * The pair is the one implied in sbitmap_queue_wake_up() which
         * orders clearing sbitmap tag bits and waitqueue_active() in
         * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
         *
         * Otherwise, re-order of adding wait queue and getting driver tag
         * may cause __sbitmap_queue_wake_up() to wake up nothing because
         * the waitqueue_active() may not observe us in wait queue.
         */
        smp_mb();

        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
        ret = blk_mq_get_driver_tag(rq);
        if (!ret) {
                /* Still no tag: stay on the wait queue and report failure. */
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        /*
         * We got a tag, remove ourselves from the wait queue to ensure
         * someone else gets the wakeup.
         */
        list_del_init(&wait->entry);
        atomic_dec(&sbq->ws_active);
        spin_unlock(&hctx->dispatch_wait_lock);
        spin_unlock_irq(&wq->lock);

        return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Maintain hctx->dispatch_busy as an Exponential Weighted Moving Average:
 * - each update keeps 7/8 of the previous value, so it decays exponentially
 * - a busy sample adds (1 << 4) before the division by 8; the scaling keeps
 *   the result from becoming too small (0)
 * - nothing is written when the average is already 0 and the sample is idle
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
        unsigned int avg = hctx->dispatch_busy;
        unsigned int sample = busy ? 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR : 0;

        if (!avg && !busy)
                return;

        hctx->dispatch_busy =
                (avg * (BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1) + sample) /
                BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
}

#define BLK_MQ_RESOURCE_DELAY        3                /* ms units */

/*
 * Put @rq back at the head of @list after the driver reported a resource
 * shortage for it, and reset its dispatch state for a later retry.
 */
static void blk_mq_handle_dev_resource(struct request *rq,
                                       struct list_head *list)
{
        /*
         * If an I/O scheduler has been configured and we got a driver tag for
         * the next request already, free it.
         */
        if (!list_empty(list))
                blk_mq_put_driver_tag(list_first_entry(list, struct request,
                                                       queuelist));

        list_add(&rq->queuelist, list);
        __blk_mq_requeue_request(rq);
}

/*
 * Set @rq aside on @zone_list and reset its dispatch state so that it can be
 * retried later.
 */
static void blk_mq_handle_zone_resource(struct request *rq,
                                        struct list_head *zone_list)
{
        /*
         * If we end up here it is because we cannot dispatch a request to a
         * specific zone due to LLD level zone-write locking or other zone
         * related resource not being available. In this case, set the request
         * aside in zone_list for retrying it later.
         */
        list_add(&rq->queuelist, zone_list);
        __blk_mq_requeue_request(rq);
}

/* Outcome of preparing a single request for dispatch. */
enum prep_dispatch {
        /* budget (if needed) and driver tag are both held */
        PREP_DISPATCH_OK,
        /* no driver tag could be obtained, even after marking a tag wait */
        PREP_DISPATCH_NO_TAG,
        /* the dispatch budget could not be obtained */
        PREP_DISPATCH_NO_BUDGET,
};

/*
 * Acquire what @rq needs before it can be handed to the driver: a dispatch
 * budget (only when @need_budget is set) and a driver tag.
 *
 * Returns PREP_DISPATCH_OK when both are held, PREP_DISPATCH_NO_BUDGET when
 * the budget could not be taken (any driver tag is released), and
 * PREP_DISPATCH_NO_TAG when no tag could be obtained (a budget taken here is
 * given back).
 */
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
                                                  bool need_budget)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
                blk_mq_put_driver_tag(rq);
                return PREP_DISPATCH_NO_BUDGET;
        }

        if (blk_mq_get_driver_tag(rq))
                return PREP_DISPATCH_OK;

        /*
         * The initial allocation attempt failed, so we need to rerun the
         * hardware queue when a tag is freed. The waitqueue takes care of
         * that. If the queue is run before the caller adds this entry back
         * on the dispatch list, the caller re-runs it.
         */
        if (blk_mq_mark_tag_wait(hctx, rq))
                return PREP_DISPATCH_OK;

        /*
         * Budgets that were not obtained by this function are released
         * together by the caller when it handles the partial dispatch.
         */
        if (need_budget)
                blk_mq_put_dispatch_budget(rq->q);
        return PREP_DISPATCH_NO_TAG;
}

/*
 * Give back @nr_budgets dispatch budgets on @q, one
 * blk_mq_put_dispatch_budget() call per budget. Used by
 * blk_mq_dispatch_rq_list() for budgets that were allocated before it was
 * called and were left unused.
 */
static void blk_mq_release_budgets(struct request_queue *q,
                unsigned int nr_budgets)
{
        /* Same type as the count: no signed/unsigned comparison. */
        unsigned int i;

        for (i = 0; i < nr_budgets; i++)
                blk_mq_put_dispatch_budget(q);
}

/*
 * Send the requests on @list to the driver through ->queue_rq().
 *
 * @hctx: hardware context the requests belong to.
 * @list: requests to dispatch; whatever cannot be issued is moved to
 *        hctx->dispatch before returning.
 * @nr_budgets: number of dispatch budgets already held by the caller. While
 *        this is non-zero no budget is acquired here; it is decremented for
 *        each request passed to the driver, and what remains is released if
 *        requests are left over.
 *
 * Returns true if we did some work AND can potentially do more.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                             unsigned int nr_budgets)
{
        enum prep_dispatch prep;
        struct request_queue *q = hctx->queue;
        struct request *rq, *nxt;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
        LIST_HEAD(zone_list);
        bool needs_resource = false;

        if (list_empty(list))
                return false;

        /*
         * Now process all the entries, sending them to the driver.
         */
        errors = queued = 0;
        do {
                struct blk_mq_queue_data bd;

                rq = list_first_entry(list, struct request, queuelist);

                WARN_ON_ONCE(hctx != rq->mq_hctx);
                /* Acquire a budget here only if no caller budgets remain. */
                prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
                if (prep != PREP_DISPATCH_OK)
                        break;

                list_del_init(&rq->queuelist);

                bd.rq = rq;

                /*
                 * Flag last if we have no more requests, or if we have more
                 * but can't assign a driver tag to it.
                 */
                if (list_empty(list))
                        bd.last = true;
                else {
                        nxt = list_first_entry(list, struct request, queuelist);
                        bd.last = !blk_mq_get_driver_tag(nxt);
                }

                /*
                 * once the request is queued to lld, no need to cover the
                 * budget any more
                 */
                if (nr_budgets)
                        nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_STS_OK:
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
                        needs_resource = true;
                        fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        /* Put rq back at the head of the list and stop. */
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
                case BLK_STS_ZONE_RESOURCE:
                        /*
                         * Move the request to zone_list and keep going through
                         * the dispatch list to find more requests the drive can
                         * accept.
                         */
                        blk_mq_handle_zone_resource(rq, &zone_list);
                        needs_resource = true;
                        break;
                default:
                        /* Any other status ends the request with an I/O error. */
                        errors++;
                        blk_mq_end_request(rq, BLK_STS_IOERR);
                }
        } while (!list_empty(list));
out:
        /* Requests set aside for zone resources rejoin the leftover list. */
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);

        hctx->dispatched[queued_to_index(queued)]++;

        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
        if ((!list_empty(list) || errors || needs_resource ||
             ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
                q->mq_ops->commit_rqs(hctx);
        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
                bool needs_restart;
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);

                /* Return the caller-provided budgets that were not used. */
                blk_mq_release_budgets(q, nr_budgets);

                spin_lock(&hctx->lock);
                list_splice_tail_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);

                /*
                 * Order adding requests to hctx->dispatch and checking
                 * SCHED_RESTART flag. The pair of this smp_mb() is the one
                 * in blk_mq_sched_restart(). Avoid restart code path to
                 * miss the new added requests to hctx->dispatch, meantime
                 * SCHED_RESTART is observed here.
                 */
                smp_mb();

                /*
                 * If SCHED_RESTART was set by the caller of this function and
                 * it is no longer set that means that it was cleared by another
                 * thread and hence that a queue rerun is needed.
                 *
                 * If 'no_tag' is set, that means that we failed getting
                 * a driver tag with an I/O scheduler attached. If our dispatch
                 * waitqueue is no longer active, ensure that we run the queue
                 * AFTER adding our entries back to the list.
                 *
                 * If no I/O scheduler has been configured it is possible that
                 * the hardware queue got stopped and restarted before requests
                 * were pushed back onto the dispatch list. Rerun the queue to
                 * avoid starvation. Notes:
                 * - blk_mq_run_hw_queue() checks whether or not a queue has
                 *   been stopped before rerunning a queue.
                 * - Some but not all block drivers stop a queue before
                 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                 *   and dm-rq.
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
                 * similar if we couldn't get budget or couldn't lock a zone
                 * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (prep == PREP_DISPATCH_NO_BUDGET)
                        needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
                else if (needs_restart && needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

                blk_mq_update_dispatch_busy(hctx, true);
                return false;
        } else
                blk_mq_update_dispatch_busy(hctx, false);

        return (queued + errors) != 0;
}

/**
 * __blk_mq_run_hw_queue - Run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 *
 * Send pending requests to the hardware. Dispatch happens inside the
 * hctx_lock()/hctx_unlock() read-side section.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        int srcu_idx;

        /*
         * We should be running this queue from one of the CPUs that
         * are mapped to it.
         *
         * There are at least two related races now between setting
         * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
         * __blk_mq_run_hw_queue():
         *
         * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
         *   but later it becomes online; in that case this warning is
         *   harmless
         *
         * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
         *   but later it becomes offline; in that case the warning can't
         *   be triggered, and we depend on the blk-mq timeout handler to
         *   handle requests dispatched to this hctx
         */
        if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
                cpu_online(hctx->next_cpu)) {
                printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
                        raw_smp_processor_id(),
                        cpumask_empty(hctx->cpumask) ? "inactive": "active");
                dump_stack();
        }

        /*
         * We can't run the queue inline with ints disabled. Ensure that
         * we catch bad users of this early.
         */
        WARN_ON_ONCE(in_interrupt());

        /* Hardware contexts flagged BLK_MQ_F_BLOCKING may sleep here. */
        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

        hctx_lock(hctx, &srcu_idx);
        blk_mq_sched_dispatch_requests(hctx);
        hctx_unlock(hctx, srcu_idx);
}

/*
 * Return the first CPU in @hctx's cpumask that is also online. If there is
 * none, fall back to the first CPU in the cpumask regardless of its state.
 */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
        int online = cpumask_first_and(hctx->cpumask, cpu_online_mask);

        if (online < nr_cpu_ids)
                return online;

        return cpumask_first(hctx->cpumask);
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU on which to schedule the run work for @hctx, or
 * WORK_CPU_UNBOUND when the queue has a single hardware queue or no online
 * CPU could be selected.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
        bool tried = false;
        int next_cpu = hctx->next_cpu;

        if (hctx->queue->nr_hw_queues == 1)
                return WORK_CPU_UNBOUND;

        /* Move on to another CPU once the current batch is used up. */
        if (--hctx->next_cpu_batch <= 0) {
select_cpu:
                next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
                                cpu_online_mask);
                /* Ran past the last mapped CPU: wrap around to the first. */
                if (next_cpu >= nr_cpu_ids)
                        next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }

        /*
         * Do unbound schedule if we can't find a online CPU for this hctx,
         * and it should only happen in the path of handling CPU DEAD.
         */
        if (!cpu_online(next_cpu)) {
                /* Retry the selection once before giving up. */
                if (!tried) {
                        tried = true;
                        goto select_cpu;
                }

                /*
                 * Make sure to re-select CPU next time once after CPUs
                 * in hctx->cpumask become online again.
                 */
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = 1;
                return WORK_CPU_UNBOUND;
        }

        hctx->next_cpu = next_cpu;
        return next_cpu;
}

/**
 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 * @msecs: Milliseconds of delay to wait before running the queue.
 *
 * If !@async, try to run the queue now. Else, run the queue asynchronously and
 * with a delay of @msecs.
 *
 * Does nothing if @hctx is stopped. The inline run is only attempted for
 * hardware contexts without BLK_MQ_F_BLOCKING and only when the current CPU
 * is in @hctx->cpumask; in every other case the run work is scheduled.
 */
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                                        unsigned long msecs)
{
        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;

        if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
                int cpu = get_cpu();
                /* Run inline only from a CPU that is mapped to this hctx. */
                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
                        __blk_mq_run_hw_queue(hctx);
                        put_cpu();
                        return;
                }

                put_cpu();
        }

        /* Otherwise hand the run to run_work, after @msecs milliseconds. */
        kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
                                    msecs_to_jiffies(msecs));
}

/**
 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
 * @hctx: Pointer to the hardware queue to run.
 * @msecs: Milliseconds of delay to wait before running the queue.
 *
 * Run a hardware queue asynchronously with a delay of @msecs.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
        /* async == true: always schedule the run, never run it inline */
        __blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);

/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Runs the queue, with no delay, only if the request queue is not quiesced
 * and @hctx has requests pending. Both conditions are sampled inside the
 * hctx_lock()/hctx_unlock() section.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        bool runnable;
        int srcu_idx;

        /*
         * While the queue is quiesced we may be switching io scheduler,
         * updating nr_hw_queues, or doing other things; the queue must not
         * be run then, and even __blk_mq_hctx_has_pending() can't be called
         * safely.
         *
         * A quiesced queue will be rerun by blk_mq_unquiesce_queue().
         */
        hctx_lock(hctx, &srcu_idx);
        runnable = !blk_queue_quiesced(hctx->queue) &&
                blk_mq_hctx_has_pending(hctx);
        hctx_unlock(hctx, srcu_idx);

        if (!runnable)
                return;

        __blk_mq_delay_run_hw_queue(hctx, async, 0);
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);

/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Hardware queues that are currently stopped are skipped.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
        struct blk_mq_hw_ctx *hctx;
        int idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!blk_mq_hctx_stopped(hctx))
                        blk_mq_run_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);

/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 *
 * Hardware queues that are currently stopped are skipped.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
        struct blk_mq_hw_ctx *hctx;
        int idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!blk_mq_hctx_stopped(hctx))
                        blk_mq_delay_run_hw_queue(hctx, msecs);
        }
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);

/**
 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
 * @q: request queue.
 *
 * The caller is responsible for serializing this function against
 * blk_mq_{start,stop}_hw_queue().
 */
bool blk_mq_queue_stopped(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                if (blk_mq_hctx_stopped(hctx))
                        return true;

        return false;
}
EXPORT_SYMBOL(blk_mq_queue_stopped);

/*
 * Stop one hardware queue: cancel its delayed run work and set
 * BLK_MQ_S_STOPPED in hctx->state.
 *
 * This function is often used by drivers to pause .queue_rq() when
 * there aren't enough resources or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned in that situation.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        /*
         * The plain (non-_sync) cancel is used, consistent with the
         * "no drain guarantee" caveat above.
         */
        cancel_delayed_work(&hctx->run_work);

        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

/*
 * Start one hardware queue: clear BLK_MQ_S_STOPPED unconditionally and then
 * run the queue with async == false.
 */
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        /* The stopped bit is cleared before the run is requested. */
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

        blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

/*
 * Restart a hardware queue only if it is currently stopped: clear
 * BLK_MQ_S_STOPPED and run the queue, passing @async through to
 * blk_mq_run_hw_queue().  A queue that is not stopped is left untouched.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        /* Nothing to do for a queue that is not stopped. */
        if (!blk_mq_hctx_stopped(hctx))
                return;

        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
        /*
         * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
         * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
         * list in the subsequent routine.
         */
        smp_mb__after_atomic();
        blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);

/*
 * Apply blk_mq_start_stopped_hw_queue() to every hardware queue of @q,
 * forwarding @async unchanged.
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
        struct blk_mq_hw_ctx *hctx;
        int idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                blk_mq_start_stopped_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

/*
 * Work handler for hctx->run_work: recover the owning hardware queue from
 * the embedded work item and run it, unless the queue is stopped.
 */
static void blk_mq_run_work_fn(struct work_struct *work)
{
        struct blk_mq_hw_ctx *hctx =
                container_of(work, struct blk_mq_hw_ctx, run_work.work);

        /* A stopped hardware queue is not run. */
        if (!blk_mq_hctx_stopped(hctx))
                __blk_mq_run_hw_queue(hctx);
}

/*
 * Link @rq into the list of its software queue (rq->mq_ctx) selected by the
 * type of @hctx: at the front when @at_head is true, at the back otherwise.
 * The caller must hold ctx->lock.
 */
static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq,
                                            bool at_head)
{
        enum hctx_type type = hctx->type;
        struct blk_mq_ctx *ctx = rq->mq_ctx;

        lockdep_assert_held(&ctx->lock);

        trace_block_rq_insert(rq);

        if (!at_head) {
                list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
                return;
        }

        list_add(&rq->queuelist, &ctx->rq_lists[type]);
}

/*
 * Insert @rq into the software queue list of rq->mq_ctx (head or tail,
 * depending on @at_head) and mark that ctx as pending on @hctx.
 * The caller must hold ctx->lock.
 */
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                             bool at_head)
{
        struct blk_mq_ctx *ctx = rq->mq_ctx;

        lockdep_assert_held(&ctx->lock);

        __blk_mq_insert_req_list(hctx, rq, at_head);
        blk_mq_hctx_mark_pending(hctx, ctx);
}

/**
 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
 * @rq: Pointer to request to be inserted.
 * @at_head: true if the request should be inserted at the head of the list.
 * @run_queue: If we should run the hardware queue after inserting the request.
 *
 * The request is linked onto the dispatch list of rq->mq_hctx while holding
 * hctx->lock.  When @run_queue is set the hardware queue is then run with
 * async == false.
 *
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
                                  bool run_queue)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        spin_lock(&hctx->lock);
        if (!at_head)
                list_add_tail(&rq->queuelist, &hctx->dispatch);
        else
                list_add(&rq->queuelist, &hctx->dispatch);
        spin_unlock(&hctx->lock);

        if (!run_queue)
                return;

        blk_mq_run_hw_queue(hctx, false);
}

/*
 * Move every request on @list to the tail of the software queue list of
 * @ctx that matches the type of @hctx, and mark @ctx as pending on @hctx.
 * Every request on @list must belong to @ctx; @list is left empty.
 */
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                            struct list_head *list)

{
        struct request *rq;
        enum hctx_type type = hctx->type;

        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
         */
        list_for_each_entry(rq, list, queuelist) {
                /* Every request must have been allocated against @ctx. */
                BUG_ON(rq->mq_ctx != ctx);
                trace_block_rq_insert(rq);
        }

        /* Splice the whole batch and flag the ctx under a single lock hold. */
        spin_lock(&ctx->lock);
        list_splice_tail_init(list, &ctx->rq_lists[type]);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
}

/*
 * list_sort() comparison callback for plugged requests.  Requests are
 * ordered by software queue pointer first, then by hardware queue pointer,
 * and finally by starting position (blk_rq_pos()).  Returns non-zero when
 * @a should be placed after @b.
 */
static int plug_rq_cmp(void *priv, const struct list_head *a,
                       const struct list_head *b)
{
        struct request *lhs = container_of(a, struct request, queuelist);
        struct request *rhs = container_of(b, struct request, queuelist);

        if (lhs->mq_ctx == rhs->mq_ctx) {
                if (lhs->mq_hctx == rhs->mq_hctx)
                        return blk_rq_pos(lhs) > blk_rq_pos(rhs);

                return lhs->mq_hctx > rhs->mq_hctx;
        }

        return lhs->mq_ctx > rhs->mq_ctx;
}

/*
 * Flush the requests held on @plug->mq_list.
 *
 * The plugged requests are moved to a local list, optionally sorted, and
 * then handed to blk_mq_sched_insert_requests() in batches, where each batch
 * is a run of consecutive requests sharing the same hardware queue and the
 * same software queue.  @from_schedule is forwarded to the insert helper and
 * its negation is reported in the unplug trace event.
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
        LIST_HEAD(list);

        if (list_empty(&plug->mq_list))
                return;
        /* Detach everything from the plug; mq_list is left empty. */
        list_splice_init(&plug->mq_list, &list);

        /*
         * Sorting (see plug_rq_cmp()) is only done when more than two
         * requests were plugged and they target more than one queue.
         */
        if (plug->rq_count > 2 && plug->multiple_queues)
                list_sort(NULL, &list, plug_rq_cmp);

        plug->rq_count = 0;

        do {
                struct list_head rq_list;
                struct request *rq, *head_rq = list_entry_rq(list.next);
                struct list_head *pos = &head_rq->queuelist; /* skip first */
                struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
                struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
                unsigned int depth = 1;

                /*
                 * Advance @pos past every following request that shares the
                 * head request's hctx and ctx, counting them in @depth.
                 */
                list_for_each_continue(pos, &list) {
                        rq = list_entry_rq(pos);
                        BUG_ON(!rq->q);
                        if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
                                break;
                        depth++;
                }

                /* Cut the batch [head_rq, pos) off the front of @list. */
                list_cut_before(&rq_list, &list, pos);
                trace_block_unplug(head_rq->q, depth, !from_schedule);
                blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
                                                from_schedule);
        } while(!list_empty(&list));
}

/*
 * Initialise @rq from @bio: copy the start sector and write hint, attach the
 * bio with @nr_segs segments, prepare the blk-crypto state and start I/O
 * accounting.  Read-ahead bios additionally get REQ_FAILFAST_MASK set on
 * the request.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
                unsigned int nr_segs)
{
        int err;

        if (bio->bi_opf & REQ_RAHEAD)
                rq->cmd_flags |= REQ_FAILFAST_MASK;

        rq->__sector = bio->bi_iter.bi_sector;
        rq->write_hint = bio->bi_write_hint;
        blk_rq_bio_prep(rq, bio, nr_segs);

        /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
        err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
        WARN_ON_ONCE(err);

        blk_account_io_start(rq);
}

/*
 * Hand @rq straight to the driver's ->queue_rq() and react to the result:
 *
 *  - BLK_STS_OK: clear the dispatch-busy state and publish the request's
 *    cookie through @cookie.
 *  - BLK_STS_RESOURCE / BLK_STS_DEV_RESOURCE: record the hardware queue as
 *    busy and requeue the request; @cookie is left untouched.
 *  - any other status: clear the dispatch-busy state and store
 *    BLK_QC_T_NONE in @cookie; the caller decides what to do with @rq.
 *
 * @last is passed to the driver as bd.last.  The driver's status is
 * returned unchanged.
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq,
                                            blk_qc_t *cookie, bool last)
{
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
        blk_qc_t new_cookie;
        blk_status_t ret;

        /* The cookie is computed before the request is given to the driver. */
        new_cookie = request_to_qc_t(hctx, rq);

        ret = q->mq_ops->queue_rq(hctx, &bd);

        if (ret == BLK_STS_OK) {
                /* Queued fine, we are done. */
                blk_mq_update_dispatch_busy(hctx, false);
                *cookie = new_cookie;
        } else if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                /* Busy: put the request back as we previously would have. */
                blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
        } else {
                /* Real error: the caller may kill the request. */
                blk_mq_update_dispatch_busy(hctx, false);
                *cookie = BLK_QC_T_NONE;
        }

        return ret;
}

/*
 * Try to issue @rq directly to the driver, falling back to insertion.
 *
 * Direct issue is attempted only when the hardware queue is neither stopped
 * nor quiesced, when there is no elevator (or @bypass_insert is set), and
 * when both a dispatch budget and a driver tag can be obtained.  In that
 * case the result of __blk_mq_issue_directly() is returned.
 *
 * Otherwise, at the "insert" label:
 *  - if @bypass_insert is still true, BLK_STS_RESOURCE is returned and the
 *    request is left for the caller to handle;
 *  - else the request is inserted through blk_mq_sched_insert_request() and
 *    BLK_STS_OK is returned.
 */
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
                                                blk_qc_t *cookie,
                                                bool bypass_insert, bool last)
{
        struct request_queue *q = rq->q;
        bool run_queue = true;

        /*
         * RCU or SRCU read lock is needed before checking quiesced flag.
         *
         * When queue is stopped or quiesced, ignore 'bypass_insert' from
         * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
         * and avoid driver to try to dispatch again.
         */
        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
                run_queue = false;
                bypass_insert = false;
                goto insert;
        }

        /* With an elevator attached, only bypass callers may issue directly. */
        if (q->elevator && !bypass_insert)
                goto insert;

        if (!blk_mq_get_dispatch_budget(q))
                goto insert;

        /* No driver tag: give the budget back before falling back. */
        if (!blk_mq_get_driver_tag(rq)) {
                blk_mq_put_dispatch_budget(q);
                goto insert;
        }

        return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
        if (bypass_insert)
                return BLK_STS_RESOURCE;

        blk_mq_sched_insert_request(rq, false, run_queue, false);

        return BLK_STS_OK;
}

/**
 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
 * @hctx: Pointer of the associated hardware queue.
 * @rq: Pointer to request to be sent.
 * @cookie: Request queue cookie.
 *
 * Attempts a direct issue under the hctx lock.  If the attempt reports a
 * resource shortage (BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE) the request
 * is added to the tail of the hctx->dispatch list and the hardware queue is
 * run, so it is retried later; requests on that list have higher priority.
 * Any other failure status completes the request with that status.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq, blk_qc_t *cookie)
{
        int srcu_idx;
        blk_status_t ret;

        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

        hctx_lock(hctx, &srcu_idx);

        ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
        switch (ret) {
        case BLK_STS_OK:
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
                blk_mq_request_bypass_insert(rq, false, true);
                break;
        default:
                blk_mq_end_request(rq, ret);
                break;
        }

        hctx_unlock(hctx, srcu_idx);
}

/*
 * Try to issue @rq directly on its own hardware queue (rq->mq_hctx) with
 * bypass_insert set, under the hctx lock.  @last is forwarded down to the
 * driver as the "last request of the batch" hint.  The cookie produced by
 * the attempt is discarded.
 *
 * Returns the status of __blk_mq_try_issue_directly(); the caller has to
 * deal with a request that was not accepted.
 */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
        blk_status_t ret;
        int srcu_idx;
        blk_qc_t unused_cookie;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        hctx_lock(hctx, &srcu_idx);
        ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
        hctx_unlock(hctx, srcu_idx);

        return ret;
}

/*
 * Issue the requests on @list directly to the driver, one at a time and in
 * list order.
 *
 * Each request is removed from @list before it is issued.  On a resource
 * shortage (BLK_STS_RESOURCE / BLK_STS_DEV_RESOURCE) the request is put on
 * the dispatch list and processing stops, leaving the remaining requests on
 * @list.  On any other error the request is completed with that status and
 * processing continues.
 */
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list)
{
        int queued = 0;
        int errors = 0;

        while (!list_empty(list)) {
                blk_status_t ret;
                struct request *rq = list_first_entry(list, struct request,
                                queuelist);

                list_del_init(&rq->queuelist);
                /* "last" is true only when nothing else remains on @list. */
                ret = blk_mq_request_issue_directly(rq, list_empty(list));
                if (ret != BLK_STS_OK) {
                        errors++;
                        if (ret == BLK_STS_RESOURCE ||
                                        ret == BLK_STS_DEV_RESOURCE) {
                                /*
                                 * Run the hardware queue only if this was
                                 * the final request of the list.
                                 */
                                blk_mq_request_bypass_insert(rq, false,
                                                        list_empty(list));
                                break;
                        }
                        blk_mq_end_request(rq, ret);
                } else
                        queued++;
        }

        /*
         * If we didn't flush the entire list, we could have told
         * the driver there was more coming, but that turned out to
         * be a lie.
         */
        if ((!list_empty(list) || errors) &&
             hctx->queue->mq_ops->commit_rqs && queued)
                hctx->queue->mq_ops->commit_rqs(hctx);
}

/*
 * Append @rq to the plug list and bump the plugged-request count.  Once the
 * list holds more than one request, the new request's queue is compared with
 * that of the first plugged request; a mismatch latches
 * plug->multiple_queues to true.
 */
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
        struct request *first;

        list_add_tail(&rq->queuelist, &plug->mq_list);
        plug->rq_count++;

        /* Already known to span queues, or @rq is the only entry. */
        if (plug->multiple_queues || list_is_singular(&plug->mq_list))
                return;

        first = list_first_entry(&plug->mq_list, struct request, queuelist);
        if (first->q != rq->q)
                plug->multiple_queues = true;
}

/*
 * Upper bound on the number of requests kept on a plug list.
 *
 * A plug that spans multiple queues is allowed 2x BLK_MAX_REQUEST_COUNT
 * requests; this matters for md arrays, which benefit from merging
 * requests.  Any other plug is limited to BLK_MAX_REQUEST_COUNT.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
        return plug->multiple_queues ? BLK_MAX_REQUEST_COUNT * 2 :
                                       BLK_MAX_REQUEST_COUNT;
}

/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @bio (the request queue is taken from
 * bio->bi_disk->queue) and sends it to the device. The request may not be
 * queued directly to hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 *
 * Returns: Request queue cookie.
 */
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
        struct request_queue *q = bio->bi_disk->queue;
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_mq_alloc_data data = {
                .q                = q,
        };
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        unsigned int nr_segs;
        blk_qc_t cookie;
        blk_status_t ret;

        /* Both helpers take &bio and may replace the bio we continue with. */
        blk_queue_bounce(q, &bio);
        __blk_queue_split(&bio, &nr_segs);

        if (!bio_integrity_prep(bio))
                goto queue_exit;

        /* Try to merge into a plugged request first, then via the scheduler. */
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
            blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
                goto queue_exit;

        if (blk_mq_sched_bio_merge(q, bio, nr_segs))
                goto queue_exit;

        rq_qos_throttle(q, bio);

        data.cmd_flags = bio->bi_opf;
        rq = __blk_mq_alloc_request(&data);
        if (unlikely(!rq)) {
                /* Allocation failed: undo the throttle and fail NOWAIT bios. */
                rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
                        bio_wouldblock_error(bio);
                goto queue_exit;
        }

        trace_block_getrq(q, bio, bio->bi_opf);

        rq_qos_track(q, rq, bio);

        cookie = request_to_qc_t(data.hctx, rq);

        blk_mq_bio_to_request(rq, bio, nr_segs);

        /* On keyslot failure, end the bio with that status and drop the rq. */
        ret = blk_crypto_rq_get_keyslot(rq);
        if (ret != BLK_STS_OK) {
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
                return BLK_QC_T_NONE;
        }

        plug = blk_mq_plug(q, bio);
        if (unlikely(is_flush_fua)) {
                /* Bypass scheduler for flush requests */
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(data.hctx, true);
        } else if (plug && (q->nr_hw_queues == 1 ||
                   blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
                   q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
                /*
                 * Use plugging if we have a ->commit_rqs() hook as well, as
                 * we know the driver uses bd->last in a smart fashion.
                 *
                 * Use normal plugging if this disk is slow HDD, as sequential
                 * IO may benefit a lot from plug merging.
                 */
                unsigned int request_count = plug->rq_count;
                struct request *last = NULL;

                if (!request_count)
                        trace_block_plug(q);
                else
                        last = list_entry_rq(plug->mq_list.prev);

                /*
                 * Flush the plug first when it already holds the maximum
                 * number of requests, or when its most recently added
                 * request has reached BLK_PLUG_FLUSH_SIZE bytes.
                 */
                if (request_count >= blk_plug_max_rq_count(plug) || (last &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
                        trace_block_plug(q);
                }

                blk_add_rq_to_plug(plug, rq);
        } else if (q->elevator) {
                /* Insert the request at the IO scheduler queue */
                blk_mq_sched_insert_request(rq, false, true, true);
        } else if (plug && !blk_queue_nomerges(q)) {
                /*
                 * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
                 * issued. So the plug list will have one request at most
                 * The plug list might get flushed before this. If that happens,
                 * the plug list is empty, and same_queue_rq is invalid.
                 */
                if (list_empty(&plug->mq_list))
                        same_queue_rq = NULL;
                if (same_queue_rq) {
                        list_del_init(&same_queue_rq->queuelist);
                        plug->rq_count--;
                }
                blk_add_rq_to_plug(plug, rq);
                trace_block_plug(q);

                /* Issue the request that was displaced from the plug list. */
                if (same_queue_rq) {
                        data.hctx = same_queue_rq->mq_hctx;
                        trace_block_unplug(q, 1, true);
                        blk_mq_try_issue_directly(data.hctx, same_queue_rq,
                                        &cookie);
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) ||
                        !data.hctx->dispatch_busy) {
                /*
                 * There is no scheduler and we can try to send directly
                 * to the hardware.
                 */
                blk_mq_try_issue_directly(data.hctx, rq, &cookie);
        } else {
                /* Default case. */
                blk_mq_sched_insert_request(rq, false, true, true);
        }

        return cookie;
queue_exit:
        /* No request was created for this bio: leave the queue here. */
        blk_queue_exit(q);
        return BLK_QC_T_NONE;
}

static size_t order_to_size(unsigned int order)
{
        return (size_t)PAGE_SIZE << order;
}

/*
 * Called before freeing request pool in @tags.
 *
 * For every page on @tags->page_list, scan the first set->queue_depth
 * entries of the driver tags' rqs[] table (set->tags[hctx_idx]) and reset to
 * NULL each entry that points into that page, so that no pointer to a
 * request about to be freed remains in the table.
 */
static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
                struct blk_mq_tags *tags, unsigned int hctx_idx)
{
        struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
        struct page *page;
        unsigned long flags;

        list_for_each_entry(page, &tags->page_list, lru) {
                /* [start, end) is the address range covered by this page block. */
                unsigned long start = (unsigned long)page_address(page);
                unsigned long end = start + order_to_size(page->private);
                int i;

                for (i = 0; i < set->queue_depth; i++) {
                        struct request *rq = drv_tags->rqs[i];
                        unsigned long rq_addr = (unsigned long)rq;

                        if (rq_addr >= start && rq_addr < end) {
                                WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
                                /* Only clear the slot if it still holds @rq. */
                                cmpxchg(&drv_tags->rqs[i], rq, NULL);
                        }
                }
        }

        /*
         * Wait until all pending iteration is done.
         *
         * Request reference is cleared and it is guaranteed to be observed
         * after the ->lock is released.
         */
        spin_lock_irqsave(&drv_tags->lock, flags);
        spin_unlock_irqrestore(&drv_tags->lock, flags);
}

/*
 * Free the requests allocated by blk_mq_alloc_rqs() for @tags.
 *
 * Runs the driver's ->exit_request() hook (if any) on every static request,
 * clears stale request pointers via blk_mq_clear_rq_mapping(), and finally
 * returns every page on @tags->page_list to the page allocator.
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
{
        struct page *page;

        /*
         * NOTE(review): the guard tests tags->rqs while the loop walks
         * tags->static_rqs; both arrays are allocated together in
         * blk_mq_alloc_rq_map().
         */
        if (tags->rqs && set->ops->exit_request) {
                int i;

                for (i = 0; i < tags->nr_tags; i++) {
                        struct request *rq = tags->static_rqs[i];

                        /* Slots that never got a request are skipped. */
                        if (!rq)
                                continue;
                        set->ops->exit_request(set, rq, hctx_idx);
                        tags->static_rqs[i] = NULL;
                }
        }

        blk_mq_clear_rq_mapping(set, tags, hctx_idx);

        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
                list_del_init(&page->lru);
                /*
                 * Remove kmemleak object previously allocated in
                 * blk_mq_alloc_rqs().
                 */
                kmemleak_free(page_address(page));
                /* page->private holds the allocation order. */
                __free_pages(page, page->private);
        }
}

/*
 * Release a tag map created by blk_mq_alloc_rq_map(): free the rqs[] and
 * static_rqs[] pointer arrays (resetting both pointers to NULL) and then
 * free the tags structure itself, forwarding @flags to blk_mq_free_tags().
 */
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
{
        kfree(tags->rqs);
        tags->rqs = NULL;
        kfree(tags->static_rqs);
        tags->static_rqs = NULL;

        blk_mq_free_tags(tags, flags);
}

/*
 * Allocate a tag map for hardware queue @hctx_idx of @set.
 *
 * Creates the tags structure for @nr_tags tags (of which @reserved_tags are
 * reserved) together with its two request-pointer arrays, rqs[] and
 * static_rqs[], each @nr_tags entries long and zero-initialised.  Memory is
 * allocated on the NUMA node of the hardware queue in the default map, or
 * on set->numa_node when that lookup yields NUMA_NO_NODE.
 *
 * Returns the new tag map, or NULL if any allocation fails (in which case
 * everything allocated so far has been released).
 */
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        unsigned int hctx_idx,
                                        unsigned int nr_tags,
                                        unsigned int reserved_tags,
                                        unsigned int flags)
{
        struct blk_mq_tags *tags;
        int node;

        node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
        if (!tags)
                return NULL;

        tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs)
                goto err_free_tags;

        tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                                        GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                        node);
        if (!tags->static_rqs)
                goto err_free_rqs;

        return tags;

err_free_rqs:
        kfree(tags->rqs);
err_free_tags:
        blk_mq_free_tags(tags, flags);
        return NULL;
}

/*
 * Initialise one freshly allocated request: give the driver's optional
 * ->init_request() hook a chance to set up its part, then mark the request
 * MQ_RQ_IDLE.
 *
 * Returns 0 on success, or the hook's non-zero return value, in which case
 * the request state is not written.
 */
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                               unsigned int hctx_idx, int node)
{
        if (set->ops->init_request) {
                int ret = set->ops->init_request(set, rq, hctx_idx, node);

                if (ret)
                        return ret;
        }

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        return 0;
}

/*
 * Allocate and initialise @depth requests for @tags.
 *
 * Requests are carved out of page blocks of up to order 4, which are kept on
 * @tags->page_list with their order stored in page->private.  Each request
 * is recorded in tags->static_rqs[] and initialised through
 * blk_mq_init_request().
 *
 * Returns 0 on success.  On any failure everything allocated so far is
 * released through blk_mq_free_rqs() and -ENOMEM is returned.
 */
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx, unsigned int depth)
{
        unsigned int i, j, entries_per_page, max_order = 4;
        size_t rq_size, left;
        int node;

        node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        INIT_LIST_HEAD(&tags->page_list);

        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
        /* Bytes still needed for the requests not yet placed. */
        left = rq_size * depth;

        /* @i is advanced by the inner loop, once per initialised request. */
        for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
                void *p;

                /* Shrink the order while the next smaller one still fits @left. */
                while (this_order && left < order_to_size(this_order - 1))
                        this_order--;

                /*
                 * Try to allocate, falling back to smaller orders on failure.
                 * Give up once order 0 has failed or a block of the reduced
                 * order could no longer hold a single request.
                 */
                do {
                        page = alloc_pages_node(node,
                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
                        if (!this_order--)
                                break;
                        if (order_to_size(this_order) < rq_size)
                                break;
                } while (1);

                if (!page)
                        goto fail;

                /* Remember the order so the block can be freed later. */
                page->private = this_order;
                list_add_tail(&page->lru, &tags->page_list);

                p = page_address(page);
                /*
                 * Allow kmemleak to scan these pages as they contain pointers
                 * to additional allocations like via ops->init_request().
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
                        struct request *rq = p;

                        tags->static_rqs[i] = rq;
                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
                                /*
                                 * Unpublish the request whose init failed
                                 * before the cleanup path runs.
                                 */
                                tags->static_rqs[i] = NULL;
                                goto fail;
                        }

                        p += rq_size;
                        i++;
                }
        }
        return 0;

fail:
        blk_mq_free_rqs(set, tags, hctx_idx);
        return -ENOMEM;
}

/*
 * Cursor state handed to blk_mq_has_request() while iterating over tags to
 * find out whether a hardware queue still owns any request.
 */
struct rq_iter_data {
        /* Hardware queue whose requests are being looked for. */
        struct blk_mq_hw_ctx *hctx;
        /* Set to true once a request belonging to @hctx has been seen. */
        bool has_rq;
};

/*
 * Tag iteration callback: @data points to a struct rq_iter_data.
 *
 * When @rq belongs to the hardware queue being searched for, record the hit
 * in has_rq and return false; otherwise return true.  @reserved is unused.
 */
static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
{
        struct rq_iter_data *iter_data = data;

        if (rq->mq_hctx == iter_data->hctx) {
                iter_data->has_rq = true;
                return false;
        }

        return true;
}

/*
 * Report whether any request currently found by the tag iterator belongs to
 * @hctx.  The scheduler tags are walked when the hardware queue has them,
 * the regular tags otherwise.
 */
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
        struct rq_iter_data data = {
                .hctx        = hctx,
        };
        struct blk_mq_tags *tags = hctx->tags;

        if (hctx->sched_tags)
                tags = hctx->sched_tags;

        blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
        return data.has_rq;
}

/*
 * Return true when @cpu is the only CPU that is both in hctx->cpumask and
 * online: it must be the first such CPU, and no further one may follow it.
 */
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
                struct blk_mq_hw_ctx *hctx)
{
        return cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu &&
               cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) >= nr_cpu_ids;
}

/*
 * CPU hotplug callback for a CPU that is going offline.
 *
 * Acts only when @cpu is the last online CPU mapped to the hardware queue
 * that embeds @node: the queue is marked BLK_MQ_S_INACTIVE and the function
 * then waits, polling every 5ms, until the hardware queue has no requests
 * left.
 *
 * Returns 0 normally, or -EBUSY if the wait was abandoned because a wakeup
 * event is pending (the inactive flag is cleared again in that case).
 */
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);
        int ret = 0;

        /* Other online CPUs can still serve this hardware queue. */
        if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
            !blk_mq_last_cpu_in_hctx(cpu, hctx))
                return 0;

        /*
         * Prevent new request from being allocated on the current hctx.
         *
         * The smp_mb__after_atomic() Pairs with the implied barrier in
         * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
         * seen once we return from the tag allocator.
         */
        set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        smp_mb__after_atomic();

        /*
         * Try to grab a reference to the queue and wait for any outstanding
         * requests.  If we could not grab a reference the queue has been
         * frozen and there are no requests.
         */
        if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
                while (blk_mq_hctx_has_requests(hctx)) {
                        /*
                         * The wakeup capable IRQ handler of block device is
                         * not called during suspend. Skip the loop by checking
                         * pm_wakeup_pending to prevent the deadlock and improve
                         * suspend latency.
                         */
                        if (pm_wakeup_pending()) {
                                clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
                                ret = -EBUSY;
                                break;
                        }
                        msleep(5);
                }
                percpu_ref_put(&hctx->queue->q_usage_counter);
        }

        return ret;
}

/*
 * CPU hotplug callback for a CPU that came online: if @cpu is mapped to the
 * hardware queue that embeds @node, clear BLK_MQ_S_INACTIVE on that queue.
 * Always returns 0.
 */
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        if (cpumask_test_cpu(cpu, hctx->cpumask))
                clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        return 0;
}

/*
 * 'cpu' is going away. splice any existing rq_list entries from this
 * software queue to the hw queue dispatch list, and ensure that it
 * gets run.
 *
 * Nothing is done when @cpu is not mapped to the hardware queue that embeds
 * @node, or when the software queue list is empty.  Always returns 0.
 */
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
        enum hctx_type type;

        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
        if (!cpumask_test_cpu(cpu, hctx->cpumask))
                return 0;

        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;

        /* Steal the software queue's requests onto a private list. */
        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                list_splice_init(&ctx->rq_lists[type], &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);

        if (list_empty(&tmp))
                return 0;

        /* Append them to the dispatch list; the two locks are never nested. */
        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        spin_unlock(&hctx->lock);

        blk_mq_run_hw_queue(hctx, true);
        return 0;
}

/*
 * Unregister @hctx from the CPU hotplug state machines.  The "online"
 * instance is removed only for hardware queues without BLK_MQ_F_STACKING;
 * the "dead" instance is always removed.
 */
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                                    &hctx->cpuhp_online);
        cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                            &hctx->cpuhp_dead);
}

/*
 * Before freeing hw queue, clearing the flush request reference in
 * tags->rqs[] for avoiding potential UAF.
 *
 * Every one of the first @queue_depth entries of @tags->rqs[] that still
 * points at @flush_rq is reset to NULL.  A NULL @tags is tolerated.
 */
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
                unsigned int queue_depth, struct request *flush_rq)
{
        int i;
        unsigned long flags;

        /* The hw queue may not be mapped yet */
        if (!tags)
                return;

        WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);

        /* cmpxchg() only touches slots that currently hold @flush_rq. */
        for (i = 0; i < queue_depth; i++)
                cmpxchg(&tags->rqs[i], flush_rq, NULL);

        /*
         * Wait until all pending iteration is done.
         *
         * Request reference is cleared and it is guaranteed to be observed
         * after the ->lock is released.
         */
        spin_lock_irqsave(&tags->lock, flags);
        spin_unlock_irqrestore(&tags->lock, flags);
}

/*
 * Tear down one hardware queue and park it on q->unused_hctx_list.
 *
 * hctx->ctxs will be freed in queue's release handler.
 */
static void blk_mq_exit_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct request *flush_rq = hctx->fq->flush_rq;

        if (blk_mq_hw_queue_mapped(hctx))
                blk_mq_tag_idle(hctx);

        /* Drop stale references to the flush request before it is torn down. */
        blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
                        set->queue_depth, flush_rq);
        if (set->ops->exit_request)
                set->ops->exit_request(set, flush_rq, hctx_idx);

        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);

        blk_mq_remove_cpuhp(hctx);

        /* The hctx itself is not freed here; it is kept on the unused list. */
        spin_lock(&q->unused_hctx_lock);
        list_add(&hctx->hctx_list, &q->unused_hctx_list);
        spin_unlock(&q->unused_hctx_lock);
}

/*
 * Iterate @q's hardware queues with queue_for_each_hw_ctx(), stopping as
 * soon as the iteration index equals @nr_queue.  Each visited hctx has its
 * debugfs entries unregistered and is then torn down via blk_mq_exit_hctx().
 */
static void blk_mq_exit_hw_queues(struct request_queue *q,
                struct blk_mq_tag_set *set, int nr_queue)
{
        struct blk_mq_hw_ctx *cur;
        unsigned int idx;

        queue_for_each_hw_ctx(q, cur, idx) {
                if (idx == nr_queue)
                        break;

                blk_mq_debugfs_unregister_hctx(cur);
                blk_mq_exit_hctx(q, set, cur, idx);
        }
}

/*
 * Number of bytes to allocate for one struct blk_mq_hw_ctx of @tag_set.
 * Sets flagged BLK_MQ_F_BLOCKING get room for a struct srcu_struct on top
 * of the base structure.
 */
static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
{
        /*
         * Build-time check: the srcu member's offset, rounded up to the
         * structure's alignment, must equal the structure size.
         */
        BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
                           __alignof__(struct blk_mq_hw_ctx)) !=
                     sizeof(struct blk_mq_hw_ctx));

        if (tag_set->flags & BLK_MQ_F_BLOCKING)
                return sizeof(struct blk_mq_hw_ctx) +
                       sizeof(struct srcu_struct);

        return sizeof(struct blk_mq_hw_ctx);
}

static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
        hctx->queue_num = hctx_idx;

        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                &hctx->cpuhp_online);
        cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

        hctx->tags = set->tags[hctx_idx];

        if (set->ops->init_hctx &&
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto unregister_cpu_notifier;

        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
                                hctx->numa_node))
                goto exit_hctx;
        return 0;

 exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 unregister_cpu_notifier:
        blk_mq_remove_cpuhp(hctx);
        return -1;
}

/*
 * Allocate a hardware queue context for @q and initialise everything that
 * does not depend on the hardware queue index (this function takes no
 * index; blk_mq_init_hctx() assigns it later).
 *
 * @q:    request queue the hctx will belong to
 * @set:  tag set providing flags, cmd_size and the fallback NUMA node
 * @node: NUMA node to allocate on; NUMA_NO_NODE makes hctx->numa_node fall
 *        back to set->numa_node
 *
 * Every allocation uses GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY.
 *
 * Returns the new hctx, or NULL if any allocation failed.  On failure the
 * label chain at the bottom releases what was allocated, in reverse order.
 */
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
                int node)
{
        struct blk_mq_hw_ctx *hctx;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

        /* size includes room for a srcu_struct on BLK_MQ_F_BLOCKING sets */
        hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
        if (!hctx)
                goto fail_alloc_hctx;

        if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                goto free_hctx;

        atomic_set(&hctx->nr_active, 0);
        atomic_set(&hctx->elevator_queued, 0);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
        hctx->numa_node = node;

        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
        INIT_LIST_HEAD(&hctx->dispatch);
        hctx->queue = q;
        /* the shared flag is tracked per hctx, see queue_set_hctx_shared() */
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;

        INIT_LIST_HEAD(&hctx->hctx_list);

        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
                        gfp, node);
        if (!hctx->ctxs)
                goto free_cpumask;

        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
                                gfp, node))
                goto free_ctxs;
        hctx->nr_ctx = 0;

        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

        hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
        if (!hctx->fq)
                goto free_bitmap;

        /* NOTE(review): the return value of init_srcu_struct() is not checked */
        if (hctx->flags & BLK_MQ_F_BLOCKING)
                init_srcu_struct(hctx->srcu);
        blk_mq_hctx_kobj_init(hctx);

        return hctx;

        /* error unwinding, in reverse order of allocation */
 free_bitmap:
        sbitmap_free(&hctx->ctx_map);
 free_ctxs:
        kfree(hctx->ctxs);
 free_cpumask:
        free_cpumask_var(hctx->cpumask);
 free_hctx:
        kfree(hctx);
 fail_alloc_hctx:
        return NULL;
}

/*
 * Initialise the per-CPU software queue (blk_mq_ctx) of every possible CPU
 * for @q.  When @nr_hw_queues is greater than one, each hctx the CPU maps
 * to that has no NUMA node yet is given that CPU's node.
 */
static void blk_mq_init_cpu_queues(struct request_queue *q,
                                   unsigned int nr_hw_queues)
{
        struct blk_mq_tag_set *set = q->tag_set;
        unsigned int cpu, map;

        for_each_possible_cpu(cpu) {
                struct blk_mq_ctx *ctx = per_cpu_ptr(q->queue_ctx, cpu);
                struct blk_mq_hw_ctx *hctx;
                int type;

                ctx->cpu = cpu;
                ctx->queue = q;
                spin_lock_init(&ctx->lock);

                /* one request list per hctx type */
                for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++)
                        INIT_LIST_HEAD(&ctx->rq_lists[type]);

                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
                for (map = 0; map < set->nr_maps; map++) {
                        hctx = blk_mq_map_queue_type(q, map, cpu);
                        if (nr_hw_queues > 1 &&
                            hctx->numa_node == NUMA_NO_NODE)
                                hctx->numa_node = cpu_to_node(cpu);
                }
        }
}

/*
 * Allocate the tag map and the requests for hardware queue @hctx_idx of
 * @set and store the map in set->tags[hctx_idx].
 *
 * Returns true on success.  On failure set->tags[hctx_idx] is left NULL
 * and false is returned.
 */
static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
                                        int hctx_idx)
{
        unsigned int flags = set->flags;

        set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
                                        set->queue_depth, set->reserved_tags, flags);
        if (!set->tags[hctx_idx])
                return false;

        if (blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
                             set->queue_depth)) {
                /* request allocation failed: give the map back */
                blk_mq_free_rq_map(set->tags[hctx_idx], flags);
                set->tags[hctx_idx] = NULL;
                return false;
        }

        return true;
}

/*
 * Free the requests and the tag map of hardware queue @hctx_idx of @set
 * and clear set->tags[hctx_idx].  Does nothing when the tags array or that
 * slot is NULL.
 */
static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
                                         unsigned int hctx_idx)
{
        unsigned int flags = set->flags;

        if (!set->tags)
                return;
        if (!set->tags[hctx_idx])
                return;

        blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
        blk_mq_free_rq_map(set->tags[hctx_idx], flags);
        set->tags[hctx_idx] = NULL;
}

/*
 * (Re)build the mapping between @q's per-CPU software queues (ctx) and its
 * hardware queues (hctx) from the tag set's queue maps.
 *
 * Three passes:
 *  1. reset cpumask, nr_ctx and dispatch_from of every hctx;
 *  2. for each possible CPU and each map type, attach the CPU's ctx to the
 *     hctx returned by blk_mq_map_queue_type(), allocating tags for the
 *     mapped hw queue index on demand;
 *  3. for every hctx, either drop its tags (no ctx mapped; index 0 keeps
 *     its tag map) or refresh its tags pointer, ctx_map size and
 *     round-robin CPU state.
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
        unsigned int i, j, hctx_idx;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;

        /* pass 1: forget the previous mapping */
        queue_for_each_hw_ctx(q, hctx, i) {
                cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
                hctx->dispatch_from = NULL;
        }

        /*
         * Map software to hardware queues.
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {

                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
                        /* map type without queues: use the default type's hctx */
                        if (!set->map[j].nr_queues) {
                                ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
                            !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
                                /*
                                 * If tags initialization fail for some hctx,
                                 * that hctx won't be brought online.  In this
                                 * case, remap the current ctx to hctx[0] which
                                 * is guaranteed to always have tags allocated
                                 */
                                set->map[j].mq_map[i] = 0;
                        }

                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         * devices share queues across queue maps.
                         */
                        if (cpumask_test_cpu(i, hctx->cpumask))
                                continue;

                        /* append this ctx to the hctx and record its slot */
                        cpumask_set_cpu(i, hctx->cpumask);
                        hctx->type = j;
                        ctx->index_hw[hctx->type] = hctx->nr_ctx;
                        hctx->ctxs[hctx->nr_ctx++] = ctx;

                        /*
                         * If the nr_ctx type overflows, we have exceeded the
                         * amount of sw queues we can support.
                         */
                        BUG_ON(!hctx->nr_ctx);
                }

                /* remaining hctx types fall back to the default type's hctx */
                for (; j < HCTX_MAX_TYPES; j++)
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
        }

        /* pass 3: finalise every hctx according to its ctx count */
        queue_for_each_hw_ctx(q, hctx, i) {
                /*
                 * If no software queues are mapped to this hardware queue,
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
                        /* Never unmap queue 0.  We need it as a
                         * fallback in case of a new remap fails
                         * allocation
                         */
                        if (i && set->tags[i])
                                blk_mq_free_map_and_requests(set, i);

                        hctx->tags = NULL;
                        continue;
                }

                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);

                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
                 * over all possibly mapped software queues.
                 */
                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

                /*
                 * Initialize batch roundrobin counts
                 */
                hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
}

/*
 * Set (@shared true) or clear (@shared false) BLK_MQ_F_TAG_QUEUE_SHARED in
 * the flags of every hardware queue of @q.
 *
 * Caller needs to ensure that we're either frozen/quiesced, or that
 * the queue isn't live yet.
 */
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
        struct blk_mq_hw_ctx *hctx;
        int idx;

        if (shared) {
                queue_for_each_hw_ctx(q, hctx, idx)
                        hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
                return;
        }

        queue_for_each_hw_ctx(q, hctx, idx)
                hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
}

/*
 * Apply the @shared state to the hardware queues of every request queue on
 * set->tag_list.  Each queue is frozen while its hctx flags are rewritten.
 * The caller must hold set->tag_list_lock.
 */
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
                                         bool shared)
{
        struct request_queue *queue;

        lockdep_assert_held(&set->tag_list_lock);

        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                blk_mq_freeze_queue(queue);
                queue_set_hctx_shared(queue, shared);
                blk_mq_unfreeze_queue(queue);
        }
}

/*
 * Unlink @q from its tag set's queue list.  If exactly one queue is left
 * on the list afterwards, the set and that remaining queue are switched
 * back to the unshared state.
 */
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
        struct blk_mq_tag_set *tag_set = q->tag_set;

        mutex_lock(&tag_set->tag_list_lock);

        list_del(&q->tag_set_list);
        if (list_is_singular(&tag_set->tag_list)) {
                /* just transitioned to unshared */
                tag_set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
                /* update existing queue */
                blk_mq_update_tag_set_shared(tag_set, false);
        }

        mutex_unlock(&tag_set->tag_list_lock);

        /* leave q's list node re-initialised (empty) */
        INIT_LIST_HEAD(&q->tag_set_list);
}

/*
 * Append @q to @set's queue list.  When the list already holds a queue and
 * the set is not yet marked shared, the set and its existing queues are
 * switched to shared first; @q's hardware queues are marked shared whenever
 * the set is.
 */
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
{
        mutex_lock(&set->tag_list_lock);

        /*
         * Check to see if we're transitioning to shared (from 1 to 2 queues).
         */
        if (!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
            !list_empty(&set->tag_list)) {
                set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
                /* update existing queue */
                blk_mq_update_tag_set_shared(set, true);
        }

        if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) {
                queue_set_hctx_shared(q, true);
        }

        list_add_tail(&q->tag_set_list, &set->tag_list);

        mutex_unlock(&set->tag_list_lock);
}

/*
 * Allocate the blk_mq_ctxs container and the per-CPU software queues for
 * @q, point every per-CPU ctx back at the container, and publish both via
 * q->mq_kobj and q->queue_ctx.
 *
 * Returns 0 on success or -ENOMEM.
 * All allocations will be freed in release handler of q->mq_kobj.
 */
static int blk_mq_alloc_ctxs(struct request_queue *q)
{
        struct blk_mq_ctxs *ctxs;
        int cpu;

        ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
        if (!ctxs)
                return -ENOMEM;

        ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
        if (!ctxs->queue_ctx) {
                kfree(ctxs);
                return -ENOMEM;
        }

        for_each_possible_cpu(cpu) {
                struct blk_mq_ctx *percpu_ctx;

                percpu_ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
                percpu_ctx->ctxs = ctxs;
        }

        q->mq_kobj = &ctxs->kobj;
        q->queue_ctx = ctxs->queue_ctx;

        return 0;
}

/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't have been introduced,
 * but we can't group ctx/kctx kobj without it.
 *
 * Drops the kobject reference of every hctx parked on
 * q->unused_hctx_list, frees the queue_hw_ctx array and deinitialises the
 * mq sysfs objects.
 */
void blk_mq_release(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx, *tmp;
        int idx;

        /* warn about any hctx that is not linked on a list */
        queue_for_each_hw_ctx(q, hctx, idx) {
                WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
        }

        /* all hctx are in .unused_hctx_list now */
        list_for_each_entry_safe(hctx, tmp, &q->unused_hctx_list, hctx_list) {
                list_del_init(&hctx->hctx_list);
                kobject_put(&hctx->kobj);
        }

        kfree(q->queue_hw_ctx);

        /*
         * release .mq_kobj and sw queue's kobject now because
         * both share lifetime with request queue.
         */
        blk_mq_sysfs_deinit(q);
}

/*
 * Allocate a request queue on @set's NUMA node, store @queuedata in it and
 * initialise it as a blk-mq queue for @set.
 *
 * Returns the queue, ERR_PTR(-ENOMEM) if the queue could not be allocated,
 * or the error pointer from blk_mq_init_allocated_queue() (after cleaning
 * up the allocated queue).
 */
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
                void *queuedata)
{
        struct request_queue *raw_q, *mq_q;

        raw_q = blk_alloc_queue(set->numa_node);
        if (!raw_q)
                return ERR_PTR(-ENOMEM);

        raw_q->queuedata = queuedata;

        /*
         * Initialize the queue without an elevator. device_add_disk() will do
         * the initialization.
         */
        mq_q = blk_mq_init_allocated_queue(set, raw_q, false);
        if (IS_ERR(mq_q)) {
                blk_cleanup_queue(raw_q);
        }

        return mq_q;
}
EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);

/*
 * Convenience wrapper: same as blk_mq_init_queue_data() with NULL
 * queuedata.  Returns that function's result unchanged.
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
        return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);

/*
 * Helper for setting up a queue with mq ops, given queue depth, and
 * the passed in mq ops flags.
 *
 * @set is zeroed and filled in for one hardware queue and one map before
 * the tag set is allocated.  Returns the new queue or an ERR_PTR(); if
 * queue creation fails the tag set is freed again.
 */
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
                                           const struct blk_mq_ops *ops,
                                           unsigned int queue_depth,
                                           unsigned int set_flags)
{
        struct request_queue *q;
        int err;

        memset(set, 0, sizeof(*set));
        set->ops = ops;
        set->flags = set_flags;
        set->queue_depth = queue_depth;
        set->numa_node = NUMA_NO_NODE;
        set->nr_hw_queues = 1;
        set->nr_maps = 1;

        err = blk_mq_alloc_tag_set(set);
        if (err)
                return ERR_PTR(err);

        q = blk_mq_init_queue(set);
        if (IS_ERR(q))
                blk_mq_free_tag_set(set);

        return q;
}
EXPORT_SYMBOL(blk_mq_init_sq_queue);

/*
 * Obtain an hctx for hardware queue @hctx_idx on NUMA node @node: reuse
 * the first entry of q->unused_hctx_list whose numa_node matches, else
 * allocate a fresh one; then initialise it with blk_mq_init_hctx().
 *
 * Returns the hctx, or NULL if allocation or initialisation failed (in the
 * latter case the hctx's kobject reference is dropped).
 */
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
                struct blk_mq_tag_set *set, struct request_queue *q,
                int hctx_idx, int node)
{
        struct blk_mq_hw_ctx *hctx = NULL, *cand;

        /* reuse dead hctx first */
        spin_lock(&q->unused_hctx_lock);
        list_for_each_entry(cand, &q->unused_hctx_list, hctx_list) {
                if (cand->numa_node != node)
                        continue;

                hctx = cand;
                list_del_init(&hctx->hctx_list);
                break;
        }
        spin_unlock(&q->unused_hctx_lock);

        if (!hctx) {
                hctx = blk_mq_alloc_hctx(q, set, node);
                if (!hctx)
                        return NULL;
        }

        if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
                kobject_put(&hctx->kobj);
                return NULL;
        }

        return hctx;
}

/*
 * Bring @q's hardware queue contexts in line with @set->nr_hw_queues.
 *
 * If the set now has more hardware queues than @q, the q->queue_hw_ctx
 * pointer array is reallocated first (on allocation failure the function
 * returns without changing anything).  Then, under q->sysfs_lock, every
 * index below set->nr_hw_queues gets an hctx on the NUMA node reported by
 * blk_mq_hw_queue_to_node(); an existing hctx on the right node is kept.
 *
 * On complete success q->nr_hw_queues is updated and surplus hctxs (when
 * shrinking) are torn down.  If a new hctx could not be obtained for a
 * previously empty slot, the hctxs added by this call are torn down and
 * q->nr_hw_queues keeps its previous value; callers detect failure by
 * checking q->nr_hw_queues.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                                struct request_queue *q)
{
        int i, j, end;
        struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;

        if (q->nr_hw_queues < set->nr_hw_queues) {
                struct blk_mq_hw_ctx **new_hctxs;

                new_hctxs = kcalloc_node(set->nr_hw_queues,
                                       sizeof(*new_hctxs), GFP_KERNEL,
                                       set->numa_node);
                if (!new_hctxs)
                        return;
                if (hctxs)
                        memcpy(new_hctxs, hctxs, q->nr_hw_queues *
                               sizeof(*hctxs));
                q->queue_hw_ctx = new_hctxs;
                kfree(hctxs);
                hctxs = new_hctxs;
        }

        /* protect against switching io scheduler  */
        mutex_lock(&q->sysfs_lock);
        for (i = 0; i < set->nr_hw_queues; i++) {
                int node;
                struct blk_mq_hw_ctx *hctx;

                node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
                /*
                 * If the hw queue has been mapped to another numa node,
                 * we need to realloc the hctx. If allocation fails, fallback
                 * to use the previous one.
                 */
                if (hctxs[i] && (hctxs[i]->numa_node == node))
                        continue;

                hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
                if (hctx) {
                        if (hctxs[i])
                                blk_mq_exit_hctx(q, set, hctxs[i], i);
                        hctxs[i] = hctx;
                } else {
                        /*
                         * Keep the message on a single line: a
                         * backslash-newline inside the literal would embed
                         * the next line's indentation in the log output.
                         */
                        if (hctxs[i])
                                pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
                                        node, hctxs[i]->numa_node);
                        else
                                break;
                }
        }
        /*
         * Increasing nr_hw_queues fails. Free the newly allocated
         * hctxs and keep the previous q->nr_hw_queues.
         */
        if (i != set->nr_hw_queues) {
                j = q->nr_hw_queues;
                end = i;
        } else {
                /* success: [j, end) covers surplus hctxs when shrinking */
                j = i;
                end = q->nr_hw_queues;
                q->nr_hw_queues = set->nr_hw_queues;
        }

        for (; j < end; j++) {
                struct blk_mq_hw_ctx *hctx = hctxs[j];

                if (hctx) {
                        if (hctx->tags)
                                blk_mq_free_map_and_requests(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        hctxs[j] = NULL;
                }
        }
        mutex_unlock(&q->sysfs_lock);
}

/*
 * Turn the already allocated request queue @q into a blk-mq queue backed
 * by tag set @set.
 *
 * @set:           tag set supplying ops, depth, timeout and queue maps
 * @q:             queue to initialise
 * @elevator_init: when true, elevator_init_mq() is called on the queue
 *                 before returning
 *
 * Returns @q on success.  On failure the resources set up here are
 * released, q->mq_ops is reset to NULL and ERR_PTR(-ENOMEM) is returned;
 * @q itself is not freed by this function.
 */
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                                                  struct request_queue *q,
                                                  bool elevator_init)
{
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;

        q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
                                             blk_mq_poll_stats_bkt,
                                             BLK_MQ_POLL_STATS_BKTS, q);
        if (!q->poll_cb)
                goto err_exit;

        if (blk_mq_alloc_ctxs(q))
                goto err_poll;

        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);

        INIT_LIST_HEAD(&q->unused_hctx_list);
        spin_lock_init(&q->unused_hctx_lock);

        /*
         * blk_mq_realloc_hw_ctxs() returns void; it only updates
         * q->nr_hw_queues on success, so a zero count here means failure.
         */
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
                goto err_hctxs;

        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        /* fall back to a 30 * HZ timeout when the set does not specify one */
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

        q->tag_set = set;

        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
        /* enable polling only if the set has a populated poll map */
        if (set->nr_maps > HCTX_TYPE_POLL &&
            set->map[HCTX_TYPE_POLL].nr_queues)
                blk_queue_flag_set(QUEUE_FLAG_POLL, q);

        q->sg_reserved_size = INT_MAX;

        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);

        q->nr_requests = set->queue_depth;

        /*
         * Default to classic polling
         */
        q->poll_nsec = BLK_MQ_POLL_CLASSIC;

        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
        blk_mq_add_queue_tag_set(set, q);
        blk_mq_map_swqueue(q);

        if (elevator_init)
                elevator_init_mq(q);

        return q;

        /* error unwinding, in reverse order of setup */
err_hctxs:
        kfree(q->queue_hw_ctx);
        q->nr_hw_queues = 0;
        blk_mq_sysfs_deinit(q);
err_poll:
        blk_stat_free_callback(q->poll_cb);
        q->poll_cb = NULL;
err_exit:
        q->mq_ops = NULL;
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);

/*
 * Tear down the hardware queues of @q and detach @q from its tag set.
 *
 * The order of the two calls matters: per the comments below, the first
 * step inspects BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags and the second
 * may clear it.
 *
 * tags can _not_ be used after returning from blk_mq_exit_queue
 */
void blk_mq_exit_queue(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
        blk_mq_del_queue_tag_set(q);
}

/*
 * Allocate tag maps and requests for all set->nr_hw_queues hardware queues
 * at the current set->queue_depth.
 *
 * Returns 0 on success.  If any queue fails, the maps allocated so far are
 * freed again and -ENOMEM is returned.
 */
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        int i;

        for (i = 0; i < set->nr_hw_queues; i++) {
                if (!__blk_mq_alloc_map_and_request(set, i)) {
                        /* roll back queues [0, i) in reverse order */
                        for (i--; i >= 0; i--)
                                blk_mq_free_map_and_requests(set, i);
                        return -ENOMEM;
                }
                cond_resched();
        }

        return 0;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 *
 * The depth is halved after every failed attempt until allocation
 * succeeds or the depth drops below set->reserved_tags + BLK_MQ_TAG_MIN.
 * Returns 0 on success, -ENOMEM otherwise.
 */
static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
{
        unsigned int requested = set->queue_depth;
        int err;

        for (;;) {
                err = __blk_mq_alloc_rq_maps(set);
                if (!err)
                        break;

                /* retry with half the depth, unless that is too shallow */
                set->queue_depth >>= 1;
                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
                        err = -ENOMEM;
                        break;
                }
                if (!set->queue_depth)
                        break;
        }

        if (err || !set->queue_depth) {
                pr_err("blk-mq: failed to allocate request map\n");
                return -ENOMEM;
        }

        if (set->queue_depth != requested)
                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                                                requested, set->queue_depth);

        return 0;
}

/*
 * Recompute the CPU to hardware-queue maps of @set.  Uses the driver's
 * ->map_queues() when one is provided and we are not in a kdump kernel,
 * otherwise blk_mq_map_queues() on the default map.  Returns the result of
 * whichever mapping function was called.
 */
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
        int i;

        /*
         * blk_mq_map_queues() and multiple .map_queues() implementations
         * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
         * number of hardware queues.
         */
        if (set->nr_maps == 1)
                set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;

        if (!set->ops->map_queues || is_kdump_kernel()) {
                /* generic mapping handles a single map only */
                BUG_ON(set->nr_maps > 1);
                return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
        }

        /*
         * transport .map_queues is usually done in the following
         * way:
         *
         * for (queue = 0; queue < set->nr_hw_queues; queue++) {
         *         mask = get_cpu_mask(queue)
         *         for_each_cpu(cpu, mask)
         *                 set->map[x].mq_map[cpu] = queue;
         * }
         *
         * When we need to remap, the table has to be cleared for
         * killing stale mapping since one CPU may not be mapped
         * to any hw queue.
         */
        for (i = 0; i < set->nr_maps; i++)
                blk_mq_clear_mq_map(&set->map[i]);

        return set->ops->map_queues(set);
}

/*
 * Grow set->tags[] from @cur_nr_hw_queues to @new_nr_hw_queues entries,
 * copying the existing pointers and recording the new count in
 * set->nr_hw_queues.  Nothing is done (and 0 is returned) when the new
 * count is not larger than the current one.
 *
 * Returns 0 on success or -ENOMEM if the new array cannot be allocated.
 */
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
                                  int cur_nr_hw_queues, int new_nr_hw_queues)
{
        struct blk_mq_tags **grown;

        if (new_nr_hw_queues <= cur_nr_hw_queues)
                return 0;

        grown = kcalloc_node(new_nr_hw_queues, sizeof(*grown),
                             GFP_KERNEL, set->numa_node);
        if (!grown)
                return -ENOMEM;

        /* carry over the existing tag maps, then swap the arrays */
        if (set->tags)
                memcpy(grown, set->tags,
                       cur_nr_hw_queues * sizeof(*set->tags));
        kfree(set->tags);

        set->tags = grown;
        set->nr_hw_queues = new_nr_hw_queues;

        return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 *
 * Besides the depth, set->nr_maps and set->nr_hw_queues may also be
 * adjusted (see the kdump and nr_cpu_ids handling below).
 *
 * Returns 0 on success, -EINVAL for an invalid set description, -ENOMEM
 * on allocation failure, or the error returned by
 * blk_mq_update_queue_map().  On failure the mq_map arrays and set->tags
 * allocated here are freed and their pointers reset to NULL.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
        int i, ret;

        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

        /* basic sanity checks on the caller-provided description */
        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;

        if (!set->ops->queue_rq)
                return -EINVAL;

        /* get_budget and put_budget must be both set or both unset */
        if (!set->ops->get_budget ^ !set->ops->put_budget)
                return -EINVAL;

        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
                pr_info("blk-mq: reduced tag depth to %u\n",
                        BLK_MQ_MAX_DEPTH);
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }

        /* an unset nr_maps defaults to a single map */
        if (!set->nr_maps)
                set->nr_maps = 1;
        else if (set->nr_maps > HCTX_MAX_TYPES)
                return -EINVAL;

        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to 1 queue and
         * 64 tags to prevent using too much memory.
         */
        if (is_kdump_kernel()) {
                set->nr_hw_queues = 1;
                set->nr_maps = 1;
                set->queue_depth = min(64U, set->queue_depth);
        }
        /*
         * There is no use for more h/w queues than cpus if we just have
         * a single map
         */
        if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;

        /* allocate the set->tags[] pointer array */
        if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
                return -ENOMEM;

        /* one CPU -> hw queue table per map, nr_cpu_ids entries each */
        ret = -ENOMEM;
        for (i = 0; i < set->nr_maps; i++) {
                set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
                                                  sizeof(set->map[i].mq_map[0]),
                                                  GFP_KERNEL, set->numa_node);
                if (!set->map[i].mq_map)
                        goto out_free_mq_map;
                set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
        }

        ret = blk_mq_update_queue_map(set);
        if (ret)
                goto out_free_mq_map;

        /* may lower set->queue_depth if memory is tight */
        ret = blk_mq_alloc_map_and_requests(set);
        if (ret)
                goto out_free_mq_map;

        if (blk_mq_is_sbitmap_shared(set->flags)) {
                atomic_set(&set->active_queues_shared_sbitmap, 0);

                if (blk_mq_init_shared_sbitmap(set, set->flags)) {
                        ret = -ENOMEM;
                        goto out_free_mq_rq_maps;
                }
        }

        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);

        return 0;

        /* error unwinding */
out_free_mq_rq_maps:
        for (i = 0; i < set->nr_hw_queues; i++)
                blk_mq_free_map_and_requests(set, i);
out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
                set->map[i].mq_map = NULL;
        }
        kfree(set->tags);
        set->tags = NULL;
        return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

/*
 * Release everything held by @set: the per-hw-queue tag maps and requests,
 * the shared sbitmap (when the set's flags select one), every map's mq_map
 * table and finally the tags pointer array.  Freed pointers are reset to
 * NULL.
 */
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
        int hctx_idx, map_idx;

        for (hctx_idx = 0; hctx_idx < set->nr_hw_queues; hctx_idx++) {
                blk_mq_free_map_and_requests(set, hctx_idx);
        }

        if (blk_mq_is_sbitmap_shared(set->flags)) {
                blk_mq_exit_shared_sbitmap(set);
        }

        for (map_idx = 0; map_idx < set->nr_maps; map_idx++) {
                kfree(set->map[map_idx].mq_map);
                set->map[map_idx].mq_map = NULL;
        }

        kfree(set->tags);
        set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

/*
 * Change the request depth of @q to @nr.
 *
 * The queue is frozen and quiesced while each hardware queue that has tags
 * gets its tag depth updated.  q->nr_requests is only updated when every
 * hardware queue succeeded.
 *
 * Returns 0 on success (or if the depth is already @nr), -EINVAL when the
 * queue has no tag set, or the first error from blk_mq_tag_update_depth().
 */
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int idx, err = 0;

        if (!set)
                return -EINVAL;

        if (nr == q->nr_requests)
                return 0;

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!hctx->tags)
                        continue;

                /*
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
                if (hctx->sched_tags) {
                        err = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
                                                      nr, true);
                } else {
                        err = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
                                                      false);
                        if (!err && blk_mq_is_sbitmap_shared(set->flags))
                                blk_mq_tag_resize_shared_sbitmap(set, nr);
                }

                if (err)
                        break;

                /* let the elevator react to the new depth, if it wants to */
                if (q->elevator && q->elevator->type->ops.depth_updated)
                        q->elevator->type->ops.depth_updated(hctx);
        }

        if (!err)
                q->nr_requests = nr;

        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);

        return err;
}

/*
 * request_queue and elevator_type pair.
 * It is just used by __blk_mq_update_nr_hw_queues to cache
 * the elevator_type associated with a request_queue.
 */
struct blk_mq_qe_pair {
        struct list_head node;          /* link in the caller's pair list */
        struct request_queue *q;        /* queue whose elevator was cached */
        struct elevator_type *type;     /* elevator type to switch back to */
};

/*
 * Remember the elevator_type of @q on the list @head, then switch @q to
 * the 'none' io scheduler.
 *
 * Returns true when @q has no elevator (nothing is recorded) or after the
 * switch was issued; returns false when the bookkeeping entry could not
 * be allocated, in which case @q is left untouched.
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
                struct request_queue *q)
{
        struct blk_mq_qe_pair *pair;

        if (!q->elevator)
                return true;

        pair = kmalloc(sizeof(*pair), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
        if (pair == NULL)
                return false;

        pair->q = q;
        pair->type = q->elevator->type;
        INIT_LIST_HEAD(&pair->node);
        list_add(&pair->node, head);

        mutex_lock(&q->sysfs_lock);
        /*
         * elevator_switch_mq() releases the previous elevator_queue through
         * elevator_release(), which also drops the reference on the io
         * scheduler module obtained by elevator_get().  Take our own
         * reference on the module first so that it cannot be removed
         * before we switch back to it.
         */
        __module_get(pair->type->elevator_owner);
        elevator_switch_mq(q, NULL);
        mutex_unlock(&q->sysfs_lock);

        return true;
}

static void blk_mq_elv_switch_back(struct list_head *head,
                struct request_queue *q)
{
        struct blk_mq_qe_pair *qe;
        struct elevator_type *t = NULL;

        list_for_each_entry(qe, head, node)
                if (qe->q == q) {
                        t = qe->type;
                        break;
                }

        if (!t)
                return;

        list_del(&qe->node);
        kfree(qe);

        mutex_lock(&q->sysfs_lock);
        elevator_switch_mq(q, t);
        mutex_unlock(&q->sysfs_lock);
}

/*
 * Resize @set to @nr_hw_queues hardware queues and rebuild the queue
 * mappings of every request queue on set->tag_list.
 *
 * The caller must hold set->tag_list_lock (checked with lockdep).  The
 * function returns without doing anything when the (possibly clamped)
 * count is below 1, or when the set has a single map and the count is
 * unchanged.  There is no return value: a failure is only visible as the
 * pr_warn() below or as an unchanged set->nr_hw_queues.
 */
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                                                        int nr_hw_queues)
{
        struct request_queue *q;
        LIST_HEAD(head);        /* blk_mq_qe_pair entries of switched-off elevators */
        int prev_nr_hw_queues;

        lockdep_assert_held(&set->tag_list_lock);

        /* With a single map, never use more hardware queues than nr_cpu_ids. */
        if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1)
                return;
        if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
                return;

        /* Every queue sharing the set stays frozen until the end. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_freeze_queue(q);
        /*
         * Switch IO scheduler to 'none', cleaning up the data associated
         * with the previous scheduler. We will switch back once we are done
         * updating the new sw to hw queue mappings.
         *
         * If recording one elevator fails, skip the resize entirely and
         * only switch back the queues handled so far.
         */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                if (!blk_mq_elv_switch_none(&head, q))
                        goto switch_back;

        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_debugfs_unregister_hctxs(q);
                blk_mq_sysfs_unregister(q);
        }

        /* On failure set->nr_hw_queues is left as it was and nothing is remapped. */
        prev_nr_hw_queues = set->nr_hw_queues;
        if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
            0)
                goto reregister;

        set->nr_hw_queues = nr_hw_queues;
fallback:
        blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
                /*
                 * A queue that did not end up with the set's hardware queue
                 * count makes us restore the previous count, remap the
                 * default map and restart this loop from the first queue.
                 */
                if (q->nr_hw_queues != set->nr_hw_queues) {
                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        set->nr_hw_queues = prev_nr_hw_queues;
                        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
        }

reregister:
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_sysfs_register(q);
                blk_mq_debugfs_register_hctxs(q);
        }

switch_back:
        /* Queues without a recorded elevator are ignored by the helper. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_elv_switch_back(&head, q);

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_unfreeze_queue(q);
}

/*
 * Locked entry point: runs __blk_mq_update_nr_hw_queues() for @set with
 * set->tag_list_lock held for the whole call.
 */
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
        mutex_lock(&set->tag_list_lock);
        __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
        mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

/*
 * Turn on polling statistics for @q.
 *
 * Returns true when QUEUE_FLAG_POLL_STATS was already set.  Otherwise the
 * flag is set, q->poll_cb is added as a stat callback, and false is
 * returned.
 */
static bool blk_poll_stats_enable(struct request_queue *q)
{
        /* Plain read first; the test-and-set only runs when the bit looks clear. */
        if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
                return true;
        if (blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
                return true;

        blk_stat_add_callback(q, q->poll_cb);
        return false;
}

/*
 * Arm the polling statistics callback of @q for 100 msecs, but only when
 * polling stats are enabled and the callback is not already active.
 */
static void blk_mq_poll_stats_start(struct request_queue *q)
{
        if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) &&
            !blk_stat_is_active(q->poll_cb))
                blk_stat_activate_msecs(q->poll_cb, 100);
}

/*
 * Stat callback: copy every bucket of @cb that collected at least one
 * sample into the poll_stat array of the queue stored in cb->data.
 * Buckets without samples keep their previous contents.
 */
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
        struct request_queue *q = cb->data;
        int idx;

        for (idx = 0; idx < BLK_MQ_POLL_STATS_BKTS; idx++) {
                if (!cb->stat[idx].nr_samples)
                        continue;
                q->poll_stat[idx] = cb->stat[idx];
        }
}

/*
 * Estimate how long to sleep before polling for @rq.
 *
 * Returns 0 when statistics were not enabled yet (they are switched on
 * for future callers), when @rq maps to no statistics bucket, or when the
 * bucket holds no samples.  Otherwise returns half of the bucket's mean,
 * rounded up.
 */
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
                                       struct request *rq)
{
        int bucket;

        /* Stats were off: enable them for later, but do not sleep now. */
        if (!blk_poll_stats_enable(q))
                return 0;

        /*
         * Half of the mean service time of this kind of request is an
         * optimistic guess; it could be made smarter, e.g. by sleeping
         * closer to the mean when completion latencies are tight, which
         * matters most on devices with latencies above ~10 usec.  Using
         * the bucket that matches the request already improves the
         * estimate.
         */
        bucket = blk_mq_poll_stats_bkt(rq);
        if (bucket < 0)
                return 0;

        if (!q->poll_stat[bucket].nr_samples)
                return 0;

        return (q->poll_stat[bucket].mean + 1) / 2;
}

/*
 * Sleep for a while before @rq is polled (hybrid polling).
 *
 * Returns false, without sleeping, when @rq already carries
 * RQF_MQ_POLL_SLEPT or when the computed sleep time is zero.  Otherwise
 * marks @rq with RQF_MQ_POLL_SLEPT, runs the timed sleep loop below and
 * returns true.
 */
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
                                     struct request *rq)
{
        struct hrtimer_sleeper hs;
        enum hrtimer_mode mode;
        unsigned int nsecs;
        ktime_t kt;

        /* Each request gets at most one pre-poll sleep. */
        if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
                return false;

        /*
         * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
         *
         *  0:        use half of prev avg
         * >0:        use this specific value
         */
        if (q->poll_nsec > 0)
                nsecs = q->poll_nsec;
        else
                nsecs = blk_mq_poll_nsecs(q, rq);

        if (!nsecs)
                return false;

        rq->rq_flags |= RQF_MQ_POLL_SLEPT;

        /*
         * nsecs is the sleep target chosen above: either the configured
         * q->poll_nsec or the estimate from blk_mq_poll_nsecs().
         */
        kt = nsecs;

        mode = HRTIMER_MODE_REL;
        hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
        hrtimer_set_expires(&hs.timer, kt);

        do {
                /* No point in sleeping once the request has completed. */
                if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
                        break;
                set_current_state(TASK_UNINTERRUPTIBLE);
                hrtimer_sleeper_start_expires(&hs, mode);
                /* NOTE(review): hs.task is presumably cleared on timer expiry - confirm. */
                if (hs.task)
                        io_schedule();
                hrtimer_cancel(&hs.timer);
                /* Any further iteration restarts the timer in absolute mode. */
                mode = HRTIMER_MODE_ABS;
        } while (hs.task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);
        destroy_hrtimer_on_stack(&hs.timer);
        return true;
}

/*
 * Resolve @cookie to its request on @hctx and give it a hybrid-poll sleep.
 *
 * Returns false when the queue uses classic polling
 * (q->poll_nsec == BLK_MQ_POLL_CLASSIC) or when an internal (scheduler)
 * tag no longer maps to a request; otherwise returns the result of
 * blk_mq_poll_hybrid_sleep().
 */
static bool blk_mq_poll_hybrid(struct request_queue *q,
                               struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
{
        struct request *rq;

        if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
                return false;

        if (blk_qc_t_is_internal(cookie)) {
                rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
                /*
                 * With a scheduler the sched tag is cleared when the
                 * request completes, so a completed request shows up as
                 * NULL here.  The request itself remains valid as usual,
                 * which makes the NULL check sufficient.
                 */
                if (rq == NULL)
                        return false;
        } else {
                rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
        }

        return blk_mq_poll_hybrid_sleep(q, rq);
}

/**
 * blk_poll - poll for IO completions
 * @q:  the queue
 * @cookie: cookie passed back at IO submission time
 * @spin: whether to spin for completions
 *
 * Description:
 *    Poll for completions on the passed in queue. If @spin is true, then
 *    blk_poll will continue looping until at least one completion is found,
 *    unless the task is otherwise marked running (or we need to reschedule).
 *
 * Return:
 *    0 when @cookie is invalid, the queue does not have QUEUE_FLAG_POLL
 *    set, or the loop ended without finding a completion; the positive
 *    value returned by the driver's ->poll() when it found completions;
 *    1 when a hybrid sleep was taken or the task state became TASK_RUNNING,
 *    in which case the caller has to check whether its IO completed.
 */
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
{
        struct blk_mq_hw_ctx *hctx;
        long state;

        if (!blk_qc_t_valid(cookie) ||
            !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return 0;

        /* Flush the task's plug list, if it has one, before polling. */
        if (current->plug)
                blk_flush_plug_list(current->plug, false);

        hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];

        /*
         * If we sleep, have the caller restart the poll loop to reset
         * the state. Like for the other success return cases, the
         * caller is responsible for checking if the IO completed. If
         * the IO isn't complete, we'll get called again and will go
         * straight to the busy poll loop.
         */
        if (blk_mq_poll_hybrid(q, hctx, cookie))
                return 1;

        hctx->poll_considered++;

        /* Task state on entry; used for the signal check inside the loop. */
        state = current->state;
        do {
                int ret;

                hctx->poll_invoked++;

                ret = q->mq_ops->poll(hctx);
                if (ret > 0) {
                        hctx->poll_success++;
                        __set_current_state(TASK_RUNNING);
                        return ret;
                }

                if (signal_pending_state(state, current))
                        __set_current_state(TASK_RUNNING);

                if (current->state == TASK_RUNNING)
                        return 1;
                /* A negative ->poll() result or a non-spinning caller ends the loop. */
                if (ret < 0 || !spin)
                        break;
                cpu_relax();
        } while (!need_resched());

        __set_current_state(TASK_RUNNING);
        return 0;
}
EXPORT_SYMBOL_GPL(blk_poll);

/* Return the cpu field of the software queue context (rq->mq_ctx) of @rq. */
unsigned int blk_mq_rq_cpu(struct request *rq)
{
        return rq->mq_ctx->cpu;
}
EXPORT_SYMBOL(blk_mq_rq_cpu);

/*
 * Boot-time setup (registered with subsys_initcall): initialise the
 * per-CPU blk_cpu_done lists, install the BLOCK_SOFTIRQ handler and
 * register the CPU hotplug callbacks.  Always returns 0; the return
 * values of the cpuhp_setup_state_*() calls are not checked.
 */
static int __init blk_mq_init(void)
{
        int i;

        for_each_possible_cpu(i)
                INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);

        /* NULL startup callback, blk_softirq_cpu_dead() as teardown callback. */
        cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
                                  "block/softirq:dead", NULL,
                                  blk_softirq_cpu_dead);
        /* Multi-instance states: one for dead CPUs, one for online/offline. */
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
                                blk_mq_hctx_notify_online,
                                blk_mq_hctx_notify_offline);
        return 0;
}
subsys_initcall(blk_mq_init);






















































    4 





    3 
























































    1 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H

#include <linux/list.h>
#include <linux/bit_spinlock.h>

/*
 * Special version of lists, where head of the list has a lock in the lowest
 * bit. This is useful for scalable hash tables without increasing memory
 * footprint overhead.
 *
 * For modification operations, the 0 bit of hlist_bl_head->first
 * pointer must be set.
 *
 * With some small modifications, this can easily be adapted to store several
 * arbitrary bits (not just a single lock bit), if the need arises to store
 * some fast and compact auxiliary data.
 */

/*
 * LIST_BL_LOCKMASK is the mask of the bits of hlist_bl_head->first that
 * carry the lock instead of the address: bit 0 when CONFIG_SMP or
 * CONFIG_DEBUG_SPINLOCK is set, no bit otherwise.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK        1UL
#else
#define LIST_BL_LOCKMASK        0UL
#endif

/*
 * LIST_BL_BUG_ON() is BUG_ON() with CONFIG_DEBUG_LIST; otherwise it expands
 * to nothing, so its argument is not evaluated at all.
 */
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif


/* List head; the LIST_BL_LOCKMASK bits of 'first' hold the lock, the rest the address. */
struct hlist_bl_head {
        struct hlist_bl_node *first;
};

/*
 * List node.  'pprev' points at the pointer that references this node:
 * the head's 'first' or the previous node's 'next'.
 */
struct hlist_bl_node {
        struct hlist_bl_node *next, **pprev;
};
/* Make the head pointed to by @ptr empty; this also clears the lock bit. */
#define INIT_HLIST_BL_HEAD(ptr) \
        ((ptr)->first = NULL)

/*
 * Reset @h to the unlinked state: both pointers NULL, which is the state
 * hlist_bl_unhashed() tests for through 'pprev'.
 */
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/* Map a node pointer @ptr to the @type structure that embeds it as @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)

/* True when @h is not linked anywhere, i.e. its 'pprev' pointer is NULL. */
static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
{
        return h->pprev == NULL;
}

/* Return the first node of @h (NULL when empty) with the lock bit masked off. */
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)h->first;

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/*
 * Make @n the first node of @h, storing it with the lock bit set.
 * With CONFIG_DEBUG_LIST this BUGs when @n has the lock bit set in its
 * address or when the head's lock bit is not currently set.
 */
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long tagged;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);

        tagged = (unsigned long)n | LIST_BL_LOCKMASK;
        h->first = (struct hlist_bl_node *)tagged;
}

/* True when @h holds no node; the lock bit of 'first' is ignored. */
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)READ_ONCE(h->first);

        return (raw & ~LIST_BL_LOCKMASK) == 0;
}

/* Insert @n at the front of @h; the previous first node, if any, follows it. */
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *old_first = hlist_bl_first(h);

        n->next = old_first;
        if (old_first != NULL)
                old_first->pprev = &n->next;
        n->pprev = &h->first;

        /* Stores @n together with the lock bit. */
        hlist_bl_set_first(h, n);
}

/* Insert @n immediately in front of @next. */
static inline void hlist_bl_add_before(struct hlist_bl_node *n,
                                       struct hlist_bl_node *next)
{
        struct hlist_bl_node **link = next->pprev;
        uintptr_t lock_bits;

        n->pprev = link;
        n->next = next;
        next->pprev = &n->next;

        /*
         * 'link' may be the head's 'first' pointer, so carry its current
         * lock bit over into the value that replaces it.
         */
        lock_bits = (uintptr_t)*link & LIST_BL_LOCKMASK;
        WRITE_ONCE(*link, (struct hlist_bl_node *)((uintptr_t)n | lock_bits));
}

/* Insert @n immediately after @prev. */
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
                                       struct hlist_bl_node *prev)
{
        struct hlist_bl_node *after = prev->next;

        n->next = after;
        n->pprev = &prev->next;
        prev->next = n;

        /* The former successor of @prev now hangs off @n. */
        if (after)
                after->pprev = &n->next;
}

/*
 * Unlink @n from its list.  The pointers inside @n itself are left
 * untouched; callers poison or reinitialise them afterwards.
 */
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
        struct hlist_bl_node **link = n->pprev;
        struct hlist_bl_node *succ = n->next;
        unsigned long lock_bits;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);

        /*
         * 'link' may be the head's 'first' pointer, so carry its current
         * lock bit over into the value that replaces it.
         */
        lock_bits = (unsigned long)*link & LIST_BL_LOCKMASK;
        WRITE_ONCE(*link,
                   (struct hlist_bl_node *)((unsigned long)succ | lock_bits));
        if (succ)
                succ->pprev = link;
}

/*
 * Unlink @n and overwrite its pointers with LIST_POISON1/LIST_POISON2.
 * Afterwards 'pprev' is not NULL, so hlist_bl_unhashed() returns false
 * for the node.
 */
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/*
 * Unlink @n if it is currently linked and reset it to the unlinked state;
 * a node that is already unlinked is left alone.
 */
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
        if (hlist_bl_unhashed(n))
                return;

        __hlist_bl_del(n);
        INIT_HLIST_BL_NODE(n);
}

/* Take the list lock: a bit spinlock on bit 0 of the head word of @b. */
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
        bit_spin_lock(0, (unsigned long *)b);
}

/* Release the bit spinlock on bit 0 of the head word of @b. */
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
        __bit_spin_unlock(0, (unsigned long *)b);
}

/* Report whether the bit spinlock on bit 0 of the head word of @b is held. */
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
        return bit_spin_is_locked(0, (unsigned long *)b);
}

/**
 * hlist_bl_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The next node is read from @pos after the loop body has run, so the
 * body must not unlink the current entry; use
 * hlist_bl_for_each_entry_safe() for that.
 */
#define hlist_bl_for_each_entry(tpos, pos, head, member)                \
        for (pos = hlist_bl_first(head);                                \
             pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @n:                another &struct hlist_bl_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The successor of @pos is saved in @n before the loop body runs, so the
 * body may unlink the current entry.
 */
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member)         \
        for (pos = hlist_bl_first(head);                                 \
             pos && ({ n = pos->next; 1; }) &&                                  \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = n)

#endif
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_STRINGHASH_H
#define __LINUX_STRINGHASH_H

#include <linux/compiler.h>        /* For __pure */
#include <linux/types.h>        /* For u32, u64 */
#include <linux/hash.h>

/*
 * Routines for hashing strings of bytes to a 32-bit hash value.
 *
 * These hash functions are NOT GUARANTEED STABLE between kernel
 * versions, architectures, or even repeated boots of the same kernel.
 * (E.g. they may depend on boot-time hardware detection or be
 * deliberately randomized.)
 *
 * They are also not intended to be secure against collisions caused by
 * malicious inputs; much slower hash functions are required for that.
 *
 * They are optimized for pathname components, meaning short strings.
 * Even if a majority of files have longer names, the dynamic profile of
 * pathname components skews short due to short directory names.
 * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
 */

/*
 * Version 1: one byte at a time.  Example of use:
 *
 * unsigned long hash = init_name_hash(salt);
 * while (*p)
 *        hash = partial_name_hash(tolower(*p++), hash);
 * hash = end_name_hash(hash);
 *
 * Although this is designed for bytes, fs/hfsplus/unicode.c
 * abuses it to hash 16-bit values.
 */

/*
 * Hash courtesy of the R5 hash in reiserfs modulo sign bits.
 *
 * init_name_hash() yields the starting hash value: @salt converted to
 * unsigned long.  The expansion is fully parenthesized so that it always
 * behaves as a single operand, whatever operator the caller places next
 * to it (e.g. sizeof or a postfix operator).
 */
#define init_name_hash(salt)                ((unsigned long)(salt))

/*
 * Fold one more character @c into the running hash @prevhash and return
 * the new running hash.  Assumes roughly 4 bits of information per
 * character.
 */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
        unsigned long mixed = (c << 4) + (c >> 4);

        return (prevhash + mixed) * 11;
}

/*
 * Finally: cut down the number of bits to an int value (and try to avoid
 * losing bits).  This also has the property (wanted by the dcache)
 * that the msbits make a good hash table index.
 *
 * Returns hash_long() of @hash reduced to 32 bits.
 */
static inline unsigned int end_name_hash(unsigned long hash)
{
        return hash_long(hash, 32);
}

/*
 * Version 2: One word (32 or 64 bits) at a time.
 * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
 * exists, which describes major Linux platforms like x86 and ARM), then
 * this computes a different hash function much faster.
 *
 * If not set, this falls back to a wrapper around the preceding.
 */
extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int);

/*
 * A hash_len is a u64 with the hash of a string in the low
 * half and the length in the high half.
 *
 * hashlen_hash()   - extract the hash (low 32 bits)
 * hashlen_len()    - extract the length (high 32 bits)
 * hashlen_create() - pack @hash and @len into one hash_len
 */
#define hashlen_hash(hashlen) ((u32)(hashlen))
#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))

/* Return the "hash_len" (hash and length) of a null-terminated string */
extern u64 __pure hashlen_string(const void *salt, const char *name);

#endif        /* __LINUX_STRINGHASH_H */





















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_INTERNAL_H
#define _CRYPTO_INTERNAL_H

#include <crypto/algapi.h>
#include <linux/completion.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/numa.h>
#include <linux/refcount.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/types.h>

struct crypto_instance;
struct crypto_template;

/*
 * A crypto_alg wrapper that carries a completion.
 * NOTE(review): presumably a placeholder for an algorithm that is still
 * being looked up or tested - confirm against crypto/api.c.
 */
struct crypto_larval {
        struct crypto_alg alg;          /* embedded algorithm object */
        struct crypto_alg *adult;       /* presumably the real algorithm once known - TODO confirm */
        struct completion completion;   /* presumably completed when @adult is resolved - TODO confirm */
        u32 mask;                       /* presumably the type mask of the lookup - TODO confirm */
};

extern struct list_head crypto_alg_list;
extern struct rw_semaphore crypto_alg_sem;
extern struct blocking_notifier_head crypto_chain;

/*
 * procfs setup/teardown hooks.  Without CONFIG_PROC_FS they are empty
 * inline stubs, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_PROC_FS
void __init crypto_init_proc(void);
void __exit crypto_exit_proc(void);
#else
static inline void crypto_init_proc(void)
{ }
static inline void crypto_exit_proc(void)
{ }
#endif

/* Context size for a cipher transform: the algorithm's cra_ctxsize, unchanged. */
static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg)
{
        return alg->cra_ctxsize;
}

/* Context size for a compression transform: the algorithm's cra_ctxsize, unchanged. */
static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg)
{
        return alg->cra_ctxsize;
}

struct crypto_alg *crypto_mod_get(struct crypto_alg *alg);
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask);

struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask);
void crypto_larval_kill(struct crypto_alg *alg);
void crypto_alg_tested(const char *name, int err);

void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
                          struct crypto_alg *nalg);
void crypto_remove_final(struct list_head *list);
void crypto_shoot_alg(struct crypto_alg *alg);
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                      u32 mask);
void *crypto_create_tfm_node(struct crypto_alg *alg,
                        const struct crypto_type *frontend, int node);

/* crypto_create_tfm_node() without a NUMA node preference (NUMA_NO_NODE). */
static inline void *crypto_create_tfm(struct crypto_alg *alg,
                        const struct crypto_type *frontend)
{
        return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE);
}

struct crypto_alg *crypto_find_alg(const char *alg_name,
                                   const struct crypto_type *frontend,
                                   u32 type, u32 mask);

void *crypto_alloc_tfm_node(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask,
                       int node);

/* crypto_alloc_tfm_node() without a NUMA node preference (NUMA_NO_NODE). */
static inline void *crypto_alloc_tfm(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask)
{
        return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE);
}

int crypto_probing_notify(unsigned long val, void *v);

unsigned int crypto_alg_extsize(struct crypto_alg *alg);

int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
                        u32 type, u32 mask);

/* Take a reference on @alg (increments cra_refcnt) and return @alg for chaining. */
static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
{
        refcount_inc(&alg->cra_refcnt);
        return alg;
}

/*
 * Drop a reference on @alg.  When this was the last reference and the
 * algorithm provides a cra_destroy callback, that callback is invoked.
 */
static inline void crypto_alg_put(struct crypto_alg *alg)
{
        if (!refcount_dec_and_test(&alg->cra_refcnt))
                return;

        if (alg->cra_destroy)
                alg->cra_destroy(alg);
}

/* Try to pin the module owning @tmpl; returns the result of try_module_get(). */
static inline int crypto_tmpl_get(struct crypto_template *tmpl)
{
        return try_module_get(tmpl->module);
}

/* Release the module reference on @tmpl's owning module via module_put(). */
static inline void crypto_tmpl_put(struct crypto_template *tmpl)
{
        module_put(tmpl->module);
}

/* Non-zero when CRYPTO_ALG_LARVAL is set in alg->cra_flags. */
static inline int crypto_is_larval(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_LARVAL;
}

/* Non-zero when CRYPTO_ALG_DEAD is set in alg->cra_flags. */
static inline int crypto_is_dead(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_DEAD;
}

/* Non-zero when CRYPTO_ALG_DEAD or CRYPTO_ALG_DYING is set in alg->cra_flags. */
static inline int crypto_is_moribund(struct crypto_alg *alg)
{
        return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING);
}

/*
 * Run the crypto_chain blocking notifier chain with event @val and
 * payload @v.  The chain's return value is discarded.
 */
static inline void crypto_notify(unsigned long val, void *v)
{
        blocking_notifier_call_chain(&crypto_chain, val, v);
}

/*
 * Call cond_resched(), but only when @flags has CRYPTO_TFM_REQ_MAY_SLEEP
 * set.
 */
static inline void crypto_yield(u32 flags)
{
        if (!(flags & CRYPTO_TFM_REQ_MAY_SLEEP))
                return;

        cond_resched();
}

#endif        /* _CRYPTO_INTERNAL_H */







































































































































































































    2 



    1 

    1 





    2 





































    2 





















    1 











    1 



































































    1 


    1 










    1 









    1 


    1 
























    1 
    1 



    1 















































































    1 


    1 









    1 





















    1 
























    1 









    1 





    1 








    1 






    1 

    1 
    1 


    1 
    1 










    1 
















































    2 


    2 
    1 



    1 



    1 

    1 




























    1 





    1 












    1 











    1 




























    4 
    2 





    4 












    4 




















































    4 









    2 
































    4 
    4 




















































































































































































































































































































































































































































































































































































































    1 

























































































































































































































































    9 









    9 


    1 
    9 











    9 


    9 























    6 









    9 

    9 













    4 














    4 




































    7 

    7 
    7 














    9 
    9 





    9 


    8 

    8 

    8 
    1 
    8 

    8 
    2 
    8 

    8 

























    9 


    9 

    2 
    2 


    2 




    9 
    5 



    5 


    9 



    9 






    8 










    8 

























    7 
    7 
















    2 
    2 




    2 










    1 
    1 

    1 



    1 






























































































































































































































    3 

























    2 


























    2 















    2 

    2 


    2 
    3 


























    4 




















    4 
























































































































    4 


    4 






















    3 








    3 




















    3 











    3 





















    3 





    3 



































































    3 


    3 

    3 















    3 

    3 


    1 






    3 



























































































































































































































































































































































    3 


    1 













































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_subdirs
 *   - children's d_child and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
/*
 * Cache-pressure tunable, default 100.  Exported to GPL modules.
 * NOTE(review): presumably set through a sysctl, as the name suggests;
 * the registration is not in this part of the file.
 */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

/*
 * Global seqlock referred to by the lock-ordering notes: dentries with no
 * ancestor relationship are serialized on it.  Cacheline aligned on SMP.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

/* Slab cache that struct dentry objects are freed back to. */
static struct kmem_cache *dentry_cache __read_mostly;

/* Shared constant names: the empty string (length 0) and "/" (length 1). */
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 */

static unsigned int d_hash_shift __read_mostly;

static struct hlist_bl_head *dentry_hashtable __read_mostly;

/*
 * Pick the hashtable bucket for a name hash: the bucket index is @hash
 * shifted right by d_hash_shift.
 */
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
        unsigned int bucket = hash >> d_hash_shift;

        return &dentry_hashtable[bucket];
}

#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

/*
 * Pick the in-lookup bucket for a (parent, name hash) pair.  The parent's
 * address, scaled down by the L1 cacheline size, is folded into the name
 * hash before it is reduced to IN_LOOKUP_SHIFT bits.
 */
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                        unsigned int hash)
{
        unsigned int bucket;

        hash += (unsigned long) parent / L1_CACHE_BYTES;
        bucket = hash_32(hash, IN_LOOKUP_SHIFT);

        return &in_lookup_hashtable[bucket];
}


/* Statistics gathering. */
/*
 * Statistics record; only age_limit is given an explicit initial value.
 * proc_nr_dentry() overwrites nr_dentry, nr_unused and nr_negative with
 * fresh per-cpu sums each time it runs.
 */
struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
};

/*
 * Per-cpu counters behind the statistics above:
 *  nr_dentry          - live dentries (decremented in __dentry_kill())
 *  nr_dentry_unused   - dentries with DCACHE_LRU_LIST set
 *  nr_dentry_negative - negative dentries on the per-superblock LRU
 */
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_unused, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_negative(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_negative, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Handler that refreshes dentry_stat from the per-cpu counters and then
 * delegates to proc_doulongvec_minmax(), whose result is returned.
 */
int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
                   size_t *lenp, loff_t *ppos)
{
        struct dentry_stat_t *stats = &dentry_stat;
        int ret;

        stats->nr_dentry = get_nr_dentry();
        stats->nr_unused = get_nr_dentry_unused();
        stats->nr_negative = get_nr_dentry_negative();

        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        return ret;
}
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' and 'scount' come from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
/*
 * Word-at-a-time variant: returns 0 when the first @tcount bytes of @cs
 * and @ct are equal, 1 otherwise.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned long a,b,mask;

        for (;;) {
                /* Fetch one word from each side before checking what is left */
                a = read_word_at_a_time(cs);
                b = load_unaligned_zeropad(ct);
                /* Less than a full word remains: finish with the masked compare */
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                /* Length was a whole number of words and every word matched */
                if (!tcount)
                        return 0;
        }
        /* Restrict the final comparison to the tcount bytes still outstanding */
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Bytewise variant: returns 0 when the first @tcount bytes of @cs and @ct
 * are equal, 1 at the first differing byte.  @tcount must be non-zero; the
 * first byte is always compared.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned pos = 0;

        do {
                if (cs[pos] != ct[pos])
                        return 1;
        } while (++pos != tcount);

        return 0;
}

#endif

static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
        /*
         * An RCU walk can race with a rename, so the name pointer is
         * fetched exactly once with READ_ONCE().
         *
         * The length may not have been loaded atomically with respect
         * to that rename, and that is fine: the RCU walk validates the
         * sequence count later and will notice.  The buffer cannot be
         * overrun either, since the pointer itself is read atomically
         * and every dentry name ends in a NUL byte.
         *
         * So a stale 'len' merely makes the compare bail out early on a
         * mismatch, as the ct/tcount data never contains a NUL.
         */
        return dentry_string_cmp(READ_ONCE(dentry->d_name.name), ct, tcount);
}

/*
 * Out-of-line storage for a dentry name that does not live in the dentry's
 * own d_iname buffer.  d_name.name points at @name; external_name() maps
 * that pointer back to this structure.
 */
struct external_name {
        union {
                /* reference count; taken by name snapshots, dropped on release/free */
                atomic_t count;
                /* passed to kfree_rcu() once the last reference is gone */
                struct rcu_head head;
        } u;
        /* the name bytes themselves */
        unsigned char name[];
};

/*
 * Map a dentry's name pointer back to the struct external_name that
 * embeds it.  No check is made here that the name is in fact external.
 */
static inline struct external_name *external_name(struct dentry *dentry)
{
        const unsigned char *str = dentry->d_name.name;

        return container_of(str, struct external_name, name[0]);
}

/*
 * RCU callback: hand the dentry that embeds @head back to dentry_cache.
 */
static void __d_free(struct rcu_head *head)
{
        kmem_cache_free(dentry_cache,
                        container_of(head, struct dentry, d_u.d_rcu));
}

/*
 * RCU callback for a dentry with an out-of-line name: free the name
 * first, then hand the dentry back to dentry_cache.
 */
static void __d_free_external(struct rcu_head *head)
{
        struct dentry *victim = container_of(head, struct dentry, d_u.d_rcu);
        struct external_name *ext = external_name(victim);

        kfree(ext);
        kmem_cache_free(dentry_cache, victim);
}

/*
 * Non-zero when the dentry's name is stored somewhere other than its
 * inline d_iname buffer.
 */
static inline int dname_external(const struct dentry *dentry)
{
        const unsigned char *inline_buf = dentry->d_iname;

        return dentry->d_name.name != inline_buf;
}

/*
 * Capture @dentry's current name into @name under d_lock.
 *
 * An inline name is copied (terminating NUL included) into
 * @name->inline_name and the snapshot is pointed at that copy.  An
 * external name is shared instead: its reference count is bumped and the
 * snapshot keeps pointing at it.
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        name->name = dentry->d_name;
        if (likely(!dname_external(dentry))) {
                memcpy(name->inline_name, dentry->d_iname,
                       dentry->d_name.len + 1);
                name->name.name = name->inline_name;
        } else {
                atomic_inc(&external_name(dentry)->u.count);
        }
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Undo take_dentry_name_snapshot().  A snapshot that points at its own
 * inline buffer needs nothing; otherwise the external name's reference is
 * dropped and, if it was the last one, the name is freed via kfree_rcu().
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
        struct external_name *ext;

        if (likely(name->name.name == name->inline_name))
                return;

        ext = container_of(name->name.name, struct external_name, name[0]);
        if (unlikely(atomic_dec_and_test(&ext->u.count)))
                kfree_rcu(ext, u.head);
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode to @dentry and replace the entry-type bits of d_flags
 * (DCACHE_FALLTHRU is cleared as well) with @type_flags.  d_inode is
 * written first; the new flags word is then published with a release
 * store.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
{
        unsigned updated;

        dentry->d_inode = inode;
        updated = READ_ONCE(dentry->d_flags) &
                  ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        updated |= type_flags;
        smp_store_release(&dentry->d_flags, updated);
}

/*
 * Turn @dentry negative: clear the entry-type bits and DCACHE_FALLTHRU
 * from d_flags, then drop the inode pointer.
 */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        unsigned cleared;

        cleared = READ_ONCE(dentry->d_flags) &
                  ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        WRITE_ONCE(dentry->d_flags, cleared);
        dentry->d_inode = NULL;

        /*
         * Only dentries sitting on the LRU proper are counted as
         * negative; d_lru being in use for some other list (the shrink
         * list) does not qualify.
         */
        if (!(cleared & DCACHE_LRU_LIST))
                return;
        if (cleared & DCACHE_SHRINK_LIST)
                return;
        this_cpu_inc(nr_dentry_negative);
}

/*
 * Free @dentry's memory.  Warns if the dentry is still on an alias list.
 *
 * When the name is external and this drops its last reference, name and
 * dentry are freed together after a grace period.  Otherwise only the
 * dentry is freed: at once if it carries DCACHE_NORCU, else via call_rcu().
 */
static void dentry_free(struct dentry *dentry)
{
        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));

        if (unlikely(dname_external(dentry))) {
                struct external_name *ext = external_name(dentry);

                if (likely(atomic_dec_and_test(&ext->u.count))) {
                        call_rcu(&dentry->d_u.d_rcu, __d_free_external);
                        return;
                }
        }

        /* a dentry RCU may have seen must wait for a grace period */
        if (!(dentry->d_flags & DCACHE_NORCU)) {
                call_rcu(&dentry->d_u.d_rcu, __d_free);
                return;
        }
        __d_free(&dentry->d_u.d_rcu);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        /* Keep the inode pointer: d_inode is cleared just below */
        struct inode *inode = dentry->d_inode;

        /* Make the dentry negative and unlink the alias inside a d_seq write section */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        /* Drop both locks named in the __releases() annotations */
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        /* Link count is zero: send the inode-removal notification */
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        /* Use the filesystem's d_iput() when it supplies one, plain iput() otherwise */
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_inc(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_lru_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Take @dentry off a shrink list.  Both list flags must be set on entry
 * and both are cleared; only the "unused" counter is adjusted.
 */
static void d_shrink_del(struct dentry *dentry)
{
        const unsigned on_shrink = DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;

        D_FLAG_VERIFY(dentry, on_shrink);
        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~on_shrink;
        this_cpu_dec(nr_dentry_unused);
}

/*
 * Put @dentry, currently on no list, onto the shrink list @list.  Both
 * list flags are set and only the "unused" counter is adjusted.
 */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        const unsigned on_shrink = DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;

        D_FLAG_VERIFY(dentry, 0);
        list_add(&dentry->d_lru, list);
        dentry->d_flags |= on_shrink;
        this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
/*
 * Remove @dentry from the LRU altogether, adjusting the "unused" counter
 * and, for a negative dentry, the "negative" counter.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
        struct list_head *node = &dentry->d_lru;

        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);

        list_lru_isolate(lru, node);
}

/*
 * Move @dentry from the LRU onto the private shrink list @list.
 * DCACHE_LRU_LIST stays set and the "unused" counter is left alone; a
 * negative dentry stops being counted as negative.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                              struct list_head *list)
{
        struct list_head *node = &dentry->d_lru;

        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags |= DCACHE_SHRINK_LIST;
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);

        list_lru_isolate_move(lru, node, list);
}

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock
 * ___d_drop doesn't mark dentry as "unhashed"
 *   (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
 */
static void ___d_drop(struct dentry *dentry)
{
        struct hlist_bl_head *bucket;

        /*
         * A hashed dentry normally lives on the dentry hashtable.  The
         * exception is one freshly allocated by d_obtain_root: those are
         * always IS_ROOT and sit on the superblock's s_roots list.
         */
        bucket = unlikely(IS_ROOT(dentry)) ? &dentry->d_sb->s_roots
                                           : d_hash(dentry->d_name.hash);

        hlist_bl_lock(bucket);
        __hlist_bl_del(&dentry->d_hash);
        hlist_bl_unlock(bucket);
}

/*
 * Unhash @dentry if it is hashed: remove it from its bucket, mark it
 * unhashed by clearing d_hash.pprev and invalidate d_seq.  Nothing is
 * done for an already unhashed dentry.
 */
void __d_drop(struct dentry *dentry)
{
        if (d_unhashed(dentry))
                return;

        ___d_drop(dentry);
        dentry->d_hash.pprev = NULL;
        write_seqcount_invalidate(&dentry->d_seq);
}
EXPORT_SYMBOL(__d_drop);

void d_drop(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Mark @dentry as killed and, if it is on a child list, unlink it from
 * @parent's d_subdirs.  The ->d_child.next pointer left behind in @dentry
 * is then advanced past any cursor entries.
 *
 * NOTE(review): __dentry_kill() passes a NULL @parent for a root dentry;
 * presumably such a dentry always has an empty d_child so the early
 * return is taken before @parent is used - confirm.
 */
static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
{
        struct dentry *next;
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        /* Not on any child list: nothing to unlink */
        if (unlikely(list_empty(&dentry->d_child)))
                return;
        /* Unlink; the loop below still reads and rewrites our own ->d_child.next */
        __list_del_entry(&dentry->d_child);
        /*
         * Cursors can move around the list of children.  While we'd been
         * a normal list member, it didn't matter - ->d_child.next would've
         * been updated.  However, from now on it won't be and for the
         * things like d_walk() it might end up with a nasty surprise.
         * Normally d_walk() doesn't care about cursors moving around -
         * ->d_lock on parent prevents that and since a cursor has no children
         * of its own, we get through it without ever unlocking the parent.
         * There is one exception, though - if we ascend from a child that
         * gets killed as soon as we unlock it, the next sibling is found
         * using the value left in its ->d_child.next.  And if _that_
         * pointed to a cursor, and cursor got moved (e.g. by lseek())
         * before d_walk() regains parent->d_lock, we'll end up skipping
         * everything the cursor had been moved past.
         *
         * Solution: make sure that the pointer left behind in ->d_child.next
         * points to something that won't be moving around.  I.e. skip the
         * cursors.
         */
        /* Stop at the list head or at the first sibling that is not a cursor */
        while (dentry->d_child.next != &parent->d_subdirs) {
                next = list_entry(dentry->d_child.next, struct dentry, d_child);
                if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
                        break;
                dentry->d_child.next = next->d_child.next;
        }
}

/*
 * Tear down a dentry that has been selected for destruction.
 *
 * On entry dentry->d_lock is expected to be held, and for a non-root
 * dentry the parent's ->d_lock as well; both are released on the way.
 * NOTE(review): callers in this file also hold ->d_inode->i_lock for
 * positive dentries; dentry_unlink_inode() is presumably what drops it
 * together with dentry->d_lock - confirm against its definition.
 *
 * The dentry is freed here unless it sits on somebody's shrink list, in
 * which case it is only marked DCACHE_MAY_FREE and left for that list's
 * owner to free.
 */
static void __dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;
        if (!IS_ROOT(dentry))
                parent = dentry->d_parent;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        /*
         * Take it off the LRU, but leave it alone if it is on a shrink
         * list instead: that list is handled at the end of this function.
         */
        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        /* unlink from the parent's child list and set DCACHE_DENTRY_KILLED */
        dentry_unlist(dentry, parent);
        if (parent)
                spin_unlock(&parent->d_lock);
        /*
         * Either way dentry->d_lock is not held past this point (it is
         * re-taken further down); for a positive dentry the unlock is
         * presumably done inside dentry_unlink_inode().
         */
        if (dentry->d_inode)
                dentry_unlink_inode(dentry);
        else
                spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        /* let the filesystem release whatever it attached to the dentry */
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        /*
         * If the dentry is still on a shrink list we must not free it
         * under the list owner; flag it so that shrink_dentry_list()
         * frees it when it gets there.
         */
        spin_lock(&dentry->d_lock);
        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                dentry->d_flags |= DCACHE_MAY_FREE;
                can_free = false;
        }
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
        cond_resched();
}

/*
 * Slow path for locking the parent of @dentry.
 *
 * Called with dentry->d_lock held.  That lock is dropped so that the
 * parent's ->d_lock can be taken first, then dentry->d_lock is re-taken
 * nested inside it.  Because dentry->d_lock is released for a while, the
 * state of @dentry may have changed by the time this returns; callers
 * have to re-check whatever they depend on.
 *
 * Returns the locked parent, or NULL if @dentry turned out to be its own
 * parent.  dentry->d_lock is held on return in both cases.
 */
static struct dentry *__lock_parent(struct dentry *dentry)
{
        struct dentry *parent;
        /* keep the parent we sample below from being freed under us */
        rcu_read_lock();
        spin_unlock(&dentry->d_lock);
again:
        parent = READ_ONCE(dentry->d_parent);
        spin_lock(&parent->d_lock);
        /*
         * We can't blindly lock dentry until we are sure
         * that we won't violate the locking order.
         * Any changes of dentry->d_parent must have
         * been done with parent->d_lock held, so
         * spin_lock() above is enough of a barrier
         * for checking if it's still our child.
         */
        if (unlikely(parent != dentry->d_parent)) {
                spin_unlock(&parent->d_lock);
                goto again;
        }
        rcu_read_unlock();
        /*
         * If dentry is its own parent, the lock taken above already is
         * dentry->d_lock; there is no separate parent to report.
         */
        if (parent != dentry)
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        else
                parent = NULL;
        return parent;
}

/*
 * Lock the parent of @dentry; the caller holds dentry->d_lock.
 *
 * Returns NULL for a root dentry (nothing to lock), otherwise the parent
 * with its ->d_lock held.  The common case is a successful trylock; if
 * that fails we fall back to __lock_parent(), which drops and re-takes
 * dentry->d_lock.
 */
static inline struct dentry *lock_parent(struct dentry *dentry)
{
        struct dentry *up = dentry->d_parent;

        if (IS_ROOT(dentry))
                return NULL;

        return likely(spin_trylock(&up->d_lock)) ? up : __lock_parent(dentry);
}

/*
 * Decide whether a dentry whose last reference is going away should stay
 * in the cache.
 *
 * Returns false, without touching the refcount, when the dentry has to be
 * killed: it is unhashed, disconnected, vetoed by ->d_delete(), or marked
 * DCACHE_DONTCACHE.  Otherwise the reference is dropped, the dentry is
 * put on the LRU (or marked referenced if already there) and true is
 * returned.
 */
static inline bool retain_dentry(struct dentry *dentry)
{
        WARN_ON(d_in_lookup(dentry));

        /* Nobody can look it up any more - no point in keeping it */
        if (unlikely(d_unhashed(dentry)))
                return false;

        if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
                return false;

        /* Give the filesystem a say, if it asked for one */
        if (unlikely(dentry->d_flags & DCACHE_OP_DELETE) &&
            dentry->d_op->d_delete(dentry))
                return false;

        if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
                return false;

        /* Keep it around for the LRU to deal with */
        dentry->d_lockref.count--;
        if (likely(dentry->d_flags & DCACHE_LRU_LIST)) {
                if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
                        dentry->d_flags |= DCACHE_REFERENCED;
        } else {
                d_lru_add(dentry);
        }
        return true;
}

/*
 * Flag @inode and every dentry currently aliasing it as "do not cache":
 * each alias gets DCACHE_DONTCACHE (set under its own ->d_lock) and the
 * inode gets I_DONTCACHE, all under inode->i_lock.
 */
void d_mark_dontcache(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&alias->d_lock);
                alias->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&alias->d_lock);
        }
        inode->i_state |= I_DONTCACHE;

        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);

/*
 * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
 * Returns dentry requiring refcount drop, or NULL if we're done.
 *
 * Besides dentry->d_lock, killing needs ->d_inode->i_lock (for a positive
 * dentry) and the parent's ->d_lock.  Both are first attempted with
 * trylock; if either attempt fails, dentry->d_lock gets dropped while
 * waiting, so the decision to kill is re-evaluated at got_locks and the
 * dentry may end up being kept after all.
 */
static struct dentry *dentry_kill(struct dentry *dentry)
        __releases(dentry->d_lock)
{
        struct inode *inode = dentry->d_inode;
        struct dentry *parent = NULL;

        if (inode && unlikely(!spin_trylock(&inode->i_lock)))
                goto slow_positive;

        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                if (unlikely(!spin_trylock(&parent->d_lock))) {
                        /* __lock_parent() drops and re-takes ->d_lock */
                        parent = __lock_parent(dentry);
                        /*
                         * Fine unless we started negative (no i_lock
                         * taken) and the dentry has an inode now.
                         */
                        if (likely(inode || !dentry->d_inode))
                                goto got_locks;
                        /* negative that became positive */
                        if (parent)
                                spin_unlock(&parent->d_lock);
                        inode = dentry->d_inode;
                        goto slow_positive;
                }
        }
        /* fast path: every lock was obtained without dropping ->d_lock */
        __dentry_kill(dentry);
        return parent;

slow_positive:
        /* take i_lock first, then ->d_lock, then the parent */
        spin_unlock(&dentry->d_lock);
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);
        parent = lock_parent(dentry);
got_locks:
        /*
         * ->d_lock was dropped at some point; somebody may have grabbed
         * a reference or made the dentry worth retaining in the meantime.
         */
        if (unlikely(dentry->d_lockref.count != 1)) {
                dentry->d_lockref.count--;
        } else if (likely(!retain_dentry(dentry))) {
                __dentry_kill(dentry);
                return parent;
        }
        /* we are keeping it, after all */
        if (inode)
                spin_unlock(&inode->i_lock);
        if (parent)
                spin_unlock(&parent->d_lock);
        spin_unlock(&dentry->d_lock);
        return NULL;
}

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
        int ret;
        unsigned int d_flags;

        /*
         * If we have a d_op->d_delete() operation, we sould not
         * let the dentry count go to zero, so use "put_or_lock".
         */
        if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
                return lockref_put_or_lock(&dentry->d_lockref);

        /*
         * .. otherwise, we can try to just decrement the
         * lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
                dentry->d_lockref.count--;
                goto locked;
        }

        /*
         * If we weren't the last ref, we're done.
         */
        if (ret)
                return true;

        /*
         * Careful, careful. The reference count went down
         * to zero, but we don't hold the dentry lock, so
         * somebody else could get it again, and do another
         * dput(), and we need to not race with that.
         *
         * However, there is a very special and common case
         * where we don't care, because there is nothing to
         * do: the dentry is still hashed, it does not have
         * a 'delete' op, and it's referenced and already on
         * the LRU list.
         *
         * NOTE! Since we aren't locked, these values are
         * not "stable". However, it is sufficient that at
         * some point after we dropped the reference the
         * dentry was hashed and the flags had the proper
         * value. Other dentry users may have re-gotten
         * a reference to the dentry and change that, but
         * our work is done - we can leave the dentry
         * around with a zero refcount.
         */
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);
        d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;

        /* Nothing to do? Dropping the reference was all we needed? */
        if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
                return true;

        /*
         * Not the fast normal case? Get the lock. We've already decremented
         * the refcount, but we'll need to re-check the situation after
         * getting the lock.
         */
        spin_lock(&dentry->d_lock);

        /*
         * Did somebody else grab a reference to it in the meantime, and
         * we're no longer the last user after all? Alternatively, somebody
         * else could have killed it and marked it dead. Either way, we
         * don't need to do anything else.
         */
locked:
        if (dentry->d_lockref.count) {
                spin_unlock(&dentry->d_lock);
                return true;
        }

        /*
         * Re-get the reference we optimistically dropped. We hold the
         * lock, and we just tested that it was zero, so we can just
         * set it to 1.
         */
        dentry->d_lockref.count = 1;
        return false;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
        while (dentry) {
                might_sleep();

                rcu_read_lock();
                if (likely(fast_dput(dentry))) {
                        rcu_read_unlock();
                        return;
                }

                /* Slow case: now with the dentry lock held */
                rcu_read_unlock();

                if (likely(retain_dentry(dentry))) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }

                dentry = dentry_kill(dentry);
        }
}
EXPORT_SYMBOL(dput);

/*
 * Drop a reference to @dentry with ->d_lock held, queueing the dentry on
 * @list for later disposal if that was the last reference.  A dentry
 * that already sits on some shrink list only has its count decremented.
 */
static void __dput_to_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                /* whoever owns that list will take care of it */
                dentry->d_lockref.count--;
                return;
        }

        if (dentry->d_flags & DCACHE_LRU_LIST)
                d_lru_del(dentry);

        dentry->d_lockref.count--;
        if (dentry->d_lockref.count == 0)
                d_shrink_add(dentry, list);
}

void dput_to_list(struct dentry *dentry, struct list_head *list)
{
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        if (!retain_dentry(dentry))
                __dput_to_list(dentry, list);
        spin_unlock(&dentry->d_lock);
}

/*
 * Take an extra reference on @dentry by bumping the lockref count
 * directly.
 *
 * This must be called with d_lock held
 */
static inline void __dget_dlock(struct dentry *dentry)
{
        dentry->d_lockref.count++;
}

/*
 * Take an extra reference on @dentry through lockref_get(); unlike
 * __dget_dlock() this variant does not touch ->d_lockref.count directly.
 */
static inline void __dget(struct dentry *dentry)
{
        lockref_get(&dentry->d_lockref);
}

/*
 * Return the parent of @dentry with a reference taken on it.
 *
 * A lockless attempt is made first; if it cannot be validated, the
 * parent is pinned under its ->d_lock instead.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
        int gotref;
        struct dentry *ret;
        unsigned seq;

        /*
         * Do optimistic parent lookup without any
         * locking.
         */
        rcu_read_lock();
        seq = raw_seqcount_begin(&dentry->d_seq);
        ret = READ_ONCE(dentry->d_parent);
        gotref = lockref_get_not_zero(&ret->d_lockref);
        rcu_read_unlock();
        if (likely(gotref)) {
                /* the reference only counts if ->d_seq did not move */
                if (!read_seqcount_retry(&dentry->d_seq, seq))
                        return ret;
                /* we may have pinned a stale parent - let it go */
                dput(ret);
        }

repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
         * the lock.
         */
        rcu_read_lock();
        ret = dentry->d_parent;
        spin_lock(&ret->d_lock);
        if (unlikely(ret != dentry->d_parent)) {
                spin_unlock(&ret->d_lock);
                rcu_read_unlock();
                goto repeat;
        }
        rcu_read_unlock();
        /* a dentry that is somebody's parent is expected to be referenced */
        BUG_ON(!ret->d_lockref.count);
        ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Grab a reference to the first dentry on @inode's alias list, whatever
 * state it is in.  Returns NULL if the list is empty.
 * NOTE(review): the exported wrapper calls this under inode->i_lock;
 * presumably that is required of every caller.
 */
static struct dentry *__d_find_any_alias(struct inode *inode)
{
        struct dentry *first;

        if (hlist_empty(&inode->i_dentry))
                return NULL;

        first = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
        __dget(first);
        return first;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * Returns a referenced dentry aliasing @inode if there is at least one,
 * %NULL otherwise.  The alias list is examined under @inode->i_lock.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);
        alias = __d_find_any_alias(inode);
        spin_unlock(&inode->i_lock);

        return alias;
}
EXPORT_SYMBOL(d_find_any_alias);

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        /* for a directory any alias will do, hashed or not */
        if (S_ISDIR(inode->i_mode))
                return __d_find_any_alias(inode);

        /* otherwise return the first alias that is still hashed */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                bool hashed;

                spin_lock(&alias->d_lock);
                hashed = !d_unhashed(alias);
                if (hashed)
                        __dget_dlock(alias);
                spin_unlock(&alias->d_lock);

                if (hashed)
                        return alias;
        }
        return NULL;
}

/*
 * Locked wrapper around __d_find_alias().  An inode whose alias list
 * looks empty is answered with NULL without taking inode->i_lock.
 */
struct dentry *d_find_alias(struct inode *inode)
{
        struct dentry *found;

        if (hlist_empty(&inode->i_dentry))
                return NULL;

        spin_lock(&inode->i_lock);
        found = __d_find_alias(inode);
        spin_unlock(&inode->i_lock);
        return found;
}
EXPORT_SYMBOL(d_find_alias);

/*
 *        Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 *
 * Only aliases with a zero reference count are killed; aliases that are
 * in use are left alone.
 */
void d_prune_aliases(struct inode *inode)
{
        struct dentry *dentry;
restart:
        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&dentry->d_lock);
                if (!dentry->d_lockref.count) {
                        struct dentry *parent = lock_parent(dentry);
                        /*
                         * lock_parent() may have dropped ->d_lock for a
                         * moment, so look at the count once more.
                         */
                        if (likely(!dentry->d_lockref.count)) {
                                __dentry_kill(dentry);
                                dput(parent);
                                /*
                                 * We get here with i_lock and ->d_lock no
                                 * longer held (presumably released by
                                 * __dentry_kill()), so the iteration cannot
                                 * be continued - start over from the top.
                                 */
                                goto restart;
                        }
                        if (parent)
                                spin_unlock(&parent->d_lock);
                }
                spin_unlock(&dentry->d_lock);
        }
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Lock a dentry from shrink list.
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry has been disrupted or grabbed, leaving
 * the caller to kick it off-list.  Otherwise, return true and have
 * that dentry's inode and parent both locked.
 */
static bool shrink_lock_dentry(struct dentry *dentry)
{
        struct inode *inode;
        struct dentry *parent;

        /* somebody holds a reference (or it is dead): not ours to kill */
        if (dentry->d_lockref.count)
                return false;

        inode = dentry->d_inode;
        if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
                /*
                 * Trylock failed: drop ->d_lock, take the two locks in
                 * i_lock -> d_lock order, then re-validate what we saw.
                 */
                spin_unlock(&dentry->d_lock);
                spin_lock(&inode->i_lock);
                spin_lock(&dentry->d_lock);
                if (unlikely(dentry->d_lockref.count))
                        goto out;
                /* changed inode means that somebody had grabbed it */
                if (unlikely(inode != dentry->d_inode))
                        goto out;
        }

        parent = dentry->d_parent;
        /* a root has no separate parent to lock */
        if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
                return true;

        /* same dance for the parent: parent first, then the dentry */
        spin_unlock(&dentry->d_lock);
        spin_lock(&parent->d_lock);
        if (unlikely(parent != dentry->d_parent)) {
                /* parent changed while ->d_lock was dropped: give up */
                spin_unlock(&parent->d_lock);
                spin_lock(&dentry->d_lock);
                goto out;
        }
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        if (likely(!dentry->d_lockref.count))
                return true;
        spin_unlock(&parent->d_lock);
out:
        /* failure: only dentry->d_lock remains held on return */
        if (inode)
                spin_unlock(&inode->i_lock);
        return false;
}

/*
 * Dispose of every dentry on the shrink list @list, working from the
 * tail.  Dentries that can be locked down are killed; their parents get
 * a reference dropped and may be queued on @list in turn.  Dentries
 * that cannot be locked down are just taken off the list.
 */
void shrink_dentry_list(struct list_head *list)
{
        while (!list_empty(list)) {
                struct dentry *dentry, *parent;

                dentry = list_entry(list->prev, struct dentry, d_lru);
                spin_lock(&dentry->d_lock);
                rcu_read_lock();
                if (!shrink_lock_dentry(dentry)) {
                        bool can_free = false;
                        rcu_read_unlock();
                        d_shrink_del(dentry);
                        /*
                         * A negative count with DCACHE_MAY_FREE set is
                         * what __dentry_kill() leaves behind for a dentry
                         * it found on a shrink list: freeing is our job.
                         */
                        if (dentry->d_lockref.count < 0)
                                can_free = dentry->d_flags & DCACHE_MAY_FREE;
                        spin_unlock(&dentry->d_lock);
                        if (can_free)
                                dentry_free(dentry);
                        continue;
                }
                rcu_read_unlock();
                d_shrink_del(dentry);
                parent = dentry->d_parent;
                /*
                 * Drop the reference held on the parent while its ->d_lock
                 * is still held (taken by shrink_lock_dentry()).
                 */
                if (parent != dentry)
                        __dput_to_list(parent, list);
                __dentry_kill(dentry);
        }
}

/*
 * list_lru walk callback used by prune_dcache_sb().  @arg is the list
 * head that collects the dentries selected for disposal.
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct dentry *victim = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;
        enum lru_status status;

        /*
         * The lru lock is already held here, which is the reverse of the
         * usual dentry->d_lock / lru lock order.  Hence only a trylock;
         * a contended dentry is simply skipped.
         */
        if (!spin_trylock(&victim->d_lock))
                return LRU_SKIP;

        if (victim->d_lockref.count) {
                /* In active use: it merely has to leave the LRU. */
                d_lru_isolate(lru, victim);
                status = LRU_REMOVED;
        } else if (victim->d_flags & DCACHE_REFERENCED) {
                /*
                 * Unused but recently referenced: clear the flag and let
                 * it go round the LRU once more.
                 *
                 * The move itself is done by the common LRU code after we
                 * return, by which time dentry->d_lock is released and
                 * only the lru lock is held.  That is safe because every
                 * list movement is protected by the lru lock: all LRU
                 * manipulation in this file goes through the list_lru API
                 * or through callbacks invoked by it, such as this one.
                 * The exceptions - shrink_dentry_list() and code that
                 * checks DCACHE_SHRINK_LIST first - work only on
                 * stack-provided lists whose entries have already been
                 * isolated from the main list.
                 */
                victim->d_flags &= ~DCACHE_REFERENCED;
                status = LRU_ROTATE;
        } else {
                /* Unused and not referenced: queue it for disposal. */
                d_lru_shrink_move(lru, victim, dispose);
                status = LRU_REMOVED;
        }

        spin_unlock(&victim->d_lock);
        return status;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                     dentry_lru_isolate, &dispose);
        shrink_dentry_list(&dispose);
        return freed;
}

/*
 * list_lru walk callback used by shrink_dcache_sb(): every dentry whose
 * ->d_lock can be taken is moved to the disposal list passed in @arg,
 * regardless of its reference count or flags.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct dentry *victim = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;

        /*
         * Lock order is inverted here (lru lock is held already), so we
         * can only try; a busy dentry gets skipped.
         */
        if (!spin_trylock(&victim->d_lock))
                return LRU_SKIP;

        d_lru_shrink_move(lru, victim, dispose);
        spin_unlock(&victim->d_lock);

        return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Empty the dentry LRU of @sb.  This is used to free the dcache before
 * unmounting a file system.  The LRU is drained in batches of up to 1024
 * entries until list_lru_count() reports it empty.
 */
void shrink_dcache_sb(struct super_block *sb)
{
        for (;;) {
                LIST_HEAD(dispose);

                list_lru_walk(&sb->s_dentry_lru,
                              dentry_lru_isolate_shrink, &dispose, 1024);
                shrink_dentry_list(&dispose);

                if (list_lru_count(&sb->s_dentry_lru) == 0)
                        break;
        }
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:        continue walk
 * @D_WALK_QUIT:        quit walk
 * @D_WALK_NORETRY:        quit when retry is needed
 * @D_WALK_SKIP:        skip this dentry and its children
 */
enum d_walk_ret {
        D_WALK_CONTINUE,
        D_WALK_QUIT,
        D_WALK_NORETRY,
        D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:        start of walk
 * @data:        data passed to @enter()
 * @enter:        callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 *
 * @enter() is called for @parent itself and then for each dentry found
 * below it, skipping cursors.  Its return value steers the walk, see
 * &enum d_walk_ret.  If a rename is detected through rename_lock the
 * whole walk is restarted, unless a callback has returned
 * %D_WALK_NORETRY, in which case the walk just stops.
 */
static void d_walk(struct dentry *parent, void *data,
                   enum d_walk_ret (*enter)(void *, struct dentry *))
{
        struct dentry *this_parent;
        struct list_head *next;
        unsigned seq = 0;
        enum d_walk_ret ret;
        bool retry = true;

again:
        read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
        spin_lock(&this_parent->d_lock);

        ret = enter(data, this_parent);
        switch (ret) {
        case D_WALK_CONTINUE:
                break;
        case D_WALK_QUIT:
        case D_WALK_SKIP:
                /* skipping the starting point leaves nothing to walk */
                goto out_unlock;
        case D_WALK_NORETRY:
                retry = false;
                break;
        }
repeat:
        /* start on the first child of this_parent */
        next = this_parent->d_subdirs.next;
resume:
        while (next != &this_parent->d_subdirs) {
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
                next = tmp->next;

                if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
                        continue;

                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

                ret = enter(data, dentry);
                switch (ret) {
                case D_WALK_CONTINUE:
                        break;
                case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
                        goto out_unlock;
                case D_WALK_NORETRY:
                        retry = false;
                        break;
                case D_WALK_SKIP:
                        spin_unlock(&dentry->d_lock);
                        continue;
                }

                if (!list_empty(&dentry->d_subdirs)) {
                        /*
                         * Descend: the child becomes the new this_parent.
                         * Its ->d_lock stays held throughout; only the
                         * dep_map annotations are released and re-acquired
                         * so that the lock, taken nested above, is now
                         * recorded as a non-nested one.
                         */
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, _RET_IP_);
                        this_parent = dentry;
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        rcu_read_lock();
ascend:
        if (this_parent != parent) {
                struct dentry *child = this_parent;
                this_parent = child->d_parent;

                /*
                 * Between these two calls no ->d_lock is held at all; the
                 * child may get killed in that window, which is why the
                 * sibling search below checks DCACHE_DENTRY_KILLED.
                 */
                spin_unlock(&child->d_lock);
                spin_lock(&this_parent->d_lock);

                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
                /* go into the first sibling still alive */
                do {
                        next = child->d_child.next;
                        if (next == &this_parent->d_subdirs)
                                goto ascend;
                        child = list_entry(next, struct dentry, d_child);
                } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
                rcu_read_unlock();
                goto resume;
        }
        /* back at the starting point: the walk is complete if nothing moved */
        if (need_seqretry(&rename_lock, seq))
                goto rename_retry;
        rcu_read_unlock();

out_unlock:
        spin_unlock(&this_parent->d_lock);
        done_seqretry(&rename_lock, seq);
        return;

rename_retry:
        spin_unlock(&this_parent->d_lock);
        rcu_read_unlock();
        BUG_ON(seq & 1);
        if (!retry)
                return;
        /*
         * NOTE(review): an odd seq presumably makes
         * read_seqbegin_or_lock() take rename_lock exclusively on the
         * second pass, so that pass cannot be disturbed again - confirm
         * against the seqlock API.
         */
        seq = 1;
        goto again;
}

/* State handed to path_check_mount() by path_has_submounts(). */
struct check_mount {
        struct vfsmount *mnt;        /* mount used to build the path being tested */
        unsigned int mounted;        /* set to 1 once a mountpoint has been found */
};

/*
 * d_walk() callback for path_has_submounts(): end the walk as soon as a
 * dentry is found for which __path_is_mountpoint() confirms a mount on
 * the (info->mnt, dentry) pair.
 */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
        struct check_mount *info = data;
        struct path path = { .mnt = info->mnt, .dentry = dentry };

        /* the cheap d_mountpoint() test filters out most dentries */
        if (unlikely(d_mountpoint(dentry)) && __path_is_mountpoint(&path)) {
                info->mounted = 1;
                return D_WALK_QUIT;
        }
        return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
        struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

        read_seqlock_excl(&mount_lock);
        d_walk(parent->dentry, &data, path_check_mount);
        read_sequnlock_excl(&mount_lock);

        return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
        struct dentry *p;
        int ret = -ENOENT;
        write_seqlock(&rename_lock);
        for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
                /* Need exclusion wrt. d_invalidate() */
                spin_lock(&p->d_lock);
                if (unlikely(d_unhashed(p))) {
                        spin_unlock(&p->d_lock);
                        goto out;
                }
                spin_unlock(&p->d_lock);
        }
        spin_lock(&dentry->d_lock);
        if (!d_unlinked(dentry)) {
                ret = -EBUSY;
                if (!d_mountpoint(dentry)) {
                        dentry->d_flags |= DCACHE_MOUNTED;
                        ret = 0;
                }
        }
         spin_unlock(&dentry->d_lock);
out:
        write_sequnlock(&rename_lock);
        return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_subdirs list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise  it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 */

/* State shared between shrink_dcache_parent() and its d_walk() callbacks. */
struct select_data {
        struct dentry *start;        /* root of the walk; the callbacks never collect it */
        /* the two members share storage: one walk uses one or the other */
        union {
                long found;                /* counter kept by select_collect() */
                struct dentry *victim;        /* dentry picked by select_collect2() */
        };
        struct list_head dispose;        /* dentries queued via d_shrink_add() */
};

/*
 * d_walk() callback: move unused dentries to the caller's dispose list
 * and count, in ->found, those queued here plus those that already sit
 * on some shrink list.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* the starting point itself is never collected */
        if (sel->start == dentry)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                sel->found++;
        } else {
                if (dentry->d_flags & DCACHE_LRU_LIST)
                        d_lru_del(dentry);
                if (!dentry->d_lockref.count) {
                        d_shrink_add(dentry, &sel->dispose);
                        sel->found++;
                }
        }

        /*
         * Once something has been collected the caller is guaranteed to
         * make progress, so the walk may stop early; the caller will come
         * back for the rest.
         */
        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;
        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/*
 * d_walk() callback used for the second pass of shrink_dcache_parent().
 *
 * If a zero-refcount dentry already flagged DCACHE_SHRINK_LIST is met,
 * it is recorded in data->victim and the walk quits *with
 * rcu_read_lock() held*; the caller is responsible for the matching
 * rcu_read_unlock().  Otherwise behaves like select_collect() without
 * the counting.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
        struct select_data *sd = _data;

        if (dentry == sd->start)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                if (!dentry->d_lockref.count) {
                        /* returned to the caller under RCU protection */
                        rcu_read_lock();
                        sd->victim = dentry;
                        return D_WALK_QUIT;
                }
        } else {
                if (dentry->d_flags & DCACHE_LRU_LIST)
                        d_lru_del(dentry);
                if (!dentry->d_lockref.count)
                        d_shrink_add(dentry, &sd->dispose);
        }

        /*
         * Nothing collected yet: keep walking.  Once something is on the
         * dispose list we may hand control back to the caller (this
         * guarantees forward progress); it will come back for the rest.
         */
        if (list_empty(&sd->dispose))
                return D_WALK_CONTINUE;

        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
        for (;;) {
                struct select_data data = {.start = parent};

                INIT_LIST_HEAD(&data.dispose);
                d_walk(parent, &data, select_collect);

                if (!list_empty(&data.dispose)) {
                        shrink_dentry_list(&data.dispose);
                        continue;
                }

                cond_resched();
                if (!data.found)
                        break;
                data.victim = NULL;
                d_walk(parent, &data, select_collect2);
                if (data.victim) {
                        struct dentry *parent;
                        spin_lock(&data.victim->d_lock);
                        if (!shrink_lock_dentry(data.victim)) {
                                spin_unlock(&data.victim->d_lock);
                                rcu_read_unlock();
                        } else {
                                rcu_read_unlock();
                                parent = data.victim->d_parent;
                                if (parent != data.victim)
                                        __dput_to_list(parent, &data.dispose);
                                __dentry_kill(data.victim);
                        }
                }
                if (!list_empty(&data.dispose))
                        shrink_dentry_list(&data.dispose);
        }
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback run at unmount: report every leaf dentry that is
 * still in use.  @_data is the root of the tree being torn down.
 * Always lets the walk continue.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
        unsigned long ino = 0UL;

        /* it has busy descendents; complain about those instead */
        if (!list_empty(&dentry->d_subdirs))
                return D_WALK_CONTINUE;

        /* root with refcount 1 is fine */
        if (dentry == _data && dentry->d_lockref.count == 1)
                return D_WALK_CONTINUE;

        if (dentry->d_inode)
                ino = dentry->d_inode->i_ino;

        printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
                        " still in use (%d) [unmount of %s %s]\n",
                       dentry, ino, dentry,
                       dentry->d_lockref.count,
                       dentry->d_sb->s_type->name,
                       dentry->d_sb->s_id);
        WARN_ON(1);
        return D_WALK_CONTINUE;
}

/*
 * Tear down one dentry tree: prune its unused children, report anything
 * still in use via umount_check(), then drop and release @dentry itself.
 */
static void do_one_tree(struct dentry *dentry)
{
        shrink_dcache_parent(dentry);
        /* @dentry doubles as the "root" argument that umount_check() exempts */
        d_walk(dentry, dentry, umount_check);
        d_drop(dentry);
        dput(dentry);
}

/*
 * Destroy the dentries attached to a superblock on unmounting.
 *
 * The tree under sb->s_root goes first (and sb->s_root is cleared), then
 * the dentry at the head of sb->s_roots is torn down repeatedly until
 * that list is empty.  Warns if s_umount could be read-locked, i.e. the
 * caller did not hold it.
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
        struct dentry *tree;

        WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");

        tree = sb->s_root;
        sb->s_root = NULL;
        do_one_tree(tree);

        while (!hlist_bl_empty(&sb->s_roots)) {
                /* do_one_tree() ends with dput(), so take a reference first */
                tree = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots),
                                           struct dentry, d_hash));
                do_one_tree(tree);
        }
}

/*
 * d_walk() callback for d_invalidate(): stop at the first mountpoint,
 * take a reference on it and hand it back through @_data
 * (a struct dentry **).
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
        struct dentry **result = _data;

        if (!d_mountpoint(dentry))
                return D_WALK_CONTINUE;

        __dget_dlock(dentry);
        *result = dentry;
        return D_WALK_QUIT;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 *
 * Does nothing if @dentry is already unhashed.  A negative dentry is
 * only dropped.  Otherwise the subtree is pruned, every mountpoint found
 * below is detached, and the subtree is pruned once more if any
 * submounts had to be detached.
 */
void d_invalidate(struct dentry *dentry)
{
        bool had_submounts = false;
        struct dentry *victim;

        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
                spin_unlock(&dentry->d_lock);
                return;
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);

        /* Negative dentries can be dropped without further checks */
        if (!dentry->d_inode)
                return;

        shrink_dcache_parent(dentry);

        /* find_submount() returns one referenced mountpoint per walk */
        for (;;) {
                victim = NULL;
                d_walk(dentry, &victim, find_submount);
                if (!victim)
                        break;
                had_submounts = true;
                detach_mounts(victim);
                dput(victim);
        }

        if (had_submounts)
                shrink_dcache_parent(dentry);
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc        -        allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available, or if the ->d_init() dentry operation reports an error.
 * On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 * If @name is %NULL, slash_name is used instead.
 *
 * The new dentry starts with a reference count of 1, no inode, and is
 * its own parent.
 */

static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry;
        char *dname;
        int err;

        dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
        if (!dentry)
                return NULL;

        /*
         * We guarantee that the inline name is always NUL-terminated.
         * This way the memcpy() done by the name switching in rename
         * will still always have a NUL at the end, even if we might
         * be overwriting an internal NUL character
         */
        dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
        if (unlikely(!name)) {
                name = &slash_name;
                dname = dentry->d_iname;
        } else if (name->len > DNAME_INLINE_LEN-1) {
                /*
                 * Name does not fit inline: allocate an external_name.
                 * offsetof(..., name[1]) covers the header plus one byte,
                 * which leaves room for the terminating NUL.
                 */
                size_t size = offsetof(struct external_name, name[1]);
                struct external_name *p = kmalloc(size + name->len,
                                                  GFP_KERNEL_ACCOUNT |
                                                  __GFP_RECLAIMABLE);
                if (!p) {
                        kmem_cache_free(dentry_cache, dentry); 
                        return NULL;
                }
                atomic_set(&p->u.count, 1);
                dname = p->name;
        } else  {
                dname = dentry->d_iname;
        }        

        dentry->d_name.len = name->len;
        dentry->d_name.hash = name->hash;
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;

        /* Make sure we always see the terminating NUL character */
        smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

        dentry->d_lockref.count = 1;
        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
        dentry->d_inode = NULL;
        dentry->d_parent = dentry;
        dentry->d_sb = sb;
        /* d_op must be NULL and d_flags clear before d_set_d_op() below */
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        INIT_HLIST_BL_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_child);
        d_set_d_op(dentry, dentry->d_sb->s_d_op);

        if (dentry->d_op && dentry->d_op->d_init) {
                err = dentry->d_op->d_init(dentry);
                if (err) {
                        /* undo both allocations; the error code itself is not propagated */
                        if (dname_external(dentry))
                                kfree(external_name(dentry));
                        kmem_cache_free(dentry_cache, dentry);
                        return NULL;
                }
        }

        /* per-cpu count of allocated dentries */
        this_cpu_inc(nr_dentry);

        return dentry;
}

/**
 * d_alloc        -        allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
        struct dentry *child;

        child = __d_alloc(parent->d_sb, name);
        if (!child)
                return NULL;

        spin_lock(&parent->d_lock);
        /*
         * Only the parent's lock is taken: the freshly allocated child
         * is not subject to concurrency here.
         */
        __dget_dlock(parent);           /* the child pins its parent */
        child->d_parent = parent;
        list_add(&child->d_child, &parent->d_subdirs);
        /* children of a disconnected parent start out disconnected too */
        if (parent->d_flags & DCACHE_DISCONNECTED)
                child->d_flags |= DCACHE_DISCONNECTED;
        spin_unlock(&parent->d_lock);

        return child;
}
EXPORT_SYMBOL(d_alloc);

/*
 * Allocate a parentless dentry on @sb without supplying a name;
 * __d_alloc() substitutes slash_name for the NULL name.
 * Returns NULL on failure.
 */
struct dentry *d_alloc_anon(struct super_block *sb)
{
        return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);

struct dentry *d_alloc_cursor(struct dentry * parent)
{
        struct dentry *dentry = d_alloc_anon(parent->d_sb);
        if (dentry) {
                dentry->d_flags |= DCACHE_DENTRY_CURSOR;
                dentry->d_parent = dget(parent);
        }
        return dentry;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et.al. - the stuff that should
 * never be anyone's children or parents.  Unlike all other
 * dentries, these will not have RCU delay between dropping the
 * last reference and freeing them.
 *
 * The only user is alloc_file_pseudo() and that's what should
 * be considered a public interface.  Don't use directly.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry = __d_alloc(sb, name);
        if (likely(dentry))
                dentry->d_flags |= DCACHE_NORCU;
        return dentry;
}

struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
        struct qstr q;

        q.name = name;
        q.hash_len = hashlen_string(parent, name);
        return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/*
 * Install @op as the dentry operations of @dentry and mirror the set of
 * implemented methods into the DCACHE_OP_* bits of ->d_flags.
 *
 * Warns (once) if the dentry already has operations or already carries
 * any of the checked DCACHE_OP_* bits.  A NULL @op just stores NULL.
 *
 * NOTE(review): DCACHE_OP_PRUNE is set below but is not part of the
 * warning mask -- confirm whether that omission is intentional.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
        unsigned int op_flags = 0;

        WARN_ON_ONCE(dentry->d_op);
        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
                                        DCACHE_OP_COMPARE |
                                        DCACHE_OP_REVALIDATE |
                                        DCACHE_OP_WEAK_REVALIDATE |
                                        DCACHE_OP_DELETE |
                                        DCACHE_OP_REAL));

        dentry->d_op = op;
        if (!op)
                return;

        /* one flag per method the filesystem actually provides */
        if (op->d_hash)
                op_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                op_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                op_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_weak_revalidate)
                op_flags |= DCACHE_OP_WEAK_REVALIDATE;
        if (op->d_delete)
                op_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                op_flags |= DCACHE_OP_PRUNE;
        if (op->d_real)
                op_flags |= DCACHE_OP_REAL;

        if (op_flags)
                dentry->d_flags |= op_flags;
}
EXPORT_SYMBOL(d_set_d_op);


/**
 * d_set_fallthru - Mark a dentry as falling through to a lower layer
 * @dentry: The dentry to mark
 *
 * Mark a dentry as falling through to the lower layer (as set with
 * d_pin_lower()).  This flag may be recorded on the medium.
 *
 * DCACHE_FALLTHRU is set in ->d_flags while holding ->d_lock.
 */
void d_set_fallthru(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_FALLTHRU;
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_set_fallthru);

/*
 * Work out the DCACHE_*_TYPE flag (plus DCACHE_NEED_AUTOMOUNT where it
 * applies) describing @inode.  A NULL inode yields DCACHE_MISS_TYPE.
 *
 * Side effect: IOP_LOOKUP / IOP_NOFOLLOW may be set in inode->i_opflags
 * to remember the outcome of the ->lookup / ->get_link checks, so that
 * later calls can skip looking at inode->i_op.
 */
static unsigned d_flags_for_inode(struct inode *inode)
{
        unsigned add_flags = DCACHE_REGULAR_TYPE;

        if (!inode)
                return DCACHE_MISS_TYPE;

        if (S_ISDIR(inode->i_mode)) {
                add_flags = DCACHE_DIRECTORY_TYPE;
                /* IOP_LOOKUP not cached yet: inspect ->lookup once */
                if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                        /* a directory without ->lookup is typed AUTODIR */
                        if (unlikely(!inode->i_op->lookup))
                                add_flags = DCACHE_AUTODIR_TYPE;
                        else
                                inode->i_opflags |= IOP_LOOKUP;
                }
                goto type_determined;
        }

        /* IOP_NOFOLLOW not cached yet: an inode with ->get_link is a symlink */
        if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                if (unlikely(inode->i_op->get_link)) {
                        add_flags = DCACHE_SYMLINK_TYPE;
                        goto type_determined;
                }
                /* no ->get_link: remember that for the next call */
                inode->i_opflags |= IOP_NOFOLLOW;
        }

        /* neither directory nor symlink: regular unless the mode says otherwise */
        if (unlikely(!S_ISREG(inode->i_mode)))
                add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
        if (unlikely(IS_AUTOMOUNT(inode)))
                add_flags |= DCACHE_NEED_AUTOMOUNT;
        return add_flags;
}

/*
 * Attach @inode to @dentry: link the dentry into inode->i_dentry and set
 * its inode pointer and type flags inside a d_seq write section.
 *
 * Takes and releases dentry->d_lock itself.  The callers visible in this
 * file (d_instantiate(), d_instantiate_new()) hold inode->i_lock around
 * the call.  @inode is dereferenced unconditionally, so it must be
 * non-NULL.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
        unsigned add_flags = d_flags_for_inode(inode);
        WARN_ON(d_in_lookup(dentry));

        spin_lock(&dentry->d_lock);
        /*
         * The negative counter only tracks dentries on the LRU. Don't dec if
         * d_lru is on another list.
         */
        if ((dentry->d_flags &
             (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
                this_cpu_dec(nr_dentry_negative);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        /* readers sampling d_seq must not see a half-updated inode/type pair */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
        fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode *inode)
{
        /* the dentry must not be on any inode's alias list yet */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));

        /* a NULL inode leaves the dentry untouched */
        if (!inode)
                return;

        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
        /* the dentry must not be on any inode's alias list yet */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        /* unlike d_instantiate(), a NULL inode is a bug here */
        BUG_ON(!inode);
        lockdep_annotate_inode_mutex_key(inode);
        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        /* the inode is expected to still be marked I_NEW at this point */
        WARN_ON(!(inode->i_state & I_NEW));
        /* clear both I_NEW and I_CREATING in one store */
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /* full barrier between the i_state update and waking __I_NEW waiters */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry on root_inode's superblock and attach
 * @root_inode to it.
 *
 * Returns the new dentry, or NULL if @root_inode is NULL or the
 * allocation fails.  On allocation failure the inode reference is
 * dropped with iput(), so the caller does not need to release it.
 */
struct dentry *d_make_root(struct inode *root_inode)
{
        struct dentry *root;

        if (!root_inode)
                return NULL;

        root = d_alloc_anon(root_inode->i_sb);
        if (!root) {
                iput(root_inode);
                return NULL;
        }

        d_instantiate(root, root_inode);
        return root;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Attach @inode to the preallocated anonymous @dentry unless the inode
 * already has an alias.
 *
 * If an alias exists it is returned instead; @dentry is released with
 * dput() and the inode reference with iput().  Otherwise @dentry is
 * returned with the inode attached; with @disconnected it is flagged
 * DCACHE_DISCONNECTED, without it the dentry is added to the
 * superblock's s_roots list.
 */
static struct dentry *__d_instantiate_anon(struct dentry *dentry,
                                           struct inode *inode,
                                           bool disconnected)
{
        struct dentry *alias;
        unsigned add_flags;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);

        alias = __d_find_any_alias(inode);
        if (alias) {
                /* an alias already exists: ours is not needed */
                spin_unlock(&inode->i_lock);
                dput(dentry);
                iput(inode);
                return alias;
        }

        /* attach a disconnected dentry */
        add_flags = d_flags_for_inode(inode);
        if (disconnected)
                add_flags |= DCACHE_DISCONNECTED;

        spin_lock(&dentry->d_lock);
        __d_set_inode_and_type(dentry, inode, add_flags);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        if (!disconnected) {
                struct hlist_bl_head *roots = &dentry->d_sb->s_roots;

                hlist_bl_lock(roots);
                hlist_bl_add_head(&dentry->d_hash, roots);
                hlist_bl_unlock(roots);
        }
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);

        return dentry;
}

/*
 * Exported form of __d_instantiate_anon() that always requests a
 * DCACHE_DISCONNECTED dentry.  Returns either @dentry with @inode
 * attached or an already existing alias of @inode.
 */
struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
{
        return __d_instantiate_anon(dentry, inode, true);
}
EXPORT_SYMBOL(d_instantiate_anon);

/*
 * Common body of d_obtain_alias() and d_obtain_root().
 *
 * Maps a NULL @inode to ERR_PTR(-ESTALE) and passes an error-pointer
 * @inode through.  Returns an existing alias when there is one,
 * otherwise allocates an anonymous dentry and attaches the inode to it.
 * The inode reference is dropped on every path that does not hand it
 * to __d_instantiate_anon().
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
        struct dentry *alias;
        struct dentry *anon;

        if (!inode)
                return ERR_PTR(-ESTALE);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        alias = d_find_any_alias(inode);
        if (alias) {
                iput(inode);
                return alias;
        }

        anon = d_alloc_anon(inode->i_sb);
        if (!anon) {
                iput(inode);
                return ERR_PTR(-ENOMEM);
        }

        return __d_instantiate_anon(anon, inode, disconnected);
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
        /* true: a newly allocated dentry gets DCACHE_DISCONNECTED */
        return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and the error will
 * be propagated to the return value, with a %NULL @inode replaced by
 * ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
        /* false: a newly allocated dentry is not marked DCACHE_DISCONNECTED */
        return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @inode:  the inode case-insensitive lookup has found
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 *
 * Every path that returns before d_splice_alias() is reached drops the
 * reference on @inode with iput().
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                        struct qstr *name)
{
        struct dentry *found, *spliced;

        /*
         * First check if a dentry matching the name already exists,
         * if not go ahead and create it now.
         */
        found = d_hash_and_lookup(dentry->d_parent, name);
        if (found) {
                iput(inode);
                return found;
        }

        if (!d_in_lookup(dentry)) {
                found = d_alloc(dentry->d_parent, name);
                if (!found) {
                        iput(inode);
                        return ERR_PTR(-ENOMEM);
                }
        } else {
                found = d_alloc_parallel(dentry->d_parent, name,
                                        dentry->d_wait);
                /* error, or a dentry that is no longer in lookup: hand it back as is */
                if (IS_ERR(found) || !d_in_lookup(found)) {
                        iput(inode);
                        return found;
                }
        }

        spliced = d_splice_alias(inode, found);
        if (!spliced)
                return found;

        /* d_splice_alias() supplied a different dentry; ours is not needed */
        dput(found);
        return spliced;
}
EXPORT_SYMBOL(d_add_ci);


/*
 * Does @dentry carry the name @name as seen from @parent?
 *
 * Uses the parent's ->d_compare() when DCACHE_OP_COMPARE is set on
 * @parent; otherwise a length check followed by dentry_cmp().
 */
static inline bool d_same_name(const struct dentry *dentry,
                                const struct dentry *parent,
                                const struct qstr *name)
{
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return parent->d_op->d_compare(dentry,
                                               dentry->d_name.len,
                                               dentry->d_name.name,
                                               name) == 0;

        if (dentry->d_name.len != name->len)
                return false;

        return dentry_cmp(dentry, name->name, name->len) == 0;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against the value
 * returned in @seqp here.
 *
 * A refcount may be taken on the found dentry with the d_rcu_to_refcount
 * function.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
 */
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name,
                                unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        const unsigned char *str = name->name;
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /*
         * The hash list is protected using RCU.
         *
         * Carefully use d_seq when comparing a candidate dentry, to avoid
         * races with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

seqretry:
                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 *
                 * The caller must perform a seqcount check in order
                 * to do anything useful with the returned dentry.
                 *
                 * NOTE! We do a "raw" seqcount_begin here. That means that
                 * we don't wait for the sequence count to stabilize if it
                 * is in the middle of a sequence change. If we do the slow
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
                 *
                 * Note that raw_seqcount_begin still *does* smp_rmb(), so
                 * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;

                if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
                        /* slow path: the filesystem supplies its own comparison */
                        int tlen;
                        const char *tname;
                        if (dentry->d_name.hash != hashlen_hash(hashlen))
                                continue;
                        tlen = dentry->d_name.len;
                        tname = dentry->d_name.name;
                        /* we want a consistent (name,len) pair */
                        if (read_seqcount_retry(&dentry->d_seq, seq)) {
                                cpu_relax();
                                goto seqretry;
                        }
                        if (parent->d_op->d_compare(dentry,
                                                    tlen, tname, name) != 0)
                                continue;
                } else {
                        /* fast path: hash and length are compared in one go */
                        if (dentry->d_name.hash_len != hashlen)
                                continue;
                        if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                                continue;
                }
                /* hand the sampled sequence number back for the caller to validate */
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
        struct dentry *hit;
        unsigned seq;

        for (;;) {
                seq = read_seqbegin(&rename_lock);
                hit = __d_lookup(parent, name);
                /*
                 * A hit is final.  A miss is only trusted when no rename
                 * ran meanwhile; otherwise it may be a false negative and
                 * the lookup is repeated.
                 */
                if (hit || !read_seqretry(&rename_lock, seq))
                        return hit;
        }
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *bucket = d_hash(hash);
        struct hlist_bl_node *pos;
        struct dentry *match = NULL;
        struct dentry *cand;

        /*
         * Note: There is significant duplication with __d_lookup_rcu which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /*
         * The hash list is protected using RCU.
         *
         * Each candidate is compared with its d_lock held, to avoid races
         * with d_move().
         *
         * Concurrent renames can disturb the list walk and make us miss
         * our dentry, giving a false-negative result.  d_lookup() guards
         * against that with the rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        rcu_read_lock();

        hlist_bl_for_each_entry_rcu(cand, pos, bucket, d_hash) {
                /* cheap unlocked pre-filter on the hash */
                if (cand->d_name.hash != hash)
                        continue;

                spin_lock(&cand->d_lock);
                if (cand->d_parent == parent && !d_unhashed(cand) &&
                    d_same_name(cand, parent, name)) {
                        /* take the reference while still holding d_lock */
                        cand->d_lockref.count++;
                        match = cand;
                }
                spin_unlock(&cand->d_lock);

                if (match)
                        break;
        }

        rcu_read_unlock();

        return match;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
        int err;

        /*
         * The standard hash is always computed first: a filesystem
         * ->d_hash() routine may choose to leave the hash value unchanged.
         */
        name->hash = full_name_hash(dir, name->name, name->len);

        /* no fs-specific hash function: look up with the standard hash */
        if (!(dir->d_flags & DCACHE_OP_HASH))
                return d_lookup(dir, name);

        err = dir->d_op->d_hash(dir, name);
        if (unlikely(err < 0))
                return ERR_PTR(err);

        return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
 *
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */
 
/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * If the reference count is exactly one, detach the inode so the dentry
 * turns negative; otherwise remove the dentry from the hash queues so it
 * can be deleted later.
 */
void d_delete(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Lock order: inode->i_lock first, then dentry->d_lock. */
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);

        if (dentry->d_lockref.count != 1) {
                /* Somebody else still uses it: only unhash. */
                __d_drop(dentry);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
                return;
        }

        /*
         * We are the only user.
         * NOTE(review): nothing is unlocked on this path, so
         * dentry_unlink_inode() is presumably responsible for releasing
         * both locks - confirm against its definition.
         */
        dentry->d_flags &= ~DCACHE_CANT_MOUNT;
        dentry_unlink_inode(dentry);
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the hash chain selected by its
 * d_name.hash, with hlist_bl_lock() held on that chain for the
 * duration of the insertion.
 */
static void __d_rehash(struct dentry *entry)
{
        struct hlist_bl_head *chain = d_hash(entry->d_name.hash);

        hlist_bl_lock(chain);
        hlist_bl_add_head_rcu(&entry->d_hash, chain);
        hlist_bl_unlock(chain);
}

/**
 * d_rehash - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Locked wrapper around __d_rehash(): takes entry->d_lock, hashes the
 * dentry according to its name and drops the lock again.
 */
void d_rehash(struct dentry *entry)
{
        spin_lock(&entry->d_lock);
        __d_rehash(entry);
        spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Claim dir->i_dir_seq for an insertion.
 *
 * Spins until the counter is observed even and has been moved to the
 * following (odd) value with cmpxchg().  Returns the even value that was
 * replaced; callers pass it on to end_dir_add().
 */
static inline unsigned start_dir_add(struct inode *dir)
{
        for (;;) {
                unsigned seq = dir->i_dir_seq;

                if ((seq & 1) == 0 &&
                    cmpxchg(&dir->i_dir_seq, seq, seq + 1) == seq)
                        return seq;
                cpu_relax();
        }
}

/*
 * Counterpart of start_dir_add(): store @n + 2 into dir->i_dir_seq with
 * release semantics.  Callers in this file pass the value that
 * start_dir_add() returned as @n.
 */
static inline void end_dir_add(struct inode *dir, unsigned n)
{
        smp_store_release(&dir->i_dir_seq, n + 2);
}

/*
 * Sleep until @dentry is no longer in-lookup.
 *
 * Must be entered with dentry->d_lock held: the lock is dropped around
 * each schedule() and retaken afterwards, so it is held again on return.
 * Returns immediately if the dentry is not in-lookup on entry.
 */
static void d_wait_lookup(struct dentry *dentry)
{
        if (d_in_lookup(dentry)) {
                DECLARE_WAITQUEUE(wait, current);
                add_wait_queue(dentry->d_wait, &wait);
                do {
                        /* set the sleeping state before the lock is released */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        spin_unlock(&dentry->d_lock);
                        schedule();
                        spin_lock(&dentry->d_lock);
                } while (d_in_lookup(dentry));
        }
}

/*
 * d_alloc_parallel - find a dentry for @name in @parent or start a lookup
 * @parent: directory dentry to search
 * @name:   name being looked up; name->hash selects the hash chains
 * @wq:     wait queue head stored in ->d_wait of the new dentry when this
 *          caller ends up owning the lookup
 *
 * A candidate dentry is allocated up front.  The function then returns
 * one of:
 *  - ERR_PTR(-ENOMEM) if that allocation failed;
 *  - a match found by __d_lookup_rcu(), with a reference taken (the
 *    candidate is dropped);
 *  - a match found in the in-lookup hash chain, after waiting for it to
 *    leave the in-lookup state and verifying it is still a hashed match
 *    (the candidate is dropped);
 *  - the candidate itself, flagged DCACHE_PAR_LOOKUP and added to the
 *    in-lookup hash chain.
 * Whenever a consistency check fails the whole search restarts at retry.
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
        struct dentry *dentry;
        unsigned seq, r_seq, d_seq;

        if (unlikely(!new))
                return ERR_PTR(-ENOMEM);

retry:
        rcu_read_lock();
        /* sample the directory and rename sequence counts before searching */
        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
                /* found in the dcache proper: pin it, then validate d_seq */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }
                if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
                        rcu_read_unlock();
                        dput(dentry);
                        goto retry;
                }
                rcu_read_unlock();
                dput(new);
                return dentry;
        }
        /* a rename ran while we searched: the miss cannot be trusted */
        if (unlikely(read_seqretry(&rename_lock, r_seq))) {
                rcu_read_unlock();
                goto retry;
        }

        /* odd i_dir_seq: start_dir_add() has claimed it, try again */
        if (unlikely(seq & 1)) {
                rcu_read_unlock();
                goto retry;
        }

        hlist_bl_lock(b);
        /* recheck under the chain lock that i_dir_seq did not move */
        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
        }
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
         * any potential in-lookup matches are going to stay here until
         * we unlock the chain.  All fields are stable in everything
         * we encounter.
         */
        hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
                if (dentry->d_name.hash != hash)
                        continue;
                if (dentry->d_parent != parent)
                        continue;
                if (!d_same_name(dentry, parent, name))
                        continue;
                hlist_bl_unlock(b);
                /* now we can try to grab a reference */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }

                rcu_read_unlock();
                /*
                 * somebody is likely to be still doing lookup for it;
                 * wait for them to finish
                 */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
                 * it's not in-lookup anymore; in principle we should repeat
                 * everything from dcache lookup, but it's likely to be what
                 * d_lookup() would've found anyway.  If it is, just return it;
                 * otherwise we really have to repeat the whole thing.
                 */
                if (unlikely(dentry->d_name.hash != hash))
                        goto mismatch;
                if (unlikely(dentry->d_parent != parent))
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
                if (unlikely(!d_same_name(dentry, parent, name)))
                        goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
        rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
        hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
        hlist_bl_unlock(b);
        return new;
mismatch:
        /* the dentry we waited for is no longer a match: drop it and redo */
        spin_unlock(&dentry->d_lock);
        dput(dentry);
        goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * Take @dentry out of the in-lookup state.
 *
 * Under the lock of its in-lookup hash chain: clear DCACHE_PAR_LOOKUP,
 * unlink the dentry from the chain, wake everything sleeping on ->d_wait
 * and reset ->d_wait to NULL.  After the chain is unlocked, d_u.d_alias
 * and d_lru are initialised.
 */
void __d_lookup_done(struct dentry *dentry)
{
        struct hlist_bl_head *chain;

        chain = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);

        hlist_bl_lock(chain);
        dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
        __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
        wake_up_all(dentry->d_wait);
        dentry->d_wait = NULL;
        hlist_bl_unlock(chain);

        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_lru);
}
EXPORT_SYMBOL(__d_lookup_done);

/*
 * Hash @dentry and, when @inode is non-NULL, attach the inode to it.
 *
 * inode->i_lock held if inode is non-NULL; it is released here before
 * returning.  dentry->d_lock is taken and released internally.  If the
 * dentry is still in-lookup, the in-lookup state is ended here, bracketed
 * by start_dir_add()/end_dir_add() on the parent directory's inode.
 */
static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
        struct inode *dir = NULL;
        unsigned n;     /* only set, and only read, when dir is non-NULL */
        spin_lock(&dentry->d_lock);
        if (unlikely(d_in_lookup(dentry))) {
                dir = dentry->d_parent->d_inode;
                n = start_dir_add(dir);
                __d_lookup_done(dentry);
        }
        if (inode) {
                unsigned add_flags = d_flags_for_inode(inode);
                /* link into the inode's alias list, then publish inode/type */
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
                fsnotify_update_flags(dentry);
        }
        __d_rehash(dentry);
        if (dir)
                end_dir_add(dir, n);
        spin_unlock(&dentry->d_lock);
        if (inode)
                spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry
 *
 * This adds the entry to the hash queues and initializes @inode.
 * The entry was actually filled in earlier during d_alloc().
 */

void d_add(struct dentry *entry, struct inode *inode)
{
        if (inode) {
                security_d_instantiate(entry, inode);
                spin_lock(&inode->i_lock);
        }
        __d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/**
 * d_exact_alias - find and hash an exact unhashed alias
 * @entry: dentry to add
 * @inode: The inode to go with this dentry
 *
 * Walks the aliases of @inode looking for one with the same hash, parent
 * and name as @entry.  The first such alias decides the result: if it is
 * unhashed, a reference is taken, it is hashed again and returned; if it
 * is already hashed, NULL is returned.  NULL is also returned when no
 * alias matches.
 *
 * Parent directory should be locked.
 */
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
        struct dentry *alias;
        struct dentry *found = NULL;
        unsigned int hash = entry->d_name.hash;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                /*
                 * Don't need alias->d_lock here, because aliases with
                 * d_parent == entry->d_parent are not subject to name or
                 * parent changes, because the parent inode i_mutex is held.
                 */
                if (alias->d_name.hash != hash ||
                    alias->d_parent != entry->d_parent ||
                    !d_same_name(alias, entry->d_parent, &entry->d_name))
                        continue;

                spin_lock(&alias->d_lock);
                if (d_unhashed(alias)) {
                        __dget_dlock(alias);
                        __d_rehash(alias);
                        found = alias;
                }
                spin_unlock(&alias->d_lock);
                break;
        }
        spin_unlock(&inode->i_lock);

        return found;
}
EXPORT_SYMBOL(d_exact_alias);

static void swap_names(struct dentry *dentry, struct dentry *target)
{
        if (unlikely(dname_external(target))) {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * Both external: swap the pointers
                         */
                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
                         * storage and make target internal.
                         */
                        memcpy(target->d_iname, dentry->d_name.name,
                                        dentry->d_name.len + 1);
                        dentry->d_name.name = target->d_name.name;
                        target->d_name.name = target->d_iname;
                }
        } else {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * dentry:external, target:internal.  Give dentry's
                         * storage to target and make dentry internal
                         */
                        memcpy(dentry->d_iname, target->d_name.name,
                                        target->d_name.len + 1);
                        target->d_name.name = dentry->d_name.name;
                        dentry->d_name.name = dentry->d_iname;
                } else {
                        /*
                         * Both are internal.
                         */
                        unsigned int i;
                        BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
                        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                                swap(((long *) &dentry->d_iname)[i],
                                     ((long *) &target->d_iname)[i]);
                        }
                }
        }
        swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Make @dentry carry the same name as @target.
 *
 * An external target name is shared (its count is incremented); an inline
 * one is copied into dentry->d_iname.  If @dentry had an external name
 * before, its count is dropped afterwards and the name is freed via
 * kfree_rcu() when that was the last reference.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
        struct external_name *stale = NULL;

        /* remember the old external name before d_name is overwritten */
        if (unlikely(dname_external(dentry)))
                stale = external_name(dentry);

        if (likely(!dname_external(target))) {
                memcpy(dentry->d_iname, target->d_name.name,
                                target->d_name.len + 1);
                dentry->d_name.name = dentry->d_iname;
                dentry->d_name.hash_len = target->d_name.hash_len;
        } else {
                atomic_inc(&external_name(target)->u.count);
                dentry->d_name = target->d_name;
        }

        if (!stale)
                return;
        if (likely(atomic_dec_and_test(&stale->u.count)))
                kfree_rcu(stale, u.head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
 *
 * Without @exchange, @dentry takes over @target's parent and name and
 * @target is left unhashed.  With @exchange, the two swap parents and
 * names and both are hashed again.  Warns and returns without doing
 * anything if @dentry == @target; BUGs if @target is an ancestor of
 * @dentry.
 */
static void __d_move(struct dentry *dentry, struct dentry *target,
                     bool exchange)
{
        struct dentry *old_parent, *p;
        struct inode *dir = NULL;
        unsigned n;     /* only set, and only read, when dir is non-NULL */

        WARN_ON(!dentry->d_inode);
        if (WARN_ON(dentry == target))
                return;

        BUG_ON(d_ancestor(target, dentry));
        old_parent = dentry->d_parent;
        p = d_ancestor(old_parent, target);
        /* take the parents' d_lock; the order depends on how they relate */
        if (IS_ROOT(dentry)) {
                BUG_ON(p);
                spin_lock(&target->d_parent->d_lock);
        } else if (!p) {
                /* target is not a descendent of dentry->d_parent */
                spin_lock(&target->d_parent->d_lock);
                spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
        } else {
                BUG_ON(p == dentry);
                spin_lock(&old_parent->d_lock);
                /* p == target: both share old_parent, already locked */
                if (p != target)
                        spin_lock_nested(&target->d_parent->d_lock,
                                        DENTRY_D_LOCK_NESTED);
        }
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);

        /* target still in-lookup: end that state, see __d_add() */
        if (unlikely(d_in_lookup(target))) {
                dir = target->d_parent->d_inode;
                n = start_dir_add(dir);
                __d_lookup_done(target);
        }

        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                copy_name(dentry, target);
                /* leave target marked as unhashed */
                target->d_hash.pprev = NULL;
                /* new parent gains a reference, old parent loses one */
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        } else {
                target->d_parent = old_parent;
                swap_names(dentry, target);
                list_move(&target->d_child, &target->d_parent->d_subdirs);
                __d_rehash(target);
                fsnotify_update_flags(target);
        }
        list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
        __d_rehash(dentry);
        fsnotify_update_flags(dentry);
        fscrypt_handle_d_move(dentry);

        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);

        if (dir)
                end_dir_add(dir, n);

        /* unlock the parents (each at most once), then the two dentries */
        if (dentry->d_parent != old_parent)
                spin_unlock(&dentry->d_parent->d_lock);
        if (dentry != old_parent)
                spin_unlock(&old_parent->d_lock);
        spin_unlock(&target->d_lock);
        spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move.
 *
 * Takes rename_lock for writing around a non-exchanging __d_move().
 */
void d_move(struct dentry *dentry, struct dentry *target)
{
        write_seqlock(&rename_lock);
        __d_move(dentry, target, false);
        write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 *
 * Takes rename_lock for writing and calls __d_move() in exchange mode.
 * Warns (but still proceeds) if either dentry has no inode or is
 * IS_ROOT().
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
        write_seqlock(&rename_lock);

        WARN_ON(!dentry1->d_inode);
        WARN_ON(!dentry2->d_inode);
        WARN_ON(IS_ROOT(dentry1));
        WARN_ON(IS_ROOT(dentry2));

        __d_move(dentry1, dentry2, true);

        write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: ancestor dentry
 * @p2: child dentry
 *
 * Climbs the ->d_parent chain from @p2 until an IS_ROOT() dentry is
 * reached.  Returns the dentry on that chain whose parent is @p1 (i.e.
 * the ancestor of @p2 which is a child of @p1), or NULL if @p1 is not
 * found on the way up.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
        struct dentry *cur = p2;

        while (!IS_ROOT(cur)) {
                if (cur->d_parent == p1)
                        return cur;
                cur = cur->d_parent;
        }

        return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 *
 * Returns 0 after moving @alias into @dentry's place, or -ESTALE when
 * one of the extra locks could not be taken without blocking.
 */
static int __d_unalias(struct inode *inode,
                struct dentry *dentry, struct dentry *alias)
{
        struct mutex *rename_mutex = NULL;
        struct rw_semaphore *parent_rwsem = NULL;
        int ret = -ESTALE;

        /* Extra locks are only needed when the two parents differ */
        if (alias->d_parent != dentry->d_parent) {
                /* See lock_rename() */
                if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                        goto out_err;
                rename_mutex = &dentry->d_sb->s_vfs_rename_mutex;

                if (!inode_trylock_shared(alias->d_parent->d_inode))
                        goto out_err;
                /* remember the rwsem now: __d_move() changes alias->d_parent */
                parent_rwsem = &alias->d_parent->d_inode->i_rwsem;
        }

        __d_move(alias, dentry, false);
        ret = 0;

out_err:
        if (parent_rwsem)
                up_read(parent_rwsem);
        if (rename_mutex)
                mutex_unlock(rename_mutex);
        return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
 *
 * An IS_ERR() @inode is passed straight back via ERR_CAST().  When an
 * existing directory alias is found, the result may also be
 * ERR_PTR(-ELOOP) (the alias is an ancestor of @dentry) or the error
 * returned by __d_unalias(); on all of those alias paths iput() is called
 * on @inode.
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        BUG_ON(!d_unhashed(dentry));

        /* NULL inode: just hash the dentry via __d_add() */
        if (!inode)
                goto out;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
                if (unlikely(new)) {
                        /* The reference to new ensures it remains an alias */
                        spin_unlock(&inode->i_lock);
                        write_seqlock(&rename_lock);
                        if (unlikely(d_ancestor(new, dentry))) {
                                /* alias is an ancestor of dentry: refuse */
                                write_sequnlock(&rename_lock);
                                dput(new);
                                new = ERR_PTR(-ELOOP);
                                pr_warn_ratelimited(
                                        "VFS: Lookup of '%s' in %s %s"
                                        " would have caused loop\n",
                                        dentry->d_name.name,
                                        inode->i_sb->s_type->name,
                                        inode->i_sb->s_id);
                        } else if (!IS_ROOT(new)) {
                                /* pin the old parent across the move */
                                struct dentry *old_parent = dget(new->d_parent);
                                int err = __d_unalias(inode, dentry, new);
                                write_sequnlock(&rename_lock);
                                if (err) {
                                        dput(new);
                                        new = ERR_PTR(err);
                                }
                                dput(old_parent);
                        } else {
                                /* disconnected (IS_ROOT) alias: move it in */
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
                        iput(inode);
                        return new;
                }
        }
out:
        /* i_lock (taken above when inode is non-NULL) is dropped by __d_add() */
        __d_add(dentry, inode);
        return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is a subdirectory of the parent (at any depth).
 * Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */
  
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
        bool subdir;
        unsigned seq;

        if (new_dentry == old_dentry)
                return true;

        /* Access d_parent under rcu as d_move() may change it. */
        rcu_read_lock();
        seq = read_seqbegin(&rename_lock);
        subdir = d_ancestor(old_dentry, new_dentry);
         /* Try lockless once... */
        if (read_seqretry(&rename_lock, seq)) {
                /* ...else acquire lock for progress even on deep chains. */
                read_seqlock_excl(&rename_lock);
                subdir = d_ancestor(old_dentry, new_dentry);
                read_sequnlock_excl(&rename_lock);
        }
        rcu_read_unlock();
        return subdir;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().
 *
 * @data is the root of the walk, which is never modified.  Any other
 * dentry that is unhashed or has no inode is skipped; the rest get
 * DCACHE_GENOCIDE set and their reference count decremented, at most
 * once per dentry.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
        struct dentry *root = data;

        if (dentry == root)
                return D_WALK_CONTINUE;

        if (d_unhashed(dentry) || !dentry->d_inode)
                return D_WALK_SKIP;

        /* the flag makes sure the count is only dropped once */
        if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
                dentry->d_flags |= DCACHE_GENOCIDE;
                dentry->d_lockref.count--;
        }

        return D_WALK_CONTINUE;
}

/*
 * Walk the tree under @parent with d_walk(), applying d_genocide_kill()
 * to each dentry visited.  @parent is also passed as the callback data,
 * which is how the callback recognises the root and leaves it alone.
 */
void d_genocide(struct dentry *parent)
{
        d_walk(parent, parent, d_genocide_kill);
}

EXPORT_SYMBOL(d_genocide);

/*
 * Set up @dentry for a temporary file backed by @inode.
 *
 * Decrements the inode's link count, renames the dentry in place to
 * "#<inode number>" (written into the inline name buffer) and then
 * instantiates it with @inode.
 */
void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
        inode_dec_link_count(inode);
        /*
         * The name must live in d_iname, d_u.d_alias must be unhashed and
         * d_unlinked() must hold for the dentry; anything else is a bug.
         */
        BUG_ON(dentry->d_name.name != dentry->d_iname ||
                !hlist_unhashed(&dentry->d_u.d_alias) ||
                !d_unlinked(dentry));
        /* parent's d_lock first, then the dentry's own (nested) */
        spin_lock(&dentry->d_parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        /* sprintf() returns the number of characters written: the new length */
        dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
                                (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);
        d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Value of the "dhash_entries=" boot parameter; handed to
 * alloc_large_system_hash() when the dentry hash table is allocated.
 */
static __initdata unsigned long dhash_entries;

/*
 * Parse "dhash_entries=".  Base 0 is passed to simple_strtoul().
 * Returns 0 when no string was supplied, 1 otherwise.
 */
static int __init set_dhash_entries(char *str)
{
        if (str == NULL)
                return 0;

        dhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table (HASH_EARLY | HASH_ZERO).
 * Skipped when hashdist is set; dcache_init() performs the allocation
 * in that case.
 */
static void __init dcache_init_early(void)
{
        /* If hashes are distributed across NUMA nodes, defer
         * hash allocation until vmalloc space is available.
         */
        if (hashdist)
                return;

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY | HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        /*
         * NOTE(review): presumably converts the shift reported by the
         * allocator into a right-shift count for a 32-bit hash value -
         * confirm against d_hash().
         */
        d_hash_shift = 32 - d_hash_shift;
}

/*
 * Create the dentry slab cache and, when hashdist is set (so that
 * dcache_init_early() did nothing), allocate the dentry hash table.
 */
static void __init dcache_init(void)
{
        /*
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
         * of the dcache.
         */
        /*
         * NOTE(review): the d_iname argument presumably names the only
         * field exposed to usercopy - confirm against KMEM_CACHE_USERCOPY.
         */
        dentry_cache = KMEM_CACHE_USERCOPY(dentry,
                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
                d_iname);

        /* Hash may have been set up in dcache_init_early */
        if (!hashdist)
                return;

        /* same parameters as the early path, minus HASH_EARLY */
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        d_hash_shift = 32 - d_hash_shift;
}

/*
 * SLAB cache for __getname() consumers.  Created in vfs_caches_init() as
 * "names_cache" with PATH_MAX-sized objects.
 */
struct kmem_cache *names_cachep __read_mostly;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise every bucket of the in-lookup hash
 * table, then run the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
        unsigned int slot;

        for (slot = 0; slot < ARRAY_SIZE(in_lookup_hashtable); slot++)
                INIT_HLIST_BL_HEAD(&in_lookup_hashtable[slot]);

        dcache_init_early();
        inode_init_early();
}

/*
 * Main VFS cache setup: create the "names_cache" slab (PATH_MAX-sized
 * objects, SLAB_PANIC so failure is fatal) and then call the remaining
 * subsystem initialisers in the order listed below.
 */
void __init vfs_caches_init(void)
{
        names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

        dcache_init();
        inode_init();
        files_init();
        files_maxfiles_init();
        mnt_init();
        bdev_cache_init();
        chrdev_init();
}










































































































    2 























    2 







    2 

    2 












    2 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to generic timeout handling of requests.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/fault-inject.h>

#include "blk.h"
#include "blk-mq.h"

#ifdef CONFIG_FAIL_IO_TIMEOUT

/* Fault-injection attribute controlling faked I/O timeouts. */
static DECLARE_FAULT_ATTR(fail_io_timeout);

/*
 * Handler for the "fail_io_timeout=" boot parameter: passes the option
 * string to setup_fault_attr() and returns its result.
 */
static int __init setup_fail_io_timeout(char *str)
{
        return setup_fault_attr(&fail_io_timeout, str);
}
__setup("fail_io_timeout=", setup_fail_io_timeout);

/*
 * Ask the fail_io_timeout fault attribute whether a timeout should be
 * faked now.  @q is not used by this implementation.
 */
bool __blk_should_fake_timeout(struct request_queue *q)
{
        return should_fail(&fail_io_timeout, 1);
}
EXPORT_SYMBOL_GPL(__blk_should_fake_timeout);

/*
 * Late initcall: create the "fail_io_timeout" debugfs attribute (with a
 * NULL parent).  Returns 0 on success or the error encoded in the
 * returned pointer.
 */
static int __init fail_io_timeout_debugfs(void)
{
        struct dentry *attr_dir;

        attr_dir = fault_create_debugfs_attr("fail_io_timeout", NULL,
                                             &fail_io_timeout);
        return PTR_ERR_OR_ZERO(attr_dir);
}

late_initcall(fail_io_timeout_debugfs);

/*
 * Show handler: writes "1\n" to @buf if QUEUE_FLAG_FAIL_IO is set in the
 * disk's queue flags and "0\n" otherwise.  Returns the sprintf() result.
 */
ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);
        const int flag_set =
                test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags) ? 1 : 0;

        return sprintf(buf, "%d\n", flag_set);
}

/*
 * Store handler: parses @buf as a base-10 number and sets
 * QUEUE_FLAG_FAIL_IO on the disk's queue when the value is non-zero,
 * clears it otherwise.  An empty write (@count == 0) changes nothing.
 * Always returns @count.
 */
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (count) {
                struct request_queue *q = disk->queue;
                char *cursor = (char *) buf;
                int enable = simple_strtoul(cursor, &cursor, 10);

                if (enable)
                        blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
                else
                        blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
        }

        return count;
}

#endif /* CONFIG_FAIL_IO_TIMEOUT */

/**
 * blk_abort_request - Request recovery for the specified command
 * @req:        pointer to the request of interest
 *
 * This function requests that the block layer start recovery for the
 * request by making its deadline expire now and scheduling the queue's
 * timeout work.
 * LLDDs who implement their own error recovery MAY ignore the timeout
 * event if they generated blk_abort_request.
 */
void blk_abort_request(struct request *req)
{
        /*
         * All we need to ensure is that timeout scan takes place
         * immediately and that scan sees the new timeout value.
         * No need for fancy synchronizations.
         */
        WRITE_ONCE(req->deadline, jiffies);
        kblockd_schedule_work(&req->q->timeout_work);
}
EXPORT_SYMBOL_GPL(blk_abort_request);

/* Offset used by blk_round_jiffies(); set once by blk_timeout_init(). */
static unsigned long blk_timeout_mask __read_mostly;

/*
 * Late initcall: set blk_timeout_mask to roundup_pow_of_two(HZ) - 1.
 * Always returns 0.
 */
static int __init blk_timeout_init(void)
{
        blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
        return 0;
}

late_initcall(blk_timeout_init);

/*
 * Just a rough estimate, we don't care about specific values for timeouts.
 *
 * Returns @j pushed forward by blk_timeout_mask + 1 jiffies.  No low bits
 * are cleared, so the result is not aligned to any boundary.
 */
static inline unsigned long blk_round_jiffies(unsigned long j)
{
        return (j + blk_timeout_mask) + 1;
}

/*
 * Clamp an absolute expiry time: returns @timeout unless it lies after
 * blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT), in which case that upper
 * bound is returned instead.
 */
unsigned long blk_rq_timeout(unsigned long timeout)
{
        unsigned long latest = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);

        return time_after(timeout, latest) ? latest : timeout;
}

/**
 * blk_add_timer - Start timeout timer for a single request
 * @req:        request that is about to start running.
 *
 * Records the request's deadline (jiffies + req->timeout, falling back to
 * the queue's rq_timeout when the request has none) and clears
 * RQF_TIMED_OUT.  The queue's timeout timer, q->timeout, is then
 * (re)armed if it is not pending, or if the new expiry is at least
 * HZ / 2 jiffies earlier than the one currently programmed.
 */
void blk_add_timer(struct request *req)
{
        struct request_queue *q = req->q;
        unsigned long expiry;

        /*
         * Some LLDs, like scsi, peek at the timeout to prevent a
         * command from being retried forever.
         */
        if (!req->timeout)
                req->timeout = q->rq_timeout;

        req->rq_flags &= ~RQF_TIMED_OUT;

        expiry = jiffies + req->timeout;
        WRITE_ONCE(req->deadline, expiry);

        /*
         * If the timer isn't already pending or this timeout is earlier
         * than an existing one, modify the timer.  The expiry is pushed
         * out by blk_round_jiffies() and capped by blk_rq_timeout().
         */
        expiry = blk_rq_timeout(blk_round_jiffies(expiry));

        if (!timer_pending(&q->timeout) ||
            time_before(expiry, q->timeout.expires)) {
                /* how much earlier we are; only consulted if the timer is pending */
                unsigned long diff = q->timeout.expires - expiry;

                /*
                 * Due to added timer slack to group timers, the timer
                 * will often be a little in front of what we asked for.
                 * So apply some tolerance here too, otherwise we keep
                 * modifying the timer because expires for value X
                 * will be X + something.
                 */
                if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
                        mod_timer(&q->timeout, expiry);
        }

}


















    3 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLY__

/*
 * Tell whether a page table value needs its bits inverted.
 *
 * A clear (all-zero) value is special and never gets inverted. Any
 * other value needs inversion exactly when _PAGE_PRESENT is not set.
 *
 * Note that even users that only pass a pgprot_t (rather than a full
 * pte) won't trigger the special zero case, because even PAGE_NONE has
 * _PAGE_PROTNONE | _PAGE_ACCESSED set. So the all zero case really is
 * limited to just the cleared page table entry case.
 */
static inline bool __pte_needs_invert(u64 val)
{
        if (!val)
                return false;

        return !(val & _PAGE_PRESENT);
}

/*
 * Get a mask to xor with the page table entry to get the correct pfn:
 * all ones when the value needs inversion, zero otherwise.
 */
static inline u64 protnone_mask(u64 val)
{
        if (__pte_needs_invert(val))
                return ~0ull;

        return 0;
}

/*
 * Adjust @val for a change in inversion state between @oldval and @val.
 *
 * When a PTE transitions from NONE to !NONE or vice-versa the bits
 * selected by @mask (the PFN part) are inverted to stop speculation;
 * pte_pfn undoes this when needed. If both values agree on whether they
 * need inversion, @val is returned untouched.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
        bool old_inverted = __pte_needs_invert(oldval);
        bool new_inverted = __pte_needs_invert(val);
        u64 kept, flipped;

        if (old_inverted == new_inverted)
                return val;

        kept = val & ~mask;        /* bits outside the mask stay as they are */
        flipped = ~val & mask;     /* bits inside the mask are complemented */
        return kept | flipped;
}

#endif /* __ASSEMBLY__ */

#endif


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_REQUEST_H
#define _SCSI_SCSI_REQUEST_H

#include <linux/blk-mq.h>

#define BLK_MAX_CDB        16

/*
 * Per-request SCSI data. scsi_req() obtains it from a struct request via
 * blk_mq_rq_to_pdu().
 */
struct scsi_request {
        /* Inline command buffer, BLK_MAX_CDB bytes. */
        unsigned char        __cmd[BLK_MAX_CDB];
        /*
         * Command bytes in use. Either points at __cmd or at separately
         * allocated memory; scsi_req_free_cmd() kfree()s it in the latter
         * case.
         */
        unsigned char        *cmd;
        /* presumably the length of the command at ->cmd — confirm with users */
        unsigned short        cmd_len;
        int                result;
        unsigned int        sense_len;
        unsigned int        resid_len;        /* residual count */
        int                retries;
        /* presumably the sense data buffer that sense_len describes — confirm */
        void                *sense;
};

/* Return the scsi_request stored in the blk-mq per-request data of @rq. */
static inline struct scsi_request *scsi_req(struct request *rq)
{
        struct scsi_request *sreq = blk_mq_rq_to_pdu(rq);

        return sreq;
}

/*
 * Release the command buffer of @req if it was allocated separately.
 * When ->cmd still points at the inline ->__cmd array nothing is freed.
 */
static inline void scsi_req_free_cmd(struct scsi_request *req)
{
        if (req->cmd == req->__cmd)
                return;

        kfree(req->cmd);
}

void scsi_req_init(struct scsi_request *req);

#endif /* _SCSI_SCSI_REQUEST_H */



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_RTNH_H
#define __NET_RTNH_H

#include <linux/rtnetlink.h>
#include <net/netlink.h>

/*
 * Check that a complete nexthop entry is available at @rtnh.
 *
 * Returns non-zero when @remaining covers at least the fixed header, and
 * the entry's own rtnh_len is at least the header size and no larger than
 * @remaining. The header is only read once @remaining is known to cover it.
 */
static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
{
        if (remaining < (int)sizeof(*rtnh))
                return 0;

        if (rtnh->rtnh_len < sizeof(*rtnh))
                return 0;

        return rtnh->rtnh_len <= remaining;
}

/*
 * Step to the entry following @rtnh.
 *
 * The step is rtnh_len rounded up with NLA_ALIGN(); the same amount is
 * subtracted from *@remaining. No bounds check is done here.
 */
static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
                                         int *remaining)
{
        int step = NLA_ALIGN(rtnh->rtnh_len);
        char *next = (char *) rtnh + step;

        *remaining -= step;
        return (struct rtnexthop *) next;
}

/* Return the address just past the NLA_ALIGN()ed nexthop header, as a nlattr. */
static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
{
        char *base = (char *) rtnh;

        return (struct nlattr *) (base + NLA_ALIGN(sizeof(*rtnh)));
}

static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
{
        return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
}

#endif


































































    1 










    1 






































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * cn_proc.c - process events connector
 *
 * Copyright (C) Matt Helsley, IBM Corp. 2005
 * Based on cn_fork.c by Guillaume Thouvenin <guillaume.thouvenin@bull.net>
 * Original copyright notice follows:
 * Copyright (C) 2005 BULL SA.
 */

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/init.h>
#include <linux/connector.h>
#include <linux/gfp.h>
#include <linux/ptrace.h>
#include <linux/atomic.h>
#include <linux/pid_namespace.h>

#include <linux/cn_proc.h>
#include <linux/local_lock.h>

/*
 * Size of a cn_msg followed by a proc_event structure.  Since the
 * sizeof struct cn_msg is a multiple of 4 bytes, but not 8 bytes, we
 * add one 4-byte word to the size here, and then start the actual
 * cn_msg structure 4 bytes into the stack buffer.  The result is that
 * the immediately following proc_event structure is aligned to 8 bytes.
 */
#define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event) + 4)

/*
 * Return the cn_msg that starts 4 bytes into @buffer (see the comment on
 * CN_PROC_MSG_SIZE). The build-time check verifies the assumption that
 * sizeof(struct cn_msg) is 20.
 */
static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer)
{
        __u8 *msg_start = buffer + 4;

        BUILD_BUG_ON(sizeof(struct cn_msg) != 20);
        return (struct cn_msg *)msg_start;
}

static atomic_t proc_event_num_listeners = ATOMIC_INIT(0);
static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };

/*
 * Per-CPU state used by send_msg().
 *
 * local_event.count is used as the sequence number of the netlink message;
 * send_msg() increments it while holding local_event.lock.
 */
struct local_event {
        local_lock_t lock;
        __u32 count;
};
/* One instance per CPU; only the lock needs an explicit initializer. */
static DEFINE_PER_CPU(struct local_event, local_event) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Stamp @msg with this CPU's next sequence number and CPU id, then send it
 * to the CN_IDX_PROC group.
 *
 * Both msg->seq and the cpu field of the embedded proc_event are
 * overwritten here, whatever the caller stored in them. The return value
 * of cn_netlink_send() is ignored.
 */
static inline void send_msg(struct cn_msg *msg)
{
        local_lock(&local_event.lock);

        /* Post-increment semantics: use the counter value from before the bump. */
        msg->seq = __this_cpu_inc_return(local_event.count) - 1;
        ((struct proc_event *)msg->data)->cpu = smp_processor_id();

        /*
         * local_lock() disables preemption during send to ensure the messages
         * are ordered according to their sequence numbers.
         *
         * If cn_netlink_send() fails, the data is not sent.
         */
        cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);

        local_unlock(&local_event.lock);
}

/*
 * Emit a PROC_EVENT_FORK event for @task.
 *
 * Returns without doing anything when no listener is registered.
 * Otherwise the event is built in an on-stack buffer with the pid/tgid of
 * @task (the child) and of its real_parent (read under rcu_read_lock()),
 * and passed to send_msg().
 */
void proc_fork_connector(struct task_struct *task)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;
        struct task_struct *real_parent;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_FORK;
        ev->timestamp_ns = ktime_get_ns();

        rcu_read_lock();
        real_parent = rcu_dereference(task->real_parent);
        ev->event_data.fork.parent_tgid = real_parent->tgid;
        ev->event_data.fork.parent_pid = real_parent->pid;
        rcu_read_unlock();

        ev->event_data.fork.child_tgid = task->tgid;
        ev->event_data.fork.child_pid = task->pid;

        send_msg(msg);
}

/*
 * Emit a PROC_EVENT_EXEC event carrying the pid/tgid of @task.
 *
 * Returns without doing anything when no listener is registered.
 */
void proc_exec_connector(struct task_struct *task)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_EXEC;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.exec.process_tgid = task->tgid;
        ev->event_data.exec.process_pid = task->pid;

        send_msg(msg);
}

void proc_id_connector(struct task_struct *task, int which_id)
{
        struct cn_msg *msg;
        struct proc_event *ev;
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        const struct cred *cred;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        msg = buffer_to_cn_msg(buffer);
        ev = (struct proc_event *)msg->data;
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = which_id;
        ev->event_data.id.process_pid = task->pid;
        ev->event_data.id.process_tgid = task->tgid;
        rcu_read_lock();
        cred = __task_cred(task);
        if (which_id == PROC_EVENT_UID) {
                ev->event_data.id.r.ruid = from_kuid_munged(&init_user_ns, cred->uid);
                ev->event_data.id.e.euid = from_kuid_munged(&init_user_ns, cred->euid);
        } else if (which_id == PROC_EVENT_GID) {
                ev->event_data.id.r.rgid = from_kgid_munged(&init_user_ns, cred->gid);
                ev->event_data.id.e.egid = from_kgid_munged(&init_user_ns, cred->egid);
        } else {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        ev->timestamp_ns = ktime_get_ns();

        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->ack = 0; /* not used */
        msg->len = sizeof(*ev);
        msg->flags = 0; /* not used */
        send_msg(msg);
}

/*
 * Emit a PROC_EVENT_SID event carrying the pid/tgid of @task.
 *
 * Returns without doing anything when no listener is registered.
 */
void proc_sid_connector(struct task_struct *task)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_SID;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.sid.process_tgid = task->tgid;
        ev->event_data.sid.process_pid = task->pid;

        send_msg(msg);
}

/*
 * Emit a PROC_EVENT_PTRACE event for @task.
 *
 * For PTRACE_ATTACH the tracer is reported as the current task; for
 * PTRACE_DETACH the tracer pid/tgid are reported as 0. Any other
 * @ptrace_id sends nothing. Returns without doing anything when no
 * listener is registered.
 */
void proc_ptrace_connector(struct task_struct *task, int ptrace_id)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_PTRACE;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.ptrace.process_tgid = task->tgid;
        ev->event_data.ptrace.process_pid  = task->pid;

        switch (ptrace_id) {
        case PTRACE_ATTACH:
                ev->event_data.ptrace.tracer_tgid = current->tgid;
                ev->event_data.ptrace.tracer_pid  = current->pid;
                break;
        case PTRACE_DETACH:
                ev->event_data.ptrace.tracer_tgid = 0;
                ev->event_data.ptrace.tracer_pid  = 0;
                break;
        default:
                /* other ptrace requests are not reported */
                return;
        }

        send_msg(msg);
}

/*
 * Emit a PROC_EVENT_COMM event carrying the pid/tgid of @task and its
 * comm string as copied by get_task_comm().
 *
 * Returns without doing anything when no listener is registered.
 */
void proc_comm_connector(struct task_struct *task)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_COMM;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.comm.process_tgid = task->tgid;
        ev->event_data.comm.process_pid  = task->pid;
        get_task_comm(ev->event_data.comm.comm, task);

        send_msg(msg);
}

/*
 * Emit a PROC_EVENT_COREDUMP event for @task.
 *
 * The parent pid/tgid are filled in from task->real_parent only while
 * pid_alive(@task) holds (checked under rcu_read_lock()); otherwise they
 * stay zero from the initial memset. Returns without doing anything when
 * no listener is registered.
 */
void proc_coredump_connector(struct task_struct *task)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;
        struct task_struct *real_parent;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->len = sizeof(*ev);
        msg->ack = 0; /* not used */
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_COREDUMP;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.coredump.process_tgid = task->tgid;
        ev->event_data.coredump.process_pid = task->pid;

        rcu_read_lock();
        if (pid_alive(task)) {
                real_parent = rcu_dereference(task->real_parent);
                ev->event_data.coredump.parent_tgid = real_parent->tgid;
                ev->event_data.coredump.parent_pid = real_parent->pid;
        }
        rcu_read_unlock();

        send_msg(msg);
}

void proc_exit_connector(struct task_struct *task)
{
        struct cn_msg *msg;
        struct proc_event *ev;
        struct task_struct *parent;
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        msg = buffer_to_cn_msg(buffer);
        ev = (struct proc_event *)msg->data;
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->timestamp_ns = ktime_get_ns();
        ev->what = PROC_EVENT_EXIT;
        ev->event_data.exit.process_pid = task->pid;
        ev->event_data.exit.process_tgid = task->tgid;
        ev->event_data.exit.exit_code = task->exit_code;
        ev->event_data.exit.exit_signal = task->exit_signal;

        rcu_read_lock();
        if (pid_alive(task)) {
                parent = rcu_dereference(task->real_parent);
                ev->event_data.exit.parent_pid = parent->pid;
                ev->event_data.exit.parent_tgid = parent->tgid;
        }
        rcu_read_unlock();

        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->ack = 0; /* not used */
        msg->len = sizeof(*ev);
        msg->flags = 0; /* not used */
        send_msg(msg);
}

/*
 * Send an acknowledgement message to userspace
 *
 * @err is 0 for success, EFOO otherwise. Note: this is the negative of
 * conventional kernel error values because it's not being returned via
 * syscall return mechanisms.
 *
 * The reply is a PROC_EVENT_NONE event whose ack field is @rcvd_ack + 1.
 * Nothing is sent when no listener is registered.
 *
 * NOTE(review): the seq (@rcvd_seq) and cpu (-1) values stored here are
 * subsequently overwritten inside send_msg(), which assigns its own
 * per-CPU sequence number and the current CPU id.
 */
static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
{
        __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
        struct cn_msg *msg = buffer_to_cn_msg(buffer);
        struct proc_event *ev = (struct proc_event *)msg->data;

        if (atomic_read(&proc_event_num_listeners) < 1)
                return;

        /* connector header */
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->seq = rcvd_seq;
        msg->ack = rcvd_ack + 1;
        msg->len = sizeof(*ev);
        msg->flags = 0; /* not used */

        /* event payload */
        memset(&ev->event_data, 0, sizeof(ev->event_data));
        ev->what = PROC_EVENT_NONE;
        ev->cpu = -1;
        ev->timestamp_ns = ktime_get_ns();
        ev->event_data.ack.err = err;

        send_msg(msg);
}

/**
 * cn_proc_mcast_ctl
 * @data: message sent from userspace via the connector
 */
static void cn_proc_mcast_ctl(struct cn_msg *msg,
                              struct netlink_skb_parms *nsp)
{
        enum proc_cn_mcast_op *mc_op = NULL;
        int err = 0;

        if (msg->len != sizeof(*mc_op))
                return;

        /* 
         * Events are reported with respect to the initial pid
         * and user namespaces so ignore requestors from
         * other namespaces.
         */
        if ((current_user_ns() != &init_user_ns) ||
            (task_active_pid_ns(current) != &init_pid_ns))
                return;

        /* Can only change if privileged. */
        if (!__netlink_ns_capable(nsp, &init_user_ns, CAP_NET_ADMIN)) {
                err = EPERM;
                goto out;
        }

        mc_op = (enum proc_cn_mcast_op *)msg->data;
        switch (*mc_op) {
        case PROC_CN_MCAST_LISTEN:
                atomic_inc(&proc_event_num_listeners);
                break;
        case PROC_CN_MCAST_IGNORE:
                atomic_dec(&proc_event_num_listeners);
                break;
        default:
                err = EINVAL;
                break;
        }

out:
        cn_proc_ack(err, msg->seq, msg->ack);
}

/*
 * cn_proc_init - initialization entry point
 *
 * Registers cn_proc_mcast_ctl() as the connector callback for
 * cn_proc_event_id. Returns 0 on success; on failure a warning is logged
 * and the error from cn_add_callback() is returned.
 */
static int __init cn_proc_init(void)
{
        int err;

        err = cn_add_callback(&cn_proc_event_id, "cn_proc",
                              &cn_proc_mcast_ctl);
        if (!err)
                return 0;

        pr_warn("cn_proc failed to register\n");
        return err;
}
device_initcall(cn_proc_init);

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

/*
 * A rate specification together with a 256-entry lookup table.
 * NOTE(review): the meaning of the data[] entries and how tables are
 * shared is not visible in this header — confirm against the users.
 */
struct qdisc_rate_table {
        struct tc_ratespec rate;
        u32                data[256];
        /* presumably links tables into a singly linked list — confirm */
        struct qdisc_rate_table *next;
        /* plain (non-atomic) int counter; presumably a reference count */
        int                refcnt;
};

/*
 * Bit numbers for Qdisc::state (used with test_bit()/test_and_set_bit()).
 */
enum qdisc_state_t {
        __QDISC_STATE_SCHED,
        __QDISC_STATE_DEACTIVATED,
        /*
         * Set by qdisc_run_begin() when a TCQ_F_NOLOCK qdisc's seqlock
         * could not be taken; tested again in qdisc_run_end().
         */
        __QDISC_STATE_MISSED,
};

/*
 * Size table: a tc_sizespec header followed by a variable number of u16
 * entries. Struct Qdisc refers to one through its __rcu 'stab' pointer.
 */
struct qdisc_size_table {
        /* presumably used to free the table after an RCU grace period — confirm */
        struct rcu_head                rcu;
        struct list_head        list;
        struct tc_sizespec        szopts;
        /* plain (non-atomic) int counter; presumably a reference count */
        int                        refcnt;
        /* flexible array member; length is not recorded in this struct itself */
        u16                        data[];
};

/*
 * Minimal skb queue head: head and tail pointers, a length and a lock.
 * Similar to sk_buff_head, but skb->prev pointer is undefined.
 */
struct qdisc_skb_head {
        struct sk_buff        *head;
        struct sk_buff        *tail;
        /* number of queued skbs; read locklessly by qdisc_is_empty() */
        __u32                qlen;
        spinlock_t        lock;
};

/*
 * struct Qdisc - one queueing discipline instance.
 *
 * Holds the enqueue/dequeue entry points, the TCQ_F_* flag bits (defined
 * inline next to 'flags'), the ops table, identification (handle/parent),
 * statistics, and the run-state tracking used by qdisc_run_begin() and
 * qdisc_run_end().
 *
 * The first part holds the mostly-read fields; the frequently written
 * ones start at 'gso_skb' on their own cache line on SMP, and 'busylock'
 * starts another. 'privdata' is a cache-line-aligned flexible array for
 * the discipline's private data.
 *
 * Note that the TCQ_F_* values are not listed in numeric order:
 * TCQ_F_WARN_NONWC is (1 << 16) and sits between 0x10 and 0x20.
 */
struct Qdisc {
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int                flags;
#define TCQ_F_BUILTIN                1
#define TCQ_F_INGRESS                2
#define TCQ_F_CAN_BYPASS        4
#define TCQ_F_MQROOT                8
#define TCQ_F_ONETXQUEUE        0x10 /* dequeue_skb() can assume all skbs are for
                                      * q->dev_queue : It can test
                                      * netif_xmit_frozen_or_stopped() before
                                      * dequeueing next packet.
                                      * Its true for MQ/MQPRIO slaves, or non
                                      * multiqueue device.
                                      */
#define TCQ_F_WARN_NONWC        (1 << 16)
#define TCQ_F_CPUSTATS                0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT                0x40 /* root of its hierarchy :
                                      * qdisc_tree_decrease_qlen() should stop.
                                      */
#define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK                0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
        u32                        limit;
        const struct Qdisc_ops        *ops;
        struct qdisc_size_table        __rcu *stab;
        struct hlist_node       hash;
        u32                        handle;
        u32                        parent;

        struct netdev_queue        *dev_queue;

        struct net_rate_estimator __rcu *rate_est;
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        int                        pad;
        /* not touched by the refcount helpers when TCQ_F_BUILTIN is set */
        refcount_t                refcnt;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
         */
        struct sk_buff_head        gso_skb ____cacheline_aligned_in_smp;
        struct qdisc_skb_head        q;
        struct gnet_stats_basic_packed bstats;
        /* odd while a run is in progress (see qdisc_is_running()) */
        seqcount_t                running;
        struct gnet_stats_queue        qstats;
        /* bit field indexed by enum qdisc_state_t */
        unsigned long                state;
        struct Qdisc            *next_sched;
        struct sk_buff_head        skb_bad_txq;

        spinlock_t                busylock ____cacheline_aligned_in_smp;
        /* taken with spin_trylock() by qdisc_run_begin() for TCQ_F_NOLOCK qdiscs */
        spinlock_t                seqlock;

        /* for NOLOCK qdisc, true if there are no enqueued skbs */
        bool                        empty;
        struct rcu_head                rcu;

        /* private data */
        long privdata[] ____cacheline_aligned;
};

/*
 * Take a reference on @qdisc. Qdiscs flagged TCQ_F_BUILTIN are not
 * reference counted, so nothing is done for them.
 */
static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                refcount_inc(&qdisc->refcnt);
}

/* Intended to be used by unlocked users, when concurrent qdisc release is
 * possible.
 */

static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_BUILTIN)
                return qdisc;
        if (refcount_inc_not_zero(&qdisc->refcnt))
                return qdisc;
        return NULL;
}

/*
 * Report whether @qdisc is currently being run.
 *
 * For TCQ_F_NOLOCK qdiscs this is whether the seqlock spinlock is held;
 * otherwise it is whether the 'running' sequence count is odd.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
        bool lockless = qdisc->flags & TCQ_F_NOLOCK;

        if (!lockless)
                return raw_read_seqcount(&qdisc->running) & 1;

        return spin_is_locked(&qdisc->seqlock);
}

/* True when @q has TCQ_F_CPUSTATS set, i.e. runs using percpu statistics. */
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
        return (q->flags & TCQ_F_CPUSTATS) != 0;
}

/* Lockless emptiness check: per-cpu-stats qdiscs report their ->empty
 * flag, every other qdisc reports whether ->q.qlen is zero.  Both reads
 * go through READ_ONCE().
 */
static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
        return qdisc_is_percpu_stats(qdisc) ? READ_ONCE(qdisc->empty)
                                            : !READ_ONCE(qdisc->q.qlen);
}

/* Try to become the runner of @qdisc.
 *
 * TCQ_F_NOLOCK qdiscs: ownership is ->seqlock, taken with spin_trylock().
 * If the first attempt fails, __QDISC_STATE_MISSED is set and the lock is
 * tried exactly once more; on success ->empty is cleared.
 * Other qdiscs: the attempt fails if qdisc_is_running() is already true.
 *
 * On success the ->running seqcount write section is entered and true is
 * returned; qdisc_run_end() closes that section and drops ->seqlock.
 * Returns false when the caller did not become the runner.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                if (spin_trylock(&qdisc->seqlock))
                        goto nolock_empty;

                /* No need to insist if the MISSED flag was already set.
                 * Note that test_and_set_bit() also gives us memory ordering
                 * guarantees wrt potential earlier enqueue() and below
                 * spin_trylock(), both of which are necessary to prevent races
                 */
                if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
                        return false;

                /* Try to take the lock again to make sure that we will either
                 * grab it or the CPU that still has it will see MISSED set
                 * when testing it in qdisc_run_end()
                 */
                if (!spin_trylock(&qdisc->seqlock))
                        return false;

nolock_empty:
                /* We own ->seqlock from here on. */
                WRITE_ONCE(qdisc->empty, false);
        } else if (qdisc_is_running(qdisc)) {
                return false;
        }
        /* Variant of write_seqcount_begin() telling lockdep a trylock
         * was attempted.
         */
        raw_write_seqcount_begin(&qdisc->running);
        seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
        return true;
}

/* Counterpart of qdisc_run_begin(): leave the ->running write section and,
 * for TCQ_F_NOLOCK qdiscs, release ->seqlock.  If __QDISC_STATE_MISSED is
 * found set after the unlock, it is cleared and the qdisc is rescheduled
 * through __netif_schedule().
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        write_seqcount_end(&qdisc->running);
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* spin_unlock() only has store-release semantic. The unlock
                 * and test_bit() ordering is a store-load ordering, so a full
                 * memory barrier is needed here.
                 */
                smp_mb();

                /* Someone failed to take ->seqlock while we held it. */
                if (unlikely(test_bit(__QDISC_STATE_MISSED,
                                      &qdisc->state))) {
                        clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
                        __netif_schedule(qdisc);
                }
        }
}

/* True when @qdisc has TCQ_F_ONETXQUEUE set in its flags. */
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_ONETXQUEUE) != 0;
}

/* Return dql_avail() for @txq's dynamic queue limit state when CONFIG_BQL
 * is enabled; without CONFIG_BQL the result is always 0.
 */
static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
#ifdef CONFIG_BQL
        /* Non-BQL migrated drivers will return 0, too. */
        return dql_avail(&txq->dql);
#else
        return 0;
#endif
}

/* Table of class-level callbacks supplied by a qdisc implementation.
 * The callbacks are grouped by the section comments below; the
 * "unsigned long" arguments are opaque per-qdisc class handles
 * (presumably the value returned by ->find() -- confirm in the callers).
 */
struct Qdisc_class_ops {
        /* Bitmask of enum qdisc_class_ops_flags values */
        unsigned int                flags;
        /* Child qdisc manipulation */
        struct netdev_queue *        (*select_queue)(struct Qdisc *, struct tcmsg *);
        int                        (*graft)(struct Qdisc *, unsigned long cl,
                                        struct Qdisc *, struct Qdisc **,
                                        struct netlink_ext_ack *extack);
        struct Qdisc *                (*leaf)(struct Qdisc *, unsigned long cl);
        void                        (*qlen_notify)(struct Qdisc *, unsigned long);

        /* Class manipulation routines */
        unsigned long                (*find)(struct Qdisc *, u32 classid);
        int                        (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct Qdisc *, unsigned long);
        void                        (*walk)(struct Qdisc *, struct qdisc_walker * arg);

        /* Filter manipulation */
        struct tcf_block *        (*tcf_block)(struct Qdisc *sch,
                                             unsigned long arg,
                                             struct netlink_ext_ack *extack);
        unsigned long                (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                        (*unbind_tcf)(struct Qdisc *, unsigned long);

        /* rtnetlink specific */
        int                        (*dump)(struct Qdisc *, unsigned long,
                                        struct sk_buff *skb, struct tcmsg*);
        int                        (*dump_stats)(struct Qdisc *, unsigned long,
                                        struct gnet_dump *);
};

/* Flag values for Qdisc_class_ops::flags */

/* QDISC_CLASS_OPS_DOIT_UNLOCKED: the class ops implement an API that
 * doesn't require the rtnl lock.
 */
enum qdisc_class_ops_flags {
        QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
};

/* Description of one qdisc type: identification, size of its private
 * area and the callbacks implementing it.
 */
struct Qdisc_ops {
        /* NOTE(review): presumably links the registered ops into a list */
        struct Qdisc_ops        *next;
        /* Class-level callbacks (see struct Qdisc_class_ops) */
        const struct Qdisc_class_ops        *cl_ops;
        char                        id[IFNAMSIZ];
        int                        priv_size;
        unsigned int                static_flags;

        /* Packet path: enqueue receives a to_free list head for skbs that
         * must be released by the caller.
         */
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        /* Setup, teardown and reconfiguration */
        int                        (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                        (*reset)(struct Qdisc *);
        void                        (*destroy)(struct Qdisc *);
        int                        (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                        (*attach)(struct Qdisc *sch);
        int                        (*change_tx_queue_len)(struct Qdisc *, unsigned int);
        void                        (*change_real_num_tx)(struct Qdisc *sch,
                                                      unsigned int new_real_tx);

        /* Dumping of configuration and statistics */
        int                        (*dump)(struct Qdisc *, struct sk_buff *);
        int                        (*dump_stats)(struct Qdisc *, struct gnet_dump *);

        /* Setters/getters for the ingress and egress block index */
        void                        (*ingress_block_set)(struct Qdisc *sch,
                                                     u32 block_index);
        void                        (*egress_block_set)(struct Qdisc *sch,
                                                    u32 block_index);
        u32                        (*ingress_block_get)(struct Qdisc *sch);
        u32                        (*egress_block_get)(struct Qdisc *sch);

        struct module                *owner;
};


/* Output of a classifier's ->classify() callback.  The three members of
 * the anonymous union share storage, so only one view is meaningful at a
 * time.
 */
struct tcf_result {
        union {
                /* class handle plus its classid */
                struct {
                        unsigned long        class;
                        u32                classid;
                };
                /* NOTE(review): presumably the filter to continue with on
                 * a "goto chain" -- confirm in the users
                 */
                const struct tcf_proto *goto_tp;

                /* used in the skb_tc_reinsert function */
                struct {
                        bool                ingress;
                        struct gnet_stats_queue *qstats;
                };
        };
};

struct tcf_chain;

/* Description of one classifier type: its name (@kind) and the callbacks
 * implementing it.  @flags holds enum tcf_proto_ops_flags values.
 */
struct tcf_proto_ops {
        struct list_head        head;
        char                        kind[IFNAMSIZ];

        /* Per-packet classification; fills in a struct tcf_result */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        int                        (*init)(struct tcf_proto*);
        void                        (*destroy)(struct tcf_proto *tp, bool rtnl_held,
                                           struct netlink_ext_ack *extack);

        /* Filter lookup by handle, and management of individual filters */
        void*                        (*get)(struct tcf_proto*, u32 handle);
        void                        (*put)(struct tcf_proto *tp, void *f);
        int                        (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
                                        void **, bool, bool,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct tcf_proto *tp, void *arg,
                                          bool *last, bool rtnl_held,
                                          struct netlink_ext_ack *);
        bool                        (*delete_empty)(struct tcf_proto *tp);
        void                        (*walk)(struct tcf_proto *tp,
                                        struct tcf_walker *arg, bool rtnl_held);
        /* Hardware offload related hooks */
        int                        (*reoffload)(struct tcf_proto *tp, bool add,
                                             flow_setup_cb_t *cb, void *cb_priv,
                                             struct netlink_ext_ack *extack);
        void                        (*hw_add)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*hw_del)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*bind_class)(void *, u32, unsigned long,
                                              void *, unsigned long);
        /* Chain template creation/destruction; tmplt_priv is opaque */
        void *                        (*tmplt_create)(struct net *net,
                                                struct tcf_chain *chain,
                                                struct nlattr **tca,
                                                struct netlink_ext_ack *extack);
        void                        (*tmplt_destroy)(void *tmplt_priv);

        /* rtnetlink specific */
        int                        (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*,
                                        bool);
        int                        (*terse_dump)(struct net *net,
                                              struct tcf_proto *tp, void *fh,
                                              struct sk_buff *skb,
                                              struct tcmsg *t, bool rtnl_held);
        int                        (*tmplt_dump)(struct sk_buff *skb,
                                              struct net *net,
                                              void *tmplt_priv);

        struct module                *owner;
        int                        flags;
};

/* Flag values for tcf_proto_ops::flags.
 *
 * Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags
 * are expected to implement tcf_proto_ops->delete_empty(), otherwise race
 * conditions can occur when filters are inserted/deleted simultaneously.
 */
enum tcf_proto_ops_flags {
        TCF_PROTO_OPS_DOIT_UNLOCKED = 1,
};

/* One classifier instance on a filter chain.  The leading members are the
 * ones marked "fast access" by the original layout; the remainder is
 * management state.
 */
struct tcf_proto {
        /* Fast access part */
        struct tcf_proto __rcu        *next;
        void __rcu                *root;

        /* called under RCU BH lock */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        __be16                        protocol;

        /* All the rest */
        u32                        prio;
        void                        *data;
        const struct tcf_proto_ops        *ops;
        /* Chain this classifier is attached to */
        struct tcf_chain        *chain;
        /* Lock protects tcf_proto shared state and can be used by unlocked
         * classifiers to protect their private data.
         */
        spinlock_t                lock;
        bool                        deleting;
        refcount_t                refcnt;
        struct rcu_head                rcu;
        /* NOTE(review): presumably the node used in
         * tcf_block::proto_destroy_ht -- confirm in the users
         */
        struct hlist_node        destroy_ht_node;
};

/* Layout the qdisc layer imposes on skb->cb (see qdisc_skb_cb(), which
 * casts skb->cb to this type).  @data is a QDISC_CB_PRIV_LEN byte scratch
 * area; qdisc_cb_private_validate() checks at build time that a user's
 * private data fits into it.
 */
struct qdisc_skb_cb {
        struct {
                unsigned int                pkt_len;
                u16                        slave_dev_queue_mapping;
                u16                        tc_classid;
        };
#define QDISC_CB_PRIV_LEN 20
        unsigned char                data[QDISC_CB_PRIV_LEN];
        u16                        mru;
};

/* Function type taking a tcf_proto (@tp_head) and an opaque @priv pointer.
 * NOTE(review): presumably invoked when the head filter of a chain
 * changes -- confirm at the call sites.
 */
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

/* A chain of classifiers belonging to a tcf_block.  According to the
 * comment on tcf_block::lock, the lifetime-management fields (refcnt,
 * action_refcnt, explicitly_created) are protected by the owning block's
 * lock rather than by filter_chain_lock.
 */
struct tcf_chain {
        /* Protects filter_chain. */
        struct mutex filter_chain_lock;
        struct tcf_proto __rcu *filter_chain;
        struct list_head list;
        /* Block this chain belongs to */
        struct tcf_block *block;
        u32 index; /* chain index */
        unsigned int refcnt;
        unsigned int action_refcnt;
        bool explicitly_created;
        bool flushing;
        /* Template ops and their opaque private data, if any */
        const struct tcf_proto_ops *tmplt_ops;
        void *tmplt_priv;
        struct rcu_head rcu;
};

/* A block of filter chains, together with the offload bookkeeping for
 * the filters it contains.
 */
struct tcf_block {
        /* Lock protects tcf_block and lifetime-management data of chains
         * attached to the block (refcnt, action_refcnt, explicitly_created).
         */
        struct mutex lock;
        struct list_head chain_list;
        u32 index; /* block index for shared blocks */
        u32 classid; /* which class this block belongs to */
        refcount_t refcnt;
        struct net *net;
        struct Qdisc *q;
        struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
        struct flow_block flow_block;
        struct list_head owner_list;
        bool keep_dst;
        atomic_t offloadcnt; /* Number of offloaded filters */
        unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
        unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
        /* Chain with index 0 and its list of head-change subscribers
         * (NOTE(review): inferred from the member names -- confirm)
         */
        struct {
                struct tcf_chain *chain;
                struct list_head filter_chain_list;
        } chain0;
        struct rcu_head rcu;
        /* Hash table with 2^7 buckets, guarded by proto_destroy_lock */
        DECLARE_HASHTABLE(proto_destroy_ht, 7);
        struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

/* Lockdep predicates used as the condition argument of
 * rcu_dereference_protected() by tcf_chain_dereference() and
 * tcf_proto_dereference().
 *
 * With CONFIG_PROVE_LOCKING they report whether the protecting lock
 * (chain->filter_chain_lock, resp. tp->lock) is held.  Without it they
 * unconditionally return true.  Both variants must have identical
 * prototypes so that callers, which always pass a struct tcf_chain * or a
 * struct tcf_proto *, type-check the same way in either configuration.
 */
#ifdef CONFIG_PROVE_LOCKING
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        return lockdep_is_held(&chain->filter_chain_lock);
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return lockdep_is_held(&tp->lock);
}
#else
/* The stub takes a struct tcf_chain *, matching the lockdep variant. */
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        return true;
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return true;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/* Fetch the RCU-protected pointer @p via rcu_dereference_protected(),
 * using lockdep_tcf_chain_is_locked(@chain) as the lockdep condition.
 */
#define tcf_chain_dereference(p, chain)                                        \
        rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

/* Fetch the RCU-protected pointer @p via rcu_dereference_protected(),
 * using lockdep_tcf_proto_is_locked(@tp) as the lockdep condition.
 */
#define tcf_proto_dereference(p, tp)                                        \
        rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

/* Build-time sanity checks only; nothing is executed at run time.
 * Verifies that struct qdisc_skb_cb fits into skb->cb and that @sz bytes
 * of private data fit into qdisc_skb_cb::data.
 */
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
        /* Only used inside sizeof(); never dereferenced */
        struct qdisc_skb_cb *qcb;

        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb));
        BUILD_BUG_ON(sizeof(qcb->data) < sz);
}

/* Queue length recorded in the current CPU's slot of @q->cpu_qstats. */
static inline int qdisc_qlen_cpu(const struct Qdisc *q)
{
        return this_cpu_ptr(q->cpu_qstats)->qlen;
}

/* Queue length of @q as tracked in its embedded skb list head (q->q). */
static inline int qdisc_qlen(const struct Qdisc *q)
{
        return q->q.qlen;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
        __u32 qlen = q->qstats.qlen;
        int i;

        if (qdisc_is_percpu_stats(q)) {
                for_each_possible_cpu(i)
                        qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
        } else {
                qlen += q->q.qlen;
        }

        return qlen;
}

/* View @skb's control buffer (skb->cb) as a struct qdisc_skb_cb. */
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
        return (struct qdisc_skb_cb *)skb->cb;
}

/* Return the spinlock embedded in @qdisc's skb list head (q.lock). */
static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
        return &qdisc->q.lock;
}

/* Return the qdisc currently installed on @qdisc's device queue, fetched
 * with rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
        return rcu_dereference_rtnl(qdisc->dev_queue->qdisc);
}

/* Same lookup as qdisc_root(), but the pointer is fetched with
 * rcu_dereference_bh().
 */
static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
        struct Qdisc *root = rcu_dereference_bh(qdisc->dev_queue->qdisc);

        return root;
}

/* Return the qdisc_sleeping pointer of @qdisc's device queue (a plain
 * load, no RCU accessor is used here).
 */
static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->qdisc_sleeping;
}

/* The qdisc root lock is a mechanism by which the top level
 * of a qdisc tree can be locked from any qdisc node in the
 * forest.  This allows changing the configuration of some
 * aspect of the qdisc tree while blocking out asynchronous
 * qdisc access in the packet processing paths.
 *
 * It is only legal to do this when the root will not change
 * on us.  Otherwise we'll potentially lock the wrong qdisc
 * root.  This is enforced by holding the RTNL semaphore, which
 * all users of this lock accessor must do.
 *
 * Returns the lock of the qdisc found by qdisc_root(); the lock itself
 * is not taken here.
 */
static inline spinlock_t *qdisc_root_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root(qdisc);

        ASSERT_RTNL();
        return qdisc_lock(root);
}

/* Return (without taking) the lock of the qdisc found by
 * qdisc_root_sleeping().  Asserts that RTNL is held.
 */
static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root_sleeping(qdisc);

        ASSERT_RTNL();
        return qdisc_lock(root);
}

/* Return the ->running sequence counter of the qdisc found by
 * qdisc_root_sleeping().  Asserts that RTNL is held.
 */
static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root_sleeping(qdisc);

        ASSERT_RTNL();
        return &root->running;
}

/* Return the net_device owning the device queue @qdisc is attached to. */
static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->dev;
}

static inline void sch_tree_lock(const struct Qdisc *q)
{
        spin_lock_bh(qdisc_root_sleeping_lock(q));
}

static inline void sch_tree_unlock(const struct Qdisc *q)
{
        spin_unlock_bh(qdisc_root_sleeping_lock(q));
}

extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
/* Pick the ops for TX queue index @ntx of @dev: default_qdisc_ops while
 * @ntx is below dev->real_num_tx_queues, pfifo_fast_ops otherwise.
 */
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
        if (ntx < dev->real_num_tx_queues)
                return default_qdisc_ops;

        return &pfifo_fast_ops;
}

/* Part shared by classes stored in a Qdisc_class_hash: the class id used
 * as lookup key by qdisc_class_find() and the hash-chain linkage.
 */
struct Qdisc_class_common {
        u32                        classid;
        struct hlist_node        hnode;
};

/* Hash table of classes keyed by classid.  @hash is the bucket array and
 * @hashmask the mask applied by qdisc_class_hash() to select a bucket.
 * NOTE(review): hashsize/hashelems are presumably the bucket count and the
 * number of stored classes -- confirm in the hash helpers.
 */
struct Qdisc_class_hash {
        struct hlist_head        *hash;
        unsigned int                hashsize;
        unsigned int                hashmask;
        unsigned int                hashelems;
};

/* Fold @id onto itself (shift by 8, then by 4) and reduce it with @mask
 * to obtain a bucket index.
 */
static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
        u32 folded = id ^ (id >> 8);

        folded ^= folded >> 4;
        return folded & mask;
}

/* Look up the class with classid @id in @hash.
 *
 * Returns the matching entry, or NULL if there is none.  An @id of 0 is
 * never looked up and always yields NULL.
 */
static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
        struct Qdisc_class_common *entry;
        unsigned int bucket;

        if (id == 0)
                return NULL;

        bucket = qdisc_class_hash(id, hash->hashmask);
        hlist_for_each_entry(entry, &hash->hash[bucket], hnode)
                if (entry->classid == id)
                        return entry;

        return NULL;
}

/* Map @classid to a traffic-class index: the minor part of the classid
 * relative to TC_H_MIN_PRIORITY.
 *
 * Returns that index when it is below netdev_get_num_tc(@dev), -EINVAL
 * otherwise.  The subtraction is done in u32, so a minor number below
 * TC_H_MIN_PRIORITY wraps to a large value and is rejected too.
 */
static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
        u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

        if (hwtc >= netdev_get_num_tc(dev))
                return -EINVAL;

        return hwtc;
}

int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
                                  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                              void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack);
#else
/* Stub used when CONFIG_NET_SCHED is disabled: clears TCQ_F_OFFLOADED on
 * @q and reports success; @type and @type_data are ignored.
 */
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                          void *type_data)
{
        q->flags &= ~TCQ_F_OFFLOADED;
        return 0;
}

/* Stub used when CONFIG_NET_SCHED is disabled: intentionally does nothing. */
static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                           struct Qdisc *new, struct Qdisc *old,
                           enum tc_setup_type type, void *type_data,
                           struct netlink_ext_ack *extack)
{
}
#endif
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                          const struct Qdisc_ops *ops,
                          struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops, u32 parentid,
                                struct netlink_ext_ack *extack);
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

/* Return @skb's tc_at_ingress bit; always false when CONFIG_NET_CLS_ACT
 * is disabled.
 */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        return skb->tc_at_ingress;
#else
        return false;
#endif
}

/* Test-and-clear of @skb's tc_skip_classify bit.
 *
 * Returns true (after clearing the bit) when it was set, false otherwise.
 * Always false when CONFIG_NET_CLS_ACT is disabled.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        if (!skb->tc_skip_classify)
                return false;

        skb->tc_skip_classify = 0;
        return true;
#else
        return false;
#endif
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
        struct Qdisc *qdisc;

        for (; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (qdisc) {
                        spin_lock_bh(qdisc_lock(qdisc));
                        qdisc_reset(qdisc);
                        spin_unlock_bh(qdisc_lock(qdisc));
                }
        }
}

/* Are the qdiscs of all TX queues of @dev empty?  The scan runs inside an
 * RCU read-side section and stops at the first non-empty qdisc.
 */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
        bool all_empty = true;
        unsigned int idx = 0;

        rcu_read_lock();
        while (all_empty && idx < dev->num_tx_queues) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);
                const struct Qdisc *q = rcu_dereference(txq->qdisc);

                all_empty = qdisc_is_empty(q);
                idx++;
        }
        rcu_read_unlock();

        return all_empty;
}

/* Is any TX qdisc of @dev changing, i.e. does some queue's active qdisc
 * pointer differ from its qdisc_sleeping pointer?
 */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping)
                        return true;
                idx++;
        }

        return false;
}

/* Is @dev using noop_qdisc on every one of its TX queues?  A device with
 * no TX queues trivially qualifies.
 */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) != &noop_qdisc)
                        return false;
                idx++;
        }

        return true;
}

/* Packet length stored in @skb's qdisc control block (pkt_len). */
static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
        return qdisc_skb_cb(skb)->pkt_len;
}

/* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h);
 * both values occupy bits 16 and above.
 */
enum net_xmit_qdisc_t {
        __NET_XMIT_STOLEN = 0x00010000,
        __NET_XMIT_BYPASS = 0x00020000,
};

/* Number of drops to account for xmit result @e: with CONFIG_NET_CLS_ACT
 * the result is 0 when __NET_XMIT_STOLEN is set in @e and 1 otherwise;
 * without it the result is always 1.
 */
#ifdef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)        ((e) & __NET_XMIT_STOLEN ? 0 : 1)
#else
#define net_xmit_drop_count(e)        (1)
#endif

/* If @sch has a size table attached (sch->stab, read with
 * rcu_dereference_bh()), hand @skb and the table to
 * __qdisc_calculate_pkt_len().  No-op without CONFIG_NET_SCHED or without
 * a table.
 */
static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
                                           const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
        struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab);

        if (!stab)
                return;

        __qdisc_calculate_pkt_len(skb, stab);
#endif
}

/* Hand @skb to @sch's ->enqueue() callback and return its result.
 * @to_free is passed through unchanged to the callback.
 */
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        return sch->enqueue(skb, sch, to_free);
}

/* Add @bytes and @packets to the counters in @bstats.  No locking or
 * synchronisation is done here.
 */
static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
                                  __u64 bytes, __u32 packets)
{
        bstats->bytes += bytes;
        bstats->packets += packets;
}

static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
                                 const struct sk_buff *skb)
{
        _bstats_update(bstats,
                       qdisc_pkt_len(skb),
                       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
}

/* Per-cpu flavour of _bstats_update(): the counter update is bracketed by
 * u64_stats_update_begin()/u64_stats_update_end() on @bstats->syncp.
 */
static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats,
                                      __u64 bytes, __u32 packets)
{
        u64_stats_update_begin(&bstats->syncp);
        _bstats_update(&bstats->bstats, bytes, packets);
        u64_stats_update_end(&bstats->syncp);
}

/* Per-cpu flavour of bstats_update(): accounts @skb inside a
 * u64_stats_update_begin()/u64_stats_update_end() section on
 * @bstats->syncp.
 */
static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats,
                                     const struct sk_buff *skb)
{
        u64_stats_update_begin(&bstats->syncp);
        bstats_update(&bstats->bstats, skb);
        u64_stats_update_end(&bstats->syncp);
}

/* Account @skb in the current CPU's slot of @sch->cpu_bstats. */
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
                                           const struct sk_buff *skb)
{
        bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb);
}

/* Account @skb in @sch's non-per-cpu byte/packet counters (sch->bstats). */
static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
{
        bstats_update(&sch->bstats, skb);
}

/* Subtract @skb's qdisc packet length from @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        sch->qstats.backlog -= qdisc_pkt_len(skb);
}

/* Subtract @skb's qdisc packet length from the current CPU's backlog
 * counter in @sch->cpu_qstats.
 */
static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

/* Add @skb's qdisc packet length to @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        sch->qstats.backlog += qdisc_pkt_len(skb);
}

/* Add @skb's qdisc packet length to the current CPU's backlog counter in
 * @sch->cpu_qstats.
 */
static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

/* Increment the current CPU's qlen counter in @sch->cpu_qstats. */
static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->qlen);
}

/* Decrement the current CPU's qlen counter in @sch->cpu_qstats. */
static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
        this_cpu_dec(sch->cpu_qstats->qlen);
}

/* Increment the current CPU's requeues counter in @sch->cpu_qstats. */
static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->requeues);
}

/* Add @count to @sch->qstats.drops. */
static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
        sch->qstats.drops += count;
}

/* Increment the drops counter of @qstats by one. */
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
        qstats->drops++;
}

/* Increment the overlimits counter of @qstats by one. */
static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
        qstats->overlimits++;
}

/* Count one drop in @sch's non-per-cpu queue statistics. */
static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
        qstats_drop_inc(&sch->qstats);
}

/* Increment the current CPU's drops counter in @sch->cpu_qstats. */
static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->drops);
}

/* Count one overlimit event in @sch's non-per-cpu queue statistics. */
static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
        sch->qstats.overlimits++;
}

/* Copy @sch's queue statistics into dump context @d through
 * gnet_stats_copy_queue(), passing the total from qdisc_qlen_sum() as the
 * queue length.  Returns whatever gnet_stats_copy_queue() returns.
 */
static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
        __u32 total_qlen;

        total_qlen = qdisc_qlen_sum(sch);
        return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats,
                                     total_qlen);
}

/* Snapshot @sch's queue statistics into a zeroed temporary via
 * __gnet_stats_copy_queue() and report the resulting queue length in
 * *@qlen and backlog in *@backlog.
 */
static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch,  __u32 *qlen,
                                             __u32 *backlog)
{
        struct gnet_stats_queue snapshot = { 0 };
        __u32 total_qlen;

        total_qlen = qdisc_qlen_sum(sch);
        __gnet_stats_copy_queue(&snapshot, sch->cpu_qstats, &sch->qstats,
                                total_qlen);

        *qlen = snapshot.qlen;
        *backlog = snapshot.backlog;
}

/* Read @sch's current queue length and backlog and pass both to
 * qdisc_tree_reduce_backlog().  The qdisc itself is not reset here.
 */
static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
{
        __u32 pkts;
        __u32 bytes;

        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Reset @sch and report what was queued to qdisc_tree_reduce_backlog().
 * The queue length and backlog are sampled before the reset, so the
 * pre-reset values are the ones passed on.
 */
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
        __u32 pkts;
        __u32 bytes;

        /* Must be read first: qdisc_reset() runs before the values are used */
        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);

        qdisc_reset(sch);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Put @qh into the empty state: no head, no tail, zero length. */
static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh)
{
        qh->qlen = 0;
        qh->head = qh->tail = NULL;
}

/* Append @skb to the singly linked list @qh and bump its length.
 *
 * When the list is empty @skb becomes both head and tail and its ->next
 * is left as the caller set it; otherwise @skb->next is cleared and @skb
 * is linked behind the old tail.
 */
static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        struct sk_buff *old_tail = qh->tail;

        if (!old_tail) {
                qh->head = skb;
        } else {
                skb->next = NULL;
                old_tail->next = skb;
        }
        qh->tail = skb;
        qh->qlen++;
}

/* Append @skb to @sch's own queue (sch->q) and add its length to the
 * backlog.  Always returns NET_XMIT_SUCCESS.
 */
static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
        __qdisc_enqueue_tail(skb, &sch->q);
        qdisc_qstats_backlog_inc(sch, skb);
        return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        skb->next = qh->head;

        if (!qh->head)
                qh->tail = skb;
        qh->head = skb;
        qh->qlen++;
}

/* Detach and return the first skb of @qh, or NULL if the list is empty.
 * The list length is decremented, the tail is cleared when the list
 * becomes empty, and the returned skb's ->next is reset to NULL.
 */
static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
        struct sk_buff *first = qh->head;

        if (likely(first != NULL)) {
                struct sk_buff *rest = first->next;

                qh->head = rest;
                qh->qlen--;
                if (rest == NULL)
                        qh->tail = NULL;
                first->next = NULL;
        }

        return first;
}

/* Dequeue one skb from @sch, draining the ->gso_skb list first.
 *
 * An skb taken from ->gso_skb decrements sch->q.qlen and is returned
 * as-is.  Otherwise the skb comes straight from sch->q when @direct is
 * true, or from the ->dequeue() callback when it is false.
 *
 * NOTE(review): on the ->gso_skb path only qlen is adjusted here, not the
 * backlog, whereas qdisc_peek_dequeued() bumps both when it parks an skb
 * there -- callers are presumably expected to fix up the backlog; confirm.
 */
static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct)
{
        struct sk_buff *parked = __skb_dequeue(&sch->gso_skb);

        if (!parked)
                return direct ? __qdisc_dequeue_head(&sch->q)
                              : sch->dequeue(sch);

        sch->q.qlen--;
        return parked;
}

/* Dequeue the first skb of @sch's own queue and, if there was one, remove
 * its length from the backlog and account it in the byte/packet counters.
 * Returns the skb, or NULL when the queue is empty.
 */
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
        struct sk_buff *first;

        first = __qdisc_dequeue_head(&sch->q);
        if (likely(first != NULL)) {
                qdisc_qstats_backlog_dec(sch, first);
                qdisc_bstats_update(sch, first);
        }

        return first;
}

/* Instead of calling kfree_skb() while root qdisc lock is held,
 * queue the skb for future freeing at end of __dev_xmit_skb()
 *
 * @skb is pushed onto the front of the singly linked list whose head
 * pointer is *@to_free.
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
        skb->next = *to_free;
        *to_free = skb;
}

/* Push @skb onto the front of the *@to_free list.  When @skb->prev is set,
 * the old list is linked behind skb->prev instead of directly behind @skb.
 * NOTE(review): skb->prev presumably points at the last skb of a list
 * starting at @skb, so the whole list gets queued -- confirm in callers.
 */
static inline void __qdisc_drop_all(struct sk_buff *skb,
                                    struct sk_buff **to_free)
{
        struct sk_buff **link;

        link = skb->prev ? &skb->prev->next : &skb->next;
        *link = *to_free;
        *to_free = skb;
}

/* Drop the skb at the head of @qh on behalf of @sch.
 *
 * The head skb is unlinked from @qh, removed from @sch's backlog accounting
 * via qdisc_qstats_backlog_dec(), and queued on *@to_free for deferred
 * freeing.
 *
 * Returns qdisc_pkt_len() of the dropped skb, or 0 if @qh was empty.
 */
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
                                                   struct qdisc_skb_head *qh,
                                                   struct sk_buff **to_free)
{
        struct sk_buff *victim = __qdisc_dequeue_head(qh);
        unsigned int dropped_len = 0;

        if (likely(victim != NULL)) {
                /* read the length before the skb is handed to the free list */
                dropped_len = qdisc_pkt_len(victim);

                qdisc_qstats_backlog_dec(sch, victim);
                __qdisc_drop(victim, to_free);
        }

        return dropped_len;
}

/* Return the first skb of @sch's built-in queue without unlinking it
 * (NULL when the queue has no head).
 */
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
        return sch->q.head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* we can reuse ->gso_skb because peek isn't called for root qdiscs */
        if (!skb) {
                skb = sch->dequeue(sch);

                if (skb) {
                        __skb_queue_head(&sch->gso_skb, skb);
                        /* it's still part of the queue */
                        qdisc_qstats_backlog_inc(sch, skb);
                        sch->q.qlen++;
                }
        }

        return skb;
}

/* Account for @skb leaving @sch.
 *
 * With per-CPU statistics (qdisc_is_percpu_stats()), the per-CPU backlog,
 * bstats and qlen helpers are used. Otherwise the plain backlog/bstats
 * helpers are called and sch->q.qlen is decremented directly.
 */
static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
                                                 struct sk_buff *skb)
{
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_bstats_update(sch, skb);
                sch->q.qlen--;
                return;
        }

        qdisc_qstats_cpu_backlog_dec(sch, skb);
        qdisc_bstats_cpu_update(sch, skb);
        qdisc_qstats_cpu_qlen_dec(sch);
}

/* Account for a packet of @pkt_len bytes entering @sch.
 *
 * With per-CPU statistics, the per-CPU qlen is incremented and @pkt_len is
 * added to this CPU's backlog counter. Otherwise sch->qstats.backlog and
 * sch->q.qlen are updated directly.
 */
static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
                                                 unsigned int pkt_len)
{
        if (!qdisc_is_percpu_stats(sch)) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                return;
        }

        qdisc_qstats_cpu_qlen_inc(sch);
        this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
}

/* Dequeue counterpart of ->peek(): use this instead of qdisc->dequeue() for
 * every qdisc that is queried with ->peek().
 *
 * If an skb is parked on sch->gso_skb, it is removed from there and taken
 * out of the backlog/qlen accounting (per-CPU or plain, depending on
 * qdisc_is_percpu_stats()). If nothing is parked, the call falls through to
 * the qdisc's ->dequeue().
 */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
        struct sk_buff *held;

        if (!skb_peek(&sch->gso_skb))
                return sch->dequeue(sch);

        held = __skb_dequeue(&sch->gso_skb);
        if (qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_cpu_backlog_dec(sch, held);
                qdisc_qstats_cpu_qlen_dec(sch);
        } else {
                qdisc_qstats_backlog_dec(sch, held);
                sch->q.qlen--;
        }

        return held;
}

/* Free every skb on @qh and leave the list empty.
 *
 * Asserts that RTNL is held (ASSERT_RTNL()). The whole head..tail range is
 * passed to rtnl_kfree_skbs() in one call. Nothing happens when qh->qlen is
 * already zero.
 *
 * The byte backlog of the list is not known here, so correcting any backlog
 * statistic is left to the caller.
 */
static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
        ASSERT_RTNL();

        if (qh->qlen == 0)
                return;

        rtnl_kfree_skbs(qh->head, qh->tail);

        qh->head = NULL;
        qh->tail = NULL;
        qh->qlen = 0;
}

/* Empty @sch's built-in queue (sch->q) via __qdisc_reset_queue().
 * As in that helper, backlog statistics are not adjusted here.
 */
static inline void qdisc_reset_queue(struct Qdisc *sch)
{
        __qdisc_reset_queue(&sch->q);
}

/* Install @new in the slot *@pold and return the qdisc that was there.
 *
 * The swap is done between sch_tree_lock(@sch) and sch_tree_unlock(@sch).
 * If a previous qdisc existed, qdisc_purge_queue() is called on it before
 * the lock is released.
 *
 * Returns the previous contents of *@pold, which may be NULL.
 */
static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
                                          struct Qdisc **pold)
{
        struct Qdisc *prev;

        sch_tree_lock(sch);

        prev = *pold;
        *pold = new;
        if (prev)
                qdisc_purge_queue(prev);

        sch_tree_unlock(sch);

        return prev;
}

/* Drop a single @skb on behalf of @sch: the skb is passed to
 * rtnl_kfree_skbs() as a one-element range (head == tail == @skb) and the
 * drop is recorded with qdisc_qstats_drop().
 * NOTE(review): the rtnl_ prefix suggests RTNL must be held - confirm
 * against rtnl_kfree_skbs().
 */
static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
        rtnl_kfree_skbs(skb, skb);
        qdisc_qstats_drop(sch);
}

/* Per-CPU-stats variant of qdisc_drop(): queue @skb on *@to_free for
 * deferred freeing and record the drop with qdisc_qstats_cpu_drop().
 *
 * Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_cpu_drop(sch);

        return NET_XMIT_DROP;
}

/* Drop @skb: queue it on *@to_free for deferred freeing (see __qdisc_drop())
 * and record the drop with qdisc_qstats_drop().
 *
 * Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                             struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/* Drop the skb list headed by @skb: chain it onto *@to_free through
 * __qdisc_drop_all() and record the drop with qdisc_qstats_drop().
 * qdisc_qstats_drop() is called once here, regardless of how many skbs the
 * list holds.
 *
 * Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop_all(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/* Length to Time (L2T) lookup in a qdisc_rate_table: determine how long it
 * will take to send a packet of @pktlen bytes.
 *
 * The table slot is (pktlen + cell_align + overhead), clamped to be
 * non-negative and then shifted right by cell_log. Slots 0..255 are read
 * straight from rtab->data[]. Larger slots are computed as
 * data[255] * (slot >> 8) + data[slot & 0xFF].
 */
static inline u32 qdisc_l2t(struct qdisc_rate_table *rtab, unsigned int pktlen)
{
        int slot = pktlen + rtab->rate.cell_align + rtab->rate.overhead;

        slot = (slot < 0) ? 0 : slot;
        slot >>= rtab->rate.cell_log;

        if (slot <= 255)
                return rtab->data[slot];

        return rtab->data[255] * (slot >> 8) + rtab->data[slot & 0xFF];
}

/* Rate configuration consumed by psched_l2t_ns() (length to nanoseconds
 * conversion) and exported back to userspace form by
 * psched_ratecfg_getrate().
 */
struct psched_ratecfg {
        u64        rate_bytes_ps; /* bytes per second */
        u32        mult;          /* psched_l2t_ns(): result is (len * mult) >> shift */
        u16        overhead;      /* added to every packet length before conversion */
        u16        mpu;           /* lower bound applied to the adjusted length */
        u8        linklayer;     /* compared against TC_LINKLAYER_ATM in psched_l2t_ns() */
        u8        shift;         /* right shift applied after multiplying by mult */
};

static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
                                unsigned int len)
{
        len += r->overhead;

        if (len < r->mpu)
                len = r->mpu;

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
                return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift;

        return ((u64)len * r->mult) >> r->shift;
}

/* Defined outside this header.
 * NOTE(review): presumably fills @r (including mult/shift used by
 * psched_l2t_ns()) from the userspace rate spec @conf and the 64-bit rate
 * @rate64 - confirm against the definition.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
                               const struct tc_ratespec *conf,
                               u64 rate64);

static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
                                          const struct psched_ratecfg *r)
{
        memset(res, 0, sizeof(*res));

        /* legacy struct tc_ratespec has a 32bit @rate field
         * Qdisc using 64bit rate should add new attributes
         * in order to maintain compatibility.
         */
        res->rate = min_t(u64, r->rate_bytes_ps, ~0U);

        res->overhead = r->overhead;
        res->mpu = r->mpu;
        res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);
}

/* Mini Qdisc serves the specific needs of the ingress/clsact Qdisc.
 * The fast path only needs to access the filter list and to update stats.
 */
struct mini_Qdisc {
        struct tcf_proto *filter_list;
        struct tcf_block *block;
        /* per-CPU counters updated by mini_qdisc_bstats_cpu_update() */
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
        /* per-CPU counters; ->drops is bumped by mini_qdisc_qstats_cpu_drop() */
        struct gnet_stats_queue        __percpu *cpu_qstats;
        /* NOTE(review): presumably used for RCU-deferred handling - confirm */
        struct rcu_head rcu;
};

/* Account @skb in this CPU's instance of miniq->cpu_bstats by passing it to
 * bstats_cpu_update().
 */
static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
                                                const struct sk_buff *skb)
{
        bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb);
}

/* Increment this CPU's drop counter (cpu_qstats->drops) of @miniq. */
static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
        this_cpu_inc(miniq->cpu_qstats->drops);
}

/* Two mini_Qdisc instances plus the location of the RCU-annotated pointer
 * that refers to a mini_Qdisc.
 * NOTE(review): presumably mini_qdisc_pair_swap() alternates *p_miniq
 * between miniq1 and miniq2 - confirm against its definition.
 */
struct mini_Qdisc_pair {
        struct mini_Qdisc miniq1;
        struct mini_Qdisc miniq2;
        struct mini_Qdisc __rcu **p_miniq;
};

/* mini_Qdisc_pair management, defined outside this header.
 * NOTE(review): the descriptions below are inferred from the names and
 * signatures only - confirm against the definitions.
 *  - mini_qdisc_pair_swap():       presumably publishes @tp_head as the
 *                                  active filter list.
 *  - mini_qdisc_pair_init():       presumably sets up @miniqp for @qdisc and
 *                                  records @p_miniq.
 *  - mini_qdisc_pair_block_init(): presumably associates @block with the pair.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                          struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                          struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
                                struct tcf_block *block);

/* Wait until @q is no longer in SCHED state.
 *
 * Polls the __QDISC_STATE_SCHED bit of q->state, calling msleep(1) between
 * checks, and returns once the bit is observed clear.
 */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
        for (;;) {
                if (!test_bit(__QDISC_STATE_SCHED, &q->state))
                        break;

                msleep(1);
        }
}

#endif










































































































    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
































    1 










    1 








    1 








    1 














    1 
    1 



































































































    1 






























    1 





    1 


























    1 







    1 












    1 
    1 












    1 





















    1 





    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 





















    1 

















    1 
    1 
    1 



    1 




    1 












    1 





    1 

































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
// SPDX-License-Identifier: GPL-2.0-only
/* xfrm_user.c: User interface to configure xfrm engine.
 *
 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *
 */

#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/string.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/init.h>
#include <linux/security.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <net/netlink.h>
#include <net/ah.h>
#include <linux/uaccess.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif
#include <asm/unaligned.h>

static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
{
        struct nlattr *rt = attrs[type];
        struct xfrm_algo *algp;

        if (!rt)
                return 0;

        algp = nla_data(rt);
        if (nla_len(rt) < (int)xfrm_alg_len(algp))
                return -EINVAL;

        switch (type) {
        case XFRMA_ALG_AUTH:
        case XFRMA_ALG_CRYPT:
        case XFRMA_ALG_COMP:
                break;

        default:
                return -EINVAL;
        }

        algp->alg_name[sizeof(algp->alg_name) - 1] = '\0';
        return 0;
}

static int verify_auth_trunc(struct nlattr **attrs)
{
        struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC];
        struct xfrm_algo_auth *algp;

        if (!rt)
                return 0;

        algp = nla_data(rt);
        if (nla_len(rt) < (int)xfrm_alg_auth_len(algp))
                return -EINVAL;

        algp->alg_name[sizeof(algp->alg_name) - 1] = '\0';
        return 0;
}

static int verify_aead(struct nlattr **attrs)
{
        struct nlattr *rt = attrs[XFRMA_ALG_AEAD];
        struct xfrm_algo_aead *algp;

        if (!rt)
                return 0;

        algp = nla_data(rt);
        if (nla_len(rt) < (int)aead_len(algp))
                return -EINVAL;

        algp->alg_name[sizeof(algp->alg_name) - 1] = '\0';
        return 0;
}

/* If attribute @type is present and @addrp is non-NULL, point *@addrp at
 * the attribute's payload. Otherwise *@addrp is left untouched.
 */
static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type,
                           xfrm_address_t **addrp)
{
        struct nlattr *attr = attrs[type];

        if (!attr || !addrp)
                return;

        *addrp = nla_data(attr);
}

static inline int verify_sec_ctx_len(struct nlattr **attrs)
{
        struct nlattr *rt = attrs[XFRMA_SEC_CTX];
        struct xfrm_user_sec_ctx *uctx;

        if (!rt)
                return 0;

        uctx = nla_data(rt);
        if (uctx->len > nla_len(rt) ||
            uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
                return -EINVAL;

        return 0;
}

/* Validate the XFRMA_REPLAY_ESN_VAL attribute against the SA info @p.
 *
 * Without the attribute, the request is valid only if XFRM_STATE_ESN is not
 * set in p->flags. With the attribute, all of the following must hold,
 * otherwise -EINVAL is returned:
 *  - bmp_len does not exceed XFRMA_REPLAY_ESN_MAX / sizeof(bmp[0]) / 8;
 *  - the attribute is at least xfrm_replay_state_esn_len() bytes long, or
 *    exactly sizeof(struct xfrm_replay_state_esn);
 *  - the protocol is ESP or AH, the only ones supporting ESN;
 *  - p->replay_window is zero.
 */
static inline int verify_replay(struct xfrm_usersa_info *p,
                                struct nlattr **attrs)
{
        struct nlattr *attr = attrs[XFRMA_REPLAY_ESN_VAL];
        struct xfrm_replay_state_esn *esn;
        int attr_len;

        if (!attr) {
                if (p->flags & XFRM_STATE_ESN)
                        return -EINVAL;
                return 0;
        }

        esn = nla_data(attr);
        if (esn->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(esn->bmp[0]) / 8)
                return -EINVAL;

        attr_len = nla_len(attr);
        if (attr_len < (int)xfrm_replay_state_esn_len(esn) &&
            attr_len != sizeof(*esn))
                return -EINVAL;

        /* Only ESP and AH support the ESN feature. */
        switch (p->id.proto) {
        case IPPROTO_ESP:
        case IPPROTO_AH:
                break;
        default:
                return -EINVAL;
        }

        return p->replay_window ? -EINVAL : 0;
}

/*
 * Sanity-check an SA add/update request before any state is built from it.
 *
 * @p:     fixed-size request header
 * @attrs: parsed netlink attribute table
 *
 * Checks, in order: the SA address family, the selector family and its
 * prefix lengths, which attributes are allowed/required for the requested
 * protocol, the payload of the individual algorithm/security-context/replay
 * attributes, the mode, and finally that XFRMA_MTIMER_THRESH is only given
 * together with XFRMA_ENCAP.
 *
 * Returns 0 when the request is acceptable, -EAFNOSUPPORT for AF_INET6 when
 * CONFIG_IPV6 is not enabled, the error of a failing verify_*() helper, or
 * -EINVAL for every other violation.
 */
static int verify_newsa_info(struct xfrm_usersa_info *p,
                             struct nlattr **attrs)
{
        int err;
        u16 family = p->sel.family;

        /* the SA itself must be IPv4 or (if built in) IPv6 */
        err = -EINVAL;
        switch (p->family) {
        case AF_INET:
                break;

        case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
                break;
#else
                err = -EAFNOSUPPORT;
                goto out;
#endif

        default:
                goto out;
        }

        /*
         * An unset selector family is treated as the SA family unless the
         * request carries XFRM_STATE_AF_UNSPEC.
         */
        if (!family && !(p->flags & XFRM_STATE_AF_UNSPEC))
                family = p->family;

        /* prefix lengths must fit the (effective) selector family */
        switch (family) {
        case AF_UNSPEC:
                break;

        case AF_INET:
                if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
                        goto out;

                break;

        case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
                if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
                        goto out;

                break;
#else
                err = -EAFNOSUPPORT;
                goto out;
#endif

        default:
                goto out;
        }

        /* per-protocol rules on which attributes may or must be present */
        err = -EINVAL;
        switch (p->id.proto) {
        case IPPROTO_AH:
                /* needs an auth algorithm; no AEAD/crypt/comp/TFC padding */
                if ((!attrs[XFRMA_ALG_AUTH]        &&
                     !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
                    attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_ALG_COMP]        ||
                    attrs[XFRMA_TFCPAD])
                        goto out;
                break;

        case IPPROTO_ESP:
                /* no compression algorithm */
                if (attrs[XFRMA_ALG_COMP])
                        goto out;
                /* at least one of auth, crypt or AEAD is required */
                if (!attrs[XFRMA_ALG_AUTH] &&
                    !attrs[XFRMA_ALG_AUTH_TRUNC] &&
                    !attrs[XFRMA_ALG_CRYPT] &&
                    !attrs[XFRMA_ALG_AEAD])
                        goto out;
                /* AEAD cannot be combined with separate auth/crypt */
                if ((attrs[XFRMA_ALG_AUTH] ||
                     attrs[XFRMA_ALG_AUTH_TRUNC] ||
                     attrs[XFRMA_ALG_CRYPT]) &&
                    attrs[XFRMA_ALG_AEAD])
                        goto out;
                /* TFC padding is accepted in tunnel mode only */
                if (attrs[XFRMA_TFCPAD] &&
                    p->mode != XFRM_MODE_TUNNEL)
                        goto out;
                break;

        case IPPROTO_COMP:
                /* compression algorithm only, and the SPI must be < 0x10000 */
                if (!attrs[XFRMA_ALG_COMP]        ||
                    attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_AUTH]        ||
                    attrs[XFRMA_ALG_AUTH_TRUNC]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_TFCPAD]                ||
                    (ntohl(p->id.spi) >= 0x10000))
                        goto out;
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_DSTOPTS:
        case IPPROTO_ROUTING:
                /* no algorithms, encap, security context or TFC padding; a care-of address is mandatory */
                if (attrs[XFRMA_ALG_COMP]        ||
                    attrs[XFRMA_ALG_AUTH]        ||
                    attrs[XFRMA_ALG_AUTH_TRUNC]        ||
                    attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_ENCAP]                ||
                    attrs[XFRMA_SEC_CTX]        ||
                    attrs[XFRMA_TFCPAD]                ||
                    !attrs[XFRMA_COADDR])
                        goto out;
                break;
#endif

        default:
                goto out;
        }

        /* validate the payload of each attribute that is present */
        if ((err = verify_aead(attrs)))
                goto out;
        if ((err = verify_auth_trunc(attrs)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
                goto out;
        if ((err = verify_sec_ctx_len(attrs)))
                goto out;
        if ((err = verify_replay(p, attrs)))
                goto out;

        /* only the known modes are accepted */
        err = -EINVAL;
        switch (p->mode) {
        case XFRM_MODE_TRANSPORT:
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_BEET:
                break;

        default:
                goto out;
        }

        err = 0;

        /* a mapping timer threshold is rejected unless XFRMA_ENCAP is also given */
        if (attrs[XFRMA_MTIMER_THRESH])
                if (!attrs[XFRMA_ENCAP])
                        err = -EINVAL;

out:
        return err;
}

/*
 * Generic helper: resolve the algorithm named in @rta through @get_byname
 * and attach a private copy of the request to *@algpp.
 *
 * @algpp:      receives the kmemdup()'d algorithm on success
 * @props:      receives the descriptor's sadb_alg_id
 * @get_byname: lookup routine, called with the requested name and 1
 * @rta:        attribute holding a struct xfrm_algo, may be NULL
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the lookup fails,
 * -ENOMEM if the copy cannot be allocated.  Note that *@props is already
 * written when -ENOMEM is returned.
 */
static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
                           struct xfrm_algo_desc *(*get_byname)(const char *, int),
                           struct nlattr *rta)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo *req, *copy;

        if (!rta)
                return 0;

        req = nla_data(rta);
        desc = get_byname(req->alg_name, 1);
        if (!desc)
                return -ENOSYS;

        *props = desc->desc.sadb_alg_id;

        copy = kmemdup(req, xfrm_alg_len(req), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        /* store the descriptor's own name rather than the requested one */
        strcpy(copy->alg_name, desc->name);
        *algpp = copy;

        return 0;
}

/*
 * Attach the encryption algorithm described by @rta to state @x.
 *
 * Looks the algorithm up with xfrm_ealg_get_byname(), records its
 * sadb_alg_id in x->props.ealgo, stores a private copy of the request in
 * x->ealg (renamed to the descriptor's name) and takes the geniv setting
 * from the descriptor.
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the lookup fails,
 * -ENOMEM if the copy cannot be allocated.
 */
static int attach_crypt(struct xfrm_state *x, struct nlattr *rta)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo *req, *copy;

        if (!rta)
                return 0;

        req = nla_data(rta);
        desc = xfrm_ealg_get_byname(req->alg_name, 1);
        if (!desc)
                return -ENOSYS;

        x->props.ealgo = desc->desc.sadb_alg_id;

        copy = kmemdup(req, xfrm_alg_len(req), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        strcpy(copy->alg_name, desc->name);

        x->geniv = desc->uinfo.encr.geniv;
        x->ealg = copy;

        return 0;
}

/*
 * Build a struct xfrm_algo_auth from a legacy XFRMA_ALG_AUTH request.
 *
 * The request carries a struct xfrm_algo (no truncation length), so the
 * truncation length is taken from the descriptor's icv_truncbits.
 *
 * @algpp: receives the newly allocated algorithm on success
 * @props: receives the descriptor's sadb_alg_id
 * @rta:   attribute holding a struct xfrm_algo, may be NULL
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the algorithm is
 * not found, -ENOMEM on allocation failure.
 */
static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
                       struct nlattr *rta)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo_auth *auth;
        struct xfrm_algo *req;

        if (!rta)
                return 0;

        req = nla_data(rta);
        desc = xfrm_aalg_get_byname(req->alg_name, 1);
        if (!desc)
                return -ENOSYS;

        *props = desc->desc.sadb_alg_id;

        /* key length is in bits; round up to whole bytes */
        auth = kmalloc(sizeof(*auth) + (req->alg_key_len + 7) / 8, GFP_KERNEL);
        if (!auth)
                return -ENOMEM;

        auth->alg_key_len = req->alg_key_len;
        auth->alg_trunc_len = desc->uinfo.auth.icv_truncbits;
        memcpy(auth->alg_key, req->alg_key, (req->alg_key_len + 7) / 8);
        strcpy(auth->alg_name, desc->name);

        *algpp = auth;

        return 0;
}

/*
 * Attach an authentication algorithm given as XFRMA_ALG_AUTH_TRUNC.
 *
 * @algpp: receives a private copy of the request on success
 * @props: receives the descriptor's sadb_alg_id
 * @rta:   attribute holding a struct xfrm_algo_auth, may be NULL
 *
 * A requested truncation length larger than the descriptor's icv_fullbits
 * is rejected; a requested length of zero is replaced by the descriptor's
 * icv_truncbits.
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the algorithm is
 * not found, -EINVAL for an oversized truncation length, -ENOMEM on
 * allocation failure.
 */
static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
                             struct nlattr *rta)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo_auth *req, *copy;

        if (!rta)
                return 0;

        req = nla_data(rta);
        desc = xfrm_aalg_get_byname(req->alg_name, 1);
        if (!desc)
                return -ENOSYS;

        if (req->alg_trunc_len > desc->uinfo.auth.icv_fullbits)
                return -EINVAL;

        *props = desc->desc.sadb_alg_id;

        copy = kmemdup(req, xfrm_alg_auth_len(req), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        strcpy(copy->alg_name, desc->name);
        if (copy->alg_trunc_len == 0)
                copy->alg_trunc_len = desc->uinfo.auth.icv_truncbits;

        *algpp = copy;

        return 0;
}

/*
 * Attach the AEAD algorithm described by @rta to state @x.
 *
 * The lookup uses both the requested name and ICV length.  On success
 * x->props.ealgo, x->aead (a private copy of the request carrying the
 * descriptor's name) and x->geniv are set.
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the lookup fails,
 * -ENOMEM if the copy cannot be allocated.
 */
static int attach_aead(struct xfrm_state *x, struct nlattr *rta)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo_aead *req, *copy;

        if (!rta)
                return 0;

        req = nla_data(rta);
        desc = xfrm_aead_get_byname(req->alg_name, req->alg_icv_len, 1);
        if (!desc)
                return -ENOSYS;

        x->props.ealgo = desc->desc.sadb_alg_id;

        copy = kmemdup(req, aead_len(req), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        strcpy(copy->alg_name, desc->name);

        x->geniv = desc->uinfo.aead.geniv;
        x->aead = copy;

        return 0;
}

/*
 * Check that the ESN replay state in attribute @rp is compatible with the
 * state's existing @replay_esn.
 *
 * The attribute must be at least as long as its bmp_len implies, must
 * describe exactly the same total length and bitmap length as @replay_esn
 * (so a later copy cannot overflow the existing buffer), and its
 * replay_window must fit in the bitmap.
 *
 * Returns 0 if either argument is NULL or all checks pass, -EINVAL
 * otherwise.
 */
static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn,
                                         struct nlattr *rp)
{
        struct xfrm_replay_state_esn *req;
        unsigned int req_len;

        if (!replay_esn || !rp)
                return 0;

        req = nla_data(rp);
        req_len = xfrm_replay_state_esn_len(req);

        if (nla_len(rp) < (int)req_len)
                return -EINVAL;

        if (xfrm_replay_state_esn_len(replay_esn) != req_len)
                return -EINVAL;

        if (replay_esn->bmp_len != req->bmp_len)
                return -EINVAL;

        /* the window may not be wider than the bitmap has bits */
        if (req->replay_window > req->bmp_len * sizeof(__u32) * 8)
                return -EINVAL;

        return 0;
}

/*
 * Allocate the two ESN replay buffers for a new state from attribute @rta.
 *
 * Both buffers are zero-allocated at the full length implied by the
 * request's bmp_len.  If the attribute payload is shorter than that, only
 * the fixed-size header is copied and the bitmap stays zeroed.
 *
 * Returns 0 on success or when @rta is NULL (outputs untouched), -ENOMEM
 * on allocation failure (nothing is leaked, outputs untouched).
 */
static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn,
                                       struct xfrm_replay_state_esn **preplay_esn,
                                       struct nlattr *rta)
{
        struct xfrm_replay_state_esn *req, *cur, *pre;
        unsigned int alloc_len, copy_len;

        if (!rta)
                return 0;

        req = nla_data(rta);
        alloc_len = xfrm_replay_state_esn_len(req);

        if (nla_len(rta) >= (int)alloc_len)
                copy_len = alloc_len;
        else
                copy_len = sizeof(*req);

        cur = kzalloc(alloc_len, GFP_KERNEL);
        if (!cur)
                return -ENOMEM;

        pre = kzalloc(alloc_len, GFP_KERNEL);
        if (!pre) {
                kfree(cur);
                return -ENOMEM;
        }

        memcpy(cur, req, copy_len);
        memcpy(pre, req, copy_len);

        *replay_esn = cur;
        *preplay_esn = pre;

        return 0;
}

/*
 * Size of the user-visible security context for @xfrm_ctx: the fixed
 * struct xfrm_user_sec_ctx header plus ctx_len bytes, or 0 when there is
 * no context.
 */
static inline unsigned int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx)
{
        if (!xfrm_ctx)
                return 0;

        return sizeof(struct xfrm_user_sec_ctx) + xfrm_ctx->ctx_len;
}

/*
 * Initialise the basic fields of state @x from the request header @p:
 * id, selector, lifetime configuration and the props block.
 *
 * The requested replay window is clamped to the number of bits in
 * x->replay.bitmap.  An unset selector family inherits the SA family
 * unless the request carries XFRM_STATE_AF_UNSPEC.
 */
static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
        /* blocks copied verbatim */
        memcpy(&x->id, &p->id, sizeof(x->id));
        memcpy(&x->sel, &p->sel, sizeof(x->sel));
        memcpy(&x->lft, &p->lft, sizeof(x->lft));
        memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));

        /* scalar properties */
        x->props.family = p->family;
        x->props.mode = p->mode;
        x->props.reqid = p->reqid;
        x->props.flags = p->flags;
        x->props.replay_window = min_t(unsigned int, p->replay_window,
                                        sizeof(x->replay.bitmap) * 8);

        if (x->sel.family)
                return;
        if (p->flags & XFRM_STATE_AF_UNSPEC)
                return;

        x->sel.family = p->family;
}

/*
 * Apply the async-event parameters found in @attrs to state @x.
 *
 * Each piece (replay state, current lifetime counters, and the three timer
 * or threshold values) is overwritten only when its attribute is present.
 * The ESN replay attribute is honoured only when @update_esn is non-zero
 * and @x already has both ESN buffers; the copy length is taken from the
 * attribute itself, so this function does no length validation of its own.
 *
 * someday when pfkey also has support, we could have the code
 * somehow made shareable and move it to xfrm_state.c - JHS
 */
static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
                                  int update_esn)
{
        struct nlattr *esn_attr = NULL;

        if (update_esn)
                esn_attr = attrs[XFRMA_REPLAY_ESN_VAL];

        if (esn_attr && x->replay_esn && x->preplay_esn) {
                struct xfrm_replay_state_esn *src = nla_data(esn_attr);

                memcpy(x->replay_esn, src, xfrm_replay_state_esn_len(src));
                memcpy(x->preplay_esn, src, xfrm_replay_state_esn_len(src));
        }

        if (attrs[XFRMA_REPLAY_VAL]) {
                struct xfrm_replay_state *src = nla_data(attrs[XFRMA_REPLAY_VAL]);

                memcpy(&x->replay, src, sizeof(*src));
                memcpy(&x->preplay, src, sizeof(*src));
        }

        if (attrs[XFRMA_LTIME_VAL]) {
                struct xfrm_lifetime_cur *cur = nla_data(attrs[XFRMA_LTIME_VAL]);

                x->curlft.bytes = cur->bytes;
                x->curlft.packets = cur->packets;
                x->curlft.add_time = cur->add_time;
                x->curlft.use_time = cur->use_time;
        }

        if (attrs[XFRMA_ETIMER_THRESH])
                x->replay_maxage = nla_get_u32(attrs[XFRMA_ETIMER_THRESH]);

        if (attrs[XFRMA_REPLAY_THRESH])
                x->replay_maxdiff = nla_get_u32(attrs[XFRMA_REPLAY_THRESH]);

        if (attrs[XFRMA_MTIMER_THRESH])
                x->mapping_maxage = nla_get_u32(attrs[XFRMA_MTIMER_THRESH]);
}

/*
 * Fill @m from XFRMA_SET_MARK / XFRMA_SET_MARK_MASK.
 *
 * Without XFRMA_SET_MARK both value and mask are zeroed.  With it, the
 * mask defaults to 0xffffffff when XFRMA_SET_MARK_MASK is not supplied.
 */
static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
{
        if (!attrs[XFRMA_SET_MARK]) {
                m->v = m->m = 0;
                return;
        }

        m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
        m->m = attrs[XFRMA_SET_MARK_MASK] ?
               nla_get_u32(attrs[XFRMA_SET_MARK_MASK]) : 0xffffffff;
}

/*
 * Allocate a new xfrm_state and populate it from an SA request.
 *
 * @net:   network namespace the state is allocated in
 * @p:     fixed-size request header
 * @attrs: parsed netlink attribute table
 * @errp:  receives the error code when NULL is returned
 *
 * Returns the new state on success.  On failure, returns NULL with *@errp
 * set; if the state had already been allocated it is marked
 * XFRM_STATE_DEAD and released with xfrm_state_put().
 */
static struct xfrm_state *xfrm_state_construct(struct net *net,
                                               struct xfrm_usersa_info *p,
                                               struct nlattr **attrs,
                                               int *errp)
{
        struct xfrm_state *x = xfrm_state_alloc(net);
        int err = -ENOMEM;

        if (!x)
                goto error_no_put;

        copy_from_user_state(x, p);

        /* err is still -ENOMEM here, which is what the kmemdup() failures below report */
        if (attrs[XFRMA_ENCAP]) {
                x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
                                   sizeof(*x->encap), GFP_KERNEL);
                if (x->encap == NULL)
                        goto error;
        }

        if (attrs[XFRMA_COADDR]) {
                x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
                                    sizeof(*x->coaddr), GFP_KERNEL);
                if (x->coaddr == NULL)
                        goto error;
        }

        if (attrs[XFRMA_SA_EXTRA_FLAGS])
                x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);

        if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD])))
                goto error;
        if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
                                     attrs[XFRMA_ALG_AUTH_TRUNC])))
                goto error;
        /* fall back to the legacy auth attribute only if AUTH_TRUNC did not set an algorithm id */
        if (!x->props.aalgo) {
                if ((err = attach_auth(&x->aalg, &x->props.aalgo,
                                       attrs[XFRMA_ALG_AUTH])))
                        goto error;
        }
        if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT])))
                goto error;
        if ((err = attach_one_algo(&x->calg, &x->props.calgo,
                                   xfrm_calg_get_byname,
                                   attrs[XFRMA_ALG_COMP])))
                goto error;

        if (attrs[XFRMA_TFCPAD])
                x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]);

        xfrm_mark_get(attrs, &x->mark);

        xfrm_smark_init(attrs, &x->props.smark);

        if (attrs[XFRMA_IF_ID])
                x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
        if (err)
                goto error;

        if (attrs[XFRMA_SEC_CTX]) {
                err = security_xfrm_state_alloc(x,
                                                nla_data(attrs[XFRMA_SEC_CTX]));
                if (err)
                        goto error;
        }

        if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
                                               attrs[XFRMA_REPLAY_ESN_VAL])))
                goto error;

        /* defaults taken from the per-namespace sysctls */
        x->km.seq = p->seq;
        x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth;
        /* sysctl_xfrm_aevent_etime is in 100ms units */
        x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;

        if ((err = xfrm_init_replay(x)))
                goto error;

        /* override default values from above */
        xfrm_update_ae_params(x, attrs, 0);

        /* configure the hardware if offload is requested */
        if (attrs[XFRMA_OFFLOAD_DEV]) {
                err = xfrm_dev_state_add(net, x,
                                         nla_data(attrs[XFRMA_OFFLOAD_DEV]));
                if (err)
                        goto error;
        }

        return x;

error:
        /* mark the half-built state dead before dropping the reference */
        x->km.state = XFRM_STATE_DEAD;
        xfrm_state_put(x);
error_no_put:
        *errp = err;
        return NULL;
}

/*
 * Netlink handler for SA creation and update.
 *
 * Verifies the request, constructs the state, then either adds it
 * (XFRM_MSG_NEWSA) or updates an existing one (any other message type
 * routed here).  The outcome is audited in both the success and failure
 * case, and on success key managers are notified via km_state_notify().
 *
 * Returns 0 on success or a negative errno.
 */
static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_usersa_info *p = nlmsg_data(nlh);
        struct xfrm_state *x;
        int err;
        struct km_event c;

        err = verify_newsa_info(p, attrs);
        if (err)
                return err;

        x = xfrm_state_construct(net, p, attrs, &err);
        if (!x)
                return err;

        /* extra reference taken here; it is the one dropped at "out" */
        xfrm_state_hold(x);
        if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
                err = xfrm_state_add(x);
        else
                err = xfrm_state_update(x);

        xfrm_audit_state_add(x, err ? 0 : 1, true);

        if (err < 0) {
                /*
                 * Failure: mark the state dead, undo any device offload and
                 * drop one reference here; "out" drops the other.
                 */
                x->km.state = XFRM_STATE_DEAD;
                xfrm_dev_state_delete(x);
                __xfrm_state_put(x);
                goto out;
        }

        if (x->km.state == XFRM_STATE_VOID)
                x->km.state = XFRM_STATE_VALID;

        /* describe the originating request in the notification */
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        c.event = nlh->nlmsg_type;

        km_state_notify(x, &c);
out:
        xfrm_state_put(x);
        return err;
}

/*
 * Find the state identified by @p (plus the mark and, where needed, the
 * XFRMA_SRCADDR attribute).
 *
 * When p->proto matches IPSEC_PROTO_ANY the lookup is by destination
 * address, SPI, protocol and family.  Otherwise the lookup is by address
 * pair and a source address attribute is mandatory.
 *
 * Returns the state, or NULL.  On NULL, *@errp (if @errp is non-NULL) is
 * set to -EINVAL when the required source address is missing and to -ESRCH
 * when nothing was found.
 */
static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
                                                 struct xfrm_usersa_id *p,
                                                 struct nlattr **attrs,
                                                 int *errp)
{
        struct xfrm_mark m;
        u32 mark = xfrm_mark_get(attrs, &m);
        struct xfrm_state *found = NULL;
        xfrm_address_t *saddr = NULL;
        int err = -ESRCH;

        if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
                found = xfrm_state_lookup(net, mark, &p->daddr, p->spi,
                                          p->proto, p->family);
        } else {
                verify_one_addr(attrs, XFRMA_SRCADDR, &saddr);
                if (saddr)
                        found = xfrm_state_lookup_byaddr(net, mark,
                                                         &p->daddr, saddr,
                                                         p->proto, p->family);
                else
                        err = -EINVAL;
        }

        if (!found && errp)
                *errp = err;

        return found;
}

/*
 * Netlink handler for SA deletion.
 *
 * Looks the state up, asks the security module for permission, refuses
 * states for which xfrm_state_kern() is true (-EPERM), deletes the state
 * and notifies key managers.  Once a state has been found, the attempt is
 * audited whether or not it succeeded.
 *
 * Returns 0 on success or a negative errno.
 */
static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_usersa_id *id = nlmsg_data(nlh);
        struct xfrm_state *x;
        struct km_event c;
        int err = -ESRCH;

        x = xfrm_user_state_lookup(net, id, attrs, &err);
        if (!x)
                return err;

        err = security_xfrm_state_delete(x);
        if (err != 0)
                goto audit;

        if (xfrm_state_kern(x)) {
                err = -EPERM;
                goto audit;
        }

        err = xfrm_state_delete(x);
        if (err >= 0) {
                c.seq = nlh->nlmsg_seq;
                c.portid = nlh->nlmsg_pid;
                c.event = nlh->nlmsg_type;
                km_state_notify(x, &c);
        }

audit:
        xfrm_audit_state_delete(x, err ? 0 : 1, true);
        xfrm_state_put(x);
        return err;
}

/*
 * Fill the user-visible SA description @p from state @x.
 *
 * @p is zeroed first, so any field not set below (and any padding) is
 * zero.  The stats members are written with put_unaligned().
 */
static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
        memset(p, 0, sizeof(*p));

        /* scalar properties */
        p->family = x->props.family;
        p->mode = x->props.mode;
        p->reqid = x->props.reqid;
        p->flags = x->props.flags;
        p->replay_window = x->props.replay_window;
        p->seq = x->km.seq;

        /* blocks copied verbatim */
        memcpy(&p->id, &x->id, sizeof(p->id));
        memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr));
        memcpy(&p->sel, &x->sel, sizeof(p->sel));
        memcpy(&p->lft, &x->lft, sizeof(p->lft));
        memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));

        /* statistics */
        put_unaligned(x->stats.replay_window, &p->stats.replay_window);
        put_unaligned(x->stats.replay, &p->stats.replay);
        put_unaligned(x->stats.integrity_failed, &p->stats.integrity_failed);
}

/* Context handed to dump_one_state() through its opaque pointer argument. */
struct xfrm_dump_info {
        struct sk_buff *in_skb;         /* request skb; its NETLINK_CB portid addresses the reply */
        struct sk_buff *out_skb;        /* skb the XFRM_MSG_NEWSA messages are appended to */
        u32 nlmsg_seq;                  /* sequence number placed in each message header */
        u16 nlmsg_flags;                /* flags placed in each message header */
};

/*
 * Append security context @s to @skb as an XFRMA_SEC_CTX attribute: a
 * struct xfrm_user_sec_ctx header immediately followed by the context
 * string.
 *
 * Returns 0 on success, -EMSGSIZE if the attribute does not fit.
 */
static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb)
{
        int total = sizeof(struct xfrm_user_sec_ctx) + s->ctx_len;
        struct nlattr *nla = nla_reserve(skb, XFRMA_SEC_CTX, total);
        struct xfrm_user_sec_ctx *hdr;

        if (!nla)
                return -EMSGSIZE;

        hdr = nla_data(nla);

        hdr->exttype = XFRMA_SEC_CTX;
        hdr->len = total;
        hdr->ctx_doi = s->ctx_doi;
        hdr->ctx_alg = s->ctx_alg;
        hdr->ctx_len = s->ctx_len;

        /* the context string lives right behind the header */
        memcpy(hdr + 1, s->ctx_str, s->ctx_len);

        return 0;
}

/*
 * Append the offload description of @xso to @skb as XFRMA_OFFLOAD_DEV.
 *
 * The payload is zeroed before the interface index and, for inbound
 * offload, the XFRM_OFFLOAD_INBOUND flag are filled in.
 *
 * Returns 0 on success, -EMSGSIZE if the attribute does not fit.
 */
static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb)
{
        struct xfrm_user_offload *uo;
        struct nlattr *nla = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*uo));

        if (!nla)
                return -EMSGSIZE;

        uo = nla_data(nla);
        memset(uo, 0, sizeof(*uo));

        if (xso->dir == XFRM_DEV_OFFLOAD_IN)
                uo->flags = XFRM_OFFLOAD_INBOUND;
        uo->ifindex = xso->dev->ifindex;

        return 0;
}

/*
 * Append @auth to @skb as a legacy XFRMA_ALG_AUTH attribute.
 *
 * The attribute uses the struct xfrm_algo layout (name, key length in
 * bits, key bytes), i.e. without the truncation length carried by
 * struct xfrm_algo_auth.
 *
 * Returns 0 on success, -EMSGSIZE if the attribute does not fit in @skb.
 */
static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
{
        struct xfrm_algo *algo;
        struct nlattr *nla;

        /* key length is in bits; round up to whole bytes */
        nla = nla_reserve(skb, XFRMA_ALG_AUTH,
                          sizeof(*algo) + (auth->alg_key_len + 7) / 8);
        if (!nla)
                return -EMSGSIZE;

        algo = nla_data(nla);
        /*
         * strscpy_pad() always NUL-terminates the destination and zero-fills
         * the remainder of the field, so the name sent out is terminated
         * even if the source were not, and no bytes of the freshly reserved
         * payload are left unwritten.
         */
        strscpy_pad(algo->alg_name, auth->alg_name, sizeof(algo->alg_name));
        memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8);
        algo->alg_key_len = auth->alg_key_len;

        return 0;
}

/*
 * Append the output mark @m to @skb as XFRMA_SET_MARK followed by
 * XFRMA_SET_MARK_MASK.  Nothing is emitted when both value and mask are
 * zero.
 *
 * Returns 0 on success or the error from nla_put_u32().
 */
static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
{
        int err;

        if (!(m->v | m->m))
                return 0;

        err = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
        if (err)
                return err;

        return nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
}

/*
 * Fill the fixed SA description @p and append every optional attribute of
 * state @x to @skb.
 *
 * Attributes are emitted only for the parts of the state that are set; the
 * replay state is always emitted, as XFRMA_REPLAY_ESN_VAL when the state
 * has ESN replay data and as XFRMA_REPLAY_VAL otherwise.
 *
 * Returns 0 on success or the first error reported while appending an
 * attribute; attributes already appended are not removed here.
 *
 * Don't change this without updating xfrm_sa_len!
 */
static int copy_to_user_state_extra(struct xfrm_state *x,
                                    struct xfrm_usersa_info *p,
                                    struct sk_buff *skb)
{
        int ret = 0;

        copy_to_user_state(x, p);

        if (x->props.extra_flags) {
                ret = nla_put_u32(skb, XFRMA_SA_EXTRA_FLAGS,
                                  x->props.extra_flags);
                if (ret)
                        goto out;
        }

        if (x->coaddr) {
                ret = nla_put(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
                if (ret)
                        goto out;
        }
        if (x->lastused) {
                ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused,
                                        XFRMA_PAD);
                if (ret)
                        goto out;
        }
        if (x->aead) {
                ret = nla_put(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead);
                if (ret)
                        goto out;
        }
        /* the auth algorithm is sent twice: legacy XFRMA_ALG_AUTH, then XFRMA_ALG_AUTH_TRUNC */
        if (x->aalg) {
                ret = copy_to_user_auth(x->aalg, skb);
                if (!ret)
                        ret = nla_put(skb, XFRMA_ALG_AUTH_TRUNC,
                                      xfrm_alg_auth_len(x->aalg), x->aalg);
                if (ret)
                        goto out;
        }
        if (x->ealg) {
                ret = nla_put(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg);
                if (ret)
                        goto out;
        }
        /* only the fixed-size part of the compression algorithm is sent */
        if (x->calg) {
                ret = nla_put(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
                if (ret)
                        goto out;
        }
        if (x->encap) {
                ret = nla_put(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
                if (ret)
                        goto out;
        }
        if (x->tfcpad) {
                ret = nla_put_u32(skb, XFRMA_TFCPAD, x->tfcpad);
                if (ret)
                        goto out;
        }
        ret = xfrm_mark_put(skb, &x->mark);
        if (ret)
                goto out;

        ret = xfrm_smark_put(skb, &x->props.smark);
        if (ret)
                goto out;

        /* exactly one of the two replay attributes is always present */
        if (x->replay_esn)
                ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
                              xfrm_replay_state_esn_len(x->replay_esn),
                              x->replay_esn);
        else
                ret = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
                              &x->replay);
        if (ret)
                goto out;
        if(x->xso.dev)
                ret = copy_user_offload(&x->xso, skb);
        if (ret)
                goto out;
        if (x->if_id) {
                ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
                if (ret)
                        goto out;
        }
        if (x->security) {
                ret = copy_sec_ctx(x->security, skb);
                if (ret)
                        goto out;
        }
        if (x->mapping_maxage)
                ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
out:
        return ret;
}

/*
 * Append one XFRM_MSG_NEWSA message describing @x to the output skb.
 *
 * @x:     state to describe
 * @count: unused here
 * @ptr:   struct xfrm_dump_info supplying the skbs, sequence and flags
 *
 * If a translator is registered, its alloc_compat() hook is run on the
 * finished message.  On any failure the partially built message is
 * cancelled.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the error
 * from attribute copying / the translator.
 */
static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
{
        struct xfrm_dump_info *info = ptr;
        struct sk_buff *out = info->out_skb;
        struct xfrm_translator *xtr;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(out, NETLINK_CB(info->in_skb).portid, info->nlmsg_seq,
                        XFRM_MSG_NEWSA, sizeof(struct xfrm_usersa_info),
                        info->nlmsg_flags);
        if (!nlh)
                return -EMSGSIZE;

        err = copy_to_user_state_extra(x, nlmsg_data(nlh), out);
        if (err)
                goto cancel;

        nlmsg_end(out, nlh);

        xtr = xfrm_get_translator();
        if (!xtr)
                return 0;

        err = xtr->alloc_compat(out, nlh);
        xfrm_put_translator(xtr);
        if (!err)
                return 0;

cancel:
        nlmsg_cancel(out, nlh);
        return err;
}

/*
 * Dump completion callback: finish the state walk kept in cb->args[1..],
 * but only if cb->args[0] shows that the walk was ever initialised.
 * Always returns 0.
 */
static int xfrm_dump_sa_done(struct netlink_callback *cb)
{
        struct net *net = sock_net(cb->skb->sk);

        if (!cb->args[0])
                return 0;

        xfrm_state_walk_done((struct xfrm_state_walk *) &cb->args[1], net);
        return 0;
}

/*
 * Netlink dump callback for SAs.
 *
 * The walk state lives in cb->args[1..] and cb->args[0] records whether it
 * has been initialised.  On the first call the request's attributes are
 * parsed to pick up the optional address filter and protocol, and the walk
 * is set up; every call then continues the walk, emitting one message per
 * state through dump_one_state().
 *
 * Returns skb->len after walking, or a negative errno if the first-call
 * setup fails (attribute parsing, filter allocation or filter validation).
 * The return value of the walk itself is deliberately discarded.
 */
static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
        struct xfrm_dump_info info;

        /* the walk state must fit in cb->args behind the args[0] flag */
        BUILD_BUG_ON(sizeof(struct xfrm_state_walk) >
                     sizeof(cb->args) - sizeof(cb->args[0]));

        info.in_skb = cb->skb;
        info.out_skb = skb;
        info.nlmsg_seq = cb->nlh->nlmsg_seq;
        info.nlmsg_flags = NLM_F_MULTI;

        /* first invocation for this dump: parse the request and start the walk */
        if (!cb->args[0]) {
                struct nlattr *attrs[XFRMA_MAX+1];
                struct xfrm_address_filter *filter = NULL;
                u8 proto = 0;
                int err;

                err = nlmsg_parse_deprecated(cb->nlh, 0, attrs, XFRMA_MAX,
                                             xfrma_policy, cb->extack);
                if (err < 0)
                        return err;

                if (attrs[XFRMA_ADDRESS_FILTER]) {
                        filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]),
                                         sizeof(*filter), GFP_KERNEL);
                        if (filter == NULL)
                                return -ENOMEM;

                        /* see addr_match(), (prefix length >> 5) << 2
                         * will be used to compare xfrm_address_t
                         */
                        if (filter->splen > (sizeof(xfrm_address_t) << 3) ||
                            filter->dplen > (sizeof(xfrm_address_t) << 3)) {
                                kfree(filter);
                                return -EINVAL;
                        }
                }

                if (attrs[XFRMA_PROTO])
                        proto = nla_get_u8(attrs[XFRMA_PROTO]);

                /* NOTE(review): the filter pointer is handed to the walk here and not freed in this function; presumably the walk owns it from now on - confirm */
                xfrm_state_walk_init(walk, proto, filter);
                cb->args[0] = 1;
        }

        (void) xfrm_state_walk(net, walk, dump_one_state, &info);

        return skb->len;
}

/*
 * Build a stand-alone netlink message describing state @x.
 *
 * @in_skb: request skb, used by dump_one_state() to address the reply
 * @x:      state to describe
 * @seq:    sequence number for the message header
 *
 * Returns a freshly allocated skb on success, or an ERR_PTR() carrying
 * -ENOMEM or the error from dump_one_state().
 */
static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
                                          struct xfrm_state *x, u32 seq)
{
        struct xfrm_dump_info info;
        struct sk_buff *reply;
        int err;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!reply)
                return ERR_PTR(-ENOMEM);

        info.nlmsg_flags = 0;
        info.nlmsg_seq = seq;
        info.out_skb = reply;
        info.in_skb = in_skb;

        err = dump_one_state(x, 0, &info);
        if (!err)
                return reply;

        kfree_skb(reply);
        return ERR_PTR(err);
}

/* A wrapper for nlmsg_multicast() checking that nlsk is still available.
 * Must be called with RCU read lock.
 *
 * If a translator is registered, its alloc_compat() hook is run on the
 * message first.  @skb is freed here on both failure paths (-EPIPE when
 * there is no socket, or the translator's error).
 */
static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
                                       u32 pid, unsigned int group)
{
        struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
        struct xfrm_translator *xtr;
        int err;

        if (!nlsk) {
                err = -EPIPE;
                goto drop;
        }

        xtr = xfrm_get_translator();
        if (xtr) {
                err = xtr->alloc_compat(skb, nlmsg_hdr(skb));
                xfrm_put_translator(xtr);
                if (err)
                        goto drop;
        }

        return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);

drop:
        kfree_skb(skb);
        return err;
}

/*
 * Payload size needed for an SPD info reply: the aligned 4-byte flags word
 * plus one attribute each for the SPD counters and hash info, and two
 * hash-threshold attributes.
 */
static inline unsigned int xfrm_spdinfo_msgsize(void)
{
        return NLMSG_ALIGN(4)
               + nla_total_size(sizeof(struct xfrmu_spdinfo))
               + nla_total_size(sizeof(struct xfrmu_spdhinfo))
               + 2 * nla_total_size(sizeof(struct xfrmu_spdhthresh));
}

/*
 * Build an XFRM_MSG_NEWSPDINFO message in @skb.
 *
 * The message payload is the 32-bit @flags word, followed by attributes
 * carrying the SPD counters, the hash table info and the IPv4 and IPv6
 * hash thresholds.  The thresholds are sampled under the policy_hthresh
 * seqlock, retrying until a consistent snapshot is read.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
 * nla_put() error (the message is cancelled in that case).
 */
static int build_spdinfo(struct sk_buff *skb, struct net *net,
                         u32 portid, u32 seq, u32 flags)
{
        struct xfrmu_spdhthresh thresh4, thresh6;
        struct xfrmu_spdhinfo hinfo;
        struct xfrmu_spdinfo counts;
        struct xfrmk_spdinfo si;
        struct nlmsghdr *nlh;
        unsigned int lseq;
        int err;

        nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
        if (!nlh) /* shouldn't really happen ... */
                return -EMSGSIZE;

        *(u32 *)nlmsg_data(nlh) = flags;

        xfrm_spd_getinfo(net, &si);

        counts.incnt = si.incnt;
        counts.outcnt = si.outcnt;
        counts.fwdcnt = si.fwdcnt;
        counts.inscnt = si.inscnt;
        counts.outscnt = si.outscnt;
        counts.fwdscnt = si.fwdscnt;

        hinfo.spdhcnt = si.spdhcnt;
        hinfo.spdhmcnt = si.spdhmcnt;

        do {
                lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

                thresh4.lbits = net->xfrm.policy_hthresh.lbits4;
                thresh4.rbits = net->xfrm.policy_hthresh.rbits4;
                thresh6.lbits = net->xfrm.policy_hthresh.lbits6;
                thresh6.rbits = net->xfrm.policy_hthresh.rbits6;
        } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq));

        err = nla_put(skb, XFRMA_SPD_INFO, sizeof(counts), &counts);
        if (err)
                goto cancel;

        err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(hinfo), &hinfo);
        if (err)
                goto cancel;

        err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4);
        if (err)
                goto cancel;

        err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Netlink handler that updates the selector prefix-length thresholds used
 * to hash policies.
 *
 * Either or both of XFRMA_SPD_IPV4_HTHRESH / XFRMA_SPD_IPV6_HTHRESH may be
 * supplied.  Each attribute must be at least sizeof(struct
 * xfrmu_spdhthresh) long and its lbits/rbits must not exceed 32 (IPv4) or
 * 128 (IPv6), otherwise -EINVAL is returned and nothing is changed.
 * When at least one attribute is present the new values are stored under
 * the policy_hthresh seqlock and a policy hash rebuild is requested.
 *
 * Always returns 0 once validation has passed.
 */
static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs)
{
        struct nlattr *attr4 = attrs[XFRMA_SPD_IPV4_HTHRESH];
        struct nlattr *attr6 = attrs[XFRMA_SPD_IPV6_HTHRESH];
        struct xfrmu_spdhthresh *thresh4 = NULL;
        struct xfrmu_spdhthresh *thresh6 = NULL;
        struct net *net = sock_net(skb->sk);

        if (attr4) {
                if (nla_len(attr4) < sizeof(*thresh4))
                        return -EINVAL;
                thresh4 = nla_data(attr4);
                if (thresh4->lbits > 32 || thresh4->rbits > 32)
                        return -EINVAL;
        }

        if (attr6) {
                if (nla_len(attr6) < sizeof(*thresh6))
                        return -EINVAL;
                thresh6 = nla_data(attr6);
                if (thresh6->lbits > 128 || thresh6->rbits > 128)
                        return -EINVAL;
        }

        /* Nothing to change: leave the thresholds and hash tables alone. */
        if (!thresh4 && !thresh6)
                return 0;

        write_seqlock(&net->xfrm.policy_hthresh.lock);
        if (thresh4) {
                net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
                net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
        }
        if (thresh6) {
                net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
                net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
        }
        write_sequnlock(&net->xfrm.policy_hthresh.lock);

        xfrm_policy_hash_rebuild(net);

        return 0;
}

/*
 * Netlink handler: reply to the requester with an XFRM_MSG_NEWSPDINFO
 * message built by build_spdinfo().
 *
 * The u32 at the start of the request payload is echoed back as the
 * reply's flags word.
 *
 * Returns -ENOMEM if the reply skb cannot be allocated, the
 * build_spdinfo() error if the message could not be built, otherwise the
 * result of nlmsg_unicast().
 */
static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct sk_buff *r_skb;
        u32 *flags = nlmsg_data(nlh);
        u32 sportid = NETLINK_CB(skb).portid;
        u32 seq = nlh->nlmsg_seq;
        int err;

        r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
        if (r_skb == NULL)
                return -ENOMEM;

        err = build_spdinfo(r_skb, net, sportid, seq, *flags);
        /*
         * The skb was sized by xfrm_spdinfo_msgsize(), so a failure here
         * means the size estimate is out of sync with build_spdinfo().
         * Warn and fail the request instead of taking the machine down.
         */
        if (WARN_ON(err < 0)) {
                kfree_skb(r_skb);
                return err;
        }

        return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
}

/*
 * Size of the message produced by build_sadinfo(): a 4-byte payload plus
 * the XFRMA_SAD_HINFO and XFRMA_SAD_CNT attributes.
 */
static inline unsigned int xfrm_sadinfo_msgsize(void)
{
        unsigned int size = NLMSG_ALIGN(4);

        size += nla_total_size(sizeof(struct xfrmu_sadhinfo));
        size += nla_total_size(4); /* XFRMA_SAD_CNT */

        return size;
}

/*
 * Build an XFRM_MSG_NEWSADINFO message in @skb.
 *
 * The fixed payload is the u32 @flags word, followed by XFRMA_SAD_CNT and
 * XFRMA_SAD_HINFO filled from xfrm_sad_getinfo().
 *
 * Returns 0 on success, -EMSGSIZE if the netlink header does not fit, or
 * the attribute put error after cancelling the partial message.
 */
static int build_sadinfo(struct sk_buff *skb, struct net *net,
                         u32 portid, u32 seq, u32 flags)
{
        struct xfrmu_sadhinfo hash_info;
        struct xfrmk_sadinfo kinfo;
        struct nlmsghdr *nlh;
        u32 *payload;
        int err;

        nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0);
        if (!nlh) /* shouldn't really happen ... */
                return -EMSGSIZE;

        payload = nlmsg_data(nlh);
        *payload = flags;

        xfrm_sad_getinfo(net, &kinfo);
        hash_info.sadhmcnt = kinfo.sadhmcnt;
        hash_info.sadhcnt = kinfo.sadhcnt;

        err = nla_put_u32(skb, XFRMA_SAD_CNT, kinfo.sadcnt);
        if (err)
                goto cancel;

        err = nla_put(skb, XFRMA_SAD_HINFO, sizeof(hash_info), &hash_info);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Netlink handler: reply to the requester with an XFRM_MSG_NEWSADINFO
 * message built by build_sadinfo().
 *
 * The u32 at the start of the request payload is echoed back as the
 * reply's flags word.
 *
 * Returns -ENOMEM if the reply skb cannot be allocated, the
 * build_sadinfo() error if the message could not be built, otherwise the
 * result of nlmsg_unicast().
 */
static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct sk_buff *r_skb;
        u32 *flags = nlmsg_data(nlh);
        u32 sportid = NETLINK_CB(skb).portid;
        u32 seq = nlh->nlmsg_seq;
        int err;

        r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC);
        if (r_skb == NULL)
                return -ENOMEM;

        err = build_sadinfo(r_skb, net, sportid, seq, *flags);
        /*
         * The skb was sized by xfrm_sadinfo_msgsize(), so a failure here
         * means the size estimate is out of sync with build_sadinfo().
         * Warn and fail the request instead of taking the machine down.
         */
        if (WARN_ON(err < 0)) {
                kfree_skb(r_skb);
                return err;
        }

        return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
}

/*
 * Netlink handler: look up the state identified by the request and
 * unicast its netlink description back to the requester.
 *
 * Returns the error left by xfrm_user_state_lookup() (initially -ESRCH)
 * when no state is found, the xfrm_state_netlink() error if the reply
 * cannot be built, otherwise the result of nlmsg_unicast().
 */
static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct xfrm_usersa_id *id = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct sk_buff *reply;
        struct xfrm_state *x;
        int err = -ESRCH;

        x = xfrm_user_state_lookup(net, id, attrs, &err);
        if (!x)
                return err;

        reply = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
        if (IS_ERR(reply))
                err = PTR_ERR(reply);
        else
                err = nlmsg_unicast(net->xfrm.nlsk, reply,
                                    NETLINK_CB(skb).portid);

        /* Drop the reference obtained from the lookup. */
        xfrm_state_put(x);
        return err;
}

/*
 * Netlink handler: allocate an SPI in the range [p->min, p->max] for a
 * state matching the request and send the resulting state back to the
 * requester.
 *
 * The state is looked up by sequence number first (when p->info.seq is
 * non-zero and the destination address matches), then via
 * xfrm_find_acq().
 *
 * Returns the verify_spi_info() error for an invalid request, -ENOENT when
 * no state could be found, the error from xfrm_alloc_spi(),
 * xfrm_state_netlink() or the translator's alloc_compat(), otherwise the
 * result of nlmsg_unicast().
 */
static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct xfrm_userspi_info *p;
        struct xfrm_translator *xtr;
        struct sk_buff *resp_skb;
        xfrm_address_t *daddr;
        int family;
        int err;
        u32 mark;
        struct xfrm_mark m;
        u32 if_id = 0;

        p = nlmsg_data(nlh);
        err = verify_spi_info(p->info.id.proto, p->min, p->max);
        if (err)
                goto out_noput;

        family = p->info.family;
        daddr = &p->info.id.daddr;

        x = NULL;

        mark = xfrm_mark_get(attrs, &m);

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        /*
         * Prefer the state found by sequence number, but only if its
         * destination address matches the request; otherwise drop the
         * reference and fall through to the generic lookup.
         */
        if (p->info.seq) {
                x = xfrm_find_acq_byseq(net, mark, p->info.seq);
                if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
                        xfrm_state_put(x);
                        x = NULL;
                }
        }

        if (!x)
                x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
                                  if_id, p->info.id.proto, daddr,
                                  &p->info.saddr, 1,
                                  family);
        err = -ENOENT;
        if (x == NULL)
                goto out_noput;

        /* From here on we hold a reference on x; exit through "out". */
        err = xfrm_alloc_spi(x, p->min, p->max);
        if (err)
                goto out;

        resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
        if (IS_ERR(resp_skb)) {
                err = PTR_ERR(resp_skb);
                goto out;
        }

        xtr = xfrm_get_translator();
        if (xtr) {
                /*
                 * NOTE(review): alloc_compat() is handed the request skb
                 * and its header here, not resp_skb - confirm this is
                 * intended.
                 */
                err = xtr->alloc_compat(skb, nlmsg_hdr(skb));

                xfrm_put_translator(xtr);
                if (err) {
                        /* The reply was never sent, so free it ourselves. */
                        kfree_skb(resp_skb);
                        goto out;
                }
        }

        err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid);

out:
        xfrm_state_put(x);
out_noput:
        return err;
}

/*
 * Check that @dir is one of XFRM_POLICY_IN, XFRM_POLICY_OUT or
 * XFRM_POLICY_FWD.  Returns 0 if so, -EINVAL otherwise.
 */
static int verify_policy_dir(u8 dir)
{
        if (dir == XFRM_POLICY_IN ||
            dir == XFRM_POLICY_OUT ||
            dir == XFRM_POLICY_FWD)
                return 0;

        return -EINVAL;
}

/*
 * Check that @type is a supported policy type: XFRM_POLICY_TYPE_MAIN, and
 * XFRM_POLICY_TYPE_SUB when CONFIG_XFRM_SUB_POLICY is set.
 * Returns 0 if supported, -EINVAL otherwise.
 */
static int verify_policy_type(u8 type)
{
        if (type == XFRM_POLICY_TYPE_MAIN)
                return 0;
#ifdef CONFIG_XFRM_SUB_POLICY
        if (type == XFRM_POLICY_TYPE_SUB)
                return 0;
#endif
        return -EINVAL;
}

/*
 * Validate the fixed part of a new-policy request.
 *
 * Checks, in order: the share mode, the action, the selector family and
 * its prefix lengths (at most 32 for AF_INET, 128 for AF_INET6), the
 * direction, and - when an index is supplied - that the direction encoded
 * in the index matches p->dir.
 *
 * Returns 0 if the request is acceptable, -EAFNOSUPPORT for AF_INET6
 * without CONFIG_IPV6, the verify_policy_dir() error for a bad direction,
 * and -EINVAL for every other violation.
 */
static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
{
        int err;

        if (p->share != XFRM_SHARE_ANY &&
            p->share != XFRM_SHARE_SESSION &&
            p->share != XFRM_SHARE_USER &&
            p->share != XFRM_SHARE_UNIQUE)
                return -EINVAL;

        if (p->action != XFRM_POLICY_ALLOW &&
            p->action != XFRM_POLICY_BLOCK)
                return -EINVAL;

        if (p->sel.family == AF_INET) {
                if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
                        return -EINVAL;
        } else if (p->sel.family == AF_INET6) {
#if IS_ENABLED(CONFIG_IPV6)
                if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
                        return -EINVAL;
#else
                return  -EAFNOSUPPORT;
#endif
        } else {
                return -EINVAL;
        }

        err = verify_policy_dir(p->dir);
        if (err)
                return err;

        /* A caller-chosen index must encode the same direction. */
        if (p->index && (xfrm_policy_id2dir(p->index) != p->dir))
                return -EINVAL;

        return 0;
}

/*
 * Attach a security context to @pol from the XFRMA_SEC_CTX attribute.
 * Returns 0 when the attribute is absent, otherwise the result of
 * security_xfrm_policy_alloc().
 */
static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs)
{
        struct nlattr *attr = attrs[XFRMA_SEC_CTX];

        if (attr) {
                struct xfrm_user_sec_ctx *uctx = nla_data(attr);

                return security_xfrm_policy_alloc(&pol->security, uctx,
                                                  GFP_KERNEL);
        }

        return 0;
}

/*
 * Copy @nr user-supplied templates from @ut into xp->xfrm_vec and record
 * the count in xp->xfrm_nr.  No bounds checking is done here.
 */
static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
                           int nr)
{
        int idx;

        xp->xfrm_nr = nr;

        for (idx = 0; idx < nr; idx++) {
                struct xfrm_user_tmpl *src = &ut[idx];
                struct xfrm_tmpl *dst = &xp->xfrm_vec[idx];

                memcpy(&dst->id, &src->id, sizeof(struct xfrm_id));
                memcpy(&dst->saddr, &src->saddr, sizeof(xfrm_address_t));

                dst->encap_family = src->family;
                dst->reqid = src->reqid;
                dst->mode = src->mode;
                dst->share = src->share;
                dst->optional = src->optional;

                dst->aalgos = src->aalgos;
                dst->ealgos = src->ealgos;
                dst->calgos = src->calgos;
                /* If all masks are ~0, then we allow all algorithms. */
                dst->allalgs = !~(dst->aalgos & dst->ealgos & dst->calgos);
        }
}

/*
 * Validate @nr user templates in @ut against the policy @family.
 *
 * Side effect: a template whose family is zero gets it set to @family.
 *
 * Rejects (with -EINVAL) more than XFRM_MAX_DEPTH templates, a family
 * change between consecutive templates unless the mode is tunnel or BEET,
 * a mode >= XFRM_MODE_MAX, a family other than AF_INET (or AF_INET6 when
 * CONFIG_IPV6 is enabled), and a protocol refused by
 * xfrm_id_proto_valid().  Returns 0 otherwise.
 */
static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
{
        u16 prev_family = family;
        int idx;

        if (nr > XFRM_MAX_DEPTH)
                return -EINVAL;

        for (idx = 0; idx < nr; idx++) {
                struct xfrm_user_tmpl *tmpl = &ut[idx];

                /*
                 * Historically the template family was never checked and
                 * was assumed to equal the policy family, so many
                 * applications leave it at zero.  With ipv4-in-ipv6 and
                 * ipv6-in-ipv4 tunnels that assumption no longer holds,
                 * so default a zero family to the policy family here.
                 */
                if (!tmpl->family)
                        tmpl->family = family;

                /* Only tunnel and BEET modes may switch address family. */
                if (tmpl->mode != XFRM_MODE_TUNNEL &&
                    tmpl->mode != XFRM_MODE_BEET &&
                    tmpl->family != prev_family)
                        return -EINVAL;

                if (tmpl->mode >= XFRM_MODE_MAX)
                        return -EINVAL;

                prev_family = tmpl->family;

                switch (tmpl->family) {
                case AF_INET:
#if IS_ENABLED(CONFIG_IPV6)
                case AF_INET6:
#endif
                        break;
                default:
                        return -EINVAL;
                }

                if (!xfrm_id_proto_valid(tmpl->id.proto))
                        return -EINVAL;
        }

        return 0;
}

/*
 * Load the templates carried in XFRMA_TMPL into @pol.
 *
 * Without the attribute the policy gets zero templates.  Otherwise the
 * template count is the attribute length divided by the template size;
 * the templates are validated and then copied.
 *
 * Returns 0 on success or the validate_tmpl() error.
 */
static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
{
        struct nlattr *attr = attrs[XFRMA_TMPL];
        struct xfrm_user_tmpl *utmpl;
        int count;
        int err;

        if (!attr) {
                pol->xfrm_nr = 0;
                return 0;
        }

        utmpl = nla_data(attr);
        count = nla_len(attr) / sizeof(*utmpl);

        err = validate_tmpl(count, utmpl, pol->family);
        if (err)
                return err;

        copy_templates(pol, utmpl, count);
        return 0;
}

/*
 * Extract the policy type from XFRMA_POLICY_TYPE, defaulting to
 * XFRM_POLICY_TYPE_MAIN when the attribute is absent.
 *
 * On success stores the type in *@tp and returns 0; otherwise returns the
 * verify_policy_type() error and leaves *@tp untouched.
 */
static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs)
{
        struct nlattr *attr = attrs[XFRMA_POLICY_TYPE];
        u8 type;
        int err;

        if (attr) {
                struct xfrm_userpolicy_type *upt = nla_data(attr);

                type = upt->type;
        } else {
                type = XFRM_POLICY_TYPE_MAIN;
        }

        err = verify_policy_type(type);
        if (err)
                return err;

        *tp = type;
        return 0;
}

/*
 * Initialise the kernel policy @xp from the user-supplied description @p:
 * selector, lifetime limits, priority, index, action and flags.  The
 * policy family is taken from the selector.  p->share is not copied.
 */
static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
{
        memcpy(&xp->selector, &p->sel, sizeof(xp->selector));
        memcpy(&xp->lft, &p->lft, sizeof(xp->lft));

        xp->family = p->sel.family;
        xp->index = p->index;
        xp->priority = p->priority;
        xp->flags = p->flags;
        xp->action = p->action;
        /* XXX xp->share = p->share; */
}

/*
 * Fill the user-visible description @p from the kernel policy @xp for
 * direction @dir.  @p is zeroed first, so every byte not assigned below
 * reads as zero.  The share mode is always reported as XFRM_SHARE_ANY.
 */
static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir)
{
        memset(p, 0, sizeof(*p));

        memcpy(&p->sel, &xp->selector, sizeof(p->sel));
        /* Report the policy's own family, overriding the selector copy. */
        p->sel.family = xp->family;

        memcpy(&p->lft, &xp->lft, sizeof(p->lft));
        memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));

        p->dir = dir;
        p->index = xp->index;
        p->priority = xp->priority;
        p->flags = xp->flags;
        p->action = xp->action;
        p->share = XFRM_SHARE_ANY; /* XXX xp->share */
}

/*
 * Allocate a policy and populate it from the request @p and @attrs:
 * fixed fields, policy type, templates, security context, mark and
 * (when XFRMA_IF_ID is present) the interface id.
 *
 * Returns the new policy, or NULL with *@errp set to -ENOMEM or to the
 * error from the failing copy step.  On a copy failure the half-built
 * policy is marked dead and destroyed.
 */
static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp)
{
        struct xfrm_policy *xp;
        int err;

        xp = xfrm_policy_alloc(net, GFP_KERNEL);
        if (!xp) {
                *errp = -ENOMEM;
                return NULL;
        }

        copy_from_user_policy(xp, p);

        err = copy_from_user_policy_type(&xp->type, attrs);
        if (err)
                goto error;

        err = copy_from_user_tmpl(xp, attrs);
        if (err)
                goto error;

        err = copy_from_user_sec_ctx(xp, attrs);
        if (err)
                goto error;

        xfrm_mark_get(attrs, &xp->mark);

        if (attrs[XFRMA_IF_ID])
                xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        return xp;

error:
        *errp = err;
        xp->walk.dead = 1;
        xfrm_policy_destroy(xp);
        return NULL;
}

/*
 * Netlink handler for adding or updating a policy.
 *
 * The request is validated, a policy is constructed from it and inserted
 * for direction p->dir.  Insertion is exclusive when the message type is
 * XFRM_MSG_NEWPOLICY.  The attempt is audited either way.  On success key
 * managers are notified and the local reference is dropped.
 *
 * Returns 0 on success or the error from validation, construction or
 * insertion.
 */
static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct xfrm_userpolicy_info *p = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct km_event c;
        int excl;
        int err;

        err = verify_newpolicy_info(p);
        if (!err)
                err = verify_sec_ctx_len(attrs);
        if (err)
                return err;

        xp = xfrm_policy_construct(net, p, attrs, &err);
        if (!xp)
                return err;

        /*
         * Exclusivity is derived from the message type rather than from
         * netlink header flags: this mirrors pfkey, which is why a
         * separate XFRM_MSG_UPDPOLICY type exists at all - JHS
         */
        excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;

        err = xfrm_policy_insert(p->dir, xp, excl);
        xfrm_audit_policy_add(xp, err ? 0 : 1, true);
        if (err) {
                /* Never inserted: release it directly. */
                security_xfrm_policy_free(xp->security);
                kfree(xp);
                return err;
        }

        c.portid = nlh->nlmsg_pid;
        c.seq = nlh->nlmsg_seq;
        c.event = nlh->nlmsg_type;
        km_policy_notify(xp, p->dir, &c);

        xfrm_pol_put(xp);

        return 0;
}

/*
 * Append the policy's templates to @skb as a single XFRMA_TMPL attribute.
 *
 * Returns 0 without adding anything when the policy has no templates,
 * -ENOBUFS when it claims more than XFRM_MAX_DEPTH, otherwise the result
 * of nla_put().
 */
static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
{
        struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
        int idx;

        if (!xp->xfrm_nr)
                return 0;

        if (xp->xfrm_nr > XFRM_MAX_DEPTH)
                return -ENOBUFS;

        for (idx = 0; idx < xp->xfrm_nr; idx++) {
                struct xfrm_tmpl *src = &xp->xfrm_vec[idx];
                struct xfrm_user_tmpl *dst = &vec[idx];

                /* Zero first so bytes not assigned below are not stack data. */
                memset(dst, 0, sizeof(*dst));

                memcpy(&dst->id, &src->id, sizeof(dst->id));
                memcpy(&dst->saddr, &src->saddr, sizeof(dst->saddr));

                dst->family = src->encap_family;
                dst->reqid = src->reqid;
                dst->mode = src->mode;
                dst->share = src->share;
                dst->optional = src->optional;
                dst->aalgos = src->aalgos;
                dst->ealgos = src->ealgos;
                dst->calgos = src->calgos;
        }

        return nla_put(skb, XFRMA_TMPL,
                       sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr, vec);
}

/*
 * Append the state's security context to @skb, if it has one.
 * Returns 0 when there is none, otherwise the copy_sec_ctx() result.
 */
static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb)
{
        if (!x->security)
                return 0;

        return copy_sec_ctx(x->security, skb);
}

/*
 * Append the policy's security context to @skb, if it has one.
 * Returns 0 when there is none, otherwise the copy_sec_ctx() result.
 */
static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
{
        if (!xp->security)
                return 0;

        return copy_sec_ctx(xp->security, skb);
}
/*
 * Space needed for the XFRMA_POLICY_TYPE attribute: the attribute is only
 * accounted for when CONFIG_XFRM_SUB_POLICY is enabled, otherwise 0.
 */
static inline unsigned int userpolicy_type_attrsize(void)
{
        unsigned int size = 0;

#ifdef CONFIG_XFRM_SUB_POLICY
        size = nla_total_size(sizeof(struct xfrm_userpolicy_type));
#endif
        return size;
}

#ifdef CONFIG_XFRM_SUB_POLICY
/*
 * Append @type to @skb as an XFRMA_POLICY_TYPE attribute.
 * Returns the nla_put() result.
 */
static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
{
        struct xfrm_userpolicy_type attr;

        /*
         * struct xfrm_userpolicy_type has two holes; clear the whole
         * structure so they are sent as zeroes.
         */
        memset(&attr, 0, sizeof(attr));
        attr.type = type;

        return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(attr), &attr);
}

#else
/* Without sub-policy support no policy type attribute is emitted. */
static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
{
        return 0;
}
#endif

/*
 * Append one XFRM_MSG_NEWPOLICY message describing @xp to the output skb
 * carried in @ptr (a struct xfrm_dump_info).
 *
 * The message holds the fixed policy info followed by the templates,
 * security context, policy type, mark and interface id attributes.  If a
 * translator is registered its alloc_compat() hook is run on the finished
 * message.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
 * failing step's error after the message has been cancelled.  @count is
 * unused.
 */
static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
{
        struct xfrm_dump_info *info = ptr;
        struct sk_buff *in_skb = info->in_skb;
        struct sk_buff *skb = info->out_skb;
        struct xfrm_userpolicy_info *upinfo;
        struct xfrm_translator *xtr;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq,
                        XFRM_MSG_NEWPOLICY, sizeof(*upinfo), info->nlmsg_flags);
        if (!nlh)
                return -EMSGSIZE;

        upinfo = nlmsg_data(nlh);
        copy_to_user_policy(xp, upinfo, dir);

        err = copy_to_user_tmpl(xp, skb);
        if (err)
                goto cancel;

        err = copy_to_user_sec_ctx(xp, skb);
        if (err)
                goto cancel;

        err = copy_to_user_policy_type(xp->type, skb);
        if (err)
                goto cancel;

        err = xfrm_mark_put(skb, &xp->mark);
        if (err)
                goto cancel;

        err = xfrm_if_id_put(skb, xp->if_id);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);

        xtr = xfrm_get_translator();
        if (!xtr)
                return 0;

        err = xtr->alloc_compat(skb, nlh);
        xfrm_put_translator(xtr);
        if (err)
                goto cancel;

        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Dump "done" callback: finish the policy walk stored in cb->args.
 * Always returns 0.
 */
static int xfrm_dump_policy_done(struct netlink_callback *cb)
{
        struct net *net = sock_net(cb->skb->sk);
        struct xfrm_policy_walk *walk;

        walk = (struct xfrm_policy_walk *)cb->args;
        xfrm_policy_walk_done(walk, net);

        return 0;
}

/*
 * Dump "start" callback: initialise the policy walk kept in cb->args to
 * cover every policy type.  Always returns 0.
 */
static int xfrm_dump_policy_start(struct netlink_callback *cb)
{
        struct xfrm_policy_walk *walk;

        /* The walk state lives inside cb->args, so it has to fit there. */
        BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args));

        walk = (struct xfrm_policy_walk *)cb->args;
        xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);

        return 0;
}

/*
 * Dump callback: continue the policy walk kept in cb->args, emitting each
 * policy into @skb as an NLM_F_MULTI message via dump_one_policy().
 *
 * The walk's return value is deliberately ignored; the amount of data now
 * in @skb is returned instead.
 */
static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
        struct net *net = sock_net(skb->sk);
        struct xfrm_dump_info info;

        info.out_skb = skb;
        info.in_skb = cb->skb;
        info.nlmsg_flags = NLM_F_MULTI;
        info.nlmsg_seq = cb->nlh->nlmsg_seq;

        (void) xfrm_policy_walk(net, walk, dump_one_policy, &info);

        return skb->len;
}

/*
 * Allocate a new skb holding a single XFRM_MSG_NEWPOLICY message that
 * describes @xp for direction @dir, addressed to the sender of @in_skb
 * with sequence number @seq.
 *
 * Returns the skb, ERR_PTR(-ENOMEM) if allocation fails, or an ERR_PTR()
 * carrying the dump_one_policy() error (the skb is freed in that case).
 */
static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
                                          struct xfrm_policy *xp,
                                          int dir, u32 seq)
{
        struct xfrm_dump_info info;
        struct sk_buff *reply;
        int err;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return ERR_PTR(-ENOMEM);

        info.in_skb = in_skb;
        info.out_skb = reply;
        info.nlmsg_seq = seq;
        info.nlmsg_flags = 0;

        err = dump_one_policy(xp, dir, 0, &info);
        if (!err)
                return reply;

        kfree_skb(reply);
        return ERR_PTR(err);
}

/*
 * Multicast the namespace's current default policies (in/fwd/out) to the
 * XFRMNLGRP_POLICY group as an XFRM_MSG_GETDEFAULT message.
 *
 * Returns -ENOMEM or -EMSGSIZE if the message cannot be built, otherwise
 * the result of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_userpolicy(struct net *net)
{
        struct xfrm_userpolicy_default *defaults;
        int len = NLMSG_ALIGN(sizeof(*defaults));
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*defaults), 0);
        if (!nlh) {
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        defaults = nlmsg_data(nlh);
        defaults->in = net->xfrm.policy_default[XFRM_POLICY_IN];
        defaults->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD];
        defaults->out = net->xfrm.policy_default[XFRM_POLICY_OUT];

        nlmsg_end(skb, nlh);

        rcu_read_lock();
        err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);
        rcu_read_unlock();

        return err;
}

/*
 * True when @policy is one of the two accepted default-policy values,
 * XFRM_USERPOLICY_BLOCK or XFRM_USERPOLICY_ACCEPT.
 */
static bool xfrm_userpolicy_is_valid(__u8 policy)
{
        if (policy == XFRM_USERPOLICY_BLOCK)
                return true;

        return policy == XFRM_USERPOLICY_ACCEPT;
}

/*
 * Netlink handler: update the namespace's default policies.
 *
 * Each of the in/fwd/out fields is applied independently and only when it
 * holds a valid value; invalid fields leave the corresponding default
 * unchanged.  Afterwards the route generation id is bumped and a
 * notification is sent; the notification result is ignored.
 *
 * Always returns 0.
 */
static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs)
{
        struct xfrm_userpolicy_default *req = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);

        if (xfrm_userpolicy_is_valid(req->in))
                net->xfrm.policy_default[XFRM_POLICY_IN] = req->in;

        if (xfrm_userpolicy_is_valid(req->fwd))
                net->xfrm.policy_default[XFRM_POLICY_FWD] = req->fwd;

        if (xfrm_userpolicy_is_valid(req->out))
                net->xfrm.policy_default[XFRM_POLICY_OUT] = req->out;

        rt_genid_bump_all(net);
        xfrm_notify_userpolicy(net);

        return 0;
}

/*
 * Netlink handler: unicast the namespace's current default policies
 * (in/fwd/out) back to the requester as an XFRM_MSG_GETDEFAULT message.
 *
 * Returns -ENOMEM or -EMSGSIZE if the reply cannot be built, otherwise
 * the result of nlmsg_unicast().
 */
static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs)
{
        int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default));
        struct net *net = sock_net(skb->sk);
        u32 portid = NETLINK_CB(skb).portid;
        struct xfrm_userpolicy_default *reply;
        struct nlmsghdr *reply_nlh;
        struct sk_buff *reply_skb;
        u32 seq = nlh->nlmsg_seq;

        reply_skb = nlmsg_new(len, GFP_ATOMIC);
        if (!reply_skb)
                return -ENOMEM;

        reply_nlh = nlmsg_put(reply_skb, portid, seq, XFRM_MSG_GETDEFAULT,
                              sizeof(*reply), 0);
        if (!reply_nlh) {
                kfree_skb(reply_skb);
                return -EMSGSIZE;
        }

        reply = nlmsg_data(reply_nlh);
        reply->in = net->xfrm.policy_default[XFRM_POLICY_IN];
        reply->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD];
        reply->out = net->xfrm.policy_default[XFRM_POLICY_OUT];
        nlmsg_end(reply_skb, reply_nlh);

        return nlmsg_unicast(net->xfrm.nlsk, reply_skb, portid);
}

/*
 * Netlink handler shared by policy lookup and policy deletion; the
 * request is treated as a delete when its type is XFRM_MSG_DELPOLICY.
 *
 * The policy is located by index when p->index is non-zero, otherwise by
 * selector plus optional security context.  For a lookup the policy is
 * sent back to the requester; for a delete the operation is audited and,
 * on success, key managers are notified.
 *
 * Returns the validation error for a bad type, direction or security
 * context, -ENOENT when no policy is found, otherwise the error left by
 * the lookup/delete or by sending the reply.
 */
static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct xfrm_userpolicy_id *p;
        u8 type = XFRM_POLICY_TYPE_MAIN;
        int err;
        struct km_event c;
        int delete;
        struct xfrm_mark m;
        u32 if_id = 0;

        p = nlmsg_data(nlh);
        delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;

        err = copy_from_user_policy_type(&type, attrs);
        if (err)
                return err;

        err = verify_policy_dir(p->dir);
        if (err)
                return err;

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        xfrm_mark_get(attrs, &m);

        /* The lookup helpers may also write a result through &err. */
        if (p->index)
                xp = xfrm_policy_byid(net, &m, if_id, type, p->dir,
                                      p->index, delete, &err);
        else {
                struct nlattr *rt = attrs[XFRMA_SEC_CTX];
                struct xfrm_sec_ctx *ctx;

                err = verify_sec_ctx_len(attrs);
                if (err)
                        return err;

                ctx = NULL;
                if (rt) {
                        struct xfrm_user_sec_ctx *uctx = nla_data(rt);

                        err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL);
                        if (err)
                                return err;
                }
                xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, p->dir,
                                           &p->sel, ctx, delete, &err);
                /* The temporary context was only needed for the lookup. */
                security_xfrm_policy_free(ctx);
        }
        if (xp == NULL)
                return -ENOENT;

        if (!delete) {
                struct sk_buff *resp_skb;

                resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
                if (IS_ERR(resp_skb)) {
                        err = PTR_ERR(resp_skb);
                } else {
                        err = nlmsg_unicast(net->xfrm.nlsk, resp_skb,
                                            NETLINK_CB(skb).portid);
                }
        } else {
                /* Audit the attempt whether or not the delete succeeded. */
                xfrm_audit_policy_delete(xp, err ? 0 : 1, true);

                if (err != 0)
                        goto out;

                c.data.byid = p->index;
                c.event = nlh->nlmsg_type;
                c.seq = nlh->nlmsg_seq;
                c.portid = nlh->nlmsg_pid;
                km_policy_notify(xp, p->dir, &c);
        }

out:
        /* Drop the reference returned by the lookup. */
        xfrm_pol_put(xp);
        return err;
}

/*
 * Netlink handler: flush all states of the requested protocol and notify
 * key managers.
 *
 * An -ESRCH result from the flush (empty table) is reported as success
 * without sending a notification.  Any other flush error is returned
 * unchanged.
 */
static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct xfrm_usersa_flush *req = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct km_event c;
        int err;

        err = xfrm_state_flush(net, req->proto, true);
        if (err == -ESRCH) /* empty table */
                return 0;
        if (err)
                return err;

        c.net = net;
        c.portid = nlh->nlmsg_pid;
        c.seq = nlh->nlmsg_seq;
        c.event = nlh->nlmsg_type;
        c.data.proto = req->proto;
        km_state_notify(NULL, &c);

        return 0;
}

/*
 * Upper bound on the size of the message produced by build_aevent() for
 * state @x.
 *
 * Every attribute build_aevent() may emit is accounted for, including the
 * ones it only adds conditionally (thresholds, mark, interface id), so
 * that a buffer of this size can always hold the message.
 */
static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x)
{
        /* Either the ESN replay state or the legacy one is sent, never both. */
        unsigned int replay_size = x->replay_esn ?
                              xfrm_replay_state_esn_len(x->replay_esn) :
                              sizeof(struct xfrm_replay_state);

        return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
               + nla_total_size(replay_size)
               + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur))
               + nla_total_size(sizeof(struct xfrm_mark))
               + nla_total_size(4) /* XFRM_AE_RTHR */
               + nla_total_size(4) /* XFRM_AE_ETHR */
               + nla_total_size(4); /* XFRMA_IF_ID, via xfrm_if_id_put() */
}

/*
 * Build an XFRM_MSG_NEWAE message for state @x in @skb.
 *
 * The fixed part identifies the state and carries c->data.aevent as the
 * flags.  Attributes follow in this order: replay state (ESN variant when
 * x->replay_esn is set), current lifetime, the replay threshold if
 * XFRM_AE_RTHR is flagged, the timer threshold if XFRM_AE_ETHR is
 * flagged, then mark and interface id.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
 * first attribute error after the partial message has been cancelled.
 */
static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
        struct xfrm_aevent_id *id;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0);
        if (!nlh)
                return -EMSGSIZE;

        id = nlmsg_data(nlh);

        /* Clear sa_id before filling it in field by field. */
        memset(&id->sa_id, 0, sizeof(id->sa_id));
        memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
        id->sa_id.spi = x->id.spi;
        id->sa_id.family = x->props.family;
        id->sa_id.proto = x->id.proto;

        memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr));
        id->reqid = x->props.reqid;
        id->flags = c->data.aevent;

        if (x->replay_esn)
                err = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
                              xfrm_replay_state_esn_len(x->replay_esn),
                              x->replay_esn);
        else
                err = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
                              &x->replay);

        if (!err)
                err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft),
                                    &x->curlft, XFRMA_PAD);

        if (!err && (id->flags & XFRM_AE_RTHR))
                err = nla_put_u32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff);

        if (!err && (id->flags & XFRM_AE_ETHR))
                err = nla_put_u32(skb, XFRMA_ETIMER_THRESH,
                                  x->replay_maxage * 10 / HZ);

        if (!err)
                err = xfrm_mark_put(skb, &x->mark);

        if (!err)
                err = xfrm_if_id_put(skb, x->if_id);

        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Netlink handler: look up the state named in the request and unicast an
 * XFRM_MSG_NEWAE message describing it back to the requester.  The
 * request's flags select which optional thresholds are included.
 *
 * Returns -ESRCH if the state does not exist, -ENOMEM if the reply skb
 * cannot be allocated, the build_aevent() error if the message could not
 * be built, otherwise the result of nlmsg_unicast().
 */
static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct sk_buff *r_skb;
        int err;
        struct km_event c;
        u32 mark;
        struct xfrm_mark m;
        struct xfrm_aevent_id *p = nlmsg_data(nlh);
        struct xfrm_usersa_id *id = &p->sa_id;

        mark = xfrm_mark_get(attrs, &m);

        x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family);
        if (x == NULL)
                return -ESRCH;

        r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
        if (r_skb == NULL) {
                xfrm_state_put(x);
                return -ENOMEM;
        }

        /*
         * XXX: is this lock really needed - none of the other
         * gets lock (the concern is things getting updated
         * while we are still reading) - jhs
        */
        spin_lock_bh(&x->lock);
        c.data.aevent = p->flags;
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;

        err = build_aevent(r_skb, x, &c);
        /*
         * The skb was sized by xfrm_aevent_msgsize(), so a failure here
         * means the size estimate is out of sync with build_aevent().
         * Warn and fail the request instead of taking the machine down.
         */
        if (WARN_ON(err < 0)) {
                spin_unlock_bh(&x->lock);
                kfree_skb(r_skb);
                goto out_put;
        }

        err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid);
        spin_unlock_bh(&x->lock);
out_put:
        xfrm_state_put(x);
        return err;
}

/*
 * Netlink handler: update the replay/lifetime/threshold parameters of an
 * existing state from the supplied attributes and notify key managers
 * with XFRM_AE_CU.
 *
 * Returns -EINVAL when none of the relevant attributes is present, when
 * NLM_F_REPLACE is not set, or when the state is not XFRM_STATE_VALID;
 * -ESRCH when the state does not exist; the xfrm_replay_verify_len()
 * error for a bad ESN attribute; 0 on success.
 */
static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct nlattr *replay = attrs[XFRMA_REPLAY_VAL];
        struct nlattr *replay_esn = attrs[XFRMA_REPLAY_ESN_VAL];
        struct nlattr *lifetime = attrs[XFRMA_LTIME_VAL];
        struct nlattr *timer_thresh = attrs[XFRMA_ETIMER_THRESH];
        struct nlattr *replay_thresh = attrs[XFRMA_REPLAY_THRESH];
        struct xfrm_aevent_id *p = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct xfrm_mark m;
        struct km_event c;
        u32 mark = 0;
        int err;

        /* There has to be something to update. */
        if (!lifetime && !replay && !replay_esn &&
            !timer_thresh && !replay_thresh)
                return -EINVAL;

        /* pedantic mode - thou shalt sayeth replaceth */
        if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
                return -EINVAL;

        mark = xfrm_mark_get(attrs, &m);

        x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi,
                              p->sa_id.proto, p->sa_id.family);
        if (!x)
                return -ESRCH;

        err = -EINVAL;
        if (x->km.state != XFRM_STATE_VALID)
                goto out;

        err = xfrm_replay_verify_len(x->replay_esn, replay_esn);
        if (err)
                goto out;

        spin_lock_bh(&x->lock);
        xfrm_update_ae_params(x, attrs, 1);
        spin_unlock_bh(&x->lock);

        c.data.aevent = XFRM_AE_CU;
        c.portid = nlh->nlmsg_pid;
        c.seq = nlh->nlmsg_seq;
        c.event = nlh->nlmsg_type;
        km_state_notify(x, &c);
        err = 0;
out:
        xfrm_state_put(x);
        return err;
}

/*
 * Netlink handler that flushes all policies of the requested type (main type
 * unless the request carries another one) and notifies the key managers.
 *
 * An -ESRCH result from xfrm_policy_flush() is treated as "empty table" and
 * reported as success without sending a notification.
 */
static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        u8 pol_type = XFRM_POLICY_TYPE_MAIN;
        struct km_event c;
        int err;

        err = copy_from_user_policy_type(&pol_type, attrs);
        if (err)
                return err;

        err = xfrm_policy_flush(net, pol_type, true);
        if (err == -ESRCH)
                return 0;       /* empty table */
        if (err)
                return err;

        c.net = net;
        c.portid = nlh->nlmsg_pid;
        c.seq = nlh->nlmsg_seq;
        c.event = nlh->nlmsg_type;
        c.data.type = pol_type;
        km_policy_notify(NULL, 0, &c);

        return 0;
}

/*
 * Netlink handler for a policy expire request from user space.
 *
 * The policy is looked up by index when the request carries a non-zero one,
 * otherwise by selector and optional security context.  For a hard expire
 * the policy is deleted and audited; in both cases km_policy_expired() is
 * called.  A policy already marked dead is left alone.
 *
 * Returns -ENOENT when no policy is found, or the error from one of the
 * validation/allocation helpers.
 */
static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct xfrm_user_polexpire *up = nlmsg_data(nlh);
        struct xfrm_userpolicy_info *info = &up->pol;
        struct net *net = sock_net(skb->sk);
        u8 type = XFRM_POLICY_TYPE_MAIN;
        struct xfrm_policy *xp;
        struct xfrm_mark m;
        u32 if_id = 0;
        int err;

        err = copy_from_user_policy_type(&type, attrs);
        if (!err)
                err = verify_policy_dir(info->dir);
        if (err)
                return err;

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        xfrm_mark_get(attrs, &m);

        if (!info->index) {
                /* No index given: look up by selector (+ security context). */
                struct nlattr *sec_attr = attrs[XFRMA_SEC_CTX];
                struct xfrm_sec_ctx *ctx = NULL;

                err = verify_sec_ctx_len(attrs);
                if (err)
                        return err;

                if (sec_attr) {
                        struct xfrm_user_sec_ctx *uctx = nla_data(sec_attr);

                        err = security_xfrm_policy_alloc(&ctx, uctx,
                                                         GFP_KERNEL);
                        if (err)
                                return err;
                }

                xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, info->dir,
                                           &info->sel, ctx, 0, &err);
                /* The context was only needed for the lookup. */
                security_xfrm_policy_free(ctx);
        } else {
                xp = xfrm_policy_byid(net, &m, if_id, type, info->dir,
                                      info->index, 0, &err);
        }

        if (!xp)
                return -ENOENT;

        if (!unlikely(xp->walk.dead)) {
                err = 0;
                if (up->hard) {
                        xfrm_policy_delete(xp, info->dir);
                        xfrm_audit_policy_delete(xp, 1, true);
                }
                km_policy_expired(xp, info->dir, up->hard, nlh->nlmsg_pid);
        }

        /* Drop the reference obtained from the lookup. */
        xfrm_pol_put(xp);
        return err;
}

/*
 * Netlink handler for a state expire request from user space.
 *
 * Looks the state up by mark, destination address, SPI, protocol and family.
 * Under the state lock it calls km_state_expired() and, for a hard expire,
 * deletes and audits the state.
 *
 * Returns 0 on success, -ENOENT when no state is found and -EINVAL when the
 * state is not XFRM_STATE_VALID.
 */
static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct xfrm_user_expire *ue = nlmsg_data(nlh);
        struct xfrm_usersa_info *sa = &ue->state;
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct xfrm_mark m;
        u32 mark;
        int err;

        mark = xfrm_mark_get(attrs, &m);
        x = xfrm_state_lookup(net, mark, &sa->id.daddr, sa->id.spi,
                              sa->id.proto, sa->family);
        if (!x)
                return -ENOENT;

        spin_lock_bh(&x->lock);
        if (x->km.state == XFRM_STATE_VALID) {
                km_state_expired(x, ue->hard, nlh->nlmsg_pid);
                if (ue->hard) {
                        __xfrm_state_delete(x);
                        xfrm_audit_state_delete(x, 1, true);
                }
                err = 0;
        } else {
                err = -EINVAL;
        }
        spin_unlock_bh(&x->lock);

        /* Drop the reference obtained from the lookup. */
        xfrm_state_put(x);
        return err;
}

/*
 * Netlink handler for an acquire request from user space: builds a temporary
 * state and a temporary policy from the request and calls km_query() once
 * per policy template.
 *
 * Both temporaries are released before returning.  The result of km_query()
 * is not propagated; the handler returns 0 once the policy was constructed.
 * Errors: -ENOMEM when no state can be allocated, otherwise the error from
 * verify_newpolicy_info(), verify_sec_ctx_len() or xfrm_policy_construct().
 */
static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
                struct nlattr **attrs)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct xfrm_user_tmpl *ut;
        int i;
        struct nlattr *rt = attrs[XFRMA_TMPL];
        struct xfrm_mark mark;

        struct xfrm_user_acquire *ua = nlmsg_data(nlh);
        struct xfrm_state *x = xfrm_state_alloc(net);
        int err = -ENOMEM;

        if (!x)
                goto nomem;

        xfrm_mark_get(attrs, &mark);

        err = verify_newpolicy_info(&ua->policy);
        if (err)
                goto free_state;
        err = verify_sec_ctx_len(attrs);
        if (err)
                goto free_state;

        /*   build an XP */
        xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
        if (!xp)
                goto free_state;

        memcpy(&x->id, &ua->id, sizeof(ua->id));
        memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr));
        memcpy(&x->sel, &ua->sel, sizeof(ua->sel));
        xp->mark.m = x->mark.m = mark.m;
        xp->mark.v = x->mark.v = mark.v;
        /* ut is only dereferenced inside the loop, i.e. when xfrm_nr > 0 */
        ut = nla_data(rt);
        /* extract the templates and for each call km_key */
        for (i = 0; i < xp->xfrm_nr; i++, ut++) {
                struct xfrm_tmpl *t = &xp->xfrm_vec[i];
                memcpy(&x->id, &t->id, sizeof(x->id));
                x->props.mode = t->mode;
                x->props.reqid = t->reqid;
                x->props.family = ut->family;
                t->aalgos = ua->aalgos;
                t->ealgos = ua->ealgos;
                t->calgos = ua->calgos;
                err = km_query(x, t, xp);

        }

        xfrm_state_free(x);
        /*
         * The policy is freed directly here.  Release the security context
         * attached to it first (presumably allocated by
         * xfrm_policy_construct() from XFRMA_SEC_CTX - confirm); a bare
         * kfree() would leak it.  security_xfrm_policy_free() is also called
         * with a NULL context elsewhere in this file, so an unset context is
         * fine.
         */
        security_xfrm_policy_free(xp->security);
        kfree(xp);

        return 0;

free_state:
        xfrm_state_free(x);
nomem:
        return err;
}

#ifdef CONFIG_XFRM_MIGRATE
static int copy_from_user_migrate(struct xfrm_migrate *ma,
                                  struct xfrm_kmaddress *k,
                                  struct nlattr **attrs, int *num)
{
        struct nlattr *rt = attrs[XFRMA_MIGRATE];
        struct xfrm_user_migrate *um;
        int i, num_migrate;

        if (k != NULL) {
                struct xfrm_user_kmaddress *uk;

                uk = nla_data(attrs[XFRMA_KMADDRESS]);
                memcpy(&k->local, &uk->local, sizeof(k->local));
                memcpy(&k->remote, &uk->remote, sizeof(k->remote));
                k->family = uk->family;
                k->reserved = uk->reserved;
        }

        um = nla_data(rt);
        num_migrate = nla_len(rt) / sizeof(*um);

        if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH)
                return -EINVAL;

        for (i = 0; i < num_migrate; i++, um++, ma++) {
                memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr));
                memcpy(&ma->old_saddr, &um->old_saddr, sizeof(ma->old_saddr));
                memcpy(&ma->new_daddr, &um->new_daddr, sizeof(ma->new_daddr));
                memcpy(&ma->new_saddr, &um->new_saddr, sizeof(ma->new_saddr));

                ma->proto = um->proto;
                ma->mode = um->mode;
                ma->reqid = um->reqid;

                ma->old_family = um->old_family;
                ma->new_family = um->new_family;
        }

        *num = i;
        return 0;
}

static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs)
{
        struct xfrm_userpolicy_id *pi = nlmsg_data(nlh);
        struct xfrm_migrate m[XFRM_MAX_DEPTH];
        struct xfrm_kmaddress km, *kmp;
        u8 type;
        int err;
        int n = 0;
        struct net *net = sock_net(skb->sk);
        struct xfrm_encap_tmpl  *encap = NULL;
        u32 if_id = 0;

        if (attrs[XFRMA_MIGRATE] == NULL)
                return -EINVAL;

        kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL;

        err = copy_from_user_policy_type(&type, attrs);
        if (err)
                return err;

        err = copy_from_user_migrate((struct xfrm_migrate *)m, kmp, attrs, &n);
        if (err)
                return err;

        if (!n)
                return 0;

        if (attrs[XFRMA_ENCAP]) {
                encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
                                sizeof(*encap), GFP_KERNEL);
                if (!encap)
                        return 0;
        }

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id);

        kfree(encap);

        return err;
}
#else
/*
 * Variant built when CONFIG_XFRM_MIGRATE is not defined: the request is
 * always rejected with -ENOPROTOOPT.
 */
static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs)
{
        return -ENOPROTOOPT;
}
#endif

#ifdef CONFIG_XFRM_MIGRATE
static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb)
{
        struct xfrm_user_migrate um;

        memset(&um, 0, sizeof(um));
        um.proto = m->proto;
        um.mode = m->mode;
        um.reqid = m->reqid;
        um.old_family = m->old_family;
        memcpy(&um.old_daddr, &m->old_daddr, sizeof(um.old_daddr));
        memcpy(&um.old_saddr, &m->old_saddr, sizeof(um.old_saddr));
        um.new_family = m->new_family;
        memcpy(&um.new_daddr, &m->new_daddr, sizeof(um.new_daddr));
        memcpy(&um.new_saddr, &m->new_saddr, sizeof(um.new_saddr));

        return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um);
}

static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb)
{
        struct xfrm_user_kmaddress uk;

        memset(&uk, 0, sizeof(uk));
        uk.family = k->family;
        uk.reserved = k->reserved;
        memcpy(&uk.local, &k->local, sizeof(uk.local));
        memcpy(&uk.remote, &k->remote, sizeof(uk.remote));

        return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk);
}

static inline unsigned int xfrm_migrate_msgsize(int num_migrate, int with_kma,
                                                int with_encp)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id))
              + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0)
              + (with_encp ? nla_total_size(sizeof(struct xfrm_encap_tmpl)) : 0)
              + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate)
              + userpolicy_type_attrsize();
}

/*
 * Fill @skb with an XFRM_MSG_MIGRATE message.
 *
 * The header is a zeroed struct xfrm_userpolicy_id carrying @sel and @dir.
 * It is followed by the optional kmaddress and encapsulation attributes, the
 * policy type attribute and one attribute per entry of @m.
 *
 * Returns 0 on success, -EMSGSIZE when the header does not fit, or the error
 * from the first attribute helper that fails; in the latter case the
 * partially built message is cancelled.
 */
static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
                         int num_migrate, const struct xfrm_kmaddress *k,
                         const struct xfrm_selector *sel,
                         const struct xfrm_encap_tmpl *encap, u8 dir, u8 type)
{
        struct xfrm_userpolicy_id *hdr;
        struct nlmsghdr *nlh;
        int idx;
        int err = 0;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*hdr), 0);
        if (!nlh)
                return -EMSGSIZE;

        /* Start from a zeroed header, then set selector and direction. */
        hdr = nlmsg_data(nlh);
        memset(hdr, 0, sizeof(*hdr));
        memcpy(&hdr->sel, sel, sizeof(hdr->sel));
        hdr->dir = dir;

        if (k)
                err = copy_to_user_kmaddress(k, skb);
        if (!err && encap)
                err = nla_put(skb, XFRMA_ENCAP, sizeof(*encap), encap);
        if (!err)
                err = copy_to_user_policy_type(type, skb);
        for (idx = 0; !err && idx < num_migrate; idx++)
                err = copy_to_user_migrate(&m[idx], skb);

        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Build a migrate notification and multicast it to the XFRMNLGRP_MIGRATE
 * group of the initial network namespace.
 *
 * Returns -ENOMEM when the message buffer cannot be allocated, otherwise the
 * result of xfrm_nlmsg_multicast().  A failure to build the message into the
 * freshly sized buffer triggers BUG_ON().
 */
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                             const struct xfrm_migrate *m, int num_migrate,
                             const struct xfrm_kmaddress *k,
                             const struct xfrm_encap_tmpl *encap)
{
        unsigned int size = xfrm_migrate_msgsize(num_migrate, k != NULL,
                                                 encap != NULL);
        struct net *net = &init_net;
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(size, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        /* The buffer was sized for exactly this message. */
        err = build_migrate(skb, m, num_migrate, k, sel, encap, dir, type);
        BUG_ON(err < 0);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MIGRATE);
}
#else
/*
 * Variant built when CONFIG_XFRM_MIGRATE is not defined: nothing is sent and
 * -ENOPROTOOPT is returned.
 */
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                             const struct xfrm_migrate *m, int num_migrate,
                             const struct xfrm_kmaddress *k,
                             const struct xfrm_encap_tmpl *encap)
{
        return -ENOPROTOOPT;
}
#endif

/* Shorthand for the size of a struct, used only to fill the table below. */
#define XMSGSIZE(type) sizeof(struct type)

/*
 * Fixed header size for each message type, indexed by
 * (message type - XFRM_MSG_BASE).  xfrm_user_rcv_msg() passes the entry as
 * the header length when parsing the attributes of an incoming message.
 */
const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
        [XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
        [XFRM_MSG_DELSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_GETSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
        [XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info),
        [XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire),
        [XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire),
        [XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
        [XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
        [XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire),
        [XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
        [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
        [XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_GETAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
        [XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
        [XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
};
EXPORT_SYMBOL_GPL(xfrm_msg_min);

#undef XMSGSIZE

/*
 * Default netlink attribute policy, indexed by XFRMA_* attribute type.
 * xfrm_user_rcv_msg() uses it for every message type whose dispatch entry
 * does not name its own policy.  Entries give either a length (.len) or a
 * scalar attribute type (.type).
 */
const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
        [XFRMA_SA]                = { .len = sizeof(struct xfrm_usersa_info)},
        [XFRMA_POLICY]                = { .len = sizeof(struct xfrm_userpolicy_info)},
        [XFRMA_LASTUSED]        = { .type = NLA_U64},
        [XFRMA_ALG_AUTH_TRUNC]        = { .len = sizeof(struct xfrm_algo_auth)},
        [XFRMA_ALG_AEAD]        = { .len = sizeof(struct xfrm_algo_aead) },
        [XFRMA_ALG_AUTH]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_CRYPT]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_COMP]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ENCAP]                = { .len = sizeof(struct xfrm_encap_tmpl) },
        [XFRMA_TMPL]                = { .len = sizeof(struct xfrm_user_tmpl) },
        [XFRMA_SEC_CTX]                = { .len = sizeof(struct xfrm_user_sec_ctx) },
        [XFRMA_LTIME_VAL]        = { .len = sizeof(struct xfrm_lifetime_cur) },
        [XFRMA_REPLAY_VAL]        = { .len = sizeof(struct xfrm_replay_state) },
        [XFRMA_REPLAY_THRESH]        = { .type = NLA_U32 },
        [XFRMA_ETIMER_THRESH]        = { .type = NLA_U32 },
        [XFRMA_SRCADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_COADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_POLICY_TYPE]        = { .len = sizeof(struct xfrm_userpolicy_type)},
        [XFRMA_MIGRATE]                = { .len = sizeof(struct xfrm_user_migrate) },
        [XFRMA_KMADDRESS]        = { .len = sizeof(struct xfrm_user_kmaddress) },
        [XFRMA_MARK]                = { .len = sizeof(struct xfrm_mark) },
        [XFRMA_TFCPAD]                = { .type = NLA_U32 },
        [XFRMA_REPLAY_ESN_VAL]        = { .len = sizeof(struct xfrm_replay_state_esn) },
        [XFRMA_SA_EXTRA_FLAGS]        = { .type = NLA_U32 },
        [XFRMA_PROTO]                = { .type = NLA_U8 },
        [XFRMA_ADDRESS_FILTER]        = { .len = sizeof(struct xfrm_address_filter) },
        [XFRMA_OFFLOAD_DEV]        = { .len = sizeof(struct xfrm_user_offload) },
        [XFRMA_SET_MARK]        = { .type = NLA_U32 },
        [XFRMA_SET_MARK_MASK]        = { .type = NLA_U32 },
        [XFRMA_IF_ID]                = { .type = NLA_U32 },
        [XFRMA_MTIMER_THRESH]   = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(xfrma_policy);

/*
 * Attribute policy for the XFRMA_SPD_* attribute space.  The dispatch table
 * selects it for XFRM_MSG_NEWSPDINFO instead of the default policy.
 */
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
        [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
        [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
};

/*
 * Per-message-type dispatch table, indexed by (message type - XFRM_MSG_BASE)
 * and consulted by xfrm_user_rcv_msg().
 */
static const struct xfrm_link {
        /* handler for a regular (non-dump) request */
        int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
        /* start/dump/done are copied into the netlink_dump_control for dumps */
        int (*start)(struct netlink_callback *);
        int (*dump)(struct sk_buff *, struct netlink_callback *);
        int (*done)(struct netlink_callback *);
        /* optional attribute policy; the default policy is used when NULL */
        const struct nla_policy *nla_pol;
        /* optional highest attribute type; XFRMA_MAX is used when 0 */
        int nla_max;
} xfrm_dispatch[XFRM_NR_MSGTYPES] = {
        [XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
        [XFRM_MSG_DELSA       - XFRM_MSG_BASE] = { .doit = xfrm_del_sa        },
        [XFRM_MSG_GETSA       - XFRM_MSG_BASE] = { .doit = xfrm_get_sa,
                                                   .dump = xfrm_dump_sa,
                                                   .done = xfrm_dump_sa_done  },
        [XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
        /* delete and get share one handler */
        [XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy    },
        [XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
                                                   .start = xfrm_dump_policy_start,
                                                   .dump = xfrm_dump_policy,
                                                   .done = xfrm_dump_policy_done },
        [XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
        [XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire   },
        [XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire },
        /* update and new share one handler */
        [XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
        [XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
        [XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire},
        [XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa      },
        [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy  },
        [XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = { .doit = xfrm_new_ae  },
        [XFRM_MSG_GETAE       - XFRM_MSG_BASE] = { .doit = xfrm_get_ae  },
        [XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate    },
        [XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo   },
        [XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo,
                                                   .nla_pol = xfrma_spd_policy,
                                                   .nla_max = XFRMA_SPD_MAX },
        [XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
        [XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_set_default   },
        [XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_get_default   },
};

/*
 * Process one netlink message received on the xfrm socket.
 *
 * The message type selects an entry of xfrm_dispatch.  Dump requests for
 * XFRM_MSG_GETSA and XFRM_MSG_GETPOLICY are handed to netlink_dump_start();
 * every other message has its attributes parsed and is passed to the entry's
 * doit handler.
 *
 * Returns the handler or dump result, -EINVAL for a type above XFRM_MSG_MAX
 * or an entry without the needed callback, -EPERM without CAP_NET_ADMIN,
 * -EOPNOTSUPP when a compat caller has no translator available, or the
 * error from the compat translation or attribute parsing.
 */
static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *attrs[XFRMA_MAX+1];
        const struct xfrm_link *link;
        /* translated copy of the message for compat callers, if any */
        struct nlmsghdr *nlh64 = NULL;
        int type, err;

        type = nlh->nlmsg_type;
        if (type > XFRM_MSG_MAX)
                return -EINVAL;

        /* From here on, type is an index into the per-type tables. */
        type -= XFRM_MSG_BASE;
        link = &xfrm_dispatch[type];

        /* All operations require privileges, even GET */
        if (!netlink_net_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (in_compat_syscall()) {
                struct xfrm_translator *xtr = xfrm_get_translator();

                if (!xtr)
                        return -EOPNOTSUPP;

                nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max,
                                            link->nla_pol, extack);
                xfrm_put_translator(xtr);
                if (IS_ERR(nlh64))
                        return PTR_ERR(nlh64);
                /* A NULL result keeps the original message in use. */
                if (nlh64)
                        nlh = nlh64;
        }

        if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
             type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
            (nlh->nlmsg_flags & NLM_F_DUMP)) {
                struct netlink_dump_control c = {
                        .start = link->start,
                        .dump = link->dump,
                        .done = link->done,
                };

                if (link->dump == NULL) {
                        err = -EINVAL;
                        goto err;
                }

                err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c);
                goto err;
        }

        /* Fall back to the default policy when the entry names none. */
        err = nlmsg_parse_deprecated(nlh, xfrm_msg_min[type], attrs,
                                     link->nla_max ? : XFRMA_MAX,
                                     link->nla_pol ? : xfrma_policy, extack);
        if (err < 0)
                goto err;

        if (link->doit == NULL) {
                err = -EINVAL;
                goto err;
        }

        err = link->doit(skb, nlh, attrs);

        /* We need to free skb allocated in xfrm_alloc_compat() before
         * returning from this function, because consume_skb() won't take
         * care of frag_list since netlink destructor sets
         * skb->head to NULL. (see netlink_skb_destructor())
         */
        if (skb_has_frag_list(skb)) {
                kfree_skb(skb_shinfo(skb)->frag_list);
                skb_shinfo(skb)->frag_list = NULL;
        }

err:
        /* nlh64 is NULL unless the compat path produced a translated copy. */
        kvfree(nlh64);
        return err;
}

/*
 * Input callback of the xfrm netlink socket: every message in @skb is
 * processed by xfrm_user_rcv_msg() while the per-namespace configuration
 * mutex is held.
 */
static void xfrm_netlink_rcv(struct sk_buff *skb)
{
        struct mutex *cfg_mutex = &sock_net(skb->sk)->xfrm.xfrm_cfg_mutex;

        mutex_lock(cfg_mutex);
        netlink_rcv_skb(skb, &xfrm_user_rcv_msg);
        mutex_unlock(cfg_mutex);
}

/*
 * Payload size reserved for a state expire notification: the aligned
 * struct xfrm_user_expire header plus one mark attribute.
 */
static inline unsigned int xfrm_expire_msgsize(void)
{
        unsigned int size = NLMSG_ALIGN(sizeof(struct xfrm_user_expire));

        size += nla_total_size(sizeof(struct xfrm_mark));

        return size;
}

/*
 * Fill @skb with an XFRM_MSG_EXPIRE message describing state @x.
 *
 * The header carries the user-space view of the state and a 0/1 hard flag;
 * the mark and interface id are appended through their put helpers.
 * Returns 0 on success, -EMSGSIZE when the header does not fit, or the error
 * from one of the put helpers.
 */
static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
        struct xfrm_user_expire *exp;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_EXPIRE, sizeof(*exp), 0);
        if (!nlh)
                return -EMSGSIZE;

        exp = nlmsg_data(nlh);
        copy_to_user_state(x, &exp->state);
        exp->hard = !!c->data.hard;
        /* zero everything in the header that follows the hard flag */
        memset(&exp->hard + 1, 0, sizeof(*exp) - offsetofend(typeof(*exp), hard));

        err = xfrm_mark_put(skb, &x->mark);
        if (!err)
                err = xfrm_if_id_put(skb, x->if_id);
        if (err)
                return err;

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Multicast a state expire notification to the XFRMNLGRP_EXPIRE group.
 *
 * Returns -ENOMEM when no buffer can be allocated, -EMSGSIZE when the
 * message cannot be built (the buffer is freed), otherwise the result of
 * xfrm_nlmsg_multicast().
 */
static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;

        msg = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        if (build_expire(msg, x, c) >= 0)
                return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_EXPIRE);

        kfree_skb(msg);
        return -EMSGSIZE;
}

/*
 * Multicast an async event notification for state @x to the
 * XFRMNLGRP_AEVENTS group.
 *
 * Returns -ENOMEM when no buffer can be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failure to build the message into the freshly
 * sized buffer triggers BUG_ON().
 */
static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_aevent(msg, x, c);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_AEVENTS);
}

/*
 * Multicast an XFRM_MSG_FLUSHSA notification carrying the flushed protocol
 * to the XFRMNLGRP_SA group of the namespace named in the event.
 *
 * Returns -ENOMEM when no buffer can be allocated, -EMSGSIZE when the header
 * does not fit, otherwise the result of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_sa_flush(const struct km_event *c)
{
        const int payload = NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush));
        struct xfrm_usersa_flush *flush;
        struct net *net = c->net;
        struct nlmsghdr *nlh;
        struct sk_buff *msg;

        msg = nlmsg_new(payload, GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        nlh = nlmsg_put(msg, c->portid, c->seq, XFRM_MSG_FLUSHSA,
                        sizeof(*flush), 0);
        if (!nlh) {
                kfree_skb(msg);
                return -EMSGSIZE;
        }

        flush = nlmsg_data(nlh);
        flush->proto = c->data.proto;
        nlmsg_end(msg, nlh);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_SA);
}

static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
{
        unsigned int l = 0;
        if (x->aead)
                l += nla_total_size(aead_len(x->aead));
        if (x->aalg) {
                l += nla_total_size(sizeof(struct xfrm_algo) +
                                    (x->aalg->alg_key_len + 7) / 8);
                l += nla_total_size(xfrm_alg_auth_len(x->aalg));
        }
        if (x->ealg)
                l += nla_total_size(xfrm_alg_len(x->ealg));
        if (x->calg)
                l += nla_total_size(sizeof(*x->calg));
        if (x->encap)
                l += nla_total_size(sizeof(*x->encap));
        if (x->tfcpad)
                l += nla_total_size(sizeof(x->tfcpad));
        if (x->replay_esn)
                l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn));
        else
                l += nla_total_size(sizeof(struct xfrm_replay_state));
        if (x->security)
                l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
                                    x->security->ctx_len);
        if (x->coaddr)
                l += nla_total_size(sizeof(*x->coaddr));
        if (x->props.extra_flags)
                l += nla_total_size(sizeof(x->props.extra_flags));
        if (x->xso.dev)
                 l += nla_total_size(sizeof(struct xfrm_user_offload));
        if (x->props.smark.v | x->props.smark.m) {
                l += nla_total_size(sizeof(x->props.smark.v));
                l += nla_total_size(sizeof(x->props.smark.m));
        }
        if (x->if_id)
                l += nla_total_size(sizeof(x->if_id));

        /* Must count x->lastused as it may become non-zero behind our back. */
        l += nla_total_size_64bit(sizeof(u64));

        if (x->mapping_maxage)
                l += nla_total_size(sizeof(x->mapping_maxage));

        return l;
}

static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct xfrm_usersa_info *p;
        struct xfrm_usersa_id *id;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        unsigned int len = xfrm_sa_len(x);
        unsigned int headlen;
        int err;

        headlen = sizeof(*p);
        if (c->event == XFRM_MSG_DELSA) {
                len += nla_total_size(headlen);
                headlen = sizeof(*id);
                len += nla_total_size(sizeof(struct xfrm_mark));
        }
        len += NLMSG_ALIGN(headlen);

        skb = nlmsg_new(len, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
        err = -EMSGSIZE;
        if (nlh == NULL)
                goto out_free_skb;

        p = nlmsg_data(nlh);
        if (c->event == XFRM_MSG_DELSA) {
                struct nlattr *attr;

                id = nlmsg_data(nlh);
                memset(id, 0, sizeof(*id));
                memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
                id->spi = x->id.spi;
                id->family = x->props.family;
                id->proto = x->id.proto;

                attr = nla_reserve(skb, XFRMA_SA, sizeof(*p));
                err = -EMSGSIZE;
                if (attr == NULL)
                        goto out_free_skb;

                p = nla_data(attr);
        }
        err = copy_to_user_state_extra(x, p, skb);
        if (err)
                goto out_free_skb;

        nlmsg_end(skb, nlh);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA);

out_free_skb:
        kfree_skb(skb);
        return err;
}

/*
 * Route a state event to the notifier matching its event type.
 *
 * Returns the notifier's result; an unknown event type is logged and
 * reported as 0.
 */
static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        if (c->event == XFRM_MSG_EXPIRE)
                return xfrm_exp_state_notify(x, c);

        if (c->event == XFRM_MSG_NEWAE)
                return xfrm_aevent_state_notify(x, c);

        if (c->event == XFRM_MSG_DELSA ||
            c->event == XFRM_MSG_UPDSA ||
            c->event == XFRM_MSG_NEWSA)
                return xfrm_notify_sa(x, c);

        if (c->event == XFRM_MSG_FLUSHSA)
                return xfrm_notify_sa_flush(c);

        printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n",
               c->event);

        return 0;
}

/*
 * Payload size reserved for an acquire message: the aligned header, the
 * template array of @xp, a mark, the security context of @x and the policy
 * type attribute.
 */
static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x,
                                                struct xfrm_policy *xp)
{
        unsigned int size = NLMSG_ALIGN(sizeof(struct xfrm_user_acquire));

        size += nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
        size += nla_total_size(sizeof(struct xfrm_mark));
        size += nla_total_size(xfrm_user_sec_ctx_size(x->security));
        size += userpolicy_type_attrsize();

        return size;
}

/*
 * Fill @skb with an XFRM_MSG_ACQUIRE message for state @x, template @xt and
 * policy @xp.  The sequence number obtained here is stored both in the
 * message and in the state.
 *
 * Returns 0 on success, -EMSGSIZE when the header does not fit, or the error
 * from the first attribute helper that fails; in the latter case the
 * partially built message is cancelled.
 */
static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
                         struct xfrm_tmpl *xt, struct xfrm_policy *xp)
{
        /* obtained before the header is placed, as in every call */
        __u32 acq_seq = xfrm_get_acqseq();
        struct xfrm_user_acquire *acq;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*acq), 0);
        if (!nlh)
                return -EMSGSIZE;

        acq = nlmsg_data(nlh);
        memcpy(&acq->id, &x->id, sizeof(acq->id));
        memcpy(&acq->saddr, &x->props.saddr, sizeof(acq->saddr));
        memcpy(&acq->sel, &x->sel, sizeof(acq->sel));
        copy_to_user_policy(xp, &acq->policy, XFRM_POLICY_OUT);
        acq->aalgos = xt->aalgos;
        acq->ealgos = xt->ealgos;
        acq->calgos = xt->calgos;
        x->km.seq = acq_seq;
        acq->seq = acq_seq;

        err = copy_to_user_tmpl(xp, skb);
        if (err)
                goto cancel;
        err = copy_to_user_state_sec_ctx(x, skb);
        if (err)
                goto cancel;
        err = copy_to_user_policy_type(xp->type, skb);
        if (err)
                goto cancel;
        err = xfrm_mark_put(skb, &xp->mark);
        if (err)
                goto cancel;
        err = xfrm_if_id_put(skb, xp->if_id);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Build an acquire message and multicast it to the XFRMNLGRP_ACQUIRE group.
 *
 * Returns -ENOMEM when no buffer can be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failure to build the message into the freshly
 * sized buffer triggers BUG_ON().
 */
static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
                             struct xfrm_policy *xp)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_acquire(msg, x, xt, xp);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_ACQUIRE);
}

/* Build a policy from a socket option buffer: a struct xfrm_userpolicy_info
 * followed by an array of 0 or more struct xfrm_user_tmpl.
 *
 * On success the new policy is returned and *dir holds its direction.  On
 * failure NULL is returned and *dir holds a negative errno: -EOPNOTSUPP when
 * @opt does not match the socket family, -ENOBUFS when no policy can be
 * allocated, -EINVAL otherwise.
 */
static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
                                               u8 *data, int len, int *dir)
{
        struct xfrm_userpolicy_info *info = (struct xfrm_userpolicy_info *)data;
        struct xfrm_user_tmpl *tmpls = (struct xfrm_user_tmpl *)(info + 1);
        struct net *net = sock_net(sk);
        struct xfrm_policy *xp;
        int expected_opt;
        int ntmpl;

        /* Each supported family accepts exactly one option name. */
        switch (sk->sk_family) {
        case AF_INET:
                expected_opt = IP_XFRM_POLICY;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                expected_opt = IPV6_XFRM_POLICY;
                break;
#endif
        default:
                *dir = -EINVAL;
                return NULL;
        }

        if (opt != expected_opt) {
                *dir = -EOPNOTSUPP;
                return NULL;
        }

        /* Every validation failure below reports -EINVAL. */
        *dir = -EINVAL;

        if (len < sizeof(*info))
                return NULL;
        if (verify_newpolicy_info(info))
                return NULL;

        /* Whatever follows the fixed header is the template array. */
        ntmpl = ((len - sizeof(*info)) / sizeof(*tmpls));
        if (validate_tmpl(ntmpl, tmpls, info->sel.family))
                return NULL;

        if (info->dir > XFRM_POLICY_OUT)
                return NULL;

        xp = xfrm_policy_alloc(net, GFP_ATOMIC);
        if (!xp) {
                *dir = -ENOBUFS;
                return NULL;
        }

        copy_from_user_policy(xp, info);
        xp->type = XFRM_POLICY_TYPE_MAIN;
        copy_templates(xp, tmpls, ntmpl);

        *dir = info->dir;

        return xp;
}

/*
 * Upper bound on the size of an XFRM_MSG_POLEXPIRE message for @xp: the
 * fixed header plus every attribute build_polexpire() may append
 * (templates, security context, policy type, mark and interface id).
 */
static inline unsigned int xfrm_polexpire_msgsize(struct xfrm_policy *xp)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire))
               + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
               + nla_total_size(xfrm_user_sec_ctx_size(xp->security))
               + nla_total_size(sizeof(struct xfrm_mark))
               /* room for the attribute emitted by xfrm_if_id_put() */
               + nla_total_size(sizeof(xp->if_id))
               + userpolicy_type_attrsize();
}

/*
 * Assemble an XFRM_MSG_POLEXPIRE message for policy @xp in direction @dir.
 * @c supplies the destination portid and the hard/soft flag.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the error of
 * the first attribute helper that fails (the partial message is cancelled).
 */
static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
                           int dir, const struct km_event *c)
{
        struct xfrm_user_polexpire *upe;
        int hard = c->data.hard;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        upe = nlmsg_data(nlh);
        /*
         * Zero the whole header first so that no uninitialised bytes (e.g.
         * padding around @hard) are sent to userspace.
         */
        memset(upe, 0, sizeof(*upe));
        copy_to_user_policy(xp, &upe->pol, dir);
        upe->hard = !!hard;

        err = copy_to_user_tmpl(xp, skb);
        if (!err)
                err = copy_to_user_sec_ctx(xp, skb);
        if (!err)
                err = copy_to_user_policy_type(xp->type, skb);
        if (!err)
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Multicast a policy-expire notification for @xp to the XFRMNLGRP_EXPIRE
 * group.
 *
 * Returns -ENOMEM if the skb cannot be allocated, the build_polexpire()
 * error if the message cannot be assembled, otherwise the result of
 * xfrm_nlmsg_multicast().
 */
static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        struct net *net = xp_net(xp);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        err = build_polexpire(skb, xp, dir, c);
        if (WARN_ON_ONCE(err < 0)) {
                /* Size estimate was wrong: warn, drop the message, carry on. */
                kfree_skb(skb);
                return err;
        }

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE);
}

static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        unsigned int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
        struct net *net = xp_net(xp);
        struct xfrm_userpolicy_info *p;
        struct xfrm_userpolicy_id *id;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        unsigned int headlen;
        int err;

        headlen = sizeof(*p);
        if (c->event == XFRM_MSG_DELPOLICY) {
                len += nla_total_size(headlen);
                headlen = sizeof(*id);
        }
        len += userpolicy_type_attrsize();
        len += nla_total_size(sizeof(struct xfrm_mark));
        len += NLMSG_ALIGN(headlen);

        skb = nlmsg_new(len, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
        err = -EMSGSIZE;
        if (nlh == NULL)
                goto out_free_skb;

        p = nlmsg_data(nlh);
        if (c->event == XFRM_MSG_DELPOLICY) {
                struct nlattr *attr;

                id = nlmsg_data(nlh);
                memset(id, 0, sizeof(*id));
                id->dir = dir;
                if (c->data.byid)
                        id->index = xp->index;
                else
                        memcpy(&id->sel, &xp->selector, sizeof(id->sel));

                attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*p));
                err = -EMSGSIZE;
                if (attr == NULL)
                        goto out_free_skb;

                p = nla_data(attr);
        }

        copy_to_user_policy(xp, p, dir);
        err = copy_to_user_tmpl(xp, skb);
        if (!err)
                err = copy_to_user_policy_type(xp->type, skb);
        if (!err)
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
        if (err)
                goto out_free_skb;

        nlmsg_end(skb, nlh);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);

out_free_skb:
        kfree_skb(skb);
        return err;
}

/*
 * Multicast an XFRM_MSG_FLUSHPOLICY notification to the XFRMNLGRP_POLICY
 * group.  The message has an empty header and carries only the policy-type
 * attribute taken from @c->data.type.
 *
 * Returns -ENOMEM, -EMSGSIZE, the policy-type helper's error, or the result
 * of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_policy_flush(const struct km_event *c)
{
        struct net *net = c->net;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0);
        if (!nlh) {
                err = -EMSGSIZE;
                goto drop;
        }

        err = copy_to_user_policy_type(c->data.type, skb);
        if (err)
                goto drop;

        nlmsg_end(skb, nlh);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);

drop:
        kfree_skb(skb);
        return err;
}

/*
 * ->notify_policy hook of the netlink key manager: dispatch on @c->event to
 * the matching notifier.  Unknown events are logged and reported as success.
 */
static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        int ret = 0;

        switch (c->event) {
        case XFRM_MSG_NEWPOLICY:
        case XFRM_MSG_UPDPOLICY:
        case XFRM_MSG_DELPOLICY:
                ret = xfrm_notify_policy(xp, dir, c);
                break;
        case XFRM_MSG_FLUSHPOLICY:
                ret = xfrm_notify_policy_flush(c);
                break;
        case XFRM_MSG_POLEXPIRE:
                ret = xfrm_exp_policy_notify(xp, dir, c);
                break;
        default:
                printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n",
                       c->event);
                break;
        }

        return ret;
}

static inline unsigned int xfrm_report_msgsize(void)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_user_report));
}

/*
 * Assemble an XFRM_MSG_REPORT message carrying @proto and @sel; when @addr
 * is non-NULL it is appended as an XFRMA_COADDR attribute.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
 * nla_put() error (the partial message is cancelled in that case).
 */
static int build_report(struct sk_buff *skb, u8 proto,
                        struct xfrm_selector *sel, xfrm_address_t *addr)
{
        struct xfrm_user_report *report;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*report), 0);
        if (!nlh)
                return -EMSGSIZE;

        report = nlmsg_data(nlh);
        memset(report, 0, sizeof(*report));
        report->proto = proto;
        memcpy(&report->sel, sel, sizeof(report->sel));

        /* The care-of address is optional. */
        err = addr ? nla_put(skb, XFRMA_COADDR, sizeof(*addr), addr) : 0;
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * ->report hook of the netlink key manager: multicast an XFRM_MSG_REPORT to
 * the XFRMNLGRP_REPORT group.
 *
 * Returns -ENOMEM if the skb cannot be allocated, the build_report() error
 * if the message cannot be assembled, otherwise the result of
 * xfrm_nlmsg_multicast().
 */
static int xfrm_send_report(struct net *net, u8 proto,
                            struct xfrm_selector *sel, xfrm_address_t *addr)
{
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        err = build_report(skb, proto, sel, addr);
        if (WARN_ON_ONCE(err < 0)) {
                /* Size estimate was wrong: warn, drop the message, carry on. */
                kfree_skb(skb);
                return err;
        }

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT);
}

/* Size of an XFRM_MSG_MAPPING message: just the aligned fixed header. */
static inline unsigned int xfrm_mapping_msgsize(void)
{
        unsigned int hdr_len = NLMSG_ALIGN(sizeof(struct xfrm_user_mapping));

        return hdr_len;
}

/*
 * Assemble an XFRM_MSG_MAPPING message describing a change of the
 * encapsulation source address/port of state @x from its current values to
 * @new_saddr/@new_sport.  The caller must ensure @x->encap is non-NULL.
 *
 * Returns 0 on success or -EMSGSIZE if the header does not fit in @skb.
 */
static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
                         xfrm_address_t *new_saddr, __be16 new_sport)
{
        struct xfrm_user_mapping *um;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        um = nlmsg_data(nlh);

        /*
         * The header is filled field by field; zero it first so structure
         * padding is never sent to userspace uninitialised.
         */
        memset(um, 0, sizeof(*um));

        memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr));
        um->id.spi = x->id.spi;
        um->id.family = x->props.family;
        um->id.proto = x->id.proto;
        memcpy(&um->new_saddr, new_saddr, sizeof(um->new_saddr));
        memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr));
        um->new_sport = new_sport;
        um->old_sport = x->encap->encap_sport;
        um->reqid = x->props.reqid;

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * ->new_mapping hook of the netlink key manager: multicast an
 * XFRM_MSG_MAPPING for @x to the XFRMNLGRP_MAPPING group.
 *
 * Returns -EINVAL unless @x is an ESP state with encapsulation data,
 * -ENOMEM if the skb cannot be allocated, the build_mapping() error if the
 * message cannot be assembled, otherwise the result of
 * xfrm_nlmsg_multicast().
 */
static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
                             __be16 sport)
{
        struct net *net = xs_net(x);
        struct sk_buff *skb;
        int err;

        if (x->id.proto != IPPROTO_ESP)
                return -EINVAL;

        /* build_mapping() dereferences x->encap */
        if (!x->encap)
                return -EINVAL;

        skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        err = build_mapping(skb, x, ipaddr, sport);
        if (WARN_ON_ONCE(err < 0)) {
                /* Size estimate was wrong: warn, drop the message, carry on. */
                kfree_skb(skb);
                return err;
        }

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MAPPING);
}

static bool xfrm_is_alive(const struct km_event *c)
{
        return (bool)xfrm_acquire_is_on(c->net);
}

/*
 * Key-manager callback table for the netlink interface.  It is registered
 * with xfrm_register_km() at module init and removed with
 * xfrm_unregister_km() at module exit.
 */
static struct xfrm_mgr netlink_mgr = {
        .notify                = xfrm_send_state_notify,
        .acquire        = xfrm_send_acquire,
        .compile_policy        = xfrm_compile_policy,
        .notify_policy        = xfrm_send_policy_notify,
        .report                = xfrm_send_report,
        .migrate        = xfrm_send_migrate,
        .new_mapping        = xfrm_send_mapping,
        .is_alive        = xfrm_is_alive,
};

/*
 * Per-namespace init: create the NETLINK_XFRM kernel socket for @net and
 * publish it.
 *
 * Returns 0 on success or -ENOMEM if the socket cannot be created.
 */
static int __net_init xfrm_user_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .input        = xfrm_netlink_rcv,
                .groups        = XFRMNLGRP_MAX,
        };
        struct sock *sock;

        sock = netlink_kernel_create(net, NETLINK_XFRM, &cfg);
        if (!sock)
                return -ENOMEM;

        /*
         * The stash copy must never be set to NULL: the exit path releases
         * the socket through it after net->xfrm.nlsk has been cleared.
         */
        net->xfrm.nlsk_stash = sock;
        rcu_assign_pointer(net->xfrm.nlsk, sock);
        return 0;
}

/*
 * Batched per-namespace exit.  For every namespace on @net_exit_list the
 * published socket pointer is cleared first; after a single
 * synchronize_net() the sockets are released through the stashed pointer
 * saved by xfrm_user_net_init().
 */
static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
{
        struct net *net;
        /* Pass 1: unpublish the socket in every exiting namespace. */
        list_for_each_entry(net, net_exit_list, exit_list)
                RCU_INIT_POINTER(net->xfrm.nlsk, NULL);
        /* One wait covers the whole batch. */
        synchronize_net();
        /* Pass 2: release the sockets via the stash, which was never cleared. */
        list_for_each_entry(net, net_exit_list, exit_list)
                netlink_kernel_release(net->xfrm.nlsk_stash);
}

/* Per-network-namespace setup/teardown hooks for the XFRM netlink socket. */
static struct pernet_operations xfrm_user_net_ops = {
        .init            = xfrm_user_net_init,
        .exit_batch = xfrm_user_net_exit,
};

/*
 * Module init: register the per-namespace operations, then the netlink key
 * manager.  If the key manager cannot be registered the per-namespace
 * operations are unregistered again.
 *
 * Returns 0 on success or the negative error of the failing registration.
 */
static int __init xfrm_user_init(void)
{
        int rv;

        printk(KERN_INFO "Initializing XFRM netlink socket\n");

        rv = register_pernet_subsys(&xfrm_user_net_ops);
        if (rv < 0)
                goto out;

        rv = xfrm_register_km(&netlink_mgr);
        if (rv >= 0)
                goto out;

        /* Roll back the first registration. */
        unregister_pernet_subsys(&xfrm_user_net_ops);
out:
        return rv;
}

/*
 * Module exit: undo xfrm_user_init() in reverse order - first the key
 * manager, then the per-namespace operations.
 */
static void __exit xfrm_user_exit(void)
{
        xfrm_unregister_km(&netlink_mgr);
        unregister_pernet_subsys(&xfrm_user_net_ops);
}

/* Module entry/exit points, license, and the PF_NETLINK/NETLINK_XFRM alias. */
module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);

















































    6 








    6 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Checked virtual-to-physical translation (CONFIG_DEBUG_VIRTUAL, 64-bit).
 *
 * Addresses at or above __START_KERNEL_map are translated relative to
 * phys_base and must lie within KERNEL_IMAGE_SIZE.  All other addresses are
 * translated relative to PAGE_OFFSET and must be >= PAGE_OFFSET and pass
 * phys_addr_valid().  Violations trigger VIRTUAL_BUG_ON().
 */
unsigned long __phys_addr(unsigned long x)
{
        /* wraps around (y > x) exactly when x < __START_KERNEL_map */
        unsigned long y = x - __START_KERNEL_map;

        /* use the carry flag to determine if x was < __START_KERNEL_map */
        if (unlikely(x > y)) {
                /* no wrap: x is at or above __START_KERNEL_map */
                x = y + phys_base;

                VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
        } else {
                /* equivalent to (original x) - PAGE_OFFSET, modulo wrap */
                x = y + (__START_KERNEL_map - PAGE_OFFSET);

                /* carry flag will be set if starting x was >= PAGE_OFFSET */
                VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
        }

        return x;
}
EXPORT_SYMBOL(__phys_addr);

/*
 * Checked translation of an address at or above __START_KERNEL_map: the
 * offset from __START_KERNEL_map must be below KERNEL_IMAGE_SIZE, and the
 * result is that offset plus phys_base.
 */
unsigned long __phys_addr_symbol(unsigned long x)
{
        unsigned long image_off = x - __START_KERNEL_map;

        /*
         * An address below __START_KERNEL_map wraps to a huge offset, so a
         * single upper-bound check covers both ends of the range.
         */
        VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);

        return phys_base + image_off;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

/*
 * Report whether @x is a virtual address that translates to a valid page
 * frame (64-bit).  Performs the same range checks as the debug __phys_addr()
 * but returns false instead of triggering a BUG.
 */
bool __virt_addr_valid(unsigned long x)
{
        unsigned long off = x - __START_KERNEL_map;
        unsigned long phys;

        if (unlikely(x > off)) {
                /* no wrap-around: x is at or above __START_KERNEL_map */
                if (off >= KERNEL_IMAGE_SIZE)
                        return false;

                phys = off + phys_base;
        } else {
                /* x is below __START_KERNEL_map: translate via PAGE_OFFSET */
                phys = off + (__START_KERNEL_map - PAGE_OFFSET);

                /* phys > off means the original x was below PAGE_OFFSET */
                if (phys > off)
                        return false;
                if (!phys_addr_valid(phys))
                        return false;
        }

        return pfn_valid(phys >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Checked virtual-to-physical translation (CONFIG_DEBUG_VIRTUAL, non-64-bit
 * branch).  Returns x - PAGE_OFFSET after verifying that @x is not below
 * PAGE_OFFSET and is not a vmalloc address; once max_low_pfn is known it
 * also checks the resulting pfn against it and cross-checks the result with
 * slow_virt_to_phys().
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long phys_addr = x - PAGE_OFFSET;
        /* VMALLOC_* aren't constants  */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
        /* the vmalloc test is only applied once __vmalloc_start_set is set */
        VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
        /* max_low_pfn is set early, but not _that_ early */
        if (max_low_pfn) {
                VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
                BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
        }
        return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif

/*
 * Report whether @x is a virtual address backed by a valid page frame
 * (non-64-bit branch): it must be at or above PAGE_OFFSET, must not be a
 * vmalloc address, must be below FIXADDR_START, and its pfn must be valid.
 */
bool __virt_addr_valid(unsigned long x)
{
        bool out_of_range;

        /* Same tests, same order, short-circuiting like the early returns. */
        out_of_range = x < PAGE_OFFSET ||
                       (__vmalloc_start_set && is_vmalloc_addr((void *) x)) ||
                       x >= FIXADDR_START;
        if (out_of_range)
                return false;

        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif        /* CONFIG_X86_64 */

































































































































































    1 










    1 
    1 
    1 
    1 


    1 














    3 














    3 
    3 




    1 

































    1 



    1 

    3 













    1 


    1 


    1 




    1 


    1 




















    1 






    1 






    1 













    1 
    1 
















    1 







    1 








    1 
    1 









    1 


    1 













    1 
    1 
    1 

    1 







    1 


    1 
















    1 
    1 

    1 
    1 


    1 



    1 

    1 
    1 
    1 






    1 

























    1 
    1 
    1 
    1 






































    1 

    3 



    2 




    1 















    3 
    3 
    3 



    1 

    1 
    1 
    1 
    1 









    1 









    1 

    1 

    1 

    1 













    1 






    1 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core.
 * Thin exported wrapper around fsnotify_clear_marks_by_inode().
 */
void __fsnotify_inode_delete(struct inode *inode)
{
        fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

/*
 * Clear all of the marks attached to mount @mnt.  Thin wrapper around
 * fsnotify_clear_marks_by_mount().
 */
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        fsnotify_clear_marks_by_mount(mnt);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
 *
 * Walks sb->s_inodes; for every inode that is not being freed or created and
 * has a non-zero reference count, sends FS_UNMOUNT and removes its marks.
 */
static void fsnotify_unmount_inodes(struct super_block *sb)
{
        struct inode *inode, *iput_inode = NULL;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with SB_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 * However, we should have been called /after/ evict_inodes
                 * removed all zero refcount inodes, in any case.  Test to
                 * be sure.
                 */
                if (!atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * Take a reference on the inode before dropping both locks;
                 * it is held until the next iteration (or the end of the
                 * walk) releases it through iput_inode.
                 */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

                /* Drop the reference taken on the previously visited inode. */
                iput(iput_inode);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify_inode(inode, FS_UNMOUNT);

                fsnotify_inode_delete(inode);

                /* Deferred: released on the next iteration or after the loop. */
                iput_inode = inode;

                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Release the reference on the last inode visited, if any. */
        iput(iput_inode);
}

/*
 * Tear down all fsnotify state of superblock @sb: notify and unmark its
 * inodes, clear the marks attached to the superblock itself, then sleep
 * until sb->s_fsnotify_connectors has dropped to zero.
 */
void fsnotify_sb_delete(struct super_block *sb)
{
        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);
        /* Wait for outstanding object references from connectors */
        wait_var_event(&sb->s_fsnotify_connectors,
                       !atomic_long_read(&sb->s_fsnotify_connectors));
}

/*
 * Given an inode, first check if we care what happens to our children.  Inotify
 * and dnotify both tell their parents about events.  If we care about any event
 * on a child we run all of our children and set a dentry flag saying that the
 * parent cares.  Thus when an event happens on a child it can quickly tell
 * if there is a need to find a parent and send the event to the parent.
 *
 * Does nothing for non-directories.  Negative children (no d_inode) are
 * skipped.
 */
void fsnotify_set_children_dentry_flags(struct inode *inode)
{
        struct dentry *alias;

        if (!S_ISDIR(inode->i_mode))
                return;

        /* Lock nesting: inode->i_lock, then alias->d_lock, then child->d_lock. */
        spin_lock(&inode->i_lock);
        /* run all of the dentries associated with this inode.  Since this is a
         * directory, there damn well better only be one item on this list */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                struct dentry *child;

                /* run all of the children of the original inode and fix their
                 * d_flags to indicate parental interest (their parent is the
                 * original inode) */
                spin_lock(&alias->d_lock);
                list_for_each_entry(child, &alias->d_subdirs, d_child) {
                        if (!child->d_inode)
                                continue;

                        spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                        child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                        spin_unlock(&child->d_lock);
                }
                spin_unlock(&alias->d_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Lazily clear false positive PARENT_WATCHED flag for child whose parent had
 * stopped watching children.
 *
 * @pinode: inode of the parent directory.
 * @dentry: child dentry whose flag may be stale.
 *
 * The flag is cleared only if, re-checked under dentry->d_lock, the parent
 * still does not watch its children.
 */
static void fsnotify_clear_child_dentry_flag(struct inode *pinode,
                                             struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        /*
         * d_lock is a sufficient barrier to prevent observing a non-watched
         * parent state from before the fsnotify_set_children_dentry_flags()
         * or fsnotify_update_flags() call that had set PARENT_WATCHED.
         */
        if (!fsnotify_inode_watches_children(pinode))
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
        spin_unlock(&dentry->d_lock);
}

/*
 * Are inode/sb/mount interested in parent and name info with this event?
 *
 * Returns true when @mask contains an event for which the inode, its
 * superblock or (if given) the mount asked for parent/name info.  Events on
 * directories (FS_ISDIR) never qualify.
 */
static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
                                        __u32 mask)
{
        __u32 wanted;

        /* We only send parent/name to inode/sb/mount for events on non-dir */
        if (mask & FS_ISDIR)
                return false;

        /*
         * All events that are possible on child may also be reported with
         * parent/name info to inode/sb/mount.  Otherwise, a watching parent
         * could result in events reported with unexpected name info to sb/mount.
         */
        BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);

        /* Collect what inode, sb and mount subscribed to with parent/name. */
        wanted = fsnotify_parent_needed_mask(inode->i_fsnotify_mask) |
                 fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
        if (mnt)
                wanted |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);

        /* Did they subscribe for this event with parent/name info? */
        return (mask & wanted) != 0;
}

/*
 * Notify this dentry's parent about a child's events with child name info
 * if parent is watching or if inode/sb/mount are interested in events with
 * parent and name info.
 *
 * Notify only the child without name info if parent is not watching and
 * inode/sb/mount are not interested in events with parent and name info.
 *
 * Returns 0 when nobody is interested, otherwise the result of fsnotify().
 */
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                      int data_type)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct mount *mnt = path ? real_mount(path->mnt) : NULL;
        struct inode *inode = d_inode(dentry);
        struct dentry *parent;
        bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
        bool parent_needed, parent_interested;
        __u32 p_mask;
        struct inode *p_inode = NULL;
        struct name_snapshot name;
        struct qstr *file_name = NULL;
        int ret = 0;

        /*
         * Do inode/sb/mount care about parent and name info on non-dir?
         * Do they care about any event at all?
         */
        if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks &&
            (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
                return 0;

        /* parent stays NULL on the direct "notify" path; dput(NULL) below is relied upon */
        parent = NULL;
        parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
        if (!parent_watched && !parent_needed)
                goto notify;

        /* Does parent inode care about events on children? */
        parent = dget_parent(dentry);
        p_inode = parent->d_inode;
        p_mask = fsnotify_inode_watches_children(p_inode);
        /* The flag was set but the parent no longer watches: clear it lazily. */
        if (unlikely(parent_watched && !p_mask))
                fsnotify_clear_child_dentry_flag(p_inode, dentry);

        /*
         * Include parent/name in notification either if some notification
         * groups require parent info or the parent is interested in this event.
         * The parent interest in ACCESS/MODIFY events does not apply to special
         * files, where read/write are not on the filesystem of the parent and
         * events can provide an undesirable side-channel for information
         * exfiltration.
         */
        parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS &&
                            !(data_type == FSNOTIFY_EVENT_PATH &&
                              d_is_special(dentry) &&
                              (mask & (FS_ACCESS | FS_MODIFY)));
        if (parent_needed || parent_interested) {
                /* When notifying parent, child should be passed as data */
                WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));

                /* Notify both parent and child with child name info */
                take_dentry_name_snapshot(&name, dentry);
                file_name = &name.name;
                if (parent_interested)
                        mask |= FS_EVENT_ON_CHILD;
        }

notify:
        ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);

        /* file_name is non-NULL only if a name snapshot was taken above */
        if (file_name)
                release_dentry_name_snapshot(&name);
        dput(parent);

        return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);

/*
 * Deliver one event to @inode_mark through the group's
 * ->handle_inode_event() callback.
 *
 * Returns 0 without calling the backend when the callback is missing, when
 * neither an inode nor @dir is available, when the mark excludes unlinked
 * objects and the event's path is unlinked, or when the mark is not
 * interested in @mask.  Otherwise returns the callback's result.
 */
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
                                       struct fsnotify_mark *inode_mark,
                                       u32 mask, const void *data, int data_type,
                                       struct inode *dir, const struct qstr *name,
                                       u32 cookie)
{
        const struct fsnotify_ops *backend = group->ops;
        struct inode *target = fsnotify_data_inode(data, data_type);
        const struct path *ev_path;

        if (WARN_ON_ONCE(!backend->handle_inode_event))
                return 0;

        if (WARN_ON_ONCE(!target && !dir))
                return 0;

        /* Marks flagged EXCL_UNLINK ignore events on unlinked paths. */
        ev_path = fsnotify_data_path(data, data_type);
        if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && ev_path &&
            d_unlinked(ev_path->dentry))
                return 0;

        /* Check interest of this mark in case event was sent with two marks */
        if ((mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS) == 0)
                return 0;

        return backend->handle_inode_event(inode_mark, mask, target, dir, name,
                                           cookie);
}

/*
 * Default event delivery, used by send_to_group() for groups that do not
 * provide ->handle_event().  The event goes first to the parent mark (if
 * any) and then to the inode mark (if any), each through
 * fsnotify_handle_inode_event().
 *
 * Warns and returns 0 if @iter_info carries an sb or vfsmount mark.  A
 * non-zero result from the parent delivery is returned immediately.
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
                                 const void *data, int data_type,
                                 struct inode *dir, const struct qstr *name,
                                 u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
        struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);
        int ret;

        if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
            WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
                return 0;

        /*
         * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
         * The only ->handle_inode_event() backend that supports FS_RENAME is
         * dnotify, where it means file was renamed within same parent.
         */
        if (mask & FS_RENAME) {
                struct dentry *moved = fsnotify_data_dentry(data, data_type);

                /* Renames that moved the file to another directory are dropped. */
                if (dir != moved->d_parent->d_inode)
                        return 0;
        }

        if (parent_mark) {
                /* Note: the parent delivery is made with cookie 0. */
                ret = fsnotify_handle_inode_event(group, parent_mark, mask,
                                                  data, data_type, dir, name, 0);
                if (ret)
                        return ret;
        }

        if (!inode_mark)
                return 0;

        /*
         * Some events can be sent on both parent dir and child marks (e.g.
         * FS_ATTRIB).  If both parent dir and child are watching, report the
         * event once to parent dir with name (if interested) and once to child
         * without name (if interested).
         *
         * In any case regardless whether the parent is watching or not, the
         * child watcher is expecting an event without the FS_EVENT_ON_CHILD
         * flag. The file name is expected if and only if this is a directory
         * event.
         */
        mask &= ~FS_EVENT_ON_CHILD;
        if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) {
                dir = NULL;
                name = NULL;
        }

        return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
                                           dir, name, cookie);
}

/*
 * Hand one event to the group chosen for the current iteration step.
 *
 * Walks the marks selected in @iter_info, accumulating the union of their
 * event masks and of their effective ignore masks, and remembers the group
 * of the marks visited.  When @mask carries FS_MODIFY, ignore masks that
 * are not flagged to survive modification are cleared first.
 *
 * Returns 0 when nothing was selected for reporting or when no selected
 * mark is interested in the event; otherwise returns whatever the group's
 * ->handle_event() callback (or fsnotify_handle_event() when the group has
 * no such callback) returns.
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
                         struct inode *dir, const struct qstr *file_name,
                         u32 cookie, struct fsnotify_iter_info *iter_info)
{
        __u32 event_bits = (mask & ALL_FSNOTIFY_EVENTS);
        bool is_dir = mask & FS_ISDIR;
        struct fsnotify_group *group = NULL;
        struct fsnotify_mark *mark;
        __u32 wanted = 0;
        __u32 ignored = 0;
        int type;

        /* Nothing was selected for this step, so there is nobody to tell. */
        if (!iter_info->report_mask)
                return 0;

        /* A modification drops every ignore mask not marked to survive it. */
        if (mask & FS_MODIFY) {
                fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                        if (!(mark->flags &
                              FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                                mark->ignore_mask = 0;
                }
        }

        /* Collect what the selected marks want and what they ignore. */
        fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                wanted |= mark->mask;
                ignored |= fsnotify_effective_ignore_mask(mark, is_dir, type);
                group = mark->group;
        }

        pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
                 __func__, group, mask, wanted, ignored,
                 data, data_type, dir, cookie);

        /* Drop the event unless some wanted, non-ignored bit is set. */
        if ((event_bits & wanted & ~ignored) == 0)
                return 0;

        /* Groups without their own ->handle_event() use the generic path. */
        if (!group->ops->handle_event)
                return fsnotify_handle_event(group, mask, data, data_type,
                                             dir, file_name, cookie,
                                             iter_info);

        return group->ops->handle_event(group, mask, data, data_type, dir,
                                        file_name, cookie, iter_info);
}

/*
 * Return the first mark on the list hanging off the connector stored at
 * *@connp, or whatever hlist_entry_safe() yields for a NULL node when no
 * connector is attached.  Both pointers are fetched with
 * srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
{
        struct fsnotify_mark_connector *conn =
                srcu_dereference(*connp, &fsnotify_mark_srcu);
        struct hlist_node *head = conn ?
                srcu_dereference(conn->list.first, &fsnotify_mark_srcu) : NULL;

        return hlist_entry_safe(head, struct fsnotify_mark, obj_list);
}

/*
 * Step from @mark to its successor on the object list.  A NULL @mark is
 * accepted and is treated as having no successor.  The next pointer is
 * fetched with srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
        struct hlist_node *succ = mark ?
                srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu) :
                NULL;

        return hlist_entry_safe(succ, struct fsnotify_mark, obj_list);
}

/*
 * iter_info holds one queue head (the current mark) per iterator type.
 *
 * Step 1: among the groups of all non-empty queue heads, pick the one that
 *         fsnotify_compare_groups() ranks first.
 * Step 2: record it as current_group and rebuild report_mask from the queue
 *         heads that belong to that group.
 *
 * Returns false when every queue head is empty, i.e. there are no more
 * groups to iterate; true otherwise.
 */
static bool fsnotify_iter_select_report_types(
                struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *best = NULL;
        struct fsnotify_mark *head;
        int type;

        /* Step 1: find the winning group across all queue heads. */
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head)
                        continue;
                if (fsnotify_compare_groups(best, head->group) > 0)
                        best = head->group;
        }

        if (!best)
                return false;

        /* Step 2: flag the queue heads that belong to the winning group. */
        iter_info->current_group = best;
        iter_info->report_mask = 0;
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head || head->group != iter_info->current_group)
                        continue;

                /*
                 * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode is
                 * watching children and interested in this event, which
                 * is an event possible on child.  Still, *this mark* is
                 * reported only if FS_EVENT_ON_CHILD is set in its mask
                 * or in its ignore mask.
                 */
                if (type == FSNOTIFY_ITER_TYPE_PARENT) {
                        bool on_child =
                                (head->mask & FS_EVENT_ON_CHILD) ||
                                (fsnotify_ignore_mask(head) & FS_EVENT_ON_CHILD);

                        if (!on_child)
                                continue;
                }

                fsnotify_iter_set_report_type(iter_info, type);
        }

        return true;
}

/*
 * Advance every queue head of iter_info that belongs to the group of the
 * current iteration step, so the next selection round sees fresh heads.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
        int type;

        /*
         * Iterate over all types rather than with
         * fsnotify_foreach_iter_mark_type(): a mark of current_group that
         * was not selected for reporting must be advanced as well.
         */
        fsnotify_foreach_iter_type(type) {
                struct fsnotify_mark **slot = &iter_info->marks[type];

                if (!*slot || (*slot)->group != iter_info->current_group)
                        continue;

                *slot = fsnotify_next_mark(*slot);
        }
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @inode:        optional inode associated with event -
 *                If @dir and @inode are both non-NULL, event may be
 *                reported to both.
 * @cookie:        inotify rename cookie
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        /*
         * NOTE(review): sb is dereferenced unconditionally below, so
         * fsnotify_data_sb() is assumed never to return NULL for the data
         * passed here - confirm against the accessor.
         */
        struct super_block *sb = fsnotify_data_sb(data, data_type);
        struct fsnotify_iter_info iter_info = {};
        struct mount *mnt = NULL;
        struct inode *inode2 = NULL;
        struct dentry *moved;
        /* Assigned only together with inode2; read only when inode2 is set. */
        int inode2_type;
        int ret = 0;
        __u32 test_mask, marks_mask;

        /* A mount is involved only when the event data carries a path. */
        if (path)
                mnt = real_mount(path->mnt);

        if (!inode) {
                /* Dirent event - report on TYPE_INODE to dir */
                inode = dir;
                /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
                if (mask & FS_RENAME) {
                        moved = fsnotify_data_dentry(data, data_type);
                        inode2 = moved->d_parent->d_inode;
                        inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
                }
        } else if (mask & FS_EVENT_ON_CHILD) {
                /*
                 * Event on child - report on TYPE_PARENT to dir if it is
                 * watching children and on TYPE_INODE to child.
                 */
                inode2 = dir;
                inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
        }

        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
         * be expensive.  It protects walking the *_fsnotify_marks lists.
         * However, if we do not walk the lists, we do not have to do
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
        if (!sb->s_fsnotify_marks &&
            (!mnt || !mnt->mnt_fsnotify_marks) &&
            (!inode || !inode->i_fsnotify_marks) &&
            (!inode2 || !inode2->i_fsnotify_marks))
                return 0;

        /* Union of the event masks of every object that could be notified. */
        marks_mask = sb->s_fsnotify_mask;
        if (mnt)
                marks_mask |= mnt->mnt_fsnotify_mask;
        if (inode)
                marks_mask |= inode->i_fsnotify_mask;
        if (inode2)
                marks_mask |= inode2->i_fsnotify_mask;


        /*
         * If this is a modify event we may need to clear some ignore masks.
         * In that case, the object with ignore masks will have the FS_MODIFY
         * event in its mask.
         * Otherwise, return if none of the marks care about this type of event.
         */
        test_mask = (mask & ALL_FSNOTIFY_EVENTS);
        if (!(test_mask & marks_mask))
                return 0;

        /* From here on every exit must go through the "out" label. */
        iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        /* Seed one queue head per object type that takes part in the event. */
        iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
                fsnotify_first_mark(&sb->s_fsnotify_marks);
        if (mnt) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
        }
        if (inode) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
                        fsnotify_first_mark(&inode->i_fsnotify_marks);
        }
        if (inode2) {
                iter_info.marks[inode2_type] =
                        fsnotify_first_mark(&inode2->i_fsnotify_marks);
        }

        /*
         * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
         * ignore masks are properly reflected for mount/sb mark notifications.
         * That's why this traversal is so complicated...
         */
        while (fsnotify_iter_select_report_types(&iter_info)) {
                ret = send_to_group(mask, data, data_type, dir, file_name,
                                    cookie, &iter_info);

                /*
                 * Only a permission event stops the walk on a non-zero
                 * result, which is then returned to the caller.  For any
                 * other event the result is dropped and the remaining
                 * groups are still notified.
                 */
                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
                        goto out;

                fsnotify_iter_next(&iter_info);
        }
        /* Walk completed: a leftover non-permission result is not reported. */
        ret = 0;
out:
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

/*
 * One-time setup of the fsnotify core, registered through core_initcall():
 * initialises the SRCU domain used for mark lists and creates the slab
 * cache for mark connectors.  Always returns 0; a failure to initialise
 * the SRCU domain panics instead of returning.
 */
static __init int fsnotify_init(void)
{
        /*
         * Compile-time guard: the expected count below has to be updated
         * whenever the number of bits in ALL_FSNOTIFY_BITS changes.
         */
        BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);

        if (init_srcu_struct(&fsnotify_mark_srcu) != 0)
                panic("initializing fsnotify_mark_srcu");

        fsnotify_mark_connector_cachep =
                KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC);

        return 0;
}
core_initcall(fsnotify_init);

































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Dynamic loading of modules into the kernel.
 *
 * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
 * Rewritten again by Rusty Russell, 2002
 */

#ifndef _LINUX_MODULE_H
#define _LINUX_MODULE_H

#include <linux/list.h>
#include <linux/stat.h>
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/elf.h>
#include <linux/stringify.h>
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/jump_label.h>
#include <linux/export.h>
#include <linux/rbtree_latch.h>
#include <linux/error-injection.h>
#include <linux/tracepoint-defs.h>
#include <linux/srcu.h>
#include <linux/static_call_types.h>

#include <linux/percpu.h>
#include <asm/module.h>

/* Not Yet Implemented */
#define MODULE_SUPPORTED_DEVICE(name)

#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN

/*
 * One (crc, name) pair.  The name buffer has the same capacity as a module
 * name (MODULE_NAME_LEN).
 * NOTE(review): presumably one entry per versioned symbol - confirm
 * against the module loader.
 */
struct modversion_info {
        unsigned long crc;
        char name[MODULE_NAME_LEN];
};

struct module;
struct exception_table_entry;

/*
 * kobject wrapper for a module; struct module embeds one as its "mkobj"
 * member under the "Sysfs stuff" heading.
 */
struct module_kobject {
        struct kobject kobj;
        /* Module this kobject is associated with. */
        struct module *mod;
        struct kobject *drivers_dir;
        struct module_param_attrs *mp;
        /* NOTE(review): presumably completed on kobject release - confirm. */
        struct completion *kobj_completion;
} __randomize_layout;

/*
 * An attribute attached to a module_kobject, together with its callbacks.
 * show()/store() receive the attribute, the module_kobject and a buffer;
 * store() additionally receives the byte count.  setup(), test() and free()
 * operate on the struct module directly.
 * NOTE(review): which callbacks may be NULL is not visible here - confirm
 * with the callers before relying on any of them being set.
 */
struct module_attribute {
        struct attribute attr;
        ssize_t (*show)(struct module_attribute *, struct module_kobject *,
                        char *);
        ssize_t (*store)(struct module_attribute *, struct module_kobject *,
                         const char *, size_t count);
        void (*setup)(struct module *, const char *);
        int (*test)(struct module *);
        void (*free)(struct module *);
};

/*
 * A module_attribute extended with a module name and a version string.
 * MODULE_VERSION() instantiates one of these for built-in code when
 * CONFIG_SYSFS is enabled.  The type is aligned to the size of a pointer.
 */
struct module_version_attribute {
        struct module_attribute mattr;
        const char *module_name;
        const char *version;
} __attribute__ ((__aligned__(sizeof(void *))));

extern ssize_t __modver_version_show(struct module_attribute *,
                                     struct module_kobject *, char *);

extern struct module_attribute module_uevent;

/* These are either module local, or the kernel's dummy ones. */
extern int init_module(void);
extern void cleanup_module(void);

#ifndef MODULE
/**
 * module_init() - driver initialization entry point
 * @x: function to be run at kernel boot time or module insertion
 *
 * module_init() will either be called during do_initcalls() (if
 * builtin) or at module insertion time (if a module).  There can only
 * be one per module.
 */
#define module_init(x)        __initcall(x);

/**
 * module_exit() - driver exit entry point
 * @x: function to be run when driver is removed
 *
 * module_exit() will wrap the driver clean-up code
 * with cleanup_module() when used with rmmod when
 * the driver is a module.  If the driver is statically
 * compiled into the kernel, module_exit() has no effect.
 * There can only be one per module.
 */
#define module_exit(x)        __exitcall(x);

#else /* MODULE */

/*
 * In most cases loadable modules do not need custom
 * initcall levels. There are still some valid cases where
 * a driver may be needed early if built in, and does not
 * matter when built as a loadable module. Like bus
 * snooping debug drivers.
 */
#define early_initcall(fn)                module_init(fn)
#define core_initcall(fn)                module_init(fn)
#define core_initcall_sync(fn)                module_init(fn)
#define postcore_initcall(fn)                module_init(fn)
#define postcore_initcall_sync(fn)        module_init(fn)
#define arch_initcall(fn)                module_init(fn)
#define subsys_initcall(fn)                module_init(fn)
#define subsys_initcall_sync(fn)        module_init(fn)
#define fs_initcall(fn)                        module_init(fn)
#define fs_initcall_sync(fn)                module_init(fn)
#define rootfs_initcall(fn)                module_init(fn)
#define device_initcall(fn)                module_init(fn)
#define device_initcall_sync(fn)        module_init(fn)
#define late_initcall(fn)                module_init(fn)
#define late_initcall_sync(fn)                module_init(fn)

#define console_initcall(fn)                module_init(fn)

/*
 * Each module must use one module_init().
 *
 * The __inittest() helper is never called; returning initfn as an
 * initcall_t lets the compiler check the function's type.  The second
 * declaration makes init_module() an alias of initfn.
 */
#define module_init(initfn)                                        \
        static inline initcall_t __maybe_unused __inittest(void)                \
        { return initfn; }                                        \
        int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));

/*
 * This is only required if you want to be unloadable.
 *
 * Same construction as module_init(): __exittest() lets the compiler
 * check exitfn against exitcall_t, and cleanup_module() becomes an alias
 * of exitfn.
 */
#define module_exit(exitfn)                                        \
        static inline exitcall_t __maybe_unused __exittest(void)                \
        { return exitfn; }                                        \
        void cleanup_module(void) __copy(exitfn) __attribute__((alias(#exitfn)));

#endif

/* This means "can be init if no module support, otherwise module load
   may call it." */
#ifdef CONFIG_MODULES
#define __init_or_module
#define __initdata_or_module
#define __initconst_or_module
#define __INIT_OR_MODULE        .text
#define __INITDATA_OR_MODULE        .data
#define __INITRODATA_OR_MODULE        .section ".rodata","a",%progbits
#else
#define __init_or_module __init
#define __initdata_or_module __initdata
#define __initconst_or_module __initconst
#define __INIT_OR_MODULE __INIT
#define __INITDATA_OR_MODULE __INITDATA
#define __INITRODATA_OR_MODULE __INITRODATA
#endif /*CONFIG_MODULES*/

/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)

/* Soft module dependencies. See man modprobe.d for details.
 * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz")
 */
#define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep)

/*
 * MODULE_FILE is used for generating modules.builtin
 * So, make it no-op when this is being built as a module
 */
#ifdef MODULE
#define MODULE_FILE
#else
#define MODULE_FILE        MODULE_INFO(file, KBUILD_MODFILE);
#endif

/*
 * The following license idents are currently accepted as indicating free
 * software modules
 *
 *        "GPL"                                [GNU Public License v2]
 *        "GPL v2"                        [GNU Public License v2]
 *        "GPL and additional rights"        [GNU Public License v2 rights and more]
 *        "Dual BSD/GPL"                        [GNU Public License v2
 *                                         or BSD license choice]
 *        "Dual MIT/GPL"                        [GNU Public License v2
 *                                         or MIT license choice]
 *        "Dual MPL/GPL"                        [GNU Public License v2
 *                                         or Mozilla license choice]
 *
 * The following other idents are available
 *
 *        "Proprietary"                        [Non free products]
 *
 * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are
 * merely stating that the module is licensed under the GPL v2, but are not
 * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there
 * are two variants is a historic and failed attempt to convey more
 * information in the MODULE_LICENSE string. For module loading the
 * "only/or later" distinction is completely irrelevant and does neither
 * replace the proper license identifiers in the corresponding source file
 * nor amends them in any way. The sole purpose is to make the
 * 'Proprietary' flagging work and to refuse to bind symbols which are
 * exported with EXPORT_SYMBOL_GPL when a non free module is loaded.
 *
 * In the same way "BSD" is not a clear license information. It merely
 * states, that the module is licensed under one of the compatible BSD
 * license variants. The detailed and correct license information is again
 * to be found in the corresponding source files.
 *
 * There are dual licensed components, but when running with Linux it is the
 * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL
 * is a GPL combined work.
 *
 * This exists for several reasons
 * 1.        So modinfo can show license info for users wanting to vet their setup
 *        is free
 * 2.        So the community can ignore bug reports including proprietary modules
 * 3.        So vendors can do likewise based on their own policies
 */
#define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license)

/*
 * Author(s), use "Name <email>" or just "Name", for multiple
 * authors use multiple MODULE_AUTHOR() statements/lines.
 */
#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)

/* What your module does. */
#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)

#ifdef MODULE
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name)                                        \
extern typeof(name) __mod_##type##__##name##_device_table                \
  __attribute__ ((unused, alias(__stringify(name))))
#else  /* !MODULE */
#define MODULE_DEVICE_TABLE(type, name)
#endif

/* Version of form [<epoch>:]<version>[-<extra-version>].
 * Or for CVS/RCS ID version, everything but the number is stripped.
 * <epoch>: A (small) unsigned integer which allows you to start versions
 * anew. If not mentioned, it's zero.  eg. "2:1.0" is after
 * "1:2.0".

 * <version>: The <version> may contain only alphanumerics and the
 * character `.'.  Ordered by numeric sort for numeric parts,
 * ascii sort for ascii parts (as per RPM or DEB algorithm).

 * <extraversion>: Like <version>, but inserted for local
 * customizations, eg "rh3" or "rusty1".

 * Using this automatically adds a checksum of the .c files and the
 * local headers in "srcversion".
 */

/*
 * MODULE_VERSION() has two forms:
 *  - when building a module, or when CONFIG_SYSFS is off, it only records
 *    the "version" modinfo tag;
 *  - otherwise it additionally defines a module_version_attribute named
 *    "version" (mode S_IRUGO, shown via __modver_version_show) and places a
 *    pointer to it in the "__modver" section.
 */
#if defined(MODULE) || !defined(CONFIG_SYSFS)
#define MODULE_VERSION(_version) MODULE_INFO(version, _version)
#else
#define MODULE_VERSION(_version)                                        \
        MODULE_INFO(version, _version);                                        \
        static struct module_version_attribute ___modver_attr = {        \
                .mattr        = {                                                \
                        .attr        = {                                        \
                                .name        = "version",                        \
                                .mode        = S_IRUGO,                        \
                        },                                                \
                        .show        = __modver_version_show,                \
                },                                                        \
                .module_name        = KBUILD_MODNAME,                        \
                .version        = _version,                                \
        };                                                                \
        static const struct module_version_attribute                        \
        __used __section("__modver")                                        \
        * __moduleparam_const __modver_attr = &___modver_attr
#endif

/* Optional firmware file (or files) needed by the module
 * format is simply firmware file name.  Multiple firmware
 * files require multiple MODULE_FIRMWARE() specifiers */
#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)

#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns)

struct notifier_block;

#ifdef CONFIG_MODULES

extern int modules_disabled; /* for sysctl */
/* Get/put a kernel symbol (calls must be symmetric) */
void *__symbol_get(const char *symbol);
void *__symbol_get_gpl(const char *symbol);
#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x))))

/*
 * Modules using other modules: kdb wants to see this.
 *
 * One record per (source, target) pair of modules.
 * NOTE(review): presumably linked into the source_list/target_list heads
 * of struct module through the two list members - confirm in the module
 * loader.
 */
struct module_use {
        struct list_head source_list;
        struct list_head target_list;
        struct module *source, *target;
};

/*
 * Life-cycle state of a module, stored in module::state.
 * module_is_live() treats every state except MODULE_STATE_GOING as live.
 */
enum module_state {
        MODULE_STATE_LIVE,        /* Normal state. */
        MODULE_STATE_COMING,        /* Fully formed, running module_init. */
        MODULE_STATE_GOING,        /* Going away. */
        MODULE_STATE_UNFORMED,        /* Still setting it up. */
};

/*
 * Pairs a latch-tree node with a module pointer.  struct module_layout
 * embeds one as "mtn" when CONFIG_MODULES_TREE_LOOKUP is enabled.
 */
struct mod_tree_node {
        struct module *mod;
        struct latch_tree_node node;
};

/*
 * Describes one memory region of a module: its base address and the sizes
 * of the sub-ranges within it.  struct module carries two of these,
 * core_layout and init_layout.
 */
struct module_layout {
        /* The actual code + data. */
        void *base;
        /* Total size. */
        unsigned int size;
        /* The size of the executable code.  */
        unsigned int text_size;
        /* Size of RO section of the module (text+rodata) */
        unsigned int ro_size;
        /* Size of RO after init section */
        unsigned int ro_after_init_size;

#ifdef CONFIG_MODULES_TREE_LOOKUP
        /* Tree node for this region; only present with tree lookup enabled. */
        struct mod_tree_node mtn;
#endif
};

#ifdef CONFIG_MODULES_TREE_LOOKUP
/* Only touch one cacheline for common rbtree-for-core-layout case. */
#define __module_layout_align ____cacheline_aligned
#else
#define __module_layout_align
#endif

/*
 * Symbol information kept per module when CONFIG_KALLSYMS is enabled
 * (see the kallsyms and core_kallsyms members of struct module).
 * NOTE(review): the field meanings below are inferred from the names and
 * types - confirm against the kallsyms code.
 */
struct mod_kallsyms {
        Elf_Sym *symtab;                /* presumably the ELF symbol table */
        unsigned int num_symtab;        /* presumably the entry count of symtab */
        char *strtab;                        /* presumably the string table */
        char *typetab;                        /* presumably per-symbol type data */
};

#ifdef CONFIG_LIVEPATCH
/*
 * ELF information referenced from struct module (klp_info) when
 * CONFIG_LIVEPATCH is enabled.
 * NOTE(review): the field meanings below are inferred from the names and
 * types - confirm against the livepatch code.
 */
struct klp_modinfo {
        Elf_Ehdr hdr;                        /* ELF header, stored by value */
        Elf_Shdr *sechdrs;                /* presumably the section header array */
        char *secstrings;                /* presumably the section name strings */
        unsigned int symndx;                /* presumably the symbol table section index */
};
#endif

/*
 * Per-module record.
 *
 * Groups the module's state and name, its sysfs objects, exported-symbol
 * tables, parameters, memory layouts and a number of optional sections.
 * Many members exist only when the corresponding CONFIG_* option is
 * enabled, so code touching them needs the same #ifdef guard.  The type
 * is cacheline aligned and its layout may be randomized, so member order
 * must not be relied upon.
 */
struct module {
        enum module_state state;

        /* Member of list of modules */
        struct list_head list;

        /* Unique handle for this module */
        char name[MODULE_NAME_LEN];

        /* Sysfs stuff. */
        struct module_kobject mkobj;
        struct module_attribute *modinfo_attrs;
        const char *version;
        const char *srcversion;
        struct kobject *holders_dir;

        /* Exported symbols */
        const struct kernel_symbol *syms;
        const s32 *crcs;
        unsigned int num_syms;

        /* Kernel parameters. */
#ifdef CONFIG_SYSFS
        struct mutex param_lock;
#endif
        struct kernel_param *kp;
        unsigned int num_kp;

        /* GPL-only exported symbols. */
        unsigned int num_gpl_syms;
        const struct kernel_symbol *gpl_syms;
        const s32 *gpl_crcs;
        bool using_gplonly_symbols;

#ifdef CONFIG_UNUSED_SYMBOLS
        /* unused exported symbols. */
        const struct kernel_symbol *unused_syms;
        const s32 *unused_crcs;
        unsigned int num_unused_syms;

        /* GPL-only, unused exported symbols. */
        unsigned int num_unused_gpl_syms;
        const struct kernel_symbol *unused_gpl_syms;
        const s32 *unused_gpl_crcs;
#endif

#ifdef CONFIG_MODULE_SIG
        /* Signature was verified. */
        bool sig_ok;
#endif

        bool async_probe_requested;

        /* symbols that will be GPL-only in the near future. */
        const struct kernel_symbol *gpl_future_syms;
        const s32 *gpl_future_crcs;
        unsigned int num_gpl_future_syms;

        /* Exception table */
        unsigned int num_exentries;
        struct exception_table_entry *extable;

        /* Startup function. */
        int (*init)(void);

        /* Core layout: rbtree is accessed frequently, so keep together. */
        struct module_layout core_layout __module_layout_align;
        struct module_layout init_layout;

        /* Arch-specific module values */
        struct mod_arch_specific arch;

        unsigned long taints;        /* same bits as kernel:taint_flags */

#ifdef CONFIG_GENERIC_BUG
        /* Support for BUG */
        unsigned num_bugs;
        struct list_head bug_list;
        struct bug_entry *bug_table;
#endif

#ifdef CONFIG_KALLSYMS
        /* Protected by RCU and/or module_mutex: use rcu_dereference() */
        struct mod_kallsyms __rcu *kallsyms;
        struct mod_kallsyms core_kallsyms;

        /* Section attributes */
        struct module_sect_attrs *sect_attrs;

        /* Notes attributes */
        struct module_notes_attrs *notes_attrs;
#endif

        /* The command line arguments (may be mangled).  People like
           keeping pointers to this stuff */
        char *args;

#ifdef CONFIG_SMP
        /* Per-cpu data. */
        void __percpu *percpu;
        unsigned int percpu_size;
#endif
        /* NOTE(review): presumably the bounds of the noinstr text - confirm. */
        void *noinstr_text_start;
        unsigned int noinstr_text_size;

        /*
         * Optional tables follow, each stored as a pointer plus an element
         * count and each present only under its own CONFIG_* option.
         */
#ifdef CONFIG_TRACEPOINTS
        unsigned int num_tracepoints;
        tracepoint_ptr_t *tracepoints_ptrs;
#endif
#ifdef CONFIG_TREE_SRCU
        unsigned int num_srcu_structs;
        struct srcu_struct **srcu_struct_ptrs;
#endif
#ifdef CONFIG_BPF_EVENTS
        unsigned int num_bpf_raw_events;
        struct bpf_raw_event_map *bpf_raw_events;
#endif
#ifdef CONFIG_JUMP_LABEL
        struct jump_entry *jump_entries;
        unsigned int num_jump_entries;
#endif
#ifdef CONFIG_TRACING
        unsigned int num_trace_bprintk_fmt;
        const char **trace_bprintk_fmt_start;
#endif
#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call **trace_events;
        unsigned int num_trace_events;
        struct trace_eval_map **trace_evals;
        unsigned int num_trace_evals;
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
        unsigned int num_ftrace_callsites;
        unsigned long *ftrace_callsites;
#endif
#ifdef CONFIG_KPROBES
        void *kprobes_text_start;
        unsigned int kprobes_text_size;
        unsigned long *kprobe_blacklist;
        unsigned int num_kprobe_blacklist;
#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
        /* Note: this count is a signed int, unlike the other counts here. */
        int num_static_call_sites;
        struct static_call_site *static_call_sites;
#endif

#ifdef CONFIG_LIVEPATCH
        bool klp; /* Is this a livepatch module? */
        bool klp_alive;

        /* Elf information */
        struct klp_modinfo *klp_info;
#endif

#ifdef CONFIG_MODULE_UNLOAD
        /* What modules depend on me? */
        struct list_head source_list;
        /* What modules do I depend on? */
        struct list_head target_list;

        /* Destruction function. */
        void (*exit)(void);

        atomic_t refcnt;
#endif

#ifdef CONFIG_MITIGATION_ITS
        /* NOTE(review): presumably a page array and its length - confirm. */
        int its_num_pages;
        void **its_page_array;
#endif

#ifdef CONFIG_CONSTRUCTORS
        /* Constructor functions. */
        ctor_fn_t *ctors;
        unsigned int num_ctors;
#endif

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
        struct error_injection_entry *ei_funcs;
        unsigned int num_ei_funcs;
#endif
} ____cacheline_aligned __randomize_layout;
/* Fall back to an empty initializer if MODULE_ARCH_INIT was not defined earlier. */
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
#endif

#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
/*
 * Generic version, used unless HAVE_ARCH_KALLSYMS_SYMBOL_VALUE is defined:
 * the value of @sym is simply its st_value field.
 */
static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
{
        const unsigned long value = sym->st_value;

        return value;
}
#endif

extern struct mutex module_mutex;

/*
 * A module counts as live in every state except MODULE_STATE_GOING.
 *
 * FIXME: It'd be nice to isolate modules during init, too, so they
 * aren't used before they (may) fail.  But presently too much code
 * (IDE & SCSI) require entry into the module during init.
 */
static inline bool module_is_live(struct module *mod)
{
        if (mod->state == MODULE_STATE_GOING)
                return false;

        return true;
}

struct module *__module_text_address(unsigned long addr);
struct module *__module_address(unsigned long addr);
bool is_module_address(unsigned long addr);
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
bool is_module_percpu_address(unsigned long addr);
bool is_module_text_address(unsigned long addr);

/* True when @addr lies in [core_layout.base, core_layout.base + core_layout.size) of @mod. */
static inline bool within_module_core(unsigned long addr,
                                      const struct module *mod)
{
        const unsigned long start = (unsigned long)mod->core_layout.base;
        const unsigned long end = start + mod->core_layout.size;

        return addr >= start && addr < end;
}

/* True when @addr lies in [init_layout.base, init_layout.base + init_layout.size) of @mod. */
static inline bool within_module_init(unsigned long addr,
                                      const struct module *mod)
{
        const unsigned long start = (unsigned long)mod->init_layout.base;
        const unsigned long end = start + mod->init_layout.size;

        return addr >= start && addr < end;
}

/* True when @addr falls in either the init or the core layout of @mod. */
static inline bool within_module(unsigned long addr, const struct module *mod)
{
        if (within_module_init(addr, mod))
                return true;

        return within_module_core(addr, mod);
}

/* Search for module by name: must be in a RCU-sched critical section. */
struct module *find_module(const char *name);

/*
 * Describes one range of kernel_symbol entries together with its CRC
 * array and licensing class.
 * NOTE(review): field meanings below are read off the names and types
 * only; confirm against the symbol lookup code before relying on them.
 */
struct symsearch {
        /* Bounds of the kernel_symbol range (presumably [start, stop) - confirm). */
        const struct kernel_symbol *start, *stop;
        /* CRC values; presumably parallel to the symbol range - confirm. */
        const s32 *crcs;
        /* Licensing class attached to this range. */
        enum mod_license {
                NOT_GPL_ONLY,
                GPL_ONLY,
                WILL_BE_GPL_ONLY,
        } license;
        bool unused;
};

/*
 * Returns 0 and fills in the caller-supplied output arguments, or
 * -ERANGE if @symnum is out of range.
 * NOTE(review): the previous wording named "defined" and "namebuf",
 * which do not match any current parameter; presumably @value, @type,
 * @name, @module_name and @exported are all outputs - confirm.
 */
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                        char *name, char *module_name, int *exported);

/* Look for this name: can be of form module:name. */
unsigned long module_kallsyms_lookup_name(const char *name);

/* Does not return (__noreturn); takes the module and an exit code. */
extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
                        long code);
/* Convenience form that supplies THIS_MODULE as the module argument. */
#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)

#ifdef CONFIG_MODULE_UNLOAD
int module_refcount(struct module *mod);
void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(__stringify(x))
void symbol_put_addr(void *addr);

/* Sometimes we know we already have a refcount, and it's easier not
   to handle the error case (which only happens with rmmod --wait). */
extern void __module_get(struct module *module);

/* This is the Right Way to get a module: if it fails, it's being removed,
 * so pretend it's not there. */
extern bool try_module_get(struct module *module);

extern void module_put(struct module *module);

#else /*!CONFIG_MODULE_UNLOAD*/
/*
 * Without CONFIG_MODULE_UNLOAD no reference is taken: succeed for a NULL
 * @module, otherwise report whether the module is live.
 */
static inline bool try_module_get(struct module *module)
{
        if (!module)
                return true;

        return module_is_live(module);
}
/* No-op when CONFIG_MODULE_UNLOAD is disabled. */
static inline void module_put(struct module *module) { }
/* No-op when CONFIG_MODULE_UNLOAD is disabled. */
static inline void __module_get(struct module *module) { }
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(p) do { } while (0)

#endif /* CONFIG_MODULE_UNLOAD */

/*
 * This is a #define so the string doesn't get put in every .o file.
 *
 * Evaluates @mod exactly once (into a local) and yields the module's
 * name, or the literal "kernel" when @mod is NULL.
 */
#define module_name(mod)                        \
({                                                \
        struct module *__mod = (mod);                \
        __mod ? __mod->name : "kernel";                \
})

/* Dereference module function descriptor */
void *dereference_module_function_descriptor(struct module *mod, void *ptr);

/* For kallsyms to ask for address resolution.  namebuf should be at
 * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
 * found, otherwise NULL. */
const char *module_address_lookup(unsigned long addr,
                            unsigned long *symbolsize,
                            unsigned long *offset,
                            char **modname,
                            char *namebuf);
int lookup_module_symbol_name(unsigned long addr, char *symname);
int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);

int register_module_notifier(struct notifier_block *nb);
int unregister_module_notifier(struct notifier_block *nb);

extern void print_modules(void);

/* False for a NULL @module; otherwise the module's async_probe_requested flag. */
static inline bool module_requested_async_probing(struct module *module)
{
        if (!module)
                return false;

        return module->async_probe_requested;
}

/*
 * Reports the klp flag of @mod when CONFIG_LIVEPATCH is enabled; always
 * false otherwise.
 */
static inline bool is_livepatch_module(struct module *mod)
{
#ifdef CONFIG_LIVEPATCH
        return mod->klp;
#else /* !CONFIG_LIVEPATCH */
        return false;
#endif /* CONFIG_LIVEPATCH */
}

bool is_module_sig_enforced(void);
void set_module_sig_enforced(void);

#else /* !CONFIG_MODULES... */

/* !CONFIG_MODULES stub: always returns NULL. */
static inline struct module *__module_address(unsigned long addr)
{
        return NULL;
}

/* !CONFIG_MODULES stub: always returns NULL. */
static inline struct module *__module_text_address(unsigned long addr)
{
        return NULL;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool is_module_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool is_module_percpu_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false; @can_addr is left untouched. */
static inline bool __is_module_percpu_address(unsigned long addr,
                                              unsigned long *can_addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool is_module_text_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool within_module_core(unsigned long addr, const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool within_module_init(unsigned long addr, const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool within_module(unsigned long addr,
                                 const struct module *mod)
{
        return false;
}

/*
 * Get/put a kernel symbol (calls should be symmetric).
 *
 * !CONFIG_MODULES versions: symbol_get() redeclares @x as a weak,
 * hidden-visibility extern and yields its address; the put variants
 * expand to empty statements.
 */
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(x) do { } while (0)

/* !CONFIG_MODULES stub: nothing to reference, so this is a no-op. */
static inline void __module_get(struct module *module) { }

/* !CONFIG_MODULES stub: always reports success. */
static inline bool try_module_get(struct module *module)
{
        return true;
}

/* !CONFIG_MODULES stub: no-op. */
static inline void module_put(struct module *module) { }

#define module_name(mod) "kernel"

/*
 * For kallsyms to ask for address resolution.  NULL means not found;
 * this !CONFIG_MODULES stub never finds anything and writes to none of
 * its output arguments.
 */
static inline const char *
module_address_lookup(unsigned long addr, unsigned long *symbolsize,
                      unsigned long *offset, char **modname, char *namebuf)
{
        return NULL;
}

/* !CONFIG_MODULES stub: always fails with -ERANGE. */
static inline int lookup_module_symbol_name(unsigned long addr,
                                            char *symname)
{
        return -ERANGE;
}

/* !CONFIG_MODULES stub: always fails with -ERANGE. */
static inline int lookup_module_symbol_attrs(unsigned long addr,
                                             unsigned long *size,
                                             unsigned long *offset,
                                             char *modname, char *name)
{
        return -ERANGE;
}

/* !CONFIG_MODULES stub: every @symnum is out of range, so always -ERANGE. */
static inline int
module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                   char *name, char *module_name, int *exported)
{
        return -ERANGE;
}

/* !CONFIG_MODULES stub: always returns 0. */
static inline unsigned long module_kallsyms_lookup_name(const char *name)
{
        return 0UL;
}

/*
 * !CONFIG_MODULES stub: no events will happen anyway, so registration
 * can always succeed.
 */
static inline int register_module_notifier(struct notifier_block *nb)
{
        return 0;
}

/* !CONFIG_MODULES stub: always returns 0. */
static inline int unregister_module_notifier(struct notifier_block *nb)
{
        return 0;
}

#define module_put_and_kthread_exit(code) kthread_exit(code)

/* !CONFIG_MODULES stub: prints nothing. */
static inline void print_modules(void) { }

/* !CONFIG_MODULES stub: always false. */
static inline bool module_requested_async_probing(struct module *module)
{
        return false;
}

/* !CONFIG_MODULES stub: always false. */
static inline bool is_module_sig_enforced(void)
{
        return false;
}

/* !CONFIG_MODULES stub: no-op. */
static inline void set_module_sig_enforced(void) { }

/*
 * Dereference module function descriptor.
 * !CONFIG_MODULES stub: hands @ptr back unchanged and ignores @mod.
 */
static inline void *
dereference_module_function_descriptor(struct module *mod, void *ptr)
{
        return ptr;
}

#endif /* CONFIG_MODULES */

#ifdef CONFIG_SYSFS
extern struct kset *module_kset;
extern struct kobj_type module_ktype;
extern int module_sysfs_initialized;
#endif /* CONFIG_SYSFS */

#define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)

/* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */

#define __MODULE_STRING(x) __stringify(x)

#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
                         struct module *);
void module_bug_cleanup(struct module *);

#else        /* !CONFIG_GENERIC_BUG */

/* !CONFIG_GENERIC_BUG stub: no-op. */
static inline void module_bug_finalize(const Elf_Ehdr *hdr,
                                       const Elf_Shdr *sechdrs,
                                       struct module *mod) { }
/* !CONFIG_GENERIC_BUG stub: no-op. */
static inline void module_bug_cleanup(struct module *mod)
{
}
#endif        /* CONFIG_GENERIC_BUG */

#ifdef CONFIG_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else /* !CONFIG_RETPOLINE */
/* Without CONFIG_RETPOLINE every module is acceptable regardless of @has_retpoline. */
static inline bool retpoline_module_ok(bool has_retpoline)
{
        return true;
}
#endif /* CONFIG_RETPOLINE */

/*
 * Reports the sig_ok flag of @module when CONFIG_MODULE_SIG is enabled;
 * always true otherwise.
 */
static inline bool module_sig_ok(struct module *module)
{
#ifdef CONFIG_MODULE_SIG
        return module->sig_ok;
#else        /* !CONFIG_MODULE_SIG */
        return true;
#endif        /* CONFIG_MODULE_SIG */
}

#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS)
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
                                             struct module *, unsigned long),
                                   void *data);
#else
/*
 * Stub used unless both CONFIG_MODULES and CONFIG_KALLSYMS are set:
 * @fn is never called and the result is always -EOPNOTSUPP.
 */
static inline int
module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
                                         struct module *, unsigned long),
                               void *data)
{
        return -EOPNOTSUPP;
}
#endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */

#endif /* _LINUX_MODULE_H */













































    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

/*
 * State kept for one PID namespace.
 * NOTE(review): only the comments not marked as assumptions are
 * established by this header; the rest should be confirmed against the
 * implementation.
 */
struct pid_namespace {
        struct kref kref;        /* reference count; get_pid_ns() takes it for non-init namespaces */
        struct idr idr;
        struct rcu_head rcu;
        unsigned int pid_allocated;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        unsigned int level;        /* presumably nesting depth, limited by MAX_PID_NS_LEVEL - confirm */
        struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;        /* only present with CONFIG_BSD_PROCESS_ACCT */
#endif
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        int reboot;        /* group exit code if this pidns was rebooted */
        struct ns_common ns;
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
/*
 * Take a reference on @ns and return it.  init_pid_ns is returned
 * without touching its kref.
 */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        if (ns == &init_pid_ns)
                return ns;

        kref_get(&ns->kref);
        return ns;
}

extern struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* !CONFIG_PID_NS stub: returns @ns unchanged, no reference counting. */
static inline struct pid_namespace *
get_pid_ns(struct pid_namespace *ns)
{
        return ns;
}

/*
 * !CONFIG_PID_NS stub: a request carrying CLONE_NEWPID yields
 * ERR_PTR(-EINVAL); anything else gets @ns back unchanged.
 */
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns)
{
        if (flags & CLONE_NEWPID)
                return ERR_PTR(-EINVAL);

        return ns;
}

/* !CONFIG_PID_NS stub: no-op. */
static inline void put_pid_ns(struct pid_namespace *ns) { }

/* !CONFIG_PID_NS stub: reaching this is treated as a bug. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
        BUG();
}

/* !CONFIG_PID_NS stub: always returns 0. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns,
                                int cmd)
{
        return 0;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);

#endif /* _LINUX_PID_NS_H */










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NAMEI_H
#define _LINUX_NAMEI_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>

enum { MAX_NESTED_LINKS = 8 };

#define MAXSYMLINKS 40

/*
 * Type of the last component on LOOKUP_PARENT
 */
enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};

/* pathwalk mode */
#define LOOKUP_FOLLOW                0x0001        /* follow links at the end */
#define LOOKUP_DIRECTORY        0x0002        /* require a directory */
#define LOOKUP_AUTOMOUNT        0x0004  /* force terminal automount */
#define LOOKUP_EMPTY                0x4000        /* accept empty path [user_... only] */
#define LOOKUP_DOWN                0x8000        /* follow mounts in the starting point */
#define LOOKUP_MOUNTPOINT        0x0080        /* follow mounts in the end */

#define LOOKUP_REVAL                0x0020        /* tell ->d_revalidate() to trust no cache */
#define LOOKUP_RCU                0x0040        /* RCU pathwalk mode; semi-internal */

/* These tell filesystem methods that we are dealing with the final component... */
#define LOOKUP_OPEN                0x0100        /* ... in open */
#define LOOKUP_CREATE                0x0200        /* ... in object creation */
#define LOOKUP_EXCL                0x0400        /* ... in exclusive creation */
#define LOOKUP_RENAME_TARGET        0x0800        /* ... in destination of rename() */

/* internal use only */
#define LOOKUP_PARENT                0x0010
#define LOOKUP_JUMPED                0x1000
#define LOOKUP_ROOT                0x2000
#define LOOKUP_ROOT_GRABBED        0x0008

/* Scoping flags for lookup. */
#define LOOKUP_NO_SYMLINKS        0x010000 /* No symlink crossing. */
#define LOOKUP_NO_MAGICLINKS        0x020000 /* No nd_jump_link() crossing. */
#define LOOKUP_NO_XDEV                0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH                0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT                0x100000 /* Treat dirfd as fs root. */
#define LOOKUP_CACHED                0x200000 /* Only do cached lookup */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)

extern int path_pts(struct path *path);

extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);

/*
 * Same as user_path_at_empty() with a NULL @empty argument; the return
 * value is passed through unchanged.
 */
static inline int user_path_at(int dfd, const char __user *name,
                               unsigned flags, struct path *path)
{
        return user_path_at_empty(dfd, name, flags, path, NULL);
}

extern int kern_path(const char *, unsigned, struct path *);

extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);

extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);

extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
extern int follow_up(struct path *);

extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);

extern int __must_check nd_jump_link(struct path *path);

/*
 * Write a NUL byte into @name at offset min(@len, @maxlen), treating
 * @name as a char buffer.
 */
static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
{
        char *buf = name;

        buf[min(len, maxlen)] = '\0';
}

/**
 * retry_estale - decide whether an operation is worth retrying
 * @error: the error that would currently be returned
 * @flags: flags being used for next lookup attempt
 *
 * A retry is only suggested when @error is -ESTALE and @flags does not
 * already contain LOOKUP_REVAL.
 *
 * Returns true if the caller should try the operation again.
 */
static inline bool
retry_estale(const long error, const unsigned int flags)
{
        if (error != -ESTALE)
                return false;

        return !(flags & LOOKUP_REVAL);
}

#endif /* _LINUX_NAMEI_H */























































    1 




















































































    1 


    3 


    1 

    3 












    2 













































































































































































































    1 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

/* Take a reference on @bdi through its refcnt kref and return @bdi. */
static inline struct backing_dev_info *
bdi_get(struct backing_dev_info *bdi)
{
        kref_get(&bdi->refcnt);

        return bdi;
}

struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
                    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);
void wb_wakeup_delayed(struct bdi_writeback *wb);

void wb_wait_for_completion(struct wb_completion *done);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;
extern struct workqueue_struct *bdi_async_bio_wq;

/* Tests the WB_has_dirty_io bit in @wb->state. */
static inline bool
wb_has_dirty_io(struct bdi_writeback *wb)
{
        return test_bit(WB_has_dirty_io, &wb->state);
}

/*
 * True when @bdi->tot_write_bandwidth is non-zero.
 *
 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
 * any dirty wbs.  See wb_update_write_bandwidth().
 */
static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return atomic_long_read(&bdi->tot_write_bandwidth) != 0;
}

/*
 * Add @amount to the @item counter of @wb, using WB_STAT_BATCH as the
 * batch argument of percpu_counter_add_batch().
 */
static inline void
__add_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount)
{
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}

/* Add 1 to the @item counter of @wb. */
static inline void inc_wb_stat(struct bdi_writeback *wb,
                               enum wb_stat_item item)
{
        __add_wb_stat(wb, item, 1);
}

/* Add -1 to the @item counter of @wb. */
static inline void dec_wb_stat(struct bdi_writeback *wb,
                               enum wb_stat_item item)
{
        __add_wb_stat(wb, item, -1);
}

/* Result of percpu_counter_read_positive() on the @item counter of @wb. */
static inline s64
wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        return percpu_counter_read_positive(&wb->stat[item]);
}

/* Result of percpu_counter_sum_positive() on the @item counter of @wb. */
static inline s64
wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
        return percpu_counter_sum_positive(&wb->stat[item]);
}

extern void wb_writeout_inc(struct bdi_writeback *wb);

/*
 * Maximal error of a stat counter: nr_cpu_ids * WB_STAT_BATCH on SMP
 * builds, 1 otherwise.
 */
static inline unsigned long wb_stat_error(void)
{
        unsigned long max_error = 1;

#ifdef CONFIG_SMP
        max_error = nr_cpu_ids * WB_STAT_BATCH;
#endif
        return max_error;
}

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);

/*
 * Flags in backing_dev_info::capability
 *
 * BDI_CAP_WRITEBACK:                Supports dirty page writeback, and dirty pages
 *                                should contribute to accounting
 * BDI_CAP_WRITEBACK_ACCT:        Automatically account writeback pages
 * BDI_CAP_STRICTLIMIT:                Keep number of dirty pages below bdi threshold
 */
#define BDI_CAP_WRITEBACK                (1 << 0)
#define BDI_CAP_WRITEBACK_ACCT                (1 << 1)
#define BDI_CAP_STRICTLIMIT                (1 << 2)

extern struct backing_dev_info noop_backing_dev_info;

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @wb: bdi_writeback of interest
 *
 * Tests the WB_writeback_running bit in @wb->state, i.e. whether there
 * is writeback waiting to be handled against a bdi_writeback.
 */
static inline bool
writeback_in_progress(struct bdi_writeback *wb)
{
        return test_bit(WB_writeback_running, &wb->state);
}

/*
 * Map @inode to its backing_dev_info:
 *  - a NULL @inode maps to noop_backing_dev_info;
 *  - with CONFIG_BLOCK, an inode on the blkdev superblock uses the
 *    bd_bdi of its block device;
 *  - everything else uses the s_bdi of the inode's superblock.
 */
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
        struct super_block *super;

        if (inode == NULL)
                return &noop_backing_dev_info;

        super = inode->i_sb;
#ifdef CONFIG_BLOCK
        if (sb_is_blkdev_sb(super))
                return I_BDEV(inode)->bd_bdi;
#endif
        return super->s_bdi;
}

/* Bits of @cong_bits that are currently set in @wb->congested. */
static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
{
        return cong_bits & wb->congested;
}

long congestion_wait(int sync, long timeout);
long wait_iff_congested(int sync, long timeout);

static inline bool mapping_can_writeback(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}

/* Call schedule() once and return 0; @word is not used. */
static inline int
bdi_sched_wait(void *word)
{
        schedule();

        return 0;
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);
int inode_congested(struct inode *inode, int cong_bits);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);

        /* Both controllers must sit on the default hierarchy. */
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return false;
        if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
                return false;

        /* The bdi must do writeback at all ... */
        if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
                return false;

        /* ... and the superblock must carry SB_I_CGROUPWB. */
        return (inode->i_sb->s_iflags & SB_I_CGROUPWB) != 0;
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct cgroup_subsys_state *css = task_css(current, memory_cgrp_id);
        struct bdi_writeback *found;

        /* A memcg css without a parent maps to the wb embedded in @bdi. */
        if (!css->parent)
                return &bdi->wb;

        found = radix_tree_lookup(&bdi->cgwb_tree, css->id);

        /*
         * %current's blkcg equals the effective blkcg of its memcg.  No
         * need to use the relatively expensive cgroup_get_e_css().
         */
        if (unlikely(!found ||
                     found->blkcg_css != task_css(current, io_cgrp_id)))
                return NULL;

        return found;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;

        /* Fast path: look the wb up and try to pin it under RCU. */
        rcu_read_lock();
        wb = wb_find_current(bdi);
        if (wb && unlikely(!wb_tryget(wb)))
                wb = NULL;
        rcu_read_unlock();

        if (likely(wb))
                return wb;

        /* Slow path: hold the memcg css across wb_get_create(). */
        memcg_css = task_get_css(current, memory_cgrp_id);
        wb = wb_get_create(bdi, memcg_css, gfp);
        css_put(memcg_css);

        return wb;
}

/**
 * inode_to_wb_is_valid - test whether an inode has a wb associated
 * @inode: inode of interest
 *
 * Returns %true if @inode->i_wb is non-NULL, i.e. @inode has a wb
 * associated.  May be called without any locking.
 */
static inline bool inode_to_wb_is_valid(struct inode *inode)
{
        return inode->i_wb != NULL;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with.  The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /*
         * Warn once when lock debugging is active, the superblock has
         * SB_I_CGROUPWB set, and none of the three locks named above is
         * held.
         */
        WARN_ON_ONCE(debug_locks &&
                     (inode->i_sb->s_iflags & SB_I_CGROUPWB) &&
                     (!lockdep_is_held(&inode->i_lock) &&
                      !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
                      !lockdep_is_held(&inode->i_wb->list_lock)));
#endif
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 *
 * Returns @inode->i_wb.  On return the RCU read lock is held, and the
 * i_pages lock is additionally held (IRQ state saved in @cookie->flags)
 * when I_WB_SWITCH was observed.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        rcu_read_lock();

        /*
         * Paired with store_release in inode_switch_wbs_work_fn() and
         * ensures that we see the new wb if we see cleared I_WB_SWITCH.
         */
        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

        /* A switch is in flight: fall back to taking the i_pages lock. */
        if (unlikely(cookie->locked))
                xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

        /*
         * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
         * lock.  inode_to_wb() will bark.  Deref directly.
         */
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 *
 * Drops the i_pages lock if @cookie records that it was taken, then
 * leaves the RCU read-side critical section.
 */
static inline void
unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie)
{
        if (unlikely(cookie->locked)) {
                xa_unlock_irqrestore(&inode->i_mapping->i_pages,
                                     cookie->flags);
        }

        rcu_read_unlock();
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: cgroup writeback is never enabled. */
static inline bool
inode_cgwb_enabled(struct inode *inode)
{
        return false;
}

/* !CONFIG_CGROUP_WRITEBACK stub: always the wb embedded in @bdi. */
static inline struct bdi_writeback *
wb_find_current(struct backing_dev_info *bdi)
{
        return &bdi->wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: nothing is looked up or created; the wb
 * embedded in @bdi is returned and @gfp is ignored.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        return &bdi->wb;
}

/* !CONFIG_CGROUP_WRITEBACK stub: unconditionally reports true for @inode. */
static inline bool inode_to_wb_is_valid(struct inode *inode)
{
        return true;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: returns the wb embedded in whatever
 * inode_to_bdi() yields for @inode.
 */
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
        return &inode_to_bdi(inode)->wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: takes no lock and never touches
 * @cookie; it simply forwards to inode_to_wb().
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        return inode_to_wb(inode);
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: the matching begin function acquires
 * nothing, so there is nothing to release here.
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally empty. */
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally empty. */
static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: asks wb_congested() about @cong_bits on
 * the wb embedded in the bdi that inode_to_bdi() returns for @inode.
 */
static inline int inode_congested(struct inode *inode, int cong_bits)
{
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/* Query inode_congested() for the WB_sync_congested bit only. */
static inline int inode_read_congested(struct inode *inode)
{
        const int sync_mask = 1 << WB_sync_congested;

        return inode_congested(inode, sync_mask);
}

/* Query inode_congested() for the WB_async_congested bit only. */
static inline int inode_write_congested(struct inode *inode)
{
        const int async_mask = 1 << WB_async_congested;

        return inode_congested(inode, async_mask);
}

/*
 * Query inode_congested() for both the WB_sync_congested and the
 * WB_async_congested bits in one call.
 */
static inline int inode_rw_congested(struct inode *inode)
{
        const int sync_mask = 1 << WB_sync_congested;
        const int async_mask = 1 << WB_async_congested;

        return inode_congested(inode, sync_mask | async_mask);
}

/* Ask wb_congested() about @cong_bits on the wb embedded in @bdi. */
static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
{
        struct bdi_writeback *wb = &bdi->wb;

        return wb_congested(wb, cong_bits);
}

/* Query bdi_congested() for the WB_sync_congested bit only. */
static inline int bdi_read_congested(struct backing_dev_info *bdi)
{
        const int sync_mask = 1 << WB_sync_congested;

        return bdi_congested(bdi, sync_mask);
}

/* Query bdi_congested() for the WB_async_congested bit only. */
static inline int bdi_write_congested(struct backing_dev_info *bdi)
{
        const int async_mask = 1 << WB_async_congested;

        return bdi_congested(bdi, async_mask);
}

/*
 * Query bdi_congested() for both the WB_sync_congested and the
 * WB_async_congested bits in one call.
 */
static inline int bdi_rw_congested(struct backing_dev_info *bdi)
{
        const int sync_mask = 1 << WB_sync_congested;
        const int async_mask = 1 << WB_async_congested;

        return bdi_congested(bdi, sync_mask | async_mask);
}

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif        /* _LINUX_BACKING_DEV_H */


























































































































































































    1 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: integrity_iint.c
 *        - implements the integrity hooks: integrity_inode_alloc,
 *          integrity_inode_free
 *        - cache integrity information associated with an inode
 *          using an rbtree.
 */
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/lsm_hooks.h>
#include "integrity.h"

static struct rb_root integrity_iint_tree = RB_ROOT;
static DEFINE_RWLOCK(integrity_iint_lock);
static struct kmem_cache *iint_cache __read_mostly;

struct dentry *integrity_dir;

/*
 * __integrity_iint_find - look up the iint keyed by @inode in the rbtree
 *
 * Walks integrity_iint_tree comparing inode pointer values and returns the
 * matching entry, or NULL when no node carries @inode.  No locking is done
 * here; both callers in this file hold integrity_iint_lock around the call.
 */
static struct integrity_iint_cache *__integrity_iint_find(struct inode *inode)
{
        struct rb_node *cur;

        for (cur = integrity_iint_tree.rb_node; cur; ) {
                struct integrity_iint_cache *entry;

                entry = rb_entry(cur, struct integrity_iint_cache, rb_node);
                if (entry->inode == inode)
                        return entry;

                /* Smaller keys live in the left subtree, larger in the right. */
                cur = (inode < entry->inode) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * integrity_iint_find - return the iint associated with an inode
 *
 * Inodes that fail the IS_IMA() test are answered with NULL without touching
 * the tree.  Otherwise the lookup runs under the read side of
 * integrity_iint_lock; the lock is released before the result is returned.
 */
struct integrity_iint_cache *integrity_iint_find(struct inode *inode)
{
        struct integrity_iint_cache *found = NULL;

        if (IS_IMA(inode)) {
                read_lock(&integrity_iint_lock);
                found = __integrity_iint_find(inode);
                read_unlock(&integrity_iint_lock);
        }

        return found;
}

#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1)

/*
 * It is not clear that IMA should be nested at all, but as long as it measures
 * files both on overlayfs and on underlying fs, we need to annotate the iint
 * mutex to avoid lockdep false positives related to IMA + overlayfs.
 * See ovl_lockdep_annotate_inode_mutex_key() for more details.
 *
 * The lock class is chosen by the superblock's s_stack_depth, one class per
 * depth up to IMA_MAX_NESTING.  Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint,
                                         struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* One lock class key per stacking depth. */
        static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING];

        int depth = inode->i_sb->s_stack_depth;

        /* Out-of-range depth: warn once and fall back to class 0. */
        if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
                depth = 0;

        lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]);
#endif
}

/*
 * iint_init_always - put a freshly allocated iint into its starting state
 * @iint: object to initialise
 * @inode: inode the object is being created for (used for lockdep class)
 *
 * Resets the hash pointer, version, flags and measured PCRs, marks every
 * cached status as INTEGRITY_UNKNOWN, then sets up the mutex and its
 * lockdep class.  Does not touch iint->inode or iint->rb_node.
 */
static void iint_init_always(struct integrity_iint_cache *iint,
                             struct inode *inode)
{
        /* All cached verdicts start out unknown. */
        iint->evm_status = INTEGRITY_UNKNOWN;
        iint->ima_creds_status = INTEGRITY_UNKNOWN;
        iint->ima_read_status = INTEGRITY_UNKNOWN;
        iint->ima_bprm_status = INTEGRITY_UNKNOWN;
        iint->ima_mmap_status = INTEGRITY_UNKNOWN;
        iint->ima_file_status = INTEGRITY_UNKNOWN;

        /* Bookkeeping fields. */
        iint->measured_pcrs = 0;
        iint->atomic_flags = 0UL;
        iint->flags = 0UL;
        iint->version = 0;
        iint->ima_hash = NULL;

        /* The mutex must exist before its lockdep class can be assigned. */
        mutex_init(&iint->mutex);
        iint_lockdep_annotate(iint, inode);
}

/*
 * iint_free - release an iint and what it owns
 *
 * Destroys the mutex, frees the attached ima_hash buffer and hands the
 * object back to iint_cache.  The caller must already have unlinked @iint
 * from the tree; nothing here touches integrity_iint_tree.
 */
static void iint_free(struct integrity_iint_cache *iint)
{
        mutex_destroy(&iint->mutex);
        kfree(iint->ima_hash);

        kmem_cache_free(iint_cache, iint);
}

/**
 * integrity_inode_get - find or allocate an iint associated with an inode
 * @inode: pointer to the inode
 *
 * Caller must lock i_mutex
 *
 * Return: the iint already cached for @inode, a newly inserted one, or NULL
 * if the allocation failed.
 */
struct integrity_iint_cache *integrity_inode_get(struct inode *inode)
{
        struct rb_node **link;
        struct rb_node *parent = NULL;
        struct integrity_iint_cache *fresh, *cur;

        /*
         * The integrity's "iint_cache" is initialized at security_init(),
         * unless it is not included in the ordered list of LSMs enabled
         * on the boot command line.
         */
        if (!iint_cache)
                panic("%s: lsm=integrity required.\n", __func__);

        /* Fast path: an entry already exists. */
        cur = integrity_iint_find(inode);
        if (cur)
                return cur;

        /* Allocate and initialise outside the write lock. */
        fresh = kmem_cache_alloc(iint_cache, GFP_NOFS);
        if (!fresh)
                return NULL;

        iint_init_always(fresh, inode);

        write_lock(&integrity_iint_lock);

        /* Descend to the insertion point, re-checking for a duplicate. */
        for (link = &integrity_iint_tree.rb_node; *link; ) {
                parent = *link;
                cur = rb_entry(parent, struct integrity_iint_cache, rb_node);

                if (cur->inode == inode) {
                        /*
                         * An entry appeared since the lookup above: discard
                         * ours and return the one already in the tree.
                         * NOTE(review): this path frees without
                         * mutex_destroy(), unlike iint_free() - confirm
                         * that is intended.
                         */
                        write_unlock(&integrity_iint_lock);
                        kmem_cache_free(iint_cache, fresh);
                        return cur;
                }

                link = (inode < cur->inode) ? &parent->rb_left
                                            : &parent->rb_right;
        }

        /* Not present: link the new entry and flag the inode with S_IMA. */
        fresh->inode = inode;
        inode->i_flags |= S_IMA;
        rb_link_node(&fresh->rb_node, parent, link);
        rb_insert_color(&fresh->rb_node, &integrity_iint_tree);

        write_unlock(&integrity_iint_lock);
        return fresh;
}

/**
 * integrity_inode_free - called on security_inode_free
 * @inode: pointer to the inode
 *
 * Free the integrity information(iint) associated with an inode.
 *
 * Inodes that fail the IS_IMA() test are ignored.  Otherwise the entry is
 * looked up and unlinked under the write side of integrity_iint_lock and
 * freed after the lock is dropped.  If no entry is found for a flagged
 * inode, the function returns without touching the tree rather than
 * dereferencing a NULL lookup result.
 */
void integrity_inode_free(struct inode *inode)
{
        struct integrity_iint_cache *iint;

        if (!IS_IMA(inode))
                return;

        write_lock(&integrity_iint_lock);
        iint = __integrity_iint_find(inode);
        if (!iint) {
                /* Nothing cached for this inode: nothing to erase or free. */
                write_unlock(&integrity_iint_lock);
                return;
        }
        rb_erase(&iint->rb_node, &integrity_iint_tree);
        write_unlock(&integrity_iint_lock);

        iint_free(iint);
}

/*
 * Constructor passed to kmem_cache_create() for iint_cache: zero-fills the
 * object pointed to by @foo.
 */
static void iint_init_once(void *foo)
{
        memset(foo, 0, sizeof(struct integrity_iint_cache));
}

/*
 * Create the slab cache backing all iint objects.  SLAB_PANIC is passed, so
 * the result is not checked here; iint_init_once() is the object
 * constructor.  Always returns 0.
 */
static int __init integrity_iintcache_init(void)
{
        iint_cache =
            kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache),
                              0, SLAB_PANIC, iint_init_once);
        return 0;
}
/* Register "integrity" as an LSM whose init hook creates the cache above. */
DEFINE_LSM(integrity) = {
        .name = "integrity",
        .init = integrity_iintcache_init,
};


/*
 * integrity_kernel_read - read data from the file
 * @file: file to read from
 * @offset: position in @file to start reading at
 * @addr: destination buffer
 * @count: number of bytes requested
 *
 * This is a function for reading file content instead of kernel_read().
 * It does not perform locking checks to ensure it cannot be blocked.
 * It does not perform security checks because it is irrelevant for IMA.
 *
 * @offset is passed by value, so the caller's position is never updated.
 * Returns whatever __kernel_read() returns.
 */
int integrity_kernel_read(struct file *file, loff_t offset,
                          void *addr, unsigned long count)
{
        return __kernel_read(file, addr, count, &offset);
}

/*
 * integrity_load_keys - load integrity keys hook
 *
 * This hook is called from init/main.c:kernel_init_freeable()
 * when rootfs is ready.  It loads the IMA X.509 key first, then the EVM one.
 */
void __init integrity_load_keys(void)
{
        ima_load_x509();
        evm_load_x509();
}

/*
 * integrity_fs_init - create the top-level "integrity" securityfs directory
 *
 * On success the dentry is published in integrity_dir and 0 is returned.
 * On failure integrity_dir is reset to NULL and the error code is returned;
 * the failure is logged unless the error is -ENODEV.
 */
static int __init integrity_fs_init(void)
{
        int err;

        integrity_dir = securityfs_create_dir("integrity", NULL);
        if (!IS_ERR(integrity_dir))
                return 0;

        /* Extract the errno before the ERR_PTR value is overwritten. */
        err = PTR_ERR(integrity_dir);
        integrity_dir = NULL;

        if (err != -ENODEV)
                pr_err("Unable to create integrity sysfs dir: %d\n", err);

        return err;
}

late_initcall(integrity_fs_init)
















































































































































































































    1 












    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H

#include <linux/cgroup.h>
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/fs_parser.h>

#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);

/*
 * cgroup_path() takes a spin lock. It is good practice not to take
 * spin locks within trace point handlers, as they are mostly hidden
 * from normal view. As cgroup_path() can take the kernfs_rename_lock
 * spin lock, it is best to not call that function from the trace event
 * handler.
 *
 * Instead, this macro formats the path at the call site: when the trace
 * event is enabled it takes trace_cgroup_path_lock with IRQs saved, writes
 * the path of @cgrp into the shared trace_cgroup_path buffer (at most
 * TRACE_CGROUP_PATH_LEN bytes), fires trace_cgroup_<type>() with that
 * buffer plus any extra arguments, and releases the lock.
 *
 * Note: trace_cgroup_##type##_enabled() is a static branch that will only
 *       be set when the trace event is enabled.
 */
#define TRACE_CGROUP_PATH(type, cgrp, ...)                                \
        do {                                                                \
                if (trace_cgroup_##type##_enabled()) {                        \
                        unsigned long flags;                                \
                        spin_lock_irqsave(&trace_cgroup_path_lock,        \
                                          flags);                        \
                        cgroup_path(cgrp, trace_cgroup_path,                \
                                    TRACE_CGROUP_PATH_LEN);                \
                        trace_cgroup_##type(cgrp, trace_cgroup_path,        \
                                            ##__VA_ARGS__);                \
                        spin_unlock_irqrestore(&trace_cgroup_path_lock, \
                                               flags);                        \
                }                                                        \
        } while (0)

/*
 * The cgroup filesystem superblock creation/mount context.
 *
 * The kernfs context is embedded as @kfc; cgroup_fc2context() recovers this
 * structure from it with container_of().
 */
struct cgroup_fs_context {
        struct kernfs_fs_context kfc;                /* embedded kernfs context */
        struct cgroup_root        *root;
        struct cgroup_namespace        *ns;
        unsigned int        flags;                        /* CGRP_ROOT_* flags */

        /* cgroup1 bits */
        bool                cpuset_clone_children;
        bool                none;                        /* User explicitly requested empty subsystem */
        bool                all_ss;                        /* Seen 'all' option */
        u16                subsys_mask;                /* Selected subsystems */
        char                *name;                        /* Hierarchy name */
        char                *release_agent;                /* Path for release notifications */
};

/*
 * Map a generic fs_context to the cgroup context that embeds the kernfs
 * context stored in fc->fs_private.
 */
static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
{
        struct kernfs_fs_context *kernfs_ctx;

        kernfs_ctx = fc->fs_private;
        return container_of(kernfs_ctx, struct cgroup_fs_context, kfc);
}

struct cgroup_pidlist;

/*
 * Per-file context.  State is grouped into one anonymous struct per user
 * (psi, procs, procs1).
 * NOTE(review): presumably one instance exists per open cgroup file and
 * each group belongs to the matching file type - confirm against the users
 * in cgroup.c / cgroup-v1.c.
 */
struct cgroup_file_ctx {
        struct cgroup_namespace        *ns;

        struct {
                void                        *trigger;
        } psi;

        struct {
                bool                        started;
                struct css_task_iter        iter;
        } procs;

        struct {
                struct cgroup_pidlist        *pidlist;
        } procs1;
};

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 *
 * Each link therefore sits on two lists at once: one headed in the cgroup
 * and one headed in the css_set.
 */
struct cgrp_cset_link {
        /* the cgroup and css_set this link associates */
        struct cgroup                *cgrp;
        struct css_set                *cset;

        /* list of cgrp_cset_links anchored at cgrp->cset_links */
        struct list_head        cset_link;

        /* list of cgrp_cset_links anchored at css_set->cgrp_links */
        struct list_head        cgrp_link;
};

/* used to track tasks and csets during migration */
struct cgroup_taskset {
        /* the src and dst cset list running through cset->mg_node */
        struct list_head        src_csets;
        struct list_head        dst_csets;

        /* the number of tasks in the set */
        int                        nr_tasks;

        /* the subsys currently being processed */
        int                        ssid;

        /*
         * Fields for cgroup_taskset_*() iteration.
         *
         * Before migration is committed, the target migration tasks are on
         * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
         * the csets on ->dst_csets.  ->csets points to either ->src_csets
         * or ->dst_csets depending on whether migration is committed.
         *
         * ->cur_cset and ->cur_task point to the current task position
         * during iteration.
         */
        struct list_head        *csets;
        struct css_set                *cur_cset;
        struct task_struct        *cur_task;
};

/*
 * migration context also tracks preloading
 *
 * The member order matters: CGROUP_MGCTX_INIT() initialises the first three
 * members positionally.
 */
struct cgroup_mgctx {
        /*
         * Preloaded source and destination csets.  Used to guarantee
         * atomic success or failure on actual migration.
         */
        struct list_head        preloaded_src_csets;
        struct list_head        preloaded_dst_csets;

        /* tasks and csets to migrate */
        struct cgroup_taskset        tset;

        /* subsystems affected by migration */
        u16                        ss_mask;
};

/*
 * Static initialiser for a struct cgroup_taskset named by @tset: both cset
 * lists start empty and ->csets points at ->src_csets.  Members not listed
 * are zero-initialised.
 */
#define CGROUP_TASKSET_INIT(tset)                                                \
{                                                                                \
        .src_csets                = LIST_HEAD_INIT(tset.src_csets),                \
        .dst_csets                = LIST_HEAD_INIT(tset.dst_csets),                \
        .csets                        = &tset.src_csets,                                \
}

/*
 * Static initialiser for a struct cgroup_mgctx named @name.  The entries are
 * positional, so they must stay in the same order as the first three members
 * of struct cgroup_mgctx (preloaded_src_csets, preloaded_dst_csets, tset).
 */
#define CGROUP_MGCTX_INIT(name)                                                        \
{                                                                                \
        LIST_HEAD_INIT(name.preloaded_src_csets),                                \
        LIST_HEAD_INIT(name.preloaded_dst_csets),                                \
        CGROUP_TASKSET_INIT(name.tset),                                                \
}

/* Define a struct cgroup_mgctx variable @name initialised as above. */
#define DEFINE_CGROUP_MGCTX(name)                                                \
        struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;

/*
 * iterate across the hierarchies
 *
 * Walks cgroup_roots through each root's ->root_list with the RCU list
 * iterator; holding cgroup_mutex is given to lockdep as the condition under
 * which the walk is valid.
 */
#define for_each_root(root)                                                \
        list_for_each_entry_rcu((root), &cgroup_roots, root_list,        \
                                lockdep_is_held(&cgroup_mutex))

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * @ss is assigned from cgroup_subsys[@ssid] inside the loop condition; the
 * trailing "|| true" keeps the condition true whatever pointer value was
 * assigned, so the loop ends only when @ssid reaches CGROUP_SUBSYS_COUNT.
 */
#define for_each_subsys(ss, ssid)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
             (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

/* A cgroup counts as dead when CSS_ONLINE is clear in its self css flags. */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
        return (cgrp->self.flags & CSS_ONLINE) == 0;
}

/* True when the CGRP_NOTIFY_ON_RELEASE bit is set in @cgrp->flags. */
static inline bool notify_on_release(const struct cgroup *cgrp)
{
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

void put_css_set_locked(struct css_set *cset);

/*
 * Drop a reference on @cset.  A put that may be the last one is routed
 * through put_css_set_locked() with css_set_lock held and IRQs disabled.
 */
static inline void put_css_set(struct css_set *cset)
{
        unsigned long flags;

        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but open-coded
         * for the css_set_lock spinlock taken with IRQs saved.
         */
        if (refcount_dec_not_one(&cset->refcount))
                return;

        /* Possibly the final reference: finish the put under the lock. */
        spin_lock_irqsave(&css_set_lock, flags);
        put_css_set_locked(cset);
        spin_unlock_irqrestore(&css_set_lock, flags);
}

/*
 * refcounted get/put for css_set objects
 *
 * Takes one additional reference on @cset; pair with put_css_set().
 */
static inline void get_css_set(struct css_set *cset)
{
        refcount_inc(&cset->refcount);
}

bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
bool cgroup_is_thread_root(struct cgroup *cgrp);
bool cgroup_is_threaded(struct cgroup *cgrp);

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root);
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns);

void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);

int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx);
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx);

int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *locked)
        __acquires(&cgroup_threadgroup_rwsem);
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
        __releases(&cgroup_threadgroup_rwsem);

void cgroup_lock_and_drain_offline(struct cgroup *cgrp);

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root);

int __cgroup_task_count(const struct cgroup *cgrp);
int cgroup_task_count(const struct cgroup *cgrp);

/*
 * rstat.c
 */
int cgroup_rstat_init(struct cgroup *cgrp);
void cgroup_rstat_exit(struct cgroup *cgrp);
void cgroup_rstat_boot(void);
void cgroup_base_stat_cputime_show(struct seq_file *seq);

/*
 * namespace.c
 */
extern const struct proc_ns_operations cgroupns_operations;

/*
 * cgroup-v1.c
 */
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
extern const struct fs_parameter_spec cgroup1_fs_parameters[];

int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
int cgroup1_get_tree(struct fs_context *fc);
int cgroup1_reconfigure(struct fs_context *ctx);

#endif /* __CGROUP_INTERNAL_H */




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PATH_H
#define _LINUX_PATH_H

struct dentry;
struct vfsmount;

/*
 * A (mount, dentry) pair.  path_equal() treats two paths as the same only
 * when both pointers match.
 */
struct path {
        struct vfsmount *mnt;
        struct dentry *dentry;
} __randomize_layout;

extern void path_get(const struct path *);
extern void path_put(const struct path *);

/*
 * Compare two paths by pointer identity.  Returns 1 when both the mount and
 * the dentry pointers are equal, 0 otherwise.
 */
static inline int path_equal(const struct path *path1, const struct path *path2)
{
        if (path1->mnt != path2->mnt)
                return 0;

        return path1->dentry == path2->dentry;
}

/*
 * Call path_put() on @path and then reset it to an all-zero struct path,
 * so both pointers are NULL afterwards.
 */
static inline void path_put_init(struct path *path)
{
        path_put(path);
        *path = (struct path) { };
}

#endif  /* _LINUX_PATH_H */


























































































































































































































































































    4 


















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Runtime locking correctness validator
 *
 *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * see Documentation/locking/lockdep-design.rst for more details.
 */
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H

#include <linux/lockdep_types.h>
#include <linux/smp.h>
#include <asm/percpu.h>

struct task_struct;

/* for sysctl */
extern int prove_locking;
extern int lock_stat;

#ifdef CONFIG_LOCKDEP

#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>

/*
 * Copy the lockdep map @from into @to, then wipe every class_cache slot of
 * the copy.
 */
static inline void lockdep_copy_map(struct lockdep_map *to,
                                    struct lockdep_map *from)
{
        int slot = 0;

        *to = *from;

        /*
         * The class cache may be updated concurrently, so the plain copy
         * above could have picked up torn pointers (a 64bit arch copying
         * with 32bit insns).  Drop whatever was copied and accept the
         * performance hit of repopulating the cache.
         *
         * XXX this interacts badly with lockdep_set_class_and_subclass(),
         *     which relies on abusing the cache.
         */
        while (slot < NR_LOCKDEP_CACHING_CLASSES)
                to->class_cache[slot++] = NULL;
}

/*
 * Every lock has a list of other locks that were taken after it.
 * We only grow the list, never remove from it:
 *
 * NOTE(review): @class / @links_to look like the two endpoints of one
 * dependency edge - confirm against kernel/locking/lockdep.c.
 */
struct lock_list {
        struct list_head                entry;
        struct lock_class                *class;
        struct lock_class                *links_to;
        const struct lock_trace                *trace;
        u16                                distance;
        /* bitmap of different dependencies from head to this */
        u8                                dep;
        /* used by BFS to record whether "prev -> this" only has -(*R)-> */
        u8                                only_xr;

        /*
         * The parent field is used to implement breadth-first search, and the
         * bit 0 is reused to indicate if the lock has been accessed in BFS.
         */
        struct lock_list                *parent;
};

/**
 * struct lock_chain - lock dependency chain record
 *
 * @irq_context: the same as irq_context in held_lock below
 * @depth:       the number of held locks in this chain
 * @base:        the index in chain_hlocks for this chain
 * @entry:       the collided lock chains in lock_chain hash list
 * @chain_key:   the hash key of this lock_chain
 *
 * The first three members are bitfields of 2, 6 and 24 bits that together
 * fill one 32-bit unsigned int.
 */
struct lock_chain {
        /* see BUILD_BUG_ON()s in add_chain_cache() */
        unsigned int                        irq_context :  2,
                                        depth       :  6,
                                        base            : 24;
        /* 4 byte hole */
        struct hlist_node                entry;
        u64                                chain_key;
};

/*
 * Lock class indices are MAX_LOCKDEP_KEYS_BITS wide (this is also the width
 * of the class_idx bitfield in struct held_lock), giving MAX_LOCKDEP_KEYS
 * distinct values.
 */
#define MAX_LOCKDEP_KEYS_BITS                13
#define MAX_LOCKDEP_KEYS                (1UL << MAX_LOCKDEP_KEYS_BITS)
/*
 * Starting value for a chain key.  Parenthesised so the unary minus cannot
 * bind to a neighbouring token at the expansion site.
 */
#define INITIAL_CHAIN_KEY                (-1)

/* Per-acquisition record for one lock instance. */
struct held_lock {
        /*
         * One-way hash of the dependency chain up to this point. We
         * hash the hashes step by step as the dependency chain grows.
         *
         * We use it for dependency-caching and we skip detection
         * passes and dependency-updates if there is a cache-hit, so
         * it is absolutely critical for 100% coverage of the validator
         * to have a unique key value for every unique dependency path
         * that can occur in the system, to make a unique hash value
         * as likely as possible - hence the 64-bit width.
         *
         * The task struct holds the current hash value (initialized
         * with zero), here we store the previous hash value:
         *
         * NOTE(review): INITIAL_CHAIN_KEY is defined as -1 in this header;
         * confirm whether "initialized with zero" is still accurate.
         */
        u64                                prev_chain_key;
        unsigned long                        acquire_ip;
        struct lockdep_map                *instance;
        struct lockdep_map                *nest_lock;
#ifdef CONFIG_LOCK_STAT
        u64                                 waittime_stamp;
        u64                                holdtime_stamp;
#endif
        /*
         * class_idx is zero-indexed; it points to the element in
         * lock_classes this held lock instance belongs to. class_idx is in
         * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
         */
        unsigned int                        class_idx:MAX_LOCKDEP_KEYS_BITS;
        /*
         * The lock-stack is unified in that the lock chains of interrupt
         * contexts nest on top of process context chains, but we 'separate'
         * the hashes by starting with 0 if we cross into an interrupt
         * context, and we also do not add cross-context lock
         * dependencies - the lock usage graph walking covers that area
         * anyway, and we'd just unnecessarily increase the number of
         * dependencies otherwise. [Note: hardirq and softirq contexts
         * are separated from each other too.]
         *
         * The following field is used to detect when we cross into an
         * interrupt context:
         */
        unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
        unsigned int trylock:1;                                                /* 16 bits */

        unsigned int read:2;        /* see lock_acquire() comment */
        unsigned int check:1;       /* see lock_acquire() comment */
        unsigned int hardirqs_off:1;
        unsigned int references:12;                                        /* 32 bits */
        unsigned int pin_count;
};

/*
 * Initialization, self-test and debugging-output methods:
 */
extern void lockdep_init(void);
extern void lockdep_reset(void);
extern void lockdep_reset_lock(struct lockdep_map *lock);
extern void lockdep_free_key_range(void *start, unsigned long size);
extern asmlinkage void lockdep_sys_exit(void);
extern void lockdep_set_selftest_task(struct task_struct *task);

extern void lockdep_init_task(struct task_struct *task);

/*
 * Split the recursion counter in two to readily detect 'off' vs recursion.
 */
/*
 * The low LOCKDEP_RECURSION_BITS (16) bits, selected by
 * LOCKDEP_RECURSION_MASK (0xffff), form one half of the counter.
 * LOCKDEP_OFF (1U << 16) is the unit in which lockdep_off()/lockdep_on()
 * adjust the counter, so those adjustments never touch the low half.
 */
#define LOCKDEP_RECURSION_BITS        16
#define LOCKDEP_OFF                (1U << LOCKDEP_RECURSION_BITS)
#define LOCKDEP_RECURSION_MASK        (LOCKDEP_OFF - 1)

/*
 * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due
 * to header dependencies.
 */

/*
 * lockdep_off(): add LOCKDEP_OFF to current->lockdep_recursion.
 * Every use must be balanced by a later lockdep_on().
 */
#define lockdep_off()                                        \
do {                                                        \
        current->lockdep_recursion += LOCKDEP_OFF;        \
} while (0)

/*
 * lockdep_on(): subtract LOCKDEP_OFF from current->lockdep_recursion,
 * undoing one earlier lockdep_off().
 */
#define lockdep_on()                                        \
do {                                                        \
        current->lockdep_recursion -= LOCKDEP_OFF;        \
} while (0)

extern void lockdep_register_key(struct lock_class_key *key);
extern void lockdep_unregister_key(struct lock_class_key *key);

/*
 * These methods are used by specific locking variants (spinlocks,
 * rwlocks, mutexes and rwsems) to pass init/acquire/release events
 * to lockdep:
 */

extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
        struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);

/*
 * Initialize @lock with explicit inner and outer wait types.
 * Forwards every argument to lockdep_init_map_type() and fixes the lock
 * type to LD_LOCK_NORMAL.
 */
static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
        lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
}

/*
 * Initialize @lock with an explicit inner wait type only.
 * The outer wait type passed down is LD_WAIT_INV.
 */
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass, u8 inner)
{
        lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV);
}

/*
 * Initialize @lock without specifying wait types.
 * Both wait types end up as LD_WAIT_INV (the inner one is passed here, the
 * outer one is supplied by lockdep_init_map_wait()).
 */
static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
                             struct lock_class_key *key, int subclass)
{
        lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV);
}

/*
 * Reinitialize a lock key - for cases where there is special locking or
 * special initialization of locks so that the validator gets the scope
 * of dependencies wrong: they are either too broad (they need a class-split)
 * or they are too narrow (they suffer from a false class-split):
 */
/*
 * All of the macros below re-run lockdep_init_map_type() on (lock)->dep_map
 * while carrying over the map's current wait_type_inner, wait_type_outer and
 * lock_type. Note that @lock is expanded several times.
 */

/* New key, subclass 0; the class name is the stringified @key expression. */
#define lockdep_set_class(lock, key)                                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* New key with a caller-supplied @name, subclass 0. */
#define lockdep_set_class_and_name(lock, key, name)                \
        lockdep_init_map_type(&(lock)->dep_map, name, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* New key and subclass @sub; the name is the stringified @key expression. */
#define lockdep_set_class_and_subclass(lock, key, sub)                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* Keep the map's existing name and key, change only the subclass to @sub. */
#define lockdep_set_subclass(lock, sub)                                        \
        lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\
                              (lock)->dep_map.wait_type_inner,                \
                              (lock)->dep_map.wait_type_outer,                \
                              (lock)->dep_map.lock_type)

/*
 * Switch @lock to the __lockdep_no_validate__ key, using the stringified
 * @lock expression as the name.
 */
#define lockdep_set_novalidate_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

/*
 * Compare locking classes
 */
#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)

static inline int lockdep_match_key(struct lockdep_map *lock,
                                    struct lock_class_key *key)
{
        return lock->key == key;
}

/*
 * Acquire a lock.
 *
 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed
 *
 * Values for check:
 *
 *   0: simple checks (freeing, held-at-exit-time, etc.)
 *   1: full validation
 */
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                         int trylock, int read, int check,
                         struct lockdep_map *nest_lock, unsigned long ip);

extern void lock_release(struct lockdep_map *lock, unsigned long ip);

/*
 * Same "read" as for lock_acquire(), except -1 means any.
 */
extern int lock_is_held_type(const struct lockdep_map *lock, int read);

/*
 * Query whether @lock is held, regardless of how it was acquired:
 * passes -1 ("any") as the read argument of lock_is_held_type().
 */
static inline int lock_is_held(const struct lockdep_map *lock)
{
        return lock_is_held_type(lock, -1);
}

/* Convenience forms taking the lock object that embeds a dep_map member. */
#define lockdep_is_held(lock)                lock_is_held(&(lock)->dep_map)
#define lockdep_is_held_type(lock, r)        lock_is_held_type(&(lock)->dep_map, (r))

extern void lock_set_class(struct lockdep_map *lock, const char *name,
                           struct lock_class_key *key, unsigned int subclass,
                           unsigned long ip);

/*
 * Change only the subclass of @lock: calls lock_set_class() with the map's
 * current name and key and the new @subclass. @ip is passed through.
 */
static inline void lock_set_subclass(struct lockdep_map *lock,
                unsigned int subclass, unsigned long ip)
{
        lock_set_class(lock, lock->name, lock->key, subclass, ip);
}

extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);

/* A pin_cookie compound literal whose val member is 0. */
#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }

extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);

/* (tsk)->lockdep_depth while debug_locks is set, otherwise 0. */
#define lockdep_depth(tsk)        (debug_locks ? (tsk)->lockdep_depth : 0)

/*
 * The assertions below only fire while debug_locks is non-zero.
 */

/* Warn if @l is not held (any acquisition type). */
#define lockdep_assert_held(l)        do {                                \
                WARN_ON(debug_locks && !lockdep_is_held(l));        \
        } while (0)

/* Warn if @l is not held with read == 0 (exclusive, see lock_acquire()). */
#define lockdep_assert_held_write(l)        do {                        \
                WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));        \
        } while (0)

/* Warn if @l is not held with read == 1 (see lock_acquire()). */
#define lockdep_assert_held_read(l)        do {                                \
                WARN_ON(debug_locks && !lockdep_is_held_type(l, 1));        \
        } while (0)

/* Same check as lockdep_assert_held(), but using WARN_ON_ONCE(). */
#define lockdep_assert_held_once(l)        do {                                \
                WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));        \
        } while (0)

/* Warn (once) if current->lockdep_depth is non-zero. */
#define lockdep_assert_none_held_once()        do {                                \
                WARN_ON_ONCE(debug_locks && current->lockdep_depth);        \
        } while (0)

/* Raw value of (tsk)->lockdep_recursion. */
#define lockdep_recursing(tsk)        ((tsk)->lockdep_recursion)

/* Pin helpers taking the lock object that embeds a dep_map member. */
#define lockdep_pin_lock(l)        lock_pin_lock(&(l)->dep_map)
#define lockdep_repin_lock(l,c)        lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c)        lock_unpin_lock(&(l)->dep_map, (c))

#else /* !CONFIG_LOCKDEP */

/*
 * Empty stand-ins used when CONFIG_LOCKDEP is not set, so callers need no
 * #ifdef of their own. Each one does nothing.
 */

/* No per-task lockdep state to set up. */
static inline void lockdep_init_task(struct task_struct *task)
{
}

/* Nothing to switch off. */
static inline void lockdep_off(void)
{
}

/* Nothing to switch back on. */
static inline void lockdep_on(void)
{
}

/* No selftest task to record. */
static inline void lockdep_set_selftest_task(struct task_struct *task)
{
}

/*
 * No-op replacements for the lockdep event hooks. These expand to an empty
 * statement and do not evaluate any of their arguments.
 */
# define lock_acquire(l, s, t, r, c, n, i)        do { } while (0)
# define lock_release(l, i)                        do { } while (0)
# define lock_downgrade(l, i)                        do { } while (0)
# define lock_set_class(l, n, k, s, i)                do { } while (0)
# define lock_set_subclass(l, s, i)                do { } while (0)
# define lockdep_init()                                do { } while (0)
/*
 * The init/set-class stubs below still reference their name and/or key
 * arguments through a (void) cast; all other arguments are not evaluated.
 */
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_set_class(lock, key)                do { (void)(key); } while (0)
# define lockdep_set_class_and_name(lock, key, name) \
                do { (void)(key); (void)(name); } while (0)
#define lockdep_set_class_and_subclass(lock, key, sub) \
                do { (void)(key); } while (0)
/* The remaining two stubs reference none of their arguments. */
#define lockdep_set_subclass(lock, sub)                do { } while (0)

#define lockdep_set_novalidate_class(lock) do { } while (0)

/*
 * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP
 * case since the result is not well defined and the caller should rather
 * #ifdef the call himself.
 */

/* Without lockdep, a reset only sets debug_locks back to 1. */
# define lockdep_reset()                do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size)        do { } while (0)
# define lockdep_sys_exit()                         do { } while (0)

/* Key registration does nothing without lockdep. */
static inline void lockdep_register_key(struct lock_class_key *key)
{
}

/* Key unregistration does nothing without lockdep. */
static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}

/* No held-lock tracking: the depth is always reported as 0. */
#define lockdep_depth(tsk)        (0)

/* Always evaluates to 1; neither argument is evaluated. */
#define lockdep_is_held_type(l, r)                (1)

/*
 * The assertions reduce to a (void) reference of their argument, so the
 * expression is still compiled but nothing is checked.
 */
#define lockdep_assert_held(l)                        do { (void)(l); } while (0)
#define lockdep_assert_held_write(l)        do { (void)(l); } while (0)
#define lockdep_assert_held_read(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_once(l)                do { (void)(l); } while (0)
#define lockdep_assert_none_held_once()        do { } while (0)

#define lockdep_recursing(tsk)                        (0)

/* Empty-initialized pin_cookie compound literal. */
#define NIL_COOKIE (struct pin_cookie){ }

/* Yields an empty-initialized pin_cookie; @l is not evaluated. */
#define lockdep_pin_lock(l)                        ({ struct pin_cookie cookie = { }; cookie; })
#define lockdep_repin_lock(l, c)                do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c)                do { (void)(l); (void)(c); } while (0)

#endif /* !LOCKDEP */

/*
 * Context indexes. The first two enumerators are consecutive from zero and
 * XHLOCK_CTX_NR, being last, equals the number of contexts (2).
 */
enum xhlock_context_t {
        XHLOCK_HARD   = 0,
        XHLOCK_SOFT   = 1,
        XHLOCK_CTX_NR = 2,        /* count of the entries above */
};

/* Expands to an empty statement; none of the arguments are evaluated. */
#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
/*
 * To initialize a lockdep_map statically use this macro.
 * Note that _name must not be NULL.
 *
 * Only the .name and .key members are set explicitly; _key is cast to
 * void * before being stored.
 */
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }

/* Empty functions: both ignore their argument and do nothing. */
static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}

#ifdef CONFIG_LOCK_STAT

extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);

/*
 * LOCK_CONTENDED - acquire @_lock and feed lock statistics.
 *
 * First attempts try(_lock). If that evaluates to zero, contention is
 * reported through lock_contended() and lock(_lock) is called. In either
 * case lock_acquired() is called afterwards.
 * Note that @_lock is expanded more than once.
 */
#define LOCK_CONTENDED(_lock, try, lock)                        \
do {                                                                \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                lock(_lock);                                        \
        }                                                        \
        lock_acquired(&(_lock)->dep_map, _RET_IP_);                        \
} while (0)

/*
 * LOCK_CONTENDED_RETURN - like LOCK_CONTENDED for a lock() that can fail.
 *
 * The value of lock(_lock) is kept in ____err (0 if try(_lock) succeeded
 * and lock() was never called). lock_acquired() is only called when
 * ____err is 0. The whole statement expression evaluates to ____err.
 */
#define LOCK_CONTENDED_RETURN(_lock, try, lock)                        \
({                                                                \
        int ____err = 0;                                        \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                ____err = lock(_lock);                                \
        }                                                        \
        if (!____err)                                                \
                lock_acquired(&(_lock)->dep_map, _RET_IP_);        \
        ____err;                                                \
})

#else /* CONFIG_LOCK_STAT */

/* Without CONFIG_LOCK_STAT the statistics hooks are empty statements. */
#define lock_contended(lockdep_map, ip) do {} while (0)
#define lock_acquired(lockdep_map, ip) do {} while (0)

/*
 * Both variants collapse to a plain lock(_lock) call; the @try argument is
 * not used. For LOCK_CONTENDED_RETURN the value of the expansion is
 * whatever lock(_lock) evaluates to.
 */
#define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)

#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
        lock(_lock)

#endif /* CONFIG_LOCK_STAT */

#ifdef CONFIG_LOCKDEP

/*
 * On lockdep we don't want the hand-coded irq-enable of
 * _raw_*_lock_flags() code, because lockdep assumes
 * that interrupts are not re-enabled during lock-acquire:
 *
 * Hence @lockfl and @flags are ignored here and the plain
 * LOCK_CONTENDED() path is used.
 */
#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
        LOCK_CONTENDED((_lock), (try), (lock))

#else /* CONFIG_LOCKDEP */

/* Without lockdep: call lockfl(_lock, flags); @try and @lock are unused. */
#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
        lockfl((_lock), (flags))

#endif /* CONFIG_LOCKDEP */

/*
 * print_irqtrace_events() is provided externally when CONFIG_PROVE_LOCKING
 * is set; otherwise it is an empty inline so callers need no #ifdef.
 */
#ifdef CONFIG_PROVE_LOCKING
extern void print_irqtrace_events(struct task_struct *curr);
#else
static inline void print_irqtrace_events(struct task_struct *curr)
{
}
#endif

/*
 * Variable used to make lockdep treat read_lock() as recursive in selftests.
 * Without CONFIG_DEBUG_LOCKING_API_SELFTESTS it is the constant 0.
 */
#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS
extern unsigned int force_read_lock_recursive;
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */
#define force_read_lock_recursive 0
#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */

/*
 * read_lock_is_recursive() selects which acquire type
 * rwlock_acquire_read() reports to lockdep.
 */
#ifdef CONFIG_LOCKDEP
extern bool read_lock_is_recursive(void);
#else /* CONFIG_LOCKDEP */
/* If !LOCKDEP, the value is meaningless */
#define read_lock_is_recursive() 0
#endif

/*
 * For trivial one-depth nesting of a lock-class, the following
 * global define can be used. (Subsystems with multiple levels
 * of nesting should define their own lock-nesting subclasses.)
 */
#define SINGLE_DEPTH_NESTING                        1

/*
 * Map the dependency ops to NOP or to real lockdep ops, depending
 * on the per lock-class debug mode:
 */

/*
 * Base forms: all three call lock_acquire() with check == 1 and differ only
 * in the "read" argument (0 = exclusive, 1 = shared, 2 = shared recursive;
 * see the lock_acquire() comment). Arguments: l = lockdep_map, s = subclass,
 * t = trylock, n = nest_lock, i = ip.
 */
#define lock_acquire_exclusive(l, s, t, n, i)                lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i)                lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i)        lock_acquire(l, s, t, 2, 1, n, i)

/* Spinlocks: always exclusive; the _nest form passes a nest_lock. */
#define spin_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, i)                        lock_release(l, i)

/*
 * rwlocks: writers are exclusive. Readers are reported as shared-recursive
 * or plain shared depending on read_lock_is_recursive() at the call site.
 */
#define rwlock_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwlock_acquire_read(l, s, t, i)                                        \
do {                                                                        \
        if (read_lock_is_recursive())                                        \
                lock_acquire_shared_recursive(l, s, t, NULL, i);        \
        else                                                                \
                lock_acquire_shared(l, s, t, NULL, i);                        \
} while (0)

#define rwlock_release(l, i)                        lock_release(l, i)

/* seqcounts: write side exclusive, read side shared-recursive. */
#define seqcount_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define seqcount_acquire_read(l, s, t, i)        lock_acquire_shared_recursive(l, s, t, NULL, i)
#define seqcount_release(l, i)                        lock_release(l, i)

/* Mutexes: always exclusive; the _nest form passes a nest_lock. */
#define mutex_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define mutex_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define mutex_release(l, i)                        lock_release(l, i)

/* rwsems: writers exclusive, readers plain shared (read == 1). */
#define rwsem_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwsem_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define rwsem_acquire_read(l, s, t, i)                lock_acquire_shared(l, s, t, NULL, i)
#define rwsem_release(l, i)                        lock_release(l, i)

/*
 * Bare lockdep_map annotations: subclass 0, _THIS_IP_ as the ip. Only the
 * _tryread form passes trylock == 1.
 */
#define lock_map_acquire(l)                        lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_read(l)                lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l)                lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l)                        lock_release(l, _THIS_IP_)

#ifdef CONFIG_PROVE_LOCKING
/*
 * might_lock*() - annotate that a lock may be taken here.
 *
 * Each form type-checks that (lock)->dep_map is a struct lockdep_map, then
 * reports an acquire immediately followed by a release, so lockdep sees
 * the dependency without the lock actually being taken.
 */

/* Exclusive form: read == 0, subclass 0. */
# define might_lock(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/* Shared form: read == 1, subclass 0. */
# define might_lock_read(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/*
 * Form with an explicit @subclass.
 * NOTE(review): this passes read == 1 like might_lock_read(), whereas
 * might_lock() passes read == 0 - confirm that a shared acquire is intended.
 */
# define might_lock_nested(lock, subclass)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL,                \
                     _THIS_IP_);                                        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)

/* Per-CPU state consulted by the assertions below. */
DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);
DECLARE_PER_CPU(unsigned int, lockdep_recursion);

/* True while debug_locks is set and this CPU's lockdep_recursion is 0. */
#define __lockdep_enabled        (debug_locks && !this_cpu_read(lockdep_recursion))

/* Warn once if this CPU's hardirqs_enabled is 0. */
#define lockdep_assert_irqs_enabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirqs_enabled is non-zero. */
#define lockdep_assert_irqs_disabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirq_context is 0. */
#define lockdep_assert_in_irq()                                                \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. Warns once when preempt_count()
 * is non-zero or hardirqs_enabled is 0.
 */
#define lockdep_assert_preemption_enabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() != 0                ||                \
                      !this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. Warns once when preempt_count()
 * is 0 and hardirqs_enabled is non-zero.
 */
#define lockdep_assert_preemption_disabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() == 0                &&                \
                      this_cpu_read(hardirqs_enabled)));                \
} while (0)

#else
/* !CONFIG_PROVE_LOCKING: every annotation and assertion is an empty statement. */
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)

# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)

# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

/*
 * Warn once, with the message below, when all of the following hold:
 * debug_locks is set, current->lockdep_recursion is 0,
 * lockdep_hardirq_context() is true, and neither current->hardirq_threaded
 * nor current->irq_config is set.
 */
# define lockdep_assert_RT_in_threaded_ctx() do {                        \
                WARN_ONCE(debug_locks && !current->lockdep_recursion &&        \
                          lockdep_hardirq_context() &&                        \
                          !(current->hardirq_threaded || current->irq_config),        \
                          "Not in threaded context on PREEMPT_RT as expected\n");        \
} while (0)

#else

/* Without CONFIG_PROVE_RAW_LOCK_NESTING the assertion is an empty statement. */
# define lockdep_assert_RT_in_threaded_ctx() do { } while (0)

#endif

/*
 * lockdep_rcu_suspicious() takes a source file name, a line number and a
 * message string. It is implemented elsewhere when CONFIG_LOCKDEP is set
 * and is an empty inline otherwise.
 */
#ifdef CONFIG_LOCKDEP
void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
#else
static inline void
lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
}
#endif

#endif /* __LINUX_LOCKDEP_H */








































































































































































































































































































































































































































































    1 
    1 




    1 

















































































































    1 
































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  This file contains the interface functions for the various time related
 *  system calls: time, stime, gettimeofday, settimeofday, adjtime
 *
 * Modification history:
 *
 * 1993-09-02    Philip Gladstone
 *      Created file with time related functions from sched/core.c and adjtimex()
 * 1993-10-08    Torsten Duwe
 *      adjtime interface update and CMOS clock write code
 * 1995-08-13    Torsten Duwe
 *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
 * 1999-01-16    Ulrich Windl
 *        Introduced error checking for many cases in adjtimex().
 *        Updated NTP code according to technical memorandum Jan '96
 *        "A Kernel Model for Precision Timekeeping" by Dave Mills
 *        Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
 *        (Even though the technical memorandum forbids it)
 * 2004-07-14         Christoph Lameter
 *        Added getnstimeofday to allow the posix timer functions to return
 *        with nanosecond accuracy
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>

#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>

#include <generated/timeconst.h>
#include "timekeeping.h"

/*
 * The timezone where the local system is located.  Used as a default by some
 * programs who obtain this value by using gettimeofday.
 */
struct timezone sys_tz;

EXPORT_SYMBOL(sys_tz);

#ifdef __ARCH_WANT_SYS_TIME

/*
 * sys_time() can be implemented in user-level using
 * sys_gettimeofday().  Is this for backwards compatibility?  If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
        /* Current real-time seconds, converted to the legacy time type. */
        __kernel_old_time_t now = (__kernel_old_time_t)ktime_get_real_seconds();

        /* @tloc is optional: only store through it when it is non-NULL. */
        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

/*
 * sys_stime() can be implemented in user-level using
 * sys_settimeofday().  Is this for backwards compatibility?  If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */

SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
        struct timespec64 new_time;
        int ret;

        /* Only whole seconds are supplied by the caller. */
        if (get_user(new_time.tv_sec, tptr))
                return -EFAULT;
        new_time.tv_nsec = 0;

        /* Ask the security layer before touching the clock. */
        ret = security_settime64(&new_time, NULL);
        if (ret)
                return ret;

        /*
         * NOTE(review): the result of do_settimeofday64() is discarded, so
         * this syscall reports success even if setting the clock failed.
         */
        do_settimeofday64(&new_time);
        return 0;
}

#endif /* __ARCH_WANT_SYS_TIME */

#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32

/* old_time32_t is a 32 bit "long" and needs to get converted. */
SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
        /* Current real-time seconds, converted to the 32-bit time type. */
        old_time32_t now = (old_time32_t)ktime_get_real_seconds();

        /* @tloc is optional: only store through it when it is non-NULL. */
        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
        struct timespec64 new_time;
        int ret;

        /* Fetch the 32-bit seconds value; there is no sub-second part. */
        if (get_user(new_time.tv_sec, tptr))
                return -EFAULT;
        new_time.tv_nsec = 0;

        /* Ask the security layer before touching the clock. */
        ret = security_settime64(&new_time, NULL);
        if (ret)
                return ret;

        /*
         * NOTE(review): the result of do_settimeofday64() is discarded, so
         * this syscall reports success even if setting the clock failed.
         */
        do_settimeofday64(&new_time);
        return 0;
}

#endif /* __ARCH_WANT_SYS_TIME32 */
#endif

SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        /* Both output pointers are optional; each is filled only if non-NULL. */
        if (likely(tv != NULL)) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                /* Nanoseconds are reported to the caller as microseconds. */
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (unlikely(tz != NULL)) {
                /* Hand back a copy of the system-wide timezone record. */
                if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                        return -EFAULT;
        }

        return 0;
}

/*
 * In case for some reason the CMOS clock has not already been running
 * in UTC, but in some local time: The first time we set the timezone,
 * we will warp the clock so that it is ticking UTC time instead of
 * local time. Presumably, if someone is setting the timezone then we
 * are running in an environment where the programs understand about
 * timezones. This should be done at boot time in the /etc/rc script,
 * as soon as possible, so that the clock can be set right. Otherwise,
 * various programs will get confused when the clock gets warped.
 */

int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
        /* Cleared on the first call that supplies a timezone. */
        static int firsttime = 1;
        int ret;

        /* Reject an unusable time value before asking the security hook. */
        if (tv && !timespec64_valid_settod(tv))
                return -EINVAL;

        ret = security_settime64(tv, tz);
        if (ret)
                return ret;

        if (tz) {
                const int west = tz->tz_minuteswest;

                /* Verify we're within the +-15 hrs range */
                if (west < -15*60 || west > 15*60)
                        return -EINVAL;

                sys_tz = *tz;
                update_vsyscall_tz();

                if (firsttime) {
                        firsttime = 0;
                        /* Warp only when no explicit time accompanies the tz. */
                        if (!tv)
                                timekeeping_warp_clock();
                }
        }

        return tv ? do_settimeofday64(tv) : 0;
}

SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        /* Both arguments are optional; each is copied in only when given. */
        if (tv) {
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * tv_nsec temporarily holds microseconds here.  The valid
                 * range is 0 .. USEC_PER_SEC - 1, so a value equal to
                 * USEC_PER_SEC must be rejected as well.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                /* Convert the microsecond value to nanoseconds. */
                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        /* Pass NULL for whichever argument the caller did not supply. */
        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        /* Both output pointers are optional; each is filled only if given. */
        if (tv) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                /* The old timeval carries microseconds, not nanoseconds. */
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (tz && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        /* Both arguments are optional; each is copied in only when given. */
        if (tv) {
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * tv_nsec temporarily holds microseconds here.  The valid
                 * range is 0 .. USEC_PER_SEC - 1, so a value equal to
                 * USEC_PER_SEC must be rejected as well.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                /* Convert the microsecond value to nanoseconds. */
                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        /* Pass NULL for whichever argument the caller did not supply. */
        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#endif

#ifdef CONFIG_64BIT
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
        struct __kernel_timex txc;        /* kernel-side working copy */
        int ret;

        /* Pull the caller's structure in; do_adjtimex() works on the copy. */
        if (copy_from_user(&txc, txc_p, sizeof(txc)))
                return -EFAULT;

        ret = do_adjtimex(&txc);

        /*
         * The structure is always written back; a failed write-back takes
         * precedence over whatever do_adjtimex() returned.
         */
        if (copy_to_user(txc_p, &txc, sizeof(txc)))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_COMPAT_32BIT_TIME
int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
{
        struct old_timex32 user32;

        /* Zero the destination first, so it is cleared even on -EFAULT. */
        memset(txc, 0, sizeof(*txc));

        if (copy_from_user(&user32, utp, sizeof(user32)))
                return -EFAULT;

        /* Transfer each field individually into the native structure. */
        txc->modes        = user32.modes;
        txc->offset       = user32.offset;
        txc->freq         = user32.freq;
        txc->maxerror     = user32.maxerror;
        txc->esterror     = user32.esterror;
        txc->status       = user32.status;
        txc->constant     = user32.constant;
        txc->precision    = user32.precision;
        txc->tolerance    = user32.tolerance;
        txc->time.tv_sec  = user32.time.tv_sec;
        txc->time.tv_usec = user32.time.tv_usec;
        txc->tick         = user32.tick;
        txc->ppsfreq      = user32.ppsfreq;
        txc->jitter       = user32.jitter;
        txc->shift        = user32.shift;
        txc->stabil       = user32.stabil;
        txc->jitcnt       = user32.jitcnt;
        txc->calcnt       = user32.calcnt;
        txc->errcnt       = user32.errcnt;
        txc->stbcnt       = user32.stbcnt;

        return 0;
}

int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
{
        struct old_timex32 out32;

        /* Clear the whole staging structure before filling it in. */
        memset(&out32, 0, sizeof(out32));

        /* Transfer each field individually into the 32-bit layout. */
        out32.modes        = txc->modes;
        out32.offset       = txc->offset;
        out32.freq         = txc->freq;
        out32.maxerror     = txc->maxerror;
        out32.esterror     = txc->esterror;
        out32.status       = txc->status;
        out32.constant     = txc->constant;
        out32.precision    = txc->precision;
        out32.tolerance    = txc->tolerance;
        out32.time.tv_sec  = txc->time.tv_sec;
        out32.time.tv_usec = txc->time.tv_usec;
        out32.tick         = txc->tick;
        out32.ppsfreq      = txc->ppsfreq;
        out32.jitter       = txc->jitter;
        out32.shift        = txc->shift;
        out32.stabil       = txc->stabil;
        out32.jitcnt       = txc->jitcnt;
        out32.calcnt       = txc->calcnt;
        out32.errcnt       = txc->errcnt;
        out32.stbcnt       = txc->stbcnt;
        out32.tai          = txc->tai;

        return copy_to_user(utp, &out32, sizeof(out32)) ? -EFAULT : 0;
}

SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
        struct __kernel_timex txc;
        int ret;
        int err;

        /* Convert the 32-bit user structure into the native one. */
        err = get_old_timex32(&txc, utp);
        if (err)
                return err;

        ret = do_adjtimex(&txc);

        /* Write the result back; a write-back failure overrides ret. */
        err = put_old_timex32(utp, &txc);

        return err ? err : ret;
}
#endif

/**
 * jiffies_to_msecs - Convert jiffies to milliseconds
 * @j: jiffies value
 *
 * Avoid unnecessary multiplications/divisions in the
 * two most common HZ cases.
 *
 * Return: milliseconds value
 */
unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides MSEC_PER_SEC evenly: scale by whole ms per jiffy. */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* HZ is a whole multiple of MSEC_PER_SEC: divide, rounding up. */
        return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
        /*
         * No exact integer ratio: fall back to the HZ_TO_MSEC_* constants
         * (defined outside this function).
         */
# if BITS_PER_LONG == 32
        /* Multiply-and-shift; adding (1 << shift) - 1 first rounds up. */
        return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
               HZ_TO_MSEC_SHR32;
# else
        return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);

/**
 * jiffies_to_usecs - Convert jiffies to microseconds
 * @j: jiffies value
 *
 * Return: microseconds value
 */
unsigned int jiffies_to_usecs(const unsigned long j)
{
        /*
         * HZ usually doesn't go much further than MSEC_PER_SEC.
         * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
         */
        BUILD_BUG_ON(HZ > USEC_PER_SEC);

#if !(USEC_PER_SEC % HZ)
        /* HZ divides USEC_PER_SEC evenly: scale by whole usecs per jiffy. */
        return (USEC_PER_SEC / HZ) * j;
#else
        /*
         * No exact integer ratio: use the HZ_TO_USEC_* constants (defined
         * outside this function).  Both forms below truncate; no rounding
         * term is added.
         */
# if BITS_PER_LONG == 32
        return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
# else
        return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);

/**
 * mktime64 - Converts date to seconds.
 * @year0: year to convert
 * @mon0: month to convert
 * @day: day to convert
 * @hour: hour to convert
 * @min: minute to convert
 * @sec: second to convert
 *
 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
 *
 * [For the Julian calendar (which was used in Russia before 1917,
 * Britain & colonies before 1752, anywhere else before 1582,
 * and is still in use by some communities) leave out the
 * -year/100+year/400 terms, and add 10.]
 *
 * This algorithm was first published by Gauss (I think).
 *
 * A leap second can be indicated by calling this function with sec as
 * 60 (allowable under ISO 8601).  The leap second is treated the same
 * as the following second since they don't exist in UNIX time.
 *
 * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
 * tomorrow - (allowable under ISO 8601) is supported.
 *
 * Return: seconds since the epoch time for the given input date
 */
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
                const unsigned int day, const unsigned int hour,
                const unsigned int min, const unsigned int sec)
{
        unsigned int mon = mon0, year = year0;

        /* 1..12 -> 11,12,1..10 */
        if (0 >= (int) (mon -= 2)) {
                mon += 12;        /* Puts Feb last since it has leap day */
                year -= 1;
        }

        return ((((time64_t)
                  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
            )*24 + hour /* now have hours - midnight tomorrow handled here */
          )*60 + min /* now have minutes */
        )*60 + sec; /* finally seconds */
}
EXPORT_SYMBOL(mktime64);

struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
{
        struct timespec64 ts = ns_to_timespec64(nsec);
        struct __kernel_old_timeval tv;

        tv.tv_sec = ts.tv_sec;
        tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000;

        return tv;
}
EXPORT_SYMBOL(ns_to_kernel_old_timeval);

/**
 * set_normalized_timespec64 - set timespec64 sec and nsec parts and normalize
 *
 * @ts:                pointer to timespec64 variable to be set
 * @sec:        seconds to set
 * @nsec:        nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec64 variable and
 * normalize to the timespec64 storage format
 *
 * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative !
 */
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
        /* Carry whole seconds out of an over-large nsec, one at a time. */
        while (nsec >= NSEC_PER_SEC) {
                /*
                 * The following asm() prevents the compiler from
                 * optimising this loop into a modulo operation. See
                 * also __iter_div_u64_rem() in include/linux/time.h
                 */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        /* Borrow from sec until nsec is no longer negative. */
        while (nsec < 0) {
                /* Same empty asm() as in the loop above. */
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}
EXPORT_SYMBOL(set_normalized_timespec64);

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:       the nanoseconds value to be converted
 *
 * Return: the timespec64 representation of the nsec parameter.
 */
struct timespec64 ns_to_timespec64(const s64 nsec)
{
        struct timespec64 ts = { 0, 0 };
        s32 rem;

        /* Common case: positive input splits with one divide. */
        if (likely(nsec > 0)) {
                ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
                ts.tv_nsec = rem;
                return ts;
        }

        /* Zero maps to { 0, 0 }. */
        if (nsec == 0)
                return ts;

        /*
         * Negative input: tv_sec names the earlier second and tv_nsec
         * counts forward from it, so tv_nsec never goes negative.
         */
        ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1;
        ts.tv_nsec = NSEC_PER_SEC - rem - 1;

        return ts;
}
EXPORT_SYMBOL(ns_to_timespec64);

/**
 * __msecs_to_jiffies: - convert milliseconds to jiffies
 * @m:        time in milliseconds
 *
 * conversion is done as follows:
 *
 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - all other values are converted to jiffies by either multiplying
 *   the input value by a factor or dividing it with a factor and
 *   handling any 32-bit overflows.
 *   for the details see _msecs_to_jiffies()
 *
 * msecs_to_jiffies() checks for the passed in value being a constant
 * via __builtin_constant_p() allowing gcc to eliminate most of the
 * code, __msecs_to_jiffies() is called if the value passed does not
 * allow constant folding and the actual conversion must be done at
 * runtime.
 * the _msecs_to_jiffies helpers are the HZ dependent conversion
 * routines found in include/linux/jiffies.h
 *
 * Return: jiffies value
 */
unsigned long __msecs_to_jiffies(const unsigned int m)
{
        /*
         * A value that is negative when read as a signed int means
         * "infinite timeout"; everything else goes to the HZ-dependent
         * helper.
         */
        return ((int)m < 0) ? MAX_JIFFY_OFFSET : _msecs_to_jiffies(m);
}
EXPORT_SYMBOL(__msecs_to_jiffies);

/**
 * __usecs_to_jiffies: - convert microseconds to jiffies
 * @u:        time in microseconds
 *
 * Values larger than what MAX_JIFFY_OFFSET jiffies correspond to (as
 * computed by jiffies_to_usecs()) yield MAX_JIFFY_OFFSET.
 *
 * Return: jiffies value
 */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
        /* In-range values go to the HZ-dependent helper. */
        if (u <= jiffies_to_usecs(MAX_JIFFY_OFFSET))
                return _usecs_to_jiffies(u);

        return MAX_JIFFY_OFFSET;
}
EXPORT_SYMBOL(__usecs_to_jiffies);

/**
 * timespec64_to_jiffies - convert a timespec64 value to jiffies
 * @value: pointer to &struct timespec64
 *
 * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
 * that a remainder subtract here would not do the right thing as the
 * resolution values don't fall on second boundaries.  I.e. the line:
 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
 * Note that due to the small error in the multiplier here, this
 * rounding is incorrect for sufficiently large values of tv_nsec, but
 * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
 * OK.
 *
 * Rather, we just shift the bits off the right.
 *
 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
 * value to a scaled second value.
 *
 * Return: jiffies value
 */
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
        /* tv_sec is reinterpreted as unsigned for the range check below. */
        u64 sec = value->tv_sec;
        /* Bias by TICK_NSEC - 1 so the nsec part rounds up to a full tick. */
        long nsec = value->tv_nsec + TICK_NSEC - 1;

        /* Clamp at MAX_SEC_IN_JIFFIES; the nsec part is dropped when clamped. */
        if (sec >= MAX_SEC_IN_JIFFIES){
                sec = MAX_SEC_IN_JIFFIES;
                nsec = 0;
        }
        /*
         * Scale both parts, bring the nsec term to the seconds scale,
         * add them, then shift the scaling off.
         */
        return ((sec * SEC_CONVERSION) +
                (((u64)nsec * NSEC_CONVERSION) >>
                 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;

}
EXPORT_SYMBOL(timespec64_to_jiffies);

/**
 * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
 * @jiffies: jiffies value
 * @value: pointer to &struct timespec64
 */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
        /*
         * Convert jiffies to nanoseconds and separate with
         * one divide.
         */
        u32 rem;
        value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                    NSEC_PER_SEC, &rem);
        value->tv_nsec = rem;
}
EXPORT_SYMBOL(jiffies_to_timespec64);

/*
 * Convert jiffies/jiffies_64 to clock_t and back.
 */

/**
 * jiffies_to_clock_t - Convert jiffies to clock_t
 * @x: jiffies value
 *
 * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
 */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
        /* A tick is a whole number of USER_HZ periods: plain integer scaling. */
# if HZ < USER_HZ
        return x * (USER_HZ / HZ);
# else
        return x / (HZ / USER_HZ);
# endif
#else
        /* Otherwise go through nanoseconds, using a 64-bit intermediate. */
        return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

/**
 * clock_t_to_jiffies - Convert clock_t to jiffies
 * @x: clock_t value
 *
 * Return: clock_t value converted to jiffies
 */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
        /* HZ is a whole multiple of USER_HZ: saturate at ~0UL, else multiply. */
        if (x >= ~0UL / (HZ / USER_HZ))
                return ~0UL;
        return x * (HZ / USER_HZ);
#else
        /* Don't worry about loss of precision here .. */
        if (x >= ~0UL / HZ * USER_HZ)
                return ~0UL;

        /* .. but do try to contain it here */
        return div_u64((u64)x * HZ, USER_HZ);
#endif
}
EXPORT_SYMBOL(clock_t_to_jiffies);

/**
 * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
 * @x: jiffies_64 value
 *
 * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
notrace u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
        /* A tick is a whole number of USER_HZ periods: scale by the HZ ratio. */
# if HZ < USER_HZ
        x = div_u64(x * USER_HZ, HZ);
# elif HZ > USER_HZ
        x = div_u64(x, HZ / USER_HZ);
# else
        /* Nothing to do */
# endif
#else
        /*
         * There are better ways that don't overflow early,
         * but even this doesn't overflow in hundreds of years
         * in 64 bits, so..
         */
        x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
#endif
        return x;
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);

/**
 * nsec_to_clock_t - Convert nsec value to clock_t
 * @x: nsec value
 *
 * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
        /* USER_HZ divides NSEC_PER_SEC evenly: divide by ns per clock tick. */
        return div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
        /* USER_HZ is a multiple of 512: divide both sides by 512 first. */
        return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
         * overflow after 64.99 years.
         * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
         */
        return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
}

/**
 * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
 * @j: jiffies64 value
 *
 * Return: nanoseconds value
 */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
        /* HZ divides NSEC_PER_SEC evenly: multiply by whole ns per jiffy. */
        return (NSEC_PER_SEC / HZ) * j;
# else
        /* Otherwise scale by the HZ_TO_NSEC_NUM / HZ_TO_NSEC_DEN ratio. */
        return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_nsecs);

/**
 * jiffies64_to_msecs - Convert jiffies64 to milliseconds
 * @j: jiffies64 value
 *
 * Return: milliseconds value
 */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides MSEC_PER_SEC evenly: multiply by whole ms per jiffy. */
        return (MSEC_PER_SEC / HZ) * j;
#else
        /* Otherwise scale by the HZ_TO_MSEC_NUM / HZ_TO_MSEC_DEN ratio. */
        return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_msecs);

/**
 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
 *
 * @n:        nsecs in u64
 *
 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
 * for scheduler, not for use in device drivers to calculate timeout value.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies64 value
 */
u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
        /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
        return div_u64(n, NSEC_PER_SEC / HZ);
#elif (HZ % 512) == 0
        /*
         * HZ is a multiple of 512: divide both sides by 512 first.
         * overflow after 292 years if HZ = 1024
         */
        return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * Generic case - optimized for cases where HZ is a multiple of 3.
         * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
         */
        return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
EXPORT_SYMBOL(nsecs_to_jiffies64);

/**
 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
 *
 * @n:        nsecs in u64
 *
 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
 * for scheduler, not for use in device drivers to calculate timeout value.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies value
 */
unsigned long nsecs_to_jiffies(u64 n)
{
        /* Do the conversion in 64 bits, then narrow to unsigned long. */
        const u64 jiffies64 = nsecs_to_jiffies64(n);

        return (unsigned long)jiffies64;
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);

/**
 * timespec64_add_safe - Add two timespec64 values and do a safety check
 * for overflow.
 * @lhs: first (left) timespec64 to add
 * @rhs: second (right) timespec64 to add
 *
 * It's assumed that both values are valid (>= 0).
 * And, each timespec64 is in normalized form.
 *
 * Return: sum of @lhs + @rhs
 */
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                const struct timespec64 rhs)
{
        struct timespec64 res;

        set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
                        lhs.tv_nsec + rhs.tv_nsec);

        if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
                res.tv_sec = TIME64_MAX;
                res.tv_nsec = 0;
        }

        return res;
}

/**
 * get_timespec64 - get user's time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's time value as &struct __kernel_timespec
 *
 * Handles compat or 32-bit modes.
 *
 * Return: %0 on success or negative errno on error
 */
int get_timespec64(struct timespec64 *ts,
                   const struct __kernel_timespec __user *uts)
{
        struct __kernel_timespec kts;

        if (copy_from_user(&kts, uts, sizeof(kts)))
                return -EFAULT;

        /* Zero out the padding in compat mode */
        if (in_compat_syscall())
                kts.tv_nsec &= 0xFFFFFFFFUL;

        ts->tv_sec = kts.tv_sec;
        /* In 32-bit mode, this drops the padding */
        ts->tv_nsec = kts.tv_nsec;

        return 0;
}
EXPORT_SYMBOL_GPL(get_timespec64);

/**
 * put_timespec64 - convert timespec64 value to __kernel_timespec format and
 *                     copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct __kernel_timespec
 *
 * Return: %0 on success or negative errno on error
 */
int put_timespec64(const struct timespec64 *ts,
                   struct __kernel_timespec __user *uts)
{
        /* Stage the value in the user-visible layout before copying out. */
        struct __kernel_timespec kts = {
                .tv_sec = ts->tv_sec,
                .tv_nsec = ts->tv_nsec
        };

        if (copy_to_user(uts, &kts, sizeof(kts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);

static int __get_old_timespec32(struct timespec64 *ts64,
                                   const struct old_timespec32 __user *cts)
{
        struct old_timespec32 in32;

        /* Read the 32-bit layout, then copy each field into the 64-bit one. */
        if (copy_from_user(&in32, cts, sizeof(in32)))
                return -EFAULT;

        ts64->tv_sec = in32.tv_sec;
        ts64->tv_nsec = in32.tv_nsec;

        return 0;
}

static int __put_old_timespec32(const struct timespec64 *ts64,
                                   struct old_timespec32 __user *cts)
{
        /* Stage the value in the 32-bit layout before copying out. */
        struct old_timespec32 out32 = {
                .tv_sec = ts64->tv_sec,
                .tv_nsec = ts64->tv_nsec
        };

        if (copy_to_user(cts, &out32, sizeof(out32)))
                return -EFAULT;

        return 0;
}

/**
 * get_old_timespec32 - get user's old-format time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's old-format time value (&struct old_timespec32)
 *
 * Handles X86_X32_ABI compatibility conversion.
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
        /* Ordinary case: the user buffer holds the 32-bit layout. */
        if (!COMPAT_USE_64BIT_TIME)
                return __get_old_timespec32(ts, uts);

        /* Otherwise the user buffer is copied in as a timespec64 directly. */
        if (copy_from_user(ts, uts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_timespec32);

/**
 * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
 *                         copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct old_timespec32
 *
 * Handles X86_X32_ABI compatibility conversion.
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
        /* Ordinary case: write the value out in the 32-bit layout. */
        if (!COMPAT_USE_64BIT_TIME)
                return __put_old_timespec32(ts, uts);

        /* Otherwise the timespec64 is copied out directly. */
        if (copy_to_user(uts, ts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_timespec32);

/**
 * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
 * @it: destination &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Return: %0 on success or negative errno on error
 */
int get_itimerspec64(struct itimerspec64 *it,
                        const struct __kernel_itimerspec __user *uit)
{
        /* Fetch the interval first; stop at the first failure. */
        int ret = get_timespec64(&it->it_interval, &uit->it_interval);

        if (ret)
                return ret;

        return get_timespec64(&it->it_value, &uit->it_value);
}
EXPORT_SYMBOL_GPL(get_itimerspec64);

/**
 * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
 *                       and copy the latter to userspace
 * @it: input &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Return: %0 on success or negative errno on error
 */
int put_itimerspec64(const struct itimerspec64 *it,
                        struct __kernel_itimerspec __user *uit)
{
        /* Store the interval first; stop at the first failure. */
        int ret = put_timespec64(&it->it_interval, &uit->it_interval);

        if (ret)
                return ret;

        return put_timespec64(&it->it_value, &uit->it_value);
}
EXPORT_SYMBOL_GPL(put_itimerspec64);

/**
 * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
 * @its: destination &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_itimerspec32(struct itimerspec64 *its,
                        const struct old_itimerspec32 __user *uits)
{
        /* Interval first, then value; any failure is reported as -EFAULT. */
        if (__get_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__get_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);

/**
 * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
 *                          old_itimerspec32 and copy the latter to userspace
 * @its: input &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_itimerspec32(const struct itimerspec64 *its,
                        struct old_itimerspec32 __user *uits)
{
        /* Interval first, then value; any failure is reported as -EFAULT. */
        if (__put_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__put_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_itimerspec32);






































































































    4 


    4 


    4 


    4 
    3 

    2 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A hash table (hashtab) maintains associations between
 * key values and datum values.  The type of the key values
 * and the type of the datum values is arbitrary.  The
 * functions for hash computation and key comparison are
 * provided by the creator of the table.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SS_HASHTAB_H_
#define _SS_HASHTAB_H_

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>

#define HASHTAB_MAX_NODES        U32_MAX

/*
 * Key-handling callbacks passed by value to hashtab_insert() and
 * hashtab_search().  Those functions mask the hash() result with
 * (size - 1) to pick a slot, and treat cmp() == 0 as "same key" and
 * cmp() < 0 as "stop scanning this chain".
 */
struct hashtab_key_params {
        u32 (*hash)(const void *key);        /* hash function */
        int (*cmp)(const void *key1, const void *key2);
                                        /* key comparison function */
};

/* One entry in a slot's singly linked chain. */
struct hashtab_node {
        void *key;                        /* compared via key_params.cmp() */
        void *datum;                        /* value returned by hashtab_search() */
        struct hashtab_node *next;        /* next entry in the same slot, or NULL */
};

/*
 * The table itself.  The inline helpers in this header index htable with
 * (hash & (size - 1)) and treat size == 0 as "no table".
 * NOTE(review): that masking presumably requires size to be a power of
 * two - confirm against hashtab_init().
 */
struct hashtab {
        struct hashtab_node **htable;        /* hash table */
        u32 size;                        /* number of slots in hash table */
        u32 nel;                        /* number of elements in hash table */
};

/*
 * Table statistics, filled in by hashtab_stat().
 * NOTE(review): field meanings below are taken from their names - confirm
 * against the hashtab_stat() implementation.
 */
struct hashtab_info {
        u32 slots_used;                /* presumably the count of non-empty slots */
        u32 max_chain_len;        /* presumably the longest chain length */
};

/*
 * Initializes a new hash table with the specified characteristics.
 *
 * Returns -ENOMEM if insufficient space is available or 0 otherwise.
 */
int hashtab_init(struct hashtab *h, u32 nel_hint);

int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst,
                     void *key, void *datum);

/*
 * Inserts the specified (key, datum) pair into the specified hash table.
 *
 * Returns -ENOMEM on memory allocation error,
 * -EEXIST if there is already an entry with the same key,
 * -EINVAL for general errors or
 * 0 otherwise.
 */
static inline int hashtab_insert(struct hashtab *h, void *key, void *datum,
                                 struct hashtab_key_params key_params)
{
        struct hashtab_node **link;
        u32 slot;

        cond_resched();

        /* No table, or element counter already at its maximum. */
        if (!h->size || h->nel == HASHTAB_MAX_NODES)
                return -EINVAL;

        slot = key_params.hash(key) & (h->size - 1);

        /*
         * Walk the chain through the link that points at each node, so
         * the loop ends holding exactly the pointer the new node must be
         * stored into: the slot head, some node's ->next, or the tail.
         */
        for (link = &h->htable[slot]; *link; link = &(*link)->next) {
                int order = key_params.cmp(key, (*link)->key);

                if (order == 0)
                        return -EEXIST;
                /* Stop at the first entry that cmp() orders after the key. */
                if (order < 0)
                        break;
        }

        return __hashtab_insert(h, link, key, datum);
}

/*
 * Searches for the entry with the specified key in the hash table.
 *
 * Returns NULL if no entry has the specified key or
 * the datum of the entry otherwise.
 */
static inline void *hashtab_search(struct hashtab *h, const void *key,
                                   struct hashtab_key_params key_params)
{
        struct hashtab_node *node;

        /* A table with no slots holds nothing. */
        if (!h->size)
                return NULL;

        node = h->htable[key_params.hash(key) & (h->size - 1)];

        for (; node; node = node->next) {
                int order = key_params.cmp(key, node->key);

                if (order == 0)
                        return node->datum;
                /* A negative cmp() result ends the scan of this chain. */
                if (order < 0)
                        break;
        }

        return NULL;
}

/*
 * Destroys the specified hash table.
 */
void hashtab_destroy(struct hashtab *h);

/*
 * Applies the specified apply function to (key,datum,args)
 * for each entry in the specified hash table.
 *
 * The order in which the function is applied to the entries
 * is dependent upon the internal structure of the hash table.
 *
 * If apply returns a non-zero status, then hashtab_map will cease
 * iterating through the hash table and will propagate the error
 * return to its caller.
 */
int hashtab_map(struct hashtab *h,
                int (*apply)(void *k, void *d, void *args),
                void *args);

int hashtab_duplicate(struct hashtab *new, struct hashtab *orig,
                int (*copy)(struct hashtab_node *new,
                        struct hashtab_node *orig, void *args),
                int (*destroy)(void *k, void *d, void *args),
                void *args);

/* Fill info with some hash table statistics */
void hashtab_stat(struct hashtab *h, struct hashtab_info *info);

#endif        /* _SS_HASHTAB_H */













































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM net

#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NET_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tracepoint.h>

/*
 * net_dev_start_xmit - snapshot of an skb as it is handed to a device for
 * transmission.
 *
 * @skb: buffer being transmitted (only read)
 * @dev: device the buffer is being sent on (only its name is recorded)
 *
 * Records the device name, queue mapping, skb address, VLAN state, protocol
 * (converted to host byte order), checksum state, lengths, header offsets
 * and the tx/GSO fields of the shared info.
 *
 * transport_offset is only meaningful when transport_offset_valid is set;
 * when the transport header was never set, 0 is recorded rather than an
 * offset computed from an unset header.
 */
TRACE_EVENT(net_dev_start_xmit,

        TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),

        TP_ARGS(skb, dev),

        TP_STRUCT__entry(
                __string(        name,                        dev->name        )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        int,                        network_offset        )
                __field(        bool,                        transport_offset_valid)
                __field(        int,                        transport_offset)
                __field(        u8,                        tx_flags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_segs        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name, dev->name);
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                /* on-wire fields are stored in host byte order */
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->network_offset = skb_network_offset(skb);
                __entry->transport_offset_valid =
                        skb_transport_header_was_set(skb);
                /* do not derive an offset from a header that was never set */
                __entry->transport_offset = skb_transport_header_was_set(skb) ?
                        skb_transport_offset(skb) : 0;
                __entry->tx_flags = skb_shinfo(skb)->tx_flags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_segs = skb_shinfo(skb)->gso_segs;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
                  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
                  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
                  __entry->protocol, __entry->ip_summed, __entry->len,
                  __entry->data_len,
                  __entry->network_offset, __entry->transport_offset_valid,
                  __entry->transport_offset, __entry->tx_flags,
                  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
);

/*
 * net_dev_xmit - result of a transmit attempt.
 *
 * @skb:     buffer that was handed to the device; only its address is
 *           recorded, the buffer itself is never dereferenced here
 * @rc:      return code of the transmit attempt
 * @dev:     device used (only its name is recorded)
 * @skb_len: length to log; passed in by the caller instead of being read
 *           from @skb
 *
 * NOTE(review): @skb_len is presumably passed separately because @skb may
 * no longer be valid once the transmit has returned -- confirm at the
 * call site.
 */
TRACE_EVENT(net_dev_xmit,

        TP_PROTO(struct sk_buff *skb,
                 int rc,
                 struct net_device *dev,
                 unsigned int skb_len),

        TP_ARGS(skb, rc, dev, skb_len),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __field(        int,                rc                )
                __string(        name,                dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb_len;
                __entry->rc = rc;
                __assign_str(name, dev->name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);

/*
 * net_dev_xmit_timeout - a transmit queue of a device timed out.
 *
 * @dev:         device concerned; its name and the driver name returned by
 *               netdev_drivername() are recorded as strings
 * @queue_index: index of the queue the event refers to
 */
TRACE_EVENT(net_dev_xmit_timeout,

        TP_PROTO(struct net_device *dev,
                 int queue_index),

        TP_ARGS(dev, queue_index),

        TP_STRUCT__entry(
                __string(        name,                dev->name        )
                __string(        driver,                netdev_drivername(dev))
                __field(        int,                queue_index        )
        ),

        TP_fast_assign(
                __assign_str(name, dev->name);
                __assign_str(driver, netdev_drivername(dev));
                __entry->queue_index = queue_index;
        ),

        TP_printk("dev=%s driver=%s queue=%d",
                __get_str(name), __get_str(driver), __entry->queue_index)
);

/*
 * net_dev_template - minimal per-skb event class.
 *
 * @skb: buffer to describe; skb->dev is dereferenced for the device name,
 *       so it must be set when an event of this class fires
 *
 * Records the skb address, skb->len and the name of skb->dev.
 */
DECLARE_EVENT_CLASS(net_dev_template,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __string(        name,                skb->dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb->len;
                __assign_str(name, skb->dev->name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u",
                __get_str(name), __entry->skbaddr, __entry->len)
);

/*
 * net_dev_queue: instance of the net_dev_template class; logs the skb's
 * device name, address and length.
 * NOTE(review): presumably emitted when an skb is queued for transmit;
 * the call site is not in this header.
 */
DEFINE_EVENT(net_dev_template, net_dev_queue,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_receive_skb: instance of the net_dev_template class; logs the
 * skb's device name, address and length.
 * NOTE(review): presumably emitted from the netif_receive_skb() receive
 * path; the call site is not in this header.
 */
DEFINE_EVENT(net_dev_template, netif_receive_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_rx: instance of the net_dev_template class; logs the skb's device
 * name, address and length.
 * NOTE(review): presumably emitted from netif_rx(); the call site is not
 * in this header.
 */
DEFINE_EVENT(net_dev_template, netif_rx,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_verbose_template - detailed snapshot of a received skb.
 *
 * @skb: buffer to describe (only read); skb->dev is dereferenced for the
 *       device name, so it must be set when an event of this class fires
 *
 * Records the device name, NAPI id (0 when CONFIG_NET_RX_BUSY_POLL is not
 * enabled), queue mapping, skb address, VLAN state, protocol (converted to
 * host byte order), checksum state, hash, lengths, MAC header offset
 * relative to skb->data, fragment count and GSO size/type.
 *
 * NOTE(review): mac_header is computed unconditionally, so it is only
 * meaningful when mac_header_valid is set.
 */
DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __string(        name,                        skb->dev->name        )
                __field(        unsigned int,                napi_id                )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        u32,                        hash                )
                __field(        bool,                        l4_hash                )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        unsigned int,                truesize        )
                __field(        bool,                        mac_header_valid)
                __field(        int,                        mac_header        )
                __field(        unsigned char,                nr_frags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name, skb->dev->name);
                /* skb->napi_id is only read when busy polling is configured */
#ifdef CONFIG_NET_RX_BUSY_POLL
                __entry->napi_id = skb->napi_id;
#else
                __entry->napi_id = 0;
#endif
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                /* on-wire fields are stored in host byte order */
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->hash = skb->hash;
                __entry->l4_hash = skb->l4_hash;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->truesize = skb->truesize;
                __entry->mac_header_valid = skb_mac_header_was_set(skb);
                /* offset of the MAC header relative to skb->data */
                __entry->mac_header = skb_mac_header(skb) - skb->data;
                __entry->nr_frags = skb_shinfo(skb)->nr_frags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
                  __get_str(name), __entry->napi_id, __entry->queue_mapping,
                  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
                  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
                  __entry->hash, __entry->l4_hash, __entry->len,
                  __entry->data_len, __entry->truesize,
                  __entry->mac_header_valid, __entry->mac_header,
                  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
);

/*
 * napi_gro_frags_entry: instance of net_dev_rx_verbose_template.
 * NOTE(review): presumably emitted on entry to napi_gro_frags(); the call
 * site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * napi_gro_receive_entry: instance of net_dev_rx_verbose_template.
 * NOTE(review): presumably emitted on entry to napi_gro_receive(); the
 * call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_receive_skb_entry: instance of net_dev_rx_verbose_template.
 * NOTE(review): presumably emitted on entry to netif_receive_skb(); the
 * call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_receive_skb_list_entry: instance of net_dev_rx_verbose_template.
 * Takes a single skb, like every event of this class.
 * NOTE(review): presumably emitted per skb on entry to the list receive
 * path; the call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_rx_entry: instance of net_dev_rx_verbose_template.
 * NOTE(review): presumably emitted on entry to netif_rx(); the call site
 * is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_rx_ni_entry: instance of net_dev_rx_verbose_template.
 * NOTE(review): presumably emitted on entry to netif_rx_ni(); the call
 * site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_exit_template - event class that records a single integer
 * return value.
 *
 * @ret: value to log; printed as "ret=%d"
 */
DECLARE_EVENT_CLASS(net_dev_rx_exit_template,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(int,        ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/*
 * napi_gro_frags_exit: instance of net_dev_rx_exit_template; logs @ret.
 * NOTE(review): presumably emitted when napi_gro_frags() returns; the call
 * site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/*
 * napi_gro_receive_exit: instance of net_dev_rx_exit_template; logs @ret.
 * NOTE(review): presumably emitted when napi_gro_receive() returns; the
 * call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/*
 * netif_receive_skb_exit: instance of net_dev_rx_exit_template; logs @ret.
 * NOTE(review): presumably emitted when netif_receive_skb() returns; the
 * call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/*
 * netif_rx_exit: instance of net_dev_rx_exit_template; logs @ret.
 * NOTE(review): presumably emitted when netif_rx() returns; the call site
 * is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/*
 * netif_rx_ni_exit: instance of net_dev_rx_exit_template; logs @ret.
 * NOTE(review): presumably emitted when netif_rx_ni() returns; the call
 * site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/*
 * netif_receive_skb_list_exit: instance of net_dev_rx_exit_template; logs
 * @ret.
 * NOTE(review): presumably emitted when the list receive path returns; the
 * call site is not in this header.
 */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

#endif /* _TRACE_NET_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *                INETPEER - A storage for permanent information about peers
 *
 *  Authors:        Andrey V. Savochkin <saw@msu.ru>
 */

#ifndef _NET_INETPEER_H
#define _NET_INETPEER_H

#include <linux/types.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/rtnetlink.h>
#include <net/ipv6.h>
#include <linux/atomic.h>

/*
 * IPv4 address key for cache lookups.
 *
 * Overlaid with the u32 key[] array of struct inetpeer_addr, so both
 * members take part in word-wise key comparison.
 */
struct ipv4_addr_key {
        __be32        addr;        /* IPv4 address, big-endian */
        int        vif;        /* interface index qualifying the address; 0 when set via inetpeer_set_addr_v4() */
};

/* Number of u32 words needed to hold the largest key, a struct in6_addr. */
#define INETPEER_MAXKEYSZ   (sizeof(struct in6_addr) / sizeof(u32))

/*
 * Address of a peer, for either address family.
 *
 * The anonymous union lets the same storage be accessed as an IPv4 key
 * (a4), an IPv6 address (a6) or a flat array of u32 words (key[]) that
 * inetpeer_addr_cmp() compares word by word.  @family says which view is
 * valid.
 */
struct inetpeer_addr {
        union {
                struct ipv4_addr_key        a4;        /* valid when family == AF_INET */
                struct in6_addr                a6;        /* valid when family == AF_INET6 */
                u32                        key[INETPEER_MAXKEYSZ];        /* raw words for comparison */
        };
        __u16                                family;        /* AF_INET or AF_INET6 */
};

/*
 * Long-lived per-peer information.
 *
 * An entry carries an rb_node for linkage into a red-black tree (see
 * struct inet_peer_base) and the address of the peer it describes.
 */
struct inet_peer {
        struct rb_node                rb_node;        /* tree linkage */
        struct inetpeer_addr        daddr;                /* address of the peer */

        u32                        metrics[RTAX_MAX];        /* per-peer metrics, one slot per RTAX_* index */
        u32                        rate_tokens;        /* rate limiting for ICMP */
        u32                        n_redirects;        /* NOTE(review): presumably a count of redirects sent -- confirm */
        unsigned long                rate_last;        /* NOTE(review): presumably jiffies of the last rate-limit update -- confirm */
        /*
         * Once an inet_peer is queued for deletion (refcnt == 0), the
         * following field is no longer available: rid.
         * We can share its memory with rcu_head to help keep inet_peer small.
         */
        union {
                struct {
                        atomic_t                        rid;                /* Frag reception counter */
                };
                struct rcu_head         rcu;        /* used once the entry is queued for deletion */
        };

        /* following fields might be frequently dirtied */
        __u32                        dtime;        /* the time of last use of not referenced entries */
        refcount_t                refcnt;        /* reference count; 0 means queued for deletion */
};

/*
 * Root of one collection of inet_peer entries.
 */
struct inet_peer_base {
        struct rb_root                rb_root;        /* root of the red-black tree of entries */
        seqlock_t                lock;                /* seqlock for this base; NOTE(review): presumably protects rb_root and total -- confirm */
        int                        total;                /* NOTE(review): presumably the number of entries in the tree -- confirm */
};

/*
 * Initialise an inet_peer_base before first use.
 * NOTE(review): defined outside this header; exact initial state is not
 * visible here.
 */
void inet_peer_base_init(struct inet_peer_base *);

/*
 * One-time initialisation of the inetpeer subsystem; marked __init, so it
 * is only callable during kernel initialisation.
 */
void inet_initpeers(void) __init;

/*
 * All-ones u32 sentinel.
 * NOTE(review): presumably marks metrics that have not been initialised
 * yet -- confirm against the users of this macro.
 */
#define INETPEER_METRICS_NEW        (~(u32) 0)

/*
 * Make @iaddr an IPv4 peer address for @ip.
 *
 * The vif part of the key is reset to 0 and the family is set to AF_INET.
 */
static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
{
        iaddr->family = AF_INET;
        iaddr->a4.vif = 0;
        iaddr->a4.addr = ip;
}

static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
{
        return iaddr->a4.addr;
}

static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
                                        struct in6_addr *in6)
{
        iaddr->a6 = *in6;
        iaddr->family = AF_INET6;
}

/*
 * Return a pointer to the IPv6 address embedded in @iaddr.
 *
 * The pointer aliases @iaddr's storage (no copy is made) and the family is
 * not checked.
 */
static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
{
        struct in6_addr *v6 = &iaddr->a6;

        return v6;
}

/*
 * Look up the peer entry for @daddr in @base.
 *
 * Can be called with or without local BH being disabled.
 *
 * NOTE(review): defined outside this header.  Presumably a non-zero
 * @create asks for a new entry to be allocated when none exists, and the
 * returned entry holds a reference to be dropped with inet_putpeer() --
 * confirm against the implementation.
 */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
                               const struct inetpeer_addr *daddr,
                               int create);

/*
 * IPv4 convenience wrapper around inet_getpeer().
 *
 * Builds an on-stack key from @v4daddr and @vif and passes it, together
 * with @create, to inet_getpeer().  Only the a4 member and the family of
 * the key are filled in.
 */
static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
                                                __be32 v4daddr,
                                                int vif, int create)
{
        struct inetpeer_addr key;

        key.family = AF_INET;
        key.a4.vif = vif;
        key.a4.addr = v4daddr;

        return inet_getpeer(base, &key, create);
}

static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
                                                const struct in6_addr *v6daddr,
                                                int create)
{
        struct inetpeer_addr daddr;

        daddr.a6 = *v6daddr;
        daddr.family = AF_INET6;
        return inet_getpeer(base, &daddr, create);
}

/*
 * Three-way comparison of two peer addresses.
 *
 * The keys are compared as arrays of u32 words, first differing word
 * deciding the result.  The number of words compared is chosen from
 * @a->family only (size of the IPv4 key for AF_INET, size of an IPv6
 * address otherwise); @b->family is never examined.
 *
 * Returns -1 if @a sorts before @b, 1 if after, 0 if all compared words
 * are equal.
 */
static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
                                    const struct inetpeer_addr *b)
{
        int words = (a->family == AF_INET) ?
                        sizeof(a->a4) / sizeof(u32) :
                        sizeof(a->a6) / sizeof(u32);
        int idx;

        for (idx = 0; idx < words; idx++) {
                if (a->key[idx] != b->key[idx])
                        return (a->key[idx] < b->key[idx]) ? -1 : 1;
        }

        return 0;
}

/*
 * Both functions below can be called from BH context or outside.
 *
 * NOTE(review): defined outside this header.  inet_putpeer() presumably
 * drops a reference on @p; inet_peer_xrlim_allow() presumably applies the
 * peer's rate limit (rate_tokens/rate_last) with @timeout and returns true
 * when the action is allowed -- confirm against the implementation.
 */
void inet_putpeer(struct inet_peer *p);
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);

/*
 * Invalidate the tree of the given base.
 * NOTE(review): defined outside this header; presumably removes every
 * entry from the base -- confirm against the implementation.
 */
void inetpeer_invalidate_tree(struct inet_peer_base *);

#endif /* _NET_INETPEER_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Portions of this file
* Copyright(c) 2016 Intel Deutschland GmbH
* Copyright (C) 2018 - 2019 Intel Corporation
*/

#ifndef __MAC80211_DRIVER_OPS
#define __MAC80211_DRIVER_OPS

#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "trace.h"

/*
 * check_sdata_in_driver - verify that an interface is known to the driver.
 *
 * Evaluates to true when IEEE80211_SDATA_IN_DRIVER is set in sdata->flags.
 * Otherwise it emits a one-time warning naming the interface (the netdev
 * name when sdata->dev is set, sdata->name otherwise) together with the
 * flags, and evaluates to false.
 *
 * The argument is parenthesised in the expansion so that any pointer
 * expression can be passed.  It is still evaluated more than once, so it
 * must not have side effects.
 */
#define check_sdata_in_driver(sdata)        ({                                        \
        !WARN_ONCE(!((sdata)->flags & IEEE80211_SDATA_IN_DRIVER),                \
                   "%s: Failed check-sdata-in-driver check, flags: 0x%x\n",        \
                   (sdata)->dev ? (sdata)->dev->name : (sdata)->name,                \
                   (sdata)->flags);                                                \
})

/*
 * Map an interface to the one that owns its BSS.
 *
 * For an AP_VLAN interface this returns the sub_if_data whose u.ap member
 * is the object @sdata->bss points to; for every other interface type
 * @sdata itself is returned.
 */
static inline struct ieee80211_sub_if_data *
get_bss_sdata(struct ieee80211_sub_if_data *sdata)
{
        if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
                return sdata;

        return container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
}

/*
 * Hand @skb to the driver's tx op together with @control.
 *
 * The op is called unconditionally and without tracing.
 */
static inline void drv_tx(struct ieee80211_local *local,
                          struct ieee80211_tx_control *control,
                          struct sk_buff *skb)
{
        struct ieee80211_hw *hw = &local->hw;

        local->ops->tx(hw, control, skb);
}

/*
 * Call the driver's sync_rx_queues op, if it has one.
 *
 * @sta is only used for the trace event (its sdata and embedded
 * ieee80211_sta); the op itself receives just the hw.  Nothing is traced
 * when the op is absent.
 */
static inline void drv_sync_rx_queues(struct ieee80211_local *local,
                                      struct sta_info *sta)
{
        if (!local->ops->sync_rx_queues)
                return;

        trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta);
        local->ops->sync_rx_queues(&local->hw);
        trace_drv_return_void(local);
}

/*
 * Ask the driver to fill @data with the strings of string set @sset for
 * the interface @sdata.
 *
 * Does nothing (and traces nothing) when the driver has no get_et_strings
 * op.
 */
static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata,
                                      u32 sset, u8 *data)
{
        struct ieee80211_local *local = sdata->local;

        if (!local->ops->get_et_strings)
                return;

        trace_drv_get_et_strings(local, sset);
        local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data);
        trace_drv_return_void(local);
}

/*
 * Ask the driver to fill @data with statistics for the interface @sdata;
 * @stats is passed through to the driver unchanged.
 *
 * Does nothing (and traces nothing) when the driver has no get_et_stats
 * op.
 */
static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata,
                                    struct ethtool_stats *stats,
                                    u64 *data)
{
        struct ieee80211_local *local = sdata->local;

        if (!local->ops->get_et_stats)
                return;

        trace_drv_get_et_stats(local);
        local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data);
        trace_drv_return_void(local);
}

/*
 * Ask the driver how many entries string set @sset has for the interface
 * @sdata.
 *
 * Returns the driver's answer, or 0 (without tracing) when the driver has
 * no get_et_sset_count op.
 */
static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata,
                                        int sset)
{
        struct ieee80211_local *local = sdata->local;
        int count;

        if (!local->ops->get_et_sset_count)
                return 0;

        trace_drv_get_et_sset_count(local, sset);
        count = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset);
        trace_drv_return_int(local, count);

        return count;
}

/*
 * Start and stop the driver for @local.  Both are defined out of line,
 * not in this header.
 * NOTE(review): drv_start() presumably returns 0 on success or a negative
 * errno -- confirm against the definition.
 */
int drv_start(struct ieee80211_local *local);
void drv_stop(struct ieee80211_local *local);

#ifdef CONFIG_PM
/*
 * Call the driver's suspend op with the WoWLAN configuration @wowlan and
 * return its result.
 *
 * May sleep.  The op is called unconditionally, so the driver must
 * provide it.
 */
static inline int drv_suspend(struct ieee80211_local *local,
                              struct cfg80211_wowlan *wowlan)
{
        int err;

        might_sleep();

        trace_drv_suspend(local);
        err = local->ops->suspend(&local->hw, wowlan);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Call the driver's resume op and return its result.
 *
 * May sleep.  The op is called unconditionally, so the driver must
 * provide it.
 */
static inline int drv_resume(struct ieee80211_local *local)
{
        int err;

        might_sleep();

        trace_drv_resume(local);
        err = local->ops->resume(&local->hw);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Tell the driver whether wakeup is @enabled, if it has a set_wakeup op.
 *
 * May sleep.  Nothing is traced when the op is absent.
 */
static inline void drv_set_wakeup(struct ieee80211_local *local,
                                  bool enabled)
{
        might_sleep();

        if (local->ops->set_wakeup) {
                trace_drv_set_wakeup(local, enabled);
                local->ops->set_wakeup(&local->hw, enabled);
                trace_drv_return_void(local);
        }
}
#endif

/*
 * Add the interface @sdata to the driver.  Defined out of line, not in
 * this header.
 * NOTE(review): presumably returns 0 on success or a negative errno, and
 * sets IEEE80211_SDATA_IN_DRIVER on success -- confirm against the
 * definition.
 */
int drv_add_interface(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata);

/*
 * Ask the driver to change the interface @sdata to type @type, with @p2p
 * selecting the P2P flavour.  Defined out of line, not in this header.
 * NOTE(review): presumably returns 0 on success or a negative errno --
 * confirm against the definition.
 */
int drv_change_interface(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata,
                         enum nl80211_iftype type, bool p2p);

/*
 * Remove the interface @sdata from the driver.  Defined out of line, not
 * in this header.
 */
void drv_remove_interface(struct ieee80211_local *local,
                          struct ieee80211_sub_if_data *sdata);

/*
 * Call the driver's config op with the bitmask @changed and return its
 * result.
 *
 * May sleep.  The op is called unconditionally, so the driver must
 * provide it.
 */
static inline int drv_config(struct ieee80211_local *local, u32 changed)
{
        int err;

        might_sleep();

        trace_drv_config(local, changed);
        err = local->ops->config(&local->hw, changed);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Notify the driver that BSS configuration @info of interface @sdata has
 * changed; @changed is the bitmask of what changed.
 *
 * May sleep.  The call is dropped, with a one-time warning, when:
 *  - a beacon-related bit (BSS_CHANGED_BEACON or
 *    BSS_CHANGED_BEACON_ENABLED) is set for an interface that is not AP,
 *    ADHOC, MESH_POINT or OCB;
 *  - the interface is P2P_DEVICE or NAN, or it is a MONITOR interface that
 *    neither owns MU-MIMO nor has BSS_CHANGED_TXPOWER set;
 *  - the interface is not marked as being in the driver.
 *
 * The trace events are emitted even when the driver has no
 * bss_info_changed op.
 */
static inline void drv_bss_info_changed(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_bss_conf *info,
                                        u32 changed)
{
        u32 beacon_bits;

        might_sleep();

        beacon_bits = changed & (BSS_CHANGED_BEACON |
                                 BSS_CHANGED_BEACON_ENABLED);

        /* beacon changes only make sense on beaconing interface types */
        if (WARN_ON_ONCE(beacon_bits &&
                         !(sdata->vif.type == NL80211_IFTYPE_AP ||
                           sdata->vif.type == NL80211_IFTYPE_ADHOC ||
                           sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
                           sdata->vif.type == NL80211_IFTYPE_OCB)))
                return;

        /* monitor is only let through for MU-MIMO owners or TX power */
        if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
                         sdata->vif.type == NL80211_IFTYPE_NAN ||
                         (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
                          !(sdata->vif.mu_mimo_owner ||
                            (changed & BSS_CHANGED_TXPOWER)))))
                return;

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_bss_info_changed(local, sdata, info, changed);
        if (local->ops->bss_info_changed)
                local->ops->bss_info_changed(&local->hw, &sdata->vif,
                                             info, changed);
        trace_drv_return_void(local);
}

/*
 * Let the driver pre-process the multicast address list @mc_list.
 *
 * Returns the u64 produced by the driver's prepare_multicast op, or 0 when
 * the driver has no such op.  The trace events are emitted in both cases.
 */
static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
                                        struct netdev_hw_addr_list *mc_list)
{
        u64 mc_cookie;

        trace_drv_prepare_multicast(local, mc_list->count);

        mc_cookie = local->ops->prepare_multicast ?
                local->ops->prepare_multicast(&local->hw, mc_list) : 0;

        trace_drv_return_u64(local, mc_cookie);

        return mc_cookie;
}

/*
 * Call the driver's configure_filter op.
 *
 * @changed_flags, @total_flags and @multicast are passed through
 * unchanged; @total_flags is a pointer, so the driver can modify it.
 *
 * May sleep.  The op is called unconditionally, so the driver must
 * provide it.
 */
static inline void drv_configure_filter(struct ieee80211_local *local,
                                        unsigned int changed_flags,
                                        unsigned int *total_flags,
                                        u64 multicast)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_configure_filter(local, changed_flags, total_flags,
                                   multicast);
        local->ops->configure_filter(hw, changed_flags, total_flags,
                                     multicast);
        trace_drv_return_void(local);
}

/*
 * Pass per-interface filter settings (@filter_flags, @changed_flags) for
 * @sdata to the driver's config_iface_filter op, if it has one.
 *
 * May sleep.  The trace events are emitted even when the op is absent.
 */
static inline void drv_config_iface_filter(struct ieee80211_local *local,
                                           struct ieee80211_sub_if_data *sdata,
                                           unsigned int filter_flags,
                                           unsigned int changed_flags)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_config_iface_filter(local, sdata, filter_flags,
                                      changed_flags);
        if (local->ops->config_iface_filter)
                local->ops->config_iface_filter(hw, &sdata->vif,
                                                filter_flags, changed_flags);
        trace_drv_return_void(local);
}

/*
 * Call the driver's optional ->set_tim() op for @sta.
 *
 * Returns the op's result, or 0 when the op is not implemented.
 * Entry and the returned value are traced.
 */
static inline int drv_set_tim(struct ieee80211_local *local,
                              struct ieee80211_sta *sta, bool set)
{
        int err;

        trace_drv_set_tim(local, sta, set);
        err = local->ops->set_tim ?
              local->ops->set_tim(&local->hw, sta, set) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Issue a key command (@cmd) to the driver.
 *
 * Annotated with might_sleep().  @sdata is first mapped through
 * get_bss_sdata(); if check_sdata_in_driver() rejects the result, -EIO is
 * returned and nothing is traced.  ->set_key() is called without a NULL
 * check.  Returns the op's result otherwise.
 */
static inline int drv_set_key(struct ieee80211_local *local,
                              enum set_key_cmd cmd,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_sta *sta,
                              struct ieee80211_key_conf *key)
{
        struct ieee80211_sub_if_data *bss;
        int err;

        might_sleep();

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return -EIO;

        trace_drv_set_key(local, cmd, bss, sta, key);
        err = local->ops->set_key(&local->hw, cmd, &bss->vif, sta, key);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass updated TKIP key material (@iv32, @phase1key) to the driver.
 *
 * @sta may be NULL, in which case a NULL station pointer is forwarded.
 * @sdata is mapped through get_bss_sdata(); the call is silently dropped
 * when check_sdata_in_driver() rejects it.  ->update_tkip_key() is optional.
 */
static inline void drv_update_tkip_key(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_key_conf *conf,
                                       struct sta_info *sta, u32 iv32,
                                       u16 *phase1key)
{
        struct ieee80211_sta *pubsta = sta ? &sta->sta : NULL;
        struct ieee80211_sub_if_data *bss;

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_update_tkip_key(local, bss, conf, pubsta, iv32);
        if (local->ops->update_tkip_key)
                local->ops->update_tkip_key(&local->hw, &bss->vif, conf,
                                            pubsta, iv32, phase1key);
        trace_drv_return_void(local);
}

/*
 * Start a hardware scan described by @req on @sdata.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata; otherwise returns the result of
 * ->hw_scan(), which is called without a NULL check.
 */
static inline int drv_hw_scan(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_scan_request *req)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_hw_scan(local, sdata);
        err = local->ops->hw_scan(hw, &sdata->vif, req);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Cancel a hardware scan on @sdata.
 *
 * Annotated with might_sleep().  Does nothing when check_sdata_in_driver()
 * rejects @sdata.  ->cancel_hw_scan() is called without a NULL check.
 */
static inline void drv_cancel_hw_scan(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_cancel_hw_scan(local, sdata);
        local->ops->cancel_hw_scan(hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Start a scheduled scan (@req, @ies) on @sdata.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata; otherwise returns the result of
 * ->sched_scan_start(), which is called without a NULL check.
 */
static inline int
drv_sched_scan_start(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     struct cfg80211_sched_scan_request *req,
                     struct ieee80211_scan_ies *ies)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_sched_scan_start(local, sdata);
        err = local->ops->sched_scan_start(hw, &sdata->vif, req, ies);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Stop a scheduled scan on @sdata.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata; otherwise returns the result of
 * ->sched_scan_stop(), which is called without a NULL check.
 */
static inline int drv_sched_scan_stop(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_sched_scan_stop(local, sdata);
        err = local->ops->sched_scan_stop(hw, &sdata->vif);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Tell the driver that a software scan is starting, passing @mac_addr.
 *
 * Annotated with might_sleep().  ->sw_scan_start() is optional; entry and
 * return are traced either way.
 */
static inline void drv_sw_scan_start(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     const u8 *mac_addr)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_sw_scan_start(local, sdata, mac_addr);
        if (local->ops->sw_scan_start)
                local->ops->sw_scan_start(hw, &sdata->vif, mac_addr);
        trace_drv_return_void(local);
}

/*
 * Tell the driver that a software scan has completed.
 *
 * Annotated with might_sleep().  ->sw_scan_complete() is optional; entry and
 * return are traced either way.
 */
static inline void drv_sw_scan_complete(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_sw_scan_complete(local, sdata);
        if (local->ops->sw_scan_complete)
                local->ops->sw_scan_complete(hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Fetch low-level statistics from the driver into @stats.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->get_stats() op, or -EOPNOTSUPP when it is not implemented.  A single
 * trace event records @stats together with the result.
 */
static inline int drv_get_stats(struct ieee80211_local *local,
                                struct ieee80211_low_level_stats *stats)
{
        int err;

        might_sleep();

        err = local->ops->get_stats ?
              local->ops->get_stats(&local->hw, stats) : -EOPNOTSUPP;
        trace_drv_get_stats(local, stats, err);

        return err;
}

/*
 * Ask the driver for the sequence counter of @key, stored into @seq.
 *
 * ->get_key_seq() is optional.  The trace event is emitted after the op
 * (if any) has run.
 */
static inline void drv_get_key_seq(struct ieee80211_local *local,
                                   struct ieee80211_key *key,
                                   struct ieee80211_key_seq *seq)
{
        struct ieee80211_key_conf *kconf = &key->conf;

        if (local->ops->get_key_seq)
                local->ops->get_key_seq(&local->hw, kconf, seq);
        trace_drv_get_key_seq(local, kconf);
}

/*
 * Pass a fragmentation threshold @value to the driver.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->set_frag_threshold() op, or 0 when it is not implemented.
 */
static inline int drv_set_frag_threshold(struct ieee80211_local *local,
                                        u32 value)
{
        int err;

        might_sleep();

        trace_drv_set_frag_threshold(local, value);
        err = local->ops->set_frag_threshold ?
              local->ops->set_frag_threshold(&local->hw, value) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass an RTS threshold @value to the driver.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->set_rts_threshold() op, or 0 when it is not implemented.
 */
static inline int drv_set_rts_threshold(struct ieee80211_local *local,
                                        u32 value)
{
        int err;

        might_sleep();

        trace_drv_set_rts_threshold(local, value);
        err = local->ops->set_rts_threshold ?
              local->ops->set_rts_threshold(&local->hw, value) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass a coverage class @value to the driver.
 *
 * Annotated with might_sleep().  The op's own return value (if any) is not
 * used: the result is 0 when ->set_coverage_class() exists and was called,
 * -EOPNOTSUPP when it does not exist.
 */
static inline int drv_set_coverage_class(struct ieee80211_local *local,
                                         s16 value)
{
        int err = -EOPNOTSUPP;

        might_sleep();

        trace_drv_set_coverage_class(local, value);
        if (local->ops->set_coverage_class) {
                local->ops->set_coverage_class(&local->hw, value);
                err = 0;
        }

        trace_drv_return_int(local, err);
        return err;
}

/*
 * Deliver a station notification (@cmd) for @sta to the driver.
 *
 * @sdata is mapped through get_bss_sdata(); the call is dropped when
 * check_sdata_in_driver() rejects the result.  ->sta_notify() is optional.
 */
static inline void drv_sta_notify(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  enum sta_notify_cmd cmd,
                                  struct ieee80211_sta *sta)
{
        struct ieee80211_sub_if_data *bss = get_bss_sdata(sdata);

        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_notify(local, bss, cmd, sta);
        if (local->ops->sta_notify)
                local->ops->sta_notify(&local->hw, &bss->vif, cmd, sta);
        trace_drv_return_void(local);
}

/*
 * Announce a new station @sta to the driver.
 *
 * Annotated with might_sleep().  @sdata is mapped through get_bss_sdata();
 * -EIO is returned (untraced) when check_sdata_in_driver() rejects the
 * result.  Otherwise returns the result of the optional ->sta_add() op, or
 * 0 when it is not implemented.
 */
static inline int drv_sta_add(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_sta *sta)
{
        struct ieee80211_sub_if_data *bss;
        int err;

        might_sleep();

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return -EIO;

        trace_drv_sta_add(local, bss, sta);
        err = local->ops->sta_add ?
              local->ops->sta_add(&local->hw, &bss->vif, sta) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Tell the driver that station @sta is being removed.
 *
 * Annotated with might_sleep().  @sdata is mapped through get_bss_sdata();
 * the call is dropped when check_sdata_in_driver() rejects the result.
 * ->sta_remove() is optional.
 */
static inline void drv_sta_remove(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_sta *sta)
{
        struct ieee80211_sub_if_data *bss;

        might_sleep();

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_remove(local, bss, sta);
        if (local->ops->sta_remove)
                local->ops->sta_remove(&local->hw, &bss->vif, sta);
        trace_drv_return_void(local);
}

#ifdef CONFIG_MAC80211_DEBUGFS
/*
 * Let the driver add its own debugfs entries for @sta under @dir.
 *
 * Only built with CONFIG_MAC80211_DEBUGFS.  Annotated with might_sleep().
 * @sdata is mapped through get_bss_sdata(); the call is dropped when
 * check_sdata_in_driver() rejects the result.  ->sta_add_debugfs() is
 * optional, and no trace events are emitted here.
 */
static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_sta *sta,
                                       struct dentry *dir)
{
        struct ieee80211_sub_if_data *bss;

        might_sleep();

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return;

        if (!local->ops->sta_add_debugfs)
                return;

        local->ops->sta_add_debugfs(&local->hw, &bss->vif, sta, dir);
}
#endif

/*
 * Invoke the driver's optional ->sta_pre_rcu_remove() op for @sta.
 *
 * Annotated with might_sleep().  @sdata is mapped through get_bss_sdata();
 * the call is dropped when check_sdata_in_driver() rejects the result.
 */
static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
                                          struct ieee80211_sub_if_data *sdata,
                                          struct sta_info *sta)
{
        struct ieee80211_sta *pubsta = &sta->sta;
        struct ieee80211_sub_if_data *bss;

        might_sleep();

        bss = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_pre_rcu_remove(local, bss, pubsta);
        if (local->ops->sta_pre_rcu_remove)
                local->ops->sta_pre_rcu_remove(&local->hw, &bss->vif, pubsta);
        trace_drv_return_void(local);
}

/*
 * Declaration only; the body is not part of this section of the file.
 * Takes a station together with an old and a new ieee80211_sta_state and
 * returns an int that callers are required to consume (__must_check).
 * NOTE(review): presumably reports a state transition to the driver --
 * confirm against the definition.
 */
__must_check
int drv_sta_state(struct ieee80211_local *local,
                  struct ieee80211_sub_if_data *sdata,
                  struct sta_info *sta,
                  enum ieee80211_sta_state old_state,
                  enum ieee80211_sta_state new_state);

/*
 * Declaration only; the body is not part of this section of the file.
 * Returns an int that callers are required to consume (__must_check).
 * NOTE(review): presumably applies a per-station TX power setting --
 * confirm against the definition.
 */
__must_check
int drv_sta_set_txpwr(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata,
                      struct sta_info *sta);

/*
 * Declaration only; the body is not part of this section of the file.
 * NOTE(review): @changed looks like a bitmask of what changed for @sta --
 * confirm against the definition.
 */
void drv_sta_rc_update(struct ieee80211_local *local,
                       struct ieee80211_sub_if_data *sdata,
                       struct ieee80211_sta *sta, u32 changed);

/*
 * Invoke the driver's optional ->sta_rate_tbl_update() op for @sta.
 *
 * @sdata is mapped through get_bss_sdata(); the call is dropped when
 * check_sdata_in_driver() rejects the result.
 */
static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local,
                                           struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_sta *sta)
{
        struct ieee80211_sub_if_data *bss = get_bss_sdata(sdata);

        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_rate_tbl_update(local, bss, sta);
        if (local->ops->sta_rate_tbl_update)
                local->ops->sta_rate_tbl_update(&local->hw, &bss->vif, sta);
        trace_drv_return_void(local);
}

/*
 * Let the driver fill station statistics for @sta into @sinfo.
 *
 * @sdata is mapped through get_bss_sdata(); the call is dropped when
 * check_sdata_in_driver() rejects the result.  ->sta_statistics() is
 * optional.
 */
static inline void drv_sta_statistics(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_sta *sta,
                                      struct station_info *sinfo)
{
        struct ieee80211_sub_if_data *bss = get_bss_sdata(sdata);

        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_statistics(local, bss, sta);
        if (local->ops->sta_statistics)
                local->ops->sta_statistics(&local->hw, &bss->vif, sta, sinfo);
        trace_drv_return_void(local);
}

/*
 * Declaration only; the body is not part of this section of the file.
 * NOTE(review): @ac presumably selects the access category that @params
 * applies to -- confirm against the definition.
 */
int drv_conf_tx(struct ieee80211_local *local,
                struct ieee80211_sub_if_data *sdata, u16 ac,
                const struct ieee80211_tx_queue_params *params);

/*
 * TSF accessors: get, set, offset (signed delta) and reset.
 * Declarations only; the bodies are not part of this section of the file.
 */
u64 drv_get_tsf(struct ieee80211_local *local,
                struct ieee80211_sub_if_data *sdata);
void drv_set_tsf(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u64 tsf);
void drv_offset_tsf(struct ieee80211_local *local,
                    struct ieee80211_sub_if_data *sdata,
                    s64 offset);
void drv_reset_tsf(struct ieee80211_local *local,
                   struct ieee80211_sub_if_data *sdata);

/*
 * Query the driver's optional ->tx_last_beacon() op.
 *
 * Annotated with might_sleep().  When the op is not implemented the result
 * defaults to 0 (default for an unsupported op, chosen for less
 * congestion).
 */
static inline int drv_tx_last_beacon(struct ieee80211_local *local)
{
        int last;

        might_sleep();

        trace_drv_tx_last_beacon(local);
        last = local->ops->tx_last_beacon ?
               local->ops->tx_last_beacon(&local->hw) : 0;
        trace_drv_return_int(local, last);

        return last;
}

/*
 * Declaration only; the body is not part of this section of the file.
 * The request is described by @params.
 */
int drv_ampdu_action(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     struct ieee80211_ampdu_params *params);

/*
 * Fetch survey entry @idx from the driver into @survey.
 *
 * Returns the result of the optional ->get_survey() op, or -EOPNOTSUPP
 * when it is not implemented.  Entry and the returned value are traced.
 */
static inline int drv_get_survey(struct ieee80211_local *local, int idx,
                                struct survey_info *survey)
{
        int err;

        trace_drv_get_survey(local, idx, survey);

        err = local->ops->get_survey ?
              local->ops->get_survey(&local->hw, idx, survey) : -EOPNOTSUPP;

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional ->rfkill_poll() op.
 *
 * Annotated with might_sleep().  No trace events are emitted here.
 */
static inline void drv_rfkill_poll(struct ieee80211_local *local)
{
        might_sleep();

        if (!local->ops->rfkill_poll)
                return;

        local->ops->rfkill_poll(&local->hw);
}

/*
 * Ask the driver to flush the given @queues, optionally dropping frames.
 *
 * @sdata may be NULL, in which case a NULL vif is passed to the driver.
 * A non-NULL @sdata that check_sdata_in_driver() rejects causes the call
 * to be dropped.  Annotated with might_sleep(); ->flush() is optional.
 */
static inline void drv_flush(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             u32 queues, bool drop)
{
        struct ieee80211_vif *vif = NULL;

        might_sleep();

        if (sdata) {
                if (!check_sdata_in_driver(sdata))
                        return;
                vif = &sdata->vif;
        }

        trace_drv_flush(local, queues, drop);
        if (local->ops->flush)
                local->ops->flush(&local->hw, vif, queues, drop);
        trace_drv_return_void(local);
}

/*
 * Hand a channel switch request (@ch_switch) to the driver.
 *
 * Annotated with might_sleep().  ->channel_switch() is called without a
 * NULL check, so the op must be present.
 */
static inline void drv_channel_switch(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_channel_switch(local, sdata, ch_switch);
        local->ops->channel_switch(hw, &sdata->vif, ch_switch);
        trace_drv_return_void(local);
}


/*
 * Pass TX/RX antenna settings to the driver.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->set_antenna() op, or -EOPNOTSUPP when it is not implemented.  A single
 * trace event records the arguments together with the result.
 */
static inline int drv_set_antenna(struct ieee80211_local *local,
                                  u32 tx_ant, u32 rx_ant)
{
        int err;

        might_sleep();

        err = local->ops->set_antenna ?
              local->ops->set_antenna(&local->hw, tx_ant, rx_ant) :
              -EOPNOTSUPP;
        trace_drv_set_antenna(local, tx_ant, rx_ant, err);

        return err;
}

/*
 * Read TX/RX antenna settings from the driver into @tx_ant / @rx_ant.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->get_antenna() op, or -EOPNOTSUPP when it is not implemented.  The
 * trace event reads *@tx_ant and *@rx_ant in both cases, so both pointers
 * must be valid even when the op is missing.
 */
static inline int drv_get_antenna(struct ieee80211_local *local,
                                  u32 *tx_ant, u32 *rx_ant)
{
        int err;

        might_sleep();

        err = local->ops->get_antenna ?
              local->ops->get_antenna(&local->hw, tx_ant, rx_ant) :
              -EOPNOTSUPP;
        trace_drv_get_antenna(local, *tx_ant, *rx_ant, err);

        return err;
}

/*
 * Ask the driver to remain on @chan for @duration with ROC type @type.
 *
 * Annotated with might_sleep().  ->remain_on_channel() is called without a
 * NULL check; its result is traced and returned.
 */
static inline int drv_remain_on_channel(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_channel *chan,
                                        unsigned int duration,
                                        enum ieee80211_roc_type type)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();

        trace_drv_remain_on_channel(local, sdata, chan, duration, type);
        err = local->ops->remain_on_channel(hw, &sdata->vif, chan, duration,
                                            type);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Cancel a remain-on-channel request on @sdata.
 *
 * Annotated with might_sleep().  ->cancel_remain_on_channel() is called
 * without a NULL check; its result is traced and returned.
 */
static inline int
drv_cancel_remain_on_channel(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();

        trace_drv_cancel_remain_on_channel(local, sdata);
        err = local->ops->cancel_remain_on_channel(hw, &sdata->vif);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass TX/RX ring sizes to the driver.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->set_ringparam() op, or -EOPNOTSUPP when it is not implemented.
 *
 * The "not implemented" default is -EOPNOTSUPP rather than -ENOTSUPP:
 * ENOTSUPP is a kernel-internal code with no userspace errno equivalent,
 * and the other wrappers in this file already use -EOPNOTSUPP.
 */
static inline int drv_set_ringparam(struct ieee80211_local *local,
                                    u32 tx, u32 rx)
{
        int ret = -EOPNOTSUPP;

        might_sleep();

        trace_drv_set_ringparam(local, tx, rx);
        if (local->ops->set_ringparam)
                ret = local->ops->set_ringparam(&local->hw, tx, rx);
        trace_drv_return_int(local, ret);

        return ret;
}

/*
 * Read current and maximum TX/RX ring sizes from the driver.
 *
 * Annotated with might_sleep().  ->get_ringparam() is optional; when it is
 * missing the output pointers are left untouched by this wrapper.
 */
static inline void drv_get_ringparam(struct ieee80211_local *local,
                                     u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max);
        if (local->ops->get_ringparam)
                local->ops->get_ringparam(hw, tx, tx_max, rx, rx_max);
        trace_drv_return_void(local);
}

/*
 * Ask the driver whether TX frames are pending.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->tx_frames_pending() op, or false when it is not implemented.
 */
static inline bool drv_tx_frames_pending(struct ieee80211_local *local)
{
        bool pending;

        might_sleep();

        trace_drv_tx_frames_pending(local);
        pending = local->ops->tx_frames_pending ?
                  local->ops->tx_frames_pending(&local->hw) : false;
        trace_drv_return_bool(local, pending);

        return pending;
}

/*
 * Pass a bitrate @mask for @sdata to the driver.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata, -EOPNOTSUPP when the optional
 * ->set_bitrate_mask() op is missing, and the op's result otherwise.
 */
static inline int drv_set_bitrate_mask(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       const struct cfg80211_bitrate_mask *mask)
{
        int err;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_set_bitrate_mask(local, sdata, mask);
        err = local->ops->set_bitrate_mask ?
              local->ops->set_bitrate_mask(&local->hw, &sdata->vif, mask) :
              -EOPNOTSUPP;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass GTK rekey @data for @sdata to the driver.
 *
 * The call is dropped when check_sdata_in_driver() rejects @sdata.
 * ->set_rekey_data() is optional.
 */
static inline void drv_set_rekey_data(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct cfg80211_gtk_rekey_data *data)
{
        struct ieee80211_hw *hw = &local->hw;

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_set_rekey_data(local, sdata, data);
        if (local->ops->set_rekey_data)
                local->ops->set_rekey_data(hw, &sdata->vif, data);
        trace_drv_return_void(local);
}

/*
 * Deliver @event for @sdata to the driver's optional ->event_callback().
 *
 * Entry and return are traced whether or not the op is implemented.
 */
static inline void drv_event_callback(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      const struct ieee80211_event *event)
{
        struct ieee80211_hw *hw = &local->hw;

        trace_drv_event_callback(local, sdata, event);
        if (local->ops->event_callback)
                local->ops->event_callback(hw, &sdata->vif, event);
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's optional ->release_buffered_frames() op for @sta,
 * forwarding @tids, @num_frames, @reason and @more_data unchanged.
 *
 * Entry and return are traced whether or not the op is implemented.
 */
static inline void
drv_release_buffered_frames(struct ieee80211_local *local,
                            struct sta_info *sta, u16 tids, int num_frames,
                            enum ieee80211_frame_release_type reason,
                            bool more_data)
{
        struct ieee80211_sta *pubsta = &sta->sta;

        trace_drv_release_buffered_frames(local, pubsta, tids, num_frames,
                                          reason, more_data);
        if (local->ops->release_buffered_frames)
                local->ops->release_buffered_frames(&local->hw, pubsta, tids,
                                                    num_frames, reason,
                                                    more_data);
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's optional ->allow_buffered_frames() op for @sta,
 * forwarding @tids, @num_frames, @reason and @more_data unchanged.
 *
 * Entry and return are traced whether or not the op is implemented.
 */
static inline void
drv_allow_buffered_frames(struct ieee80211_local *local,
                          struct sta_info *sta, u16 tids, int num_frames,
                          enum ieee80211_frame_release_type reason,
                          bool more_data)
{
        struct ieee80211_sta *pubsta = &sta->sta;

        trace_drv_allow_buffered_frames(local, pubsta, tids, num_frames,
                                        reason, more_data);
        if (local->ops->allow_buffered_frames)
                local->ops->allow_buffered_frames(&local->hw, pubsta, tids,
                                                  num_frames, reason,
                                                  more_data);
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's optional ->mgd_prepare_tx() op with @duration.
 *
 * Annotated with might_sleep().  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.  Warns once if the interface
 * type is not NL80211_IFTYPE_STATION, but still proceeds.
 */
static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      u16 duration)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return;
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);

        trace_drv_mgd_prepare_tx(local, sdata, duration);
        if (local->ops->mgd_prepare_tx)
                local->ops->mgd_prepare_tx(hw, &sdata->vif, duration);
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's optional ->mgd_protect_tdls_discover() op.
 *
 * Annotated with might_sleep().  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.  Warns once if the interface
 * type is not NL80211_IFTYPE_STATION, but still proceeds.
 */
static inline void
drv_mgd_protect_tdls_discover(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return;
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);

        trace_drv_mgd_protect_tdls_discover(local, sdata);
        if (local->ops->mgd_protect_tdls_discover)
                local->ops->mgd_protect_tdls_discover(hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Add channel context @ctx to the driver.
 *
 * Annotated with might_sleep().  Returns the result of the optional
 * ->add_chanctx() op, or -EOPNOTSUPP when it is not implemented.  On a
 * zero result, @ctx->driver_present is set to true.
 */
static inline int drv_add_chanctx(struct ieee80211_local *local,
                                  struct ieee80211_chanctx *ctx)
{
        int err;

        might_sleep();

        trace_drv_add_chanctx(local, ctx);
        err = local->ops->add_chanctx ?
              local->ops->add_chanctx(&local->hw, &ctx->conf) : -EOPNOTSUPP;
        trace_drv_return_int(local, err);
        if (err)
                return err;

        ctx->driver_present = true;
        return 0;
}

/*
 * Remove channel context @ctx from the driver.
 *
 * Annotated with might_sleep().  Warns and bails out if
 * @ctx->driver_present is not set.  ->remove_chanctx() is optional;
 * @ctx->driver_present is cleared afterwards either way.
 */
static inline void drv_remove_chanctx(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        if (WARN_ON(!ctx->driver_present))
                return;

        trace_drv_remove_chanctx(local, ctx);
        if (local->ops->remove_chanctx)
                local->ops->remove_chanctx(hw, &ctx->conf);
        trace_drv_return_void(local);

        ctx->driver_present = false;
}

/*
 * Notify the driver of changes (@changed) to channel context @ctx.
 *
 * Annotated with might_sleep().  ->change_chanctx() is optional; when it is
 * implemented, a one-time warning fires if @ctx->driver_present is not set,
 * and the op is still called.
 */
static inline void drv_change_chanctx(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx,
                                      u32 changed)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_change_chanctx(local, ctx, changed);
        if (local->ops->change_chanctx) {
                WARN_ON_ONCE(!ctx->driver_present);
                local->ops->change_chanctx(hw, &ctx->conf, changed);
        }
        trace_drv_return_void(local);
}

/*
 * Assign channel context @ctx to the interface @sdata in the driver.
 *
 * Returns -EIO (untraced) when check_sdata_in_driver() rejects @sdata.
 * ->assign_vif_chanctx() is optional: when missing the result is 0; when
 * present, a one-time warning fires if @ctx->driver_present is not set and
 * the op's result is returned.
 */
static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
                                         struct ieee80211_sub_if_data *sdata,
                                         struct ieee80211_chanctx *ctx)
{
        struct ieee80211_hw *hw = &local->hw;
        int err = 0;

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_assign_vif_chanctx(local, sdata, ctx);
        if (local->ops->assign_vif_chanctx) {
                WARN_ON_ONCE(!ctx->driver_present);
                err = local->ops->assign_vif_chanctx(hw, &sdata->vif,
                                                     &ctx->conf);
        }
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Unassign channel context @ctx from the interface @sdata in the driver.
 *
 * Annotated with might_sleep().  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.  ->unassign_vif_chanctx() is
 * optional; when present, a one-time warning fires if
 * @ctx->driver_present is not set and the op is still called.
 */
static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
                                            struct ieee80211_sub_if_data *sdata,
                                            struct ieee80211_chanctx *ctx)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_unassign_vif_chanctx(local, sdata, ctx);
        if (local->ops->unassign_vif_chanctx) {
                WARN_ON_ONCE(!ctx->driver_present);
                local->ops->unassign_vif_chanctx(hw, &sdata->vif, &ctx->conf);
        }
        trace_drv_return_void(local);
}

/*
 * Declaration only; the body is not part of this section of the file.
 * Takes an array @vifs of @n_vifs switch descriptors and a switch @mode.
 */
int drv_switch_vif_chanctx(struct ieee80211_local *local,
                           struct ieee80211_vif_chanctx_switch *vifs,
                           int n_vifs, enum ieee80211_chanctx_switch_mode mode);

/*
 * Invoke the driver's optional ->start_ap() op for @sdata.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata, 0 when the op is missing, and
 * the op's result otherwise.  The entry trace includes the interface's
 * bss_conf.
 */
static inline int drv_start_ap(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata)
{
        int err;

        might_sleep();

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf);
        err = local->ops->start_ap ?
              local->ops->start_ap(&local->hw, &sdata->vif) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional ->stop_ap() op for @sdata.
 *
 * The call is dropped when check_sdata_in_driver() rejects @sdata.
 */
static inline void drv_stop_ap(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_stop_ap(local, sdata);
        if (local->ops->stop_ap)
                local->ops->stop_ap(hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's optional ->reconfig_complete() op with
 * @reconfig_type.
 *
 * Annotated with might_sleep().  Entry and return are traced whether or
 * not the op is implemented.
 */
static inline void
drv_reconfig_complete(struct ieee80211_local *local,
                      enum ieee80211_reconfig_type reconfig_type)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();

        trace_drv_reconfig_complete(local, reconfig_type);
        if (local->ops->reconfig_complete)
                local->ops->reconfig_complete(hw, reconfig_type);
        trace_drv_return_void(local);
}

/*
 * Pass the default unicast key index @key_idx to the driver.
 *
 * The call is dropped when check_sdata_in_driver() rejects @sdata.  Warns
 * once if @key_idx lies outside -1..3, but still proceeds.
 * ->set_default_unicast_key() is optional.
 */
static inline void
drv_set_default_unicast_key(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata,
                            int key_idx)
{
        struct ieee80211_hw *hw = &local->hw;

        if (!check_sdata_in_driver(sdata))
                return;

        WARN_ON_ONCE(!(key_idx >= -1 && key_idx <= 3));

        trace_drv_set_default_unicast_key(local, sdata, key_idx);
        if (local->ops->set_default_unicast_key)
                local->ops->set_default_unicast_key(hw, &sdata->vif, key_idx);
        trace_drv_return_void(local);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Invoke the driver's optional ->ipv6_addr_change() op with @idev.
 *
 * Only built when CONFIG_IPV6 is enabled.  Entry and return are traced
 * whether or not the op is implemented.
 */
static inline void drv_ipv6_addr_change(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct inet6_dev *idev)
{
        struct ieee80211_hw *hw = &local->hw;

        trace_drv_ipv6_addr_change(local, sdata);
        if (local->ops->ipv6_addr_change)
                local->ops->ipv6_addr_change(hw, &sdata->vif, idev);
        trace_drv_return_void(local);
}
#endif

/*
 * Invoke the driver's optional ->channel_switch_beacon() op with @chandef.
 *
 * The local is taken from @sdata->local.  Nothing is traced when the op is
 * not implemented; no return trace is emitted.
 */
static inline void
drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata,
                          struct cfg80211_chan_def *chandef)
{
        struct ieee80211_local *local = sdata->local;

        if (!local->ops->channel_switch_beacon)
                return;

        trace_drv_channel_switch_beacon(local, sdata, chandef);
        local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef);
}

/*
 * Invoke the driver's optional ->pre_channel_switch() op with @ch_switch.
 *
 * The local is taken from @sdata->local.  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata, 0 when the op is missing, and
 * the op's result otherwise.
 */
static inline int
drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata,
                       struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_local *local = sdata->local;
        int err;

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_pre_channel_switch(local, sdata, ch_switch);
        err = local->ops->pre_channel_switch ?
              local->ops->pre_channel_switch(&local->hw, &sdata->vif,
                                             ch_switch) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional ->post_channel_switch() op.
 *
 * The local is taken from @sdata->local.  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata, 0 when the op is missing, and
 * the op's result otherwise.
 */
static inline int
drv_post_channel_switch(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        int err;

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_post_channel_switch(local, sdata);
        err = local->ops->post_channel_switch ?
              local->ops->post_channel_switch(&local->hw, &sdata->vif) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional ->abort_channel_switch() op.
 *
 * The local is taken from @sdata->local.  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.  The entry trace fires even when
 * the op is missing; no return trace is emitted.
 */
static inline void
drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_abort_channel_switch(local, sdata);

        if (!local->ops->abort_channel_switch)
                return;

        local->ops->abort_channel_switch(&local->hw, &sdata->vif);
}

/*
 * Invoke the driver's optional ->channel_switch_rx_beacon() op with
 * @ch_switch.
 *
 * The local is taken from @sdata->local.  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.  The entry trace fires even when
 * the op is missing; no return trace is emitted.
 */
static inline void
drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_local *local = sdata->local;

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch);

        if (!local->ops->channel_switch_rx_beacon)
                return;

        local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif,
                                             ch_switch);
}

/*
 * Invoke the driver's optional ->join_ibss() op for @sdata.
 *
 * Annotated with might_sleep().  Returns -EIO (untraced) when
 * check_sdata_in_driver() rejects @sdata, 0 when the op is missing, and
 * the op's result otherwise.  The entry trace includes the interface's
 * bss_conf.
 */
static inline int drv_join_ibss(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata)
{
        int err;

        might_sleep();
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf);
        err = local->ops->join_ibss ?
              local->ops->join_ibss(&local->hw, &sdata->vif) : 0;
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional ->leave_ibss() op for @sdata.
 *
 * Annotated with might_sleep().  The call is dropped when
 * check_sdata_in_driver() rejects @sdata.
 */
static inline void drv_leave_ibss(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_leave_ibss(local, sdata);
        if (local->ops->leave_ibss)
                local->ops->leave_ibss(hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Query the driver for the expected throughput of @sta.
 *
 * The optional ->get_expected_throughput() op is consulted only when
 * @sta->uploaded is set; otherwise, or when the op is missing, the result
 * is 0.  Entry and the returned value are traced.
 */
static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
                                              struct sta_info *sta)
{
        struct ieee80211_sta *pubsta = &sta->sta;
        u32 tput;

        trace_drv_get_expected_throughput(pubsta);
        tput = (local->ops->get_expected_throughput && sta->uploaded) ?
               local->ops->get_expected_throughput(&local->hw, pubsta) : 0;
        trace_drv_return_u32(local, tput);

        return tput;
}

/*
 * Read the TX power for @sdata from the driver into @dbm.
 *
 * Returns -EOPNOTSUPP (untraced) when ->get_txpower() is not implemented.
 * Otherwise the op's result is returned, and a single trace event records
 * *@dbm together with that result.
 */
static inline int drv_get_txpower(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata, int *dbm)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        if (!local->ops->get_txpower)
                return -EOPNOTSUPP;

        err = local->ops->get_txpower(hw, &sdata->vif, dbm);
        trace_drv_get_txpower(local, sdata, *dbm, err);

        return err;
}

/*
 * Request a TDLS channel switch for @sta from the driver.
 *
 * Annotated with might_sleep().  Returns -EIO when check_sdata_in_driver()
 * rejects @sdata, then -EOPNOTSUPP when ->tdls_channel_switch() is not
 * implemented (both untraced); otherwise the op's result is traced and
 * returned.
 */
static inline int
drv_tdls_channel_switch(struct ieee80211_local *local,
                        struct ieee80211_sub_if_data *sdata,
                        struct ieee80211_sta *sta, u8 oper_class,
                        struct cfg80211_chan_def *chandef,
                        struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie)
{
        struct ieee80211_hw *hw = &local->hw;
        int err;

        might_sleep();
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        if (!local->ops->tdls_channel_switch)
                return -EOPNOTSUPP;

        trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef);
        err = local->ops->tdls_channel_switch(hw, &sdata->vif, sta, oper_class,
                                              chandef, tmpl_skb, ch_sw_tm_ie);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Cancel a TDLS channel switch for @sta in the driver.
 *
 * Annotated with might_sleep().  Nothing is traced or called when
 * check_sdata_in_driver() rejects @sdata or when
 * ->tdls_cancel_channel_switch() is not implemented.
 */
static inline void
drv_tdls_cancel_channel_switch(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               struct ieee80211_sta *sta)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->tdls_cancel_channel_switch)
                return;

        trace_drv_tdls_cancel_channel_switch(local, sdata, sta);
        local->ops->tdls_cancel_channel_switch(hw, &sdata->vif, sta);
        trace_drv_return_void(local);
}

/*
 * Pass a received TDLS channel-switch event, described by @params, to the
 * driver.  The entry and return tracepoints fire whether or not the driver
 * implements the tdls_recv_channel_switch op.
 */
static inline void
drv_tdls_recv_channel_switch(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_tdls_ch_sw_params *params)
{
        trace_drv_tdls_recv_channel_switch(local, sdata, params);

        if (!local->ops->tdls_recv_channel_switch)
                goto traced_return;

        local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif, params);

traced_return:
        trace_drv_return_void(local);
}

/*
 * Invoke the driver's wake_tx_queue op for @txq.
 *
 * While local->in_reconfig is set nothing is handed to the driver; the
 * queue is only flagged with IEEE80211_TXQ_STOP_NETIF_TX.  Otherwise the
 * op is called, provided check_sdata_in_driver() accepts the interface
 * that owns the queue.
 */
static inline void drv_wake_tx_queue(struct ieee80211_local *local,
                                     struct txq_info *txq)
{
        struct ieee80211_sub_if_data *owner = vif_to_sdata(txq->txq.vif);

        if (local->in_reconfig) {
                /* In reconfig don't transmit now, but mark for waking later */
                set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txq->flags);
                return;
        }

        if (check_sdata_in_driver(owner)) {
                trace_drv_wake_tx_queue(local, owner, txq);
                local->ops->wake_tx_queue(&local->hw, &txq->txq);
        }
}

/*
 * Schedule @txqi's queue via ieee80211_schedule_txq() and then wake it
 * through drv_wake_tx_queue(), in that order.
 */
static inline void schedule_and_wake_txq(struct ieee80211_local *local,
                                         struct txq_info *txqi)
{
        ieee80211_schedule_txq(&local->hw, &txqi->txq);
        drv_wake_tx_queue(local, txqi);
}

/*
 * Ask the driver whether @skb may be aggregated into the A-MSDU headed by
 * @head.  Returns true when the driver provides no can_aggregate_in_amsdu
 * op; otherwise returns the op's answer.
 */
static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
                                             struct sk_buff *head,
                                             struct sk_buff *skb)
{
        if (local->ops->can_aggregate_in_amsdu)
                return local->ops->can_aggregate_in_amsdu(&local->hw, head,
                                                          skb);

        return true;
}

/*
 * Fetch FTM responder statistics for @sdata's interface into @ftm_stats.
 *
 * Returns the result of the get_ftm_responder_stats op, or -EOPNOTSUPP when
 * the driver does not implement it.  The tracepoint fires in both cases.
 */
static inline int
drv_get_ftm_responder_stats(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata,
                            struct cfg80211_ftm_responder_stats *ftm_stats)
{
        int err;

        if (local->ops->get_ftm_responder_stats)
                err = local->ops->get_ftm_responder_stats(&local->hw,
                                                          &sdata->vif,
                                                          ftm_stats);
        else
                err = -EOPNOTSUPP;

        trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats);

        return err;
}

/*
 * Start the peer measurement described by @request on @sdata's interface.
 *
 * Annotated with might_sleep().  Returns -EIO, without tracing, when
 * check_sdata_in_driver() rejects @sdata.  Otherwise returns the start_pmsr
 * op's result, or -EOPNOTSUPP if the driver has no such op; that value is
 * traced.
 */
static inline int drv_start_pmsr(struct ieee80211_local *local,
                                 struct ieee80211_sub_if_data *sdata,
                                 struct cfg80211_pmsr_request *request)
{
        int status;

        might_sleep();
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_start_pmsr(local, sdata);

        status = local->ops->start_pmsr ?
                 local->ops->start_pmsr(&local->hw, &sdata->vif, request) :
                 -EOPNOTSUPP;
        trace_drv_return_int(local, status);

        return status;
}

/*
 * Abort the peer measurement described by @request.
 *
 * The entry tracepoint fires first, before the might_sleep() annotation.
 * When check_sdata_in_driver() rejects @sdata the function returns without
 * emitting the return tracepoint; otherwise the abort_pmsr op is called if
 * the driver provides one, and the return is traced.
 */
static inline void drv_abort_pmsr(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  struct cfg80211_pmsr_request *request)
{
        trace_drv_abort_pmsr(local, sdata);

        might_sleep();
        if (check_sdata_in_driver(sdata)) {
                if (local->ops->abort_pmsr)
                        local->ops->abort_pmsr(&local->hw, &sdata->vif,
                                               request);
                trace_drv_return_void(local);
        }
}

/*
 * Start NAN operation on @sdata's interface with configuration @conf.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon, and the start_nan op is invoked without a NULL
 * check.  Returns the op's result, which is also traced.
 */
static inline int drv_start_nan(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata,
                                struct cfg80211_nan_conf *conf)
{
        int err;

        might_sleep();
        (void)check_sdata_in_driver(sdata);

        trace_drv_start_nan(local, sdata, conf);

        err = local->ops->start_nan(&local->hw, &sdata->vif, conf);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Stop NAN operation on @sdata's interface.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon, and the stop_nan op is invoked without a NULL
 * check.
 */
static inline void drv_stop_nan(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
        (void)check_sdata_in_driver(sdata);

        trace_drv_stop_nan(local, sdata);

        local->ops->stop_nan(&local->hw, &sdata->vif);

        trace_drv_return_void(local);
}

/*
 * Push an updated NAN configuration @conf to the driver; @changes is passed
 * through to the op unchanged.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon.  Returns -EOPNOTSUPP, without tracing, when the
 * driver has no nan_change_conf op; otherwise the op's traced result.
 */
static inline int drv_nan_change_conf(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct cfg80211_nan_conf *conf,
                                       u32 changes)
{
        int err = -EOPNOTSUPP;

        might_sleep();
        (void)check_sdata_in_driver(sdata);

        if (local->ops->nan_change_conf) {
                trace_drv_nan_change_conf(local, sdata, conf, changes);
                err = local->ops->nan_change_conf(&local->hw, &sdata->vif,
                                                  conf, changes);
                trace_drv_return_int(local, err);
        }

        return err;
}

/*
 * Register the NAN function @nan_func with the driver.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon.  Returns -EOPNOTSUPP, without tracing, when the
 * driver has no add_nan_func op; otherwise the op's traced result.
 */
static inline int drv_add_nan_func(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata,
                                   const struct cfg80211_nan_func *nan_func)
{
        int err = -EOPNOTSUPP;

        might_sleep();
        (void)check_sdata_in_driver(sdata);

        if (local->ops->add_nan_func) {
                trace_drv_add_nan_func(local, sdata, nan_func);
                err = local->ops->add_nan_func(&local->hw, &sdata->vif,
                                               nan_func);
                trace_drv_return_int(local, err);
        }

        return err;
}

/*
 * Remove the NAN function identified by @instance_id from the driver.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon.  Entry and return are traced even when the
 * driver does not implement the del_nan_func op.
 */
static inline void drv_del_nan_func(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata,
                                   u8 instance_id)
{
        might_sleep();
        (void)check_sdata_in_driver(sdata);

        trace_drv_del_nan_func(local, sdata, instance_id);

        if (local->ops->del_nan_func != NULL)
                local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id);

        trace_drv_return_void(local);
}

/*
 * Apply the per-TID configuration @tid_conf for @sta (as passed by the
 * caller) on @sdata's interface.
 *
 * Annotated with might_sleep().  The set_tid_config op is invoked without
 * a NULL check; its result is traced and returned.
 */
static inline int drv_set_tid_config(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta *sta,
                                     struct cfg80211_tid_config *tid_conf)
{
        int rc;

        might_sleep();

        rc = local->ops->set_tid_config(&local->hw, &sdata->vif, sta, tid_conf);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Reset the per-TID configuration selected by @tids for @sta on @sdata's
 * interface.
 *
 * Annotated with might_sleep().  The reset_tid_config op is invoked without
 * a NULL check; its result is traced and returned.
 */
static inline int drv_reset_tid_config(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_sta *sta, u8 tids)
{
        int rc;

        might_sleep();

        rc = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Invoke the driver's update_vif_offload op for @sdata's interface.
 *
 * Annotated with might_sleep().  check_sdata_in_driver() is called but its
 * result is not acted upon.  Nothing is traced or called when the driver
 * lacks the update_vif_offload op.
 */
static inline void drv_update_vif_offload(struct ieee80211_local *local,
                                          struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
        (void)check_sdata_in_driver(sdata);

        if (local->ops->update_vif_offload) {
                trace_drv_update_vif_offload(local, sdata);
                local->ops->update_vif_offload(&local->hw, &sdata->vif);
                trace_drv_return_void(local);
        }
}

/*
 * Tell the driver whether 4-address mode is @enabled for @sta.
 *
 * @sdata is first replaced by the result of get_bss_sdata().  If
 * check_sdata_in_driver() rejects that interface nothing is traced or
 * called.  Otherwise entry and return are traced, and the sta_set_4addr op
 * is invoked when the driver provides it.
 */
static inline void drv_sta_set_4addr(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta *sta, bool enabled)
{
        sdata = get_bss_sdata(sdata);

        if (check_sdata_in_driver(sdata)) {
                trace_drv_sta_set_4addr(local, sdata, sta, enabled);
                if (local->ops->sta_set_4addr)
                        local->ops->sta_set_4addr(&local->hw, &sdata->vif,
                                                  sta, enabled);
                trace_drv_return_void(local);
        }
}

#endif /* __MAC80211_DRIVER_OPS */

























































































































































    2 

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * W: modification, R: walking the list.
         * Note that the anon_vma_lock_*()/anon_vma_unlock_*() helpers in
         * this header operate on ->root->rwsem, not on the rwsem of the
         * anon_vma they are handed.
         */
        struct rw_semaphore rwsem;
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma or page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for cleaning up the
         * anon_vma if they are the last user on release
         * (see get_anon_vma() and put_anon_vma()).
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the number of anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used when deciding whether to reuse an anon_vma
         * instead of forking a new one. See the comments in anon_vma_clone().
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        struct vm_area_struct *vma;        /* the VMA side of this link */
        struct anon_vma *anon_vma;        /* the anon_vma side of this link */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        /* NOTE(review): presumably interval-tree bookkeeping for ->rb; confirm */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* Only present when CONFIG_DEBUG_VM_RB is enabled. */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Flags accepted by try_to_unmap().  Each value occupies its own bit.
 */
enum ttu_flags {
        TTU_MIGRATION                = 0x1,        /* migration mode */
        TTU_MUNLOCK                = 0x2,        /* munlock mode */

        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_IGNORE_HWPOISON        = 0x20,        /* corrupted page is recoverable */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
        TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop a reference on @anon_vma.  When the refcount reaches zero the
 * object is handed to __put_anon_vma().
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        if (!atomic_dec_and_test(&anon_vma->refcount))
                return;

        __put_anon_vma(anon_vma);
}

/* Acquire the rwsem of @anon_vma's root for writing. */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_write(root_sem);
}

/* Release the write hold on the rwsem of @anon_vma's root. */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_write(root_sem);
}

/* Acquire the rwsem of @anon_vma's root for reading. */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_read(root_sem);
}

/* Release the read hold on the rwsem of @anon_vma's root. */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_read(root_sem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Fast-path wrapper around __anon_vma_prepare(): returns 0 immediately when
 * @vma already has an anon_vma (the expected case), and otherwise returns
 * whatever __anon_vma_prepare() reports.
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        return likely(vma->anon_vma) ? 0 : __anon_vma_prepare(vma);
}

/*
 * Called when @next is merged into @vma: drops @next's anon_vma links via
 * unlink_anon_vmas().  Both VMAs are expected to share the same anon_vma;
 * VM_BUG_ON_VMA() checks this.
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *page_get_anon_vma(struct page *page);

/* bitflags for do_page_add_anon_rmap() */
#define RMAP_EXCLUSIVE 0x01
#define RMAP_COMPOUND 0x02

/*
 * rmap interfaces called when adding or removing pte of page
 */
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
                unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                unsigned long, bool);
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
                            unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                                unsigned long);

/*
 * Add one mapping reference to @page: the counter returned by
 * compound_mapcount_ptr() is incremented when @compound is true, and
 * page->_mapcount otherwise.
 */
static inline void page_dup_rmap(struct page *page, bool compound)
{
        if (compound)
                atomic_inc(compound_mapcount_ptr(page));
        else
                atomic_inc(&page->_mapcount);
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

bool try_to_unmap(struct page *, enum ttu_flags flags);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/*
 * State passed to page_vma_mapped_walk() and released with
 * page_vma_mapped_walk_done().
 */
struct page_vma_mapped_walk {
        struct page *page;
        struct vm_area_struct *vma;
        unsigned long address;
        pmd_t *pmd;
        pte_t *pte;                /* if set, unmapped by page_vma_mapped_walk_done()
                                 * (except for HugeTLB pages) */
        spinlock_t *ptl;        /* if set, unlocked by page_vma_mapped_walk_done() */
        unsigned int flags;        /* presumably PVMW_* bits -- TODO confirm */
};

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
        if (pvmw->pte && !PageHuge(pvmw->page))
                pte_unmap(pvmw->pte);
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int page_mkclean(struct page *);

/*
 * called in munlock()/munmap() path to check for other vmas holding
 * the page mlocked.
 */
void try_to_munlock(struct page *);

void remove_migration_ptes(struct page *old, struct page *new, bool locked);

/*
 * Called by memory-failure.c to kill processes.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: controls how rmap_walk() and rmap_walk_locked()
 * traverse the reverse mappings, for callers with specific needs.
 *
 * arg: opaque pointer passed to rmap_one() and invalid_vma()
 * rmap_one: executed on each vma where the page is mapped
 * done: checks whether the traversal can terminate
 * anon_lock: obtains the anon_vma lock in an optimized way instead of the
 *            default
 * invalid_vma: used to skip vmas the caller is not interested in
 */
struct rmap_walk_control {
        void *arg;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct page *page);
        struct anon_vma *(*anon_lock)(struct page *page);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

/*
 * !CONFIG_MMU stubs: the anon_vma helpers expand to no-ops (or to the
 * constant 0 where a value is expected).
 */
#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)
#define anon_vma_link(vma)        do {} while (0)

/*
 * !CONFIG_MMU stub: clears *vm_flags and always returns 0; the other
 * arguments are ignored.
 */
static inline int page_referenced(struct page *page, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        *vm_flags = 0;
        return 0;
}

/* !CONFIG_MMU stub: always evaluates to false without using its arguments. */
#define try_to_unmap(page, refs) false

/* !CONFIG_MMU stub: always returns 0. */
static inline int page_mkclean(struct page *page)
{
        return 0;
}


#endif        /* CONFIG_MMU */

#endif        /* _LINUX_RMAP_H */













































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/include/linux/relay.h
 *
 * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
 * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
 *
 * CONFIG_RELAY definitions and declarations
 */

#ifndef _LINUX_RELAY_H
#define _LINUX_RELAY_H

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/irq_work.h>
#include <linux/bug.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/kref.h>
#include <linux/percpu.h>

/*
 * Tracks changes to rchan/rchan_buf structs
 */
#define RELAYFS_CHANNEL_VERSION                7

/*
 * Per-cpu relay channel buffer.
 *
 * A channel (struct rchan) refers to these through its per-cpu ->buf
 * pointer.  Each buffer is divided into sub-buffers; ->data and ->offset
 * describe the write position inside the current one.
 */
struct rchan_buf
{
        void *start;                        /* start of channel buffer */
        void *data;                        /* start of current sub-buffer */
        size_t offset;                        /* current offset into sub-buffer */
        size_t subbufs_produced;        /* count of sub-buffers produced */
        size_t subbufs_consumed;        /* count of sub-buffers consumed */
        struct rchan *chan;                /* associated channel */
        wait_queue_head_t read_wait;        /* reader wait queue */
        struct irq_work wakeup_work;        /* reader wakeup */
        struct dentry *dentry;                /* channel file dentry */
        struct kref kref;                /* channel buffer refcount */
        struct page **page_array;        /* array of current buffer pages */
        unsigned int page_count;        /* number of current buffer pages */
        unsigned int finalized;                /* buffer has been finalized */
        size_t *padding;                /* padding counts per sub-buffer */
        size_t prev_padding;                /* temporary variable */
        size_t bytes_consumed;                /* bytes consumed in cur read subbuf */
        size_t early_bytes;                /* bytes consumed before VFS inited */
        unsigned int cpu;                /* this buf's cpu */
} ____cacheline_aligned;

/*
 * Relay channel data structure.
 *
 * Returned by relay_open() and passed to the other relay_*() calls
 * declared in this header.
 */
struct rchan
{
        u32 version;                        /* the version of this struct */
        size_t subbuf_size;                /* sub-buffer size */
        size_t n_subbufs;                /* number of sub-buffers per buffer */
        size_t alloc_size;                /* total buffer size allocated */
        struct rchan_callbacks *cb;        /* client callbacks */
        struct kref kref;                /* channel refcount */
        void *private_data;                /* for user-defined data */
        size_t last_toobig;                /* tried to log event > subbuf size */
        struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */
        int is_global;                        /* One global buffer ? */
        struct list_head list;                /* for channel list */
        struct dentry *parent;                /* parent dentry passed to open */
        int has_base_filename;                /* has a filename associated? */
        char base_filename[NAME_MAX];        /* saved base filename */
};

/*
 * Relay channel client callbacks.
 *
 * A pointer to one of these is passed to relay_open() and stored in the
 * channel's ->cb field.
 */
struct rchan_callbacks
{
        /*
         * subbuf_start - called on buffer-switch to a new sub-buffer
         * @buf: the channel buffer containing the new sub-buffer
         * @subbuf: the start of the new sub-buffer
         * @prev_subbuf: the start of the previous sub-buffer
         * @prev_padding: unused space at the end of previous sub-buffer
         *
         * The client should return 1 to continue logging, 0 to stop
         * logging.
         *
         * NOTE: subbuf_start will also be invoked when the buffer is
         *       created, so that the first sub-buffer can be initialized
         *       if necessary.  In this case, prev_subbuf will be NULL.
         *
         * NOTE: the client can reserve bytes at the beginning of the new
         *       sub-buffer by calling subbuf_start_reserve() in this callback.
         */
        int (*subbuf_start) (struct rchan_buf *buf,
                             void *subbuf,
                             void *prev_subbuf,
                             size_t prev_padding);

        /*
         * buf_mapped - relay buffer mmap notification
         * @buf: the channel buffer
         * @filp: relay file pointer
         *
         * Called when a relay file is successfully mmapped
         */
        void (*buf_mapped)(struct rchan_buf *buf,
                           struct file *filp);

        /*
         * buf_unmapped - relay buffer unmap notification
         * @buf: the channel buffer
         * @filp: relay file pointer
         *
         * Called when a relay file is successfully unmapped
         */
        void (*buf_unmapped)(struct rchan_buf *buf,
                             struct file *filp);
        /*
         * create_buf_file - create file to represent a relay channel buffer
         * @filename: the name of the file to create
         * @parent: the parent of the file to create
         * @mode: the mode of the file to create
         * @buf: the channel buffer
         * @is_global: outparam - set non-zero if the buffer should be global
         *
         * Called during relay_open(), once for each per-cpu buffer,
         * to allow the client to create a file to be used to
         * represent the corresponding channel buffer.  If the file is
         * created outside of relay, the parent must also exist in
         * that filesystem.
         *
         * The callback should return the dentry of the file created
         * to represent the relay buffer.
         *
         * Setting the is_global outparam to a non-zero value will
         * cause relay_open() to create a single global buffer rather
         * than the default set of per-cpu buffers.
         *
         * See Documentation/filesystems/relay.rst for more info.
         */
        struct dentry *(*create_buf_file)(const char *filename,
                                          struct dentry *parent,
                                          umode_t mode,
                                          struct rchan_buf *buf,
                                          int *is_global);

        /*
         * remove_buf_file - remove file representing a relay channel buffer
         * @dentry: the dentry of the file to remove
         *
         * Called during relay_close(), once for each per-cpu buffer,
         * to allow the client to remove a file used to represent a
         * channel buffer.
         *
         * The callback should return 0 if successful, or a negative
         * value if not.
         */
        int (*remove_buf_file)(struct dentry *dentry);
};

/*
 * CONFIG_RELAY kernel API, kernel/relay.c
 */

struct rchan *relay_open(const char *base_filename,
                         struct dentry *parent,
                         size_t subbuf_size,
                         size_t n_subbufs,
                         struct rchan_callbacks *cb,
                         void *private_data);
extern int relay_late_setup_files(struct rchan *chan,
                                  const char *base_filename,
                                  struct dentry *parent);
extern void relay_close(struct rchan *chan);
extern void relay_flush(struct rchan *chan);
extern void relay_subbufs_consumed(struct rchan *chan,
                                   unsigned int cpu,
                                   size_t consumed);
extern void relay_reset(struct rchan *chan);
extern int relay_buf_full(struct rchan_buf *buf);

extern size_t relay_switch_subbuf(struct rchan_buf *buf,
                                  size_t length);

/**
 * relay_write - write data into the channel
 * @chan: relay channel
 * @data: data to be written
 * @length: number of bytes to write
 *
 * Copies @data into the current cpu's channel buffer at the buffer's
 * current offset and advances the offset.  If the data would run past
 * the end of the current sub-buffer, relay_switch_subbuf() is called
 * first and the number of bytes copied is whatever it returns.
 *
 * Interrupts are disabled around the update, so use this variant if
 * you might be logging from interrupt context.  Try __relay_write()
 * if you know you won't be logging from interrupt context.
 */
static inline void relay_write(struct rchan *chan,
                               const void *data,
                               size_t length)
{
        unsigned long irq_state;
        struct rchan_buf *cpu_buf;

        local_irq_save(irq_state);

        cpu_buf = *this_cpu_ptr(chan->buf);
        if (unlikely(cpu_buf->offset + length > chan->subbuf_size))
                length = relay_switch_subbuf(cpu_buf, length);

        memcpy(cpu_buf->data + cpu_buf->offset, data, length);
        cpu_buf->offset += length;

        local_irq_restore(irq_state);
}

/**
 * __relay_write - write data into the channel
 * @chan: relay channel
 * @data: data to be written
 * @length: number of bytes to write
 *
 * Copies @data into the current cpu's channel buffer at the buffer's
 * current offset and advances the offset.  If the data would run past
 * the end of the current sub-buffer, relay_switch_subbuf() is called
 * first and the number of bytes copied is whatever it returns.
 *
 * The buffer is protected by disabling preemption only (the per-cpu
 * pointer is held between get_cpu_ptr() and put_cpu_ptr()).  Use
 * relay_write() if you might be logging from interrupt context.
 */
static inline void __relay_write(struct rchan *chan,
                                 const void *data,
                                 size_t length)
{
        struct rchan_buf *cpu_buf = *get_cpu_ptr(chan->buf);

        if (unlikely(cpu_buf->offset + length > cpu_buf->chan->subbuf_size))
                length = relay_switch_subbuf(cpu_buf, length);

        memcpy(cpu_buf->data + cpu_buf->offset, data, length);
        cpu_buf->offset += length;

        put_cpu_ptr(chan->buf);
}

/**
 * relay_reserve - reserve slot in channel buffer
 * @chan: relay channel
 * @length: number of bytes to reserve
 *
 * Reserves a slot in the current cpu's channel buffer and returns a
 * pointer to it.  If the request does not fit in the current
 * sub-buffer, relay_switch_subbuf() is called; when that returns 0 the
 * buffer is treated as full and NULL is returned, otherwise the length
 * it returns is what gets reserved.
 *
 * Does not protect the buffer at all - caller must provide
 * appropriate synchronization.
 */
static inline void *relay_reserve(struct rchan *chan, size_t length)
{
        struct rchan_buf *cpu_buf = *get_cpu_ptr(chan->buf);
        void *slot;

        if (unlikely(cpu_buf->offset + length > cpu_buf->chan->subbuf_size)) {
                length = relay_switch_subbuf(cpu_buf, length);
                if (!length) {
                        /* Nothing could be reserved: report "full". */
                        put_cpu_ptr(chan->buf);
                        return NULL;
                }
        }

        slot = cpu_buf->data + cpu_buf->offset;
        cpu_buf->offset += length;

        put_cpu_ptr(chan->buf);
        return slot;
}

/**
 * subbuf_start_reserve - reserve bytes at the start of a sub-buffer
 * @buf: relay channel buffer
 * @length: number of bytes to reserve
 *
 * Helper for use in the subbuf_start() callback: sets the buffer's
 * offset to @length so that later writes land after the reserved
 * bytes.  Triggers BUG_ON() unless @length is smaller than the
 * channel's sub-buffer size minus one.
 */
static inline void subbuf_start_reserve(struct rchan_buf *buf,
                                        size_t length)
{
        const size_t limit = buf->chan->subbuf_size - 1;

        BUG_ON(length >= limit);
        buf->offset = length;
}

/*
 * exported relay file operations, kernel/relay.c
 */
extern const struct file_operations relay_file_operations;

/*
 * relay_prepare_cpu() only exists as a function when CONFIG_RELAY is set.
 * Without it the name expands to NULL, so it can only be used where a
 * (possibly NULL) function pointer is expected, never called directly.
 * NOTE(review): presumably passed as a CPU hotplug callback - confirm
 * against the caller.
 */
#ifdef CONFIG_RELAY
int relay_prepare_cpu(unsigned int cpu);
#else
#define relay_prepare_cpu     NULL
#endif

#endif /* _LINUX_RELAY_H */

































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 






























































    1 
    1 










































    1 






    1 
    1 




































    1 
    1 










    1 
    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/blk-cgroup.h>

#include "../../block/blk.h"

#include <trace/events/block.h>

#include "trace_output.h"

#ifdef CONFIG_BLK_DEV_IO_TRACE

/*
 * Generation counter for blktrace sessions.  It is bumped every time a
 * trace is started and compared against task_struct::btrace_seq so that
 * each task emits a BLK_TN_PROCESS note once per generation.
 */
static unsigned int blktrace_seq __read_mostly = 1;

/*
 * State of the ftrace based "blk" tracer.  When blk_tracer_enabled is set,
 * events are written to blk_tr's ring buffer instead of the relay channel.
 */
static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;

/* All traces currently in Blktrace_running state, under running_trace_lock. */
static LIST_HEAD(running_trace_list);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);

/*
 * Bits for blk_tracer_flags.val.  TRACE_BLK_OPT_CLASSIC selects an
 * alternative, minimalistic output format instead of the original one.
 */
#define TRACE_BLK_OPT_CLASSIC        0x1
#define TRACE_BLK_OPT_CGROUP        0x2
#define TRACE_BLK_OPT_CGNAME        0x4

/* Option table for the blk tracer; terminated by an empty entry. */
static struct tracer_opt blk_tracer_opts[] = {
        /* The minimalistic output is disabled by default */
        { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
#ifdef CONFIG_BLK_CGROUP
        { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
        { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
#endif
        { }
};

/* All options start out cleared. */
static struct tracer_flags blk_tracer_flags = {
        .val  = 0,
        .opts = blk_tracer_opts,
};

/*
 * Global reference count of probes, protected by blk_probe_mutex.  The
 * tracepoints are registered on the 0 -> 1 transition and unregistered
 * when the count drops back to 0 (see get_probe_ref()/put_probe_ref()).
 */
static DEFINE_MUTEX(blk_probe_mutex);
static int blk_probes_ref;

static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);

/*
 * Send out a notify message.
 *
 * @bt:     trace the note belongs to
 * @pid:    pid recorded in the note
 * @action: BLK_TN_* note type
 * @data:   payload, @len bytes
 * @len:    payload length
 * @cgid:   if non-zero, it is stored in front of the payload and
 *          __BLK_TN_CGROUP is or'ed into the action
 *
 * The record goes to the blk tracer ring buffer when blk_tracer_enabled is
 * set, otherwise to the relay channel of @bt.  The note is silently dropped
 * if no space can be reserved or if @bt has no relay channel.
 *
 * Uses smp_processor_id(); the callers in this file invoke it with
 * interrupts disabled.
 */
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
                       const void *data, size_t len, u64 cgid)
{
        struct blk_io_trace *t;
        struct ring_buffer_event *event = NULL;
        struct trace_buffer *buffer = NULL;
        int pc = 0;
        int cpu = smp_processor_id();
        /* Sample once so reserve and commit agree on the sink being used. */
        bool blk_tracer = blk_tracer_enabled;
        ssize_t cgid_len = cgid ? sizeof(cgid) : 0;

        if (blk_tracer) {
                buffer = blk_tr->array_buffer.buffer;
                pc = preempt_count();
                event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
                                                  sizeof(*t) + len + cgid_len,
                                                  0, pc);
                if (!event)
                        return;
                t = ring_buffer_event_data(event);
                /* Share the field setup below; magic/time are relay-only. */
                goto record_it;
        }

        if (!bt->rchan)
                return;

        t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
        if (t) {
                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
                t->time = ktime_to_ns(ktime_get());
record_it:
                t->device = bt->dev;
                t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
                t->pid = pid;
                t->cpu = cpu;
                t->pdu_len = len + cgid_len;
                /* Trailing data layout: [cgid (optional)][payload]. */
                if (cgid_len)
                        memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
                memcpy((void *) t + sizeof(*t) + cgid_len, data, len);

                if (blk_tracer)
                        trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
        }
}

/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started
 */
static void trace_note_tsk(struct task_struct *tsk)
{
        unsigned long flags;
        struct blk_trace *bt;

        tsk->btrace_seq = blktrace_seq;
        spin_lock_irqsave(&running_trace_lock, flags);
        list_for_each_entry(bt, &running_trace_list, running_list) {
                trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
                           sizeof(tsk->comm), 0);
        }
        spin_unlock_irqrestore(&running_trace_lock, flags);
}

static void trace_note_time(struct blk_trace *bt)
{
        struct timespec64 now;
        unsigned long flags;
        u32 words[2];

        /* need to check user space to see if this breaks in y2038 or y2106 */
        ktime_get_real_ts64(&now);
        words[0] = (u32)now.tv_sec;
        words[1] = now.tv_nsec;

        local_irq_save(flags);
        trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
        local_irq_restore(flags);
}

/*
 * Format a printf-style message into the per-cpu message buffer of @bt and
 * emit it as a BLK_TN_MESSAGE note for the current task.
 *
 * Nothing is emitted unless the trace is running or the blk tracer is
 * enabled, and unless BLK_TC_NOTIFY is part of the trace's action mask.
 * @blkcg is only honoured when the blk_cgroup tracer option is set.
 */
void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
        const char *fmt, ...)
{
        unsigned long irq_flags;
        va_list ap;
        char *msg;
        int msg_len;

        if (unlikely(!(bt->trace_state == Blktrace_running ||
                       blk_tracer_enabled)))
                return;

        /* Notes are suppressed when BLK_TC_NOTIFY is not in the action mask. */
        if ((bt->act_mask & BLK_TC_NOTIFY) == 0)
                return;

        /* Interrupts stay off while the per-cpu buffer is in use. */
        local_irq_save(irq_flags);
        msg = this_cpu_ptr(bt->msg_data);

        va_start(ap, fmt);
        msg_len = vscnprintf(msg, BLK_TN_MAX_MSG, fmt, ap);
        va_end(ap);

        if ((blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP) == 0)
                blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
        trace_note(bt, current->pid, BLK_TN_MESSAGE, msg, msg_len,
                   blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
        trace_note(bt, current->pid, BLK_TN_MESSAGE, msg, msg_len, 0);
#endif
        local_irq_restore(irq_flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);

/*
 * Decide whether an event is filtered out by the trace's settings.
 *
 * Returns 1 if the event must be skipped, 0 if it should be logged.  An
 * event is skipped when none of its action bits are in the trace's action
 * mask, when it carries a non-zero sector outside [start_lba, end_lba], or
 * when the trace is restricted to a pid other than @pid.
 */
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
                         pid_t pid)
{
        if (!((bt->act_mask << BLK_TC_SHIFT) & what))
                return 1;

        return (sector && (sector < bt->start_lba || sector > bt->end_lba)) ||
               (bt->pid && pid != bt->pid);
}

/*
 * Data direction bit lookup: index 0 holds the read action, index 1 the
 * write action (indexed with READ/WRITE in __blk_add_trace()).
 */
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
                                 BLK_TC_ACT(BLK_TC_WRITE) };

/* Aliases so that MASK_TC_BIT() can paste REQ_* names onto BLK_TC_* names. */
#define BLK_TC_RAHEAD                BLK_TC_AHEAD
#define BLK_TC_PREFLUSH                BLK_TC_FLUSH

/*
 * Isolate the REQ_<name> flag in @rw and shift it left so that it lands on
 * the bit of BLK_TC_<name> within the action field (offset BLK_TC_SHIFT).
 * NOTE(review): assumes REQ_<name> is the single bit at position
 * __REQ_<name> - confirm against the REQ_* definitions.
 *
 * The ilog2() calls fall out because they're constant.
 */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
          (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))

/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 *
 * @bt:       trace to log into
 * @sector:   sector of the I/O (0 disables the LBA range filter)
 * @bytes:    size of the I/O
 * @op:       request operation, used for direction/discard/flush bits
 * @op_flags: request flags, translated into BLK_TC_* action bits
 * @what:     base action; category bits are or'ed in here
 * @error:    error value stored in the record
 * @pdu_len:  length of the optional driver payload
 * @pdu_data: optional driver payload
 * @cgid:     if non-zero, stored in front of the payload and flagged with
 *            __BLK_TA_CGROUP
 *
 * The record goes to the blk tracer ring buffer when blk_tracer_enabled is
 * set, otherwise to the relay channel of @bt.  Events rejected by
 * act_log_check() or for which no space can be reserved are dropped.
 */
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
                     int op, int op_flags, u32 what, int error, int pdu_len,
                     void *pdu_data, u64 cgid)
{
        struct task_struct *tsk = current;
        struct ring_buffer_event *event = NULL;
        struct trace_buffer *buffer = NULL;
        struct blk_io_trace *t;
        unsigned long flags = 0;
        unsigned long *sequence;
        pid_t pid;
        int cpu, pc = 0;
        /* Sample once so reserve and commit agree on the sink being used. */
        bool blk_tracer = blk_tracer_enabled;
        ssize_t cgid_len = cgid ? sizeof(cgid) : 0;

        if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
                return;

        /* Translate the request's op and flags into action category bits. */
        what |= ddir_act[op_is_write(op) ? WRITE : READ];
        what |= MASK_TC_BIT(op_flags, SYNC);
        what |= MASK_TC_BIT(op_flags, RAHEAD);
        what |= MASK_TC_BIT(op_flags, META);
        what |= MASK_TC_BIT(op_flags, PREFLUSH);
        what |= MASK_TC_BIT(op_flags, FUA);
        if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
                what |= BLK_TC_ACT(BLK_TC_DISCARD);
        if (op == REQ_OP_FLUSH)
                what |= BLK_TC_ACT(BLK_TC_FLUSH);
        if (cgid)
                what |= __BLK_TA_CGROUP;

        pid = tsk->pid;
        /* Apply the action mask, LBA range and pid filters of this trace. */
        if (act_log_check(bt, what, sector, pid))
                return;
        cpu = raw_smp_processor_id();

        if (blk_tracer) {
                tracing_record_cmdline(current);

                buffer = blk_tr->array_buffer.buffer;
                pc = preempt_count();
                event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
                                                  sizeof(*t) + pdu_len + cgid_len,
                                                  0, pc);
                if (!event)
                        return;
                t = ring_buffer_event_data(event);
                /* Share the field setup below; magic/sequence/time are relay-only. */
                goto record_it;
        }

        /* First event of this task in the current trace generation. */
        if (unlikely(tsk->btrace_seq != blktrace_seq))
                trace_note_tsk(tsk);

        /*
         * A word about the locking here - we disable interrupts to reserve
         * some space in the relay per-cpu buffer, to prevent an irq
         * from coming in and stepping on our toes.
         */
        local_irq_save(flags);
        t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
        if (t) {
                sequence = per_cpu_ptr(bt->sequence, cpu);

                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
                t->sequence = ++(*sequence);
                t->time = ktime_to_ns(ktime_get());
record_it:
                /*
                 * These two are not needed in ftrace as they are in the
                 * generic trace_entry, filled by tracing_generic_entry_update,
                 * but for the trace_event->bin() synthesizer benefit we do it
                 * here too.
                 */
                t->cpu = cpu;
                t->pid = pid;

                t->sector = sector;
                t->bytes = bytes;
                t->action = what;
                t->device = bt->dev;
                t->error = error;
                t->pdu_len = pdu_len + cgid_len;

                /* Trailing data layout: [cgid (optional)][pdu (optional)]. */
                if (cgid_len)
                        memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
                if (pdu_len)
                        memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);

                if (blk_tracer) {
                        trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
                        /* The ring buffer path never disabled interrupts. */
                        return;
                }
        }

        /* Relay path only: pairs with the local_irq_save() above. */
        local_irq_restore(flags);
}

/*
 * Release everything owned by @bt: its debugfs files, the relay channel,
 * the private debugfs directory (bt->dir is only set when one was created
 * for this trace), the per-cpu data and finally @bt itself.
 *
 * Also used on the error path of do_blk_trace_setup(), where some of the
 * members have not been set up yet and are still zero.
 */
static void blk_trace_free(struct blk_trace *bt)
{
        debugfs_remove(bt->msg_file);
        debugfs_remove(bt->dropped_file);
        relay_close(bt->rchan);
        debugfs_remove(bt->dir);
        free_percpu(bt->sequence);
        free_percpu(bt->msg_data);
        kfree(bt);
}

/*
 * Take a reference on the block tracepoint probes; the first reference
 * registers them.  Serialized by blk_probe_mutex.
 */
static void get_probe_ref(void)
{
        mutex_lock(&blk_probe_mutex);
        blk_probes_ref++;
        if (blk_probes_ref == 1)
                blk_register_tracepoints();
        mutex_unlock(&blk_probe_mutex);
}

/*
 * Drop a reference on the block tracepoint probes; dropping the last one
 * unregisters them.  Serialized by blk_probe_mutex.
 */
static void put_probe_ref(void)
{
        mutex_lock(&blk_probe_mutex);
        blk_probes_ref--;
        if (blk_probes_ref == 0)
                blk_unregister_tracepoints();
        mutex_unlock(&blk_probe_mutex);
}

/*
 * Final teardown of a trace that has been unpublished from its queue:
 * wait for an RCU grace period before freeing @bt, then drop the probe
 * reference taken in do_blk_trace_setup().
 * NOTE(review): the grace period is presumably for RCU readers of
 * q->blk_trace that may still hold @bt - confirm against the tracepoint
 * handlers.
 */
static void blk_trace_cleanup(struct blk_trace *bt)
{
        synchronize_rcu();
        blk_trace_free(bt);
        put_probe_ref();
}

static int __blk_trace_remove(struct request_queue *q)
{
        struct blk_trace *bt;

        bt = rcu_replace_pointer(q->blk_trace, NULL,
                                 lockdep_is_held(&q->debugfs_mutex));
        if (!bt)
                return -EINVAL;

        if (bt->trace_state != Blktrace_running)
                blk_trace_cleanup(bt);

        return 0;
}

/*
 * Locked wrapper around __blk_trace_remove(): takes q->debugfs_mutex for
 * the duration of the removal and passes the result through.
 */
int blk_trace_remove(struct request_queue *q)
{
        int err;

        mutex_lock(&q->debugfs_mutex);
        err = __blk_trace_remove(q);
        mutex_unlock(&q->debugfs_mutex);

        return err;
}
EXPORT_SYMBOL_GPL(blk_trace_remove);

/*
 * read() handler of the "dropped" debugfs file: reports the trace's
 * dropped-event counter as a decimal number followed by a newline.
 */
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
                                size_t count, loff_t *ppos)
{
        char text[16];
        size_t text_len;
        struct blk_trace *bt = filp->private_data;

        snprintf(text, sizeof(text), "%u\n", atomic_read(&bt->dropped));
        text_len = strlen(text);

        return simple_read_from_buffer(buffer, count, ppos, text, text_len);
}

/*
 * File operations of the read-only "dropped" debugfs file.  The file is
 * created with the blk_trace as its data (see do_blk_trace_setup()) and
 * blk_dropped_read() picks it up from filp->private_data.
 */
static const struct file_operations blk_dropped_fops = {
        .owner =        THIS_MODULE,
        .open =                simple_open,
        .read =                blk_dropped_read,
        .llseek =        default_llseek,
};

/*
 * write() handler of the "msg" debugfs file: injects the text written by
 * user space into the trace as a message note.
 *
 * Writes of BLK_TN_MAX_MSG bytes or more are rejected with -EINVAL.  On
 * success the full @count is reported as consumed.
 */
static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
                                size_t count, loff_t *ppos)
{
        struct blk_trace *bt = filp->private_data;
        char *text;

        if (count >= BLK_TN_MAX_MSG)
                return -EINVAL;

        /* NUL-terminated kernel copy of the user buffer. */
        text = memdup_user_nul(buffer, count);
        if (IS_ERR(text))
                return PTR_ERR(text);

        /* Pass the text as an argument, never as the format string. */
        __trace_note_message(bt, NULL, "%s", text);
        kfree(text);

        return count;
}

/*
 * File operations of the write-only "msg" debugfs file.  The file is
 * created with the blk_trace as its data (see do_blk_trace_setup()) and
 * blk_msg_write() picks it up from filp->private_data.
 */
static const struct file_operations blk_msg_fops = {
        .owner =        THIS_MODULE,
        .open =                simple_open,
        .write =        blk_msg_write,
        .llseek =        noop_llseek,
};

/*
 * relay subbuf_start() callback.
 *
 * Returns 1 while the buffer still has room.  When relay_buf_full()
 * reports a full buffer, the trace's dropped counter is bumped - so that
 * the user space app can tell how many events were lost - and 0 is
 * returned.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
                                     void *prev_subbuf, size_t prev_padding)
{
        if (relay_buf_full(buf)) {
                struct blk_trace *bt = buf->chan->private_data;

                atomic_inc(&bt->dropped);
                return 0;
        }

        return 1;
}

/*
 * relay remove_buf_file() callback: removes the debugfs entry that backs a
 * relay buffer file.  Always returns 0.
 */
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
        debugfs_remove(dentry);

        return 0;
}

/*
 * relay create_buf_file() callback: creates the debugfs file for a relay
 * buffer under @parent, served by relay_file_operations with @buf as its
 * data.  @is_global is left untouched.
 */
static struct dentry *blk_create_buf_file_callback(const char *filename,
                                                   struct dentry *parent,
                                                   umode_t mode,
                                                   struct rchan_buf *buf,
                                                   int *is_global)
{
        return debugfs_create_file(filename, mode, parent, buf,
                                        &relay_file_operations);
}

/* Callbacks handed to relay_open() for every blktrace relay channel. */
static struct rchan_callbacks blk_relay_callbacks = {
        .subbuf_start                = blk_subbuf_start_callback,
        .create_buf_file        = blk_create_buf_file_callback,
        .remove_buf_file        = blk_remove_buf_file_callback,
};

/*
 * Initialize the LBA filter range of @bt.  With a partition available the
 * range runs from the partition's start sector to start + number of
 * sectors; without a block device or partition the whole sector range is
 * accepted.
 */
static void blk_trace_setup_lba(struct blk_trace *bt,
                                struct block_device *bdev)
{
        struct hd_struct *hd = bdev ? bdev->bd_part : NULL;

        if (!hd) {
                bt->start_lba = 0;
                bt->end_lba = -1ULL;
                return;
        }

        bt->start_lba = hd->start_sect;
        bt->end_lba = hd->start_sect + hd->nr_sects;
}

/*
 * Setup everything required to start tracing
 *
 * @q:    queue to attach the trace to; q->debugfs_mutex must be held
 * @name: device name, copied into @buts->name with '/' turned into '_'
 * @dev:  device number stored in every trace record
 * @bdev: block device being traced; may be NULL (e.g. scsi-generic)
 * @buts: user supplied parameters; ->name is filled in on return
 *
 * On success the new trace is published in q->blk_trace in state
 * Blktrace_setup and a probe reference is taken.
 *
 * Returns 0 on success, -EINVAL for a zero buffer size or count, -EBUSY if
 * the queue already has a trace, -ENOMEM on allocation failure, -ENOENT if
 * no debugfs directory is available and -EIO if the relay channel cannot
 * be opened.
 */
static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                              struct block_device *bdev,
                              struct blk_user_trace_setup *buts)
{
        struct blk_trace *bt = NULL;
        struct dentry *dir = NULL;
        int ret;

        lockdep_assert_held(&q->debugfs_mutex);

        if (!buts->buf_size || !buts->buf_nr)
                return -EINVAL;

        /* strncpy() may leave the name unterminated, so terminate it here. */
        strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
        buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';

        /*
         * some device names have larger paths - convert the slashes
         * to underscores for this to work as expected
         */
        strreplace(buts->name, '/', '_');

        /*
         * Only one trace may be attached to a queue at a time.  Note that
         * bdev can be NULL here, as with scsi-generic, so the warning below
         * is as helpful as we can be.
         */
        if (rcu_dereference_protected(q->blk_trace,
                                      lockdep_is_held(&q->debugfs_mutex))) {
                pr_warn("Concurrent blktraces are not allowed on %s\n",
                        buts->name);
                return -EBUSY;
        }

        bt = kzalloc(sizeof(*bt), GFP_KERNEL);
        if (!bt)
                return -ENOMEM;

        /* From here on, failures go through err: so that bt is released. */
        ret = -ENOMEM;
        bt->sequence = alloc_percpu(unsigned long);
        if (!bt->sequence)
                goto err;

        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
        if (!bt->msg_data)
                goto err;

        /*
         * When tracing the whole disk reuse the existing debugfs directory
         * created by the block layer on init. For partitions block devices,
         * and scsi-generic block devices we create a temporary new debugfs
         * directory that will be removed once the trace ends.
         */
        if (bdev && !bdev_is_partition(bdev))
                dir = q->debugfs_dir;
        else
                bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);

        /*
         * As blktrace relies on debugfs for its interface the debugfs directory
         * is required, contrary to the usual mantra of not checking for debugfs
         * files or directories.
         */
        if (IS_ERR_OR_NULL(dir)) {
                pr_warn("debugfs_dir not present for %s so skipping\n",
                        buts->name);
                ret = -ENOENT;
                goto err;
        }

        bt->dev = dev;
        atomic_set(&bt->dropped, 0);
        INIT_LIST_HEAD(&bt->running_list);

        /* The results of the two file creations below are not checked. */
        ret = -EIO;
        bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
                                               &blk_dropped_fops);

        bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);

        bt->rchan = relay_open("trace", dir, buts->buf_size,
                                buts->buf_nr, &blk_relay_callbacks, bt);
        if (!bt->rchan)
                goto err;

        /* An empty action mask means "trace everything". */
        bt->act_mask = buts->act_mask;
        if (!bt->act_mask)
                bt->act_mask = (u16) -1;

        blk_trace_setup_lba(bt, bdev);

        /* overwrite with user settings */
        if (buts->start_lba)
                bt->start_lba = buts->start_lba;
        if (buts->end_lba)
                bt->end_lba = buts->end_lba;

        bt->pid = buts->pid;
        bt->trace_state = Blktrace_setup;

        /* Publish the fully initialized trace, then account for the probes. */
        rcu_assign_pointer(q->blk_trace, bt);
        get_probe_ref();

        ret = 0;
err:
        /* Shared exit: ret is still non-zero only on the failure paths. */
        if (ret)
                blk_trace_free(bt);
        return ret;
}

/*
 * Set up a trace from a user space struct blk_user_trace_setup at @arg.
 *
 * The request is copied in, handed to do_blk_trace_setup() and copied back
 * out so that user space sees the resulting name.  If the copy back fails,
 * the trace that was just created is removed again.
 *
 * Returns 0 on success, -EFAULT if @arg cannot be accessed, or the error
 * from do_blk_trace_setup().
 */
static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                             struct block_device *bdev, char __user *arg)
{
        struct blk_user_trace_setup req;
        int err;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        err = do_blk_trace_setup(q, name, dev, bdev, &req);
        if (err)
                return err;

        if (!copy_to_user(arg, &req, sizeof(req)))
                return 0;

        /* User space never learned about the trace: undo the setup. */
        __blk_trace_remove(q);
        return -EFAULT;
}

/*
 * Locked wrapper around __blk_trace_setup(): takes q->debugfs_mutex for
 * the duration of the setup and passes the result through.
 */
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                    struct block_device *bdev,
                    char __user *arg)
{
        int err;

        mutex_lock(&q->debugfs_mutex);
        err = __blk_trace_setup(q, name, dev, bdev, arg);
        mutex_unlock(&q->debugfs_mutex);

        return err;
}
EXPORT_SYMBOL_GPL(blk_trace_setup);

#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
/*
 * Compat variant of __blk_trace_setup(): reads a
 * struct compat_blk_user_trace_setup from @arg, converts it field by field
 * into the native layout and sets up the trace.  Only the resulting name is
 * copied back to the start of @arg; if that fails the trace is removed
 * again.
 *
 * Returns 0 on success, -EFAULT if @arg cannot be accessed, or the error
 * from do_blk_trace_setup().
 */
static int compat_blk_trace_setup(struct request_queue *q, char *name,
                                  dev_t dev, struct block_device *bdev,
                                  char __user *arg)
{
        struct compat_blk_user_trace_setup compat_req;
        struct blk_user_trace_setup req;
        int err;

        if (copy_from_user(&compat_req, arg, sizeof(compat_req)))
                return -EFAULT;

        req = (struct blk_user_trace_setup) {
                .act_mask = compat_req.act_mask,
                .buf_size = compat_req.buf_size,
                .buf_nr = compat_req.buf_nr,
                .start_lba = compat_req.start_lba,
                .end_lba = compat_req.end_lba,
                .pid = compat_req.pid,
        };

        err = do_blk_trace_setup(q, name, dev, bdev, &req);
        if (err)
                return err;

        if (!copy_to_user(arg, &req.name, ARRAY_SIZE(req.name)))
                return 0;

        /* User space never learned about the trace: undo the setup. */
        __blk_trace_remove(q);
        return -EFAULT;
}
#endif

/*
 * Start or stop the trace attached to @q.  The caller is expected to hold
 * q->debugfs_mutex (see the lockdep expression below).
 *
 * Starting (@start non-zero) is allowed from the setup or stopped state;
 * stopping requires the running state.  Returns 0 when the transition was
 * made and -EINVAL when @q has no trace or the trace is in a state that
 * does not permit the request.
 */
static int __blk_trace_startstop(struct request_queue *q, int start)
{
        struct blk_trace *bt;

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));
        if (!bt)
                return -EINVAL;

        if (start) {
                if (bt->trace_state != Blktrace_setup &&
                    bt->trace_state != Blktrace_stopped)
                        return -EINVAL;

                /* Bump the sequence and fence before flipping to running. */
                blktrace_seq++;
                smp_mb();
                bt->trace_state = Blktrace_running;

                spin_lock_irq(&running_trace_lock);
                list_add(&bt->running_list, &running_trace_list);
                spin_unlock_irq(&running_trace_lock);

                trace_note_time(bt);
                return 0;
        }

        if (bt->trace_state != Blktrace_running)
                return -EINVAL;

        bt->trace_state = Blktrace_stopped;

        spin_lock_irq(&running_trace_lock);
        list_del_init(&bt->running_list);
        spin_unlock_irq(&running_trace_lock);

        /* Flush the relay channel once the trace is off the running list. */
        relay_flush(bt->rchan);
        return 0;
}

/*
 * Locked entry point for starting/stopping a trace: runs
 * __blk_trace_startstop() with q->debugfs_mutex held and returns its result.
 */
int blk_trace_startstop(struct request_queue *q, int start)
{
        int err;

        mutex_lock(&q->debugfs_mutex);
        err = __blk_trace_startstop(q, start);
        mutex_unlock(&q->debugfs_mutex);

        return err;
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);

/*
 * When reading or writing the blktrace sysfs files, the references to the
 * opened sysfs or device files should prevent the underlying block device
 * from being removed. So no further delete protection is really needed.
 */

/**
 * blk_trace_ioctl - handle the ioctls associated with tracing
 * @bdev:        the block device
 * @cmd:        the ioctl cmd
 * @arg:        the argument data, if any
 *
 * Dispatches the blktrace setup, start, stop and teardown commands for the
 * queue of @bdev while holding that queue's debugfs_mutex.
 *
 * Return: the result of the selected operation, -ENXIO if @bdev has no
 * request queue, or -ENOTTY for a command that is not handled here.
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
        struct request_queue *q = bdev_get_queue(bdev);
        char devname[BDEVNAME_SIZE];
        int ret;

        if (!q)
                return -ENXIO;

        mutex_lock(&q->debugfs_mutex);

        switch (cmd) {
        case BLKTRACESETUP:
                bdevname(bdev, devname);
                ret = __blk_trace_setup(q, devname, bdev->bd_dev, bdev, arg);
                break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
        case BLKTRACESETUP32:
                bdevname(bdev, devname);
                ret = compat_blk_trace_setup(q, devname, bdev->bd_dev, bdev,
                                             arg);
                break;
#endif
        case BLKTRACESTART:
                ret = __blk_trace_startstop(q, 1);
                break;
        case BLKTRACESTOP:
                ret = __blk_trace_startstop(q, 0);
                break;
        case BLKTRACETEARDOWN:
                ret = __blk_trace_remove(q);
                break;
        default:
                ret = -ENOTTY;
                break;
        }

        mutex_unlock(&q->debugfs_mutex);
        return ret;
}

/**
 * blk_trace_shutdown - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 * If @q currently has a trace attached, stop it and then remove it, all
 * under q->debugfs_mutex.  Does nothing when no trace is attached.
 **/
void blk_trace_shutdown(struct request_queue *q)
{
        struct blk_trace *bt;

        mutex_lock(&q->debugfs_mutex);

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));
        if (bt) {
                __blk_trace_startstop(q, 0);
                __blk_trace_remove(q);
        }

        mutex_unlock(&q->debugfs_mutex);
}

#ifdef CONFIG_BLK_CGROUP
/*
 * Return the cgroup id to record for @bio, or 0 when @q has no trace
 * attached, the TRACE_BLK_OPT_CGROUP option is off, or @bio has no blkg.
 */
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
        struct blk_trace *bt;

        /*
         * The pointer is only tested for NULL as a shortcut; it is never
         * dereferenced in this function.
         */
        bt = rcu_dereference_protected(q->blk_trace, 1);
        if (!bt)
                return 0;
        if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
                return 0;
        if (!bio->bi_blkg)
                return 0;

        return cgroup_id(bio_blkcg(bio)->css.cgroup);
}
#else
/* Without CONFIG_BLK_CGROUP there is no cgroup id to report. */
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
        return 0;
}
#endif

/*
 * Return the cgroup id to record for @rq: that of its first bio, or 0 when
 * the request carries no bio.
 */
static u64
blk_trace_request_get_cgid(struct request *rq)
{
        return rq->bio ? blk_trace_bio_get_cgid(rq->q, rq->bio) : 0;
}

/*
 * blktrace probes
 */

/**
 * blk_add_trace_rq - Add a trace for a request oriented action
 * @rq:                the source request
 * @error:        return status to log
 * @nr_bytes:        number of completed bytes
 * @what:        the action
 * @cgid:        the cgroup info
 *
 * Description:
 *     Records an action against a request, logging
 *     blk_rq_trace_sector(@rq) and @nr_bytes.  The action is tagged
 *     BLK_TC_PC for passthrough requests and BLK_TC_FS otherwise.
 *     Nothing is recorded when the request's queue has no trace attached.
 *
 **/
static void blk_add_trace_rq(struct request *rq, int error,
                             unsigned int nr_bytes, u32 what, u64 cgid)
{
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt))
                goto out;

        what |= blk_rq_is_passthrough(rq) ? BLK_TC_ACT(BLK_TC_PC) :
                                            BLK_TC_ACT(BLK_TC_FS);

        __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
                        rq->cmd_flags, what, error, 0, NULL, cgid);
out:
        rcu_read_unlock();
}

static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
                         blk_trace_request_get_cgid(rq));
}

static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
                         blk_trace_request_get_cgid(rq));
}

static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
{
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
                         blk_trace_request_get_cgid(rq));
}

static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
                         blk_trace_request_get_cgid(rq));
}

static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
                        int error, unsigned int nr_bytes)
{
        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
                         blk_trace_request_get_cgid(rq));
}

/**
 * blk_add_trace_bio - Add a trace for a bio oriented action
 * @q:                queue the io is for
 * @bio:        the source bio
 * @what:        the action
 * @error:        error, if any
 *
 * Description:
 *     Records an action against a bio, logging the bio's current sector
 *     and size from its iterator.  Nothing is recorded when @q has no
 *     trace attached.
 *
 **/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
                              u32 what, int error)
{
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (likely(!bt))
                goto out;

        __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
                        bio_op(bio), bio->bi_opf, what, error, 0, NULL,
                        blk_trace_bio_get_cgid(q, bio));
out:
        rcu_read_unlock();
}

static void blk_add_trace_bio_bounce(void *ignore,
                                     struct request_queue *q, struct bio *bio)
{
        blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}

static void blk_add_trace_bio_complete(void *ignore,
                                       struct request_queue *q, struct bio *bio)
{
        blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
                          blk_status_to_errno(bio->bi_status));
}

static void blk_add_trace_bio_backmerge(void *ignore,
                                        struct request_queue *q,
                                        struct request *rq,
                                        struct bio *bio)
{
        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}

static void blk_add_trace_bio_frontmerge(void *ignore,
                                         struct request_queue *q,
                                         struct request *rq,
                                         struct bio *bio)
{
        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}

static void blk_add_trace_bio_queue(void *ignore,
                                    struct request_queue *q, struct bio *bio)
{
        blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}

/*
 * block_getrq probe: log BLK_TA_GETRQ.  With a bio the event is recorded
 * through blk_add_trace_bio(); without one a record with zero sector and
 * size is emitted, carrying only @rw as the operation.
 */
static void blk_add_trace_getrq(void *ignore,
                                struct request_queue *q,
                                struct bio *bio, int rw)
{
        struct blk_trace *bt;

        if (bio) {
                blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
                return;
        }

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (bt)
                __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
                                NULL, 0);
        rcu_read_unlock();
}


/*
 * block_sleeprq probe: log BLK_TA_SLEEPRQ.  With a bio the event is
 * recorded through blk_add_trace_bio(); without one a record with zero
 * sector and size is emitted, carrying only @rw as the operation.
 */
static void blk_add_trace_sleeprq(void *ignore,
                                  struct request_queue *q,
                                  struct bio *bio, int rw)
{
        struct blk_trace *bt;

        if (bio) {
                blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
                return;
        }

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (bt)
                __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
                                0, 0, NULL, 0);
        rcu_read_unlock();
}

/*
 * block_plug probe: log a BLK_TA_PLUG record (no sector, size or payload)
 * when @q has a trace attached.
 */
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (!bt)
                goto out;

        __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
out:
        rcu_read_unlock();
}

/*
 * block_unplug probe: log BLK_TA_UNPLUG_IO for an explicit unplug and
 * BLK_TA_UNPLUG_TIMER otherwise.  @depth is attached as a big-endian
 * 64-bit payload.
 */
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
                                    unsigned int depth, bool explicit)
{
        struct blk_trace *bt;
        __be64 rpdu;
        u32 what;

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (!bt)
                goto out;

        rpdu = cpu_to_be64(depth);
        what = explicit ? BLK_TA_UNPLUG_IO : BLK_TA_UNPLUG_TIMER;

        __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
out:
        rcu_read_unlock();
}

/*
 * block_split probe: log BLK_TA_SPLIT for @bio, attaching @pdu as a
 * big-endian 64-bit payload and the bio's bi_status converted to an errno.
 */
static void blk_add_trace_split(void *ignore,
                                struct request_queue *q, struct bio *bio,
                                unsigned int pdu)
{
        struct blk_trace *bt;
        __be64 rpdu;

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (!bt)
                goto out;

        rpdu = cpu_to_be64(pdu);

        __blk_add_trace(bt, bio->bi_iter.bi_sector,
                        bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
                        BLK_TA_SPLIT,
                        blk_status_to_errno(bio->bi_status),
                        sizeof(rpdu), &rpdu,
                        blk_trace_bio_get_cgid(q, bio));
out:
        rcu_read_unlock();
}

/**
 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
 * @ignore:        trace callback data parameter (not used)
 * @q:                queue the io is for
 * @bio:        the source bio
 * @dev:        target device
 * @from:        source sector
 *
 * Description:
 *     Logs a BLK_TA_REMAP record for @bio.  The payload is a
 *     struct blk_io_trace_remap holding @dev as device_from, the bio's
 *     current device as device_to and @from as sector_from, all in
 *     big-endian form.  Nothing is recorded when @q has no trace attached.
 *
 **/
static void blk_add_trace_bio_remap(void *ignore,
                                    struct request_queue *q, struct bio *bio,
                                    dev_t dev, sector_t from)
{
        struct blk_io_trace_remap r;
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(q->blk_trace);
        if (likely(!bt))
                goto out;

        r.device_from = cpu_to_be32(dev);
        r.device_to   = cpu_to_be32(bio_dev(bio));
        r.sector_from = cpu_to_be64(from);

        __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
                        bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
                        blk_status_to_errno(bio->bi_status),
                        sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
out:
        rcu_read_unlock();
}

/**
 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
 * @ignore:        trace callback data parameter (not used)
 * @rq:                the source request
 * @dev:        target device
 * @from:        source sector
 *
 * Description:
 *     Logs a BLK_TA_REMAP record for @rq.  The payload is a
 *     struct blk_io_trace_remap holding @dev as device_from, the device
 *     number of the request's disk as device_to and @from as sector_from,
 *     all in big-endian form.  Nothing is recorded when the request's
 *     queue has no trace attached.
 *
 **/
static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
                                   sector_t from)
{
        struct blk_io_trace_remap r;
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt))
                goto out;

        r.device_from = cpu_to_be32(dev);
        r.device_to   = cpu_to_be32(disk_devt(rq->rq_disk));
        r.sector_from = cpu_to_be64(from);

        __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
                        req_op(rq), rq->cmd_flags, BLK_TA_REMAP, 0,
                        sizeof(r), &r, blk_trace_request_get_cgid(rq));
out:
        rcu_read_unlock();
}

/**
 * blk_add_driver_data - Add binary message with driver-specific data
 * @rq:                io request
 * @data:        driver-specific data
 * @len:        length of driver-specific data
 *
 * Description:
 *     Logs a BLK_TA_DRV_DATA record for @rq with @data (@len bytes) as the
 *     payload.  Nothing is recorded when the request's queue has no trace
 *     attached.
 *
 **/
void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
        struct blk_trace *bt;

        rcu_read_lock();
        bt = rcu_dereference(rq->q->blk_trace);
        if (likely(!bt))
                goto out;

        __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
                        BLK_TA_DRV_DATA, 0, len, data,
                        blk_trace_request_get_cgid(rq));
out:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);

/*
 * Attach every blktrace probe in this file to its block-layer tracepoint.
 * A failed registration triggers WARN_ON() but does not stop the remaining
 * registrations.  blk_unregister_tracepoints() detaches the same probes in
 * the reverse order.
 */
static void blk_register_tracepoints(void)
{
        int ret;

        ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
        WARN_ON(ret);
        ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
        WARN_ON(ret);
        ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
        WARN_ON(ret);
        ret = register_trace_block_plug(blk_add_trace_plug, NULL);
        WARN_ON(ret);
        ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
        WARN_ON(ret);
        ret = register_trace_block_split(blk_add_trace_split, NULL);
        WARN_ON(ret);
        ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
        WARN_ON(ret);
}

/*
 * Detach every blktrace probe from its tracepoint, in the reverse of the
 * order used by blk_register_tracepoints(), then call
 * tracepoint_synchronize_unregister() before returning.
 */
static void blk_unregister_tracepoints(void)
{
        unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
        unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
        unregister_trace_block_split(blk_add_trace_split, NULL);
        unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
        unregister_trace_block_plug(blk_add_trace_plug, NULL);
        unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
        unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
        unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
        unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
        unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
        unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
        unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
        unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
        unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
        unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
        unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
        unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);

        /* NOTE(review): presumably waits for in-flight probe calls - confirm. */
        tracepoint_synchronize_unregister();
}

/*
 * struct blk_io_tracer formatting routines
 */

/*
 * Build the NUL-terminated flag string for trace record @t in @rwbs.
 *
 * A message record yields just "N".  Otherwise the string is, in order:
 * an optional 'F' (flush), exactly one of 'D' (discard), 'W' (write),
 * 'R' (non-zero byte count) or 'N', then optional 'F' (FUA), 'A' (ahead),
 * 'S' (sync) and 'M' (meta).
 */
static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
{
        const int tc = t->action >> BLK_TC_SHIFT;
        char *p = rwbs;

        if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
                *p++ = 'N';
                *p = '\0';
                return;
        }

        if (tc & BLK_TC_FLUSH)
                *p++ = 'F';

        if (tc & BLK_TC_DISCARD)
                *p++ = 'D';
        else if (tc & BLK_TC_WRITE)
                *p++ = 'W';
        else if (t->bytes)
                *p++ = 'R';
        else
                *p++ = 'N';

        if (tc & BLK_TC_FUA)
                *p++ = 'F';
        if (tc & BLK_TC_AHEAD)
                *p++ = 'A';
        if (tc & BLK_TC_SYNC)
                *p++ = 'S';
        if (tc & BLK_TC_META)
                *p++ = 'M';

        *p = '\0';
}

/* View a generic trace entry as the blk_io_trace record it is cast to. */
static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
        return (const struct blk_io_trace *)ent;
}

/*
 * Return the start of the payload that follows the blk_io_trace header of
 * @ent.  When @has_cg is set, the leading u64 is skipped.
 */
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
{
        void *payload = (void *)(te_blk_io_trace(ent) + 1);

        if (has_cg)
                payload += sizeof(u64);

        return payload;
}

/* Read the u64 stored immediately after the blk_io_trace header of @ent. */
static inline u64 t_cgid(const struct trace_entry *ent)
{
        u64 *cgid = (u64 *)(te_blk_io_trace(ent) + 1);

        return *cgid;
}

/*
 * Return the payload length recorded in @ent, less sizeof(u64) when
 * @has_cg is set.
 */
static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
        int len = te_blk_io_trace(ent)->pdu_len;

        if (has_cg)
                len -= sizeof(u64);

        return len;
}

static inline u32 t_action(const struct trace_entry *ent)
{
        return te_blk_io_trace(ent)->action;
}

static inline u32 t_bytes(const struct trace_entry *ent)
{
        return te_blk_io_trace(ent)->bytes;
}

static inline u32 t_sec(const struct trace_entry *ent)
{
        return te_blk_io_trace(ent)->bytes >> 9;
}

static inline unsigned long long t_sector(const struct trace_entry *ent)
{
        return te_blk_io_trace(ent)->sector;
}

static inline __u16 t_error(const struct trace_entry *ent)
{
        return te_blk_io_trace(ent)->error;
}

/*
 * Interpret the start of the payload of @ent as a big-endian 64-bit
 * integer and return it in CPU byte order.
 */
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
        const __be64 *raw = pdu_start(ent, has_cg);
        __u64 value = be64_to_cpu(*raw);

        return value;
}

/*
 * Signature shared by the routines that print the leading, per-record
 * columns of a line: @act is the action label and @has_cg says whether
 * the record carries cgroup information.
 */
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
        bool has_cg);

/*
 * Print the leading columns of a line in the classic layout: device
 * major,minor, CPU, timestamp as seconds.nanoseconds, PID, the action
 * label @act and the rwbs flag string.  @has_cg is not used here.
 */
static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
        bool has_cg)
{
        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
        unsigned long long ts = iter->ts;
        unsigned long nsec_rem;
        unsigned secs;
        char rwbs[RWBS_LEN];

        /* After do_div(), ts holds whole seconds; the remainder is returned. */
        nsec_rem = do_div(ts, NSEC_PER_SEC);
        secs = (unsigned long)ts;

        fill_rwbs(rwbs, t);

        trace_seq_printf(&iter->seq,
                         "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
                         MAJOR(t->device), MINOR(t->device), iter->cpu,
                         secs, nsec_rem, iter->ent->pid, act, rwbs);
}

/*
 * Print the leading columns of a line in the non-classic layout: device
 * major,minor, optional cgroup information, the action label @act and the
 * rwbs flag string.
 *
 * With @has_cg set, the cgroup is shown either as a path (when the
 * TRACE_BLK_OPT_CGNAME option is on) or as the two 32-bit halves of its id.
 */
static void blk_log_action(struct trace_iterator *iter, const char *act,
        bool has_cg)
{
        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
        char rwbs[RWBS_LEN];
        u64 id;

        fill_rwbs(rwbs, t);

        if (!has_cg) {
                trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
                                 MAJOR(t->device), MINOR(t->device), act, rwbs);
                return;
        }

        id = t_cgid(iter->ent);

        if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
                /* "<...>" is what gets printed if the buffer is left as-is. */
                char blkcg_name_buf[NAME_MAX + 1] = "<...>";

                cgroup_path_from_kernfs_id(id, blkcg_name_buf,
                        sizeof(blkcg_name_buf));
                trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
                         MAJOR(t->device), MINOR(t->device),
                         blkcg_name_buf, act, rwbs);
                return;
        }

        /*
         * The cgid portion used to be "INO,GEN".  Userland builds a
         * FILEID_INO32_GEN fid out of them and opens the cgroup using
         * open_by_handle_at(2).  While 32bit ino setups are still the
         * same, 64bit ones now use the 64bit ino as the whole ID and no
         * longer use generation.
         *
         * Regardless of the content, always output "LOW32,HIGH32" so that
         * FILEID_INO32_GEN fid can be mapped back to @id on both 64 and
         * 32bit ino setups.  See __kernfs_fh_to_dentry().
         */
        trace_seq_printf(&iter->seq,
                 "%3d,%-3d %llx,%-llx %2s %3s ",
                 MAJOR(t->device), MINOR(t->device),
                 id & U32_MAX, id >> 32, act, rwbs);
}

/*
 * Print the payload of @ent as space-separated hex bytes inside
 * parentheses, followed by a space.  Nothing is printed for an empty
 * payload.  When the payload ends in more than one zero byte, output stops
 * after the first of those zeroes and " .." marks the omitted remainder.
 */
static void blk_log_dump_pdu(struct trace_seq *s,
        const struct trace_entry *ent, bool has_cg)
{
        const unsigned char *pdu_buf = pdu_start(ent, has_cg);
        int pdu_len = pdu_real_len(ent, has_cg);
        int i, end;

        if (!pdu_len)
                return;

        /* end becomes the index just past the last non-zero byte */
        end = pdu_len;
        while (end > 0 && !pdu_buf[end - 1])
                end--;

        trace_seq_putc(s, '(');

        for (i = 0; i < pdu_len; i++) {
                const char *sep = i ? " " : "";

                trace_seq_printf(s, "%s%02x", sep, pdu_buf[i]);

                /*
                 * Everything after this byte is zero: say so with ".."
                 * instead of printing the rest.
                 */
                if (i == end && end != pdu_len - 1) {
                        trace_seq_puts(s, " ..) ");
                        return;
                }
        }

        trace_seq_puts(s, ") ");
}

/*
 * Print the trailing part of a line for the generic actions.
 *
 * BLK_TC_PC records show the byte count, the payload dump and the command
 * name.  Other records show "sector + count [command]", or just
 * "[command]" when the count from t_sec() is zero.
 */
static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
        char cmd[TASK_COMM_LEN];

        trace_find_cmdline(ent->pid, cmd);

        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
                trace_seq_printf(s, "%u ", t_bytes(ent));
                blk_log_dump_pdu(s, ent, has_cg);
                trace_seq_printf(s, "[%s]\n", cmd);
                return;
        }

        if (!t_sec(ent)) {
                trace_seq_printf(s, "[%s]\n", cmd);
                return;
        }

        trace_seq_printf(s, "%llu + %u [%s]\n",
                         t_sector(ent), t_sec(ent), cmd);
}

/*
 * Print the trailing part of a line for actions that report an error code.
 *
 * BLK_TC_PC records show the payload dump and "[error]".  Other records
 * show "sector + count [error]", or "sector [error]" when the count from
 * t_sec() is zero.
 */
static void blk_log_with_error(struct trace_seq *s,
                              const struct trace_entry *ent, bool has_cg)
{
        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
                blk_log_dump_pdu(s, ent, has_cg);
                trace_seq_printf(s, "[%d]\n", t_error(ent));
                return;
        }

        if (!t_sec(ent)) {
                trace_seq_printf(s, "%llu [%d]\n",
                                 t_sector(ent), t_error(ent));
                return;
        }

        trace_seq_printf(s, "%llu + %u [%d]\n",
                         t_sector(ent),
                         t_sec(ent), t_error(ent));
}

/*
 * Print the trailing part of a remap line: "sector + count <- (major,minor)
 * sector_from", with the source device and sector taken from the
 * struct blk_io_trace_remap payload and converted from big-endian.
 */
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
        const struct blk_io_trace_remap *remap = pdu_start(ent, has_cg);

        trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
                         t_sector(ent), t_sec(ent),
                         MAJOR(be32_to_cpu(remap->device_from)),
                         MINOR(be32_to_cpu(remap->device_from)),
                         be64_to_cpu(remap->sector_from));
}

/*
 * Print the trailing part of a plug line: the command name looked up for
 * the entry's PID, in brackets.  @has_cg is not used here.
 */
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
        char comm[TASK_COMM_LEN];

        trace_find_cmdline(ent->pid, comm);
        trace_seq_printf(s, "[%s]\n", comm);
}

/*
 * Print the trailing part of an unplug line: the bracketed command name
 * followed by the 64-bit integer stored in the payload.
 */
static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
        char comm[TASK_COMM_LEN];

        trace_find_cmdline(ent->pid, comm);
        trace_seq_printf(s, "[%s] %llu\n", comm, get_pdu_int(ent, has_cg));
}

/*
 * Print the trailing part of a split line: "sector / value [command]",
 * where value is the 64-bit integer stored in the payload.
 */
static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
        char comm[TASK_COMM_LEN];

        trace_find_cmdline(ent->pid, comm);
        trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
                         get_pdu_int(ent, has_cg), comm);
}

/*
 * Print the trailing part of a message line: the raw payload bytes of
 * @ent followed by a newline.
 */
static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
                        bool has_cg)
{
        const void *msg = pdu_start(ent, has_cg);
        int len = pdu_real_len(ent, has_cg);

        trace_seq_putmem(s, msg, len);
        trace_seq_putc(s, '\n');
}

/*
 * struct tracer operations
 */

/*
 * Emit the two-line column header, but only when the classic output
 * option (TRACE_BLK_OPT_CLASSIC) is enabled.
 */
static void blk_tracer_print_header(struct seq_file *m)
{
        if (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)
                seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
                            "#  |     |     |           |   |   |\n");
}

/* Tracer ->start callback: set the blk_tracer_enabled flag.  @tr is unused. */
static void blk_tracer_start(struct trace_array *tr)
{
        blk_tracer_enabled = true;
}

/*
 * Tracer ->init callback: remember @tr in blk_tr and enable the tracer.
 * Always returns 0.
 */
static int blk_tracer_init(struct trace_array *tr)
{
        blk_tr = tr;
        blk_tracer_start(tr);
        return 0;
}

/* Tracer ->stop callback: clear the blk_tracer_enabled flag.  @tr is unused. */
static void blk_tracer_stop(struct trace_array *tr)
{
        blk_tracer_enabled = false;
}

/* Tracer ->reset callback: same effect as blk_tracer_stop(). */
static void blk_tracer_reset(struct trace_array *tr)
{
        blk_tracer_stop(tr);
}

/*
 * Per-action output table, indexed by the __BLK_TA_* action number.
 *
 * act[0] is the short label and act[1] the long one; print_one_line()
 * picks between them based on the verbose trace flag.  ->print emits the
 * action-specific remainder of the line.
 */
static const struct {
        const char *act[2];
        void           (*print)(struct trace_seq *s, const struct trace_entry *ent,
                            bool has_cg);
} what2act[] = {
        [__BLK_TA_QUEUE]        = {{  "Q", "queue" },           blk_log_generic },
        [__BLK_TA_BACKMERGE]        = {{  "M", "backmerge" },  blk_log_generic },
        [__BLK_TA_FRONTMERGE]        = {{  "F", "frontmerge" }, blk_log_generic },
        [__BLK_TA_GETRQ]        = {{  "G", "getrq" },           blk_log_generic },
        [__BLK_TA_SLEEPRQ]        = {{  "S", "sleeprq" },           blk_log_generic },
        [__BLK_TA_REQUEUE]        = {{  "R", "requeue" },           blk_log_with_error },
        [__BLK_TA_ISSUE]        = {{  "D", "issue" },           blk_log_generic },
        [__BLK_TA_COMPLETE]        = {{  "C", "complete" },   blk_log_with_error },
        [__BLK_TA_PLUG]                = {{  "P", "plug" },           blk_log_plug },
        [__BLK_TA_UNPLUG_IO]        = {{  "U", "unplug_io" },  blk_log_unplug },
        [__BLK_TA_UNPLUG_TIMER]        = {{ "UT", "unplug_timer" }, blk_log_unplug },
        [__BLK_TA_INSERT]        = {{  "I", "insert" },           blk_log_generic },
        [__BLK_TA_SPLIT]        = {{  "X", "split" },           blk_log_split },
        [__BLK_TA_BOUNCE]        = {{  "B", "bounce" },           blk_log_generic },
        [__BLK_TA_REMAP]        = {{  "A", "remap" },           blk_log_remap },
};

/*
 * Format the current entry of @iter as one line of text.
 *
 * @classic selects blk_log_action_classic() for the leading columns,
 * otherwise blk_log_action() is used.  Message records are printed via
 * blk_log_msg(); all other records are dispatched through what2act[].  An
 * action number that is zero or beyond the table is reported as
 * "Unknown action".
 *
 * Returns the value of trace_handle_return() for the output sequence.
 */
static enum print_line_t print_one_line(struct trace_iterator *iter,
                                        bool classic)
{
        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        blk_log_action_t *log_action;
        bool long_act, has_cg;
        u16 what;

        /* Action number: the low BLK_TC_SHIFT bits minus the cgroup flag. */
        what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
        long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
        has_cg = t->action & __BLK_TA_CGROUP;

        if (classic)
                log_action = &blk_log_action_classic;
        else
                log_action = &blk_log_action;

        if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
                log_action(iter, long_act ? "message" : "m", has_cg);
                blk_log_msg(s, iter->ent, has_cg);
                goto out;
        }

        if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) {
                trace_seq_printf(s, "Unknown action %x\n", what);
                goto out;
        }

        log_action(iter, what2act[what].act[long_act], has_cg);
        what2act[what].print(s, iter->ent, has_cg);
out:
        return trace_handle_return(s);
}

/*
 * Text output callback of the TRACE_BLK event: print the entry in the
 * non-classic layout.  @flags and @event are not used.
 */
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
                                               int flags, struct trace_event *event)
{
        return print_one_line(iter, false);
}

static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;
        struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
        const int offset = offsetof(struct blk_io_trace, sector);
        struct blk_io_trace old = {
                .magic          = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
                .time     = iter->ts,
        };

        trace_seq_putmem(s, &old, offset);
        trace_seq_putmem(s, &t->sector,
                         sizeof(old) - offset + t->pdu_len);
}

/*
 * Binary output callback of the TRACE_BLK event: emit the entry via
 * blk_trace_synthesize_old_trace() and report the state of the output
 * sequence.  @flags and @event are not used.
 */
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
                             struct trace_event *event)
{
        blk_trace_synthesize_old_trace(iter);

        return trace_handle_return(&iter->seq);
}

/*
 * Tracer ->print_line callback: handle an entry only if it is a TRACE_BLK
 * entry and the classic output option is enabled, printing it in the
 * classic layout.  Anything else is reported as TRACE_TYPE_UNHANDLED.
 */
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
        if (iter->ent->type == TRACE_BLK &&
            (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
                return print_one_line(iter, true);

        return TRACE_TYPE_UNHANDLED;
}

/*
 * Tracer ->set_flag callback.  Toggling TRACE_BLK_OPT_CLASSIC also toggles
 * TRACE_ITER_CONTEXT_INFO on @tr in the opposite direction, because
 * classic output does not include context info.  Other bits need no
 * action.  Always returns 0.
 */
static int
blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
        if (bit != TRACE_BLK_OPT_CLASSIC)
                return 0;

        if (set)
                tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
        else
                tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;

        return 0;
}

/*
 * The "blk" tracer.  It is registered by init_blk_tracer() and wires the
 * blk_tracer_*() callbacks and the blk_tracer_flags option set into the
 * tracing core.
 */
static struct tracer blk_tracer __read_mostly = {
        .name                = "blk",
        .init                = blk_tracer_init,
        .reset                = blk_tracer_reset,
        .start                = blk_tracer_start,
        .stop                = blk_tracer_stop,
        .print_header        = blk_tracer_print_header,
        .print_line        = blk_tracer_print_line,
        .flags                = &blk_tracer_flags,
        .set_flag        = blk_tracer_set_flag,
};

/* Output callbacks for TRACE_BLK entries: text (.trace) and binary (.binary). */
static struct trace_event_functions trace_blk_event_funcs = {
        .trace                = blk_trace_event_print,
        .binary                = blk_trace_event_print_binary,
};

/*
 * Trace event descriptor binding the TRACE_BLK entry type to
 * trace_blk_event_funcs; registered by init_blk_tracer().
 */
static struct trace_event trace_blk_event = {
        .type                = TRACE_BLK,
        .funcs                = &trace_blk_event_funcs,
};

/*
 * Register the block trace event and then the "blk" tracer.
 *
 * If the tracer cannot be registered, the already registered event is
 * unregistered again.  Returns 0 on success and 1 (not a negative errno)
 * when either registration fails; a warning is logged in both cases.
 */
static int __init init_blk_tracer(void)
{
        int err;

        if (!register_trace_event(&trace_blk_event)) {
                pr_warn("Warning: could not register block events\n");
                return 1;
        }

        err = register_tracer(&blk_tracer);
        if (err == 0)
                return 0;

        /* Roll back the event registration done above. */
        pr_warn("Warning: could not register the block tracer\n");
        unregister_trace_event(&trace_blk_event);
        return 1;
}

device_initcall(init_blk_tracer);

/*
 * Detach and free the blk_trace attached to @q.
 *
 * The q->blk_trace pointer is replaced with NULL under the protection of
 * q->debugfs_mutex (the lockdep condition expects the caller to hold it).
 * A trace that is still running is marked stopped, taken off the running
 * list and its relay channel flushed.  The probe reference is dropped, and
 * the structure is freed only after synchronize_rcu() has returned.
 *
 * Returns 0 on success, -EINVAL if no trace was attached.
 */
static int blk_trace_remove_queue(struct request_queue *q)
{
        struct blk_trace *old_bt =
                rcu_replace_pointer(q->blk_trace, NULL,
                                    lockdep_is_held(&q->debugfs_mutex));

        if (!old_bt)
                return -EINVAL;

        if (old_bt->trace_state == Blktrace_running) {
                old_bt->trace_state = Blktrace_stopped;

                spin_lock_irq(&running_trace_lock);
                list_del_init(&old_bt->running_list);
                spin_unlock_irq(&running_trace_lock);

                relay_flush(old_bt->rchan);
        }

        put_probe_ref();

        /* Wait for RCU readers before releasing the structure. */
        synchronize_rcu();
        blk_trace_free(old_bt);

        return 0;
}

/*
 * Set up everything required to start tracing on @q.
 *
 * Allocates a zeroed blk_trace together with its per-CPU message buffer,
 * records @bdev's device number, enables every action bit in act_mask,
 * lets blk_trace_setup_lba() fill in the LBA range from @bdev, publishes
 * the result in q->blk_trace and takes a probe reference.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; on failure
 * nothing is published and the partially built structure is freed.
 */
static int blk_trace_setup_queue(struct request_queue *q,
                                 struct block_device *bdev)
{
        struct blk_trace *bt = kzalloc(sizeof(*bt), GFP_KERNEL);

        if (!bt)
                return -ENOMEM;

        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
        if (!bt->msg_data) {
                blk_trace_free(bt);
                return -ENOMEM;
        }

        bt->dev = bdev->bd_dev;
        bt->act_mask = (u16)-1;

        blk_trace_setup_lba(bt, bdev);

        /* Publish only once the structure is fully initialized. */
        rcu_assign_pointer(q->blk_trace, bt);
        get_probe_ref();

        return 0;
}

/*
 * sysfs interface to enable and configure tracing
 */

/* Forward declarations: the handlers are shared by every attribute below. */
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf);
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count);
/*
 * Define a device attribute named @_name (readable by everyone, writable
 * by the owner) that is served by the common show/store handlers.
 */
#define BLK_TRACE_DEVICE_ATTR(_name) \
        DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
                    sysfs_blk_trace_attr_show, \
                    sysfs_blk_trace_attr_store)

/* The handlers tell these attributes apart by comparing the attr pointer. */
static BLK_TRACE_DEVICE_ATTR(enable);
static BLK_TRACE_DEVICE_ATTR(act_mask);
static BLK_TRACE_DEVICE_ATTR(pid);
static BLK_TRACE_DEVICE_ATTR(start_lba);
static BLK_TRACE_DEVICE_ATTR(end_lba);

/* NULL-terminated list of the attributes exposed through blk_trace_attr_group. */
static struct attribute *blk_trace_attrs[] = {
        &dev_attr_enable.attr,
        &dev_attr_act_mask.attr,
        &dev_attr_pid.attr,
        &dev_attr_start_lba.attr,
        &dev_attr_end_lba.attr,
        NULL
};

/*
 * Attribute group named "trace"; created and removed on a device by
 * blk_trace_init_sysfs() and blk_trace_remove_sysfs().
 */
struct attribute_group blk_trace_attr_group = {
        .name  = "trace",
        .attrs = blk_trace_attrs,
};

/*
 * Mapping between BLK_TC_* category bits and their textual names.
 * blk_trace_str2mask() looks names up here (case-insensitively) and
 * blk_trace_mask2str() emits names in this table's order.
 */
static const struct {
        int mask;
        const char *str;
} mask_maps[] = {
        { BLK_TC_READ,                "read"                },
        { BLK_TC_WRITE,                "write"                },
        { BLK_TC_FLUSH,                "flush"                },
        { BLK_TC_SYNC,                "sync"                },
        { BLK_TC_QUEUE,                "queue"                },
        { BLK_TC_REQUEUE,        "requeue"        },
        { BLK_TC_ISSUE,                "issue"                },
        { BLK_TC_COMPLETE,        "complete"        },
        { BLK_TC_FS,                "fs"                },
        { BLK_TC_PC,                "pc"                },
        { BLK_TC_NOTIFY,        "notify"        },
        { BLK_TC_AHEAD,                "ahead"                },
        { BLK_TC_META,                "meta"                },
        { BLK_TC_DISCARD,        "discard"        },
        { BLK_TC_DRV_DATA,        "drv_data"        },
        { BLK_TC_FUA,                "fua"                },
};

/*
 * Convert a comma-separated list of category names into a bit mask.
 *
 * @str is duplicated, stripped of surrounding whitespace and split on ','.
 * Empty tokens are skipped; every other token must match a mask_maps[]
 * name, compared without regard to case.
 *
 * Returns the OR of the matching mask bits (0 if there were no non-empty
 * tokens), -EINVAL as soon as an unknown name is met, or -ENOMEM if the
 * working copy cannot be allocated.
 */
static int blk_trace_str2mask(const char *str)
{
        char *copy, *cursor, *name;
        int result = 0;
        int idx;

        copy = kstrdup(str, GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        cursor = strstrip(copy);
        while ((name = strsep(&cursor, ",")) != NULL) {
                if (!*name)
                        continue;

                for (idx = 0; idx < ARRAY_SIZE(mask_maps); idx++) {
                        if (!strcasecmp(name, mask_maps[idx].str))
                                break;
                }

                /* Ran off the end of the table: unknown category name. */
                if (idx == ARRAY_SIZE(mask_maps)) {
                        result = -EINVAL;
                        break;
                }

                result |= mask_maps[idx].mask;
        }

        kfree(copy);
        return result;
}

/*
 * Render @mask into @buf as a comma-separated list of category names, in
 * mask_maps[] order, followed by a newline.
 *
 * Returns the number of bytes written, newline included.  No NUL byte is
 * stored after the newline, and no bound is applied to @buf; the caller
 * must supply a buffer large enough for every name.
 */
static ssize_t blk_trace_mask2str(char *buf, int mask)
{
        char *out = buf;
        int idx;

        for (idx = 0; idx < ARRAY_SIZE(mask_maps); idx++) {
                const char *sep;

                if (!(mask & mask_maps[idx].mask))
                        continue;

                /* No separator in front of the first name. */
                sep = (out == buf) ? "" : ",";
                out += sprintf(out, "%s%s", sep, mask_maps[idx].str);
        }

        *out++ = '\n';

        return out - buf;
}

/*
 * Return the request queue of @bdev, or NULL when @bdev has no disk
 * attached (bd_disk is NULL).
 */
static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
{
        return bdev->bd_disk ? bdev_get_queue(bdev) : NULL;
}

/*
 * Common ->show handler for all blk trace sysfs attributes.
 *
 * "enable" prints 1 or 0 depending on whether a blk_trace is attached to
 * the queue.  Every other attribute prints "disabled" when no trace is
 * attached, otherwise the corresponding blk_trace field (act_mask as a
 * list of names).  q->debugfs_mutex is held while the trace is inspected,
 * and the bdev obtained via bdget_part() is released with bdput() on exit.
 *
 * Returns the number of bytes placed in @buf, or -ENXIO if the block
 * device or its queue cannot be obtained (also the result if @attr is
 * none of the known attributes while a trace is attached).
 */
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
        struct block_device *bdev = bdget_part(dev_to_part(dev));
        struct request_queue *q;
        struct blk_trace *bt;
        ssize_t ret = -ENXIO;

        if (!bdev)
                return ret;

        q = blk_trace_get_queue(bdev);
        if (!q)
                goto put_bdev;

        mutex_lock(&q->debugfs_mutex);

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));

        if (attr == &dev_attr_enable)
                ret = sprintf(buf, "%u\n", !!bt);
        else if (!bt)
                ret = sprintf(buf, "disabled\n");
        else if (attr == &dev_attr_act_mask)
                ret = blk_trace_mask2str(buf, bt->act_mask);
        else if (attr == &dev_attr_pid)
                ret = sprintf(buf, "%u\n", bt->pid);
        else if (attr == &dev_attr_start_lba)
                ret = sprintf(buf, "%llu\n", bt->start_lba);
        else if (attr == &dev_attr_end_lba)
                ret = sprintf(buf, "%llu\n", bt->end_lba);

        mutex_unlock(&q->debugfs_mutex);
put_bdev:
        bdput(bdev);
        return ret;
}

/*
 * Common ->store handler for all blk trace sysfs attributes.
 *
 * The input is parsed with kstrtoull() (base 0).  For "act_mask" only, a
 * non-numeric input is instead interpreted as a list of category names.
 *
 * "enable": a non-zero value sets tracing up on the queue, zero removes
 * it; nothing is done if the queue is already in the requested state.
 * Any other attribute stores the value in the matching blk_trace field,
 * first setting tracing up if none is attached yet.  q->debugfs_mutex is
 * held around the update.
 *
 * Returns @count on success, otherwise a negative errno: -EINVAL for empty
 * or unparsable input, -ENXIO if the block device or queue is unavailable,
 * or the error from name parsing / trace setup / trace removal.
 */
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
{
        struct block_device *bdev;
        struct request_queue *q;
        struct blk_trace *bt;
        ssize_t ret;
        u64 value;

        if (!count)
                return -EINVAL;

        if (kstrtoull(buf, 0, &value)) {
                if (attr != &dev_attr_act_mask)
                        return -EINVAL;

                /* Assume it is a list of trace category names */
                ret = blk_trace_str2mask(buf);
                if (ret < 0)
                        return ret;
                value = ret;
        }

        bdev = bdget_part(dev_to_part(dev));
        if (!bdev)
                return -ENXIO;

        ret = -ENXIO;
        q = blk_trace_get_queue(bdev);
        if (!q)
                goto put_bdev;

        mutex_lock(&q->debugfs_mutex);

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));

        if (attr == &dev_attr_enable) {
                if (!!value == !!bt)
                        ret = 0;        /* already in the requested state */
                else if (value)
                        ret = blk_trace_setup_queue(q, bdev);
                else
                        ret = blk_trace_remove_queue(q);
                goto unlock;
        }

        ret = 0;
        if (!bt) {
                /* Writing a parameter implicitly sets tracing up. */
                ret = blk_trace_setup_queue(q, bdev);
                bt = rcu_dereference_protected(q->blk_trace,
                                lockdep_is_held(&q->debugfs_mutex));
        }
        if (ret)
                goto unlock;

        if (attr == &dev_attr_act_mask)
                bt->act_mask = value;
        else if (attr == &dev_attr_pid)
                bt->pid = value;
        else if (attr == &dev_attr_start_lba)
                bt->start_lba = value;
        else if (attr == &dev_attr_end_lba)
                bt->end_lba = value;

unlock:
        mutex_unlock(&q->debugfs_mutex);
put_bdev:
        bdput(bdev);
        return ret ? ret : count;
}

/*
 * Create the "trace" attribute group on @dev's kobject.
 * Returns the result of sysfs_create_group().
 */
int blk_trace_init_sysfs(struct device *dev)
{
        struct kobject *kobj = &dev->kobj;

        return sysfs_create_group(kobj, &blk_trace_attr_group);
}

/* Remove the "trace" attribute group from @dev's kobject. */
void blk_trace_remove_sysfs(struct device *dev)
{
        struct kobject *kobj = &dev->kobj;

        sysfs_remove_group(kobj, &blk_trace_attr_group);
}

#endif /* CONFIG_BLK_DEV_IO_TRACE */

#ifdef CONFIG_EVENT_TRACING

/*
 * Build the short textual description of a request's operation and flags.
 *
 * Characters are appended to @rwbs in this order:
 *   'F'  if REQ_PREFLUSH is set;
 *   one operation code: 'W' (write / write same), 'D' (discard),
 *   "DE" (secure erase), 'F' (flush), 'R' (read) or 'N' (anything else);
 *   'F' for REQ_FUA, 'A' for REQ_RAHEAD, 'S' for REQ_SYNC, 'M' for REQ_META.
 * The string is then NUL-terminated, so @rwbs must have room for up to
 * seven characters plus the terminator.  @bytes is not used.
 */
void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
{
        char *out = rwbs;

        if (op & REQ_PREFLUSH)
                *out++ = 'F';

        switch (op & REQ_OP_MASK) {
        case REQ_OP_READ:
                *out++ = 'R';
                break;
        case REQ_OP_WRITE:
        case REQ_OP_WRITE_SAME:
                *out++ = 'W';
                break;
        case REQ_OP_FLUSH:
                *out++ = 'F';
                break;
        case REQ_OP_DISCARD:
                *out++ = 'D';
                break;
        case REQ_OP_SECURE_ERASE:
                *out++ = 'D';
                *out++ = 'E';
                break;
        default:
                *out++ = 'N';
                break;
        }

        if (op & REQ_FUA)
                *out++ = 'F';
        if (op & REQ_RAHEAD)
                *out++ = 'A';
        if (op & REQ_SYNC)
                *out++ = 'S';
        if (op & REQ_META)
                *out++ = 'M';

        *out = '\0';
}
EXPORT_SYMBOL_GPL(blk_fill_rwbs);

#endif /* CONFIG_EVENT_TRACING */



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







    1 












    1 







    1 









    1 









    1 
    1 
    1 

































    1 










































    1 







    1 
    1 





    1 







    1 
    1 



















































































































    1 





























    1 













































    1 














    1 










    1 























































































































    1 









    1 








    1 






    1 



























    1 









    1 

    1 

    1 
















    1 






    1 
















    1 






    1 
























    1 





    1 



























    1 




    1 

    1 
    1 
    1 











































    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1999 Eric Youngdale
 * Copyright (C) 2014 Christoph Hellwig
 *
 *  SCSI queueing library.
 *      Initial versions: Eric Youngdale (eric@andante.org).
 *                        Based upon conversations with large numbers
 *                        of people at Linux Expo.
 */

#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/scatterlist.h>
#include <linux/blk-mq.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport.h> /* __scsi_init_queue() */
#include <scsi/scsi_dh.h>

#include <trace/events/scsi.h>

#include "scsi_debugfs.h"
#include "scsi_priv.h"
#include "scsi_logging.h"

/*
 * Size of integrity metadata is usually small, 1 inline sg should
 * cover normal cases.
 */
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define  SCSI_INLINE_PROT_SG_CNT  0
#define  SCSI_INLINE_SG_CNT  0
#else
#define  SCSI_INLINE_PROT_SG_CNT  1
#define  SCSI_INLINE_SG_CNT  2
#endif

/*
 * Slab caches for sense buffers: one for ordinary hosts and one for hosts
 * with unchecked_isa_dma set.  Both are created on demand by
 * scsi_init_sense_cache(), which takes scsi_sense_cache_mutex while doing so.
 */
static struct kmem_cache *scsi_sense_cache;
static struct kmem_cache *scsi_sense_isadma_cache;
static DEFINE_MUTEX(scsi_sense_cache_mutex);

/* Forward declaration; the definition is not in this part of the file. */
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd);

/*
 * Pick the sense-buffer cache for a host: the ISA DMA cache when
 * @unchecked_isa_dma is true, the regular cache otherwise.  The returned
 * pointer is NULL if that cache has not been created yet.
 */
static inline struct kmem_cache *
scsi_select_sense_cache(bool unchecked_isa_dma)
{
        if (unchecked_isa_dma)
                return scsi_sense_isadma_cache;

        return scsi_sense_cache;
}

/*
 * Return @sense_buffer to the cache selected by @unchecked_isa_dma; the
 * flag must select the same cache the buffer was allocated from.
 */
static void scsi_free_sense_buffer(bool unchecked_isa_dma,
                                   unsigned char *sense_buffer)
{
        struct kmem_cache *cache = scsi_select_sense_cache(unchecked_isa_dma);

        kmem_cache_free(cache, sense_buffer);
}

/*
 * Allocate a sense buffer on @numa_node from the slab cache selected by
 * @unchecked_isa_dma, using @gfp_mask.  Returns whatever
 * kmem_cache_alloc_node() returns.
 */
static unsigned char *scsi_alloc_sense_buffer(bool unchecked_isa_dma,
        gfp_t gfp_mask, int numa_node)
{
        struct kmem_cache *cache = scsi_select_sense_cache(unchecked_isa_dma);

        return kmem_cache_alloc_node(cache, gfp_mask, numa_node);
}

/*
 * Make sure the sense-buffer slab cache needed by @shost exists, creating
 * it on first use.  Creation is serialized by scsi_sense_cache_mutex.
 *
 * Returns 0 if the cache already existed or was created, -ENOMEM if
 * creating it failed.
 */
int scsi_init_sense_cache(struct Scsi_Host *shost)
{
        struct kmem_cache **slot;
        int ret = 0;

        mutex_lock(&scsi_sense_cache_mutex);

        /* Same choice as scsi_select_sense_cache(), but by address. */
        if (shost->unchecked_isa_dma)
                slot = &scsi_sense_isadma_cache;
        else
                slot = &scsi_sense_cache;

        /* Already set up by an earlier host: nothing left to do. */
        if (*slot)
                goto unlock;

        if (shost->unchecked_isa_dma)
                *slot = kmem_cache_create("scsi_sense_cache(DMA)",
                                SCSI_SENSE_BUFFERSIZE, 0,
                                SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA, NULL);
        else
                *slot = kmem_cache_create_usercopy("scsi_sense_cache",
                                SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN,
                                0, SCSI_SENSE_BUFFERSIZE, NULL);

        if (!*slot)
                ret = -ENOMEM;
 unlock:
        mutex_unlock(&scsi_sense_cache_mutex);
        return ret;
}

/*
 * When to reinvoke queueing after a resource shortage. It's 3 msecs so as
 * not to change behaviour from the previous unplug mechanism; experimentation
 * may prove this needs changing.
 */
#define SCSI_QUEUE_DELAY        3

/*
 * Arm the "blocked" counter that corresponds to @reason for the host,
 * target or device that @cmd belongs to.  Any other @reason is ignored.
 */
static void
scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
{
        struct scsi_device *sdev = cmd->device;
        struct Scsi_Host *shost = sdev->host;
        struct scsi_target *tgt = scsi_target(sdev);

        /*
         * Set the appropriate busy bit for the device/host.
         *
         * If the host/device isn't busy, assume that something actually
         * completed, and that we should be able to queue a command now.
         *
         * Note that the prior mid-layer assumption that any host could
         * always queue at least one command is now broken.  The mid-layer
         * will implement a user specifiable stall (see
         * scsi_host.max_host_blocked and scsi_device.max_device_blocked)
         * if a command is requeued with no other commands outstanding
         * either for the device or for the host.
         */
        if (reason == SCSI_MLQUEUE_HOST_BUSY) {
                atomic_set(&shost->host_blocked, shost->max_host_blocked);
        } else if (reason == SCSI_MLQUEUE_DEVICE_BUSY ||
                   reason == SCSI_MLQUEUE_EH_RETRY) {
                atomic_set(&sdev->device_blocked, sdev->max_device_blocked);
        } else if (reason == SCSI_MLQUEUE_TARGET_BUSY) {
                atomic_set(&tgt->target_blocked, tgt->max_target_blocked);
        }
}

/*
 * Undo the preparation of @cmd and hand its request back to blk-mq for
 * requeueing.  The request is expected to carry RQF_DONTPREP; if it does
 * not, a one-time warning is emitted and the teardown is skipped.
 */
static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
{
        if (!(cmd->request->rq_flags & RQF_DONTPREP)) {
                WARN_ON_ONCE(true);
        } else {
                /* Drop the flag, then release what preparation set up. */
                cmd->request->rq_flags &= ~RQF_DONTPREP;
                scsi_mq_uninit_cmd(cmd);
        }

        blk_mq_requeue_request(cmd->request, true);
}

/**
 * __scsi_queue_insert - private queue insertion
 * @cmd: The SCSI command being requeued
 * @reason:  The reason for the requeue
 * @unbusy: Whether the queue should be unbusied
 *
 * Internal requeue primitive.  The public wrapper scsi_queue_insert()
 * always passes @unbusy == true; the completion path in this file requeues
 * with @unbusy == false instead.
 */
static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
{
        SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
                "Inserting command %p into mlqueue\n", cmd));

        /* Arm the blocked counter that matches the requeue reason. */
        scsi_set_blocked(cmd, reason);

        /*
         * The command is no longer active on the host/device, so drop
         * the busy counters when the caller asks for it.
         */
        if (unbusy)
                scsi_device_unbusy(cmd->device, cmd);

        /* Clear the result before the request goes back to blk-mq. */
        cmd->result = 0;
        blk_mq_requeue_request(cmd->request, true);
}

/**
 * scsi_queue_insert - Reinsert a command in the queue.
 * @cmd:    command that we are adding to queue.
 * @reason: why we are inserting command to queue.
 *
 * We do this for one of two cases. Either the host is busy and it cannot accept
 * any more commands for the time being, or the device returned QUEUE_FULL and
 * can accept no more commands.
 *
 * This is __scsi_queue_insert() with @unbusy set to true, i.e. the
 * host/target/device busy accounting for @cmd is dropped as part of the
 * requeue.
 *
 * Context: This could be called either from an interrupt context or a normal
 * process context.
 */
void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
{
        __scsi_queue_insert(cmd, reason, true);
}


/**
 * __scsi_execute - insert request and wait for the result
 * @sdev:        scsi device
 * @cmd:        scsi command
 * @data_direction: data direction
 * @buffer:        data buffer
 * @bufflen:        len of buffer
 * @sense:        optional sense buffer; when non-NULL and sense data is
 *                present, SCSI_SENSE_BUFFERSIZE bytes are copied into it
 * @sshdr:        optional decoded sense header
 * @timeout:        request timeout; stored unmodified in req->timeout
 *                (NOTE(review): other code in this file compares
 *                req->timeout against jiffies, so this looks like jiffies
 *                rather than seconds - confirm against callers)
 * @retries:        number of times to retry request
 * @flags:        flags for ->cmd_flags
 * @rq_flags:        flags for ->rq_flags
 * @resid:        optional residual length
 *
 * Returns the request's result field if a command was executed, or
 * DRIVER_ERROR << 24 if the request could not be allocated or @buffer could
 * not be mapped.
 */
int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                 int data_direction, void *buffer, unsigned bufflen,
                 unsigned char *sense, struct scsi_sense_hdr *sshdr,
                 int timeout, int retries, u64 flags, req_flags_t rq_flags,
                 int *resid)
{
        struct request *req;
        struct scsi_request *rq;
        /* Value reported on the two early-failure paths below. */
        int ret = DRIVER_ERROR << 24;

        req = blk_get_request(sdev->request_queue,
                        data_direction == DMA_TO_DEVICE ?
                        REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
                        rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0);
        if (IS_ERR(req))
                return ret;
        rq = scsi_req(req);

        /* Only map a buffer when there is data to transfer. */
        if (bufflen &&        blk_rq_map_kern(sdev->request_queue, req,
                                        buffer, bufflen, GFP_NOIO))
                goto out;

        rq->cmd_len = COMMAND_SIZE(cmd[0]);
        memcpy(rq->cmd, cmd, rq->cmd_len);
        rq->retries = retries;
        req->timeout = timeout;
        req->cmd_flags |= flags;
        /* RQF_QUIET is always added on top of the caller's rq_flags. */
        req->rq_flags |= rq_flags | RQF_QUIET;

        /*
         * head injection *required* here otherwise quiesce won't work
         */
        blk_execute_rq(req->q, NULL, req, 1);

        /*
         * Some devices (USB mass-storage in particular) may transfer
         * garbage data together with a residue indicating that the data
         * is invalid.  Prevent the garbage from being misinterpreted
         * and prevent security leaks by zeroing out the excess data.
         */
        if (unlikely(rq->resid_len > 0 && rq->resid_len <= bufflen))
                memset(buffer + (bufflen - rq->resid_len), 0, rq->resid_len);

        /* Hand the optional outputs back to the caller. */
        if (resid)
                *resid = rq->resid_len;
        if (sense && rq->sense_len)
                memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
        if (sshdr)
                scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
        ret = rq->result;
 out:
        blk_put_request(req);

        return ret;
}
EXPORT_SYMBOL(__scsi_execute);

/*
 * Mark @cmd as no longer in flight on @shost and wake up the error handler
 * if necessary.
 *
 * The wakeup must not be missed when the number of in-flight host requests
 * equals shost->host_failed.  To guarantee that, scsi_eh_scmd_add() uses
 * call_rcu() and this function runs under an RCU read lock, so that this
 * function in its entirety either finishes before scsi_eh_scmd_add()
 * increases the host_failed counter or notices the shost state change made
 * by scsi_eh_scmd_add().
 */
static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
        unsigned long flags;

        rcu_read_lock();
        __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state);
        /* The slow path below only runs while the host is in recovery. */
        if (unlikely(scsi_host_in_recovery(shost))) {
                unsigned int busy;
                /*
                 * Ensure the clear of SCMD_STATE_INFLIGHT is visible to
                 * other CPUs before counting busy requests. Otherwise,
                 * reordering can cause CPUs to race and miss an eh wakeup
                 * when no CPU sees all busy requests as done or timed out.
                 */
                smp_mb();

                /* Sampled before host_lock is taken. */
                busy = scsi_host_busy(shost);

                spin_lock_irqsave(shost->host_lock, flags);
                if (shost->host_failed || shost->host_eh_scheduled)
                        scsi_eh_wakeup(shost, busy);
                spin_unlock_irqrestore(shost->host_lock, flags);
        }
        rcu_read_unlock();
}

/*
 * Drop the busy accounting held for @cmd: the host in-flight state first,
 * then the target counter (only when the target has a positive can_queue),
 * and finally the device counter of @sdev.
 */
void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
{
        struct scsi_target *tgt = scsi_target(sdev);

        scsi_dec_host_busy(sdev->host, cmd);

        if (tgt->can_queue > 0)
                atomic_dec(&tgt->target_busy);

        atomic_dec(&sdev->device_busy);
}

/*
 * Run the blk-mq hardware queues of @q.  The second argument to
 * blk_mq_run_hw_queues() is passed as false here, whereas
 * scsi_run_queue_async() passes true.
 */
static void scsi_kick_queue(struct request_queue *q)
{
        blk_mq_run_hw_queues(q, false);
}

/*
 * Called for single_lun devices on IO completion. Clear starget_sdev_user,
 * and kick the queues of all the scsi_devices on the target -
 * including current_sdev first.
 *
 * Called with *no* scsi locks held.
 */
static void scsi_single_lun_run(struct scsi_device *current_sdev)
{
        struct Scsi_Host *shost = current_sdev->host;
        struct scsi_device *sdev, *tmp;
        struct scsi_target *starget = scsi_target(current_sdev);
        unsigned long flags;

        spin_lock_irqsave(shost->host_lock, flags);
        starget->starget_sdev_user = NULL;
        spin_unlock_irqrestore(shost->host_lock, flags);

        /*
         * Kick the queue for all LUNs on the target, starting with
         * current_sdev. We race with others (to set starget_sdev_user),
         * but in most cases, we will be first. Ideally, each LU on the
         * target would get some limited time or requests on the target.
         */
        scsi_kick_queue(current_sdev->request_queue);

        spin_lock_irqsave(shost->host_lock, flags);
        /* Somebody claimed the target in the meantime: leave the rest alone. */
        if (starget->starget_sdev_user)
                goto out;
        list_for_each_entry_safe(sdev, tmp, &starget->devices,
                        same_target_siblings) {
                if (sdev == current_sdev)
                        continue;
                /* Skip siblings we cannot take a reference on. */
                if (scsi_device_get(sdev))
                        continue;

                /* host_lock is dropped around the kick and retaken after. */
                spin_unlock_irqrestore(shost->host_lock, flags);
                scsi_kick_queue(sdev->request_queue);
                spin_lock_irqsave(shost->host_lock, flags);

                scsi_device_put(sdev);
        }
 out:
        spin_unlock_irqrestore(shost->host_lock, flags);
}

/*
 * A device is busy when its in-flight count has reached queue_depth or
 * when its device_blocked counter is positive.
 */
static inline bool scsi_device_is_busy(struct scsi_device *sdev)
{
        return atomic_read(&sdev->device_busy) >= sdev->queue_depth ||
               atomic_read(&sdev->device_blocked) > 0;
}

/*
 * A target is busy when it has a positive can_queue and either its
 * in-flight count has reached that limit or its target_blocked counter is
 * positive.  Targets without a positive can_queue are never busy.
 */
static inline bool scsi_target_is_busy(struct scsi_target *starget)
{
        if (!(starget->can_queue > 0))
                return false;

        return atomic_read(&starget->target_busy) >= starget->can_queue ||
               atomic_read(&starget->target_blocked) > 0;
}

/*
 * A host is busy when its host_blocked counter is positive or when
 * host_self_blocked is set.
 */
static inline bool scsi_host_is_busy(struct Scsi_Host *shost)
{
        return atomic_read(&shost->host_blocked) > 0 ||
               shost->host_self_blocked;
}

/*
 * Kick the queues of the devices on @shost's starved list.
 *
 * The list is spliced onto a private list under host_lock and processed one
 * device at a time.  Processing stops as soon as the host is busy; devices
 * whose target is busy are moved back to shost->starved_list; whatever is
 * left on the private list at the end is spliced back as well.
 */
static void scsi_starved_list_run(struct Scsi_Host *shost)
{
        LIST_HEAD(starved_list);
        struct scsi_device *sdev;
        unsigned long flags;

        spin_lock_irqsave(shost->host_lock, flags);
        list_splice_init(&shost->starved_list, &starved_list);

        while (!list_empty(&starved_list)) {
                struct request_queue *slq;

                /*
                 * As long as shost is accepting commands and we have
                 * starved queues, kick the queue. scsi_request_fn
                 * drops the queue_lock and can add us back to the
                 * starved_list.
                 *
                 * host_lock protects the starved_list and starved_entry.
                 * scsi_request_fn must get the host_lock before checking
                 * or modifying starved_list or starved_entry.
                 *
                 * NOTE(review): scsi_request_fn is not defined in the
                 * visible part of this file - confirm which dispatch path
                 * these two paragraphs refer to.
                 */
                if (scsi_host_is_busy(shost))
                        break;

                sdev = list_entry(starved_list.next,
                                  struct scsi_device, starved_entry);
                list_del_init(&sdev->starved_entry);
                if (scsi_target_is_busy(scsi_target(sdev))) {
                        /* Target still busy: park the device on the host list. */
                        list_move_tail(&sdev->starved_entry,
                                       &shost->starved_list);
                        continue;
                }

                /*
                 * Once we drop the host lock, a racing scsi_remove_device()
                 * call may remove the sdev from the starved list and destroy
                 * it and the queue.  Mitigate by taking a reference to the
                 * queue and never touching the sdev again after we drop the
                 * host lock.  Note: if __scsi_remove_device() invokes
                 * blk_cleanup_queue() before the queue is run from this
                 * function then blk_run_queue() will return immediately since
                 * blk_cleanup_queue() marks the queue with QUEUE_FLAG_DYING.
                 */
                slq = sdev->request_queue;
                if (!blk_get_queue(slq))
                        continue;
                spin_unlock_irqrestore(shost->host_lock, flags);

                scsi_kick_queue(slq);
                blk_put_queue(slq);

                spin_lock_irqsave(shost->host_lock, flags);
        }
        /* put any unprocessed entries back */
        list_splice(&starved_list, &shost->starved_list);
        spin_unlock_irqrestore(shost->host_lock, flags);
}

/**
 * scsi_run_queue - Select a proper request queue to serve next.
 * @q:  last request's queue
 *
 * The previous command was completely finished, start a new one if possible.
 */
static void scsi_run_queue(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;

        if (scsi_target(sdev)->single_lun)
                scsi_single_lun_run(sdev);
        if (!list_empty(&sdev->host->starved_list))
                scsi_starved_list_run(sdev->host);

        blk_mq_run_hw_queues(q, false);
}

/*
 * Work handler: @work is the requeue_work member embedded in a scsi_device;
 * run that device's request queue through scsi_run_queue().
 */
void scsi_requeue_run_queue(struct work_struct *work)
{
        struct scsi_device *sdev =
                container_of(work, struct scsi_device, requeue_work);

        scsi_run_queue(sdev->request_queue);
}

void scsi_run_host_queues(struct Scsi_Host *shost)
{
        struct scsi_device *sdev;

        shost_for_each_device(sdev, shost)
                scsi_run_queue(sdev->request_queue);
}

/*
 * Give the upper-level driver a chance to undo its per-command setup.
 * Passthrough requests are skipped, and so are drivers that do not provide
 * an uninit_command hook.
 */
static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
{
        struct scsi_driver *drv;

        if (blk_rq_is_passthrough(cmd->request))
                return;

        drv = scsi_cmd_to_driver(cmd);
        if (drv->uninit_command)
                drv->uninit_command(cmd);
}

/*
 * Release the chained scatter/gather tables attached to @cmd: the data
 * table if it has entries, then the protection table if
 * scsi_prot_sg_count() reports any.
 */
void scsi_free_sgtables(struct scsi_cmnd *cmd)
{
        if (cmd->sdb.table.nents != 0) {
                sg_free_table_chained(&cmd->sdb.table,
                                SCSI_INLINE_SG_CNT);
        }

        if (scsi_prot_sg_count(cmd) != 0) {
                sg_free_table_chained(&cmd->prot_sdb->table,
                                SCSI_INLINE_PROT_SG_CNT);
        }
}
EXPORT_SYMBOL_GPL(scsi_free_sgtables);

/*
 * Tear down what preparing @cmd set up: free its scatter/gather tables,
 * then let the upper-level driver run its uninit_command hook (if any).
 */
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
{
        scsi_free_sgtables(cmd);
        scsi_uninit_cmd(cmd);
}

/*
 * Restart dispatch for @sdev without calling scsi_run_queue() directly.
 *
 * Single-LUN targets and hosts with a non-empty starved list are handed to
 * sdev->requeue_work via kblockd_schedule_work().  Otherwise the hardware
 * queues are run asynchronously, but only if sdev->restarts was non-zero
 * and this caller is the one that resets it to 0.
 */
static void scsi_run_queue_async(struct scsi_device *sdev)
{
        if (scsi_target(sdev)->single_lun ||
            !list_empty(&sdev->host->starved_list)) {
                kblockd_schedule_work(&sdev->requeue_work);
        } else {
                /*
                 * smp_mb() present in sbitmap_queue_clear() or implied in
                 * .end_io is for ordering writing .device_busy in
                 * scsi_device_unbusy() and reading sdev->restarts.
                 */
                int old = atomic_read(&sdev->restarts);

                /*
                 * ->restarts has to be kept as non-zero if new budget
                 *  contention occurs.
                 *
                 *  No need to run queue when either another re-run
                 *  queue wins in updating ->restarts or a new budget
                 *  contention occurs.
                 */
                if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old)
                        blk_mq_run_hw_queues(sdev->request_queue, true);
        }
}

/*
 * Complete @bytes of @req with status @error.
 *
 * Returns false when no more bytes to process, true if there are more.
 * When true is returned the function has done nothing beyond
 * blk_update_request(); the command has not been torn down.
 */
static bool scsi_end_request(struct request *req, blk_status_t error,
                unsigned int bytes)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        struct scsi_device *sdev = cmd->device;
        struct request_queue *q = sdev->request_queue;

        /* Partial completion: leave the rest to the caller. */
        if (blk_update_request(req, error, bytes))
                return true;

        if (blk_queue_add_random(q))
                add_disk_randomness(req->rq_disk);

        /* Non-SCSI-passthrough requests must have been initialized. */
        if (!blk_rq_is_scsi(req)) {
                WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
                cmd->flags &= ~SCMD_INITIALIZED;
        }

        /*
         * Calling rcu_barrier() is not necessary here because the
         * SCSI error handler guarantees that the function called by
         * call_rcu() has been called before scsi_end_request() is
         * called.
         */
        destroy_rcu_head(&cmd->rcu);

        /*
         * In the MQ case the command gets freed by __blk_mq_end_request,
         * so we have to do all cleanup that depends on it earlier.
         *
         * We also can't kick the queues from irq context, so we
         * will have to defer it to a workqueue.
         */
        scsi_mq_uninit_cmd(cmd);

        /*
         * queue is still alive, so grab the ref for preventing it
         * from being cleaned up during running queue.
         */
        percpu_ref_get(&q->q_usage_counter);

        __blk_mq_end_request(req, error);

        /* sdev and q were captured above, before the request was ended. */
        scsi_run_queue_async(sdev);

        percpu_ref_put(&q->q_usage_counter);
        return false;
}

/**
 * scsi_result_to_blk_status - translate a SCSI result code into blk_status_t
 * @cmd:        SCSI command
 * @result:        scsi error code
 *
 * Map the host byte of @result onto a block layer status.  For the host
 * bytes that have a dedicated block status (target, nexus, allocation and
 * medium failures) the host byte of @cmd->result is reset to DID_OK as a
 * side effect.
 */
static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result)
{
        blk_status_t mapped;

        switch (host_byte(result)) {
        case DID_OK:
                /*
                 * Also check the other bytes than the status byte in result
                 * to handle the case when a SCSI LLD sets result to
                 * DRIVER_SENSE << 24 without setting SAM_STAT_CHECK_CONDITION.
                 */
                return (scsi_status_is_good(result) && (result & ~0xff) == 0) ?
                        BLK_STS_OK : BLK_STS_IOERR;
        case DID_TRANSPORT_FAILFAST:
                return BLK_STS_TRANSPORT;
        case DID_TARGET_FAILURE:
                mapped = BLK_STS_TARGET;
                break;
        case DID_NEXUS_FAILURE:
                mapped = BLK_STS_NEXUS;
                break;
        case DID_ALLOC_FAILURE:
                mapped = BLK_STS_NOSPC;
                break;
        case DID_MEDIUM_ERROR:
                mapped = BLK_STS_MEDIUM;
                break;
        default:
                return BLK_STS_IOERR;
        }

        /* Only the host bytes with a dedicated block status get here. */
        set_host_byte(cmd, DID_OK);
        return mapped;
}

/* Helper for scsi_io_completion() when "reprep" action required. */
static void scsi_io_completion_reprep(struct scsi_cmnd *cmd,
                                      struct request_queue *q)
{
        /* A new command will be prepared and issued. */
        scsi_mq_requeue_cmd(cmd);
}

/*
 * Tell whether @cmd has used up its overall time budget, which is
 * (allowed + 1) request timeouts counted from jiffies_at_alloc.  Commands
 * with SCSI_CMD_RETRIES_NO_LIMIT never exceed it.  An error is logged when
 * the budget is exceeded.
 */
static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd)
{
        unsigned long budget;

        if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT)
                return false;

        budget = (cmd->allowed + 1) * cmd->request->timeout;
        if (!time_before(cmd->jiffies_at_alloc + budget, jiffies))
                return false;

        scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n",
                    budget/HZ);
        return true;
}

/*
 * Helper for scsi_io_completion() when special action required.
 *
 * Picks one of four actions from @result and the command's sense data
 * (fail, re-prepare, retry, delayed retry) and carries it out.  Any action
 * other than "fail" is turned into "fail" once
 * scsi_cmd_runtime_exceeced() reports that the command ran out of time.
 */
static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
{
        struct request_queue *q = cmd->device->request_queue;
        struct request *req = cmd->request;
        int level = 0;
        enum {ACTION_FAIL, ACTION_REPREP, ACTION_RETRY,
              ACTION_DELAYED_RETRY} action;
        struct scsi_sense_hdr sshdr;
        bool sense_valid;
        bool sense_current = true;      /* false implies "deferred sense" */
        blk_status_t blk_stat;

        sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
        if (sense_valid)
                sense_current = !scsi_sense_is_deferred(&sshdr);

        /* Default status; some sense-key branches below override it. */
        blk_stat = scsi_result_to_blk_status(cmd, result);

        if (host_byte(result) == DID_RESET) {
                /* Third party bus reset or reset for error recovery
                 * reasons.  Just retry the command and see what
                 * happens.
                 */
                action = ACTION_RETRY;
        } else if (sense_valid && sense_current) {
                switch (sshdr.sense_key) {
                case UNIT_ATTENTION:
                        if (cmd->device->removable) {
                                /* Detected disc change.  Set a bit
                                 * and quietly refuse further access.
                                 */
                                cmd->device->changed = 1;
                                action = ACTION_FAIL;
                        } else {
                                /* Must have been a power glitch, or a
                                 * bus reset.  Could not have been a
                                 * media change, so we just retry the
                                 * command and see what happens.
                                 */
                                action = ACTION_RETRY;
                        }
                        break;
                case ILLEGAL_REQUEST:
                        /* If we had an ILLEGAL REQUEST returned, then
                         * we may have performed an unsupported
                         * command.  The only thing this should be
                         * would be a ten byte read where only a six
                         * byte read was supported.  Also, on a system
                         * where READ CAPACITY failed, we may have
                         * read past the end of the disk.
                         */
                        if ((cmd->device->use_10_for_rw &&
                            sshdr.asc == 0x20 && sshdr.ascq == 0x00) &&
                            (cmd->cmnd[0] == READ_10 ||
                             cmd->cmnd[0] == WRITE_10)) {
                                /* This will issue a new 6-byte command. */
                                cmd->device->use_10_for_rw = 0;
                                action = ACTION_REPREP;
                        } else if (sshdr.asc == 0x10) /* DIX */ {
                                action = ACTION_FAIL;
                                blk_stat = BLK_STS_PROTECTION;
                        /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */
                        } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) {
                                action = ACTION_FAIL;
                                blk_stat = BLK_STS_TARGET;
                        } else
                                action = ACTION_FAIL;
                        break;
                case ABORTED_COMMAND:
                        action = ACTION_FAIL;
                        if (sshdr.asc == 0x10) /* DIF */
                                blk_stat = BLK_STS_PROTECTION;
                        break;
                case NOT_READY:
                        /* If the device is in the process of becoming
                         * ready, or has a temporary blockage, retry.
                         */
                        if (sshdr.asc == 0x04) {
                                switch (sshdr.ascq) {
                                case 0x01: /* becoming ready */
                                case 0x04: /* format in progress */
                                case 0x05: /* rebuild in progress */
                                case 0x06: /* recalculation in progress */
                                case 0x07: /* operation in progress */
                                case 0x08: /* Long write in progress */
                                case 0x09: /* self test in progress */
                                case 0x11: /* notify (enable spinup) required */
                                case 0x14: /* space allocation in progress */
                                case 0x1a: /* start stop unit in progress */
                                case 0x1b: /* sanitize in progress */
                                case 0x1d: /* configuration in progress */
                                case 0x24: /* depopulation in progress */
                                        action = ACTION_DELAYED_RETRY;
                                        break;
                                default:
                                        action = ACTION_FAIL;
                                        break;
                                }
                        } else
                                action = ACTION_FAIL;
                        break;
                case VOLUME_OVERFLOW:
                        /* See SSC3rXX or current. */
                        action = ACTION_FAIL;
                        break;
                case DATA_PROTECT:
                        action = ACTION_FAIL;
                        if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) ||
                            (sshdr.asc == 0x55 &&
                             (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) {
                                /* Insufficient zone resources */
                                blk_stat = BLK_STS_ZONE_OPEN_RESOURCE;
                        }
                        break;
                default:
                        action = ACTION_FAIL;
                        break;
                }
        } else
                action = ACTION_FAIL;

        /* Out of time: do not requeue any more, fail instead. */
        if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd))
                action = ACTION_FAIL;

        switch (action) {
        case ACTION_FAIL:
                /* Give up and fail the remainder of the request */
                if (!(req->rq_flags & RQF_QUIET)) {
                        static DEFINE_RATELIMIT_STATE(_rs,
                                        DEFAULT_RATELIMIT_INTERVAL,
                                        DEFAULT_RATELIMIT_BURST);

                        if (unlikely(scsi_logging_level))
                                level =
                                     SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
                                                    SCSI_LOG_MLCOMPLETE_BITS);

                        /*
                         * if logging is enabled the failure will be printed
                         * in scsi_log_completion(), so avoid duplicate messages
                         */
                        if (!level && __ratelimit(&_rs)) {
                                scsi_print_result(cmd, NULL, FAILED);
                                if (driver_byte(result) == DRIVER_SENSE)
                                        scsi_print_sense(cmd);
                                scsi_print_command(cmd);
                        }
                }
                /*
                 * Done if the request completed fully; otherwise bytes
                 * remain and the command is re-prepared below.
                 */
                if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req)))
                        return;
                fallthrough;
        case ACTION_REPREP:
                scsi_io_completion_reprep(cmd, q);
                break;
        case ACTION_RETRY:
                /* Retry the same command immediately */
                __scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false);
                break;
        case ACTION_DELAYED_RETRY:
                /* Retry the same command after a delay */
                __scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false);
                break;
        }
}

/*
 * Helper for scsi_io_completion() when cmd->result is non-zero. Returns a
 * new result that may suppress further error checking. Also modifies
 * *blk_statp in some cases.
 *
 * The returned value is either @result unchanged or 0; it is forced to 0
 * for recovered errors and for non-zero status bytes that
 * scsi_status_is_good() accepts.
 */
static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
                                        blk_status_t *blk_statp)
{
        bool sense_valid;
        bool sense_current = true;        /* false implies "deferred sense" */
        struct request *req = cmd->request;
        struct scsi_sense_hdr sshdr;

        sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
        if (sense_valid)
                sense_current = !scsi_sense_is_deferred(&sshdr);

        if (blk_rq_is_passthrough(req)) {
                if (sense_valid) {
                        /*
                         * SG_IO wants current and deferred errors
                         */
                        scsi_req(req)->sense_len =
                                min(8 + cmd->sense_buffer[7],
                                    SCSI_SENSE_BUFFERSIZE);
                }
                if (sense_current)
                        *blk_statp = scsi_result_to_blk_status(cmd, result);
        } else if (blk_rq_bytes(req) == 0 && sense_current) {
                /*
                 * Flush commands do not transfer any data, and thus cannot use
                 * good_bytes != blk_rq_bytes(req) as the signal for an error.
                 * This sets *blk_statp explicitly for the problem case.
                 */
                *blk_statp = scsi_result_to_blk_status(cmd, result);
        }
        /*
         * Recovered errors need reporting, but they're always treated as
         * success, so fiddle the result code here.  For passthrough requests
         * we already took a copy of the original into sreq->result which
         * is what gets returned to the user
         */
        if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
                bool do_print = true;
                /*
                 * if ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d]
                 * skip print since caller wants ATA registers. Only occurs
                 * on SCSI ATA PASS_THROUGH commands when CK_COND=1
                 */
                if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d))
                        do_print = false;
                else if (req->rq_flags & RQF_QUIET)
                        do_print = false;
                if (do_print)
                        scsi_print_sense(cmd);
                result = 0;
                /* for passthrough, *blk_statp may be set */
                *blk_statp = BLK_STS_OK;
        }
        /*
         * Another corner case: the SCSI status byte is non-zero but 'good'.
         * Example: PRE-FETCH command returns SAM_STAT_CONDITION_MET when
         * it is able to fit nominated LBs in its cache (and SAM_STAT_GOOD
         * if it can't fit). Treat SAM_STAT_CONDITION_MET and the related
         * intermediate statuses (both obsolete in SAM-4) as good.
         */
        if (status_byte(result) && scsi_status_is_good(result)) {
                result = 0;
                *blk_statp = BLK_STS_OK;
        }
        return result;
}

/**
 * scsi_io_completion - Completion processing for SCSI commands.
 * @cmd:        command that is finished.
 * @good_bytes:        number of processed bytes.
 *
 * We will finish off the specified number of sectors. If we are done, the
 * command block will be released and the queue function will be goosed. If we
 * are not done then we have to figure out what to do next:
 *
 *   a) We can call scsi_io_completion_reprep().  The request will be
 *        unprepared and put back on the queue.  Then a new command will
 *        be created for it.  This should be used if we made forward
 *        progress, or if we want to switch from READ(10) to READ(6) for
 *        example.
 *
 *   b) We can call scsi_io_completion_action().  The request will be
 *        put back on the queue and retried using the same command as
 *        before, possibly after a delay.
 *
 *   c) We can call scsi_end_request() with blk_stat other than
 *        BLK_STS_OK, to fail the remainder of the request.
 */
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
        int result = cmd->result;
        struct request_queue *q = cmd->device->request_queue;
        struct request *req = cmd->request;
        blk_status_t blk_stat = BLK_STS_OK;

        if (unlikely(result))        /* a nz result may or may not be an error */
                result = scsi_io_completion_nz_result(cmd, result, &blk_stat);

        if (unlikely(blk_rq_is_passthrough(req))) {
                /*
                 * scsi_result_to_blk_status may have reset the host_byte
                 */
                scsi_req(req)->result = cmd->result;
        }

        /*
         * Next deal with any sectors which we were able to correctly
         * handle.
         */
        SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd,
                "%u sectors total, %d bytes done.\n",
                blk_rq_sectors(req), good_bytes));

        /*
         * Failed, zero length commands always need to drop down
         * to retry code. Fast path should return in this block.
         */
        if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) {
                if (likely(!scsi_end_request(req, blk_stat, good_bytes)))
                        return; /* no bytes remaining */
        }

        /* Kill remainder if no retries. */
        if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) {
                if (scsi_end_request(req, blk_stat, blk_rq_bytes(req)))
                        WARN_ONCE(true,
                            "Bytes remaining after failed, no-retry command");
                return;
        }

        /*
         * If there had been no error, but we have leftover bytes in the
         * request, just queue the command up again.
         */
        if (likely(result == 0))
                scsi_io_completion_reprep(cmd, q);
        else
                scsi_io_completion_action(cmd, result);
}

static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev,
                struct request *rq)
{
        return sdev->dma_drain_len && blk_rq_is_passthrough(rq) &&
               !op_is_write(req_op(rq)) &&
               sdev->host->hostt->dma_need_drain(rq);
}

/**
 * scsi_alloc_sgtables - allocate S/G tables for a command
 * @cmd:  command descriptor we wish to initialize
 *
 * Returns:
 * * BLK_STS_OK       - on success
 * * BLK_STS_RESOURCE - if the failure is retryable
 * * BLK_STS_IOERR    - if the failure is fatal
 */
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd)
{
        struct scsi_device *sdev = cmd->device;
        struct request *rq = cmd->request;
        unsigned short nr_segs = blk_rq_nr_phys_segments(rq);
        struct scatterlist *last_sg = NULL;
        blk_status_t ret;
        bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq);
        int count;

        /* A request with zero physical segments is treated as fatal. */
        if (WARN_ON_ONCE(!nr_segs))
                return BLK_STS_IOERR;

        /*
         * Make sure there is space for the drain.  The driver must adjust
         * max_hw_segments to be prepared for this.
         */
        if (need_drain)
                nr_segs++;

        /*
         * If sg table allocation fails, requeue request later.
         */
        if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs,
                        cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT)))
                return BLK_STS_RESOURCE;

        /*
         * Next, walk the list, and fill in the addresses and sizes of
         * each segment.
         */
        count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);

        /*
         * Request length is not aligned to the queue's DMA pad mask: grow
         * the last mapped segment by pad_len and record the padding in
         * cmd->extra_len.
         */
        if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) {
                unsigned int pad_len =
                        (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;

                last_sg->length += pad_len;
                cmd->extra_len += pad_len;
        }

        /*
         * Append the device's drain buffer as one more segment: move the
         * end marker from the last data segment to the new drain entry
         * (the slot was reserved by the nr_segs++ above).
         */
        if (need_drain) {
                sg_unmark_end(last_sg);
                last_sg = sg_next(last_sg);
                sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len);
                sg_mark_end(last_sg);

                cmd->extra_len += sdev->dma_drain_len;
                count++;
        }

        /* The mapping must never use more entries than were allocated. */
        BUG_ON(count > cmd->sdb.table.nents);
        cmd->sdb.table.nents = count;
        cmd->sdb.length = blk_rq_payload_bytes(rq);

        /* Requests carrying integrity data also need a protection table. */
        if (blk_integrity_rq(rq)) {
                struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
                int ivecs;

                if (WARN_ON_ONCE(!prot_sdb)) {
                        /*
                         * This can happen if someone (e.g. multipath)
                         * queues a command to a device on an adapter
                         * that does not support DIX.
                         */
                        ret = BLK_STS_IOERR;
                        goto out_free_sgtables;
                }

                ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);

                if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
                                prot_sdb->table.sgl,
                                SCSI_INLINE_PROT_SG_CNT)) {
                        ret = BLK_STS_RESOURCE;
                        goto out_free_sgtables;
                }

                count = blk_rq_map_integrity_sg(rq->q, rq->bio,
                                                prot_sdb->table.sgl);
                BUG_ON(count > ivecs);
                BUG_ON(count > queue_max_integrity_segments(rq->q));

                /* prot_sdb was read from cmd->prot_sdb above; same pointer. */
                cmd->prot_sdb = prot_sdb;
                cmd->prot_sdb->table.nents = count;
        }

        return BLK_STS_OK;
out_free_sgtables:
        /* Error after the data table was set up: hand cmd to the free helper. */
        scsi_free_sgtables(cmd);
        return ret;
}
EXPORT_SYMBOL(scsi_alloc_sgtables);

/**
 * scsi_initialize_rq - initialize struct scsi_cmnd partially
 * @rq: Request associated with the SCSI command to be initialized.
 *
 * This function initializes the members of struct scsi_cmnd that must be
 * initialized before request processing starts and that won't be
 * reinitialized if a SCSI command is requeued.
 *
 * Called from inside blk_get_request() for pass-through requests and from
 * inside scsi_init_command() for filesystem requests.
 */
static void scsi_initialize_rq(struct request *rq)
{
        struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);

        /* Set up the embedded scsi_request and the RCU head. */
        scsi_req_init(&scmd->req);
        init_rcu_head(&scmd->rcu);

        /* A freshly initialized command has not been retried yet. */
        scmd->retries = 0;
        scmd->jiffies_at_alloc = jiffies;
}

/*
 * Only called when the request isn't completed by SCSI, and not freed by
 * SCSI
 */
static void scsi_cleanup_rq(struct request *rq)
{
        /* Nothing to undo unless the request carries RQF_DONTPREP. */
        if (!(rq->rq_flags & RQF_DONTPREP))
                return;

        scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
        rq->rq_flags &= ~RQF_DONTPREP;
}

/*
 * scsi_init_command - reset a scsi_cmnd before its request is prepared.
 * @dev: device the command gets bound to.
 * @cmd: command to (re)initialize.
 *
 * Zeroes @cmd apart from the embedded scsi_request and then restores the
 * fields that must survive the wipe: sense buffer, protection sdb, the
 * preserved flags, allocation time, retry count and the in-flight state
 * bit.  Called from scsi_prepare_cmd().
 */
void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
{
        void *buf = cmd->sense_buffer;
        void *prot = cmd->prot_sdb;
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        unsigned int flags = cmd->flags & SCMD_PRESERVED_FLAGS;
        unsigned long jiffies_at_alloc;
        int retries, to_clear;
        bool in_flight;

        /*
         * Non-SCSI (per blk_rq_is_scsi()) requests get their one-time
         * initialization here, on first use; SCMD_INITIALIZED records that
         * it has been done.
         */
        if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) {
                flags |= SCMD_INITIALIZED;
                scsi_initialize_rq(rq);
        }

        /* Snapshot the values that the memset() below would destroy. */
        jiffies_at_alloc = cmd->jiffies_at_alloc;
        retries = cmd->retries;
        in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
        /*
         * Zero out the cmd, except for the embedded scsi_request. Only clear
         * the driver-private command data if the LLD does not supply a
         * function to initialize that data.
         *
         * NOTE(review): the offset arithmetic assumes cmd->req is the first
         * member of struct scsi_cmnd and that the driver-private data
         * directly follows the struct; confirm against the struct layout.
         */
        to_clear = sizeof(*cmd) - sizeof(cmd->req);
        if (!dev->host->hostt->init_cmd_priv)
                to_clear += dev->host->hostt->cmd_size;
        memset((char *)cmd + sizeof(cmd->req), 0, to_clear);

        /* Bind to the device and put the saved fields back. */
        cmd->device = dev;
        cmd->sense_buffer = buf;
        cmd->prot_sdb = prot;
        cmd->flags = flags;
        INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
        cmd->jiffies_at_alloc = jiffies_at_alloc;
        cmd->retries = retries;
        if (in_flight)
                __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

}

static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
                struct request *req)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

        /*
         * Passthrough requests may transfer data, in which case they must
         * a bio attached to them.  Or they might contain a SCSI command
         * that does not transfer data, in which case they may optionally
         * submit a request without an attached bio.
         */
        if (req->bio) {
                blk_status_t ret = scsi_alloc_sgtables(cmd);
                if (unlikely(ret != BLK_STS_OK))
                        return ret;
        } else {
                BUG_ON(blk_rq_bytes(req));

                memset(&cmd->sdb, 0, sizeof(cmd->sdb));
        }

        cmd->cmd_len = scsi_req(req)->cmd_len;
        cmd->cmnd = scsi_req(req)->cmd;
        cmd->transfersize = blk_rq_bytes(req);
        cmd->allowed = scsi_req(req)->retries;
        return BLK_STS_OK;
}

/*
 * Map the current state of @sdev to a block-layer status for @req.
 * @req may be NULL, in which case the power-management checks are skipped.
 */
static blk_status_t
scsi_device_state_check(struct scsi_device *sdev, struct request *req)
{
        switch (sdev->sdev_state) {
        case SDEV_CREATED:
                return BLK_STS_OK;

        case SDEV_BLOCK:
        case SDEV_CREATED_BLOCK:
                /* Blocked: have the request retried later. */
                return BLK_STS_RESOURCE;

        case SDEV_DEL:
                /* A fully deleted device accepts no commands at all. */
                sdev_printk(KERN_ERR, sdev,
                            "rejecting I/O to dead device\n");
                return BLK_STS_IOERR;

        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
                /*
                 * Offline devices refuse every command; the device has to
                 * be brought online before any recovery command is tried.
                 * The message is only logged the first time.
                 */
                if (sdev->offline_already)
                        return BLK_STS_IOERR;

                sdev->offline_already = true;
                sdev_printk(KERN_ERR, sdev,
                            "rejecting I/O to offline device\n");
                return BLK_STS_IOERR;

        case SDEV_QUIESCE:
                /*
                 * A quiesced device only accepts power management
                 * commands; anything else triggers a one-time warning.
                 */
                if (!req)
                        return BLK_STS_OK;
                if (WARN_ON_ONCE(!(req->rq_flags & RQF_PM)))
                        return BLK_STS_RESOURCE;
                return BLK_STS_OK;

        default:
                /*
                 * Every other not-fully-online state likewise admits
                 * power management commands only.
                 */
                if (!req || (req->rq_flags & RQF_PM))
                        return BLK_STS_OK;
                return BLK_STS_IOERR;
        }
}

/*
 * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else
 * return 0.
 *
 * Called with the queue_lock held.
 */
static inline int scsi_dev_queue_ready(struct request_queue *q,
                                  struct scsi_device *sdev)
{
        unsigned int busy;

        /*
         * Claim a slot first; busy is the number of commands that were
         * outstanding before this increment.
         */
        busy = atomic_inc_return(&sdev->device_busy) - 1;
        if (atomic_read(&sdev->device_blocked)) {
                /* Blocked with other commands outstanding: back off. */
                if (busy)
                        goto out_dec;

                /*
                 * unblock after device_blocked iterates to zero
                 */
                if (atomic_dec_return(&sdev->device_blocked) > 0)
                        goto out_dec;
                SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
                                   "unblocking device at zero depth\n"));
        }

        /* The device is already at its queue depth. */
        if (busy >= sdev->queue_depth)
                goto out_dec;

        return 1;
out_dec:
        /* Give back the slot claimed at the top. */
        atomic_dec(&sdev->device_busy);
        return 0;
}

/*
 * scsi_target_queue_ready: checks if we can send commands to the target
 * @shost: host whose host_lock and starved_list are used here.
 * @sdev: scsi device on starget to check.
 *
 * Returns 1 if a command may be sent, 0 otherwise.  On success with a
 * positive can_queue, target_busy stays incremented for the caller.
 */
static inline int scsi_target_queue_ready(struct Scsi_Host *shost,
                                           struct scsi_device *sdev)
{
        struct scsi_target *starget = scsi_target(sdev);
        unsigned int busy;

        /*
         * single_lun targets: only the device recorded in
         * starget_sdev_user may proceed; record this one if none is set.
         */
        if (starget->single_lun) {
                spin_lock_irq(shost->host_lock);
                if (starget->starget_sdev_user &&
                    starget->starget_sdev_user != sdev) {
                        spin_unlock_irq(shost->host_lock);
                        return 0;
                }
                starget->starget_sdev_user = sdev;
                spin_unlock_irq(shost->host_lock);
        }

        /* No per-target limit: nothing to account. */
        if (starget->can_queue <= 0)
                return 1;

        /* busy is the count before our own increment. */
        busy = atomic_inc_return(&starget->target_busy) - 1;
        if (atomic_read(&starget->target_blocked) > 0) {
                if (busy)
                        goto starved;

                /*
                 * unblock after target_blocked iterates to zero
                 */
                if (atomic_dec_return(&starget->target_blocked) > 0)
                        goto out_dec;

                SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
                                 "unblocking target at zero depth\n"));
        }

        if (busy >= starget->can_queue)
                goto starved;

        return 1;

starved:
        /* Put the device at the tail of the host's starved list. */
        spin_lock_irq(shost->host_lock);
        list_move_tail(&sdev->starved_entry, &shost->starved_list);
        spin_unlock_irq(shost->host_lock);
out_dec:
        /* Undo the target_busy increment taken above. */
        if (starget->can_queue > 0)
                atomic_dec(&starget->target_busy);
        return 0;
}

/*
 * scsi_host_queue_ready: if we can send requests to shost, return 1 else
 * return 0. We must end up running the queue again whenever 0 is
 * returned, else IO can hang.
 */
static inline int scsi_host_queue_ready(struct request_queue *q,
                                   struct Scsi_Host *shost,
                                   struct scsi_device *sdev,
                                   struct scsi_cmnd *cmd)
{
        /*
         * Host in recovery: refuse right away, without touching the
         * starved list or the host busy accounting.
         */
        if (scsi_host_in_recovery(shost))
                return 0;

        if (atomic_read(&shost->host_blocked) > 0) {
                /* Blocked while commands are still outstanding. */
                if (scsi_host_busy(shost) > 0)
                        goto starved;

                /*
                 * unblock after host_blocked iterates to zero
                 */
                if (atomic_dec_return(&shost->host_blocked) > 0)
                        goto out_dec;

                SCSI_LOG_MLQUEUE(3,
                        shost_printk(KERN_INFO, shost,
                                     "unblocking host at zero depth\n"));
        }

        if (shost->host_self_blocked)
                goto starved;

        /* We're OK to process the command, so we can't be starved */
        if (!list_empty(&sdev->starved_entry)) {
                /* Re-check under host_lock before unlinking. */
                spin_lock_irq(shost->host_lock);
                if (!list_empty(&sdev->starved_entry))
                        list_del_init(&sdev->starved_entry);
                spin_unlock_irq(shost->host_lock);
        }

        /* Mark the command as in flight on this host. */
        __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

        return 1;

starved:
        /* Queue the device on the host's starved list if not already there. */
        spin_lock_irq(shost->host_lock);
        if (list_empty(&sdev->starved_entry))
                list_add_tail(&sdev->starved_entry, &shost->starved_list);
        spin_unlock_irq(shost->host_lock);
out_dec:
        scsi_dec_host_busy(shost, cmd);
        return 0;
}

/*
 * Busy state exporting function for request stacking drivers.
 *
 * For efficiency, no lock is taken to check the busy state of
 * shost/starget/sdev, since the returned value is not guaranteed and
 * may be changed after request stacking drivers call the function,
 * regardless of taking lock or not.
 *
 * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi
 * needs to return 'not busy'. Otherwise, request stacking drivers
 * may hold requests forever.
 */
static bool scsi_mq_lld_busy(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;

        /* A dying queue always reports "not busy". */
        if (blk_queue_dying(q))
                return false;

        /*
         * Ignore host/starget busy state.
         * Since block layer does not have a concept of fairness across
         * multiple queues, congestion of host/starget needs to be handled
         * in SCSI layer.
         */
        return scsi_host_in_recovery(sdev->host) || scsi_device_is_busy(sdev);
}

static void scsi_softirq_done(struct request *rq)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
        enum scsi_disposition disposition;

        INIT_LIST_HEAD(&cmd->eh_entry);

        atomic_inc(&cmd->device->iodone_cnt);
        if (cmd->result)
                atomic_inc(&cmd->device->ioerr_cnt);

        disposition = scsi_decide_disposition(cmd);
        if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd))
                disposition = SUCCESS;

        scsi_log_completion(cmd, disposition);

        switch (disposition) {
        case SUCCESS:
                scsi_finish_command(cmd);
                break;
        case NEEDS_RETRY:
                scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
                break;
        case ADD_TO_MLQUEUE:
                scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
                break;
        default:
                scsi_eh_scmd_add(cmd);
                break;
        }
}

/**
 * scsi_dispatch_cmd - Dispatch a command to the low-level driver.
 * @cmd: command block we are dispatching.
 *
 * Return: nonzero (a SCSI_MLQUEUE_*_BUSY code) if the command was not
 * accepted, either because the device is blocked or because the LLD's
 * queuecommand() rejected it; zero otherwise, including the cases where
 * the command is completed here with an error result.
 */
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
        struct Scsi_Host *host = cmd->device->host;
        int rtn = 0;

        /* Counted up front; undone below on the paths that return nonzero. */
        atomic_inc(&cmd->device->iorequest_cnt);

        /* check if the device is still usable */
        if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
                /* in SDEV_DEL we error all commands. DID_NO_CONNECT
                 * returns an immediate error upwards, and signals
                 * that the device is no longer present */
                cmd->result = DID_NO_CONNECT << 16;
                goto done;
        }

        /* Check to see if the scsi lld made this device blocked. */
        if (unlikely(scsi_device_blocked(cmd->device))) {
                /*
                 * in blocked state, the command is just put back on
                 * the device queue.  The suspend state has already
                 * blocked the queue so future requests should not
                 * occur until the device transitions out of the
                 * suspend state.
                 */
                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                        "queuecommand : device blocked\n"));
                atomic_dec(&cmd->device->iorequest_cnt);
                return SCSI_MLQUEUE_DEVICE_BUSY;
        }

        /*
         * Store the LUN value in cmnd, if needed: the LUN goes into the
         * top three bits of CDB byte 1, the low five bits are kept.
         */
        if (cmd->device->lun_in_cdb)
                cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
                               (cmd->device->lun << 5 & 0xe0);

        scsi_log_send(cmd);

        /*
         * Before we queue this command, check if the command
         * length exceeds what the host adapter can handle.
         */
        if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                               "queuecommand : command too long. "
                               "cdb_size=%d host->max_cmd_len=%d\n",
                               cmd->cmd_len, cmd->device->host->max_cmd_len));
                cmd->result = (DID_ABORT << 16);
                goto done;
        }

        /* A deleted host gets the same treatment as a deleted device. */
        if (unlikely(host->shost_state == SHOST_DEL)) {
                cmd->result = (DID_NO_CONNECT << 16);
                goto done;

        }

        trace_scsi_dispatch_cmd_start(cmd);
        rtn = host->hostt->queuecommand(host, cmd);
        if (rtn) {
                atomic_dec(&cmd->device->iorequest_cnt);
                trace_scsi_dispatch_cmd_error(cmd, rtn);
                /* Any other nonzero return is treated as "host busy". */
                if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
                    rtn != SCSI_MLQUEUE_TARGET_BUSY)
                        rtn = SCSI_MLQUEUE_HOST_BUSY;

                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                        "queuecommand : request rejected\n"));
        }

        return rtn;
 done:
        /* Complete the command right here with the result set above. */
        cmd->scsi_done(cmd);
        return 0;
}

/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost)
{
        /* Inline entries: the host's sg_tablesize capped at SCSI_INLINE_SG_CNT. */
        unsigned int nr_inline = min_t(unsigned int, shost->sg_tablesize,
                                       SCSI_INLINE_SG_CNT);

        return nr_inline * sizeof(struct scatterlist);
}

/*
 * Prepare the scsi_cmnd embedded in @req for dispatch: reinitialize it,
 * wire up the inline scatterlists and let either the passthrough setup or
 * the command's driver (via init_command()) finish the job.
 */
static blk_status_t scsi_prepare_cmd(struct request *req)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        struct scsi_device *sdev = req->q->queuedata;
        struct Scsi_Host *shost = sdev->host;
        struct scatterlist *sg;

        scsi_init_command(sdev, cmd);

        cmd->request = req;
        cmd->tag = req->tag;
        cmd->prot_op = SCSI_PROT_NORMAL;
        /* Requests without data carry no DMA direction. */
        if (blk_rq_bytes(req))
                cmd->sc_data_direction = rq_dma_dir(req);
        else
                cmd->sc_data_direction = DMA_NONE;

        /*
         * The inline data scatterlist sits right behind the scsi_cmnd and
         * the LLD's private command data (hostt->cmd_size bytes).
         */
        sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
        cmd->sdb.table.sgl = sg;

        if (scsi_host_get_prot(shost)) {
                memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));

                /* The protection scatterlist directly follows prot_sdb. */
                cmd->prot_sdb->table.sgl =
                        (struct scatterlist *)(cmd->prot_sdb + 1);
        }

        /*
         * Special handling for passthrough commands, which don't go to the ULP
         * at all:
         */
        if (blk_rq_is_scsi(req))
                return scsi_setup_scsi_cmnd(sdev, req);

        /* Give an attached device handler the chance to veto or prep. */
        if (sdev->handler && sdev->handler->prep_fn) {
                blk_status_t ret = sdev->handler->prep_fn(sdev, req);

                if (ret != BLK_STS_OK)
                        return ret;
        }

        /*
         * Point the CDB at the request's built-in __cmd buffer and clear it
         * before the driver builds the command.
         */
        cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
        memset(cmd->cmnd, 0, BLK_MAX_CDB);
        return scsi_cmd_to_driver(cmd)->init_command(cmd);
}

/*
 * Completion callback installed as cmd->scsi_done by scsi_queue_rq().
 * Drops the completion when a fake timeout is being injected, and makes
 * sure a command is only completed once.
 */
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
        if (unlikely(blk_should_fake_timeout(cmd->request->q)))
                return;

        /* Only the first caller to set SCMD_STATE_COMPLETE proceeds. */
        if (likely(!test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) {
                trace_scsi_dispatch_cmd_done(cmd);
                blk_mq_complete_request(cmd->request);
        }
}

/* Release one device slot: drop the device_busy count of the queue's device. */
static void scsi_mq_put_budget(struct request_queue *q)
{
        struct scsi_device *dev = q->queuedata;

        atomic_dec(&dev->device_busy);
}

/*
 * Try to claim a device slot for one request on @q.  Returns true when
 * scsi_dev_queue_ready() granted the slot; otherwise records a pending
 * restart in sdev->restarts and returns false.
 */
static bool scsi_mq_get_budget(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;

        if (scsi_dev_queue_ready(q, sdev))
                return true;

        /* No slot: note that the queue needs to be run again later. */
        atomic_inc(&sdev->restarts);

        /*
         * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy).
         * .restarts must be incremented before .device_busy is read because the
         * code in scsi_run_queue_async() depends on the order of these operations.
         */
        smp_mb__after_atomic();

        /*
         * If all in-flight requests originated from this LUN are completed
         * before reading .device_busy, sdev->device_busy will be observed as
         * zero, then blk_mq_delay_run_hw_queues() will dispatch this request
         * soon. Otherwise, completion of one of these requests will observe
         * the .restarts flag, and the request queue will be run for handling
         * this request, see scsi_end_request().
         */
        if (unlikely(atomic_read(&sdev->device_busy) == 0 &&
                                !scsi_device_blocked(sdev)))
                blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
        return false;
}

/*
 * .queue_rq handler: check device, target and host readiness, prepare the
 * command if that has not been done yet and hand it to
 * scsi_dispatch_cmd().  On failure the accounting taken so far is unwound
 * through the labels at the bottom, in reverse order.
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
{
        struct request *req = bd->rq;
        struct request_queue *q = req->q;
        struct scsi_device *sdev = q->queuedata;
        struct Scsi_Host *shost = sdev->host;
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        blk_status_t ret;
        int reason;

        /*
         * If the device is not in running state we will reject some or all
         * commands.
         */
        if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
                ret = scsi_device_state_check(sdev, req);
                if (ret != BLK_STS_OK)
                        goto out_put_budget;
        }

        /* Default status for the "not ready" exits below. */
        ret = BLK_STS_RESOURCE;
        if (!scsi_target_queue_ready(shost, sdev))
                goto out_put_budget;
        if (!scsi_host_queue_ready(q, shost, sdev, cmd))
                goto out_dec_target_busy;

        /*
         * RQF_DONTPREP marks a request whose command is already prepared;
         * for those only the completion bit needs to be reset.
         */
        if (!(req->rq_flags & RQF_DONTPREP)) {
                ret = scsi_prepare_cmd(req);
                if (ret != BLK_STS_OK)
                        goto out_dec_host_busy;
                req->rq_flags |= RQF_DONTPREP;
        } else {
                clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
        }

        /* Drop per-dispatch flags, then set the ones for this attempt. */
        cmd->flags &= SCMD_PRESERVED_FLAGS;
        if (sdev->simple_tags)
                cmd->flags |= SCMD_TAGGED;
        if (bd->last)
                cmd->flags |= SCMD_LAST;

        scsi_set_resid(cmd, 0);
        memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
        cmd->scsi_done = scsi_mq_done;

        blk_mq_start_request(req);
        reason = scsi_dispatch_cmd(cmd);
        if (reason) {
                /* Not accepted: record why and ask for a later retry. */
                scsi_set_blocked(cmd, reason);
                ret = BLK_STS_RESOURCE;
                goto out_dec_host_busy;
        }

        return BLK_STS_OK;

out_dec_host_busy:
        scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
        if (scsi_target(sdev)->can_queue > 0)
                atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
        /*
         * Release the device slot; presumably it was taken through
         * .get_budget before this handler ran (TODO confirm).
         */
        scsi_mq_put_budget(q);
        switch (ret) {
        case BLK_STS_OK:
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_ZONE_RESOURCE:
                /* A blocked device turns this into a device resource error. */
                if (scsi_device_blocked(sdev))
                        ret = BLK_STS_DEV_RESOURCE;
                break;
        default:
                /* Fatal status: record a host byte matching the device state. */
                if (unlikely(!scsi_device_online(sdev)))
                        scsi_req(req)->result = DID_NO_CONNECT << 16;
                else
                        scsi_req(req)->result = DID_ERROR << 16;
                /*
                 * Make sure to release all allocated resources when
                 * we hit an error, as we will never see this command
                 * again.
                 */
                if (req->rq_flags & RQF_DONTPREP)
                        scsi_mq_uninit_cmd(cmd);
                scsi_run_queue_async(sdev);
                break;
        }
        return ret;
}

/*
 * .timeout handler: reserved requests just get their timer re-armed,
 * everything else is passed on to scsi_times_out().
 */
static enum blk_eh_timer_return scsi_timeout(struct request *req,
                bool reserved)
{
        return reserved ? BLK_EH_RESET_TIMER : scsi_times_out(req);
}

/*
 * .init_request handler: allocate the sense buffer for the command
 * embedded in @rq, locate its protection sdb when the host has protection
 * enabled, and run the LLD's init_cmd_priv() hook if there is one.
 *
 * Returns 0 on success, -ENOMEM if no sense buffer could be allocated, or
 * the hook's return value.
 */
static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                                unsigned int hctx_idx, unsigned int numa_node)
{
        struct Scsi_Host *shost = set->driver_data;
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
        const bool unchecked_isa_dma = shost->unchecked_isa_dma;
        int ret;

        if (unchecked_isa_dma)
                cmd->flags |= SCMD_UNCHECKED_ISA_DMA;

        cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma,
                                                    GFP_KERNEL, numa_node);
        if (!cmd->sense_buffer)
                return -ENOMEM;
        cmd->req.sense = cmd->sense_buffer;

        /*
         * The protection sdb comes after the scsi_cmnd, the LLD private
         * data and the inline data scatterlist.
         */
        if (scsi_host_get_prot(shost))
                cmd->prot_sdb = (void *)cmd + sizeof(struct scsi_cmnd) +
                        shost->hostt->cmd_size +
                        scsi_mq_inline_sgl_size(shost);

        if (!shost->hostt->init_cmd_priv)
                return 0;

        ret = shost->hostt->init_cmd_priv(shost, cmd);
        /* The hook failed: release the sense buffer allocated above. */
        if (ret < 0)
                scsi_free_sense_buffer(unchecked_isa_dma,
                                       cmd->sense_buffer);

        return ret;
}

/*
 * .exit_request handler: let the LLD tear down its private command data
 * (when it has an exit_cmd_priv() hook), then free the sense buffer.
 */
static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
                                 unsigned int hctx_idx)
{
        struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
        struct Scsi_Host *host = set->driver_data;

        if (host->hostt->exit_cmd_priv)
                host->hostt->exit_cmd_priv(host, scmd);

        /* The flag set at init time tells the free helper which kind it is. */
        scsi_free_sense_buffer(scmd->flags & SCMD_UNCHECKED_ISA_DMA,
                               scmd->sense_buffer);
}

/*
 * .map_queues handler: use the host template's own mapping when it has
 * one, otherwise fall back to blk_mq_map_queues() on the default map.
 */
static int scsi_map_queues(struct blk_mq_tag_set *set)
{
        struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set);

        if (!shost->hostt->map_queues)
                return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);

        return shost->hostt->map_queues(shost);
}

/*
 * Apply the limits of @shost (segment counts, transfer size, DMA
 * boundaries and alignment) to request queue @q and to the host's DMA
 * device.
 */
void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
{
        struct device *dev = shost->dma_dev;

        /*
         * this limit is imposed by hardware restrictions
         */
        blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
                                        SG_MAX_SEGMENTS));

        if (scsi_host_prot_dma(shost)) {
                /* Cap the protection table size; zero means "use the cap". */
                shost->sg_prot_tablesize =
                        min_not_zero(shost->sg_prot_tablesize,
                                     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
                BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
                blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
        }

        /*
         * When the DMA device has a dma_mask, clamp max_sectors to the
         * largest mapping size the DMA layer reports, in sectors.
         */
        if (dev->dma_mask) {
                shost->max_sectors = min_t(unsigned int, shost->max_sectors,
                                dma_max_mapping_size(dev) >> SECTOR_SHIFT);
        }
        blk_queue_max_hw_sectors(q, shost->max_sectors);
        if (shost->unchecked_isa_dma)
                blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
        blk_queue_segment_boundary(q, shost->dma_boundary);
        dma_set_seg_boundary(dev, shost->dma_boundary);

        blk_queue_max_segment_size(q, shost->max_segment_size);
        blk_queue_virt_boundary(q, shost->virt_boundary_mask);
        dma_set_max_seg_size(dev, queue_max_segment_size(q));

        /*
         * Set a reasonable default alignment:  The larger of 32-bit (dword,
         * i.e. 4 bytes), which is a common minimum for HBAs, and the minimum
         * DMA alignment, which is set by the platform.  The value is passed
         * as a mask, hence the "- 1".
         *
         * Devices that require a bigger alignment can increase it later.
         */
        blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment()) - 1);
}
EXPORT_SYMBOL_GPL(__scsi_init_queue);

/*
 * blk-mq operations without a .commit_rqs callback; every other entry is
 * the same as in scsi_mq_ops.
 */
static const struct blk_mq_ops scsi_mq_ops_no_commit = {
        /* request setup and teardown */
        .init_request        = scsi_mq_init_request,
        .exit_request        = scsi_mq_exit_request,
        .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq        = scsi_cleanup_rq,

        /* dispatch path */
        .get_budget        = scsi_mq_get_budget,
        .put_budget        = scsi_mq_put_budget,
        .queue_rq        = scsi_queue_rq,

        /* completion and timeout */
        .complete        = scsi_softirq_done,
        .timeout        = scsi_timeout,

        /* queue state and topology */
        .busy                = scsi_mq_lld_busy,
        .map_queues        = scsi_map_queues,
#ifdef CONFIG_BLK_DEBUG_FS
        .show_rq        = scsi_show_rq,
#endif
};


/*
 * .commit_rqs handler: forward to the host template's commit_rqs() hook
 * with the index of the hardware queue.
 */
static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
        struct scsi_device *sdev = hctx->queue->queuedata;
        struct Scsi_Host *host = sdev->host;

        host->hostt->commit_rqs(host, hctx->queue_num);
}

/*
 * blk-mq operations table installed by scsi_mq_setup_tags() when the host
 * template provides a ->commit_rqs callback; .commit_rqs is routed through
 * scsi_commit_rqs().
 */
static const struct blk_mq_ops scsi_mq_ops = {
        .get_budget        = scsi_mq_get_budget,
        .put_budget        = scsi_mq_put_budget,
        .queue_rq        = scsi_queue_rq,
        .commit_rqs        = scsi_commit_rqs,
        .complete        = scsi_softirq_done,
        .timeout        = scsi_timeout,
#ifdef CONFIG_BLK_DEBUG_FS
        .show_rq        = scsi_show_rq,
#endif
        .init_request        = scsi_mq_init_request,
        .exit_request        = scsi_mq_exit_request,
        .initialize_rq_fn = scsi_initialize_rq,
        .cleanup_rq        = scsi_cleanup_rq,
        .busy                = scsi_mq_lld_busy,
        .map_queues        = scsi_map_queues,
};

struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
{
        sdev->request_queue = blk_mq_init_queue(&sdev->host->tag_set);
        if (IS_ERR(sdev->request_queue))
                return NULL;

        sdev->request_queue->queuedata = sdev;
        __scsi_init_queue(sdev->host, sdev->request_queue);
        blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, sdev->request_queue);
        return sdev->request_queue;
}

/*
 * Fill in and allocate the blk-mq tag set embedded in @shost.
 *
 * The per-request payload is sized for a struct scsi_cmnd, the host
 * template's private command area and an inline scatterlist; room for a
 * protection data buffer and its inline scatterlist is added when
 * scsi_host_get_prot() is non-zero.
 *
 * Returns the result of blk_mq_alloc_tag_set().
 */
int scsi_mq_setup_tags(struct Scsi_Host *shost)
{
        struct blk_mq_tag_set *set = &shost->tag_set;
        unsigned int inline_sgl, payload;

        inline_sgl = max_t(unsigned int, sizeof(struct scatterlist),
                           scsi_mq_inline_sgl_size(shost));

        payload = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + inline_sgl;
        if (scsi_host_get_prot(shost)) {
                payload += sizeof(struct scsi_data_buffer);
                payload += sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT;
        }

        memset(set, 0, sizeof(*set));

        /* Only advertise ->commit_rqs to blk-mq if the LLD implements it. */
        set->ops = shost->hostt->commit_rqs ? &scsi_mq_ops :
                                              &scsi_mq_ops_no_commit;
        set->nr_hw_queues = shost->nr_hw_queues ? shost->nr_hw_queues : 1;
        set->queue_depth = shost->can_queue;
        set->cmd_size = payload;
        set->numa_node = NUMA_NO_NODE;
        set->driver_data = shost;

        set->flags = BLK_MQ_F_SHOULD_MERGE |
                     BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
        if (shost->host_tagset)
                set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;

        return blk_mq_alloc_tag_set(set);
}

void scsi_mq_free_tags(struct kref *kref)
{
        struct Scsi_Host *shost = container_of(kref, typeof(*shost),
                                               tagset_refcnt);

        blk_mq_free_tag_set(&shost->tag_set);
        complete(&shost->tagset_freed);
}

/**
 * scsi_device_from_queue - return sdev associated with a request_queue
 * @q: The request queue to return the sdev from
 *
 * A queue is treated as a SCSI queue when its mq_ops pointer is one of the
 * two SCSI blk-mq operation tables.  On success a reference on the sdev's
 * generic device has been taken with get_device().
 *
 * Return the sdev associated with a request queue or NULL if the
 * request_queue does not reference a SCSI device (or the reference could
 * not be taken).
 */
struct scsi_device *scsi_device_from_queue(struct request_queue *q)
{
        struct scsi_device *found;

        if (q->mq_ops != &scsi_mq_ops_no_commit && q->mq_ops != &scsi_mq_ops)
                return NULL;

        found = q->queuedata;
        if (!found)
                return NULL;
        if (!get_device(&found->sdev_gendev))
                return NULL;

        return found;
}

/**
 * scsi_block_requests - Utility function used by low-level drivers to prevent
 * further commands from being queued to the device.
 * @shost:  host in question
 *
 * Sets the host's host_self_blocked flag.
 *
 * There is no timer nor any other means by which the requests get unblocked
 * other than the low-level driver calling scsi_unblock_requests().
 */
void scsi_block_requests(struct Scsi_Host *shost)
{
        shost->host_self_blocked = 1;
}
EXPORT_SYMBOL(scsi_block_requests);

/**
 * scsi_unblock_requests - Utility function used by low-level drivers to allow
 * further commands to be queued to the device.
 * @shost:  host in question
 *
 * Clears the host's host_self_blocked flag and then runs the host queues.
 *
 * There is no timer nor any other means by which the requests get unblocked
 * other than the low-level driver calling scsi_unblock_requests(). This is done
 * as an API function so that changes to the internals of the scsi mid-layer
 * won't require wholesale changes to drivers that use this feature.
 */
void scsi_unblock_requests(struct Scsi_Host *shost)
{
        shost->host_self_blocked = 0;
        /* Kick the queues only after the flag has been cleared. */
        scsi_run_host_queues(shost);
}
EXPORT_SYMBOL(scsi_unblock_requests);

/* Destroy the two sense-buffer slab caches (regular and ISA DMA). */
void scsi_exit_queue(void)
{
        kmem_cache_destroy(scsi_sense_cache);
        kmem_cache_destroy(scsi_sense_isadma_cache);
}

/**
 *        scsi_mode_select - issue a mode select
 *        @sdev:        SCSI device the command is sent to
 *        @pf:        Page format bit (1 == standard, 0 == vendor specific)
 *        @sp:        Save page bit (0 == don't save, 1 == save)
 *        @modepage: mode page being selected (not used to build the command)
 *        @buffer: mode page data; copied behind the generated mode header
 *        @len:        length of @buffer in bytes, must not be negative
 *        @timeout: command timeout
 *        @retries: number of retries before failing
 *        @data: mode header values (medium type, device specific byte,
 *                LONGLBA flag, block descriptor length) written into the
 *                parameter list header
 *        @sshdr: place to put sense data (or NULL if no sense to be collected).
 *
 *        MODE SELECT(10) is used when the device asks for it or when the
 *        request does not fit MODE SELECT(6); otherwise MODE SELECT(6) is
 *        used.  The header plus @buffer are sent from a temporary kmalloc'ed
 *        buffer that is freed before returning.
 *
 *        Returns zero if successful; negative error number or scsi
 *        status on error
 *
 */
int
scsi_mode_select(struct scsi_device *sdev, int pf, int sp, int modepage,
                 unsigned char *buffer, int len, int timeout, int retries,
                 struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
        unsigned char cmd[10];
        unsigned char *real_buffer;
        int ret;

        /*
         * A negative length would make the allocation smaller than the
         * header while memcpy() would treat it as a huge unsigned size.
         */
        if (len < 0)
                return -EINVAL;

        memset(cmd, 0, sizeof(cmd));
        cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0);

        /*
         * Use MODE SELECT(10) if the device asked for it or if the mode page
         * and the mode select header cannot fit within the maximum 255 bytes
         * of the MODE SELECT(6) command.
         */
        if (sdev->use_10_for_ms ||
            len + 4 > 255 ||
            data->block_descriptor_length > 255) {
                /* The parameter list length field of the CDB is 16 bits. */
                if (len > 65535 - 8)
                        return -EINVAL;
                real_buffer = kmalloc(8 + len, GFP_KERNEL);
                if (!real_buffer)
                        return -ENOMEM;
                memcpy(real_buffer + 8, buffer, len);
                len += 8;
                /* 8-byte mode parameter header */
                real_buffer[0] = 0;
                real_buffer[1] = 0;
                real_buffer[2] = data->medium_type;
                real_buffer[3] = data->device_specific;
                real_buffer[4] = data->longlba ? 0x01 : 0;
                real_buffer[5] = 0;
                put_unaligned_be16(data->block_descriptor_length,
                                   &real_buffer[6]);

                cmd[0] = MODE_SELECT_10;
                put_unaligned_be16(len, &cmd[7]);
        } else {
                /* The 4-byte header has no room for the LONGLBA flag. */
                if (data->longlba)
                        return -EINVAL;

                real_buffer = kmalloc(4 + len, GFP_KERNEL);
                if (!real_buffer)
                        return -ENOMEM;
                memcpy(real_buffer + 4, buffer, len);
                len += 4;
                /* 4-byte mode parameter header */
                real_buffer[0] = 0;
                real_buffer[1] = data->medium_type;
                real_buffer[2] = data->device_specific;
                real_buffer[3] = data->block_descriptor_length;

                cmd[0] = MODE_SELECT;
                cmd[4] = len;
        }

        ret = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, real_buffer, len,
                               sshdr, timeout, retries, NULL);
        kfree(real_buffer);
        return ret;
}
EXPORT_SYMBOL_GPL(scsi_mode_select);

/**
 *        scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary.
 *        @sdev:        SCSI device to be queried
 *        @dbd:        set if mode sense will allow block descriptors to be returned
 *        @modepage: mode page being requested
 *        @buffer: request buffer (may not be smaller than eight bytes)
 *        @len:        length of request buffer.
 *        @timeout: command timeout
 *        @retries: number of retries before failing
 *        @data: returns a structure abstracting the mode header data
 *        @sshdr: place to put sense data (or NULL if no sense to be collected).
 *                must be SCSI_SENSE_BUFFERSIZE big.
 *
 *        Returns zero if successful, or a negative error number on failure
 */
int
scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage,
                  unsigned char *buffer, int len, int timeout, int retries,
                  struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
        unsigned char cmd[12];
        int use_10_for_ms;
        int header_length;
        int result, retry_count = retries;
        struct scsi_sense_hdr my_sshdr;

        memset(data, 0, sizeof(*data));
        memset(&cmd[0], 0, 12);

        dbd = sdev->set_dbd_for_ms ? 8 : dbd;
        cmd[1] = dbd & 0x18;        /* allows DBD and LLBA bits */
        cmd[2] = modepage;

        /* caller might not be interested in sense, but we need it */
        if (!sshdr)
                sshdr = &my_sshdr;

 retry:
        use_10_for_ms = sdev->use_10_for_ms;

        if (use_10_for_ms) {
                if (len < 8)
                        len = 8;

                cmd[0] = MODE_SENSE_10;
                cmd[8] = len;
                header_length = 8;
        } else {
                if (len < 4)
                        len = 4;

                cmd[0] = MODE_SENSE;
                cmd[4] = len;
                header_length = 4;
        }

        memset(buffer, 0, len);

        result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buffer, len,
                                  sshdr, timeout, retries, NULL);
        if (result < 0)
                return result;

        /* This code looks awful: what it's doing is making sure an
         * ILLEGAL REQUEST sense return identifies the actual command
         * byte as the problem.  MODE_SENSE commands can return
         * ILLEGAL REQUEST if the code page isn't supported */

        if (use_10_for_ms && !scsi_status_is_good(result) &&
            driver_byte(result) == DRIVER_SENSE) {
                if (scsi_sense_valid(sshdr)) {
                        if ((sshdr->sense_key == ILLEGAL_REQUEST) &&
                            (sshdr->asc == 0x20) && (sshdr->ascq == 0)) {
                                /*
                                 * Invalid command operation code
                                 */
                                sdev->use_10_for_ms = 0;
                                goto retry;
                        }
                }
        }

        if (scsi_status_is_good(result)) {
                if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b &&
                             (modepage == 6 || modepage == 8))) {
                        /* Initio breakage? */
                        header_length = 0;
                        data->length = 13;
                        data->medium_type = 0;
                        data->device_specific = 0;
                        data->longlba = 0;
                        data->block_descriptor_length = 0;
                } else if (use_10_for_ms) {
                        data->length = buffer[0]*256 + buffer[1] + 2;
                        data->medium_type = buffer[2];
                        data->device_specific = buffer[3];
                        data->longlba = buffer[4] & 0x01;
                        data->block_descriptor_length = buffer[6]*256
                                + buffer[7];
                } else {
                        data->length = buffer[0] + 1;
                        data->medium_type = buffer[1];
                        data->device_specific = buffer[2];
                        data->block_descriptor_length = buffer[3];
                }
                data->header_length = header_length;
                result = 0;
        } else if ((status_byte(result) == CHECK_CONDITION) &&
                   scsi_sense_valid(sshdr) &&
                   sshdr->sense_key == UNIT_ATTENTION && retry_count) {
                retry_count--;
                goto retry;
        }
        if (result > 0)
                result = -EIO;
        return result;
}
EXPORT_SYMBOL(scsi_mode_sense);

/**
 *        scsi_test_unit_ready - test if unit is ready
 *        @sdev:        scsi device to send TEST UNIT READY to.
 *        @timeout: command timeout
 *        @retries: number of times a UNIT ATTENTION answer is retried
 *        @sshdr: output pointer for decoded sense information.
 *
 *        The command is reissued while the decoded sense reports
 *        UNIT ATTENTION and retries remain.
 *
 *        Returns zero if successful or an error if TUR failed.  For
 *        removable media, UNIT_ATTENTION sets ->changed flag.
 **/
int
scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
                     struct scsi_sense_hdr *sshdr)
{
        char cmd[] = {
                TEST_UNIT_READY, 0, 0, 0, 0, 0,
        };
        int result;

        /* try to eat the UNIT_ATTENTION if there are enough retries */
        for (;;) {
                result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL, 0, sshdr,
                                          timeout, 1, NULL);

                if (!scsi_sense_valid(sshdr) ||
                    sshdr->sense_key != UNIT_ATTENTION)
                        break;

                if (sdev->removable)
                        sdev->changed = 1;

                if (!--retries)
                        break;
        }

        return result;
}
EXPORT_SYMBOL(scsi_test_unit_ready);

/**
 *        scsi_device_set_state - Take the given device through the device state model.
 *        @sdev:        scsi device to change the state of.
 *        @state:        state to change to.
 *
 *        Returns zero if successful or an error if the requested
 *        transition is illegal.
 */
int
scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
{
        enum scsi_device_state oldstate = sdev->sdev_state;

        if (state == oldstate)
                return 0;

        switch (state) {
        case SDEV_CREATED:
                switch (oldstate) {
                case SDEV_CREATED_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_RUNNING:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                case SDEV_QUIESCE:
                case SDEV_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_QUIESCE:
                switch (oldstate) {
                case SDEV_RUNNING:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_QUIESCE:
                case SDEV_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_BLOCK:
                switch (oldstate) {
                case SDEV_RUNNING:
                case SDEV_CREATED_BLOCK:
                case SDEV_QUIESCE:
                case SDEV_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_CREATED_BLOCK:
                switch (oldstate) {
                case SDEV_CREATED:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_CANCEL:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_QUIESCE:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_DEL:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                case SDEV_CANCEL:
                case SDEV_BLOCK:
                case SDEV_CREATED_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        }
        sdev->offline_already = false;
        sdev->sdev_state = state;
        return 0;

 illegal:
        SCSI_LOG_ERROR_RECOVERY(1,
                                sdev_printk(KERN_ERR, sdev,
                                            "Illegal state transition %s->%s",
                                            scsi_device_state_name(oldstate),
                                            scsi_device_state_name(state))
                                );
        return -EINVAL;
}
EXPORT_SYMBOL(scsi_device_set_state);

/**
 *        scsi_evt_emit - emit a single SCSI device uevent
 *        @sdev: associated SCSI device
 *        @evt: event to emit
 *
 *        Send a single KOBJ_CHANGE uevent (scsi_event) to the associated
 *        scsi_device.  Known event types add one environment string;
 *        unknown types send the uevent without one.  An inquiry-change
 *        event additionally triggers a rescan of the device.
 */
static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
{
        char *var = NULL;
        char *envp[3];
        int n = 0;

        switch (evt->evt_type) {
        case SDEV_EVT_MEDIA_CHANGE:
                var = "SDEV_MEDIA_CHANGE=1";
                break;
        case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
                scsi_rescan_device(&sdev->sdev_gendev);
                var = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
                var = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
                var = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
                break;
        case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
                var = "SDEV_UA=MODE_PARAMETERS_CHANGED";
                break;
        case SDEV_EVT_LUN_CHANGE_REPORTED:
                var = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
                var = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
                break;
        case SDEV_EVT_POWER_ON_RESET_OCCURRED:
                var = "SDEV_UA=POWER_ON_RESET_OCCURRED";
                break;
        default:
                /* unknown event type: no extra variable */
                break;
        }

        if (var)
                envp[n++] = var;
        envp[n] = NULL;        /* terminate the environment array */

        kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
}

/**
 *         sdev_evt_thread - send a uevent for each scsi event
 *        @work: work struct for scsi_device
 *
 *        Dispatch queued events to their associated scsi_device kobjects
 *        as uevents.
 */
void scsi_evt_thread(struct work_struct *work)
{
        struct scsi_device *sdev;
        enum scsi_device_event evt_type;
        LIST_HEAD(event_list);

        sdev = container_of(work, struct scsi_device, event_work);

        for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++)
                if (test_and_clear_bit(evt_type, sdev->pending_events))
                        sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL);

        while (1) {
                struct scsi_event *evt;
                struct list_head *this, *tmp;
                unsigned long flags;

                spin_lock_irqsave(&sdev->list_lock, flags);
                list_splice_init(&sdev->event_list, &event_list);
                spin_unlock_irqrestore(&sdev->list_lock, flags);

                if (list_empty(&event_list))
                        break;

                list_for_each_safe(this, tmp, &event_list) {
                        evt = list_entry(this, struct scsi_event, node);
                        list_del(&evt->node);
                        scsi_evt_emit(sdev, evt);
                        kfree(evt);
                }
        }
}

/**
 *         sdev_evt_send - send asserted event to uevent thread
 *        @sdev: scsi_device event occurred on
 *        @evt: event to send
 *
 *        Assert scsi device event asynchronously: @evt is appended to the
 *        device's event list and the device's event work is scheduled.
 */
void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt)
{
        unsigned long flags;

#if 0
        /* FIXME: currently this check eliminates all media change events
         * for polled devices.  Need to update to discriminate between AN
         * and polled events */
        if (!test_bit(evt->evt_type, sdev->supported_events)) {
                kfree(evt);
                return;
        }
#endif

        /* list_lock protects event_list; irqsave variant is used here. */
        spin_lock_irqsave(&sdev->list_lock, flags);
        list_add_tail(&evt->node, &sdev->event_list);
        schedule_work(&sdev->event_work);
        spin_unlock_irqrestore(&sdev->list_lock, flags);
}
EXPORT_SYMBOL_GPL(sdev_evt_send);

/**
 *         sdev_evt_alloc - allocate a new scsi event
 *        @evt_type: type of event to allocate
 *        @gfpflags: GFP flags for allocation
 *
 *        Allocates and returns a new zero-initialised scsi_event with its
 *        type and list node set up, or NULL if the allocation fails.
 */
struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
                                  gfp_t gfpflags)
{
        struct scsi_event *evt;

        evt = kzalloc(sizeof(*evt), gfpflags);
        if (evt) {
                evt->evt_type = evt_type;
                INIT_LIST_HEAD(&evt->node);
                /* No event type needs additional initialisation. */
        }

        return evt;
}
EXPORT_SYMBOL_GPL(sdev_evt_alloc);

/**
 *         sdev_evt_send_simple - send asserted event to uevent thread
 *        @sdev: scsi_device event occurred on
 *        @evt_type: type of event to send
 *        @gfpflags: GFP flags for allocation
 *
 *        Assert scsi device event asynchronously, given an event type.
 *        If the event cannot be allocated it is dropped and an error is
 *        logged.
 */
void sdev_evt_send_simple(struct scsi_device *sdev,
                          enum scsi_device_event evt_type, gfp_t gfpflags)
{
        struct scsi_event *evt;

        evt = sdev_evt_alloc(evt_type, gfpflags);
        if (evt) {
                sdev_evt_send(sdev, evt);
                return;
        }

        sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n",
                    evt_type);
}
EXPORT_SYMBOL_GPL(sdev_evt_send_simple);

/**
 *        scsi_device_quiesce - Block all commands except power management.
 *        @sdev:        scsi device to quiesce.
 *
 *        This works by trying to transition to the SDEV_QUIESCE state
 *        (which must be a legal transition).  When the device is in this
 *        state, only power management requests will be accepted, all others will
 *        be deferred.
 *
 *        Must be called with user context, may sleep.
 *
 *        Returns zero if successful (including when the calling task has
 *        already quiesced the device) or an error if not.
 */
int
scsi_device_quiesce(struct scsi_device *sdev)
{
        struct request_queue *queue = sdev->request_queue;
        int ret;

        /*
         * Repeated calls from the task that already quiesced the device
         * are fine; quiescing from a second task at the same time is not
         * supported and triggers the warning below.
         */
        WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);

        if (sdev->quiesced_by == current)
                return 0;

        blk_set_pm_only(queue);

        blk_mq_freeze_queue(queue);
        /*
         * Ensure that the effect of blk_set_pm_only() will be visible
         * for percpu_ref_tryget() callers that occur after the queue
         * unfreeze even if the queue was already frozen before this function
         * was called. See also https://lwn.net/Articles/573497/.
         */
        synchronize_rcu();
        blk_mq_unfreeze_queue(queue);

        mutex_lock(&sdev->state_mutex);
        ret = scsi_device_set_state(sdev, SDEV_QUIESCE);
        if (ret)
                blk_clear_pm_only(queue);        /* undo on illegal transition */
        else
                sdev->quiesced_by = current;
        mutex_unlock(&sdev->state_mutex);

        return ret;
}
EXPORT_SYMBOL(scsi_device_quiesce);

/**
 *        scsi_device_resume - Restart user issued commands to a quiesced device.
 *        @sdev:        scsi device to resume.
 *
 *        Moves the device from quiesced back to running and restarts the
 *        queues.  If a quiescing task is recorded, the record is cleared
 *        and the queue's pm-only restriction is dropped.
 *
 *        Must be called with user context, may sleep.
 */
void scsi_device_resume(struct scsi_device *sdev)
{
        /* check if the device state was mutated prior to resume, and if
         * so assume the state is being managed elsewhere (for example
         * device deleted during suspend)
         */
        mutex_lock(&sdev->state_mutex);
        if (sdev->sdev_state == SDEV_QUIESCE)
                scsi_device_set_state(sdev, SDEV_RUNNING);
        /* Undo the blk_set_pm_only() done when the device was quiesced. */
        if (sdev->quiesced_by) {
                sdev->quiesced_by = NULL;
                blk_clear_pm_only(sdev->request_queue);
        }
        mutex_unlock(&sdev->state_mutex);
}
EXPORT_SYMBOL(scsi_device_resume);

/*
 * starget_for_each_device() callback: quiesce one device.  @data is unused
 * and the result of scsi_device_quiesce() is ignored.
 */
static void
device_quiesce_fn(struct scsi_device *sdev, void *data)
{
        scsi_device_quiesce(sdev);
}

/* Call scsi_device_quiesce() on every device of @starget. */
void
scsi_target_quiesce(struct scsi_target *starget)
{
        starget_for_each_device(starget, NULL, device_quiesce_fn);
}
EXPORT_SYMBOL(scsi_target_quiesce);

/* starget_for_each_device() callback: resume one device.  @data is unused. */
static void
device_resume_fn(struct scsi_device *sdev, void *data)
{
        scsi_device_resume(sdev);
}

/* Call scsi_device_resume() on every device of @starget. */
void
scsi_target_resume(struct scsi_target *starget)
{
        starget_for_each_device(starget, NULL, device_resume_fn);
}
EXPORT_SYMBOL(scsi_target_resume);

/**
 * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
 * @sdev: device to block
 *
 * Pause SCSI command processing on the specified device. Does not sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Notes:
 * This routine transitions the device to the SDEV_BLOCK state, or to
 * SDEV_CREATED_BLOCK if the former is not a legal transition. When the device
 * is in this state, command processing is paused until the device leaves the
 * blocked state. See also scsi_internal_device_unblock_nowait().
 */
int scsi_internal_device_block_nowait(struct scsi_device *sdev)
{
        int ret;

        ret = scsi_device_set_state(sdev, SDEV_BLOCK);
        if (ret)
                ret = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
        if (ret)
                return ret;

        /*
         * The device is now blocked: keep the block layer from handing
         * further requests on this device's queue to the midlayer.
         */
        blk_mq_quiesce_queue_nowait(sdev->request_queue);
        return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);

/**
 * scsi_internal_device_block - try to transition to the SDEV_BLOCK state
 * @sdev: device to block
 *
 * Pause SCSI command processing on the specified device and wait until all
 * ongoing scsi_request_fn() / scsi_queue_rq() calls have finished. May sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Note:
 * The state change is performed by scsi_internal_device_block_nowait() while
 * holding the device's state_mutex; on success the queue is additionally
 * quiesced with the waiting blk_mq_quiesce_queue(). See also
 * scsi_internal_device_unblock().
 */
static int scsi_internal_device_block(struct scsi_device *sdev)
{
        struct request_queue *queue = sdev->request_queue;
        int ret;

        mutex_lock(&sdev->state_mutex);
        ret = scsi_internal_device_block_nowait(sdev);
        if (!ret)
                blk_mq_quiesce_queue(queue);
        mutex_unlock(&sdev->state_mutex);

        return ret;
}

/* Unquiesce the blk-mq request queue of @sdev. */
void scsi_start_queue(struct scsi_device *sdev)
{
        struct request_queue *q = sdev->request_queue;

        blk_mq_unquiesce_queue(q);
}

/**
 * scsi_internal_device_unblock_nowait - resume a device after a block request
 * @sdev:        device to resume
 * @new_state:        state to set the device to after unblocking; only
 *                SDEV_RUNNING and SDEV_TRANSPORT_OFFLINE are accepted
 *
 * Restart the device queue for a previously suspended SCSI device. Does not
 * sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Notes:
 * This routine transitions the device to the SDEV_RUNNING state or to one of
 * the offline states (which must be a legal transition) allowing the midlayer
 * to goose the queue for this device.
 */
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
                                        enum scsi_device_state new_state)
{
        if (new_state != SDEV_RUNNING && new_state != SDEV_TRANSPORT_OFFLINE)
                return -EINVAL;

        /*
         * Update the device state according to where it currently is,
         * then restart the queue.
         */
        switch (sdev->sdev_state) {
        case SDEV_BLOCK:
        case SDEV_TRANSPORT_OFFLINE:
                sdev->sdev_state = new_state;
                break;
        case SDEV_CREATED_BLOCK:
                /* A device blocked while created returns to SDEV_CREATED. */
                sdev->sdev_state = (new_state == SDEV_TRANSPORT_OFFLINE ||
                                    new_state == SDEV_OFFLINE) ?
                                   new_state : SDEV_CREATED;
                break;
        case SDEV_CANCEL:
        case SDEV_OFFLINE:
                /* state is left alone, the queue is still restarted */
                break;
        default:
                return -EINVAL;
        }

        scsi_start_queue(sdev);
        return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait);

/**
 * scsi_internal_device_unblock - resume a device after a block request
 * @sdev:        device to resume
 * @new_state:        state to set the device to after unblocking
 *
 * Restart the device queue for a previously suspended SCSI device. May sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Notes:
 * This routine transitions the device to the SDEV_RUNNING state or to one of
 * the offline states (which must be a legal transition) allowing the midlayer
 * to goose the queue for this device.  It is the
 * scsi_internal_device_unblock_nowait() operation performed with the
 * device's state_mutex held.
 */
static int scsi_internal_device_unblock(struct scsi_device *sdev,
                                        enum scsi_device_state new_state)
{
        int ret;

        mutex_lock(&sdev->state_mutex);
        ret = scsi_internal_device_unblock_nowait(sdev, new_state);
        mutex_unlock(&sdev->state_mutex);

        return ret;
}

/*
 * starget_for_each_device() callback: block one device and warn (once) if
 * that fails.  @data is unused.
 */
static void
device_block(struct scsi_device *sdev, void *data)
{
        int ret;

        ret = scsi_internal_device_block(sdev);

        WARN_ONCE(ret, "scsi_internal_device_block(%s) failed: ret = %d\n",
                  dev_name(&sdev->sdev_gendev), ret);
}

/*
 * device_for_each_child() callback: if @dev is a SCSI target, block all of
 * its devices.  Always returns 0.  @data is unused.
 */
static int
target_block(struct device *dev, void *data)
{
        if (scsi_is_target_device(dev))
                starget_for_each_device(to_scsi_target(dev), NULL,
                                        device_block);
        return 0;
}

/*
 * Block all SCSI devices below @dev.  If @dev is itself a SCSI target its
 * devices are blocked directly; otherwise each child of @dev that is a
 * SCSI target is handled by target_block().
 */
void
scsi_target_block(struct device *dev)
{
        if (!scsi_is_target_device(dev)) {
                device_for_each_child(dev, NULL, target_block);
                return;
        }

        starget_for_each_device(to_scsi_target(dev), NULL, device_block);
}
EXPORT_SYMBOL_GPL(scsi_target_block);

static void
device_unblock(struct scsi_device *sdev, void *data)
{
        scsi_internal_device_unblock(sdev, *(enum scsi_device_state *)data);
}

/*
 * device_for_each_child() callback: unblock every device of @dev when @dev
 * is a SCSI target, ignore it otherwise. @data is forwarded unchanged to
 * device_unblock(). Always returns 0.
 */
static int
target_unblock(struct device *dev, void *data)
{
        if (!scsi_is_target_device(dev))
                return 0;

        starget_for_each_device(to_scsi_target(dev), data, device_unblock);
        return 0;
}

/*
 * Unblock all SCSI devices below @dev, moving each of them to @new_state.
 * When @dev is itself a SCSI target its devices are unblocked directly;
 * otherwise each child of @dev is visited and the children that are SCSI
 * targets are unblocked.
 */
void
scsi_target_unblock(struct device *dev, enum scsi_device_state new_state)
{
        if (!scsi_is_target_device(dev)) {
                device_for_each_child(dev, &new_state, target_unblock);
                return;
        }

        starget_for_each_device(to_scsi_target(dev), &new_state,
                                device_unblock);
}
EXPORT_SYMBOL_GPL(scsi_target_unblock);

/*
 * Block every device attached to @shost.
 *
 * Returns 0 when all devices were blocked, otherwise the error reported for
 * the first device that could not be blocked (later devices are not
 * visited).
 */
int
scsi_host_block(struct Scsi_Host *shost)
{
        struct scsi_device *sdev;
        int err = 0;

        /*
         * Call scsi_internal_device_block_nowait so we can avoid
         * calling synchronize_rcu() for each LUN.
         */
        shost_for_each_device(sdev, shost) {
                mutex_lock(&sdev->state_mutex);
                err = scsi_internal_device_block_nowait(sdev);
                mutex_unlock(&sdev->state_mutex);
                if (!err)
                        continue;

                /*
                 * Leaving the iteration early: presumably the iterator
                 * holds a reference on @sdev that must be dropped here -
                 * verify against shost_for_each_device().
                 */
                scsi_device_put(sdev);
                break;
        }

        /*
         * SCSI never enables blk-mq's BLK_MQ_F_BLOCKING flag so
         * calling synchronize_rcu() once is enough.
         */
        WARN_ON_ONCE(shost->tag_set.flags & BLK_MQ_F_BLOCKING);

        if (err)
                return err;

        synchronize_rcu();
        return 0;
}
EXPORT_SYMBOL_GPL(scsi_host_block);

/*
 * Unblock every device attached to @shost, moving each to @new_state.
 *
 * Returns 0 when all devices were unblocked, otherwise the error reported
 * for the first device that failed (later devices are not visited).
 */
int
scsi_host_unblock(struct Scsi_Host *shost, int new_state)
{
        struct scsi_device *sdev;
        int err = 0;

        shost_for_each_device(sdev, shost) {
                err = scsi_internal_device_unblock(sdev, new_state);
                if (!err)
                        continue;

                /*
                 * Leaving the iteration early: presumably the iterator
                 * holds a reference on @sdev that must be dropped here -
                 * verify against shost_for_each_device().
                 */
                scsi_device_put(sdev);
                break;
        }
        return err;
}
EXPORT_SYMBOL_GPL(scsi_host_unblock);

/**
 * scsi_kmap_atomic_sg - find and atomically map an sg-element
 * @sgl:        scatter-gather list
 * @sg_count:        number of segments in sg
 * @offset:        offset in bytes into sg, on return offset into the mapped area
 * @len:        bytes to map, on return number of bytes mapped
 *
 * Warns when called with interrupts enabled.
 *
 * Returns virtual address of the start of the mapped page, or NULL (after
 * logging an error and a warning) when @offset lies beyond the last of the
 * @sg_count entries.
 */
void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count,
                          size_t *offset, size_t *len)
{
        struct scatterlist *sg;
        struct page *page;
        size_t total = 0;        /* bytes in all entries visited so far */
        size_t before = 0;        /* bytes in the entries preceding @sg */
        size_t room;
        int idx;

        WARN_ON(!irqs_disabled());

        /* Find the entry that contains byte *offset of the whole list. */
        for_each_sg(sgl, sg, sg_count, idx) {
                before = total;
                total += sg->length;
                if (*offset < total)
                        break;
        }

        /* Ran off the end of the list: the offset is out of range. */
        if (unlikely(idx == sg_count)) {
                printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, "
                        "elements %d\n",
                       __func__, total, *offset, sg_count);
                WARN_ON(1);
                return NULL;
        }

        /* Offset starting from the beginning of first page in this sg-entry */
        *offset = *offset - before + sg->offset;

        /* Assumption: contiguous pages can be accessed as "page + i" */
        page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT));
        *offset &= ~PAGE_MASK;

        /* Never map past the end of the selected page. */
        room = PAGE_SIZE - *offset;
        if (*len > room)
                *len = room;

        return kmap_atomic(page);
}
EXPORT_SYMBOL(scsi_kmap_atomic_sg);

/**
 * scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg
 * @virt:        virtual address to be unmapped
 *
 * Thin wrapper that hands @virt straight to kunmap_atomic().
 */
void scsi_kunmap_atomic_sg(void *virt)
{
        kunmap_atomic(virt);
}
EXPORT_SYMBOL(scsi_kunmap_atomic_sg);

/*
 * Increment @sdev's disk_events_disable_depth counter.
 * sdev_enable_disk_events() decrements the same counter, so calls are
 * expected to be paired.
 */
void sdev_disable_disk_events(struct scsi_device *sdev)
{
        atomic_inc(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_disable_disk_events);

/*
 * Decrement @sdev's disk_events_disable_depth counter. If the counter is
 * already zero or negative, warn once and leave it untouched.
 */
void sdev_enable_disk_events(struct scsi_device *sdev)
{
        if (!WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0))
                atomic_dec(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_enable_disk_events);

/*
 * Rank a designation descriptor for use as a LUN identifier.
 *
 * @d points at the 4-byte descriptor header; d[1] carries the association
 * and designator type, d[3] the designator length and d[4] the first
 * designator byte.
 *
 * Returns 0 for descriptors that must not be used, otherwise a priority in
 * which a larger value means a more preferred descriptor.
 */
static unsigned char designator_prio(const unsigned char *d)
{
        /* NAA priority, indexed by the NAA field (top nibble of d[4]) */
        static const unsigned char naa_prio[16] = {
                [6] = 8,        /* NAA registered extended */
                [5] = 5,        /* NAA registered */
                [4] = 4,        /* NAA extended */
                [3] = 1,        /* NAA locally assigned */
        };
        unsigned char type;

        /* not associated with LUN */
        if (d[1] & 0x30)
                return 0;

        /* invalid length */
        if (d[3] == 0)
                return 0;

        /*
         * Order of preference for lun descriptor:
         * - SCSI name string
         * - NAA IEEE Registered Extended
         * - EUI-64 based 16-byte
         * - EUI-64 based 12-byte
         * - NAA IEEE Registered
         * - NAA IEEE Extended
         * - EUI-64 based 8-byte
         * - SCSI name string (truncated)
         * - T10 Vendor ID
         * as longer descriptors reduce the likelihood
         * of identification clashes.
         */
        type = d[1] & 0xf;

        /* SCSI name string, variable-length UTF-8 */
        if (type == 8)
                return 9;

        /* NAA: unknown NAA values map to 0 in the table */
        if (type == 3)
                return naa_prio[d[4] >> 4];

        /* EUI64-based: only the 16, 12 and 8 byte forms are accepted */
        if (type == 2) {
                if (d[3] == 16)
                        return 7;
                if (d[3] == 12)
                        return 6;
                if (d[3] == 8)
                        return 3;
                return 0;
        }

        /* T10 vendor ID */
        if (type == 1)
                return 1;

        return 0;
}

/**
 * scsi_vpd_lun_id - return a unique device identification
 * @sdev: SCSI device
 * @id:   buffer for the identification
 * @id_len:  length of the buffer
 *
 * Copies a unique device identification into @id based
 * on the information in the VPD page 0x83 of the device.
 * The string will be formatted as a SCSI name string.
 *
 * The page is walked designator by designator and the one ranked highest
 * by designator_prio() is kept. A designator whose header or body would
 * extend past the end of the cached page ends the walk, so a truncated or
 * malformed page is never read out of bounds.
 *
 * Returns the length of the identification or error on failure:
 * -ENXIO when the device has no cached VPD page 0x83, -EINVAL when @id_len
 * is smaller than 21 or no usable designator was found.
 * If the identifier is longer than the supplied buffer the actual
 * identifier length is returned and the buffer is not zero-padded.
 */
int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len)
{
        u8 cur_id_prio = 0;
        u8 cur_id_size = 0;
        const unsigned char *d, *end, *cur_id_str;
        const struct scsi_vpd *vpd_pg83;
        int id_size = -EINVAL;

        rcu_read_lock();
        vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
        if (!vpd_pg83) {
                rcu_read_unlock();
                return -ENXIO;
        }

        /* The id string must be at least 20 bytes + terminating NULL byte */
        if (id_len < 21) {
                rcu_read_unlock();
                return -EINVAL;
        }

        memset(id, 0, id_len);
        end = vpd_pg83->data + vpd_pg83->len;
        d = vpd_pg83->data + 4;
        /* A designator needs at least its 4-byte header inside the page */
        while (end - d >= 4) {
                u8 prio;

                /* Stop at a designator whose body overruns the page */
                if (d[3] > end - d - 4)
                        break;

                prio = designator_prio(d);
                if (prio == 0 || cur_id_prio > prio)
                        goto next_desig;

                switch (d[1] & 0xf) {
                case 0x1:
                        /* T10 Vendor ID */
                        if (cur_id_size > d[3])
                                break;
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        if (cur_id_size + 4 > id_len)
                                cur_id_size = id_len - 4;
                        cur_id_str = d + 4;
                        id_size = snprintf(id, id_len, "t10.%*pE",
                                           cur_id_size, cur_id_str);
                        break;
                case 0x2:
                        /* EUI-64 */
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        cur_id_str = d + 4;
                        switch (cur_id_size) {
                        case 8:
                                id_size = snprintf(id, id_len,
                                                   "eui.%8phN",
                                                   cur_id_str);
                                break;
                        case 12:
                                id_size = snprintf(id, id_len,
                                                   "eui.%12phN",
                                                   cur_id_str);
                                break;
                        case 16:
                                id_size = snprintf(id, id_len,
                                                   "eui.%16phN",
                                                   cur_id_str);
                                break;
                        default:
                                break;
                        }
                        break;
                case 0x3:
                        /* NAA */
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        cur_id_str = d + 4;
                        switch (cur_id_size) {
                        case 8:
                                id_size = snprintf(id, id_len,
                                                   "naa.%8phN",
                                                   cur_id_str);
                                break;
                        case 16:
                                id_size = snprintf(id, id_len,
                                                   "naa.%16phN",
                                                   cur_id_str);
                                break;
                        default:
                                break;
                        }
                        break;
                case 0x8:
                        /* SCSI name string */
                        if (cur_id_size > d[3])
                                break;
                        /* Prefer others for truncated descriptor */
                        if (d[3] > id_len) {
                                prio = 2;
                                if (cur_id_prio > prio)
                                        break;
                        }
                        cur_id_prio = prio;
                        cur_id_size = id_size = d[3];
                        cur_id_str = d + 4;
                        if (cur_id_size >= id_len)
                                cur_id_size = id_len - 1;
                        memcpy(id, cur_id_str, cur_id_size);
                        /*
                         * An identifier formatted earlier may have been
                         * longer than this one; clear what is left of it
                         * so the result is always NUL-terminated even if
                         * the name string itself carries no terminator.
                         */
                        memset(id + cur_id_size, 0, id_len - cur_id_size);
                        break;
                default:
                        break;
                }
next_desig:
                d += d[3] + 4;
        }
        rcu_read_unlock();

        return id_size;
}
EXPORT_SYMBOL(scsi_vpd_lun_id);

/**
 * scsi_vpd_tpg_id - return a target port group identifier
 * @sdev: SCSI device
 * @rel_id: optional; when non-NULL it receives the relative target port
 *          identifier, but only if a target port group identifier and a
 *          relative target port designator were both found
 *
 * Returns the Target Port Group identifier from the information
 * from VPD page 0x83 of the device.
 *
 * A designator whose header or body would extend past the end of the
 * cached page ends the walk, and designators too short to hold the 16-bit
 * identifier are ignored, so a truncated or malformed page is never read
 * out of bounds.
 *
 * Returns the identifier or error on failure: -ENXIO when the device has
 * no cached VPD page 0x83, -EAGAIN when the page holds no target port
 * group designator.
 */
int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id)
{
        const unsigned char *d, *end;
        const struct scsi_vpd *vpd_pg83;
        int group_id = -EAGAIN, rel_port = -1;

        rcu_read_lock();
        vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
        if (!vpd_pg83) {
                rcu_read_unlock();
                return -ENXIO;
        }

        end = vpd_pg83->data + vpd_pg83->len;
        d = vpd_pg83->data + 4;
        /* A designator needs at least its 4-byte header inside the page */
        while (end - d >= 4) {
                /* Stop at a designator whose body overruns the page */
                if (d[3] > end - d - 4)
                        break;

                switch (d[1] & 0xf) {
                case 0x4:
                        /* Relative target port: 16-bit id in d[6..7] */
                        if (d[3] >= 4)
                                rel_port = get_unaligned_be16(&d[6]);
                        break;
                case 0x5:
                        /* Target port group: 16-bit id in d[6..7] */
                        if (d[3] >= 4)
                                group_id = get_unaligned_be16(&d[6]);
                        break;
                default:
                        break;
                }
                d += d[3] + 4;
        }
        rcu_read_unlock();

        if (group_id >= 0 && rel_id && rel_port != -1)
                *rel_id = rel_port;

        return group_id;
}
EXPORT_SYMBOL(scsi_vpd_tpg_id);






































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: low-level thread information
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds and Dave Miller
 */

#ifndef _ASM_X86_THREAD_INFO_H
#define _ASM_X86_THREAD_INFO_H

#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/percpu.h>
#include <asm/types.h>

/*
 * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
 * reserve at the top of the kernel stack.  We do it because of a nasty
 * 32-bit corner case.  On x86_32, the hardware stack frame is
 * variable-length.  Except for vm86 mode, struct pt_regs assumes a
 * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
 * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
 * does in at least one case:
 *
 * If we take an NMI early enough in SYSENTER, then we can end up with
 * pt_regs that extends above sp0.  On the way out, in the espfix code,
 * we can read the saved SS value, but that value will be above sp0.
 * Without this offset, that can result in a page fault.  (We are
 * careful that, in this case, the value we read doesn't matter.)
 *
 * In vm86 mode, the hardware frame is much longer still, so add 16
 * bytes to make room for the real-mode segments.
 *
 * x86_64 has a fixed-length stack frame.
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_VM86
#  define TOP_OF_KERNEL_STACK_PADDING 16
# else
#  define TOP_OF_KERNEL_STACK_PADDING 8
# endif
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif

/*
 * low level task data that entry.S needs immediate access to
 * - this struct should fit entirely inside of one cache line
 * - this struct shares the supervisor stack pages
 */
#ifndef __ASSEMBLY__
struct task_struct;
#include <asm/cpufeature.h>
#include <linux/atomic.h>

/*
 * Per-thread low-level state.
 *
 * @flags:  presumably holds the TIF_* bits defined in this header -
 *          confirm against the users of this field.
 * @status: thread-synchronous TS_* bits (e.g. TS_COMPAT), touched only by
 *          the owning thread.
 */
struct thread_info {
        unsigned long                flags;                /* low level flags */
        u32                        status;                /* thread synchronous flags */
};

#define INIT_THREAD_INFO(tsk)                        \
{                                                \
        .flags                = 0,                        \
}

#else /* !__ASSEMBLY__ */

#include <asm/asm-offsets.h>

#endif

/*
 * thread information flags
 * - these are process state flags that various assembly files
 *   may need to access
 */
#define TIF_SYSCALL_TRACE        0        /* syscall trace active */
#define TIF_NOTIFY_RESUME        1        /* callback before returning to user */
#define TIF_SIGPENDING                2        /* signal pending */
#define TIF_NEED_RESCHED        3        /* rescheduling necessary */
#define TIF_SINGLESTEP                4        /* reenable singlestep on user return*/
#define TIF_SSBD                5        /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU                6        /* syscall emulation active */
#define TIF_SYSCALL_AUDIT        7        /* syscall auditing active */
#define TIF_SECCOMP                8        /* secure computing */
#define TIF_SPEC_IB                9        /* Indirect branch speculation mitigation */
#define TIF_SPEC_FORCE_UPDATE        10        /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY        11        /* notify kernel of userspace return */
#define TIF_UPROBE                12        /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING        13        /* pending live patching update */
#define TIF_NEED_FPU_LOAD        14        /* load FPU on return to userspace */
#define TIF_NOCPUID                15        /* CPUID is not accessible in userland */
#define TIF_NOTSC                16        /* TSC is not accessible in userland */
#define TIF_IA32                17        /* IA32 compatibility process */
#define TIF_SLD                        18        /* Restore split lock detection on context switch */
#define TIF_NOTIFY_SIGNAL        19        /* signal notifications exist */
#define TIF_MEMDIE                20        /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG        21        /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP                22        /* uses I/O bitmap */
#define TIF_FORCED_TF                24        /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP                25        /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES        27        /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT        28        /* syscall tracepoint instrumentation */
#define TIF_ADDR32                29        /* 32-bit address space on 64 bits */
#define TIF_X32                        30        /* 32-bit native x86-64 binary */

#define _TIF_SYSCALL_TRACE        (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME        (1 << TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED        (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP                (1 << TIF_SINGLESTEP)
#define _TIF_SSBD                (1 << TIF_SSBD)
#define _TIF_SYSCALL_EMU        (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT        (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP                (1 << TIF_SECCOMP)
#define _TIF_SPEC_IB                (1 << TIF_SPEC_IB)
#define _TIF_SPEC_FORCE_UPDATE        (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE                (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING        (1 << TIF_PATCH_PENDING)
#define _TIF_NEED_FPU_LOAD        (1 << TIF_NEED_FPU_LOAD)
#define _TIF_NOCPUID                (1 << TIF_NOCPUID)
#define _TIF_NOTSC                (1 << TIF_NOTSC)
#define _TIF_IA32                (1 << TIF_IA32)
#define _TIF_NOTIFY_SIGNAL        (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_SLD                (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG        (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP                (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF                (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP                (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES        (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32                (1 << TIF_ADDR32)
#define _TIF_X32                (1 << TIF_X32)

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE                                        \
        (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP |                \
         _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD)

/*
 * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
 */
#ifdef CONFIG_SMP
# define _TIF_WORK_CTXSW        (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
# define _TIF_WORK_CTXSW        (_TIF_WORK_CTXSW_BASE)
#endif

#ifdef CONFIG_X86_IOPL_IOPERM
# define _TIF_WORK_CTXSW_PREV        (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \
                                 _TIF_IO_BITMAP)
#else
# define _TIF_WORK_CTXSW_PREV        (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY)
#endif

#define _TIF_WORK_CTXSW_NEXT        (_TIF_WORK_CTXSW)

#define STACK_WARN                (THREAD_SIZE/8)

/*
 * macros/functions for gaining access to the thread information structure
 *
 * preempt_count needs to be 1 initially, until the scheduler is functional.
 */
#ifndef __ASSEMBLY__

/*
 * Walks up the stack frames to make sure that the specified object is
 * entirely contained by a single stack frame.
 *
 * @stack, @stackend: bounds used to validate every frame pointer that is
 *                    followed (stack <= frame < stackend).
 * @obj, @len:        start address and size in bytes of the object.
 *
 * Returns:
 *        GOOD_FRAME        if within a frame
 *        BAD_STACK        if placed across a frame boundary (or outside stack)
 *        NOT_STACK        unable to determine (no frame pointers, etc)
 */
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
#if defined(CONFIG_FRAME_POINTER)
        const void *frame = NULL;
        const void *oldframe;

        /*
         * Start one level up from this function; the next level is only
         * looked up when the first one yielded a non-NULL frame address.
         */
        oldframe = __builtin_frame_address(1);
        if (oldframe)
                frame = __builtin_frame_address(2);
        /*
         * low ----------------------------------------------> high
         * [saved bp][saved ip][args][local vars][saved bp][saved ip]
         *                     ^----------------^
         *               allow copies only within here
         */
        while (stack <= frame && frame < stackend) {
                /*
                 * If obj + len extends past the last frame, this
                 * check won't pass and the next frame will be 0,
                 * causing us to bail out and correctly report
                 * the copy as invalid.
                 */
                if (obj + len <= frame)
                        /*
                         * The object ends below this frame's saved bp; it
                         * must also start above the two saved words
                         * (bp and ip) at the base of the previous frame.
                         */
                        return obj >= oldframe + 2 * sizeof(void *) ?
                                GOOD_FRAME : BAD_STACK;
                oldframe = frame;
                /* Follow the saved frame pointer to the next frame up. */
                frame = *(const void * const *)frame;
        }
        return BAD_STACK;
#else
        return NOT_STACK;
#endif
}

#else /* !__ASSEMBLY__ */

#ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif

#endif

/*
 * Thread-synchronous status.
 *
 * This is different from the flags in that nobody else
 * ever touches our thread-synchronous status, so we don't
 * have to worry about atomic accesses.
 */
#define TS_COMPAT                0x0002        /* 32bit syscall active (64BIT)*/

#ifndef __ASSEMBLY__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED        0x0004        /* regs poked by 32-bit ptracer */
#define TS_COMPAT_RESTART        0x0008

#define arch_set_restart_data        arch_set_restart_data

/*
 * Mirror the current thread's TS_COMPAT status bit into TS_COMPAT_RESTART:
 * set it when TS_COMPAT is set, clear it otherwise. @restart is not used.
 */
static inline void arch_set_restart_data(struct restart_block *restart)
{
        struct thread_info *info = current_thread_info();

        if (!(info->status & TS_COMPAT)) {
                info->status &= ~TS_COMPAT_RESTART;
                return;
        }

        info->status |= TS_COMPAT_RESTART;
}
#endif

/*
 * in_ia32_syscall(): constant true on 32-bit kernels. Otherwise it tests
 * TS_COMPAT in the current thread's status word, and can only be true when
 * CONFIG_IA32_EMULATION is enabled.
 */
#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
#else
#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
                           current_thread_info()->status & TS_COMPAT)
#endif

extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
extern void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
#endif        /* !__ASSEMBLY__ */

#endif /* _ASM_X86_THREAD_INFO_H */













































































































    2 


    1 



















    1 

    1 

















    1 
























    1 


    1 




    1 




    1 

















    1 
























    1 










    1 


















    1 

    1 


    1 





    1 




































    1 












    3 
    2 








    1 
































































    1 

    1 

    1 
    1 

    1 








    1 


























    1 









    1 
























    2 

    2 
    2 







    1 









    2 

    2 








    1 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

/*
 * Statically initialised struct pid: one reference, empty task lists,
 * level 0, and a single upid with number 0 in init_pid_ns.
 */
struct pid init_struct_pid = {
        .count                = REFCOUNT_INIT(1),
        .tasks                = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .level                = 0,
        .numbers        = { {
                .nr                = 0,
                .ns                = &init_pid_ns,
        }, }
};

/* Current upper limit for PIDs; starts at the compile-time default. */
int pid_max = PID_MAX_DEFAULT;

/* Number of low PIDs kept in reserve; pid_max_min is derived from it. */
#define RESERVED_PIDS                300

/*
 * Smallest and largest values pid_max may take - presumably enforced by
 * whoever updates pid_max (e.g. a sysctl handler); verify against the
 * users of these variables.
 */
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
/*
 * The initial PID namespace: level 0, reaped by init_task and owned by
 * init_user_ns. pid_allocated starts out as PIDNS_ADDING; free_pid()
 * treats a count that drops back to exactly that value as a failed fork of
 * the namespace's first process.
 */
struct pid_namespace init_pid_ns = {
        .kref = KREF_INIT(2),
        .idr = IDR_INIT(init_pid_ns.idr),
        .pid_allocated = PIDNS_ADDING,
        .level = 0,
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

/*
 * Drop one reference on @pid (NULL is accepted and ignored). When the last
 * reference goes away the pid is returned to its namespace's slab cache
 * and the reference on that namespace is dropped as well.
 */
void put_pid(struct pid *pid)
{
        struct pid_namespace *owner;

        if (!pid)
                return;

        /* Fetch the namespace up front; @pid is freed below. */
        owner = pid->numbers[pid->level].ns;
        if (!refcount_dec_and_test(&pid->count))
                return;

        kmem_cache_free(owner->pid_cachep, pid);
        put_pid_ns(owner);
}
EXPORT_SYMBOL_GPL(put_pid);

/* RCU callback queued by free_pid(): drops the reference on the enclosing pid. */
static void delayed_put_pid(struct rcu_head *rhp)
{
        put_pid(container_of(rhp, struct pid, rcu));
}

/*
 * free_pid() - release the numbers held by @pid in every namespace level.
 *
 * For each level the namespace's pid_allocated counter is decremented and the
 * number is removed from that namespace's idr, all under pidmap_lock. The
 * final put of @pid itself is deferred through call_rcu().
 */
void free_pid(struct pid *pid)
{
        /* We can be called with write_lock_irq(&tasklist_lock) held */
        unsigned long irqflags;
        int lvl;

        spin_lock_irqsave(&pidmap_lock, irqflags);
        for (lvl = 0; lvl <= pid->level; lvl++) {
                struct upid *entry = &pid->numbers[lvl];
                struct pid_namespace *owner = entry->ns;

                switch (--owner->pid_allocated) {
                case 1:
                case 2:
                        /*
                         * Only the reaper is left in this namespace; it may
                         * be sleeping in zap_pid_ns_processes(), so wake it.
                         */
                        wake_up_process(owner->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* The fork of the namespace's first process failed. */
                        WARN_ON(owner->child_reaper);
                        owner->pid_allocated = 0;
                        break;
                }

                idr_remove(&owner->idr, entry->nr);
        }
        spin_unlock_irqrestore(&pidmap_lock, irqflags);

        call_rcu(&pid->rcu, delayed_put_pid);
}

/*
 * alloc_pid() - allocate a struct pid with one number per namespace level.
 * @ns:           most nested namespace the new pid lives in
 * @set_tid:      optional array of requested numbers, most nested level first
 * @set_tid_size: number of entries in @set_tid (at most ns->level + 1)
 *
 * Walks from @ns up through ->parent to level 0 and reserves a number in
 * each namespace's idr, initially storing NULL so lookups cannot see a
 * half-built pid. Once the pid is initialised, and only if @ns still has
 * PIDNS_ADDING set, every reserved slot is replaced by the pid and the
 * per-namespace pid_allocated counters are bumped.
 *
 * Return: the new pid, or an ERR_PTR():
 *   -EINVAL  @set_tid_size too large, a requested tid outside [1, pid_max),
 *            or a tid other than 1 requested while the namespace has no
 *            child reaper;
 *   -EPERM   a tid was requested without checkpoint/restore capability in
 *            that namespace's user namespace;
 *   -EEXIST  a requested tid is already in use;
 *   -EAGAIN  no free number was found (idr returned -ENOSPC);
 *   -ENOMEM  the slab allocation failed or @ns no longer accepts new pids;
 *   or any other error returned by the idr allocation.
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

        /*
         * set_tid_size contains the size of the set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
         * up to set_tid_size PID namespaces. It does not have to set the PID
         * for a process in all nested PID namespaces but set_tid_size must
         * never be greater than the current ns->level + 1.
         */
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        /* One iteration per level, from the most nested namespace outwards. */
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;

                if (set_tid_size) {
                        /* set_tid[0] belongs to the most nested namespace. */
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock_irq(&pidmap_lock);

                if (tid) {
                        /* The range [tid, tid + 1) requests exactly this number. */
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a null pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock_irq(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        /* Running out of numbers is reported as -EAGAIN. */
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice especially for the case
         * where the child subreaper has already exited and the pid
         * namespace denies the creation of any new processes. But ENOMEM
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
         */
        retval = -ENOMEM;

        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        /* disable_pid_allocation() clears PIDNS_ADDING; refuse new pids then. */
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock_irq(&pidmap_lock);

        return pid;

out_unlock:
        spin_unlock_irq(&pidmap_lock);
        put_pid_ns(ns);

out_free:
        spin_lock_irq(&pidmap_lock);
        /*
         * i is the level that failed (or -1 when arriving via out_unlock),
         * so this releases exactly the numbers reserved at levels above it.
         */
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock_irq(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

/*
 * disable_pid_allocation() - clear PIDNS_ADDING in @ns under pidmap_lock,
 * after which alloc_pid() refuses to publish new pids in that namespace.
 */
void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock_irq(&pidmap_lock);
        ns->pid_allocated = ns->pid_allocated & ~PIDNS_ADDING;
        spin_unlock_irq(&pidmap_lock);
}

/* Look up number @nr in the idr of @ns; returns whatever the idr holds for it. */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        struct pid *found = idr_find(&ns->idr, nr);

        return found;
}
EXPORT_SYMBOL_GPL(find_pid_ns);

/* Look up number @nr in the active PID namespace of the current task. */
struct pid *find_vpid(int nr)
{
        struct pid_namespace *active = task_active_pid_ns(current);

        return find_pid_ns(nr, active);
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * Return the address of the slot that stores @task's pid of the given @type:
 * the task's own thread_pid for PIDTYPE_PID, otherwise the matching entry in
 * the signal struct's pids[] array.
 */
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        if (type != PIDTYPE_PID)
                return &task->signal->pids[type];

        return &task->thread_pid;
}

/*
 * attach_pid() links @task onto the @type task list of the pid currently
 * stored in the task's slot for that type.
 *
 * Must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid **slot = task_pid_ptr(task, type);
        struct pid *target = *slot;

        hlist_add_head_rcu(&task->pid_links[type], &target->tasks[type]);
}

/*
 * Unlink @task from its current pid of the given @type and store @new in the
 * task's slot. The old pid is freed once no task of any type uses it.
 */
static void __change_pid(struct task_struct *task, enum pid_type type,
                        struct pid *new)
{
        struct pid **slot = task_pid_ptr(task, type);
        struct pid *old = *slot;
        int idx = PIDTYPE_MAX;

        hlist_del_rcu(&task->pid_links[type]);
        *slot = new;

        /* Scan the types from highest to lowest; any remaining user keeps it. */
        while (--idx >= 0) {
                if (pid_has_task(old, idx))
                        return;
        }

        free_pid(old);
}

/*
 * detach_pid() - unlink @task from its pid of the given @type and clear the
 * task's slot (the replacement pid passed to __change_pid() is NULL).
 */
void detach_pid(struct task_struct *task, enum pid_type type)
{
        __change_pid(task, type, NULL);
}

/*
 * change_pid() - move @task from its current pid of the given @type to @pid.
 * The slot is switched first, then attach_pid() links the task onto the
 * list of the pid now stored in that slot.
 */
void change_pid(struct task_struct *task, enum pid_type type,
                struct pid *pid)
{
        __change_pid(task, type, pid);
        attach_pid(task, type);
}

/*
 * exchange_tids() - swap the thread pids of @left and @right.
 *
 * Three things are exchanged, in this order: the PIDTYPE_PID task lists of
 * the two pids, the tasks' thread_pid pointers, and the numeric values cached
 * in task->pid.
 */
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *left_pid = left->thread_pid;
        struct pid *right_pid = right->thread_pid;

        /* Swap the single entry tid lists */
        hlists_swap_heads_rcu(&left_pid->tasks[PIDTYPE_PID],
                              &right_pid->tasks[PIDTYPE_PID]);

        /* Each task now points at the other one's pid */
        rcu_assign_pointer(left->thread_pid, right_pid);
        rcu_assign_pointer(right->thread_pid, left_pid);

        /* Keep the cached numbers in step with the pointers */
        WRITE_ONCE(left->pid, pid_nr(right_pid));
        WRITE_ONCE(right->pid, pid_nr(left_pid));
}

/*
 * transfer_pid is an optimization of attach_pid(new), detach_pid(old):
 * @new takes over @old's place on the pid's @type list, and for PIDTYPE_PID
 * also inherits @old's thread_pid pointer.
 */
void transfer_pid(struct task_struct *old, struct task_struct *new,
                           enum pid_type type)
{
        struct hlist_node *from = &old->pid_links[type];
        struct hlist_node *to = &new->pid_links[type];

        if (PIDTYPE_PID == type)
                new->thread_pid = old->thread_pid;

        hlist_replace_rcu(from, to);
}

/*
 * pid_task() - return the first task on @pid's list for @type.
 *
 * Returns NULL when @pid is NULL or the list is empty. The list head is read
 * with rcu_dereference_check(), which also accepts tasklist_lock being held.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct hlist_node *head;

        if (!pid)
                return NULL;

        head = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                     lockdep_tasklist_lock_is_held());
        if (!head)
                return NULL;

        return hlist_entry(head, struct task_struct, pid_links[(type)]);
}
EXPORT_SYMBOL(pid_task);

/*
 * find_task_by_pid_ns() - map number @nr in namespace @ns to its task.
 *
 * Must be called under rcu_read_lock(); a lockdep warning is raised
 * otherwise. Returns NULL when no PIDTYPE_PID task is found.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        struct pid *found;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");

        found = find_pid_ns(nr, ns);
        return pid_task(found, PIDTYPE_PID);
}

/* Map number @vnr, as seen from the current task's active namespace, to a task. */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
        struct pid_namespace *active = task_active_pid_ns(current);

        return find_task_by_pid_ns(vnr, active);
}

/*
 * find_get_task_by_vpid() - like find_task_by_vpid(), but takes the RCU read
 * lock itself and returns the task with a reference held (NULL if not found).
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *found;

        rcu_read_lock();
        found = find_task_by_vpid(nr);
        if (found != NULL)
                get_task_struct(found);
        rcu_read_unlock();

        return found;
}

/*
 * get_task_pid() - read @task's pid of the given @type under the RCU read
 * lock and pass it through get_pid() before returning it.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid **slot;
        struct pid *ref;

        rcu_read_lock();
        slot = task_pid_ptr(task, type);
        ref = get_pid(rcu_dereference(*slot));
        rcu_read_unlock();

        return ref;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result;
        rcu_read_lock();
        result = pid_task(pid, type);
        if (result)
                get_task_struct(result);
        rcu_read_unlock();
        return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

/*
 * find_get_pid() - find_vpid() under the RCU read lock, with the result
 * passed through get_pid() before it is returned.
 */
struct pid *find_get_pid(pid_t nr)
{
        struct pid *ref;

        rcu_read_lock();
        ref = find_vpid(nr);
        ref = get_pid(ref);
        rcu_read_unlock();

        return ref;
}
EXPORT_SYMBOL_GPL(find_get_pid);

/*
 * pid_nr_ns() - number of @pid as seen from namespace @ns.
 *
 * Returns 0 when either argument is NULL, when @ns is nested deeper than
 * @pid, or when @pid's entry at that level belongs to a different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *entry;

        if (!pid || !ns || ns->level > pid->level)
                return 0;

        entry = &pid->numbers[ns->level];
        if (entry->ns != ns)
                return 0;

        return entry->nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

/* Number of @pid as seen from the current task's active PID namespace. */
pid_t pid_vnr(struct pid *pid)
{
        struct pid_namespace *active = task_active_pid_ns(current);

        return pid_nr_ns(pid, active);
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        pid_t nr = 0;

        rcu_read_lock();
        if (!ns)
                ns = task_active_pid_ns(current);
        nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

/* Namespace of @tsk's pid, obtained via task_pid() and ns_of_pid(). */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        struct pid *own = task_pid(tsk);

        return ns_of_pid(own);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 * The search is delegated to idr_get_next() on the namespace's idr.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        struct pid *next = idr_get_next(&ns->idr, &nr);

        return next;
}

struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        struct fd f;
        struct pid *pid;

        f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(f.file);
        if (!IS_ERR(pid)) {
                get_pid(pid);
                *flags = f.file->f_flags;
        }

        fdput(f);
        return pid;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
int pidfd_create(struct pid *pid, unsigned int flags)
{
        int fd;

        if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
                return -EINVAL;

        if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
                return -EINVAL;

        fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
                              flags | O_RDWR | O_CLOEXEC);
        if (fd < 0)
                put_pid(pid);

        return fd;
}

/**
 * pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; only PIDFD_NONBLOCK is accepted
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the process identified by @pid. Currently, the process identified by
 * @pid must be a thread-group leader. This restriction currently exists
 * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
 * be used with CLONE_THREAD) and pidfd polling (only supports thread group
 * leaders).
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned
 *         (-EINVAL for unsupported @flags or @pid <= 0, -ESRCH when no
 *         such pid exists in the caller's namespace).
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        struct pid *target;
        int fd;

        if ((flags & ~PIDFD_NONBLOCK) || pid <= 0)
                return -EINVAL;

        target = find_get_pid(pid);
        if (target == NULL)
                return -ESRCH;

        fd = pidfd_create(target, flags);

        /* The lookup reference is no longer needed, whatever the outcome. */
        put_pid(target);
        return fd;
}

/*
 * pid_idr_init() - boot-time setup of the PID limits and of the initial
 * namespace's allocator state.
 *
 * Scales pid_max and pid_max_min with the number of possible CPUs, logs the
 * resulting values, initialises init_pid_ns.idr and sets up the slab cache
 * stored in init_pid_ns.pid_cachep.
 */
void __init pid_idr_init(void)
{
        /* Verify no one has done anything silly: */
        /* (PIDNS_ADDING shares pid_allocated with the count, so it must lie above any valid count.) */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* bump default and minimum pid_max based on number of cpus */
        /* pid_max may grow but never beyond pid_max_max. */
        pid_max = min(pid_max_max, max_t(int, pid_max,
                                PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
        pid_max_min = max_t(int, pid_max_min,
                                PIDS_PER_CPU_MIN * num_possible_cpus());
        pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        /* SLAB_PANIC: no failure handling is needed after this call. */
        init_pid_ns.pid_cachep = KMEM_CACHE(pid,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}

/*
 * Fetch file descriptor @fd from @task's table.
 *
 * The lookup runs with the task's exec_update_lock held for reading and only
 * after a PTRACE_MODE_ATTACH_REALCREDS access check has passed.
 *
 * Return: the file, or ERR_PTR() with the error from taking the lock,
 * -EPERM when access is denied, or -EBADF when @fd is not open in @task.
 */
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *file;
        int err;

        err = down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return ERR_PTR(err);

        if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                file = ERR_PTR(-EPERM);
        else
                file = fget_task(task, fd);

        up_read(&task->signal->exec_update_lock);

        if (!file)
                return ERR_PTR(-EBADF);

        return file;
}

static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *task;
        struct file *file;
        int ret;

        task = get_pid_task(pid, PIDTYPE_PID);
        if (!task)
                return -ESRCH;

        file = __pidfd_fget(task, fd);
        put_task_struct(task);
        if (IS_ERR(file))
                return PTR_ERR(file);

        ret = receive_fd(file, O_CLOEXEC);
        fput(file);

        return ret;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:        the pidfd file descriptor of the process
 * @fd:                the file descriptor number to get
 * @flags:        flags on how to get the fd (reserved, must be zero)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned
 *         (-EINVAL for non-zero @flags, -EBADF when @pidfd is not open).
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct fd f;
        struct pid *pid;
        int ret;

        /* flags is currently unused - make sure it's unset */
        if (flags != 0)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        pid = pidfd_pid(f.file);
        ret = IS_ERR(pid) ? PTR_ERR(pid) : pidfd_getfd(pid, fd);

        fdput(f);
        return ret;
}



















































































    3 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 *
 * Uses bit @bitnum of the word at @addr as the lock. Returns with
 * preemption disabled. Without CONFIG_SMP and CONFIG_DEBUG_SPINLOCK the
 * bit is not touched at all and only preemption is disabled.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Contended: allow preemption again while waiting. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                /* Bit looked clear: disable preemption and retry the atomic op. */
                preempt_disable();
        }
#endif
        /* NOTE(review): presumably a static-checker annotation only - confirm. */
        __acquire(bitlock);
}

/*
 * Single attempt to take the bit lock at bit @bitnum of @addr.
 *
 * Return true if it was acquired. On success preemption stays disabled;
 * on failure it is re-enabled before returning 0. Without CONFIG_SMP and
 * CONFIG_DEBUG_SPINLOCK the attempt always succeeds.
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Already held by someone else: back out. */
                preempt_enable();
                return 0;
        }
#endif
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clears bit @bitnum of @addr with clear_bit_unlock() (on SMP or
 * DEBUG_SPINLOCK builds) and re-enables preemption. With
 * CONFIG_DEBUG_SPINLOCK, unlocking a bit that is not set is a BUG.
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 *  Identical to bit_spin_unlock() except that the bit is cleared with
 *  __clear_bit_unlock() instead of clear_bit_unlock().
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * On SMP or DEBUG_SPINLOCK builds this tests the lock bit itself. Otherwise
 * the bit is never set by the lock functions, so the answer is approximated:
 * with CONFIG_PREEMPT_COUNT the raw preempt_count() value is returned, and
 * without it the function always reports 1.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */



























































    2 
    2 
    2 








    1 







    1 































    2 


    2 






























    1 




























































































    2 




    2 




    2 


























































































































































    1 



























































































































































































    2 














































    2 















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 */
#ifndef __LINUX_BIO_H
#define __LINUX_BIO_H

#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/ioprio.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#include <linux/blk_types.h>

#define BIO_DEBUG

#ifdef BIO_DEBUG
#define BIO_BUG_ON        BUG_ON
#else
#define BIO_BUG_ON
#endif

#define BIO_MAX_PAGES                256

/*
 * Accessors for the I/O priority stored in a bio's bi_ioprio field.
 * Both the arguments and the whole expansion are parenthesized so the
 * macros behave like ordinary expressions wherever they are used.
 */
#define bio_prio(bio)                        ((bio)->bi_ioprio)
#define bio_set_prio(bio, prio)                ((bio)->bi_ioprio = (prio))

/*
 * Per-iterator accessors: each forwards to the matching bvec_iter_*()
 * helper, applied to the bio's bi_io_vec array at position @iter.
 */
#define bio_iter_iovec(bio, iter)                                \
        bvec_iter_bvec((bio)->bi_io_vec, (iter))

#define bio_iter_page(bio, iter)                                \
        bvec_iter_page((bio)->bi_io_vec, (iter))
#define bio_iter_len(bio, iter)                                        \
        bvec_iter_len((bio)->bi_io_vec, (iter))
#define bio_iter_offset(bio, iter)                                \
        bvec_iter_offset((bio)->bi_io_vec, (iter))

/* Same accessors, evaluated at the bio's own current iterator (bi_iter). */
#define bio_page(bio)                bio_iter_page((bio), (bio)->bi_iter)
#define bio_offset(bio)                bio_iter_offset((bio), (bio)->bi_iter)
#define bio_iovec(bio)                bio_iter_iovec((bio), (bio)->bi_iter)

/*
 * Sector arithmetic on an iterator: bi_size is shifted right by 9, i.e.
 * divided by 512, to obtain a sector count; the end sector is bi_sector
 * plus that count.
 */
#define bvec_iter_sectors(iter)        ((iter).bi_size >> 9)
#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))

/* The same two values for a bio's current iterator. */
#define bio_sectors(bio)        bvec_iter_sectors((bio)->bi_iter)
#define bio_end_sector(bio)        bvec_iter_end_sector((bio)->bi_iter)

/*
 * Return the data direction, READ or WRITE.
 *
 * The decision is delegated to op_is_write() on the bio's operation.
 */
#define bio_data_dir(bio) \
        (op_is_write(bio_op(bio)) ? WRITE : READ)

/*
 * Check whether this bio carries any data or not. A NULL bio is allowed.
 *
 * A bio counts as carrying data when its remaining size is non-zero and its
 * operation is not discard, secure erase or write zeroes.
 */
static inline bool bio_has_data(struct bio *bio)
{
        if (!bio || !bio->bi_iter.bi_size)
                return false;

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return false;
        default:
                return true;
        }
}

/*
 * True for the operations (discard, secure erase, write same, write zeroes)
 * for which bio_advance_iter() only shrinks bi_size instead of walking the
 * bio_vec array.
 */
static inline bool bio_no_advance_iter(const struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE_ZEROES:
                return true;
        default:
                return false;
        }
}

/* A bio is mergeable unless one of the REQ_NOMERGE_FLAGS bits is set in bi_opf. */
static inline bool bio_mergeable(struct bio *bio)
{
        return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}

/*
 * Length of the current piece of the bio: the current bio_vec's bv_len when
 * the bio carries data, otherwise the whole remaining bi_size.
 */
static inline unsigned int bio_cur_bytes(struct bio *bio)
{
        /* dataless requests such as discard */
        if (!bio_has_data(bio))
                return bio->bi_iter.bi_size;

        return bio_iovec(bio).bv_len;
}

/*
 * Address of the bio's current data: page_address() of the current page
 * plus the current offset, or NULL when the bio carries no data.
 */
static inline void *bio_data(struct bio *bio)
{
        if (!bio_has_data(bio))
                return NULL;

        return page_address(bio_page(bio)) + bio_offset(bio);
}

/**
 * bio_full - check if the bio is full
 * @bio:        bio to check
 * @len:        length of one segment to be added
 *
 * Return true if @bio is full and one segment with @len bytes can't be
 * added to the bio, otherwise return false. The bio is full when all of
 * its bi_max_vecs slots are used or when adding @len would push bi_size
 * past UINT_MAX.
 */
static inline bool bio_full(struct bio *bio, unsigned len)
{
        return bio->bi_vcnt >= bio->bi_max_vecs ||
               bio->bi_iter.bi_size > UINT_MAX - len;
}

/*
 * Step @iter to the next segment of @bio via bvec_advance().
 * Returns false, without advancing, once iter->idx has reached bi_vcnt.
 */
static inline bool bio_next_segment(const struct bio *bio,
                                    struct bvec_iter_all *iter)
{
        if (iter->idx < bio->bi_vcnt) {
                bvec_advance(&bio->bi_io_vec[iter->idx], iter);
                return true;
        }

        return false;
}

/*
 * drivers should _never_ use the all version - the bio may have been split
 * before it got to the driver and the driver won't own all of it
 *
 * @bvl receives the result of bvec_init_iter_all(&iter) once, and the loop
 * runs for as long as bio_next_segment() returns true. @iter must be a
 * struct bvec_iter_all object (its address is taken here).
 */
#define bio_for_each_segment_all(bvl, bio, iter) \
        for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); )

/*
 * Advance @iter by @bytes within @bio.
 *
 * bi_sector always moves forward by @bytes >> 9. For the operations flagged
 * by bio_no_advance_iter() only bi_size is reduced; for everything else the
 * bio_vec position is advanced through bvec_iter_advance(), whose result is
 * not checked here.
 */
static inline void bio_advance_iter(const struct bio *bio,
                                    struct bvec_iter *iter, unsigned int bytes)
{
        iter->bi_sector += bytes >> 9;

        if (!bio_no_advance_iter(bio)) {
                bvec_iter_advance(bio->bi_io_vec, iter, bytes);
                /* TODO: It is reasonable to complete bio with error here. */
                return;
        }

        iter->bi_size -= bytes;
}

/*
 * Iterate over the segments of @bio starting from iterator value @start.
 * @iter is a struct bvec_iter lvalue used as the cursor and @bvl a
 * struct bio_vec lvalue that receives each segment by value. The loop ends
 * when the cursor's bi_size reaches zero; each step advances the cursor by
 * the length of the segment just visited.
 */
#define __bio_for_each_segment(bvl, bio, iter, start)                        \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = bio_iter_iovec((bio), (iter))), 1);                \
             bio_advance_iter((bio), &(iter), (bvl).bv_len))

/* Segment iteration starting at the bio's current position (bi_iter). */
#define bio_for_each_segment(bvl, bio, iter)                                \
        __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)

/*
 * Same loop shape as __bio_for_each_segment(), but each element is fetched
 * with mp_bvec_iter_bvec() instead of bio_iter_iovec().
 */
#define __bio_for_each_bvec(bvl, bio, iter, start)                \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
             bio_advance_iter((bio), &(iter), (bvl).bv_len))

/* iterate over multi-page bvec */
#define bio_for_each_bvec(bvl, bio, iter)                        \
        __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)

/*
 * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the
 * same reasons as bio_for_each_segment_all().
 *
 * @bvl is a struct bio_vec pointer that walks the array returned by
 * bio_first_bvec_all(), and @i an integer counter running from 0 to
 * bi_vcnt - 1.
 *
 * The definition must not end in a line continuation: a trailing backslash
 * would splice whatever follows the macro into its replacement text.
 */
#define bio_for_each_bvec_all(bvl, bio, i)                \
        for (i = 0, bvl = bio_first_bvec_all(bio);        \
             i < (bio)->bi_vcnt; i++, bvl++)

/* True when @bvec's length equals everything that is left in @iter (bi_size). */
#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)

static inline unsigned bio_segments(struct bio *bio)
{
        unsigned segs = 0;
        struct bio_vec bv;
        struct bvec_iter iter;

        /*
         * We special case discard/write same/write zeroes, because they
         * interpret bi_size differently:
         */

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return 0;
        case REQ_OP_WRITE_SAME:
                return 1;
        default:
                break;
        }

        bio_for_each_segment(bv, bio, iter)
                segs++;

        return segs;
}

/*
 * get a reference to a bio, so it won't disappear. the intended use is
 * something like:
 *
 * bio_get(bio);
 * submit_bio(rw, bio);
 * if (bio->bi_flags ...)
 *        do_something
 * bio_put(bio);
 *
 * without the bio_get(), it could potentially complete I/O before submit_bio
 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
 * runs
 *
 * The BIO_REFFED flag is set first, followed by a barrier, and only then is
 * the __bi_cnt counter incremented.
 */
static inline void bio_get(struct bio *bio)
{
        bio->bi_flags |= (1 << BIO_REFFED);
        /* Order the flag update before the atomic increment below. */
        smp_mb__before_atomic();
        atomic_inc(&bio->__bi_cnt);
}

/*
 * Set the bio's reference counter (__bi_cnt) to @count.
 * For any value other than 1 the BIO_REFFED flag is set first, followed by
 * a full memory barrier, before the counter itself is written.
 */
static inline void bio_cnt_set(struct bio *bio, unsigned int count)
{
        if (count != 1) {
                bio->bi_flags |= (1 << BIO_REFFED);
                smp_mb();
        }
        atomic_set(&bio->__bi_cnt, count);
}

/* Return true if flag number @bit is set in @bio's bi_flags. */
static inline bool bio_flagged(struct bio *bio, unsigned int bit)
{
        const unsigned int mask = 1U << bit;

        return !!(bio->bi_flags & mask);
}

/* Set flag number @bit in @bio's bi_flags (plain, non-atomic update). */
static inline void bio_set_flag(struct bio *bio, unsigned int bit)
{
        const unsigned int mask = 1U << bit;

        bio->bi_flags |= mask;
}

/* Clear flag number @bit in @bio's bi_flags (plain, non-atomic update). */
static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
{
        const unsigned int keep = ~(1U << bit);

        bio->bi_flags &= keep;
}

static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
        *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}

static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
        struct bvec_iter iter = bio->bi_iter;
        int idx;

        bio_get_first_bvec(bio, bv);
        if (bv->bv_len == bio->bi_iter.bi_size)
                return;                /* this bio only has a single bvec */

        bio_advance_iter(bio, &iter, iter.bi_size);

        if (!iter.bi_bvec_done)
                idx = iter.bi_idx - 1;
        else        /* in the middle of bvec */
                idx = iter.bi_idx;

        *bv = bio->bi_io_vec[idx];

        /*
         * iter.bi_bvec_done records actual length of the last bvec
         * if this bio ends in the middle of one io vector
         */
        if (iter.bi_bvec_done)
                bv->bv_len = iter.bi_bvec_done;
}

/*
 * Return the start of @bio's vector table. Warns once if the bio carries
 * BIO_CLONED.
 */
static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
{
        struct bio_vec *table;

        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
        table = bio->bi_io_vec;

        return table;
}

static inline struct page *bio_first_page_all(struct bio *bio)
{
        return bio_first_bvec_all(bio)->bv_page;
}

/*
 * Return the entry at index bi_vcnt - 1 of @bio's vector table. Warns once
 * if the bio carries BIO_CLONED.
 */
static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
{
        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));

        return bio->bi_io_vec + (bio->bi_vcnt - 1);
}

/*
 * Bits kept in bio_integrity_payload.bip_flags; bio_integrity_flagged()
 * tests them with a bitwise AND.
 */
enum bip_flags {
        BIP_BLOCK_INTEGRITY        = 1 << 0, /* block layer owns integrity data */
        BIP_MAPPED_INTEGRITY        = 1 << 1, /* ref tag has been remapped */
        BIP_CTRL_NOCHECK        = 1 << 2, /* disable HBA integrity checking */
        BIP_DISK_NOCHECK        = 1 << 3, /* disable disk integrity checking */
        BIP_IP_CHECKSUM                = 1 << 4, /* IP checksum */
};

/*
 * bio integrity payload
 *
 * The structure ends in a flexible array (bip_inline_vecs), so it must be
 * allocated with room for however many embedded vectors are wanted.
 */
struct bio_integrity_payload {
        struct bio                *bip_bio;        /* parent bio */

        /* iterator over bip_vec; its bi_sector doubles as the seed (bip_get_seed()) */
        struct bvec_iter        bip_iter;

        unsigned short                bip_slab;        /* slab the bip came from */
        unsigned short                bip_vcnt;        /* # of integrity bio_vecs */
        unsigned short                bip_max_vcnt;        /* integrity bio_vec slots */
        unsigned short                bip_flags;        /* control flags */

        struct bvec_iter        bio_iter;        /* for rewinding parent bio */

        struct work_struct        bip_work;        /* I/O completion */

        /* vector table walked by bip_for_each_vec() */
        struct bio_vec                *bip_vec;
        struct bio_vec                bip_inline_vecs[];/* embedded bvec array */
};

#if defined(CONFIG_BLK_DEV_INTEGRITY)

/*
 * Return @bio's integrity payload, or NULL when REQ_INTEGRITY is not set in
 * the bio's bi_opf.
 */
static inline struct bio_integrity_payload *bio_integrity(struct bio *bio)
{
        return (bio->bi_opf & REQ_INTEGRITY) ? bio->bi_integrity : NULL;
}

/*
 * Return true if @bio has an integrity payload and @flag is set in that
 * payload's bip_flags; false otherwise.
 */
static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
{
        struct bio_integrity_payload *payload = bio_integrity(bio);

        if (!payload)
                return false;

        return payload->bip_flags & flag;
}

static inline sector_t bip_get_seed(struct bio_integrity_payload *bip)
{
        return bip->bip_iter.bi_sector;
}

/* Store @seed as the seed of @bip, i.e. into bip_iter.bi_sector. */
static inline void bip_set_seed(struct bio_integrity_payload *bip,
                                sector_t seed)
{
        struct bvec_iter *it = &bip->bip_iter;

        it->bi_sector = seed;
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Declared here, defined out of line. bio_split() is the helper that
 * bio_next_split() below falls back to when a split is required.
 */
extern void bio_trim(struct bio *bio, int offset, int size);
extern struct bio *bio_split(struct bio *bio, int sectors,
                             gfp_t gfp, struct bio_set *bs);

/**
 * bio_next_split - get next @sectors from a bio, splitting if necessary
 * @bio:        bio to split
 * @sectors:        number of sectors to split from the front of @bio
 * @gfp:        gfp mask
 * @bs:                bio set to allocate from
 *
 * Returns the result of bio_split() for the first @sectors of @bio. If
 * @sectors is not smaller than bio_sectors(@bio), no split is done and the
 * original bio is returned unchanged.
 */
static inline struct bio *bio_next_split(struct bio *bio, int sectors,
                                         gfp_t gfp, struct bio_set *bs)
{
        return (sectors < bio_sectors(bio)) ?
                bio_split(bio, sectors, gfp, bs) : bio;
}

/*
 * Single-bit option values.
 * NOTE(review): presumably these are the bits accepted by the @flags
 * argument of bioset_init() - confirm against its definition.
 */
enum {
        BIOSET_NEED_BVECS = BIT(0),
        BIOSET_NEED_RESCUER = BIT(1),
};
/* bio_set and bvec pool management; declared here, defined out of line. */
extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);
extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);

/* Allocator used by the bio_alloc() and bio_kmalloc() wrappers below. */
extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
extern void bio_put(struct bio *);

extern void __bio_clone_fast(struct bio *, struct bio *);
extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);

/* The bio_set that bio_alloc() below allocates from. */
extern struct bio_set fs_bio_set;

/*
 * Allocate a bio with @nr_iovecs vectors from the global fs_bio_set; simply
 * forwards to bio_alloc_bioset() and returns its result.
 */
static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
        struct bio_set *pool = &fs_bio_set;

        return bio_alloc_bioset(gfp_mask, nr_iovecs, pool);
}

/*
 * Allocate a bio with @nr_iovecs vectors by calling bio_alloc_bioset() with
 * a NULL bio_set, and return its result.
 */
static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
        struct bio_set *no_pool = NULL;

        return bio_alloc_bioset(gfp_mask, nr_iovecs, no_pool);
}

/* Declared here, defined out of line. */
extern blk_qc_t submit_bio(struct bio *);

/* Called by bio_io_error() and bio_wouldblock_error() after they set bi_status. */
extern void bio_endio(struct bio *);

/* End @bio via bio_endio() with its status set to BLK_STS_IOERR. */
static inline void bio_io_error(struct bio *bio)
{
        /* the status must be in place before bio_endio() runs */
        bio->bi_status = BLK_STS_IOERR;
        bio_endio(bio);
}

/*
 * End @bio via bio_endio() with status BLK_STS_AGAIN, after also setting
 * BIO_QUIET on it.
 */
static inline void bio_wouldblock_error(struct bio *bio)
{
        bio->bi_status = BLK_STS_AGAIN;
        bio_set_flag(bio, BIO_QUIET);

        bio_endio(bio);
}

/* Forward declaration: only pointers to it are used below (bio_add_pc_page()). */
struct request_queue;

/* Everything in this run is declared here and defined out of line. */
extern int submit_bio_wait(struct bio *bio);
extern void bio_advance(struct bio *, unsigned);

extern void bio_init(struct bio *bio, struct bio_vec *table,
                     unsigned short max_vecs);
extern void bio_uninit(struct bio *);
extern void bio_reset(struct bio *);
void bio_chain(struct bio *, struct bio *);

extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
                           unsigned int, unsigned int);
bool __bio_try_merge_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off, bool *same_page);
void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
void bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);

extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                               struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern void bio_list_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio);
/* Takes the iterator by value; zero_fill_bio() below passes bio->bi_iter. */
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
void bio_truncate(struct bio *bio, unsigned new_size);
void guard_bio_eod(struct bio *bio);

static inline void zero_fill_bio(struct bio *bio)
{
        zero_fill_bio_iter(bio, bio->bi_iter);
}

/* bio_vec array helpers and device-name formatting; defined out of line. */
extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
extern unsigned int bvec_nr_vecs(unsigned short idx);
extern const char *bio_devname(struct bio *bio, char *buffer);

/*
 * Point @bio at the disk and partition of @bdev.
 *
 * If the disk differs from the one the bio currently refers to,
 * BIO_THROTTLED is cleared first. After bi_disk and bi_partno are copied,
 * bio_associate_blkg() is called on the bio. Both arguments are evaluated
 * more than once, so they must be free of side effects.
 */
#define bio_set_dev(bio, bdev)                         \
do {                                                \
        if ((bio)->bi_disk != (bdev)->bd_disk)        \
                bio_clear_flag(bio, BIO_THROTTLED);\
        (bio)->bi_disk = (bdev)->bd_disk;        \
        (bio)->bi_partno = (bdev)->bd_partno;        \
        bio_associate_blkg(bio);                \
} while (0)

/*
 * Copy bi_disk and bi_partno from @src to @dst, then call
 * bio_clone_blkg_association(dst, src). Both arguments are evaluated more
 * than once, so they must be free of side effects.
 */
#define bio_copy_dev(dst, src)                        \
do {                                                \
        (dst)->bi_disk = (src)->bi_disk;        \
        (dst)->bi_partno = (src)->bi_partno;        \
        bio_clone_blkg_association(dst, src);        \
} while (0)

/* Result of disk_devt() applied to the disk @bio refers to (bi_disk). */
#define bio_dev(bio) \
        disk_devt((bio)->bi_disk)

/*
 * blkg association helpers. With CONFIG_BLK_CGROUP they are real functions
 * defined out of line; without it they are empty inline stubs, so callers
 * such as bio_set_dev() and bio_copy_dev() need no #ifdefs of their own.
 */
#ifdef CONFIG_BLK_CGROUP
void bio_associate_blkg(struct bio *bio);
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css);
void bio_clone_blkg_association(struct bio *dst, struct bio *src);
#else        /* CONFIG_BLK_CGROUP */
static inline void bio_associate_blkg(struct bio *bio) { }
static inline void bio_associate_blkg_from_css(struct bio *bio,
                                               struct cgroup_subsys_state *css)
{ }
static inline void bio_clone_blkg_association(struct bio *dst,
                                              struct bio *src) { }
#endif        /* CONFIG_BLK_CGROUP */

#ifdef CONFIG_HIGHMEM
/*
 * Map the page of @bvec with kmap_atomic() after saving and disabling local
 * interrupts into *@flags, and return the mapped address advanced by
 * bv_offset.
 *
 * remember never ever reenable interrupts between a bvec_kmap_irq and
 * bvec_kunmap_irq!
 */
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
        char *base;

        /*
         * The page might not be a highmem page, but treating every page the
         * same way keeps the preempt/irq count balancing a lot nicer.
         */
        local_irq_save(*flags);
        base = kmap_atomic(bvec->bv_page);

        /* the mapping must start on a page boundary */
        BUG_ON((unsigned long) base & ~PAGE_MASK);

        return base + bvec->bv_offset;
}

/*
 * Undo bvec_kmap_irq(): unmap the page that @buffer points into and restore
 * the interrupt state saved in *@flags.
 */
static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
        /* round the pointer down to the start of its page */
        void *page_base = (void *) ((unsigned long) buffer & PAGE_MASK);

        kunmap_atomic(page_base);
        local_irq_restore(*flags);
}

#else
/*
 * !CONFIG_HIGHMEM variant: no mapping and no interrupt state change, just
 * page_address() of the bvec's page advanced by bv_offset. @flags is unused.
 */
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
        char *base = page_address(bvec->bv_page);

        return base + bvec->bv_offset;
}

/*
 * !CONFIG_HIGHMEM variant: nothing to unmap; @buffer is ignored and *@flags
 * is simply written with 0.
 */
static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
        *flags = 0;
}
#endif

/*
 * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
 *
 * A bio_list anchors a singly-linked list of bios chained through the bi_next
 * member of the bio.  The bio_list also caches the last list member to allow
 * fast access to the tail.
 */
struct bio_list {
        struct bio *head;        /* first bio; NULL when the list is empty */
        struct bio *tail;        /* last bio; NULL when the list is empty */
};

/* Return 1 if @bl holds no bios (its head is NULL), 0 otherwise. */
static inline int bio_list_empty(const struct bio_list *bl)
{
        return !bl->head;
}

/* Make @bl an empty list by clearing both its head and tail. */
static inline void bio_list_init(struct bio_list *bl)
{
        bl->tail = NULL;
        bl->head = NULL;
}

/* Static initializer for an empty struct bio_list (head and tail both NULL). */
#define BIO_EMPTY_LIST        { NULL, NULL }

/*
 * Walk every bio on list @bl from head to tail, following bi_next.
 * @bio is the caller's cursor variable and must be a plain lvalue; it is
 * NULL once the loop has run to completion.
 */
#define bio_list_for_each(bio, bl) \
        for (bio = (bl)->head; bio; bio = bio->bi_next)

static inline unsigned bio_list_size(const struct bio_list *bl)
{
        unsigned sz = 0;
        struct bio *bio;

        bio_list_for_each(bio, bl)
                sz++;

        return sz;
}

/* Append @bio to the tail of @bl; the bio's bi_next is reset to NULL. */
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
{
        struct bio *old_tail = bl->tail;

        bio->bi_next = NULL;
        bl->tail = bio;

        /* link behind the previous tail, or become the head of an empty list */
        if (old_tail)
                old_tail->bi_next = bio;
        else
                bl->head = bio;
}

/* Insert @bio at the front of @bl. */
static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
{
        /* with no tail recorded yet, the new bio is also the tail */
        if (!bl->tail)
                bl->tail = bio;

        bio->bi_next = bl->head;
        bl->head = bio;
}

/*
 * Append all bios of @bl2 to the end of @bl. @bl2 itself is left untouched,
 * and nothing happens when @bl2 is empty.
 */
static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
{
        struct bio *first = bl2->head;

        if (first == NULL)
                return;

        if (bl->tail == NULL)
                bl->head = first;
        else
                bl->tail->bi_next = first;

        bl->tail = bl2->tail;
}

/*
 * Put all bios of @bl2 in front of the bios already on @bl. @bl2's head and
 * tail pointers are left untouched, and nothing happens when @bl2 is empty.
 */
static inline void bio_list_merge_head(struct bio_list *bl,
                                       struct bio_list *bl2)
{
        if (bl2->head == NULL)
                return;

        if (bl->head == NULL)
                bl->tail = bl2->tail;
        else
                bl2->tail->bi_next = bl->head;

        bl->head = bl2->head;
}

/* Return the first bio on @bl without removing it; NULL if the list is empty. */
static inline struct bio *bio_list_peek(struct bio_list *bl)
{
        return bl->head;
}

/*
 * Remove and return the first bio on @bl, or return NULL if the list is
 * empty. The returned bio has its bi_next cleared.
 */
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
        struct bio *first = bl->head;

        if (!first)
                return NULL;

        bl->head = first->bi_next;
        /* that was the only element: the list is now empty */
        if (!bl->head)
                bl->tail = NULL;

        first->bi_next = NULL;

        return first;
}

/*
 * Detach the whole chain from @bl and return its first bio (NULL if the list
 * was empty). @bl is left empty; the bios stay linked through bi_next.
 */
static inline struct bio *bio_list_get(struct bio_list *bl)
{
        struct bio *chain = bl->head;

        bl->tail = NULL;
        bl->head = NULL;

        return chain;
}

/*
 * Increment chain count for the bio. Make sure the CHAIN flag update
 * is visible before the raised count.
 */
static inline void bio_inc_remaining(struct bio *bio)
{
        bio_set_flag(bio, BIO_CHAIN);
        /* orders the flag store above before the atomic increment below */
        smp_mb__before_atomic();
        atomic_inc(&bio->__bi_remaining);
}

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 *
 * NOTE(review): BIO_POOL_SIZE is presumably the default number of reserved
 * entries for such a pool - confirm against the bioset_init() callers.
 */
#define BIO_POOL_SIZE 2

/* A private set of memory pools for allocating bios and their bio_vecs. */
struct bio_set {
        /* non-NULL once the set is set up; see bioset_initialized() */
        struct kmem_cache *bio_slab;
        unsigned int front_pad;

        mempool_t bio_pool;
        mempool_t bvec_pool;
        /* the integrity pools exist only with CONFIG_BLK_DEV_INTEGRITY */
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        mempool_t bio_integrity_pool;
        mempool_t bvec_integrity_pool;
#endif

        /*
         * Deadlock avoidance for stacking block drivers: see comments in
         * bio_alloc_bioset() for details
         */
        spinlock_t                rescue_lock;
        struct bio_list                rescue_list;
        struct work_struct        rescue_work;
        struct workqueue_struct        *rescue_workqueue;
};

/*
 * Descriptor pairing a kmem_cache with a name and a vector count.
 * NOTE(review): presumably one entry per supported bio_vec array size, with
 * nr_vecs being the number of bio_vecs per object - confirm against the
 * bvec_slabs[] definition.
 */
struct biovec_slab {
        int nr_vecs;
        char *name;
        struct kmem_cache *slab;
};

/* Return true if @bs has a bio_slab assigned, false otherwise. */
static inline bool bioset_initialized(struct bio_set *bs)
{
        if (bs->bio_slab)
                return true;

        return false;
}

/*
 * A small number of entries is fine: this is not performance critical,
 * we basically just need enough to survive.
 */
#define BIO_SPLIT_ENTRIES 2

#if defined(CONFIG_BLK_DEV_INTEGRITY)

/*
 * Iterate over the integrity vectors of payload @bip: for_each_bvec() over
 * (bip)->bip_vec, starting from (bip)->bip_iter. @bvl and @iter are the
 * caller's loop variables.
 */
#define bip_for_each_vec(bvl, bip, iter)                                \
        for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter)

/*
 * Iterate over the integrity vectors of every bio visited by
 * for_each_bio(_bio), running bip_for_each_vec() on each bio's bi_integrity.
 *
 * @_bvl and @_iter are the caller's loop variables. @_bio is both the start
 * of the walk and the cursor that for_each_bio() moves, so it must be a
 * plain lvalue. It is parenthesized where it is dereferenced so that the
 * expansion does not depend on the operator precedence of the argument.
 */
#define bio_for_each_integrity_vec(_bvl, _bio, _iter)                        \
        for_each_bio(_bio)                                                \
                bip_for_each_vec(_bvl, (_bio)->bi_integrity, _iter)

/*
 * Real integrity implementations, defined out of line and visible only with
 * CONFIG_BLK_DEV_INTEGRITY; the #else branch supplies inline stubs with the
 * same names.
 */
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
extern bool bio_integrity_prep(struct bio *);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *);
extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
extern int bioset_integrity_create(struct bio_set *, int);
extern void bioset_integrity_free(struct bio_set *);
extern void bio_integrity_init(void);

#else /* CONFIG_BLK_DEV_INTEGRITY */

/* !CONFIG_BLK_DEV_INTEGRITY stub: a bio never has an integrity payload. */
static inline void *bio_integrity(struct bio *bio)
{
        return NULL;
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: creates nothing and always returns 0. */
static inline int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
        return 0;
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: there is nothing to free. */
static inline void bioset_integrity_free(struct bio_set *bs)
{
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: does nothing and always returns true. */
static inline bool bio_integrity_prep(struct bio *bio)
{
        return true;
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: clones nothing and always returns 0. */
static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
                                      gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: there is no payload to advance. */
static inline void bio_integrity_advance(struct bio *bio,
                                         unsigned int bytes_done)
{
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: there is no payload to trim. */
static inline void bio_integrity_trim(struct bio *bio)
{
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: nothing to initialize. */
static inline void bio_integrity_init(void)
{
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: no payload exists, so no flag is ever set. */
static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
{
        return false;
}

/*
 * !CONFIG_BLK_DEV_INTEGRITY stub: allocation is not available, so the error
 * pointer ERR_PTR(-EINVAL) is always returned (never NULL).
 */
static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
                                        unsigned int nr)
{
        return ERR_PTR(-EINVAL);
}

/* !CONFIG_BLK_DEV_INTEGRITY stub: adds nothing and always returns 0. */
static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
                                        unsigned int len, unsigned int offset)
{
        return 0;
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Mark a bio as polled by setting REQ_HIPRI in its bi_opf; when @kiocb is not
 * a sync kiocb, REQ_NOWAIT is set as well.
 *
 * Note that for async polled IO, the caller must expect -EWOULDBLOCK if we
 * cannot allocate a request (or other resources). We cannot block waiting
 * for requests on polled IO, as those completions must be found by the
 * caller. This is different than IRQ driven IO, where it's safe to wait for
 * IO to complete.
 */
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
        if (is_sync_kiocb(kiocb))
                bio->bi_opf |= REQ_HIPRI;
        else
                bio->bi_opf |= REQ_HIPRI | REQ_NOWAIT;
}

#endif /* __LINUX_BIO_H */


































































    1 




































































    1 

















    1 




















































































































































    1 




    1 

























    1 






    1 























































    1 








    1 















    1 












    1 
    1 































































    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 




    1 


    1 























































    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 







    1 



    1 




    1 








    1 








































































    1 












































    1 



    1 
    1 







    1 

    1 



















    1 

























































































































































































    1 











    1 
    1 















    1 











































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  libata-sff.c - helper library for PCI IDE BMDMA
 *
 *  Copyright 2003-2006 Red Hat, Inc.  All rights reserved.
 *  Copyright 2003-2006 Jeff Garzik
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/driver-api/libata.rst
 *
 *  Hardware documentation available from http://www.t13.org/ and
 *  http://www.sata-io.org/
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/libata.h>
#include <linux/highmem.h>

#include "libata.h"

/*
 * File-private workqueue handle.  Nothing in this part of the file touches
 * it; NOTE(review): presumably it is created at init time and used for
 * deferred PIO work further down -- confirm at its users.
 */
static struct workqueue_struct *ata_sff_wq;

/*
 * Port operations table built from the ata_sff_*() helpers of this
 * library.  It names ata_base_port_ops as its parent through .inherits
 * (NOTE(review): presumably hooks left unset here are taken from the
 * parent -- confirm in the libata core).  Exported GPL-only.
 */
const struct ata_port_operations ata_sff_port_ops = {
        .inherits                = &ata_base_port_ops,

        /* command preparation, issue and result-taskfile readback */
        .qc_prep                = ata_noop_qc_prep,
        .qc_issue                = ata_sff_qc_issue,
        .qc_fill_rtf                = ata_sff_qc_fill_rtf,

        /* freeze/thaw, reset sequence and error handling */
        .freeze                        = ata_sff_freeze,
        .thaw                        = ata_sff_thaw,
        .prereset                = ata_sff_prereset,
        .softreset                = ata_sff_softreset,
        .hardreset                = sata_sff_hardreset,
        .postreset                = ata_sff_postreset,
        .error_handler                = ata_sff_error_handler,

        /* taskfile register access and PIO data movement */
        .sff_dev_select                = ata_sff_dev_select,
        .sff_check_status        = ata_sff_check_status,
        .sff_tf_load                = ata_sff_tf_load,
        .sff_tf_read                = ata_sff_tf_read,
        .sff_exec_command        = ata_sff_exec_command,
        .sff_data_xfer                = ata_sff_data_xfer,
        .sff_drain_fifo                = ata_sff_drain_fifo,

        .lost_interrupt                = ata_sff_lost_interrupt,
};
EXPORT_SYMBOL_GPL(ata_sff_port_ops);

/**
 *        ata_sff_check_status - Fetch the Status register of the selected device
 *        @ap: port the device is attached to
 *
 *        Performs one 8-bit read of the port's taskfile Status register
 *        and hands the value back.  Reading this register also clears
 *        the interrupt pending from the device.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        The raw Status register value.
 */
u8 ata_sff_check_status(struct ata_port *ap)
{
        u8 status = ioread8(ap->ioaddr.status_addr);

        return status;
}
EXPORT_SYMBOL_GPL(ata_sff_check_status);

/**
 *        ata_sff_altstatus - Fetch the Alternate Status register
 *        @ap: port the device is attached to
 *
 *        Returns the alternate status of the currently selected device.
 *        A driver-provided ->sff_check_altstatus() hook takes precedence;
 *        without one the register at ioaddr.altstatus_addr is read
 *        directly.
 *
 *        Note: this helper may NOT itself be installed as the
 *        check_altstatus() entry in ata_port_operations.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static u8 ata_sff_altstatus(struct ata_port *ap)
{
        /* No driver hook: fall back to the plain register read. */
        if (!ap->ops->sff_check_altstatus)
                return ioread8(ap->ioaddr.altstatus_addr);

        return ap->ops->sff_check_altstatus(ap);
}

/**
 *        ata_sff_irq_status - Sample status while hunting for an IRQ source
 *        @ap: port the device is attached to
 *
 *        When an alternate status source exists (driver hook or
 *        altstatus register) it is consulted first, so that a port which
 *        is still busy is reported without clearing shared IRQ status.
 *        Only when the port is not busy, or no alternate status is
 *        available, is the real Status register read through
 *        ->sff_check_status(), which clears the INTRQ latch.  Devices
 *        without a control block do not share interrupt lines.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        The alternate status if it shows BSY, otherwise the Status
 *        register value.
 */
static u8 ata_sff_irq_status(struct ata_port *ap)
{
        int have_alt = ap->ops->sff_check_altstatus ||
                       ap->ioaddr.altstatus_addr;

        if (have_alt) {
                u8 alt = ata_sff_altstatus(ap);

                /* Still busy, so the interrupt is not ours. */
                if (alt & ATA_BUSY)
                        return alt;
        }

        /* Reading Status clears the INTRQ latch. */
        return ap->ops->sff_check_status(ap);
}

/**
 *        ata_sff_sync - Flush posted writes to the port
 *        @ap: port to flush
 *
 *        Issues one alternate status read, through the driver hook when
 *        one is installed and otherwise from the altstatus register.
 *        The value read is discarded.
 *
 *        CAUTION:
 *        With neither a hook nor an altstatus register nothing is read,
 *        so an mmio device with no ctl and no altstatus method would
 *        not be flushed.  No such devices are known to exist.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static void ata_sff_sync(struct ata_port *ap)
{
        if (ap->ops->sff_check_altstatus) {
                ap->ops->sff_check_altstatus(ap);
                return;
        }

        if (!ap->ioaddr.altstatus_addr)
                return;

        ioread8(ap->ioaddr.altstatus_addr);
}

/**
 *        ata_sff_pause - Flush writes, then hold off for 400ns
 *        @ap: port to pause on
 *
 *        Runs ata_sff_sync() and then delays for 400 nanoseconds.
 *
 *        CAUTION:
 *        The flush relies on ata_sff_sync(); an mmio device with no ctl
 *        and no altstatus method would not be flushed.  No such devices
 *        are known to exist.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_pause(struct ata_port *ap)
{
        /* flush first ... */
        ata_sff_sync(ap);

        /* ... then wait */
        ndelay(400);
}
EXPORT_SYMBOL_GPL(ata_sff_pause);

/**
 *        ata_sff_dma_pause - Pause before commencing DMA
 *        @ap: port to pause on
 *
 *        Performs I/O fencing and makes sure enough cycles elapse for
 *        the HDMA1:0 transition by issuing one alternate status read.
 *        A port that offers no way to read alternate status trips BUG().
 */
void ata_sff_dma_pause(struct ata_port *ap)
{
        if (!ap->ops->sff_check_altstatus && !ap->ioaddr.altstatus_addr) {
                /*
                 * There are no DMA controllers without ctl.  BUG here so
                 * the HDMA1:0 transition timing is never violated, which
                 * would risk corruption.
                 */
                BUG();
        } else {
                /*
                 * An altstatus read provides the needed delay without
                 * disturbing the IRQ status.
                 */
                ata_sff_altstatus(ap);
        }
}
EXPORT_SYMBOL_GPL(ata_sff_dma_pause);

/**
 *        ata_sff_busy_sleep - sleep until BSY clears, or timeout
 *        @ap: port containing status register to be polled
 *        @tmout_pat: impatience timeout in msecs
 *        @tmout: overall timeout in msecs
 *
 *        Polls the Status register in two phases, sleeping 50ms between
 *        polls.  The first phase lasts until @tmout_pat expires; if the
 *        port is still busy a warning is logged and polling continues
 *        until @tmout expires.  Both deadlines are measured from the
 *        same starting point.  A status of 0xff ends polling at once.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 once BSY is clear, -ENODEV if status reads 0xff, -EBUSY if
 *        BSY is still set when @tmout expires.
 */
int ata_sff_busy_sleep(struct ata_port *ap,
                       unsigned long tmout_pat, unsigned long tmout)
{
        unsigned long began, deadline;
        u8 st;

        st = ata_sff_busy_wait(ap, ATA_BUSY, 300);
        began = jiffies;

        /* Phase one: wait quietly up to the impatience timeout. */
        deadline = ata_deadline(began, tmout_pat);
        for (;;) {
                if (st == 0xff || !(st & ATA_BUSY))
                        break;
                if (!time_before(jiffies, deadline))
                        break;

                ata_msleep(ap, 50);
                st = ata_sff_busy_wait(ap, ATA_BUSY, 3);
        }

        if (st != 0xff && (st & ATA_BUSY))
                ata_port_warn(ap,
                              "port is slow to respond, please be patient (Status 0x%x)\n",
                              st);

        /* Phase two: keep polling up to the overall timeout. */
        deadline = ata_deadline(began, tmout);
        for (;;) {
                if (st == 0xff || !(st & ATA_BUSY))
                        break;
                if (!time_before(jiffies, deadline))
                        break;

                ata_msleep(ap, 50);
                st = ap->ops->sff_check_status(ap);
        }

        if (st == 0xff)
                return -ENODEV;

        if (!(st & ATA_BUSY))
                return 0;

        ata_port_err(ap,
                     "port failed to respond (%lu secs, Status 0x%x)\n",
                     DIV_ROUND_UP(tmout, 1000), st);
        return -EBUSY;
}
EXPORT_SYMBOL_GPL(ata_sff_busy_sleep);

/*
 * Readiness callback handed to ata_wait_ready(): reads the Status register
 * of @link's port through ->sff_check_status() and returns whatever
 * ata_check_ready() makes of it.
 */
static int ata_sff_check_ready(struct ata_link *link)
{
        return ata_check_ready(link->ap->ops->sff_check_status(link->ap));
}

/**
 *        ata_sff_wait_ready - sleep until BSY clears, or timeout
 *        @link: SFF link to wait ready status for
 *        @deadline: deadline jiffies for the operation
 *
 *        Thin wrapper around ata_wait_ready() that supplies the SFF
 *        status check, ata_sff_check_ready(), as the readiness test.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline)
{
        int rc;

        rc = ata_wait_ready(link, deadline, ata_sff_check_ready);

        return rc;
}
EXPORT_SYMBOL_GPL(ata_sff_wait_ready);

/**
 *        ata_sff_set_devctl - Write the device control register
 *        @ap: port the device is attached to
 *        @ctl: value to write
 *
 *        Hands @ctl to the driver's ->sff_set_devctl() hook when one is
 *        installed; otherwise writes it to ioaddr.ctl_addr.
 *
 *        Note: this helper may NOT itself be installed as the
 *        sff_set_devctl() entry in ata_port_operations.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static void ata_sff_set_devctl(struct ata_port *ap, u8 ctl)
{
        /* No driver hook: write the register directly. */
        if (!ap->ops->sff_set_devctl) {
                iowrite8(ctl, ap->ioaddr.ctl_addr);
                return;
        }

        ap->ops->sff_set_devctl(ap, ctl);
}

/**
 *        ata_sff_dev_select - Select device 0/1 on ATA bus
 *        @ap: ATA channel to manipulate
 *        @device: ATA device (numbered from zero) to select
 *
 *        Selects a device the way the ATA specification describes: the
 *        Device register is written with the obsolete bits set, plus the
 *        DEV1 bit for any non-zero @device.  Works with both PIO and
 *        MMIO.
 *
 *        May be used as the dev_select() entry in ata_port_operations.
 *
 *        LOCKING:
 *        caller.
 */
void ata_sff_dev_select(struct ata_port *ap, unsigned int device)
{
        u8 devsel = ATA_DEVICE_OBS;

        if (device != 0)
                devsel |= ATA_DEV1;

        iowrite8(devsel, ap->ioaddr.device_addr);

        /* The pause is required; on mmio it also flushes the write. */
        ata_sff_pause(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_dev_select);

/**
 *        ata_dev_select - Select device 0/1 on ATA bus
 *        @ap: ATA channel to manipulate
 *        @device: ATA device (numbered from zero) to select
 *        @wait: non-zero to wait for Status register BSY bit to clear
 *        @can_sleep: non-zero if context allows sleeping
 *
 *        High-level counterpart of ata_sff_dev_select().  The selection
 *        itself goes through ->sff_dev_select(); when @wait is set the
 *        port is additionally waited idle before and after it, with a
 *        150ms sleep in between for ATAPI devices if @can_sleep allows.
 *
 *        LOCKING:
 *        caller.
 */
static void ata_dev_select(struct ata_port *ap, unsigned int device,
                           unsigned int wait, unsigned int can_sleep)
{
        if (ata_msg_probe(ap))
                ata_port_info(ap, "ata_dev_select: ENTER, device %u, wait %u\n",
                              device, wait);

        /* Fast path: bare selection with no status polling around it. */
        if (!wait) {
                ap->ops->sff_dev_select(ap, device);
                return;
        }

        ata_wait_idle(ap);
        ap->ops->sff_dev_select(ap, device);

        if (can_sleep && ap->link.device[device].class == ATA_DEV_ATAPI)
                ata_msleep(ap, 150);

        ata_wait_idle(ap);
}

/**
 *        ata_sff_irq_on - Enable interrupts on a port.
 *        @ap: Port on which interrupts are enabled.
 *
 *        If the driver supplies ->sff_irq_on() that hook does all the
 *        work.  Otherwise nIEN is cleared in the cached control value,
 *        the result is written to the device control register when it
 *        is reachable, the port is waited idle, and pending interrupts
 *        are cleared through ->sff_irq_clear() if present.
 *
 *        Note: may NOT be used as the sff_irq_on() entry in
 *        ata_port_operations.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_irq_on(struct ata_port *ap)
{
        struct ata_ioports *regs = &ap->ioaddr;

        /* A driver-supplied hook replaces the generic sequence. */
        if (ap->ops->sff_irq_on) {
                ap->ops->sff_irq_on(ap);
                return;
        }

        /* Clear nIEN in the cached control value ... */
        ap->ctl &= ~ATA_NIEN;
        ap->last_ctl = ap->ctl;

        /* ... and push it out if the register can be reached at all. */
        if (ap->ops->sff_set_devctl || regs->ctl_addr)
                ata_sff_set_devctl(ap, ap->ctl);

        ata_wait_idle(ap);

        if (!ap->ops->sff_irq_clear)
                return;

        ap->ops->sff_irq_clear(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_irq_on);

/**
 *        ata_sff_tf_load - send taskfile registers to host controller
 *        @ap: Port to which output is sent
 *        @tf: ATA taskfile register set
 *
 *        Writes @tf out to a standard ATA host controller, in order:
 *        the device control value if it differs from the cached one,
 *        the address registers when ATA_TFLAG_ISADDR is set (the
 *        high-order bytes first for LBA48), and the Device register
 *        when ATA_TFLAG_DEVICE is set.  Ends by waiting for the port to
 *        go idle.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf)
{
        struct ata_ioports *regs = &ap->ioaddr;

        /* Refresh device control only when it differs from the cache. */
        if (tf->ctl != ap->last_ctl) {
                if (regs->ctl_addr)
                        iowrite8(tf->ctl, regs->ctl_addr);
                ap->last_ctl = tf->ctl;
                ata_wait_idle(ap);
        }

        if (tf->flags & ATA_TFLAG_ISADDR) {
                /* LBA48: the high-order bytes are written first. */
                if (tf->flags & ATA_TFLAG_LBA48) {
                        WARN_ON_ONCE(!regs->ctl_addr);
                        iowrite8(tf->hob_feature, regs->feature_addr);
                        iowrite8(tf->hob_nsect, regs->nsect_addr);
                        iowrite8(tf->hob_lbal, regs->lbal_addr);
                        iowrite8(tf->hob_lbam, regs->lbam_addr);
                        iowrite8(tf->hob_lbah, regs->lbah_addr);
                        VPRINTK("hob: feat 0x%X nsect 0x%X, lba 0x%X 0x%X 0x%X\n",
                                tf->hob_feature, tf->hob_nsect,
                                tf->hob_lbal, tf->hob_lbam, tf->hob_lbah);
                }

                iowrite8(tf->feature, regs->feature_addr);
                iowrite8(tf->nsect, regs->nsect_addr);
                iowrite8(tf->lbal, regs->lbal_addr);
                iowrite8(tf->lbam, regs->lbam_addr);
                iowrite8(tf->lbah, regs->lbah_addr);
                VPRINTK("feat 0x%X nsect 0x%X lba 0x%X 0x%X 0x%X\n",
                        tf->feature, tf->nsect,
                        tf->lbal, tf->lbam, tf->lbah);
        }

        if (tf->flags & ATA_TFLAG_DEVICE) {
                iowrite8(tf->device, regs->device_addr);
                VPRINTK("device 0x%X\n", tf->device);
        }

        ata_wait_idle(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_tf_load);

/**
 *        ata_sff_tf_read - input device's ATA taskfile shadow registers
 *        @ap: Port from which input is read
 *        @tf: ATA taskfile register set for storing input
 *
 *        Reads the taskfile registers of the currently selected device
 *        into @tf.  For LBA48 taskfiles the high-order bytes are fetched
 *        as well, by setting HOB in the device control register around
 *        the reads and restoring @tf->ctl afterwards.
 *
 *        This assumes a fully SFF compliant taskfile layout and
 *        behaviour.  If your device differs (e.g. it has a different
 *        status method) you will need to provide a replacement tf_read.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
{
        struct ata_ioports *regs = &ap->ioaddr;

        tf->command = ata_sff_check_status(ap);
        tf->feature = ioread8(regs->error_addr);
        tf->nsect = ioread8(regs->nsect_addr);
        tf->lbal = ioread8(regs->lbal_addr);
        tf->lbam = ioread8(regs->lbam_addr);
        tf->lbah = ioread8(regs->lbah_addr);
        tf->device = ioread8(regs->device_addr);

        /* Only LBA48 taskfiles carry high-order bytes. */
        if (!(tf->flags & ATA_TFLAG_LBA48))
                return;

        /* HOB can only be set through the control register. */
        if (!likely(regs->ctl_addr)) {
                WARN_ON_ONCE(1);
                return;
        }

        iowrite8(tf->ctl | ATA_HOB, regs->ctl_addr);
        tf->hob_feature = ioread8(regs->error_addr);
        tf->hob_nsect = ioread8(regs->nsect_addr);
        tf->hob_lbal = ioread8(regs->lbal_addr);
        tf->hob_lbam = ioread8(regs->lbam_addr);
        tf->hob_lbah = ioread8(regs->lbah_addr);

        /* Drop HOB again and keep the cached control value in step. */
        iowrite8(tf->ctl, regs->ctl_addr);
        ap->last_ctl = tf->ctl;
}
EXPORT_SYMBOL_GPL(ata_sff_tf_read);

/**
 *        ata_sff_exec_command - issue ATA command to host controller
 *        @ap: port to which command is being issued
 *        @tf: ATA taskfile register set
 *
 *        Writes @tf->command to the port's Command register and then
 *        pauses via ata_sff_pause().  Issued with proper
 *        synchronization with the interrupt handler / other threads.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_sff_exec_command(struct ata_port *ap, const struct ata_taskfile *tf)
{
        DPRINTK("ata%u: cmd 0x%X\n", ap->print_id, tf->command);

        /* Writing the Command register issues the command ... */
        iowrite8(tf->command, ap->ioaddr.command_addr);

        /* ... followed by the flush-and-delay helper. */
        ata_sff_pause(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_exec_command);

/**
 *        ata_tf_to_host - issue ATA taskfile to host controller
 *        @ap: port to which command is being issued
 *        @tf: ATA taskfile register set
 *
 *        Loads the taskfile registers through ->sff_tf_load() and then
 *        issues the command through ->sff_exec_command(), with proper
 *        synchronization with the interrupt handler and other threads.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static inline void ata_tf_to_host(struct ata_port *ap,
                                  const struct ata_taskfile *tf)
{
        /* registers first ... */
        ap->ops->sff_tf_load(ap, tf);

        /* ... then the command itself */
        ap->ops->sff_exec_command(ap, tf);
}

/**
 *        ata_sff_data_xfer - Transfer data by PIO
 *        @qc: queued command
 *        @buf: data buffer
 *        @buflen: buffer length
 *        @rw: read/write
 *
 *        Moves @buflen bytes between @buf and the device data register
 *        using 16-bit accesses.  An odd trailing byte is transferred as
 *        one more 16-bit word through a zero-filled two-byte bounce
 *        buffer.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        Bytes consumed, i.e. @buflen rounded up to an even count.
 */
unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, unsigned char *buf,
                               unsigned int buflen, int rw)
{
        struct ata_port *ap = qc->dev->link->ap;
        void __iomem *data_addr = ap->ioaddr.data_addr;
        unsigned int nwords = buflen / 2;

        /* Bulk of the data: whole 16-bit words. */
        if (rw != READ)
                iowrite16_rep(data_addr, buf, nwords);
        else
                ioread16_rep(data_addr, buf, nwords);

        /* Odd length: one more word carries the last byte. */
        if (unlikely(buflen & 0x01)) {
                unsigned char bounce[2] = { };
                unsigned char *last = buf + (buflen - 1);

                /*
                 * The io*16_rep() accessors are used here too, which
                 * avoids pointlessly swapping bytes back and forth on
                 * big endian machines.
                 */
                if (rw != READ) {
                        bounce[0] = *last;
                        iowrite16_rep(data_addr, bounce, 1);
                } else {
                        ioread16_rep(data_addr, bounce, 1);
                        *last = bounce[0];
                }
                nwords++;
        }

        return nwords * 2;
}
EXPORT_SYMBOL_GPL(ata_sff_data_xfer);

/**
 *        ata_sff_data_xfer32 - Transfer data by PIO
 *        @qc: queued command
 *        @buf: data buffer
 *        @buflen: buffer length
 *        @rw: read/write
 *
 *        Moves @buflen bytes between @buf and the device data register
 *        using 32-bit accesses.  Ports without ATA_PFLAG_PIO32 set are
 *        handed to ata_sff_data_xfer() instead.  One to three trailing
 *        bytes go through a zero-filled four-byte bounce buffer, as a
 *        16-bit access for fewer than three bytes and a 32-bit access
 *        otherwise.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        Bytes consumed.
 */

unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf,
                               unsigned int buflen, int rw)
{
        struct ata_port *ap = qc->dev->link->ap;
        void __iomem *data_addr = ap->ioaddr.data_addr;
        unsigned int ndwords = buflen >> 2;
        int tail_len = buflen & 3;

        /* 32-bit PIO not enabled on this port: use the 16-bit path. */
        if (!(ap->pflags & ATA_PFLAG_PIO32))
                return ata_sff_data_xfer(qc, buf, buflen, rw);

        /* Bulk of the data: whole 32-bit words. */
        if (rw != READ)
                iowrite32_rep(data_addr, buf, ndwords);
        else
                ioread32_rep(data_addr, buf, ndwords);

        /* Leftover bytes, if any, go through the bounce buffer. */
        if (unlikely(tail_len)) {
                unsigned char bounce[4] = { };
                unsigned char *tail = buf + (buflen - tail_len);

                /*
                 * The io*_rep() accessors are used here too, which avoids
                 * pointlessly swapping bytes back and forth on big endian
                 * machines.
                 */
                if (rw != READ) {
                        memcpy(bounce, tail, tail_len);
                        if (tail_len < 3)
                                iowrite16_rep(data_addr, bounce, 1);
                        else
                                iowrite32_rep(data_addr, bounce, 1);
                } else {
                        if (tail_len < 3)
                                ioread16_rep(data_addr, bounce, 1);
                        else
                                ioread32_rep(data_addr, bounce, 1);
                        memcpy(tail, bounce, tail_len);
                }
        }

        return (buflen + 1) & ~1;
}
EXPORT_SYMBOL_GPL(ata_sff_data_xfer32);

/*
 * Transfer @xfer_size bytes starting @offset bytes into @page by PIO.
 * The page is mapped with kmap_atomic() only for the duration of the
 * ->sff_data_xfer() call; the direction comes from ATA_TFLAG_WRITE in the
 * command's taskfile.  After a read into a non-slab page the data cache
 * for that page is flushed.
 */
static void ata_pio_xfer(struct ata_queued_cmd *qc, struct page *page,
                unsigned int offset, size_t xfer_size)
{
        bool writing = (qc->tf.flags & ATA_TFLAG_WRITE);
        unsigned char *vaddr = kmap_atomic(page);

        qc->ap->ops->sff_data_xfer(qc, vaddr + offset, xfer_size, writing);
        kunmap_atomic(vaddr);

        /* Only data read into a non-slab page needs the dcache flush. */
        if (writing || PageSlab(page))
                return;

        flush_dcache_page(page);
}

/**
 *        ata_pio_sector - Transfer a sector of data.
 *        @qc: Command on going
 *
 *        Moves qc->sect_size bytes between the ATA device and the
 *        current scatterlist position, then advances that position.
 *        The HSM state becomes HSM_ST_LAST when this is the final
 *        sector or when the scatterlist runs out.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static void ata_pio_sector(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct page *page;
        unsigned int offset;

        /* No scatterlist entry left: count the whole request as done. */
        if (!qc->cursg) {
                qc->curbytes = qc->nbytes;
                return;
        }

        /* Exactly one sector remains, so this transfer is the last. */
        if (qc->curbytes == qc->nbytes - qc->sect_size)
                ap->hsm_task_state = HSM_ST_LAST;

        /* Locate the page under the current position and the offset in it. */
        offset = qc->cursg->offset + qc->cursg_ofs;
        page = nth_page(sg_page(qc->cursg), (offset >> PAGE_SHIFT));
        offset %= PAGE_SIZE;

        DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");

        /*
         * A sector that straddles a page boundary is moved in two pieces.
         * The split point still has to be dword aligned, like all ATA
         * data transfers.
         */
        WARN_ON_ONCE(offset % 4);
        if (offset + qc->sect_size <= PAGE_SIZE) {
                ata_pio_xfer(qc, page, offset, qc->sect_size);
        } else {
                unsigned int first_len = PAGE_SIZE - offset;

                ata_pio_xfer(qc, page, offset, first_len);
                ata_pio_xfer(qc, nth_page(page, 1), 0,
                             qc->sect_size - first_len);
        }

        qc->curbytes += qc->sect_size;
        qc->cursg_ofs += qc->sect_size;

        /* Still inside the current scatterlist entry: nothing more to do. */
        if (qc->cursg_ofs != qc->cursg->length)
                return;

        /* Entry consumed: step to the next one. */
        qc->cursg = sg_next(qc->cursg);
        qc->cursg_ofs = 0;
        if (!qc->cursg)
                ap->hsm_task_state = HSM_ST_LAST;
}

/**
 *        ata_pio_sectors - Transfer one or many sectors.
 *        @qc: Command on going
 *
 *        Services one DRQ request.  A READ/WRITE MULTIPLE taskfile moves
 *        as many sectors as remain in the command, capped at the
 *        device's multi_count; any other taskfile moves a single sector.
 *        The port is flushed afterwards.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static void ata_pio_sectors(struct ata_queued_cmd *qc)
{
        unsigned int nsect = 1;

        if (is_multi_taskfile(&qc->tf)) {
                /* READ/WRITE MULTIPLE */
                WARN_ON_ONCE(qc->dev->multi_count == 0);

                nsect = min((qc->nbytes - qc->curbytes) / qc->sect_size,
                            qc->dev->multi_count);
        }

        for (; nsect; nsect--)
                ata_pio_sector(qc);

        ata_sff_sync(qc->ap); /* flush */
}

/**
 *        atapi_send_cdb - Write CDB bytes to hardware
 *        @ap: Port to which ATAPI device is attached.
 *        @qc: Taskfile currently active
 *
 *        Called once the device has signalled that it will accept a
 *        CDB.  Writes the CDB through ->sff_data_xfer(), flushes, and
 *        sets the next HSM state according to the taskfile protocol.
 *        For ATAPI DMA it also starts the bus master engine.  An
 *        unexpected protocol trips BUG().
 *
 *        LOCKING:
 *        caller.
 */
static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
{
        /* send SCSI cdb */
        DPRINTK("send cdb\n");
        WARN_ON_ONCE(qc->dev->cdb_len < 12);

        ap->ops->sff_data_xfer(qc, qc->cdb, qc->dev->cdb_len, 1);
        ata_sff_sync(ap);

        /*
         * FIXME: If the CDB is for DMA, is the transition delay needed
         * here or is bmdma_start guaranteed to provide it?
         */
        switch (qc->tf.protocol) {
        case ATAPI_PROT_NODATA:
                ap->hsm_task_state = HSM_ST_LAST;
                break;
#ifdef CONFIG_ATA_BMDMA
        case ATAPI_PROT_DMA:
                ap->hsm_task_state = HSM_ST_LAST;
                /* initiate bmdma */
                ap->ops->bmdma_start(qc);
                break;
#endif /* CONFIG_ATA_BMDMA */
        case ATAPI_PROT_PIO:
                ap->hsm_task_state = HSM_ST;
                break;
        default:
                BUG();
        }
}

/**
 *        __atapi_pio_bytes - Transfer data from/to the ATAPI device.
 *        @qc: Command on going
 *        @bytes: number of bytes
 *
 *        Moves @bytes bytes between the ATAPI device and the command's
 *        scatterlist.  Each pass transfers one chunk that stays within
 *        the current scatterlist entry and within a single page, and
 *        passes repeat until @bytes is used up.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        0 on success, -1 if the scatterlist runs out while bytes are
 *        still outstanding (a description is pushed to the EH info).
 */
static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
{
        int rw = (qc->tf.flags & ATA_TFLAG_WRITE) ? WRITE : READ;
        struct ata_port *ap = qc->ap;
        struct ata_device *dev = qc->dev;
        struct ata_eh_info *ehi = &dev->link->eh_info;

        do {
                struct scatterlist *sg = qc->cursg;
                struct page *page;
                unsigned char *vaddr;
                unsigned int pg_off, chunk, consumed;

                if (unlikely(!sg)) {
                        ata_ehi_push_desc(ehi, "unexpected or too much trailing data "
                                          "buf=%u cur=%u bytes=%u",
                                          qc->nbytes, qc->curbytes, bytes);
                        return -1;
                }

                /* Locate the page under the current position. */
                pg_off = sg->offset + qc->cursg_ofs;
                page = nth_page(sg_page(sg), (pg_off >> PAGE_SHIFT));
                pg_off %= PAGE_SIZE;

                /* Stay inside the current sg entry ... */
                chunk = min(sg->length - qc->cursg_ofs, bytes);

                /* ... and inside the current page. */
                chunk = min(chunk, (unsigned int)PAGE_SIZE - pg_off);

                DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");

                /* The actual transfer, with the page mapped only meanwhile. */
                vaddr = kmap_atomic(page);
                consumed = ap->ops->sff_data_xfer(qc, vaddr + pg_off, chunk, rw);
                kunmap_atomic(vaddr);

                bytes -= min(bytes, consumed);
                qc->curbytes += chunk;
                qc->cursg_ofs += chunk;

                if (qc->cursg_ofs == sg->length) {
                        qc->cursg = sg_next(qc->cursg);
                        qc->cursg_ofs = 0;
                }

                /*
                 * A WARN_ON_ONCE(qc->cursg && count != consumed) used to
                 * live here.  This function cannot do that check correctly
                 * because it does not know whether it is serving the last
                 * request.  A proper sanity check is still missing.
                 */
        } while (bytes);

        return 0;
}

/**
 *        atapi_pio_bytes - Transfer data from/to the ATAPI device.
 *        @qc: Command on going
 *
 *        Reads the taskfile to learn the interrupt reason and byte
 *        count, checks both against what the command expects, and then
 *        moves that many bytes via __atapi_pio_bytes().  A failed check
 *        or transfer flags the command with AC_ERR_HSM and puts the HSM
 *        into HSM_ST_ERR.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
static void atapi_pio_bytes(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct ata_device *dev = qc->dev;
        struct ata_eh_info *ehi = &dev->link->eh_info;
        unsigned int ireason, bytes;
        int dev_expects_write;
        int host_writes = (qc->tf.flags & ATA_TFLAG_WRITE) ? 1 : 0;

        /*
         * qc->result_tf is borrowed as scratch space for the intermediate
         * taskfile, which saves some kernel stack.  On normal completion
         * qc->result_tf is not relevant, and on error it is later
         * overwritten by ata_qc_complete(), so its correctness is not
         * affected.
         */
        ap->ops->sff_tf_read(ap, &qc->result_tf);
        ireason = qc->result_tf.nsect;
        bytes = (qc->result_tf.lbah << 8) | qc->result_tf.lbam;

        /* CoD shall be cleared to zero, indicating xfer of data. */
        if (unlikely(ireason & ATAPI_COD))
                goto atapi_check;

        /* The direction the device reports must match the command's. */
        dev_expects_write = !(ireason & ATAPI_IO);
        if (unlikely(host_writes != dev_expects_write))
                goto atapi_check;

        if (unlikely(!bytes))
                goto atapi_check;

        if (!unlikely(__atapi_pio_bytes(qc, bytes))) {
                ata_sff_sync(ap); /* flush */
                return;
        }
        goto hsm_error;

 atapi_check:
        ata_ehi_push_desc(ehi, "ATAPI check failed (ireason=0x%x bytes=%u)",
                          ireason, bytes);
 hsm_error:
        qc->err_mask |= AC_ERR_HSM;
        ap->hsm_task_state = HSM_ST_ERR;
}

/**
 *        ata_hsm_ok_in_wq - Check if the qc can be handled in the workqueue.
 *        @ap: the target ata_port
 *        @qc: qc on going
 *
 *        A polled command is always acceptable.  Otherwise the qc is
 *        only acceptable while the HSM is in HSM_ST_FIRST and either
 *        a PIO write is about to send its first data block or an
 *        ATAPI device without CDB interrupt is waiting for its CDB.
 *
 *        RETURNS:
 *        1 if ok in workqueue, 0 otherwise.
 */
static inline int ata_hsm_ok_in_wq(struct ata_port *ap,
                                                struct ata_queued_cmd *qc)
{
        /* polled commands always run from the workqueue */
        if (qc->tf.flags & ATA_TFLAG_POLLING)
                return 1;

        /* everything below only applies to the first HSM step */
        if (ap->hsm_task_state != HSM_ST_FIRST)
                return 0;

        /* PIO data-out: first data block */
        if (qc->tf.protocol == ATA_PROT_PIO &&
            (qc->tf.flags & ATA_TFLAG_WRITE))
                return 1;

        /* ATAPI device that does not raise an interrupt for the CDB */
        if (ata_is_atapi(qc->tf.protocol) &&
            !(qc->dev->flags & ATA_DFLAG_CDB_INTR))
                return 1;

        return 0;
}

/**
 *        ata_hsm_qc_complete - finish a qc running on standard HSM
 *        @qc: Command to complete
 *        @in_wq: 1 if called from workqueue, 0 otherwise
 *
 *        Finish @qc which is running on standard HSM.  With an
 *        ->error_handler present, a qc carrying AC_ERR_HSM freezes the
 *        port instead of being completed.  When called from the
 *        workqueue, interrupts are turned back on via ata_sff_irq_on()
 *        before a normal completion.
 *
 *        LOCKING:
 *        Inherited from caller.  This function takes no lock itself;
 *        its caller ata_sff_hsm_move() asserts that ap->lock is held.
 */
static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
{
        struct ata_port *ap = qc->ap;

        /* no ->error_handler: always complete the command */
        if (!ap->ops->error_handler) {
                if (in_wq)
                        ata_sff_irq_on(ap);
                ata_qc_complete(qc);
                return;
        }

        if (in_wq) {
                /* EH might have kicked in while host lock is
                 * released, so look the qc up again by tag and
                 * bail out if it is gone.
                 */
                qc = ata_qc_from_tag(ap, qc->tag);
                if (!qc)
                        return;
        }

        /* HSM violation: leave it to EH with the port frozen */
        if (unlikely(qc->err_mask & AC_ERR_HSM)) {
                ata_port_freeze(ap);
                return;
        }

        if (in_wq)
                ata_sff_irq_on(ap);
        ata_qc_complete(qc);
}

/**
 *        ata_sff_hsm_move - move the HSM to the next state.
 *        @ap: the target ata_port
 *        @qc: qc on going
 *        @status: current device status
 *        @in_wq: 1 if called from workqueue, 0 otherwise
 *
 *        Advance the SFF host state machine of @ap according to
 *        ap->hsm_task_state and @status.  A single call may walk
 *        through several states (via the fsm_start label) until it
 *        either has to wait for the device or completes @qc through
 *        ata_hsm_qc_complete().
 *
 *        LOCKING:
 *        ap->lock must be held (checked with lockdep_assert_held()).
 *
 *        RETURNS:
 *        Non-zero when poll next status needed, 0 otherwise.
 */
int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
                     u8 status, int in_wq)
{
        struct ata_link *link = qc->dev->link;
        struct ata_eh_info *ehi = &link->eh_info;
        int poll_next;

        lockdep_assert_held(ap->lock);

        WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0);

        /* Make sure ata_sff_qc_issue() does not throw things
         * like DMA polling into the workqueue. Notice that
         * in_wq is not equivalent to (qc->tf.flags & ATA_TFLAG_POLLING).
         */
        WARN_ON_ONCE(in_wq != ata_hsm_ok_in_wq(ap, qc));

        /* Re-entered whenever a state handler changes hsm_task_state
         * (or refreshes status) and wants the new state processed
         * immediately.
         */
fsm_start:
        DPRINTK("ata%u: protocol %d task_state %d (dev_stat 0x%X)\n",
                ap->print_id, qc->tf.protocol, ap->hsm_task_state, status);

        switch (ap->hsm_task_state) {
        case HSM_ST_FIRST:
                /* Send first data block or PACKET CDB */

                /* If polling, we will stay in the work queue after
                 * sending the data. Otherwise, interrupt handler
                 * takes over after sending the data.
                 */
                poll_next = (qc->tf.flags & ATA_TFLAG_POLLING);

                /* check device status */
                if (unlikely((status & ATA_DRQ) == 0)) {
                        /* handle BSY=0, DRQ=0 as error */
                        if (likely(status & (ATA_ERR | ATA_DF)))
                                /* device stops HSM for abort/error */
                                qc->err_mask |= AC_ERR_DEV;
                        else {
                                /* HSM violation. Let EH handle this */
                                ata_ehi_push_desc(ehi,
                                        "ST_FIRST: !(DRQ|ERR|DF)");
                                qc->err_mask |= AC_ERR_HSM;
                        }

                        ap->hsm_task_state = HSM_ST_ERR;
                        goto fsm_start;
                }

                /* Device should not ask for data transfer (DRQ=1)
                 * when it finds something wrong.
                 * We ignore DRQ here and stop the HSM by
                 * changing hsm_task_state to HSM_ST_ERR and
                 * let the EH abort the command or reset the device.
                 */
                if (unlikely(status & (ATA_ERR | ATA_DF))) {
                        /* Some ATAPI tape drives forget to clear the ERR bit
                         * when doing the next command (mostly request sense).
                         * We ignore ERR here to workaround and proceed sending
                         * the CDB.
                         */
                        if (!(qc->dev->horkage & ATA_HORKAGE_STUCK_ERR)) {
                                ata_ehi_push_desc(ehi, "ST_FIRST: "
                                        "DRQ=1 with device error, "
                                        "dev_stat 0x%X", status);
                                qc->err_mask |= AC_ERR_HSM;
                                ap->hsm_task_state = HSM_ST_ERR;
                                goto fsm_start;
                        }
                }

                if (qc->tf.protocol == ATA_PROT_PIO) {
                        /* PIO data out protocol.
                         * send first data block.
                         */

                        /* ata_pio_sectors() might change the state
                         * to HSM_ST_LAST. so, the state is changed here
                         * before ata_pio_sectors().
                         */
                        ap->hsm_task_state = HSM_ST;
                        ata_pio_sectors(qc);
                } else
                        /* send CDB */
                        atapi_send_cdb(ap, qc);

                /* if polling, ata_sff_pio_task() handles the rest.
                 * otherwise, interrupt handler takes over from here.
                 */
                break;

        case HSM_ST:
                /* complete command or read/write the data register */
                if (qc->tf.protocol == ATAPI_PROT_PIO) {
                        /* ATAPI PIO protocol */
                        if ((status & ATA_DRQ) == 0) {
                                /* No more data to transfer or device error.
                                 * Device error will be tagged in HSM_ST_LAST.
                                 */
                                ap->hsm_task_state = HSM_ST_LAST;
                                goto fsm_start;
                        }

                        /* Device should not ask for data transfer (DRQ=1)
                         * when it finds something wrong.
                         * We ignore DRQ here and stop the HSM by
                         * changing hsm_task_state to HSM_ST_ERR and
                         * let the EH abort the command or reset the device.
                         */
                        if (unlikely(status & (ATA_ERR | ATA_DF))) {
                                ata_ehi_push_desc(ehi, "ST-ATAPI: "
                                        "DRQ=1 with device error, "
                                        "dev_stat 0x%X", status);
                                qc->err_mask |= AC_ERR_HSM;
                                ap->hsm_task_state = HSM_ST_ERR;
                                goto fsm_start;
                        }

                        atapi_pio_bytes(qc);

                        if (unlikely(ap->hsm_task_state == HSM_ST_ERR))
                                /* bad ireason reported by device */
                                goto fsm_start;

                } else {
                        /* ATA PIO protocol */
                        if (unlikely((status & ATA_DRQ) == 0)) {
                                /* handle BSY=0, DRQ=0 as error */
                                if (likely(status & (ATA_ERR | ATA_DF))) {
                                        /* device stops HSM for abort/error */
                                        qc->err_mask |= AC_ERR_DEV;

                                        /* If diagnostic failed and this is
                                         * IDENTIFY, it's likely a phantom
                                         * device.  Mark hint.
                                         */
                                        if (qc->dev->horkage &
                                            ATA_HORKAGE_DIAGNOSTIC)
                                                qc->err_mask |=
                                                        AC_ERR_NODEV_HINT;
                                } else {
                                        /* HSM violation. Let EH handle this.
                                         * Phantom devices also trigger this
                                         * condition.  Mark hint.
                                         */
                                        ata_ehi_push_desc(ehi, "ST-ATA: "
                                                "DRQ=0 without device error, "
                                                "dev_stat 0x%X", status);
                                        qc->err_mask |= AC_ERR_HSM |
                                                        AC_ERR_NODEV_HINT;
                                }

                                ap->hsm_task_state = HSM_ST_ERR;
                                goto fsm_start;
                        }

                        /* For PIO reads, some devices may ask for
                         * data transfer (DRQ=1) along with ERR=1.
                         * We respect DRQ here and transfer one
                         * block of junk data before changing the
                         * hsm_task_state to HSM_ST_ERR.
                         *
                         * For PIO writes, ERR=1 DRQ=1 doesn't make
                         * sense since the data block has been
                         * transferred to the device.
                         */
                        if (unlikely(status & (ATA_ERR | ATA_DF))) {
                                /* data might be corrupted */
                                qc->err_mask |= AC_ERR_DEV;

                                if (!(qc->tf.flags & ATA_TFLAG_WRITE)) {
                                        ata_pio_sectors(qc);
                                        status = ata_wait_idle(ap);
                                }

                                if (status & (ATA_BUSY | ATA_DRQ)) {
                                        ata_ehi_push_desc(ehi, "ST-ATA: "
                                                "BUSY|DRQ persists on ERR|DF, "
                                                "dev_stat 0x%X", status);
                                        qc->err_mask |= AC_ERR_HSM;
                                }

                                /* There are oddball controllers with
                                 * status register stuck at 0x7f and
                                 * lbal/m/h at zero which makes it
                                 * pass all other presence detection
                                 * mechanisms we have.  Set NODEV_HINT
                                 * for it.  Kernel bz#7241.
                                 */
                                if (status == 0x7f)
                                        qc->err_mask |= AC_ERR_NODEV_HINT;

                                /* ata_pio_sectors() might change the
                                 * state to HSM_ST_LAST. so, the state
                                 * is changed after ata_pio_sectors().
                                 */
                                ap->hsm_task_state = HSM_ST_ERR;
                                goto fsm_start;
                        }

                        ata_pio_sectors(qc);

                        if (ap->hsm_task_state == HSM_ST_LAST &&
                            (!(qc->tf.flags & ATA_TFLAG_WRITE))) {
                                /* all data read */
                                status = ata_wait_idle(ap);
                                goto fsm_start;
                        }
                }

                /* more data (or the final status) is still expected */
                poll_next = 1;
                break;

        case HSM_ST_LAST:
                if (unlikely(!ata_ok(status))) {
                        qc->err_mask |= __ac_err_mask(status);
                        ap->hsm_task_state = HSM_ST_ERR;
                        goto fsm_start;
                }

                /* no more data to transfer */
                DPRINTK("ata%u: dev %u command complete, drv_stat 0x%x\n",
                        ap->print_id, qc->dev->devno, status);

                WARN_ON_ONCE(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM));

                ap->hsm_task_state = HSM_ST_IDLE;

                /* complete taskfile transaction */
                ata_hsm_qc_complete(qc, in_wq);

                poll_next = 0;
                break;

        case HSM_ST_ERR:
                /* err_mask was already set by whoever moved us here */
                ap->hsm_task_state = HSM_ST_IDLE;

                /* complete taskfile transaction */
                ata_hsm_qc_complete(qc, in_wq);

                poll_next = 0;
                break;
        default:
                /* any other state (including HSM_ST_IDLE) is unexpected
                 * here: warn and stop polling.
                 */
                poll_next = 0;
                WARN(true, "ata%d: SFF host state machine in invalid state %d",
                     ap->print_id, ap->hsm_task_state);
        }

        return poll_next;
}
EXPORT_SYMBOL_GPL(ata_sff_hsm_move);

/**
 *        ata_sff_queue_work - queue work on the SFF workqueue
 *        @work: work item to queue
 *
 *        Queue @work on ata_sff_wq.  The return value of queue_work()
 *        is not propagated to the caller.
 */
void ata_sff_queue_work(struct work_struct *work)
{
        queue_work(ata_sff_wq, work);
}
EXPORT_SYMBOL_GPL(ata_sff_queue_work);

/**
 *        ata_sff_queue_delayed_work - queue delayed work on the SFF workqueue
 *        @dwork: delayed work item to queue
 *        @delay: delay passed through to queue_delayed_work()
 *
 *        Queue @dwork on ata_sff_wq after @delay.  The return value of
 *        queue_delayed_work() is not propagated to the caller.
 */
void ata_sff_queue_delayed_work(struct delayed_work *dwork, unsigned long delay)
{
        queue_delayed_work(ata_sff_wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(ata_sff_queue_delayed_work);

/**
 *        ata_sff_queue_pio_task - schedule the PIO task for a link
 *        @link: link the PIO task should work on
 *        @delay: delay in milliseconds before the task runs
 *
 *        Record @link in ap->sff_pio_task_link and queue the port's
 *        sff_pio_task on the SFF workqueue.  Warns if a different link
 *        is already recorded for the port.
 */
void ata_sff_queue_pio_task(struct ata_link *link, unsigned long delay)
{
        struct ata_port *ap = link->ap;

        /* only one link per port may have a PIO task pending */
        WARN_ON((ap->sff_pio_task_link != NULL) &&
                (ap->sff_pio_task_link != link));
        ap->sff_pio_task_link = link;

        /* may fail if ata_sff_flush_pio_task() in progress */
        ata_sff_queue_delayed_work(&ap->sff_pio_task, msecs_to_jiffies(delay));
}
EXPORT_SYMBOL_GPL(ata_sff_queue_pio_task);

/**
 *        ata_sff_flush_pio_task - cancel the PIO task and idle the HSM
 *        @ap: port whose PIO task is to be flushed
 *
 *        Synchronously cancel ap->sff_pio_task, reset the HSM state to
 *        HSM_ST_IDLE under ap->lock and forget the recorded PIO task
 *        link.
 */
void ata_sff_flush_pio_task(struct ata_port *ap)
{
        DPRINTK("ENTER\n");

        cancel_delayed_work_sync(&ap->sff_pio_task);

        /*
         * We want to reset the HSM state to IDLE.  If we do so without
         * grabbing the port lock, critical sections protected by it which
         * expect the HSM state to stay stable may get surprised.  For
         * example, we may set IDLE in between the time
         * __ata_sff_port_intr() checks for HSM_ST_IDLE and before it calls
         * ata_sff_hsm_move(), causing ata_sff_hsm_move() to hit its
         * invalid-state WARN().
         */
        spin_lock_irq(ap->lock);
        ap->hsm_task_state = HSM_ST_IDLE;
        spin_unlock_irq(ap->lock);

        /* NOTE(review): cleared outside ap->lock - confirm this is intended */
        ap->sff_pio_task_link = NULL;

        if (ata_msg_ctl(ap))
                ata_port_dbg(ap, "%s: EXIT\n", __func__);
}

/*
 * Work function behind ap->sff_pio_task.
 *
 * Looks up the active command on the link recorded in
 * ap->sff_pio_task_link, waits briefly for BSY to clear and then drives
 * the HSM with ata_sff_hsm_move() (in_wq = 1), looping for as long as
 * the HSM asks for another poll.  If the device stays busy the task
 * re-queues itself after ATA_SHORT_PAUSE instead of spinning.
 *
 * Runs with ap->lock held except while sleeping in ata_msleep().
 */
static void ata_sff_pio_task(struct work_struct *work)
{
        struct ata_port *ap =
                container_of(work, struct ata_port, sff_pio_task.work);
        /* sampled before the lock is taken; checked by BUG_ON() below */
        struct ata_link *link = ap->sff_pio_task_link;
        struct ata_queued_cmd *qc;
        u8 status;
        int poll_next;

        spin_lock_irq(ap->lock);

        BUG_ON(ap->sff_pio_task_link == NULL);
        /* qc can be NULL if timeout occurred */
        qc = ata_qc_from_tag(ap, link->active_tag);
        if (!qc) {
                ap->sff_pio_task_link = NULL;
                goto out_unlock;
        }

fsm_start:
        WARN_ON_ONCE(ap->hsm_task_state == HSM_ST_IDLE);

        /*
         * This is purely heuristic.  This is a fast path.
         * Sometimes when we enter, BSY will be cleared in
         * a chk-status or two.  If not, the drive is probably seeking
         * or something.  Snooze for a couple msecs, then
         * chk-status again.  If still busy, queue delayed work.
         */
        status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
        if (status & ATA_BUSY) {
                /* drop the lock across the sleep */
                spin_unlock_irq(ap->lock);
                ata_msleep(ap, 2);
                spin_lock_irq(ap->lock);

                status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
                if (status & ATA_BUSY) {
                        ata_sff_queue_pio_task(link, ATA_SHORT_PAUSE);
                        goto out_unlock;
                }
        }

        /*
         * hsm_move() may trigger another command to be processed.
         * clean the link beforehand.
         */
        ap->sff_pio_task_link = NULL;
        /* move the HSM */
        poll_next = ata_sff_hsm_move(ap, qc, status, 1);

        /* another command or interrupt handler
         * may be running at this point.
         */
        if (poll_next)
                goto fsm_start;
out_unlock:
        spin_unlock_irq(ap->lock);
}

/**
 *        ata_sff_qc_issue - issue taskfile to a SFF controller
 *        @qc: command to issue to device
 *
 *        This function issues a PIO or NODATA command to a SFF
 *        controller.  It selects the device, writes the taskfile,
 *        sets the initial HSM state for the protocol and, where the
 *        first step has to be driven by polling, queues the PIO task.
 *
 *        Protocols other than ATA_PROT_NODATA, ATA_PROT_PIO,
 *        ATAPI_PROT_PIO and ATAPI_PROT_NODATA are rejected with
 *        AC_ERR_SYSTEM; note the device has already been selected
 *        by then.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, AC_ERR_* mask on failure
 */
unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct ata_link *link = qc->dev->link;

        /* Use polling pio if the LLD doesn't handle
         * interrupt driven pio and atapi CDB interrupt.
         */
        if (ap->flags & ATA_FLAG_PIO_POLLING)
                qc->tf.flags |= ATA_TFLAG_POLLING;

        /* select the device */
        ata_dev_select(ap, qc->dev->devno, 1, 0);

        /* start the command */
        switch (qc->tf.protocol) {
        case ATA_PROT_NODATA:
                if (qc->tf.flags & ATA_TFLAG_POLLING)
                        ata_qc_set_polling(qc);

                ata_tf_to_host(ap, &qc->tf);
                /* no data phase: go straight to the final status check */
                ap->hsm_task_state = HSM_ST_LAST;

                if (qc->tf.flags & ATA_TFLAG_POLLING)
                        ata_sff_queue_pio_task(link, 0);

                break;

        case ATA_PROT_PIO:
                if (qc->tf.flags & ATA_TFLAG_POLLING)
                        ata_qc_set_polling(qc);

                ata_tf_to_host(ap, &qc->tf);

                if (qc->tf.flags & ATA_TFLAG_WRITE) {
                        /* PIO data out protocol */
                        ap->hsm_task_state = HSM_ST_FIRST;
                        ata_sff_queue_pio_task(link, 0);

                        /* always send first data block using the
                         * ata_sff_pio_task() codepath.
                         */
                } else {
                        /* PIO data in protocol */
                        ap->hsm_task_state = HSM_ST;

                        if (qc->tf.flags & ATA_TFLAG_POLLING)
                                ata_sff_queue_pio_task(link, 0);

                        /* if polling, ata_sff_pio_task() handles the
                         * rest.  otherwise, interrupt handler takes
                         * over from here.
                         */
                }

                break;

        case ATAPI_PROT_PIO:
        case ATAPI_PROT_NODATA:
                if (qc->tf.flags & ATA_TFLAG_POLLING)
                        ata_qc_set_polling(qc);

                ata_tf_to_host(ap, &qc->tf);

                /* the CDB still has to be sent */
                ap->hsm_task_state = HSM_ST_FIRST;

                /* send cdb by polling if no cdb interrupt */
                if ((!(qc->dev->flags & ATA_DFLAG_CDB_INTR)) ||
                    (qc->tf.flags & ATA_TFLAG_POLLING))
                        ata_sff_queue_pio_task(link, 0);
                break;

        default:
                /* protocol not handled by this function */
                return AC_ERR_SYSTEM;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_qc_issue);

/**
 *        ata_sff_qc_fill_rtf - fill result TF using ->sff_tf_read
 *        @qc: qc to fill result TF for
 *
 *        @qc is finished and result TF needs to be filled.  Fill it
 *        using ->sff_tf_read.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        true indicating that result TF is successfully filled.
 */
bool ata_sff_qc_fill_rtf(struct ata_queued_cmd *qc)
{
        qc->ap->ops->sff_tf_read(qc->ap, &qc->result_tf);
        return true;
}
EXPORT_SYMBOL_GPL(ata_sff_qc_fill_rtf);

/*
 * Account for an interrupt that arrived while the port was not
 * expecting one.
 *
 * Always bumps ap->stats.idle_irq.  When ATA_IRQ_TRAP is defined, every
 * 1000th idle interrupt additionally reads the status register, clears
 * the IRQ if the port provides ->sff_irq_clear, logs a warning and
 * reports the interrupt as handled (returns 1).  In all other cases
 * returns 0.
 */
static unsigned int ata_sff_idle_irq(struct ata_port *ap)
{
        ap->stats.idle_irq++;

#ifdef ATA_IRQ_TRAP
        if ((ap->stats.idle_irq % 1000) == 0) {
                ap->ops->sff_check_status(ap);
                if (ap->ops->sff_irq_clear)
                        ap->ops->sff_irq_clear(ap);
                ata_port_warn(ap, "irq trap\n");
                return 1;
        }
#endif
        return 0;        /* irq not handled */
}

/*
 * Common body of the SFF port interrupt handlers.
 *
 * @ap: port the interrupt is being handled for
 * @qc: command currently active on @ap
 * @hsmv_on_idle: when true, finding the device still BSY is recorded as
 *        an HSM violation and the HSM is moved to HSM_ST_ERR; when false
 *        it is treated as an idle (unexpected) interrupt instead.
 *
 * Returns 1 if the interrupt was handled, otherwise whatever
 * ata_sff_idle_irq() reports.
 */
static unsigned int __ata_sff_port_intr(struct ata_port *ap,
                                        struct ata_queued_cmd *qc,
                                        bool hsmv_on_idle)
{
        u8 status;

        VPRINTK("ata%u: protocol %d task_state %d\n",
                ap->print_id, qc->tf.protocol, ap->hsm_task_state);

        /* No interrupt is expected while the HSM is idle */
        if (ap->hsm_task_state == HSM_ST_IDLE)
                return ata_sff_idle_irq(ap);

        /* Some pre-ATAPI-4 devices assert INTRQ in HSM_ST_FIRST when
         * ready to receive the CDB.  Checking ATA_DFLAG_CDB_INTR is
         * enough here: the flag is only turned on for atapi devices,
         * so ata_is_atapi(qc->tf.protocol) need not be checked again.
         */
        if (ap->hsm_task_state == HSM_ST_FIRST &&
            !(qc->dev->flags & ATA_DFLAG_CDB_INTR))
                return ata_sff_idle_irq(ap);

        /* check main status, clearing INTRQ if needed */
        status = ata_sff_irq_status(ap);
        if (status & ATA_BUSY) {
                if (!hsmv_on_idle)
                        return ata_sff_idle_irq(ap);

                /* BMDMA engine is already stopped, we're screwed */
                qc->err_mask |= AC_ERR_HSM;
                ap->hsm_task_state = HSM_ST_ERR;
        }

        /* clear irq events */
        if (ap->ops->sff_irq_clear)
                ap->ops->sff_irq_clear(ap);

        ata_sff_hsm_move(ap, qc, status, 0);

        return 1;        /* irq handled */
}

/**
 *        ata_sff_port_intr - Handle SFF port interrupt
 *        @ap: Port on which interrupt arrived (possibly...)
 *        @qc: Taskfile currently active in engine
 *
 *        Handle port interrupt for given queued command.  This is
 *        __ata_sff_port_intr() with hsmv_on_idle set to false, i.e.
 *        a device that is still BSY is counted as an idle interrupt
 *        rather than as an HSM violation.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        One if interrupt was handled, zero if not (shared irq).
 */
unsigned int ata_sff_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
{
        return __ata_sff_port_intr(ap, qc, false);
}
EXPORT_SYMBOL_GPL(ata_sff_port_intr);

/*
 * Common body of the SFF host interrupt handlers.
 *
 * @irq: irq line (not used in the body)
 * @dev_instance: the struct ata_host the interrupt was registered for
 * @port_intr: per-port handler invoked for every port that has an
 *        active, non-polling command
 *
 * Takes host->lock for the whole operation.  If no port handled the
 * interrupt, ports providing ->sff_irq_check are probed for a pending
 * IRQ; idle ports get it cleared, and if a port with a command in
 * flight turns out to be no longer BSY the port scan is repeated once.
 *
 * Returns IRQ_RETVAL() of whether any port handled the interrupt.
 */
static inline irqreturn_t __ata_sff_interrupt(int irq, void *dev_instance,
        unsigned int (*port_intr)(struct ata_port *, struct ata_queued_cmd *))
{
        struct ata_host *host = dev_instance;
        bool retried = false;
        unsigned int i;
        /* handled: OR of port_intr() results; idle/polling: per-port bitmasks */
        unsigned int handled, idle, polling;
        unsigned long flags;

        /* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */
        spin_lock_irqsave(&host->lock, flags);

retry:
        handled = idle = polling = 0;
        for (i = 0; i < host->n_ports; i++) {
                struct ata_port *ap = host->ports[i];
                struct ata_queued_cmd *qc;

                qc = ata_qc_from_tag(ap, ap->link.active_tag);
                if (qc) {
                        if (!(qc->tf.flags & ATA_TFLAG_POLLING))
                                handled |= port_intr(ap, qc);
                        else
                                polling |= 1 << i;
                } else
                        idle |= 1 << i;
        }

        /*
         * If no port was expecting IRQ but the controller is actually
         * asserting IRQ line, nobody cared will ensue.  Check IRQ
         * pending status if available and clear spurious IRQ.
         */
        if (!handled && !retried) {
                /* local flag; unrelated to the "retry" label above */
                bool retry = false;

                for (i = 0; i < host->n_ports; i++) {
                        struct ata_port *ap = host->ports[i];

                        /* polled commands are left to the PIO task */
                        if (polling & (1 << i))
                                continue;

                        /* skip ports that cannot report, or do not have,
                         * a pending IRQ
                         */
                        if (!ap->ops->sff_irq_check ||
                            !ap->ops->sff_irq_check(ap))
                                continue;

                        if (idle & (1 << i)) {
                                ap->ops->sff_check_status(ap);
                                if (ap->ops->sff_irq_clear)
                                        ap->ops->sff_irq_clear(ap);
                        } else {
                                /* clear INTRQ and check if BUSY cleared */
                                if (!(ap->ops->sff_check_status(ap) & ATA_BUSY))
                                        retry |= true;
                                /*
                                 * With command in flight, we can't do
                                 * sff_irq_clear() w/o racing with completion.
                                 */
                        }
                }

                /* rescan the ports at most once */
                if (retry) {
                        retried = true;
                        goto retry;
                }
        }

        spin_unlock_irqrestore(&host->lock, flags);

        return IRQ_RETVAL(handled);
}

/**
 *        ata_sff_interrupt - Default SFF ATA host interrupt handler
 *        @irq: irq line (unused)
 *        @dev_instance: pointer to our ata_host information structure
 *
 *        Default interrupt handler for PCI IDE devices.  Calls
 *        ata_sff_port_intr() for each port that has an active command
 *        which is not being polled.
 *
 *        LOCKING:
 *        Obtains host lock during operation.
 *
 *        RETURNS:
 *        IRQ_NONE or IRQ_HANDLED.
 */
irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
{
        return __ata_sff_interrupt(irq, dev_instance, ata_sff_port_intr);
}
EXPORT_SYMBOL_GPL(ata_sff_interrupt);

/**
 *        ata_sff_lost_interrupt        -        Check for an apparent lost interrupt
 *        @ap: port that appears to have timed out
 *
 *        Called from the libata error handlers when the core code suspects
 *        an interrupt has been lost.  If one has, complete anything we can
 *        and then return.  The interface must support altstatus for this
 *        faster recovery to occur.
 *
 *        Locking:
 *        Caller holds host lock
 */

void ata_sff_lost_interrupt(struct ata_port *ap)
{
        struct ata_queued_cmd *qc;
        u8 status;

        /* Only one outstanding command per SFF channel */
        qc = ata_qc_from_tag(ap, ap->link.active_tag);

        /* An interrupt cannot be lost on a non-existent command ... */
        if (!qc)
                return;
        /* ... nor on a polled one */
        if (qc->tf.flags & ATA_TFLAG_POLLING)
                return;

        /* If the controller still reports busy, the command is simply
         * in progress rather than the victim of a lost IRQ.
         */
        status = ata_sff_altstatus(ap);
        if (status & ATA_BUSY)
                return;

        /* A command was running, the device is no longer busy and no
         * interrupt showed up.
         */
        ata_port_warn(ap, "lost interrupt (Status 0x%x)\n", status);

        /* Run the port interrupt logic as if the interrupt had arrived */
        ata_sff_port_intr(ap, qc);
}
EXPORT_SYMBOL_GPL(ata_sff_lost_interrupt);

/**
 *        ata_sff_freeze - Freeze SFF controller port
 *        @ap: port to freeze
 *
 *        Freeze SFF controller port: set ATA_NIEN in the cached
 *        device control value, write it out when the port has a way
 *        to do so, and then clear any interrupt left pending.
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_freeze(struct ata_port *ap)
{
        ap->ctl |= ATA_NIEN;
        ap->last_ctl = ap->ctl;

        /* only touch the hardware if there is a devctl hook or address */
        if (ap->ops->sff_set_devctl || ap->ioaddr.ctl_addr)
                ata_sff_set_devctl(ap, ap->ctl);

        /* Under certain circumstances, some controllers raise IRQ on
         * ATA_NIEN manipulation.  Also, many controllers fail to mask
         * previously pending IRQ on ATA_NIEN assertion.  Clear it.
         */
        ap->ops->sff_check_status(ap);

        if (ap->ops->sff_irq_clear)
                ap->ops->sff_irq_clear(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_freeze);

/**
 *        ata_sff_thaw - Thaw SFF controller port
 *        @ap: port to thaw
 *
 *        Thaw SFF controller port: read the status register, clear
 *        the IRQ if the port provides ->sff_irq_clear, then turn
 *        interrupts back on with ata_sff_irq_on().
 *
 *        LOCKING:
 *        Inherited from caller.
 */
void ata_sff_thaw(struct ata_port *ap)
{
        /* clear & re-enable interrupts */
        ap->ops->sff_check_status(ap);
        if (ap->ops->sff_irq_clear)
                ap->ops->sff_irq_clear(ap);
        ata_sff_irq_on(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_thaw);

/**
 *        ata_sff_prereset - prepare SFF link for reset
 *        @link: SFF link to be reset
 *        @deadline: deadline jiffies for the operation
 *
 *        SFF link @link is about to be reset.  Run ata_std_prereset()
 *        first; then, unless a hardreset is already planned or the
 *        link is offline, wait for the device to become ready.  If
 *        that wait fails with anything other than -ENODEV, a
 *        hardreset is requested.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_sff_prereset(struct ata_link *link, unsigned long deadline)
{
        struct ata_eh_context *ehc = &link->eh_context;
        int err;

        err = ata_std_prereset(link, deadline);
        if (err)
                return err;

        /* Nothing more to do if a hardreset is coming anyway, or if we
         * already know that no device is attached.
         */
        if ((ehc->i.action & ATA_EH_HARDRESET) || ata_link_offline(link))
                return 0;

        /* wait for !BSY */
        err = ata_sff_wait_ready(link, deadline);
        if (!err || err == -ENODEV)
                return 0;

        ata_link_warn(link,
                      "device not ready (errno=%d), forcing hardreset\n",
                      err);
        ehc->i.action |= ATA_EH_HARDRESET;

        return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_prereset);

/**
 *        ata_devchk - PATA device presence detection
 *        @ap: ATA channel to examine
 *        @device: Device to examine (starting at zero)
 *
 *        This technique was originally described in
 *        Hale Landis's ATADRVR (www.ata-atapi.com), and
 *        later found its way into the ATA/ATAPI spec.
 *
 *        Write a pattern to the ATA shadow registers,
 *        and if a device is present, it will respond by
 *        correctly storing and echoing back the
 *        ATA shadow register contents.
 *
 *        LOCKING:
 *        caller.
 *
 *        RETURNS:
 *        1 if the sector count / LBA low registers read back the last
 *        pattern written (0x55 / 0xaa), 0 otherwise.
 */
static unsigned int ata_devchk(struct ata_port *ap, unsigned int device)
{
        struct ata_ioports *ioaddr = &ap->ioaddr;
        u8 nsect, lbal;

        /* address the device under test before touching its registers */
        ap->ops->sff_dev_select(ap, device);

        /* write the pattern, its complement, then the pattern again */
        iowrite8(0x55, ioaddr->nsect_addr);
        iowrite8(0xaa, ioaddr->lbal_addr);

        iowrite8(0xaa, ioaddr->nsect_addr);
        iowrite8(0x55, ioaddr->lbal_addr);

        iowrite8(0x55, ioaddr->nsect_addr);
        iowrite8(0xaa, ioaddr->lbal_addr);

        /* read both registers back; only the final pattern counts */
        nsect = ioread8(ioaddr->nsect_addr);
        lbal = ioread8(ioaddr->lbal_addr);

        if ((nsect == 0x55) && (lbal == 0xaa))
                return 1;        /* we found a device */

        return 0;                /* nothing found */
}

/**
 *        ata_sff_dev_classify - Parse returned ATA device signature
 *        @dev: ATA device to classify
 *        @present: device seems present
 *        @r_err: Value of error register on completion (may be NULL)
 *
 *        After an event -- SRST, E.D.D., or SATA COMRESET -- occurs,
 *        an ATA/ATAPI-defined set of values is placed in the ATA
 *        shadow registers, indicating the results of device detection
 *        and diagnostics.
 *
 *        Select the ATA device, and read the values from the ATA shadow
 *        registers.  Then parse according to the Error register value,
 *        and the spec-defined values examined by ata_dev_classify().
 *
 *        LOCKING:
 *        caller.
 *
 *        RETURNS:
 *        The class reported by ata_dev_classify() for the signature, or
 *        %ATA_DEV_NONE when the diagnostic code or signature rules the
 *        device out.
 */
unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
                                  u8 *r_err)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_taskfile sig;
        unsigned int devclass;
        u8 diag;

        ap->ops->sff_dev_select(ap, dev->devno);

        /* fetch the signature; the diagnostic code is in the feature field */
        memset(&sig, 0, sizeof(sig));
        ap->ops->sff_tf_read(ap, &sig);

        diag = sig.feature;
        if (r_err)
                *r_err = diag;

        /* see if device passed diags: continue and warn later */
        switch (diag) {
        case 0:
                /* diagnostic fail : only record it for now */
                dev->horkage |= ATA_HORKAGE_DIAGNOSTIC;
                break;
        case 1:
                break;
        case 0x81:
                /* 0x81 is only accepted from device 0 */
                if (dev->devno == 0)
                        break;
                return ATA_DEV_NONE;
        default:
                return ATA_DEV_NONE;
        }

        /* determine if device is ATA or ATAPI */
        devclass = ata_dev_classify(&sig);

        if (devclass == ATA_DEV_ATA) {
                /* an ATA signature with an all-zero status is rejected */
                if (ap->ops->sff_check_status(ap) == 0)
                        devclass = ATA_DEV_NONE;
        } else if (devclass == ATA_DEV_UNKNOWN) {
                /*
                 * A device that failed its diagnostic is likely to have
                 * reported a bad signature as well.  If it seems present,
                 * assume it is an ATA device; otherwise give up on it.
                 */
                if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC))
                        devclass = ATA_DEV_ATA;
                else
                        devclass = ATA_DEV_NONE;
        }

        return devclass;
}
EXPORT_SYMBOL_GPL(ata_sff_dev_classify);

/**
 *        ata_sff_wait_after_reset - wait for devices to become ready after reset
 *        @link: SFF link which has just been reset
 *        @devmask: mask of present devices (bit 0: device 0, bit 1: device 1)
 *        @deadline: deadline jiffies for the operation
 *
 *        Wait for the devices attached to SFF @link to become ready
 *        after reset.  The function begins with a 150ms delay to avoid
 *        accessing the TF status register too early.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 on success, -ENODEV if some or all of devices in @devmask
 *        don't seem to exist.  -errno on other errors.
 */
int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask,
                             unsigned long deadline)
{
        struct ata_port *ap = link->ap;
        struct ata_ioports *ioaddr = &ap->ioaddr;
        unsigned int dev0 = devmask & (1 << 0);
        unsigned int dev1 = devmask & (1 << 1);
        int rc, ret = 0;

        /* mandatory settle time before the first register access */
        ata_msleep(ap, ATA_WAIT_AFTER_RESET);

        /* always check readiness of the master device */
        rc = ata_sff_wait_ready(link, deadline);
        /* -ENODEV means the odd clown forgot the D7 pulldown resistor
         * and TF status is 0xff, bail out on it too.
         */
        if (rc)
                return rc;

        /* if device 1 was found in ata_devchk, wait for register
         * access briefly, then wait for BSY to clear.
         */
        if (dev1) {
                int i;

                ap->ops->sff_dev_select(ap, 1);

                /* Wait for register access.  Some ATAPI devices fail
                 * to set nsect/lbal after reset, so don't waste too
                 * much time on it.  We're gonna wait for !BSY anyway.
                 */
                for (i = 0; i < 2; i++) {
                        u8 nsect, lbal;

                        nsect = ioread8(ioaddr->nsect_addr);
                        lbal = ioread8(ioaddr->lbal_addr);
                        if ((nsect == 1) && (lbal == 1))
                                break;
                        ata_msleep(ap, 50);        /* give drive a breather */
                }

                /* for device 1, -ENODEV is remembered rather than fatal */
                rc = ata_sff_wait_ready(link, deadline);
                if (rc) {
                        if (rc != -ENODEV)
                                return rc;
                        ret = rc;
                }
        }

        /* is all this really necessary? */
        ap->ops->sff_dev_select(ap, 0);
        if (dev1)
                ap->ops->sff_dev_select(ap, 1);
        if (dev0)
                ap->ops->sff_dev_select(ap, 0);

        return ret;
}
EXPORT_SYMBOL_GPL(ata_sff_wait_after_reset);

/*
 * Pulse ATA_SRST in the device control register of @ap (only when the
 * port has a control register address) and then wait for the devices
 * in @devmask to become ready, no later than @deadline.
 *
 * Returns whatever ata_sff_wait_after_reset() returns.
 */
static int ata_bus_softreset(struct ata_port *ap, unsigned int devmask,
                             unsigned long deadline)
{
        struct ata_ioports *ioaddr = &ap->ioaddr;

        DPRINTK("ata%u: bus reset via SRST\n", ap->print_id);

        if (ap->ioaddr.ctl_addr) {
                /* software reset.  causes dev0 to be selected */
                /* sequence: SRST clear -> SRST set -> SRST clear, 20us apart */
                iowrite8(ap->ctl, ioaddr->ctl_addr);
                udelay(20);        /* FIXME: flush */
                iowrite8(ap->ctl | ATA_SRST, ioaddr->ctl_addr);
                udelay(20);        /* FIXME: flush */
                iowrite8(ap->ctl, ioaddr->ctl_addr);
                /* keep the cached control value in step with the register */
                ap->last_ctl = ap->ctl;
        }

        /* wait for the port to become ready */
        return ata_sff_wait_after_reset(&ap->link, devmask, deadline);
}

/**
 *        ata_sff_softreset - reset host port via ATA SRST
 *        @link: ATA link to reset
 *        @classes: resulting classes of attached devices
 *        @deadline: deadline jiffies for the operation
 *
 *        Probe for device 0 (and device 1 when the port may have a
 *        slave), reset the bus using ATA SRST and classify whatever
 *        answered by its signature.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_sff_softreset(struct ata_link *link, unsigned int *classes,
                      unsigned long deadline)
{
        struct ata_port *ap = link->ap;
        unsigned int has_slave = ap->flags & ATA_FLAG_SLAVE_POSS;
        unsigned int found = 0;
        int rc;
        u8 err;

        DPRINTK("ENTER\n");

        /* probe device 0, and device 1 if the port can have one */
        if (ata_devchk(ap, 0))
                found |= (1 << 0);
        if (has_slave && ata_devchk(ap, 1))
                found |= (1 << 1);

        /* the probe may have left device 1 selected; go back to 0 */
        ap->ops->sff_dev_select(ap, 0);

        /* issue bus reset */
        DPRINTK("about to softreset, devmask=%x\n", found);
        rc = ata_bus_softreset(ap, found, deadline);
        if (rc) {
                /* -ENODEV is tolerated unless the link is known occupied */
                if (rc != -ENODEV || sata_scr_valid(link)) {
                        ata_link_err(link, "SRST failed (errno=%d)\n", rc);
                        return rc;
                }
        }

        /* determine by signature whether we have ATA or ATAPI devices */
        classes[0] = ata_sff_dev_classify(&link->device[0],
                                          found & (1 << 0), &err);

        /* device 1 is skipped when device 0 reported 0x81 */
        if (has_slave && err != 0x81)
                classes[1] = ata_sff_dev_classify(&link->device[1],
                                                  found & (1 << 1), &err);

        DPRINTK("EXIT, classes[0]=%u [1]=%u\n", classes[0], classes[1]);
        return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_softreset);

/**
 *        sata_sff_hardreset - reset host port via SATA phy reset
 *        @link: link to reset
 *        @class: resulting class of attached device
 *        @deadline: deadline jiffies for the operation
 *
 *        SATA phy-reset host port using DET bits of SControl register,
 *        wait for !BSY and classify the attached device.  @class is
 *        written only when the link is reported online.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
                       unsigned long deadline)
{
        bool online;
        int rc;

        /* debounce timing comes from the link's EH context */
        rc = sata_link_hardreset(link,
                                 sata_ehc_deb_timing(&link->eh_context),
                                 deadline, &online, ata_sff_check_ready);

        /* classify device 0 only if something is actually attached */
        if (online)
                *class = ata_sff_dev_classify(&link->device[0], 1, NULL);

        DPRINTK("EXIT, class=%u\n", *class);
        return rc;
}
EXPORT_SYMBOL_GPL(sata_sff_hardreset);

/**
 *        ata_sff_postreset - SFF postreset callback
 *        @link: the target SFF ata_link
 *        @classes: classes of attached devices
 *
 *        This function is invoked after a successful reset.  It first
 *        calls ata_std_postreset() and then performs SFF specific
 *        postreset processing: device re-selection and, if any device
 *        is present, reprogramming of the device control register.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_sff_postreset(struct ata_link *link, unsigned int *classes)
{
        struct ata_port *ap = link->ap;
        int have_dev0, have_dev1;

        ata_std_postreset(link, classes);

        have_dev0 = classes[0] != ATA_DEV_NONE;
        have_dev1 = classes[1] != ATA_DEV_NONE;

        /* is double-select really necessary? */
        if (have_dev0)
                ap->ops->sff_dev_select(ap, 1);
        if (have_dev1)
                ap->ops->sff_dev_select(ap, 0);

        /* bail out if no device is present */
        if (!have_dev0 && !have_dev1) {
                DPRINTK("EXIT, no device\n");
                return;
        }

        /* nothing to program without a devctl hook or control address */
        if (!ap->ops->sff_set_devctl && !ap->ioaddr.ctl_addr)
                return;

        /* set up device control */
        ata_sff_set_devctl(ap, ap->ctl);
        ap->last_ctl = ap->ctl;
}
EXPORT_SYMBOL_GPL(ata_sff_postreset);

/**
 *        ata_sff_drain_fifo - Stock FIFO drain logic for SFF controllers
 *        @qc: command (may be NULL)
 *
 *        Drain the FIFO and device of any stuck data following a command
 *        failing to complete. In some cases this is necessary before a
 *        reset will recover the device.  Nothing is done when @qc is
 *        NULL or its DMA direction is DMA_TO_DEVICE.
 */
void ata_sff_drain_fifo(struct ata_queued_cmd *qc)
{
        struct ata_port *ap;
        int drained = 0;

        /* We only need to flush incoming data when a command was running */
        if (!qc || qc->dma_dir == DMA_TO_DEVICE)
                return;

        ap = qc->ap;

        /*
         * Read 16 bits at a time for as long as DRQ stays set, giving
         * up on this recovery method after 64K of data.
         */
        while ((ap->ops->sff_check_status(ap) & ATA_DRQ) && drained < 65536) {
                ioread16(ap->ioaddr.data_addr);
                drained += 2;
        }

        /* report only if something was actually read */
        if (drained)
                ata_port_dbg(ap, "drained %d bytes to clear DRQ\n", drained);
}
EXPORT_SYMBOL_GPL(ata_sff_drain_fifo);

/**
 *        ata_sff_error_handler - Stock error handler for SFF controller
 *        @ap: port to handle error for
 *
 *        Stock error handler for SFF controller.  It can handle both
 *        PATA and SATA controllers.  Many controllers should be able to
 *        use this EH as-is or with some added handling before and
 *        after.
 *
 *        The FIFO is drained under the port lock first, then recovery
 *        is handed to ata_do_eh() with the port's reset methods.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_sff_error_handler(struct ata_port *ap)
{
        ata_reset_fn_t softreset = ap->ops->softreset;
        ata_reset_fn_t hardreset = ap->ops->hardreset;
        struct ata_queued_cmd *failed_qc;
        unsigned long irq_flags;
        int builtin_hardreset;

        /* only a command that is flagged as failed is of interest */
        failed_qc = __ata_qc_from_tag(ap, ap->link.active_tag);
        if (failed_qc && !(failed_qc->flags & ATA_QCFLAG_FAILED))
                failed_qc = NULL;

        /*
         * We *MUST* do FIFO draining before we issue a reset as
         * several devices helpfully clear their internal state and
         * will lock solid if we touch the data port post reset. Pass
         * the qc in case anyone wants to do different PIO/DMA recovery
         * or has per command fixups
         */
        spin_lock_irqsave(ap->lock, irq_flags);
        if (ap->ops->sff_drain_fifo)
                ap->ops->sff_drain_fifo(failed_qc);
        spin_unlock_irqrestore(ap->lock, irq_flags);

        /* ignore built-in hardresets if SCR access is not available */
        builtin_hardreset = hardreset == sata_std_hardreset ||
                            hardreset == sata_sff_hardreset;
        if (builtin_hardreset && !sata_scr_valid(&ap->link))
                hardreset = NULL;

        ata_do_eh(ap, ap->ops->prereset, softreset, hardreset,
                  ap->ops->postreset);
}
EXPORT_SYMBOL_GPL(ata_sff_error_handler);

/**
 *        ata_sff_std_ports - initialize ioaddr with standard port offsets.
 *        @ioaddr: IO address structure to be initialized
 *
 *        Fill in every taskfile register address of @ioaddr as
 *        cmd_addr plus the matching ATA_REG_* offset.  The fields
 *        written are data_addr, error_addr, feature_addr, nsect_addr,
 *        lbal_addr, lbam_addr, lbah_addr, device_addr, status_addr and
 *        command_addr; cmd_addr must already be set by the caller.
 *
 *        Does not set ctl_addr, altstatus_addr, bmdma_addr, or scr_addr.
 */
void ata_sff_std_ports(struct ata_ioports *ioaddr)
{
        /* data port */
        ioaddr->data_addr = ioaddr->cmd_addr + ATA_REG_DATA;

        /* error (read side) and feature (write side) */
        ioaddr->error_addr = ioaddr->cmd_addr + ATA_REG_ERR;
        ioaddr->feature_addr = ioaddr->cmd_addr + ATA_REG_FEATURE;

        /* sector count and the three LBA bytes */
        ioaddr->nsect_addr = ioaddr->cmd_addr + ATA_REG_NSECT;
        ioaddr->lbal_addr = ioaddr->cmd_addr + ATA_REG_LBAL;
        ioaddr->lbam_addr = ioaddr->cmd_addr + ATA_REG_LBAM;
        ioaddr->lbah_addr = ioaddr->cmd_addr + ATA_REG_LBAH;

        /* device select */
        ioaddr->device_addr = ioaddr->cmd_addr + ATA_REG_DEVICE;

        /* status (read side) and command (write side) */
        ioaddr->status_addr = ioaddr->cmd_addr + ATA_REG_STATUS;
        ioaddr->command_addr = ioaddr->cmd_addr + ATA_REG_CMD;
}
EXPORT_SYMBOL_GPL(ata_sff_std_ports);

#ifdef CONFIG_PCI

/*
 * Check that the PCI resources for channel @port of @pdev are enabled.
 * A channel uses the two consecutive BARs starting at @port * 2; both
 * must have a non-zero start and a non-zero length.
 *
 * Returns 1 when both BARs look usable, 0 otherwise.
 */
static int ata_resources_present(struct pci_dev *pdev, int port)
{
        int first_bar = port * 2;
        int bar;

        for (bar = first_bar; bar < first_bar + 2; bar++) {
                if (!pci_resource_start(pdev, bar))
                        return 0;
                if (!pci_resource_len(pdev, bar))
                        return 0;
        }

        return 1;
}

/**
 *        ata_pci_sff_init_host - acquire native PCI ATA resources and init host
 *        @host: target ATA host
 *
 *        Acquire native PCI ATA resources for @host and initialize the
 *        first two ports of @host accordingly.  Ports marked dummy are
 *        skipped; a port whose BARs are missing or cannot be
 *        requested/mapped is turned into a dummy port.
 *
 *        Note that native PCI resources are valid even for legacy hosts
 *        as we fix up pdev resources array early in boot, so this
 *        function can be used for both native and legacy SFF hosts.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 if at least one port is initialized, -ENODEV if no port is
 *        available.
 */
int ata_pci_sff_init_host(struct ata_host *host)
{
        struct device *gdev = host->dev;
        struct pci_dev *pdev = to_pci_dev(gdev);
        int nr_usable = 0;
        int port, rc;

        /* request, iomap BARs and init port addresses accordingly */
        for (port = 0; port < 2; port++) {
                struct ata_port *ap = host->ports[port];
                int bar = port * 2;        /* cmd BAR; ctl BAR is bar + 1 */
                void __iomem * const *iomap;

                if (ata_port_is_dummy(ap))
                        continue;

                /* Discard disabled ports.  Some controllers show
                 * their unused channels this way.  Disabled ports are
                 * made dummy.
                 */
                if (!ata_resources_present(pdev, port)) {
                        ap->ops = &ata_dummy_port_ops;
                        continue;
                }

                /* claim and map both BARs of this channel in one call */
                rc = pcim_iomap_regions(pdev, 0x3 << bar,
                                        dev_driver_string(gdev));
                if (rc) {
                        dev_warn(gdev,
                                 "failed to request/iomap BARs for port %d (errno=%d)\n",
                                 port, rc);
                        if (rc == -EBUSY)
                                pcim_pin_device(pdev);
                        ap->ops = &ata_dummy_port_ops;
                        continue;
                }

                host->iomap = pcim_iomap_table(pdev);
                iomap = host->iomap;

                ap->ioaddr.cmd_addr = iomap[bar];

                /* control and alternate status share one address */
                ap->ioaddr.ctl_addr = (void __iomem *)
                        ((unsigned long)iomap[bar + 1] | ATA_PCI_CTL_OFS);
                ap->ioaddr.altstatus_addr = ap->ioaddr.ctl_addr;

                ata_sff_std_ports(&ap->ioaddr);

                ata_port_desc(ap, "cmd 0x%llx ctl 0x%llx",
                        (unsigned long long)pci_resource_start(pdev, bar),
                        (unsigned long long)pci_resource_start(pdev, bar + 1));

                nr_usable++;
        }

        if (nr_usable == 0) {
                dev_err(gdev, "no available native port\n");
                return -ENODEV;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_init_host);

/**
 *        ata_pci_sff_prepare_host - helper to prepare PCI PIO-only SFF ATA host
 *        @pdev: target PCI device
 *        @ppi: array of port_info, must be enough for two ports
 *        @r_host: out argument for the initialized ATA host
 *
 *        Helper to allocate PIO-only SFF ATA host for @pdev, acquire
 *        all PCI resources and initialize it accordingly in one go.
 *        Everything is done inside a devres group, which is released
 *        again if any step fails.  @r_host is written only on success.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_pci_sff_prepare_host(struct pci_dev *pdev,
                             const struct ata_port_info * const *ppi,
                             struct ata_host **r_host)
{
        struct ata_host *host;
        int rc;

        if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL))
                return -ENOMEM;

        /* allocate a two-port host, then acquire its PCI resources */
        host = ata_host_alloc_pinfo(&pdev->dev, ppi, 2);
        if (host) {
                rc = ata_pci_sff_init_host(host);
        } else {
                dev_err(&pdev->dev, "failed to allocate ATA host\n");
                rc = -ENOMEM;
        }

        if (rc) {
                /* undo everything acquired since the group was opened */
                devres_release_group(&pdev->dev, NULL);
                return rc;
        }

        /* success: keep the resources, drop only the group marker */
        devres_remove_group(&pdev->dev, NULL);
        *r_host = host;
        return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_prepare_host);

/**
 *        ata_pci_sff_activate_host - start SFF host, request IRQ and register it
 *        @host: target SFF ATA host
 *        @irq_handler: irq_handler used when requesting IRQ(s)
 *        @sht: scsi_host_template to use when registering the host
 *
 *        This is the counterpart of ata_host_activate() for SFF ATA
 *        hosts.  This separate helper is necessary because SFF hosts
 *        use two separate interrupts in legacy mode.
 *
 *        IRQ requests and host registration happen inside a devres
 *        group that is released if any of them fails.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_pci_sff_activate_host(struct ata_host *host,
                              irq_handler_t irq_handler,
                              struct scsi_host_template *sht)
{
        struct device *dev = host->dev;
        struct pci_dev *pdev = to_pci_dev(dev);
        const char *drv_name = dev_driver_string(dev);
        int legacy_mode = 0, rc;

        rc = ata_host_start(host);
        if (rc)
                return rc;

        if ((pdev->class >> 8) == PCI_CLASS_STORAGE_IDE) {
                u8 prog_if, wanted = 0;

                /*
                 * ATA spec says we should use legacy mode when one
                 * port is in legacy mode, but disabled ports on some
                 * PCI hosts appear as fixed legacy ports, e.g SB600/700
                 * on which the secondary port is not wired, so
                 * ignore ports that are marked as 'dummy' during
                 * this check
                 */
                pci_read_config_byte(pdev, PCI_CLASS_PROG, &prog_if);

                /* bit 0 is checked for port 0, bit 2 for port 1 */
                if (!ata_port_is_dummy(host->ports[0]))
                        wanted |= (1 << 0);
                if (!ata_port_is_dummy(host->ports[1]))
                        wanted |= (1 << 2);

                /* any wanted bit that is clear means legacy mode */
                legacy_mode = (prog_if & wanted) != wanted;
        }

        if (!devres_open_group(dev, NULL, GFP_KERNEL))
                return -ENOMEM;

        if (legacy_mode) {
                /* one interrupt per non-dummy port */
                if (!ata_port_is_dummy(host->ports[0])) {
                        rc = devm_request_irq(dev, ATA_PRIMARY_IRQ(pdev),
                                              irq_handler, IRQF_SHARED,
                                              drv_name, host);
                        if (rc)
                                goto out;

                        ata_port_desc(host->ports[0], "irq %d",
                                      ATA_PRIMARY_IRQ(pdev));
                }

                if (!ata_port_is_dummy(host->ports[1])) {
                        rc = devm_request_irq(dev, ATA_SECONDARY_IRQ(pdev),
                                              irq_handler, IRQF_SHARED,
                                              drv_name, host);
                        if (rc)
                                goto out;

                        ata_port_desc(host->ports[1], "irq %d",
                                      ATA_SECONDARY_IRQ(pdev));
                }
        } else if (pdev->irq) {
                int port;

                /* a single interrupt serves both ports */
                rc = devm_request_irq(dev, pdev->irq, irq_handler,
                                      IRQF_SHARED, drv_name, host);
                if (rc)
                        goto out;

                for (port = 0; port < 2; port++) {
                        if (!ata_port_is_dummy(host->ports[port]))
                                ata_port_desc(host->ports[port], "irq %d",
                                              pdev->irq);
                }
        }

        rc = ata_host_register(host, sht);
out:
        /* keep the resources on success, release them all on failure */
        if (rc)
                devres_release_group(dev, NULL);
        else
                devres_remove_group(dev, NULL);

        return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_activate_host);

/*
 * Return the first of the (at most two) entries of @ppi whose port_ops
 * is not the dummy port operations.  The scan stops at the first NULL
 * entry.  Returns NULL when no such entry exists.
 */
static const struct ata_port_info *ata_sff_find_valid_pi(
                                        const struct ata_port_info * const *ppi)
{
        int idx = 0;

        while (idx < 2) {
                const struct ata_port_info *candidate = ppi[idx];

                if (!candidate)
                        break;
                if (candidate->port_ops != &ata_dummy_port_ops)
                        return candidate;
                idx++;
        }

        return NULL;
}

/*
 * Common probe path shared by the PIO-only and BMDMA init_one helpers.
 *
 * Enables @pdev, prepares a host from @ppi (BMDMA or plain SFF,
 * depending on @bmdma and CONFIG_ATA_BMDMA), stores @host_priv and
 * @hflags in it and activates it with @sht.  All of this runs inside a
 * devres group that is released again if any step fails.
 *
 * Returns 0 on success, -EINVAL if @ppi has no usable entry, -ENOMEM if
 * the devres group cannot be opened, or the error of the failing step.
 */
static int ata_pci_init_one(struct pci_dev *pdev,
                const struct ata_port_info * const *ppi,
                struct scsi_host_template *sht, void *host_priv,
                int hflags, bool bmdma)
{
        struct device *dev = &pdev->dev;
        const struct ata_port_info *pi;
        struct ata_host *host = NULL;
        int rc;

        DPRINTK("ENTER\n");

        /* refuse to go on when every port_info is dummy or missing */
        pi = ata_sff_find_valid_pi(ppi);
        if (!pi) {
                dev_err(&pdev->dev, "no valid port_info specified\n");
                return -EINVAL;
        }

        if (!devres_open_group(dev, NULL, GFP_KERNEL))
                return -ENOMEM;

        rc = pcim_enable_device(pdev);
        if (rc)
                goto out;

        /*
         * Note the dangling "else" inside the #ifdef: without
         * CONFIG_ATA_BMDMA only the SFF call below is compiled.
         */
#ifdef CONFIG_ATA_BMDMA
        if (bmdma)
                /* prepare and activate BMDMA host */
                rc = ata_pci_bmdma_prepare_host(pdev, ppi, &host);
        else
#endif
                /* prepare and activate SFF host */
                rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
        if (rc)
                goto out;
        host->private_data = host_priv;
        host->flags |= hflags;

        /* same #ifdef/else construction as above, for activation */
#ifdef CONFIG_ATA_BMDMA
        if (bmdma) {
                pci_set_master(pdev);
                rc = ata_pci_sff_activate_host(host, ata_bmdma_interrupt, sht);
        } else
#endif
                rc = ata_pci_sff_activate_host(host, ata_sff_interrupt, sht);
out:
        /* keep the resources on success, release them all on failure */
        if (rc == 0)
                devres_remove_group(&pdev->dev, NULL);
        else
                devres_release_group(&pdev->dev, NULL);

        return rc;
}

/**
 *        ata_pci_sff_init_one - Initialize/register PIO-only PCI IDE controller
 *        @pdev: Controller to be initialized
 *        @ppi: array of port_info, must be enough for two ports
 *        @sht: scsi_host_template to use when registering the host
 *        @host_priv: host private_data
 *        @hflag: host flags
 *
 *        This is a helper function which can be called from a driver's
 *        xxx_init_one() probe function if the hardware uses traditional
 *        IDE taskfile registers and is PIO only.  It is a thin wrapper
 *        around ata_pci_init_one() with BMDMA disabled.
 *
 *        ASSUMPTION:
 *        Nobody makes a single channel controller that appears solely as
 *        the secondary legacy port on PCI.
 *
 *        LOCKING:
 *        Inherited from PCI layer (may sleep).
 *
 *        RETURNS:
 *        Zero on success, negative errno-based value on error.
 */
int ata_pci_sff_init_one(struct pci_dev *pdev,
                 const struct ata_port_info * const *ppi,
                 struct scsi_host_template *sht, void *host_priv, int hflag)
{
        int rc;

        /* the trailing 0 is the bmdma argument: PIO only */
        rc = ata_pci_init_one(pdev, ppi, sht, host_priv, hflag, 0);

        return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_init_one);

#endif /* CONFIG_PCI */

/*
 *        BMDMA support
 */

#ifdef CONFIG_ATA_BMDMA

/*
 * Port operations for BMDMA controllers.
 *
 * NOTE(review): .inherits presumably makes every hook not listed here
 * fall back to ata_sff_port_ops -- confirm against the libata core.
 */
const struct ata_port_operations ata_bmdma_port_ops = {
        .inherits                = &ata_sff_port_ops,

        /* error handling */
        .error_handler                = ata_bmdma_error_handler,
        .post_internal_cmd        = ata_bmdma_post_internal_cmd,

        /* command preparation and issue */
        .qc_prep                = ata_bmdma_qc_prep,
        .qc_issue                = ata_bmdma_qc_issue,

        /* interrupt clearing and BMDMA engine control */
        .sff_irq_clear                = ata_bmdma_irq_clear,
        .bmdma_setup                = ata_bmdma_setup,
        .bmdma_start                = ata_bmdma_start,
        .bmdma_stop                = ata_bmdma_stop,
        .bmdma_status                = ata_bmdma_status,

        .port_start                = ata_bmdma_port_start,
};
EXPORT_SYMBOL_GPL(ata_bmdma_port_ops);

/*
 * Variant of ata_bmdma_port_ops that replaces the data transfer hook
 * with ata_sff_data_xfer32 and the port start hook with
 * ata_bmdma_port_start32.
 */
const struct ata_port_operations ata_bmdma32_port_ops = {
        .inherits                = &ata_bmdma_port_ops,

        .sff_data_xfer                = ata_sff_data_xfer32,
        .port_start                = ata_bmdma_port_start32,
};
EXPORT_SYMBOL_GPL(ata_bmdma32_port_ops);

/**
 *        ata_bmdma_fill_sg - Fill PCI IDE PRD table
 *        @qc: Metadata associated with taskfile to be transferred
 *
 *        Fill PCI IDE PRD (scatter-gather) table with segments
 *        associated with the current disk command.  A segment that
 *        crosses a 64K boundary is split at that boundary, and the
 *        last entry written is flagged with ATA_PRD_EOT.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static void ata_bmdma_fill_sg(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct ata_bmdma_prd *prd = ap->bmdma_prd;
        struct scatterlist *sg;
        unsigned int seg, n_prd = 0;

        for_each_sg(qc->sg, sg, qc->n_elem, seg) {
                /* Note h/w doesn't support 64-bit, so we unconditionally
                 * truncate dma_addr_t to u32.
                 */
                u32 addr = (u32) sg_dma_address(sg);
                u32 remaining = sg_dma_len(sg);

                while (remaining) {
                        u32 offset = addr & 0xffff;
                        u32 chunk = remaining;

                        /* stop this entry at the next 64K boundary */
                        if ((offset + remaining) > 0x10000)
                                chunk = 0x10000 - offset;

                        /* a length field of 0 stands for a full 64K */
                        prd[n_prd].addr = cpu_to_le32(addr);
                        prd[n_prd].flags_len = cpu_to_le32(chunk & 0xffff);
                        n_prd++;

                        addr += chunk;
                        remaining -= chunk;
                }
        }

        /* mark end of table; relies on at least one entry having been written */
        prd[n_prd - 1].flags_len |= cpu_to_le32(ATA_PRD_EOT);
}

/**
 *        ata_bmdma_fill_sg_dumb - Fill PCI IDE PRD table
 *        @qc: Metadata associated with taskfile to be transferred
 *
 *        Fill PCI IDE PRD (scatter-gather) table with segments
 *        associated with the current disk command. Perform the fill
 *        so that we avoid writing any length 64K records for
 *        controllers that don't follow the spec: a full 64K chunk is
 *        emitted as two consecutive 32K entries instead.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static void ata_bmdma_fill_sg_dumb(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        struct ata_bmdma_prd *prd = ap->bmdma_prd;
        struct scatterlist *sg;
        unsigned int seg, n_prd = 0;

        for_each_sg(qc->sg, sg, qc->n_elem, seg) {
                /* Note h/w doesn't support 64-bit, so we unconditionally
                 * truncate dma_addr_t to u32.
                 */
                u32 addr = (u32) sg_dma_address(sg);
                u32 remaining = sg_dma_len(sg);

                while (remaining) {
                        u32 offset = addr & 0xffff;
                        u32 chunk = remaining;
                        u32 blen;

                        /* stop this entry at the next 64K boundary */
                        if ((offset + remaining) > 0x10000)
                                chunk = 0x10000 - offset;

                        blen = chunk & 0xffff;
                        prd[n_prd].addr = cpu_to_le32(addr);

                        if (blen != 0) {
                                prd[n_prd].flags_len = cpu_to_le32(blen);
                                n_prd++;
                        } else {
                                /* Some PATA chipsets like the CS5530 can't
                                 * cope with 0x0000 meaning 64K as the spec
                                 * says, so emit two 32K halves instead.
                                 */
                                prd[n_prd].flags_len = cpu_to_le32(0x8000);
                                n_prd++;

                                prd[n_prd].addr = cpu_to_le32(addr + 0x8000);
                                prd[n_prd].flags_len = cpu_to_le32(0x8000);
                                n_prd++;
                        }

                        addr += chunk;
                        remaining -= chunk;
                }
        }

        /* mark end of table; relies on at least one entry having been written */
        prd[n_prd - 1].flags_len |= cpu_to_le32(ATA_PRD_EOT);
}

/**
 *        ata_bmdma_qc_prep - Prepare taskfile for submission
 *        @qc: Metadata associated with taskfile to be prepared
 *
 *        Prepare ATA taskfile for submission: build the PRD table with
 *        ata_bmdma_fill_sg() when @qc has ATA_QCFLAG_DMAMAP set.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Always AC_ERR_OK.
 */
enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc)
{
        /* only DMA-mapped commands need a PRD table */
        if (qc->flags & ATA_QCFLAG_DMAMAP)
                ata_bmdma_fill_sg(qc);

        return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_bmdma_qc_prep);

/**
 *        ata_bmdma_dumb_qc_prep - Prepare taskfile for submission
 *        @qc: Metadata associated with taskfile to be prepared
 *
 *        Same as ata_bmdma_qc_prep() but builds the PRD table with
 *        ata_bmdma_fill_sg_dumb(), which never emits a single 64K
 *        entry.  Commands without ATA_QCFLAG_DMAMAP are left alone.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Always AC_ERR_OK.
 */
enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc)
{
        /* only DMA-mapped commands need a PRD table */
        if (qc->flags & ATA_QCFLAG_DMAMAP)
                ata_bmdma_fill_sg_dumb(qc);

        return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_bmdma_dumb_qc_prep);

/**
 *        ata_bmdma_qc_issue - issue taskfile to a BMDMA controller
 *        @qc: command to issue to device
 *
 *        Issue @qc to a SFF/BMDMA controller.  Commands whose protocol
 *        is not a DMA protocol (PIO, NODATA) are handed straight to
 *        ata_sff_qc_issue().  ATA DMA commands are loaded, set up and
 *        started here; ATAPI DMA commands are loaded and set up, and
 *        the CDB is sent by the PIO task when the device does not
 *        raise a CDB interrupt.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, AC_ERR_* mask on failure
 */
unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc)
{
        struct ata_port *port = qc->ap;
        struct ata_link *link = qc->dev->link;

        /* non-DMA protocols are handled entirely by the SFF layer */
        if (!ata_is_dma(qc->tf.protocol))
                return ata_sff_qc_issue(qc);

        /* select the device before touching its registers */
        ata_dev_select(port, qc->dev->devno, 1, 0);

        if (qc->tf.protocol == ATA_PROT_DMA) {
                WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING);

                /* load taskfile, program the engine, then kick it off */
                port->ops->sff_tf_load(port, &qc->tf);
                port->ops->bmdma_setup(qc);
                port->ops->bmdma_start(qc);
                port->hsm_task_state = HSM_ST_LAST;
        } else if (qc->tf.protocol == ATAPI_PROT_DMA) {
                WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING);

                /* load taskfile and program the engine; it is not started here */
                port->ops->sff_tf_load(port, &qc->tf);
                port->ops->bmdma_setup(qc);
                port->hsm_task_state = HSM_ST_FIRST;

                /* without a CDB interrupt the CDB has to be sent by polling */
                if (!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
                        ata_sff_queue_pio_task(link, 0);
        } else {
                /* a DMA protocol this function does not know how to issue */
                WARN_ON(1);
                return AC_ERR_SYSTEM;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ata_bmdma_qc_issue);

/**
 *        ata_bmdma_port_intr - Handle BMDMA port interrupt
 *        @ap: Port on which interrupt arrived (possibly...)
 *        @qc: Taskfile currently active in engine
 *
 *        Handle port interrupt for given queued command.  When the
 *        port is in HSM_ST_LAST for a DMA command, the BMDMA status is
 *        checked first: if the engine did not raise the interrupt the
 *        event is treated as idle, otherwise the engine is stopped and
 *        a DMA error is recorded on @qc before the generic SFF handler
 *        runs.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        One if interrupt was handled, zero if not (shared irq).
 */
unsigned int ata_bmdma_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
{
        struct ata_eh_info *ehi = &ap->link.eh_info;
        u8 host_stat = 0;
        bool bmdma_stopped = false;
        unsigned int handled;

        if (ap->hsm_task_state == HSM_ST_LAST && ata_is_dma(qc->tf.protocol)) {
                /* check status of DMA engine */
                host_stat = ap->ops->bmdma_status(ap);
                VPRINTK("ata%u: host_stat 0x%X\n", ap->print_id, host_stat);

                /* if it's not our irq... */
                if (!(host_stat & ATA_DMA_INTR))
                        return ata_sff_idle_irq(ap);

                /* before we do anything else, clear DMA-Start bit */
                ap->ops->bmdma_stop(qc);
                bmdma_stopped = true;

                if (unlikely(host_stat & ATA_DMA_ERR)) {
                        /* error when transferring data to/from memory */
                        qc->err_mask |= AC_ERR_HOST_BUS;
                        ap->hsm_task_state = HSM_ST_ERR;
                }
        }

        /* hand over to the common SFF handler, telling it whether the
         * DMA engine has already been stopped above
         */
        handled = __ata_sff_port_intr(ap, qc, bmdma_stopped);

        /* on a failed DMA command, log the BMDMA status for EH; host_stat
         * is still 0 here if the status register was not read above
         */
        if (unlikely(qc->err_mask) && ata_is_dma(qc->tf.protocol))
                ata_ehi_push_desc(ehi, "BMDMA stat 0x%x", host_stat);

        return handled;
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_intr);

/**
 *        ata_bmdma_interrupt - Default BMDMA ATA host interrupt handler
 *        @irq: irq line (unused)
 *        @dev_instance: pointer to our ata_host information structure
 *
 *        Default interrupt handler for PCI IDE devices.  Delegates to
 *        the common SFF interrupt code with ata_bmdma_port_intr() as
 *        the per-port handler, which is called for each port that is
 *        not disabled.
 *
 *        LOCKING:
 *        Obtains host lock during operation.
 *
 *        RETURNS:
 *        IRQ_NONE or IRQ_HANDLED.
 */
irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance)
{
        irqreturn_t ret;

        ret = __ata_sff_interrupt(irq, dev_instance, ata_bmdma_port_intr);

        return ret;
}
EXPORT_SYMBOL_GPL(ata_bmdma_interrupt);

/**
 *        ata_bmdma_error_handler - Stock error handler for BMDMA controller
 *        @ap: port to handle error for
 *
 *        Stock error handler for BMDMA controller.  It can handle both
 *        PATA and SATA controllers.  Most BMDMA controllers should be
 *        able to use this EH as-is or with some added handling before
 *        and after.
 *
 *        For a failed DMA command the BMDMA engine is stopped under the
 *        port lock, and a timeout that coincides with ATA_DMA_ERR is
 *        reclassified as a host bus error.  The remaining work is done
 *        by ata_sff_error_handler().
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_bmdma_error_handler(struct ata_port *ap)
{
        struct ata_queued_cmd *qc;
        unsigned long flags;
        bool thaw = false;

        /* only a command that is marked failed is of interest here */
        qc = __ata_qc_from_tag(ap, ap->link.active_tag);
        if (qc && !(qc->flags & ATA_QCFLAG_FAILED))
                qc = NULL;

        /* reset PIO HSM and stop DMA engine */
        spin_lock_irqsave(ap->lock, flags);

        if (qc && ata_is_dma(qc->tf.protocol)) {
                u8 host_stat;

                host_stat = ap->ops->bmdma_status(ap);

                /* BMDMA controllers indicate host bus error by
                 * setting DMA_ERR bit and timing out.  As it wasn't
                 * really a timeout event, adjust error mask and
                 * cancel frozen state.
                 */
                if (qc->err_mask == AC_ERR_TIMEOUT && (host_stat & ATA_DMA_ERR)) {
                        qc->err_mask = AC_ERR_HOST_BUS;
                        thaw = true;
                }

                ap->ops->bmdma_stop(qc);

                /* if we're gonna thaw, make sure IRQ is clear */
                if (thaw) {
                        ap->ops->sff_check_status(ap);
                        /* sff_irq_clear is optional, hence the check */
                        if (ap->ops->sff_irq_clear)
                                ap->ops->sff_irq_clear(ap);
                }
        }

        spin_unlock_irqrestore(ap->lock, flags);

        /* thawing is done only after the port lock has been dropped */
        if (thaw)
                ata_eh_thaw_port(ap);

        ata_sff_error_handler(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_error_handler);

/**
 *        ata_bmdma_post_internal_cmd - Stock post_internal_cmd for BMDMA
 *        @qc: internal command to clean up
 *
 *        Stops the BMDMA engine under the port lock if @qc used a DMA
 *        protocol; does nothing for other protocols.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep)
 */
void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc)
{
        struct ata_port *port = qc->ap;
        unsigned long irq_flags;

        /* nothing to stop unless the command went through the DMA engine */
        if (!ata_is_dma(qc->tf.protocol))
                return;

        spin_lock_irqsave(port->lock, irq_flags);
        port->ops->bmdma_stop(qc);
        spin_unlock_irqrestore(port->lock, irq_flags);
}
EXPORT_SYMBOL_GPL(ata_bmdma_post_internal_cmd);

/**
 *        ata_bmdma_irq_clear - Clear PCI IDE BMDMA interrupt.
 *        @ap: Port associated with this ATA transaction.
 *
 *        Clear interrupt and error flags in DMA status register.
 *
 *        May be used as the irq_clear() entry in ata_port_operations.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_bmdma_irq_clear(struct ata_port *ap)
{
        void __iomem *mmio = ap->ioaddr.bmdma_addr;

        if (!mmio)
                return;

        iowrite8(ioread8(mmio + ATA_DMA_STATUS), mmio + ATA_DMA_STATUS);
}
EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);

/**
 *        ata_bmdma_setup - Set up PCI IDE BMDMA transaction
 *        @qc: Info associated with this ATA transaction.
 *
 *        Programs the PRD table address and the transfer direction
 *        into the BMDMA registers with the start bit cleared, then
 *        issues the taskfile command via ->sff_exec_command().  The
 *        engine itself is not started here.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_bmdma_setup(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        unsigned int rw = (qc->tf.flags & ATA_TFLAG_WRITE);
        u8 dmactl;

        /* load PRD table addr. */
        mb();        /* make sure PRD table writes are visible to controller */
        iowrite32(ap->bmdma_prd_dma, ap->ioaddr.bmdma_addr + ATA_DMA_TABLE_OFS);

        /* specify data direction, triple-check start bit is clear */
        dmactl = ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
        dmactl &= ~(ATA_DMA_WR | ATA_DMA_START);
        /* ATA_DMA_WR is set when the taskfile is NOT a write.
         * NOTE(review): presumably the bit describes the bus master
         * writing to memory (i.e. a device read) - confirm against the
         * BMDMA register definition.
         */
        if (!rw)
                dmactl |= ATA_DMA_WR;
        iowrite8(dmactl, ap->ioaddr.bmdma_addr + ATA_DMA_CMD);

        /* issue r/w command */
        ap->ops->sff_exec_command(ap, &qc->tf);
}
EXPORT_SYMBOL_GPL(ata_bmdma_setup);

/**
 *        ata_bmdma_start - Start a PCI IDE BMDMA transaction
 *        @qc: Info associated with this ATA transaction.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_bmdma_start(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        u8 dmactl;

        /* start host DMA transaction */
        dmactl = ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
        iowrite8(dmactl | ATA_DMA_START, ap->ioaddr.bmdma_addr + ATA_DMA_CMD);

        /* Strictly, one may wish to issue an ioread8() here, to
         * flush the mmio write.  However, control also passes
         * to the hardware at this point, and it will interrupt
         * us when we are to resume control.  So, in effect,
         * we don't care when the mmio write flushes.
         * Further, a read of the DMA status register _immediately_
         * following the write may not be what certain flaky hardware
         * is expected, so I think it is best to not add a readb()
         * without first all the MMIO ATA cards/mobos.
         * Or maybe I'm just being paranoid.
         *
         * FIXME: The posting of this write means I/O starts are
         * unnecessarily delayed for MMIO
         */
}
EXPORT_SYMBOL_GPL(ata_bmdma_start);

/**
 *        ata_bmdma_stop - Stop PCI IDE BMDMA transfer
 *        @qc: Command we are ending DMA for
 *
 *        Clears the ATA_DMA_START flag in the dma control register
 *
 *        May be used as the bmdma_stop() entry in ata_port_operations.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_bmdma_stop(struct ata_queued_cmd *qc)
{
        struct ata_port *ap = qc->ap;
        void __iomem *mmio = ap->ioaddr.bmdma_addr;

        /* clear start/stop bit */
        iowrite8(ioread8(mmio + ATA_DMA_CMD) & ~ATA_DMA_START,
                 mmio + ATA_DMA_CMD);

        /* one-PIO-cycle guaranteed wait, per spec, for HDMA1:0 transition */
        ata_sff_dma_pause(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_stop);

/**
 *        ata_bmdma_status - Read PCI IDE BMDMA status
 *        @ap: Port associated with this ATA transaction.
 *
 *        Read and return BMDMA status register.
 *
 *        May be used as the bmdma_status() entry in ata_port_operations.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
u8 ata_bmdma_status(struct ata_port *ap)
{
        return ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_STATUS);
}
EXPORT_SYMBOL_GPL(ata_bmdma_status);


/**
 *        ata_bmdma_port_start - Set port up for bmdma.
 *        @ap: Port to initialize
 *
 *        Called just after data structures for each port are
 *        initialized.  Allocates space for PRD table when the port
 *        advertises any MWDMA or UDMA mode; otherwise nothing is
 *        allocated.
 *
 *        May be used as the port_start() entry in ata_port_operations.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        0 on success, -ENOMEM if the PRD table could not be allocated.
 */
int ata_bmdma_port_start(struct ata_port *ap)
{
        /* no DMA modes, so no PRD table is needed */
        if (!ap->mwdma_mask && !ap->udma_mask)
                return 0;

        ap->bmdma_prd = dmam_alloc_coherent(ap->host->dev, ATA_PRD_TBL_SZ,
                                            &ap->bmdma_prd_dma, GFP_KERNEL);

        return ap->bmdma_prd ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_start);

/**
 *        ata_bmdma_port_start32 - Set port up for dma.
 *        @ap: Port to initialize
 *
 *        Called just after data structures for each port are
 *        initialized.  Enables 32bit PIO and allocates space for PRD
 *        table.
 *
 *        May be used as the port_start() entry in ata_port_operations for
 *        devices that are capable of 32bit PIO.
 *
 *        LOCKING:
 *        Inherited from caller.
 *
 *        RETURNS:
 *        Whatever ata_bmdma_port_start() returns.
 */
int ata_bmdma_port_start32(struct ata_port *ap)
{
        /* flag 32bit PIO, and flag PIO32CHANGE alongside it */
        ap->pflags |= ATA_PFLAG_PIO32;
        ap->pflags |= ATA_PFLAG_PIO32CHANGE;

        return ata_bmdma_port_start(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_start32);

#ifdef CONFIG_PCI

/**
 *        ata_pci_bmdma_clear_simplex -        attempt to kick device out of simplex
 *        @pdev: PCI device
 *
 *        Some PCI ATA devices report simplex mode but in fact can be told to
 *        enter non simplex mode. This implements the necessary logic to
 *        perform the task on such devices. Calling it on other devices will
 *        have -undefined- behaviour.
 *
 *        RETURNS:
 *        0 if bit 7 of the register at BAR4 + 2 reads back clear,
 *        -ENOENT if BAR4 is unset, -EOPNOTSUPP if the bit is still set.
 */
int ata_pci_bmdma_clear_simplex(struct pci_dev *pdev)
{
        unsigned long bmdma = pci_resource_start(pdev, 4);
        unsigned long status_port;
        u8 simplex;

        if (!bmdma)
                return -ENOENT;

        status_port = bmdma + 0x02;

        /* write the register back with only bits 5 and 6 preserved */
        simplex = inb(status_port);
        outb(simplex & 0x60, status_port);

        /* re-read to see whether bit 7 actually went away */
        simplex = inb(status_port);

        return (simplex & 0x80) ? -EOPNOTSUPP : 0;
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_clear_simplex);

static void ata_bmdma_nodma(struct ata_host *host, const char *reason)
{
        int i;

        dev_err(host->dev, "BMDMA: %s, falling back to PIO\n", reason);

        for (i = 0; i < 2; i++) {
                host->ports[i]->mwdma_mask = 0;
                host->ports[i]->udma_mask = 0;
        }
}

/**
 *        ata_pci_bmdma_init - acquire PCI BMDMA resources and init ATA host
 *        @host: target ATA host
 *
 *        Acquire PCI BMDMA resources and initialize @host accordingly.
 *        On any failure the DMA mode masks are cleared through
 *        ata_bmdma_nodma() rather than an error being returned.  On
 *        success each non-dummy port gets its bmdma_addr set to an
 *        8-byte slice of BAR4.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 */
void ata_pci_bmdma_init(struct ata_host *host)
{
        struct device *gdev = host->dev;
        struct pci_dev *pdev = to_pci_dev(gdev);
        int i, rc;

        /* No BAR4 allocation: No DMA */
        if (pci_resource_start(pdev, 4) == 0) {
                ata_bmdma_nodma(host, "BAR4 is zero");
                return;
        }

        /*
         * Some controllers require BMDMA region to be initialized
         * even if DMA is not in use to clear IRQ status via
         * ->sff_irq_clear method.  Try to initialize bmdma_addr
         * regardless of dma masks.
         *
         * That is why a failure to set the DMA mask below only
         * disables the DMA modes and does not return early.
         */
        rc = dma_set_mask_and_coherent(&pdev->dev, ATA_DMA_MASK);
        if (rc)
                ata_bmdma_nodma(host, "failed to set dma mask");

        /* request and iomap DMA region */
        rc = pcim_iomap_regions(pdev, 1 << 4, dev_driver_string(gdev));
        if (rc) {
                ata_bmdma_nodma(host, "failed to request/iomap BAR4");
                return;
        }
        host->iomap = pcim_iomap_table(pdev);

        /* port i uses the 8 bytes of BAR4 starting at offset 8 * i */
        for (i = 0; i < 2; i++) {
                struct ata_port *ap = host->ports[i];
                void __iomem *bmdma = host->iomap[4] + 8 * i;

                if (ata_port_is_dummy(ap))
                        continue;

                ap->ioaddr.bmdma_addr = bmdma;
                /* bit 7 of the register at offset 2 marks the host as
                 * simplex unless the port asks for that to be ignored
                 */
                if ((!(ap->flags & ATA_FLAG_IGN_SIMPLEX)) &&
                    (ioread8(bmdma + 2) & 0x80))
                        host->flags |= ATA_HOST_SIMPLEX;

                ata_port_desc(ap, "bmdma 0x%llx",
                    (unsigned long long)pci_resource_start(pdev, 4) + 8 * i);
        }
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_init);

/**
 *        ata_pci_bmdma_prepare_host - helper to prepare PCI BMDMA ATA host
 *        @pdev: target PCI device
 *        @ppi: array of port_info, must be enough for two ports
 *        @r_host: out argument for the initialized ATA host
 *
 *        Helper to allocate BMDMA ATA host for @pdev, acquire all PCI
 *        resources and initialize it accordingly in one go: the SFF
 *        host is prepared first and, if that succeeds, BMDMA is
 *        initialized on the result.
 *
 *        LOCKING:
 *        Inherited from calling layer (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno otherwise.
 */
int ata_pci_bmdma_prepare_host(struct pci_dev *pdev,
                               const struct ata_port_info * const * ppi,
                               struct ata_host **r_host)
{
        int rc = ata_pci_sff_prepare_host(pdev, ppi, r_host);

        /* BMDMA setup only makes sense once the SFF host exists */
        if (!rc)
                ata_pci_bmdma_init(*r_host);

        return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_prepare_host);

/**
 *        ata_pci_bmdma_init_one - Initialize/register BMDMA PCI IDE controller
 *        @pdev: Controller to be initialized
 *        @ppi: array of port_info, must be enough for two ports
 *        @sht: scsi_host_template to use when registering the host
 *        @host_priv: host private_data
 *        @hflags: host flags
 *
 *        This function is similar to ata_pci_sff_init_one() but also
 *        takes care of BMDMA initialization.
 *
 *        LOCKING:
 *        Inherited from PCI layer (may sleep).
 *
 *        RETURNS:
 *        Zero on success, negative on errno-based value on error.
 */
int ata_pci_bmdma_init_one(struct pci_dev *pdev,
                           const struct ata_port_info * const * ppi,
                           struct scsi_host_template *sht, void *host_priv,
                           int hflags)
{
        int rc;

        /* NOTE(review): the trailing 1 presumably selects the BMDMA
         * variant of the shared init helper - confirm against
         * ata_pci_init_one().
         */
        rc = ata_pci_init_one(pdev, ppi, sht, host_priv, hflags, 1);

        return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_init_one);

#endif /* CONFIG_PCI */
#endif /* CONFIG_ATA_BMDMA */

/**
 *        ata_sff_port_init - Initialize SFF/BMDMA ATA port
 *        @ap: Port to initialize
 *
 *        Called on port allocation to initialize SFF/BMDMA specific
 *        fields: the cached device control values and the delayed
 *        work item that runs ata_sff_pio_task().
 *
 *        LOCKING:
 *        None.
 */
void ata_sff_port_init(struct ata_port *ap)
{
        /* cached device control register values */
        ap->last_ctl = 0xFF;
        ap->ctl = ATA_DEVCTL_OBS;

        /* work item that runs ata_sff_pio_task() */
        INIT_DELAYED_WORK(&ap->sff_pio_task, ata_sff_pio_task);
}

/*
 * Create the "ata_sff" workqueue.  Returns 0 on success or -ENOMEM if the
 * workqueue could not be allocated.
 */
int __init ata_sff_init(void)
{
        ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);

        return ata_sff_wq ? 0 : -ENOMEM;
}

/* Destroy the workqueue stored in ata_sff_wq. */
void ata_sff_exit(void)
{
        destroy_workqueue(ata_sff_wq);
}











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H
#define _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H

#ifndef __LITTLE_ENDIAN
#define __LITTLE_ENDIAN 1234
#endif
#ifndef __LITTLE_ENDIAN_BITFIELD
#define __LITTLE_ENDIAN_BITFIELD
#endif

#include <linux/types.h>
#include <linux/swab.h>

/*
 * Value conversions for a little-endian CPU.
 *
 * Little-endian forms are plain casts, because CPU order already is
 * little-endian.  Big-endian and network-order forms go through a byte
 * swap.  The __constant_* variants use the ___constant_swab*() forms; the
 * plain variants use __swab*().
 */
#define __constant_htonl(x) ((__force __be32)___constant_swab32((x)))
#define __constant_ntohl(x) ___constant_swab32((__force __be32)(x))
#define __constant_htons(x) ((__force __be16)___constant_swab16((x)))
#define __constant_ntohs(x) ___constant_swab16((__force __be16)(x))
#define __constant_cpu_to_le64(x) ((__force __le64)(__u64)(x))
#define __constant_le64_to_cpu(x) ((__force __u64)(__le64)(x))
#define __constant_cpu_to_le32(x) ((__force __le32)(__u32)(x))
#define __constant_le32_to_cpu(x) ((__force __u32)(__le32)(x))
#define __constant_cpu_to_le16(x) ((__force __le16)(__u16)(x))
#define __constant_le16_to_cpu(x) ((__force __u16)(__le16)(x))
#define __constant_cpu_to_be64(x) ((__force __be64)___constant_swab64((x)))
#define __constant_be64_to_cpu(x) ___constant_swab64((__force __u64)(__be64)(x))
#define __constant_cpu_to_be32(x) ((__force __be32)___constant_swab32((x)))
#define __constant_be32_to_cpu(x) ___constant_swab32((__force __u32)(__be32)(x))
#define __constant_cpu_to_be16(x) ((__force __be16)___constant_swab16((x)))
#define __constant_be16_to_cpu(x) ___constant_swab16((__force __u16)(__be16)(x))
#define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
#define __le64_to_cpu(x) ((__force __u64)(__le64)(x))
#define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
#define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
#define __cpu_to_le16(x) ((__force __le16)(__u16)(x))
#define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
#define __cpu_to_be64(x) ((__force __be64)__swab64((x)))
#define __be64_to_cpu(x) __swab64((__force __u64)(__be64)(x))
#define __cpu_to_be32(x) ((__force __be32)__swab32((x)))
#define __be32_to_cpu(x) __swab32((__force __u32)(__be32)(x))
#define __cpu_to_be16(x) ((__force __be16)__swab16((x)))
#define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))

/* Read the CPU-order 64-bit value at @p and retype it as little-endian. */
static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
{
        const __u64 cpu_val = *p;

        return (__force __le64)cpu_val;
}
/* Read the little-endian 64-bit value at @p and retype it as CPU order. */
static __always_inline __u64 __le64_to_cpup(const __le64 *p)
{
        const __le64 le_val = *p;

        return (__force __u64)le_val;
}
/* Read the CPU-order 32-bit value at @p and retype it as little-endian. */
static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
{
        const __u32 cpu_val = *p;

        return (__force __le32)cpu_val;
}
/* Read the little-endian 32-bit value at @p and retype it as CPU order. */
static __always_inline __u32 __le32_to_cpup(const __le32 *p)
{
        const __le32 le_val = *p;

        return (__force __u32)le_val;
}
/* Read the CPU-order 16-bit value at @p and retype it as little-endian. */
static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
{
        const __u16 cpu_val = *p;

        return (__force __le16)cpu_val;
}
/* Read the little-endian 16-bit value at @p and retype it as CPU order. */
static __always_inline __u16 __le16_to_cpup(const __le16 *p)
{
        const __le16 le_val = *p;

        return (__force __u16)le_val;
}
/* Byte-swap the CPU-order 64-bit value at @p into big-endian. */
static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
{
        const __u64 swapped = __swab64p(p);

        return (__force __be64)swapped;
}
/* Byte-swap the big-endian 64-bit value at @p into CPU order. */
static __always_inline __u64 __be64_to_cpup(const __be64 *p)
{
        __u64 *raw = (__u64 *)p;

        return __swab64p(raw);
}
/* Byte-swap the CPU-order 32-bit value at @p into big-endian. */
static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
{
        const __u32 swapped = __swab32p(p);

        return (__force __be32)swapped;
}
/* Byte-swap the big-endian 32-bit value at @p into CPU order. */
static __always_inline __u32 __be32_to_cpup(const __be32 *p)
{
        __u32 *raw = (__u32 *)p;

        return __swab32p(raw);
}
/* Byte-swap the CPU-order 16-bit value at @p into big-endian. */
static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
{
        const __u16 swapped = __swab16p(p);

        return (__force __be16)swapped;
}
/* Byte-swap the big-endian 16-bit value at @p into CPU order. */
static __always_inline __u16 __be16_to_cpup(const __be16 *p)
{
        __u16 *raw = (__u16 *)p;

        return __swab16p(raw);
}
/*
 * In-place conversions.  The little-endian forms only evaluate their
 * argument and change nothing; the big-endian forms swap in place via
 * __swab*s().
 */
#define __cpu_to_le64s(x) do { (void)(x); } while (0)
#define __le64_to_cpus(x) do { (void)(x); } while (0)
#define __cpu_to_le32s(x) do { (void)(x); } while (0)
#define __le32_to_cpus(x) do { (void)(x); } while (0)
#define __cpu_to_le16s(x) do { (void)(x); } while (0)
#define __le16_to_cpus(x) do { (void)(x); } while (0)
#define __cpu_to_be64s(x) __swab64s((x))
#define __be64_to_cpus(x) __swab64s((x))
#define __cpu_to_be32s(x) __swab32s((x))
#define __be32_to_cpus(x) __swab32s((x))
#define __cpu_to_be16s(x) __swab16s((x))
#define __be16_to_cpus(x) __swab16s((x))


#endif /* _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H */


















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/* SPDX-License-Identifier: GPL-2.0 */
/* Based on net/mac80211/trace.h */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mac802154

#if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __MAC802154_DRIVER_TRACE

#include <linux/tracepoint.h>

#include <net/mac802154.h>
#include "ieee802154_i.h"

/*
 * Helpers shared by the events below.
 *
 * LOCAL_*: record the PHY name in a fixed MAXNAME-byte array.
 * LOCAL_ASSIGN expects a variable named "local" to be in scope and copies
 * wpan_phy_name(local->hw.phy), bounded by MAXNAME.
 */
#define MAXNAME                32
#define LOCAL_ENTRY        __array(char, wpan_phy_name, MAXNAME)
#define LOCAL_ASSIGN        strlcpy(__entry->wpan_phy_name, \
                                wpan_phy_name(local->hw.phy), MAXNAME)
#define LOCAL_PR_FMT        "%s"
#define LOCAL_PR_ARG        __entry->wpan_phy_name

/*
 * CCA_*: record the mode and opt members of a CCA descriptor.  CCA_ASSIGN
 * expects a pointer named "cca" to be in scope.
 */
#define CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
                  __field(enum nl802154_cca_opts, cca_opt)
#define CCA_ASSIGN \
        do {                                     \
                (__entry->cca_mode) = cca->mode; \
                (__entry->cca_opt) = cca->opt;   \
        } while (0)
#define CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
#define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt

/*
 * Map a truth value to the string "true" or "false".  The whole expansion
 * is parenthesised so that an operator next to the macro call applies to
 * the result and not to part of the conditional expression.
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

/* Tracing for driver callbacks */

/* Event class for events that record and print nothing but the PHY name. */
DECLARE_EVENT_CLASS(local_only_evt4,
        TP_PROTO(struct ieee802154_local *local),
        TP_ARGS(local),
        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
        ),
        TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
);

/* PHY-name-only event; by its name, marks a driver callback returning void. */
DEFINE_EVENT(local_only_evt4, 802154_drv_return_void,
        TP_PROTO(struct ieee802154_local *local),
        TP_ARGS(local)
);

/* Records the PHY name and an int return value, printed in decimal. */
TRACE_EVENT(802154_drv_return_int,
        TP_PROTO(struct ieee802154_local *local, int ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG,
                  __entry->ret)
);

/* PHY-name-only event; presumably tied to the driver start callback. */
DEFINE_EVENT(local_only_evt4, 802154_drv_start,
        TP_PROTO(struct ieee802154_local *local),
        TP_ARGS(local)
);

/* PHY-name-only event; presumably tied to the driver stop callback. */
DEFINE_EVENT(local_only_evt4, 802154_drv_stop,
        TP_PROTO(struct ieee802154_local *local),
        TP_ARGS(local)
);

/* Records the PHY name plus the u8 page and channel, printed in decimal. */
TRACE_EVENT(802154_drv_set_channel,
        TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel),
        TP_ARGS(local, page, channel),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u8, page)
                __field(u8, channel)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->page = page;
                __entry->channel = channel;
        ),
        TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG,
                  __entry->page, __entry->channel)
);

/* Records the PHY name plus the mode and opt members of @cca. */
TRACE_EVENT(802154_drv_set_cca_mode,
        TP_PROTO(struct ieee802154_local *local,
                 const struct wpan_phy_cca *cca),
        TP_ARGS(local, cca),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                CCA_ENTRY
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                CCA_ASSIGN;
        ),
        TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, LOCAL_PR_ARG,
                  CCA_PR_ARG)
);

/* Records the PHY name and the s32 @mbm value, printed as "ed level". */
TRACE_EVENT(802154_drv_set_cca_ed_level,
        TP_PROTO(struct ieee802154_local *local, s32 mbm),
        TP_ARGS(local, mbm),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(s32, mbm)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->mbm = mbm;
        ),
        TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG,
                  __entry->mbm)
);

/*
 * Records the PHY name and the s32 @power value.  Note that the output
 * labels the value "mbm", not "power".
 */
TRACE_EVENT(802154_drv_set_tx_power,
        TP_PROTO(struct ieee802154_local *local, s32 power),
        TP_ARGS(local, power),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(s32, power)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->power = power;
        ),
        TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG,
                 __entry->power)
);

/* Records the PHY name and the bool @mode, printed as "true" or "false". */
TRACE_EVENT(802154_drv_set_lbt_mode,
        TP_PROTO(struct ieee802154_local *local, bool mode),
        TP_ARGS(local, mode),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, mode)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->mode = mode;
        ),
        TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG,
                  BOOL_TO_STR(__entry->mode))
);

/*
 * Records the PHY name and the short address.  The address is stored as
 * __le16 and only converted to CPU order when printed.
 */
TRACE_EVENT(802154_drv_set_short_addr,
        TP_PROTO(struct ieee802154_local *local, __le16 short_addr),
        TP_ARGS(local, short_addr),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(__le16, short_addr)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->short_addr = short_addr;
        ),
        TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG,
                  le16_to_cpu(__entry->short_addr))
);

/*
 * Records the PHY name and the PAN id.  The id is stored as __le16 and
 * only converted to CPU order when printed.
 */
TRACE_EVENT(802154_drv_set_pan_id,
        TP_PROTO(struct ieee802154_local *local, __le16 pan_id),
        TP_ARGS(local, pan_id),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(__le16, pan_id)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->pan_id = pan_id;
        ),
        TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG,
                  le16_to_cpu(__entry->pan_id))
);

/*
 * Records the PHY name and the extended address.  The address is stored
 * as __le64 and only converted to CPU order when printed.
 */
TRACE_EVENT(802154_drv_set_extended_addr,
        TP_PROTO(struct ieee802154_local *local, __le64 extended_addr),
        TP_ARGS(local, extended_addr),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(__le64, extended_addr)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->extended_addr = extended_addr;
        ),
        TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG,
                  le64_to_cpu(__entry->extended_addr))
);

/* Records the PHY name and the bool @is_coord, printed as "true" or "false". */
TRACE_EVENT(802154_drv_set_pan_coord,
        TP_PROTO(struct ieee802154_local *local, bool is_coord),
        TP_ARGS(local, is_coord),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, is_coord)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->is_coord = is_coord;
        ),
        TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG,
                  BOOL_TO_STR(__entry->is_coord))
);

/*
 * Records the PHY name plus the three u8 CSMA parameters (min_be, max_be,
 * max_csma_backoffs), all printed in decimal.
 */
TRACE_EVENT(802154_drv_set_csma_params,
        TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be,
                 u8 max_csma_backoffs),
        TP_ARGS(local, min_be, max_be, max_csma_backoffs),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u8, min_be)
                __field(u8, max_be)
                __field(u8, max_csma_backoffs)
        ),
        TP_fast_assign(
                /* terminated with ';' like every other event in this file */
                LOCAL_ASSIGN;
                __entry->min_be = min_be;
                __entry->max_be = max_be;
                __entry->max_csma_backoffs = max_csma_backoffs;
        ),
        TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d",
                  LOCAL_PR_ARG, __entry->min_be, __entry->max_be,
                  __entry->max_csma_backoffs)
);

/* Records the PHY name and the s8 @max_frame_retries, printed in decimal. */
TRACE_EVENT(802154_drv_set_max_frame_retries,
        TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries),
        TP_ARGS(local, max_frame_retries),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(s8, max_frame_retries)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->max_frame_retries = max_frame_retries;
        ),
        TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG,
                  __entry->max_frame_retries)
);

/* Records the PHY name and the bool @on, printed as "true" or "false". */
TRACE_EVENT(802154_drv_set_promiscuous_mode,
        TP_PROTO(struct ieee802154_local *local, bool on),
        TP_ARGS(local, on),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, on)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->on = on;
        ),
        TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG,
                  BOOL_TO_STR(__entry->on))
);

#endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H
#define _LINUX_UNALIGNED_PACKED_STRUCT_H

#include <linux/kernel.h>

/*
 * Single-member wrapper structs, one per integer width. Each one is marked
 * __packed; the accessors in this header reach the value through these
 * types instead of dereferencing a plain u16/u32/u64 pointer.
 */
struct __una_u16 { u16 x; } __packed;
struct __una_u32 { u32 x; } __packed;
struct __una_u64 { u64 x; } __packed;

/*
 * Read a u16 from @p by viewing the memory as a __packed struct __una_u16.
 * The value is returned exactly as stored; no byte swapping happens here.
 */
static inline u16 __get_unaligned_cpu16(const void *p)
{
        return ((const struct __una_u16 *)p)->x;
}

/*
 * Read a u32 from @p by viewing the memory as a __packed struct __una_u32.
 * The value is returned exactly as stored; no byte swapping happens here.
 */
static inline u32 __get_unaligned_cpu32(const void *p)
{
        return ((const struct __una_u32 *)p)->x;
}

/*
 * Read a u64 from @p by viewing the memory as a __packed struct __una_u64.
 * The value is returned exactly as stored; no byte swapping happens here.
 */
static inline u64 __get_unaligned_cpu64(const void *p)
{
        return ((const struct __una_u64 *)p)->x;
}

/*
 * Store @val at @p by viewing the destination as a __packed
 * struct __una_u16. The value is written as-is, without byte swapping.
 */
static inline void __put_unaligned_cpu16(u16 val, void *p)
{
        ((struct __una_u16 *)p)->x = val;
}

/*
 * Store @val at @p by viewing the destination as a __packed
 * struct __una_u32. The value is written as-is, without byte swapping.
 */
static inline void __put_unaligned_cpu32(u32 val, void *p)
{
        ((struct __una_u32 *)p)->x = val;
}

/*
 * Store @val at @p by viewing the destination as a __packed
 * struct __una_u64. The value is written as-is, without byte swapping.
 */
static inline void __put_unaligned_cpu64(u64 val, void *p)
{
        ((struct __una_u64 *)p)->x = val;
}

#endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */





























































































































































































































































    1 


    1 









    1 













































































































































    1 
    1 


    1 






    1 


    1 












    1 

    1 
    1 
















    1 
    1 







































    1 






    1 




    1 
































































    1 











    1 

































































    1 


    1 








    1 






































































































































































































































































































































































































































































































































































































































































































































    1 

    1 







    1 





    1 
    1 







    1 






    1 













    1 

    1 






    1 













    1 






    1 



    1 


































































    1 

    1 








    1 


    1 























































































































































































































































































































































































































































































































































































































































































































































    1 






    1 







    1 


    1 








    1 



    1 


    1 


    1 










    1 

    1 


    1 


















    1 

    1 









    1 













    1 









    1 






    1 












    1 


    1 








    1 







    1 











    1 








    1 


    1 


















    1 


    1 



    1 

    1 

    1 


    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET                An implementation of the SOCKET network access protocol.
 *
 * Version:        @(#)socket.c        1.1.93        18/02/95
 *
 * Authors:        Orest Zborowski, <obz@Kodak.COM>
 *                Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *                Anonymous        :        NOTSOCK/BADF cleanup. Error fix in
 *                                        shutdown()
 *                Alan Cox        :        verify_area() fixes
 *                Alan Cox        :        Removed DDI
 *                Jonathan Kamens        :        SOCK_DGRAM reconnect bug
 *                Alan Cox        :        Moved a load of checks to the very
 *                                        top level.
 *                Alan Cox        :        Move address structures to/from user
 *                                        mode above the protocol layers.
 *                Rob Janssen        :        Allow 0 length sends.
 *                Alan Cox        :        Asynchronous I/O support (cribbed from the
 *                                        tty drivers).
 *                Niibe Yutaka        :        Asynchronous I/O for writes (4.4BSD style)
 *                Jeff Uphoff        :        Made max number of sockets command-line
 *                                        configurable.
 *                Matti Aarnio        :        Made the number of sockets dynamic,
 *                                        to be allocated when needed, and mr.
 *                                        Uphoff's max is used as max to be
 *                                        allowed to allocate.
 *                Linus                :        Argh. removed all the socket allocation
 *                                        altogether: it's in the inode now.
 *                Alan Cox        :        Made sock_alloc()/sock_release() public
 *                                        for NetROM and future kernel nfsd type
 *                                        stuff.
 *                Alan Cox        :        sendmsg/recvmsg basics.
 *                Tom Dyas        :        Export net symbols.
 *                Marcin Dalecki        :        Fixed problems with CONFIG_NET="n".
 *                Alan Cox        :        Added thread locking to sys_* calls
 *                                        for sockets. May have errors at the
 *                                        moment.
 *                Kevin Buhr        :        Fixed the dumb errors in the above.
 *                Andi Kleen        :        Some small cleanups, optimizations,
 *                                        and fixed a copy_from_user() bug.
 *                Tigran Aivazian        :        sys_send(args) calls sys_sendto(args, NULL, 0)
 *                Tigran Aivazian        :        Made listen(2) backlog sanity checks
 *                                        protocol-independent
 *
 *        This module is effectively the top level interface to the BSD socket
 *        paradigm.
 *
 *        Based upon Swansea University Computer Society NET3.039
 */

#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/thread_info.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/if_bridge.h>
#include <linux/if_frad.h>
#include <linux/if_vlan.h>
#include <linux/ptp_classify.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
#include <linux/audit.h>
#include <linux/wireless.h>
#include <linux/nsproxy.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/nospec.h>
#include <linux/indirect_call_wrapper.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include <net/compat.h>
#include <net/wext.h>
#include <net/cls_cgroup.h>

#include <net/sock.h>
#include <linux/netfilter.h>

#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <net/busy_poll.h>
#include <linux/errqueue.h>

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll tunables; they only exist when CONFIG_NET_RX_BUSY_POLL is set.
 * NOTE(review): the sysctl_ prefix suggests they are exported as sysctl
 * knobs, but the registration is not in this part of the file - confirm.
 */
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif

/*
 * Prototypes for the static handlers that socket_file_ops points at.
 * compat_sock_ioctl() is only declared when CONFIG_COMPAT is enabled.
 */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);

static int sock_close(struct inode *inode, struct file *file);
static __poll_t sock_poll(struct file *file,
                              struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
                              unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
                             int offset, size_t size, loff_t *ppos, int more);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags);

#ifdef CONFIG_PROC_FS
/*
 * fdinfo hook for socket files: takes the socket from @f->private_data and
 * forwards to its ops->show_fdinfo() when that callback is set; otherwise
 * nothing is written to @m.
 */
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct socket *sock = f->private_data;

        if (!sock->ops->show_fdinfo)
                return;

        sock->ops->show_fdinfo(m, sock);
}
#else
#define sock_show_fdinfo NULL
#endif

/*
 *        Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *        in the operation structures but are done directly via the socketcall() multiplexor.
 */

/*
 * file_operations table for sockets: routes the generic file entry points
 * to the sock_* handlers declared earlier in this file.
 */
static const struct file_operations socket_file_ops = {
        .owner =        THIS_MODULE,
        .llseek =        no_llseek,
        .read_iter =        sock_read_iter,
        .write_iter =        sock_write_iter,
        .poll =                sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        /* Only wired up when CONFIG_COMPAT is enabled. */
        .compat_ioctl = compat_sock_ioctl,
#endif
        .mmap =                sock_mmap,
        .release =        sock_close,
        .fasync =        sock_fasync,
        .sendpage =        sock_sendpage,
        .splice_write = generic_splice_sendpage,
        .splice_read =        sock_splice_read,
        /* Expands to NULL when CONFIG_PROC_FS is off (see the #define above). */
        .show_fdinfo =        sock_show_fdinfo,
};

/*
 *        The protocol list. Each protocol is registered in here.
 */

/*
 * NOTE(review): presumably taken around updates of net_families[]; the
 * users are outside this part of the file - confirm.
 */
static DEFINE_SPINLOCK(net_family_lock);
/* One __rcu-annotated net_proto_family pointer per slot, NPROTO slots. */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *        move_addr_to_kernel        -        copy a socket address into kernel space
 *        @uaddr: Address in user space
 *        @ulen: Length in user space
 *        @kaddr: Address in kernel space
 *
 *        Copies @ulen bytes from @uaddr into @kaddr. A negative @ulen, or
 *        one larger than struct sockaddr_storage, yields -EINVAL. A zero
 *        @ulen returns 0 without touching @kaddr. If the copy from user
 *        space fails, -EFAULT is returned. Otherwise the return value is
 *        whatever audit_sockaddr() reports for the copied address.
 */

int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
        /* The address must fit into a sockaddr_storage. */
        if (ulen < 0)
                return -EINVAL;
        if (ulen > sizeof(struct sockaddr_storage))
                return -EINVAL;

        /* An empty address is accepted as-is; nothing is copied. */
        if (!ulen)
                return 0;

        if (copy_from_user(kaddr, uaddr, ulen) != 0)
                return -EFAULT;

        return audit_sockaddr(ulen, kaddr);
}

/**
 *        move_addr_to_user        -        copy an address to user space
 *        @kaddr: kernel space address
 *        @klen: length of address in kernel
 *        @uaddr: user space address
 *        @ulen: pointer to user length field
 *
 *        On entry the value at @ulen is the size of the user buffer. At
 *        most that many bytes, and never more than @klen, are copied to
 *        @uaddr, so a buffer longer than the address is simply not filled
 *        completely. Afterwards the value at @ulen is overwritten with
 *        @klen, the length before truncation.
 *
 *        Returns the get_user() error if the length cannot be read,
 *        -EINVAL if the resulting copy length is negative, -ENOMEM if
 *        audit_sockaddr() fails, -EFAULT if the copy to @uaddr fails, and
 *        otherwise the result of __put_user() on @ulen.
 */

static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
                             void __user *uaddr, int __user *ulen)
{
        int copy_len;
        int rc;

        BUG_ON(klen > sizeof(struct sockaddr_storage));

        rc = get_user(copy_len, ulen);
        if (rc)
                return rc;

        /* Never copy out more than the kernel address holds. */
        if (copy_len > klen)
                copy_len = klen;
        if (copy_len < 0)
                return -EINVAL;

        /* A zero copy length skips both audit_sockaddr() and the copy. */
        if (copy_len && audit_sockaddr(klen, kaddr))
                return -ENOMEM;
        if (copy_len && copy_to_user(uaddr, kaddr, copy_len))
                return -EFAULT;

        /*
         *      "fromlen shall refer to the value before truncation.."
         *                      1003.1g
         */
        return __put_user(klen, ulen);
}

/*
 * Slab cache from which struct socket_alloc objects are allocated
 * (see sock_alloc_inode() / sock_free_inode()); created by init_inodecache().
 */
static struct kmem_cache *sock_inode_cachep __ro_after_init;

static struct inode *sock_alloc_inode(struct super_block *sb)
{
        struct socket_alloc *ei;

        ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
        init_waitqueue_head(&ei->socket.wq.wait);
        ei->socket.wq.fasync_list = NULL;
        ei->socket.wq.flags = 0;

        ei->socket.state = SS_UNCONNECTED;
        ei->socket.flags = 0;
        ei->socket.ops = NULL;
        ei->socket.sk = NULL;
        ei->socket.file = NULL;

        return &ei->vfs_inode;
}

/*
 * Return the socket_alloc that embeds @inode to the socket inode cache.
 */
static void sock_free_inode(struct inode *inode)
{
        kmem_cache_free(sock_inode_cachep,
                        container_of(inode, struct socket_alloc, vfs_inode));
}

/*
 * Callback handed to kmem_cache_create() in init_inodecache(): performs the
 * one-time initialisation of the inode embedded in a socket_alloc object.
 */
static void init_once(void *foo)
{
        struct socket_alloc *ei = foo;

        inode_init_once(&ei->vfs_inode);
}

/*
 * Create the slab cache used for socket_alloc objects. Failure to create
 * the cache triggers BUG_ON().
 */
static void init_inodecache(void)
{
        sock_inode_cachep =
                kmem_cache_create("sock_inode_cache",
                                  sizeof(struct socket_alloc), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
                                  SLAB_MEM_SPREAD | SLAB_ACCOUNT,
                                  init_once);

        BUG_ON(!sock_inode_cachep);
}

/*
 * Superblock operations for sockfs: inodes are allocated and freed through
 * the socket inode cache helpers above; statfs uses simple_statfs().
 */
static const struct super_operations sockfs_ops = {
        .alloc_inode        = sock_alloc_inode,
        .free_inode        = sock_free_inode,
        .statfs                = simple_statfs,
};

/*
 * sockfs_dname() is called from d_path().
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        /* The name is synthesised from the inode number of the socket. */
        unsigned long ino = d_inode(dentry)->i_ino;

        return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", ino);
}

/* Dentry operations for sockfs; only the d_dname hook is provided. */
static const struct dentry_operations sockfs_dentry_operations = {
        .d_dname  = sockfs_dname,
};

/*
 * xattr "get" handler: the attribute value is the dentry name including its
 * terminating NUL. With a NULL @value only the required size is reported;
 * otherwise -ERANGE is returned when @size is too small.
 */
static int sockfs_xattr_get(const struct xattr_handler *handler,
                            struct dentry *dentry, struct inode *inode,
                            const char *suffix, void *value, size_t size)
{
        /* Bytes needed for the name plus its NUL terminator. */
        const unsigned int name_size = dentry->d_name.len + 1;

        if (!value)
                return name_size;

        if (name_size > size)
                return -ERANGE;

        memcpy(value, dentry->d_name.name, name_size);
        return name_size;
}

/* Suffix and full name (under XATTR_SYSTEM_PREFIX) of the sockfs xattr. */
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
/* Length of the full xattr name, excluding the terminating NUL. */
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)

/* Handler for the xattr named above; only a .get callback is supplied. */
static const struct xattr_handler sockfs_xattr_handler = {
        .name = XATTR_NAME_SOCKPROTONAME,
        .get = sockfs_xattr_get,
};

/*
 * "set" callback for the security.* xattr prefix on sockfs. It stores
 * nothing itself and unconditionally returns -EAGAIN.
 */
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
                                     struct dentry *dentry, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
{
        /* Handled by LSM. */
        return -EAGAIN;
}

/* Handler matching the security.* prefix; only a .set callback is supplied. */
static const struct xattr_handler sockfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .set = sockfs_security_xattr_set,
};

/* NULL-terminated handler table installed by sockfs_init_fs_context(). */
static const struct xattr_handler *sockfs_xattr_handlers[] = {
        &sockfs_xattr_handler,
        &sockfs_security_xattr_handler,
        NULL
};

static int sockfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->ops = &sockfs_ops;
        ctx->dops = &sockfs_dentry_operations;
        ctx->xattr = sockfs_xattr_handlers;
        return 0;
}

/*
 * Mount of sockfs: sock_alloc() allocates inodes on its superblock and
 * sock_alloc_file() passes it to alloc_file_pseudo().
 */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type description for "sockfs". */
static struct file_system_type sock_fs_type = {
        .name =                "sockfs",
        .init_fs_context = sockfs_init_fs_context,
        .kill_sb =        kill_anon_super,
};

/*
 *        Obtains the first available file descriptor and sets it up for use.
 *
 *        These functions create file structures and map them to the fd space
 *        of the current process. On success they return a file descriptor,
 *        with the file struct implicitly stored in sock->file.
 *        Note that another thread may close file descriptor before we return
 *        from this function. We use the fact that now we do not refer
 *        to socket after mapping. If one day we will need it, this
 *        function will increment ref. count on file by 1.
 *
 *        In any case returned fd MAY BE not valid!
 *        This race condition is unavoidable
 *        with shared fd spaces, we cannot solve it inside kernel,
 *        but we take care of internal coherence yet.
 */

/**
 *        sock_alloc_file - Bind a &socket to a &file
 *        @sock: socket
 *        @flags: file status flags
 *        @dname: protocol name
 *
 *        Returns the &file bound with @sock, implicitly storing it
 *        in sock->file. If dname is %NULL, sets to "".
 *        On failure the return is an ERR pointer (see linux/err.h).
 *        This function uses GFP_KERNEL internally.
 */

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
        struct file *file;

        if (!dname)
                dname = sock->sk ? sock->sk->sk_prot_creator->name : "";

        file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
                                O_RDWR | (flags & O_NONBLOCK),
                                &socket_file_ops);
        if (IS_ERR(file)) {
                sock_release(sock);
                return file;
        }

        sock->file = file;
        file->private_data = sock;
        stream_open(SOCK_INODE(sock), file);
        return file;
}
EXPORT_SYMBOL(sock_alloc_file);

/*
 * Reserve a file descriptor, wrap @sock in a file and install it.
 * Returns the new fd or a negative errno; on every failure path the
 * socket has been released (directly here, or by sock_alloc_file()).
 */
static int sock_map_fd(struct socket *sock, int flags)
{
        struct file *newfile;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
                sock_release(sock);
                return fd;
        }

        newfile = sock_alloc_file(sock, flags, NULL);
        if (IS_ERR(newfile)) {
                /* Give the reserved descriptor back. */
                put_unused_fd(fd);
                return PTR_ERR(newfile);
        }

        fd_install(fd, newfile);
        return fd;
}

/**
 *        sock_from_file - Return the &socket bound to @file.
 *        @file: file
 *        @err: pointer to an error code return
 *
 *        On failure returns %NULL and assigns -ENOTSOCK to @err.
 */

struct socket *sock_from_file(struct file *file, int *err)
{
        /* A file is a socket exactly when it uses socket_file_ops. */
        if (file->f_op != &socket_file_ops) {
                *err = -ENOTSOCK;
                return NULL;
        }

        return file->private_data;        /* set in sock_alloc_file() */
}
EXPORT_SYMBOL(sock_from_file);

/**
 *        sockfd_lookup - Go from a file number to its socket slot
 *        @fd: file handle
 *        @err: pointer to an error code return
 *
 *        The file handle passed in is locked and the socket it is bound
 *        to is returned. If an error occurs the err pointer is overwritten
 *        with a negative errno code and NULL is returned. The function checks
 *        for both invalid handles and passing a handle which is not a socket.
 *
 *        On a success the socket object pointer is returned.
 */

struct socket *sockfd_lookup(int fd, int *err)
{
        struct file *file = fget(fd);
        struct socket *sock;

        if (file == NULL) {
                *err = -EBADF;
                return NULL;
        }

        sock = sock_from_file(file, err);
        if (sock)
                return sock;

        /* Not a socket: drop the reference obtained from fget(). */
        fput(file);
        return NULL;
}
EXPORT_SYMBOL(sockfd_lookup);

/*
 * fdget()-based variant of sockfd_lookup(). On success the socket is
 * returned and *fput_needed is set from the FDPUT_FPUT flag of the lookup;
 * on failure NULL is returned and *err holds -EBADF or the error stored
 * by sock_from_file().
 */
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
        struct fd f = fdget(fd);
        struct socket *sock;

        *err = -EBADF;
        if (!f.file)
                return NULL;

        sock = sock_from_file(f.file, err);
        if (unlikely(!sock)) {
                fdput(f);
                return NULL;
        }

        *fput_needed = f.flags & FDPUT_FPUT;
        return sock;
}

/*
 * List the xattr names of a sockfs inode: whatever the security layer
 * reports, followed by the sockprotoname attribute name (NUL included).
 * With a NULL @buffer only the total size is computed; otherwise -ERANGE
 * is returned as soon as @size is exceeded.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
                                size_t size)
{
        const ssize_t name_len = XATTR_NAME_SOCKPROTONAME_LEN + 1;
        ssize_t sec_len;

        sec_len = security_inode_listsecurity(d_inode(dentry), buffer, size);
        if (sec_len < 0)
                return sec_len;

        /* Size query: nothing to copy, just report the space required. */
        if (!buffer)
                return sec_len + name_len;

        if (size < sec_len)
                return -ERANGE;
        if (size < sec_len + name_len)
                return -ERANGE;

        /* Append our own attribute name after the security names. */
        memcpy(buffer + sec_len, XATTR_NAME_SOCKPROTONAME, name_len);

        return sec_len + name_len;
}

/*
 * setattr for sockfs inodes: apply the generic change and, when the uid
 * was changed, mirror it into the attached sock's sk_uid. Returns -ENOENT
 * if the uid changed but no sock is attached.
 */
static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct socket *sock;
        int err;

        err = simple_setattr(dentry, iattr);
        if (err)
                return err;

        if (!(iattr->ia_valid & ATTR_UID))
                return 0;

        sock = SOCKET_I(d_inode(dentry));
        if (!sock->sk)
                return -ENOENT;

        sock->sk->sk_uid = iattr->ia_uid;
        return 0;
}

/* Inode operations installed on every socket inode by sock_alloc(). */
static const struct inode_operations sockfs_inode_ops = {
        .listxattr = sockfs_listxattr,
        .setattr = sockfs_setattr,
};

/**
 *        sock_alloc - allocate a socket
 *
 *        Allocate a new inode and socket object. The two are bound together
 *        and initialised. The socket is then returned. If we are out of inodes
 *        NULL is returned. This function uses GFP_KERNEL internally.
 */

struct socket *sock_alloc(void)
{
        struct inode *inode = new_inode_pseudo(sock_mnt->mnt_sb);

        if (inode == NULL)
                return NULL;

        /* Socket inode owned by the caller's fsuid/fsgid, rwx for everyone. */
        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFSOCK | S_IRWXUGO;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_op = &sockfs_inode_ops;

        /* The socket lives in the same allocation as its inode. */
        return SOCKET_I(inode);
}
EXPORT_SYMBOL(sock_alloc);

/*
 * Common release path. If protocol ops are attached, call their release
 * callback (under the inode lock when @inode is non-NULL), clear sk and
 * ops, and drop the module reference. Then either detach the file or,
 * when no file is attached, put the inode.
 */
static void __sock_release(struct socket *sock, struct inode *inode)
{
        if (sock->ops) {
                /* Remember the owner now; sock->ops is cleared below. */
                struct module *mod = sock->ops->owner;

                if (inode)
                        inode_lock(inode);
                sock->ops->release(sock);
                sock->sk = NULL;
                if (inode)
                        inode_unlock(inode);

                sock->ops = NULL;
                module_put(mod);
        }

        if (sock->wq.fasync_list)
                pr_err("%s: fasync list not empty!\n", __func__);

        if (sock->file) {
                /* A file is attached: only break the link to it. */
                sock->file = NULL;
                return;
        }

        /* No file attached: drop the inode reference directly. */
        iput(SOCK_INODE(sock));
}

/**
 *        sock_release - close a socket
 *        @sock: socket to close
 *
 *        The socket is released from the protocol stack if it has a release
 *        callback, and the inode is then released if the socket is bound to
 *        an inode not a file.
 */
void sock_release(struct socket *sock)
{
        /* No inode is passed, so __sock_release() takes no inode lock. */
        __sock_release(sock, NULL);
}
EXPORT_SYMBOL(sock_release);

/*
 * Translate the SOF_TIMESTAMPING_TX_* bits in @tsflags into the matching
 * SKBTX_* bits and OR them into *@tx_flags.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
        u8 extra = 0;

        if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
                extra |= SKBTX_HW_TSTAMP;
        if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
                extra |= SKBTX_SW_TSTAMP;
        if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
                extra |= SKBTX_SCHED_TSTAMP;

        /* Bits already present in *tx_flags are preserved. */
        *tx_flags |= extra;
}
EXPORT_SYMBOL(__sock_tx_timestamp);

INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
                                           size_t));
INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
                                            size_t));
/*
 * Hand @msg to the protocol's sendmsg op without consulting the security
 * layer. The length passed down is the data still left in the iterator.
 */
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
        size_t len = msg_data_left(msg);
        int ret;

        ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
                                 inet_sendmsg, sock, msg, len);
        BUG_ON(ret == -EIOCBQUEUED);

        return ret;
}

/*
 * Run the security hook for a send and, if it allows the operation,
 * pass @msg on to the protocol. Returns the hook's error or the result
 * of sock_sendmsg_nosec().
 */
static int __sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        int err;

        err = security_socket_sendmsg(sock, msg, msg_data_left(msg));
        if (err)
                return err;

        return sock_sendmsg_nosec(sock, msg);
}

/**
 *        sock_sendmsg - send a message through @sock
 *        @sock: socket
 *        @msg: message to send
 *
 *        Sends @msg through @sock, passing through LSM.
 *        Returns the number of bytes sent, or an error code.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name;
        struct sockaddr_storage address;
        int save_len = msg->msg_namelen;
        int ret;

        if (msg->msg_name) {
                memcpy(&address, msg->msg_name, msg->msg_namelen);
                msg->msg_name = &address;
        }

        ret = __sock_sendmsg(sock, msg);
        msg->msg_name = save_addr;
        msg->msg_namelen = save_len;

        return ret;
}
EXPORT_SYMBOL(sock_sendmsg);

/**
 *        kernel_sendmsg - send a message through @sock (kernel-space)
 *        @sock: socket
 *        @msg: message header
 *        @vec: kernel vec
 *        @num: vec array length
 *        @size: total message data size
 *
 *        Builds the message data with @vec and sends it through @sock.
 *        Returns the number of bytes sent, or an error code.
 */

int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size)
{
        /* Point the message iterator at the caller's kvec array as data source. */
        iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size);
        return sock_sendmsg(sock, msg);
}
EXPORT_SYMBOL(kernel_sendmsg);

/**
 *        kernel_sendmsg_locked - send a message through @sock (kernel-space)
 *        @sk: sock
 *        @msg: message header
 *        @vec: output s/g array
 *        @num: output s/g array length
 *        @size: total message data size
 *
 *        Builds the message data with @vec and sends it through the socket of @sk.
 *        Returns the number of bytes sent, or an error code.
 *        Caller must hold @sk.
 */

int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
                          struct kvec *vec, size_t num, size_t size)
{
        struct socket *sock = sk->sk_socket;

        if (sock->ops->sendmsg_locked) {
                iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size);
                return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg));
        }

        /* The protocol has no sendmsg_locked op: use the generic stub. */
        return sock_no_sendmsg_locked(sk, msg, size);
}
EXPORT_SYMBOL(kernel_sendmsg_locked);

/* Return true if @skb is marked as coming from a socket error queue. */
static bool skb_is_err_queue(const struct sk_buff *skb)
{
        /* pkt_type of skbs enqueued on the error queue are set to
         * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
         * in recvmsg, since skbs received on a local socket will never
         * have a pkt_type of PACKET_OUTGOING.
         */
        return skb->pkt_type == PACKET_OUTGOING;
}

/* On transmit, software and hardware timestamps are returned independently.
 * As the two skb clones share the hardware timestamp, which may be updated
 * before the software timestamp is received, a hardware TX timestamp may be
 * returned only if there is no software TX timestamp. Ignore false software
 * timestamps, which may be made in the __sock_recv_timestamp() call when the
 * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a
 * hardware timestamp.
 */
static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
{
        return skb->tstamp && !false_tstamp && skb_is_err_queue(skb);
}

static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
        struct scm_ts_pktinfo ts_pktinfo;
        struct net_device *orig_dev;

        if (!skb_mac_header_was_set(skb))
                return;

        memset(&ts_pktinfo, 0, sizeof(ts_pktinfo));

        rcu_read_lock();
        orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
        if (orig_dev)
                ts_pktinfo.if_index = orig_dev->ifindex;
        rcu_read_unlock();

        ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
                 sizeof(ts_pktinfo), &ts_pktinfo);
}

/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
        int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
        struct scm_timestamping_internal tss;

        /*
         * empty:        stays 1 until a timestamp has been stored in tss.
         * false_tstamp: set when skb->tstamp was filled in below rather
         *               than being present on the skb already.
         */
        int empty = 1, false_tstamp = 0;
        struct skb_shared_hwtstamps *shhwtstamps =
                skb_hwtstamps(skb);

        /* Race occurred between timestamp enabling and packet
           receiving.  Fill in the current time for now. */
        if (need_software_tstamp && skb->tstamp == 0) {
                __net_timestamp(skb);
                false_tstamp = 1;
        }

        /*
         * Emit the plain software timestamp cmsg. SOCK_RCVTSTAMPNS selects
         * the timespec (ns) variants over the timeval ones, SOCK_TSTAMP_NEW
         * selects the *_NEW structures and cmsg types over the *_OLD ones.
         */
        if (need_software_tstamp) {
                if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
                        if (new_tstamp) {
                                struct __kernel_sock_timeval tv;

                                skb_get_new_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
                                         sizeof(tv), &tv);
                        } else {
                                struct __kernel_old_timeval tv;

                                skb_get_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
                                         sizeof(tv), &tv);
                        }
                } else {
                        if (new_tstamp) {
                                struct __kernel_timespec ts;

                                skb_get_new_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
                                         sizeof(ts), &ts);
                        } else {
                                struct __kernel_old_timespec ts;

                                skb_get_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
                                         sizeof(ts), &ts);
                        }
                }
        }

        /*
         * Build the timestamping record: slot 0 receives the software
         * timestamp, slot 2 the raw hardware timestamp. Each is only stored
         * when the matching sk_tsflags bit is set and the conversion helper
         * reports success.
         */
        memset(&tss, 0, sizeof(tss));
        if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
            ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
                empty = 0;
        if (shhwtstamps &&
            (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
            !skb_is_swtx_tstamp(skb, false_tstamp) &&
            ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
                empty = 0;
                /* Packet info is only added for skbs not from the error queue. */
                if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
                    !skb_is_err_queue(skb))
                        put_ts_pktinfo(msg, skb);
        }
        if (!empty) {
                if (sock_flag(sk, SOCK_TSTAMP_NEW))
                        put_cmsg_scm_timestamping64(msg, &tss);
                else
                        put_cmsg_scm_timestamping(msg, &tss);

                /* Error-queue skbs flagged opt_stats pass their payload on as stats. */
                if (skb_is_err_queue(skb) && skb->len &&
                    SKB_EXT_ERR(skb)->opt_stats)
                        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
                                 skb->len, skb->data);
        }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

/*
 * Emit an SCM_WIFI_STATUS control message holding skb->wifi_acked, provided
 * the socket has SOCK_WIFI_STATUS set and the skb's ack status is valid.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int ack;

        if (!sock_flag(sk, SOCK_WIFI_STATUS) || !skb->wifi_acked_valid)
                return;

        /* Widen the skb field to an int for the cmsg payload. */
        ack = skb->wifi_acked;
        put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);

/*
 * Emit an SO_RXQ_OVFL control message with the skb's drop count when the
 * socket has SOCK_RXQ_OVFL set, an skb is given and the count is non-zero.
 */
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
        if (!sock_flag(sk, SOCK_RXQ_OVFL))
                return;
        if (!skb || !SOCK_SKB_CB(skb)->dropcount)
                return;

        put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
                 sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
}

/*
 * Add both the timestamp and the drop-count control messages for @skb
 * to @msg, in that order.
 */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        sock_recv_timestamp(msg, sk, skb);
        sock_recv_drops(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);

INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
                                           size_t, int));
INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
                                            size_t, int));
/*
 * Call the protocol's recvmsg op without consulting the security layer.
 * The length passed down is the space still left in the iterator.
 */
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
        size_t len = msg_data_left(msg);

        return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
                                  inet_recvmsg, sock, msg, len, flags);
}

/**
 *        sock_recvmsg - receive a message from @sock
 *        @sock: socket
 *        @msg: message to receive
 *        @flags: message flags
 *
 *        Receives @msg from @sock, passing through LSM. Returns the total number
 *        of bytes received, or an error.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
        int err;

        /* The security hook decides first; its error is returned verbatim. */
        err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
        if (err)
                return err;

        return sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);

/**
 *        kernel_recvmsg - Receive a message from a socket (kernel space)
 *        @sock: The socket to receive the message from
 *        @msg: Received message
 *        @vec: Input s/g array for message data
 *        @num: Size of input s/g array
 *        @size: Number of bytes to read
 *        @flags: Message flags (MSG_DONTWAIT, etc...)
 *
 *        On return the msg structure contains the scatter/gather array passed in the
 *        vec argument. The array is modified so that it consists of the unfilled
 *        portion of the original array.
 *
 *        The returned value is the total number of bytes received, or an error.
 */

int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size, int flags)
{
        /* Mark the control buffer of @msg as not being a user pointer. */
        msg->msg_control_is_user = false;
        /* Point the message iterator at the caller's kvec array as destination. */
        iov_iter_kvec(&msg->msg_iter, READ, vec, num, size);
        return sock_recvmsg(sock, msg, flags);
}
EXPORT_SYMBOL(kernel_recvmsg);

/*
 * Send @size bytes of @page starting at @offset through the socket behind
 * @file. MSG_DONTWAIT is added for O_NONBLOCK files; @ppos is unused.
 */
static ssize_t sock_sendpage(struct file *file, struct page *page,
                             int offset, size_t size, loff_t *ppos, int more)
{
        struct socket *sock = file->private_data;
        /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
        int flags = more;

        if (file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        return kernel_sendpage(sock, page, offset, size, flags);
}

/*
 * Splice from a socket into a pipe using the protocol's splice_read op,
 * falling back to generic_file_splice_read() when the protocol has none.
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags)
{
        struct socket *sock = file->private_data;

        if (likely(sock->ops->splice_read))
                return sock->ops->splice_read(sock, ppos, pipe, len, flags);

        return generic_file_splice_read(file, ppos, pipe, len, flags);
}

/*
 * read_iter for sockets: receive into @to via sock_recvmsg(). A non-zero
 * file position yields -ESPIPE; an empty iterator returns 0 at once.
 */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {.msg_iter = *to,
                             .msg_iocb = iocb};
        int flags = 0;
        ssize_t res;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;

        /* Either the file or the request itself may ask for non-blocking I/O. */
        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                flags = MSG_DONTWAIT;
        msg.msg_flags = flags;

        res = sock_recvmsg(sock, &msg, flags);
        /* Hand the advanced iterator back to the caller. */
        *to = msg.msg_iter;
        return res;
}

/*
 * write_iter for sockets: send the data in @from via __sock_sendmsg().
 * A non-zero file position yields -ESPIPE.
 */
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {.msg_iter = *from,
                             .msg_iocb = iocb};
        unsigned int send_flags = 0;
        ssize_t res;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        /* Either the file or the request itself may ask for non-blocking I/O. */
        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                send_flags |= MSG_DONTWAIT;

        /* Each write on a SOCK_SEQPACKET socket is sent with MSG_EOR. */
        if (sock->type == SOCK_SEQPACKET)
                send_flags |= MSG_EOR;

        msg.msg_flags = send_flags;

        res = __sock_sendmsg(sock, &msg);
        /* Hand the advanced iterator back to the caller. */
        *from = msg.msg_iter;
        return res;
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

/* Held while br_ioctl_hook is updated and while sock_ioctl() calls it. */
static DEFINE_MUTEX(br_ioctl_mutex);
/* Handler for the bridge ioctls; NULL while none has been set. */
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);

/* Set the bridge ioctl handler to @hook under br_ioctl_mutex. */
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
        mutex_lock(&br_ioctl_mutex);
        br_ioctl_hook = hook;
        mutex_unlock(&br_ioctl_mutex);
}
EXPORT_SYMBOL(brioctl_set);

/* Held while vlan_ioctl_hook is updated and while sock_ioctl() calls it. */
static DEFINE_MUTEX(vlan_ioctl_mutex);
/* Handler for the VLAN ioctls; NULL while none has been set. */
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/* Set the VLAN ioctl handler to @hook under vlan_ioctl_mutex. */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
        mutex_lock(&vlan_ioctl_mutex);
        vlan_ioctl_hook = hook;
        mutex_unlock(&vlan_ioctl_mutex);
}
EXPORT_SYMBOL(vlan_ioctl_set);

/* Held while dlci_ioctl_hook is updated and while sock_ioctl() calls it. */
static DEFINE_MUTEX(dlci_ioctl_mutex);
/* Handler for the DLCI ioctls; NULL while none has been set. */
static int (*dlci_ioctl_hook) (unsigned int, void __user *);

/* Set the DLCI ioctl handler to @hook under dlci_ioctl_mutex. */
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
{
        mutex_lock(&dlci_ioctl_mutex);
        dlci_ioctl_hook = hook;
        mutex_unlock(&dlci_ioctl_mutex);
}
EXPORT_SYMBOL(dlci_ioctl_set);

/*
 * Offer the ioctl to the protocol first. Only when the protocol answers
 * -ENOIOCTLCMD is the request handled here: SIOCGIFCONF through
 * dev_ifconf(), other socket ioctl commands through dev_ioctl(), and
 * everything else is rejected with -ENOTTY.
 */
static long sock_do_ioctl(struct net *net, struct socket *sock,
                          unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        int err;

        err = sock->ops->ioctl(sock, cmd, arg);

        /*
         * If this ioctl is unknown try to hand it down
         * to the NIC driver.
         */
        if (err != -ENOIOCTLCMD)
                return err;

        if (cmd == SIOCGIFCONF) {
                struct ifconf ifc;

                if (copy_from_user(&ifc, argp, sizeof(ifc)))
                        return -EFAULT;

                rtnl_lock();
                err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
                rtnl_unlock();
                if (err)
                        return err;

                /* Copy the updated ifconf back only on success. */
                if (copy_to_user(argp, &ifc, sizeof(ifc)))
                        return -EFAULT;
                return 0;
        }

        if (is_socket_ioctl_cmd(cmd)) {
                struct ifreq ifr;
                bool need_copyout;

                if (copy_from_user(&ifr, argp, sizeof(ifr)))
                        return -EFAULT;

                err = dev_ioctl(net, cmd, &ifr, &need_copyout);
                /* dev_ioctl() reports whether the ifreq must be written back. */
                if (!err && need_copyout &&
                    copy_to_user(argp, &ifr, sizeof(ifr)))
                        return -EFAULT;
                return err;
        }

        return -ENOTTY;
}

/*
 *        With an ioctl, arg may well be a user mode pointer, but we don't know
 *        what to do with it - that's up to the protocol still.
 */

/*
 * ioctl entry point for socket files. Device-private and (when configured)
 * wireless-extension command ranges are dispatched first; the remaining
 * commands are handled by the switch below, with anything unrecognised
 * passed to sock_do_ioctl().
 */
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        struct socket *sock;
        struct sock *sk;
        void __user *argp = (void __user *)arg;
        int pid, err;
        struct net *net;

        sock = file->private_data;
        sk = sock->sk;
        net = sock_net(sk);
        /* Device-private ioctls go straight to dev_ioctl() with a copied ifreq. */
        if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
                struct ifreq ifr;
                bool need_copyout;
                if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
                        return -EFAULT;
                err = dev_ioctl(net, cmd, &ifr, &need_copyout);
                if (!err && need_copyout)
                        if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
                                return -EFAULT;
        } else
#ifdef CONFIG_WEXT_CORE
        /* Wireless-extension command range. */
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
                err = wext_handle_ioctl(net, cmd, argp);
        } else
#endif
                switch (cmd) {
                /* Set the owner of the socket's file from a user-supplied pid. */
                case FIOSETOWN:
                case SIOCSPGRP:
                        err = -EFAULT;
                        if (get_user(pid, (int __user *)argp))
                                break;
                        err = f_setown(sock->file, pid, 1);
                        break;
                /* Report the current owner of the socket's file. */
                case FIOGETOWN:
                case SIOCGPGRP:
                        err = put_user(f_getown(sock->file),
                                       (int __user *)argp);
                        break;
                /*
                 * Bridge ioctls: request the "bridge" module if no hook is
                 * set, then call the hook under its mutex. -ENOPKG is
                 * returned if there is still no hook.
                 */
                case SIOCGIFBR:
                case SIOCSIFBR:
                case SIOCBRADDBR:
                case SIOCBRDELBR:
                        err = -ENOPKG;
                        if (!br_ioctl_hook)
                                request_module("bridge");

                        mutex_lock(&br_ioctl_mutex);
                        if (br_ioctl_hook)
                                err = br_ioctl_hook(net, cmd, argp);
                        mutex_unlock(&br_ioctl_mutex);
                        break;
                /* VLAN ioctls: same pattern with the "8021q" module. */
                case SIOCGIFVLAN:
                case SIOCSIFVLAN:
                        err = -ENOPKG;
                        if (!vlan_ioctl_hook)
                                request_module("8021q");

                        mutex_lock(&vlan_ioctl_mutex);
                        if (vlan_ioctl_hook)
                                err = vlan_ioctl_hook(net, argp);
                        mutex_unlock(&vlan_ioctl_mutex);
                        break;
                /* DLCI ioctls: same pattern with the "dlci" module. */
                case SIOCADDDLCI:
                case SIOCDELDLCI:
                        err = -ENOPKG;
                        if (!dlci_ioctl_hook)
                                request_module("dlci");

                        mutex_lock(&dlci_ioctl_mutex);
                        if (dlci_ioctl_hook)
                                err = dlci_ioctl_hook(cmd, argp);
                        mutex_unlock(&dlci_ioctl_mutex);
                        break;
                /* Open the socket's network namespace; needs CAP_NET_ADMIN in its user ns. */
                case SIOCGSKNS:
                        err = -EPERM;
                        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                                break;

                        err = open_related_ns(&net->ns, get_net_ns);
                        break;
                /*
                 * Timestamp queries go to the protocol's gettstamp op, if any.
                 * The third argument is true for the SIOCGSTAMP_* (not
                 * SIOCGSTAMPNS_*) command; the fourth is true only for the
                 * _OLD commands on a kernel built without CONFIG_64BIT.
                 */
                case SIOCGSTAMP_OLD:
                case SIOCGSTAMPNS_OLD:
                        if (!sock->ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        err = sock->ops->gettstamp(sock, argp,
                                                   cmd == SIOCGSTAMP_OLD,
                                                   !IS_ENABLED(CONFIG_64BIT));
                        break;
                case SIOCGSTAMP_NEW:
                case SIOCGSTAMPNS_NEW:
                        if (!sock->ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        err = sock->ops->gettstamp(sock, argp,
                                                   cmd == SIOCGSTAMP_NEW,
                                                   false);
                        break;
                /* Everything else: protocol first, then the device layer. */
                default:
                        err = sock_do_ioctl(net, sock, cmd, arg);
                        break;
                }
        return err;
}

/**
 *        sock_create_lite - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        Allocates a bare socket, runs it through the LSM create and
 *        post-create hooks and hands it back through @res.
 *        The new socket initialization is not complete, see kernel_accept().
 *        Returns 0 or an error. On failure @res is set to %NULL.
 *        This function internally uses GFP_KERNEL.
 */

int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
        struct socket *new_sock = NULL;
        int rc;

        rc = security_socket_create(family, type, protocol, 1);
        if (rc)
                goto done;

        new_sock = sock_alloc();
        if (!new_sock) {
                rc = -ENOMEM;
                goto done;
        }

        new_sock->type = type;
        rc = security_socket_post_create(new_sock, family, type, protocol, 1);
        if (rc) {
                /* The LSM vetoed the socket: drop it and report NULL. */
                sock_release(new_sock);
                new_sock = NULL;
        }

done:
        /* Every exit, success or failure, publishes the result here. */
        *res = new_sock;
        return rc;
}
EXPORT_SYMBOL(sock_create_lite);

/*
 * poll() file operation for sockets; no kernel lock held - perfect.
 *
 * Returns 0 when the protocol has no ->poll handler, otherwise the
 * protocol's mask, with POLL_BUSY_LOOP added when the socket can busy loop.
 */
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
        struct socket *sock = file->private_data;
        __poll_t requested = poll_requested_events(wait);
        __poll_t busy_flag = 0;
        __poll_t mask;

        if (!sock->ops->poll)
                return 0;

        if (sk_can_busy_loop(sock->sk)) {
                /* poll once if requested by the syscall */
                if (requested & POLL_BUSY_LOOP)
                        sk_busy_loop(sock->sk, 1);

                /* if this socket can poll_ll, tell the system call */
                busy_flag = POLL_BUSY_LOOP;
        }

        mask = sock->ops->poll(file, sock, wait);
        return mask | busy_flag;
}

/* mmap() file operation: delegate to the protocol's ->mmap handler. */
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct socket *so = file->private_data;

        return so->ops->mmap(file, so, vma);
}

/* release() file operation: release the socket embedded in @inode. */
static int sock_close(struct inode *inode, struct file *filp)
{
        struct socket *so = SOCKET_I(inode);

        __sock_release(so, inode);

        /* Always reports success. */
        return 0;
}

/*
 *        Update the socket async list
 *
 *        Fasync_list locking strategy.
 *
 *        1. fasync_list is modified only under process context socket lock
 *           i.e. under semaphore.
 *        2. fasync_list is used under read_lock(&sk->sk_callback_lock)
 *           or under socket lock
 *
 *        Returns 0 on success, -EINVAL if the socket has no struct sock
 *        attached, or the negative error reported by fasync_helper().
 */

static int sock_fasync(int fd, struct file *filp, int on)
{
        struct socket *sock = filp->private_data;
        struct sock *sk = sock->sk;
        struct socket_wq *wq = &sock->wq;
        int err;

        if (sk == NULL)
                return -EINVAL;

        lock_sock(sk);
        /*
         * fasync_helper() returns a negative errno on failure, 0 when it
         * changed nothing and a positive value when it added or removed an
         * entry.  The result used to be dropped, so a failed registration
         * was reported to the caller as success.
         */
        err = fasync_helper(fd, filp, on, &wq->fasync_list);

        /* Keep SOCK_FASYNC in sync with whatever the list looks like now. */
        if (!wq->fasync_list)
                sock_reset_flag(sk, SOCK_FASYNC);
        else
                sock_set_flag(sk, SOCK_FASYNC);

        release_sock(sk);

        /* Only failures are propagated; "changed"/"unchanged" both map to 0. */
        return err < 0 ? err : 0;
}

/*
 * Deliver an async notification to the owners registered on @wq.
 * This function may be called only under rcu_lock.
 *
 * Returns -1 when there is no wait queue or no fasync list, 0 otherwise
 * (whether or not a signal was actually sent).
 */

int sock_wake_async(struct socket_wq *wq, int how, int band)
{
        if (!wq || !wq->fasync_list)
                return -1;

        switch (how) {
        case SOCK_WAKE_WAITD:
                /* Stay quiet while SOCKWQ_ASYNC_WAITDATA is set. */
                if (!test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
                        kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_SPACE:
                /* Signal only if SOCKWQ_ASYNC_NOSPACE was set; clear it. */
                if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
                        kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_IO:
                kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_URG:
                kill_fasync(&wq->fasync_list, SIGURG, band);
                break;
        }

        return 0;
}
EXPORT_SYMBOL(sock_wake_async);

/**
 *        __sock_create - creates a socket
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *        @kern: boolean for kernel space sockets
 *
 *        Creates a new socket and assigns it to @res, passing through LSM.
 *        Returns 0 or an error. On failure @res is set to %NULL. @kern must
 *        be set to true if the socket resides in kernel space.
 *        This function internally uses GFP_KERNEL.
 *
 *        Errors generated here: -EAFNOSUPPORT (family out of range, no
 *        registered family, or a module reference could not be taken),
 *        -EINVAL (type out of range), -ENFILE (no socket could be
 *        allocated); errors from the LSM hooks and from the family's
 *        ->create() are passed through.
 */

int __sock_create(struct net *net, int family, int type, int protocol,
                         struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        /*
         * Honour the documented contract: every failure path below leaves
         * *res == NULL, so a caller never sees a stale or uninitialized
         * pointer.  The success path overwrites it with the new socket.
         */
        *res = NULL;

        /*
         *      Check protocol is in range
         */
        if (family < 0 || family >= NPROTO)
                return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)
                return -EINVAL;

        /* Compatibility.

           This uglymoron is moved from INET layer to here to avoid
           deadlock in module load.
         */
        if (family == PF_INET && type == SOCK_PACKET) {
                pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                             current->comm);
                family = PF_PACKET;
        }

        err = security_socket_create(family, type, protocol, kern);
        if (err)
                return err;

        /*
         *        Allocate the socket and allow the family to set things up. if
         *        the protocol is 0, the family is instructed to select an appropriate
         *        default.
         */
        sock = sock_alloc();
        if (!sock) {
                net_warn_ratelimited("socket: no more sockets\n");
                return -ENFILE;        /* Not exactly a match, but it's the
                                   closest posix thing */
        }

        sock->type = type;

#ifdef CONFIG_MODULES
        /* Attempt to load a protocol module if the find failed.
         *
         * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
        if (rcu_access_pointer(net_families[family]) == NULL)
                request_module("net-pf-%d", family);
#endif

        rcu_read_lock();
        pf = rcu_dereference(net_families[family]);
        err = -EAFNOSUPPORT;
        if (!pf)
                goto out_release;

        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
                goto out_release;

        /* Now protected by module ref count */
        rcu_read_unlock();

        err = pf->create(net, sock, protocol, kern);
        if (err < 0)
                goto out_module_put;

        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
                goto out_module_busy;

        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);
        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
                goto out_sock_release;
        *res = sock;

        return 0;

out_module_busy:
        err = -EAFNOSUPPORT;
out_module_put:
        /* No ops reference is held on these paths, so clear ops before release. */
        sock->ops = NULL;
        module_put(pf->owner);
out_sock_release:
        sock_release(sock);
        return err;

out_release:
        /* Failure while still inside the RCU read-side section. */
        rcu_read_unlock();
        goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

/**
 *        sock_create - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create() that uses the calling task's
 *        network namespace and passes @kern = 0.
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create(int family, int type, int protocol, struct socket **res)
{
        struct net *net = current->nsproxy->net_ns;

        return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

/**
 *        sock_create_kern - creates a socket (kernel space)
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create() that passes @kern = 1 and the
 *        caller-supplied @net.
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
{
        return __sock_create(net, family, type, protocol, res, 1);
}
EXPORT_SYMBOL(sock_create_kern);

/*
 *        Create a socket and bind it to a new file descriptor.
 *
 *        @type carries the socket type in its SOCK_TYPE_MASK bits and may be
 *        OR-ed with SOCK_CLOEXEC / SOCK_NONBLOCK; any other extra bit yields
 *        -EINVAL.  Returns the value of sock_map_fd() on success or the
 *        negative error from sock_create().
 */
int __sys_socket(int family, int type, int protocol)
{
        struct socket *sock;
        int fd_flags;
        int err;

        /* Check the SOCK_* constants for consistency.  */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Split the flag bits off the type. */
        fd_flags = type & ~SOCK_TYPE_MASK;
        if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Translate SOCK_NONBLOCK where it differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
                fd_flags &= ~SOCK_NONBLOCK;
                fd_flags |= O_NONBLOCK;
        }

        err = sock_create(family, type, protocol, &sock);
        if (err < 0)
                return err;

        return sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
}

/* socket syscall entry point: forwards its arguments to __sys_socket(). */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
        return __sys_socket(family, type, protocol);
}

/*
 *        Create a pair of connected sockets.
 *
 *        @type may be OR-ed with SOCK_CLOEXEC / SOCK_NONBLOCK; any other bit
 *        outside SOCK_TYPE_MASK yields -EINVAL.  The two descriptor numbers
 *        are written to @usockvec[0] and @usockvec[1] before the sockets are
 *        created.  Returns 0 on success or a negative error, in which case
 *        both reserved descriptors have been given back.
 */

int __sys_socketpair(int family, int type, int protocol, int __user *usockvec)
{
        struct socket *sock1, *sock2;
        int fd1, fd2, err;
        struct file *newfile1, *newfile2;
        int flags;

        /* Split the flag bits off the type. */
        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Translate SOCK_NONBLOCK where it differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        /*
         * reserve descriptors and make sure we won't fail
         * to return them to userland.
         */
        fd1 = get_unused_fd_flags(flags);
        if (unlikely(fd1 < 0))
                return fd1;

        fd2 = get_unused_fd_flags(flags);
        if (unlikely(fd2 < 0)) {
                put_unused_fd(fd1);
                return fd2;
        }

        err = put_user(fd1, &usockvec[0]);
        if (err)
                goto out;

        err = put_user(fd2, &usockvec[1]);
        if (err)
                goto out;

        /*
         * Obtain the first socket and check if the underlying protocol
         * supports the socketpair call.
         */

        err = sock_create(family, type, protocol, &sock1);
        if (unlikely(err < 0))
                goto out;

        err = sock_create(family, type, protocol, &sock2);
        if (unlikely(err < 0)) {
                sock_release(sock1);
                goto out;
        }

        err = security_socket_socketpair(sock1, sock2);
        if (unlikely(err)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        err = sock1->ops->socketpair(sock1, sock2);
        if (unlikely(err < 0)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        /*
         * NOTE(review): only sock2 is released when this fails, so
         * sock_alloc_file() presumably disposes of the socket it was given
         * on error - confirm against its definition.
         */
        newfile1 = sock_alloc_file(sock1, flags, NULL);
        if (IS_ERR(newfile1)) {
                err = PTR_ERR(newfile1);
                sock_release(sock2);
                goto out;
        }

        /*
         * sock1 is reachable through newfile1 from here on, so it is dropped
         * with fput() rather than sock_release().
         */
        newfile2 = sock_alloc_file(sock2, flags, NULL);
        if (IS_ERR(newfile2)) {
                err = PTR_ERR(newfile2);
                fput(newfile1);
                goto out;
        }

        audit_fd_pair(fd1, fd2);

        /* Nothing can fail any more: publish both files. */
        fd_install(fd1, newfile1);
        fd_install(fd2, newfile2);
        return 0;

out:
        /* Give back both reserved descriptor numbers. */
        put_unused_fd(fd2);
        put_unused_fd(fd1);
        return err;
}

/* socketpair syscall entry point: forwards to __sys_socketpair(). */
SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
                int __user *, usockvec)
{
        return __sys_socketpair(family, type, protocol, usockvec);
}

/*
 *        Bind a name to a socket. Nothing much to do here since it's
 *        the protocol's responsibility to handle the local address.
 *
 *        The address is copied into kernel space first, then offered to the
 *        LSM, and only then handed to the protocol's ->bind().
 *        Returns 0 or the first error encountered along that chain.
 */

int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
        struct sockaddr_storage address;
        struct sockaddr *kaddr = (struct sockaddr *)&address;
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = move_addr_to_kernel(umyaddr, addrlen, &address);
        if (err)
                goto out_put;

        err = security_socket_bind(sock, kaddr, addrlen);
        if (err)
                goto out_put;

        err = sock->ops->bind(sock, kaddr, addrlen);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* bind syscall entry point: forwards its arguments to __sys_bind(). */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
        return __sys_bind(fd, umyaddr, addrlen);
}

/*
 *        Perform a listen. Basically, we allow the protocol to do anything
 *        necessary for a listen, and if that works, we mark the socket as
 *        ready for listening.
 *
 *        @backlog is clamped to the namespace's sysctl_somaxconn; because the
 *        comparison is done unsigned, a negative @backlog is clamped too.
 */

int __sys_listen(int fd, int backlog)
{
        struct socket *sock;
        int err, fput_needed;
        int max_backlog;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        max_backlog = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
        if ((unsigned int)backlog > max_backlog)
                backlog = max_backlog;

        err = security_socket_listen(sock, backlog);
        if (err)
                goto out_put;

        err = sock->ops->listen(sock, backlog);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* listen syscall entry point: forwards its arguments to __sys_listen(). */
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
        return __sys_listen(fd, backlog);
}

/*
 *        Accept a connection on the socket behind @file and wrap the new
 *        socket in a file.
 *
 *        @file_flags is OR-ed into the listening file's f_flags for the
 *        ->accept() call; @flags is passed to sock_alloc_file() for the new
 *        file.  When @upeer_sockaddr is non-NULL the new socket's name is
 *        copied to @upeer_sockaddr / @upeer_addrlen.
 *
 *        Returns the new file, or an ERR_PTR() on failure.  No descriptor is
 *        allocated or installed here; that is left to the caller.
 */
struct file *do_accept(struct file *file, unsigned file_flags,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags)
{
        struct socket *sock, *newsock;
        struct file *newfile;
        int err, len;
        struct sockaddr_storage address;

        sock = sock_from_file(file, &err);
        if (!sock)
                return ERR_PTR(err);

        newsock = sock_alloc();
        if (!newsock)
                return ERR_PTR(-ENFILE);

        /* The new socket inherits type and ops from the listener. */
        newsock->type = sock->type;
        newsock->ops = sock->ops;

        /*
         * We don't need try_module_get here, as the listening socket (sock)
         * has the protocol module (sock->ops->owner) held.
         */
        __module_get(newsock->ops->owner);

        /*
         * NOTE(review): newsock is not released here on failure, so
         * sock_alloc_file() presumably disposes of it itself - confirm.
         */
        newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
        if (IS_ERR(newfile))
                return newfile;

        /*
         * From here on every failure unwinds through fput(newfile) only;
         * presumably that also tears down newsock - confirm.
         */
        err = security_socket_accept(sock, newsock);
        if (err)
                goto out_fd;

        err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags,
                                        false);
        if (err < 0)
                goto out_fd;

        if (upeer_sockaddr) {
                /* A negative getname() result is reported as -ECONNABORTED. */
                len = newsock->ops->getname(newsock,
                                        (struct sockaddr *)&address, 2);
                if (len < 0) {
                        err = -ECONNABORTED;
                        goto out_fd;
                }
                err = move_addr_to_user(&address,
                                        len, upeer_sockaddr, upeer_addrlen);
                if (err < 0)
                        goto out_fd;
        }

        /* File flags are not inherited via accept(), unlike on other OSes. */
        return newfile;
out_fd:
        fput(newfile);
        return ERR_PTR(err);
}

/*
 *        Accept a connection on @file and install the resulting file at a
 *        freshly reserved descriptor (limited by @nofile).
 *
 *        @flags may only contain SOCK_CLOEXEC / SOCK_NONBLOCK, otherwise
 *        -EINVAL is returned.  Returns the new descriptor or a negative
 *        error.
 */
int __sys_accept4_file(struct file *file, unsigned file_flags,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags,
                       unsigned long nofile)
{
        struct file *accepted;
        int newfd;

        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;

        /* Translate SOCK_NONBLOCK where it differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
                flags &= ~SOCK_NONBLOCK;
                flags |= O_NONBLOCK;
        }

        newfd = __get_unused_fd_flags(flags, nofile);
        if (unlikely(newfd < 0))
                return newfd;

        accepted = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
                             flags);
        if (!IS_ERR(accepted)) {
                fd_install(newfd, accepted);
                return newfd;
        }

        /* Accept failed: hand the reserved descriptor back. */
        put_unused_fd(newfd);
        return PTR_ERR(accepted);
}

/*
 *        For accept, we attempt to create a new socket, set up the link
 *        with the client, wake up the client, then return the new
 *        connected fd. We collect the address of the connector in kernel
 *        space and move it to user at the very end. This is unclean because
 *        we open the socket then return an error.
 *
 *        1003.1g adds the ability to recvmsg() to query connection pending
 *        status to recvmsg. We need to add that support in a way that's
 *        clean when we restructure accept also.
 *
 *        Returns -EBADF when @fd does not resolve to a file, otherwise the
 *        result of __sys_accept4_file() with RLIMIT_NOFILE as the limit.
 */

int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
                  int __user *upeer_addrlen, int flags)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, upeer_addrlen,
                                 flags, rlimit(RLIMIT_NOFILE));
        fdput(f);

        return ret;
}

/* accept4 syscall entry point: forwards its arguments to __sys_accept4(). */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen, int, flags)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags);
}

/* accept syscall entry point: same as accept4 with flags fixed to 0. */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

/*
 *        Attempt to connect to a socket with the server address.  The address
 *        is in user space so we verify it is OK and move it to kernel space.
 *
 *        For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
 *        break bindings
 *
 *        NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
 *        other SEQPACKET protocols that take time to connect() as it doesn't
 *        include the -EINPROGRESS status for such sockets.
 *
 *        This helper receives the address already in kernel space; @file_flags
 *        is OR-ed into the file's f_flags for the protocol's ->connect().
 */

int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
                       int addrlen, int file_flags)
{
        struct sockaddr *kaddr = (struct sockaddr *)address;
        struct socket *sock;
        int err;

        sock = sock_from_file(file, &err);
        if (!sock)
                return err;

        err = security_socket_connect(sock, kaddr, addrlen);
        if (err)
                return err;

        return sock->ops->connect(sock, kaddr, addrlen,
                                  sock->file->f_flags | file_flags);
}

/*
 *        Resolve @fd, copy the user address into kernel space and connect.
 *        Returns -EBADF when @fd does not resolve to a file, the error from
 *        move_addr_to_kernel(), or the result of __sys_connect_file().
 */
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
        struct sockaddr_storage kaddr;
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = move_addr_to_kernel(uservaddr, addrlen, &kaddr);
        if (!ret)
                ret = __sys_connect_file(f.file, &kaddr, addrlen, 0);

        fdput(f);
        return ret;
}

/* connect syscall entry point: forwards its arguments to __sys_connect(). */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
                int, addrlen)
{
        return __sys_connect(fd, uservaddr, addrlen);
}

/*
 *        Get the local address ('name') of a socket object. Move the obtained
 *        name to user space.
 *
 *        Returns the result of move_addr_to_user() on success, or the first
 *        negative error from the lookup, the LSM hook or ->getname().
 */

int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        struct sockaddr_storage address;
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = security_socket_getsockname(sock);
        if (!err) {
                err = sock->ops->getname(sock, (struct sockaddr *)&address, 0);
                /* A non-negative result is the address length. */
                if (err >= 0)
                        err = move_addr_to_user(&address, err, usockaddr,
                                                usockaddr_len);
        }

        fput_light(sock->file, fput_needed);
        return err;
}

/* getsockname syscall entry point: forwards to __sys_getsockname(). */
SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getsockname(fd, usockaddr, usockaddr_len);
}

/*
 *        Get the remote address ('name') of a socket object. Move the obtained
 *        name to user space.
 *
 *        Returns the result of move_addr_to_user() on success, or the first
 *        negative error from the lookup, the LSM hook or ->getname().
 */

int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        struct sockaddr_storage address;
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = security_socket_getpeername(sock);
        if (err)
                goto out_put;

        err = sock->ops->getname(sock, (struct sockaddr *)&address, 1);
        if (err < 0)
                goto out_put;

        /* "err" is actually length in this case */
        err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* getpeername syscall entry point: forwards to __sys_getpeername(). */
SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getpeername(fd, usockaddr, usockaddr_len);
}

/*
 *        Send a datagram to a given address. We move the address into kernel
 *        space and check the user space data area is readable before invoking
 *        the protocol.
 *
 *        MSG_DONTWAIT is added to @flags when the file is O_NONBLOCK.
 *        Returns the result of __sock_sendmsg() or a negative error from the
 *        buffer import, the descriptor lookup or the address copy.
 */
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
                 struct sockaddr __user *addr,  int addr_len)
{
        struct sockaddr_storage kaddr;
        struct socket *sock;
        struct msghdr msg;
        struct iovec iov;
        int fput_needed;
        int err;

        err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
        if (unlikely(err))
                return err;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        msg.msg_control = NULL;
        msg.msg_controllen = 0;

        if (addr) {
                err = move_addr_to_kernel(addr, addr_len, &kaddr);
                if (err < 0)
                        goto out_put;
                msg.msg_name = (struct sockaddr *)&kaddr;
                msg.msg_namelen = addr_len;
        } else {
                /* No destination supplied. */
                msg.msg_name = NULL;
                msg.msg_namelen = 0;
        }

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;
        msg.msg_flags = flags;

        err = __sock_sendmsg(sock, &msg);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* sendto syscall entry point: forwards its arguments to __sys_sendto(). */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags, struct sockaddr __user *, addr,
                int, addr_len)
{
        return __sys_sendto(fd, buff, len, flags, addr, addr_len);
}

/*
 *        Send a datagram down a socket.
 *
 *        Implemented as __sys_sendto() with a NULL address and a zero
 *        address length.
 */

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags)
{
        return __sys_sendto(fd, buff, len, flags, NULL, 0);
}

/*
 *        Receive a frame from the socket and optionally record the address of the
 *        sender. We verify the buffers are writable and if needed move the
 *        sender address from kernel to user space.
 *
 *        MSG_DONTWAIT is added to @flags when the file is O_NONBLOCK.
 *        Returns the result of sock_recvmsg(), replaced by the error from
 *        move_addr_to_user() if copying the sender address fails.
 */
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
                   struct sockaddr __user *addr, int __user *addr_len)
{
        struct sockaddr_storage address;
        struct socket *sock;
        struct msghdr msg;
        struct iovec iov;
        int fput_needed;
        int err;

        err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter);
        if (unlikely(err))
                return err;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        msg.msg_iocb = NULL;
        msg.msg_flags = 0;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        /* We assume all kernel code knows the size of sockaddr_storage */
        msg.msg_namelen = 0;

        /* Save some cycles and don't copy the address if not needed */
        if (addr)
                msg.msg_name = (struct sockaddr *)&address;
        else
                msg.msg_name = NULL;

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        err = sock_recvmsg(sock, &msg, flags);
        if (err >= 0 && addr) {
                int copy_err = move_addr_to_user(&address, msg.msg_namelen,
                                                 addr, addr_len);

                if (copy_err < 0)
                        err = copy_err;
        }

        fput_light(sock->file, fput_needed);
        return err;
}

/* recvfrom syscall entry point: forwards its arguments to __sys_recvfrom(). */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags, struct sockaddr __user *, addr,
                int __user *, addr_len)
{
        return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}

/*
 *        Receive a datagram from a socket.
 *
 *        Implemented as __sys_recvfrom() with no sender address requested
 *        (NULL address and NULL length pointers).
 */

SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags)
{
        return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
}

static bool sock_use_custom_sol_socket(const struct socket *sock)
{
        const struct sock *sk = sock->sk;

        /* Use sock->ops->setsockopt() for MPTCP */
        return IS_ENABLED(CONFIG_MPTCP) &&
               sk->sk_protocol == IPPROTO_MPTCP &&
               sk->sk_type == SOCK_STREAM &&
               (sk->sk_family == AF_INET || sk->sk_family == AF_INET6);
}

/*
 *        Set a socket option. Because we don't know the option lengths we have
 *        to pass the user mode parameter for the protocols to sort out.
 *
 *        Returns -EINVAL for a negative @optlen, the lookup / LSM / BPF hook
 *        error if any, -EOPNOTSUPP when the protocol has no ->setsockopt(),
 *        otherwise the result of the option handler.
 */
int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
                int optlen)
{
        sockptr_t optval = USER_SOCKPTR(user_optval);
        char *kernel_optval = NULL;
        int err, fput_needed;
        struct socket *sock;

        if (optlen < 0)
                return -EINVAL;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = security_socket_setsockopt(sock, level, optname);
        if (err)
                goto out_put;

        /*
         * The cgroup BPF hook is skipped for compat syscalls; err is then
         * still 0 from the security check above, so both tests below fall
         * through.  The hook may rewrite level, optname and optlen and may
         * hand back a kernel buffer in kernel_optval.
         */
        if (!in_compat_syscall())
                err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
                                                     user_optval, &optlen,
                                                     &kernel_optval);
        if (err < 0)
                goto out_put;
        /*
         * A positive hook result ends the call with success without invoking
         * any option handler (presumably the BPF program handled the option
         * itself - confirm against the hook's definition).
         */
        if (err > 0) {
                err = 0;
                goto out_put;
        }

        /* Prefer the hook-supplied kernel buffer over the user pointer. */
        if (kernel_optval)
                optval = KERNEL_SOCKPTR(kernel_optval);
        if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
                err = sock_setsockopt(sock, level, optname, optval, optlen);
        else if (unlikely(!sock->ops->setsockopt))
                err = -EOPNOTSUPP;
        else
                err = sock->ops->setsockopt(sock, level, optname, optval,
                                            optlen);
        kfree(kernel_optval);
out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* setsockopt syscall entry point: forwards to __sys_setsockopt(). */
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int, optlen)
{
        return __sys_setsockopt(fd, level, optname, optval, optlen);
}

/*
 * Forward declaration of tcp_bpf_bypass_getsockopt(level, optname).
 * NOTE(review): no direct use of this symbol is visible in the surrounding
 * code; presumably one of the BPF_CGROUP_* macros references it - confirm.
 */
INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
                                                         int optname));

/*
 *        Get a socket option. Because we don't know the option lengths we have
 *        to pass a user mode parameter for the protocols to sort out.
 *
 *        Returns the lookup / LSM error if any, -EOPNOTSUPP when the protocol
 *        has no ->getsockopt(), otherwise the handler's result - which, for
 *        non-compat syscalls, is first passed through the cgroup BPF hook.
 */
int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
                int __user *optlen)
{
        int err, fput_needed;
        struct socket *sock;
        int max_optlen;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = security_socket_getsockopt(sock, level, optname);
        if (err)
                goto out_put;

        /*
         * max_optlen is only initialised here and only consumed by the BPF
         * hook below; both are guarded by the same !in_compat_syscall() test.
         */
        if (!in_compat_syscall())
                max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);

        if (level == SOL_SOCKET)
                err = sock_getsockopt(sock, level, optname, optval, optlen);
        else if (unlikely(!sock->ops->getsockopt))
                err = -EOPNOTSUPP;
        else
                err = sock->ops->getsockopt(sock, level, optname, optval,
                                            optlen);

        /* The hook receives the handler's result and supplies the final one. */
        if (!in_compat_syscall())
                err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
                                                     optval, optlen, max_optlen,
                                                     err);
out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* getsockopt syscall entry point: forwards to __sys_getsockopt(). */
SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int __user *, optlen)
{
        return __sys_getsockopt(fd, level, optname, optval, optlen);
}

/*
 *        Shutdown a socket.
 *
 *        Asks the LSM first; only if it agrees is the protocol's ->shutdown()
 *        invoked.  Returns the LSM error or the protocol's result.
 */

int __sys_shutdown_sock(struct socket *sock, int how)
{
        int err = security_socket_shutdown(sock, how);

        if (err)
                return err;

        return sock->ops->shutdown(sock, how);
}

/*
 *        Look up the socket behind @fd and shut it down.  Returns the lookup
 *        error when @fd is not a usable socket, otherwise the result of
 *        __sys_shutdown_sock().
 */
int __sys_shutdown(int fd, int how)
{
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = __sys_shutdown_sock(sock, how);
        fput_light(sock->file, fput_needed);

        return err;
}

/* shutdown syscall entry point: forwards its arguments to __sys_shutdown(). */
SYSCALL_DEFINE2(shutdown, int, fd, int, how)
{
        return __sys_shutdown(fd, how);
}

/* A couple of helpful macros for getting the address of the 32/64 bit
 * fields which are the same type (int / unsigned) on our platforms.
 *
 * COMPAT_MSG() is not self-contained: it expects a variable named `flags`
 * in the caller's scope and, next to the pointer `msg`, a second pointer
 * named `msg_compat` (formed by token pasting).  It picks the compat
 * structure's member when MSG_CMSG_COMPAT is set in flags.
 */
#define COMPAT_MSG(msg, member)        ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
#define COMPAT_NAMELEN(msg)        COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg)        COMPAT_MSG(msg, msg_flags)

/*
 * A socket address together with its length.
 * NOTE(review): presumably caches the destination of a previously sent
 * message so repeated sends can skip work - confirm against its users.
 */
struct used_address {
        struct sockaddr_storage name;        /* address bytes */
        unsigned int name_len;                /* length of the address in @name */
};

/*
 * Copy a struct user_msghdr from user space and fill in @kmsg from it.
 *
 * @kmsg:      kernel msghdr to populate
 * @umsg:      user-space message header to read
 * @save_addr: if non-NULL, receives the user-space msg_name pointer and
 *             the address itself is NOT copied into the kernel; if NULL,
 *             the address is copied into the buffer kmsg->msg_name
 *             already points at
 * @uiov:      receives the user-space iovec array pointer
 * @nsegs:     receives the number of iovec segments
 *
 * Returns 0 on success, -EFAULT if the header cannot be read, -EINVAL for
 * a negative name length, -EMSGSIZE if msg_iovlen exceeds UIO_MAXIOV, or
 * the negative error from move_addr_to_kernel().  Note that @kmsg may
 * already be partially written when an error is returned.
 */
int __copy_msghdr_from_user(struct msghdr *kmsg,
                            struct user_msghdr __user *umsg,
                            struct sockaddr __user **save_addr,
                            struct iovec __user **uiov, size_t *nsegs)
{
        struct user_msghdr msg;
        ssize_t err;

        if (copy_from_user(&msg, umsg, sizeof(*umsg)))
                return -EFAULT;

        /* The control buffer is recorded as a user pointer at this stage. */
        kmsg->msg_control_is_user = true;
        kmsg->msg_control_user = msg.msg_control;
        kmsg->msg_controllen = msg.msg_controllen;
        kmsg->msg_flags = msg.msg_flags;

        /* A NULL name pointer forces the name length to zero. */
        kmsg->msg_namelen = msg.msg_namelen;
        if (!msg.msg_name)
                kmsg->msg_namelen = 0;

        if (kmsg->msg_namelen < 0)
                return -EINVAL;

        /* Oversized name lengths are clamped rather than rejected. */
        if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
                kmsg->msg_namelen = sizeof(struct sockaddr_storage);

        if (save_addr)
                *save_addr = msg.msg_name;

        if (msg.msg_name && kmsg->msg_namelen) {
                /* Only copy the address in when the caller did not ask for
                 * the user pointer to be saved instead.
                 */
                if (!save_addr) {
                        err = move_addr_to_kernel(msg.msg_name,
                                                  kmsg->msg_namelen,
                                                  kmsg->msg_name);
                        if (err < 0)
                                return err;
                }
        } else {
                kmsg->msg_name = NULL;
                kmsg->msg_namelen = 0;
        }

        if (msg.msg_iovlen > UIO_MAXIOV)
                return -EMSGSIZE;

        kmsg->msg_iocb = NULL;
        *uiov = msg.msg_iov;
        *nsegs = msg.msg_iovlen;
        return 0;
}

/*
 * Read the user message header via __copy_msghdr_from_user() and then
 * import its iovec array into kmsg->msg_iter.
 *
 * The iovec direction is READ when @save_addr is non-NULL and WRITE
 * otherwise.  Returns 0 on success or a negative errno; a non-negative
 * result from import_iovec() is reported as 0.
 */
static int copy_msghdr_from_user(struct msghdr *kmsg,
                                 struct user_msghdr __user *umsg,
                                 struct sockaddr __user **save_addr,
                                 struct iovec **iov)
{
        struct iovec __user *uvec;
        size_t nr_segs;
        int direction;
        ssize_t ret;

        ret = __copy_msghdr_from_user(kmsg, umsg, save_addr, &uvec, &nr_segs);
        if (ret)
                return ret;

        direction = save_addr ? READ : WRITE;
        ret = import_iovec(direction, uvec, nr_segs, UIO_FASTIOV, iov,
                           &kmsg->msg_iter);
        if (ret < 0)
                return ret;

        return 0;
}

/*
 * Common back end for the sendmsg paths: bring the control data described
 * by @msg_sys into a kernel buffer, merge the flags and pass the message
 * to the socket layer.
 *
 * @sock:                 socket to send on
 * @msg_sys:              kernel msghdr already filled in by the caller
 * @flags:                send flags supplied by the caller
 * @used_address:         optional record of the last destination address
 *                        that was sent to successfully (may be NULL)
 * @allowed_msghdr_flags: mask of msg_sys->msg_flags bits that get merged
 *                        into @flags
 *
 * Returns the result of the send call, or a negative errno when the
 * control data could not be prepared (-ENOBUFS, -EFAULT, or the error
 * from the compat conversion).
 */
static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
                           unsigned int flags, struct used_address *used_address,
                           unsigned int allowed_msghdr_flags)
{
        unsigned char ctl[sizeof(struct cmsghdr) + 20]
                                __aligned(sizeof(__kernel_size_t));
        /* 20 is size of ipv6_pktinfo */
        unsigned char *ctl_buf = ctl;
        int ctl_len;
        ssize_t err;

        /* Error reported for an oversized control length or a failed
         * control buffer allocation below.
         */
        err = -ENOBUFS;

        if (msg_sys->msg_controllen > INT_MAX)
                goto out;
        flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
        ctl_len = msg_sys->msg_controllen;
        if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
                err =
                    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
                                                     sizeof(ctl));
                if (err)
                        goto out;
                /* Pick up the buffer and length left in msg_sys by the
                 * compat conversion.
                 */
                ctl_buf = msg_sys->msg_control;
                ctl_len = msg_sys->msg_controllen;
        } else if (ctl_len) {
                BUILD_BUG_ON(sizeof(struct cmsghdr) !=
                             CMSG_ALIGN(sizeof(struct cmsghdr)));
                /* Control data too large for the on-stack array gets its
                 * own sock_kmalloc() buffer.
                 */
                if (ctl_len > sizeof(ctl)) {
                        ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
                        if (ctl_buf == NULL)
                                goto out;
                }
                err = -EFAULT;
                if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len))
                        goto out_freectl;
                msg_sys->msg_control = ctl_buf;
                msg_sys->msg_control_is_user = false;
        }
        msg_sys->msg_flags = flags;

        if (sock->file->f_flags & O_NONBLOCK)
                msg_sys->msg_flags |= MSG_DONTWAIT;
        /*
         * If this is sendmmsg() and current destination address is same as
         * previously succeeded address, omit asking LSM's decision.
         * used_address->name_len is initialized to UINT_MAX so that the first
         * destination address never matches.
         */
        if (used_address && msg_sys->msg_name &&
            used_address->name_len == msg_sys->msg_namelen &&
            !memcmp(&used_address->name, msg_sys->msg_name,
                    used_address->name_len)) {
                err = sock_sendmsg_nosec(sock, msg_sys);
                goto out_freectl;
        }
        err = __sock_sendmsg(sock, msg_sys);
        /*
         * If this is sendmmsg() and sending to current destination address was
         * successful, remember it.
         */
        if (used_address && err >= 0) {
                used_address->name_len = msg_sys->msg_namelen;
                if (msg_sys->msg_name)
                        memcpy(&used_address->name, msg_sys->msg_name,
                               used_address->name_len);
        }

out_freectl:
        /* Free the control buffer unless it is the on-stack array. */
        if (ctl_buf != ctl)
                sock_kfree_s(sock->sk, ctl_buf, ctl_len);
out:
        return err;
}

/*
 * Fill @msg from the user-space header @umsg for a send operation.
 *
 * With MSG_CMSG_COMPAT set in @flags the header is read as a
 * struct compat_msghdr via get_compat_msghdr(); otherwise
 * copy_msghdr_from_user() is used.  No user address pointer is saved
 * (NULL is passed for it in both cases).
 *
 * Returns 0 on success or the negative error from the helper.
 */
int sendmsg_copy_msghdr(struct msghdr *msg,
                        struct user_msghdr __user *umsg, unsigned flags,
                        struct iovec **iov)
{
        int ret;

        if (!(flags & MSG_CMSG_COMPAT))
                ret = copy_msghdr_from_user(msg, umsg, NULL, iov);
        else
                ret = get_compat_msghdr(msg,
                                        (struct compat_msghdr __user *)umsg,
                                        NULL, iov);

        return ret < 0 ? ret : 0;
}

/*
 * Copy the user message header @msg into @msg_sys and send it on @sock.
 *
 * The destination address is staged in an on-stack sockaddr_storage that
 * msg_sys->msg_name is pointed at before the header is copied.  The iovec
 * pointer handed back by sendmsg_copy_msghdr() is passed to kfree() once
 * the send has been attempted.
 *
 * Returns the negative error from the header copy, otherwise the result
 * of ____sys_sendmsg().
 */
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags,
                         struct used_address *used_address,
                         unsigned int allowed_msghdr_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV];
        struct iovec *vecs = inline_vecs;
        struct sockaddr_storage addr_buf;
        ssize_t ret;

        msg_sys->msg_name = &addr_buf;

        ret = sendmsg_copy_msghdr(msg_sys, msg, flags, &vecs);
        if (ret >= 0) {
                ret = ____sys_sendmsg(sock, msg_sys, flags, used_address,
                                      allowed_msghdr_flags);
                kfree(vecs);
        }

        return ret;
}

/*
 *        BSD sendmsg interface
 */
/*
 * Send an already-prepared kernel msghdr on @sock.  Calls
 * ____sys_sendmsg() with no used_address record and with no msghdr flag
 * bits allowed to be merged into @flags.
 */
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
                        unsigned int flags)
{
        return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}

/*
 * sendmsg implementation shared by the syscall and socketcall paths.
 *
 * @fd:                 socket file descriptor
 * @msg:                user-space message header
 * @flags:              send flags
 * @forbid_cmsg_compat: when true, a caller-supplied MSG_CMSG_COMPAT flag
 *                      is rejected with -EINVAL
 *
 * Returns the result of ___sys_sendmsg(), or the error reported by
 * sockfd_lookup_light() when @fd cannot be resolved to a socket.
 */
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct msghdr kmsg;
        struct socket *sock;
        int fput_needed;
        int ret;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        sock = sockfd_lookup_light(fd, &ret, &fput_needed);
        if (sock) {
                ret = ___sys_sendmsg(sock, msg, &kmsg, flags, NULL, 0);
                fput_light(sock->file, fput_needed);
        }

        return ret;
}

/*
 * sendmsg system call entry point.  Passes forbid_cmsg_compat = true, so
 * MSG_CMSG_COMPAT in @flags is rejected by __sys_sendmsg().
 */
SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
{
        return __sys_sendmsg(fd, msg, flags, true);
}

/*
 *        Linux sendmmsg interface
 */

/*
 * Send up to @vlen messages from the user array @mmsg on socket @fd.
 *
 * @vlen is silently capped at UIO_MAXIOV.  MSG_BATCH is added to the
 * flags for every message except the last one in the vector.  After each
 * successful send the byte count is written to that entry's msg_len.
 * The loop stops at the first error, or when a message was not sent
 * completely (msg_data_left() is non-zero).
 *
 * Returns the number of messages sent if at least one was sent; otherwise
 * the error from the first attempt (or 0 when @vlen is 0).  -EINVAL is
 * returned up front when @forbid_cmsg_compat is set and @flags contains
 * MSG_CMSG_COMPAT.
 */
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                   unsigned int flags, bool forbid_cmsg_compat)
{
        int fput_needed, err, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct used_address used_address;
        unsigned int oflags = flags;

        if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
                return -EINVAL;

        if (vlen > UIO_MAXIOV)
                vlen = UIO_MAXIOV;

        datagrams = 0;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* UINT_MAX can never equal a real name length, so the first
         * message never matches the remembered address.
         */
        used_address.name_len = UINT_MAX;
        /* Both cursors walk the same user array; only the one matching
         * the MSG_CMSG_COMPAT setting is advanced.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;
        err = 0;
        flags |= MSG_BATCH;

        while (datagrams < vlen) {
                /* Last message of the vector: restore the caller's flags,
                 * i.e. without the MSG_BATCH bit added above.
                 */
                if (datagrams == vlen - 1)
                        flags = oflags;

                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_sendmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                /* Non-zero here means writing msg_len back to user failed. */
                if (err)
                        break;
                ++datagrams;
                /* Stop after a message that was only partially sent. */
                if (msg_data_left(&msg_sys))
                        break;
                cond_resched();
        }

        fput_light(sock->file, fput_needed);

        /* We only return an error if no datagrams were able to be sent */
        if (datagrams != 0)
                return datagrams;

        return err;
}

/*
 * sendmmsg system call entry point.  Passes forbid_cmsg_compat = true, so
 * MSG_CMSG_COMPAT in @flags is rejected by __sys_sendmmsg().
 */
SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags)
{
        return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
}

/*
 * Fill @msg from the user-space header @umsg for a receive operation.
 *
 * With MSG_CMSG_COMPAT set in @flags the header is read as a
 * struct compat_msghdr via get_compat_msghdr(); otherwise
 * copy_msghdr_from_user() is used.  @uaddr is handed to the helper as the
 * place to store the user-space address pointer.
 *
 * Returns 0 on success or the negative error from the helper.
 */
int recvmsg_copy_msghdr(struct msghdr *msg,
                        struct user_msghdr __user *umsg, unsigned flags,
                        struct sockaddr __user **uaddr,
                        struct iovec **iov)
{
        ssize_t ret;

        if (flags & MSG_CMSG_COMPAT)
                ret = get_compat_msghdr(msg,
                                        (struct compat_msghdr __user *)umsg,
                                        uaddr, iov);
        else
                ret = copy_msghdr_from_user(msg, umsg, uaddr, iov);

        return ret < 0 ? ret : 0;
}

/*
 * Common back end for the recvmsg paths: receive one message into
 * @msg_sys and write the results back to the user-space header @msg.
 *
 * @sock:    socket to receive from
 * @msg_sys: kernel msghdr already filled in by the caller
 * @msg:     user-space header; treated as a struct compat_msghdr when
 *           MSG_CMSG_COMPAT is set in @flags
 * @uaddr:   user buffer for the source address, or NULL to skip it
 * @flags:   receive flags
 * @nosec:   non-zero selects sock_recvmsg_nosec() instead of
 *           sock_recvmsg()
 *
 * On success the source address (if requested), the resulting msg_flags
 * and the msg_controllen value are written to user space.
 *
 * Returns the value returned by the receive call on success, or a
 * negative errno from the receive or from any of the write-backs.
 */
static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys,
                           struct user_msghdr __user *msg,
                           struct sockaddr __user *uaddr,
                           unsigned int flags, int nosec)
{
        /* msg_compat and flags are referenced implicitly by the
         * COMPAT_NAMELEN()/COMPAT_FLAGS() macros used below.
         */
        struct compat_msghdr __user *msg_compat =
                                        (struct compat_msghdr __user *) msg;
        int __user *uaddr_len = COMPAT_NAMELEN(msg);
        struct sockaddr_storage addr;
        unsigned long cmsg_ptr;
        int len;
        ssize_t err;

        msg_sys->msg_name = &addr;
        /* Remember where the control buffer starts so the amount consumed
         * can be computed after the receive.
         */
        cmsg_ptr = (unsigned long)msg_sys->msg_control;
        msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);

        /* We assume all kernel code knows the size of sockaddr_storage */
        msg_sys->msg_namelen = 0;

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        if (unlikely(nosec))
                err = sock_recvmsg_nosec(sock, msg_sys, flags);
        else
                err = sock_recvmsg(sock, msg_sys, flags);

        if (err < 0)
                goto out;
        len = err;

        if (uaddr != NULL) {
                err = move_addr_to_user(&addr,
                                        msg_sys->msg_namelen, uaddr,
                                        uaddr_len);
                if (err < 0)
                        goto out;
        }
        /* MSG_CMSG_COMPAT is masked out of the flags reported to user. */
        err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
                         COMPAT_FLAGS(msg));
        if (err)
                goto out;
        /* Report how far msg_control advanced from its initial value. */
        if (MSG_CMSG_COMPAT & flags)
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg_compat->msg_controllen);
        else
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg->msg_controllen);
        if (err)
                goto out;
        err = len;
out:
        return err;
}

/*
 * Copy the user message header @msg into @msg_sys and receive one message
 * on @sock.
 *
 * The iovec pointer handed back by recvmsg_copy_msghdr() is passed to
 * kfree() once the receive has been attempted.
 *
 * Returns the negative error from the header copy, otherwise the result
 * of ____sys_recvmsg().
 */
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags, int nosec)
{
        struct iovec inline_vecs[UIO_FASTIOV];
        struct iovec *vecs = inline_vecs;
        /* user-space pointer to the caller's address buffer */
        struct sockaddr __user *user_addr;
        ssize_t ret;

        ret = recvmsg_copy_msghdr(msg_sys, msg, flags, &user_addr, &vecs);
        if (ret >= 0) {
                ret = ____sys_recvmsg(sock, msg_sys, msg, user_addr, flags,
                                      nosec);
                kfree(vecs);
        }

        return ret;
}

/*
 *        BSD recvmsg interface
 */

/*
 * Receive into an already-prepared kernel msghdr on @sock.  Calls
 * ____sys_recvmsg() with nosec = 0, so sock_recvmsg() is the receive
 * function used.
 */
long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        struct sockaddr __user *uaddr, unsigned int flags)
{
        return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}

/*
 * recvmsg implementation shared by the syscall and socketcall paths.
 *
 * @fd:                 socket file descriptor
 * @msg:                user-space message header
 * @flags:              receive flags
 * @forbid_cmsg_compat: when true, a caller-supplied MSG_CMSG_COMPAT flag
 *                      is rejected with -EINVAL
 *
 * Returns the result of ___sys_recvmsg(), or the error reported by
 * sockfd_lookup_light() when @fd cannot be resolved to a socket.
 */
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct msghdr kmsg;
        struct socket *sock;
        int fput_needed;
        int ret;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        sock = sockfd_lookup_light(fd, &ret, &fput_needed);
        if (sock) {
                ret = ___sys_recvmsg(sock, msg, &kmsg, flags, 0);
                fput_light(sock->file, fput_needed);
        }

        return ret;
}

/*
 * recvmsg system call entry point.  Passes forbid_cmsg_compat = true, so
 * MSG_CMSG_COMPAT in @flags is rejected by __sys_recvmsg().
 */
SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
                unsigned int, flags)
{
        return __sys_recvmsg(fd, msg, flags, true);
}

/*
 *     Linux recvmmsg interface
 */

/*
 * Receive up to @vlen messages on socket @fd into the user array @mmsg.
 *
 * @timeout: optional kernel-space timeout.  When non-NULL it is converted
 *           to an end time up front and overwritten with the remaining
 *           time after every received message.
 *
 * Unless MSG_ERRQUEUE is set, a pending socket error (sock_error()) is
 * returned before anything is received.  After each successful receive
 * the byte count is written to that entry's msg_len.  The loop ends on
 * the first error, when the timeout has expired, or when a message
 * carries MSG_OOB.
 *
 * Returns the number of messages received, or a negative errno when none
 * were received.  If an error other than -EAGAIN occurs after at least
 * one message was received, the count is still returned and the error is
 * stored in sock->sk->sk_err.
 */
static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                          unsigned int vlen, unsigned int flags,
                          struct timespec64 *timeout)
{
        int fput_needed, err, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct timespec64 end_time;
        struct timespec64 timeout64;

        /* A failure to set up the end time is reported as -EINVAL. */
        if (timeout &&
            poll_select_set_timeout(&end_time, timeout->tv_sec,
                                    timeout->tv_nsec))
                return -EINVAL;

        datagrams = 0;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        if (likely(!(flags & MSG_ERRQUEUE))) {
                err = sock_error(sock->sk);
                if (err) {
                        datagrams = err;
                        goto out_put;
                }
        }

        /* Both cursors walk the same user array; only the one matching
         * the MSG_CMSG_COMPAT setting is advanced.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;

        while (datagrams < vlen) {
                /*
                 * No need to ask LSM for more than the first datagram.
                 */
                /* `datagrams` doubles as the nosec argument: zero for the
                 * first message, non-zero for every later one.
                 */
                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_recvmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                /* Non-zero here means writing msg_len back to user failed. */
                if (err)
                        break;
                ++datagrams;

                /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
                if (flags & MSG_WAITFORONE)
                        flags |= MSG_DONTWAIT;

                if (timeout) {
                        /* Recompute the time left until end_time. */
                        ktime_get_ts64(&timeout64);
                        *timeout = timespec64_sub(end_time, timeout64);
                        if (timeout->tv_sec < 0) {
                                timeout->tv_sec = timeout->tv_nsec = 0;
                                break;
                        }

                        /* Timeout, return less than vlen datagrams */
                        if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
                                break;
                }

                /* Out of band data, return right away */
                if (msg_sys.msg_flags & MSG_OOB)
                        break;
                cond_resched();
        }

        if (err == 0)
                goto out_put;

        /* Nothing received at all: the error itself is the return value. */
        if (datagrams == 0) {
                datagrams = err;
                goto out_put;
        }

        /*
         * We may return less entries than requested (vlen) if the
         * sock is non block and there aren't enough datagrams...
         */
        if (err != -EAGAIN) {
                /*
                 * ... or  if recvmsg returns an error after we
                 * received some datagrams, where we record the
                 * error to return on the next call or if the
                 * app asks about it using getsockopt(SO_ERROR).
                 */
                WRITE_ONCE(sock->sk->sk_err, -err);
        }
out_put:
        fput_light(sock->file, fput_needed);

        return datagrams;
}

/*
 * recvmmsg implementation shared by the syscall variants.
 *
 * The timeout may be supplied either as a struct __kernel_timespec
 * (@timeout) or as a struct old_timespec32 (@timeout32); either is read
 * into a kernel timespec64 before the receive.  With neither given,
 * do_recvmmsg() runs without a timeout.
 *
 * When a timeout was given and at least one message was received, the
 * value left in the kernel timespec by do_recvmmsg() is written back to
 * the same user pointer(s).
 *
 * Returns the result of do_recvmmsg(), or -EFAULT if a timeout could not
 * be read from or written back to user space.
 */
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                   unsigned int vlen, unsigned int flags,
                   struct __kernel_timespec __user *timeout,
                   struct old_timespec32 __user *timeout32)
{
        bool have_timeout = timeout || timeout32;
        struct timespec64 ts;
        int received;

        if (timeout && get_timespec64(&ts, timeout))
                return -EFAULT;

        if (timeout32 && get_old_timespec32(&ts, timeout32))
                return -EFAULT;

        received = do_recvmmsg(fd, mmsg, vlen, flags,
                               have_timeout ? &ts : NULL);

        /* Nothing to write back without a timeout or without any data. */
        if (!have_timeout || received <= 0)
                return received;

        if (timeout && put_timespec64(&ts, timeout))
                received = -EFAULT;

        if (timeout32 && put_old_timespec32(&ts, timeout32))
                received = -EFAULT;

        return received;
}

/*
 * recvmmsg system call entry point taking a struct __kernel_timespec
 * timeout.  MSG_CMSG_COMPAT in @flags is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct __kernel_timespec __user *, timeout)
{
        if (flags & MSG_CMSG_COMPAT)
                return -EINVAL;

        return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * recvmmsg variant taking a struct old_timespec32 timeout; the timeout is
 * passed in the timeout32 slot of __sys_recvmmsg().  MSG_CMSG_COMPAT in
 * @flags is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct old_timespec32 __user *, timeout)
{
        if (flags & MSG_CMSG_COMPAT)
                return -EINVAL;

        return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);
}
#endif

#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* Argument list sizes for sys_socketcall.
 *
 * The table is indexed by the socketcall call number.  Each entry is the
 * size in bytes of that call's argument array, i.e. the argument count
 * multiplied by sizeof(unsigned long) via the temporary AL() macro.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
        AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
        AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
        AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
        AL(4), AL(5), AL(4)
};

#undef AL

/*
 *        System call vectors.
 *
 *        Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *  Multiplexer for the socket calls: @call selects the operation and
 *  @args points to a user-space array of unsigned long arguments whose
 *  length is taken from nargs[call].
 *
 *  Returns -EINVAL for a @call outside 1..SYS_SENDMMSG or an argument
 *  size larger than the local buffer, -EFAULT if the arguments cannot be
 *  copied in, a non-zero result from audit_socketcall(), or otherwise
 *  the result of the selected __sys_*() helper.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
        unsigned long a[AUDITSC_ARGS];
        unsigned long a0, a1;
        int err;
        unsigned int len;

        if (call < 1 || call > SYS_SENDMMSG)
                return -EINVAL;
        /* Clamp the index so it cannot be used speculatively out of range. */
        call = array_index_nospec(call, SYS_SENDMMSG + 1);

        len = nargs[call];
        if (len > sizeof(a))
                return -EINVAL;

        /* copy_from_user should be SMP safe. */
        if (copy_from_user(a, args, len))
                return -EFAULT;

        err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
        if (err)
                return err;

        a0 = a[0];
        a1 = a[1];

        switch (call) {
        case SYS_SOCKET:
                err = __sys_socket(a0, a1, a[2]);
                break;
        case SYS_BIND:
                err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_CONNECT:
                err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_LISTEN:
                err = __sys_listen(a0, a1);
                break;
        case SYS_ACCEPT:
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], 0);
                break;
        case SYS_GETSOCKNAME:
                err =
                    __sys_getsockname(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_GETPEERNAME:
                err =
                    __sys_getpeername(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_SOCKETPAIR:
                err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
                break;
        case SYS_SEND:
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   NULL, 0);
                break;
        case SYS_SENDTO:
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   (struct sockaddr __user *)a[4], a[5]);
                break;
        case SYS_RECV:
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     NULL, NULL);
                break;
        case SYS_RECVFROM:
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     (struct sockaddr __user *)a[4],
                                     (int __user *)a[5]);
                break;
        case SYS_SHUTDOWN:
                err = __sys_shutdown(a0, a1);
                break;
        case SYS_SETSOCKOPT:
                err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3],
                                       a[4]);
                break;
        case SYS_GETSOCKOPT:
                err =
                    __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                                     (int __user *)a[4]);
                break;
        case SYS_SENDMSG:
                err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_SENDMMSG:
                err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
                                     a[3], true);
                break;
        case SYS_RECVMSG:
                err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_RECVMMSG:
                /* The timeout argument is interpreted as a
                 * __kernel_timespec on CONFIG_64BIT builds and as an
                 * old_timespec32 otherwise.
                 */
                if (IS_ENABLED(CONFIG_64BIT))
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3],
                                             (struct __kernel_timespec __user *)a[4],
                                             NULL);
                else
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3], NULL,
                                             (struct old_timespec32 __user *)a[4]);
                break;
        case SYS_ACCEPT4:
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], a[3]);
                break;
        default:
                err = -EINVAL;
                break;
        }
        return err;
}

#endif                                /* __ARCH_WANT_SYS_SOCKETCALL */

/**
 *        sock_register - add a socket protocol handler
 *        @ops: description of protocol
 *
 *        This function is called by a protocol handler that wants to
 *        advertise its address family, and have it linked into the
 *        socket interface. The value ops->family corresponds to the
 *        socket system call protocol family.
 *
 *        Return: 0 on success, -ENOBUFS if ops->family is not below
 *        NPROTO, or -EEXIST if a handler is already registered for
 *        that family.
 */
int sock_register(const struct net_proto_family *ops)
{
        int err;

        if (ops->family >= NPROTO) {
                pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
                return -ENOBUFS;
        }

        /* The slot is checked and filled under net_family_lock so two
         * registrations for the same family cannot both succeed.
         */
        spin_lock(&net_family_lock);
        if (rcu_dereference_protected(net_families[ops->family],
                                      lockdep_is_held(&net_family_lock)))
                err = -EEXIST;
        else {
                rcu_assign_pointer(net_families[ops->family], ops);
                err = 0;
        }
        spin_unlock(&net_family_lock);

        /* Only announce the family when it was actually installed; a
         * "Registered" line after an -EEXIST failure would be misleading.
         */
        if (!err)
                pr_info("NET: Registered protocol family %d\n", ops->family);
        return err;
}
EXPORT_SYMBOL(sock_register);

/**
 *        sock_unregister - remove a protocol handler
 *        @family: protocol family to remove
 *
 *        This function is called by a protocol handler that wants to
 *        remove its address family, and have it unlinked from the
 *        new socket creation.
 *
 *        If protocol handler is a module, then it can use module reference
 *        counts to protect against new references. If protocol handler is not
 *        a module then it needs to provide its own protection in
 *        the ops->create routine.
 *
 *        A @family outside 0..NPROTO-1 triggers BUG_ON(). The function
 *        calls synchronize_rcu() before returning.
 */
void sock_unregister(int family)
{
        BUG_ON(family < 0 || family >= NPROTO);

        /* Clear the slot under the same lock sock_register() takes. */
        spin_lock(&net_family_lock);
        RCU_INIT_POINTER(net_families[family], NULL);
        spin_unlock(&net_family_lock);

        /* Wait for an RCU grace period after clearing the slot. */
        synchronize_rcu();

        pr_info("NET: Unregistered protocol family %d\n", family);
}
EXPORT_SYMBOL(sock_unregister);

bool sock_is_registered(int family)
{
        return family < NPROTO && rcu_access_pointer(net_families[family]);
}

/*
 * Initcall that sets up the socket layer: network sysctls, the skbuff
 * caches, the socket inode cache, the socket filesystem (registered and
 * mounted into sock_mnt), netfilter when CONFIG_NETFILTER is enabled, and
 * the PTP classifier.
 *
 * Returns 0 on success or the first error encountered.
 */
static int __init sock_init(void)
{
        int err;
        /*
         *      Initialize the network sysctl infrastructure.
         */
        err = net_sysctl_init();
        if (err)
                goto out;

        /*
         *      Initialize skbuff SLAB cache
         */
        skb_init();

        /*
         *      Initialize the protocols module.
         */

        init_inodecache();

        err = register_filesystem(&sock_fs_type);
        if (err)
                goto out;
        sock_mnt = kern_mount(&sock_fs_type);
        if (IS_ERR(sock_mnt)) {
                err = PTR_ERR(sock_mnt);
                goto out_mount;
        }

        /* The real protocol initialization is performed in later initcalls.
         */

#ifdef CONFIG_NETFILTER
        /* NOTE(review): a netfilter_init() failure returns the error
         * without undoing the filesystem registration and mount above;
         * confirm this is intentional.
         */
        err = netfilter_init();
        if (err)
                goto out;
#endif

        ptp_classifier_init();

out:
        return err;

out_mount:
        /* Mounting failed: undo the filesystem registration. */
        unregister_filesystem(&sock_fs_type);
        goto out;
}

core_initcall(sock_init);        /* early initcall */

#ifdef CONFIG_PROC_FS
/*
 * Print the "sockets: used N" line to @seq, where N is the value
 * sock_inuse_get() returns for seq->private.
 */
void socket_seq_show(struct seq_file *seq)
{
        seq_printf(seq, "sockets: used %d\n",
                   sock_inuse_get(seq->private));
}
#endif                                /* CONFIG_PROC_FS */

#ifdef CONFIG_COMPAT
/*
 * Compat wrapper around dev_ifconf(): reads a struct compat_ifconf from
 * user space, converts it to a native struct ifconf, runs dev_ifconf()
 * under the RTNL lock with the compat ifreq size, and writes the
 * resulting length back to the user structure.
 *
 * Returns 0 on success, -EFAULT if the user structure cannot be read or
 * written, or the error from dev_ifconf().
 */
static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
{
        struct compat_ifconf cifc;
        struct ifconf kifc;
        int ret;

        if (copy_from_user(&cifc, uifc32, sizeof(cifc)))
                return -EFAULT;

        kifc.ifc_len = cifc.ifc_len;
        kifc.ifc_req = compat_ptr(cifc.ifcbuf);

        rtnl_lock();
        ret = dev_ifconf(net, &kifc, sizeof(struct compat_ifreq));
        rtnl_unlock();
        if (ret)
                return ret;

        cifc.ifc_len = kifc.ifc_len;

        return copy_to_user(uifc32, &cifc, sizeof(cifc)) ? -EFAULT : 0;
}

/*
 * Compat handler for SIOCWANDEV.
 *
 * The compat ifreq is copied in, the 32-bit ifs_ifsu pointer is read
 * separately and substituted (via compat_ptr()) for the pointer field
 * before calling dev_ioctl().  On success the original field value is
 * put back before the structure is copied out to user space again.
 *
 * Returns 0 on success, -EFAULT on a user copy failure, or the error
 * from dev_ioctl().
 */
static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
{
        struct ifreq ifr;
        void __user *orig_ptr;
        compat_uptr_t uptr32;
        int ret;

        if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq)))
                return -EFAULT;

        if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
                return -EFAULT;

        orig_ptr = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
        ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32);

        ret = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
        if (ret)
                return ret;

        ifr.ifr_settings.ifs_ifsu.raw_hdlc = orig_ptr;
        if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq)))
                return -EFAULT;

        return 0;
}

/* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
/*
 * Build a native struct ifreq from the compat one by copying the
 * interface name and converting the 32-bit ifr_data pointer with
 * compat_ptr(), then pass it to dev_ioctl().
 *
 * Returns -ENOTTY if @cmd is not a socket ioctl command, -EFAULT if the
 * user structure cannot be read, otherwise the result of dev_ioctl().
 */
static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
                                 struct compat_ifreq __user *u_ifreq32)
{
        struct ifreq kreq;
        u32 uptr;

        if (!is_socket_ioctl_cmd(cmd))
                return -ENOTTY;

        if (copy_from_user(kreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ) ||
            get_user(uptr, &u_ifreq32->ifr_data))
                return -EFAULT;

        kreq.ifr_data = compat_ptr(uptr);

        return dev_ioctl(net, cmd, &kreq, NULL);
}

/*
 * Run an ifreq-based ioctl for a compat caller by staging the request in
 * a native-sized struct ifreq obtained from compat_alloc_user_space().
 *
 * Returns -EFAULT if copying between the two user buffers fails,
 * otherwise the result of sock_do_ioctl().
 */
static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
                              unsigned int cmd,
                              struct compat_ifreq __user *uifr32)
{
        struct ifreq __user *uifr;
        int err;

        /* Handle the fact that while struct ifreq has the same *layout* on
         * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
         * which are handled elsewhere, it still has different *size* due to
         * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
         * resulting in struct ifreq being 32 and 40 bytes respectively).
         * As a result, if the struct happens to be at the end of a page and
         * the next page isn't readable/writable, we get a fault. To prevent
         * that, copy back and forth to the full size.
         */

        uifr = compat_alloc_user_space(sizeof(*uifr));
        if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
                return -EFAULT;

        err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);

        if (!err) {
                /* Only the commands listed here copy the result back to
                 * the caller's compat structure.
                 */
                switch (cmd) {
                case SIOCGIFFLAGS:
                case SIOCGIFMETRIC:
                case SIOCGIFMTU:
                case SIOCGIFMEM:
                case SIOCGIFHWADDR:
                case SIOCGIFINDEX:
                case SIOCGIFADDR:
                case SIOCGIFBRDADDR:
                case SIOCGIFDSTADDR:
                case SIOCGIFNETMASK:
                case SIOCGIFPFLAGS:
                case SIOCGIFTXQLEN:
                case SIOCGMIIPHY:
                case SIOCGMIIREG:
                case SIOCGIFNAME:
                        if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
                                err = -EFAULT;
                        break;
                }
        }
        return err;
}

/*
 * SIOCGIFMAP/SIOCSIFMAP for 32-bit callers: build a native struct ifreq
 * field by field from the compat ifmap, run dev_ioctl() on it, and for
 * SIOCGIFMAP write the resulting name and map fields back.
 */
static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
                        struct compat_ifreq __user *uifr32)
{
        struct compat_ifmap __user *map32 = &uifr32->ifr_ifru.ifru_map;
        struct ifreq ifr;
        int fault;
        int ret;

        /* Every field is fetched even after a failure; the accumulated
         * status is only inspected once at the end.
         */
        fault = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
        fault |= get_user(ifr.ifr_map.mem_start, &map32->mem_start);
        fault |= get_user(ifr.ifr_map.mem_end, &map32->mem_end);
        fault |= get_user(ifr.ifr_map.base_addr, &map32->base_addr);
        fault |= get_user(ifr.ifr_map.irq, &map32->irq);
        fault |= get_user(ifr.ifr_map.dma, &map32->dma);
        fault |= get_user(ifr.ifr_map.port, &map32->port);
        if (fault)
                return -EFAULT;

        ret = dev_ioctl(net, cmd, &ifr, NULL);

        /* Nothing is written back on failure or for the "set" command. */
        if (ret || cmd != SIOCGIFMAP)
                return ret;

        fault = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
        fault |= put_user(ifr.ifr_map.mem_start, &map32->mem_start);
        fault |= put_user(ifr.ifr_map.mem_end, &map32->mem_end);
        fault |= put_user(ifr.ifr_map.base_addr, &map32->base_addr);
        fault |= put_user(ifr.ifr_map.irq, &map32->irq);
        fault |= put_user(ifr.ifr_map.dma, &map32->dma);
        fault |= put_user(ifr.ifr_map.port, &map32->port);

        return fault ? -EFAULT : 0;
}

/* Old-style bridge ioctls end up using SIOCDEVPRIVATE for some
 * operations; refusing them here forces the use of newer bridge-utils,
 * which use compatible ioctls.
 */
static int old_bridge_ioctl(compat_ulong_t __user *argp)
{
        compat_ulong_t op;

        /* The first word of the user argument selects the operation. */
        if (get_user(op, argp))
                return -EFAULT;

        /* Only the version query is answered; everything else is refused. */
        return op == BRCTL_GET_VERSION ? BRCTL_VERSION + 1 : -EINVAL;
}

static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
                         unsigned int cmd, unsigned long arg)
{
        void __user *argp = compat_ptr(arg);
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);

        if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
                return compat_ifr_data_ioctl(net, cmd, argp);

        switch (cmd) {
        case SIOCSIFBR:
        case SIOCGIFBR:
                return old_bridge_ioctl(argp);
        case SIOCGIFCONF:
                return compat_dev_ifconf(net, argp);
        case SIOCWANDEV:
                return compat_siocwandev(net, argp);
        case SIOCGIFMAP:
        case SIOCSIFMAP:
                return compat_sioc_ifmap(net, cmd, argp);
        case SIOCGSTAMP_OLD:
        case SIOCGSTAMPNS_OLD:
                if (!sock->ops->gettstamp)
                        return -ENOIOCTLCMD;
                return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
                                            !COMPAT_USE_64BIT_TIME);

        case SIOCETHTOOL:
        case SIOCBONDSLAVEINFOQUERY:
        case SIOCBONDINFOQUERY:
        case SIOCSHWTSTAMP:
        case SIOCGHWTSTAMP:
                return compat_ifr_data_ioctl(net, cmd, argp);

        case FIOSETOWN:
        case SIOCSPGRP:
        case FIOGETOWN:
        case SIOCGPGRP:
        case SIOCBRADDBR:
        case SIOCBRDELBR:
        case SIOCGIFVLAN:
        case SIOCSIFVLAN:
        case SIOCADDDLCI:
        case SIOCDELDLCI:
        case SIOCGSKNS:
        case SIOCGSTAMP_NEW:
        case SIOCGSTAMPNS_NEW:
                return sock_ioctl(file, cmd, arg);

        case SIOCGIFFLAGS:
        case SIOCSIFFLAGS:
        case SIOCGIFMETRIC:
        case SIOCSIFMETRIC:
        case SIOCGIFMTU:
        case SIOCSIFMTU:
        case SIOCGIFMEM:
        case SIOCSIFMEM:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
        case SIOCADDMULTI:
        case SIOCDELMULTI:
        case SIOCGIFINDEX:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCSIFHWBROADCAST:
        case SIOCDIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCSIFPFLAGS:
        case SIOCGIFPFLAGS:
        case SIOCGIFTXQLEN:
        case SIOCSIFTXQLEN:
        case SIOCBRADDIF:
        case SIOCBRDELIF:
        case SIOCGIFNAME:
        case SIOCSIFNAME:
        case SIOCGMIIPHY:
        case SIOCGMIIREG:
        case SIOCSMIIREG:
        case SIOCBONDENSLAVE:
        case SIOCBONDRELEASE:
        case SIOCBONDSETHWADDR:
        case SIOCBONDCHANGEACTIVE:
                return compat_ifreq_ioctl(net, sock, cmd, argp);

        case SIOCSARP:
        case SIOCGARP:
        case SIOCDARP:
        case SIOCOUTQ:
        case SIOCOUTQNSD:
        case SIOCATMARK:
                return sock_do_ioctl(net, sock, cmd, arg);
        }

        return -ENOIOCTLCMD;
}

/*
 * Compat ioctl entry point for sockets. Handlers are tried in order and
 * the first result other than -ENOIOCTLCMD is returned:
 *   1. the protocol's own ->compat_ioctl(), if it has one;
 *   2. compat_wext_handle_ioctl(), for commands in SIOCIWFIRST..SIOCIWLAST;
 *   3. the generic translation table in compat_sock_ioctl_trans().
 */
static long compat_sock_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct socket *sock = file->private_data;
        struct net *net = sock_net(sock->sk);
        int ret;

        if (sock->ops->compat_ioctl) {
                ret = sock->ops->compat_ioctl(sock, cmd, arg);
                if (ret != -ENOIOCTLCMD)
                        return ret;
        }

        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
                ret = compat_wext_handle_ioctl(net, cmd, arg);
                if (ret != -ENOIOCTLCMD)
                        return ret;
        }

        return compat_sock_ioctl_trans(file, sock, cmd, arg);
}
#endif

/**
 *        kernel_bind - bind an address to a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: length of address
 *
 *        The address is copied into a local buffer first, so the protocol's
 *        ->bind() handler operates on the copy rather than on @addr itself.
 *
 *        Returns 0 or an error. -EINVAL is returned without calling the
 *        protocol if @addrlen is negative or larger than
 *        struct sockaddr_storage.
 */

int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
        struct sockaddr_storage address;

        /* @address is a fixed-size stack buffer: never copy past its end. */
        if (addrlen < 0 || (size_t)addrlen > sizeof(address))
                return -EINVAL;

        memcpy(&address, addr, addrlen);

        return sock->ops->bind(sock, (struct sockaddr *)&address, addrlen);
}
EXPORT_SYMBOL(kernel_bind);

/**
 *        kernel_listen - move socket to listening state (kernel space)
 *        @sock: socket
 *        @backlog: pending connections queue size
 *
 *        Thin wrapper around the protocol's ->listen() handler.
 *        Returns 0 or an error.
 */

int kernel_listen(struct socket *sock, int backlog)
{
        int err = sock->ops->listen(sock, backlog);

        return err;
}
EXPORT_SYMBOL(kernel_listen);

/**
 *        kernel_accept - accept a connection (kernel space)
 *        @sock: listening socket
 *        @newsock: new connected socket
 *        @flags: flags
 *
 *        A new socket with the family, type and protocol of @sock is created
 *        and handed to the protocol's ->accept() handler. On success the new
 *        socket takes over @sock's ops and a reference on their owning module.
 *
 *        @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0.
 *        If it fails, @newsock is guaranteed to be %NULL.
 *        Returns 0 or an error.
 */

int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
{
        struct sock *sk = sock->sk;
        struct socket *child;
        int err;

        err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
                               newsock);
        if (err < 0)
                return err;

        child = *newsock;

        err = sock->ops->accept(sock, child, flags, true);
        if (err < 0) {
                /* Drop the half-made socket and clear the caller's pointer. */
                sock_release(child);
                *newsock = NULL;
                return err;
        }

        child->ops = sock->ops;
        __module_get(child->ops->owner);

        return err;
}
EXPORT_SYMBOL(kernel_accept);

/**
 *        kernel_connect - connect a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: address length
 *        @flags: flags (O_NONBLOCK, ...)
 *
 *        For datagram sockets, @addr is the address to which datagrams are sent
 *        by default, and the only address from which datagrams are received.
 *        For stream sockets, attempts to connect to @addr.
 *
 *        The address is copied into a local buffer first, so the protocol's
 *        ->connect() handler operates on the copy rather than on @addr itself.
 *
 *        Returns 0 or an error code. -EINVAL is returned without calling the
 *        protocol if @addrlen is negative or larger than
 *        struct sockaddr_storage.
 */

int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
                   int flags)
{
        struct sockaddr_storage address;

        /* @address is a fixed-size stack buffer: never copy past its end. */
        if (addrlen < 0 || (size_t)addrlen > sizeof(address))
                return -EINVAL;

        memcpy(&address, addr, addrlen);

        return sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, flags);
}
EXPORT_SYMBOL(kernel_connect);

/**
 *        kernel_getsockname - get the address which the socket is bound (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *        Fills the @addr pointer with the address which the socket is bound,
 *        by calling the protocol's ->getname() handler with a last argument of 0.
 *        Returns 0 or an error code.
 *        NOTE(review): the handler's return value is passed through unchanged;
 *        confirm whether a successful ->getname() returns 0 or the address
 *        length, and adjust this description if needed.
 */

int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
{
        int ret = sock->ops->getname(sock, addr, 0);

        return ret;
}
EXPORT_SYMBOL(kernel_getsockname);

/**
 *        kernel_getpeername - get the address which the socket is connected (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *        Fills the @addr pointer with the address which the socket is connected,
 *        by calling the protocol's ->getname() handler with a last argument of 1.
 *        Returns 0 or an error code.
 *        NOTE(review): the handler's return value is passed through unchanged;
 *        confirm whether a successful ->getname() returns 0 or the address
 *        length, and adjust this description if needed.
 */

int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
{
        int ret = sock->ops->getname(sock, addr, 1);

        return ret;
}
EXPORT_SYMBOL(kernel_getpeername);

/**
 *        kernel_sendpage - send a &page through a socket (kernel space)
 *        @sock: socket
 *        @page: page
 *        @offset: page offset
 *        @size: total size in bytes
 *        @flags: flags (MSG_DONTWAIT, ...)
 *
 *        Uses the protocol's ->sendpage() handler when it has one and falls
 *        back to sock_no_sendpage() otherwise.
 *
 *        Returns the total amount sent in bytes or an error.
 */

int kernel_sendpage(struct socket *sock, struct page *page, int offset,
                    size_t size, int flags)
{
        if (!sock->ops->sendpage)
                return sock_no_sendpage(sock, page, offset, size, flags);

        /* Warn once if the page fails the sendpage_ok() check. */
        WARN_ONCE(!sendpage_ok(page), "improper page for zero-copy send");

        return sock->ops->sendpage(sock, page, offset, size, flags);
}
EXPORT_SYMBOL(kernel_sendpage);

/**
 *        kernel_sendpage_locked - send a &page through the locked sock (kernel space)
 *        @sk: sock
 *        @page: page
 *        @offset: page offset
 *        @size: total size in bytes
 *        @flags: flags (MSG_DONTWAIT, ...)
 *
 *        Uses the ->sendpage_locked() handler of the socket attached to @sk
 *        when it has one and falls back to sock_no_sendpage_locked() otherwise.
 *
 *        Returns the total amount sent in bytes or an error.
 *        Caller must hold @sk.
 */

int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
                           size_t size, int flags)
{
        struct socket *sock = sk->sk_socket;

        if (!sock->ops->sendpage_locked)
                return sock_no_sendpage_locked(sk, page, offset, size, flags);

        return sock->ops->sendpage_locked(sk, page, offset, size, flags);
}
EXPORT_SYMBOL(kernel_sendpage_locked);

/**
 *        kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space)
 *        @sock: socket
 *        @how: connection part
 *
 *        Thin wrapper around the protocol's ->shutdown() handler.
 *        Returns 0 or an error.
 */

int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
        int err = sock->ops->shutdown(sock, how);

        return err;
}
EXPORT_SYMBOL(kernel_sock_shutdown);

/**
 *        kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
 *        @sk: socket
 *
 *        Returns the length of the IP header this socket implies: the fixed
 *        IPv4 or IPv6 header size plus the length of any IP options set on
 *        the socket. A %NULL @sk, or a socket that is neither IPv4 nor IPv6,
 *        yields 0. Assumes that the caller has a lock on the socket.
 */

u32 kernel_sock_ip_overhead(struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6_txoptions *v6opts = NULL;
        struct ipv6_pinfo *np;
#endif /* IS_ENABLED(CONFIG_IPV6) */
        struct ip_options_rcu *v4opts;
        struct inet_sock *inet;
        u32 overhead = 0;

        if (!sk)
                return 0;

        switch (sk->sk_family) {
        case AF_INET:
                inet = inet_sk(sk);
                v4opts = rcu_dereference_protected(inet->inet_opt,
                                                   sock_owned_by_user(sk));
                overhead = sizeof(struct iphdr);
                if (v4opts)
                        overhead += v4opts->opt.optlen;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                np = inet6_sk(sk);
                if (np)
                        v6opts = rcu_dereference_protected(np->opt,
                                                           sock_owned_by_user(sk));
                overhead = sizeof(struct ipv6hdr);
                if (v6opts)
                        overhead += (v6opts->opt_flen + v6opts->opt_nflen);
                break;
#endif /* IS_ENABLED(CONFIG_IPV6) */
        default:
                /* Neither IPv4 nor IPv6: no IP overhead. */
                break;
        }

        return overhead;
}
EXPORT_SYMBOL(kernel_sock_ip_overhead);































































































































































































































































































    1 





    1 

    1 





    2 

    2 





    3 

    3 





    1 

    1 





    2 

    2 













    2 

    2 





























    1 

    1 





    2 

    2 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 



























    2 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 

    3 






    3 


    3 
    3 


































    3 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
#include <linux/random.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fsnotify.h>

#include "ext4.h"
#include "ext4_extents.h"        /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
#include "fsmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>

/*
 * File-scope state.
 * NOTE(review): the users of these objects are not in this part of the
 * file; the names suggest lazy-init ("li") bookkeeping and rate limiting
 * of mount messages -- confirm against the code that uses them.
 */
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
static struct ratelimit_state ext4_mount_msg_ratelimit;

/*
 * Forward declarations.  These are prototypes only; the static ones must
 * be defined elsewhere in this translation unit.
 */
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
static int ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
static int ext4_clear_journal_err(struct super_block *sb,
                                  struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                            unsigned int journal_inum);

/*
 * Lock ordering
 *
 * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
 * i_mmap_rwsem (inode->i_mmap_rwsem)!
 *
 * page fault path:
 * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
 *   page lock -> i_data_sem (rw)
 *
 * buffered write path:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 *   i_data_sem (rw)
 *
 * truncate:
 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
 *   i_data_sem (rw)
 *
 * direct IO:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 *
 * writepages:
 * transaction start -> page lock(s) -> i_data_sem (rw)
 */

/*
 * When no ext2 driver is configured (neither built-in nor modular) and
 * CONFIG_EXT4_USE_FOR_EXT2 is set, register an "ext2" filesystem type
 * whose mounts are routed through ext4_mount().
 *
 * IS_EXT2_SB() is true when the superblock's block device is held by
 * ext2_fs_type; in every other configuration it is the constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "ext2",
        .mount                = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif


/*
 * "ext3" filesystem type.  Mounts are routed through ext4_mount(), and
 * the type requires a backing block device (FS_REQUIRES_DEV).
 *
 * IS_EXT3_SB() is true when the superblock's block device is held by
 * ext3_fs_type.
 */
static struct file_system_type ext3_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "ext3",
        .mount                = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)


/*
 * Submit a read request for @bh.
 *
 * @bh:       buffer to read
 * @op_flags: request flags passed through to submit_bh()
 * @end_io:   completion handler; when NULL, end_buffer_read_sync is used
 *
 * An extra reference is taken on @bh before the request is submitted.
 */
static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
                                  bh_end_io_t *end_io)
{
        /*
         * The verified bit says nothing about data that is about to be
         * re-read from disk (e.g. after a write-out error), so drop it
         * and force the contents to be checked again.
         */
        clear_buffer_verified(bh);

        if (end_io)
                bh->b_end_io = end_io;
        else
                bh->b_end_io = end_buffer_read_sync;

        get_bh(bh);
        submit_bh(REQ_OP_READ, op_flags, bh);
}

/*
 * Start reading @bh without waiting for the I/O to finish.
 *
 * The buffer must already be locked (BUG otherwise).  If it is already
 * up to date it is simply unlocked and no I/O is issued; otherwise a
 * read is submitted with @op_flags and @end_io.
 */
void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
                         bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (!ext4_buffer_uptodate(bh)) {
                __ext4_read_bh(bh, op_flags, end_io);
                return;
        }

        /* Nothing to read: just release the lock. */
        unlock_buffer(bh);
}

/*
 * Read @bh and wait for the I/O to complete.
 *
 * The buffer must already be locked (BUG otherwise).  If it is already
 * up to date it is unlocked and 0 is returned without issuing I/O.
 *
 * Returns 0 when the buffer is up to date afterwards, -EIO otherwise.
 */
int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (ext4_buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        __ext4_read_bh(bh, op_flags, end_io);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}

/*
 * Lock @bh and read it, using the default completion handler.
 *
 * @wait: when true, block until the read finishes and return its
 *        result (see ext4_read_bh()); when false, only start the read
 *        and return 0.
 */
int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
{
        lock_buffer(bh);

        if (wait)
                return ext4_read_bh(bh, op_flags, NULL);

        ext4_read_bh_nowait(bh, op_flags, NULL);
        return 0;
}

/*
 * This works like __bread_gfp() except it uses ERR_PTR for error
 * returns.  Currently with sb_bread it's impossible to distinguish
 * between ENOMEM and EIO situations (since both result in a NULL
 * return).
 */
static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
                                               sector_t block, int op_flags,
                                               gfp_t gfp)
{
        struct buffer_head *bh = sb_getblk_gfp(sb, block, gfp);
        int err;

        /* No buffer could be obtained: report it as -ENOMEM. */
        if (!bh)
                return ERR_PTR(-ENOMEM);

        /* Only go to disk when the buffer is not already up to date. */
        if (!ext4_buffer_uptodate(bh)) {
                err = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
                if (err) {
                        /* Drop our reference before reporting the error. */
                        put_bh(bh);
                        return ERR_PTR(err);
                }
        }

        return bh;
}

/*
 * Read block @block of @sb, requesting the buffer with __GFP_MOVABLE.
 *
 * Returns the buffer head on success or an ERR_PTR() on failure.
 */
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
                                   int op_flags)
{
        const gfp_t gfp = __GFP_MOVABLE;

        return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}

/*
 * Read block @block of @sb with no extra request flags and no gfp flags
 * (in particular, without __GFP_MOVABLE).
 *
 * Returns the buffer head on success or an ERR_PTR() on failure.
 */
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                            sector_t block)
{
        const int no_op_flags = 0;
        const gfp_t no_gfp_flags = 0;

        return __ext4_sb_bread_gfp(sb, block, no_op_flags, no_gfp_flags);
}

/*
 * Best-effort read-ahead of block @block of @sb.
 *
 * The buffer is requested with no gfp flags.  A REQ_RAHEAD read is
 * started only if the buffer lock can be taken without blocking; the
 * function never waits for the I/O and reports no errors.
 */
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);

        if (unlikely(!bh))
                return;

        if (trylock_buffer(bh))
                ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);

        /* Release the reference obtained from sb_getblk_gfp(). */
        brelse(bh);
}

/*
 * Check the checksum algorithm recorded in superblock @es.
 *
 * Returns non-zero when the metadata_csum feature is off, or when it is
 * on and s_checksum_type is EXT4_CRC32C_CHKSUM; returns 0 otherwise.
 */
static int ext4_verify_csum_type(struct super_block *sb,
                                 struct ext4_super_block *es)
{
        if (ext4_has_feature_metadata_csum(sb))
                return es->s_checksum_type == EXT4_CRC32C_CHKSUM;

        /* Feature disabled: there is no checksum type to validate. */
        return 1;
}

/*
 * Compute the checksum of superblock @es.
 *
 * The checksum is seeded with ~0 and covers every byte of @es that
 * precedes the s_checksum field.  The result is returned little-endian.
 */
static __le32 ext4_superblock_csum(struct super_block *sb,
                                   struct ext4_super_block *es)
{
        const int covered = offsetof(struct ext4_super_block, s_checksum);
        __u32 crc = ext4_chksum(EXT4_SB(sb), ~0, (char *)es, covered);

        return cpu_to_le32(crc);
}

/*
 * Verify the checksum stored in superblock @es.
 *
 * Returns non-zero when metadata checksumming is not in use, or when
 * the stored s_checksum matches the freshly computed value; returns 0
 * on mismatch.
 */
static int ext4_superblock_csum_verify(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        if (ext4_has_metadata_csum(sb))
                return es->s_checksum == ext4_superblock_csum(sb, es);

        /* Checksums not in use: nothing to verify. */
        return 1;
}

/*
 * Recompute and store the checksum of the in-memory superblock
 * (EXT4_SB(sb)->s_es).  Does nothing when metadata checksumming is not
 * in use.
 */
void ext4_superblock_csum_set(struct super_block *sb)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        if (ext4_has_metadata_csum(sb))
                es->s_checksum = ext4_superblock_csum(sb, es);
}

ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_block_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_table_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

__u32 ext4_free_group_clusters(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
}

__u32 ext4_free_inodes_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
}

__u32 ext4_used_dirs_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
}

__u32 ext4_itable_unused_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_itable_unused_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}

/*
 * Store @blk as the block bitmap location in group descriptor @bg.  The
 * low 32 bits are always written; the high 32 bits are written only
 * when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_block_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode bitmap location in group descriptor @bg.  The
 * low 32 bits are always written; the high 32 bits are written only
 * when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode table location in group descriptor @bg.  The
 * low 32 bits are always written; the high 32 bits are written only
 * when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_table_set(struct super_block *sb,
                          struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_table_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @count into the bg_free_blocks_count fields of @bg.  The low 16
 * bits are always written; the high 16 bits are written only when the
 * descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_group_clusters_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count into the bg_free_inodes_count fields of @bg using
 * WRITE_ONCE().  The low 16 bits are always written; the high 16 bits
 * are written only when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_inodes_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16));
}

/*
 * Store @count into the bg_used_dirs_count fields of @bg.  The low 16
 * bits are always written; the high 16 bits are written only when the
 * descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_used_dirs_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count into the bg_itable_unused fields of @bg.  The low 16 bits
 * are always written; the high 16 bits are written only when the
 * descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_itable_unused_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}

/*
 * Write the current wall-clock time (in seconds) into a split timestamp:
 * the low 32 bits go to *@lo in little-endian form, the next 8 bits to
 * *@hi.  The value is clamped to [0, 2^40 - 1] so it fits in those 40
 * bits.
 */
static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
{
        time64_t secs = ktime_get_real_seconds();

        secs = clamp_val(secs, 0, (1ull << 40) - 1);

        *hi = upper_32_bits(secs);
        *lo = cpu_to_le32(lower_32_bits(secs));
}

/*
 * Rebuild a timestamp from its split form: *@hi supplies the bits above
 * bit 31 and *@lo the little-endian low 32 bits.
 */
static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
{
        time64_t secs = (time64_t)(*hi) << 32;

        secs += le32_to_cpu(*lo);
        return secs;
}
/*
 * Convenience wrappers around the helpers above: given a structure
 * pointer @es and a timestamp field name @tstamp, pass the address of
 * that field and of its companion "<tstamp>_hi" field (formed by token
 * pasting) to the corresponding helper.
 */
#define ext4_update_tstamp(es, tstamp) \
        __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
#define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)

/*
 * The del_gendisk() function uninitializes the disk-specific data
 * structures, including the bdi structure, without telling anyone
 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 * This is a kludge to prevent these oops until we can put in a proper
 * hook in del_gendisk() to inform the VFS and file system layers.
 */
static int block_device_ejected(struct super_block *sb)
{
        /* Look up the bdi through the block device's inode. */
        struct backing_dev_info *bdi = inode_to_bdi(sb->s_bdev->bd_inode);

        /* A bdi with no device attached is treated as "ejected". */
        if (bdi->dev == NULL)
                return 1;
        return 0;
}

/*
 * Journal commit callback for transaction @txn: process the data freed
 * by the transaction, then drain txn->t_private_list, invoking each
 * entry's jce_func.  The third argument passed to every callback is the
 * journal's aborted status sampled on entry.
 */
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int aborted = is_journal_aborted(journal);
        struct list_head *callbacks = &txn->t_private_list;

        BUG_ON(txn->t_state == T_FINISHED);

        ext4_process_freed_data(sb, txn->t_tid);

        spin_lock(&sbi->s_md_lock);
        while (!list_empty(callbacks)) {
                struct ext4_journal_cb_entry *entry;

                entry = list_entry(callbacks->next,
                                   struct ext4_journal_cb_entry, jce_list);
                list_del_init(&entry->jce_list);

                /* The callback itself runs with s_md_lock dropped. */
                spin_unlock(&sbi->s_md_lock);
                entry->jce_func(sb, entry, aborted);
                spin_lock(&sbi->s_md_lock);
        }
        spin_unlock(&sbi->s_md_lock);
}

/*
 * This writepage callback for write_cache_pages()
 * takes care of a few cases after page cleaning.
 *
 * write_cache_pages() already checks for dirty pages
 * and calls clear_page_dirty_for_io(), which we want,
 * to write protect the pages.
 *
 * However, we may have to redirty a page (see below.)
 */
static int ext4_journalled_writepage_callback(struct page *page,
                                              struct writeback_control *wbc,
                                              void *data)
{
        transaction_t *committing = data;
        struct buffer_head *first = page_buffers(page);
        struct buffer_head *bh = first;

        do {
                struct journal_head *jh = bh2jh(bh);

                /*
                 * The page must be redirtied when any of its buffers:
                 * 1) is itself dirty - the page was dirty because the
                 *    buffer still needs checkpointing, so the dirty bit
                 *    has to survive for checkpointing to write it; or
                 * 2) is not part of the committing transaction (inode
                 *    range tracking is not exact, so we may have come
                 *    across it by accident) or is also queued for the
                 *    next transaction - the dirty bit has to survive so
                 *    the buffer gets writeprotected properly when the
                 *    running transaction commits.
                 */
                if (buffer_dirty(bh) ||
                    (jh && (jh->b_transaction != committing ||
                            jh->b_next_transaction))) {
                        redirty_page_for_writepage(wbc, page);
                        break;
                }

                bh = bh->b_this_page;
        } while (bh != first);

        return AOP_WRITEPAGE_ACTIVATE;
}

/*
 * Walk the dirty range recorded in @jinode with write_cache_pages(),
 * running ext4_journalled_writepage_callback() on each page with the
 * jinode's transaction as callback data.  Returns whatever
 * write_cache_pages() returns.
 */
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .sync_mode =  WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
        };

        return write_cache_pages(jinode->i_vfs_inode->i_mapping, &wbc,
                                 ext4_journalled_writepage_callback,
                                 jinode->i_transaction);
}

/*
 * Submit the data buffers of @jinode: inodes whose data is journalled
 * take the ext4-specific path, all others use the generic jbd2 helper.
 */
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return ext4_journalled_submit_inode_data_buffers(jinode);

        return jbd2_journal_submit_inode_data_buffers(jinode);
}

/*
 * Finish the data buffers of @jinode.  Nothing is done (and 0 is
 * returned) for inodes whose data is journalled; otherwise the result
 * of the generic jbd2 helper is returned.
 */
static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return 0;

        return jbd2_journal_finish_inode_data_buffers(jinode);
}

static bool system_going_down(void)
{
        return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
                || system_state == SYSTEM_RESTART;
}

/* Pairs an EXT4_ERR_* code with the errno constant of the same name. */
struct ext4_err_translation {
        int code;
        int errno;
};

/* Build one table entry: EXT4_ERR_TRANSLATE(EIO) maps EIO <-> EXT4_ERR_EIO. */
#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }

/*
 * Lookup table searched by ext4_errno_to_code().  It is never modified
 * at run time, so it is declared const.
 */
static const struct ext4_err_translation err_translation[] = {
        EXT4_ERR_TRANSLATE(EIO),
        EXT4_ERR_TRANSLATE(ENOMEM),
        EXT4_ERR_TRANSLATE(EFSBADCRC),
        EXT4_ERR_TRANSLATE(EFSCORRUPTED),
        EXT4_ERR_TRANSLATE(ENOSPC),
        EXT4_ERR_TRANSLATE(ENOKEY),
        EXT4_ERR_TRANSLATE(EROFS),
        EXT4_ERR_TRANSLATE(EFBIG),
        EXT4_ERR_TRANSLATE(EEXIST),
        EXT4_ERR_TRANSLATE(ERANGE),
        EXT4_ERR_TRANSLATE(EOVERFLOW),
        EXT4_ERR_TRANSLATE(EBUSY),
        EXT4_ERR_TRANSLATE(ENOTDIR),
        EXT4_ERR_TRANSLATE(ENOTEMPTY),
        EXT4_ERR_TRANSLATE(ESHUTDOWN),
        EXT4_ERR_TRANSLATE(EFAULT),
};

static int ext4_errno_to_code(int errno)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(err_translation); i++)
                if (err_translation[i].errno == errno)
                        return err_translation[i].code;
        return EXT4_ERR_UNKNOWN;
}

/*
 * Record an error in the in-memory superblock copy.
 *
 * EXT4_ERROR_FS is always set in the in-memory mount state.  Unless the
 * block device is read-only, the s_last_error_* fields are filled in
 * from the arguments, copied to the s_first_error_* fields if no first
 * error time has been recorded yet, and s_error_count is incremented.
 * An @error of 0 is recorded as EFSCORRUPTED.  This function does not
 * write the superblock out.
 */
static void __save_error_info(struct super_block *sb, int error,
                              __u32 ino, __u64 block,
                              const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;

        sbi->s_mount_state |= EXT4_ERROR_FS;
        if (bdev_read_only(sb->s_bdev))
                return;

        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);

        /* Describe the most recent error. */
        ext4_update_tstamp(es, s_last_error_time);
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
        es->s_last_error_line = cpu_to_le32(line);
        es->s_last_error_ino = cpu_to_le32(ino);
        es->s_last_error_block = cpu_to_le64(block);
        /* We default to EFSCORRUPTED error... */
        es->s_last_error_errcode =
                ext4_errno_to_code(error ? error : EFSCORRUPTED);

        /* The very first error is remembered separately. */
        if (!es->s_first_error_time) {
                es->s_first_error_time = es->s_last_error_time;
                es->s_first_error_time_hi = es->s_last_error_time_hi;
                strncpy(es->s_first_error_func, func,
                        sizeof(es->s_first_error_func));
                es->s_first_error_line = es->s_last_error_line;
                es->s_first_error_ino = es->s_last_error_ino;
                es->s_first_error_block = es->s_last_error_block;
                es->s_first_error_errcode = es->s_last_error_errcode;
        }

        /*
         * Arm the daily error report timer when this is the first
         * error counted.
         */
        if (!es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
        le32_add_cpu(&es->s_error_count, 1);
}

/*
 * Record the error via __save_error_info() and then commit the
 * superblock, unless the block device is read-only.
 */
static void save_error_info(struct super_block *sb, int error,
                            __u32 ino, __u64 block,
                            const char *func, unsigned int line)
{
        __save_error_info(sb, error, ino, block, func, line);

        if (bdev_read_only(sb->s_bdev))
                return;

        ext4_commit_super(sb, 1);
}

/* Deal with the reporting of failure conditions on a filesystem such as
 * inconsistencies detected or read IO failures.
 *
 * On ext2, we can store the error state of the filesystem in the
 * superblock.  That is not possible on ext4, because we may have other
 * write ordering constraints on the superblock which prevent us from
 * writing it out straight away; and given that the journal is about to
 * be aborted, we can't rely on the current, or future, transactions to
 * write out the superblock safely.
 *
 * We'll just use the jbd2_journal_abort() error code to record an error in
 * the journal instead.  On recovery, the journal will complain about
 * that error until we've noted it down and cleared it.
 */

static void ext4_handle_error(struct super_block *sb)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;

        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);

        /* Already read-only: nothing further to do. */
        if (sb_rdonly(sb))
                return;
        /* errors=continue: leave the filesystem running. */
        if (test_opt(sb, ERRORS_CONT))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
        if (journal)
                jbd2_journal_abort(journal, -EIO);

        /*
         * ERRORS_RO behaviour is forced while the system is going down;
         * otherwise we could panic during 'reboot -f' because the
         * underlying device has already been disabled.
         */
        if (!test_opt(sb, ERRORS_RO) && !system_going_down()) {
                if (test_opt(sb, ERRORS_PANIC))
                        panic("EXT4-fs (device %s): panic forced after error\n",
                                sb->s_id);
                return;
        }

        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        /*
         * The ->s_mount_flags update must be visible before the
         * ->s_flags update.
         */
        smp_wmb();
        sb->s_flags |= SB_RDONLY;
}

/*
 * Rate-limit check for error messages: evaluates ___ratelimit() on the
 * superblock's s_err_ratelimit_state, tagged "EXT4-fs error".  Callers
 * in this file print only when the result is non-zero.
 */
#define ext4_error_ratelimit(sb)                                        \
                ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),        \
                             "EXT4-fs error")

/*
 * Report a filesystem-level error.
 *
 * Does nothing if the filesystem has been forcibly shut down.  Otherwise
 * it emits the tracepoint, prints a rate-limited KERN_CRIT message built
 * from @fmt, notifies fsnotify, records the error in the superblock and
 * runs ext4_handle_error().  An @error of 0 is reported to fsnotify as
 * EFSCORRUPTED.
 */
void __ext4_error(struct super_block *sb, const char *function,
                  unsigned int line, int error, __u64 block,
                  const char *fmt, ...)
{
        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format vaf;
                va_list args;

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                printk(KERN_CRIT
                       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
                       sb->s_id, function, line, current->comm, &vaf);
                va_end(args);
        }

        fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
        save_error_info(sb, error, 0, block, function, line);
        ext4_handle_error(sb);
}

/*
 * Report an error tied to @inode.
 *
 * Same sequence as __ext4_error(), but the message also carries the
 * inode number and, when @block is non-zero, the block number.  The
 * inode is passed on to fsnotify and its number is recorded in the
 * superblock error info.
 */
void __ext4_error_inode(struct inode *inode, const char *function,
                        unsigned int line, ext4_fsblk_t block, int error,
                        const char *fmt, ...)
{
        struct super_block *sb = inode->i_sb;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format vaf;
                va_list args;

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                if (block)
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: block %llu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, &vaf);
                else
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, &vaf);
                va_end(args);
        }

        fsnotify_sb_error(sb, inode, error ? error : EFSCORRUPTED);
        save_error_info(sb, error, inode->i_ino, block, function, line);
        ext4_handle_error(sb);
}

/*
 * Report an error tied to an open @file.
 *
 * Like __ext4_error_inode(), but the message additionally contains the
 * file's path (or "(unknown)" when file_path() fails; the path buffer is
 * 80 bytes).  The error is always reported and recorded as EFSCORRUPTED.
 */
void __ext4_error_file(struct file *file, const char *function,
                       unsigned int line, ext4_fsblk_t block,
                       const char *fmt, ...)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                char pathname[80], *path;
                struct va_format vaf;
                va_list args;

                path = file_path(file, pathname, sizeof(pathname));
                if (IS_ERR(path))
                        path = "(unknown)";

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                if (block)
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "block %llu: comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, path, &vaf);
                else
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, path, &vaf);
                va_end(args);
        }

        fsnotify_sb_error(sb, inode, EFSCORRUPTED);
        save_error_info(sb, EFSCORRUPTED, inode->i_ino, block,
                        function, line);
        ext4_handle_error(sb);
}

const char *ext4_decode_error(struct super_block *sb, int errno,
                              char nbuf[16])
{
        char *errstr = NULL;

        switch (errno) {
        case -EFSCORRUPTED:
                errstr = "Corrupt filesystem";
                break;
        case -EFSBADCRC:
                errstr = "Filesystem failed CRC";
                break;
        case -EIO:
                errstr = "IO failure";
                break;
        case -ENOMEM:
                errstr = "Out of memory";
                break;
        case -EROFS:
                if (!sb || (EXT4_SB(sb)->s_journal &&
                            EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
                        errstr = "Journal has aborted";
                else
                        errstr = "Readonly filesystem";
                break;
        default:
                /* If the caller passed in an extra buffer for unknown
                 * errors, textualise them now.  Else we just return
                 * NULL. */
                if (nbuf) {
                        /* Check for truncated error codes... */
                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
                                errstr = nbuf;
                }
                break;
        }

        return errstr;
}

/*
 * __ext4_std_error decodes expected errors from journaling functions
 * automatically and invokes the appropriate error response.
 *
 * @errno is a negative errno value (it is compared against -EROFS and
 * negated before being saved).  Nothing is done if the filesystem has
 * been forcibly shut down, or for -EROFS outside a transaction on a
 * filesystem that is already read-only.
 */
void __ext4_std_error(struct super_block *sb, const char *function,
                      unsigned int line, int errno)
{
        char nbuf[16];
        const char *errstr;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        /* Special case: if the error is EROFS, and we're not already
         * inside a transaction, then there's really no point in logging
         * an error. */
        if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
                return;

        if (ext4_error_ratelimit(sb)) {
                errstr = ext4_decode_error(sb, errno, nbuf);
                printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
                       sb->s_id, function, line, errstr);
        }

        /*
         * Report a positive error code, matching the other reporting
         * functions in this file and the positive EFSCORRUPTED fallback;
         * passing the negative @errno through would hand listeners a
         * value of the opposite sign.
         */
        fsnotify_sb_error(sb, NULL, errno ? -errno : EFSCORRUPTED);
        save_error_info(sb, -errno, 0, 0, function, line);
        ext4_handle_error(sb);
}

/*
 * ext4_abort is a much stronger failure handler than ext4_error.  The
 * abort function may be used to deal with unrecoverable failures such
 * as journal IO errors or ENOMEM at a critical moment in log management.
 *
 * We unconditionally force the filesystem into an ABORT|READONLY state,
 * unless the error response on the fs has been set to panic in which
 * case we take the easy way out and panic immediately.
 */

void __ext4_abort(struct super_block *sb, const char *function,
                  unsigned int line, int error, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
        save_error_info(sb, error, 0, 0, function, line);
        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
               sb->s_id, function, line, &vaf);
        va_end(args);

        if (sb_rdonly(sb) == 0) {
                ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
                if (EXT4_SB(sb)->s_journal)
                        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);

                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
                /*
                 * Make sure updated value of ->s_mount_flags will be visible
                 * before ->s_flags update
                 */
                smp_wmb();
                sb->s_flags |= SB_RDONLY;
        }
        if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
                panic("EXT4-fs panic from previous error\n");
}

void __ext4_msg(struct super_block *sb,
                const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        atomic_inc(&EXT4_SB(sb)->s_msg_count);
        if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
                return;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
}

static int ext4_warning_ratelimit(struct super_block *sb)
{
        atomic_inc(&EXT4_SB(sb)->s_warning_count);
        return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
                            "EXT4-fs warning");
}

/*
 * Print a rate-limited KERN_WARNING message for @sb, tagged with the
 * reporting @function and @line.
 */
void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
{
        if (ext4_warning_ratelimit(sb)) {
                struct va_format vaf;
                va_list args;

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                printk(KERN_WARNING
                       "EXT4-fs warning (device %s): %s:%d: %pV\n",
                       sb->s_id, function, line, &vaf);
                va_end(args);
        }
}

/*
 * Print a rate-limited KERN_WARNING message for @inode's filesystem,
 * including the inode number and the current task's comm.
 */
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...)
{
        struct super_block *sb = inode->i_sb;

        if (ext4_warning_ratelimit(sb)) {
                struct va_format vaf;
                va_list args;

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
                       "inode #%lu: comm %s: %pV\n", sb->s_id,
                       function, line, inode->i_ino, current->comm, &vaf);
                va_end(args);
        }
}

/*
 * Report an error for block group @grp.  The sparse annotations and the
 * unlock/lock pair at the end indicate the caller enters and leaves with
 * the group lock for @grp held.
 *
 * The error is recorded as EFSCORRUPTED with __save_error_info() and a
 * rate-limited message is printed, with @ino and @block included when
 * non-zero.  With errors=continue the superblock is committed with the
 * group lock still held and the function returns.  Otherwise the group
 * lock is dropped around the superblock commit and ext4_handle_error(),
 * and retaken before returning.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
                             const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
        struct va_format vaf;
        va_list args;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
                return;

        trace_ext4_error(sb, function, line);
        __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);

        if (ext4_error_ratelimit(sb)) {
                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                /* The message is assembled from several KERN_CONT pieces. */
                printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
                       sb->s_id, function, line, grp);
                if (ino)
                        printk(KERN_CONT "inode %lu: ", ino);
                if (block)
                        printk(KERN_CONT "block %llu:",
                               (unsigned long long) block);
                printk(KERN_CONT "%pV\n", &vaf);
                va_end(args);
        }

        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);

        if (test_opt(sb, ERRORS_CONT)) {
                ext4_commit_super(sb, 0);
                return;
        }

        ext4_unlock_group(sb, grp);
        ext4_commit_super(sb, 1);
        ext4_handle_error(sb);
        /*
         * We only get here in the ERRORS_RO case; relocking the group
         * may be dangerous, but nothing bad will happen since the
         * filesystem will have already been marked read/only and the
         * journal has been aborted.
         *
         * NOTE(review): an earlier form of this comment said the
         * function returns 1 here so that callers can distinguish the
         * ERRORS_CONT and ERRORS_RO cases; the function is void, so no
         * such hint is available to callers.
         */
        ext4_lock_group(sb, grp);
        return;
}

/*
 * Mark the block and/or inode bitmap of @group as corrupted, as selected
 * by @flags (EXT4_GROUP_INFO_BBITMAP_CORRUPT,
 * EXT4_GROUP_INFO_IBITMAP_CORRUPT).
 *
 * The first time a given corruption bit is set in the group's bb_state,
 * the group's free count is subtracted from the matching global counter
 * (free clusters or free inodes).  Nothing is done if either the group
 * info or the group descriptor cannot be obtained.
 */
void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int flags)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);

        if (!grp || !gdp)
                return;

        if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
            !ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
                                   &grp->bb_state))
                percpu_counter_sub(&sbi->s_freeclusters_counter,
                                   grp->bb_free);

        if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
            !ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
                                   &grp->bb_state)) {
                /* Kept as int so the counter sees the same value as before. */
                int free_inodes = ext4_free_inodes_count(sb, gdp);

                percpu_counter_sub(&sbi->s_freeinodes_counter,
                                   free_inodes);
        }
}

void ext4_update_dynamic_rev(struct super_block *sb)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
                return;

        ext4_warning(sb,
                     "updating to rev %d because of new feature flag, "
                     "running e2fsck is recommended",
                     EXT4_DYNAMIC_REV);

        es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
        es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
        es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
        /* leave es->s_feature_*compat flags alone */
        /* es->s_uuid will be set by e2fsck if empty */

        /*
         * The rest of the superblock fields should be zero, and if not it
         * means they are likely already in use, so leave them alone.  We
         * can leave it up to e2fsck to clean up any inconsistencies there.
         */
}

/*
 * Open the external journal device @dev exclusively for read/write,
 * with @sb as the holder.  Returns the block device, or NULL (after
 * logging the error) if it could not be opened.
 */
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
        struct block_device *bdev;

        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
        if (!IS_ERR(bdev))
                return bdev;

        ext4_msg(sb, KERN_ERR,
                 "failed to open journal device unknown-block(%u,%u) %ld",
                 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
        return NULL;
}

/*
 * Release the journal device.
 *
 * The mode flags passed here are the same ones ext4_blkdev_get() uses
 * when opening the device.
 */
static void ext4_blkdev_put(struct block_device *bdev)
{
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}

/*
 * Drop the journal block device recorded in @sbi, if any: invalidate
 * its buffers, release it and clear sbi->s_journal_bdev.
 */
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
        struct block_device *journal_bdev = sbi->s_journal_bdev;

        if (!journal_bdev)
                return;

        /*
         * Invalidate the journal device's buffers.  We don't want them
         * floating about in memory - the physical journal device may be
         * hotswapped, and it breaks the `ro-after' testing code.
         */
        invalidate_bdev(journal_bdev);
        ext4_blkdev_put(journal_bdev);
        sbi->s_journal_bdev = NULL;
}

static inline struct inode *orphan_list_entry(struct list_head *l)
{
        return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
}

/*
 * Debugging aid: log the on-disk orphan list head from the superblock
 * and then one line for every inode on the in-memory sbi->s_orphan list.
 */
static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
        struct list_head *pos;

        ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
                 le32_to_cpu(sbi->s_es->s_last_orphan));

        printk(KERN_ERR "sb_info orphan list:\n");
        list_for_each(pos, &sbi->s_orphan) {
                struct inode *orphan = orphan_list_entry(pos);

                printk(KERN_ERR "  "
                       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
                       orphan->i_sb->s_id, orphan->i_ino, orphan,
                       orphan->i_mode, orphan->i_nlink,
                       NEXT_ORPHAN(orphan));
        }
}

#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);

/*
 * Turn quotas off for every quota type at umount time.  Our own
 * ext4_quota_off() is used so that inode flags etc. get cleared.
 */
static inline void ext4_quota_off_umount(struct super_block *sb)
{
        int type = 0;

        while (type < EXT4_MAXQUOTAS) {
                ext4_quota_off(sb, type);
                type++;
        }
}

/*
 * This is a helper function which is used in the mount/remount
 * codepaths (which holds s_umount) to fetch the quota file name.
 */
static inline char *get_qf_name(struct super_block *sb,
                                struct ext4_sb_info *sbi,
                                int type)
{
        /*
         * s_qf_names[] is an RCU-managed pointer; the lockdep expression
         * tells the checker that holding sb->s_umount is what makes this
         * plain dereference legitimate.
         */
        return rcu_dereference_protected(sbi->s_qf_names[type],
                                         lockdep_is_held(&sb->s_umount));
}
#else
static inline void ext4_quota_off_umount(struct super_block *sb)
{
        /* Built without CONFIG_QUOTA: nothing to turn off. */
}
#endif

/*
 * Tear down the ext4 state attached to @sb.
 *
 * In order: unregister sysfs, the lazy-init request and quotas; destroy
 * the reserved-conversion workqueue and the journal; release the extent
 * status shrinker, error-report timer, system zone, mballoc and extent
 * state; write the superblock back if the filesystem is not read-only;
 * free group descriptors, flex groups, per-cpu counters and quota file
 * names; sync and invalidate the block device(s); and finally free the
 * remaining per-superblock objects and the ext4_sb_info itself.
 */
static void ext4_put_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        struct buffer_head **group_desc;
        struct flex_groups **flex_groups;
        int aborted = 0;
        int i, err;

        /*
         * Unregister sysfs before destroying jbd2 journal.
         * Since we could still access attr_journal_task attribute via sysfs
         * path which could have sbi->s_journal->j_task as NULL
         * Unregister sysfs before flush sbi->s_error_work.
         * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
         * read metadata verify failed then will queue error work.
         * flush_stashed_error_work will call start_this_handle may trigger
         * BUG_ON.
         */
        ext4_unregister_sysfs(sb);

        ext4_unregister_li_request(sb);
        ext4_quota_off_umount(sb);

        destroy_workqueue(sbi->rsv_conversion_wq);

        if (sbi->s_journal) {
                /* Sample the abort state before the journal goes away. */
                aborted = is_journal_aborted(sbi->s_journal);
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
                if ((err < 0) && !aborted) {
                        ext4_abort(sb, -err, "Couldn't clean up the journal");
                }
        }

        ext4_es_unregister_shrinker(sbi);
        del_timer_sync(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);

        /*
         * Only a writable filesystem whose journal was not aborted gets
         * the needs_recovery feature cleared and the mount state copied
         * back into the on-disk superblock.
         */
        if (!sb_rdonly(sb) && !aborted) {
                ext4_clear_feature_journal_needs_recovery(sb);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
        }
        if (!sb_rdonly(sb))
                ext4_commit_super(sb, 1);

        /* Release each group descriptor buffer, then the array itself. */
        group_desc = rcu_access_pointer(sbi->s_group_desc);
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(group_desc[i]);
        kvfree(group_desc);
        flex_groups = rcu_access_pointer(sbi->s_flex_groups);
        if (flex_groups) {
                for (i = 0; i < sbi->s_flex_groups_allocated; i++)
                        kvfree(flex_groups[i]);
                kvfree(flex_groups);
        }
        percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif

        /* Debugging code just in case the in-memory inode orphan list
         * isn't empty.  The on-disk one can be non-empty if we've
         * detected an error and taken the fs readonly, but the
         * in-memory list had better be clean by this point. */
        if (!list_empty(&sbi->s_orphan))
                dump_orphan_list(sb, sbi);
        J_ASSERT(list_empty(&sbi->s_orphan));

        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        /* An external journal device is synced and released separately. */
        if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
                sync_blockdev(sbi->s_journal_bdev);
                ext4_blkdev_remove(sbi);
        }

        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        ext4_stop_mmpd(sbi);

        brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
         * superblock, we need to actually destroy the kobject.
         */
        kobject_put(&sbi->s_kobj);
        /* Wait for the kobject's completion before freeing sbi below. */
        wait_for_completion(&sbi->s_kobj_unregister);
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
        kfree(sbi->s_blockgroup_lock);
        fs_put_dax(sbi->s_daxdev);
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
        utf8_unload(sb->s_encoding);
#endif
        kfree(sbi);
}

/*
 * Slab cache from which struct ext4_inode_info objects are allocated.
 * Created by init_inodecache() and destroyed by destroy_inodecache().
 */
static struct kmem_cache *ext4_inode_cachep;

/*
 * Allocate an in-core ext4 inode from ext4_inode_cachep and initialise
 * its per-allocation state.
 *
 * Called inside transaction, so use GFP_NOFS.
 *
 * @sb is not referenced by the body.  Returns a pointer to the VFS inode
 * embedded in the new ext4_inode_info, or NULL if the slab allocation
 * fails.
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
        struct ext4_inode_info *ei;

        ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;

        inode_set_iversion(&ei->vfs_inode, 1);
        ei->i_flags = 0;
        spin_lock_init(&ei->i_raw_lock);
        /* Preallocation list, its active counter and its lock */
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        atomic_set(&ei->i_prealloc_active, 0);
        spin_lock_init(&ei->i_prealloc_lock);
        /* Extent-status tree, its lock, list linkage and counters */
        ext4_es_init_tree(&ei->i_es_tree);
        rwlock_init(&ei->i_es_lock);
        INIT_LIST_HEAD(&ei->i_es_list);
        ei->i_es_all_nr = 0;
        ei->i_es_shk_nr = 0;
        ei->i_es_shrink_lblk = 0;
        /* Reserved data block accounting and pending tree */
        ei->i_reserved_data_blocks = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
        ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
        memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
        /* No jbd2 inode attached yet; ext4_clear_inode() frees it if set */
        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
        /* Fast-commit state and its mutex */
        ext4_fc_init_inode(&ei->vfs_inode);
        mutex_init(&ei->i_fc_lock);
        return &ei->vfs_inode;
}

/*
 * ->drop_inode hook.  The generic VFS policy is consulted first; only when
 * it answers "keep" (zero) is fscrypt asked whether the inode should be
 * dropped.  The resulting decision is traced and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
        int verdict;

        verdict = generic_drop_inode(inode);
        if (verdict == 0)
                verdict = fscrypt_drop_inode(inode);

        trace_ext4_drop_inode(inode, verdict);

        return verdict;
}

/*
 * ->free_inode hook: release the fscrypt state attached to @inode and
 * return the containing ext4_inode_info to ext4_inode_cachep.
 *
 * An inode that is still linked on a fast-commit list at this point is
 * reported with a warning; it is freed regardless.
 */
static void ext4_free_in_core_inode(struct inode *inode)
{
        fscrypt_free_inode(inode);
        if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
                /*
                 * i_ino is printed as unsigned (%lu), matching the other
                 * messages in this file, and the message is newline
                 * terminated so it forms a complete log line.
                 */
                pr_warn("%s: inode %lu still in fc list\n",
                        __func__, inode->i_ino);
        }
        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}

static void ext4_destroy_inode(struct inode *inode)
{
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
                         inode->i_ino, EXT4_I(inode));
                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
                                EXT4_I(inode), sizeof(struct ext4_inode_info),
                                true);
                dump_stack();
        }

        if (EXT4_I(inode)->i_reserved_data_blocks)
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
                         inode->i_ino, EXT4_I(inode),
                         EXT4_I(inode)->i_reserved_data_blocks);
}

/*
 * Slab constructor handed to kmem_cache_create_usercopy() by
 * init_inodecache().  @foo is the ext4_inode_info object being
 * constructed; the orphan list head, the three rw-semaphores, the
 * embedded VFS inode and the fast-commit state are initialised here.
 */
static void init_once(void *foo)
{
        struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;

        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
        init_rwsem(&ei->i_mmap_sem);
        inode_init_once(&ei->vfs_inode);
        ext4_fc_init_inode(&ei->vfs_inode);
}

/*
 * Create the slab cache for struct ext4_inode_info.  Only the i_data
 * member is registered as the usercopy region, and init_once() is the
 * object constructor.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
static int __init init_inodecache(void)
{
        ext4_inode_cachep = kmem_cache_create_usercopy(
                                "ext4_inode_cache",
                                sizeof(struct ext4_inode_info), 0,
                                SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
                                        SLAB_ACCOUNT,
                                offsetof(struct ext4_inode_info, i_data),
                                sizeof_field(struct ext4_inode_info, i_data),
                                init_once);

        return ext4_inode_cachep ? 0 : -ENOMEM;
}

/*
 * Tear down the slab cache created by init_inodecache().
 */
static void destroy_inodecache(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(ext4_inode_cachep);
}

void ext4_clear_inode(struct inode *inode)
{
        ext4_fc_del(inode);
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        ext4_discard_preallocations(inode, 0);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
        dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
        fscrypt_put_encryption_info(inode);
        fsverity_cleanup_inode(inode);
}

/*
 * Inode lookup callback used when decoding NFS file handles.
 *
 * Looks up inode number @ino with EXT4_IGET_HANDLE and checks it against
 * @generation.  Returns the inode on success, the ext4_iget() error
 * pointer if the lookup fails, or ERR_PTR(-ESTALE) when the generation
 * does not match (the inode reference is dropped in that case).
 */
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
                                        u64 ino, u32 generation)
{
        struct inode *found = ext4_iget(sb, ino, EXT4_IGET_HANDLE);

        if (IS_ERR(found))
                return ERR_CAST(found);

        /*
         * Currently we don't know the generation for parent directory, so
         * a generation of 0 means "accept any"
         */
        if (generation == 0 || found->i_generation == generation)
                return found;

        iput(found);
        return ERR_PTR(-ESTALE);
}

/*
 * export_operations ->fh_to_dentry: hand the file handle to
 * generic_fh_to_dentry() with ext4_nfs_get_inode() as the inode lookup
 * callback, and return its result.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

/*
 * export_operations ->fh_to_parent: hand the file handle to
 * generic_fh_to_parent() with ext4_nfs_get_inode() as the inode lookup
 * callback, and return its result.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

static int ext4_nfs_commit_metadata(struct inode *inode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL
        };

        trace_ext4_nfs_commit_metadata(inode);
        return ext4_write_inode(inode, &wbc);
}

/*
 * Try to release metadata pages (indirect blocks, directories) which are
 * mapped via the block device.  Such pages may carry journal heads that
 * would stop try_to_free_buffers() from freeing them, so when a journal
 * is present the jbd2 variant is used instead.
 *
 * Returns 0 if the page has no buffers, otherwise the result of whichever
 * release function was called.  @wait is not used.
 */
static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
                                 gfp_t wait)
{
        journal_t *jnl = EXT4_SB(sb)->s_journal;

        WARN_ON(PageChecked(page));

        if (!page_has_buffers(page))
                return 0;

        return jnl ? jbd2_journal_try_to_free_buffers(jnl, page)
                   : try_to_free_buffers(page);
}

#ifdef CONFIG_FS_ENCRYPTION
/*
 * fscrypt ->get_context hook: fetch the encryption-context xattr
 * (EXT4_XATTR_INDEX_ENCRYPTION / EXT4_XATTR_NAME_ENCRYPTION_CONTEXT) of
 * @inode through ext4_xattr_get(), passing @ctx and @len straight
 * through, and return its result unchanged.
 */
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
        return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
                                 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
}

/*
 * fscrypt ->set_context hook: store the encryption context @ctx (@len
 * bytes) in the encryption xattr of @inode and mark the inode encrypted.
 *
 * @fs_data, when non-NULL, is used as the journal handle of a transaction
 * that is already running; the xattr is then set under that handle and no
 * retry is attempted.  When it is NULL a transaction is started here and
 * the whole operation is retried on -ENOSPC for as long as
 * ext4_should_retry_alloc() allows.
 *
 * Returns 0 on success.  Fails with -EPERM for the root inode, -EINVAL for
 * a DAX inode with non-zero size, -EOPNOTSUPP if the inode carries the
 * EXT4_INODE_DAX flag, and otherwise with the result of whichever helper
 * failed.
 */
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                                                        void *fs_data)
{
        handle_t *handle = fs_data;
        int res, res2, credits, retries = 0;

        /*
         * Encrypting the root directory is not allowed because e2fsck expects
         * lost+found to exist and be unencrypted, and encrypting the root
         * directory would imply encrypting the lost+found directory as well as
         * the filename "lost+found" itself.
         */
        if (inode->i_ino == EXT4_ROOT_INO)
                return -EPERM;

        if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
                return -EINVAL;

        if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
                return -EOPNOTSUPP;

        res = ext4_convert_inline_data(inode);
        if (res)
                return res;

        /*
         * If a journal handle was specified, then the encryption context is
         * being set on a new inode via inheritance and is part of a larger
         * transaction to create the inode.  Otherwise the encryption context is
         * being set on an existing inode in its own transaction.  Only in the
         * latter case should the "retry on ENOSPC" logic be used.
         */

        if (handle) {
                res = ext4_xattr_set_handle(handle, inode,
                                            EXT4_XATTR_INDEX_ENCRYPTION,
                                            EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
                                            ctx, len, 0);
                if (!res) {
                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
                        /*
                         * Update inode->i_flags - S_ENCRYPTED will be enabled,
                         * S_DAX may be disabled
                         */
                        ext4_set_inode_flags(inode, false);
                }
                return res;
        }

        res = dquot_initialize(inode);
        if (res)
                return res;
retry:
        /* Credits are recomputed on every attempt, including retries. */
        res = ext4_xattr_set_credits(inode, len, false /* is_create */,
                                     &credits);
        if (res)
                return res;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
                                    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
                                    ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                /*
                 * Update inode->i_flags - S_ENCRYPTED will be enabled,
                 * S_DAX may be disabled
                 */
                ext4_set_inode_flags(inode, false);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
        }
        /* The handle is always stopped; its result is used only if res is 0. */
        res2 = ext4_journal_stop(handle);

        if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
        if (!res)
                res = res2;
        return res;
}

/*
 * fscrypt ->get_dummy_policy hook: return the policy pointer stored in
 * the superblock info's s_dummy_enc_policy.
 */
static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
        return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}

/*
 * fscrypt ->has_stable_inodes hook: report the result of the
 * stable_inodes feature test for @sb.
 */
static bool ext4_has_stable_inodes(struct super_block *sb)
{
        return ext4_has_feature_stable_inodes(sb);
}

/*
 * fscrypt ->get_ino_and_lblk_bits hook.  Stores in *@ino_bits_ret the
 * width in bits of the on-disk superblock's s_inodes_count field, and in
 * *@lblk_bits_ret the width in bits of ext4_lblk_t.
 */
static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
                                       int *ino_bits_ret, int *lblk_bits_ret)
{
        *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
        *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
}

/*
 * fscrypt operations for ext4: key prefix "ext4:", xattr-backed context
 * get/set, the dummy test policy accessor, the empty-directory check, the
 * maximum name length, and the stable-inode / bit-width queries.
 */
static const struct fscrypt_operations ext4_cryptops = {
        .key_prefix                = "ext4:",
        .get_context                = ext4_get_context,
        .set_context                = ext4_set_context,
        .get_dummy_policy        = ext4_get_dummy_policy,
        .empty_dir                = ext4_empty_dir,
        .max_namelen                = EXT4_NAME_LEN,
        .has_stable_inodes        = ext4_has_stable_inodes,
        .get_ino_and_lblk_bits        = ext4_get_ino_and_lblk_bits,
};
#endif

#ifdef CONFIG_QUOTA
/* Quota type names from INITQFNAMES; QTYPE2NAME() indexes them by type. */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

/*
 * Forward declarations for the quota helpers and callbacks, several of
 * which are wired into the operation tables below.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);

/*
 * super_operations ->get_dquots: return the i_dquot array embedded in the
 * inode's ext4_inode_info.
 */
static struct dquot **ext4_get_dquots(struct inode *inode)
{
        return EXT4_I(inode)->i_dquot;
}

/*
 * dquot operations for ext4.  ext4-specific callbacks are used except for
 * alloc_dquot, destroy_dquot and get_next_id, which use the generic
 * dquot_* implementations.
 */
static const struct dquot_operations ext4_quota_operations = {
        .get_reserved_space        = ext4_get_reserved_space,
        .write_dquot                = ext4_write_dquot,
        .acquire_dquot                = ext4_acquire_dquot,
        .release_dquot                = ext4_release_dquot,
        .mark_dirty                = ext4_mark_dquot_dirty,
        .write_info                = ext4_write_info,
        .alloc_dquot                = dquot_alloc,
        .destroy_dquot                = dquot_destroy,
        .get_projid                = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
        .get_next_id                = dquot_get_next_id,
};

/*
 * quotactl operations for ext4.  Only quota_on and quota_off are
 * ext4-specific; the remaining entries are the generic dquot_* handlers.
 */
static const struct quotactl_ops ext4_qctl_operations = {
        .quota_on        = ext4_quota_on,
        .quota_off        = ext4_quota_off,
        .quota_sync        = dquot_quota_sync,
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        .get_dqblk        = dquot_get_dqblk,
        .set_dqblk        = dquot_set_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
};
#endif

/*
 * Superblock operations for ext4.  The quota read/write/get_dquots
 * entries are present only when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
        .alloc_inode        = ext4_alloc_inode,
        .free_inode        = ext4_free_in_core_inode,
        .destroy_inode        = ext4_destroy_inode,
        .write_inode        = ext4_write_inode,
        .dirty_inode        = ext4_dirty_inode,
        .drop_inode        = ext4_drop_inode,
        .evict_inode        = ext4_evict_inode,
        .put_super        = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs        = ext4_freeze,
        .unfreeze_fs        = ext4_unfreeze,
        .statfs                = ext4_statfs,
        .remount_fs        = ext4_remount,
        .show_options        = ext4_show_options,
#ifdef CONFIG_QUOTA
        .quota_read        = ext4_quota_read,
        .quota_write        = ext4_quota_write,
        .get_dquots        = ext4_get_dquots,
#endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
};

/*
 * Export (file-handle) operations for ext4: handle decoding for the
 * object and its parent, parent lookup, and metadata commit.
 */
static const struct export_operations ext4_export_ops = {
        .fh_to_dentry = ext4_fh_to_dentry,
        .fh_to_parent = ext4_fh_to_parent,
        .get_parent = ext4_get_parent,
        .commit_metadata = ext4_nfs_commit_metadata,
};

/*
 * Token identifiers for mount options; the tokens[] table maps option
 * strings onto these values and ext4_mount_opts[] describes how each is
 * handled.  Opt_err is the sentinel used to terminate both tables.
 *
 * Opt_mblk_io_submit and Opt_nomblk_io_submit are still declared here
 * although tokens[] maps both option strings to Opt_removed.
 *
 * The two fast-commit debug tokens exist only with CONFIG_EXT4_DEBUG.
 */
enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
        Opt_nouid32, Opt_debug, Opt_removed,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
        Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
        Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
        Opt_inlinecrypt,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
        Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
        Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
        Opt_nowarn_on_error, Opt_mblk_io_submit,
        Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
        Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
        Opt_prefetch_block_bitmaps,
#ifdef CONFIG_EXT4_DEBUG
        Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};

/*
 * Mount-option string patterns and the Opt_* token each one maps to.
 *
 * Several spellings share one token (e.g. "grpid" and "bsdgroups"),
 * options that are accepted but no longer do anything map to Opt_removed,
 * and the table is terminated by the {Opt_err, NULL} entry.
 *
 * NOTE(review): entry order is presumably significant to the matcher
 * (e.g. "usrjquota=" is listed ahead of "usrjquota=%s", and "barrier=%u"
 * ahead of "barrier") - confirm before reordering.
 */
static const match_table_t tokens = {
        {Opt_bsd_df, "bsddf"},
        {Opt_minix_df, "minixdf"},
        {Opt_grpid, "grpid"},
        {Opt_grpid, "bsdgroups"},
        {Opt_nogrpid, "nogrpid"},
        {Opt_nogrpid, "sysvgroups"},
        {Opt_resgid, "resgid=%u"},
        {Opt_resuid, "resuid=%u"},
        {Opt_sb, "sb=%u"},
        {Opt_err_cont, "errors=continue"},
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
        {Opt_nouid32, "nouid32"},
        {Opt_debug, "debug"},
        {Opt_removed, "oldalloc"},
        {Opt_removed, "orlov"},
        {Opt_user_xattr, "user_xattr"},
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_noload, "norecovery"},
        {Opt_noload, "noload"},
        {Opt_removed, "nobh"},
        {Opt_removed, "bh"},
        {Opt_commit, "commit=%u"},
        {Opt_min_batch_time, "min_batch_time=%u"},
        {Opt_max_batch_time, "max_batch_time=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
        {Opt_journal_path, "journal_path=%s"},
        {Opt_journal_checksum, "journal_checksum"},
        {Opt_nojournal_checksum, "nojournal_checksum"},
        {Opt_journal_async_commit, "journal_async_commit"},
        {Opt_abort, "abort"},
        {Opt_data_journal, "data=journal"},
        {Opt_data_ordered, "data=ordered"},
        {Opt_data_writeback, "data=writeback"},
        {Opt_data_err_abort, "data_err=abort"},
        {Opt_data_err_ignore, "data_err=ignore"},
        {Opt_offusrjquota, "usrjquota="},
        {Opt_usrjquota, "usrjquota=%s"},
        {Opt_offgrpjquota, "grpjquota="},
        {Opt_grpjquota, "grpjquota=%s"},
        {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
        {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
        {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
        {Opt_grpquota, "grpquota"},
        {Opt_noquota, "noquota"},
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_prjquota, "prjquota"},
        {Opt_barrier, "barrier=%u"},
        {Opt_barrier, "barrier"},
        {Opt_nobarrier, "nobarrier"},
        {Opt_i_version, "i_version"},
        {Opt_dax, "dax"},
        {Opt_dax_always, "dax=always"},
        {Opt_dax_inode, "dax=inode"},
        {Opt_dax_never, "dax=never"},
        {Opt_stripe, "stripe=%u"},
        {Opt_delalloc, "delalloc"},
        {Opt_warn_on_error, "warn_on_error"},
        {Opt_nowarn_on_error, "nowarn_on_error"},
        {Opt_lazytime, "lazytime"},
        {Opt_nolazytime, "nolazytime"},
        {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
        {Opt_nodelalloc, "nodelalloc"},
        {Opt_removed, "mblk_io_submit"},
        {Opt_removed, "nomblk_io_submit"},
        {Opt_block_validity, "block_validity"},
        {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
        {Opt_journal_ioprio, "journal_ioprio=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc"},
        {Opt_noauto_da_alloc, "noauto_da_alloc"},
        {Opt_dioread_nolock, "dioread_nolock"},
        {Opt_dioread_lock, "nodioread_nolock"},
        {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_init_itable, "init_itable=%u"},
        {Opt_init_itable, "init_itable"},
        {Opt_noinit_itable, "noinit_itable"},
#ifdef CONFIG_EXT4_DEBUG
        {Opt_fc_debug_force, "fc_debug_force"},
        {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
        {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
        {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
        {Opt_test_dummy_encryption, "test_dummy_encryption"},
        {Opt_inlinecrypt, "inlinecrypt"},
        {Opt_nombcache, "nombcache"},
        {Opt_nombcache, "no_mbcache"},        /* for backward compatibility */
        {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
        {Opt_removed, "check=none"},        /* mount option from ext2/3 */
        {Opt_removed, "nocheck"},        /* mount option from ext2/3 */
        {Opt_removed, "reservation"},        /* mount option from ext2/3 */
        {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
        {Opt_removed, "journal=%u"},        /* mount option from ext2/3 */
        {Opt_err, NULL},
};

/*
 * Parse a leading "sb=<number>" from the mount option string at *@data.
 *
 * When the string is absent or does not start with "sb=", block 1 (the
 * default location) is returned and *@data is left alone.  When the
 * number is followed by anything other than ',' or the end of the string,
 * an error is logged, 1 is returned and *@data is again left alone.
 * Otherwise *@data is advanced past the number (and one following comma,
 * if any) and the parsed block number is returned.
 */
static ext4_fsblk_t get_sb_block(void **data)
{
        char *spec = (char *) *data;
        char *rest;
        ext4_fsblk_t blocknr;

        if (spec == NULL || strncmp(spec, "sb=", 3) != 0)
                return 1;        /* Default location */

        /* TODO: use simple_strtoll with >32bit ext4 */
        blocknr = simple_strtoul(spec + 3, &rest, 0);

        if (*rest == ',') {
                rest++;
        } else if (*rest != '\0') {
                printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
                       spec);
                return 1;
        }

        *data = (void *) rest;
        return blocknr;
}

/* Default journal I/O priority value: class IOPRIO_CLASS_BE, data 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
/*
 * printf-style template for warning about a deprecated mount option; it
 * takes two strings: the option name and what it "will be removed by".
 */
static const char deprecated_msg[] =
        "Mount option \"%s\" will be removed by %s\n"
        "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";

#ifdef CONFIG_QUOTA
/*
 * Record the journaled quota file name given in @args for quota type
 * @qtype.
 *
 * Returns 1 when the option is accepted or deliberately ignored: the name
 * was stored, it equals the name already set, or the QUOTA feature is
 * enabled.  Returns -1 on failure: quota is already loaded and no name
 * was set before, the name could not be duplicated, a different name is
 * already set, or the name contains '/'.
 *
 * On success ownership of the duplicated name passes to
 * sbi->s_qf_names[@qtype]; on every other path the duplicate is freed.
 */
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *cur_name = get_qf_name(sb, sbi, qtype);
        char *new_name;
        int same;

        if (sb_any_quota_loaded(sb) && cur_name == NULL) {
                ext4_msg(sb, KERN_ERR,
                        "Cannot change journaled "
                        "quota options when quota turned on");
                return -1;
        }
        if (ext4_has_feature_quota(sb)) {
                ext4_msg(sb, KERN_INFO, "Journaled quota options "
                         "ignored when QUOTA feature is enabled");
                return 1;
        }

        new_name = match_strdup(args);
        if (new_name == NULL) {
                ext4_msg(sb, KERN_ERR,
                        "Not enough memory for storing quotafile name");
                return -1;
        }

        if (cur_name != NULL) {
                /* Repeating the current name is fine; changing it is not. */
                same = (strcmp(cur_name, new_name) == 0);
                if (!same)
                        ext4_msg(sb, KERN_ERR,
                                 "%s quota file already specified",
                                 QTYPE2NAME(qtype));
                kfree(new_name);
                return same ? 1 : -1;
        }

        if (strchr(new_name, '/') != NULL) {
                ext4_msg(sb, KERN_ERR,
                        "quotafile must be on filesystem root");
                kfree(new_name);
                return -1;
        }

        rcu_assign_pointer(sbi->s_qf_names[qtype], new_name);
        set_opt(sb, QUOTA);
        return 1;
}

/*
 * Forget the journaled quota file name for quota type @qtype.
 *
 * Refused (-1) when quota is loaded and a name is currently set.
 * Otherwise the slot is cleared, RCU readers are waited for, the old name
 * is freed and 1 is returned.
 */
static int clear_qf_name(struct super_block *sb, int qtype)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *stale = get_qf_name(sb, sbi, qtype);

        if (sb_any_quota_loaded(sb) && stale != NULL) {
                ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
                        " when quota turned on");
                return -1;
        }

        /* Unpublish the name first, then wait before freeing it. */
        rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
        synchronize_rcu();
        kfree(stale);

        return 1;
}
#endif

/*
 * Flag bits for the "flags" member of struct mount_opts (see
 * ext4_mount_opts[] below).
 *
 * With CONFIG_QUOTA, MOPT_Q contributes no bit and MOPT_QFMT has its own
 * bit; without it both collapse to MOPT_NOSUPPORT.  MOPT_EXT4_ONLY is the
 * union of MOPT_NO_EXT2 and MOPT_NO_EXT3.
 */
#define MOPT_SET        0x0001
#define MOPT_CLEAR        0x0002
#define MOPT_NOSUPPORT        0x0004
#define MOPT_EXPLICIT        0x0008
#define MOPT_CLEAR_ERR        0x0010
#define MOPT_GTE0        0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q                0
#define MOPT_QFMT        0x0040
#else
#define MOPT_Q                MOPT_NOSUPPORT
#define MOPT_QFMT        MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ        0x0080
#define MOPT_NO_EXT2        0x0100
#define MOPT_NO_EXT3        0x0200
#define MOPT_EXT4_ONLY        (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING        0x0400
#define MOPT_SKIP        0x0800
#define        MOPT_2                0x1000

/*
 * Per-token handling data for mount options: the Opt_* token, the mount
 * flag bits or value associated with it (0 when none), and MOPT_* flags
 * describing how the option is to be processed.  The table is terminated
 * by the Opt_err entry.
 *
 * NOTE(review): Opt_commit is listed twice, once with MOPT_NO_EXT2 and
 * once with MOPT_GTE0.  Which entry takes effect depends on the lookup
 * code, which is not part of this table - presumably the first match, in
 * which case the second entry is dead.  Confirm and consider merging.
 */
static const struct mount_opts {
        int        token;
        int        mount_opt;
        int        flags;
} ext4_mount_opts[] = {
        {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
        {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
        {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
        {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
        {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
        {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
        {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_SET},
        {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
        {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
        {Opt_delalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
        {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
        {Opt_commit, 0, MOPT_NO_EXT2},
        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
        {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
        {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
        {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
        {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
         MOPT_NO_EXT2},
        {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
         MOPT_NO_EXT2},
        {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
        {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
        {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
        {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
        {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
        {Opt_commit, 0, MOPT_GTE0},
        {Opt_max_batch_time, 0, MOPT_GTE0},
        {Opt_min_batch_time, 0, MOPT_GTE0},
        {Opt_inode_readahead_blks, 0, MOPT_GTE0},
        {Opt_init_itable, 0, MOPT_GTE0},
        {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
        {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
                MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
        {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
                MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
        {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
                MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
        {Opt_stripe, 0, MOPT_GTE0},
        {Opt_resuid, 0, MOPT_GTE0},
        {Opt_resgid, 0, MOPT_GTE0},
        {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
        {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
        {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
        {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
        {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
        {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
         MOPT_NO_EXT2 | MOPT_DATAJ},
        {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
        {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
        {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
        {Opt_acl, 0, MOPT_NOSUPPORT},
        {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
        {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
        {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
        {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
        {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
        {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
                       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
                                                        MOPT_CLEAR | MOPT_Q},
        {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
        {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
        {Opt_offusrjquota, 0, MOPT_Q},
        {Opt_offgrpjquota, 0, MOPT_Q},
        {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
        {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
        {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
        {Opt_max_dir_size_kb, 0, MOPT_GTE0},
        {Opt_test_dummy_encryption, 0, MOPT_STRING},
        {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
        {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
         MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
        {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
         MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
        {Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
        {Opt_err, 0, 0}
};

#ifdef CONFIG_UNICODE
/*
 * Known superblock encodings: the magic value as stored in the
 * superblock's s_encoding field, plus the encoding's name and version
 * strings.  ext4_sb_read_encoding() searches this table by magic.
 */
static const struct ext4_sb_encodings {
        __u16 magic;
        char *name;
        char *version;
} ext4_sb_encoding_map[] = {
        {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
};

/*
 * Look up the encoding named by the superblock.
 *
 * @es:       on-disk superblock to read s_encoding / s_encoding_flags from
 * @encoding: set to the matching ext4_sb_encoding_map entry on success
 * @flags:    set to the CPU-endian value of s_encoding_flags on success
 *
 * Returns 0 when the magic is found in ext4_sb_encoding_map, -EINVAL
 * otherwise (in which case neither output is written).
 */
static int ext4_sb_read_encoding(const struct ext4_super_block *es,
                                 const struct ext4_sb_encodings **encoding,
                                 __u16 *flags)
{
        const struct ext4_sb_encodings *entry = ext4_sb_encoding_map;
        const struct ext4_sb_encodings *end =
                ext4_sb_encoding_map + ARRAY_SIZE(ext4_sb_encoding_map);
        __u16 wanted = le16_to_cpu(es->s_encoding);

        for (; entry != end; entry++) {
                if (entry->magic != wanted)
                        continue;

                *encoding = entry;
                *flags = le16_to_cpu(es->s_encoding_flags);
                return 0;
        }

        return -EINVAL;
}
#endif

/*
 * Handle the "test_dummy_encryption" mount option.
 *
 * @sb:         superblock being mounted or remounted
 * @opt:        option text as typed by the user, used only in messages
 * @arg:        parsed option argument; arg->from is handed to fscrypt
 * @is_remount: true when called for a remount
 *
 * Returns 1 when the dummy policy was accepted and -1 on any failure,
 * including kernels built without CONFIG_FS_ENCRYPTION.
 */
static int ext4_set_test_dummy_encryption(struct super_block *sb,
                                          const char *opt,
                                          const substring_t *arg,
                                          bool is_remount)
{
#ifdef CONFIG_FS_ENCRYPTION
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int ret;

        if (!ext4_has_feature_encrypt(sb)) {
                ext4_msg(sb, KERN_WARNING,
                         "test_dummy_encryption requires encrypt feature");
                return -1;
        }

        /*
         * The option exists purely for testing, so the machinery (e.g. RCU
         * protection) needed to set or change it on a live filesystem was
         * never added.  Naming it on remount is tolerated only when a policy
         * is already in place, i.e. when nothing would change.
         */
        if (is_remount && !sbi->s_dummy_enc_policy.policy) {
                ext4_msg(sb, KERN_WARNING,
                         "Can't set test_dummy_encryption on remount");
                return -1;
        }

        ret = fscrypt_set_test_dummy_encryption(sb, arg->from,
                                                &sbi->s_dummy_enc_policy);
        switch (ret) {
        case 0:
                ext4_msg(sb, KERN_WARNING,
                         "Test dummy encryption mode enabled");
                return 1;
        case -EEXIST:
                ext4_msg(sb, KERN_WARNING,
                         "Can't change test_dummy_encryption on remount");
                break;
        case -EINVAL:
                ext4_msg(sb, KERN_WARNING,
                         "Value of option \"%s\" is unrecognized", opt);
                break;
        default:
                ext4_msg(sb, KERN_WARNING,
                         "Error processing option \"%s\" [%d]",
                         opt, ret);
                break;
        }
        return -1;
#else
        ext4_msg(sb, KERN_WARNING,
                 "test_dummy_encryption option not supported");
        return -1;

#endif
}

/*
 * Apply a single, already tokenized mount option to @sb.
 *
 * @sb:             superblock whose ext4_sb_info / s_flags are updated
 * @opt:            the option text, used only in log messages
 * @token:          Opt_* value the caller obtained for @opt
 * @args:           substrings captured for the option; args->from is NULL
 *                  when the option carried no argument
 * @journal_devnum: written for the journal_dev and journal_path options
 * @journal_ioprio: written for the journal_ioprio option
 * @is_remount:     non-zero when processing a remount; several options are
 *                  rejected or restricted in that case
 *
 * Returns 1 when the option was accepted (or deliberately ignored) and -1
 * when it was rejected.  With CONFIG_QUOTA the journaled-quota file name
 * tokens return whatever set_qf_name()/clear_qf_name() return.
 */
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
                            substring_t *args, unsigned long *journal_devnum,
                            unsigned int *journal_ioprio, int is_remount)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        const struct mount_opts *m;
        kuid_t uid;
        kgid_t gid;
        int arg = 0;

#ifdef CONFIG_QUOTA
        /* Quota file name options are handled entirely by their helpers. */
        if (token == Opt_usrjquota)
                return set_qf_name(sb, USRQUOTA, &args[0]);
        else if (token == Opt_grpjquota)
                return set_qf_name(sb, GRPQUOTA, &args[0]);
        else if (token == Opt_offusrjquota)
                return clear_qf_name(sb, USRQUOTA);
        else if (token == Opt_offgrpjquota)
                return clear_qf_name(sb, GRPQUOTA);
#endif
        /*
         * Options that do not go through the ext4_mount_opts table.  The
         * deprecated ones only warn here and then fall out of the switch to
         * be processed by the table lookup below.
         */
        switch (token) {
        case Opt_noacl:
        case Opt_nouser_xattr:
                ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
                break;
        case Opt_sb:
                return 1;        /* handled by get_sb_block() */
        case Opt_removed:
                ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
                return 1;
        case Opt_abort:
                ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
                return 1;
        case Opt_i_version:
                sb->s_flags |= SB_I_VERSION;
                return 1;
        case Opt_lazytime:
                sb->s_flags |= SB_LAZYTIME;
                return 1;
        case Opt_nolazytime:
                sb->s_flags &= ~SB_LAZYTIME;
                return 1;
        case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
                sb->s_flags |= SB_INLINECRYPT;
#else
                /* Logged as an error, but the option is still accepted. */
                ext4_msg(sb, KERN_ERR, "inline encryption not supported");
#endif
                return 1;
        }

        /* Find the table entry describing this token. */
        for (m = ext4_mount_opts; m->token != Opt_err; m++)
                if (token == m->token)
                        break;

        /* Reaching the Opt_err sentinel means the token is not in the table. */
        if (m->token == Opt_err) {
                ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
                         "or missing value", opt);
                return -1;
        }

        if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Mount option \"%s\" incompatible with ext2", opt);
                return -1;
        }
        if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Mount option \"%s\" incompatible with ext3", opt);
                return -1;
        }

        /*
         * Any argument of a non-string option is parsed as an integer into
         * 'arg'; 'arg' stays 0 when no argument was supplied.
         */
        if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
                return -1;
        if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
                return -1;
        /* Record that the user asked for this option explicitly. */
        if (m->flags & MOPT_EXPLICIT) {
                if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
                        set_opt2(sb, EXPLICIT_DELALLOC);
                } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
                        set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
                } else
                        return -1;
        }
        /* Clear the whole errors= mask before a new value gets set below. */
        if (m->flags & MOPT_CLEAR_ERR)
                clear_opt(sb, ERRORS_MASK);
        if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
                ext4_msg(sb, KERN_ERR, "Cannot change quota "
                         "options when quota turned on");
                return -1;
        }

        /*
         * Per-option handling.  Every arm that does not return explicitly
         * falls through to the final "return 1".
         */
        if (m->flags & MOPT_NOSUPPORT) {
                /* Only logged; the option is not treated as a failure. */
                ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
        } else if (token == Opt_commit) {
                /* commit=0 selects the jbd2 default commit age. */
                if (arg == 0)
                        arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
                else if (arg > INT_MAX / HZ) {
                        /* HZ * arg below would not fit in an int. */
                        ext4_msg(sb, KERN_ERR,
                                 "Invalid commit interval %d, "
                                 "must be smaller than %d",
                                 arg, INT_MAX / HZ);
                        return -1;
                }
                sbi->s_commit_interval = HZ * arg;
        } else if (token == Opt_debug_want_extra_isize) {
                /*
                 * Must be even, at least 4, and no larger than the inode
                 * size minus EXT4_GOOD_OLD_INODE_SIZE.
                 */
                if ((arg & 1) ||
                    (arg < 4) ||
                    (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
                        ext4_msg(sb, KERN_ERR,
                                 "Invalid want_extra_isize %d", arg);
                        return -1;
                }
                sbi->s_want_extra_isize = arg;
        } else if (token == Opt_max_batch_time) {
                sbi->s_max_batch_time = arg;
        } else if (token == Opt_min_batch_time) {
                sbi->s_min_batch_time = arg;
        } else if (token == Opt_inode_readahead_blks) {
                /* Zero is allowed; otherwise a power of two up to 2^30. */
                if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
                        ext4_msg(sb, KERN_ERR,
                                 "EXT4-fs: inode_readahead_blks must be "
                                 "0 or a power of 2 smaller than 2^31");
                        return -1;
                }
                sbi->s_inode_readahead_blks = arg;
        } else if (token == Opt_init_itable) {
                set_opt(sb, INIT_INODE_TABLE);
                /* The argument is optional; use the default multiplier. */
                if (!args->from)
                        arg = EXT4_DEF_LI_WAIT_MULT;
                sbi->s_li_wait_mult = arg;
        } else if (token == Opt_max_dir_size_kb) {
                sbi->s_max_dir_size_kb = arg;
#ifdef CONFIG_EXT4_DEBUG
        } else if (token == Opt_fc_debug_max_replay) {
                sbi->s_fc_debug_max_replay = arg;
#endif
        } else if (token == Opt_stripe) {
                sbi->s_stripe = arg;
        } else if (token == Opt_resuid) {
                /* The id is interpreted in the caller's user namespace. */
                uid = make_kuid(current_user_ns(), arg);
                if (!uid_valid(uid)) {
                        ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
                        return -1;
                }
                sbi->s_resuid = uid;
        } else if (token == Opt_resgid) {
                /* The id is interpreted in the caller's user namespace. */
                gid = make_kgid(current_user_ns(), arg);
                if (!gid_valid(gid)) {
                        ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
                        return -1;
                }
                sbi->s_resgid = gid;
        } else if (token == Opt_journal_dev) {
                if (is_remount) {
                        ext4_msg(sb, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -1;
                }
                *journal_devnum = arg;
        } else if (token == Opt_journal_path) {
                char *journal_path;
                struct inode *journal_inode;
                struct path path;
                int error;

                if (is_remount) {
                        ext4_msg(sb, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -1;
                }
                /* Private copy of the path; freed on every exit below. */
                journal_path = match_strdup(&args[0]);
                if (!journal_path) {
                        ext4_msg(sb, KERN_ERR, "error: could not dup "
                                "journal device string");
                        return -1;
                }

                error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
                if (error) {
                        ext4_msg(sb, KERN_ERR, "error: could not find "
                                "journal device path: error %d", error);
                        kfree(journal_path);
                        return -1;
                }

                /* The path must name a block device. */
                journal_inode = d_inode(path.dentry);
                if (!S_ISBLK(journal_inode->i_mode)) {
                        ext4_msg(sb, KERN_ERR, "error: journal path %s "
                                "is not a block device", journal_path);
                        path_put(&path);
                        kfree(journal_path);
                        return -1;
                }

                /* Hand back the device number rather than the path. */
                *journal_devnum = new_encode_dev(journal_inode->i_rdev);
                path_put(&path);
                kfree(journal_path);
        } else if (token == Opt_journal_ioprio) {
                if (arg > 7) {
                        ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
                                 " (must be 0-7)");
                        return -1;
                }
                *journal_ioprio =
                        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
        } else if (token == Opt_test_dummy_encryption) {
                return ext4_set_test_dummy_encryption(sb, opt, &args[0],
                                                      is_remount);
        } else if (m->flags & MOPT_DATAJ) {
                /*
                 * On remount the data mode may only be restated, not
                 * changed; without a journal the option is ignored with a
                 * warning.
                 */
                if (is_remount) {
                        if (!sbi->s_journal)
                                ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
                        else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
                                ext4_msg(sb, KERN_ERR,
                                         "Cannot change data mode on remount");
                                return -1;
                        }
                } else {
                        clear_opt(sb, DATA_FLAGS);
                        sbi->s_mount_opt |= m->mount_opt;
                }
#ifdef CONFIG_QUOTA
        } else if (m->flags & MOPT_QFMT) {
                if (sb_any_quota_loaded(sb) &&
                    sbi->s_jquota_fmt != m->mount_opt) {
                        ext4_msg(sb, KERN_ERR, "Cannot change journaled "
                                 "quota options when quota turned on");
                        return -1;
                }
                /* With the quota feature the format option is ignored. */
                if (ext4_has_feature_quota(sb)) {
                        ext4_msg(sb, KERN_INFO,
                                 "Quota format mount options ignored "
                                 "when QUOTA feature is enabled");
                        return 1;
                }
                sbi->s_jquota_fmt = m->mount_opt;
#endif
        } else if (token == Opt_dax || token == Opt_dax_always ||
                   token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
                /*
                 * On remount each dax variant is accepted only when it
                 * matches the mode already in effect; all mismatches share
                 * the fail_dax_change_remount error path.
                 */
                switch (token) {
                case Opt_dax:
                case Opt_dax_always:
                        if (is_remount &&
                            (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                             (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
                        fail_dax_change_remount:
                                ext4_msg(sb, KERN_ERR, "can't change "
                                         "dax mount option while remounting");
                                return -1;
                        }
                        if (is_remount &&
                            (test_opt(sb, DATA_FLAGS) ==
                             EXT4_MOUNT_JOURNAL_DATA)) {
                                    ext4_msg(sb, KERN_ERR, "can't mount with "
                                             "both data=journal and dax");
                                    return -1;
                        }
                        ext4_msg(sb, KERN_WARNING,
                                "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
                        sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
                        sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
                        break;
                case Opt_dax_never:
                        if (is_remount &&
                            (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                             (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
                                goto fail_dax_change_remount;
                        sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
                        sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
                        break;
                case Opt_dax_inode:
                        if (is_remount &&
                            ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                             (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                             !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
                                goto fail_dax_change_remount;
                        sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
                        sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
                        /* Strictly for printing options */
                        sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
                        break;
                }
#else
                /* Without CONFIG_FS_DAX every dax variant is rejected. */
                ext4_msg(sb, KERN_INFO, "dax option not supported");
                sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
                sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
                return -1;
#endif
        } else if (token == Opt_data_err_abort) {
                sbi->s_mount_opt |= m->mount_opt;
        } else if (token == Opt_data_err_ignore) {
                sbi->s_mount_opt &= ~m->mount_opt;
        } else {
                /*
                 * Generic flag option: an absent argument counts as 1, a
                 * MOPT_CLEAR entry inverts the sense, and the resulting
                 * value sets or clears m->mount_opt in s_mount_opt2
                 * (MOPT_2) or s_mount_opt.
                 */
                if (!args->from)
                        arg = 1;
                if (m->flags & MOPT_CLEAR)
                        arg = !arg;
                else if (unlikely(!(m->flags & MOPT_SET))) {
                        /* Entry is neither SET nor CLEAR: a table bug. */
                        ext4_msg(sb, KERN_WARNING,
                                 "buggy handling of option %s", opt);
                        WARN_ON(1);
                        return -1;
                }
                if (m->flags & MOPT_2) {
                        if (arg != 0)
                                sbi->s_mount_opt2 |= m->mount_opt;
                        else
                                sbi->s_mount_opt2 &= ~m->mount_opt;
                } else {
                        if (arg != 0)
                                sbi->s_mount_opt |= m->mount_opt;
                        else
                                sbi->s_mount_opt &= ~m->mount_opt;
                }
        }
        return 1;
}

/*
 * Parse a comma separated mount option string and apply it to @sb.
 *
 * @options is split in place with strsep().  Each non-empty piece is
 * tokenized and handed to handle_mount_opt(); afterwards the combined
 * result is checked for consistency (project quota feature, mixing of old
 * and journaled quota formats).
 *
 * Returns 1 on success (including a NULL @options) and 0 on failure.
 */
static int parse_options(char *options, struct super_block *sb,
                         unsigned long *journal_devnum,
                         unsigned int *journal_ioprio,
                         int is_remount)
{
        struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
        char *opt, __maybe_unused *usr_file, __maybe_unused *grp_file;
        substring_t args[MAX_OPT_ARGS];
        int token;

        if (!options)
                return 1;

        for (opt = strsep(&options, ","); opt != NULL;
             opt = strsep(&options, ",")) {
                if (*opt == '\0')
                        continue;
                /*
                 * Reset the first argument slot so the handler can tell
                 * whether an (optional) argument was actually supplied.
                 */
                args[0].from = NULL;
                args[0].to = NULL;
                token = match_token(opt, tokens, args);
                if (handle_mount_opt(sb, opt, token, args, journal_devnum,
                                     journal_ioprio, is_remount) < 0)
                        return 0;
        }
#ifdef CONFIG_QUOTA
        /*
         * Only project quota is tied to its feature flag here.  'usrquota'
         * and 'grpquota' stay usable without the quota feature so that
         * legacy quota files keep working.
         */
        if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
                ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
                         "Cannot enable project quota enforcement.");
                return 0;
        }
        usr_file = get_qf_name(sb, sbi, USRQUOTA);
        grp_file = get_qf_name(sb, sbi, GRPQUOTA);
        if (usr_file || grp_file) {
                /* A journaled quota file overrides the plain option. */
                if (usr_file && test_opt(sb, USRQUOTA))
                        clear_opt(sb, USRQUOTA);

                if (grp_file && test_opt(sb, GRPQUOTA))
                        clear_opt(sb, GRPQUOTA);

                /* Anything still set means the two styles were mixed. */
                if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
                        ext4_msg(sb, KERN_ERR, "old and new quota "
                                        "format mixing");
                        return 0;
                }

                if (!sbi->s_jquota_fmt) {
                        ext4_msg(sb, KERN_ERR, "journaled quota format "
                                        "not specified");
                        return 0;
                }
        }
#endif
        if (test_opt(sb, DIOREAD_NOLOCK)) {
                int blocksize =
                        BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);

                /* Warn only; the mount still proceeds. */
                if (blocksize < PAGE_SIZE)
                        ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
                                 "experimental mount option 'dioread_nolock' "
                                 "for blocksize < PAGE_SIZE");
        }
        return 1;
}

/*
 * Append the journaled quota settings of @sb to @seq: the quota format
 * (",jqfmt=...") when one is configured, followed by the user and group
 * quota file names when they are set.  Compiles to nothing without
 * CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
                                           struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *user_file, *group_file;

        if (sbi->s_jquota_fmt) {
                const char *label;

                /* An unrecognized format value prints an empty name. */
                if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
                        label = "vfsold";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
                        label = "vfsv0";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
                        label = "vfsv1";
                else
                        label = "";

                seq_printf(seq, ",jqfmt=%s", label);
        }

        /* The file name pointers are read and used under rcu_read_lock(). */
        rcu_read_lock();
        user_file = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
        group_file = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
        if (user_file != NULL)
                seq_show_option(seq, "usrjquota", user_file);
        if (group_file != NULL)
                seq_show_option(seq, "grpjquota", group_file);
        rcu_read_unlock();
#endif
}

/*
 * Return the pattern string of the first entry in tokens[] that matches
 * @token and whose pattern contains no '='.  When nothing matches, the
 * pattern of the terminating Opt_err entry is returned.
 */
static const char *token2str(int token)
{
        const struct match_token *entry = tokens;

        while (entry->token != Opt_err) {
                if (entry->token == token && !strchr(entry->pattern, '='))
                        return entry->pattern;
                entry++;
        }

        return entry->pattern;
}

/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
                              int nodefs)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int def_errors, def_mount_opt = sbi->s_def_mount_opt;
        const struct mount_opts *m;
        char sep = nodefs ? '\n' : ',';

#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

        if (sbi->s_sb_block != 1)
                SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

        for (m = ext4_mount_opts; m->token != Opt_err; m++) {
                int want_set = m->flags & MOPT_SET;
                if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
                    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
                        continue;
                if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
                        continue; /* skip if same as the default */
                if ((want_set &&
                     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
                    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
                        continue; /* select Opt_noFoo vs Opt_Foo */
                SEQ_OPTS_PRINT("%s", token2str(m->token));
        }

        if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
            le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
                SEQ_OPTS_PRINT("resuid=%u",
                                from_kuid_munged(&init_user_ns, sbi->s_resuid));
        if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
                SEQ_OPTS_PRINT("resgid=%u",
                                from_kgid_munged(&init_user_ns, sbi->s_resgid));
        def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
        if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
                SEQ_OPTS_PUTS("errors=remount-ro");
        if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
                SEQ_OPTS_PUTS("errors=continue");
        if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
                SEQ_OPTS_PUTS("errors=panic");
        if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
                SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
        if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
                SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
        if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
                SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
        if (sb->s_flags & SB_I_VERSION)
                SEQ_OPTS_PUTS("i_version");
        if (nodefs || sbi->s_stripe)
                SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
        if (nodefs || EXT4_MOUNT_DATA_FLAGS &
                        (sbi->s_mount_opt ^ def_mount_opt)) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                        SEQ_OPTS_PUTS("data=journal");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                        SEQ_OPTS_PUTS("data=ordered");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                        SEQ_OPTS_PUTS("data=writeback");
        }
        if (nodefs ||
            sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
                SEQ_OPTS_PRINT("inode_readahead_blks=%u",
                               sbi->s_inode_readahead_blks);

        if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
                       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
                SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
        if (nodefs || sbi->s_max_dir_size_kb)
                SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
        if (test_opt(sb, DATA_ERR_ABORT))
                SEQ_OPTS_PUTS("data_err=abort");

        fscrypt_show_test_dummy_encryption(seq, sep, sb);

        if (sb->s_flags & SB_INLINECRYPT)
                SEQ_OPTS_PUTS("inlinecrypt");

        if (test_opt(sb, DAX_ALWAYS)) {
                if (IS_EXT2_SB(sb))
                        SEQ_OPTS_PUTS("dax");
                else
                        SEQ_OPTS_PUTS("dax=always");
        } else if (test_opt2(sb, DAX_NEVER)) {
                SEQ_OPTS_PUTS("dax=never");
        } else if (test_opt2(sb, DAX_INODE)) {
                SEQ_OPTS_PUTS("dax=inode");
        }
        ext4_show_quota_options(seq, sb);
        return 0;
}

/*
 * Show the mount options of the filesystem that @root belongs to,
 * comma separated and with default-valued options omitted (nodefs == 0).
 */
static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
        struct super_block *sb = root->d_sb;

        return _ext4_show_options(seq, sb, 0);
}

/*
 * seq_file show routine listing every mount option of the superblock
 * stored in seq->private, one per line: "ro" or "rw" first, then the
 * full option list (nodefs == 1), then a trailing newline.
 *
 * Returns the result of _ext4_show_options().
 */
int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
        struct super_block *sb = seq->private;
        int rc;

        if (sb_rdonly(sb))
                seq_puts(seq, "ro");
        else
                seq_puts(seq, "rw");

        rc = _ext4_show_options(seq, sb, 1);
        seq_puts(seq, "\n");

        return rc;
}

static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                            int read_only)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err = 0;

        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
                ext4_msg(sb, KERN_ERR, "revision level too high, "
                         "forcing read-only mode");
                err = -EROFS;
                goto done;
        }
        if (read_only)
                goto done;
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
                         "running e2fsck is recommended");
        else if (sbi->s_mount_state & EXT4_ERROR_FS)
                ext4_msg(sb, KERN_WARNING,
                         "warning: mounting fs with errors, "
                         "running e2fsck is recommended");
        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
                ext4_msg(sb, KERN_WARNING,
                         "warning: maximal mount count reached, "
                         "running e2fsck is recommended");
        else if (le32_to_cpu(es->s_checkinterval) &&
                 (ext4_get_tstamp(es, s_lastcheck) +
                  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
                ext4_msg(sb, KERN_WARNING,
                         "warning: checktime reached, "
                         "running e2fsck is recommended");
        if (!sbi->s_journal)
                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        ext4_update_tstamp(es, s_mtime);
        if (sbi->s_journal)
                ext4_set_feature_journal_needs_recovery(sb);

        err = ext4_commit_super(sb, 1);
done:
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
                        EXT4_INODES_PER_GROUP(sb),
                        sbi->s_mount_opt, sbi->s_mount_opt2);

        cleancache_init_fs(sb);
        return err;
}

/*
 * Make sure the flex group pointer array covers @ngroup block groups.
 *
 * Does nothing when flex_bg is disabled (s_log_groups_per_flex == 0) or
 * the array is already large enough.  Otherwise a larger pointer array is
 * allocated, new flex_groups entries are allocated for the added slots,
 * the existing pointers are copied over, and the new array is published
 * with rcu_assign_pointer(); the old array is handed to
 * ext4_kvfree_array_rcu().
 *
 * Returns 0 on success or -ENOMEM.
 */
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct flex_groups **prev, **grown;
        int size, idx, undo;

        if (!sbi->s_log_groups_per_flex)
                return 0;

        size = ext4_flex_group(sbi, ngroup - 1) + 1;
        if (size <= sbi->s_flex_groups_allocated)
                return 0;

        grown = kvzalloc(roundup_pow_of_two(size *
                         sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
        if (!grown) {
                ext4_msg(sb, KERN_ERR,
                         "not enough memory for %d flex group pointers", size);
                return -ENOMEM;
        }

        /* Only the slots beyond the current allocation need new entries. */
        for (idx = sbi->s_flex_groups_allocated; idx < size; idx++) {
                grown[idx] = kvzalloc(roundup_pow_of_two(
                                      sizeof(struct flex_groups)),
                                      GFP_KERNEL);
                if (!grown[idx])
                        goto free_new_entries;
        }

        rcu_read_lock();
        prev = rcu_dereference(sbi->s_flex_groups);
        if (prev)
                memcpy(grown, prev,
                       (sbi->s_flex_groups_allocated *
                        sizeof(struct flex_groups *)));
        rcu_read_unlock();

        rcu_assign_pointer(sbi->s_flex_groups, grown);
        sbi->s_flex_groups_allocated = size;
        if (prev)
                ext4_kvfree_array_rcu(prev);
        return 0;

free_new_entries:
        /* Release the entries allocated so far, then the array itself. */
        for (undo = sbi->s_flex_groups_allocated; undo < idx; undo++)
                kvfree(grown[undo]);
        kvfree(grown);
        ext4_msg(sb, KERN_ERR,
                 "not enough memory for %d flex groups", size);
        return -ENOMEM;
}

/*
 * Initialize the per-flex-group counters from the group descriptors.
 *
 * Copies s_log_groups_per_flex from the on-disk superblock; a value
 * outside 1..31 disables flex groups (the field is reset to 0) and is
 * treated as success.  Otherwise the flex group array is sized for all
 * block groups and each group's free inode, free cluster and used
 * directory counts are added to its flex group.
 *
 * Returns 1 on success and 0 when the array could not be allocated.
 */
static int ext4_fill_flex_info(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;

        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
        if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
                sbi->s_log_groups_per_flex = 0;
                return 1;
        }

        if (ext4_alloc_flex_bg_array(sb, sbi->s_groups_count))
                return 0;

        for (group = 0; group < sbi->s_groups_count; group++) {
                struct ext4_group_desc *desc;
                struct flex_groups *flex;
                ext4_group_t flex_idx;

                desc = ext4_get_group_desc(sb, group, NULL);
                flex_idx = ext4_flex_group(sbi, group);
                flex = sbi_array_rcu_deref(sbi, s_flex_groups, flex_idx);

                atomic_add(ext4_free_inodes_count(sb, desc),
                           &flex->free_inodes);
                atomic64_add(ext4_free_group_clusters(sb, desc),
                             &flex->free_clusters);
                atomic_add(ext4_used_dirs_count(sb, desc), &flex->used_dirs);
        }

        return 1;
}

/*
 * Compute the value that belongs in @gdp->bg_checksum for group @block_group.
 *
 * With metadata_csum: the low 16 bits of ext4_chksum() seeded with
 * s_csum_seed and run over the little-endian group number and the
 * descriptor, with a zeroed 16-bit value substituted for bg_checksum.
 *
 * Otherwise, if the gdt_csum feature is set: crc16 over the superblock
 * UUID, the little-endian group number and the descriptor up to
 * bg_checksum; the bytes after bg_checksum are only included when the
 * 64bit feature is set and the descriptor is long enough.
 *
 * Returns 0 when neither checksum feature is enabled.
 */
static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
                                   struct ext4_group_desc *gdp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __le32 group_le = cpu_to_le32(block_group);
        __u8 *desc_bytes = (__u8 *)gdp;
        int csum_off = offsetof(struct ext4_group_desc, bg_checksum);
        int tail_off;
        __u16 crc = 0;

        if (ext4_has_metadata_csum(sbi->s_sb)) {
                /* Use new metadata_csum algorithm */
                __u16 zero_csum = 0;
                __u32 csum32;

                csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&group_le,
                                     sizeof(group_le));
                csum32 = ext4_chksum(sbi, csum32, desc_bytes, csum_off);
                /* The checksum field itself is hashed as zero. */
                csum32 = ext4_chksum(sbi, csum32, (__u8 *)&zero_csum,
                                     sizeof(zero_csum));
                tail_off = csum_off + sizeof(zero_csum);
                if (tail_off < sbi->s_desc_size)
                        csum32 = ext4_chksum(sbi, csum32,
                                             desc_bytes + tail_off,
                                             sbi->s_desc_size - tail_off);

                crc = csum32 & 0xFFFF;
                return cpu_to_le16(crc);
        }

        /* old crc16 code */
        if (!ext4_has_feature_gdt_csum(sb))
                return 0;

        crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
        crc = crc16(crc, (__u8 *)&group_le, sizeof(group_le));
        crc = crc16(crc, desc_bytes, csum_off);

        /* Skip the checksum field, then cover the rest of the descriptor. */
        tail_off = csum_off + sizeof(gdp->bg_checksum);
        if (ext4_has_feature_64bit(sb) && tail_off < sbi->s_desc_size)
                crc = crc16(crc, desc_bytes + tail_off,
                            sbi->s_desc_size - tail_off);

        return cpu_to_le16(crc);
}

/*
 * Check the stored checksum of a group descriptor.
 *
 * Returns 1 if group descriptor checksums are not in use or if
 * @gdp->bg_checksum matches the freshly computed value, 0 on mismatch.
 */
int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
                                struct ext4_group_desc *gdp)
{
        if (!ext4_has_group_desc_csum(sb))
                return 1;

        return gdp->bg_checksum == ext4_group_desc_csum(sb, block_group, gdp);
}

/*
 * Recompute and store @gdp->bg_checksum.  Does nothing when group
 * descriptor checksums are not in use on @sb.
 */
void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
                              struct ext4_group_desc *gdp)
{
        if (ext4_has_group_desc_csum(sb))
                gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}

/*
 * Sanity-check every block group descriptor of @sb.
 *
 * Called at mount-time, super-block is locked.
 *
 * For each group the block bitmap, inode bitmap and inode table locations
 * are checked against the superblock block (@sb_block), the range of group
 * descriptor blocks following it, and the block range the metadata is
 * allowed to live in.  Finally the descriptor checksum is verified under
 * the group lock.
 *
 * Overlaps with the superblock / descriptor blocks and checksum mismatches
 * are always logged but only fail the check when the mount is not
 * read-only.  Locations outside the permitted block range always fail.
 *
 * @first_not_zeroed: optional; if non-NULL it receives the index of the
 *                    first group whose EXT4_BG_INODE_ZEROED flag is clear,
 *                    or s_groups_count if every group has the flag set.
 *
 * Returns 1 if the descriptors are acceptable, 0 otherwise.
 */
static int ext4_check_descriptors(struct super_block *sb,
                                  ext4_fsblk_t sb_block,
                                  ext4_group_t *first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        ext4_fsblk_t last_block;
        /* Last block of the descriptor range that follows the superblock. */
        ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
        ext4_fsblk_t block_bitmap;
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
        /* grp == s_groups_count means "no un-zeroed group found yet". */
        ext4_group_t i, grp = sbi->s_groups_count;

        if (ext4_has_feature_flex_bg(sb))
                flexbg_flag = 1;

        ext4_debug("Checking group descriptors");

        for (i = 0; i < sbi->s_groups_count; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

                /*
                 * With flex_bg (or for the last group) metadata may sit
                 * anywhere up to the end of the filesystem; otherwise it
                 * must stay inside this group's own block range.
                 */
                if (i == sbi->s_groups_count - 1 || flexbg_flag)
                        last_block = ext4_blocks_count(sbi->s_es) - 1;
                else
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);

                /* Remember the first group without EXT4_BG_INODE_ZEROED. */
                if ((grp == sbi->s_groups_count) &&
                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        grp = i;

                /* --- block bitmap --- */
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (block_bitmap >= sb_block + 1 &&
                    block_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                /* Out-of-range locations fail even on read-only mounts. */
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
                               "(block %llu)!", i, block_bitmap);
                        return 0;
                }

                /* --- inode bitmap --- */
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap >= sb_block + 1 &&
                    inode_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
                               "(block %llu)!", i, inode_bitmap);
                        return 0;
                }

                /* --- inode table (spans s_itb_per_group blocks) --- */
                inode_table = ext4_inode_table(sb, gdp);
                if (inode_table == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table >= sb_block + 1 &&
                    inode_table <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode table for group %u not in group "
                               "(block %llu)!", i, inode_table);
                        return 0;
                }

                /* Verify the descriptor checksum under the group lock. */
                ext4_lock_group(sb, i);
                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
                                 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!sb_rdonly(sb)) {
                                ext4_unlock_group(sb, i);
                                return 0;
                        }
                }
                ext4_unlock_group(sb, i);

                /* Without flex_bg, advance to the next group's first block. */
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
        if (NULL != first_not_zeroed)
                *first_not_zeroed = grp;
        return 1;
}

/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
 * the superblock) which were deleted from all directories, but held open by
 * a process at the time of a crash.  We walk the list and try to delete these
 * inodes at recovery time (only with a read-write filesystem).
 *
 * In order to keep the orphan inode chain consistent during traversal (in
 * case of crash during recovery), we link each inode into the superblock
 * orphan list_head and handle it the same way as an inode deletion during
 * normal operation (which journals the operations for us).
 *
 * We only do an iget() and an iput() on each inode, which is very safe if we
 * accidentally point at an in-use or already deleted inode.  The worst that
 * can happen in this case is that we get a "bit already cleared" message from
 * ext4_free_inode().  The only reason we would point at a wrong inode is if
 * e2fsck was run on this filesystem, and it must have already done the orphan
 * inode cleanup for us, so we can safely abort without any further action.
 *
 * The function returns early, without touching the list, when there are no
 * orphans, when the block device is read-only, or when the feature set would
 * not permit a read-write mount.  On a filesystem flagged with
 * EXT4_ERROR_FS the list is not processed; it is cleared unless the mount
 * is read-only.  sb->s_flags is restored to its entry value before
 * returning from the main path.
 */
static void ext4_orphan_cleanup(struct super_block *sb,
                                struct ext4_super_block *es)
{
        /* Saved so SB_RDONLY can be temporarily cleared and later restored. */
        unsigned int s_flags = sb->s_flags;
        int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
        /* Set when this function turned quotas on and must turn them off. */
        int quota_update = 0;
        int i;
#endif
        if (!es->s_last_orphan) {
                jbd_debug(4, "no orphan inodes to clean up\n");
                return;
        }

        if (bdev_read_only(sb->s_bdev)) {
                ext4_msg(sb, KERN_ERR, "write access "
                        "unavailable, skipping orphan cleanup");
                return;
        }

        /* Check if feature set would not allow a r/w mount */
        if (!ext4_feature_set_ok(sb, 0)) {
                ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
                         "unknown ROCOMPAT features");
                return;
        }

        if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                /* don't clear list on RO mount w/ errors */
                if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
                        ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
                                  "clearing orphan list.\n");
                        es->s_last_orphan = 0;
                }
                jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
                return;
        }

        /* Temporarily drop SB_RDONLY so the cleanup below may write. */
        if (s_flags & SB_RDONLY) {
                ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
                sb->s_flags &= ~SB_RDONLY;
        }
#ifdef CONFIG_QUOTA
        /*
         * Turn on quotas which were not enabled for read-only mounts if
         * filesystem has quota feature, so that they are updated correctly.
         *
         * Note the test uses the saved s_flags: sb->s_flags has already had
         * SB_RDONLY cleared above.
         */
        if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
                /* NOTE: this local shadows the function-level 'ret'. */
                int ret = ext4_enable_quotas(sb);

                if (!ret)
                        quota_update = 1;
                else
                        ext4_msg(sb, KERN_ERR,
                                "Cannot turn on quotas: error %d", ret);
        }

        /* Turn on journaled quotas used for old style */
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                if (EXT4_SB(sb)->s_qf_names[i]) {
                        /* NOTE: this local shadows the function-level 'ret'. */
                        int ret = ext4_quota_on_mount(sb, i);

                        if (!ret)
                                quota_update = 1;
                        else
                                ext4_msg(sb, KERN_ERR,
                                        "Cannot turn on journaled "
                                        "quota: type %d: error %d", i, ret);
                }
        }
#endif

        /*
         * Process orphans until es->s_last_orphan becomes zero.  The loop
         * body never advances the field itself, so termination relies on
         * the calls below (or the error paths) updating or clearing it.
         */
        while (es->s_last_orphan) {
                struct inode *inode;

                /*
                 * We may have encountered an error during cleanup; if
                 * so, skip the rest.
                 */
                if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                        jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
                        es->s_last_orphan = 0;
                        break;
                }

                inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
                if (IS_ERR(inode)) {
                        /* Cannot load the inode: abandon the rest of the list. */
                        es->s_last_orphan = 0;
                        break;
                }

                /* Put the inode on the in-core orphan list. */
                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
                dquot_initialize(inode);
                if (inode->i_nlink) {
                        /* Still linked: finish an interrupted truncate. */
                        if (test_opt(sb, DEBUG))
                                ext4_msg(sb, KERN_DEBUG,
                                        "%s: truncating inode %lu to %lld bytes",
                                        __func__, inode->i_ino, inode->i_size);
                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
                                  inode->i_ino, inode->i_size);
                        inode_lock(inode);
                        truncate_inode_pages(inode->i_mapping, inode->i_size);
                        ret = ext4_truncate(inode);
                        if (ret) {
                                /*
                                 * We need to clean up the in-core orphan list
                                 * manually if ext4_truncate() failed to get a
                                 * transaction handle.
                                 */
                                ext4_orphan_del(NULL, inode);
                                ext4_std_error(inode->i_sb, ret);
                        }
                        inode_unlock(inode);
                        nr_truncates++;
                } else {
                        /* Unlinked: the final iput() below deletes it. */
                        if (test_opt(sb, DEBUG))
                                ext4_msg(sb, KERN_DEBUG,
                                        "%s: deleting unreferenced inode %lu",
                                        __func__, inode->i_ino);
                        jbd_debug(2, "deleting unreferenced inode %lu\n",
                                  inode->i_ino);
                        nr_orphans++;
                }
                iput(inode);  /* The delete magic happens here! */
        }

/* Expands to two arguments: the count and "" or "s" as its plural suffix. */
#define PLURAL(x) (x), ((x) == 1) ? "" : "s"

        if (nr_orphans)
                ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
                       PLURAL(nr_orphans));
        if (nr_truncates)
                ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
                       PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
        /* Turn off quotas if they were enabled for orphan cleanup */
        if (quota_update) {
                for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                        if (sb_dqopt(sb)->files[i])
                                dquot_quota_off(sb, i);
                }
        }
#endif
        sb->s_flags = s_flags; /* Restore SB_RDONLY status */
}

/*
 * Maximal extent format file size.
 *
 * The logical block number at s_maxbytes has to fit the 32-bit extent
 * start container (ee_block).  Because an extent is stored as a start
 * block plus a length, and the length of an extent covering the whole
 * file is one larger than the highest block number, the limit is lowered
 * by one filesystem block: (2^32 - 1) << blkbits.
 *
 * That value is then clamped to MAX_LFS_FILESIZE, or, when huge files are
 * not supported, to 2^32 - 1 512-byte sectors rounded down to a whole
 * number of filesystem blocks.
 *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 */
static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
        loff_t cap = MAX_LFS_FILESIZE;
        loff_t max_bytes;

        BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));

        if (!has_huge_files) {
                /* 2^32 - 1 sectors, expressed in whole filesystem blocks */
                cap = (1LL << 32) - 1;
                cap >>= (blkbits - 9);
                cap <<= blkbits;
        }

        /*
         * 32-bit extent-start container, ee_block; one block short of
         * 2^32 so ee_len can cover an extent of the maximum file size.
         */
        max_bytes = (1LL << 32) - 1;
        max_bytes <<= blkbits;

        /* Sanity check against vm- & vfs- imposed limits */
        return max_bytes > cap ? cap : max_bytes;
}

/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
        unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
        int meta_blocks;

        /*
         * This is calculated to be the largest file size for a dense, block
         * mapped file such that the file's total number of 512-byte sectors,
         * including data and all indirect blocks, does not exceed (2^48 - 1).
         *
         * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
         * number of 512-byte sectors of the file.
         */
        if (!has_huge_files) {
                /*
                 * !has_huge_files or implies that the inode i_block field
                 * represents total file blocks in 2^32 512-byte sectors ==
                 * size of vfs inode i_blocks * 8
                 */
                upper_limit = (1LL << 32) - 1;

                /* total blocks in file system block size */
                upper_limit >>= (bits - 9);

        } else {
                /*
                 * We use 48 bit ext4_inode i_blocks
                 * With EXT4_HUGE_FILE_FL set the i_blocks
                 * represent total number of blocks in
                 * file system block size
                 */
                upper_limit = (1LL << 48) - 1;

        }

        /* indirect blocks */
        meta_blocks = 1;
        /* double indirect blocks */
        meta_blocks += 1 + (1LL << (bits-2));
        /* tripple indirect blocks */
        meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));

        upper_limit -= meta_blocks;
        upper_limit <<= bits;

        res += 1LL << (bits-2);
        res += 1LL << (2*(bits-2));
        res += 1LL << (3*(bits-2));
        res <<= bits;
        if (res > upper_limit)
                res = upper_limit;

        if (res > MAX_LFS_FILESIZE)
                res = MAX_LFS_FILESIZE;

        return (loff_t)res;
}

/*
 * Return the block number of group descriptor block number @nr.
 *
 * Without meta_bg, or for descriptor blocks below s_first_meta_bg, the
 * descriptor blocks directly follow the superblock at @logical_sb_block.
 * Otherwise the block lives at the start of group (s_desc_per_block * nr),
 * after that group's superblock copy if it has one.
 */
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
                                   ext4_fsblk_t logical_sb_block, int nr)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t meta_start = le32_to_cpu(sbi->s_es->s_first_meta_bg);
        ext4_group_t grp;
        int skip;

        if (!ext4_has_feature_meta_bg(sb) || nr < meta_start)
                return logical_sb_block + nr + 1;

        grp = sbi->s_desc_per_block * nr;
        skip = ext4_bg_has_super(sb, grp) ? 1 : 0;

        /*
         * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
         * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
         * on modern mke2fs or blksize > 1k on older mke2fs) then we must
         * compensate.
         */
        if (sb->s_blocksize == 1024 && nr == 0 &&
            le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
                skip++;

        return (skip + ext4_group_first_block_no(sb, grp));
}

/**
 * ext4_get_stripe_size: Get the stripe size.
 * @sbi: In memory super block info
 *
 * Candidates are tried in this order, and the first one that is non-zero
 * and not larger than the number of blocks per group is used:
 * the mount option value (sbi->s_stripe), the superblock's
 * s_raid_stripe_width, then the superblock's s_raid_stride.
 * Allocator needs it be less than blocks per group.
 *
 * Return: the selected stripe size, or 0 if no candidate qualifies or the
 * selected value is 1 (a stripe of one block makes no sense, so stripe
 * handling is turned off).
 */
static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
{
        unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
        unsigned long stripe_width =
                        le32_to_cpu(sbi->s_es->s_raid_stripe_width);
        /*
         * Same type as the candidates and the return value: holding the
         * result in an int would narrow values above INT_MAX to a negative
         * number, which the "<= 1" test below would then turn into 0.
         */
        unsigned long ret;

        if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
                ret = sbi->s_stripe;
        else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
                ret = stripe_width;
        else if (stride && stride <= sbi->s_blocks_per_group)
                ret = stride;
        else
                ret = 0;

        /*
         * If the stripe width is 1, this makes no sense and
         * we set it to 0 to turn off stripe handling code.
         */
        if (ret <= 1)
                ret = 0;

        return ret;
}

/*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
 * Returns 1 if this filesystem can be mounted as requested,
 * 0 if it cannot be.
 */
static int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
        if (ext4_has_unknown_ext4_incompat_features(sb)) {
                ext4_msg(sb, KERN_ERR,
                        "Couldn't mount because of "
                        "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                return 0;
        }

#ifndef CONFIG_UNICODE
        if (ext4_has_feature_casefold(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Filesystem with casefold feature cannot be "
                         "mounted without CONFIG_UNICODE");
                return 0;
        }
#endif

        if (readonly)
                return 1;

        if (ext4_has_feature_readonly(sb)) {
                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
                sb->s_flags |= SB_RDONLY;
                return 1;
        }

        /* Check that feature set is OK for a read-write mount */
        if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
                         "unsupported optional features (%x)",
                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
                                ~EXT4_FEATURE_RO_COMPAT_SUPP));
                return 0;
        }
        if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Can't support bigalloc feature without "
                         "extents feature\n");
                return 0;
        }
        if (ext4_has_feature_bigalloc(sb) &&
            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
                ext4_msg(sb, KERN_WARNING,
                         "bad geometry: bigalloc file system with non-zero "
                         "first_data_block\n");
                return 0;
        }

#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
        if (!readonly && (ext4_has_feature_quota(sb) ||
                          ext4_has_feature_project(sb))) {
                ext4_msg(sb, KERN_ERR,
                         "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
                return 0;
        }
#endif  /* CONFIG_QUOTA */
        return 1;
}

/*
 * This function is called once a day if we have errors logged
 * on the file system
 */
static void print_daily_error_info(struct timer_list *t)
{
        struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
        struct super_block *sb = sbi->s_sb;
        struct ext4_super_block *es = sbi->s_es;

        if (es->s_error_count)
                /* fsck newer than v1.41.13 is needed to clean this condition. */
                ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
                         le32_to_cpu(es->s_error_count));
        if (es->s_first_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_first_error_time),
                       (int) sizeof(es->s_first_error_func),
                       es->s_first_error_func,
                       le32_to_cpu(es->s_first_error_line));
                if (es->s_first_error_ino)
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_first_error_ino));
                if (es->s_first_error_block)
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_first_error_block));
                printk(KERN_CONT "\n");
        }
        if (es->s_last_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_last_error_time),
                       (int) sizeof(es->s_last_error_func),
                       es->s_last_error_func,
                       le32_to_cpu(es->s_last_error_line));
                if (es->s_last_error_ino)
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_last_error_ino));
                if (es->s_last_error_block)
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_last_error_block));
                printk(KERN_CONT "\n");
        }
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
}

/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
        struct ext4_group_desc *gdp = NULL;
        struct super_block *sb = elr->lr_super;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        ext4_group_t group = elr->lr_next_group;
        unsigned int prefetch_ios = 0;
        int ret = 0;
        u64 start_time;

        if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
                elr->lr_next_group = ext4_mb_prefetch(sb, group,
                                EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
                if (prefetch_ios)
                        ext4_mb_prefetch_fini(sb, elr->lr_next_group,
                                              prefetch_ios);
                trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
                                            prefetch_ios);
                if (group >= elr->lr_next_group) {
                        ret = 1;
                        if (elr->lr_first_not_zeroed != ngroups &&
                            !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
                                elr->lr_next_group = elr->lr_first_not_zeroed;
                                elr->lr_mode = EXT4_LI_MODE_ITABLE;
                                ret = 0;
                        }
                }
                return ret;
        }

        for (; group < ngroups; group++) {
                gdp = ext4_get_group_desc(sb, group, NULL);
                if (!gdp) {
                        ret = 1;
                        break;
                }

                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        break;
        }

        if (group >= ngroups)
                ret = 1;

        if (!ret) {
                start_time = ktime_get_real_ns();
                ret = ext4_init_inode_table(sb, group,
                                            elr->lr_timeout ? 0 : 1);
                trace_ext4_lazy_itable_init(sb, group);
                if (elr->lr_timeout == 0) {
                        elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
                                EXT4_SB(elr->lr_super)->s_li_wait_mult);
                }
                elr->lr_next_sched = jiffies + elr->lr_timeout;
                elr->lr_next_group = group + 1;
        }
        return ret;
}

/*
 * Unlink @elr from the request list, clear the owning superblock's
 * s_li_request pointer and free the request.  A NULL @elr is ignored.
 * Should be called with li_list_mtx held.
 */
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
        if (elr) {
                list_del(&elr->lr_request);
                EXT4_SB(elr->lr_super)->s_li_request = NULL;
                kfree(elr);
        }
}

/*
 * Drop the lazy-init request of @sb, if any.  Takes ext4_li_mtx and, when
 * ext4_li_info exists, its li_list_mtx around the removal.
 */
static void ext4_unregister_li_request(struct super_block *sb)
{
        mutex_lock(&ext4_li_mtx);
        if (ext4_li_info) {
                mutex_lock(&ext4_li_info->li_list_mtx);
                ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
                mutex_unlock(&ext4_li_info->li_list_mtx);
        }
        mutex_unlock(&ext4_li_mtx);
}

/*
 * Handle of the "ext4lazyinit" kernel thread; assigned from kthread_run()
 * in ext4_run_lazyinit_thread() (may hold an ERR_PTR value on failure).
 */
static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where ext4lazyinit thread lives. It walks
 * through the request list searching for next scheduled filesystem.
 * When such a fs is found, run the lazy initialization request
 * (ext4_run_li_request) and keep track of the time spent in this
 * function. Based on that time we compute next schedule time of
 * the request. When walking through the list is complete, compute
 * next waking time and put itself into sleep.
 *
 * @arg is the struct ext4_lazy_init holding the request list.  The thread
 * exits (returning 0) once the list is found empty under both ext4_li_mtx
 * and li_list_mtx, or after kthread_should_stop() made it clear the list;
 * on exit it frees ext4_li_info and resets it to NULL.
 */
static int ext4_lazyinit_thread(void *arg)
{
        struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
        struct list_head *pos, *n;
        struct ext4_li_request *elr;
        unsigned long next_wakeup, cur;

        BUG_ON(NULL == eli);
        set_freezable();

cont_thread:
        while (true) {
                /* MAX_JIFFY_OFFSET doubles as "no wakeup time chosen yet". */
                next_wakeup = MAX_JIFFY_OFFSET;

                mutex_lock(&eli->li_list_mtx);
                if (list_empty(&eli->li_request_list)) {
                        mutex_unlock(&eli->li_list_mtx);
                        goto exit_thread;
                }
                list_for_each_safe(pos, n, &eli->li_request_list) {
                        int err = 0;
                        int progress = 0;
                        elr = list_entry(pos, struct ext4_li_request,
                                         lr_request);

                        /* Not due yet: just account for its wakeup time. */
                        if (time_before(jiffies, elr->lr_next_sched)) {
                                if (time_before(elr->lr_next_sched, next_wakeup))
                                        next_wakeup = elr->lr_next_sched;
                                continue;
                        }
                        /*
                         * Only trylocks are used here, so a filesystem that
                         * cannot be locked right now is simply retried
                         * later (see the !progress case below).
                         */
                        if (down_read_trylock(&elr->lr_super->s_umount)) {
                                if (sb_start_write_trylock(elr->lr_super)) {
                                        progress = 1;
                                        /*
                                         * We hold sb->s_umount, sb can not
                                         * be removed from the list, it is
                                         * now safe to drop li_list_mtx
                                         */
                                        mutex_unlock(&eli->li_list_mtx);
                                        err = ext4_run_li_request(elr);
                                        sb_end_write(elr->lr_super);
                                        mutex_lock(&eli->li_list_mtx);
                                        /*
                                         * The list may have changed while
                                         * li_list_mtx was dropped; re-read
                                         * the successor of the current node.
                                         */
                                        n = pos->next;
                                }
                                up_read((&elr->lr_super->s_umount));
                        }
                        /* error, remove the lazy_init job */
                        if (err) {
                                ext4_remove_li_request(elr);
                                continue;
                        }
                        /* Could not run: retry after a random delay. */
                        if (!progress) {
                                elr->lr_next_sched = jiffies +
                                        (prandom_u32()
                                         % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
                        }
                        if (time_before(elr->lr_next_sched, next_wakeup))
                                next_wakeup = elr->lr_next_sched;
                }
                mutex_unlock(&eli->li_list_mtx);

                try_to_freeze();

                cur = jiffies;
                /* Work already due (or nothing scheduled): loop again now. */
                if ((time_after_eq(cur, next_wakeup)) ||
                    (MAX_JIFFY_OFFSET == next_wakeup)) {
                        cond_resched();
                        continue;
                }

                schedule_timeout_interruptible(next_wakeup - cur);

                if (kthread_should_stop()) {
                        ext4_clear_request_list();
                        goto exit_thread;
                }
        }

exit_thread:
        /*
         * It looks like the request list is empty, but we need
         * to check it under the li_list_mtx lock, to prevent any
         * additions into it, and of course we should lock ext4_li_mtx
         * to atomically free the list and ext4_li_info, because at
         * this point another ext4 filesystem could be registering
         * new one.
         */
        mutex_lock(&ext4_li_mtx);
        mutex_lock(&eli->li_list_mtx);
        if (!list_empty(&eli->li_request_list)) {
                /* A request was added in the meantime: keep running. */
                mutex_unlock(&eli->li_list_mtx);
                mutex_unlock(&ext4_li_mtx);
                goto cont_thread;
        }
        mutex_unlock(&eli->li_list_mtx);
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        mutex_unlock(&ext4_li_mtx);

        return 0;
}

static void ext4_clear_request_list(void)
{
        struct list_head *pos, *n;
        struct ext4_li_request *elr;

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
                elr = list_entry(pos, struct ext4_li_request,
                                 lr_request);
                ext4_remove_li_request(elr);
        }
        mutex_unlock(&ext4_li_info->li_list_mtx);
}

/*
 * Start the "ext4lazyinit" kthread, passing ext4_li_info as its argument.
 *
 * On success EXT4_LAZYINIT_RUNNING is set in ext4_li_info->li_state and
 * 0 is returned.  If kthread_run() fails, all queued requests are
 * cleared, ext4_li_info is freed and reset to NULL, a message is logged
 * and the error code is returned.
 */
static int ext4_run_lazyinit_thread(void)
{
        int rc;

        ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
                                         ext4_li_info, "ext4lazyinit");
        if (!IS_ERR(ext4_lazyinit_task)) {
                ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
                return 0;
        }

        /* Thread creation failed: undo the lazy-init setup. */
        rc = PTR_ERR(ext4_lazyinit_task);
        ext4_clear_request_list();
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
                         "initialization thread\n",
                         rc);
        return rc;
}

/*
 * Check whether it makes sense to run the inode table init thread.
 *
 * Returns the number of the first group whose descriptor does not have
 * EXT4_BG_INODE_ZEROED set.  If there is no such group, or if group
 * descriptor checksums are not in use, the total number of groups is
 * returned instead.
 */
static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
{
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        ext4_group_t grp;

        if (!ext4_has_group_desc_csum(sb))
                return ngroups;

        for (grp = 0; grp < ngroups; grp++) {
                struct ext4_group_desc *desc;

                desc = ext4_get_group_desc(sb, grp, NULL);
                /* Groups whose descriptor cannot be fetched are skipped. */
                if (desc &&
                    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        return grp;
        }

        return ngroups;
}

/*
 * Allocate the lazy-init bookkeeping structure and publish it through
 * the global ext4_li_info pointer.  The new structure has an empty
 * request list, an initialized li_list_mtx and EXT4_LAZYINIT_QUIT set
 * in li_state.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int ext4_li_info_new(void)
{
        struct ext4_lazy_init *info;

        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (info == NULL)
                return -ENOMEM;

        mutex_init(&info->li_list_mtx);
        INIT_LIST_HEAD(&info->li_request_list);
        info->li_state |= EXT4_LAZYINIT_QUIT;

        ext4_li_info = info;
        return 0;
}

/*
 * Allocate and initialize a lazy-init request for @sb.
 *
 * @start is recorded as lr_first_not_zeroed.  When the
 * PREFETCH_BLOCK_BITMAPS option is set the request starts in
 * EXT4_LI_MODE_PREFETCH_BBITMAP mode; otherwise it starts in
 * EXT4_LI_MODE_ITABLE mode with lr_next_group set to @start.
 *
 * Returns the new request, or NULL if the allocation fails.
 */
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
                                            ext4_group_t start)
{
        struct ext4_li_request *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (req == NULL)
                return NULL;

        req->lr_super = sb;
        req->lr_first_not_zeroed = start;
        if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS)) {
                req->lr_mode = EXT4_LI_MODE_ITABLE;
                req->lr_next_group = start;
        } else {
                req->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
        }

        /*
         * Pick a random first schedule time, at most
         * EXT4_DEF_LI_MAX_START_DELAY seconds from now, so that
         * inode table initialization requests are spread out better.
         */
        req->lr_next_sched = jiffies +
                (prandom_u32() % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
        return req;
}

/*
 * Register a lazy-init request for @sb and make sure the lazy-init
 * thread is running.  Everything is done under ext4_li_mtx.
 *
 * If @sb already has a request, only its lr_timeout is reset.  Without
 * the PREFETCH_BLOCK_BITMAPS option nothing is registered when
 * @first_not_zeroed equals the group count, the filesystem is read-only,
 * or INIT_INODE_TABLE is not set.
 *
 * Returns 0 on success (including the cases where nothing had to be
 * queued), -ENOMEM if the request cannot be allocated, or the error
 * returned by ext4_li_info_new() / ext4_run_lazyinit_thread().
 */
int ext4_register_li_request(struct super_block *sb,
                             ext4_group_t first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_li_request *req = NULL;
        ext4_group_t group_cnt = sbi->s_groups_count;
        int err = 0;

        mutex_lock(&ext4_li_mtx);

        if (sbi->s_li_request) {
                /*
                 * A request already exists.  Clear its timeout so it
                 * gets recomputed, since s_li_wait_mult might have
                 * changed.
                 */
                sbi->s_li_request->lr_timeout = 0;
                goto unlock;
        }

        if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS)) {
                if (first_not_zeroed == group_cnt)
                        goto unlock;
                if (sb_rdonly(sb))
                        goto unlock;
                if (!test_opt(sb, INIT_INODE_TABLE))
                        goto unlock;
        }

        req = ext4_li_request_new(sb, first_not_zeroed);
        if (!req) {
                err = -ENOMEM;
                goto unlock;
        }

        if (!ext4_li_info) {
                err = ext4_li_info_new();
                if (err)
                        goto unlock;
        }

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_add(&req->lr_request, &ext4_li_info->li_request_list);
        mutex_unlock(&ext4_li_info->li_list_mtx);

        sbi->s_li_request = req;
        /*
         * The request now sits on the request list, so its removal
         * and freeing is handled by ext4_clear_request_list from now
         * on.  Forget the local pointer so it is not freed below.
         */
        req = NULL;

        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING))
                err = ext4_run_lazyinit_thread();

unlock:
        mutex_unlock(&ext4_li_mtx);
        if (err)
                kfree(req);
        return err;
}

/*
 * Stop the lazy-init thread.  This is called on module unload, so
 * nothing needs to be locked here.
 */
static void ext4_destroy_lazyinit_thread(void)
{
        /*
         * Only stop the thread if it is still around; if it exited
         * earlier there is nothing to be done.
         */
        if (ext4_li_info && ext4_lazyinit_task)
                kthread_stop(ext4_lazyinit_task);
}

/*
 * Bring the journal's checksum and async-commit feature flags in line
 * with the mount options of @sb.
 *
 * All journal checksum features (v1 compat flag, v2 and v3 incompat
 * flags) are cleared first.  Then:
 *  - with JOURNAL_ASYNC_COMMIT, the chosen checksum feature is set
 *    together with the async-commit feature;
 *  - with JOURNAL_CHECKSUM only, the chosen checksum feature is set and
 *    async commit is cleared;
 *  - otherwise only async commit is cleared.
 *
 * Returns the result of jbd2_journal_set_features() when it was called,
 * and 1 when no feature had to be set.
 */
static int set_journal_csum_feature_set(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int csum_compat = 0, csum_incompat = 0;
        int rc = 1;

        /* metadata_csum selects journal checksum v3, otherwise v1 */
        if (ext4_has_metadata_csum(sb))
                csum_incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
        else
                csum_compat = JBD2_FEATURE_COMPAT_CHECKSUM;

        jbd2_journal_clear_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
                        JBD2_FEATURE_INCOMPAT_CSUM_V3 |
                        JBD2_FEATURE_INCOMPAT_CSUM_V2);

        if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
                return jbd2_journal_set_features(sbi->s_journal,
                                csum_compat, 0,
                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
                                csum_incompat);

        if (test_opt(sb, JOURNAL_CHECKSUM))
                rc = jbd2_journal_set_features(sbi->s_journal,
                                csum_compat, 0,
                                csum_incompat);

        /* Async commit was not requested: make sure it is off. */
        jbd2_journal_clear_features(sbi->s_journal, 0, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);

        return rc;
}

/*
 * Note: calculating the overhead so we can be compatible with
 * historical BSD practice is quite difficult in the face of
 * clusters/bigalloc.  This is because multiple metadata blocks from
 * different block group can end up in the same allocation cluster.
 * Calculating the exact overhead in the face of clustered allocation
 * requires either O(all block bitmaps) in memory or O(number of block
 * groups**2) in time.  We will still calculate the superblock for
 * older file systems --- and if we come across with a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * count_overhead() returns the overhead attributed to block group @grp.
 * Without bigalloc the value is summed directly and @buf is not touched.
 * With bigalloc, @buf is used as a per-cluster bitmap for the group:
 * a bit is set for each metadata block found inside the group's block
 * range, and the result is derived from that bitmap.
 * NOTE(review): @buf is presumably expected to be zeroed on entry and at
 * least EXT4_CLUSTERS_PER_GROUP(sb) / 8 bytes long; the caller in this
 * file, ext4_calculate_overhead(), passes a zeroed page -- confirm.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
                          char *buf)
{
        struct ext4_sb_info        *sbi = EXT4_SB(sb);
        struct ext4_group_desc        *gdp;
        ext4_fsblk_t                first_block, last_block, b;
        ext4_group_t                i, ngroups = ext4_get_groups_count(sb);
        int                        s, j, count = 0;
        int                        has_super = ext4_bg_has_super(sb, grp);

        /*
         * Non-bigalloc case: no bitmap needed.  Sum the superblock copy
         * (if any), the group descriptor blocks, the reserved GDT blocks
         * (only when the group has a superblock copy), the inode table
         * blocks and 2 more blocks (presumably the block bitmap and the
         * inode bitmap -- confirm).
         */
        if (!ext4_has_feature_bigalloc(sb))
                return (has_super + ext4_bg_num_gdb(sb, grp) +
                        (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
                        sbi->s_itb_per_group + 2);

        /* Block range [first_block, last_block] covered by group @grp. */
        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
                (grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        /*
         * Look at the descriptor of every group: any group's bitmaps or
         * inode table may be located inside @grp's block range.  'count'
         * is bumped once per block found; below it only serves to tell
         * whether anything was found at all.
         */
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                /* block bitmap of group i */
                b = ext4_block_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                /* inode bitmap of group i */
                b = ext4_inode_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                /*
                 * inode table of group i: marked block by block, and only
                 * when it starts in range and b + s_itb_per_group does not
                 * exceed last_block
                 */
                b = ext4_inode_table(sb, gdp);
                if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
                        for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
                                int c = EXT4_B2C(sbi, b - first_block);
                                ext4_set_bit(c, buf);
                                count++;
                        }
                /*
                 * The superblock copy and the group descriptor blocks are
                 * accounted for once, on the iteration for @grp itself.
                 */
                if (i != grp)
                        continue;
                /* s: running block offset from the start of the group */
                s = 0;
                if (ext4_bg_has_super(sb, grp)) {
                        ext4_set_bit(s++, buf);
                        count++;
                }
                j = ext4_bg_num_gdb(sb, grp);
                /*
                 * Report and clamp a descriptor block count that would
                 * run past the end of the group.
                 */
                if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
                        ext4_error(sb, "Invalid number of block group "
                                   "descriptor blocks: %d", j);
                        j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
                count += j;
                /* mark the clusters holding the j descriptor blocks */
                for (; j > 0; j--)
                        ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        if (!count)
                return 0;
        /*
         * Clusters per group minus what ext4_count_free() reports for the
         * bitmap (presumably the number of clear bits, which would make
         * this the number of clusters marked above -- confirm).
         */
        return EXT4_CLUSTERS_PER_GROUP(sb) -
                ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}

/*
 * Compute the overhead and stash it in sbi->s_overhead
 */
int ext4_calculate_overhead(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        struct inode *j_inode;
        unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
        ext4_fsblk_t overhead = 0;
        char *buf = (char *) get_zeroed_page(GFP_NOFS);

        if (!buf)
                return -ENOMEM;

        /*
         * Compute the overhead (FS structures).  This is constant
         * for a given filesystem unless the number of block groups
         * changes so we cache the previous value until it does.
         */

        /*
         * All of the blocks before first_data_block are overhead
         */
        overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));

        /*
         * Add the overhead found in each block group
         */
        for (i = 0; i < ngroups; i++) {
                int blks;

                blks = count_overhead(sb, i, buf);
                overhead += blks;
                if (blks)
                        memset(buf, 0, PAGE_SIZE);
                cond_resched();
        }

        /*
         * Add the internal journal blocks whether the journal has been
         * loaded or not
         */
        if (sbi->s_journal && !sbi->s_journal_bdev)
                overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
        else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
                /* j_inum for internal journal is non-zero */
                j_inode = ext4_get_journal_inode(sb, j_inum);
                if (j_inode) {
                        j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
                        overhead += EXT4_NUM_B2C(sbi, j_blocks);
                        iput(j_inode);
                } else {
                        ext4_msg(sb, KERN_ERR, "can't get journal size");
                }
        }
        sbi->s_overhead = overhead;
        smp_wmb();
        free_page((unsigned long) buf);
        return 0;
}

/*
 * Set sbi->s_resv_clusters to its default value: 2% of the filesystem's
 * clusters, capped at 4096.  Nothing is set when the filesystem does not
 * use extents.
 */
static void ext4_set_resv_clusters(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t reserve;

        /*
         * Without extents nothing has to be reserved: the space
         * estimates are exact, there are no unwritten extents, and
         * hole punching doesn't need new metadata.  Skipping the
         * reservation is needed especially to keep ext2/3 backward
         * compatibility.
         */
        if (!ext4_has_feature_extents(sb))
                return;

        /*
         * The default reservation is 2% or 4096 clusters, whichever
         * is smaller.  It should cover the situations where we can not
         * afford to run out of space, for example punch hole or
         * converting unwritten extents in the delalloc path.  Such
         * allocations mostly need 1 or 2 blocks; higher numbers are
         * very rare.
         */
        reserve = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
        do_div(reserve, 50);
        if (reserve > 4096)
                reserve = 4096;

        atomic64_set(&sbi->s_resv_clusters, reserve);
}

static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
        struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh, **group_desc;
        struct ext4_super_block *es = NULL;
        struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        struct flex_groups **flex_groups;
        ext4_fsblk_t block;
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned long offset = 0;
        unsigned long journal_devnum = 0;
        unsigned long def_mount_opts;
        struct inode *root;
        const char *descr;
        int ret = -ENOMEM;
        int blocksize, clustersize;
        unsigned int db_count;
        unsigned int i;
        int needs_recovery, has_huge_files;
        __u64 blocks_count;
        int err = 0;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        ext4_group_t first_not_zeroed;

        if ((data && !orig_data) || !sbi)
                goto out_free_base;

        sbi->s_daxdev = dax_dev;
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
        if (!sbi->s_blockgroup_lock)
                goto out_free_base;

        sb->s_fs_info = sbi;
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);

        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');

        /* -EINVAL is default */
        ret = -EINVAL;
        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!blocksize) {
                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
                goto out_fail;
        }

        /*
         * The ext4 superblock will not be buffer aligned for other than 1kB
         * block sizes.  We need to calculate the offset from buffer start.
         */
        if (blocksize != EXT4_MIN_BLOCK_SIZE) {
                logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
                offset = do_div(logical_sb_block, blocksize);
        } else {
                logical_sb_block = sb_block;
        }

        bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
        if (IS_ERR(bh)) {
                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                ret = PTR_ERR(bh);
                bh = NULL;
                goto out_fail;
        }
        /*
         * Note: s_es must be initialized as soon as possible because
         *       some ext4 macro-instructions depend on its value
         */
        es = (struct ext4_super_block *) (bh->b_data + offset);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC)
                goto cantfind_ext4;
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);

        /* Warn if metadata_csum and gdt_csum are both set. */
        if (ext4_has_feature_metadata_csum(sb) &&
            ext4_has_feature_gdt_csum(sb))
                ext4_warning(sb, "metadata_csum and uninit_bg are "
                             "redundant flags; please run fsck.");

        /* Check for a known checksum algorithm */
        if (!ext4_verify_csum_type(sb, es)) {
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "unknown checksum algorithm.");
                silent = 1;
                goto cantfind_ext4;
        }

        /* Load the checksum driver */
        sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
        if (IS_ERR(sbi->s_chksum_driver)) {
                ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
                ret = PTR_ERR(sbi->s_chksum_driver);
                sbi->s_chksum_driver = NULL;
                goto failed_mount;
        }

        /* Check superblock checksum */
        if (!ext4_superblock_csum_verify(sb, es)) {
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "invalid superblock checksum.  Run e2fsck?");
                silent = 1;
                ret = -EFSBADCRC;
                goto cantfind_ext4;
        }

        /* Precompute checksum seed for all metadata */
        if (ext4_has_feature_csum_seed(sb))
                sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
        else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
                                               sizeof(es->s_uuid));

        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
        set_opt(sb, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sb, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
                set_opt(sb, GRPID);
        if (def_mount_opts & EXT4_DEFM_UID16)
                set_opt(sb, NO_UID32);
        /* xattr user namespace & acls are now defaulted on */
        set_opt(sb, XATTR_USER);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        set_opt(sb, POSIX_ACL);
#endif
        if (ext4_has_feature_fast_commit(sb))
                set_opt2(sb, JOURNAL_FAST_COMMIT);
        /* don't forget to enable journal_csum when metadata_csum is enabled. */
        if (ext4_has_metadata_csum(sb))
                set_opt(sb, JOURNAL_CHECKSUM);

        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
                set_opt(sb, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
                set_opt(sb, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
                set_opt(sb, WRITEBACK_DATA);

        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
                set_opt(sb, ERRORS_PANIC);
        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
                set_opt(sb, ERRORS_CONT);
        else
                set_opt(sb, ERRORS_RO);
        /* block_validity enabled by default; disable with noblock_validity */
        set_opt(sb, BLOCK_VALIDITY);
        if (def_mount_opts & EXT4_DEFM_DISCARD)
                set_opt(sb, DISCARD);

        sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
        sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;

        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
                set_opt(sb, BARRIER);

        /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
        if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                set_opt(sb, DELALLOC);

        /*
         * set default s_li_wait_mult for lazyinit, for the case there is
         * no mount option specified.
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;

        if (le32_to_cpu(es->s_log_block_size) >
            (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log block size: %u",
                         le32_to_cpu(es->s_log_block_size));
                goto failed_mount;
        }
        if (le32_to_cpu(es->s_log_cluster_size) >
            (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log cluster size: %u",
                         le32_to_cpu(es->s_log_cluster_size));
                goto failed_mount;
        }

        blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

        if (blocksize == PAGE_SIZE)
                set_opt(sb, DIOREAD_NOLOCK);

        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
        } else {
                sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
                sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
                if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
                        ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
                                 sbi->s_first_ino);
                        goto failed_mount;
                }
                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
                    (!is_power_of_2(sbi->s_inode_size)) ||
                    (sbi->s_inode_size > blocksize)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported inode size: %d",
                               sbi->s_inode_size);
                        ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
                        goto failed_mount;
                }
                /*
                 * i_atime_extra is the last extra field available for
                 * [acm]times in struct ext4_inode. Checking for that
                 * field should suffice to ensure we have extra space
                 * for all three.
                 */
                if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
                        sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
                        sb->s_time_gran = 1;
                        sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
                } else {
                        sb->s_time_gran = NSEC_PER_SEC;
                        sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
                }
                sb->s_time_min = EXT4_TIMESTAMP_MIN;
        }
        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                        EXT4_GOOD_OLD_INODE_SIZE;
                if (ext4_has_feature_extra_isize(sb)) {
                        unsigned v, max = (sbi->s_inode_size -
                                           EXT4_GOOD_OLD_INODE_SIZE);

                        v = le16_to_cpu(es->s_want_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_want_extra_isize: %d", v);
                                goto failed_mount;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;

                        v = le16_to_cpu(es->s_min_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_min_extra_isize: %d", v);
                                goto failed_mount;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;
                }
        }

        if (sbi->s_es->s_mount_opts[0]) {
                char s_mount_opts[64];

                if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts,
                                sizeof(s_mount_opts)) < 0)
                        goto failed_mount;
                if (!parse_options(s_mount_opts, sb, &journal_devnum,
                                   &journal_ioprio, 0)) {
                        ext4_msg(sb, KERN_WARNING,
                                 "failed to parse options in superblock: %s",
                                 s_mount_opts);
                }
        }
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        if (!parse_options((char *) data, sb, &journal_devnum,
                           &journal_ioprio, 0))
                goto failed_mount;

#ifdef CONFIG_UNICODE
        if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
                const struct ext4_sb_encodings *encoding_info;
                struct unicode_map *encoding;
                __u16 encoding_flags;

                if (ext4_has_feature_encrypt(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "Can't mount with encoding and encryption");
                        goto failed_mount;
                }

                if (ext4_sb_read_encoding(es, &encoding_info,
                                          &encoding_flags)) {
                        ext4_msg(sb, KERN_ERR,
                                 "Encoding requested by superblock is unknown");
                        goto failed_mount;
                }

                encoding = utf8_load(encoding_info->version);
                if (IS_ERR(encoding)) {
                        ext4_msg(sb, KERN_ERR,
                                 "can't mount with superblock charset: %s-%s "
                                 "not supported by the kernel. flags: 0x%x.",
                                 encoding_info->name, encoding_info->version,
                                 encoding_flags);
                        goto failed_mount;
                }
                ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
                         "%s-%s with flags 0x%hx", encoding_info->name,
                         encoding_info->version?:"\b", encoding_flags);

                sb->s_encoding = encoding;
                sb->s_encoding_flags = encoding_flags;
        }
#endif

        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
                /* can't mount with both data=journal and dioread_nolock. */
                clear_opt(sb, DIOREAD_NOLOCK);
                clear_opt2(sb, JOURNAL_FAST_COMMIT);
                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and delalloc");
                        goto failed_mount;
                }
                if (test_opt(sb, DAX_ALWAYS)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and dax");
                        goto failed_mount;
                }
                if (ext4_has_feature_encrypt(sb)) {
                        ext4_msg(sb, KERN_WARNING,
                                 "encrypted files will use data=ordered "
                                 "instead of data journaling mode");
                }
                if (test_opt(sb, DELALLOC))
                        clear_opt(sb, DELALLOC);
        } else {
                sb->s_iflags |= SB_I_CGROUPWB;
        }

        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
            (ext4_has_compat_features(sb) ||
             ext4_has_ro_compat_features(sb) ||
             ext4_has_incompat_features(sb)))
                ext4_msg(sb, KERN_WARNING,
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");

        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
                set_opt2(sb, HURD_COMPAT);
                if (ext4_has_feature_64bit(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "The Hurd can't support 64-bit file systems");
                        goto failed_mount;
                }

                /*
                 * ea_inode feature uses l_i_version field which is not
                 * available in HURD_COMPAT mode.
                 */
                if (ext4_has_feature_ea_inode(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "ea_inode feature is not supported for Hurd");
                        goto failed_mount;
                }
        }

        if (IS_EXT2_SB(sb)) {
                if (ext2_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing, stay silent when this looks
                         * like it's actually an ext[34] filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                goto failed_mount;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
                                 "to feature incompatibilities");
                        goto failed_mount;
                }
        }

        if (IS_EXT3_SB(sb)) {
                if (ext3_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing, stay silent when this looks
                         * like it's actually an ext4 filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                goto failed_mount;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
                                 "to feature incompatibilities");
                        goto failed_mount;
                }
        }

        /*
         * Check feature flags regardless of the revision level, since we
         * previously didn't change the revision level when setting the flags,
         * so there is a chance incompat flags are set on a rev 0 filesystem.
         */
        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
                goto failed_mount;

        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
                ext4_msg(sb, KERN_ERR,
                         "Number of reserved GDT blocks insanely large: %d",
                         le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
                goto failed_mount;
        }

        if (bdev_dax_supported(sb->s_bdev, blocksize))
                set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);

        if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
                if (ext4_has_feature_inline_data(sb)) {
                        ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
                                        " that may contain inline data");
                        goto failed_mount;
                }
                if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
                        ext4_msg(sb, KERN_ERR,
                                "DAX unsupported by block device.");
                        goto failed_mount;
                }
        }

        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
                         es->s_encryption_level);
                goto failed_mount;
        }

        if (sb->s_blocksize != blocksize) {
                /*
                 * bh must be released before kill_bdev(), otherwise
                 * neither it nor its page will be freed. kill_bdev()
                 * is called by sb_set_blocksize().
                 */
                brelse(bh);
                /* Validate the filesystem blocksize */
                if (!sb_set_blocksize(sb, blocksize)) {
                        ext4_msg(sb, KERN_ERR, "bad block size %d",
                                        blocksize);
                        bh = NULL;
                        goto failed_mount;
                }

                logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
                offset = do_div(logical_sb_block, blocksize);
                bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
                if (IS_ERR(bh)) {
                        ext4_msg(sb, KERN_ERR,
                               "Can't read superblock on 2nd try");
                        ret = PTR_ERR(bh);
                        bh = NULL;
                        goto failed_mount;
                }
                es = (struct ext4_super_block *)(bh->b_data + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
                        ext4_msg(sb, KERN_ERR,
                               "Magic mismatch, very weird!");
                        goto failed_mount;
                }
        }

        has_huge_files = ext4_has_feature_huge_file(sb);
        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
                                                      has_huge_files);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);

        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
        if (ext4_has_feature_64bit(sb)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported descriptor size %lu",
                               sbi->s_desc_size);
                        goto failed_mount;
                }
        } else
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;

        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);

        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
        if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
            sbi->s_inodes_per_group > blocksize * 8) {
                ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
                         sbi->s_inodes_per_group);
                goto failed_mount;
        }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
        sbi->s_sbh = bh;
        sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));

        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
        if (ext4_has_feature_dir_index(sb)) {
                i = le32_to_cpu(es->s_flags);
                if (i & EXT2_FLAGS_UNSIGNED_HASH)
                        sbi->s_hash_unsigned = 3;
                else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
#ifdef __CHAR_UNSIGNED__
                        if (!sb_rdonly(sb))
                                es->s_flags |=
                                        cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
                        sbi->s_hash_unsigned = 3;
#else
                        if (!sb_rdonly(sb))
                                es->s_flags |=
                                        cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
                }
        }

        /* Handle clustersize */
        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
        if (ext4_has_feature_bigalloc(sb)) {
                if (clustersize < blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "cluster size (%d) smaller than "
                                 "block size (%d)", clustersize, blocksize);
                        goto failed_mount;
                }
                sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
                        le32_to_cpu(es->s_log_block_size);
                sbi->s_clusters_per_group =
                        le32_to_cpu(es->s_clusters_per_group);
                if (sbi->s_clusters_per_group > blocksize * 8) {
                        ext4_msg(sb, KERN_ERR,
                                 "#clusters per group too big: %lu",
                                 sbi->s_clusters_per_group);
                        goto failed_mount;
                }
                if (sbi->s_blocks_per_group !=
                    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
                        ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
                                 "clusters per group (%lu) inconsistent",
                                 sbi->s_blocks_per_group,
                                 sbi->s_clusters_per_group);
                        goto failed_mount;
                }
        } else {
                if (clustersize != blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "fragment/cluster size (%d) != "
                                 "block size (%d)", clustersize, blocksize);
                        goto failed_mount;
                }
                if (sbi->s_blocks_per_group > blocksize * 8) {
                        ext4_msg(sb, KERN_ERR,
                                 "#blocks per group too big: %lu",
                                 sbi->s_blocks_per_group);
                        goto failed_mount;
                }
                sbi->s_clusters_per_group = sbi->s_blocks_per_group;
                sbi->s_cluster_bits = 0;
        }
        sbi->s_cluster_ratio = clustersize / blocksize;

        /* Do we have standard group size of clustersize * 8 blocks ? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);

        /*
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
        err = generic_check_addressable(sb->s_blocksize_bits,
                                        ext4_blocks_count(es));
        if (err) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                goto failed_mount;
        }

        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext4;

        /* check blocks count against device size */
        blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
                       "exceeds size of device (%llu blocks)",
                       ext4_blocks_count(es), blocks_count);
                goto failed_mount;
        }

        /*
         * It makes no sense for the first data block to be beyond the end
         * of the filesystem.
         */
        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block %u is beyond end of filesystem (%llu)",
                         le32_to_cpu(es->s_first_data_block),
                         ext4_blocks_count(es));
                goto failed_mount;
        }
        if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
            (sbi->s_cluster_ratio == 1)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block is 0 with a 1k block and cluster size");
                goto failed_mount;
        }

        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
                ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
                       "(block count %llu, first data block %u, "
                       "blocks per group %lu)", blocks_count,
                       ext4_blocks_count(es),
                       le32_to_cpu(es->s_first_data_block),
                       EXT4_BLOCKS_PER_GROUP(sb));
                goto failed_mount;
        }
        sbi->s_groups_count = blocks_count;
        sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
            le32_to_cpu(es->s_inodes_count)) {
                ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
                         le32_to_cpu(es->s_inodes_count),
                         ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
                ret = -EINVAL;
                goto failed_mount;
        }
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
        if (ext4_has_feature_meta_bg(sb)) {
                if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
                        ext4_msg(sb, KERN_WARNING,
                                 "first meta block group too large: %u "
                                 "(group descriptor block count %u)",
                                 le32_to_cpu(es->s_first_meta_bg), db_count);
                        goto failed_mount;
                }
        }
        rcu_assign_pointer(sbi->s_group_desc,
                           kvmalloc_array(db_count,
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL));
        if (sbi->s_group_desc == NULL) {
                ext4_msg(sb, KERN_ERR, "not enough memory");
                ret = -ENOMEM;
                goto failed_mount;
        }

        bgl_lock_init(sbi->s_blockgroup_lock);

        /* Pre-read the descriptors into the buffer cache */
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logical_sb_block, i);
                ext4_sb_breadahead_unmovable(sb, block);
        }

        for (i = 0; i < db_count; i++) {
                struct buffer_head *bh;

                block = descriptor_loc(sb, logical_sb_block, i);
                bh = ext4_sb_bread_unmovable(sb, block);
                if (IS_ERR(bh)) {
                        ext4_msg(sb, KERN_ERR,
                               "can't read group descriptor %d", i);
                        db_count = i;
                        ret = PTR_ERR(bh);
                        bh = NULL;
                        goto failed_mount2;
                }
                rcu_read_lock();
                rcu_dereference(sbi->s_group_desc)[i] = bh;
                rcu_read_unlock();
        }
        sbi->s_gdb_count = db_count;
        if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                ret = -EFSCORRUPTED;
                goto failed_mount2;
        }

        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);

        /* Register extent status tree shrinker */
        if (ext4_es_register_shrinker(sbi))
                goto failed_mount3;

        sbi->s_stripe = ext4_get_stripe_size(sbi);
        sbi->s_extent_max_zeroout_kb = 32;

        /*
         * set up enough so that it can read an inode
         */
        sb->s_op = &ext4_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_FS_ENCRYPTION
        sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_FS_VERITY
        sb->s_vop = &ext4_verityops;
#endif
#ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (ext4_has_feature_quota(sb))
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
        memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));

        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
        mutex_init(&sbi->s_orphan_lock);

        /* Initialize fast commit stuff */
        atomic_set(&sbi->s_fc_subtid, 0);
        atomic_set(&sbi->s_fc_ineligible_updates, 0);
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
        sbi->s_fc_bytes = 0;
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        spin_lock_init(&sbi->s_fc_lock);
        memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
        sbi->s_fc_replay_state.fc_regions = NULL;
        sbi->s_fc_replay_state.fc_regions_size = 0;
        sbi->s_fc_replay_state.fc_regions_used = 0;
        sbi->s_fc_replay_state.fc_regions_valid = 0;
        sbi->s_fc_replay_state.fc_modified_inodes = NULL;
        sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
        sbi->s_fc_replay_state.fc_modified_inodes_used = 0;

        sb->s_root = NULL;

        needs_recovery = (es->s_last_orphan != 0 ||
                          ext4_has_feature_journal_needs_recovery(sb));

        if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
                err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
                if (err)
                        goto failed_mount3a;
        }

        /*
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
         */
        if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
                err = ext4_load_journal(sb, es, journal_devnum);
                if (err)
                        goto failed_mount3a;
        } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
                   ext4_has_feature_journal_needs_recovery(sb)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
                goto failed_mount3a;
        } else {
                /* Nojournal mode, all journal mount options are illegal */
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_async_commit, fs mounted w/o journal");
                        goto failed_mount3a;
                }

                if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_checksum, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "commit=%lu, fs mounted w/o journal",
                                 sbi->s_commit_interval / HZ);
                        goto failed_mount3a;
                }
                if (EXT4_MOUNT_DATA_FLAGS &
                    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "data=, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
                clear_opt(sb, JOURNAL_CHECKSUM);
                clear_opt(sb, DATA_FLAGS);
                clear_opt2(sb, JOURNAL_FAST_COMMIT);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
        }

        if (ext4_has_feature_64bit(sb) &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto failed_mount_wq;
        }

        if (!set_journal_csum_feature_set(sb)) {
                ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
                         "feature set");
                goto failed_mount_wq;
        }

        if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
                !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                          JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
                ext4_msg(sb, KERN_ERR,
                        "Failed to set fast commit journal feature");
                goto failed_mount_wq;
        }

        /* We have now updated the journal if required, so we can
         * validate the data journaling mode. */
        switch (test_opt(sb, DATA_FLAGS)) {
        case 0:
                /* No mode set, assume a default based on the journal
                 * capabilities: ORDERED_DATA if the journal can
                 * cope, else JOURNAL_DATA
                 */
                if (jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        set_opt(sb, ORDERED_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
                } else {
                        set_opt(sb, JOURNAL_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
                }
                break;

        case EXT4_MOUNT_ORDERED_DATA:
        case EXT4_MOUNT_WRITEBACK_DATA:
                if (!jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        ext4_msg(sb, KERN_ERR, "Journal does not support "
                               "requested data journaling mode");
                        goto failed_mount_wq;
                }
        default:
                break;
        }

        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
            test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                        "journal_async_commit in data=ordered mode");
                goto failed_mount_wq;
        }

        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

        sbi->s_journal->j_submit_inode_data_buffers =
                ext4_journal_submit_inode_data_buffers;
        sbi->s_journal->j_finish_inode_data_buffers =
                ext4_journal_finish_inode_data_buffers;

no_journal:
        if (!test_opt(sb, NO_MBCACHE)) {
                sbi->s_ea_block_cache = ext4_xattr_create_cache();
                if (!sbi->s_ea_block_cache) {
                        ext4_msg(sb, KERN_ERR,
                                 "Failed to create ea_block_cache");
                        goto failed_mount_wq;
                }

                if (ext4_has_feature_ea_inode(sb)) {
                        sbi->s_ea_inode_cache = ext4_xattr_create_cache();
                        if (!sbi->s_ea_inode_cache) {
                                ext4_msg(sb, KERN_ERR,
                                         "Failed to create ea_inode_cache");
                                goto failed_mount_wq;
                        }
                }
        }

        if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
                ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
                goto failed_mount_wq;
        }

        /*
         * Get the # of file system overhead blocks from the
         * superblock if present.
         */
        sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
        /* ignore the precalculated value if it is ridiculous */
        if (sbi->s_overhead > ext4_blocks_count(es))
                sbi->s_overhead = 0;
        /*
         * If the bigalloc feature is not enabled recalculating the
         * overhead doesn't take long, so we might as well just redo
         * it to make sure we are using the correct value.
         */
        if (!ext4_has_feature_bigalloc(sb))
                sbi->s_overhead = 0;
        if (sbi->s_overhead == 0) {
                err = ext4_calculate_overhead(sb);
                if (err)
                        goto failed_mount_wq;
        }

        /*
         * The maximum number of concurrent works can be high and
         * concurrency isn't really necessary.  Limit it to 1.
         */
        EXT4_SB(sb)->rsv_conversion_wq =
                alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
        if (!EXT4_SB(sb)->rsv_conversion_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
                ret = -ENOMEM;
                goto failed_mount4;
        }

        /*
         * The jbd2_journal_load will have done any necessary log recovery,
         * so we can safely mount the rest of the filesystem now.
         */

        root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
        if (IS_ERR(root)) {
                ext4_msg(sb, KERN_ERR, "get root inode failed");
                ret = PTR_ERR(root);
                root = NULL;
                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
                iput(root);
                goto failed_mount4;
        }

#ifdef CONFIG_UNICODE
        if (sb->s_encoding)
                sb->s_d_op = &ext4_dentry_ops;
#endif

        sb->s_root = d_make_root(root);
        if (!sb->s_root) {
                ext4_msg(sb, KERN_ERR, "get root dentry failed");
                ret = -ENOMEM;
                goto failed_mount4;
        }

        ret = ext4_setup_super(sb, es, sb_rdonly(sb));
        if (ret == -EROFS) {
                sb->s_flags |= SB_RDONLY;
                ret = 0;
        } else if (ret)
                goto failed_mount4a;

        ext4_set_resv_clusters(sb);

        if (test_opt(sb, BLOCK_VALIDITY)) {
                err = ext4_setup_system_zone(sb);
                if (err) {
                        ext4_msg(sb, KERN_ERR, "failed to initialize system "
                                 "zone (%d)", err);
                        goto failed_mount4a;
                }
        }
        ext4_fc_replay_cleanup(sb);

        ext4_ext_init(sb);
        err = ext4_mb_init(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
                goto failed_mount5;
        }

        /*
         * We can only set up the journal commit callback once
         * mballoc is initialized
         */
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback =
                        ext4_journal_commit_callback;

        block = ext4_count_free_clusters(sb);
        ext4_free_blocks_count_set(sbi->s_es, 
                                   EXT4_C2B(sbi, block));
        ext4_superblock_csum_set(sb);
        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
                                  GFP_KERNEL);
        if (!err) {
                unsigned long freei = ext4_count_free_inodes(sb);
                sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
                ext4_superblock_csum_set(sb);
                err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
                                          GFP_KERNEL);
        }
        if (!err)
                err = percpu_counter_init(&sbi->s_dirs_counter,
                                          ext4_count_dirs(sb), GFP_KERNEL);
        if (!err)
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
        if (!err)
                err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
                                          GFP_KERNEL);
        if (!err)
                err = percpu_init_rwsem(&sbi->s_writepages_rwsem);

        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount6;
        }

        if (ext4_has_feature_flex_bg(sb))
                if (!ext4_fill_flex_info(sb)) {
                        ext4_msg(sb, KERN_ERR,
                               "unable to initialize "
                               "flex_bg meta info!");
                        ret = -ENOMEM;
                        goto failed_mount6;
                }

        err = ext4_register_li_request(sb, first_not_zeroed);
        if (err)
                goto failed_mount6;

        err = ext4_register_sysfs(sb);
        if (err)
                goto failed_mount7;

#ifdef CONFIG_QUOTA
        /* Enable quota usage during mount. */
        if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
                err = ext4_enable_quotas(sb);
                if (err)
                        goto failed_mount8;
        }
#endif  /* CONFIG_QUOTA */

        /*
         * Save the original bdev mapping's wb_err value which could be
         * used to detect the metadata async write error.
         */
        spin_lock_init(&sbi->s_bdev_wb_lock);
        errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
                                 &sbi->s_bdev_wb_err);
        sb->s_bdev->bd_super = sb;
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
        if (needs_recovery) {
                ext4_msg(sb, KERN_INFO, "recovery complete");
                err = ext4_mark_recovery_complete(sb, es);
                if (err)
                        goto failed_mount8;
        }
        if (EXT4_SB(sb)->s_journal) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                        descr = " journalled data mode";
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                        descr = " ordered data mode";
                else
                        descr = " writeback data mode";
        } else
                descr = "out journal";

        if (test_opt(sb, DISCARD)) {
                struct request_queue *q = bdev_get_queue(sb->s_bdev);
                if (!blk_queue_discard(q))
                        ext4_msg(sb, KERN_WARNING,
                                 "mounting with \"discard\" option, but "
                                 "the device does not support discard");
        }

        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
                         "Opts: %.*s%s%s", descr,
                         (int) sizeof(sbi->s_es->s_mount_opts),
                         sbi->s_es->s_mount_opts,
                         *sbi->s_es->s_mount_opts ? "; " : "", orig_data);

        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */

        /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
        ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
        atomic_set(&sbi->s_warning_count, 0);
        atomic_set(&sbi->s_msg_count, 0);

        kfree(orig_data);
        return 0;

cantfind_ext4:
        if (!silent)
                ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
        goto failed_mount;

failed_mount8:
        ext4_unregister_sysfs(sb);
        kobject_put(&sbi->s_kobj);
failed_mount7:
        ext4_unregister_li_request(sb);
failed_mount6:
        ext4_mb_release(sb);
        rcu_read_lock();
        flex_groups = rcu_dereference(sbi->s_flex_groups);
        if (flex_groups) {
                for (i = 0; i < sbi->s_flex_groups_allocated; i++)
                        kvfree(flex_groups[i]);
                kvfree(flex_groups);
        }
        rcu_read_unlock();
        percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
        ext4_ext_release(sb);
        ext4_release_system_zone(sb);
failed_mount4a:
        dput(sb->s_root);
        sb->s_root = NULL;
failed_mount4:
        ext4_msg(sb, KERN_ERR, "mount failed");
        if (EXT4_SB(sb)->rsv_conversion_wq)
                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        if (sbi->s_journal) {
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
failed_mount3:
        ext4_stop_mmpd(sbi);
        del_timer_sync(&sbi->s_err_report);
failed_mount2:
        rcu_read_lock();
        group_desc = rcu_dereference(sbi->s_group_desc);
        for (i = 0; i < db_count; i++)
                brelse(group_desc[i]);
        kvfree(group_desc);
        rcu_read_unlock();
failed_mount:
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);

#ifdef CONFIG_UNICODE
        utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
        /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
        brelse(bh);
        ext4_blkdev_remove(sbi);
out_fail:
        invalidate_bdev(sb->s_bdev);
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
out_free_base:
        kfree(sbi);
        kfree(orig_data);
        fs_put_dax(dax_dev);
        return err ? err : ret;
}

/*
 * Push the per-filesystem journal tunables into @journal.
 *
 * This runs on the initial mount (after the journal has been set up but
 * before any recovery has been done) and again on every remount, so that
 * the journal always reflects the current mount options.
 */
static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
        struct ext4_sb_info *fs_info = EXT4_SB(sb);

        /* Commit interval and batching limits come straight from the sb info. */
        journal->j_commit_interval = fs_info->s_commit_interval;
        journal->j_min_batch_time = fs_info->s_min_batch_time;
        journal->j_max_batch_time = fs_info->s_max_batch_time;
        ext4_fc_init(sb, journal);

        /* The flag bits are only rewritten with j_state_lock held for write. */
        write_lock(&journal->j_state_lock);

        if (!test_opt(sb, BARRIER))
                journal->j_flags &= ~JBD2_BARRIER;
        else
                journal->j_flags |= JBD2_BARRIER;

        if (!test_opt(sb, DATA_ERR_ABORT))
                journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
        else
                journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;

        write_unlock(&journal->j_state_lock);
}

/*
 * Fetch inode number @journal_inum and check that it can serve as a journal.
 *
 * Returns the inode (holding the reference taken by ext4_iget()) on success.
 * Returns NULL, after logging the reason, if the inode cannot be read, has a
 * zero link count, is not a regular file, or is encrypted.
 */
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                             unsigned int journal_inum)
{
        struct inode *inode;

        /*
         * Make sure a valid inode really exists on disk: iget()ing an
         * unused inode is dangerous because the following iput() would
         * try to delete it.
         */
        inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
        if (IS_ERR(inode)) {
                ext4_msg(sb, KERN_ERR, "no journal found");
                return NULL;
        }

        if (inode->i_nlink == 0) {
                /* Mark it bad first so the iput() below leaves it alone. */
                make_bad_inode(inode);
                iput(inode);
                ext4_msg(sb, KERN_ERR, "journal inode is deleted");
                return NULL;
        }

        jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
                  inode, inode->i_size);

        if (S_ISREG(inode->i_mode) && !IS_ENCRYPTED(inode))
                return inode;

        ext4_msg(sb, KERN_ERR, "invalid journal inode");
        iput(inode);
        return NULL;
}

/*
 * Build a jbd2 journal on top of the internal journal inode @journal_inum.
 *
 * Returns the initialised journal, or NULL if the filesystem has no journal
 * feature, the journal inode is unusable, or jbd2 fails to set the journal
 * up (the inode reference is dropped in that last case).
 */
static journal_t *ext4_get_journal(struct super_block *sb,
                                   unsigned int journal_inum)
{
        struct inode *inode;
        journal_t *journal;

        if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
                return NULL;

        inode = ext4_get_journal_inode(sb, journal_inum);
        if (inode == NULL)
                return NULL;

        journal = jbd2_journal_init_inode(inode);
        if (journal != NULL) {
                journal->j_private = sb;
                ext4_init_journal_params(sb, journal);
                return journal;
        }

        ext4_msg(sb, KERN_ERR, "Could not load journal inode");
        iput(inode);
        return NULL;
}

/*
 * Open the external journal device @j_dev and build a jbd2 journal on it.
 *
 * The device must carry an ext4 superblock that has the JOURNAL_DEV
 * incompat feature set, passes the checksum test when metadata_csum is
 * enabled on it, and whose UUID equals the s_journal_uuid stored in @sb's
 * own superblock.  The journal superblock must report exactly one user.
 *
 * On success the block device is recorded in s_journal_bdev and the
 * journal is returned.  On any failure a message is logged, whatever was
 * acquired here is released again, and NULL is returned.
 */
static journal_t *ext4_get_dev_journal(struct super_block *sb,
                                       dev_t j_dev)
{
        struct buffer_head *bh;
        journal_t *journal;
        ext4_fsblk_t start;
        ext4_fsblk_t len;
        int hblock, blocksize;
        ext4_fsblk_t sb_block;
        unsigned long offset;
        struct ext4_super_block *es;
        struct block_device *bdev;

        if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
                return NULL;

        bdev = ext4_blkdev_get(j_dev, sb);
        if (bdev == NULL)
                return NULL;

        blocksize = sb->s_blocksize;
        hblock = bdev_logical_block_size(bdev);
        if (blocksize < hblock) {
                ext4_msg(sb, KERN_ERR,
                        "blocksize too small for journal device");
                goto out_bdev;
        }

        /*
         * The superblock sits EXT4_MIN_BLOCK_SIZE bytes into the device;
         * translate that into a block number plus an offset within it.
         */
        sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
        set_blocksize(bdev, blocksize);
        bh = __bread(bdev, sb_block, blocksize);
        if (!bh) {
                ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
                       "external journal");
                goto out_bdev;
        }

        es = (struct ext4_super_block *) (bh->b_data + offset);
        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
                ext4_msg(sb, KERN_ERR, "external journal has "
                                        "bad superblock");
                brelse(bh);
                goto out_bdev;
        }

        if ((le32_to_cpu(es->s_feature_ro_compat) &
             EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
            es->s_checksum != ext4_superblock_csum(sb, es)) {
                ext4_msg(sb, KERN_ERR, "external journal has "
                                       "corrupt superblock");
                brelse(bh);
                goto out_bdev;
        }

        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
                ext4_msg(sb, KERN_ERR, "journal UUID does not match");
                brelse(bh);
                goto out_bdev;
        }

        /* The journal area starts on the block right after the superblock. */
        len = ext4_blocks_count(es);
        start = sb_block + 1;
        brelse(bh);        /* we're done with the superblock */

        journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
                                        start, len, blocksize);
        if (!journal) {
                ext4_msg(sb, KERN_ERR, "failed to create device journal");
                goto out_bdev;
        }
        journal->j_private = sb;
        if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
                ext4_msg(sb, KERN_ERR, "I/O error on journal device");
                goto out_journal;
        }
        if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
                /* s_nr_users is an unsigned 32-bit count: print it as such. */
                ext4_msg(sb, KERN_ERR, "External journal has more than one "
                                        "user (unsupported) - %u",
                        be32_to_cpu(journal->j_superblock->s_nr_users));
                goto out_journal;
        }
        /* Only a fully validated device is remembered in the sb info. */
        EXT4_SB(sb)->s_journal_bdev = bdev;
        ext4_init_journal_params(sb, journal);
        return journal;

out_journal:
        jbd2_journal_destroy(journal);
out_bdev:
        ext4_blkdev_put(bdev);
        return NULL;
}

/*
 * Find, validate and load the journal of the filesystem on @sb.
 *
 * @es:             on-disk superblock of the filesystem
 * @journal_devnum: journal device number supplied by the caller; when it is
 *                  non-zero and differs from es->s_journal_dev it is used
 *                  instead of the stored value
 *
 * Returns 0 with the journal installed in the sb info on success, or a
 * negative errno.  On every failure after the journal has been obtained it
 * is destroyed again before returning.
 */
static int ext4_load_journal(struct super_block *sb,
                             struct ext4_super_block *es,
                             unsigned long journal_devnum)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
        journal_t *journal;
        dev_t journal_dev;
        int journal_dev_ro;
        int really_read_only;
        int err = 0;

        if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
                return -EFSCORRUPTED;

        if (!journal_devnum ||
            journal_devnum == le32_to_cpu(es->s_journal_dev)) {
                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
        } else {
                ext4_msg(sb, KERN_INFO, "external journal device major/minor "
                        "numbers have changed");
                journal_dev = new_decode_dev(journal_devnum);
        }

        /* A journal lives either in an inode or on a device, never both. */
        if (journal_inum && journal_dev) {
                ext4_msg(sb, KERN_ERR,
                         "filesystem has both journal inode and journal device!");
                return -EINVAL;
        }

        if (journal_inum)
                journal = ext4_get_journal(sb, journal_inum);
        else
                journal = ext4_get_dev_journal(sb, journal_dev);
        if (!journal)
                return -EINVAL;

        journal_dev_ro = bdev_read_only(journal->j_dev);
        really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;

        if (journal_dev_ro && !sb_rdonly(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "journal device read-only, try mounting with '-o ro'");
                err = -EROFS;
                goto err_out;
        }

        /*
         * Blank journal, or recovery after a crash?  Recovery needs to
         * write, so on a read-only mount find out up front whether the
         * underlying devices allow that at all.
         */
        if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
                ext4_msg(sb, KERN_INFO, "INFO: recovery "
                                "required on readonly filesystem");
                if (really_read_only) {
                        ext4_msg(sb, KERN_ERR, "write access "
                                "unavailable, cannot proceed "
                                "(try mounting with noload)");
                        err = -EROFS;
                        goto err_out;
                }
                ext4_msg(sb, KERN_INFO, "write access will "
                       "be enabled during recovery");
        }

        if (!(journal->j_flags & JBD2_BARRIER))
                ext4_msg(sb, KERN_INFO, "barriers disabled");

        if (!ext4_has_feature_journal_needs_recovery(sb))
                err = jbd2_journal_wipe(journal, !really_read_only);
        if (!err) {
                /*
                 * Keep a copy of the error-info region of the superblock
                 * across the load and put it back afterwards (presumably
                 * because the load can overwrite it -- confirm).  If the
                 * allocation fails the region is simply not preserved.
                 */
                char *err_area = ((char *) es) + EXT4_S_ERR_START;
                char *saved = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);

                if (saved)
                        memcpy(saved, err_area, EXT4_S_ERR_LEN);
                err = jbd2_journal_load(journal);
                if (saved)
                        memcpy(err_area, saved, EXT4_S_ERR_LEN);
                kfree(saved);
        }

        if (err) {
                ext4_msg(sb, KERN_ERR, "error loading journal");
                goto err_out;
        }

        sbi->s_journal = journal;
        err = ext4_clear_journal_err(sb, es);
        if (err) {
                sbi->s_journal = NULL;
                jbd2_journal_destroy(journal);
                return err;
        }

        /* Record a changed journal device number if we are able to write. */
        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);

                /* Make sure we flush the recovery flag to disk. */
                ext4_commit_super(sb, 1);
        }

        return 0;

err_out:
        jbd2_journal_destroy(journal);
        return err;
}

static int ext4_commit_super(struct super_block *sb, int sync)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;

        if (!sbh)
                return -EINVAL;
        if (block_device_ejected(sb))
                return -ENODEV;

        /*
         * If the file system is mounted read-only, don't update the
         * superblock write time.  This avoids updating the superblock
         * write time when we are mounting the root file system
         * read/only but we need to replay the journal; at that point,
         * for people who are east of GMT and who make their clock
         * tick in localtime for Windows bug-for-bug compatibility,
         * the clock is set in the future, and this will cause e2fsck
         * to complain and force a full file system check.
         */
        if (!(sb->s_flags & SB_RDONLY))
                ext4_update_tstamp(es, s_wtime);
        if (sb->s_bdev->bd_part)
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
                            ((part_stat_read(sb->s_bdev->bd_part,
                                             sectors[STAT_WRITE]) -
                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
                ext4_free_blocks_count_set(es,
                        EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeclusters_counter)));
        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
                es->s_free_inodes_count =
                        cpu_to_le32(percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeinodes_counter));
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
        if (sync)
                lock_buffer(sbh);
        if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
                 * superblock failed.  This could happen because the
                 * USB device was yanked out.  Or it could happen to
                 * be a transient write error and maybe the block will
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
                ext4_msg(sb, KERN_ERR, "previous I/O error to "
                       "superblock detected");
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
        mark_buffer_dirty(sbh);
        if (sync) {
                unlock_buffer(sbh);
                error = __sync_dirty_buffer(sbh,
                        REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
                if (buffer_write_io_error(sbh)) {
                        ext4_msg(sb, KERN_ERR, "I/O error while writing "
                               "superblock");
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }
        }
        return error;
}

/*
 * If recovery has just finished and the filesystem is being mounted (or
 * remounted) read-only, what is on disk is now consistent: flush the
 * journal and clear the needs_recovery feature to record that.
 *
 * Returns the result of the journal flush, 0 if the filesystem has no
 * journal at all, or -EFSCORRUPTED if a journal is present in memory while
 * the journal feature is missing.
 */
static int ext4_mark_recovery_complete(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        int err;

        if (!ext4_has_feature_journal(sb)) {
                if (journal == NULL)
                        return 0;
                ext4_error(sb, "Journal got removed while the fs was "
                           "mounted!");
                return -EFSCORRUPTED;
        }

        /* Updates stay blocked while the journal is flushed and sb updated. */
        jbd2_journal_lock_updates(journal);
        err = jbd2_journal_flush(journal);
        if (err >= 0 &&
            ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
                ext4_clear_feature_journal_needs_recovery(sb);
                ext4_commit_super(sb, 1);
        }
        jbd2_journal_unlock_updates(journal);
        return err;
}

/*
 * When mounting (or remounting read-write) a filesystem whose journal
 * carries an error from an earlier lifetime, transfer that error to the
 * main filesystem: flag the fs as having errors, write the superblock, and
 * clear the error in the journal.
 *
 * Returns 0, or -EFSCORRUPTED if the journal feature is missing.
 */
static int ext4_clear_journal_err(struct super_block *sb,
                                   struct ext4_super_block *es)
{
        journal_t *journal;
        const char *errstr;
        char nbuf[16];
        int j_errno;

        if (!ext4_has_feature_journal(sb)) {
                ext4_error(sb, "Journal got removed while the fs was mounted!");
                return -EFSCORRUPTED;
        }

        journal = EXT4_SB(sb)->s_journal;

        /*
         * See whether a previous ext4_error() or ext4_abort() left an
         * error status behind in the journal; nothing to do if not.
         */
        j_errno = jbd2_journal_errno(journal);
        if (!j_errno)
                return 0;

        errstr = ext4_decode_error(sb, j_errno, nbuf);
        ext4_warning(sb, "Filesystem error recorded "
                     "from previous mount: %s", errstr);
        ext4_warning(sb, "Marking fs in need of filesystem check.");

        /* Flag the error both in memory and on disk before clearing it. */
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        ext4_commit_super(sb, 1);

        jbd2_journal_clear_err(journal);
        jbd2_journal_update_sb_errno(journal);
        return 0;
}

/*
 * Force the running and committing transactions to commit and wait for
 * the commit.  Does nothing (returns 0) on a read-only filesystem.
 */
int ext4_force_commit(struct super_block *sb)
{
        if (sb_rdonly(sb))
                return 0;

        return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}

/*
 * Sync the filesystem: flush pending reserved-extent conversions, write
 * back quota, start a journal commit and, when @wait is set, wait for it
 * and make sure a cache flush reaches the device.
 *
 * Returns 0 right away after a forced shutdown; otherwise the first error
 * from waiting on the commit or from the device flush, or 0.
 */
static int ext4_sync_fs(struct super_block *sb, int wait)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool needs_barrier = false;
        tid_t target;
        int ret = 0;
        int err;

        if (unlikely(ext4_forced_shutdown(sbi)))
                return 0;

        trace_ext4_sync_fs(sb, wait);
        flush_workqueue(sbi->rsv_conversion_wq);
        /*
         * Write back quota for the non-journalled quota case; journalled
         * quota has no dirty dquots.
         */
        dquot_writeback_dquots(sb, -1);
        /*
         * Data can be written back without a journal transaction, so a
         * barrier has to be sent at the end of this function -- unless
         * the transaction commit is going to send one for us anyway.
         */
        if (!sbi->s_journal) {
                if (wait && test_opt(sb, BARRIER))
                        needs_barrier = true;
        } else {
                target = jbd2_get_latest_transaction(sbi->s_journal);
                if (wait && (sbi->s_journal->j_flags & JBD2_BARRIER) &&
                    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
                        needs_barrier = true;

                if (jbd2_journal_start_commit(sbi->s_journal, &target) && wait)
                        ret = jbd2_log_wait_commit(sbi->s_journal, target);
        }

        if (!needs_barrier)
                return ret;

        /* An earlier commit error takes precedence over a flush error. */
        err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
        return ret ? ret : err;
}

/*
 * LVM calls this before it creates a (read-only) snapshot, giving us the
 * chance to flush the journal completely and mark the filesystem clean.
 *
 * Note that this function alone cannot bring the filesystem into a clean
 * state: it relies on the upper layer to stop all data and metadata
 * modifications.
 *
 * Returns 0 on a read-only filesystem, a negative error if the journal
 * flush fails, and otherwise the result of writing the superblock.
 */
static int ext4_freeze(struct super_block *sb)
{
        journal_t *journal;
        int error;

        if (sb_rdonly(sb))
                return 0;

        journal = EXT4_SB(sb)->s_journal;
        if (!journal)
                return ext4_commit_super(sb, 1);

        /* Now we set up the journal barrier. */
        jbd2_journal_lock_updates(journal);

        /*
         * The needs_recovery flag is cleared only after the journal has
         * been flushed successfully; a failed flush leaves it set.
         */
        error = jbd2_journal_flush(journal);
        if (error >= 0) {
                ext4_clear_feature_journal_needs_recovery(sb);
                error = ext4_commit_super(sb, 1);
        }

        /* we rely on upper layer to stop further updates */
        jbd2_journal_unlock_updates(journal);
        return error;
}

/*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 */
static int ext4_unfreeze(struct super_block *sb)
{
        if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
                return 0;

        if (EXT4_SB(sb)->s_journal) {
                /* Reset the needs_recovery flag before the fs is unlocked. */
                ext4_set_feature_journal_needs_recovery(sb);
        }

        ext4_commit_super(sb, 1);
        return 0;
}

/*
 * Snapshot of the mount options that ext4_remount() takes from the sb info
 * before it parses the new option string.
 */
struct ext4_mount_options {
        /* Copies of sbi->s_mount_opt and sbi->s_mount_opt2. */
        unsigned long s_mount_opt;
        unsigned long s_mount_opt2;
        /* Copies of sbi->s_resuid and sbi->s_resgid. */
        kuid_t s_resuid;
        kgid_t s_resgid;
        /* Journal commit interval and batching limits. */
        unsigned long s_commit_interval;
        u32 s_min_batch_time;
        u32 s_max_batch_time;
#ifdef CONFIG_QUOTA
        /* Journalled quota format and duplicated quota file names. */
        int s_jquota_fmt;
        char *s_qf_names[EXT4_MAXQUOTAS];
#endif
};

static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long old_sb_flags, vfs_flags;
        struct ext4_mount_options old_opts;
        ext4_group_t g;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        int err = 0;
#ifdef CONFIG_QUOTA
        int enable_quota = 0;
        int i, j;
        char *to_free[EXT4_MAXQUOTAS];
#endif
        char *orig_data = kstrdup(data, GFP_KERNEL);

        if (data && !orig_data)
                return -ENOMEM;

        /* Store the original options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
        old_opts.s_min_batch_time = sbi->s_min_batch_time;
        old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                if (sbi->s_qf_names[i]) {
                        char *qf_name = get_qf_name(sb, sbi, i);

                        old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
                        if (!old_opts.s_qf_names[i]) {
                                for (j = 0; j < i; j++)
                                        kfree(old_opts.s_qf_names[j]);
                                kfree(orig_data);
                                return -ENOMEM;
                        }
                } else
                        old_opts.s_qf_names[i] = NULL;
#endif
        if (sbi->s_journal && sbi->s_journal->j_task->io_context)
                journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;

        /*
         * Some options can be enabled by ext4 and/or by VFS mount flag
         * either way we need to make sure it matches in both *flags and
         * s_flags. Copy those selected flags from *flags to s_flags
         */
        vfs_flags = SB_LAZYTIME | SB_I_VERSION;
        sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);

        if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
                err = -EINVAL;
                goto restore_opts;
        }

        if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
            test_opt(sb, JOURNAL_CHECKSUM)) {
                ext4_msg(sb, KERN_ERR, "changing journal_checksum "
                         "during remount not supported; ignoring");
                sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
        }

        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and delalloc");
                        err = -EINVAL;
                        goto restore_opts;
                }
                if (test_opt(sb, DIOREAD_NOLOCK)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and dioread_nolock");
                        err = -EINVAL;
                        goto restore_opts;
                }
        } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                "journal_async_commit in data=ordered mode");
                        err = -EINVAL;
                        goto restore_opts;
                }
        }

        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
                ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
                err = -EINVAL;
                goto restore_opts;
        }

        if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                ext4_abort(sb, ESHUTDOWN, "Abort forced by user");

        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        es = sbi->s_es;

        if (sbi->s_journal) {
                ext4_init_journal_params(sb, sbi->s_journal);
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }

        if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
                if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
                        err = -EROFS;
                        goto restore_opts;
                }

                if (*flags & SB_RDONLY) {
                        err = sync_filesystem(sb);
                        if (err < 0)
                                goto restore_opts;
                        err = dquot_suspend(sb, -1);
                        if (err < 0)
                                goto restore_opts;

                        /*
                         * First of all, the unconditional stuff we have to do
                         * to disable replay of the journal when we next remount
                         */
                        sb->s_flags |= SB_RDONLY;

                        /*
                         * OK, test if we are remounting a valid rw partition
                         * readonly, and if so set the rdonly flag and then
                         * mark the partition as valid again.
                         */
                        if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
                            (sbi->s_mount_state & EXT4_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);

                        if (sbi->s_journal) {
                                /*
                                 * We let remount-ro finish even if marking fs
                                 * as clean failed...
                                 */
                                ext4_mark_recovery_complete(sb, es);
                        }
                } else {
                        /* Make sure we can mount this feature set readwrite */
                        if (ext4_has_feature_readonly(sb) ||
                            !ext4_feature_set_ok(sb, 0)) {
                                err = -EROFS;
                                goto restore_opts;
                        }
                        /*
                         * Make sure the group descriptor checksums
                         * are sane.  If they aren't, refuse to remount r/w.
                         */
                        for (g = 0; g < sbi->s_groups_count; g++) {
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);

                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
                g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EFSBADCRC;
                                        goto restore_opts;
                                }
                        }

                        /*
                         * If we have an unprocessed orphan list hanging
                         * around from a previously readonly bdev mount,
                         * require a full umount/remount for now.
                         */
                        if (es->s_last_orphan) {
                                ext4_msg(sb, KERN_WARNING, "Couldn't "
                                       "remount RDWR because of unprocessed "
                                       "orphan inode list.  Please "
                                       "umount/remount instead");
                                err = -EINVAL;
                                goto restore_opts;
                        }

                        /*
                         * Mounting a RDONLY partition read-write, so reread
                         * and store the current valid flag.  (It may have
                         * been changed by e2fsck since we originally mounted
                         * the partition.)
                         */
                        if (sbi->s_journal) {
                                err = ext4_clear_journal_err(sb, es);
                                if (err)
                                        goto restore_opts;
                        }
                        sbi->s_mount_state = (le16_to_cpu(es->s_state) &
                                              ~EXT4_FC_REPLAY);

                        err = ext4_setup_super(sb, es, 0);
                        if (err)
                                goto restore_opts;

                        sb->s_flags &= ~SB_RDONLY;
                        if (ext4_has_feature_mmp(sb)) {
                                err = ext4_multi_mount_protect(sb,
                                                le64_to_cpu(es->s_mmp_block));
                                if (err)
                                        goto restore_opts;
                        }
#ifdef CONFIG_QUOTA
                        enable_quota = 1;
#endif
                }
        }

        /*
         * Handle creation of system zone data early because it can fail.
         * Releasing of existing data is done when we are sure remount will
         * succeed.
         */
        if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
                err = ext4_setup_system_zone(sb);
                if (err)
                        goto restore_opts;
        }

        if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
                err = ext4_commit_super(sb, 1);
                if (err)
                        goto restore_opts;
        }

#ifdef CONFIG_QUOTA
        if (enable_quota) {
                if (sb_any_quota_suspended(sb))
                        dquot_resume(sb, -1);
                else if (ext4_has_feature_quota(sb)) {
                        err = ext4_enable_quotas(sb);
                        if (err)
                                goto restore_opts;
                }
        }
        /* Release old quota file names */
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(old_opts.s_qf_names[i]);
#endif
        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);

        /*
         * Reinitialize lazy itable initialization thread based on
         * current settings
         */
        if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
                ext4_unregister_li_request(sb);
        else {
                ext4_group_t first_not_zeroed;
                first_not_zeroed = ext4_has_uninit_itable(sb);
                ext4_register_li_request(sb, first_not_zeroed);
        }

        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);

        /*
         * Some options can be enabled by ext4 and/or by VFS mount flag
         * either way we need to make sure it matches in both *flags and
         * s_flags. Copy those selected flags from s_flags to *flags
         */
        *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);

        ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
        kfree(orig_data);
        return 0;

restore_opts:
        /*
         * If there was a failing r/w to ro transition, we may need to
         * re-enable quota
         */
        if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
            sb_any_quota_suspended(sb))
                dquot_resume(sb, -1);
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
        sbi->s_min_batch_time = old_opts.s_min_batch_time;
        sbi->s_max_batch_time = old_opts.s_max_batch_time;
        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                to_free[i] = get_qf_name(sb, sbi, i);
                rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
        }
        synchronize_rcu();
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(to_free[i]);
#endif
        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);
        kfree(orig_data);
        return err;
}

#ifdef CONFIG_QUOTA
/*
 * Clamp the statfs figures in @buf to the project quota limits of @projid.
 *
 * For blocks and for inodes separately, the effective limit is the smaller
 * non-zero value of the soft and hard limit.  When such a limit exists, the
 * totals and the free counts already stored in @buf are capped by it.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * dqget() when the project dquot cannot be obtained.
 */
static int ext4_statfs_project(struct super_block *sb,
                               kprojid_t projid, struct kstatfs *buf)
{
        struct dquot *dquot;
        u64 blk_limit, ino_limit, used_blocks;

        dquot = dqget(sb, make_kqid_projid(projid));
        if (IS_ERR(dquot))
                return PTR_ERR(dquot);

        /* dq_dqb is read under its lock so the values are consistent. */
        spin_lock(&dquot->dq_dqb_lock);

        /* Block limit, shifted down into units of filesystem blocks. */
        blk_limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
                                 dquot->dq_dqb.dqb_bhardlimit);
        blk_limit >>= sb->s_blocksize_bits;
        if (blk_limit) {
                uint64_t        blocks_left = 0;

                /* Used space counts both current and reserved space. */
                used_blocks = (dquot->dq_dqb.dqb_curspace +
                               dquot->dq_dqb.dqb_rsvspace) >>
                              sb->s_blocksize_bits;
                if (blk_limit > used_blocks)
                        blocks_left = blk_limit - used_blocks;

                buf->f_blocks = min(buf->f_blocks, blk_limit);
                buf->f_bfree = min(buf->f_bfree, blocks_left);
                buf->f_bavail = min(buf->f_bavail, blocks_left);
        }

        /* Inode limit; no unit conversion is applied here. */
        ino_limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
                                 dquot->dq_dqb.dqb_ihardlimit);
        if (ino_limit) {
                uint64_t        inodes_left = 0;

                if (ino_limit > dquot->dq_dqb.dqb_curinodes)
                        inodes_left = ino_limit - dquot->dq_dqb.dqb_curinodes;

                buf->f_files = min(buf->f_files, ino_limit);
                buf->f_ffree = min(buf->f_ffree, inodes_left);
        }

        spin_unlock(&dquot->dq_dqb_lock);
        dqput(dquot);
        return 0;
}
#endif

/*
 * Fill @buf with filesystem statistics for the superblock of @dentry.
 *
 * Totals come from the on-disk superblock, free counts from the per-cpu
 * cluster/inode counters.  With CONFIG_QUOTA, if the inode carries the
 * PROJINHERIT flag and project quota limits are enabled, the numbers are
 * further clamped by ext4_statfs_project().  Always returns 0.
 */
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t overhead, resv_blocks, reserved;
        s64 free_clusters;
        u64 fsid;

        resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));

        /* With the MINIX_DF option the overhead is not subtracted. */
        overhead = test_opt(sb, MINIX_DF) ? 0 : sbi->s_overhead;

        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);

        free_clusters =
                percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* Dirty clusters may exceed free ones; never report a negative. */
        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, free_clusters, 0));

        /* Blocks in the reserved pools are not available to the caller. */
        reserved = ext4_r_blocks_count(es) + resv_blocks;
        if (buf->f_bfree < reserved)
                buf->f_bavail = 0;
        else
                buf->f_bavail = buf->f_bfree - reserved;

        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
        buf->f_namelen = EXT4_NAME_LEN;

        /* The fsid is the XOR of the two 64-bit halves of the UUID. */
        fsid = le64_to_cpup((void *)es->s_uuid) ^
               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
        buf->f_fsid = u64_to_fsid(fsid);

#ifdef CONFIG_QUOTA
        if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
            sb_has_quota_limits_enabled(sb, PRJQUOTA))
                ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
#endif
        return 0;
}


#ifdef CONFIG_QUOTA

/*
 * Helper functions so that transaction is started before we acquire dqio_sem
 * to keep correct lock ordering of transaction > dqio_sem
 */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
        return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}

/*
 * Commit @dquot inside its own journal handle.
 *
 * Returns a PTR_ERR() value if the handle cannot be started.  Otherwise
 * returns the result of dquot_commit(), or, when that is zero, the result
 * of stopping the handle.  A negative commit result is also reported via
 * ext4_error_err().
 */
static int ext4_write_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        handle_t *handle;
        int ret, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_TRANS_BLOCKS(sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ret = dquot_commit(dquot);
        if (ret < 0)
                ext4_error_err(sb, -ret,
                               "Failed to commit dquot type %d",
                               dquot->dq_id.type);

        /* The handle is always stopped; its status only matters if ret is 0. */
        stop_err = ext4_journal_stop(handle);
        return ret ? ret : stop_err;
}

/*
 * Acquire @dquot inside its own journal handle.
 *
 * Returns a PTR_ERR() value if the handle cannot be started.  Otherwise
 * returns the result of dquot_acquire(), or, when that is zero, the result
 * of stopping the handle.  A negative acquire result is also reported via
 * ext4_error_err().
 */
static int ext4_acquire_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        handle_t *handle;
        int ret, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_INIT_BLOCKS(sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ret = dquot_acquire(dquot);
        if (ret < 0)
                ext4_error_err(sb, -ret,
                              "Failed to acquire dquot type %d",
                              dquot->dq_id.type);

        stop_err = ext4_journal_stop(handle);
        return ret ? ret : stop_err;
}

/*
 * Release @dquot inside its own journal handle.
 *
 * If no handle is currently running, the call is bracketed by
 * sb_start_intwrite()/sb_end_intwrite().  The dquot is released even when
 * the handle cannot be started; in that case the PTR_ERR() of the handle is
 * returned.  Otherwise returns the result of dquot_release(), or, when that
 * is zero, the result of stopping the handle.
 */
static int ext4_release_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        /*
         * Trying to sb_start_intwrite() in a running transaction
         * can result in a deadlock. Further, running transactions
         * are already protected from freezing.
         */
        bool freeze_protected = !ext4_journal_current_handle();
        handle_t *handle;
        int ret, stop_err;

        if (freeze_protected)
                sb_start_intwrite(sb);

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_DEL_BLOCKS(sb));
        if (IS_ERR(handle)) {
                /* Release dquot anyway to avoid endless cycle in dqput() */
                dquot_release(dquot);
                ret = PTR_ERR(handle);
                goto out;
        }

        ret = dquot_release(dquot);
        if (ret < 0)
                ext4_error_err(sb, -ret,
                               "Failed to release dquot type %d",
                               dquot->dq_id.type);

        stop_err = ext4_journal_stop(handle);
        if (!ret)
                ret = stop_err;
out:
        if (freeze_protected)
                sb_end_intwrite(sb);

        return ret;
}

static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Are we journaling quotas? */
        if (ext4_has_feature_quota(sb) ||
            sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
        } else {
                return dquot_mark_dquot_dirty(dquot);
        }
}

/*
 * Commit the quota info for quota @type of @sb inside a journal handle.
 *
 * Returns a PTR_ERR() value if the handle cannot be started, otherwise the
 * result of dquot_commit_info() or, when that is zero, the result of
 * stopping the handle.
 */
static int ext4_write_info(struct super_block *sb, int type)
{
        handle_t *handle;
        int ret, stop_err;

        /* Data block + inode block */
        handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ret = dquot_commit_info(sb, type);
        stop_err = ext4_journal_stop(handle);

        return ret ? ret : stop_err;
}

/*
 * Turn on quotas during mount time - we need to find
 * the quota file and such...
 */
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
        return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
                                        EXT4_SB(sb)->s_jquota_fmt, type);
}

/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.  Callers in
 * this file use I_DATA_SEM_QUOTA when an inode becomes a quota file and
 * I_DATA_SEM_NORMAL when it stops being one.
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        /* The first argument of lockdep_set_subclass has to be
         * *exactly* the same as the argument to init_rwsem() --- in
         * this case, in init_once() --- or lockdep gets unhappy
         * because the name of the lock is set using the
         * stringification of the argument to init_rwsem().
         */
        (void) ei;        /* shut up clang warning if !CONFIG_LOCKDEP */
        lockdep_set_subclass(&ei->i_data_sem, subclass);
}

/*
 * Standard function to be called on quota_on
 *
 * Enables quota @type on @sb using the quota file at @path.
 *
 * Returns -EINVAL if the QUOTA mount option is not set, -EXDEV if @path is
 * on a different superblock, -EBUSY if the file's inode is already marked
 * IS_NOQUOTA, the error from jbd2_journal_flush() if the journal flush
 * fails, the error from dquot_quota_on(), or the error from
 * ext4_mark_inode_dirty() (in which case quota is turned off again).
 * Returns 0 on success.
 */
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path)
{
        int err;

        if (!test_opt(sb, QUOTA))
                return -EINVAL;

        /* Quotafile not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                return -EXDEV;

        /* Quota already enabled for this file? */
        if (IS_NOQUOTA(d_inode(path->dentry)))
                return -EBUSY;

        /* Journaling quota? */
        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Quotafile not in fs root? (warning only, not an error) */
                if (path->dentry->d_parent != sb->s_root)
                        ext4_msg(sb, KERN_WARNING,
                                "Quota file not on filesystem root. "
                                "Journaled quota will not work");
                sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
        } else {
                /*
                 * Clear the flag just in case mount options changed since
                 * last time.
                 */
                sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
        }

        /*
         * When we journal data on quota file, we have to flush journal to see
         * all updates to the file when we bypass pagecache...
         */
        if (EXT4_SB(sb)->s_journal &&
            ext4_should_journal_data(d_inode(path->dentry))) {
                /*
                 * We don't need to lock updates but journal_flush() could
                 * otherwise be livelocked...
                 */
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
                err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
                if (err)
                        return err;
        }

        /* Switch the lockdep subclass before quota starts using the inode. */
        lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
        err = dquot_quota_on(sb, type, format_id, path);
        if (!err) {
                struct inode *inode = d_inode(path->dentry);
                handle_t *handle;

                /*
                 * Set inode flags to prevent userspace from messing with quota
                 * files. If starting the handle fails, err is still 0 here
                 * and we return success anyway since quotas are already
                 * enabled. A failure of ext4_mark_inode_dirty(), however, is
                 * propagated and quota is turned off again below.
                 */
                inode_lock(inode);
                handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
                if (IS_ERR(handle))
                        goto unlock_inode;
                EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
                inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
                                S_NOATIME | S_IMMUTABLE);
                err = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        unlock_inode:
                inode_unlock(inode);
                if (err)
                        dquot_quota_off(sb, type);
        }
        /* On any failure, restore the normal lockdep subclass. */
        if (err)
                lockdep_set_quota_inode(path->dentry->d_inode,
                                             I_DATA_SEM_NORMAL);
        return err;
}

/*
 * Check that @qf_inum is an acceptable inode number for quota @type:
 * user and group quota must use their dedicated inode numbers, project
 * quota any inode number at or above EXT4_GOOD_OLD_FIRST_INO.  Any other
 * @type hits BUG().
 */
static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
{
        if (type == USRQUOTA)
                return qf_inum == EXT4_USR_QUOTA_INO;
        if (type == GRPQUOTA)
                return qf_inum == EXT4_GRP_QUOTA_INO;
        if (type == PRJQUOTA)
                return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;

        BUG();
}

static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags)
{
        int err;
        struct inode *qf_inode;
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };

        BUG_ON(!ext4_has_feature_quota(sb));

        if (!qf_inums[type])
                return -EPERM;

        if (!ext4_check_quota_inum(type, qf_inums[type])) {
                ext4_error(sb, "Bad quota inum: %lu, type: %d",
                                qf_inums[type], type);
                return -EUCLEAN;
        }

        qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
        if (IS_ERR(qf_inode)) {
                ext4_error(sb, "Bad quota inode: %lu, type: %d",
                                qf_inums[type], type);
                return PTR_ERR(qf_inode);
        }

        /* Don't account quota for quota files to avoid recursion */
        qf_inode->i_flags |= S_NOQUOTA;
        lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
        err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
        if (err)
                lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
        iput(qf_inode);

        return err;
}

/*
 * Enable usage tracking for all quota types.
 *
 * Every quota type with an inode number recorded in the superblock is
 * enabled with usage tracking; limits are enabled as well when the matching
 * mount option is set.  If one type fails, the types enabled earlier in
 * this call are turned off again and the error is returned.
 */
static int ext4_enable_quotas(struct super_block *sb)
{
        int type, err = 0;
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
        bool quota_mopt[EXT4_MAXQUOTAS] = {
                test_opt(sb, USRQUOTA),
                test_opt(sb, GRPQUOTA),
                test_opt(sb, PRJQUOTA),
        };

        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;

        for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                unsigned int dq_flags = DQUOT_USAGE_ENABLED;

                if (!qf_inums[type])
                        continue;

                if (quota_mopt[type])
                        dq_flags |= DQUOT_LIMITS_ENABLED;

                err = ext4_quota_enable(sb, type, QFMT_VFS_V1, dq_flags);
                if (!err)
                        continue;

                ext4_warning(sb,
                        "Failed to enable quota tracking "
                        "(type=%d, err=%d, ino=%lu). "
                        "Please run e2fsck to fix.", type,
                        err, qf_inums[type]);

                /* Undo the lower-numbered types, highest first. */
                while (--type >= 0) {
                        struct inode *inode = sb_dqopt(sb)->files[type];

                        /* Hold the inode across dquot_quota_off(). */
                        if (inode)
                                inode = igrab(inode);
                        dquot_quota_off(sb, type);
                        if (inode) {
                                lockdep_set_quota_inode(inode,
                                        I_DATA_SEM_NORMAL);
                                iput(inode);
                        }
                }

                return err;
        }

        return 0;
}

/*
 * Turn off quota @type on @sb.
 *
 * If there is no quota inode, or a reference to it cannot be grabbed, this
 * simply returns dquot_quota_off().  Otherwise, after quota has been turned
 * off on a filesystem without the quota feature, the NOATIME/IMMUTABLE
 * flags are cleared from the quota file and its mtime/ctime are updated.
 * Errors from starting the handle or from ext4_mark_inode_dirty() are
 * returned to the caller.
 */
static int ext4_quota_off(struct super_block *sb, int type)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        handle_t *handle;
        int err;

        /* Force all delayed allocation blocks to be allocated.
         * Caller already holds s_umount sem */
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);

        if (!inode || !igrab(inode))
                return dquot_quota_off(sb, type);

        err = dquot_quota_off(sb, type);
        if (err || ext4_has_feature_quota(sb))
                goto out_put;

        /*
         * Update modification times of quota files when userspace can
         * start looking at them.
         */
        inode_lock(inode);
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                goto out_unlock;
        }
        EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
        inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
        inode->i_mtime = inode->i_ctime = current_time(inode);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

out_unlock:
        inode_unlock(inode);
out_put:
        lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
        iput(inode);
        return err;
}

/*
 * Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
 * itself serializes the operations (and no one else should touch the files)
 * we don't have to be afraid of races.
 *
 * Copies up to @len bytes starting at file offset @off into @data, one
 * block at a time; blocks for which ext4_bread() returns NULL are read as
 * zeroes.  The read is trimmed at i_size.  Returns the number of bytes
 * read (0 when @off is past i_size) or the ext4_bread() error.
 */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        loff_t i_size = i_size_read(inode);
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int offset = off & (sb->s_blocksize - 1);
        struct buffer_head *bh;
        size_t remaining;
        int chunk;

        if (off > i_size)
                return 0;
        if (off+len > i_size)
                len = i_size-off;

        remaining = len;
        while (remaining > 0) {
                /* Never copy past the end of the current block. */
                size_t room = sb->s_blocksize - offset;

                chunk = room < remaining ? room : remaining;

                bh = ext4_bread(NULL, inode, blk, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);

                if (bh)
                        memcpy(data, bh->b_data+offset, chunk);
                else        /* A hole? */
                        memset(data, 0, chunk);
                brelse(bh);

                /* Only the first block can start at a non-zero offset. */
                offset = 0;
                remaining -= chunk;
                data += chunk;
                blk++;
        }
        return len;
}

/* Write to quotafile (we know the transaction is already started and has
 * enough credits)
 *
 * Writes @len bytes from @data at file offset @off of the quota file for
 * @type.  The write must fit inside a single block and needs a running
 * journal handle; otherwise -EIO is returned after logging a warning.
 * If the write extends the file, i_size and i_disksize are updated.
 * Returns @len on success or a negative error. */
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
        int retries = 0;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();

        if (!handle) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because transaction is not started",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
        /*
         * Since we account only one data block in transaction credits,
         * then it is impossible to cross a block boundary.
         */
        if (sb->s_blocksize - offset < len) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because not block aligned",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }

        /*
         * Get (allocating if needed) the target block; retry on -ENOSPC
         * for as long as ext4_should_retry_alloc() allows.
         */
        do {
                bh = ext4_bread(handle, inode, blk,
                                EXT4_GET_BLOCKS_CREATE |
                                EXT4_GET_BLOCKS_METADATA_NOFAIL);
        } while (PTR_ERR(bh) == -ENOSPC &&
                 ext4_should_retry_alloc(inode->i_sb, &retries));
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        /* No buffer: skip the copy but still run the size update below. */
        if (!bh)
                goto out;
        BUFFER_TRACE(bh, "get write access");
        err = ext4_journal_get_write_access(handle, bh);
        if (err) {
                brelse(bh);
                return err;
        }
        /* Copy the data under the buffer lock, then journal the buffer. */
        lock_buffer(bh);
        memcpy(bh->b_data+offset, data, len);
        flush_dcache_page(bh->b_page);
        unlock_buffer(bh);
        err = ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
out:
        /* Grow the file if the write went past the current end. */
        if (inode->i_size < off + len) {
                i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
                err2 = ext4_mark_inode_dirty(handle, inode);
                /* Keep the first error; err2 only matters if err is 0. */
                if (unlikely(err2 && !err))
                        err = err2;
        }
        return err ? err : len;
}
#endif

/*
 * Mount callback for ext4: hands everything to mount_bdev() with
 * ext4_fill_super() as the superblock fill routine and returns its result.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data)
{
        struct dentry *root;

        root = mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
        return root;
}

#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
/*
 * Register ext2_fs_type with the VFS.  A failure is only logged with a
 * warning; nothing is returned to the caller.
 */
static inline void register_as_ext2(void)
{
        int err;

        err = register_filesystem(&ext2_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

/* Unregister ext2_fs_type; the result of unregister_filesystem() is ignored. */
static inline void unregister_as_ext2(void)
{
        unregister_filesystem(&ext2_fs_type);
}

/*
 * Return 1 if @sb's feature set is acceptable for ext2 handling, else 0.
 * Unknown incompat features always fail; unknown ro_compat features only
 * fail when the superblock is not read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext2_ro_compat_features(sb);
}
#else
/* Stubs for builds where the #if condition above does not hold. */
static inline void register_as_ext2(void)
{
}

static inline void unregister_as_ext2(void)
{
}

/* Always reports the feature set as not acceptable. */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
        return 0;
}
#endif

/*
 * Register ext3_fs_type with the VFS.  A failure is only logged with a
 * warning; nothing is returned to the caller.
 */
static inline void register_as_ext3(void)
{
        int err;

        err = register_filesystem(&ext3_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext3 (%d)\n", err);
}

/* Unregister ext3_fs_type; the result of unregister_filesystem() is ignored. */
static inline void unregister_as_ext3(void)
{
        unregister_filesystem(&ext3_fs_type);
}

/*
 * Return 1 if @sb's feature set is acceptable for ext3 handling, else 0.
 * Unknown incompat features or a missing journal feature always fail;
 * unknown ro_compat features only fail when the superblock is not
 * read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext3_incompat_features(sb) ||
            !ext4_has_feature_journal(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext3_ro_compat_features(sb);
}

/*
 * Filesystem type descriptor for "ext4".  Registered in ext4_init_fs() and
 * unregistered in ext4_exit_fs().
 */
static struct file_system_type ext4_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "ext4",
        .mount                = ext4_mount,        /* wraps mount_bdev() */
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext4");

/*
 * Shared across all ext4 file systems.  Each of the EXT4_WQ_HASH_SZ wait
 * queue heads is initialised once in ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

/*
 * Module initialisation.
 *
 * Sets up the ext4 subsystems one after another and finally registers the
 * ext3, ext2 and ext4 filesystem types.  If a step fails, the goto ladder
 * at the end tears down everything initialised before it, in reverse
 * order, and the error of the failing step is returned.  Returns 0 on
 * success.
 */
static int __init ext4_init_fs(void)
{
        int i, err;

        ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
        ext4_li_info = NULL;
        mutex_init(&ext4_li_mtx);

        /* Build-time check for flags consistency */
        ext4_check_flag_values();

        for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
                init_waitqueue_head(&ext4__ioend_wq[i]);

        /* First step: nothing to undo yet, so fail with a plain return. */
        err = ext4_init_es();
        if (err)
                return err;

        err = ext4_init_pending();
        if (err)
                goto out7;

        err = ext4_init_post_read_processing();
        if (err)
                goto out6;

        err = ext4_init_pageio();
        if (err)
                goto out5;

        err = ext4_init_system_zone();
        if (err)
                goto out4;

        err = ext4_init_sysfs();
        if (err)
                goto out3;

        err = ext4_init_mballoc();
        if (err)
                goto out2;
        err = init_inodecache();
        if (err)
                goto out1;

        err = ext4_fc_init_dentry_cache();
        if (err)
                goto out05;

        /*
         * The ext3/ext2 registrations return nothing; only a failure to
         * register ext4 itself aborts initialisation.
         */
        register_as_ext3();
        register_as_ext2();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;

        return 0;
        /* Error unwinding: each label undoes the step before the failure. */
out:
        unregister_as_ext2();
        unregister_as_ext3();
        ext4_fc_destroy_dentry_cache();
out05:
        destroy_inodecache();
out1:
        ext4_exit_mballoc();
out2:
        ext4_exit_sysfs();
out3:
        ext4_exit_system_zone();
out4:
        ext4_exit_pageio();
out5:
        ext4_exit_post_read_processing();
out6:
        ext4_exit_pending();
out7:
        ext4_exit_es();

        return err;
}

/*
 * Module exit: destroy the lazyinit thread, unregister the ext2/ext3/ext4
 * filesystem types and tear down the subsystems set up by ext4_init_fs().
 *
 * NOTE(review): ext4_exit_es() is called before ext4_exit_pending() here,
 * whereas the error unwinding in ext4_init_fs() calls them in the opposite
 * order - presumably the two are independent; confirm before reordering.
 */
static void __exit ext4_exit_fs(void)
{
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        ext4_fc_destroy_dentry_cache();
        destroy_inodecache();
        ext4_exit_mballoc();
        ext4_exit_sysfs();
        ext4_exit_system_zone();
        ext4_exit_pageio();
        ext4_exit_post_read_processing();
        ext4_exit_es();
        ext4_exit_pending();
}

/* Module metadata and entry points. */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
/* NOTE(review): presumably asks for crc32c to be loaded first - confirm. */
MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)




























































    1 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_

#include <linux/list.h>
#include <linux/fsnotify.h>
#include <linux/srcu.h>
#include <linux/types.h>

#include "../mount.h"

static inline struct inode *fsnotify_conn_inode(
                                struct fsnotify_mark_connector *conn)
{
        return container_of(conn->obj, struct inode, i_fsnotify_marks);
}

static inline struct mount *fsnotify_conn_mount(
                                struct fsnotify_mark_connector *conn)
{
        return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
}

static inline struct super_block *fsnotify_conn_sb(
                                struct fsnotify_mark_connector *conn)
{
        return container_of(conn->obj, struct super_block, s_fsnotify_marks);
}

/*
 * Return the super_block associated with the object @conn is attached to:
 * the inode's i_sb, the mount's mnt_sb, or the superblock itself.  Returns
 * NULL for any other connector type.
 */
static inline struct super_block *fsnotify_connector_sb(
                                struct fsnotify_mark_connector *conn)
{
        struct super_block *sb = NULL;

        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                sb = fsnotify_conn_inode(conn)->i_sb;
                break;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                sb = fsnotify_conn_mount(conn)->mnt.mnt_sb;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                sb = fsnotify_conn_sb(conn);
                break;
        default:
                break;
        }

        return sb;
}

/* destroy all events sitting in this group's notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);

/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;

/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
                                   struct fsnotify_group *b);

/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
        fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
        fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
        fsnotify_destroy_marks(&sb->s_fsnotify_marks);
}

/*
 * update the dentry->d_flags of all of inode's children to indicate if inode cares
 * about events that happen to its children.
 */
extern void fsnotify_set_children_dentry_flags(struct inode *inode);

extern struct kmem_cache *fsnotify_mark_connector_cachep;

#endif        /* __FS_NOTIFY_FSNOTIFY_H_ */








































































































































































































    1 




































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM scsi

#if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCSI_H

#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include <linux/tracepoint.h>
#include <linux/trace_seq.h>

/*
 * scsi_opcode_name() expands to one { value, "NAME" } initializer, where the
 * label is the stringified spelling of the opcode constant it was given.
 * show_opcode_name() hands @val and the table of known SCSI opcodes below to
 * __print_symbolic() so the opcode can be shown by name in trace output.
 */
#define scsi_opcode_name(opcode)        { opcode, #opcode }
#define show_opcode_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_opcode_name(TEST_UNIT_READY),                \
                scsi_opcode_name(REZERO_UNIT),                        \
                scsi_opcode_name(REQUEST_SENSE),                \
                scsi_opcode_name(FORMAT_UNIT),                        \
                scsi_opcode_name(READ_BLOCK_LIMITS),                \
                scsi_opcode_name(REASSIGN_BLOCKS),                \
                scsi_opcode_name(INITIALIZE_ELEMENT_STATUS),        \
                scsi_opcode_name(READ_6),                        \
                scsi_opcode_name(WRITE_6),                        \
                scsi_opcode_name(SEEK_6),                        \
                scsi_opcode_name(READ_REVERSE),                        \
                scsi_opcode_name(WRITE_FILEMARKS),                \
                scsi_opcode_name(SPACE),                        \
                scsi_opcode_name(INQUIRY),                        \
                scsi_opcode_name(RECOVER_BUFFERED_DATA),        \
                scsi_opcode_name(MODE_SELECT),                        \
                scsi_opcode_name(RESERVE),                        \
                scsi_opcode_name(RELEASE),                        \
                scsi_opcode_name(COPY),                                \
                scsi_opcode_name(ERASE),                        \
                scsi_opcode_name(MODE_SENSE),                        \
                scsi_opcode_name(START_STOP),                        \
                scsi_opcode_name(RECEIVE_DIAGNOSTIC),                \
                scsi_opcode_name(SEND_DIAGNOSTIC),                \
                scsi_opcode_name(ALLOW_MEDIUM_REMOVAL),                \
                scsi_opcode_name(SET_WINDOW),                        \
                scsi_opcode_name(READ_CAPACITY),                \
                scsi_opcode_name(READ_10),                        \
                scsi_opcode_name(WRITE_10),                        \
                scsi_opcode_name(SEEK_10),                        \
                scsi_opcode_name(POSITION_TO_ELEMENT),                \
                scsi_opcode_name(WRITE_VERIFY),                        \
                scsi_opcode_name(VERIFY),                        \
                scsi_opcode_name(SEARCH_HIGH),                        \
                scsi_opcode_name(SEARCH_EQUAL),                        \
                scsi_opcode_name(SEARCH_LOW),                        \
                scsi_opcode_name(SET_LIMITS),                        \
                scsi_opcode_name(PRE_FETCH),                        \
                scsi_opcode_name(READ_POSITION),                \
                scsi_opcode_name(SYNCHRONIZE_CACHE),                \
                scsi_opcode_name(LOCK_UNLOCK_CACHE),                \
                scsi_opcode_name(READ_DEFECT_DATA),                \
                scsi_opcode_name(MEDIUM_SCAN),                        \
                scsi_opcode_name(COMPARE),                        \
                scsi_opcode_name(COPY_VERIFY),                        \
                scsi_opcode_name(WRITE_BUFFER),                        \
                scsi_opcode_name(READ_BUFFER),                        \
                scsi_opcode_name(UPDATE_BLOCK),                        \
                scsi_opcode_name(READ_LONG),                        \
                scsi_opcode_name(WRITE_LONG),                        \
                scsi_opcode_name(CHANGE_DEFINITION),                \
                scsi_opcode_name(WRITE_SAME),                        \
                scsi_opcode_name(UNMAP),                        \
                scsi_opcode_name(READ_TOC),                        \
                scsi_opcode_name(LOG_SELECT),                        \
                scsi_opcode_name(LOG_SENSE),                        \
                scsi_opcode_name(XDWRITEREAD_10),                \
                scsi_opcode_name(MODE_SELECT_10),                \
                scsi_opcode_name(RESERVE_10),                        \
                scsi_opcode_name(RELEASE_10),                        \
                scsi_opcode_name(MODE_SENSE_10),                \
                scsi_opcode_name(PERSISTENT_RESERVE_IN),        \
                scsi_opcode_name(PERSISTENT_RESERVE_OUT),        \
                scsi_opcode_name(VARIABLE_LENGTH_CMD),                \
                scsi_opcode_name(REPORT_LUNS),                        \
                scsi_opcode_name(MAINTENANCE_IN),                \
                scsi_opcode_name(MAINTENANCE_OUT),                \
                scsi_opcode_name(MOVE_MEDIUM),                        \
                scsi_opcode_name(EXCHANGE_MEDIUM),                \
                scsi_opcode_name(READ_12),                        \
                scsi_opcode_name(WRITE_12),                        \
                scsi_opcode_name(WRITE_VERIFY_12),                \
                scsi_opcode_name(SEARCH_HIGH_12),                \
                scsi_opcode_name(SEARCH_EQUAL_12),                \
                scsi_opcode_name(SEARCH_LOW_12),                \
                scsi_opcode_name(READ_ELEMENT_STATUS),                \
                scsi_opcode_name(SEND_VOLUME_TAG),                \
                scsi_opcode_name(WRITE_LONG_2),                        \
                scsi_opcode_name(READ_16),                        \
                scsi_opcode_name(WRITE_16),                        \
                scsi_opcode_name(VERIFY_16),                        \
                scsi_opcode_name(WRITE_SAME_16),                \
                scsi_opcode_name(ZBC_OUT),                        \
                scsi_opcode_name(ZBC_IN),                        \
                scsi_opcode_name(SERVICE_ACTION_IN_16),                \
                scsi_opcode_name(READ_32),                        \
                scsi_opcode_name(WRITE_32),                        \
                scsi_opcode_name(WRITE_SAME_32),                \
                scsi_opcode_name(ATA_16),                        \
                scsi_opcode_name(ATA_12))

/*
 * Symbol table for the DID_* host codes.  scsi_hostbyte_name() builds one
 * { value, "NAME" } pair; show_hostbyte_name() passes @val and the table to
 * __print_symbolic().  In this file it is applied to bits 23:16 of a
 * command result.
 */
#define scsi_hostbyte_name(result)        { result, #result }
#define show_hostbyte_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_hostbyte_name(DID_OK),                        \
                scsi_hostbyte_name(DID_NO_CONNECT),                \
                scsi_hostbyte_name(DID_BUS_BUSY),                \
                scsi_hostbyte_name(DID_TIME_OUT),                \
                scsi_hostbyte_name(DID_BAD_TARGET),                \
                scsi_hostbyte_name(DID_ABORT),                        \
                scsi_hostbyte_name(DID_PARITY),                        \
                scsi_hostbyte_name(DID_ERROR),                        \
                scsi_hostbyte_name(DID_RESET),                        \
                scsi_hostbyte_name(DID_BAD_INTR),                \
                scsi_hostbyte_name(DID_PASSTHROUGH),                \
                scsi_hostbyte_name(DID_SOFT_ERROR),                \
                scsi_hostbyte_name(DID_IMM_RETRY),                \
                scsi_hostbyte_name(DID_REQUEUE),                \
                scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED),        \
                scsi_hostbyte_name(DID_TRANSPORT_FAILFAST))

/*
 * Symbol table for the DRIVER_* codes.  scsi_driverbyte_name() builds one
 * { value, "NAME" } pair; show_driverbyte_name() passes @val and the table
 * to __print_symbolic().  In this file it is applied to bits 31:24 of a
 * command result.
 */
#define scsi_driverbyte_name(result)        { result, #result }
#define show_driverbyte_name(val)                                \
        __print_symbolic(val,                                        \
                scsi_driverbyte_name(DRIVER_OK),                \
                scsi_driverbyte_name(DRIVER_BUSY),                \
                scsi_driverbyte_name(DRIVER_SOFT),                \
                scsi_driverbyte_name(DRIVER_MEDIA),                \
                scsi_driverbyte_name(DRIVER_ERROR),                \
                scsi_driverbyte_name(DRIVER_INVALID),                \
                scsi_driverbyte_name(DRIVER_TIMEOUT),                \
                scsi_driverbyte_name(DRIVER_HARD),                \
                scsi_driverbyte_name(DRIVER_SENSE))

/*
 * Symbol table for SCSI message codes.  scsi_msgbyte_name() builds one
 * { value, "NAME" } pair; show_msgbyte_name() passes @val and the table to
 * __print_symbolic().  In this file it is applied to bits 15:8 of a
 * command result.
 */
#define scsi_msgbyte_name(result)        { result, #result }
#define show_msgbyte_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_msgbyte_name(COMMAND_COMPLETE),                \
                scsi_msgbyte_name(EXTENDED_MESSAGE),                \
                scsi_msgbyte_name(SAVE_POINTERS),                \
                scsi_msgbyte_name(RESTORE_POINTERS),                \
                scsi_msgbyte_name(DISCONNECT),                        \
                scsi_msgbyte_name(INITIATOR_ERROR),                \
                scsi_msgbyte_name(ABORT_TASK_SET),                \
                scsi_msgbyte_name(MESSAGE_REJECT),                \
                scsi_msgbyte_name(NOP),                                \
                scsi_msgbyte_name(MSG_PARITY_ERROR),                \
                scsi_msgbyte_name(LINKED_CMD_COMPLETE),                \
                scsi_msgbyte_name(LINKED_FLG_CMD_COMPLETE),        \
                scsi_msgbyte_name(TARGET_RESET),                \
                scsi_msgbyte_name(ABORT_TASK),                        \
                scsi_msgbyte_name(CLEAR_TASK_SET),                \
                scsi_msgbyte_name(INITIATE_RECOVERY),                \
                scsi_msgbyte_name(RELEASE_RECOVERY),                \
                scsi_msgbyte_name(CLEAR_ACA),                        \
                scsi_msgbyte_name(LOGICAL_UNIT_RESET),                \
                scsi_msgbyte_name(SIMPLE_QUEUE_TAG),                \
                scsi_msgbyte_name(HEAD_OF_QUEUE_TAG),                \
                scsi_msgbyte_name(ORDERED_QUEUE_TAG),                \
                scsi_msgbyte_name(IGNORE_WIDE_RESIDUE),                \
                scsi_msgbyte_name(ACA),                                \
                scsi_msgbyte_name(QAS_REQUEST),                        \
                scsi_msgbyte_name(BUS_DEVICE_RESET),                \
                scsi_msgbyte_name(ABORT))

/*
 * Symbol table for the SAM_STAT_* status codes.  scsi_statusbyte_name()
 * builds one { value, "NAME" } pair; show_statusbyte_name() passes @val and
 * the table to __print_symbolic().  In this file it is applied to bits 7:0
 * of a command result.
 */
#define scsi_statusbyte_name(result)        { result, #result }
#define show_statusbyte_name(val)                                \
        __print_symbolic(val,                                        \
                scsi_statusbyte_name(SAM_STAT_GOOD),                \
                scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION),        \
                scsi_statusbyte_name(SAM_STAT_CONDITION_MET),        \
                scsi_statusbyte_name(SAM_STAT_BUSY),                \
                scsi_statusbyte_name(SAM_STAT_INTERMEDIATE),        \
                scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \
                scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT),        \
                scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED),        \
                scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL),        \
                scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE),        \
                scsi_statusbyte_name(SAM_STAT_TASK_ABORTED))

/*
 * Symbol table for the SCSI_PROT_* protection operations.
 * scsi_prot_op_name() builds one { value, "NAME" } pair;
 * show_prot_op_name() passes @val and the table to __print_symbolic().
 * The events in this file apply it to the value of scsi_get_prot_op().
 */
#define scsi_prot_op_name(result)        { result, #result }
#define show_prot_op_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_prot_op_name(SCSI_PROT_NORMAL),                \
                scsi_prot_op_name(SCSI_PROT_READ_INSERT),        \
                scsi_prot_op_name(SCSI_PROT_WRITE_STRIP),        \
                scsi_prot_op_name(SCSI_PROT_READ_STRIP),        \
                scsi_prot_op_name(SCSI_PROT_WRITE_INSERT),        \
                scsi_prot_op_name(SCSI_PROT_READ_PASS),                \
                scsi_prot_op_name(SCSI_PROT_WRITE_PASS))

/*
 * scsi_trace_parse_cdb() is only declared here; it takes a trace_seq, a CDB
 * buffer and its length, and returns a string.  __parse_cdb() calls it with
 * an identifier named `p` that must be in scope where the macro is expanded.
 * NOTE(review): `p` is presumably the trace_seq made available inside
 * TP_printk() by the tracing framework - confirm.
 */
const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int);
#define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len)

/*
 * scsi_dispatch_cmd_start - trace event taking a struct scsi_cmnd.
 *
 * Records the host number, channel, device id and LUN of cmd->device, the
 * first CDB byte as the opcode, the CDB length, the data and protection
 * scatterlist counts, the protection operation, and a copy of the
 * cmd->cmd_len CDB bytes.  The print format shows the protection operation
 * and opcode by name, the parsed CDB, and the raw CDB bytes in hex.
 *
 * NOTE(review): presumably fired when a command is handed to the low-level
 * driver - confirm at the call site.
 * NOTE(review): lun is stored as unsigned int; if device->lun is a wider
 * type the recorded value is truncated - confirm.
 */
TRACE_EVENT(scsi_dispatch_cmd_start,

        TP_PROTO(struct scsi_cmnd *cmd),

        TP_ARGS(cmd),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
        ),

        TP_fast_assign(
                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
        ),

        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
                  " prot_op=%s cmnd=(%s %s raw=%s)",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op),
                  show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len))
);

/*
 * scsi_dispatch_cmd_error - trace event taking a struct scsi_cmnd and an
 * integer return code @rtn.
 *
 * Records the host number, channel, device id and LUN of cmd->device, @rtn,
 * the first CDB byte as the opcode, the CDB length, the data and protection
 * scatterlist counts, the protection operation, and a copy of the
 * cmd->cmd_len CDB bytes.  The print format shows the protection operation
 * and opcode by name, the parsed and raw CDB, and finally rtn as a decimal.
 *
 * NOTE(review): presumably fired when dispatching a command fails with
 * @rtn - confirm at the call site.
 * NOTE(review): lun is stored as unsigned int; if device->lun is a wider
 * type the recorded value is truncated - confirm.
 */
TRACE_EVENT(scsi_dispatch_cmd_error,

        TP_PROTO(struct scsi_cmnd *cmd, int rtn),

        TP_ARGS(cmd, rtn),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( int,                rtn        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
        ),

        TP_fast_assign(
                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                __entry->rtn                = rtn;
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
        ),

        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
                  " prot_op=%s cmnd=(%s %s raw=%s) rtn=%d",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op),
                  show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __entry->rtn)
);

/*
 * scsi_cmd_done_timeout_template - event class taking a struct scsi_cmnd.
 *
 * Records the host number, channel, device id and LUN of cmd->device,
 * cmd->result, the first CDB byte as the opcode, the CDB length, the data
 * and protection scatterlist counts, the protection operation, and a copy
 * of the cmd->cmd_len CDB bytes.
 *
 * The print format splits the recorded result into four bytes and shows
 * each by name: bits 31:24 as the driver byte, bits 23:16 as the host byte,
 * bits 15:8 as the message byte and bits 7:0 as the status byte.
 *
 * NOTE(review): lun is stored as unsigned int; if device->lun is a wider
 * type the recorded value is truncated - confirm.
 */
DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template,

        TP_PROTO(struct scsi_cmnd *cmd),

        TP_ARGS(cmd),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( int,                result        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
        ),

        TP_fast_assign(
                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                __entry->result                = cmd->result;
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
        ),

        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u " \
                  "prot_sgl=%u prot_op=%s cmnd=(%s %s raw=%s) result=(driver=" \
                  "%s host=%s message=%s status=%s)",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op),
                  show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
                  show_driverbyte_name(((__entry->result) >> 24) & 0xff),
                  show_hostbyte_name(((__entry->result) >> 16) & 0xff),
                  show_msgbyte_name(((__entry->result) >> 8) & 0xff),
                  show_statusbyte_name(__entry->result & 0xff))
);

/*
 * scsi_dispatch_cmd_done and scsi_dispatch_cmd_timeout are both defined
 * from scsi_cmd_done_timeout_template, each taking a struct scsi_cmnd.
 * NOTE(review): presumably fired on command completion and on command
 * timeout respectively - confirm at the call sites.
 */
DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done,
             TP_PROTO(struct scsi_cmnd *cmd),
             TP_ARGS(cmd));

DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout,
             TP_PROTO(struct scsi_cmnd *cmd),
             TP_ARGS(cmd));

/*
 * scsi_eh_wakeup - trace event taking a struct Scsi_Host.
 *
 * Records only shost->host_no and prints it as "host_no=%u".
 * NOTE(review): presumably fired when the host's error handler is woken -
 * confirm at the call site.
 */
TRACE_EVENT(scsi_eh_wakeup,

        TP_PROTO(struct Scsi_Host *shost),

        TP_ARGS(shost),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
        ),

        TP_fast_assign(
                __entry->host_no        = shost->host_no;
        ),

        TP_printk("host_no=%u", __entry->host_no)
);

#endif /*  _TRACE_SCSI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>















































































































































































































































































































    1 


























































































































































    1 





















    1 
























    2 







    2 




    1 

    1 

    1 

    1 




















    2 




































    2 










    2 
















    2 





















































































































































































    3 



    3 































    1 
    1 
    2 











    1 









    2 



















    2 





    2 








    2 




















    1 
    1 

    1 

    1 




























    2 





    2 


















    2 






    2 





    1 






































    2 



    2 
    2 


    2 

    2 







    1 

    1 












    2 






    2 





    2 

    2 



































































    1 























    1 



































































    1 




    1 



























    1 


    1 


































































































































    1 
    1 
    1 
    1 
















    1 










































































































































































































































































































































































































































































    1 



















    1 








    1 













    1 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

/*
 * The 64-bit jiffies counter. It starts at INITIAL_JIFFIES, is cache-line
 * aligned on SMP builds (__cacheline_aligned_in_smp) and is exported so
 * that modules can reference it.
 */
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ lvl
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level and
 * therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constant values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level.
 *
 * Each level's clock runs LVL_CLK_DIV (1 << LVL_CLK_SHIFT == 8) times slower
 * than the level below it:
 *   LVL_SHIFT(n) - number of bits the base clock is shifted right for level n
 *   LVL_GRAN(n)  - granularity of level n in jiffies (1 << LVL_SHIFT(n))
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 *
 * LVL_START(n) is (LVL_SIZE - 1) scaled by the granularity of level n - 1,
 * e.g. LVL_START(1) == 63 and LVL_START(2) == 63 * 8. The macro is only
 * meaningful for n >= 1 because of the (n) - 1 shift count.
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/*
 * Size of each clock level.
 *
 * Every level has LVL_SIZE (1 << LVL_BITS == 64) buckets. LVL_MASK extracts
 * the bucket number within a level and LVL_OFFS(n) is the index of the first
 * bucket of level n in the flat bucket array.
 */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/* Level depth: nine levels when HZ > 100, eight levels otherwise */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel).
 *
 * A delta at or beyond WHEEL_TIMEOUT_CUTOFF cannot be represented;
 * calc_wheel_index() clamps such timers to clk + WHEEL_TIMEOUT_MAX, which is
 * the cutoff minus one granule of the last level.
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size, i.e. the total number of buckets over all
 * levels. If NOHZ is configured we allocate two wheels so we have a
 * separate storage for the deferrable timers.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

/*
 * Number of timer bases per CPU and the indices used to address them in
 * timer_bases[]. With CONFIG_NO_HZ_COMMON there is a second base, BASE_DEF,
 * which get_timer_cpu_base()/get_timer_this_cpu_base() select for
 * TIMER_DEFERRABLE timers. Without it only one base exists and BASE_DEF is
 * an alias of BASE_STD.
 */
#ifdef CONFIG_NO_HZ_COMMON
# define NR_BASES        2
# define BASE_STD        0
# define BASE_DEF        1
#else
# define NR_BASES        1
# define BASE_STD        0
# define BASE_DEF        0
#endif

/*
 * struct timer_base - state of one timer wheel
 *
 * One instance exists per CPU and per base index (see timer_bases[] below).
 * The structure is cache-line aligned (____cacheline_aligned).
 */
struct timer_base {
        /* Taken by lock_timer_base() before a timer of this base is touched */
        raw_spinlock_t                lock;
        /*
         * NOTE(review): presumably the timer whose callback is currently
         * executing; the users are outside this part of the file.
         */
        struct timer_list        *running_timer;
#ifdef CONFIG_PREEMPT_RT
        /* Only present on PREEMPT_RT; the users are outside this part of the file */
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        /* Wheel clock: reference time for calc_wheel_index(), advanced by forward_timer_base() */
        unsigned long                clk;
        /* Earliest known bucket expiry; lowered by enqueue_timer() */
        unsigned long                next_expiry;
        /* CPU passed to wake_up_nohz_cpu()/tick_nohz_full_cpu() for this base */
        unsigned int                cpu;
        /*
         * Set by detach_if_pending() when a bucket becomes empty, cleared by
         * enqueue_timer() when it installs a new next_expiry.
         */
        bool                        next_expiry_recalc;
        /* When set, trigger_dyntick_cpu() wakes the CPU for a new earliest non-deferrable timer */
        bool                        is_idle;
        /* Set by enqueue_timer() when it installs a new next_expiry */
        bool                        timers_pending;
        /* One bit per bucket: set on enqueue, cleared when the bucket's last timer is detached */
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        /* The buckets: one hlist of timers per wheel slot, indexed by the value stored via timer_set_idx() */
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/* Per-CPU array of NR_BASES timer wheels, indexed by BASE_STD/BASE_DEF */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * timers_nohz_active starts out false and is enabled by timer_update_keys().
 * timer_keys_mutex serializes timer_update_keys() and
 * timer_migration_handler().
 */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item used by timers_update_nohz() to run timer_update_keys() */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
/*
 * Timer migration knob, enabled (1) by default. It is combined with
 * tick_nohz_active in timers_update_migration().
 */
unsigned int sysctl_timer_migration = 1;

/* Static key consulted by get_target_base(); starts out disabled */
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/*
 * Bring the timers_migration_enabled static key in line with the current
 * configuration: it is enabled only when both the sysctl knob and NOHZ are
 * active, and disabled in every other case.
 */
static void timers_update_migration(void)
{
        if (!sysctl_timer_migration || !tick_nohz_active) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}
#else
/* !CONFIG_SMP: there is no migration key to update, so this is a no-op */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work handler of timer_update_work. Under timer_keys_mutex it re-evaluates
 * the migration key and enables the timers_nohz_active key.
 *
 * @work is not used.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Request an update of the timer static keys. The update itself runs later
 * in timer_update_keys(), which is scheduled here as timer_update_work.
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/*
 * sysctl-style handler: lets proc_dointvec_minmax() process the request and,
 * if that was a successful write, re-evaluates the migration static key.
 * Everything runs under timer_keys_mutex.
 *
 * Returns whatever proc_dointvec_minmax() returned.
 */
int timer_migration_handler(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (write && err == 0)
                timers_update_migration();

        mutex_unlock(&timer_keys_mutex);

        return err;
}

static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* !CONFIG_NO_HZ_COMMON: NOHZ handling for timers is never active */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

/*
 * Common worker for the round_jiffies*() family.
 *
 * Rounds the absolute jiffies value @j to a multiple of HZ, skewed per CPU.
 * With @force_up the value is always rounded up; otherwise it is rounded
 * down when it lies less than a quarter second past a whole second.
 *
 * Returns the rounded value, or the unmodified @j when the rounded value
 * would not lie in the future.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        /*
         * Per-CPU skew of 3 jiffies (a value inherited from the mm/ code) so
         * that not all CPUs fire their timers at once and hit the same lock
         * or cachelines. It is added before rounding and removed afterwards.
         */
        const unsigned long skew = cpu * 3;
        int frac;

        j += skew;

        /* Strip the part beyond the last whole second ... */
        frac = j % HZ;
        j -= frac;

        /*
         * ... and decide whether to go up to the next one. A target just
         * after a whole second (which can happen due to delays of the timer
         * irq, long irq off times etc.) is rounded down, using 1/4th second
         * as an extreme upper bound for "just after" - unless the caller
         * insists on rounding up.
         */
        if (force_up || frac >= HZ/4)
                j += HZ;

        /* Undo the skew now that rounding is done */
        j -= skew;

        /* Only hand out the rounded value if it is still in the future */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if the
 * rounded value would not be in the future.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
        /* force_up == false: rounding down is permitted */
        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. Use it for timers where the
 * exact firing time is of little importance, as long as they fire roughly
 * every X seconds.
 *
 * Timers rounded this way all fire together instead of being spread out,
 * so the CPU has to wake up less often, which saves power.
 *
 * The rounding is skewed per processor so that not all processors fire at
 * the very same time, which could cause lock contention or spurious cache
 * line bouncing.
 *
 * Return: the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /* Sample jiffies exactly once, it might change while we run */
        const unsigned long now = jiffies;
        unsigned long rounded_abs;

        rounded_abs = round_jiffies_common(now + j, cpu, false);

        return rounded_abs - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The per-processor skew is derived from raw_smp_processor_id().
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if the
 * rounded value would not be in the future.
 */
unsigned long round_jiffies(unsigned long j)
{
        return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The per-processor skew is derived from raw_smp_processor_id().
 *
 * Return: the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if the
 * rounded value would not be in the future.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
        /* force_up == true: never round down */
        return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * Behaves like __round_jiffies_relative() but never rounds down. Use it for
 * timeouts where the exact firing time is of little importance, as long as
 * they do not fire too early.
 *
 * Return: the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /* Sample jiffies exactly once, it might change while we run */
        const unsigned long now = jiffies;
        unsigned long rounded_abs;

        rounded_abs = round_jiffies_common(now + j, cpu, true);

        return rounded_abs - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if the
 * rounded value would not be in the future.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded version of the @j parameter.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/* Extract the wheel bucket index that is encoded in the timer flags. */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        unsigned int idx_bits = timer->flags & TIMER_ARRAYMASK;

        return idx_bits >> TIMER_ARRAYSHIFT;
}

/*
 * Store the wheel bucket index @idx in the timer flags, leaving all flag
 * bits outside of TIMER_ARRAYMASK untouched.
 */
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        unsigned int other_bits = timer->flags & ~TIMER_ARRAYMASK;

        timer->flags = other_bits | (idx << TIMER_ARRAYSHIFT);
}

/*
 * Compute the bucket (array index) for expiry time @expires in wheel
 * level @lvl. The time at which that bucket expires is stored in
 * @bucket_expiry.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        const unsigned shift = LVL_SHIFT(lvl);
        unsigned long slot;

        /*
         * A timer must never fire early, which could otherwise happen
         * because:
         * - the timer is armed at the edge of a tick
         * - the expiry time is truncated in the outer wheel levels
         *
         * Adding one granule of the level before truncating rounds up and
         * prevents both.
         */
        slot = (expires + LVL_GRAN(lvl)) >> shift;

        *bucket_expiry = slot << shift;

        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

/*
 * Select the wheel level by the distance between @expires and the wheel
 * clock @clk, and return the bucket index within the whole wheel. The
 * effective expiry time of that bucket is stored in @bucket_expiry.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        const unsigned long delta = expires - clk;

        if (delta < LVL_START(1))
                return calc_index(expires, 0, bucket_expiry);
        if (delta < LVL_START(2))
                return calc_index(expires, 1, bucket_expiry);
        if (delta < LVL_START(3))
                return calc_index(expires, 2, bucket_expiry);
        if (delta < LVL_START(4))
                return calc_index(expires, 3, bucket_expiry);
        if (delta < LVL_START(5))
                return calc_index(expires, 4, bucket_expiry);
        if (delta < LVL_START(6))
                return calc_index(expires, 5, bucket_expiry);
        if (delta < LVL_START(7))
                return calc_index(expires, 6, bucket_expiry);
        if (LVL_DEPTH > 8 && delta < LVL_START(8))
                return calc_index(expires, 7, bucket_expiry);

        /* Expiry lies before the wheel clock: use the current level 0 bucket */
        if ((long) delta < 0) {
                *bucket_expiry = clk;
                return clk & LVL_MASK;
        }

        /*
         * Force expire obscene large timeouts to expire at the
         * capacity limit of the wheel.
         */
        if (delta >= WHEEL_TIMEOUT_CUTOFF)
                expires = clk + WHEEL_TIMEOUT_MAX;

        return calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}

static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        if (!is_timers_nohz_active())
                return;

        /*
         * TODO: This wants some optimizing similar to the code below, but we
         * will do that when we switch from push to pull for deferrable timers.
         */
        if (timer->flags & TIMER_DEFERRABLE) {
                if (tick_nohz_full_cpu(base->cpu))
                        wake_up_nohz_cpu(base->cpu);
                return;
        }

        /*
         * We might have to IPI the remote CPU if the base is idle and the
         * timer is not deferrable. If the other CPU is on the way to idle
         * then it can't set base->is_idle as we hold the base lock:
         */
        if (base->is_idle)
                wake_up_nohz_cpu(base->cpu);
}

/*
 * Put @timer into bucket @idx of @base: link it into the bucket list, flag
 * the bucket as pending in the bitmap and remember the index in the timer
 * flags. If the timer becomes the first one to expire, the target CPU is
 * woken up if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        struct hlist_head *bucket = base->vectors + idx;

        hlist_add_head(&timer->entry, bucket);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, timer->expires, timer->flags);

        /*
         * Nothing more to do unless this is the new first expiring timer.
         * The comparison needs the effective expiry time of the timer
         * (bucket_expiry), not timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Record the new next expiry time and kick the CPU so it can
         * reevaluate the wheel.
         */
        base->next_expiry = bucket_expiry;
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

/*
 * Compute the wheel bucket for @timer relative to the clock of @base and
 * enqueue the timer there.
 */
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long effective_expiry;
        unsigned int slot;

        slot = calc_wheel_index(timer->expires, base->clk, &effective_expiry);
        enqueue_timer(base, timer, slot, effective_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/* Forward declaration: the fixup callbacks below reference the descriptor */
static const struct debug_obj_descr timer_debug_descr;

/* debugobjects hint callback: hands out the callback function of the timer */
static void *timer_debug_hint(void *addr)
{
        struct timer_list *timer = addr;

        return timer->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * The active timer is deleted synchronously and initialized again in the
 * debug object tracker. Returns true if a fixup was performed.
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        del_timer_sync(timer);
        debug_object_init(timer, &timer_debug_descr);
        return true;
}

/*
 * Stub timer callback for improperly used timers. It is installed by
 * timer_fixup_activate() and timer_fixup_assert_init() and only emits a
 * warning when it runs.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * An unknown timer is set up with the stub callback, which counts as a
 * fixup (returns true). Activating an already active timer only triggers a
 * warning; that and every other state return false.
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * The active timer is deleted synchronously and then released from the
 * debug object tracker. Returns true if a fixup was performed.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        del_timer_sync(timer);
        debug_object_free(timer, &timer_debug_descr);
        return true;
}

/*
 * fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 *
 * Such a timer is set up with the stub callback. Returns true if a fixup
 * was performed.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        timer_setup(timer, stub_timer, 0);
        return true;
}

/*
 * debugobjects descriptor for struct timer_list. It wires up the hint, the
 * static object check and the fixup callbacks defined above.
 */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report initialization of @timer to debugobjects */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report activation of @timer to debugobjects */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report deactivation of @timer to debugobjects */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Let debugobjects assert that @timer has been initialized */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/* Forward declaration, needed by init_timer_on_stack_key() below */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * Initialize a timer that lives on the stack: register it with debugobjects
 * as an on-stack object, then perform the regular initialization via
 * do_init_timer(). The parameters are passed through unchanged.
 */
void init_timer_on_stack_key(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

/* Release the debugobjects tracking of @timer */
void destroy_timer_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debugobjects hooks compile to nothing */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/* Timer initialization hook: debugobjects tracking plus the init tracepoint */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/* Timer deactivation hook: debugobjects tracking plus the cancel tracepoint */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/* Assert via debugobjects that @timer has been initialized */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

/*
 * Common timer initialization: marks the timer as not enqueued, stores the
 * callback and the flags and sets up the lockdep map.
 *
 * @timer: the timer to initialize
 * @func:  callback function stored in the timer
 * @flags: timer flags; only bits in TIMER_INIT_FLAGS are accepted
 * @name:  name handed to lockdep_init_map()
 * @key:   lock class key handed to lockdep_init_map()
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        /* detach_timer() resets pprev to NULL the same way when asked to clear pending */
        timer->entry.pprev = NULL;
        timer->function = func;
        /* Warn once about flags outside TIMER_INIT_FLAGS and drop them */
        if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
                flags &= TIMER_INIT_FLAGS;
        /* The current CPU number is OR-ed into the flags; get_timer_base() reads it back via TIMER_CPUMASK */
        timer->flags = flags | raw_smp_processor_id();
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        /* Run the debug/trace hooks first, then the actual initialization */
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

/*
 * Unlink @timer from its bucket list. With @clear_pending the pprev pointer
 * of the list entry is reset to NULL as well. The next pointer is poisoned
 * in any case.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        debug_deactivate(timer);

        __hlist_del(&timer->entry);

        if (clear_pending)
                timer->entry.pprev = NULL;

        timer->entry.next = LIST_POISON2;
}

/*
 * Detach @timer from @base if it is pending.
 *
 * Returns 1 if the timer was pending and has been detached, 0 otherwise.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        const unsigned int bucket = timer_get_idx(timer);

        if (timer_pending(timer)) {
                /*
                 * When the last timer of a bucket goes away, the bucket is
                 * no longer pending and the next expiry of the base has to
                 * be evaluated again.
                 */
                if (hlist_is_singular_node(&timer->entry,
                                           base->vectors + bucket)) {
                        __clear_bit(bucket, base->pending_map);
                        base->next_expiry_recalc = true;
                }

                detach_timer(timer, clear_pending);
                return 1;
        }

        return 0;
}

/*
 * Return the timer base of @cpu which matches the timer flags @tflags:
 * the deferrable base for deferrable timers if NO_HZ_COMMON is set,
 * the standard base otherwise.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        struct timer_base *std_base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);

        /* Only NO_HZ_COMMON kernels keep deferrable timers on their own base */
        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !(tflags & TIMER_DEFERRABLE))
                return std_base;

        return per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
}

/*
 * Return the timer base of the current CPU which matches the timer flags
 * @tflags: the deferrable base for deferrable timers if NO_HZ_COMMON is
 * set, the standard base otherwise.
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        struct timer_base *std_base = this_cpu_ptr(&timer_bases[BASE_STD]);

        /* Only NO_HZ_COMMON kernels keep deferrable timers on their own base */
        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !(tflags & TIMER_DEFERRABLE))
                return std_base;

        return this_cpu_ptr(&timer_bases[BASE_DEF]);
}

/*
 * Return the timer base a timer with flags @tflags belongs to. The CPU is
 * taken from the TIMER_CPUMASK bits of the flags.
 */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

/*
 * Select the base on which a timer with flags @tflags gets queued.
 *
 * On SMP kernels with NO_HZ_COMMON, a timer without TIMER_PINNED is placed
 * on the base of the CPU returned by get_nohz_timer_target() while timer
 * migration is enabled. In every other case the base of the current CPU is
 * used.
 *
 * @base is not used.
 */
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
        if (static_branch_likely(&timers_migration_enabled) &&
            !(tflags & TIMER_PINNED))
                return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
        return get_timer_this_cpu_base(tflags);
}

/*
 * Move base->clk forward towards the current jiffies value, but never
 * beyond base->next_expiry.
 */
static inline void forward_timer_base(struct timer_base *base)
{
        unsigned long now = READ_ONCE(jiffies);

        /*
         * Nothing to do when base->clk is not behind jiffies. While timers
         * are executed base->clk is one ahead of jiffies to avoid endless
         * requeuing to current jiffies, which is covered by this check too.
         */
        if ((long)(now - base->clk) < 1)
                return;

        /* Next expiry is still in the future: catch up with jiffies. */
        if (time_after(base->next_expiry, now)) {
                base->clk = now;
                return;
        }

        /*
         * Next expiry is at or before jiffies: forward only up to it. A
         * next expiry before base->clk is unexpected, so warn and leave
         * the clock alone.
         */
        if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                return;

        base->clk = base->next_expiry;
}


/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        /*
         * Retry until the base derived from a snapshot of timer->flags is
         * locked and the flags still match that snapshot. Returns the
         * locked base; the saved interrupt state is stored in @flags.
         */
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /*
                         * Flags unchanged under the lock: the snapshot was
                         * still valid, so this is the right base.
                         */
                        if (timer->flags == tf)
                                return base;
                        /* Flags changed meanwhile: drop the lock and retry */
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}

#define MOD_TIMER_PENDING_ONLY                0x01
#define MOD_TIMER_REDUCE                0x02
#define MOD_TIMER_NOTPENDING                0x04

/*
 * __mod_timer - common backend of mod_timer(), mod_timer_pending(),
 *               timer_reduce() and add_timer()
 * @timer:        The timer to be (re)armed
 * @expires:        New absolute timeout in jiffies
 * @options:        Bitmask of MOD_TIMER_* option bits
 *
 * Returns 1 if the timer was pending when it was looked at (whether it
 * was requeued or left in place), 0 if it was not pending.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        unsigned int idx = UINT_MAX;
        int ret = 0;

        BUG_ON(!timer->function);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                /* Same expiry: nothing to do (checked without the lock) */
                if (!diff)
                        return 1;
                /* Reduce mode and the timer already expires no later */
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                forward_timer_base(base);

                /* Repeat the reduce check now that the base is locked */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                forward_timer_base(base);
        }

        /* ret records whether the timer was queued before this call */
        ret = detach_if_pending(timer, base, false);
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_target_base(base, timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change timer's base while it is running,
                 * otherwise timer_delete_sync() can't detect that the timer's
                 * handler has not yet finished. This also guarantees that the
                 * timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        /*
                         * Store the new base CPU in the flags. This also
                         * drops TIMER_MIGRATING, assuming that bit is part
                         * of TIMER_BASEMASK - TODO confirm.
                         */
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
 *
 * Return:
 * * %0 - The timer was inactive and not modified
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        /* MOD_TIMER_PENDING_ONLY: an inactive timer is left inactive */
        return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * Return:
 * * %0 - The timer was inactive and started
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* No option bits: requeue a pending timer, start an inactive one */
        return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * Return:
 * * %0 - The timer was inactive and started
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          did not change the effective expiry time such that the
 *          timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        /* MOD_TIMER_REDUCE: a pending timer is only moved to an earlier expiry */
        return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        /*
         * Starting a timer which is already pending is a caller bug, but
         * one the kernel can recover from. As documented, reject the
         * request with a warning instead of crashing the machine.
         */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        /* MOD_TIMER_NOTPENDING: the timer is known to be inactive */
        __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        /*
         * A timer which is already pending or which has no callback
         * cannot be started. Both are caller bugs the kernel can survive,
         * so warn once and refuse rather than bringing the system down.
         */
        if (WARN_ON_ONCE(timer_pending(timer) || !timer->function))
                return;

        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        if (base != new_base) {
                timer->flags |= TIMER_MIGRATING;

                /* Switch locks; interrupts stay disabled throughout */
                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                /* Record the new CPU in the timer flags */
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * del_timer - Deactivate a timer.
 * @timer:        The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * It neither prevents rearming of the timer. If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int del_timer(struct timer_list *timer)
{
        struct timer_base *base;
        unsigned long flags;
        int detached;

        debug_assert_init(timer);

        /* Lockless check: a timer which is not queued needs no work */
        if (!timer_pending(timer))
                return 0;

        /* The pending state is evaluated again with the base locked */
        base = lock_timer_base(timer, &flags);
        detached = detach_if_pending(timer, base, true);
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return detached;
}
EXPORT_SYMBOL(del_timer);

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
        struct timer_base *base;
        unsigned long flags;
        int ret;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &flags);

        if (base->running_timer == timer) {
                /* The callback is executing right now: report failure */
                ret = -1;
        } else {
                ret = detach_if_pending(timer, base, true);
        }

        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT: initialize the expiry lock of @base. */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

/*
 * PREEMPT_RT: take the expiry lock of @base. __run_timers() holds it
 * while it expires the timers of the base.
 */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

/* PREEMPT_RT: release the expiry lock of @base. */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
{
        /* Nobody is waiting on the expiry lock: keep both locks held */
        if (!atomic_read(&base->timer_waiters))
                return;

        /*
         * Release base->lock and then expiry_lock, and take them back in
         * the reverse order, so a waiter gets a chance to grab expiry_lock
         * in between.
         */
        raw_spin_unlock_irq(&base->lock);
        spin_unlock(&base->expiry_lock);
        spin_lock(&base->expiry_lock);
        raw_spin_lock_irq(&base->lock);
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        u32 tf = READ_ONCE(timer->flags);
        struct timer_base *base;

        /* No base can be derived from the flags while the timer migrates */
        if (tf & TIMER_MIGRATING)
                return;

        base = get_timer_base(tf);

        /*
         * Flag the base as contended, then acquire the expiry lock which
         * the softirq holds across the timer callback. Release it right
         * away so that the softirq can go on with the next timer. The
         * timer might in theory be running again by then, but that is
         * more than unlikely and merely results in another wait loop.
         */
        atomic_inc(&base->timer_waiters);
        spin_lock_bh(&base->expiry_lock);
        atomic_dec(&base->timer_waiters);
        spin_unlock_bh(&base->expiry_lock);
}
#else
/*
 * !PREEMPT_RT: empty stubs for the expiry lock helpers and the
 * wait-for-running-callback helpers, so callers need no #ifdefs.
 */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * A lockdep backtrace which points here means that the caller
         * violates the synchronization rules documented for this function.
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Must not be used in hardirq context for a timer which is not
         * irqsafe, because that could lead to a deadlock.
         */
        WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));

        for (;;) {
                ret = try_to_del_timer_sync(timer);
                if (likely(ret >= 0))
                        break;

                /* The callback is running: wait for it, then try again */
                del_timer_wait_running(timer);
                cpu_relax();
        }

        return ret;
}
EXPORT_SYMBOL(timer_delete_sync);

/*
 * Invoke the timer callback @fn for @timer, wrapped in the expire
 * tracepoints, the lockdep annotation and a preempt count leak check.
 * @baseclk is only handed to the expire entry tracepoint.
 */
static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        /* Snapshot taken to detect a preempt count leak in the callback */
        int count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the timer from inside the
         * function that is called from it, this we need to take into
         * account for lockdep too. To avoid bogus "held lock freed"
         * warnings as well as problems when looking into
         * timer->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Couple the lock chain with the lock chain at
         * timer_delete_sync() by acquiring the lock_map around the fn()
         * call here and in timer_delete_sync().
         *
         * NOTE(review): lockdep_map is only declared with CONFIG_LOCKDEP;
         * presumably lock_map_acquire()/lock_map_release() do not evaluate
         * their argument otherwise - confirm.
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        if (count != preempt_count()) {
                WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                          fn, count, preempt_count());
                /*
                 * Restore the preempt count. That gives us a decent
                 * chance to survive and extract information. If the
                 * callback kept a lock held, bad luck, but not worse
                 * than the BUG() we had.
                 */
                preempt_count_set(count);
        }
}

/*
 * Run the callbacks of all timers queued on the list @head.
 *
 * base->lock is dropped around each callback invocation and held again
 * when the function returns.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * This value is required only for tracing. base->clk was
         * incremented directly before expire_timers was called. But expiry
         * is related to the old base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);

                timer = hlist_entry(head->first, struct timer_list, entry);

                /*
                 * Publish the timer as running before it is detached.
                 * try_to_del_timer_sync() checks base->running_timer.
                 */
                base->running_timer = timer;
                detach_timer(timer, true);

                /* Sample the callback while base->lock is still held */
                fn = timer->function;

                if (timer->flags & TIMER_IRQSAFE) {
                        /* Only the lock is dropped, no interrupt enable */
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                } else {
                        /* Callback runs with interrupts enabled */
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        /* Let a task waiting for the callback proceed */
                        timer_sync_wait_running(base);
                }
        }
}

static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long clk = base->clk = base->next_expiry;
        struct hlist_head *vec;
        int i, levels = 0;
        unsigned int idx;

        for (i = 0; i < LVL_DEPTH; i++) {
                idx = (clk & LVL_MASK) + i * LVL_SIZE;

                if (__test_and_clear_bit(idx, base->pending_map)) {
                        vec = base->vectors + idx;
                        hlist_move_list(vec, heads++);
                        levels++;
                }
                /* Is it time to look at the next level? */
                if (clk & LVL_CLK_MASK)
                        break;
                /* Shift clock for the next level granularity */
                clk >>= LVL_CLK_SHIFT;
        }
        return levels;
}

/*
 * Find the next pending bucket of a level. Search from level start (@offset)
 * + @clk upwards and if nothing there, search from start of the level
 * (@offset) up to @offset + clk.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned int first = offset + clk;
        unsigned int limit = offset + LVL_SIZE;
        unsigned int bit;

        /* Upper part of the level: [offset + clk, offset + LVL_SIZE) */
        bit = find_next_bit(base->pending_map, limit, first);
        if (bit < limit)
                return bit - first;

        /* Wrapped part of the level: [offset, offset + clk) */
        bit = find_next_bit(base->pending_map, first, offset);
        if (bit < first)
                return bit + LVL_SIZE - first;

        /* No bucket of this level is pending */
        return -1;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 */
static unsigned long __next_timer_interrupt(struct timer_base *base)
{
        unsigned long clk, next, adj;
        unsigned lvl, offset = 0;

        /*
         * Start out with the "nothing pending" value. If it is still in
         * place after the scan, timers_pending is cleared below.
         */
        next = base->clk + NEXT_TIMER_MAX_DELTA;
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
                unsigned long lvl_clk = clk & LVL_CLK_MASK;

                if (pos >= 0) {
                        unsigned long tmp = clk + (unsigned long) pos;

                        /* Scale the level clock back to base clock units */
                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * If the next expiration happens before we reach
                         * the next level, no need to check further.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
                 * bits are zero, we look at the next level as is. If not we
                 * need to advance it by one because that's going to be the
                 * next expiring bucket in that level. base->clk is the next
                 * expiring jiffie. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * we have to look at all levels @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * In case that the propagation wraps the next level the same
                 * rules apply:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * So after looking at LVL0 we get:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * So no propagation from LVL1 to LVL2 because that happened
                 * with the add already, but then we need to propagate further
                 * from LVL2 to LVL3.
                 *
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
                adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }

        /* The result is fresh: no recalculation required until a dequeue */
        base->next_expiry_recalc = false;
        base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);

        return next;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Check, if the next hrtimer event is before the next timer wheel
 * event:
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        /*
         * With high resolution timers enabled hrtimer_get_next_event()
         * returns KTIME_MAX.
         */
        u64 hr_next = hrtimer_get_next_event();
        u64 evt;

        if (expires <= hr_next) {
                /* The timer wheel event is not later than the hrtimer one */
                evt = expires;
        } else if (hr_next <= basem) {
                /*
                 * The next hrtimer is expired already. Hand back the tick
                 * base time so that the tick fires immediately.
                 */
                evt = basem;
        } else {
                /*
                 * High resolution timers are off, so hrtimers are expired
                 * in the tick. Round up to the next jiffie to make sure
                 * that this tick really expires the timer, which avoids a
                 * ping pong of the nohz stop code.
                 *
                 * DIV_ROUND_UP_ULL keeps gcc from calling __divdi3.
                 */
                evt = DIV_ROUND_UP_ULL(hr_next, TICK_NSEC) * TICK_NSEC;
        }

        return evt;
}

/**
 * get_next_timer_interrupt - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending
 * timer or KTIME_MAX if no timer is pending.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        /* Only the standard base of this CPU is evaluated */
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        u64 expires = KTIME_MAX;
        unsigned long nextevt;

        /*
         * Pretend that there is no timer pending if the cpu is offline.
         * Possible pending timers will be migrated later to an active cpu.
         */
        if (cpu_is_offline(smp_processor_id()))
                return expires;

        raw_spin_lock(&base->lock);
        /* Recompute next_expiry only when a dequeue invalidated it */
        if (base->next_expiry_recalc)
                base->next_expiry = __next_timer_interrupt(base);
        nextevt = base->next_expiry;

        /*
         * We have a fresh next event. Check whether we can forward the
         * base. We can only do that when @basej is past base->clk
         * otherwise we might rewind base->clk.
         */
        if (time_after(basej, base->clk)) {
                if (time_after(nextevt, basej))
                        base->clk = basej;
                else if (time_after(nextevt, base->clk))
                        base->clk = nextevt;
        }

        if (time_before_eq(nextevt, basej)) {
                /* The next event is due already: report the base time */
                expires = basem;
                base->is_idle = false;
        } else {
                /*
                 * Convert the jiffies delta to nanoseconds on top of
                 * @basem. Without pending timers expires stays KTIME_MAX.
                 */
                if (base->timers_pending)
                        expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
                /*
                 * If we expect to sleep more than a tick, mark the base idle.
                 * Also the tick is stopped so any added timer must forward
                 * the base clk itself to keep granularity small. This idle
                 * logic is only maintained for the BASE_STD base, deferrable
                 * timers may still see large granularity skew (by design).
                 */
                if ((expires - basem) > TICK_NSEC)
                        base->is_idle = true;
        }
        raw_spin_unlock(&base->lock);

        /* Take a possibly earlier hrtimer event into account */
        return cmp_next_hrtimer_event(basem, expires);
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        /*
         * We do this unlocked. The worst outcome is a remote enqueue sending
         * a pointless IPI, but taking the lock would just make the window for
         * sending the IPI a few instructions smaller for the cost of taking
         * the lock in the exit from idle path.
         */
        base->is_idle = false;
}
#endif

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;

        PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);

        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(p, user_tick);
        /* Runs the hrtimer queues and raises TIMER_SOFTIRQ when required */
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        /* irq_work is only ticked when called from hard interrupt context */
        if (in_irq())
                irq_work_tick();
#endif
        scheduler_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                run_posix_cpu_timers();
}

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 */
static inline void __run_timers(struct timer_base *base)
{
        /* One list head per wheel level, filled by collect_expired_timers() */
        struct hlist_head heads[LVL_DEPTH];
        int levels;

        /* Check done without base->lock: nothing is due before next_expiry */
        if (time_before(jiffies, base->next_expiry))
                return;

        timer_base_lock_expiry(base);
        raw_spin_lock_irq(&base->lock);

        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
                /*
                 * The two possible reasons for not finding any expired
                 * timer at this clk are that all matching timers have been
                 * dequeued or no timer has been queued since
                 * base::next_expiry was set to base::clk +
                 * NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                /*
                 * Step past the clock value which was just collected.
                 * expire_timers() uses base->clk - 1 as the expired clock.
                 */
                base->clk++;
                base->next_expiry = __next_timer_interrupt(base);

                /* Run the collected lists, last collected level first */
                while (levels--)
                        expire_timers(base, heads + levels);
        }
        raw_spin_unlock_irq(&base->lock);
        timer_base_unlock_expiry(base);
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        __run_timers(base);
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
                __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
void run_local_timers(void)
{
        struct timer_base *tb = this_cpu_ptr(&timer_bases[BASE_STD]);

        hrtimer_run_queues();

        /*
         * Raise the softirq only if required: when the standard base has
         * nothing due, the softirq can be skipped unless the next base in
         * the array (checked on NO_HZ_COMMON builds only) has work.
         */
        if (time_before(jiffies, tb->next_expiry)) {
                if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                        return;

                /* CPU is awake, so check the deferrable base. */
                tb += 1;
                if (time_before(jiffies, tb->next_expiry))
                        return;
        }

        raise_softirq(TIMER_SOFTIRQ);
}

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
        /* Timer whose callback (process_timeout()) recovers this structure. */
        struct timer_list timer;
        /* Task that process_timeout() wakes up. */
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long expire;

        switch (timeout)
        {
        case MAX_SCHEDULE_TIMEOUT:
                /*
                 * This special case exists purely for the convenience of
                 * the caller: schedule away without arming a timer.
                 * MAX_SCHEDULE_TIMEOUT could have been taken from one of
                 * the negative values, but returning a valid offset (>= 0)
                 * lets the caller do whatever it wants with the return
                 * value.
                 */
                schedule();
                goto out;
        default:
                /*
                 * Paranoia: a negative timeout is a caller bug. The return
                 * value will be 0, since no piece of kernel code is
                 * supposed to check for a negative return value of
                 * schedule_timeout() (it should never happen anyway). The
                 * printk() and stack dump tell you that something has gone
                 * wrong and where.
                 */
                if (timeout < 0) {
                        printk(KERN_ERR "schedule_timeout: wrong timeout "
                                "value %lx\n", timeout);
                        dump_stack();
                        /*
                         * Use the state helper, like the rest of this
                         * file, instead of writing current->state directly.
                         */
                        __set_current_state(TASK_RUNNING);
                        goto out;
                }
        }

        /* Absolute expiry time in jiffies. */
        expire = timeout + jiffies;

        /* The on-stack timer wakes this task via process_timeout(). */
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
        schedule();
        del_singleshot_timer_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        /* Remaining time; negative if the expiry has already passed. */
        timeout = expire - jiffies;

 out:
        /* Never hand a negative value back to the caller. */
        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        signed long remaining;

        /* Set TASK_INTERRUPTIBLE, then sleep via schedule_timeout(). */
        __set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

signed long __sched schedule_timeout_killable(signed long timeout)
{
        signed long remaining;

        /* Set TASK_KILLABLE, then sleep via schedule_timeout(). */
        __set_current_state(TASK_KILLABLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_killable);

signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        signed long remaining;

        /* Set TASK_UNINTERRUPTIBLE, then sleep via schedule_timeout(). */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/*
 * Like schedule_timeout_uninterruptible(), except this task will not contribute
 * to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        signed long remaining;

        /* Set TASK_IDLE, then sleep via schedule_timeout(). */
        __set_current_state(TASK_IDLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_idle);

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Drain @head: detach each timer, retarget its flags at @new_base's CPU and
 * queue it on @new_base.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        const int target_cpu = new_base->cpu;

        for (;;) {
                struct timer_list *t;

                if (hlist_empty(head))
                        break;

                t = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(t, false);
                /* Replace the base bits with the new CPU in a single store. */
                t->flags = (t->flags & ~TIMER_BASEMASK) | target_cpu;
                internal_add_timer(new_base, t);
        }
}

/*
 * Reset every timer base of @cpu: clock set to the current jiffies, next
 * expiry pushed out by NEXT_TIMER_MAX_DELTA, and the pending/idle markers
 * cleared. Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        int idx = 0;

        while (idx < NR_BASES) {
                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);

                tb->clk = jiffies;
                tb->next_expiry = tb->clk + NEXT_TIMER_MAX_DELTA;
                tb->timers_pending = false;
                tb->is_idle = false;
                idx++;
        }

        return 0;
}

/*
 * Move every timer still queued on the timer bases of the offline CPU @cpu
 * onto the matching bases obtained through get_cpu_ptr(). Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        struct timer_base *old_base;
        struct timer_base *new_base;
        int b, i;

        /* Only valid for a CPU that is no longer online. */
        BUG_ON(cpu_online(cpu));

        for (b = 0; b < NR_BASES; b++) {
                old_base = per_cpu_ptr(&timer_bases[b], cpu);
                new_base = get_cpu_ptr(&timer_bases[b]);
                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, deadlock is not possible.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The current CPUs base clock might be stale. Update it
                 * before moving the timers over.
                 */
                forward_timer_base(new_base);

                /* No timer callback may be running on the old base. */
                BUG_ON(old_base->running_timer);

                /* Drain every wheel bucket of the old base into the new one. */
                for (i = 0; i < WHEEL_SIZE; i++)
                        migrate_timer_list(new_base, old_base->vectors + i);

                /* Unlock in the reverse order of locking. */
                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                /* Pairs with get_cpu_ptr() above. */
                put_cpu_ptr(&timer_bases);
        }
        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/* Boot-time setup of every timer base belonging to @cpu. */
static void __init init_timer_cpu(int cpu)
{
        int idx = 0;

        while (idx < NR_BASES) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[idx], cpu);

                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                /* Start the clock at "now" with no expiry in sight. */
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                timer_base_init_expiry_lock(base);
                idx++;
        }
}

/* Run init_timer_cpu() for every possible CPU. */
static void __init init_timer_cpus(void)
{
        int cpu_id;

        for_each_possible_cpu(cpu_id) {
                init_timer_cpu(cpu_id);
        }
}

/*
 * Boot-time initialisation of the timer code: set up the per-CPU timer
 * bases, initialise the posix CPU timer work, and register
 * run_timer_softirq() as the TIMER_SOFTIRQ handler.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 */
void msleep(unsigned int msecs)
{
        unsigned long left;

        /*
         * Sleep for the converted time plus one jiffy, going back to sleep
         * with whatever remains each time schedule_timeout returns early.
         */
        for (left = msecs_to_jiffies(msecs) + 1; left;
             left = schedule_timeout_uninterruptible(left))
                ;
}

EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long left = msecs_to_jiffies(msecs) + 1;

        for (;;) {
                /* Done once the time is used up or a signal is pending. */
                if (!left || signal_pending(current))
                        break;
                left = schedule_timeout_interruptible(left);
        }

        /* Report the unslept remainder in milliseconds. */
        return jiffies_to_msecs(left);
}

EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range - Sleep for an approximate time
 * @min: Minimum time in usecs to sleep
 * @max: Maximum time in usecs to sleep
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range() instead of udelay().  The sleep improves responsiveness
 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
 * power usage by allowing hrtimers to take advantage of an already-
 * scheduled interrupt instead of scheduling a new one just for this sleep.
 */
void __sched usleep_range(unsigned long min, unsigned long max)
{
        /* Absolute deadline @min usecs from now, plus the allowed slack in ns. */
        ktime_t deadline = ktime_add_us(ktime_get(), min);
        const u64 slack_ns = (u64)(max - min) * NSEC_PER_USEC;

        /* Do not return before the requested sleep time has elapsed */
        do {
                __set_current_state(TASK_UNINTERRUPTIBLE);
        } while (schedule_hrtimeout_range(&deadline, slack_ns, HRTIMER_MODE_ABS));
}
EXPORT_SYMBOL(usleep_range);





























    1 
    1 














    1 

















    1 
    1 





    1 







































    1 


    1 


















    1 





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
// SPDX-License-Identifier: GPL-2.0-or-later
/* Filesystem parameter parser.
 *
 * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/namei.h>
#include "internal.h"

/* Spellings accepted by fs_param_is_bool(), terminated by an empty entry. */
static const struct constant_table bool_names[] = {
        { .name = "0",     .value = false },
        { .name = "1",     .value = true },
        { .name = "false", .value = false },
        { .name = "no",    .value = false },
        { .name = "true",  .value = true },
        { .name = "yes",   .value = true },
        { },
};

/*
 * Return the entry of @tbl whose name equals @name, or NULL if none does.
 * The table ends at the first entry with a NULL name.
 */
static const struct constant_table *
__lookup_constant(const struct constant_table *tbl, const char *name)
{
        const struct constant_table *entry = tbl;

        while (entry->name) {
                if (!strcmp(name, entry->name))
                        return entry;
                entry++;
        }

        return NULL;
}

/**
 * lookup_constant - Look up a constant by name in an ordered table
 * @tbl: The table of constants to search.
 * @name: The name to look up.
 * @not_found: The value to return if the name is not found.
 */
int lookup_constant(const struct constant_table *tbl, const char *name, int not_found)
{
        const struct constant_table *hit = __lookup_constant(tbl, name);

        /* Fall back to the caller-supplied default when @name is absent. */
        if (!hit)
                return not_found;

        return hit->value;
}
EXPORT_SYMBOL(lookup_constant);

/* A spec with no type handler is treated as a flag. */
static inline bool is_flag(const struct fs_parameter_spec *p)
{
        return !p->type;
}

/*
 * Find the spec in @desc matching @param->key.
 *
 * An exact name match whose flag-ness agrees with the parameter wins
 * outright. Failing that, a flag parameter spelled "no<name>" matches a
 * spec called <name> that carries fs_param_neg_with_no, and *@negated is
 * set. Otherwise the last exact-name match of the other kind (or NULL) is
 * returned.
 */
static const struct fs_parameter_spec *fs_lookup_key(
        const struct fs_parameter_spec *desc,
        struct fs_parameter *param, bool *negated)
{
        const struct fs_parameter_spec *spec;
        const struct fs_parameter_spec *fallback = NULL;
        const char *key = param->key;
        const bool flag_wanted = (param->type == fs_value_is_flag);

        *negated = false;

        /* Pass 1: exact name match. */
        for (spec = desc; spec->name; spec++) {
                if (strcmp(spec->name, key) == 0) {
                        if (likely(is_flag(spec) == flag_wanted))
                                return spec;
                        fallback = spec;
                }
        }

        /* Pass 2 only applies to flags spelled "no" plus at least one char. */
        if (!flag_wanted || key[0] != 'n' || key[1] != 'o' || !key[2])
                return fallback;

        for (spec = desc; spec->name; spec++) {
                if (strcmp(spec->name, key + 2) == 0 &&
                    (spec->flags & fs_param_neg_with_no)) {
                        *negated = true;
                        return spec;
                }
        }

        return fallback;
}

/*
 * __fs_parse - Parse a filesystem configuration parameter
 * @log: The log to report errors and warnings through.
 * @desc: The parameter description to use.
 * @param: The parameter.
 * @result: Where to place the result of the parse
 *
 * Parse a filesystem configuration parameter and attempt a conversion for a
 * simple parameter for which this is requested.  If successful, any converted
 * value is placed into an appropriate member of the union in @result; for a
 * flag parameter, @result->boolean is set to true unless the flag was given
 * in its negated "no" form, which is recorded in @result->negated.
 *
 * The function returns the parameter number if the parameter was matched,
 * -ENOPARAM if it wasn't matched, and an error (such as -EINVAL) if a value
 * was supplied for a flag or there was a conversion issue.
 */
int __fs_parse(struct p_log *log,
             const struct fs_parameter_spec *desc,
             struct fs_parameter *param,
             struct fs_parse_result *result)
{
        const struct fs_parameter_spec *spec;
        int err;

        /* Start from a zeroed result value. */
        result->uint_64 = 0;

        spec = fs_lookup_key(desc, param, &result->negated);
        if (!spec)
                return -ENOPARAM;

        if (spec->flags & fs_param_deprecated)
                warn_plog(log, "Deprecated parameter '%s'", param->key);

        /*
         * Typed parameters: let the spec's handler turn the value we were
         * given into the desired type, passing on any error it reports.
         */
        if (!is_flag(spec)) {
                err = spec->type(log, spec, param, result);
                if (err)
                        return err;
                return spec->opt;
        }

        /* Flags must not carry a value. */
        if (param->type != fs_value_is_flag)
                return inval_plog(log, "Unexpected value for '%s'",
                              param->key);

        result->boolean = !result->negated;
        return spec->opt;
}
EXPORT_SYMBOL(__fs_parse);

/**
 * fs_lookup_param - Look up a path referred to by a parameter
 * @fc: The filesystem context to log errors through.
 * @param: The parameter.
 * @want_bdev: T if want a blockdev
 * @_path: The result of the lookup
 */
int fs_lookup_param(struct fs_context *fc,
                    struct fs_parameter *param,
                    bool want_bdev,
                    struct path *_path)
{
        const unsigned int lookup_flags = 0;
        struct filename *fname;
        bool owned;        /* true when fname was created here via getname_kernel() */
        int err;

        if (param->type == fs_value_is_string) {
                fname = getname_kernel(param->string);
                if (IS_ERR(fname))
                        return PTR_ERR(fname);
                owned = true;
        } else if (param->type == fs_value_is_filename) {
                fname = param->name;
                owned = false;
        } else {
                return invalf(fc, "%s: not usable as path", param->key);
        }

        fname->refcnt++; /* filename_lookup() drops our ref. */
        err = filename_lookup(param->dirfd, fname, lookup_flags, _path, NULL);
        if (err < 0) {
                errorf(fc, "%s: Lookup failure for '%s'", param->key, fname->name);
        } else if (want_bdev &&
                   !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
                /* Found something, but not a block device: undo the lookup. */
                path_put(_path);
                _path->dentry = NULL;
                _path->mnt = NULL;
                errorf(fc, "%s: Non-blockdev passed as '%s'",
                       param->key, fname->name);
                err = -ENOTBLK;
        }

        if (owned)
                putname(fname);
        return err;
}
EXPORT_SYMBOL(fs_lookup_param);

/*
 * Log a "Bad value" message naming the parameter's key through @log and
 * return the result of inval_plog().
 */
static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
{
        return inval_plog(log, "Bad value for '%s'", param->key);
}

/*
 * Type handler: accept a string listed in bool_names and store its truth
 * value in @result->boolean. Returns 0 on success, otherwise the
 * fs_param_bad_value() error.
 */
int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        int val;

        if (param->type == fs_value_is_string) {
                val = lookup_constant(bool_names, param->string, -1);
                if (val != -1) {
                        result->boolean = val;
                        return 0;
                }
        }

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_bool);

/*
 * Type handler: parse the string value as an unsigned int into
 * @result->uint_32, using @p->data as the numeric base.
 */
int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        int base = (unsigned long)p->data;

        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (kstrtouint(param->string, base, &result->uint_32) < 0)
                return fs_param_bad_value(log, param);

        return 0;
}
EXPORT_SYMBOL(fs_param_is_u32);

/*
 * Type handler: parse the string value as a signed int into
 * @result->int_32 (base 0 is passed to kstrtoint()).
 */
int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (kstrtoint(param->string, 0, &result->int_32) < 0)
                return fs_param_bad_value(log, param);

        return 0;
}
EXPORT_SYMBOL(fs_param_is_s32);

/*
 * Type handler: parse the string value as an unsigned 64-bit number into
 * @result->uint_64 (base 0 is passed to kstrtoull()).
 */
int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (kstrtoull(param->string, 0, &result->uint_64) < 0)
                return fs_param_bad_value(log, param);

        return 0;
}
EXPORT_SYMBOL(fs_param_is_u64);

int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        const struct constant_table *c;
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);
        c = __lookup_constant(p->data, param->string);
        if (!c)
                return fs_param_bad_value(log, param);
        result->uint_32 = c->value;
        return 0;
}
EXPORT_SYMBOL(fs_param_is_enum);

/* Type handler: accept any non-empty string value; nothing is converted. */
int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p,
                       struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0')
                return fs_param_bad_value(log, param);

        return 0;
}
EXPORT_SYMBOL(fs_param_is_string);

/* Type handler: accept only a value of type fs_value_is_blob. */
int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type == fs_value_is_blob)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_blob);

/*
 * Type handler: obtain a descriptor number either by parsing a string value
 * or from @param->dirfd for a file value, and store it in @result->uint_32.
 * Values above INT_MAX are rejected.
 */
int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
                  struct fs_parameter *param, struct fs_parse_result *result)
{
        bool have_fd = false;

        if (param->type == fs_value_is_string) {
                have_fd = kstrtouint(param->string, 0, &result->uint_32) >= 0;
        } else if (param->type == fs_value_is_file) {
                result->uint_32 = param->dirfd;
                have_fd = true;
        }

        if (have_fd && result->uint_32 <= INT_MAX)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_fd);

/*
 * Type handler for block-device parameters. It performs no checking or
 * conversion of @param and always returns 0; none of its arguments are used.
 */
int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p,
                  struct fs_parameter *param, struct fs_parse_result *result)
{
        return 0;
}
EXPORT_SYMBOL(fs_param_is_blockdev);

/*
 * Type handler for path parameters. It performs no checking or conversion
 * of @param and always returns 0; none of its arguments are used.
 */
int fs_param_is_path(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        return 0;
}
EXPORT_SYMBOL(fs_param_is_path);

#ifdef CONFIG_VALIDATE_FS_PARSER
/**
 * validate_constant_table - Validate a constant table
 * @tbl: The constant table to validate.
 * @tbl_size: The size of the table.
 * @low: The lowest permissible value.
 * @high: The highest permissible value.
 * @special: One special permissible value outside of the range.
 *
 * Returns true if no problem was found (an empty table only triggers a
 * warning), false otherwise.
 */
bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
                             int low, int high, int special)
{
        bool ok = true;
        size_t idx;

        /* An empty table is reported but not treated as a failure. */
        if (!tbl_size) {
                pr_warn("VALIDATE C-TBL: Empty\n");
                return true;
        }

        for (idx = 0; idx < tbl_size; idx++) {
                const struct constant_table *cur = &tbl[idx];

                if (!cur->name) {
                        pr_err("VALIDATE C-TBL[%zu]: Null\n", idx);
                        ok = false;
                } else if (idx > 0 && tbl[idx - 1].name) {
                        /* Names must be strictly ascending from one entry to the next. */
                        const struct constant_table *prev = cur - 1;
                        int order = strcmp(prev->name, cur->name);

                        if (order == 0) {
                                pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
                                       idx, cur->name);
                                ok = false;
                        } else if (order > 0) {
                                pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
                                       idx, prev->name, cur->name);
                                ok = false;
                        }
                }

                /* The value must be @special or lie within [low, high]. */
                if (cur->value != special &&
                    (cur->value < low || cur->value > high)) {
                        pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
                               idx, cur->name, cur->value, low, high);
                        ok = false;
                }
        }

        return ok;
}

/**
 * fs_validate_description - Validate a parameter description
 * @name: The name to use in error reports.
 * @desc: The parameter description to validate.
 */
bool fs_validate_description(const char *name,
        const struct fs_parameter_spec *desc)
{
        const struct fs_parameter_spec *cur, *earlier;
        bool ok = true;

        for (cur = desc; cur->name; cur++) {
                /* Compare against every earlier entry to find duplicate names. */
                for (earlier = desc; earlier < cur; earlier++) {
                        if (strcmp(cur->name, earlier->name) != 0)
                                continue;
                        /* Same name is allowed when one is a flag and the other is not. */
                        if (is_flag(cur) != is_flag(earlier))
                                continue;
                        pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
                               name, cur->name);
                        ok = false;
                }
        }

        return ok;
}
#endif /* CONFIG_VALIDATE_FS_PARSER */



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 























    1 


    1 













    1 






























































































































































































































































    1 






    1 



    1 
    1 


    1 
    1 



    1 


    1 
    1 
    1 







    1 







    1 
    1 





    1 

    1 





    1 


    1 











    1 

    1 














































































    1 





    1 






























    1 










    1 





    1 























    1 


























































































    1 










    1 



    1 

































































    1 






















    1 








    1 







    1 

    1 









    1 



















































































    1 


    1 










    1 


















    1 



    1 













    1 
    1 

    1 






    1 




    1 


    1 
    1 
    1 
    1 

    1 









































































































































    1 

    1 
    1 

    1 


    1 




    1 







    1 






    1 





    1 







    1 


    1 






    1 





    1 





    1 










    1 

    1 













    1 
    1 
























    1 
















    1 






















































































































































































































































































































































































































































































































































































































    1 

















    1 

    1 








    1 


    1 












    1 








    1 








    1 
    1 
    1 









    1 
    1 


    1 







    1 

    1 








    1 

    1 



    1 














    1 

    1 

































































































































































































































































































    1 










    1 


    1 
    1 






    1 












    1 













































































































    1 










































































































































































































































































    1 

    1 























    1 














    1 





    1 





    1 

















































    1 

    1 
    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 
























    1 































    1 















































































































































































































    1 





























    1 






    1 















    1 
















    1 

    1 
    1 






























    1 






    1 




















    1 
    1 





    1 




    1 











    1 


    1 






    1 

    1 






    1 















































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































    1 













    1 

































    1 
    1 



    1 




    1 
    1 

    1 




    1 






    1 











    1 































































































    1 













    1 




    1 














    1 



















































































    1 




    1 

    1 









    1 












































































    1 






    1 




    1 







    1 












    1 



    1 









    1 


    1 
    1 





















    1 




































    1 




    1 


    1 


    1 













    1 
    1 
    1 



    1 
    1 






    1 





































    1 




    1 










    1 




















    1 
















    1 

    1 

    1 
    1 








    1 







    1 



















    1 












    1 














    1 


    1 


    1 















    1 




    1 











    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



























































































































































































































































    1 








    1 










    1 



    1 




    1 








    1 














    1 



    1 


































    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *        git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from data
 * shared between the kernel and application. This is done both for ordering
 * purposes and to ensure that once a value is loaded from data that the
 * application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../fs/internal.h"
#include "io-wq.h"

/*
 * Ring sizing limits. IORING_MAX_CQ_ENTRIES is defined as twice
 * IORING_MAX_ENTRIES (i.e. 65536).
 * NOTE(review): presumably IORING_MAX_ENTRIES bounds the SQ ring size and
 * IORING_SQPOLL_CAP_ENTRIES_VALUE caps per-iteration SQPOLL submissions;
 * the users of these constants are outside this view - confirm there.
 */
#define IORING_MAX_ENTRIES        32768
#define IORING_MAX_CQ_ENTRIES        (2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/*
 * Only upper bounds are defined here (no minimums).
 * IORING_MAX_FIXED_FILES is 32768; IORING_MAX_RESTRICTIONS is the sum of the
 * three *_LAST enumerators from the uapi header.
 */
#define IORING_MAX_FIXED_FILES        (1U << 15)
#define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

/*
 * Resource tag table geometry. IO_RSRC_TAG_TABLE_MAX evaluates to
 * PAGE_SIZE / 8, and since it is a power of two, IO_RSRC_TAG_TABLE_MASK
 * (MAX - 1) selects the low IO_RSRC_TAG_TABLE_SHIFT bits of an index.
 * NOTE(review): the "- 3" presumably reflects 8-byte tag entries, so that
 * one table fills exactly one page - confirm against the tag allocator.
 */
#define IO_RSRC_TAG_TABLE_SHIFT        (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX        (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK        (IO_RSRC_TAG_TABLE_MAX - 1)

/* 16384; presumably the cap on registered buffers - TODO confirm at use site. */
#define IORING_MAX_REG_BUFFERS        (1U << 14)

/*
 * SQE_VALID_FLAGS: bitwise OR of the six IOSQE_* flags listed below.
 * IO_REQ_CLEAN_FLAGS: bitwise OR of the five REQ_F_* flags listed below.
 * NOTE(review): by name these look like "flags accepted in an SQE" and
 * "request flags that require cleanup work"; the checks that use them are
 * not visible here - verify before relying on that reading.
 */
#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|        \
                                IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
                                IOSQE_BUFFER_SELECT)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
                                REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)

/* 1024; presumably a batch size for cached task-context refs - TODO confirm. */
#define IO_TCTX_REFS_CACHE_NR        (1U << 10)

/*
 * Head/tail index pair for a single ring. struct io_rings embeds one of
 * these for the submission queue and one for the completion queue.
 * Both members carry ____cacheline_aligned_in_smp, so on SMP builds head and
 * tail are each aligned to a cache line boundary.
 */
struct io_uring {
        u32 head ____cacheline_aligned_in_smp;
        u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 *
 * The layout ends in a flexible array of completion entries, so 'cqes' must
 * remain the last member.
 */
struct io_rings {
        /*
         * Head and tail offsets into the ring; the offsets need to be
         * masked to get valid indices.
         *
         * The kernel controls head of the sq ring and the tail of the cq ring,
         * and the application controls tail of the sq ring and the head of the
         * cq ring.
         */
        struct io_uring                sq, cq;
        /*
         * Bitmasks to apply to head and tail offsets (constant, equals
         * ring_entries - 1)
         */
        u32                        sq_ring_mask, cq_ring_mask;
        /* Ring sizes (constant, power of 2) */
        u32                        sq_ring_entries, cq_ring_entries;
        /*
         * Number of invalid entries dropped by the kernel due to
         * invalid index stored in array
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * After a new SQ head value was read by the application this
         * counter includes all submissions that were dropped reaching
         * the new SQ head (and possibly more).
         */
        u32                        sq_dropped;
        /*
         * Runtime SQ flags
         *
         * Written by the kernel, shouldn't be modified by the
         * application.
         *
         * The application needs a full memory barrier before checking
         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
         */
        u32                        sq_flags;
        /*
         * Runtime CQ flags
         *
         * Written by the application, shouldn't be modified by the
         * kernel.
         */
        u32                        cq_flags;
        /*
         * Number of completion events lost because the queue was full;
         * this should be avoided by the application by making sure
         * there are not more requests pending than there is space in
         * the completion queue.
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * As completion events come in out of order this counter is not
         * ordered with any other data.
         */
        u32                        cq_overflow;
        /*
         * Ring buffer of completion events.
         *
         * The kernel writes completion events fresh every time they are
         * produced, so the application is allowed to modify pending
         * entries.
         *
         * Flexible array member, cache line aligned on SMP builds.
         */
        struct io_uring_cqe        cqes[] ____cacheline_aligned_in_smp;
};

/*
 * Flag bits with distinct power-of-two values, so they can be OR'ed together.
 * NOTE(review): presumably passed as the issue flags of the per-opcode
 * handlers (non-blocking attempt / deferred completion) -- the users are not
 * visible in this part of the file, confirm there.
 */
enum io_uring_cmd_flags {
        IO_URING_F_NONBLOCK                = 1,
        IO_URING_F_COMPLETE_DEFER        = 2,
};

/*
 * Descriptor of one mapped user buffer; referenced from
 * io_ring_ctx.user_bufs and io_kiocb.imu.
 * NOTE(review): field meanings below are inferred from names -- confirm
 * against the registration code.
 */
struct io_mapped_ubuf {
        /* presumably the user address range [ubuf, ubuf_end) */
        u64                ubuf;
        u64                ubuf_end;
        /* presumably the number of entries in bvec[] */
        unsigned int        nr_bvecs;
        unsigned long        acct_pages;
        /* flexible array member, must stay last */
        struct bio_vec        bvec[];
};

/* Forward declaration; needed by rsrc_put_fn and struct io_rsrc_data below. */
struct io_ring_ctx;

/*
 * A completion queue entry plus list linkage.
 * NOTE(review): presumably queued on io_ring_ctx.cq_overflow_list when the
 * CQ ring has no room -- confirm against the overflow handling code.
 */
struct io_overflow_cqe {
        struct io_uring_cqe cqe;
        struct list_head list;
};

/* One slot of a fixed file table (element type of io_file_table.files). */
struct io_fixed_file {
        /* file * with additional FFS_* flags */
        unsigned long file_ptr;
};

/*
 * One resource handed to a rsrc_put_fn callback, with list linkage and the
 * resource's u64 tag.
 * NOTE(review): presumably linked on io_rsrc_node.rsrc_list until it is
 * released -- confirm.
 */
struct io_rsrc_put {
        struct list_head list;
        u64 tag;
        /* the same pointer, viewed generically or as one concrete type */
        union {
                void *rsrc;
                struct file *file;
                struct io_mapped_ubuf *buf;
        };
};

/*
 * Table of fixed file slots; embedded in io_ring_ctx as 'file_table'.
 * NOTE(review): the slot count is not stored here; presumably it is
 * io_ring_ctx.nr_user_files -- confirm.
 */
struct io_file_table {
        struct io_fixed_file *files;
};

/*
 * Reference-counted resource node, pointing back at its io_rsrc_data.
 * io_ring_ctx keeps a current node (rsrc_node) and a spare (rsrc_backup_node).
 * NOTE(review): the roles noted per field are inferred from matching names in
 * io_ring_ctx (rsrc_ref_list, rsrc_put_llist) -- confirm.
 */
struct io_rsrc_node {
        struct percpu_ref                refs;
        /* presumably linkage on ctx->rsrc_ref_list */
        struct list_head                node;
        /* presumably a list of struct io_rsrc_put */
        struct list_head                rsrc_list;
        struct io_rsrc_data                *rsrc_data;
        /* presumably linkage on ctx->rsrc_put_llist */
        struct llist_node                llist;
        bool                                done;
};

/*
 * Function type (not a pointer type) of the per-resource "put" callback;
 * struct io_rsrc_data stores a pointer to one in 'do_put'.
 */
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

/*
 * Bookkeeping for one class of resources owned by a ring; io_ring_ctx holds
 * one for files (file_data) and one for buffers (buf_data).
 */
struct io_rsrc_data {
        struct io_ring_ctx                *ctx;

        /*
         * Two-level array of u64 tags.
         * NOTE(review): presumably indexed using the IO_RSRC_TAG_TABLE_*
         * macros -- confirm.
         */
        u64                                **tags;
        unsigned int                        nr;
        /* callback used to put a single resource */
        rsrc_put_fn                        *do_put;
        atomic_t                        refs;
        struct completion                done;
        bool                                quiesce;
};

/*
 * One buffer record: address, length and a 16-bit buffer id, with list
 * linkage. Referenced from io_kiocb.kbuf and io_sr_msg.kbuf.
 * NOTE(review): presumably a buffer supplied via struct io_provide_buf and
 * kept in io_ring_ctx.io_buffers -- confirm.
 */
struct io_buffer {
        struct list_head list;
        __u64 addr;
        __u32 len;
        __u16 bid;
};

/*
 * Restriction state of a ring (embedded in io_ring_ctx as 'restrictions'):
 * one bitmap with IORING_REGISTER_LAST bits, one with IORING_OP_LAST bits,
 * and two SQE flag masks.
 */
struct io_restriction {
        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
        u8 sqe_flags_allowed;
        u8 sqe_flags_required;
        bool registered;
};

/*
 * Values 0 and 1.
 * NOTE(review): presumably bit numbers within io_sq_data.state -- confirm.
 */
enum {
        IO_SQ_THREAD_SHOULD_STOP = 0,
        IO_SQ_THREAD_SHOULD_PARK,
};

/*
 * State of a submission-queue polling thread; a ring that uses SQ thread
 * polling points at one through io_ring_ctx.sq_data. It is reference counted
 * and carries a list of rings, so one instance can serve several rings.
 */
struct io_sq_data {
        refcount_t                refs;
        atomic_t                park_pending;
        struct mutex                lock;

        /* ctx's that are using this sqd */
        struct list_head        ctx_list;

        struct task_struct        *thread;
        struct wait_queue_head        wait;

        unsigned                sq_thread_idle;
        int                        sq_cpu;
        pid_t                        task_pid;
        pid_t                        task_tgid;

        /* NOTE(review): presumably holds the IO_SQ_THREAD_* bits -- confirm */
        unsigned long                state;
        struct completion        exited;
};

/*
 * Sizes used by struct io_submit_state: IO_COMPL_BATCH dimensions
 * compl_reqs[] and IO_REQ_CACHE_SIZE dimensions reqs[].
 * NOTE(review): IO_REQ_ALLOC_BATCH is presumably the number of requests
 * allocated per cache refill -- confirm against the allocator.
 */
#define IO_COMPL_BATCH                        32
#define IO_REQ_CACHE_SIZE                32
#define IO_REQ_ALLOC_BATCH                8

/*
 * First and last request of a chain of requests; embedded in
 * struct io_submit_state as 'link'.
 */
struct io_submit_link {
        struct io_kiocb                *head;
        struct io_kiocb                *last;
};

/*
 * Per-ring submission state, embedded in io_ring_ctx as 'submit_state':
 * block plug, current link chain, a request allocation cache and a batch of
 * requests awaiting completion.
 */
struct io_submit_state {
        struct blk_plug                plug;
        struct io_submit_link        link;

        /*
         * io_kiocb alloc cache
         */
        void                        *reqs[IO_REQ_CACHE_SIZE];
        unsigned int                free_reqs;

        bool                        plug_started;

        /*
         * Batch completion logic
         */
        struct io_kiocb                *compl_reqs[IO_COMPL_BATCH];
        unsigned int                compl_nr;
        /* inline/task_work completion list, under ->uring_lock */
        struct list_head        free_list;

        unsigned int                ios_left;
};

/*
 * Per-ring context. Members are grouped into anonymous structs, several of
 * them marked ____cacheline_aligned_in_smp so that each group starts on a
 * cache line boundary on SMP builds: read-mostly data, submission data,
 * completion data, and poll/cancel data. Slow-path resource data and
 * exit/accounting data come last.
 */
struct io_ring_ctx {
        /* const or read-mostly hot data */
        struct {
                struct percpu_ref        refs;

                struct io_rings                *rings;
                unsigned int                flags;
                unsigned int                compat: 1;
                unsigned int                drain_next: 1;
                unsigned int                eventfd_async: 1;
                unsigned int                restricted: 1;
                unsigned int                off_timeout_used: 1;
                unsigned int                drain_active: 1;
        } ____cacheline_aligned_in_smp;

        /* submission data */
        struct {
                struct mutex                uring_lock;

                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
                 * mmapped by the application using the IORING_OFF_SQES offset.
                 *
                 * This indirection could e.g. be used to assign fixed
                 * io_uring_sqe entries to operations and only submit them to
                 * the queue when needed.
                 *
                 * The kernel modifies neither the indices array nor the entries
                 * array.
                 */
                u32                        *sq_array;
                struct io_uring_sqe        *sq_sqes;
                unsigned                cached_sq_head;
                unsigned                sq_entries;
                struct list_head        defer_list;

                /*
                 * Fixed resources fast path, should be accessed only under
                 * uring_lock, and updated through io_uring_register(2)
                 */
                struct io_rsrc_node        *rsrc_node;
                struct io_file_table        file_table;
                unsigned                nr_user_files;
                unsigned                nr_user_bufs;
                struct io_mapped_ubuf        **user_bufs;

                struct io_submit_state        submit_state;
                struct list_head        timeout_list;
                struct list_head        ltimeout_list;
                struct list_head        cq_overflow_list;
                struct xarray                io_buffers;
                struct xarray                personalities;
                u32                        pers_next;
                unsigned                sq_thread_idle;
        } ____cacheline_aligned_in_smp;

        /* IRQ completion list, under ->completion_lock */
        struct list_head        locked_free_list;
        unsigned int                locked_free_nr;

        const struct cred        *sq_creds;        /* cred used for __io_sq_thread() */
        struct io_sq_data        *sq_data;        /* if using sq thread polling */

        struct wait_queue_head        sqo_sq_wait;
        struct list_head        sqd_list;

        unsigned long                check_cq_overflow;

        /* completion data */
        struct {
                unsigned                cached_cq_tail;
                unsigned                cq_entries;
                struct eventfd_ctx        *cq_ev_fd;
                struct wait_queue_head        poll_wait;
                struct wait_queue_head        cq_wait;
                unsigned                cq_extra;
                atomic_t                cq_timeouts;
                unsigned                cq_last_tm_flush;
        } ____cacheline_aligned_in_smp;

        struct {
                spinlock_t                completion_lock;

                spinlock_t                timeout_lock;

                /*
                 * ->iopoll_list is protected by the ctx->uring_lock for
                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
                struct list_head        iopoll_list;
                struct hlist_head        *cancel_hash;
                unsigned                cancel_hash_bits;
                bool                        poll_multi_queue;
        } ____cacheline_aligned_in_smp;

        struct io_restriction                restrictions;

        /* slow path rsrc auxiliary data, used by update/register */
        struct {
                struct io_rsrc_node                *rsrc_backup_node;
                struct io_mapped_ubuf                *dummy_ubuf;
                struct io_rsrc_data                *file_data;
                struct io_rsrc_data                *buf_data;

                struct delayed_work                rsrc_put_work;
                struct llist_head                rsrc_put_llist;
                struct list_head                rsrc_ref_list;
                spinlock_t                        rsrc_ref_lock;
        };

        /* Keep this last, we don't need it for the fast path */
        struct {
                /* hashed buffered write serialization */
                struct io_wq_hash                *hash_map;

                /* Only used for accounting purposes */
                struct user_struct                *user;
                struct mm_struct                *mm_account;

                /* ctx exit and cancelation */
                struct llist_head                fallback_llist;
                struct delayed_work                fallback_work;
                struct work_struct                exit_work;
                struct list_head                tctx_list;
                struct completion                ref_comp;
                u32                                iowq_limits[2];
                bool                                iowq_limits_set;
        };
};

/*
 * io_uring state kept per task: submission-side bookkeeping first, then the
 * spinlock-protected task_work list.
 * NOTE(review): which task field points here is not visible in this part of
 * the file -- confirm.
 */
struct io_uring_task {
        /* submission side */
        int                        cached_refs;
        struct xarray                xa;
        struct wait_queue_head        wait;
        const struct io_ring_ctx *last;
        struct io_wq                *io_wq;
        struct percpu_counter        inflight;
        atomic_t                inflight_tracked;
        atomic_t                in_idle;

        /* NOTE(review): task_lock presumably guards task_list/task_running -- confirm */
        spinlock_t                task_lock;
        struct io_wq_work_list        task_list;
        struct callback_head        task_work;
        bool                        task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 *
 * Poll state of a request: the wait queue head being waited on, the
 * requested event mask and the wait queue entry. Used as the 'poll' member
 * of the io_kiocb union and inside struct async_poll.
 */
struct io_poll_iocb {
        struct file                        *file;
        struct wait_queue_head                *head;
        __poll_t                        events;
        int                                retries;
        struct wait_queue_entry                wait;
};

/*
 * Per-request data for the 'poll_update' member of the io_kiocb union.
 * 'file' must stay first. The two bools record which of events/user_data
 * are to be updated.
 */
struct io_poll_update {
        struct file                        *file;
        u64                                old_user_data;
        u64                                new_user_data;
        __poll_t                        events;
        bool                                update_events;
        bool                                update_user_data;
};

/*
 * Per-request data for the 'close' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_close {
        struct file                        *file;
        int                                fd;
        u32                                file_slot;
};

/*
 * Timer state of a timeout request: back-pointer to the request, the
 * hrtimer, and the timespec/mode it is armed with. Not a member of the
 * io_kiocb union.
 * NOTE(review): presumably allocated as the request's async_data -- confirm.
 */
struct io_timeout_data {
        struct io_kiocb                        *req;
        struct hrtimer                        timer;
        struct timespec64                ts;
        enum hrtimer_mode                mode;
        u32                                flags;
};

/*
 * Per-request data for the 'accept' member of the io_kiocb union.
 * 'file' must stay first. addr and addr_len are user-space pointers.
 */
struct io_accept {
        struct file                        *file;
        struct sockaddr __user                *addr;
        int __user                        *addr_len;
        int                                flags;
        u32                                file_slot;
        unsigned long                        nofile;
};

/*
 * Per-request data for the 'sync' member of the io_kiocb union.
 * 'file' must stay first.
 * NOTE(review): presumably shared by several sync-like opcodes -- confirm.
 */
struct io_sync {
        struct file                        *file;
        loff_t                                len;
        loff_t                                off;
        int                                flags;
        int                                mode;
};

/*
 * Per-request data for the 'cancel' member of the io_kiocb union.
 * 'file' must stay first.
 * NOTE(review): 'addr' presumably carries the user_data of the request to
 * cancel -- confirm.
 */
struct io_cancel {
        struct file                        *file;
        u64                                addr;
};

/*
 * Per-request data for the 'timeout' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_timeout {
        struct file                        *file;
        u32                                off;
        u32                                target_seq;
        struct list_head                list;
        /* head of the link, used by linked timeouts only */
        struct io_kiocb                        *head;
        /* for linked completions */
        struct io_kiocb                        *prev;
};

/*
 * Per-request data for the 'timeout_rem' member of the io_kiocb union.
 * 'file' must stay first.
 * NOTE(review): 'addr' presumably identifies the target timeout by its
 * user_data -- confirm.
 */
struct io_timeout_rem {
        struct file                        *file;
        u64                                addr;

        /* timeout update */
        struct timespec64                ts;
        u32                                flags;
        bool                                ltimeout;
};

/*
 * Per-request data for the 'rw' member of the io_kiocb union. Unlike the
 * other members it has no explicit file pointer; the embedded kiocb
 * provides it.
 */
struct io_rw {
        /* NOTE: kiocb has the file as the first member, so don't do it here */
        struct kiocb                        kiocb;
        u64                                addr;
        u64                                len;
};

/*
 * Per-request data for the 'connect' member of the io_kiocb union.
 * 'file' must stay first. addr is a user-space pointer.
 */
struct io_connect {
        struct file                        *file;
        struct sockaddr __user                *addr;
        int                                addr_len;
};

/*
 * Per-request data for the 'sr_msg' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_sr_msg {
        struct file                        *file;
        /*
         * One user pointer, interpreted as a compat msghdr, a native
         * msghdr, or a plain buffer.
         */
        union {
                struct compat_msghdr __user        *umsg_compat;
                struct user_msghdr __user        *umsg;
                void __user                        *buf;
        };
        int                                msg_flags;
        /* NOTE(review): presumably the buffer group id for buffer selection -- confirm */
        int                                bgid;
        size_t                                len;
        size_t                                done_io;
        struct io_buffer                *kbuf;
        void __user                        *msg_control;
};

/*
 * Per-request data for the 'open' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_open {
        struct file                        *file;
        int                                dfd;
        u32                                file_slot;
        struct filename                        *filename;
        struct open_how                        how;
        unsigned long                        nofile;
};

/*
 * Per-request data for the 'rsrc_update' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_rsrc_update {
        struct file                        *file;
        u64                                arg;
        u32                                nr_args;
        u32                                offset;
};

/*
 * Per-request data for the 'fadvise' member of the io_kiocb union.
 * 'file' must stay first. Note that 'len' is only 32 bits wide here.
 */
struct io_fadvise {
        struct file                        *file;
        u64                                offset;
        u32                                len;
        u32                                advice;
};

/*
 * Per-request data for the 'madvise' member of the io_kiocb union.
 * 'file' must stay first. Note that 'len' is only 32 bits wide here.
 */
struct io_madvise {
        struct file                        *file;
        u64                                addr;
        u32                                len;
        u32                                advice;
};

/*
 * Per-request data for the 'epoll' member of the io_kiocb union.
 * 'file' must stay first. The epoll_event is stored by value.
 */
struct io_epoll {
        struct file                        *file;
        int                                epfd;
        int                                op;
        int                                fd;
        struct epoll_event                event;
};

/*
 * Per-request data for the 'splice' member of the io_kiocb union.
 * The leading file pointer is named 'file_out' here. The input side is kept
 * as a descriptor number (splice_fd_in), not as a file pointer.
 */
struct io_splice {
        struct file                        *file_out;
        loff_t                                off_out;
        loff_t                                off_in;
        u64                                len;
        int                                splice_fd_in;
        unsigned int                        flags;
};

/*
 * Per-request data for the 'pbuf' member of the io_kiocb union.
 * 'file' must stay first.
 * NOTE(review): presumably describes nbufs buffers of len bytes starting at
 * addr, for group bgid, with ids starting at bid -- confirm.
 */
struct io_provide_buf {
        struct file                        *file;
        __u64                                addr;
        __u32                                len;
        __u32                                bgid;
        __u16                                nbufs;
        __u16                                bid;
};

/*
 * Per-request data for the 'statx' member of the io_kiocb union.
 * 'file' must stay first. filename and buffer are user-space pointers.
 */
struct io_statx {
        struct file                        *file;
        int                                dfd;
        unsigned int                        mask;
        unsigned int                        flags;
        const char __user                *filename;
        struct statx __user                *buffer;
};

/*
 * Per-request data for the 'shutdown' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_shutdown {
        struct file                        *file;
        int                                how;
};

/*
 * Per-request data for the 'rename' member of the io_kiocb union.
 * 'file' must stay first. Paths are held as struct filename pointers.
 */
struct io_rename {
        struct file                        *file;
        int                                old_dfd;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
        int                                flags;
};

/*
 * Per-request data for the 'unlink' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_unlink {
        struct file                        *file;
        int                                dfd;
        int                                flags;
        struct filename                        *filename;
};

/*
 * Per-request data for the 'mkdir' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_mkdir {
        struct file                        *file;
        int                                dfd;
        umode_t                                mode;
        struct filename                        *filename;
};

/*
 * Per-request data for the 'symlink' member of the io_kiocb union.
 * 'file' must stay first. Only the new path has a directory fd.
 */
struct io_symlink {
        struct file                        *file;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
};

/*
 * Per-request data for the 'hardlink' member of the io_kiocb union.
 * 'file' must stay first.
 */
struct io_hardlink {
        struct file                        *file;
        int                                old_dfd;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
        int                                flags;
};

/*
 * Completion data for the 'compl' member of the io_kiocb union. It overlays
 * the per-opcode data, so it may only be used once that data has been
 * cleaned (see the note next to 'compl' in struct io_kiocb).
 */
struct io_completion {
        struct file                        *file;
        u32                                cflags;
};

/*
 * Holds a socket address by value.
 * NOTE(review): presumably the async_data of a connect request, so the
 * address survives a deferred issue -- confirm.
 */
struct io_async_connect {
        struct sockaddr_storage                address;
};

/*
 * Message header state kept by value: an inline iovec array of UIO_FASTIOV
 * entries, an optional allocated iovec, the msghdr and an address buffer.
 * NOTE(review): presumably the async_data of sendmsg/recvmsg requests --
 * confirm.
 */
struct io_async_msghdr {
        struct iovec                        fast_iov[UIO_FASTIOV];
        /* points to an allocated iov, if NULL we use fast_iov instead */
        struct iovec                        *free_iov;
        struct sockaddr __user                *uaddr;
        struct msghdr                        msg;
        struct sockaddr_storage                addr;
};

/*
 * Async state for read/write requests; io_op_defs[] uses sizeof this struct
 * as the async_size of the read/write opcodes. Holds the iovec storage, the
 * iterator with a saved iterator state, and the byte count done so far.
 */
struct io_async_rw {
        struct iovec                        fast_iov[UIO_FASTIOV];
        const struct iovec                *free_iovec;
        struct iov_iter                        iter;
        struct iov_iter_state                iter_state;
        size_t                                bytes_done;
        struct wait_page_queue                wpq;
};

/*
 * Bit numbers of the REQ_F_* request flags. The first six take their values
 * from the corresponding uapi IOSQE_*_BIT constants; kernel-internal bits
 * start at 8 and count up from there.
 */
enum {
        REQ_F_FIXED_FILE_BIT        = IOSQE_FIXED_FILE_BIT,
        REQ_F_IO_DRAIN_BIT        = IOSQE_IO_DRAIN_BIT,
        REQ_F_LINK_BIT                = IOSQE_IO_LINK_BIT,
        REQ_F_HARDLINK_BIT        = IOSQE_IO_HARDLINK_BIT,
        REQ_F_FORCE_ASYNC_BIT        = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT        = IOSQE_BUFFER_SELECT_BIT,

        /* first byte is taken by user flags, shift it to not overlap */
        REQ_F_FAIL_BIT                = 8,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
        REQ_F_NOWAIT_BIT,
        REQ_F_LINK_TIMEOUT_BIT,
        REQ_F_NEED_CLEANUP_BIT,
        REQ_F_POLLED_BIT,
        REQ_F_BUFFER_SELECTED_BIT,
        REQ_F_COMPLETE_INLINE_BIT,
        REQ_F_REISSUE_BIT,
        REQ_F_CREDS_BIT,
        REQ_F_REFCOUNT_BIT,
        REQ_F_ARM_LTIMEOUT_BIT,
        REQ_F_PARTIAL_IO_BIT,
        /* keep async read/write and isreg together and in order */
        REQ_F_NOWAIT_READ_BIT,
        REQ_F_NOWAIT_WRITE_BIT,
        REQ_F_ISREG_BIT,

        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
};

/*
 * Request flag masks, one BIT() per REQ_F_*_BIT number. The enumerators are
 * not listed in bit order.
 * NOTE(review): presumably stored in io_kiocb.flags -- confirm.
 */
enum {
        /* ctx owns file */
        REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
        /* drain existing IO first */
        REQ_F_IO_DRAIN                = BIT(REQ_F_IO_DRAIN_BIT),
        /* linked sqes */
        REQ_F_LINK                = BIT(REQ_F_LINK_BIT),
        /* doesn't sever on completion < 0 */
        REQ_F_HARDLINK                = BIT(REQ_F_HARDLINK_BIT),
        /* IOSQE_ASYNC */
        REQ_F_FORCE_ASYNC        = BIT(REQ_F_FORCE_ASYNC_BIT),
        /* IOSQE_BUFFER_SELECT */
        REQ_F_BUFFER_SELECT        = BIT(REQ_F_BUFFER_SELECT_BIT),

        /* fail rest of links */
        REQ_F_FAIL                = BIT(REQ_F_FAIL_BIT),
        /* on inflight list, should be cancelled and waited on exit reliably */
        REQ_F_INFLIGHT                = BIT(REQ_F_INFLIGHT_BIT),
        /* read/write uses file position */
        REQ_F_CUR_POS                = BIT(REQ_F_CUR_POS_BIT),
        /* must not punt to workers */
        REQ_F_NOWAIT                = BIT(REQ_F_NOWAIT_BIT),
        /* has or had linked timeout */
        REQ_F_LINK_TIMEOUT        = BIT(REQ_F_LINK_TIMEOUT_BIT),
        /* needs cleanup */
        REQ_F_NEED_CLEANUP        = BIT(REQ_F_NEED_CLEANUP_BIT),
        /* already went through poll handler */
        REQ_F_POLLED                = BIT(REQ_F_POLLED_BIT),
        /* buffer already selected */
        REQ_F_BUFFER_SELECTED        = BIT(REQ_F_BUFFER_SELECTED_BIT),
        /*
         * completion is deferred through io_comp_state
         * NOTE(review): no struct of that name is defined in this part of
         * the file; the batch completion fields are in struct
         * io_submit_state -- the name above looks stale, confirm.
         */
        REQ_F_COMPLETE_INLINE        = BIT(REQ_F_COMPLETE_INLINE_BIT),
        /* caller should reissue async */
        REQ_F_REISSUE                = BIT(REQ_F_REISSUE_BIT),
        /* supports async reads */
        REQ_F_NOWAIT_READ        = BIT(REQ_F_NOWAIT_READ_BIT),
        /* supports async writes */
        REQ_F_NOWAIT_WRITE        = BIT(REQ_F_NOWAIT_WRITE_BIT),
        /* regular file */
        REQ_F_ISREG                = BIT(REQ_F_ISREG_BIT),
        /* has creds assigned */
        REQ_F_CREDS                = BIT(REQ_F_CREDS_BIT),
        /* skip refcounting if not set */
        REQ_F_REFCOUNT                = BIT(REQ_F_REFCOUNT_BIT),
        /* there is a linked timeout that has to be armed */
        REQ_F_ARM_LTIMEOUT        = BIT(REQ_F_ARM_LTIMEOUT_BIT),
        /* request has already done partial IO */
        REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
};

/*
 * Poll state attached to a request through io_kiocb.apoll: one embedded
 * io_poll_iocb plus a pointer to an optional second one.
 * NOTE(review): when double_poll is allocated and freed is not visible
 * here -- confirm.
 */
struct async_poll {
        struct io_poll_iocb        poll;
        struct io_poll_iocb        *double_poll;
};

/*
 * Pointer type of a request's task_work handler (see io_task_work.func).
 * NOTE(review): '*locked' presumably tracks whether the ring's uring_lock is
 * held across handlers -- confirm against the task_work runner.
 */
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

/*
 * task_work linkage embedded in every io_kiocb: a handler plus a list node.
 * The node is a union, so a request is on either the regular work list or
 * the fallback llist, never both at once.
 */
struct io_task_work {
        union {
                struct io_wq_work_node        node;
                struct llist_node        fallback_node;
        };
        io_req_tw_func_t                func;
};

/*
 * Resource type selectors: files are 0, buffers are 1.
 * NOTE(review): presumably used to dispatch register/update requests --
 * confirm.
 */
enum {
        IORING_RSRC_FILE                = 0,
        IORING_RSRC_BUFFER                = 1,
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 *
 * This is the per-request object: the leading union holds the
 * opcode-specific data, the remaining members are common to all requests.
 */
struct io_kiocb {
        union {
                struct file                *file;
                struct io_rw                rw;
                struct io_poll_iocb        poll;
                struct io_poll_update        poll_update;
                struct io_accept        accept;
                struct io_sync                sync;
                struct io_cancel        cancel;
                struct io_timeout        timeout;
                struct io_timeout_rem        timeout_rem;
                struct io_connect        connect;
                struct io_sr_msg        sr_msg;
                struct io_open                open;
                struct io_close                close;
                struct io_rsrc_update        rsrc_update;
                struct io_fadvise        fadvise;
                struct io_madvise        madvise;
                struct io_epoll                epoll;
                struct io_splice        splice;
                struct io_provide_buf        pbuf;
                struct io_statx                statx;
                struct io_shutdown        shutdown;
                struct io_rename        rename;
                struct io_unlink        unlink;
                struct io_mkdir                mkdir;
                struct io_symlink        symlink;
                struct io_hardlink        hardlink;
                /* use only after cleaning per-op data, see io_clean_op() */
                struct io_completion        compl;
        };

        /* opcode allocated if it needs to store data for async defer */
        void                                *async_data;
        u8                                opcode;
        /* polled IO has completed */
        u8                                iopoll_completed;

        u16                                buf_index;
        u32                                result;

        struct io_ring_ctx                *ctx;
        /* NOTE(review): presumably a mask of REQ_F_* flags -- confirm */
        unsigned int                        flags;
        atomic_t                        refs;
        struct task_struct                *task;
        u64                                user_data;

        struct io_kiocb                        *link;
        struct percpu_ref                *fixed_rsrc_refs;

        /* used with ctx->iopoll_list with reads/writes */
        struct list_head                inflight_entry;
        struct io_task_work                io_task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node                hash_node;
        struct async_poll                *apoll;
        struct io_wq_work                work;
        const struct cred                *creds;

        /* store used ubuf, so we can prevent reloading */
        struct io_mapped_ubuf                *imu;
        /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
        struct io_buffer                *kbuf;
        atomic_t                        poll_refs;
};

/*
 * Associates one task with one ring.
 * NOTE(review): ctx_node is presumably the linkage on io_ring_ctx.tctx_list
 * -- confirm.
 */
struct io_tctx_node {
        struct list_head        ctx_node;
        struct task_struct        *task;
        struct io_ring_ctx        *ctx;
};

/*
 * A deferred request together with a 32-bit sequence number.
 * NOTE(review): presumably queued on io_ring_ctx.defer_list for drain
 * handling -- confirm.
 */
struct io_defer_entry {
        struct list_head        list;
        struct io_kiocb                *req;
        u32                        seq;
};

/*
 * Static per-opcode properties; io_op_defs[] holds one entry per IORING_OP_*
 * value. All properties are single-bit fields except async_size.
 */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short                async_size;
};

/*
 * Per-opcode property table, indexed by IORING_OP_* value (designated
 * initializers). An entry with an empty initializer has every flag clear
 * and an async_size of zero.
 */
static const struct io_op_def io_op_defs[] = {
        [IORING_OP_NOP] = {},
        [IORING_OP_READV] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .needs_async_setup        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITEV] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_FSYNC] = {
                .needs_file                = 1,
        },
        [IORING_OP_READ_FIXED] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_POLL_REMOVE] = {},
        [IORING_OP_SYNC_FILE_RANGE] = {
                .needs_file                = 1,
        },
        [IORING_OP_SENDMSG] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_RECVMSG] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_TIMEOUT] = {
                .async_size                = sizeof(struct io_timeout_data),
        },
        [IORING_OP_TIMEOUT_REMOVE] = {
                /* used by timeout updates' prep() */
        },
        [IORING_OP_ACCEPT] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
        },
        [IORING_OP_ASYNC_CANCEL] = {},
        [IORING_OP_LINK_TIMEOUT] = {
                .async_size                = sizeof(struct io_timeout_data),
        },
        [IORING_OP_CONNECT] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_connect),
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file                = 1,
        },
        [IORING_OP_OPENAT] = {},
        [IORING_OP_CLOSE] = {},
        [IORING_OP_FILES_UPDATE] = {},
        [IORING_OP_STATX] = {},
        [IORING_OP_READ] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_FADVISE] = {
                .needs_file                = 1,
        },
        [IORING_OP_MADVISE] = {},
        [IORING_OP_SEND] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
        },
        [IORING_OP_RECV] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
        },
        [IORING_OP_OPENAT2] = {
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_SPLICE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_PROVIDE_BUFFERS] = {},
        [IORING_OP_REMOVE_BUFFERS] = {},
        [IORING_OP_TEE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_SHUTDOWN] = {
                .needs_file                = 1,
        },
        [IORING_OP_RENAMEAT] = {},
        [IORING_OP_UNLINKAT] = {},
};

/*
 * Requests with any of these flags set must go through io_disarm_next()
 * when they complete (see io_req_complete_post()).
 */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/*
 * Forward declarations for helpers that are referenced before their
 * definitions appear later in the file.
 */

/* link disarming, task-context teardown and cancellation */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

/* completion posting */
static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

/* request lifetime, queueing and resource handling */
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
                                struct io_kiocb *req, int fd, bool fixed,
                                unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

/* fixed (registered) file slot helpers */
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

/* hrtimer callback for linked timeouts */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

/* slab cache pointer for requests (set up elsewhere in the file) */
static struct kmem_cache *req_cachep;

/* tentative definition; the initialised definition is not in this part of the file */
static const struct file_operations io_uring_fops;

/*
 * Take ctx->uring_lock unless *locked reports that it is already held.
 * *locked is set to true once the mutex has been acquired here.
 */
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
        if (*locked)
                return;

        mutex_lock(&ctx->uring_lock);
        *locked = true;
}

/*
 * Walk a chain of linked requests: @pos starts at @head and follows
 * ->link until it becomes NULL. @pos must be an assignable lvalue.
 */
#define io_for_each_link(pos, head) \
        for (pos = (head); pos; pos = pos->link)

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 *
 * Evaluates to true when the request's refcount, viewed as a signed value,
 * lies in [-127, 0]: adding 127 to the unsigned reading maps exactly that
 * range onto [0, 127]. The macro argument is parenthesized so that any
 * pointer expression (not just a plain identifier) expands correctly.
 */
#define req_ref_zero_or_close_to_overflow(req)        \
        ((unsigned int) atomic_read(&(req)->refs) + 127u <= 127u)

/*
 * Take a reference on @req unless its refcount is already zero.
 * Warns once if the request has not been switched to refcounting
 * (REQ_F_REFCOUNT clear). Returns the result of atomic_inc_not_zero().
 */
static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
        return atomic_inc_not_zero(&req->refs);
}

/*
 * Drop one reference on @req and report whether it was the last one.
 * A request without REQ_F_REFCOUNT is not counted at all, so the put
 * is always reported as final.
 */
static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_REFCOUNT)) {
                WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
                return atomic_dec_and_test(&req->refs);
        }

        return true;
}

/*
 * Unconditionally take an extra reference on @req. Warns once if the
 * request is not refcounted (REQ_F_REFCOUNT clear) or if its count is zero
 * or suspiciously close to wrapping.
 */
static inline void req_ref_get(struct io_kiocb *req)
{
        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        atomic_inc(&req->refs);
}

/*
 * Switch @req to explicit reference counting with @nr initial references.
 * A request that already has REQ_F_REFCOUNT set is left untouched.
 */
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
        if (req->flags & REQ_F_REFCOUNT)
                return;

        req->flags |= REQ_F_REFCOUNT;
        atomic_set(&req->refs, nr);
}

/*
 * Enable reference counting on @req with a single initial reference;
 * does nothing if refcounting is already enabled.
 */
static inline void io_req_set_refcount(struct io_kiocb *req)
{
        __io_req_set_refcount(req, 1);
}

/*
 * Pin the ring's current rsrc node for @req: remember its percpu ref in
 * req->fixed_rsrc_refs and take a reference on it. Does nothing if the
 * request already holds one.
 */
static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
        struct percpu_ref *node_refs;

        if (req->fixed_rsrc_refs)
                return;

        node_refs = &req->ctx->rsrc_node->refs;
        req->fixed_rsrc_refs = node_refs;
        percpu_ref_get(node_refs);
}

/*
 * Bring a percpu ref back to life. If a temporary reference can still be
 * taken, the ref is resurrected and the temporary reference dropped again;
 * otherwise the count already hit zero and we first wait on @compl for
 * ->release() before resurrecting.
 */
static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
        if (percpu_ref_tryget(ref)) {
                percpu_ref_resurrect(ref);
                percpu_ref_put(ref);
                return;
        }

        /* already at zero, wait for ->release() */
        wait_for_completion(compl);
        percpu_ref_resurrect(ref);
}

/*
 * Decide whether the link chain starting at @head matches a cancellation
 * request: a non-NULL @task must own @head; with @cancel_all everything
 * owned matches, otherwise at least one request in the chain must carry
 * REQ_F_INFLIGHT.
 */
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
                          bool cancel_all)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_kiocb *cur;

        if (task && head->task != task)
                return false;
        if (cancel_all)
                return true;

        for (cur = head; cur; cur = cur->link)
                if (cur->flags & REQ_F_INFLIGHT)
                        return true;

        return false;
}

static bool io_match_linked(struct io_kiocb *head)
{
        struct io_kiocb *req;

        io_for_each_link(req, head) {
                if (req->flags & REQ_F_INFLIGHT)
                        return true;
        }
        return false;
}

/*
 * Same decision as io_match_task(), but safe against linked timeouts
 * racing with the chain walk: when @head has REQ_F_LINK_TIMEOUT set the
 * walk is done under ctx->timeout_lock.
 * The caller must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                               bool cancel_all)
{
        struct io_ring_ctx *ctx;
        bool matched;

        if (task && head->task != task)
                return false;
        if (cancel_all)
                return true;

        if (!(head->flags & REQ_F_LINK_TIMEOUT))
                return io_match_linked(head);

        /* protect against races with linked timeouts */
        ctx = head->ctx;
        spin_lock_irq(&ctx->timeout_lock);
        matched = io_match_linked(head);
        spin_unlock_irq(&ctx->timeout_lock);

        return matched;
}

/* Mark @req as failed by setting REQ_F_FAIL in its flags. */
static inline void req_set_fail(struct io_kiocb *req)
{
        req->flags |= REQ_F_FAIL;
}

/* Record @res as the result of @req and flag the request as failed. */
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
        req->result = res;
        req_set_fail(req);
}

/*
 * Release callback for ctx->refs (registered via percpu_ref_init() in
 * io_ring_ctx_alloc()): signals ctx->ref_comp so waiters can proceed.
 */
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

        complete(&ctx->ref_comp);
}

/* True when the timeout request has a zero ->timeout.off. */
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
        return req->timeout.off == 0;
}

/*
 * Delayed-work handler for ctx->fallback_work: detaches everything queued
 * on ctx->fallback_llist and runs each request's task-work callback. A ctx
 * reference is held for the duration. If a callback took uring_lock
 * (reported through 'locked'), pending completions are flushed and the
 * lock is released before returning.
 */
static void io_fallback_req_func(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                                fallback_work.work);
        struct llist_node *pending;
        struct io_kiocb *req, *next;
        bool locked = false;

        pending = llist_del_all(&ctx->fallback_llist);

        percpu_ref_get(&ctx->refs);
        llist_for_each_entry_safe(req, next, pending, io_task_work.fallback_node)
                req->io_task_work.func(req, &locked);

        if (locked && ctx->submit_state.compl_nr)
                io_submit_flush_completions(ctx);
        if (locked)
                mutex_unlock(&ctx->uring_lock);
        percpu_ref_put(&ctx->refs);
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
        struct io_ring_ctx *ctx;
        int hash_bits;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return NULL;

        /*
         * Use 5 bits less than the max cq entries, that should give us around
         * 32 entries per hash list if totally full and uniformly spread.
         */
        hash_bits = ilog2(p->cq_entries);
        hash_bits -= 5;
        if (hash_bits <= 0)
                hash_bits = 1;
        ctx->cancel_hash_bits = hash_bits;
        ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
                                        GFP_KERNEL);
        if (!ctx->cancel_hash)
                goto err;
        __hash_init(ctx->cancel_hash, 1U << hash_bits);

        ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
        if (!ctx->dummy_ubuf)
                goto err;
        /* set invalid range, so io_import_fixed() fails meeting it */
        ctx->dummy_ubuf->ubuf = -1UL;

        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
                goto err;

        ctx->flags = p->flags;
        init_waitqueue_head(&ctx->sqo_sq_wait);
        INIT_LIST_HEAD(&ctx->sqd_list);
        init_waitqueue_head(&ctx->poll_wait);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->ref_comp);
        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
        xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
        mutex_init(&ctx->uring_lock);
        init_waitqueue_head(&ctx->cq_wait);
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
        INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
        spin_lock_init(&ctx->rsrc_ref_lock);
        INIT_LIST_HEAD(&ctx->rsrc_ref_list);
        INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
        init_llist_head(&ctx->rsrc_put_llist);
        INIT_LIST_HEAD(&ctx->tctx_list);
        INIT_LIST_HEAD(&ctx->submit_state.free_list);
        INIT_LIST_HEAD(&ctx->locked_free_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
        return ctx;
err:
        kfree(ctx->dummy_ubuf);
        kfree(ctx->cancel_hash);
        kfree(ctx);
        return NULL;
}

/*
 * Account one completion that could not be delivered: bump the
 * cq_overflow counter in the rings structure and decrement ctx->cq_extra.
 */
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
        struct io_rings *r = ctx->rings;

        /* READ_ONCE/WRITE_ONCE pair: a single load and a single store */
        WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
        ctx->cq_extra--;
}

/*
 * Tell whether a request must still be deferred. Only requests flagged
 * REQ_F_IO_DRAIN can be; such a request stays deferred while @seq plus
 * ctx->cq_extra differs from ctx->cached_cq_tail.
 */
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
        struct io_ring_ctx *ctx;

        if (likely(!(req->flags & REQ_F_IO_DRAIN)))
                return false;

        ctx = req->ctx;
        return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}

/*
 * FFS_* flag bits and the mask that strips them. FFS_ISREG is a real bit
 * only on 64-bit builds; elsewhere it is 0 and therefore never set.
 * NOTE(review): presumably these are packed into the low bits of a fixed
 * file pointer -- confirm against the users of FFS_MASK.
 */
#define FFS_ASYNC_READ                0x1UL
#define FFS_ASYNC_WRITE                0x2UL
#ifdef CONFIG_64BIT
#define FFS_ISREG                0x4UL
#else
#define FFS_ISREG                0x0UL
#endif
/* every bit that is not one of the FFS_* flags above */
#define FFS_MASK                ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)

/*
 * True only on 64-bit builds and only when @req uses a fixed file
 * (REQ_F_FIXED_FILE set); always false on 32-bit.
 */
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
        return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
}

/*
 * Mark @req as inflight-tracked (REQ_F_INFLIGHT) and count it in the owning
 * task's inflight_tracked counter. A request is only counted once.
 */
static void io_req_track_inflight(struct io_kiocb *req)
{
        if (req->flags & REQ_F_INFLIGHT)
                return;

        req->flags |= REQ_F_INFLIGHT;
        atomic_inc(&req->task->io_uring->inflight_tracked);
}

/*
 * Slow path of io_prep_linked_timeout(): convert the "arm linked timeout"
 * request flag into REQ_F_LINK_TIMEOUT, make both @req and its link
 * refcounted, and return the linked timeout request. Warns once and
 * returns NULL if @req has no link.
 */
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
        struct io_kiocb *link = req->link;

        if (WARN_ON_ONCE(!link))
                return NULL;

        req->flags = (req->flags & ~REQ_F_ARM_LTIMEOUT) | REQ_F_LINK_TIMEOUT;

        /* linked timeouts should have two refs once prep'ed */
        io_req_set_refcount(req);
        __io_req_set_refcount(link, 2);
        return link;
}

/*
 * Return the prepared linked timeout of @req when REQ_F_ARM_LTIMEOUT is
 * set, or NULL (the common case) when there is nothing to arm.
 */
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
                return __io_prep_linked_timeout(req);

        return NULL;
}

static void io_prep_async_work(struct io_kiocb *req)
{
        const struct io_op_def *def = &io_op_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;

        if (!(req->flags & REQ_F_CREDS)) {
                req->flags |= REQ_F_CREDS;
                req->creds = get_current_cred();
        }

        req->work.list.next = NULL;
        req->work.flags = 0;
        if (req->flags & REQ_F_FORCE_ASYNC)
                req->work.flags |= IO_WQ_WORK_CONCURRENT;

        if (req->flags & REQ_F_ISREG) {
                if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
                        io_wq_hash_work(&req->work, file_inode(req->file));
        } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
}

static void io_prep_async_link(struct io_kiocb *req)
{
        struct io_kiocb *cur;

        if (req->flags & REQ_F_LINK_TIMEOUT) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock_irq(&ctx->timeout_lock);
                io_for_each_link(cur, req)
                        io_prep_async_work(cur);
                spin_unlock_irq(&ctx->timeout_lock);
        } else {
                io_for_each_link(cur, req)
                        io_prep_async_work(cur);
        }
}

/*
 * Punt @req (with its whole link chain) to the owning task's io-wq.
 *
 * The linked timeout, if one needs arming, is prepared first and queued
 * only after the work item has been enqueued. @locked is never
 * dereferenced here: it is overwritten with NULL right away.
 */
static void io_queue_async_work(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        /* must not take the lock, NULL it as a precaution */
        locked = NULL;

        /* a request cannot be punted without a task context and its io-wq */
        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);

        /*
         * Not expected to happen, but if we do have a bug where this _can_
         * happen, catch it here and ensure the request is marked as
         * canceled. That will make io-wq go through the usual work cancel
         * procedure rather than attempt to run this request (or create a new
         * worker for it).
         */
        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
                req->work.flags |= IO_WQ_WORK_CANCEL;

        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
        io_wq_enqueue(tctx->io_wq, &req->work);
        /* arm the linked timeout only once the work has been handed over */
        if (link)
                io_queue_linked_timeout(link);
}

/*
 * Complete a timeout request with @status, provided its hrtimer could be
 * stopped. Nothing is done when hrtimer_try_to_cancel() returns -1 (per
 * the hrtimer API, the timer callback is currently executing).
 *
 * On success the request is marked failed for a non-zero @status,
 * ctx->cq_timeouts is bumped, the request is unlinked from the timeout
 * list, a CQE is filled and the request reference is dropped via the
 * deferred put.
 */
static void io_kill_timeout(struct io_kiocb *req, int status)
        __must_hold(&req->ctx->completion_lock)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_timeout_data *io = req->async_data;

        if (hrtimer_try_to_cancel(&io->timer) != -1) {
                if (status)
                        req_set_fail(req);
                /* read + set rather than an atomic increment; callers hold the locks annotated above */
                atomic_set(&req->ctx->cq_timeouts,
                        atomic_read(&req->ctx->cq_timeouts) + 1);
                list_del_init(&req->timeout.list);
                io_fill_cqe_req(req, status, 0);
                io_put_req_deferred(req);
        }
}

/*
 * Release deferred requests from the head of ctx->defer_list, in order,
 * stopping at the first entry that req_need_defer() says must keep
 * waiting. Each released request is queued via io_req_task_queue() and its
 * list entry freed. Must be called with ctx->completion_lock held.
 */
static void io_queue_deferred(struct io_ring_ctx *ctx)
{
        lockdep_assert_held(&ctx->completion_lock);

        while (!list_empty(&ctx->defer_list)) {
                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
                                                struct io_defer_entry, list);

                /* the list is processed strictly in order: stop at the first blocker */
                if (req_need_defer(de->req, de->seq))
                        break;
                list_del_init(&de->list);
                io_req_task_queue(de->req);
                kfree(de);
        }
}

/*
 * Complete every sequence-based timeout on ctx->timeout_list whose target
 * sequence has been reached. The walk stops at the first timeout without a
 * sequence (io_is_timeout_noseq()) or the first one that still needs more
 * events. ctx->cq_last_tm_flush is updated to the sequence used for this
 * flush. Runs under ctx->timeout_lock, taken here.
 */
static void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
{
        /* current sequence: cached CQ tail minus the cq_timeouts count */
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
        struct io_kiocb *req, *tmp;

        spin_lock_irq(&ctx->timeout_lock);
        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
                u32 events_needed, events_got;

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * Since seq can easily wrap around over time, subtract
                 * the last seq at which timeouts were flushed before comparing.
                 * Assuming not more than 2^31-1 events have happened since,
                 * these subtractions won't have wrapped, so we can check if
                 * target is in [last_seq, current_seq] by comparing the two.
                 */
                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                events_got = seq - ctx->cq_last_tm_flush;
                if (events_got < events_needed)
                        break;

                io_kill_timeout(req, 0);
        }
        ctx->cq_last_tm_flush = seq;
        spin_unlock_irq(&ctx->timeout_lock);
}

/*
 * Slow path of io_commit_cqring(): flush sequence-based timeouts when
 * ctx->off_timeout_used is set and release deferred requests when
 * ctx->drain_active is set.
 */
static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (ctx->off_timeout_used)
                io_flush_timeouts(ctx);
        if (ctx->drain_active)
                io_queue_deferred(ctx);
}

static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
{
        return ctx->off_timeout_used || ctx->drain_active;
}

/*
 * Publish the cached CQ tail to the rings structure. The release store
 * orders the preceding CQE writes before the tail update.
 */
static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
{
        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

/*
 * Commit the CQ ring: run the timeout/drain flush slow path if needed
 * (unlikely), then publish the new tail.
 */
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
        if (unlikely(io_commit_needs_flush(ctx)))
                __io_commit_cqring_flush(ctx);
        __io_commit_cqring(ctx);
}

/*
 * Return true when the submission queue is full, i.e. the distance
 * between the ring's sq.tail and sq.head equals ctx->sq_entries.
 */
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
        struct io_rings *r = ctx->rings;

        /*
         * SQPOLL must use the actual sqring head, as using the cached_sq_head
         * is race prone if the SQPOLL thread has grabbed entries but not yet
         * committed them to the ring. For !SQPOLL, this doesn't matter, but
         * since this helper is just used for SQPOLL sqring waits (or POLLOUT),
         * just read the actual sqring head unconditionally.
         */
        return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
}

/*
 * Number of CQ entries between the ring's cq.head and the kernel's cached
 * tail (unsigned arithmetic, so index wraparound is handled).
 */
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
        return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

/*
 * Reserve the next CQ entry. Returns NULL when the CQ ring is full;
 * otherwise advances ctx->cached_cq_tail and returns the slot the old tail
 * maps to (the tail is masked with cq_entries - 1).
 */
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
        unsigned int idx;

        /*
         * writes to the cq entry need to come after reading head; the
         * control dependency is enough as we're using WRITE_ONCE to
         * fill the cq entry
         */
        if (__io_cqring_events(ctx) == ctx->cq_entries)
                return NULL;

        idx = ctx->cached_cq_tail++ & (ctx->cq_entries - 1);
        return &ctx->rings->cqes[idx];
}

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
        if (likely(!ctx->cq_ev_fd))
                return false;
        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
                return false;
        return !ctx->eventfd_async || io_wq_current_is_worker();
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on a
 * 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 *
 * Wakes, in order: waiters on ctx->cq_wait, the sq_data waitqueue when
 * ctx->sq_data is set, the eventfd (if io_should_trigger_evfd()), and
 * waiters on ctx->poll_wait.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        /*
         * wake_up_all() may seem excessive, but io_wake_function() and
         * io_should_wake() handle the termination of the loop and only
         * wake as many waiters as we need to.
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
                                poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
        if (waitqueue_active(&ctx->poll_wait))
                __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
                                poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

/*
 * Notification variant used for IOPOLL completions. After a full memory
 * barrier it wakes ctx->cq_wait only on SQPOLL rings, then signals the
 * eventfd (if io_should_trigger_evfd()) and wakes ctx->poll_wait.
 */
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
        /* see waitqueue_active() comment */
        smp_mb();

        if ((ctx->flags & IORING_SETUP_SQPOLL) &&
            waitqueue_active(&ctx->cq_wait))
                __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
                          poll_to_key(EPOLL_URING_WAKE | EPOLLIN));

        if (io_should_trigger_evfd(ctx))
                eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);

        if (waitqueue_active(&ctx->poll_wait))
                __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
                                poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * Without @force the flush gives up immediately if the ring is already
 * full and stops as soon as no CQ slot is available. With @force, entries
 * that do not fit are dropped and accounted as overflow so that the list
 * always ends up empty.
 *
 * Returns true if there are no backlogged entries after the flush.
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
        bool all_flushed, posted;

        if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
                return false;

        posted = false;
        spin_lock(&ctx->completion_lock);
        while (!list_empty(&ctx->cq_overflow_list)) {
                struct io_uring_cqe *cqe = io_get_cqe(ctx);
                struct io_overflow_cqe *ocqe;

                if (!cqe && !force)
                        break;
                ocqe = list_first_entry(&ctx->cq_overflow_list,
                                        struct io_overflow_cqe, list);
                /* copy into the ring if there is room, otherwise drop and account */
                if (cqe)
                        memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
                else
                        io_account_cq_overflow(ctx);

                posted = true;
                list_del(&ocqe->list);
                kfree(ocqe);
        }

        all_flushed = list_empty(&ctx->cq_overflow_list);
        if (all_flushed) {
                /* backlog drained: clear the overflow indications */
                clear_bit(0, &ctx->check_cq_overflow);
                WRITE_ONCE(ctx->rings->sq_flags,
                           ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
        }

        if (posted)
                io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        /* wake waiters only after completion_lock has been dropped */
        if (posted)
                io_cqring_ev_posted(ctx);
        return all_flushed;
}

/*
 * Flush the CQ overflow backlog if the check_cq_overflow bit says there is
 * one. Returns true when nothing is backlogged afterwards (including the
 * case where there was nothing to flush).
 */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
        bool iopoll, flushed;

        if (!test_bit(0, &ctx->check_cq_overflow))
                return true;

        /* iopoll syncs against uring_lock, not completion_lock */
        iopoll = ctx->flags & IORING_SETUP_IOPOLL;
        if (iopoll)
                mutex_lock(&ctx->uring_lock);
        flushed = __io_cqring_overflow_flush(ctx, false);
        if (iopoll)
                mutex_unlock(&ctx->uring_lock);

        return flushed;
}

/*
 * Give back @nr task references. Must be called somewhat shortly after
 * putting a request.
 *
 * When called from the owning task the references simply go back into the
 * task's cached_refs pool. Otherwise they are removed from the inflight
 * counter, a task waiting in idle state is woken, and the task_struct
 * references are dropped.
 */
static inline void io_put_task(struct task_struct *task, int nr)
{
        struct io_uring_task *tctx = task->io_uring;

        if (likely(task == current)) {
                tctx->cached_refs += nr;
                return;
        }

        percpu_counter_sub(&tctx->inflight, nr);
        if (unlikely(atomic_read(&tctx->in_idle)))
                wake_up(&tctx->wait);
        put_task_struct_many(task, nr);
}

/*
 * Top tctx->cached_refs back up to IO_TCTX_REFS_CACHE_NR. The shortfall
 * is added to the inflight counter and to current's usage refcount before
 * being credited to the cache.
 */
static void io_task_refs_refill(struct io_uring_task *tctx)
{
        /* amount needed to bring cached_refs to IO_TCTX_REFS_CACHE_NR */
        unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

        percpu_counter_add(&tctx->inflight, refill);
        refcount_add(refill, &current->usage);
        tctx->cached_refs += refill;
}

/*
 * Consume @nr references from the current task's cached pool, refilling
 * the pool when it goes negative.
 */
static inline void io_get_task_refs(int nr)
{
        struct io_uring_task *tctx = current->io_uring;

        tctx->cached_refs -= nr;
        if (unlikely(tctx->cached_refs < 0))
                io_task_refs_refill(tctx);
}

/*
 * Return all cached task references of @task: the cache is emptied, and
 * the same amount is removed from the inflight counter and dropped from
 * the task_struct refcount. Nothing is done when the cache is empty.
 */
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
        struct io_uring_task *tctx = task->io_uring;
        unsigned int refs = tctx->cached_refs;

        if (!refs)
                return;

        tctx->cached_refs = 0;
        percpu_counter_sub(&tctx->inflight, refs);
        put_task_struct_many(task, refs);
}

/*
 * Queue a completion that did not fit into the CQ ring on
 * ctx->cq_overflow_list.
 *
 * Returns true when the completion was stashed for a later flush, false
 * when the overflow entry could not be allocated; in that case the
 * completion is dropped and only accounted via io_account_cq_overflow().
 */
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
                                     s32 res, u32 cflags)
{
        struct io_overflow_cqe *ocqe;

        /* GFP_ATOMIC: the allocation must not sleep */
        ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
        if (!ocqe) {
                /*
                 * If we're in ring overflow flush mode, or in task cancel mode,
                 * or cannot allocate an overflow entry, then we need to drop it
                 * on the floor.
                 */
                io_account_cq_overflow(ctx);
                return false;
        }
        /* first backlogged entry: raise the overflow indications */
        if (list_empty(&ctx->cq_overflow_list)) {
                set_bit(0, &ctx->check_cq_overflow);
                WRITE_ONCE(ctx->rings->sq_flags,
                           ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);

        }
        ocqe->cqe.user_data = user_data;
        ocqe->cqe.res = res;
        ocqe->cqe.flags = cflags;
        list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
        return true;
}

/*
 * Post one completion. The CQE is written straight into the ring when a
 * slot is available (returns true); otherwise it is handed to the overflow
 * path, whose return value is passed through.
 */
static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
                                 s32 res, u32 cflags)
{
        struct io_uring_cqe *cqe;

        trace_io_uring_complete(ctx, user_data, res, cflags);

        /*
         * If we can't get a cq entry, userspace overflowed the
         * submission (by quite a lot). Increment the overflow count in
         * the ring.
         */
        cqe = io_get_cqe(ctx);
        if (unlikely(!cqe))
                return io_cqring_event_overflow(ctx, user_data, res, cflags);

        WRITE_ONCE(cqe->user_data, user_data);
        WRITE_ONCE(cqe->res, res);
        WRITE_ONCE(cqe->flags, cflags);
        return true;
}

/*
 * Post a completion for @req using its ring and user_data. The result of
 * __io_fill_cqe() is intentionally not propagated.
 */
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
        __io_fill_cqe(req->ctx, req->user_data, res, cflags);
}

/*
 * Post a completion given explicit @user_data rather than a request.
 * ctx->cq_extra is incremented first; req_need_defer() adds cq_extra to
 * the sequence it compares against the cached CQ tail.
 * Returns the result of __io_fill_cqe().
 */
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
                                     s32 res, u32 cflags)
{
        ctx->cq_extra++;
        return __io_fill_cqe(ctx, user_data, res, cflags);
}

/*
 * Post the completion for @req and drop one reference to it, all under
 * ctx->completion_lock.
 *
 * @req:    request being completed
 * @res:    result stored in the CQE
 * @cflags: flags stored in the CQE
 *
 * If the dropped reference was the last one, the request is torn down right
 * here and parked on ctx->locked_free_list; a request still linked behind it
 * is queued through task work. The CQ ring is committed before the lock is
 * released.
 */
static void io_req_complete_post(struct io_kiocb *req, s32 res,
                                 u32 cflags)
{
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock(&ctx->completion_lock);
        __io_fill_cqe(ctx, req->user_data, res, cflags);
        /*
         * If we're the last reference to this request, add to our locked
         * free_list cache.
         */
        if (req_ref_put_and_test(req)) {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        /* deal with linked timeouts / a failed chain first */
                        if (req->flags & IO_DISARM_MASK)
                                io_disarm_next(req);
                        /* whatever is still linked gets queued via task work */
                        if (req->link) {
                                io_req_task_queue(req->link);
                                req->link = NULL;
                        }
                }
                io_dismantle_req(req);
                io_put_task(req->task, 1);
                list_add(&req->inflight_entry, &ctx->locked_free_list);
                ctx->locked_free_nr++;
        } else {
                /*
                 * Other references remain. Try to take a temporary ctx
                 * reference for the notification below; when that fails the
                 * local pointer is cleared so the notification is skipped.
                 */
                if (!percpu_ref_tryget(&ctx->refs))
                        req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);

        if (req) {
                /*
                 * NOTE(review): in the last-reference case this put
                 * presumably releases the ctx reference the request itself
                 * was holding; in the other case it pairs with the tryget
                 * above -- confirm.
                 */
                io_cqring_ev_posted(ctx);
                percpu_ref_put(&ctx->refs);
        }
}

/* True when @req carries any of the IO_REQ_CLEAN_FLAGS bits. */
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
        return (req->flags & IO_REQ_CLEAN_FLAGS) != 0;
}

/*
 * Record a completion on the request itself instead of posting it: run
 * io_clean_op() if the request needs cleaning, stash @res and @cflags in
 * the request and mark it REQ_F_COMPLETE_INLINE.
 */
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
                                         u32 cflags)
{
        const bool needs_clean = io_req_needs_clean(req);

        if (needs_clean)
                io_clean_op(req);

        req->result = res;
        req->compl.cflags = cflags;
        req->flags |= REQ_F_COMPLETE_INLINE;
}

/*
 * Complete @req either by stashing the result on the request (when
 * IO_URING_F_COMPLETE_DEFER is set in @issue_flags) or by posting the CQE
 * immediately.
 */
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
                                     s32 res, u32 cflags)
{
        if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
                io_req_complete_state(req, res, cflags);
                return;
        }

        io_req_complete_post(req, res, cflags);
}

/* Complete @req with @res, passing zero issue flags and zero CQE flags. */
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
        const unsigned issue_flags = 0;
        const u32 cflags = 0;

        __io_req_complete(req, issue_flags, res, cflags);
}

/*
 * Mark @req as failed via req_set_fail() and post its completion with
 * result @res and no CQE flags.
 */
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
        const u32 cflags = 0;

        req_set_fail(req);
        io_req_complete_post(req, res, cflags);
}

/*
 * Fail a request using the result already stored in req->result, after
 * rewriting its link flags.
 */
static void io_req_complete_fail_submit(struct io_kiocb *req)
{
        /*
         * We don't submit, fail them all, for that replace hardlinks with
         * normal links. Extra REQ_F_LINK is tolerated.
         */
        req->flags = (req->flags & ~REQ_F_HARDLINK) | REQ_F_LINK;

        io_req_complete_failed(req, req->result);
}

/*
 * One-time setup for a freshly allocated request. The fields set here are
 * not re-initialised on every allocation; they are prepared in advance and
 * are expected to stay valid while the request cycles through the caches.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
        /* not necessary, but safer to zero */
        req->result = 0;

        req->async_data = NULL;
        req->link = NULL;
        req->ctx = ctx;
}

/*
 * Move every request parked on ctx->locked_free_list onto @state's
 * free_list and reset ctx->locked_free_nr, all under completion_lock.
 */
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
                                        struct io_submit_state *state)
{
        struct list_head *from = &ctx->locked_free_list;
        struct list_head *to = &state->free_list;

        spin_lock(&ctx->completion_lock);
        list_splice_init(from, to);
        ctx->locked_free_nr = 0;
        spin_unlock(&ctx->completion_lock);
}

/*
 * Refill the submit-state request array from the free lists.
 *
 * Returns true IFF there are requests in the cache afterwards.
 */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;
        struct io_kiocb *cached;
        int filled;

        /*
         * If we have more than a batch's worth of requests in our IRQ side
         * locked cache, grab the lock and move them over to our submission
         * side cache.
         */
        if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
                io_flush_cached_locked_reqs(ctx, state);

        filled = state->free_reqs;
        for (;;) {
                if (list_empty(&state->free_list))
                        break;

                cached = list_first_entry(&state->free_list, struct io_kiocb,
                                          inflight_entry);
                list_del(&cached->inflight_entry);
                state->reqs[filled++] = cached;

                /* stop as soon as the array is full */
                if (filled == ARRAY_SIZE(state->reqs))
                        break;
        }

        state->free_reqs = filled;
        return filled != 0;
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 *
 * Returns a request taken from the submit-state cache, refilling the cache
 * from the free lists or from the slab when it is empty, or NULL when no
 * memory could be allocated.
 */
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state = &ctx->submit_state;

        BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);

        /* slow path: nothing cached and nothing recyclable */
        if (unlikely(!state->free_reqs && !io_flush_cached_reqs(ctx))) {
                const gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
                int got, idx;

                got = kmem_cache_alloc_bulk(req_cachep, gfp,
                                            IO_REQ_ALLOC_BATCH, state->reqs);

                /*
                 * Bulk alloc is all-or-nothing. If we fail to get a batch,
                 * retry single alloc to be on the safe side.
                 */
                if (unlikely(got <= 0)) {
                        state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
                        if (!state->reqs[0])
                                return NULL;
                        got = 1;
                }

                for (idx = 0; idx < got; idx++)
                        io_preinit_req(state->reqs[idx], ctx);
                state->free_reqs = got;
        }

        /* hand out the entry at the top of the array */
        state->free_reqs--;
        return state->reqs[state->free_reqs];
}

/* Drop a file reference with fput(); a NULL @file is ignored. */
static inline void io_put_file(struct file *file)
{
        if (!file)
                return;

        fput(file);
}

/*
 * Release what @req holds: opcode cleanup when needed, the file reference
 * unless the file is a fixed one, the fixed-resource reference and any
 * async data.
 */
static void io_dismantle_req(struct io_kiocb *req)
{
        /* flags are sampled before any cleanup runs */
        unsigned int snapshot = req->flags;

        if (io_req_needs_clean(req))
                io_clean_op(req);

        if ((snapshot & REQ_F_FIXED_FILE) == 0)
                io_put_file(req->file);

        if (req->fixed_rsrc_refs)
                percpu_ref_put(req->fixed_rsrc_refs);

        if (req->async_data) {
                kfree(req->async_data);
                req->async_data = NULL;
        }
}

/*
 * Final teardown of a request: release its resources and task reference,
 * park it on ctx->locked_free_list for reuse and drop one ctx reference.
 */
static void __io_free_req(struct io_kiocb *req)
{
        /* cache ctx so it is still at hand after the request is parked */
        struct io_ring_ctx *ctx = req->ctx;

        io_dismantle_req(req);
        io_put_task(req->task, 1);

        /* locked_free_list and its counter are updated under completion_lock */
        spin_lock(&ctx->completion_lock);
        list_add(&req->inflight_entry, &ctx->locked_free_list);
        ctx->locked_free_nr++;
        spin_unlock(&ctx->completion_lock);

        percpu_ref_put(&ctx->refs);
}

/*
 * Unlink the request that directly follows @req in its chain; @req then
 * points at whatever came after it. @req->link must not be NULL.
 */
static inline void io_remove_next_linked(struct io_kiocb *req)
{
        struct io_kiocb *removed = req->link;
        struct io_kiocb *rest = removed->link;

        req->link = rest;
        removed->link = NULL;
}

/*
 * If the request linked behind @req is an IORING_OP_LINK_TIMEOUT, detach it
 * and try to cancel its timer. When hrtimer_try_to_cancel() does not return
 * -1, the timeout is completed with -ECANCELED and true is returned. In
 * every other case the result is false.
 */
static bool io_kill_linked_timeout(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_kiocb *tmo = req->link;
        struct io_timeout_data *data;

        if (!tmo || tmo->opcode != IORING_OP_LINK_TIMEOUT)
                return false;

        data = tmo->async_data;
        io_remove_next_linked(req);
        tmo->timeout.head = NULL;

        if (hrtimer_try_to_cancel(&data->timer) == -1)
                return false;

        list_del(&tmo->timeout.list);
        io_fill_cqe_req(tmo, -ECANCELED, 0);
        io_put_req_deferred(tmo);
        return true;
}

/*
 * Detach the whole chain hanging off @req and complete every member of it.
 * A member already flagged REQ_F_FAIL reports its own ->result; all others
 * report -ECANCELED. Each member's reference is dropped through
 * io_put_req_deferred().
 */
static void io_fail_links(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        struct io_kiocb *cur = req->link;

        req->link = NULL;
        while (cur) {
                struct io_kiocb *following = cur->link;
                long res;

                if (cur->flags & REQ_F_FAIL)
                        res = cur->result;
                else
                        res = -ECANCELED;

                cur->link = NULL;

                trace_io_uring_fail_link(req, cur);
                io_fill_cqe_req(cur, res, 0);
                io_put_req_deferred(cur);
                cur = following;
        }
}

/*
 * Take care of the requests linked behind @req that must not simply run:
 *  - with REQ_F_ARM_LTIMEOUT set, a directly linked IORING_OP_LINK_TIMEOUT
 *    is unlinked and completed with -ECANCELED;
 *  - otherwise, with REQ_F_LINK_TIMEOUT set, io_kill_linked_timeout() is
 *    run under ctx->timeout_lock;
 *  - if @req failed and is not a hard link, the rest of the chain is failed.
 *
 * Returns true when at least one CQE was posted, so the caller knows the CQ
 * ring needs committing.
 */
static bool io_disarm_next(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        bool posted = false;

        if (req->flags & REQ_F_ARM_LTIMEOUT) {
                struct io_kiocb *link = req->link;

                /* the flag is consumed whether or not a timeout is linked */
                req->flags &= ~REQ_F_ARM_LTIMEOUT;
                if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
                        io_remove_next_linked(req);
                        io_fill_cqe_req(link, -ECANCELED, 0);
                        io_put_req_deferred(link);
                        posted = true;
                }
        } else if (req->flags & REQ_F_LINK_TIMEOUT) {
                struct io_ring_ctx *ctx = req->ctx;

                /* io_kill_linked_timeout() must run with timeout_lock held */
                spin_lock_irq(&ctx->timeout_lock);
                posted = io_kill_linked_timeout(req);
                spin_unlock_irq(&ctx->timeout_lock);
        }
        if (unlikely((req->flags & REQ_F_FAIL) &&
                     !(req->flags & REQ_F_HARDLINK))) {
                /* io_fail_links() posts one CQE per remaining link, if any */
                posted |= (req->link != NULL);
                io_fail_links(req);
        }
        return posted;
}

/*
 * Detach and return the request linked behind @req (NULL if there is none).
 * When any IO_DISARM_MASK flag is set, io_disarm_next() runs first under
 * completion_lock, and the CQ ring is committed and waiters are notified if
 * it posted anything.
 */
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
        struct io_kiocb *nxt;

        /*
         * If LINK is set, we have dependent requests in this chain. If we
         * didn't fail this request, queue the first one up, moving any other
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
        if (req->flags & IO_DISARM_MASK) {
                struct io_ring_ctx *ctx = req->ctx;
                bool posted;

                spin_lock(&ctx->completion_lock);
                posted = io_disarm_next(req);
                if (posted)
                        io_commit_cqring(req->ctx);
                spin_unlock(&ctx->completion_lock);
                /* the notification is issued after the lock is dropped */
                if (posted)
                        io_cqring_ev_posted(ctx);
        }
        /* whatever is still linked after disarming is handed to the caller */
        nxt = req->link;
        req->link = NULL;
        return nxt;
}

/*
 * Fast-path wrapper: only requests flagged REQ_F_LINK or REQ_F_HARDLINK are
 * passed on to __io_req_find_next(); for all others the answer is NULL.
 */
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
        if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))
                return __io_req_find_next(req);

        return NULL;
}

/*
 * Release a ring that task-work processing was holding on to. With *@locked
 * set, pending batched completions are flushed, uring_lock is released and
 * *@locked is cleared. One ctx reference is dropped in any case. A NULL
 * @ctx is a no-op.
 */
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
        if (!ctx)
                return;

        if (*locked) {
                struct io_submit_state *state = &ctx->submit_state;

                if (state->compl_nr)
                        io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
                *locked = false;
        }

        percpu_ref_put(&ctx->refs);
}

/*
 * task_work callback that runs every request queued on the task_list of the
 * io_uring_task embedding @cb.
 *
 * The outer loop detaches the whole pending list under tctx->task_lock; the
 * inner loop invokes each request's io_task_work.func. Processing stops once
 * the list is found empty, at which point ->task_running is cleared under
 * the same lock.
 */
static void tctx_task_work(struct callback_head *cb)
{
        bool locked = false;
        struct io_ring_ctx *ctx = NULL;
        struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
                                                  task_work);

        while (1) {
                struct io_wq_work_node *node;

                /*
                 * Unlocked peek: nothing more is queued right now, so flush
                 * the completions batched so far. ctx is only dereferenced
                 * when locked is true.
                 */
                if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
                        io_submit_flush_completions(ctx);

                /* grab the entire list in one go and leave an empty one behind */
                spin_lock_irq(&tctx->task_lock);
                node = tctx->task_list.first;
                INIT_WQ_LIST(&tctx->task_list);
                if (!node)
                        tctx->task_running = false;
                spin_unlock_irq(&tctx->task_lock);
                if (!node)
                        break;

                do {
                        /* fetch ->next before the handler runs */
                        struct io_wq_work_node *next = node->next;
                        struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                            io_task_work.node);

                        /* switching rings: release the old one, pin the new one */
                        if (req->ctx != ctx) {
                                ctx_flush_and_put(ctx, &locked);
                                ctx = req->ctx;
                                /* if not contended, grab and improve batching */
                                locked = mutex_trylock(&ctx->uring_lock);
                                percpu_ref_get(&ctx->refs);
                        }
                        req->io_task_work.func(req, &locked);
                        node = next;
                        if (unlikely(need_resched())) {
                                /*
                                 * Let go of the ring before yielding; with
                                 * ctx reset, the next request re-acquires it.
                                 */
                                ctx_flush_and_put(ctx, &locked);
                                ctx = NULL;
                                cond_resched();
                        }
                } while (node);
        }

        ctx_flush_and_put(ctx, &locked);

        /* relaxed read is enough as only the task itself sets ->in_idle */
        if (unlikely(atomic_read(&tctx->in_idle)))
                io_uring_drop_tctx_refs(current);
}

/*
 * Queue @req on its task's io_uring task_list and make sure the task_work
 * that drains the list is scheduled.
 *
 * Only the caller that flips ->task_running from false to true goes on to
 * call task_work_add(). If that call fails, everything queued so far is
 * moved to the per-ctx fallback list and handled by ctx->fallback_work.
 */
static void io_req_task_work_add(struct io_kiocb *req)
{
        struct task_struct *tsk = req->task;
        struct io_uring_task *tctx = tsk->io_uring;
        enum task_work_notify_mode notify;
        struct io_wq_work_node *node;
        unsigned long flags;
        bool running;

        WARN_ON_ONCE(!tctx);

        /* enqueue and sample/set ->task_running atomically w.r.t. the drainer */
        spin_lock_irqsave(&tctx->task_lock, flags);
        wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
        running = tctx->task_running;
        if (!running)
                tctx->task_running = true;
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        /* task_work already pending, we're done */
        if (running)
                return;

        /*
         * SQPOLL kernel thread doesn't need notification, just a wakeup. For
         * all other cases, use TWA_SIGNAL unconditionally to ensure we're
         * processing task_work. There's no reliable way to tell if TWA_RESUME
         * will do the job.
         */
        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
        if (!task_work_add(tsk, &tctx->task_work, notify)) {
                wake_up_process(tsk);
                return;
        }

        /*
         * task_work_add() failed: undo ->task_running and take over
         * everything that has been queued on the list so far.
         */
        spin_lock_irqsave(&tctx->task_lock, flags);
        tctx->task_running = false;
        node = tctx->task_list.first;
        INIT_WQ_LIST(&tctx->task_list);
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        while (node) {
                req = container_of(node, struct io_kiocb, io_task_work.node);
                node = node->next;
                /* the delayed work is scheduled only when llist_add() returns true */
                if (llist_add(&req->io_task_work.fallback_node,
                              &req->ctx->fallback_llist))
                        schedule_delayed_work(&req->ctx->fallback_work, 1);
        }
}

/*
 * Task-work handler that fails @req with the result previously stored in
 * req->result.
 */
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
        /* not needed for normal modes, but SQPOLL depends on it */
        io_tw_lock(req->ctx, locked);

        io_req_complete_failed(req, req->result);
}

/*
 * Task-work handler that issues @req through __io_queue_sqe(), unless the
 * owning task is exiting, in which case the request fails with -EFAULT.
 */
static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
        io_tw_lock(req->ctx, locked);

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING)) {
                io_req_complete_failed(req, -EFAULT);
                return;
        }

        __io_queue_sqe(req);
}

/*
 * Arrange for @req to be failed from task work: @ret is stored in
 * req->result for io_req_task_cancel() to pick up.
 */
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
        req->io_task_work.func = io_req_task_cancel;
        req->result = ret;

        io_req_task_work_add(req);
}

/* Queue @req to task work with io_req_task_submit() as its handler. */
static void io_req_task_queue(struct io_kiocb *req)
{
        /* the handler must be in place before the request becomes visible */
        req->io_task_work.func = io_req_task_submit;
        io_req_task_work_add(req);
}

/* Queue @req to task work with io_queue_async_work() as its handler. */
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
        /* the handler must be in place before the request becomes visible */
        req->io_task_work.func = io_queue_async_work;
        io_req_task_work_add(req);
}

/* If a request is linked behind @req, detach it and queue it to task work. */
static inline void io_queue_next(struct io_kiocb *req)
{
        struct io_kiocb *successor = io_req_find_next(req);

        if (!successor)
                return;

        io_req_task_queue(successor);
}

/*
 * Queue the request linked behind @req (if there is one) to task work, then
 * free @req itself.
 */
static void io_free_req(struct io_kiocb *req)
{
        struct io_kiocb *successor = io_req_find_next(req);

        if (successor)
                io_req_task_queue(successor);

        __io_free_req(req);
}

/*
 * Task-work flavoured variant of io_free_req(): queue the next linked
 * request and free @req. @locked is part of the handler signature and is
 * not used.
 */
static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
        io_queue_next(req);
        __io_free_req(req);
}

/*
 * Accumulator used while freeing several requests in a row, so that task
 * and ctx references can be dropped in bulk rather than one at a time.
 */
struct req_batch {
        /* task the pending task_refs belong to; NULL while nothing is batched */
        struct task_struct        *task;
        /* number of references to hand to io_put_task() for ->task */
        int                        task_refs;
        /* number of references to hand to percpu_ref_put_many() on ctx->refs */
        int                        ctx_refs;
};

/* Reset @rb to the empty state: no task and no pending references. */
static inline void io_init_req_batch(struct req_batch *rb)
{
        rb->task = NULL;
        rb->task_refs = 0;
        rb->ctx_refs = 0;
}

static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
                                     struct req_batch *rb)
{
        if (rb->ctx_refs)
                percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
        if (rb->task)
                io_put_task(rb->task, rb->task_refs);
}

/*
 * Free @req as part of a batch: queue its linked successor, release its
 * resources, account one task and one ctx reference in @rb and recycle the
 * request into @state (the array first, the free list when the array is
 * full).
 */
static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
                              struct io_submit_state *state)
{
        io_queue_next(req);
        io_dismantle_req(req);

        /* a different task: settle what was batched for the previous one */
        if (rb->task != req->task) {
                if (rb->task)
                        io_put_task(rb->task, rb->task_refs);
                rb->task_refs = 0;
                rb->task = req->task;
        }
        rb->task_refs++;
        rb->ctx_refs++;

        if (state->free_reqs == ARRAY_SIZE(state->reqs)) {
                list_add(&req->inflight_entry, &state->free_list);
                return;
        }
        state->reqs[state->free_reqs++] = req;
}

/*
 * Post the completions batched in ctx->submit_state.compl_reqs and drop one
 * reference on each of those requests.
 *
 * All CQEs are filled and committed in one completion_lock section; the
 * requests are then released in a second pass using a req_batch so that
 * task/ctx references are dropped in bulk.
 */
static void io_submit_flush_completions(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state = &ctx->submit_state;
        int i, nr = state->compl_nr;
        struct req_batch rb;

        /* pass 1: publish every stashed result under a single lock hold */
        spin_lock(&ctx->completion_lock);
        for (i = 0; i < nr; i++) {
                struct io_kiocb *req = state->compl_reqs[i];

                __io_fill_cqe(ctx, req->user_data, req->result,
                              req->compl.cflags);
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        io_cqring_ev_posted(ctx);

        /* pass 2: drop the references, freeing requests that hit zero */
        io_init_req_batch(&rb);
        for (i = 0; i < nr; i++) {
                struct io_kiocb *req = state->compl_reqs[i];

                if (req_ref_put_and_test(req))
                        io_req_free_batch(&rb, req, &ctx->submit_state);
        }

        io_req_free_batch_finish(ctx, &rb);
        /* the batch is only marked empty once everything has been processed */
        state->compl_nr = 0;
}

/*
 * Drop one reference to @req. If it was the last one, free the request and
 * return the next request in its chain (NULL if there is none). While
 * other references remain, nothing is freed and NULL is returned.
 */
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
        struct io_kiocb *successor;

        if (!req_ref_put_and_test(req))
                return NULL;

        successor = io_req_find_next(req);
        __io_free_req(req);
        return successor;
}

/* Drop one reference to @req and free it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
        if (!req_ref_put_and_test(req))
                return;

        io_free_req(req);
}

/*
 * Drop one reference to @req; if it was the last one, the actual free is
 * pushed to task work (io_free_req_work) instead of being done here.
 */
static inline void io_put_req_deferred(struct io_kiocb *req)
{
        if (!req_ref_put_and_test(req))
                return;

        req->io_task_work.func = io_free_req_work;
        io_req_task_work_add(req);
}

/*
 * Return __io_cqring_events() for @ctx, evaluated after a read memory
 * barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
        unsigned pending;

        /* See comment at the top of this file */
        smp_rmb();
        pending = __io_cqring_events(ctx);
        return pending;
}

/*
 * Distance between the SQ ring tail (loaded with acquire semantics) and
 * ctx->cached_sq_head.
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
        unsigned int entries;

        /* make sure SQ entry isn't read before tail */
        entries = smp_load_acquire(&ctx->rings->sq.tail) - ctx->cached_sq_head;
        return entries;
}

/*
 * Release the selected buffer @kbuf and build the CQE flags describing it:
 * IORING_CQE_F_BUFFER plus the buffer id shifted by
 * IORING_CQE_BUFFER_SHIFT. REQ_F_BUFFER_SELECTED is cleared on @req and
 * @kbuf is freed.
 */
static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
{
        unsigned int cflags = IORING_CQE_F_BUFFER;

        /* the id has to be read before the buffer is freed below */
        cflags |= kbuf->bid << IORING_CQE_BUFFER_SHIFT;
        req->flags &= ~REQ_F_BUFFER_SELECTED;
        kfree(kbuf);
        return cflags;
}

/*
 * For a read/write request with REQ_F_BUFFER_SELECTED set, release the
 * buffer whose pointer is stored in req->rw.addr and return the matching
 * CQE flags. Without that flag the result is 0.
 */
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) {
                struct io_buffer *selected;

                selected = (struct io_buffer *) (unsigned long) req->rw.addr;
                return io_put_kbuf(req, selected);
        }

        return 0;
}

/*
 * Process pending notification work for the current task.
 *
 * Returns true when the TIF_NOTIFY_SIGNAL / ->task_works branch ran, false
 * otherwise. The PF_IO_WORKER branch does not influence the return value.
 */
static inline bool io_run_task_work(void)
{
        /*
         * PF_IO_WORKER never returns to userspace, so check here if we have
         * notify work that needs processing.
         */
        if (current->flags & PF_IO_WORKER &&
            test_thread_flag(TIF_NOTIFY_RESUME)) {
                __set_current_state(TASK_RUNNING);
                tracehook_notify_resume(NULL);
        }
        if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
                /* the task state is set to running before the work executes */
                __set_current_state(TASK_RUNNING);
                tracehook_notify_signal();
                return true;
        }

        return false;
}

/*
 * Find and free completed poll iocbs
 *
 * Every request on @done gets a CQE (or an overflow entry), *@nr_events is
 * incremented once per request, and requests whose last reference is
 * dropped here are freed in a batch. The CQ ring is committed once at the
 * end.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                               struct list_head *done)
{
        struct req_batch rb;
        struct io_kiocb *req;

        /* order with ->result store in io_complete_rw_iopoll() */
        smp_rmb();

        io_init_req_batch(&rb);
        while (!list_empty(done)) {
                struct io_uring_cqe *cqe;
                unsigned cflags;

                req = list_first_entry(done, struct io_kiocb, inflight_entry);
                list_del(&req->inflight_entry);
                cflags = io_put_rw_kbuf(req);
                (*nr_events)++;

                /*
                 * NOTE(review): the CQ slot is filled without taking
                 * completion_lock; presumably the caller's uring_lock
                 * serializes this path -- confirm. Only the overflow path
                 * takes the lock.
                 */
                cqe = io_get_cqe(ctx);
                if (cqe) {
                        WRITE_ONCE(cqe->user_data, req->user_data);
                        WRITE_ONCE(cqe->res, req->result);
                        WRITE_ONCE(cqe->flags, cflags);
                } else {
                        spin_lock(&ctx->completion_lock);
                        io_cqring_event_overflow(ctx, req->user_data,
                                                        req->result, cflags);
                        spin_unlock(&ctx->completion_lock);
                }

                if (req_ref_put_and_test(req))
                        io_req_free_batch(&rb, req, &ctx->submit_state);
        }

        /* the flush part of the commit, when needed, runs under the lock */
        if (io_commit_needs_flush(ctx)) {
                spin_lock(&ctx->completion_lock);
                __io_commit_cqring_flush(ctx);
                spin_unlock(&ctx->completion_lock);
        }
        __io_commit_cqring(ctx);
        io_cqring_ev_posted_iopoll(ctx);
        io_req_free_batch_finish(ctx, &rb);
}

/*
 * One pass over ctx->iopoll_list: collect requests already flagged
 * ->iopoll_completed, poll the file's ->iopoll() for those that are not, and
 * complete whatever was collected.
 *
 * @nr_events: running count of reaped events, advanced by
 *             io_iopoll_complete()
 * @min:       spinning is only requested from ->iopoll() while *@nr_events
 *             is below this value
 *
 * Returns the negative value reported by ->iopoll() on error, 0 otherwise.
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
{
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
        bool spin;

        /*
         * Only spin for completions if we don't have multiple devices hanging
         * off our complete list, and we're under the requested amount.
         */
        spin = !ctx->poll_multi_queue && *nr_events < min;

        list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
                struct kiocb *kiocb = &req->rw.kiocb;
                int ret;

                /*
                 * Move completed and retryable entries to our local lists.
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
                if (READ_ONCE(req->iopoll_completed)) {
                        list_move_tail(&req->inflight_entry, &done);
                        continue;
                }
                if (!list_empty(&done))
                        break;

                /* 'done' is empty here, so the error return loses nothing */
                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (unlikely(ret < 0))
                        return ret;
                else if (ret)
                        /* a non-zero result turns spinning off for later polls */
                        spin = false;

                /* iopoll may have completed current req */
                if (READ_ONCE(req->iopoll_completed))
                        list_move_tail(&req->inflight_entry, &done);
        }

        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);

        return 0;
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 *
 * Does nothing unless the ring was set up with IORING_SETUP_IOPOLL.
 * Otherwise io_do_iopoll() is run under uring_lock until the iopoll list is
 * empty or a pass reaps nothing.
 */
static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_IOPOLL))
                return;

        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->iopoll_list)) {
                unsigned int nr_events = 0;

                /* the return value is ignored; progress is judged by nr_events */
                io_do_iopoll(ctx, &nr_events, 0);

                /* let it sleep and repeat later if can't complete a request */
                if (nr_events == 0)
                        break;
                /*
                 * Ensure we allow local-to-the-cpu processing to take place,
                 * in this case we need to ensure that we reap all events.
                 * Also let task_work, etc. to progress by releasing the mutex
                 */
                if (need_resched()) {
                        mutex_unlock(&ctx->uring_lock);
                        cond_resched();
                        mutex_lock(&ctx->uring_lock);
                }
        }
        mutex_unlock(&ctx->uring_lock);
}

/*
 * Poll for completions until at least @min events have been reaped, an
 * error occurs, a signal is pending or a reschedule is needed.
 *
 * Returns 0, -EINTR when a signal is pending after a polling pass, or the
 * error reported by io_do_iopoll().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
        unsigned int nr_events = 0;
        int ret = 0;

        /*
         * We disallow the app entering submit/complete with polling, but we
         * still need to lock the ring to prevent racing with polled issue
         * that got punted to a workqueue.
         */
        mutex_lock(&ctx->uring_lock);
        /*
         * Don't enter poll loop if we already have events pending.
         * If we do, we can potentially be spinning for commands that
         * already triggered a CQE (eg in error).
         */
        if (test_bit(0, &ctx->check_cq_overflow))
                __io_cqring_overflow_flush(ctx, false);
        if (io_cqring_events(ctx))
                goto out;
        do {
                /*
                 * If a submit got punted to a workqueue, we can have the
                 * application entering polling for a command before it gets
                 * issued. That app will hold the uring_lock for the duration
                 * of the poll right here, so we need to take a breather every
                 * now and then to ensure that the issue has a chance to add
                 * the poll to the issued list. Otherwise we can spin here
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
                if (list_empty(&ctx->iopoll_list)) {
                        /* remember the CQ tail to detect CQEs posted meanwhile */
                        u32 tail = ctx->cached_cq_tail;

                        mutex_unlock(&ctx->uring_lock);
                        io_run_task_work();
                        mutex_lock(&ctx->uring_lock);

                        /* some requests don't go through iopoll_list */
                        if (tail != ctx->cached_cq_tail ||
                            list_empty(&ctx->iopoll_list))
                                break;
                }
                ret = io_do_iopoll(ctx, &nr_events, min);

                /* a pending signal overrides whatever io_do_iopoll() returned */
                if (task_sigpending(current)) {
                        ret = -EINTR;
                        goto out;
                }
        } while (!ret && nr_events < min && !need_resched());
out:
        mutex_unlock(&ctx->uring_lock);
        return ret;
}

/*
 * Call kiocb_end_write() on the request's kiocb, but only for requests
 * flagged REQ_F_ISREG.
 */
static void io_req_end_write(struct io_kiocb *req)
{
        if (!(req->flags & REQ_F_ISREG))
                return;

        kiocb_end_write(&req->rw.kiocb);
}

#ifdef CONFIG_BLOCK
/*
 * Get @req ready to be issued again. With async data already attached, the
 * iterator is rewound to its saved state and true is returned. Without it,
 * the result is true exactly when io_req_prep_async() returns 0.
 */
static bool io_resubmit_prep(struct io_kiocb *req)
{
        struct io_async_rw *async = req->async_data;

        if (async) {
                iov_iter_restore(&async->iter, &async->iter_state);
                return true;
        }

        return !io_req_prep_async(req);
}

/*
 * Decide whether a read/write that came back with a retryable error may be
 * reissued. Every check below has to pass for the answer to be true.
 */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        umode_t mode = file_inode(req->file)->i_mode;

        /* only block devices and regular files qualify */
        if (!(S_ISBLK(mode) || S_ISREG(mode)))
                return false;

        if (req->flags & REQ_F_NOWAIT)
                return false;

        /* from an io-wq worker, only IOPOLL rings are eligible */
        if (io_wq_current_is_worker() && !(ctx->flags & IORING_SETUP_IOPOLL))
                return false;

        /*
         * If ref is dying, we might be running poll reap from the exit work.
         * Don't attempt to reissue from that path, just let it fail with
         * -EAGAIN.
         */
        if (percpu_ref_is_dying(&ctx->refs))
                return false;

        /*
         * Play it safe and assume not safe to re-import and reissue if we're
         * not in the original thread group (or in task context).
         */
        return same_thread_group(req->task, current) && in_task();
}
#else
/* Variant built when CONFIG_BLOCK is not defined: always reports false. */
static bool io_resubmit_prep(struct io_kiocb *req)
{
        return false;
}
/* Variant built when CONFIG_BLOCK is not defined: reissue is never allowed. */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
        return false;
}
#endif

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any: writes (IOCB_WRITE set) end the write and raise
 * fsnotify_modify(); everything else raises fsnotify_access().
 */
static void io_req_io_end(struct io_kiocb *req)
{
        struct io_rw *rw = &req->rw;

        if (!(rw->kiocb.ki_flags & IOCB_WRITE)) {
                fsnotify_access(req->file);
                return;
        }

        io_req_end_write(req);
        fsnotify_modify(req->file);
}

/*
 * Shared handling of a read/write result.
 *
 * Returns true when the request has been flagged REQ_F_REISSUE and must not
 * be completed by the caller. Otherwise returns false; if @res differs from
 * req->result, the request is marked failed and @res is recorded.
 */
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
        /* result is what was expected: nothing to adjust */
        if (res == req->result)
                return false;

        if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
            io_rw_should_reissue(req)) {
                /*
                 * Reissue will start accounting again, finish the
                 * current cycle.
                 */
                io_req_io_end(req);
                req->flags |= REQ_F_REISSUE;
                return true;
        }

        req_set_fail(req);
        req->result = res;
        return false;
}

/*
 * Fold IO done earlier into @res. When the async data records a positive
 * ->bytes_done, a negative @res is replaced by that amount and a
 * non-negative @res has it added. Otherwise @res is returned as is.
 */
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
        struct io_async_rw *async = req->async_data;

        /* no previously done IO to account for */
        if (!async || async->bytes_done <= 0)
                return res;

        if (res >= 0)
                res += async->bytes_done;
        else
                res = async->bytes_done;
        return res;
}

/*
 * Task-work handler that completes @req with the result stored in
 * req->result, releasing a selected buffer first. With *@locked set, the
 * completion is batched in the submit state (flushed once the batch array
 * is full); otherwise it is posted right away.
 */
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
        unsigned int cflags = io_put_rw_kbuf(req);
        int res = req->result;
        struct io_submit_state *state;
        struct io_ring_ctx *ctx;

        if (!*locked) {
                io_req_complete_post(req, res, cflags);
                return;
        }

        ctx = req->ctx;
        state = &ctx->submit_state;

        io_req_complete_state(req, res, cflags);
        state->compl_reqs[state->compl_nr++] = req;
        if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
                io_submit_flush_completions(ctx);
}

/*
 * Task-work handler for read/write requests: finish the IO accounting and
 * notifications via io_req_io_end(), then complete the request.
 */
static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
{
        io_req_io_end(req);
        io_req_task_complete(req, locked);
}

/*
 * kiocb completion callback. Recovers the io_kiocb embedding @kiocb and,
 * unless the request was flagged for reissue, records the result (including
 * any previously done IO) and defers the final completion to task work.
 * @res2 is not used.
 */
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        /* true means the request is going to be reissued: nothing to post */
        if (__io_complete_rw_common(req, res))
                return;
        req->result = io_fixup_rw_res(req, res);
        req->io_task_work.func = io_req_rw_complete;
        io_req_task_work_add(req);
}

/*
 * kiocb completion callback that publishes the result through
 * req->result / req->iopoll_completed instead of posting a CQE. @res2 is
 * not used.
 *
 * When the request is flagged REQ_F_REISSUE, the function returns without
 * setting ->iopoll_completed.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        if (kiocb->ki_flags & IOCB_WRITE)
                io_req_end_write(req);
        if (unlikely(res != req->result)) {
                if (res == -EAGAIN && io_rw_should_reissue(req)) {
                        req->flags |= REQ_F_REISSUE;
                        return;
                }
        }

        /* the result must be visible before the completed flag is */
        WRITE_ONCE(req->result, res);
        /* order with io_iopoll_complete() checking ->result */
        smp_wmb();
        WRITE_ONCE(req->iopoll_completed, 1);
}

/*
 * Put an issued IOPOLL request on the ring's poll list.
 *
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req)
{
        const bool in_async = io_wq_current_is_worker();
        struct io_ring_ctx *ctx = req->ctx;

        /* workqueue context doesn't hold uring_lock, grab it now */
        if (unlikely(in_async))
                mutex_lock(&ctx->uring_lock);

        /*
         * Track whether we have multiple files in our lists. This will impact
         * how we do polling eventually, not spinning if we're on potentially
         * different devices. An empty list resets the state; otherwise compare
         * against the first queued request, by file and then by queue number.
         */
        if (list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *first;

                first = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
                                         inflight_entry);

                if (first->file != req->file ||
                    blk_qc_t_to_queue_num(first->rw.kiocb.ki_cookie) !=
                    blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie))
                        ctx->poll_multi_queue = true;
        }

        /*
         * For fast devices, IO may have already completed. If it has, add
         * it to the front so we find it first.
         */
        if (READ_ONCE(req->iopoll_completed))
                list_add(&req->inflight_entry, &ctx->iopoll_list);
        else
                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

        if (likely(!in_async))
                return;

        /*
         * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq
         * thread task context or in io worker task context. If the current
         * task context is the sq thread, we don't need to check whether we
         * should wake up the sq thread; only the worker case gets here.
         */
        if ((ctx->flags & IORING_SETUP_SQPOLL) &&
            wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);

        mutex_unlock(&ctx->uring_lock);
}

static bool io_bdev_nowait(struct block_device *bdev)
{
        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * Decide whether @file can be driven with non-blocking IO in direction @rw
 * (READ or WRITE).
 *
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static bool __io_file_supports_nowait(struct file *file, int rw)
{
        umode_t mode = file_inode(file)->i_mode;

        /* block devices: the device's queue decides */
        if (S_ISBLK(mode))
                return IS_ENABLED(CONFIG_BLOCK) &&
                       io_bdev_nowait(I_BDEV(file->f_mapping->host));

        if (S_ISSOCK(mode))
                return true;

        /* regular files: backing device decides, io_uring fds are excluded */
        if (S_ISREG(mode))
                return IS_ENABLED(CONFIG_BLOCK) &&
                       io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
                       file->f_op != &io_uring_fops;

        /* any ->read/write should understand O_NONBLOCK */
        if (file->f_flags & O_NONBLOCK)
                return true;

        if (!(file->f_mode & FMODE_NOWAIT))
                return false;

        /* FMODE_NOWAIT only helps if the matching iter method exists */
        return rw == READ ? file->f_op->read_iter != NULL :
                            file->f_op->write_iter != NULL;
}

/*
 * Per-request nowait check for direction @rw. The REQ_F_NOWAIT_READ and
 * REQ_F_NOWAIT_WRITE request flags short-circuit the answer to true;
 * otherwise the file itself is inspected.
 */
static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
{
        if ((rw == READ && (req->flags & REQ_F_NOWAIT_READ)) ||
            (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)))
                return true;

        return __io_file_supports_nowait(req->file, rw);
}

/*
 * Common prep for read and write requests: fill in the kiocb embedded in
 * @req from @sqe and record the user address/length for the later import.
 *
 * @rw is READ or WRITE; here it is only used for the nowait-support check.
 *
 * Returns 0 on success or a negative error:
 *  - whatever kiocb_set_rw_flags() or ioprio_check_cap() report,
 *  - -EOPNOTSUPP on an IOPOLL ring if the kiocb is not IOCB_DIRECT or the
 *    file has no ->iopoll,
 *  - -EINVAL if IOCB_HIPRI is set on a ring without IOPOLL,
 *  - -EFAULT if a fixed read/write names a buffer index that is out of range.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                      int rw)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct file *file = req->file;
        unsigned ioprio;
        int ret;

        if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
                req->flags |= REQ_F_ISREG;

        kiocb->ki_pos = READ_ONCE(sqe->off);
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;

        /*
         * If the file is marked O_NONBLOCK, still allow retry for it if it
         * supports async. Otherwise it's impossible to use O_NONBLOCK files
         * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
         */
        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
            ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
                req->flags |= REQ_F_NOWAIT;

        /* an explicit sqe priority must pass the capability check */
        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
                ret = ioprio_check_cap(ioprio);
                if (ret)
                        return ret;

                kiocb->ki_ioprio = ioprio;
        } else {
                kiocb->ki_ioprio = get_current_ioprio();
        }

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
                    !kiocb->ki_filp->f_op->iopoll)
                        return -EOPNOTSUPP;

                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->iopoll_completed = 0;
        } else {
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
                kiocb->ki_complete = io_complete_rw;
        }

        /* used for fixed read/write too - just read unconditionally */
        req->buf_index = READ_ONCE(sqe->buf_index);
        req->imu = NULL;

        if (req->opcode == IORING_OP_READ_FIXED ||
            req->opcode == IORING_OP_WRITE_FIXED) {
                u16 index;

                /* reuses the function-scope ctx; no need to re-declare it */
                if (unlikely(req->buf_index >= ctx->nr_user_bufs))
                        return -EFAULT;
                /* clamp the index under speculation before using it */
                index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
                req->imu = ctx->user_bufs[index];
                io_req_set_rsrc_node(req);
        }

        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Hand the result of a read/write attempt to the kiocb's ->ki_complete().
 * -EIOCBQUEUED is left alone, as nothing is completed for it here. Any of
 * the restart codes is reported to the completion handler as -EINTR.
 */
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
        if (ret == -EIOCBQUEUED)
                return;

        /*
         * We can't just restart the syscall, since previously
         * submitted sqes may already be in progress. Just fail this
         * IO with EINTR.
         */
        if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
            ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
                ret = -EINTR;

        kiocb->ki_complete(kiocb, ret, 0);
}

/*
 * Resolve the file position to use for the request's kiocb.
 *
 * An explicit position (anything but -1) is used as is. With -1, non-stream
 * files take the current file position and the request is marked
 * REQ_F_CUR_POS; stream files get position 0 and a NULL return.
 *
 * Returns a pointer to kiocb->ki_pos, or NULL for the stream case.
 */
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
        struct kiocb *kiocb = &req->rw.kiocb;

        if (kiocb->ki_pos == -1) {
                if (req->file->f_mode & FMODE_STREAM) {
                        kiocb->ki_pos = 0;
                        return NULL;
                }

                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = req->file->f_pos;
        }

        return &kiocb->ki_pos;
}

/*
 * Finish a read/write attempt that returned @ret.
 *
 * - If the request tracks the file's current position (REQ_F_CUR_POS), the
 *   updated kiocb position is written back to the file first.
 * - A non-negative @ret on a kiocb using io_complete_rw is completed right
 *   here through __io_req_complete(), unless __io_complete_rw_common()
 *   returns non-zero. Everything else goes through io_rw_done().
 * - If REQ_F_REISSUE is set afterwards, the flag is cleared and the request
 *   is either queued for reissue or, if io_resubmit_prep() fails, completed
 *   as failed.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
                       unsigned int issue_flags)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        if (req->flags & REQ_F_CUR_POS)
                req->file->f_pos = kiocb->ki_pos;
        if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
                if (!__io_complete_rw_common(req, ret)) {
                        /*
                         * Safe to call io_end from here as we're inline
                         * from the submission path.
                         */
                        io_req_io_end(req);
                        __io_req_complete(req, issue_flags,
                                          io_fixup_rw_res(req, ret),
                                          io_put_rw_kbuf(req));
                }
        } else {
                io_rw_done(kiocb, ret);
        }

        if (req->flags & REQ_F_REISSUE) {
                req->flags &= ~REQ_F_REISSUE;
                if (io_resubmit_prep(req)) {
                        io_req_task_queue_reissue(req);
                } else {
                        unsigned int cflags = io_put_rw_kbuf(req);
                        struct io_ring_ctx *ctx = req->ctx;

                        ret = io_fixup_rw_res(req, ret);
                        req_set_fail(req);
                        /*
                         * Without IO_URING_F_NONBLOCK the completion is
                         * wrapped in uring_lock.
                         * NOTE(review): presumably because that path runs
                         * without the lock held - confirm against callers.
                         */
                        if (!(issue_flags & IO_URING_F_NONBLOCK)) {
                                mutex_lock(&ctx->uring_lock);
                                __io_req_complete(req, issue_flags, ret, cflags);
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                __io_req_complete(req, issue_flags, ret, cflags);
                        }
                }
        }
}

/*
 * Set up @iter as a bvec iterator over the part of the registered buffer
 * @imu described by req->rw.addr and req->rw.len.
 *
 * Returns 0 on success, or -EFAULT if addr + len overflows or the range is
 * not fully inside [imu->ubuf, imu->ubuf_end].
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
                             struct io_mapped_ubuf *imu)
{
        size_t len = req->rw.len;
        u64 buf_end, buf_addr = req->rw.addr;
        size_t offset;

        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                return -EFAULT;
        /* not inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
                return -EFAULT;

        /*
         * May not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset lies within the first bvec, just use
                 * iov_iter_advance(). Otherwise skip the first segment (which
                 * may not be PAGE_SIZE aligned) and index the rest by page.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset < bvec->bv_len) {
                        iov_iter_advance(iter, offset);
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        /* first vec plus every full page covered by offset */
                        seg_skip = 1 + (offset >> PAGE_SHIFT);

                        iter->bvec = bvec + seg_skip;
                        iter->nr_segs -= seg_skip;
                        /* total bytes skipped: first vec + remaining offset */
                        iter->count -= bvec->bv_len + offset;
                        /* leftover offset inside the segment we landed on */
                        iter->iov_offset = offset & ~PAGE_MASK;
                }
        }

        return 0;
}

/*
 * Import the registered buffer attached to the request (req->imu) into
 * @iter. Warns once and returns -EFAULT if no buffer is attached; otherwise
 * returns the result of __io_import_fixed().
 */
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
        if (WARN_ON_ONCE(!req->imu))
                return -EFAULT;
        return __io_import_fixed(req, rw, iter, req->imu);
}

/* Release ctx->uring_lock when @needs_lock is set; otherwise do nothing. */
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
        if (needs_lock)
                mutex_unlock(&ctx->uring_lock);
}

/* Acquire ctx->uring_lock when @needs_lock is set; otherwise do nothing. */
static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
        /*
         * "Normal" inline submissions always hold the uring_lock, since we
         * grab it from the system call. Same is true for the SQPOLL offload.
         * The only exception is when we've detached the request and issue it
         * from an async worker thread, grab the lock for that case.
         */
        if (needs_lock)
                mutex_lock(&ctx->uring_lock);
}

/*
 * Pick a provided buffer from buffer group @bgid for @req.
 *
 * If the request already has a buffer selected, @kbuf is returned untouched.
 * Otherwise the group is looked up under uring_lock (taken here when
 * @needs_lock is set): the last entry on the group's list is taken, or the
 * group head itself when the list is empty, in which case the group is also
 * removed from the xarray. *len is clamped to the chosen buffer's length.
 *
 * Returns the buffer, or ERR_PTR(-ENOBUFS) if the group does not exist.
 */
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                                          int bgid, struct io_buffer *kbuf,
                                          bool needs_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *group;

        if (req->flags & REQ_F_BUFFER_SELECTED)
                return kbuf;

        io_ring_submit_lock(ctx, needs_lock);

        lockdep_assert_held(&ctx->uring_lock);

        group = xa_load(&ctx->io_buffers, bgid);
        if (!group) {
                kbuf = ERR_PTR(-ENOBUFS);
                goto unlock;
        }

        if (list_empty(&group->list)) {
                /* last buffer of the group: hand out the head itself */
                kbuf = group;
                xa_erase(&ctx->io_buffers, bgid);
        } else {
                kbuf = list_last_entry(&group->list, struct io_buffer, list);
                list_del(&kbuf->list);
        }

        if (*len > kbuf->len)
                *len = kbuf->len;

unlock:
        io_ring_submit_unlock(ctx, needs_lock);
        return kbuf;
}

/*
 * Select a provided buffer for a read/write request.
 *
 * req->buf_index names the buffer group. req->rw.addr is passed in as the
 * candidate io_buffer pointer and, on success, is overwritten with the
 * chosen io_buffer while REQ_F_BUFFER_SELECTED is set on the request.
 *
 * Returns the user address of the buffer, or the ERR_PTR from
 * io_buffer_select(). *len may be reduced to the buffer's length.
 */
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
                                        bool needs_lock)
{
        struct io_buffer *sel = (struct io_buffer *) (unsigned long) req->rw.addr;
        u16 group_id = req->buf_index;

        sel = io_buffer_select(req, len, group_id, sel, needs_lock);
        if (IS_ERR(sel))
                return sel;

        /* remember the choice in the request */
        req->rw.addr = (u64) (unsigned long) sel;
        req->flags |= REQ_F_BUFFER_SELECTED;

        return u64_to_user_ptr(sel->addr);
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec provided-buffer import: read the length
 * from the user's compat_iovec at req->rw.addr, select a buffer for it and
 * describe that buffer in iov[0].
 *
 * Returns 0, -EFAULT if the user iovec cannot be read, -EINVAL for a
 * negative length, or the error from buffer selection.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t clen;
        void __user *ubuf;
        ssize_t len;

        if (!access_ok(uiov, sizeof(*uiov)) ||
            __get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        len = clen;
        ubuf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(ubuf))
                return PTR_ERR(ubuf);

        iov[0].iov_base = ubuf;
        iov[0].iov_len = (compat_size_t) len;
        return 0;
}
#endif

/*
 * Native flavour of the single-iovec provided-buffer import: copy the user's
 * iovec at req->rw.addr into iov[0], select a buffer for its length and
 * rewrite iov[0] to describe that buffer.
 *
 * Returns 0, -EFAULT if the copy fails, -EINVAL for a negative length, or
 * the error from buffer selection.
 */
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                      bool needs_lock)
{
        struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        void __user *ubuf;
        ssize_t want;

        if (copy_from_user(iov, uiov, sizeof(*uiov)))
                return -EFAULT;

        want = iov[0].iov_len;
        if (want < 0)
                return -EINVAL;

        ubuf = io_rw_buffer_select(req, &want, needs_lock);
        if (IS_ERR(ubuf))
                return PTR_ERR(ubuf);

        iov[0].iov_base = ubuf;
        iov[0].iov_len = want;
        return 0;
}

/*
 * Fill iov[0] from a provided buffer for a vectored request.
 *
 * If a buffer was already selected, iov[0] is rebuilt from the io_buffer
 * stored in req->rw.addr. Otherwise exactly one iovec is required
 * (-EINVAL if req->rw.len != 1) and the compat or native import picks the
 * buffer.
 */
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                    bool needs_lock)
{
        if (req->flags & REQ_F_BUFFER_SELECTED) {
                struct io_buffer *sel =
                        (struct io_buffer *) (unsigned long) req->rw.addr;

                iov[0].iov_len = sel->len;
                iov[0].iov_base = u64_to_user_ptr(sel->addr);
                return 0;
        }

        /* buffer select only works with a single segment */
        if (req->rw.len != 1)
                return -EINVAL;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                return io_compat_import(req, iov, needs_lock);
#endif

        return __io_iov_buffer_select(req, iov, needs_lock);
}

/*
 * Build @iter for a read/write request, based on its opcode:
 *
 *  - fixed read/write: import the registered buffer, *iovec becomes NULL;
 *  - plain read/write: single range, optionally from a provided buffer,
 *    *iovec becomes NULL;
 *  - vectored with buffer select: one segment from a provided buffer,
 *    *iovec becomes NULL;
 *  - vectored otherwise: __import_iovec(), which manages *iovec itself.
 *
 * A non-zero buf_index without REQ_F_BUFFER_SELECT is rejected with -EINVAL
 * for all non-fixed opcodes. Returns a negative error on failure, otherwise
 * the result of the import helper that was used.
 */
static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
                           struct iov_iter *iter, bool needs_lock)
{
        void __user *buf = u64_to_user_ptr(req->rw.addr);
        size_t sqe_len = req->rw.len;
        const u8 opcode = req->opcode;
        const bool is_fixed = opcode == IORING_OP_READ_FIXED ||
                              opcode == IORING_OP_WRITE_FIXED;
        const bool is_single = opcode == IORING_OP_READ ||
                               opcode == IORING_OP_WRITE;
        ssize_t ret;

        if (is_fixed) {
                *iovec = NULL;
                return io_import_fixed(req, rw, iter);
        }

        /* buffer index only valid with fixed read/write, or buffer select  */
        if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
                return -EINVAL;

        if (is_single) {
                if (req->flags & REQ_F_BUFFER_SELECT) {
                        buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
                        if (IS_ERR(buf))
                                return PTR_ERR(buf);
                        /* length may have been clamped to the buffer */
                        req->rw.len = sqe_len;
                }

                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
                return ret;
        }

        if (!(req->flags & REQ_F_BUFFER_SELECT))
                return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec,
                                      iter, req->ctx->compat);

        ret = io_iov_buffer_select(req, *iovec, needs_lock);
        if (ret == 0)
                iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
        *iovec = NULL;
        return ret;
}

/*
 * Position pointer to hand to ->read()/->write(): NULL for FMODE_STREAM
 * files, the kiocb's own position otherwise.
 */
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
        if (kiocb->ki_filp->f_mode & FMODE_STREAM)
                return NULL;

        return &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * @rw selects the direction (READ or WRITE). Returns the number of bytes
 * transferred; a negative error is returned only if the very first call
 * fails (later errors end the loop and the bytes done so far are returned).
 * Returns -EOPNOTSUPP for IOCB_HIPRI and -EAGAIN for IOCB_NOWAIT kiocbs.
 */
static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
        struct kiocb *kiocb = &req->rw.kiocb;
        struct file *file = req->file;
        ssize_t ret = 0;
        loff_t *ppos;

        /*
         * Don't support polled IO through this interface, and we can't
         * support non-blocking either. For the latter, this just causes
         * the kiocb to be handled from an async context.
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
        if (kiocb->ki_flags & IOCB_NOWAIT)
                return -EAGAIN;

        ppos = io_kiocb_ppos(kiocb);

        while (iov_iter_count(iter)) {
                struct iovec iovec;
                ssize_t nr;

                /*
                 * Regular iterators yield the current segment; for a bvec
                 * iterator the user range is taken from req->rw.addr/len,
                 * which are advanced manually below.
                 */
                if (!iov_iter_is_bvec(iter)) {
                        iovec = iov_iter_iovec(iter);
                } else {
                        iovec.iov_base = u64_to_user_ptr(req->rw.addr);
                        iovec.iov_len = req->rw.len;
                }

                if (rw == READ) {
                        nr = file->f_op->read(file, iovec.iov_base,
                                              iovec.iov_len, ppos);
                } else {
                        nr = file->f_op->write(file, iovec.iov_base,
                                               iovec.iov_len, ppos);
                }

                if (nr < 0) {
                        /* only report the error if nothing was transferred */
                        if (!ret)
                                ret = nr;
                        break;
                }
                ret += nr;
                if (!iov_iter_is_bvec(iter)) {
                        iov_iter_advance(iter, nr);
                } else {
                        req->rw.addr += nr;
                        req->rw.len -= nr;
                        if (!req->rw.len)
                                break;
                }
                /* short transfer: stop instead of trying the next chunk */
                if (nr != iovec.iov_len)
                        break;
        }

        return ret;
}

/*
 * Copy the iterator state of a read/write into the request's async data so
 * it can be used after the caller's on-stack state is gone.
 *
 * @iovec:    heap-allocated iovec array, or NULL if @fast_iov was used;
 *            stored in rw->free_iovec
 * @fast_iov: the caller's inline iovec array
 * @iter:     iterator to copy
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
                          const struct iovec *fast_iov, struct iov_iter *iter)
{
        struct io_async_rw *rw = req->async_data;

        memcpy(&rw->iter, iter, sizeof(*iter));
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
        if (iov_iter_is_bvec(iter))
                return;
        if (!iovec) {
                unsigned iov_off = 0;

                /*
                 * The copied iterator still points into the caller's inline
                 * array: re-base it onto rw->fast_iov, keeping the same
                 * segment offset the source iterator had reached.
                 */
                rw->iter.iov = rw->fast_iov;
                if (iter->iov != fast_iov) {
                        iov_off = iter->iov - fast_iov;
                        rw->iter.iov += iov_off;
                }
                /* copy the remaining segments unless already in place */
                if (rw->fast_iov != fast_iov)
                        memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
                               sizeof(struct iovec) * iter->nr_segs);
        } else {
                /* heap iovec is now owned by the request and must be freed */
                req->flags |= REQ_F_NEED_CLEANUP;
        }
}

static inline int io_alloc_async_data(struct io_kiocb *req)
{
        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
        return req->async_data == NULL;
}

/*
 * Make sure the request has async read/write state holding a copy of @iter.
 *
 * Nothing is done unless @force is set or the opcode needs async setup, and
 * nothing is done if async data already exists. On allocation failure
 * @iovec is freed and -ENOMEM is returned; otherwise returns 0.
 */
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                             const struct iovec *fast_iov,
                             struct iov_iter *iter, bool force)
{
        struct io_async_rw *iorw;

        if (!force && !io_op_defs[req->opcode].needs_async_setup)
                return 0;

        /* already set up by an earlier attempt */
        if (req->async_data)
                return 0;

        if (io_alloc_async_data(req)) {
                kfree(iovec);
                return -ENOMEM;
        }

        io_req_map_rw(req, iovec, fast_iov, iter);

        /* we've copied and mapped the iter, ensure state is saved */
        iorw = req->async_data;
        iov_iter_save_state(&iorw->iter, &iorw->iter_state);
        return 0;
}

/*
 * Prepare already-allocated async data for a read/write: import the user
 * buffer(s) into the persistent iterator and snapshot its state.
 *
 * A heap-allocated iovec returned by the import is recorded for later
 * freeing and REQ_F_NEED_CLEANUP is set. Returns 0 or the negative error
 * from io_import_iovec().
 */
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
        struct io_async_rw *iorw = req->async_data;
        struct iovec *vec;
        int ret;

        iorw->free_iovec = NULL;
        iorw->bytes_done = 0;

        /* start with the inline array, the import may replace it */
        vec = iorw->fast_iov;
        ret = io_import_iovec(rw, req, &vec, &iorw->iter, false);
        if (unlikely(ret < 0))
                return ret;

        if (vec) {
                iorw->free_iovec = vec;
                req->flags |= REQ_F_NEED_CLEANUP;
        }

        iov_iter_save_state(&iorw->iter, &iorw->iter_state);
        return 0;
}

/*
 * Prep handler for reads: reject files not opened for reading with -EBADF,
 * then run the common read/write prep.
 */
static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(!(req->file->f_mode & FMODE_READ)))
                return -EBADF;
        return io_prep_rw(req, sqe, READ);
}

/*
 * This is our waitqueue callback handler, registered through lock_page_async()
 * when we initially tried to do the IO with the iocb armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 *
 * Returns 0 if the wakeup is not for our page, 1 once the retry was queued.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
                             int sync, void *arg)
{
        struct wait_page_queue *wpq =
                container_of(wait, struct wait_page_queue, wait);
        struct io_kiocb *req = wait->private;
        struct wait_page_key *key = arg;

        if (!wake_page_match(wpq, key))
                return 0;

        /* disarm before handing the request to task work */
        req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
        list_del_init(&wait->entry);
        io_req_task_queue(req);

        return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 *
 * On a true return the kiocb has IOCB_WAITQ set, IOCB_NOWAIT cleared and
 * ->ki_waitq pointing at the wait entry in the request's async data.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
        struct io_async_rw *async = req->async_data;
        struct wait_page_queue *wpq = &async->wpq;
        struct kiocb *kiocb = &req->rw.kiocb;

        /* never retry for NOWAIT, we just complete with -EAGAIN */
        if (req->flags & REQ_F_NOWAIT)
                return false;

        /* Only for buffered IO */
        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
                return false;

        /*
         * just use poll if we can, and don't attempt if the fs doesn't
         * support callback based unlocks
         */
        if (file_can_poll(req->file))
                return false;
        if (!(req->file->f_mode & FMODE_BUF_RASYNC))
                return false;

        /* arm the private wait entry */
        INIT_LIST_HEAD(&wpq->wait.entry);
        wpq->wait.flags = 0;
        wpq->wait.private = req;
        wpq->wait.func = io_async_buf_func;

        kiocb->ki_waitq = wpq;
        kiocb->ki_flags &= ~IOCB_NOWAIT;
        kiocb->ki_flags |= IOCB_WAITQ;
        return true;
}

static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
        if (req->file->f_op->read_iter)
                return call_read_iter(req->file, &req->rw.kiocb, iter);
        else if (req->file->f_op->read)
                return loop_rw_iter(READ, req, iter);
        else
                return -EINVAL;
}

static bool need_read_all(struct io_kiocb *req)
{
        return req->flags & REQ_F_ISREG ||
                S_ISBLK(file_inode(req->file)->i_mode);
}

/*
 * Issue handler for read requests.
 *
 * Imports the user buffer (or reuses the iterator saved in async data from
 * an earlier attempt), issues the read and then either completes the request
 * through kiocb_done(), leaves it queued (-EIOCBQUEUED), or sets up async
 * state so the read can be retried.
 *
 * Returns 0 when the request was completed or is in flight, -EAGAIN when it
 * has to be retried from another context, or a negative error from the
 * import, rw_verify_area() or the async setup.
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
        loff_t *ppos;

        if (rw) {
                iter = &rw->iter;
                state = &rw->iter_state;
                /*
                 * We come here from an earlier attempt, restore our state to
                 * match in case it doesn't. It's cheap enough that we don't
                 * need to make this conditional.
                 */
                iov_iter_restore(iter, state);
                iovec = NULL;
        } else {
                ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
                state = &__state;
                iov_iter_save_state(iter, state);
        }
        /* remember how many bytes we are asking for */
        req->result = iov_iter_count(iter);

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_nowait(req, READ)) {
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
                return ret ?: -EAGAIN;
        }

        ppos = io_kiocb_update_pos(req);

        ret = rw_verify_area(READ, req->file, ppos, req->result);
        if (unlikely(ret)) {
                kfree(iovec);
                return ret;
        }

        ret = io_iter_do_read(req, iter);

        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto done;
                /* no retry on NONBLOCK nor RWF_NOWAIT */
                if (req->flags & REQ_F_NOWAIT)
                        goto done;
                /* nothing consumed; fall through to the retry setup */
                ret = 0;
        } else if (ret == -EIOCBQUEUED) {
                goto out_free;
        } else if (ret <= 0 || ret == req->result || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
        }

        /*
         * Don't depend on the iter state matching what was consumed, or being
         * untouched in case of error. Restore it and we'll advance it
         * manually if we need to.
         */
        iov_iter_restore(iter, state);

        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
        if (ret2)
                return ret2;

        /* iovec was either freed or handed to io_setup_async_rw() */
        iovec = NULL;
        rw = req->async_data;
        /*
         * Now use our persistent iterator and state, if we aren't already.
         * We've restored and mapped the iter to match.
         */
        if (iter != &rw->iter) {
                iter = &rw->iter;
                state = &rw->iter_state;
        }

        do {
                /*
                 * We end up here because of a partial read, either from
                 * above or inside this loop. Advance the iter by the bytes
                 * that were consumed.
                 */
                iov_iter_advance(iter, ret);
                if (!iov_iter_count(iter))
                        break;
                rw->bytes_done += ret;
                iov_iter_save_state(iter, state);

                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
                        kiocb->ki_flags &= ~IOCB_WAITQ;
                        return -EAGAIN;
                }

                req->result = iov_iter_count(iter);
                /*
                 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
                 * we get -EIOCBQUEUED, then we'll get a notification when the
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
                ret = io_iter_do_read(req, iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
                kiocb->ki_flags &= ~IOCB_WAITQ;
                iov_iter_restore(iter, state);
        } while (ret > 0);
done:
        kiocb_done(kiocb, ret, issue_flags);
out_free:
        /* it's faster to check here than delegate to kfree */
        if (iovec)
                kfree(iovec);
        return 0;
}

/*
 * Prep handler for writes: reject files not opened for writing with -EBADF,
 * then run the common read/write prep.
 */
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
        return io_prep_rw(req, sqe, WRITE);
}

static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
{
        struct inode *inode;
        bool ret;

        if (!(req->flags & REQ_F_ISREG))
                return true;
        if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
                kiocb_start_write(kiocb);
                return true;
        }

        inode = file_inode(kiocb->ki_filp);
        ret = sb_start_write_trylock(inode->i_sb);
        if (ret)
                __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
        return ret;
}

/*
 * Issue handler for write requests.
 *
 * If the request already carries async data, the iterator saved there is
 * restored and reused; otherwise the iovec is imported from the request
 * and the iterator state is snapshotted so it can be rewound on retry.
 *
 * Returns:
 *   - a negative error from io_import_iovec() or rw_verify_area();
 *   - -EAGAIN once the request state was stashed via io_setup_async_rw()
 *     and the write has to be retried, or that helper's error if the
 *     stash failed;
 *   - otherwise the value left in @ret after the request was handed to
 *     kiocb_done() with the write result (zero on this path, since
 *     rw_verify_area() succeeded).
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
        loff_t *ppos;

        if (rw) {
                /* retry: reuse the saved iterator; no local iovec to free */
                iter = &rw->iter;
                state = &rw->iter_state;
                iov_iter_restore(iter, state);
                iovec = NULL;
        } else {
                ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
                state = &__state;
                iov_iter_save_state(iter, state);
        }
        /* req->result records the number of bytes this write is expected to cover */
        req->result = iov_iter_count(iter);

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_nowait(req, WRITE))
                goto copy_iov;

        /* file path doesn't support NOWAIT for non-direct_IO */
        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
            (req->flags & REQ_F_ISREG))
                goto copy_iov;

        ppos = io_kiocb_update_pos(req);

        ret = rw_verify_area(WRITE, req->file, ppos, req->result);
        if (unlikely(ret))
                goto out_free;

        /* could not start the write without blocking: punt to async */
        if (unlikely(!io_kiocb_start_write(req, kiocb)))
                goto copy_iov;
        /* IOCB_WRITE also marks that the write was started, see copy_iov */
        kiocb->ki_flags |= IOCB_WRITE;

        if (req->file->f_op->write_iter)
                ret2 = call_write_iter(req->file, kiocb, iter);
        else if (req->file->f_op->write)
                ret2 = loop_rw_iter(WRITE, req, iter);
        else
                ret2 = -EINVAL;

        /* a reissue request overrides whatever the write returned */
        if (req->flags & REQ_F_REISSUE) {
                req->flags &= ~REQ_F_REISSUE;
                ret2 = -EAGAIN;
        }

        /*
         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
         * retry them without IOCB_NOWAIT.
         */
        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
                ret2 = -EAGAIN;
        /* no retry on NONBLOCK nor RWF_NOWAIT */
        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
                        goto copy_iov;
done:
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
copy_iov:
                /* rewind the iterator so the retry starts from the saved state */
                iov_iter_restore(iter, state);
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                if (!ret) {
                        /* only end the write if it was actually started above */
                        if (kiocb->ki_flags & IOCB_WRITE)
                                io_req_end_write(req);
                        return -EAGAIN;
                }
                return ret;
        }
out_free:
        /* it's reportedly faster than delegating the null check to kfree() */
        if (iovec)
                kfree(iovec);
        return ret;
}

static int io_renameat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        struct io_rename *ren = &req->rename;
        const char __user *oldf, *newf;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        ren->old_dfd = READ_ONCE(sqe->fd);
        oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
        newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        ren->new_dfd = READ_ONCE(sqe->len);
        ren->flags = READ_ONCE(sqe->rename_flags);

        ren->oldpath = getname(oldf);
        if (IS_ERR(ren->oldpath))
                return PTR_ERR(ren->oldpath);

        ren->newpath = getname(newf);
        if (IS_ERR(ren->newpath)) {
                putname(ren->oldpath);
                return PTR_ERR(ren->newpath);
        }

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Issue handler for renameat requests. Never runs in a nonblocking issue
 * (returns -EAGAIN so it is retried from a blocking context). Otherwise
 * performs the rename, clears REQ_F_NEED_CLEANUP and completes the
 * request with the rename result.
 */
static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_rename *rn = &req->rename;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = do_renameat2(rn->old_dfd, rn->oldpath, rn->new_dfd,
                           rn->newpath, rn->flags);
        req->flags &= ~REQ_F_NEED_CLEANUP;

        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prep handler for unlinkat requests.
 *
 * Rejects IOPOLL rings and non-zero ioprio/off/len/buf_index/splice_fd_in
 * with -EINVAL, fixed files with -EBADF, and any flag other than
 * AT_REMOVEDIR with -EINVAL. On success the path name has been copied
 * from userspace and REQ_F_NEED_CLEANUP is set.
 */
static int io_unlinkat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        struct io_unlink *ul = &req->unlink;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
            sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        ul->dfd = READ_ONCE(sqe->fd);
        ul->flags = READ_ONCE(sqe->unlink_flags);
        if (ul->flags & ~AT_REMOVEDIR)
                return -EINVAL;

        ul->filename = getname(u64_to_user_ptr(READ_ONCE(sqe->addr)));
        if (IS_ERR(ul->filename))
                return PTR_ERR(ul->filename);

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Issue handler for unlinkat requests. Returns -EAGAIN for nonblocking
 * issue. Otherwise removes a directory when AT_REMOVEDIR is set, or
 * unlinks the name when it is not, then clears REQ_F_NEED_CLEANUP and
 * completes the request with the result.
 */
static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_unlink *ul = &req->unlink;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        if (!(ul->flags & AT_REMOVEDIR))
                res = do_unlinkat(ul->dfd, ul->filename);
        else
                res = do_rmdir(ul->dfd, ul->filename);

        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prep handler for shutdown requests. Without CONFIG_NET this always
 * returns -EOPNOTSUPP. Otherwise IOPOLL rings and non-zero
 * ioprio/off/addr/rw_flags/buf_index/splice_fd_in are rejected with
 * -EINVAL, and the "how" argument is taken from sqe->len.
 */
static int io_shutdown_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
                     sqe->buf_index || sqe->splice_fd_in))
                return -EINVAL;

        req->shutdown.how = READ_ONCE(sqe->len);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Issue handler for shutdown requests. Without CONFIG_NET this returns
 * -EOPNOTSUPP. Nonblocking issue returns -EAGAIN. If the request's file
 * is not a socket, the error reported by sock_from_file() is returned
 * directly; otherwise the request is completed with the shutdown result.
 */
static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_NET)
        struct socket *sk;
        int err;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        sk = sock_from_file(req->file, &err);
        if (unlikely(!sk))
                return err;

        err = __sys_shutdown_sock(sk, req->shutdown.how);
        if (err < 0)
                req_set_fail(req);
        io_req_complete(req, err);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Common prep for splice and tee: rejects IOPOLL rings, stores the length
 * and flags from the SQE, rejects flags outside
 * SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL with -EINVAL, and records the
 * input descriptor.
 */
static int __io_splice_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        const unsigned int allowed = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
        struct io_splice *splice = &req->splice;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        splice->len = READ_ONCE(sqe->len);
        splice->flags = READ_ONCE(sqe->splice_flags);
        if (unlikely(splice->flags & ~allowed))
                return -EINVAL;

        splice->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
        return 0;
}

/*
 * Prep handler for tee requests: neither an input nor an output offset
 * may be supplied (-EINVAL); everything else is the common splice prep.
 */
static int io_tee_prep(struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
{
        if (READ_ONCE(sqe->splice_off_in))
                return -EINVAL;
        if (READ_ONCE(sqe->off))
                return -EINVAL;

        return __io_splice_prep(req, sqe);
}

/*
 * Issue handler for tee requests. Returns -EAGAIN for nonblocking issue.
 * Looks up the input file (completing with -EBADF if that fails), runs
 * do_tee() when the length is non-zero, and drops the input file again
 * unless it is a fixed file. The request is marked failed whenever the
 * result differs from the requested length.
 */
static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_splice *splice = &req->splice;
        struct file *dst = splice->file_out;
        unsigned int tee_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
        struct file *src;
        long res = 0;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        src = io_file_get(req->ctx, req, splice->splice_fd_in,
                          (splice->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
        if (src) {
                if (splice->len)
                        res = do_tee(src, dst, splice->len, tee_flags);

                if (!(splice->flags & SPLICE_F_FD_IN_FIXED))
                        io_put_file(src);
        } else {
                res = -EBADF;
        }

        if (res != splice->len)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prep handler for splice requests: record the input and output offsets
 * from the SQE, then run the common splice prep.
 */
static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        req->splice.off_in = READ_ONCE(sqe->splice_off_in);
        req->splice.off_out = READ_ONCE(sqe->off);

        return __io_splice_prep(req, sqe);
}

/*
 * Issue handler for splice requests. Returns -EAGAIN for nonblocking
 * issue. Looks up the input file (completing with -EBADF if that fails).
 * An offset of -1 on either side is passed to do_splice() as a NULL
 * offset pointer. do_splice() only runs for a non-zero length; the input
 * file is dropped afterwards unless it is a fixed file. The request is
 * marked failed whenever the result differs from the requested length.
 */
static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_splice *splice = &req->splice;
        struct file *dst = splice->file_out;
        unsigned int splice_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
        loff_t *in_pos = NULL, *out_pos = NULL;
        struct file *src;
        long res = 0;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        src = io_file_get(req->ctx, req, splice->splice_fd_in,
                          (splice->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
        if (src) {
                if (splice->off_in != -1)
                        in_pos = &splice->off_in;
                if (splice->off_out != -1)
                        out_pos = &splice->off_out;

                if (splice->len)
                        res = do_splice(src, in_pos, dst, out_pos,
                                        splice->len, splice_flags);

                if (!(splice->flags & SPLICE_F_FD_IN_FIXED))
                        io_put_file(src);
        } else {
                res = -EBADF;
        }

        if (res != splice->len)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * IORING_OP_NOP handler: performs no work beyond completing the request
 * with a zero result. Rejected with -EINVAL on IOPOLL rings.
 */
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        __io_req_complete(req, issue_flags, 0, 0);
        return 0;
}

/*
 * Prep handler for fsync requests. IOPOLL rings and non-zero
 * addr/ioprio/buf_index/splice_fd_in yield -EINVAL, as does any fsync
 * flag other than IORING_FSYNC_DATASYNC. On success the flags, offset
 * and length are stored in the request.
 */
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
                     sqe->splice_fd_in))
                return -EINVAL;

        req->sync.flags = READ_ONCE(sqe->fsync_flags);
        if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Issue handler for fsync requests. Needs a blocking context, so a
 * nonblocking issue returns -EAGAIN. The synced range starts at the
 * stored offset; its end is offset + length, or LLONG_MAX when that sum
 * is not positive.
 */
static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
        loff_t range_end = req->sync.off + req->sync.len;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        if (range_end <= 0)
                range_end = LLONG_MAX;

        res = vfs_fsync_range(req->file, req->sync.off, range_end,
                              req->sync.flags & IORING_FSYNC_DATASYNC);
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prep handler for fallocate requests. Non-zero
 * ioprio/buf_index/rw_flags/splice_fd_in and IOPOLL rings yield -EINVAL.
 * The offset comes from sqe->off, the length from sqe->addr and the mode
 * from sqe->len.
 */
static int io_fallocate_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
{
        if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
            sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->addr);
        req->sync.mode = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Issue handler for fallocate requests. Needs a blocking context, so a
 * nonblocking issue returns -EAGAIN. On success a modify notification
 * is raised for the file; on failure the request is marked failed.
 */
static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
                            req->sync.len);
        if (res >= 0)
                fsnotify_modify(req->file);
        else
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

/*
 * Common prep for openat and openat2. The caller must already have
 * filled in req->open.how.
 *
 * Rejects IOPOLL rings and non-zero ioprio/buf_index (-EINVAL) and fixed
 * files (-EBADF). Copies the path name from userspace; if that fails,
 * req->open.filename is left NULL and the error is returned. Once the
 * name is held, REQ_F_NEED_CLEANUP is set, so the later -EINVAL for
 * combining a file slot with O_CLOEXEC is returned with cleanup pending.
 */
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct filename *name;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            unlikely(sqe->ioprio || sqe->buf_index))
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        /* open.how should be already initialised */
        if (!(req->open.how.flags & O_PATH) && force_o_largefile())
                req->open.how.flags |= O_LARGEFILE;

        req->open.dfd = READ_ONCE(sqe->fd);
        name = getname(u64_to_user_ptr(READ_ONCE(sqe->addr)));
        if (IS_ERR(name)) {
                req->open.filename = NULL;
                return PTR_ERR(name);
        }
        req->open.filename = name;
        req->flags |= REQ_F_NEED_CLEANUP;

        req->open.file_slot = READ_ONCE(sqe->file_index);
        if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
                return -EINVAL;

        req->open.nofile = rlimit(RLIMIT_NOFILE);
        return 0;
}

/*
 * Prep handler for openat requests: build the open_how from the mode
 * (sqe->len) and the open flags, then run the common openat prep.
 */
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        u64 open_mode = READ_ONCE(sqe->len);
        u64 open_flags = READ_ONCE(sqe->open_flags);

        req->open.how = build_open_how(open_flags, open_mode);

        return __io_openat_prep(req, sqe);
}

static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct open_how __user *how;
        size_t len;
        int ret;

        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
        if (len < OPEN_HOW_SIZE_VER0)
                return -EINVAL;

        ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
                                        len);
        if (ret)
                return ret;

        return __io_openat_prep(req, sqe);
}

/*
 * Issue handler shared by openat and openat2.
 *
 * Builds the open flags from req->open.how, optionally reserves a normal
 * file descriptor (when no fixed file slot was requested), opens the
 * file and then either installs it at the reserved descriptor or into
 * the fixed file table.
 *
 * Returns -EAGAIN without completing the request when a nonblocking
 * issue must be retried; on those paths the filename is kept and
 * REQ_F_NEED_CLEANUP stays set. Every other path goes through "err",
 * which releases the filename, clears REQ_F_NEED_CLEANUP, completes the
 * request with @ret and returns 0.
 */
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
        struct open_flags op;
        struct file *file;
        bool resolve_nonblock, nonblock_set;
        /* a non-zero file_slot selects installation into the fixed file table */
        bool fixed = !!req->open.file_slot;
        int ret;

        ret = build_open_flags(&req->open.how, &op);
        if (ret)
                goto err;
        /* remember what the application asked for before we adjust the flags */
        nonblock_set = op.open_flag & O_NONBLOCK;
        resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
        if (issue_flags & IO_URING_F_NONBLOCK) {
                /*
                 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
                 * it'll always -EAGAIN. Note that we test for __O_TMPFILE
                 * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
                 * we need to force async for.
                 */
                if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                op.lookup_flags |= LOOKUP_CACHED;
                op.open_flag |= O_NONBLOCK;
        }

        if (!fixed) {
                /* on success ret holds the reserved descriptor number */
                ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
                if (ret < 0)
                        goto err;
        }

        file = do_filp_open(req->open.dfd, req->open.filename, &op);
        if (IS_ERR(file)) {
                /*
                 * We could hang on to this 'fd' on retrying, but seems like
                 * marginal gain for something that is now known to be a slower
                 * path. So just put it, and we'll get a new one when we retry.
                 */
                if (!fixed)
                        put_unused_fd(ret);

                ret = PTR_ERR(file);
                /* only retry if RESOLVE_CACHED wasn't already set by application */
                if (ret == -EAGAIN &&
                    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
                        return -EAGAIN;
                goto err;
        }

        /* drop the O_NONBLOCK we added ourselves for the nonblocking attempt */
        if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
                file->f_flags &= ~O_NONBLOCK;
        fsnotify_open(file);

        if (!fixed)
                fd_install(ret, file);
        else
                /* file_slot is 1-based; the fixed table index is 0-based */
                ret = io_install_fixed_file(req, file, issue_flags,
                                            req->open.file_slot - 1);
err:
        putname(req->open.filename);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Issue handler for openat requests: identical to openat2 at issue time,
 * so it simply forwards to io_openat2() and returns its result.
 */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
        return io_openat2(req, issue_flags);
}

/*
 * Prep handler for buffer removal. Non-zero
 * ioprio/rw_flags/addr/len/off/splice_fd_in yield -EINVAL. The number of
 * buffers to remove comes from sqe->fd and must be in 1..USHRT_MAX,
 * otherwise -EINVAL. The request's pbuf state is zeroed before the
 * count and the buffer group id are stored.
 */
static int io_remove_buffers_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *pb = &req->pbuf;
        u64 count;

        if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        count = READ_ONCE(sqe->fd);
        if (count == 0 || count > USHRT_MAX)
                return -EINVAL;

        memset(pb, 0, sizeof(*pb));
        pb->nbufs = count;
        pb->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

/*
 * Free up to @nbufs buffers from the group headed by @buf.
 *
 * @buf is both a buffer and the head of the group's list. Entries linked
 * behind it are freed first; if the list runs empty before @nbufs
 * entries were freed, @buf itself is freed too and the group is erased
 * from ctx->io_buffers under @bgid.
 *
 * Returns the number of buffers freed (0 when @nbufs is 0).
 */
static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
                               int bgid, unsigned nbufs)
{
        unsigned removed = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        /* the head kbuf is the list itself */
        for (;;) {
                struct io_buffer *victim;

                if (list_empty(&buf->list))
                        break;

                victim = list_first_entry(&buf->list, struct io_buffer, list);
                list_del(&victim->list);
                kfree(victim);

                removed++;
                if (removed == nbufs)
                        return removed;
                cond_resched();
        }

        /* list drained: the head goes as well, and with it the group */
        removed++;
        kfree(buf);
        xa_erase(&ctx->io_buffers, bgid);

        return removed;
}

/*
 * Issue handler for buffer removal. Under the ring submit lock, looks up
 * the buffer group and removes the requested number of buffers from it.
 * The request completes with the number removed, or -ENOENT when the
 * group does not exist.
 */
static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *pb = &req->pbuf;
        struct io_ring_ctx *ctx = req->ctx;
        bool nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct io_buffer *group;
        int res;

        io_ring_submit_lock(ctx, !nonblock);

        lockdep_assert_held(&ctx->uring_lock);

        group = xa_load(&ctx->io_buffers, pb->bgid);
        if (group)
                res = __io_remove_buffers(ctx, group, pb->bgid, pb->nbufs);
        else
                res = -ENOENT;

        if (res < 0)
                req_set_fail(req);

        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, res, 0);
        io_ring_submit_unlock(ctx, !nonblock);
        return 0;
}

/*
 * Prep handler for providing buffers.
 *
 * SQE layout: fd = number of buffers (1..USHRT_MAX), addr = start of the
 * user memory, len = size of each buffer, buf_group = group id,
 * off = first buffer id (at most USHRT_MAX).
 *
 * Returns 0 on success, -EINVAL for non-zero ioprio/rw_flags/
 * splice_fd_in, -E2BIG for an out-of-range count or buffer id,
 * -EOVERFLOW when len * nbufs or addr + len * nbufs overflows, and
 * -EFAULT when the resulting user range fails access_ok().
 */
static int io_provide_buffers_prep(struct io_kiocb *req,
                                   const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = &req->pbuf;
        u64 tmp;

        if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        /*
         * On success check_mul_overflow() leaves len * nbufs in 'size', so
         * the total does not need to be computed a second time below.
         */
        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                                &size))
                return -EOVERFLOW;
        /* tmp_check only exists to receive the sum; its value is unused */
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        p->bid = tmp;
        return 0;
}

/*
 * Allocate pbuf->nbufs buffer descriptors and link them into the group
 * at *@head. Addresses start at pbuf->addr and advance by pbuf->len per
 * buffer; ids start at pbuf->bid and increase by one. Each descriptor's
 * length is capped at MAX_RW_COUNT. If *@head is NULL, the first
 * descriptor becomes the list head.
 *
 * Allocation failure stops the loop early. Returns the number of
 * descriptors added, or -ENOMEM if none could be added.
 */
static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
{
        u64 next_addr = pbuf->addr;
        int next_bid = pbuf->bid;
        int added = 0;

        while (added < pbuf->nbufs) {
                struct io_buffer *kbuf;

                kbuf = kmalloc(sizeof(*kbuf), GFP_KERNEL_ACCOUNT);
                if (!kbuf)
                        break;

                kbuf->addr = next_addr;
                kbuf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                kbuf->bid = next_bid;
                next_addr += pbuf->len;
                next_bid++;

                if (*head) {
                        list_add_tail(&kbuf->list, &(*head)->list);
                } else {
                        INIT_LIST_HEAD(&kbuf->list);
                        *head = kbuf;
                }
                cond_resched();
                added++;
        }

        if (!added)
                return -ENOMEM;
        return added;
}

/*
 * Issue handler for providing buffers. Under the ring submit lock, adds
 * the buffers to the group named by the request. If the group did not
 * exist yet, the new list head is inserted into ctx->io_buffers, and if
 * that insertion fails every buffer just added is removed again. The
 * request completes with the count added or a negative error.
 */
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *pb = &req->pbuf;
        struct io_ring_ctx *ctx = req->ctx;
        bool nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct io_buffer *existing, *head;
        int res;

        io_ring_submit_lock(ctx, !nonblock);

        lockdep_assert_held(&ctx->uring_lock);

        existing = xa_load(&ctx->io_buffers, pb->bgid);
        head = existing;

        res = io_add_buffers(pb, &head);
        if (!existing && res >= 0) {
                /* brand new group: publish its head */
                res = xa_insert(&ctx->io_buffers, pb->bgid, head,
                                GFP_KERNEL_ACCOUNT);
                if (res < 0)
                        __io_remove_buffers(ctx, head, pb->bgid, -1U);
        }

        if (res < 0)
                req_set_fail(req);
        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, res, 0);
        io_ring_submit_unlock(ctx, !nonblock);
        return 0;
}

/*
 * Prep handler for epoll_ctl requests. Without CONFIG_EPOLL this returns
 * -EOPNOTSUPP. Non-zero ioprio/buf_index/splice_fd_in and IOPOLL rings
 * yield -EINVAL. The epoll fd, operation and target fd come from
 * sqe->fd, sqe->len and sqe->off. For operations that carry an event,
 * it is copied from the user pointer in sqe->addr (-EFAULT on failure).
 */
static int io_epoll_ctl_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
        struct io_epoll *ep = &req->epoll;

        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        ep->epfd = READ_ONCE(sqe->fd);
        ep->op = READ_ONCE(sqe->len);
        ep->fd = READ_ONCE(sqe->off);

        if (ep_op_has_event(ep->op)) {
                struct epoll_event __user *uev;

                uev = u64_to_user_ptr(READ_ONCE(sqe->addr));
                if (copy_from_user(&ep->event, uev, sizeof(*uev)))
                        return -EFAULT;
        }

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Issue handler for epoll_ctl requests. Without CONFIG_EPOLL this
 * returns -EOPNOTSUPP. The operation is attempted with the nonblocking
 * hint taken from the issue flags; a nonblocking attempt that reports
 * -EAGAIN is returned as -EAGAIN for retry. Any other result completes
 * the request.
 */
static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_EPOLL)
        struct io_epoll *ep = &req->epoll;
        bool nonblock = issue_flags & IO_URING_F_NONBLOCK;
        int res;

        res = do_epoll_ctl(ep->epfd, ep->op, ep->fd, &ep->event, nonblock);
        if (res == -EAGAIN && nonblock)
                return -EAGAIN;

        if (res < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, res, 0);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Prep handler for madvise requests. Returns -EOPNOTSUPP unless both
 * CONFIG_ADVISE_SYSCALLS and CONFIG_MMU are enabled. Non-zero
 * ioprio/buf_index/off/splice_fd_in and IOPOLL rings yield -EINVAL.
 * Address, length and advice are stored from the SQE.
 */
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        struct io_madvise *ma = &req->madvise;

        if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        ma->addr = READ_ONCE(sqe->addr);
        ma->len = READ_ONCE(sqe->len);
        ma->advice = READ_ONCE(sqe->fadvise_advice);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Issue handler for madvise requests. Returns -EOPNOTSUPP unless both
 * CONFIG_ADVISE_SYSCALLS and CONFIG_MMU are enabled, and -EAGAIN for
 * nonblocking issue. Otherwise applies the advice to the current task's
 * mm and completes the request with the result.
 */
static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        struct io_madvise *adv = &req->madvise;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = do_madvise(current->mm, adv->addr, adv->len, adv->advice);
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        req->fadvise.offset = READ_ONCE(sqe->off);
        req->fadvise.len = READ_ONCE(sqe->len);
        req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
        return 0;
}

/*
 * Issue handler for fadvise requests. In a nonblocking issue only
 * POSIX_FADV_NORMAL, POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL are
 * handled inline; any other advice returns -EAGAIN for retry. The
 * request completes with the vfs_fadvise() result.
 */
static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_fadvise *adv = &req->fadvise;
        int res;

        if ((issue_flags & IO_URING_F_NONBLOCK) &&
            adv->advice != POSIX_FADV_NORMAL &&
            adv->advice != POSIX_FADV_RANDOM &&
            adv->advice != POSIX_FADV_SEQUENTIAL)
                return -EAGAIN;

        res = vfs_fadvise(req->file, adv->offset, adv->len, adv->advice);
        if (res < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, res, 0);
        return 0;
}

static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;
        if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;

        req->statx.dfd = READ_ONCE(sqe->fd);
        req->statx.mask = READ_ONCE(sqe->len);
        req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        req->statx.flags = READ_ONCE(sqe->statx_flags);

        return 0;
}

/*
 * Issue handler for statx requests. Returns -EAGAIN for nonblocking
 * issue; otherwise runs do_statx() with the stored arguments and
 * completes the request with its result.
 */
static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_statx *sx = &req->statx;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask,
                       sx->buffer);
        if (res < 0)
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
                return -EINVAL;
        if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;

        req->close.fd = READ_ONCE(sqe->fd);
        req->close.file_slot = READ_ONCE(sqe->file_index);
        if (req->close.file_slot && req->close.fd)
                return -EINVAL;

        return 0;
}

/*
 * Issue handler for close requests.
 *
 * A non-zero file_slot closes a fixed file via io_close_fixed().
 * Otherwise the descriptor is looked up in the current task's fd table
 * under files->file_lock. Out-of-range descriptors, empty slots and
 * io_uring files complete the request with -EBADF.
 *
 * Returns -EAGAIN (without completing) when issued nonblocking and the
 * file has a ->flush method; returns 0 after completing the request in
 * every other case.
 */
static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
        struct files_struct *files = current->files;
        struct io_close *close = &req->close;
        struct fdtable *fdt;
        struct file *file = NULL;
        /* default result for the lookup failures below */
        int ret = -EBADF;

        if (req->close.file_slot) {
                ret = io_close_fixed(req, issue_flags);
                goto err;
        }

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (close->fd >= fdt->max_fds) {
                spin_unlock(&files->file_lock);
                goto err;
        }
        file = fdt->fd[close->fd];
        /* empty slot, or an io_uring file: refuse with -EBADF */
        if (!file || file->f_op == &io_uring_fops) {
                spin_unlock(&files->file_lock);
                /* we only peeked at the table; make sure err: does not fput() */
                file = NULL;
                goto err;
        }

        /* if the file has a flush method, be safe and punt to async */
        if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
                spin_unlock(&files->file_lock);
                return -EAGAIN;
        }

        /*
         * NOTE(review): presumably this detaches the descriptor and hands
         * the file reference over in 'file' -- confirm against
         * __close_fd_get_file().
         */
        ret = __close_fd_get_file(close->fd, &file);
        spin_unlock(&files->file_lock);
        if (ret < 0) {
                /* report a vanished descriptor as -EBADF */
                if (ret == -ENOENT)
                        ret = -EBADF;
                goto err;
        }

        /* No ->flush() or already async, safely close from here */
        ret = filp_close(file, current->files);
err:
        if (ret < 0)
                req_set_fail(req);
        if (file)
                fput(file);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Prep handler for sync_file_range requests. IOPOLL rings and non-zero
 * addr/ioprio/buf_index/splice_fd_in yield -EINVAL. Offset, length and
 * flags are stored from the SQE.
 */
static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
                     sqe->splice_fd_in))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->len);
        req->sync.flags = READ_ONCE(sqe->sync_range_flags);
        return 0;
}

/*
 * Issue handler for sync_file_range requests. Needs a blocking context,
 * so a nonblocking issue returns -EAGAIN. Otherwise the request
 * completes with the sync_file_range() result.
 */
static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = sync_file_range(req->file, req->sync.off, req->sync.len,
                              req->sync.flags);
        if (res < 0)
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

#if defined(CONFIG_NET)
static bool io_net_retry(struct socket *sock, int flags)
{
        if (!(flags & MSG_WAITALL))
                return false;
        return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

/*
 * Stash the message header state in @kmsg into the request's async data
 * so the operation can be retried later.
 *
 * If the request already has async data nothing is copied. Otherwise
 * async data is allocated and @kmsg is copied into it byte for byte,
 * after which the pointers that referred into @kmsg are redirected to
 * the copy.
 *
 * Returns -EAGAIN on success (the caller is expected to retry), or
 * -ENOMEM if the allocation failed, in which case kmsg->free_iov has
 * been freed.
 */
static int io_setup_async_msg(struct io_kiocb *req,
                              struct io_async_msghdr *kmsg)
{
        struct io_async_msghdr *async_msg = req->async_data;

        /* state was stashed on an earlier attempt already */
        if (async_msg)
                return -EAGAIN;
        if (io_alloc_async_data(req)) {
                kfree(kmsg->free_iov);
                return -ENOMEM;
        }
        async_msg = req->async_data;
        req->flags |= REQ_F_NEED_CLEANUP;
        memcpy(async_msg, kmsg, sizeof(*kmsg));
        /* a non-NULL msg_name must point at the copy's own address storage */
        if (async_msg->msg.msg_name)
                async_msg->msg.msg_name = &async_msg->addr;
        /* if were using fast_iov, set it to the new one */
        if (!kmsg->free_iov) {
                /* keep the iterator at the same index within the copied array */
                size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
                async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
        }

        return -EAGAIN;
}

/*
 * Copy the user's msghdr for a sendmsg request into @iomsg. The name
 * buffer and the inline iovec array inside @iomsg are offered as
 * storage. The resulting msg_control pointer is remembered in the
 * request as well, whatever the copy returned.
 *
 * Returns the result of sendmsg_copy_msghdr().
 */
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
                               struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        int err;

        iomsg->msg.msg_name = &iomsg->addr;
        iomsg->free_iov = iomsg->fast_iov;

        err = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
                                  &iomsg->free_iov);

        /* save msg_control as sys_sendmsg() overwrites it */
        sr->msg_control = iomsg->msg.msg_control;
        return err;
}

/*
 * Copy the sendmsg header into the request's async data. On success the
 * request is flagged REQ_F_NEED_CLEANUP.
 *
 * Returns 0 on success or the error from io_sendmsg_copy_hdr().
 */
static int io_sendmsg_prep_async(struct io_kiocb *req)
{
        int err = io_sendmsg_copy_hdr(req, req->async_data);

        if (err)
                return err;

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Prepare a sendmsg request from its SQE.
 *
 * Rejects rings created with IORING_SETUP_IOPOLL and SQEs that set addr2,
 * file_index or ioprio. The user message pointer, length and message flags
 * are cached in req->sr_msg; MSG_NOSIGNAL is always added to the flags and
 * MSG_DONTWAIT marks the request REQ_F_NOWAIT. On compat rings
 * MSG_CMSG_COMPAT is added as well.
 *
 * Returns 0 on success, -EINVAL for an unsupported SQE or ring setup.
 */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_sr_msg *sr = &req->sr_msg;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        /* single check covers every SQE field this opcode does not use */
        if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
                return -EINVAL;

        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
        sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
        if (sr->msg_flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
        /* no bytes transferred yet */
        sr->done_io = 0;
        return 0;
}

/*
 * Issue handler for sendmsg requests.
 *
 * Uses the message header stored in req->async_data if there is one,
 * otherwise copies it from userspace into an on-stack io_async_msghdr. If a
 * nonblocking attempt returns -EAGAIN, or a MSG_WAITALL send on a socket
 * accepted by io_net_retry() was only partially done, the header is handed
 * to io_setup_async_msg() and that function's return value is passed back.
 *
 * Returns 0 once the request has been completed, otherwise a negative error
 * (including whatever io_setup_async_msg() returned).
 */
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_msghdr iomsg, *kmsg;
        struct io_sr_msg *sr = &req->sr_msg;
        struct socket *sock;
        unsigned flags;
        int min_ret = 0;
        int ret;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        kmsg = req->async_data;
        if (!kmsg) {
                ret = io_sendmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
                kmsg = &iomsg;
        } else {
                /* restore the value saved by io_sendmsg_copy_hdr() */
                kmsg->msg.msg_control = sr->msg_control;
        }

        flags = req->sr_msg.msg_flags;
        if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
        /* with MSG_WAITALL anything short of the full iterator is "short" */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&kmsg->msg.msg_iter);

        ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

        if (ret < min_ret) {
                if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
                        return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* remember the partial progress and retry the rest */
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return io_setup_async_msg(req, kmsg);
                }
                req_set_fail(req);
        }
        /* fast path, check for non-NULL to avoid function call */
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        /*
         * Include bytes sent by earlier partial attempts; if this attempt
         * failed after earlier progress, report the progress instead.
         */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Issue handler for send requests.
 *
 * Builds an on-stack msghdr over the single user buffer described by
 * req->sr_msg (buf/len) and passes it to sock_sendmsg(). A partial
 * MSG_WAITALL send on a socket accepted by io_net_retry() advances buf/len,
 * accumulates the byte count in done_io and returns -EAGAIN.
 *
 * Returns 0 once the request has been completed, otherwise a negative error
 * (-EAGAIN when the request should be retried).
 */
static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct msghdr msg;
        struct iovec iov;
        struct socket *sock;
        unsigned flags;
        int min_ret = 0;
        int ret;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
                return ret;

        /* plain send: no address and no control data */
        msg.msg_name = NULL;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_namelen = 0;

        flags = req->sr_msg.msg_flags;
        if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
        /* with MSG_WAITALL anything short of the full iterator is "short" */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&msg.msg_iter);

        msg.msg_flags = flags;
        ret = sock_sendmsg(sock, &msg);
        if (ret < min_ret) {
                if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* skip what was sent and retry with the remainder */
                        sr->len -= ret;
                        sr->buf += ret;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return -EAGAIN;
                }
                req_set_fail(req);
        }
        /*
         * Include bytes sent by earlier partial attempts; if this attempt
         * failed after earlier progress, report the progress instead.
         */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Native (non-compat) helper that fills @iomsg from the user msghdr at
 * req->sr_msg.umsg.
 *
 * With REQ_F_BUFFER_SELECT at most one user iovec is accepted; it is copied
 * into fast_iov[0], its length is recorded in sr->len and free_iov is set to
 * NULL. Otherwise the user iovec array is imported into the message
 * iterator, starting from the inline fast_iov array.
 *
 * Returns 0 on success or a negative error.
 */
static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
                                 struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct iovec __user *uiov;
        size_t iov_len;
        int ret;

        ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
                                        &iomsg->uaddr, &uiov, &iov_len);
        if (ret)
                return ret;

        if (!(req->flags & REQ_F_BUFFER_SELECT)) {
                iomsg->free_iov = iomsg->fast_iov;
                ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
                                     &iomsg->free_iov, &iomsg->msg.msg_iter,
                                     false);
                /* a positive result is not an error, report plain success */
                return ret > 0 ? 0 : ret;
        }

        /* buffer selection works with a single iovec only */
        if (iov_len > 1)
                return -EINVAL;
        if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
                return -EFAULT;

        sr->len = iomsg->fast_iov[0].iov_len;
        iomsg->free_iov = NULL;
        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of __io_recvmsg_copy_hdr(): fills @iomsg from the
 * compat msghdr at req->sr_msg.umsg_compat.
 *
 * With REQ_F_BUFFER_SELECT at most one user iovec is accepted and only its
 * length is fetched (negative lengths are rejected) and stored in sr->len.
 * Otherwise the compat iovec array is imported into the message iterator.
 *
 * Returns 0 on success or a negative error.
 */
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
                                        struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct compat_iovec __user *uiov;
        compat_ssize_t clen;
        compat_uptr_t ptr;
        compat_size_t len;
        int ret;

        ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
                                  &ptr, &len);
        if (ret)
                return ret;

        uiov = compat_ptr(ptr);

        if (!(req->flags & REQ_F_BUFFER_SELECT)) {
                iomsg->free_iov = iomsg->fast_iov;
                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
                                     UIO_FASTIOV, &iomsg->free_iov,
                                     &iomsg->msg.msg_iter, true);
                return ret < 0 ? ret : 0;
        }

        /* buffer selection works with a single iovec only */
        if (len > 1)
                return -EINVAL;
        if (!access_ok(uiov, sizeof(*uiov)) ||
            __get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        sr->len = clen;
        iomsg->free_iov = NULL;
        return 0;
}
#endif

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
                               struct io_async_msghdr *iomsg)
{
        iomsg->msg.msg_name = &iomsg->addr;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif

        return __io_recvmsg_copy_hdr(req, iomsg);
}

/*
 * Pick a buffer for a receive request via io_buffer_select(), using the
 * length, buffer group and current buffer stored in req->sr_msg.
 *
 * On success the buffer is remembered in sr->kbuf and the request is flagged
 * REQ_F_BUFFER_SELECTED. Returns the buffer, or the ERR_PTR() value from
 * io_buffer_select() unchanged.
 */
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
                                               bool needs_lock)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct io_buffer *buf;

        buf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
        if (!IS_ERR(buf)) {
                sr->kbuf = buf;
                req->flags |= REQ_F_BUFFER_SELECTED;
        }
        return buf;
}

/*
 * Hand the buffer stored in req->sr_msg.kbuf to io_put_kbuf() and return
 * its result.
 */
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
        struct io_buffer *kbuf = req->sr_msg.kbuf;

        return io_put_kbuf(req, kbuf);
}

/*
 * Copy the recvmsg header into the request's async data. On success the
 * request is flagged REQ_F_NEED_CLEANUP.
 *
 * Returns 0 on success or the error from io_recvmsg_copy_hdr().
 */
static int io_recvmsg_prep_async(struct io_kiocb *req)
{
        int err = io_recvmsg_copy_hdr(req, req->async_data);

        if (err)
                return err;

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Prepare a recvmsg request from its SQE.
 *
 * Rejects rings created with IORING_SETUP_IOPOLL and SQEs that set addr2,
 * file_index or ioprio. The user message pointer, length, buffer group and
 * message flags are cached in req->sr_msg; MSG_DONTWAIT marks the request
 * REQ_F_NOWAIT. On compat rings MSG_CMSG_COMPAT is added to the flags.
 *
 * Returns 0 on success, -EINVAL for an unsupported SQE or ring setup.
 */
static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_sr_msg *sr = &req->sr_msg;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        /* single check covers every SQE field this opcode does not use */
        if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
                return -EINVAL;

        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
        sr->bgid = READ_ONCE(sqe->buf_group);
        sr->msg_flags = READ_ONCE(sqe->msg_flags);
        if (sr->msg_flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
        /* no bytes transferred yet */
        sr->done_io = 0;
        return 0;
}

/*
 * Issue handler for recvmsg requests.
 *
 * Uses the message header stored in req->async_data if there is one,
 * otherwise copies it from userspace into an on-stack io_async_msghdr. With
 * REQ_F_BUFFER_SELECT a buffer is selected and the iterator is re-initialised
 * over that single buffer. If a nonblocking attempt returns -EAGAIN, or a
 * MSG_WAITALL receive on a socket accepted by io_net_retry() was only
 * partially done, the header is handed to io_setup_async_msg() and that
 * function's return value is passed back.
 *
 * Returns 0 once the request has been completed, otherwise a negative error
 * (including whatever io_setup_async_msg() returned).
 */
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_msghdr iomsg, *kmsg;
        struct io_sr_msg *sr = &req->sr_msg;
        struct socket *sock;
        struct io_buffer *kbuf;
        unsigned flags;
        int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        kmsg = req->async_data;
        if (!kmsg) {
                ret = io_recvmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
                kmsg = &iomsg;
        }

        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                /* receive into the selected buffer through fast_iov[0] */
                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
                kmsg->fast_iov[0].iov_len = req->sr_msg.len;
                iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
                                1, req->sr_msg.len);
        }

        flags = req->sr_msg.msg_flags;
        if (force_nonblock)
                flags |= MSG_DONTWAIT;
        /* a full transfer is only required when no control data is expected */
        if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
                min_ret = iov_iter_count(&kmsg->msg.msg_iter);

        ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
                                        kmsg->uaddr, flags);
        if (ret < min_ret) {
                if (ret == -EAGAIN && force_nonblock)
                        return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* drop control data for the retry, keep the progress */
                        kmsg->msg.msg_controllen = 0;
                        kmsg->msg.msg_control = NULL;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return io_setup_async_msg(req, kmsg);
                }
                req_set_fail(req);
        } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
                /* truncated data or control data also fails a WAITALL request */
                req_set_fail(req);
        }

        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_recv_kbuf(req);
        /* fast path, check for non-NULL to avoid function call */
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        /*
         * Include bytes received by earlier partial attempts; if this attempt
         * failed after earlier progress, report the progress instead.
         */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
}

/*
 * Issue handler for recv requests.
 *
 * Receives into the user buffer from req->sr_msg, or into a selected buffer
 * when REQ_F_BUFFER_SELECT is set, via an on-stack msghdr and
 * sock_recvmsg(). A partial MSG_WAITALL receive on a socket accepted by
 * io_net_retry() advances sr->buf/sr->len, accumulates the byte count in
 * done_io and returns -EAGAIN.
 *
 * Returns 0 once the request has been completed, otherwise a negative error
 * (-EAGAIN when the request should be retried).
 */
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_buffer *kbuf;
        struct io_sr_msg *sr = &req->sr_msg;
        struct msghdr msg;
        void __user *buf = sr->buf;
        struct socket *sock;
        struct iovec iov;
        unsigned flags;
        int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                /* receive into the selected buffer instead of sr->buf */
                buf = u64_to_user_ptr(kbuf->addr);
        }

        ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
                goto out_free;

        /* plain recv: no address, no control data, synchronous */
        msg.msg_name = NULL;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_namelen = 0;
        msg.msg_iocb = NULL;
        msg.msg_flags = 0;

        flags = req->sr_msg.msg_flags;
        if (force_nonblock)
                flags |= MSG_DONTWAIT;
        /* with MSG_WAITALL anything short of the full iterator is "short" */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&msg.msg_iter);

        ret = sock_recvmsg(sock, &msg, flags);
        if (ret < min_ret) {
                if (ret == -EAGAIN && force_nonblock)
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* skip what was received and retry with the remainder */
                        sr->len -= ret;
                        sr->buf += ret;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return -EAGAIN;
                }
                req_set_fail(req);
        } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
                /*
                 * The label sits inside this branch so that an import failure
                 * above also marks the request failed before the common
                 * completion code below runs.
                 */
out_free:
                req_set_fail(req);
        }
        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_recv_kbuf(req);
        /*
         * Include bytes received by earlier partial attempts; if this attempt
         * failed after earlier progress, report the progress instead.
         */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
}

/*
 * Prepare an accept request from its SQE.
 *
 * Rejects IOPOLL rings and SQEs that set ioprio, len or buf_index. The
 * user address pointers, accept flags, the RLIMIT_NOFILE value and the
 * fixed file slot are cached in req->accept. SOCK_CLOEXEC cannot be combined
 * with a fixed file slot, and only SOCK_CLOEXEC and SOCK_NONBLOCK are
 * accepted as flags. Where SOCK_NONBLOCK and O_NONBLOCK differ, the former
 * is translated to the latter.
 *
 * Returns 0 on success or -EINVAL.
 */
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_accept *accept = &req->accept;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;

        accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        accept->flags = READ_ONCE(sqe->accept_flags);
        accept->nofile = rlimit(RLIMIT_NOFILE);
        accept->file_slot = READ_ONCE(sqe->file_index);

        if ((accept->file_slot && (accept->flags & SOCK_CLOEXEC)) ||
            (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)))
                return -EINVAL;

        if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) {
                accept->flags &= ~SOCK_NONBLOCK;
                accept->flags |= O_NONBLOCK;
        }
        return 0;
}

/*
 * Issue handler for accept requests.
 *
 * Without a fixed file slot an unused fd is reserved up front and the
 * accepted file is installed into it; with a fixed slot the file is installed
 * via io_install_fixed_file() into slot (file_slot - 1).
 *
 * Returns 0 once the request has been completed, -EAGAIN when a nonblocking
 * attempt should be retried, or the negative error from reserving the fd.
 */
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_accept *accept = &req->accept;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
        bool fixed = !!accept->file_slot;
        struct file *file;
        int ret, fd;

        /* fd is only assigned (and only used below) when !fixed */
        if (!fixed) {
                fd = __get_unused_fd_flags(accept->flags, accept->nofile);
                if (unlikely(fd < 0))
                        return fd;
        }
        file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
                         accept->flags);

        if (IS_ERR(file)) {
                /* give back the reserved descriptor before reporting/retrying */
                if (!fixed)
                        put_unused_fd(fd);
                ret = PTR_ERR(file);
                /* safe to retry */
                req->flags |= REQ_F_PARTIAL_IO;
                if (ret == -EAGAIN && force_nonblock)
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                req_set_fail(req);
        } else if (!fixed) {
                fd_install(fd, file);
                ret = fd;
        } else {
                ret = io_install_fixed_file(req, file, issue_flags,
                                            accept->file_slot - 1);
        }
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Copy the user socket address of a connect request into the request's
 * async data. Returns the result of move_addr_to_kernel().
 */
static int io_connect_prep_async(struct io_kiocb *req)
{
        struct io_async_connect *async = req->async_data;

        return move_addr_to_kernel(req->connect.addr, req->connect.addr_len,
                                   &async->address);
}

/*
 * Prepare a connect request from its SQE.
 *
 * Rejects IOPOLL rings and SQEs that set ioprio, len, buf_index, rw_flags or
 * splice_fd_in, then caches the user address pointer (sqe->addr) and the
 * address length (sqe->addr2) in req->connect.
 *
 * Returns 0 on success or -EINVAL.
 */
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index ||
            sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->connect.addr_len = READ_ONCE(sqe->addr2);
        return 0;
}

/*
 * Issue handler for connect requests.
 *
 * Uses the socket address stored in req->async_data if present, otherwise
 * copies it from userspace onto the stack. When a nonblocking attempt
 * returns -EAGAIN or -EINPROGRESS, the on-stack address is saved into newly
 * allocated async data (unless async data already exists) and -EAGAIN is
 * returned.
 *
 * Returns 0 once the request has been completed, or -EAGAIN for a retry.
 */
static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_connect __io, *io;
        unsigned file_flags;
        int ret;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (req->async_data) {
                io = req->async_data;
        } else {
                ret = move_addr_to_kernel(req->connect.addr,
                                                req->connect.addr_len,
                                                &__io.address);
                if (ret)
                        goto out;
                io = &__io;
        }

        file_flags = force_nonblock ? O_NONBLOCK : 0;

        ret = __sys_connect_file(req->file, &io->address,
                                        req->connect.addr_len, file_flags);
        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
                /* address is already held in async data, just retry */
                if (req->async_data)
                        return -EAGAIN;
                if (io_alloc_async_data(req)) {
                        ret = -ENOMEM;
                        goto out;
                }
                /* __io lives on this stack frame, keep a copy for the retry */
                memcpy(req->async_data, &__io, sizeof(__io));
                return -EAGAIN;
        }
        if (ret == -ERESTARTSYS)
                ret = -EINTR;
out:
        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}
#else /* !CONFIG_NET */
/*
 * Stubs used when CONFIG_NET is disabled: every network opcode handler is
 * generated as a function that just returns -EOPNOTSUPP.
 *
 * IO_NETOP_FN(op) emits the issue handler io_<op>().
 */
#define IO_NETOP_FN(op)                                                        \
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)        \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}

/* IO_NETOP_PREP(op) emits io_<op>() plus the prep handler io_<op>_prep(). */
#define IO_NETOP_PREP(op)                                                \
IO_NETOP_FN(op)                                                                \
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}                                                                        \

/*
 * IO_NETOP_PREP_ASYNC(op) emits everything IO_NETOP_PREP(op) does plus the
 * async prep handler io_<op>_prep_async().
 */
#define IO_NETOP_PREP_ASYNC(op)                                                \
IO_NETOP_PREP(op)                                                        \
static int io_##op##_prep_async(struct io_kiocb *req)                        \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}

/* instantiate exactly the set of handlers each opcode has in the NET build */
IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */

/*
 * Wrapper around poll_table_struct that carries per-request state into the
 * queueing callback; io_poll_queue_proc() recovers it from the embedded
 * table with container_of().
 */
struct io_poll_table {
        /* embedded poll table, the member container_of() is applied to */
        struct poll_table_struct pt;
        /* request the waitqueue entries are being set up for */
        struct io_kiocb *req;
        /* number of waitqueue entries added so far by __io_queue_proc() */
        int nr_entries;
        /* error recorded by __io_queue_proc() (-EINVAL or -ENOMEM) */
        int error;
};

/*
 * Layout of req->poll_refs: the low 30 bits (IO_POLL_REF_MASK) hold the
 * reference count, bit 31 marks the request as cancelled and bit 30 asks the
 * current owner to re-check for events (set in
 * io_poll_get_ownership_slowpath(), cleared in io_poll_check_events()).
 */
#define IO_POLL_CANCEL_FLAG        BIT(31)
#define IO_POLL_RETRY_FLAG        BIT(30)
#define IO_POLL_REF_MASK        GENMASK(29, 0)

/*
 * We usually have 1-2 refs taken, 128 is more than enough and we want to
 * maximise the margin between this amount and the moment when it overflows.
 * Once the count reaches this value, io_poll_get_ownership() switches to its
 * slow path.
 */
#define IO_POLL_REF_BIAS       128

/*
 * Slow path of io_poll_get_ownership(), taken when the value of poll_refs
 * has already reached IO_POLL_REF_BIAS.
 *
 * Returns true if ownership of the request was acquired, false otherwise.
 */
static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
        int v;

        /*
         * poll_refs are already elevated and we don't have much hope for
         * grabbing the ownership. Instead of incrementing set a retry flag
         * to notify the loop that there might have been some change.
         */
        v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
        if (v & IO_POLL_REF_MASK)
                return false;
        /* the count was zero after all, try to take ownership normally */
        return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, the request is
 * free: we can bump the count and acquire ownership. Modifying a request
 * without owning it is disallowed; that prevents races between enqueueing
 * task_work items and between arming poll and wakeups.
 *
 * Returns true if the caller became the owner, false otherwise.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
        /* count is suspiciously high, avoid incrementing it any further */
        if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
                return io_poll_get_ownership_slowpath(req);
        /* we own the request only if the count was zero before our increment */
        return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

/*
 * Flag the request as cancelled by setting IO_POLL_CANCEL_FLAG in poll_refs;
 * io_poll_check_events() returns -ECANCELED when it sees the flag.
 */
static void io_poll_mark_cancelled(struct io_kiocb *req)
{
        atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}

/*
 * Return the request's second poll entry: IORING_OP_POLL_ADD keeps it in
 * ->async_data, every other opcode in ->apoll->double_poll.
 */
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
        if (req->opcode != IORING_OP_POLL_ADD)
                return req->apoll->double_poll;

        /* pure poll stashes this in ->async_data */
        return req->async_data;
}

/*
 * Return the request's primary poll entry: ->poll for IORING_OP_POLL_ADD,
 * ->apoll->poll for every other opcode.
 */
static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
{
        if (req->opcode != IORING_OP_POLL_ADD)
                return &req->apoll->poll;

        return &req->poll;
}

static void io_poll_req_insert(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct hlist_head *list;

        list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
        hlist_add_head(&req->hash_node, list);
}

/*
 * Initialise a poll entry: no waitqueue head yet, the requested @events plus
 * the always-reported ones, and a waitqueue entry that calls @wake_func.
 */
static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
                              wait_queue_func_t wake_func)
{
#define IO_POLL_UNMASK        (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
        init_waitqueue_func_entry(&poll->wait, wake_func);
        INIT_LIST_HEAD(&poll->wait.entry);
        /* error/hangup style events are always of interest */
        poll->events = events | IO_POLL_UNMASK;
        poll->head = NULL;
}

/*
 * Detach one poll entry from its waitqueue, if it is still attached.
 *
 * poll->head is read with acquire semantics, pairing with the release store
 * in io_poll_wake()'s POLLFREE handling; once it reads as NULL the waitqueue
 * is not touched at all.
 */
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
        struct wait_queue_head *head = smp_load_acquire(&poll->head);

        if (head) {
                spin_lock_irq(&head->lock);
                list_del_init(&poll->wait.entry);
                poll->head = NULL;
                spin_unlock_irq(&head->lock);
        }
}

/*
 * Remove both the primary and (if present) the second poll entry of the
 * request from their waitqueues, under rcu_read_lock().
 */
static void io_poll_remove_entries(struct io_kiocb *req)
{
        struct io_poll_iocb *poll = io_poll_get_single(req);
        struct io_poll_iocb *poll_double = io_poll_get_double(req);

        /*
         * While we hold the waitqueue lock and the waitqueue is nonempty,
         * wake_up_pollfree() will wait for us.  However, taking the waitqueue
         * lock in the first place can race with the waitqueue being freed.
         *
         * We solve this as eventpoll does: by taking advantage of the fact that
         * all users of wake_up_pollfree() will RCU-delay the actual free.  If
         * we enter rcu_read_lock() and see that the pointer to the queue is
         * non-NULL, we can then lock it without the memory being freed out from
         * under us.
         *
         * Keep holding rcu_read_lock() as long as we hold the queue lock, in
         * case the caller deletes the entry from the queue, leaving it empty.
         * In that case, only RCU prevents the queue memory from being freed.
         */
        rcu_read_lock();
        io_poll_remove_entry(poll);
        if (poll_double)
                io_poll_remove_entry(poll_double);
        rcu_read_unlock();
}

/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. >0 when no action is required, which
 * is either a spurious wakeup or a multishot CQE having been served. 0 when
 * it's done with the request, then the mask is stored in req->result.
 */
static int io_poll_check_events(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_poll_iocb *poll = io_poll_get_single(req);
        int v;

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING))
                io_poll_mark_cancelled(req);

        do {
                v = atomic_read(&req->poll_refs);

                /* tw handler should be the owner, and so have some references */
                if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
                        return 0;
                if (v & IO_POLL_CANCEL_FLAG)
                        return -ECANCELED;
                /*
                 * cqe.res contains only the events of the first wake up
                 * and all others are lost. Redo vfs_poll() to get
                 * up to date state.
                 */
                if ((v & IO_POLL_REF_MASK) != 1)
                        req->result = 0;
                if (v & IO_POLL_RETRY_FLAG) {
                        req->result = 0;
                        /*
                         * We won't find new events that came in between
                         * vfs_poll and the ref put unless we clear the
                         * flag in advance.
                         */
                        atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
                        v &= ~IO_POLL_RETRY_FLAG;
                }

                /* no usable mask cached, ask the file for its current state */
                if (!req->result) {
                        struct poll_table_struct pt = { ._key = poll->events };

                        req->result = vfs_poll(req->file, &pt) & poll->events;
                }

                /* multishot, just fill a CQE and proceed */
                if (req->result && !(poll->events & EPOLLONESHOT)) {
                        __poll_t mask = mangle_poll(req->result & poll->events);
                        bool filled;

                        spin_lock(&ctx->completion_lock);
                        filled = io_fill_cqe_aux(ctx, req->user_data, mask,
                                                 IORING_CQE_F_MORE);
                        io_commit_cqring(ctx);
                        spin_unlock(&ctx->completion_lock);
                        /* the CQE could not be posted, give up on the request */
                        if (unlikely(!filled))
                                return -ECANCELED;
                        io_cqring_ev_posted(ctx);
                } else if (req->result) {
                        /* oneshot with events pending: the request is done */
                        return 0;
                }

                /* force the next iteration to vfs_poll() */
                req->result = 0;

                /*
                 * Release all references, retry if someone tried to restart
                 * task_work while we were executing it.
                 */
        } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
                                        IO_POLL_REF_MASK);

        return 1;
}

/*
 * Task-work handler installed by __io_poll_execute() for IORING_OP_POLL_ADD
 * requests.
 *
 * Does nothing when io_poll_check_events() reports that no action is needed.
 * Otherwise the result (mangled event mask, or the negative error with the
 * request marked failed) is stored, the poll entries and the cancel-hash
 * node are removed and the request is completed.
 */
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        int ret = io_poll_check_events(req);

        if (ret > 0)
                return;

        if (ret) {
                req->result = ret;
                req_set_fail(req);
        } else {
                req->result = mangle_poll(req->result & req->poll.events);
        }

        io_poll_remove_entries(req);

        spin_lock(&ctx->completion_lock);
        hash_del(&req->hash_node);
        spin_unlock(&ctx->completion_lock);

        io_req_complete_post(req, req->result, 0);
}

/*
 * Task-work handler installed by __io_poll_execute() for every opcode other
 * than IORING_OP_POLL_ADD.
 *
 * Does nothing when io_poll_check_events() reports that no action is needed.
 * Otherwise the poll entries and the cancel-hash node are removed and the
 * request is either resubmitted (events ready) or completed as failed with
 * the error that was returned.
 */
static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        int ret = io_poll_check_events(req);

        if (ret > 0)
                return;

        io_tw_lock(ctx, locked);
        io_poll_remove_entries(req);

        spin_lock(&ctx->completion_lock);
        hash_del(&req->hash_node);
        spin_unlock(&ctx->completion_lock);

        if (ret)
                io_req_complete_failed(req, ret);
        else
                io_req_task_submit(req, locked);
}

static void __io_poll_execute(struct io_kiocb *req, int mask)
{
        req->result = mask;
        if (req->opcode == IORING_OP_POLL_ADD)
                req->io_task_work.func = io_poll_task_func;
        else
                req->io_task_work.func = io_apoll_task_func;

        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
        io_req_task_work_add(req);
}

/*
 * Queue poll task work for the request with result @res, but only if the
 * caller manages to take ownership of it.
 */
static inline void io_poll_execute(struct io_kiocb *req, int res)
{
        if (!io_poll_get_ownership(req))
                return;

        __io_poll_execute(req, res);
}

/*
 * Cancel a poll request: set the cancel flag, then kick task work (if
 * ownership can be taken) so that the request gets completed.
 */
static void io_poll_cancel_req(struct io_kiocb *req)
{
        io_poll_mark_cancelled(req);

        /* kick tw, which should complete the request */
        if (io_poll_get_ownership(req))
                __io_poll_execute(req, 0);
}

/*
 * Waitqueue callback for poll entries.
 *
 * @wait is the entry embedded in an io_poll_iocb, with the owning request in
 * wait->private; @key carries the event mask. @mode and @sync are not used.
 *
 * Returns 0 when the reported events do not match what the entry is waiting
 * for, 1 otherwise.
 */
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                        void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
                                                 wait);
        __poll_t mask = key_to_poll(key);

        if (unlikely(mask & POLLFREE)) {
                io_poll_mark_cancelled(req);
                /* we have to kick tw in case it's not already */
                io_poll_execute(req, 0);

                /*
                 * If the waitqueue is being freed early but someone already
                 * holds ownership over it, we have to tear down the request as
                 * best we can. That means immediately removing the request from
                 * its waitqueue and preventing all further accesses to the
                 * waitqueue via the request.
                 */
                list_del_init(&poll->wait.entry);

                /*
                 * Careful: this *must* be the last step, since as soon
                 * as poll->head is NULL'ed out, the request can be
                 * completed and freed, since io_poll_remove_entry()
                 * will no longer need to take the waitqueue lock.
                 */
                smp_store_release(&poll->head, NULL);
                return 1;
        }

        /* for instances that support it check for an event match first */
        if (mask && !(mask & poll->events))
                return 0;

        if (io_poll_get_ownership(req)) {
                /*
                 * If we trigger a multishot poll off our own wakeup path,
                 * disable multishot as there is a circular dependency between
                 * CQ posting and triggering the event.
                 */
                if (mask & EPOLL_URING_WAKE)
                        poll->events |= EPOLLONESHOT;

                __io_poll_execute(req, mask);
        }
        return 1;
}

/*
 * Common poll-table queueing helper: hook @poll onto @head, or, when an
 * entry has already been queued for this table, allocate a second entry,
 * publish it through @poll_ptr and hook that one instead. Failures are
 * reported through pt->error.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                            struct wait_queue_head *head,
                            struct io_poll_iocb **poll_ptr)
{
        struct io_kiocb *req = pt->req;

        /*
         * A non-zero entry count means the polled file hands out more than
         * one waitqueue (e.g. one for read, one for write), so this call
         * needs an io_poll_iocb of its own.
         */
        if (unlikely(pt->nr_entries)) {
                struct io_poll_iocb *primary = poll;
                struct io_poll_iocb *extra;

                /* same head as the first entry: nothing to add */
                if (primary->head == head)
                        return;

                extra = *poll_ptr;
                if (extra) {
                        /*
                         * A second entry already exists. Seeing its head
                         * again is ignored; any other head would be a third
                         * waitqueue, which is rejected.
                         */
                        if (extra->head != head)
                                pt->error = -EINVAL;
                        return;
                }

                extra = kmalloc(sizeof(*extra), GFP_ATOMIC);
                if (!extra) {
                        pt->error = -ENOMEM;
                        return;
                }
                io_init_poll_iocb(extra, primary->events, primary->wait.func);
                *poll_ptr = extra;
                poll = extra;
        }

        pt->nr_entries++;
        poll->head = head;
        poll->wait.private = req;

        if (!(poll->events & EPOLLEXCLUSIVE))
                add_wait_queue(head, &poll->wait);
        else
                add_wait_queue_exclusive(head, &poll->wait);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
                               struct poll_table_struct *p)
{
        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);

        __io_queue_proc(&pt->req->poll, pt, head,
                        (struct io_poll_iocb **) &pt->req->async_data);
}

/*
 * Initialise @poll for @req and register it with the file through vfs_poll().
 *
 * @req:  request being armed
 * @poll: poll entry to initialise and queue
 * @ipt:  poll table; ->pt._qproc must already be set by the caller, the
 *        remaining fields are initialised here
 * @mask: events to wait for
 *
 * Returns the ready event mask if events are already pending and the entry
 * is EPOLLONESHOT; the waitqueue entries have been removed in that case.
 * Returns 0 otherwise. On a 0 return ipt->error is non-zero if arming failed
 * with no events pending.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
                                 struct io_poll_iocb *poll,
                                 struct io_poll_table *ipt, __poll_t mask)
{
        struct io_ring_ctx *ctx = req->ctx;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, io_poll_wake);
        poll->file = req->file;
        poll->wait.private = req;

        ipt->pt._key = mask;
        ipt->req = req;
        ipt->error = 0;
        ipt->nr_entries = 0;

        /*
         * Take the ownership to delay any tw execution up until we're done
         * with poll arming. see io_poll_get_ownership().
         */
        atomic_set(&req->poll_refs, 1);
        mask = vfs_poll(req->file, &ipt->pt) & poll->events;

        /* oneshot and already ready: undo the queueing and report the mask */
        if (mask && (poll->events & EPOLLONESHOT)) {
                io_poll_remove_entries(req);
                /* no one else has access to the req, forget about the ref */
                return mask;
        }
        /* nothing ready and queueing failed or queued nothing: give up */
        if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
                io_poll_remove_entries(req);
                if (!ipt->error)
                        ipt->error = -EINVAL;
                return 0;
        }

        spin_lock(&ctx->completion_lock);
        io_poll_req_insert(req);
        spin_unlock(&ctx->completion_lock);

        if (mask) {
                /* can't multishot if failed, just queue the event we've got */
                if (unlikely(ipt->error || !ipt->nr_entries)) {
                        poll->events |= EPOLLONESHOT;
                        ipt->error = 0;
                }
                __io_poll_execute(req, mask);
                return 0;
        }

        /*
         * Try to release ownership. If we see a change of state, e.g.
         * poll was woken up, queue up a tw, it'll deal with it.
         */
        if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
                __io_poll_execute(req, 0);
        return 0;
}

/*
 * poll_table queueing callback that uses the request's async_poll: the
 * primary entry is apoll->poll, a second one goes to apoll->double_poll.
 */
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
                               struct poll_table_struct *p)
{
        struct io_poll_table *table = container_of(p, struct io_poll_table, pt);
        struct async_poll *ap = table->req->apoll;

        __io_queue_proc(&ap->poll, table, head, &ap->double_poll);
}

/* Results returned by io_arm_poll_handler() */
enum {
        /* the poll handler was armed and no error was recorded */
        IO_APOLL_OK,
        /* polling is not usable for the request, or arming failed */
        IO_APOLL_ABORTED,
        /* arming reported the awaited events as already pending */
        IO_APOLL_READY
};

/*
 * We can't reliably detect a loop where poll keeps triggering and the
 * subsequent issue keeps failing. Rather than fail such requests
 * immediately, allow a certain number of retries before giving up. Given
 * that this condition should _rarely_ trigger even once, we should be fine
 * with a larger value.
 */
#define APOLL_MAX_RETRY                128

static int io_arm_poll_handler(struct io_kiocb *req)
{
        const struct io_op_def *def = &io_op_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;
        struct async_poll *apoll;
        struct io_poll_table ipt;
        __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
        int ret;

        if (!req->file || !file_can_poll(req->file))
                return IO_APOLL_ABORTED;
        if (!def->pollin && !def->pollout)
                return IO_APOLL_ABORTED;

        if (def->pollin) {
                mask |= POLLIN | POLLRDNORM;

                /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
                if ((req->opcode == IORING_OP_RECVMSG) &&
                    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
                        mask &= ~POLLIN;
        } else {
                mask |= POLLOUT | POLLWRNORM;
        }

        if (req->flags & REQ_F_POLLED) {
                apoll = req->apoll;
                kfree(apoll->double_poll);
                if (unlikely(!--apoll->poll.retries)) {
                        apoll->double_poll = NULL;
                        return IO_APOLL_ABORTED;
                }
        } else {
                apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
                if (unlikely(!apoll))
                        return IO_APOLL_ABORTED;
                apoll->poll.retries = APOLL_MAX_RETRY;
        }
        apoll->double_poll = NULL;
        req->apoll = apoll;
        req->flags |= REQ_F_POLLED;
        ipt.pt._qproc = io_async_queue_proc;

        ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
        if (ret || ipt.error)
                return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;

        trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
                                mask, apoll->poll.events);
        return IO_APOLL_OK;
}

/*
 * Walk every bucket of the cancel hash and cancel each poll request that
 * io_match_task_safe() accepts for @tsk / @cancel_all.
 *
 * Returns true if we found and killed one or more poll requests.
 */
static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
                               bool cancel_all)
{
        struct hlist_node *next;
        struct io_kiocb *req;
        bool killed = false;
        int bucket;

        spin_lock(&ctx->completion_lock);
        for (bucket = 0; bucket < (1U << ctx->cancel_hash_bits); bucket++) {
                struct hlist_head *head = &ctx->cancel_hash[bucket];

                hlist_for_each_entry_safe(req, next, head, hash_node) {
                        if (!io_match_task_safe(req, tsk, cancel_all))
                                continue;
                        hlist_del_init(&req->hash_node);
                        io_poll_cancel_req(req);
                        killed = true;
                }
        }
        spin_unlock(&ctx->completion_lock);
        return killed;
}

/*
 * Look up a hashed request by user_data. With @poll_only set, only
 * IORING_OP_POLL_ADD requests are considered. Returns NULL if none matches.
 */
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
                                     bool poll_only)
        __must_hold(&ctx->completion_lock)
{
        struct hlist_head *bucket;
        struct io_kiocb *req;

        bucket = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
        hlist_for_each_entry(req, bucket, hash_node) {
                if (req->user_data != sqe_addr)
                        continue;
                if (!poll_only || req->opcode == IORING_OP_POLL_ADD)
                        return req;
        }
        return NULL;
}

/*
 * Take ownership of @req's poll state and, if that succeeds, pull it off
 * its waitqueues and out of the cancel hash. Returns false when ownership
 * could not be taken.
 */
static bool io_poll_disarm(struct io_kiocb *req)
        __must_hold(&ctx->completion_lock)
{
        if (io_poll_get_ownership(req)) {
                io_poll_remove_entries(req);
                hash_del(&req->hash_node);
                return true;
        }
        return false;
}

/*
 * Cancel the hashed request whose user_data equals @sqe_addr.
 * Returns 0 if one was found and cancelled, -ENOENT otherwise.
 */
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
                          bool poll_only)
        __must_hold(&ctx->completion_lock)
{
        struct io_kiocb *victim = io_poll_find(ctx, sqe_addr, poll_only);

        if (victim) {
                io_poll_cancel_req(victim);
                return 0;
        }
        return -ENOENT;
}

/*
 * Build the kernel event mask from sqe->poll32_events. EPOLLONESHOT is
 * forced on unless @flags carries IORING_POLL_ADD_MULTI; of the behaviour
 * bits only EPOLLEXCLUSIVE and EPOLLONESHOT are carried over.
 */
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
                                     unsigned int flags)
{
        u32 raw = READ_ONCE(sqe->poll32_events);

#ifdef __BIG_ENDIAN
        raw = swahw32(raw);
#endif
        if (!(flags & IORING_POLL_ADD_MULTI))
                raw |= EPOLLONESHOT;

        return demangle_poll(raw) | (raw & (EPOLLEXCLUSIVE|EPOLLONESHOT));
}

/*
 * Validate and unpack a poll update/remove SQE into req->poll_update.
 * sqe->len carries the flags, sqe->addr the user_data of the target
 * request and sqe->off the replacement user_data.
 * Returns 0 on success, -EINVAL for any malformed combination.
 */
static int io_poll_update_prep(struct io_kiocb *req,
                               const struct io_uring_sqe *sqe)
{
        struct io_poll_update *upd = &req->poll_update;
        u32 flags;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;

        flags = READ_ONCE(sqe->len);
        if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
                      IORING_POLL_ADD_MULTI))
                return -EINVAL;
        /* multishot on its own is meaningless without an update */
        if (flags == IORING_POLL_ADD_MULTI)
                return -EINVAL;

        upd->old_user_data = READ_ONCE(sqe->addr);
        upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
        upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
        upd->new_user_data = READ_ONCE(sqe->off);

        /* a replacement user_data must come with the matching flag */
        if (upd->new_user_data && !upd->update_user_data)
                return -EINVAL;

        /* likewise, events may only be supplied when they are to be updated */
        if (!upd->update_events)
                return sqe->poll32_events ? -EINVAL : 0;

        upd->events = io_poll_parse_events(sqe, flags);
        return 0;
}

/*
 * Validate a poll-add SQE and store the parsed event mask in req->poll.
 * The only flag accepted in sqe->len is IORING_POLL_ADD_MULTI.
 */
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        u32 flags;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
                return -EINVAL;

        flags = READ_ONCE(sqe->len);
        if (flags & ~IORING_POLL_ADD_MULTI)
                return -EINVAL;

        io_req_set_refcount(req);
        req->poll.events = io_poll_parse_events(sqe, flags);
        return 0;
}

/*
 * Arm req->poll and deal with the immediate outcome.
 *
 * Returns 0 if the request was armed, or if events were already pending
 * and the request has been completed inline with the ready mask.
 * Returns a negative errno if arming failed: the request is then marked
 * failed but has NOT been completed, so the caller must complete it with
 * that error. A positive value is never returned, which is what lets
 * callers treat any non-zero result as "still needs completing".
 */
static int __io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_poll_iocb *poll = &req->poll;
        struct io_poll_table ipt;
        int ret;

        ipt.pt._qproc = io_poll_queue_proc;

        ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
        if (!ret && ipt.error)
                req_set_fail(req);
        ret = ret ?: ipt.error;
        if (ret > 0) {
                /* events already pending: post the ready mask right away */
                __io_req_complete(req, issue_flags, ret, 0);
                return 0;
        }
        /* 0 when armed, otherwise the arming error for the caller to post */
        return ret;
}

/*
 * Issue handler wrapper around __io_poll_add(): negative results are
 * passed through, everything else is reported as 0.
 */
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
        int ret = __io_poll_add(req, issue_flags);

        if (ret < 0)
                return ret;
        return 0;
}

/*
 * Cancel or update the poll request whose user_data equals
 * req->poll_update.old_user_data.
 *
 * The target is looked up and disarmed under ->completion_lock. If an
 * events and/or user_data update was requested, the target is modified
 * and re-armed; if re-arming returns 0 the target stays in flight.
 * Otherwise the target is completed, with -ECANCELED or with the
 * re-arming result.
 *
 * The update request itself is always completed here: with 0, with
 * -ENOENT if no target was found, or with -EALREADY if the target could
 * not be disarmed. The function always returns 0.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *preq;
        int ret2, ret = 0;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        spin_lock(&ctx->completion_lock);
        preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
        if (!preq || !io_poll_disarm(preq)) {
                spin_unlock(&ctx->completion_lock);
                ret = preq ? -EALREADY : -ENOENT;
                goto out;
        }
        /* default outcome for the target unless re-arming says otherwise */
        preq->result = -ECANCELED;
        spin_unlock(&ctx->completion_lock);

        if (req->poll_update.update_events || req->poll_update.update_user_data) {
                /* only mask one event flags, keep behavior flags */
                if (req->poll_update.update_events) {
                        preq->poll.events &= ~0xffff;
                        preq->poll.events |= req->poll_update.events & 0xffff;
                        preq->poll.events |= IO_POLL_UNMASK;
                }
                if (req->poll_update.update_user_data)
                        preq->user_data = req->poll_update.new_user_data;

                ret2 = __io_poll_add(preq, issue_flags);
                /* successfully updated, don't complete poll request */
                if (!ret2)
                        goto out;
                preq->result = ret2;

        }
        if (preq->result < 0)
                req_set_fail(preq);
        io_req_complete(preq, preq->result);
out:
        /*
         * Mark the update request as failed before posting a negative
         * result, as the other handlers in this file do.
         */
        if (ret < 0)
                req_set_fail(req);
        /* complete update request, we're done with it */
        io_req_complete(req, ret);
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return 0;
}

/*
 * Task-work callback queued by io_timeout_fn(): mark the request as failed
 * and post its completion with -ETIME. @locked is not used here.
 */
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
        req_set_fail(req);
        io_req_complete_post(req, -ETIME, 0);
}

/*
 * hrtimer callback for timeout requests: unlink the request from the
 * timeout list, bump ctx->cq_timeouts and defer the completion to task
 * work. The timer is never restarted.
 */
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *td = container_of(timer,
                                                struct io_timeout_data, timer);
        struct io_kiocb *req = td->req;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long irqflags;
        int fired;

        spin_lock_irqsave(&ctx->timeout_lock, irqflags);
        list_del_init(&req->timeout.list);
        fired = atomic_read(&ctx->cq_timeouts);
        atomic_set(&ctx->cq_timeouts, fired + 1);
        spin_unlock_irqrestore(&ctx->timeout_lock, irqflags);

        req->io_task_work.func = io_req_task_timeout;
        io_req_task_work_add(req);
        return HRTIMER_NORESTART;
}

/*
 * Find the timeout request with the given user_data, stop its timer and
 * unlink it from ctx->timeout_list.
 *
 * Returns the request, ERR_PTR(-ENOENT) if there is no such timeout, or
 * ERR_PTR(-EALREADY) if the timer could not be cancelled.
 */
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
                                           __u64 user_data)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req, *match = NULL;
        struct io_timeout_data *io;

        list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
                if (req->user_data == user_data) {
                        match = req;
                        break;
                }
        }
        if (!match)
                return ERR_PTR(-ENOENT);

        io = match->async_data;
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return ERR_PTR(-EALREADY);

        list_del_init(&match->timeout.list);
        return match;
}

/*
 * Cancel the timeout identified by @user_data: extract it, fill a CQE with
 * -ECANCELED and drop the request reference in deferred fashion.
 * Returns 0 on success or the error from io_timeout_extract().
 */
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
        __must_hold(&ctx->completion_lock)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req = io_timeout_extract(ctx, user_data);

        if (!IS_ERR(req)) {
                req_set_fail(req);
                io_fill_cqe_req(req, -ECANCELED, 0);
                io_put_req_deferred(req);
                return 0;
        }
        return PTR_ERR(req);
}

static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
        switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
        case IORING_TIMEOUT_BOOTTIME:
                return CLOCK_BOOTTIME;
        case IORING_TIMEOUT_REALTIME:
                return CLOCK_REALTIME;
        default:
                /* can't happen, vetted at prep time */
                WARN_ON_ONCE(1);
                fallthrough;
        case 0:
                return CLOCK_MONOTONIC;
        }
}

/*
 * Re-arm the linked timeout identified by @user_data with a new expiry.
 * Returns 0 on success, -ENOENT if no such linked timeout is listed, or
 * -EALREADY if its timer could not be cancelled.
 */
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                                    struct timespec64 *ts, enum hrtimer_mode mode)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req, *match = NULL;
        struct io_timeout_data *io;

        list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
                if (req->user_data == user_data) {
                        match = req;
                        break;
                }
        }
        if (!match)
                return -ENOENT;

        io = match->async_data;
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return -EALREADY;

        hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
        io->timer.function = io_link_timeout_fn;
        hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
        return 0;
}

/*
 * Re-arm the timeout identified by @user_data with a new expiry. The
 * request loses its sequence offset and is re-queued at the tail of
 * ctx->timeout_list.
 * Returns 0 on success or the error from io_timeout_extract().
 */
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                             struct timespec64 *ts, enum hrtimer_mode mode)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req = io_timeout_extract(ctx, user_data);
        struct io_timeout_data *td;

        if (IS_ERR(req))
                return PTR_ERR(req);

        td = req->async_data;
        req->timeout.off = 0; /* noseq */
        list_add_tail(&req->timeout.list, &ctx->timeout_list);

        hrtimer_init(&td->timer, io_timeout_get_clock(td), mode);
        td->timer.function = io_timeout_fn;
        hrtimer_start(&td->timer, timespec64_to_ktime(*ts), mode);
        return 0;
}

/*
 * Validate and unpack a timeout remove/update SQE into req->timeout_rem.
 *
 * sqe->addr is the user_data of the target timeout. If one of the update
 * flags is set, sqe->addr2 points to the new timespec in user memory and
 * at most one clock flag plus IORING_TIMEOUT_ABS may accompany it. A
 * plain removal accepts no flags at all.
 *
 * Returns 0 on success, -EINVAL for an invalid SQE or a negative
 * timespec, -EFAULT if the timespec cannot be copied from user space.
 */
static int io_timeout_remove_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
{
        struct io_timeout_rem *tr = &req->timeout_rem;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
                return -EINVAL;

        tr->ltimeout = false;
        tr->addr = READ_ONCE(sqe->addr);
        tr->flags = READ_ONCE(sqe->timeout_flags);
        if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
                if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
                        return -EINVAL;
                if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
                        tr->ltimeout = true;
                if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
                        return -EINVAL;
                if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
                        return -EFAULT;
                /*
                 * The timespec comes straight from user space; refuse
                 * negative values before they are converted to ktime and
                 * handed to the hrtimer code.
                 */
                if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
                        return -EINVAL;
        } else if (tr->flags) {
                /* timeout removal doesn't support flags */
                return -EINVAL;
        }

        return 0;
}

/* Pick the hrtimer mode: absolute if IORING_TIMEOUT_ABS is set, else relative */
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
        if (flags & IORING_TIMEOUT_ABS)
                return HRTIMER_MODE_ABS;
        return HRTIMER_MODE_REL;
}

/*
 * Remove or update an existing timeout command.
 *
 * With IORING_TIMEOUT_UPDATE set, the target (a linked timeout if
 * ->ltimeout is set, a regular one otherwise) is re-armed with the
 * timespec captured at prep time; without it the target is cancelled.
 * The outcome is posted as this request's result; always returns 0.
 */
static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_timeout_rem *tr = &req->timeout_rem;
        struct io_ring_ctx *ctx = req->ctx;
        int ret;

        if (tr->flags & IORING_TIMEOUT_UPDATE) {
                enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);

                spin_lock_irq(&ctx->timeout_lock);
                ret = tr->ltimeout ?
                        io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode) :
                        io_timeout_update(ctx, tr->addr, &tr->ts, mode);
                spin_unlock_irq(&ctx->timeout_lock);
        } else {
                spin_lock(&ctx->completion_lock);
                spin_lock_irq(&ctx->timeout_lock);
                ret = io_timeout_cancel(ctx, tr->addr);
                spin_unlock_irq(&ctx->timeout_lock);
                spin_unlock(&ctx->completion_lock);
        }

        if (ret < 0)
                req_set_fail(req);
        io_req_complete_post(req, ret, 0);
        return 0;
}

/*
 * Prepare a timeout or linked-timeout request.
 *
 * @req:             request being prepared
 * @sqe:             submission entry; ->addr points to the user timespec,
 *                   ->off is the completion-count offset, ->len must be 1
 * @is_timeout_link: true for a linked timeout, which must follow another
 *                   request in the link being assembled and may not use
 *                   an offset
 *
 * Returns 0 on success, -EINVAL for an invalid SQE or a negative
 * timespec, -ENOMEM if the async data cannot be allocated, -EFAULT if the
 * timespec cannot be copied from user space.
 */
static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                           bool is_timeout_link)
{
        struct io_timeout_data *data;
        unsigned flags;
        u32 off = READ_ONCE(sqe->off);

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
            sqe->splice_fd_in)
                return -EINVAL;
        if (off && is_timeout_link)
                return -EINVAL;
        flags = READ_ONCE(sqe->timeout_flags);
        if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
                return -EINVAL;
        /* more than one clock specified is invalid, obviously */
        if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
                return -EINVAL;

        INIT_LIST_HEAD(&req->timeout.list);
        req->timeout.off = off;
        if (unlikely(off && !req->ctx->off_timeout_used))
                req->ctx->off_timeout_used = true;

        if (!req->async_data && io_alloc_async_data(req))
                return -ENOMEM;

        data = req->async_data;
        data->req = req;
        data->flags = flags;

        if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
                return -EFAULT;

        /*
         * The timespec comes straight from user space; refuse negative
         * values before they are converted to ktime and handed to the
         * hrtimer code.
         */
        if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
                return -EINVAL;

        data->mode = io_translate_timeout_mode(flags);
        hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);

        if (is_timeout_link) {
                struct io_submit_link *link = &req->ctx->submit_state.link;

                /* a linked timeout needs a preceding request to attach to */
                if (!link->head)
                        return -EINVAL;
                /* and that request must not itself be a linked timeout */
                if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
                        return -EINVAL;
                req->timeout.head = link->last;
                link->last->flags |= REQ_F_ARM_LTIMEOUT;
        }
        return 0;
}

/*
 * Issue handler for timeout requests: insert @req into ctx->timeout_list
 * and start its timer, all under ->timeout_lock. Always returns 0.
 * @issue_flags is not used here.
 */
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_timeout_data *data = req->async_data;
        struct list_head *entry;
        u32 tail, off = req->timeout.off;

        spin_lock_irq(&ctx->timeout_lock);

        /*
         * sqe->off holds how many events that need to occur for this
         * timeout event to be satisfied. If it isn't set, then this is
         * a pure timeout request, sequence isn't used.
         */
        if (io_is_timeout_noseq(req)) {
                /* no sequence: simply append after the current last entry */
                entry = ctx->timeout_list.prev;
                goto add;
        }

        tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
        req->timeout.target_seq = tail + off;

        /* Update the last seq here in case io_flush_timeouts() hasn't.
         * This is safe because ->completion_lock is held, and submissions
         * and completions are never mixed in the same ->completion_lock section.
         * NOTE(review): only ->timeout_lock is taken in this function;
         * confirm whether the lock named above is still the right one.
         */
        ctx->cq_last_tm_flush = tail;

        /*
         * Insertion sort, ensuring the first entry in the list is always
         * the one we need first.
         */
        list_for_each_prev(entry, &ctx->timeout_list) {
                struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
                                                  timeout.list);

                if (io_is_timeout_noseq(nxt))
                        continue;
                /* nxt.seq is behind @tail, otherwise would've been completed */
                if (off >= nxt->timeout.target_seq - tail)
                        break;
        }
add:
        /* list_add() places @req right after @entry */
        list_add(&req->timeout.list, entry);
        data->timer.function = io_timeout_fn;
        hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
        spin_unlock_irq(&ctx->timeout_lock);
        return 0;
}

/* Match criteria handed to io_cancel_cb() when cancelling io-wq work */
struct io_cancel_data {
        /* ring the request must belong to */
        struct io_ring_ctx *ctx;
        /* user_data the request must carry */
        u64 user_data;
};

static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_cancel_data *cd = data;

        return req->ctx == cd->ctx && req->user_data == cd->user_data;
}

/*
 * Try to cancel, in @tctx's io-wq, the work item matching @ctx and
 * @user_data.
 *
 * Returns 0 if it was cancelled, -EALREADY if it is already running, and
 * -ENOENT if nothing matched or @tctx has no io-wq.
 */
static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
                               struct io_ring_ctx *ctx)
{
        struct io_cancel_data match = { .ctx = ctx, .user_data = user_data, };

        if (!tctx || !tctx->io_wq)
                return -ENOENT;

        switch (io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &match, false)) {
        case IO_WQ_CANCEL_RUNNING:
                return -EALREADY;
        case IO_WQ_CANCEL_NOTFOUND:
                return -ENOENT;
        case IO_WQ_CANCEL_OK:
        default:
                return 0;
        }
}

/*
 * Try to cancel whatever request carries user_data @sqe_addr, looking in
 * turn at the submitting task's io-wq, the timeout list and the poll hash.
 * The search stops at the first stage that returns anything but -ENOENT.
 */
static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
{
        struct io_ring_ctx *ctx = req->ctx;
        int ret;

        WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);

        ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
        if (ret != -ENOENT)
                return ret;

        spin_lock(&ctx->completion_lock);

        spin_lock_irq(&ctx->timeout_lock);
        ret = io_timeout_cancel(ctx, sqe_addr);
        spin_unlock_irq(&ctx->timeout_lock);

        /* not a timeout either: last resort is the poll hash */
        if (ret == -ENOENT)
                ret = io_poll_cancel(ctx, sqe_addr, false);

        spin_unlock(&ctx->completion_lock);
        return ret;
}

/*
 * Validate an async-cancel SQE and record the user_data to cancel
 * (sqe->addr) in req->cancel.addr.
 */
static int io_async_cancel_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
{
        const unsigned int bad_req_flags = REQ_F_FIXED_FILE |
                                           REQ_F_BUFFER_SELECT;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & bad_req_flags))
                return -EINVAL;
        /* every SQE field this opcode doesn't use must be zero */
        if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
            sqe->splice_fd_in)
                return -EINVAL;

        req->cancel.addr = READ_ONCE(sqe->addr);
        return 0;
}

static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        u64 sqe_addr = req->cancel.addr;
        struct io_tctx_node *node;
        int ret;

        ret = io_try_cancel_userdata(req, sqe_addr);
        if (ret != -ENOENT)
                goto done;

        /* slow path, try all io-wq's */
        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = -ENOENT;
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;

                ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
                if (ret != -ENOENT)
                        break;
        }
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
done:
        if (ret < 0)
                req_set_fail(req);
        io_req_complete_post(req, ret, 0);
        return 0;
}

static int io_rsrc_update_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
{
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        req->rsrc_update.offset = READ_ONCE(sqe->off);
        req->rsrc_update.nr_args = READ_ONCE(sqe->len);
        if (!req->rsrc_update.nr_args)
                return -EINVAL;
        req->rsrc_update.arg = READ_ONCE(sqe->addr);
        return 0;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_rsrc_update2 up;
        int ret;

        up.offset = req->rsrc_update.offset;
        up.data = req->rsrc_update.arg;
        up.nr = 0;
        up.tags = 0;
        up.resv = 0;
        up.resv2 = 0;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                        &up, req->rsrc_update.nr_args);
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Dispatch @req to the prep handler for its opcode.
 *
 * Returns whatever the per-opcode handler returns. An opcode with no case
 * below logs a one-time warning and yields -EINVAL.
 */
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        switch (req->opcode) {
        case IORING_OP_NOP:
                /* NOP takes no flags */
                if (READ_ONCE(sqe->rw_flags))
                        return -EINVAL;
                return 0;
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
        case IORING_OP_READ:
                return io_read_prep(req, sqe);
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
        case IORING_OP_WRITE:
                return io_write_prep(req, sqe);
        case IORING_OP_POLL_ADD:
                return io_poll_add_prep(req, sqe);
        case IORING_OP_POLL_REMOVE:
                /* poll removal and poll update share one prep handler */
                return io_poll_update_prep(req, sqe);
        case IORING_OP_FSYNC:
                return io_fsync_prep(req, sqe);
        case IORING_OP_SYNC_FILE_RANGE:
                return io_sfr_prep(req, sqe);
        case IORING_OP_SENDMSG:
        case IORING_OP_SEND:
                return io_sendmsg_prep(req, sqe);
        case IORING_OP_RECVMSG:
        case IORING_OP_RECV:
                return io_recvmsg_prep(req, sqe);
        case IORING_OP_CONNECT:
                return io_connect_prep(req, sqe);
        case IORING_OP_TIMEOUT:
                return io_timeout_prep(req, sqe, false);
        case IORING_OP_TIMEOUT_REMOVE:
                return io_timeout_remove_prep(req, sqe);
        case IORING_OP_ASYNC_CANCEL:
                return io_async_cancel_prep(req, sqe);
        case IORING_OP_LINK_TIMEOUT:
                /* same handler as IORING_OP_TIMEOUT, in linked mode */
                return io_timeout_prep(req, sqe, true);
        case IORING_OP_ACCEPT:
                return io_accept_prep(req, sqe);
        case IORING_OP_FALLOCATE:
                return io_fallocate_prep(req, sqe);
        case IORING_OP_OPENAT:
                return io_openat_prep(req, sqe);
        case IORING_OP_CLOSE:
                return io_close_prep(req, sqe);
        case IORING_OP_FILES_UPDATE:
                return io_rsrc_update_prep(req, sqe);
        case IORING_OP_STATX:
                return io_statx_prep(req, sqe);
        case IORING_OP_FADVISE:
                return io_fadvise_prep(req, sqe);
        case IORING_OP_MADVISE:
                return io_madvise_prep(req, sqe);
        case IORING_OP_OPENAT2:
                return io_openat2_prep(req, sqe);
        case IORING_OP_EPOLL_CTL:
                return io_epoll_ctl_prep(req, sqe);
        case IORING_OP_SPLICE:
                return io_splice_prep(req, sqe);
        case IORING_OP_PROVIDE_BUFFERS:
                return io_provide_buffers_prep(req, sqe);
        case IORING_OP_REMOVE_BUFFERS:
                return io_remove_buffers_prep(req, sqe);
        case IORING_OP_TEE:
                return io_tee_prep(req, sqe);
        case IORING_OP_SHUTDOWN:
                return io_shutdown_prep(req, sqe);
        case IORING_OP_RENAMEAT:
                return io_renameat_prep(req, sqe);
        case IORING_OP_UNLINKAT:
                return io_unlinkat_prep(req, sqe);
        }

        /* reached only for an opcode without a case above */
        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                        req->opcode);
        return -EINVAL;
}

static int io_req_prep_async(struct io_kiocb *req)
{
        if (!io_op_defs[req->opcode].needs_async_setup)
                return 0;
        if (WARN_ON_ONCE(req->async_data))
                return -EFAULT;
        if (io_alloc_async_data(req))
                return -EAGAIN;

        switch (req->opcode) {
        case IORING_OP_READV:
                return io_rw_prep_async(req, READ);
        case IORING_OP_WRITEV:
                return io_rw_prep_async(req, WRITE);
        case IORING_OP_SENDMSG:
                return io_sendmsg_prep_async(req);
        case IORING_OP_RECVMSG:
                return io_recvmsg_prep_async(req);
        case IORING_OP_CONNECT:
                return io_connect_prep_async(req);
        }
        printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
                    req->opcode);
        return -EFAULT;
}

/*
 * Compute the sequence number of @req: the ring's cached_sq_head minus one
 * for @req itself and for every request linked after it.
 */
static u32 io_get_sequence(struct io_kiocb *req)
{
        struct io_kiocb *pos;
        u32 seq = req->ctx->cached_sq_head;

        /* cached_sq_head was bumped once per request, undo that per link member */
        io_for_each_link(pos, req)
                seq--;

        return seq;
}

/*
 * Decide whether @req has to be held back for drain ordering.
 *
 * Returns true if the request was taken over here (completed with an
 * error, handed to async work, or appended to ctx->defer_list), in which
 * case the caller must not issue it. Returns false if the caller should
 * go on and queue the request itself.
 */
static bool io_drain_req(struct io_kiocb *req)
{
        struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
        int ret;
        u32 seq;

        /* a request already marked failed is completed right away */
        if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
                return true;
        }

        /*
         * If we need to drain a request in the middle of a link, drain the
         * head request and the next request/link after the current link.
         * Considering sequential execution of links, IOSQE_IO_DRAIN will be
         * maintained for every request of our link.
         */
        if (ctx->drain_next) {
                req->flags |= REQ_F_IO_DRAIN;
                ctx->drain_next = false;
        }
        /* not interested in head, start from the first linked */
        io_for_each_link(pos, req->link) {
                if (pos->flags & REQ_F_IO_DRAIN) {
                        ctx->drain_next = true;
                        req->flags |= REQ_F_IO_DRAIN;
                        break;
                }
        }

        /* Still need defer if there is pending req in defer list. */
        spin_lock(&ctx->completion_lock);
        if (likely(list_empty_careful(&ctx->defer_list) &&
                !(req->flags & REQ_F_IO_DRAIN))) {
                spin_unlock(&ctx->completion_lock);
                /* nothing deferred and no drain flag: turn drain handling off */
                ctx->drain_active = false;
                return false;
        }
        spin_unlock(&ctx->completion_lock);

        seq = io_get_sequence(req);
        /* Still a chance to pass the sequence check */
        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
                return false;

        /* the request will not run inline from here on, so prepare it for async */
        ret = io_req_prep_async(req);
        if (ret)
                goto fail;
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de) {
                ret = -ENOMEM;
fail:
                /* shared failure exit for prep errors and allocation failure */
                io_req_complete_failed(req, ret);
                return true;
        }

        /* re-check under the lock; the state may have changed since the test above */
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                kfree(de);
                io_queue_async_work(req, NULL);
                return true;
        }

        trace_io_uring_defer(ctx, req, req->user_data);
        de->req = req;
        de->seq = seq;
        list_add_tail(&de->list, &ctx->defer_list);
        spin_unlock(&ctx->completion_lock);
        return true;
}

/*
 * Release the resources @req still holds, as indicated by its REQ_F_*
 * flags and opcode, then clear the IO_REQ_CLEAN_FLAGS bits.
 */
static void io_clean_op(struct io_kiocb *req)
{
        /* free the selected buffer; the field holding it depends on the opcode */
        if (req->flags & REQ_F_BUFFER_SELECTED) {
                switch (req->opcode) {
                case IORING_OP_RECV:
                case IORING_OP_RECVMSG:
                        kfree(req->sr_msg.kbuf);
                        break;
                case IORING_OP_READ:
                case IORING_OP_READ_FIXED:
                case IORING_OP_READV:
                        kfree((void *)(unsigned long)req->rw.addr);
                        break;
                }
        }

        /* opcode-specific state left behind by prep/issue */
        if (req->flags & REQ_F_NEED_CLEANUP) {
                struct io_async_msghdr *kmsg;
                struct io_async_rw *rw;

                switch (req->opcode) {
                case IORING_OP_READ:
                case IORING_OP_READ_FIXED:
                case IORING_OP_READV:
                case IORING_OP_WRITE:
                case IORING_OP_WRITE_FIXED:
                case IORING_OP_WRITEV:
                        rw = req->async_data;
                        kfree(rw->free_iovec);
                        break;
                case IORING_OP_SENDMSG:
                case IORING_OP_RECVMSG:
                        kmsg = req->async_data;
                        kfree(kmsg->free_iov);
                        break;
                case IORING_OP_OPENAT:
                case IORING_OP_OPENAT2:
                        if (req->open.filename)
                                putname(req->open.filename);
                        break;
                case IORING_OP_RENAMEAT:
                        putname(req->rename.oldpath);
                        putname(req->rename.newpath);
                        break;
                case IORING_OP_UNLINKAT:
                        putname(req->unlink.filename);
                        break;
                }
        }

        /* async poll state, including the optional second poll entry */
        if ((req->flags & REQ_F_POLLED) && req->apoll) {
                kfree(req->apoll->double_poll);
                kfree(req->apoll);
                req->apoll = NULL;
        }

        /* request was counted as inflight for its task, drop that count */
        if (req->flags & REQ_F_INFLIGHT)
                atomic_dec(&req->task->io_uring->inflight_tracked);

        if (req->flags & REQ_F_CREDS)
                put_cred(req->creds);

        req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

/*
 * Dispatch @req to the handler for its opcode, passing @issue_flags through.
 *
 * If the request carries its own credentials (REQ_F_CREDS) and they differ
 * from current_cred(), they are installed around the handler call and
 * reverted afterwards. An opcode without a case below yields -EINVAL.
 *
 * Returns the handler's non-zero result unchanged. On a zero result, a
 * request that has a file on an IORING_SETUP_IOPOLL ring is additionally
 * passed to io_iopoll_req_issued() before 0 is returned.
 */
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        const struct cred *creds = NULL;
        int ret;

        /* creds stays NULL unless an override actually happened */
        if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
                creds = override_creds(req->creds);

        switch (req->opcode) {
        case IORING_OP_NOP:
                ret = io_nop(req, issue_flags);
                break;
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
        case IORING_OP_READ:
                ret = io_read(req, issue_flags);
                break;
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
        case IORING_OP_WRITE:
                ret = io_write(req, issue_flags);
                break;
        case IORING_OP_FSYNC:
                ret = io_fsync(req, issue_flags);
                break;
        case IORING_OP_POLL_ADD:
                ret = io_poll_add(req, issue_flags);
                break;
        case IORING_OP_POLL_REMOVE:
                /* note: POLL_REMOVE is served by the poll update handler */
                ret = io_poll_update(req, issue_flags);
                break;
        case IORING_OP_SYNC_FILE_RANGE:
                ret = io_sync_file_range(req, issue_flags);
                break;
        case IORING_OP_SENDMSG:
                ret = io_sendmsg(req, issue_flags);
                break;
        case IORING_OP_SEND:
                ret = io_send(req, issue_flags);
                break;
        case IORING_OP_RECVMSG:
                ret = io_recvmsg(req, issue_flags);
                break;
        case IORING_OP_RECV:
                ret = io_recv(req, issue_flags);
                break;
        case IORING_OP_TIMEOUT:
                ret = io_timeout(req, issue_flags);
                break;
        case IORING_OP_TIMEOUT_REMOVE:
                ret = io_timeout_remove(req, issue_flags);
                break;
        case IORING_OP_ACCEPT:
                ret = io_accept(req, issue_flags);
                break;
        case IORING_OP_CONNECT:
                ret = io_connect(req, issue_flags);
                break;
        case IORING_OP_ASYNC_CANCEL:
                ret = io_async_cancel(req, issue_flags);
                break;
        case IORING_OP_FALLOCATE:
                ret = io_fallocate(req, issue_flags);
                break;
        case IORING_OP_OPENAT:
                ret = io_openat(req, issue_flags);
                break;
        case IORING_OP_CLOSE:
                ret = io_close(req, issue_flags);
                break;
        case IORING_OP_FILES_UPDATE:
                ret = io_files_update(req, issue_flags);
                break;
        case IORING_OP_STATX:
                ret = io_statx(req, issue_flags);
                break;
        case IORING_OP_FADVISE:
                ret = io_fadvise(req, issue_flags);
                break;
        case IORING_OP_MADVISE:
                ret = io_madvise(req, issue_flags);
                break;
        case IORING_OP_OPENAT2:
                ret = io_openat2(req, issue_flags);
                break;
        case IORING_OP_EPOLL_CTL:
                ret = io_epoll_ctl(req, issue_flags);
                break;
        case IORING_OP_SPLICE:
                ret = io_splice(req, issue_flags);
                break;
        case IORING_OP_PROVIDE_BUFFERS:
                ret = io_provide_buffers(req, issue_flags);
                break;
        case IORING_OP_REMOVE_BUFFERS:
                ret = io_remove_buffers(req, issue_flags);
                break;
        case IORING_OP_TEE:
                ret = io_tee(req, issue_flags);
                break;
        case IORING_OP_SHUTDOWN:
                ret = io_shutdown(req, issue_flags);
                break;
        case IORING_OP_RENAMEAT:
                ret = io_renameat(req, issue_flags);
                break;
        case IORING_OP_UNLINKAT:
                ret = io_unlinkat(req, issue_flags);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        /* restore the previous credentials before any return path */
        if (creds)
                revert_creds(creds);
        if (ret)
                return ret;
        /* If the op doesn't have a file, we're not polling for it */
        if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
                io_iopoll_req_issued(req);

        return 0;
}

/*
 * Drop a reference on the request embedding @work via
 * io_put_req_find_next(). Returns the work item of the request that call
 * hands back, or NULL if it hands back none.
 */
static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
        struct io_kiocb *next;

        next = io_put_req_find_next(container_of(work, struct io_kiocb, work));
        if (!next)
                return NULL;

        return &next->work;
}

/*
 * io-wq entry point: issue the request embedding @work from worker
 * context. A cancelled work item is failed with -ECANCELED without being
 * issued. Any non-zero result is reported through io_req_task_queue_fail().
 */
static void io_wq_submit_work(struct io_wq_work *work)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_kiocb *ltimeout;
        int err;

        /* one will be dropped by ->io_free_work() after returning to io-wq */
        if (req->flags & REQ_F_REFCOUNT)
                req_ref_get(req);
        else
                __io_req_set_refcount(req, 2);

        ltimeout = io_prep_linked_timeout(req);
        if (ltimeout)
                io_queue_linked_timeout(ltimeout);

        if (work->flags & IO_WQ_WORK_CANCEL) {
                /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
                err = -ECANCELED;
        } else {
                for (;;) {
                        err = io_issue_sqe(req, 0);
                        /*
                         * We can get EAGAIN for polled IO even though we're
                         * forcing a sync submission from here, since we can't
                         * wait for request slots on the block side.
                         */
                        if (err != -EAGAIN)
                                break;
                        if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
                                break;
                        if (io_wq_worker_stopped())
                                break;
                        /*
                         * If REQ_F_NOWAIT is set, then don't wait or retry with
                         * poll. -EAGAIN is final for that case.
                         */
                        if (req->flags & REQ_F_NOWAIT)
                                break;

                        cond_resched();
                }
        }

        /* avoid locking problems by failing it from a clean context */
        if (err)
                io_req_task_queue_fail(req, err);
}

/* Return the address of entry @i in @table's files array (no bounds check). */
static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
                                                       unsigned i)
{
        return table->files + i;
}

/*
 * Return the file pointer stored in fixed-file slot @index of @ctx, with
 * the bits outside FFS_MASK stripped from the stored value.
 */
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
                                              int index)
{
        unsigned long file_ptr;

        file_ptr = io_fixed_file_slot(&ctx->file_table, index)->file_ptr;
        return (struct file *) (file_ptr & FFS_MASK);
}

/*
 * Store @file in @file_slot, OR-ing FFS_* flag bits into the pointer value:
 * FFS_ASYNC_READ / FFS_ASYNC_WRITE when __io_file_supports_nowait() says so
 * for that direction, and FFS_ISREG for a regular file.
 */
static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
{
        unsigned long tagged = (unsigned long) file;

        if (__io_file_supports_nowait(file, READ))
                tagged |= FFS_ASYNC_READ;
        if (__io_file_supports_nowait(file, WRITE))
                tagged |= FFS_ASYNC_WRITE;
        if (S_ISREG(file_inode(file)->i_mode))
                tagged |= FFS_ISREG;

        file_slot->file_ptr = tagged;
}

static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
                                             struct io_kiocb *req, int fd,
                                             unsigned int issue_flags)
{
        struct file *file = NULL;
        unsigned long file_ptr;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        if (unlikely((unsigned int)fd >= ctx->nr_user_files))
                goto out;
        fd = array_index_nospec(fd, ctx->nr_user_files);
        file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
        file = (struct file *) (file_ptr & FFS_MASK);
        file_ptr &= ~FFS_MASK;
        /* mask in overlapping REQ_F and FFS bits */
        req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
        io_req_set_rsrc_node(req);
out:
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return file;
}

static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
                                       struct io_kiocb *req, int fd)
{
        struct file *file = fget(fd);

        trace_io_uring_file_get(ctx, fd);

        /* we don't allow fixed io_uring files */
        if (file && unlikely(file->f_op == &io_uring_fops))
                io_req_track_inflight(req);
        return file;
}

/*
 * Get the file for @req: from the ring's fixed file table when @fixed is
 * set, otherwise from the regular descriptor table.
 */
static inline struct file *io_file_get(struct io_ring_ctx *ctx,
                                       struct io_kiocb *req, int fd, bool fixed,
                                       unsigned int issue_flags)
{
        if (!fixed)
                return io_file_get_normal(ctx, req, fd);

        return io_file_get_fixed(ctx, req, fd, issue_flags);
}

/*
 * Task-work half of a fired linked timeout.
 *
 * Without a saved previous request the timeout completes with -ETIME.
 * Otherwise cancellation of the previous request is attempted (skipped if
 * the task is exiting, leaving -ENOENT), the timeout completes with that
 * result or -ETIME if the result was 0, and the reference on the previous
 * request is dropped. @locked is not used here.
 */
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
        struct io_kiocb *prev = req->timeout.prev;
        int res;

        if (!prev) {
                io_req_complete_post(req, -ETIME, 0);
                return;
        }

        res = -ENOENT;
        if (!(req->task->flags & PF_EXITING))
                res = io_try_cancel_userdata(req, prev->user_data);
        if (!res)
                res = -ETIME;

        io_req_complete_post(req, res, 0);
        io_put_req(prev);
}

/*
 * hrtimer callback for a linked timeout (installed as timer.function by
 * io_queue_linked_timeout()).
 *
 * Under ctx->timeout_lock it detaches the timeout from the request it was
 * guarding, tries to take a reference on that request, and records the
 * outcome in req->timeout.prev (NULL if there was no such request or the
 * reference could not be taken). The remaining work is pushed to task work
 * via io_req_task_link_timeout(). Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
        struct io_kiocb *prev, *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;

        spin_lock_irqsave(&ctx->timeout_lock, flags);
        /* claim the back reference so nobody else acts on it */
        prev = req->timeout.head;
        req->timeout.head = NULL;

        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
        if (prev) {
                io_remove_next_linked(prev);
                /* prev may already be on its way out; only keep it if we got a ref */
                if (!req_ref_inc_not_zero(prev))
                        prev = NULL;
        }
        list_del(&req->timeout.list);
        req->timeout.prev = prev;
        spin_unlock_irqrestore(&ctx->timeout_lock, flags);

        /* finish from task context rather than from the timer callback */
        req->io_task_work.func = io_req_task_link_timeout;
        io_req_task_work_add(req);
        return HRTIMER_NORESTART;
}

/*
 * Arm the hrtimer of linked timeout @req and put it on ctx->ltimeout_list,
 * unless its back reference (timeout.head) is already gone. In either case
 * one reference on @req is dropped before returning.
 */
static void io_queue_linked_timeout(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_timeout_data *td;

        spin_lock_irq(&ctx->timeout_lock);
        /*
         * If the back reference is NULL, then our linked request finished
         * before we got a chance to setup the timer
         */
        if (req->timeout.head) {
                td = req->async_data;
                td->timer.function = io_link_timeout_fn;
                hrtimer_start(&td->timer, timespec64_to_ktime(td->ts),
                              td->mode);
                list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
        }
        spin_unlock_irq(&ctx->timeout_lock);

        /* drop submission reference */
        io_put_req(req);
}

/*
 * Try to issue @req inline without blocking and handle the outcome:
 *
 *  - success: a request flagged REQ_F_COMPLETE_INLINE is stashed in the
 *    submit state's compl_reqs batch (flushed when the batch is full);
 *    otherwise its linked timeout, if any, is queued.
 *  - -EAGAIN without REQ_F_NOWAIT: io_arm_poll_handler() decides between
 *    retrying the issue (IO_APOLL_READY), punting to async work
 *    (IO_APOLL_ABORTED), or doing nothing further here for any other
 *    return value.
 *  - anything else: the request is completed as failed with that error.
 */
static void __io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        struct io_kiocb *linked_timeout;
        int ret;

issue_sqe:
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
         * doesn't support non-blocking read/write attempts
         */
        if (likely(!ret)) {
                if (req->flags & REQ_F_COMPLETE_INLINE) {
                        struct io_ring_ctx *ctx = req->ctx;
                        struct io_submit_state *state = &ctx->submit_state;

                        /* batch the completion; flush once the array is full */
                        state->compl_reqs[state->compl_nr++] = req;
                        if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
                                io_submit_flush_completions(ctx);
                        return;
                }

                linked_timeout = io_prep_linked_timeout(req);
                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
                linked_timeout = io_prep_linked_timeout(req);

                switch (io_arm_poll_handler(req)) {
                case IO_APOLL_READY:
                        /* queue the timeout first, then retry the issue from the top */
                        if (linked_timeout)
                                io_queue_linked_timeout(linked_timeout);
                        goto issue_sqe;
                case IO_APOLL_ABORTED:
                        /*
                         * Queued up for async execution, worker will release
                         * submit reference when the iocb is actually submitted.
                         */
                        io_queue_async_work(req, NULL);
                        break;
                }

                /* reached for every poll result except IO_APOLL_READY */
                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else {
                io_req_complete_failed(req, ret);
        }
}

/*
 * Queue @req for execution.
 *
 * With draining active, io_drain_req() gets the first say and may consume
 * the request. After that: a plain request is issued inline, a request
 * marked REQ_F_FAIL is completed as a failed submission, and a
 * REQ_F_FORCE_ASYNC request is prepared and handed to async work (or
 * failed if the preparation fails).
 */
static inline void io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        int ret;

        if (unlikely(req->ctx->drain_active) && io_drain_req(req))
                return;

        if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
                __io_queue_sqe(req);
                return;
        }

        if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
                return;
        }

        /* only REQ_F_FORCE_ASYNC is left */
        ret = io_req_prep_async(req);
        if (unlikely(ret))
                io_req_complete_failed(req, ret);
        else
                io_queue_async_work(req, NULL);
}

/*
 * Validate an SQE against the ring's registered restrictions.
 *
 * An unrestricted ring accepts everything. Otherwise the opcode must be
 * set in restrictions.sqe_op, every required flag must be present in
 * @sqe_flags, and no flag outside the allowed and required sets may be set.
 *
 * Returns 'true' if the SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req,
                                        unsigned int sqe_flags)
{
        if (likely(!ctx->restricted))
                return true;

        return test_bit(req->opcode, ctx->restrictions.sqe_op) &&
               (sqe_flags & ctx->restrictions.sqe_flags_required) ==
                        ctx->restrictions.sqe_flags_required &&
               !(sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
                               ctx->restrictions.sqe_flags_required));
}

/*
 * Initialise @req from the user-visible @sqe.
 *
 * Returns 0 on success, -EINVAL for unknown flags, an out-of-range opcode
 * or an unknown personality, -EACCES if the ring's restrictions reject the
 * SQE, -EOPNOTSUPP if buffer selection is requested for an opcode that
 * does not support it, and -EBADF if a required file cannot be obtained.
 *
 * Note the -EBADF path still falls through to the ios_left accounting at
 * the end, whereas the earlier errors return immediately.
 */
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state;
        unsigned int sqe_flags;
        int personality, ret = 0;

        /* req is partially pre-initialised, see io_preinit_req() */
        req->opcode = READ_ONCE(sqe->opcode);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags = sqe_flags = READ_ONCE(sqe->flags);
        req->user_data = READ_ONCE(sqe->user_data);
        req->file = NULL;
        req->fixed_rsrc_refs = NULL;
        req->task = current;

        /* enforce forwards compatibility on users */
        if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
                return -EINVAL;
        /* must be range-checked before req->opcode indexes io_op_defs[] below */
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
        if (!io_check_restriction(ctx, req, sqe_flags))
                return -EACCES;

        if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
            !io_op_defs[req->opcode].buffer_select)
                return -EOPNOTSUPP;
        /* seeing a drain request switches the ring into drain handling */
        if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
                ctx->drain_active = true;

        /* a non-zero personality selects credentials registered with the ring */
        personality = READ_ONCE(sqe->personality);
        if (personality) {
                req->creds = xa_load(&ctx->personalities, personality);
                if (!req->creds)
                        return -EINVAL;
                get_cred(req->creds);
                req->flags |= REQ_F_CREDS;
        }
        state = &ctx->submit_state;

        /*
         * Plug now if we have more than 1 IO left after this, and the target
         * is potentially a read/write to block based storage.
         */
        if (!state->plug_started && state->ios_left > 1 &&
            io_op_defs[req->opcode].plug) {
                blk_start_plug(&state->plug);
                state->plug_started = true;
        }

        if (io_op_defs[req->opcode].needs_file) {
                req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
                                        (sqe_flags & IOSQE_FIXED_FILE),
                                        IO_URING_F_NONBLOCK);
                if (unlikely(!req->file))
                        ret = -EBADF;
        }

        state->ios_left--;
        return ret;
}

/*
 * Initialise and prepare @req from @sqe, then either queue it or attach it
 * to the link chain being built in ctx->submit_state.link.
 *
 * Returns 0 in all cases but one: when init or prep fails for a request
 * that neither belongs to a link in progress nor starts one, the request
 * is completed as failed and the error is returned. A failure inside a
 * link is recorded on the request (and on the head) instead, and 0 is
 * returned.
 */
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
                         const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_link *link = &ctx->submit_state.link;
        int ret;

        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret)) {
                /* also the landing point for a failed io_req_prep() below */
fail_req:
                /* fail even hard links since we don't submit */
                if (link->head) {
                        /*
                         * we can judge a link req is failed or cancelled by if
                         * REQ_F_FAIL is set, but the head is an exception since
                         * it may be set REQ_F_FAIL because of other req's failure
                         * so let's leverage req->result to distinguish if a head
                         * is set REQ_F_FAIL because of its failure or other req's
                         * failure so that we can set the correct ret code for it.
                         * init result here to avoid affecting the normal path.
                         */
                        if (!(link->head->flags & REQ_F_FAIL))
                                req_fail_link_node(link->head, -ECANCELED);
                } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        /*
                         * the current req is a normal req, we should return
                         * error and thus break the submission loop.
                         */
                        io_req_complete_failed(req, ret);
                        return ret;
                }
                /* part of a link (existing or about to start): mark it, keep going */
                req_fail_link_node(req, ret);
        } else {
                ret = io_req_prep(req, sqe);
                if (unlikely(ret))
                        goto fail_req;
        }

        /* don't need @sqe from now on */
        trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
                                  req->flags, true,
                                  ctx->flags & IORING_SETUP_SQPOLL);

        /*
         * If we already have a head request, queue this one for async
         * submittal once the head completes. If we don't have a head but
         * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
        if (link->head) {
                struct io_kiocb *head = link->head;

                if (!(req->flags & REQ_F_FAIL)) {
                        ret = io_req_prep_async(req);
                        if (unlikely(ret)) {
                                req_fail_link_node(req, ret);
                                if (!(head->flags & REQ_F_FAIL))
                                        req_fail_link_node(head, -ECANCELED);
                        }
                }
                trace_io_uring_link(ctx, req, head);
                /* append to the tail of the chain */
                link->last->link = req;
                link->last = req;

                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        link->head = NULL;
                        io_queue_sqe(head);
                }
        } else {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        /* first request of a new chain */
                        link->head = req;
                        link->last = req;
                } else {
                        io_queue_sqe(req);
                }
        }

        return 0;
}

/*
 * Batched submission is done: queue a link chain that is still pending,
 * flush batched completions, and finish the block plug if one was started.
 */
static void io_submit_state_end(struct io_submit_state *state,
                                struct io_ring_ctx *ctx)
{
        struct io_kiocb *pending_head = state->link.head;

        if (pending_head)
                io_queue_sqe(pending_head);

        if (state->compl_nr)
                io_submit_flush_completions(ctx);

        if (state->plug_started)
                blk_finish_plug(&state->plug);
}

/*
 * Reset the submission side cache for a batch of at most @max_ios requests.
 */
static void io_submit_state_start(struct io_submit_state *state,
                                  unsigned int max_ios)
{
        /* set only head, no need to init link_last in advance */
        state->link.head = NULL;
        state->ios_left = max_ios;
        state->plug_started = false;
}

/* Publish the kernel's cached SQ head to the shared ring. */
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
        /*
         * Ensure any loads from the SQEs are done at this point,
         * since once we write the new head, the application could
         * write new data to them.
         */
        smp_store_release(&ctx->rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch the next sqe and advance the cached SQ head. Returns NULL when the
 * index read from the SQ array is out of range; such an entry is counted
 * in the ring's sq_dropped.
 *
 * Note the returned pointer refers to memory that is mapped by userspace.
 * Care needs to be taken to ensure that reads are stable, as we cannot
 * rely on userspace always being a good citizen. If members of the sqe are
 * validated and then later used, those reads must go through READ_ONCE()
 * to prevent a re-load down the line.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
        unsigned int mask = ctx->sq_entries - 1;
        unsigned int slot = ctx->cached_sq_head++ & mask;
        unsigned int idx;

        /*
         * The cached sq head (or cq tail) serves two purposes:
         *
         * 1) allows us to batch the cost of updating the user visible
         *    head updates.
         * 2) allows the kernel side to track the head on its own, even
         *    though the application is the one updating it.
         */
        idx = READ_ONCE(ctx->sq_array[slot]);
        if (unlikely(idx >= ctx->sq_entries)) {
                /* drop invalid entries */
                ctx->cq_extra--;
                WRITE_ONCE(ctx->rings->sq_dropped,
                           READ_ONCE(ctx->rings->sq_dropped) + 1);
                return NULL;
        }

        return &ctx->sq_sqes[idx];
}

/*
 * Pull up to @nr entries off the SQ ring and submit them.
 *
 * @nr is first clamped to the ring size and to the number of entries
 * io_sqring_entries() reports. Returns the number of SQEs counted as
 * submitted, or -EAGAIN if the ctx references could not be taken or the
 * very first request could not be allocated. References taken up front
 * for entries that ended up not being submitted are given back before
 * returning.
 */
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        __must_hold(&ctx->uring_lock)
{
        int submitted = 0;

        /* make sure SQ entry isn't read before tail */
        nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
        /* take all ctx and task references for the batch in one go */
        if (!percpu_ref_tryget_many(&ctx->refs, nr))
                return -EAGAIN;
        io_get_task_refs(nr);

        io_submit_state_start(&ctx->submit_state, nr);
        while (submitted < nr) {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;

                req = io_alloc_req(ctx);
                if (unlikely(!req)) {
                        /* only report the failure if nothing was submitted yet */
                        if (!submitted)
                                submitted = -EAGAIN;
                        break;
                }
                sqe = io_get_sqe(ctx);
                if (unlikely(!sqe)) {
                        /* hand the unused request back to the free list */
                        list_add(&req->inflight_entry, &ctx->submit_state.free_list);
                        break;
                }
                /* will complete beyond this point, count as submitted */
                submitted++;
                if (io_submit_sqe(ctx, req, sqe))
                        break;
        }

        if (unlikely(submitted != nr)) {
                /* -EAGAIN means no request consumed a reference */
                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
                int unused = nr - ref_used;

                current->io_uring->cached_refs += unused;
                percpu_ref_put_many(&ctx->refs, unused);
        }

        io_submit_state_end(&ctx->submit_state, ctx);
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);

        return submitted;
}

/* True when any bit is set in @sqd's state word. */
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
{
        return READ_ONCE(sqd->state) != 0;
}

static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
        /* Tell userspace we may need a wakeup call */
        spin_lock(&ctx->completion_lock);
        WRITE_ONCE(ctx->rings->sq_flags,
                   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
        spin_unlock(&ctx->completion_lock);
}

static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
        spin_lock(&ctx->completion_lock);
        WRITE_ONCE(ctx->rings->sq_flags,
                   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
        spin_unlock(&ctx->completion_lock);
}

/*
 * One SQ-thread pass over @ctx: reap iopoll completions if any are
 * pending and submit queued SQEs, all under ctx->uring_lock and with the
 * ring's sq_creds installed when they differ from the current ones.
 *
 * @cap_entries limits the batch to IORING_SQPOLL_CAP_ENTRIES_VALUE.
 * Returns io_submit_sqes()'s result, or 0 if no submission was attempted.
 */
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
        const struct cred *saved_creds = NULL;
        unsigned int to_submit;
        unsigned nr_events = 0;
        int ret = 0;

        to_submit = io_sqring_entries(ctx);
        /* if we're handling multiple rings, cap submit size for fairness */
        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

        /* nothing to poll and nothing to submit */
        if (list_empty(&ctx->iopoll_list) && !to_submit)
                return 0;

        if (ctx->sq_creds != current_cred())
                saved_creds = override_creds(ctx->sq_creds);

        mutex_lock(&ctx->uring_lock);
        if (!list_empty(&ctx->iopoll_list))
                io_do_iopoll(ctx, &nr_events, 0);

        /*
         * Don't submit if refs are dying, good for io_uring_register(),
         * but also it is relied upon by io_ring_exit_work()
         */
        if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
            !(ctx->flags & IORING_SETUP_R_DISABLED))
                ret = io_submit_sqes(ctx, to_submit);
        mutex_unlock(&ctx->uring_lock);

        if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
                wake_up(&ctx->sqo_sq_wait);
        if (saved_creds)
                revert_creds(saved_creds);

        return ret;
}

static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
        struct io_ring_ctx *ctx;
        unsigned sq_thread_idle = 0;

        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
        sqd->sq_thread_idle = sq_thread_idle;
}

static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
        bool did_sig = false;
        struct ksignal ksig;

        if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
            signal_pending(current)) {
                mutex_unlock(&sqd->lock);
                if (signal_pending(current))
                        did_sig = get_signal(&ksig);
                cond_resched();
                mutex_lock(&sqd->lock);
        }
        return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}

/*
 * Body of the SQ polling thread for one io_sq_data.
 *
 * @data: the struct io_sq_data this thread serves
 *
 * The thread names itself "iou-sqp-<task_pid>", restricts itself to
 * sqd->sq_cpu (or to all online CPUs when sq_cpu is -1) and then loops with
 * sqd->lock held: handle park/stop/signal events, run __io_sq_thread() on
 * every attached ring, run task work, and either keep spinning or go to
 * sleep on sqd->wait once the idle timeout has passed.
 *
 * Does not return: the function ends in do_exit(0) after signalling
 * sqd->exited.
 */
static int io_sq_thread(void *data)
{
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
        unsigned long timeout = 0;
        char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);

        snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
        set_task_comm(current, buf);

        if (sqd->sq_cpu != -1)
                set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
        else
                set_cpus_allowed_ptr(current, cpu_online_mask);
        current->flags |= PF_NO_SETAFFINITY;

        mutex_lock(&sqd->lock);
        while (1) {
                bool cap_entries, sqt_spin = false;

                /* park/stop requests and signals are handled first */
                if (io_sqd_events_pending(sqd) || signal_pending(current)) {
                        if (io_sqd_handle_event(sqd))
                                break;
                        timeout = jiffies + sqd->sq_thread_idle;
                }

                /* cap per-ring submission only when more than one ring is attached */
                cap_entries = !list_is_singular(&sqd->ctx_list);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, cap_entries);

                        /* keep spinning if anything was submitted or polled IO is outstanding */
                        if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_run_task_work())
                        sqt_spin = true;

                /* busy, or idle period not over yet: loop again without sleeping */
                if (sqt_spin || !time_after(jiffies, timeout)) {
                        cond_resched();
                        if (sqt_spin)
                                timeout = jiffies + sqd->sq_thread_idle;
                        continue;
                }

                prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
                if (!io_sqd_events_pending(sqd) && !current->task_works) {
                        bool needs_sched = true;

                        /*
                         * Set the wakeup flag on each ring, then re-check
                         * for work; if any ring still has polled IO or SQ
                         * entries, skip the sleep.
                         */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                                io_ring_set_wakeup_flag(ctx);

                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
                                    !list_empty_careful(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }
                                if (io_sqring_entries(ctx)) {
                                        needs_sched = false;
                                        break;
                                }
                        }

                        /* sleep with sqd->lock released */
                        if (needs_sched) {
                                mutex_unlock(&sqd->lock);
                                schedule();
                                mutex_lock(&sqd->lock);
                        }
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }

                finish_wait(&sqd->wait, &wait);
                timeout = jiffies + sqd->sq_thread_idle;
        }

        /*
         * Exit path: cancel outstanding requests, clear sqd->thread, and
         * leave the wakeup flag set on every ring before dropping the lock.
         */
        io_uring_cancel_generic(true, sqd);
        sqd->thread = NULL;
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                io_ring_set_wakeup_flag(ctx);
        io_run_task_work();
        mutex_unlock(&sqd->lock);

        /* io_sq_thread_stop() waits on this completion */
        complete(&sqd->exited);
        do_exit(0);
}

/*
 * Per-waiter state used by io_cqring_wait() while sleeping on ctx->cq_wait.
 */
struct io_wait_queue {
        /* wait queue entry, woken through io_wake_function() */
        struct wait_queue_entry wq;
        /* ring being waited on */
        struct io_ring_ctx *ctx;
        /* CQ tail value to wait for: CQ head at wait start plus min_events */
        unsigned cq_tail;
        /* snapshot of ctx->cq_timeouts taken when the wait started */
        unsigned nr_timeouts;
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
        struct io_ring_ctx *ctx = iowq->ctx;
        int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;

        /*
         * Wake up if we have enough events, or if a timeout occurred since we
         * started waiting. For timeouts, we always want to return to userspace,
         * regardless of event count.
         */
        return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

/*
 * Wake callback installed on the CQ wait queue entry by io_cqring_wait().
 *
 * Delegates to autoremove_wake_function() when the waiter has what it asked
 * for or when bit 0 of ->check_cq_overflow is set; otherwise returns -1.
 */
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
                            int wake_flags, void *key)
{
        struct io_wait_queue *iowq;

        iowq = container_of(curr, struct io_wait_queue, wq);

        /*
         * Cannot safely flush overflowed CQEs from here, ensure we wake up
         * the task, and the next invocation will do it.
         */
        if (!io_should_wake(iowq) &&
            !test_bit(0, &iowq->ctx->check_cq_overflow))
                return -1;

        return autoremove_wake_function(curr, mode, wake_flags, key);
}

/*
 * Run pending task work, then report on pending signals.
 *
 * Returns 1 if task work was run, 0 if there was neither task work nor a
 * pending signal, -ERESTARTSYS if a signal is pending and TIF_NOTIFY_SIGNAL
 * is set, and -EINTR for any other pending signal.
 */
static int io_run_task_work_sig(void)
{
        if (io_run_task_work())
                return 1;

        if (signal_pending(current))
                return test_thread_flag(TIF_NOTIFY_SIGNAL) ? -ERESTARTSYS : -EINTR;

        return 0;
}

/*
 * Returns true when the current task has an io_uring task context whose
 * ->inflight counter reads as positive, false otherwise.
 */
static bool current_pending_io(void)
{
        struct io_uring_task *tctx = current->io_uring;

        return tctx && percpu_counter_read_positive(&tctx->inflight);
}

/*
 * One sleep step of the CQ wait loop.
 *
 * Returns >0 when the caller should retry, 0 when the wait condition is
 * satisfied, -ETIME when the hrtimer sleep timed out, or the negative value
 * reported by io_run_task_work_sig() for a pending signal.
 */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
                                          struct io_wait_queue *iowq,
                                          ktime_t *timeout)
{
        /* make sure we run task_work before checking for signals */
        int ret = io_run_task_work_sig();

        if (ret)
                return ret;
        if (io_should_wake(iowq))
                return 0;

        /* let the caller flush overflows, retry */
        if (test_bit(0, &ctx->check_cq_overflow))
                return 1;

        /*
         * Mark us as being in io_wait if we have pending requests, so cpufreq
         * can take into account that the task is waiting for IO - turns out
         * to be important for low QD IO.
         */
        if (current_pending_io())
                current->in_iowait = 1;

        ret = schedule_hrtimeout(timeout, HRTIMER_MODE_ABS) ? 1 : -ETIME;
        current->in_iowait = 0;
        return ret;
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @ctx:        ring to wait on
 * @min_events: number of completions to wait for
 * @sig/@sigsz: optional user signal mask to install for the duration of the
 *              wait (compat layout is handled under CONFIG_COMPAT)
 * @uts:        optional user timespec; it is added to the current
 *              ktime_get_ns() value to form the absolute wait deadline
 *
 * Returns 0 if enough events were already present or if the CQ ring is
 * non-empty when the wait ends; otherwise the last wait status (e.g.
 * -EBUSY, -ETIME, -EINTR/-ERESTARTSYS), or -EFAULT / a sigmask error from
 * argument handling.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz,
                          struct __kernel_timespec __user *uts)
{
        struct io_wait_queue iowq;
        struct io_rings *rings = ctx->rings;
        ktime_t timeout = KTIME_MAX;
        int ret;

        /*
         * Fast path: flush overflow and run task work until either enough
         * events are available or there is no more task work to run.
         */
        do {
                io_cqring_overflow_flush(ctx);
                if (io_cqring_events(ctx) >= min_events)
                        return 0;
                if (!io_run_task_work())
                        break;
        } while (1);

        if (uts) {
                struct timespec64 ts;

                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                /* relative user timeout -> absolute deadline */
                timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
        }

        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        init_waitqueue_func_entry(&iowq.wq, io_wake_function);
        iowq.wq.private = current;
        INIT_LIST_HEAD(&iowq.wq.entry);
        iowq.ctx = ctx;
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        /* target tail: current CQ head plus the number of events requested */
        iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                /* if we can't even flush overflow, don't wait for more */
                if (!io_cqring_overflow_flush(ctx)) {
                        ret = -EBUSY;
                        break;
                }
                prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
                ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
                finish_wait(&ctx->cq_wait, &iowq.wq);
                cond_resched();
        } while (ret > 0);

        restore_saved_sigmask_unless(ret == -EINTR);

        /* an error is only reported when the CQ ring is empty */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

/*
 * Free a two-level table produced by io_alloc_page_table().
 *
 * @table: array of chunk pointers
 * @size:  total payload size in bytes that was passed at allocation time;
 *         it determines how many chunk pointers are freed
 */
static void io_free_page_table(void **table, size_t size)
{
        unsigned nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        unsigned idx = 0;

        while (idx < nr_tables)
                kfree(table[idx++]);
        kfree(table);
}

/*
 * Allocate @size bytes of zeroed storage as an array of chunks of at most
 * PAGE_SIZE bytes each.
 *
 * Returns the array of chunk pointers, or NULL if any allocation fails (in
 * which case everything allocated so far is released).
 */
static void **io_alloc_page_table(size_t size)
{
        unsigned idx, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        size_t remaining = size;
        void **table;

        table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (!table)
                return NULL;

        for (idx = 0; idx < nr_tables; idx++) {
                unsigned int chunk = min_t(size_t, remaining, PAGE_SIZE);

                table[idx] = kzalloc(chunk, GFP_KERNEL_ACCOUNT);
                if (!table[idx])
                        goto free_table;
                remaining -= chunk;
        }
        return table;

free_table:
        /* slots not yet filled are still NULL from kcalloc() */
        io_free_page_table(table, size);
        return NULL;
}

/*
 * Tear down a resource node: release its percpu reference state with
 * percpu_ref_exit(), then free the node itself.
 */
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
}

/*
 * percpu_ref release callback for a resource node (registered by
 * io_rsrc_node_alloc()).
 *
 * Marks the node done and, under ->rsrc_ref_lock, moves every leading
 * "done" node of ctx->rsrc_ref_list onto ctx->rsrc_put_llist. If that made
 * the llist non-empty, ctx->rsrc_put_work is (re)scheduled: after HZ
 * jiffies normally, or immediately when the node's rsrc_data is being
 * quiesced.
 */
static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
        struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
        struct io_ring_ctx *ctx = node->rsrc_data->ctx;
        unsigned long flags;
        bool first_add = false;
        unsigned long delay = HZ;

        spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
        node->done = true;

        /* if we are mid-quiesce then do not delay */
        if (node->rsrc_data->quiesce)
                delay = 0;

        /* note: 'node' is reused below as the list cursor */
        while (!list_empty(&ctx->rsrc_ref_list)) {
                node = list_first_entry(&ctx->rsrc_ref_list,
                                            struct io_rsrc_node, node);
                /* recycle ref nodes in order */
                if (!node->done)
                        break;
                list_del(&node->node);
                /* llist_add() returns true when the list was empty before */
                first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
        }
        spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

        if (first_add)
                mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

/*
 * Allocate and initialise a resource node: zeroed memory, a percpu ref
 * whose release callback is io_rsrc_node_ref_zero(), empty ->node and
 * ->rsrc_list list heads, and ->done cleared.
 *
 * Returns the new node, or NULL on allocation failure.
 */
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
        struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

        if (!node)
                return NULL;
        if (percpu_ref_init(&node->refs, io_rsrc_node_ref_zero,
                            0, GFP_KERNEL))
                goto free_node;

        INIT_LIST_HEAD(&node->node);
        INIT_LIST_HEAD(&node->rsrc_list);
        node->done = false;
        return node;

free_node:
        kfree(node);
        return NULL;
}

/*
 * Retire the ring's current resource node (if @data_to_kill is given) and
 * install the pre-allocated backup node as the new current node.
 *
 * @ctx:          ring whose ->rsrc_node is switched
 * @data_to_kill: resource data the retired node is attached to, or NULL to
 *                only install the backup node when no current node exists
 *
 * A backup node is expected to be present; callers in this file obtain it
 * with io_rsrc_node_switch_start() beforehand.
 */
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
                                struct io_rsrc_data *data_to_kill)
{
        WARN_ON_ONCE(!ctx->rsrc_backup_node);
        WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

        if (data_to_kill) {
                struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

                /* queue the old node on the ordered list of retiring nodes */
                rsrc_node->rsrc_data = data_to_kill;
                spin_lock_irq(&ctx->rsrc_ref_lock);
                list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
                spin_unlock_irq(&ctx->rsrc_ref_lock);

                /* this data reference is dropped again in __io_rsrc_put_work() */
                atomic_inc(&data_to_kill->refs);
                percpu_ref_kill(&rsrc_node->refs);
                ctx->rsrc_node = NULL;
        }

        if (!ctx->rsrc_node) {
                ctx->rsrc_node = ctx->rsrc_backup_node;
                ctx->rsrc_backup_node = NULL;
        }
}

/*
 * Make sure ctx->rsrc_backup_node exists, allocating one if needed.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
        if (!ctx->rsrc_backup_node) {
                ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
                if (!ctx->rsrc_backup_node)
                        return -ENOMEM;
        }
        return 0;
}

/*
 * Wait until all references to @data have been dropped.
 *
 * The function releases and re-acquires ctx->uring_lock while waiting, so
 * it presumably must be entered with that lock held (confirm against
 * callers). data->quiesce is set for the duration of the call.
 *
 * Returns 0 once quiesced, -ENXIO if a quiesce of @data is already in
 * progress, -ENOMEM if no backup node could be allocated, or the negative
 * value from io_run_task_work_sig() when interrupted by a signal.
 */
static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
{
        int ret;

        /* As we may drop ->uring_lock, other task may have started quiesce */
        if (data->quiesce)
                return -ENXIO;

        data->quiesce = true;
        do {
                ret = io_rsrc_node_switch_start(ctx);
                if (ret)
                        break;
                io_rsrc_node_switch(ctx, data);

                /* kill initial ref, already quiesced if zero */
                if (atomic_dec_and_test(&data->refs))
                        break;
                mutex_unlock(&ctx->uring_lock);
                flush_delayed_work(&ctx->rsrc_put_work);
                ret = wait_for_completion_interruptible(&data->done);
                if (!ret) {
                        mutex_lock(&ctx->uring_lock);
                        if (atomic_read(&data->refs) > 0) {
                                /*
                                 * it has been revived by another thread while
                                 * we were unlocked
                                 */
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                break;
                        }
                }

                /* not quiesced: restore the initial ref before retrying or bailing out */
                atomic_inc(&data->refs);
                /* wait for all works potentially completing data->done */
                flush_delayed_work(&ctx->rsrc_put_work);
                reinit_completion(&data->done);

                /* a negative result (pending signal) ends the loop */
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
        } while (ret >= 0);
        data->quiesce = false;

        return ret;
}

/*
 * Return a pointer to the tag for resource index @idx in the two-level
 * data->tags table: the high bits of @idx select the chunk, the low bits
 * (IO_RSRC_TAG_TABLE_MASK) select the entry within it.
 */
static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
{
        u64 *chunk = data->tags[idx >> IO_RSRC_TAG_TABLE_SHIFT];

        return chunk + (idx & IO_RSRC_TAG_TABLE_MASK);
}

/*
 * Free a resource data object together with its tag table, if one was
 * allocated.
 */
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
        if (data->tags) {
                size_t tag_bytes = data->nr * sizeof(data->tags[0][0]);

                io_free_page_table((void **)data->tags, tag_bytes);
        }
        kfree(data);
}

/*
 * Allocate a resource data object with room for @nr tags.
 *
 * @ctx:    owning ring
 * @do_put: callback stored in the object for releasing a single resource
 * @utags:  optional user array of @nr tags to copy in; when NULL the tags
 *          stay zero
 * @nr:     number of resources
 * @pdata:  receives the new object on success
 *
 * The object starts with one reference and an initialised ->done
 * completion.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EFAULT if the
 * user tags cannot be read.
 */
static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
                              u64 __user *utags, unsigned nr,
                              struct io_rsrc_data **pdata)
{
        struct io_rsrc_data *data;
        unsigned idx;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;
        data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
        if (!data->tags) {
                kfree(data);
                return -ENOMEM;
        }

        data->nr = nr;
        data->ctx = ctx;
        data->do_put = do_put;

        /* copy in user-supplied tags, one slot at a time */
        for (idx = 0; utags && idx < nr; idx++) {
                u64 *slot = io_get_tag_slot(data, idx);

                if (copy_from_user(slot, &utags[idx], sizeof(*slot))) {
                        io_rsrc_data_free(data);
                        return -EFAULT;
                }
        }

        atomic_set(&data->refs, 1);
        init_completion(&data->done);
        *pdata = data;
        return 0;
}

static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
        table->files = kvcalloc(nr_files, sizeof(table->files[0]),
                                GFP_KERNEL_ACCOUNT);
        return !!table->files;
}

/*
 * Release the slot array allocated by io_alloc_file_tables() and reset the
 * pointer to NULL.
 */
static void io_free_file_tables(struct io_file_table *table)
{
        kvfree(table->files);
        table->files = NULL;
}

/*
 * Drop every registered file of @ctx and free the file table and its
 * resource data. Afterwards ctx->file_data is NULL and ctx->nr_user_files
 * is 0.
 */
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        int idx;

        for (idx = 0; idx < ctx->nr_user_files; idx++) {
                struct file *file = io_file_from_index(ctx, idx);

                /* empty slot */
                if (!file)
                        continue;
                fput(file);
        }

        io_free_file_tables(&ctx->file_table);
        io_rsrc_data_free(ctx->file_data);
        ctx->file_data = NULL;
        ctx->nr_user_files = 0;
}

/*
 * Unregister the ring's fixed file table.
 *
 * Quiesces ctx->file_data first and tears the table down only if that
 * succeeds.
 *
 * Returns 0 on success, -ENXIO if no file table is registered, or the error
 * from io_rsrc_ref_quiesce().
 */
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        unsigned nr = ctx->nr_user_files;
        int ret;

        if (!ctx->file_data)
                return -ENXIO;

        /*
         * Quiesce may unlock ->uring_lock, and while it's not held
         * prevent new requests using the table.
         */
        ctx->nr_user_files = 0;
        ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
        /* restore the real count: needed for teardown, and kept if quiesce failed */
        ctx->nr_user_files = nr;
        if (!ret)
                __io_sqe_files_unregister(ctx);
        return ret;
}

/*
 * Undo one io_sq_thread_park(): drop this caller's park_pending count and
 * release sqd->lock. The park bit is left set if other park requests are
 * still pending. Must not be called from the SQ thread itself.
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
        __releases(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        /*
         * Do the dance but not conditional clear_bit() because it'd race with
         * other threads incrementing park_pending and setting the bit.
         */
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        if (atomic_dec_return(&sqd->park_pending))
                set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_unlock(&sqd->lock);
}

/*
 * Ask the SQ thread to park and take sqd->lock.
 *
 * Registers a pending park request, sets IO_SQ_THREAD_SHOULD_PARK, acquires
 * sqd->lock and wakes the SQ thread if one exists. Returns with sqd->lock
 * held; pair with io_sq_thread_unpark(). Must not be called from the SQ
 * thread itself.
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
        __acquires(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        atomic_inc(&sqd->park_pending);
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
}

/*
 * Tell the SQ thread to stop and wait until it has exited.
 *
 * Sets IO_SQ_THREAD_SHOULD_STOP, wakes the thread (if any) under sqd->lock,
 * then blocks on the sqd->exited completion. Warns if called from the SQ
 * thread itself or if the stop bit was already set.
 */
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
        WARN_ON_ONCE(sqd->thread == current);
        WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
        mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
        mutex_unlock(&sqd->lock);
        /* completed by io_sq_thread() on its way out */
        wait_for_completion(&sqd->exited);
}

/*
 * Drop one reference to @sqd. When the last reference goes away the SQ
 * thread is stopped and the structure is freed.
 */
static void io_put_sq_data(struct io_sq_data *sqd)
{
        if (!refcount_dec_and_test(&sqd->refs))
                return;

        /* last user gone: nobody should still be asking for a park */
        WARN_ON_ONCE(atomic_read(&sqd->park_pending));

        io_sq_thread_stop(sqd);
        kfree(sqd);
}

/*
 * Detach @ctx from its SQ data, if it has any: with the SQ thread parked,
 * remove the ring from the sqd's ring list and refresh the idle time, then
 * drop the ring's reference on the sqd and clear ctx->sq_data.
 */
static void io_sq_thread_finish(struct io_ring_ctx *ctx)
{
        struct io_sq_data *sqd = ctx->sq_data;

        if (!sqd)
                return;

        io_sq_thread_park(sqd);
        list_del_init(&ctx->sqd_list);
        io_sqd_update_thread_idle(sqd);
        io_sq_thread_unpark(sqd);

        io_put_sq_data(sqd);
        ctx->sq_data = NULL;
}

static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
        struct io_ring_ctx *ctx_attach;
        struct io_sq_data *sqd;
        struct fd f;

        f = fdget(p->wq_fd);
        if (!f.file)
                return ERR_PTR(-ENXIO);
        if (f.file->f_op != &io_uring_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        ctx_attach = f.file->private_data;
        sqd = ctx_attach->sq_data;
        if (!sqd) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }
        if (sqd->task_tgid != current->tgid) {
                fdput(f);
                return ERR_PTR(-EPERM);
        }

        refcount_inc(&sqd->refs);
        fdput(f);
        return sqd;
}

/*
 * Obtain the SQ data for a new ring.
 *
 * With IORING_SETUP_ATTACH_WQ an existing sqd is attached via
 * io_attach_sq_data(); if that fails with -EPERM a fresh sqd is set up
 * instead, any other error is passed back. Without the flag a fresh sqd is
 * always allocated.
 *
 * @attached is set to true only when an existing sqd was attached.
 *
 * Returns the sqd or an ERR_PTR (-ENOMEM on allocation failure).
 */
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
                                         bool *attached)
{
        struct io_sq_data *sqd;

        *attached = false;
        if (p->flags & IORING_SETUP_ATTACH_WQ) {
                sqd = io_attach_sq_data(p);
                if (IS_ERR(sqd)) {
                        if (PTR_ERR(sqd) != -EPERM)
                                return sqd;
                        /* fall through for EPERM case, setup new sqd/task */
                } else {
                        *attached = true;
                        return sqd;
                }
        }

        sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
        if (!sqd)
                return ERR_PTR(-ENOMEM);

        refcount_set(&sqd->refs, 1);
        atomic_set(&sqd->park_pending, 0);
        mutex_init(&sqd->lock);
        INIT_LIST_HEAD(&sqd->ctx_list);
        init_waitqueue_head(&sqd->wait);
        init_completion(&sqd->exited);
        return sqd;
}

/*
 * Resource "put" callback for fixed files: drops the file reference held
 * in @prsrc. @ctx is unused.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        fput(prsrc->file);
}

/*
 * Release everything queued on a retired resource node.
 *
 * For each queued resource: if it carries a non-zero tag, post a CQE with
 * that tag (result and flags 0), then invoke the data's ->do_put() callback
 * and free the put record. Finally the node is destroyed and the reference
 * it held on the resource data is dropped, completing data->done when that
 * was the last one.
 */
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
        struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
        struct io_ring_ctx *ctx = rsrc_data->ctx;
        struct io_rsrc_put *prsrc, *tmp;

        list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                list_del(&prsrc->list);

                if (prsrc->tag) {
                        /* the ring submit lock is requested only for IOPOLL rings */
                        bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;

                        io_ring_submit_lock(ctx, lock_ring);
                        spin_lock(&ctx->completion_lock);
                        io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
                        io_commit_cqring(ctx);
                        spin_unlock(&ctx->completion_lock);
                        io_cqring_ev_posted(ctx);
                        io_ring_submit_unlock(ctx, lock_ring);
                }

                rsrc_data->do_put(ctx, prsrc);
                kfree(prsrc);
        }

        /* ref_node must not be touched after this point */
        io_rsrc_node_destroy(ref_node);
        if (atomic_dec_and_test(&rsrc_data->refs))
                complete(&rsrc_data->done);
}

/*
 * Delayed-work handler for ctx->rsrc_put_work: detach the whole
 * ctx->rsrc_put_llist and process every retired resource node on it.
 */
static void io_rsrc_put_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                               rsrc_put_work.work);
        struct llist_node *pos, *next;

        for (pos = llist_del_all(&ctx->rsrc_put_llist); pos; pos = next) {
                /*
                 * Fetch the successor first: __io_rsrc_put_work() destroys
                 * the node that embeds 'pos'.
                 */
                next = pos->next;
                __io_rsrc_put_work(llist_entry(pos, struct io_rsrc_node, llist));
        }
}

/*
 * Register a fixed file table for @ctx.
 *
 * @ctx:     ring to register the files on
 * @arg:     user array of @nr_args file descriptors (__s32); -1 leaves the
 *           slot empty
 * @nr_args: number of descriptors
 * @tags:    optional user array of @nr_args tags
 *
 * Returns 0 on success, or:
 *   -EBUSY  a file table is already registered
 *   -EINVAL @nr_args is 0, or a tag was supplied for an empty (-1) slot
 *   -EMFILE @nr_args exceeds IORING_MAX_FIXED_FILES or RLIMIT_NOFILE
 *   -ENOMEM allocation failure
 *   -EFAULT a descriptor or tag could not be read from user memory
 *   -EBADF  a descriptor is invalid or refers to an io_uring instance
 * On failure every file reference taken so far is dropped and the table is
 * freed.
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                                 unsigned nr_args, u64 __user *tags)
{
        __s32 __user *fds = (__s32 __user *) arg;
        struct file *file;
        int fd, ret;
        unsigned i;

        if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
        if (nr_args > rlimit(RLIMIT_NOFILE))
                return -EMFILE;
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;
        ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
                                 &ctx->file_data);
        if (ret)
                return ret;

        ret = -ENOMEM;
        if (!io_alloc_file_tables(&ctx->file_table, nr_args))
                goto out_free;

        /*
         * ctx->nr_user_files advances with each processed slot, so the
         * error path below only walks slots that were actually filled in.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
                        ret = -EFAULT;
                        goto out_fput;
                }
                /* allow sparse sets */
                if (fd == -1) {
                        ret = -EINVAL;
                        /* an empty slot must not carry a tag */
                        if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
                                goto out_fput;
                        continue;
                }

                file = fget(fd);
                ret = -EBADF;
                if (unlikely(!file))
                        goto out_fput;

                /*
                 * Don't allow io_uring instances to be registered. If UNIX
                 * isn't enabled, then this causes a reference cycle and this
                 * instance can never get freed. If UNIX is enabled we'll
                 * handle it just fine, but there's still no point in allowing
                 * a ring fd as it doesn't support regular read/write anyway.
                 */
                if (file->f_op == &io_uring_fops) {
                        /* ret is still -EBADF here */
                        fput(file);
                        goto out_fput;
                }
                io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
        }

        /* install the backup node allocated above as the current rsrc node */
        io_rsrc_node_switch(ctx, NULL);
        return 0;
out_fput:
        for (i = 0; i < ctx->nr_user_files; i++) {
                file = io_file_from_index(ctx, i);
                if (file)
                        fput(file);
        }
        io_free_file_tables(&ctx->file_table);
        ctx->nr_user_files = 0;
out_free:
        io_rsrc_data_free(ctx->file_data);
        ctx->file_data = NULL;
        return ret;
}

/*
 * Queue resource @rsrc (stored at index @idx of @data) for deferred release
 * on @node.
 *
 * The resource's tag is moved from the tag table into the put record, and
 * the table slot is zeroed.
 *
 * Returns 0 on success, -ENOMEM if the put record cannot be allocated (in
 * which case nothing is modified).
 */
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
                                 struct io_rsrc_node *node, void *rsrc)
{
        struct io_rsrc_put *prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
        u64 *tag_slot;

        if (!prsrc)
                return -ENOMEM;

        tag_slot = io_get_tag_slot(data, idx);
        prsrc->tag = *tag_slot;
        *tag_slot = 0;
        prsrc->rsrc = rsrc;
        list_add(&prsrc->list, &node->rsrc_list);
        return 0;
}

/*
 * Install @file into fixed-file slot @slot_index of the request's ring,
 * replacing (and queueing for release) any file already in that slot.
 *
 * @req:         request on whose ring the file is installed
 * @file:        file to install; its reference is dropped here on failure
 * @issue_flags: IO_URING_F_NONBLOCK decides the locking argument passed to
 *               io_ring_submit_lock()/unlock()
 * @slot_index:  zero-based slot index
 *
 * Returns 0 on success, or:
 *   -EBADF  @file is an io_uring instance
 *   -ENXIO  no file table is registered
 *   -EINVAL @slot_index is out of range
 *   -ENOMEM allocation failure while retiring the old file
 */
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        bool needs_switch = false;
        struct io_fixed_file *file_slot;
        int ret = -EBADF;

        io_ring_submit_lock(ctx, !force_nonblock);
        if (file->f_op == &io_uring_fops)
                goto err;
        ret = -ENXIO;
        if (!ctx->file_data)
                goto err;
        ret = -EINVAL;
        if (slot_index >= ctx->nr_user_files)
                goto err;

        slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
        file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);

        /* slot occupied: queue the old file for release on the current node */
        if (file_slot->file_ptr) {
                struct file *old_file;

                ret = io_rsrc_node_switch_start(ctx);
                if (ret)
                        goto err;

                /* the low bits of file_ptr hold flags; mask them off */
                old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
                ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
                                            ctx->rsrc_node, old_file);
                if (ret)
                        goto err;
                file_slot->file_ptr = 0;
                needs_switch = true;
        }

        /* a file installed this way carries no tag */
        *io_get_tag_slot(ctx->file_data, slot_index) = 0;
        io_fixed_file_set(file_slot, file);
        ret = 0;
err:
        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->file_data);
        io_ring_submit_unlock(ctx, !force_nonblock);
        if (ret)
                fput(file);
        return ret;
}

/*
 * Close a fixed file: empty the slot named by req->close.file_slot and
 * queue the file it held for deferred release.
 *
 * req->close.file_slot is one-based; it is converted to a zero-based
 * offset here.
 *
 * Returns 0 on success, or:
 *   -ENXIO  no file table is registered
 *   -EINVAL the slot is out of range
 *   -ENOMEM allocation failure
 *   -EBADF  the slot is already empty
 */
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
        unsigned int offset = req->close.file_slot - 1;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_fixed_file *file_slot;
        struct file *file;
        int ret;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = -ENXIO;
        if (unlikely(!ctx->file_data))
                goto out;
        ret = -EINVAL;
        if (offset >= ctx->nr_user_files)
                goto out;
        /* make sure a backup node exists for the switch below */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                goto out;

        offset = array_index_nospec(offset, ctx->nr_user_files);
        file_slot = io_fixed_file_slot(&ctx->file_table, offset);
        ret = -EBADF;
        if (!file_slot->file_ptr)
                goto out;

        /* the low bits of file_ptr hold flags; mask them off */
        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
        ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
        if (ret)
                goto out;

        file_slot->file_ptr = 0;
        io_rsrc_node_switch(ctx, ctx->file_data);
        ret = 0;
out:
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return ret;
}

/*
 * Update @nr_args consecutive fixed-file slots starting at up->offset.
 *
 * @ctx:     ring whose file table is updated
 * @up:      update descriptor; up->data points to a user array of __s32
 *           descriptors, up->tags (optional) to a user array of u64 tags
 * @nr_args: number of slots to update
 *
 * For each entry: IORING_REGISTER_FILES_SKIP leaves the slot untouched, -1
 * empties it, any other value replaces its content with that file. A file
 * previously held by the slot is queued for deferred release. A non-zero
 * tag is rejected for skipped or emptied slots.
 *
 * Returns the number of entries processed before stopping if that is
 * non-zero; otherwise 0 or a negative error:
 *   -ENXIO  no file table is registered
 *   -EINVAL the range does not fit in the table, or a tag was given for a
 *           skipped/emptied slot
 *   -EFAULT user memory could not be read
 *   -EBADF  a descriptor is invalid or refers to an io_uring instance
 *   -ENOMEM allocation failure while retiring an old file
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update2 *up,
                                 unsigned nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        __s32 __user *fds = u64_to_user_ptr(up->data);
        struct io_rsrc_data *data = ctx->file_data;
        struct io_fixed_file *file_slot;
        struct file *file;
        int fd, i, err = 0;
        unsigned int done;
        bool needs_switch = false;

        if (!ctx->file_data)
                return -ENXIO;
        /*
         * Range check written so that it cannot wrap: a plain
         * "up->offset + nr_args" may overflow unsigned arithmetic and end
         * up below nr_user_files for an offset far outside the table.
         */
        if (nr_args > ctx->nr_user_files ||
            up->offset > ctx->nr_user_files - nr_args)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                u64 tag = 0;

                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        err = -EFAULT;
                        break;
                }
                /* a tag only makes sense when a file is being installed */
                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
                        err = -EINVAL;
                        break;
                }
                if (fd == IORING_REGISTER_FILES_SKIP)
                        continue;

                i = array_index_nospec(up->offset + done, ctx->nr_user_files);
                file_slot = io_fixed_file_slot(&ctx->file_table, i);

                /* slot occupied: queue the old file for deferred release */
                if (file_slot->file_ptr) {
                        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
                        err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
                        if (err)
                                break;
                        file_slot->file_ptr = 0;
                        needs_switch = true;
                }
                if (fd != -1) {
                        file = fget(fd);
                        if (!file) {
                                err = -EBADF;
                                break;
                        }
                        /*
                         * Don't allow io_uring instances to be registered. If
                         * UNIX isn't enabled, then this causes a reference
                         * cycle and this instance can never get freed. If UNIX
                         * is enabled we'll handle it just fine, but there's
                         * still no point in allowing a ring fd as it doesn't
                         * support regular read/write anyway.
                         */
                        if (file->f_op == &io_uring_fops) {
                                fput(file);
                                err = -EBADF;
                                break;
                        }
                        *io_get_tag_slot(data, i) = tag;
                        io_fixed_file_set(file_slot, file);
                }
        }

        /* retire the node that collected the removed files */
        if (needs_switch)
                io_rsrc_node_switch(ctx, data);
        /* partial progress wins over the error that stopped the loop */
        return done ? done : err;
}

static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
                                        struct task_struct *task)
{
        struct io_wq_hash *hash;
        struct io_wq_data data;
        unsigned int concurrency;

        mutex_lock(&ctx->uring_lock);
        hash = ctx->hash_map;
        if (!hash) {
                hash = kzalloc(sizeof(*hash), GFP_KERNEL);
                if (!hash) {
                        mutex_unlock(&ctx->uring_lock);
                        return ERR_PTR(-ENOMEM);
                }
                refcount_set(&hash->refs, 1);
                init_waitqueue_head(&hash->wait);
                ctx->hash_map = hash;
        }
        mutex_unlock(&ctx->uring_lock);

        data.hash = hash;
        data.task = task;
        data.free_work = io_wq_free_work;
        data.do_work = io_wq_submit_work;

        /* Do QD, or 4 * CPUS, whatever is smallest */
        concurrency = min(ctx->sq_entries, 4 * num_online_cpus());

        return io_wq_create(concurrency, &data);
}

/*
 * Allocate an io_uring_task for @task, give it an io-wq created for @ctx,
 * and install it as task->io_uring.
 *
 * Returns 0 on success. On failure a negative errno is returned and every
 * partially set up resource (inflight counter, the structure itself) has
 * been released again.
 */
static int io_uring_alloc_task_context(struct task_struct *task,
                                       struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx;
        int err;

        tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
        if (unlikely(!tctx))
                return -ENOMEM;

        err = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
        if (unlikely(err))
                goto free_tctx;

        tctx->io_wq = io_init_wq_offload(ctx, task);
        if (IS_ERR(tctx->io_wq)) {
                err = PTR_ERR(tctx->io_wq);
                goto destroy_counter;
        }

        xa_init(&tctx->xa);
        init_waitqueue_head(&tctx->wait);
        atomic_set(&tctx->in_idle, 0);
        atomic_set(&tctx->inflight_tracked, 0);
        task->io_uring = tctx;
        spin_lock_init(&tctx->task_lock);
        INIT_WQ_LIST(&tctx->task_list);
        init_task_work(&tctx->task_work, tctx_task_work);
        return 0;

destroy_counter:
        percpu_counter_destroy(&tctx->inflight);
free_tctx:
        kfree(tctx);
        return err;
}

/*
 * Release the io_uring task context attached to @tsk.
 *
 * Warns (once) if the context still has xarray entries, an io_wq or cached
 * refs, then destroys the inflight counter, frees the structure and clears
 * tsk->io_uring. @tsk->io_uring is dereferenced unconditionally, so it must
 * be non-NULL on entry.
 */
void __io_uring_free(struct task_struct *tsk)
{
        struct io_uring_task *tctx = tsk->io_uring;
        struct io_tctx_node *node;
        unsigned long index;

        /*
         * Fault injection forcing allocation errors in the xa_store() path
         * can lead to xa_empty() returning false, even though no actual
         * node is stored in the xarray. Until that gets sorted out, attempt
         * an iteration here and warn if any entries are found.
         */
        xa_for_each(&tctx->xa, index, node) {
                WARN_ON_ONCE(1);
                break;
        }
        /* neither a worker pool nor cached refs should be left at this point */
        WARN_ON_ONCE(tctx->io_wq);
        WARN_ON_ONCE(tctx->cached_refs);

        percpu_counter_destroy(&tctx->inflight);
        kfree(tctx);
        tsk->io_uring = NULL;
}

/*
 * Set up submission-queue offload for @ctx according to its setup flags.
 *
 * - IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL: only verifies that
 *   p->wq_fd refers to an io_uring file.
 * - IORING_SETUP_SQPOLL: obtains the io_sq_data through io_get_sq_data(),
 *   links @ctx into its ctx_list and, unless io_get_sq_data() reported
 *   @attached, creates the SQ polling thread and its task context.
 * - IORING_SETUP_SQ_AFF in p->flags without SQPOLL is rejected with -EINVAL.
 *
 * Returns 0 on success or a negative errno. Every failure after the initial
 * wq_fd check ends in io_sq_thread_finish(ctx).
 */
static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
{
        int ret;

        /* Retain compatibility with failing for an invalid attach attempt */
        if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
                                IORING_SETUP_ATTACH_WQ) {
                struct fd f;

                f = fdget(p->wq_fd);
                if (!f.file)
                        return -ENXIO;
                if (f.file->f_op != &io_uring_fops) {
                        fdput(f);
                        return -EINVAL;
                }
                fdput(f);
        }
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct task_struct *tsk;
                struct io_sq_data *sqd;
                bool attached;

                sqd = io_get_sq_data(p, &attached);
                if (IS_ERR(sqd)) {
                        ret = PTR_ERR(sqd);
                        goto err;
                }

                ctx->sq_creds = get_current_cred();
                ctx->sq_data = sqd;
                ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
                /* an idle time of zero falls back to HZ jiffies */
                if (!ctx->sq_thread_idle)
                        ctx->sq_thread_idle = HZ;

                /* the ctx list is only modified between park and unpark */
                io_sq_thread_park(sqd);
                list_add(&ctx->sqd_list, &sqd->ctx_list);
                io_sqd_update_thread_idle(sqd);
                /* don't attach to a dying SQPOLL thread, would be racy */
                ret = (attached && !sqd->thread) ? -ENXIO : 0;
                io_sq_thread_unpark(sqd);

                if (ret < 0)
                        goto err;
                /* nothing more to create when reusing an existing sqd */
                if (attached)
                        return 0;

                if (p->flags & IORING_SETUP_SQ_AFF) {
                        cpumask_var_t allowed_mask;
                        int cpu = p->sq_thread_cpu;

                        ret = -EINVAL;
                        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                                goto err_sqpoll;
                        ret = -ENOMEM;
                        if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
                                goto err_sqpoll;
                        ret = -EINVAL;
                        /* the requested CPU must be within current's cpuset */
                        cpuset_cpus_allowed(current, allowed_mask);
                        if (!cpumask_test_cpu(cpu, allowed_mask)) {
                                free_cpumask_var(allowed_mask);
                                goto err_sqpoll;
                        }
                        free_cpumask_var(allowed_mask);
                        sqd->sq_cpu = cpu;
                } else {
                        sqd->sq_cpu = -1;
                }

                sqd->task_pid = current->pid;
                sqd->task_tgid = current->tgid;
                tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
                if (IS_ERR(tsk)) {
                        ret = PTR_ERR(tsk);
                        goto err_sqpoll;
                }

                sqd->thread = tsk;
                ret = io_uring_alloc_task_context(tsk, ctx);
                /* the thread is woken even if the context allocation failed */
                wake_up_new_task(tsk);
                if (ret)
                        goto err;
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
                /* Can't have SQ_AFF without SQPOLL */
                ret = -EINVAL;
                goto err;
        }

        return 0;
err_sqpoll:
        /* taken for failures before the thread was created: signal ->exited */
        complete(&ctx->sq_data->exited);
err:
        io_sq_thread_finish(ctx);
        return ret;
}

/* Subtract @nr_pages from @user's locked_vm counter. */
static inline void __io_unaccount_mem(struct user_struct *user,
                                      unsigned long nr_pages)
{
        atomic_long_sub(nr_pages, &user->locked_vm);
}

/*
 * Charge @nr_pages against @user's locked_vm counter.
 *
 * The update is a lockless read/compare-exchange retry loop, so concurrent
 * updaters cannot push the counter past the RLIMIT_MEMLOCK page limit.
 *
 * Returns 0 on success, or -ENOMEM if the charge would exceed the limit.
 */
static inline int __io_account_mem(struct user_struct *user,
                                   unsigned long nr_pages)
{
        /* Don't allow more pages than we can safely lock */
        unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        unsigned long old, want;

        for (;;) {
                old = atomic_long_read(&user->locked_vm);
                want = old + nr_pages;
                if (want > limit)
                        return -ENOMEM;
                /* retry if somebody changed the counter under us */
                if (atomic_long_cmpxchg(&user->locked_vm, old, want) == old)
                        return 0;
        }
}

/*
 * Undo an io_account_mem() charge of @nr_pages: drop it from the owning
 * user's locked_vm (if the ring has a user) and from the accounted mm's
 * pinned_vm (if the ring has an accounted mm).
 */
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        struct user_struct *user = ctx->user;

        if (user)
                __io_unaccount_mem(user, nr_pages);
        if (ctx->mm_account)
                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

/*
 * Charge @nr_pages to the ring's user (limit-checked) and, if that
 * succeeded, add them to the accounted mm's pinned_vm.
 *
 * Returns 0 on success or the error from __io_account_mem(); on error
 * pinned_vm is left untouched.
 */
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        struct user_struct *user = ctx->user;
        int err = user ? __io_account_mem(user, nr_pages) : 0;

        if (err)
                return err;
        if (ctx->mm_account)
                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
        return 0;
}

/*
 * Drop one reference on the compound page backing @ptr and free the page
 * when that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
        struct page *head;

        if (ptr) {
                head = virt_to_head_page(ptr);
                if (put_page_testzero(head))
                        free_compound_page(head);
        }
}

/*
 * Allocate zeroed pages large enough to hold @size bytes, as a compound
 * page (__GFP_COMP) and without allocation-failure warnings (__GFP_NOWARN).
 * Returns the kernel address, or NULL on failure.
 */
static void *io_mem_alloc(size_t size)
{
        const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN |
                            __GFP_COMP;
        unsigned long addr = __get_free_pages(flags, get_order(size));

        return (void *) addr;
}

/*
 * Compute the number of bytes needed for struct io_rings with @cq_entries
 * CQEs, followed by the SQ index array of @sq_entries u32 slots.
 *
 * If @sq_offset is non-NULL it receives the byte offset of the SQ array
 * (the end of the CQE array, cache-line aligned on SMP builds).
 *
 * Returns the total size, or SIZE_MAX if any step of the computation
 * overflowed.
 */
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
                                size_t *sq_offset)
{
        struct io_rings *rings;
        size_t total, sq_bytes;

        total = struct_size(rings, cqes, cq_entries);
        if (total == SIZE_MAX)
                return SIZE_MAX;

#ifdef CONFIG_SMP
        total = ALIGN(total, SMP_CACHE_BYTES);
        /* aligning up wrapped around to zero */
        if (!total)
                return SIZE_MAX;
#endif

        if (sq_offset)
                *sq_offset = total;

        sq_bytes = array_size(sizeof(u32), sq_entries);
        if (sq_bytes == SIZE_MAX ||
            check_add_overflow(total, sq_bytes, &total))
                return SIZE_MAX;

        return total;
}

/*
 * Release the registered buffer stored in *@slot and clear the slot.
 *
 * A real mapping has its pages unpinned, its accounted pages returned via
 * io_unaccount_mem(), and its descriptor freed. The shared placeholder
 * ctx->dummy_ubuf is never freed here; the slot is just reset.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
        struct io_mapped_ubuf *imu = *slot;
        unsigned int n;

        if (imu == ctx->dummy_ubuf) {
                *slot = NULL;
                return;
        }

        for (n = 0; n < imu->nr_bvecs; n++)
                unpin_user_page(imu->bvec[n].bv_page);
        if (imu->acct_pages)
                io_unaccount_mem(ctx, imu->acct_pages);
        kvfree(imu);
        *slot = NULL;
}

/*
 * Resource put callback for registered buffers (passed to
 * io_rsrc_data_alloc() by io_sqe_buffers_register()): unmap the buffer
 * carried in @prsrc.
 */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        io_buffer_unmap(ctx, &prsrc->buf);
        /* io_buffer_unmap() already cleared the slot; kept for clarity */
        prsrc->buf = NULL;
}

/*
 * Tear down the registered buffer table of @ctx: unmap the first
 * ctx->nr_user_bufs slots, free the slot array and the rsrc data, and
 * reset the bookkeeping fields so that a new table can be registered.
 */
static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned int slot = 0;

        while (slot < ctx->nr_user_bufs) {
                io_buffer_unmap(ctx, &ctx->user_bufs[slot]);
                slot++;
        }

        kfree(ctx->user_bufs);
        io_rsrc_data_free(ctx->buf_data);

        ctx->nr_user_bufs = 0;
        ctx->buf_data = NULL;
        ctx->user_bufs = NULL;
}

/*
 * Unregister all registered buffers of @ctx.
 *
 * Returns -ENXIO if no buffers are registered, the error from
 * io_rsrc_ref_quiesce() if quiescing failed (the table is then left
 * registered), or 0 once the table has been torn down.
 */
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned saved_nr = ctx->nr_user_bufs;
        int err;

        if (!ctx->buf_data)
                return -ENXIO;

        /*
         * Quiesce may drop ->uring_lock. Advertise an empty table while
         * that can happen so new requests cannot pick up a buffer, then
         * restore the real count for the teardown (or for continued use
         * if quiescing failed).
         */
        ctx->nr_user_bufs = 0;
        err = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
        ctx->nr_user_bufs = saved_nr;
        if (err)
                return err;

        __io_sqe_buffers_unregister(ctx);
        return 0;
}

/*
 * Copy the @index'th iovec of the user array @arg into @dst.
 *
 * For compat rings (CONFIG_COMPAT and ctx->compat) the user array holds
 * struct compat_iovec entries, which are widened into @dst.
 *
 * Returns 0 on success or -EFAULT if the user memory could not be read.
 */
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
                       void __user *arg, unsigned index)
{
        struct iovec __user *src;

#ifdef CONFIG_COMPAT
        if (ctx->compat) {
                struct compat_iovec __user *csrc;
                struct compat_iovec ciov;

                csrc = (struct compat_iovec __user *) arg;
                if (copy_from_user(&ciov, csrc + index, sizeof(ciov)))
                        return -EFAULT;

                dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
                dst->iov_len = ciov.iov_len;
                return 0;
        }
#endif
        src = (struct iovec __user *) arg;
        return copy_from_user(dst, src + index, sizeof(*dst)) ? -EFAULT : 0;
}

/*
 * Report whether the compound head page @hpage has been seen before, either
 * among the first @nr_pages entries of @pages (the buffer currently being
 * registered) or in any buffer already present in ctx->user_bufs.
 *
 * A linear search is acceptable because this only runs at registration
 * time, and the caller caches the last head page so that a full search is
 * normally only needed when the head page changes.
 *
 * Detecting repeats lets the caller account the full size of a huge page
 * exactly once instead of per constituent page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
                                  int nr_pages, struct page *hpage)
{
        int i, j;

        /* pages of the buffer being registered right now */
        for (i = 0; i < nr_pages; i++) {
                struct page *pg = pages[i];

                if (PageCompound(pg) && compound_head(pg) == hpage)
                        return true;
        }

        /* pages of buffers registered earlier */
        for (i = 0; i < ctx->nr_user_bufs; i++) {
                struct io_mapped_ubuf *imu = ctx->user_bufs[i];

                for (j = 0; j < imu->nr_bvecs; j++) {
                        struct page *pg = imu->bvec[j].bv_page;

                        if (PageCompound(pg) && compound_head(pg) == hpage)
                                return true;
                }
        }

        return false;
}

/*
 * Work out how many pages the @nr_pages pinned @pages should be charged
 * as, store that in imu->acct_pages and charge it via io_account_mem().
 *
 * Ordinary pages count as one page each. A compound page is charged with
 * its full size, but only the first time its head page is encountered;
 * *@last_hpage caches the most recent head page across calls to skip the
 * full search for runs of pages from the same compound page.
 *
 * Returns 0 on success (including when nothing needs charging), or the
 * error from io_account_mem(), in which case imu->acct_pages is reset to 0.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
                                 int nr_pages, struct io_mapped_ubuf *imu,
                                 struct page **last_hpage)
{
        int i, ret;

        imu->acct_pages = 0;
        for (i = 0; i < nr_pages; i++) {
                struct page *hpage;

                if (!PageCompound(pages[i])) {
                        imu->acct_pages++;
                        continue;
                }

                hpage = compound_head(pages[i]);
                if (hpage == *last_hpage)
                        continue;
                *last_hpage = hpage;
                /* only pages[0..i) have been looked at so far */
                if (!headpage_already_acct(ctx, pages, i, hpage))
                        imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
        }

        if (!imu->acct_pages)
                return 0;

        ret = io_account_mem(ctx, imu->acct_pages);
        if (ret)
                imu->acct_pages = 0;
        return ret;
}

/*
 * Pin the user memory described by @iov and build an io_mapped_ubuf for it.
 *
 * @pimu receives the result: ctx->dummy_ubuf when iov->iov_base is NULL,
 * a newly allocated descriptor with one bvec per pinned page on success,
 * or NULL on failure. @last_hpage is passed through to
 * io_buffer_account_pin() as its head-page cache.
 *
 * Returns 0 on success or a negative errno. On failure all pages pinned
 * here are unpinned again and no memory stays allocated.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
                                  struct io_mapped_ubuf **pimu,
                                  struct page **last_hpage)
{
        struct io_mapped_ubuf *imu = NULL;
        struct vm_area_struct **vmas = NULL;
        struct page **pages = NULL;
        unsigned long off, start, end, ubuf;
        size_t size;
        int ret, pret, nr_pages, i;

        if (!iov->iov_base) {
                *pimu = ctx->dummy_ubuf;
                return 0;
        }

        /* number of pages touched by [ubuf, ubuf + iov_len) */
        ubuf = (unsigned long) iov->iov_base;
        end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = ubuf >> PAGE_SHIFT;
        nr_pages = end - start;

        *pimu = NULL;
        ret = -ENOMEM;

        pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                goto done;

        vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
                              GFP_KERNEL);
        if (!vmas)
                goto done;

        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
        if (!imu)
                goto done;

        ret = 0;
        mmap_read_lock(current->mm);
        pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
                              pages, vmas);
        if (pret == nr_pages) {
                struct file *file = vmas[0]->vm_file;

                /*
                 * All pages must share the same vm_file as the first one;
                 * file backed memory is only accepted for shmem and
                 * hugepage files.
                 */
                for (i = 0; i < nr_pages; i++) {
                        if (vmas[i]->vm_file != file) {
                                ret = -EINVAL;
                                break;
                        }
                        if (!file)
                                continue;
                        if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                }
        } else {
                /* a short pin is reported as -EFAULT */
                ret = pret < 0 ? pret : -EFAULT;
        }
        mmap_read_unlock(current->mm);
        if (ret) {
                /*
                 * if we did partial map, or found file backed vmas,
                 * release any pages we did get
                 */
                if (pret > 0)
                        unpin_user_pages(pages, pret);
                goto done;
        }

        ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
        if (ret) {
                unpin_user_pages(pages, pret);
                goto done;
        }

        /* the first bvec starts at the in-page offset, the rest at 0 */
        off = ubuf & ~PAGE_MASK;
        size = iov->iov_len;
        for (i = 0; i < nr_pages; i++) {
                size_t vec_len;

                vec_len = min_t(size_t, size, PAGE_SIZE - off);
                imu->bvec[i].bv_page = pages[i];
                imu->bvec[i].bv_len = vec_len;
                imu->bvec[i].bv_offset = off;
                off = 0;
                size -= vec_len;
        }
        /* store original address for later verification */
        imu->ubuf = ubuf;
        imu->ubuf_end = ubuf + iov->iov_len;
        imu->nr_bvecs = nr_pages;
        *pimu = imu;
        ret = 0;
done:
        /* the temporary arrays are always freed; imu only on failure */
        if (ret)
                kvfree(imu);
        kvfree(pages);
        kvfree(vmas);
        return ret;
}

/*
 * Allocate the zeroed ctx->user_bufs slot array with room for @nr_args
 * entries. Returns 0 on success or -ENOMEM.
 */
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
        ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
        if (!ctx->user_bufs)
                return -ENOMEM;
        return 0;
}

/*
 * Sanity check a user supplied buffer description before registration.
 *
 * A NULL base is accepted only together with a zero length. Otherwise the
 * length must be non-zero and at most SZ_1G (else -EFAULT), and
 * base + length rounded up by a page must not wrap (else -EOVERFLOW).
 *
 * No further size or alignment limits are imposed here; unsuitable
 * buffers fail with -EINVAL later, when IO is submitted against them.
 */
static int io_buffer_validate(struct iovec *iov)
{
        unsigned long base = (unsigned long)iov->iov_base;
        unsigned long span = iov->iov_len + (PAGE_SIZE - 1);
        unsigned long end;

        if (!iov->iov_base)
                return iov->iov_len ? -EFAULT : 0;

        /* the 1G cap is an arbitrary limit, but we need something */
        if (!iov->iov_len || iov->iov_len > SZ_1G)
                return -EFAULT;

        if (check_add_overflow(base, span, &end))
                return -EOVERFLOW;

        return 0;
}

/*
 * Register @nr_args user buffers, described by the iovec array at @arg,
 * as the ring's fixed buffer table. @tags is handed to
 * io_rsrc_data_alloc().
 *
 * Returns 0 on success, -EBUSY if a table is already registered, -EINVAL
 * for a zero or too large @nr_args or for a tagged NULL-base buffer, or
 * the error of whichever setup step failed. On failure, buffers
 * registered so far are torn down again.
 */
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                                   unsigned int nr_args, u64 __user *tags)
{
        struct page *last_hpage = NULL;
        struct io_rsrc_data *data;
        int i, ret;
        struct iovec iov;

        if (ctx->user_bufs)
                return -EBUSY;
        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
                return -EINVAL;
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;
        ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
        if (ret)
                return ret;
        ret = io_buffers_map_alloc(ctx, nr_args);
        if (ret) {
                io_rsrc_data_free(data);
                return ret;
        }

        /*
         * nr_user_bufs is bumped only after a slot was filled successfully
         * (a break skips the increment), so on error it counts exactly the
         * slots that __io_sqe_buffers_unregister() has to unmap.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
                ret = io_copy_iov(ctx, &iov, arg, i);
                if (ret)
                        break;
                ret = io_buffer_validate(&iov);
                if (ret)
                        break;
                /* a NULL-base buffer must not carry a tag */
                if (!iov.iov_base && *io_get_tag_slot(data, i)) {
                        ret = -EINVAL;
                        break;
                }

                ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
                                             &last_hpage);
                if (ret)
                        break;
        }

        WARN_ON_ONCE(ctx->buf_data);

        /* publish data first so the unwind path below can free it as well */
        ctx->buf_data = data;
        if (ret)
                __io_sqe_buffers_unregister(ctx);
        else
                io_rsrc_node_switch(ctx, NULL);
        return ret;
}

/*
 * Replace @nr_args entries of the registered buffer table starting at slot
 * up->offset with the buffers described by the user iovec array at
 * up->data (and optional tags at up->tags).
 *
 * Returns the number of entries processed if at least one was, otherwise
 * the error of the first entry. -ENXIO is returned if no buffer table is
 * registered and -EINVAL if the range extends past the table.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                                   struct io_uring_rsrc_update2 *up,
                                   unsigned int nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
        struct page *last_hpage = NULL;
        bool needs_switch = false;
        __u32 done;
        int i, err;

        if (!ctx->buf_data)
                return -ENXIO;
        if (up->offset + nr_args > ctx->nr_user_bufs)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                struct io_mapped_ubuf *imu;
                int offset = up->offset + done;
                u64 tag = 0;

                err = io_copy_iov(ctx, &iov, iovs, done);
                if (err)
                        break;
                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
                        err = -EFAULT;
                        break;
                }
                err = io_buffer_validate(&iov);
                if (err)
                        break;
                /* a NULL-base buffer must not carry a tag */
                if (!iov.iov_base && tag) {
                        err = -EINVAL;
                        break;
                }
                /* pin the new buffer before touching the old slot */
                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
                if (err)
                        break;

                i = array_index_nospec(offset, ctx->nr_user_bufs);
                if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
                        /* queue the old buffer for removal via the rsrc node */
                        err = io_queue_rsrc_removal(ctx->buf_data, i,
                                                    ctx->rsrc_node, ctx->user_bufs[i]);
                        if (unlikely(err)) {
                                /* drop the just-registered replacement */
                                io_buffer_unmap(ctx, &imu);
                                break;
                        }
                        ctx->user_bufs[i] = NULL;
                        needs_switch = true;
                }

                ctx->user_bufs[i] = imu;
                *io_get_tag_slot(ctx->buf_data, offset) = tag;
        }

        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->buf_data);
        return done ? done : err;
}

/*
 * Attach the eventfd whose descriptor is stored at user address @arg to
 * the ring as ctx->cq_ev_fd.
 *
 * Returns 0 on success, -EBUSY if an eventfd is already attached, -EFAULT
 * if the descriptor could not be read from user memory, or the error from
 * eventfd_ctx_fdget(). ctx->cq_ev_fd is left NULL on failure.
 */
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
        __s32 __user *fds = arg;
        int fd, err;

        if (ctx->cq_ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (!IS_ERR(ctx->cq_ev_fd))
                return 0;

        err = PTR_ERR(ctx->cq_ev_fd);
        ctx->cq_ev_fd = NULL;
        return err;
}

/*
 * Detach the ring's eventfd, dropping the reference held on it.
 * Returns 0 on success or -ENXIO if no eventfd was attached.
 */
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->cq_ev_fd)
                return -ENXIO;

        eventfd_ctx_put(ctx->cq_ev_fd);
        ctx->cq_ev_fd = NULL;
        return 0;
}

/*
 * Walk every entry of the ctx->io_buffers xarray and hand it to
 * __io_remove_buffers() with a count of -1U (i.e. the largest possible
 * count).
 */
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer *buf;
        unsigned long index;

        xa_for_each(&ctx->io_buffers, index, buf)
                __io_remove_buffers(ctx, buf, index, -1U);
}

static void io_req_cache_free(struct list_head *list)
{
        struct io_kiocb *req, *nxt;

        list_for_each_entry_safe(req, nxt, list, inflight_entry) {
                list_del(&req->inflight_entry);
                kmem_cache_free(req_cachep, req);
        }
}

/*
 * Free all requests cached in the ring's submit state, under
 * ctx->uring_lock: the bulk-allocated array (state->reqs) and, after
 * io_flush_cached_locked_reqs(), everything on state->free_list.
 */
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;

        mutex_lock(&ctx->uring_lock);

        /* return the preallocated, not yet used requests in one go */
        if (state->free_reqs) {
                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
                state->free_reqs = 0;
        }

        io_flush_cached_locked_reqs(ctx, state);
        io_req_cache_free(&state->free_list);
        mutex_unlock(&ctx->uring_lock);
}

/*
 * Drop one reference on @data and, unless that was the last one, block
 * until data->done is completed. A NULL @data is ignored.
 */
static void io_wait_rsrc_data(struct io_rsrc_data *data)
{
        if (!data)
                return;
        if (atomic_dec_and_test(&data->refs))
                return;
        wait_for_completion(&data->done);
}

/*
 * Final destruction of @ctx: stop the SQ thread side, wait for and release
 * registered buffers and files, drop the eventfd, provided buffers,
 * credentials, rsrc nodes and the accounted mm, free the ring memory and
 * request caches, and finally free @ctx itself. @ctx must not be used
 * after this returns.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
        io_sq_thread_finish(ctx);

        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
        io_wait_rsrc_data(ctx->buf_data);
        io_wait_rsrc_data(ctx->file_data);

        mutex_lock(&ctx->uring_lock);
        if (ctx->buf_data)
                __io_sqe_buffers_unregister(ctx);
        if (ctx->file_data)
                __io_sqe_files_unregister(ctx);
        if (ctx->rings)
                __io_cqring_overflow_flush(ctx, true);
        mutex_unlock(&ctx->uring_lock);
        io_eventfd_unregister(ctx);
        io_destroy_buffers(ctx);
        if (ctx->sq_creds)
                put_cred(ctx->sq_creds);

        /* there are no registered resources left, nobody uses it */
        if (ctx->rsrc_node)
                io_rsrc_node_destroy(ctx->rsrc_node);
        if (ctx->rsrc_backup_node)
                io_rsrc_node_destroy(ctx->rsrc_backup_node);
        flush_delayed_work(&ctx->rsrc_put_work);

        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
        WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

        WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

        /*
         * Dropped only after the buffers were unregistered above: buffer
         * unmapping unaccounts pinned pages through ctx->mm_account.
         */
        if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }

        io_mem_free(ctx->rings);
        io_mem_free(ctx->sq_sqes);

        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
        io_req_caches_free(ctx);
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx->dummy_ubuf);
        kfree(ctx);
}

/*
 * poll() handler for an io_uring file.
 *
 * Reports EPOLLOUT | EPOLLWRNORM while the SQ ring is not full, and
 * EPOLLIN | EPOLLRDNORM when CQ events are available or the CQ overflow
 * flag is set.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t mask = 0;

        poll_wait(file, &ctx->poll_wait, wait);
        /*
         * synchronizes with barrier from wq_has_sleeper call in
         * io_commit_cqring
         */
        smp_rmb();
        if (!io_sqring_full(ctx))
                mask |= EPOLLOUT | EPOLLWRNORM;

        /*
         * Don't flush cqring overflow list here, just do a simple check.
         * Otherwise there could possibly be an ABBA deadlock:
         *      CPU0                    CPU1
         *      ----                    ----
         * lock(&ctx->uring_lock);
         *                              lock(&ep->mtx);
         *                              lock(&ctx->uring_lock);
         * lock(&ep->mtx);
         *
         * Users may get EPOLLIN while seeing nothing in the cqring; this
         * pushes them to do the flush.
         */
        if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
                mask |= EPOLLIN | EPOLLRDNORM;

        return mask;
}

/*
 * Remove the personality registered under @id from ctx->personalities and
 * drop the credential reference it held.
 * Returns 0 on success or -EINVAL if no personality has that id.
 */
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *creds = xa_erase(&ctx->personalities, id);

        if (!creds)
                return -EINVAL;

        put_cred(creds);
        return 0;
}

/*
 * Work item used by io_ring_exit_work() to ask a task to drop its tctx
 * node for a ring that is going away; see io_tctx_exit_cb().
 */
struct io_tctx_exit {
        /* task_work entry; its callback is io_tctx_exit_cb() */
        struct callback_head                task_work;
        /* completed by the callback once it has run */
        struct completion                completion;
        /* the ring being torn down */
        struct io_ring_ctx                *ctx;
};

/*
 * task_work callback queued by io_ring_exit_work(): runs in the context of
 * a task that still has a tctx node for the exiting ring, removes that
 * node when it is safe to do so, and signals completion to the waiter.
 */
static void io_tctx_exit_cb(struct callback_head *cb)
{
        struct io_tctx_exit *exit_work =
                container_of(cb, struct io_tctx_exit, task_work);
        struct io_uring_task *tctx = current->io_uring;

        /*
         * tctx can be NULL if queueing this task_work raced with work
         * cancelation off the exec path. When @in_idle is set we are in
         * cancellation and removing the node would be racy; it gets
         * removed at the end of cancellation, so just skip it here.
         */
        if (tctx) {
                if (!atomic_read(&tctx->in_idle))
                        io_uring_del_tctx_node((unsigned long)exit_work->ctx);
        }
        complete(&exit_work->completion);
}

/*
 * io-wq cancel match callback: returns true when the request embedding
 * @work belongs to the ring passed as @data.
 */
static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);

        return req->ctx == data;
}

/*
 * Deferred ring teardown, queued by io_ring_ctx_wait_and_kill().
 *
 * Phase 1: keep cancelling requests until ctx->ref_comp completes.
 * Phase 2: for every task still on ctx->tctx_list, queue task_work that
 *          drops its tctx node and wait for it to run.
 * Phase 3: pass through completion_lock once and free the ring.
 */
static void io_ring_exit_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        unsigned long timeout = jiffies + HZ * 60 * 5;
        unsigned long interval = HZ / 20;
        struct io_tctx_exit exit;
        struct io_tctx_node *node;
        int ret;

        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
         * we're waiting for refs to drop. We need to reap these manually,
         * as nobody else will be looking for them.
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, true);
                if (ctx->sq_data) {
                        struct io_sq_data *sqd = ctx->sq_data;
                        struct task_struct *tsk;

                        /* cancel this ring's work in the SQ thread's io-wq */
                        io_sq_thread_park(sqd);
                        tsk = sqd->thread;
                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
                                io_wq_cancel_cb(tsk->io_uring->io_wq,
                                                io_cancel_ctx_cb, ctx, true);
                        io_sq_thread_unpark(sqd);
                }

                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
                }
                /*
                 * This is really an uninterruptible wait, as it has to be
                 * complete. But it's also run from a kworker, which doesn't
                 * take signals, so it's fine to make it interruptible. This
                 * avoids scenarios where we knowingly can wait much longer
                 * on completions, for example if someone does a SIGSTOP on
                 * a task that needs to finish task_work to make this loop
                 * complete. That's a synthetic situation that should not
                 * cause a stuck task backtrace, and hence a potential panic
                 * on stuck tasks if that is enabled.
                 */
        } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));

        init_completion(&exit.completion);
        init_task_work(&exit.task_work, io_tctx_exit_cb);
        exit.ctx = ctx;
        /*
         * Some may use context even when all refs and requests have been put,
         * and they are free to do so while still holding uring_lock or
         * completion_lock, see io_req_task_submit(). Apart from other work,
         * this lock/unlock section also waits them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->tctx_list)) {
                WARN_ON_ONCE(time_after(jiffies, timeout));

                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                        ctx_node);
                /* don't spin on a single task if cancellation failed */
                list_rotate_left(&ctx->tctx_list);
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;
                wake_up_process(node->task);

                /* the callback needs to make progress without uring_lock held */
                mutex_unlock(&ctx->uring_lock);
                /*
                 * See comment above for
                 * wait_for_completion_interruptible_timeout() on why this
                 * wait is marked as interruptible.
                 */
                wait_for_completion_interruptible(&exit.completion);
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
        /* likewise wait out any remaining completion_lock holder */
        spin_lock(&ctx->completion_lock);
        spin_unlock(&ctx->completion_lock);

        io_ring_ctx_free(ctx);
}

/*
 * Cancel, with -ECANCELED, every timeout on ctx->timeout_list that
 * io_match_task() matches for @tsk / @cancel_all.
 *
 * The list is walked with completion_lock and timeout_lock held. If
 * anything was killed, the CQ ring is committed before completion_lock is
 * dropped and waiters are notified afterwards.
 *
 * Returns true if we found and killed one or more timeouts.
 */
static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
                             bool cancel_all)
{
        struct io_kiocb *req, *next;
        bool killed = false;

        spin_lock(&ctx->completion_lock);
        spin_lock_irq(&ctx->timeout_lock);
        list_for_each_entry_safe(req, next, &ctx->timeout_list, timeout.list) {
                if (!io_match_task(req, tsk, cancel_all))
                        continue;
                io_kill_timeout(req, -ECANCELED);
                killed = true;
        }
        spin_unlock_irq(&ctx->timeout_lock);
        if (killed)
                io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        if (killed)
                io_cqring_ev_posted(ctx);
        return killed;
}

/*
 * Start tearing down @ctx.
 *
 * Under uring_lock: kill the ctx reference counter, flush overflowed
 * completions (only when the rings exist) and unregister every personality.
 * Afterwards cancel all timeouts and poll requests, reap iopoll events, drop
 * the current task's cached tctx references and queue io_ring_exit_work() on
 * system_unbound_wq to finish the teardown asynchronously.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
        unsigned long index;
        /*
         * Cursor for xa_for_each() only; the personalities xarray stores
         * credentials (see io_register_personality()).
         */
        const struct cred *creds;

        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        if (ctx->rings)
                __io_cqring_overflow_flush(ctx, true);
        xa_for_each(&ctx->personalities, index, creds)
                io_unregister_personality(ctx, index);
        mutex_unlock(&ctx->uring_lock);

        /* NULL task + cancel_all: match requests regardless of owner */
        io_kill_timeouts(ctx, NULL, true);
        io_poll_remove_all(ctx, NULL, true);

        /* if we failed setting up the ctx, we might not have any rings */
        io_iopoll_try_reap_events(ctx);

        /* drop cached put refs after potentially doing completions */
        if (current->io_uring)
                io_uring_drop_tctx_refs(current);

        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
        /*
         * Use system_unbound_wq to avoid spawning tons of event kworkers
         * if we're exiting a ton of rings at the same time. It just adds
         * noise and overhead, there's no discernable change in runtime
         * over using system_wq.
         */
        queue_work(system_unbound_wq, &ctx->exit_work);
}

/*
 * ->release() handler for io_uring files: detach the ring context from the
 * file and start its teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
        struct io_ring_ctx *ring = file->private_data;

        /* Clear the back-pointer before handing the ctx to teardown. */
        file->private_data = NULL;
        io_ring_ctx_wait_and_kill(ring);

        return 0;
}

/*
 * Match criteria handed to io_cancel_task_cb() through io_wq_cancel_cb()'s
 * opaque data pointer.
 */
struct io_task_cancel {
        /* task whose requests should be matched */
        struct task_struct *task;
        /* forwarded as the cancel_all argument of io_match_task_safe() */
        bool all;
};

static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_task_cancel *cancel = data;

        return io_match_task_safe(req, cancel->task, cancel->all);
}

/*
 * Fail deferred requests of @ctx with -ECANCELED.
 *
 * The defer list is scanned from its tail for the last entry matching
 * @task / @cancel_all; list_cut_position() then detaches the list from its
 * head up to and including that entry. The detached entries are completed
 * and freed outside of completion_lock.
 *
 * Returns true if anything was cancelled.
 */
static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
                                  struct task_struct *task, bool cancel_all)
{
        struct io_defer_entry *entry;
        LIST_HEAD(doomed);

        spin_lock(&ctx->completion_lock);
        list_for_each_entry_reverse(entry, &ctx->defer_list, list) {
                if (!io_match_task_safe(entry->req, task, cancel_all))
                        continue;
                list_cut_position(&doomed, &ctx->defer_list, &entry->list);
                break;
        }
        spin_unlock(&ctx->completion_lock);

        if (list_empty(&doomed))
                return false;

        /* The list is known to be non-empty here, so run at least once. */
        do {
                entry = list_first_entry(&doomed, struct io_defer_entry, list);
                list_del_init(&entry->list);
                io_req_complete_failed(entry->req, -ECANCELED);
                kfree(entry);
        } while (!list_empty(&doomed));

        return true;
}

/*
 * Ask the io-wq of every task attached to @ctx to cancel work belonging to
 * this ctx. Returns true if any io-wq reported something other than
 * IO_WQ_CANCEL_NOTFOUND.
 */
static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
        struct io_tctx_node *node;
        bool found = false;

        mutex_lock(&ctx->uring_lock);
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;
                enum io_wq_cancel cret;

                /*
                 * io_wq will stay alive while we hold uring_lock, because it's
                 * killed after ctx nodes, which requires to take the lock.
                 */
                if (tctx && tctx->io_wq) {
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb,
                                               ctx, true);
                        if (cret != IO_WQ_CANCEL_NOTFOUND)
                                found = true;
                }
        }
        mutex_unlock(&ctx->uring_lock);

        return found;
}

/*
 * Repeatedly try to cancel requests on @ctx until one full pass makes no
 * progress.
 *
 * @task:       task whose requests are targeted; NULL means the io-wq of
 *              every task attached to @ctx is asked to cancel (see
 *              io_uring_try_cancel_iowq()).
 * @cancel_all: forwarded to the individual cancellation helpers.
 *
 * Each pass covers io-wq work, iopoll reaping, deferred requests, poll
 * requests and timeouts, and additionally runs pending task work when
 * @task is set.
 */
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all)
{
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;

        while (1) {
                enum io_wq_cancel cret;
                /* set whenever any step of this pass did some work */
                bool ret = false;

                if (!task) {
                        ret |= io_uring_try_cancel_iowq(ctx);
                } else if (tctx && tctx->io_wq) {
                        /*
                         * Cancels requests of all rings, not only @ctx, but
                         * it's fine as the task is in exit/exec.
                         */
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }

                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
                        while (!list_empty_careful(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                                cond_resched();
                        }
                }

                ret |= io_cancel_defer_files(ctx, task, cancel_all);
                ret |= io_poll_remove_all(ctx, task, cancel_all);
                ret |= io_kill_timeouts(ctx, task, cancel_all);
                if (task)
                        ret |= io_run_task_work();
                /* nothing left to cancel in this pass: we are done */
                if (!ret)
                        break;
                cond_resched();
        }
}

/*
 * Slow path of io_uring_add_tctx_node(): make sure the current task has an
 * io_uring task context and that @ctx is recorded in it.
 *
 * Allocates the task context on first use (applying the ctx's io-wq worker
 * limits if they were set), then, if @ctx is not yet present in tctx->xa,
 * allocates a node, stores it in the xarray keyed by the ctx pointer and
 * links it into ctx->tctx_list under uring_lock. Finally caches @ctx in
 * tctx->last.
 *
 * Returns 0 on success or a negative errno.
 */
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;
        int ret;

        if (unlikely(!tctx)) {
                ret = io_uring_alloc_task_context(current, ctx);
                if (unlikely(ret))
                        return ret;

                /* re-read: the allocation above installed current->io_uring */
                tctx = current->io_uring;
                if (ctx->iowq_limits_set) {
                        /* pass a copy so the ctx's stored limits stay untouched */
                        unsigned int limits[2] = { ctx->iowq_limits[0],
                                                   ctx->iowq_limits[1], };

                        ret = io_wq_max_workers(tctx->io_wq, limits);
                        if (ret)
                                return ret;
                }
        }
        /* the ctx pointer value itself is the xarray index */
        if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
                node = kmalloc(sizeof(*node), GFP_KERNEL);
                if (!node)
                        return -ENOMEM;
                node->ctx = ctx;
                node->task = current;

                ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
                                        node, GFP_KERNEL));
                if (ret) {
                        kfree(node);
                        return ret;
                }

                mutex_lock(&ctx->uring_lock);
                list_add(&node->ctx_node, &ctx->tctx_list);
                mutex_unlock(&ctx->uring_lock);
        }
        /* remember the ctx so io_uring_add_tctx_node() can take its fast path */
        tctx->last = ctx;
        return 0;
}

/*
 * Record that the current task uses @ctx, which is needed for cancelation.
 * The common case, where @ctx is the ring this task used last, returns
 * immediately; everything else goes through __io_uring_add_tctx_node().
 */
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;

        if (unlikely(!tctx || tctx->last != ctx))
                return __io_uring_add_tctx_node(ctx);
        return 0;
}

/*
 * Remove the ctx -> task mapping stored at @index in the current task's
 * io_uring context. @index is the xarray key, i.e. the ctx pointer value
 * used by __io_uring_add_tctx_node(). Does nothing if the task has no
 * io_uring context or no node is stored at @index.
 */
static void io_uring_del_tctx_node(unsigned long index)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;

        if (!tctx)
                return;
        node = xa_erase(&tctx->xa, index);
        if (!node)
                return;

        /* sanity checks: the node must belong to us and still be linked */
        WARN_ON_ONCE(current != node->task);
        WARN_ON_ONCE(list_empty(&node->ctx_node));

        /* ctx->tctx_list is protected by the ctx's uring_lock */
        mutex_lock(&node->ctx->uring_lock);
        list_del(&node->ctx_node);
        mutex_unlock(&node->ctx->uring_lock);

        /* drop the fast-path cache if it pointed at the ctx being removed */
        if (tctx->last == node->ctx)
                tctx->last = NULL;
        kfree(node);
}

/*
 * Drop every ctx node registered in @tctx and then shut down its io-wq, if
 * it has one.
 */
static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
        /* snapshot taken before the nodes are removed */
        struct io_wq *wq = tctx->io_wq;
        struct io_tctx_node *node;
        unsigned long index;

        xa_for_each(&tctx->xa, index, node) {
                io_uring_del_tctx_node(index);
                cond_resched();
        }
        if (wq) {
                /*
                 * Must be after io_uring_del_tctx_node() (removes nodes under
                 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
                 */
                io_wq_put_and_exit(wq);
                tctx->io_wq = NULL;
        }
}

/*
 * Number of in-flight requests of @tctx: the inflight_tracked atomic when
 * @tracked is set, otherwise the sum of the inflight per-cpu counter.
 */
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
        return tracked ? atomic_read(&tctx->inflight_tracked) :
                         percpu_counter_sum(&tctx->inflight);
}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 *
 * @cancel_all selects which in-flight count is waited on: the per-cpu
 * "inflight" counter when true, the "inflight_tracked" atomic otherwise (see
 * tctx_inflight()). The loop keeps cancelling and waiting until that count
 * drops to zero; afterwards the task's ctx nodes and io-wq are cleaned up,
 * and for @cancel_all the task context itself is freed.
 */
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* task never used io_uring: nothing to cancel */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        atomic_inc(&tctx->in_idle);
        do {
                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        struct io_tctx_node *node;
                        unsigned long index;

                        /* regular task: walk every ctx this task is attached to */
                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                io_uring_try_cancel_requests(node->ctx, current,
                                                             cancel_all);
                        }
                } else {
                        /* SQPOLL thread: walk the ctxs attached to its sq data */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_uring_try_cancel_requests(ctx, current,
                                                             cancel_all);
                }

                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
                io_run_task_work();
                io_uring_drop_tctx_refs(current);

                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
                finish_wait(&tctx->wait, &wait);
        } while (1);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /*
                 * We shouldn't run task_works after cancel, so just leave
                 * ->in_idle set for normal exit.
                 */
                atomic_dec(&tctx->in_idle);
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}

/*
 * Cancel the current task's io_uring requests. This is the non-SQPOLL entry
 * point: it calls io_uring_cancel_generic() with no sq data.
 */
void __io_uring_cancel(bool cancel_all)
{
        io_uring_cancel_generic(cancel_all, NULL);
}

/*
 * Translate an mmap request on an io_uring file into the kernel address of
 * the region being mapped.
 *
 * @pgoff is the page offset of the mapping; shifted by PAGE_SHIFT it must
 * equal one of the IORING_OFF_* magic offsets. @sz is the requested length
 * and may not exceed the size of the page backing the region.
 *
 * Returns the kernel pointer, or ERR_PTR(-EINVAL) for an unknown offset or
 * an oversized request.
 */
static void *io_uring_validate_mmap_request(struct file *file,
                                            loff_t pgoff, size_t sz)
{
        struct io_ring_ctx *ctx = file->private_data;
        loff_t offset = pgoff << PAGE_SHIFT;
        struct page *head;
        void *base;

        /* SQ and CQ ring offsets both resolve to the shared rings area. */
        if (offset == IORING_OFF_SQ_RING || offset == IORING_OFF_CQ_RING)
                base = ctx->rings;
        else if (offset == IORING_OFF_SQES)
                base = ctx->sq_sqes;
        else
                return ERR_PTR(-EINVAL);

        head = virt_to_head_page(base);
        if (sz > page_size(head))
                return ERR_PTR(-EINVAL);

        return base;
}

#ifdef CONFIG_MMU

/*
 * ->mmap() handler (MMU build): validate the request and map the matching
 * kernel region into @vma. Returns 0 or a negative errno.
 */
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        size_t len = vma->vm_end - vma->vm_start;
        unsigned long pfn;
        void *kaddr;

        kaddr = io_uring_validate_mmap_request(file, vma->vm_pgoff, len);
        if (IS_ERR(kaddr))
                return PTR_ERR(kaddr);

        /* remap_pfn_range() wants a page frame number, not an address */
        pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;

        return remap_pfn_range(vma, vma->vm_start, pfn, len,
                               vma->vm_page_prot);
}

#else /* !CONFIG_MMU */

/*
 * ->mmap() handler (!MMU build): only accept mappings that have VM_SHARED
 * or VM_MAYSHARE set; anything else is rejected with -EINVAL.
 */
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
                return 0;

        return -EINVAL;
}

/*
 * ->mmap_capabilities() handler (!MMU build): report that the file supports
 * direct mappings with read and write access.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

/*
 * ->get_unmapped_area() handler (!MMU build): return the kernel address of
 * the region selected by @pgoff / @len, or the negative errno produced by
 * io_uring_validate_mmap_request().
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        if (IS_ERR(region))
                return PTR_ERR(region);

        return (unsigned long) region;
}

#endif /* !CONFIG_MMU */

/*
 * Sleep interruptibly on ctx->sqo_sq_wait until the SQ ring is no longer
 * full or a signal is pending for the current task. Always returns 0.
 */
static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
        DEFINE_WAIT(wait);

        while (io_sqring_full(ctx)) {
                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

                /* re-check after queueing ourselves on the waitqueue */
                if (!io_sqring_full(ctx))
                        break;
                schedule();

                if (signal_pending(current))
                        break;
        }

        finish_wait(&ctx->sqo_sq_wait, &wait);
        return 0;
}

/*
 * Decode the argp/argsz pair passed to io_uring_enter().
 *
 * Without IORING_ENTER_EXT_ARG, @argp is the sigset_t pointer itself and
 * there is no timespec; *argsz is left untouched. With it, @argp points to
 * a struct io_uring_getevents_arg whose size must equal *argsz and whose
 * pad field must be zero; *sig, *ts and *argsz (the sigmask size) are
 * filled in from that structure.
 *
 * Returns 0 on success, -EINVAL on a size or padding mismatch, -EFAULT if
 * the structure cannot be copied from user space.
 */
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
                          struct __kernel_timespec __user **ts,
                          const sigset_t __user **sig)
{
        struct io_uring_getevents_arg ext;

        if (flags & IORING_ENTER_EXT_ARG) {
                /* both sides must agree on the layout of the extended arg */
                if (*argsz != sizeof(ext))
                        return -EINVAL;
                if (copy_from_user(&ext, argp, sizeof(ext)))
                        return -EFAULT;
                if (ext.pad)
                        return -EINVAL;

                *sig = u64_to_user_ptr(ext.sigmask);
                *argsz = ext.sigmask_sz;
                *ts = u64_to_user_ptr(ext.ts);
        } else {
                /* legacy form: argp is just a pointer to the sigset_t */
                *sig = (const sigset_t __user *) argp;
                *ts = NULL;
        }

        return 0;
}

/*
 * io_uring_enter() system call: submit SQEs and/or wait for completions on
 * the ring behind @fd.
 *
 * @to_submit:    number of SQEs to submit
 * @min_complete: number of completions to wait for when
 *                IORING_ENTER_GETEVENTS is set (clamped to the CQ size)
 * @flags:        IORING_ENTER_* flags; unknown bits yield -EINVAL
 * @argp, @argsz: sigset_t pointer and size, or an extended argument
 *                structure when IORING_ENTER_EXT_ARG is set (see
 *                io_get_ext_arg())
 *
 * Returns the number of submitted SQEs if that is non-zero, otherwise the
 * result of the last step (0 or a negative errno).
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        int submitted = 0;
        struct fd f;
        long ret;

        io_run_task_work();

        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
                return -EINVAL;

        f = fdget(fd);
        if (unlikely(!f.file))
                return -EBADF;

        /* the fd must refer to an io_uring file */
        ret = -EOPNOTSUPP;
        if (unlikely(f.file->f_op != &io_uring_fops))
                goto out_fput;

        /* pin the ctx for the duration of the call */
        ret = -ENXIO;
        ctx = f.file->private_data;
        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                goto out_fput;

        /* rings created with IORING_SETUP_R_DISABLED and not yet enabled */
        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx);

                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT) {
                        ret = io_sqpoll_wait_sq(ctx);
                        if (ret)
                                goto out;
                }
                submitted = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;
                mutex_lock(&ctx->uring_lock);
                submitted = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);

                /* short submission: skip waiting and report what we have */
                if (submitted != to_submit)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                const sigset_t __user *sig;
                struct __kernel_timespec __user *ts;

                ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                if (unlikely(ret))
                        goto out;

                min_complete = min(min_complete, ctx->cq_entries);

                /*
                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
                 * space applications don't need to do io completion events
                 * polling again, they can rely on io_sq_thread to do polling
                 * work, which can reduce cpu usage and uring_lock contention.
                 */
                if (ctx->flags & IORING_SETUP_IOPOLL &&
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        fdput(f);
        /* a non-zero submit count takes precedence over ret */
        return submitted ? submitted : ret;
}

#ifdef CONFIG_PROC_FS
/*
 * Print one registered personality to the fdinfo seq_file @m: its @id,
 * followed by the uid/gid sets, supplementary groups and effective
 * capabilities of @cred. IDs are translated into the user namespace of the
 * seq_file. Always returns 0.
 */
static int io_uring_show_cred(struct seq_file *m, unsigned int id,
                const struct cred *cred)
{
        struct user_namespace *uns = seq_user_ns(m);
        struct group_info *gi;
        kernel_cap_t cap;
        unsigned __capi;
        int g;

        seq_printf(m, "%5d\n", id);
        /* real, effective, saved and filesystem uid, in that order */
        seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
        /* same four fields for the gid */
        seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
        seq_puts(m, "\n\tGroups:\t");
        gi = cred->group_info;
        for (g = 0; g < gi->ngroups; g++) {
                /* space-separate every group after the first */
                seq_put_decimal_ull(m, g ? " " : "",
                                        from_kgid_munged(uns, gi->gid[g]));
        }
        seq_puts(m, "\n\tCapEff:\t");
        cap = cred->cap_effective;
        /* emit the capability words from the highest index down, 8 hex digits each */
        CAP_FOR_EACH_U32(__capi)
                seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
        seq_putc(m, '\n');
        return 0;
}

/*
 * Emit the fdinfo text for @ctx into @m: SQ thread pid/cpu, registered
 * files, registered buffers, personalities and the poll hash list.
 *
 * uring_lock is only try-locked; sections that need it are skipped (or
 * print their defaults) when the lock could not be taken.
 */
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
        /* -1 is printed when the SQ thread information is unavailable */
        int sq_pid = -1, sq_cpu = -1;
        bool has_lock;
        int i;

        /*
         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
         * since fdinfo case grabs it in the opposite direction of normal use
         * cases. If we fail to get the lock, we just don't iterate any
         * structures that could be going away outside the io_uring mutex.
         */
        has_lock = mutex_trylock(&ctx->uring_lock);

        if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                struct io_sq_data *sq = ctx->sq_data;

                /* also only try-locked; on failure the -1 defaults are kept */
                if (mutex_trylock(&sq->lock)) {
                        if (sq->thread) {
                                sq_pid = task_pid_nr(sq->thread);
                                sq_cpu = task_cpu(sq->thread);
                        }
                        mutex_unlock(&sq->lock);
                }
        }

        seq_printf(m, "SqThread:\t%d\n", sq_pid);
        seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
        seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
        for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
                struct file *f = io_file_from_index(ctx, i);

                if (f)
                        seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
                else
                        seq_printf(m, "%5u: <none>\n", i);
        }
        seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
        for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
                struct io_mapped_ubuf *buf = ctx->user_bufs[i];
                unsigned int len = buf->ubuf_end - buf->ubuf;

                /* printed as start address / length */
                seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
        }
        if (has_lock && !xa_empty(&ctx->personalities)) {
                unsigned long index;
                const struct cred *cred;

                seq_printf(m, "Personalities:\n");
                xa_for_each(&ctx->personalities, index, cred)
                        io_uring_show_cred(m, index, cred);
        }
        seq_printf(m, "PollList:\n");
        /* the hash walk runs under completion_lock, with or without uring_lock */
        spin_lock(&ctx->completion_lock);
        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
                struct hlist_head *list = &ctx->cancel_hash[i];
                struct io_kiocb *req;

                hlist_for_each_entry(req, list, hash_node)
                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
                                        req->task->task_works != NULL);
        }
        spin_unlock(&ctx->completion_lock);
        if (has_lock)
                mutex_unlock(&ctx->uring_lock);
}

/*
 * ->show_fdinfo() handler: print ring details, but only if a reference on
 * the ctx can still be obtained; otherwise emit nothing.
 */
static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct io_ring_ctx *ctx = f->private_data;

        if (!percpu_ref_tryget(&ctx->refs))
                return;

        __io_uring_show_fdinfo(ctx, m);
        percpu_ref_put(&ctx->refs);
}
#endif

/*
 * File operations of an io_uring instance file. Comparing a file's f_op
 * against this table is how io_uring files are recognised (see
 * io_is_uring_fops() and io_uring_enter()).
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .mmap                = io_uring_mmap,
#ifndef CONFIG_MMU
        /* extra hooks only provided on !MMU builds */
        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll                = io_uring_poll,
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = io_uring_show_fdinfo,
#endif
};

/* Returns true if @file is an io_uring file, i.e. it uses io_uring_fops. */
bool io_is_uring_fops(struct file *file)
{
        return file->f_op == &io_uring_fops;
}

/*
 * Allocate the shared rings structure and the SQE array for @ctx using the
 * entry counts in @p, and initialise the ring masks and entry counts.
 *
 * Returns 0 on success, -EOVERFLOW if a size computation saturates, or
 * -ENOMEM if an allocation fails. On any failure after the rings were
 * allocated they are freed again and ctx->rings is reset to NULL.
 */
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                                  struct io_uring_params *p)
{
        size_t ring_bytes, sqe_bytes, sq_array_offset;
        struct io_rings *rings;
        int err;

        /* make sure these are sane, as we already accounted them */
        ctx->sq_entries = p->sq_entries;
        ctx->cq_entries = p->cq_entries;

        ring_bytes = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
        if (ring_bytes == SIZE_MAX)
                return -EOVERFLOW;

        rings = io_mem_alloc(ring_bytes);
        if (!rings)
                return -ENOMEM;

        ctx->rings = rings;
        /* the SQ index array lives inside the rings allocation */
        ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
        rings->sq_ring_entries = p->sq_entries;
        rings->cq_ring_entries = p->cq_entries;
        rings->sq_ring_mask = p->sq_entries - 1;
        rings->cq_ring_mask = p->cq_entries - 1;

        sqe_bytes = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
        if (sqe_bytes == SIZE_MAX) {
                err = -EOVERFLOW;
                goto free_rings;
        }

        ctx->sq_sqes = io_mem_alloc(sqe_bytes);
        if (!ctx->sq_sqes) {
                err = -ENOMEM;
                goto free_rings;
        }

        return 0;

free_rings:
        io_mem_free(ctx->rings);
        ctx->rings = NULL;
        return err;
}

/*
 * Reserve a file descriptor, register @ctx with the current task and then
 * install @file on the descriptor.
 *
 * Returns the new fd, or a negative errno, in which case the reserved fd
 * (if any) has been released and @file was not installed.
 */
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
        int fd, err;

        fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        if (fd < 0)
                return fd;

        err = io_uring_add_tctx_node(ctx);
        if (!err) {
                fd_install(fd, file);
                return fd;
        }

        put_unused_fd(fd);
        return err;
}

/*
 * Create the anonymous-inode file that backs an io_uring instance towards
 * the application; the application mmaps the fd installed for this file to
 * gain access to the SQ/CQ ring details. The file uses io_uring_fops and
 * carries @ctx as its private data.
 *
 * Returns whatever anon_inode_getfile() returns; callers check it with
 * IS_ERR().
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
        return anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                  O_RDWR | O_CLOEXEC);
}

/*
 * Create a new io_uring instance.
 *
 * @entries: requested number of SQ entries
 * @p:       kernel copy of the setup parameters; the resulting ring sizes,
 *           ring offsets and feature flags are written into it
 * @params:  user pointer that the completed @p is copied back to
 *
 * Returns the new ring file descriptor on success or a negative errno.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p,
                           struct io_uring_params __user *params)
{
        struct io_ring_ctx *ctx;
        struct file *file;
        int ret;

        if (!entries)
                return -EINVAL;
        /* oversized requests are only accepted (and clamped) with SETUP_CLAMP */
        if (entries > IORING_MAX_ENTRIES) {
                if (!(p->flags & IORING_SETUP_CLAMP))
                        return -EINVAL;
                entries = IORING_MAX_ENTRIES;
        }

        /*
         * Use twice as many entries for the CQ ring. It's possible for the
         * application to drive a higher depth than the size of the SQ ring,
         * since the sqes are only used at submission time. This allows for
         * some flexibility in overcommitting a bit. If the application has
         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
         * of CQ ring entries manually.
         */
        p->sq_entries = roundup_pow_of_two(entries);
        if (p->flags & IORING_SETUP_CQSIZE) {
                /*
                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
                if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
                p->cq_entries = roundup_pow_of_two(p->cq_entries);
                /* the CQ ring may not be smaller than the SQ ring */
                if (p->cq_entries < p->sq_entries)
                        return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }

        ctx = io_ring_ctx_alloc(p);
        if (!ctx)
                return -ENOMEM;
        ctx->compat = in_compat_syscall();
        /* ctx->user is only set for callers lacking CAP_IPC_LOCK */
        if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
                ctx->user = get_uid(current_user());

        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
         * on to this mm purely for the purposes of being able to unaccount
         * memory (locked/pinned vm). It's not used for anything else.
         */
        mmgrab(current->mm);
        ctx->mm_account = current->mm;

        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;

        ret = io_sq_offload_create(ctx, p);
        if (ret)
                goto err;
        /* always set a rsrc node */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                goto err;
        io_rsrc_node_switch(ctx, NULL);

        /* report the offsets of the SQ ring fields within the rings mapping */
        memset(&p->sq_off, 0, sizeof(p->sq_off));
        p->sq_off.head = offsetof(struct io_rings, sq.head);
        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
        p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

        /* ... and likewise for the CQ ring fields */
        memset(&p->cq_off, 0, sizeof(p->cq_off));
        p->cq_off.head = offsetof(struct io_rings, cq.head);
        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
        p->cq_off.cqes = offsetof(struct io_rings, cqes);
        p->cq_off.flags = offsetof(struct io_rings, cq_flags);

        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
                        IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
                        IORING_FEAT_RSRC_TAGS;

        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
                goto err;
        }

        file = io_uring_get_file(ctx);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto err;
        }

        /*
         * Install ring fd as the very last thing, so we don't risk someone
         * having closed it before we finish setup
         */
        ret = io_uring_install_fd(ctx, file);
        if (ret < 0) {
                /* fput will clean it up */
                fput(file);
                return ret;
        }

        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
err:
        /* no file exists yet on this path, so tear the ctx down directly */
        io_ring_ctx_wait_and_kill(ctx);
        return ret;
}

/*
 * Set up an io_uring context and return its fd. The application asks for a
 * ring size; the actual sq/cq ring sizes (among other things) are returned
 * in the params structure passed in.
 *
 * Fails with -EFAULT if @params cannot be read, and with -EINVAL if any
 * reserved field is non-zero or an unknown setup flag is set.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
        const unsigned int known_flags =
                        IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
                        IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
                        IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
                        IORING_SETUP_R_DISABLED;
        struct io_uring_params p;
        int idx;

        if (copy_from_user(&p, params, sizeof(p)))
                return -EFAULT;

        /* reserved fields must be zero */
        for (idx = 0; idx < ARRAY_SIZE(p.resv); idx++)
                if (p.resv[idx])
                        return -EINVAL;

        if (p.flags & ~known_flags)
                return -EINVAL;

        return io_uring_create(entries, &p, params);
}

/* io_uring_setup() system call entry point; all work is done by the helper. */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
                struct io_uring_params __user *, params)
{
        return io_uring_setup(entries, params);
}

/*
 * Fill in a struct io_uring_probe for userspace.
 *
 * @arg must point to a zero-filled probe structure with room for @nr_args
 * ops entries. At most IORING_OP_LAST entries are filled in; each gets its
 * opcode number and, unless io_op_defs[] marks it not_supported, the
 * IO_URING_OP_SUPPORTED flag. The whole buffer is then copied back.
 *
 * Returns 0 on success, -EOVERFLOW if the buffer size overflows, -ENOMEM on
 * allocation failure, -EFAULT on a failed user copy, or -EINVAL if the
 * user buffer was not all zeroes.
 */
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
        struct io_uring_probe *probe;
        size_t bytes;
        int op, ret;

        bytes = struct_size(probe, ops, nr_args);
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        probe = kzalloc(bytes, GFP_KERNEL);
        if (!probe)
                return -ENOMEM;

        if (copy_from_user(probe, arg, bytes)) {
                ret = -EFAULT;
                goto out;
        }
        /* userspace must hand in an all-zero buffer */
        if (memchr_inv(probe, 0, bytes)) {
                ret = -EINVAL;
                goto out;
        }

        probe->last_op = IORING_OP_LAST - 1;
        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        for (op = 0; op < nr_args; op++) {
                probe->ops[op].op = op;
                if (io_op_defs[op].not_supported)
                        continue;
                probe->ops[op].flags = IO_URING_OP_SUPPORTED;
        }
        probe->ops_len = op;

        ret = copy_to_user(arg, probe, bytes) ? -EFAULT : 0;
out:
        kfree(probe);
        return ret;
}

/*
 * Store the current task's credentials in ctx->personalities and return the
 * id they were stored under (allocated cyclically in the range 0..USHRT_MAX).
 *
 * On allocation failure the creds obtained from get_current_cred() are
 * released again with put_cred() and the negative error is returned.
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds = get_current_cred();
        u32 id;
        int err;

        err = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (err >= 0)
                return id;

        put_cred(creds);
        return err;
}

/*
 * Record the restriction set described by the @nr_args entries at @arg.
 *
 * Only accepted while IORING_SETUP_R_DISABLED is still set on the ring, and
 * only once per ring. Each entry either sets a bit in the register-opcode
 * bitmap, sets a bit in the SQE-opcode bitmap, or stores the allowed/required
 * SQE flag mask. If any entry is invalid, the whole ctx->restrictions state is
 * zeroed again.
 *
 * Returns 0 on success, -EBADFD if the ring is not in the disabled state,
 * -EBUSY if restrictions were already registered, -EINVAL on bad input,
 * -EOVERFLOW if the array size overflows, or the memdup_user() error.
 */
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
                                    unsigned int nr_args)
{
        struct io_uring_restriction *entries;
        size_t bytes;
        int idx, ret;

        /* restrictions may only be set up before the rings are enabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* a second registration is refused */
        if (ctx->restrictions.registered)
                return -EBUSY;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        bytes = array_size(nr_args, sizeof(*entries));
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        entries = memdup_user(arg, bytes);
        if (IS_ERR(entries))
                return PTR_ERR(entries);

        /* assume failure until every entry has been accepted */
        ret = -EINVAL;
        for (idx = 0; idx < nr_args; idx++) {
                struct io_uring_restriction *r = &entries[idx];

                switch (r->opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        if (r->register_op >= IORING_REGISTER_LAST)
                                goto out;
                        __set_bit(r->register_op,
                                  ctx->restrictions.register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        if (r->sqe_op >= IORING_OP_LAST)
                                goto out;
                        __set_bit(r->sqe_op, ctx->restrictions.sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        ctx->restrictions.sqe_flags_allowed = r->sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        ctx->restrictions.sqe_flags_required = r->sqe_flags;
                        break;
                default:
                        goto out;
                }
        }
        ret = 0;

out:
        /* a failed registration leaves no partial state behind */
        if (ret == 0)
                ctx->restrictions.registered = true;
        else
                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));

        kfree(entries);
        return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}

/*
 * Common back end for file and buffer resource updates.
 *
 * Rejects the request with -EOVERFLOW if up->offset + nr_args does not fit in
 * 32 bits, calls io_rsrc_node_switch_start() and then dispatches on @type to
 * the file or buffer update helper. Unknown types yield -EINVAL.
 */
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args)
{
        __u32 end;
        int ret;

        /* only the overflow result matters; the sum itself is unused */
        if (check_add_overflow(up->offset, nr_args, &end))
                return -EOVERFLOW;

        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;

        if (type == IORING_RSRC_FILE)
                return __io_sqe_files_update(ctx, up, nr_args);
        if (type == IORING_RSRC_BUFFER)
                return __io_sqe_buffers_update(ctx, up, nr_args);

        return -EINVAL;
}

/*
 * Handler for IORING_REGISTER_FILES_UPDATE.
 *
 * Only sizeof(struct io_uring_rsrc_update) bytes are read from @arg; the rest
 * of the local update2 structure stays zeroed before it is passed on as a
 * file update covering @nr_args entries.
 *
 * Returns -EINVAL for a zero @nr_args or non-zero reserved fields, -EFAULT on
 * a failed user copy, otherwise the result of __io_register_rsrc_update().
 */
static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
                                    unsigned nr_args)
{
        struct io_uring_rsrc_update2 up;

        if (nr_args == 0)
                return -EINVAL;

        memset(&up, 0, sizeof(up));
        if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)) != 0)
                return -EFAULT;

        if (up.resv != 0 || up.resv2 != 0)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

/*
 * Handler for the update2-style resource updates.
 *
 * @size must equal sizeof(struct io_uring_rsrc_update2); the structure is
 * read from @arg and its own nr field gives the number of entries to update.
 *
 * Returns -EINVAL for a size mismatch, a zero nr or non-zero reserved
 * fields, -EFAULT on a failed user copy, otherwise the result of
 * __io_register_rsrc_update().
 */
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
                                   unsigned size, unsigned type)
{
        struct io_uring_rsrc_update2 up;

        if (sizeof(up) != size)
                return -EINVAL;

        if (copy_from_user(&up, arg, sizeof(up)) != 0)
                return -EFAULT;

        if (up.nr == 0)
                return -EINVAL;
        if (up.resv != 0 || up.resv2 != 0)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

/*
 * Handler for the tagged file/buffer registration opcodes.
 *
 * @size must equal sizeof(struct io_uring_rsrc_register). The structure is
 * read from @arg and its data/tags fields are converted to user pointers and
 * passed, together with nr, to the file or buffer registration helper
 * selected by @type.
 *
 * Returns -EINVAL for a size mismatch, a zero nr, non-zero reserved fields
 * or an unknown @type, -EFAULT on a failed user copy, otherwise the helper's
 * result.
 */
static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
{
        struct io_uring_rsrc_register rr;

        /* exact size match keeps the interface extendible */
        if (sizeof(rr) != size)
                return -EINVAL;

        memset(&rr, 0, sizeof(rr));
        if (copy_from_user(&rr, arg, size) != 0)
                return -EFAULT;

        if (rr.nr == 0)
                return -EINVAL;
        if (rr.resv != 0 || rr.resv2 != 0)
                return -EINVAL;

        if (type == IORING_RSRC_FILE)
                return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
                                             rr.nr, u64_to_user_ptr(rr.tags));
        if (type == IORING_RSRC_BUFFER)
                return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
                                               rr.nr, u64_to_user_ptr(rr.tags));

        return -EINVAL;
}

/*
 * Set the CPU affinity of the current task's io-wq from a user bitmap.
 *
 * @len is the size of the user bitmap in bytes; it is clamped to
 * cpumask_size(). For compat syscalls the bitmap is read with
 * compat_get_bitmap(), otherwise with a plain copy_from_user().
 *
 * Returns -EINVAL if the task has no io_uring context or io-wq, -ENOMEM if
 * the mask cannot be allocated, -EFAULT if the bitmap cannot be read,
 * otherwise the result of io_wq_cpu_affinity().
 */
static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
                                unsigned len)
{
        struct io_uring_task *tctx = current->io_uring;
        cpumask_var_t mask;
        int ret;

        if (!tctx || !tctx->io_wq)
                return -EINVAL;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        /* start from an empty mask so a short user bitmap leaves the rest clear */
        cpumask_clear(mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(mask, arg, len);

        if (ret)
                ret = -EFAULT;
        else
                ret = io_wq_cpu_affinity(tctx->io_wq, mask);

        free_cpumask_var(mask);
        return ret;
}

/*
 * Drop a previously set io-wq CPU affinity for the current task by calling
 * io_wq_cpu_affinity() with a NULL mask.
 *
 * Returns -EINVAL if the task has no io_uring context or io-wq, otherwise
 * the result of io_wq_cpu_affinity().
 */
static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;

        if (tctx && tctx->io_wq)
                return io_wq_cpu_affinity(tctx->io_wq, NULL);

        return -EINVAL;
}

/*
 * Handler for IORING_REGISTER_IOWQ_MAX_WORKERS.
 *
 * Reads two __u32 worker limits from @arg. Non-zero values are stored in
 * ctx->iowq_limits[] and the pair is handed to io_wq_max_workers() for the
 * relevant io-wq (the SQPOLL thread's for SQPOLL rings, the current task's
 * otherwise). new_count[] is then copied back to @arg; if no io-wq was
 * available it is zeroed first.
 * NOTE(review): presumably io_wq_max_workers() rewrites new_count[] with the
 * previous limits - confirm against its implementation.
 *
 * Called with ctx->uring_lock held. For SQPOLL rings the lock is dropped and
 * retaken around taking sqd->lock, and again before io_put_sq_data().
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL if a limit
 * exceeds INT_MAX, or the error from io_wq_max_workers().
 */
static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                        void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        /* limits must fit in a non-negative int */
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        if (sqd->thread)
                                tctx = sqd->thread->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        /* a zero entry leaves the stored limit for that slot untouched */
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        /*
         * NOTE(review): this -EINVAL is only a placeholder; when there is no
         * io-wq the else branch below runs and the function still goes on to
         * return 0 (or -EFAULT from the copy back).
         */
        ret = -EINVAL;
        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                /* uring_lock is released before sqd->lock and io_put_sq_data() */
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                /* note: shadows the function-level tctx */
                struct io_uring_task *tctx = node->task->io_uring;

                if (WARN_ON_ONCE(!tctx->io_wq))
                        continue;

                /* reload each time: io_wq_max_workers() is handed a writable array */
                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(tctx->io_wq, new_count);
        }
        return 0;
err:
        /* same unlock/put/relock sequence as the success path */
        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);

        }
        return ret;
}

/*
 * Tell whether register opcode @op requires the ring to be quiesced first.
 *
 * The opcodes listed below are handled without quiescing; every other value,
 * including unknown ones, requires it.
 */
static bool io_register_op_must_quiesce(int op)
{
        bool quiesce = true;

        switch (op) {
        case IORING_REGISTER_BUFFERS:
        case IORING_UNREGISTER_BUFFERS:
        case IORING_REGISTER_FILES:
        case IORING_UNREGISTER_FILES:
        case IORING_REGISTER_FILES_UPDATE:
        case IORING_REGISTER_PROBE:
        case IORING_REGISTER_PERSONALITY:
        case IORING_UNREGISTER_PERSONALITY:
        case IORING_REGISTER_FILES2:
        case IORING_REGISTER_FILES_UPDATE2:
        case IORING_REGISTER_BUFFERS2:
        case IORING_REGISTER_BUFFERS_UPDATE:
        case IORING_REGISTER_IOWQ_AFF:
        case IORING_UNREGISTER_IOWQ_AFF:
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                quiesce = false;
                break;
        default:
                break;
        }

        return quiesce;
}

/*
 * Quiesce the ring: kill ctx->refs and wait for ctx->ref_comp to complete.
 *
 * Entered and left with ctx->uring_lock held; the lock is dropped for the
 * duration of the wait. The wait is interruptible: when it is interrupted,
 * io_run_task_work_sig() is called and the wait is retried as long as that
 * returns a non-negative value.
 *
 * Returns 0 once the completion fired. On a negative return the refs have
 * been resurrected via io_refs_resurrect() before returning.
 */
static int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
        long ret;

        percpu_ref_kill(&ctx->refs);

        /*
         * Drop uring mutex before waiting for references to exit. If another
         * thread is currently inside io_uring_enter() it might need to grab the
         * uring_lock to make progress. If we hold it here across the drain
         * wait, then we can deadlock. It's safe to drop the mutex here, since
         * no new references will come in after we've killed the percpu ref.
         */
        mutex_unlock(&ctx->uring_lock);
        do {
                ret = wait_for_completion_interruptible(&ctx->ref_comp);
                if (!ret)
                        break;
                /* interrupted: run pending task work, bail out if it reports an error */
                ret = io_run_task_work_sig();
        } while (ret >= 0);
        mutex_lock(&ctx->uring_lock);

        /* the loop exits with 0 (completed) or a negative error */
        if (ret)
                io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
        return ret;
}

/*
 * Dispatch one io_uring_register() request to its handler.
 *
 * Called with ctx->uring_lock held. The lock may be dropped and retaken
 * while the ring is quiesced, as the __releases/__acquires annotations
 * state.
 *
 * @opcode selects the operation. @arg and @nr_args are interpreted per
 * opcode; several opcodes require them to be NULL/zero, and some reuse
 * @nr_args as a size or an id.
 *
 * Returns the handler's result, -ENXIO if the ctx refs are already dying,
 * -EACCES if the ring is restricted and @opcode is not allowed, the
 * io_ctx_quiesce() error, or -EINVAL for bad arguments or unknown opcodes.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We're inside the ring mutex, if the ref is already dying, then
         * someone else killed the ctx or is already going through
         * io_uring_register().
         */
        if (percpu_ref_is_dying(&ctx->refs))
                return -ENXIO;

        /* restricted rings only accept opcodes whose bit was registered */
        if (ctx->restricted) {
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        if (io_register_op_must_quiesce(opcode)) {
                /* on failure io_ctx_quiesce() has already resurrected the refs */
                ret = io_ctx_quiesce(ctx);
                if (ret)
                        return ret;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg);
                if (ret)
                        break;
                /* the two opcodes differ only in the eventfd_async setting */
                if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
                        ctx->eventfd_async = 1;
                else
                        ctx->eventfd_async = 0;
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                /* caps the number of probe entries at 256 */
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                /* nr_args is forwarded as the id - presumably the personality id */
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        /* for the four rsrc opcodes below nr_args carries the structure size */
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                /* nr_args is the length of the user bitmap in bytes */
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                /* exactly two limits are expected */
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
}

/*
 * io_uring_register syscall entry point.
 *
 * Validates @opcode, resolves @fd and checks that it refers to an io_uring
 * file, runs pending task work, then performs the registration under
 * ctx->uring_lock and emits the register tracepoint.
 *
 * Returns the result of __io_uring_register(), -EINVAL for an out-of-range
 * opcode, -EBADF if @fd is not open, or -EOPNOTSUPP if @fd is not an
 * io_uring file.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        struct fd f;
        long ret;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &io_uring_fops) {
                ctx = f.file->private_data;

                io_run_task_work();

                mutex_lock(&ctx->uring_lock);
                ret = __io_uring_register(ctx, opcode, arg, nr_args);
                mutex_unlock(&ctx->uring_lock);
                trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
                                                        ctx->cq_ev_fd != NULL, ret);
        } else {
                /* not an io_uring file descriptor */
                ret = -EOPNOTSUPP;
        }

        fdput(f);
        return ret;
}

/*
 * Init-time setup for io_uring.
 *
 * Almost everything here is a compile-time check: the BUILD_BUG_ON()s pin the
 * size of struct io_uring_sqe, the offset and size of each of its members,
 * and a handful of size/limit relationships the rest of the file relies on.
 * The only runtime work is creating the slab cache for struct io_kiocb.
 *
 * Always returns 0 (the cache is created with SLAB_PANIC).
 */
static int __init io_uring_init(void)
{
/* assert that member @ename of @stype sits at @eoffset and has sizeof(@etype) */
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
        BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

/* same check, specialised for struct io_uring_sqe */
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
        __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
        BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
        BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
        BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
        BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
        BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
        BUILD_BUG_SQE_ELEM(24, __u32,  len);
        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
        BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
        BUILD_BUG_SQE_ELEM(44, __u32,  file_index);

        /*
         * io_register_files_update() reads a struct io_uring_rsrc_update into
         * a struct io_uring_rsrc_update2, so the former must not be larger.
         */
        BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
                     sizeof(struct io_uring_rsrc_update));
        BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
                     sizeof(struct io_uring_rsrc_update2));

        /* ->buf_index is u16 */
        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

        /* should fit into one byte */
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));

        /* io_op_defs[] must have exactly one entry per opcode */
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        /* request flag bits must fit in an int */
        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT);
        return 0;
}
__initcall(io_uring_init);

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_MROUTE_H
#define __LINUX_MROUTE_H

#include <linux/in.h>
#include <linux/pim.h>
#include <net/fib_rules.h>
#include <net/fib_notifier.h>
#include <uapi/linux/mroute.h>
#include <linux/mroute_base.h>
#include <linux/sockptr.h>

#ifdef CONFIG_IP_MROUTE
/* Non-zero when @opt lies in the inclusive range [MRT_BASE, MRT_MAX]. */
static inline int ip_mroute_opt(int opt)
{
        if (opt < MRT_BASE)
                return 0;

        return opt <= MRT_MAX;
}

/*
 * Entry points declared when CONFIG_IP_MROUTE is enabled. The #else branch
 * below provides inline stubs for all of them except ipmr_compat_ioctl().
 */
int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
int ip_mr_init(void);
bool ipmr_rule_default(const struct fib_rule *rule);
#else
/* Stub for !CONFIG_IP_MROUTE: every option is rejected with -ENOPROTOOPT. */
static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
                                       sockptr_t optval, unsigned int optlen)
{
        return -ENOPROTOOPT;
}

/* Stub for !CONFIG_IP_MROUTE: every option is rejected with -ENOPROTOOPT. */
static inline int ip_mroute_getsockopt(struct sock *sk, int optname,
                                       sockptr_t optval, sockptr_t optlen)
{
        return -ENOPROTOOPT;
}

/* Stub for !CONFIG_IP_MROUTE: no ioctl is handled, always -ENOIOCTLCMD. */
static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        return -ENOIOCTLCMD;
}

/* Stub for !CONFIG_IP_MROUTE: nothing to initialise, always returns 0. */
static inline int ip_mr_init(void)
{
        return 0;
}

/* Stub for !CONFIG_IP_MROUTE: no option is ever reported as a mroute option. */
static inline int ip_mroute_opt(int opt)
{
        return 0;
}

/* Stub for !CONFIG_IP_MROUTE: unconditionally reports true for any @rule. */
static inline bool ipmr_rule_default(const struct fib_rule *rule)
{
        return true;
}
#endif

#define VIFF_STATIC 0x8000

/*
 * Comparison key for struct mfc_cache entries. The two members mirror, in
 * name, type and order, the anonymous struct that shares a union with
 * 'cmparg' inside struct mfc_cache.
 */
struct mfc_cache_cmp_arg {
        __be32 mfc_mcastgrp;
        __be32 mfc_origin;
};

/**
 * struct mfc_cache - multicast routing entries
 * @_c: Common multicast routing information; has to be first [for casting]
 * @mfc_mcastgrp: destination multicast group address
 * @mfc_origin: source address
 * @cmparg: used for rhashtable comparisons
 *
 * @mfc_mcastgrp and @mfc_origin live in an anonymous struct that shares a
 * union with @cmparg, so the same storage can be read either field by field
 * or as one struct mfc_cache_cmp_arg.
 */
struct mfc_cache {
        struct mr_mfc _c;
        union {
                struct {
                        __be32 mfc_mcastgrp;
                        __be32 mfc_origin;
                };
                struct mfc_cache_cmp_arg cmparg;
        };
};

/* Forward declaration: only a pointer to struct rtmsg is needed below. */
struct rtmsg;
/* Declared unconditionally, i.e. outside the CONFIG_IP_MROUTE block above. */
int ipmr_get_route(struct net *net, struct sk_buff *skb,
                   __be32 saddr, __be32 daddr,
                   struct rtmsg *rtm, u32 portid);
#endif



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MLD_H
#define LINUX_MLD_H

#include <linux/in6.h>
#include <linux/icmpv6.h>

/* MLDv1 Query/Report/Done */
struct mld_msg {
        struct icmp6hdr                mld_hdr;        /* leading ICMPv6 header */
        struct in6_addr                mld_mca;        /* IPv6 address following the header */
};

/* Shorthand accessors for the MLDv1 fields stored inside mld_hdr. */
#define mld_type                mld_hdr.icmp6_type
#define mld_code                mld_hdr.icmp6_code
#define mld_cksum                mld_hdr.icmp6_cksum
#define mld_maxdelay                mld_hdr.icmp6_maxdelay
#define mld_reserved                mld_hdr.icmp6_dataun.un_data16[1]

/* Multicast Listener Discovery version 2 headers */
/* MLDv2 Report */
/*
 * One group record inside an MLDv2 report. grec_src[] is a flexible array
 * member, so the record is variable length; grec_nsrcs is stored big-endian
 * (__be16).
 */
struct mld2_grec {
        __u8                grec_type;
        __u8                grec_auxwords;
        __be16                grec_nsrcs;
        struct in6_addr        grec_mca;
        struct in6_addr        grec_src[];
};

/* MLDv2 report message: an ICMPv6 header followed by group records. */
struct mld2_report {
        struct icmp6hdr                mld2r_hdr;
        struct mld2_grec        mld2r_grec[];
};

/* Shorthand accessors for the report fields stored inside mld2r_hdr. */
#define mld2r_type                mld2r_hdr.icmp6_type
#define mld2r_resv1                mld2r_hdr.icmp6_code
#define mld2r_cksum                mld2r_hdr.icmp6_cksum
#define mld2r_resv2                mld2r_hdr.icmp6_dataun.un_data16[0]
#define mld2r_ngrec                mld2r_hdr.icmp6_dataun.un_data16[1]

/* MLDv2 Query */
/*
 * The three bitfields share one byte; their declaration order is flipped
 * between the little- and big-endian bitfield layouts. mld2q_srcs[] is a
 * flexible array member and mld2q_nsrcs is stored big-endian (__be16).
 */
struct mld2_query {
        struct icmp6hdr                mld2q_hdr;
        struct in6_addr                mld2q_mca;
#if defined(__LITTLE_ENDIAN_BITFIELD)
        __u8                        mld2q_qrv:3,
                                mld2q_suppress:1,
                                mld2q_resv2:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
        __u8                        mld2q_resv2:4,
                                mld2q_suppress:1,
                                mld2q_qrv:3;
#else
#error "Please fix <asm/byteorder.h>"
#endif
        __u8                        mld2q_qqic;
        __be16                        mld2q_nsrcs;
        struct in6_addr                mld2q_srcs[];
};

/* Shorthand accessors for the query fields stored inside mld2q_hdr. */
#define mld2q_type                mld2q_hdr.icmp6_type
#define mld2q_code                mld2q_hdr.icmp6_code
#define mld2q_cksum                mld2q_hdr.icmp6_cksum
#define mld2q_mrc                mld2q_hdr.icmp6_maxdelay
#define mld2q_resv1                mld2q_hdr.icmp6_dataun.un_data16[1]

/* RFC3810, 5.1.3. Maximum Response Code:
 *
 * If Maximum Response Code >= 32768, Maximum Response Code represents a
 * floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7 8 9 A B C D E F
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |1| exp |          mant         |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Both macros expect the code in host byte order: EXP extracts the 3-bit
 * exponent (value bits 12-14), MAN the 12-bit mantissa (value bits 0-11).
 */
#define MLDV2_MRC_EXP(value)        (((value) >> 12) & 0x0007)
#define MLDV2_MRC_MAN(value)        ((value) & 0x0fff)

/* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code):
 *
 * If QQIC >= 128, QQIC represents a floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7
 * +-+-+-+-+-+-+-+-+
 * |1| exp | mant  |
 * +-+-+-+-+-+-+-+-+
 *
 * EXP extracts the 3-bit exponent (value bits 4-6), MAN the 4-bit mantissa
 * (value bits 0-3).
 */
#define MLDV2_QQIC_EXP(value)        (((value) >> 4) & 0x07)
#define MLDV2_QQIC_MAN(value)        ((value) & 0x0f)

#define MLD_EXP_MIN_LIMIT        32768UL
#define MLDV1_MRD_MAX_COMPAT        (MLD_EXP_MIN_LIMIT - 1)

/*
 * Decode the Maximum Response Code of an MLDv2 query (RFC3810, 5.1.3).
 *
 * The 16-bit code is converted from network byte order. Values below
 * MLD_EXP_MIN_LIMIT are returned as they are; larger values are treated as
 * the exp/mant encoding and expanded to (mant | 0x1000) << (exp + 3).
 */
static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2)
{
        const unsigned long code = ntohs(mlh2->mld2q_mrc);

        /* small codes are literal values */
        if (code < MLD_EXP_MIN_LIMIT)
                return code;

        return (MLDV2_MRC_MAN(code) | 0x1000) << (MLDV2_MRC_EXP(code) + 3);
}

#endif












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef PM_TRACE_H
#define PM_TRACE_H

#include <linux/types.h>
#ifdef CONFIG_PM_TRACE
#include <asm/pm-trace.h>

extern int pm_trace_enabled;
extern bool pm_trace_rtc_abused;

/* True unless the pm_trace_rtc_abused flag has been set. */
static inline bool pm_trace_rtc_valid(void)
{
        return !pm_trace_rtc_abused;
}

/* Returns the current value of the pm_trace_enabled flag. */
static inline int pm_trace_is_enabled(void)
{
       return pm_trace_enabled;
}

struct device;
extern void set_trace_device(struct device *);
extern void generate_pm_trace(const void *tracedata, unsigned int user);
extern int show_trace_dev_match(char *buf, size_t size);

/*
 * Pass @dev to set_trace_device(), but only while pm_trace_enabled is
 * non-zero. Wrapped in do { } while(0) so it acts as a single statement.
 */
#define TRACE_DEVICE(dev) do { \
        if (pm_trace_enabled) \
                set_trace_device(dev); \
        } while(0)

#else

/*
 * !CONFIG_PM_TRACE stubs: the RTC is always reported valid, tracing is always
 * reported disabled, and the TRACE_* macros expand to empty statements.
 */
static inline bool pm_trace_rtc_valid(void) { return true; }
static inline int pm_trace_is_enabled(void) { return 0; }

#define TRACE_DEVICE(dev) do { } while (0)
#define TRACE_RESUME(dev) do { } while (0)
#define TRACE_SUSPEND(dev) do { } while (0)

#endif

#endif













































































































































































































































































































































































































































    2 











    2 
    2 

































    2 
































































































































































































    2 










































    2 












    2 

    2 
    2 
    2 










































































    2 






    1 






    2 


    2 


    2 



    2 









    2 









    2 

    2 











































    2 





















    2 






















































































    2 
















    2 

















    2 













    2 






























    2 


    2 




    2 



    2 















    2 











    2 









































































































































    2 

    2 
    2 

    2 












































































































































































































































































































































































































    1 




































    2 

















    2 







































    2 


    1 
    2 















    2 








    1 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *        -  July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-pm.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-rq-qos.h"

struct dentry *blk_debugfs_root;

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

DEFINE_IDA(blk_queue_ida);

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

/**
 * blk_queue_flag_set - atomically set a queue flag
 * @flag: flag to be set (used as a bit number within @q->queue_flags)
 * @q: request queue
 *
 * Thin wrapper around set_bit() applied to @q->queue_flags.
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
        set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);

/**
 * blk_queue_flag_clear - atomically clear a queue flag
 * @flag: flag to be cleared (used as a bit number within @q->queue_flags)
 * @q: request queue
 *
 * Thin wrapper around clear_bit() applied to @q->queue_flags.
 */
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
        clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);

/**
 * blk_queue_flag_test_and_set - atomically test and set a queue flag
 * @flag: flag to be set (used as a bit number within @q->queue_flags)
 * @q: request queue
 *
 * Thin wrapper around test_and_set_bit() applied to @q->queue_flags.
 *
 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
 * the flag was already set.
 */
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
{
        return test_and_set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);

/**
 * blk_rq_init - reset a request to its initial state
 * @q: request queue the request is associated with
 * @rq: request to initialise
 *
 * Zeroes the whole of @rq and then fills in the fields that must not be
 * left at zero: list/hash/rb-tree linkage, the owning queue, sentinel
 * sector and tag values, the start timestamp and the crypto defaults.
 */
void blk_rq_init(struct request_queue *q, struct request *rq)
{
        /* Start from an all-zero request; everything below overrides that. */
        memset(rq, 0, sizeof(*rq));

        INIT_LIST_HEAD(&rq->queuelist);
        rq->q = q;
        /* All-ones sector value: no position has been assigned yet. */
        rq->__sector = (sector_t) -1;
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        /* Neither a driver tag nor an internal tag is held. */
        rq->tag = BLK_MQ_NO_TAG;
        rq->internal_tag = BLK_MQ_NO_TAG;
        rq->start_time_ns = ktime_get_ns();
        rq->part = NULL;
        blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

/*
 * Lookup table from REQ_OP_* value to the operation's name with the
 * "REQ_OP_" prefix stripped (e.g. REQ_OP_READ -> "READ").  The table is
 * indexed directly by the REQ_OP_* value; because designated initialisers
 * are used, any index without an entry below is left as a NULL pointer.
 * REQ_OP_NAME() exists only to build this table and is undefined again
 * right after it.
 */
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
        REQ_OP_NAME(READ),
        REQ_OP_NAME(WRITE),
        REQ_OP_NAME(FLUSH),
        REQ_OP_NAME(DISCARD),
        REQ_OP_NAME(SECURE_ERASE),
        REQ_OP_NAME(ZONE_RESET),
        REQ_OP_NAME(ZONE_RESET_ALL),
        REQ_OP_NAME(ZONE_OPEN),
        REQ_OP_NAME(ZONE_CLOSE),
        REQ_OP_NAME(ZONE_FINISH),
        REQ_OP_NAME(ZONE_APPEND),
        REQ_OP_NAME(WRITE_SAME),
        REQ_OP_NAME(WRITE_ZEROES),
        REQ_OP_NAME(SCSI_IN),
        REQ_OP_NAME(SCSI_OUT),
        REQ_OP_NAME(DRV_IN),
        REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME

/**
 * blk_op_str - translate a REQ_OP_XXX value into its name "XXX"
 * @op: REQ_OP_XXX value to look up.
 *
 * Description: Single place in the block layer that turns an operation
 * code into a printable string, intended for debugging and tracing of bios
 * and requests.  The name is taken from blk_op_name[]; a value that lies
 * outside the table, or whose table slot is empty, yields "UNKNOWN".
 */
inline const char *blk_op_str(unsigned int op)
{
        /* Beyond the end of the table: no name available. */
        if (op >= ARRAY_SIZE(blk_op_name))
                return "UNKNOWN";

        /* Inside the table, but the slot may be an unpopulated gap. */
        return blk_op_name[op] ? blk_op_name[op] : "UNKNOWN";
}
EXPORT_SYMBOL_GPL(blk_op_str);

/*
 * Table indexed by blk_status_t value.  Each entry pairs the negative errno
 * that the status maps to with a short description used when printing
 * request errors (see print_req_error()).  errno_to_blk_status() scans this
 * table front to back, blk_status_to_errno() indexes it directly.
 */
static const struct {
        int                errno;
        const char        *name;
} blk_errors[] = {
        [BLK_STS_OK]                = { 0,                "" },
        [BLK_STS_NOTSUPP]        = { -EOPNOTSUPP, "operation not supported" },
        [BLK_STS_TIMEOUT]        = { -ETIMEDOUT,        "timeout" },
        [BLK_STS_NOSPC]                = { -ENOSPC,        "critical space allocation" },
        [BLK_STS_TRANSPORT]        = { -ENOLINK,        "recoverable transport" },
        [BLK_STS_TARGET]        = { -EREMOTEIO,        "critical target" },
        [BLK_STS_NEXUS]                = { -EBADE,        "critical nexus" },
        [BLK_STS_MEDIUM]        = { -ENODATA,        "critical medium" },
        [BLK_STS_PROTECTION]        = { -EILSEQ,        "protection" },
        [BLK_STS_RESOURCE]        = { -ENOMEM,        "kernel resource" },
        [BLK_STS_DEV_RESOURCE]        = { -EBUSY,        "device resource" },
        [BLK_STS_AGAIN]                = { -EAGAIN,        "nonblocking retry" },

        /* device mapper special case, should not leak out: */
        [BLK_STS_DM_REQUEUE]        = { -EREMCHG, "dm internal retry" },

        /* zone device specific errors */
        [BLK_STS_ZONE_OPEN_RESOURCE]        = { -ETOOMANYREFS, "open zones exceeded" },
        [BLK_STS_ZONE_ACTIVE_RESOURCE]        = { -EOVERFLOW, "active zones exceeded" },

        /* everything else not covered above: */
        [BLK_STS_IOERR]                = { -EIO,        "I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
                if (blk_errors[i].errno == errno)
                        return (__force blk_status_t)i;
        }

        return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

/**
 * blk_status_to_errno - convert a block status code into an errno value
 * @status: block status to translate
 *
 * Uses @status as an index into blk_errors[].  A status that falls outside
 * the table triggers a one-time warning and is reported as -EIO.
 */
int blk_status_to_errno(blk_status_t status)
{
        const int idx = (__force int)status;

        /* WARN_ON_ONCE() evaluates to its condition: false means in range. */
        if (!WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
                return blk_errors[idx].errno;

        return -EIO;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);

/*
 * Emit a rate-limited KERN_ERR line describing a failed request: the caller
 * name, the textual form of @status from blk_errors[], the disk name (or
 * "?" when the request has no disk), start sector, operation code and name,
 * the non-operation command flags, physical segment count and I/O priority
 * class.  A @status outside blk_errors[] warns once and prints nothing.
 */
static void print_req_error(struct request *req, blk_status_t status,
                const char *caller)
{
        const int idx = (__force int)status;
        const char *disk_name;

        if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
                return;

        disk_name = req->rq_disk ? req->rq_disk->disk_name : "?";

        printk_ratelimited(KERN_ERR
                "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
                "phys_seg %u prio class %u\n",
                caller, blk_errors[idx].name, disk_name,
                blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
                req->cmd_flags & ~REQ_OP_MASK,
                req->nr_phys_segments,
                IOPRIO_PRIO_CLASS(req->ioprio));
}

/*
 * Account @nbytes of completed I/O on @bio, which belongs to request @rq.
 *
 * A non-zero @error is recorded in the bio's status, the request's quiet
 * flag is propagated to the bio, and the bio iterator is advanced by
 * @nbytes.  The bio is only ended (bio_endio()) once it has no bytes left
 * and the request is not part of a flush sequence.
 */
static void req_bio_endio(struct request *rq, struct bio *bio,
                          unsigned int nbytes, blk_status_t error)
{
        /* Only overwrite the bio status when there is an error to report. */
        if (error)
                bio->bi_status = error;

        if (unlikely(rq->rq_flags & RQF_QUIET))
                bio_set_flag(bio, BIO_QUIET);

        bio_advance(bio, nbytes);

        if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
                /*
                 * Partial zone append completions cannot be supported as the
                 * BIO fragments may end up not being written sequentially.
                 */
                if (bio->bi_iter.bi_size)
                        bio->bi_status = BLK_STS_IOERR;
                else
                        /* Fully completed: report the request's sector back in the bio. */
                        bio->bi_iter.bi_sector = rq->__sector;
        }

        /* don't actually finish bio if it's part of flush sequence */
        if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
                bio_endio(bio);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
                rq->rq_disk ? rq->rq_disk->disk_name : "?",
                (unsigned long long) rq->cmd_flags);

        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
               (unsigned long long)blk_rq_pos(rq),
               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
        printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
               rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->submit_bio will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
        /*
         * The timeout timer's handler is what schedules q->timeout_work, so
         * stop the timer first and only then cancel/wait for the work item.
         */
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * blk_set_pm_only - increment pm_only counter
 * @q: request queue pointer
 *
 * Atomically increments @q->pm_only.  Paired with blk_clear_pm_only(),
 * which decrements the same counter.
 */
void blk_set_pm_only(struct request_queue *q)
{
        atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);

/**
 * blk_clear_pm_only - decrement pm_only counter
 * @q: request queue pointer
 *
 * Atomically decrements @q->pm_only and warns once if the result is
 * negative.  When the counter reaches zero, every task sleeping on
 * @q->mq_freeze_wq is woken up.
 */
void blk_clear_pm_only(struct request_queue *q)
{
        const int remaining = atomic_dec_return(&q->pm_only);

        /* A negative count means more clears than sets were issued. */
        WARN_ON_ONCE(remaining < 0);
        if (!remaining)
                wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);

/**
 * blk_put_queue - decrement the request_queue refcount
 * @q: the request_queue structure to decrement the refcount for
 *
 * Decrements the refcount of the request_queue kobject. When this reaches 0
 * we'll have blk_release_queue() called.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void blk_put_queue(struct request_queue *q)
{
        /* The queue's reference count lives in its embedded kobject. */
        kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/**
 * blk_set_queue_dying - mark a request queue as dying
 * @q: request queue to mark
 *
 * Sets QUEUE_FLAG_DYING on @q, starts freezing the queue, wakes blk-mq
 * waiters when @q is a blk-mq queue, and finally wakes everything sleeping
 * on @q->mq_freeze_wq so that blk_queue_enter() re-checks the DYING flag.
 */
void blk_set_queue_dying(struct request_queue *q)
{
        blk_queue_flag_set(QUEUE_FLAG_DYING, q);

        /*
         * When queue DYING flag is set, we need to block new req
         * entering queue, so we call blk_freeze_queue_start() to
         * prevent I/O from crossing blk_queue_enter().
         */
        blk_freeze_queue_start(q);

        if (queue_is_mq(q))
                blk_mq_wake_waiters(q);

        /* Make blk_queue_enter() reexamine the DYING flag. */
        wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);

/**
 * blk_cleanup_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 * put it.  All future requests will be failed immediately with -ENODEV.
 *
 * Context: can sleep
 */
void blk_cleanup_queue(struct request_queue *q)
{
        /* cannot be called from atomic context */
        might_sleep();

        /* Warn (once) if the queue still reports itself as registered. */
        WARN_ON_ONCE(blk_queue_registered(q));

        /* mark @q DYING, no new request or merges will be allowed afterwards */
        blk_set_queue_dying(q);

        blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
        blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);

        /*
         * Drain all requests queued before DYING marking. Set DEAD flag to
         * prevent that blk_mq_run_hw_queues() accesses the hardware queues
         * after draining finished.
         */
        blk_freeze_queue(q);

        /* Runs only after the freeze/drain above has finished. */
        rq_qos_exit(q);

        blk_queue_flag_set(QUEUE_FLAG_DEAD, q);

        /* for synchronous bio-based driver finish in-flight integrity i/o */
        blk_flush_integrity();

        /* @q won't process any more request, flush async actions */
        del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
        blk_sync_queue(q);

        if (queue_is_mq(q))
                blk_mq_exit_queue(q);

        /*
         * In theory, request pool of sched_tags belongs to request queue.
         * However, the current implementation requires tag_set for freeing
         * requests, so free the pool now.
         *
         * Queue has become frozen, there can't be any in-queue requests, so
         * it is safe to free requests now.
         */
        /* q->elevator is examined with sysfs_lock held. */
        mutex_lock(&q->sysfs_lock);
        if (q->elevator)
                blk_mq_sched_free_requests(q);
        mutex_unlock(&q->sysfs_lock);

        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

/**
 * blk_queue_enter() - try to increase q->q_usage_counter
 * @q: request queue pointer
 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
 *
 * Loops until a live reference on @q->q_usage_counter has been taken and
 * the pm_only rules allow the caller in.  Without BLK_MQ_REQ_NOWAIT the
 * function sleeps on @q->mq_freeze_wq between attempts.
 *
 * Return: 0 once the reference has been taken (release it with
 * blk_queue_exit()), -EBUSY if BLK_MQ_REQ_NOWAIT was passed and the queue
 * could not be entered immediately, -ENODEV if the queue is dying.
 */
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
        const bool pm = flags & BLK_MQ_REQ_PM;

        while (true) {
                bool success = false;

                rcu_read_lock();
                if (percpu_ref_tryget_live(&q->q_usage_counter)) {
                        /*
                         * The code that increments the pm_only counter is
                         * responsible for ensuring that that counter is
                         * globally visible before the queue is unfrozen.
                         */
                        if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) ||
                            !blk_queue_pm_only(q)) {
                                success = true;
                        } else {
                                /* Not allowed in right now: give the reference back. */
                                percpu_ref_put(&q->q_usage_counter);
                        }
                }
                rcu_read_unlock();

                if (success)
                        return 0;

                /* Callers that must not sleep give up here. */
                if (flags & BLK_MQ_REQ_NOWAIT)
                        return -EBUSY;

                /*
                 * read pair of barrier in blk_freeze_queue_start(),
                 * we need to order reading __PERCPU_REF_DEAD flag of
                 * .q_usage_counter and reading .mq_freeze_depth or
                 * queue dying flag, otherwise the following wait may
                 * never return if the two reads are reordered.
                 */
                smp_rmb();

                /*
                 * Sleep until the queue is no longer frozen and
                 * blk_pm_resume_queue() reports it usable, or until the
                 * queue is marked dying; then retry from the top.
                 */
                wait_event(q->mq_freeze_wq,
                           (!q->mq_freeze_depth &&
                            blk_pm_resume_queue(pm, q)) ||
                           blk_queue_dying(q));
                if (blk_queue_dying(q))
                        return -ENODEV;
        }
}

/*
 * Enter the queue of the disk that @bio targets on behalf of @bio.
 *
 * REQ_NOWAIT on the bio is translated into BLK_MQ_REQ_NOWAIT.  When
 * entering fails the bio is completed here: with a "would block" error for
 * a nowait bio on a queue that is not dying, with an I/O error otherwise.
 * Returns the result of blk_queue_enter().
 */
static inline int bio_queue_enter(struct bio *bio)
{
        struct request_queue *q = bio->bi_disk->queue;
        const bool nowait = bio->bi_opf & REQ_NOWAIT;
        int err = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);

        if (unlikely(err)) {
                /* Blocking callers and dying queues get a hard I/O error. */
                if (!nowait || blk_queue_dying(q))
                        bio_io_error(bio);
                else
                        bio_wouldblock_error(bio);
        }

        return err;
}

/*
 * Drop the q_usage_counter reference obtained by a successful
 * blk_queue_enter().
 */
void blk_queue_exit(struct request_queue *q)
{
        percpu_ref_put(&q->q_usage_counter);
}

/*
 * Release callback registered for q->q_usage_counter in blk_alloc_queue().
 * Wakes every task sleeping on the owning queue's mq_freeze_wq.
 */
static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
        /* @ref is embedded in the request_queue; recover the queue from it. */
        struct request_queue *q =
                container_of(ref, struct request_queue, q_usage_counter);

        wake_up_all(&q->mq_freeze_wq);
}

/*
 * Timer callback for q->timeout (installed in blk_alloc_queue()).  It does
 * no timeout handling itself; it only schedules q->timeout_work through
 * kblockd_schedule_work().
 */
static void blk_rq_timed_out_timer(struct timer_list *t)
{
        /* Recover the request_queue that embeds this timer as ->timeout. */
        struct request_queue *q = from_timer(q, t, timeout);

        kblockd_schedule_work(&q->timeout_work);
}

/*
 * Default handler installed for q->timeout_work by blk_alloc_queue().
 * Intentionally does nothing.
 * NOTE(review): presumably a queue that needs real timeout handling
 * replaces this work function elsewhere - confirm against the blk-mq code.
 */
static void blk_timeout_work(struct work_struct *work)
{
}

/**
 * blk_alloc_queue - allocate and initialise a request queue
 * @node_id: node used for the queue allocation and passed to bdi_alloc()
 *
 * Allocates a zeroed request_queue from blk_requestq_cachep and sets up its
 * id, bio split set, backing_dev_info, statistics, timers, work item, lists,
 * kobject, locks, wait queue, usage counter, blkcg state and default limits.
 *
 * Return: the new queue, or NULL if any step fails.  On failure everything
 * acquired up to that point is released again.
 */
struct request_queue *blk_alloc_queue(int node_id)
{
        struct request_queue *q;
        int ret;

        /* __GFP_ZERO: every field not set below starts out as zero. */
        q = kmem_cache_alloc_node(blk_requestq_cachep,
                                GFP_KERNEL | __GFP_ZERO, node_id);
        if (!q)
                return NULL;

        q->last_merge = NULL;

        q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
        if (q->id < 0)
                goto fail_q;

        ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
        if (ret)
                goto fail_id;

        q->backing_dev_info = bdi_alloc(node_id);
        if (!q->backing_dev_info)
                goto fail_split;

        q->stats = blk_alloc_queue_stats();
        if (!q->stats)
                goto fail_stats;

        q->node = node_id;

        atomic_set(&q->nr_active_requests_shared_sbitmap, 0);

        timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
                    laptop_mode_timer_fn, 0);
        /* The timeout timer only schedules timeout_work (see its handler). */
        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
        INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
        INIT_LIST_HEAD(&q->blkg_list);
#endif

        kobject_init(&q->kobj, &blk_queue_ktype);

        mutex_init(&q->debugfs_mutex);
        mutex_init(&q->sysfs_lock);
        mutex_init(&q->sysfs_dir_lock);
        spin_lock_init(&q->queue_lock);

        init_waitqueue_head(&q->mq_freeze_wq);
        mutex_init(&q->mq_freeze_lock);

        /*
         * Init percpu_ref in atomic mode so that it's faster to shutdown.
         * See blk_register_queue() for details.
         */
        if (percpu_ref_init(&q->q_usage_counter,
                                blk_queue_usage_counter_release,
                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                goto fail_bdi;

        if (blkcg_init_queue(q))
                goto fail_ref;

        blk_queue_dma_alignment(q, 511);
        blk_set_default_limits(&q->limits);
        q->nr_requests = BLKDEV_MAX_RQ;

        return q;

        /*
         * Error unwinding: each label releases what was set up before the
         * failing step and falls through to the labels below it, so the
         * resources are released in reverse order of acquisition.
         */
fail_ref:
        percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
        blk_free_queue_stats(q->stats);
fail_stats:
        bdi_put(q->backing_dev_info);
fail_split:
        bioset_exit(&q->bio_split);
fail_id:
        ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
        kmem_cache_free(blk_requestq_cachep, q);
        return NULL;
}
EXPORT_SYMBOL(blk_alloc_queue);

/**
 * blk_get_queue - increment the request_queue refcount
 * @q: the request_queue structure to increment the refcount for
 *
 * Takes a reference on @q via __blk_get_queue() unless the queue has
 * already been marked dying, in which case nothing is changed.
 *
 * Context: Any context.
 *
 * Return: true if a reference was taken, false if @q is dying.
 */
bool blk_get_queue(struct request_queue *q)
{
        const bool alive = likely(!blk_queue_dying(q));

        if (alive)
                __blk_get_queue(q);

        return alive;
}
EXPORT_SYMBOL(blk_get_queue);

/**
 * blk_get_request - allocate a request
 * @q: request queue to allocate a request for
 * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
 * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
 *
 * Warns once if @op carries REQ_NOWAIT or if @flags contains anything
 * other than BLK_MQ_REQ_NOWAIT and BLK_MQ_REQ_PM.  A successfully allocated
 * request is passed to the queue's ->initialize_rq_fn() hook when the
 * driver provides one.
 *
 * Return: whatever blk_mq_alloc_request() returned; callers must test the
 * result with IS_ERR().
 */
struct request *blk_get_request(struct request_queue *q, unsigned int op,
                                blk_mq_req_flags_t flags)
{
        struct request *rq;

        WARN_ON_ONCE(op & REQ_NOWAIT);
        WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));

        rq = blk_mq_alloc_request(q, op, flags);
        /* Allocation failed: hand the error pointer straight back. */
        if (IS_ERR(rq))
                return rq;

        if (q->mq_ops->initialize_rq_fn)
                q->mq_ops->initialize_rq_fn(rq);

        return rq;
}
EXPORT_SYMBOL(blk_get_request);

/**
 * blk_put_request - release a request
 * @req: request to release
 *
 * Thin wrapper that hands @req to blk_mq_free_request().
 */
void blk_put_request(struct request *req)
{
        blk_mq_free_request(req);
}
EXPORT_SYMBOL(blk_put_request);

/*
 * Log a rate-limited notice for a bio that reaches beyond @maxsector.  The
 * message carries the device name, the bio's bi_opf value ("rw"), the end
 * sector the bio wanted ("want") and @maxsector ("limit").  Only logs; the
 * caller decides how to fail the bio.
 */
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
        /* Scratch buffer that bio_devname() formats the device name into. */
        char b[BDEVNAME_SIZE];

        pr_info_ratelimited("attempt to access beyond end of device\n"
                            "%s: rw=%d, want=%llu, limit=%llu\n",
                            bio_devname(bio, b), bio->bi_opf,
                            bio_end_sector(bio), maxsector);
}

#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

/*
 * Handler for the "fail_make_request=" boot parameter: passes the option
 * string to setup_fault_attr() for the fail_make_request fault attribute
 * and returns its result.
 */
static int __init setup_fail_make_request(char *str)
{
        return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

/*
 * Decide whether an I/O of @bytes bytes to @part should be failed by fault
 * injection.  True only when the partition has make_it_fail set and
 * should_fail() fires for the fail_make_request attribute; should_fail() is
 * not consulted at all for partitions without make_it_fail.
 */
static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
{
        return part->make_it_fail && should_fail(&fail_make_request, bytes);
}

/*
 * Late initcall that creates the "fail_make_request" debugfs entry for the
 * fault attribute (no parent dentry is given).  Returns 0 on success or the
 * error encoded in the pointer returned by fault_create_debugfs_attr().
 */
static int __init fail_make_request_debugfs(void)
{
        return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_make_request",
                                                NULL, &fail_make_request));
}

late_initcall(fail_make_request_debugfs);

#else /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Stub used when CONFIG_FAIL_MAKE_REQUEST is disabled: fault injection is
 * never requested, so callers can use it unconditionally.
 */
static inline bool should_fail_request(struct hd_struct *part,
                                        unsigned int bytes)
{
        return false;
}

#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Check @bio against the read-only policy of @part.
 *
 * A write-type bio aimed at a partition whose policy is set produces a
 * warning, except for flush bios that carry no data.  Note that the
 * function returns false on every path: the write is reported but never
 * rejected here, because older lvm-tools are known to trigger this case.
 */
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
{
        const int op = bio_op(bio);
        char b[BDEVNAME_SIZE];

        /* Writable partition, or not a write: nothing to report. */
        if (!part->policy || !op_is_write(op))
                return false;

        /* Data-less flushes are let through silently. */
        if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
                return false;

        pr_warn("Trying to write to read-only block-device %s (partno %d)\n",
                bio_devname(bio, b), part->partno);

        /* Older lvm-tools actually trigger this */
        return false;
}

/*
 * Fault-injection hook for a whole bio: asks should_fail_request() about
 * the disk's part0 and the bio's size.  Returns -EIO when the bio should be
 * failed and 0 otherwise.
 */
static noinline int should_fail_bio(struct bio *bio)
{
        return should_fail_request(&bio->bi_disk->part0,
                                   bio->bi_iter.bi_size) ? -EIO : 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
 * Verify that @bio does not run past @maxsector, the size in sectors of the
 * device or partition it targets.  Running past the end is a real
 * possibility - the kernel calls bread() without checking the device size,
 * e.g. while mounting a file system.
 *
 * Bios without sectors, and a @maxsector of zero, skip the check.  Returns
 * 0 when the bio fits and -EIO (after logging via handle_bad_sector())
 * when it does not.
 */
static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
{
        unsigned int nr_sectors = bio_sectors(bio);

        if (!nr_sectors || !maxsector)
                return 0;

        /*
         * Compare the start against (maxsector - nr_sectors) rather than
         * adding to the start sector; the first test guarantees that the
         * subtraction cannot wrap.
         */
        if (nr_sectors <= maxsector &&
            bio->bi_iter.bi_sector <= maxsector - nr_sectors)
                return 0;

        handle_bad_sector(bio, maxsector);
        return -EIO;
}

/*
 * Remap block n of partition p to block n+start(p) of the disk.
 */
/*
 * Returns 0 on success.  Returns -EIO when the partition cannot be found,
 * when fault injection asks for a failure, when bio_check_ro() rejects the
 * bio, or when the bio extends beyond the end of the partition.
 *
 * On success the bio addresses the whole disk: its start sector has been
 * shifted by the partition's start (only if the bio has sectors) and
 * bi_partno is reset to 0.
 */
static inline int blk_partition_remap(struct bio *bio)
{
        struct hd_struct *p;
        int ret = -EIO;

        /* The partition pointer is only used inside this RCU read section. */
        rcu_read_lock();
        p = __disk_get_part(bio->bi_disk, bio->bi_partno);
        if (unlikely(!p))
                goto out;
        if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
                goto out;
        if (unlikely(bio_check_ro(bio, p)))
                goto out;

        if (bio_sectors(bio)) {
                if (bio_check_eod(bio, part_nr_sects_read(p)))
                        goto out;
                bio->bi_iter.bi_sector += p->start_sect;
                /* The last argument is the original, partition-relative sector. */
                trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
                                      bio->bi_iter.bi_sector - p->start_sect);
        }
        bio->bi_partno = 0;
        ret = 0;
out:
        rcu_read_unlock();
        return ret;
}

/*
 * Validate a zone append bio against the zoned-device limits of @q.
 *
 * Returns BLK_STS_NOTSUPP when @q is not a zoned queue, BLK_STS_IOERR when
 * the bio does not start at the beginning of a sequential zone or is larger
 * than the limits allow, and BLK_STS_OK otherwise.  On success REQ_NOMERGE
 * is set on the bio.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
                                                 struct bio *bio)
{
        const sector_t start = bio->bi_iter.bi_sector;
        const int sectors = bio_sectors(bio);

        /* Zone append only makes sense on a zoned block device. */
        if (!blk_queue_is_zoned(q))
                return BLK_STS_NOTSUPP;

        /* The start sector has to sit exactly on a zone boundary ... */
        if (start & (blk_queue_zone_sectors(q) - 1))
                return BLK_STS_IOERR;

        /* ... and that zone has to be a sequential one. */
        if (!blk_queue_zone_is_seq(q, start))
                return BLK_STS_IOERR;

        /*
         * Two size limits apply.  The bio may not span a zone boundary,
         * since it would then be split and its pieces could land
         * non-contiguously in different zones; and it must be small enough
         * that it will not be split at all.
         */
        if (sectors > q->limits.chunk_sectors ||
            sectors > q->limits.max_zone_append_sectors)
                return BLK_STS_IOERR;

        bio->bi_opf |= REQ_NOMERGE;

        return BLK_STS_OK;
}

/*
 * Validate and prepare @bio before it is handed to a driver.
 *
 * Returns true when the bio passed every check and the caller should go on
 * to submit it.  Returns false when the caller must not submit it: either
 * the bio was completed here through bio_endio() with bi_status set
 * (BLK_STS_IOERR by default, BLK_STS_NOTSUPP for unsupported operations,
 * the blk_check_zone_append() status for zone appends, or BLK_STS_OK for an
 * empty flush that needs no work), or blk_throtl_bio() returned non-zero.
 */
static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
        struct request_queue *q = bio->bi_disk->queue;
        blk_status_t status = BLK_STS_IOERR;
        struct blk_plug *plug;

        might_sleep();

        /* A plug that asked for nowait makes every bio under it REQ_NOWAIT. */
        plug = blk_mq_plug(q, bio);
        if (plug && plug->nowait)
                bio->bi_opf |= REQ_NOWAIT;

        /*
         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
         * if queue does not support NOWAIT.
         */
        if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
                goto not_supported;

        if (should_fail_bio(bio))
                goto end_io;

        /*
         * Partition-relative bios are remapped to the whole disk (which
         * performs its own read-only and end-of-device checks); whole-disk
         * bios get those two checks directly.
         */
        if (bio->bi_partno) {
                if (unlikely(blk_partition_remap(bio)))
                        goto end_io;
        } else {
                if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
                        goto end_io;
                if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
                        goto end_io;
        }

        /*
         * Filter flush bio's early so that bio based drivers without flush
         * support don't have to worry about them.
         */
        if (op_is_flush(bio->bi_opf) &&
            !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
                bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
                /* A flush without data has nothing left to do: succeed. */
                if (!bio_sectors(bio)) {
                        status = BLK_STS_OK;
                        goto end_io;
                }
        }

        /* Drop the polling hint when the queue does not have polling set. */
        if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                bio->bi_opf &= ~REQ_HIPRI;

        /* Reject operations that this queue does not support. */
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
                if (!blk_queue_discard(q))
                        goto not_supported;
                break;
        case REQ_OP_SECURE_ERASE:
                if (!blk_queue_secure_erase(q))
                        goto not_supported;
                break;
        case REQ_OP_WRITE_SAME:
                if (!q->limits.max_write_same_sectors)
                        goto not_supported;
                break;
        case REQ_OP_ZONE_APPEND:
                /* The helper picks the status itself (NOTSUPP or IOERR). */
                status = blk_check_zone_append(q, bio);
                if (status != BLK_STS_OK)
                        goto end_io;
                break;
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                if (!blk_queue_is_zoned(q))
                        goto not_supported;
                break;
        case REQ_OP_ZONE_RESET_ALL:
                if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
                        goto not_supported;
                break;
        case REQ_OP_WRITE_ZEROES:
                if (!q->limits.max_write_zeroes_sectors)
                        goto not_supported;
                break;
        default:
                break;
        }

        /*
         * Various block parts want %current->io_context, so allocate it up
         * front rather than dealing with lots of pain to allocate it only
         * where needed. This may fail and the block layer knows how to live
         * with it.
         */
        if (unlikely(!current->io_context))
                create_task_io_context(current, GFP_ATOMIC, q->node);

        /*
         * NOTE(review): the bio is not ended on this path; presumably a
         * non-zero return means the throttling code now owns it - confirm.
         */
        if (blk_throtl_bio(bio))
                return false;

        blk_cgroup_bio_start(bio);
        blkcg_bio_issue_init(bio);

        if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
                trace_block_bio_queue(q, bio);
                /* Now that enqueuing has been traced, we need to trace
                 * completion as well.
                 */
                bio_set_flag(bio, BIO_TRACE_COMPLETION);
        }
        return true;

        /* Failure exits: not_supported falls through into end_io. */
not_supported:
        status = BLK_STS_NOTSUPP;
end_io:
        bio->bi_status = status;
        bio_endio(bio);
        return false;
}

/*
 * Hand one bio to its disk.
 *
 * When blk_crypto_bio_prep() returns false nothing is submitted.  Disks
 * without a ->submit_bio method go to blk_mq_submit_bio(), whose result is
 * returned directly.  On every other path blk_queue_exit() is called on the
 * disk's queue before returning.
 */
static blk_qc_t __submit_bio(struct bio *bio)
{
        /* Cached up front: blk_crypto_bio_prep() is passed &bio. */
        struct gendisk *disk = bio->bi_disk;
        blk_qc_t cookie;

        if (!blk_crypto_bio_prep(&bio)) {
                blk_queue_exit(disk->queue);
                return BLK_QC_T_NONE;
        }

        if (!disk->fops->submit_bio)
                return blk_mq_submit_bio(bio);

        cookie = disk->fops->submit_bio(bio);
        blk_queue_exit(disk->queue);
        return cookie;
}

/*
 * The loop in this function may be a bit non-obvious, and so deserves some
 * explanation:
 *
 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
 *    that), so we have a list with a single bio.
 *  - We pretend that we have just taken it off a longer list, so we assign
 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
 *    non-NULL value in bio_list and re-enter the loop from the top.
 *  - In this case we really did just take the bio off the top of the list (no
 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
 *    again.
 *
 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
 * bio_list_on_stack[1] contains bios that were submitted before the current
 *        ->submit_bio, but that haven't been processed yet.
 */
static blk_qc_t __submit_bio_noacct(struct bio *bio)
{
        struct bio_list bio_list_on_stack[2];
        blk_qc_t ret = BLK_QC_T_NONE;

        BUG_ON(bio->bi_next);

        /*
         * Publishing the on-stack lists in current->bio_list makes nested
         * submit_bio_noacct() calls queue their bios on list [0] instead of
         * recursing.
         */
        bio_list_init(&bio_list_on_stack[0]);
        current->bio_list = bio_list_on_stack;

        do {
                /* Remember this bio's queue: it defines the "same level". */
                struct request_queue *q = bio->bi_disk->queue;
                struct bio_list lower, same;

                /*
                 * "continue" in a do/while jumps to the loop condition, so
                 * the next pending bio is popped.  NOTE(review): presumably
                 * bio_queue_enter() has already completed the bio on
                 * failure - confirm.
                 */
                if (unlikely(bio_queue_enter(bio) != 0))
                        continue;

                /*
                 * Create a fresh bio_list for all subordinate requests.
                 */
                bio_list_on_stack[1] = bio_list_on_stack[0];
                bio_list_init(&bio_list_on_stack[0]);

                /* ret ends up holding the result of the last bio submitted. */
                ret = __submit_bio(bio);

                /*
                 * Sort new bios into those for a lower level and those for the
                 * same level.
                 */
                bio_list_init(&lower);
                bio_list_init(&same);
                while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
                        if (q == bio->bi_disk->queue)
                                bio_list_add(&same, bio);
                        else
                                bio_list_add(&lower, bio);

                /*
                 * Now assemble so we handle the lowest level first.
                 */
                bio_list_merge(&bio_list_on_stack[0], &lower);
                bio_list_merge(&bio_list_on_stack[0], &same);
                bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
        } while ((bio = bio_list_pop(&bio_list_on_stack[0])));

        /* The on-stack lists go out of scope: stop redirecting submissions. */
        current->bio_list = NULL;
        return ret;
}

/*
 * Submission loop for disks that have no ->submit_bio method.
 *
 * While the loop runs, current->bio_list points at an on-stack pair of
 * lists; bios are popped from list [0] until it is empty.  The value
 * returned is the result for the last bio processed.
 */
static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
{
        struct bio_list bio_list[2] = { };
        blk_qc_t cookie = BLK_QC_T_NONE;

        current->bio_list = bio_list;

        do {
                /* Cached up front: blk_crypto_bio_prep() is passed &bio. */
                struct gendisk *disk = bio->bi_disk;

                /* Could not enter the queue: move on to the next bio. */
                if (unlikely(bio_queue_enter(bio) != 0))
                        continue;

                if (blk_crypto_bio_prep(&bio)) {
                        cookie = blk_mq_submit_bio(bio);
                } else {
                        /* Nothing submitted: undo the queue enter. */
                        blk_queue_exit(disk->queue);
                        cookie = BLK_QC_T_NONE;
                }
        } while ((bio = bio_list_pop(&bio_list[0])));

        current->bio_list = NULL;
        return cookie;
}

/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 */
blk_qc_t submit_bio_noacct(struct bio *bio)
{
        if (!submit_bio_checks(bio))
                return BLK_QC_T_NONE;

        /*
         * We only want one ->submit_bio to be active at a time, else stack
         * usage with stacked devices could be a problem.  Use current->bio_list
         * to collect a list of requests submited by a ->submit_bio method while
         * it is active, and then process them after it returned.
         */
        if (current->bio_list) {
                bio_list_add(&current->bio_list[0], bio);
                return BLK_QC_T_NONE;
        }

        if (!bio->bi_disk->fops->submit_bio)
                return __submit_bio_noacct_mq(bio);
        return __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_disk and bi_partno fields.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
blk_qc_t submit_bio(struct bio *bio)
{
        if (blkcg_punt_bio_submit(bio))
                return BLK_QC_T_NONE;

        /*
         * If it's a regular read/write or a barrier with data attached,
         * go through the normal accounting stuff before submission.
         */
        if (bio_has_data(bio)) {
                unsigned int count;

                if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
                        count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
                else
                        count = bio_sectors(bio);

                if (op_is_write(bio_op(bio))) {
                        count_vm_events(PGPGOUT, count);
                } else {
                        task_io_account_read(bio->bi_iter.bi_size);
                        count_vm_events(PGPGIN, count);
                }

                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                        current->comm, task_pid_nr(current),
                                op_is_write(bio_op(bio)) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_iter.bi_sector,
                                bio_devname(bio, b), count);
                }
        }

        /*
         * If we're reading data that is part of the userspace workingset, count
         * submission time as memory stall.  When the device is congested, or
         * the submitting cgroup IO-throttled, submission can be a significant
         * part of overall IO time.
         */
        if (unlikely(bio_op(bio) == REQ_OP_READ &&
            bio_flagged(bio, BIO_WORKINGSET))) {
                unsigned long pflags;
                blk_qc_t ret;

                psi_memstall_enter(&pflags);
                ret = submit_bio_noacct(bio);
                psi_memstall_leave(&pflags);

                return ret;
        }

        return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

/**
 * blk_cloned_rq_check_limits - Helper function to check a cloned request
 *                              for the new queue limits
 * @q:  the queue
 * @rq: the request being checked
 *
 * Description:
 *    @rq may have been made based on weaker limitations of upper-level queues
 *    in request stacking drivers, and it may violate the limitation of @q.
 *    Since the block layer and the underlying device driver trust @rq
 *    after it is inserted to @q, it should be checked against @q before
 *    the insertion using this generic function.
 *
 *    Request stacking drivers like request-based dm may change the queue
 *    limits when retrying requests on other queues. Those requests need
 *    to be checked against the new queue limits again during dispatch.
 */
static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
                                      struct request *rq)
{
        const unsigned int limit = blk_queue_get_max_sectors(q, req_op(rq));

        if (blk_rq_sectors(rq) > limit) {
                /*
                 * A SCSI device has no good way to report whether Write
                 * Same/Zero is really supported.  When a device rejects a
                 * non-read/write command (discard, write same, etc.) the
                 * low-level driver sets the matching queue limit to 0 so
                 * that blk-lib stops issuing it.  Commands queued before
                 * the limit was reset must complete with BLK_STS_NOTSUPP so
                 * that no I/O error reaches the upper layers.
                 */
                if (!limit)
                        return BLK_STS_NOTSUPP;

                printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
                        __func__, blk_rq_sectors(rq), limit);
                return BLK_STS_IOERR;
        }

        /*
         * Settings that affect segment counting (q->bounce_pfn and the
         * like) may differ between stacked queues, so recount the segments
         * for this queue before comparing against its limit.
         */
        rq->nr_phys_segments = blk_recalc_rq_segments(rq);
        if (rq->nr_phys_segments <= queue_max_segments(q))
                return BLK_STS_OK;

        printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
                __func__, rq->nr_phys_segments, queue_max_segments(q));
        return BLK_STS_IOERR;
}

/**
 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
 * @q:  the queue to submit the request
 * @rq: the request being queued
 */
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
        blk_status_t sts = blk_cloned_rq_check_limits(q, rq);

        if (sts != BLK_STS_OK)
                return sts;

        /*
         * should_fail_request() is consulted only when the request has a
         * disk; either that check or the crypto one failing is reported as
         * an I/O error.
         */
        if ((rq->rq_disk &&
             should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) ||
            blk_crypto_insert_cloned_request(rq))
                return BLK_STS_IOERR;

        if (blk_queue_io_stat(q))
                blk_account_io_start(rq);

        /*
         * The top device already has a scheduler attached, so bypass any
         * scheduler on the bottom device when inserting.
         */
        return blk_mq_request_issue_directly(rq, true);
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);

/**
 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
 * @rq: request to examine
 *
 * Description:
 *     A request could be a merge of IOs which require different failure
 *     handling.  This function determines the number of bytes which
 *     can be failed from the beginning of the request without
 *     crossing into an area which needs to be retried further.
 *
 * Return:
 *     The number of bytes to fail.
 */
unsigned int blk_rq_err_bytes(const struct request *rq)
{
        const unsigned int failfast = rq->cmd_flags & REQ_FAILFAST_MASK;
        unsigned int err_bytes = 0;
        struct bio *bio;

        /* Without a mixed merge the whole request fails as one unit. */
        if (!(rq->rq_flags & RQF_MIXED_MERGE))
                return blk_rq_bytes(rq);

        /*
         * Right now the only "mixing" possible is between different
         * failfast types.  Leading bios that carry every failfast bit of
         * the request - i.e. that are at least as eager to fail - can
         * safely be failed; stop at the first bio that is not.
         */
        for (bio = rq->bio; bio && (bio->bi_opf & failfast) == failfast;
             bio = bio->bi_next)
                err_bytes += bio->bi_iter.bi_size;

        /* this could lead to infinite loop */
        BUG_ON(blk_rq_bytes(rq) && !err_bytes);
        return err_bytes;
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);

/*
 * Advance the io_ticks statistic of @part and, when @part is a partition
 * (non-zero partno), of the whole-disk part0 as well.
 *
 * The stamp is moved to @now with cmpxchg(), and only the caller whose
 * exchange succeeds adds ticks: the full interval since the old stamp when
 * @end is true, a single tick otherwise.
 */
static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
{
        for (;;) {
                unsigned long stamp = READ_ONCE(part->stamp);

                if (unlikely(stamp != now) &&
                    likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
                        __part_stat_add(part, io_ticks, end ? now - stamp : 1);
                }

                if (!part->partno)
                        break;

                /* Repeat once more for the disk this partition belongs to. */
                part = &part_to_disk(part)->part0;
        }
}

/*
 * Add @bytes, converted to 512-byte sectors, to the per-partition sector
 * statistics of @req.  Nothing is done for requests without a partition or
 * for which blk_do_io_stat() is false.
 */
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
        struct hd_struct *part;
        int grp;

        if (!req->part || !blk_do_io_stat(req))
                return;

        grp = op_stat_group(req_op(req));

        part_stat_lock();
        part = req->part;
        part_stat_add(part, sectors[grp], bytes >> 9);
        part_stat_unlock();
}

/*
 * Account the completion of @req at time @now (nanoseconds): update
 * io_ticks, bump the I/O count and add the elapsed time since
 * req->start_time_ns, then hd_struct_put() the partition.
 */
void blk_account_io_done(struct request *req, u64 now)
{
        struct hd_struct *part;
        int grp;

        /*
         * flush_rq is accounted as a normal IO neither on queueing nor on
         * completion; accounting the containing request is enough.
         */
        if (!req->part || !blk_do_io_stat(req) ||
            (req->rq_flags & RQF_FLUSH_SEQ))
                return;

        grp = op_stat_group(req_op(req));

        part_stat_lock();
        part = req->part;

        update_io_ticks(part, jiffies, true);
        part_stat_inc(part, ios[grp]);
        part_stat_add(part, nsecs[grp], now - req->start_time_ns);
        part_stat_unlock();

        hd_struct_put(part);
}

/*
 * Begin I/O accounting for @rq: look up the partition covering the
 * request's start sector, store it in rq->part, and update that
 * partition's io_ticks.  Does nothing when blk_do_io_stat() is false.
 */
void blk_account_io_start(struct request *rq)
{
        if (blk_do_io_stat(rq)) {
                rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));

                part_stat_lock();
                update_io_ticks(rq->part, jiffies, false);
                part_stat_unlock();
        }
}

/*
 * Account the start of an I/O of @sectors sectors and operation @op on
 * @part: update io_ticks, count the I/O and its sectors, and raise the
 * in-flight counter for the read or write direction.
 *
 * Returns the jiffies value sampled at the start.
 */
static unsigned long __part_start_io_acct(struct hd_struct *part,
                                          unsigned int sectors, unsigned int op)
{
        const unsigned long start = READ_ONCE(jiffies);
        const int grp = op_stat_group(op);

        part_stat_lock();
        update_io_ticks(part, start, false);
        part_stat_inc(part, ios[grp]);
        part_stat_add(part, sectors[grp], sectors);
        part_stat_local_inc(part, in_flight[op_is_write(op)]);
        part_stat_unlock();

        return start;
}

/*
 * Start I/O accounting for @bio on the partition of @disk that covers the
 * bio's start sector.  That partition is stored in *@part.  Returns the
 * start timestamp produced by __part_start_io_acct().
 */
unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
                                 struct bio *bio)
{
        struct hd_struct *found;

        found = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
        *part = found;

        return __part_start_io_acct(found, bio_sectors(bio), bio_op(bio));
}
EXPORT_SYMBOL_GPL(part_start_io_acct);

/*
 * Start I/O accounting against the whole-disk part0 of @disk.  Returns the
 * start timestamp produced by __part_start_io_acct().
 */
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
                                 unsigned int op)
{
        struct hd_struct *whole = &disk->part0;

        return __part_start_io_acct(whole, sectors, op);
}
EXPORT_SYMBOL(disk_start_io_acct);

/*
 * Account the end of an I/O with operation @op on @part: update io_ticks,
 * add the time elapsed since @start_time (in jiffies, converted to
 * nanoseconds) and lower the in-flight counter for the read or write
 * direction.
 */
static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
                               unsigned long start_time)
{
        const unsigned long now = READ_ONCE(jiffies);
        const int grp = op_stat_group(op);

        part_stat_lock();
        update_io_ticks(part, now, true);
        part_stat_add(part, nsecs[grp], jiffies_to_nsecs(now - start_time));
        part_stat_local_dec(part, in_flight[op_is_write(op)]);
        part_stat_unlock();
}

/*
 * Finish the I/O accounting of @bio on @part, then hd_struct_put() the
 * partition.
 */
void part_end_io_acct(struct hd_struct *part, struct bio *bio,
                      unsigned long start_time)
{
        const unsigned int op = bio_op(bio);

        __part_end_io_acct(part, op, start_time);
        hd_struct_put(part);
}
EXPORT_SYMBOL_GPL(part_end_io_acct);

/* Finish I/O accounting against the whole-disk part0 of @disk. */
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
                      unsigned long start_time)
{
        struct hd_struct *whole = &disk->part0;

        __part_end_io_acct(whole, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct);

/*
 * Steal bios from a request and add them to a bio list.
 * The request must not have been partially completed before.
 */
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
        struct bio *first = rq->bio;

        if (first) {
                /* Append the request's whole bio chain to the list tail. */
                if (!list->tail)
                        list->head = first;
                else
                        list->tail->bi_next = first;
                list->tail = rq->biotail;

                /* The request no longer owns any bio. */
                rq->bio = NULL;
                rq->biotail = NULL;
        }

        rq->__data_len = 0;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);

/**
 * blk_update_request - Special helper function for request stacking drivers
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     This special helper function is only for request stacking drivers
 *     (e.g. request-based dm) so that they can handle partial completion.
 *     Actual device drivers should use blk_mq_end_request instead.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *        The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
 *        blk_rq_bytes() and in blk_update_request().
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
                unsigned int nr_bytes)
{
        /* Bytes actually completed by the loop below. */
        int total_bytes;

        trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);

        /* No bio attached: there is no data left to complete. */
        if (!req->bio)
                return false;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        /* Integrity completion hook: successful reads only. */
        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
            error == BLK_STS_OK)
                req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif

        /*
         * Upper layers may call blk_crypto_evict_key() anytime after the last
         * bio_endio().  Therefore, the keyslot must be released before that.
         */
        if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
                __blk_crypto_rq_put_keyslot(req);

        /* Log the error unless the request is passthrough or marked quiet. */
        if (unlikely(error && !blk_rq_is_passthrough(req) &&
                     !(req->rq_flags & RQF_QUIET)))
                print_req_error(req, error, __func__);

        blk_account_io_completion(req, nr_bytes);

        /*
         * Walk the bios front to back, ending up to nr_bytes in total.  A
         * bio is unlinked from the request only when the remaining byte
         * count covers it entirely; a partially covered bio stays at the
         * head of the request.
         */
        total_bytes = 0;
        while (req->bio) {
                struct bio *bio = req->bio;
                unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

                if (bio_bytes == bio->bi_iter.bi_size)
                        req->bio = bio->bi_next;

                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
                req_bio_endio(req, bio, bio_bytes, error);

                total_bytes += bio_bytes;
                nr_bytes -= bio_bytes;

                if (!nr_bytes)
                        break;
        }

        /*
         * completely done
         */
        if (!req->bio) {
                /*
                 * Reset counters so that the request stacking driver
                 * can find how many bytes remain in the request
                 * later.
                 */
                req->__data_len = 0;
                return false;
        }

        /* Partial completion: shrink the request by what was finished. */
        req->__data_len -= total_bytes;

        /* update sector only for requests with clear definition of sector */
        if (!blk_rq_is_passthrough(req))
                req->__sector += total_bytes >> 9;

        /* mixed attributes always follow the first bio */
        if (req->rq_flags & RQF_MIXED_MERGE) {
                req->cmd_flags &= ~REQ_FAILFAST_MASK;
                req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
        }

        if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
                /*
                 * If total number of sectors is less than the first segment
                 * size, something has gone terribly wrong.
                 */
                if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
                        blk_dump_rq_flags(req, "request botched");
                        req->__data_len = blk_rq_cur_bytes(req);
                }

                /* recalculate the number of segments */
                req->nr_phys_segments = blk_recalc_rq_segments(req);
        }

        /* Data remains in the request. */
        return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
 * rq_flush_dcache_pages - Helper function to flush all pages in a request
 * @rq: the request to be flushed
 *
 * Description:
 *     Flush all pages in @rq.
 */
void rq_flush_dcache_pages(struct request *rq)
{
        struct req_iterator iter;
        struct bio_vec bvec;

        rq_for_each_segment(bvec, rq, iter)
                flush_dcache_page(bvec.bv_page);
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    If the drivers want to export their busy state, they must set own
 *    exporting function using blk_queue_lld_busy() first.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy.  This behavior helps more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
        /* Queues that are not mq, or have no ->busy hook, are never busy. */
        if (!queue_is_mq(q) || !q->mq_ops->busy)
                return 0;

        return q->mq_ops->busy(q);
}
EXPORT_SYMBOL_GPL(blk_lld_busy);

/**
 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
 * @rq: the clone request to be cleaned up
 *
 * Description:
 *     Free all bios in @rq for a cloned request.
 */
void blk_rq_unprep_clone(struct request *rq)
{
        struct bio *bio = rq->bio;

        /* Unlink each bio from the head of the chain before putting it. */
        while (bio) {
                rq->bio = bio->bi_next;
                bio_put(bio);
                bio = rq->bio;
        }
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);

/**
 * blk_rq_prep_clone - Helper function to setup clone request
 * @rq: the request to be setup
 * @rq_src: original request to be cloned
 * @bs: bio_set that bios for clone are allocated from
 * @gfp_mask: memory allocation mask for bio
 * @bio_ctr: setup function to be called for each clone bio.
 *           Returns %0 for success, non %0 for failure.
 * @data: private data to be passed to @bio_ctr
 *
 * Description:
 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
 *     Also, pages which the original bios are pointing to are not copied
 *     and the cloned bios just point same pages.
 *     So cloned bios must be completed before original bios, which means
 *     the caller must complete @rq before @rq_src.
 */
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                      struct bio_set *bs, gfp_t gfp_mask,
                      int (*bio_ctr)(struct bio *, struct bio *, void *),
                      void *data)
{
        struct bio *bio, *bio_src;

        /* Use the default fs_bio_set when the caller supplied none. */
        if (!bs)
                bs = &fs_bio_set;

        __rq_for_each_bio(bio_src, rq_src) {
                bio = bio_clone_fast(bio_src, gfp_mask, bs);
                if (!bio)
                        goto free_and_out;

                if (bio_ctr && bio_ctr(bio, bio_src, data))
                        goto free_and_out;

                /* Append the clone to the chain of @rq. */
                if (!rq->bio)
                        rq->bio = bio;
                else
                        rq->biotail->bi_next = bio;
                rq->biotail = bio;

                /*
                 * The clone is now reachable through rq->bio; clearing the
                 * local pointer stops the error path from putting it twice.
                 */
                bio = NULL;
        }

        /* Mirror the attributes of the source request onto the clone. */
        rq->__sector = blk_rq_pos(rq_src);
        rq->__data_len = blk_rq_bytes(rq_src);
        if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
                rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
                rq->special_vec = rq_src->special_vec;
        }
        rq->nr_phys_segments = rq_src->nr_phys_segments;
        rq->ioprio = rq_src->ioprio;

        if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
                goto free_and_out;

        return 0;

free_and_out:
        /* Put a clone not yet linked into @rq, then all that were linked. */
        if (bio)
                bio_put(bio);
        blk_rq_unprep_clone(rq);

        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);

/* Queue @work on kblockd_workqueue and return queue_work()'s result. */
int kblockd_schedule_work(struct work_struct *work)
{
        int queued = queue_work(kblockd_workqueue, work);

        return queued;
}
EXPORT_SYMBOL(kblockd_schedule_work);

/*
 * Apply mod_delayed_work_on() to @dwork on kblockd_workqueue for @cpu with
 * @delay, and return its result.
 */
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
                                unsigned long delay)
{
        int modified = mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);

        return modified;
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);

/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:        The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
        struct task_struct *task = current;

        /* A nested plug is left untouched; the outer plug stays in effect. */
        if (task->plug)
                return;

        plug->rq_count = 0;
        plug->multiple_queues = false;
        plug->nowait = false;
        INIT_LIST_HEAD(&plug->mq_list);
        INIT_LIST_HEAD(&plug->cb_list);

        /*
         * No store ordering should be needed for this assignment: a
         * potential preempt implies a full memory barrier.
         */
        task->plug = plug;
}
EXPORT_SYMBOL(blk_start_plug);

/*
 * Run every callback registered on @plug.
 *
 * The callback list is spliced onto a private list before the callbacks
 * run, and the outer loop repeats until plug->cb_list stays empty, so
 * entries added while callbacks run are picked up as well.
 */
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
        LIST_HEAD(pending);

        while (!list_empty(&plug->cb_list)) {
                list_splice_init(&plug->cb_list, &pending);

                while (!list_empty(&pending)) {
                        struct blk_plug_cb *cb;

                        cb = list_first_entry(&pending, struct blk_plug_cb,
                                              list);
                        /* Unlink the entry before its callback runs. */
                        list_del(&cb->list);
                        cb->callback(cb, from_schedule);
                }
        }
}

/*
 * blk_check_plugged - find or register an unplug callback on the current plug
 * @unplug: callback function to look up or register
 * @data:   opaque cookie stored alongside the callback
 * @size:   number of bytes to allocate for a new entry; must be at least
 *          sizeof(struct blk_plug_cb)
 *
 * Returns NULL if the current task has no plug or if the allocation fails.
 * If an entry with the same @unplug and @data is already queued on the
 * plug, that entry is returned.  Otherwise a zeroed entry of @size bytes is
 * allocated with GFP_ATOMIC, filled in, added to the plug's callback list
 * and returned.
 */
struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
                                      int size)
{
        struct blk_plug *plug = current->plug;
        struct blk_plug_cb *entry;

        if (plug == NULL)
                return NULL;

        /* Reuse an entry that is already registered for this pair. */
        list_for_each_entry(entry, &plug->cb_list, list) {
                if (entry->callback != unplug || entry->data != data)
                        continue;
                return entry;
        }

        /* Not currently on the callback list */
        BUG_ON(size < sizeof(*entry));
        entry = kzalloc(size, GFP_ATOMIC);
        if (entry == NULL)
                return NULL;

        entry->data = data;
        entry->callback = unplug;
        list_add(&entry->list, &plug->cb_list);
        return entry;
}
EXPORT_SYMBOL(blk_check_plugged);

/*
 * Flush everything held in @plug: run the registered callbacks first, then
 * hand the request list to blk_mq_flush_plug_list() if it is not empty.
 * @from_schedule is passed through unchanged to both.
 */
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
        flush_plug_callbacks(plug, from_schedule);

        if (list_empty(&plug->mq_list))
                return;

        blk_mq_flush_plug_list(plug, from_schedule);
}

/**
 * blk_finish_plug - mark the end of a batch of submitted I/O
 * @plug:        The &struct blk_plug passed to blk_start_plug()
 *
 * Description:
 * Signals that the caller has finished submitting its batch of I/O.  Every
 * call must be paired with an earlier blk_start_plug().  If @plug is not the
 * plug currently installed on the task (for example because the matching
 * blk_start_plug() was a nested call that installed nothing), this is a
 * no-op.  See the documentation for blk_start_plug() for more information.
 */
void blk_finish_plug(struct blk_plug *plug)
{
        if (plug == current->plug) {
                blk_flush_plug_list(plug, false);
                current->plug = NULL;
        }
}
EXPORT_SYMBOL(blk_finish_plug);

/*
 * Sleep waiting for I/O.  The sleep is bounded to half of the hung-task
 * timeout so that the hang_check timer does not fire at us during very long
 * I/O; when that bound works out to zero, sleep without a timeout.
 */
void blk_io_schedule(void)
{
        unsigned long half_hung_timeout =
                sysctl_hung_task_timeout_secs * HZ / 2;

        if (!half_hung_timeout) {
                io_schedule();
                return;
        }

        io_schedule_timeout(half_hung_timeout);
}
EXPORT_SYMBOL_GPL(blk_io_schedule);

/*
 * Boot-time initialisation of the block core: checks at build time that the
 * request op/flag bit layout fits its fields, then creates the kblockd
 * workqueue, the request_queue slab cache and the "block" debugfs directory.
 * Always returns 0; failure to create kblockd panics instead.
 */
int __init blk_dev_init(void)
{
        /* Every REQ_OP_* value must be representable in REQ_OP_BITS bits. */
        BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
        /* Op bits plus flag bits must fit in request->cmd_flags ... */
        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
                        sizeof_field(struct request, cmd_flags));
        /* ... and in bio->bi_opf. */
        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
                        sizeof_field(struct bio, bi_opf));

        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
        kblockd_workqueue = alloc_workqueue("kblockd",
                                            WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");

        /* No NULL check here: the cache is created with SLAB_PANIC. */
        blk_requestq_cachep = kmem_cache_create("request_queue",
                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

        /* The return value is stored without being checked. */
        blk_debugfs_root = debugfs_create_dir("block", NULL);

        return 0;
}








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PACKET_INTERNAL_H__
#define __PACKET_INTERNAL_H__

#include <linux/refcount.h>

/*
 * One node of a singly linked list (chained through @next); struct
 * packet_sock points at the head of such a list via its @mclist member.
 * NOTE(review): presumably one entry per multicast/promiscuous membership
 * requested on the socket - confirm against af_packet.c.
 */
struct packet_mclist {
        struct packet_mclist        *next;
        int                        ifindex;
        int                        count;
        unsigned short                type;
        unsigned short                alen;
        /* Address bytes; presumably only the first @alen are meaningful. */
        unsigned char                addr[MAX_ADDR_LEN];
};

/*
 * kbdq - kernel block descriptor queue
 *
 * State for a block-based ring.  It is embedded in struct
 * packet_ring_buffer as @prb_bdqc, sharing storage with @rx_owner_map
 * through an anonymous union.
 */
struct tpacket_kbdq_core {
        struct pgv        *pkbdq;
        unsigned int        feature_req_word;
        unsigned int        hdrlen;
        unsigned char        reset_pending_on_curr_blk;
        unsigned char   delete_blk_timer;
        unsigned short        kactive_blk_num;
        unsigned short        blk_sizeof_priv;

        /* last_kactive_blk_num:
         * trick to see if user-space has caught up
         * in order to avoid refreshing timer when every single pkt arrives.
         */
        unsigned short        last_kactive_blk_num;

        char                *pkblk_start;
        char                *pkblk_end;
        int                kblk_size;
        unsigned int        max_frame_len;
        unsigned int        knum_blocks;
        uint64_t        knxt_seq_num;
        char                *prev;
        char                *nxt_offset;
        struct sk_buff        *skb;

        rwlock_t        blk_fill_in_prog_lock;

        /* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV        (8)

        /* NOTE(review): presumably the retire timeout in ms - confirm. */
        unsigned short  retire_blk_tov;
        unsigned short  version;
        /* NOTE(review): presumably retire_blk_tov in jiffies - confirm. */
        unsigned long        tov_in_jiffies;

        /* timer to retire an outstanding block */
        struct timer_list retire_blk_timer;
};

/*
 * Wrapper around a single buffer pointer.  Rings refer to arrays of these
 * (see @pg_vec in struct packet_ring_buffer).
 */
struct pgv {
        char *buffer;
};

/*
 * Description of one packet ring.  struct packet_sock embeds two of these,
 * @rx_ring and @tx_ring.
 */
struct packet_ring_buffer {
        /* Array of buffer descriptors backing the ring. */
        struct pgv                *pg_vec;

        unsigned int                head;
        unsigned int                frames_per_block;
        unsigned int                frame_size;
        unsigned int                frame_max;

        unsigned int                pg_vec_order;
        unsigned int                pg_vec_pages;
        unsigned int                pg_vec_len;

        /* Per-CPU counter. */
        unsigned int __percpu        *pending_refcnt;

        /*
         * These two members share storage, so only one of them is meaningful
         * for a given ring; which one is decided by code outside this header.
         */
        union {
                unsigned long                        *rx_owner_map;
                struct tpacket_kbdq_core        prb_bdqc;
        };
};

extern struct mutex fanout_mutex;
#define PACKET_FANOUT_MAX        (1 << 16)

/*
 * A fanout group.  The sockets in the group are kept in the trailing
 * flexible array @arr of RCU-protected pointers.
 * NOTE(review): @arr is presumably sized by @max_num_members with
 * @num_members entries in use - confirm against af_packet.c.
 */
struct packet_fanout {
        possible_net_t                net;
        unsigned int                num_members;
        u32                        max_num_members;
        u16                        id;
        u8                        type;
        u8                        flags;
        /*
         * These two share storage; presumably @type selects which one is in
         * use - confirm.
         */
        union {
                atomic_t                rr_cur;
                struct bpf_prog __rcu        *bpf_prog;
        };
        struct list_head        list;
        spinlock_t                lock;
        refcount_t                sk_ref;
        /* Starts on its own cache line on SMP builds. */
        struct packet_type        prot_hook ____cacheline_aligned_in_smp;
        struct sock        __rcu        *arr[];
};

/*
 * Rollover state referenced from struct packet_sock through its @rollover
 * pointer.  The whole structure is cache-line aligned on SMP builds.
 */
struct packet_rollover {
        int                        sock;
        /* Counters; NOTE(review): exact meaning of each is defined by the users. */
        atomic_long_t                num;
        atomic_long_t                num_huge;
        atomic_long_t                num_failed;
        /* @history holds exactly one L1 cache line worth of u32 entries. */
#define ROLLOVER_HLEN        (L1_CACHE_BYTES / sizeof(u32))
        u32                        history[ROLLOVER_HLEN] ____cacheline_aligned;
} ____cacheline_aligned_in_smp;

/*
 * Per-socket state of a packet socket.  It begins with the generic struct
 * sock, which is what allows pkt_sk() to convert a struct sock pointer with
 * a plain cast.
 */
struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock                sk;
        struct packet_fanout        *fanout;
        union  tpacket_stats_u        stats;
        struct packet_ring_buffer        rx_ring;
        struct packet_ring_buffer        tx_ring;
        int                        copy_thresh;
        spinlock_t                bind_lock;
        struct mutex                pg_vec_lock;
        /*
         * Bit set indexed by enum packet_sock_flags; accessed through
         * packet_sock_flag_set() and packet_sock_flag().
         */
        unsigned long                flags;
        unsigned int                running;        /* bind_lock must be held */
        unsigned int                has_vnet_hdr:1, /* writer must hold sock lock */
                                tp_loss:1,
                                tp_tx_has_off:1;
        int                        pressure;
        int                        ifindex;        /* bound device                */
        __be16                        num;
        struct packet_rollover        *rollover;
        /* Head of a list of struct packet_mclist nodes. */
        struct packet_mclist        *mclist;
        atomic_long_t                mapped;
        enum tpacket_versions        tp_version;
        unsigned int                tp_hdrlen;
        unsigned int                tp_reserve;
        unsigned int                tp_tstamp;
        struct completion        skb_completion;
        struct net_device __rcu        *cached_dev;
        int                        (*xmit)(struct sk_buff *skb);
        /* The next two members each start on their own cache line on SMP. */
        struct packet_type        prot_hook ____cacheline_aligned_in_smp;
        atomic_t                tp_drops ____cacheline_aligned_in_smp;
};

/*
 * pkt_sk - view a struct sock as the struct packet_sock that contains it
 * @sk: socket that is embedded in a struct packet_sock
 *
 * The plain cast is valid because struct sock is the first member of
 * struct packet_sock.
 *
 * Marked inline because the definition lives in a header: a non-inline
 * static function would be emitted in every translation unit that includes
 * this file and would trigger an unused-function warning wherever it is not
 * called.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

/*
 * Bit numbers within packet_sock::flags.  The enumerators are passed
 * directly to set_bit()/clear_bit()/test_bit() by the helpers below.
 */
enum packet_sock_flags {
        PACKET_SOCK_ORIGDEV,
        PACKET_SOCK_AUXDATA,
};

/*
 * Set (@val true) or clear (@val false) bit @flag in @po->flags using the
 * set_bit()/clear_bit() bit operations.
 */
static inline void packet_sock_flag_set(struct packet_sock *po,
                                        enum packet_sock_flags flag,
                                        bool val)
{
        unsigned long *bits = &po->flags;

        if (!val) {
                clear_bit(flag, bits);
                return;
        }

        set_bit(flag, bits);
}

/* Report whether bit @flag is currently set in @po->flags. */
static inline bool packet_sock_flag(const struct packet_sock *po,
                                    enum packet_sock_flags flag)
{
        const unsigned long *bits = &po->flags;

        return test_bit(flag, bits);
}

#endif



































































































































































































































    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/bad_inode.c
 *
 *  Copyright (C) 1997, Stephen Tweedie
 *
 *  Provide stub functions for unreadable inodes
 *
 *  Fabian Frederick : August 2003 - All file operations assigned to EIO
 */

#include <linux/fs.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/fiemap.h>

/* ->open stub for bad inodes: opening always fails with -EIO. */
static int bad_file_open(struct inode *inode,
                         struct file *filp)
{
        return -EIO;
}

/*
 * File operations installed on bad inodes by make_bad_inode().  Only ->open
 * is provided, and it fails with -EIO.
 */
static const struct file_operations bad_file_ops = {
        .open = bad_file_open,
};

/* ->create stub for bad inodes: always fails with -EIO. */
static int bad_inode_create(struct inode *dir,
                            struct dentry *dentry,
                            umode_t mode,
                            bool excl)
{
        return -EIO;
}

/* ->lookup stub for bad inodes: always returns ERR_PTR(-EIO). */
static struct dentry *bad_inode_lookup(struct inode *dir,
                                       struct dentry *dentry,
                                       unsigned int flags)
{
        return ERR_PTR(-EIO);
}

/* ->link stub for bad inodes: always fails with -EIO. */
static int bad_inode_link(struct dentry *old_dentry,
                          struct inode *dir,
                          struct dentry *dentry)
{
        return -EIO;
}

/* ->unlink stub for bad inodes: always fails with -EIO. */
static int bad_inode_unlink(struct inode *dir,
                            struct dentry *dentry)
{
        return -EIO;
}

/* ->symlink stub for bad inodes: always fails with -EIO. */
static int bad_inode_symlink(struct inode *dir,
                             struct dentry *dentry,
                             const char *symname)
{
        return -EIO;
}

/* ->mkdir stub for bad inodes: always fails with -EIO. */
static int bad_inode_mkdir(struct inode *dir,
                           struct dentry *dentry,
                           umode_t mode)
{
        return -EIO;
}

/* ->rmdir stub for bad inodes: always fails with -EIO. */
static int bad_inode_rmdir(struct inode *dir,
                           struct dentry *dentry)
{
        return -EIO;
}

/* ->mknod stub for bad inodes: always fails with -EIO. */
static int bad_inode_mknod(struct inode *dir,
                           struct dentry *dentry,
                           umode_t mode,
                           dev_t rdev)
{
        return -EIO;
}

/* ->rename stub for bad inodes: always fails with -EIO. */
static int bad_inode_rename2(struct inode *old_dir,
                             struct dentry *old_dentry,
                             struct inode *new_dir,
                             struct dentry *new_dentry,
                             unsigned int flags)
{
        return -EIO;
}

/* ->readlink stub for bad inodes: always fails with -EIO. */
static int bad_inode_readlink(struct dentry *dentry,
                              char __user *buffer,
                              int buflen)
{
        return -EIO;
}

/* ->permission stub for bad inodes: every access check fails with -EIO. */
static int bad_inode_permission(struct inode *inode,
                                int mask)
{
        return -EIO;
}

/* ->getattr stub for bad inodes: always fails with -EIO. */
static int bad_inode_getattr(const struct path *path,
                             struct kstat *stat,
                             u32 request_mask,
                             unsigned int query_flags)
{
        return -EIO;
}

/* ->setattr stub for bad inodes: always fails with -EIO. */
static int bad_inode_setattr(struct dentry *direntry,
                             struct iattr *attrs)
{
        return -EIO;
}

/* ->listxattr stub for bad inodes: always fails with -EIO. */
static ssize_t bad_inode_listxattr(struct dentry *dentry,
                                   char *buffer,
                                   size_t buffer_size)
{
        return -EIO;
}

/* ->get_link stub for bad inodes: always returns ERR_PTR(-EIO). */
static const char *bad_inode_get_link(struct dentry *dentry,
                                      struct inode *inode,
                                      struct delayed_call *done)
{
        return ERR_PTR(-EIO);
}

/* ->get_acl stub for bad inodes: always returns ERR_PTR(-EIO). */
static struct posix_acl *bad_inode_get_acl(struct inode *inode,
                                           int type)
{
        return ERR_PTR(-EIO);
}

/* ->fiemap stub for bad inodes: always fails with -EIO. */
static int bad_inode_fiemap(struct inode *inode,
                            struct fiemap_extent_info *fieinfo,
                            u64 start,
                            u64 len)
{
        return -EIO;
}

/* ->update_time stub for bad inodes: always fails with -EIO. */
static int bad_inode_update_time(struct inode *inode,
                                 struct timespec64 *time,
                                 int flags)
{
        return -EIO;
}

/* ->atomic_open stub for bad inodes: always fails with -EIO. */
static int bad_inode_atomic_open(struct inode *inode,
                                 struct dentry *dentry,
                                 struct file *file,
                                 unsigned int open_flag,
                                 umode_t create_mode)
{
        return -EIO;
}

/* ->tmpfile stub for bad inodes: always fails with -EIO. */
static int bad_inode_tmpfile(struct inode *inode,
                             struct dentry *dentry,
                             umode_t mode)
{
        return -EIO;
}

/* ->set_acl stub for bad inodes: always fails with -EIO. */
static int bad_inode_set_acl(struct inode *inode,
                             struct posix_acl *acl,
                             int type)
{
        return -EIO;
}

static const struct inode_operations bad_inode_ops =
{
        .create                = bad_inode_create,
        .lookup                = bad_inode_lookup,
        .link                = bad_inode_link,
        .unlink                = bad_inode_unlink,
        .symlink        = bad_inode_symlink,
        .mkdir                = bad_inode_mkdir,
        .rmdir                = bad_inode_rmdir,
        .mknod                = bad_inode_mknod,
        .rename                = bad_inode_rename2,
        .readlink        = bad_inode_readlink,
        .permission        = bad_inode_permission,
        .getattr        = bad_inode_getattr,
        .setattr        = bad_inode_setattr,
        .listxattr        = bad_inode_listxattr,
        .get_link        = bad_inode_get_link,
        .get_acl        = bad_inode_get_acl,
        .fiemap                = bad_inode_fiemap,
        .update_time        = bad_inode_update_time,
        .atomic_open        = bad_inode_atomic_open,
        .tmpfile        = bad_inode_tmpfile,
        .set_acl        = bad_inode_set_acl,
};


/*
 * When a filesystem is unable to read an inode due to an I/O error in
 * its read_inode() function, it can call make_bad_inode() to return a
 * set of stubs which will return EIO errors as required. 
 *
 * We only need to do limited initialisation: all other fields are
 * preinitialised to zero automatically.
 */
 
/**
 *        make_bad_inode - mark an inode bad due to an I/O error
 *        @inode: Inode to mark bad
 *
 *        When an inode cannot be read due to a media or remote network
 *        failure this function makes the inode "bad" and causes I/O operations
 *        on it to fail from this point on.
 */
 
void make_bad_inode(struct inode *inode)
{
        remove_inode_hash(inode);

        inode->i_mode = S_IFREG;
        inode->i_atime = inode->i_mtime = inode->i_ctime =
                current_time(inode);
        inode->i_op = &bad_inode_ops;        
        inode->i_opflags &= ~IOP_XATTR;
        inode->i_fop = &bad_file_ops;        
}
EXPORT_SYMBOL(make_bad_inode);

/*
 * This tests whether an inode has been flagged as bad. The test uses
 * &bad_inode_ops to cover the case of invalidated inodes as well as
 * those created by make_bad_inode() above.
 */
 
/**
 *        is_bad_inode - is an inode errored
 *        @inode: inode to test
 *
 *        Returns true if @inode currently uses the bad_inode_ops operation
 *        table, i.e. it has been marked as bad.
 */
bool is_bad_inode(struct inode *inode)
{
        const struct inode_operations *ops = inode->i_op;

        return ops == &bad_inode_ops;
}

EXPORT_SYMBOL(is_bad_inode);

/**
 * iget_failed - Mark an under-construction inode as dead and release it
 * @inode: The inode to discard
 *
 * Mark an under-construction inode as dead and release it.  The inode is
 * first turned into a bad inode, then unlocked with unlock_new_inode(), and
 * finally a reference to it is dropped with iput().
 */
void iget_failed(struct inode *inode)
{
        make_bad_inode(inode);
        unlock_new_inode(inode);
        iput(inode);
}
EXPORT_SYMBOL(iget_failed);











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PVCLOCK_H
#define _ASM_X86_PVCLOCK_H

#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>

/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
                            struct pvclock_vcpu_time_info *vcpu,
                            struct timespec64 *ts);
void pvclock_resume(void);

void pvclock_touch_watchdogs(void);

/*
 * Begin a read of @src: sample its version field with the low bit cleared
 * and return it for a later pvclock_read_retry() call.
 *
 * Because the low bit is cleared here, an odd version sampled at this point
 * can only compare equal in pvclock_read_retry() if the field has changed in
 * the meantime.
 * NOTE(review): presumably an odd version means an update is in progress -
 * confirm against the pvclock ABI.
 */
static __always_inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
        unsigned version = src->version & ~1;
        /* Make sure that the version is read before the data. */
        virt_rmb();
        return version;
}

/*
 * Finish a read of @src that was started with pvclock_read_begin().
 * @version is the value that call returned.  Returns true when the current
 * version field differs from it, in which case the data read in between
 * must be discarded and the read repeated.
 */
static __always_inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
                        unsigned version)
{
        /* Make sure that the version is re-read after the data. */
        virt_rmb();
        return unlikely(version != src->version);
}

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 *
 * @delta is first shifted by @shift (left when @shift is non-negative,
 * right by -@shift otherwise).  The shifted value is then multiplied by
 * @mul_frac and the product is shifted right by 32 bits, i.e. @mul_frac is
 * treated as a 0.32 fixed-point fraction.  The result is the low 64 bits of
 * that quotient.
 */
static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#else
        ulong tmp;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        /*
         * 32-bit variant, built from two 32x32->64 multiplies:
         *   result = hi32(delta) * mul_frac + ((lo32(delta) * mul_frac) >> 32)
         * %4 starts out as hi32(delta) and is reused to hold the high half
         * of the first product; %5 holds mul_frac and is zeroed before the
         * final add-with-carry.
         */
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
        /*
         * mulq leaves the 128-bit product in rdx:rax; shrd then shifts it
         * right by 32 into rax, which is the result.
         */
        __asm__ (
                "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
                : [lo]"=a"(product),
                  [hi]"=d"(tmp)
                : "0"(delta),
                  [mul_frac]"rm"((u64)mul_frac));
#else
#error implement me!
#endif

        return product;
}

/*
 * Convert the TSC reading @tsc into a time value using the parameters in
 * @src: the TSC ticks elapsed since @src->tsc_timestamp are scaled with
 * pvclock_scale_delta() and added to @src->system_time.
 */
static __always_inline
u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
{
        u64 scaled;

        scaled = pvclock_scale_delta(tsc - src->tsc_timestamp,
                                     src->tsc_to_system_mul,
                                     src->tsc_shift);

        return scaled + src->system_time;
}

/*
 * A pvclock_vcpu_time_info wrapped so that each instance is aligned to
 * SMP_CACHE_BYTES.
 */
struct pvclock_vsyscall_time_info {
        struct pvclock_vcpu_time_info pvti;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)

#ifdef CONFIG_PARAVIRT_CLOCK
void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
#else
/* Stub used when CONFIG_PARAVIRT_CLOCK is not set: always returns NULL. */
static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
{
        return NULL;
}
#endif

#endif /* _ASM_X86_PVCLOCK_H */












































    2 









    1 

    1 

    1 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

/*
 * A point in time or an interval, split into whole seconds and nanoseconds.
 * timespec64_valid() treats a value as normalized when @tv_sec is
 * non-negative and @tv_nsec is below NSEC_PER_SEC.
 */
struct timespec64 {
        time64_t        tv_sec;                        /* seconds */
        long                tv_nsec;                /* nanoseconds */
};

/*
 * A pair of timespec64 values for an interval timer.
 * NOTE(review): by analogy with POSIX struct itimerspec, @it_interval is
 * presumably the reload period and @it_value the time until the next
 * expiry - confirm against the users.
 */
struct itimerspec64 {
        struct timespec64 it_interval;
        struct timespec64 it_value;
};

/* Located here for timespec[64]_valid_strict */
#define TIME64_MAX                        ((s64)~((u64)1 << 63))
#define TIME64_MIN                        (-TIME64_MAX - 1)

#define KTIME_MAX                        ((s64)~((u64)1 << 63))
#define KTIME_SEC_MAX                        (KTIME_MAX / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point time setting
 * is limited so a reasonable uptime can be accommodated. Uptime of 30 years
 * should be really sufficient, which means the cutoff is 2232. At that
 * point the cutoff is just a small part of the larger problem.
 */
#define TIME_UPTIME_SEC_MAX                (30LL * 365 * 24 *3600)
#define TIME_SETTOD_SEC_MAX                (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

/*
 * Returns 1 when @a and @b hold the same tv_sec and the same tv_nsec, and 0
 * otherwise.  The fields are compared as stored, without normalization.
 */
static inline int timespec64_equal(const struct timespec64 *a,
                                   const struct timespec64 *b)
{
        if (a->tv_sec != b->tv_sec)
                return 0;

        return a->tv_nsec == b->tv_nsec;
}

/*
 * Three-way comparison of two timespec64 values, seconds first and then
 * nanoseconds:
 *
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 *
 * The nanosecond fields are compared rather than subtracted: tv_nsec is a
 * long, and narrowing the difference of two longs to the int return type
 * could lose or flip the sign when the values are far apart.
 */
static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
{
        if (lhs->tv_sec < rhs->tv_sec)
                return -1;
        if (lhs->tv_sec > rhs->tv_sec)
                return 1;
        if (lhs->tv_nsec < rhs->tv_nsec)
                return -1;
        if (lhs->tv_nsec > rhs->tv_nsec)
                return 1;
        return 0;
}

extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
                                lhs.tv_nsec + rhs.tv_nsec);
        return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
                                lhs.tv_nsec - rhs.tv_nsec);
        return ts_delta;
}

/*
 * Returns true if the timespec64 is normalized, false if it is not.
 *
 * Two conditions must hold: the seconds are not negative (dates before 1970
 * are bogus), and the nanoseconds are below one second.  The unsigned cast
 * makes the second check reject negative tv_nsec values as well, because
 * they turn into very large numbers.
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

/*
 * Like timespec64_valid(), but additionally rejects tv_sec values of
 * KTIME_SEC_MAX or more, which could overflow ktime_t.
 */
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < KTIME_SEC_MAX;
}

/*
 * Like timespec64_valid(), but additionally rejects tv_sec values of
 * TIME_SETTOD_SEC_MAX or more, which cause overflow issues vs.
 * CLOCK_REALTIME.
 */
static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < TIME_SETTOD_SEC_MAX;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:                pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
        /* Prevent multiplication overflow */
        if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;

        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:        the nanoseconds value to be converted
 *
 * Returns the timespec64 representation of the nsec parameter.
 */
extern struct timespec64 ns_to_timespec64(const s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:                pointer to timespec64 to be incremented
 * @ns:                unsigned nanoseconds value to be added
 *
 * The sum of @a->tv_nsec and @ns is split into whole seconds, which are
 * added to @a->tv_sec, and a remainder, which becomes the new @a->tv_nsec.
 *
 * This must always be inlined because it's used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
        u64 rem;

        a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &rem);
        a->tv_nsec = rem;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */










































































































































































































































































    1 









    1 
    1 









    1 


    1 


    1 










    1 




    1 





    1 







    1 




    1 








    1 







    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <uapi/linux/memfd.h>

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

/*
 * Walk every entry reachable through @xas and set MEMFD_TAG_PINNED on each
 * page whose reference count, after subtracting its mappings, differs from
 * the number of references expected from the page cache alone.
 *
 * The walk runs under xas_lock_irq().  The lock is dropped for a
 * cond_resched() each time XA_CHECK_SCHED pages' worth of entries have been
 * processed.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
        struct page *page;
        int latency = 0;
        int cache_count;

        lru_add_drain();

        xas_lock_irq(xas);
        xas_for_each(xas, page, ULONG_MAX) {
                /*
                 * Expected page-cache references: 1, or HPAGE_PMD_NR for a
                 * transparent huge page that is not a hugetlb page.
                 */
                cache_count = 1;
                if (!xa_is_value(page) &&
                    PageTransHuge(page) && !PageHuge(page))
                        cache_count = HPAGE_PMD_NR;

                /* Value entries are not pages and are never tagged. */
                if (!xa_is_value(page) &&
                    page_count(page) - total_mapcount(page) != cache_count)
                        xas_set_mark(xas, MEMFD_TAG_PINNED);
                /* Move the cursor past the rest of the huge page. */
                if (cache_count != 1)
                        xas_set(xas, page->index + cache_count);

                latency += cache_count;
                if (latency < XA_CHECK_SCHED)
                        continue;
                latency = 0;

                /* Drop the lock briefly so that other work can run. */
                xas_pause(xas);
                xas_unlock_irq(xas);
                cond_resched();
                xas_lock_irq(xas);
        }
        xas_unlock_irq(xas);
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those pages to avoid races.
 *
 * Pages are tagged once by memfd_tag_pins() and then re-checked in up to
 * LAST_SCAN + 1 passes.  Returns 0 if no tagged page is still pinned, or
 * -EBUSY if a page was still found pinned during the last pass.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        struct page *page;
        int error, scan;

        memfd_tag_pins(&xas);

        error = 0;
        for (scan = 0; scan <= LAST_SCAN; scan++) {
                int latency = 0;
                int cache_count;

                /* Nothing is tagged any more: we are done. */
                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
                        break;

                /*
                 * The first pass drains the LRU caches on all CPUs.  Later
                 * passes sleep first, with a delay that doubles on each
                 * pass.  A non-zero return from the sleep jumps straight to
                 * the last pass.
                 * NOTE(review): non-zero presumably means the sleep was cut
                 * short by a fatal signal - confirm.
                 */
                if (!scan)
                        lru_add_drain_all();
                else if (schedule_timeout_killable((HZ << scan) / 200))
                        scan = LAST_SCAN;

                xas_set(&xas, 0);
                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
                        bool clear = true;

                        /* Same expected-reference rule as memfd_tag_pins(). */
                        cache_count = 1;
                        if (!xa_is_value(page) &&
                            PageTransHuge(page) && !PageHuge(page))
                                cache_count = HPAGE_PMD_NR;

                        if (!xa_is_value(page) && cache_count !=
                            page_count(page) - total_mapcount(page)) {
                                /*
                                 * On the last scan, we clean up all those tags
                                 * we inserted; but make a note that we still
                                 * found pages pinned.
                                 */
                                if (scan == LAST_SCAN)
                                        error = -EBUSY;
                                else
                                        clear = false;
                        }
                        if (clear)
                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

                        latency += cache_count;
                        if (latency < XA_CHECK_SCHED)
                                continue;
                        latency = 0;

                        /* Drop the lock briefly so that other work can run. */
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }

        return error;
}

/*
 * Locate the seals word of the inode backing @file.
 *
 * Returns a pointer to the inode's seals field when @file is a shmem file
 * or, if CONFIG_HUGETLBFS is enabled, a hugetlbfs file; returns NULL for
 * every other kind of file.
 */
unsigned int *memfd_file_seals_ptr(struct file *file)
{
        unsigned int *seals = NULL;

        if (shmem_file(file))
                seals = &SHMEM_I(file_inode(file))->seals;
#ifdef CONFIG_HUGETLBFS
        else if (is_file_hugepages(file))
                seals = &HUGETLBFS_I(file_inode(file))->seals;
#endif

        return seals;
}

/*
 * Mask of every seal bit memfd_add_seals() accepts; a request carrying any
 * bit outside this mask is rejected there with -EINVAL.
 */
#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)

/*
 * memfd_add_seals - add @seals to the seal set of the inode behind @file
 *
 * SEALING
 * Sealing lets several parties share a tmpfs or hugetlbfs file while
 * restricting everyone to a specific subset of file operations. Seals can
 * only be added, never removed, so mutually untrusted parties can share
 * common memory regions under a well-defined policy and a malicious peer
 * can never perform unwanted operations on the shared object.
 *
 * Seals are only supported on special tmpfs or hugetlbfs files and always
 * affect the whole underlying inode. Once a seal is set, it may prevent
 * some kinds of access to the file. The seals accepted here (F_ALL_SEALS):
 *   SEAL_SEAL: Prevent further seals from being set on this file
 *   SEAL_SHRINK: Prevent the file from shrinking
 *   SEAL_GROW: Prevent the file from growing
 *   SEAL_WRITE: Prevent write access to the file
 *   SEAL_FUTURE_WRITE: accepted as well; see fcntl(2) for its semantics.
 *                      Unlike SEAL_WRITE, adding it does not go through
 *                      the deny-writable / wait-for-pins step below.
 *
 * As no trust relationship between the parties is required, seals must
 * never be removable. Sealing therefore only ORs the requested seals into
 * the existing set and never touches seals that are already present. The
 * "setting seals" operation can be sealed itself, which prevents any
 * further seal from being added.
 *
 * Semantics of sealing are only defined on volatile files. Only anonymous
 * tmpfs and hugetlbfs files support sealing. More importantly, seals are
 * never written to disk, so there is no plan to support other file types.
 *
 * Returns 0 on success, -EPERM if @file is not open for writing or the
 * inode already carries F_SEAL_SEAL, -EINVAL if @seals has bits outside
 * F_ALL_SEALS or the file has no seals word, or the error reported by
 * mapping_deny_writable() / memfd_wait_for_pins().
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
        struct inode *inode = file_inode(file);
        unsigned int *cur_seals;
        int ret;

        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;

        /* Everything below runs under the inode lock. */
        inode_lock(inode);

        ret = -EINVAL;
        cur_seals = memfd_file_seals_ptr(file);
        if (!cur_seals)
                goto out_unlock;

        ret = -EPERM;
        if (*cur_seals & F_SEAL_SEAL)
                goto out_unlock;

        /* Only the first addition of F_SEAL_WRITE takes this path. */
        if ((seals & F_SEAL_WRITE) && !(*cur_seals & F_SEAL_WRITE)) {
                ret = mapping_deny_writable(file->f_mapping);
                if (ret)
                        goto out_unlock;

                ret = memfd_wait_for_pins(file->f_mapping);
                if (ret) {
                        /* Pairs with the mapping_deny_writable() above. */
                        mapping_allow_writable(file->f_mapping);
                        goto out_unlock;
                }
        }

        *cur_seals |= seals;
        ret = 0;

out_unlock:
        inode_unlock(inode);
        return ret;
}

/*
 * Report the seals currently set on @file, or -EINVAL when the file has
 * no seals word (memfd_file_seals_ptr() returned NULL).
 */
static int memfd_get_seals(struct file *file)
{
        unsigned int *cur_seals = memfd_file_seals_ptr(file);

        if (!cur_seals)
                return -EINVAL;

        return *cur_seals;
}

/*
 * Dispatch the sealing fcntl commands for @file.
 *
 * F_ADD_SEALS adds the seals given in @arg (which must fit in 32 bits),
 * F_GET_SEALS returns the current seals; any other @cmd yields -EINVAL.
 * The result of the selected helper is returned unchanged.
 */
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case F_ADD_SEALS:
                /* disallow upper 32bit */
                if (arg > UINT_MAX)
                        return -EINVAL;
                return memfd_add_seals(file, arg);
        case F_GET_SEALS:
                return memfd_get_seals(file);
        default:
                return -EINVAL;
        }
}

/*
 * memfd_create() stores each file name as MFD_NAME_PREFIX followed by the
 * string supplied by userspace. MFD_NAME_PREFIX_LEN is the prefix length
 * without its terminating NUL, and MFD_NAME_MAX_LEN is the longest
 * user-supplied name (excluding the NUL) for which prefix + name still
 * fits in NAME_MAX.
 */
#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/*
 * Flags memfd_create() accepts without MFD_HUGETLB; with MFD_HUGETLB the
 * huge page size encoding bits are allowed in addition.
 */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)

/*
 * memfd_create - create a shmem or hugetlbfs backed file and return a new
 * file descriptor referring to it.
 *
 * @uname: user-space name; it is stored with the "memfd:" prefix prepended
 *         and may be at most MFD_NAME_MAX_LEN bytes long (excluding NUL).
 * @flags: MFD_* flags; the huge page size encoding bits are accepted only
 *         together with MFD_HUGETLB.
 *
 * Returns the new descriptor on success. Fails with -EINVAL for unknown
 * flags or an over-long name, -EFAULT if the name cannot be read or lost
 * its terminator while being copied, -ENOMEM if the name buffer cannot be
 * allocated, or the error from get_unused_fd_flags() / the file setup
 * helper.
 */
SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
{
        unsigned int allowed = MFD_ALL_FLAGS;
        struct file *file;
        char *name;
        long len;
        int fd, error;

        /* Allow huge page size encoding in flags, but only for hugetlb. */
        if (flags & MFD_HUGETLB)
                allowed |= MFD_HUGE_MASK << MFD_HUGE_SHIFT;
        if (flags & ~allowed)
                return -EINVAL;

        /* length includes terminating zero */
        len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
        if (len <= 0)
                return -EFAULT;
        if (len > MFD_NAME_MAX_LEN + 1)
                return -EINVAL;

        name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
        if (!name)
                return -ENOMEM;

        /* Build "memfd:<uname>" in the freshly allocated buffer. */
        strcpy(name, MFD_NAME_PREFIX);

        error = -EFAULT;
        if (copy_from_user(name + MFD_NAME_PREFIX_LEN, uname, len))
                goto err_name;

        /* terminating-zero may have changed after strnlen_user() returned */
        if (name[len + MFD_NAME_PREFIX_LEN - 1])
                goto err_name;

        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
        if (fd < 0) {
                error = fd;
                goto err_name;
        }

        if (flags & MFD_HUGETLB) {
                struct user_struct *user = NULL;

                file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MFD_HUGE_SHIFT) &
                                        MFD_HUGE_MASK);
        } else {
                file = shmem_file_setup(name, 0, VM_NORESERVE);
        }
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_fd;
        }
        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        file->f_flags |= O_LARGEFILE;

        /*
         * Sealing is opt-in: clearing F_SEAL_SEAL is what lets seals be
         * added later (memfd_add_seals() refuses while that bit is set).
         */
        if (flags & MFD_ALLOW_SEALING) {
                unsigned int *file_seals = memfd_file_seals_ptr(file);

                if (file_seals)
                        *file_seals &= ~F_SEAL_SEAL;
        }

        fd_install(fd, file);
        kfree(name);
        return fd;

err_fd:
        put_unused_fd(fd);
err_name:
        kfree(name);
        return error;
}














































































































































































































































































































































































































































































    4 












    3 
















    4 






    4 


























    4 

























































































































































































































































































































































































































    4 
    4 

    4 


    4 


    4 



    1 
    1 






    1 



    1 




    4 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the policy database.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support for the policy capability bitmap
 *
 * Updated: Mellanox Technologies
 *
 *        Added Infiniband support
 *
 * Copyright (C) 2016 Mellanox Techonologies
 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/audit.h>
#include "security.h"

#include "policydb.h"
#include "conditional.h"
#include "mls.h"
#include "services.h"

/*
 * Note the leading underscore: this defines _DEBUG_HASHES, not
 * DEBUG_HASHES, so the #ifdef block below stays compiled out unless
 * DEBUG_HASHES is defined some other way.
 * NOTE(review): presumably dropping the underscore is the intended way to
 * switch the hash debugging on - confirm.
 */
#define _DEBUG_HASHES

#ifdef DEBUG_HASHES
/*
 * One human-readable label per symbol table (SYM_NUM entries).
 * NOTE(review): presumably indexed by the SYM_* table number - confirm
 * against the users of this array.
 */
static const char *symtab_name[SYM_NUM] = {
        "common prefixes",
        "classes",
        "roles",
        "types",
        "users",
        "bools",
        "levels",
        "categories",
};
#endif

/*
 * One row of the policydb_compat[] table: the sym_num / ocon_num values
 * that go with a given policy version.
 * NOTE(review): sym_num and ocon_num look like the number of symbol tables
 * and object-context kinds present in that version - confirm against the
 * code that consumes policydb_lookup_compat().
 */
struct policydb_compat_info {
        int version;    /* POLICYDB_VERSION_* value this row applies to */
        int sym_num;    /* given relative to SYM_NUM in the table */
        int ocon_num;   /* given relative to OCON_NUM in the table */
};

/*
 * Per-version compatibility table searched by policydb_lookup_compat().
 *
 * The sym_num/ocon_num values are written as offsets from SYM_NUM and
 * OCON_NUM, so these entries need to be updated if SYM_NUM or OCON_NUM
 * changes.
 */
static struct policydb_compat_info policydb_compat[] = {
        {
                .version        = POLICYDB_VERSION_BASE,
                .sym_num        = SYM_NUM - 3,
                .ocon_num        = OCON_NUM - 3,
        },
        {
                .version        = POLICYDB_VERSION_BOOL,
                .sym_num        = SYM_NUM - 2,
                .ocon_num        = OCON_NUM - 3,
        },
        {
                .version        = POLICYDB_VERSION_IPV6,
                .sym_num        = SYM_NUM - 2,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_NLCLASS,
                .sym_num        = SYM_NUM - 2,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_MLS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_AVTAB,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_RANGETRANS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_POLCAP,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_PERMISSIVE,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_BOUNDARY,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_FILENAME_TRANS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_ROLETRANS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_NEW_OBJECT_DEFAULTS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_DEFAULT_TYPE,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_CONSTRAINT_NAMES,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_XPERMS_IOCTL,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM - 2,
        },
        {
                .version        = POLICYDB_VERSION_INFINIBAND,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM,
        },
        {
                .version        = POLICYDB_VERSION_GLBLUB,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM,
        },
        {
                .version        = POLICYDB_VERSION_COMP_FTRANS,
                .sym_num        = SYM_NUM,
                .ocon_num        = OCON_NUM,
        },
};

static struct policydb_compat_info *policydb_lookup_compat(int version)
{
        int i;
        struct policydb_compat_info *info = NULL;

        for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) {
                if (policydb_compat[i].version == version) {
                        info = &policydb_compat[i];
                        break;
                }
        }
        return info;
}

/*
 * The following *_destroy functions are used to
 * free any memory allocated for each kind of
 * symbol data in the policy database.
 */

/*
 * hashtab_map() callback: release one permission entry, i.e. both
 * its datum and its key.  @p is unused.  Always returns 0.
 */
static int perm_destroy(void *key, void *datum, void *p)
{
        kfree(datum);
        kfree(key);

        return 0;
}

/*
 * Release one common entry: its key, every permission stored in its
 * permission table, the table itself and finally the datum.
 * A NULL @datum is tolerated.  @p is unused.  Always returns 0.
 */
static int common_destroy(void *key, void *datum, void *p)
{
        struct common_datum *common = datum;

        kfree(key);

        if (common) {
                hashtab_map(&common->permissions.table, perm_destroy, NULL);
                hashtab_destroy(&common->permissions.table);
        }

        kfree(common);
        return 0;
}

/*
 * Free a single constraint expression node together with its name
 * bitmap and, when present, its type_names set.  NULL is accepted.
 */
static void constraint_expr_destroy(struct constraint_expr *expr)
{
        if (!expr)
                return;

        ebitmap_destroy(&expr->names);

        if (expr->type_names) {
                ebitmap_destroy(&expr->type_names->types);
                ebitmap_destroy(&expr->type_names->negset);
                kfree(expr->type_names);
        }

        kfree(expr);
}

/*
 * Free a chain of constraint nodes, including the expression list
 * hanging off each node.
 */
static void cls_free_constraint_chain(struct constraint_node *node)
{
        while (node) {
                struct constraint_node *next_node;
                struct constraint_expr *expr = node->expr;

                while (expr) {
                        struct constraint_expr *next_expr = expr->next;

                        constraint_expr_destroy(expr);
                        expr = next_expr;
                }

                next_node = node->next;
                kfree(node);
                node = next_node;
        }
}

/*
 * Release one class entry: its key, its permission table, both of its
 * constraint chains (constraints and validatetrans), the name of its
 * common and the datum itself.  A NULL @datum is tolerated.
 * @p is unused.  Always returns 0.
 */
static int cls_destroy(void *key, void *datum, void *p)
{
        struct class_datum *cls = datum;

        kfree(key);

        if (cls) {
                hashtab_map(&cls->permissions.table, perm_destroy, NULL);
                hashtab_destroy(&cls->permissions.table);

                cls_free_constraint_chain(cls->constraints);
                cls_free_constraint_chain(cls->validatetrans);

                kfree(cls->comkey);
        }

        kfree(cls);
        return 0;
}

/*
 * Release one role entry: its key, its "dominates" and "types"
 * bitmaps and the datum.  A NULL @datum is tolerated.
 * @p is unused.  Always returns 0.
 */
static int role_destroy(void *key, void *datum, void *p)
{
        struct role_datum *r = datum;

        kfree(key);

        if (r) {
                ebitmap_destroy(&r->dominates);
                ebitmap_destroy(&r->types);
        }

        kfree(r);
        return 0;
}

/*
 * Release one type entry; it owns nothing beyond its key and datum.
 * @p is unused.  Always returns 0.
 */
static int type_destroy(void *key, void *datum, void *p)
{
        kfree(datum);
        kfree(key);

        return 0;
}

/*
 * Release one user entry: its key, its role bitmap, the category
 * bitmaps of both range levels and of the default level, and the
 * datum.  A NULL @datum is tolerated.  @p is unused.  Always returns 0.
 */
static int user_destroy(void *key, void *datum, void *p)
{
        struct user_datum *user = datum;

        kfree(key);

        if (user) {
                ebitmap_destroy(&user->roles);
                ebitmap_destroy(&user->range.level[0].cat);
                ebitmap_destroy(&user->range.level[1].cat);
                ebitmap_destroy(&user->dfltlevel.cat);
        }

        kfree(user);
        return 0;
}

/*
 * Release one sensitivity entry: its key, the level it points to
 * (including that level's category bitmap) and the datum.
 * A NULL @datum or NULL level is tolerated.  @p is unused.
 * Always returns 0.
 */
static int sens_destroy(void *key, void *datum, void *p)
{
        struct level_datum *lev = datum;

        kfree(key);

        if (lev) {
                if (lev->level)
                        ebitmap_destroy(&lev->level->cat);
                /* kfree() accepts NULL, so no guard is needed here. */
                kfree(lev->level);
        }

        kfree(lev);
        return 0;
}

/*
 * Release one category entry; it owns nothing beyond its key and
 * datum.  @p is unused.  Always returns 0.
 */
static int cat_destroy(void *key, void *datum, void *p)
{
        kfree(datum);
        kfree(key);

        return 0;
}

/*
 * Per-symbol-table entry destructors.  policydb_destroy() applies
 * destroy_f[i] to every entry of p->symtab[i], so the position of each
 * callback in this table selects the symbol table it is used for.
 */
static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) =
{
        common_destroy,
        cls_destroy,
        role_destroy,
        type_destroy,
        user_destroy,
        cond_destroy_bool,
        sens_destroy,
        cat_destroy,
};

/*
 * Release one filename transition entry: the name held by the key,
 * the key itself, and every datum on the chain starting at @datum
 * (each with its stypes bitmap).  @datum is dereferenced
 * unconditionally, so it must not be NULL.  Offers to reschedule once
 * the entry is gone.  @p is unused.  Always returns 0.
 */
static int filenametr_destroy(void *key, void *datum, void *p)
{
        struct filename_trans_key *ft = key;
        struct filename_trans_datum *cur = datum;

        kfree(ft->name);
        kfree(key);

        do {
                struct filename_trans_datum *following = cur->next;

                ebitmap_destroy(&cur->stypes);
                kfree(cur);
                cur = following;
        } while (unlikely(cur));

        cond_resched();
        return 0;
}

/*
 * Release one range transition entry: its key, the category bitmaps
 * of both levels of the MLS range stored as datum, and the datum.
 * @datum is dereferenced unconditionally.  Offers to reschedule
 * afterwards.  @p is unused.  Always returns 0.
 */
static int range_tr_destroy(void *key, void *datum, void *p)
{
        struct mls_range *range = datum;

        kfree(key);

        ebitmap_destroy(&range->level[0].cat);
        ebitmap_destroy(&range->level[1].cat);
        kfree(range);

        cond_resched();
        return 0;
}

/*
 * Release one role transition entry; it owns nothing beyond its key
 * and datum.  @p is unused.  Always returns 0.
 */
static int role_tr_destroy(void *key, void *datum, void *p)
{
        kfree(datum);
        kfree(key);

        return 0;
}

/*
 * Free one object context of kind @i: both embedded security
 * contexts, the name string for the kinds that carry one (ISID, FS,
 * NETIF, FSUSE), and the structure itself.  NULL is accepted.
 */
static void ocontext_destroy(struct ocontext *c, int i)
{
        int has_name;

        if (!c)
                return;

        has_name = (i == OCON_ISID) || (i == OCON_FS) ||
                   (i == OCON_NETIF) || (i == OCON_FSUSE);

        context_destroy(&c->context[0]);
        context_destroy(&c->context[1]);

        if (has_name)
                kfree(c->u.name);

        kfree(c);
}

/*
 * Initialize the role table by inserting the OBJECT_R role.
 *
 * The new role takes the next primary value of p->p_roles; that value
 * must equal OBJECT_R_VAL, otherwise -EINVAL is returned.  Returns 0
 * on success, -ENOMEM on allocation failure, or the error reported by
 * symtab_insert().  On failure the key and datum are freed here (the
 * nprim increment is not undone).
 */
static int roles_init(struct policydb *p)
{
        struct role_datum *role;
        char *key;
        int rc;

        role = kzalloc(sizeof(*role), GFP_KERNEL);
        if (!role)
                return -ENOMEM;

        role->value = ++p->p_roles.nprim;
        if (role->value != OBJECT_R_VAL) {
                rc = -EINVAL;
                goto err_free_role;
        }

        key = kstrdup(OBJECT_R, GFP_KERNEL);
        if (!key) {
                rc = -ENOMEM;
                goto err_free_role;
        }

        rc = symtab_insert(&p->p_roles, key, role);
        if (rc)
                goto err_free_key;

        return 0;

err_free_key:
        kfree(key);
err_free_role:
        kfree(role);
        return rc;
}

/*
 * Hash a filename transition key: seed with ttype XOR tclass, then
 * fold in every byte of the NUL-terminated name through
 * partial_name_hash().
 */
static u32 filenametr_hash(const void *k)
{
        const struct filename_trans_key *ft = k;
        unsigned long hash = ft->ttype ^ ft->tclass;
        unsigned int pos;

        for (pos = 0; ft->name[pos]; pos++)
                hash = partial_name_hash((unsigned char)ft->name[pos], hash);

        return hash;
}

/*
 * Compare two filename transition keys by ttype, then tclass, then
 * name (strcmp).  Returns 0 when all three match, otherwise the first
 * non-zero difference.
 */
static int filenametr_cmp(const void *k1, const void *k2)
{
        const struct filename_trans_key *a = k1;
        const struct filename_trans_key *b = k2;
        int diff;

        diff = a->ttype - b->ttype;
        if (!diff)
                diff = a->tclass - b->tclass;
        if (!diff)
                diff = strcmp(a->name, b->name);

        return diff;
}

/*
 * Hash and compare callbacks passed to hashtab_search() for lookups
 * in the filename_trans table.
 */
static const struct hashtab_key_params filenametr_key_params = {
        .hash = filenametr_hash,
        .cmp = filenametr_cmp,
};

/*
 * Look up @key in the policy's filename_trans table.
 * Returns whatever hashtab_search() yields for that key.
 */
struct filename_trans_datum *policydb_filenametr_search(
        struct policydb *p, struct filename_trans_key *key)
{
        struct filename_trans_datum *match;

        match = hashtab_search(&p->filename_trans, key,
                               filenametr_key_params);
        return match;
}

/*
 * Hash a range transition key:
 * source_type + (target_type << 3) + (target_class << 5).
 */
static u32 rangetr_hash(const void *k)
{
        const struct range_trans *rt = k;
        u32 hash;

        hash = rt->source_type;
        hash += rt->target_type << 3;
        hash += rt->target_class << 5;

        return hash;
}

/*
 * Compare two range transition keys by source_type, then
 * target_type, then target_class.  Returns 0 when all three match,
 * otherwise the first non-zero difference.
 */
static int rangetr_cmp(const void *k1, const void *k2)
{
        const struct range_trans *a = k1;
        const struct range_trans *b = k2;
        int diff;

        diff = a->source_type - b->source_type;
        if (!diff)
                diff = a->target_type - b->target_type;
        if (!diff)
                diff = a->target_class - b->target_class;

        return diff;
}

/*
 * Hash and compare callbacks passed to hashtab_search() for lookups
 * in the range_tr table.
 */
static const struct hashtab_key_params rangetr_key_params = {
        .hash = rangetr_hash,
        .cmp = rangetr_cmp,
};

/*
 * Look up @key in the policy's range_tr table.
 * Returns whatever hashtab_search() yields for that key.
 */
struct mls_range *policydb_rangetr_search(struct policydb *p,
                                          struct range_trans *key)
{
        struct mls_range *match;

        match = hashtab_search(&p->range_tr, key, rangetr_key_params);
        return match;
}

/*
 * Hash a role transition key:
 * role + (type << 3) + (tclass << 5).
 */
static u32 role_trans_hash(const void *k)
{
        const struct role_trans_key *rtk = k;
        u32 hash;

        hash = rtk->role;
        hash += rtk->type << 3;
        hash += rtk->tclass << 5;

        return hash;
}

/*
 * Compare two role transition keys by role, then type, then tclass.
 * Returns 0 when all three match, otherwise the first non-zero
 * difference.
 */
static int role_trans_cmp(const void *k1, const void *k2)
{
        const struct role_trans_key *a = k1;
        const struct role_trans_key *b = k2;
        int diff;

        diff = a->role - b->role;
        if (!diff)
                diff = a->type - b->type;
        if (!diff)
                diff = a->tclass - b->tclass;

        return diff;
}

/*
 * Hash and compare callbacks passed to hashtab_search() for lookups
 * in the role_tr table.
 */
static const struct hashtab_key_params roletr_key_params = {
        .hash = role_trans_hash,
        .cmp = role_trans_cmp,
};

/*
 * Look up @key in the policy's role_tr table.
 * Returns whatever hashtab_search() yields for that key.
 */
struct role_trans_datum *policydb_roletr_search(struct policydb *p,
                                                struct role_trans_key *key)
{
        struct role_trans_datum *match;

        match = hashtab_search(&p->role_tr, key, roletr_key_params);
        return match;
}

/*
 * Initialize a policy database structure: zero it, then run the
 * initializers for the TE access vector table, the conditional
 * policy state and the three policy-wide bitmaps.
 */
static void policydb_init(struct policydb *p)
{
        memset(p, 0, sizeof *p);

        avtab_init(&p->te_avtab);
        cond_policydb_init(p);

        ebitmap_init(&p->permissive_map);
        ebitmap_init(&p->policycaps);
        ebitmap_init(&p->filename_trans_ttypes);
}

/*
 * The following *_index functions are used to
 * define the val_to_name and val_to_struct arrays
 * in a policy database structure.  The val_to_name
 * arrays are used when converting security context
 * structures into string representations.  The
 * val_to_struct arrays are used when the attributes
 * of a class, role, or user are needed.
 */

/*
 * hashtab_map() callback: record the name of one common in
 * sym_val_to_name[SYM_COMMONS], indexed by value - 1.
 * @datap is the policydb being indexed.  Returns -EINVAL if the value
 * is zero or exceeds p_commons.nprim, 0 otherwise.
 */
static int common_index(void *key, void *datum, void *datap)
{
        struct common_datum *common = datum;
        struct policydb *pol = datap;

        if (common->value == 0 || common->value > pol->p_commons.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_COMMONS][common->value - 1] = key;
        return 0;
}

/*
 * hashtab_map() callback: record one class in
 * sym_val_to_name[SYM_CLASSES] and class_val_to_struct, both indexed
 * by value - 1.  @datap is the policydb being indexed.  Returns
 * -EINVAL if the value is zero or exceeds p_classes.nprim, 0 otherwise.
 */
static int class_index(void *key, void *datum, void *datap)
{
        struct class_datum *cls = datum;
        struct policydb *pol = datap;

        if (cls->value == 0 || cls->value > pol->p_classes.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_CLASSES][cls->value - 1] = key;
        pol->class_val_to_struct[cls->value - 1] = cls;
        return 0;
}

/*
 * hashtab_map() callback: record one role in
 * sym_val_to_name[SYM_ROLES] and role_val_to_struct, both indexed by
 * value - 1.  @datap is the policydb being indexed.  Returns -EINVAL
 * if the value is zero, or if the value or the bounds exceed
 * p_roles.nprim; 0 otherwise.
 */
static int role_index(void *key, void *datum, void *datap)
{
        struct role_datum *r = datum;
        struct policydb *pol = datap;

        if (r->value == 0 ||
            r->value > pol->p_roles.nprim ||
            r->bounds > pol->p_roles.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_ROLES][r->value - 1] = key;
        pol->role_val_to_struct[r->value - 1] = r;
        return 0;
}

/*
 * hashtab_map() callback: record one primary type in
 * sym_val_to_name[SYM_TYPES] and type_val_to_struct, both indexed by
 * value - 1.  Entries whose "primary" flag is clear are skipped.
 * @datap is the policydb being indexed.  Returns -EINVAL if a primary
 * type has a zero value, or a value or bounds above p_types.nprim;
 * 0 otherwise.
 */
static int type_index(void *key, void *datum, void *datap)
{
        struct type_datum *typ = datum;
        struct policydb *pol = datap;

        if (!typ->primary)
                return 0;

        if (typ->value == 0 ||
            typ->value > pol->p_types.nprim ||
            typ->bounds > pol->p_types.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_TYPES][typ->value - 1] = key;
        pol->type_val_to_struct[typ->value - 1] = typ;
        return 0;
}

/*
 * hashtab_map() callback: record one user in
 * sym_val_to_name[SYM_USERS] and user_val_to_struct, both indexed by
 * value - 1.  @datap is the policydb being indexed.  Returns -EINVAL
 * if the value is zero, or if the value or the bounds exceed
 * p_users.nprim; 0 otherwise.
 */
static int user_index(void *key, void *datum, void *datap)
{
        struct user_datum *user = datum;
        struct policydb *pol = datap;

        if (user->value == 0 ||
            user->value > pol->p_users.nprim ||
            user->bounds > pol->p_users.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_USERS][user->value - 1] = key;
        pol->user_val_to_struct[user->value - 1] = user;
        return 0;
}

/*
 * hashtab_map() callback: record the name of one sensitivity in
 * sym_val_to_name[SYM_LEVELS], indexed by level->sens - 1.  Aliases
 * are skipped.  @datap is the policydb being indexed.  Returns -EINVAL
 * if a non-alias has a zero sensitivity or one above p_levels.nprim;
 * 0 otherwise.
 */
static int sens_index(void *key, void *datum, void *datap)
{
        struct level_datum *lev = datum;
        struct policydb *pol = datap;

        if (lev->isalias)
                return 0;

        if (lev->level->sens == 0 ||
            lev->level->sens > pol->p_levels.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_LEVELS][lev->level->sens - 1] = key;
        return 0;
}

/*
 * hashtab_map() callback: record the name of one category in
 * sym_val_to_name[SYM_CATS], indexed by value - 1.  Aliases are
 * skipped.  @datap is the policydb being indexed.  Returns -EINVAL if
 * a non-alias has a zero value or one above p_cats.nprim; 0 otherwise.
 */
static int cat_index(void *key, void *datum, void *datap)
{
        struct cat_datum *cat = datum;
        struct policydb *pol = datap;

        if (cat->isalias)
                return 0;

        if (cat->value == 0 || cat->value > pol->p_cats.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_CATS][cat->value - 1] = key;
        return 0;
}

/*
 * Per-symbol-table indexing callbacks.  policydb_index() applies
 * index_f[i] to every entry of p->symtab[i], so the position of each
 * callback in this table selects the symbol table it is used for.
 */
static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) =
{
        common_index,
        class_index,
        role_index,
        type_index,
        user_index,
        cond_index_bool,
        sens_index,
        cat_index,
};

#ifdef DEBUG_HASHES
/*
 * Log occupancy statistics for hash table @h under the label
 * @hash_name: entry count, used/total buckets and longest chain, as
 * reported by hashtab_stat().
 */
static void hash_eval(struct hashtab *h, const char *hash_name)
{
        struct hashtab_info info;

        hashtab_stat(h, &info);
        pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, longest chain length %d\n",
                 hash_name, h->nel, info.slots_used, h->size,
                 info.max_chain_len);
}

/*
 * Log hash statistics for each of the SYM_NUM symbol tables in the
 * array @s, labelling each with the matching symtab_name[] entry.
 */
static void symtab_hash_eval(struct symtab *s)
{
        int i;

        for (i = 0; i < SYM_NUM; i++)
                hash_eval(&s[i].table, symtab_name[i]);
}

#else
/*
 * No-op stand-in used when DEBUG_HASHES is not defined.  The
 * signature matches the debug variant (const label), so callers can
 * pass string literals or const strings in either configuration.
 */
static inline void hash_eval(struct hashtab *h, const char *hash_name)
{
}
#endif

/*
 * Define the other val_to_name and val_to_struct arrays
 * in a policy database structure.
 *
 * Caller must clean up on failure.
 */
static int policydb_index(struct policydb *p)
{
        int i, rc;

        if (p->mls_enabled)
                pr_debug("SELinux:  %d users, %d roles, %d types, %d bools, %d sens, %d cats\n",
                         p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
                         p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim);
        else
                pr_debug("SELinux:  %d users, %d roles, %d types, %d bools\n",
                         p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
                         p->p_bools.nprim);

        pr_debug("SELinux:  %d classes, %d rules\n",
                 p->p_classes.nprim, p->te_avtab.nel);

#ifdef DEBUG_HASHES
        avtab_hash_eval(&p->te_avtab, "rules");
        symtab_hash_eval(p->symtab);
#endif

        p->class_val_to_struct = kcalloc(p->p_classes.nprim,
                                         sizeof(*p->class_val_to_struct),
                                         GFP_KERNEL);
        if (!p->class_val_to_struct)
                return -ENOMEM;

        p->role_val_to_struct = kcalloc(p->p_roles.nprim,
                                        sizeof(*p->role_val_to_struct),
                                        GFP_KERNEL);
        if (!p->role_val_to_struct)
                return -ENOMEM;

        p->user_val_to_struct = kcalloc(p->p_users.nprim,
                                        sizeof(*p->user_val_to_struct),
                                        GFP_KERNEL);
        if (!p->user_val_to_struct)
                return -ENOMEM;

        p->type_val_to_struct = kvcalloc(p->p_types.nprim,
                                         sizeof(*p->type_val_to_struct),
                                         GFP_KERNEL);
        if (!p->type_val_to_struct)
                return -ENOMEM;

        rc = cond_init_bool_indexes(p);
        if (rc)
                goto out;

        for (i = 0; i < SYM_NUM; i++) {
                p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim,
                                                 sizeof(char *),
                                                 GFP_KERNEL);
                if (!p->sym_val_to_name[i])
                        return -ENOMEM;

                rc = hashtab_map(&p->symtab[i].table, index_f[i], p);
                if (rc)
                        goto out;
        }
        rc = 0;
out:
        return rc;
}

/*
 * Free any memory allocated by a policy database structure.
 *
 * Releases, in order: the symbol tables and their entries, the
 * val_to_name / val_to_struct arrays, the TE avtab, every ocontext
 * list, the genfs list, the conditional policy state, the role
 * transition table, the role_allow list, the filename and range
 * transition tables, the type attribute map and the policy-wide
 * bitmaps.  cond_resched() is called between the larger steps.
 */
void policydb_destroy(struct policydb *p)
{
        struct ocontext *oc, *oc_next;
        struct genfs *fs, *fs_next;
        struct role_allow *allow, *allow_next;
        int i;

        /* Symbol tables: destroy every entry, then the table itself. */
        for (i = 0; i < SYM_NUM; i++) {
                cond_resched();
                hashtab_map(&p->symtab[i].table, destroy_f[i], NULL);
                hashtab_destroy(&p->symtab[i].table);
        }

        for (i = 0; i < SYM_NUM; i++)
                kvfree(p->sym_val_to_name[i]);

        kfree(p->class_val_to_struct);
        kfree(p->role_val_to_struct);
        kfree(p->user_val_to_struct);
        kvfree(p->type_val_to_struct);

        avtab_destroy(&p->te_avtab);

        /* Object contexts, one singly linked list per kind. */
        for (i = 0; i < OCON_NUM; i++) {
                cond_resched();
                for (oc = p->ocontexts[i]; oc; oc = oc_next) {
                        oc_next = oc->next;
                        ocontext_destroy(oc, i);
                }
                p->ocontexts[i] = NULL;
        }

        /*
         * genfs entries: each owns its fstype string and a list of
         * ocontexts, destroyed as OCON_FSUSE so their name is freed.
         */
        for (fs = p->genfs; fs; fs = fs_next) {
                cond_resched();
                kfree(fs->fstype);
                for (oc = fs->head; oc; oc = oc_next) {
                        oc_next = oc->next;
                        ocontext_destroy(oc, OCON_FSUSE);
                }
                fs_next = fs->next;
                kfree(fs);
        }
        p->genfs = NULL;

        cond_policydb_destroy(p);

        hashtab_map(&p->role_tr, role_tr_destroy, NULL);
        hashtab_destroy(&p->role_tr);

        for (allow = p->role_allow; allow; allow = allow_next) {
                cond_resched();
                allow_next = allow->next;
                kfree(allow);
        }

        hashtab_map(&p->filename_trans, filenametr_destroy, NULL);
        hashtab_destroy(&p->filename_trans);

        hashtab_map(&p->range_tr, range_tr_destroy, NULL);
        hashtab_destroy(&p->range_tr);

        if (p->type_attr_map_array) {
                for (i = 0; i < p->p_types.nprim; i++)
                        ebitmap_destroy(&p->type_attr_map_array[i]);
                kvfree(p->type_attr_map_array);
        }

        ebitmap_destroy(&p->filename_trans_ttypes);
        ebitmap_destroy(&p->policycaps);
        ebitmap_destroy(&p->permissive_map);
}

/*
 * Load the initial SIDs specified in a policy database
 * structure into a SID table.
 *
 * Initializes @s, then installs the first context of every OCON_ISID
 * entry whose SID has a name known to this kernel.  An entry for
 * SECSID_NULL is rejected with -EINVAL.  On any failure after the
 * table was initialized it is destroyed again before returning the
 * error.  Returns 0 on success.
 */
int policydb_load_isids(struct policydb *p, struct sidtab *s)
{
        struct ocontext *c;
        int rc;

        rc = sidtab_init(s);
        if (rc) {
                pr_err("SELinux:  out of memory on SID table init\n");
                return rc;
        }

        for (c = p->ocontexts[OCON_ISID]; c; c = c->next) {
                u32 sid = c->sid[0];
                const char *name = security_get_initial_sid_context(sid);

                if (sid == SECSID_NULL) {
                        pr_err("SELinux:  SID 0 was assigned a context.\n");
                        rc = -EINVAL;
                        goto err_destroy;
                }

                /* Ignore initial SIDs unused by this kernel. */
                if (!name)
                        continue;

                rc = sidtab_set_initial(s, sid, &c->context[0]);
                if (rc) {
                        pr_err("SELinux:  unable to load initial SID %s.\n",
                               name);
                        goto err_destroy;
                }
        }

        return 0;

err_destroy:
        sidtab_destroy(s);
        return rc;
}

/* Return 1 if @class lies in 1..p_classes.nprim, 0 otherwise. */
int policydb_class_isvalid(struct policydb *p, unsigned int class)
{
        return class && class <= p->p_classes.nprim;
}

/* Return 1 if @role lies in 1..p_roles.nprim, 0 otherwise. */
int policydb_role_isvalid(struct policydb *p, unsigned int role)
{
        return role && role <= p->p_roles.nprim;
}

/* Return 1 if @type lies in 1..p_types.nprim, 0 otherwise. */
int policydb_type_isvalid(struct policydb *p, unsigned int type)
{
        return type && type <= p->p_types.nprim;
}

/*
 * Return 1 if the fields in the security context
 * structure `c' are valid.  Return 0 otherwise.
 *
 * Role, user and type must each lie in 1..nprim of their table.
 * Unless the role is OBJECT_R_VAL, the role must also be authorized
 * for the type and the user for the role.  Finally the MLS portion
 * must pass mls_context_isvalid().
 */
int policydb_context_isvalid(struct policydb *p, struct context *c)
{
        if (!c->role || c->role > p->p_roles.nprim ||
            !c->user || c->user > p->p_users.nprim ||
            !c->type || c->type > p->p_types.nprim)
                return 0;

        if (c->role != OBJECT_R_VAL) {
                struct role_datum *role;
                struct user_datum *user;

                /* Role must be authorized for the type. */
                role = p->role_val_to_struct[c->role - 1];
                if (!role || !ebitmap_get_bit(&role->types, c->type - 1))
                        return 0;

                /* User must be authorized for the role. */
                user = p->user_val_to_struct[c->user - 1];
                if (!user || !ebitmap_get_bit(&user->roles, c->role - 1))
                        return 0;
        }

        return mls_context_isvalid(p, c) ? 1 : 0;
}

/*
 * Read a MLS range structure from a policydb binary
 * representation file.
 *
 * Layout consumed: a 32-bit item count (at most 2), that many 32-bit
 * sensitivities, then one category bitmap per item.  With a single
 * item the high level duplicates the low level.  Returns 0 on
 * success or a negative errno; when the high level cannot be set up,
 * the low category bitmap already read is destroyed again.
 */
static int mls_read_range_helper(struct mls_range *r, void *fp)
{
        __le32 buf[2];
        u32 items;
        int rc;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;

        items = le32_to_cpu(buf[0]);
        if (items > ARRAY_SIZE(buf)) {
                pr_err("SELinux: mls:  range overflow\n");
                return -EINVAL;
        }

        rc = next_entry(buf, fp, sizeof(u32) * items);
        if (rc) {
                pr_err("SELinux: mls:  truncated range\n");
                return rc;
        }

        r->level[0].sens = le32_to_cpu(buf[0]);
        r->level[1].sens = (items > 1) ? le32_to_cpu(buf[1])
                                       : r->level[0].sens;

        rc = ebitmap_read(&r->level[0].cat, fp);
        if (rc) {
                pr_err("SELinux: mls:  error reading low categories\n");
                return rc;
        }

        if (items > 1) {
                rc = ebitmap_read(&r->level[1].cat, fp);
                if (rc)
                        pr_err("SELinux: mls:  error reading high categories\n");
        } else {
                rc = ebitmap_cpy(&r->level[1].cat, &r->level[0].cat);
                if (rc)
                        pr_err("SELinux: mls:  out of memory\n");
        }

        /* High level failed: drop the low categories read above. */
        if (rc)
                ebitmap_destroy(&r->level[0].cat);

        return rc;
}

/*
 * Read and validate a security context structure
 * from a policydb binary representation file.
 *
 * Reads user, role and type as three 32-bit values, followed by the
 * MLS range when the policy version is at least POLICYDB_VERSION_MLS.
 * The result is checked with policydb_context_isvalid(); an invalid
 * context is destroyed and -EINVAL returned.  Returns 0 on success or
 * the error from the failing read.
 */
static int context_read_and_validate(struct context *c,
                                     struct policydb *p,
                                     void *fp)
{
        __le32 ids[3];
        int rc;

        rc = next_entry(ids, fp, sizeof(ids));
        if (rc) {
                pr_err("SELinux: context truncated\n");
                return rc;
        }

        c->user = le32_to_cpu(ids[0]);
        c->role = le32_to_cpu(ids[1]);
        c->type = le32_to_cpu(ids[2]);

        if (p->policyvers >= POLICYDB_VERSION_MLS) {
                rc = mls_read_range_helper(&c->range, fp);
                if (rc) {
                        pr_err("SELinux: error reading MLS range of context\n");
                        return rc;
                }
        }

        if (policydb_context_isvalid(p, c))
                return 0;

        pr_err("SELinux:  invalid security context\n");
        context_destroy(c);
        return -EINVAL;
}

/*
 * The following *_read functions are used to
 * read the symbol data from a policy database
 * binary representation file.
 */

/*
 * Read a string of @len bytes from @fp into a freshly allocated,
 * NUL-terminated buffer and store it in *@strp.
 *
 * A length of 0 or (u32)-1 is rejected with -EINVAL (the latter would
 * wrap the len + 1 allocation size).  Returns -ENOMEM if the
 * allocation fails, or the next_entry() error if the read fails; in
 * both cases *@strp is left untouched.  Returns 0 on success, after
 * which the caller owns the buffer.
 */
static int str_read(char **strp, gfp_t flags, void *fp, u32 len)
{
        char *text;
        int rc;

        if (!len || len == (u32)-1)
                return -EINVAL;

        text = kmalloc(len + 1, flags | __GFP_NOWARN);
        if (!text)
                return -ENOMEM;

        rc = next_entry(text, fp, len);
        if (!rc) {
                text[len] = '\0';
                *strp = text;
                return 0;
        }

        kfree(text);
        return rc;
}

/*
 * Read one permission (name length, value, name) from @fp and insert
 * it into symbol table @s.  Returns 0 on success or a negative errno;
 * on failure the partially built key and datum are freed.
 */
static int perm_read(struct policydb *p, struct symtab *s, void *fp)
{
        struct perm_datum *perm;
        char *name = NULL;
        __le32 hdr[2];
        u32 name_len;
        int rc;

        perm = kzalloc(sizeof(*perm), GFP_KERNEL);
        if (!perm)
                return -ENOMEM;

        rc = next_entry(hdr, fp, sizeof(hdr));
        if (rc)
                goto err;

        name_len = le32_to_cpu(hdr[0]);
        perm->value = le32_to_cpu(hdr[1]);

        rc = str_read(&name, GFP_KERNEL, fp, name_len);
        if (rc)
                goto err;

        rc = symtab_insert(s, name, perm);
        if (!rc)
                return 0;
err:
        perm_destroy(name, perm, NULL);
        return rc;
}

/*
 * Read one common from @fp and insert it into symbol table @s.
 *
 * Header layout: name length, value, permission nprim, permission
 * element count; followed by the name and that many permission
 * records.  Returns 0 on success or a negative errno; on failure
 * everything built so far is released through common_destroy().
 */
static int common_read(struct policydb *p, struct symtab *s, void *fp)
{
        struct common_datum *common;
        char *name = NULL;
        __le32 hdr[4];
        u32 name_len, nperms;
        int idx, rc;

        common = kzalloc(sizeof(*common), GFP_KERNEL);
        if (!common)
                return -ENOMEM;

        rc = next_entry(hdr, fp, sizeof(hdr));
        if (rc)
                goto err;

        name_len = le32_to_cpu(hdr[0]);
        common->value = le32_to_cpu(hdr[1]);
        nperms = le32_to_cpu(hdr[3]);

        rc = symtab_init(&common->permissions, nperms);
        if (rc)
                goto err;
        common->permissions.nprim = le32_to_cpu(hdr[2]);

        rc = str_read(&name, GFP_KERNEL, fp, name_len);
        if (rc)
                goto err;

        for (idx = 0; idx < nperms; idx++) {
                rc = perm_read(p, &common->permissions, fp);
                if (rc)
                        goto err;
        }

        rc = symtab_insert(s, name, common);
        if (!rc)
                return 0;
err:
        common_destroy(name, common, NULL);
        return rc;
}

/* Initialize both bitmaps (types and negset) of a type set. */
static void type_set_init(struct type_set *t)
{
        ebitmap_init(&t->negset);
        ebitmap_init(&t->types);
}

/*
 * Read a type set from @fp: the types bitmap, the negset bitmap and
 * a 32-bit flags word, in that order.  Any read failure is reported
 * as -EINVAL.  Returns 0 on success.
 */
static int type_set_read(struct type_set *t, void *fp)
{
        __le32 word[1];

        if (ebitmap_read(&t->types, fp) || ebitmap_read(&t->negset, fp))
                return -EINVAL;

        if (next_entry(word, fp, sizeof(u32)) < 0)
                return -EINVAL;

        t->flags = le32_to_cpu(word[0]);
        return 0;
}


/*
 * Read @ncons constraint nodes from @fp and append them, in order, to
 * the list whose head pointer is *@nodep.
 *
 * Each node is stored as a permissions word and an expression count,
 * followed by that many expression records (type, attribute, operator).
 * CEXPR_NAMES records additionally carry a names bitmap and, for policy
 * versions >= POLICYDB_VERSION_CONSTRAINT_NAMES, a type set.
 *
 * While reading, "depth" tracks the operand depth of the expression
 * sequence so malformed expressions are rejected: it starts at -1,
 * ATTR/NAMES raise it (bounded by CEXPR_MAXDEPTH - 1), AND/OR need at
 * least depth 1 and lower it, NOT needs depth >= 0, and each
 * constraint must finish with depth exactly 0.
 *
 * @allowxtarget: when zero, NAMES expressions with CEXPR_XTARGET set
 * in their attribute are rejected.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL on a
 * malformed expression, or the error of a failed read.  Nothing is
 * freed here on failure; every node and expression is linked into the
 * list before further reads, so the partial list stays reachable from
 * *@nodep.  NOTE(review): presumably the caller's error path frees it
 * -- confirm against the callers.
 */
static int read_cons_helper(struct policydb *p,
                                struct constraint_node **nodep,
                                int ncons, int allowxtarget, void *fp)
{
        struct constraint_node *c, *lc;
        struct constraint_expr *e, *le;
        __le32 buf[3];
        u32 nexpr;
        int rc, i, j, depth;

        lc = NULL;
        for (i = 0; i < ncons; i++) {
                c = kzalloc(sizeof(*c), GFP_KERNEL);
                if (!c)
                        return -ENOMEM;

                /* Link the node in right away so it is never orphaned. */
                if (lc)
                        lc->next = c;
                else
                        *nodep = c;

                rc = next_entry(buf, fp, (sizeof(u32) * 2));
                if (rc)
                        return rc;
                c->permissions = le32_to_cpu(buf[0]);
                nexpr = le32_to_cpu(buf[1]);
                le = NULL;
                depth = -1;
                for (j = 0; j < nexpr; j++) {
                        e = kzalloc(sizeof(*e), GFP_KERNEL);
                        if (!e)
                                return -ENOMEM;

                        /* Same as above: link before reading further. */
                        if (le)
                                le->next = e;
                        else
                                c->expr = e;

                        rc = next_entry(buf, fp, (sizeof(u32) * 3));
                        if (rc)
                                return rc;
                        e->expr_type = le32_to_cpu(buf[0]);
                        e->attr = le32_to_cpu(buf[1]);
                        e->op = le32_to_cpu(buf[2]);

                        switch (e->expr_type) {
                        case CEXPR_NOT:
                                /* Needs one operand; depth is unchanged. */
                                if (depth < 0)
                                        return -EINVAL;
                                break;
                        case CEXPR_AND:
                        case CEXPR_OR:
                                /* Needs two operands and leaves one result. */
                                if (depth < 1)
                                        return -EINVAL;
                                depth--;
                                break;
                        case CEXPR_ATTR:
                                /* Adds one operand, bounded by CEXPR_MAXDEPTH. */
                                if (depth == (CEXPR_MAXDEPTH - 1))
                                        return -EINVAL;
                                depth++;
                                break;
                        case CEXPR_NAMES:
                                if (!allowxtarget && (e->attr & CEXPR_XTARGET))
                                        return -EINVAL;
                                if (depth == (CEXPR_MAXDEPTH - 1))
                                        return -EINVAL;
                                depth++;
                                rc = ebitmap_read(&e->names, fp);
                                if (rc)
                                        return rc;
                                if (p->policyvers >=
                                    POLICYDB_VERSION_CONSTRAINT_NAMES) {
                                        e->type_names = kzalloc(sizeof
                                                (*e->type_names), GFP_KERNEL);
                                        if (!e->type_names)
                                                return -ENOMEM;
                                        type_set_init(e->type_names);
                                        rc = type_set_read(e->type_names, fp);
                                        if (rc)
                                                return rc;
                                }
                                break;
                        default:
                                return -EINVAL;
                        }
                        le = e;
                }
                /* A complete expression leaves exactly one result. */
                if (depth != 0)
                        return -EINVAL;
                lc = c;
        }

        return 0;
}

/*
 * Read one class entry from the binary policy image @fp and add it to
 * symbol table @s, keyed by the class name.
 *
 * The entry starts with six little-endian 32-bit words:
 *   buf[0]  length of the class name
 *   buf[1]  length of the common name (0 means the class has no common)
 *   buf[2]  class value
 *   buf[3]  nprim of the class permission table
 *   buf[4]  number of permission entries that follow
 *   buf[5]  number of constraints that follow the permissions
 * followed by the name string(s), the permissions, the constraints and,
 * depending on @p->policyvers, the validatetrans rules and the default_*
 * fields.
 *
 * Returns 0 on success or a negative errno.  On every failure path the
 * key and the partially filled datum are handed to cls_destroy().
 */
static int class_read(struct policydb *p, struct symtab *s, void *fp)
{
        char *key = NULL;
        struct class_datum *cladatum;
        __le32 buf[6];
        /* i is unsigned so that it is compared against nel without a
         * signed/unsigned conversion. */
        u32 i, len, len2, ncons, nel;
        int rc;

        cladatum = kzalloc(sizeof(*cladatum), GFP_KERNEL);
        if (!cladatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof(u32)*6);
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        len2 = le32_to_cpu(buf[1]);
        cladatum->value = le32_to_cpu(buf[2]);
        nel = le32_to_cpu(buf[4]);

        /* The permission table is sized from the entry count ... */
        rc = symtab_init(&cladatum->permissions, nel);
        if (rc)
                goto bad;
        /* ... while nprim is stored separately in the image. */
        cladatum->permissions.nprim = le32_to_cpu(buf[3]);

        ncons = le32_to_cpu(buf[5]);

        rc = str_read(&key, GFP_KERNEL, fp, len);
        if (rc)
                goto bad;

        if (len2) {
                rc = str_read(&cladatum->comkey, GFP_KERNEL, fp, len2);
                if (rc)
                        goto bad;

                /* The named common must already be present in p_commons. */
                rc = -EINVAL;
                cladatum->comdatum = symtab_search(&p->p_commons,
                                                   cladatum->comkey);
                if (!cladatum->comdatum) {
                        pr_err("SELinux:  unknown common %s\n",
                               cladatum->comkey);
                        goto bad;
                }
        }
        for (i = 0; i < nel; i++) {
                rc = perm_read(p, &cladatum->permissions, fp);
                if (rc)
                        goto bad;
        }

        rc = read_cons_helper(p, &cladatum->constraints, ncons, 0, fp);
        if (rc)
                goto bad;

        if (p->policyvers >= POLICYDB_VERSION_VALIDATETRANS) {
                /* grab the validatetrans rules */
                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto bad;
                ncons = le32_to_cpu(buf[0]);
                rc = read_cons_helper(p, &cladatum->validatetrans,
                                ncons, 1, fp);
                if (rc)
                        goto bad;
        }

        if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) {
                rc = next_entry(buf, fp, sizeof(u32) * 3);
                if (rc)
                        goto bad;

                cladatum->default_user = le32_to_cpu(buf[0]);
                cladatum->default_role = le32_to_cpu(buf[1]);
                cladatum->default_range = le32_to_cpu(buf[2]);
        }

        if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) {
                rc = next_entry(buf, fp, sizeof(u32) * 1);
                if (rc)
                        goto bad;
                cladatum->default_type = le32_to_cpu(buf[0]);
        }

        rc = symtab_insert(s, key, cladatum);
        if (rc)
                goto bad;

        return 0;
bad:
        cls_destroy(key, cladatum, NULL);
        return rc;
}

/*
 * Parse a single role record from @fp and insert it into @s.
 *
 * Layout: name length, role value and - for policies at or above
 * POLICYDB_VERSION_BOUNDARY - the bounds value, followed by the name
 * string and two ebitmaps (dominates, types).
 *
 * A record whose name equals OBJECT_R is only checked against
 * OBJECT_R_VAL: it is not inserted into @s, its key and datum are handed
 * to role_destroy(), and the result is 0 when the value matches or
 * -EINVAL when it does not.
 *
 * Returns 0 on success or a negative errno.
 */
static int role_read(struct policydb *p, struct symtab *s, void *fp)
{
        const int with_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct role_datum *role;
        char *key = NULL;
        __le32 buf[3];
        u32 len;
        int rc;

        role = kzalloc(sizeof(*role), GFP_KERNEL);
        if (!role)
                return -ENOMEM;

        /* Two header words, plus the bounds word on newer policies. */
        rc = next_entry(buf, fp, sizeof(buf[0]) * (with_bounds ? 3 : 2));
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        role->value = le32_to_cpu(buf[1]);
        if (with_bounds)
                role->bounds = le32_to_cpu(buf[2]);

        rc = str_read(&key, GFP_KERNEL, fp, len);
        if (rc)
                goto bad;

        rc = ebitmap_read(&role->dominates, fp);
        if (!rc)
                rc = ebitmap_read(&role->types, fp);
        if (rc)
                goto bad;

        if (!strcmp(key, OBJECT_R)) {
                if (role->value == OBJECT_R_VAL) {
                        rc = 0;
                } else {
                        pr_err("SELinux: Role %s has wrong value %d\n",
                               OBJECT_R, role->value);
                        rc = -EINVAL;
                }
                /* Either way this record is dropped rather than inserted. */
                goto bad;
        }

        rc = symtab_insert(s, key, role);
        if (!rc)
                return 0;
bad:
        role_destroy(key, role, NULL);
        return rc;
}

/*
 * Parse a single type record from @fp and insert it into @s.
 *
 * Layout: name length, type value, then either
 *  - (policyvers >= POLICYDB_VERSION_BOUNDARY) a property bit mask and
 *    the bounds value, or
 *  - (older policies) the primary flag,
 * followed by the name string.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to type_destroy().
 */
static int type_read(struct policydb *p, struct symtab *s, void *fp)
{
        const int with_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct type_datum *typdatum;
        char *key = NULL;
        __le32 buf[4];
        u32 len;
        int rc;

        typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL);
        if (!typdatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof(buf[0]) * (with_bounds ? 4 : 3));
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        typdatum->value = le32_to_cpu(buf[1]);

        if (!with_bounds) {
                typdatum->primary = le32_to_cpu(buf[2]);
        } else {
                const u32 prop = le32_to_cpu(buf[2]);

                /* The datum was zero-allocated, so unset flags stay 0. */
                if (prop & TYPEDATUM_PROPERTY_PRIMARY)
                        typdatum->primary = 1;
                if (prop & TYPEDATUM_PROPERTY_ATTRIBUTE)
                        typdatum->attribute = 1;

                typdatum->bounds = le32_to_cpu(buf[3]);
        }

        rc = str_read(&key, GFP_KERNEL, fp, len);
        if (!rc)
                rc = symtab_insert(s, key, typdatum);
        if (!rc)
                return 0;
bad:
        type_destroy(key, typdatum, NULL);
        return rc;
}


/*
 * Read one MLS level from a policydb binary representation file:
 * a 32-bit little-endian sensitivity followed by the category ebitmap.
 *
 * @lp is zeroed before anything is read.  Returns 0 on success or the
 * error reported by next_entry()/ebitmap_read(), after logging it.
 */
static int mls_read_level(struct mls_level *lp, void *fp)
{
        __le32 raw_sens;
        int rc;

        memset(lp, 0, sizeof(*lp));

        rc = next_entry(&raw_sens, fp, sizeof(raw_sens));
        if (rc) {
                pr_err("SELinux: mls: truncated level\n");
                return rc;
        }
        lp->sens = le32_to_cpu(raw_sens);

        rc = ebitmap_read(&lp->cat, fp);
        if (rc)
                pr_err("SELinux: mls:  error reading level categories\n");

        return rc;
}

/*
 * Parse a single user record from @fp and insert it into @s.
 *
 * Layout: name length, user value and - for policies at or above
 * POLICYDB_VERSION_BOUNDARY - the bounds value, followed by the name
 * string and the roles ebitmap.  Policies at or above
 * POLICYDB_VERSION_MLS additionally carry an MLS range and a default
 * level.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to user_destroy().
 */
static int user_read(struct policydb *p, struct symtab *s, void *fp)
{
        const int with_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct user_datum *usrdatum;
        char *key = NULL;
        __le32 buf[3];
        u32 len;
        int rc;

        usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL);
        if (!usrdatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof(buf[0]) * (with_bounds ? 3 : 2));
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        usrdatum->value = le32_to_cpu(buf[1]);
        if (with_bounds)
                usrdatum->bounds = le32_to_cpu(buf[2]);

        rc = str_read(&key, GFP_KERNEL, fp, len);
        if (!rc)
                rc = ebitmap_read(&usrdatum->roles, fp);
        if (rc)
                goto bad;

        if (p->policyvers >= POLICYDB_VERSION_MLS) {
                rc = mls_read_range_helper(&usrdatum->range, fp);
                if (!rc)
                        rc = mls_read_level(&usrdatum->dfltlevel, fp);
                if (rc)
                        goto bad;
        }

        rc = symtab_insert(s, key, usrdatum);
        if (!rc)
                return 0;
bad:
        user_destroy(key, usrdatum, NULL);
        return rc;
}

static int sens_read(struct policydb *p, struct symtab *s, void *fp)
{
        char *key = NULL;
        struct level_datum *levdatum;
        int rc;
        __le32 buf[2];
        u32 len;

        levdatum = kzalloc(sizeof(*levdatum), GFP_ATOMIC);
        if (!levdatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof buf);
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        levdatum->isalias = le32_to_cpu(buf[1]);

        rc = str_read(&key, GFP_ATOMIC, fp, len);
        if (rc)
                goto bad;

        rc = -ENOMEM;
        levdatum->level = kmalloc(sizeof(*levdatum->level), GFP_ATOMIC);
        if (!levdatum->level)
                goto bad;

        rc = mls_read_level(levdatum->level, fp);
        if (rc)
                goto bad;

        rc = symtab_insert(s, key, levdatum);
        if (rc)
                goto bad;
        return 0;
bad:
        sens_destroy(key, levdatum, NULL);
        return rc;
}

static int cat_read(struct policydb *p, struct symtab *s, void *fp)
{
        char *key = NULL;
        struct cat_datum *catdatum;
        int rc;
        __le32 buf[3];
        u32 len;

        catdatum = kzalloc(sizeof(*catdatum), GFP_ATOMIC);
        if (!catdatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof buf);
        if (rc)
                goto bad;

        len = le32_to_cpu(buf[0]);
        catdatum->value = le32_to_cpu(buf[1]);
        catdatum->isalias = le32_to_cpu(buf[2]);

        rc = str_read(&key, GFP_ATOMIC, fp, len);
        if (rc)
                goto bad;

        rc = symtab_insert(s, key, catdatum);
        if (rc)
                goto bad;
        return 0;
bad:
        cat_destroy(key, catdatum, NULL);
        return rc;
}

/*
 * Per-symbol-table record readers, indexed by symbol table number.
 * The table is only ever read, so it is declared const.
 */
static int (*const read_f[SYM_NUM]) (struct policydb *p, struct symtab *s, void *fp) =
{
        common_read,
        class_read,
        role_read,
        type_read,
        user_read,
        cond_read_bool,
        sens_read,
        cat_read,
};

/*
 * hashtab_map() callback: validate the bounds chain of one user.
 *
 * @key is the user name, @datum its struct user_datum and @datap the
 * policydb.  Starting at the user, the ->bounds links are followed
 * upwards; at every ancestor each role set in the user's role bitmap
 * must also be set in the ancestor's role bitmap.
 *
 * Returns 0 if the chain is valid, -EINVAL if it reaches
 * POLICYDB_BOUNDS_MAXDEPTH links (too deep or looped) or if a role is
 * missing from an ancestor.
 */
static int user_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct user_datum *upper, *user;
        struct policydb *p = datap;
        int depth = 0;

        upper = user = datum;
        while (upper->bounds) {
                struct ebitmap_node *node;
                unsigned long bit;

                if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        /* Newline-terminated like the role/type variants. */
                        pr_err("SELinux: user %s: "
                               "too deep or looped boundary\n",
                               (char *) key);
                        return -EINVAL;
                }

                /* bounds holds a 1-based user value */
                upper = p->user_val_to_struct[upper->bounds - 1];
                ebitmap_for_each_positive_bit(&user->roles, node, bit) {
                        if (ebitmap_get_bit(&upper->roles, bit))
                                continue;

                        pr_err("SELinux: boundary violated policy: "
                               "user=%s role=%s bounds=%s\n",
                               sym_name(p, SYM_USERS, user->value - 1),
                               sym_name(p, SYM_ROLES, bit),
                               sym_name(p, SYM_USERS, upper->value - 1));

                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * hashtab_map() callback: validate the bounds chain of one role.
 *
 * @key is the role name, @datum its struct role_datum and @datap the
 * policydb.  Every ancestor reached through ->bounds must have all of
 * the types set in the role's own type bitmap.
 *
 * Returns 0 if the chain is valid, -EINVAL if it reaches
 * POLICYDB_BOUNDS_MAXDEPTH links or if a type is missing from an
 * ancestor.
 */
static int role_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct policydb *p = datap;
        struct role_datum *role = datum;
        struct role_datum *parent = role;
        int depth;

        /* depth counts the link about to be followed, starting at 1 */
        for (depth = 1; parent->bounds; depth++) {
                struct ebitmap_node *node;
                unsigned long bit;

                if (depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        pr_err("SELinux: role %s: "
                               "too deep or looped bounds\n",
                               (char *) key);
                        return -EINVAL;
                }

                /* bounds holds a 1-based role value */
                parent = p->role_val_to_struct[parent->bounds - 1];
                ebitmap_for_each_positive_bit(&role->types, node, bit) {
                        if (!ebitmap_get_bit(&parent->types, bit)) {
                                pr_err("SELinux: boundary violated policy: "
                                       "role=%s type=%s bounds=%s\n",
                                       sym_name(p, SYM_ROLES, role->value - 1),
                                       sym_name(p, SYM_TYPES, bit),
                                       sym_name(p, SYM_ROLES, parent->value - 1));

                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * hashtab_map() callback: validate the bounds chain of one type.
 *
 * @key is the type name, @datum its struct type_datum and @datap the
 * policydb.  The ->bounds links are followed upwards; no ancestor may be
 * an attribute.
 *
 * Returns 0 if the chain is valid, -EINVAL if it reaches
 * POLICYDB_BOUNDS_MAXDEPTH links (too deep or looped) or if an ancestor
 * is an attribute.
 */
static int type_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct type_datum *upper;
        struct policydb *p = datap;
        int depth = 0;

        upper = datum;
        while (upper->bounds) {
                if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        pr_err("SELinux: type %s: "
                               "too deep or looped boundary\n",
                               (char *) key);
                        return -EINVAL;
                }

                /* bounds holds a 1-based type value */
                upper = p->type_val_to_struct[upper->bounds - 1];
                BUG_ON(!upper);

                if (upper->attribute) {
                        /* Newline-terminated like the message above. */
                        pr_err("SELinux: type %s: "
                               "bounded by attribute %s\n",
                               (char *) key,
                               sym_name(p, SYM_TYPES, upper->value - 1));
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Run the bounds sanity checks over every user, role and type of @p,
 * in that order, stopping at the first failure.
 *
 * Policies older than POLICYDB_VERSION_BOUNDARY are accepted without
 * being checked.  Returns 0 or the first non-zero hashtab_map() result.
 */
static int policydb_bounds_sanity_check(struct policydb *p)
{
        int rc;

        if (p->policyvers < POLICYDB_VERSION_BOUNDARY)
                return 0;

        rc = hashtab_map(&p->p_users.table, user_bounds_sanity_check, p);
        if (!rc)
                rc = hashtab_map(&p->p_roles.table,
                                 role_bounds_sanity_check, p);
        if (!rc)
                rc = hashtab_map(&p->p_types.table,
                                 type_bounds_sanity_check, p);

        return rc;
}

/*
 * Look up class @name in @p's class symbol table.
 *
 * Returns the class value, or 0 when no class of that name exists.
 */
u16 string_to_security_class(struct policydb *p, const char *name)
{
        struct class_datum *cls = symtab_search(&p->p_classes, name);

        return cls ? cls->value : 0;
}

/*
 * Translate permission @name of class @tclass into its access vector bit.
 *
 * The permission is searched first in the class's common (if it has one)
 * and then in the class's own permission table.
 *
 * Returns 1 << (permission value - 1), or 0 when @tclass is 0, exceeds
 * the number of primary classes, or the permission is not found.
 */
u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name)
{
        struct class_datum *cls;
        struct perm_datum *perm = NULL;

        if (tclass == 0 || tclass > p->p_classes.nprim)
                return 0;

        /* class values are 1-based */
        cls = p->class_val_to_struct[tclass - 1];

        if (cls->comdatum)
                perm = symtab_search(&cls->comdatum->permissions, name);
        if (!perm)
                perm = symtab_search(&cls->permissions, name);

        return perm ? 1U << (perm->value - 1) : 0;
}

/*
 * Read the range transition table from @fp into @p->range_tr.
 *
 * Nothing is read for policies older than POLICYDB_VERSION_MLS.
 * Otherwise the table is a 32-bit entry count followed by that many
 * entries, each consisting of source type, target type, (for policies
 * at or above POLICYDB_VERSION_RANGETRANS) target class, and an MLS
 * range.  Older policies use @p->process_class as the target class.
 *
 * Returns 0 on success or a negative errno.  Entries already inserted
 * into @p->range_tr are left in place on failure.
 */
static int range_read(struct policydb *p, void *fp)
{
        struct range_trans *rt = NULL;
        struct mls_range *r = NULL;
        __le32 buf[2];
        /* i is unsigned so that it is compared against nel without a
         * signed/unsigned conversion. */
        u32 i, nel;
        int rc;

        if (p->policyvers < POLICYDB_VERSION_MLS)
                return 0;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;

        nel = le32_to_cpu(buf[0]);

        rc = hashtab_init(&p->range_tr, nel);
        if (rc)
                return rc;

        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                rt = kzalloc(sizeof(*rt), GFP_KERNEL);
                if (!rt)
                        goto out;

                rc = next_entry(buf, fp, (sizeof(u32) * 2));
                if (rc)
                        goto out;

                rt->source_type = le32_to_cpu(buf[0]);
                rt->target_type = le32_to_cpu(buf[1]);
                if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;
                        rt->target_class = le32_to_cpu(buf[0]);
                } else
                        rt->target_class = p->process_class;

                /* Reject keys that refer to unknown types or classes. */
                rc = -EINVAL;
                if (!policydb_type_isvalid(p, rt->source_type) ||
                    !policydb_type_isvalid(p, rt->target_type) ||
                    !policydb_class_isvalid(p, rt->target_class))
                        goto out;

                rc = -ENOMEM;
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (!r)
                        goto out;

                rc = mls_read_range_helper(r, fp);
                if (rc)
                        goto out;

                rc = -EINVAL;
                if (!mls_range_isvalid(p, r)) {
                        pr_warn("SELinux:  rangetrans:  invalid range\n");
                        goto out;
                }

                rc = hashtab_insert(&p->range_tr, rt, r, rangetr_key_params);
                if (rc)
                        goto out;

                /* Ownership moved to the hash table; keep "out" from
                 * freeing the inserted key and datum. */
                rt = NULL;
                r = NULL;
        }
        hash_eval(&p->range_tr, "rangetr");
        rc = 0;
out:
        /*
         * NOTE(review): only the containers are freed here.  If
         * mls_read_range_helper() left allocated data inside *r, the
         * invalid-range and insert-failure paths would not release it -
         * confirm against the helper.
         */
        kfree(rt);
        kfree(r);
        return rc;
}

/*
 * Read one filename transition rule in the older on-disk layout (used by
 * filename_trans_read() for policies below POLICYDB_VERSION_COMP_FTRANS)
 * and merge it into @p->filename_trans.
 *
 * Each rule is: name length, name string, then four 32-bit words
 * (source type, target type, target class, output type).  Rules sharing
 * the same (ttype, tclass, name) key are stored as a chain of datums,
 * one per distinct output type, each holding a bitmap of source types.
 *
 * Returns 0 on success (including when a conflicting/duplicate rule is
 * ignored) or a negative errno.
 */
static int filename_trans_read_helper_compat(struct policydb *p, void *fp)
{
        struct filename_trans_key key, *ft = NULL;
        struct filename_trans_datum *last, *datum = NULL;
        char *name = NULL;
        u32 len, stype, otype;
        __le32 buf[4];
        int rc;

        /* length of the path component string */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        len = le32_to_cpu(buf[0]);

        /* path component string */
        rc = str_read(&name, GFP_KERNEL, fp, len);
        if (rc)
                return rc;

        rc = next_entry(buf, fp, sizeof(u32) * 4);
        if (rc)
                goto out;

        stype = le32_to_cpu(buf[0]);
        key.ttype = le32_to_cpu(buf[1]);
        key.tclass = le32_to_cpu(buf[2]);
        key.name = name;

        otype = le32_to_cpu(buf[3]);

        /*
         * Walk the chain already stored under this key, looking for a
         * datum with the same output type.  "last" trails one node behind
         * so a new datum can be appended to the chain.
         */
        last = NULL;
        datum = policydb_filenametr_search(p, &key);
        while (datum) {
                if (unlikely(ebitmap_get_bit(&datum->stypes, stype - 1))) {
                        /* conflicting/duplicate rules are ignored */
                        /*
                         * datum points into the table here; clear it so
                         * the kfree(datum) at "out" is a no-op.  rc is
                         * still 0 from the successful next_entry() above.
                         */
                        datum = NULL;
                        goto out;
                }
                if (likely(datum->otype == otype))
                        break;
                last = datum;
                datum = datum->next;
        }
        if (!datum) {
                /* No datum with this output type yet: create one. */
                rc = -ENOMEM;
                datum = kmalloc(sizeof(*datum), GFP_KERNEL);
                if (!datum)
                        goto out;

                ebitmap_init(&datum->stypes);
                datum->otype = otype;
                datum->next = NULL;

                if (unlikely(last)) {
                        /* Key already present: append to its chain. */
                        last->next = datum;
                } else {
                        /* First rule for this key: insert key + datum. */
                        rc = -ENOMEM;
                        ft = kmemdup(&key, sizeof(key), GFP_KERNEL);
                        if (!ft)
                                goto out;

                        rc = hashtab_insert(&p->filename_trans, ft, datum,
                                            filenametr_key_params);
                        if (rc)
                                goto out;
                        /*
                         * The inserted key copy references the name
                         * string, so it must not be freed below.
                         */
                        name = NULL;

                        rc = ebitmap_set_bit(&p->filename_trans_ttypes,
                                             key.ttype, 1);
                        /*
                         * Plain return: ft and datum were already
                         * inserted into the table, so "out" must not
                         * free them.
                         */
                        if (rc)
                                return rc;
                }
        }
        /* name is NULL here if it was handed over to the inserted key. */
        kfree(name);
        return ebitmap_set_bit(&datum->stypes, stype - 1, 1);

out:
        kfree(ft);
        kfree(name);
        kfree(datum);
        return rc;
}

/*
 * Read one filename transition entry in the newer on-disk layout (used
 * by filename_trans_read() for policies at or above
 * POLICYDB_VERSION_COMP_FTRANS) and insert it into @p->filename_trans.
 *
 * Each entry is: name length, name string, three 32-bit words (target
 * type, target class, datum count), then per datum a source-type ebitmap
 * and a 32-bit output type.  The datums are chained in file order and
 * stored under a single (ttype, tclass, name) key.
 *
 * Returns 0 on success, -ENOENT for an entry with no datum, or another
 * negative errno.  Until the insert succeeds, everything allocated here
 * is released on failure.
 */
static int filename_trans_read_helper(struct policydb *p, void *fp)
{
        struct filename_trans_datum *head = NULL, **tail = &head;
        struct filename_trans_datum *datum, *next;
        struct filename_trans_key *ft = NULL;
        char *name = NULL;
        u32 len, ttype, tclass, ndatum;
        __le32 buf[3];
        int rc;

        /* length of the path component string */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        len = le32_to_cpu(buf[0]);

        /* path component string */
        rc = str_read(&name, GFP_KERNEL, fp, len);
        if (rc)
                return rc;

        rc = next_entry(buf, fp, sizeof(u32) * 3);
        if (rc)
                goto out;

        ttype = le32_to_cpu(buf[0]);
        tclass = le32_to_cpu(buf[1]);
        ndatum = le32_to_cpu(buf[2]);

        if (!ndatum) {
                pr_err("SELinux:  Filename transition key with no datum\n");
                rc = -ENOENT;
                goto out;
        }

        for (; ndatum; ndatum--) {
                datum = kmalloc(sizeof(*datum), GFP_KERNEL);
                if (!datum) {
                        rc = -ENOMEM;
                        goto out;
                }

                /*
                 * Terminate and link the node before reading into it, so
                 * the cleanup loop at "out" always walks a well-formed
                 * list.
                 */
                datum->next = NULL;
                *tail = datum;
                tail = &datum->next;

                /* ebitmap_read() will at least init the bitmap */
                rc = ebitmap_read(&datum->stypes, fp);
                if (rc)
                        goto out;

                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;

                datum->otype = le32_to_cpu(buf[0]);
        }

        ft = kmalloc(sizeof(*ft), GFP_KERNEL);
        if (!ft) {
                rc = -ENOMEM;
                goto out;
        }

        ft->ttype = ttype;
        ft->tclass = tclass;
        ft->name = name;

        rc = hashtab_insert(&p->filename_trans, ft, head,
                            filenametr_key_params);
        if (!rc)
                return ebitmap_set_bit(&p->filename_trans_ttypes, ttype, 1);

        if (rc == -EEXIST)
                pr_err("SELinux:  Duplicate filename transition key\n");

out:
        kfree(ft);
        kfree(name);
        for (datum = head; datum; datum = next) {
                next = datum->next;

                ebitmap_destroy(&datum->stypes);
                kfree(datum);
        }
        return rc;
}

/*
 * Read the filename transition table from @fp into @p->filename_trans.
 *
 * Nothing is read for policies older than POLICYDB_VERSION_FILENAME_TRANS.
 * Otherwise a 32-bit count is read and that many entries are parsed,
 * with the compat helper for policies below POLICYDB_VERSION_COMP_FTRANS
 * and the regular helper otherwise.  In the compat case the count is
 * also recorded in @p->compat_filename_trans_count and the hash table is
 * created with a fixed size hint of 1 << 11 instead of the count.
 *
 * Returns 0 on success or the first error from the helpers.
 */
static int filename_trans_read(struct policydb *p, void *fp)
{
        /* i is unsigned so that it is compared against nel without a
         * signed/unsigned conversion. */
        u32 nel, i;
        __le32 buf[1];
        int rc;

        if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS)
                return 0;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        nel = le32_to_cpu(buf[0]);

        if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) {
                p->compat_filename_trans_count = nel;

                rc = hashtab_init(&p->filename_trans, (1 << 11));
                if (rc)
                        return rc;

                for (i = 0; i < nel; i++) {
                        rc = filename_trans_read_helper_compat(p, fp);
                        if (rc)
                                return rc;
                }
        } else {
                rc = hashtab_init(&p->filename_trans, nel);
                if (rc)
                        return rc;

                for (i = 0; i < nel; i++) {
                        rc = filename_trans_read_helper(p, fp);
                        if (rc)
                                return rc;
                }
        }
        hash_eval(&p->filename_trans, "filenametr");
        return 0;
}

/*
 * Read the genfs section from @fp into the @p->genfs list.
 *
 * The section is a 32-bit count of filesystem types; each one is a
 * length-prefixed fstype string, a 32-bit count of entries, and for each
 * entry a length-prefixed name, a 32-bit class and a context.
 *
 * Returns 0 on success or a negative errno.  Nodes that were already
 * linked into @p->genfs stay linked on failure; only the node currently
 * being built is released here.
 */
static int genfs_read(struct policydb *p, void *fp)
{
        int i, j, rc;
        u32 nel, nel2, len, len2;
        __le32 buf[1];
        struct ocontext *l, *c;
        struct ocontext *newc = NULL;
        struct genfs *genfs_p, *genfs;
        struct genfs *newgenfs = NULL;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        nel = le32_to_cpu(buf[0]);

        for (i = 0; i < nel; i++) {
                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;
                len = le32_to_cpu(buf[0]);

                rc = -ENOMEM;
                newgenfs = kzalloc(sizeof(*newgenfs), GFP_KERNEL);
                if (!newgenfs)
                        goto out;

                rc = str_read(&newgenfs->fstype, GFP_KERNEL, fp, len);
                if (rc)
                        goto out;

                /*
                 * Find the insertion point: the list is kept in ascending
                 * strcmp() order of fstype, and a duplicate fstype is
                 * rejected.  genfs_p trails one node behind genfs.
                 */
                for (genfs_p = NULL, genfs = p->genfs; genfs;
                     genfs_p = genfs, genfs = genfs->next) {
                        rc = -EINVAL;
                        if (strcmp(newgenfs->fstype, genfs->fstype) == 0) {
                                pr_err("SELinux:  dup genfs fstype %s\n",
                                       newgenfs->fstype);
                                goto out;
                        }
                        if (strcmp(newgenfs->fstype, genfs->fstype) < 0)
                                break;
                }
                newgenfs->next = genfs;
                if (genfs_p)
                        genfs_p->next = newgenfs;
                else
                        p->genfs = newgenfs;
                /*
                 * The node is now owned by the list; clear newgenfs so
                 * the "out" path does not free it, and keep working on it
                 * through genfs.
                 */
                genfs = newgenfs;
                newgenfs = NULL;

                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;

                nel2 = le32_to_cpu(buf[0]);
                for (j = 0; j < nel2; j++) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;
                        len = le32_to_cpu(buf[0]);

                        rc = -ENOMEM;
                        newc = kzalloc(sizeof(*newc), GFP_KERNEL);
                        if (!newc)
                                goto out;

                        rc = str_read(&newc->u.name, GFP_KERNEL, fp, len);
                        if (rc)
                                goto out;

                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;

                        newc->v.sclass = le32_to_cpu(buf[0]);
                        rc = context_read_and_validate(&newc->context[0], p, fp);
                        if (rc)
                                goto out;

                        /*
                         * Find the insertion point within this fstype.
                         * An entry is a duplicate if the name matches and
                         * either class is 0 or the classes are equal.
                         * Otherwise the new entry goes in front of the
                         * first existing entry with a shorter name.
                         */
                        for (l = NULL, c = genfs->head; c;
                             l = c, c = c->next) {
                                rc = -EINVAL;
                                if (!strcmp(newc->u.name, c->u.name) &&
                                    (!c->v.sclass || !newc->v.sclass ||
                                     newc->v.sclass == c->v.sclass)) {
                                        pr_err("SELinux:  dup genfs entry (%s,%s)\n",
                                               genfs->fstype, c->u.name);
                                        goto out;
                                }
                                len = strlen(newc->u.name);
                                len2 = strlen(c->u.name);
                                if (len > len2)
                                        break;
                        }

                        newc->next = c;
                        if (l)
                                l->next = newc;
                        else
                                genfs->head = newc;
                        /* Linked in; must not be destroyed at "out". */
                        newc = NULL;
                }
        }
        rc = 0;
out:
        /* Release only what has not been linked into p->genfs yet. */
        if (newgenfs) {
                kfree(newgenfs->fstype);
                kfree(newgenfs);
        }
        /*
         * NOTE(review): the pending entry is destroyed as an OCON_FSUSE
         * ocontext - presumably because that kind also owns a name string
         * and context[0]; confirm against ocontext_destroy().
         */
        ocontext_destroy(newc, OCON_FSUSE);

        return rc;
}

/*
 * Read the object context tables from @fp into @p->ocontexts[].
 *
 * @info->ocon_num tables are read, in index order.  Each table is a
 * 32-bit entry count followed by that many entries whose layout depends
 * on the table index (see the cases below).  Entries are appended to
 * p->ocontexts[i] in file order.
 *
 * Returns 0 on success or a negative errno.  Each entry is linked into
 * its list before it is filled in, and nothing is unlinked or freed here
 * on failure.
 */
static int ocontext_read(struct policydb *p, struct policydb_compat_info *info,
                         void *fp)
{
        int i, j, rc;
        u32 nel, len;
        __be64 prefixbuf[1];
        __le32 buf[3];
        struct ocontext *l, *c;
        u32 nodebuf[8];

        for (i = 0; i < info->ocon_num; i++) {
                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;
                nel = le32_to_cpu(buf[0]);

                /* l tracks the tail of the list being built for table i */
                l = NULL;
                for (j = 0; j < nel; j++) {
                        rc = -ENOMEM;
                        c = kzalloc(sizeof(*c), GFP_KERNEL);
                        if (!c)
                                goto out;
                        if (l)
                                l->next = c;
                        else
                                p->ocontexts[i] = c;
                        l = c;

                        switch (i) {
                        case OCON_ISID:
                                /* sid, context */
                                rc = next_entry(buf, fp, sizeof(u32));
                                if (rc)
                                        goto out;

                                c->sid[0] = le32_to_cpu(buf[0]);
                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_FS:
                        case OCON_NETIF:
                                /* name length, name, two contexts */
                                rc = next_entry(buf, fp, sizeof(u32));
                                if (rc)
                                        goto out;
                                len = le32_to_cpu(buf[0]);

                                rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                rc = context_read_and_validate(&c->context[1], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_PORT:
                                /* protocol, low port, high port, context */
                                rc = next_entry(buf, fp, sizeof(u32)*3);
                                if (rc)
                                        goto out;
                                c->u.port.protocol = le32_to_cpu(buf[0]);
                                c->u.port.low_port = le32_to_cpu(buf[1]);
                                c->u.port.high_port = le32_to_cpu(buf[2]);
                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_NODE:
                                /*
                                 * address, mask, context; the two words
                                 * are stored as read, with no byte swap
                                 */
                                rc = next_entry(nodebuf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;
                                c->u.node.addr = nodebuf[0]; /* network order */
                                c->u.node.mask = nodebuf[1]; /* network order */
                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_FSUSE:
                                /* behavior, name length, name, context */
                                rc = next_entry(buf, fp, sizeof(u32)*2);
                                if (rc)
                                        goto out;

                                rc = -EINVAL;
                                c->v.behavior = le32_to_cpu(buf[0]);
                                /* Determined at runtime, not in policy DB. */
                                if (c->v.behavior == SECURITY_FS_USE_MNTPOINT)
                                        goto out;
                                if (c->v.behavior > SECURITY_FS_USE_MAX)
                                        goto out;

                                len = le32_to_cpu(buf[1]);
                                rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_NODE6: {
                                int k;

                                /*
                                 * four address words, four mask words,
                                 * context; words are stored as read, with
                                 * no byte swap
                                 */
                                rc = next_entry(nodebuf, fp, sizeof(u32) * 8);
                                if (rc)
                                        goto out;
                                for (k = 0; k < 4; k++)
                                        c->u.node6.addr[k] = nodebuf[k];
                                for (k = 0; k < 4; k++)
                                        c->u.node6.mask[k] = nodebuf[k+4];
                                rc = context_read_and_validate(&c->context[0], p, fp);
                                if (rc)
                                        goto out;
                                break;
                        }
                        case OCON_IBPKEY: {
                                u32 pkey_lo, pkey_hi;

                                /*
                                 * 64-bit big-endian subnet prefix, low
                                 * pkey, high pkey, context
                                 */
                                rc = next_entry(prefixbuf, fp, sizeof(u64));
                                if (rc)
                                        goto out;

                                /* we need to have subnet_prefix in CPU order */
                                c->u.ibpkey.subnet_prefix = be64_to_cpu(prefixbuf[0]);

                                rc = next_entry(buf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;

                                pkey_lo = le32_to_cpu(buf[0]);
                                pkey_hi = le32_to_cpu(buf[1]);

                                /* both pkeys must fit in 16 bits */
                                if (pkey_lo > U16_MAX || pkey_hi > U16_MAX) {
                                        rc = -EINVAL;
                                        goto out;
                                }

                                c->u.ibpkey.low_pkey  = pkey_lo;
                                c->u.ibpkey.high_pkey = pkey_hi;

                                rc = context_read_and_validate(&c->context[0],
                                                               p,
                                                               fp);
                                if (rc)
                                        goto out;
                                break;
                        }
                        case OCON_IBENDPORT: {
                                u32 port;

                                /*
                                 * device name length, port number, device
                                 * name, context
                                 */
                                rc = next_entry(buf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;
                                len = le32_to_cpu(buf[0]);

                                rc = str_read(&c->u.ibendport.dev_name, GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                /* accepted port numbers are 1..U8_MAX */
                                port = le32_to_cpu(buf[1]);
                                if (port > U8_MAX || port == 0) {
                                        rc = -EINVAL;
                                        goto out;
                                }

                                c->u.ibendport.port = port;

                                rc = context_read_and_validate(&c->context[0],
                                                               p,
                                                               fp);
                                if (rc)
                                        goto out;
                                break;
                        } /* end case */
                        } /* end switch */
                }
        }
        rc = 0;
out:
        return rc;
}

/*
 * Read the configuration data from a policy database binary
 * representation file into a policy database structure.
 *
 * @p is initialised with policydb_init() and then filled from @fp in this
 * order: magic number and identifier string, version and configuration
 * flags, version-dependent policy capability and permissive bitmaps, the
 * symbol tables, the access vector table, the version-dependent
 * conditional list, role transitions, role allow rules, filename
 * transitions, object contexts, genfs contexts, range transitions and the
 * per-type attribute map.
 *
 * Returns 0 on success.  On any failure a negative errno value is returned
 * and the partially built database is released with policydb_destroy().
 */
int policydb_read(struct policydb *p, void *fp)
{
        struct role_allow *ra, *lra;
        struct role_trans_key *rtk = NULL;
        struct role_trans_datum *rtd = NULL;
        int i, j, rc;
        __le32 buf[4];
        u32 len, nprim, nel, perm;

        char *policydb_str;
        struct policydb_compat_info *info;

        policydb_init(p);

        /* Read the magic number and string length. */
        rc = next_entry(buf, fp, sizeof(u32) * 2);
        if (rc)
                goto bad;

        rc = -EINVAL;
        if (le32_to_cpu(buf[0]) != POLICYDB_MAGIC) {
                pr_err("SELinux:  policydb magic number 0x%x does "
                       "not match expected magic number 0x%x\n",
                       le32_to_cpu(buf[0]), POLICYDB_MAGIC);
                goto bad;
        }

        rc = -EINVAL;
        len = le32_to_cpu(buf[1]);
        if (len != strlen(POLICYDB_STRING)) {
                pr_err("SELinux:  policydb string length %d does not "
                       "match expected length %zu\n",
                       len, strlen(POLICYDB_STRING));
                goto bad;
        }

        rc = -ENOMEM;
        policydb_str = kmalloc(len + 1, GFP_KERNEL);
        if (!policydb_str) {
                pr_err("SELinux:  unable to allocate memory for policydb "
                       "string of length %d\n", len);
                goto bad;
        }

        rc = next_entry(policydb_str, fp, len);
        if (rc) {
                pr_err("SELinux:  truncated policydb string identifier\n");
                kfree(policydb_str);
                goto bad;
        }

        rc = -EINVAL;
        policydb_str[len] = '\0';
        if (strcmp(policydb_str, POLICYDB_STRING)) {
                pr_err("SELinux:  policydb string %s does not match "
                       "my string %s\n", policydb_str, POLICYDB_STRING);
                kfree(policydb_str);
                goto bad;
        }
        /* Done with policydb_str. */
        kfree(policydb_str);
        policydb_str = NULL;

        /* Read the version and table sizes. */
        rc = next_entry(buf, fp, sizeof(u32)*4);
        if (rc)
                goto bad;

        rc = -EINVAL;
        p->policyvers = le32_to_cpu(buf[0]);
        if (p->policyvers < POLICYDB_VERSION_MIN ||
            p->policyvers > POLICYDB_VERSION_MAX) {
                pr_err("SELinux:  policydb version %d does not match "
                       "my version range %d-%d\n",
                       le32_to_cpu(buf[0]), POLICYDB_VERSION_MIN, POLICYDB_VERSION_MAX);
                goto bad;
        }

        /* buf[1] carries the configuration flag bits. */
        if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_MLS)) {
                p->mls_enabled = 1;

                rc = -EINVAL;
                if (p->policyvers < POLICYDB_VERSION_MLS) {
                        pr_err("SELinux: security policydb version %d "
                                "(MLS) not backwards compatible\n",
                                p->policyvers);
                        goto bad;
                }
        }
        p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
        p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);

        if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
                rc = ebitmap_read(&p->policycaps, fp);
                if (rc)
                        goto bad;
        }

        if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
                rc = ebitmap_read(&p->permissive_map, fp);
                if (rc)
                        goto bad;
        }

        rc = -EINVAL;
        info = policydb_lookup_compat(p->policyvers);
        if (!info) {
                pr_err("SELinux:  unable to find policy compat info "
                       "for version %d\n", p->policyvers);
                goto bad;
        }

        /* buf[2]/buf[3] still hold the table counts read with the version. */
        rc = -EINVAL;
        if (le32_to_cpu(buf[2]) != info->sym_num ||
                le32_to_cpu(buf[3]) != info->ocon_num) {
                pr_err("SELinux:  policydb table sizes (%d,%d) do "
                       "not match mine (%d,%d)\n", le32_to_cpu(buf[2]),
                        le32_to_cpu(buf[3]),
                       info->sym_num, info->ocon_num);
                goto bad;
        }

        /* Read each symbol table: primary count, element count, elements. */
        for (i = 0; i < info->sym_num; i++) {
                rc = next_entry(buf, fp, sizeof(u32)*2);
                if (rc)
                        goto bad;
                nprim = le32_to_cpu(buf[0]);
                nel = le32_to_cpu(buf[1]);

                /*
                 * Fail through "bad" rather than "out": bitmaps and
                 * earlier symbol tables may already be populated and
                 * must be released by policydb_destroy().
                 */
                rc = symtab_init(&p->symtab[i], nel);
                if (rc)
                        goto bad;

                if (i == SYM_ROLES) {
                        rc = roles_init(p);
                        if (rc)
                                goto bad;
                }

                for (j = 0; j < nel; j++) {
                        rc = read_f[i](p, &p->symtab[i], fp);
                        if (rc)
                                goto bad;
                }

                p->symtab[i].nprim = nprim;
        }

        rc = -EINVAL;
        p->process_class = string_to_security_class(p, "process");
        if (!p->process_class) {
                pr_err("SELinux: process class is required, not defined in policy\n");
                goto bad;
        }

        rc = avtab_read(&p->te_avtab, fp, p);
        if (rc)
                goto bad;

        if (p->policyvers >= POLICYDB_VERSION_BOOL) {
                rc = cond_read_list(p, fp);
                if (rc)
                        goto bad;
        }

        /* Role transitions: a count followed by that many entries. */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                goto bad;
        nel = le32_to_cpu(buf[0]);

        rc = hashtab_init(&p->role_tr, nel);
        if (rc)
                goto bad;
        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                rtk = kmalloc(sizeof(*rtk), GFP_KERNEL);
                if (!rtk)
                        goto bad;

                rc = -ENOMEM;
                rtd = kmalloc(sizeof(*rtd), GFP_KERNEL);
                if (!rtd)
                        goto bad;

                rc = next_entry(buf, fp, sizeof(u32)*3);
                if (rc)
                        goto bad;

                rc = -EINVAL;
                rtk->role = le32_to_cpu(buf[0]);
                rtk->type = le32_to_cpu(buf[1]);
                rtd->new_role = le32_to_cpu(buf[2]);
                if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto bad;
                        rtk->tclass = le32_to_cpu(buf[0]);
                } else
                        rtk->tclass = p->process_class;

                rc = -EINVAL;
                if (!policydb_role_isvalid(p, rtk->role) ||
                    !policydb_type_isvalid(p, rtk->type) ||
                    !policydb_class_isvalid(p, rtk->tclass) ||
                    !policydb_role_isvalid(p, rtd->new_role))
                        goto bad;

                rc = hashtab_insert(&p->role_tr, rtk, rtd, roletr_key_params);
                if (rc)
                        goto bad;

                /*
                 * The pair was inserted into the table; clear the locals
                 * so the error path does not kfree() them.
                 */
                rtk = NULL;
                rtd = NULL;
        }

        /* Role allow rules: a count followed by (role, new_role) pairs. */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                goto bad;
        nel = le32_to_cpu(buf[0]);
        lra = NULL;
        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
                if (!ra)
                        goto bad;
                /* Link the node in before it is filled in. */
                if (lra)
                        lra->next = ra;
                else
                        p->role_allow = ra;
                rc = next_entry(buf, fp, sizeof(u32)*2);
                if (rc)
                        goto bad;

                rc = -EINVAL;
                ra->role = le32_to_cpu(buf[0]);
                ra->new_role = le32_to_cpu(buf[1]);
                if (!policydb_role_isvalid(p, ra->role) ||
                    !policydb_role_isvalid(p, ra->new_role))
                        goto bad;
                lra = ra;
        }

        rc = filename_trans_read(p, fp);
        if (rc)
                goto bad;

        rc = policydb_index(p);
        if (rc)
                goto bad;

        /* Both process transition permissions must exist in the policy. */
        rc = -EINVAL;
        perm = string_to_av_perm(p, p->process_class, "transition");
        if (!perm) {
                pr_err("SELinux: process transition permission is required, not defined in policy\n");
                goto bad;
        }
        p->process_trans_perms = perm;
        perm = string_to_av_perm(p, p->process_class, "dyntransition");
        if (!perm) {
                pr_err("SELinux: process dyntransition permission is required, not defined in policy\n");
                goto bad;
        }
        p->process_trans_perms |= perm;

        rc = ocontext_read(p, info, fp);
        if (rc)
                goto bad;

        rc = genfs_read(p, fp);
        if (rc)
                goto bad;

        rc = range_read(p, fp);
        if (rc)
                goto bad;

        rc = -ENOMEM;
        p->type_attr_map_array = kvcalloc(p->p_types.nprim,
                                          sizeof(*p->type_attr_map_array),
                                          GFP_KERNEL);
        if (!p->type_attr_map_array)
                goto bad;

        /* just in case ebitmap_init() becomes more than just a memset(0): */
        for (i = 0; i < p->p_types.nprim; i++)
                ebitmap_init(&p->type_attr_map_array[i]);

        for (i = 0; i < p->p_types.nprim; i++) {
                struct ebitmap *e = &p->type_attr_map_array[i];

                if (p->policyvers >= POLICYDB_VERSION_AVTAB) {
                        rc = ebitmap_read(e, fp);
                        if (rc)
                                goto bad;
                }
                /* add the type itself as the degenerate case */
                rc = ebitmap_set_bit(e, i, 1);
                if (rc)
                        goto bad;
        }

        rc = policydb_bounds_sanity_check(p);
        if (rc)
                goto bad;

        rc = 0;
out:
        return rc;
bad:
        /* rtk/rtd are non-NULL only if not yet inserted into role_tr. */
        kfree(rtk);
        kfree(rtd);
        policydb_destroy(p);
        goto out;
}

/*
 * Serialize one MLS level to a policydb binary representation file:
 * the sensitivity value as a single little-endian 32-bit word,
 * followed by the category bitmap.
 *
 * Returns 0 on success or the first non-zero value returned by a
 * write helper.
 */
static int mls_write_level(struct mls_level *l, void *fp)
{
        __le32 sens_word[1];
        int err;

        sens_word[0] = cpu_to_le32(l->sens);
        err = put_entry(sens_word, sizeof(u32), 1, fp);
        if (err)
                return err;

        return ebitmap_write(&l->cat, fp);
}

/*
 * Serialize an MLS range to a policydb binary representation file.
 *
 * The header is (item count - 1) followed by the low sensitivity and,
 * only when the two levels differ, the high sensitivity.  The low
 * category bitmap follows, then the high one if the levels differ.
 *
 * Returns 0 on success or the first non-zero value returned by a
 * write helper.
 */
static int mls_write_range_helper(struct mls_range *r, void *fp)
{
        __le32 words[3];
        size_t count;
        int err;
        int same = mls_level_eq(&r->level[1], &r->level[0]);

        count = same ? 2 : 3;

        words[0] = cpu_to_le32(count-1);
        words[1] = cpu_to_le32(r->level[0].sens);
        if (!same)
                words[2] = cpu_to_le32(r->level[1].sens);

        BUG_ON(count > ARRAY_SIZE(words));

        err = put_entry(words, sizeof(u32), count, fp);
        if (err)
                return err;

        err = ebitmap_write(&r->level[0].cat, fp);
        if (err)
                return err;

        /* A degenerate range stores its single level only once. */
        if (same)
                return 0;

        return ebitmap_write(&r->level[1].cat, fp);
}

/*
 * Write one sensitivity symbol: a header of (name length, isalias),
 * the name bytes without a terminator, then the associated MLS level.
 * @ptr is a struct policy_data supplying the output file.
 */
static int sens_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct level_datum *lev = datum;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        __le32 hdr[2];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(lev->isalias);

        err = put_entry(hdr, sizeof(u32), 2, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        return mls_write_level(lev->level, out);
}

/*
 * Write one category symbol: a header of (name length, value, isalias)
 * followed by the name bytes without a terminator.
 * @ptr is a struct policy_data supplying the output file.
 */
static int cat_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct cat_datum *cat = datum;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        __le32 hdr[3];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(cat->value);
        hdr[2] = cpu_to_le32(cat->isalias);

        err = put_entry(hdr, sizeof(u32), 3, out);
        if (err)
                return err;

        return put_entry(name, 1, name_len, out);
}

/*
 * Write a single role transition entry: (role, type, new_role), plus
 * the target class when the policy version is at least
 * POLICYDB_VERSION_ROLETRANS.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int role_trans_write_one(void *key, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct role_trans_key *trans_key = key;
        struct role_trans_datum *trans_datum = datum;
        struct policydb *p = pd->p;
        void *out = pd->fp;
        __le32 words[3];
        int err;

        words[0] = cpu_to_le32(trans_key->role);
        words[1] = cpu_to_le32(trans_key->type);
        words[2] = cpu_to_le32(trans_datum->new_role);

        err = put_entry(words, sizeof(u32), 3, out);
        if (err)
                return err;

        /* Older formats carry no class word. */
        if (p->policyvers < POLICYDB_VERSION_ROLETRANS)
                return 0;

        words[0] = cpu_to_le32(trans_key->tclass);
        return put_entry(words, sizeof(u32), 1, out);
}

static int role_trans_write(struct policydb *p, void *fp)
{
        struct policy_data pd = { .p = p, .fp = fp };
        __le32 buf[1];
        int rc;

        buf[0] = cpu_to_le32(p->role_tr.nel);
        rc = put_entry(buf, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        return hashtab_map(&p->role_tr, role_trans_write_one, &pd);
}

/*
 * Write the role allow list starting at @r: the number of nodes,
 * then a (role, new_role) pair for each node in list order.
 */
static int role_allow_write(struct role_allow *r, void *fp)
{
        struct role_allow *cur;
        size_t count = 0;
        __le32 words[2];
        int err;

        /* First pass: the count must precede the entries. */
        cur = r;
        while (cur) {
                count++;
                cur = cur->next;
        }

        words[0] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 1, fp);
        if (err)
                return err;

        /* Second pass: emit each rule. */
        cur = r;
        while (cur) {
                words[0] = cpu_to_le32(cur->role);
                words[1] = cpu_to_le32(cur->new_role);
                err = put_entry(words, sizeof(u32), 2, fp);
                if (err)
                        return err;
                cur = cur->next;
        }

        return 0;
}

/*
 * Write a security context structure to a policydb binary
 * representation file: the (user, role, type) triple followed by the
 * MLS range.  @p is not consulted by this function.
 */
static int context_write(struct policydb *p, struct context *c,
                         void *fp)
{
        __le32 triple[3];
        int err;

        triple[0] = cpu_to_le32(c->user);
        triple[1] = cpu_to_le32(c->role);
        triple[2] = cpu_to_le32(c->type);

        err = put_entry(triple, sizeof(u32), 3, fp);
        if (err)
                return err;

        return mls_write_range_helper(&c->range, fp);
}

/*
 * The following *_write functions are used to
 * write the symbol data to a policy database
 * binary representation file.
 */

/*
 * Write one permission symbol: a header of (name length, value)
 * followed by the name bytes without a terminator.  Unlike the other
 * symbol writers in this group, the third argument is the output file
 * itself rather than a struct policy_data.
 */
static int perm_write(void *vkey, void *datum, void *fp)
{
        struct perm_datum *perm = datum;
        char *name = vkey;
        size_t name_len = strlen(name);
        __le32 hdr[2];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(perm->value);

        err = put_entry(hdr, sizeof(u32), 2, fp);
        if (err)
                return err;

        return put_entry(name, 1, name_len, fp);
}

/*
 * Write one common symbol: a header of (name length, value, number of
 * primary permissions, number of permission table elements), the name
 * bytes, and then every permission via perm_write().
 * @ptr is a struct policy_data supplying the output file.
 */
static int common_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct common_datum *common = datum;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        __le32 hdr[4];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(common->value);
        hdr[2] = cpu_to_le32(common->permissions.nprim);
        hdr[3] = cpu_to_le32(common->permissions.table.nel);

        err = put_entry(hdr, sizeof(u32), 4, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        return hashtab_map(&common->permissions.table, perm_write, out);
}

/*
 * Write a type set: the types bitmap, the negset bitmap, then the
 * flags word.  Any failure is reported as -EINVAL regardless of the
 * value returned by the failing helper.
 */
static int type_set_write(struct type_set *t, void *fp)
{
        __le32 flags[1];

        /* Short-circuit keeps the original write order. */
        if (ebitmap_write(&t->types, fp) || ebitmap_write(&t->negset, fp))
                return -EINVAL;

        flags[0] = cpu_to_le32(t->flags);
        if (put_entry(flags, sizeof(u32), 1, fp))
                return -EINVAL;

        return 0;
}

/*
 * Write every constraint in the list starting at @node.
 *
 * Each constraint is written as (permissions, expression count) and
 * then one (expr_type, attr, op) record per expression.  A CEXPR_NAMES
 * expression is followed by its names bitmap and, when the policy
 * version is at least POLICYDB_VERSION_CONSTRAINT_NAMES, its type set.
 */
static int write_cons_helper(struct policydb *p, struct constraint_node *node,
                             void *fp)
{
        struct constraint_node *cons;
        struct constraint_expr *expr;
        __le32 words[3];
        u32 nexpr;
        int err;

        for (cons = node; cons; cons = cons->next) {
                /* The expression count precedes the expressions. */
                nexpr = 0;
                for (expr = cons->expr; expr; expr = expr->next)
                        nexpr++;

                words[0] = cpu_to_le32(cons->permissions);
                words[1] = cpu_to_le32(nexpr);
                err = put_entry(words, sizeof(u32), 2, fp);
                if (err)
                        return err;

                for (expr = cons->expr; expr; expr = expr->next) {
                        words[0] = cpu_to_le32(expr->expr_type);
                        words[1] = cpu_to_le32(expr->attr);
                        words[2] = cpu_to_le32(expr->op);
                        err = put_entry(words, sizeof(u32), 3, fp);
                        if (err)
                                return err;

                        /* Only name expressions carry extra payload. */
                        if (expr->expr_type != CEXPR_NAMES)
                                continue;

                        err = ebitmap_write(&expr->names, fp);
                        if (err)
                                return err;

                        if (p->policyvers < POLICYDB_VERSION_CONSTRAINT_NAMES)
                                continue;

                        err = type_set_write(expr->type_names, fp);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/*
 * Write one class symbol.
 *
 * Layout: a six-word header (name length, common name length, value,
 * primary permission count, permission table size, constraint count),
 * the class name, the common name if there is one, the permissions,
 * the constraints, the validatetrans count and rules, and finally the
 * version-dependent default_user/role/range and default_type words.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int class_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct class_datum *cls = datum;
        struct policydb *p = pd->p;
        struct constraint_node *node;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        size_t common_len = cls->comkey ? strlen(cls->comkey) : 0;
        u32 count = 0;
        __le32 words[6];
        int err;

        for (node = cls->constraints; node; node = node->next)
                count++;

        words[0] = cpu_to_le32(name_len);
        words[1] = cpu_to_le32(common_len);
        words[2] = cpu_to_le32(cls->value);
        words[3] = cpu_to_le32(cls->permissions.nprim);
        words[4] = cpu_to_le32(cls->permissions.table.nel);
        words[5] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 6, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        if (cls->comkey) {
                err = put_entry(cls->comkey, 1, common_len, out);
                if (err)
                        return err;
        }

        err = hashtab_map(&cls->permissions.table, perm_write, out);
        if (err)
                return err;

        err = write_cons_helper(p, cls->constraints, out);
        if (err)
                return err;

        /* write out the validatetrans rule */
        count = 0;
        for (node = cls->validatetrans; node; node = node->next)
                count++;

        words[0] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 1, out);
        if (err)
                return err;

        err = write_cons_helper(p, cls->validatetrans, out);
        if (err)
                return err;

        if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) {
                words[0] = cpu_to_le32(cls->default_user);
                words[1] = cpu_to_le32(cls->default_role);
                words[2] = cpu_to_le32(cls->default_range);

                err = put_entry(words, sizeof(uint32_t), 3, out);
                if (err)
                        return err;
        }

        if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) {
                words[0] = cpu_to_le32(cls->default_type);
                err = put_entry(words, sizeof(uint32_t), 1, out);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Write one role symbol: a header of (name length, value) plus the
 * bounds word when the policy version is at least
 * POLICYDB_VERSION_BOUNDARY, then the name bytes, the dominates
 * bitmap and the types bitmap.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int role_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct role_datum *rd = datum;
        struct policydb *p = pd->p;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        size_t nwords = 0;
        __le32 hdr[3];
        int err;

        hdr[nwords++] = cpu_to_le32(name_len);
        hdr[nwords++] = cpu_to_le32(rd->value);
        if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
                hdr[nwords++] = cpu_to_le32(rd->bounds);

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        err = ebitmap_write(&rd->dominates, out);
        if (err)
                return err;

        return ebitmap_write(&rd->types, out);
}

/*
 * Write one type symbol.
 *
 * The header starts with (name length, value).  Before
 * POLICYDB_VERSION_BOUNDARY a single "primary" word follows; from that
 * version on a properties bit mask and the bounds word follow instead.
 * The name bytes come last.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int type_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct type_datum *td = datum;
        struct policydb *p = pd->p;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        size_t nwords = 0;
        __le32 hdr[4];
        int err;

        hdr[nwords++] = cpu_to_le32(name_len);
        hdr[nwords++] = cpu_to_le32(td->value);

        if (p->policyvers < POLICYDB_VERSION_BOUNDARY) {
                hdr[nwords++] = cpu_to_le32(td->primary);
        } else {
                u32 props = 0;

                /* Fold the two boolean fields into one flags word. */
                if (td->primary)
                        props |= TYPEDATUM_PROPERTY_PRIMARY;
                if (td->attribute)
                        props |= TYPEDATUM_PROPERTY_ATTRIBUTE;

                hdr[nwords++] = cpu_to_le32(props);
                hdr[nwords++] = cpu_to_le32(td->bounds);
        }

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        return put_entry(name, 1, name_len, out);
}

/*
 * Write one user symbol: a header of (name length, value) plus the
 * bounds word when the policy version is at least
 * POLICYDB_VERSION_BOUNDARY, then the name bytes, the roles bitmap,
 * the MLS range and the default MLS level.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int user_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct user_datum *ud = datum;
        struct policydb *p = pd->p;
        char *name = vkey;
        void *out = pd->fp;
        size_t name_len = strlen(name);
        size_t nwords = 0;
        __le32 hdr[3];
        int err;

        hdr[nwords++] = cpu_to_le32(name_len);
        hdr[nwords++] = cpu_to_le32(ud->value);
        if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
                hdr[nwords++] = cpu_to_le32(ud->bounds);

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        err = ebitmap_write(&ud->roles, out);
        if (err)
                return err;

        err = mls_write_range_helper(&ud->range, out);
        if (err)
                return err;

        return mls_write_level(&ud->dfltlevel, out);
}

/*
 * Table of symbol write callbacks, one per symbol table (SYM_NUM
 * entries).  Each callback receives a key, its datum and an opaque
 * data pointer.  The initializer is positional, so the order of the
 * entries is significant.
 * NOTE(review): presumably indexed by symbol table number in the same
 * way read_f[] is indexed in policydb_read() -- confirm at the call site.
 */
static int (*write_f[SYM_NUM]) (void *key, void *datum,
                                void *datap) =
{
        common_write,
        class_write,
        role_write,
        type_write,
        user_write,
        cond_write_bool,
        sens_write,
        cat_write,
};

/*
 * Write every object context list of @p to @fp.
 *
 * For each of the info->ocon_num context kinds the number of entries
 * is written first, followed by the entries in list order.  The
 * per-entry layout depends on the kind; see the individual cases.
 *
 * Returns 0 on success or the first non-zero value returned by a
 * write helper.
 */
static int ocontext_write(struct policydb *p, struct policydb_compat_info *info,
                          void *fp)
{
        unsigned int i, j;
        /* Signed: holds the int results of the helpers and is returned as int. */
        int rc;
        size_t nel, len;
        __be64 prefixbuf[1];
        __le32 buf[3];
        u32 nodebuf[8];
        struct ocontext *c;

        for (i = 0; i < info->ocon_num; i++) {
                /* The entry count precedes the entries. */
                nel = 0;
                for (c = p->ocontexts[i]; c; c = c->next)
                        nel++;
                buf[0] = cpu_to_le32(nel);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;
                for (c = p->ocontexts[i]; c; c = c->next) {
                        switch (i) {
                        case OCON_ISID:
                                /* sid, context */
                                buf[0] = cpu_to_le32(c->sid[0]);
                                rc = put_entry(buf, sizeof(u32), 1, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_FS:
                        case OCON_NETIF:
                                /* name length, name, two contexts */
                                len = strlen(c->u.name);
                                buf[0] = cpu_to_le32(len);
                                rc = put_entry(buf, sizeof(u32), 1, fp);
                                if (rc)
                                        return rc;
                                rc = put_entry(c->u.name, 1, len, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[1], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_PORT:
                                /* protocol, low port, high port, context */
                                buf[0] = cpu_to_le32(c->u.port.protocol);
                                buf[1] = cpu_to_le32(c->u.port.low_port);
                                buf[2] = cpu_to_le32(c->u.port.high_port);
                                rc = put_entry(buf, sizeof(u32), 3, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_NODE:
                                /* address and mask are copied without byte swapping */
                                nodebuf[0] = c->u.node.addr; /* network order */
                                nodebuf[1] = c->u.node.mask; /* network order */
                                rc = put_entry(nodebuf, sizeof(u32), 2, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_FSUSE:
                                /* behavior, name length, name, context */
                                buf[0] = cpu_to_le32(c->v.behavior);
                                len = strlen(c->u.name);
                                buf[1] = cpu_to_le32(len);
                                rc = put_entry(buf, sizeof(u32), 2, fp);
                                if (rc)
                                        return rc;
                                rc = put_entry(c->u.name, 1, len, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_NODE6:
                                /* four address words, then four mask words */
                                for (j = 0; j < 4; j++)
                                        nodebuf[j] = c->u.node6.addr[j]; /* network order */
                                for (j = 0; j < 4; j++)
                                        nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */
                                rc = put_entry(nodebuf, sizeof(u32), 8, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_IBPKEY:
                                /* subnet_prefix is in CPU order */
                                prefixbuf[0] = cpu_to_be64(c->u.ibpkey.subnet_prefix);

                                rc = put_entry(prefixbuf, sizeof(u64), 1, fp);
                                if (rc)
                                        return rc;

                                buf[0] = cpu_to_le32(c->u.ibpkey.low_pkey);
                                buf[1] = cpu_to_le32(c->u.ibpkey.high_pkey);

                                rc = put_entry(buf, sizeof(u32), 2, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        case OCON_IBENDPORT:
                                /* name length, port, device name, context */
                                len = strlen(c->u.ibendport.dev_name);
                                buf[0] = cpu_to_le32(len);
                                buf[1] = cpu_to_le32(c->u.ibendport.port);
                                rc = put_entry(buf, sizeof(u32), 2, fp);
                                if (rc)
                                        return rc;
                                rc = put_entry(c->u.ibendport.dev_name, 1, len, fp);
                                if (rc)
                                        return rc;
                                rc = context_write(p, &c->context[0], fp);
                                if (rc)
                                        return rc;
                                break;
                        default:
                                /* Other kinds have no per-entry payload here. */
                                break;
                        }
                }
        }
        return 0;
}

/*
 * Serialize the genfs section of the policy to @fp.
 *
 * Emitted layout: a 32-bit little-endian count of genfs entries, then
 * for each entry its length-prefixed fstype string and a 32-bit count
 * of its ocontexts, and for each ocontext the length-prefixed name, the
 * 32-bit sclass and the security context.  Strings are written without
 * a terminating NUL.
 *
 * Returns 0 on success, or the first non-zero value handed back by
 * put_entry() or context_write().
 */
static int genfs_write(struct policydb *p, void *fp)
{
        struct genfs *fs;
        struct ocontext *oc;
        size_t n;
        __le32 word[1];
        int err;

        /* Number of genfs entries. */
        n = 0;
        fs = p->genfs;
        while (fs) {
                n++;
                fs = fs->next;
        }
        word[0] = cpu_to_le32(n);
        err = put_entry(word, sizeof(u32), 1, fp);
        if (err)
                return err;

        for (fs = p->genfs; fs; fs = fs->next) {
                /* Filesystem type: length first, then the raw bytes. */
                n = strlen(fs->fstype);
                word[0] = cpu_to_le32(n);
                err = put_entry(word, sizeof(u32), 1, fp);
                if (err)
                        return err;
                err = put_entry(fs->fstype, 1, n, fp);
                if (err)
                        return err;

                /* Number of ocontexts hanging off this entry. */
                n = 0;
                oc = fs->head;
                while (oc) {
                        n++;
                        oc = oc->next;
                }
                word[0] = cpu_to_le32(n);
                err = put_entry(word, sizeof(u32), 1, fp);
                if (err)
                        return err;

                for (oc = fs->head; oc; oc = oc->next) {
                        /* Name: length first, then the raw bytes. */
                        n = strlen(oc->u.name);
                        word[0] = cpu_to_le32(n);
                        err = put_entry(word, sizeof(u32), 1, fp);
                        if (err)
                                return err;
                        err = put_entry(oc->u.name, 1, n, fp);
                        if (err)
                                return err;

                        word[0] = cpu_to_le32(oc->v.sclass);
                        err = put_entry(word, sizeof(u32), 1, fp);
                        if (err)
                                return err;

                        err = context_write(p, &oc->context[0], fp);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/*
 * hashtab_map() callback used by range_write(): emit one range
 * transition entry.
 *
 * @key:  struct range_trans (source type, target type, target class)
 * @data: struct mls_range belonging to that key
 * @ptr:  struct policy_data carrying the policydb and the output handle
 *
 * The target class is only written when the policy version is at least
 * POLICYDB_VERSION_RANGETRANS.  Returns 0 on success or the first
 * non-zero value from put_entry()/mls_write_range_helper().
 */
static int range_write_helper(void *key, void *data, void *ptr)
{
        struct policy_data *ctx = ptr;
        struct range_trans *trans = key;
        struct mls_range *range = data;
        struct policydb *db = ctx->p;
        void *out = ctx->fp;
        __le32 words[2];
        int err;

        words[0] = cpu_to_le32(trans->source_type);
        words[1] = cpu_to_le32(trans->target_type);
        err = put_entry(words, sizeof(u32), 2, out);
        if (err)
                return err;

        if (db->policyvers >= POLICYDB_VERSION_RANGETRANS) {
                words[0] = cpu_to_le32(trans->target_class);
                err = put_entry(words, sizeof(u32), 1, out);
                if (err)
                        return err;
        }

        /* The MLS range closes the record; its status is ours. */
        return mls_write_range_helper(range, out);
}

/*
 * Write the range transition table: a 32-bit little-endian element
 * count taken from p->range_tr.nel, followed by one record per hash
 * table entry (see range_write_helper()).
 *
 * Returns 0 on success or the first non-zero value from put_entry() or
 * hashtab_map().
 */
static int range_write(struct policydb *p, void *fp)
{
        struct policy_data pd;
        __le32 nel[1];
        int err;

        nel[0] = cpu_to_le32(p->range_tr.nel);
        err = put_entry(nel, sizeof(u32), 1, fp);
        if (err)
                return err;

        /* The per-entry callback needs both the policydb and the sink. */
        pd.fp = fp;
        pd.p = p;

        /* actually write all of the entries */
        return hashtab_map(&p->range_tr, range_write_helper, &pd);
}

/*
 * hashtab_map() callback for the pre-POLICYDB_VERSION_COMP_FTRANS
 * on-disk format (selected by filename_trans_write()).
 *
 * @key:  struct filename_trans_key (name, ttype, tclass)
 * @data: head of the struct filename_trans_datum chain for that key
 * @ptr:  output handle passed through to put_entry()
 *
 * One complete record is written for every positive bit in each
 * datum's stypes bitmap: the length-prefixed name, then four 32-bit
 * words (bit + 1, ttype, tclass, otype).
 *
 * Returns 0 on success or the first non-zero value from put_entry().
 */
static int filename_write_helper_compat(void *key, void *data, void *ptr)
{
        struct filename_trans_key *ft = key;
        struct filename_trans_datum *cur = data;
        struct ebitmap_node *pos;
        void *out = ptr;
        __le32 rec[4];
        u32 stype_bit;
        u32 name_len = strlen(ft->name);
        int err;

        do {
                ebitmap_for_each_positive_bit(&cur->stypes, pos, stype_bit) {
                        /* Length-prefixed file name. */
                        rec[0] = cpu_to_le32(name_len);
                        err = put_entry(rec, sizeof(u32), 1, out);
                        if (err)
                                return err;

                        err = put_entry(ft->name, sizeof(char), name_len, out);
                        if (err)
                                return err;

                        /* The bit index is stored off by one (bit + 1). */
                        rec[0] = cpu_to_le32(stype_bit + 1);
                        rec[1] = cpu_to_le32(ft->ttype);
                        rec[2] = cpu_to_le32(ft->tclass);
                        rec[3] = cpu_to_le32(cur->otype);

                        err = put_entry(rec, sizeof(u32), 4, out);
                        if (err)
                                return err;
                }

                cur = cur->next;
        } while (unlikely(cur));

        return 0;
}

/*
 * hashtab_map() callback for the current filename transition format
 * (selected by filename_trans_write()).
 *
 * @key:  struct filename_trans_key (name, ttype, tclass)
 * @data: head of the struct filename_trans_datum chain for that key
 * @ptr:  output handle passed through to put_entry()/ebitmap_write()
 *
 * Record layout: length-prefixed name, three 32-bit words (ttype,
 * tclass, number of datums), then for each datum its stypes bitmap
 * followed by the 32-bit otype.
 *
 * Returns 0 on success or the first non-zero value from a writer.
 */
static int filename_write_helper(void *key, void *data, void *ptr)
{
        struct filename_trans_key *ft = key;
        struct filename_trans_datum *cur;
        void *out = ptr;
        __le32 words[3];
        u32 chain_len;
        u32 name_len = strlen(ft->name);
        int err;

        /* Length-prefixed file name. */
        words[0] = cpu_to_le32(name_len);
        err = put_entry(words, sizeof(u32), 1, out);
        if (err)
                return err;

        err = put_entry(ft->name, sizeof(char), name_len, out);
        if (err)
                return err;

        /* First pass over the chain: count the datums. */
        chain_len = 0;
        cur = data;
        do {
                chain_len++;
                cur = cur->next;
        } while (unlikely(cur));

        words[0] = cpu_to_le32(ft->ttype);
        words[1] = cpu_to_le32(ft->tclass);
        words[2] = cpu_to_le32(chain_len);
        err = put_entry(words, sizeof(u32), 3, out);
        if (err)
                return err;

        /* Second pass: write each datum. */
        cur = data;
        do {
                err = ebitmap_write(&cur->stypes, out);
                if (err)
                        return err;

                words[0] = cpu_to_le32(cur->otype);
                err = put_entry(words, sizeof(u32), 1, out);
                if (err)
                        return err;

                cur = cur->next;
        } while (unlikely(cur));

        return 0;
}

/*
 * Write the filename transition table in the format matching
 * p->policyvers.
 *
 * Nothing is written for versions below POLICYDB_VERSION_FILENAME_TRANS.
 * Versions below POLICYDB_VERSION_COMP_FTRANS get the compat layout
 * (count from p->compat_filename_trans_count); later versions get the
 * current layout (count from p->filename_trans.nel).
 *
 * Returns 0 on success or the first non-zero value from put_entry() or
 * hashtab_map().
 */
static int filename_trans_write(struct policydb *p, void *fp)
{
        int (*emit)(void *key, void *data, void *ptr);
        __le32 count[1];
        int err;

        if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS)
                return 0;

        /* Choose the element count and per-entry writer for the format. */
        if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) {
                count[0] = cpu_to_le32(p->compat_filename_trans_count);
                emit = filename_write_helper_compat;
        } else {
                count[0] = cpu_to_le32(p->filename_trans.nel);
                emit = filename_write_helper;
        }

        err = put_entry(count, sizeof(u32), 1, fp);
        if (err)
                return err;

        return hashtab_map(&p->filename_trans, emit, fp);
}

/*
 * Write the configuration data in a policy database
 * structure to a policy database binary representation
 * file.
 *
 * @p:  policy database to serialize
 * @fp: opaque output handle, passed unchanged to put_entry() and to
 *      every per-section writer
 *
 * Sections are emitted in this order: magic number and identifier
 * string; version, config flags and table sizes; the policycaps and
 * permissive bitmaps (each only from its own minimum policy version);
 * the symbol tables; the avtab; the conditional list; role transitions;
 * role allows; filename transitions; ocontexts; genfs; range
 * transitions; and finally one type->attribute bitmap per primary type.
 *
 * Returns 0 on success, -EINVAL when the policy version is older than
 * POLICYDB_VERSION_AVTAB or has no compatibility info, and otherwise
 * the first non-zero value returned by a writer.
 */
int policydb_write(struct policydb *p, void *fp)
{
        unsigned int i, num_syms;
        int rc;
        __le32 buf[4];
        u32 config;
        size_t len;
        struct policydb_compat_info *info;

        /*
         * refuse to write policy older than compressed avtab
         * to simplify the writer.  There are other tests dropped
         * since we assume this throughout the writer code.  Be
         * careful if you ever try to remove this restriction
         */
        if (p->policyvers < POLICYDB_VERSION_AVTAB) {
                pr_err("SELinux: refusing to write policy version %d."
                       "  Because it is less than version %d\n", p->policyvers,
                       POLICYDB_VERSION_AVTAB);
                return -EINVAL;
        }

        /* Collect the config flags that go into the header. */
        config = 0;
        if (p->mls_enabled)
                config |= POLICYDB_CONFIG_MLS;

        if (p->reject_unknown)
                config |= REJECT_UNKNOWN;
        if (p->allow_unknown)
                config |= ALLOW_UNKNOWN;

        /* Write the magic number and string identifiers. */
        buf[0] = cpu_to_le32(POLICYDB_MAGIC);
        len = strlen(POLICYDB_STRING);
        buf[1] = cpu_to_le32(len);
        rc = put_entry(buf, sizeof(u32), 2, fp);
        if (rc)
                return rc;
        rc = put_entry(POLICYDB_STRING, 1, len, fp);
        if (rc)
                return rc;

        /* Write the version, config, and table sizes. */
        info = policydb_lookup_compat(p->policyvers);
        if (!info) {
                /* Newline-terminated so the record ends here in the log. */
                pr_err("SELinux: compatibility lookup failed for policy "
                    "version %d\n", p->policyvers);
                return -EINVAL;
        }

        buf[0] = cpu_to_le32(p->policyvers);
        buf[1] = cpu_to_le32(config);
        buf[2] = cpu_to_le32(info->sym_num);
        buf[3] = cpu_to_le32(info->ocon_num);

        rc = put_entry(buf, sizeof(u32), 4, fp);
        if (rc)
                return rc;

        if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
                rc = ebitmap_write(&p->policycaps, fp);
                if (rc)
                        return rc;
        }

        if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
                rc = ebitmap_write(&p->permissive_map, fp);
                if (rc)
                        return rc;
        }

        /*
         * Symbol tables: each is preceded by its nprim and element
         * counts and written by the matching write_f[] callback.
         */
        num_syms = info->sym_num;
        for (i = 0; i < num_syms; i++) {
                struct policy_data pd;

                pd.fp = fp;
                pd.p = p;

                buf[0] = cpu_to_le32(p->symtab[i].nprim);
                buf[1] = cpu_to_le32(p->symtab[i].table.nel);

                rc = put_entry(buf, sizeof(u32), 2, fp);
                if (rc)
                        return rc;
                rc = hashtab_map(&p->symtab[i].table, write_f[i], &pd);
                if (rc)
                        return rc;
        }

        rc = avtab_write(p, &p->te_avtab, fp);
        if (rc)
                return rc;

        rc = cond_write_list(p, fp);
        if (rc)
                return rc;

        rc = role_trans_write(p, fp);
        if (rc)
                return rc;

        rc = role_allow_write(p->role_allow, fp);
        if (rc)
                return rc;

        rc = filename_trans_write(p, fp);
        if (rc)
                return rc;

        rc = ocontext_write(p, info, fp);
        if (rc)
                return rc;

        rc = genfs_write(p, fp);
        if (rc)
                return rc;

        rc = range_write(p, fp);
        if (rc)
                return rc;

        /* One attribute bitmap per primary type, in index order. */
        for (i = 0; i < p->p_types.nprim; i++) {
                struct ebitmap *e = &p->type_attr_map_array[i];

                rc = ebitmap_write(e, fp);
                if (rc)
                        return rc;
        }

        return 0;
}


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright (C) 2018 Intel Corporation
 */
#ifndef __NET_WIRELESS_NL80211_H
#define __NET_WIRELESS_NL80211_H

#include "core.h"

/*
 * Setup and teardown entry points.  Declarations only; the bodies are
 * not part of this header.
 */
int nl80211_init(void);
void nl80211_exit(void);

/*
 * Helpers that operate on a message sk_buff.
 * NOTE(review): nl80211hdr_put() presumably returns a pointer to the
 * header it appended, and nl80211_put_sta_rate() a success flag --
 * confirm against the definitions.
 */
void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
                     int flags, u8 cmd);
bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
                          int attr);

static inline u64 wdev_id(struct wireless_dev *wdev)
{
        return (u64)wdev->identifier |
               ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
}

/*
 * Dump preparation helper.  @rdev and @wdev are double pointers, so
 * they look like output parameters -- TODO confirm in the definition.
 */
int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
                              struct cfg80211_registered_device **rdev,
                              struct wireless_dev **wdev);

/*
 * Attribute parsing helpers.  Both return int.
 * NOTE(review): presumably 0 on success and a negative errno on
 * failure -- confirm against the definitions.
 */
int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
                          struct genl_info *info,
                          struct cfg80211_chan_def *chandef);
int nl80211_parse_random_mac(struct nlattr **attrs,
                             u8 *mac_addr, u8 *mac_addr_mask);

/*
 * Notification and scan-message helpers.  Declarations only; the
 * bodies are not part of this header.
 */
void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
                          enum nl80211_commands cmd);
void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
                          struct wireless_dev *wdev,
                          enum nl80211_commands cmd);
void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
                             struct wireless_dev *wdev);
/*
 * NOTE(review): the sk_buff returned by nl80211_build_scan_msg() is
 * presumably what nl80211_send_scan_msg() expects as @msg -- confirm.
 */
struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
                                       struct wireless_dev *wdev, bool aborted);
void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
                           struct sk_buff *msg);
void nl80211_send_sched_scan(struct cfg80211_sched_scan_request *req, u32 cmd);
/*
 * Common backend of the two regulatory change-event inline wrappers
 * defined in this header; @cmd_id selects the command that is sent.
 */
void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
                                     struct regulatory_request *request);

static inline void
nl80211_send_reg_change_event(struct regulatory_request *request)
{
        nl80211_common_reg_change_event(NL80211_CMD_REG_CHANGE, request);
}

static inline void
nl80211_send_wiphy_reg_change_event(struct regulatory_request *request)
{
        nl80211_common_reg_change_event(NL80211_CMD_WIPHY_REG_CHANGE, request);
}

/*
 * Connection-state event senders.  Each takes the registered device
 * and the net_device the event refers to.  Declarations only; the
 * bodies are not part of this header.
 */

/* Events carrying a raw buffer (@buf, @len). */
void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
                          struct net_device *netdev,
                          const u8 *buf, size_t len, gfp_t gfp);
void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
                           struct net_device *netdev,
                           const u8 *buf, size_t len, gfp_t gfp,
                           int uapsd_queues,
                           const u8 *req_ies, size_t req_ies_len);
void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *netdev,
                         const u8 *buf, size_t len, gfp_t gfp);
void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *netdev,
                           const u8 *buf, size_t len, gfp_t gfp);
/* Timeout events identified by an address (@addr). */
void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev,
                               const u8 *addr, gfp_t gfp);
void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
                                struct net_device *netdev,
                                const u8 *addr, gfp_t gfp);
/* Connect, roam, port-authorized and disconnect events. */
void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_connect_resp_params *params,
                                 gfp_t gfp);
void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
                         struct net_device *netdev,
                         struct cfg80211_roam_info *info, gfp_t gfp);
void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
                                  struct net_device *netdev, const u8 *bssid);
void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, u16 reason,
                               const u8 *ie, size_t ie_len, bool from_ap);

/*
 * Miscellaneous event helpers.  Declarations only; the bodies are not
 * part of this header.
 */
void
nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
                            struct net_device *netdev, const u8 *addr,
                            enum nl80211_key_type key_type,
                            int key_id, const u8 *tsc, gfp_t gfp);

/* Takes the channel both before and after the change. */
void
nl80211_send_beacon_hint_event(struct wiphy *wiphy,
                               struct ieee80211_channel *channel_before,
                               struct ieee80211_channel *channel_after);

void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
                             struct net_device *netdev, const u8 *bssid,
                             gfp_t gfp);

/*
 * The only sender in this group with an int result.
 * NOTE(review): presumably 0 on success and a negative errno on
 * failure -- confirm against the definition.
 */
int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev, u32 nlpid,
                      int freq, int sig_dbm,
                      const u8 *buf, size_t len, u32 flags, gfp_t gfp);

void
nl80211_radar_notify(struct cfg80211_registered_device *rdev,
                     const struct cfg80211_chan_def *chandef,
                     enum nl80211_radar_event event,
                     struct net_device *netdev, gfp_t gfp);

void nl80211_send_ap_stopped(struct wireless_dev *wdev);

/* NOTE(review): presumably releases @rdev's coalesce state -- confirm. */
void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);

/*
 * Peer measurement (pmsr) handlers.  Declarations only; the bodies are
 * not part of this header.  nl80211_pmsr_start() takes a genl_info and
 * nl80211_pmsr_dump_results() a netlink_callback.
 */
int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info);
int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb);

#endif /* __NET_WIRELESS_NL80211_H */





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMERQUEUE_H
#define _LINUX_TIMERQUEUE_H

#include <linux/rbtree.h>
#include <linux/ktime.h>


/*
 * One queued element.  The inline helpers in this header initialise
 * and test @node and read @expires.
 */
struct timerqueue_node {
        struct rb_node node;    /* rbtree linkage */
        ktime_t expires;        /* expiry time of this node */
};

/*
 * Queue head: wraps a cached rbtree root, which timerqueue_getnext()
 * reads through rb_first_cached().
 */
struct timerqueue_head {
        struct rb_root_cached rb_root;  /* root of the tree of nodes */
};


/*
 * Out-of-line queue operations.  Declarations only; the bodies are not
 * part of this header.
 * NOTE(review): the meaning of the bool results of timerqueue_add()
 * and timerqueue_del() is not visible here -- confirm against the
 * definitions.
 */
extern bool timerqueue_add(struct timerqueue_head *head,
                           struct timerqueue_node *node);
extern bool timerqueue_del(struct timerqueue_head *head,
                           struct timerqueue_node *node);
extern struct timerqueue_node *timerqueue_iterate_next(
                                                struct timerqueue_node *node);

/**
 * timerqueue_getnext - Fetch the timer that expires first
 *
 * @head: head of timerqueue
 *
 * Takes the cached first rbtree node of @head and converts it to its
 * enclosing timerqueue_node with rb_entry_safe().  Returns a pointer to
 * the timer node that has the earliest expiration time.
 */
static inline
struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
{
        struct rb_node *first;

        first = rb_first_cached(&head->rb_root);

        return rb_entry_safe(first, struct timerqueue_node, node);
}

/**
 * timerqueue_init - Initialize a timerqueue node
 *
 * @node: node to initialize
 *
 * Clears the embedded rbtree node with RB_CLEAR_NODE(); @node->expires
 * is not touched.
 */
static inline void timerqueue_init(struct timerqueue_node *node)
{
        struct rb_node *rb = &node->node;

        RB_CLEAR_NODE(rb);
}

/**
 * timerqueue_node_queued - Test whether a node's rbtree linkage is in use
 *
 * @node: node to test
 *
 * Returns false when RB_EMPTY_NODE() reports the embedded rbtree node
 * as empty, true otherwise.
 */
static inline bool timerqueue_node_queued(struct timerqueue_node *node)
{
        bool unlinked = RB_EMPTY_NODE(&node->node);

        return !unlinked;
}

static inline bool timerqueue_node_expires(struct timerqueue_node *node)
{
        return node->expires;
}

static inline void timerqueue_init_head(struct timerqueue_head *head)
{
        head->rb_root = RB_ROOT_CACHED;
}
#endif /* _LINUX_TIMERQUEUE_H */


























































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 







































































































    1 

    1 



    1 









    1 





    1 


    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 
    1 


    1 

    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














































    1 










    1 







































































    1 






    1 


    1 



























    1 


    1 





    1 


    1 



    1 

    1 







































    1 






    1 
    1 
































    1 












    1 




























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

/*
 * Message prefix for this file: pr_fmt() prepends KBUILD_MODNAME and ": "
 * to the format string it is given.  It is defined ahead of the #include
 * lines that follow so the headers see this definition.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

/*
 * Upper bound on the length of a cgroup file name: the maximum type name
 * length plus the maximum cftype name length plus two extra bytes.
 * NOTE(review): the two extra bytes are presumably a separator and the
 * terminating NUL - confirm against the code that builds the name.
 */
#define CGROUP_FILE_NAME_MAX                (MAX_CGROUP_TYPE_NAMELEN +        \
                                         MAX_CFTYPE_NAME + 2)
/*
 * Minimum interval between file notifications, HZ / 100 rounded up, so
 * that we do not notify more than 100 times per second.
 */
#define CGROUP_FILE_NOTIFY_MIN_INTV        DIV_ROUND_UP(HZ, 100)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * Both locks are exported (GPL-only) when either CONFIG_PROVE_RCU or
 * CONFIG_LOCKDEP is enabled so that accessors in cgroup.h can use them
 * for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/* Export only in lock-debugging configurations; see the comment above. */
#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Shared buffer of TRACE_CGROUP_PATH_LEN bytes and a spinlock defined
 * alongside it.
 * NOTE(review): going by the names, the lock presumably serializes use of
 * trace_cgroup_path by the cgroup tracepoints - confirm against the users.
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
/*
 * Read-mostly flag, false by default (zero-initialized).
 * NOTE(review): presumably enables cgroup debugging aids - confirm where
 * it is set.
 */
bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * Per-CPU reader/writer semaphore with external linkage.
 * NOTE(review): the name suggests it guards thread-group membership against
 * concurrent cgroup operations - confirm against its lock/unlock sites.
 */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

/*
 * Lockdep assertion: warn unless the caller holds either the RCU read lock
 * or cgroup_mutex.  The condition and the message are handed straight to
 * RCU_LOCKDEP_WARN().
 *
 * The expansion intentionally carries no trailing semicolon; the caller
 * supplies it.  That keeps the macro usable as a single statement, e.g. as
 * the body of an unbraced if/else, and avoids a stray empty statement at
 * every call site.
 */
#define cgroup_assert_mutex_or_rcu_locked()                                \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                        \
                           !lockdep_is_held(&cgroup_mutex),                \
                           "cgroup_mutex or RCU read lock required")

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Dedicated workqueues are used so that cgroup
 * destruction work items don't end up filling up max_active of system_wq,
 * which may lead to deadlock.
 *
 * A cgroup destruction enqueues work sequentially to:
 * cgroup_offline_wq: css offline work
 * cgroup_release_wq: css release work
 * cgroup_free_wq:    free work
 *
 * Rationale for using separate workqueues:
 * The cgroup root free work may depend on completion of other css offline
 * operations.  If all work items were enqueued to a single workqueue, this
 * could create a deadlock scenario where:
 * - Free work waits for other css offline work to complete.
 * - But that css offline work is queued after the free work in the same
 *   queue.
 *
 * Example deadlock scenario with a single workqueue (cgroup_destroy_wq):
 * 1. umount net_prio
 * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
 * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
 * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
 * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
 *    which can never complete as it's behind in the same queue and
 *    workqueue's max_active is 1.
 */
static struct workqueue_struct *cgroup_offline_wq;
static struct workqueue_struct *cgroup_release_wq;
static struct workqueue_struct *cgroup_free_wq;

/*
 * Array of cgroup subsystem pointers, indexed by subsystem id.
 *
 * SUBSYS() is defined to emit one designated initializer of the form
 * [<name>_cgrp_id] = &<name>_cgrp_subsys, and <linux/cgroup_subsys.h> is
 * then included inside the array body.  The header is expected to invoke
 * SUBSYS() once per subsystem (its contents are not visible here).
 * SUBSYS is undefined again afterwards so it can be redefined below.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Array of cgroup subsystem names, built the same way: here SUBSYS()
 * stringifies its argument, so each slot holds the subsystem's name.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Static keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl().
 *
 * For every SUBSYS() invocation in <linux/cgroup_subsys.h> this defines two
 * keys, both initially true (DEFINE_STATIC_KEY_TRUE), and exports them
 * GPL-only:
 *   <name>_cgrp_subsys_enabled_key
 *   <name>_cgrp_subsys_on_dfl_key
 */
#define SUBSYS(_x)                                                                \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                        \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/* Pointers to the per-subsystem "enabled" keys, indexed by subsystem id. */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* Pointers to the per-subsystem "on_dfl" keys, indexed by subsystem id. */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* per-cpu rstat storage referenced by the default hierarchy's root cgroup */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/*
 * The three masks below are subsystem bitmasks: bit N stands for the
 * subsystem whose ID is N.
 */

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
/* NOTE(review): presumably the number of entries on cgroup_roots - confirm */
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/*
 * cgroup namespace for init task.  It references init_user_ns as its user
 * namespace and init_css_set as its root css_set.
 */
struct cgroup_namespace init_cgroup_ns = {
        .count                = REFCOUNT_INIT(2),
        .user_ns        = &init_user_ns,
        .ns.ops                = &cgroupns_operations,
        .ns.inum        = PROC_CGROUP_INIT_INO,
        .root_cset        = &init_css_set,
};

/*
 * Forward declarations of file-local objects and functions which are
 * referenced before their definitions in this translation unit.
 */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names, which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is a slower, static_key_enabled() based test indexed by @ssid.
 *
 * Return: %false when CGROUP_SUBSYS_COUNT is zero, otherwise the state of
 * the enabled key of the subsystem identified by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
        struct static_key_true *key;

        /* without any subsystem there is no key to look up */
        if (!CGROUP_SUBSYS_COUNT)
                return false;

        key = cgroup_subsys_enabled_key[ssid];
        return static_key_enabled(key);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
        return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
/*
 * Allocate an ID for @ptr from @idr in the range given by @start and @end.
 * The IDR is preloaded with @gfp_mask outside the lock; the allocation
 * itself is made under cgroup_idr_lock with direct reclaim masked off.
 * Returns whatever idr_alloc() returned.
 */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
                            gfp_t gfp_mask)
{
        const gfp_t locked_gfp = gfp_mask & ~__GFP_DIRECT_RECLAIM;
        int id;

        idr_preload(gfp_mask);

        spin_lock_bh(&cgroup_idr_lock);
        id = idr_alloc(idr, ptr, start, end, locked_gfp);
        spin_unlock_bh(&cgroup_idr_lock);

        idr_preload_end();

        return id;
}

/*
 * Replace the pointer stored under @id in @idr with @ptr while holding
 * cgroup_idr_lock.  Returns whatever idr_replace() returned.
 */
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
        void *res;

        spin_lock_bh(&cgroup_idr_lock);
        res = idr_replace(idr, ptr, id);
        spin_unlock_bh(&cgroup_idr_lock);

        return res;
}

/* Remove @id from @idr while holding cgroup_idr_lock. */
static void cgroup_idr_remove(struct idr *idr, int id)
{
        spin_lock_bh(&cgroup_idr_lock);
        idr_remove(idr, id);
        spin_unlock_bh(&cgroup_idr_lock);
}

/* does @cgrp itself have any populated css_set? (descendants not counted) */
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
        return cgrp->nr_populated_csets != 0;
}

/* @cgrp is threaded when its domain cgroup (->dom_cgrp) is another cgroup */
bool cgroup_is_threaded(struct cgroup *cgrp)
{
        return cgrp != cgrp->dom_cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);

        /*
         * Only the root qualifies.  It isn't under domain level resource
         * control, which exempts it from the no-internal-process
         * constraint, so it can be a thread root and a parent of resource
         * domains at the same time.
         */
        return !parent;
}

/* can @cgrp become a thread root? should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
        /* mixables don't care */
        if (cgroup_is_mixable(cgrp))
                return true;

        /*
         * Otherwise all of the following must hold:
         * - @cgrp isn't threaded itself (domain roots can't be nested
         *   under threaded),
         * - it has no populated domain children (children are either
         *   domain or threaded, not both),
         * - no domain controller is enabled in its subtree_control.
         */
        return !cgroup_is_threaded(cgrp) &&
               !cgrp->nr_populated_domain_children &&
               !(cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask);
}

/* is @cgrp root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
        /* thread root should be a domain */
        if (cgroup_is_threaded(cgrp))
                return false;

        /* a domain w/ threaded children is a thread root */
        if (cgrp->nr_threaded_children)
                return true;

        /*
         * So is a domain which has tasks and explicit threaded controllers
         * enabled.
         */
        return cgroup_has_tasks(cgrp) &&
               (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask);
}

/* a domain which isn't connected to the root w/o breakage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
        /* the cgroup itself can be a thread root, but must not be threaded */
        if (cgroup_is_threaded(cgrp))
                return false;

        /*
         * Walk the ancestors: none of them may be threaded, and none may
         * be a thread root unless it is mixable.
         */
        for (cgrp = cgroup_parent(cgrp); cgrp; cgrp = cgroup_parent(cgrp)) {
                if ((!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) ||
                    cgroup_is_threaded(cgrp))
                        return false;
        }

        return true;
}

/*
 * Subsystems visibly enabled on a cgroup.  For a non-root cgroup this is
 * the parent's subtree_control, limited to threaded controllers if @cgrp
 * is threaded.  For a root it is the root's subsys_mask, minus the
 * inhibited and implicit controllers when on the default hierarchy.
 */
static u16 cgroup_control(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask = cgrp->root->subsys_mask;

        if (!parent) {
                if (cgroup_on_dfl(cgrp))
                        mask &= ~(cgrp_dfl_inhibit_ss_mask |
                                  cgrp_dfl_implicit_ss_mask);
                return mask;
        }

        mask = parent->subtree_control;

        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/*
 * Subsystems enabled on a cgroup.  For a non-root cgroup this is the
 * parent's subtree_ss_mask, limited to threaded controllers if @cgrp is
 * threaded.  For a root it is the root's subsys_mask.
 */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask;

        if (!parent)
                return cgrp->root->subsys_mask;

        mask = parent->subtree_ss_mask;

        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        /* no subsystem requested: hand out the cgroup's own css */
        if (!ss)
                return &cgrp->self;

        return rcu_dereference_check(cgrp->subsys[ss->id],
                                     lockdep_is_held(&cgroup_mutex));
}

/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 * or is offline, %NULL is returned.
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
                                                     struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *found, *pinned = NULL;

        rcu_read_lock();
        found = cgroup_css(cgrp, ss);
        /* only hand the css out if an online reference could be taken */
        if (found && css_tryget_online(found))
                pinned = found;
        rcu_read_unlock();

        return pinned;
}

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.  Must be called with
 * cgroup_mutex held.
 *
 * Return: the effective css, or %NULL if neither @cgrp nor any of its
 * ancestors has @ss in its subsystem mask.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
                                                        struct cgroup_subsys *ss)
{
        lockdep_assert_held(&cgroup_mutex);

        if (!ss)
                return &cgrp->self;

        /*
         * This function is used while updating css associations and thus
         * can't test the csses directly.  Test ss_mask instead, climbing
         * towards the root until a cgroup with @ss enabled is found.
         */
        for (;;) {
                if (cgroup_ss_mask(cgrp) & (1 << ss->id))
                        return cgroup_css(cgrp, ss);

                cgrp = cgroup_parent(cgrp);
                if (!cgrp)
                        return NULL;
        }
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find the effective css of @cgrp for @ss.  The effective css is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is not mounted on the hierarchy @cgrp is on, the root
 * css is returned, so this function always returns a valid css.
 *
 * No reference is taken here.  The returned css is not guaranteed to be
 * online, and therefore it is the caller's responsibility to tryget a
 * reference for it.
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
                                         struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        /* climb until some cgroup on the path has a css for @ss */
        while (!(css = cgroup_css(cgrp, ss))) {
                cgrp = cgroup_parent(cgrp);
                if (!cgrp)
                        return init_css_set.subsys[ss->id];
        }

        return css;
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled and whose css could be pinned online.  If no such css is
 * found, e.g. because @ss is not mounted on the hierarchy @cgrp is on, the
 * css from init_css_set is returned, so this function always returns a
 * valid css.  The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
                                             struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();

        for (;;) {
                css = cgroup_css(cgrp, ss);
                if (css && css_tryget_online(css))
                        break;

                cgrp = cgroup_parent(cgrp);
                if (!cgrp) {
                        /* ran out of ancestors, fall back to init_css_set */
                        css = init_css_set.subsys[ss->id];
                        css_get(css);
                        break;
                }
        }

        rcu_read_unlock();
        return css;
}

/*
 * Take a reference on @cgrp through its self css.  Warns (once) if @cgrp
 * is already dead; the reference is taken regardless.
 */
static void cgroup_get_live(struct cgroup *cgrp)
{
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
        int count = 0;
        struct cgrp_cset_link *link;

        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += link->cset->nr_tasks;

        return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup
 * @cgrp: the cgroup in question
 *
 * Locked wrapper around __cgroup_task_count(): takes css_set_lock with
 * interrupts disabled for the duration of the count.
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
        int nr;

        spin_lock_irq(&css_set_lock);
        nr = __cgroup_task_count(cgrp);
        spin_unlock_irq(&css_set_lock);

        return nr;
}

/*
 * Return the css an open cgroup file belongs to: the css of the file's
 * subsystem on the cgroup owning the file, or the cgroup's self css for
 * files which have no subsystem.
 */
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
        struct cftype *cft = of_cft(of);
        struct cgroup *cgrp = of->kn->parent->priv;

        if (!cft->ss)
                return &cgrp->self;

        /*
         * This is open and unprotected implementation of cgroup_css().
         * of_css() is only called from a kernfs file operation which has
         * an active reference on the file.  Because all the subsystem
         * files are drained before a css is disassociated with a cgroup,
         * the matching css from the cgroup's subsys table is guaranteed to
         * be and stay valid until the enclosing operation is complete.
         */
        return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Subsystems for which @cgrp has no css are skipped.  The trailing "else"
 * makes the statement following the macro the loop body.
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                if (!((css) = rcu_dereference_check(                        \
                                (cgrp)->subsys[(ssid)],                        \
                                lockdep_is_held(&cgroup_mutex)))) { }        \
                else

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * The effective css is looked up with cgroup_e_css_by_mask(); subsystems
 * for which it returns %NULL are skipped.
 *
 * Should be called under cgroup_mutex, which cgroup_e_css_by_mask()
 * asserts.
 */
#define for_each_e_css(css, ssid, cgrp)                                            \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)            \
                if (!((css) = cgroup_e_css_by_mask(cgrp,                    \
                                                   cgroup_subsys[(ssid)]))) \
                        ;                                                    \
                else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * This macro opens a do { } while block and a loop body which are left
 * unbalanced on purpose; the iteration must be terminated with
 * while_each_subsys_mask().
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {                        \
        unsigned long __ss_mask = (ss_mask);                                \
        if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */        \
                (ssid) = 0;                                                \
                break;                                                        \
        }                                                                \
        for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {        \
                (ss) = cgroup_subsys[ssid];                                \
                {

/* closes the blocks opened by do_each_subsys_mask() */
#define while_each_subsys_mask()                                        \
                }                                                        \
        }                                                                \
} while (false)

/*
 * Iterate over child cgrps of @cgrp, skipping the ones which are dead.
 * cgroup_mutex should be held throughout iteration; this is asserted via
 * lockdep on every step.
 */
#define cgroup_for_each_live_child(child, cgrp)                                \
        list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       cgroup_is_dead(child); }))                        \
                        ;                                                \
                else

/*
 * Walk live descendants in preorder.  @d_css is the css cursor of the
 * underlying css_for_each_descendant_pre() walk over @cgrp's self css and
 * @dsct is set to the cgroup of each visited css.  Dead cgroups are
 * skipped.  cgroup_mutex must be held (asserted via lockdep).
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)                \
        css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * Walk live descendants in postorder.  @d_css is the css cursor of the
 * underlying css_for_each_descendant_post() walk over @cgrp's self css and
 * @dsct is set to the cgroup of each visited css.  Dead cgroups are
 * skipped.  cgroup_mutex must be held (asserted via lockdep).
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)                \
        css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
        .refcount                = REFCOUNT_INIT(1),
        /* its own domain cset, i.e. init_css_set is not a threaded cset */
        .dom_cset                = &init_css_set,
        /* all list heads start out empty */
        .tasks                        = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks                = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .dying_tasks                = LIST_HEAD_INIT(init_css_set.dying_tasks),
        .task_iters                = LIST_HEAD_INIT(init_css_set.task_iters),
        .threaded_csets                = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links                = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_src_preload_node        = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
        .mg_dst_preload_node        = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),

        /*
         * The following field is re-initialized when this cset gets linked
         * in cgroup_init().  However, let's initialize the field
         * statically too so that the default cgroup can be accessed safely
         * early during boot.
         */
        .dfl_cgrp                = &cgrp_dfl_root.cgrp,
};

/* number of css_sets in existence */
static int css_set_count        = 1;        /* 1 for init_css_set */

/* @cset is threaded when its domain css_set (->dom_cset) is another cset */
static bool css_set_threaded(struct css_set *cset)
{
        return cset != cset->dom_cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 *
 * Must be called with css_set_lock held.
 */
static bool css_set_populated(struct css_set *cset)
{
        lockdep_assert_held(&css_set_lock);

        /* unpopulated only when both the tasks and mg_tasks lists are empty */
        return !(list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_csets accordingly.
 * As long as the update flips the populated state of the cgroup being
 * updated, the change is propagated to its parent, where it is accounted
 * in nr_populated_threaded_children or nr_populated_domain_children
 * depending on whether the child is threaded.
 *
 * Whenever the populated state of a cgroup flips,
 * cgroup1_check_for_release() is invoked, the transition is traced and
 * the cgroup's events_file is notified so that userland can detect when a
 * cgroup and its descendants become populated or empty.
 *
 * Must be called with css_set_lock held.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
        const int delta = populated ? 1 : -1;
        struct cgroup *from = NULL;        /* child the change came up from */

        lockdep_assert_held(&css_set_lock);

        do {
                const bool before = cgroup_is_populated(cgrp);

                if (from && cgroup_is_threaded(from))
                        cgrp->nr_populated_threaded_children += delta;
                else if (from)
                        cgrp->nr_populated_domain_children += delta;
                else
                        cgrp->nr_populated_csets += delta;

                /* populated state didn't flip, ancestors are unaffected */
                if (cgroup_is_populated(cgrp) == before)
                        break;

                cgroup1_check_for_release(cgrp);
                TRACE_CGROUP_PATH(notify_populated, cgrp,
                                  cgroup_is_populated(cgrp));
                cgroup_file_notify(&cgrp->events_file);

                from = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.  Must be
 * called with css_set_lock held.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
        struct cgrp_cset_link *lnk;

        lockdep_assert_held(&css_set_lock);

        /* every cgroup linked to @cset sees the same transition */
        list_for_each_entry(lnk, &cset->cgrp_links, cgrp_link)
                cgroup_update_populated(lnk->cgrp, populated);
}

/*
 * @task is leaving @cset.  Advance every task iterator registered on
 * @cset->task_iters which may be pointing to it so that the iterator can
 * resume at the next position.  Advancing an iterator might remove it from
 * the list, hence the safe list walk.  See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
                                    struct task_struct *task)
{
        struct css_task_iter *iter, *next;

        list_for_each_entry_safe(iter, next, &cset->task_iters, iters_node)
                css_task_iter_skip(iter, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.  Must be called with
 * css_set_lock held.
 */
static void css_set_move_task(struct task_struct *task,
                              struct css_set *from_cset, struct css_set *to_cset,
                              bool use_mg_tasks)
{
        lockdep_assert_held(&css_set_lock);

        /* @to_cset is about to receive its first task */
        if (to_cset && !css_set_populated(to_cset))
                css_set_update_populated(to_cset, true);

        if (!from_cset) {
                /* a task without a css_set must not be on any cg_list */
                WARN_ON_ONCE(!list_empty(&task->cg_list));
        } else {
                WARN_ON_ONCE(list_empty(&task->cg_list));

                css_set_skip_task_iters(from_cset, task);
                list_del_init(&task->cg_list);
                /* @from_cset just lost its last task */
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
        }

        if (to_cset) {
                struct list_head *dst_list;

                /*
                 * We are synchronized through cgroup_threadgroup_rwsem
                 * against PF_EXITING setting such that we can't race
                 * against cgroup_exit()/cgroup_free() dropping the css_set.
                 */
                WARN_ON_ONCE(task->flags & PF_EXITING);

                cgroup_move_task(task, to_cset);

                dst_list = use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks;
                list_add_tail(&task->cg_list, dst_list);
        }
}

/*
 * Hash table for css_sets ("cgroup groups"), keyed by css_set_hash().
 * This improves the performance of finding an existing css_set.  The hash
 * doesn't (currently) take into account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS        7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
        unsigned long key = 0UL;
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                key += (unsigned long)css[i];
        key = (key >> 16) ^ key;

        return key;
}

/*
 * Drop a reference on @cset with css_set_lock held.  When the last
 * reference goes away, @cset is unlinked from the per-subsystem e_cset
 * lists and the hash table, the css references it holds are put, its
 * cgrp_cset_links are freed and, for a threaded css_set, the reference on
 * its domain css_set is dropped as well.  The css_set itself is freed
 * through kfree_rcu().
 */
void put_css_set_locked(struct css_set *cset)
{
        struct cgrp_cset_link *link, *tmp_link;
        struct cgroup_subsys *ss;
        int ssid;

        lockdep_assert_held(&css_set_lock);

        /* nothing more to do unless this was the last reference */
        if (!refcount_dec_and_test(&cset->refcount))
                return;

        /* a dying css_set shouldn't have threaded css_sets hanging off it */
        WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

        /* This css_set is dead. unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                css_put(cset->subsys[ssid]);
        }
        hash_del(&cset->hlist);
        css_set_count--;

        /* tear down the links on both the cgroup and the css_set side */
        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                /* cgroups without a parent (roots) are not put here */
                if (cgroup_parent(link->cgrp))
                        cgroup_put(link->cgrp);
                kfree(link);
        }

        /* a threaded css_set also drops its reference on the domain cset */
        if (css_set_threaded(cset)) {
                list_del(&cset->threaded_csets_node);
                put_css_set_locked(cset->dom_cset);
        }

        kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 *
 * A match requires that @cset's css pointers equal @template, that its
 * domain matches the domain of the resulting default-hierarchy cgroup and
 * that its linked cgroups match as described above.
 */
static bool compare_css_sets(struct css_set *cset,
                             struct css_set *old_cset,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
{
        struct cgroup *new_dfl_cgrp;
        struct list_head *l1, *l2;

        /*
         * On the default hierarchy, there can be csets which are
         * associated with the same set of cgroups but different csses.
         * Let's first ensure that csses match.
         */
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;


        /*
         * @cset's domain should match the default cgroup's.  The default
         * cgroup is @new_cgrp if the move happens on the default
         * hierarchy and @old_cset's default cgroup otherwise.
         */
        if (cgroup_on_dfl(new_cgrp))
                new_dfl_cgrp = new_cgrp;
        else
                new_dfl_cgrp = old_cset->dfl_cgrp;

        if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
                return false;

        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
         * share the same effective css, this comparison is always
         * necessary.
         */
        l1 = &cset->cgrp_links;
        l2 = &old_cset->cgrp_links;
        /* walk both link lists in lock-step, one hierarchy per step */
        while (1) {
                struct cgrp_cset_link *link1, *link2;
                struct cgroup *cgrp1, *cgrp2;

                l1 = l1->next;
                l2 = l2->next;
                /* See if we reached the end - both lists are equal length. */
                if (l1 == &cset->cgrp_links) {
                        BUG_ON(l2 != &old_cset->cgrp_links);
                        break;
                } else {
                        BUG_ON(l2 == &old_cset->cgrp_links);
                }
                /* Locate the cgroups associated with these links. */
                link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
                link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
                cgrp1 = link1->cgrp;
                cgrp2 = link2->cgrp;
                /* Hierarchies should be linked in the same order. */
                BUG_ON(cgrp1->root != cgrp2->root);

                /*
                 * If this hierarchy is the hierarchy of the cgroup
                 * that's changing, then we need to check that this
                 * css_set points to the new cgroup; if it's any other
                 * hierarchy, then this css_set should point to the
                 * same cgroup as the old css_set.
                 */
                if (cgrp1->root == new_cgrp->root) {
                        if (cgrp1 != new_cgrp)
                                return false;
                } else {
                        if (cgrp1 != cgrp2)
                                return false;
                }
        }
        return true;
}

/**
 * find_existing_css_set - fill in the wanted css array and look for a match
 * @old_cset: the css_set in use before the cgroup transition
 * @cgrp: the cgroup being moved into
 * @template: out param receiving the wanted csses, should be clear on entry
 *
 * Returns the css_set from css_set_table that compare_css_sets() accepts
 * for the computed @template, or %NULL if there is none.  @template is
 * filled in either way.
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
                                        struct cgroup *cgrp,
                                        struct cgroup_subsys_state *template[])
{
        struct cgroup_root *root = cgrp->root;
        struct cgroup_subsys *ss;
        struct css_set *candidate;
        unsigned long key;
        int ssid;

        /*
         * Work out which css each subsystem should contribute to the new
         * css_set.  Subsystems bound to @cgrp's hierarchy use the effective
         * css of @cgrp; all others keep the css from @old_cset.  While
         * subsystems can change globally, the entries here won't change,
         * so no locking is needed.
         */
        for_each_subsys(ss, ssid) {
                template[ssid] = (root->subsys_mask & (1UL << ssid)) ?
                        cgroup_e_css_by_mask(cgrp, ss) :
                        old_cset->subsys[ssid];
        }

        key = css_set_hash(template);
        hash_for_each_possible(css_set_table, candidate, hlist, key) {
                /* The first css_set that matches is the one we need */
                if (compare_css_sets(candidate, old_cset, cgrp, template))
                        return candidate;
        }

        /* No existing css_set matched */
        return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
        struct cgrp_cset_link *link, *tmp_link;

        list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
                list_del(&link->cset_link);
                kfree(link);
        }
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: how many links to allocate
 * @tmp_links: list_head that receives the allocated links
 *
 * Initializes @tmp_links and chains @count zeroed cgrp_cset_link
 * structures on it through ->cset_link.  On allocation failure every link
 * allocated so far is freed again.  Returns 0 on success or -ENOMEM.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
        int remaining;

        INIT_LIST_HEAD(tmp_links);

        for (remaining = count; remaining > 0; remaining--) {
                struct cgrp_cset_link *link;

                link = kzalloc(sizeof(*link), GFP_KERNEL);
                if (link) {
                        list_add(&link->cset_link, tmp_links);
                        continue;
                }

                /* Out of memory: undo the partial allocation */
                free_cgrp_cset_links(tmp_links);
                return -ENOMEM;
        }
        return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Consumes the first link on @tmp_links, which must not be empty, and
 * records @cgrp as @cset's default-hierarchy cgroup when @cgrp is on the
 * default hierarchy.
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
                         struct cgroup *cgrp)
{
        struct cgrp_cset_link *link;

        BUG_ON(list_empty(tmp_links));

        link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
        link->cgrp = cgrp;
        link->cset = cset;

        if (cgroup_on_dfl(cgrp))
                cset->dfl_cgrp = cgrp;

        /*
         * Links always go to the tail of both lists so that the lists
         * stay in chronological order.
         */
        list_move_tail(&link->cset_link, &cgrp->cset_links);
        list_add_tail(&link->cgrp_link, &cset->cgrp_links);

        /* Only cgroups that have a parent get a reference taken here */
        if (cgroup_parent(cgrp))
                cgroup_get_live(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 *
 * If a matching css_set already exists in css_set_table it is returned
 * with an extra reference taken; otherwise a new one is allocated, linked
 * to its cgroups and added to the hash table.  Must be called with
 * cgroup_mutex held.  Returns %NULL if any allocation fails.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
                                    struct cgroup *cgrp)
{
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
        struct css_set *cset;
        struct list_head tmp_links;
        struct cgrp_cset_link *link;
        struct cgroup_subsys *ss;
        unsigned long key;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * First see if we already have a css_set that matches the desired
         * set.  The reference is taken under css_set_lock, together with
         * the lookup.
         */
        spin_lock_irq(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        if (cset)
                return cset;

        /* No match: build a new css_set from scratch */
        cset = kzalloc(sizeof(*cset), GFP_KERNEL);
        if (!cset)
                return NULL;

        /* Allocate all the cgrp_cset_link objects that we'll need */
        if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
                kfree(cset);
                return NULL;
        }

        /* The caller owns the initial reference */
        refcount_set(&cset->refcount, 1);
        /* A css_set is its own domain css_set unless threaded, see below */
        cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->dying_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
        INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_src_preload_node);
        INIT_LIST_HEAD(&cset->mg_dst_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);

        /*
         * Copy the set of subsystem state objects generated in
         * find_existing_css_set()
         */
        memcpy(cset->subsys, template, sizeof(cset->subsys));

        spin_lock_irq(&css_set_lock);
        /*
         * Add reference counts and links from the new css_set: one link per
         * cgroup @old_cset is linked to, with @cgrp substituted for the
         * cgroup on @cgrp's hierarchy.
         */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;

                if (c->root == cgrp->root)
                        c = cgrp;
                link_css_set(&tmp_links, cset, c);
        }

        /* Every preallocated link must have been consumed */
        BUG_ON(!list_empty(&tmp_links));

        css_set_count++;

        /* Add @cset to the hash table */
        key = css_set_hash(cset->subsys);
        hash_add(css_set_table, &cset->hlist, key);

        /*
         * Put @cset on the e_csets list of the cgroup owning each of its
         * csses and take a reference on every css.
         */
        for_each_subsys(ss, ssid) {
                struct cgroup_subsys_state *css = cset->subsys[ssid];

                list_add_tail(&cset->e_cset_node[ssid],
                              &css->cgroup->e_csets[ssid]);
                css_get(css);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * If @cset should be threaded, look up the matching dom_cset and
         * link them up.  We first fully initialize @cset then look for the
         * dom_cset.  It's simpler this way and safe as @cset is guaranteed
         * to stay empty until we return.
         */
        if (cgroup_is_threaded(cset->dfl_cgrp)) {
                struct css_set *dcset;

                dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
                if (!dcset) {
                        /* Drop the reference set up above */
                        put_css_set(cset);
                        return NULL;
                }

                spin_lock_irq(&css_set_lock);
                cset->dom_cset = dcset;
                list_add_tail(&cset->threaded_csets_node,
                              &dcset->threaded_csets);
                spin_unlock_irq(&css_set_lock);
        }

        return cset;
}

/*
 * Map a kernfs root to its cgroup_root: the kernfs root node's ->priv
 * holds the hierarchy's root cgroup, whose ->root is the result.
 */
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
        return ((struct cgroup *)kf_root->kn->priv)->root;
}

/*
 * Allocate a hierarchy id for @root from cgroup_hierarchy_idr and store it
 * in @root->hierarchy_id.  Must be called with cgroup_mutex held.
 * Returns 0 on success or the negative error from idr_alloc_cyclic().
 */
static int cgroup_init_root_id(struct cgroup_root *root)
{
        int new_id;

        lockdep_assert_held(&cgroup_mutex);

        new_id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0,
                                  GFP_KERNEL);
        if (new_id >= 0) {
                root->hierarchy_id = new_id;
                return 0;
        }

        return new_id;
}

/*
 * Remove @root's hierarchy id from cgroup_hierarchy_idr.  Must be called
 * with cgroup_mutex held.
 */
static void cgroup_exit_root_id(struct cgroup_root *root)
{
        lockdep_assert_held(&cgroup_mutex);

        idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

/* Free @root via kfree_rcu(), using its ->rcu member. */
void cgroup_free_root(struct cgroup_root *root)
{
        kfree_rcu(root, rcu);
}

/*
 * Tear down the hierarchy rooted at @root: rebind its subsystems to the
 * default hierarchy, release every css_set link to its root cgroup, take it
 * off the root list, give back its hierarchy id, then destroy the kernfs
 * root and free @root.  The hierarchy must have no cgroups left.
 */
static void cgroup_destroy_root(struct cgroup_root *root)
{
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;

        trace_cgroup_destroy_root(root);

        /*
         * NOTE(review): presumably returns with cgroup_mutex held, which is
         * what the mutex_unlock() below releases - confirm.
         */
        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

        /* Nothing may be left in the hierarchy at this point */
        BUG_ON(atomic_read(&root->nr_cgrps));
        BUG_ON(!list_empty(&cgrp->self.children));

        /* Rebind all subsystems back to the default hierarchy */
        WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

        /*
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
        spin_lock_irq(&css_set_lock);

        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                /* Unhook the link from both the cgroup and the css_set */
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }

        spin_unlock_irq(&css_set_lock);

        /* Only roots that made it onto the root list are counted */
        if (!list_empty(&root->root_list)) {
                list_del_rcu(&root->root_list);
                cgroup_root_count--;
        }

        cgroup_exit_root_id(root);

        mutex_unlock(&cgroup_mutex);

        kernfs_destroy_root(root->kf_root);
        cgroup_free_root(root);
}

/*
 * Look up the cgroup associated with the current task's cgroup namespace
 * on the specified hierarchy.
 *
 * Must be called with css_set_lock held.  Never returns NULL: a lookup
 * that finds no cgroup is treated as fatal, matching
 * cset_cgroup_from_root(), because cgroup_show_path() dereferences the
 * result without checking it.
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
        struct cgroup *res = NULL;
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        rcu_read_lock();

        cset = current->nsproxy->cgroup_ns->root_cset;
        if (cset == &init_css_set) {
                /* The initial css_set maps to the root cgroup of @root */
                res = &root->cgrp;
        } else if (root == &cgrp_dfl_root) {
                /* The default-hierarchy cgroup is cached in the css_set */
                res = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;

                /* Otherwise find the linked cgroup that belongs to @root */
                list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                        struct cgroup *c = link->cgrp;

                        if (c->root == root) {
                                res = c;
                                break;
                        }
                }
        }
        rcu_read_unlock();

        /* Fail loudly here instead of as a NULL dereference in the caller */
        BUG_ON(!res);
        return res;
}

/*
 * Return the cgroup that @cset is associated with on hierarchy @root.
 * Must be called with css_set_lock held.  Never returns NULL; failing to
 * find a cgroup triggers BUG().
 */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
{
        struct cgrp_cset_link *link;
        struct cgroup *found = NULL;

        lockdep_assert_held(&css_set_lock);

        if (cset == &init_css_set) {
                /* The initial css_set maps to the root cgroup of @root */
                found = &root->cgrp;
        } else if (root == &cgrp_dfl_root) {
                /* The default-hierarchy cgroup is cached in the css_set */
                found = cset->dfl_cgrp;
        } else {
                /* Walk the linked cgroups until one belongs to @root */
                list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                        if (link->cgrp->root != root)
                                continue;
                        found = link->cgrp;
                        break;
                }
        }

        BUG_ON(!found);
        return found;
}

/*
 * Return the cgroup for @task from the given hierarchy @root.
 *
 * Must be called with css_set_lock held to prevent the task's groups from
 * being modified.  Must also be called with either cgroup_mutex or the RCU
 * read lock held to prevent the cgroup root from being destroyed.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root)
{
        /*
         * No need to lock the task - since we hold css_set_lock the
         * task can't change groups.
         */
        return cset_cgroup_from_root(task_css_set(task), root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 */

/*
 * Forward declaration; cgroup_setup_root() takes the address of this for
 * the default hierarchy.  NOTE(review): presumably defined with its
 * callbacks later in this file - confirm.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

/*
 * Write the file name for control file @cft in @cgrp into @buf, which must
 * hold CGROUP_FILE_NAME_MAX bytes, and return @buf.
 *
 * Files that belong to a subsystem are named "<subsys>.<name>", with a
 * ".__DEBUG__." prefix for CFTYPE_DEBUG files.  The bare @cft->name is used
 * when there is no subsystem, when the file has CFTYPE_NO_PREFIX set, or
 * when the hierarchy has CGRP_ROOT_NOPREFIX set.
 */
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
                              char *buf)
{
        struct cgroup_subsys *ss = cft->ss;
        const char *dbg, *ss_name;

        /* Unprefixed case: just copy the name */
        if (!ss || (cft->flags & CFTYPE_NO_PREFIX) ||
            (cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
                strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
                return buf;
        }

        dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
        /* The default hierarchy uses ->name, other hierarchies ->legacy_name */
        ss_name = cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name;

        snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
                 dbg, ss_name, cft->name);
        return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
        umode_t mode = 0;

        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;

        if (cft->write_u64 || cft->write_s64 || cft->write) {
                if (cft->flags & CFTYPE_WORLD_WRITABLE)
                        mode |= S_IWUGO;
                else
                        mode |= S_IWUSR;
        }

        return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 * Must be called with cgroup_mutex held.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
        struct cgroup_subsys *ss;
        u16 mask, prev_mask;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        mask = subtree_control | cgrp_dfl_implicit_ss_mask;

        /* Keep pulling in dependencies until the mask stops changing */
        do {
                prev_mask = mask;

                do_each_subsys_mask(ss, ssid, prev_mask) {
                        mask |= ss->depends_on;
                } while_each_subsys_mask();

                /*
                 * Mask out subsystems which aren't available.  This can
                 * happen only if some depended-upon subsystems were bound
                 * to non-default hierarchies.
                 */
                mask &= this_ss_mask;
        } while (mask != prev_mask);

        return mask;
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * Undoes cgroup_kn_lock_live() and should be invoked before the method
 * finishes if locking succeeded.  Once this function returns, the cgroup
 * returned by cgroup_kn_lock_live() may become inaccessible at any time.
 * A caller that intends to keep accessing the cgroup should pin it before
 * invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
        /* The cgroup hangs off the directory node: @kn itself or its parent */
        struct kernfs_node *dir_kn =
                (kernfs_type(kn) == KERNFS_DIR) ? kn : kn->parent;
        struct cgroup *cgrp = dir_kn->priv;

        mutex_unlock(&cgroup_mutex);

        kernfs_unbreak_active_protection(kn);
        cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * To be used by a cgroup kernfs method currently servicing @kn.  Breaks
 * the active protection, performs cgroup locking and verifies that the
 * associated cgroup is alive.  Returns the cgroup if alive; otherwise,
 * %NULL.  A successful return should be undone by a matching
 * cgroup_kn_unlock() invocation.  If @drain_offline is %true, the cgroup
 * is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
        struct kernfs_node *dir_kn;
        struct cgroup *cgrp;

        /* The cgroup hangs off the directory node: @kn itself or its parent */
        dir_kn = (kernfs_type(kn) == KERNFS_DIR) ? kn : kn->parent;
        cgrp = dir_kn->priv;

        /*
         * We're gonna grab cgroup_mutex which nests outside kernfs
         * active_ref.  cgroup liveliness check alone provides enough
         * protection against removal.  Ensure @cgrp stays accessible and
         * break the active_ref protection.
         */
        if (!cgroup_tryget(cgrp))
                return NULL;
        kernfs_break_active_protection(kn);

        if (drain_offline)
                cgroup_lock_and_drain_offline(cgrp);
        else
                mutex_lock(&cgroup_mutex);

        /* The cgroup died meanwhile: undo everything and report failure */
        if (cgroup_is_dead(cgrp)) {
                cgroup_kn_unlock(kn);
                return NULL;
        }

        return cgrp;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
        char name[CGROUP_FILE_NAME_MAX];

        lockdep_assert_held(&cgroup_mutex);

        if (cft->file_offset) {
                struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
                struct cgroup_file *cfile = (void *)css + cft->file_offset;

                spin_lock_irq(&cgroup_file_kn_lock);
                cfile->kn = NULL;
                spin_unlock_irq(&cgroup_file_kn_lock);

                del_timer_sync(&cfile->notify_timer);
        }

        kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 *
 * Does nothing unless @css is currently marked CSS_VISIBLE.
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts;

        if (!(css->flags & CSS_VISIBLE))
                return;

        css->flags &= ~CSS_VISIBLE;

        /* A css with a subsystem: remove every cftype set of that subsystem */
        if (css->ss) {
                list_for_each_entry(cfts, &css->ss->cfts, node)
                        cgroup_addrm_files(css, cgrp, cfts, false);
                return;
        }

        /* No subsystem: remove the base files of the matching hierarchy type */
        cfts = cgroup_on_dfl(cgrp) ? cgroup_base_files : cgroup1_base_files;
        cgroup_addrm_files(css, cgrp, cfts, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * Does nothing and returns 0 if @css is already CSS_VISIBLE or the cgroup
 * has no kernfs node.  Returns 0 on success or a negative error.  On
 * failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts, *failed_cfts;
        int ret;

        if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
                return 0;

        if (css->ss) {
                /* Add each cftype set of the subsystem, unwinding on error */
                list_for_each_entry(cfts, &css->ss->cfts, node) {
                        ret = cgroup_addrm_files(css, cgrp, cfts, true);
                        if (ret < 0) {
                                failed_cfts = cfts;
                                goto err;
                        }
                }
        } else {
                /* No subsystem: add the base files of the hierarchy type */
                cfts = cgroup_on_dfl(cgrp) ?
                        cgroup_base_files : cgroup1_base_files;

                ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
                if (ret < 0)
                        return ret;
        }

        css->flags |= CSS_VISIBLE;

        return 0;
err:
        /* Remove the sets that were added before the one that failed */
        list_for_each_entry(cfts, &css->ss->cfts, node) {
                if (cfts == failed_cfts)
                        break;
                cgroup_addrm_files(css, cgrp, cfts, false);
        }
        return ret;
}

/**
 * rebind_subsystems - move subsystems to a different hierarchy root
 * @dst_root: the hierarchy root the subsystems are moved to
 * @ss_mask: bitmask of the ids of the subsystems to move
 *
 * Must be called with cgroup_mutex held.  Returns -EBUSY, before anything
 * has been changed, if any subsystem in @ss_mask cannot be moved;
 * otherwise returns 0.  A cgroup_apply_control() failure on the
 * destination is only logged and does not change the return value.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        int ssid, ret;
        u16 dfl_disable_ss_mask = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* Pass 1: validate every requested subsystem before touching state */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /*
                 * If @ss has non-root csses attached to it, can't move.
                 * If @ss is an implicit controller, it is exempt from this
                 * rule and can be stolen.
                 */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
                    !ss->implicit_on_dfl)
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;

                /*
                 * Collect ssid's that need to be disabled from default
                 * hierarchy.
                 */
                if (ss->root == &cgrp_dfl_root)
                        dfl_disable_ss_mask |= 1 << ssid;

        } while_each_subsys_mask();

        if (dfl_disable_ss_mask) {
                struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

                /*
                 * Controllers from default hierarchy that need to be rebound
                 * are all disabled together in one go.
                 */
                cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
                WARN_ON(cgroup_apply_control(scgrp));
                cgroup_finalize_control(scgrp, 0);
        }

        /* Pass 2: move each subsystem from its current root to @dst_root */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset, *cset_pos;
                struct css_task_iter *it;

                /* The source must have a css for @ss, the destination none */
                WARN_ON(!css || cgroup_css(dcgrp, ss));

                if (src_root != &cgrp_dfl_root) {
                        /* disable from the source */
                        src_root->subsys_mask &= ~(1 << ssid);
                        WARN_ON(cgroup_apply_control(scgrp));
                        cgroup_finalize_control(scgrp, 0);
                }

                /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;

                spin_lock_irq(&css_set_lock);
                css->cgroup = dcgrp;
                WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
                list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
                                         e_cset_node[ss->id]) {
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                        /*
                         * All css_sets of scgrp are moved to dcgrp together
                         * and in the same order, so in-flight iterators must
                         * be patched to keep iterating correctly.  Since an
                         * iterator is always advanced right away and
                         * finishes when it->cset_pos meets it->cset_head,
                         * updating it->cset_head alone is enough here.
                         */
                        list_for_each_entry(it, &cset->task_iters, iters_node)
                                if (it->cset_head == &scgrp->e_csets[ss->id])
                                        it->cset_head = &dcgrp->e_csets[ss->id];
                }
                spin_unlock_irq(&css_set_lock);

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* Past the point of no return: a failure here is only logged */
                ret = cgroup_apply_control(dcgrp);
                if (ret)
                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
                                ss->name, ret);

                /* Give the subsystem a chance to react to its new root */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}

/*
 * Emit into @sf the path of @kf_node relative to the cgroup that the
 * current task's cgroup namespace has on the hierarchy of @kf_root.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ERANGE if the path does not fit in PATH_MAX, or whatever non-positive
 * value kernfs_path_from_node() returned.
 */
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root)
{
        struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
        struct cgroup *ns_cgroup;
        char *path;
        int ret;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        spin_lock_irq(&css_set_lock);
        ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
        ret = kernfs_path_from_node(kf_node, ns_cgroup->kn, path, PATH_MAX);
        spin_unlock_irq(&css_set_lock);

        if (ret >= PATH_MAX) {
                ret = -ERANGE;
        } else if (ret > 0) {
                /* Escape whitespace and backslashes in the emitted path */
                seq_escape(sf, path, " \t\n\\");
                ret = 0;
        }

        kfree(path);
        return ret;
}

enum cgroup2_param {
        Opt_nsdelegate,
        Opt_memory_localevents,
        Opt_memory_recursiveprot,
        nr__cgroup2_params
};

/*
 * Mount option table handed to fs_parse() by cgroup2_parse_param().  Every
 * option is declared with fsparam_flag(); the empty entry ends the table.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",                Opt_nsdelegate),
        fsparam_flag("memory_localevents",        Opt_memory_localevents),
        fsparam_flag("memory_recursiveprot",        Opt_memory_recursiveprot),
        {}
};

/*
 * Parse one cgroup2 mount parameter and record the matching
 * CGRP_ROOT_* flag in the cgroup fs context of @fc.
 *
 * Returns 0 on success, the negative value from fs_parse() if parsing
 * fails, or -EINVAL for an option this function does not handle.
 */
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        struct fs_parse_result result;
        unsigned int flag;
        int opt;

        opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        /* Translate the option into the root flag it turns on */
        switch (opt) {
        case Opt_nsdelegate:
                flag = CGRP_ROOT_NS_DELEGATE;
                break;
        case Opt_memory_localevents:
                flag = CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                break;
        case Opt_memory_recursiveprot:
                flag = CGRP_ROOT_MEMORY_RECURSIVE_PROT;
                break;
        default:
                return -EINVAL;
        }

        ctx->flags |= flag;
        return 0;
}

/*
 * Make the nsdelegate, memory_localevents and memory_recursiveprot flags of
 * the default hierarchy root mirror @root_flags: each flag is set when
 * present in @root_flags and cleared otherwise.  Only takes effect when the
 * current task is in the initial cgroup namespace.
 */
static void apply_cgroup_root_flags(unsigned int root_flags)
{
        /* Tasks outside the initial cgroup namespace can't change these */
        if (current->nsproxy->cgroup_ns != &init_cgroup_ns)
                return;

        if (root_flags & CGRP_ROOT_NS_DELEGATE)
                cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

        if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

        if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}

/*
 * Append the mount options currently set on the default hierarchy root to
 * @seq, each preceded by a comma.  @kf_root is not used.  Always returns 0.
 */
static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
                seq_puts(seq, ",nsdelegate");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                seq_puts(seq, ",memory_localevents");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                seq_puts(seq, ",memory_recursiveprot");
        return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
        struct cgroup_subsys *ss;
        int ssid;

        INIT_LIST_HEAD(&cgrp->self.sibling);
        INIT_LIST_HEAD(&cgrp->self.children);
        INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
        cgrp->dom_cgrp = cgrp;
        cgrp->max_descendants = INT_MAX;
        cgrp->max_depth = INT_MAX;
        INIT_LIST_HEAD(&cgrp->rstat_css_list);
        prev_cputime_init(&cgrp->prev_cputime);

        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

        init_waitqueue_head(&cgrp->offline_waitq);
        INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

/*
 * Initialize the cgroup_root referenced by @ctx->root and its embedded
 * root cgroup from the settings collected in @ctx: flags, and when
 * present the release agent path, the hierarchy name and the
 * cpuset clone_children setting.
 */
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
        struct cgroup_root *root = ctx->root;
        struct cgroup *root_cgrp = &root->cgrp;

        INIT_LIST_HEAD_RCU(&root->root_list);
        /* The root cgroup itself counts as the first cgroup */
        atomic_set(&root->nr_cgrps, 1);
        root->flags = ctx->flags;

        root_cgrp->root = root;
        init_cgroup_housekeeping(root_cgrp);

        if (ctx->release_agent)
                strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
        if (ctx->name)
                strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
        if (ctx->cpuset_clone_children)
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root_cgrp->flags);
}

/*
 * Bring @root online: initialise the root cgroup's refcount, allocate a
 * hierarchy id, create the kernfs root, populate the root directory, bind
 * the subsystems in @ss_mask via rebind_subsystems(), publish @root on
 * cgroup_roots and link the root cgroup into every existing css_set.
 *
 * Must be called with cgroup_mutex held.  Returns 0 on success, otherwise
 * the error reported by the step that failed; everything set up before
 * that step is undone through the labels at the end of the function.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                              0, GFP_KERNEL);
        if (ret)
                goto out;

        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us.  Later rebinding may disable
         * controllers on the default hierarchy and thus create new csets,
         * which can't be more than the existing ones.  Allocate 2x.
         */
        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto cancel_ref;

        /* the default hierarchy and all other roots use different syscall ops */
        kf_sops = root == &cgrp_dfl_root ?
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
                                           KERNFS_ROOT_SUPPORT_EXPORTOP |
                                           KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = root->kf_root->kn;
        /* the root cgroup is expected to get inode number 1 */
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
        root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto destroy_root;

        /* bpf inheritance is only set up for the default hierarchy; failure only warns */
        if (root == &cgrp_dfl_root) {
                ret = cgroup_bpf_inherit(root_cgrp);
                WARN_ON_ONCE(ret);
        }

        trace_cgroup_setup_root(root);

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add_rcu(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
        spin_unlock_irq(&css_set_lock);

        /* a freshly set up root must consist of the root cgroup only */
        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        /* success: skip the unwind labels, but still free unused links at out */
        ret = 0;
        goto out;

        /* error unwinding, in reverse order of setup */
destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
cancel_ref:
        percpu_ref_exit(&root_cgrp->self.refcnt);
out:
        /* runs on every path: release whatever links are still on tmp_links */
        free_cgrp_cset_links(&tmp_links);
        return ret;
}

/*
 * Obtain the superblock/root dentry for the hierarchy in ctx->root through
 * kernfs_get_tree().  For a mount from a non-init cgroup namespace the
 * root dentry in @fc is replaced by the dentry of the namespace's root
 * cgroup in this hierarchy.
 *
 * Returns 0 on success or the error from kernfs_get_tree() or
 * kernfs_node_dentry().
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

        ctx->kfc.root = ctx->root->kf_root;
        /* the superblock magic depends on which filesystem type is mounted */
        if (fc->fs_type == &cgroup2_fs_type)
                ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
        else
                ctx->kfc.magic = CGROUP_SUPER_MAGIC;
        ret = kernfs_get_tree(fc);

        /*
         * In non-init cgroup namespace, instead of root cgroup's dentry,
         * we return the dentry corresponding to the cgroupns->root_cgrp.
         */
        if (!ret && ctx->ns != &init_cgroup_ns) {
                struct dentry *nsdentry;
                struct super_block *sb = fc->root->d_sb;
                struct cgroup *cgrp;

                /* the cset -> cgroup lookup is done under both locks */
                mutex_lock(&cgroup_mutex);
                spin_lock_irq(&css_set_lock);

                cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

                spin_unlock_irq(&css_set_lock);
                mutex_unlock(&cgroup_mutex);

                nsdentry = kernfs_node_dentry(cgrp->kn, sb);
                /* the hierarchy root dentry is replaced below, drop it */
                dput(fc->root);
                if (IS_ERR(nsdentry)) {
                        /* lookup failed: give up the superblock and leave fc->root NULL */
                        deactivate_locked_super(sb);
                        ret = PTR_ERR(nsdentry);
                        nsdentry = NULL;
                }
                fc->root = nsdentry;
        }

        /*
         * Drop a reference on the root cgroup unless kernfs created a new
         * superblock (presumably the reference the caller took before
         * calling in; cgroup_get_tree() does so - verify other callers).
         */
        if (!ctx->kfc.new_sb_created)
                cgroup_put(&ctx->root->cgrp);

        return ret;
}

/*
 * Destroy a cgroup filesystem context.
 */
static void cgroup_fs_context_free(struct fs_context *fc)
{
        struct cgroup_fs_context *cfc = cgroup_fc2context(fc);

        /* strings owned by the context */
        kfree(cfc->release_agent);
        kfree(cfc->name);

        /* namespace reference taken in cgroup_init_fs_context() */
        put_cgroup_ns(cfc->ns);

        /* let kernfs release its part before the container goes away */
        kernfs_free_fs_context(fc);
        kfree(cfc);
}

/*
 * ->get_tree for cgroup2: mark the default hierarchy visible, pin its root
 * cgroup, mount it through cgroup_do_get_tree() and, on success only,
 * apply the root flags from the context.
 */
static int cgroup_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int err;

        cgrp_dfl_visible = true;
        cgroup_get_live(&cgrp_dfl_root.cgrp);
        ctx->root = &cgrp_dfl_root;

        err = cgroup_do_get_tree(fc);
        if (err)
                return err;

        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

/* fs_context operations installed when the context is for cgroup2_fs_type */
static const struct fs_context_operations cgroup_fs_context_ops = {
        .parse_param        = cgroup2_parse_param,
        .get_tree           = cgroup_get_tree,
        .reconfigure        = cgroup_reconfigure,
        .free               = cgroup_fs_context_free,
};

/* fs_context operations installed for every fs_type other than cgroup2 */
static const struct fs_context_operations cgroup1_fs_context_ops = {
        .parse_param        = cgroup1_parse_param,
        .get_tree           = cgroup1_get_tree,
        .reconfigure        = cgroup1_reconfigure,
        .free               = cgroup_fs_context_free,
};

/*
 * Initialise the cgroup filesystem creation/reconfiguration context.  Notably,
 * we select the namespace we're going to use.
 */
static int cgroup_init_fs_context(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        /* pin the mounting task's cgroup namespace for the context lifetime */
        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);

        /* kernfs owns fs_private; the cgroup context embeds it as ->kfc */
        fc->fs_private = &ctx->kfc;
        fc->ops = (fc->fs_type == &cgroup2_fs_type) ?
                        &cgroup_fs_context_ops : &cgroup1_fs_context_ops;

        /* replace the context's user namespace with the cgroup ns owner's */
        put_user_ns(fc->user_ns);
        fc->user_ns = get_user_ns(ctx->ns->user_ns);
        fc->global = true;

        return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
        struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
        struct cgroup_root *root = cgroup_root_from_kf(kf_root);

        /*
         * If @root doesn't have any children, start killing it.
         * This prevents new mounts by disabling percpu_ref_tryget_live().
         * cgroup_mount() may wait for @root's release.
         *
         * And don't kill the default root.
         */
        if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
            !percpu_ref_is_dying(&root->cgrp.self.refcnt))
                percpu_ref_kill(&root->cgrp.self.refcnt);
        cgroup_put(&root->cgrp);
        kernfs_kill_sb(sb);
}

/* the "cgroup" filesystem type; takes the cgroup1 parameter table */
struct file_system_type cgroup_fs_type = {
        .name                   = "cgroup",
        .fs_flags               = FS_USERNS_MOUNT,
        .parameters             = cgroup1_fs_parameters,
        .init_fs_context        = cgroup_init_fs_context,
        .kill_sb                = cgroup_kill_sb,
};

/* the "cgroup2" filesystem type; takes the cgroup2 parameter table */
static struct file_system_type cgroup2_fs_type = {
        .name                   = "cgroup2",
        .fs_flags               = FS_USERNS_MOUNT,
        .parameters             = cgroup2_fs_parameters,
        .init_fs_context        = cgroup_init_fs_context,
        .kill_sb                = cgroup_kill_sb,
};

#ifdef CONFIG_CPUSETS
/* "cpuset" contexts provide only ->free and ->get_tree (the cgroup1 one) */
static const struct fs_context_operations cpuset_fs_context_ops = {
        .free               = cgroup_fs_context_free,
        .get_tree           = cgroup1_get_tree,
};

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static int cpuset_init_fs_context(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx;
        char *default_agent;
        int ret;

        default_agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);

        ret = cgroup_init_fs_context(fc);
        if (ret) {
                kfree(default_agent);
                return ret;
        }

        /* override the ops chosen by cgroup_init_fs_context() */
        fc->ops = &cpuset_fs_context_ops;

        /* preset what a "cpuset" mount implies: cpuset only, no file prefix */
        ctx = cgroup_fc2context(fc);
        ctx->release_agent = default_agent;
        ctx->flags |= CGRP_ROOT_NOPREFIX;
        ctx->subsys_mask = 1 << cpuset_cgrp_id;

        /* switch the context over to the "cgroup" filesystem type */
        get_filesystem(&cgroup_fs_type);
        put_filesystem(fc->fs_type);
        fc->fs_type = &cgroup_fs_type;

        return 0;
}

/* legacy "cpuset" filesystem type; redirected to "cgroup" at context init */
static struct file_system_type cpuset_fs_type = {
        .name                   = "cpuset",
        .fs_flags               = FS_USERNS_MOUNT,
        .init_fs_context        = cpuset_init_fs_context,
};
#endif

/*
 * Write the path of @cgrp, relative to the cgroup that @ns's root css_set
 * maps to in @cgrp's hierarchy, into @buf.  The return value is whatever
 * kernfs_path_from_node() returns.  Callers in this file hold cgroup_mutex
 * and css_set_lock around the call.
 */
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns)
{
        struct cgroup *ns_root;

        ns_root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

        return kernfs_path_from_node(cgrp->kn, ns_root->kn, buf, buflen);
}

/*
 * Locking wrapper around cgroup_path_ns_locked(): takes cgroup_mutex and
 * css_set_lock for the duration of the lookup and passes the result
 * through unchanged.
 */
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns)
{
        int len;

        mutex_lock(&cgroup_mutex);
        spin_lock_irq(&css_set_lock);
        len = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
        spin_unlock_irq(&css_set_lock);
        mutex_unlock(&cgroup_mutex);

        return len;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
        struct cgroup_root *first_root;
        int id = 1;
        int len;

        mutex_lock(&cgroup_mutex);
        spin_lock_irq(&css_set_lock);

        /* first registered hierarchy whose id is at least 1 */
        first_root = idr_get_next(&cgroup_hierarchy_idr, &id);

        if (!first_root) {
                /* without any hierarchy every task lives in "/" */
                len = strlcpy(buf, "/", buflen);
        } else {
                struct cgroup *cgrp = task_cgroup_from_root(task, first_root);

                len = cgroup_path_ns_locked(cgrp, buf, buflen,
                                            &init_cgroup_ns);
        }

        spin_unlock_irq(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        return len;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

/**
 * cgroup_attach_lock - Lock for ->attach()
 * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
 *
 * cgroup migration sometimes needs to stabilize threadgroups against forks and
 * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
 * implementations (e.g. cpuset), also need to disable CPU hotplug.
 * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
 * lead to deadlocks.
 *
 * Bringing up a CPU may involve creating and destroying tasks which requires
 * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
 * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
 * write-locking threadgroup_rwsem, the locking order is reversed and we end up
 * waiting for an on-going CPU hotplug operation which in turn is waiting for
 * the threadgroup_rwsem to be released to create new tasks. For more details:
 *
 *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
 *
 * Resolve the situation by always acquiring cpus_read_lock() before optionally
 * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
 * CPU hotplug is disabled on entry.
 */
static void cgroup_attach_lock(bool lock_threadgroup)
{
        /* the CPU hotplug read lock is always taken, and always first */
        cpus_read_lock();

        if (!lock_threadgroup)
                return;

        percpu_down_write(&cgroup_threadgroup_rwsem);
}

/**
 * cgroup_attach_unlock - Undo cgroup_attach_lock()
 * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
 */
static void cgroup_attach_unlock(bool lock_threadgroup)
{
        /*
         * Release in the reverse order of cgroup_attach_lock(): the
         * threadgroup rwsem (only if it was write-locked) and then the
         * CPU hotplug read lock.
         */
        if (lock_threadgroup)
                percpu_up_write(&cgroup_threadgroup_rwsem);
        cpus_read_unlock();
}

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
                                    struct cgroup_mgctx *mgctx)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* @task either already exited or can't exit until the end */
        if (task->flags & PF_EXITING)
                return;

        /* cgroup_threadgroup_rwsem protects racing against forks */
        WARN_ON_ONCE(list_empty(&task->cg_list));

        cset = task_css_set(task);
        if (!cset->mg_src_cgrp)
                return;

        mgctx->tset.nr_tasks++;

        css_set_skip_task_iters(cset, task);
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
                              &mgctx->tset.src_csets);
        if (list_empty(&cset->mg_dst_cset->mg_node))
                list_add_tail(&cset->mg_dst_cset->mg_node,
                              &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp)
{
        /*
         * Rewind the cursor: no current task, and the first css_set on the
         * list @tset->csets currently points at.  The regular "next" step
         * then yields the first task.
         */
        tset->cur_task = NULL;
        tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

        return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp)
{
        struct css_set *cur = tset->cur_cset;
        struct task_struct *pos = tset->cur_task;

        /* walk css_sets until the cursor wraps back to the list head */
        for (; &cur->mg_node != tset->csets;
             cur = list_next_entry(cur, mg_node), pos = NULL) {
                struct css_set *css_owner;

                /* first task of a fresh css_set, otherwise the following one */
                if (pos)
                        pos = list_next_entry(pos, cg_list);
                else
                        pos = list_first_entry(&cur->mg_tasks,
                                               struct task_struct, cg_list);

                /* ran off the end of this css_set's mg_tasks: try the next */
                if (&pos->cg_list == &cur->mg_tasks)
                        continue;

                tset->cur_cset = cur;
                tset->cur_task = pos;

                /*
                 * Iteration happens both before and after the commit in
                 * cgroup_migrate_execute().  A css_set with ->mg_dst_cset
                 * set is a source, so the destination css comes from its
                 * target; otherwise it comes from the css_set itself.
                 */
                css_owner = cur->mg_dst_cset ? cur->mg_dst_cset : cur;
                *dst_cssp = css_owner->subsys[tset->ssid];

                return pos;
        }

        return NULL;
}

/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
        struct cgroup_taskset *tset = &mgctx->tset;
        struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;

        /* check that we can legitimately attach to the cgroup */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->can_attach) {
                                tset->ssid = ssid;
                                ret = ss->can_attach(tset);
                                if (ret) {
                                        /* remember where the cancel pass must stop */
                                        failed_ssid = ssid;
                                        goto out_cancel_attach;
                                }
                        }
                } while_each_subsys_mask();
        }

        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         * the new cgroup.  There are no failure cases after here, so this
         * is the commit point.
         */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(cset, &tset->src_csets, mg_node) {
                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
                        struct css_set *from_cset = task_css_set(task);
                        struct css_set *to_cset = cset->mg_dst_cset;

                        /* take the destination reference before moving the task */
                        get_css_set(to_cset);
                        to_cset->nr_tasks++;
                        css_set_move_task(task, from_cset, to_cset, true);
                        from_cset->nr_tasks--;
                        /*
                         * If the source or destination cgroup is frozen,
                         * the task might require to change its state.
                         */
                        cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
                                                    to_cset->dfl_cgrp);
                        /* the task no longer uses its old css_set */
                        put_css_set_locked(from_cset);

                }
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Migration is committed, all target tasks are now on dst_csets.
         * Nothing is sensitive to fork() after this point.  Notify
         * controllers that migration is complete.
         */
        tset->csets = &tset->dst_csets;

        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->attach) {
                                tset->ssid = ssid;
                                ss->attach(tset);
                        }
                } while_each_subsys_mask();
        }

        ret = 0;
        goto out_release_tset;

out_cancel_attach:
        /*
         * Call ->cancel_attach() for the subsystems visited before the one
         * whose ->can_attach() failed; the failing one itself is skipped.
         */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ssid == failed_ssid)
                                break;
                        if (ss->cancel_attach) {
                                tset->ssid = ssid;
                                ss->cancel_attach(tset);
                        }
                } while_each_subsys_mask();
        }
out_release_tset:
        /*
         * Reached on success and on failure alike: merge the dst csets
         * into the src list, return every task still on an mg_tasks list
         * to its css_set's ->tasks list and unlink each cset from the
         * taskset.
         */
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
                list_del_init(&cset->mg_node);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Re-initialize the cgroup_taskset structure in case it is reused
         * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
         * iteration.
         */
        tset->nr_tasks = 0;
        tset->csets    = &tset->src_csets;
        return ret;
}

/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
        /* hierarchies other than the default one have no restriction */
        if (!cgroup_on_dfl(dst_cgrp))
                return 0;

        /* the domain cgroup of @dst_cgrp must be a valid domain */
        if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * Mixable cgroups, cgroups that are or can become a thread root,
         * and threaded cgroups are exempt from the constraint below.
         */
        if (cgroup_is_mixable(dst_cgrp) ||
            cgroup_can_be_thread_root(dst_cgrp) ||
            cgroup_is_threaded(dst_cgrp))
                return 0;

        /* no-internal-process constraint: subtree_control must be empty */
        return dst_cgrp->subtree_control ? -EBUSY : 0;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
        struct css_set *pos, *next;

        lockdep_assert_held(&cgroup_mutex);

        spin_lock_irq(&css_set_lock);

        /* take each preloaded source off its list, reset it and unpin it */
        list_for_each_entry_safe(pos, next, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                list_del_init(&pos->mg_src_preload_node);
                pos->mg_dst_cset = NULL;
                pos->mg_dst_cgrp = NULL;
                pos->mg_src_cgrp = NULL;
                put_css_set_locked(pos);
        }

        /* same treatment for each preloaded destination */
        list_for_each_entry_safe(pos, next, &mgctx->preloaded_dst_csets,
                                 mg_dst_preload_node) {
                list_del_init(&pos->mg_dst_preload_node);
                pos->mg_dst_cset = NULL;
                pos->mg_dst_cgrp = NULL;
                pos->mg_src_cgrp = NULL;
                put_css_set_locked(pos);
        }

        spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
                            struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx)
{
        struct cgroup *from_cgrp;

        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);

        /*
         * A ->dead @src_cset belongs to one or more dead cgroups and has
         * no migratable tasks.  Drop out early so the rest of the
         * migration path never sees it.
         */
        if (src_cset->dead)
                return;

        from_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

        /* already queued as a migration source: nothing more to do */
        if (!list_empty(&src_cset->mg_src_preload_node))
                return;

        /* a css_set that is not preloaded must carry no migration state */
        WARN_ON(src_cset->mg_src_cgrp);
        WARN_ON(src_cset->mg_dst_cgrp);
        WARN_ON(!list_empty(&src_cset->mg_tasks));
        WARN_ON(!list_empty(&src_cset->mg_node));

        /* pin it, record both ends of the move and queue it */
        get_css_set(src_cset);
        src_cset->mg_src_cgrp = from_cgrp;
        src_cset->mg_dst_cgrp = dst_cgrp;
        list_add_tail(&src_cset->mg_src_preload_node,
                      &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends them
 * to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
        struct css_set *src_cset, *tmp_cset;

        lockdep_assert_held(&cgroup_mutex);

        /* look up the dst cset for each src cset and link it to src */
        list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                struct css_set *dst_cset;
                struct cgroup_subsys *ss;
                int ssid;

                dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
                /*
                 * A failed lookup is reported as -ENOMEM.  Sources handled
                 * so far stay on the preloaded lists; presumably the
                 * caller's cgroup_migrate_finish() cleans them up.
                 */
                if (!dst_cset)
                        return -ENOMEM;

                /* neither side may already be linked to a destination */
                WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

                /*
                 * If src cset equals dst, it's noop.  Drop the src.
                 * cgroup_migrate() will skip the cset too.  Note that we
                 * can't handle src == dst as some nodes are used by both.
                 */
                if (src_cset == dst_cset) {
                        src_cset->mg_src_cgrp = NULL;
                        src_cset->mg_dst_cgrp = NULL;
                        list_del_init(&src_cset->mg_src_preload_node);
                        /* one put for the preload pin, one for the lookup above */
                        put_css_set(src_cset);
                        put_css_set(dst_cset);
                        continue;
                }

                src_cset->mg_dst_cset = dst_cset;

                /*
                 * Queue the destination once; if it is already on the
                 * preloaded list the reference from this lookup is surplus.
                 */
                if (list_empty(&dst_cset->mg_dst_preload_node))
                        list_add_tail(&dst_cset->mg_dst_preload_node,
                                      &mgctx->preloaded_dst_csets);
                else
                        put_css_set(dst_cset);

                /* record every subsystem whose css differs between src and dst */
                for_each_subsys(ss, ssid)
                        if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
                                mgctx->ss_mask |= 1 << ssid;
        }

        return 0;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx)
{
        struct task_struct *t = leader;

        /*
         * Snapshot the target task(s) into @mgctx.  The RCU read lock
         * keeps tasks from being freed under the walk; ones that are
         * already PF_EXITING could otherwise disappear.
         */
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
        do {
                cgroup_migrate_add_task(t, mgctx);
                /* a single-task migration stops after @leader */
                if (!threadgroup)
                        break;
        } while_each_thread(leader, t);
        rcu_read_unlock();
        spin_unlock_irq(&css_set_lock);

        return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *t = leader;
        int err;

        /* step 1: register the css_set of every target task as a source */
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
        do {
                cgroup_migrate_add_src(task_css_set(t), dst_cgrp, &mgctx);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, t);
        rcu_read_unlock();
        spin_unlock_irq(&css_set_lock);

        /* step 2: resolve destinations, then migrate only if that worked */
        err = cgroup_migrate_prepare_dst(&mgctx);
        if (!err)
                err = cgroup_migrate(leader, threadgroup, &mgctx);

        /* step 3: the preloaded css_sets are released on every outcome */
        cgroup_migrate_finish(&mgctx);

        if (!err)
                TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

        return err;
}

/*
 * Parse a PID from @buf, take the attach locks and look up the task to
 * migrate.
 *
 * @buf: user-supplied string holding the PID; 0 selects the current task
 * @threadgroup: resolve the task to its thread group leader
 * @threadgroup_locked: out, whether cgroup_threadgroup_rwsem was
 *                      write-locked; hand it to cgroup_procs_write_finish()
 *
 * On success returns the task with a reference held and the attach lock
 * still taken.  On failure returns an ERR_PTR: -EINVAL for an unparsable
 * or negative PID (no lock taken, *@threadgroup_locked untouched) or for
 * a task that must not be migrated, -ESRCH when no task has that PID.  In
 * the latter two cases the attach lock has been dropped again and
 * *@threadgroup_locked is false.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *threadgroup_locked)
{
        struct task_struct *tsk;
        pid_t pid;

        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return ERR_PTR(-EINVAL);

        /*
         * If we migrate a single thread, we don't care about threadgroup
         * stability. If the thread is `current`, it won't exit(2) under our
         * hands or change PID through exec(2). We exclude
         * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
         * callers by cgroup_mutex.
         * Therefore, we can skip the global lock.
         */
        lockdep_assert_held(&cgroup_mutex);
        /* only "current task, single thread" skips the threadgroup rwsem */
        *threadgroup_locked = pid || threadgroup;
        cgroup_attach_lock(*threadgroup_locked);

        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
                        tsk = ERR_PTR(-ESRCH);
                        goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
        }

        if (threadgroup)
                tsk = tsk->group_leader;

        /*
         * kthreads may acquire PF_NO_SETAFFINITY during initialization.
         * If userland migrates such a kthread to a non-root cgroup, it can
         * become trapped in a cpuset, or RT kthread may be born in a
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                tsk = ERR_PTR(-EINVAL);
                goto out_unlock_threadgroup;
        }

        /* success: pin the task and keep the attach lock for the caller */
        get_task_struct(tsk);
        goto out_unlock_rcu;

out_unlock_threadgroup:
        /* failure after locking: undo cgroup_attach_lock() and report it */
        cgroup_attach_unlock(*threadgroup_locked);
        *threadgroup_locked = false;
out_unlock_rcu:
        rcu_read_unlock();
        return tsk;
}

/*
 * Counterpart of cgroup_procs_write_start(): drops the task reference,
 * releases the attach lock with the same flag it was taken with and then
 * invokes every subsystem's ->post_attach() callback, if it has one.
 */
void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* drop the reference taken by cgroup_procs_write_start() */
        put_task_struct(task);

        cgroup_attach_unlock(threadgroup_locked);

        for_each_subsys(ss, ssid) {
                if (!ss->post_attach)
                        continue;
                ss->post_attach();
        }
}

/*
 * Emit the names of the subsystems selected by @ss_mask on a single line,
 * separated by single spaces.  Nothing at all (not even a newline) is
 * printed when no subsystem is visited.
 */
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
        struct cgroup_subsys *ss;
        int nr_printed = 0;
        int ssid;

        do_each_subsys_mask(ss, ssid, ss_mask) {
                /* separator goes before every name except the first */
                if (nr_printed++)
                        seq_putc(seq, ' ');
                seq_puts(seq, ss->name);
        } while_each_subsys_mask();

        if (nr_printed)
                seq_putc(seq, '\n');
}

/* print the controllers the parent makes available, i.e. cgroup_control() */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, cgroup_control(seq_css(seq)->cgroup));

        return 0;
}

/* print the controllers this cgroup enables for its children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, seq_css(seq)->cgroup->subtree_control);

        return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * Called after @cgrp's control masks changed.  Gathers every css_set linked
 * to a live cgroup in @cgrp's subtree as a migration source, prepares the
 * matching destination css_sets and migrates all tasks of the source
 * css_sets to them.
 *
 * Must be called with cgroup_mutex held.  Returns 0 on success or the
 * error from cgroup_migrate_prepare_dst() / cgroup_migrate_execute().
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;
        struct css_set *cset;
        bool lock_threadgroup;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* collect all csets currently attached to @cgrp's subtree */
        spin_lock_irq(&css_set_lock);
        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                struct cgrp_cset_link *lnk;

                list_for_each_entry(lnk, &pos->cset_links, cset_link) {
                        cgroup_migrate_add_src(lnk->cset, pos, &mgctx);
                }
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * threadgroup_rwsem has to be write-locked while tasks migrate.
         * With no source csets there is nothing to migrate, so the
         * write-locking can be skipped safely.
         */
        lock_threadgroup = !list_empty(&mgctx.preloaded_src_csets);
        cgroup_attach_lock(lock_threadgroup);

        /* NULL dst indicates self on default hierarchy */
        err = cgroup_migrate_prepare_dst(&mgctx);
        if (!err) {
                spin_lock_irq(&css_set_lock);
                list_for_each_entry(cset, &mgctx.preloaded_src_csets,
                                    mg_src_preload_node) {
                        struct task_struct *t, *next;

                        /* every task of a source cset has to move */
                        list_for_each_entry_safe(t, next, &cset->tasks,
                                                 cg_list) {
                                cgroup_migrate_add_task(t, &mgctx);
                        }
                }
                spin_unlock_irq(&css_set_lock);

                err = cgroup_migrate_execute(&mgctx);
        }

        /* cleanup runs on both the success and the failure path */
        cgroup_migrate_finish(&mgctx);
        cgroup_attach_unlock(lock_threadgroup);
        return err;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 *
 * The function may sleep.  It returns with cgroup_mutex held, and only
 * after a complete walk of the subtree found no css whose refcnt is dying.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
        __acquires(&cgroup_mutex)
{
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        struct cgroup_subsys *ss;
        int ssid;

restart:
        mutex_lock(&cgroup_mutex);

        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
                        DEFINE_WAIT(wait);

                        /* nothing to drain for this (cgroup, subsystem) pair */
                        if (!css || !percpu_ref_is_dying(&css->refcnt))
                                continue;

                        /*
                         * Pin @dsct across the sleep and queue ourselves on
                         * its offline_waitq before dropping the mutex.
                         */
                        cgroup_get_live(dsct);
                        prepare_to_wait(&dsct->offline_waitq, &wait,
                                        TASK_UNINTERRUPTIBLE);

                        mutex_unlock(&cgroup_mutex);
                        schedule();
                        finish_wait(&dsct->offline_waitq, &wait);

                        /*
                         * The mutex was released, so the walk is started
                         * over from scratch after re-taking it.
                         */
                        cgroup_put(dsct);
                        goto restart;
                }
        }
}

/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and each of its live descendants, snapshot ->subtree_control,
 * ->subtree_ss_mask and ->dom_cgrp into the matching old_ prefixed fields
 * so that cgroup_restore_control() can undo a later update.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                pos->old_dom_cgrp = pos->dom_cgrp;
                pos->old_subtree_ss_mask = pos->subtree_ss_mask;
                pos->old_subtree_control = pos->subtree_control;
        }
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp and its live descendants in pre-order.  Each cgroup's
 * ->subtree_control is clipped to what cgroup_control() allows it and its
 * ->subtree_ss_mask is recomputed from the result, so that descendants
 * don't keep controllers enabled that are no longer available.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                pos->subtree_control = pos->subtree_control &
                                       cgroup_control(pos);
                pos->subtree_ss_mask = cgroup_calc_subtree_ss_mask(
                                                pos->subtree_control,
                                                cgroup_ss_mask(pos));
        }
}

/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Inverse of cgroup_save_control(): for @cgrp and each of its live
 * descendants, copy the old_ prefixed fields back into ->subtree_control,
 * ->subtree_ss_mask and ->dom_cgrp.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                pos->dom_cgrp = pos->old_dom_cgrp;
                pos->subtree_ss_mask = pos->old_subtree_ss_mask;
                pos->subtree_control = pos->old_subtree_control;
        }
}

/*
 * Decide whether @css should have its interface files shown.
 *
 * True when the css's subsystem is in cgroup_control() of its cgroup.
 * Otherwise it is only true when the subsystem is in cgroup_ss_mask(), the
 * cgroup is on the default hierarchy and the subsystem sets
 * ->implicit_on_dfl.
 */
static bool css_visible(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cgroup_subsys *ss = css->ss;
        const int ss_bit = 1 << ss->id;

        if (cgroup_control(cgrp) & ss_bit)
                return true;

        if (cgroup_ss_mask(cgrp) & ss_bit)
                return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;

        return false;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and, for every subsystem in a cgroup's
 * cgroup_ss_mask(), create the css if it doesn't exist yet and populate its
 * directory if css_visible() says so.  A css that is only enabled
 * implicitly through a dependency is created invisible and becomes visible
 * once userland enables it explicitly.
 *
 * Returns 0 on success, -errno on failure.  Nothing is rolled back on
 * failure; the caller is responsible for cleaning up with
 * cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);
                        int err;

                        /* subsystem isn't enabled on this cgroup */
                        if (!(cgroup_ss_mask(pos) & (1 << ss->id)))
                                continue;

                        if (!css) {
                                css = css_create(pos, ss);
                                if (IS_ERR(css))
                                        return PTR_ERR(css);
                        }

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        if (!css_visible(css))
                                continue;

                        err = css_populate_dir(css);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and cgroup_visible_mask().
 *
 * A css with a parent whose subsystem has dropped out of cgroup_ss_mask()
 * is killed.  A css that stays around but is no longer visible - userland
 * disabled it while other subsystems still depend on it - only gets its
 * directory cleared.  Such a css must not actively control resources and
 * has to be in the vanilla state if it's made visible again later, which is
 * why ->css_reset() is invoked when the controller provides it.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);

                        if (!css)
                                continue;

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        if (css->parent &&
                            !(cgroup_ss_mask(pos) & (1 << ss->id))) {
                                kill_css(css);
                                continue;
                        }

                        if (css_visible(css))
                                continue;

                        /* still needed, just hide it and reset its state */
                        css_clear_dir(css);
                        if (ss->css_reset)
                                ss->css_reset(css);
                }
        }
}

/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems are enabled and disabled in a subtree with these steps:
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function is step 3: it propagates the mask changes through @cgrp's
 * subtree, creates/shows csses accordingly and migrates processes.
 *
 * Returns 0 on success, -errno from the enable or migration step otherwise.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
        int err;

        cgroup_propagate_control(cgrp);

        err = cgroup_apply_control_enable(cgrp);
        if (err)
                return err;

        /*
         * At this point, cgroup_e_css_by_mask() results reflect the new
         * csses, so cgroup_update_dfl_csses() properly updates the css
         * associations of all tasks in the subtree.
         */
        return cgroup_update_dfl_csses(cgrp);
}

/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Last step of a control mask update, see cgroup_apply_control().  When
 * @ret is non-zero the saved masks are restored and re-propagated first;
 * in either case csses that no longer match the masks are killed or hidden.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
        /* the update failed: roll the masks back before cleaning up */
        if (ret != 0) {
                cgroup_restore_control(cgrp);
                cgroup_propagate_control(cgrp);
        }

        cgroup_apply_control_disable(cgrp);
}

/*
 * Check whether the controllers in @enable may be turned on for @cgrp's
 * children.
 *
 * Returns 0 when allowed, -EOPNOTSUPP when @cgrp's domain is invalid or a
 * domain controller is requested inside a thread subtree, and -EBUSY when
 * @cgrp has tasks that would compete with its children.
 */
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
        /* the requested controllers which are not threaded ones */
        u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;

        /* nothing is being enabled, nothing to vet */
        if (!enable)
                return 0;

        /* can @cgrp host any resources? */
        if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /* mixables don't care */
        if (cgroup_is_mixable(cgrp))
                return 0;

        /* can't enable domain controllers inside a thread subtree */
        if (domain_enable &&
            (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)))
                return -EOPNOTSUPP;

        /*
         * Threaded controllers can handle internal competitions and are
         * always allowed inside a (prospective) thread subtree.
         */
        if (!domain_enable &&
            (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)))
                return 0;

        /*
         * Controllers can't be enabled for a cgroup with tasks to avoid
         * child cgroups competing against tasks.
         */
        return cgroup_has_tasks(cgrp) ? -EBUSY : 0;
}

/*
 * change the enabled child controllers for a cgroup in the default hierarchy
 *
 * @buf holds a space separated list of "+name" / "-name" tokens.  Returns
 * @nbytes on success (including when the request is a no-op) or a negative
 * errno:
 *   -EINVAL  a token has no +/- prefix or names no eligible subsystem
 *   -ENODEV  cgroup_kn_lock_live() failed
 *   -ENOENT  a controller to enable is not in cgroup_control() of the cgroup
 *   -EBUSY   a controller to disable is still enabled by a live child
 * plus whatever cgroup_vet_subtree_control_enable() or
 * cgroup_apply_control() report.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        u16 enable = 0, disable = 0;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
        int ssid, ret;

        /*
         * Parse input - space separated list of subsystem names prefixed
         * with either + or -.
         */
        buf = strstrip(buf);
        while ((tok = strsep(&buf, " "))) {
                /* consecutive spaces produce empty tokens, skip them */
                if (tok[0] == '\0')
                        continue;
                do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
                        /* tok + 1 skips the one-character +/- prefix */
                        if (!cgroup_ssid_enabled(ssid) ||
                            strcmp(tok + 1, ss->name))
                                continue;

                        /* a later token for the same name overrides an earlier one */
                        if (*tok == '+') {
                                enable |= 1 << ssid;
                                disable &= ~(1 << ssid);
                        } else if (*tok == '-') {
                                disable |= 1 << ssid;
                                enable &= ~(1 << ssid);
                        } else {
                                return -EINVAL;
                        }
                        break;
                } while_each_subsys_mask();
                /*
                 * NOTE(review): presumably ssid only reaches
                 * CGROUP_SUBSYS_COUNT when the loop above ran to completion
                 * without hitting the break, i.e. no name matched.
                 */
                if (ssid == CGROUP_SUBSYS_COUNT)
                        return -EINVAL;
        }

        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;

        /* drop requests that are already satisfied and reject impossible ones */
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
                        /* already enabled, nothing to do for this one */
                        if (cgrp->subtree_control & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }

                        /* can only pass on what this cgroup itself is given */
                        if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
                } else if (disable & (1 << ssid)) {
                        /* already disabled, nothing to do for this one */
                        if (!(cgrp->subtree_control & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }

                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
                                if (child->subtree_control & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
                        }
                }
        }

        /* everything requested is already in effect */
        if (!enable && !disable) {
                ret = 0;
                goto out_unlock;
        }

        ret = cgroup_vet_subtree_control_enable(cgrp, enable);
        if (ret)
                goto out_unlock;

        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);

        cgrp->subtree_control |= enable;
        cgrp->subtree_control &= ~disable;

        /* finalize runs on failure too; it restores the saved masks then */
        ret = cgroup_apply_control(cgrp);
        cgroup_finalize_control(cgrp, ret);
        if (ret)
                goto out_unlock;

        kernfs_activate(cgrp->kn);
out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
}

/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup *dom_cgrp = parent->dom_cgrp;
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* noop if already threaded */
        if (cgroup_is_threaded(cgrp))
                return 0;

        /*
         * If @cgroup is populated or has domain controllers enabled, it
         * can't be switched.  While the below cgroup_can_be_thread_root()
         * test can catch the same conditions, that's only when @parent is
         * not mixable, so let's check it explicitly.
         */
        if (cgroup_is_populated(cgrp) ||
            cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
                return -EOPNOTSUPP;

        /* we're joining the parent's domain, ensure its validity */
        if (!cgroup_is_valid_domain(dom_cgrp) ||
            !cgroup_can_be_thread_root(dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * The following shouldn't cause actual migrations and should
         * always succeed.
         */
        cgroup_save_control(cgrp);

        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
                if (dsct == cgrp || cgroup_is_threaded(dsct))
                        dsct->dom_cgrp = dom_cgrp;

        ret = cgroup_apply_control(cgrp);
        if (!ret)
                parent->nr_threaded_children++;

        cgroup_finalize_control(cgrp, ret);
        return ret;
}

/*
 * Print the type of the cgroup: "threaded", "domain invalid",
 * "domain threaded" or plain "domain".  The checks are ordered, the first
 * one that matches decides.
 */
static int cgroup_type_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        const char *type;

        if (cgroup_is_threaded(cgrp))
                type = "threaded\n";
        else if (!cgroup_is_valid_domain(cgrp))
                type = "domain invalid\n";
        else if (cgroup_is_thread_root(cgrp))
                type = "domain threaded\n";
        else
                type = "domain\n";

        seq_puts(seq, type);
        return 0;
}

/*
 * Write handler for the cgroup type.  The only accepted value is
 * "threaded"; anything else yields -EINVAL.  Returns @nbytes on success,
 * -ENOENT when cgroup_kn_lock_live() fails, or the error from
 * cgroup_enable_threaded().
 */
static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int err;

        /* only switching to threaded mode is supported */
        if (strcmp(strstrip(buf), "threaded") != 0)
                return -EINVAL;

        /* drain dying csses before we re-apply (threaded) subtree control */
        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENOENT;

        /* threaded can only be enabled */
        err = cgroup_enable_threaded(cgrp);

        cgroup_kn_unlock(of->kn);

        if (err)
                return err;
        return nbytes;
}

/* print ->max_descendants, with INT_MAX rendered as "max" */
static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_descendants);

        if (limit != INT_MAX) {
                seq_printf(seq, "%d\n", limit);
                return 0;
        }

        seq_puts(seq, "max\n");
        return 0;
}

/*
 * Write handler for ->max_descendants.  Accepts "max" (stored as INT_MAX)
 * or a non-negative integer in any base kstrtoint() auto-detects.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for a negative value and -ENOENT when cgroup_kn_lock_live()
 * fails.
 */
static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int descendants;
        ssize_t ret;

        buf = strstrip(buf);
        if (!strcmp(buf, "max")) {
                descendants = INT_MAX;
        } else {
                ret = kstrtoint(buf, 0, &descendants);
                if (ret)
                        return ret;
        }

        if (descendants < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * cgroup_max_descendants_show() reads this field with READ_ONCE();
         * mark the store as well so the access pair is consistently
         * annotated and the compiler can't tear the write.
         */
        WRITE_ONCE(cgrp->max_descendants, descendants);

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/* print ->max_depth, with INT_MAX rendered as "max" */
static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_depth);

        if (limit != INT_MAX) {
                seq_printf(seq, "%d\n", limit);
                return 0;
        }

        seq_puts(seq, "max\n");
        return 0;
}

/*
 * Write handler for ->max_depth.  Accepts "max" (stored as INT_MAX) or a
 * non-negative integer in any base kstrtoint() auto-detects.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for a negative value and -ENOENT when cgroup_kn_lock_live()
 * fails.
 */
static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        ssize_t ret;
        int depth;

        buf = strstrip(buf);
        if (!strcmp(buf, "max")) {
                depth = INT_MAX;
        } else {
                ret = kstrtoint(buf, 0, &depth);
                if (ret)
                        return ret;
        }

        if (depth < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * cgroup_max_depth_show() reads this field with READ_ONCE(); mark
         * the store as well so the access pair is consistently annotated
         * and the compiler can't tear the write.
         */
        WRITE_ONCE(cgrp->max_depth, depth);

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/* print the "populated" and "frozen" state of the cgroup, one per line */
static int cgroup_events_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        int state;

        state = cgroup_is_populated(cgrp);
        seq_printf(seq, "populated %d\n", state);

        state = test_bit(CGRP_FROZEN, &cgrp->flags);
        seq_printf(seq, "frozen %d\n", state);

        return 0;
}

/* print the descendant counters (live and dying) of the cgroup */
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;

        seq_printf(seq, "nr_descendants %d\n", cgrp->nr_descendants);
        seq_printf(seq, "nr_dying_descendants %d\n",
                   cgrp->nr_dying_descendants);

        return 0;
}

/*
 * Let subsystem @ssid append its extra statistics for @cgrp to @seq.
 *
 * Returns 0 when the subsystem has no ->css_extra_stat_show() or when
 * cgroup_tryget_css() yields no css; otherwise the callback's return value.
 */
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
                                                 struct cgroup *cgrp, int ssid)
{
        struct cgroup_subsys *ss = cgroup_subsys[ssid];
        struct cgroup_subsys_state *css;
        int ret = 0;

        if (ss->css_extra_stat_show) {
                css = cgroup_tryget_css(cgrp, ss);
                if (css) {
                        ret = ss->css_extra_stat_show(seq, css);
                        /* drop the reference taken by cgroup_tryget_css() */
                        css_put(css);
                }
        }

        return ret;
}

/*
 * Print the base cputime statistics and, when CONFIG_CGROUP_SCHED is set,
 * the extra statistics of the cpu controller.
 */
static int cpu_stat_show(struct seq_file *seq, void *v)
{
        struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;

        cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
        return cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#else
        return 0;
#endif
}

#ifdef CONFIG_PSI
/*
 * Show PSI_IO pressure.  A cgroup whose inode number is 1 reports the
 * global psi_system group, every other cgroup its own ->psi.
 */
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct psi_group *psi;

        if (cgroup_ino(cgrp) == 1)
                psi = &psi_system;
        else
                psi = &cgrp->psi;

        return psi_show(seq, psi, PSI_IO);
}
/*
 * Show PSI_MEM pressure.  A cgroup whose inode number is 1 reports the
 * global psi_system group, every other cgroup its own ->psi.
 */
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct psi_group *psi;

        if (cgroup_ino(cgrp) == 1)
                psi = &psi_system;
        else
                psi = &cgrp->psi;

        return psi_show(seq, psi, PSI_MEM);
}
/*
 * Show PSI_CPU pressure.  A cgroup whose inode number is 1 reports the
 * global psi_system group, every other cgroup its own ->psi.
 */
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct psi_group *psi;

        if (cgroup_ino(cgrp) == 1)
                psi = &psi_system;
        else
                psi = &cgrp->psi;

        return psi_show(seq, psi, PSI_CPU);
}

/*
 * Common write handler of the pressure files: create a PSI trigger for
 * resource @res from @buf and attach it to this open file.
 *
 * Returns @nbytes on success, -ENODEV when cgroup_kn_lock_live() fails,
 * -EBUSY when the open file already has a trigger, or the error encoded in
 * the pointer returned by psi_trigger_create().
 */
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, enum psi_res res)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct psi_trigger *trigger;
        struct psi_group *group;
        struct cgroup *cgrp;
        ssize_t ret;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;

        /* hold our own reference to @cgrp, then let go of the kn lock */
        cgroup_get(cgrp);
        cgroup_kn_unlock(of->kn);

        /* Allow only one trigger per file descriptor */
        if (ctx->psi.trigger) {
                ret = -EBUSY;
                goto out_put;
        }

        if (cgroup_ino(cgrp) == 1)
                group = &psi_system;
        else
                group = &cgrp->psi;

        trigger = psi_trigger_create(group, buf, nbytes, res);
        if (IS_ERR(trigger)) {
                ret = PTR_ERR(trigger);
                goto out_put;
        }

        smp_store_release(&ctx->psi.trigger, trigger);
        ret = nbytes;

out_put:
        cgroup_put(cgrp);
        return ret;
}

/* write handler for I/O pressure: cgroup_pressure_write() with PSI_IO */
static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off)
{
        const enum psi_res res = PSI_IO;

        return cgroup_pressure_write(of, buf, nbytes, res);
}

/* write handler for memory pressure: cgroup_pressure_write() with PSI_MEM */
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        const enum psi_res res = PSI_MEM;

        return cgroup_pressure_write(of, buf, nbytes, res);
}

/* write handler for CPU pressure: cgroup_pressure_write() with PSI_CPU */
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                         char *buf, size_t nbytes, loff_t off)
{
        const enum psi_res res = PSI_CPU;

        return cgroup_pressure_write(of, buf, nbytes, res);
}

/* poll handler of the pressure files: polls this open file's PSI trigger */
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
                                     poll_table *pt)
{
        struct cgroup_file_ctx *ctx = of->priv;
        __poll_t events;

        events = psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
        return events;
}

/* release handler of the pressure files: destroys this open file's trigger */
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct psi_trigger *trigger = ctx->psi.trigger;

        psi_trigger_destroy(trigger);
}
#endif /* CONFIG_PSI */

/* print the requested freeze state (->freezer.freeze) of the cgroup */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", seq_css(seq)->cgroup->freezer.freeze);

        return 0;
}

/*
 * Write handler for the freeze state.  Accepts exactly 0 or 1.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for any other integer and -ENOENT when cgroup_kn_lock_live()
 * fails.
 */
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
                                   char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        ssize_t err;
        int freeze;

        err = kstrtoint(strstrip(buf), 0, &freeze);
        if (err)
                return err;

        if (freeze != 0 && freeze != 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        cgroup_freeze(cgrp, freeze);
        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * kernfs ->open() for cgroup files.  Allocates the per-open context,
 * records the opener's cgroup namespace in it (with a reference) and then
 * chains to the cftype's ->open() if there is one.
 *
 * Returns 0 on success, -ENOMEM if the context can't be allocated, or the
 * cftype's ->open() error, in which case the context is torn down again.
 */
static int cgroup_file_open(struct kernfs_open_file *of)
{
        struct cftype *cft = of->kn->priv;
        struct cgroup_file_ctx *ctx;
        int err;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);
        of->priv = ctx;

        err = cft->open ? cft->open(of) : 0;
        if (err) {
                /* undo the namespace reference and the allocation */
                put_cgroup_ns(ctx->ns);
                kfree(ctx);
        }

        return err;
}

/*
 * kernfs ->release() for cgroup files.  Chains to the cftype's ->release()
 * first, then drops the namespace reference and frees the per-open context
 * set up by cgroup_file_open().
 */
static void cgroup_file_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cftype *cft = of->kn->priv;

        if (cft->release)
                cft->release(of);

        put_cgroup_ns(ctx->ns);
        kfree(ctx);
}

/*
 * kernfs ->write() for cgroup files.
 *
 * A zero-length write returns 0 right away.  After the namespace delegation
 * check the write is handed to the cftype's ->write() if present; otherwise
 * @buf is parsed as an integer for ->write_u64() or ->write_s64().
 *
 * Returns @nbytes on success (or whatever ->write() returns), -EPERM when
 * the delegation check refuses the write, -EINVAL when the cftype has no
 * write method, or the parse/callback error.
 */
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *cgrp = of->kn->parent->priv;
        struct cftype *cft = of->kn->priv;
        struct cgroup_subsys_state *css;
        int err;

        if (!nbytes)
                return 0;

        /*
         * If namespaces are delegation boundaries, disallow writes to
         * files in an non-init namespace root from inside the namespace
         * except for the files explicitly marked delegatable -
         * cgroup.procs and cgroup.subtree_control.
         */
        if (cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) {
                bool delegatable = cft->flags & CFTYPE_NS_DELEGATABLE;

                if (!delegatable && ctx->ns != &init_cgroup_ns &&
                    ctx->ns->root_cset->dfl_cgrp == cgrp)
                        return -EPERM;
        }

        if (cft->write)
                return cft->write(of, buf, nbytes, off);

        /*
         * kernfs guarantees that a file isn't deleted with operations in
         * flight, which means that the matching css is and stays alive and
         * doesn't need to be pinned.  The RCU locking is not necessary
         * either.  It's just for the convenience of using cgroup_css().
         */
        rcu_read_lock();
        css = cgroup_css(cgrp, cft->ss);
        rcu_read_unlock();

        if (cft->write_u64) {
                unsigned long long uval;

                err = kstrtoull(buf, 0, &uval);
                if (err == 0)
                        err = cft->write_u64(css, cft, uval);
        } else if (cft->write_s64) {
                long long sval;

                err = kstrtoll(buf, 0, &sval);
                if (err == 0)
                        err = cft->write_s64(css, cft, sval);
        } else {
                err = -EINVAL;
        }

        if (err)
                return err;
        return nbytes;
}

/*
 * kernfs ->poll() for cgroup files: use the cftype's ->poll() when it has
 * one, kernfs_generic_poll() otherwise.
 */
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
        struct cftype *cft = of->kn->priv;

        if (!cft->poll)
                return kernfs_generic_poll(of, pt);

        return cft->poll(of, pt);
}

/* seq_file ->start(): forwarded unconditionally to the cftype's ->seq_start() */
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_start(seq, ppos);
}

/* seq_file ->next(): forwarded unconditionally to the cftype's ->seq_next() */
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
        if (seq_cft(seq)->seq_stop)
                seq_cft(seq)->seq_stop(seq, v);
}

/*
 * seq_file ->show() for cgroup files.  Prefers the cftype's ->seq_show();
 * failing that, prints the value of ->read_u64() or ->read_s64() followed
 * by a newline.  Returns -EINVAL when the cftype has none of the three.
 */
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
        struct cgroup_subsys_state *css = seq_css(m);
        struct cftype *cft = seq_cft(m);

        if (cft->seq_show)
                return cft->seq_show(m, arg);

        if (cft->read_u64) {
                seq_printf(m, "%llu\n", cft->read_u64(css, cft));
                return 0;
        }

        if (cft->read_s64) {
                seq_printf(m, "%lld\n", cft->read_s64(css, cft));
                return 0;
        }

        return -EINVAL;
}

/* kernfs ops that provide ->seq_show() only, without the seq iterator hooks */
static struct kernfs_ops cgroup_kf_single_ops = {
        .atomic_write_len = PAGE_SIZE,
        .open = cgroup_file_open,
        .release = cgroup_file_release,
        .write = cgroup_file_write,
        .poll = cgroup_file_poll,
        .seq_show = cgroup_seqfile_show,
};

/* kernfs ops that additionally wire up the seq_start/next/stop iterator hooks */
static struct kernfs_ops cgroup_kf_ops = {
        .atomic_write_len = PAGE_SIZE,
        .open = cgroup_file_open,
        .release = cgroup_file_release,
        .write = cgroup_file_write,
        .poll = cgroup_file_poll,
        .seq_start = cgroup_seqfile_start,
        .seq_next = cgroup_seqfile_next,
        .seq_stop = cgroup_seqfile_stop,
        .seq_show = cgroup_seqfile_show,
};

/*
 * Set uid and gid of cgroup dirs and files to that of the creator.  Nothing
 * is done (and 0 is returned) when the creator's fsuid and fsgid are both
 * the global root ids; otherwise the result of kernfs_setattr() is returned.
 */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
        struct iattr iattr = {
                .ia_valid = ATTR_UID | ATTR_GID,
                .ia_uid = current_fsuid(),
                .ia_gid = current_fsgid(),
        };

        if (!uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) ||
            !gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
                return kernfs_setattr(kn, &iattr);

        return 0;
}

/* timer callback: deliver the notification for the cgroup_file embedding @timer */
static void cgroup_file_notify_timer(struct timer_list *timer)
{
        struct cgroup_file *cfile = container_of(timer, struct cgroup_file,
                                                 notify_timer);

        cgroup_file_notify(cfile);
}

/*
 * Create the kernfs file described by @cft in @cgrp's directory.
 *
 * The file is created owned by global root and then handed to the current
 * task's fsuid/fsgid via cgroup_kn_set_ugid().  If @cft has a file_offset,
 * the cgroup_file located at that offset inside @css is set up with its
 * notify timer and pointed at the new node.
 *
 * Returns 0 on success, -errno on failure; the node is removed again if
 * the ownership change fails.
 */
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
                           struct cftype *cft)
{
        char name[CGROUP_FILE_NAME_MAX];
        struct lock_class_key *key = NULL;
        struct cgroup_file *cfile;
        struct kernfs_node *kn;
        int ret;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        key = &cft->lockdep_key;
#endif
        kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
                                  cgroup_file_mode(cft),
                                  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
                                  0, cft->kf_ops, cft,
                                  NULL, key);
        if (IS_ERR(kn))
                return PTR_ERR(kn);

        ret = cgroup_kn_set_ugid(kn);
        if (ret)
                goto err_remove;

        /* no cgroup_file to hook up for this cftype */
        if (!cft->file_offset)
                return 0;

        cfile = (void *)css + cft->file_offset;

        timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

        /* publish the node under the lock cgroup_file_notify() takes */
        spin_lock_irq(&cgroup_file_kn_lock);
        cfile->kn = kn;
        spin_unlock_irq(&cgroup_file_kn_lock);

        return 0;

err_remove:
        kernfs_remove(kn);
        return ret;
}

/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add)
{
        struct cftype *cft, *cft_end = NULL;
        int ret = 0;

        lockdep_assert_held(&cgroup_mutex);

restart:
        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
                        continue;
                if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
                        continue;
                if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
                        continue;
                if (is_add) {
                        ret = cgroup_add_file(css, cgrp, cft);
                        if (ret) {
                                pr_warn("%s: failed to add %s, err=%d\n",
                                        __func__, cft->name, ret);
                                cft_end = cft;
                                is_add = false;
                                goto restart;
                        }
                } else {
                        cgroup_rm_file(cgrp, cft);
                }
        }
        return ret;
}

/*
 * Add (@is_add) or remove the files of @cfts on every visible css of the
 * owning subsystem, walking the hierarchy in pre-order from its root.
 * Stops at the first failure and returns its error.  After a successful
 * add, the new nodes are activated from the root's kernfs node.
 * Must be called with cgroup_mutex held.
 */
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
        int ret = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* add/rm files for all cgroups created before */
        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                if (css->flags & CSS_VISIBLE) {
                        ret = cgroup_addrm_files(css, css->cgroup, cfts,
                                                 is_add);
                        if (ret)
                                break;
                }
        }

        if (ret || !is_add)
                return ret;

        kernfs_activate(root->kn);
        return 0;
}

/*
 * Undo cgroup_init_cftypes() on every entry of @cfts: release a private
 * kf_ops copy if one was made, clear ->kf_ops and ->ss, and drop the
 * DFL-only / not-on-DFL flags.
 */
static void cgroup_exit_cftypes(struct cftype *cfts)
{
        struct cftype *cft = cfts;

        while (cft->name[0] != '\0') {
                /* free copy for custom atomic_write_len, see init_cftypes() */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
                        kfree(cft->kf_ops);

                cft->ss = NULL;
                cft->kf_ops = NULL;

                /* revert flags set by cgroup core while adding @cfts */
                cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);

                cft++;
        }
}

/*
 * Bind every entry of @cfts to @ss and pick its kernfs ops: the iterating
 * ops when the cftype has ->seq_start, the single-show ops otherwise.
 * Returns 0, or -ENOMEM after undoing the whole array via
 * cgroup_exit_cftypes() if a private ops copy can't be allocated.
 */
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                struct kernfs_ops *kf_ops;

                WARN_ON(cft->ss || cft->kf_ops);

                kf_ops = cft->seq_start ? &cgroup_kf_ops
                                        : &cgroup_kf_single_ops;

                /*
                 * A custom max_write_len can't live in the shared ops
                 * tables, so such a cftype gets its own copy with
                 * atomic_write_len overridden.
                 */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
                        kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
                        if (!kf_ops)
                                goto err_nomem;
                        kf_ops->atomic_write_len = cft->max_write_len;
                }

                cft->kf_ops = kf_ops;
                cft->ss = ss;
        }

        return 0;

err_nomem:
        cgroup_exit_cftypes(cfts);
        return -ENOMEM;
}

/*
 * Unregister @cfts with cgroup_mutex already held: unlink the array from
 * its list, remove its files and reset the cftypes.  Returns -ENOENT when
 * @cfts is NULL or its first entry is not bound to a subsystem.
 */
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
        lockdep_assert_held(&cgroup_mutex);

        if (cfts && cfts[0].ss) {
                list_del(&cfts->node);
                cgroup_apply_cftypes(cfts, false);
                cgroup_exit_cftypes(cfts);
                return 0;
        }

        return -ENOENT;
}

/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Locked wrapper around cgroup_rm_cftypes_locked().  The files described
 * by @cfts disappear from all existing cgroups and won't be created on
 * future ones.  This function can be called anytime whether @cfts' subsys
 * is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
        int err;

        mutex_lock(&cgroup_mutex);
        err = cgroup_rm_cftypes_locked(cfts);
        mutex_unlock(&cgroup_mutex);

        return err;
}

/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  A disabled
 * @ss or an empty/NULL @cfts is a successful no-op.  If creating the files
 * on the existing cgroups fails, @cfts is unregistered again and the error
 * is returned.
 */
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        int err;

        /* nothing to register */
        if (!cgroup_ssid_enabled(ss->id) ||
            !cfts || cfts[0].name[0] == '\0')
                return 0;

        err = cgroup_init_cftypes(ss, cfts);
        if (err)
                return err;

        mutex_lock(&cgroup_mutex);

        list_add_tail(&cfts->node, &ss->cfts);

        err = cgroup_apply_cftypes(cfts, true);
        if (err)
                cgroup_rm_cftypes_locked(cfts);

        mutex_unlock(&cgroup_mutex);

        return err;
}

/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Tag every entry of @cfts with __CFTYPE_ONLY_ON_DFL and then register the
 * array through cgroup_add_cftypes(), so the files are only used for the
 * default hierarchy.  Returns what cgroup_add_cftypes() returns.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;

        if (cfts) {
                for (cft = cfts; cft->name[0] != '\0'; cft++)
                        cft->flags |= __CFTYPE_ONLY_ON_DFL;
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Tag every entry of @cfts with __CFTYPE_NOT_ON_DFL and then register the
 * array through cgroup_add_cftypes(), so the files are only used for the
 * legacy hierarchies.  Returns what cgroup_add_cftypes() returns.
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;

        if (cfts) {
                for (cft = cfts; cft->name[0] != '\0'; cft++)
                        cft->flags |= __CFTYPE_NOT_ON_DFL;
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 *
 * Notifications are rate limited: if the previous one was sent within the
 * last CGROUP_FILE_NOTIFY_MIN_INTV jiffies, the notify timer is armed for
 * the end of that interval instead of notifying right away.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
        unsigned long last, next;
        unsigned long flags;

        spin_lock_irqsave(&cgroup_file_kn_lock, flags);

        /* no kernfs node attached, nobody to notify */
        if (!cfile->kn)
                goto out_unlock;

        last = cfile->notified_at;
        next = last + CGROUP_FILE_NOTIFY_MIN_INTV;

        if (!time_in_range(jiffies, last, next)) {
                kernfs_notify(cfile->kn);
                cfile->notified_at = jiffies;
        } else {
                /* too soon, defer to the timer */
                timer_reduce(&cfile->notify_timer, next);
        }

out_unlock:
        spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}

/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next child of @parent after @pos, or %NULL once the end of
 * @parent's children list has been reached.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *next;

        cgroup_assert_mutex_or_rcu_locked();

        /*
         * @pos could already have been unlinked from the sibling list.
         * Once a cgroup is removed, its ->sibling.next is no longer
         * updated when its next sibling changes.  CSS_RELEASED is set when
         * @pos is taken off list, at which time its next pointer is valid,
         * and, as releases are serialized, the one pointed to by the next
         * pointer is guaranteed to not have started release yet.  This
         * implies that if we observe !CSS_RELEASED on @pos in this RCU
         * critical section, the one pointed to by its next pointer is
         * guaranteed to not have finished its RCU grace period even if we
         * have dropped rcu_read_lock() in between iterations.
         *
         * If @pos has CSS_RELEASED set, its next pointer can't be
         * dereferenced; however, as each css is given a monotonically
         * increasing unique serial number and always appended to the
         * sibling list, the next one can be found by walking the parent's
         * children until the first css with higher serial number than
         * @pos's.  While this path can be slower, it happens iff iteration
         * races against release and the race window is very small.
         */
        if (!pos) {
                /* start of traversal: take the first entry of @parent's list */
                next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                /* common case: @pos is still linked, follow its next pointer */
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
                /* @pos was released: rescan for the first higher serial number */
                list_for_each_entry_rcu(next, &parent->children, sibling,
                                        lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /*
         * @next, if not pointing to the head, can be dereferenced and is
         * the next sibling.
         */
        if (&next->sibling != &parent->children)
                return next;
        return NULL;
}

/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in pre-order, or %NULL when the walk is complete.
 */
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
                        struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *next;

        cgroup_assert_mutex_or_rcu_locked();

        /* the walk starts at @root itself */
        if (!pos)
                return root;

        /* descend first: @pos's first child, if it has one */
        next = css_next_child(NULL, pos);
        if (next)
                return next;

        /* otherwise climb towards @root until some ancestor has a next sibling */
        for (; pos != root; pos = pos->parent) {
                next = css_next_child(pos, pos->parent);
                if (next)
                        return next;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);

/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *last, *tmp;

        cgroup_assert_mutex_or_rcu_locked();

        do {
                last = pos;
                /* ->prev isn't RCU safe, walk ->next till the end */
                pos = NULL;
                css_for_each_child(tmp, last)
                        pos = tmp;
        } while (pos);

        return last;
}

/*
 * Follow first children down from @pos and return the deepest one reached;
 * @pos itself is returned when it has no children.
 */
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *child;

        while ((child = css_next_child(NULL, pos)))
                pos = child;

        return pos;
}

/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of
 * @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in post-order, or %NULL once @root has been visited.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
                         struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *sibling;

        cgroup_assert_mutex_or_rcu_locked();

        /* first step: start from the leftmost descendant, possibly @root */
        if (!pos)
                return css_leftmost_descendant(root);

        /* @root is visited last, so the walk is over */
        if (pos == root)
                return NULL;

        /*
         * An unvisited sibling means its subtree comes next, starting at
         * its leftmost descendant; with no sibling left, it's the parent's
         * turn.
         */
        sibling = css_next_child(pos, pos->parent);

        return sibling ? css_leftmost_descendant(sibling) : pos->parent;
}

/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Scan the children of @css under the RCU read lock, looking for one with
 * CSS_ONLINE set.  This function can be called from any context but the
 * caller is responsible for synchronizing against on/offlining as
 * necessary.
 *
 * Returns %true if @css has any online children; otherwise, %false.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *child;
        bool online = false;

        rcu_read_lock();
        css_for_each_child(child, css) {
                if (!(child->flags & CSS_ONLINE))
                        continue;

                online = true;
                break;
        }
        rcu_read_unlock();

        return online;
}

/*
 * Step @it to the next css_set to visit and return it, or NULL when the
 * cset list being walked is exhausted (it->cset_pos is cleared then).
 *
 * While a threaded walk is in progress (it->tcset_pos set), the remaining
 * entries of the current threaded_csets list are returned first; only
 * afterwards does the walk move on along the main list.  Must be called
 * with css_set_lock held.
 */
static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
        struct list_head *l;
        struct cgrp_cset_link *link;
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* find the next threaded cset */
        if (it->tcset_pos) {
                l = it->tcset_pos->next;

                if (l != it->tcset_head) {
                        it->tcset_pos = l;
                        return container_of(l, struct css_set,
                                            threaded_csets_node);
                }

                /* threaded list consumed, fall back to the main list */
                it->tcset_pos = NULL;
        }

        /* find the next cset */
        l = it->cset_pos;
        l = l->next;
        if (l == it->cset_head) {
                it->cset_pos = NULL;
                return NULL;
        }

        /*
         * The list entry type depends on how the iterator was started:
         * with a subsystem the nodes are css_sets linked through
         * e_cset_node[], otherwise they are cgrp_cset_links.
         */
        if (it->ss) {
                cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
        } else {
                link = list_entry(l, struct cgrp_cset_link, cset_link);
                cset = link->cset;
        }

        it->cset_pos = l;

        /* initialize threaded css_set walking */
        if (it->flags & CSS_TASK_ITER_THREADED) {
                /* swap the pinned cset: drop the old one, pin the new one */
                if (it->cur_dcset)
                        put_css_set_locked(it->cur_dcset);
                it->cur_dcset = cset;
                get_css_set(cset);

                it->tcset_head = &cset->threaded_csets;
                it->tcset_pos = &cset->threaded_csets;
        }

        return cset;
}

/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.  css_sets whose tasks, mg_tasks
 * and dying_tasks lists are all empty are skipped.  When no css_set is
 * left, @it->task_pos is set to %NULL.  Must be called with css_set_lock
 * held.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* Advance to the next non-empty css_set and find first non-empty tasks list */
        while ((cset = css_task_iter_next_css_set(it))) {
                if (!list_empty(&cset->tasks)) {
                        it->cur_tasks_head = &cset->tasks;
                        break;
                } else if (!list_empty(&cset->mg_tasks)) {
                        it->cur_tasks_head = &cset->mg_tasks;
                        break;
                } else if (!list_empty(&cset->dying_tasks)) {
                        it->cur_tasks_head = &cset->dying_tasks;
                        break;
                }
        }
        /* ran out of css_sets: mark the iteration as finished */
        if (!cset) {
                it->task_pos = NULL;
                return;
        }
        it->task_pos = it->cur_tasks_head->next;

        /*
         * We don't keep css_sets locked across iteration steps and thus
         * need to take steps to ensure that iteration can be resumed after
         * the lock is re-acquired.  Iteration is performed at two levels -
         * css_sets and tasks in them.
         *
         * Once created, a css_set never leaves its cgroup lists, so a
         * pinned css_set is guaranteed to stay put and we can resume
         * iteration afterwards.
         *
         * Tasks may leave @cset across iteration steps.  This is resolved
         * by registering each iterator with the css_set currently being
         * walked and making css_set_move_task() advance iterators whose
         * next task is leaving.
         */
        if (it->cur_cset) {
                /* unregister from and unpin the previously walked css_set */
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
        }
        get_css_set(cset);
        it->cur_cset = cset;
        list_add(&it->iters_node, &cset->task_iters);
}

/*
 * If @it is currently positioned on @task, step it past @task and flag
 * the iterator as CSS_TASK_ITER_SKIPPED so that the next advance doesn't
 * step a second time.  Must be called with css_set_lock held.
 */
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task)
{
        lockdep_assert_held(&css_set_lock);

        if (it->task_pos != &task->cg_list)
                return;

        it->task_pos = it->task_pos->next;
        it->flags |= CSS_TASK_ITER_SKIPPED;
}

/*
 * Move @it->task_pos to the next task the iterator should return.
 *
 * Within a css_set the tasks, mg_tasks and dying_tasks lists are walked in
 * that order; once they are consumed the iterator moves on to the next
 * css_set.  Candidates filtered out by the iterator flags are skipped by
 * looping back to "repeat".  @it->task_pos is left NULL when nothing
 * remains.  Must be called with css_set_lock held.
 */
static void css_task_iter_advance(struct css_task_iter *it)
{
        struct task_struct *task;

        lockdep_assert_held(&css_set_lock);
repeat:
        if (it->task_pos) {
                /*
                 * Advance iterator to find next entry. We go through cset
                 * tasks, mg_tasks and dying_tasks, when consumed we move onto
                 * the next cset.
                 */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        /* css_task_iter_skip() already stepped task_pos */
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;

                /* reaching a list head means that list is done; chain on */
                if (it->task_pos == &it->cur_cset->tasks) {
                        it->cur_tasks_head = &it->cur_cset->mg_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->mg_tasks) {
                        it->cur_tasks_head = &it->cur_cset->dying_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                css_task_iter_advance_css_set(it);
        }

        /* no css_set left, iteration is over */
        if (!it->task_pos)
                return;

        task = list_entry(it->task_pos, struct task_struct, cg_list);

        if (it->flags & CSS_TASK_ITER_PROCS) {
                /* if PROCS, skip over tasks which aren't group leaders */
                if (!thread_group_leader(task))
                        goto repeat;

                /* and dying leaders w/o live member threads */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
}

/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Reset @it and position it on the first task of @css.  The caller then
 * calls css_task_iter_next() repeatedly until it returns NULL and must
 * finish with css_task_iter_end().
 */
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
{
        memset(it, 0, sizeof(*it));

        spin_lock_irq(&css_set_lock);

        it->ss = css->ss;
        it->flags = flags;

        /*
         * With a subsystem, walk the cgroup's e_csets list for that
         * subsystem; without one, walk the cgroup's cset_links.
         */
        it->cset_pos = it->ss ? &css->cgroup->e_csets[css->ss->id]
                              : &css->cgroup->cset_links;
        it->cset_head = it->cset_pos;

        css_task_iter_advance(it);

        spin_unlock_irq(&css_set_lock);
}

/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 *
 * The returned task is pinned with get_task_struct(); that reference is
 * dropped by the following call to this function or by
 * css_task_iter_end().
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
        /* release the task handed out by the previous call */
        if (it->cur_task) {
                put_task_struct(it->cur_task);
                it->cur_task = NULL;
        }

        spin_lock_irq(&css_set_lock);

        /* @it may be half-advanced by skips, finish advancing */
        if (it->flags & CSS_TASK_ITER_SKIPPED)
                css_task_iter_advance(it);

        if (it->task_pos) {
                it->cur_task = list_entry(it->task_pos, struct task_struct,
                                          cg_list);
                get_task_struct(it->cur_task);
                /* pre-position the iterator on the task for the next call */
                css_task_iter_advance(it);
        }

        spin_unlock_irq(&css_set_lock);

        return it->cur_task;
}

/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start(): unregister @it
 * from the css_set it is walking and drop the css_set and task references
 * the iterator still holds.
 */
void css_task_iter_end(struct css_task_iter *it)
{
        /* the iters_node list is protected by css_set_lock */
        if (it->cur_cset) {
                spin_lock_irq(&css_set_lock);

                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);

                spin_unlock_irq(&css_set_lock);
        }

        /* css_set pinned for a threaded walk, if any */
        if (it->cur_dcset)
                put_css_set(it->cur_dcset);

        /* task pinned by the last css_task_iter_next(), if any */
        if (it->cur_task)
                put_task_struct(it->cur_task);
}

/* ->release hook: end the task iterator if a read ever started one */
static void cgroup_procs_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;

        if (!ctx->procs.started)
                return;

        css_task_iter_end(&ctx->procs.iter);
}

/*
 * seq_file ->next hook: bump *@pos (when given) and return the next task
 * from the per-open iterator, or NULL at the end of the iteration.
 */
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup_file_ctx *ctx = of->priv;
        struct css_task_iter *it = &ctx->procs.iter;

        if (pos)
                ++*pos;

        return css_task_iter_next(it);
}

/*
 * Common seq_file ->start implementation for the procs and threads files.
 *
 * The first call must be for position 0 and starts the task iterator with
 * @iter_flags.  A later call for position 0 restarts the iterator, while a
 * later call for a non-zero position resumes at the task the iterator
 * currently holds.
 */
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
                                  unsigned int iter_flags)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
        struct cgroup_file_ctx *ctx = of->priv;
        struct css_task_iter *it = &ctx->procs.iter;

        /*
         * When a seq_file is seeked, it's always traversed sequentially
         * from position 0, so we can simply keep iterating on !0 *pos.
         */
        if (ctx->procs.started && *pos)
                return it->cur_task;

        if (!ctx->procs.started) {
                if (WARN_ON_ONCE((*pos)))
                        return ERR_PTR(-EINVAL);
                css_task_iter_start(&cgrp->self, iter_flags, it);
                ctx->procs.started = true;
        } else {
                /* rewound to the beginning, start over */
                css_task_iter_end(it);
                css_task_iter_start(&cgrp->self, iter_flags, it);
        }

        return cgroup_procs_next(s, NULL, NULL);
}

/* seq_file ->start hook for cgroup.procs: iterate processes of the cgroup */
static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
        unsigned int iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED;

        /*
         * Processes of a threaded subtree all belong to the subtree's
         * domain cgroup; only threads can be spread over the subtree.
         * cgroup.procs would always read empty inside the subtree proper,
         * so refuse the read instead.
         */
        if (cgroup_is_threaded(seq_css(s)->cgroup))
                return ERR_PTR(-EOPNOTSUPP);

        return __cgroup_procs_start(s, pos, iter_flags);
}

/* seq_file ->show hook: print the pid of task @v as seen via task_pid_vnr() */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
        struct task_struct *task = v;

        seq_printf(s, "%d\n", task_pid_vnr(task));

        return 0;
}

/*
 * Check whether the current credentials have MAY_WRITE permission on
 * @cgrp's procs file within @sb.  Returns the inode_permission() result,
 * or -ENOMEM if the inode can't be obtained.  Must be called with
 * cgroup_mutex held.
 */
static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
{
        struct inode *inode;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
        if (inode) {
                err = inode_permission(inode, MAY_WRITE);
                iput(inode);
                return err;
        }

        return -ENOMEM;
}

/*
 * Verify that a migration from @src_cgrp to @dst_cgrp is permitted.
 *
 * Write access to the procs file of the closest common ancestor of the
 * two cgroups is required.  With CGRP_ROOT_NS_DELEGATE set on the default
 * root, both cgroups must additionally be descendants of the root cgroup
 * of @ns, otherwise -ENOENT is returned.  Must be called with cgroup_mutex
 * held.
 */
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb,
                                         struct cgroup_namespace *ns)
{
        struct cgroup *com_cgrp;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* walk up from the source until we hit an ancestor of @dst_cgrp */
        for (com_cgrp = src_cgrp;
             !cgroup_is_descendant(dst_cgrp, com_cgrp);
             com_cgrp = cgroup_parent(com_cgrp))
                ;

        /* %current should be authorized to migrate to the common ancestor */
        ret = cgroup_may_write(com_cgrp, sb);
        if (ret)
                return ret;

        /* namespaces only matter when they are delegation boundaries */
        if (!(cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE))
                return 0;

        /* %current must see both cgroups from its namespace */
        if (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
            !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))
                return -ENOENT;

        return 0;
}

/*
 * Run all checks needed before attaching to @dst_cgrp: the procs write
 * permission check, cgroup_migrate_vet_dst(), and - for single-thread
 * migration (!@threadgroup) - that source and destination share the same
 * domain cgroup (-EOPNOTSUPP otherwise).  Returns 0 when allowed.
 */
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
                                     struct cgroup *dst_cgrp,
                                     struct super_block *sb, bool threadgroup,
                                     struct cgroup_namespace *ns)
{
        int ret;

        ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
        if (ret)
                return ret;

        ret = cgroup_migrate_vet_dst(dst_cgrp);
        if (ret)
                return ret;

        if (!threadgroup && src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
                return -EOPNOTSUPP;

        return 0;
}

/*
 * Write handler for "cgroup.procs": migrate the whole process named in
 * @buf into the cgroup that owns @of.
 *
 * Returns @nbytes on success and -errno on failure.
 */
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *from, *to;
        struct task_struct *tsk;
        const struct cred *prev_cred;
        struct super_block *sb;
        bool locked;
        ssize_t err;

        to = cgroup_kn_lock_live(of->kn, false);
        if (!to)
                return -ENODEV;

        tsk = cgroup_procs_write_start(buf, true, &locked);
        err = PTR_ERR_OR_ZERO(tsk);
        if (err)
                goto out_unlock;

        /* look up where the task currently lives on the default hierarchy */
        spin_lock_irq(&css_set_lock);
        from = task_cgroup_from_root(tsk, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);

        /*
         * The delegation rule is the same for processes and threads.  The
         * check is made with the opener's credentials rather than the
         * writer's so that an inherited fd can't be abused.
         */
        sb = of->file->f_path.dentry->d_sb;
        prev_cred = override_creds(of->file->f_cred);
        err = cgroup_attach_permissions(from, to, sb, true, ctx->ns);
        revert_creds(prev_cred);

        if (!err)
                err = cgroup_attach_task(to, tsk, true);

        cgroup_procs_write_finish(tsk, locked);
out_unlock:
        cgroup_kn_unlock(of->kn);

        if (err)
                return err;
        return nbytes;
}

/*
 * seq_file ->start for "cgroup.threads".  Delegates to the shared
 * __cgroup_procs_start() helper with no iterator flags set.
 */
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
{
        void *iter = __cgroup_procs_start(s, pos, 0);

        return iter;
}

/*
 * Write handler for "cgroup.threads": migrate the single thread named in
 * @buf into the cgroup that owns @of.
 *
 * Returns @nbytes on success and -errno on failure.
 */
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *from, *to;
        struct task_struct *tsk;
        const struct cred *prev_cred;
        struct super_block *sb;
        bool was_locked;
        ssize_t err;

        buf = strstrip(buf);

        to = cgroup_kn_lock_live(of->kn, false);
        if (!to)
                return -ENODEV;

        tsk = cgroup_procs_write_start(buf, false, &was_locked);
        err = PTR_ERR_OR_ZERO(tsk);
        if (err)
                goto out_unlock;

        /* look up where the thread currently lives on the default hierarchy */
        spin_lock_irq(&css_set_lock);
        from = task_cgroup_from_root(tsk, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);

        /*
         * The delegation rule is the same for processes and threads.  The
         * check is made with the opener's credentials rather than the
         * writer's so that an inherited fd can't be abused.
         */
        sb = of->file->f_path.dentry->d_sb;
        prev_cred = override_creds(of->file->f_cred);
        err = cgroup_attach_permissions(from, to, sb, false, ctx->ns);
        revert_creds(prev_cred);

        if (!err)
                err = cgroup_attach_task(to, tsk, false);

        cgroup_procs_write_finish(tsk, was_locked);
out_unlock:
        cgroup_kn_unlock(of->kn);

        if (err)
                return err;
        return nbytes;
}

/*
 * cgroup core interface files for the default hierarchy.
 *
 * The table is handed to cgroup_init_cftypes() from cgroup_init() and is
 * terminated by an empty entry.  Entries that set ->file_offset name a
 * member of struct cgroup (procs_file, events_file) associated with that
 * file; cgroup_may_write() for instance reaches the "cgroup.procs" kernfs
 * node through cgrp->procs_file.kn.
 */
static struct cftype cgroup_base_files[] = {
        {
                .name = "cgroup.type",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_type_show,
                .write = cgroup_type_write,
        },
        {
                /* process membership; writes migrate whole processes */
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                .file_offset = offsetof(struct cgroup, procs_file),
                .release = cgroup_procs_release,
                .seq_start = cgroup_procs_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
        {
                /* shares the procs iterator callbacks except for ->seq_start */
                .name = "cgroup.threads",
                .flags = CFTYPE_NS_DELEGATABLE,
                .release = cgroup_procs_release,
                .seq_start = cgroup_threads_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_threads_write,
        },
        {
                /* read-only: no ->write handler */
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
                .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                /* read-only: no ->write handler */
                .name = "cgroup.events",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
        {
                /* limit consulted by cgroup_check_hierarchy_limits() */
                .name = "cgroup.max.descendants",
                .seq_show = cgroup_max_descendants_show,
                .write = cgroup_max_descendants_write,
        },
        {
                /* limit consulted by cgroup_check_hierarchy_limits() */
                .name = "cgroup.max.depth",
                .seq_show = cgroup_max_depth_show,
                .write = cgroup_max_depth_write,
        },
        {
                /* read-only: no ->write handler */
                .name = "cgroup.stat",
                .seq_show = cgroup_stat_show,
        },
        {
                .name = "cgroup.freeze",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_freeze_show,
                .write = cgroup_freeze_write,
        },
        {
                /* read-only: no ->write handler */
                .name = "cpu.stat",
                .seq_show = cpu_stat_show,
        },
#ifdef CONFIG_PSI
        /* pressure files only exist when PSI is built in */
        {
                .name = "io.pressure",
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "memory.pressure",
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "cpu.pressure",
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
#endif /* CONFIG_PSI */
        { }        /* terminate */
};

/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_rwork_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */
/*
 * RCU work item that frees a css.  It is queued on cgroup_free_wq by
 * css_release_work_fn() and by the error path of css_create().
 *
 * Two kinds of css come through here, told apart by css->ss: a
 * subsystem css (css->ss set) and a cgroup's own css (css->ss NULL), in
 * which case the cgroup itself is torn down.
 */
static void css_free_rwork_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
                                struct cgroup_subsys_state, destroy_rwork);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        percpu_ref_exit(&css->refcnt);

        if (ss) {
                /* css free path */
                /* cache what we need: @css is handed to ->css_free() below */
                struct cgroup_subsys_state *parent = css->parent;
                int id = css->id;

                ss->css_free(css);
                cgroup_idr_remove(&ss->css_idr, id);
                /* pairs with cgroup_get_live() in init_and_link_css() */
                cgroup_put(cgrp);

                /* pairs with css_get(css->parent) in init_and_link_css() */
                if (parent)
                        css_put(parent);
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
                cgroup1_pidlist_destroy_all(cgrp);
                cancel_work_sync(&cgrp->release_agent_work);

                if (cgroup_parent(cgrp)) {
                        /*
                         * We get a ref to the parent, and put the ref when
                         * this cgroup is being freed, so it's guaranteed
                         * that the parent won't be destroyed before its
                         * children.
                         */
                        cgroup_put(cgroup_parent(cgrp));
                        /* drops the extra kn ref taken in cgroup_mkdir() */
                        kernfs_put(cgrp->kn);
                        psi_cgroup_free(cgrp);
                        /* mirrors the conditional cgroup_rstat_init() in cgroup_create() */
                        if (cgroup_on_dfl(cgrp))
                                cgroup_rstat_exit(cgrp);
                        kfree(cgrp);
                } else {
                        /*
                         * This is root cgroup's refcnt reaching zero,
                         * which indicates that the root should be
                         * released.
                         */
                        cgroup_destroy_root(cgrp->root);
                }
        }
}

/*
 * Work item queued by css_release() once a css's percpu_ref has dropped to
 * zero.  Under cgroup_mutex it marks the css released and unlinks it from
 * its siblings, then hands the css to css_free_rwork_fn() via rcu_work on
 * cgroup_free_wq.
 *
 * As in css_free_rwork_fn(), css->ss distinguishes a subsystem css from a
 * cgroup's own css.
 */
static void css_release_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        mutex_lock(&cgroup_mutex);

        css->flags |= CSS_RELEASED;
        list_del_rcu(&css->sibling);

        if (ss) {
                /* css release path */
                /* only linked if init_and_link_css() added it to rstat_css_list */
                if (!list_empty(&css->rstat_css_node)) {
                        cgroup_rstat_flush(cgrp);
                        list_del_rcu(&css->rstat_css_node);
                }

                /* keep the id reserved but stop it from resolving to @css */
                cgroup_idr_replace(&ss->css_idr, NULL, css->id);
                if (ss->css_released)
                        ss->css_released(css);
        } else {
                struct cgroup *tcgrp;

                /* cgroup release path */
                TRACE_CGROUP_PATH(release, cgrp);

                if (cgroup_on_dfl(cgrp))
                        cgroup_rstat_flush(cgrp);

                /* undo the nr_dying_descendants++ from cgroup_destroy_locked() */
                spin_lock_irq(&css_set_lock);
                for (tcgrp = cgroup_parent(cgrp); tcgrp;
                     tcgrp = cgroup_parent(tcgrp))
                        tcgrp->nr_dying_descendants--;
                spin_unlock_irq(&css_set_lock);

                /*
                 * There are two control paths which try to determine
                 * cgroup from dentry without going through kernfs -
                 * cgroupstats_build() and css_tryget_online_from_dir().
                 * Those are supported by RCU protecting clearing of
                 * cgrp->kn->priv backpointer.
                 */
                if (cgrp->kn)
                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
                                         NULL);
        }

        mutex_unlock(&cgroup_mutex);

        /* the actual freeing happens in css_free_rwork_fn() */
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}

/*
 * percpu_ref release callback for a css (installed by percpu_ref_init()
 * in css_create() and cgroup_create()).  The real work is deferred to
 * css_release_work_fn() on cgroup_release_wq.
 */
static void css_release(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css;

        css = container_of(ref, struct cgroup_subsys_state, refcnt);

        INIT_WORK(&css->destroy_work, css_release_work_fn);
        queue_work(cgroup_release_wq, &css->destroy_work);
}

/*
 * Initialize a freshly allocated @css for the (@cgrp, @ss) pair and link it
 * to its parent css.
 *
 * @css is zeroed first, so anything the allocator stored in the embedded
 * cgroup_subsys_state is discarded.  A reference is taken on @cgrp and, if
 * @cgrp has a parent, on the parent's css for @ss; both are dropped in
 * css_free_rwork_fn().  Must be called with cgroup_mutex held.
 */
static void init_and_link_css(struct cgroup_subsys_state *css,
                              struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        lockdep_assert_held(&cgroup_mutex);

        cgroup_get_live(cgrp);

        memset(css, 0, sizeof(*css));
        css->cgroup = cgrp;
        css->ss = ss;
        /* no id yet; callers assign one after allocating it from ss->css_idr */
        css->id = -1;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
        INIT_LIST_HEAD(&css->rstat_css_node);
        css->serial_nr = css_serial_nr_next++;
        atomic_set(&css->online_cnt, 0);

        if (cgroup_parent(cgrp)) {
                css->parent = cgroup_css(cgroup_parent(cgrp), ss);
                css_get(css->parent);
        }

        /* only default-hierarchy csses with a flush callback join the rstat list */
        if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
                list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);

        /* @cgrp must not already have a css installed for @ss */
        BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new CSS and mark it online if successful */
/*
 * Bring @css online: call the subsystem's optional ->css_online() and, if
 * that succeeds, flag @css as online, publish it in its cgroup's subsys[]
 * array and bump the online counts of @css and its parent.
 *
 * Returns 0 on success or the error returned by ->css_online().
 */
static int online_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        if (ss->css_online) {
                err = ss->css_online(css);
                if (err)
                        return err;
        }

        css->flags |= CSS_ONLINE;
        rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

        /* one count for ourselves, one held against the parent */
        atomic_inc(&css->online_cnt);
        if (css->parent)
                atomic_inc(&css->parent->online_cnt);

        return 0;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;

        lockdep_assert_held(&cgroup_mutex);

        if (!(css->flags & CSS_ONLINE))
                return;

        if (ss->css_offline)
                ss->css_offline(css);

        css->flags &= ~CSS_ONLINE;
        RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

        wake_up_all(&css->cgroup->offline_waitq);
}

/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success, ERR_PTR(-errno) on
 * failure.
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
        struct cgroup_subsys_state *css;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* ->css_alloc() may return NULL or an ERR_PTR; treat NULL as -ENOMEM */
        css = ss->css_alloc(parent_css);
        if (!css)
                css = ERR_PTR(-ENOMEM);
        if (IS_ERR(css))
                return css;

        init_and_link_css(css, ss, cgrp);

        err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
        if (err)
                goto err_free_css;

        /* reserve the id with a NULL entry; @css is installed just below */
        err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
        if (err < 0)
                goto err_free_css;
        css->id = err;

        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
        cgroup_idr_replace(&ss->css_idr, css, css->id);

        err = online_css(css);
        if (err)
                goto err_list_del;

        /* warn once per subsystem when nesting below a first-level cgroup */
        if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
            cgroup_parent(parent)) {
                pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
                        current->comm, current->pid, ss->name);
                if (!strcmp(ss->name, "memory"))
                        pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
                ss->warned_broken_hierarchy = true;
        }

        return css;

err_list_del:
        list_del_rcu(&css->sibling);
err_free_css:
        /* freeing goes through the regular RCU-deferred css_free_rwork_fn() */
        list_del_rcu(&css->rstat_css_node);
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
        return ERR_PTR(err);
}

/*
 * Allocate and initialize a new child cgroup of @parent named @name.
 *
 * The returned cgroup is fully initialized including its control mask and
 * has its kernfs directory created (cgrp->kn), but the control mask hasn't
 * been applied yet and the directory is neither populated nor activated;
 * cgroup_mkdir() takes care of those steps.
 *
 * Returns the new cgroup on success, ERR_PTR(-errno) on failure.
 */
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
                                    umode_t mode)
{
        struct cgroup_root *root = parent->root;
        struct cgroup *cgrp, *tcgrp;
        struct kernfs_node *kn;
        int level = parent->level + 1;
        int ret;

        /* allocate the cgroup with room for level + 1 entries in ancestor_ids[] */
        cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
                       GFP_KERNEL);
        if (!cgrp)
                return ERR_PTR(-ENOMEM);

        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
        if (ret)
                goto out_free_cgrp;

        /* rstat is only set up for cgroups on the default hierarchy */
        if (cgroup_on_dfl(parent)) {
                ret = cgroup_rstat_init(cgrp);
                if (ret)
                        goto out_cancel_ref;
        }

        /* create the directory */
        kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
        if (IS_ERR(kn)) {
                ret = PTR_ERR(kn);
                goto out_stat_exit;
        }
        cgrp->kn = kn;

        init_cgroup_housekeeping(cgrp);

        cgrp->self.parent = &parent->self;
        cgrp->root = root;
        cgrp->level = level;

        ret = psi_cgroup_alloc(cgrp);
        if (ret)
                goto out_kernfs_remove;

        if (cgrp->root == &cgrp_dfl_root) {
                ret = cgroup_bpf_inherit(cgrp);
                if (ret)
                        goto out_psi_free;
        }

        /* nothing below this point can fail */

        /*
         * New cgroup inherits effective freeze counter, and
         * if the parent has to be frozen, the child has too.
         */
        cgrp->freezer.e_freeze = parent->freezer.e_freeze;
        if (cgrp->freezer.e_freeze) {
                /*
                 * Set the CGRP_FREEZE flag, so when a process will be
                 * attached to the child cgroup, it will become frozen.
                 * At this point the new cgroup is unpopulated, so we can
                 * consider it frozen immediately.
                 */
                set_bit(CGRP_FREEZE, &cgrp->flags);
                set_bit(CGRP_FROZEN, &cgrp->flags);
        }

        /* record the id of every ancestor (and self) and update their counters */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);

                if (tcgrp != cgrp) {
                        tcgrp->nr_descendants++;

                        /*
                         * If the new cgroup is frozen, all ancestor cgroups
                         * get a new frozen descendant, but their state can't
                         * change because of this.
                         */
                        if (cgrp->freezer.e_freeze)
                                tcgrp->freezer.nr_frozen_descendants++;
                }
        }
        spin_unlock_irq(&css_set_lock);

        /* these two flags are inherited from the parent */
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

        cgrp->self.serial_nr = css_serial_nr_next++;

        /* allocation complete, commit to creation */
        list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
        atomic_inc(&root->nr_cgrps);
        /* parent ref is put in css_free_rwork_fn() when @cgrp is freed */
        cgroup_get_live(parent);

        /*
         * On the default hierarchy, a child doesn't automatically inherit
         * subtree_control from the parent.  Each is configured manually.
         */
        if (!cgroup_on_dfl(cgrp))
                cgrp->subtree_control = cgroup_control(cgrp);

        cgroup_propagate_control(cgrp);

        return cgrp;

        /* error unwinding, in reverse order of acquisition */
out_psi_free:
        psi_cgroup_free(cgrp);
out_kernfs_remove:
        kernfs_remove(cgrp->kn);
out_stat_exit:
        if (cgroup_on_dfl(parent))
                cgroup_rstat_exit(cgrp);
out_cancel_ref:
        percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
        kfree(cgrp);
        return ERR_PTR(ret);
}

/*
 * Decide whether a new child may be created under @parent.
 *
 * Walks from @parent up to the root and fails if any cgroup on the way
 * has already reached its max_descendants, or if its max_depth is not
 * greater than its distance from @parent.  Must be called with
 * cgroup_mutex held.
 *
 * Returns true when creation is allowed, false otherwise.
 */
static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
        struct cgroup *pos = parent;
        int depth;

        lockdep_assert_held(&cgroup_mutex);

        for (depth = 0; pos; depth++, pos = cgroup_parent(pos)) {
                if (pos->nr_descendants >= pos->max_descendants)
                        return false;

                if (depth >= pos->max_depth)
                        return false;
        }

        return true;
}

/*
 * kernfs ->mkdir handler: create the cgroup @name with @mode below the
 * cgroup backing @parent_kn.
 *
 * Returns 0 on success, -EINVAL for a name containing a newline, -ENODEV
 * if the parent can't be locked live, -EAGAIN when a hierarchy limit
 * would be exceeded, or the error of whichever setup step failed.
 */
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
        struct cgroup *parent, *cgrp;
        int ret;

        /* a '\n' in the name would make /proc/<pid>/cgroup unparsable */
        if (strchr(name, '\n'))
                return -EINVAL;

        parent = cgroup_kn_lock_live(parent_kn, false);
        if (!parent)
                return -ENODEV;

        if (!cgroup_check_hierarchy_limits(parent)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        cgrp = cgroup_create(parent, name, mode);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
                goto out_unlock;
        }

        /*
         * Hold an extra ref on the kernfs node so that @cgrp->kn stays
         * accessible for the cgroup's whole lifetime; it is put by the
         * kernfs_put() in css_free_rwork_fn().
         */
        kernfs_get(cgrp->kn);

        /* set ownership, add the base files, then enable the controllers */
        ret = cgroup_kn_set_ugid(cgrp->kn);
        if (!ret)
                ret = css_populate_dir(&cgrp->self);
        if (!ret)
                ret = cgroup_apply_control_enable(cgrp);
        if (ret) {
                cgroup_destroy_locked(cgrp);
                goto out_unlock;
        }

        TRACE_CGROUP_PATH(mkdir, cgrp);

        /* everything is in place, activate the new directory */
        kernfs_activate(cgrp->kn);

out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
static void css_killed_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css;

        css = container_of(work, struct cgroup_subsys_state, destroy_work);

        mutex_lock(&cgroup_mutex);

        /*
         * Offline @css and drop the reference taken in kill_css(), then
         * move on to the parent for as long as releasing our count on
         * it brings its online_cnt down to zero.
         */
        for (;;) {
                offline_css(css);
                css_put(css);

                /* @css can't go away while we're holding cgroup_mutex */
                css = css->parent;
                if (!css || !atomic_dec_and_test(&css->online_cnt))
                        break;
        }

        mutex_unlock(&cgroup_mutex);
}

/*
 * Confirmation callback passed to percpu_ref_kill_and_confirm() by
 * kill_css().  Offlining needs process context, so once the last online
 * count is gone the work is bounced to css_killed_work_fn() on
 * cgroup_offline_wq.
 */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css;

        css = container_of(ref, struct cgroup_subsys_state, refcnt);

        /* somebody else still holds an online count; they will finish up */
        if (!atomic_dec_and_test(&css->online_cnt))
                return;

        INIT_WORK(&css->destroy_work, css_killed_work_fn);
        queue_work(cgroup_offline_wq, &css->destroy_work);
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
        lockdep_assert_held(&cgroup_mutex);

        /* killing is a one-shot operation; a dying css is left alone */
        if (css->flags & CSS_DYING)
                return;
        css->flags |= CSS_DYING;

        /*
         * The interface files have to go while @css is still associated
         * with its cgroup.  See seq_css() for details.
         */
        css_clear_dir(css);

        /*
         * The kill below puts the base ref.  Pin @css with an extra ref
         * so that it survives until ->css_offline() has run; that ref is
         * put by css_killed_work_fn().
         */
        css_get(css);

        /*
         * ->css_offline() may only be invoked once css_tryget_online()
         * can no longer hand out new references.  A plain
         * percpu_ref_kill() isn't enough for that as it doesn't guarantee
         * that the ref is seen as killed on all CPUs by the time it
         * returns.
         *
         * percpu_ref_kill_and_confirm() calls css_killed_ref_fn() once the
         * kill is confirmed to be visible on all CPUs, and offlining is
         * driven from there.
         */
        percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
/*
 * Returns 0 on success or -EBUSY when @cgrp is still populated or has
 * online children.  Must be called with cgroup_mutex held.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
        struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * Only migration can raise populated from zero and we're already
         * holding cgroup_mutex.
         */
        if (cgroup_is_populated(cgrp))
                return -EBUSY;

        /*
         * Make sure there's no live children.  We can't test emptiness of
         * ->self.children as dead children linger on it while being
         * drained; otherwise, "rmdir parent/child parent" may fail.
         */
        if (css_has_online_children(&cgrp->self))
                return -EBUSY;

        /*
         * Mark @cgrp and the associated csets dead.  The former prevents
         * further task migration and child creation by disabling
         * cgroup_lock_live_group().  The latter makes the csets ignored by
         * the migration path.
         */
        cgrp->self.flags &= ~CSS_ONLINE;

        spin_lock_irq(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                link->cset->dead = true;
        spin_unlock_irq(&css_set_lock);

        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);

        /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
        css_clear_dir(&cgrp->self);
        kernfs_remove(cgrp->kn);

        if (parent && cgroup_is_threaded(cgrp))
                parent->nr_threaded_children--;

        /*
         * Move @cgrp from "descendant" to "dying descendant" in every
         * ancestor.  The dying count is dropped again in
         * css_release_work_fn().
         */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                tcgrp->nr_descendants--;
                tcgrp->nr_dying_descendants++;
                /*
                 * If the dying cgroup is frozen, decrease frozen descendants
                 * counters of ancestor cgroups.
                 */
                if (test_bit(CGRP_FROZEN, &cgrp->flags))
                        tcgrp->freezer.nr_frozen_descendants--;
        }
        spin_unlock_irq(&css_set_lock);

        cgroup1_check_for_release(parent);

        if (cgrp->root == &cgrp_dfl_root)
                cgroup_bpf_offline(cgrp);

        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);

        return 0;
}

/*
 * kernfs ->rmdir handler: destroy the cgroup backing @kn.
 *
 * Returns 0 on success, and also when the cgroup can no longer be locked
 * live; otherwise the error from cgroup_destroy_locked().
 */
int cgroup_rmdir(struct kernfs_node *kn)
{
        struct cgroup *cgrp = cgroup_kn_lock_live(kn, false);
        int err;

        if (!cgrp)
                return 0;

        err = cgroup_destroy_locked(cgrp);
        if (err == 0)
                TRACE_CGROUP_PATH(rmdir, cgrp);

        cgroup_kn_unlock(kn);
        return err;
}

/*
 * kernfs syscall callbacks for cgroup directories: directory creation and
 * removal map to cgroup_mkdir() and cgroup_rmdir().
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .show_options                = cgroup_show_options,
        .mkdir                        = cgroup_mkdir,
        .rmdir                        = cgroup_rmdir,
        .show_path                = cgroup_show_path,
};

/*
 * Boot-time initialization of subsystem @ss: allocate its root css on the
 * default hierarchy's root cgroup, hook it into init_css_set, record which
 * optional callbacks it implements and bring the css online.
 *
 * @early is true when called from cgroup_init_early(); the css id is then
 * hard-coded to 1 here and allocated from the idr later by cgroup_init().
 * Any failure is fatal (BUG_ON).
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
        struct cgroup_subsys_state *css;

        pr_debug("Initializing cgroup subsys %s\n", ss->name);

        mutex_lock(&cgroup_mutex);

        idr_init(&ss->css_idr);
        INIT_LIST_HEAD(&ss->cfts);

        /* Create the root cgroup state for this subsystem */
        ss->root = &cgrp_dfl_root;
        css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
        init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

        /*
         * Root csses are never destroyed and we can't initialize
         * percpu_ref during early init.  Disable refcnting.
         */
        css->flags |= CSS_NO_REF;

        if (early) {
                /* allocation can't be done safely during early init */
                css->id = 1;
        } else {
                css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
                BUG_ON(css->id < 0);
        }

        /* Update the init_css_set to contain a subsys
         * pointer to this state - since the subsystem is
         * newly registered, all tasks and hence the
         * init_css_set is in the subsystem's root cgroup. */
        init_css_set.subsys[ss->id] = css;

        /* one bit per subsystem id in each have_*_callback mask */
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
        have_release_callback |= (bool)ss->release << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;

        /* At system boot, before all subsystems have been
         * registered, no tasks have been forked, so we don't
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));

        BUG_ON(online_css(css));

        mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
        /* only needed during init, hence static __initdata */
        static struct cgroup_fs_context __initdata ctx;
        struct cgroup_subsys *ss;
        int i;

        ctx.root = &cgrp_dfl_root;
        init_cgroup_root(&ctx);
        /* the default root's own css is not reference counted */
        cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

        RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

        for_each_subsys(ss, i) {
                /*
                 * Sanity check: css_alloc/css_free must be provided, and
                 * name/id must still be unset as they are assigned below.
                 */
                WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
                     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
                     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
                     ss->id, ss->name);
                WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
                     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

                ss->id = i;
                ss->name = cgroup_subsys_name[i];
                /* the legacy name defaults to the regular name */
                if (!ss->legacy_name)
                        ss->legacy_name = cgroup_subsys_name[i];

                /* the remaining subsystems are initialized by cgroup_init() */
                if (ss->early_init)
                        cgroup_init_subsys(ss, true);
        }
        return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 *
 * Sets up the default root, finishes the initialization of every
 * subsystem, registers the subsystems' control files and finally
 * registers the cgroup filesystem types and /proc/cgroups.
 *
 * Always returns 0; failures trigger BUG_ON() or WARN_ON() instead of
 * being propagated to the caller.
 */
int __init cgroup_init(void)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* build-time cap on the number of subsystems */
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();

        /*
         * The latency of the synchronize_rcu() is too high for cgroups,
         * avoid it at the cost of forcing all readers into the slow path.
         */
        rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

        get_user_ns(init_cgroup_ns.user_ns);

        mutex_lock(&cgroup_mutex);

        /*
         * Add init_css_set to the hash table so that dfl_root can link to
         * it during init.
         */
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

        mutex_unlock(&cgroup_mutex);

        for_each_subsys(ss, ssid) {
                if (ss->early_init) {
                        /*
                         * Already initialized by cgroup_init_early(); only
                         * the css id is still missing.  The [1, 2) range
                         * presumably pins the id to 1 - TODO confirm
                         * against cgroup_idr_alloc().
                         */
                        struct cgroup_subsys_state *css =
                                init_css_set.subsys[ss->id];

                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
                                                   GFP_KERNEL);
                        BUG_ON(css->id < 0);
                } else {
                        cgroup_init_subsys(ss, false);
                }

                list_add_tail(&init_css_set.e_cset_node[ssid],
                              &cgrp_dfl_root.cgrp.e_csets[ssid]);

                /*
                 * Setting dfl_root subsys_mask needs to consider the
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
                if (!cgroup_ssid_enabled(ssid))
                        continue;

                if (cgroup1_ssid_disabled(ssid))
                        printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
                               ss->name);

                cgrp_dfl_root.subsys_mask |= 1 << ss->id;

                /* implicit controllers must be threaded too */
                WARN_ON(ss->implicit_on_dfl && !ss->threaded);

                /* classify the subsystem into the default-hierarchy masks */
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

                if (ss->threaded)
                        cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

                /* register a shared cftype array once, distinct arrays separately */
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }

                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);

                mutex_lock(&cgroup_mutex);
                css_populate_dir(init_css_set.subsys[ssid]);
                mutex_unlock(&cgroup_mutex);
        }

        /* init_css_set.subsys[] has been updated, re-hash */
        hash_del(&init_css_set.hlist);
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        /* registration failures below are reported but not fatal */
        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
        WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
        WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

        return 0;
}

/*
 * cgroup_wq_init - allocate the cgroup offline, release and free workqueues
 *
 * Registered as a core_initcall.  Failing to allocate any of the three
 * workqueues is fatal (BUG_ON).  Always returns 0.
 */
static int __init cgroup_wq_init(void)
{
        /*
         * There isn't much point in executing destruction path in
         * parallel.  Good chunk is serialized with cgroup_mutex anyway.
         * Use 1 for @max_active.
         *
         * We would prefer to do this in cgroup_init() above, but that
         * is called before init_workqueues(): so leave this until after.
         */
        cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
        BUG_ON(!cgroup_offline_wq);

        cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
        BUG_ON(!cgroup_release_wq);

        cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
        BUG_ON(!cgroup_free_wq);
        return 0;
}
core_initcall(cgroup_wq_init);

/*
 * cgroup_path_from_kernfs_id - write the path of a kernfs node into @buf
 * @id: kernfs node id to look up on the default hierarchy
 * @buf: destination buffer
 * @buflen: size of @buf
 *
 * If no node with @id exists, @buf is left untouched.
 */
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
        struct kernfs_node *node =
                kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);

        if (node) {
                kernfs_path(node, buf, buflen);
                /* drop the reference taken by the lookup */
                kernfs_put(node);
        }
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *
 * Each line has the form "<hierarchy id>:<controllers,name=...>:<path>".
 * The path is resolved relative to the cgroup namespace of the reading
 * task (current), not of @tsk.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ENAMETOOLONG if a path does not fit in PATH_MAX, or the negative
 * value returned by cgroup_path_ns_locked().
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        int retval;
        struct cgroup_root *root;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        mutex_lock(&cgroup_mutex);
        spin_lock_irq(&css_set_lock);

        for_each_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int ssid, count = 0;

                /* skip the default hierarchy while it is not visible */
                if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
                        continue;

                seq_printf(m, "%d:", root->hierarchy_id);
                /* the controller list is only printed for non-default roots */
                if (root != &cgrp_dfl_root)
                        for_each_subsys(ss, ssid)
                                if (root->subsys_mask & (1 << ssid))
                                        seq_printf(m, "%s%s", count++ ? "," : "",
                                                   ss->legacy_name);
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');

                cgrp = task_cgroup_from_root(tsk, root);

                /*
                 * On traditional hierarchies, all zombie tasks show up as
                 * belonging to the root cgroup.  On the default hierarchy,
                 * while a zombie doesn't show up in "cgroup.procs" and
                 * thus can't be migrated, its /proc/PID/cgroup keeps
                 * reporting the cgroup it belonged to before exiting.  If
                 * the cgroup is removed before the zombie is reaped,
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
                        retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
                                                current->nsproxy->cgroup_ns);
                        /* a length of PATH_MAX or more means the path did not fit */
                        if (retval >= PATH_MAX)
                                retval = -ENAMETOOLONG;
                        if (retval < 0)
                                goto out_unlock;

                        seq_puts(m, buf);
                } else {
                        seq_puts(m, "/");
                }

                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
                        seq_puts(m, " (deleted)\n");
                else
                        seq_putc(m, '\n');
        }

        retval = 0;
out_unlock:
        /* release in the reverse order of acquisition */
        spin_unlock_irq(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        kfree(buf);
out:
        return retval;
}

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the newly created child process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
        /* start out on init_css_set with an empty, unlinked cg_list */
        RCU_INIT_POINTER(child->cgroups, &init_css_set);
        INIT_LIST_HEAD(&child->cg_list);
}

/*
 * cgroup_get_from_file - look up the cgroup behind an open directory file
 * @f: file whose dentry is handed to css_tryget_online_from_dir()
 *
 * Returns the cgroup when it is on the default hierarchy.  Returns the
 * error from css_tryget_online_from_dir() if that lookup fails, or
 * ERR_PTR(-EBADF), after dropping the cgroup via cgroup_put(), when the
 * cgroup is not on the default hierarchy.
 */
static struct cgroup *cgroup_get_from_file(struct file *f)
{
        struct cgroup_subsys_state *css =
                css_tryget_online_from_dir(f->f_path.dentry, NULL);

        if (IS_ERR(css))
                return ERR_CAST(css);

        if (cgroup_on_dfl(css->cgroup))
                return css->cgroup;

        /* only default hierarchy cgroups are accepted */
        cgroup_put(css->cgroup);
        return ERR_PTR(-EBADF);
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * Returns 0 on success.  On failure a negative errno is returned and
 * every lock and reference taken here has been dropped again.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
        __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
        int ret;
        struct cgroup *dst_cgrp = NULL;
        struct css_set *cset;
        struct super_block *sb;
        struct file *f;

        if (kargs->flags & CLONE_INTO_CGROUP)
                mutex_lock(&cgroup_mutex);

        cgroup_threadgroup_change_begin(current);

        /* pin the parent's css_set */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        /* plain fork: the child simply reuses the parent's css_set */
        if (!(kargs->flags & CLONE_INTO_CGROUP)) {
                kargs->cset = cset;
                return 0;
        }

        /* everything below, including the err path, is CLONE_INTO_CGROUP only */
        f = fget_raw(kargs->cgroup);
        if (!f) {
                ret = -EBADF;
                goto err;
        }
        sb = f->f_path.dentry->d_sb;

        dst_cgrp = cgroup_get_from_file(f);
        if (IS_ERR(dst_cgrp)) {
                ret = PTR_ERR(dst_cgrp);
                /* keep the err path from calling cgroup_put() on an ERR_PTR */
                dst_cgrp = NULL;
                goto err;
        }

        if (cgroup_is_dead(dst_cgrp)) {
                ret = -ENODEV;
                goto err;
        }

        /*
         * Verify that the target cgroup is writable for us. This is
         * usually done by the vfs layer but since we're not going through
         * the vfs layer here we need to do it "manually".
         */
        ret = cgroup_may_write(dst_cgrp, sb);
        if (ret)
                goto err;

        ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                        !(kargs->flags & CLONE_THREAD),
                                        current->nsproxy->cgroup_ns);
        if (ret)
                goto err;

        kargs->cset = find_css_set(cset, dst_cgrp);
        if (!kargs->cset) {
                ret = -ENOMEM;
                goto err;
        }

        /* the parent's css_set and the file are no longer needed */
        put_css_set(cset);
        fput(f);
        kargs->cgrp = dst_cgrp;
        return ret;

err:
        cgroup_threadgroup_change_end(current);
        mutex_unlock(&cgroup_mutex);
        if (f)
                fput(f);
        if (dst_cgrp)
                cgroup_put(dst_cgrp);
        put_css_set(cset);
        if (kargs->cset)
                put_css_set(kargs->cset);
        return ret;
}

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Ends the threadgroup change section and puts the prepared css_set, if
 * any.  When CLONE_INTO_CGROUP was requested, cgroup_mutex is unlocked
 * and the reference to the target cgroup is dropped as well.  The
 * pointers in @kargs are cleared once their references are gone.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        cgroup_threadgroup_change_end(current);

        if (kargs->cset) {
                put_css_set(kargs->cset);
                kargs->cset = NULL;
        }

        /* the mutex and the cgroup reference exist for CLONE_INTO_CGROUP only */
        if (!(kargs->flags & CLONE_INTO_CGROUP))
                return;

        mutex_unlock(&cgroup_mutex);
        if (kargs->cgrp) {
                cgroup_put(kargs->cgrp);
                kargs->cgrp = NULL;
        }
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Prepares the css_set the child will be attached to in
 * cgroup_post_fork() and then invokes the can_fork() callback of every
 * subsystem that provides one.  If a can_fork() callback returns an
 * error, cancel_fork() is called for the subsystems that precede the
 * failing one, the prepared css_set is released and the fork aborts
 * with that error code.  This lets a subsystem conditionally allow or
 * deny new forks.
 *
 * Returns 0 on success, a negative errno otherwise.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int failed_ssid, ssid, ret;

        ret = cgroup_css_set_fork(kargs);
        if (ret)
                return ret;

        do_each_subsys_mask(ss, failed_ssid, have_canfork_callback) {
                ret = ss->can_fork(child, kargs->cset);
                if (ret)
                        goto out_revert;
        } while_each_subsys_mask();

        return 0;

out_revert:
        /* undo only the subsystems that come before the one that failed */
        for_each_subsys(ss, ssid) {
                if (ssid >= failed_ssid)
                        break;
                if (!ss->cancel_fork)
                        continue;
                ss->cancel_fork(child, kargs->cset);
        }

        cgroup_css_set_put_fork(kargs);

        return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
                        struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                if (ss->cancel_fork)
                        ss->cancel_fork(child, kargs->cset);

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.  Ownership of the css_set prepared in @kargs->cset is taken
 * over here; the remaining fork-time state is released through
 * cgroup_css_set_put_fork() before returning.
 */
void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        /* take over the css_set so cgroup_css_set_put_fork() won't put it */
        cset = kargs->cset;
        kargs->cset = NULL;

        spin_lock_irq(&css_set_lock);

        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
        } else {
                put_css_set(cset);
                cset = NULL;
        }

        /*
         * If the cgroup has to be frozen, the new task has too.  Let's set
         * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
         * frozen state.
         */
        if (unlikely(cgroup_task_freeze(child))) {
                spin_lock(&child->sighand->siglock);
                WARN_ON_ONCE(child->frozen);
                child->jobctl |= JOBCTL_TRAP_FREEZE;
                spin_unlock(&child->sighand->siglock);

                /*
                 * Calling cgroup_update_frozen() isn't required here,
                 * because it will be called anyway a bit later from
                 * do_freezer_trap(). So we avoid cgroup's transient switch
                 * from the frozen state and back.
                 */
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * Call ss->fork().  This must happen after @child is linked on
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();

        /* Make the new cset the root_cset of the new cgroup namespace. */
        if (kargs->flags & CLONE_NEWCGROUP) {
                /*
                 * NOTE(review): cset is NULL here when child->pid is 0;
                 * presumably such tasks never pass CLONE_NEWCGROUP - confirm.
                 */
                struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

                get_css_set(cset);
                child->nsproxy->cgroup_ns->root_cset = cset;
                put_css_set(rcset);
        }

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 *
 * Under css_set_lock, @tsk is moved off its css_set's task list and put
 * on that css_set's dying_tasks list, and the task count is decremented.
 * The exit() callbacks of the subsystems are invoked afterwards, without
 * the lock held.
 */
void cgroup_exit(struct task_struct *tsk)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        spin_lock_irq(&css_set_lock);

        WARN_ON_ONCE(list_empty(&tsk->cg_list));
        cset = task_css_set(tsk);
        css_set_move_task(tsk, cset, NULL, false);
        list_add_tail(&tsk->cg_list, &cset->dying_tasks);
        cset->nr_tasks--;

        /* presumably keeps cpuset's deadline task accounting in sync - TODO confirm */
        if (dl_task(tsk))
                dec_dl_tasks_cs(tsk);

        WARN_ON_ONCE(cgroup_task_frozen(tsk));
        if (unlikely(cgroup_task_freeze(tsk)))
                cgroup_update_frozen(task_dfl_cgroup(tsk));

        spin_unlock_irq(&css_set_lock);

        /* see cgroup_post_fork() for details */
        do_each_subsys_mask(ss, i, have_exit_callback) {
                ss->exit(tsk);
        } while_each_subsys_mask();
}

/*
 * cgroup_release - run release callbacks and unlink @task from its css_set
 * @task: the task being released
 *
 * Calls the release() callback of every subsystem that provides one,
 * then, under css_set_lock, makes task iterators skip @task and removes
 * it from the cg_list it is linked on.
 */
void cgroup_release(struct task_struct *task)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        do_each_subsys_mask(ss, i, have_release_callback) {
                ss->release(task);
        } while_each_subsys_mask();

        spin_lock_irq(&css_set_lock);
        cset = task_css_set(task);
        css_set_skip_task_iters(cset, task);
        list_del_init(&task->cg_list);
        spin_unlock_irq(&css_set_lock);
}

/*
 * cgroup_free - drop @task's reference on its css_set
 * @task: the task being freed
 */
void cgroup_free(struct task_struct *task)
{
        put_css_set(task_css_set(task));
}

/*
 * cgroup_disable - handler for the "cgroup_disable=" boot parameter
 * @str: comma separated list of subsystem names
 *
 * Every subsystem whose name or legacy name matches an entry of the list
 * gets its enabled static key switched off.  Empty entries are ignored.
 * Always returns 1.
 */
static int __init cgroup_disable(char *str)
{
        struct cgroup_subsys *ss;
        char *name;
        int ssid;

        for (name = strsep(&str, ","); name; name = strsep(&str, ",")) {
                if (*name == '\0')
                        continue;

                for_each_subsys(ss, ssid) {
                        if (!strcmp(name, ss->name) ||
                            !strcmp(name, ss->legacy_name)) {
                                static_branch_disable(cgroup_subsys_enabled_key[ssid]);
                                pr_info("Disabling %s control group subsystem\n",
                                        ss->name);
                        }
                }
        }

        return 1;
}
__setup("cgroup_disable=", cgroup_disable);

/* Weak no-op default; a strong definition elsewhere may override it. */
void __init __weak enable_debug_cgroup(void) { }

/*
 * Handler for the "cgroup_debug" boot parameter: sets cgroup_debug and
 * calls enable_debug_cgroup().  @str is unused.  Always returns 1.
 */
static int __init enable_cgroup_debug(char *str)
{
        cgroup_debug = true;
        enable_debug_cgroup();
        return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 *
 * Returns ERR_PTR(-EBADF) if @dentry is not a directory on a cgroup or
 * cgroup2 filesystem and ERR_PTR(-ENOENT) if there is no css or it
 * could not be pinned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss)
{
        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
        struct file_system_type *s_type = dentry->d_sb->s_type;
        struct cgroup_subsys_state *css = NULL;
        struct cgroup *cgrp;

        /* is @dentry a cgroup dir? */
        if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
            !kn || kernfs_type(kn) != KERNFS_DIR)
                return ERR_PTR(-EBADF);

        rcu_read_lock();

        /*
         * This path doesn't originate from kernfs and @kn could already
         * have been or be removed at any point.  @kn->priv is RCU
         * protected for this access.  See css_release_work_fn() for details.
         */
        cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
        if (cgrp)
                css = cgroup_css(cgrp, ss);

        /* covers a cleared @kn->priv, a missing css and a failed tryget */
        if (!css || !css_tryget_online(css))
                css = ERR_PTR(-ENOENT);

        rcu_read_unlock();
        return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock(); a one-time warning is issued
 * if it is not.  No reference is taken on the returned css here.
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return idr_find(&ss->css_idr, id);
}

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Walks @path from the root of the default hierarchy under cgroup_mutex.
 * On success a reference is taken on the cgroup that was found and the
 * cgroup is returned.  Returns ERR_PTR(-ENOENT) if @path doesn't exist
 * and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
        struct cgroup *cgrp = ERR_PTR(-ENOENT);
        struct kernfs_node *kn;

        mutex_lock(&cgroup_mutex);

        kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
        if (!kn)
                goto out_unlock;

        if (kernfs_type(kn) != KERNFS_DIR) {
                cgrp = ERR_PTR(-ENOTDIR);
        } else {
                cgrp = kn->priv;
                cgroup_get_live(cgrp);
        }
        /* the kernfs node reference was only needed for the lookup */
        kernfs_put(kn);

out_unlock:
        mutex_unlock(&cgroup_mutex);
        return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Resolves @fd, which should refer to an open cgroup directory, to its
 * cgroup.  Returns the cgroup on success, ERR_PTR(-EBADF) if @fd is not
 * an open file, or the ERR_PTR reported by cgroup_get_from_file().
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
        struct file *f = fget_raw(fd);
        struct cgroup *cgrp;

        if (f) {
                cgrp = cgroup_get_from_file(f);
                fput(f);
        } else {
                cgrp = ERR_PTR(-EBADF);
        }

        return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

/* Returns 10 raised to @power as a u64; the result is not range checked. */
static u64 power_of_ten(int power)
{
        u64 result = 1;

        for (; power; power--)
                result *= 10;

        return result;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times.  Fractional digits
 * beyond @dec_shift are rounded to the closest value.  For example, if
 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12346.
 *
 * Returns 0 on success, -EINVAL if nothing could be parsed or the
 * fractional part is negative.
 *
 * There's nothing cgroup specific about this function except that it's
 * currently the only user.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
        s64 whole, frac = 0;
        int fstart = 0, fend = 0, flen;

        if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
                return -EINVAL;
        if (frac < 0)
                return -EINVAL;

        /* number of characters consumed by the fractional part, if any */
        flen = fend > fstart ? fend - fstart : 0;
        if (flen < dec_shift) {
                frac *= power_of_ten(dec_shift - flen);
        } else {
                unsigned int excess = flen - dec_shift;
                u64 scaled = frac;

                /*
                 * DIV_ROUND_CLOSEST_ULL() only supports a divisor that
                 * fits in 32 bits and 10^9 is the largest such power of
                 * ten.  Drop surplus low-order digits one at a time first;
                 * truncating them cannot move the value across a rounding
                 * boundary, so the final rounding is unaffected.  Without
                 * this, a long run of fractional digits hands the macro a
                 * truncated, possibly zero, divisor.
                 */
                while (excess > 9) {
                        do_div(scaled, 10);
                        excess--;
                }
                frac = DIV_ROUND_CLOSEST_ULL(scaled, power_of_ten(excess));
        }

        *v = whole * power_of_ten(dec_shift) + frac;
        return 0;
}

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * cgroup_sk_alloc - associate a new socket's cgroup data with a cgroup
 * @skcd: the sock_cgroup_data to fill in
 *
 * Stores in @skcd the default hierarchy cgroup of the current task, or
 * the default root cgroup when called from interrupt context, with a
 * cgroup reference and a cgroup_bpf reference held.
 */
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgroup;

        rcu_read_lock();
        /* Don't associate the sock with unrelated interrupted task's cgroup. */
        if (in_interrupt()) {
                cgroup = &cgrp_dfl_root.cgrp;
                cgroup_get(cgroup);
                goto out;
        }

        /* retry until a reference on the task's current dfl cgroup is obtained */
        while (true) {
                struct css_set *cset;

                cset = task_css_set(current);
                if (likely(cgroup_tryget(cset->dfl_cgrp))) {
                        cgroup = cset->dfl_cgrp;
                        break;
                }
                cpu_relax();
        }
out:
        skcd->cgroup = cgroup;
        cgroup_bpf_get(cgroup);
        rcu_read_unlock();
}

/*
 * cgroup_sk_clone - take the references needed for a cloned socket
 * @skcd: the sock_cgroup_data of the socket being cloned
 *
 * Takes one more cgroup reference and one more cgroup_bpf reference on
 * the cgroup that @skcd points to.
 */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = sock_cgroup_ptr(skcd);

        /*
         * We might be cloning a socket which is left in an empty
         * cgroup and the cgroup might have already been rmdir'd.
         * Don't use cgroup_get_live().
         */
        cgroup_get(cgrp);
        cgroup_bpf_get(cgrp);
}

/*
 * cgroup_sk_free - drop the references held through a socket's cgroup data
 * @skcd: the sock_cgroup_data of the socket being freed
 *
 * Puts the cgroup_bpf reference and then the cgroup reference on the
 * cgroup that @skcd points to.
 */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = sock_cgroup_ptr(skcd);

        cgroup_bpf_put(cgrp);
        cgroup_put(cgrp);
}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
/*
 * cgroup_bpf_attach - call __cgroup_bpf_attach() with cgroup_mutex held
 *
 * All arguments are passed through unchanged; the return value is that
 * of __cgroup_bpf_attach().
 */
int cgroup_bpf_attach(struct cgroup *cgrp,
                      struct bpf_prog *prog, struct bpf_prog *replace_prog,
                      struct bpf_cgroup_link *link,
                      enum bpf_attach_type type,
                      u32 flags)
{
        int err;

        mutex_lock(&cgroup_mutex);
        err = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
        mutex_unlock(&cgroup_mutex);

        return err;
}

/*
 * cgroup_bpf_detach - call __cgroup_bpf_detach() with cgroup_mutex held
 *
 * The link argument of __cgroup_bpf_detach() is passed as NULL; the
 * return value is that of __cgroup_bpf_detach().
 */
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                      enum bpf_attach_type type)
{
        int err;

        mutex_lock(&cgroup_mutex);
        err = __cgroup_bpf_detach(cgrp, prog, NULL, type);
        mutex_unlock(&cgroup_mutex);

        return err;
}

/*
 * cgroup_bpf_query - call __cgroup_bpf_query() with cgroup_mutex held
 *
 * All arguments are passed through unchanged; the return value is that
 * of __cgroup_bpf_query().
 */
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                     union bpf_attr __user *uattr)
{
        int err;

        mutex_lock(&cgroup_mutex);
        err = __cgroup_bpf_query(cgrp, attr, uattr);
        mutex_unlock(&cgroup_mutex);

        return err;
}
#endif /* CONFIG_CGROUP_BPF */

#ifdef CONFIG_SYSFS
/*
 * show_delegatable_files - list the CFTYPE_NS_DELEGATABLE files of a cftype array
 * @files: cftype array terminated by an entry with an empty name; may be NULL
 * @buf: output buffer
 * @size: space available in @buf
 * @prefix: optional prefix, emitted as "<prefix>." in front of each name
 *
 * Writes one "<name>\n" line per delegatable file.  Returns the sum of
 * the snprintf() return values, which reaches or exceeds @size when the
 * output was truncated; that case triggers a WARN and stops the loop.
 */
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                                      ssize_t size, const char *prefix)
{
        struct cftype *cft;
        ssize_t ret = 0;

        for (cft = files; cft && cft->name[0] != '\0'; cft++) {
                if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
                        continue;

                if (prefix)
                        ret += snprintf(buf + ret, size - ret, "%s.", prefix);

                /*
                 * NOTE(review): if the prefix alone overflowed, size - ret
                 * is already <= 0 here; safety then depends on how
                 * snprintf() treats such a size - confirm.
                 */
                ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

                if (WARN_ON(ret >= size))
                        break;
        }

        return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
{
        struct cgroup_subsys *ss;
        int ssid;
        ssize_t ret = 0;

        ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
                                     NULL);

        for_each_subsys(ss, ssid)
                ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
                                              PAGE_SIZE - ret,
                                              cgroup_subsys_name[ssid]);

        return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

/*
 * features_show - show handler of the "features" sysfs attribute
 *
 * Emits a fixed list of feature names, one per line, and returns the
 * snprintf() result.
 */
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
{
        return snprintf(buf, PAGE_SIZE,
                        "nsdelegate\n"
                        "memory_localevents\n"
                        "memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

/* NULL-terminated list of the attributes in the "cgroup" sysfs group */
static struct attribute *cgroup_sysfs_attrs[] = {
        &cgroup_delegate_attr.attr,
        &cgroup_features_attr.attr,
        NULL,
};

/* attribute group named "cgroup", created under kernel_kobj below */
static const struct attribute_group cgroup_sysfs_attr_group = {
        .attrs = cgroup_sysfs_attrs,
        .name = "cgroup",
};

/*
 * Creates the "cgroup" attribute group under kernel_kobj at
 * subsys_initcall time and returns the sysfs_create_group() result.
 */
static int __init cgroup_sysfs_init(void)
{
        return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

#endif /* CONFIG_SYSFS */



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PART_STAT_H
#define _LINUX_PART_STAT_H

#include <linux/genhd.h>

/*
 * Statistics record; the macros in this header access it per CPU
 * through the dkstats pointer of a partition.
 */
struct disk_stats {
        /* the four arrays below hold one slot per stat group */
        u64 nsecs[NR_STAT_GROUPS];
        unsigned long sectors[NR_STAT_GROUPS];
        unsigned long ios[NR_STAT_GROUPS];
        unsigned long merges[NR_STAT_GROUPS];
        unsigned long io_ticks;
        /* local_t counters, accessed via the part_stat_local_*() macros */
        local_t in_flight[2];
};

/*
 * Macros to operate on percpu disk statistics:
 *
 * part_stat_{add|sub|inc|dec}() modify the stat counters and should
 * be called between part_stat_lock() and part_stat_unlock(), which
 * disable and re-enable preemption.
 *
 * part_stat_read() can be called at any time.
 */
#define part_stat_lock()        preempt_disable()
#define part_stat_unlock()        preempt_enable()

/* lvalue of @field in the copy of @part's statistics that belongs to @cpu */
#define part_stat_get_cpu(part, field, cpu)                                \
        (per_cpu_ptr((part)->dkstats, (cpu))->field)

/* same as part_stat_get_cpu(), for the CPU the caller is running on */
#define part_stat_get(part, field)                                        \
        part_stat_get_cpu(part, field, smp_processor_id())

/*
 * Sum of @field over all possible CPUs.  @part is evaluated more than
 * once, so it must not have side effects.
 */
#define part_stat_read(part, field)                                        \
({                                                                        \
        typeof((part)->dkstats->field) res = 0;                                \
        unsigned int _cpu;                                                \
        for_each_possible_cpu(_cpu)                                        \
                res += per_cpu_ptr((part)->dkstats, _cpu)->field;        \
        res;                                                                \
})

static inline void part_stat_set_all(struct hd_struct *part, int value)
{
        int i;

        for_each_possible_cpu(i)
                memset(per_cpu_ptr(part->dkstats, i), value,
                                sizeof(struct disk_stats));
}

/* sum of part_stat_read() over the STAT_READ, STAT_WRITE and STAT_DISCARD slots */
#define part_stat_read_accum(part, field)                                \
        (part_stat_read(part, field[STAT_READ]) +                        \
         part_stat_read(part, field[STAT_WRITE]) +                        \
         part_stat_read(part, field[STAT_DISCARD]))

/* add @addnd to @field in this CPU's copy of @part's statistics */
#define __part_stat_add(part, field, addnd)                                \
        __this_cpu_add((part)->dkstats->field, addnd)

/*
 * Add @addnd to @field of @part and, when @part has a non-zero partno,
 * also to part0 of the disk it belongs to.  @part is evaluated more than
 * once, so it must not have side effects.
 */
#define part_stat_add(part, field, addnd)        do {                        \
        __part_stat_add((part), field, addnd);                                \
        if ((part)->partno)                                                \
                __part_stat_add(&part_to_disk((part))->part0,                \
                                field, addnd);                                \
} while (0)

/* decrement / increment @field by one through part_stat_add() */
#define part_stat_dec(gendiskp, field)                                        \
        part_stat_add(gendiskp, field, -1)
#define part_stat_inc(gendiskp, field)                                        \
        part_stat_add(gendiskp, field, 1)
/*
 * Subtract @subnd from @field.  @subnd is parenthesised so that an
 * expression argument such as "a - b" is negated as a whole.
 */
#define part_stat_sub(gendiskp, field, subnd)                                \
        part_stat_add(gendiskp, field, -(subnd))

/*
 * local_t accessors.  The dec/inc/read variants go through
 * part_stat_get() and so act on the calling CPU's copy; the _cpu
 * variant reads the copy of the given @cpu.
 */
#define part_stat_local_dec(gendiskp, field)                                \
        local_dec(&(part_stat_get(gendiskp, field)))
#define part_stat_local_inc(gendiskp, field)                                \
        local_inc(&(part_stat_get(gendiskp, field)))
#define part_stat_local_read(gendiskp, field)                                \
        local_read(&(part_stat_get(gendiskp, field)))
#define part_stat_local_read_cpu(gendiskp, field, cpu)                        \
        local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))

#endif /* _LINUX_PART_STAT_H */











































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP module.
 *
 * Version:        @(#)ip.h        1.0.2        05/07/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Changes:
 *                Mike McLagan    :       Routing by source
 */
#ifndef _IP_H
#define _IP_H

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>

#include <net/inet_sock.h>
#include <net/route.h>
#include <net/snmp.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/netns/hash.h>
#include <net/lwtunnel.h>

#define IPV4_MAX_PMTU                65535U                /* RFC 2675, Section 5.1 */
#define IPV4_MIN_MTU                68                        /* RFC 791 */

extern unsigned int sysctl_fib_sync_mem;
extern unsigned int sysctl_fib_sync_mem_min;
extern unsigned int sysctl_fib_sync_mem_max;

struct sock;

/*
 * IPv4 per-skb control data.  IPCB() reinterprets skb->cb as this
 * structure.
 */
struct inet_skb_parm {
        /* interface index; inet_sdif() returns it when IPSKB_L3SLAVE is set */
        int                        iif;
        struct ip_options        opt;                /* Compiled IP options                */
        /* bitmask of the IPSKB_* values defined below */
        u16                        flags;

#define IPSKB_FORWARDED                BIT(0)
#define IPSKB_XFRM_TUNNEL_SIZE        BIT(1)
#define IPSKB_XFRM_TRANSFORMED        BIT(2)
#define IPSKB_FRAG_COMPLETE        BIT(3)
#define IPSKB_REROUTED                BIT(4)
#define IPSKB_DOREDIRECT        BIT(5)
#define IPSKB_FRAG_PMTU                BIT(6)
#define IPSKB_L3SLAVE                BIT(7)
#define IPSKB_NOPOLICY                BIT(8)
#define IPSKB_MULTIPATH                BIT(9)

        /* NOTE(review): presumably the largest fragment size seen - confirm */
        u16                        frag_max_size;
};

/* Tell whether the IPSKB_L3SLAVE bit is set in @flags. */
static inline bool ipv4_l3mdev_skb(u16 flags)
{
        return (flags & IPSKB_L3SLAVE) != 0;
}

static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
{
        return ip_hdr(skb)->ihl * 4;
}

/*
 * Per-call IPv4 control-message state.  Initialised by ipcm_init() /
 * ipcm_init_sk() and passed to ip_cmsg_send(), ip_append_data() and
 * ip_make_skb().
 */
struct ipcm_cookie {
        struct sockcm_cookie        sockc;
        __be32                        addr;
        int                        oif;
        struct ip_options_rcu        *opt;
        __u8                        protocol;
        __u8                        ttl;
        /* -1 means "not set": get_rttos()/get_rtconn_flags() then use the socket */
        __s16                        tos;
        char                        priority;
        __u16                        gso_size;
};

static inline void ipcm_init(struct ipcm_cookie *ipcm)
{
        *ipcm = (struct ipcm_cookie) { .tos = -1 };
}

static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
                                const struct inet_sock *inet)
{
        ipcm_init(ipcm);

        ipcm->sockc.mark = inet->sk.sk_mark;
        ipcm->sockc.tsflags = inet->sk.sk_tsflags;
        ipcm->oif = inet->sk.sk_bound_dev_if;
        ipcm->addr = inet->inet_saddr;
        ipcm->protocol = inet->inet_num;
}

/*
 * Views of the skb control buffer: IPCB() casts skb->cb to
 * struct inet_skb_parm, PKTINFO_SKB_CB() casts the same storage to
 * struct in_pktinfo.
 */
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
#define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb))

/*
 * Return the enslaved device index if relevant: IPCB(skb)->iif when
 * CONFIG_NET_L3_MASTER_DEV is enabled, @skb is non-NULL and its
 * IPSKB_L3SLAVE flag is set; 0 in every other case.
 */
static inline int inet_sdif(struct sk_buff *skb)
{
        int sdif = 0;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
                sdif = IPCB(skb)->iif;
#endif
        return sdif;
}

/* Special input handler for packets caught by the router alert option.
   They are selected only by protocol field, and then processed like
   local ones; but only if someone wants them! Otherwise, a router
   not running rsvpd will kill RSVP.

   It is a user-level problem what it will do with them.
   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
   but the receiver should be clever enough, e.g., to forward mtrace requests
   sent to a multicast group to reach the destination designated router.
 */

/*
 * One node of an RCU-protected singly linked chain of sockets used by the
 * router-alert input handling.
 */
struct ip_ra_chain {
        struct ip_ra_chain __rcu *next;        /* next node in the chain */
        struct sock                *sk;
        /* the two members share storage; only one is meaningful at a time */
        union {
                void                        (*destructor)(struct sock *);
                struct sock                *saved_sk;
        };
        struct rcu_head                rcu;
};

/* IP flags. */
/*
 * Host-order masks; ip_is_fragment() and ip_select_ident_segs() pass them
 * through htons() before testing them against iphdr->frag_off.
 */
#define IP_CE                0x8000                /* Flag: "Congestion"                */
#define IP_DF                0x4000                /* Flag: "Don't Fragment"        */
#define IP_MF                0x2000                /* Flag: "More Fragments"        */
#define IP_OFFSET        0x1FFF                /* "Fragment Offset" part        */

#define IP_FRAG_TIME        (30 * HZ)                /* fragment lifetime        */

struct msghdr;
struct net_device;
struct packet_type;
struct rtable;
struct sockaddr;

int igmp_mc_init(void);

/*
 *        Functions provided by ip.c
 */

int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
                          __be32 saddr, __be32 daddr,
                          struct ip_options_rcu *opt, u8 tos);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev);
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
                 struct net_device *orig_dev);
int ip_local_deliver(struct sk_buff *skb);
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
int ip_mr_input(struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                   int (*output)(struct net *, struct sock *, struct sk_buff *));

/*
 * Cursor over a fragment list.  Set up by ip_fraglist_init() and advanced
 * by ip_fraglist_next().
 */
struct ip_fraglist_iter {
        struct sk_buff        *frag;                /* skb ip_fraglist_next() returns next */
        struct iphdr        *iph;
        int                offset;
        unsigned int        hlen;
};

void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
                      unsigned int hlen, struct ip_fraglist_iter *iter);
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter);

/*
 * Detach and return the skb @iter currently points at, moving the cursor
 * on to that skb's ->next.  The returned skb is marked as not being on a
 * list.
 */
static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
{
        struct sk_buff *cur = iter->frag;
        struct sk_buff *following = cur->next;

        iter->frag = following;
        skb_mark_not_on_list(cur);

        return cur;
}

/*
 * Fragmentation state initialised by ip_frag_init() and consumed by
 * ip_frag_next().  DF, hlen, ll_rs and mtu mirror ip_frag_init()'s
 * parameters of the same names.
 */
struct ip_frag_state {
        bool                DF;
        unsigned int        hlen;
        unsigned int        ll_rs;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        __be16                not_last_frag;
};

void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs,
                  unsigned int mtu, bool DF, struct ip_frag_state *state);
struct sk_buff *ip_frag_next(struct sk_buff *skb,
                             struct ip_frag_state *state);

void ip_send_check(struct iphdr *ip);
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                    __u8 tos);
void ip_init(void);
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int len, int protolen,
                   struct ipcm_cookie *ipc,
                   struct rtable **rt,
                   unsigned int flags);
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
                       struct sk_buff *skb);
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
                       int offset, size_t size, int flags);
struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork);
int ip_send_skb(struct net *net, struct sk_buff *skb);
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4);
void ip_flush_pending_frames(struct sock *sk);
struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
                            struct inet_cork *cork, unsigned int flags);

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);

/*
 * Call __ip_make_skb() on the socket's own write queue and its base cork
 * state, returning whatever __ip_make_skb() returns.
 */
static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff_head *queue = &sk->sk_write_queue;
        struct inet_cork *cork = &inet_sk(sk)->cork.base;

        return __ip_make_skb(sk, fl4, queue, cork);
}

/*
 * RT_TOS() of the cookie's tos, or of the socket's tos when the cookie
 * still carries the "unset" value -1.
 */
static inline __u8 get_rttos(struct ipcm_cookie *ipc, struct inet_sock *inet)
{
        if (ipc->tos == -1)
                return RT_TOS(inet->tos);

        return RT_TOS(ipc->tos);
}

/*
 * RT_CONN_FLAGS_TOS() with the cookie's tos, or plain RT_CONN_FLAGS() of
 * the socket when the cookie's tos is still the "unset" value -1.
 */
static inline __u8 get_rtconn_flags(struct ipcm_cookie *ipc, struct sock *sk)
{
        if (ipc->tos == -1)
                return RT_CONN_FLAGS(sk);

        return RT_CONN_FLAGS_TOS(sk, ipc->tos);
}

/* datagram.c */
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);

void ip4_datagram_release_cb(struct sock *sk);

/* Argument bundle taken by ip_send_unicast_reply(). */
struct ip_reply_arg {
        struct kvec iov[1];
        /* IP_REPLY_ARG_* bits; tested by ip_reply_arg_flowi_flags() */
        int            flags;
        __wsum             csum;
        int            csumoffset; /* u16 offset of csum in iov[0].iov_base */
                                /* -1 if not needed */
        int            bound_dev_if;
        u8              tos;
        kuid_t            uid;
};

#define IP_REPLY_ARG_NOSRCCHECK 1

/*
 * Translate @arg->flags into flowi flags: FLOWI_FLAG_ANYSRC when
 * IP_REPLY_ARG_NOSRCCHECK is set, 0 otherwise.
 */
static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
{
        if (arg->flags & IP_REPLY_ARG_NOSRCCHECK)
                return FLOWI_FLAG_ANYSRC;

        return 0;
}

void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
                           const struct ip_options *sopt,
                           __be32 daddr, __be32 saddr,
                           const struct ip_reply_arg *arg,
                           unsigned int len, u64 transmit_time);

/*
 * Per-netns MIB counter helpers.  The IP_* macros forward to the 64-bit
 * SNMP_*_STATS64 helpers on (net)->mib.ip_statistics; the NET_* macros
 * forward to the SNMP_*_STATS helpers on (net)->mib.net_statistics.
 * Each double-underscore variant forwards to the double-underscore SNMP
 * helper of the same name.
 */
#define IP_INC_STATS(net, field)        SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define __IP_INC_STATS(net, field)        __SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define IP_ADD_STATS(net, field, val)        SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define NET_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.net_statistics, field)
#define __NET_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.net_statistics, field)
#define NET_ADD_STATS(net, field, adnd)        SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)
#define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)

u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offct);
unsigned long snmp_fold_field(void __percpu *mib, int offt);
#if BITS_PER_LONG==32
u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                         size_t syncp_offset);
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off);
#else
/*
 * Variant used when BITS_PER_LONG is not 32: forwards to
 * snmp_get_cpu_field(); @syncp_offset is accepted but not used.
 */
static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                                       size_t syncp_offset)
{
        u64 value = snmp_get_cpu_field(mib, cpu, offct);

        return value;
}

/*
 * Variant used when BITS_PER_LONG is not 32: forwards to
 * snmp_fold_field(); @syncp_off is accepted but not used.
 */
static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off)
{
        u64 total = snmp_fold_field(mib, offt);

        return total;
}
#endif

/*
 * For every possible CPU, add snmp_get_cpu_field64() of each entry of
 * @stats_list (walked until an entry whose .name is NULL) into
 * @buff64[i].  Values are accumulated with +=, so the caller decides the
 * starting contents of @buff64.
 *
 * NOTE: expands to a bare brace block that declares locals "i" and "c";
 * arguments must not be expressions that use those names.
 */
#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; stats_list[i].name; i++) \
                        buff64[i] += snmp_get_cpu_field64( \
                                        mib_statistic, \
                                        c, stats_list[i].entry, \
                                        offset); \
        } \
}

/*
 * For every possible CPU, add snmp_get_cpu_field() of each entry of
 * @stats_list (walked until an entry whose .name is NULL) into @buff[i].
 * Values are accumulated with +=, so the caller decides the starting
 * contents of @buff.
 *
 * NOTE: expands to a bare brace block that declares locals "i" and "c";
 * arguments must not be expressions that use those names.
 */
#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; stats_list[i].name; i++) \
                        buff[i] += snmp_get_cpu_field( \
                                                mib_statistic, \
                                                c, stats_list[i].entry); \
        } \
}

void inet_get_local_port_range(struct net *net, int *low, int *high);

#ifdef CONFIG_SYSCTL
/*
 * True when @port's bit is set in the netns reserved-ports bitmap; false
 * when the bitmap pointer is NULL or the bit is clear.
 */
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
{
        const unsigned long *reserved = net->ipv4.sysctl_local_reserved_ports;

        return reserved && test_bit(port, reserved);
}

/* A device name is allowed unless it is exactly "default" or "all". */
static inline bool sysctl_dev_name_is_allowed(const char *name)
{
        if (strcmp(name, "default") == 0)
                return false;
        if (strcmp(name, "all") == 0)
                return false;

        return true;
}

/* True when @port lies below the netns sysctl_ip_prot_sock threshold. */
static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
{
        return READ_ONCE(net->ipv4.sysctl_ip_prot_sock) > port;
}

#else
/* Without CONFIG_SYSCTL no port is ever reported as reserved. */
static inline bool inet_is_local_reserved_port(struct net *net,
                                               unsigned short port)
{
        return false;
}

/* Without CONFIG_SYSCTL the threshold is the constant PROT_SOCK. */
static inline bool inet_port_requires_bind_service(struct net *net,
                                                   unsigned short port)
{
        return PROT_SOCK > port;
}
#endif

__be32 inet_current_timestamp(void);

/* From inetpeer.c */
extern int inet_peer_threshold;
extern int inet_peer_minttl;
extern int inet_peer_maxttl;

void ipfrag_init(void);

void ip_static_sysctl_init(void);

/*
 * Evaluates to @mark when the netns sysctl_fwmark_reflect (read with
 * READ_ONCE) is non-zero, and to 0 otherwise.
 */
#define IP4_REPLY_MARK(net, mark) \
        (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0)

static inline bool ip_is_fragment(const struct iphdr *iph)
{
        return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}

#ifdef CONFIG_INET
#include <net/dst.h>

/* The function in 2.2 was invalid, producing wrong result for
 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
/*
 * Decrement @iph->ttl and patch @iph->check in place rather than
 * recomputing the header checksum.  Returns the decremented TTL.
 */
static inline
int ip_decrease_ttl(struct iphdr *iph)
{
        u32 check = (__force u32)iph->check;
        /* add htons(0x0100) to the stored checksum in a 32-bit temporary */
        check += (__force u32)htons(0x0100);
        /* add 1 more when the sum reached 0xFFFF or above, then truncate to __sum16 */
        iph->check = (__force __sum16)(check + (check>=0xFFFF));
        return --iph->ttl;
}

/*
 * 1 when the route's rt_mtu_locked flag is set or the RTAX_MTU metric of
 * @dst is locked, 0 otherwise.  @dst is treated as embedded in a
 * struct rtable.
 */
static inline int ip_mtu_locked(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *)dst;

        if (rt->rt_mtu_locked)
                return 1;

        return dst_metric_locked(dst, RTAX_MTU) != 0;
}

/*
 * 1 when the socket's pmtudisc mode is IP_PMTUDISC_DO, or when it is
 * IP_PMTUDISC_WANT and the MTU of @dst is not locked; 0 otherwise.
 * pmtudisc is sampled once with READ_ONCE().
 */
static inline
int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
{
        const u8 mode = READ_ONCE(inet_sk(sk)->pmtudisc);

        if (mode == IP_PMTUDISC_DO)
                return 1;
        if (mode != IP_PMTUDISC_WANT)
                return 0;

        return !ip_mtu_locked(dst);
}

static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
        return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE &&
               inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT;
}

/*
 * True when the socket's pmtudisc mode is numerically below
 * IP_PMTUDISC_PROBE.
 *
 * The field is read with READ_ONCE(), matching ip_dont_fragment(), so the
 * lockless read is annotated and cannot be torn or refetched.
 */
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
        return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
}

static inline bool ip_sk_ignore_df(const struct sock *sk)
{
        return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
               inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
}

/*
 * MTU to use for @dst.
 *
 * dst_mtu() is returned when the netns sysctl_ip_fwd_use_pmtu is set, the
 * MTU is locked, or @forwarding is false.  Otherwise the raw RTAX_MTU
 * metric is used, falling back to the device MTU capped at IP_MAX_MTU when
 * the metric is 0, and the lwtunnel headroom is subtracted.
 */
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
                                                    bool forwarding)
{
        struct net *net = dev_net(dst->dev);
        unsigned int mtu;

        if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
            ip_mtu_locked(dst) ||
            !forwarding)
                return dst_mtu(dst);

        /* forwarding path: the route metric takes precedence */
        mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu == 0)
                mtu = min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);

        mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
        return mtu;
}

static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
                                          const struct sk_buff *skb)
{
        unsigned int mtu;

        if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
                bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;

                return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
        }

        mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
        return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
}

struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
                                        int fc_mx_len,
                                        struct netlink_ext_ack *extack);
/*
 * Drop one reference on @fib_metrics and kfree() it when that was the
 * last one.  The shared dst_default_metrics object is never touched.
 */
static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
{
        if (fib_metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&fib_metrics->refcnt))
                kfree(fib_metrics);
}

/*
 * Point @dst at @fib_metrics (initialised read-only).  ipv4 and ipv6 both
 * use refcounted metrics unless they are the default ones: in that case
 * the dst is flagged DST_METRICS_REFCOUNTED and a reference is taken.
 */
static inline
void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
{
        dst_init_metrics(dst, fib_metrics->metrics, true);

        if (fib_metrics == &dst_default_metrics)
                return;

        dst->_metrics |= DST_METRICS_REFCOUNTED;
        refcount_inc(&fib_metrics->refcnt);
}

/*
 * Drop the reference @dst holds on its metrics and kfree() them when it
 * was the last one.  Nothing happens for the shared dst_default_metrics.
 */
static inline
void ip_dst_metrics_put(struct dst_entry *dst)
{
        struct dst_metrics *metrics = (struct dst_metrics *)DST_METRICS_PTR(dst);

        if (metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&metrics->refcnt))
                kfree(metrics);
}

u32 ip_idents_reserve(u32 hash, int segs);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);

static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
                                        struct sock *sk, int segs)
{
        struct iphdr *iph = ip_hdr(skb);

        /* We had many attacks based on IPID, use the private
         * generator as much as we can.
         */
        if (sk && inet_sk(sk)->inet_daddr) {
                iph->id = htons(inet_sk(sk)->inet_id);
                inet_sk(sk)->inet_id += segs;
                return;
        }
        if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
                iph->id = 0;
        } else {
                /* Unfortunately we need the big hammer to get a suitable IPID */
                __ip_select_ident(net, iph, segs);
        }
}

/* Single-segment shorthand for ip_select_ident_segs() (segs == 1). */
static inline void ip_select_ident(struct net *net, struct sk_buff *skb,
                                   struct sock *sk)
{
        const int one_segment = 1;

        ip_select_ident_segs(net, skb, sk, one_segment);
}

static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
{
        return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                                  skb->len, proto, 0);
}

/* Copy the IPv4 source and destination addresses of @iph into @flow with
 * one memcpy (possibly a single 64bit load/store), and tag the flow as
 * carrying IPv4 addresses.
 * Equivalent to :        flow->v4addrs.src = iph->saddr;
 *                        flow->v4addrs.dst = iph->daddr;
 */
static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
                                            const struct iphdr *iph)
{
        /* the single copy is only valid if dst directly follows src */
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.src) +
                     sizeof(flow->addrs.v4addrs.src) !=
                     offsetof(typeof(flow->addrs), v4addrs.dst));

        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs));
}

/*
 * GRO flavour of the pseudo-header checksum: uses the GRO network header
 * and skb_gro_len() instead of ip_hdr() and skb->len.
 */
static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto)
{
        const struct iphdr *hdr = skb_gro_network_header(skb);

        return csum_tcpudp_nofold(hdr->saddr, hdr->daddr,
                                  skb_gro_len(skb), proto, 0);
}

/*
 *        Map a multicast IP onto multicast MAC for type ethernet.
 */

/*
 * Write the 6-byte Ethernet multicast address for group @naddr into @buf:
 * the fixed prefix 01:00:5e followed by the low 23 bits of the address.
 */
static inline void ip_eth_mc_map(__be32 naddr, char *buf)
{
        const __u32 host = ntohl(naddr);

        buf[0] = 0x01;
        buf[1] = 0x00;
        buf[2] = 0x5e;
        buf[3] = (host >> 16) & 0x7F;
        buf[4] = (host >> 8) & 0xFF;
        buf[5] = host & 0xFF;
}

/*
 *        Map a multicast IP onto multicast MAC for type IP-over-InfiniBand.
 *        The scope nibble and the P_Key bytes are taken from the broadcast
 *        address.
 */

/*
 * Build the 20-byte IP-over-InfiniBand multicast hardware address for
 * group @naddr in @buf.  The scope nibble (broadcast[5] & 0xF) and bytes
 * 8-9 are taken from @broadcast; the last four bytes carry the low 28 bits
 * of the group address.
 */
static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const unsigned char scope = broadcast[5] & 0xF;
        const __u32 host = ntohl(naddr);
        int idx;

        buf[0]  = 0;                /* Reserved */
        buf[1]  = 0xff;                /* Multicast QPN */
        buf[2]  = 0xff;
        buf[3]  = 0xff;
        buf[4]  = 0xff;
        buf[5]  = 0x10 | scope;        /* scope from broadcast address */
        buf[6]  = 0x40;                /* IPv4 signature */
        buf[7]  = 0x1b;
        buf[8]  = broadcast[8];                /* P_Key */
        buf[9]  = broadcast[9];

        for (idx = 10; idx <= 15; idx++)
                buf[idx] = 0;

        buf[16] = (host >> 24) & 0x0f;
        buf[17] = (host >> 16) & 0xff;
        buf[18] = (host >> 8) & 0xff;
        buf[19] = host & 0xff;
}

/*
 * Fill the 4-byte @buf with the first four bytes of @broadcast when any of
 * them is non-zero, and with the raw bytes of @naddr otherwise.
 */
static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const void *src = &naddr;

        if (broadcast[0] || broadcast[1] || broadcast[2] || broadcast[3])
                src = broadcast;

        memcpy(buf, src, 4);
}

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/ipv6.h>
#endif

static __inline__ void inet_reset_saddr(struct sock *sk)
{
        inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == PF_INET6) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                memset(&np->saddr, 0, sizeof(np->saddr));
                memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
        }
#endif
}

#endif

/* The hash of an IPv4 address is simply its raw value as an unsigned int. */
static inline unsigned int ipv4_addr_hash(__be32 ip)
{
        unsigned int hash = (__force unsigned int)ip;

        return hash;
}

/*
 * Hash of (@net, @saddr, @port): jhash_1word() of the address seeded with
 * net_hash_mix(@net), XORed with @port.
 */
static inline u32 ipv4_portaddr_hash(const struct net *net,
                                     __be32 saddr,
                                     unsigned int port)
{
        const u32 addr_hash = jhash_1word((__force u32)saddr,
                                          net_hash_mix(net));

        return addr_hash ^ port;
}

bool ip_call_ra_chain(struct sk_buff *skb);

/*
 *        Functions provided by ip_fragment.c
 */

/*
 * Defragmentation user identifiers (the "user" argument of ip_defrag()
 * and ip_check_defrag()).
 *
 * Each CONNTRACK user reserves a range of USHRT_MAX + 1 consecutive values
 * starting at its name; the double-underscore enumerator that follows
 * marks the last value of that range.  Note the end marker of the
 * bridge range is named __IP_DEFRAG_CONNTRACK_BRIDGE_IN (no _END suffix).
 */
enum ip_defrag_users {
        IP_DEFRAG_LOCAL_DELIVER,
        IP_DEFRAG_CALL_RA_CHAIN,
        IP_DEFRAG_CONNTRACK_IN,
        __IP_DEFRAG_CONNTRACK_IN_END        = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
        IP_DEFRAG_CONNTRACK_OUT,
        __IP_DEFRAG_CONNTRACK_OUT_END        = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
        IP_DEFRAG_CONNTRACK_BRIDGE_IN,
        __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
        IP_DEFRAG_VS_IN,
        IP_DEFRAG_VS_OUT,
        IP_DEFRAG_VS_FWD,
        IP_DEFRAG_AF_PACKET,
        IP_DEFRAG_MACVLAN,
};

/* Return true if 'user' lies within the inclusive range
 * ['lower_bond', 'upper_bond'].
 */
static inline bool ip_defrag_user_in_between(u32 user,
                                             enum ip_defrag_users lower_bond,
                                             enum ip_defrag_users upper_bond)
{
        if (user < lower_bond)
                return false;

        return user <= upper_bond;
}

int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
#else
/* Without CONFIG_INET there is nothing to reassemble: hand @skb back. */
static inline struct sk_buff *ip_check_defrag(struct net *net,
                                              struct sk_buff *skb, u32 user)
{
        return skb;
}
#endif

/*
 *        Functions provided by ip_forward.c
 */

int ip_forward(struct sk_buff *skb);

/*
 *        Functions provided by ip_options.c
 */

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt, int is_frag);

int __ip_options_echo(struct net *net, struct ip_options *dopt,
                      struct sk_buff *skb, const struct ip_options *sopt);
/*
 * __ip_options_echo() using the options stored in the control block of
 * @skb itself as the source options.
 */
static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
                                  struct sk_buff *skb)
{
        const struct ip_options *sopt = &IPCB(skb)->opt;

        return __ip_options_echo(net, dopt, skb, sopt);
}

void ip_options_fragment(struct sk_buff *skb);
int __ip_options_compile(struct net *net, struct ip_options *opt,
                         struct sk_buff *skb, __be32 *info);
int ip_options_compile(struct net *net, struct ip_options *opt,
                       struct sk_buff *skb);
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
                   sockptr_t data, int optlen);
void ip_options_undo(struct ip_options *opt);
void ip_forward_options(struct sk_buff *skb);
int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);

/*
 *        Functions provided by ip_sockglue.c
 */

void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
                         struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
                 struct ipcm_cookie *ipc, bool allow_ipv6);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                  unsigned int optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                  int __user *optlen);
int ip_ra_control(struct sock *sk, unsigned char on,
                  void (*destructor)(struct sock *));

int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                   u32 info, u8 *payload);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
                    u32 info);

/*
 * ip_cmsg_recv_offset() for the socket attached to @skb, with tlen and
 * offset both 0.
 */
static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        ip_cmsg_recv_offset(msg, sk, skb, 0, 0);
}

bool icmp_global_allow(void);
extern int sysctl_icmp_msgs_per_sec;
extern int sysctl_icmp_msgs_burst;

#ifdef CONFIG_PROC_FS
int ip_misc_proc_init(void);
#endif

int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
                                struct netlink_ext_ack *extack);

/* An MTU is valid for IPv4 when it is at least IPV4_MIN_MTU (expected case). */
static inline bool inetdev_valid_mtu(unsigned int mtu)
{
        return likely(!(mtu < IPV4_MIN_MTU));
}

void ip_sock_set_freebind(struct sock *sk);
int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);

#endif        /* _IP_H */






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#undef TRACE_SYSTEM
#define TRACE_SYSTEM qdisc

#if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_QDISC_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>

/*
 * qdisc_dequeue - trace event for a dequeue attempt on a qdisc.
 *
 * Records the qdisc and tx queue pointers, the packet count, the skb
 * address, the ifindex of the queue's device (0 when txq->dev is NULL),
 * the qdisc handle and parent, and the tx queue state word.
 */
TRACE_EVENT(qdisc_dequeue,

        TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq,
                 int packets, struct sk_buff *skb),

        TP_ARGS(qdisc, txq, packets, skb),

        TP_STRUCT__entry(
                __field(        struct Qdisc *,                qdisc        )
                __field(const        struct netdev_queue *,        txq        )
                __field(        int,                        packets        )
                __field(        void *,                        skbaddr        )
                __field(        int,                        ifindex        )
                __field(        u32,                        handle        )
                __field(        u32,                        parent        )
                __field(        unsigned long,                txq_state)
        ),

        /*
         * skb == NULL means that nothing was dequeued, so the recorded
         * packet count is forced to 0 even if the caller passed packets == 1.
         */
        TP_fast_assign(
                __entry->qdisc                = qdisc;
                __entry->txq                = txq;
                __entry->packets        = skb ? packets : 0;
                __entry->skbaddr        = skb;
                __entry->ifindex        = txq->dev ? txq->dev->ifindex : 0;
                __entry->handle                = qdisc->handle;
                __entry->parent                = qdisc->parent;
                __entry->txq_state        = txq->state;
        ),

        TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p",
                  __entry->ifindex, __entry->handle, __entry->parent,
                  __entry->txq_state, __entry->packets, __entry->skbaddr )
);

/*
 * qdisc_reset - trace event for a qdisc being reset.
 *
 * Records the device name, the qdisc kind (q->ops->id) and the parent and
 * handle ids, each printed as major:minor via TC_H_MAJ()/TC_H_MIN().
 *
 * qdisc_dev(q) may be NULL, so the device name falls back to "(null)"
 * instead of dereferencing a NULL pointer, exactly as the qdisc_destroy
 * event does.  The same expression must be used in both __string() and
 * __assign_str() so the reserved length matches the copied string.
 */
TRACE_EVENT(qdisc_reset,

        TP_PROTO(struct Qdisc *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __string(        dev,                qdisc_dev(q) ? qdisc_dev(q)->name : "(null)"        )
                __string(        kind,                q->ops->id                )
                __field(        u32,                parent                        )
                __field(        u32,                handle                        )
        ),

        TP_fast_assign(
                __assign_str(dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)");
                __assign_str(kind, q->ops->id);
                __entry->parent = q->parent;
                __entry->handle = q->handle;
        ),

        TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev),
                  __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent),
                  TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle))
);

/*
 * qdisc_destroy - trace event for a qdisc being destroyed.
 *
 * Records the device name ("(null)" when qdisc_dev(q) is NULL), the qdisc
 * kind (q->ops->id) and the parent and handle ids, each printed as
 * major:minor via TC_H_MAJ()/TC_H_MIN().
 */
TRACE_EVENT(qdisc_destroy,

        TP_PROTO(struct Qdisc *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __string(        dev,                qdisc_dev(q) ? qdisc_dev(q)->name : "(null)"        )
                __string(        kind,                q->ops->id                )
                __field(        u32,                parent                        )
                __field(        u32,                handle                        )
        ),

        /* The dev expression here must match the one given to __string(). */
        TP_fast_assign(
                __assign_str(dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)");
                __assign_str(kind, q->ops->id);
                __entry->parent = q->parent;
                __entry->handle = q->handle;
        ),

        TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev),
                  __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent),
                  TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle))
);

/*
 * qdisc_create - trace event for qdisc creation.
 *
 * Unlike the other qdisc events, this one takes the ops table, the device
 * and the parent id as separate arguments rather than a struct Qdisc.
 * Records dev->name, ops->id and the parent id; the parent is printed as
 * major:minor via TC_H_MAJ()/TC_H_MIN().
 */
TRACE_EVENT(qdisc_create,

        TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent),

        TP_ARGS(ops, dev, parent),

        TP_STRUCT__entry(
                __string(        dev,                dev->name        )
                __string(        kind,                ops->id                )
                __field(        u32,                parent                )
        ),

        TP_fast_assign(
                __assign_str(dev, dev->name);
                __assign_str(kind, ops->id);
                __entry->parent = parent;
        ),

        TP_printk("dev=%s kind=%s parent=%x:%x",
                  __get_str(dev), __get_str(kind),
                  TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent))
);

#endif /* _TRACE_QDISC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


















































































































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CLEANCACHE_H
#define _LINUX_CLEANCACHE_H

#include <linux/fs.h>
#include <linux/exportfs.h>
#include <linux/mm.h>

#define CLEANCACHE_NO_POOL                -1
#define CLEANCACHE_NO_BACKEND                -2
#define CLEANCACHE_NO_BACKEND_SHARED        -3

#define CLEANCACHE_KEY_MAX 6

/*
 * cleancache requires every file with a page in cleancache to have a
 * unique key unless/until the file is removed/truncated.  For some
 * filesystems, the inode number is unique, but for "modern" filesystems
 * an exportable filehandle is required (see exportfs.h)
 */
/* Per-file key; all members of the union share the same storage. */
struct cleancache_filekey {
        union {
                ino_t ino;        /* inode number, for filesystems where it is unique */
                __u32 fh[CLEANCACHE_KEY_MAX];        /* exportable filehandle words (see exportfs.h) */
                u32 key[CLEANCACHE_KEY_MAX];        /* the same storage viewed as plain u32 words */
        } u;
};

/*
 * Callback table passed to cleancache_register_ops().
 *
 * NOTE(review): the leading int taken by the page/inode/fs hooks is
 * presumably the pool id (cf. the superblock's cleancache_poolid), and
 * the int returned by the init hooks presumably becomes that id --
 * confirm against the implementation.
 */
struct cleancache_ops {
        int (*init_fs)(size_t);
        int (*init_shared_fs)(uuid_t *uuid, size_t);
        int (*get_page)(int, struct cleancache_filekey,
                        pgoff_t, struct page *);
        void (*put_page)(int, struct cleancache_filekey,
                        pgoff_t, struct page *);
        void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
        void (*invalidate_inode)(int, struct cleancache_filekey);
        void (*invalidate_fs)(int);
};

extern int cleancache_register_ops(const struct cleancache_ops *ops);
extern void __cleancache_init_fs(struct super_block *);
extern void __cleancache_init_shared_fs(struct super_block *);
extern int  __cleancache_get_page(struct page *);
extern void __cleancache_put_page(struct page *);
extern void __cleancache_invalidate_page(struct address_space *, struct page *);
extern void __cleancache_invalidate_inode(struct address_space *);
extern void __cleancache_invalidate_fs(struct super_block *);

#ifdef CONFIG_CLEANCACHE
#define cleancache_enabled (1)
static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
{
        return mapping->host->i_sb->cleancache_poolid >= 0;
}
/* Page flavour of cleancache_fs_enabled_mapping(): checks page->mapping. */
static inline bool cleancache_fs_enabled(struct page *page)
{
        struct address_space *mapping = page->mapping;

        return cleancache_fs_enabled_mapping(mapping);
}
#else
#define cleancache_enabled (0)
#define cleancache_fs_enabled(_page) (0)
#define cleancache_fs_enabled_mapping(_page) (0)
#endif

/*
 * The shim layer provided by these inline functions allows the compiler
 * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
 * is disabled, to a single global variable check if CONFIG_CLEANCACHE
 * is enabled but no cleancache "backend" has dynamically enabled it,
 * and, for the most frequent cleancache ops, to a single global variable
 * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
 * and a cleancache backend has dynamically enabled cleancache, but the
 * filesystem referenced by that cleancache op has not enabled cleancache.
 * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
 * no measurable performance impact.
 */

/* Call __cleancache_init_fs() for @sb unless cleancache_enabled is zero. */
static inline void cleancache_init_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_init_fs(sb);
}

/* Call __cleancache_init_shared_fs() for @sb unless cleancache_enabled is zero. */
static inline void cleancache_init_shared_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_init_shared_fs(sb);
}

/*
 * Try to fetch @page through __cleancache_get_page().
 *
 * Returns -1 without calling the backend when cleancache_enabled is zero
 * or cleancache_fs_enabled(page) is false; otherwise returns whatever
 * __cleancache_get_page() returns.
 */
static inline int cleancache_get_page(struct page *page)
{
        if (!cleancache_enabled || !cleancache_fs_enabled(page))
                return -1;

        return __cleancache_get_page(page);
}

/*
 * Hand @page to __cleancache_put_page(), but only when cleancache_enabled
 * is non-zero and cleancache_fs_enabled(page) is true.
 */
static inline void cleancache_put_page(struct page *page)
{
        if (!cleancache_enabled)
                return;
        if (!cleancache_fs_enabled(page))
                return;

        __cleancache_put_page(page);
}

/*
 * Forward to __cleancache_invalidate_page() when cleancache_enabled is
 * non-zero and cleancache_fs_enabled_mapping(mapping) is true.
 */
static inline void cleancache_invalidate_page(struct address_space *mapping,
                                        struct page *page)
{
        /*
         * The check goes through @mapping, not page->mapping, because
         * page->mapping is sometimes NULL when this is called.
         */
        if (!cleancache_enabled || !cleancache_fs_enabled_mapping(mapping))
                return;

        __cleancache_invalidate_page(mapping, page);
}

/*
 * Forward to __cleancache_invalidate_inode() when cleancache_enabled is
 * non-zero and cleancache_fs_enabled_mapping(mapping) is true.
 */
static inline void cleancache_invalidate_inode(struct address_space *mapping)
{
        if (!cleancache_enabled || !cleancache_fs_enabled_mapping(mapping))
                return;

        __cleancache_invalidate_inode(mapping);
}

/* Call __cleancache_invalidate_fs() for @sb unless cleancache_enabled is zero. */
static inline void cleancache_invalidate_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_invalidate_fs(sb);
}

#endif /* _LINUX_CLEANCACHE_H */




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *        Definitions of structures and functions for quota formats using trie
 */

#ifndef _LINUX_DQBLK_QTREE_H
#define _LINUX_DQBLK_QTREE_H

#include <linux/types.h>

/* Numbers of blocks needed for updates - we count with the smallest
 * possible block size (1024) */
#define QTREE_INIT_ALLOC 4
#define QTREE_INIT_REWRITE 2
#define QTREE_DEL_ALLOC 0
#define QTREE_DEL_REWRITE 6

struct dquot;
struct kqid;

/*
 * Format-specific callbacks for handling a single quota entry; a table of
 * these is referenced from struct qtree_mem_dqinfo via dqi_ops.
 */
struct qtree_fmt_operations {
        void (*mem2disk_dqblk)(void *disk, struct dquot *dquot);        /* Convert the given entry from in-memory format to the on-disk one */
        void (*disk2mem_dqblk)(struct dquot *dquot, void *disk);        /* Convert the given entry from on-disk format to the in-memory one */
        int (*is_id)(void *disk, struct dquot *dquot);        /* Is this on-disk structure for the given dquot's id? */
};

/* In-memory copy of the version-specific information about a quota file */
struct qtree_mem_dqinfo {
        struct super_block *dqi_sb;        /* Superblock the quota is on */
        int dqi_type;                        /* Quota type */
        unsigned int dqi_blocks;        /* Number of blocks in the quota file */
        unsigned int dqi_free_blk;        /* First block in the list of free blocks */
        unsigned int dqi_free_entry;        /* First block with a free entry */
        unsigned int dqi_blocksize_bits;        /* Block size of the quota file */
        unsigned int dqi_entry_size;        /* Size of a quota entry in the quota file */
        unsigned int dqi_usable_bs;        /* Space usable in a block for quota data */
        unsigned int dqi_qtree_depth;        /* Precomputed depth of the quota tree */
        const struct qtree_fmt_operations *dqi_ops; /* Operations for entry manipulation */
};

int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk);
/*
 * Compute the depth of the quota tree for @info.
 *
 * With fanout = dqi_usable_bs / 4, returns the smallest depth >= 1 for
 * which fanout^depth reaches 2^32.
 * NOTE(review): the divisor presumably reflects 4-byte block references
 * and the 2^32 bound a 32-bit id space -- confirm.
 *
 * The loop never terminates if dqi_usable_bs is below 8 (fanout of 0 or
 * 1), so the caller must have validated the block size beforehand.
 */
static inline int qtree_depth(struct qtree_mem_dqinfo *info)
{
        const unsigned int fanout = info->dqi_usable_bs >> 2;
        unsigned long long reachable = fanout;
        int depth = 1;

        while (reachable < (1ULL << 32)) {
                reachable *= fanout;
                depth++;
        }

        return depth;
}
int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid);

#endif /* _LINUX_DQBLK_QTREE_H */












































































































































































































































































































































































































































    1 

































































    1 













    1 







    1 


    1 
    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 




































    1 





























































































    1 
    1 
    1 
    1 


    1 
    1 
































    1 





























    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_policy.c
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *         Kazunori MIYAZAWA @USAGI
 *         YOSHIFUJI Hideaki
 *                 Split up af-specific portion
 *        Derek Atkins <derek@ihtfp.com>                Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <linux/rhashtable.h>
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
#ifdef CONFIG_XFRM_ESPINTCP
#include <net/espintcp.h>
#endif

#include "xfrm_hash.h"

/* Timeout bounds expressed in jiffies (HZ/10 and 60*HZ), plus a length cap.
 * NOTE(review): presumably these govern the per-policy hold queue
 * (polq.hold_queue / polq.hold_timer); their users are not in this part of
 * the file -- confirm.
 */
#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN        100

/* Small bundle pairing a dst entry pointer with a byte of flag bits.
 * NOTE(review): not referenced in this part of the file -- confirm the
 * meaning of @flags against its users.
 */
struct xfrm_flo {
        struct dst_entry *dst_orig;
        u8 flags;
};

/* Prefixes shorter than this are stored in lists, not trees
 * (the comparison is done in xfrm_pol_inexact_addr_use_any_list()).
 */
#define INEXACT_PREFIXLEN_IPV4        16
#define INEXACT_PREFIXLEN_IPV6        48

/* One address/prefix node of the inexact policy search tree described in the
 * diagram that follows this structure.
 */
struct xfrm_pol_inexact_node {
        /* rb-tree linkage */
        struct rb_node node;
        union {
                /* address this node stands for (set by xfrm_pol_inexact_node_init()) */
                xfrm_address_t addr;
                /* NOTE(review): shares storage with addr; presumably only used
                 * once the node is being freed via RCU -- confirm
                 */
                struct rcu_head rcu;
        };
        /* prefix length that goes with addr */
        u8 prefixlen;

        /* nested tree; per the diagram, only used by nodes of the daddr tree */
        struct rb_root root;

        /* the policies matching this node, can be empty list */
        struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with
 * the lowest priority.  If two results have same prio, youngest one wins.
 */

/* Lookup key of an inexact bin in xfrm_policy_inexact_table.
 * xfrm_policy_inexact_alloc_bin() fills it from the policy's family, type
 * and if_id, the direction, and the owning network namespace.
 */
struct xfrm_pol_inexact_key {
        possible_net_t net;
        u32 if_id;
        u16 family;
        u8 dir, type;
};

/* Top-level container of the inexact policy search tree; one bin exists per
 * distinct xfrm_pol_inexact_key (see xfrm_policy_inexact_alloc_bin()).
 */
struct xfrm_pol_inexact_bin {
        /* hash key, copied in when the bin is allocated */
        struct xfrm_pol_inexact_key k;
        /* linkage into xfrm_policy_inexact_table */
        struct rhash_head head;
        /* list containing '*:*' policies */
        struct hlist_head hhead;

        /* sequence counter, initialised against net->xfrm.xfrm_policy_lock */
        seqcount_spinlock_t count;
        /* tree sorted by daddr/prefix */
        struct rb_root root_d;

        /* tree sorted by saddr/prefix */
        struct rb_root root_s;

        /* slow path below */
        /* entry on the per-netns net->xfrm.inexact_bins list */
        struct list_head inexact_bins;
        /* NOTE(review): presumably used to defer freeing the bin -- confirm */
        struct rcu_head rcu;
};

/* Slot indices for xfrm_pol_inexact_candidates.res[].
 * NOTE(review): going by the names, the slots presumably correspond to the
 * saddr:daddr, saddr:any, any:daddr and any:any candidate lists from the
 * search-tree description -- confirm against the lookup code.
 */
enum xfrm_pol_inexact_candidate_type {
        XFRM_POL_CAND_BOTH,
        XFRM_POL_CAND_SADDR,
        XFRM_POL_CAND_DADDR,
        XFRM_POL_CAND_ANY,

        /* number of slots; sizes the res[] array */
        XFRM_POL_CAND_MAX,
};

/* Result set of an inexact lookup: one list head pointer per
 * enum xfrm_pol_inexact_candidate_type slot.
 */
struct xfrm_pol_inexact_candidates {
        struct hlist_head *res[XFRM_POL_CAND_MAX];
};

/* RCU-protected pointer read by xfrm_if_get_cb().
 * NOTE(review): xfrm_if_cb_lock presumably serialises updates -- the writers
 * are not in this part of the file.
 */
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;

/* Per-address-family operations, indexed by family (0 .. AF_INET6) and read
 * under RCU by xfrm_policy_get_afinfo().
 * NOTE(review): xfrm_policy_afinfo_lock presumably serialises updates -- confirm.
 */
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
                                                __read_mostly;

static struct kmem_cache *xfrm_dst_cache __ro_after_init;

/* Hash table of xfrm_pol_inexact_bin objects, keyed by xfrm_pol_inexact_key;
 * the parameter block is only declared here (tentative definition).
 */
static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;

/* Forward declarations of static helpers whose definitions appear elsewhere
 * in this translation unit.
 */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(struct timer_list *t);

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
                           u32 if_id);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net,
                               u8 type, u16 family, u8 dir, u32 if_id);
static struct xfrm_policy *
xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
                        bool excl);
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                                            struct xfrm_policy *policy);

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr);

/* Try to take a reference on @policy.
 * Returns what refcount_inc_not_zero() reports for policy->refcnt, i.e.
 * false when no reference could be taken.
 */
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
        bool held = refcount_inc_not_zero(&policy->refcnt);

        return held;
}

/* Test an IPv4 flow against a selector.
 *
 * The flow matches when both addresses match under the selector's prefix
 * lengths, both ports match under the selector's port masks, and protocol
 * and output interface are equal to the selector's values. A zero
 * sel->proto or sel->ifindex matches anything.
 */
static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi4 *fl4 = &fl->u.ip4;

        if (!addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d))
                return false;
        if (!addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s))
                return false;

        /* any differing bit that survives the mask is a mismatch */
        if ((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask)
                return false;
        if ((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask)
                return false;

        if (sel->proto && fl4->flowi4_proto != sel->proto)
                return false;
        if (sel->ifindex && fl4->flowi4_oif != sel->ifindex)
                return false;

        return true;
}

/* Test an IPv6 flow against a selector.
 *
 * The flow matches when both addresses match under the selector's prefix
 * lengths, both ports match under the selector's port masks, and protocol
 * and output interface are equal to the selector's values. A zero
 * sel->proto or sel->ifindex matches anything.
 */
static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi6 *fl6 = &fl->u.ip6;

        if (!addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d))
                return false;
        if (!addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s))
                return false;

        /* any differing bit that survives the mask is a mismatch */
        if ((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask)
                return false;
        if ((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask)
                return false;

        if (sel->proto && fl6->flowi6_proto != sel->proto)
                return false;
        if (sel->ifindex && fl6->flowi6_oif != sel->ifindex)
                return false;

        return true;
}

/* Match flow @fl against selector @sel using the per-family matcher.
 * Any family other than AF_INET / AF_INET6 never matches.
 */
bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
                         unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_selector_match(sel, fl);
        if (family == AF_INET6)
                return __xfrm6_selector_match(sel, fl);

        return false;
}

/* Look up the per-family operations for @family.
 *
 * On success the entry is returned with rcu_read_lock() still held; the
 * caller has to issue the matching rcu_read_unlock() (as
 * __xfrm_dst_lookup() does). On failure (family out of range or no entry
 * set) NULL is returned and the RCU read lock is not held.
 */
static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        const struct xfrm_policy_afinfo *ops;

        if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return NULL;

        rcu_read_lock();
        ops = rcu_dereference(xfrm_policy_afinfo[family]);
        if (unlikely(!ops)) {
                rcu_read_unlock();
                return NULL;
        }

        return ops;
}

/* Read the RCU-protected xfrm_if_cb pointer.
 * Called with rcu_read_lock().
 */
static const struct xfrm_if_cb *xfrm_if_get_cb(void)
{
        const struct xfrm_if_cb *cb = rcu_dereference(xfrm_if_cb);

        return cb;
}

/* Run the address-family specific dst_lookup() hook for @family on @params.
 *
 * Returns whatever the hook returns, or ERR_PTR(-EAFNOSUPPORT) when no
 * operations are available for @family. The RCU read lock taken by
 * xfrm_policy_get_afinfo() is dropped once the hook has returned.
 */
struct dst_entry *__xfrm_dst_lookup(int family,
                                    const struct xfrm_dst_lookup_params *params)
{
        const struct xfrm_policy_afinfo *ops = xfrm_policy_get_afinfo(family);
        struct dst_entry *route;

        if (unlikely(!ops))
                return ERR_PTR(-EAFNOSUPPORT);

        route = ops->dst_lookup(params);
        rcu_read_unlock();

        return route;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);

/* Route lookup for state @x.
 *
 * The lookup normally uses the state's own addresses (x->props.saddr /
 * x->id.daddr). For types flagged XFRM_TYPE_LOCAL_COADDR or
 * XFRM_TYPE_REMOTE_COADDR, x->coaddr replaces one side and the caller's
 * @prev_saddr / @prev_daddr supplies the other. When the state carries a
 * UDP or TCP encapsulation, the lookup is done for that protocol with the
 * encapsulation ports instead of x->id.proto.
 *
 * On success the addresses actually used are copied back into
 * @prev_saddr / @prev_daddr. Returns the dst entry, or an ERR_PTR() as
 * produced by __xfrm_dst_lookup().
 */
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
                                                int tos, int oif,
                                                xfrm_address_t *prev_saddr,
                                                xfrm_address_t *prev_daddr,
                                                int family, u32 mark)
{
        struct xfrm_dst_lookup_params params;
        struct net *net = xs_net(x);
        xfrm_address_t *saddr = &x->props.saddr;
        xfrm_address_t *daddr = &x->id.daddr;
        struct dst_entry *dst;

        if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
                saddr = x->coaddr;
                daddr = prev_daddr;
        }
        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
                saddr = prev_saddr;
                daddr = x->coaddr;
        }

        /* Start from an all-zero parameter block: anything not assigned
         * below -- notably params.uli when the state has no (or an
         * unrecognised) encapsulation -- must not reach the af-specific
         * lookup as uninitialized stack contents.
         */
        memset(&params, 0, sizeof(params));
        params.net = net;
        params.saddr = saddr;
        params.daddr = daddr;
        params.tos = tos;
        params.oif = oif;
        params.mark = mark;
        params.ipproto = x->id.proto;
        if (x->encap) {
                /* encapsulated traffic is routed as the outer protocol/ports */
                switch (x->encap->encap_type) {
                case UDP_ENCAP_ESPINUDP:
                        params.ipproto = IPPROTO_UDP;
                        params.uli.ports.sport = x->encap->encap_sport;
                        params.uli.ports.dport = x->encap->encap_dport;
                        break;
                case TCP_ENCAP_ESPINTCP:
                        params.ipproto = IPPROTO_TCP;
                        params.uli.ports.sport = x->encap->encap_sport;
                        params.uli.ports.dport = x->encap->encap_dport;
                        break;
                }
        }

        dst = __xfrm_dst_lookup(family, &params);

        if (!IS_ERR(dst)) {
                /* report the addresses used, unless they already alias prev_* */
                if (prev_saddr != saddr)
                        memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
                if (prev_daddr != daddr)
                        memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
        }

        return dst;
}

/* Convert @secs (seconds) into jiffies, capped at MAX_SCHEDULE_TIMEOUT - 1
 * for values whose product with HZ would reach that limit.
 */
static inline unsigned long make_jiffies(long secs)
{
        const long max_secs = (MAX_SCHEDULE_TIMEOUT - 1) / HZ;

        if (secs >= max_secs)
                return MAX_SCHEDULE_TIMEOUT - 1;

        return secs * HZ;
}

/* Lifetime timer callback of a policy (installed by xfrm_policy_alloc()).
 *
 * With xp->lock read-held it walks the four configured lifetimes:
 *  - a hard add/use lifetime that has run out jumps to "expired", where the
 *    policy is deleted and, if xfrm_policy_delete() returned 0,
 *    km_policy_expired() is called with its third argument set to 1;
 *  - a soft add/use lifetime that has run out sets @warn (which leads to a
 *    km_policy_expired() call with third argument 0) and asks for a re-check
 *    XFRM_KM_TIMEOUT seconds from now.
 * The timer is then re-armed for the nearest deadline, if there is one.
 * Every exit path ends with xfrm_pol_put(xp).
 */
static void xfrm_policy_timer(struct timer_list *t)
{
        struct xfrm_policy *xp = from_timer(xp, t, timer);
        time64_t now = ktime_get_real_seconds();
        /* seconds until the nearest deadline; TIME64_MAX means "none" */
        time64_t next = TIME64_MAX;
        int warn = 0;
        int dir;

        read_lock(&xp->lock);

        /* policy already marked dead: nothing to evaluate */
        if (unlikely(xp->walk.dead))
                goto out;

        dir = xfrm_policy_id2dir(xp->index);

        if (xp->lft.hard_add_expires_seconds) {
                time64_t tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.hard_use_expires_seconds) {
                /* measured from use_time, or from add_time while use_time is 0 */
                time64_t tmo = xp->lft.hard_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_add_expires_seconds) {
                time64_t tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_use_expires_seconds) {
                /* measured from use_time, or from add_time while use_time is 0 */
                time64_t tmo = xp->lft.soft_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0, 0);
        /* Re-arm for the nearest deadline. When mod_timer() returns 0 an
         * extra reference is taken -- presumably the one owned by the newly
         * armed timer, balancing the xfrm_pol_put() below (TODO confirm).
         */
        if (next != TIME64_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        read_unlock(&xp->lock);
        xfrm_pol_put(xp);
        return;

expired:
        /* the lock is released before the policy is deleted */
        read_unlock(&xp->lock);
        if (!xfrm_policy_delete(xp, dir))
                km_policy_expired(xp, dir, 1, 0);
        xfrm_pol_put(xp);
}

/* Allocate and initialise an xfrm_policy for namespace @net.
 *
 * Not used in this file; it is meant to be used by pfkeyv2 SPD calls.
 *
 * The object is zero-allocated with @gfp, starts with a reference count of
 * one, and has its list nodes, lock, hold queue and both timers
 * (lifetime timer and hold-queue timer) set up.
 * Returns NULL if the allocation fails.
 */

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
        struct xfrm_policy *policy = kzalloc(sizeof(*policy), gfp);

        if (!policy)
                return NULL;

        write_pnet(&policy->xp_net, net);

        /* list / hash linkage */
        INIT_LIST_HEAD(&policy->walk.all);
        INIT_HLIST_NODE(&policy->bydst_inexact_list);
        INIT_HLIST_NODE(&policy->bydst);
        INIT_HLIST_NODE(&policy->byidx);

        rwlock_init(&policy->lock);
        refcount_set(&policy->refcnt, 1);

        /* hold queue and timers */
        skb_queue_head_init(&policy->polq.hold_queue);
        timer_setup(&policy->timer, xfrm_policy_timer, 0);
        timer_setup(&policy->polq.hold_timer,
                    xfrm_policy_queue_process, 0);

        return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* RCU callback queued by xfrm_policy_destroy(): releases the policy's
 * security context and then frees the policy itself.
 */
static void xfrm_policy_destroy_rcu(struct rcu_head *head)
{
        struct xfrm_policy *pol = container_of(head, struct xfrm_policy, rcu);

        security_xfrm_policy_free(pol->security);
        kfree(pol);
}

/* Destroy xfrm_policy: descendant resources must have been released by now.
 *
 * The policy must already be marked dead and neither of its timers may
 * still be pending; either violation is a BUG. The memory is released
 * from an RCU callback (xfrm_policy_destroy_rcu()).
 */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
        BUG_ON(!policy->walk.dead);

        /* a timer that could still be deleted here was left armed */
        if (del_timer(&policy->timer))
                BUG();
        if (del_timer(&policy->polq.hold_timer))
                BUG();

        call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

/* Rule must be locked. Release descendant resources, announce
 * entry dead. The rule must be unlinked from lists to the moment.
 *
 * NOTE(review): policy->lock itself is taken (write side) inside this
 * function, so "locked" above cannot refer to policy->lock -- confirm which
 * lock the historical wording means.
 *
 * Steps: mark the policy dead, bump its genid, stop the hold-queue timer and
 * purge the hold queue, stop the lifetime timer, then drop one more
 * reference. A reference is dropped for each timer that was still pending.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
        write_lock_bh(&policy->lock);
        policy->walk.dead = 1;
        write_unlock_bh(&policy->lock);

        atomic_inc(&policy->genid);

        /* a pending hold timer is cancelled and a reference dropped for it */
        if (del_timer(&policy->polq.hold_timer))
                xfrm_pol_put(policy);
        skb_queue_purge(&policy->polq.hold_queue);

        /* same for a pending lifetime timer */
        if (del_timer(&policy->timer))
                xfrm_pol_put(policy);

        /* NOTE(review): presumably the reference the caller handed over -- confirm */
        xfrm_pol_put(policy);
}

/* Growth limit for the policy hash tables: the *_should_resize() helpers only
 * ask for a resize while (hmask + 1) is below this value. Also reported as
 * spdhmcnt by xfrm_spd_getinfo().
 */
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

/* Hash policy @index with the current by-index hash mask of @net. */
static inline unsigned int idx_hash(struct net *net, u32 index)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;

        return __idx_hash(index, hmask);
}

/* Fetch the policy hash thresholds for @family in direction @dir.
 *
 * *dbits / *sbits receive the destination / source thresholds stored in
 * net->xfrm.policy_bydst[dir]; both are set to 0 for any family other than
 * AF_INET and AF_INET6.
 */
static void __get_hash_thresh(struct net *net,
                              unsigned short family, int dir,
                              u8 *dbits, u8 *sbits)
{
        if (family == AF_INET) {
                *dbits = net->xfrm.policy_bydst[dir].dbits4;
                *sbits = net->xfrm.policy_bydst[dir].sbits4;
        } else if (family == AF_INET6) {
                *dbits = net->xfrm.policy_bydst[dir].dbits6;
                *sbits = net->xfrm.policy_bydst[dir].sbits6;
        } else {
                *dbits = 0;
                *sbits = 0;
        }
}

static struct hlist_head *policy_hash_bysel(struct net *net,
                                            const struct xfrm_selector *sel,
                                            unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __sel_hash(sel, family, hmask, dbits, sbits);

        if (hash == hmask + 1)
                return NULL;

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

static struct hlist_head *policy_hash_direct(struct net *net,
                                             const xfrm_address_t *daddr,
                                             const xfrm_address_t *saddr,
                                             unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

/* Move every policy on the old bucket @list into the new table @ndsttable,
 * rehashing with @nhashmask.
 *
 * The list is drained in passes. In each pass the first remaining entry is
 * put at the head of its new bucket, and every later entry that hashes to
 * the same new bucket (h == h0) is placed directly behind the entry moved
 * before it, so those entries keep their relative order. Entries hashing
 * elsewhere are skipped and picked up by a later pass ("redo") until @list
 * is empty.
 */
static void xfrm_dst_hash_transfer(struct net *net,
                                   struct hlist_head *list,
                                   struct hlist_head *ndsttable,
                                   unsigned int nhashmask,
                                   int dir)
{
        /* entry0: node most recently moved in the current pass (NULL at pass start) */
        struct hlist_node *tmp, *entry0 = NULL;
        struct xfrm_policy *pol;
        /* h0: new bucket index the current pass is collecting */
        unsigned int h0 = 0;
        u8 dbits;
        u8 sbits;

redo:
        hlist_for_each_entry_safe(pol, tmp, list, bydst) {
                unsigned int h;

                __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
                h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
                                pol->family, nhashmask, dbits, sbits);
                if (!entry0) {
                        /* first entry of this pass defines the target bucket */
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_head_rcu(&pol->bydst, ndsttable + h);
                        h0 = h;
                } else {
                        /* different bucket: leave it for a later pass */
                        if (h != h0)
                                continue;
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_behind_rcu(&pol->bydst, entry0);
                }
                entry0 = &pol->bydst;
        }
        if (!hlist_empty(list)) {
                entry0 = NULL;
                goto redo;
        }
}

/* Re-link every policy on the old by-index bucket @list into the new table
 * @nidxtable, hashing pol->index with @nhashmask.
 *
 * Entries are not unlinked from @list first: hlist_add_head() just rewrites
 * their linkage, and the _safe iterator keeps the successor saved while
 * that happens.
 */
static void xfrm_idx_hash_transfer(struct hlist_head *list,
                                   struct hlist_head *nidxtable,
                                   unsigned int nhashmask)
{
        struct xfrm_policy *pol;
        struct hlist_node *next;

        hlist_for_each_entry_safe(pol, next, list, byidx) {
                struct hlist_head *bucket;

                bucket = nidxtable + __idx_hash(pol->index, nhashmask);
                hlist_add_head(&pol->byidx, bucket);
        }
}

/* Mask for a table with twice as many buckets as one using @old_hmask:
 * (old_hmask + 1) buckets become 2 * (old_hmask + 1), minus one for the mask.
 * The arithmetic is done in unsigned int before widening.
 */
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
        unsigned int new_buckets = (old_hmask + 1) << 1;

        return new_buckets - 1;
}

/* Double the bydst hash table of direction @dir.
 *
 * The new table is allocated before any lock is taken; if that fails the
 * function returns and the old table stays in place. The switch-over runs
 * under xfrm_policy_lock inside a xfrm_policy_hash_generation write
 * section: all old buckets are transferred, then the new table pointer and
 * mask are published. The old table is only freed after synchronize_rcu().
 */
static void xfrm_bydst_resize(struct net *net, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
        struct hlist_head *ndst = xfrm_hash_alloc(nsize);
        struct hlist_head *odst;
        int i;

        if (!ndst)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));

        /* walk the old buckets from the last one down to bucket 0 */
        for (i = hmask; i >= 0; i--)
                xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

        rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
        net->xfrm.policy_bydst[dir].hmask = nhashmask;

        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* wait for an RCU grace period before releasing the old table */
        synchronize_rcu();

        xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

/* Double the by-index hash table of @net.
 *
 * The new table is allocated up front; on allocation failure the function
 * returns and the old table stays in place. Entries are moved and the new
 * table pointer and mask stored under xfrm_policy_lock, after which the old
 * table is freed. @total is not used in the body.
 */
static void xfrm_byidx_resize(struct net *net, int total)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;
        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
        /* old table pointer is sampled before the lock is taken */
        struct hlist_head *oidx = net->xfrm.policy_byidx;
        struct hlist_head *nidx = xfrm_hash_alloc(nsize);
        int i;

        if (!nidx)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        /* walk the old buckets from the last one down to bucket 0 */
        for (i = hmask; i >= 0; i--)
                xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

        net->xfrm.policy_byidx = nidx;
        net->xfrm.policy_idx_hmask = nhashmask;

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

/* Decide whether the bydst table of direction @dir should grow.
 *
 * Returns 1 when the table is still below xfrm_policy_hashmax buckets and
 * holds more policies than its hash mask, 0 otherwise. If @total is
 * non-NULL, the policy count of @dir is added to *total.
 */
static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int cnt = net->xfrm.policy_count[dir];

        if (total)
                *total += cnt;

        return (hmask + 1) < xfrm_policy_hashmax && cnt > hmask;
}

/* Decide whether the by-index table should grow.
 *
 * Returns 1 when the table is still below xfrm_policy_hashmax buckets and
 * @total exceeds its hash mask, 0 otherwise.
 */
static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;

        return (hmask + 1) < xfrm_policy_hashmax && total > hmask;
}

/* Fill @si with the SPD counters of @net.
 *
 * Copies the per-direction policy counts at [dir] and at
 * [dir + XFRM_POLICY_MAX] (the latter presumably per-socket policies --
 * confirm), the by-index hash mask and the hash size limit.
 */
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
        /* hash table figures */
        si->spdhcnt = net->xfrm.policy_idx_hmask;
        si->spdhmcnt = xfrm_policy_hashmax;

        /* inbound */
        si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
        si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];

        /* outbound */
        si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
        si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];

        /* forward */
        si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
        si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

/* Serialises runs of xfrm_hash_resize(). */
static DEFINE_MUTEX(hash_resize_mutex);

/* Work item handler: grow whichever policy hash tables of the owning netns
 * have become too full. Each direction's bydst table is checked (and
 * resized) in turn while the policy counts are summed up; the sum then
 * decides whether the by-index table grows as well.
 */
static void xfrm_hash_resize(struct work_struct *work)
{
        struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
        int total = 0;
        int dir;

        mutex_lock(&hash_resize_mutex);

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                if (!xfrm_bydst_should_resize(net, dir, &total))
                        continue;
                xfrm_bydst_resize(net, dir);
        }

        if (xfrm_byidx_should_resize(net, total))
                xfrm_byidx_resize(net, total);

        mutex_unlock(&hash_resize_mutex);
}

/* Make sure *pol can be inserted into fastbin: look up the inexact bin that
 * matches the policy's (net, family, type, dir, if_id), creating and
 * registering a new one if none exists yet.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 *
 * Must be called with xfrm_policy_lock held. Returns the bin, or NULL if
 * the allocation or the hash table insertion failed.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
        struct net *net = xp_net(pol);
        struct xfrm_pol_inexact_key k = {
                .family = pol->family,
                .type = pol->type,
                .dir = dir,
                .if_id = pol->if_id,
        };
        struct xfrm_pol_inexact_bin *bin, *prev;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        write_pnet(&k.net, net);

        /* fast path: the bin already exists */
        bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
                                     xfrm_pol_inexact_params);
        if (bin)
                return bin;

        /* GFP_ATOMIC: we are called with a spinlock held */
        bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
        if (!bin)
                return NULL;

        bin->k = k;
        INIT_HLIST_HEAD(&bin->hhead);
        bin->root_d = RB_ROOT;
        bin->root_s = RB_ROOT;
        seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

        prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
                                                &bin->k, &bin->head,
                                                xfrm_pol_inexact_params);
        if (prev) {
                /* an entry was already there, or the insert failed with an
                 * ERR_PTR: either way our fresh bin is not in the table
                 */
                kfree(bin);
                return IS_ERR(prev) ? NULL : prev;
        }

        list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);

        return bin;
}

/* Tell whether @addr/@prefixlen is too coarse for a tree node.
 *
 * True for a wildcard address (xfrm_addr_any()) and for prefixes shorter
 * than INEXACT_PREFIXLEN_IPV4 / INEXACT_PREFIXLEN_IPV6 for the respective
 * family; false otherwise, including for any other family.
 */
static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
                                               int family, u8 prefixlen)
{
        if (xfrm_addr_any(addr, family))
                return true;

        switch (family) {
        case AF_INET6:
                return prefixlen < INEXACT_PREFIXLEN_IPV6;
        case AF_INET:
                return prefixlen < INEXACT_PREFIXLEN_IPV4;
        default:
                return false;
        }
}

/* True when both the source and the destination side of @policy's selector
 * are coarse according to xfrm_pol_inexact_addr_use_any_list().
 */
static bool
xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
{
        bool src_coarse, dst_coarse;

        src_coarse = xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
                                                        policy->family,
                                                        policy->selector.prefixlen_s);
        dst_coarse = xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
                                                        policy->family,
                                                        policy->selector.prefixlen_d);

        return src_coarse && dst_coarse;
}

/* Store the address (copied by value) and prefix length in @node.
 * No other member of @node is touched.
 */
static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
                                       const xfrm_address_t *addr, u8 prefixlen)
{
        node->prefixlen = prefixlen;
        node->addr = *addr;
}

/* Allocate a zeroed node with GFP_ATOMIC and initialise it with
 * @addr/@prefixlen.  Returns NULL if the allocation fails.
 */
static struct xfrm_pol_inexact_node *
xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
{
        struct xfrm_pol_inexact_node *n = kzalloc(sizeof(*n), GFP_ATOMIC);

        if (!n)
                return NULL;

        xfrm_pol_inexact_node_init(n, addr, prefixlen);
        return n;
}

/* Three-way comparison of the first @prefixlen bits of @a and @b.
 *
 * Returns 0 when the prefixes are equal, a negative value when @a sorts
 * before @b and a positive value when it sorts after.  Partial words are
 * compared in host byte order and yield -1/1; the whole-word part of an
 * IPv6 prefix is compared with memcmp(), whose non-zero result is
 * returned as is.  A zero @prefixlen and any family other than
 * AF_INET/AF_INET6 compare equal.
 */
static int xfrm_policy_addr_delta(const xfrm_address_t *a,
                                  const xfrm_address_t *b,
                                  u8 prefixlen, u16 family)
{
        unsigned int words, bits;
        u32 va, vb, mask;
        int cmp;

        if (family == AF_INET) {
                if (!prefixlen)
                        return 0;

                mask = ~0U << (32 - prefixlen);
                va = ntohl(a->a4) & mask;
                vb = ntohl(b->a4) & mask;
        } else if (family == AF_INET6) {
                words = prefixlen >> 5;         /* complete 32-bit words */
                bits = prefixlen & 0x1f;        /* bits in the trailing word */

                if (words) {
                        cmp = memcmp(a->a6, b->a6, words << 2);
                        if (cmp)
                                return cmp;
                }

                if (!bits)
                        return 0;

                mask = ~0U << (32 - bits);
                va = ntohl(a->a6[words]) & mask;
                vb = ntohl(b->a6[words]) & mask;
        } else {
                return 0;
        }

        if (va < vb)
                return -1;

        return va > vb ? 1 : 0;
}

/* Re-link policies flagged for reinsertion onto node @n's policy list.
 *
 * Walks net->xfrm.policy_all in reverse and, for every live policy that
 * has bydst_reinsert set, clears the flag and links the policy into
 * n->hhead.  The insertion point keeps the list ordered by priority and,
 * for equal priorities, by ->pos.
 *
 * @family is only used for sanity checks: every reinserted policy is
 * expected to be of that family and to match @n's prefix with its source
 * and/or destination selector address.
 */
static void xfrm_policy_inexact_list_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              u16 family)
{
        unsigned int matched_s, matched_d;
        struct xfrm_policy *policy, *p;

        matched_s = 0;
        matched_d = 0;

        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                struct hlist_node *newpos = NULL;
                bool matches_s, matches_d;

                /* policy_all may also carry dump-walker placeholders.  They
                 * are marked dead and are not embedded in a struct
                 * xfrm_policy, so walk.dead must be tested before any
                 * other policy member is read through such an entry.
                 */
                if (policy->walk.dead || !policy->bydst_reinsert)
                        continue;

                WARN_ON_ONCE(policy->family != family);

                policy->bydst_reinsert = false;

                /* find the last entry that has to stay in front of @policy */
                hlist_for_each_entry(p, &n->hhead, bydst) {
                        if (policy->priority > p->priority)
                                newpos = &p->bydst;
                        else if (policy->priority == p->priority &&
                                 policy->pos > p->pos)
                                newpos = &p->bydst;
                        else
                                break;
                }

                if (newpos)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, &n->hhead);

                /* paranoia checks follow.
                 * Check that the reinserted policy matches at least
                 * saddr or daddr for current node prefix.
                 *
                 * Matching both is fine, matching saddr in one policy
                 * (but not daddr) and then matching only daddr in another
                 * is a bug.
                 */
                matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                if (matches_s && matches_d)
                        continue;

                WARN_ON_ONCE(!matches_s && !matches_d);
                if (matches_s)
                        matched_s++;
                if (matches_d)
                        matched_d++;
                WARN_ON_ONCE(matched_s && matched_d);
        }
}

/* Insert node @n into the rb-tree @new, merging with an overlapping node.
 *
 * Tree nodes are compared with @n over the shorter of the two prefix
 * lengths.  If no node compares equal, @n is linked into the tree.
 * Otherwise @n's policies are moved to the existing node, that node's
 * prefixlen is lowered to the shorter prefix length and @n is released
 * with kfree_rcu().  When the two prefix lengths differed, the existing
 * node is additionally erased from the tree and the insertion restarts
 * with it in place of @n.
 *
 * @n is expected to have no subtree of its own.
 */
static void xfrm_policy_inexact_node_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              struct rb_root *new,
                                              u16 family)
{
        struct xfrm_pol_inexact_node *node;
        struct rb_node **p, *parent;

        /* we should not have another subtree here */
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
        parent = NULL;
        p = &new->rb_node;
        while (*p) {
                u8 prefixlen;
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                /* compare over the prefix length both nodes have in common */
                prefixlen = min(node->prefixlen, n->prefixlen);

                delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
                                               prefixlen, family);
                if (delta < 0) {
                        p = &parent->rb_left;
                } else if (delta > 0) {
                        p = &parent->rb_right;
                } else {
                        bool same_prefixlen = node->prefixlen == n->prefixlen;
                        struct xfrm_policy *tmp;

                        /* unlink @n's policies and flag them so that
                         * xfrm_policy_inexact_list_reinsert() moves them
                         * over to @node
                         */
                        hlist_for_each_entry(tmp, &n->hhead, bydst) {
                                tmp->bydst_reinsert = true;
                                hlist_del_rcu(&tmp->bydst);
                        }

                        node->prefixlen = prefixlen;

                        xfrm_policy_inexact_list_reinsert(net, node, family);

                        if (same_prefixlen) {
                                kfree_rcu(n, rcu);
                                return;
                        }

                        /* the prefix lengths differed: take @node out of the
                         * tree and insert it again starting from the root
                         */
                        rb_erase(*p, new);
                        kfree_rcu(n, rcu);
                        n = node;
                        goto restart;
                }
        }

        rb_link_node_rcu(&n->node, parent, p);
        rb_insert_color(&n->node, new);
}

/* Merge node @v into node @n.
 *
 * Every node of @v's subtree is moved into @n's subtree and every policy
 * on @v's list is re-linked onto @n's list.  @v itself is neither
 * unlinked from its tree nor freed here.
 */
static void xfrm_policy_inexact_node_merge(struct net *net,
                                           struct xfrm_pol_inexact_node *v,
                                           struct xfrm_pol_inexact_node *n,
                                           u16 family)
{
        struct xfrm_policy *pol;
        struct rb_node *rb;

        /* Take @v's subtree apart one node at a time, always starting
         * again from the leftmost node, and insert each node into n->root.
         */
        for (rb = rb_first(&v->root); rb; rb = rb_first(&v->root)) {
                struct xfrm_pol_inexact_node *sub;

                sub = rb_entry(rb, struct xfrm_pol_inexact_node, node);
                rb_erase(&sub->node, &v->root);
                xfrm_policy_inexact_node_reinsert(net, sub, &n->root, family);
        }

        /* unlink @v's own policies and flag them for reinsertion */
        hlist_for_each_entry(pol, &v->hhead, bydst) {
                pol->bydst_reinsert = true;
                hlist_del_rcu(&pol->bydst);
        }

        xfrm_policy_inexact_list_reinsert(net, n, family);
}

/* Find or create the node for @addr/@prefixlen in the rb-tree @root.
 *
 * If the tree holds a node whose prefix contains @addr and whose prefix
 * length does not exceed @prefixlen, that node is returned.  Otherwise
 * every node that is a subnet of the new prefix is erased from the tree:
 * the first such node is re-initialised with @addr/@prefixlen and kept
 * for reinsertion, further ones are merged into it and released with
 * kfree_rcu().  If no node could be reused a new one is allocated.  The
 * resulting node is linked into the tree and returned.
 *
 * Returns NULL only when allocating a new node fails.  @dir is not used
 * by this function.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
                                struct rb_root *root,
                                xfrm_address_t *addr,
                                u16 family, u8 prefixlen, u8 dir)
{
        /* existing node taken out of the tree to be reused for the new prefix */
        struct xfrm_pol_inexact_node *cached = NULL;
        struct rb_node **p, *parent = NULL;
        struct xfrm_pol_inexact_node *node;

        p = &root->rb_node;
        while (*p) {
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                /* compare using the existing node's prefix length */
                delta = xfrm_policy_addr_delta(addr, &node->addr,
                                               node->prefixlen,
                                               family);
                if (delta == 0 && prefixlen >= node->prefixlen) {
                        WARN_ON_ONCE(cached); /* ipsec policies got lost */
                        return node;
                }

                if (delta < 0)
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;

                if (prefixlen < node->prefixlen) {
                        /* compare again, this time using the new, shorter
                         * prefix length
                         */
                        delta = xfrm_policy_addr_delta(addr, &node->addr,
                                                       prefixlen,
                                                       family);
                        if (delta)
                                continue;

                        /* This node is a subnet of the new prefix. It needs
                         * to be removed and re-inserted with the smaller
                         * prefix and all nodes that are now also covered
                         * by the reduced prefixlen.
                         */
                        rb_erase(&node->node, root);

                        if (!cached) {
                                xfrm_pol_inexact_node_init(node, addr,
                                                           prefixlen);
                                cached = node;
                        } else {
                                /* This node also falls within the new
                                 * prefixlen. Merge the to-be-reinserted
                                 * node and this one.
                                 */
                                xfrm_policy_inexact_node_merge(net, node,
                                                               cached, family);
                                kfree_rcu(node, rcu);
                        }

                        /* restart */
                        p = &root->rb_node;
                        parent = NULL;
                }
        }

        /* reuse the cached node if there is one, else allocate a new one */
        node = cached;
        if (!node) {
                node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
                if (!node)
                        return NULL;
        }

        rb_link_node_rcu(&node->node, parent, p);
        rb_insert_color(&node->node, root);

        return node;
}

/* Recursively release unused nodes of the tree @r.
 *
 * Each node's subtree is collected first.  A node that afterwards has
 * neither policies nor a subtree is erased from @r and released with
 * kfree_rcu().  A node that is still in use is kept; if @rm is true
 * this triggers a one-time warning.
 */
static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
{
        struct rb_node *cur, *next;

        for (cur = rb_first(r); cur; cur = next) {
                struct xfrm_pol_inexact_node *entry;

                entry = rb_entry(cur, struct xfrm_pol_inexact_node, node);

                xfrm_policy_inexact_gc_tree(&entry->root, rm);

                /* fetch the successor before @entry may get erased */
                next = rb_next(cur);

                if (hlist_empty(&entry->hhead) && RB_EMPTY_ROOT(&entry->root)) {
                        rb_erase(&entry->node, r);
                        kfree_rcu(entry, rcu);
                } else {
                        WARN_ON_ONCE(rm);
                }
        }
}

/* Garbage-collect the trees of bin @b and release the bin if it is unused.
 *
 * Both trees are collected inside a write section of b->count.  If a
 * tree or the bin's own policy list is still populated afterwards the
 * bin is kept (with a one-time warning when @net_exit is true).
 * Otherwise the bin is removed from xfrm_policy_inexact_table and, if
 * that removal succeeds, unlinked from the inexact_bins list and
 * released with kfree_rcu().
 */
static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
        bool in_use;
        int err;

        write_seqcount_begin(&b->count);
        xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
        xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
        write_seqcount_end(&b->count);

        in_use = !RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
                 !hlist_empty(&b->hhead);
        if (in_use) {
                WARN_ON_ONCE(net_exit);
                return;
        }

        err = rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
                                     xfrm_pol_inexact_params);
        if (err)
                return;

        list_del(&b->inexact_bins);
        kfree_rcu(b, rcu);
}

static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
        struct net *net = read_pnet(&b->k.net);

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        __xfrm_policy_inexact_prune_bin(b, false);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

static void __xfrm_policy_inexact_flush(struct net *net)
{
        struct xfrm_pol_inexact_bin *bin, *t;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(bin, false);
}

/* Pick (creating tree nodes as needed) the list of @bin that @policy
 * belongs on.
 *
 *  - source and destination both qualify for the "any" list: the bin's
 *    own list;
 *  - only the destination qualifies: the list of the source-address
 *    node in bin->root_s;
 *  - only the source qualifies: the list of the destination-address
 *    node in bin->root_d;
 *  - neither qualifies: the list of the source-address node in the
 *    subtree of the destination-address node.
 *
 * Tree updates happen inside write sections of bin->count.  The caller
 * must hold the netns' xfrm_policy_lock.  Returns NULL if a node could
 * not be set up.
 */
static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
                                struct xfrm_policy *policy, u8 dir)
{
        struct xfrm_selector *sel = &policy->selector;
        struct xfrm_pol_inexact_node *dnode, *snode;
        struct net *net = xp_net(policy);
        bool src_any, dst_any;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        src_any = xfrm_pol_inexact_addr_use_any_list(&sel->saddr,
                                                     policy->family,
                                                     sel->prefixlen_s);
        dst_any = xfrm_pol_inexact_addr_use_any_list(&sel->daddr,
                                                     policy->family,
                                                     sel->prefixlen_d);

        if (src_any && dst_any)
                return &bin->hhead;

        if (dst_any) {
                /* only saddr is fixed */
                write_seqcount_begin(&bin->count);
                snode = xfrm_policy_inexact_insert_node(net, &bin->root_s,
                                                        &sel->saddr,
                                                        policy->family,
                                                        sel->prefixlen_s,
                                                        dir);
                write_seqcount_end(&bin->count);

                return snode ? &snode->hhead : NULL;
        }

        /* daddr is fixed */
        write_seqcount_begin(&bin->count);
        dnode = xfrm_policy_inexact_insert_node(net, &bin->root_d,
                                                &sel->daddr,
                                                policy->family,
                                                sel->prefixlen_d, dir);
        write_seqcount_end(&bin->count);
        if (!dnode)
                return NULL;

        /* saddr is wildcard */
        if (src_any)
                return &dnode->hhead;

        /* both are fixed: descend into the daddr node's own subtree */
        write_seqcount_begin(&bin->count);
        snode = xfrm_policy_inexact_insert_node(net, &dnode->root,
                                                &sel->saddr,
                                                policy->family,
                                                sel->prefixlen_s, dir);
        write_seqcount_end(&bin->count);

        return snode ? &snode->hhead : NULL;
}

/* Insert @policy into the inexact data structures for direction @dir.
 *
 * The policy is linked into the list chosen by
 * xfrm_policy_inexact_alloc_chain() and into the per-direction
 * policy_inexact list.  The caller must hold the netns' xfrm_policy_lock.
 *
 * Returns the policy that @policy replaces (NULL if there is none),
 * ERR_PTR(-ENOMEM) if the bin or list could not be set up, or
 * ERR_PTR(-EEXIST) if @excl is set and xfrm_policy_insert_list() reports
 * an existing policy.  The bin is pruned on every path that failed or
 * found a policy to replace.
 */
static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
        struct xfrm_pol_inexact_bin *bin;
        struct xfrm_policy *old, *ret;
        struct hlist_head *head;
        struct net *net;

        bin = xfrm_policy_inexact_alloc_bin(policy, dir);
        if (!bin)
                return ERR_PTR(-ENOMEM);

        net = xp_net(policy);
        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        head = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
        if (!head) {
                ret = ERR_PTR(-ENOMEM);
                goto prune;
        }

        old = xfrm_policy_insert_list(head, policy, excl);
        if (old && excl) {
                ret = ERR_PTR(-EEXIST);
                goto prune;
        }

        xfrm_policy_insert_inexact_list(&net->xfrm.policy_inexact[dir], policy);

        if (!old)
                return NULL;

        ret = old;
prune:
        __xfrm_policy_inexact_prune_bin(bin, false);
        return ret;
}

/* Work handler that rebuilds the policy lookup tables of a netns.
 *
 * @work is the policy_hthresh.work member embedded in struct net.  The
 * handler
 *  1. snapshots the prefix length thresholds from policy_hthresh,
 *  2. sets up inexact bins and lists up front, bailing out before any
 *     destructive step if that fails,
 *  3. unlinks every policy from the per-direction inexact lists and from
 *     the bydst hash tables and stores the new thresholds in the tables,
 *  4. reinserts all policies, oldest first,
 *  5. prunes inexact bins that ended up unused.
 *
 * Steps 2-5 run under net->xfrm.xfrm_policy_lock inside a write section
 * of xfrm_policy_hash_generation; the whole handler is serialised by
 * hash_resize_mutex.
 */
static void xfrm_hash_rebuild(struct work_struct *work)
{
        struct net *net = container_of(work, struct net,
                                       xfrm.policy_hthresh.work);
        unsigned int hmask;
        struct xfrm_policy *pol;
        struct xfrm_policy *policy;
        struct hlist_head *chain;
        struct hlist_head *odst;
        struct hlist_node *newpos;
        int i;
        int dir;
        unsigned seq;
        u8 lbits4, rbits4, lbits6, rbits6;

        mutex_lock(&hash_resize_mutex);

        /* read selector prefixlen thresholds */
        do {
                seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

                lbits4 = net->xfrm.policy_hthresh.lbits4;
                rbits4 = net->xfrm.policy_hthresh.rbits4;
                lbits6 = net->xfrm.policy_hthresh.lbits6;
                rbits6 = net->xfrm.policy_hthresh.rbits6;
        } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        /* make sure that we can insert the indirect policies again before
         * we start with destructive action.
         */
        list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
                struct xfrm_pol_inexact_bin *bin;
                u8 dbits, sbits;

                /* policy_all may also carry dump-walker placeholders.  They
                 * are marked dead and are not embedded in a struct
                 * xfrm_policy, so walk.dead must be tested before any
                 * other policy member (such as ->index) is read.
                 */
                if (policy->walk.dead)
                        continue;

                dir = xfrm_policy_id2dir(policy->index);
                if (dir >= XFRM_POLICY_MAX)
                        continue;

                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        if (policy->family == AF_INET) {
                                dbits = rbits4;
                                sbits = lbits4;
                        } else {
                                dbits = rbits6;
                                sbits = lbits6;
                        }
                } else {
                        if (policy->family == AF_INET) {
                                dbits = lbits4;
                                sbits = rbits4;
                        } else {
                                dbits = lbits6;
                                sbits = rbits6;
                        }
                }

                /* policies with a prefix length below its threshold are
                 * skipped by this pre-allocation pass
                 */
                if (policy->selector.prefixlen_d < dbits ||
                    policy->selector.prefixlen_s < sbits)
                        continue;

                bin = xfrm_policy_inexact_alloc_bin(policy, dir);
                if (!bin)
                        goto out_unlock;

                if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
                        goto out_unlock;
        }

        /* reset the bydst and inexact table in all directions */
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct hlist_node *n;

                hlist_for_each_entry_safe(policy, n,
                                          &net->xfrm.policy_inexact[dir],
                                          bydst_inexact_list) {
                        hlist_del_rcu(&policy->bydst);
                        hlist_del_init(&policy->bydst_inexact_list);
                }

                hmask = net->xfrm.policy_bydst[dir].hmask;
                odst = net->xfrm.policy_bydst[dir].table;
                for (i = hmask; i >= 0; i--) {
                        hlist_for_each_entry_safe(policy, n, odst + i, bydst)
                                hlist_del_rcu(&policy->bydst);
                }
                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        /* dir out => dst = remote, src = local */
                        net->xfrm.policy_bydst[dir].dbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = rbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = lbits6;
                } else {
                        /* dir in/fwd => dst = local, src = remote */
                        net->xfrm.policy_bydst[dir].dbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = lbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = rbits6;
                }
        }

        /* re-insert all policies by order of creation */
        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                if (policy->walk.dead)
                        continue;
                dir = xfrm_policy_id2dir(policy->index);
                if (dir >= XFRM_POLICY_MAX) {
                        /* skip socket policies */
                        continue;
                }
                newpos = NULL;
                chain = policy_hash_bysel(net, &policy->selector,
                                          policy->family, dir);

                /* no hash chain for this selector: use the inexact path */
                if (!chain) {
                        void *p = xfrm_policy_inexact_insert(policy, dir, 0);

                        WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
                        continue;
                }

                /* insert behind the last entry whose priority value does
                 * not exceed that of @policy
                 */
                hlist_for_each_entry(pol, chain, bydst) {
                        if (policy->priority >= pol->priority)
                                newpos = &pol->bydst;
                        else
                                break;
                }
                if (newpos)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, chain);
        }

out_unlock:
        __xfrm_policy_inexact_flush(net);
        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        mutex_unlock(&hash_resize_mutex);
}

/* Schedule the policy_hthresh work item of @net. */
void xfrm_policy_hash_rebuild(struct net *net)
{
        struct work_struct *rebuild = &net->xfrm.policy_hthresh.work;

        schedule_work(rebuild);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate a new policy index.
 *
 * If @index is non-zero it is tried first.  After that candidates are
 * built from the netns' idx_generator OR-ed with @dir, with the
 * generator advancing by 8 per candidate.  A candidate of 0 is replaced
 * by 8.  The first candidate not carried by any policy in the matching
 * policy_byidx bucket is returned.
 *
 * KAME seems to generate them ordered by cost of an absolute
 * unpredictability of ordering of rules. This will not pass.
 */
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
        for (;;) {
                struct hlist_head *bucket;
                struct xfrm_policy *pol;
                bool taken = false;
                u32 idx;

                if (index) {
                        /* the caller's preferred value is tried only once */
                        idx = index;
                        index = 0;
                } else {
                        idx = (net->xfrm.idx_generator | dir);
                        net->xfrm.idx_generator += 8;
                }

                if (!idx)
                        idx = 8;

                bucket = net->xfrm.policy_byidx + idx_hash(net, idx);
                hlist_for_each_entry(pol, bucket, byidx) {
                        if (pol->index == idx) {
                                taken = true;
                                break;
                        }
                }

                if (!taken)
                        return idx;
        }
}

/* Compare two selectors 32 bits at a time over
 * sizeof(struct xfrm_selector) / sizeof(u32) words.
 * Returns 0 if all words are equal and 1 otherwise.
 */
static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
        const u32 *w1 = (const u32 *)s1;
        const u32 *w2 = (const u32 *)s2;
        int remaining = sizeof(struct xfrm_selector) / sizeof(u32);

        while (remaining-- > 0) {
                if (*w1++ != *w2++)
                        return 1;
        }

        return 0;
}

/* Move the packets held on @old's hold queue over to @new.
 *
 * Nothing is done when @old's hold queue is empty.  Otherwise the queued
 * skbs are spliced onto a local list under @old's queue lock, then
 * appended to @new's hold queue under @new's queue lock.  @new's queue
 * timeout is reset to XFRM_QUEUE_TMO_MIN and its hold timer is set to
 * expire at the current jiffies value.
 */
static void xfrm_policy_requeue(struct xfrm_policy *old,
                                struct xfrm_policy *new)
{
        struct xfrm_policy_queue *pq = &old->polq;
        struct sk_buff_head list;

        if (skb_queue_empty(&pq->hold_queue))
                return;

        __skb_queue_head_init(&list);

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice_init(&pq->hold_queue, &list);
        /* a non-zero return from del_timer() is paired with dropping a
         * reference on @old
         */
        if (del_timer(&pq->hold_timer))
                xfrm_pol_put(old);
        spin_unlock_bh(&pq->hold_queue.lock);

        /* from here on pq refers to the queue of the new policy */
        pq = &new->polq;

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice(&list, &pq->hold_queue);
        pq->timeout = XFRM_QUEUE_TMO_MIN;
        /* a zero return from mod_timer() is paired with taking a
         * reference on @new
         */
        if (!mod_timer(&pq->hold_timer, jiffies))
                xfrm_pol_hold(new);
        spin_unlock_bh(&pq->hold_queue.lock);
}

static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
                                          struct xfrm_policy *pol)
{
        return mark->v == pol->mark.v && mark->m == pol->mark.m;
}

/* rhashtable hash function for a struct xfrm_pol_inexact_key.
 *
 * type, dir and family are packed into one word which is hashed together
 * with if_id and the netns hash mix, using @seed as the initial value.
 * @len is not used.
 */
static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_key *key = data;
        u32 packed = (key->type << 24) | (key->dir << 16) | key->family;
        u32 mix = net_hash_mix(read_pnet(&key->net));

        return jhash_3words(packed, key->if_id, mix, seed);
}

/* rhashtable object hash function: hashes the key embedded in a
 * struct xfrm_pol_inexact_bin with xfrm_pol_bin_key().  @len is not used.
 */
static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_bin *bin = data;
        const struct xfrm_pol_inexact_key *key = &bin->k;

        return xfrm_pol_bin_key(key, 0, seed);
}

/* rhashtable compare function for inexact bins.
 *
 * Returns 0 when the bin at @ptr matches the key in @arg, i.e. netns,
 * dir, type, family and if_id are all equal.  A different netns yields
 * -1; otherwise the XOR of the first differing member pair is returned.
 */
static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
                            const void *ptr)
{
        const struct xfrm_pol_inexact_key *want = arg->key;
        const struct xfrm_pol_inexact_bin *bin = ptr;
        int diff;

        if (!net_eq(read_pnet(&bin->k.net), read_pnet(&want->net)))
                return -1;

        diff = bin->k.dir ^ want->dir;
        if (!diff)
                diff = bin->k.type ^ want->type;
        if (!diff)
                diff = bin->k.family ^ want->family;
        if (!diff)
                diff = bin->k.if_id ^ want->if_id;

        return diff;
}

/* rhashtable parameters for the table of inexact policy bins: bins are
 * linked through their ->head member and hashed/compared by the
 * xfrm_pol_bin_*() helpers; automatic shrinking is enabled.
 */
static const struct rhashtable_params xfrm_pol_inexact_params = {
        .automatic_shrinking = true,
        .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
        .obj_cmpfn = xfrm_pol_bin_cmp,
        .obj_hashfn = xfrm_pol_bin_obj,
        .hashfn = xfrm_pol_bin_key,
};

/* Link @policy into the inexact list @chain (through bydst_inexact_list).
 *
 * The insertion point is behind the last visited entry whose priority
 * value does not exceed that of @policy; an entry equivalent to @policy
 * (same type, if_id, selector, mark and security context) is never used
 * as insertion point.  Afterwards ->pos of every entry on @chain is
 * rewritten with the entry's position in the list, starting at 0.
 */
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                                            struct xfrm_policy *policy)
{
        struct xfrm_policy *pol, *delpol = NULL;
        struct hlist_node *newpos = NULL;
        int i = 0;

        hlist_for_each_entry(pol, chain, bydst_inexact_list) {
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
                    xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        /* equivalent entry; only remembered to end the
                         * search early, it is not unlinked here
                         */
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority) {
                        newpos = &pol->bydst_inexact_list;
                        continue;
                }
                /* stop once the equivalent entry has been seen */
                if (delpol)
                        break;
        }

        if (newpos)
                hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
        else
                hlist_add_head_rcu(&policy->bydst_inexact_list, chain);

        /* renumber the positions of all entries on the chain */
        hlist_for_each_entry(pol, chain, bydst_inexact_list) {
                pol->pos = i;
                i++;
        }
}

/* Link @policy into the bydst list @chain.
 *
 * The insertion point is behind the last visited entry whose priority
 * value does not exceed that of @policy; an entry equivalent to @policy
 * (same type, if_id, selector, mark and security context) is never used
 * as insertion point.
 *
 * Returns the equivalent entry found on @chain, or NULL if there is
 * none; the entry is not unlinked here.  If @excl is true and an
 * equivalent entry exists, ERR_PTR(-EEXIST) is returned and @policy is
 * not linked.
 */
static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
                                                   struct xfrm_policy *policy,
                                                   bool excl)
{
        struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

        hlist_for_each_entry(pol, chain, bydst) {
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
                    xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        if (excl)
                                return ERR_PTR(-EEXIST);
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority) {
                        newpos = pol;
                        continue;
                }
                /* stop once the equivalent entry has been seen */
                if (delpol)
                        break;
        }

        if (newpos)
                hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
        else
                hlist_add_head_rcu(&policy->bydst, chain);

        return delpol;
}

/* Insert @policy for direction @dir, replacing an equivalent policy.
 *
 * The policy is placed on the hash chain selected by policy_hash_bysel()
 * or, if there is none, in the inexact data structures.  An equivalent
 * policy that is already present is unlinked, hands its index and its
 * held packets over to @policy and is killed once the lock has been
 * dropped.  With @excl set, the insert helpers report an existing
 * equivalent policy as an error instead.
 *
 * Returns 0 on success or the negative error code reported by the
 * insert helper.
 */
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct net *net = xp_net(policy);
        struct xfrm_policy *delpol;
        struct hlist_head *chain;

        /* Sanitize mark before store */
        policy->mark.v &= policy->mark.m;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
        if (chain)
                delpol = xfrm_policy_insert_list(chain, policy, excl);
        else
                delpol = xfrm_policy_inexact_insert(policy, dir, excl);

        if (IS_ERR(delpol)) {
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                return PTR_ERR(delpol);
        }

        __xfrm_policy_link(policy, dir);

        /* After previous checking, family can either be AF_INET or AF_INET6 */
        if (policy->family == AF_INET)
                rt_genid_bump_ipv4(net);
        else
                rt_genid_bump_ipv6(net);

        if (delpol) {
                xfrm_policy_requeue(delpol, policy);
                __xfrm_policy_unlink(delpol, dir);
        }
        /* a replaced policy passes its index on, otherwise generate one */
        policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
        hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
        policy->curlft.add_time = ktime_get_real_seconds();
        policy->curlft.use_time = 0;
        /* a zero return from mod_timer() is paired with taking a
         * reference on @policy
         */
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* the replaced policy is killed outside the lock; the resize check
         * is only made when nothing was replaced
         */
        if (delpol)
                xfrm_policy_kill(delpol);
        else if (xfrm_bydst_should_resize(net, dir, NULL))
                schedule_work(&net->xfrm.policy_hash_work);

        return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

/* Return the first policy on @chain (linked through bydst) whose type,
 * if_id, mark, selector and security context match the arguments, or
 * NULL if @chain is NULL or holds no such policy.  @dir is not used.
 */
static struct xfrm_policy *
__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
                        u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
                        struct xfrm_sec_ctx *ctx)
{
        struct xfrm_policy *cand;

        if (chain) {
                hlist_for_each_entry(cand, chain, bydst) {
                        if (cand->type != type)
                                continue;
                        if (cand->if_id != if_id)
                                continue;
                        if (!xfrm_policy_mark_match(mark, cand))
                                continue;
                        if (selector_cmp(sel, &cand->selector))
                                continue;
                        if (!xfrm_sec_ctx_match(ctx, cand->security))
                                continue;

                        return cand;
                }
        }

        return NULL;
}

/*
 * Look up the policy matching selector @sel (together with @mark, @if_id,
 * @type and security context @ctx) for direction @dir, optionally deleting
 * it.
 *
 * The search runs under net->xfrm.xfrm_policy_lock.  If policy_hash_bysel()
 * yields a chain, that chain is searched; otherwise the inexact bin for
 * (@type, sel->family, @dir, @if_id) is consulted and, among its candidate
 * lists, the match with the smallest ->pos is chosen.
 *
 * xfrm_pol_hold() is called on the policy that is found.  When @delete is
 * set, security_xfrm_policy_delete() is asked first: on failure its error is
 * stored in *@err and the still-linked policy is returned; on success the
 * policy is unlinked and, once the lock is dropped, handed to
 * xfrm_policy_kill().
 *
 * Returns the policy, or NULL when nothing matched.  *@err is 0 unless the
 * security hook refused the delete.
 */
struct xfrm_policy *
xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                      u8 type, int dir, struct xfrm_selector *sel,
                      struct xfrm_sec_ctx *ctx, int delete, int *err)
{
        struct xfrm_pol_inexact_bin *bin = NULL;
        struct xfrm_policy *pol, *ret = NULL;
        struct hlist_head *chain;

        *err = 0;
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, sel, sel->family, dir);
        if (!chain) {
                /* No hash chain for this selector: fall back to the inexact bin. */
                struct xfrm_pol_inexact_candidates cand;
                int i;

                bin = xfrm_policy_inexact_lookup(net, type,
                                                 sel->family, dir, if_id);
                if (!bin) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                if (!xfrm_policy_find_inexact_candidates(&cand, bin,
                                                         &sel->saddr,
                                                         &sel->daddr)) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                /* Keep the match with the lowest ->pos across all candidate lists. */
                pol = NULL;
                for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
                        struct xfrm_policy *tmp;

                        tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
                                                      if_id, type, dir,
                                                      sel, ctx);
                        if (!tmp)
                                continue;

                        if (!pol || tmp->pos < pol->pos)
                                pol = tmp;
                }
        } else {
                pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
                                              sel, ctx);
        }

        if (pol) {
                xfrm_pol_hold(pol);
                if (delete) {
                        *err = security_xfrm_policy_delete(pol->security);
                        if (*err) {
                                /* Delete refused: return the policy still linked. */
                                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                return pol;
                        }
                        __xfrm_policy_unlink(pol, dir);
                }
                ret = pol;
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* The kill and the bin pruning both happen outside the policy lock. */
        if (ret && delete)
                xfrm_policy_kill(ret);
        /* Reached whenever a bin was looked up, even if no policy matched. */
        if (bin && delete)
                xfrm_policy_inexact_prune_bin(bin);
        return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

/*
 * Find the policy with index @id (and matching @type, @if_id and @mark) in
 * the by-index hash, optionally deleting it.
 *
 * If the direction encoded in @id differs from @dir, *@err is set to -ENOENT
 * and NULL is returned without taking the lock.  Otherwise *@err is 0 unless
 * @delete is set and security_xfrm_policy_delete() fails; in that case the
 * error is stored in *@err and the policy is returned still linked.
 *
 * xfrm_pol_hold() is called on the policy that is found.  On a successful
 * delete the policy is unlinked under the lock and passed to
 * xfrm_policy_kill() after the lock is released.
 */
struct xfrm_policy *
xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                 u8 type, int dir, u32 id, int delete, int *err)
{
        struct xfrm_policy *found = NULL;
        struct xfrm_policy *cur;
        struct hlist_head *bucket;

        if (xfrm_policy_id2dir(id) != dir) {
                *err = -ENOENT;
                return NULL;
        }

        *err = 0;
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        bucket = net->xfrm.policy_byidx + idx_hash(net, id);
        hlist_for_each_entry(cur, bucket, byidx) {
                if (cur->type != type || cur->index != id)
                        continue;
                if (cur->if_id != if_id || !xfrm_policy_mark_match(mark, cur))
                        continue;

                xfrm_pol_hold(cur);
                if (delete) {
                        *err = security_xfrm_policy_delete(cur->security);
                        if (*err) {
                                /* Delete refused: return it still linked. */
                                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                return cur;
                        }
                        __xfrm_policy_unlink(cur, dir);
                }
                found = cur;
                break;
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        if (found && delete)
                xfrm_policy_kill(found);
        return found;
}
EXPORT_SYMBOL(xfrm_policy_byid);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/*
 * Ask security_xfrm_policy_delete() about every live policy of @type on
 * net->xfrm.policy_all whose direction is below XFRM_POLICY_MAX.
 *
 * Returns 0 when every check passes.  On the first failure the policy is
 * reported through xfrm_audit_policy_delete() with a result of 0 and the
 * hook's error is returned.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        struct xfrm_policy *pol;

        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                int rc;

                if (pol->walk.dead)
                        continue;
                if (xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX)
                        continue;
                if (pol->type != type)
                        continue;

                rc = security_xfrm_policy_delete(pol->security);
                if (!rc)
                        continue;

                xfrm_audit_policy_delete(pol, 0, task_valid);
                return rc;
        }

        return 0;
}
#else
/* Without CONFIG_SECURITY_NETWORK_XFRM there is nothing to check. */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        return 0;
}
#endif

/*
 * Remove every live policy of @type from @net.
 *
 * xfrm_policy_flush_secctx_check() is consulted first; if it returns an
 * error, nothing is removed and that error is returned.  Otherwise each
 * matching entry on net->xfrm.policy_all is unlinked, audited and killed.
 *
 * Returns 0 when at least one policy was removed, -ESRCH when none was, or
 * the error from the security check.
 */
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
        int dir, err = 0, cnt = 0;
        struct xfrm_policy *pol;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        err = xfrm_policy_flush_secctx_check(net, type, task_valid);
        if (err)
                goto out;

again:
        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                dir = xfrm_policy_id2dir(pol->index);
                /* Skip dead entries, out-of-range directions and other types. */
                if (pol->walk.dead ||
                    dir >= XFRM_POLICY_MAX ||
                    pol->type != type)
                        continue;

                __xfrm_policy_unlink(pol, dir);
                /* Audit and kill run with the policy lock released. */
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                cnt++;
                xfrm_audit_policy_delete(pol, 1, task_valid);
                xfrm_policy_kill(pol);
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                /* The lock was dropped, so restart the walk from the list head. */
                goto again;
        }
        if (cnt)
                __xfrm_policy_inexact_flush(net);
        else
                err = -ESRCH;
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);

/*
 * Iterate over net->xfrm.policy_all, calling @func for each live entry whose
 * type equals walk->type (or for every live entry when walk->type is
 * XFRM_POLICY_TYPE_ANY).
 *
 * @func receives the policy, the direction derived from its index, the
 * current walk->seq and @data.  A non-zero return from @func stops the walk:
 * the walker's own list entry is re-parked with list_move_tail() relative to
 * the current entry, so that a later call starts from the walker's position,
 * and that value is returned.
 *
 * Returns -EINVAL for an invalid walk->type, -ENOENT when walk->seq is still
 * 0 after the loop, 0 when the walk is complete (or the walker is unlinked
 * with a non-zero walk->seq on entry), or the non-zero value from @func.
 */
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *pol;
        struct xfrm_policy_walk_entry *x;
        int error = 0;

        if (walk->type >= XFRM_POLICY_TYPE_MAX &&
            walk->type != XFRM_POLICY_TYPE_ANY)
                return -EINVAL;

        /* Walker is not queued but has already reported entries: nothing to do. */
        if (list_empty(&walk->walk.all) && walk->seq != 0)
                return 0;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        /* Start at the list head for a fresh walk, else just after the walker. */
        if (list_empty(&walk->walk.all))
                x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
        else
                x = list_first_entry(&walk->walk.all,
                                     struct xfrm_policy_walk_entry, all);

        list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
                /* Entries flagged dead are skipped before container_of() is applied. */
                if (x->dead)
                        continue;
                pol = container_of(x, struct xfrm_policy, walk);
                if (walk->type != XFRM_POLICY_TYPE_ANY &&
                    walk->type != pol->type)
                        continue;
                error = func(pol, xfrm_policy_id2dir(pol->index),
                             walk->seq, data);
                if (error) {
                        /* Remember the position: park the walker next to x. */
                        list_move_tail(&walk->walk.all, &x->all);
                        goto out;
                }
                walk->seq++;
        }
        if (walk->seq == 0) {
                error = -ENOENT;
                goto out;
        }
        /* Walk finished: take the walker off the list. */
        list_del_init(&walk->walk.all);
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/*
 * Prepare @walk for a fresh iteration over policies of @type: sequence
 * number zero, an empty list entry, and the entry's dead flag set.
 */
void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
{
        walk->seq = 0;
        walk->type = type;
        walk->walk.dead = 1;
        INIT_LIST_HEAD(&walk->walk.all);
}
EXPORT_SYMBOL(xfrm_policy_walk_init);

/*
 * Finish a walk: if @walk is still queued on a list, remove it while holding
 * net->xfrm.xfrm_policy_lock.  Nothing is done for an entry that is not
 * queued.
 */
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
        if (!list_empty(&walk->walk.all)) {
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                list_del(&walk->walk.all);
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        }
}
EXPORT_SYMBOL(xfrm_policy_walk_done);

/*
 * Decide whether policy @pol applies to flow @fl.
 *
 * Returns -ESRCH if the policy's family, if_id, mark or type differ from the
 * request, or if its selector does not match the flow.  Otherwise returns
 * the result of security_xfrm_policy_lookup() (0 meaning the policy applies).
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
                             const struct flowi *fl,
                             u8 type, u16 family, int dir, u32 if_id)
{
        if (pol->family != family)
                return -ESRCH;
        if (pol->if_id != if_id)
                return -ESRCH;
        if ((fl->flowi_mark & pol->mark.m) != pol->mark.v)
                return -ESRCH;
        if (pol->type != type)
                return -ESRCH;

        if (!xfrm_selector_match(&pol->selector, fl, family))
                return -ESRCH;

        return security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
                                           dir);
}

/*
 * Search rb-tree @r for the node that xfrm_policy_addr_delta() reports as
 * equal to @addr (compared against the node's address and prefix length).
 *
 * This function takes no lock itself.  @count is sampled before the descent
 * and, if nothing was found and @count changed in the meantime, the search
 * is restarted.  A node that is found is returned right away without
 * re-checking @count.
 *
 * Returns the matching node, or NULL if none was found.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
                                seqcount_spinlock_t *count,
                                const xfrm_address_t *addr, u16 family)
{
        const struct rb_node *parent;
        int seq;

again:
        seq = read_seqcount_begin(count);

        parent = rcu_dereference_raw(r->rb_node);
        while (parent) {
                struct xfrm_pol_inexact_node *node;
                int delta;

                node = rb_entry(parent, struct xfrm_pol_inexact_node, node);

                delta = xfrm_policy_addr_delta(addr, &node->addr,
                                               node->prefixlen, family);
                /* Negative delta descends left, positive descends right. */
                if (delta < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        continue;
                } else if (delta > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                        continue;
                }

                return node;
        }

        /* Miss: only trust it if the sequence count did not move. */
        if (read_seqcount_retry(count, seq))
                goto again;

        return NULL;
}

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr)
{
        struct xfrm_pol_inexact_node *n;
        u16 family;

        if (!b)
                return false;

        family = b->k.family;
        memset(cand, 0, sizeof(*cand));
        cand->res[XFRM_POL_CAND_ANY] = &b->hhead;

        n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
                                            family);
        if (n) {
                cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
                n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
                                                    family);
                if (n)
                        cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
        }

        n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
                                            family);
        if (n)
                cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;

        return true;
}

/*
 * Look up the inexact bin keyed by (@net, @type, @family, @dir, @if_id) in
 * xfrm_policy_inexact_table.  Returns whatever rhashtable_lookup() yields
 * for that key.
 *
 * NOTE(review): the _rcu suffix suggests callers must be inside an RCU
 * read-side section; no locking is done here.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
                               u8 dir, u32 if_id)
{
        struct xfrm_pol_inexact_key key = {
                .if_id = if_id,
                .dir = dir,
                .type = type,
                .family = family,
        };

        write_pnet(&key.net, net);

        return rhashtable_lookup(&xfrm_policy_inexact_table, &key,
                                 xfrm_pol_inexact_params);
}

/*
 * Variant of xfrm_policy_inexact_lookup_rcu() for callers that hold
 * net->xfrm.xfrm_policy_lock (asserted via lockdep).  The lookup itself is
 * wrapped in an RCU read-side section that ends before the result is
 * returned.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
                           u8 dir, u32 if_id)
{
        struct xfrm_pol_inexact_bin *found;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        rcu_read_lock();
        found = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        rcu_read_unlock();

        return found;
}

/*
 * Evaluate one candidate list against flow @fl.
 *
 * The walk stops at the first entry whose priority value exceeds that of
 * @prefer (or ~0u when @prefer is NULL).  Entries for which
 * xfrm_policy_match() returns -ESRCH are skipped; any other match error is
 * returned as an ERR_PTR().  For the first matching entry, @prefer is
 * returned instead if it has the same priority and a smaller ->pos;
 * otherwise the entry itself is returned.
 *
 * Returns NULL when @chain is NULL or nothing on it qualifies.
 */
static struct xfrm_policy *
__xfrm_policy_eval_candidates(struct hlist_head *chain,
                              struct xfrm_policy *prefer,
                              const struct flowi *fl,
                              u8 type, u16 family, int dir, u32 if_id)
{
        u32 limit = prefer ? prefer->priority : ~0u;
        struct xfrm_policy *cur;

        if (chain == NULL)
                return NULL;

        hlist_for_each_entry_rcu(cur, chain, bydst) {
                int rc;

                if (cur->priority > limit)
                        break;

                rc = xfrm_policy_match(cur, fl, type, family, dir, if_id);
                if (rc == -ESRCH)
                        continue;
                if (rc)
                        return ERR_PTR(rc);

                /* Match.  On a priority tie the lower ->pos wins. */
                if (prefer && cur->priority == limit &&
                    prefer->pos < cur->pos)
                        return prefer;

                return cur;
        }

        return NULL;
}

/*
 * Run __xfrm_policy_eval_candidates() over every list in @cand, feeding the
 * best policy found so far back in as @prefer for the next list.
 *
 * Returns the final preferred policy (possibly the @prefer passed in, or
 * NULL), or the first ERR_PTR() produced by a list.
 */
static struct xfrm_policy *
xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
                            struct xfrm_policy *prefer,
                            const struct flowi *fl,
                            u8 type, u16 family, int dir, u32 if_id)
{
        int slot;

        for (slot = 0; slot < ARRAY_SIZE(cand->res); slot++) {
                struct xfrm_policy *best;

                best = __xfrm_policy_eval_candidates(cand->res[slot], prefer,
                                                     fl, type, family, dir,
                                                     if_id);
                if (IS_ERR(best))
                        return best;
                if (best)
                        prefer = best;
        }

        return prefer;
}

/*
 * Find the policy of @type that applies to flow @fl for @family, @dir and
 * @if_id.
 *
 * Runs under rcu_read_lock().  The chain from policy_hash_direct() is
 * scanned first; the first policy accepted by xfrm_policy_match() becomes
 * the provisional result.  The inexact bin, if one exists, is then evaluated
 * with that result as the preferred candidate and may replace it.  The whole
 * lookup is retried if net->xfrm.xfrm_policy_hash_generation changed
 * meanwhile or if xfrm_pol_hold_rcu() fails on the chosen policy.
 *
 * Returns the policy after a successful xfrm_pol_hold_rcu(), NULL if the
 * flow has no source or destination address or nothing matched, or an
 * ERR_PTR() propagated from policy matching.
 */
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
                                                     const struct flowi *fl,
                                                     u16 family, u8 dir,
                                                     u32 if_id)
{
        struct xfrm_pol_inexact_candidates cand;
        const xfrm_address_t *daddr, *saddr;
        struct xfrm_pol_inexact_bin *bin;
        struct xfrm_policy *pol, *ret;
        struct hlist_head *chain;
        unsigned int sequence;
        int err;

        daddr = xfrm_flowi_daddr(fl, family);
        saddr = xfrm_flowi_saddr(fl, family);
        if (unlikely(!daddr || !saddr))
                return NULL;

        rcu_read_lock();
 retry:
        /* Pick the hash chain under a stable hash generation. */
        do {
                sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
                chain = policy_hash_direct(net, daddr, saddr, family, dir);
        } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));

        ret = NULL;
        hlist_for_each_entry_rcu(pol, chain, bydst) {
                err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
                if (err) {
                        /* -ESRCH means "not this one"; anything else aborts. */
                        if (err == -ESRCH)
                                continue;
                        else {
                                ret = ERR_PTR(err);
                                goto fail;
                        }
                } else {
                        ret = pol;
                        break;
                }
        }
        bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
                                                         daddr))
                goto skip_inexact;

        /* An inexact candidate may replace the hash-chain result. */
        pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
                                          family, dir, if_id);
        if (pol) {
                ret = pol;
                if (IS_ERR(pol))
                        goto fail;
        }

skip_inexact:
        /* The hash generation changed during the lookup: start over. */
        if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
                goto retry;

        /* Could not take a reference on the result: start over. */
        if (ret && !xfrm_pol_hold_rcu(ret))
                goto retry;
fail:
        rcu_read_unlock();

        return ret;
}

/*
 * Look up the policy for flow @fl.  With CONFIG_XFRM_SUB_POLICY the
 * XFRM_POLICY_TYPE_SUB lookup runs first and any non-NULL result (including
 * an ERR_PTR()) is returned; otherwise, or when it yields NULL, the
 * XFRM_POLICY_TYPE_MAIN lookup result is returned.
 */
static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
                                              const struct flowi *fl,
                                              u16 family, u8 dir, u32 if_id)
{
#ifdef CONFIG_XFRM_SUB_POLICY
        struct xfrm_policy *sub;

        sub = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
                                        dir, if_id);
        if (sub)
                return sub;
#endif
        return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
                                         dir, if_id);
}

/*
 * Check whether the per-socket policy sk->sk_policy[@dir] applies to @fl.
 *
 * Returns NULL when the socket has no policy for @dir, when the policy's
 * family, selector, mark or if_id do not match, or when
 * security_xfrm_policy_lookup() returns -ESRCH.  Returns an ERR_PTR() for
 * any other security error.  Otherwise returns the policy after
 * xfrm_pol_hold_rcu() succeeded on it.
 */
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
                                                 const struct flowi *fl,
                                                 u16 family, u32 if_id)
{
        struct xfrm_policy *pol;

        rcu_read_lock();
 again:
        pol = rcu_dereference(sk->sk_policy[dir]);
        if (pol != NULL) {
                bool match;
                int err = 0;

                if (pol->family != family) {
                        pol = NULL;
                        goto out;
                }

                match = xfrm_selector_match(&pol->selector, fl, family);
                if (match) {
                        /* The socket's own mark is compared, not the flow's. */
                        if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
                            pol->if_id != if_id) {
                                pol = NULL;
                                goto out;
                        }
                        err = security_xfrm_policy_lookup(pol->security,
                                                      fl->flowi_secid,
                                                      dir);
                        if (!err) {
                                /* Hold failed: re-read the socket's policy pointer. */
                                if (!xfrm_pol_hold_rcu(pol))
                                        goto again;
                        } else if (err == -ESRCH) {
                                pol = NULL;
                        } else {
                                pol = ERR_PTR(err);
                        }
                } else
                        pol = NULL;
        }
out:
        rcu_read_unlock();
        return pol;
}

/*
 * Account for a newly inserted policy: take a reference on @pol, put it at
 * the head of the namespace's policy_all list and bump policy_count[@dir].
 *
 * NOTE(review): no lock is taken here; the callers visible in this file
 * hold net->xfrm.xfrm_policy_lock around the call.
 */
static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        struct net *net = xp_net(pol);

        xfrm_pol_hold(pol);
        list_add(&pol->walk.all, &net->xfrm.policy_all);
        net->xfrm.policy_count[dir]++;
}

/*
 * Undo __xfrm_policy_link(): take @pol off the policy_all list, off its hash
 * lists if it is hashed, and decrement policy_count[@dir].
 *
 * Returns @pol, or NULL (doing nothing) when @pol is not on the policy_all
 * list.  The reference taken at link time is not dropped here.
 */
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct net *net;

        if (list_empty(&pol->walk.all))
                return NULL;

        net = xp_net(pol);

        /* Socket policies are not hashed. */
        if (!hlist_unhashed(&pol->bydst)) {
                hlist_del_rcu(&pol->bydst);
                hlist_del_init(&pol->bydst_inexact_list);
                hlist_del(&pol->byidx);
        }

        list_del_init(&pol->walk.all);
        net->xfrm.policy_count[dir]--;

        return pol;
}

/*
 * Link a per-socket policy.  Socket policies are accounted in the
 * policy_count[] slots starting at XFRM_POLICY_MAX.
 */
static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
{
        int sk_dir = XFRM_POLICY_MAX + dir;

        __xfrm_policy_link(pol, sk_dir);
}

/*
 * Unlink a per-socket policy, using the policy_count[] slot at
 * XFRM_POLICY_MAX + @dir.  The result of __xfrm_policy_unlink() is ignored.
 */
static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
{
        int sk_dir = XFRM_POLICY_MAX + dir;

        __xfrm_policy_unlink(pol, sk_dir);
}

/*
 * Unlink @pol under net->xfrm.xfrm_policy_lock and, if it was linked, pass
 * it to xfrm_policy_kill() after the lock is released.
 *
 * Returns 0 on success, or -ENOENT when @pol was not linked.
 */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        struct net *net = xp_net(pol);
        struct xfrm_policy *unlinked;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        unlinked = __xfrm_policy_unlink(pol, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        if (!unlinked)
                return -ENOENT;

        xfrm_policy_kill(unlinked);
        return 0;
}
EXPORT_SYMBOL(xfrm_policy_delete);

/*
 * Install @pol as the socket policy of @sk for @dir, replacing and killing
 * any previous one.  A NULL @pol just removes the existing policy.
 *
 * With CONFIG_XFRM_SUB_POLICY, a @pol whose type is not
 * XFRM_POLICY_TYPE_MAIN is rejected with -EINVAL.  Returns 0 otherwise.
 */
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct net *net = sock_net(sk);
        struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
                return -EINVAL;
#endif

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        old_pol = rcu_dereference_protected(sk->sk_policy[dir],
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));
        if (pol) {
                /* Stamp, index and link the new policy before publishing it. */
                pol->curlft.add_time = ktime_get_real_seconds();
                pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
                xfrm_sk_policy_link(pol, dir);
        }
        rcu_assign_pointer(sk->sk_policy[dir], pol);
        if (old_pol) {
                if (pol)
                        xfrm_policy_requeue(old_pol, pol);

                /* Unlinking succeeds always. This is the only function
                 * allowed to delete or replace socket policy.
                 */
                xfrm_sk_policy_unlink(old_pol, dir);
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* The old policy is killed only after the lock has been dropped. */
        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

/*
 * Allocate a copy of socket policy @old (GFP_ATOMIC) and link it as a
 * socket policy for @dir.
 *
 * The selector, security context, lifetimes, mark, if_id, action, flags,
 * index, type, family and the first xfrm_nr templates are copied.  After
 * linking, xfrm_pol_put() is called once on the new policy.
 *
 * Returns the new policy, or NULL if the allocation or
 * security_xfrm_policy_clone() failed.
 */
static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
        struct net *net = xp_net(old);
        struct xfrm_policy *newp;

        newp = xfrm_policy_alloc(net, GFP_ATOMIC);
        if (!newp)
                return NULL;

        newp->selector = old->selector;
        if (security_xfrm_policy_clone(old->security, &newp->security)) {
                kfree(newp);
                return NULL;  /* ENOMEM */
        }

        newp->lft = old->lft;
        newp->curlft = old->curlft;
        newp->mark = old->mark;
        newp->if_id = old->if_id;
        newp->action = old->action;
        newp->flags = old->flags;
        newp->xfrm_nr = old->xfrm_nr;
        newp->index = old->index;
        newp->type = old->type;
        newp->family = old->family;
        memcpy(newp->xfrm_vec, old->xfrm_vec,
               newp->xfrm_nr * sizeof(struct xfrm_tmpl));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        xfrm_sk_policy_link(newp, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        xfrm_pol_put(newp);

        return newp;
}

/*
 * Clone both per-direction socket policies of @osk onto @sk.
 *
 * For each of the two directions, a policy present on @osk is duplicated
 * with clone_policy() and published in sk->sk_policy[].  The loop stops at
 * the first failed clone.
 *
 * Returns 0 on success or -ENOMEM when a clone failed; a policy already
 * installed for an earlier direction is left in place in that case.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        int err = 0;
        int dir;

        rcu_read_lock();
        for (dir = 0; dir < 2; dir++) {
                const struct xfrm_policy *src;
                struct xfrm_policy *copy;

                src = rcu_dereference(osk->sk_policy[dir]);
                if (!src)
                        continue;

                copy = clone_policy(src, dir);
                if (unlikely(!copy)) {
                        err = -ENOMEM;
                        break;
                }
                rcu_assign_pointer(sk->sk_policy[dir], copy);
        }
        rcu_read_unlock();

        return err;
}

/*
 * Obtain a source address for @params through the get_saddr() hook of the
 * policy afinfo registered for @family, storing it in @saddr.
 *
 * Returns -EINVAL when no afinfo is available for @family, otherwise the
 * hook's return value.
 *
 * NOTE(review): the rcu_read_unlock() has no matching lock in this function;
 * presumably xfrm_policy_get_afinfo() returns with the RCU read lock held
 * when it succeeds - confirm against its definition.
 */
static int
xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr,
               const struct xfrm_dst_lookup_params *params)
{
        const struct xfrm_policy_afinfo *afinfo;
        int rc;

        afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(!afinfo))
                return -EINVAL;

        rc = afinfo->get_saddr(saddr, params);
        rcu_read_unlock();

        return rc;
}

/*
 * Resolve list of templates for the flow, given policy.
 *
 * For each template of @policy a state is looked up with xfrm_state_find();
 * states in XFRM_STATE_VALID are stored in @xfrm in template order.  For
 * tunnel and BEET mode templates the template's own addresses are used as
 * endpoints; when xfrm_addr_any() is true for the template's source address,
 * a source address is obtained through xfrm_get_saddr() instead.
 *
 * Returns the number of states stored in @xfrm, or a negative errno after
 * releasing the states collected so far.  A template without a usable state
 * fails the resolution only if it is not marked optional.
 */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
                      struct xfrm_state **xfrm, unsigned short family)
{
        struct net *net = xp_net(policy);
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
        xfrm_address_t tmp;

        for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                if (tmpl->mode == XFRM_MODE_TUNNEL ||
                    tmpl->mode == XFRM_MODE_BEET) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                        if (xfrm_addr_any(local, tmpl->encap_family)) {
                                struct xfrm_dst_lookup_params params;

                                memset(&params, 0, sizeof(params));
                                params.net = net;
                                params.oif = fl->flowi_oif;
                                params.daddr = remote;
                                error = xfrm_get_saddr(tmpl->encap_family, &tmp,
                                                       &params);
                                if (error)
                                        goto fail;
                                local = &tmp;
                        }
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
                                    family, policy->if_id);

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        /* Later templates start from this state's endpoints. */
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        /* State exists but is not valid: drop it and map to an errno. */
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                } else if (error == -ESRCH) {
                        error = -EAGAIN;
                }

                /* An optional template without a state is simply skipped. */
                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        /* Release every state stored so far, last to first. */
        for (nx--; nx >= 0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/*
 * Resolve the templates of all @npols policies in @pols into states.
 *
 * With a single policy the states are written straight into @xfrm.  With
 * several policies they are gathered in a local array and then placed into
 * @xfrm by xfrm_state_sort().
 *
 * Returns the number of states, -ENOBUFS when the templates of the next
 * policy would bring the running total to XFRM_MAX_DEPTH or more, or the
 * negative errno from xfrm_tmpl_resolve_one().  On failure every state
 * gathered so far is released.
 */
static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
                  struct xfrm_state **xfrm, unsigned short family)
{
        struct xfrm_state *scratch[XFRM_MAX_DEPTH];
        struct xfrm_state **out = (npols > 1) ? scratch : xfrm;
        int total = 0;
        int rc;
        int i;

        for (i = 0; i < npols; i++) {
                if (total + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
                        rc = -ENOBUFS;
                        goto fail;
                }

                rc = xfrm_tmpl_resolve_one(pols[i], fl, &out[total], family);
                if (rc < 0)
                        goto fail;

                total += rc;
        }

        /* found states are sorted for outbound processing */
        if (npols > 1)
                xfrm_state_sort(xfrm, out, total, family);

        return total;

 fail:
        while (--total >= 0)
                xfrm_state_put(out[total]);
        return rc;
}

/*
 * Return the TOS bits of flow @fl: for AF_INET the IPv4 flow's TOS masked
 * with IPTOS_RT_MASK, for any other family 0.
 */
static int xfrm_get_tos(const struct flowi *fl, int family)
{
        return (family == AF_INET) ?
                (IPTOS_RT_MASK & fl->u.ip4.flowi4_tos) : 0;
}

/*
 * Allocate an xfrm_dst for @family from the namespace's per-family dst_ops
 * and zero the part of it that follows the embedded dst_entry.
 *
 * Returns the new entry, ERR_PTR(-EINVAL) when no policy afinfo exists for
 * @family, or ERR_PTR(-ENOBUFS) when dst_alloc() fails.  A family other
 * than AF_INET (or AF_INET6 when IPv6 is enabled) hits BUG().
 *
 * NOTE(review): the rcu_read_unlock() has no matching lock in this function;
 * presumably xfrm_policy_get_afinfo() returns with the RCU read lock held
 * when it succeeds - confirm against its definition.
 */
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
        const struct xfrm_policy_afinfo *afinfo;
        struct dst_ops *ops;
        struct xfrm_dst *xdst;

        afinfo = xfrm_policy_get_afinfo(family);
        if (afinfo == NULL)
                return ERR_PTR(-EINVAL);

        switch (family) {
        case AF_INET:
                ops = &net->xfrm.xfrm4_dst_ops;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                ops = &net->xfrm.xfrm6_dst_ops;
                break;
#endif
        default:
                BUG();
        }

        xdst = dst_alloc(ops, NULL, 1, DST_OBSOLETE_NONE, 0);
        if (unlikely(!xdst)) {
                xdst = ERR_PTR(-ENOBUFS);
        } else {
                /* Zero the bytes of *xdst that follow the embedded dst_entry. */
                memset(&xdst->u.dst + 1, 0,
                       sizeof(*xdst) - sizeof(xdst->u.dst));
        }

        rcu_read_unlock();

        return xdst;
}

/*
 * For an AF_INET6 @dst, record the route cookie obtained from
 * rt6_get_cookie() and @nfheader_len in @path.  Other families are left
 * untouched.
 */
static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
                           int nfheader_len)
{
        struct rt6_info *rt;

        if (dst->ops->family != AF_INET6)
                return;

        rt = (struct rt6_info *)dst;
        path->path_cookie = rt6_get_cookie(rt);
        path->u.rt6.rt6i_nfheader_len = nfheader_len;
}

static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
                                const struct flowi *fl)
{
        const struct xfrm_policy_afinfo *afinfo =
                xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
        int err;

        if (!afinfo)
                return -EINVAL;

        err = afinfo->fill_dst(xdst, dev, fl);

        rcu_read_unlock();

        return err;
}


/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
 *
 * @policy: supplies the netns and the initial address family
 * @xfrm:   array of @nx states, one per bundle level
 * @bundle: output array; bundle[i] receives the xfrm_dst built for xfrm[i]
 * @nx:     number of levels to build
 * @fl:     flow providing addresses, TOS, mark and output interface
 * @dst:    route the bundle is stacked on
 *
 * Returns the dst_entry of the first level (xdst0) on success or an
 * ERR_PTR() on failure.
 */

static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
                                            struct xfrm_state **xfrm,
                                            struct xfrm_dst **bundle,
                                            int nx,
                                            const struct flowi *fl,
                                            struct dst_entry *dst)
{
        const struct xfrm_state_afinfo *afinfo;
        const struct xfrm_mode *inner_mode;
        struct net *net = xp_net(policy);
        unsigned long now = jiffies;
        struct net_device *dev;
        struct xfrm_dst *xdst_prev = NULL;
        struct xfrm_dst *xdst0 = NULL;
        int i = 0;
        int err;
        int header_len = 0;
        int nfheader_len = 0;
        int trailer_len = 0;
        int tos;
        int family = policy->selector.family;
        xfrm_address_t saddr, daddr;

        xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

        tos = xfrm_get_tos(fl, family);

        /* This reference is handed to the first level's ->route below,
         * or dropped again on the early error exits of the loop.
         */
        dst_hold(dst);

        for (; i < nx; i++) {
                struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
                struct dst_entry *dst1 = &xdst->u.dst;

                /* err is only meaningful when xdst is an error pointer. */
                err = PTR_ERR(xdst);
                if (IS_ERR(xdst)) {
                        dst_release(dst);
                        goto put_states;
                }

                bundle[i] = xdst;
                if (!xdst_prev)
                        xdst0 = xdst;
                else
                        /* Ref count is taken during xfrm_alloc_dst()
                         * No need to do dst_clone() on dst1
                         */
                        xfrm_dst_set_child(xdst_prev, &xdst->u.dst);

                if (xfrm[i]->sel.family == AF_UNSPEC) {
                        inner_mode = xfrm_ip2inner_mode(xfrm[i],
                                                        xfrm_af2proto(family));
                        if (!inner_mode) {
                                err = -EAFNOSUPPORT;
                                dst_release(dst);
                                goto put_states;
                        }
                } else
                        inner_mode = &xfrm[i]->inner_mode;

                /* The reference currently held on dst moves into ->route. */
                xdst->route = dst;
                dst_copy_metrics(dst1, dst);

                if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
                        __u32 mark = 0;
                        int oif;

                        if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
                                mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

                        /* Non-transport mode: look up a new route using the
                         * state's own family; it replaces dst for the levels
                         * that follow.
                         */
                        family = xfrm[i]->props.family;
                        oif = fl->flowi_oif ? : fl->flowi_l3mdev;
                        dst = xfrm_dst_lookup(xfrm[i], tos, oif,
                                              &saddr, &daddr, family, mark);
                        err = PTR_ERR(dst);
                        if (IS_ERR(dst))
                                goto put_states;
                } else
                        /* Transport mode keeps the same route; take a fresh
                         * reference for the next user of dst.
                         */
                        dst_hold(dst);

                dst1->xfrm = xfrm[i];
                xdst->xfrm_genid = xfrm[i]->genid;

                dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
                dst1->lastuse = now;

                dst1->input = dst_discard;

                /* Pick the output handler of the inner mode's family, or
                 * discard output when that family has no state afinfo.
                 */
                rcu_read_lock();
                afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
                if (likely(afinfo))
                        dst1->output = afinfo->output;
                else
                        dst1->output = dst_discard_out;
                rcu_read_unlock();

                xdst_prev = xdst;

                header_len += xfrm[i]->props.header_len;
                if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
                        nfheader_len += xfrm[i]->props.header_len;
                trailer_len += xfrm[i]->props.trailer_len;
        }

        /* Terminate the chain with the final route and remember it as the
         * bundle's path.
         */
        xfrm_dst_set_child(xdst_prev, dst);
        xdst0->path = dst;

        err = -ENODEV;
        dev = dst->dev;
        if (!dev)
                goto free_dst;

        xfrm_init_path(xdst0, dst, nfheader_len);
        xfrm_init_pmtu(bundle, nx);

        /* Walk the levels from xdst0 down to (but excluding) the final
         * route.  Each level gets the header/trailer totals of itself plus
         * all levels below it, hence the subtraction after each step.
         */
        for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
             xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
                err = xfrm_fill_dst(xdst_prev, dev, fl);
                if (err)
                        goto free_dst;

                xdst_prev->u.dst.header_len = header_len;
                xdst_prev->u.dst.trailer_len = trailer_len;
                header_len -= xdst_prev->u.dst.xfrm->props.header_len;
                trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
        }

        return &xdst0->u.dst;

put_states:
        /* Drop the states from index i onward, i.e. those not yet stored
         * in a level's ->xfrm.  NOTE(review): states already attached are
         * presumably released when the bundle itself is freed - confirm.
         */
        for (; i < nx; i++)
                xfrm_state_put(xfrm[i]);
free_dst:
        if (xdst0)
                dst_release_immediate(&xdst0->u.dst);

        return ERR_PTR(err);
}

/* Validate pols[0] and, with CONFIG_XFRM_SUB_POLICY, add the matching
 * main policy as pols[1] when pols[0] is an allowing sub policy.
 *
 * On return *num_pols is the number of valid entries in @pols and
 * *num_xfrms is the sum of their xfrm_nr, or -1 if any of them has an
 * action other than XFRM_POLICY_ALLOW.  With no policy both are 0.
 *
 * Returns 0 on success.  If pols[0] or the looked-up main policy is an
 * error pointer, that error is returned and *num_pols is reset to 0
 * (policies already held are put in the latter case).
 */
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
                                struct xfrm_policy **pols,
                                int *num_pols, int *num_xfrms)
{
        int idx;

        if (*num_pols == 0 || !pols[0]) {
                *num_pols = 0;
                *num_xfrms = 0;
                return 0;
        }

        if (IS_ERR(pols[0])) {
                *num_pols = 0;
                return PTR_ERR(pols[0]);
        }

        *num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pols[0]->action == XFRM_POLICY_ALLOW &&
            pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                struct xfrm_policy *main_pol;

                main_pol = xfrm_policy_lookup_bytype(xp_net(pols[0]),
                                                     XFRM_POLICY_TYPE_MAIN,
                                                     fl, family,
                                                     XFRM_POLICY_OUT,
                                                     pols[0]->if_id);
                pols[1] = main_pol;

                if (IS_ERR(main_pol)) {
                        xfrm_pols_put(pols, *num_pols);
                        *num_pols = 0;
                        return PTR_ERR(main_pol);
                }
                if (main_pol) {
                        *num_pols += 1;
                        *num_xfrms += main_pol->xfrm_nr;
                }
        }
#endif

        /* One non-allowing policy is enough to mark the whole set. */
        for (idx = 0; idx < *num_pols; idx++) {
                if (pols[idx]->action == XFRM_POLICY_ALLOW)
                        continue;

                *num_xfrms = -1;
                break;
        }

        return 0;
}

/* Resolve the templates of @pols into states and build a bundle on top
 * of @dst_orig.
 *
 * Returns the new bundle, NULL when xfrm_tmpl_resolve() yields no
 * states, or an ERR_PTR().  Resolve errors other than -EAGAIN bump
 * LINUX_MIB_XFRMOUTPOLERROR; bundle creation errors bump
 * LINUX_MIB_XFRMOUTBUNDLEGENERROR.
 */
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
                               const struct flowi *fl, u16 family,
                               struct dst_entry *dst_orig)
{
        struct xfrm_state *states[XFRM_MAX_DEPTH];
        struct xfrm_dst *levels[XFRM_MAX_DEPTH];
        struct net *net = xp_net(pols[0]);
        struct dst_entry *top;
        struct xfrm_dst *xdst;
        int nx;

        /* Try to instantiate a bundle: nx > 0 is the number of states. */
        nx = xfrm_tmpl_resolve(pols, num_pols, fl, states, family);
        if (nx == 0)
                return NULL;
        if (nx < 0) {
                if (nx != -EAGAIN)
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
                return ERR_PTR(nx);
        }

        top = xfrm_bundle_create(pols[0], states, levels, nx, fl, dst_orig);
        if (IS_ERR(top)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
                return ERR_CAST(top);
        }

        /* Record which policies and how many states the bundle stands for. */
        xdst = (struct xfrm_dst *)top;
        xdst->num_xfrms = nx;
        xdst->num_pols = num_pols;
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
        xdst->policy_genid = atomic_read(&pols[0]->genid);

        return xdst;
}

/* Timer callback for a policy's hold queue.
 *
 * Retries the lookup for the first queued packet.  If the result is
 * still a queueing bundle (DST_XFRM_QUEUE) the timeout is doubled and
 * the timer re-armed, up to XFRM_QUEUE_TMO_MAX.  If the lookup yields a
 * usable dst, every queued packet is looked up again and sent through
 * dst_output().  On lookup error or exhausted timeout the queue is
 * purged.  One policy reference is dropped on every exit path.
 */
static void xfrm_policy_queue_process(struct timer_list *t)
{
        struct sk_buff *skb;
        struct sock *sk;
        struct dst_entry *dst;
        struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer);
        struct net *net = xp_net(pol);
        struct xfrm_policy_queue *pq = &pol->polq;
        struct flowi fl;
        struct sk_buff_head list;
        __u32 skb_mark;

        /* Only peek: the skb stays on the hold queue while we probe. */
        spin_lock(&pq->hold_queue.lock);
        skb = skb_peek(&pq->hold_queue);
        if (!skb) {
                spin_unlock(&pq->hold_queue.lock);
                goto out;
        }
        dst = skb_dst(skb);
        sk = skb->sk;

        /* Fixup the mark to support VTI. */
        /* The policy's mark is used only while decoding; the skb's own
         * mark is restored right after.
         */
        skb_mark = skb->mark;
        skb->mark = pol->mark.v;
        xfrm_decode_session(skb, &fl, dst->ops->family);
        skb->mark = skb_mark;
        spin_unlock(&pq->hold_queue.lock);

        /* Extra reference on the path dst that is passed to xfrm_lookup()
         * as dst_orig.
         */
        dst_hold(xfrm_dst_path(dst));
        dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(dst))
                goto purge_queue;

        if (dst->flags & DST_XFRM_QUEUE) {
                /* Still only a queueing bundle: back off and retry later. */
                dst_release(dst);

                if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
                        goto purge_queue;

                pq->timeout = pq->timeout << 1;
                /* mod_timer() returning 0 means the timer was not pending;
                 * NOTE(review): the reference taken here presumably belongs
                 * to the newly armed timer - confirm.
                 */
                if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
                        xfrm_pol_hold(pol);
                goto out;
        }

        /* The probe succeeded; its dst is not needed any further. */
        dst_release(dst);

        __skb_queue_head_init(&list);

        /* Move all queued packets to a private list under the lock and
         * process them without holding it.
         */
        spin_lock(&pq->hold_queue.lock);
        pq->timeout = 0;
        skb_queue_splice_init(&pq->hold_queue, &list);
        spin_unlock(&pq->hold_queue.lock);

        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);

                /* Fixup the mark to support VTI. */
                skb_mark = skb->mark;
                skb->mark = pol->mark.v;
                xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
                skb->mark = skb_mark;

                dst_hold(xfrm_dst_path(skb_dst(skb)));
                dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
                if (IS_ERR(dst)) {
                        kfree_skb(skb);
                        continue;
                }

                /* Replace the skb's old dst with the freshly looked-up one
                 * before transmitting.
                 */
                nf_reset_ct(skb);
                skb_dst_drop(skb);
                skb_dst_set(skb, dst);

                dst_output(net, skb_to_full_sk(skb), skb);
        }

out:
        xfrm_pol_put(pol);
        return;

purge_queue:
        /* Give up: reset the back-off and drop everything still queued. */
        pq->timeout = 0;
        skb_queue_purge(&pq->hold_queue);
        xfrm_pol_put(pol);
}

/* Output handler that parks @skb on the hold queue of the first policy
 * of its dst and (re)arms that queue's timer.
 *
 * Returns 0 when the skb was queued, and also when it was dropped
 * because skb_fclone_busy() reported it busy; returns -EAGAIN (skb
 * freed) when the queue already holds more than XFRM_MAX_QUEUE_LEN
 * packets.
 */
static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned long sched_next;
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
        struct xfrm_policy *pol = xdst->pols[0];
        struct xfrm_policy_queue *pq = &pol->polq;

        if (unlikely(skb_fclone_busy(sk, skb))) {
                kfree_skb(skb);
                return 0;
        }

        /* Note: this length check runs before the queue lock is taken. */
        if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
                kfree_skb(skb);
                return -EAGAIN;
        }

        skb_dst_force(skb);

        spin_lock_bh(&pq->hold_queue.lock);

        /* A zero timeout means no back-off is in progress; start at the
         * minimum.
         */
        if (!pq->timeout)
                pq->timeout = XFRM_QUEUE_TMO_MIN;

        sched_next = jiffies + pq->timeout;

        /* If a pending timer was cancelled, keep its expiry when that is
         * earlier, and drop a policy reference.  NOTE(review): presumably
         * the one the pending timer was holding - confirm.
         */
        if (del_timer(&pq->hold_timer)) {
                if (time_before(pq->hold_timer.expires, sched_next))
                        sched_next = pq->hold_timer.expires;
                xfrm_pol_put(pol);
        }

        __skb_queue_tail(&pq->hold_queue, skb);
        /* mod_timer() returns 0 when the timer was not pending, in which
         * case a policy reference is taken for the newly armed timer.
         */
        if (!mod_timer(&pq->hold_timer, sched_next))
                xfrm_pol_hold(pol);

        spin_unlock_bh(&pq->hold_queue.lock);

        return 0;
}

/* Create a placeholder bundle for a flow whose policies were found but
 * for which no real bundle could be built.
 *
 * A bare xfrm_dst is returned unless queueing was requested
 * (XFRM_LOOKUP_QUEUE in @xflo->flags), larval drop is disabled and
 * @num_xfrms is positive.  In that case the entry is stacked on
 * @xflo->dst_orig, flagged DST_XFRM_QUEUE and given xdst_queue_output()
 * as its output handler.
 *
 * Returns the xfrm_dst or an ERR_PTR().
 */
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
                                                 struct xfrm_flo *xflo,
                                                 const struct flowi *fl,
                                                 int num_xfrms,
                                                 u16 family)
{
        struct dst_entry *orig;
        struct dst_entry *dst1;
        struct net_device *dev;
        struct xfrm_dst *xdst;
        int err;

        xdst = xfrm_alloc_dst(net, family);
        if (IS_ERR(xdst))
                return xdst;

        if (!(xflo->flags & XFRM_LOOKUP_QUEUE))
                return xdst;
        if (net->xfrm.sysctl_larval_drop)
                return xdst;
        if (num_xfrms <= 0)
                return xdst;

        orig = xflo->dst_orig;
        dst1 = &xdst->u.dst;

        /* First reference on the original route: owned by ->route. */
        dst_hold(orig);
        xdst->route = orig;

        dst_copy_metrics(dst1, orig);

        dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
        dst1->flags |= DST_XFRM_QUEUE;
        dst1->lastuse = jiffies;

        dst1->input = dst_discard;
        dst1->output = xdst_queue_output;

        /* Second reference: owned by the child link. */
        dst_hold(orig);
        xfrm_dst_set_child(xdst, orig);
        xdst->path = orig;

        xfrm_init_path((struct xfrm_dst *)dst1, orig, 0);

        dev = orig->dev;
        if (!dev) {
                err = -ENODEV;
                goto free_dst;
        }

        err = xfrm_fill_dst(xdst, dev, fl);
        if (err)
                goto free_dst;

        return xdst;

free_dst:
        dst_release(dst1);
        return ERR_PTR(err);
}

/* Look up the policies for @fl and build the matching bundle.
 *
 * Returns:
 *  - NULL when no policy applies, or when bundle creation reported
 *    -EREMOTE (policies are put in that case);
 *  - a real bundle when the templates could be resolved;
 *  - a dummy bundle carrying the policies when the policy set has no
 *    transforms to apply (num_xfrms <= 0), when resolution returned
 *    -EAGAIN, or when it produced no states;
 *  - an ERR_PTR() otherwise, with the policies put.
 */
static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
                                           const struct flowi *fl,
                                           u16 family, u8 dir,
                                           struct xfrm_flo *xflo, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols = 1, num_xfrms = 0, err;
        struct xfrm_dst *xdst;

        /* Resolve policies to use if we couldn't get them from
         * previous cache entry */
        pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
        err = xfrm_expand_policies(fl, family, pols,
                                   &num_pols, &num_xfrms);
        if (err < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
                goto error;
        }
        if (num_pols == 0)
                return NULL;

        if (num_xfrms > 0) {
                xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl,
                                                      family, xflo->dst_orig);
                if (IS_ERR(xdst)) {
                        err = PTR_ERR(xdst);
                        switch (err) {
                        case -EREMOTE:
                                xfrm_pols_put(pols, num_pols);
                                return NULL;
                        case -EAGAIN:
                                /* fall back to a dummy bundle below */
                                break;
                        default:
                                goto error;
                        }
                } else if (xdst) {
                        return xdst;
                } else {
                        num_xfrms = 0;
                }
        }

        /* We found policies, but there's no bundles to instantiate:
         * either because the policy blocks, has no transformations or
         * we could not build template (no xfrm_states).*/
        xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
        if (IS_ERR(xdst)) {
                xfrm_pols_put(pols, num_pols);
                return ERR_CAST(xdst);
        }

        xdst->num_pols = num_pols;
        xdst->num_xfrms = num_xfrms;
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

        return xdst;

error:
        xfrm_pols_put(pols, num_pols);
        return ERR_PTR(err);
}

/* Ask the address family's blackhole_route() hook for a replacement of
 * @dst_orig.
 *
 * Returns the hook's result.  If @family has no policy afinfo,
 * @dst_orig is released and ERR_PTR(-EINVAL) is returned.
 */
static struct dst_entry *make_blackhole(struct net *net, u16 family,
                                        struct dst_entry *dst_orig)
{
        const struct xfrm_policy_afinfo *afinfo;
        struct dst_entry *blackhole;

        afinfo = xfrm_policy_get_afinfo(family);
        if (!afinfo) {
                dst_release(dst_orig);
                return ERR_PTR(-EINVAL);
        }

        blackhole = afinfo->blackhole_route(net, dst_orig);

        /* NOTE(review): presumably pairs with an RCU read lock taken by
         * xfrm_policy_get_afinfo() when it returns non-NULL - confirm.
         */
        rcu_read_unlock();

        return blackhole;
}

/* Finds/creates a bundle for given flow and if_id
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 *
 * xfrm_lookup uses an if_id of 0 by default, and is provided for
 * compatibility
 *
 * @net:      network namespace
 * @dst_orig: plain route for the flow
 * @fl:       flow to look up
 * @sk:       originating socket, may be NULL; its output policy, if any,
 *            is consulted before the global policies
 * @flags:    XFRM_LOOKUP_* flags
 * @if_id:    interface id handed to the policy lookups
 *
 * Returns the dst to use: the bundle when the flow is transformed, or
 * @dst_orig when it passes untransformed or no policy applies.  On
 * failure an ERR_PTR() is returned and the reference on @dst_orig is
 * dropped unless XFRM_LOOKUP_KEEP_DST_REF is set in @flags.
 */
struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk,
                                        int flags, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        struct xfrm_dst *xdst;
        struct dst_entry *dst, *route;
        u16 family = dst_orig->ops->family;
        u8 dir = XFRM_POLICY_OUT;
        /* drop_pols: number of pols[] entries this function has to put
         * itself on exit; only set on the socket-policy paths that end up
         * without a bundle.
         */
        int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

        dst = NULL;
        xdst = NULL;
        route = NULL;

        /* Per-socket output policy takes precedence when present. */
        sk = sk_const_to_full_sk(sk);
        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
                num_pols = 1;
                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
                                                if_id);
                err = xfrm_expand_policies(fl, family, pols,
                                           &num_pols, &num_xfrms);
                if (err < 0)
                        goto dropdst;

                if (num_pols) {
                        if (num_xfrms <= 0) {
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        xdst = xfrm_resolve_and_create_bundle(
                                        pols, num_pols, fl,
                                        family, dst_orig);

                        if (IS_ERR(xdst)) {
                                xfrm_pols_put(pols, num_pols);
                                err = PTR_ERR(xdst);
                                if (err == -EREMOTE)
                                        goto nopol;

                                goto dropdst;
                        } else if (xdst == NULL) {
                                num_xfrms = 0;
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        route = xdst->route;
                }
        }

        /* No bundle from a socket policy: consult the global policies. */
        if (xdst == NULL) {
                struct xfrm_flo xflo;

                xflo.dst_orig = dst_orig;
                xflo.flags = flags;

                /* To accelerate a bit...  */
                if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
                               !net->xfrm.policy_count[XFRM_POLICY_OUT]))
                        goto nopol;

                xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
                if (xdst == NULL)
                        goto nopol;
                if (IS_ERR(xdst)) {
                        err = PTR_ERR(xdst);
                        goto dropdst;
                }

                /* Copy the policy set recorded in the bundle for the
                 * checks below.
                 */
                num_pols = xdst->num_pols;
                num_xfrms = xdst->num_xfrms;
                memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
                route = xdst->route;
        }

        dst = &xdst->u.dst;
        if (route == NULL && num_xfrms > 0) {
                /* The only case when xfrm_bundle_lookup() returns a
                 * bundle with null route, is when the template could
                 * not be resolved. It means policies are there, but
                 * bundle could not be created, since we don't yet
                 * have the xfrm_state's. We need to wait for KM to
                 * negotiate new SA's or bail out with error.*/
                /* Both outcomes count XFRMOUTNOSTATES; only the error code
                 * differs (-EREMOTE with larval drop, -EAGAIN without).
                 */
                if (net->xfrm.sysctl_larval_drop) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                        err = -EREMOTE;
                        goto error;
                }

                err = -EAGAIN;

                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                goto error;
        }

no_transform:
        if (num_pols == 0)
                goto nopol;

        if ((flags & XFRM_LOOKUP_ICMP) &&
            !(pols[0]->flags & XFRM_POLICY_ICMP)) {
                err = -ENOENT;
                goto error;
        }

        /* Stamp every policy involved with the current wall-clock time. */
        for (i = 0; i < num_pols; i++)
                pols[i]->curlft.use_time = ktime_get_real_seconds();

        if (num_xfrms < 0) {
                /* Prohibit the flow */
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
                err = -EPERM;
                goto error;
        } else if (num_xfrms > 0) {
                /* Flow transformed */
                /* The bundle is returned; the caller's reference on the
                 * plain route is dropped.
                 */
                dst_release(dst_orig);
        } else {
                /* Flow passes untransformed */
                /* Release the (dummy) bundle, if any, and hand back the
                 * plain route.
                 */
                dst_release(dst);
                dst = dst_orig;
        }
ok:
        xfrm_pols_put(pols, drop_pols);
        /* Mark the returned dst when its state runs in tunnel mode. */
        if (dst && dst->xfrm &&
            dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
                dst->flags |= DST_XFRM_TUNNEL;
        return dst;

nopol:
        /* No policy applies.  A default policy of "block" for this
         * direction rejects the flow unless the route's device is a
         * loopback device.
         */
        if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) &&
            net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                err = -EPERM;
                goto error;
        }
        if (!(flags & XFRM_LOOKUP_ICMP)) {
                dst = dst_orig;
                goto ok;
        }
        err = -ENOENT;
error:
        /* dst is either NULL or the bundle obtained above. */
        dst_release(dst);
dropdst:
        if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
                dst_release(dst_orig);
        xfrm_pols_put(pols, drop_pols);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup_with_ifid);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags)
{
        return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
}
EXPORT_SYMBOL(xfrm_lookup);

/* Variant of xfrm_lookup() that always requests packet queueing
 * (XFRM_LOOKUP_QUEUE) and turns an -EREMOTE result into a blackhole
 * route built from @dst_orig.  Any other error drops the reference on
 * @dst_orig here, since the lookup is told to keep it
 * (XFRM_LOOKUP_KEEP_DST_REF).
 *
 * Callers of xfrm_lookup_route() must ensure a call to dst_output().
 * Otherwise we may send out blackholed packets.
 */
struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl,
                                    const struct sock *sk, int flags)
{
        struct dst_entry *dst;

        dst = xfrm_lookup(net, dst_orig, fl, sk,
                          flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF);
        if (!IS_ERR(dst))
                return dst;

        if (PTR_ERR(dst) == -EREMOTE)
                return make_blackhole(net, dst_orig->ops->family, dst_orig);

        dst_release(dst_orig);

        return dst;
}
EXPORT_SYMBOL(xfrm_lookup_route);

/* Invoke the type-specific reject() hook of the secpath state at @idx.
 *
 * Returns 0 when @skb has no secpath, @idx is out of range or the
 * state's type has no reject hook; otherwise the hook's return value.
 */
static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
        struct sec_path *sp = skb_sec_path(skb);
        struct xfrm_state *x;

        if (!sp)
                return 0;
        if (idx < 0 || idx >= sp->len)
                return 0;

        x = sp->xvec[idx];

        return x->type->reject ? x->type->reject(x, skb, fl) : 0;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we make this in maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

/* Check whether state @x satisfies template @tmpl.
 *
 * For a state flagged by xfrm_state_kern(), the template must be
 * optional and the addresses must match for tmpl->encap_family.
 * Otherwise protocol, SPI, reqid (the latter two only when set in the
 * template), mode, algorithm, addresses (non-transport mode only) and
 * @if_id (when non-zero) must all agree.
 *
 * Returns non-zero on a match, 0 otherwise.
 */
static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
              unsigned short family, u32 if_id)
{
        if (xfrm_state_kern(x)) {
                if (!tmpl->optional)
                        return 0;
                return !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
        }

        if (x->id.proto != tmpl->id.proto)
                return 0;

        /* A zero SPI / reqid in the template acts as a wildcard. */
        if (tmpl->id.spi && x->id.spi != tmpl->id.spi)
                return 0;
        if (tmpl->reqid && x->props.reqid != tmpl->reqid)
                return 0;

        if (x->props.mode != tmpl->mode)
                return 0;

        /* The algorithm bitmap only matters for protocols matched by
         * IPSEC_PROTO_ANY and only when the template does not accept all
         * algorithms.
         * NOTE(review): aalgo is used directly as a shift count; confirm
         * its range stays below the width of int.
         */
        if (!tmpl->allalgs &&
            !(tmpl->aalgos & (1<<x->props.aalgo)) &&
            xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))
                return 0;

        if (x->props.mode != XFRM_MODE_TRANSPORT &&
            xfrm_state_addr_cmp(tmpl, x, family))
                return 0;

        return if_id == 0 || if_id == x->if_id;
}

/*
 * Match template @tmpl against the secpath @sp, starting at index @start.
 *
 * A value >= 0 is returned when validation succeeds: either @start
 * itself (bypass because the template is optional; for optional
 * transport mode this happens without looking at the secpath), or the
 * index following the matched secpath state.
 * -1 is returned when no matching state is found for a required template.
 * Otherwise "-2 - errored_index" is returned, where errored_index is the
 * first unverified non-transport state that did not match.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
               unsigned short family, u32 if_id)
{
        int miss;       /* result when no secpath state matches */
        int idx;

        if (!tmpl->optional)
                miss = -1;
        else if (tmpl->mode == XFRM_MODE_TRANSPORT)
                return start;
        else
                miss = start;

        for (idx = start; idx < sp->len; idx++) {
                const struct xfrm_state *x = sp->xvec[idx];

                if (xfrm_state_ok(tmpl, x, family, if_id))
                        return idx + 1;

                /* Non-matching transport-mode states are skipped. */
                if (x->props.mode == XFRM_MODE_TRANSPORT)
                        continue;

                /* Secpath entry previously verified, consider optional and
                 * continue searching
                 */
                if (idx < sp->verified_cnt)
                        continue;

                /* An unverified non-transport state ends the search. */
                return miss == -1 ? -2 - idx : miss;
        }

        return miss;
}

/* Fill the IPv4 part of @fl from the headers of @skb.
 *
 * Mark, output interface, protocol, addresses and TOS (without the ECN
 * bits) are always set.  For unfragmented packets the protocol-specific
 * keys (ports, ICMP type/code, IPsec SPI, GRE key) are added when the
 * required bytes of the transport header are available.  With @reverse
 * set, source and destination are swapped and skb->skb_iif is used as
 * the output interface.
 */
static void
decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
{
        const struct iphdr *iph = ip_hdr(skb);
        int ihl = iph->ihl;
        /* Start of the transport header: network header plus IHL words. */
        u8 *xprth = skb_network_header(skb) + ihl * 4;
        struct flowi4 *fl4 = &fl->u.ip4;
        int oif = 0;

        if (skb_dst(skb) && skb_dst(skb)->dev)
                oif = skb_dst(skb)->dev->ifindex;

        memset(fl4, 0, sizeof(struct flowi4));
        fl4->flowi4_mark = skb->mark;
        fl4->flowi4_oif = reverse ? skb->skb_iif : oif;

        fl4->flowi4_proto = iph->protocol;
        fl4->daddr = reverse ? iph->saddr : iph->daddr;
        fl4->saddr = reverse ? iph->daddr : iph->saddr;
        fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;

        /* In every case below the needed bytes must either end before
         * skb->data or be pullable; xprth is then recomputed because
         * pskb_may_pull() may have moved the header data.
         */
        if (!ip_is_fragment(iph)) {
                switch (iph->protocol) {
                case IPPROTO_UDP:
                case IPPROTO_UDPLITE:
                case IPPROTO_TCP:
                case IPPROTO_SCTP:
                case IPPROTO_DCCP:
                        if (xprth + 4 < skb->data ||
                            pskb_may_pull(skb, xprth + 4 - skb->data)) {
                                __be16 *ports;

                                xprth = skb_network_header(skb) + ihl * 4;
                                ports = (__be16 *)xprth;

                                /* ports[0] is the source port, ports[1] the
                                 * destination port; @reverse swaps them.
                                 */
                                fl4->fl4_sport = ports[!!reverse];
                                fl4->fl4_dport = ports[!reverse];
                        }
                        break;
                case IPPROTO_ICMP:
                        if (xprth + 2 < skb->data ||
                            pskb_may_pull(skb, xprth + 2 - skb->data)) {
                                u8 *icmp;

                                xprth = skb_network_header(skb) + ihl * 4;
                                icmp = xprth;

                                fl4->fl4_icmp_type = icmp[0];
                                fl4->fl4_icmp_code = icmp[1];
                        }
                        break;
                case IPPROTO_ESP:
                        if (xprth + 4 < skb->data ||
                            pskb_may_pull(skb, xprth + 4 - skb->data)) {
                                __be32 *ehdr;

                                xprth = skb_network_header(skb) + ihl * 4;
                                ehdr = (__be32 *)xprth;

                                /* SPI is the first 32-bit word of ESP. */
                                fl4->fl4_ipsec_spi = ehdr[0];
                        }
                        break;
                case IPPROTO_AH:
                        if (xprth + 8 < skb->data ||
                            pskb_may_pull(skb, xprth + 8 - skb->data)) {
                                __be32 *ah_hdr;

                                xprth = skb_network_header(skb) + ihl * 4;
                                ah_hdr = (__be32 *)xprth;

                                /* SPI is the second 32-bit word of AH. */
                                fl4->fl4_ipsec_spi = ah_hdr[1];
                        }
                        break;
                case IPPROTO_COMP:
                        if (xprth + 4 < skb->data ||
                            pskb_may_pull(skb, xprth + 4 - skb->data)) {
                                __be16 *ipcomp_hdr;

                                xprth = skb_network_header(skb) + ihl * 4;
                                ipcomp_hdr = (__be16 *)xprth;

                                /* The second 16-bit word, widened to 32
                                 * bits, is used as the SPI.
                                 */
                                fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
                        }
                        break;
                case IPPROTO_GRE:
                        if (xprth + 12 < skb->data ||
                            pskb_may_pull(skb, xprth + 12 - skb->data)) {
                                __be16 *greflags;
                                __be32 *gre_hdr;

                                xprth = skb_network_header(skb) + ihl * 4;
                                greflags = (__be16 *)xprth;
                                gre_hdr = (__be32 *)xprth;

                                /* The key follows the first 32-bit word,
                                 * one word later when GRE_CSUM is also set.
                                 */
                                if (greflags[0] & GRE_KEY) {
                                        if (greflags[0] & GRE_CSUM)
                                                gre_hdr++;
                                        fl4->fl4_gre_key = gre_hdr[1];
                                }
                        }
                        break;
                default:
                        fl4->fl4_ipsec_spi = 0;
                        break;
                }
        }
}

#if IS_ENABLED(CONFIG_IPV6)
/* Fill the IPv6 part of @fl from the headers of @skb.
 *
 * Mark, output interface and addresses are always set.  The extension
 * header chain is then walked until an upper-layer protocol is found,
 * which sets flowi6_proto and, when available, its keys (ports, ICMPv6
 * type/code, GRE key, MH type).  Once a fragment header has been seen
 * only the protocol is recorded.  If the header bytes cannot be pulled
 * the walk stops and flowi6_proto keeps its zeroed value.  With @reverse
 * set, source and destination are swapped and skb->skb_iif is used as
 * the output interface.
 */
static void
decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
{
        struct flowi6 *fl6 = &fl->u.ip6;
        int onlyproto = 0;
        const struct ipv6hdr *hdr = ipv6_hdr(skb);
        u32 offset = sizeof(*hdr);
        struct ipv6_opt_hdr *exthdr;
        const unsigned char *nh = skb_network_header(skb);
        u16 nhoff = IP6CB(skb)->nhoff;
        int oif = 0;
        u8 nexthdr;

        /* Without a recorded next-header offset, use the nexthdr field of
         * the fixed IPv6 header.
         */
        if (!nhoff)
                nhoff = offsetof(struct ipv6hdr, nexthdr);

        nexthdr = nh[nhoff];

        if (skb_dst(skb) && skb_dst(skb)->dev)
                oif = skb_dst(skb)->dev->ifindex;

        memset(fl6, 0, sizeof(struct flowi6));
        fl6->flowi6_mark = skb->mark;
        fl6->flowi6_oif = reverse ? skb->skb_iif : oif;

        fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
        fl6->saddr = reverse ? hdr->daddr : hdr->saddr;

        /* Each iteration needs one ipv6_opt_hdr worth of bytes at offset.
         * nh is reloaded after every possible pull because
         * pskb_may_pull() may have moved the header data.
         */
        while (nh + offset + sizeof(*exthdr) < skb->data ||
               pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
                nh = skb_network_header(skb);
                exthdr = (struct ipv6_opt_hdr *)(nh + offset);

                switch (nexthdr) {
                case NEXTHDR_FRAGMENT:
                        /* From here on only the protocol is recorded. */
                        onlyproto = 1;
                        fallthrough;
                case NEXTHDR_ROUTING:
                case NEXTHDR_HOP:
                case NEXTHDR_DEST:
                        /* Skip this extension header and continue with the
                         * one it points to.
                         */
                        offset += ipv6_optlen(exthdr);
                        nexthdr = exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr *)(nh + offset);
                        break;
                case IPPROTO_UDP:
                case IPPROTO_UDPLITE:
                case IPPROTO_TCP:
                case IPPROTO_SCTP:
                case IPPROTO_DCCP:
                        if (!onlyproto && (nh + offset + 4 < skb->data ||
                             pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
                                __be16 *ports;

                                nh = skb_network_header(skb);
                                ports = (__be16 *)(nh + offset);
                                /* ports[0] is the source port, ports[1] the
                                 * destination port; @reverse swaps them.
                                 */
                                fl6->fl6_sport = ports[!!reverse];
                                fl6->fl6_dport = ports[!reverse];
                        }
                        fl6->flowi6_proto = nexthdr;
                        return;
                case IPPROTO_ICMPV6:
                        if (!onlyproto && (nh + offset + 2 < skb->data ||
                            pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
                                u8 *icmp;

                                nh = skb_network_header(skb);
                                icmp = (u8 *)(nh + offset);
                                fl6->fl6_icmp_type = icmp[0];
                                fl6->fl6_icmp_code = icmp[1];
                        }
                        fl6->flowi6_proto = nexthdr;
                        return;
                case IPPROTO_GRE:
                        if (!onlyproto &&
                            (nh + offset + 12 < skb->data ||
                             pskb_may_pull(skb, nh + offset + 12 - skb->data))) {
                                struct gre_base_hdr *gre_hdr;
                                __be32 *gre_key;

                                nh = skb_network_header(skb);
                                gre_hdr = (struct gre_base_hdr *)(nh + offset);
                                gre_key = (__be32 *)(gre_hdr + 1);

                                /* The key follows the base header, one word
                                 * later when GRE_CSUM is also set.
                                 */
                                if (gre_hdr->flags & GRE_KEY) {
                                        if (gre_hdr->flags & GRE_CSUM)
                                                gre_key++;
                                        fl6->fl6_gre_key = *gre_key;
                                }
                        }
                        fl6->flowi6_proto = nexthdr;
                        return;

#if IS_ENABLED(CONFIG_IPV6_MIP6)
                case IPPROTO_MH:
                        /* NOTE(review): offset is advanced by ipv6_optlen()
                         * before the MH header is read - confirm this is
                         * intended.
                         */
                        offset += ipv6_optlen(exthdr);
                        if (!onlyproto && (nh + offset + 3 < skb->data ||
                            pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
                                struct ip6_mh *mh;

                                nh = skb_network_header(skb);
                                mh = (struct ip6_mh *)(nh + offset);
                                fl6->fl6_mh_type = mh->ip6mh_type;
                        }
                        fl6->flowi6_proto = nexthdr;
                        return;
#endif
                /* XXX Why are there these headers? */
                /* AH, ESP and IPcomp are treated like any unknown protocol:
                 * SPI 0 and only the protocol number is recorded.
                 */
                case IPPROTO_AH:
                case IPPROTO_ESP:
                case IPPROTO_COMP:
                default:
                        fl6->fl6_ipsec_spi = 0;
                        fl6->flowi6_proto = nexthdr;
                        return;
                }
        }
}
#endif

/*
 * Fill @fl from the headers of @skb using the decoder for @family, then
 * let the security layer fill in the flow's secid.
 *
 * Returns -EAFNOSUPPORT for a family without a decoder, otherwise the
 * return value of security_xfrm_decode_session().
 */
int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse)
{
        if (family == AF_INET) {
                decode_session4(skb, fl, reverse);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (family == AF_INET6) {
                decode_session6(skb, fl, reverse);
#endif
        } else {
                return -EAFNOSUPPORT;
        }

        return security_xfrm_decode_session(skb, &fl->flowi_secid);
}
EXPORT_SYMBOL(__xfrm_decode_session);

/*
 * Scan secpath entries starting at index @k for a state whose mode is
 * not XFRM_MODE_TRANSPORT.  On the first hit its index is stored in
 * *@idxp and 1 is returned; 0 is returned (and *@idxp left untouched)
 * when every remaining entry is transport mode.
 */
static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
        int idx = k;

        while (idx < sp->len) {
                if (sp->xvec[idx]->props.mode == XFRM_MODE_TRANSPORT) {
                        idx++;
                        continue;
                }

                *idxp = idx;
                return 1;
        }

        return 0;
}

/*
 * Check an skb against the xfrm policy database.
 *
 * @dir carries the policy direction in its XFRM_POLICY_MASK bits; any
 * remaining bits are handed to the flow decoder as the "reverse" flag.
 *
 * Returns 1 when the packet is accepted and 0 when it is rejected.  Every
 * rejection path bumps one of the LINUX_MIB_XFRMIN* counters.
 */
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct xfrm_policy *pol;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int npols = 0;
        int xfrm_nr;
        int pi;
        int reverse;
        struct flowi fl;
        int xerr_idx = -1;
        const struct xfrm_if_cb *ifcb;
        struct sec_path *sp;
        struct xfrm_if *xi;
        u32 if_id = 0;

        /* If an xfrm interface callback is registered and it maps this skb
         * to an interface, use that interface's if_id and netns for the
         * lookups below instead of the defaults.
         */
        rcu_read_lock();
        ifcb = xfrm_if_get_cb();

        if (ifcb) {
                xi = ifcb->decode_session(skb, family);
                if (xi) {
                        if_id = xi->p.if_id;
                        net = xi->net;
                }
        }
        rcu_read_unlock();

        /* Split @dir into the "reverse" flag bits and the plain direction. */
        reverse = dir & ~XFRM_POLICY_MASK;
        dir &= XFRM_POLICY_MASK;

        if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
                return 0;
        }

        nf_nat_decode_session(skb, &fl, family);

        /* First, check used SA against their selectors. */
        sp = skb_sec_path(skb);
        if (sp) {
                int i;

                for (i = sp->len - 1; i >= 0; i--) {
                        struct xfrm_state *x = sp->xvec[i];
                        if (!xfrm_selector_match(&x->sel, &fl, family)) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
                                return 0;
                        }
                }
        }

        /* A per-socket policy, when one exists for this direction, is
         * consulted before the global lookup.
         */
        pol = NULL;
        sk = sk_to_full_sk(sk);
        if (sk && sk->sk_policy[dir]) {
                pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
                if (IS_ERR(pol)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                        return 0;
                }
        }

        if (!pol)
                pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);

        if (IS_ERR(pol)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                return 0;
        }

        /* No policy matched: reject if the namespace default for this
         * direction is "block", or if the packet went through any
         * non-transport-mode state; otherwise accept.
         */
        if (!pol) {
                if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }

                if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
                        xfrm_secpath_reject(xerr_idx, skb, &fl);
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }
                return 1;
        }

        pol->curlft.use_time = ktime_get_real_seconds();

        /* From here on every exit path releases the collected policies via
         * xfrm_pol_put()/xfrm_pols_put().
         */
        pols[0] = pol;
        npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
        /* A non-main policy was found: also look up the main-type policy
         * so that both are checked together.
         */
        if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
                                                    &fl, family,
                                                    XFRM_POLICY_IN, if_id);
                if (pols[1]) {
                        if (IS_ERR(pols[1])) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                                xfrm_pol_put(pols[0]);
                                return 0;
                        }
                        pols[1]->curlft.use_time = ktime_get_real_seconds();
                        npols++;
                }
        }
#endif

        if (pol->action == XFRM_POLICY_ALLOW) {
                /* Zero-initialized stand-in used when the skb has no secpath. */
                static struct sec_path dummy;
                struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl **tpp = tp;
                int ti = 0;
                int i, k;

                sp = skb_sec_path(skb);
                if (!sp)
                        sp = &dummy;

                /* Gather the templates of all policies into tp[]. */
                for (pi = 0; pi < npols; pi++) {
                        if (pols[pi] != pol &&
                            pols[pi]->action != XFRM_POLICY_ALLOW) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
                                goto reject;
                        }
                        /* Too many templates: bail out without calling
                         * xfrm_secpath_reject().
                         */
                        if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
                                goto reject_error;
                        }
                        for (i = 0; i < pols[pi]->xfrm_nr; i++)
                                tpp[ti++] = &pols[pi]->xfrm_vec[i];
                }
                xfrm_nr = ti;

                /* With more than one policy the merged template list is
                 * sorted into stp[] and checked from there.
                 */
                if (npols > 1) {
                        xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
                        tpp = stp;
                }

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find corresponding xfrm.
                 * Order is _important_. Later we will implement
                 * some barriers, but at the moment barriers
                 * are implied between each two transformations.
                 * Upon success, marks secpath entries as having been
                 * verified to allow them to be skipped in future policy
                 * checks (e.g. nested tunnels).
                 */
                for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
                        if (k < 0) {
                                if (k < -1)
                                        /* "-2 - errored_index" returned */
                                        xerr_idx = -(2+k);
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                                goto reject;
                        }
                }

                /* Any secpath entry left over after the templates were
                 * matched must be transport mode, otherwise reject.
                 */
                if (secpath_has_nontransport(sp, k, &xerr_idx)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                        goto reject;
                }

                xfrm_pols_put(pols, npols);
                /* NOTE(review): when sp == &dummy this stores into the shared
                 * static object; k is presumably 0 in that case - confirm.
                 */
                sp->verified_cnt = k;

                return 1;
        }
        /* The matched policy's action is not ALLOW. */
        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
        xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
        xfrm_pols_put(pols, npols);
        return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

/*
 * Re-route a forwarded skb through xfrm_lookup().
 *
 * The flow is decoded from the packet, the skb's dst is forced to a
 * referenced one, and the result of xfrm_lookup() replaces it.  Returns 1
 * on success.  Returns 0 when decoding fails or the skb has no dst (both
 * counted as LINUX_MIB_XFRMFWDHDRERROR), or when the lookup returns an
 * error pointer, in which case the skb's dst is set to NULL.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct dst_entry *new_dst;
        struct flowi fl;
        int decoded;

        decoded = xfrm_decode_session(skb, &fl, family);
        if (decoded < 0)
                goto hdr_error;

        skb_dst_force(skb);
        if (!skb_dst(skb))
                goto hdr_error;

        new_dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(new_dst)) {
                skb_dst_set(skb, NULL);
                return 0;
        }

        skb_dst_set(skb, new_dst);
        return 1;

hdr_error:
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

/*
 * dst_ops->check hook for XFRM destinations.
 *
 * Returns @dst while it is still usable and NULL once it must be dropped.
 * @cookie is not consulted.
 */
static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        /* Bundle creation code (e.g. __xfrm4_bundle_create()) marks every
         * XFRM dst with DST_OBSOLETE_FORCE_CHK in dst->obsolete, so this
         * hook runs on each use.  That is deliberate: when an ordinary
         * route that an XFRM dst points at becomes obsolete, nobody walks
         * back to the XFRM dsts referencing it - that would be too much
         * work - so validity is re-established here instead.  Example:
         *
         *        XFRM dst A --> IPv4 dst X
         *
         * X is A's "xdst->route" (and, in this example, also A's
         * "dst->path").  A does not notice X being obsoleted on its own;
         * the stale_bundle() call below is what catches it.
         *
         * A dst removed from the fib tree gets DST_OBSOLETE_DEAD, which
         * makes stale_bundle() fail for any bundle that links to it.
         */
        if (dst->obsolete >= 0)
                return NULL;

        if (stale_bundle(dst))
                return NULL;

        return dst;
}

/* Non-zero when the bundle headed by @dst no longer passes xfrm_bundle_ok(). */
static int stale_bundle(struct dst_entry *dst)
{
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

        return !xfrm_bundle_ok(xdst);
}

/*
 * Walk the children of @dst and, for each consecutive child that is an
 * xfrm dst bound to @dev, rebind it to the namespace's loopback device,
 * moving the device reference accordingly.  The walk stops at the first
 * child that has no xfrm state or is bound to another device.
 */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
        struct dst_entry *child;

        for (child = xfrm_dst_child(dst); child; child = xfrm_dst_child(child)) {
                if (!child->xfrm || child->dev != dev)
                        break;

                child->dev = dev_net(dev)->loopback_dev;
                dev_hold(child->dev);
                dev_put(dev);
        }
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

/* link_failure hook for xfrm dsts; intentionally does nothing. */
static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible: such a dst must be popped before it reaches the
         * point of failure.
         */
}

/* negative_advice hook: reset the socket's cached dst once @dst is obsolete. */
static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst)
{
        if (!dst->obsolete)
                return;

        sk_dst_reset(sk);
}

/*
 * Initialise the cached MTU values and the RTAX_MTU metric of each of the
 * @nr entries in @bundle, working from the last entry back to the first.
 *
 * For every entry the child's MTU and the route's MTU are cached, and the
 * metric is set to the state-adjusted child MTU capped at the route MTU.
 */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
{
        int idx = nr;

        while (idx != 0) {
                struct xfrm_dst *cur;
                struct dst_entry *dst;
                u32 child_mtu, state_mtu, route_mtu;

                idx--;
                cur = bundle[idx];
                dst = &cur->u.dst;

                child_mtu = dst_mtu(xfrm_dst_child(dst));
                cur->child_mtu_cached = child_mtu;

                state_mtu = xfrm_state_mtu(dst->xfrm, child_mtu);

                route_mtu = dst_mtu(cur->route);
                cur->route_mtu_cached = route_mtu;

                dst_metric_set(dst, RTAX_MTU,
                               state_mtu > route_mtu ? route_mtu : state_mtu);
        }
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

/*
 * Validate the bundle starting at @first.  Returns 1 when it may still be
 * used and 0 when it is stale.
 *
 * Pass one walks the chain of xfrm dsts, failing on an invalid state, a
 * genid mismatch (state or policy) or a failed dst_check(), and notes the
 * deepest entry whose cached child/route MTU changed.  Pass two, run only
 * when something changed, recomputes RTAX_MTU from that entry back up to
 * the head of the bundle.
 */
static int xfrm_bundle_ok(struct xfrm_dst *first)
{
        struct xfrm_dst *chain[XFRM_MAX_DEPTH];
        struct dst_entry *dst = &first->u.dst;
        struct xfrm_dst *cur;
        int dirty = 0;
        int depth = 0;
        int idx;
        u32 mtu;

        if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie))
                return 0;
        if (dst->dev && !netif_running(dst->dev))
                return 0;

        /* Queueing bundles are accepted without walking the chain. */
        if (dst->flags & DST_XFRM_QUEUE)
                return 1;

        do {
                cur = (struct xfrm_dst *)dst;

                if (dst->xfrm->km.state != XFRM_STATE_VALID ||
                    cur->xfrm_genid != dst->xfrm->genid)
                        return 0;
                if (cur->num_pols > 0 &&
                    cur->policy_genid != atomic_read(&cur->pols[0]->genid))
                        return 0;

                chain[depth++] = cur;

                /* "dirty" records one past the deepest changed entry. */
                mtu = dst_mtu(xfrm_dst_child(dst));
                if (mtu != cur->child_mtu_cached) {
                        dirty = depth;
                        cur->child_mtu_cached = mtu;
                }

                if (!dst_check(cur->route, cur->route_cookie))
                        return 0;

                mtu = dst_mtu(cur->route);
                if (mtu != cur->route_mtu_cached) {
                        dirty = depth;
                        cur->route_mtu_cached = mtu;
                }

                dst = xfrm_dst_child(dst);
        } while (dst->xfrm);

        if (likely(!dirty))
                return 1;

        /* Propagate the new MTU from the deepest changed entry upwards. */
        idx = dirty - 1;
        mtu = chain[idx]->child_mtu_cached;
        for (;;) {
                cur = chain[idx];
                dst = &cur->u.dst;

                mtu = xfrm_state_mtu(dst->xfrm, mtu);
                if (mtu > cur->route_mtu_cached)
                        mtu = cur->route_mtu_cached;
                dst_metric_set(dst, RTAX_MTU, mtu);

                if (idx == 0)
                        break;

                /* The parent's child MTU is what was just computed. */
                idx--;
                chain[idx]->child_mtu_cached = mtu;
        }

        return 1;
}

/* default_advmss hook: report the advmss metric of the bundle's path dst. */
static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
        const struct dst_entry *path = xfrm_dst_path(dst);

        return dst_metric_advmss(path);
}

/*
 * mtu hook: use the dst's own raw RTAX_MTU metric when it is non-zero,
 * otherwise fall back to the MTU of the bundle's path dst.
 */
static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
        unsigned int own = dst_metric_raw(dst, RTAX_MTU);

        if (own)
                return own;

        return dst_mtu(xfrm_dst_path(dst));
}

/*
 * Walk the xfrm states stacked on @dst and work out the address to use
 * as next hop, starting from @daddr.
 *
 * Transport-mode states leave the address alone.  For any other state,
 * a type flagged XFRM_TYPE_REMOTE_COADDR supplies its care-of address; a
 * type without XFRM_TYPE_LOCAL_COADDR supplies the state's destination
 * address; a type with only XFRM_TYPE_LOCAL_COADDR leaves it unchanged.
 */
static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
                                        const void *daddr)
{
        const struct xfrm_state *x;

        for (x = dst->xfrm; x; x = dst->xfrm) {
                /* Step to the child first; x still names this level. */
                dst = xfrm_dst_child(dst);

                if (x->props.mode != XFRM_MODE_TRANSPORT) {
                        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR)
                                daddr = x->coaddr;
                        else if (!(x->type->flags & XFRM_TYPE_LOCAL_COADDR))
                                daddr = &x->id.daddr;
                }
        }

        return daddr;
}

/*
 * neigh_lookup hook: delegate to the path dst's own neigh_lookup.  When no
 * skb is supplied the address is first translated through the bundle with
 * xfrm_get_dst_nexthop(); with an skb, @daddr is passed through as is.
 */
static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *nexthop = skb ? daddr : xfrm_get_dst_nexthop(dst, daddr);

        return path->ops->neigh_lookup(path, skb, nexthop);
}

/*
 * confirm_neigh hook: translate @daddr through the bundle and hand it to
 * the path dst's own confirm_neigh.
 */
static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *nexthop = xfrm_get_dst_nexthop(dst, daddr);

        path->ops->confirm_neigh(path, nexthop);
}

/*
 * Register @afinfo as the policy back end for address family @family.
 *
 * Any dst_ops callback (and the kmem cache) that the caller left unset is
 * filled in with the generic xfrm implementation before the entry is
 * published.
 *
 * Returns 0 on success, -EAFNOSUPPORT when @family is outside the table
 * and -EEXIST when the slot is already taken.
 */
int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
{
        struct dst_ops *ops;
        int ret = -EEXIST;

        if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return -EAFNOSUPPORT;

        spin_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[family]))
                goto unlock;

        ops = afinfo->dst_ops;
        if (likely(!ops->kmem_cachep))
                ops->kmem_cachep = xfrm_dst_cache;
        if (likely(!ops->check))
                ops->check = xfrm_dst_check;
        if (likely(!ops->default_advmss))
                ops->default_advmss = xfrm_default_advmss;
        if (likely(!ops->mtu))
                ops->mtu = xfrm_mtu;
        if (likely(!ops->negative_advice))
                ops->negative_advice = xfrm_negative_advice;
        if (likely(!ops->link_failure))
                ops->link_failure = xfrm_link_failure;
        if (likely(!ops->neigh_lookup))
                ops->neigh_lookup = xfrm_neigh_lookup;
        if (likely(!ops->confirm_neigh))
                ops->confirm_neigh = xfrm_confirm_neigh;

        /* Publish only after the ops are fully populated. */
        rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
        ret = 0;

unlock:
        spin_unlock(&xfrm_policy_afinfo_lock);

        return ret;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);

/*
 * Remove @afinfo from the per-family table (if present), wait for an RCU
 * grace period, then clear the kmem cache pointer and the check,
 * negative_advice and link_failure callbacks in its dst_ops.
 */
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
        struct dst_ops *ops = afinfo->dst_ops;
        int slot = 0;

        while (slot < ARRAY_SIZE(xfrm_policy_afinfo)) {
                if (xfrm_policy_afinfo[slot] == afinfo) {
                        RCU_INIT_POINTER(xfrm_policy_afinfo[slot], NULL);
                        break;
                }
                slot++;
        }

        synchronize_rcu();

        ops->kmem_cachep = NULL;
        ops->check = NULL;
        ops->negative_advice = NULL;
        ops->link_failure = NULL;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

/*
 * Install @ifcb as the global xfrm interface callback table.  The pointer
 * is published with rcu_assign_pointer() while holding xfrm_if_cb_lock.
 */
void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
{
        spin_lock(&xfrm_if_cb_lock);
        rcu_assign_pointer(xfrm_if_cb, ifcb);
        spin_unlock(&xfrm_if_cb_lock);
}
EXPORT_SYMBOL(xfrm_if_register_cb);

/*
 * Clear the global xfrm interface callback table and wait for an RCU
 * grace period before returning.
 */
void xfrm_if_unregister_cb(void)
{
        RCU_INIT_POINTER(xfrm_if_cb, NULL);
        synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_if_unregister_cb);

#ifdef CONFIG_XFRM_STATISTICS
/*
 * Allocate the per-cpu xfrm MIB for @net and set up its proc entries.
 * Returns -ENOMEM when the allocation fails; otherwise the result of
 * xfrm_proc_init(), with the MIB freed again if that result is negative.
 */
static int __net_init xfrm_statistics_init(struct net *net)
{
        int err;

        net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
        if (!net->mib.xfrm_statistics)
                return -ENOMEM;

        err = xfrm_proc_init(net);
        if (err >= 0)
                return err;

        free_percpu(net->mib.xfrm_statistics);
        return err;
}

/* Undo xfrm_statistics_init(): remove the proc entries, then free the MIB. */
static void xfrm_statistics_fini(struct net *net)
{
        xfrm_proc_fini(net);
        free_percpu(net->mib.xfrm_statistics);
}
#else
/* Statistics are compiled out: nothing to set up. */
static int __net_init xfrm_statistics_init(struct net *net)
{
        return 0;
}

/* Statistics are compiled out: nothing to tear down. */
static void xfrm_statistics_fini(struct net *net)
{
}
#endif

/*
 * Set up the policy database of @net.
 *
 * For init_net only, the global dst cache and the inexact-policy
 * rhashtable are created first.  Then the by-index hash and one
 * by-destination hash per direction are allocated (8 buckets each), the
 * hash thresholds are set to full prefix lengths, and the lists and work
 * items are initialised.
 *
 * Returns 0 on success or -ENOMEM, in which case every table allocated so
 * far has been freed again.
 */
static int __net_init xfrm_policy_init(struct net *net)
{
        unsigned int mask, bytes;
        int dir, err;

        if (net_eq(net, &init_net)) {
                xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                                                   sizeof(struct xfrm_dst),
                                                   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                                   NULL);
                err = rhashtable_init(&xfrm_policy_inexact_table,
                                      &xfrm_pol_inexact_params);
                BUG_ON(err);
        }

        mask = 8 - 1;
        bytes = (mask + 1) * sizeof(struct hlist_head);

        net->xfrm.policy_byidx = xfrm_hash_alloc(bytes);
        if (!net->xfrm.policy_byidx)
                return -ENOMEM;
        net->xfrm.policy_idx_hmask = mask;

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct xfrm_policy_hash *tbl = &net->xfrm.policy_bydst[dir];

                net->xfrm.policy_count[dir] = 0;
                net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
                INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);

                tbl->table = xfrm_hash_alloc(bytes);
                if (!tbl->table)
                        goto free_tables;

                tbl->hmask = mask;
                tbl->dbits4 = 32;
                tbl->sbits4 = 32;
                tbl->dbits6 = 128;
                tbl->sbits6 = 128;
        }

        net->xfrm.policy_hthresh.lbits4 = 32;
        net->xfrm.policy_hthresh.rbits4 = 32;
        net->xfrm.policy_hthresh.lbits6 = 128;
        net->xfrm.policy_hthresh.rbits6 = 128;

        seqlock_init(&net->xfrm.policy_hthresh.lock);

        INIT_LIST_HEAD(&net->xfrm.policy_all);
        INIT_LIST_HEAD(&net->xfrm.inexact_bins);
        INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
        INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
        return 0;

free_tables:
        /* Directions below the failing one already own a table. */
        while (--dir >= 0)
                xfrm_hash_free(net->xfrm.policy_bydst[dir].table, bytes);
        xfrm_hash_free(net->xfrm.policy_byidx, bytes);
        return -ENOMEM;
}

/*
 * Tear down the policy database of @net: wait for pending hash work,
 * flush all policies, free the by-destination and by-index hash tables
 * (warning if anything is still linked in), and finally prune every
 * remaining inexact bin under the policy lock.
 */
static void xfrm_policy_fini(struct net *net)
{
        struct xfrm_pol_inexact_bin *bin, *next;
        unsigned int bytes;
        int dir;

        flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);

        WARN_ON(!list_empty(&net->xfrm.policy_all));

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct xfrm_policy_hash *tbl = &net->xfrm.policy_bydst[dir];

                WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));

                /* The table may have been resized; size it from its mask. */
                bytes = (tbl->hmask + 1) * sizeof(struct hlist_head);
                WARN_ON(!hlist_empty(tbl->table));
                xfrm_hash_free(tbl->table, bytes);
        }

        bytes = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
        WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
        xfrm_hash_free(net->xfrm.policy_byidx, bytes);

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        list_for_each_entry_safe(bin, next, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(bin, true);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

/*
 * Per-namespace xfrm initialisation.
 *
 * Sets up the locks and the default (accept) policy for every direction,
 * then brings up statistics, state, policy and sysctl support in that
 * order.  On failure the parts already initialised are torn down in
 * reverse order and the negative error code is returned; 0 on success.
 */
static int __net_init xfrm_net_init(struct net *net)
{
        int err;

        /* Initialize the per-net locks here */
        spin_lock_init(&net->xfrm.xfrm_state_lock);
        spin_lock_init(&net->xfrm.xfrm_policy_lock);
        seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
        mutex_init(&net->xfrm.xfrm_cfg_mutex);

        net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;

        err = xfrm_statistics_init(net);
        if (err < 0)
                return err;

        err = xfrm_state_init(net);
        if (err < 0)
                goto undo_statistics;

        err = xfrm_policy_init(net);
        if (err < 0)
                goto undo_state;

        err = xfrm_sysctl_init(net);
        if (err < 0)
                goto undo_policy;

        return 0;

undo_policy:
        xfrm_policy_fini(net);
undo_state:
        xfrm_state_fini(net);
undo_statistics:
        xfrm_statistics_fini(net);
        return err;
}

/*
 * Per-namespace xfrm teardown: sysctl, policy, state and statistics are
 * shut down in that order, the reverse of how xfrm_net_init() brings them
 * up.
 */
static void __net_exit xfrm_net_exit(struct net *net)
{
        xfrm_sysctl_fini(net);
        xfrm_policy_fini(net);
        xfrm_state_fini(net);
        xfrm_statistics_fini(net);
}

/* Wires xfrm_net_init()/xfrm_net_exit() up as the pernet .init/.exit hooks. */
static struct pernet_operations __net_initdata xfrm_net_ops = {
        .init = xfrm_net_init,
        .exit = xfrm_net_exit,
};

/*
 * One-time (__init) xfrm setup: register xfrm_net_ops as a pernet
 * subsystem, initialise the device and input parts (plus espintcp when
 * CONFIG_XFRM_ESPINTCP is set), and start with no xfrm interface callback
 * installed.
 *
 * NOTE(review): the return value of register_pernet_subsys() is not
 * checked here.
 */
void __init xfrm_init(void)
{
        register_pernet_subsys(&xfrm_net_ops);
        xfrm_dev_init();
        xfrm_input_init();

#ifdef CONFIG_XFRM_ESPINTCP
        espintcp_init();
#endif

        /* Clear the callback pointer and wait for an RCU grace period. */
        RCU_INIT_POINTER(xfrm_if_cb, NULL);
        synchronize_rcu();
}

#ifdef CONFIG_AUDITSYSCALL
/*
 * Append the fields shared by all policy audit records to @audit_buf: the
 * security context of @xp (when it has one) and the selector's source and
 * destination addresses.  A prefix length is logged only when it is not
 * the full address width (32 for IPv4, 128 for IPv6).  Selectors of any
 * other family contribute no address fields.
 */
static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
                                         struct audit_buffer *audit_buf)
{
        struct xfrm_selector *s = &xp->selector;
        struct xfrm_sec_ctx *sec = xp->security;

        if (sec)
                audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
                                 sec->ctx_alg, sec->ctx_doi, sec->ctx_str);

        if (s->family == AF_INET) {
                audit_log_format(audit_buf, " src=%pI4", &s->saddr.a4);
                if (s->prefixlen_s != 32)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         s->prefixlen_s);

                audit_log_format(audit_buf, " dst=%pI4", &s->daddr.a4);
                if (s->prefixlen_d != 32)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         s->prefixlen_d);
        } else if (s->family == AF_INET6) {
                audit_log_format(audit_buf, " src=%pI6", s->saddr.a6);
                if (s->prefixlen_s != 128)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         s->prefixlen_s);

                audit_log_format(audit_buf, " dst=%pI6", s->daddr.a6);
                if (s->prefixlen_d != 128)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         s->prefixlen_d);
        }
}

/*
 * Emit an "SPD-add" audit record for @xp carrying the user info, the
 * operation result and the common policy fields.  Does nothing when no
 * audit buffer can be started.
 */
void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-add");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);

/*
 * Emit an "SPD-delete" audit record for @xp carrying the user info, the
 * operation result and the common policy fields.  Does nothing when no
 * audit buffer can be started.
 */
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-delete");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
#endif

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Decide whether policy selector @sel_tgt matches the migrate request
 * selector @sel_cmp.
 *
 * When the request's protocol is IPSEC_ULPROTO_ANY only family, both
 * addresses and both prefix lengths are compared; otherwise the two
 * selectors must be byte-for-byte identical.
 */
static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
                                        const struct xfrm_selector *sel_tgt)
{
        if (sel_cmp->proto != IPSEC_ULPROTO_ANY)
                return memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0;

        return sel_tgt->family == sel_cmp->family &&
               xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
                               sel_cmp->family) &&
               xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
                               sel_cmp->family) &&
               sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
               sel_tgt->prefixlen_s == sel_cmp->prefixlen_s;
}

/*
 * Find the policy a migrate request refers to.
 *
 * The direct hash chain for the selector's addresses is searched first;
 * the inexact list for @dir is searched afterwards and its first match
 * replaces the direct hit, but only entries ahead of the first one whose
 * priority value is not below the direct hit's are considered.  An @if_id
 * of 0 matches any policy.
 *
 * The search runs under the policy lock, and xfrm_pol_hold() is called on
 * the result (possibly NULL) before the lock is dropped.
 */
static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
                                                    u8 dir, u8 type, struct net *net, u32 if_id)
{
        struct xfrm_policy *cand, *found = NULL;
        struct hlist_head *bucket;
        u32 best_prio = ~0U;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        bucket = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
        hlist_for_each_entry(cand, bucket, bydst) {
                if (if_id != 0 && cand->if_id != if_id)
                        continue;
                if (!xfrm_migrate_selector_match(sel, &cand->selector))
                        continue;
                if (cand->type != type)
                        continue;

                found = cand;
                best_prio = found->priority;
                break;
        }

        bucket = &net->xfrm.policy_inexact[dir];
        hlist_for_each_entry(cand, bucket, bydst_inexact_list) {
                /* Stop once the direct hit cannot be beaten any more. */
                if (found && cand->priority >= best_prio)
                        break;

                if (if_id != 0 && cand->if_id != if_id)
                        continue;
                if (!xfrm_migrate_selector_match(sel, &cand->selector))
                        continue;
                if (cand->type != type)
                        continue;

                found = cand;
                break;
        }

        xfrm_pol_hold(found);

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        return found;
}

/*
 * Return 1 when template @t is the one migrate entry @m refers to, else 0.
 *
 * Mode and protocol must agree, and the reqid must agree unless the
 * request's reqid is 0.  Tunnel and BEET templates must additionally
 * carry the request's old endpoint addresses.
 */
static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
{
        if (t->mode != m->mode || t->id.proto != m->proto)
                return 0;
        if (m->reqid != 0 && t->reqid != m->reqid)
                return 0;

        switch (t->mode) {
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
                return xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
                                       m->old_family) &&
                       xfrm_addr_equal(&t->saddr, &m->old_saddr,
                                       m->old_family);
        case XFRM_MODE_TRANSPORT:
                /* A transport-mode template stores no IP addresses, so
                 * mode and protocol are all there is to compare.
                 */
                return 1;
        default:
                return 0;
        }
}

/* update endpoint address(es) of template(s) */
static int xfrm_policy_migrate(struct xfrm_policy *pol,
                               struct xfrm_migrate *m, int num_migrate)
{
        struct xfrm_migrate *mp;
        int i, j, n = 0;

        write_lock_bh(&pol->lock);
        if (unlikely(pol->walk.dead)) {
                /* target policy has been deleted */
                write_unlock_bh(&pol->lock);
                return -ENOENT;
        }

        for (i = 0; i < pol->xfrm_nr; i++) {
                for (j = 0, mp = m; j < num_migrate; j++, mp++) {
                        if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
                                continue;
                        n++;
                        if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
                            pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
                                continue;
                        /* update endpoints */
                        memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
                               sizeof(pol->xfrm_vec[i].id.daddr));
                        memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
                               sizeof(pol->xfrm_vec[i].saddr));
                        pol->xfrm_vec[i].encap_family = mp->new_family;
                        /* flush bundles */
                        atomic_inc(&pol->genid);
                }
        }

        write_unlock_bh(&pol->lock);

        if (!n)
                return -ENODATA;

        return 0;
}

/*
 * Validate a migrate request of @num_migrate entries.
 *
 * Rejects (-EINVAL) a count outside 1..XFRM_MAX_DEPTH, any entry whose new
 * source or destination address is the "any" address for its new family,
 * and any two entries that share the same old addresses, proto, mode,
 * reqid and old family.  Returns 0 when the request is acceptable.
 */
static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
{
        int a, b;

        if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
                return -EINVAL;

        for (a = 0; a < num_migrate; a++) {
                const struct xfrm_migrate *cur = &m[a];

                if (xfrm_addr_any(&cur->new_daddr, cur->new_family) ||
                    xfrm_addr_any(&cur->new_saddr, cur->new_family))
                        return -EINVAL;

                /* check if there is any duplicated entry */
                for (b = a + 1; b < num_migrate; b++) {
                        const struct xfrm_migrate *other = &m[b];

                        if (cur->proto != other->proto ||
                            cur->mode != other->mode ||
                            cur->reqid != other->reqid ||
                            cur->old_family != other->old_family)
                                continue;
                        if (memcmp(&cur->old_daddr, &other->old_daddr,
                                   sizeof(cur->old_daddr)))
                                continue;
                        if (memcmp(&cur->old_saddr, &other->old_saddr,
                                   sizeof(cur->old_saddr)))
                                continue;

                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * xfrm_migrate - move a policy and its matching states to new endpoints.
 *
 * @sel, @dir, @type, @net, @if_id: used to look up the policy to migrate.
 * @m, @num_migrate: array of migrate entries and its length; validated by
 *                   xfrm_migrate_check() before anything else is touched.
 * @k, @encap: passed through to km_migrate(); @encap is also handed to
 *             xfrm_state_migrate() for each state that is migrated.
 *
 * Returns 0 on success.  On failure returns the error from the sanity
 * check, -EINVAL for an out-of-range @dir, -ENOENT when no policy is
 * found, -ENODATA when a state could not be migrated, or the error from
 * xfrm_policy_migrate().
 *
 * NOTE(review): the put calls below imply that the policy/state lookup
 * helpers return with a reference held -- confirm against their
 * definitions.
 */
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_migrate,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id)
{
        int i, err, nx_cur = 0, nx_new = 0;
        struct xfrm_policy *pol = NULL;
        struct xfrm_state *x, *xc;
        /* both arrays are bounded by the num_migrate <= XFRM_MAX_DEPTH check */
        struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
        struct xfrm_state *x_new[XFRM_MAX_DEPTH];
        struct xfrm_migrate *mp;

        /* Stage 0 - sanity checks */
        if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
                goto out;

        if (dir >= XFRM_POLICY_MAX) {
                err = -EINVAL;
                goto out;
        }

        /* Stage 1 - find policy */
        if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) {
                err = -ENOENT;
                goto out;
        }

        /* Stage 2 - find and update state(s) */
        for (i = 0, mp = m; i < num_migrate; i++, mp++) {
                /* an entry with no matching state is silently skipped */
                if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
                        /* record the old state first so it is released on any exit */
                        x_cur[nx_cur] = x;
                        nx_cur++;
                        xc = xfrm_state_migrate(x, mp, encap);
                        if (xc) {
                                x_new[nx_new] = xc;
                                nx_new++;
                        } else {
                                err = -ENODATA;
                                goto restore_state;
                        }
                }
        }

        /* Stage 3 - update policy */
        if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
                goto restore_state;

        /* Stage 4 - delete old state(s) */
        if (nx_cur) {
                xfrm_states_put(x_cur, nx_cur);
                xfrm_states_delete(x_cur, nx_cur);
        }

        /* Stage 5 - announce */
        km_migrate(sel, dir, type, m, num_migrate, k, encap);

        xfrm_pol_put(pol);

        return 0;
out:
        /* reached only before a policy was found, so nothing to release */
        return err;

restore_state:
        /* roll back: release the policy and old states, delete the new states */
        if (pol)
                xfrm_pol_put(pol);
        if (nx_cur)
                xfrm_states_put(x_cur, nx_cur);
        if (nx_new)
                xfrm_states_delete(x_new, nx_new);

        return err;
}
EXPORT_SYMBOL(xfrm_migrate);
#endif










































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ADDRCONF_H
#define _ADDRCONF_H

/*
 * Address autoconfiguration tunables.  Values written as a multiple of HZ
 * are tick counts (N * HZ is N seconds); the remaining lifetimes appear to
 * be plain seconds -- TODO confirm against the users of each constant.
 */
#define MAX_RTR_SOLICITATIONS                -1                /* unlimited */
#define RTR_SOLICITATION_INTERVAL        (4*HZ)
#define RTR_SOLICITATION_MAX_INTERVAL        (3600*HZ)        /* 1 hour */

#define MIN_VALID_LIFETIME                (2*3600)        /* 2 hours */

/* 7 * 86400 and 86400: seven days and one day if the unit is seconds */
#define TEMP_VALID_LIFETIME                (7*86400)
#define TEMP_PREFERRED_LIFETIME                (86400)
#define REGEN_MAX_RETRY                        (3)
#define MAX_DESYNC_FACTOR                (600)

#define ADDR_CHECK_FREQUENCY                (120*HZ)

#define IPV6_MAX_ADDRESSES                16

/* HZ / 50 ticks, but never less than one tick when HZ <= 50 */
#define ADDRCONF_TIMER_FUZZ_MINUS        (HZ > 50 ? HZ / 50 : 1)
#define ADDRCONF_TIMER_FUZZ                (HZ / 4)
#define ADDRCONF_TIMER_FUZZ_MAX                (HZ)

#define ADDRCONF_NOTIFY_PRIORITY        0

#include <linux/in.h>
#include <linux/in6.h>

/*
 * IPv6 Prefix Information Option (PIO) layout, RFC 4861 section 4.6.2.
 * The multi-byte fields are big-endian (__be32).  The flag bits can be
 * read either as the whole byte 'flags' or through the endian-dependent
 * bitfield that overlays it.
 */
struct prefix_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;

        union __packed {
                __u8                flags;
                struct __packed {
                        /* onlink is the most significant bit of the byte in both layouts */
#if defined(__BIG_ENDIAN_BITFIELD)
                        __u8        onlink : 1,
                                 autoconf : 1,
                                reserved : 6;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
                        __u8        reserved : 6,
                                autoconf : 1,
                                onlink : 1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
                };
        };
        /* presumably the valid and preferred lifetimes of the prefix -- confirm units with RFC 4861 */
        __be32                        valid;
        __be32                        prefered;
        __be32                        reserved2;

        struct in6_addr                prefix;
};

/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
static_assert(sizeof(struct prefix_info) == 32);

#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/if_inet6.h>
#include <net/ipv6.h>

/*
 * Bundle of an IPv6 address, the inet6_dev it relates to and a netlink
 * extended-ack pointer.
 * NOTE(review): presumably the payload handed to the inet6addr validator
 * notifier chain declared in this header -- confirm against the callers.
 */
struct in6_validator_info {
        struct in6_addr                i6vi_addr;
        struct inet6_dev        *i6vi_dev;
        struct netlink_ext_ack        *extack;
};

/*
 * Parameter block describing one IPv6 interface address.
 * NOTE(review): field meanings below are inferred from their names only;
 * confirm against the code that fills this structure in.
 */
struct ifa6_config {
        const struct in6_addr        *pfx;                /* address / prefix (not owned: const pointer) */
        unsigned int                plen;                /* presumably the prefix length in bits */

        const struct in6_addr        *peer_pfx;        /* presumably an optional peer address */

        u32                        rt_priority;
        u32                        ifa_flags;
        u32                        preferred_lft;
        u32                        valid_lft;
        u16                        scope;
};

int addrconf_init(void);
void addrconf_cleanup(void);

int addrconf_add_ifaddr(struct net *net, void __user *arg);
int addrconf_del_ifaddr(struct net *net, void __user *arg);
int addrconf_set_dstaddr(struct net *net, void __user *arg);

int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict);
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags);

#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
#endif

int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs);

bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
                                   const unsigned int prefix_len,
                                   struct net_device *dev);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev);

struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev);

struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev, int strict);

int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
                       const struct in6_addr *daddr, unsigned int srcprefs,
                       struct in6_addr *saddr);
int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
                      u32 banned_flags);
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
                          bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);

void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags);

int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft);

/*
 * Expand the 6-byte address @addr into the 8-byte identifier @eui:
 * bytes 0-2 of @addr, then 0xFF 0xFE, then bytes 3-5 of @addr.
 */
static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr)
{
        /* leading three bytes are copied unchanged */
        memcpy(&eui[0], &addr[0], 3);

        /* fixed filler inserted in the middle */
        eui[3] = 0xFF;
        eui[4] = 0xFE;

        /* trailing three bytes land after the filler */
        memcpy(&eui[5], &addr[3], 3);
}

/*
 * Build the 8-byte identifier for @addr via addrconf_addr_eui48_base() and
 * then flip bit 1 (value 2) of its first byte.
 */
static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
{
        addrconf_addr_eui48_base(eui, addr);
        eui[0] ^= 2;
}

/*
 * Derive an 8-byte interface identifier for @dev into @eui.
 *
 * Returns -1 when the device address is not ETH_ALEN bytes long, 0 otherwise.
 */
static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len != ETH_ALEN)
                return -1;

        addrconf_addr_eui48_base(eui, dev->dev_addr);

        if (!dev->dev_id) {
                /* ordinary case: invert the "u" bit of the identifier */
                eui[0] ^= 2;
                return 0;
        }

        /*
         * The zSeries OSA network cards can be shared among various
         * OS instances, yet each card has only one MAC address, which
         * causes duplicate address conflicts with IPv6 when more than
         * one instance uses the same card.
         *
         * The driver for these cards can supply a unique 16-bit
         * identifier per instance.  It replaces the 0xFFFE filler in
         * the interface identifier, and the "u" bit is left alone, so
         * the resulting identifier has local scope according to
         * RFC2373.
         */
        eui[3] = (dev->dev_id >> 8) & 0xFF;
        eui[4] = dev->dev_id & 0xFF;

        return 0;
}

/*
 * Convert a 32-bit timeout into an unsigned long that can later be
 * multiplied by @unit without overflowing a long.
 *
 * 0xffffffff maps to ~0UL; a value larger than LONG_MAX / @unit is
 * clamped to LONG_MAX / @unit; anything else is returned unchanged.
 * @unit must be non-zero.
 */
static inline unsigned long addrconf_timeout_fixup(u32 timeout,
                                                   unsigned int unit)
{
        unsigned long limit;

        if (timeout == 0xffffffff)
                return ~0UL;

        /*
         * Avoid arithmetic overflow.
         * With a constant non-zero unit the clamp below can be compiled
         * away on 64bit archs, where the limit exceeds any u32 value.
         */
        limit = LONG_MAX / unit;
        if (0xfffffffe > limit && timeout > limit)
                return limit;

        return timeout;
}

/*
 * Report whether @timeout is finite.
 *
 * The only "infinite" value is ~0UL, as produced by
 * addrconf_timeout_fixup().  Returns non-zero for any other value.
 *
 * The comparison is done explicitly instead of returning ~timeout:
 * that value is an unsigned long and would be truncated to int, so on
 * 64-bit a timeout whose low 32 bits are all ones (e.g. 0xffffffff)
 * would wrongly be reported as infinite.
 */
static inline int addrconf_finite_timeout(unsigned long timeout)
{
        return timeout != ~0UL;
}

/*
 *        IPv6 Address Label subsystem (addrlabel.c)
 */
int ipv6_addr_label_init(void);
void ipv6_addr_label_cleanup(void);
int ipv6_addr_label_rtnl_register(void);
u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
                    int type, int ifindex);

/*
 *        multicast prototypes (mcast.c)
 */
/*
 * Check that @len bytes may be pulled from @skb: @len must not exceed the
 * transport offset plus the IPv6 transport length, and pskb_may_pull()
 * must succeed for @len.
 */
static inline bool ipv6_mc_may_pull(struct sk_buff *skb,
                                    unsigned int len)
{
        /* pskb_may_pull() is only attempted when the length bound holds */
        return skb_transport_offset(skb) + ipv6_transport_len(skb) >= len &&
               pskb_may_pull(skb, len);
}

int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_mc_close(struct sock *sk);
void ipv6_sock_mc_close(struct sock *sk);
bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
                    const struct in6_addr *src_addr);

int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr);
int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr);
void ipv6_mc_up(struct inet6_dev *idev);
void ipv6_mc_down(struct inet6_dev *idev);
void ipv6_mc_unmap(struct inet6_dev *idev);
void ipv6_mc_remap(struct inet6_dev *idev);
void ipv6_mc_init_dev(struct inet6_dev *idev);
void ipv6_mc_destroy_dev(struct inet6_dev *idev);
int ipv6_mc_check_mld(struct sk_buff *skb);
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);

bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
                         const struct in6_addr *src_addr);

void ipv6_mc_dad_complete(struct inet6_dev *idev);

/*
 * Identify MLD packets for MLD filter exceptions.
 *
 * Returns true when @nexthdr is ICMPv6, an ICMPv6 header can be pulled at
 * @offset from the network header, and its type is one of the MGM
 * query/report/reduction or MLD2 report types.
 */
static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
{
        struct icmp6hdr *hdr;

        if (nexthdr != IPPROTO_ICMPV6)
                return false;

        if (!pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr)))
                return false;

        /* look the header up only after the pull, which validated the range */
        hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (hdr->icmp6_type) {
        case ICMPV6_MGM_QUERY:
        case ICMPV6_MGM_REPORT:
        case ICMPV6_MGM_REDUCTION:
        case ICMPV6_MLD2_REPORT:
                return true;
        default:
                return false;
        }
}

void addrconf_prefix_rcv(struct net_device *dev,
                         u8 *opt, int len, bool sllao);

/*
 *        anycast prototypes (anycast.c)
 */
int ipv6_sock_ac_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_ac_close(struct sock *sk);
void ipv6_sock_ac_close(struct sock *sk);

int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
void ipv6_ac_destroy_dev(struct inet6_dev *idev);
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
                         const struct in6_addr *addr);
bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
                             const struct in6_addr *addr);
int ipv6_anycast_init(void);
void ipv6_anycast_cleanup(void);

/* Device notifier */
int register_inet6addr_notifier(struct notifier_block *nb);
int unregister_inet6addr_notifier(struct notifier_block *nb);
int inet6addr_notifier_call_chain(unsigned long val, void *v);

int register_inet6addr_validator_notifier(struct notifier_block *nb);
int unregister_inet6addr_validator_notifier(struct notifier_block *nb);
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v);

void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf);

/**
 * __in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device; must not be NULL, it is dereferenced unconditionally
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * Return: the value of @dev->ip6_ptr read via rcu_dereference_rtnl().
 */
static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
{
        return rcu_dereference_rtnl(dev->ip6_ptr);
}

/**
 * __in6_dev_stats_get - get inet6_dev pointer for stats
 * @dev: network device
 * @skb: skb for original incoming interface if needed
 *
 * When @dev is an L3 master device, the device is looked up again by
 * the incoming interface index recorded in @skb.
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * Return: the inet6_dev of the selected device, or NULL when the
 * index lookup finds no device.
 */
static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        if (netif_is_l3_master(dev))
                dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb));

        /* the by-index lookup may come back empty; do not dereference NULL */
        if (!dev)
                return NULL;

        return __in6_dev_get(dev);
}

/**
 * __in6_dev_get_safely - get inet6_dev pointer from netdevice
 * @dev: network device, may be NULL
 *
 * NULL-tolerant variant of __in6_dev_get(): yields NULL for a NULL @dev
 * instead of dereferencing it.
 */
static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev)
{
        return likely(dev) ? rcu_dereference_rtnl(dev->ip6_ptr) : NULL;
}

/**
 * in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * This version can be used in any context, and takes a reference
 * on the inet6_dev. Callers must use in6_dev_put() later to
 * release this reference.
 *
 * Return: the inet6_dev with its refcount incremented, or NULL when
 * @dev->ip6_ptr is NULL.
 */
static inline struct inet6_dev *in6_dev_get(const struct net_device *dev)
{
        struct inet6_dev *idev;

        /* read the pointer and bump the refcount inside one RCU read section */
        rcu_read_lock();
        idev = rcu_dereference(dev->ip6_ptr);
        if (idev)
                refcount_inc(&idev->refcnt);
        rcu_read_unlock();
        return idev;
}

/*
 * Return the nd_parms of the inet6_dev attached to @dev, or NULL when
 * the device has no inet6_dev.  No reference is taken; the lookup goes
 * through __in6_dev_get().
 */
static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev)
{
        struct inet6_dev *idev = __in6_dev_get(dev);

        if (!idev)
                return NULL;

        return idev->nd_parms;
}

void in6_dev_finish_destroy(struct inet6_dev *idev);

/*
 * Drop one reference on @idev; when that was the last reference the
 * device is handed to in6_dev_finish_destroy().
 */
static inline void in6_dev_put(struct inet6_dev *idev)
{
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in6_dev_finish_destroy(idev);
}

/*
 * If *@pidev is non-NULL, release it with in6_dev_put() and reset the
 * caller's pointer to NULL; otherwise do nothing.
 */
static inline void in6_dev_put_clear(struct inet6_dev **pidev)
{
        struct inet6_dev *held = *pidev;

        if (!held)
                return;

        in6_dev_put(held);
        *pidev = NULL;
}

/*
 * Drop one reference on @idev without testing for the last reference:
 * unlike in6_dev_put(), this never calls in6_dev_finish_destroy().
 */
static inline void __in6_dev_put(struct inet6_dev *idev)
{
        refcount_dec(&idev->refcnt);
}

/* Take one additional reference on @idev. */
static inline void in6_dev_hold(struct inet6_dev *idev)
{
        refcount_inc(&idev->refcnt);
}

/*
 * Called with rcu_read_lock held.
 *
 * Returns true when @dev has no inet6_dev, otherwise whether the
 * device's ignore_routes_with_linkdown setting is non-zero.
 */
static inline bool ip6_ignore_linkdown(const struct net_device *dev)
{
        const struct inet6_dev *idev = __in6_dev_get(dev);

        if (unlikely(!idev))
                return true;

        return idev->cnf.ignore_routes_with_linkdown != 0;
}

void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);

/*
 * Drop one reference on @ifp; when that was the last reference the
 * address is handed to inet6_ifa_finish_destroy().
 */
static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
{
        if (!refcount_dec_and_test(&ifp->refcnt))
                return;

        inet6_ifa_finish_destroy(ifp);
}

/*
 * Drop one reference on @ifp without testing for the last reference:
 * unlike in6_ifa_put(), this never calls inet6_ifa_finish_destroy().
 */
static inline void __in6_ifa_put(struct inet6_ifaddr *ifp)
{
        refcount_dec(&ifp->refcnt);
}

/* Take one additional reference on @ifp. */
static inline void in6_ifa_hold(struct inet6_ifaddr *ifp)
{
        refcount_inc(&ifp->refcnt);
}

/*
 * Try to take a reference on @ifp via refcount_inc_not_zero() and return
 * its result (per that helper's name: the reference is only taken while
 * the count is still non-zero -- see the refcount API for details).
 */
static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp)
{
        return refcount_inc_not_zero(&ifp->refcnt);
}

/*
 *        compute link-local solicited-node multicast address
 *
 *        @solicited is set to the four 32-bit words ff020000 : 0 : 1 :
 *        (ff000000 | last word of @addr), all in network byte order.
 */

static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
                                             struct in6_addr *solicited)
{
        ipv6_addr_set(solicited,
                      htonl(0xFF020000), 0,
                      htonl(0x1),
                      /* the OR forces the top byte of the last word to 0xFF */
                      htonl(0xFF000000) | addr->s6_addr32[3]);
}

/*
 * True when @addr is exactly ff02::1, i.e. 32-bit words
 * ff020000 : 0 : 0 : 1 in network byte order.
 */
static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* compare the address as two 64-bit halves */
        __be64 *p = (__force __be64 *)addr;

        return p[0] == cpu_to_be64(0xff02000000000000UL) &&
               p[1] == cpu_to_be64(1);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x00000001);
#endif
}

/*
 * True when @addr is exactly ff02::2, i.e. 32-bit words
 * ff020000 : 0 : 0 : 2 in network byte order.
 */
static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* compare the address as two 64-bit halves */
        __be64 *p = (__force __be64 *)addr;

        return p[0] == cpu_to_be64(0xff02000000000000UL) &&
               p[1] == cpu_to_be64(2);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x00000002);
#endif
}

/*
 * Test the third 32-bit word of @addr against the ISATAP pattern: the word
 * matches when it equals 0x00005EFE or 0x02005EFE in network byte order
 * (the 0x02000000 bit is forced on before comparing, so it is ignored).
 */
static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr)
{
        return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE);
}

/*
 * True when @addr has the form ff02:0:0:0:0:1:ffXX:XXXX -- the first 13
 * bytes are fixed and the last three bytes are not examined.
 */
static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *p = (__force __be64 *)addr;

        /* the mask drops the three low-order bytes of the second half */
        return p[0] == cpu_to_be64(0xff02000000000000UL) &&
               (p[1] & cpu_to_be64(0xffffffffff000000UL)) ==
                        cpu_to_be64(0x00000001ff000000UL);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == htonl(0x00000001) &&
               addr->s6_addr[12] == 0xff;
#endif
}

/*
 * True when @addr is exactly ff02::6a, i.e. 32-bit words
 * ff020000 : 0 : 0 : 6a in network byte order.
 */
static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* compare the address as two 64-bit halves */
        __be64 *p = (__force __be64 *)addr;

        return p[0] == cpu_to_be64(0xff02000000000000UL) &&
               p[1] == cpu_to_be64(0x6a);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x0000006a);
#endif
}

#ifdef CONFIG_PROC_FS
int if6_proc_init(void);
void if6_proc_exit(void);
#endif

#endif






    1 



    1 


    1 


    1 

    1 




    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <linux/dcache.h>
#include "internal.h"

/*
 * Parse the name in @qstr as an unsigned decimal number.
 *
 * Returns the parsed value, or ~0U when the name has a leading '0' and
 * more than one character, contains a non-digit, or would grow past the
 * (~0U - 9) / 10 guard used to keep the accumulation from overflowing.
 */
unsigned name_to_int(const struct qstr *qstr)
{
        const char *cursor = qstr->name;
        int remaining = qstr->len;
        unsigned value = 0;

        /* multi-character names must not start with a zero */
        if (remaining > 1 && cursor[0] == '0')
                return ~0U;

        do {
                /* anything below '0' wraps to a large unsigned value */
                unsigned digit = *cursor++ - '0';

                if (digit > 9 || value >= (~0U-9)/10)
                        return ~0U;

                value = value * 10 + digit;
        } while (--remaining > 0);

        return value;
}


























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_INSN_H
#define _ASM_X86_INSN_H
/*
 * x86 instruction analysis
 *
 * Copyright (C) IBM Corporation, 2009
 */

/* insn_attr_t is defined in inat.h */
#include <asm/inat.h> /* __ignore_sync_check__ */

/*
 * One decoded field of an instruction.  The content can be read either as
 * a single value or as up to four individual bytes (the two views share
 * storage).
 */
struct insn_field {
        union {
                insn_value_t value;
                insn_byte_t bytes[4];
        };
        /* !0 if we've run insn_get_xxx() for this field */
        unsigned char got;
        /* number of bytes this field occupies; summed by the insn_offset_*() helpers */
        unsigned char nbytes;
};

/*
 * Decoder state for one x86 instruction: one insn_field per instruction
 * component, followed by decode attributes and the buffer pointers.
 */
struct insn {
        struct insn_field prefixes;        /*
                                         * Prefixes
                                         * prefixes.bytes[3]: last prefix
                                         */
        struct insn_field rex_prefix;        /* REX prefix */
        struct insn_field vex_prefix;        /* VEX prefix */
        struct insn_field opcode;        /*
                                         * opcode.bytes[0]: opcode1
                                         * opcode.bytes[1]: opcode2
                                         * opcode.bytes[2]: opcode3
                                         */
        struct insn_field modrm;
        struct insn_field sib;
        struct insn_field displacement;
        /* the members of each union below alias the same storage */
        union {
                struct insn_field immediate;
                struct insn_field moffset1;        /* for 64bit MOV */
                struct insn_field immediate1;        /* for 64bit imm or off16/32 */
        };
        union {
                struct insn_field moffset2;        /* for 64bit MOV */
                struct insn_field immediate2;        /* for 64bit imm or seg16 */
        };

        /* non-zero means an emulate prefix is present, see insn_has_emulate_prefix() */
        int        emulate_prefix_size;
        insn_attr_t attr;
        /* presumably operand and address size in bytes -- confirm in the decoder */
        unsigned char opnd_bytes;
        unsigned char addr_bytes;
        unsigned char length;
        /* presumably the x86_64 flag given to insn_init() -- confirm in the decoder */
        unsigned char x86_64;

        const insn_byte_t *kaddr;        /* kernel address of insn to analyze */
        const insn_byte_t *end_kaddr;        /* kernel address of last insn in buffer */
        const insn_byte_t *next_byte;
};

#define MAX_INSN_SIZE        15

/* ModRM byte fields: mod = bits 7:6, reg = bits 5:3, rm = bits 2:0 */
#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
#define X86_MODRM_RM(modrm) ((modrm) & 0x07)

/* SIB byte fields: scale = bits 7:6, index = bits 5:3, base = bits 2:0 */
#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
#define X86_SIB_BASE(sib) ((sib) & 0x07)

/*
 * REX prefix flag bits.  These yield the masked bit in place (8, 4, 2 or
 * 1), not a normalized 0/1, so use them only as truth values.
 */
#define X86_REX_W(rex) ((rex) & 8)
#define X86_REX_R(rex) ((rex) & 4)
#define X86_REX_X(rex) ((rex) & 2)
#define X86_REX_B(rex) ((rex) & 1)

/* VEX bit flags  */
#define X86_VEX_W(vex)        ((vex) & 0x80)        /* VEX3 Byte2 */
#define X86_VEX_R(vex)        ((vex) & 0x80)        /* VEX2/3 Byte1 */
#define X86_VEX_X(vex)        ((vex) & 0x40)        /* VEX3 Byte1 */
#define X86_VEX_B(vex)        ((vex) & 0x20)        /* VEX3 Byte1 */
#define X86_VEX_L(vex)        ((vex) & 0x04)        /* VEX3 Byte2, VEX2 Byte1 */
/* VEX bit fields */
#define X86_EVEX_M(vex)        ((vex) & 0x03)                /* EVEX Byte1 */
#define X86_VEX3_M(vex)        ((vex) & 0x1f)                /* VEX3 Byte1 */
#define X86_VEX2_M        1                        /* VEX2.M always 1 */
#define X86_VEX_V(vex)        (((vex) & 0x78) >> 3)        /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_P(vex)        ((vex) & 0x03)                /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_M_MAX        0x1f                        /* VEX3.M Maximum value */

extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
extern int insn_get_prefixes(struct insn *insn);
extern int insn_get_opcode(struct insn *insn);
extern int insn_get_modrm(struct insn *insn);
extern int insn_get_sib(struct insn *insn);
extern int insn_get_displacement(struct insn *insn);
extern int insn_get_immediate(struct insn *insn);
extern int insn_get_length(struct insn *insn);

/*
 * Decoding mode selector passed to insn_decode().
 * INSN_NUM_MODES is the count of real modes, not a mode itself.
 */
enum insn_mode {
        INSN_MODE_32,
        INSN_MODE_64,
        /* Mode is determined by the current kernel build. */
        INSN_MODE_KERN,
        INSN_NUM_MODES,
};

extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m);

/* Decode at @_ptr with a MAX_INSN_SIZE byte buffer limit, in INSN_MODE_KERN. */
#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN)

/*
 * Attribute will be determined after getting ModRM (for opcode groups).
 * This simply runs insn_get_modrm(); its return value is discarded, so
 * a failure there is not reported to the caller.
 */
static inline void insn_get_attribute(struct insn *insn)
{
        insn_get_modrm(insn);
}

/* Instruction uses RIP-relative addressing */
extern int insn_rip_relative(struct insn *insn);

/*
 * Init insn for kernel text: calls insn_init() with the x86_64 flag set
 * to 1 on CONFIG_X86_64 builds and to 0 otherwise.
 */
static inline void kernel_insn_init(struct insn *insn,
                                    const void *kaddr, int buf_len)
{
#ifdef CONFIG_X86_64
        const int x86_64 = 1;
#else /* CONFIG_X86_32 */
        const int x86_64 = 0;
#endif

        insn_init(insn, kaddr, buf_len, x86_64);
}

/*
 * Return 1 when the instruction has a non-zero VEX prefix value, else 0.
 * The prefixes are decoded first if that has not happened yet.
 */
static inline int insn_is_avx(struct insn *insn)
{
        if (!insn->prefixes.got)
                insn_get_prefixes(insn);

        return insn->vex_prefix.value ? 1 : 0;
}

/*
 * Return 1 when the VEX prefix field is four bytes long (EVEX), else 0.
 * The prefixes are decoded first if that has not happened yet.
 */
static inline int insn_is_evex(struct insn *insn)
{
        if (!insn->prefixes.got)
                insn_get_prefixes(insn);

        return insn->vex_prefix.nbytes == 4 ? 1 : 0;
}

/* Return 1 when emulate_prefix_size is non-zero, else 0. */
static inline int insn_has_emulate_prefix(struct insn *insn)
{
        return insn->emulate_prefix_size != 0;
}

/*
 * Report whether this instruction is decoded completely: returns 1 only
 * when the opcode, ModRM, SIB, displacement and immediate fields have all
 * been fetched, and 0 as soon as one of them has not.
 */
static inline int insn_complete(struct insn *insn)
{
        if (!insn->opcode.got)
                return 0;
        if (!insn->modrm.got)
                return 0;
        if (!insn->sib.got)
                return 0;
        if (!insn->displacement.got)
                return 0;

        return insn->immediate.got != 0;
}

/*
 * Return the M field of the VEX/EVEX prefix, selected by prefix length:
 * constant X86_VEX2_M for 2 bytes, X86_VEX3_M of byte 1 for 3 bytes, and
 * X86_EVEX_M of byte 1 for any other length.
 */
static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
{
        switch (insn->vex_prefix.nbytes) {
        case 2:                /* 2 bytes VEX */
                return X86_VEX2_M;
        case 3:                /* 3 bytes VEX */
                return X86_VEX3_M(insn->vex_prefix.bytes[1]);
        default:        /* EVEX */
                return X86_EVEX_M(insn->vex_prefix.bytes[1]);
        }
}

/*
 * Return the P field of the VEX prefix: taken from byte 1 of a 2-byte
 * VEX prefix and from byte 2 of any longer prefix.
 */
static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
{
        /* 2 bytes VEX keeps P in its second byte, everything else in the third */
        const int p_index = (insn->vex_prefix.nbytes == 2) ? 1 : 2;

        return X86_VEX_P(insn->vex_prefix.bytes[p_index]);
}

/*
 * Get the last prefix id from the last prefix or the VEX prefix.
 * Returns 0 when the instruction has neither.
 */
static inline int insn_last_prefix_id(struct insn *insn)
{
        /* VEX_p is a SIMD prefix id */
        if (insn_is_avx(insn))
                return insn_vex_p_bits(insn);

        /* prefixes.bytes[3] holds the last legacy prefix, 0 if none */
        if (!insn->prefixes.bytes[3])
                return 0;

        return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
}

/* Offset of each field from kaddr */
static inline int insn_offset_rex_prefix(struct insn *insn)
{
        return insn->prefixes.nbytes;
}
static inline int insn_offset_vex_prefix(struct insn *insn)
{
        return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
}
/* Offset of the opcode: the VEX prefix offset plus the VEX prefix length. */
static inline int insn_offset_opcode(struct insn *insn)
{
        int vex_off = insn_offset_vex_prefix(insn);

        return vex_off + insn->vex_prefix.nbytes;
}
/* Offset of the ModRM byte: the opcode offset plus the opcode length. */
static inline int insn_offset_modrm(struct insn *insn)
{
        int opcode_off = insn_offset_opcode(insn);

        return opcode_off + insn->opcode.nbytes;
}
/* Offset of the SIB byte: the ModRM offset plus the ModRM length. */
static inline int insn_offset_sib(struct insn *insn)
{
        int modrm_off = insn_offset_modrm(insn);

        return modrm_off + insn->modrm.nbytes;
}
/* Offset of the displacement: the SIB offset plus the SIB length. */
static inline int insn_offset_displacement(struct insn *insn)
{
        int sib_off = insn_offset_sib(insn);

        return sib_off + insn->sib.nbytes;
}
/* Offset of the immediate: the displacement offset plus its length. */
static inline int insn_offset_immediate(struct insn *insn)
{
        int disp_off = insn_offset_displacement(insn);

        return disp_off + insn->displacement.nbytes;
}

/**
 * for_each_insn_prefix() -- Iterate prefixes in the instruction
 * @insn: Pointer to struct insn.
 * @idx:  Index storage.
 * @prefix: Prefix byte.
 *
 * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
 * and the index is stored in @idx (note that this @idx is just for a cursor,
 * do not change it.)
 * Since prefixes.nbytes can be bigger than 4 if some prefixes
 * are repeated, it cannot be used for looping over the prefixes.
 * The loop therefore walks the prefixes.bytes array itself and stops at
 * the first zero byte or at the end of the array, whichever comes first.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * such as &array[i] can be passed as @insn.  The arguments are still
 * evaluated more than once, so they must not have side effects.
 */
#define for_each_insn_prefix(insn, idx, prefix)                                \
        for ((idx) = 0;                                                        \
             (idx) < ARRAY_SIZE((insn)->prefixes.bytes) &&                \
             ((prefix) = (insn)->prefixes.bytes[(idx)]) != 0;                \
             (idx)++)

/*
 * One-byte opcodes tested by insn_masking_exception().
 * POP SS is encoded as 0x17 (0x07, 0x17 and 0x1f are POP ES, POP SS and
 * POP DS respectively), so 0x17 is the value that must be matched here.
 */
#define POP_SS_OPCODE 0x17
#define MOV_SREG_OPCODE 0x8e

/*
 * Intel SDM Vol.3A 6.8.3 states;
 * "Any single-step trap that would be delivered following the MOV to SS
 * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
 * suppressed."
 * This function returns true if @insn is MOV SS or POP SS. On these
 * instructions, single stepping is suppressed.
 *
 * Only the first opcode byte and, for MOV to a segment register, the reg
 * field of the ModRM byte are inspected; the caller must have decoded
 * them already.
 */
static inline int insn_masking_exception(struct insn *insn)
{
        /* POP SS */
        if (insn->opcode.bytes[0] == POP_SS_OPCODE)
                return 1;

        /* MOV Sreg, r/m with ModRM.reg == 2, i.e. the destination is SS */
        return insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
                X86_MODRM_REG(insn->modrm.bytes[0]) == 2;
}

#endif /* _ASM_X86_INSN_H */
















































































    4 

























































































    4 































    4 






















































































































































































































    2 

























































































































































































































































































































































































































































































































































































































































































































































































    4 






    1 








    4 








    1 







    1 







    1 






    1 





    4 





    4 




































































































































































    2 
























    4 



























    1 


























    4 
















    1 

    4 



























































    4 




    1 







    4 















    1 

































    1 















    1 


    1 




    1 

    1 

    1 

    1 
    1 











    2 




    2 
    2 



































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

/*
 * Number of bits available for the integer carried by a value entry:
 * xa_mk_value() shifts the integer left by one to make room for the low
 * tag bit, so one bit of an unsigned long is lost.
 */
#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Create an XArray entry from an integer.
 * @v: Value to store in XArray.
 *
 * The integer is shifted left by one and the low bit is set to mark the
 * result as a value entry.  A warning is raised if the top bit of @v is
 * set, because the shift would discard it.
 *
 * Context: Any context.
 * Return: An entry suitable for storing in the XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        unsigned long encoded = (v << 1) | 1;

        WARN_ON((long)v < 0);

        return (void *)encoded;
}

/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Undoes the encoding applied by xa_mk_value() by shifting the tag bit
 * out.  No check is made that @entry really is a value entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 1;
}

/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Only the lowest bit of @entry is examined.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
        unsigned long low_bit = (unsigned long)entry & 1UL;

        return low_bit != 0;
}

/**
 * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * If the user of the XArray prefers, they can tag their pointers instead
 * of storing value entries.  Three tags are available (0, 1 and 3).
 * These are distinct from the xa_mark_t as they are not replicated up
 * through the array and cannot be searched for.
 *
 * The tag is simply OR-ed into the address; neither @p nor @tag is
 * validated here.
 *
 * Context: Any context.
 * Return: An XArray entry.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long addr = (unsigned long)p;

        return (void *)(addr | tag);
}

/**
 * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
 * @entry: XArray entry.
 *
 * If you have stored a tagged pointer in the XArray, call this function
 * to get the untagged version of the pointer.  The two lowest bits of
 * @entry are cleared.
 *
 * Context: Any context.
 * Return: A pointer.
 */
static inline void *xa_untag_pointer(void *entry)
{
        const unsigned long tag_mask = 3UL;
        unsigned long addr = (unsigned long)entry;

        return (void *)(addr & ~tag_mask);
}

/**
 * xa_pointer_tag() - Get the tag stored in an XArray entry.
 * @entry: XArray entry.
 *
 * If you have stored a tagged pointer in the XArray, call this function
 * to get the tag of that pointer.  The tag is the two lowest bits of
 * @entry.
 *
 * Context: Any context.
 * Return: A tag.
 */
static inline unsigned int xa_pointer_tag(void *entry)
{
        unsigned long addr = (unsigned long)entry;

        return addr & 3UL;
}

/*
 * xa_mk_internal() - Create an internal entry.
 * @v: Value to turn into an internal entry.
 *
 * Internal entries are used for a number of purposes.  Entries 0-255 are
 * used for sibling entries (only 0-62 are used by the current code).  256
 * is used for the retry entry.  257 is used for the reserved / zero entry.
 * Negative internal entries are used to represent errnos.  Node pointers
 * are also tagged as internal entries in some situations.
 *
 * The value is shifted left by two and the low two bits are set to 10.
 *
 * Context: Any context.
 * Return: An XArray internal entry corresponding to this value.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long shifted = v << 2;

        return (void *)(shifted | 2);
}

/*
 * xa_to_internal() - Extract the value from an internal entry.
 * @entry: XArray entry.
 *
 * The two tag bits are shifted out.  The shift is unsigned, so no sign
 * extension takes place.
 *
 * Context: Any context.
 * Return: The value which was stored in the internal entry.
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 2;
}

/*
 * xa_is_internal() - Is the entry an internal entry?
 * @entry: XArray entry.
 *
 * An internal entry has its two lowest bits set to 10.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        return low_bits == 2;
}

#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry.  You can only see zero entries by using the advanced API.
 * The test is a plain comparison against %XA_ZERO_ENTRY and is hinted as
 * unlikely to succeed.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
        const void *zero_entry = XA_ZERO_ENTRY;

        return unlikely(entry == zero_entry);
}

/**
 * xa_is_err() - Report whether an XArray operation returned an error
 * @entry: Result from calling an XArray function
 *
 * If an XArray operation cannot complete an operation, it will return
 * a special value indicating an error.  This function tells you
 * whether an error occurred; xa_err() tells you which error occurred.
 *
 * An error is an internal entry that compares greater than or equal to
 * the internal entry built from -MAX_ERRNO.
 *
 * Context: Any context.
 * Return: %true if the entry indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        const void *lowest_err = xa_mk_internal(-MAX_ERRNO);
        bool is_err = xa_is_internal(entry) && entry >= lowest_err;

        return unlikely(is_err);
}

/**
 * xa_err() - Turn an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * If an XArray operation cannot complete an operation, it will return
 * a special pointer value which encodes an errno.  This function extracts
 * the errno from the pointer value, or returns 0 if the pointer does not
 * represent an errno.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        if (!xa_is_err(entry))
                return 0;

        /*
         * Shift as a signed long: xa_to_internal() would not do sign
         * extension, and the errno must come out negative.
         */
        return (long)entry >> 2;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Two common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 *
 * Note that @max is declared before @min, the opposite of the argument
 * order of XA_LIMIT().  Initialise by member name (as XA_LIMIT() does)
 * rather than positionally.
 */
struct xa_limit {
        u32 max;
        u32 min;
};

/*
 * XA_LIMIT() builds a struct xa_limit compound literal covering
 * [_min, _max], assigning the members by name.
 */
#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

/* Predefined ranges: all of u32, and the non-negative part of int. */
#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)

/*
 * Mark identifiers.  xa_mark_t is a __bitwise type, so the constants
 * below are created with __force casts.
 *
 * XA_MARK_0 to XA_MARK_2 are the three marks; XA_MARK_MAX names the
 * highest one.  XA_FREE_MARK is an alias of XA_MARK_0 and is the mark
 * that XA_FLAGS_ALLOC includes in the array flags.  XA_PRESENT (8) lies
 * outside the 0..XA_MARK_MAX range; the xa_for_each*() iterators pass it
 * to xa_find() and xa_find_after() as the filter.
 */
typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
#define XA_PRESENT                ((__force xa_mark_t)8U)
#define XA_MARK_MAX                XA_MARK_2
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock type of an XArray.  The values are reused, cast to gfp_t, as the
 * XA_FLAGS_LOCK_IRQ and XA_FLAGS_LOCK_BH bits of xa_flags.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 *
 * The low bits (1, 2, 4, 8, 16, 32) are fixed flags.  XA_FLAGS_MARK(mark)
 * yields one bit per mark, placed above the GFP bits: bit
 * __GFP_BITS_SHIFT + mark.  xa_marked() tests exactly that bit.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for an 1-based alloc */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 *
 * @xa_flags holds the XA_FLAGS_* bits given to xa_init_flags() or
 * XARRAY_INIT(); xa_marked() also reads the per-mark bits from it.
 */
struct xarray {
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        gfp_t                xa_flags;
        void __rcu *        xa_head;
};

/*
 * XARRAY_INIT() - Static initialiser for a struct xarray.
 * @name: The variable being initialised; used to name its xa_lock.
 * @flags: XA_FLAG values.
 *
 * Produces an empty array: unlocked xa_lock, the given flags and a NULL
 * xa_head.
 */
#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: The identifier to use as the variable name of your XArray.
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compiletime instead of runtime.
 *
 * The expansion has no trailing semicolon and no storage class; write
 * "static DEFINE_XARRAY_FLAGS(...);" if the array should be file-local.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: The identifier to use as the variable name of your XArray.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compiletime instead of runtime.
 *
 * Expands to DEFINE_XARRAY_FLAGS() with flags of 0.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: The identifier to use as the variable name of your XArray.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with %XA_FLAGS_ALLOC.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: The identifier to use as the variable name of your XArray.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with %XA_FLAGS_ALLOC1.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

/*
 * The normal API.  Only the declarations live in this header.
 * NOTE(review): presumably these take the xa_lock themselves, in contrast
 * to the __xa_*() variants declared further down which require the caller
 * to hold it -- confirm against the implementation.
 *
 * xa_find() and xa_find_after() are declared nonnull(2): the @index
 * pointer must not be NULL.
 */
void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Initialise an empty XArray with flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * If you need to initialise an XArray with special flags (eg you need
 * to take the lock from interrupt context), use this function instead
 * of xa_init().
 *
 * The array starts out with a NULL head, the given flags and a freshly
 * initialised lock.
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        xa->xa_head = NULL;
        xa->xa_flags = flags;
        spin_lock_init(&xa->xa_lock);
}

/**
 * xa_init() - Initialise an empty XArray.
 * @xa: XArray.
 *
 * An empty XArray is full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Determine if an array has any present entries.
 * @xa: XArray.
 *
 * Only the head pointer of the array is inspected.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        if (xa->xa_head == NULL)
                return true;

        return false;
}

/**
 * xa_marked() - Inquire whether any entry in this array has a mark set
 * @xa: Array
 * @mark: Mark value
 *
 * The answer is read from the XA_FLAGS_MARK(@mark) bit of the array
 * flags; the entries themselves are not visited.
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        gfp_t mark_bit = XA_FLAGS_MARK(mark);

        return (xa->xa_flags & mark_bit) != 0;
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to @last.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * This is a macro which expands to a for-loop header: the first entry is
 * fetched with xa_find() and each later one with xa_find_after(), both
 * filtered on %XA_PRESENT.  @xa, @index, @entry and @last appear more than
 * once in the expansion, so pass expressions without side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * This is xa_for_each_range() with the last index fixed at %ULONG_MAX,
 * i.e. the iteration runs from @start to the end of the array.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL and
 * @index will have a value less than or equal to max.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * This is xa_for_each_start() with a start index of 0, so the whole
 * array is covered.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each(), and if you have
 * to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * The iteration always starts at index 0 and runs up to %ULONG_MAX.
 * This is a macro which expands to a for-loop header; @xa, @index, @entry
 * and @filter appear more than once in the expansion, so pass expressions
 * without side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers.  Each xa_*lock*() macro applies the spinlock primitive
 * of the same suffix to the xa_lock member of @xa, so the usual spinlock
 * rules of that primitive apply.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
/* Variants taking a @subclass, passed through to the *_nested() spinlock call. */
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 *
 * The variants returning int are marked __must_check: their result
 * must not be ignored by the caller.
 */
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
                void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
                void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
                struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
                struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);

/**
 * xa_store_bh() - Store this entry in the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_store(), except that softirqs stay disabled for as
 * long as the array lock is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        /* The store itself runs under the softirq-safe lock. */
        xa_lock_bh(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return old;
}

/**
 * xa_store_irq() - Store this entry in the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_store(), except that interrupts stay disabled for as
 * long as the array lock is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        /* The store itself runs under the interrupt-safe lock. */
        xa_lock_irq(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return old;
}

/**
 * xa_erase_bh() - Erase this entry from the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * If @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);

        return removed;
}

/**
 * xa_erase_irq() - Erase this entry from the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * If @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);

        return removed;
}

/**
 * xa_cmpxchg() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Replaces the entry at @index with @entry, but only if the entry found
 * there is the same as @old.  The exchange was successful when the
 * return value is equal to @old.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);

        return found;
}

/**
 * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_cmpxchg(), except that softirqs stay disabled for as
 * long as the array lock is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock_bh(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);

        return found;
}

/**
 * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_cmpxchg(), except that interrupts stay disabled for
 * as long as the array lock is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock_irq(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);

        return found;
}

/**
 * xa_insert() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and nothing is present at @index, a reserved entry
 * is stored (like xa_reserve()).  The insertion fails when a reserved
 * entry is already present, even though a load from that index returns
 * NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and nothing is present at @index, a reserved entry
 * is stored (like xa_reserve()).  The insertion fails when a reserved
 * entry is already present, even though a load from that index returns
 * NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and nothing is present at @index, a reserved entry
 * is stored (like xa_reserve()).  The insertion fails when a reserved
 * entry is already present, even though a load from that index returns
 * NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search begins at @next and wraps around if necessary.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search begins at @next and wraps around if necessary.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores @entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search begins at @next and wraps around if necessary.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Makes sure there is somewhere to store an entry at @index in the
 * array.  Nothing happens if something is already stored at @index;
 * otherwise the entry is marked as reserved.  Loading from a reserved
 * entry returns a %NULL pointer.
 *
 * If you do not use the entry that you have reserved, call xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap an empty slot for the zero entry; an occupied slot is left alone. */
        void *curr = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * The variant of xa_reserve() which disables softirqs.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap an empty slot for the zero entry; an occupied slot is left alone. */
        void *curr = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * The variant of xa_reserve() which disables interrupts.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap an empty slot for the zero entry; an occupied slot is left alone. */
        void *curr = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Undoes a reservation made with xa_reserve().  If the entry at @index
 * has been stored to in the meantime, this function does nothing.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /*
         * Only a slot still holding the zero entry is cleared; the
         * result of the exchange is deliberately ignored.
         */
        (void)xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
/* May be predefined before this point; otherwise 4 with CONFIG_BASE_SMALL, else 6. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT                (CONFIG_BASE_SMALL ? 4 : 6)
#endif
/* Number of slots in one xa_node (size of its slots[] array). */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* Mask selecting a slot offset within one chunk. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* First dimension of the per-node mark bitmaps. */
#define XA_MAX_MARKS                3
/* Longs needed to hold one bit per slot of a chunk. */
#define XA_MARK_LONGS                DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)

/*
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 *
 * @private_list and @rcu_head share storage, as do @tags and @marks:
 * each pair is an anonymous union, so the two names of a pair refer to
 * the same memory.
 */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        /* One entry per slot of the chunk */
        void __rcu        *slots[XA_CHUNK_SIZE];
        /* One bitmap of XA_MARK_LONGS longs per mark; two names, same array */
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/* Dump helpers used by the debug assertions below. */
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);

/*
 * Debug assertions.  With XA_DEBUG defined, a true condition @x dumps
 * the array (XA_BUG_ON) or the node, if it is non-NULL (XA_NODE_BUG_ON),
 * and then calls BUG().  Without XA_DEBUG both macros expand to an empty
 * statement, so @x is not evaluated at all.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/*
 * Private: fetch xa->xa_head through rcu_dereference_check(), passing
 * lockdep_is_held(&xa->xa_lock) as the additional condition.
 */
static inline void *xa_head(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_check(xa->xa_head,
                                     lockdep_is_held(&xa->xa_lock));
        return head;
}

/*
 * Private: fetch xa->xa_head through rcu_dereference_protected(), with
 * lockdep_is_held(&xa->xa_lock) as the protecting condition.
 */
static inline void *xa_head_locked(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_protected(xa->xa_head,
                                         lockdep_is_held(&xa->xa_lock));
        return head;
}

/*
 * Private: fetch node->slots[offset] through rcu_dereference_check(),
 * passing lockdep_is_held(&xa->xa_lock) as the additional condition.
 */
static inline void *xa_entry(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* With XA_DEBUG, an offset beyond the slots array is a BUG. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: fetch node->slots[offset] through rcu_dereference_protected(),
 * with lockdep_is_held(&xa->xa_lock) as the protecting condition.
 */
static inline void *xa_entry_locked(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* With XA_DEBUG, an offset beyond the slots array is a BUG. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: fetch node->parent through rcu_dereference_check(), passing
 * lockdep_is_held(&xa->xa_lock) as the additional condition.
 */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *parent;

        parent = rcu_dereference_check(node->parent,
                                       lockdep_is_held(&xa->xa_lock));
        return parent;
}

/*
 * Private: fetch node->parent through rcu_dereference_protected(), with
 * lockdep_is_held(&xa->xa_lock) as the protecting condition.
 */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *parent;

        parent = rcu_dereference_protected(node->parent,
                                           lockdep_is_held(&xa->xa_lock));
        return parent;
}

/* Private: turn a node pointer into an entry by setting bit 1 of its address. */
static inline void *xa_mk_node(const struct xa_node *node)
{
        unsigned long bits = (unsigned long)node;

        return (void *)(bits | 2);
}

/* Private: recover the node pointer from a node entry by subtracting 2. */
static inline struct xa_node *xa_to_node(const void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (struct xa_node *)(bits - 2);
}

/*
 * Private: an entry is treated as a node when it is an internal entry
 * whose value is greater than 4096.
 */
static inline bool xa_is_node(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return (unsigned long)entry > 4096;
}

/* Private: a sibling entry is the internal entry made from @offset. */
static inline void *xa_mk_sibling(unsigned int offset)
{
        void *sibling = xa_mk_internal(offset);

        return sibling;
}

/* Private: decode a sibling entry back into the value it was made from. */
static inline unsigned long xa_to_sibling(const void *entry)
{
        unsigned long offset = xa_to_internal(entry);

        return offset;
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Always %false when CONFIG_XARRAY_MULTI is not enabled.
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!xa_is_internal(entry))
                return false;

        /* Sibling entries sort below the one made from the last slot offset. */
        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * The comparison against %XA_RETRY_ENTRY is annotated as unlikely.
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        bool is_retry = (entry == XA_RETRY_ENTRY);

        return unlikely(is_retry);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * Such entries are the internal entries no greater than %XA_RETRY_ENTRY.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

/*
 * Takes a node and an xa_update_node_t callback.
 * NOTE(review): presumably removes the node from its array and invokes
 * the callback on nodes whose counts change; the definition is not in
 * this header, so confirm against the implementation.
 */
void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 */
struct xa_state {
        /* The array this operation works on */
        struct xarray *xa;
        /* Index of interest */
        unsigned long xa_index;
        /* Derived from the entry order by XA_STATE_ORDER(); 0 from XA_STATE() */
        unsigned char xa_shift;
        /* Derived from the entry order by XA_STATE_ORDER(); 0 from XA_STATE() */
        unsigned char xa_sibs;
        /* Offset in the slots array of @xa_node */
        unsigned char xa_offset;
        unsigned char xa_pad;                /* Helps gcc generate better code */
        /* Current node, %NULL, or one of the special values described above */
        struct xa_node *xa_node;
        /* NOTE(review): presumably a node preallocated for this operation - confirm */
        struct xa_node *xa_alloc;
        /* Node-update callback; initialised to NULL by __XA_STATE() */
        xa_update_node_t xa_update;
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 *
 * XA_ERROR() shifts the errno left by two bits and sets bit 1, so the
 * result can never be mistaken for an aligned node pointer.  The argument
 * is parenthesised so that any expression (for example a conditional
 * expression) is converted and shifted as a whole.  XAS_BOUNDS and
 * XAS_RESTART are the special non-error values 1 and 3.
 */
#define XA_ERROR(errno) \
        ((struct xa_node *)((((unsigned long)(errno)) << 2) | 2UL))
#define XAS_BOUNDS        ((struct xa_node *)1UL)
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  The caller supplies the array,
 * index, shift and sibs; the offset and pad start at 0, xa_node starts
 * as XAS_RESTART, and xa_alloc and xa_update start as NULL.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL                                \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The shift and
 * sibling count are both initialised to 0.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.
 *
 * The index is rounded down to a multiple of 2^@order.  The shift is
 * @order rounded down to a multiple of XA_CHUNK_SHIFT, and the sibling
 * count is derived from the remainder.  @index and @order are fully
 * parenthesised, so arbitrary expressions may be passed, but @order is
 * evaluated more than once and must not have side effects.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers taking an xa_state: each one applies the
 * corresponding xa_*() macro to the array the state refers to,
 * (xas)->xa.
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Return an errno stored in the xa_state.
 * @xas: XArray operation state.
 *
 * The errno, if any, is decoded from the state's xa_node field.
 *
 * Return: 0 if no error has been noted.  A negative errno if one has.
 */
static inline int xas_error(const struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        return xa_err(node);
}

/**
 * xas_set_err() - Note an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * @err must be negative; zero or positive values will probably not
 * behave the way you think they should.  To clear the error from an
 * xa_state, use xas_reset() instead.
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        /* The error is stored encoded in the node pointer. */
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * The state is invalid when either of the two low bits of its xa_node
 * pointer is set.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        unsigned long bits = (unsigned long)xas->xa_node;

        return (bits & 3) != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * This is the exact opposite of xas_invalid().
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        if (xas_invalid(xas))
                return false;

        return true;
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * True only for a valid state whose xa_node is not %NULL.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        if (!xas_valid(xas))
                return false;

        return xas->xa_node != NULL;
}

/*
 * True if the pointer is something other than a node: either it is
 * NULL or one of its two low bits is set.
 */
static inline bool xas_not_node(struct xa_node *node)
{
        if (!node)
                return true;

        return ((unsigned long)node & 3) != 0;
}

/*
 * True if the node represents RESTART or an error, i.e. bit 1 of the
 * pointer value is set.
 */
static inline bool xas_frozen(struct xa_node *node)
{
        unsigned long bits = (unsigned long)node;

        return (bits & 2) != 0;
}

/*
 * True if the node represents head-of-tree, RESTART or BOUNDS, i.e. the
 * pointer compares no greater than XAS_RESTART.
 */
static inline bool xas_top(struct xa_node *node)
{
        const struct xa_node *limit = XAS_RESTART;

        return node <= limit;
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Clears the error or walk state held in @xas, so that future walks of
 * the array start from the root.  Call this when you have dropped the
 * xarray lock and want to reuse the xa_state.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        /* XAS_RESTART marks the state as not yet walked into the tree. */
        xas->xa_node = XAS_RESTART;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may sometimes return an internal entry, such as
 * a retry entry or a zero entry.  For a retry entry this function resets
 * the @xas so that the walk restarts from the head of the array; for a
 * zero entry the @xas is left untouched.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        /* A zero entry needs a retry but no reset of the state. */
        if (xa_is_zero(entry))
                return true;

        if (xa_is_retry(entry)) {
                xas_reset(xas);
                return true;
        }

        return false;
}

/*
 * Out-of-line parts of the advanced API.  Only the declarations appear
 * in this header; each one operates on an xa_state.
 */

/* Loading, storing and searching */
void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

/* Mark handling */
bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index entry support.  With CONFIG_XARRAY_MULTI the real
 * implementations are provided out of line; without it the stubs below
 * are used instead.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
/* Stub: always reports order 0, ignoring @xa and @index. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: @order is ignored and @entry is simply stored through @xas. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: nothing to allocate, so this does nothing. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}
#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Re-reads the slot that @xas refers to so the caller can check that a
 * previously loaded entry is still there.  This is useful for the
 * lockless pagecache lookup where we walk the array with only the RCU
 * lock to protect us, lock the page, then check that the page hasn't
 * moved since we looked it up.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *parent = xas->xa_node;
        void *slot;
        char idx;

        /* No node recorded: read the head of the array instead. */
        if (!parent)
                return xa_head(xas->xa);

        /* Without multi-index entries the cached offset is used as is. */
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                idx = xas->xa_offset;
                return xa_entry(xas->xa, parent, idx);
        }

        /* Recompute the offset from the index and this node's shift. */
        idx = (xas->xa_index >> parent->shift) & XA_CHUNK_MASK;
        slot = xa_entry(xas->xa, parent, idx);
        if (xa_is_sibling(slot)) {
                /* A sibling entry names the offset to read instead. */
                idx = xa_to_sibling(slot);
                slot = xa_entry(xas->xa, parent, idx);
        }
        return slot;
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Move the operation state to refer to a different index.  This will
 * have the effect of starting a walk from the top; see xas_next()
 * to move to an adjacent index.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas->xa_index = index;
        /* Discard any cached position so the next walk starts afresh. */
        xas->xa_node = XAS_RESTART;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 *
 * With CONFIG_XARRAY_MULTI, @index is rounded down to a multiple of
 * 2^@order (or set to 0 when @order is at least BITS_PER_LONG), the
 * shift and sibling count are derived from @order, and the walk state
 * is reset.  Without CONFIG_XARRAY_MULTI only @order 0 is accepted and
 * this behaves like xas_set().
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        /* Part of the order that does not fill a whole chunk level. */
        unsigned int rem = order % XA_CHUNK_SHIFT;

        /* Shifting by BITS_PER_LONG or more is not allowed; use 0. */
        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;
        xas->xa_shift = order - rem;
        xas->xa_sibs = (1 << rem) - 1;
        xas->xa_node = XAS_RESTART;
#else
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * The XArray can notify a caller after it has updated an xa_node.
 * This is advanced functionality and is only needed by the page cache.
 *
 * This helper only records @update in @xas; it does not call it.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        xas->xa_update = update;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline fast path for xarray traversal.  It
 * steps through the slots of the current node and defers to xas_find()
 * for every case it does not handle itself: no usable node, a node with
 * a non-zero shift, an offset that disagrees with the index, reaching
 * @max, reaching the last slot of the node, or meeting an internal entry.
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *node = xas->xa_node;
        void *entry;

        if (unlikely(xas_not_node(node) || node->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        for (;;) {
                if (unlikely(xas->xa_index >= max))
                        break;
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        break;
                /* Peek at the next slot before committing to the step. */
                entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
                if (unlikely(xa_is_internal(entry)))
                        break;
                xas->xa_offset++;
                xas->xa_index++;
                if (entry)
                        return entry;
        }

        /* Anything the fast path cannot handle goes to xas_find(). */
        return xas_find(xas, max);
}

/*
 * Private
 *
 * Look in the current node's bitmap for @mark, starting at the current
 * offset (or at the following one when @advance is set), and return the
 * offset of the first set bit found.  XA_CHUNK_SIZE is returned when the
 * single-word path finds no set bit.
 */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *bitmap = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int start = xas->xa_offset;
        unsigned long word;

        if (advance)
                start++;

        /* General case: let the bitmap helper scan the whole chunk. */
        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(bitmap, XA_CHUNK_SIZE, start);

        /* The marks fit in one word: mask off the bits below start. */
        if (start >= XA_CHUNK_SIZE)
                return XA_CHUNK_SIZE;
        word = *bitmap & (~0UL << start);
        if (!word)
                return XA_CHUNK_SIZE;
        return __ffs(word);
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline fast path for xarray traversal.  It
 * searches the mark bitmap of the current node and falls back to
 * xas_find_marked() when there is no usable node, the node has a
 * non-zero shift, no further mark is set in this node, or the marked
 * slot turns out to be empty.
 *
 * Return: The next marked entry after the one currently referred to by
 * @xas, or %NULL once the search has moved past @max.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;
        unsigned int slot;
        void *found;

        if (unlikely(xas_not_node(node) || node->shift))
                return xas_find_marked(xas, max, mark);

        /* Move the state to the next marked slot of this node. */
        slot = xas_find_chunk(xas, true, mark);
        xas->xa_offset = slot;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + slot;
        if (xas->xa_index > max)
                return NULL;

        if (slot != XA_CHUNK_SIZE) {
                found = xa_entry(xas->xa, node, slot);
                if (found)
                        return found;
        }

        /* Node exhausted, or the marked slot was empty. */
        return xas_find_marked(xas, max, mark);
}

/*
 * If iterating while holding a lock, drop the lock and reschedule
 * every %XA_CHECK_SCHED loops.
 *
 * This is only the constant; callers are expected to do the counting
 * and rescheduling themselves.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is fetched with xas_find() and subsequent ones with
 * xas_next_entry(); the loop ends when either returns %NULL.  @xas and
 * @max are evaluated on every iteration.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over marked entries in a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is fetched with xas_find_marked() and subsequent ones
 * with xas_next_marked(); the loop ends when either returns %NULL.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xas_set_err() to exit the loop while setting an error
 * to record the reason.
 *
 * Each iteration obtains its entry from xas_find_conflict().
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

/* Out-of-line slow paths used by the inline xas_next() and xas_prev(). */
void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * subtracted from the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * If the iterator was referencing index 0, this function wraps
 * around to %ULONG_MAX.
 *
 * The inline fast path only steps one slot to the left within the
 * current node; every other case is handed to __xas_prev().
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* No usable node cached: take the slow path. */
        if (unlikely(xas_not_node(node)))
                return __xas_prev(xas);
        /* Non-zero shift, or already at the first slot of the node. */
        if (unlikely(node->shift || xas->xa_offset == 0))
                return __xas_prev(xas);

        xas->xa_index--;
        xas->xa_offset--;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * added to the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * If the iterator was referencing index %ULONG_MAX, this function wraps
 * around to 0.
 *
 * The inline fast path only steps one slot to the right within the
 * current node; every other case is handed to __xas_next().
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* No usable node cached: take the slow path. */
        if (unlikely(xas_not_node(node)))
                return __xas_next(xas);
        /* Non-zero shift, or already at the last slot of the node. */
        if (unlikely(node->shift || xas->xa_offset == XA_CHUNK_MASK))
                return __xas_next(xas);

        xas->xa_index++;
        xas->xa_offset++;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */

































































































































































































































    3 







































































    1 

    1 
    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/sysfs.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>

/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 *
 * NOTE(review): the code that compares against this limit is not in this
 * part of the file; presumably it bounds the number of oopses tolerated.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
static struct ctl_table kern_exit_table[] = {
        {
                .procname       = "oops_limit",
                .data           = &oops_limit,
                .maxlen         = sizeof(oops_limit),
                .mode           = 0644,
                .proc_handler   = proc_douintvec,
        },
        { }
};

static __init int kernel_exit_sysctls_init(void)
{
        register_sysctl_init("kernel", kern_exit_table);
        return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

/*
 * Counter starting at zero.
 * NOTE(review): the code that increments it is not in this part of the
 * file; presumably it is bumped once per kernel oops.
 */
static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
/* sysfs show handler: emit the current oops_count as a decimal line. */
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                               char *page)
{
        return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

/* Read-only attribute named "oops_count", backed by oops_count_show(). */
static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

/*
 * Attach the attribute directly to kernel_kobj (no named group) at
 * late_initcall time.  The result of sysfs_add_file_to_group() is not
 * checked, so a failure is silent.
 */
static __init int kernel_exit_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
        return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

/*
 * Detach @p from the pid tables and the task/thread lists.
 *
 * @p:          task being removed.
 * @group_dead: true when the whole thread group is going away; the
 *              group-wide pid types and the process list are only
 *              touched in that case.
 *
 * Called from __exit_signal(), which expects tasklist_lock to be
 * write-locked.
 */
static void __unhash_process(struct task_struct *p, bool group_dead)
{
        nr_threads--;
        detach_pid(p, PIDTYPE_PID);
        if (group_dead) {
                detach_pid(p, PIDTYPE_TGID);
                detach_pid(p, PIDTYPE_PGID);
                detach_pid(p, PIDTYPE_SID);

                /* Drop @p from the task list and its parent's children. */
                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
        /* Every exiting thread leaves its thread-group lists. */
        list_del_rcu(&p->thread_group);
        list_del_rcu(&p->thread_node);
}

/*
 * Tear down the signal-related state of an exiting task: fold its
 * accounting counters into the shared signal_struct, unhash it, flush
 * its pending signals and detach it from its sighand.
 *
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        struct tty_struct *tty;
        u64 utime, stime;

        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
        if (group_dead)
                posix_cpu_timers_exit_group(tsk);
#endif

        if (group_dead) {
                /*
                 * Take the tty pointer out of the signal_struct now; the
                 * reference is dropped at the end, after siglock is released.
                 */
                tty = sig->tty;
                sig->tty = NULL;
        } else {
                /*
                 * If there is any task waiting for the group exit
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exit_task);

                /* Do not leave curr_target pointing at the exiting thread. */
                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
        }

        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
                              sizeof(unsigned long long));

        /*
         * Accumulate here the counters for all threads as they die. We could
         * skip the group leader because it is the last user of signal_struct,
         * but we want to avoid the race with thread_group_cputime() which can
         * see the empty ->thread_head list.
         */
        task_cputime(tsk, &utime, &stime);
        write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
        sig->min_flt += tsk->min_flt;
        sig->maj_flt += tsk->maj_flt;
        sig->nvcsw += tsk->nvcsw;
        sig->nivcsw += tsk->nivcsw;
        sig->inblock += task_io_get_inblock(tsk);
        sig->oublock += task_io_get_oublock(tsk);
        task_io_accounting_add(&sig->ioac, &tsk->ioac);
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        /* Unhashing happens inside the stats_lock write section. */
        __unhash_process(tsk, group_dead);
        write_sequnlock(&sig->stats_lock);

        /*
         * Do this under ->siglock, we can race with another thread
         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
         */
        flush_sigqueue(&tsk->pending);
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);

        __cleanup_sighand(sighand);
        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        if (group_dead) {
                /* Group-wide leftovers: shared pending signals and the tty. */
                flush_sigqueue(&sig->shared_pending);
                tty_kref_put(tty);
        }
}

/*
 * RCU callback queued by put_task_struct_rcu_user(): recover the task
 * from its embedded rcu_head, let perf and tracing see it one last time,
 * then drop a task_struct reference.
 */
static void delayed_put_task_struct(struct rcu_head *rhp)
{
        struct task_struct *task;

        task = container_of(rhp, struct task_struct, rcu);

        perf_event_delayed_put(task);
        trace_sched_process_free(task);
        put_task_struct(task);
}

/*
 * Drop one reference on @task's rcu_users count.  When the count reaches
 * zero, queue delayed_put_task_struct() through call_rcu().
 */
void put_task_struct_rcu_user(struct task_struct *task)
{
        /* Other rcu users remain: nothing more to do. */
        if (!refcount_dec_and_test(&task->rcu_users))
                return;

        call_rcu(&task->rcu, delayed_put_task_struct);
}

/*
 * Release a dead task @p: drop its per-user process count, release its
 * cgroup and ptrace state, tear down its signal state under
 * tasklist_lock, flush its /proc entries and drop the rcu user reference.
 *
 * If @p was the last non-leader thread and the group leader is a zombie
 * that do_notify_parent() says should be reaped, the leader is marked
 * EXIT_DEAD and the whole sequence is repeated for it.
 */
void release_task(struct task_struct *p)
{
        struct task_struct *leader;
        struct pid *thread_pid;
        int zap_leader;
repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
        atomic_dec(&__task_cred(p)->user->processes);
        rcu_read_unlock();

        cgroup_release(p);

        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
        /* Pin the pid: it is still needed after __exit_signal() unhashes @p. */
        thread_pid = get_pid(p->thread_pid);
        __exit_signal(p);

        /*
         * If we are the last non-leader member of the thread
         * group, and the leader is zombie, then notify the
         * group leader's parent process. (if it wants notification.)
         */
        zap_leader = 0;
        leader = p->group_leader;
        if (leader != p && thread_group_empty(leader)
                        && leader->exit_state == EXIT_ZOMBIE) {
                /*
                 * If we were the last child thread and the leader has
                 * exited already, and the leader's parent ignores SIGCHLD,
                 * then we are the one who should release the leader.
                 */
                zap_leader = do_notify_parent(leader, leader->exit_signal);
                if (zap_leader)
                        leader->exit_state = EXIT_DEAD;
        }

        write_unlock_irq(&tasklist_lock);
        seccomp_filter_release(p);
        proc_flush_pid(thread_pid);
        put_pid(thread_pid);
        release_thread(p);
        put_task_struct_rcu_user(p);

        /* Go round again for the leader if it must be released too. */
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
}

/*
 * Wake the task currently recorded in @w, if any.
 *
 * Return: the result of wake_up_process() for that task, or 0 when no
 * task is recorded in @w.
 */
int rcuwait_wake_up(struct rcuwait *w)
{
        int ret = 0;
        struct task_struct *task;

        rcu_read_lock();

        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
         * rcuwait_wake_up() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                       WAKE
         *    [S] tsk = current          [S] cond = true
         *        MB (A)                     MB (B)
         *    [L] cond                   [L] tsk
         */
        smp_mb(); /* (B) */

        task = rcu_dereference(w->task);
        if (task)
                ret = wake_up_process(task);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 *
 * @pgrp:         process group to examine.
 * @ignored_task: member to leave out of the check, or NULL.
 *
 * Return: 0 if some remaining member has a real parent that is in a
 * different process group but the same session; 1 otherwise.
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
                                        struct task_struct *ignored_task)
{
        struct task_struct *p;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                /*
                 * Skip the ignored task, tasks that have exited with no
                 * other threads left, and children of the global init.
                 */
                if ((p == ignored_task) ||
                    (p->exit_state && thread_group_empty(p)) ||
                    is_global_init(p->real_parent))
                        continue;

                /* A parent outside the group but inside the session. */
                if (task_pgrp(p->real_parent) != pgrp &&
                    task_session(p->real_parent) == task_session(p))
                        return 0;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return 1;
}

/*
 * Report whether the process group of the current task is orphaned.
 * The check runs with tasklist_lock read-held and ignores no task.
 *
 * Return: the value of will_become_orphaned_pgrp() for current's group.
 */
int is_current_pgrp_orphaned(void)
{
        int orphaned;

        read_lock(&tasklist_lock);
        orphaned = will_become_orphaned_pgrp(task_pgrp(current), NULL);
        read_unlock(&tasklist_lock);

        return orphaned;
}

/*
 * Return true if any task in @pgrp has SIGNAL_STOP_STOPPED set in its
 * signal_struct flags, false otherwise.
 */
static bool has_stopped_jobs(struct pid *pgrp)
{
        struct task_struct *member;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                const struct signal_struct *sig = member->signal;

                if (sig->flags & SIGNAL_STOP_STOPPED)
                        return true;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 *
 * @parent is NULL in the exit case, in which case @tsk->real_parent is
 * used and @tsk itself is ignored while scanning the group.  A non-NULL
 * @parent is the reparent case, where no task is ignored.
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
        struct pid *pgrp = task_pgrp(tsk);
        struct task_struct *ignored_task;

        if (parent) {
                /* reparent: our child is in a different pgrp than
                 * we are, and it was the only connection outside.
                 */
                ignored_task = NULL;
        } else {
                /* exit: our father is in a different pgrp than
                 * we are and we were the only connection outside.
                 */
                ignored_task = tsk;
                parent = tsk->real_parent;
        }

        /* Nothing to do unless @parent was a link out of the group ... */
        if (task_pgrp(parent) == pgrp)
                return;
        if (task_session(parent) != task_session(tsk))
                return;
        /* ... and losing that link orphans a group with stopped jobs. */
        if (!will_become_orphaned_pgrp(pgrp, ignored_task))
                return;
        if (!has_stopped_jobs(pgrp))
                return;

        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}

#ifdef CONFIG_MEMCG
/*
 * A task is exiting.  If it owned this mm, find a new owner for the mm.
 *
 * Candidates are searched in this order: current's children, then the
 * children of current's real parent (its siblings), then the threads of
 * every non-kthread process.  A candidate's ->mm is re-checked under
 * task_lock() before it is installed as mm->owner; if the re-check fails
 * the whole procedure restarts from "retry".  When there is no other
 * user of the mm, or no candidate is found, mm->owner is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
        struct task_struct *c, *g, *p = current;

retry:
        /*
         * If the exiting or execing task is not the owner, it's
         * someone else's problem.
         */
        if (mm->owner != p)
                return;
        /*
         * The current owner is exiting/execing and there are no other
         * candidates.  Do not leave the mm pointing to a possibly
         * freed task structure.
         */
        if (atomic_read(&mm->mm_users) <= 1) {
                WRITE_ONCE(mm->owner, NULL);
                return;
        }

        read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search in the siblings
         */
        list_for_each_entry(c, &p->real_parent->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search through everything else, we should not get here often.
         */
        for_each_process(g) {
                /* Stop scanning once no other user of the mm remains. */
                if (atomic_read(&mm->mm_users) <= 1)
                        break;
                if (g->flags & PF_KTHREAD)
                        continue;
                for_each_thread(g, c) {
                        if (c->mm == mm)
                                goto assign_new_owner;
                        /*
                         * A thread with some other, non-NULL mm ends the
                         * scan of this process (presumably because all
                         * threads of a process share one mm).
                         */
                        if (c->mm)
                                break;
                }
        }
        read_unlock(&tasklist_lock);
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
        WRITE_ONCE(mm->owner, NULL);
        return;

assign_new_owner:
        /* Reached with tasklist_lock still read-held and c as the candidate. */
        BUG_ON(c == p);
        get_task_struct(c);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
        /*
         * Delay read_unlock() till we have the task_lock()
         * to ensure that c does not slip away underneath us
         */
        read_unlock(&tasklist_lock);
        /* The candidate changed its mm before we got task_lock(): start over. */
        if (c->mm != mm) {
                task_unlock(c);
                put_task_struct(c);
                goto retry;
        }
        WRITE_ONCE(mm->owner, c);
        task_unlock(c);
        put_task_struct(c);
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already.
 *
 * Detaches the calling task from its mm.  exit_mm_release() is called
 * unconditionally; everything else is skipped when current->mm is NULL.
 * If a core dump is pending on the mm (mm->core_state is set), the task
 * first reports in to the dumper and sleeps until self.task is cleared.
 * current->mm is then set to NULL under mmap_read_lock() and
 * task_lock(), after which the mm's owner is updated and the reference
 * is dropped with mmput().
 */
static void exit_mm(void)
{
        struct mm_struct *mm = current->mm;
        struct core_state *core_state;

        exit_mm_release(current, mm);
        if (!mm)
                return;
        sync_mm_rss(mm);
        /*
         * Serialize with any possible pending coredump.
         * We must hold mmap_lock around checking core_state
         * and clearing tsk->mm.  The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group with ->mm != NULL.
         */
        mmap_read_lock(mm);
        core_state = mm->core_state;
        if (core_state) {
                struct core_thread self;

                mmap_read_unlock(mm);

                self.task = current;
                /*
                 * Only a PF_SIGNALED task links itself onto the dumper's
                 * list; otherwise self.task is cleared so the wait loop
                 * below exits on its first pass.
                 */
                if (self.task->flags & PF_SIGNALED)
                        self.next = xchg(&core_state->dumper.next, &self);
                else
                        self.task = NULL;
                /*
                 * Implies mb(), the result of xchg() must be visible
                 * to core_state->dumper.
                 */
                if (atomic_dec_and_test(&core_state->nr_threads))
                        complete(&core_state->startup);

                /* Sleep until self.task has been cleared. */
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (!self.task) /* see coredump_finish() */
                                break;
                        freezable_schedule();
                }
                __set_current_state(TASK_RUNNING);
                /* Re-take the lock dropped above; it is released after ->mm is cleared. */
                mmap_read_lock(mm);
        }
        mmgrab(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
        /* Record dumpability while the mm is still attached. */
        current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER);
        current->mm = NULL;
        mmap_read_unlock(mm);
        enter_lazy_tlb(mm, current);
        task_unlock(current);
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
                exit_oom_victim();
}

/*
 * Return the first thread of @p's thread group that does not have
 * PF_EXITING set, or NULL if every thread has it set.
 */
static struct task_struct *find_alive_thread(struct task_struct *p)
{
        struct task_struct *thread;

        for_each_thread(p, thread) {
                if (thread->flags & PF_EXITING)
                        continue;
                return thread;
        }

        return NULL;
}

/*
 * Return the child reaper of @father's active pid namespace.
 *
 * If @father is not that reaper, the namespace's current reaper is
 * returned unchanged.  If @father is the reaper and another thread in
 * its group is still alive, that thread is installed as
 * pid_ns->child_reaper and returned.  Otherwise tasklist_lock is dropped,
 * every task queued on @dead is released, zap_pid_ns_processes() is
 * called for the namespace, the lock is re-taken and @father itself is
 * returned.
 *
 * Called with tasklist_lock write-held; it may be released and
 * re-acquired, as the annotations below state.
 */
static struct task_struct *find_child_reaper(struct task_struct *father,
                                                struct list_head *dead)
        __releases(&tasklist_lock)
        __acquires(&tasklist_lock)
{
        struct pid_namespace *pid_ns = task_active_pid_ns(father);
        struct task_struct *reaper = pid_ns->child_reaper;
        struct task_struct *p, *n;

        if (likely(reaper != father))
                return reaper;

        /* @father is the reaper: try to hand the role to a live sibling thread. */
        reaper = find_alive_thread(father);
        if (reaper) {
                pid_ns->child_reaper = reaper;
                return reaper;
        }

        write_unlock_irq(&tasklist_lock);

        /* Drain @dead while the lock is not held. */
        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }

        zap_pid_ns_processes(pid_ns);
        write_lock_irq(&tasklist_lock);

        return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 *
 * @child_reaper is the fallback returned for case 3.
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
                                           struct task_struct *child_reaper)
{
        struct task_struct *candidate, *ancestor;
        unsigned int ns_level;

        /* Case 1: a live thread in our own group. */
        candidate = find_alive_thread(father);
        if (candidate)
                return candidate;

        if (!father->signal->has_child_subreaper)
                return child_reaper;

        /*
         * Case 2: find the first ->is_child_subreaper ancestor in our pid_ns.
         * We can't check reaper != child_reaper to ensure we do not
         * cross the namespaces, the exiting parent could be injected
         * by setns() + fork().
         * We check pid->level, this is slightly more efficient than
         * task_active_pid_ns(reaper) != task_active_pid_ns(father).
         */
        ns_level = task_pid(father)->level;
        for (ancestor = father->real_parent;
             task_pid(ancestor)->level == ns_level;
             ancestor = ancestor->real_parent) {
                if (ancestor == &init_task)
                        break;
                if (ancestor->signal->is_child_subreaper) {
                        candidate = find_alive_thread(ancestor);
                        if (candidate)
                                return candidate;
                }
        }

        /* Case 3. */
        return child_reaper;
}

/*
* Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
{
        if (unlikely(p->exit_state == EXIT_DEAD))
                return;

        /* We don't want people slaying init. */
        p->exit_signal = SIGCHLD;

        /* If it has exited notify the new parent about this child's death. */
        if (!p->ptrace &&
            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
                if (do_notify_parent(p, p->exit_signal)) {
                        p->exit_state = EXIT_DEAD;
                        list_add(&p->ptrace_entry, dead);
                }
        }

        kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *        as a result of our exiting, and if they have any stopped
 *        jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 *
 * @father is the exiting task.  Tasks that have to be passed to
 * release_task() are collected on @dead (see reparent_leader()).
 */
static void forget_original_parent(struct task_struct *father,
                                        struct list_head *dead)
{
        struct task_struct *p, *t, *reaper;

        if (unlikely(!list_empty(&father->ptraced)))
                exit_ptrace(father, dead);

        /* Can drop and reacquire tasklist_lock */
        reaper = find_child_reaper(father, dead);
        if (list_empty(&father->children))
                return;

        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                /* Every thread of the child gets the new real parent. */
                for_each_thread(p, t) {
                        RCU_INIT_POINTER(t->real_parent, reaper);
                        /* ->parent must be @father exactly when t is not ptraced. */
                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        /* Deliver the parent-death signal if one is set. */
                        if (t->pdeath_signal)
                                group_send_sig_info(t->pdeath_signal,
                                                    SEND_SIG_NOINFO, t,
                                                    PIDTYPE_TGID);
                }
                /*
                 * If this is a threaded reparent there is no need to
                 * notify anyone anything has happened.
                 */
                if (!same_thread_group(reaper, father))
                        reparent_leader(father, p, dead);
        }
        /* Move the whole children list over to the new reaper in one go. */
        list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 *
 * @tsk is the exiting task; @group_dead is non-zero when it is the last
 * thread of its group.  Runs the reparenting and the parent notification
 * under write_lock_irq(&tasklist_lock); the tasks gathered on the local
 * "dead" list are released only after the lock has been dropped.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
        bool autoreap;
        struct task_struct *p, *n;
        LIST_HEAD(dead);

        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);

        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);

        tsk->exit_state = EXIT_ZOMBIE;
        if (unlikely(tsk->ptrace)) {
                /*
                 * Traced: use ->exit_signal only for a single-threaded
                 * leader that has not been ptrace-reparented, SIGCHLD
                 * otherwise.
                 */
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
                                !ptrace_reparented(tsk) ?
                        tsk->exit_signal : SIGCHLD;
                autoreap = do_notify_parent(tsk, sig);
        } else if (thread_group_leader(tsk)) {
                /* A leader with live sub-threads is neither notified nor reaped here. */
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                /* An untraced non-leader thread is always reaped by itself. */
                autoreap = true;
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exit_task);
        write_unlock_irq(&tasklist_lock);

        /* Release everything collected above, now without tasklist_lock. */
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }
}

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Compare the calling task's unused stack with the lowest value recorded
 * so far and log a message whenever a new minimum is set.
 */
static void check_stack_usage(void)
{
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
        const unsigned long unused = stack_not_used(current);

        /* Unlocked early-out; the comparison is repeated under the lock. */
        if (unused >= lowest_to_date)
                return;

        spin_lock(&low_water_lock);
        if (unused < lowest_to_date) {
                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
                        current->comm, task_pid_nr(current), unused);
                lowest_to_date = unused;
        }
        spin_unlock(&low_water_lock);
}
#else
/* No-op when CONFIG_DEBUG_STACK_USAGE is not set. */
static inline void check_stack_usage(void) {}
#endif

/*
 * do_exit - terminate the calling task; does not return
 * @code: exit code; stored in tsk->exit_code and handed to the ptrace
 *        exit event and the accounting hooks
 *
 * Performs sanity checks, marks the task as exiting, tears down its
 * resources in the order written below, notifies relatives through
 * exit_notify() and finally calls do_task_dead().
 */
void __noreturn do_exit(long code)
{
        struct task_struct *tsk = current;
        int group_dead;

        /*
         * We can get here from a kernel oops, sometimes with preemption off.
         * Start by checking for critical errors.
         * Then fix up important state like USER_DS and preemption.
         * Then do everything else.
         */

        WARN_ON(blk_needs_flush_plug(tsk));

        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");

        /*
         * If do_exit is called because this process oopsed, it's possible
         * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
         * continuing. Amongst other possible reasons, this is to prevent
         * mm_release()->clear_child_tid() from writing to a user-controlled
         * kernel address.
         */
        force_uaccess_begin();

        /* Report a leftover non-zero preempt count and reset it. */
        if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
                preempt_count_set(PREEMPT_ENABLED);
        }

        profile_task_exit(tsk);
        kcov_task_exit(tsk);

        ptrace_event(PTRACE_EVENT_EXIT, code);

        validate_creds_for_do_exit(tsk);

        /*
         * We're taking recursive faults here in do_exit. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
                pr_alert("Fixing recursive fault but reboot is needed!\n");
                futex_exit_recursive(tsk);
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule();
        }

        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */

        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
                sync_mm_rss(tsk->mm);
        acct_update_integrals(tsk);
        /* group_dead is true for the last live thread of the group. */
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                /*
                 * If the last thread of global init has exited, panic
                 * immediately to get a useable coredump.
                 */
                if (unlikely(is_global_init(tsk)))
                        panic("Attempted to kill init! exitcode=0x%08x\n",
                                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk);
#endif
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
        }
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
        audit_free(tsk);

        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);

        /*
         * Since sampling can touch ->mm, make sure to stop everything before we
         * tear it down.
         *
         * Also flushes inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         */
        perf_event_exit_task(tsk);

        exit_mm();

        if (group_dead)
                acct_process();
        trace_sched_process_exit(tsk);

        /* Release the remaining per-task resources. */
        exit_sem(tsk);
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
        if (group_dead)
                disassociate_ctty(1);
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);

        sched_autogroup_exit_task(tsk);
        cgroup_exit(tsk);

        /*
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);

        exit_tasks_rcu_start();
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
        mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
        if (unlikely(current->pi_state_cache))
                kfree(current->pi_state_cache);
#endif
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();

        if (tsk->io_context)
                exit_io_context(tsk);

        if (tsk->splice_pipe)
                free_pipe_info(tsk->splice_pipe);

        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);

        validate_creds_for_do_exit(tsk);

        check_stack_usage();
        /* Preemption stays disabled from here through do_task_dead(). */
        preempt_disable();
        if (tsk->nr_dirtied)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();

        lockdep_free_task(tsk);
        do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);

/*
 * make_task_dead - take the current task off the CPU after an oops
 * @signr: value passed on to do_exit() as the exit code
 *
 * Counts the call in oops_count and panics once that count reaches a
 * non-zero oops_limit; otherwise terminates the task through do_exit().
 * Does not return.
 */
void __noreturn make_task_dead(int signr)
{
        /*
         * Take the task off the cpu after something catastrophic has
         * happened.
         */
        unsigned int limit;

        /*
         * Every time the system oopses, if the oops happens while a reference
         * to an object was held, the reference leaks.
         * If the oops doesn't also leak memory, repeated oopsing can cause
         * reference counters to wrap around (if they're not using refcount_t).
         * This means that repeated oopsing can make unexploitable-looking bugs
         * exploitable through repeated oopsing.
         * To make sure this can't happen, place an upper bound on how often the
         * kernel may oops without panic().
         */
        limit = READ_ONCE(oops_limit);
        /* A limit of 0 disables the check. */
        if (atomic_inc_return(&oops_count) >= limit && limit)
                /* limit is unsigned: print with %u so large values are not shown negative. */
                panic("Oopsed too often (kernel.oops_limit is %u)", limit);

        do_exit(signr);
}

/*
 * complete_and_exit - signal a completion, then exit the calling task
 * @comp: completion to signal with complete(); may be NULL, in which
 *        case nothing is signalled
 * @code: exit code passed to do_exit()
 */
void complete_and_exit(struct completion *comp, long code)
{
        if (comp)
                complete(comp);

        do_exit(code);
}
EXPORT_SYMBOL(complete_and_exit);

/*
 * exit(2): terminate the calling task.  Only the low 8 bits of
 * @error_code are kept; they are placed in bits 8-15 of the exit code.
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
        const int code = (error_code & 0xff) << 8;

        do_exit(code);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 *
 * If a group exit is already under way, its recorded exit code replaces
 * @exit_code.  Otherwise, for a multi-threaded group, @exit_code is
 * recorded and the other threads are zapped under siglock.  The caller
 * then exits through do_exit().
 */
void
do_group_exit(int exit_code)
{
        struct signal_struct *sig = current->signal;

        BUG_ON(exit_code & 0x80); /* core dumps don't get here */

        if (signal_group_exit(sig)) {
                exit_code = sig->group_exit_code;
        } else if (!thread_group_empty(current)) {
                struct sighand_struct *const sighand = current->sighand;

                spin_lock_irq(&sighand->siglock);
                if (!signal_group_exit(sig)) {
                        /* We are first: publish the code and stop the others. */
                        sig->group_exit_code = exit_code;
                        sig->flags = SIGNAL_GROUP_EXIT;
                        zap_other_threads(current);
                } else {
                        /* Another thread got here before we took the lock.  */
                        exit_code = sig->group_exit_code;
                }
                spin_unlock_irq(&sighand->siglock);
        }

        do_exit(exit_code);
        /* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 *
 * Only the low 8 bits of @error_code are kept; they are placed in
 * bits 8-15 of the exit code.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
        const int code = (error_code & 0xff) << 8;

        do_group_exit(code);
        /* NOTREACHED */
        return 0;
}

/*
 * Per-child result record.  The wait_task_*() helpers fill it in when
 * wait_opts::wo_info is non-NULL.
 */
struct waitid_info {
        /* task_pid_vnr() of the reported child */
        pid_t pid;
        /* child's uid as seen from the caller's user namespace */
        uid_t uid;
        /*
         * Zombie: exit status or terminating signal number.
         * Stopped/traced: the stop code.  Continued: SIGCONT.
         */
        int status;
        /* one of the CLD_* codes describing why the child is reported */
        int cause;
};

/*
 * Parameters and results of one wait operation, passed to the
 * eligible_*() and wait_task_*() helpers.
 */
struct wait_opts {
        /* pid type to match on; PIDTYPE_MAX matches every task */
        enum pid_type                wo_type;
        /* wait option bits (WEXITED, WNOWAIT, WUNTRACED, WCONTINUED, __WALL, __WCLONE) */
        int                        wo_flags;
        /* pid compared against task_pid_type(p, wo_type) */
        struct pid                *wo_pid;

        /* optional detailed result; may be NULL */
        struct waitid_info        *wo_info;
        /* status word written by the wait_task_*() helpers */
        int                        wo_stat;
        /* optional; filled via getrusage() when non-NULL */
        struct rusage                *wo_rusage;

        /* wait queue entry; NOTE(review): used outside this section - confirm how it is queued */
        wait_queue_entry_t                child_wait;
        /* -ECHILD before the first wait_consider_task() call; 0 once an eligible child was seen */
        int                        notask_error;
};

/*
 * Return 1 if @p matches the pid selection in @wo: either any task is
 * accepted (PIDTYPE_MAX) or @p's pid of type wo_type equals wo_pid.
 * Return 0 otherwise.
 */
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
        if (wo->wo_type == PIDTYPE_MAX)
                return 1;

        return task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

/*
 * Return 1 if @p is a child this wait may report, 0 if not.
 * @ptrace is true when @p is considered as a tracee of the caller.
 */
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
        bool is_clone, wants_clone;

        if (!eligible_pid(wo, p))
                return 0;

        /*
         * Wait for all children (clone and not) if __WALL is set or
         * if it is traced by us.
         */
        if (ptrace || (wo->wo_flags & __WALL))
                return 1;

        /*
         * Otherwise, wait for clone children *only* if __WCLONE is set;
         * otherwise, wait for non-clone children *only*.
         *
         * Note: a "clone" child here is one that reports to its parent
         * using a signal other than SIGCHLD, or a non-leader thread which
         * we can only see if it is traced by us.
         */
        is_clone = p->exit_signal != SIGCHLD;
        wants_clone = !!(wo->wo_flags & __WCLONE);

        return is_clone == wants_clone;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * On success the return value is @p's pid (task_pid_vnr()) and the
 * result is stored in @wo (wo_stat, wo_rusage, wo_info as requested).
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        /* WNOWAIT: report the zombie but leave its exit_state untouched. */
        if (unlikely(wo->wo_flags & WNOWAIT)) {
                status = p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        /* Losing the cmpxchg race means someone else is reaping @p. */
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time. Until
                 * we change k_getrusage()-like users to rely on this lock
                 * we have to take ->siglock as well.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                spin_lock_irq(&current->sighand->siglock);
                write_seqlock(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                /* cmaxrss only ever grows: keep the largest value seen. */
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock(&psig->stats_lock);
                spin_unlock_irq(&current->sighand->siglock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        /* A group exit reports the group's code instead of the task's own. */
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        infop = wo->wo_info;
        if (infop) {
                /* Low 7 bits zero: normal exit, the status is in bits 8 and up. */
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        /* Otherwise killed by a signal; bit 0x80 flags a core dump. */
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}

/*
 * Return a pointer to the code to report for a stopped @p, or NULL if
 * @p is not in the stopped state being asked about.
 *
 * With @ptrace set, @p must be traced and not JOBCTL_LISTENING, and the
 * code is p->exit_code.  Without it, the thread group must have
 * SIGNAL_STOP_STOPPED set, and the code is the group's group_exit_code.
 */
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
        if (!ptrace) {
                if (p->signal->flags & SIGNAL_STOP_STOPPED)
                        return &p->signal->group_exit_code;
                return NULL;
        }

        if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
                return &p->exit_code;

        return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for @p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
                                int ptrace, struct task_struct *p)
{
        struct waitid_info *infop;
        int exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;

        /*
         * Traditionally we see ptrace'd stopped tasks regardless of options.
         */
        if (!ptrace && !(wo->wo_flags & WUNTRACED))
                return 0;

        /* Unlocked pre-check; repeated below with siglock held. */
        if (!task_stopped_code(p, ptrace))
                return 0;

        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);

        p_code = task_stopped_code(p, ptrace);
        if (unlikely(!p_code))
                goto unlock_sig;

        /* A zero code means there is nothing to report. */
        exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;

        /* Consume the code unless the caller asked for WNOWAIT. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                *p_code = 0;

        uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
        spin_unlock_irq(&p->sighand->siglock);
        if (!exit_code)
                return 0;

        /*
         * Now we are pretty sure this task is interesting.
         * Make sure it doesn't get reaped out from under us while we
         * give up the lock and then examine it below.  We don't want to
         * keep holding onto the tasklist_lock while we call getrusage and
         * possibly take page faults for user memory.
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        /* Status word: the code in bits 8 and up, 0x7f in the low byte. */
        if (likely(!(wo->wo_flags & WNOWAIT)))
                wo->wo_stat = (exit_code << 8) | 0x7f;

        infop = wo->wo_info;
        if (infop) {
                infop->cause = why;
                infop->status = exit_code;
                infop->pid = pid;
                infop->uid = uid;
        }
        return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
        struct waitid_info *infop;
        pid_t pid;
        uid_t uid;

        /* The caller did not ask for continued children: nothing to report. */
        if (!unlikely(wo->wo_flags & WCONTINUED))
                return 0;

        /* Cheap unlocked peek; the flag is tested again under siglock below. */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        /* Re-check with the lock held.  */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
                spin_unlock_irq(&p->sighand->siglock);
                return 0;
        }
        /* Consume the "continued" event unless the caller passed WNOWAIT. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        uid = from_kuid_munged(current_user_ns(), task_uid(p));
        spin_unlock_irq(&p->sighand->siglock);

        /*
         * Take a reference on @p before tasklist_lock is dropped; it is
         * released again right after the optional getrusage() call.
         */
        pid = task_pid_vnr(p);
        get_task_struct(p);
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        /*
         * Without a waitid_info buffer the event is reported through
         * wo_stat as 0xffff; otherwise the info fields are filled in.
         */
        infop = wo->wo_info;
        if (!infop) {
                wo->wo_stat = 0xffff;
        } else {
                infop->cause = CLD_CONTINUED;
                infop->pid = pid;
                infop->uid = uid;
                infop->status = SIGCONT;
        }
        /* Nonzero return: tasklist_lock has been released. */
        return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        /* A task already in EXIT_DEAD is ignored; the search continues. */
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        /* A zero result from eligible_child() means @p is skipped. */
        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in finite
                 * amount time once all the subthreads are released and
                 * will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *child;
        int rc = 0;

        /*
         * Offer every entry on @tsk's children list to wait_consider_task()
         * with ptrace == 0, and stop at the first nonzero (final) result.
         */
        list_for_each_entry(child, &tsk->children, sibling) {
                rc = wait_consider_task(wo, 0, child);
                if (rc != 0)
                        break;
        }

        /* Still 0 here when the list was empty or no child was final. */
        return rc;
}

/*
 * Same walk as do_wait_thread(), but over @tsk's ptraced list and with
 * ptrace == 1.  Returns the first nonzero result, or 0 if none was final.
 */
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *tracee;
        int rc = 0;

        list_for_each_entry(tracee, &tsk->ptraced, ptrace_entry) {
                rc = wait_consider_task(wo, 1, tracee);
                if (rc != 0)
                        break;
        }

        return rc;
}

/*
 * Wake function installed on the wait_chldexit queue by do_wait().
 * @key is the task passed to __wake_up_sync_key(); the waiter is only woken
 * when that task passes the pid filter and, under __WNOTHREAD, when its
 * parent is the task stored in wait->private.
 */
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
{
        struct task_struct *child = key;
        struct wait_opts *opts;

        opts = container_of(wait, struct wait_opts, child_wait);

        if (!eligible_pid(opts, child))
                return 0;

        if (opts->wo_flags & __WNOTHREAD) {
                if (wait->private != child->parent)
                        return 0;
        }

        return default_wake_function(wait, mode, sync, key);
}

/*
 * Wake TASK_INTERRUPTIBLE waiters on @parent's wait_chldexit queue, passing
 * @p as the wake-up key (child_wait_callback() receives it as @key).
 */
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
        __wake_up_sync_key(&parent->signal->wait_chldexit,
                           TASK_INTERRUPTIBLE, p);
}

/*
 * Core wait loop shared by the wait syscalls in this file.
 *
 * Returns a nonzero value produced by one of the per-task helpers, or
 * ->notask_error (0 or -ECHILD) when nothing was reported, or -ERESTARTSYS
 * when a blocking wait is interrupted by a pending signal.
 */
static long do_wait(struct wait_opts *wo)
{
        struct task_struct *tsk;
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        /*
         * Queue ourselves on our own wait_chldexit queue, with
         * child_wait_callback() as the wake function.
         */
        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
         * might later match our criteria, even if we are not able to reap
         * it yet.
         */
        wo->notask_error = -ECHILD;
        /*
         * A specific pid type was requested but the pid is missing or has
         * no task of that type: skip the scan entirely.
         */
        if ((wo->wo_type < PIDTYPE_MAX) &&
           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                goto notask;

        /*
         * The task state is set before the scan, not after it -- presumably
         * so that a wake-up arriving during the scan is not lost.
         */
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
        tsk = current;
        do {
                /* Natural children first, then ptraced tasks. */
                retval = do_wait_thread(wo, tsk);
                if (retval)
                        goto end;

                retval = ptrace_do_wait(wo, tsk);
                if (retval)
                        goto end;

                /* __WNOTHREAD: only the calling thread's lists are scanned. */
                if (wo->wo_flags & __WNOTHREAD)
                        break;
        } while_each_thread(current, tsk);
        read_unlock(&tasklist_lock);

notask:
        /*
         * Nothing was reported.  If a candidate exists (notask_error was
         * cleared) and WNOHANG is not set, sleep and rescan; a pending
         * signal ends the wait with -ERESTARTSYS instead.
         */
        retval = wo->notask_error;
        if (!retval && !(wo->wo_flags & WNOHANG)) {
                retval = -ERESTARTSYS;
                if (!signal_pending(current)) {
                        schedule();
                        goto repeat;
                }
        }
end:
        /* Common exit: back to TASK_RUNNING and off the wait queue. */
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
}

/*
 * Validate waitid() arguments, translate (@which, @upid) into a pid type and
 * struct pid reference, and run do_wait().
 *
 * Returns -EINVAL for bad options, an unknown @which or an out-of-range
 * @upid; the error from pidfd_get_pid() for P_PIDFD; -EAGAIN when do_wait()
 * returned 0 for an O_NONBLOCK pidfd without WNOHANG; otherwise the result
 * of do_wait().
 */
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
                          int options, struct rusage *ru)
{
        unsigned int f_flags = 0;
        struct pid *pid = NULL;
        struct wait_opts wo;
        enum pid_type type;
        long ret;

        /* Reject unknown option bits and calls that select no event type. */
        if ((options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                         __WNOTHREAD|__WCLONE|__WALL)) ||
            !(options & (WEXITED|WSTOPPED|WCONTINUED)))
                return -EINVAL;

        switch (which) {
        case P_ALL:
                type = PIDTYPE_MAX;
                break;
        case P_PID:
                if (upid <= 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
                break;
        case P_PGID:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                /* A zero @upid selects the caller's own process group. */
                pid = upid ? find_get_pid(upid)
                           : get_task_pid(current, PIDTYPE_PGID);
                break;
        case P_PIDFD:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                /* Here @upid is a file descriptor; f_flags is filled in. */
                pid = pidfd_get_pid(upid, &f_flags);
                if (IS_ERR(pid))
                        return PTR_ERR(pid);
                break;
        default:
                return -EINVAL;
        }

        wo.wo_type = type;
        wo.wo_pid = pid;
        /* An O_NONBLOCK pidfd makes the wait itself non-blocking. */
        wo.wo_flags = (f_flags & O_NONBLOCK) ? (options | WNOHANG) : options;
        wo.wo_info = infop;
        wo.wo_rusage = ru;

        ret = do_wait(&wo);
        /* WNOHANG was implied by the pidfd, not requested: report -EAGAIN. */
        if (ret == 0 && (f_flags & O_NONBLOCK) && !(options & WNOHANG))
                ret = -EAGAIN;

        put_pid(pid);
        return ret;
}

/*
 * waitid() system call: run kernel_waitid(), then copy the optional rusage
 * and siginfo results out to user space.  A positive kernel_waitid() result
 * is reported to the caller as 0 with si_signo set to SIGCHLD.
 */
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
                infop, int, options, struct rusage __user *, ru)
{
        struct waitid_info info = {.status = 0};
        struct rusage r;
        int signo = 0;
        long err;

        err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
        if (err > 0) {
                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                        return -EFAULT;
                signo = SIGCHLD;
                err = 0;
        }

        /* No siginfo buffer supplied: the return code is all there is. */
        if (!infop)
                return err;

        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each store jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;

Efault:
        user_write_access_end();
        return -EFAULT;
}

/*
 * In-kernel implementation of wait4().
 *
 * @upid selects the target: > 0 a single pid, 0 the caller's process group,
 * -1 any child, < -1 the process group -@upid.  WEXITED is always added to
 * @options.  On a positive do_wait() result the status word is copied to
 * @stat_addr when that pointer is non-NULL.
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
                  struct rusage *ru)
{
        struct pid *pid = NULL;
        enum pid_type type;
        struct wait_opts wo;
        long ret;

        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
                        __WNOTHREAD|__WCLONE|__WALL))
                return -EINVAL;

        /* -INT_MIN is not defined, so refuse it before negating below. */
        if (upid == INT_MIN)
                return -ESRCH;

        if (upid > 0) {
                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
        } else if (upid == 0) {
                type = PIDTYPE_PGID;
                pid = get_task_pid(current, PIDTYPE_PGID);
        } else if (upid == -1) {
                type = PIDTYPE_MAX;
        } else {
                /* upid < -1 */
                type = PIDTYPE_PGID;
                pid = find_get_pid(-upid);
        }

        wo.wo_type = type;
        wo.wo_pid = pid;
        wo.wo_flags = options | WEXITED;
        wo.wo_info = NULL;
        wo.wo_stat = 0;
        wo.wo_rusage = ru;

        ret = do_wait(&wo);
        put_pid(pid);

        if (ret <= 0 || !stat_addr)
                return ret;

        if (put_user(wo.wo_stat, stat_addr))
                return -EFAULT;

        return ret;
}

/*
 * Wait for the exit of the task identified by @pid (WEXITED only).
 * *@stat is written only when do_wait() returns a positive value and the
 * resulting status word is nonzero.  Returns the do_wait() result.
 */
int kernel_wait(pid_t pid, int *stat)
{
        struct wait_opts wo = {
                .wo_flags = WEXITED,
                .wo_pid = find_get_pid(pid),
                .wo_type = PIDTYPE_PID,
        };
        int ret = do_wait(&wo);

        if (ret > 0) {
                if (wo.wo_stat != 0)
                        *stat = wo.wo_stat;
        }

        /* Drop the reference taken by find_get_pid() in the initializer. */
        put_pid(wo.wo_pid);
        return ret;
}

/*
 * wait4() system call: kernel_wait4() plus an optional copy of the rusage
 * result to user space when a child was reported (positive return).
 */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
                int, options, struct rusage __user *, ru)
{
        struct rusage r;
        long err;

        err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

        /* Nothing to copy on error, on "no child", or without a buffer. */
        if (err <= 0 || !ru)
                return err;

        if (copy_to_user(ru, &r, sizeof(struct rusage)))
                return -EFAULT;

        return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
        /* Same as wait4() with no rusage buffer (ru == NULL). */
        return kernel_wait4(pid, stat_addr, options, NULL);
}

#endif

#ifdef CONFIG_COMPAT
/*
 * Compat wait4(): identical to the native entry point except that the
 * rusage result is written out with put_compat_rusage().
 */
COMPAT_SYSCALL_DEFINE4(wait4,
        compat_pid_t, pid,
        compat_uint_t __user *, stat_addr,
        int, options,
        struct compat_rusage __user *, ru)
{
        struct rusage r;
        long err;

        err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);

        /* Nothing to copy on error, on "no child", or without a buffer. */
        if (err <= 0 || !ru)
                return err;

        if (put_compat_rusage(&r, ru))
                return -EFAULT;

        return err;
}

/*
 * Compat waitid(): run kernel_waitid(), then copy the optional rusage (in
 * native or compat layout, depending on COMPAT_USE_64BIT_TIME) and the
 * compat siginfo fields to user space.  A positive kernel_waitid() result is
 * reported as 0 with si_signo set to SIGCHLD.
 */
COMPAT_SYSCALL_DEFINE5(waitid,
                int, which, compat_pid_t, pid,
                struct compat_siginfo __user *, infop, int, options,
                struct compat_rusage __user *, uru)
{
        struct waitid_info info = {.status = 0};
        struct rusage ru;
        int signo = 0;
        long err;

        err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (uru) {
                        long failed;

                        if (COMPAT_USE_64BIT_TIME)
                                failed = copy_to_user(uru, &ru, sizeof(ru));
                        else
                                failed = put_compat_rusage(&ru, uru);
                        if (failed)
                                return -EFAULT;
                }
        }

        /* No siginfo buffer supplied: the return code is all there is. */
        if (!infop)
                return err;

        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each store jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;

Efault:
        user_write_access_end();
        return -EFAULT;
}
#endif

/**
 * thread_group_exited - check that a thread group has exited
 * @pid: tgid of thread group to be checked.
 *
 * Test if the thread group represented by tgid has exited (all
 * threads are zombies, dead or completely gone).
 *
 * Return: true if the thread group has exited. false otherwise.
 */
bool thread_group_exited(struct pid *pid)
{
        struct task_struct *leader;
        bool gone = true;

        rcu_read_lock();
        leader = pid_task(pid, PIDTYPE_PID);
        /*
         * No task behind @pid counts as exited.  Otherwise the task must
         * have a nonzero exit_state and an empty thread group.
         */
        if (leader)
                gone = READ_ONCE(leader->exit_state) &&
                       thread_group_empty(leader);
        rcu_read_unlock();

        return gone;
}
EXPORT_SYMBOL(thread_group_exited);

/*
 * Weak definition of abort(): raise BUG(), and panic() if execution
 * continues past it.
 */
__weak void abort(void)
{
        BUG();

        /* if that doesn't kill us, halt */
        panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);














   14 
   14 





   14 



   14 
    2 
   14 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>

/* out-of-line parts */

#ifndef INLINE_COPY_FROM_USER
/*
 * Out-of-line copy_from_user(): copy @n bytes from user address @from to
 * kernel buffer @to.  Returns the number of bytes NOT copied (0 on full
 * success); that uncopied tail of @to is zero-filled.
 */
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        if (!should_fail_usercopy()) {
                if (likely(access_ok(from, n))) {
                        /*
                         * Ensure that bad access_ok() speculation will not
                         * lead to nasty side effects *after* the copy is
                         * finished:
                         */
                        barrier_nospec();
                        instrument_copy_from_user(to, from, n);
                        left = raw_copy_from_user(to, from, n);
                }
        }

        /* Zero whatever was not copied, including the "nothing copied" case. */
        if (unlikely(left))
                memset(to + (n - left), 0, left);

        return left;
}
EXPORT_SYMBOL(_copy_from_user);
#endif

#ifndef INLINE_COPY_TO_USER
/*
 * Out-of-line copy_to_user(): copy @n bytes from kernel buffer @from to user
 * address @to.  Returns the number of bytes NOT copied; this is @n when
 * fault injection fires or access_ok() rejects the destination.
 */
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(to, n))) {
                instrument_copy_to_user(to, from, n);
                left = raw_copy_to_user(to, from, n);
        }

        return left;
}
EXPORT_SYMBOL(_copy_to_user);
#endif

/**
 * check_zeroed_user: check if a userspace buffer only contains zero bytes
 * @from: Source address, in userspace.
 * @size: Size of buffer.
 *
 * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
 * userspace addresses (and is more efficient because we don't care where the
 * first non-zero byte is).
 *
 * Returns:
 *  * 0: There were non-zero bytes present in the buffer.
 *  * 1: The buffer was full of zero bytes.
 *  * -EFAULT: access to userspace failed.
 */
int check_zeroed_user(const void __user *from, size_t size)
{
        unsigned long val;
        /* Byte offset of @from within an unsigned long sized word. */
        uintptr_t align = (uintptr_t) from % sizeof(unsigned long);

        /* An empty buffer is reported as all-zero. */
        if (unlikely(size == 0))
                return 1;

        /*
         * Move the start back to the word boundary and grow the size to
         * match, so the buffer can be read one word at a time.
         */
        from -= align;
        size += align;

        if (!user_read_access_begin(from, size))
                return -EFAULT;

        unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        /*
         * The first word may start before the caller's buffer; presumably
         * this mask discards those leading bytes -- confirm against
         * aligned_byte_mask().
         */
        if (align)
                val &= ~aligned_byte_mask(align);

        /* Stop at the first nonzero word; otherwise advance word by word. */
        while (size > sizeof(unsigned long)) {
                if (unlikely(val))
                        goto done;

                from += sizeof(unsigned long);
                size -= sizeof(unsigned long);

                unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        }

        /*
         * A partial final word: presumably this keeps only the bytes that
         * are still inside the buffer -- confirm against aligned_byte_mask().
         */
        if (size < sizeof(unsigned long))
                val &= aligned_byte_mask(size);

done:
        user_read_access_end();
        /* 1 if no nonzero bytes were seen, 0 otherwise. */
        return (val == 0);
err_fault:
        user_read_access_end();
        return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common header file for generic dynamic events.
 */

#ifndef _TRACE_DYNEVENT_H
#define _TRACE_DYNEVENT_H

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

#include "trace.h"

struct dyn_event;

/**
 * struct dyn_event_operations - Methods for each type of dynamic events
 *
 * These methods must be set for each type, since there is no default method.
 * Before using this for dyn_event_init(), it must be registered by
 * dyn_event_register().
 *
 * @create: Parse and create event method. This is invoked when the user
 *  passes an event definition to the dynamic_events interface. This must not
 *  destruct the arguments and must return -ECANCELED if the given arguments
 *  don't match its command prefix.
 * @show: Showing method. This is invoked when the user reads the event
 *  definitions via the dynamic_events interface.
 * @is_busy: Check whether the given event is busy so that it cannot be
 *  deleted. Return true if it is busy, otherwise false.
 * @free: Delete the given event. Return 0 on success, otherwise an error.
 * @match: Check whether the given event and system name match this event.
 *  The argc and argv are used for an exact match. Return true if it matches,
 *  otherwise false.
 *
 * Except for @create, these methods are called while holding event_mutex.
 */
struct dyn_event_operations {
        struct list_head        list;
        int (*create)(int argc, const char *argv[]);
        int (*show)(struct seq_file *m, struct dyn_event *ev);
        bool (*is_busy)(struct dyn_event *ev);
        int (*free)(struct dyn_event *ev);
        bool (*match)(const char *system, const char *event,
                      int argc, const char **argv, struct dyn_event *ev);
};

/* Register new dyn_event type -- must be called at first */
int dyn_event_register(struct dyn_event_operations *ops);

/**
 * struct dyn_event - Dynamic event list header
 *
 * The dyn_event structure encapsulates a list and a pointer to the operators
 * for making a global list of dynamic events.
 * Users must include this in each event structure, so that those events can
 * be added/removed via the dynamic_events interface.
 */
struct dyn_event {
        struct list_head                list;
        struct dyn_event_operations        *ops;
};

extern struct list_head dyn_event_list;

/*
 * Initialise @ev: reset its list node and bind it to @ops.
 * Returns 0 on success, -EINVAL if either pointer is NULL.
 */
static inline
int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
{
        if (ev && ops) {
                INIT_LIST_HEAD(&ev->list);
                ev->ops = ops;
                return 0;
        }

        return -EINVAL;
}

/*
 * Append @ev to the tail of dyn_event_list.  event_mutex must be held.
 * Returns 0 on success, -EINVAL if @ev is NULL or has no ops set.
 */
static inline int dyn_event_add(struct dyn_event *ev)
{
        lockdep_assert_held(&event_mutex);

        if (ev && ev->ops) {
                list_add_tail(&ev->list, &dyn_event_list);
                return 0;
        }

        return -EINVAL;
}

/*
 * Unlink @ev from whatever list it is on and re-initialise its list node.
 * event_mutex must be held.
 */
static inline void dyn_event_remove(struct dyn_event *ev)
{
        lockdep_assert_held(&event_mutex);
        list_del_init(&ev->list);
}

void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);

/*
 * for_each_dyn_event        -        iterate over the dyn_event list
 * @pos:        the struct dyn_event * to use as a loop cursor
 *
 * This is just the base for_each macro. Wrap it for each actual event
 * structure, adding ops filtering.
 */
#define for_each_dyn_event(pos)        \
        list_for_each_entry(pos, &dyn_event_list, list)

/*
 * for_each_dyn_event_safe        -        iterate over the dyn_event list safely
 * @pos:        the struct dyn_event * to use as a loop cursor
 * @n:                the struct dyn_event * to use as temporary storage
 */
#define for_each_dyn_event_safe(pos, n)        \
        list_for_each_entry_safe(pos, n, &dyn_event_list, list)

extern void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen,
                              enum dynevent_type type,
                              dynevent_create_fn_t run_command);

typedef int (*dynevent_check_arg_fn_t)(void *data);

/*
 * One argument handed to dynevent_arg_add(): a string plus a separator
 * character (presumably emitted after the string -- confirm in the
 * implementation).
 */
struct dynevent_arg {
        const char                *str;
        char                        separator; /* e.g. ';', ',', or nothing */
};

extern void dynevent_arg_init(struct dynevent_arg *arg,
                              char separator);
extern int dynevent_arg_add(struct dynevent_cmd *cmd,
                            struct dynevent_arg *arg,
                            dynevent_check_arg_fn_t check_arg);

/*
 * A two-sided argument handed to dynevent_arg_pair_add(): left-hand and
 * right-hand strings, an operator character and a separator character
 * (presumably laid out as lhs, operator, rhs, separator -- confirm in the
 * implementation).
 */
struct dynevent_arg_pair {
        const char                *lhs;
        const char                *rhs;
        char                        operator; /* e.g. '=' or nothing */
        char                        separator; /* e.g. ';', ',', or nothing */
};

extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
                                   char operator, char separator);

extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd,
                                 struct dynevent_arg_pair *arg_pair,
                                 dynevent_check_arg_fn_t check_arg);
extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str);

#endif












































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMENS_H
#define _LINUX_TIMENS_H


#include <linux/sched.h>
#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

/*
 * Offsets of a time namespace; timens_add_monotonic() and
 * timens_add_boottime() add the respective member to a timestamp.
 */
struct timens_offsets {
        struct timespec64 monotonic;
        struct timespec64 boottime;
};

/*
 * A time namespace.  Reference counted through @kref (see get_time_ns() and
 * put_time_ns()); @offsets holds the clock offsets of this namespace.
 */
struct time_namespace {
        struct kref                kref;
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        struct ns_common        ns;
        struct timens_offsets        offsets;
        struct page                *vvar_page;
        /* If set prevents changing offsets after any task joined namespace. */
        bool                        frozen_offsets;
} __randomize_layout;

extern struct time_namespace init_time_ns;

#ifdef CONFIG_TIME_NS
extern int vdso_join_timens(struct task_struct *task,
                            struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);

/* Take a reference on @ns and return it. */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        kref_get(&ns->kref);
        return ns;
}

struct time_namespace *copy_time_ns(unsigned long flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns);
void free_time_ns(struct kref *kref);
int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
struct vdso_data *arch_get_vdso_data(void *vvar_page);

/* Drop a reference on @ns; free_time_ns() is the kref release callback. */
static inline void put_time_ns(struct time_namespace *ns)
{
        kref_put(&ns->kref, free_time_ns);
}

void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m);

/* One (clock id, offset value) entry as passed to proc_timens_set_offset(). */
struct proc_timens_offset {
        int                        clockid;
        struct timespec64        val;
};

int proc_timens_set_offset(struct file *file, struct task_struct *p,
                           struct proc_timens_offset *offsets, int n);

static inline void timens_add_monotonic(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->monotonic);
}

static inline void timens_add_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->boottime);
}

ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
                                struct timens_offsets *offsets);

/*
 * Convert @tim for @clockid via do_timens_ktime_to_host() using the current
 * task's namespace offsets.  Tasks in init_time_ns get @tim back unchanged.
 */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        struct time_namespace *ns = current->nsproxy->time_ns;

        if (unlikely(ns != &init_time_ns))
                return do_timens_ktime_to_host(clockid, tim, &ns->offsets);

        return tim;
}

#else
/* Stub for !CONFIG_TIME_NS: does nothing and reports success. */
static inline int vdso_join_timens(struct task_struct *task,
                                   struct time_namespace *ns)
{
        return 0;
}

/* Stub for !CONFIG_TIME_NS: no-op. */
static inline void timens_commit(struct task_struct *tsk,
                                 struct time_namespace *ns)
{
}

/* Stub for !CONFIG_TIME_NS: takes no reference and always returns NULL. */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        return NULL;
}

/* Stub for !CONFIG_TIME_NS: no-op. */
static inline void put_time_ns(struct time_namespace *ns)
{
}

/*
 * Stub for !CONFIG_TIME_NS: a request for a new time namespace
 * (CLONE_NEWTIME) fails with -EINVAL; otherwise @old_ns is returned as is.
 */
static inline
struct time_namespace *copy_time_ns(unsigned long flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns)
{
        if (!(flags & CLONE_NEWTIME))
                return old_ns;

        return ERR_PTR(-EINVAL);
}

/* Stub for !CONFIG_TIME_NS: does nothing and reports success. */
static inline int timens_on_fork(struct nsproxy *nsproxy,
                                 struct task_struct *tsk)
{
        return 0;
}

/* Stubs for !CONFIG_TIME_NS: *@ts is left untouched. */
static inline void timens_add_monotonic(struct timespec64 *ts) { }
static inline void timens_add_boottime(struct timespec64 *ts) { }
/* Stub for !CONFIG_TIME_NS: @tim is returned unchanged. */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        return tim;
}
#endif

#endif /* _LINUX_TIMENS_H */





















































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wireless configuration interface internals.
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright (C) 2018-2020 Intel Corporation
 */
#ifndef __NET_WIRELESS_CORE_H
#define __NET_WIRELESS_CORE_H
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rbtree.h>
#include <linux/debugfs.h>
#include <linux/rfkill.h>
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
#include "reg.h"


/*
 * Marker value for an invalid wiphy index (per its name).  The negative
 * constant is parenthesized so the macro expands as a single operand in
 * any surrounding expression.
 */
#define WIPHY_IDX_INVALID        (-1)

/*
 * cfg80211-internal state kept for one registered wireless device.
 *
 * The struct wiphy is embedded as the final member; wiphy_to_rdev()
 * recovers this structure from a wiphy pointer with container_of().
 * Member order matters for that last field, so do not append after it.
 */
struct cfg80211_registered_device {
        const struct cfg80211_ops *ops;
        struct list_head list;

        /* rfkill support */
        struct rfkill_ops rfkill_ops;
        struct rfkill *rfkill;
        struct work_struct rfkill_block;

        /* ISO / IEC 3166 alpha2 for which this device is receiving
         * country IEs on, this can help disregard country IEs from APs
         * on the same alpha2 quickly. The alpha2 may differ from
         * cfg80211_regdomain's alpha2 when an intersection has occurred.
         * If the AP is reconfigured this can also be used to tell us if
         * the country on the country IE changed. */
        char country_ie_alpha2[2];

        /*
         * the driver requests the regulatory core to set this regulatory
         * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED
         * devices using the regulatory_set_wiphy_regd() API
         */
        const struct ieee80211_regdomain *requested_regd;

        /* If a Country IE has been received this tells us the environment
         * which it's telling us it's in. This defaults to ENVIRON_ANY */
        enum environment_cap env;

        /* wiphy index, internal only */
        int wiphy_idx;

        /* protected by RTNL */
        int devlist_generation, wdev_id;
        int opencount;
        wait_queue_head_t dev_wait;

        struct list_head beacon_registrations;
        spinlock_t beacon_registrations_lock;

        /* protected by RTNL only */
        int num_running_ifaces;
        int num_running_monitor_ifaces;
        /* advanced by cfg80211_assign_cookie() */
        u64 cookie_counter;

        /* BSSes/scanning */
        spinlock_t bss_lock;
        struct list_head bss_list;
        struct rb_root bss_tree;
        u32 bss_generation;
        u32 bss_entries;
        struct cfg80211_scan_request *scan_req; /* protected by RTNL */
        struct cfg80211_scan_request *int_scan_req;
        struct sk_buff *scan_msg;
        struct list_head sched_scan_req_list;
        time64_t suspend_at;
        struct work_struct scan_done_wk;

        struct genl_info *cur_cmd_info;

        struct work_struct conn_work;
        struct work_struct event_work;

        struct delayed_work dfs_update_channels_wk;

        /* netlink port which started critical protocol (0 means not started) */
        u32 crit_proto_nlportid;

        struct cfg80211_coalesce *coalesce;

        struct work_struct destroy_work;
        struct work_struct sched_scan_stop_wk;
        struct work_struct sched_scan_res_wk;

        struct cfg80211_chan_def radar_chandef;
        struct work_struct propagate_radar_detect_wk;

        struct cfg80211_chan_def cac_done_chandef;
        struct work_struct propagate_cac_done_wk;

        struct work_struct mgmt_registrations_update_wk;
        /* lock for all wdev lists */
        spinlock_t mgmt_registrations_lock;

        /* must be last because of the way we do wiphy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN */
        struct wiphy wiphy __aligned(NETDEV_ALIGN);
};

/*
 * Map a struct wiphy pointer back to the cfg80211_registered_device that
 * embeds it as its 'wiphy' member.  A NULL @wiphy trips BUG_ON() before
 * any pointer arithmetic is done.
 */
static inline
struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev;

        BUG_ON(!wiphy);

        rdev = container_of(wiphy, struct cfg80211_registered_device, wiphy);
        return rdev;
}

/*
 * Release everything hanging off rdev->wiphy.wowlan_config: each pattern
 * mask, the pattern array, the TCP socket (if one is set) and TCP config,
 * the nd_config, and finally the config object itself.
 *
 * The wowlan_config pointer in @rdev is not cleared here.  The whole body
 * compiles away when CONFIG_PM is not defined.
 */
static inline void
cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
{
#ifdef CONFIG_PM
        struct cfg80211_wowlan *wowlan = rdev->wiphy.wowlan_config;
        int idx;

        /* nothing configured, nothing to free */
        if (!wowlan)
                return;

        for (idx = 0; idx < wowlan->n_patterns; idx++)
                kfree(wowlan->patterns[idx].mask);
        kfree(wowlan->patterns);

        /* the socket must be released before the tcp struct that holds it */
        if (wowlan->tcp && wowlan->tcp->sock)
                sock_release(wowlan->tcp->sock);
        kfree(wowlan->tcp);

        kfree(wowlan->nd_config);
        kfree(wowlan);
#endif
}

/*
 * Hand out the next cookie for @rdev by advancing rdev->cookie_counter.
 *
 * If the increment produces 0 (the counter wrapped), a warning is raised
 * and the counter is advanced once more, so 0 is never returned.
 */
static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
{
        u64 cookie;

        rdev->cookie_counter++;
        cookie = rdev->cookie_counter;

        if (WARN_ON(cookie == 0)) {
                rdev->cookie_counter++;
                cookie = rdev->cookie_counter;
        }

        return cookie;
}

/*
 * Globals shared across the cfg80211 sources; only declared here, the
 * definitions are not part of this header.
 */
extern struct workqueue_struct *cfg80211_wq;
extern struct list_head cfg80211_rdev_list;
extern int cfg80211_rdev_list_generation;

/*
 * Internal bookkeeping wrapped around a struct cfg80211_bss.
 *
 * The cfg80211_bss is embedded as the final member ('pub');
 * bss_from_pub() converts a pointer to it back to this structure.
 */
struct cfg80211_internal_bss {
        struct list_head list;
        struct list_head hidden_list;
        struct rb_node rbn;
        u64 ts_boottime;
        unsigned long ts;
        unsigned long refcount;
        /* incremented/decremented by cfg80211_hold_bss()/cfg80211_unhold_bss() */
        atomic_t hold;

        /* time at the start of the reception of the first octet of the
         * timestamp field of the last beacon/probe received for this BSS.
         * The time is the TSF of the BSS specified by %parent_bssid.
         */
        u64 parent_tsf;

        /* the BSS according to which %parent_tsf is set. This is set to
         * the BSS that the interface that requested the scan was connected to
         * when the beacon/probe was received.
         */
        u8 parent_bssid[ETH_ALEN] __aligned(2);

        /* must be last because of priv member */
        struct cfg80211_bss pub;
};

/*
 * Convert a pointer to the embedded 'pub' member back into the enclosing
 * struct cfg80211_internal_bss.  @pub is not checked for NULL.
 */
static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub)
{
        struct cfg80211_internal_bss *ibss;

        ibss = container_of(pub, struct cfg80211_internal_bss, pub);
        return ibss;
}

/*
 * Bump the 'hold' counter of @bss.  When @bss points at a transmitted BSS
 * (pub.transmitted_bss is set), that entry's 'hold' counter is bumped too.
 */
static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;

        atomic_inc(&bss->hold);

        if (!bss->pub.transmitted_bss)
                return;

        /* also pin the internal entry that wraps the transmitted BSS */
        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        atomic_inc(&tx_bss->hold);
}

/*
 * Drop one 'hold' count from @bss and, when pub.transmitted_bss is set,
 * from the entry wrapping that transmitted BSS as well.  A warning is
 * raised if either counter goes negative.
 */
static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;
        int remaining;

        remaining = atomic_dec_return(&bss->hold);
        WARN_ON(remaining < 0);

        if (!bss->pub.transmitted_bss)
                return;

        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        remaining = atomic_dec_return(&tx_bss->hold);
        WARN_ON(remaining < 0);
}


/*
 * Device lookup, netns switching and wdev setup entry points.
 * Prototypes only; the definitions are not part of this header.
 */
struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx);
int get_wiphy_idx(struct wiphy *wiphy);

struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);

int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
                          struct net *net);

void cfg80211_init_wdev(struct wireless_dev *wdev);
void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

/*
 * Take @wdev's mutex (wdev->mtx).  The runtime locking is done by
 * mutex_lock(); __acquires()/__acquire() are static-analysis annotations
 * (presumably for sparse -- confirm).
 */
static inline void wdev_lock(struct wireless_dev *wdev)
        __acquires(wdev)
{
        mutex_lock(&wdev->mtx);
        __acquire(wdev->mtx);
}

/*
 * Release @wdev's mutex (wdev->mtx); counterpart of wdev_lock().  The
 * __releases()/__release() markers are static-analysis annotations
 * (presumably for sparse -- confirm).
 */
static inline void wdev_unlock(struct wireless_dev *wdev)
        __releases(wdev)
{
        __release(wdev->mtx);
        mutex_unlock(&wdev->mtx);
}

/* Assert, through lockdep_assert_held(), that the caller holds (wdev)->mtx. */
#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)

/*
 * Report whether every running interface on @rdev is a monitor interface.
 *
 * Asserts that RTNL is held.
 *
 * Return: true when at least one interface is running and the running
 * count equals the running-monitor count; false otherwise.
 */
static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev)
{
        ASSERT_RTNL();

        /* no running interface at all does not count as "monitors only" */
        if (rdev->num_running_ifaces <= 0)
                return false;

        return rdev->num_running_monitor_ifaces == rdev->num_running_ifaces;
}

/* Discriminator stored in the 'type' field of struct cfg80211_event. */
enum cfg80211_event_type {
        EVENT_CONNECT_RESULT,
        EVENT_ROAMED,
        EVENT_DISCONNECTED,
        EVENT_IBSS_JOINED,
        EVENT_STOPPED,
        EVENT_PORT_AUTHORIZED,
};

/*
 * One queued event: list linkage, the event type, and a payload union.
 *
 * NOTE(review): the union member names suggest cr = connect result,
 * rm = roamed, dc = disconnected, ij = IBSS joined, pa = port authorized;
 * confirm the type-to-member mapping against the code that fills it in.
 */
struct cfg80211_event {
        struct list_head list;
        enum cfg80211_event_type type;

        union {
                struct cfg80211_connect_resp_params cr;
                struct cfg80211_roam_info rm;
                struct {
                        const u8 *ie;
                        size_t ie_len;
                        u16 reason;
                        bool locally_generated;
                } dc;
                struct {
                        u8 bssid[ETH_ALEN];
                        struct ieee80211_channel *channel;
                } ij;
                struct {
                        u8 bssid[ETH_ALEN];
                } pa;
        };
};

/*
 * Cached key set with CFG80211_MAX_WEP_KEYS slots: per-slot key
 * parameters plus a data buffer of WLAN_KEY_LEN_WEP104 bytes each.
 */
struct cfg80211_cached_keys {
        struct key_params params[CFG80211_MAX_WEP_KEYS];
        u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104];
        /* presumably the index of the default key -- confirm against users */
        int def;
};

/* Channel usage mode, reported through cfg80211_get_chan_state(). */
enum cfg80211_chan_mode {
        CHAN_MODE_UNDEFINED,
        CHAN_MODE_SHARED,
        CHAN_MODE_EXCLUSIVE,
};

/*
 * One beacon registration entry: list linkage plus a netlink port ID.
 * Presumably linked on cfg80211_registered_device.beacon_registrations --
 * confirm against the code that adds entries.
 */
struct cfg80211_beacon_registration {
        struct list_head list;
        u32 nlportid;
};

/*
 * CQM RSSI configuration.  rssi_thresholds is a flexible array member,
 * so the structure is allocated with trailing storage; n_rssi_thresholds
 * is presumably its element count -- confirm against the allocator.
 */
struct cfg80211_cqm_config {
        u32 rssi_hyst;
        s32 last_rssi_event_value;
        int n_rssi_thresholds;
        s32 rssi_thresholds[];
};

/*
 * Device lifetime, naming and BSS-table maintenance entry points.
 * Prototypes only; the definitions are not part of this header.
 */
void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev);

/* free object */
void cfg80211_dev_free(struct cfg80211_registered_device *rdev);

int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
                        char *newname);

void ieee80211_set_bitrate_flags(struct wiphy *wiphy);

/* BSS table maintenance */
void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                      unsigned long age_secs);
void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
                                     struct ieee80211_channel *channel);

/*
 * IBSS
 *
 * Join/leave/notification entry points (prototypes only).
 * NOTE(review): the double-underscore variants presumably expect the
 * caller to already hold the relevant lock -- confirm at the definitions.
 */
int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct cfg80211_ibss_params *params,
                         struct cfg80211_cached_keys *connkeys);
void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
                          struct net_device *dev, bool nowext);
int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, bool nowext);
void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                            struct ieee80211_channel *channel);
int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

/*
 * mesh
 *
 * Default mesh config/setup objects (defined elsewhere) and the
 * join/leave/channel entry points (prototypes only).
 */
extern const struct mesh_config default_mesh_config;
extern const struct mesh_setup default_mesh_setup;
int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct mesh_setup *setup,
                         const struct mesh_config *conf);
int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
                          struct net_device *dev);
int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev,
                              struct cfg80211_chan_def *chandef);

/*
 * OCB
 *
 * Join/leave entry points (prototypes only), each in a plain and a
 * double-underscore variant.
 */
int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
                        struct net_device *dev,
                        struct ocb_setup *setup);
int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct ocb_setup *setup);
int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
                         struct net_device *dev);
int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
                       struct net_device *dev);

/*
 * AP
 *
 * Stop-AP entry points (prototypes only), in a plain and a
 * double-underscore variant.
 */
int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
                       struct net_device *dev, bool notify);
int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, bool notify);

/*
 * MLME
 *
 * Authentication/association/deauth/disassoc requests, management-frame
 * registration and TX, and HT/VHT capability masking helpers.
 * Prototypes only; the definitions are not part of this header.
 */
int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
                       struct net_device *dev,
                       struct ieee80211_channel *chan,
                       enum nl80211_auth_type auth_type,
                       const u8 *bssid,
                       const u8 *ssid, int ssid_len,
                       const u8 *ie, int ie_len,
                       const u8 *key, int key_len, int key_idx,
                       const u8 *auth_data, int auth_data_len);
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
                        struct net_device *dev,
                        struct ieee80211_channel *chan,
                        const u8 *bssid,
                        const u8 *ssid, int ssid_len,
                        struct cfg80211_assoc_request *req);
int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *bssid,
                         const u8 *ie, int ie_len, u16 reason,
                         bool local_state_change);
int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *dev, const u8 *bssid,
                           const u8 *ie, int ie_len, u16 reason,
                           bool local_state_change);
void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
/* management-frame registration and transmission */
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
                                u16 frame_type, const u8 *match_data,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack);
void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk);
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
                          struct wireless_dev *wdev,
                          struct cfg80211_mgmt_tx_params *params,
                          u64 *cookie);
/* HT/VHT capability helpers taking a capability and a mask */
void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
                               const struct ieee80211_ht_cap *ht_capa_mask);
void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
                                const struct ieee80211_vht_cap *vht_capa_mask);

/*
 * SME events
 *
 * Connect/disconnect requests and the handlers for connect-result,
 * disconnected, roamed and port-authorized events (prototypes only).
 */
int cfg80211_connect(struct cfg80211_registered_device *rdev,
                     struct net_device *dev,
                     struct cfg80211_connect_params *connect,
                     struct cfg80211_cached_keys *connkeys,
                     const u8 *prev_bssid);
void __cfg80211_connect_result(struct net_device *dev,
                               struct cfg80211_connect_resp_params *params,
                               bool wextev);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
                             size_t ie_len, u16 reason, bool from_ap);
int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u16 reason,
                        bool wextev);
void __cfg80211_roamed(struct wireless_dev *wdev,
                       struct cfg80211_roam_info *info);
void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid);
int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);
void cfg80211_autodisconnect_wk(struct work_struct *work);

/*
 * SME implementation
 *
 * Work handler and per-wdev notification hooks (prototypes only).
 */
void cfg80211_conn_work(struct work_struct *work);
void cfg80211_sme_scan_done(struct net_device *dev);
bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status);
void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len);
void cfg80211_sme_disassoc(struct wireless_dev *wdev);
void cfg80211_sme_deauth(struct wireless_dev *wdev);
void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);

/*
 * internal helpers
 *
 * Key validation, scan and scheduled-scan management, interface type
 * changes and event processing.  Prototypes only; the definitions are
 * not part of this header.
 */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
                            int key_idx, bool pairwise);
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
                                   struct key_params *params, int key_idx,
                                   bool pairwise, const u8 *mac_addr);
/* scan and scheduled-scan management */
void __cfg80211_scan_done(struct work_struct *wk);
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
                           bool send_message);
void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req);
int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev,
                                     bool want_multi);
void cfg80211_sched_scan_results_wk(struct work_struct *work);
int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req,
                                 bool driver_initiated);
int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
                               u64 reqid, bool driver_initiated);
void cfg80211_upload_connect_keys(struct wireless_dev *wdev);
int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
                          struct net_device *dev, enum nl80211_iftype ntype,
                          struct vif_params *params);
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
void cfg80211_process_wdev_events(struct wireless_dev *wdev);

bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
                                u32 center_freq_khz, u32 bw_khz);

int cfg80211_scan(struct cfg80211_registered_device *rdev);

/* work item defined elsewhere */
extern struct work_struct cfg80211_disconnect_work;

/* DFS and channel-usage helpers (prototypes only). */

/**
 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 *
 * Checks if chandef is usable and we can/need start CAC on such channel.
 *
 * Return: true if all channels available and at least
 *           one channel requires CAC (NL80211_DFS_USABLE)
 */
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
                                 const struct cfg80211_chan_def *chandef);

void cfg80211_set_dfs_state(struct wiphy *wiphy,
                            const struct cfg80211_chan_def *chandef,
                            enum nl80211_dfs_state dfs_state);

void cfg80211_dfs_channels_update_work(struct work_struct *work);

unsigned int
cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef);

void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev);

bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
                                  struct ieee80211_channel *chan);

bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev);

bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
                          struct ieee80211_channel *chan);

/*
 * elapsed_jiffies_msecs - milliseconds elapsed since a jiffies timestamp
 * @start: earlier value of jiffies
 *
 * jiffies is sampled once.  The difference is computed in unsigned long
 * arithmetic, which wraps modulo ULONG_MAX + 1, so the result is also
 * correct when jiffies has wrapped past @start; no separate wraparound
 * branch is needed.
 *
 * Return: the elapsed time converted by jiffies_to_msecs().
 */
static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
{
        unsigned long end = jiffies;

        return jiffies_to_msecs(end - start);
}

/*
 * Channel state, rate mask, beacon interval, interface accounting and
 * leave/stop entry points.  Prototypes only; the definitions are not
 * part of this header.
 */
void
cfg80211_get_chan_state(struct wireless_dev *wdev,
                        struct ieee80211_channel **chan,
                        enum cfg80211_chan_mode *chanmode,
                        u8 *radar_detect);

int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_chan_def *chandef);

int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
                           const u8 *rates, unsigned int n_rates,
                           u32 *mask);

int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
                                 enum nl80211_iftype iftype, u32 beacon_int);

void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
                               enum nl80211_iftype iftype, int num);

void __cfg80211_leave(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev);
void cfg80211_leave(struct cfg80211_registered_device *rdev,
                    struct wireless_dev *wdev);

void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);

void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev);

struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
                    struct cfg80211_internal_bss *tmp,
                    bool signal_valid, unsigned long ts);
/*
 * CFG80211_DEV_WARN_ON(cond) - developer-only warning.
 *
 * With CONFIG_CFG80211_DEVELOPER_WARNINGS it is plain WARN_ON(cond).
 * Without it, @cond is still evaluated exactly once and its truth value
 * is the value of the expression, but no warning is emitted.
 */
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
#define CFG80211_DEV_WARN_ON(cond)        WARN_ON(cond)
#else
/*
 * A statement expression is used so the macro can serve as a
 * condition, and so that no warning is given when its value
 * is not used that way.
 */
#define CFG80211_DEV_WARN_ON(cond)        ({bool __r = (cond); __r; })
#endif

/* CQM config and PMSR teardown entry points (prototypes only). */
void cfg80211_cqm_config_free(struct wireless_dev *wdev);

void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid);
void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev);
void cfg80211_pmsr_free_wk(struct work_struct *work);

#endif /* __NET_WIRELESS_CORE_H */





















































































































































































































































































































































































































































































































    2 
    1 

    1 























































































































































































































































































































































































    3 





































































































































































































































































































































































































































































































































































































































































































































































































    3 



























































































































































    3 














    3 
















































































































































































    3 

    3 

    3 











    2 









































































































































































































































    2 












    2 

































    3 




    3 



















































    3 




    1 






















    1 



























































































































































































































































































    2 





















































































































































































































































































































































































































































































    3 


















    3 




    1 
































    3 








    1 



















    2 




    3 




























    1 

    1 

    1 
    1 







    1 


































































    1 














    3 









































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
// SPDX-License-Identifier: GPL-2.0
/*
 *  ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * With AGGRESSIVE_CHECK the allocator runs consistency checks over
 * structures. These checks slow things down a lot.
 *
 * Note: the macro defined here carries a trailing "__", so
 * AGGRESSIVE_CHECK itself is left undefined by this header.
 */
#define AGGRESSIVE_CHECK__

/*
 * With DOUBLE_CHECK defined mballoc creates persistent in-core
 * bitmaps, maintains and uses them to check for double allocations.
 *
 * Note: as above, only DOUBLE_CHECK__ is defined here, not DOUBLE_CHECK.
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages.
 * It is explicitly undefined here, which selects the no_printk() variant
 * of ext4_debug().
 */
#undef EXT4FS_DEBUG

/*
 * Debug code
 *
 * ext4_debug(fmt, ...) takes a printf-style format and its arguments.
 * With EXT4FS_DEBUG defined it emits two KERN_DEBUG printk() calls: first a
 * "file, line, function" prefix, then the caller's message.  Without
 * EXT4FS_DEBUG the arguments are handed to no_printk() instead.
 */
#ifdef EXT4FS_DEBUG
#define ext4_debug(fmt, ...)                                                \
        do {                                                                \
                printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",        \
                        __FILE__, __LINE__, __func__);                        \
                printk(KERN_DEBUG fmt, ##__VA_ARGS__);                        \
        } while (0)
#else
#define ext4_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

 /*
  * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
  *
  * Note: only EXT_DEBUG__ (with a trailing "__") is defined here, so
  * EXT_DEBUG itself is left undefined by this header.
  */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 *
 * @ino: pointer to the inode the message is about; its i_sb->s_id and
 *       i_ino are printed in the message prefix.
 * @fmt: printf-style format string, followed by its arguments.
 *
 * With CONFIG_EXT4_DEBUG the message goes through pr_debug(), prefixed with
 * the current task's comm and pid, the s_id and inode number taken from
 * @ino, and the file, line and function of the call site.  Without
 * CONFIG_EXT4_DEBUG the macro expands to no_printk() and @ino does not
 * appear in the expansion at all.
 *
 * @ino is parenthesized in the expansion so that any pointer-valued
 * expression (not just a plain identifier) can be passed safely.
 */
#ifdef CONFIG_EXT4_DEBUG
#define ext_debug(ino, fmt, ...)                                        \
        pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt,        \
                 current->comm, task_pid_nr(current),                        \
                 (ino)->i_sb->s_id, (ino)->i_ino, __FILE__, __LINE__,        \
                 __func__, ##__VA_ARGS__)
#else
#define ext_debug(ino, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Data type for a block offset within a block group.
 * Note that this is a signed type (plain int).
 */
typedef int ext4_grpblk_t;

/* Data type for a filesystem-wide block number (unsigned long long). */
typedef unsigned long long ext4_fsblk_t;

/* Data type for a file logical block number (unsigned 32-bit). */
typedef __u32 ext4_lblk_t;

/* Data type for a block group number. */
typedef unsigned int ext4_group_t;

/*
 * Direction selector for a shift operation.
 * NOTE(review): the users of this enum are outside this part of the file;
 * confirm what is being shifted before relying on this description.
 */
enum SHIFT_DIRECTION {
        SHIFT_LEFT  = 0,
        SHIFT_RIGHT = 1,
};

/*
 * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the tracepoint interface.
 *
 * Each flag is a distinct single bit, so they can be OR-ed together.
 */

/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE                0x0001
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED                0x0002
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA                0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST                0x0008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST                0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA                0x0020
/* don't preallocate (for tails) */
#define EXT4_MB_HINT_NOPREALLOC                0x0040
/* allocate for locality group */
#define EXT4_MB_HINT_GROUP_ALLOC        0x0080
/* allocate goal blocks or none */
#define EXT4_MB_HINT_GOAL_ONLY                0x0100
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL                0x0200
/* blocks already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED        0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC                0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS                0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED                0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK                0x4000

/*
 * Description of one block allocation request: which inode and logical
 * block the allocation is for, how many blocks are wanted, a physical goal
 * hint, and the nearest already-allocated neighbours on either side (both
 * their logical and physical block numbers).
 */
struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
        /* how many blocks we want to allocate */
        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
        /* phys. target (a hint) */
        ext4_fsblk_t goal;
        /* phys. block for the closest logical allocated block to the left */
        ext4_fsblk_t pleft;
        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
};

/*
 * Logical to physical block mapping, used by ext4_map_blocks()
 *
 * This structure is used to pass requests into ext4_map_blocks() as
 * well as to store the information returned by ext4_map_blocks().  It
 * takes less room on the stack than a struct buffer_head.
 *
 * The EXT4_MAP_* flags reuse the bit positions of the corresponding
 * buffer_head state bits (BH_New, BH_Mapped, BH_Unwritten, BH_Boundary);
 * EXT4_MAP_FLAGS is the union of all four.
 */
#define EXT4_MAP_NEW                BIT(BH_New)
#define EXT4_MAP_MAPPED                BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN        BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY        BIT(BH_Boundary)
#define EXT4_MAP_FLAGS                (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)

struct ext4_map_blocks {
        /* physical (filesystem-wide) block number */
        ext4_fsblk_t m_pblk;
        /* logical block number within the file */
        ext4_lblk_t m_lblk;
        /* NOTE(review): presumably the length of the mapping in blocks */
        unsigned int m_len;
        /* NOTE(review): presumably a combination of EXT4_MAP_* bits */
        unsigned int m_flags;
};

/*
 * Block validity checking, system zone rbtree.
 *
 * Pairs the root of a red-black tree with an rcu_head.
 * NOTE(review): the rcu_head suggests the structure is released through
 * RCU; the code doing so is not in this part of the file -- confirm.
 */
struct ext4_system_blocks {
        struct rb_root root;
        struct rcu_head rcu;
};

/*
 * Flags for ext4_io_end->flag
 */
#define        EXT4_IO_END_UNWRITTEN        0x0001

/*
 * One extent of a file, described by its byte offset and size.
 * NOTE(review): presumably linked on ext4_io_end.list_vec, which is
 * documented as a "list of ext4_io_end_vec" -- confirm.
 */
struct ext4_io_end_vec {
        struct list_head list;                /* list of io_end_vec */
        loff_t offset;                        /* offset in the file */
        ssize_t size;                        /* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
 *
 * The object is reference counted through 'count' and carries a list of
 * ext4_io_end_vec entries in 'list_vec'.  Note that the flags field is
 * named 'flag' (singular).
 */
typedef struct ext4_io_end {
        struct list_head        list;                /* per-file finished IO list */
        handle_t                *handle;        /* handle reserved for extent
                                                 * conversion */
        struct inode                *inode;                /* file being written to */
        struct bio                *bio;                /* Linked list of completed
                                                 * bios covering the extent */
        unsigned int                flag;                /* unwritten or not
                                                 * (EXT4_IO_END_* bits) */
        atomic_t                count;                /* reference counter */
        struct list_head        list_vec;        /* list of ext4_io_end_vec */
} ext4_io_end_t;

/*
 * State bundled together for submitting I/O: a writeback_control, a bio,
 * an ext4_io_end and a sector number.
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only; the code using them is outside this part of the
 * file -- confirm.
 */
struct ext4_io_submit {
        struct writeback_control *io_wbc;        /* presumably the writeback request being served */
        struct bio                *io_bio;        /* presumably the bio currently being built */
        ext4_io_end_t                *io_end;        /* presumably the io_end attached to the I/O */
        sector_t                io_next_block;        /* presumably the next block expected in io_bio */
};

/*
 * Special inode numbers
 *
 * Inode numbers 1 through 8 have the fixed meanings listed below.
 */
#define        EXT4_BAD_INO                 1        /* Bad blocks inode */
#define EXT4_ROOT_INO                 2        /* Root inode */
#define EXT4_USR_QUOTA_INO         3        /* User quota inode */
#define EXT4_GRP_QUOTA_INO         4        /* Group quota inode */
#define EXT4_BOOT_LOADER_INO         5        /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO         6        /* Undelete directory inode */
#define EXT4_RESIZE_INO                 7        /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO         8        /* Journal inode */

/* First non-reserved inode for old ext4 filesystems */
#define EXT4_GOOD_OLD_FIRST_INO        11

/*
 * Maximal count of links to a file
 */
#define EXT4_LINK_MAX                65000

/*
 * Macro-instructions used to manage several block sizes
 */
#define EXT4_MIN_BLOCK_SIZE                1024
#define        EXT4_MAX_BLOCK_SIZE                65536
#define EXT4_MIN_BLOCK_LOG_SIZE                10
#define EXT4_MAX_BLOCK_LOG_SIZE                16
#define EXT4_MAX_CLUSTER_LOG_SIZE        30
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE(s)                ((s)->s_blocksize)
#else
# define EXT4_BLOCK_SIZE(s)                (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
#define        EXT4_ADDR_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
#define EXT4_CLUSTER_SIZE(s)                (EXT4_BLOCK_SIZE(s) << \
                                         EXT4_SB(s)->s_cluster_bits)
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_blocksize_bits)
# define EXT4_CLUSTER_BITS(s)                (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_log_block_size + 10)
#endif
#ifdef __KERNEL__
#define        EXT4_ADDR_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_addr_per_block_bits)
#define EXT4_INODE_SIZE(s)                (EXT4_SB(s)->s_inode_size)
#define EXT4_FIRST_INO(s)                (EXT4_SB(s)->s_first_ino)
#else
#define EXT4_INODE_SIZE(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_INODE_SIZE : \
                                 (s)->s_inode_size)
#define EXT4_FIRST_INO(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_FIRST_INO : \
                                 (s)->s_first_ino)
#endif
/*
 * EXT4_BLOCK_ALIGN(size, blkbits): hand @size and the block size
 * (1 << @blkbits) to ALIGN().
 *
 * EXT4_MAX_BLOCKS(size, offset, blkbits): block index just past the end of
 * the byte range starting at @offset and @size bytes long (end aligned with
 * EXT4_BLOCK_ALIGN()), minus the block index of @offset.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * containing low-precedence operators (e.g. "a | b") are grouped as the
 * caller wrote them.  @offset and @blkbits are still evaluated more than
 * once, so they must not have side effects.
 */
#define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
        ((EXT4_BLOCK_ALIGN((size) + (offset), (blkbits)) >> (blkbits)) - \
         ((offset) >> (blkbits)))

/*
 * Block <-> cluster helpers.  @sbi must point to an object providing
 * s_cluster_bits and s_cluster_ratio.
 * NOTE(review): the masks below only make sense if s_cluster_ratio is a
 * power of two equal to 1 << s_cluster_bits -- confirm where it is set.
 */

/* Block number -> number of the cluster it falls in */
#define EXT4_B2C(sbi, blk)        ((blk) >> (sbi)->s_cluster_bits)

/* Cluster number -> block number */
#define EXT4_C2B(sbi, cluster)        ((cluster) << (sbi)->s_cluster_bits)

/* Count of blocks -> count of clusters (adds ratio - 1 before shifting) */
#define EXT4_NUM_B2C(sbi, blks) \
        (((blks) + (sbi)->s_cluster_ratio - 1) >> (sbi)->s_cluster_bits)

/* Clear the low bits: starting block of the cluster */
#define EXT4_PBLK_CMASK(sbi, pblk) \
        ((pblk) & ~((ext4_fsblk_t) (sbi)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(sbi, lblk) \
        ((lblk) & ~((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/* Set the low bits: last block of the cluster */
#define EXT4_LBLK_CFILL(sbi, lblk) \
        ((lblk) | ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/* Keep only the low bits: offset of the block inside its cluster */
#define EXT4_PBLK_COFF(sbi, pblk) \
        ((pblk) & ((ext4_fsblk_t) (sbi)->s_cluster_ratio - 1))
#define EXT4_LBLK_COFF(sbi, lblk) \
        ((lblk) & ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/*
 * Structure of a blocks group descriptor
 *
 * All fields except bg_reserved are declared with explicit little-endian
 * types.  Most values are split into a "_lo" field in the first half and a
 * matching "_hi" field in the second half.  The fields up to and including
 * bg_checksum add up to 32 bytes and the whole structure to 64 bytes,
 * matching EXT4_MIN_DESC_SIZE and EXT4_MIN_DESC_SIZE_64BIT.
 *
 * NOTE(review): the "LE"/"BE" tags on the bitmap checksum comments
 * presumably mean the low/high 16 bits of the crc32c rather than a byte
 * order, since both halves are declared __le16 -- confirm.
 */
struct ext4_group_desc
{
        __le32        bg_block_bitmap_lo;        /* Blocks bitmap block */
        __le32        bg_inode_bitmap_lo;        /* Inodes bitmap block */
        __le32        bg_inode_table_lo;        /* Inodes table block */
        __le16        bg_free_blocks_count_lo;/* Free blocks count */
        __le16        bg_free_inodes_count_lo;/* Free inodes count */
        __le16        bg_used_dirs_count_lo;        /* Directories count */
        __le16        bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
        __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
        __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;        /* Unused inodes count */
        __le16  bg_checksum;                /* crc16(sb_uuid+group+desc) */
        __le32        bg_block_bitmap_hi;        /* Blocks bitmap block MSB */
        __le32        bg_inode_bitmap_hi;        /* Inodes bitmap block MSB */
        __le32        bg_inode_table_hi;        /* Inodes table block MSB */
        __le16        bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16        bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16        bg_used_dirs_count_hi;        /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
        __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
        __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
        __u32   bg_reserved;                /* unused; plain __u32, no endian tag */
};

/*
 * Byte offset, within struct ext4_group_desc, of the first byte after the
 * bg_inode_bitmap_csum_hi / bg_block_bitmap_csum_hi field.
 * NOTE(review): presumably compared against the on-disk descriptor size to
 * tell whether the "_hi" checksum half is present -- the users are outside
 * this part of the file; confirm.
 */
#define EXT4_BG_INODE_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
         sizeof(__le16))
#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
         sizeof(__le16))

/*
 * Structure of a flex block group info
 *
 * Three atomic counters: free clusters (64-bit), free inodes and used
 * directories.
 */

struct flex_groups {
        atomic64_t        free_clusters;
        atomic_t        free_inodes;
        atomic_t        used_dirs;
};

#define EXT4_BG_INODE_UNINIT        0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT        0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED        0x0004 /* On-disk itable initialized to zero */

/*
 * Macro-instructions used to manage group descriptors
 *
 * EXT4_MAX_DESC_SIZE is defined as EXT4_MIN_BLOCK_SIZE.
 *
 * With __KERNEL__ defined, the per-group accessors read values from
 * EXT4_SB(@s).  Without it they use fields of @s directly, and
 * EXT4_DESC_PER_BLOCK() is computed as block size / descriptor size.
 * EXT4_CLUSTERS_PER_GROUP() and EXT4_DESC_PER_BLOCK_BITS() are only defined
 * in the __KERNEL__ branch.
 */
#define EXT4_MIN_DESC_SIZE                32
#define EXT4_MIN_DESC_SIZE_64BIT        64
#define        EXT4_MAX_DESC_SIZE                EXT4_MIN_BLOCK_SIZE
#define EXT4_DESC_SIZE(s)                (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s)        (EXT4_SB(s)->s_blocks_per_group)
# define EXT4_CLUSTERS_PER_GROUP(s)        (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s)        (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_desc_per_block_bits)
#else
# define EXT4_BLOCKS_PER_GROUP(s)        ((s)->s_blocks_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
# define EXT4_INODES_PER_GROUP(s)        ((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 *
 * Each constant is defined in terms of the previous one, giving
 * 12, 12, 13, 14 and 15 respectively.
 * NOTE(review): presumably the number of direct block slots followed by
 * the slot indices of the indirect, double-indirect and triple-indirect
 * blocks, and the total slot count -- inferred from the names; confirm.
 */
#define        EXT4_NDIR_BLOCKS                12
#define        EXT4_IND_BLOCK                        EXT4_NDIR_BLOCKS
#define        EXT4_DIND_BLOCK                        (EXT4_IND_BLOCK + 1)
#define        EXT4_TIND_BLOCK                        (EXT4_DIND_BLOCK + 1)
#define        EXT4_N_BLOCKS                        (EXT4_TIND_BLOCK + 1)

/*
 * Inode flags
 *
 * Each flag is a distinct single bit of a 32-bit word.  Bit 0x00400000
 * (formerly EXT4_EOFBLOCKS_FL) has no name any more.  Note that
 * EXT4_RESERVED_FL (0x80000000) does not fit in a 32-bit int, so that
 * constant has an unsigned type, unlike the other flags.
 */
#define        EXT4_SECRM_FL                        0x00000001 /* Secure deletion */
#define        EXT4_UNRM_FL                        0x00000002 /* Undelete */
#define        EXT4_COMPR_FL                        0x00000004 /* Compress file */
#define EXT4_SYNC_FL                        0x00000008 /* Synchronous updates */
#define EXT4_IMMUTABLE_FL                0x00000010 /* Immutable file */
#define EXT4_APPEND_FL                        0x00000020 /* writes to file may only append */
#define EXT4_NODUMP_FL                        0x00000040 /* do not dump file */
#define EXT4_NOATIME_FL                        0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define EXT4_DIRTY_FL                        0x00000100
#define EXT4_COMPRBLK_FL                0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL                        0x00000400 /* Don't compress */
        /* nb: was previously EXT2_ECOMPR_FL */
#define EXT4_ENCRYPT_FL                        0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL                        0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL                        0x00002000 /* AFS directory */
#define EXT4_JOURNAL_DATA_FL                0x00004000 /* file data should be journaled */
#define EXT4_NOTAIL_FL                        0x00008000 /* file tail should not be merged */
#define EXT4_DIRSYNC_FL                        0x00010000 /* dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL                        0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL                        0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL                0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */

#define EXT4_DAX_FL                        0x02000000 /* Inode is DAX */

#define EXT4_INLINE_DATA_FL                0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL                0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL                0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL                0x80000000 /* reserved for ext4 lib */

/* User modifiable flags */
#define EXT4_FL_USER_MODIFIABLE                (EXT4_SECRM_FL | \
                                         EXT4_UNRM_FL | \
                                         EXT4_COMPR_FL | \
                                         EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_JOURNAL_DATA_FL | \
                                         EXT4_NOTAIL_FL | \
                                         EXT4_DIRSYNC_FL | \
                                         EXT4_TOPDIR_FL | \
                                         EXT4_EXTENTS_FL | \
                                         0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
                                         EXT4_DAX_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_CASEFOLD_FL)

/*
 * User visible flags: every flag in EXT4_FL_USER_MODIFIABLE plus the
 * additional flags listed below.
 */
#define EXT4_FL_USER_VISIBLE                (EXT4_FL_USER_MODIFIABLE | \
                                         EXT4_DIRTY_FL | \
                                         EXT4_COMPRBLK_FL | \
                                         EXT4_NOCOMPR_FL | \
                                         EXT4_ENCRYPT_FL | \
                                         EXT4_INDEX_FL | \
                                         EXT4_VERITY_FL | \
                                         EXT4_INLINE_DATA_FL)

/* Flags that can be manipulated through FS_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE                (EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_DAX_FL)

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
                           EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
                           EXT4_DAX_FL)

/*
 * Flags that are appropriate for regular files: everything except the
 * flags listed inside the complement below.  ext4_mask_flags() ANDs this
 * mask into the flags of S_ISREG inodes.
 */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
                           EXT4_PROJINHERIT_FL))

/*
 * Flags that are appropriate for inodes that are neither directories nor
 * regular files (the final branch of ext4_mask_flags()).
 */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)

/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
                           EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)

/*
 * Restrict @flags to those permitted for an inode whose type is given by
 * @mode: directories keep every flag, regular files are limited to
 * EXT4_REG_FLMASK, and any other inode type to EXT4_OTHER_FLMASK.
 */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
        __u32 allowed;

        /* Directories accept the full flag set unchanged. */
        if (S_ISDIR(mode))
                return flags;

        allowed = S_ISREG(mode) ? EXT4_REG_FLMASK : EXT4_OTHER_FLMASK;
        return flags & allowed;
}

/*
 * Inode flags used for atomic set/get
 *
 * Each enumerator is a bit number; TEST_FLAG_VALUE()/CHECK_FLAG_VALUE()
 * compare (1U << EXT4_INODE_XXX) against the matching EXT4_XXX_FL mask.
 * Bit numbers 22-24, 26 and 27 have no enumerator here.
 */
enum {
        EXT4_INODE_SECRM        = 0,        /* Secure deletion */
        EXT4_INODE_UNRM                = 1,        /* Undelete */
        EXT4_INODE_COMPR        = 2,        /* Compress file */
        EXT4_INODE_SYNC                = 3,        /* Synchronous updates */
        EXT4_INODE_IMMUTABLE        = 4,        /* Immutable file */
        EXT4_INODE_APPEND        = 5,        /* writes to file may only append */
        EXT4_INODE_NODUMP        = 6,        /* do not dump file */
        EXT4_INODE_NOATIME        = 7,        /* do not update atime */
/* Reserved for compression usage... */
        EXT4_INODE_DIRTY        = 8,
        EXT4_INODE_COMPRBLK        = 9,        /* One or more compressed clusters */
        EXT4_INODE_NOCOMPR        = 10,        /* Don't compress */
        EXT4_INODE_ENCRYPT        = 11,        /* Encrypted file */
/* End compression flags --- maybe not all used */
        EXT4_INODE_INDEX        = 12,        /* hash-indexed directory */
        EXT4_INODE_IMAGIC        = 13,        /* AFS directory */
        EXT4_INODE_JOURNAL_DATA        = 14,        /* file data should be journaled */
        EXT4_INODE_NOTAIL        = 15,        /* file tail should not be merged */
        EXT4_INODE_DIRSYNC        = 16,        /* dirsync behaviour (directories only) */
        EXT4_INODE_TOPDIR        = 17,        /* Top of directory hierarchies*/
        EXT4_INODE_HUGE_FILE        = 18,        /* Set to each huge file */
        EXT4_INODE_EXTENTS        = 19,        /* Inode uses extents */
        EXT4_INODE_VERITY        = 20,        /* Verity protected inode */
        EXT4_INODE_EA_INODE        = 21,        /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
        EXT4_INODE_DAX                = 25,        /* Inode is DAX */
        EXT4_INODE_INLINE_DATA        = 28,        /* Data in inode. */
        EXT4_INODE_PROJINHERIT        = 29,        /* Create with parents projid */
        EXT4_INODE_CASEFOLD        = 30,        /* Casefolded directory */
        EXT4_INODE_RESERVED        = 31,        /* reserved for ext4 lib */
};

/*
 * Since it's pretty easy to mix up bit numbers and hex values, we use a
 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
 * any extra space in the compiled kernel image, otherwise, the build will fail.
 * It's important that these values are the same, since we are using
 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
 * values found in ext2, ext3 and ext4 filesystems, and of course the values
 * defined in e2fsprogs.
 *
 * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
 */
/*
 * TEST_FLAG_VALUE(FLAG) is true when the EXT4_<FLAG>_FL mask equals the
 * single bit selected by EXT4_INODE_<FLAG>.  CHECK_FLAG_VALUE(FLAG) turns a
 * mismatch into a build failure through BUILD_BUG_ON().
 */
#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))

/*
 * Build-time consistency check between the EXT4_XXX_FL masks and the
 * EXT4_INODE_XXX bit numbers.  There is one CHECK_FLAG_VALUE() per named
 * enumerator, including DAX, so that no flag/bit pair can drift unnoticed.
 */
static inline void ext4_check_flag_values(void)
{
        CHECK_FLAG_VALUE(SECRM);
        CHECK_FLAG_VALUE(UNRM);
        CHECK_FLAG_VALUE(COMPR);
        CHECK_FLAG_VALUE(SYNC);
        CHECK_FLAG_VALUE(IMMUTABLE);
        CHECK_FLAG_VALUE(APPEND);
        CHECK_FLAG_VALUE(NODUMP);
        CHECK_FLAG_VALUE(NOATIME);
        CHECK_FLAG_VALUE(DIRTY);
        CHECK_FLAG_VALUE(COMPRBLK);
        CHECK_FLAG_VALUE(NOCOMPR);
        CHECK_FLAG_VALUE(ENCRYPT);
        CHECK_FLAG_VALUE(INDEX);
        CHECK_FLAG_VALUE(IMAGIC);
        CHECK_FLAG_VALUE(JOURNAL_DATA);
        CHECK_FLAG_VALUE(NOTAIL);
        CHECK_FLAG_VALUE(DIRSYNC);
        CHECK_FLAG_VALUE(TOPDIR);
        CHECK_FLAG_VALUE(HUGE_FILE);
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(VERITY);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(DAX);
        CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(PROJINHERIT);
        CHECK_FLAG_VALUE(CASEFOLD);
        CHECK_FLAG_VALUE(RESERVED);
}

/*
 * Used to pass group descriptor data when online resize is done.
 * This is the argument type encoded in EXT4_IOC_GROUP_ADD.
 */
struct ext4_new_group_input {
        __u32 group;                /* Group number for this data */
        __u64 block_bitmap;        /* Absolute block number of block bitmap */
        __u64 inode_bitmap;        /* Absolute block number of inode bitmap */
        __u64 inode_table;        /* Absolute block number of inode table start */
        __u32 blocks_count;        /* Total number of blocks in this group */
        __u16 reserved_blocks;        /* Number of reserved blocks in this group */
        __u16 unused;
};

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * Compat counterpart of struct ext4_new_group_input: same members in the
 * same order, with the 64-bit ones declared as compat_u64.  This is the
 * argument type encoded in EXT4_IOC32_GROUP_ADD.
 */
struct compat_ext4_new_group_input {
        u32 group;
        compat_u64 block_bitmap;
        compat_u64 inode_bitmap;
        compat_u64 inode_table;
        u32 blocks_count;
        u16 reserved_blocks;
        u16 unused;
};
#endif

/*
 * The struct ext4_new_group_input in kernel space: the same leading members,
 * with mdata_blocks in the slot the input struct calls "unused" and an
 * additional trailing free_clusters_count.
 */
struct ext4_new_group_data {
        __u32 group;
        __u64 block_bitmap;
        __u64 inode_bitmap;
        __u64 inode_table;
        __u32 blocks_count;
        __u16 reserved_blocks;
        __u16 mdata_blocks;
        __u32 free_clusters_count;
};

/*
 * Indexes used to index group tables in ext4_new_group_data.
 * GROUP_TABLE_COUNT is the number of index values preceding it.
 */
enum {
        BLOCK_BITMAP      = 0,        /* block bitmap */
        INODE_BITMAP      = 1,        /* inode bitmap */
        INODE_TABLE       = 2,        /* inode tables */
        GROUP_TABLE_COUNT = 3,
};

/*
 * Flags used by ext4_map_blocks()
 *
 * The single-bit values below are distinct, so they can be ORed together;
 * the parenthesised definitions are such combinations.  0x0080 is not
 * assigned in this list.
 */
        /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE                        0x0001
        /* Request the creation of an unwritten extent */
#define EXT4_GET_BLOCKS_UNWRIT_EXT                0x0002
#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT        (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* Caller is from the direct IO path: request the creation of
         * unwritten extents if not allocated, or split the unwritten
         * extent if blocks have been preallocated already */
#define EXT4_GET_BLOCKS_PRE_IO                        0x0008
#define EXT4_GET_BLOCKS_CONVERT                        0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT                (EXT4_GET_BLOCKS_PRE_IO|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that. */
#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE                0x0040
        /* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN        0x0100
        /* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO                        0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO                (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
        /* Caller will submit data before dropping transaction handle. This
         * allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT                0x0400

/*
 * The bit positions of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_* flags.  They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 */
#define EXT4_EX_NOCACHE                                0x40000000
#define EXT4_EX_FORCE_CACHE                        0x20000000
#define EXT4_EX_NOFAIL                                0x10000000

/*
 * Flags used by ext4_free_blocks
 */
#define EXT4_FREE_BLOCKS_METADATA                0x0001
#define EXT4_FREE_BLOCKS_FORGET                        0x0002
#define EXT4_FREE_BLOCKS_VALIDATED                0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE                0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER        0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER        0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040

/*
 * ioctl commands
 *
 * All but EXT4_IOC_SHUTDOWN use the 'f' ioctl type; EXT4_IOC_SHUTDOWN uses
 * 'X'.  These encodings are user-visible ABI.
 */
#define        EXT4_IOC_GETVERSION                _IOR('f', 3, long)
#define        EXT4_IOC_SETVERSION                _IOW('f', 4, long)
#define        EXT4_IOC_GETVERSION_OLD                FS_IOC_GETVERSION
#define        EXT4_IOC_SETVERSION_OLD                FS_IOC_SETVERSION
#define EXT4_IOC_GETRSVSZ                _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ                _IOW('f', 6, long)
#define EXT4_IOC_GROUP_EXTEND                _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD                _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE                _IO('f', 9)
 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS                _IO('f', 12)
#define EXT4_IOC_MOVE_EXT                _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS                _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT                _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS        _IO('f', 18)
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE                _IO('f', 40)
/*
 * NOTE(review): encoded with _IOW although the EXT4_STATE_FLAG_* values are
 * described as being returned by this ioctl; the encoding is ABI, so it is
 * documented here rather than changed.
 */
#define EXT4_IOC_GETSTATE                _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE                _IOWR('f', 42, struct fiemap)

#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)

/*
 * Flags for going down operation
 */
#define EXT4_GOING_FLAGS_DEFAULT                0x0        /* going down */
#define EXT4_GOING_FLAGS_LOGFLUSH                0x1        /* flush log but not data */
#define EXT4_GOING_FLAGS_NOLOGFLUSH                0x2        /* don't flush log nor data */

/*
 * Flags returned by EXT4_IOC_GETSTATE
 *
 * We only expose to userspace a subset of the state flags in
 * i_state_flags
 */
#define EXT4_STATE_FLAG_EXT_PRECACHED        0x00000001
#define EXT4_STATE_FLAG_NEW                0x00000002
#define EXT4_STATE_FLAG_NEWENTRY        0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE        0x00000008

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
 */
#define EXT4_IOC32_GETVERSION                _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION                _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ                _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ                _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD                _IOW('f', 8, struct compat_ext4_new_group_input)
#define EXT4_IOC32_GETVERSION_OLD        FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD        FS_IOC32_SETVERSION
#endif

/*
 * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
 * It indicates that the entry in extent status cache is for a hole.
 */
#define EXT4_FIEMAP_EXTENT_HOLE                0x08000000

/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF

/* Max logical block we can support */
#define EXT4_MAX_LOGICAL_BLOCK                0xFFFFFFFE

/*
 * Structure of an inode on the disk
 *
 * NOTE(review): the members from i_extra_isize onward appear to be the
 * "extended fields" whose presence is tested with EXT4_FITS_IN_INODE()
 * against EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize -- confirm.
 */
struct ext4_inode {
        __le16        i_mode;                /* File mode */
        __le16        i_uid;                /* Low 16 bits of Owner Uid */
        __le32        i_size_lo;        /* Size in bytes */
        __le32        i_atime;        /* Access time */
        __le32        i_ctime;        /* Inode Change time */
        __le32        i_mtime;        /* Modification time */
        __le32        i_dtime;        /* Deletion Time */
        __le16        i_gid;                /* Low 16 bits of Group Id */
        __le16        i_links_count;        /* Links count */
        __le32        i_blocks_lo;        /* Blocks count */
        __le32        i_flags;        /* File flags */
        union {
                struct {
                        __le32  l_i_version;
                } linux1;
                struct {
                        __u32  h_i_translator;
                } hurd1;
                struct {
                        __u32  m_i_reserved1;
                } masix1;
        } osd1;                                /* OS dependent 1 */
        __le32        i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
        __le32        i_generation;        /* File version (for NFS) */
        __le32        i_file_acl_lo;        /* File ACL */
        __le32        i_size_high;        /* presumably the high half paired with i_size_lo -- confirm */
        __le32        i_obso_faddr;        /* Obsoleted fragment address */
        union {
                struct {
                        __le16        l_i_blocks_high; /* were l_i_reserved1 */
                        __le16        l_i_file_acl_high;
                        __le16        l_i_uid_high;        /* these 2 fields */
                        __le16        l_i_gid_high;        /* were reserved2[0] */
                        __le16        l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
                        __le16        l_i_reserved;
                } linux2;
                struct {
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __u16        h_i_mode_high;
                        __u16        h_i_uid_high;
                        __u16        h_i_gid_high;
                        __u32        h_i_author;
                } hurd2;
                struct {
                        /* NOTE: carries the hurd-style "h_" prefix although it lives in masix2 */
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __le16        m_i_file_acl_high;
                        __u32        m_i_reserved2[2];
                } masix2;
        } osd2;                                /* OS dependent 2 */
        __le16        i_extra_isize;
        __le16        i_checksum_hi;        /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
        __le32  i_crtime;       /* File Creation time */
        __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
        __le32  i_version_hi;        /* high 32 bits for 64-bit version */
        __le32        i_projid;        /* Project ID */
};

/* Argument type encoded in EXT4_IOC_MOVE_EXT (an _IOWR ioctl). */
struct move_extent {
        __u32 reserved;                /* should be zero */
        __u32 donor_fd;                /* donor file descriptor */
        __u64 orig_start;        /* logical start offset in block for orig */
        __u64 donor_start;        /* logical start offset in block for donor */
        __u64 len;                /* block length to be moved */
        __u64 moved_len;        /* moved block length */
};

/*
 * Layout of the on-disk *_extra timestamp words (nsec << 2 | epoch): the low
 * EXT4_EPOCH_BITS bits hold the epoch and are selected by EXT4_EPOCH_MASK;
 * every bit above them is selected by EXT4_NSEC_MASK.
 */
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)

/*
 * Extended fields will fit into an inode if the filesystem was formatted
 * with large inodes (-I 256 or larger) and there are not currently any EAs
 * consuming all of the available space. For new inodes we always reserve
 * enough space for the kernel's known extended fields, but for inodes
 * created with an old kernel this might not have been the case. None of
 * the extended inode fields is critical for correct filesystem operation.
 *
 * EXT4_FITS_IN_INODE(ext4_inode, einode, field) is true when @field of the
 * structure that @ext4_inode points to ends at or before
 * EXT4_GOOD_OLD_INODE_SIZE + (einode)->i_extra_isize.  @ext4_inode is used
 * only inside typeof()/sizeof(), and it is fully parenthesised so that any
 * pointer-valued expression may be passed.
 */
#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)        \
        ((offsetof(typeof(*(ext4_inode)), field) +        \
          sizeof((ext4_inode)->field))                        \
        <= (EXT4_GOOD_OLD_INODE_SIZE +                        \
            (einode)->i_extra_isize))

/*
 * We use an encoding that preserves the times for extra epoch "00":
 *
 * extra  msb of                         adjust for signed
 * epoch  32-bit                         32-bit tv_sec to
 * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
 * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
 * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
 * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
 * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
 * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
 * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
 * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
 * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
 *
 * Note that previous versions of the kernel on 64-bit systems would
 * incorrectly use extra epoch bits 1,1 for dates between 1901 and
 * 1970.  e2fsck will correct this, assuming that it is run on the
 * affected filesystem before 2242.
 */

/*
 * Build the on-disk "extra" word for @time: the low EXT4_EPOCH_BITS bits
 * carry the epoch (bits 32 and up of the difference between tv_sec and its
 * value truncated to s32) and the bits above them carry tv_nsec.
 */
static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
{
        u32 epoch_bits;
        u32 nsec_bits;

        epoch_bits = ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
        nsec_bits = time->tv_nsec << EXT4_EPOCH_BITS;

        return cpu_to_le32(nsec_bits | epoch_bits);
}

/*
 * Apply the on-disk "extra" word to @time: non-zero epoch bits are added to
 * tv_sec shifted left by 32, and tv_nsec is taken from the bits above the
 * epoch.  tv_sec must already hold the value read from the 32-bit field.
 */
static inline void ext4_decode_extra_time(struct timespec64 *time,
                                          __le32 extra)
{
        if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
                u64 epoch = le32_to_cpu(extra) & EXT4_EPOCH_MASK;

                time->tv_sec += epoch << 32;
        }

        time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}

/*
 * Store (inode)->xtime into (raw_inode).  If the matching xtime_extra field
 * does not fit in the inode, only the 32-bit seconds field is written, with
 * tv_sec clamped to the s32 range; otherwise both the seconds field and the
 * encoded extra word are written.
 */
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)                                \
do {                                                                                \
        if (!EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t,                \
                                (inode)->xtime.tv_sec, S32_MIN, S32_MAX));        \
        } else {                                                                \
                (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);        \
                (raw_inode)->xtime ## _extra =                                        \
                                ext4_encode_extra_time(&(inode)->xtime);        \
        }                                                                        \
} while (0)

/*
 * Store (einode)->xtime into (raw_inode).  The seconds field and the extra
 * word are each written only if that particular field fits in the inode;
 * a field that does not fit is left untouched.
 */
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)                        \
do {                                                                        \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) {                \
                (raw_inode)->xtime =                                        \
                        cpu_to_le32((einode)->xtime.tv_sec);                \
        }                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) {        \
                (raw_inode)->xtime ## _extra =                                \
                        ext4_encode_extra_time(&(einode)->xtime);        \
        }                                                                \
} while (0)

/*
 * Load (inode)->xtime from (raw_inode).  tv_sec is always taken from the
 * 32-bit seconds field, interpreted as signed.  If the matching xtime_extra
 * field fits in the inode it is decoded on top of that; otherwise tv_nsec
 * is set to 0.
 *
 * Every use of @raw_inode is parenthesised so that an arbitrary pointer
 * expression (e.g. "base + n") may be passed.
 */
#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)                                \
do {                                                                                \
        (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);        \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                ext4_decode_extra_time(&(inode)->xtime,                                \
                                       (raw_inode)->xtime ## _extra);                \
        } else {                                                                \
                (inode)->xtime.tv_nsec = 0;                                        \
        }                                                                        \
} while (0)


/*
 * Load (einode)->xtime from (raw_inode).  tv_sec comes from the 32-bit
 * seconds field (interpreted as signed) if that field fits in the inode,
 * else it is set to 0.  The extra word is decoded on top of it if the
 * xtime_extra field fits, else tv_nsec is set to 0.
 *
 * Every use of @raw_inode is parenthesised so that an arbitrary pointer
 * expression (e.g. "base + n") may be passed.
 */
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                               \
do {                                                                               \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                       \
                (einode)->xtime.tv_sec =                                        \
                        (signed)le32_to_cpu((raw_inode)->xtime);               \
        else                                                                       \
                (einode)->xtime.tv_sec = 0;                                       \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))               \
                ext4_decode_extra_time(&(einode)->xtime,                       \
                                       (raw_inode)->xtime ## _extra);               \
        else                                                                       \
                (einode)->xtime.tv_nsec = 0;                                       \
} while (0)

/*
 * Shorthand names for members nested in the osd1/osd2 unions of
 * struct ext4_inode.  i_disk_version is defined unconditionally; the rest
 * depend on which of the three platform branches below is selected.
 */
#define i_disk_version osd1.linux1.l_i_version

#if defined(__KERNEL__) || defined(__linux__)
/*
 * NOTE(review): osd1.linux1 declares only l_i_version, so this alias names a
 * member that does not exist and would fail to compile if it were used.
 */
#define i_reserved1        osd1.linux1.l_i_reserved1
#define i_file_acl_high        osd2.linux2.l_i_file_acl_high
#define i_blocks_high        osd2.linux2.l_i_blocks_high
#define i_uid_low        i_uid
#define i_gid_low        i_gid
#define i_uid_high        osd2.linux2.l_i_uid_high
#define i_gid_high        osd2.linux2.l_i_gid_high
#define i_checksum_lo        osd2.linux2.l_i_checksum_lo

#elif defined(__GNU__)

#define i_translator        osd1.hurd1.h_i_translator
#define i_uid_high        osd2.hurd2.h_i_uid_high
#define i_gid_high        osd2.hurd2.h_i_gid_high
#define i_author        osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1        osd1.masix1.m_i_reserved1
#define i_file_acl_high        osd2.masix2.m_i_file_acl_high
#define i_reserved2        osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

#include "extents_status.h"
#include "fast_commit.h"

/*
 * Lock subclasses for i_data_sem in the ext4_inode_info structure.
 *
 * These are needed to avoid lockdep false positives when we need to
 * allocate blocks to the quota inode during ext4_map_blocks(), while
 * holding i_data_sem for a normal (non-quota) inode.  Since we don't
 * do quota tracking for the quota inode, this avoids deadlock (as
 * well as infinite recursion, since it isn't turtles all the way
 * down...)
 *
 *  I_DATA_SEM_NORMAL - Used for most inodes
 *  I_DATA_SEM_OTHER  - Used by move_inode.c for the second normal inode
 *                          where the second inode has larger inode number
 *                          than the first
 *  I_DATA_SEM_QUOTA  - Used for quota inodes only
 *  I_DATA_SEM_EA     - Used for ea_inodes only
 */
/* i_data_sem lock subclasses; the values are consecutive starting at 0. */
enum {
        I_DATA_SEM_NORMAL = 0,
        I_DATA_SEM_OTHER  = 1,
        I_DATA_SEM_QUOTA  = 2,
        I_DATA_SEM_EA     = 3
};


/*
 * fourth extended file system inode data in memory
 *
 * The generic VFS inode is embedded by value as the vfs_inode member.
 */
struct ext4_inode_info {
        __le32        i_data[15];        /* unconverted */
        __u32        i_dtime;
        ext4_fsblk_t        i_file_acl;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        ext4_group_t        i_block_group;
        ext4_lblk_t        i_dir_start_lookup;
        /* Separate i_state_flags word exists only when long is narrower than 64 bits */
#if (BITS_PER_LONG < 64)
        unsigned long        i_state_flags;                /* Dynamic state flags */
#endif
        unsigned long        i_flags;

        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_mutex even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;

        struct list_head i_orphan;        /* unlinked but open inodes */

        /* Fast commit related info */

        struct list_head i_fc_list;        /*
                                         * inodes that need fast commit
                                         * protected by sbi->s_fc_lock.
                                         */

        /* Start of lblk range that needs to be committed in this fast commit */
        ext4_lblk_t i_fc_lblk_start;

        /*
         * Length of lblk range that needs to be committed in this fast commit.
         * NOTE(review): this comment previously said "End"; the field name
         * says length -- confirm against the fast commit code.
         */
        ext4_lblk_t i_fc_lblk_len;

        /* Number of ongoing updates on this inode */
        atomic_t  i_fc_updates;

        /* Fast commit wait queue for this inode */
        wait_queue_head_t i_fc_wait;

        /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
        struct mutex i_fc_lock;

        /*
         * i_disksize keeps track of what the inode size is ON DISK, not
         * in memory.  During truncate, i_size is set to the new size by
         * the VFS prior to calling ext4_truncate(), but the filesystem won't
         * set i_disksize to 0 until the truncate is actually under way.
         *
         * The intent is that i_disksize always represents the blocks which
         * are used by this file.  This allows recovery to restart truncate
         * on orphans if we crash during truncate.  We actually write i_disksize
         * into the on-disk inode when writing inodes out, instead of i_size.
         *
         * The only time when i_disksize and i_size may be different is when
         * a truncate is in progress.  The only things which change i_disksize
         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
         */
        loff_t        i_disksize;

        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
         * data tree are chopped off during truncate. We can't do that in
         * ext4 because whenever we perform intermediate commits during
         * truncate, the inode and all the metadata blocks *must* be in a
         * consistent state which allows truncation of the orphans to restart
         * during recovery.  Hence we must fix the get_block-vs-truncate race
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
        /*
         * i_mmap_sem is for serializing page faults with truncate / punch hole
         * operations. We have to make sure that new page cannot be faulted in
         * a section of the inode that is being punched. We cannot easily use
         * i_data_sem for this since we need protection for the whole punch
         * operation and i_data_sem ranks below transaction start so we have
         * to occasionally drop it.
         */
        struct rw_semaphore i_mmap_sem;
        struct inode vfs_inode;
        struct jbd2_inode *jinode;

        spinlock_t i_raw_lock;        /* protects updates to the raw inode */

        /*
         * File creation time. Its function is same as that of
         * struct timespec64 i_{a,c,m}time in the generic inode.
         */
        struct timespec64 i_crtime;

        /* mballoc */
        atomic_t i_prealloc_active;
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;

        /* extents status tree */
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_list;
        unsigned int i_es_all_nr;        /* protected by i_es_lock */
        unsigned int i_es_shk_nr;        /* protected by i_es_lock */
        ext4_lblk_t i_es_shrink_lblk;        /* Offset where we start searching for
                                           extents to shrink. Protected by
                                           i_es_lock  */

        /* ialloc */
        ext4_group_t        i_last_alloc_group;

        /* allocation reservation info for delalloc */
        /* In case of bigalloc, this refers to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;

        /* pending cluster reservations for bigalloc file systems */
        struct ext4_pending_tree i_pending_tree;

        /* on-disk additional length */
        __u16 i_extra_isize;

        /* Indicate the inline data space. */
        u16 i_inline_off;
        u16 i_inline_size;

#ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
#endif

        /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
        /*
         * Completed IOs that need unwritten extents handling and have
         * transaction reserved
         */
        struct list_head i_rsv_conversion_list;
        struct work_struct i_rsv_conversion_work;
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */

        spinlock_t i_block_reservation_lock;

        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;

#ifdef CONFIG_QUOTA
        struct dquot *i_dquot[MAXQUOTAS];
#endif

        /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
        __u32 i_csum_seed;

        kprojid_t i_projid;
};

/*
 * File system states
 *
 * Bit values.  Presumably these are the bits kept in
 * ext4_super_block.s_state ("File system state") -- TODO confirm against
 * the users.  Values 0x0008 and 0x0010 are not assigned in this list.
 */
#define        EXT4_VALID_FS                        0x0001        /* Unmounted cleanly */
#define        EXT4_ERROR_FS                        0x0002        /* Errors detected */
#define        EXT4_ORPHAN_FS                        0x0004        /* Orphans being recovered */
#define EXT4_FC_REPLAY                        0x0020        /* Fast commit replay ongoing */

/*
 * Misc. filesystem flags
 *
 * Bit values.  Presumably stored in ext4_super_block.s_flags
 * ("Miscellaneous flags") -- TODO confirm against the users.
 */
#define EXT2_FLAGS_SIGNED_HASH                0x0001  /* Signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH        0x0002  /* Unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS                0x0004        /* to test development code */

/*
 * Mount flags set via mount options or defaults
 *
 * These are the EXT4_MOUNT_##opt names pasted together by
 * test_opt()/set_opt()/clear_opt(), which operate on
 * ext4_sb_info.s_mount_opt (an unsigned int).  Every bit of a 32-bit word
 * except 0x00002 is assigned below; additional flags are defined in the
 * separate EXT4_MOUNT2_* space.
 *
 * Multi-bit values:
 *  - EXT4_MOUNT_ERRORS_MASK is the OR of the three EXT4_MOUNT_ERRORS_* bits.
 *  - EXT4_MOUNT_DATA_FLAGS masks a two-bit field; EXT4_MOUNT_WRITEBACK_DATA
 *    has both bits set and is therefore numerically equal to the mask, so
 *    the data mode has to be compared against the masked value rather than
 *    tested bit by bit.
 *  - EXT4_MOUNT_DAX_ALWAYS is 0 when CONFIG_FS_DAX is not set, so
 *    test_opt(sb, DAX_ALWAYS) then evaluates to 0.
 */
#define EXT4_MOUNT_NO_MBCACHE                0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID                0x00004        /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG                0x00008        /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT                0x00010        /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO                0x00020        /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC                0x00040        /* Panic on errors */
#define EXT4_MOUNT_ERRORS_MASK                0x00070
#define EXT4_MOUNT_MINIX_DF                0x00080        /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD                0x00100        /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX_ALWAYS                0x00200        /* Direct Access */
#else
#define EXT4_MOUNT_DAX_ALWAYS                0
#endif
#define EXT4_MOUNT_DATA_FLAGS                0x00C00        /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA                0x00400        /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA                0x00800        /* Flush data before commit */
#define EXT4_MOUNT_WRITEBACK_DATA        0x00C00        /* No data ordering */
#define EXT4_MOUNT_UPDATE_JOURNAL        0x01000        /* Update the journal format */
#define EXT4_MOUNT_NO_UID32                0x02000  /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER                0x04000        /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL                0x08000        /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC        0x10000        /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER                0x20000 /* Use block barriers */
#define EXT4_MOUNT_QUOTA                0x40000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA                0x80000 /* "old" user quota,
                                                 * enable enforcement for hidden
                                                 * quota files */
#define EXT4_MOUNT_GRPQUOTA                0x100000 /* "old" group quota, enable
                                                  * enforcement for hidden quota
                                                  * files */
#define EXT4_MOUNT_PRJQUOTA                0x200000 /* Enable project quota
                                                  * enforcement */
#define EXT4_MOUNT_DIOREAD_NOLOCK        0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM        0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR        0x2000000 /* Trigger WARN_ON on error */
#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC                0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT        0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY        0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD                0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE        0x80000000 /* Initialize uninitialized itables */

/*
 * Mount flags set either automatically (could not be set by mount option)
 * based on per file system feature or property or in special cases such as
 * distinguishing between explicit mount option definition and default.
 *
 * These are the EXT4_MOUNT2_##opt names pasted together by
 * test_opt2()/set_opt2()/clear_opt2(), which operate on
 * ext4_sb_info.s_mount_opt2.
 */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC        0x00000001 /* User explicitly
                                                      specified delalloc */
#define EXT4_MOUNT2_STD_GROUP_SIZE        0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
                                                      file systems */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM        0x00000008 /* User explicitly
                                                specified journal checksum */

#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT        0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER                0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE                0x00000040 /* For printing options only */


/*
 * Mount option accessors.
 *
 * @sb:  whatever EXT4_SB() accepts (a struct super_block pointer in the
 *       kernel build).
 * @opt: option name without its prefix; it is token-pasted onto
 *       EXT4_MOUNT_ (for the *_opt() macros, acting on s_mount_opt) or
 *       EXT4_MOUNT2_ (for the *_opt2() macros, acting on s_mount_opt2).
 *
 * clear_*()/set_*() are plain, non-atomic read-modify-write updates and
 * evaluate to the new value of the option word; test_*() evaluates to the
 * masked bits (non-zero when any bit of the option is set).
 *
 * Every replacement list is fully parenthesized so that an operator
 * written next to the macro invocation cannot be absorbed into the
 * right-hand side of the compound assignment.
 */
#define clear_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt)
#define set_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt |= \
                                                EXT4_MOUNT_##opt)
#define test_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)

#define clear_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 &= \
                                                ~EXT4_MOUNT2_##opt)
#define set_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 |= \
                                                EXT4_MOUNT2_##opt)
#define test_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)

/*
 * ext4 bitmap helpers.  Each name is a plain alias: most map to the
 * little-endian bit operations (the *_le names), the two *_atomic ones map
 * to the ext2_*_bit_atomic helpers.  All of the right-hand-side names are
 * defined outside this header.
 * NOTE(review): the __-prefixed targets are presumably the non-atomic
 * variants -- confirm against the bitops headers.
 */
#define ext4_test_and_set_bit                __test_and_set_bit_le
#define ext4_set_bit                        __set_bit_le
#define ext4_set_bit_atomic                ext2_set_bit_atomic
#define ext4_test_and_clear_bit                __test_and_clear_bit_le
#define ext4_clear_bit                        __clear_bit_le
#define ext4_clear_bit_atomic                ext2_clear_bit_atomic
#define ext4_test_bit                        test_bit_le
#define ext4_find_next_zero_bit                find_next_zero_bit_le
#define ext4_find_next_bit                find_next_bit_le

/*
 * Declared here, defined elsewhere.  Presumably sets @len bits starting at
 * bit @cur in bitmap @bm -- TODO confirm against the definition.
 */
extern void ext4_set_bits(void *bm, int cur, int len);

/*
 * Maximal mount counts between two filesystem checks
 *
 * Default values; presumably the defaults for the super block fields
 * s_max_mnt_count and s_checkinterval -- TODO confirm against the users.
 */
#define EXT4_DFL_MAX_MNT_COUNT                20        /* Allow 20 mounts */
#define EXT4_DFL_CHECKINTERVAL                0        /* Don't use interval check */

/*
 * Behaviour when detecting errors
 *
 * Plain enumerated values (not bit masks).  Presumably the values stored in
 * ext4_super_block.s_errors ("Behaviour when detecting errors") -- TODO
 * confirm.  EXT4_ERRORS_DEFAULT aliases EXT4_ERRORS_CONTINUE.
 */
#define EXT4_ERRORS_CONTINUE                1        /* Continue execution */
#define EXT4_ERRORS_RO                        2        /* Remount fs read-only */
#define EXT4_ERRORS_PANIC                3        /* Panic */
#define EXT4_ERRORS_DEFAULT                EXT4_ERRORS_CONTINUE

/*
 * Metadata checksum algorithm codes
 *
 * Only one code is defined here.  Presumably the value stored in
 * ext4_super_block.s_checksum_type ("metadata checksum algorithm used")
 * -- TODO confirm.
 */
#define EXT4_CRC32C_CHKSUM                1

/*
 * Structure of the super block
 *
 * All multi-byte integers are declared with the __le16/__le32/__le64
 * little-endian types.  The hex numbers in the leading comments on some
 * members are the byte offset of that member from the start of the
 * structure.  Field order and sizes must not be changed.
 */
struct ext4_super_block {
/*00*/        __le32        s_inodes_count;                /* Inodes count */
        __le32        s_blocks_count_lo;        /* Blocks count */
        __le32        s_r_blocks_count_lo;        /* Reserved blocks count */
        __le32        s_free_blocks_count_lo;        /* Free blocks count */
/*10*/        __le32        s_free_inodes_count;        /* Free inodes count */
        __le32        s_first_data_block;        /* First Data Block */
        __le32        s_log_block_size;        /* Block size */
        __le32        s_log_cluster_size;        /* Allocation cluster size */
/*20*/        __le32        s_blocks_per_group;        /* # Blocks per group */
        __le32        s_clusters_per_group;        /* # Clusters per group */
        __le32        s_inodes_per_group;        /* # Inodes per group */
        __le32        s_mtime;                /* Mount time */
/*30*/        __le32        s_wtime;                /* Write time */
        __le16        s_mnt_count;                /* Mount count */
        __le16        s_max_mnt_count;        /* Maximal mount count */
        __le16        s_magic;                /* Magic signature */
        __le16        s_state;                /* File system state */
        __le16        s_errors;                /* Behaviour when detecting errors */
        __le16        s_minor_rev_level;        /* minor revision level */
/*40*/        __le32        s_lastcheck;                /* time of last check */
        __le32        s_checkinterval;        /* max. time between checks */
        __le32        s_creator_os;                /* OS */
        __le32        s_rev_level;                /* Revision level */
/*50*/        __le16        s_def_resuid;                /* Default uid for reserved blocks */
        __le16        s_def_resgid;                /* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32        s_first_ino;                /* First non-reserved inode */
        __le16  s_inode_size;                /* size of inode structure */
        __le16        s_block_group_nr;        /* block group # of this superblock */
        __le32        s_feature_compat;        /* compatible feature set */
/*60*/        __le32        s_feature_incompat;        /* incompatible feature set */
        __le32        s_feature_ro_compat;        /* readonly-compatible feature set */
/*68*/        __u8        s_uuid[16];                /* 128-bit uuid for volume */
/*78*/        char        s_volume_name[16];        /* volume name */
/*88*/        char        s_last_mounted[64] __nonstring;        /* directory where last mounted */
/*C8*/        __le32        s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8        s_prealloc_blocks;        /* Nr of blocks to try to preallocate*/
        __u8        s_prealloc_dir_blocks;        /* Nr to preallocate for dirs */
        __le16        s_reserved_gdt_blocks;        /* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
/*D0*/        __u8        s_journal_uuid[16];        /* uuid of journal superblock */
/*E0*/        __le32        s_journal_inum;                /* inode number of journal file */
        __le32        s_journal_dev;                /* device number of journal file */
        __le32        s_last_orphan;                /* start of list of inodes to delete */
        __le32        s_hash_seed[4];                /* HTREE hash seed */
        __u8        s_def_hash_version;        /* Default hash version to use */
        __u8        s_jnl_backup_type;
        __le16  s_desc_size;                /* size of group descriptor */
/*100*/        __le32        s_default_mount_opts;
        __le32        s_first_meta_bg;        /* First metablock block group */
        __le32        s_mkfs_time;                /* When the filesystem was created */
        __le32        s_jnl_blocks[17];        /* Backup of the journal inode */
        /*
         * 64bit support valid if EXT4_FEATURE_COMPAT_64BIT
         * NOTE(review): the 64bit feature is believed to be an INCOMPAT
         * flag (EXT4_FEATURE_INCOMPAT_64BIT), not a COMPAT one -- confirm
         * against the feature definitions and correct the line above.
         */
/*150*/        __le32        s_blocks_count_hi;        /* Blocks count */
        __le32        s_r_blocks_count_hi;        /* Reserved blocks count */
        __le32        s_free_blocks_count_hi;        /* Free blocks count */
        __le16        s_min_extra_isize;        /* All inodes have at least # bytes */
        __le16        s_want_extra_isize;         /* New inodes should reserve # bytes */
        __le32        s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;                /* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8        s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8        s_checksum_type;        /* metadata checksum algorithm used */
        __u8        s_encryption_level;        /* versioning level for encryption */
        __u8        s_reserved_pad;                /* Padding to next 32bits */
        __le64        s_kbytes_written;        /* nr of lifetime kilobytes written */
        __le32        s_snapshot_inum;        /* Inode number of active snapshot */
        __le32        s_snapshot_id;                /* sequential ID of active snapshot */
        __le64        s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32        s_snapshot_list;        /* inode number of the head of the
                                           on-disk snapshot list */
/* Byte offset of the first member of the error-information region. */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32        s_error_count;                /* number of fs errors */
        __le32        s_first_error_time;        /* first time an error happened */
        __le32        s_first_error_ino;        /* inode involved in first error */
        __le64        s_first_error_block;        /* block involved of first error */
        __u8        s_first_error_func[32] __nonstring;        /* function where the error happened */
        __le32        s_first_error_line;        /* line number where error happened */
        __le32        s_last_error_time;        /* most recent time of an error */
        __le32        s_last_error_ino;        /* inode involved in last error */
        __le32        s_last_error_line;        /* line number where error happened */
        __le64        s_last_error_block;        /* block involved of last error */
        __u8        s_last_error_func[32] __nonstring;        /* function where the error happened */
/* Byte offset just past the error-information region (exclusive end). */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8        s_mount_opts[64];
        __le32        s_usr_quota_inum;        /* inode for tracking user quota */
        __le32        s_grp_quota_inum;        /* inode for tracking group quota */
        __le32        s_overhead_clusters;        /* overhead blocks/clusters in fs */
        __le32        s_backup_bgs[2];        /* groups with sparse_super2 SBs */
        __u8        s_encrypt_algos[4];        /* Encryption algorithms in use  */
        __u8        s_encrypt_pw_salt[16];        /* Salt used for string2key algorithm */
        __le32        s_lpf_ino;                /* Location of the lost+found inode */
        __le32        s_prj_quota_inum;        /* inode for tracking project quota */
        __le32        s_checksum_seed;        /* crc32c(uuid) if csum_seed set */
        /*
         * NOTE(review): the six *_hi bytes below presumably hold the upper
         * 8 bits that extend the 32-bit time fields of the same name
         * (s_wtime, s_mtime, ...) -- confirm against the code that reads
         * and writes them.
         */
        __u8        s_wtime_hi;
        __u8        s_mtime_hi;
        __u8        s_mkfs_time_hi;
        __u8        s_lastcheck_hi;
        __u8        s_first_error_time_hi;
        __u8        s_last_error_time_hi;
        __u8        s_first_error_errcode;        /* presumably an EXT4_ERR_* code -- confirm */
        __u8    s_last_error_errcode;        /* presumably an EXT4_ERR_* code -- confirm */
        __le16  s_encoding;                /* Filename charset encoding */
        __le16  s_encoding_flags;        /* Filename charset encoding flags */
        __le32        s_reserved[95];                /* Padding to the end of the block */
        __le32        s_checksum;                /* crc32c(superblock) */
};

/*
 * Size in bytes of the error-information region of struct ext4_super_block:
 * from s_error_count (EXT4_S_ERR_START) up to, but not including,
 * s_mount_opts (EXT4_S_ERR_END).
 */
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)

#ifdef __KERNEL__

/*
 * Number of quota types we support.
 * Also used as the length of ext4_sb_info.s_qf_names[].
 */
#define EXT4_MAXQUOTAS 3

/*
 * Filename encoding identifier.
 * NOTE(review): presumably a value of ext4_super_block.s_encoding meaning
 * UTF-8 12.1 -- confirm against the encoding table.
 */
#define EXT4_ENC_UTF8_12_1        1

/*
 * fourth extended-fs super-block data in memory
 *
 * Reached from a struct super_block through EXT4_SB(), which returns
 * sb->s_fs_info.  Members tagged __rcu that are arrays (s_group_desc,
 * s_group_info, s_flex_groups) are meant to be read through
 * sbi_array_rcu_deref().
 */
struct ext4_sb_info {
        unsigned long s_desc_size;        /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
        unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;        /* Number of inode table blocks per group */
        unsigned long s_gdb_count;        /* Number of group descriptor blocks */
        unsigned long s_desc_per_block;        /* Number of group descriptors per block */
        ext4_group_t s_groups_count;        /* Number of groups in the fs */
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead;  /* # of fs overhead clusters */
        unsigned int s_cluster_ratio;        /* Number of blocks per cluster */
        unsigned int s_cluster_bits;        /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;        /* max bytes for bitmap files */
        struct buffer_head * s_sbh;        /* Buffer containing the super block */
        struct ext4_super_block *s_es;        /* Pointer to the super block in the buffer */
        struct buffer_head * __rcu *s_group_desc;
        unsigned int s_mount_opt;        /* EXT4_MOUNT_* bits, see test_opt() */
        unsigned int s_mount_opt2;        /* EXT4_MOUNT2_* bits, see test_opt2() */
        unsigned long s_mount_flags;        /* EXT4_MF_* bit numbers */
        unsigned int s_def_mount_opt;
        ext4_fsblk_t s_sb_block;
        atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        int s_desc_per_block_bits;
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
        unsigned int s_inode_goal;
        u32 s_hash_seed[4];
        int s_def_hash_version;
        int s_hash_unsigned;        /* 3 if hash should be unsigned, 0 if not */
        struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
        struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
        struct super_block *s_sb;
        struct buffer_head *s_mmp_bh;

        /* Journaling */
        struct journal_s *s_journal;
        struct list_head s_orphan;
        struct mutex s_orphan_lock;
        unsigned long s_ext4_flags;                /* Ext4 superblock flags */
        unsigned long s_commit_interval;
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
        /* Names of quota files with journalled quota */
        char __rcu *s_qf_names[EXT4_MAXQUOTAS];
        int s_jquota_fmt;                        /* Format of quota to use */
#endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
        struct ext4_system_blocks __rcu *s_system_blks;

#ifdef EXTENTS_STATS
        /* ext4 extents stats */
        unsigned long s_ext_min;
        unsigned long s_ext_max;
        unsigned long s_depth_max;
        spinlock_t s_ext_stats_lock;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
#endif

        /* for buddy allocator */
        struct ext4_group_info ** __rcu *s_group_info;
        struct inode *s_buddy_cache;
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
        struct list_head s_freed_data_list;        /* List of blocks to be freed
                                                   after commit completed */

        /* tunables */
        unsigned long s_stripe;
        unsigned int s_mb_stream_request;
        unsigned int s_mb_max_to_scan;
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_mb_max_inode_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
        unsigned int s_mb_prefetch;
        unsigned int s_mb_prefetch_limit;

        /* stats for buddy allocator */
        atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
        atomic_t s_bal_success;        /* we found long enough chunks */
        atomic_t s_bal_allocated;        /* in blocks */
        atomic_t s_bal_ex_scanned;        /* total extents scanned */
        atomic_t s_bal_groups_scanned;        /* number of groups scanned */
        atomic_t s_bal_goals;        /* goal hits */
        atomic_t s_bal_breaks;        /* too long searches */
        atomic_t s_bal_2orders;        /* 2^order hits */
        atomic64_t s_bal_cX_groups_considered[4];
        atomic64_t s_bal_cX_hits[4];
        atomic64_t s_bal_cX_failed[4];                /* cX loop didn't find blocks */
        atomic_t s_mb_buddies_generated;        /* number of buddies generated */
        atomic64_t s_mb_generation_time;
        atomic_t s_mb_lost_chunks;
        atomic_t s_mb_preallocated;
        atomic_t s_mb_discarded;
        atomic_t s_lock_busy;

        /* locality groups */
        struct ext4_locality_group __percpu *s_locality_groups;

        /* for write statistics */
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

        /* the size of zero-out chunk */
        unsigned int s_extent_max_zeroout_kb;

        unsigned int s_log_groups_per_flex;
        struct flex_groups * __rcu *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;

        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;

        /* timer for periodic error stats printing */
        struct timer_list s_err_report;

        /* Lazy inode table initialization info */
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;

        /* Kernel thread for multiple mount protection */
        struct task_struct *s_mmp_tsk;

        /* record the last minlen when FITRIM is called. */
        unsigned long s_last_trim_minblks;

        /* Reference to checksum algorithm driver via cryptoapi */
        struct crypto_shash *s_chksum_driver;

        /* Precomputed FS UUID checksum for seeding other checksums */
        __u32 s_csum_seed;

        /* Reclaim extents from extent status tree */
        struct shrinker s_es_shrinker;
        struct list_head s_es_list;        /* List of inodes with reclaimable extents */
        long s_es_nr_inode;
        struct ext4_es_stats s_es_stats;
        struct mb_cache *s_ea_block_cache;
        struct mb_cache *s_ea_inode_cache;
        spinlock_t s_es_lock ____cacheline_aligned_in_smp;

        /* Ratelimit ext4 messages. */
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
        atomic_t s_warning_count;
        atomic_t s_msg_count;

        /* Encryption policy for '-o test_dummy_encryption' */
        struct fscrypt_dummy_policy s_dummy_enc_policy;

        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
         * or EXTENTS flag.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
#ifdef CONFIG_EXT4_DEBUG
        /* EXT4_SIM_* code to inject once; see ext4_simulate_fail() */
        unsigned long s_simulate_fail;
#endif
        /* Record the errseq of the backing block device */
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;

        /* Ext4 fast commit stuff */
        atomic_t s_fc_subtid;
        atomic_t s_fc_ineligible_updates;
        /*
         * After commit starts, the main queue gets locked, and the further
         * updates get added in the staging queue.
         */
#define FC_Q_MAIN        0
#define FC_Q_STAGING        1
        struct list_head s_fc_q[2];        /* Inodes staged for fast commit
                                         * that have data changes in them.
                                         */
        struct list_head s_fc_dentry_q[2];        /* directory entry updates */
        unsigned int s_fc_bytes;
        /*
         * Main fast commit lock. This lock protects accesses to the
         * following fields:
         * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
         */
        spinlock_t s_fc_lock;
        struct buffer_head *s_fc_bh;
        struct ext4_fc_stats s_fc_stats;
        u64 s_fc_avg_commit_time;
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        struct ext4_fc_replay_state s_fc_replay_state;
};

/*
 * Map a VFS super block to the ext4 private data hanging off its
 * s_fs_info pointer.
 */
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
        struct ext4_sb_info *sbi = sb->s_fs_info;

        return sbi;
}
/*
 * Map a VFS inode to the ext4_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
{
        struct ext4_inode_info *ei;

        ei = container_of(inode, struct ext4_inode_info, vfs_inode);
        return ei;
}

/*
 * Check whether @ino is an inode number this filesystem accepts.
 *
 * Returns 1 when @ino is the root inode, or when it lies in the closed
 * range [EXT4_FIRST_INO(sb), s_inodes_count] taken from the super block;
 * returns 0 otherwise.
 */
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
        /* The root inode is always acceptable. */
        if (ino == EXT4_ROOT_INO)
                return 1;

        /* Everything else below the first regular inode is rejected. */
        if (ino < EXT4_FIRST_INO(sb))
                return 0;

        return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
}

/*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
 * rcu protection to avoid dereferencing an invalid pointer due to reassignment
 * - s_group_desc
 * - s_group_info
 * - s_flex_groups
 *
 * The element is copied into a local inside an rcu_read_lock() /
 * rcu_read_unlock() pair and that copy is the value of the statement
 * expression, so the read-side critical section only covers fetching the
 * array pointer and indexing it, not any later use of the returned element.
 * @sbi and @field are expanded more than once.
 */
#define sbi_array_rcu_deref(sbi, field, index)                                   \
({                                                                           \
        typeof(*((sbi)->field)) _v;                                           \
        rcu_read_lock();                                                   \
        _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];           \
        rcu_read_unlock();                                                   \
        _v;                                                                   \
})

/*
 * run-time mount flags
 *
 * These are bit numbers (not masks): ext4_set_mount_flag(),
 * ext4_clear_mount_flag() and ext4_test_mount_flag() pass them to
 * set_bit()/clear_bit()/test_bit() on ext4_sb_info.s_mount_flags.
 */
enum {
        EXT4_MF_MNTDIR_SAMPLED,
        EXT4_MF_FS_ABORTED,        /* Fatal error detected */
        EXT4_MF_FC_INELIGIBLE,        /* Fast commit ineligible */
        EXT4_MF_FC_COMMITTING        /* File system undergoing a fast
                                 * commit.
                                 */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
        set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
        clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
        return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}


/*
 * Simulate_fail codes
 *
 * Values compared against ext4_sb_info.s_simulate_fail by
 * ext4_simulate_fail().  0 is not used as a code: that function resets
 * s_simulate_fail to 0 once a failure has been injected.
 */
#define EXT4_SIM_BBITMAP_EIO        1
#define EXT4_SIM_BBITMAP_CRC        2
#define EXT4_SIM_IBITMAP_EIO        3
#define EXT4_SIM_IBITMAP_CRC        4
#define EXT4_SIM_INODE_EIO        5
#define EXT4_SIM_INODE_CRC        6
#define EXT4_SIM_DIRBLOCK_EIO        7
#define EXT4_SIM_DIRBLOCK_CRC        8

/*
 * One-shot fault injection check.
 *
 * With CONFIG_EXT4_DEBUG: returns true when the code armed in
 * sbi->s_simulate_fail equals @code, and disarms it (stores 0) so the
 * failure is reported only once.  Without CONFIG_EXT4_DEBUG: always
 * returns false.
 */
static inline bool ext4_simulate_fail(struct super_block *sb,
                                     unsigned long code)
{
#ifdef CONFIG_EXT4_DEBUG
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool armed = unlikely(sbi->s_simulate_fail == code);

        if (armed)
                sbi->s_simulate_fail = 0;
        return armed;
#else
        return false;
#endif
}

/*
 * Buffer-head flavour of ext4_simulate_fail(): when @bh is not an error
 * pointer and a failure with @code is due, clear the buffer's uptodate
 * state.  Error pointers are left alone and do not consume the armed code.
 */
static inline void ext4_simulate_fail_bh(struct super_block *sb,
                                         struct buffer_head *bh,
                                         unsigned long code)
{
        if (IS_ERR(bh))
                return;

        if (ext4_simulate_fail(sb, code))
                clear_buffer_uptodate(bh);
}

/*
 * Error number codes for s_{first,last}_error_errcode
 *
 * Linux errno numbers are architecture specific, so we need to translate
 * them into something which is architecture independent.   We don't define
 * codes for all errno's; just the ones which are most likely to be the cause
 * of an ext4_error() call.
 *
 * The target super block fields are a single byte (__u8) each.
 */
#define EXT4_ERR_UNKNOWN         1
#define EXT4_ERR_EIO                 2
#define EXT4_ERR_ENOMEM                 3
#define EXT4_ERR_EFSBADCRC         4
#define EXT4_ERR_EFSCORRUPTED         5
#define EXT4_ERR_ENOSPC                 6
#define EXT4_ERR_ENOKEY                 7
#define EXT4_ERR_EROFS                 8
#define EXT4_ERR_EFBIG                 9
#define EXT4_ERR_EEXIST                10
#define EXT4_ERR_ERANGE                11
#define EXT4_ERR_EOVERFLOW        12
#define EXT4_ERR_EBUSY                13
#define EXT4_ERR_ENOTDIR        14
#define EXT4_ERR_ENOTEMPTY        15
#define EXT4_ERR_ESHUTDOWN        16
#define EXT4_ERR_EFAULT                17

/*
 * Inode dynamic state flags
 *
 * Bit numbers (not masks) for ext4_test_inode_state(),
 * ext4_set_inode_state() and ext4_clear_inode_state().  Where they are
 * stored depends on BITS_PER_LONG: in i_state_flags when it is below 64,
 * otherwise in i_flags at an offset of 32 bits.
 */
enum {
        EXT4_STATE_JDATA,                /* journaled data exists */
        EXT4_STATE_NEW,                        /* inode is newly created */
        EXT4_STATE_XATTR,                /* has in-inode xattrs */
        EXT4_STATE_NO_EXPAND,                /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,        /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,                /* Inode is migrating */
        EXT4_STATE_NEWENTRY,                /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,        /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,        /* extents have been precached */
        EXT4_STATE_LUSTRE_EA_INODE,        /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS,        /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING,        /* Fast commit ongoing */
};

/*
 * Generate the three bit accessors ext4_test_inode_<name>(),
 * ext4_set_inode_<name>() and ext4_clear_inode_<name>().
 *
 * @name:   suffix of the generated function names.
 * @field:  ext4_inode_info member, without its i_ prefix, holding the bits.
 * @offset: added to the caller's bit number before it is handed to
 *          test_bit()/set_bit()/clear_bit().
 */
#define EXT4_INODE_BIT_FNS(name, field, offset)                                \
static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);        \
}                                                                        \
static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}                                                                        \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{                                                                        \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate.
 *
 * The definitions come from the EXT4_INODE_BIT_FNS() expansion below and
 * operate on ext4_inode_info.i_flags with no bit offset. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit);
static inline void ext4_set_inode_flag(struct inode *inode, int bit);
static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
EXT4_INODE_BIT_FNS(flag, flags, 0)

/*
 * State-bit helpers, generated by one of the EXT4_INODE_BIT_FNS(state, ...)
 * invocations below.  The explicit declarations are added only so that
 * these functions can be found by name.  Otherwise, they are very hard to
 * locate.
 */
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
/* BITS_PER_LONG < 64: state bits live in i_state_flags, at offset 0. */
EXT4_INODE_BIT_FNS(state, state_flags, 0)

/* Reset every state bit by zeroing the dedicated i_state_flags word. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        (ei)->i_state_flags = 0;
}
#else
/* Otherwise: state bits share i_flags, starting at bit 32. */
EXT4_INODE_BIT_FNS(state, flags, 32)

/* Intentionally empty in this configuration: nothing is cleared here. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        /* We depend on the fact that callers will set i_flags */
}
#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land. */
#define EXT4_SB(sb)        (sb)
#endif

/*
 * Report whether EXT4_STATE_VERITY_IN_PROGRESS is set on @inode.
 *
 * Always false when CONFIG_FS_VERITY is not enabled; the state bit is not
 * consulted in that case.
 */
static inline bool ext4_verity_in_progress(struct inode *inode)
{
        if (!IS_ENABLED(CONFIG_FS_VERITY))
                return false;

        return ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
}

/*
 * Shorthand for the i_dtime member of EXT4_I(inode).
 * NOTE(review): the name suggests i_dtime doubles as the "next orphan"
 * link while an inode is on the orphan list - confirm against the orphan
 * handling code.
 */
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

/*
 * Codes for operating systems
 */
#define EXT4_OS_LINUX                0
#define EXT4_OS_HURD                1
#define EXT4_OS_MASIX                2
#define EXT4_OS_FREEBSD                3
#define EXT4_OS_LITES                4

/*
 * Revision levels
 */
#define EXT4_GOOD_OLD_REV        0        /* The good old (original) format */
#define EXT4_DYNAMIC_REV        1        /* V2 format w/ dynamic inode sizes */

#define EXT4_CURRENT_REV        EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV        EXT4_DYNAMIC_REV

#define EXT4_GOOD_OLD_INODE_SIZE 128

#define EXT4_EXTRA_TIMESTAMP_MAX        (((s64)1 << 34) - 1  + S32_MIN)
#define EXT4_NON_EXTRA_TIMESTAMP_MAX        S32_MAX
#define EXT4_TIMESTAMP_MIN                S32_MIN

/*
 * Feature set definitions
 */

#define EXT4_FEATURE_COMPAT_DIR_PREALLOC        0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES        0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL                0x0004
#define EXT4_FEATURE_COMPAT_EXT_ATTR                0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE        0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX                0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2        0x0200
/*
 * "FAST_COMMIT" is a compat feature because the FS becomes incompatible
 * only if fast commit blocks are present in the FS. Since we clear the
 * journal (and thus the fast commit blocks), we don't mark the FS as
 * incompatible. We also have a JBD2 incompat feature, which gets set when
 * there are fast commit blocks present in the journal.
 */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT                0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES        0x0800

#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER        0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE        0x0002
#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR        0x0004
#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK        0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE        0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA                0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
/*
 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
 * all other data structures' checksums.  However, the METADATA_CSUM and
 * GDT_CSUM bits are mutually exclusive.
 */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM        0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY                0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT                0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY                0x8000

#define EXT4_FEATURE_INCOMPAT_COMPRESSION        0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
#define EXT4_FEATURE_INCOMPAT_RECOVER                0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV        0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG                0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS                0x0040 /* extents support */
#define EXT4_FEATURE_INCOMPAT_64BIT                0x0080
#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG                0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA                0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_CSUM_SEED                0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA        0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT                0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD                0x20000

extern void ext4_update_dynamic_rev(struct super_block *sb);

/*
 * EXT4_FEATURE_COMPAT_FUNCS(name, flagname) - generate accessors for one
 * compat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), which test, set and clear
 * EXT4_FEATURE_COMPAT_<flagname> in EXT4_SB(sb)->s_es->s_feature_compat.
 * The flag is converted with cpu_to_le32() before being combined with the
 * field.  The set helper calls ext4_update_dynamic_rev() first; the clear
 * helper does not.
 */
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
}

/*
 * EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) - generate accessors for one
 * ro_compat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), which test, set and clear
 * EXT4_FEATURE_RO_COMPAT_<flagname> in
 * EXT4_SB(sb)->s_es->s_feature_ro_compat.  The flag is converted with
 * cpu_to_le32() before being combined with the field.  The set helper
 * calls ext4_update_dynamic_rev() first; the clear helper does not.
 */
#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) - generate accessors for one
 * incompat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), which test, set and clear
 * EXT4_FEATURE_INCOMPAT_<flagname> in
 * EXT4_SB(sb)->s_es->s_feature_incompat.  The flag is converted with
 * cpu_to_le32() before being combined with the field.  The set helper
 * calls ext4_update_dynamic_rev() first; the clear helper does not.
 */
#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_incompat &= \
                ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
}

EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,                DIR_PREALLOC)
EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,        IMAGIC_INODES)
EXT4_FEATURE_COMPAT_FUNCS(journal,                HAS_JOURNAL)
EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode,                RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index,                DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,        SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit,                FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,        STABLE_INODES)

EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,        SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,        LARGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,                BTREE_DIR)
EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,                HUGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,                GDT_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,                DIR_NLINK)
EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,        EXTRA_ISIZE)
EXT4_FEATURE_RO_COMPAT_FUNCS(quota,                QUOTA)
EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,                BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,        METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,                READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project,                PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity,                VERITY)

EXT4_FEATURE_INCOMPAT_FUNCS(compression,        COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype,                FILETYPE)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,        RECOVER)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,        JOURNAL_DEV)
EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,                META_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(extents,                EXTENTS)
EXT4_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
EXT4_FEATURE_INCOMPAT_FUNCS(mmp,                MMP)
EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,                FLEX_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,                EA_INODE)
EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,                DIRDATA)
EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,                CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir,                LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,        INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,                ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold,                CASEFOLD)

#define EXT2_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT3_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT4_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
                                         EXT4_FEATURE_INCOMPAT_EA_INODE| \
                                         EXT4_FEATURE_INCOMPAT_MMP | \
                                         EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
                                         EXT4_FEATURE_INCOMPAT_ENCRYPT | \
                                         EXT4_FEATURE_INCOMPAT_CASEFOLD | \
                                         EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
                                         EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
                                         EXT4_FEATURE_RO_COMPAT_QUOTA |\
                                         EXT4_FEATURE_RO_COMPAT_PROJECT |\
                                         EXT4_FEATURE_RO_COMPAT_VERITY)

/*
 * EXTN_FEATURE_FUNCS(ver) - generate "unknown feature" checks for ext<ver>.
 *
 * Expands to
 *   ext4_has_unknown_ext<ver>_compat_features()
 *   ext4_has_unknown_ext<ver>_ro_compat_features()
 *   ext4_has_unknown_ext<ver>_incompat_features()
 * Each returns true when the matching superblock feature field has at
 * least one bit set that is not part of the corresponding
 * EXT<ver>_FEATURE_*_SUPP mask.
 */
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
}

EXTN_FEATURE_FUNCS(2)
EXTN_FEATURE_FUNCS(3)
EXTN_FEATURE_FUNCS(4)

/* True when any bit at all is set in the superblock's s_feature_compat. */
static inline bool ext4_has_compat_features(struct super_block *sb)
{
        if (EXT4_SB(sb)->s_es->s_feature_compat)
                return true;
        return false;
}
/* True when any bit at all is set in the superblock's s_feature_ro_compat. */
static inline bool ext4_has_ro_compat_features(struct super_block *sb)
{
        if (EXT4_SB(sb)->s_es->s_feature_ro_compat)
                return true;
        return false;
}
/* True when any bit at all is set in the superblock's s_feature_incompat. */
static inline bool ext4_has_incompat_features(struct super_block *sb)
{
        if (EXT4_SB(sb)->s_es->s_feature_incompat)
                return true;
        return false;
}

/*
 * Superblock flags
 */
#define EXT4_FLAGS_RESIZING        0
#define EXT4_FLAGS_SHUTDOWN        1
#define EXT4_FLAGS_BDEV_IS_DAX        2

/*
 * Test the EXT4_FLAGS_SHUTDOWN bit in @sbi->s_ext4_flags.
 * Returns the test_bit() result: non-zero when the bit is set.
 */
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
        int shut_down = test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);

        return shut_down;
}


/*
 * Default values for user and/or group using reserved blocks
 */
#define        EXT4_DEF_RESUID                0
#define        EXT4_DEF_RESGID                0

/*
 * Default project ID
 */
#define        EXT4_DEF_PROJID                0

#define EXT4_DEF_INODE_READAHEAD_BLKS        32

/*
 * Default mount options
 */
#define EXT4_DEFM_DEBUG                0x0001
#define EXT4_DEFM_BSDGROUPS        0x0002
#define EXT4_DEFM_XATTR_USER        0x0004
#define EXT4_DEFM_ACL                0x0008
#define EXT4_DEFM_UID16                0x0010
#define EXT4_DEFM_JMODE                0x0060
#define EXT4_DEFM_JMODE_DATA        0x0020
#define EXT4_DEFM_JMODE_ORDERED        0x0040
#define EXT4_DEFM_JMODE_WBACK        0x0060
#define EXT4_DEFM_NOBARRIER        0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD        0x0400
#define EXT4_DEFM_NODELALLOC        0x0800

/*
 * Default journal batch times
 */
#define EXT4_DEF_MIN_BATCH_TIME        0
#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */

/*
 * Minimum number of groups in a flexgroup before we separate out
 * directories into the first block group of a flexgroup
 */
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4

/*
 * Structure of a directory entry
 */
#define EXT4_NAME_LEN 255
/*
 * Base length of the ext4 directory entry excluding the name length
 */
#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)

/*
 * Directory entry layout with a 16-bit name_len and no file_type field
 * (compare struct ext4_dir_entry_2).  Multi-byte fields are little-endian.
 */
struct ext4_dir_entry {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __le16        name_len;                /* Name length */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in little-endian (Intel) byte order, and the name_len field can
 * never be bigger than 255 (EXT4_NAME_LEN), the upper byte of the old
 * 16-bit name_len is always zero, so it is safe to reclaim that byte for
 * the file_type field.
 */
struct ext4_dir_entry_2 {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __u8        name_len;                /* Name length */
        __u8        file_type;                /* See file type macros EXT4_FT_* below */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.  Its first four fields line up with those of
 * struct ext4_dir_entry_2 (inode, rec_len, name_len, file_type); the
 * checksum sits where a name would otherwise start.
 */
struct ext4_dir_entry_tail {
        __le32        det_reserved_zero1;        /* Pretend to be unused */
        __le16        det_rec_len;                /* 12, the size of this structure */
        __u8        det_reserved_zero2;        /* Zero name length */
        __u8        det_reserved_ft;        /* 0xDE (EXT4_FT_DIR_CSUM), fake file type */
        __le32        det_checksum;                /* crc32c(uuid+inum+dirblock) */
};

/*
 * EXT4_DIRENT_TAIL(block, blocksize) - pointer to the struct
 * ext4_dir_entry_tail occupying the last sizeof(struct ext4_dir_entry_tail)
 * bytes of the @blocksize-byte buffer that starts at @block.
 *
 * Performs arithmetic on a void pointer, which relies on the GNU C
 * extension that treats sizeof(void) as 1.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
        ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
                                        ((blocksize) - \
                                         sizeof(struct ext4_dir_entry_tail))))

/*
 * Ext4 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT4_FT_UNKNOWN                0
#define EXT4_FT_REG_FILE        1
#define EXT4_FT_DIR                2
#define EXT4_FT_CHRDEV                3
#define EXT4_FT_BLKDEV                4
#define EXT4_FT_FIFO                5
#define EXT4_FT_SOCK                6
#define EXT4_FT_SYMLINK                7

#define EXT4_FT_MAX                8

#define EXT4_FT_DIR_CSUM        0xDE

/*
 * EXT4_DIR_PAD defines the directory entries boundaries
 *
 * NOTE: It must be a multiple of 4
 */
#define EXT4_DIR_PAD                        4
#define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
#define EXT4_DIR_REC_LEN(name_len)        (((name_len) + 8 + EXT4_DIR_ROUND) & \
                                         ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN                ((1<<16)-1)

/*
 * If we ever get support for fs block sizes > page_size, we'll need
 * to remove the #if statements in the next two functions...
 */
/*
 * Decode an on-disk (little-endian, 16-bit) rec_len value.
 *
 * When PAGE_SIZE is below 64KiB the stored value is the length itself.
 * With larger pages, 0 and EXT4_MAX_REC_LEN both decode to @blocksize; for
 * any other value the two low bits of the stored value supply bits 16-17
 * of the length and the remaining bits are used unchanged.
 */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
        unsigned raw = le16_to_cpu(dlen);

#if (PAGE_SIZE >= 65536)
        if (raw != 0 && raw != EXT4_MAX_REC_LEN)
                return ((raw & 3) << 16) | (raw & 65532);
        return blocksize;
#else
        return raw;
#endif
}

/*
 * Encode a record length for storage in the 16-bit on-disk rec_len field.
 *
 * BUG()s when @len is not a multiple of 4, exceeds @blocksize, or when
 * @blocksize exceeds 256KiB (1 << 18).
 *
 * Lengths below 64KiB are stored as they are.  With PAGE_SIZE >= 64KiB,
 * a length equal to @blocksize is stored as EXT4_MAX_REC_LEN for a 64KiB
 * block and as 0 otherwise; any other large length has its bits 16-17
 * folded into the two low bits.
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
        if ((len & 3) || (blocksize > (1 << 18)) || (len > blocksize))
                BUG();
#if (PAGE_SIZE >= 65536)
        if (len >= 65536) {
                if (len != blocksize)
                        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
                if (blocksize == 65536)
                        return cpu_to_le16(EXT4_MAX_REC_LEN);
                return cpu_to_le16(0);
        }
#endif
        return cpu_to_le16(len);
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \
                    !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir)))
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/* Legal values for the dx_root hash_version field: */

#define DX_HASH_LEGACY                        0
#define DX_HASH_HALF_MD4                1
#define DX_HASH_TEA                        2
#define DX_HASH_LEGACY_UNSIGNED                3
#define DX_HASH_HALF_MD4_UNSIGNED        4
#define DX_HASH_TEA_UNSIGNED                5

/*
 * Run @length bytes starting at @address through the shash transform in
 * @sbi->s_chksum_driver, seeding the 4-byte context with @crc, and return
 * the contents of that context afterwards.
 *
 * BUG()s if the driver's descriptor size is not exactly 4 bytes or if
 * crypto_shash_update() reports an error.
 */
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
                              const void *address, unsigned int length)
{
        /* ctx must directly follow shash: it is the descriptor's context. */
        struct {
                struct shash_desc shash;
                char ctx[4];
        } state;
        u32 *crc_slot = (u32 *)state.ctx;
        int err;

        BUG_ON(sizeof(state.ctx) != crypto_shash_descsize(sbi->s_chksum_driver));

        state.shash.tfm = sbi->s_chksum_driver;
        *crc_slot = crc;

        err = crypto_shash_update(&state.shash, address, length);
        BUG_ON(err);

        return *crc_slot;
}

#ifdef __KERNEL__

/*
 * hash info structure used by the directory hash
 *
 * NOTE(review): field meanings below are inferred from names only;
 * confirm against the directory hash implementation.
 */
struct dx_hash_info
{
        u32                hash;                /* presumably the primary hash value */
        u32                minor_hash;        /* presumably the secondary hash value */
        int                hash_version;        /* presumably one of the DX_HASH_* values */
        u32                *seed;                /* presumably the hash seed, may be unset - TODO confirm */
};


/* 32 and 64 bit signed EOF for dx directories */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)


/*
 * Control parameters used by ext4_htree_next_block
 */
#define HASH_NB_ALWAYS                1

/*
 * A filename as handled by the ext4_fname_*() helpers in this header:
 * filled in by ext4_fname_setup_filename() / ext4_fname_prepare_lookup()
 * and released with ext4_fname_free_filename().
 */
struct ext4_filename {
        /* Name as supplied by the caller (the qstr passed to setup). */
        const struct qstr *usr_fname;
        /* Name/length pair read through fname_name() and fname_len(). */
        struct fscrypt_str disk_name;
        /* Directory hash state associated with this name. */
        struct dx_hash_info hinfo;
#ifdef CONFIG_FS_ENCRYPTION
        /* Copied from struct fscrypt_name; handed to fscrypt_free_filename(). */
        struct fscrypt_str crypto_buf;
#endif
#ifdef CONFIG_UNICODE
        /* Set up by ext4_fname_setup_ci_filename(); name is kfree()d on release. */
        struct fscrypt_str cf_name;
#endif
};

#define fname_name(p) ((p)->disk_name.name)
#define fname_len(p)  ((p)->disk_name.len)

/*
 * Describe an inode's exact location on disk and in memory
 *
 * ext4_raw_inode() turns this into a pointer to the raw inode by adding
 * @offset to @bh->b_data.
 */
struct ext4_iloc
{
        struct buffer_head *bh;                /* buffer whose b_data contains the raw inode */
        unsigned long offset;                /* byte offset of the raw inode within bh->b_data */
        ext4_group_t block_group;        /* presumably the inode's block group - TODO confirm */
};

/*
 * Return a pointer to the raw inode described by @iloc: the address
 * @iloc->offset bytes into the data of @iloc->bh.
 */
static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
{
        char *base = iloc->bh->b_data;

        return (struct ext4_inode *)(base + iloc->offset);
}

/*
 * True when @inode satisfies IS_NOQUOTA() and does not have
 * EXT4_EA_INODE_FL set in its ext4 i_flags.
 */
static inline bool ext4_is_quota_file(struct inode *inode)
{
        if (!IS_NOQUOTA(inode))
                return false;

        return (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) == 0;
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 *
 * NOTE(review): the per-field notes below are inferred from names and
 * types only; confirm against the readdir code that uses this structure.
 */
struct dir_private_info {
        struct rb_root        root;                /* root of a red-black tree */
        struct rb_node        *curr_node;        /* presumably the current node within @root */
        struct fname        *extra_fname;
        loff_t                last_pos;        /* presumably the last file position handled */
        __u32                curr_hash;
        __u32                curr_minor_hash;
        __u32                next_hash;
};

/*
 * Compute the number of the first block in group @group_no:
 * group_no * blocks-per-group, plus the superblock's s_first_data_block.
 * The product is formed in ext4_fsblk_t width.
 */
static inline ext4_fsblk_t
ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
{
        ext4_fsblk_t per_group = EXT4_BLOCKS_PER_GROUP(sb);
        ext4_fsblk_t first_data =
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);

        return group_no * per_group + first_data;
}

/*
 * Special error return code only used by dx_probe() and its callers.
 */
#define ERR_BAD_DX_DIR        (-(MAX_ERRNO - 1))

/* htree levels for ext4 */
#define        EXT4_HTREE_LEVEL_COMPAT        2
#define        EXT4_HTREE_LEVEL        3

/*
 * Number of htree levels for @sb: EXT4_HTREE_LEVEL when the largedir
 * feature is set, EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
        if (ext4_has_feature_largedir(sb))
                return EXT4_HTREE_LEVEL;

        return EXT4_HTREE_LEVEL_COMPAT;
}

/*
 * Timeout and state flag for lazy initialization inode thread.
 */
#define EXT4_DEF_LI_WAIT_MULT                        10
#define EXT4_DEF_LI_MAX_START_DELAY                5
#define EXT4_LAZYINIT_QUIT                        0x0001
#define EXT4_LAZYINIT_RUNNING                        0x0002

/*
 * Lazy inode table initialization info
 *
 * NOTE(review): field roles below are inferred from names; confirm
 * against the lazy-init thread implementation.
 */
struct ext4_lazy_init {
        unsigned long                li_state;        /* presumably EXT4_LAZYINIT_* flags */
        struct list_head        li_request_list; /* presumably the list of struct ext4_li_request */
        struct mutex                li_list_mtx;        /* presumably protects li_request_list */
};

/*
 * Mode of a lazy-init request; stored in ext4_li_request.lr_mode.
 * NOTE(review): from the names, one mode prefetches block bitmaps and the
 * other initializes inode tables - confirm against the lazy-init thread.
 */
enum ext4_li_mode {
        EXT4_LI_MODE_PREFETCH_BBITMAP,
        EXT4_LI_MODE_ITABLE,
};

/*
 * One lazy-init request.
 * NOTE(review): field roles below are inferred from names and types;
 * confirm against the lazy-init thread implementation.
 */
struct ext4_li_request {
        struct super_block        *lr_super;        /* filesystem the request refers to */
        enum ext4_li_mode        lr_mode;        /* which kind of work, see enum ext4_li_mode */
        ext4_group_t                lr_first_not_zeroed;
        ext4_group_t                lr_next_group;        /* presumably the next group to process */
        struct list_head        lr_request;        /* list linkage */
        unsigned long                lr_next_sched;        /* presumably the next run time - units TODO confirm */
        unsigned long                lr_timeout;
};

/*
 * A kobject paired with a completion.
 * NOTE(review): presumably f_kobj_unregister is completed when f_kobj is
 * released, so teardown can wait for it - confirm against the sysfs code.
 */
struct ext4_features {
        struct kobject f_kobj;
        struct completion f_kobj_unregister;
};

/*
 * This structure will be used for multiple mount protection. It will be
 * written into the block number saved in the s_mmp_block field in the
 * superblock. Programs that check MMP should assume that if
 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
 * to use the filesystem, regardless of how old the timestamp is.
 */
#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */

/*
 * Layout of the multiple mount protection (MMP) block.  The field sizes
 * add up to 1024 bytes, with mmp_pad2 sized so that mmp_checksum occupies
 * the final 4 bytes.  Multi-byte integer fields are little-endian.
 */
struct mmp_struct {
        __le32        mmp_magic;                /* Magic number for MMP */
        __le32        mmp_seq;                /* Sequence no. updated periodically */

        /*
         * mmp_time, mmp_nodename & mmp_bdevname are only used for information
         * purposes and do not affect the correctness of the algorithm
         */
        __le64        mmp_time;                /* Time last updated */
        char        mmp_nodename[64];        /* Node which last updated MMP block */
        char        mmp_bdevname[32];        /* Bdev which last updated MMP block */

        /*
         * mmp_check_interval is used to verify if the MMP block has been
         * updated on the block device. The value is updated based on the
         * maximum time to write the MMP block during an update cycle.
         */
        __le16        mmp_check_interval;

        __le16        mmp_pad1;                /* padding */
        __le32        mmp_pad2[226];                /* padding up to the checksum */
        __le32        mmp_checksum;                /* crc32c(uuid+mmp_block) */
};

/*
 * Arguments passed to the mmp thread: the buffer head obtained from the
 * initial MMP block read, and the filesystem it belongs to.
 */
struct mmpd_data {
        struct buffer_head *bh; /* bh from initial read_mmp_block() */
        struct super_block *sb;  /* super block of the fs */
};

/*
 * Check interval multiplier
 * The MMP block is written every update interval and initially checked every
 * update interval x the multiplier (the value is then adapted based on the
 * write latency). The reason is that writes can be delayed under load and we
 * don't want readers to incorrectly assume that the filesystem is no longer
 * in use.
 */
#define EXT4_MMP_CHECK_MULT                2UL

/*
 * Minimum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MIN_CHECK_INTERVAL        5UL

/*
 * Maximum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MAX_CHECK_INTERVAL        300UL

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext4 source programs needs to include it so they are duplicated here.
 */
# define NORET_TYPE        /**/
# define ATTRIB_NORET        __attribute__((noreturn))
# define NORET_AND        noreturn,

/* bitmap.c */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz);
int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz);
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);

/* balloc.c */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
                                         ext4_fsblk_t blocknr,
                                         ext4_group_t *blockgrpp,
                                         ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);

extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
                        ext4_fsblk_t blocknr);
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         ext4_fsblk_t goal,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                                    s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                                   ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
                                                ext4_group_t block_group,
                                                bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

#ifdef CONFIG_UNICODE
extern void ext4_fname_setup_ci_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         struct fscrypt_str *fname);
#endif

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Initialise @dst from the fscrypt name @src.
 *
 * @dst is zeroed first, so every field not copied below (the rest of
 * hinfo, and cf_name when present) starts out cleared.
 */
static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
                                                const struct fscrypt_name *src)
{
        memset(dst, 0, sizeof(struct ext4_filename));

        dst->crypto_buf = src->crypto_buf;
        dst->disk_name = src->disk_name;
        dst->usr_fname = src->usr_fname;
        dst->hinfo.minor_hash = src->minor_hash;
        dst->hinfo.hash = src->hash;
}

static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_setup_filename(dir, iname, lookup, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
#endif
        return 0;
}

static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_prepare_lookup(dir, dentry, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
#endif
        return 0;
}

static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
        struct fscrypt_name name;

        name.crypto_buf = fname->crypto_buf;
        fscrypt_free_filename(&name);

        fname->crypto_buf.name = NULL;
        fname->usr_fname = NULL;
        fname->disk_name.name = NULL;

#ifdef CONFIG_UNICODE
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
#endif
}
#else /* !CONFIG_FS_ENCRYPTION */
/*
 * Build @fname for @iname in directory @dir (variant without
 * CONFIG_FS_ENCRYPTION).
 *
 * The on-disk name simply aliases @iname's bytes and length, and usr_fname
 * points at @iname itself.  With CONFIG_UNICODE the case-insensitive name
 * is set up too.  @lookup is not used by this variant.
 *
 * Always returns 0.
 */
static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        fname->disk_name.len = iname->len;
        fname->disk_name.name = (unsigned char *) iname->name;
        fname->usr_fname = iname;

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
#endif

        return 0;
}

/*
 * Lookup flavour of ext4_fname_setup_filename() for builds without
 * CONFIG_FS_ENCRYPTION: uses the dentry's name and passes lookup == 1.
 *
 * Returns whatever ext4_fname_setup_filename() returns.
 */
static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        const struct qstr *iname = &dentry->d_name;

        return ext4_fname_setup_filename(dir, iname, 1, fname);
}

/*
 * Release what the setup helpers attached to @fname (variant without
 * CONFIG_FS_ENCRYPTION).  Only the case-folded name exists in this
 * configuration, and only when CONFIG_UNICODE is set; it is kfree()d and
 * its pointer cleared.  Without CONFIG_UNICODE this function is empty.
 */
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
#ifdef CONFIG_UNICODE
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
#endif
}
#endif /* !CONFIG_FS_ENCRYPTION */

/* dir.c */
/*
 * Directory entry checker.  Normally reached through the
 * ext4_check_dir_entry() macro, which fills in the first two arguments
 * with __func__ and __LINE__.  The remaining arguments are, in macro
 * order: dir inode, file, entry, buffer head, buffer, size and offset.
 */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, char *, int,
                                  unsigned int);
/*
 * Call __ext4_check_dir_entry() with the caller's function name and line
 * number; the result is wrapped in unlikely().
 */
#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset)        \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__,                \
                                        (dir), (filp), (de), (bh),        \
                                        (buf), (size), (offset)))
/*
 * Directory helper prototypes (definitions not visible here).
 * ext4_find_dest_de() reports its result through @dest_de;
 * ext4_insert_dentry() takes the target entry and the prepared filename.
 * NOTE(review): behaviour inferred from the signatures only.
 */
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                __u32 minor_hash,
                                struct ext4_dir_entry_2 *dirent,
                                struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                             struct buffer_head *bh,
                             void *buf, int buf_size,
                             struct ext4_filename *fname,
                             struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname);
/*
 * Clear EXT4_INODE_INDEX on @inode when the filesystem does not have the
 * dir_index feature.  Nothing happens if the feature is present or the
 * flag is not set.  A one-time warning fires if metadata_csum is enabled
 * when the flag has to be cleared.
 */
static inline void ext4_update_dx_flag(struct inode *inode)
{
        if (ext4_has_feature_dir_index(inode->i_sb))
                return;
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return;

        /* ext4_iget() should have caught this... */
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
 * Lookup table used by get_dtype(): indexed by the on-disk file type value,
 * yields the matching DT_* constant.  Index 0 maps to DT_UNKNOWN.
 */
static const unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * Translate an on-disk file type into a DT_* value.
 *
 * @sb:       superblock, consulted for the filetype feature
 * @filetype: on-disk file type value
 *
 * Returns DT_UNKNOWN when the filesystem lacks the filetype feature or
 * when @filetype is outside [0, EXT4_FT_MAX); otherwise the table entry.
 * The explicit lower bound matters because @filetype is a signed int: a
 * negative value would pass the upper-bound check alone and index before
 * the start of ext4_filetype_table.
 */
static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
        if (!ext4_has_feature_filetype(sb) ||
            filetype < 0 || filetype >= EXT4_FT_MAX)
                return DT_UNKNOWN;

        return ext4_filetype_table[filetype];
}
/* Prototype only; takes a directory inode, a buffer head and a buffer. */
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size);

/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);

/* hash.c */
/* Result is delivered through @hinfo (non-const); inputs are const. */
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                          struct dx_hash_info *hinfo);

/* ialloc.c */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
/*
 * Full-argument inode allocator.  Callers normally go through the
 * ext4_new_inode() / ext4_new_inode_start_handle() macros, which fill in
 * the trailing handle_type, line_no and nblocks arguments.
 */
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
                                      const struct qstr *qstr, __u32 goal,
                                      uid_t *owner, __u32 i_flags,
                                      int handle_type, unsigned int line_no,
                                      int nblocks);

/*
 * Allocate an inode under an existing handle: forwards to
 * __ext4_new_inode() with handle_type, line_no and nblocks all 0.
 */
#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)        \
        __ext4_new_inode((handle), (dir), (mode), (qstr), (goal),        \
                         (owner), (i_flags), 0, 0, 0)
/*
 * Allocate an inode without an existing handle: forwards to
 * __ext4_new_inode() with a NULL handle, i_flags 0, the given handle type
 * and block count, and the caller's __LINE__.
 */
#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner,        \
                                    type, nblocks)                        \
        __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal),                \
                         (owner), 0, (type), __LINE__, (nblocks))


/*
 * Further inode-allocation prototypes; definitions are not visible here.
 * NOTE(review): semantics are implied by the names only -- confirm in the
 * implementation before relying on them.
 */
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
                                 ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);

/* fast_commit.c */
/*
 * Fast-commit interface (prototypes only).  The __ext4_fc_track_*()
 * variants take the inode explicitly in addition to the dentry; the
 * unprefixed variants take only the dentry.
 * NOTE(review): the relationship between the two sets is presumed from the
 * signatures -- confirm in the implementation.
 */
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                            struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
void ext4_fc_start_ineligible(struct super_block *sb, int reason);
void ext4_fc_stop_ineligible(struct super_block *sb);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
                           ext4_lblk_t lblk, ext4_fsblk_t pblk,
                           int len, int replay);

/* mballoc.c */
/*
 * Multi-block allocator interface: two tunables exported as plain longs
 * (ext4_mb_stats, ext4_mb_max_to_scan), init/release entry points, and the
 * allocation, free, prefetch and trim prototypes.  Definitions are not
 * visible in this header.
 */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int nr, int *cnt);
extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                                  unsigned int nr);

extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
                                   ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                       int len, int state);

/* inode.c */
/*
 * Inode, block-mapping and buffer helpers (prototypes only).
 * ext4_walk_page_buffers() takes a callback @fn that receives the handle
 * and one buffer head; do_journal_get_write_access() has exactly that
 * signature.
 */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle,
                                     struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle,
                                struct buffer_head *bh);
/*
 * NOTE(review): these look like flag/return values used by the delayed
 * allocation write path; their consumers are not visible in this header --
 * confirm before documenting further.
 */
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA         2

/*
 * Flags for __ext4_iget() / ext4_iget().  Each flag other than
 * EXT4_IGET_NORMAL occupies its own bit.
 */
typedef enum {
        EXT4_IGET_NORMAL   = 0,                /* no flag bits set */
        EXT4_IGET_SPECIAL  = 1 << 0,        /* OK to iget a system inode */
        EXT4_IGET_HANDLE   = 1 << 1,        /* Inode # is from a handle */
        EXT4_IGET_BAD      = 1 << 2,        /* Allow to iget a bad inode */
        EXT4_IGET_EA_INODE = 1 << 3        /* Inode should contain an EA value */
} ext4_iget_flags;

/*
 * Inode lookup by number.  @flags is a combination of ext4_iget_flags;
 * @function and @line identify the call site.
 */
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                                 ext4_iget_flags flags, const char *function,
                                 unsigned int line);

/* Convenience wrapper that passes the caller's __func__ and __LINE__. */
#define ext4_iget(sb, ino, flags) \
        __ext4_iget((sb), (ino), (flags), __func__, __LINE__)

/*
 * Remaining inode-level entry points (prototypes only): attribute
 * handling, eviction, truncation and hole punching, delayed-allocation
 * accounting and page-fault handlers.  Definitions are not visible here.
 */
extern int  ext4_write_inode(struct inode *, struct writeback_control *);
extern int  ext4_setattr(struct dentry *, struct iattr *);
extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int  ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
                              ext4_fsblk_t pblk, ext4_lblk_t len);

/* indirect.c */
/* Prototypes for the indirect-block mapping code (definitions elsewhere). */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                                 ext4_lblk_t start, ext4_lblk_t end);

/* ioctl.c */
/* Both ioctl entry points take (file, command, argument). */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
extern void ext4_reset_inode_seed(struct inode *inode);

/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);

/* namei.c */
/*
 * Directory/name operations (prototypes only).  ext4_search_dir() reports
 * the entry it finds through @res_dir; ext4_htree_fill_tree() reports the
 * next hash through @next_hash.
 */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
                                     struct buffer_head *bh);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
                           char *search_buf,
                           int buf_size,
                           struct inode *dir,
                           struct ext4_filename *fname,
                           unsigned int offset,
                           struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(struct inode *dir,
                                     struct ext4_dir_entry_2 *de_del,
                                     struct buffer_head *bh,
                                     void *entry_buf,
                                     int buf_size,
                                     int csum_size);
extern bool ext4_empty_dir(struct inode *inode);

/* resize.c */
/* Online-resize entry points (prototypes only). */
extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
                                struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);

/* super.c */
/*
 * Superblock-level helpers (prototypes only): buffer reads, overhead and
 * checksum maintenance, error decoding.  ext4_decode_error() is given a
 * caller-provided 16-byte scratch buffer @nbuf.
 */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
                                         sector_t block, int op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                                   sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
                                bh_end_io_t *end_io);
extern int ext4_read_bh(struct buffer_head *bh, int op_flags,
                        bh_end_io_t *end_io);
extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                              ext4_group_t block_group);

/*
 * Low-level error/warning/message reporters.  Each __printf(m, n)
 * annotation names the position of the format-string argument (m) and of
 * the first variadic argument (n) -- presumably the kernel's
 * format(printf, m, n) attribute wrapper.  Callers normally use the
 * ext4_error()/ext4_warning()/... macros, which supply the function name
 * and line number arguments.
 */
extern __printf(6, 7)
void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
                  const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
extern __printf(5, 6)
void __ext4_abort(struct super_block *, const char *, unsigned int, int,
                  const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
extern __printf(4, 5)
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...);
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);

/*
 * Call-site helpers for inode and file errors.  All of them record the
 * caller's __func__ and __LINE__.
 *
 * EXT4_ERROR_INODE:       block 0, goes through ext4_error_inode()
 * EXT4_ERROR_INODE_ERR:   block 0, explicit error code
 * ext4_error_inode_block: explicit block and error code
 * EXT4_ERROR_FILE:        explicit block, goes through ext4_error_file()
 */
#define EXT4_ERROR_INODE(inode, fmt, ...) \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ##__VA_ARGS__)

#define EXT4_ERROR_INODE_ERR(inode, err, fmt, ...)                        \
        __ext4_error_inode((inode), __func__, __LINE__, 0, (err),        \
                           (fmt), ##__VA_ARGS__)

#define ext4_error_inode_block(inode, block, err, fmt, ...)                \
        __ext4_error_inode((inode), __func__, __LINE__, (block), (err),        \
                           (fmt), ##__VA_ARGS__)

#define EXT4_ERROR_FILE(file, block, fmt, ...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt),        \
                        ##__VA_ARGS__)

#ifdef CONFIG_PRINTK

/*
 * CONFIG_PRINTK variants: forward the format string and its arguments to
 * the __ext4_*() reporters.  Wrappers that do not take func/line
 * parameters fill them in with __func__ and __LINE__.  ext4_error_inode()
 * passes 0 in the error-code position; ext4_error() passes 0 for both the
 * error code and the following __u64 argument.
 */
#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
        __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
        __ext4_error_inode((inode), (func), (line), (block),                 \
                           (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)                                \
        __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
#define ext4_abort(sb, err, fmt, ...)                                        \
        __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)                                        \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)                                \
        __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...)                                \
        __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
        __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
                                fmt, ##__VA_ARGS__)

#else

/*
 * !CONFIG_PRINTK variants: the format string and its arguments are handed
 * to no_printk() (presumably so they are still type-checked without
 * producing output -- confirm against no_printk()'s definition), and the
 * __ext4_*() reporter is still called, but with an empty function name,
 * line 0 and a " " format.  The func/line/level/msg parameters are
 * therefore ignored here; sb, inode/file, block and err are still passed.
 */
#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, 0, " ");                \
} while (0)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, err, " ");                \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_file(file, "", 0, block, " ");                        \
} while (0)
#define ext4_error(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, 0, 0, " ");                                \
} while (0)
#define ext4_error_err(sb, err, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, err, 0, " ");                                \
} while (0)
#define ext4_abort(sb, err, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_abort(sb, "", 0, err, " ");                                \
} while (0)
#define ext4_warning(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning(sb, "", 0, " ");                                        \
} while (0)
#define ext4_warning_inode(inode, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning_inode(inode, "", 0, " ");                        \
} while (0)
#define ext4_msg(sb, level, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_msg(sb, "", " ");                                        \
} while (0)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, "", 0, "")
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                \
        __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");        \
} while (0)

#endif

/*
 * Group-descriptor accessors (prototypes only).  The getters take the
 * superblock and a descriptor and return either a block number
 * (ext4_fsblk_t) or a 32-bit count; each has a matching *_set() that takes
 * the new value as its last argument.  The checksum verify/set pair and
 * the lazy-init request registration follow.
 */
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
extern __u32 ext4_free_group_clusters(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
                                struct ext4_group_desc *bg);
extern __u32 ext4_itable_unused_count(struct super_block *sb,
                                   struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_free_group_clusters_set(struct super_block *sb,
                                         struct ext4_group_desc *bg,
                                         __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
                                     struct ext4_group_desc *gdp);
extern int ext4_register_li_request(struct super_block *sb,
                                    ext4_group_t first_not_zeroed);

/*
 * Report whether metadata checksumming is usable on @sb.
 *
 * Returns 1 only when the metadata_csum feature is set and a checksum
 * driver is attached to the superblock info, 0 otherwise.  Having the
 * feature without a driver triggers a one-time warning.
 */
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
                     EXT4_SB(sb)->s_chksum_driver == NULL);

        if (!ext4_has_feature_metadata_csum(sb))
                return 0;

        return EXT4_SB(sb)->s_chksum_driver != NULL;
}

/*
 * Report whether group descriptors carry a checksum on @sb: true (1) if
 * the gdt_csum feature is set or metadata checksumming is usable,
 * otherwise 0.
 */
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
        if (ext4_has_feature_gdt_csum(sb))
                return 1;

        return ext4_has_metadata_csum(sb) != 0;
}

/*
 * Read a superblock value stored as a <name>_lo / <name>_hi pair of
 * little-endian 32-bit fields.
 *
 * @es:   pointer expression yielding the on-disk superblock
 * @name: field name prefix; <name>_lo and <name>_hi are accessed
 *
 * The high half only contributes when the 64BIT incompat feature bit is
 * set in @es; otherwise the result is just the low 32 bits.  Every use of
 * @es is parenthesized so that any pointer expression (e.g. "p + 1") can
 * be passed safely.  Note that @es is evaluated more than once.
 */
#define ext4_read_incompat_64bit_val(es, name) \
        (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
                ? (ext4_fsblk_t)le32_to_cpu((es)->name##_hi) << 32 : 0) | \
                le32_to_cpu((es)->name##_lo))

/*
 * Return the s_blocks_count value of @es, combining the _lo/_hi halves via
 * ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t count = ext4_read_incompat_64bit_val(es, s_blocks_count);

        return count;
}

/*
 * Return the s_r_blocks_count value of @es, combining the _lo/_hi halves
 * via ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t count = ext4_read_incompat_64bit_val(es, s_r_blocks_count);

        return count;
}

/*
 * Return the s_free_blocks_count value of @es, combining the _lo/_hi
 * halves via ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t count =
                ext4_read_incompat_64bit_val(es, s_free_blocks_count);

        return count;
}

/*
 * Store @blk into @es as s_blocks_count: the upper 32 bits go to the _hi
 * field and the lower 32 bits to the _lo field, both little-endian.
 */
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
                                         ext4_fsblk_t blk)
{
        es->s_blocks_count_hi = cpu_to_le32(blk >> 32);
        es->s_blocks_count_lo = cpu_to_le32((u32)blk);
}

/*
 * Store @blk into @es as s_free_blocks_count: the upper 32 bits go to the
 * _hi field and the lower 32 bits to the _lo field, both little-endian.
 */
static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
                                              ext4_fsblk_t blk)
{
        es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32);
        es->s_free_blocks_count_lo = cpu_to_le32((u32)blk);
}

/*
 * Store @blk into @es as s_r_blocks_count: the upper 32 bits go to the _hi
 * field and the lower 32 bits to the _lo field, both little-endian.
 */
static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
                                           ext4_fsblk_t blk)
{
        es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
        es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
}

/*
 * Decode the file size stored in an on-disk inode.
 *
 * The low 32 bits always come from i_size_lo.  i_size_high is added as the
 * upper 32 bits only when the filesystem has the largedir feature or the
 * inode is a regular file.
 */
static inline loff_t ext4_isize(struct super_block *sb,
                                struct ext4_inode *raw_inode)
{
        loff_t size = le32_to_cpu(raw_inode->i_size_lo);

        if (ext4_has_feature_largedir(sb) ||
            S_ISREG(le16_to_cpu(raw_inode->i_mode)))
                size |= (loff_t)le32_to_cpu(raw_inode->i_size_high) << 32;

        return size;
}

/*
 * Encode @i_size into an on-disk inode: upper 32 bits into i_size_high,
 * lower 32 bits into i_size_lo, both little-endian.
 */
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
{
        raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
        raw_inode->i_size_lo = cpu_to_le32(i_size);
}

/*
 * Return the current number of block groups.
 *
 * A read of s_groups_count must be followed by smp_rmb(); this helper does
 * both.  The locking protocol is described in the comments of
 * ext4_group_add() in resize.c.
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
        ext4_group_t count;

        count = EXT4_SB(sb)->s_groups_count;
        smp_rmb();

        return count;
}

/*
 * Map a block group number to its flex group: @block_group shifted right
 * by s_log_groups_per_flex.
 */
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                             ext4_group_t block_group)
{
        ext4_group_t flex = block_group;

        flex >>= sbi->s_log_groups_per_flex;
        return flex;
}

/*
 * Number of block groups per flex group: 2 to the power of
 * s_log_groups_per_flex.
 *
 * The shift is done on an unsigned constant so that it is performed in
 * the function's return type; with a plain (signed) 1 a shift count of 31
 * would overflow int, which is undefined behaviour.
 */
static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
{
        return 1U << sbi->s_log_groups_per_flex;
}

/*
 * Maximum file size applicable to @inode: the superblock's s_maxbytes when
 * the inode has EXT4_INODE_EXTENTS set, otherwise the ext4-specific
 * s_bitmap_maxbytes limit.
 */
static inline loff_t ext4_get_maxbytes(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;

        return inode->i_sb->s_maxbytes;
}

/*
 * Report @errno through __ext4_std_error() when it is non-zero, tagging
 * the report with the caller's function name and line number.
 *
 * The @errno expression is evaluated exactly once and cached in a local,
 * so arguments with side effects (function calls, increments) behave the
 * same whether or not an error is reported.  @sb is only evaluated when
 * the error is non-zero.
 */
#define ext4_std_error(sb, errno)                                \
do {                                                                \
        int __ext4_std_err = (errno);                                \
        if (__ext4_std_err)                                        \
                __ext4_std_error((sb), __func__, __LINE__,        \
                                 __ext4_std_err);                \
} while (0)

#ifdef CONFIG_SMP
/*
 * Each CPU can accumulate up to percpu_counter_batch clusters in its local
 * counter, so the free-cluster count must exceed
 * percpu_counter_batch * nr_cpu_ids before it can be trusted.  The
 * watermark adds a safety window of 4 times that amount.
 */
#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
/* No per-CPU counters without CONFIG_SMP, so no watermark is needed. */
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/*
 * Raise i_disksize to @newsize if @newsize is larger; never shrinks it.
 *
 * Requires i_mutex to avoid races with truncate: a one-time warning fires
 * for a regular file whose inode is not locked.  The compare-and-update
 * itself runs with i_data_sem held for writing.
 */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
        WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));

        down_write(&EXT4_I(inode)->i_data_sem);
        if (EXT4_I(inode)->i_disksize < newsize)
                WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
        up_write(&EXT4_I(inode)->i_data_sem);
}

/*
 * Grow i_size and i_disksize to @newsize where @newsize is larger.
 * Requires i_mutex to avoid races with truncate.
 *
 * Returns a bit mask of what was changed: bit 0 (value 1) if i_size was
 * updated, bit 1 (value 2) if i_disksize was updated, 0 if neither.
 */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
        int result = 0;

        if (inode->i_size < newsize) {
                i_size_write(inode, newsize);
                result |= 1;
        }

        if (EXT4_I(inode)->i_disksize < newsize) {
                ext4_update_i_disksize(inode, newsize);
                result |= 2;
        }

        return result;
}

/* Prototype only; takes an inode and a byte range (@offset, @len). */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len);

/*
 * Per-group state object used by the EXT4_MB_GRP_*() macros below.
 * bb_state is the bit field those macros test and modify with the
 * EXT4_GROUP_INFO_*_BIT positions.  bb_check_counter and bb_bitmap exist
 * only when AGGRESSIVE_CHECK / DOUBLE_CHECK are defined.  bb_counters is a
 * flexible array member, so the structure must be allocated with room for
 * the required number of orders.
 */
struct ext4_group_info {
        unsigned long   bb_state;
#ifdef AGGRESSIVE_CHECK
        unsigned long        bb_check_counter;
#endif
        struct rb_root  bb_free_root;
        ext4_grpblk_t        bb_first_free;        /* first free block */
        ext4_grpblk_t        bb_free;        /* total free blocks */
        ext4_grpblk_t        bb_fragments;        /* nr of freespace fragments */
        ext4_grpblk_t        bb_largest_free_order;/* order of largest frag in BG */
        struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
        void            *bb_bitmap;
#endif
        struct rw_semaphore alloc_sem;
        ext4_grpblk_t        bb_counters[];        /* Nr of free power-of-two-block
                                         * regions, index is order.
                                         * bb_counters[3] = 5 means
                                         * 5 free 8-block regions. */
};

#define EXT4_GROUP_INFO_NEED_INIT_BIT                0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT        2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT        3
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_BBITMAP_READ_BIT        4

#define EXT4_MB_GRP_NEED_INIT(grp)        \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))

#define EXT4_MB_GRP_WAS_TRIMMED(grp)        \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_SET_TRIMMED(grp)        \
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)        \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)        \
        (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

/*
 * Bounds for the s_lock_busy contention counter: ext4_lock_group() stops
 * incrementing it at EXT4_MAX_CONTENTION, and ext4_fs_is_busy() reports
 * "busy" once it exceeds EXT4_CONTENTION_THRESHOLD.
 */
#define EXT4_MAX_CONTENTION                8
#define EXT4_CONTENTION_THRESHOLD        2

/*
 * Return the spinlock used for block group @group, looked up in the
 * superblock's s_blockgroup_lock via bgl_lock_ptr().
 */
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
                                              ext4_group_t group)
{
        spinlock_t *lock = bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);

        return lock;
}

/*
 * Returns true if the filesystem is busy enough that attempts to
 * take the block group locks have run into contention, i.e. the
 * s_lock_busy counter is above EXT4_CONTENTION_THRESHOLD.
 */
static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
{
        int contention = atomic_read(&sbi->s_lock_busy);

        return contention > EXT4_CONTENTION_THRESHOLD;
}

/*
 * Take the spinlock of block group @group and keep the superblock's
 * s_lock_busy contention counter up to date while doing so.
 */
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
        spinlock_t *grp_lock = ext4_group_lock_ptr(sb, group);

        if (!spin_trylock(grp_lock)) {
                /*
                 * The lock is busy: bump the contention counter (it is
                 * not raised past EXT4_MAX_CONTENTION), then wait on
                 * the spin lock.
                 */
                atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
                                  EXT4_MAX_CONTENTION);
                spin_lock(grp_lock);
                return;
        }

        /*
         * We got the lock right away, so drop the contention counter
         * by one unless it is already 0.
         */
        atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
}

/* Release the spinlock of block group @group taken by ext4_lock_group(). */
static inline void ext4_unlock_group(struct super_block *sb,
                                     ext4_group_t group)
{
        spinlock_t *grp_lock = ext4_group_lock_ptr(sb, group);

        spin_unlock(grp_lock);
}

/*
 * Block validity checking
 *
 * Both macros forward to ext4_check_blockref(), supplying the caller's
 * function name and line number for reporting.
 */

/* Check the EXT4_ADDR_PER_BLOCK() block references stored in bh->b_data. */
#define ext4_check_indirect_blockref(inode, bh)                                \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            (__le32 *)(bh)->b_data,                        \
                            EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS block references in the inode's i_data. */
#define ext4_ind_check_inode(inode)                                        \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            EXT4_I(inode)->i_data,                        \
                            EXT4_NDIR_BLOCKS)

/*
 * Inodes and files operations
 */

/* dir.c */
extern const struct file_operations ext4_dir_operations;

#ifdef CONFIG_UNICODE
extern const struct dentry_operations ext4_dentry_ops;
#endif

/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
                                 unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);

extern int ext4_readpage_inline(struct inode *inode, struct page *page);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
                                         struct inode *inode,
                                         loff_t pos, unsigned len,
                                         unsigned flags,
                                         struct page **pagep);
extern int ext4_write_inline_data_end(struct inode *inode,
                                      loff_t pos, unsigned len,
                                      unsigned copied,
                                      struct page *page);
extern struct buffer_head *
ext4_journalled_write_inline_data(struct inode *inode,
                                  unsigned len,
                                  struct page *page);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                           struct inode *inode,
                                           loff_t pos, unsigned len,
                                           unsigned flags,
                                           struct page **pagep,
                                           void **fsdata);
extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
                                         unsigned len, unsigned copied,
                                         struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
                                struct dir_context *ctx,
                                int *has_inline_data);
extern int ext4_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
                                    struct inode *dir,
                                    struct ext4_dir_entry_2 *de_del,
                                    struct buffer_head *bh,
                                    int *has_inline_data);
extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval);
extern int ext4_inline_data_fiemap(struct inode *inode,
                                   struct fiemap_extent_info *fieinfo,
                                   int *has_inline, __u64 start, __u64 len);

struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);

extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);

extern int ext4_convert_inline_data(struct inode *inode);

/*
 * Returns 1 when @inode carries the EXT4_INODE_INLINE_DATA flag and has a
 * non-zero i_inline_off, 0 otherwise.
 */
static inline int ext4_has_inline_data(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        return EXT4_I(inode)->i_inline_off != 0;
}

/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                                 struct ext4_dir_entry_2 *de,
                                 int blocksize, int csum_size,
                                 unsigned int parent_ino, int dotdot_real_len);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                        unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
                                      struct buffer_head *bh);
extern int ext4_ci_compare(const struct inode *parent,
                           const struct qstr *fname,
                           const struct qstr *entry, bool quick);
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                         struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
                       struct dentry *dentry);

/*
 * Lookup table from the file-type bits of a mode, (mode & S_IFMT) >> S_SHIFT,
 * to the matching EXT4_FT_* directory entry type.  Slots without an explicit
 * initializer are zero.
 */
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]        = EXT4_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]        = EXT4_FT_DIR,
        [S_IFCHR >> S_SHIFT]        = EXT4_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]        = EXT4_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]        = EXT4_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]        = EXT4_FT_SOCK,
        [S_IFLNK >> S_SHIFT]        = EXT4_FT_SYMLINK,
};

/*
 * Record the file type derived from @mode in directory entry @de.
 * Nothing is written unless the filesystem has the filetype feature.
 */
static inline void ext4_set_de_type(struct super_block *sb,
                                    struct ext4_dir_entry_2 *de,
                                    umode_t mode)
{
        if (!ext4_has_feature_filetype(sb))
                return;

        de->file_type = ext4_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}

/* readpages.c */
extern int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);

/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/* sysfs.c */
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
extern void ext4_exit_sysfs(void);

/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
extern int ext4_inode_block_valid(struct inode *inode,
                                  ext4_fsblk_t start_blk,
                                  unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count);


/* extents.c */
struct ext4_ext_path;
struct ext4_extent;

/*
 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 * __le32.
 */
#define EXT_MAX_BLOCKS        0xffffffff

extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                 ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                             ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
                                  struct ext4_ext_path **,
                                  struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
                                              struct ext4_ext_path **,
                                              int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
extern int ext4_get_es_cache(struct inode *inode,
                             struct fiemap_extent_info *fieinfo,
                             __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                       int check_cred, int restart_cred,
                                       int revoke_cred);
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);


/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);

/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
                               struct writeback_control *wbc,
                               bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);

/* mmp.c */
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c */
extern const struct fsverity_operations ext4_verityops;

/*
 * Add new method to test whether block and inode bitmaps are properly
 * initialized. With uninit_bg reading the block from disk is not enough
 * to mark the bitmap uptodate. We need to also zero-out the bitmap
 */
#define BH_BITMAP_UPTODATE BH_JBDPrivateStart

/*
 * Returns 1 only when the buffer is uptodate AND its BH_BITMAP_UPTODATE
 * state bit is set; 0 otherwise.
 */
static inline int bitmap_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))
                return 0;

        return test_bit(BH_BITMAP_UPTODATE, &bh->b_state) != 0;
}
/* Set the BH_BITMAP_UPTODATE bit in the buffer's b_state. */
static inline void set_bitmap_uptodate(struct buffer_head *bh)
{
        set_bit(BH_BITMAP_UPTODATE, &bh->b_state);
}

/*
 * For ioend & aio unwritten conversion wait queues.
 *
 * ext4_ioend_wq(v) picks one of the EXT4_WQ_HASH_SZ wait queue heads by
 * taking the value of @v (cast to unsigned long) modulo the table size.
 */
#define EXT4_WQ_HASH_SZ                37
#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
                                            EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

extern int ext4_resize_begin(struct super_block *sb);
extern void ext4_resize_end(struct super_block *sb);

/*
 * Mark @io_end as EXT4_IO_END_UNWRITTEN and count it in the inode's
 * i_unwritten counter.  Does nothing if the flag is already set, so the
 * counter is bumped at most once per io_end.
 */
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
                                              struct ext4_io_end *io_end)
{
        if (io_end->flag & EXT4_IO_END_UNWRITTEN)
                return;

        io_end->flag |= EXT4_IO_END_UNWRITTEN;
        atomic_inc(&EXT4_I(inode)->i_unwritten);
}

/*
 * Clear EXT4_IO_END_UNWRITTEN on @io_end and drop the owning inode's
 * i_unwritten counter.  Does nothing if the flag is not set.
 */
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;

        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
                return;

        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;

        /* Wake up anyone waiting on unwritten extent conversion */
        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                wake_up_all(ext4_ioend_wq(inode));
}

extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;

/*
 * Report whether @bh is uptodate, first promoting a buffer that carries
 * the write error flag to uptodate.
 */
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh)) {
                /*
                 * The write error flag means we failed to write out the
                 * data in this block.  Reading the block again is not
                 * wanted in that case, because the read may succeed and
                 * return the old data.
                 */
                if (buffer_write_io_error(bh))
                        set_buffer_uptodate(bh);
        }

        return buffer_uptodate(bh);
}

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _EXT4_H */






















































































































































































    1 














    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_DISK_H
#define _SCSI_DISK_H

/*
 * More than enough for everybody ;)  The huge number of majors
 * is a leftover from 16bit dev_t days, we don't really need that
 * much numberspace.
 */
#define SD_MAJORS        16

/*
 * Time out in seconds for disks and Magneto-opticals (which are slower).
 */
#define SD_TIMEOUT                (30 * HZ)
#define SD_MOD_TIMEOUT                (75 * HZ)
/*
 * Flush timeout is a multiplier over the standard device timeout which is
 * user modifiable via sysfs but initially set to SD_TIMEOUT
 */
#define SD_FLUSH_TIMEOUT_MULTIPLIER        2
#define SD_WRITE_SAME_TIMEOUT        (120 * HZ)

/*
 * Number of allowed retries
 */
#define SD_MAX_RETRIES                5
#define SD_PASSTHROUGH_RETRIES        1
#define SD_MAX_MEDIUM_TIMEOUTS        2

/*
 * Size of the initial data buffer for mode and read capacity data
 */
#define SD_BUF_SIZE                512

/*
 * Number of sectors at the end of the device to avoid multi-sector
 * accesses to in the case of last_sector_bug
 */
#define SD_LAST_BUGGY_SECTORS        8

enum {
        SD_EXT_CDB_SIZE = 32,        /* Extended CDB size */
        SD_MEMPOOL_SIZE = 2,        /* CDB pool size */
};

/*
 * Block-count limits.
 * NOTE(review): presumably the default/maximum transfer length and the
 * maximum WRITE SAME(10)/(16) lengths, in logical blocks -- confirm
 * against the users of these constants.
 */
enum {
        SD_DEF_XFER_BLOCKS = 0xffff,
        SD_MAX_XFER_BLOCKS = 0xffffffff,
        SD_MAX_WS10_BLOCKS = 0xffff,
        SD_MAX_WS16_BLOCKS = 0x7fffff,
};

enum {
        SD_LBP_FULL = 0,        /* Full logical block provisioning */
        SD_LBP_UNMAP,                /* Use UNMAP command */
        SD_LBP_WS16,                /* Use WRITE SAME(16) with UNMAP bit */
        SD_LBP_WS10,                /* Use WRITE SAME(10) with UNMAP bit */
        SD_LBP_ZERO,                /* Use WRITE SAME(10) with zero payload */
        SD_LBP_DISABLE,                /* Discard disabled due to failed cmd */
};

enum {
        SD_ZERO_WRITE = 0,        /* Use WRITE(10/16) command */
        SD_ZERO_WS,                /* Use WRITE SAME(10/16) command */
        SD_ZERO_WS16_UNMAP,        /* Use WRITE SAME(16) with UNMAP */
        SD_ZERO_WS10_UNMAP,        /* Use WRITE SAME(10) with UNMAP */
};

/*
 * State for one SCSI disk.
 *
 * Two members serve as container_of() anchors: scsi_disk() resolves
 * gendisk->private_data through @driver, and to_scsi_disk() resolves a
 * struct device through @dev.
 */
struct scsi_disk {
        struct scsi_driver *driver;        /* always &sd_template */
        struct scsi_device *device;
        struct device        dev;
        struct gendisk        *disk;
        struct opal_dev *opal_dev;
#ifdef CONFIG_BLK_DEV_ZONED
        /* Zoned block device state, only built with CONFIG_BLK_DEV_ZONED */
        u32                nr_zones;
        u32                rev_nr_zones;
        u32                zone_blocks;
        u32                rev_zone_blocks;
        u32                zones_optimal_open;
        u32                zones_optimal_nonseq;
        u32                zones_max_open;
        u32                *zones_wp_offset;
        spinlock_t        zones_wp_offset_lock;
        u32                *rev_wp_offset;
        struct mutex        rev_mutex;
        struct work_struct zone_wp_offset_work;
        char                *zone_wp_update_buf;
#endif
        atomic_t        openers;
        sector_t        capacity;        /* size in logical blocks */
        int                max_retries;
        u32                max_xfer_blocks;
        u32                opt_xfer_blocks;
        u32                max_ws_blocks;
        u32                max_unmap_blocks;
        u32                unmap_granularity;
        u32                unmap_alignment;
        u32                index;
        unsigned int        physical_block_size;
        unsigned int        max_medium_access_timeouts;
        unsigned int        medium_access_timed_out;
        u8                media_present;
        u8                write_prot;
        u8                protection_type;/* Data Integrity Field */
        u8                provisioning_mode;
        u8                zeroing_mode;
        /* Bitfield flags: one bit each, except rc_basis and zoned (two bits) */
        unsigned        ATO : 1;        /* state of disk ATO bit */
        unsigned        cache_override : 1; /* temp override of WCE,RCD */
        unsigned        WCE : 1;        /* state of disk WCE bit */
        unsigned        RCD : 1;        /* state of disk RCD bit, unused */
        unsigned        DPOFUA : 1;        /* state of disk DPOFUA bit */
        unsigned        first_scan : 1;
        unsigned        lbpme : 1;
        unsigned        lbprz : 1;
        unsigned        lbpu : 1;
        unsigned        lbpws : 1;
        unsigned        lbpws10 : 1;
        unsigned        lbpvpd : 1;
        unsigned        ws10 : 1;
        unsigned        ws16 : 1;
        unsigned        rc_basis: 2;
        unsigned        zoned: 2;
        unsigned        urswrz : 1;
        unsigned        security : 1;
        unsigned        ignore_medium_access_errors : 1;
};
/* Map a pointer to the embedded 'dev' member back to its struct scsi_disk. */
#define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)

/*
 * Return the scsi_disk that owns @disk.  disk->private_data is treated
 * as a pointer to the 'driver' member of that scsi_disk.
 */
static inline struct scsi_disk *scsi_disk(struct gendisk *disk)
{
        struct scsi_disk *sdkp;

        sdkp = container_of(disk->private_data, struct scsi_disk, driver);
        return sdkp;
}

/*
 * Print a message for disk @sdsk at log level @prefix.  When the gendisk
 * is already attached its disk_name is passed as the message prefix;
 * otherwise the message is printed against the SCSI device alone.
 *
 * The whole conditional is wrapped in parentheses so the macro always
 * expands to one self-contained expression, whatever operators surround
 * the call site.
 */
#define sd_printk(prefix, sdsk, fmt, a...)                                \
        ((sdsk)->disk ?                                                        \
              sdev_prefix_printk(prefix, (sdsk)->device,                \
                                 (sdsk)->disk->disk_name, fmt, ##a) :        \
              sdev_printk(prefix, (sdsk)->device, fmt, ##a))

/* Same as sd_printk(), but only prints while (sdsk)->first_scan is set. */
#define sd_first_printk(prefix, sdsk, fmt, a...)                        \
        do {                                                                \
                if ((sdsk)->first_scan)                                        \
                        sd_printk(prefix, sdsk, fmt, ##a);                \
        } while (0)

/*
 * Classify a command by its opcode: returns 1 for the read, write,
 * verify, write-same, unmap and synchronize-cache opcodes listed below,
 * 0 for everything else.  For VARIABLE_LENGTH_CMD the decision is made
 * on byte 9 of the CDB.
 */
static inline int scsi_medium_access_command(struct scsi_cmnd *scmd)
{
        switch (scmd->cmnd[0]) {
        case READ_6:
        case READ_10:
        case READ_12:
        case READ_16:
        case SYNCHRONIZE_CACHE:
        case VERIFY:
        case VERIFY_12:
        case VERIFY_16:
        case WRITE_6:
        case WRITE_10:
        case WRITE_12:
        case WRITE_16:
        case WRITE_SAME:
        case WRITE_SAME_16:
        case UNMAP:
                return 1;
        case VARIABLE_LENGTH_CMD:
                switch (scmd->cmnd[9]) {
                case READ_32:
                case VERIFY_32:
                case WRITE_32:
                case WRITE_SAME_32:
                        return 1;
                default:
                        break;
                }
                break;
        default:
                break;
        }

        return 0;
}

/*
 * Convert a count of logical blocks to 512-byte units by shifting left by
 * ilog2(sector_size) - 9.
 */
static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks)
{
        int shift = ilog2(sdev->sector_size) - 9;

        return blocks << shift;
}

/*
 * Convert a count of logical blocks to bytes.
 * NOTE(review): the product is returned as unsigned int, so a result that
 * does not fit is truncated -- callers presumably pass small counts.
 */
static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks)
{
        sector_t bytes = blocks * sdev->sector_size;

        return bytes;
}

/* Convert a byte count to logical blocks by shifting right by ilog2(sector_size). */
static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes)
{
        int shift = ilog2(sdev->sector_size);

        return bytes >> shift;
}

/*
 * Convert a count of 512-byte units to logical blocks by shifting right by
 * ilog2(sector_size) - 9.
 */
static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector)
{
        int shift = ilog2(sdev->sector_size) - 9;

        return sector >> shift;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY

extern void sd_dif_config_host(struct scsi_disk *);

#else /* CONFIG_BLK_DEV_INTEGRITY */

/* Empty stub used when CONFIG_BLK_DEV_INTEGRITY is not set. */
static inline void sd_dif_config_host(struct scsi_disk *disk) {}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Returns 1 when the disk's 'zoned' field equals 1 or the underlying
 * device is of type TYPE_ZBC, 0 otherwise.
 */
static inline int sd_is_zoned(struct scsi_disk *sdkp)
{
        if (sdkp->zoned == 1)
                return 1;

        return sdkp->device->type == TYPE_ZBC;
}

#ifdef CONFIG_BLK_DEV_ZONED

void sd_zbc_release_disk(struct scsi_disk *sdkp);
int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
int sd_zbc_revalidate_zones(struct scsi_disk *sdkp);
blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
                                         unsigned char op, bool all);
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
                             struct scsi_sense_hdr *sshdr);
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
                unsigned int nr_zones, report_zones_cb cb, void *data);

blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
                                        unsigned int nr_blocks);

#else /* CONFIG_BLK_DEV_ZONED */

/*
 * Stubs for builds without CONFIG_BLK_DEV_ZONED: the setup/teardown hooks
 * do nothing and report success, while the command-level hooks fail with
 * BLK_STS_TARGET.
 */
static inline void sd_zbc_release_disk(struct scsi_disk *sdkp) {}

/* Stub: nothing to read, always returns 0. */
static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
                                    unsigned char *buf)
{
        return 0;
}

/* Stub: nothing to revalidate, always returns 0. */
static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
{
        return 0;
}

/* Stub: always returns BLK_STS_TARGET. */
static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
                                                       unsigned char op,
                                                       bool all)
{
        return BLK_STS_TARGET;
}

/* Stub: passes @good_bytes through unchanged. */
static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
                        unsigned int good_bytes, struct scsi_sense_hdr *sshdr)
{
        return good_bytes;
}

/* Stub: always returns BLK_STS_TARGET. */
static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd,
                                                      sector_t *lba,
                                                      unsigned int nr_blocks)
{
        return BLK_STS_TARGET;
}

/* No report_zones callback in this configuration. */
#define sd_zbc_report_zones NULL

#endif /* CONFIG_BLK_DEV_ZONED */

void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr);
void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result);

#endif /* _SCSI_DISK_H */



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_SECTIONS_H_
#define _ASM_GENERIC_SECTIONS_H_

/* References to section boundaries */

#include <linux/compiler.h>
#include <linux/types.h>

/*
 * Usage guidelines:
 * _text, _data: architecture specific, don't use them in arch-independent code
 * [_stext, _etext]: contains .text.* sections, may also contain .rodata.*
 *                   and/or .init.* sections
 * [_sdata, _edata]: contains .data.* sections, may also contain .rodata.*
 *                   and/or .init.* sections.
 * [__start_rodata, __end_rodata]: contains .rodata.* sections
 * [__start_ro_after_init, __end_ro_after_init]:
 *                     contains .data..ro_after_init section
 * [__init_begin, __init_end]: contains .init.* sections, but .init.text.*
 *                   may be out of this range on some architectures.
 * [_sinittext, _einittext]: contains .init.text.* sections
 * [__bss_start, __bss_stop]: contains BSS sections
 *
 * Following global variables are optional and may be unavailable on some
 * architectures and/or kernel configurations.
 *        _text, _data
 *        __kprobes_text_start, __kprobes_text_end
 *        __entry_text_start, __entry_text_end
 *        __ctors_start, __ctors_end
 *        __irqentry_text_start, __irqentry_text_end
 *        __softirqentry_text_start, __softirqentry_text_end
 *        __start_opd, __end_opd
 */
extern char _text[], _stext[], _etext[];
extern char _data[], _sdata[], _edata[];
extern char __bss_start[], __bss_stop[];
extern char __init_begin[], __init_end[];
extern char _sinittext[], _einittext[];
extern char __start_ro_after_init[], __end_ro_after_init[];
extern char _end[];
extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __entry_text_start[], __entry_text_end[];
extern char __start_rodata[], __end_rodata[];
extern char __irqentry_text_start[], __irqentry_text_end[];
extern char __softirqentry_text_start[], __softirqentry_text_end[];
extern char __start_once[], __end_once[];

/* Start and end of .ctors section - used for constructor calls. */
extern char __ctors_start[], __ctors_end[];

/* Start and end of .opd section - used for function descriptors. */
extern char __start_opd[], __end_opd[];

/* Start and end of instrumentation protected text section */
extern char __noinstr_text_start[], __noinstr_text_end[];

extern __visible const void __nosave_begin, __nosave_end;

/*
 * Function descriptor handling (if any).  Override in asm/sections.h.
 * The defaults below just cast the pointer to void * unchanged.
 */
#ifndef dereference_function_descriptor
#define dereference_function_descriptor(p) ((void *)(p))
#define dereference_kernel_function_descriptor(p) ((void *)(p))
#endif

/*
 * Architecture-specific extra kernel text/data sections (if any).
 * Override in asm/sections.h.
 */
#ifndef arch_is_kernel_text
/*
 * Generic fallback used when the architecture does not supply its own
 * arch_is_kernel_text(): no address is reported as architecture-specific
 * kernel text, so the result is always 0.
 */
static inline int arch_is_kernel_text(unsigned long addr)
{
        const int in_arch_text = 0;

        return in_arch_text;
}
#endif

#ifndef arch_is_kernel_data
/*
 * Generic fallback used when the architecture does not supply its own
 * arch_is_kernel_data(): no address is reported as architecture-specific
 * kernel data, so the result is always 0.
 */
static inline int arch_is_kernel_data(unsigned long addr)
{
        const int in_arch_data = 0;

        return in_arch_data;
}
#endif

/*
 * arch_is_kernel_initmem_freed - tell whether @addr lies in initmem that
 * has already been freed.
 *
 * Architectures whose kernel mapping has virt == phys need this for code
 * that checks whether an address belongs to a static object inside
 * [_stext, _end]: once initmem has been freed, memory can be allocated from
 * it, and such allocations then carry addresses within [_stext, _end] too.
 *
 * This generic fallback always answers 0; an architecture overrides it by
 * defining arch_is_kernel_initmem_freed itself.
 */
#ifndef arch_is_kernel_initmem_freed
static inline int arch_is_kernel_initmem_freed(unsigned long addr)
{
        const int in_freed_initmem = 0;

        return in_freed_initmem;
}
#endif

/**
 * memory_contains - test whether an object lies completely inside a region
 * @begin: virtual address at which the memory region starts
 * @end: virtual address at which the memory region ends
 * @virt: virtual address at which the object starts
 * @size: number of bytes occupied by the object
 *
 * Returns: true when @virt is not below @begin and @virt + @size does not
 * exceed @end, false otherwise.
 */
static inline bool memory_contains(void *begin, void *end, void *virt,
                                   size_t size)
{
        void *obj_end;

        /* Object starts before the region: cannot be contained. */
        if (virt < begin)
                return false;

        obj_end = virt + size;

        return obj_end <= end;
}

/**
 * memory_intersects - test whether an object overlaps a memory region
 * @begin: virtual address at which the memory region starts
 * @end: virtual address at which the memory region ends
 * @virt: virtual address at which the object starts
 * @size: number of bytes occupied by the object
 *
 * Returns: true when the object starts below @end and its end address
 * (@virt + @size) lies above @begin, false otherwise.
 */
static inline bool memory_intersects(void *begin, void *end, void *virt,
                                     size_t size)
{
        void *obj_end = virt + size;

        return virt < end && obj_end > begin;
}

/**
 * init_section_contains - test whether an object lies completely inside
 *                         the init section
 * @virt: virtual address at which the object starts
 * @size: number of bytes occupied by the object
 *
 * The init section is the range [__init_begin, __init_end].
 *
 * Returns: the result of memory_contains() for that range, i.e. true when
 * the whole object is inside the init section and false otherwise.
 */
static inline bool init_section_contains(void *virt, size_t size)
{
        void *init_start = __init_begin;
        void *init_stop = __init_end;

        return memory_contains(init_start, init_stop, virt, size);
}

/**
 * init_section_intersects - test whether an object overlaps the init
 *                           section
 * @virt: virtual address at which the object starts
 * @size: number of bytes occupied by the object
 *
 * The init section is the range [__init_begin, __init_end].
 *
 * Returns: the result of memory_intersects() for that range, i.e. true
 * when the object overlaps the init section and false otherwise.
 */
static inline bool init_section_intersects(void *virt, size_t size)
{
        void *init_start = __init_begin;
        void *init_stop = __init_end;

        return memory_intersects(init_start, init_stop, virt, size);
}

/**
 * is_kernel_rodata - test whether an address falls inside .rodata
 *
 * @addr: address to check
 *
 * Returns: true when @addr is at or above __start_rodata and strictly
 * below __end_rodata, false otherwise.
 */
static inline bool is_kernel_rodata(unsigned long addr)
{
        unsigned long rodata_start = (unsigned long)__start_rodata;
        unsigned long rodata_end = (unsigned long)__end_rodata;

        if (addr < rodata_start)
                return false;

        return addr < rodata_end;
}

#endif /* _ASM_GENERIC_SECTIONS_H_ */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 












































    1 















    1 


    1 































































































































    1 


































































































































    1 



















































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/nospec.h>

#include <asm/barrier.h>
#include <asm/unaligned.h>

/* Registers
 *
 * Shorthands for the individual BPF registers.  Each one expands to an
 * element of an array called 'regs', so a variable of that name must be in
 * scope wherever these macros are used.
 */
#define BPF_R0        regs[BPF_REG_0]
#define BPF_R1        regs[BPF_REG_1]
#define BPF_R2        regs[BPF_REG_2]
#define BPF_R3        regs[BPF_REG_3]
#define BPF_R4        regs[BPF_REG_4]
#define BPF_R5        regs[BPF_REG_5]
#define BPF_R6        regs[BPF_REG_6]
#define BPF_R7        regs[BPF_REG_7]
#define BPF_R8        regs[BPF_REG_8]
#define BPF_R9        regs[BPF_REG_9]
#define BPF_R10        regs[BPF_REG_10]

/* Named registers
 *
 * DST and SRC pick the register selected by the dst_reg / src_reg field of
 * the instruction that 'insn' points to, and IMM is that instruction's imm
 * field.  Both 'regs' and 'insn' must be in scope at the point of use.
 */
#define DST        regs[insn->dst_reg]
#define SRC        regs[insn->src_reg]
#define FP        regs[BPF_REG_FP]
#define AX        regs[BPF_REG_AX]
#define ARG1        regs[BPF_REG_ARG1]
#define CTX        regs[BPF_REG_CTX]
#define IMM        insn->imm

/* Translate a negative load offset @k into a pointer to @size bytes of
 * @skb data.  No hurry in this branch.
 *
 * Offsets at or above SKF_NET_OFF are taken relative to the network
 * header; otherwise offsets at or above SKF_LL_OFF are taken relative to
 * the MAC header, which must have been set.  Returns NULL when @k is below
 * both bases, when the MAC header is required but missing, or when the
 * requested bytes do not lie between skb->head and skb_tail_pointer(skb).
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
        u8 *ptr;

        if (k >= SKF_NET_OFF) {
                ptr = skb_network_header(skb) + k - SKF_NET_OFF;
        } else if (k >= SKF_LL_OFF) {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        return NULL;
                ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
        } else {
                /* Offset matches neither base. */
                return NULL;
        }

        /* Reject anything that leaves the buffer on either side. */
        if (ptr < skb->head || ptr + size > skb_tail_pointer(skb))
                return NULL;

        return ptr;
}

/* Allocate a zeroed struct bpf_prog of at least @size bytes (rounded up to
 * whole pages) plus its zeroed bpf_prog_aux, and link the two together.
 * No per-CPU statistics are allocated here.
 *
 * @gfp_extra_flags is OR-ed into the flags of both allocations.
 * Returns the new program, or NULL if either allocation fails.
 */
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
        struct bpf_prog_aux *aux;
        struct bpf_prog *fp;

        size = round_up(size, PAGE_SIZE);

        fp = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        if (!fp)
                return NULL;

        aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
        if (!aux)
                goto err_free_prog;

        /* Cross-link program and aux before touching anything else. */
        aux->prog = fp;
        fp->aux = aux;
        fp->pages = size / PAGE_SIZE;
        fp->jit_requested = ebpf_jit_enabled();

        INIT_LIST_HEAD_RCU(&aux->ksym.lnode);
        mutex_init(&fp->aux->used_maps_mutex);
        mutex_init(&fp->aux->dst_mutex);

        return fp;

err_free_prog:
        vfree(fp);
        return NULL;
}

/* Allocate a program via bpf_prog_alloc_no_stats() and attach zeroed
 * per-CPU struct bpf_prog_stats to it, initializing the stats sync point
 * for every possible CPU.
 *
 * Returns the new program, or NULL if any allocation fails (in which case
 * everything allocated so far is released again).
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
        struct bpf_prog *prog;
        int cpu;

        prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
        if (!prog)
                return NULL;

        prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats,
                                            GFP_KERNEL | __GFP_ZERO |
                                            gfp_extra_flags);
        if (!prog->aux->stats)
                goto err_free_prog;

        for_each_possible_cpu(cpu) {
                struct bpf_prog_stats *pstats;

                pstats = per_cpu_ptr(prog->aux->stats, cpu);
                u64_stats_init(&pstats->syncp);
        }

        return prog;

err_free_prog:
        kfree(prog->aux);
        vfree(prog);
        return NULL;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);

/* Allocate the zeroed jited_linfo array, one entry per line info record.
 *
 * Nothing is allocated when the program carries no line info or when JIT
 * was not requested for it.  Returns 0 on success (including the nothing
 * to do case) and -ENOMEM if the allocation fails.
 */
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
{
        if (prog->aux->nr_linfo && prog->jit_requested) {
                prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
                                                 sizeof(*prog->aux->jited_linfo),
                                                 GFP_KERNEL | __GFP_NOWARN);

                return prog->aux->jited_linfo ? 0 : -ENOMEM;
        }

        return 0;
}

/* Free the jited_linfo array and clear the pointer in prog->aux. */
void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
{
        void **jited_linfo = prog->aux->jited_linfo;

        kfree(jited_linfo);
        prog->aux->jited_linfo = NULL;
}

/* Free the jited_linfo array if it exists but its first entry is still
 * NULL; leave it alone otherwise.
 */
void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
{
        void **jited_linfo = prog->aux->jited_linfo;

        if (!jited_linfo)
                return;

        /* First entry populated: the array is in use. */
        if (jited_linfo[0])
                return;

        bpf_prog_free_jited_linfo(prog);
}

/* The jit engine is responsible to provide an array
 * for insn_off to the jited_off mapping (insn_to_jit_off).
 *
 * The idx to this array is the insn_off.  Hence, the insn_off
 * here is relative to the prog itself instead of the main prog.
 * This array has one entry for each xlated bpf insn.
 *
 * jited_off is the byte off to the last byte of the jited insn.
 *
 * Hence, with
 * insn_start:
 *      The first bpf insn off of the prog.  The insn off
 *      here is relative to the main prog.
 *      e.g. if prog is a subprog, insn_start > 0
 * linfo_idx:
 *      The prog's idx to prog->aux->linfo and jited_linfo
 *
 * jited_linfo[linfo_idx] = prog->bpf_func
 *
 * For i > linfo_idx,
 *
 * jited_linfo[i] = prog->bpf_func +
 *        insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
 */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off)
{
        u32 first, insn_start, insn_end, cnt, i;
        const struct bpf_line_info *linfo;
        void **jited_linfo;

        /* Userspace did not provide linfo */
        if (!prog->aux->jited_linfo)
                return;

        /* Work on the slice of both arrays that belongs to this prog. */
        first = prog->aux->linfo_idx;
        linfo = prog->aux->linfo + first;
        jited_linfo = prog->aux->jited_linfo + first;
        cnt = prog->aux->nr_linfo - first;

        insn_start = linfo[0].insn_off;
        insn_end = insn_start + prog->len;

        jited_linfo[0] = prog->bpf_func;

        /* The verifier ensures that linfo[i].insn_off is strictly
         * increasing, so stop at the first record past this prog.
         */
        for (i = 1; i < cnt; i++) {
                u32 insn_off = linfo[i].insn_off;

                if (insn_off >= insn_end)
                        break;

                jited_linfo[i] = prog->bpf_func +
                        insn_to_jit_off[insn_off - insn_start - 1];
        }
}

/* Release all line info attached to @prog: first the jited_linfo array
 * (through bpf_prog_free_jited_linfo()), then the linfo array itself.
 */
void bpf_prog_free_linfo(struct bpf_prog *prog)
{
        bpf_prog_free_jited_linfo(prog);

        kvfree(prog->aux->linfo);
}

/* Grow @fp_old so that it can hold at least @size bytes (rounded up to
 * whole pages).
 *
 * If @fp_old already spans enough pages it is returned unchanged.
 * Otherwise __bpf_prog_charge() is called for the additional pages, a new
 * zeroed area is allocated, the old contents are copied over and the old
 * area is freed.  The aux structure is carried over to the new program
 * rather than being freed with the old one.
 *
 * Returns the (possibly new) program, or NULL if charging or allocating
 * fails; on failure @fp_old is left untouched.
 */
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags)
{
        struct bpf_prog *fp_new;
        u32 new_pages, extra_pages;

        size = round_up(size, PAGE_SIZE);
        new_pages = size / PAGE_SIZE;

        /* Enough room already, nothing to do. */
        if (new_pages <= fp_old->pages)
                return fp_old;

        extra_pages = new_pages - fp_old->pages;
        if (__bpf_prog_charge(fp_old->aux->user, extra_pages))
                return NULL;

        fp_new = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        if (!fp_new) {
                __bpf_prog_uncharge(fp_old->aux->user, extra_pages);
                return NULL;
        }

        memcpy(fp_new, fp_old, fp_old->pages * PAGE_SIZE);
        fp_new->pages = new_pages;
        fp_new->aux->prog = fp_new;

        /* We keep fp->aux from fp_old around in the new
         * reallocated structure.
         */
        fp_old->aux = NULL;
        __bpf_prog_free(fp_old);

        return fp_new;
}

/* Free @fp.  If it still owns an aux structure, that is torn down first:
 * both mutexes are destroyed and the per-CPU stats, the poke table and the
 * aux structure itself are freed.
 */
void __bpf_prog_free(struct bpf_prog *fp)
{
        if (!fp->aux)
                goto out_free_prog;

        mutex_destroy(&fp->aux->used_maps_mutex);
        mutex_destroy(&fp->aux->dst_mutex);
        free_percpu(fp->aux->stats);
        kfree(fp->aux->poke_tab);
        kfree(fp->aux);

out_free_prog:
        vfree(fp);
}

/* Compute the tag of @fp: a SHA-1 digest over a copy of its instructions in
 * which the immediates of map-referencing loads have been zeroed.  The
 * digest words are converted to big endian and the first sizeof(fp->tag)
 * bytes are stored in fp->tag.
 *
 * Returns 0 on success, or -ENOMEM if the scratch buffer cannot be
 * allocated.
 */
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
        /* Offset of the 64-bit length field within a SHA-1 block. */
        const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
        u32 raw_size = bpf_prog_tag_scratch_size(fp);
        u32 digest[SHA1_DIGEST_WORDS];
        u32 ws[SHA1_WORKSPACE_WORDS];
        u32 i, bsize, psize, blocks;
        struct bpf_insn *dst;
        bool was_ld_map;
        u8 *raw, *todo;
        __be32 *result;
        __be64 *bits;

        /* Scratch buffer holding the sanitized instructions plus padding. */
        raw = vmalloc(raw_size);
        if (!raw)
                return -ENOMEM;

        sha1_init(digest);
        memset(ws, 0, sizeof(ws));

        /* We need to take out the map fd for the digest calculation
         * since they are unstable from user space side.
         */
        dst = (void *)raw;
        for (i = 0, was_ld_map = false; i < fp->len; i++) {
                dst[i] = fp->insnsi[i];
                if (!was_ld_map &&
                    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
                     dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
                        /* Map-referencing load: blank its immediate. */
                        was_ld_map = true;
                        dst[i].imm = 0;
                } else if (was_ld_map &&
                           dst[i].code == 0 &&
                           dst[i].dst_reg == 0 &&
                           dst[i].src_reg == 0 &&
                           dst[i].off == 0) {
                        /* Slot right after such a load whose code, dst_reg,
                         * src_reg and off are all zero: blank its immediate
                         * as well.
                         */
                        was_ld_map = false;
                        dst[i].imm = 0;
                } else {
                        was_ld_map = false;
                }
        }

        /* Zero everything behind the instructions, then append the 0x80
         * byte that starts the padding.  psize now counts that byte too.
         */
        psize = bpf_prog_insn_size(fp);
        memset(&raw[psize], 0, raw_size - psize);
        raw[psize++] = 0x80;

        bsize  = round_up(psize, SHA1_BLOCK_SIZE);
        blocks = bsize / SHA1_BLOCK_SIZE;
        todo   = raw;
        if (bsize - psize >= sizeof(__be64)) {
                /* Length field fits into the tail of the last block. */
                bits = (__be64 *)(todo + bsize - sizeof(__be64));
        } else {
                /* Not enough room: put the length field at the end of one
                 * additional block.
                 */
                bits = (__be64 *)(todo + bsize + bits_offset);
                blocks++;
        }
        /* psize - 1 excludes the 0x80 byte again; << 3 turns bytes into
         * bits.
         */
        *bits = cpu_to_be64((psize - 1) << 3);

        while (blocks--) {
                sha1_transform(digest, todo, ws);
                todo += SHA1_BLOCK_SIZE;
        }

        /* Convert the digest words to big endian in place. */
        result = (__force __be32 *)digest;
        for (i = 0; i < SHA1_DIGEST_WORDS; i++)
                result[i] = cpu_to_be32(digest[i]);
        memcpy(fp->tag, result, sizeof(fp->tag));

        vfree(raw);
        return 0;
}

/* Adjust the relative target stored in insn->imm of the instruction at
 * index @curr for a patch at @pos whose end moves from @end_old to
 * @end_new.
 *
 * With delta = @end_new - @end_old and target = @curr + imm + 1:
 *  - if @curr is before @pos and target is at or past @end_old, delta is
 *    added to imm;
 *  - otherwise, if @curr is at or past @end_new and target is before
 *    @end_new, delta is subtracted from imm.
 *
 * Returns -ERANGE if the resulting value does not fit into an s32, and 0
 * otherwise.  The instruction is only written when @probe_pass is false.
 */
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const s32 shift = end_new - end_old;
        s64 new_imm = insn->imm;
        /* Target index as encoded before any adjustment. */
        const s64 target = curr + new_imm + 1;

        if (curr < pos && target >= end_old)
                new_imm += shift;
        else if (curr >= end_new && target < end_new)
                new_imm -= shift;

        if (new_imm < S32_MIN || new_imm > S32_MAX)
                return -ERANGE;

        if (probe_pass)
                return 0;

        insn->imm = new_imm;
        return 0;
}

/* Adjust the relative target stored in insn->off of the instruction at
 * index @curr for a patch at @pos whose end moves from @end_old to
 * @end_new.
 *
 * With delta = @end_new - @end_old and target = @curr + off + 1:
 *  - if @curr is before @pos and target is at or past @end_old, delta is
 *    added to off;
 *  - otherwise, if @curr is at or past @end_new and target is before
 *    @end_new, delta is subtracted from off.
 *
 * Returns -ERANGE if the resulting value does not fit into an s16, and 0
 * otherwise.  The instruction is only written when @probe_pass is false.
 */
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const s32 shift = end_new - end_old;
        s32 new_off = insn->off;
        /* Target index as encoded before any adjustment. */
        const s32 target = curr + new_off + 1;

        if (curr < pos && target >= end_old)
                new_off += shift;
        else if (curr >= end_new && target < end_new)
                new_off -= shift;

        if (new_off < S16_MIN || new_off > S16_MAX)
                return -ERANGE;

        if (probe_pass)
                return 0;

        insn->off = new_off;
        return 0;
}

/* Walk all instructions of @prog and adjust the relative targets of jumps
 * and BPF_PSEUDO_CALL calls for a patch at @pos whose end moves from
 * @end_old to @end_new.
 *
 * With @probe_pass set nothing is written: the walk only checks whether
 * every adjusted target would still fit, operating on the image as it
 * looks before patching.
 *
 * Returns 0 on success, or the first non-zero result reported by
 * bpf_adj_delta_to_imm() / bpf_adj_delta_to_off().
 */
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
                            s32 end_new, const bool probe_pass)
{
        /* In the probing pass prog->len is still the unpatched length, so
         * the index range is extended by the size change of the patch.
         */
        u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
        struct bpf_insn *insn = prog->insnsi;
        int ret = 0;

        for (i = 0; i < insn_cnt; i++, insn++) {
                u8 code;

                /* In the probing pass we still operate on the original,
                 * unpatched image in order to check overflows before we
                 * do any other adjustments. Therefore skip the patchlet.
                 */
                if (probe_pass && i == pos) {
                        /* NOTE(review): i is not re-checked against insn_cnt
                         * after this jump; if the patched insn is the last
                         * one, insn appears to point one past the image for
                         * the remainder of this iteration - confirm.
                         */
                        i = end_new;
                        insn = prog->insnsi + end_old;
                }
                code = insn->code;
                /* Only jump-class instructions other than BPF_EXIT carry a
                 * relative target.
                 */
                if ((BPF_CLASS(code) != BPF_JMP &&
                     BPF_CLASS(code) != BPF_JMP32) ||
                    BPF_OP(code) == BPF_EXIT)
                        continue;
                /* Adjust offset of jmps if we cross patch boundaries. */
                if (BPF_OP(code) == BPF_CALL) {
                        /* Calls keep their target in imm, and only
                         * BPF_PSEUDO_CALL calls are adjusted.
                         */
                        if (insn->src_reg != BPF_PSEUDO_CALL)
                                continue;
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                } else {
                        ret = bpf_adj_delta_to_off(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                }
                if (ret)
                        break;
        }

        return ret;
}

/* Shift line info records after an instruction patch: starting with the
 * first record whose insn_off is greater than @off, every record has
 * @delta added to its insn_off.  Does nothing when there is no line info
 * or @delta is zero.
 */
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
{
        struct bpf_line_info *linfo = prog->aux->linfo;
        u32 nr_linfo = prog->aux->nr_linfo;
        u32 i = 0;

        if (!nr_linfo || !delta)
                return;

        /* Skip the records located at or before the patched insn. */
        while (i < nr_linfo && off >= linfo[i].insn_off)
                i++;

        /* Push all off < linfo[i].insn_off by delta */
        while (i < nr_linfo) {
                linfo[i].insn_off += delta;
                i++;
        }
}

/* Replace the single instruction at index @off of @prog with the @len
 * instructions in @patch.
 *
 * Returns:
 *  - @prog itself when @len is 1 (the instruction is overwritten in place);
 *  - otherwise the program returned by bpf_prog_realloc(), which is either
 *    @prog or a larger copy that replaces it;
 *  - ERR_PTR() carrying the error of the probing bpf_adj_branches() pass,
 *    or ERR_PTR(-ENOMEM) if bpf_prog_realloc() fails.
 *
 * NOTE(review): insn_delta is computed as len - 1 in u32, so @len is
 * presumably expected to be at least 1 (the step list below says n > 0) -
 * confirm against the callers.
 */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len)
{
        u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
        const u32 cnt_max = S16_MAX;
        struct bpf_prog *prog_adj;
        int err;

        /* Since our patchlet doesn't expand the image, we're done. */
        if (insn_delta == 0) {
                memcpy(prog->insnsi + off, patch, sizeof(*patch));
                return prog;
        }

        /* Instruction count of the program after patching. */
        insn_adj_cnt = prog->len + insn_delta;

        /* Reject anything that would potentially let the insn->off
         * target overflow when we have excessive program expansions.
         * We need to probe here before we do any reallocation where
         * we afterwards may not fail anymore.
         */
        if (insn_adj_cnt > cnt_max &&
            (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
                return ERR_PTR(err);

        /* Several new instructions need to be inserted. Make room
         * for them. Likely, there's no need for a new allocation as
         * last page could have large enough tailroom.
         */
        prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
                                    GFP_USER);
        if (!prog_adj)
                return ERR_PTR(-ENOMEM);

        prog_adj->len = insn_adj_cnt;

        /* Patching happens in 3 steps:
         *
         * 1) Move over tail of insnsi from next instruction onwards,
         *    so we can patch the single target insn with one or more
         *    new ones (patching is always from 1 to n insns, n > 0).
         * 2) Inject new instructions at the target location.
         * 3) Adjust branch offsets if necessary.
         */
        /* Number of instructions that follow the patched one. */
        insn_rest = insn_adj_cnt - off - len;

        memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
                sizeof(*patch) * insn_rest);
        memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);

        /* We are guaranteed to not fail at this point, otherwise
         * the ship has sailed to reverse to the original state. An
         * overflow cannot happen at this point.
         */
        BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));

        bpf_adj_linfo(prog_adj, off, insn_delta);

        return prog_adj;
}

/* Drop @cnt consecutive instructions starting at index @off from @prog.
 *
 * The instructions behind the removed range are slid down over it,
 * prog->len is reduced by @cnt and the branch offsets are re-adjusted
 * afterwards. Branch offsets can't overflow when the program is
 * shrinking, so there is no need for a probing
 * bpf_adj_branches(..., true) call up front.
 *
 * Returns whatever bpf_adj_branches() returned; a non-zero value also
 * raises a one-time warning.
 */
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
        struct bpf_insn *hole = prog->insnsi + off;
        int ret;

        memmove(hole, hole + cnt,
                sizeof(struct bpf_insn) * (prog->len - off - cnt));
        prog->len -= cnt;

        ret = bpf_adj_branches(prog, off, off + cnt, off, false);
        WARN_ON_ONCE(ret);
        return ret;
}

/* Call bpf_prog_kallsyms_del() on each of the fp->aux->func_cnt entries
 * of fp->aux->func[].
 */
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
        int idx = 0;

        while (idx < fp->aux->func_cnt) {
                bpf_prog_kallsyms_del(fp->aux->func[idx]);
                idx++;
        }
}

/* Remove the kallsyms entries of all programs listed in fp->aux->func[]
 * first, then the entry of @fp itself.
 */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
{
        bpf_prog_kallsyms_del_subprogs(fp);
        bpf_prog_kallsyms_del(fp);
}

#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
/* Both of these default to 1 only when CONFIG_BPF_JIT_DEFAULT_ON is built in. */
int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
/* No explicit initializer, so this starts out as 0. */
int bpf_jit_harden   __read_mostly;
/* Set up at init time by bpf_jit_charge_init(). bpf_jit_charge_modmem()
 * compares the charged page count against bpf_jit_limit >> PAGE_SHIFT.
 */
long bpf_jit_limit   __read_mostly;
/* Receives the value of bpf_jit_alloc_exec_limit() in bpf_jit_charge_init(). */
long bpf_jit_limit_max __read_mostly;

static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
        const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
        unsigned long addr = (unsigned long)hdr;

        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));

        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
        prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE;
}

static void
bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
        char *sym = prog->aux->ksym.name;
        const char *end = sym + KSYM_NAME_LEN;
        const struct btf_type *type;
        const char *func_name;

        BUILD_BUG_ON(sizeof("bpf_prog_") +
                     sizeof(prog->tag) * 2 +
                     /* name has been null terminated.
                      * We should need +1 for the '_' preceding
                      * the name.  However, the null character
                      * is double counted between the name and the
                      * sizeof("bpf_prog_") above, so we omit
                      * the +1 here.
                      */
                     sizeof(prog->aux->name) > KSYM_NAME_LEN);

        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));

        /* prog->aux->name will be ignored if full btf name is available */
        if (prog->aux->func_info_cnt) {
                type = btf_type_by_id(prog->aux->btf,
                                      prog->aux->func_info[prog->aux->func_idx].type_id);
                func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
                snprintf(sym, (size_t)(end - sym), "_%s", func_name);
                return;
        }

        if (prog->aux->name[0])
                snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
        else
                *sym = 0;
}

static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
        return container_of(n, struct bpf_ksym, tnode)->start;
}

/* Ordering predicate for the latch tree: @a sorts before @b when its
 * ksym starts at a lower address.
 */
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
                                          struct latch_tree_node *b)
{
        unsigned long start_a = bpf_get_ksym_start(a);
        unsigned long start_b = bpf_get_ksym_start(b);

        return start_a < start_b;
}

/* Lookup comparator for the latch tree. @key carries an address.
 *
 * Returns -1 when the address lies below the ksym's start, 1 when it
 * lies above the ksym's end, and 0 when it falls within [start, end].
 */
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
        const struct bpf_ksym *ksym = container_of(n, struct bpf_ksym, tnode);
        unsigned long addr = (unsigned long)key;

        if (addr < ksym->start)
                return -1;

        /* The end address itself still counts as a match (> rather than
         * >=), so that a return address is detected as part of the
         * program when the final instruction is a call for a program
         * part of the stack trace.
         */
        return addr > ksym->end ? 1 : 0;
}

/* Callbacks passed along with bpf_tree to the latch tree helpers:
 * .less orders nodes by ksym start address, .comp matches an address
 * key against a node's [start, end] range.
 */
static const struct latch_tree_ops bpf_tree_ops = {
        .less        = bpf_tree_less,
        .comp        = bpf_tree_comp,
};

/* Held (via spin_lock_bh) around insertions into and removals from
 * bpf_kallsyms and bpf_tree.
 */
static DEFINE_SPINLOCK(bpf_lock);
/* List of registered ksyms; bpf_get_kallsym() walks it under rcu_read_lock(). */
static LIST_HEAD(bpf_kallsyms);
/* Latch tree holding the same ksyms; bpf_ksym_find() searches it by address. */
static struct latch_tree_root bpf_tree __cacheline_aligned;

/* Register @ksym: append it to the bpf_kallsyms list and insert it into
 * bpf_tree, both under bpf_lock. Warns once if @ksym's list node is not
 * empty on entry.
 */
void bpf_ksym_add(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        WARN_ON_ONCE(!list_empty(&ksym->lnode));
        list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
        latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
        spin_unlock_bh(&bpf_lock);
}

/* Take @ksym out of bpf_tree and off the bpf_kallsyms list. Nothing is
 * done when its list node is empty. Locking is left to the caller;
 * bpf_ksym_del() wraps this in bpf_lock.
 */
static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
        if (!list_empty(&ksym->lnode)) {
                latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
                list_del_rcu(&ksym->lnode);
        }
}

/* Unregister @ksym while holding bpf_lock; see __bpf_ksym_del(). */
void bpf_ksym_del(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        __bpf_ksym_del(ksym);
        spin_unlock_bh(&bpf_lock);
}

static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
{
        return fp->jited && !bpf_prog_was_classic(fp);
}

static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
        return list_empty(&fp->aux->ksym.lnode) ||
               fp->aux->ksym.lnode.prev == LIST_POISON2;
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp) ||
            !bpf_capable())
                return;

        bpf_prog_ksym_set_addr(fp);
        bpf_prog_ksym_set_name(fp);
        fp->aux->ksym.prog = true;

        bpf_ksym_add(&fp->aux->ksym);
}

/* Unregister @fp's ksym; a no-op for programs that are not kallsyms
 * candidates.
 */
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
        if (bpf_prog_kallsyms_candidate(fp))
                bpf_ksym_del(&fp->aux->ksym);
}

/* Look @addr up in bpf_tree. Returns the ksym whose range contains it,
 * or NULL when there is none.
 */
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
        struct latch_tree_node *node;

        node = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
        if (!node)
                return NULL;

        return container_of(node, struct bpf_ksym, tnode);
}

/* Resolve @addr to a BPF symbol.
 *
 * @size: if non-NULL, receives end - start of the matching ksym
 * @off:  if non-NULL, receives the offset of @addr from the ksym's start
 * @sym:  buffer receiving the symbol name; strncpy() writes
 *        KSYM_NAME_LEN bytes into it
 *
 * The lookup and the copy run inside an RCU read-side section.
 * Returns @sym on a hit and NULL when no ksym covers @addr.
 */
const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym)
{
        unsigned long symbol_start, symbol_end;
        struct bpf_ksym *ksym;
        char *ret = NULL;

        rcu_read_lock();
        ksym = bpf_ksym_find(addr);
        if (!ksym)
                goto unlock;

        symbol_start = ksym->start;
        symbol_end = ksym->end;

        strncpy(sym, ksym->name, KSYM_NAME_LEN);
        ret = sym;

        if (size)
                *size = symbol_end - symbol_start;
        if (off)
                *off  = addr - symbol_start;
unlock:
        rcu_read_unlock();

        return ret;
}

bool is_bpf_text_address(unsigned long addr)
{
        bool ret;

        rcu_read_lock();
        ret = bpf_ksym_find(addr) != NULL;
        rcu_read_unlock();

        return ret;
}

/* Map @addr to the bpf_prog owning it. Returns NULL when no ksym covers
 * @addr or when the matching ksym is not flagged as a program.
 */
static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        struct bpf_ksym *ksym = bpf_ksym_find(addr);

        if (!ksym || !ksym->prog)
                return NULL;

        return container_of(ksym, struct bpf_prog_aux, ksym)->prog;
}

/* Find the exception table entry for @addr among BPF programs.
 *
 * Returns NULL when @addr belongs to no program or when the owning
 * program has no exception entries; otherwise the result of
 * search_extable() on that program's table. Runs under
 * rcu_read_lock().
 */
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct bpf_prog *prog;

        rcu_read_lock();
        prog = bpf_prog_ksym_find(addr);
        if (prog && prog->aux->num_exentries)
                entry = search_extable(prog->aux->extable,
                                       prog->aux->num_exentries, addr);
        rcu_read_unlock();

        return entry;
}

/* Fetch the @symnum'th entry (counting from 0) of the bpf_kallsyms list.
 *
 * On a hit, the name is copied into @sym (KSYM_NAME_LEN bytes are
 * written), *@value is set to the ksym's start address, *@type to
 * BPF_SYM_ELF_TYPE, and 0 is returned. Returns -ERANGE when JIT
 * kallsyms are not enabled or the list has no such entry.
 */
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym)
{
        struct bpf_ksym *ksym;
        unsigned int pos = 0;
        int ret = -ERANGE;

        if (!bpf_jit_kallsyms_enabled())
                return ret;

        rcu_read_lock();
        list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
                if (pos++ == symnum) {
                        strncpy(sym, ksym->name, KSYM_NAME_LEN);

                        *value = ksym->start;
                        *type  = BPF_SYM_ELF_TYPE;

                        ret = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return ret;
}

int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke)
{
        struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
        static const u32 poke_tab_max = 1024;
        u32 slot = prog->aux->size_poke_tab;
        u32 size = slot + 1;

        if (size > poke_tab_max)
                return -ENOSPC;
        if (poke->tailcall_target || poke->tailcall_target_stable ||
            poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
                return -EINVAL;

        switch (poke->reason) {
        case BPF_POKE_REASON_TAIL_CALL:
                if (!poke->tail_call.map)
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
        if (!tab)
                return -ENOMEM;

        memcpy(&tab[slot], poke, sizeof(*poke));
        prog->aux->size_poke_tab = size;
        prog->aux->poke_tab = tab;

        return slot;
}

/* Running total of pages charged via bpf_jit_charge_modmem() and not yet
 * given back via bpf_jit_uncharge_modmem().
 */
static atomic_long_t bpf_jit_current;

/* Default size of the address range used for deriving the JIT limit:
 * the modules area when MODULES_VADDR is defined, the vmalloc area
 * otherwise.
 *
 * Weak, so an arch's JIT compiler can override it if it has a custom,
 * dedicated BPF backend memory area, or if neither of the two ranges
 * applies.
 */
u64 __weak bpf_jit_alloc_exec_limit(void)
{
#ifdef MODULES_VADDR
        return MODULES_END - MODULES_VADDR;
#else
        return VMALLOC_END - VMALLOC_START;
#endif
}

/* Initcall deriving the JIT memory limits.
 *
 * bpf_jit_limit_max takes the value of bpf_jit_alloc_exec_limit(),
 * which is only used as a heuristic here. bpf_jit_limit becomes half
 * of that, rounded up to a multiple of PAGE_SIZE and capped at
 * LONG_MAX.
 */
static int __init bpf_jit_charge_init(void)
{
        u64 half;

        bpf_jit_limit_max = bpf_jit_alloc_exec_limit();

        half = round_up(bpf_jit_limit_max >> 1, PAGE_SIZE);
        bpf_jit_limit = min_t(u64, half, LONG_MAX);

        return 0;
}
pure_initcall(bpf_jit_charge_init);

/* Account @pages against the global JIT budget.
 *
 * The pages are added to bpf_jit_current first. If the new total
 * exceeds bpf_jit_limit (converted to pages) and the caller is not
 * bpf_capable(), the charge is rolled back and -EPERM is returned.
 * In every other case the charge stays and 0 is returned.
 */
int bpf_jit_charge_modmem(u32 pages)
{
        long total = atomic_long_add_return(pages, &bpf_jit_current);

        if (total > (bpf_jit_limit >> PAGE_SHIFT) && !bpf_capable()) {
                atomic_long_sub(pages, &bpf_jit_current);
                return -EPERM;
        }

        return 0;
}

/* Give @pages back to the global JIT budget by subtracting them from
 * bpf_jit_current.
 */
void bpf_jit_uncharge_modmem(u32 pages)
{
        atomic_long_sub(pages, &bpf_jit_current);
}

/* Weak default allocator for JIT image memory: hands @size straight to
 * module_alloc() and returns its result.
 */
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
        return module_alloc(size);
}

/* Weak default counterpart of bpf_jit_alloc_exec(): releases @addr via
 * module_memfree().
 */
void __weak bpf_jit_free_exec(void *addr)
{
        module_memfree(addr);
}

/* Allocate and prepare the memory for a JIT image.
 *
 * @proglen:            size of the program image in bytes
 * @image_ptr:          receives the address inside the allocation where
 *                      the image is to be placed
 * @alignment:          alignment of the image start; a warning is
 *                      raised once unless it is a power of two no
 *                      larger than BPF_IMAGE_ALIGNMENT
 * @bpf_fill_ill_insns: callback that fills the whole allocation with
 *                      illegal/arch-dependent instructions
 *
 * The allocation is rounded up to whole pages and charged against the
 * JIT budget before it is made. The image start is placed at a random,
 * @alignment-aligned offset into hdr->image.
 *
 * Returns the binary header, or NULL when charging or allocating
 * failed (a charge already taken is given back in the latter case).
 */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *header;
        u32 alloc_size, npages, slack, offset;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* Most of BPF filters are really small, but if some of them
         * fill a page, allow at least 128 extra bytes to insert a
         * random section of illegal instructions.
         */
        alloc_size = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
        npages = alloc_size / PAGE_SIZE;

        if (bpf_jit_charge_modmem(npages))
                return NULL;

        header = bpf_jit_alloc_exec(alloc_size);
        if (!header) {
                bpf_jit_uncharge_modmem(npages);
                return NULL;
        }

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(header, alloc_size);
        header->pages = npages;

        /* The random gap is bounded by the unused tail of the
         * allocation and by one page minus the header.
         */
        slack = min_t(unsigned int, alloc_size - (proglen + sizeof(*header)),
                      PAGE_SIZE - sizeof(*header));
        offset = (get_random_int() % slack) & ~(alignment - 1);

        /* Leave a random number of instructions before BPF code. */
        *image_ptr = &header->image[offset];

        return header;
}

/* Free a JIT binary and return its pages to the JIT budget. */
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
        /* The page count lives in the header itself, so it has to be
         * read before the memory is freed.
         */
        u32 pages = hdr->pages;

        bpf_jit_free_exec(hdr);
        bpf_jit_uncharge_modmem(pages);
}

/* Weak default for releasing a program together with its JIT image.
 *
 * For a JITed program the binary is freed first, and a one-time warning
 * fires if the program's ksym still appears to be linked into kallsyms.
 * The program itself is then released via bpf_prog_unlock_free().
 *
 * This symbol is only overridden by archs that have different
 * requirements than the usual eBPF JITs, f.e. when they only
 * implement cBPF JIT, do not set images read-only, etc.
 */
void __weak bpf_jit_free(struct bpf_prog *fp)
{
        if (fp->jited) {
                bpf_jit_binary_free(bpf_jit_binary_hdr(fp));

                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
        }

        bpf_prog_unlock_free(fp);
}

/* Work out the target address of call instruction @insn.
 *
 * @extra_pass:      whether this is an extra pass, in which case the
 *                   addresses in prog->aux->func[] are consulted
 * @func_addr:       receives the computed address
 * @func_addr_fixed: set to false when insn->src_reg is BPF_PSEUDO_CALL,
 *                   true otherwise
 *
 * Returns 0 on success. Returns -EINVAL for a BPF_PSEUDO_CALL in an
 * extra pass when prog->aux->func is unset or insn->off is not a valid
 * index into it; *@func_addr is left untouched in that case.
 */
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed)
{
        bool fixed = insn->src_reg != BPF_PSEUDO_CALL;
        s16 off = insn->off;
        s32 imm = insn->imm;
        u8 *target;

        *func_addr_fixed = fixed;

        if (fixed) {
                /* Address of a BPF helper call. Since part of the core
                 * kernel, it's always at a fixed location. __bpf_call_base
                 * and the helper with imm relative to it are both in core
                 * kernel.
                 */
                target = (u8 *)__bpf_call_base + imm;
        } else if (!extra_pass) {
                /* Place-holder address till the last pass has collected
                 * all addresses for JITed subprograms.
                 */
                target = NULL;
        } else if (prog->aux->func &&
                   off >= 0 && off < prog->aux->func_cnt) {
                /* Addresses are known by now, pick it up from prog->aux. */
                target = (u8 *)prog->aux->func[off]->bpf_func;
        } else {
                return -EINVAL;
        }

        *func_addr = (unsigned long)target;
        return 0;
}

/* Rewrite one instruction so that its immediate no longer appears
 * verbatim in the instruction stream.
 *
 * @from:      instruction to inspect
 * @aux:       saved copy of both halves of a BPF_LD | BPF_IMM | BPF_DW
 *             instruction; only read by the two ld64 cases below
 * @to_buff:   output buffer receiving the replacement sequence
 * @emit_zext: when set, the second ld64 half additionally emits
 *             BPF_ZEXT_REG() on AX
 *
 * For the opcodes handled in the switch, (imm_rnd ^ imm) is moved into
 * BPF_REG_AX and XORed with imm_rnd again, which recovers the original
 * immediate in AX; the original operation is then carried out with AX
 * taking the place of the immediate.
 *
 * Returns the number of instructions written to @to_buff; 0 means
 * @from was not rewritten.
 */
static int bpf_jit_blind_insn(const struct bpf_insn *from,
                              const struct bpf_insn *aux,
                              struct bpf_insn *to_buff,
                              bool emit_zext)
{
        struct bpf_insn *to = to_buff;
        u32 imm_rnd = get_random_int();
        s16 off;

        BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
        BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);

        /* Constraints on AX register:
         *
         * AX register is inaccessible from user space. It is mapped in
         * all JITs, and used here for constant blinding rewrites. It is
         * typically "stateless" meaning its contents are only valid within
         * the executed instruction, but not across several instructions.
         * There are a few exceptions however which are further detailed
         * below.
         *
         * Constant blinding is only used by JITs, not in the interpreter.
         * The interpreter uses AX in some occasions as a local temporary
         * register e.g. in DIV or MOD instructions.
         *
         * In restricted circumstances, the verifier can also use the AX
         * register for rewrites as long as they do not interfere with
         * the above cases!
         */
        /* Instructions that already involve AX are left alone. */
        if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
                goto out;

        /* A move of immediate 0 is replaced by XORing the destination
         * register with itself; a single instruction, no AX needed.
         */
        if (from->imm == 0 &&
            (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
             from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
                *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
                goto out;
        }

        /* Opcodes not listed below fall through the switch and are
         * returned as "not rewritten" (0 instructions).
         */
        switch (from->code) {
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU | BPF_OR  | BPF_K:
        case BPF_ALU | BPF_XOR | BPF_K:
        case BPF_ALU | BPF_MUL | BPF_K:
        case BPF_ALU | BPF_MOV | BPF_K:
        case BPF_ALU | BPF_DIV | BPF_K:
        case BPF_ALU | BPF_MOD | BPF_K:
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
                break;

        case BPF_ALU64 | BPF_ADD | BPF_K:
        case BPF_ALU64 | BPF_SUB | BPF_K:
        case BPF_ALU64 | BPF_AND | BPF_K:
        case BPF_ALU64 | BPF_OR  | BPF_K:
        case BPF_ALU64 | BPF_XOR | BPF_K:
        case BPF_ALU64 | BPF_MUL | BPF_K:
        case BPF_ALU64 | BPF_MOV | BPF_K:
        case BPF_ALU64 | BPF_DIV | BPF_K:
        case BPF_ALU64 | BPF_MOD | BPF_K:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
                break;

        case BPF_JMP | BPF_JEQ  | BPF_K:
        case BPF_JMP | BPF_JNE  | BPF_K:
        case BPF_JMP | BPF_JGT  | BPF_K:
        case BPF_JMP | BPF_JLT  | BPF_K:
        case BPF_JMP | BPF_JGE  | BPF_K:
        case BPF_JMP | BPF_JLE  | BPF_K:
        case BPF_JMP | BPF_JSGT | BPF_K:
        case BPF_JMP | BPF_JSLT | BPF_K:
        case BPF_JMP | BPF_JSGE | BPF_K:
        case BPF_JMP | BPF_JSLE | BPF_K:
        case BPF_JMP | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump:
                 * the jump is now preceded by the two AX setup
                 * instructions emitted below.
                 */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
                break;

        case BPF_JMP32 | BPF_JEQ  | BPF_K:
        case BPF_JMP32 | BPF_JNE  | BPF_K:
        case BPF_JMP32 | BPF_JGT  | BPF_K:
        case BPF_JMP32 | BPF_JLT  | BPF_K:
        case BPF_JMP32 | BPF_JGE  | BPF_K:
        case BPF_JMP32 | BPF_JLE  | BPF_K:
        case BPF_JMP32 | BPF_JSGT | BPF_K:
        case BPF_JMP32 | BPF_JSLT | BPF_K:
        case BPF_JMP32 | BPF_JSGE | BPF_K:
        case BPF_JMP32 | BPF_JSLE | BPF_K:
        case BPF_JMP32 | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
                                      off);
                break;

        case BPF_LD | BPF_IMM | BPF_DW:
                /* Part 1: the immediate of the second slot (aux[1].imm)
                 * is unblinded in AX, shifted into the upper 32 bits
                 * and moved into the destination register.
                 */
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
                *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
                break;
        case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
                /* The immediate of the first slot (aux[0].imm) is
                 * unblinded with 32 bit ops and ORed into the
                 * destination register.
                 */
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                if (emit_zext)
                        *to++ = BPF_ZEXT_REG(BPF_REG_AX);
                *to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
                break;

        case BPF_ST | BPF_MEM | BPF_DW:
        case BPF_ST | BPF_MEM | BPF_W:
        case BPF_ST | BPF_MEM | BPF_H:
        case BPF_ST | BPF_MEM | BPF_B:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;
        }
out:
        return to - to_buff;
}

/* Create a byte-for-byte copy of @fp_other.
 *
 * The copy spans fp_other->pages pages and is allocated with
 * GFP_KERNEL | __GFP_ZERO plus @gfp_extra_flags. Returns NULL when the
 * allocation fails.
 */
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
                                              gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
        struct bpf_prog *copy;

        copy = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
        if (copy == NULL)
                return NULL;

        /* aux->prog still points to the fp_other one, so
         * when promoting the clone to the real program,
         * this still needs to be adapted.
         */
        memcpy(copy, fp_other, fp_other->pages * PAGE_SIZE);

        return copy;
}

/* Free one of the two program copies without touching the shared aux:
 * the aux pointer is cleared before the program is handed to
 * __bpf_prog_free().
 */
static void bpf_prog_clone_free(struct bpf_prog *fp)
{
        /* aux was stolen by the other clone, so we cannot free
         * it from this path! It will be freed eventually by the
         * other program on release.
         *
         * At this point, we don't need a deferred release since
         * clone is guaranteed to not be locked.
         */
        fp->aux = NULL;
        __bpf_prog_free(fp);
}

/* Keep @fp and dispose of @fp_other: @fp becomes the owner recorded in
 * the aux data, and @fp_other is freed without its aux.
 */
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
{
        /* We have to repoint aux->prog to self, as we don't
         * know whether fp here is the clone or the original.
         */
        fp->aux->prog = fp;
        bpf_prog_clone_free(fp_other);
}

/* Produce a copy of @prog in which the instructions handled by
 * bpf_jit_blind_insn() have been rewritten.
 *
 * Returns:
 *   @prog itself        when blinding is not enabled for it or it is
 *                       already marked as blinded
 *   ERR_PTR(-ENOMEM)    when the clone could not be allocated
 *   an error pointer    passed on from bpf_patch_insn_single(); the
 *                       clone has been released and @prog's aux
 *                       repointed to @prog by then
 *   the rewritten clone otherwise, with ->blinded set
 */
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
        struct bpf_insn insn_buff[16], aux[2];
        struct bpf_prog *clone, *tmp;
        int insn_delta, insn_cnt;
        struct bpf_insn *insn;
        int i, rewritten;

        if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
                return prog;

        clone = bpf_prog_clone_create(prog, GFP_USER);
        if (!clone)
                return ERR_PTR(-ENOMEM);

        insn_cnt = clone->len;
        insn = clone->insnsi;

        for (i = 0; i < insn_cnt; i++, insn++) {
                /* We temporarily need to hold the original ld64 insn
                 * so that we can still access the first part in the
                 * second blinding run.
                 */
                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    insn[1].code == 0)
                        memcpy(aux, insn, sizeof(aux));

                /* rewritten is the length of the replacement sequence
                 * now in insn_buff; 0 means keep the insn as is.
                 */
                rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
                                                clone->aux->verifier_zext);
                if (!rewritten)
                        continue;

                tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
                if (IS_ERR(tmp)) {
                        /* Patching may have repointed aux->prog during
                         * realloc from the original one, so we need to
                         * fix it up here on error.
                         */
                        bpf_jit_prog_release_other(prog, clone);
                        return tmp;
                }

                /* Patching may have returned a different program, so
                 * the insn pointer is recomputed from it below.
                 */
                clone = tmp;
                insn_delta = rewritten - 1;

                /* Walk new program and skip insns we just inserted. */
                insn = clone->insnsi + i + insn_delta;
                insn_cnt += insn_delta;
                i        += insn_delta;
        }

        clone->blinded = 1;
        return clone;
}
#endif /* CONFIG_BPF_JIT */

/* Base function for offset calculation. Needs to go into .text section,
 * therefore keeping it non-static as well; will also be used by JITs
 * anyway later on, so do not let the compiler omit it. This also needs
 * to go into kallsyms for correlation from e.g. bpftool, so naming
 * must not change.
 *
 * The function ignores its five arguments and always returns 0.
 * bpf_jit_get_func_addr() adds an instruction's imm to the address of
 * this function to obtain the address of a helper.
 */
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
        return 0;
}
EXPORT_SYMBOL_GPL(__bpf_call_base);

/* All UAPI available opcodes.
 *
 * The caller supplies two macros, INSN_2(class, op) and
 * INSN_3(class, op, modifier), and gets back one comma-separated entry
 * per opcode. In this file the list is expanded into designated
 * initializers, both for the table in bpf_opcode_in_insntable() and for
 * the interpreter's jump table.
 */
#define BPF_INSN_MAP(INSN_2, INSN_3)                \
        /* 32 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU, ADD,  X),                        \
        INSN_3(ALU, SUB,  X),                        \
        INSN_3(ALU, AND,  X),                        \
        INSN_3(ALU, OR,   X),                        \
        INSN_3(ALU, LSH,  X),                        \
        INSN_3(ALU, RSH,  X),                        \
        INSN_3(ALU, XOR,  X),                        \
        INSN_3(ALU, MUL,  X),                        \
        INSN_3(ALU, MOV,  X),                        \
        INSN_3(ALU, ARSH, X),                        \
        INSN_3(ALU, DIV,  X),                        \
        INSN_3(ALU, MOD,  X),                        \
        INSN_2(ALU, NEG),                        \
        INSN_3(ALU, END, TO_BE),                \
        INSN_3(ALU, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU, ADD,  K),                        \
        INSN_3(ALU, SUB,  K),                        \
        INSN_3(ALU, AND,  K),                        \
        INSN_3(ALU, OR,   K),                        \
        INSN_3(ALU, LSH,  K),                        \
        INSN_3(ALU, RSH,  K),                        \
        INSN_3(ALU, XOR,  K),                        \
        INSN_3(ALU, MUL,  K),                        \
        INSN_3(ALU, MOV,  K),                        \
        INSN_3(ALU, ARSH, K),                        \
        INSN_3(ALU, DIV,  K),                        \
        INSN_3(ALU, MOD,  K),                        \
        /* 64 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU64, ADD,  X),                        \
        INSN_3(ALU64, SUB,  X),                        \
        INSN_3(ALU64, AND,  X),                        \
        INSN_3(ALU64, OR,   X),                        \
        INSN_3(ALU64, LSH,  X),                        \
        INSN_3(ALU64, RSH,  X),                        \
        INSN_3(ALU64, XOR,  X),                        \
        INSN_3(ALU64, MUL,  X),                        \
        INSN_3(ALU64, MOV,  X),                        \
        INSN_3(ALU64, ARSH, X),                        \
        INSN_3(ALU64, DIV,  X),                        \
        INSN_3(ALU64, MOD,  X),                        \
        INSN_2(ALU64, NEG),                        \
        /*   Immediate based. */                \
        INSN_3(ALU64, ADD,  K),                        \
        INSN_3(ALU64, SUB,  K),                        \
        INSN_3(ALU64, AND,  K),                        \
        INSN_3(ALU64, OR,   K),                        \
        INSN_3(ALU64, LSH,  K),                        \
        INSN_3(ALU64, RSH,  K),                        \
        INSN_3(ALU64, XOR,  K),                        \
        INSN_3(ALU64, MUL,  K),                        \
        INSN_3(ALU64, MOV,  K),                        \
        INSN_3(ALU64, ARSH, K),                        \
        INSN_3(ALU64, DIV,  K),                        \
        INSN_3(ALU64, MOD,  K),                        \
        /* Call instruction. */                        \
        INSN_2(JMP, CALL),                        \
        /* Exit instruction. */                        \
        INSN_2(JMP, EXIT),                        \
        /* 32-bit Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP32, JEQ,  X),                        \
        INSN_3(JMP32, JNE,  X),                        \
        INSN_3(JMP32, JGT,  X),                        \
        INSN_3(JMP32, JLT,  X),                        \
        INSN_3(JMP32, JGE,  X),                        \
        INSN_3(JMP32, JLE,  X),                        \
        INSN_3(JMP32, JSGT, X),                        \
        INSN_3(JMP32, JSLT, X),                        \
        INSN_3(JMP32, JSGE, X),                        \
        INSN_3(JMP32, JSLE, X),                        \
        INSN_3(JMP32, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP32, JEQ,  K),                        \
        INSN_3(JMP32, JNE,  K),                        \
        INSN_3(JMP32, JGT,  K),                        \
        INSN_3(JMP32, JLT,  K),                        \
        INSN_3(JMP32, JGE,  K),                        \
        INSN_3(JMP32, JLE,  K),                        \
        INSN_3(JMP32, JSGT, K),                        \
        INSN_3(JMP32, JSLT, K),                        \
        INSN_3(JMP32, JSGE, K),                        \
        INSN_3(JMP32, JSLE, K),                        \
        INSN_3(JMP32, JSET, K),                        \
        /* Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP, JEQ,  X),                        \
        INSN_3(JMP, JNE,  X),                        \
        INSN_3(JMP, JGT,  X),                        \
        INSN_3(JMP, JLT,  X),                        \
        INSN_3(JMP, JGE,  X),                        \
        INSN_3(JMP, JLE,  X),                        \
        INSN_3(JMP, JSGT, X),                        \
        INSN_3(JMP, JSLT, X),                        \
        INSN_3(JMP, JSGE, X),                        \
        INSN_3(JMP, JSLE, X),                        \
        INSN_3(JMP, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP, JEQ,  K),                        \
        INSN_3(JMP, JNE,  K),                        \
        INSN_3(JMP, JGT,  K),                        \
        INSN_3(JMP, JLT,  K),                        \
        INSN_3(JMP, JGE,  K),                        \
        INSN_3(JMP, JLE,  K),                        \
        INSN_3(JMP, JSGT, K),                        \
        INSN_3(JMP, JSLT, K),                        \
        INSN_3(JMP, JSGE, K),                        \
        INSN_3(JMP, JSLE, K),                        \
        INSN_3(JMP, JSET, K),                        \
        INSN_2(JMP, JA),                        \
        /* Store instructions. */                \
        /*   Register based. */                        \
        INSN_3(STX, MEM,  B),                        \
        INSN_3(STX, MEM,  H),                        \
        INSN_3(STX, MEM,  W),                        \
        INSN_3(STX, MEM,  DW),                        \
        INSN_3(STX, XADD, W),                        \
        INSN_3(STX, XADD, DW),                        \
        /*   Immediate based. */                \
        INSN_3(ST, MEM, B),                        \
        INSN_3(ST, MEM, H),                        \
        INSN_3(ST, MEM, W),                        \
        INSN_3(ST, MEM, DW),                        \
        /* Load instructions. */                \
        /*   Register based. */                        \
        INSN_3(LDX, MEM, B),                        \
        INSN_3(LDX, MEM, H),                        \
        INSN_3(LDX, MEM, W),                        \
        INSN_3(LDX, MEM, DW),                        \
        /*   Immediate based. */                \
        INSN_3(LD, IMM, DW)

/* Tell whether opcode byte @code is part of the public instruction set.
 *
 * The answer comes from a 256-entry lookup table indexed by @code. The
 * table defaults to false and is set to true for every opcode listed
 * in BPF_INSN_MAP plus the BPF_LD | BPF_ABS / BPF_IND loads of size B,
 * H and W.
 */
bool bpf_opcode_in_insntable(u8 code)
{
/* Turn each BPF_INSN_MAP entry into "[opcode] = true". */
#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
        static const bool public_insntable[256] = {
                [0 ... 255] = false,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
                /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
                [BPF_LD | BPF_ABS | BPF_B] = true,
                [BPF_LD | BPF_ABS | BPF_H] = true,
                [BPF_LD | BPF_ABS | BPF_W] = true,
                [BPF_LD | BPF_IND | BPF_B] = true,
                [BPF_LD | BPF_IND | BPF_H] = true,
                [BPF_LD | BPF_IND | BPF_W] = true,
        };
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
        return public_insntable[code];
}

#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/* Weak default: nothing is read from @unsafe_ptr. The @size bytes at
 * @dst are zeroed and -EFAULT is returned (converted to the u64 return
 * type).
 */
u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
{
        memset(dst, 0, size);
        return -EFAULT;
}

/**
 *        ___bpf_prog_run - run eBPF program on a given context
 *        @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
 *        @insn: is the array of eBPF instructions
 *        @stack: is the eBPF storage stack
 *
 * Decode and execute eBPF instructions.
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
        static const void * const jumptable[256] __annotate_jump_table = {
                [0 ... 255] = &&default_label,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
                /* Non-UAPI available opcodes. */
                [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
                [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
                [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
                [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
                [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
                [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
                [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
        };
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
        u32 tail_call_cnt = 0;

#define CONT         ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
        goto *jumptable[insn->code];

        /* Explicitly mask the register-based shift amounts with 63 or 31
         * to avoid undefined behavior. Normally this won't affect the
         * generated code, for example, in case of native 64 bit archs such
         * as x86-64 or arm64, the compiler is optimizing the AND away for
         * the interpreter. In case of JITs, each of the JIT backends compiles
         * the BPF shift operations to machine instructions which produce
         * implementation-defined results in such a case; the resulting
         * contents of the register may be arbitrary, but program behaviour
         * as a whole remains defined. In other words, in case of JIT backends,
         * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
         */
        /* ALU (shifts) */
#define SHT(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP (SRC & 63);                \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP ((u32) SRC & 31);        \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        /* ALU (rest) */
#define ALU(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP SRC;                        \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP (u32) SRC;                \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        ALU(ADD,  +)
        ALU(SUB,  -)
        ALU(AND,  &)
        ALU(OR,   |)
        ALU(XOR,  ^)
        ALU(MUL,  *)
        SHT(LSH, <<)
        SHT(RSH, >>)
#undef SHT
#undef ALU
        ALU_NEG:
                DST = (u32) -DST;
                CONT;
        ALU64_NEG:
                DST = -DST;
                CONT;
        ALU_MOV_X:
                DST = (u32) SRC;
                CONT;
        ALU_MOV_K:
                DST = (u32) IMM;
                CONT;
        ALU64_MOV_X:
                DST = SRC;
                CONT;
        ALU64_MOV_K:
                DST = IMM;
                CONT;
        LD_IMM_DW:
                DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
                insn++;
                CONT;
        ALU_ARSH_X:
                DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
                CONT;
        ALU_ARSH_K:
                DST = (u64) (u32) (((s32) DST) >> IMM);
                CONT;
        ALU64_ARSH_X:
                (*(s64 *) &DST) >>= (SRC & 63);
                CONT;
        ALU64_ARSH_K:
                (*(s64 *) &DST) >>= IMM;
                CONT;
        ALU64_MOD_X:
                div64_u64_rem(DST, SRC, &AX);
                DST = AX;
                CONT;
        ALU_MOD_X:
                AX = (u32) DST;
                DST = do_div(AX, (u32) SRC);
                CONT;
        ALU64_MOD_K:
                div64_u64_rem(DST, IMM, &AX);
                DST = AX;
                CONT;
        ALU_MOD_K:
                AX = (u32) DST;
                DST = do_div(AX, (u32) IMM);
                CONT;
        ALU64_DIV_X:
                DST = div64_u64(DST, SRC);
                CONT;
        ALU_DIV_X:
                AX = (u32) DST;
                do_div(AX, (u32) SRC);
                DST = (u32) AX;
                CONT;
        ALU64_DIV_K:
                DST = div64_u64(DST, IMM);
                CONT;
        ALU_DIV_K:
                AX = (u32) DST;
                do_div(AX, (u32) IMM);
                DST = (u32) AX;
                CONT;
        ALU_END_TO_BE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_be16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_be32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_be64(DST);
                        break;
                }
                CONT;
        ALU_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_le16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_le32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_le64(DST);
                        break;
                }
                CONT;

        /* CALL */
        JMP_CALL:
                /* Function call scratches BPF_R1-BPF_R5 registers,
                 * preserves BPF_R6-BPF_R9, and stores return value
                 * into BPF_R0.
                 */
                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                                                       BPF_R4, BPF_R5);
                CONT;

        JMP_CALL_ARGS:
                BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
                                                            BPF_R3, BPF_R4,
                                                            BPF_R5,
                                                            insn + insn->off + 1);
                CONT;

        JMP_TAIL_CALL: {
                struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
                struct bpf_array *array = container_of(map, struct bpf_array, map);
                struct bpf_prog *prog;
                u32 index = BPF_R3;

                if (unlikely(index >= array->map.max_entries))
                        goto out;
                if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
                        goto out;

                tail_call_cnt++;

                prog = READ_ONCE(array->ptrs[index]);
                if (!prog)
                        goto out;

                /* ARG1 at this point is guaranteed to point to CTX from
                 * the verifier side due to the fact that the tail call is
                 * handled like a helper, that is, bpf_tail_call_proto,
                 * where arg1_type is ARG_PTR_TO_CTX.
                 */
                insn = prog->insnsi;
                goto select_insn;
out:
                CONT;
        }
        JMP_JA:
                insn += insn->off;
                CONT;
        JMP_EXIT:
                return BPF_R0;
        /* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP)                                \
        JMP_##OPCODE##_X:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_X:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP_##OPCODE##_K:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_K:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;
        COND_JMP(u, JEQ, ==)
        COND_JMP(u, JNE, !=)
        COND_JMP(u, JGT, >)
        COND_JMP(u, JLT, <)
        COND_JMP(u, JGE, >=)
        COND_JMP(u, JLE, <=)
        COND_JMP(u, JSET, &)
        COND_JMP(s, JSGT, >)
        COND_JMP(s, JSLT, <)
        COND_JMP(s, JSGE, >=)
        COND_JMP(s, JSLE, <=)
#undef COND_JMP
        /* ST, STX and LDX*/
        ST_NOSPEC:
                /* Speculation barrier for mitigating Speculative Store Bypass.
                 * In case of arm64, we rely on the firmware mitigation as
                 * controlled via the ssbd kernel parameter. Whenever the
                 * mitigation is enabled, it works for all of the kernel code
                 * with no need to provide any additional instructions here.
                 * In case of x86, we use 'lfence' insn for mitigation. We
                 * reuse preexisting logic from Spectre v1 mitigation that
                 * happens to produce the required code on x86 for v4 as well.
                 */
                barrier_nospec();
                CONT;
#define LDST(SIZEOP, SIZE)                                                \
        STX_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;        \
                CONT;                                                        \
        ST_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;        \
                CONT;                                                        \
        LDX_MEM_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEM_##SIZEOP:                                                \
                bpf_probe_read_kernel(&DST, sizeof(SIZE),                \
                                      (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDST(B,   u8)
        LDST(H,  u16)
        LDST(W,  u32)
        LDST(DW, u64)
#undef LDST

        STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
                atomic_add((u32) SRC, (atomic_t *)(unsigned long)
                           (DST + insn->off));
                CONT;
        STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
                atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
                             (DST + insn->off));
                CONT;

        default_label:
                /* If we ever reach this, we have a bug somewhere. Die hard here
                 * instead of just returning 0; we could be somewhere in a subprog,
                 * so execution could continue otherwise which we do /not/ want.
                 *
                 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
                 */
                pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
                BUG_ON(1);
                return 0;
}

/* Name of the interpreter entry point generated for a given stack size. */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
/* Generate an entry point taking (ctx, insn). It places stack_size bytes of
 * BPF stack and the register file in local arrays, sets FP to the address
 * just past the end of that stack, stores ctx in ARG1 and hands over to
 * ___bpf_prog_run().
 */
#define DEFINE_BPF_PROG_RUN(stack_size) \
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG]; \
\
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        ARG1 = (u64) (unsigned long) ctx; \
        return ___bpf_prog_run(regs, insn, stack); \
}

/* Name of the five-argument interpreter entry point for a given stack size. */
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
/* Same as DEFINE_BPF_PROG_RUN(), except that the generated function receives
 * r1-r5 explicitly and loads them into BPF_R1-BPF_R5 instead of setting ARG1
 * from a context pointer; it returns the full u64 result.
 */
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
                                      const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG]; \
\
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        BPF_R1 = r1; \
        BPF_R2 = r2; \
        BPF_R3 = r3; \
        BPF_R4 = r4; \
        BPF_R5 = r5; \
        return ___bpf_prog_run(regs, insn, stack); \
}

/* EVALn(FN, a1, ..., an) expands to FN(a1) FN(a2) ... FN(an). */
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)

/* Instantiate one entry point per stack size from 32 to 512 bytes, in
 * 32-byte steps (16 variants of each flavour).
 */
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);

EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);

/* PROG_NAME_LIST() turns a stack size into one array initializer element. */
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),

/* Entry points taking (ctx, insn), ordered by increasing stack size so that
 * slot i holds the variant with (i + 1) * 32 bytes of stack.
 */
static unsigned int (*interpreters[])(const void *ctx,
                                      const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
/* Redefine the list helper to name the five-argument variants instead. */
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
/* Five-argument entry points, in the same stack-size order as above. */
static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
                                  const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST

/* Turn @insn into a BPF_JMP | BPF_CALL_ARGS instruction: the old immediate
 * is moved into the off field, and the new immediate becomes the distance
 * from __bpf_call_base_args to the interpreters_args[] entry matching
 * @stack_depth (rounded up to a multiple of 32, minimum 1 byte).
 */
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
        u32 depth = max_t(u32, stack_depth, 1);
        u32 slot = (round_up(depth, 32) / 32) - 1;

        insn->off = (s16) insn->imm;
        insn->imm = interpreters_args[slot] - __bpf_call_base_args;
        insn->code = BPF_JMP | BPF_CALL_ARGS;
}

#else
/* Placeholder bpf_func used when the interpreter is compiled out. It ignores
 * both arguments, emits a one-time warning and returns 0.
 */
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
                                         const struct bpf_insn *insn)
{
        /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
         * is not working properly, so warn about it!
         */
        WARN_ON_ONCE(1);
        return 0;
}
#endif

/* Check whether @fp may be used with the program array @array.
 *
 * Programs with kprobe_override set are always refused. Otherwise, under the
 * owner lock, the first program seen records its type and jited state as the
 * array's owner; later programs are accepted only if both values match.
 */
bool bpf_prog_array_compatible(struct bpf_array *array,
                               const struct bpf_prog *fp)
{
        bool compatible = true;

        if (fp->kprobe_override)
                return false;

        spin_lock(&array->aux->owner.lock);
        if (array->aux->owner.type) {
                compatible = array->aux->owner.type  == fp->type &&
                             array->aux->owner.jited == fp->jited;
        } else {
                /* No owner recorded yet, so there is nothing to compare
                 * against: claim the array for this program's properties.
                 */
                array->aux->owner.type  = fp->type;
                array->aux->owner.jited = fp->jited;
        }
        spin_unlock(&array->aux->owner.lock);

        return compatible;
}

/* Walk the maps used by @fp (under used_maps_mutex) and verify that every
 * BPF_MAP_TYPE_PROG_ARRAY among them is compatible with the program.
 * Returns 0 on success, -EINVAL at the first incompatible array.
 */
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;
        int err = 0;
        int idx;

        mutex_lock(&aux->used_maps_mutex);
        for (idx = 0; idx < aux->used_map_cnt; idx++) {
                struct bpf_map *map = aux->used_maps[idx];
                struct bpf_array *array;

                if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
                        array = container_of(map, struct bpf_array, map);
                        if (!bpf_prog_array_compatible(array, fp)) {
                                err = -EINVAL;
                                break;
                        }
                }
        }
        mutex_unlock(&aux->used_maps_mutex);

        return err;
}

static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);

        fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
#else
        fp->bpf_func = __bpf_prog_ret0_warn;
#endif
}

/**
 *        bpf_prog_select_runtime - select exec runtime for BPF program
 *        @fp: bpf_prog populated with internal BPF program
 *        @err: pointer to error variable
 *
 * Try to JIT eBPF program, if JIT is not available, use interpreter.
 * The BPF program will be executed via BPF_PROG_RUN() macro.
 *
 * Return: the program to use from now on; this is whatever
 * bpf_int_jit_compile() handed back and may therefore differ from @fp.
 * *@err holds 0 or a negative errno.
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
        /* A program that already has bpf_func set went through BPF to
         * BPF call preparation in the verifier, JITing included, so
         * only the final steps remain.
         */
        if (fp->bpf_func)
                goto finalize;

        bpf_prog_select_func(fp);

        if (bpf_prog_is_dev_bound(fp->aux)) {
                *err = bpf_prog_offload_compile(fp);
                if (*err)
                        return fp;
        } else {
                /* eBPF JITs can rewrite the program in case constant
                 * blinding is active. However, in case of error during
                 * blinding, bpf_int_jit_compile() must always return a
                 * valid program, which in this case would simply not
                 * be JITed, but falls back to the interpreter.
                 */
                *err = bpf_prog_alloc_jited_linfo(fp);
                if (*err)
                        return fp;

                fp = bpf_int_jit_compile(fp);
                if (fp->jited) {
                        bpf_prog_free_unused_jited_linfo(fp);
                } else {
                        bpf_prog_free_jited_linfo(fp);
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
                        *err = -ENOTSUPP;
                        return fp;
#endif
                }
        }

finalize:
        bpf_prog_lock_ro(fp);

        /* The tail call compatibility check can only be done at
         * this late stage as we need to determine, if we deal
         * with JITed or non JITed program concatenations and not
         * all eBPF JITs might immediately support all features.
         */
        *err = bpf_check_tail_call(fp);

        return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);

/* bpf_func of the dummy program: ignores its arguments and returns 1. */
static unsigned int __bpf_prog_ret1(const void *ctx,
                                    const struct bpf_insn *insn)
{
        return 1;
}

/* Placeholder program whose bpf_func is __bpf_prog_ret1(). The prog-array
 * helpers in this file store its address in a slot to mark the slot as
 * deleted, and skip such slots when counting or copying programs.
 */
static struct bpf_prog_dummy {
        struct bpf_prog prog;
} dummy_bpf_prog = {
        .prog = {
                .bpf_func = __bpf_prog_ret1,
        },
};

/* To avoid allocating an empty bpf_prog_array for cgroups that don't
 * have a bpf program attached, use one global 'empty_prog_array'.
 * It will not be modified by the caller of bpf_prog_array_alloc()
 * (since the caller requested prog_cnt == 0).
 * That pointer should be 'freed' by bpf_prog_array_free(), which
 * recognises it and does nothing.
 */
static struct {
        struct bpf_prog_array hdr;
        struct bpf_prog *null_prog;
} empty_prog_array = {
        .null_prog = NULL,
};

/* Obtain a program array with room for @prog_cnt items plus one extra
 * (zeroed) item. A request for zero programs is served from the shared
 * static empty array rather than from the allocator. Returns NULL when the
 * allocation fails.
 */
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
        size_t bytes;

        if (!prog_cnt)
                return &empty_prog_array.hdr;

        bytes = sizeof(struct bpf_prog_array) +
                sizeof(struct bpf_prog_array_item) * (prog_cnt + 1);

        return kzalloc(bytes, flags);
}

void bpf_prog_array_free(struct bpf_prog_array *progs)
{
        if (!progs || progs == &empty_prog_array.hdr)
                return;
        kfree_rcu(progs, rcu);
}

/* Count the programs in @array up to the NULL terminator, leaving out
 * slots that hold the dummy program.
 */
int bpf_prog_array_length(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;
        u32 nr_real = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        nr_real++;
                cur++;
        }

        return nr_real;
}

/* True when @array holds nothing but dummy programs (or nothing at all)
 * before its NULL terminator.
 */
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        return false;
                cur++;
        }

        return true;
}

/* Store the ids of the non-dummy programs of @array into @prog_ids,
 * stopping once @request_cnt ids have been written.
 *
 * Returns true when the slot following the last one examined is not the
 * NULL terminator, i.e. the walk stopped before the end of the array.
 */
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
                                     u32 *prog_ids,
                                     u32 request_cnt)
{
        struct bpf_prog_array_item *cur = array->items;
        int copied = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog) {
                        prog_ids[copied++] = cur->prog->aux->id;
                        if (copied == request_cnt) {
                                /* Step past the item just copied so the
                                 * check below looks at its successor.
                                 */
                                cur++;
                                break;
                        }
                }
                cur++;
        }

        return cur->prog != NULL;
}

/* Copy up to @cnt program ids from @array to the user buffer @prog_ids,
 * going through a temporary kernel buffer.
 *
 * Returns 0 on success, -ENOMEM if the temporary buffer cannot be
 * allocated, -EFAULT if the copy to user space fails, and -ENOSPC if the
 * array held more than @cnt ids.
 */
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
                                __u32 __user *prog_ids, u32 cnt)
{
        unsigned long uncopied;
        bool truncated;
        u32 *kbuf;

        /* Callers first obtain cnt from bpf_prog_array_length() and only
         * call in here when it is greater than zero, so the allocation
         * below needs no separate cnt > 0 check.
         */
        kbuf = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
        if (!kbuf)
                return -ENOMEM;

        truncated = bpf_prog_array_copy_core(array, kbuf, cnt);
        uncopied = copy_to_user(prog_ids, kbuf, cnt * sizeof(u32));
        kfree(kbuf);

        if (uncopied)
                return -EFAULT;

        return truncated ? -ENOSPC : 0;
}

/* Replace the first occurrence of @old_prog in @array with the dummy
 * program. The array is left untouched when @old_prog is not present.
 */
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
                                struct bpf_prog *old_prog)
{
        struct bpf_prog_array_item *cur;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog != old_prog)
                        continue;

                WRITE_ONCE(cur->prog, &dummy_bpf_prog.prog);
                return;
        }
}

/**
 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
 *                                   index into the program array with
 *                                   a dummy no-op program.
 * @array: a bpf_prog_array
 * @index: the index of the program to replace
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to replace. Implemented as a call to
 * bpf_prog_array_update_at() with the dummy program as replacement.
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
{
        return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
}

/**
 * bpf_prog_array_update_at() - Updates the program at the given index
 *                              into the program array.
 * @array: a bpf_prog_array
 * @index: the index of the program to update
 * @prog: the program to insert into the array
 *
 * Dummy programs do not count towards @index: the slot that gets
 * overwritten is the one holding the @index'th non-dummy program.
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog)
{
        struct bpf_prog_array_item *cur;
        int remaining = index;

        if (unlikely(index < 0))
                return -EINVAL;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog == &dummy_bpf_prog.prog)
                        continue;

                /* remaining counts the real programs still to skip. */
                if (remaining-- == 0) {
                        WRITE_ONCE(cur->prog, prog);
                        return 0;
                }
        }

        return -ENOENT;
}

/* Build a new program array from @old_array with @exclude_prog dropped,
 * dummy programs dropped, and @include_prog appended at the end. Any of
 * @old_array, @exclude_prog and @include_prog may be NULL.
 *
 * On success *@new_array is the new array, or NULL when the result would
 * contain no program.
 *
 * Returns 0 on success, -EEXIST if @include_prog is already in @old_array,
 * -ENOENT if @exclude_prog was given but not found, -ENOMEM if the
 * allocation fails.
 */
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        struct bpf_prog_array **new_array)
{
        struct bpf_prog_array_item *src;
        struct bpf_prog_array *fresh;
        bool saw_exclude = false;
        int nr_carried = 0;
        int nr_total;
        int dst = 0;

        /* First pass: count the programs that survive into the new array. */
        if (old_array) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog) {
                                saw_exclude = true;
                                continue;
                        }
                        if (src->prog != &dummy_bpf_prog.prog)
                                nr_carried++;
                        if (src->prog == include_prog)
                                return -EEXIST;
                }
        }

        if (exclude_prog && !saw_exclude)
                return -ENOENT;

        /* Number of real (non-NULL) programs the new array will hold. */
        nr_total = nr_carried + (include_prog ? 1 : 0);
        if (!nr_total) {
                *new_array = NULL;
                return 0;
        }

        /* +1 as the end of prog_array is marked with NULL */
        fresh = bpf_prog_array_alloc(nr_total + 1, GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        /* Second pass: carry the survivors over, preserving their order. */
        if (nr_carried) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog ||
                            src->prog == &dummy_bpf_prog.prog)
                                continue;
                        fresh->items[dst++].prog = src->prog;
                }
        }
        if (include_prog)
                fresh->items[dst++].prog = include_prog;
        fresh->items[dst].prog = NULL;

        *new_array = fresh;
        return 0;
}

/* Report the number of programs in @array through @prog_cnt (0 for a NULL
 * array) and, if @request_cnt is non-zero and there is anything to copy,
 * store up to @request_cnt program ids into @prog_ids.
 *
 * Returns 0, or -ENOSPC when the id copy stopped before the end of the
 * array.
 */
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt)
{
        u32 nr = array ? bpf_prog_array_length(array) : 0;

        *prog_cnt = nr;

        /* Only the count was asked for, or there is nothing to copy. */
        if (!request_cnt || !nr)
                return 0;

        /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
        if (bpf_prog_array_copy_core(array, prog_ids, request_cnt))
                return -ENOSPC;

        return 0;
}

/* For each of the @len maps in @used_maps: call the map's map_poke_untrack
 * op with @aux if the map provides one, then drop a reference on the map.
 */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len)
{
        u32 idx;

        for (idx = 0; idx < len; idx++) {
                struct bpf_map *map = used_maps[idx];

                if (map->ops->map_poke_untrack)
                        map->ops->map_poke_untrack(map, aux);
                bpf_map_put(map);
        }
}

/* Release every map recorded in aux->used_maps, then free the used_maps
 * array itself.
 */
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
        __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
        kfree(aux->used_maps);
}

static void bpf_prog_free_deferred(struct work_struct *work)
{
        struct bpf_prog_aux *aux;
        int i;

        aux = container_of(work, struct bpf_prog_aux, work);
        bpf_free_used_maps(aux);
        if (bpf_prog_is_dev_bound(aux))
                bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
        if (aux->prog->has_callchain_buf)
                put_callchain_buffers();
#endif
        if (aux->dst_trampoline)
                bpf_trampoline_put(aux->dst_trampoline);
        for (i = 0; i < aux->func_cnt; i++) {
                /* We can just unlink the subprog poke descriptor table as
                 * it was originally linked to the main program and is also
                 * released along with it.
                 */
                aux->func[i]->aux->poke_tab = NULL;
                bpf_jit_free(aux->func[i]);
        }
        if (aux->func_cnt) {
                kfree(aux->func);
                bpf_prog_unlock_free(aux->prog);
        } else {
                bpf_jit_free(aux->prog);
        }
}

/* Free internal BPF program.
 *
 * Drops the reference on aux->dst_prog, if there is one, right away; the
 * rest of the teardown is handed to bpf_prog_free_deferred() through the
 * work item embedded in the program's aux.
 */
void bpf_prog_free(struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;

        if (aux->dst_prog)
                bpf_prog_put(aux->dst_prog);
        INIT_WORK(&aux->work, bpf_prog_free_deferred);
        schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);

/* RNG for unprivileged user space, with state kept separate from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);

/* Initialise the per-CPU bpf_user_rnd_state through prandom_init_once(). */
void bpf_user_rnd_init_once(void)
{
        prandom_init_once(&bpf_user_rnd_state);
}

/* Helper returning a pseudo-random u32 drawn from this CPU's
 * bpf_user_rnd_state.
 */
BPF_CALL_0(bpf_user_rnd_u32)
{
        /* This helper deliberately takes no arguments. It is called from
         * native eBPF as well as from classic-to-eBPF transformations,
         * and the two assign registers differently (f.e. classic always
         * sets fn(ctx, A, X) here), so anyone tempted to start using the
         * incoming registers needs to account for both.
         */
        struct rnd_state *rnd;
        u32 val;

        rnd = &get_cpu_var(bpf_user_rnd_state);
        val = prandom_u32_state(rnd);
        put_cpu_var(bpf_user_rnd_state);

        return val;
}

/* Helper returning raw_smp_processor_id() for the calling CPU. */
BPF_CALL_0(bpf_get_raw_cpu_id)
{
        return raw_smp_processor_id();
}

/* Weak definitions of helper functions in case we don't have bpf syscall.
 *
 * None of these carries an initializer, so each one is an all-zero
 * bpf_func_proto unless a non-weak definition of the same symbol is linked
 * in and takes its place.
 */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;

const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;

/*
 * Weak fallback for looking up the trace_printk helper prototype: this
 * version has nothing to offer and always returns NULL.  Being __weak, it
 * is replaced by any non-weak definition of the same symbol.
 */
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
        return NULL;
}

/*
 * Weak fallback for bpf_event_output(): ignores every argument and reports
 * that the operation is unsupported.
 *
 * Returns -ENOTSUPP.  Note that the return type is u64, so the negative
 * errno is delivered converted to an unsigned 64-bit value.
 */
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
        return -ENOTSUPP;
}
EXPORT_SYMBOL_GPL(bpf_event_output);

/* Always built-in helper functions. */

/*
 * Prototype describing the tail-call helper: not GPL-only, returns void,
 * and takes a context pointer, a constant map pointer and one unconstrained
 * third argument.
 * NOTE(review): .func is NULL, so no C implementation is attached through
 * this proto -- presumably the call is handled specially by the
 * interpreter/JIT; confirm at the users of this proto.
 */
const struct bpf_func_proto bpf_tail_call_proto = {
        .func                = NULL,
        .gpl_only        = false,
        .ret_type        = RET_VOID,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
 * It is encouraged to implement bpf_int_jit_compile() instead, so that
 * eBPF and implicitly also cBPF can get JITed!
 *
 * This weak default performs no compilation at all: it returns @prog
 * exactly as it was passed in.
 */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
        return prog;
}

/* Stub for JITs that support eBPF. All cBPF code gets transformed into
 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
 *
 * This weak default is an intentional no-op; @prog is left untouched.
 */
void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}

/*
 * Weak default for the "does helper @func change packet data?" query:
 * @func is not examined and the answer is always false.  A non-weak
 * definition of the same symbol takes precedence over this one.
 */
bool __weak bpf_helper_changes_pkt_data(void *func)
{
        return false;
}

/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
 * analysis code and wants explicit zero extension inserted by verifier.
 * Otherwise, return FALSE.
 *
 * This weak default always answers FALSE; a JIT backend that wants the
 * behaviour described above has to supply its own non-weak definition.
 */
bool __weak bpf_jit_needs_zext(void)
{
        return false;
}

/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 *
 * This fallback copies nothing: all four arguments are ignored and the
 * call always fails with -EFAULT.
 */
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
                         int len)
{
        return -EFAULT;
}

/*
 * Weak default for bpf_arch_text_poke(): performs no patching, ignores
 * @ip, @t, @addr1 and @addr2, and always returns -ENOTSUPP.
 * NOTE(review): presumably overridden by architectures that implement
 * text poking -- confirm in the arch code.
 */
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                              void *addr1, void *addr2)
{
        return -ENOTSUPP;
}

/*
 * Static key defined in its "false" state and exported to modules.
 * NOTE(review): presumably gates BPF run-time statistics collection --
 * confirm at the users of this key.
 */
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);

/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>

/* Export the xdp_exception and xdp_bulk_tx tracepoints (GPL-only). */
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);








































































































































































































































































































































































































































































































    1 







    1 
    1 





    1 















    1 
































































































































    1 

    1 



    1 
    1 







    1 




    1 




    1 












    1 







    1 
    1 
    1 
    1 













    1 
    1 
    1 

    1 

    1 



    1 
























































































    1 

    1 

    1 
    1 

    1 















    1 

    1 
    1 

    1 



    1 
    1 

    1 



    1 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















































































    1 

    1 


    1 



    1 

    1 

















































    2 










































































    2 



    2 







    2 




    3 



    3 




























































    3 
    3 
















    1 

































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/journal.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem journal-writing code; part of the ext2fs
 * journaling system.
 *
 * This file manages journals: areas of disk reserved for logging
 * transactional updates.  This includes the kernel journaling thread
 * which is responsible for scheduling updates to the log.
 *
 * We do not actually manage the physical storage of the journal in this
 * file: that is left to a per-journal policy function, which allows us
 * to store the journal within a filesystem-specified area for ext2
 * journaling (ext2 can use a reserved inode for storing the log).
 */

#include <linux/module.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/freezer.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>

#include <linux/uaccess.h>
#include <asm/page.h>

#ifdef CONFIG_JBD2_DEBUG
/*
 * Debug verbosity threshold: __jbd2_debug() drops any message whose level
 * is greater than this value.  It is exposed as the module parameter
 * "jbd2_debug" with permissions 0644.
 */
ushort jbd2_journal_enable_debug __read_mostly;
EXPORT_SYMBOL(jbd2_journal_enable_debug);

module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
#endif

/*
 * Symbols made available to other modules.  All of them use plain
 * EXPORT_SYMBOL(), i.e. none is restricted to GPL-only users.
 */

/* Handle/transaction and buffer-access entry points. */
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);

/* Journal setup, teardown, commit and per-inode entry points. */
EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);

/* Forward declaration of a file-local helper defined elsewhere in this file. */
static int jbd2_journal_create_slab(size_t slab_size);

#ifdef CONFIG_JBD2_DEBUG
void __jbd2_debug(int level, const char *file, const char *func,
                  unsigned int line, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (level > jbd2_journal_enable_debug)
                return;
        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
        va_end(args);
}
EXPORT_SYMBOL(__jbd2_debug);
#endif

/* Checksumming functions */
/*
 * Check the superblock's checksum type.
 *
 * Returns 1 when journal @j does not have the csum v2/v3 feature (nothing
 * to verify), otherwise non-zero only if @sb->s_checksum_type is
 * JBD2_CRC32C_CHKSUM.
 */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
        if (jbd2_journal_has_csum_v2or3_feature(j))
                return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;

        return 1;
}

/*
 * Compute the checksum of superblock @sb for journal @j.
 *
 * The s_checksum field is zeroed while the checksum is calculated over the
 * whole journal_superblock_t and is put back afterwards, so @sb is
 * unchanged on return.  The result is returned in big-endian form.
 */
static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
{
        __be32 saved_csum = sb->s_checksum;
        __u32 crc;

        sb->s_checksum = 0;
        crc = jbd2_chksum(j, ~0, (char *)sb, sizeof(*sb));
        sb->s_checksum = saved_csum;

        return cpu_to_be32(crc);
}

/*
 * Commit timer callback.
 *
 * @t is the j_commit_timer embedded in a journal; the owning journal is
 * recovered from it and the task recorded in its j_task is woken.
 */

static void commit_timeout(struct timer_list *t)
{
        journal_t *jrnl = from_timer(jrnl, t, j_commit_timer);
        struct task_struct *commit_thread = jrnl->j_task;

        wake_up_process(commit_thread);
}

/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk. If a fast commit is ongoing
 *    journal thread waits until it's done and then continues from
 *    there on.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 *
 * @arg: the journal_t this thread serves.
 *
 * The thread loops with j_state_lock held for writing, dropping it around
 * the commit, the freezer call and schedule().  It leaves the loop once
 * JBD2_UNMOUNT is set in j_flags, then clears j_task, wakes the
 * j_wait_done_commit waiters and returns 0.
 */

static int kjournald2(void *arg)
{
        journal_t *journal = arg;
        transaction_t *transaction;

        /*
         * Set up an interval timer which can be used to trigger a commit wakeup
         * after the commit interval expires
         */
        timer_setup(&journal->j_commit_timer, commit_timeout, 0);

        set_freezable();

        /*
         * Record that the journal thread is running.  The wake-up releases
         * jbd2_journal_start_thread(), which waits on j_wait_done_commit
         * for j_task to become non-NULL.
         */
        journal->j_task = current;
        wake_up(&journal->j_wait_done_commit);

        /*
         * Make sure that no allocations from this kernel thread will ever
         * recurse to the fs layer because we are responsible for the
         * transaction commit and any fs involvement might get stuck waiting for
         * the transaction commit.  The value returned by the call is not
         * kept; this function never restores the previous state.
         */
        memalloc_nofs_save();

        /*
         * And now, wait forever for commit wakeup events.
         */
        write_lock(&journal->j_state_lock);

loop:
        /* journal_kill_thread() sets JBD2_UNMOUNT to ask us to exit. */
        if (journal->j_flags & JBD2_UNMOUNT)
                goto end_loop;

        jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
                journal->j_commit_sequence, journal->j_commit_request);

        /*
         * A commit has been requested that is not done yet: drop the lock,
         * stop the commit timer, run the commit, then re-evaluate from the
         * top with the lock held again.
         */
        if (journal->j_commit_sequence != journal->j_commit_request) {
                jbd_debug(1, "OK, requests differ\n");
                write_unlock(&journal->j_state_lock);
                del_timer_sync(&journal->j_commit_timer);
                jbd2_journal_commit_transaction(journal);
                write_lock(&journal->j_state_lock);
                goto loop;
        }

        /* Nothing to commit right now: let j_wait_done_commit waiters run. */
        wake_up(&journal->j_wait_done_commit);
        if (freezing(current)) {
                /*
                 * The simpler the better. Flushing journal isn't a
                 * good idea, because that depends on threads that may
                 * be already stopped.
                 */
                jbd_debug(1, "Now suspending kjournald2\n");
                write_unlock(&journal->j_state_lock);
                try_to_freeze();
                write_lock(&journal->j_state_lock);
        } else {
                /*
                 * We assume on resume that commits are already there,
                 * so we don't sleep
                 */
                DEFINE_WAIT(wait);
                int should_sleep = 1;

                /*
                 * Queue ourselves on j_wait_commit first, then re-check
                 * every reason not to sleep: a pending commit request, an
                 * expired running transaction, or an unmount request.
                 */
                prepare_to_wait(&journal->j_wait_commit, &wait,
                                TASK_INTERRUPTIBLE);
                if (journal->j_commit_sequence != journal->j_commit_request)
                        should_sleep = 0;
                transaction = journal->j_running_transaction;
                if (transaction && time_after_eq(jiffies,
                                                transaction->t_expires))
                        should_sleep = 0;
                if (journal->j_flags & JBD2_UNMOUNT)
                        should_sleep = 0;
                if (should_sleep) {
                        /* Sleep without holding j_state_lock. */
                        write_unlock(&journal->j_state_lock);
                        schedule();
                        write_lock(&journal->j_state_lock);
                }
                finish_wait(&journal->j_wait_commit, &wait);
        }

        jbd_debug(1, "kjournald2 wakes\n");

        /*
         * Were we woken up by a commit wakeup event?  If the running
         * transaction has expired, request its commit by tid; the check at
         * the top of the loop then starts it.
         */
        transaction = journal->j_running_transaction;
        if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
                journal->j_commit_request = transaction->t_tid;
                jbd_debug(1, "woke because of timeout\n");
        }
        goto loop;

end_loop:
        /*
         * Exit path, entered with j_state_lock held: stop the timer,
         * clear j_task and wake journal_kill_thread(), which waits on
         * j_wait_done_commit for j_task to become NULL.
         */
        del_timer_sync(&journal->j_commit_timer);
        journal->j_task = NULL;
        wake_up(&journal->j_wait_done_commit);
        jbd_debug(1, "Journal thread exiting.\n");
        write_unlock(&journal->j_state_lock);
        return 0;
}

/*
 * Start the kjournald2 thread for @journal, named "jbd2/<j_devname>".
 *
 * Returns the PTR_ERR() of the failed kthread_run() on error.  On success,
 * waits on j_wait_done_commit until the new thread has recorded itself in
 * journal->j_task, then returns 0.
 */
static int jbd2_journal_start_thread(journal_t *journal)
{
        struct task_struct *thread = kthread_run(kjournald2, journal,
                                                 "jbd2/%s",
                                                 journal->j_devname);

        if (!IS_ERR(thread)) {
                wait_event(journal->j_wait_done_commit,
                           journal->j_task != NULL);
                return 0;
        }

        return PTR_ERR(thread);
}

/*
 * Ask the journal thread of @journal to exit and wait until it has.
 *
 * JBD2_UNMOUNT is set under j_state_lock; then, for as long as j_task is
 * still set, the lock is dropped, the thread is woken through
 * j_wait_commit and we wait on j_wait_done_commit for j_task to become
 * NULL before re-checking under the lock.
 */
static void journal_kill_thread(journal_t *journal)
{
        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_UNMOUNT;

        for (;;) {
                if (!journal->j_task)
                        break;

                write_unlock(&journal->j_state_lock);
                wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit,
                           journal->j_task == NULL);
                write_lock(&journal->j_state_lock);
        }

        write_unlock(&journal->j_state_lock);
}

/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * The function returns a pointer to the buffer_head to be used for IO.
 *
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                  struct journal_head  *jh_in,
                                  struct buffer_head **bh_out,
                                  sector_t blocknr)
{
        int need_copy_out = 0;  /* the data must be copied before it can be escaped */
        int done_copy_out = 0;  /* IO will use b_frozen_data rather than bh_in's own data */
        int do_escape = 0;      /* first word equals JBD2_MAGIC_NUMBER and must be zeroed */
        char *mapped_data;
        struct buffer_head *new_bh;
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
        journal_t *journal = transaction->t_journal;

        /*
         * The buffer really shouldn't be locked: only the current committing
         * transaction is allowed to write it, so nobody else is allowed
         * to do any IO.
         *
         * akpm: except if we're journalling data, and write() output is
         * also part of a shared mapping, and another thread has
         * decided to launch a writepage() against this buffer.
         */
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

        /* __GFP_NOFAIL is passed and the result is used without a NULL check */
        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

        /* keep subsequent assertions sane */
        atomic_set(&new_bh->b_count, 1);

        /*
         * b_state_lock is held from here to the end of the function, except
         * across the jbd2_alloc() call below.
         */
        spin_lock(&jh_in->b_state_lock);
repeat:
        /*
         * If a new transaction has already done a buffer copy-out, then
         * we use that version of the data for the commit.
         */
        if (jh_in->b_frozen_data) {
                done_copy_out = 1;
                new_page = virt_to_page(jh_in->b_frozen_data);
                new_offset = offset_in_page(jh_in->b_frozen_data);
        } else {
                /* no frozen copy: point at the original buffer's page and data */
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
        }

        mapped_data = kmap_atomic(new_page);
        /*
         * Fire data frozen trigger if data already wasn't frozen.  Do this
         * before checking for escaping, as the trigger may modify the magic
         * offset.  If a copy-out happens afterwards, it will have the correct
         * data in the buffer.
         */
        if (!done_copy_out)
                jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
                                           jh_in->b_triggers);

        /*
         * Check for escaping
         */
        if (*((__be32 *)(mapped_data + new_offset)) ==
                                cpu_to_be32(JBD2_MAGIC_NUMBER)) {
                need_copy_out = 1;
                do_escape = 1;
        }
        kunmap_atomic(mapped_data);

        /*
         * Do we need to do a data copy?
         */
        if (need_copy_out && !done_copy_out) {
                char *tmp;

                /* the allocation is done with b_state_lock dropped */
                spin_unlock(&jh_in->b_state_lock);
                tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
                if (!tmp) {
                        /*
                         * The lock is not held on this path and *bh_out has
                         * not been written; only new_bh has to be undone.
                         */
                        brelse(new_bh);
                        free_buffer_head(new_bh);
                        return -ENOMEM;
                }
                spin_lock(&jh_in->b_state_lock);
                if (jh_in->b_frozen_data) {
                        /*
                         * A frozen copy appeared while the lock was dropped:
                         * discard ours and re-evaluate from the top.
                         */
                        jbd2_free(tmp, bh_in->b_size);
                        goto repeat;
                }

                jh_in->b_frozen_data = tmp;
                mapped_data = kmap_atomic(new_page);
                memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
                kunmap_atomic(mapped_data);

                /* from here on the IO is pointed at the copy */
                new_page = virt_to_page(tmp);
                new_offset = offset_in_page(tmp);
                done_copy_out = 1;

                /*
                 * This isn't strictly necessary, as we're using frozen
                 * data for the escaping, but it keeps consistency with
                 * b_frozen_data usage.
                 */
                jh_in->b_frozen_triggers = jh_in->b_triggers;
        }

        /*
         * Did we need to do an escaping?  Now we've done all the
         * copying, we can finally do so.
         */
        if (do_escape) {
                mapped_data = kmap_atomic(new_page);
                *((unsigned int *)(mapped_data + new_offset)) = 0;
                kunmap_atomic(mapped_data);
        }

        /*
         * Describe the journal write: same size as the source buffer, but
         * aimed at block blocknr of the journal device.  b_private remembers
         * the original buffer_head.
         */
        set_bh_page(new_bh, new_page, new_offset);
        new_bh->b_size = bh_in->b_size;
        new_bh->b_bdev = journal->j_dev;
        new_bh->b_blocknr = blocknr;
        new_bh->b_private = bh_in;
        set_buffer_mapped(new_bh);
        set_buffer_dirty(new_bh);

        *bh_out = new_bh;

        /*
         * The to-be-written buffer needs to get moved to the io queue,
         * and the original buffer whose contents we are shadowing or
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
        spin_unlock(&journal->j_list_lock);
        set_buffer_shadow(bh_in);
        spin_unlock(&jh_in->b_state_lock);

        /* bit 0: data was escaped; bit 1: IO uses a frozen (copied-out) buffer */
        return do_escape | (done_copy_out << 1);
}

/*
 * Allocation code for the journal file.  Manage the space left in the
 * journal, so that we can begin checkpointing when appropriate.
 */

/*
 * Called with j_state_lock locked for writing.
 * Returns true if a transaction commit was started.
 */
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
        transaction_t *running = journal->j_running_transaction;

        /* This tid is already the one that has been requested. */
        if (journal->j_commit_request == target)
                return 0;

        /*
         * Only the running transaction (if there is one) can have a commit
         * requested for it; any other target has to be an old tid.
         */
        if (!running || running->t_tid != target) {
                /*
                 * A target newer than the current request that is not the
                 * running transaction should never be seen.  Record the
                 * evidence before kjournald goes into a loop and increments
                 * j_commit_sequence beyond all recognition.
                 */
                if (!tid_geq(journal->j_commit_request, target))
                        WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
                                  journal->j_commit_request,
                                  journal->j_commit_sequence,
                                  target, running ? running->t_tid : 0);
                return 0;
        }

        /*
         * Note the request and wake the commit thread; the commit itself is
         * _not_ performed here.
         */
        journal->j_commit_request = target;
        jbd_debug(1, "JBD2: requesting commit %u/%u\n",
                  journal->j_commit_request,
                  journal->j_commit_sequence);
        running->t_requested = jiffies;
        wake_up(&journal->j_wait_commit);

        return 1;
}

/*
 * Locked wrapper: run __jbd2_log_start_commit() with j_state_lock held for
 * writing and hand its result back unchanged.
 */
int jbd2_log_start_commit(journal_t *journal, tid_t tid)
{
        int started;

        write_lock(&journal->j_state_lock);
        started = __jbd2_log_start_commit(journal, tid);
        write_unlock(&journal->j_state_lock);

        return started;
}

/*
 * Force and wait any uncommitted transactions.  We can only force the running
 * transaction if we don't have an active handle, otherwise, we will deadlock.
 * Returns: <0 in case of error,
 *           0 if nothing to commit,
 *           1 if transaction was successfully committed.
 */
static int __jbd2_journal_force_commit(journal_t *journal)
{
        transaction_t *victim;
        bool kick = false;
        tid_t tid;
        int err;

        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction && !current->journal_info) {
                /*
                 * The running transaction is only picked when this task has
                 * no handle of its own (journal_info is unset).
                 */
                victim = journal->j_running_transaction;
                /* Its commit still has to be requested if the request lags. */
                kick = !tid_geq(journal->j_commit_request, victim->t_tid);
        } else {
                /* Otherwise fall back to the committing one; may be NULL. */
                victim = journal->j_committing_transaction;
        }

        if (!victim) {
                /* Nothing to commit */
                read_unlock(&journal->j_state_lock);
                return 0;
        }
        tid = victim->t_tid;
        read_unlock(&journal->j_state_lock);

        if (kick)
                jbd2_log_start_commit(journal, tid);

        /* A clean wait is reported as 1; a failure is passed through. */
        err = jbd2_log_wait_commit(journal, tid);
        return err ? err : 1;
}

/**
 * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
 * calling process is not within transaction.
 *
 * @journal: journal to force
 * Returns true if progress was made.
 *
 * This is used for forcing out undo-protected data which contains
 * bitmaps, when the fs is running out of space.
 */
int jbd2_journal_force_commit_nested(journal_t *journal)
{
        /* Collapse the helper's <0 / 0 / 1 result into a plain 0-or-1. */
        return __jbd2_journal_force_commit(journal) > 0;
}

/**
 * jbd2_journal_force_commit() - force any uncommitted transactions
 * @journal: journal to force
 *
 * Caller wants an unconditional commit. We can only force the running transaction
 * if we don't have an active handle, otherwise, we will deadlock.
 */
int jbd2_journal_force_commit(journal_t *journal)
{
        int status;

        J_ASSERT(!current->journal_info);
        status = __jbd2_journal_force_commit(journal);

        /* The helper's positive "committed" result is reported as 0 here. */
        return status > 0 ? 0 : status;
}

/*
 * Start a commit of the current running transaction (if any).  Returns true
 * if a transaction is going to be committed (or is currently already
 * committing), and fills its tid in at *ptid
 */
int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
{
        tid_t tid = 0;
        int found = 0;

        write_lock(&journal->j_state_lock);
        if (journal->j_running_transaction) {
                /*
                 * There is a running transaction: make sure its commit
                 * has been scheduled.
                 */
                tid = journal->j_running_transaction->t_tid;
                __jbd2_log_start_commit(journal, tid);
                found = 1;
        } else if (journal->j_committing_transaction) {
                /*
                 * A commit is already under way, so that is the
                 * transaction whose completion has to be waited for.
                 */
                tid = journal->j_committing_transaction->t_tid;
                found = 1;
        }

        /* ptid is optional and only written when a transaction was found. */
        if (found && ptid)
                *ptid = tid;
        write_unlock(&journal->j_state_lock);

        return found;
}

/*
 * Return 1 if a given transaction has not yet sent barrier request
 * connected with a transaction commit. If 0 is returned, transaction
 * may or may not have sent the barrier. Used to avoid sending barrier
 * twice in common cases.
 */
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
        transaction_t *committing;
        int will_send = 0;

        if (!(journal->j_flags & JBD2_BARRIER))
                return 0;

        read_lock(&journal->j_state_lock);

        /* Transaction already committed? */
        if (tid_geq(journal->j_commit_sequence, tid))
                goto unlock;

        committing = journal->j_committing_transaction;
        if (!committing || committing->t_tid != tid) {
                /* tid is not the transaction being committed right now. */
                will_send = 1;
        } else if (journal->j_fs_dev != journal->j_dev) {
                /*
                 * Journal and filesystem devices differ: the answer is 1
                 * only if a data flush is needed and the commit has not yet
                 * reached T_COMMIT_DFLUSH.
                 */
                will_send = committing->t_need_data_flush &&
                            committing->t_state < T_COMMIT_DFLUSH;
        } else {
                /* Same device: 1 until the commit reaches T_COMMIT_JFLUSH. */
                will_send = committing->t_state < T_COMMIT_JFLUSH;
        }
unlock:
        read_unlock(&journal->j_state_lock);
        return will_send;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);

/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
        read_lock(&journal->j_state_lock);
#ifdef CONFIG_PROVE_LOCKING
        /*
         * Some callers make sure transaction is already committing and in that
         * case we cannot block on open handles anymore. So don't warn in that
         * case.
         */
        if (tid_gt(tid, journal->j_commit_sequence)) {
                transaction_t *committing = journal->j_committing_transaction;

                if (!committing || committing->t_tid != tid) {
                        read_unlock(&journal->j_state_lock);
                        jbd2_might_wait_for_commit(journal);
                        read_lock(&journal->j_state_lock);
                }
        }
#endif
#ifdef CONFIG_JBD2_DEBUG
        /* Complain when waiting for a tid newer than the requested commit. */
        if (!tid_geq(journal->j_commit_request, tid))
                printk(KERN_ERR
                       "%s: error: j_commit_request=%u, tid=%u\n",
                       __func__, journal->j_commit_request, tid);
#endif
        for (;;) {
                if (!tid_gt(tid, journal->j_commit_sequence))
                        break;

                jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
                                  tid, journal->j_commit_sequence);

                /*
                 * Sleep without the lock: wake waiters on j_wait_commit, then
                 * wait on j_wait_done_commit until the sequence reaches tid.
                 */
                read_unlock(&journal->j_state_lock);
                wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit,
                                !tid_gt(tid, journal->j_commit_sequence));
                read_lock(&journal->j_state_lock);
        }
        read_unlock(&journal->j_state_lock);

        /* An aborted journal turns the result into -EIO. */
        return unlikely(is_journal_aborted(journal)) ? -EIO : 0;
}

/*
 * Start a fast commit. If there's an ongoing fast or full commit wait for
 * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
 * if a fast commit is not needed, either because there's already a commit
 * going on or this tid has already been committed. Returns -EINVAL if no jbd2
 * commit has yet been performed.
 */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
        if (unlikely(is_journal_aborted(journal)))
                return -EIO;

        /*
         * A fast commit is refused until the journal statistics show that
         * at least one full commit has been processed.
         */
        if (!journal->j_stats.ts_tid)
                return -EINVAL;

        write_lock(&journal->j_state_lock);

        /* tid is not newer than the last committed sequence number. */
        if (tid_geq(journal->j_commit_sequence, tid)) {
                write_unlock(&journal->j_state_lock);
                return -EALREADY;
        }

        if (journal->j_flags &
            (JBD2_FULL_COMMIT_ONGOING | JBD2_FAST_COMMIT_ONGOING)) {
                DEFINE_WAIT(fc_wait);

                /*
                 * A full or fast commit is in flight: sleep once on
                 * j_fc_wait with the lock dropped, then report -EALREADY
                 * instead of retrying here.
                 */
                prepare_to_wait(&journal->j_fc_wait, &fc_wait,
                                TASK_UNINTERRUPTIBLE);
                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_fc_wait, &fc_wait);
                return -EALREADY;
        }

        /* Claim the fast commit, then lock updates with the lock released. */
        journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
        write_unlock(&journal->j_state_lock);
        jbd2_journal_lock_updates(journal);

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_begin_commit);

/*
 * Stop a fast commit. If fallback is set, this function starts commit of
 * TID tid before any other fast commit can start.
 */
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
        jbd2_journal_unlock_updates(journal);

        /* The cleanup callback is optional. */
        if (journal->j_fc_cleanup_callback)
                journal->j_fc_cleanup_callback(journal, 0);

        write_lock(&journal->j_state_lock);
        journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
        /*
         * On fallback the full-commit flag is raised inside the same
         * critical section that clears the fast-commit flag.
         */
        if (fallback)
                journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
        write_unlock(&journal->j_state_lock);

        wake_up(&journal->j_fc_wait);

        if (!fallback)
                return 0;

        return jbd2_complete_transaction(journal, tid);
}

/*
 * End a fast commit without falling back to a full commit.
 */
int jbd2_fc_end_commit(journal_t *journal)
{
        const bool fallback = false;

        /* With no fallback the tid argument is not used, so 0 is passed. */
        return __jbd2_fc_end_commit(journal, 0, fallback);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);

/*
 * End a fast commit and fall back to a full commit of the transaction that
 * is running at the time of the call (tid 0 is used when there is none).
 */
int jbd2_fc_end_commit_fallback(journal_t *journal)
{
        transaction_t *running;
        tid_t tid = 0;

        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        if (running)
                tid = running->t_tid;
        read_unlock(&journal->j_state_lock);

        return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);

/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
        transaction_t *running, *committing;
        int pending;

        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        committing = journal->j_committing_transaction;

        /* tid is still pending if it names either in-flight transaction. */
        pending = (running && running->t_tid == tid) ||
                  (committing && committing->t_tid == tid);
        read_unlock(&journal->j_state_lock);

        return !pending;
}
EXPORT_SYMBOL(jbd2_transaction_committed);

/*
 * When this function returns the transaction corresponding to tid
 * will be completed.  If the transaction is currently running, start
 * committing that transaction before waiting for it to complete.  If
 * the transaction id is stale, it is by definition already completed,
 * so just return SUCCESS.
 */
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
        bool is_running, is_committing, must_request;

        read_lock(&journal->j_state_lock);
        is_running = journal->j_running_transaction &&
                     journal->j_running_transaction->t_tid == tid;
        is_committing = journal->j_committing_transaction &&
                        journal->j_committing_transaction->t_tid == tid;
        /* A running transaction needs a request unless one is recorded. */
        must_request = is_running && journal->j_commit_request != tid;
        read_unlock(&journal->j_state_lock);

        /* Neither running nor committing: nothing to wait for. */
        if (!is_running && !is_committing)
                return 0;

        /* transaction not yet started, so request it */
        if (must_request)
                jbd2_log_start_commit(journal, tid);

        return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);

/*
 * Log buffer allocation routines:
 */

/*
 * Claim the log block at j_head, advance the head (wrapping from j_last back
 * to j_first) and map the claimed block through jbd2_journal_bmap() into
 * *retp.  Returns the result of that mapping call.
 */
int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
{
        unsigned long claimed;

        write_lock(&journal->j_state_lock);
        J_ASSERT(journal->j_free > 1);

        claimed = journal->j_head++;
        journal->j_free--;

        /* Reaching j_last wraps the head around to j_first. */
        if (journal->j_head == journal->j_last)
                journal->j_head = journal->j_first;
        write_unlock(&journal->j_state_lock);

        /* The mapping itself is done with the lock released. */
        return jbd2_journal_bmap(journal, claimed, retp);
}

/*
 * Map one fast commit buffer for use by the file system.
 *
 * @journal: journal whose fast commit area the block is taken from
 * @bh_out:  receives the buffer_head on success; set to NULL on failure
 *
 * Picks the next unused block of the fast commit area
 * (j_fc_first + j_fc_off), maps it with jbd2_journal_bmap() and gets a
 * buffer_head for it with __getblk().  The buffer is stored in j_fc_wbuf[]
 * and j_fc_off is advanced only once all of that has succeeded, so a failed
 * call never leaves a counted slot without a buffer behind it:
 * jbd2_fc_release_bufs() stops at the first NULL slot and
 * jbd2_fc_wait_bufs() dereferences every slot below j_fc_off.
 *
 * No lock is taken around j_fc_off here.
 * NOTE(review): this assumes callers are serialized by the fast commit
 * itself - confirm against the callers.
 *
 * Returns 0 on success, -EINVAL when the fast commit area is used up, the
 * error from jbd2_journal_bmap(), or -ENOMEM when __getblk() returns NULL.
 */
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
{
        unsigned long long pblock;
        unsigned long blocknr;
        struct buffer_head *bh;
        int fc_off;
        int ret;

        *bh_out = NULL;

        /* Fast commit area exhausted? */
        if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last)
                return -EINVAL;

        fc_off = journal->j_fc_off;
        blocknr = journal->j_fc_first + fc_off;

        ret = jbd2_journal_bmap(journal, blocknr, &pblock);
        if (ret)
                return ret;

        bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
        if (!bh)
                return -ENOMEM;

        /* Publish the buffer first, then count the slot as used. */
        journal->j_fc_wbuf[fc_off] = bh;
        journal->j_fc_off++;

        *bh_out = bh;

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_get_buf);

/*
 * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
 * for completion.
 */
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
{
        int top = journal->j_fc_off;
        int lowest = top - num_blks;
        int idx;

        /*
         * The most recently handed-out buffer is waited on first, to
         * minimize the chances of being woken up before all IOs have
         * completed.
         */
        for (idx = top - 1; idx >= lowest; idx--) {
                struct buffer_head *bh = journal->j_fc_wbuf[idx];

                wait_on_buffer(bh);
                if (unlikely(!buffer_uptodate(bh))) {
                        /*
                         * This buffer and the ones below it keep their
                         * references; j_fc_off is pulled back so that
                         * jbd2_fc_release_bufs can release them.
                         */
                        journal->j_fc_off = idx + 1;
                        return -EIO;
                }

                /* The state was checked above, before the reference goes. */
                put_bh(bh);
                journal->j_fc_wbuf[idx] = NULL;
        }

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_wait_bufs);

/*
 * Release the references still held on fast commit buffers that were
 * allocated by jbd2_fc_get_buf.
 */
int jbd2_fc_release_bufs(journal_t *journal)
{
        int idx = journal->j_fc_off;

        /*
         * Walk down from the slot just below j_fc_off, dropping one
         * reference per recorded buffer and clearing the slot.  The walk
         * ends at the first empty slot or after slot 0.
         */
        while (--idx >= 0) {
                struct buffer_head *bh = journal->j_fc_wbuf[idx];

                if (!bh)
                        break;
                put_bh(bh);
                journal->j_fc_wbuf[idx] = NULL;
        }

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_release_bufs);

/*
 * Conversion of logical to physical block numbers for the journal
 *
 * On external journals the journal blocks are identity-mapped, so
 * this is a no-op.  If needed, we can use j_blk_offset - everything is
 * ready.
 */
int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
                 unsigned long long *retp)
{
        sector_t block = blocknr;

        /* No journal inode: the block number is passed through unchanged. */
        if (!journal->j_inode) {
                *retp = blocknr; /* +journal->j_blk_offset */
                return 0;
        }

        /*
         * A failing bmap() and a resulting block of 0 are both treated as
         * "not found"; the journal is aborted with -EIO and *retp is left
         * untouched.
         */
        if (bmap(journal->j_inode, &block) || !block) {
                printk(KERN_ALERT "%s: journal block not found "
                                "at offset %lu on %s\n",
                       __func__, blocknr, journal->j_devname);
                jbd2_journal_abort(journal, -EIO);
                return -EIO;
        }

        *retp = block;
        return 0;
}

/*
 * We play buffer_head aliasing tricks to write data/metadata blocks to
 * the journal without copying their contents, but for journal
 * descriptor blocks we do need to generate bona fide buffers.
 *
 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
 * But we don't bother doing that, so there will be coherency problems with
 * mmaps of blockdevs which hold live JBD-controlled filesystems.
 */
struct buffer_head *
jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
        journal_t *journal = transaction->t_journal;
        unsigned long long blocknr;
        journal_header_t *header;
        struct buffer_head *bh;

        /* Either failure below is reported to the caller as NULL. */
        if (jbd2_journal_next_log_block(journal, &blocknr))
                return NULL;

        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
        if (!bh)
                return NULL;

        /* One outstanding credit of the transaction goes to this block. */
        atomic_dec(&transaction->t_outstanding_credits);

        /* Zero the block and fill in the journal header under the lock. */
        lock_buffer(bh);
        header = (journal_header_t *)bh->b_data;
        memset(header, 0, journal->j_blocksize);
        header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        header->h_blocktype = cpu_to_be32(type);
        header->h_sequence = cpu_to_be32(transaction->t_tid);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        BUFFER_TRACE(bh, "return this buffer");
        return bh;
}

/*
 * Store the checksum of a descriptor block in the block tail that occupies
 * the last bytes of the block.  Does nothing unless the journal has v2 or v3
 * checksums.
 */
void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
{
        struct jbd2_journal_block_tail *tail;
        __u32 csum;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

        tail = (struct jbd2_journal_block_tail *)
                (bh->b_data + j->j_blocksize - sizeof(*tail));

        /* The checksum covers the whole block with its own field zeroed. */
        tail->t_checksum = 0;
        csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
        tail->t_checksum = cpu_to_be32(csum);
}

/*
 * Return tid of the oldest transaction in the journal and block in the journal
 * where the transaction starts.
 *
 * If the journal is now empty, return the ID of the next transaction we
 * will write and the block where that transaction will start.
 *
 * The return value is 0 if journal tail cannot be pushed any further, 1 if
 * it can.
 */
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
                              unsigned long *block)
{
        transaction_t *oldest;
        int can_push;

        read_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);

        /* Prefer the checkpoint list, then the committing transaction. */
        oldest = journal->j_checkpoint_transactions;
        if (!oldest)
                oldest = journal->j_committing_transaction;

        if (oldest) {
                *tid = oldest->t_tid;
                *block = oldest->t_log_start;
        } else {
                /*
                 * Neither exists: the tail is at j_head, owned either by
                 * the running transaction or by the next tid to be used.
                 */
                oldest = journal->j_running_transaction;
                *tid = oldest ? oldest->t_tid :
                                journal->j_transaction_sequence;
                *block = journal->j_head;
        }

        can_push = tid_gt(*tid, journal->j_tail_sequence);
        spin_unlock(&journal->j_list_lock);
        read_unlock(&journal->j_state_lock);

        return can_push;
}

/*
 * Update information in journal structure and in on disk journal superblock
 * about log tail. This function does not check whether information passed in
 * really pushes log tail further. It's the responsibility of the caller to make
 * sure provided log tail information is valid (e.g. by holding
 * j_checkpoint_mutex all the time between computing log tail and calling this
 * function as is the case with jbd2_cleanup_journal_tail()).
 *
 * Requires j_checkpoint_mutex
 */
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
        unsigned long freed;
        int ret;

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

        /*
         * The superblock update must not linger in the drive's caches: as
         * soon as j_tail moves, the next transaction can start reusing
         * journal space, and if the sb update were lost in a power failure
         * we'd replay an old transaction with possibly newly overwritten
         * data.
         */
        ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
                                              REQ_SYNC | REQ_FUA);
        if (ret)
                return ret;

        write_lock(&journal->j_state_lock);

        /* Distance from the old tail to the new one, allowing for a wrap. */
        if (block >= journal->j_tail)
                freed = block - journal->j_tail;
        else
                freed = block - journal->j_tail +
                        (journal->j_last - journal->j_first);

        trace_jbd2_update_log_tail(journal, tid, block, freed);
        jbd_debug(1,
                  "Cleaning journal tail from %u to %u (offset %lu), "
                  "freeing %lu\n",
                  journal->j_tail_sequence, tid, block, freed);

        journal->j_free += freed;
        journal->j_tail_sequence = tid;
        journal->j_tail = block;
        write_unlock(&journal->j_state_lock);

        return 0;
}

/*
 * This is a variation of __jbd2_update_log_tail which checks for validity of
 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
 * with other threads updating log tail.
 */
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
        mutex_lock_io(&journal->j_checkpoint_mutex);

        /* Only a tid newer than the current tail sequence moves the tail. */
        if (!tid_gt(tid, journal->j_tail_sequence))
                goto unlock;

        /* The helper's return value is not propagated from here. */
        __jbd2_update_log_tail(journal, tid, block);
unlock:
        mutex_unlock(&journal->j_checkpoint_mutex);
}

/*
 * Per-open state of a journal's "info" proc file: allocated in
 * jbd2_seq_info_open() and freed in jbd2_seq_info_release().
 */
struct jbd2_stats_proc_session {
        /* journal the opened proc entry was created for */
        journal_t *journal;
        /* private copy of journal->j_stats, taken at open time */
        struct transaction_stats_s *stats;
        /* start and max are not referenced in the visible part of this file */
        int start;
        int max;
};

/*
 * seq_file start hook: the file has a single record, available only at
 * position 0.
 */
static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;

        return SEQ_START_TOKEN;
}

/*
 * seq_file next hook: step the position and report that no further record
 * follows.
 */
static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;

        return NULL;
}

static int jbd2_seq_info_show(struct seq_file *seq, void *v)
{
        struct jbd2_stats_proc_session *s = seq->private;

        if (v != SEQ_START_TOKEN)
                return 0;
        seq_printf(seq, "%lu transactions (%lu requested), "
                   "each up to %u blocks\n",
                   s->stats->ts_tid, s->stats->ts_requested,
                   s->journal->j_max_transaction_buffers);
        if (s->stats->ts_tid == 0)
                return 0;
        seq_printf(seq, "average: \n  %ums waiting for transaction\n",
            jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
        seq_printf(seq, "  %ums request delay\n",
            (s->stats->ts_requested == 0) ? 0 :
            jiffies_to_msecs(s->stats->run.rs_request_delay /
                             s->stats->ts_requested));
        seq_printf(seq, "  %ums running transaction\n",
            jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
        seq_printf(seq, "  %ums transaction was being locked\n",
            jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
        seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
            jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
        seq_printf(seq, "  %ums logging transaction\n",
            jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
        seq_printf(seq, "  %lluus average transaction commit time\n",
                   div_u64(s->journal->j_average_commit_time, 1000));
        seq_printf(seq, "  %lu handles per transaction\n",
            s->stats->run.rs_handle_count / s->stats->ts_tid);
        seq_printf(seq, "  %lu blocks per transaction\n",
            s->stats->run.rs_blocks / s->stats->ts_tid);
        seq_printf(seq, "  %lu logged blocks per transaction\n",
            s->stats->run.rs_blocks_logged / s->stats->ts_tid);
        return 0;
}

/* seq_file stop hook: intentionally empty. */
static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
        /* jbd2_seq_info_start() only returns a token, so nothing to undo */
}

/* seq_file callbacks behind the per-journal "info" proc file. */
static const struct seq_operations jbd2_seq_info_ops = {
        .start  = jbd2_seq_info_start,
        .show   = jbd2_seq_info_show,
        .next   = jbd2_seq_info_next,
        .stop   = jbd2_seq_info_stop,
};

/*
 * Open hook of the "info" proc file: allocate a session, copy the journal's
 * statistics into it under j_history_lock and attach it to the seq_file.
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * seq_open().
 */
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
        journal_t *journal = PDE_DATA(inode);
        struct jbd2_stats_proc_session *sess;
        struct seq_file *m;
        int rc = -ENOMEM;

        sess = kmalloc(sizeof(*sess), GFP_KERNEL);
        if (!sess)
                return rc;

        sess->stats = kmalloc(sizeof(*sess->stats), GFP_KERNEL);
        if (!sess->stats)
                goto free_session;

        /* The copy is taken in one piece under the history lock. */
        spin_lock(&journal->j_history_lock);
        memcpy(sess->stats, &journal->j_stats, sizeof(*sess->stats));
        sess->journal = journal;
        spin_unlock(&journal->j_history_lock);

        rc = seq_open(file, &jbd2_seq_info_ops);
        if (rc)
                goto free_stats;

        /* After seq_open() the seq_file is found in file->private_data. */
        m = file->private_data;
        m->private = sess;
        return 0;

free_stats:
        kfree(sess->stats);
free_session:
        kfree(sess);
        return rc;
}

/*
 * Release hook of the "info" proc file: free the statistics copy and the
 * session set up by jbd2_seq_info_open(), then release the seq_file.
 */
static int jbd2_seq_info_release(struct inode *inode, struct file *file)
{
        struct seq_file *m = file->private_data;
        struct jbd2_stats_proc_session *sess = m->private;

        kfree(sess->stats);
        kfree(sess);

        return seq_release(inode, file);
}

/*
 * proc file operations for the per-journal "info" entry: open/release are
 * the jbd2 session handlers, read and lseek are the generic seq_file ones.
 */
static const struct proc_ops jbd2_info_proc_ops = {
        .proc_open        = jbd2_seq_info_open,
        .proc_read        = seq_read,
        .proc_lseek        = seq_lseek,
        .proc_release        = jbd2_seq_info_release,
};

/* Parent proc directory under which each journal's stats directory is created. */
static struct proc_dir_entry *proc_jbd2_stats;

/*
 * Create the proc directory for this journal (named after j_devname, under
 * proc_jbd2_stats) and, if that worked, the read-only "info" file inside it.
 * The directory entry, or NULL on failure, is remembered in j_proc_entry.
 */
static void jbd2_stats_proc_init(journal_t *journal)
{
        struct proc_dir_entry *dir;

        dir = proc_mkdir(journal->j_devname, proc_jbd2_stats);
        journal->j_proc_entry = dir;
        if (!dir)
                return;

        proc_create_data("info", S_IRUGO, dir, &jbd2_info_proc_ops, journal);
}

/*
 * Remove the proc entries set up by jbd2_stats_proc_init(): the "info" file
 * first, then the per-journal directory that contained it.
 */
static void jbd2_stats_proc_exit(journal_t *journal)
{
        const char *dirname = journal->j_devname;

        remove_proc_entry("info", journal->j_proc_entry);
        remove_proc_entry(dirname, proc_jbd2_stats);
}

/*
 * Minimum size of a descriptor tag, in bytes.
 *
 * A tag that carries a 32-bit block number leaves the last four bytes of
 * journal_block_tag_t unused, so they are not counted here.
 */
static int jbd2_min_tag_size(void)
{
        const size_t unused_tail = 4;

        return sizeof(journal_block_tag_t) - unused_tail;
}

/*
 * Management for journal control blocks: functions to create and
 * destroy journal_t structures, and to initialise and read existing
 * journal blocks from disk.  */

/* First: create and set up a journal_t object in memory.  We initialise
 * very few fields yet: that has to wait until we have created the
 * journal structures from scratch, or loaded them from disk. */

/*
 * Allocate a zeroed journal_t and fill in the state shared by device-backed
 * and inode-backed journals.
 *
 * @bdev: block device holding the journal (stored in j_dev)
 * @fs_dev: block device of the journalled filesystem (stored in j_fs_dev)
 * @start: block number on @bdev of the journal superblock (stored in
 *         j_blk_offset and used to look up the superblock buffer)
 * @len: journal length in blocks (stored in j_total_len)
 * @blocksize: journal block size in bytes (stored in j_blocksize)
 *
 * On success the returned journal has JBD2_ABORT set in j_flags, a revoke
 * table, a j_wbuf array sized for one full descriptor block, and the
 * superblock buffer attached (j_sb_buffer / j_superblock).  The superblock
 * contents are not read or validated here.
 *
 * Returns NULL if any allocation or the buffer lookup fails; everything
 * allocated up to that point is released.
 */
static journal_t *journal_init_common(struct block_device *bdev,
                        struct block_device *fs_dev,
                        unsigned long long start, int len, int blocksize)
{
        static struct lock_class_key jbd2_trans_commit_key;
        journal_t *journal;
        int err;
        struct buffer_head *bh;
        int n;

        journal = kzalloc(sizeof(*journal), GFP_KERNEL);
        if (!journal)
                return NULL;

        /* Wait queues, mutexes and locks embedded in the journal. */
        init_waitqueue_head(&journal->j_wait_transaction_locked);
        init_waitqueue_head(&journal->j_wait_done_commit);
        init_waitqueue_head(&journal->j_wait_commit);
        init_waitqueue_head(&journal->j_wait_updates);
        init_waitqueue_head(&journal->j_wait_reserved);
        init_waitqueue_head(&journal->j_fc_wait);
        mutex_init(&journal->j_abort_mutex);
        mutex_init(&journal->j_barrier);
        mutex_init(&journal->j_checkpoint_mutex);
        spin_lock_init(&journal->j_revoke_lock);
        spin_lock_init(&journal->j_list_lock);
        rwlock_init(&journal->j_state_lock);

        /* Default commit timing parameters. */
        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
        journal->j_min_batch_time = 0;
        journal->j_max_batch_time = 15000; /* 15ms */
        atomic_set(&journal->j_reserved_credits, 0);

        /* The journal is marked for error until we succeed with recovery! */
        journal->j_flags = JBD2_ABORT;

        /* Set up a default-sized revoke table for the new mount. */
        err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
        if (err)
                goto err_cleanup;

        spin_lock_init(&journal->j_history_lock);

        lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
                         &jbd2_trans_commit_key, 0);

        /* journal descriptor can store up to n blocks -bzzz */
        journal->j_blocksize = blocksize;
        journal->j_dev = bdev;
        journal->j_fs_dev = fs_dev;
        journal->j_blk_offset = start;
        journal->j_total_len = len;
        /* We need enough buffers to write out full descriptor block. */
        n = journal->j_blocksize / jbd2_min_tag_size();
        journal->j_wbufsize = n;
        journal->j_fc_wbuf = NULL;
        journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
                                        GFP_KERNEL);
        if (!journal->j_wbuf)
                goto err_cleanup;

        /* Grab (but do not read) the buffer that backs the journal superblock. */
        bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
        if (!bh) {
                pr_err("%s: Cannot get buffer for journal superblock\n",
                        __func__);
                goto err_cleanup;
        }
        journal->j_sb_buffer = bh;
        journal->j_superblock = (journal_superblock_t *)bh->b_data;

        return journal;

err_cleanup:
        /*
         * Shared unwind for every failure above.  j_wbuf is still NULL (from
         * kzalloc) if we got here before it was allocated.
         */
        kfree(journal->j_wbuf);
        jbd2_journal_destroy_revoke(journal);
        kfree(journal);
        return NULL;
}

/* jbd2_journal_init_dev and jbd2_journal_init_inode:
 *
 * Create a journal structure assigned some fixed set of disk blocks to
 * the journal.  We don't actually touch those disk blocks yet, but we
 * need to set up all of the mapping information to tell the journaling
 * system where the journal blocks are.
 *
 */

/**
 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
 *  @bdev: Block device on which to create the journal
 *  @fs_dev: Device which holds the journalled filesystem for this journal.
 *  @start: Block number of the start of the journal.
 *  @len:  Length of the journal in blocks.
 *  @blocksize: Block size of the journalling device
 *
 *  Returns: a newly created journal_t *, or NULL if the common
 *  initialisation fails.
 *
 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
 *  range of blocks on an arbitrary block device.  The journal's device
 *  name (with any '/' turned into '!') is recorded in j_devname and the
 *  per-journal proc statistics entries are registered.
 *
 */
journal_t *jbd2_journal_init_dev(struct block_device *bdev,
                        struct block_device *fs_dev,
                        unsigned long long start, int len, int blocksize)
{
        journal_t *journal = journal_init_common(bdev, fs_dev, start, len,
                                                 blocksize);

        if (journal) {
                /* j_devname doubles as the name of the proc directory. */
                bdevname(journal->j_dev, journal->j_devname);
                strreplace(journal->j_devname, '/', '!');
                jbd2_stats_proc_init(journal);
        }

        return journal;
}

/**
 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
 *  @inode: An inode to create the journal in
 *
 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
 * the journal.  The inode must exist already, must support bmap() and
 * must have all data blocks preallocated.
 *
 * The journal length is the inode size expressed in filesystem blocks, and
 * the journal name in j_devname is the device name followed by
 * "-<inode number>".
 *
 * Returns: the new journal, or NULL if the inode's first block cannot be
 * mapped or the common initialisation fails.
 */
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
        journal_t *journal;
        sector_t blocknr = 0;
        char *suffix;
        int err;

        /* Logical block 0 of the inode holds the journal superblock. */
        err = bmap(inode, &blocknr);
        if (err || !blocknr) {
                pr_err("%s: Cannot locate journal superblock\n",
                        __func__);
                return NULL;
        }

        jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
                  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
                  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);

        /* Journal and filesystem live on the same block device here. */
        journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
                        blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
                        inode->i_sb->s_blocksize);
        if (!journal)
                return NULL;

        journal->j_inode = inode;

        /*
         * Build "<devname>-<ino>".  The pointer handed back by strreplace()
         * is used as the place to append the suffix.
         */
        bdevname(journal->j_dev, journal->j_devname);
        suffix = strreplace(journal->j_devname, '/', '!');
        sprintf(suffix, "-%lu", journal->j_inode->i_ino);

        jbd2_stats_proc_init(journal);

        return journal;
}

/*
 * Called when journal init or create aborts: drop our reference to the
 * superblock buffer and clear j_sb_buffer, so that a later journal destroy
 * does not write a bogus superblock back to disk.
 */
static void journal_fail_superblock(journal_t *journal)
{
        brelse(journal->j_sb_buffer);
        journal->j_sb_buffer = NULL;
}

/*
 * Maximum number of buffers a single transaction may use: a quarter of the
 * journal length (j_total_len) once the fast-commit buffer count
 * (j_fc_wbufsize) has been subtracted.
 */
static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
        return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}

/*
 * Given a journal_t structure, initialise the various fields for
 * startup of a new journaling session.  We use this both when creating
 * a journal, and after recovering an old journal to reset it for
 * subsequent use.
 *
 * Returns -EINVAL (after releasing the superblock buffer) if the on-disk
 * s_first/s_maxlen describe a journal shorter than JBD2_MIN_JOURNAL_BLOCKS;
 * otherwise returns the result of jbd2_journal_start_thread().
 */

static int journal_reset(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long first, last;

        first = be32_to_cpu(sb->s_first);
        last = be32_to_cpu(sb->s_maxlen);
        if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
                printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
                       first, last);
                journal_fail_superblock(journal);
                return -EINVAL;
        }

        journal->j_first = first;
        journal->j_last = last;

        /* Start with an empty log: head and tail both at the first block. */
        journal->j_head = journal->j_first;
        journal->j_tail = journal->j_first;
        journal->j_free = journal->j_last - journal->j_first;

        /* The commit sequence trails the next transaction ID by one. */
        journal->j_tail_sequence = journal->j_transaction_sequence;
        journal->j_commit_sequence = journal->j_transaction_sequence - 1;
        journal->j_commit_request = journal->j_commit_sequence;

        journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);

        /*
         * Now that journal recovery is done, turn fast commits off here. This
         * way, if fast commit was enabled before the crash but if now FS has
         * disabled it, we don't enable fast commits.
         */
        jbd2_clear_feature_fast_commit(journal);

        /*
         * As a special case, if the on-disk copy is already marked as needing
         * no recovery (s_start == 0), then we can safely defer the superblock
         * update until the next commit by setting JBD2_FLUSHED.  This avoids
         * attempting a write to a potential-readonly device.
         */
        if (sb->s_start == 0) {
                jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
                        "(start %ld, seq %u, errno %d)\n",
                        journal->j_tail, journal->j_tail_sequence,
                        journal->j_errno);
                journal->j_flags |= JBD2_FLUSHED;
        } else {
                /* Lock here to make assertions happy... */
                mutex_lock_io(&journal->j_checkpoint_mutex);
                /*
                 * Update log tail information. We use REQ_FUA since new
                 * transaction will start reusing journal space and so we
                 * must make sure information about current log tail is on
                 * disk before that.
                 */
                /* NOTE(review): the return value of this update is not checked. */
                jbd2_journal_update_sb_log_tail(journal,
                                                journal->j_tail_sequence,
                                                journal->j_tail,
                                                REQ_SYNC | REQ_FUA);
                mutex_unlock(&journal->j_checkpoint_mutex);
        }
        return jbd2_journal_start_thread(journal);
}

/*
 * Write the in-memory journal superblock to disk and wait for the I/O.
 *
 * @journal: journal whose superblock buffer (j_sb_buffer) is written
 * @write_flags: request flags for the write; REQ_FUA and REQ_PREFLUSH are
 *               stripped when the journal does not have JBD2_BARRIER set
 *
 * This function expects that the caller will have locked the journal
 * buffer head, and will return with it unlocked
 *
 * Returns 0 on success, -EIO if the buffer is no longer mapped or the write
 * reported an I/O error, or the error from submit_bh().  On a failed write
 * the journal is aborted with that error unless it is already aborted.
 */
static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
        struct buffer_head *bh = journal->j_sb_buffer;
        journal_superblock_t *sb = journal->j_superblock;
        int ret;

        /* Buffer got discarded which means block device got invalidated */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        if (!(journal->j_flags & JBD2_BARRIER))
                write_flags &= ~(REQ_FUA | REQ_PREFLUSH);

        trace_jbd2_write_superblock(journal, write_flags);

        if (buffer_write_io_error(bh)) {
                /*
                 * Oh, dear.  A previous attempt to write the journal
                 * superblock failed.  This could happen because the
                 * USB device was yanked out.  Or it could happen to
                 * be a transient write error and maybe the block will
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
                printk(KERN_ERR "JBD2: previous I/O error detected "
                       "for journal superblock update for %s.\n",
                       journal->j_devname);
                clear_buffer_write_io_error(bh);
                set_buffer_uptodate(bh);
        }
        /* Refresh the superblock checksum before it goes to disk. */
        if (jbd2_journal_has_csum_v2or3(journal))
                sb->s_checksum = jbd2_superblock_csum(journal, sb);
        /*
         * Take a reference for the I/O in flight; presumably the
         * end_buffer_write_sync completion handler drops it and unlocks the
         * buffer -- confirm against its definition.
         */
        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
        wait_on_buffer(bh);
        if (buffer_write_io_error(bh)) {
                /* Clear the error state on the buffer and report -EIO instead. */
                clear_buffer_write_io_error(bh);
                set_buffer_uptodate(bh);
                ret = -EIO;
        }
        if (ret) {
                printk(KERN_ERR "JBD2: Error %d detected when updating "
                       "journal superblock for %s.\n", ret,
                       journal->j_devname);
                if (!is_journal_aborted(journal))
                        jbd2_journal_abort(journal, ret);
        }

        return ret;
}

/**
 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
 * @journal: The journal to update.
 * @tail_tid: TID of the new transaction at the tail of the log
 * @tail_block: The first block of the transaction at the tail of the log
 * @write_op: With which operation should we write the journal sb
 *
 * Store the new log tail in the journal superblock and write it to disk,
 * waiting for the IO to complete.  The caller must hold
 * j_checkpoint_mutex.  Once the write has succeeded, JBD2_FLUSHED is
 * cleared from the journal flags.
 *
 * Return: 0 on success, -EIO if the journal is aborted, or the error from
 * writing the superblock.
 */
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
                                     unsigned long tail_block, int write_op)
{
        journal_superblock_t *sb = journal->j_superblock;
        int err;

        if (is_journal_aborted(journal))
                return -EIO;

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
        jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
                  tail_block, tail_tid);

        /* jbd2_write_superblock() is entered with the sb buffer locked. */
        lock_buffer(journal->j_sb_buffer);
        sb->s_sequence = cpu_to_be32(tail_tid);
        sb->s_start    = cpu_to_be32(tail_block);

        err = jbd2_write_superblock(journal, write_op);
        if (err)
                return err;

        /* Log is no longer empty */
        write_lock(&journal->j_state_lock);
        journal->j_flags &= ~JBD2_FLUSHED;
        write_unlock(&journal->j_state_lock);

        return 0;
}

/**
 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
 * @journal: The journal to update.
 * @write_op: With which operation should we write the journal sb
 *
 * Update a journal's dynamic superblock fields to show that journal is empty.
 * Write updated superblock to disk waiting for IO to complete.
 *
 * The caller must hold j_checkpoint_mutex.  Nothing is written if the
 * superblock already records an empty log (s_start == 0).  The fast commit
 * feature flag is cleared for the duration of the write and restored
 * afterwards.  JBD2_FLUSHED is set on the journal once the write has been
 * attempted; the result of the write itself is not checked here.
 */
static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
        journal_superblock_t *sb = journal->j_superblock;
        bool had_fast_commit = false;

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
        lock_buffer(journal->j_sb_buffer);
        if (sb->s_start == 0) {                /* Is it already empty? */
                unlock_buffer(journal->j_sb_buffer);
                return;
        }

        jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
                  journal->j_tail_sequence);

        sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
        sb->s_start    = cpu_to_be32(0);
        if (jbd2_has_feature_fast_commit(journal)) {
                /*
                 * When journal is clean, no need to commit fast commit flag and
                 * make file system incompatible with older kernels.
                 */
                jbd2_clear_feature_fast_commit(journal);
                had_fast_commit = true;
        }

        /* Entered with the sb buffer locked; returns with it unlocked. */
        jbd2_write_superblock(journal, write_op);

        /* Restore the flag that was cleared for the write above. */
        if (had_fast_commit)
                jbd2_set_feature_fast_commit(journal);

        /* Log is now empty */
        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_FLUSHED;
        write_unlock(&journal->j_state_lock);
}


/**
 * jbd2_journal_update_sb_errno() - Update error in the journal.
 * @journal: The journal to update.
 *
 * Record the journal's current errno in the on-disk superblock and write
 * the superblock out, waiting for IO to complete.  An errno of -ESHUTDOWN
 * is stored as 0.
 */
void jbd2_journal_update_sb_errno(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        int err;

        /* jbd2_write_superblock() is entered with the sb buffer locked. */
        lock_buffer(journal->j_sb_buffer);

        err = journal->j_errno;
        err = (err == -ESHUTDOWN) ? 0 : err;

        jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", err);
        sb->s_errno    = cpu_to_be32(err);

        jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);

/*
 * Number of revoke records that fit in one journal block: the block size
 * less the revoke header (and less the block tail when v2/v3 checksums are
 * in use), divided by the record size -- 8 bytes with the 64bit feature,
 * 4 bytes otherwise.
 */
static int journal_revoke_records_per_block(journal_t *journal)
{
        int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
        int record_size = jbd2_has_feature_64bit(journal) ? 8 : 4;

        if (jbd2_journal_has_csum_v2or3(journal))
                space -= sizeof(struct jbd2_journal_block_tail);

        return space / record_size;
}

/*
 * Read the superblock for a given journal, performing initial
 * validation of the format.
 *
 * The buffer is read from disk only if it is not already uptodate, and the
 * validation is skipped entirely (returning 0) if the buffer is already
 * marked verified.  On success the format version, possibly reduced
 * j_total_len, checksum driver/seed and revoke-records-per-block are set on
 * the journal and the buffer is marked verified.
 *
 * Returns 0 on success, -EIO if the superblock cannot be read, -EINVAL for
 * a bad magic/blocksize/format/length/start block or an invalid checksum
 * feature combination, the crypto allocation error if crc32c cannot be
 * loaded, or -EFSBADCRC on a superblock checksum mismatch.  Every failure
 * path releases the superblock buffer via journal_fail_superblock().
 */
static int journal_get_superblock(journal_t *journal)
{
        struct buffer_head *bh;
        journal_superblock_t *sb;
        int err = -EIO;

        bh = journal->j_sb_buffer;

        J_ASSERT(bh != NULL);
        if (!buffer_uptodate(bh)) {
                ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        printk(KERN_ERR
                                "JBD2: IO error reading journal superblock\n");
                        goto out;
                }
        }

        /* Already validated on an earlier call: nothing more to do. */
        if (buffer_verified(bh))
                return 0;

        sb = journal->j_superblock;

        /* From here on, failures are format errors unless overridden below. */
        err = -EINVAL;

        if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
            sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
                printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
                goto out;
        }

        switch(be32_to_cpu(sb->s_header.h_blocktype)) {
        case JBD2_SUPERBLOCK_V1:
                journal->j_format_version = 1;
                break;
        case JBD2_SUPERBLOCK_V2:
                journal->j_format_version = 2;
                break;
        default:
                printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
                goto out;
        }

        /*
         * The superblock may describe a journal shorter than the space we
         * were given (shrink j_total_len to match), but never a longer one.
         */
        if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
                journal->j_total_len = be32_to_cpu(sb->s_maxlen);
        else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
                printk(KERN_WARNING "JBD2: journal file too short\n");
                goto out;
        }

        if (be32_to_cpu(sb->s_first) == 0 ||
            be32_to_cpu(sb->s_first) >= journal->j_total_len) {
                printk(KERN_WARNING
                        "JBD2: Invalid start block of journal: %u\n",
                        be32_to_cpu(sb->s_first));
                goto out;
        }

        if (jbd2_has_feature_csum2(journal) &&
            jbd2_has_feature_csum3(journal)) {
                /* Can't have checksum v2 and v3 at the same time! */
                printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
                       "at the same time!\n");
                goto out;
        }

        if (jbd2_journal_has_csum_v2or3_feature(journal) &&
            jbd2_has_feature_checksum(journal)) {
                /* Can't have checksum v1 and v2 on at the same time! */
                printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
                       "at the same time!\n");
                goto out;
        }

        if (!jbd2_verify_csum_type(journal, sb)) {
                printk(KERN_ERR "JBD2: Unknown checksum type\n");
                goto out;
        }

        /* Load the checksum driver */
        if (jbd2_journal_has_csum_v2or3_feature(journal)) {
                journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(journal->j_chksum_driver)) {
                        printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
                        err = PTR_ERR(journal->j_chksum_driver);
                        /* Do not leave an ERR_PTR behind in the journal. */
                        journal->j_chksum_driver = NULL;
                        goto out;
                }
        }

        if (jbd2_journal_has_csum_v2or3(journal)) {
                /* Check superblock checksum */
                if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
                        printk(KERN_ERR "JBD2: journal checksum error\n");
                        err = -EFSBADCRC;
                        goto out;
                }

                /* Precompute checksum seed for all metadata */
                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
                                                   sizeof(sb->s_uuid));
        }

        journal->j_revoke_records_per_block =
                                journal_revoke_records_per_block(journal);
        set_buffer_verified(bh);

        return 0;

out:
        journal_fail_superblock(journal);
        return err;
}

/*
 * Read and validate the on-disk journal superblock, then copy its key
 * fields (tail sequence, tail, first block, errno, last block) into the
 * journal_t.  With the fast commit feature enabled, also set up the
 * fast-commit area bounds.
 *
 * Returns 0 on success or the error from journal_get_superblock().
 */

static int load_superblock(journal_t *journal)
{
        journal_superblock_t *sb;
        int num_fc_blocks;
        int err = journal_get_superblock(journal);

        if (err)
                return err;

        sb = journal->j_superblock;

        journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
        journal->j_tail = be32_to_cpu(sb->s_start);
        journal->j_first = be32_to_cpu(sb->s_first);
        journal->j_errno = be32_to_cpu(sb->s_errno);
        journal->j_last = be32_to_cpu(sb->s_maxlen);

        if (!jbd2_has_feature_fast_commit(journal))
                return 0;

        /* A zero count in the superblock means "use the minimum". */
        journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
        num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
        if (!num_fc_blocks)
                num_fc_blocks = JBD2_MIN_FC_BLOCKS;

        /* Only carve out the fast-commit area if enough journal remains. */
        if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
                journal->j_last = journal->j_fc_last - num_fc_blocks;
        journal->j_fc_first = journal->j_last + 1;
        journal->j_fc_off = 0;

        return 0;
}


/**
 * jbd2_journal_load() - Read journal from disk.
 * @journal: Journal to act on.
 *
 * Given a journal_t structure which tells us which disk blocks contain
 * a journal, read the journal from disk to initialise the in-memory
 * structures.
 *
 * Return: 0 on success with JBD2_LOADED set; the error from loading the
 * superblock or creating the block-size slab; -EINVAL if a V2 superblock
 * carries unknown ro-compat or incompat features; -EFSCORRUPTED if recovery
 * recorded a failed commit; -EIO if recovery or the journal reset fails.
 */
int jbd2_journal_load(journal_t *journal)
{
        int err;
        journal_superblock_t *sb;

        err = load_superblock(journal);
        if (err)
                return err;

        sb = journal->j_superblock;
        /* If this is a V2 superblock, then we have to check the
         * features flags on it. */

        if (journal->j_format_version >= 2) {
                if ((sb->s_feature_ro_compat &
                     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
                    (sb->s_feature_incompat &
                     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
                        printk(KERN_WARNING
                                "JBD2: Unrecognised features on journal\n");
                        return -EINVAL;
                }
        }

        /*
         * Create a slab for this blocksize
         */
        err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
        if (err)
                return err;

        /* Let the recovery code check whether it needs to recover any
         * data from the journal. */
        if (jbd2_journal_recover(journal))
                goto recovery_error;

        if (journal->j_failed_commit) {
                printk(KERN_ERR "JBD2: journal transaction %u on %s "
                       "is corrupt.\n", journal->j_failed_commit,
                       journal->j_devname);
                return -EFSCORRUPTED;
        }
        /*
         * clear JBD2_ABORT flag initialized in journal_init_common
         * here to update log tail information with the newest seq.
         */
        journal->j_flags &= ~JBD2_ABORT;

        /* OK, we've finished with the dynamic journal bits:
         * reinitialise the dynamic contents of the superblock in memory
         * and reset them on disk. */
        if (journal_reset(journal))
                goto recovery_error;

        journal->j_flags |= JBD2_LOADED;
        return 0;

recovery_error:
        /* The specific error from recovery/reset is collapsed to -EIO. */
        printk(KERN_WARNING "JBD2: recovery failed\n");
        return -EIO;
}

/**
 * jbd2_journal_destroy() - Release a journal_t structure.
 * @journal: Journal to act on.
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.
 *
 * Stops the commit thread, commits any running transaction, checkpoints
 * outstanding transactions, marks the on-disk journal empty (unless the
 * journal is aborted), and then frees the journal and everything attached
 * to it.  @journal must not be used after this call.
 *
 * Return <0 if we couldn't clean up the journal: the checkpoint error if
 * checkpointing failed, or -EIO if the journal was aborted.
 */
int jbd2_journal_destroy(journal_t *journal)
{
        int err = 0;

        /* Wait for the commit thread to wake up and die. */
        journal_kill_thread(journal);

        /* Force a final log commit */
        if (journal->j_running_transaction)
                jbd2_journal_commit_transaction(journal);

        /* Force any old transactions to disk */

        /* Totally anal locking here... */
        /*
         * j_list_lock guards the loop condition; it is dropped around each
         * checkpoint call and re-taken before the condition is re-tested.
         */
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
                mutex_lock_io(&journal->j_checkpoint_mutex);
                err = jbd2_log_do_checkpoint(journal);
                mutex_unlock(&journal->j_checkpoint_mutex);
                /*
                 * If checkpointing failed, just free the buffers to avoid
                 * looping forever
                 */
                if (err) {
                        jbd2_journal_destroy_checkpoint(journal);
                        spin_lock(&journal->j_list_lock);
                        break;
                }
                spin_lock(&journal->j_list_lock);
        }

        J_ASSERT(journal->j_running_transaction == NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);
        J_ASSERT(journal->j_checkpoint_transactions == NULL);
        spin_unlock(&journal->j_list_lock);

        /* j_sb_buffer is NULL if journal_fail_superblock() ran earlier. */
        if (journal->j_sb_buffer) {
                if (!is_journal_aborted(journal)) {
                        mutex_lock_io(&journal->j_checkpoint_mutex);

                        /* Advance to a fresh sequence number for the empty log. */
                        write_lock(&journal->j_state_lock);
                        journal->j_tail_sequence =
                                ++journal->j_transaction_sequence;
                        write_unlock(&journal->j_state_lock);

                        jbd2_mark_journal_empty(journal,
                                        REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
                        mutex_unlock(&journal->j_checkpoint_mutex);
                } else
                        err = -EIO;
                brelse(journal->j_sb_buffer);
        }

        /* Release everything hanging off the journal, then the journal itself. */
        if (journal->j_proc_entry)
                jbd2_stats_proc_exit(journal);
        iput(journal->j_inode);
        if (journal->j_revoke)
                jbd2_journal_destroy_revoke(journal);
        if (journal->j_chksum_driver)
                crypto_free_shash(journal->j_chksum_driver);
        kfree(journal->j_fc_wbuf);
        kfree(journal->j_wbuf);
        kfree(journal);

        return err;
}


/**
 * jbd2_journal_check_used_features() - Check if features specified are used.
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether the journal uses all of a given set of features.  An empty
 * request (all three masks zero) is trivially satisfied.  The journal
 * superblock is loaded first if that has not happened yet; a version 1
 * superblock never has any of these features.
 *
 * Return: 1 if every requested feature is set in the superblock, 0
 * otherwise (including when the superblock cannot be loaded).
 **/

int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
                                 unsigned long ro, unsigned long incompat)
{
        journal_superblock_t *sb;

        if (!(compat || ro || incompat))
                return 1;

        /* Load journal superblock if it is not loaded yet. */
        if (journal->j_format_version == 0 &&
            journal_get_superblock(journal) != 0)
                return 0;
        if (journal->j_format_version == 1)
                return 0;

        sb = journal->j_superblock;

        /* Each requested mask must be wholly contained in the on-disk one. */
        if ((be32_to_cpu(sb->s_feature_compat) & compat) != compat)
                return 0;
        if ((be32_to_cpu(sb->s_feature_ro_compat) & ro) != ro)
                return 0;
        if ((be32_to_cpu(sb->s_feature_incompat) & incompat) != incompat)
                return 0;

        return 1;
}

/**
 * jbd2_journal_check_available_features() - Check feature set in journalling layer
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether the journaling code supports the use of
 * all of a given set of features on this journal.  An empty request is
 * always supported.
 *
 * Return: 1 if every requested feature is known and the superblock is in
 * version 2 format, 0 otherwise.
 */

int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
                                      unsigned long ro, unsigned long incompat)
{
        if (!(compat || ro || incompat))
                return 1;

        /* We can support any known requested features iff the
         * superblock is in version 2.  Otherwise we fail to support any
         * extended sb features. */
        if (journal->j_format_version != 2)
                return 0;

        /* Every requested bit must fall inside the matching known set. */
        return (compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
               (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
               (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat;
}

/*
 * Reserve the tail of the journal for fast commits.
 *
 * The number of fast-commit blocks comes from the superblock
 * (s_num_fc_blks), falling back to JBD2_MIN_FC_BLOCKS when that is zero.
 * On success a buffer-head pointer array of that size is allocated into
 * j_fc_wbuf, j_last is pulled back by that many blocks, the fast-commit
 * area is set to (new j_last + 1) .. old j_last, and j_free and
 * j_max_transaction_buffers are recomputed.
 *
 * Returns 0 on success, -ENOSPC if fewer than JBD2_MIN_JOURNAL_BLOCKS would
 * remain for the regular journal, or -ENOMEM if the array cannot be
 * allocated.  The journal is left untouched on failure.
 */
static int
jbd2_journal_initialize_fast_commit(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long num_fc_blks;

        num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
        if (num_fc_blks == 0)
                num_fc_blks = JBD2_MIN_FC_BLOCKS;

        /*
         * Compare by addition rather than "j_last - num_fc_blks": the
         * subtraction is unsigned, so a num_fc_blks larger than j_last
         * would wrap to a huge value and slip past the check.  The sum
         * cannot overflow because num_fc_blks comes from a 32-bit field.
         */
        if (journal->j_last < num_fc_blks + JBD2_MIN_JOURNAL_BLOCKS)
                return -ENOSPC;

        /* Are we called twice? */
        WARN_ON(journal->j_fc_wbuf != NULL);
        journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
                                sizeof(struct buffer_head *), GFP_KERNEL);
        if (!journal->j_fc_wbuf)
                return -ENOMEM;

        /* The fast-commit area takes the last num_fc_blks blocks. */
        journal->j_fc_wbufsize = num_fc_blks;
        journal->j_fc_last = journal->j_last;
        journal->j_last = journal->j_fc_last - num_fc_blks;
        journal->j_fc_first = journal->j_last + 1;
        journal->j_fc_off = 0;

        /* The regular journal shrank, so recompute what depends on its size. */
        journal->j_free = journal->j_last - journal->j_first;
        journal->j_max_transaction_buffers =
                jbd2_journal_get_max_txn_bufs(journal);

        return 0;
}

/**
 * jbd2_journal_set_features() - Mark a given journal feature in the superblock
 * @journal: Journal to act on.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Mark a given journal feature as present on the superblock.  A request
 * for v2 checksums is turned into a request for v3, and when v3 checksums
 * are requested together with the v1 (compat) checksum only v3 is set.
 *
 * Return: 1 if the requested features are set (either already or by this
 * call); 0 if they are not available, fast commit could not be
 * initialized, or the crc32c driver could not be loaded.
 */

int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
                          unsigned long ro, unsigned long incompat)
{
/*
 * True when feature bit(s) f are requested by the caller but not yet set in
 * the superblock, i.e. this call is what switches the feature on.
 */
#define INCOMPAT_FEATURE_ON(f) \
                ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
#define COMPAT_FEATURE_ON(f) \
                ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
        journal_superblock_t *sb;

        /* Nothing to do if every requested feature is already in use */
        if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
                return 1;

        /* Refuse features that cannot be provided */
        if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
                return 0;

        /* If enabling v2 checksums, turn on v3 instead */
        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
                incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
                incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
        }

        /* Asking for checksumming v3 and v1?  Only give them v3. */
        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
            compat & JBD2_FEATURE_COMPAT_CHECKSUM)
                compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;

        jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
                  compat, ro, incompat);

        sb = journal->j_superblock;

        /* Set up the fast-commit area before the feature bit is recorded */
        if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
                if (jbd2_journal_initialize_fast_commit(journal)) {
                        pr_err("JBD2: Cannot enable fast commits.\n");
                        return 0;
                }
        }

        /* Load the checksum driver if necessary */
        if ((journal->j_chksum_driver == NULL) &&
            INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
                journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(journal->j_chksum_driver)) {
                        printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
                        /* Do not leave an ERR_PTR behind in the journal */
                        journal->j_chksum_driver = NULL;
                        return 0;
                }
                /* Precompute checksum seed for all metadata */
                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
                                                   sizeof(sb->s_uuid));
        }

        /* The superblock fields below are modified under the buffer lock */
        lock_buffer(journal->j_sb_buffer);

        /* If enabling v3 checksums, update superblock */
        if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
                sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
                sb->s_feature_compat &=
                        ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
        }

        /* If enabling v1 checksums, downgrade superblock */
        if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
                sb->s_feature_incompat &=
                        ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
                                     JBD2_FEATURE_INCOMPAT_CSUM_V3);

        /* Record the (possibly adjusted) feature masks */
        sb->s_feature_compat    |= cpu_to_be32(compat);
        sb->s_feature_ro_compat |= cpu_to_be32(ro);
        sb->s_feature_incompat  |= cpu_to_be32(incompat);
        /*
         * Update the checksum now so that it is valid even for read-only
         * filesystems where jbd2_write_superblock() doesn't get called.
         */
        if (jbd2_journal_has_csum_v2or3(journal))
                sb->s_checksum = jbd2_superblock_csum(journal, sb);
        unlock_buffer(journal->j_sb_buffer);
        /* Recomputed after the feature bits changed */
        journal->j_revoke_records_per_block =
                                journal_revoke_records_per_block(journal);

        return 1;
#undef COMPAT_FEATURE_ON
#undef INCOMPAT_FEATURE_ON
}

/**
 * jbd2_journal_clear_features() - Clear a given journal feature in the
 *                                     superblock
 * @journal: Journal to act on.
 * @compat: bitmask of compatible features to clear
 * @ro: bitmask of read-only compatible features to clear
 * @incompat: bitmask of incompatible features to clear
 *
 * Remove the given feature bits from the in-memory journal superblock,
 * refresh the superblock checksum when v2/v3 checksums are in use, and
 * recompute the number of revoke records per block.
 */
void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
                                unsigned long ro, unsigned long incompat)
{
        journal_superblock_t *jsb = journal->j_superblock;

        jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
                  compat, ro, incompat);

        lock_buffer(journal->j_sb_buffer);

        /* Mask the requested bits out of each feature word */
        jsb->s_feature_compat = jsb->s_feature_compat & ~cpu_to_be32(compat);
        jsb->s_feature_ro_compat = jsb->s_feature_ro_compat & ~cpu_to_be32(ro);
        jsb->s_feature_incompat = jsb->s_feature_incompat & ~cpu_to_be32(incompat);

        /*
         * Keep the checksum valid right away: on read-only filesystems
         * jbd2_write_superblock() is not called to do it later.
         */
        if (jbd2_journal_has_csum_v2or3(journal))
                jsb->s_checksum = jbd2_superblock_csum(journal, jsb);

        unlock_buffer(journal->j_sb_buffer);

        journal->j_revoke_records_per_block =
                journal_revoke_records_per_block(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);

/**
 * jbd2_journal_flush() - Flush journal
 * @journal: Journal to act on.
 *
 * Flush all data for a given journal to disk and empty the journal.
 * Filesystems can use this when remounting readonly to ensure that
 * recovery does not need to happen on remount.
 *
 * Return: 0 on success, -EIO if the journal is aborted, the error
 * returned by jbd2_log_do_checkpoint(), or a negative error from
 * jbd2_cleanup_journal_tail().
 */

int jbd2_journal_flush(journal_t *journal)
{
        int err = 0;
        transaction_t *transaction = NULL;

        write_lock(&journal->j_state_lock);

        /* Force everything buffered to the log... */
        if (journal->j_running_transaction) {
                transaction = journal->j_running_transaction;
                __jbd2_log_start_commit(journal, transaction->t_tid);
        } else if (journal->j_committing_transaction)
                transaction = journal->j_committing_transaction;

        /* Wait for the log commit to complete... */
        if (transaction) {
                /* Sample the tid while j_state_lock is still held */
                tid_t tid = transaction->t_tid;

                write_unlock(&journal->j_state_lock);
                jbd2_log_wait_commit(journal, tid);
        } else {
                write_unlock(&journal->j_state_lock);
        }

        /* ...and flush everything in the log out to disk. */
        spin_lock(&journal->j_list_lock);
        while (!err && journal->j_checkpoint_transactions != NULL) {
                /*
                 * j_list_lock is dropped around the checkpoint call and
                 * retaken before the loop condition is evaluated again.
                 */
                spin_unlock(&journal->j_list_lock);
                mutex_lock_io(&journal->j_checkpoint_mutex);
                err = jbd2_log_do_checkpoint(journal);
                mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (is_journal_aborted(journal))
                return -EIO;

        mutex_lock_io(&journal->j_checkpoint_mutex);
        if (!err) {
                err = jbd2_cleanup_journal_tail(journal);
                if (err < 0) {
                        mutex_unlock(&journal->j_checkpoint_mutex);
                        goto out;
                }
                /* Non-negative results are not reported as errors */
                err = 0;
        }

        /*
         * Finally, mark the journal as really needing no recovery.
         * This sets s_start==0 in the underlying superblock, which is
         * the magic code for a fully-recovered superblock.  Any future
         * commits of data to the journal will restore the current
         * s_start value.
         */
        jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
        mutex_unlock(&journal->j_checkpoint_mutex);
        /* Sanity-check that the journal really is empty now */
        write_lock(&journal->j_state_lock);
        J_ASSERT(!journal->j_running_transaction);
        J_ASSERT(!journal->j_committing_transaction);
        J_ASSERT(!journal->j_checkpoint_transactions);
        J_ASSERT(journal->j_head == journal->j_tail);
        J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
        write_unlock(&journal->j_state_lock);
out:
        return err;
}

/**
 * jbd2_journal_wipe() - Wipe journal contents
 * @journal: Journal to act on.
 * @write: non-zero to also mark the on-disk journal empty
 *
 * Discard whatever is in the journal, warning if it still holds recovery
 * information.  Must be called between journal_init_*() and
 * jbd2_journal_load().
 *
 * With a non-zero @write the journal is wiped on disk; with zero the
 * recovery is merely suppressed.
 *
 * Return: the error from load_superblock() if that fails, 0 if the journal
 * has no tail, otherwise the result of jbd2_journal_skip_recovery().
 */

int jbd2_journal_wipe(journal_t *journal, int write)
{
        int ret;

        J_ASSERT (!(journal->j_flags & JBD2_LOADED));

        ret = load_superblock(journal);
        if (ret)
                return ret;

        /* No tail: there is no recovery information to discard */
        if (journal->j_tail == 0)
                return 0;

        printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
                write ? "Clearing" : "Ignoring");

        ret = jbd2_journal_skip_recovery(journal);
        if (!write)
                return ret;

        /* Lock to make assertions happy... */
        mutex_lock_io(&journal->j_checkpoint_mutex);
        jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
        mutex_unlock(&journal->j_checkpoint_mutex);

        return ret;
}

/**
 * jbd2_journal_abort() - Shut down the journal immediately.
 * @journal: the journal to shut down.
 * @errno:   an error number to record in the journal indicating
 *           the reason for the shutdown.
 *
 * Perform a complete, immediate shutdown of the ENTIRE
 * journal (not of a single transaction).  This operation cannot be
 * undone without closing and reopening the journal.
 *
 * The jbd2_journal_abort function is intended to support higher level error
 * recovery mechanisms such as the ext2/ext3 remount-readonly error
 * mode.
 *
 * Journal abort has very specific semantics.  Any existing dirty,
 * unjournaled buffers in the main filesystem will still be written to
 * disk by bdflush, but the journaling mechanism will be suspended
 * immediately and no further transaction commits will be honoured.
 *
 * Any dirty, journaled buffers will be written back to disk without
 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
 * filesystem, but we _do_ attempt to leave as much data as possible
 * behind for fsck to use for cleanup.
 *
 * Any attempt to get a new transaction handle on a journal which is in
 * ABORT state will just result in an -EROFS error return.  A
 * jbd2_journal_stop on an existing handle will return -EIO if we have
 * entered abort state during the update.
 *
 * Recursive transactions are not disturbed by journal abort until the
 * final jbd2_journal_stop, which will receive the -EIO error.
 *
 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
 * which will be recorded (if possible) in the journal superblock.  This
 * allows a client to record failure conditions in the middle of a
 * transaction without having to complete the transaction to record the
 * failure to disk.  ext3_error, for example, now uses this
 * functionality.
 *
 */

void jbd2_journal_abort(journal_t *journal, int errno)
{
        transaction_t *transaction;

        /*
         * Hold j_abort_mutex for the whole abort procedure.  This avoids
         * races with the filesystem's error handling flow (e.g.
         * ext4_abort()) and ensures that a panic only happens after the
         * error info has been written into the journal's superblock.
         */
        mutex_lock(&journal->j_abort_mutex);
        /*
         * ESHUTDOWN always takes precedence because a file system check
         * caused by any other journal abort error is not required after
         * a shutdown has been triggered.
         */
        write_lock(&journal->j_state_lock);
        if (journal->j_flags & JBD2_ABORT) {
                /* Already aborted: only upgrade the recorded errno */
                int old_errno = journal->j_errno;

                write_unlock(&journal->j_state_lock);
                if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
                        journal->j_errno = errno;
                        jbd2_journal_update_sb_errno(journal);
                }
                mutex_unlock(&journal->j_abort_mutex);
                return;
        }

        /*
         * Mark the abort as having occurred and start a commit of the
         * current running transaction to release all journaled buffers.
         */
        pr_err("Aborting journal on device %s.\n", journal->j_devname);

        journal->j_flags |= JBD2_ABORT;
        journal->j_errno = errno;
        transaction = journal->j_running_transaction;
        if (transaction)
                __jbd2_log_start_commit(journal, transaction->t_tid);
        write_unlock(&journal->j_state_lock);

        /*
         * Record errno in the journal superblock, so that fsck and the jbd2
         * layer can tell that a filesystem check is needed.
         */
        jbd2_journal_update_sb_errno(journal);
        mutex_unlock(&journal->j_abort_mutex);
}

/**
 * jbd2_journal_errno() - returns the journal's error state.
 * @journal: journal to examine.
 *
 * Reports the errno recorded with jbd2_journal_abort() the last time the
 * journal was mounted; it is 0 if the journal was stopped without an
 * abort.
 *
 * Return: -EROFS if the journal has been aborted during this mount,
 * otherwise the stored j_errno value.
 */
int jbd2_journal_errno(journal_t *journal)
{
        int ret;

        read_lock(&journal->j_state_lock);
        ret = (journal->j_flags & JBD2_ABORT) ? -EROFS : journal->j_errno;
        read_unlock(&journal->j_state_lock);

        return ret;
}

/**
 * jbd2_journal_clear_err() - clears the journal's error state
 * @journal: journal to act on.
 *
 * An error must be cleared or acked to take a FS out of readonly
 * mode.
 *
 * Return: 0 after resetting j_errno, or -EROFS (leaving j_errno alone)
 * if the journal is in the aborted state.
 */
int jbd2_journal_clear_err(journal_t *journal)
{
        int ret = -EROFS;

        write_lock(&journal->j_state_lock);
        if (!(journal->j_flags & JBD2_ABORT)) {
                journal->j_errno = 0;
                ret = 0;
        }
        write_unlock(&journal->j_state_lock);

        return ret;
}

/**
 * jbd2_journal_ack_err() - Ack journal err.
 * @journal: journal to act on.
 *
 * An error must be cleared or acked to take a FS out of readonly
 * mode.  If an error is recorded in j_errno, JBD2_ACK_ERR is set in the
 * journal flags; otherwise nothing changes.
 */
void jbd2_journal_ack_err(journal_t *journal)
{
        write_lock(&journal->j_state_lock);
        if (journal->j_errno != 0)
                journal->j_flags = journal->j_flags | JBD2_ACK_ERR;
        write_unlock(&journal->j_state_lock);
}

/*
 * Number of filesystem blocks per page, computed from PAGE_SHIFT and the
 * block-size shift of the inode's superblock.
 */
int jbd2_journal_blocks_per_page(struct inode *inode)
{
        int shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;

        return 1 << shift;
}

/*
 * Size in bytes of one block tag for this journal.
 *
 * With the csum3 feature the tag is a journal_block_tag3_t.  Otherwise it
 * is a journal_block_tag_t, plus a __u16 when csum2 is enabled, minus a
 * __u32 when the journal does not use 64bit block numbers.
 */
size_t journal_tag_bytes(journal_t *journal)
{
        size_t bytes = sizeof(journal_block_tag_t);

        if (jbd2_has_feature_csum3(journal))
                return sizeof(journal_block_tag3_t);

        if (jbd2_has_feature_csum2(journal))
                bytes += sizeof(__u16);

        if (!jbd2_has_feature_64bit(journal))
                bytes -= sizeof(__u32);

        return bytes;
}

/*
 * JBD memory management
 *
 * These functions are used to allocate block-sized chunks of memory
 * used for making copies of buffer_head data.  Very often it will be
 * page-sized chunks of data, but sometimes it will be in
 * sub-page-size chunks.  (For example, 16k pages on Power systems
 * with a 4k block file system.)  For blocks smaller than a page, we
 * use a SLAB allocator.  There are slab caches for each block size,
 * which are allocated at mount time, if necessary, and we only free
 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
 * this reason we don't need a mutex to protect access to jbd2_slab[]
 * when allocating or releasing memory; one is only needed in
 * jbd2_journal_create_slab().
 */
/* Number of supported slab sizes (one entry per name below) */
#define JBD2_MAX_SLABS 8
/* Slab caches, indexed by order_base_2(size) - 10; NULL until created */
static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];

/* Cache names, in the same index order as jbd2_slab[] */
static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
        "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
        "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
};


/* Destroy every per-size slab cache and reset its slot to NULL. */
static void jbd2_journal_destroy_slabs(void)
{
        int idx = 0;

        while (idx < JBD2_MAX_SLABS) {
                kmem_cache_destroy(jbd2_slab[idx]);
                jbd2_slab[idx] = NULL;
                idx++;
        }
}

/*
 * Make sure a slab cache exists for blocks of the given size.
 *
 * Page-sized blocks need no slab.  Creation is serialized by a
 * function-local mutex.
 *
 * Returns 0 if the cache exists (or is not needed), -EINVAL if the size
 * maps beyond the last supported slab, -ENOMEM if the cache could not be
 * created.
 */
static int jbd2_journal_create_slab(size_t size)
{
        static DEFINE_MUTEX(jbd2_slab_create_mutex);
        int idx = order_base_2(size) - 10;
        size_t slab_size;

        if (size == PAGE_SIZE)
                return 0;
        if (idx >= JBD2_MAX_SLABS)
                return -EINVAL;
        /* Sizes below the first slot share slot 0 */
        if (unlikely(idx < 0))
                idx = 0;

        mutex_lock(&jbd2_slab_create_mutex);
        if (jbd2_slab[idx] == NULL) {
                slab_size = 1 << (idx + 10);
                /* Object size doubles as the alignment */
                jbd2_slab[idx] = kmem_cache_create(jbd2_slab_names[idx],
                                                   slab_size, slab_size,
                                                   0, NULL);
                mutex_unlock(&jbd2_slab_create_mutex);
                if (jbd2_slab[idx] != NULL)
                        return 0;

                printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
                return -ENOMEM;
        }
        mutex_unlock(&jbd2_slab_create_mutex);

        return 0;        /* Already created */
}

/*
 * Look up the slab cache serving blocks of the given size.  BUGs if the
 * size maps beyond the table or if the cache has not been created.
 */
static struct kmem_cache *get_slab(size_t size)
{
        struct kmem_cache *cache;
        int idx = order_base_2(size) - 10;

        BUG_ON(idx >= JBD2_MAX_SLABS);
        /* Sizes below the first slot share slot 0 */
        if (unlikely(idx < 0))
                idx = 0;

        cache = jbd2_slab[idx];
        BUG_ON(cache == NULL);

        return cache;
}

/*
 * Allocate a block-sized chunk of memory.  @size must be a power of two.
 * Chunks smaller than a page come from the matching slab cache, larger
 * ones from the page allocator.  Returns the allocation result as is
 * (no NULL check is made here).
 */
void *jbd2_alloc(size_t size, gfp_t flags)
{
        void *mem;

        BUG_ON(size & (size-1)); /* Must be a power of 2 */

        if (size >= PAGE_SIZE)
                mem = (void *)__get_free_pages(flags, get_order(size));
        else
                mem = kmem_cache_alloc(get_slab(size), flags);

        /*
         * The chunk must be aligned to its own size.  SLUB has gotten
         * this wrong in the past, and that can corrupt user data!
         */
        BUG_ON(((unsigned long) mem) & (size-1));

        return mem;
}

/*
 * Release a chunk obtained from jbd2_alloc().  @size must be the size that
 * was passed to jbd2_alloc(): it selects between the slab cache (sizes
 * below PAGE_SIZE) and the page allocator.
 */
void jbd2_free(void *ptr, size_t size)
{
        if (size < PAGE_SIZE)
                kmem_cache_free(get_slab(size), ptr);
        else
                free_pages((unsigned long)ptr, get_order(size));
}

/*
 * Journal_head storage management
 */
/* Slab cache that all struct journal_head objects are allocated from */
static struct kmem_cache *jbd2_journal_head_cache;
#ifdef CONFIG_JBD2_DEBUG
/* Debug-only count of journal_heads allocated and not yet freed */
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif

/*
 * Create the journal_head slab cache.  Returns 0 on success or -ENOMEM
 * if the cache cannot be created.
 */
static int __init jbd2_journal_init_journal_head_cache(void)
{
        J_ASSERT(!jbd2_journal_head_cache);

        jbd2_journal_head_cache =
                kmem_cache_create("jbd2_journal_head",
                                  sizeof(struct journal_head),
                                  0,        /* offset */
                                  SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
                                  NULL);        /* ctor */
        if (jbd2_journal_head_cache)
                return 0;

        printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
        return -ENOMEM;
}

/* Destroy the journal_head slab cache and clear the global pointer. */
static void jbd2_journal_destroy_journal_head_cache(void)
{
        struct kmem_cache *cache = jbd2_journal_head_cache;

        kmem_cache_destroy(cache);
        jbd2_journal_head_cache = NULL;
}

/*
 * journal_head splicing and dicing
 */
static struct journal_head *journal_alloc_journal_head(void)
{
        struct journal_head *ret;

#ifdef CONFIG_JBD2_DEBUG
        atomic_inc(&nr_journal_heads);
#endif
        ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
        if (!ret) {
                jbd_debug(1, "out of memory for journal_head\n");
                pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
                ret = kmem_cache_zalloc(jbd2_journal_head_cache,
                                GFP_NOFS | __GFP_NOFAIL);
        }
        if (ret)
                spin_lock_init(&ret->b_state_lock);
        return ret;
}

/*
 * Hand a journal_head back to its slab cache.  Debug builds also drop the
 * outstanding-heads counter and poison the object first.
 */
static void journal_free_journal_head(struct journal_head *jh)
{
#ifdef CONFIG_JBD2_DEBUG
        atomic_dec(&nr_journal_heads);
        memset(jh, JBD2_POISON_FREE, sizeof(struct journal_head));
#endif
        kmem_cache_free(jbd2_journal_head_cache, jh);
}

/*
 * A journal_head is attached to a buffer_head whenever JBD has an
 * interest in the buffer.
 *
 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
 * is set.  This bit is tested in core kernel code where we need to take
 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
 * there.
 *
 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
 *
 * When a buffer has its BH_JBD bit set it is immune from being released by
 * core kernel code, mainly via ->b_count.
 *
 * A journal_head is detached from its buffer_head when the journal_head's
 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
 * transaction (b_cp_transaction) hold their references to b_jcount.
 *
 * Various places in the kernel want to attach a journal_head to a buffer_head
 * _before_ attaching the journal_head to a transaction.  To protect the
 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
 * journal_head's b_jcount refcount by one.  The caller must call
 * jbd2_journal_put_journal_head() to undo this.
 *
 * So the typical usage would be:
 *
 *        (Attach a journal_head if needed.  Increments b_jcount)
 *        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 *        ...
 *      (Get another reference for transaction)
 *        jbd2_journal_grab_journal_head(bh);
 *        jh->b_transaction = xxx;
 *        (Put original reference)
 *        jbd2_journal_put_journal_head(jh);
 */

/*
 * Give a buffer_head a journal_head.
 *
 * If the buffer already has a journal_head, that one is reused; otherwise
 * a newly allocated one is attached and a reference on the buffer is
 * taken.  In both cases b_jcount is incremented before returning.
 *
 * Returns the journal_head attached to @bh.
 *
 * May sleep.
 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
{
        struct journal_head *jh;
        struct journal_head *new_jh = NULL;

repeat:
        /* Allocate outside the journal_head lock */
        if (!buffer_jbd(bh))
                new_jh = journal_alloc_journal_head();

        jbd_lock_bh_journal_head(bh);
        if (buffer_jbd(bh)) {
                /* Already has a journal_head (checked again under the lock) */
                jh = bh2jh(bh);
        } else {
                J_ASSERT_BH(bh,
                        (atomic_read(&bh->b_count) > 0) ||
                        (bh->b_page && bh->b_page->mapping));

                /*
                 * Nothing was allocated above (the buffer had a
                 * journal_head then) but it is gone now: start over.
                 */
                if (!new_jh) {
                        jbd_unlock_bh_journal_head(bh);
                        goto repeat;
                }

                jh = new_jh;
                new_jh = NULL;                /* We consumed it */
                set_buffer_jbd(bh);
                bh->b_private = jh;
                jh->b_bh = bh;
                /* Buffer reference held on behalf of the journal_head */
                get_bh(bh);
                BUFFER_TRACE(bh, "added journal_head");
        }
        jh->b_jcount++;
        jbd_unlock_bh_journal_head(bh);
        /* An allocation that turned out not to be needed is released */
        if (new_jh)
                journal_free_journal_head(new_jh);
        return bh->b_private;
}

/*
 * Take a b_jcount reference on the journal_head attached to this
 * buffer_head.  Returns the journal_head, or NULL when the buffer has
 * none.
 */
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
{
        struct journal_head *jh;

        jbd_lock_bh_journal_head(bh);
        if (!buffer_jbd(bh)) {
                jh = NULL;
        } else {
                jh = bh2jh(bh);
                jh->b_jcount += 1;
        }
        jbd_unlock_bh_journal_head(bh);

        return jh;
}
EXPORT_SYMBOL(jbd2_journal_grab_journal_head);

/*
 * Detach the journal_head from @bh: clear b_private, the journal_head's
 * back pointer and the buffer's jbd bit.  The journal_head itself is not
 * freed here.  The only caller visible in this file,
 * jbd2_journal_put_journal_head(), calls this with the journal_head lock
 * held.
 */
static void __journal_remove_journal_head(struct buffer_head *bh)
{
        struct journal_head *jh = bh2jh(bh);

        /* The journal_head must no longer be in use by any transaction */
        J_ASSERT_JH(jh, jh->b_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
        J_ASSERT_BH(bh, buffer_jbd(bh));
        J_ASSERT_BH(bh, jh2bh(jh) == bh);
        BUFFER_TRACE(bh, "remove journal_head");

        /* Unlink before dropping the lock */
        bh->b_private = NULL;
        jh->b_bh = NULL;        /* debug, really */
        clear_buffer_jbd(bh);
}

/*
 * Free a detached journal_head.  Any b_frozen_data / b_committed_data copy
 * still hanging off it is freed too (with a warning); @b_size is the size
 * passed to jbd2_free() for those copies.
 */
static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
{
        if (jh->b_frozen_data != NULL) {
                printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
                jbd2_free(jh->b_frozen_data, b_size);
        }

        if (jh->b_committed_data != NULL) {
                printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
                jbd2_free(jh->b_committed_data, b_size);
        }

        journal_free_journal_head(jh);
}

/*
 * Release one b_jcount reference on the given journal_head.  When the
 * count reaches zero the journal_head is detached from its buffer_head
 * and freed, and the buffer reference is dropped with __brelse().
 */
void jbd2_journal_put_journal_head(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);

        jbd_lock_bh_journal_head(bh);
        J_ASSERT_JH(jh, jh->b_jcount > 0);
        if (--jh->b_jcount != 0) {
                /* Other references remain */
                jbd_unlock_bh_journal_head(bh);
                return;
        }

        /* Last reference: unlink under the lock, free after dropping it */
        __journal_remove_journal_head(bh);
        jbd_unlock_bh_journal_head(bh);
        journal_release_journal_head(jh, bh->b_size);
        __brelse(bh);
}
EXPORT_SYMBOL(jbd2_journal_put_journal_head);

/*
 * Put a jbd2_inode into its initial state: bound to @inode, attached to no
 * transaction, no flags, an empty dirty range and an empty list head.
 */
void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
{
        jinode->i_vfs_inode = inode;

        /* Not part of any transaction yet */
        jinode->i_transaction = NULL;
        jinode->i_next_transaction = NULL;
        INIT_LIST_HEAD(&jinode->i_list);

        jinode->i_flags = 0;
        jinode->i_dirty_start = 0;
        jinode->i_dirty_end = 0;
}

/*
 * Function to be called before we start removing inode from memory (i.e.,
 * clear_inode() is a fine place to be called from). It removes inode from
 * transaction's lists.
 *
 * Does nothing when @journal is NULL.  May sleep while JI_COMMIT_RUNNING
 * is set in the jbd2_inode's flags.
 */
void jbd2_journal_release_jbd_inode(journal_t *journal,
                                    struct jbd2_inode *jinode)
{
        if (!journal)
                return;
restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
                /* Queue ourselves before dropping the lock, then sleep */
                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
                spin_unlock(&journal->j_list_lock);
                schedule();
                finish_wait(wq, &wait.wq_entry);
                /* Re-take the lock and check the flag again */
                goto restart;
        }

        /* Unhook from the transaction's inode list, if on one */
        if (jinode->i_transaction) {
                list_del(&jinode->i_list);
                jinode->i_transaction = NULL;
        }
        spin_unlock(&journal->j_list_lock);
}


#ifdef CONFIG_PROC_FS

/* Name passed to proc_mkdir() / remove_proc_entry() for the jbd2 stats dir */
#define JBD2_STATS_PROC_NAME "fs/jbd2"

/* Create the jbd2 proc directory and remember it in proc_jbd2_stats. */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
        proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

/* Remove the jbd2 proc directory if proc_jbd2_stats is set. */
static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
        if (proc_jbd2_stats)
                remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without CONFIG_PROC_FS both helpers compile to nothing */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif

/* Slab caches for journal handles and jbd2 inodes; NULL until initialized */
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;

/*
 * Create the jbd2_inode slab cache.  Returns 0 on success or -ENOMEM if
 * the cache cannot be created.
 */
static int __init jbd2_journal_init_inode_cache(void)
{
        J_ASSERT(!jbd2_inode_cache);

        jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
        if (jbd2_inode_cache)
                return 0;

        pr_emerg("JBD2: failed to create inode cache\n");
        return -ENOMEM;
}

/*
 * Create the journal handle slab cache.  Returns 0 on success or -ENOMEM
 * if the cache cannot be created.
 */
static int __init jbd2_journal_init_handle_cache(void)
{
        J_ASSERT(!jbd2_handle_cache);

        jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
        if (jbd2_handle_cache)
                return 0;

        printk(KERN_EMERG "JBD2: failed to create handle cache\n");
        return -ENOMEM;
}

/* Destroy the jbd2_inode slab cache and clear the global pointer. */
static void jbd2_journal_destroy_inode_cache(void)
{
        struct kmem_cache *cache = jbd2_inode_cache;

        kmem_cache_destroy(cache);
        jbd2_inode_cache = NULL;
}

/* Destroy the journal handle slab cache and clear the global pointer. */
static void jbd2_journal_destroy_handle_cache(void)
{
        struct kmem_cache *cache = jbd2_handle_cache;

        kmem_cache_destroy(cache);
        jbd2_handle_cache = NULL;
}

/*
 * Module startup and shutdown
 */

/*
 * Create the jbd2 caches one after another, stopping at the first
 * failure.  Returns 0 if all succeed, otherwise the error of the step that
 * failed.  Nothing is torn down here on failure.
 */
static int __init journal_init_caches(void)
{
        int ret;

        ret = jbd2_journal_init_revoke_record_cache();
        if (ret)
                return ret;

        ret = jbd2_journal_init_revoke_table_cache();
        if (ret)
                return ret;

        ret = jbd2_journal_init_journal_head_cache();
        if (ret)
                return ret;

        ret = jbd2_journal_init_handle_cache();
        if (ret)
                return ret;

        ret = jbd2_journal_init_inode_cache();
        if (ret)
                return ret;

        return jbd2_journal_init_transaction_cache();
}

/*
 * Tear down every cache set up by journal_init_caches(), plus the per-size
 * slabs.  Called from journal_init() when cache setup fails and from
 * journal_exit().
 */
static void jbd2_journal_destroy_caches(void)
{
        jbd2_journal_destroy_revoke_record_cache();
        jbd2_journal_destroy_revoke_table_cache();
        jbd2_journal_destroy_journal_head_cache();
        jbd2_journal_destroy_handle_cache();
        jbd2_journal_destroy_inode_cache();
        jbd2_journal_destroy_transaction_cache();
        /* Not created by journal_init_caches(); see jbd2_journal_create_slab() */
        jbd2_journal_destroy_slabs();
}

/*
 * Module init: set up the caches and, if that works, the proc entry.  On
 * failure whatever was created is destroyed again and the error returned.
 */
static int __init journal_init(void)
{
        int ret;

        BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);

        ret = journal_init_caches();
        if (ret != 0) {
                jbd2_journal_destroy_caches();
                return ret;
        }

        jbd2_create_jbd_stats_proc_entry();
        return 0;
}

/*
 * Module exit: remove the proc entry and destroy all caches.  Debug builds
 * first report any journal_heads that are still outstanding.
 */
static void __exit journal_exit(void)
{
#ifdef CONFIG_JBD2_DEBUG
        int leaked = atomic_read(&nr_journal_heads);

        if (leaked != 0)
                printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", leaked);
#endif
        jbd2_remove_jbd_stats_proc_entry();
        jbd2_journal_destroy_caches();
}

/* Module license and init/exit entry points */
MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);































    2 
    2 























    2 


    2 


    2 

    2 


    2 
    2 
    2 




    2 







































































































































































































































































































    2 


    2 

    2 



















































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
// SPDX-License-Identifier: GPL-2.0-only
/*
  File: fs/xattr.c

  Extended attribute handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/evm.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl_xattr.h>

#include <linux/uaccess.h>

/*
 * Test whether @a begins with @a_prefix.
 *
 * Returns a pointer to the first character of @a after the prefix when
 * every character of @a_prefix matches, or NULL on the first mismatch.
 * An empty @a_prefix matches anything and yields @a itself.
 */
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
        for (; *a_prefix != '\0'; a++, a_prefix++) {
                if (*a != *a_prefix)
                        return NULL;
        }
        return a;
}

/*
 * In order to implement different sets of xattr operations for each xattr
 * prefix, a filesystem should create a null-terminated array of struct
 * xattr_handler (one for each prefix) and hang a pointer to it off of the
 * s_xattr field of the superblock.
 *
 * for_each_xattr_handler() walks such an array, assigning each entry to
 * @handler in turn and stopping at the terminating NULL entry.  A NULL
 * @handlers pointer is treated like an empty array: the body never runs.
 *
 * Caveats:
 *  - @handlers is advanced by the macro itself (post-increment), so it must
 *    be a modifiable lvalue and no longer points at the start of the array
 *    once the loop has run.
 *  - The expansion begins with a bare "if", so an "else" written directly
 *    after the loop body would bind to that hidden "if".
 */
#define for_each_xattr_handler(handlers, handler)                \
        if (handlers)                                                \
                for ((handler) = *(handlers)++;                        \
                        (handler) != NULL;                        \
                        (handler) = *(handlers)++)

/*
 * Locate the xattr_handler responsible for *@name on @inode's superblock.
 *
 * On success the handler is returned and *@name is advanced past the part
 * that matched xattr_prefix(handler).  Failures are reported as ERR_PTR():
 *   -EIO         the inode lacks IOP_XATTR and is a bad inode
 *   -EOPNOTSUPP  the inode lacks IOP_XATTR, or no handler matched
 *   -EINVAL      a handler with a ->prefix matched but nothing follows it
 */
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
        const struct xattr_handler **handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;

        if (!(inode->i_opflags & IOP_XATTR))
                return ERR_PTR(unlikely(is_bad_inode(inode)) ?
                               -EIO : -EOPNOTSUPP);

        for_each_xattr_handler(handlers, handler) {
                const char *rest = strcmp_prefix(*name, xattr_prefix(handler));

                if (!rest)
                        continue;

                if (handler->prefix) {
                        /* Prefix handlers need a non-empty suffix. */
                        if (!*rest)
                                return ERR_PTR(-EINVAL);
                } else if (*rest) {
                        /*
                         * No ->prefix set: the whole name has to match, so
                         * leftover characters mean this is not our handler.
                         */
                        continue;
                }

                *name = rest;
                return handler;
        }

        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Check permissions for extended attribute access.  The rules differ per
 * namespace, so the name prefix decides which checks apply.
 *
 * @mask is MAY_READ or MAY_WRITE.  Returns 0 when access is allowed, or a
 * negative errno.
 */
static int
xattr_permission(struct inode *inode, const char *name, int mask)
{
        const int writing = mask & MAY_WRITE;
        /* Namespace refusals: -EPERM for writers, -ENODATA for readers. */
        const int denied = writing ? -EPERM : -ENODATA;

        if (writing) {
                /* Immutable and append-only inodes never take xattr changes. */
                if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                        return -EPERM;
                /*
                 * Updating an xattr will likely cause i_uid and i_gid
                 * to be written back improperly if their true value is
                 * unknown to the vfs.
                 */
                if (HAS_UNMAPPED_ID(inode))
                        return -EPERM;
        }

        /*
         * security.* and system.* are not restricted by the VFS; the
         * decision is left to the underlying filesystem / security module.
         */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) == 0)
                return 0;
        if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
                return 0;

        /* trusted.* is reserved for CAP_SYS_ADMIN holders. */
        if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0)
                return capable(CAP_SYS_ADMIN) ? 0 : denied;

        /*
         * user.* exists only on regular files and directories, and in a
         * sticky directory only the owner or a privileged user may write.
         * Anything that passes still goes through inode_permission().
         */
        if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) {
                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                        return denied;
                if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
                    writing && !inode_owner_or_capable(inode))
                        return -EPERM;
        }

        return inode_permission(inode, mask);
}

/*
 * Report whether any xattr handler of @inode's superblock serves the
 * namespace @prefix.
 *
 * Returns 0 if a handler's prefix starts with @prefix, -EIO for a bad inode
 * without IOP_XATTR, and -EOPNOTSUPP otherwise.
 */
int
xattr_supported_namespace(struct inode *inode, const char *prefix)
{
        const struct xattr_handler **handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;
        size_t preflen;

        if (!(inode->i_opflags & IOP_XATTR))
                return unlikely(is_bad_inode(inode)) ? -EIO : -EOPNOTSUPP;

        preflen = strlen(prefix);

        for_each_xattr_handler(handlers, handler) {
                if (strncmp(xattr_prefix(handler), prefix, preflen) == 0)
                        return 0;
        }

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(xattr_supported_namespace);

/*
 * Resolve the handler for @name and invoke its ->set() operation.
 *
 * No permission or security checks are made here.  Returns the error from
 * xattr_resolve_name(), -EOPNOTSUPP when the handler has no ->set(), or
 * whatever ->set() returns.
 */
int
__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
               const void *value, size_t size, int flags)
{
        const struct xattr_handler *handler = xattr_resolve_name(inode, &name);

        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->set)
                return -EOPNOTSUPP;

        /*
         * A zero-sized attribute is handed over as "" rather than the
         * caller's pointer: empty EA, do not remove.
         */
        return handler->set(handler, dentry, inode, name,
                            size ? value : "", size, flags);
}
EXPORT_SYMBOL(__vfs_setxattr);

/**
 * __vfs_setxattr_noperm - perform setxattr operation without performing
 * permission checks.
 *
 * @dentry: object to perform setxattr on
 * @name: xattr name to set
 * @value: value to set @name to
 * @size: size of @value
 * @flags: flags to pass into filesystem operations
 *
 * Returns the result of the internal setxattr or setsecurity operations,
 * -EIO for a bad inode without IOP_XATTR, or -EOPNOTSUPP when neither path
 * applies.
 *
 * This function requires the caller to lock the inode's i_mutex before it
 * is executed. It also assumes that the caller will make the appropriate
 * permission checks.
 */
int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
                const void *value, size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        /*
         * -EAGAIN is the "fall back to the security module" marker: it
         * survives to the check below when no handler was tried, or when
         * __vfs_setxattr() itself returned -EAGAIN.
         */
        int error = -EAGAIN;
        int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
                                   XATTR_SECURITY_PREFIX_LEN);

        /* A security.* update drops the S_NOSEC flag from the inode. */
        if (issec)
                inode->i_flags &= ~S_NOSEC;
        if (inode->i_opflags & IOP_XATTR) {
                error = __vfs_setxattr(dentry, inode, name, value, size, flags);
                if (!error) {
                        /* Success: send the notification and run the LSM post hook. */
                        fsnotify_xattr(dentry);
                        security_inode_post_setxattr(dentry, name, value,
                                                     size, flags);
                }
        } else {
                if (unlikely(is_bad_inode(inode)))
                        return -EIO;
        }
        if (error == -EAGAIN) {
                error = -EOPNOTSUPP;

                /* Only security.* names can be stored by the security module. */
                if (issec) {
                        const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;

                        error = security_inode_setsecurity(inode, suffix, value,
                                                           size, flags);
                        if (!error)
                                fsnotify_xattr(dentry);
                }
        }

        return error;
}

/**
 * __vfs_setxattr_locked - set an extended attribute while holding the inode
 * lock
 *
 * @dentry: object to perform setxattr on
 * @name: xattr name to set
 * @value: value to set @name to
 * @size: size of @value
 * @flags: flags to pass into filesystem operations
 * @delegated_inode: on return, will contain an inode pointer that
 * a delegation was broken on, NULL if none.
 *
 * Runs, in order, the VFS permission check, the security module hook and
 * the delegation break; the first failure is returned.  Otherwise the
 * result of __vfs_setxattr_noperm() is returned.
 */
int
__vfs_setxattr_locked(struct dentry *dentry, const char *name,
                const void *value, size_t size, int flags,
                struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = xattr_permission(inode, name, MAY_WRITE);
        if (error)
                return error;

        error = security_inode_setxattr(dentry, name, value, size, flags);
        if (error)
                return error;

        error = try_break_deleg(inode, delegated_inode);
        if (error)
                return error;

        return __vfs_setxattr_noperm(dentry, name, value, size, flags);
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);

/*
 * Set extended attribute @name on @dentry, taking the inode lock around the
 * operation.
 *
 * If __vfs_setxattr_locked() reports an inode in @delegated_inode, wait for
 * the delegation break outside the lock and, if the wait succeeds, try the
 * whole operation again.
 */
int
vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;

        for (;;) {
                inode_lock(inode);
                error = __vfs_setxattr_locked(dentry, name, value, size,
                                              flags, &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);

/*
 * Fetch security attribute @name (prefix already stripped) from the
 * security module.
 *
 * Without a destination (@value NULL or @size 0) the module is queried
 * with alloc == false and its result is returned as is.  Otherwise the
 * module supplies a buffer, which is copied into @value and freed;
 * -ERANGE is returned when @size is smaller than the attribute.
 */
static ssize_t
xattr_getsecurity(struct inode *inode, const char *name, void *value,
                        size_t size)
{
        void *buffer = NULL;
        ssize_t len;

        if (!value || !size)
                return security_inode_getsecurity(inode, name, &buffer, false);

        len = security_inode_getsecurity(inode, name, &buffer, true);
        if (len < 0)
                return len;

        if (size >= len)
                memcpy(value, buffer, len);
        else
                len = -ERANGE;

        kfree(buffer);
        return len;
}

/*
 * vfs_getxattr_alloc - read an xattr into a buffer that is (re)allocated
 * on demand
 *
 * *@xattr_value is the caller's current buffer (may be NULL) and
 * @xattr_size its capacity.  The attribute size is probed first; when there
 * is no buffer or it is too small, it is krealloc()ed to size + 1 bytes
 * with @flags and zeroed.  On success *@xattr_value points at the buffer
 * that was used.
 *
 * Returns the handler's ->get() result, or a negative errno from the
 * permission check, handler lookup or allocation.
 */
ssize_t
vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
                   size_t xattr_size, gfp_t flags)
{
        struct inode *inode = dentry->d_inode;
        const struct xattr_handler *handler;
        char *buf = *xattr_value;
        int ret;

        ret = xattr_permission(inode, name, MAY_READ);
        if (ret)
                return ret;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->get)
                return -EOPNOTSUPP;

        /* First pass: NULL/0 asks the handler for the size only. */
        ret = handler->get(handler, dentry, inode, name, NULL, 0);
        if (ret < 0)
                return ret;

        if (!buf || (ret > xattr_size)) {
                buf = krealloc(*xattr_value, ret + 1, flags);
                if (!buf)
                        return -ENOMEM;
                memset(buf, 0, ret + 1);
        }

        /* Second pass: fetch the value itself. */
        ret = handler->get(handler, dentry, inode, name, buf, ret);
        *xattr_value = buf;
        return ret;
}

/*
 * Resolve the handler for @name and invoke its ->get() operation.
 *
 * No permission or security checks are made here.  Returns the error from
 * xattr_resolve_name(), -EOPNOTSUPP when the handler has no ->get(), or
 * whatever ->get() returns.
 */
ssize_t
__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
               void *value, size_t size)
{
        const struct xattr_handler *handler = xattr_resolve_name(inode, &name);

        if (IS_ERR(handler))
                return PTR_ERR(handler);

        return handler->get ?
                handler->get(handler, dentry, inode, name, value, size) :
                -EOPNOTSUPP;
}
EXPORT_SYMBOL(__vfs_getxattr);

/*
 * Read extended attribute @name of @dentry into @value (at most @size
 * bytes) after the VFS permission check and the security module hook.
 *
 * security.* names are offered to the security module first; the
 * filesystem handler is used for everything else, and for security.* names
 * when the module answers -EOPNOTSUPP.
 */
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = xattr_permission(inode, name, MAY_READ);
        if (error)
                return error;

        error = security_inode_getxattr(dentry, name);
        if (error)
                return error;

        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) == 0) {
                const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
                int ret = xattr_getsecurity(inode, suffix, value, size);

                /*
                 * Only overwrite the return value if a security module
                 * is actually active.
                 */
                if (ret != -EOPNOTSUPP)
                        return ret;
        }

        return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);

/**
 * vfs_listxattr - list the extended attribute names of an object
 * @dentry: object whose attribute names are listed
 * @list: buffer receiving the names (the listxattr() helper in this file
 *        passes NULL together with a @size of 0)
 * @size: capacity of @list in bytes
 *
 * After the security module hook, the filesystem's ->listxattr() is used
 * when the inode has one and IOP_XATTR is set.  Otherwise the list comes
 * from the security module, and -ERANGE is returned if @size is non-zero
 * and the reported length exceeds it.
 *
 * Returns the value reported by the filesystem or security module, or a
 * negative errno.
 */
ssize_t
vfs_listxattr(struct dentry *dentry, char *list, size_t size)
{
        struct inode *inode = d_inode(dentry);
        ssize_t error;

        error = security_inode_listxattr(dentry);
        if (error)
                return error;

        if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR))
                return inode->i_op->listxattr(dentry, list, size);

        error = security_inode_listsecurity(inode, list, size);
        /*
         * Compare lengths only.  A plain "error > size" promotes a negative
         * errno to a huge unsigned value and would replace it with -ERANGE.
         */
        if (size && error > 0 && (size_t)error > size)
                error = -ERANGE;

        return error;
}
EXPORT_SYMBOL_GPL(vfs_listxattr);

/*
 * Resolve the handler for @name and remove the attribute through its
 * ->set() operation.
 *
 * Removal is expressed as ->set() with a NULL value, size 0 and
 * XATTR_REPLACE.  Returns the error from xattr_resolve_name(), -EOPNOTSUPP
 * when the handler has no ->set(), or whatever ->set() returns.
 */
int
__vfs_removexattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_inode(dentry);
        const struct xattr_handler *handler = xattr_resolve_name(inode, &name);

        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->set)
                return -EOPNOTSUPP;

        return handler->set(handler, dentry, inode, name, NULL, 0,
                            XATTR_REPLACE);
}
EXPORT_SYMBOL(__vfs_removexattr);

/**
 * __vfs_removexattr_locked - remove an extended attribute while holding the
 * inode lock
 *
 * @dentry: object to perform removexattr on
 * @name: name of xattr to remove
 * @delegated_inode: on return, will contain an inode pointer that
 * a delegation was broken on, NULL if none.
 *
 * Runs, in order, the VFS permission check, the security module hook and
 * the delegation break; the first failure is returned.  After a successful
 * removal the fsnotify and EVM post-removal hooks are called.
 */
int
__vfs_removexattr_locked(struct dentry *dentry, const char *name,
                struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = xattr_permission(inode, name, MAY_WRITE);
        if (error)
                return error;

        error = security_inode_removexattr(dentry, name);
        if (error)
                return error;

        error = try_break_deleg(inode, delegated_inode);
        if (error)
                return error;

        error = __vfs_removexattr(dentry, name);
        if (error)
                return error;

        fsnotify_xattr(dentry);
        evm_inode_post_removexattr(dentry, name);
        return 0;
}
EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);

int
vfs_removexattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;

retry_deleg:
        inode_lock(inode);
        error = __vfs_removexattr_locked(dentry, name, &delegated_inode);
        inode_unlock(inode);

        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);

/*
 * Extended attribute SET operations
 */
/*
 * Common back end of the setxattr family of system calls: copy the
 * attribute name and value in from user space and hand them to
 * vfs_setxattr().
 *
 * @d:     dentry of the object to modify
 * @name:  user pointer to the NUL-terminated attribute name
 * @value: user pointer to the attribute value
 * @size:  length of @value in bytes; 0 means no value is copied and a NULL
 *         kernel buffer is passed on
 * @flags: only XATTR_CREATE and XATTR_REPLACE are accepted
 *
 * Returns the result of vfs_setxattr(), or -EINVAL (unknown flag), -ERANGE
 * (empty or over-long name), -E2BIG (@size above XATTR_SIZE_MAX), -ENOMEM,
 * -EFAULT, or an error from cap_convert_nscap().
 */
static long
setxattr(struct dentry *d, const char __user *name, const void __user *value,
         size_t size, int flags)
{
        int error;
        void *kvalue = NULL;
        char kname[XATTR_NAME_MAX + 1];

        if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
                return -EINVAL;

        /*
         * A result of 0 (empty name) or sizeof(kname) (buffer filled, so the
         * name is longer than XATTR_NAME_MAX) is rejected with -ERANGE.
         */
        error = strncpy_from_user(kname, name, sizeof(kname));
        if (error == 0 || error == sizeof(kname))
                error = -ERANGE;
        if (error < 0)
                return error;

        if (size) {
                if (size > XATTR_SIZE_MAX)
                        return -E2BIG;
                kvalue = kvmalloc(size, GFP_KERNEL);
                if (!kvalue)
                        return -ENOMEM;
                if (copy_from_user(kvalue, value, size)) {
                        error = -EFAULT;
                        goto out;
                }
                /* POSIX ACL values are fixed up in place before being stored. */
                if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
                    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
                        posix_acl_fix_xattr_from_user(kvalue, size);
                else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
                        /*
                         * A non-negative return becomes the new value size.
                         * NOTE(review): kvalue is passed by address, so the
                         * callee presumably may replace the buffer - confirm
                         * against cap_convert_nscap().
                         */
                        error = cap_convert_nscap(d, &kvalue, size);
                        if (error < 0)
                                goto out;
                        size = error;
                }
        }

        error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
        /* kvalue is still NULL here when size was 0. */
        kvfree(kvalue);

        return error;
}

/*
 * Path-based front end for setxattr(): look up @pathname relative to the
 * current working directory, take write access on the mount and set the
 * attribute.
 *
 * When retry_estale() asks for it, the lookup is repeated with LOOKUP_REVAL
 * added to @lookup_flags.
 */
static int path_setxattr(const char __user *pathname,
                         const char __user *name, const void __user *value,
                         size_t size, int flags, unsigned int lookup_flags)
{
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        return error;

                error = mnt_want_write(path.mnt);
                if (!error) {
                        error = setxattr(path.dentry, name, value, size,
                                         flags);
                        mnt_drop_write(path.mnt);
                }
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* setxattr(2): set an attribute by path, resolving with LOOKUP_FOLLOW. */
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        const unsigned int lookup_flags = LOOKUP_FOLLOW;

        return path_setxattr(pathname, name, value, size, flags,
                             lookup_flags);
}

/* lsetxattr(2): set an attribute by path, resolving without LOOKUP_FOLLOW. */
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        const unsigned int lookup_flags = 0;

        return path_setxattr(pathname, name, value, size, flags,
                             lookup_flags);
}

/*
 * fsetxattr(2): set an attribute on the file behind descriptor @fd.
 * Returns -EBADF when @fd does not refer to an open file.
 */
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                const void __user *, value, size_t, size, int, flags)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = mnt_want_write_file(f.file);
        if (error == 0) {
                error = setxattr(f.file->f_path.dentry, name, value, size,
                                 flags);
                mnt_drop_write_file(f.file);
        }
        fdput(f);

        return error;
}

/*
 * Extended attribute GET operations
 */
/*
 * Common back end of the getxattr family of system calls: copy the name in
 * from user space, read the attribute through vfs_getxattr() and copy the
 * value out to @value.
 *
 * @size is clamped to XATTR_SIZE_MAX.  With @size == 0 no buffer is
 * allocated and nothing is copied out; the vfs_getxattr() result is
 * returned unchanged.
 *
 * Returns the vfs_getxattr() result, or -ERANGE (empty or over-long name),
 * -ENOMEM, -EFAULT, or -E2BIG.
 */
static ssize_t
getxattr(struct dentry *d, const char __user *name, void __user *value,
         size_t size)
{
        char kname[XATTR_NAME_MAX + 1];
        void *kvalue = NULL;
        ssize_t error;

        error = strncpy_from_user(kname, name, sizeof(kname));
        if (error < 0)
                return error;
        if (error == 0 || error == sizeof(kname))
                return -ERANGE;

        if (size) {
                if (size > XATTR_SIZE_MAX)
                        size = XATTR_SIZE_MAX;
                kvalue = kvzalloc(size, GFP_KERNEL);
                if (!kvalue)
                        return -ENOMEM;
        }

        error = vfs_getxattr(d, kname, kvalue, size);
        if (error > 0) {
                int is_acl = !strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) ||
                             !strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT);

                /* POSIX ACL values are fixed up in place before copy-out. */
                if (is_acl)
                        posix_acl_fix_xattr_to_user(kvalue, error);
                if (size && copy_to_user(value, kvalue, error))
                        error = -EFAULT;
        } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
                /*
                 * The file system tried to return a value bigger than
                 * XATTR_SIZE_MAX bytes. Not possible.
                 */
                error = -E2BIG;
        }

        kvfree(kvalue);

        return error;
}

/*
 * Path-based front end for getxattr(): look up @pathname relative to the
 * current working directory and read the attribute.
 *
 * When retry_estale() asks for it, the lookup is repeated with LOOKUP_REVAL
 * added to @lookup_flags.
 */
static ssize_t path_getxattr(const char __user *pathname,
                             const char __user *name, void __user *value,
                             size_t size, unsigned int lookup_flags)
{
        struct path path;
        ssize_t error;
        int again;

        do {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        return error;

                error = getxattr(path.dentry, name, value, size);
                path_put(&path);

                again = retry_estale(error, lookup_flags);
                if (again)
                        lookup_flags |= LOOKUP_REVAL;
        } while (again);

        return error;
}

/* getxattr(2): read an attribute by path, resolving with LOOKUP_FOLLOW. */
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        const unsigned int lookup_flags = LOOKUP_FOLLOW;

        return path_getxattr(pathname, name, value, size, lookup_flags);
}

/* lgetxattr(2): read an attribute by path, resolving without LOOKUP_FOLLOW. */
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        const unsigned int lookup_flags = 0;

        return path_getxattr(pathname, name, value, size, lookup_flags);
}

/*
 * fgetxattr(2): read an attribute of the file behind descriptor @fd.
 * Returns -EBADF when @fd does not refer to an open file.
 */
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
                void __user *, value, size_t, size)
{
        struct fd f = fdget(fd);
        ssize_t error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = getxattr(f.file->f_path.dentry, name, value, size);
        fdput(f);

        return error;
}

/*
 * Extended attribute LIST operations
 */
/*
 * Common back end of the listxattr family of system calls: collect the
 * attribute names through vfs_listxattr() and copy them out to @list.
 *
 * @size is clamped to XATTR_LIST_MAX.  With @size == 0 no buffer is
 * allocated and nothing is copied out; the vfs_listxattr() result is
 * returned unchanged.
 *
 * Returns the vfs_listxattr() result, or -ENOMEM, -EFAULT, or -E2BIG.
 */
static ssize_t
listxattr(struct dentry *d, char __user *list, size_t size)
{
        char *klist = NULL;
        ssize_t error;

        if (size) {
                if (size > XATTR_LIST_MAX)
                        size = XATTR_LIST_MAX;
                klist = kvmalloc(size, GFP_KERNEL);
                if (!klist)
                        return -ENOMEM;
        }

        error = vfs_listxattr(d, klist, size);
        if (error > 0) {
                if (size && copy_to_user(list, klist, error))
                        error = -EFAULT;
        } else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
                /*
                 * The file system tried to return a list bigger than
                 * XATTR_LIST_MAX bytes. Not possible.
                 */
                error = -E2BIG;
        }

        kvfree(klist);

        return error;
}

/*
 * Path-based front end for listxattr(): look up @pathname relative to the
 * current working directory and list the attribute names.
 *
 * When retry_estale() asks for it, the lookup is repeated with LOOKUP_REVAL
 * added to @lookup_flags.
 */
static ssize_t path_listxattr(const char __user *pathname, char __user *list,
                              size_t size, unsigned int lookup_flags)
{
        struct path path;
        ssize_t error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        return error;

                error = listxattr(path.dentry, list, size);
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* listxattr(2): list attribute names by path, resolving with LOOKUP_FOLLOW. */
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        const unsigned int lookup_flags = LOOKUP_FOLLOW;

        return path_listxattr(pathname, list, size, lookup_flags);
}

/* llistxattr(2): list attribute names by path, resolving without LOOKUP_FOLLOW. */
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        const unsigned int lookup_flags = 0;

        return path_listxattr(pathname, list, size, lookup_flags);
}

/*
 * flistxattr(2): list attribute names of the file behind descriptor @fd.
 * Returns -EBADF when @fd does not refer to an open file.
 */
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
        struct fd f = fdget(fd);
        ssize_t error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = listxattr(f.file->f_path.dentry, list, size);
        fdput(f);

        return error;
}

/*
 * Extended attribute REMOVE operations
 */
/*
 * Common back end of the removexattr family of system calls: copy the
 * attribute name in from user space and pass it to vfs_removexattr().
 *
 * Returns the vfs_removexattr() result, the strncpy_from_user() error, or
 * -ERANGE for an empty or over-long name.
 */
static long
removexattr(struct dentry *d, const char __user *name)
{
        char kname[XATTR_NAME_MAX + 1];
        int len = strncpy_from_user(kname, name, sizeof(kname));

        if (len < 0)
                return len;
        if (len == 0 || len == sizeof(kname))
                return -ERANGE;

        return vfs_removexattr(d, kname);
}

/*
 * Path-based front end for removexattr(): look up @pathname relative to the
 * current working directory, take write access on the mount and remove the
 * attribute.
 *
 * When retry_estale() asks for it, the lookup is repeated with LOOKUP_REVAL
 * added to @lookup_flags.
 */
static int path_removexattr(const char __user *pathname,
                            const char __user *name, unsigned int lookup_flags)
{
        struct path path;
        int error;
        int again;

        do {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        return error;

                error = mnt_want_write(path.mnt);
                if (!error) {
                        error = removexattr(path.dentry, name);
                        mnt_drop_write(path.mnt);
                }
                path_put(&path);

                again = retry_estale(error, lookup_flags);
                if (again)
                        lookup_flags |= LOOKUP_REVAL;
        } while (again);

        return error;
}

/* removexattr(2): remove an attribute by path, resolving with LOOKUP_FOLLOW. */
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
                const char __user *, name)
{
        const unsigned int lookup_flags = LOOKUP_FOLLOW;

        return path_removexattr(pathname, name, lookup_flags);
}

/* lremovexattr(2): remove an attribute by path, resolving without LOOKUP_FOLLOW. */
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
                const char __user *, name)
{
        const unsigned int lookup_flags = 0;

        return path_removexattr(pathname, name, lookup_flags);
}

/*
 * fremovexattr(2): remove an attribute from the file behind descriptor @fd.
 * Returns -EBADF when @fd does not refer to an open file.
 */
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = mnt_want_write_file(f.file);
        if (error == 0) {
                error = removexattr(f.file->f_path.dentry, name);
                mnt_drop_write_file(f.file);
        }
        fdput(f);

        return error;
}

/*
 * Build the attribute name list from the names of the superblock's xattr
 * handlers.
 *
 * Handlers without a ->name are skipped, as are handlers whose ->list()
 * callback exists and returns false for @dentry.  Each remaining name is
 * emitted with its terminating NUL.
 *
 * With a NULL @buffer only the total size is computed.  Otherwise the
 * names are copied into @buffer and -ERANGE is returned as soon as one no
 * longer fits into the remaining @buffer_size.
 *
 * Returns the number of bytes needed / written.
 */
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
        unsigned int size = 0;
        char *out = buffer;

        for_each_xattr_handler(handlers, handler) {
                size_t entry_len;

                if (!handler->name)
                        continue;
                if (handler->list && !handler->list(dentry))
                        continue;

                /* Name plus its NUL terminator. */
                entry_len = strlen(handler->name) + 1;

                if (buffer) {
                        if (entry_len > buffer_size)
                                return -ERANGE;
                        memcpy(out, handler->name, entry_len);
                        out += entry_len;
                        buffer_size -= entry_len;
                }
                size += entry_len;
        }

        return size;
}
EXPORT_SYMBOL(generic_listxattr);

/**
 * xattr_full_name  -  Compute full attribute name from suffix
 *
 * @handler:        handler of the xattr_handler operation
 * @name:        name passed to the xattr_handler operation
 *
 * The get and set handler operations receive only the part of the attribute
 * name that follows the handler's prefix: a handler with prefix "user."
 * sees "foo" for attribute "user.foo".  That suffix still points into the
 * full name, so stepping back by the prefix length recovers it.
 *
 * Note: the list xattr handler operation when called from the vfs is passed a
 * NULL name; some file systems use this operation internally, with varying
 * semantics.
 */
const char *xattr_full_name(const struct xattr_handler *handler,
                            const char *name)
{
        return name - strlen(xattr_prefix(handler));
}
EXPORT_SYMBOL(xattr_full_name);

/*
 * Allocate a simple_xattr with room for @size bytes of value and copy
 * @value into it.  The name is left for the caller to fill in.
 *
 * Returns NULL if the total size wraps around or the allocation fails.
 */
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
        struct simple_xattr *xattr;
        size_t total = sizeof(*xattr) + size;

        /* A sum smaller than the header means the addition wrapped. */
        if (total < sizeof(*xattr))
                return NULL;

        xattr = kvmalloc(total, GFP_KERNEL);
        if (xattr) {
                xattr->size = size;
                memcpy(xattr->value, value, size);
        }

        return xattr;
}

/*
 * xattr GET operation for in-memory/pseudo filesystems
 *
 * Looks up @name in @xattrs under the list's spinlock.  Returns the size of
 * the attribute, -ENODATA if there is no such attribute, or -ERANGE if
 * @buffer is given but @size is too small.  The value is copied only when
 * @buffer is non-NULL and large enough.
 */
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size)
{
        struct simple_xattr *xattr;
        int ret = -ENODATA;

        spin_lock(&xattrs->lock);
        list_for_each_entry(xattr, &xattrs->head, list) {
                if (strcmp(name, xattr->name) != 0)
                        continue;

                if (buffer && size < xattr->size) {
                        ret = -ERANGE;
                } else {
                        ret = xattr->size;
                        if (buffer)
                                memcpy(buffer, xattr->value, xattr->size);
                }
                break;
        }
        spin_unlock(&xattrs->lock);

        return ret;
}

/**
 * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
 * @xattrs: target simple_xattr list
 * @name: name of the extended attribute
 * @value: value of the xattr. If %NULL, will remove the attribute.
 * @size: size of the new xattr
 * @flags: %XATTR_{CREATE|REPLACE}
 * @removed_size: returns size of the removed xattr, -1 if none removed
 *
 * If %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
 * with -EEXIST.  If %XATTR_REPLACE is set, the xattr should exist;
 * otherwise, fails with -ENODATA.
 *
 * Removing (@value == %NULL) an attribute that does not exist, without
 * %XATTR_REPLACE, is a no-op that succeeds.
 *
 * Returns 0 on success, -errno on failure.
 */
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
                     const void *value, size_t size, int flags,
                     ssize_t *removed_size)
{
        struct simple_xattr *xattr;
        struct simple_xattr *new_xattr = NULL;
        int err = 0;

        if (removed_size)
                *removed_size = -1;

        /* value == NULL means remove */
        if (value) {
                /* allocate outside the spinlock: GFP_KERNEL may sleep */
                new_xattr = simple_xattr_alloc(value, size);
                if (!new_xattr)
                        return -ENOMEM;

                new_xattr->name = kstrdup(name, GFP_KERNEL);
                if (!new_xattr->name) {
                        kvfree(new_xattr);
                        return -ENOMEM;
                }
        }

        /*
         * From here on, "xattr" at label out is the entry to free after
         * unlocking: the replaced/removed old entry, the unused new entry
         * on error, or NULL when nothing must be freed.
         */
        spin_lock(&xattrs->lock);
        list_for_each_entry(xattr, &xattrs->head, list) {
                if (!strcmp(name, xattr->name)) {
                        if (flags & XATTR_CREATE) {
                                xattr = new_xattr;
                                err = -EEXIST;
                        } else if (new_xattr) {
                                list_replace(&xattr->list, &new_xattr->list);
                                if (removed_size)
                                        *removed_size = xattr->size;
                        } else {
                                list_del(&xattr->list);
                                if (removed_size)
                                        *removed_size = xattr->size;
                        }
                        goto out;
                }
        }

        /* No existing entry with this name. */
        if (flags & XATTR_REPLACE) {
                xattr = new_xattr;
                err = -ENODATA;
        } else {
                /*
                 * new_xattr is NULL for a remove request; there is nothing
                 * to insert then, and dereferencing it would oops.
                 */
                if (new_xattr)
                        list_add(&new_xattr->list, &xattrs->head);
                /* the loop cursor is not a valid entry here; free nothing */
                xattr = NULL;
        }
out:
        spin_unlock(&xattrs->lock);
        if (xattr) {
                kfree(xattr->name);
                kvfree(xattr);
        }
        return err;
}

/* Returns true when @name begins with XATTR_TRUSTED_PREFIX. */
static bool xattr_is_trusted(const char *name)
{
        return strncmp(name, XATTR_TRUSTED_PREFIX,
                       XATTR_TRUSTED_PREFIX_LEN) == 0;
}

/*
 * Account for one attribute name (including its terminating NUL) in a
 * listxattr-style result.
 *
 * When *@buffer is non-NULL the name is copied there and *@buffer is
 * advanced past it; -ERANGE is returned, with nothing modified, if
 * *@remaining_size is too small.  When *@buffer is NULL nothing is copied.
 * In both successful cases *@remaining_size is reduced by the name's length
 * plus one and 0 is returned.
 */
static int xattr_list_one(char **buffer, ssize_t *remaining_size,
                          const char *name)
{
        const size_t needed = strlen(name) + 1;
        char *dst = *buffer;

        if (dst) {
                if (*remaining_size < needed)
                        return -ERANGE;
                memcpy(dst, name, needed);
                *buffer = dst + needed;
        }

        *remaining_size -= needed;
        return 0;
}

/*
 * xattr LIST operation for in-memory/pseudo filesystems
 *
 * Emits the POSIX ACL attribute names (when CONFIG_FS_POSIX_ACL is set and
 * the inode carries the corresponding ACL) followed by every name stored in
 * @xattrs.  "trusted." names are only reported to callers holding
 * CAP_SYS_ADMIN in the initial user namespace.
 *
 * Returns the number of bytes accounted for, or the error reported by
 * xattr_list_one().
 */
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                          char *buffer, size_t size)
{
        bool privileged = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
        struct simple_xattr *entry;
        ssize_t left = size;
        int rc = 0;

#ifdef CONFIG_FS_POSIX_ACL
        if (IS_POSIXACL(inode)) {
                if (inode->i_acl) {
                        rc = xattr_list_one(&buffer, &left,
                                            XATTR_NAME_POSIX_ACL_ACCESS);
                        if (rc)
                                return rc;
                }
                if (inode->i_default_acl) {
                        rc = xattr_list_one(&buffer, &left,
                                            XATTR_NAME_POSIX_ACL_DEFAULT);
                        if (rc)
                                return rc;
                }
        }
#endif

        spin_lock(&xattrs->lock);
        list_for_each_entry(entry, &xattrs->head, list) {
                /* unprivileged callers never see "trusted." attributes */
                if (privileged || !xattr_is_trusted(entry->name)) {
                        rc = xattr_list_one(&buffer, &left, entry->name);
                        if (rc)
                                break;
                }
        }
        spin_unlock(&xattrs->lock);

        if (rc)
                return rc;
        return size - left;
}

/*
 * Insert @new_xattr at the front of the @xattrs list, holding the list's
 * spinlock for the duration of the insertion.
 */
void simple_xattr_list_add(struct simple_xattrs *xattrs,
                           struct simple_xattr *new_xattr)
{
        struct list_head *head = &xattrs->head;

        spin_lock(&xattrs->lock);
        list_add(&new_xattr->list, head);
        spin_unlock(&xattrs->lock);
}



























































































































































































































































































































































































































































































































































































































































































































































    1 

    1 











































































































    1 


    1 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_HOST_H
#define _SCSI_SCSI_HOST_H

#include <linux/device.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/blk-mq.h>
#include <scsi/scsi.h>

/* Forward declarations of types referenced in this header. */
struct block_device;
struct completion;
struct module;
struct scsi_cmnd;
struct scsi_device;
struct scsi_host_cmd_pool;
struct scsi_target;
struct Scsi_Host;
struct scsi_transport_template;


/* SG_ALL is an alias for SG_CHUNK_SIZE, which is defined elsewhere. */
#define SG_ALL        SG_CHUNK_SIZE

/*
 * Mode values.  NOTE(review): presumably these are the values stored in
 * scsi_host_template.supported_mode (a 2-bit field) -- confirm against the
 * users of these macros.
 */
#define MODE_UNKNOWN 0x00
#define MODE_INITIATOR 0x01
#define MODE_TARGET 0x02

/*
 * Host template: the set of callbacks and default limits through which a
 * low-level driver (LLD) describes itself.  Each callback's comment states
 * whether it is required or optional.
 */
struct scsi_host_template {
        struct module *module;
        const char *name;

        /*
         * The info function will return whatever useful information the
         * developer sees fit.  If not provided, then the name field will
         * be used instead.
         *
         * Status: OPTIONAL
         */
        const char *(* info)(struct Scsi_Host *);

        /*
         * Ioctl interface
         *
         * Status: OPTIONAL
         */
        int (*ioctl)(struct scsi_device *dev, unsigned int cmd,
                     void __user *arg);


#ifdef CONFIG_COMPAT
        /*
         * Compat handler. Handle 32bit ABI.
         * When an unknown ioctl is passed, return -ENOIOCTLCMD.
         *
         * Status: OPTIONAL
         */
        int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd,
                            void __user *arg);
#endif

        /*
         * NOTE(review): undocumented in this header; presumably hooks for
         * setting up and tearing down driver-private per-command data --
         * confirm against the callers.
         */
        int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);
        int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);

        /*
         * The queuecommand function is used to queue up a scsi
         * command block to the LLDD.  When the driver has finished
         * processing the command the done callback is invoked.
         *
         * If queuecommand returns 0, then the driver has accepted the
         * command.  It must also push it to the HBA if the scsi_cmnd
         * flag SCMD_LAST is set, or if the driver does not implement
         * commit_rqs.  The done() function must be called on the command
         * when the driver has finished with it. (you may call done on the
         * command before queuecommand returns, but in this case you
         * *must* return 0 from queuecommand).
         *
         * Queuecommand may also reject the command, in which case it may
         * not touch the command and must not call done() for it.
         *
         * There are two possible rejection returns:
         *
         *   SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but
         *   allow commands to other devices serviced by this host.
         *
         *   SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this
         *   host temporarily.
         *
         * For compatibility, any other non-zero return is treated the
         * same as SCSI_MLQUEUE_HOST_BUSY.
         *
         * NOTE: "temporarily" means either until the next command for
         * this device/host completes, or a period of time determined by
         * I/O pressure in the system if there are no other outstanding
         * commands.
         *
         * STATUS: REQUIRED
         */
        int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);

        /*
         * The commit_rqs function is used to trigger a hardware
         * doorbell after some requests have been queued with
         * queuecommand, when an error is encountered before sending
         * the request with SCMD_LAST set.
         *
         * STATUS: OPTIONAL
         */
        void (*commit_rqs)(struct Scsi_Host *, u16);

        /*
         * This is an error handling strategy routine.  You don't need to
         * define one of these if you don't want to - there is a default
         * routine that is present that should work in most cases.  For those
         * driver authors that have the inclination and ability to write their
         * own strategy routine, this is where it is specified.  Note - the
         * strategy routine is *ALWAYS* run in the context of the kernel eh
         * thread.  Thus you are guaranteed to *NOT* be in an interrupt
         * handler when you execute this, and you are also guaranteed to
         * *NOT* have any other commands being queued while you are in the
         * strategy routine. When you return from this function, operations
         * return to normal.
         *
         * See scsi_error.c scsi_unjam_host for additional comments about
         * what this function should and should not be attempting to do.
         *
         * Status: REQUIRED        (at least one of them)
         */
        int (* eh_abort_handler)(struct scsi_cmnd *);
        int (* eh_device_reset_handler)(struct scsi_cmnd *);
        int (* eh_target_reset_handler)(struct scsi_cmnd *);
        int (* eh_bus_reset_handler)(struct scsi_cmnd *);
        int (* eh_host_reset_handler)(struct scsi_cmnd *);

        /*
         * Before the mid layer attempts to scan for a new device where none
         * currently exists, it will call this entry in your driver.  Should
         * your driver need to allocate any structs or perform any other init
         * items in order to send commands to a currently unused target/lun
         * combo, then this is where you can perform those allocations.  This
         * is specifically so that drivers won't have to perform any kind of
         * "is this a new device" checks in their queuecommand routine,
         * thereby making the hot path a bit quicker.
         *
         * Return values: 0 on success, non-0 on failure
         *
         * Deallocation:  If we didn't find any devices at this ID, you will
         * get an immediate call to slave_destroy().  If we find something
         * here then you will get a call to slave_configure(), then the
         * device will be used for however long it is kept around, then when
         * the device is removed from the system (or possibly at reboot
         * time), you will then get a call to slave_destroy().  This is
         * assuming you implement slave_configure and slave_destroy.
         * However, if you allocate memory and hang it off the device struct,
         * then you must implement the slave_destroy() routine at a minimum
         * in order to avoid leaking memory each time a device is torn down.
         *
         * Status: OPTIONAL
         */
        int (* slave_alloc)(struct scsi_device *);

        /*
         * Once the device has responded to an INQUIRY and we know the
         * device is online, we call into the low level driver with the
         * struct scsi_device *.  If the low level device driver implements
         * this function, it *must* perform the task of setting the queue
         * depth on the device.  All other tasks are optional and depend
         * on what the driver supports and various implementation details.
         *
         * Things currently recommended to be handled at this time include:
         *
         * 1.  Setting the device queue depth.  Proper setting of this is
         *     described in the comments for scsi_change_queue_depth.
         * 2.  Determining if the device supports the various synchronous
         *     negotiation protocols.  The device struct will already have
         *     responded to INQUIRY and the results of the standard items
         *     will have been shoved into the various device flag bits, eg.
         *     device->sdtr will be true if the device supports SDTR messages.
         * 3.  Allocating command structs that the device will need.
         * 4.  Setting the default timeout on this device (if needed).
         * 5.  Anything else the low level driver might want to do on a device
         *     specific setup basis...
         * 6.  Return 0 on success, non-0 on error.  The device will be marked
         *     as offline on error so that no access will occur.  If you return
         *     non-0, your slave_destroy routine will never get called for this
         *     device, so don't leave any loose memory hanging around, clean
         *     up after yourself before returning non-0
         *
         * Status: OPTIONAL
         */
        int (* slave_configure)(struct scsi_device *);

        /*
         * Immediately prior to deallocating the device and after all activity
         * has ceased the mid layer calls this point so that the low level
         * driver may completely detach itself from the scsi device and vice
         * versa.  The low level driver is responsible for freeing any memory
         * it allocated in the slave_alloc or slave_configure calls.
         *
         * Status: OPTIONAL
         */
        void (* slave_destroy)(struct scsi_device *);

        /*
         * Before the mid layer attempts to scan for a new device attached
         * to a target where no target currently exists, it will call this
         * entry in your driver.  Should your driver need to allocate any
         * structs or perform any other init items in order to send commands
         * to a currently unused target, then this is where you can perform
         * those allocations.
         *
         * Return values: 0 on success, non-0 on failure
         *
         * Status: OPTIONAL
         */
        int (* target_alloc)(struct scsi_target *);

        /*
         * Immediately prior to deallocating the target structure, and
         * after all activity to attached scsi devices has ceased, the
         * midlayer calls this point so that the driver may deallocate
         * and terminate any references to the target.
         *
         * Status: OPTIONAL
         */
        void (* target_destroy)(struct scsi_target *);

        /*
         * If a host has the ability to discover targets on its own instead
         * of scanning the entire bus, it can fill in this function and
         * call scsi_scan_host().  This function will be called periodically
         * until it returns 1 with the scsi_host and the elapsed time of
         * the scan in jiffies.
         *
         * Status: OPTIONAL
         */
        int (* scan_finished)(struct Scsi_Host *, unsigned long);

        /*
         * If the host wants to be called before the scan starts, but
         * after the midlayer has set up ready for the scan, it can fill
         * in this function.
         *
         * Status: OPTIONAL
         */
        void (* scan_start)(struct Scsi_Host *);

        /*
         * Fill in this function to allow the queue depth of this host
         * to be changeable (on a per device basis).  Returns either
         * the current queue depth setting (may be different from what
         * was passed in) or an error.  An error should only be
         * returned if the requested depth is legal but the driver was
         * unable to set it.  If the requested depth is illegal, the
         * driver should set and return the closest legal queue depth.
         *
         * Status: OPTIONAL
         */
        int (* change_queue_depth)(struct scsi_device *, int);

        /*
         * This function lets the driver expose the queue mapping
         * to the block layer.
         *
         * Status: OPTIONAL
         */
        int (* map_queues)(struct Scsi_Host *shost);

        /*
         * Check if scatterlists need to be padded for DMA draining.
         *
         * Status: OPTIONAL
         */
        bool (* dma_need_drain)(struct request *rq);

        /*
         * This function determines the BIOS parameters for a given
         * harddisk.  These tend to be numbers that are made up by
         * the host adapter.  Parameters:
         * size, device, list (heads, sectors, cylinders)
         *
         * Status: OPTIONAL
         */
        int (* bios_param)(struct scsi_device *, struct block_device *,
                        sector_t, int []);

        /*
         * This function is called when one or more partitions on the
         * device reach beyond the end of the device.
         *
         * Status: OPTIONAL
         */
        void (*unlock_native_capacity)(struct scsi_device *);

        /*
         * Can be used to export driver statistics and other information to
         * the world outside the kernel, i.e. userspace; it also provides an
         * interface to feed the driver with information.
         *
         * Status: OBSOLETE
         */
        int (*show_info)(struct seq_file *, struct Scsi_Host *);
        int (*write_info)(struct Scsi_Host *, char *, int);

        /*
         * This is an optional routine that allows the transport to become
         * involved when a scsi io timer fires. The return value tells the
         * timer routine how to finish the io timeout handling.
         *
         * Status: OPTIONAL
         */
        enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);

        /*
         * This is an optional routine that allows the transport to initiate
         * an LLD adapter or firmware reset using a sysfs attribute.
         * reset_type is one of the SCSI_*_RESET values defined just below.
         *
         * Return values: 0 on success, -ve value on failure.
         *
         * Status: OPTIONAL
         */

        int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET        1
#define SCSI_FIRMWARE_RESET        2


        /*
         * Name of proc directory
         */
        const char *proc_name;

        /*
         * Used to store the procfs directory if a driver implements the
         * show_info method.
         */
        struct proc_dir_entry *proc_dir;

        /*
         * This determines if we will use a non-interrupt driven
         * or an interrupt driven scheme.  It is set to the maximum number
         * of simultaneous commands a single hw queue in HBA will accept.
         */
        int can_queue;

        /*
         * In many instances, especially where disconnect / reconnect are
         * supported, our host also has an ID on the SCSI bus.  If this is
         * the case, then it must be reserved.  Please set this_id to -1 if
         * your setup is in single initiator mode, and the host lacks an
         * ID.
         */
        int this_id;

        /*
         * This determines the degree to which the host adapter is capable
         * of scatter-gather.
         */
        unsigned short sg_tablesize;
        unsigned short sg_prot_tablesize;

        /*
         * Set this if the host adapter has limitations beside segment count.
         */
        unsigned int max_sectors;

        /*
         * Maximum size in bytes of a single segment.
         */
        unsigned int max_segment_size;

        /*
         * DMA scatter gather segment boundary limit. A segment crossing this
         * boundary will be split in two.
         */
        unsigned long dma_boundary;

        unsigned long virt_boundary_mask;

        /*
         * This specifies "machine infinity" for host templates which don't
         * limit the transfer size.  Note this limit represents an absolute
         * maximum, and may be over the transfer limits allowed for
         * individual devices (e.g. 256 for SCSI-1).
         */
#define SCSI_DEFAULT_MAX_SECTORS        1024

        /*
         * True if this host adapter can make good use of linked commands.
         * This will allow more than one command to be queued to a given
         * unit on a given host.  Set this to the maximum number of command
         * blocks to be provided for each device.  Set this to 1 for one
         * command block per lun, 2 for two, etc.  Do not set this to 0.
         * You should make sure that the host adapter will do the right thing
         * before you try setting this above 1.
         */
        short cmd_per_lun;

        /*
         * present contains a counter indicating how many boards of this
         * type were found when we did the scan.
         */
        unsigned char present;

        /*
         * If the block layer is used to manage tags, this is the tag
         * allocation policy.
         */
        int tag_alloc_policy;

        /*
         * Track QUEUE_FULL events and reduce queue depth on demand.
         */
        unsigned track_queue_depth:1;

        /*
         * This specifies the mode that a LLD supports.
         */
        unsigned supported_mode:2;

        /*
         * True if this host adapter uses unchecked DMA onto an ISA bus.
         */
        unsigned unchecked_isa_dma:1;

        /*
         * True for emulated SCSI host adapters (e.g. ATAPI).
         */
        unsigned emulated:1;

        /*
         * True if the low-level driver performs its own reset-settle delays.
         */
        unsigned skip_settle_delay:1;

        /* True if the controller does not support WRITE SAME */
        unsigned no_write_same:1;

        /* True if the host uses host-wide tagspace */
        unsigned host_tagset:1;

        /*
         * Countdown for host blocking with no commands outstanding.
         */
        unsigned int max_host_blocked;

        /*
         * Default value for the blocking.  If the queue is empty,
         * host_blocked counts down in the request_fn until it restarts
         * host operations as zero is reached.
         *
         * FIXME: This should probably be a value in the template
         */
#define SCSI_DEFAULT_HOST_BLOCKED        7

        /*
         * Pointer to the sysfs class properties for this host, NULL terminated.
         */
        struct device_attribute **shost_attrs;

        /*
         * Pointer to the SCSI device properties for this host, NULL terminated.
         */
        struct device_attribute **sdev_attrs;

        /*
         * Pointer to the SCSI device attribute groups for this host,
         * NULL terminated.
         */
        const struct attribute_group **sdev_groups;

        /*
         * Vendor Identifier associated with the host
         *
         * Note: When specifying vendor_id, be sure to read the
         *   Vendor Type and ID formatting requirements specified in
         *   scsi_netlink.h
         */
        u64 vendor_id;

        /*
         * Additional per-command data allocated for the driver.
         */
        unsigned int cmd_size;
        struct scsi_host_cmd_pool *cmd_pool;

        /* Delay for runtime autosuspend */
        int rpm_autosuspend_delay;
};

/*
 * Temporary #define for host lock push down. Can be removed when all
 * drivers have been updated to take advantage of unlocked
 * queuecommand.
 *
 * DEF_SCSI_QCMD(func_name) expands to the definition of a function
 * func_name(shost, cmd) that takes shost->host_lock with
 * spin_lock_irqsave(), calls func_name##_lck(cmd, cmd->scsi_done), releases
 * the lock again and returns the value produced by func_name##_lck().
 */
#define DEF_SCSI_QCMD(func_name) \
        int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd)        \
        {                                                                \
                unsigned long irq_flags;                                \
                int rc;                                                        \
                spin_lock_irqsave(shost->host_lock, irq_flags);                \
                rc = func_name##_lck (cmd, cmd->scsi_done);                        \
                spin_unlock_irqrestore(shost->host_lock, irq_flags);        \
                return rc;                                                \
        }


/*
 * shost state.  The numbering starts at 1 and is contiguous.
 *
 * If you alter this, you also need to alter scsi_sysfs.c (for the ascii
 * descriptions) and the state model enforcer: scsi_host_set_state()
 */
enum scsi_host_state {
        SHOST_CREATED                = 1,
        SHOST_RUNNING                = 2,
        SHOST_CANCEL                 = 3,
        SHOST_DEL                    = 4,
        SHOST_RECOVERY               = 5,
        SHOST_CANCEL_RECOVERY        = 6,
        SHOST_DEL_RECOVERY           = 7,
};

/*
 * Per-host state.  The structure ends in the flexible array hostdata[],
 * which is the storage handed out by shost_priv().
 */
struct Scsi_Host {
        /*
         * __devices is protected by the host_lock, but you should
         * usually use scsi_device_lookup / shost_for_each_device
         * to access it and don't care about locking yourself.
         * In the rare case of being in irq context you can use
         * their __ prefixed variants with the lock held. NEVER
         * access this list directly from a driver.
         */
        struct list_head        __devices;
        struct list_head        __targets;

        struct list_head        starved_list;

        /*
         * NOTE(review): host_lock presumably points at default_lock unless
         * a driver installs its own lock -- confirm against the allocator.
         */
        spinlock_t                default_lock;
        spinlock_t                *host_lock;

        struct mutex                scan_mutex;/* serialize scanning activity */

        struct list_head        eh_cmd_q;
        struct task_struct    * ehandler;  /* Error recovery thread. */
        struct completion     * eh_action; /* Wait for specific actions on the
                                              host. */
        wait_queue_head_t       host_wait;
        struct scsi_host_template *hostt;
        struct scsi_transport_template *transportt;

        struct kref                tagset_refcnt;
        struct completion        tagset_freed;
        /* Area to keep a shared tag map */
        struct blk_mq_tag_set        tag_set;

        atomic_t host_blocked;

        unsigned int host_failed;           /* commands that failed.
                                              protected by host_lock */
        unsigned int host_eh_scheduled;    /* EH scheduled without command */

        unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */

        /* next two fields are used to bound the time spent in error handling */
        int eh_deadline;
        unsigned long last_reset;


        /*
         * These three parameters can be used to allow for wide scsi,
         * and for host adapters that support multiple buses.
         * The last two should be set to 1 more than the actual max id
         * or lun (e.g. 8 for SCSI parallel systems).
         */
        unsigned int max_channel;
        unsigned int max_id;
        u64 max_lun;

        /*
         * This is a unique identifier that must be assigned so that we
         * have some way of identifying each detected host adapter properly
         * and uniquely.  For hosts that do not support more than one card
         * in the system at one time, this does not need to be set.  It is
         * initialized to 0 in scsi_register.
         */
        unsigned int unique_id;

        /*
         * The maximum length of SCSI commands that this host can accept.
         * Probably 12 for most host adapters, but could be 16 for others,
         * or 260 if the driver supports variable length cdbs.
         * For drivers that don't set this field, a value of 12 is
         * assumed.
         */
        unsigned short max_cmd_len;

        int this_id;
        int can_queue;
        short cmd_per_lun;
        short unsigned int sg_tablesize;
        short unsigned int sg_prot_tablesize;
        unsigned int max_sectors;
        unsigned int max_segment_size;
        unsigned long dma_boundary;
        unsigned long virt_boundary_mask;
        /*
         * In scsi-mq mode, the number of hardware queues supported by the LLD.
         *
         * Note: it is assumed that each hardware queue has a queue depth of
         * can_queue. In other words, the total queue depth per host
         * is nr_hw_queues * can_queue. However, when host_tagset is set,
         * the total queue depth is can_queue.
         */
        unsigned nr_hw_queues;
        unsigned active_mode:2;
        unsigned unchecked_isa_dma:1;

        /*
         * Host has requested that no further requests come through for the
         * time being.
         */
        unsigned host_self_blocked:1;

        /*
         * Host uses correct SCSI ordering not PC ordering. The bit is
         * set for the minority of drivers whose authors actually read
         * the spec ;).
         */
        unsigned reverse_ordering:1;

        /* Task mgmt function in progress */
        unsigned tmf_in_progress:1;

        /* Asynchronous scan in progress */
        unsigned async_scan:1;

        /* Don't resume host in EH */
        unsigned eh_noresume:1;

        /* The controller does not support WRITE SAME */
        unsigned no_write_same:1;

        /* True if the host uses host-wide tagspace */
        unsigned host_tagset:1;

        /* Host responded with short (<36 bytes) INQUIRY result */
        unsigned short_inquiry:1;

        /* The transport requires the LUN bits NOT to be stored in CDB[1] */
        unsigned no_scsi2_lun_in_cdb:1;

        /*
         * Optional work queue to be utilized by the transport
         */
        char work_q_name[20];
        struct workqueue_struct *work_q;

        /*
         * Task management function work queue
         */
        struct workqueue_struct *tmf_work_q;

        /*
         * Value host_blocked counts down from
         */
        unsigned int max_host_blocked;

        /*
         * Protection Information: prot_capabilities is a mask of
         * enum scsi_host_prot_capabilities bits, prot_guard_type is read
         * and written through scsi_host_get_guard()/scsi_host_set_guard().
         */
        unsigned int prot_capabilities;
        unsigned char prot_guard_type;

        /* legacy fields */
        unsigned long base;
        unsigned long io_port;
        unsigned char n_io_port;
        unsigned char dma_channel;
        unsigned int  irq;


        enum scsi_host_state shost_state;

        /* ldm bits */
        struct device                shost_gendev, shost_dev;

        /*
         * Points to the transport data (if any) which is allocated
         * separately
         */
        void *shost_data;

        /*
         * Points to the physical bus device we'd use to do DMA
         * Needed just in case we have virtual hosts.
         */
        struct device *dma_dev;

        /*
         * We should ensure that this is aligned, both for better performance
         * and also because some compilers (m68k) don't automatically force
         * alignment to a long boundary.
         */
        unsigned long hostdata[]  /* Used for storage of host specific stuff */
                __attribute__ ((aligned (sizeof(unsigned long))));
};

/*
 * Map a pointer to the shost_dev member embedded in a struct Scsi_Host
 * back to the containing struct Scsi_Host.
 */
#define                class_to_shost(d)        \
        container_of(d, struct Scsi_Host, shost_dev)

/*
 * dev_printk() wrapper that logs against the host's shost_gendev device.
 * @prefix is passed through as dev_printk()'s first argument, @fmt and the
 * variadic arguments as its format and parameters.
 */
#define shost_printk(prefix, shost, fmt, a...)        \
        dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a)

/*
 * Return the address of the host's trailing hostdata[] storage as an
 * untyped pointer.
 */
static inline void *shost_priv(struct Scsi_Host *shost)
{
        void *priv = (void *)shost->hostdata;

        return priv;
}

/* Predicate used by dev_to_shost(); defined outside this header. */
int scsi_is_host_device(const struct device *);

/*
 * Walk up the ->parent chain starting at @dev until a device accepted by
 * scsi_is_host_device() is found, and return the Scsi_Host that embeds it
 * as shost_gendev.  Returns NULL when the chain ends without a match.
 */
static inline struct Scsi_Host *dev_to_shost(struct device *dev)
{
        for (; !scsi_is_host_device(dev); dev = dev->parent) {
                /* Reached the top of the hierarchy without a host. */
                if (!dev->parent)
                        return NULL;
        }

        return container_of(dev, struct Scsi_Host, shost_gendev);
}

/*
 * Nonzero when the host is in one of the three *_RECOVERY states or has a
 * task management function in progress; zero otherwise.
 */
static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
{
        switch (shost->shost_state) {
        case SHOST_RECOVERY:
        case SHOST_CANCEL_RECOVERY:
        case SHOST_DEL_RECOVERY:
                return 1;
        default:
                /* tmf_in_progress is a one-bit field, so this is 0 or 1. */
                return shost->tmf_in_progress;
        }
}

/*
 * Host-level entry points.  Only the prototypes live here; the
 * definitions are outside this header.
 */
extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
extern void scsi_flush_work(struct Scsi_Host *);

extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
/* Result must be checked by the caller (__must_check). */
extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
                                               struct device *,
                                               struct device *);
extern void scsi_scan_host(struct Scsi_Host *);
extern void scsi_rescan_device(struct device *);
extern void scsi_remove_host(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *);
extern int scsi_host_busy(struct Scsi_Host *shost);
extern void scsi_host_put(struct Scsi_Host *t);
extern struct Scsi_Host *scsi_host_lookup(unsigned int hostnum);
extern const char *scsi_host_state_name(enum scsi_host_state);
extern void scsi_host_complete_all_commands(struct Scsi_Host *shost,
                                            int status);

/*
 * Convenience form of scsi_add_host_with_dma() that passes @dev for both
 * of its device arguments.  The result must be checked by the caller.
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
                                             struct device *dev)
{
        struct device *dma_dev = dev;

        return scsi_add_host_with_dma(host, dev, dma_dev);
}

static inline struct device *scsi_get_device(struct Scsi_Host *shost)
{
        return shost->shost_gendev.parent;
}

/**
 * scsi_host_scan_allowed - Is scanning of this host allowed
 * @shost:        Pointer to Scsi_Host.
 *
 * Return: 1 when the host state is SHOST_RUNNING or SHOST_RECOVERY,
 * 0 for every other state.
 **/
static inline int scsi_host_scan_allowed(struct Scsi_Host *shost)
{
        switch (shost->shost_state) {
        case SHOST_RUNNING:
        case SHOST_RECOVERY:
                return 1;
        default:
                return 0;
        }
}

/* Request blocking and busy-command iteration; prototypes only. */
extern void scsi_unblock_requests(struct Scsi_Host *);
extern void scsi_block_requests(struct Scsi_Host *);
extern int scsi_host_block(struct Scsi_Host *shost);
extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state);

/*
 * @fn is the per-command callback and @priv is forwarded to it as its
 * second argument.
 * NOTE(review): the meaning of @fn's bool arguments/return is not visible
 * in this header -- check the definition before relying on it.
 */
void scsi_host_busy_iter(struct Scsi_Host *,
                         bool (*fn)(struct scsi_cmnd *, void *, bool), void *priv);

/* Forward declaration only; the type is not used further in this header. */
struct class_container;

/*
 * These two functions are used to allocate and free a pseudo device
 * which will connect to the host adapter itself rather than any
 * physical device.  You must deallocate it when you are done with it.
 * This pseudo-device isn't a real physical device and won't be available
 * from any high-level drivers.
 */
extern void scsi_free_host_dev(struct scsi_device *);
extern struct scsi_device *scsi_get_host_dev(struct Scsi_Host *);

/*
 * DIF defines the exchange of protection information between
 * initiator and SBC block device.
 *
 * DIX defines the exchange of protection information between OS and
 * initiator.
 *
 * Each enumerator is a distinct bit, so values can be OR-ed into a mask.
 * The DIF bits occupy bits 0-2 and the DIX bits occupy bits 3-6.
 */
enum scsi_host_prot_capabilities {
        SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */
        SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */
        SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */

        SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */
        SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */
        SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */
        SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */
};

/*
 * SCSI hosts which support the Data Integrity Extensions must
 * indicate their capabilities by setting the prot_capabilities using
 * this call.  @mask replaces the stored value outright.
 */
static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask)
{
        unsigned int *caps = &shost->prot_capabilities;

        *caps = mask;
}

/* Return the host's stored protection capability mask. */
static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
{
        const unsigned int caps = shost->prot_capabilities;

        return caps;
}

/*
 * Return 1 when the capability mask is numerically at least
 * SHOST_DIX_TYPE0_PROTECTION (the lowest DIX bit), 0 otherwise.
 */
static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
{
        if (shost->prot_capabilities < SHOST_DIX_TYPE0_PROTECTION)
                return 0;

        return 1;
}

/*
 * Return @target_type when the host advertises the DIF capability bit for
 * that type (1, 2 or 3), otherwise 0.  Type 0 and any type above 3 always
 * yield 0.
 */
static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
{
        unsigned int dif_bit;

        switch (target_type) {
        case 1:
                dif_bit = SHOST_DIF_TYPE1_PROTECTION;
                break;
        case 2:
                dif_bit = SHOST_DIF_TYPE2_PROTECTION;
                break;
        case 3:
                dif_bit = SHOST_DIF_TYPE3_PROTECTION;
                break;
        default:
                /* Type 0 has no capability bit; larger types are unknown. */
                return 0;
        }

        if (shost->prot_capabilities & dif_bit)
                return target_type;

        return 0;
}

/*
 * With CONFIG_BLK_DEV_INTEGRITY defined, return the host's capability mask
 * ANDed with the DIX bit for @target_type (0-3); the result is that bit or
 * 0.  Types above 3, and every call when the option is not defined,
 * return 0.
 */
static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type)
{
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        unsigned int dix_bit;

        switch (target_type) {
        case 0:
                dix_bit = SHOST_DIX_TYPE0_PROTECTION;
                break;
        case 1:
                dix_bit = SHOST_DIX_TYPE1_PROTECTION;
                break;
        case 2:
                dix_bit = SHOST_DIX_TYPE2_PROTECTION;
                break;
        case 3:
                dix_bit = SHOST_DIX_TYPE3_PROTECTION;
                break;
        default:
                return 0;
        }

        return shost->prot_capabilities & dix_bit;
#else
        return 0;
#endif
}

/*
 * All DIX-capable initiators must support the T10-mandated CRC
 * checksum.  Controllers can optionally implement the IP checksum
 * scheme which has much lower impact on system performance.  Note
 * that the main rationale for the checksum is to match integrity
 * metadata with data.  Detecting bit errors is a job for ECC memory
 * and buses.
 */

/* Guard checksum types; each enumerator is a distinct bit. */
enum scsi_host_guard_type {
        SHOST_DIX_GUARD_CRC = 1 << 0,
        SHOST_DIX_GUARD_IP  = 1 << 1,
};

/* Store @type as the host's guard type, replacing the previous value. */
static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type)
{
        unsigned char *guard = &shost->prot_guard_type;

        *guard = type;
}

/* Return the host's stored guard type. */
static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost)
{
        const unsigned char guard = shost->prot_guard_type;

        return guard;
}

/* Prototype only; this is the state model enforcer for enum scsi_host_state. */
extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state);

#endif /* _SCSI_SCSI_HOST_H */


























    1 




    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KDEV_T_H
#define _LINUX_KDEV_T_H

#include <uapi/linux/kdev_t.h>

/*
 * A device number packs the minor into the low MINORBITS bits and the
 * major into the bits above them.
 */
#define MINORBITS        20
#define MINORMASK        ((1U << MINORBITS) - 1)

/* Extract the major / minor part; both results are unsigned int. */
#define MAJOR(dev)        ((unsigned int) ((dev) >> MINORBITS))
#define MINOR(dev)        ((unsigned int) ((dev) & MINORMASK))
/*
 * Combine a major and a minor.  The arguments are neither cast nor masked,
 * so the result has whatever type the operands promote to, and a minor
 * wider than MINORBITS bits overlaps the major.
 */
#define MKDEV(ma,mi)        (((ma) << MINORBITS) | (mi))

/*
 * Write "major:minor" followed by a newline into @buffer and evaluate to
 * sprintf()'s return value.  sprintf() applies no bound, so @buffer must
 * be large enough for the formatted text.
 */
#define print_dev_t(buffer, dev)                                        \
        sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev))

/*
 * Write "major:minor" (no newline) into @buffer and evaluate to @buffer.
 * This is a statement expression; @buffer is evaluated twice and, as with
 * print_dev_t(), the write is unbounded.
 */
#define format_dev_t(buffer, dev)                                        \
        ({                                                                \
                sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev));        \
                buffer;                                                        \
        })

/*
 * Acceptable for old filesystems: true only when both the major and the
 * minor are below 256, i.e. each fits in 8 bits.
 */
static __always_inline bool old_valid_dev(dev_t dev)
{
        if (MAJOR(dev) >= 256)
                return false;

        return MINOR(dev) < 256;
}

/*
 * Pack a device number into 16 bits: major shifted left by 8, OR-ed with
 * the minor.  The result is truncated to u16 on return.
 */
static __always_inline u16 old_encode_dev(dev_t dev)
{
        unsigned int major = MAJOR(dev);
        unsigned int minor = MINOR(dev);

        return (major << 8) | minor;
}

/*
 * Unpack the 16-bit form: the high byte of @val is the major and the low
 * byte is the minor.
 */
static __always_inline dev_t old_decode_dev(u16 val)
{
        int major = (val >> 8) & 255;
        int minor = val & 255;

        return MKDEV(major, minor);
}

/*
 * Pack a device number into 32 bits: the low 8 bits of the minor stay in
 * bits 0-7, the major is shifted up by 8, and the remaining minor bits
 * are shifted up by a further 12.
 */
static __always_inline u32 new_encode_dev(dev_t dev)
{
        unsigned major = MAJOR(dev);
        unsigned minor = MINOR(dev);
        unsigned minor_low = minor & 0xff;
        unsigned minor_high = (minor & ~0xff) << 12;

        return minor_low | (major << 8) | minor_high;
}

/*
 * Unpack the 32-bit form: bits 8-19 are the major, bits 0-7 the low byte
 * of the minor, and bits 20-31 supply minor bits 8-19.
 */
static __always_inline dev_t new_decode_dev(u32 dev)
{
        unsigned minor_low = dev & 0xff;
        unsigned minor_high = (dev >> 12) & 0xfff00;
        unsigned major = (dev & 0xfff00) >> 8;

        return MKDEV(major, minor_low | minor_high);
}

/* 64-bit encoding: the new_encode_dev() value widened to u64. */
static __always_inline u64 huge_encode_dev(dev_t dev)
{
        u64 encoded = new_encode_dev(dev);

        return encoded;
}

/*
 * Decode the 64-bit form by handing it to new_decode_dev(); only the low
 * 32 bits survive the conversion to that function's u32 parameter.
 */
static __always_inline dev_t huge_decode_dev(u64 dev)
{
        u32 low = dev;

        return new_decode_dev(low);
}

/*
 * Return 1 when the major fits in 14 bits and the minor fits in 18 bits,
 * 0 otherwise.
 */
static __always_inline int sysv_valid_dev(dev_t dev)
{
        if (MAJOR(dev) >= (1<<14))
                return 0;

        return MINOR(dev) < (1<<18);
}

/* Pack as minor in the low 18 bits with the major shifted up by 18. */
static __always_inline u32 sysv_encode_dev(dev_t dev)
{
        unsigned int minor = MINOR(dev);
        unsigned int major = MAJOR(dev);

        return minor | (major << 18);
}

/* Extract the 14-bit major stored above bit 18 of the sysv encoding. */
static __always_inline unsigned sysv_major(u32 dev)
{
        dev >>= 18;

        return dev & 0x3fff;
}

/* Extract the 18-bit minor from the low bits of the sysv encoding. */
static __always_inline unsigned sysv_minor(u32 dev)
{
        const u32 minor_mask = 0x3ffff;

        return dev & minor_mask;
}

#endif



























































































































































































    1 









    1 




    1 













































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_CMND_H
#define _SCSI_SCSI_CMND_H

#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
#include <linux/t10-pi.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/scatterlist.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_request.h>

struct Scsi_Host;
struct scsi_driver;

/*
 * MAX_COMMAND_SIZE is:
 * The longest fixed-length SCSI CDB as per the SCSI standard.
 * Fixed-length means commands whose size can be determined from their
 * opcode, where the CDB does not carry a length specifier (unlike
 * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly
 * true, as the SCSI standard also defines extended commands and
 * vendor specific commands that can be bigger than 16 bytes. The kernel
 * will support these using the same infrastructure used for VARLEN CDBs.
 * So in effect MAX_COMMAND_SIZE means the maximum command size scsi-ml
 * supports without the ULD specifying a cmd_len.
 *
 * The preprocessor check below rejects any value above BLK_MAX_CDB.
 */
#define MAX_COMMAND_SIZE 16
#if (MAX_COMMAND_SIZE > BLK_MAX_CDB)
# error MAX_COMMAND_SIZE can not be bigger than BLK_MAX_CDB
#endif

/*
 * A scatter/gather table plus a length.  scsi_sglist(), scsi_sg_count()
 * and scsi_bufflen() read these fields for a command's sdb.
 */
struct scsi_data_buffer {
        struct sg_table table;
        unsigned length;        /* NOTE(review): presumably in bytes -- confirm */
};

/* embedded in scsi_cmnd (as the SCp scratchpad member) */
struct scsi_pointer {
        char *ptr;                /* data pointer */
        int this_residual;        /* left in this buffer */
        struct scatterlist *buffer;        /* which buffer */
        int buffers_residual;        /* how many buffers left */

        dma_addr_t dma_handle;

        /* The remaining fields are volatile-qualified. */
        volatile int Status;
        volatile int Message;
        volatile int have_data_in;
        volatile int sent_command;
        volatile int phase;
};

/* for scmd->flags: each flag is a distinct bit */
#define SCMD_TAGGED                (1 << 0)
#define SCMD_UNCHECKED_ISA_DMA        (1 << 1)
#define SCMD_INITIALIZED        (1 << 2)
#define SCMD_LAST                (1 << 3)
/* flags preserved across unprep / reprep */
#define SCMD_PRESERVED_FLAGS        (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED)

/*
 * for scmd->state
 * NOTE(review): unlike the flags above these are small integers, which
 * suggests they are bit numbers within state rather than masks -- confirm
 * against the users of scmd->state.
 */
#define SCMD_STATE_COMPLETE        0
#define SCMD_STATE_INFLIGHT        1

/*
 * One SCSI command.  scsi_cmd_priv() returns the address immediately
 * following this structure as the driver-private area.
 */
struct scsi_cmnd {
        struct scsi_request req;
        struct scsi_device *device;
        struct list_head eh_entry; /* entry for the host eh_cmd_q */
        struct delayed_work abort_work;

        struct rcu_head rcu;

        int eh_eflags;                /* Used by error handler */

        /*
         * This is set to jiffies as it was when the command was first
         * allocated.  It is used to time how long the command has
         * been outstanding
         */
        unsigned long jiffies_at_alloc;

        int retries;
        int allowed;

        /* See enum scsi_prot_operations, scsi_prot_target_type, scsi_prot_flags */
        unsigned char prot_op;
        unsigned char prot_type;
        unsigned char prot_flags;

        unsigned short cmd_len;
        enum dma_data_direction sc_data_direction;

        /* These elements define the operation we are about to perform */
        unsigned char *cmnd;


        /* These elements define the operation we ultimately want to perform */
        struct scsi_data_buffer sdb;
        struct scsi_data_buffer *prot_sdb;        /* may be NULL; see scsi_prot_sglist() */

        unsigned underflow;        /* Return error if less than
                                   this amount is transferred */

        unsigned transfersize;        /* How much we are guaranteed to
                                   transfer with each SCSI transfer
                                   (i.e., between disconnect /
                                   reconnects).  Probably == sector
                                   size */

        struct request *request;        /* The command we are
                                              working on */

        unsigned char *sense_buffer;
                                /* obtained by REQUEST SENSE when
                                 * CHECK CONDITION is received on original
                                 * command (auto-sense). Length must be
                                 * SCSI_SENSE_BUFFERSIZE bytes. */

        /* Low-level done function - can be used by low-level driver to point
         *        to completion function.  Not used by mid/upper level code. */
        void (*scsi_done) (struct scsi_cmnd *);

        /*
         * The following fields can be written to by the host specific code.
         * Everything else should be left alone.
         */
        struct scsi_pointer SCp;        /* Scratchpad used by some host adapters */

        unsigned char *host_scribble;        /* The host adapter is allowed to
                                         * call scsi_malloc and get some memory
                                         * and hang it here.  The host adapter
                                         * is also expected to call scsi_free
                                         * to release this memory.  (The memory
                                         * obtained by scsi_malloc is guaranteed
                                         * to be at an address < 16Mb). */

        int result;                /* Status code from lower level driver */
        int flags;                /* Command flags (SCMD_* bits) */
        unsigned long state;        /* Command completion state */

        unsigned char tag;        /* SCSI-II queued command tag */
        unsigned int extra_len;        /* length of alignment and padding */
};

/*
 * Type-checked form of blk_mq_rq_from_pdu(): the argument must be a
 * struct scsi_cmnd pointer.
 */
static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd)
{
        struct request *rq = blk_mq_rq_from_pdu(scmd);

        return rq;
}

/*
 * Return the driver private allocation behind the command, i.e. the
 * address directly after the struct scsi_cmnd.
 * Only works if cmd_size is set in the host template.
 */
static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd)
{
        struct scsi_cmnd *past_cmd = cmd + 1;

        return past_cmd;
}

/*
 * Return the scsi_driver pointer stored at the start of the private data
 * of the disk attached to the command's request.
 * Make sure not to use it with passthrough commands.
 */
static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
{
        struct scsi_driver **drvp;

        drvp = (struct scsi_driver **)cmd->request->rq_disk->private_data;

        return *drvp;
}

/* Command-level helpers; prototypes only. */
extern void scsi_finish_command(struct scsi_cmnd *cmd);

/* Map / unmap pair for a scatterlist; @offset and @len are in/out pointers. */
extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
                                 size_t *offset, size_t *len);
extern void scsi_kunmap_atomic_sg(void *virt);

/* Allocate / free the scatter/gather tables of a command. */
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd);
void scsi_free_sgtables(struct scsi_cmnd *cmd);

/*
 * DMA mapping of a command's data.  Without CONFIG_SCSI_DMA the stubs
 * below are used instead: scsi_dma_map() fails with -ENOSYS and
 * scsi_dma_unmap() does nothing.
 */
#ifdef CONFIG_SCSI_DMA
extern int scsi_dma_map(struct scsi_cmnd *cmd);
extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
#else /* !CONFIG_SCSI_DMA */
static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; }
static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { }
#endif /* !CONFIG_SCSI_DMA */

static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
{
        return cmd->sdb.table.nents;
}

static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
{
        return cmd->sdb.table.sgl;
}

static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
{
        return cmd->sdb.length;
}

static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid)
{
        cmd->req.resid_len = resid;
}

static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd)
{
        return cmd->req.resid_len;
}

/*
 * Iterate over the command's data scatterlist: for_each_sg() started at
 * scsi_sglist(cmd), with @sg as the cursor, @nseg as the entry count and
 * @__i as the loop index.
 */
#define scsi_for_each_sg(cmd, sg, nseg, __i)                        \
        for_each_sg(scsi_sglist(cmd), sg, nseg, __i)

/*
 * Copy @buflen bytes from the linear buffer @buf into the command's data
 * scatterlist via sg_copy_from_buffer() and return its result as an int.
 */
static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd,
                                           const void *buf, int buflen)
{
        struct scatterlist *sgl = scsi_sglist(cmd);
        unsigned nents = scsi_sg_count(cmd);

        return sg_copy_from_buffer(sgl, nents, buf, buflen);
}

/*
 * Copy up to @buflen bytes from the command's data scatterlist into the
 * linear buffer @buf via sg_copy_to_buffer() and return its result as an
 * int.
 */
static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd,
                                         void *buf, int buflen)
{
        struct scatterlist *sgl = scsi_sglist(cmd);
        unsigned nents = scsi_sg_count(cmd);

        return sg_copy_to_buffer(sgl, nents, buf, buflen);
}

/*
 * The operations below are hints that tell the controller driver how
 * to handle I/Os with DIF or similar types of protection information.
 * Values are consecutive, starting at 0 for SCSI_PROT_NORMAL.
 */
enum scsi_prot_operations {
        /* Normal I/O */
        SCSI_PROT_NORMAL = 0,

        /* OS-HBA: Protected, HBA-Target: Unprotected */
        SCSI_PROT_READ_INSERT,
        SCSI_PROT_WRITE_STRIP,

        /* OS-HBA: Unprotected, HBA-Target: Protected */
        SCSI_PROT_READ_STRIP,
        SCSI_PROT_WRITE_INSERT,

        /* OS-HBA: Protected, HBA-Target: Protected */
        SCSI_PROT_READ_PASS,
        SCSI_PROT_WRITE_PASS,
};

/* Record @op as the command's protection operation. */
static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op)
{
        unsigned char *slot = &scmd->prot_op;

        *slot = op;
}

/* Return the command's recorded protection operation. */
static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd)
{
        const unsigned char op = scmd->prot_op;

        return op;
}

/*
 * Bit flags for scmd->prot_flags; each enumerator is a distinct bit.
 * scsi_transfer_length() tests SCSI_PROT_TRANSFER_PI.
 */
enum scsi_prot_flags {
        SCSI_PROT_TRANSFER_PI                = 1 << 0,
        SCSI_PROT_GUARD_CHECK                = 1 << 1,
        SCSI_PROT_REF_CHECK                = 1 << 2,
        SCSI_PROT_REF_INCREMENT                = 1 << 3,
        SCSI_PROT_IP_CHECKSUM                = 1 << 4,
};

/*
 * The controller usually does not know anything about the target it
 * is communicating with.  However, when DIX is enabled the controller
 * must know the target type so it can verify the protection
 * information passed along with the I/O.
 */
/* Target DIF types; values are consecutive, 0 through 3. */
enum scsi_prot_target_type {
        SCSI_PROT_DIF_TYPE0 = 0,
        SCSI_PROT_DIF_TYPE1,
        SCSI_PROT_DIF_TYPE2,
        SCSI_PROT_DIF_TYPE3,
};

/* Record @type as the command's protection target type. */
static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type)
{
        unsigned char *slot = &scmd->prot_type;

        *slot = type;
}

/* Return the command's recorded protection target type. */
static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd)
{
        const unsigned char type = scmd->prot_type;

        return type;
}

/* Return blk_rq_pos() of the request attached to the command. */
static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
{
        struct request *rq = scmd->request;

        return blk_rq_pos(rq);
}

static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
{
        return scmd->device->sector_size;
}

static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
{
        return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
}

static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd)
{
        return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL;
}

/* Return the command's protection buffer descriptor (may be NULL). */
static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd)
{
        struct scsi_data_buffer *prot = cmd->prot_sdb;

        return prot;
}

/*
 * Iterate over the command's protection scatterlist: for_each_sg() started
 * at scsi_prot_sglist(cmd), which is NULL when the command has no
 * protection buffer.
 */
#define scsi_for_each_prot_sg(cmd, sg, nseg, __i)                \
        for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i)

/*
 * Replace bits 8-15 (the message byte) of cmd->result with @status and
 * leave the other three bytes untouched.
 *
 * @status is a plain char, whose signedness is implementation-defined.
 * It is converted through unsigned char before shifting so that a value
 * with the top bit set cannot sign-extend into bits 16-31, and so that no
 * negative value is ever left-shifted.
 */
static inline void set_msg_byte(struct scsi_cmnd *cmd, char status)
{
        unsigned int byte = (unsigned char)status;

        cmd->result = (cmd->result & 0xffff00ff) | (byte << 8);
}

/*
 * Replace bits 16-23 (the host byte) of cmd->result with @status and
 * leave the other three bytes untouched.
 *
 * @status is a plain char, whose signedness is implementation-defined.
 * It is converted through unsigned char before shifting so that a value
 * with the top bit set cannot sign-extend into bits 24-31, and so that no
 * negative value is ever left-shifted.
 */
static inline void set_host_byte(struct scsi_cmnd *cmd, char status)
{
        unsigned int byte = (unsigned char)status;

        cmd->result = (cmd->result & 0xff00ffff) | (byte << 16);
}

/*
 * Replace bits 24-31 (the driver byte) of cmd->result with @status and
 * leave the lower three bytes untouched.
 *
 * @status is a plain char, whose signedness is implementation-defined.
 * It is converted through unsigned char and shifted as an unsigned int so
 * that the shift into the top byte never operates on a negative or
 * overflowing signed value.
 */
static inline void set_driver_byte(struct scsi_cmnd *cmd, char status)
{
        unsigned int byte = (unsigned char)status;

        cmd->result = (cmd->result & 0x00ffffff) | (byte << 24);
}

/*
 * Return the data length of the command.  When SCSI_PROT_TRANSFER_PI is
 * set in prot_flags, 8 more bytes are added for every whole protection
 * interval contained in that length.
 */
static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
{
        const unsigned int interval = scsi_prot_interval(scmd);
        unsigned int total = scmd->sdb.length;

        if (!(scmd->prot_flags & SCSI_PROT_TRANSFER_PI))
                return total;

        /* total >> ilog2(interval) is the number of whole intervals. */
        return total + (total >> ilog2(interval)) * 8;
}

#endif /* _SCSI_SCSI_CMND_H */








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#ifndef _LINUX_GENERIC_RADIX_TREE_H
#define _LINUX_GENERIC_RADIX_TREE_H

/**
 * DOC: Generic radix trees/sparse arrays
 *
 * Very simple and minimalistic, supporting arbitrary size entries up to
 * PAGE_SIZE.
 *
 * A genradix is defined with the type it will store, like so:
 *
 * static GENRADIX(struct foo) foo_genradix;
 *
 * The main operations are:
 *
 * - genradix_init(radix) - initialize an empty genradix
 *
 * - genradix_free(radix) - free all memory owned by the genradix and
 *   reinitialize it
 *
 * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
 *   NULL if that entry does not exist
 *
 * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
 *   allocating it if necessary
 *
 * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
 *
 * The radix tree allocates one page of entries at a time, so entries may exist
 * that were never explicitly allocated - they will be initialized to all
 * zeroes.
 *
 * Internally, a genradix is just a radix tree of pages, and indexing works in
 * terms of byte offsets. The wrappers in this header file use sizeof on the
 * type the radix contains to calculate a byte offset from the index - see
 * __idx_to_offset.
 */

#include <asm/page.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>

/* Opaque in this header; only pointers to it are used here. */
struct genradix_root;

/* Untyped core of a genradix: just the root pointer (NULL when empty). */
struct __genradix {
        struct genradix_root                *root;
};

/*
 * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
 */

/*
 * Initializer for a GENRADIX() object: an empty tree whose root pointer
 * is NULL.
 */
#define __GENRADIX_INITIALIZER                                        \
        {                                                        \
                .tree = {                                        \
                        .root = NULL,                                \
                }                                                \
        }

/*
 * We use a 0 size array to stash the type we're storing without taking any
 * space at runtime - then the various accessor macros can use typeof() to get
 * to it for casts/sizeof - we also force the alignment so that storing a type
 * with a ridiculous alignment doesn't blow up the alignment or size of the
 * genradix.
 *
 * GENRADIX(_type) expands to an anonymous struct type, so it is used in
 * place of a type name in a declaration.
 */

#define GENRADIX(_type)                                                \
struct {                                                        \
        struct __genradix        tree;                                \
        _type                        type[0] __aligned(1);                \
}

/* Define a genradix variable @_name holding @_type, initialized empty. */
#define DEFINE_GENRADIX(_name, _type)                                \
        GENRADIX(_type) _name = __GENRADIX_INITIALIZER

/**
 * genradix_init - initialize a genradix
 * @_radix:        genradix to initialize
 *
 * Assigns the empty initializer to *@_radix; nothing that was previously
 * referenced by it is released here.
 *
 * Does not fail
 */
#define genradix_init(_radix)                                        \
do {                                                                \
        *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;        \
} while (0)

/*
 * Untyped worker that genradix_free() forwards to; it takes the embedded
 * struct __genradix. Only the declaration appears here.
 */
void __genradix_free(struct __genradix *);

/**
 * genradix_free - free all memory owned by a genradix
 * @_radix: the genradix to free
 *
 * After freeing, @_radix will be reinitialized and empty
 */
#define genradix_free(_radix)        __genradix_free(&(_radix)->tree)

/*
 * Translate an element index into a byte offset into the radix tree.
 *
 * An obj_size larger than PAGE_SIZE is rejected: at build time when the
 * compiler sees obj_size as a constant, via BUG_ON() otherwise.
 *
 * For a power-of-two obj_size the offset is simply idx * obj_size. Otherwise
 * each page holds PAGE_SIZE / obj_size whole objects, so the offset is the
 * start of the page the object lives in plus its position within that page;
 * objects never straddle a page boundary.
 */
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
        size_t per_page, page_nr, slot;

        if (__builtin_constant_p(obj_size))
                BUILD_BUG_ON(obj_size > PAGE_SIZE);
        else
                BUG_ON(obj_size > PAGE_SIZE);

        /* Fast path: objects pack back to back */
        if (is_power_of_2(obj_size))
                return idx * obj_size;

        per_page = PAGE_SIZE / obj_size;
        page_nr = idx / per_page;
        slot = idx % per_page;

        return page_nr * PAGE_SIZE + slot * obj_size;
}

#define __genradix_cast(_radix)                (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix)        sizeof((_radix)->type[0])
#define __genradix_idx_to_offset(_radix, _idx)                        \
        __idx_to_offset(_idx, __genradix_obj_size(_radix))

/*
 * Untyped lookup used by genradix_ptr(); the size_t argument it is given is
 * the byte offset produced by __genradix_idx_to_offset().
 */
void *__genradix_ptr(struct __genradix *, size_t);

/**
 * genradix_ptr - get a pointer to a genradix entry
 * @_radix:        genradix to access
 * @_idx:        index to fetch
 *
 * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
 * The result is cast to a pointer to the genradix's element type.
 */
#define genradix_ptr(_radix, _idx)                                \
        (__genradix_cast(_radix)                                \
         __genradix_ptr(&(_radix)->tree,                        \
                        __genradix_idx_to_offset(_radix, _idx)))

/*
 * Untyped worker used by genradix_ptr_alloc(); it is given the byte offset
 * produced by __genradix_idx_to_offset() and the caller's gfp mask.
 */
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);

/**
 * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
 *                        if necessary
 * @_radix:        genradix to access
 * @_idx:        index to fetch
 * @_gfp:        gfp mask
 *
 * Returns a pointer to entry at @_idx, or NULL on allocation failure.
 * The result is cast to a pointer to the genradix's element type.
 */
#define genradix_ptr_alloc(_radix, _idx, _gfp)                        \
        (__genradix_cast(_radix)                                \
         __genradix_ptr_alloc(&(_radix)->tree,                        \
                        __genradix_idx_to_offset(_radix, _idx),        \
                        _gfp))

/*
 * Iteration cursor. @pos is the index of the current entry and @offset is the
 * byte offset that index maps to; genradix_iter_init() sets both and
 * __genradix_iter_advance() moves them forward together.
 */
struct genradix_iter {
        size_t                        offset;
        size_t                        pos;
};

/**
 * genradix_iter_init - initialize a genradix_iter
 * @_radix:        genradix that will be iterated over
 * @_idx:        index to start iterating from
 *
 * Evaluates to a struct genradix_iter whose pos is @_idx and whose offset is
 * the matching byte offset. @_idx is expanded twice by this macro, so it
 * should be free of side effects.
 */
#define genradix_iter_init(_radix, _idx)                        \
        ((struct genradix_iter) {                                \
                .pos        = (_idx),                                \
                .offset        = __genradix_idx_to_offset((_radix), (_idx)),\
        })

/*
 * Untyped worker used by genradix_iter_peek(); the size_t argument it is given
 * is the number of objects that fit in one page.
 */
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);

/**
 * genradix_iter_peek - get first entry at or above iterator's current
 *                        position
 * @_iter:        a genradix_iter
 * @_radix:        genradix being iterated over
 *
 * If no more entries exist at or above @_iter's current position, returns NULL.
 * Otherwise the result is cast to a pointer to the genradix's element type.
 */
#define genradix_iter_peek(_iter, _radix)                        \
        (__genradix_cast(_radix)                                \
         __genradix_iter_peek(_iter, &(_radix)->tree,                \
                              PAGE_SIZE / __genradix_obj_size(_radix)))

/*
 * Step @iter to the next entry: the byte offset grows by one object and the
 * index by one.
 *
 * For object sizes that are not a power of two, if one more whole object would
 * not fit in what is left of the current page, the offset is rounded up to the
 * next page boundary so that it keeps matching __idx_to_offset().
 */
static inline void __genradix_iter_advance(struct genradix_iter *iter,
                                           size_t obj_size)
{
        size_t next = iter->offset + obj_size;

        if (!is_power_of_2(obj_size)) {
                size_t in_page = next & (PAGE_SIZE - 1);

                /* Not enough room left for another object: skip the tail */
                if (in_page + obj_size > PAGE_SIZE)
                        next = round_up(next, PAGE_SIZE);
        }

        iter->offset = next;
        iter->pos++;
}

/* Typed wrapper: advance @_iter by one entry of @_radix's element size */
#define genradix_iter_advance(_iter, _radix)                        \
        __genradix_iter_advance(_iter, __genradix_obj_size(_radix))

/**
 * genradix_for_each_from - iterate over entries in a genradix, starting at
 *                        a given index
 * @_radix:        genradix to iterate over
 * @_iter:        a genradix_iter to track current position
 * @_p:                pointer to genradix entry type
 * @_start:        index to start iterating from
 *
 * @_iter is initialized from @_start; the loop ends as soon as
 * genradix_iter_peek() returns NULL.
 */
#define genradix_for_each_from(_radix, _iter, _p, _start)        \
        for (_iter = genradix_iter_init(_radix, _start);        \
             (_p = genradix_iter_peek(&_iter, _radix)) != NULL;        \
             genradix_iter_advance(&_iter, _radix))

/**
 * genradix_for_each - iterate over entries in a genradix
 * @_radix:        genradix to iterate over
 * @_iter:        a genradix_iter to track current position
 * @_p:                pointer to genradix entry type
 *
 * On every iteration, @_p will point to the current entry, and @_iter.pos
 * will be the current entry's index.
 *
 * Same as genradix_for_each_from() with a start index of 0.
 */
#define genradix_for_each(_radix, _iter, _p)                        \
        genradix_for_each_from(_radix, _iter, _p, 0)

/*
 * Untyped worker used by genradix_prealloc(); it is given a byte offset and
 * the caller's gfp mask.
 */
int __genradix_prealloc(struct __genradix *, size_t, gfp_t);

/**
 * genradix_prealloc - preallocate entries in a generic radix tree
 * @_radix:        genradix to preallocate
 * @_nr:        number of entries to preallocate
 * @_gfp:        gfp mask
 *
 * The size handed to __genradix_prealloc() is the byte offset of index
 * @_nr + 1.
 *
 * Returns 0 on success, -ENOMEM on failure
 */
#define genradix_prealloc(_radix, _nr, _gfp)                        \
         __genradix_prealloc(&(_radix)->tree,                        \
                        /* (_nr) so e.g. "n << 1" is not parsed as "n << (1 + 1)" */ \
                        __genradix_idx_to_offset(_radix, (_nr) + 1),\
                        _gfp)


#endif /* _LINUX_GENERIC_RADIX_TREE_H */
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM oom

#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_OOM_H
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * Convert a page count to kB: shifting by PAGE_SHIFT would give bytes, and
 * 10 fewer bits divides that by 1024. Assumes PAGE_SHIFT >= 10.
 */
#define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10))

/*
 * oom_score_adj_update - records @task's pid, its comm (copied into a fixed
 * TASK_COMM_LEN array) and the oom_score_adj value read from task->signal.
 */
TRACE_EVENT(oom_score_adj_update,

        TP_PROTO(struct task_struct *task),

        TP_ARGS(task),

        TP_STRUCT__entry(
                __field(        pid_t,        pid)
                __array(        char,        comm,        TASK_COMM_LEN )
                __field(        short,        oom_score_adj)
        ),

        TP_fast_assign(
                __entry->pid = task->pid;
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s oom_score_adj=%hd",
                __entry->pid, __entry->comm, __entry->oom_score_adj)
);

/*
 * reclaim_retry_zone - records the node id and zone index of @zoneref together
 * with the caller-supplied order, reclaimable and available counts, min
 * watermark, no_progress_loops counter and wmark_check result. The zone index
 * is printed symbolically through ZONE_TYPE.
 */
TRACE_EVENT(reclaim_retry_zone,

        TP_PROTO(struct zoneref *zoneref,
                int order,
                unsigned long reclaimable,
                unsigned long available,
                unsigned long min_wmark,
                int no_progress_loops,
                bool wmark_check),

        TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check),

        TP_STRUCT__entry(
                __field(        int, node)
                __field(        int, zone_idx)
                __field(        int,        order)
                __field(        unsigned long,        reclaimable)
                __field(        unsigned long,        available)
                __field(        unsigned long,        min_wmark)
                __field(        int,        no_progress_loops)
                __field(        bool,        wmark_check)
        ),

        TP_fast_assign(
                __entry->node = zone_to_nid(zoneref->zone);
                __entry->zone_idx = zoneref->zone_idx;
                __entry->order = order;
                __entry->reclaimable = reclaimable;
                __entry->available = available;
                __entry->min_wmark = min_wmark;
                __entry->no_progress_loops = no_progress_loops;
                __entry->wmark_check = wmark_check;
        ),

        TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d",
                        __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE),
                        __entry->order,
                        __entry->reclaimable, __entry->available, __entry->min_wmark,
                        __entry->no_progress_loops,
                        __entry->wmark_check)
);

/*
 * mark_victim - records @task's pid, comm and oom_score_adj, the @uid passed
 * by the caller, and memory figures read from task->mm: total_vm and the
 * anon/file/shmem RSS counters converted with PG_COUNT_TO_KB(), plus the
 * page-table size in bytes shifted down by 10 to kB.
 *
 * NOTE(review): task->mm is dereferenced unconditionally in TP_fast_assign;
 * presumably callers only pass a task with a valid mm - confirm.
 *
 * The printk format uses ':' rather than '=' after file-rss and shmem-rss.
 */
TRACE_EVENT(mark_victim,
        TP_PROTO(struct task_struct *task, uid_t uid),

        TP_ARGS(task, uid),

        TP_STRUCT__entry(
                __field(int, pid)
                __string(comm, task->comm)
                __field(unsigned long, total_vm)
                __field(unsigned long, anon_rss)
                __field(unsigned long, file_rss)
                __field(unsigned long, shmem_rss)
                __field(uid_t, uid)
                __field(unsigned long, pgtables)
                __field(short, oom_score_adj)
        ),

        TP_fast_assign(
                __entry->pid = task->pid;
                __assign_str(comm, task->comm);
                __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm);
                __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES));
                __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES));
                __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_SHMEMPAGES));
                __entry->uid = uid;
                __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10;
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd",
                __entry->pid,
                __get_str(comm),
                __entry->total_vm,
                __entry->anon_rss,
                __entry->file_rss,
                __entry->shmem_rss,
                __entry->uid,
                __entry->pgtables,
                __entry->oom_score_adj
        )
);

/* wake_reaper - records only the pid passed by the caller */
TRACE_EVENT(wake_reaper,
        TP_PROTO(int pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __field(int, pid)
        ),

        TP_fast_assign(
                __entry->pid = pid;
        ),

        TP_printk("pid=%d", __entry->pid)
);

/* start_task_reaping - records only the pid passed by the caller */
TRACE_EVENT(start_task_reaping,
        TP_PROTO(int pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __field(int, pid)
        ),

        TP_fast_assign(
                __entry->pid = pid;
        ),

        TP_printk("pid=%d", __entry->pid)
);

/* finish_task_reaping - records only the pid passed by the caller */
TRACE_EVENT(finish_task_reaping,
        TP_PROTO(int pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __field(int, pid)
        ),

        TP_fast_assign(
                __entry->pid = pid;
        ),

        TP_printk("pid=%d", __entry->pid)
);

/* skip_task_reaping - records only the pid passed by the caller */
TRACE_EVENT(skip_task_reaping,
        TP_PROTO(int pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __field(int, pid)
        ),

        TP_fast_assign(
                __entry->pid = pid;
        ),

        TP_printk("pid=%d", __entry->pid)
);

#ifdef CONFIG_COMPACTION
/*
 * compact_retry - records the order, the compaction priority, the compaction
 * result after mapping it through compact_result_to_feedback(), the retries
 * and max_retries counters, and the boolean @ret, which is printed as
 * should_retry. Priority and result are printed symbolically through
 * COMPACTION_PRIORITY and COMPACTION_FEEDBACK.
 */
TRACE_EVENT(compact_retry,

        TP_PROTO(int order,
                enum compact_priority priority,
                enum compact_result result,
                int retries,
                int max_retries,
                bool ret),

        TP_ARGS(order, priority, result, retries, max_retries, ret),

        TP_STRUCT__entry(
                __field(        int, order)
                __field(        int, priority)
                __field(        int, result)
                __field(        int, retries)
                __field(        int, max_retries)
                __field(        bool, ret)
        ),

        TP_fast_assign(
                __entry->order = order;
                __entry->priority = priority;
                __entry->result = compact_result_to_feedback(result);
                __entry->retries = retries;
                __entry->max_retries = max_retries;
                __entry->ret = ret;
        ),

        TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d",
                        __entry->order,
                        __print_symbolic(__entry->priority, COMPACTION_PRIORITY),
                        __print_symbolic(__entry->result, COMPACTION_FEEDBACK),
                        __entry->retries, __entry->max_retries,
                        __entry->ret)
);
#endif /* CONFIG_COMPACTION */
#endif

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































































































































































































































































































    1 










































































































































































































    1 





















































































































































































    1 














































    1 




    3 




    1 




    2 



























































    1 





















    2 













































































































































































































































































































    1 






























































































































































    4 
















































































































































































































































































    5 









































































































































































































































































































































































































































    1 





    1 










































































































    4 































    3 




    3 


















    3 

























































































































    1 






































































































































































































































































    1 












    1 


























































































































































































































































































    2 


    1 














































    2 











    1 

    1 




















    2 




























    1 

























































































































































































































    1 












    1 
















































































































































































    2 

    4 
    1 
    4 
    1 
    4 
    1 







































































































































    2 












    2 
    2 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;

/*
 * Initialisation entry points.  They are only declared here (all four are
 * tagged __init); the definitions live in other files.
 */
extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

/*
 * Global statistics and tunables, declared here and defined elsewhere.
 * NOTE(review): the sysctl_* names suggest these back sysctl knobs --
 * confirm against the defining files.
 */
extern struct files_stat_struct files_stat;
extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;
extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;

/* Kernel-internal alias for __kernel_rwf_t. */
typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * get_block_t and dio_iodone_t are function types, not pointer-to-function
 * types: users declare callbacks as "get_block_t *fn".
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/*
 * MAY_* flags.  Every value is a distinct single bit, so they can be OR-ed
 * together into one mask.
 */
#define MAY_EXEC                0x00000001
#define MAY_WRITE                0x00000002
#define MAY_READ                0x00000004
#define MAY_APPEND                0x00000008
#define MAY_ACCESS                0x00000010
#define MAY_OPEN                0x00000020
#define MAY_CHDIR                0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK                0x00000080

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 *
 * Each flag below is a single bit cast to fmode_t.  The bits between
 * 0x200000 and 0x4000000 are not assigned in this list.
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)0x1)
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)0x2)
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)0x4)
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)0x8)
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)0x10)
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)0x20)
/* File is opened with O_NDELAY (only set for block devices) */
#define FMODE_NDELAY                ((__force fmode_t)0x40)
/* File is opened with O_EXCL (only set for block devices) */
#define FMODE_EXCL                ((__force fmode_t)0x80)
/* File is opened using open(.., 3, ..) and is writeable only for ioctls
   (special hack for floppy.c) */
#define FMODE_WRITE_IOCTL        ((__force fmode_t)0x100)
/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)0x200)
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)0x400)

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)0x800)

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)0x1000)

/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
#define FMODE_UNSIGNED_OFFSET        ((__force fmode_t)0x2000)

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)0x4000)

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)0x8000)
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)0x10000)
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)0x20000)
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)

/*
 * NOTE(review): no description is given here for the next two flags;
 * presumably they record open/create progress -- confirm against the users.
 */
#define FMODE_OPENED                ((__force fmode_t)0x80000)
#define FMODE_CREATED                ((__force fmode_t)0x100000)

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)0x200000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY                ((__force fmode_t)0x4000000)

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)0x8000000)

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)0x10000000)

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)0x20000000)

/* File supports async buffered reads */
#define FMODE_BUF_RASYNC        ((__force fmode_t)0x40000000)

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * Bit 10 is not assigned in this list.
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)

/*
 * Whiteout is represented by a char device.  The following constants define the
 * mode and device number to use; both are currently 0.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
        /* Bitmask of ATTR_* flags marking which members below are meaningful. */
        unsigned int        ia_valid;
        umode_t                ia_mode;
        kuid_t                ia_uid;
        kgid_t                ia_gid;
        loff_t                ia_size;
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  Needs to be limited to
 * prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 *                             completed, that the page is still locked, and
 *                             should be considered active.  The VM uses this hint
 *                             to return the page to the active list -- it won't
 *                             be a candidate for writeback again in the near
 *                             future.  Other callers must be careful to unlock
 *                             the page if they get this return.  Returned by
 *                             writepage();
 *
 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 *                          unlocked it and the page might have been truncated.
 *                          The caller should back up to acquiring a new page and
 *                          trying again.  The aop will be taking reasonable
 *                          precautions not to livelock.  If the caller held a page
 *                          reference, it should drop it before retrying.  Returned
 *                          by readpage().
 *
 * address_space_operation functions return these large constants to indicate
 * special semantics to the caller.  These are much larger than the bytes in a
 * page to allow for functions that return the number of bytes operated on in a
 * given page.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = 0x80001,
};

/*
 * AOP_FLAG_* bits.
 * NOTE(review): presumably these are passed in the "flags" argument of
 * ->write_begin() -- confirm against the callers.
 */
#define AOP_FLAG_CONT_EXPAND                0x0001 /* called from cont_expand */
#define AOP_FLAG_NOFS                        0x0002 /* used by filesystem to direct
                                                * helper code (eg buffer layer)
                                                * to clear GFP_FS from alloc */

/*
 * oh the beauties of C type declarations.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/*
 * Write life time hint values.
 * Stored in struct inode as u8.
 *
 * WRITE_LIFE_NOT_SET is 0; every other enumerator takes the value of the
 * RWH_WRITE_LIFE_* constant of the same name.
 */
enum rw_hint {
        WRITE_LIFE_NOT_SET        = 0,
        WRITE_LIFE_NONE                = RWH_WRITE_LIFE_NONE,
        WRITE_LIFE_SHORT        = RWH_WRITE_LIFE_SHORT,
        WRITE_LIFE_MEDIUM        = RWH_WRITE_LIFE_MEDIUM,
        WRITE_LIFE_LONG                = RWH_WRITE_LIFE_LONG,
        WRITE_LIFE_EXTREME        = RWH_WRITE_LIFE_EXTREME,
};

/*
 * Match RWF_* bits to IOCB bits.
 *
 * NOTE(review): these five expansions are a bare cast expression without
 * outer parentheses, unlike the (1 << n) definitions below.
 */
#define IOCB_HIPRI                (__force int) RWF_HIPRI
#define IOCB_DSYNC                (__force int) RWF_DSYNC
#define IOCB_SYNC                (__force int) RWF_SYNC
#define IOCB_NOWAIT                (__force int) RWF_NOWAIT
#define IOCB_APPEND                (__force int) RWF_APPEND

/* non-RWF related bits - start at 16 (bits 21 and 22 are not assigned here) */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)

/*
 * Per-I/O control block.  ki_filp is kept outside the randomized field
 * region; everything between the randomized_struct_fields_start/end markers
 * may be laid out in any order.
 */
struct kiocb {
        struct file                *ki_filp;

        /* The 'ki_filp' pointer is shared in a union for aio */
        randomized_struct_fields_start

        loff_t                        ki_pos;
        /* Completion callback; is_sync_kiocb() treats NULL as "synchronous". */
        void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
        void                        *private;
        int                        ki_flags;        /* IOCB_* bits */
        u16                        ki_hint;
        u16                        ki_ioprio; /* See linux/ioprio.h */
        union {
                unsigned int                ki_cookie; /* for ->iopoll */
                struct wait_page_queue        *ki_waitq; /* for async buffered IO */
        };

        randomized_struct_fields_end
};

/* A kiocb counts as synchronous when it has no completion callback set. */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        return !kiocb->ki_complete;
}

/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef struct {
        size_t written;
        size_t count;
        /* Destination: either a user-space buffer or an opaque kernel pointer. */
        union {
                char __user *buf;
                void *data;
        } arg;
        int error;
} read_descriptor_t;

/*
 * Callback type taking a read descriptor, a page and two unsigned longs.
 * NOTE(review): the two unsigned long parameters are unnamed here;
 * presumably offset and size within the page -- confirm against the users.
 */
typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
                unsigned long, unsigned long);

/*
 * Table of callbacks attached to an address_space through its a_ops member.
 * Every member is a function pointer.
 * NOTE(review): which members may be left NULL is not visible in this
 * header -- check the callers before relying on it.
 */
struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*readpage)(struct file *, struct page *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Set a page dirty.  Return true if this dirtied it */
        int (*set_page_dirty)(struct page *page);

        /*
         * Reads in the requested pages. Unlike ->readpage(), this is
         * PURELY used for read-ahead!.
         */
        int (*readpages)(struct file *filp, struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages);
        void (*readahead)(struct readahead_control *);

        /*
         * write_begin/write_end share their leading parameters with
         * pagecache_write_begin()/pagecache_write_end().
         */
        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidatepage) (struct page *, unsigned int, unsigned int);
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a page to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migratepage) (struct address_space *,
                        struct page *, struct page *, enum migrate_mode);
        bool (*isolate_page)(struct page *, isolate_mode_t);
        void (*putback_page)(struct page *);
        int (*launder_page) (struct page *);
        int (*is_partially_uptodate) (struct page *, unsigned long,
                                        unsigned long);
        void (*is_dirty_writeback) (struct page *, bool *, bool *);
        int (*error_remove_page)(struct address_space *, struct page *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
};

/* A shared, read-only operations table; defined elsewhere. */
extern const struct address_space_operations empty_aops;

/*
 * pagecache_write_begin/pagecache_write_end must be used by general code
 * to write into the pagecache.
 *
 * Their parameter lists match the ->write_begin() and ->write_end()
 * members of struct address_space_operations.
 */
int pagecache_write_begin(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata);

int pagecache_write_end(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @private_lock: For use by the owner of the address_space.
 * @private_list: For use by the owner of the address_space.
 * @private_data: For use by the owner of the address_space.
 *
 * @nr_thps only exists when CONFIG_READ_ONLY_THP_FOR_FS is set.  The
 * structure is aligned to at least sizeof(long); the reason is given in the
 * comment that follows the definition.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of thp, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        struct rw_semaphore        i_mmap_rwsem;
        unsigned long                nrpages;
        unsigned long                nrexceptional;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                private_lock;
        struct list_head        private_list;
        void                        *private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * On most architectures that alignment is already the case; but
         * must be enforced here for CRIS, to let the least significant bit
         * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
         */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each one is an alias for one of the XA_MARK_* values.
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Report whether at least one page in @mapping carries @tag, by asking the
 * i_pages xarray whether the mark is set anywhere.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
        struct xarray *pages = &mapping->i_pages;

        return xa_marked(pages, tag);
}

/* Acquire @mapping's i_mmap_rwsem via down_write(). */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_write(sem);
}

/*
 * Try to acquire @mapping's i_mmap_rwsem for writing; the result of
 * down_write_trylock() is passed straight back to the caller.
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_write_trylock(sem);
}

/* Release @mapping's i_mmap_rwsem via up_write(). */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_write(sem);
}

/* Acquire @mapping's i_mmap_rwsem via down_read(). */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_read(sem);
}

/* Release @mapping's i_mmap_rwsem via up_read(). */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_read(sem);
}

/* Lockdep assertion that @mapping's i_mmap_rwsem is held. */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held(sem);
}

/* Lockdep assertion that @mapping's i_mmap_rwsem is held for writing. */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held_write(sem);
}

/*
 * Could pages of this file be mapped into userspace?  Answers 1 when the
 * i_mmap tree has at least one node and 0 when it is empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
        return RB_EMPTY_ROOT(&mapping->i_mmap.rb_root) ? 0 : 1;
}

/*
 * Could pages of this file have been modified through a userspace mapping?
 *
 * i_mmap_writable counts every VM_SHARED, VM_MAYWRITE vma: do_mmap marks a
 * vma VM_SHARED when it is shared and the file was opened for writing, so
 * such a vma may be mprotect()ed writable even if it is read-only now.
 *
 * A negative i_mmap_writable means new writable mappings are denied.
 * Denying is only possible while no writable mapping exists.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
        int writers = atomic_read(&mapping->i_mmap_writable);

        return writers > 0;
}

/*
 * Account one more writable mapping.  Returns 0 when the increment of
 * i_mmap_writable went through, or -EPERM when
 * atomic_inc_unless_negative() refused it.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return 0;

        return -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Ask for writable mappings to be denied.  Returns 0 when the decrement of
 * i_mmap_writable went through, or -EBUSY when
 * atomic_dec_unless_positive() refused it.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return 0;

        return -EBUSY;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds this pulls in <linux/seqlock.h>, defines
 * __NEED_I_SIZE_ORDERED and makes i_size_ordered_init() initialise the
 * inode's i_size_seqcount; on every other build the macro expands to an
 * empty statement.
 *
 * NOTE(review): the macro argument is used unparenthesized
 * (&inode->i_size_seqcount), so callers should pass a plain pointer
 * identifier rather than a compound expression.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/*
 * Special pointer values used in place of a real struct posix_acl pointer.
 * Both (-1 and -3) are odd, i.e. have the least significant bit set.
 */
#define ACL_NOT_CACHED ((void *)(-1))
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Produce a marker value from @task: the address one byte past the start of
 * the task structure, returned as a struct posix_acl pointer.  The result
 * is never meant to be dereferenced.
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *base = (char *)task;

        return (void *)(base + 1);
}

/* True when the least significant bit of the @acl pointer value is set. */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        long bits = (long)acl;

        return (bits & 1) != 0;
}

/*
 * IOP_* flag bits; each is a distinct single bit.
 * NOTE(review): presumably stored in inode->i_opflags -- confirm against
 * the users.
 */
#define IOP_FASTPERM        0x0001
#define IOP_LOOKUP        0x0002
#define IOP_NOFOLLOW        0x0004
#define IOP_XATTR        0x0008
#define IOP_DEFAULT_READLINK        0x0010

struct fsnotify_mark_connector;

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 *
 * Several members exist only under a config option (see the #ifdef blocks
 * below), so the size and layout depend on the kernel configuration; the
 * layout is additionally subject to __randomize_layout.
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         */
        union {
                /* read-only view and writable alias of the same storage */
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;
        loff_t                        i_size;
        struct timespec64        i_atime;
        struct timespec64        i_mtime;
        struct timespec64        i_ctime;
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;
        u8                        i_write_hint;
        blkcnt_t                i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
        /* used by i_size_read()/i_size_write() on 32-bit SMP builds */
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        unsigned long                i_state;
        /* taken by the inode_lock*() helpers */
        struct rw_semaphore        i_rwsem;

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        struct list_head        i_devices;
        /* overlapping storage: only one of these members is meaningful at a time */
        union {
                struct pipe_inode_info        *i_pipe;
                struct block_device        *i_bdev;
                struct cdev                *i_cdev;
                char                        *i_link;
                unsigned                i_dir_seq;
        };

        __u32                        i_generation;

#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_info        *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info        *i_verity_info;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

/*
 * Declared here, defined elsewhere.  Takes a timestamp by value together
 * with an inode and returns a timestamp by value.
 * NOTE(review): presumably it limits @t to what @inode's filesystem can
 * store -- confirm against the definition.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * i_blocksize - block size of an inode, derived from i_blkbits
 * @node: inode to query
 *
 * Returns 2 to the power of node->i_blkbits.  The shift is done on an
 * unsigned constant so that the result type matches the return type and a
 * shift count of 31 does not overflow a signed int.
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Return what hlist_unhashed() reports for @inode's i_hash node. */
static inline int inode_unhashed(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        return hlist_unhashed(node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Special inodes should
 * not live in the fileset inode space, so make them look hashed without
 * putting them on any list.  hlist_del() will still work fine on such an
 * inode and needs no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        hlist_add_fake(node);
}

/*
 * inode->i_mutex nesting subclasses for the lock validator:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 *
 * Note: the "i_mutex" wording is kept from the original comment; the
 * helpers in this header that take a subclass (inode_lock_nested() and
 * inode_lock_shared_nested()) apply it to inode->i_rwsem.  The enumerators
 * below are listed in the same order as the numbers above, starting at 0.
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,
        I_MUTEX_PARENT,
        I_MUTEX_CHILD,
        I_MUTEX_XATTR,
        I_MUTEX_NONDIR2,
        I_MUTEX_PARENT2,
};

/* Acquire @inode's i_rwsem via down_write(). */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/* Release @inode's i_rwsem via up_write(). */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Acquire @inode's i_rwsem via down_read(). */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/* Release @inode's i_rwsem via up_read(). */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Try to acquire @inode's i_rwsem for writing; hands back whatever
 * down_write_trylock() returns.
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_trylock(sem);
}

/*
 * Try to acquire @inode's i_rwsem for reading; hands back whatever
 * down_read_trylock() returns.
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_trylock(sem);
}

/* Return what rwsem_is_locked() reports for @inode's i_rwsem. */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/*
 * Acquire @inode's i_rwsem for writing, passing @subclass on to
 * down_write_nested().
 */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/*
 * Acquire @inode's i_rwsem for reading, passing @subclass on to
 * down_read_nested().
 */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

/*
 * Lock / unlock a pair of inodes; both are only declared here.
 * NOTE(review): per the names, neither inode is expected to be a
 * directory -- confirm against the definitions.
 */
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

/*
 * NOTE: in a 32bit arch with a preemptable kernel and
 * an UP compile the i_size_read/write must be atomic
 * with respect to the local cpu (unlike with preempt disabled),
 * but they don't need to be atomic with respect to other cpus like in
 * true SMP (so they need to either locally disable irq around
 * the read or for example on x86 they can be still implemented as a
 * cmpxchg8b without the need of the lock prefix). For SMP compiles
 * and 64bit archs it makes no difference if preempt is enabled or not.
 *
 * Returns the current value of inode->i_size.  Three build variants:
 *  - 32-bit SMP: read under i_size_seqcount and retry until consistent;
 *  - 32-bit with CONFIG_PREEMPTION (no SMP): read with preemption disabled;
 *  - otherwise: a plain load.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        loff_t i_size;
        unsigned int seq;

        /* Repeat the read whenever the sequence count changed underneath us. */
        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        loff_t i_size;

        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        return inode->i_size;
#endif
}

/*
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 *
 * Stores @i_size into inode->i_size.  The three build variants mirror
 * i_size_read(): seqcount write section with preemption disabled on
 * 32-bit SMP, preemption disabled only on 32-bit CONFIG_PREEMPTION, and a
 * plain store otherwise.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        inode->i_size = i_size;
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/*
 * Owner information for signal-driven I/O on a file; embedded in
 * struct file as f_owner.
 */
struct fown_struct {
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/*
 * Track a single file's readahead state
 *
 * ra_has_index() treats [start, start + size) as the current readahead
 * window.
 */
struct file_ra_state {
        pgoff_t start;                        /* where readahead started */
        unsigned int size;                /* # of readahead pages */
        unsigned int async_size;        /* do asynchronous readahead when
                                           there are only # of pages ahead */

        unsigned int ra_pages;                /* Maximum readahead window */
        unsigned int mmap_miss;                /* Cache miss stat for mmap accesses */
        loff_t prev_pos;                /* Cache last read() position */
};

/*
 * Does @index lie inside the readahead window described by @ra, i.e. in
 * the half-open range [ra->start, ra->start + ra->size)?  Returns 1 if so,
 * 0 otherwise.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        if (index < ra->start)
                return 0;

        return index < ra->start + ra->size;
}

/*
 * An open file.  The reference count lives in f_count (see get_file() and
 * file_count()).  The epoll members exist only when CONFIG_EPOLL is set and
 * f_security only when CONFIG_SECURITY is set.
 */
struct file {
        union {
                struct llist_node        fu_llist;
                struct rcu_head         fu_rcuhead;
        } f_u;
        struct path                f_path;
        struct inode                *f_inode;        /* cached value */
        const struct file_operations        *f_op;

        /*
         * Protects f_ep_links, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t                f_lock;
        enum rw_hint                f_write_hint;
        atomic_long_t                f_count;
        unsigned int                 f_flags;
        fmode_t                        f_mode;                /* FMODE_* bits */
        struct mutex                f_pos_lock;
        loff_t                        f_pos;
        struct fown_struct        f_owner;
        const struct cred        *f_cred;
        struct file_ra_state        f_ra;

        u64                        f_version;
#ifdef CONFIG_SECURITY
        void                        *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                        *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
        struct list_head        f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
        struct address_space        *f_mapping;
        errseq_t                f_wb_err;
        errseq_t                f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/*
 * Variable-length file handle: a fixed header followed by an opaque
 * identifier stored in the trailing flexible array member.
 */
struct file_handle {
        /* NOTE(review): presumably the number of bytes in f_handle[] - confirm */
        __u32 handle_bytes;
        int handle_type;
        /* file identifier */
        unsigned char f_handle[];
};

static inline struct file *get_file(struct file *f)
{
        atomic_long_inc(&f->f_count);
        return f;
}
/*
 * Reference-count helpers operating on file->f_count:
 *  - get_file_rcu_many(): add @cnt references unless the count is 0
 *    (expands to atomic_long_add_unless(..., 0)).
 *  - get_file_rcu(): the single-reference form of the above.
 *  - file_count(): read the current count.
 */
#define get_file_rcu_many(x, cnt)        \
        atomic_long_add_unless(&(x)->f_count, (cnt), 0)
#define get_file_rcu(x) get_file_rcu_many((x), 1)
#define file_count(x)        atomic_long_read(&(x)->f_count)

/* 2^31 - 1.  NOTE(review): presumably the size limit without large-file support - confirm */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 *
 * MAX_LFS_FILESIZE is only defined when BITS_PER_LONG is 32 or 64.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/*
 * FL_* flag bits.  Every value is a distinct power of two so the flags can
 * be OR-ed together (as FL_CLOSE_POSIX does below).
 * NOTE(review): presumably stored in struct file_lock's fl_flags - confirm.
 */
#define FL_POSIX        1
#define FL_FLOCK        2
#define FL_DELEG        4        /* NFSv4 delegation */
#define FL_ACCESS        8        /* not trying to lock, just looking */
#define FL_EXISTS        16        /* when unlocking, test for existence */
#define FL_LEASE        32        /* lease held on this file */
#define FL_CLOSE        64        /* unlock on close */
#define FL_SLEEP        128        /* A blocking lock */
#define FL_DOWNGRADE_PENDING        256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING        512 /* Lease is being broken */
#define FL_OFDLCK        1024        /* lock is "owned" by struct file */
#define FL_LAYOUT        2048        /* outstanding pNFS layout */
#define FL_RECLAIM        4096        /* reclaiming from a reboot server */

/* FL_POSIX and FL_CLOSE combined */
#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)

/*
 * Special return value from posix_lock_file() and vfs_lock_file() for
 * asynchronous locking.
 */
#define FILE_LOCK_DEFERRED 1

/*
 * legacy typedef, should eventually be removed
 *
 * Opaque lock-owner cookie: used as the type of file_lock.fl_owner and by
 * the lm_get_owner()/lm_put_owner() callbacks.
 */
typedef void *fl_owner_t;

/* forward declaration: the operation tables below take struct file_lock pointers */
struct file_lock;

/*
 * Callback table referenced from file_lock.fl_ops ("Callbacks for
 * filesystems").
 */
struct file_lock_operations {
        /* NOTE(review): argument order (destination, source?) is not visible here - confirm */
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
};

/*
 * Callback table referenced from file_lock.fl_lmops ("Callbacks for
 * lockmanagers").  Only lm_notify is documented in this header; the exact
 * contract of the other hooks has to be taken from their callers.
 */
struct lock_manager_operations {
        void *lm_mod_owner;
        /* owner-cookie get/put pair, operating on fl_owner_t values */
        fl_owner_t (*lm_get_owner)(fl_owner_t);
        void (*lm_put_owner)(fl_owner_t);
        void (*lm_notify)(struct file_lock *);        /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_break)(struct file_lock *);
        int (*lm_change)(struct file_lock *, int, struct list_head *);
        void (*lm_setup)(struct file_lock *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lock *);
        bool (*lm_lock_expirable)(struct file_lock *cfl);
        void (*lm_expire_lock)(void);
};

/*
 * A lock manager, as passed to locks_start_grace() and locks_end_grace().
 */
struct lock_manager {
        struct list_head list;        /* list linkage */
        /*
         * NFSv4 and up also want opens blocked during the grace period;
         * NLM doesn't care:
         */
        bool block_opens;
};

/*
 * Grace-period interface (declarations only).  struct net is only used
 * through pointers here, hence the forward declaration.
 */
struct net;
void locks_start_grace(struct net *, struct lock_manager *);
void locks_end_grace(struct lock_manager *);
bool locks_in_grace(struct net *);
bool opens_in_grace(struct net *);

/* that will die - we need it for nfs_lock_info */
#include <linux/nfs_fs_i.h>

/*
 * struct file_lock represents a generic "file lock". It's used to represent
 * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
 * note that the same struct is used to represent both a request for a lock and
 * the lock itself, but the same object is never used for both.
 *
 * FIXME: should we create a separate "struct lock_request" to help distinguish
 * these two uses?
 *
 * The various i_flctx lists are ordered by:
 *
 * 1) lock owner
 * 2) lock range start
 * 3) lock range end
 *
 * Obviously, the last two criteria only matter for POSIX locks.
 */
/*
 * One lock or lock request.  fl_u holds lock-manager/filesystem private
 * data; its members share storage.
 */
struct file_lock {
        struct file_lock *fl_blocker;        /* The lock, that is blocking us */
        struct list_head fl_list;        /* link into file_lock_context */
        struct hlist_node fl_link;        /* node in global lists */
        struct list_head fl_blocked_requests;        /* list of requests with
                                                 * ->fl_blocker pointing here
                                                 */
        struct list_head fl_blocked_member;        /* node in
                                                 * ->fl_blocker->fl_blocked_requests
                                                 */
        fl_owner_t fl_owner;                /* opaque owner cookie */
        unsigned int fl_flags;                /* NOTE(review): presumably FL_* bits - confirm */
        unsigned char fl_type;
        unsigned int fl_pid;
        int fl_link_cpu;                /* what cpu's list is this on? */
        wait_queue_head_t fl_wait;
        struct file *fl_file;
        loff_t fl_start;                /* lock range start */
        loff_t fl_end;                        /* lock range end */

        struct fasync_struct *        fl_fasync; /* for lease break notifications */
        /* for lease breaks: */
        unsigned long fl_break_time;
        unsigned long fl_downgrade_time;

        const struct file_lock_operations *fl_ops;        /* Callbacks for filesystems */
        const struct lock_manager_operations *fl_lmops;        /* Callbacks for lockmanagers */
        union {
                struct nfs_lock_info        nfs_fl;
                struct nfs4_lock_info        nfs4_fl;
                struct {
                        struct list_head link;        /* link in AFS vnode's pending_locks list */
                        int state;                /* state of grant or error if -ve */
                        unsigned int        debug_id;
                } afs;
        } fl_u;
} __randomize_layout;

/*
 * Per-inode lock bookkeeping, reached through inode->i_flctx (see
 * locks_inode_context()).  There is one list head per lock flavour.
 */
struct file_lock_context {
        /* NOTE(review): presumably protects the three lists below - confirm */
        spinlock_t                flc_lock;
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
};

/*
 * The following constant reflects the upper bound of the file/locking space.
 *
 * INT_LIMIT(x) evaluates to the largest positive value of the signed
 * integer type @x.  The value is built in unsigned arithmetic as
 * 2^(bits - 1) - 1 and then converted to @x.  This avoids shifting a
 * signed 1 into the sign bit, which is undefined behaviour in ISO C, and
 * also gives the right answer for types narrower than int, where integer
 * promotion would otherwise produce a negative result.
 */
#ifndef OFFSET_MAX
#define INT_LIMIT(x)        ((x)((1ULL << (sizeof(x) * 8 - 1)) - 1))
#define OFFSET_MAX        INT_LIMIT(loff_t)
#define OFFT_OFFSET_MAX        INT_LIMIT(off_t)
#endif

/* declaration only; takes the owner record, a file descriptor and a band value */
extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* alias for file_inode(): the inode that carries the locks of file @f */
#define locks_inode(f) file_inode(f)

#ifdef CONFIG_FILE_LOCKING
/*
 * File-locking interface, available when CONFIG_FILE_LOCKING is set.
 * These are declarations only; matching no-op/error stubs for the
 * !CONFIG_FILE_LOCKING case follow in the #else branch.
 *
 * fcntl() lock commands.  Note that the flock/flock64 arguments are plain
 * (non-__user) pointers.
 */
extern int fcntl_getlk(struct file *, unsigned int, struct flock *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
                        struct flock *);

/* the 64-bit flock variants are only declared on 32-bit builds */
#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
                        struct flock64 *);
#endif

extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);

/* fs/locks.c */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
extern void locks_copy_conflock(struct file_lock *, struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_file(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int locks_delete_block(struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
bool vfs_inode_has_locks(struct inode *inode);
extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
extern void lease_get_mtime(struct inode *, struct timespec64 *time);
extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
extern int lease_modify(struct file_lock *, int, struct list_head *);

/* lease notifier chain registration */
struct notifier_block;
extern int lease_register_notifier(struct notifier_block *);
extern void lease_unregister_notifier(struct notifier_block *);

struct files_struct;
extern void show_fd_locks(struct seq_file *f,
                         struct file *filp, struct files_struct *files);
extern bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner);

/*
 * Fetch @inode's lock context pointer (i_flctx) with acquire semantics.
 */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);

        return ctx;
}

#else /* !CONFIG_FILE_LOCKING */
/*
 * !CONFIG_FILE_LOCKING stub: always returns -EINVAL.
 *
 * The flock argument is a plain pointer, matching the prototype that is
 * declared when CONFIG_FILE_LOCKING is enabled (which carries no __user
 * annotation), so both configurations present the same signature.
 */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock *user)
{
        return -EINVAL;
}

/*
 * !CONFIG_FILE_LOCKING stub: always returns -EACCES.
 *
 * The flock argument is a plain pointer, matching the prototype that is
 * declared when CONFIG_FILE_LOCKING is enabled (which carries no __user
 * annotation), so both configurations present the same signature.
 */
static inline int fcntl_setlk(unsigned int fd, struct file *file,
                              unsigned int cmd, struct flock *user)
{
        return -EACCES;
}

#if BITS_PER_LONG == 32
/*
 * !CONFIG_FILE_LOCKING stub (32-bit builds only): always returns -EINVAL.
 *
 * The flock64 argument is a plain pointer, matching the prototype that is
 * declared when CONFIG_FILE_LOCKING is enabled (no __user annotation).
 */
static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
                                struct flock64 *user)
{
        return -EINVAL;
}

/*
 * !CONFIG_FILE_LOCKING stub (32-bit builds only): always returns -EACCES.
 *
 * The flock64 argument is a plain pointer, matching the prototype that is
 * declared when CONFIG_FILE_LOCKING is enabled (no __user annotation).
 */
static inline int fcntl_setlk64(unsigned int fd, struct file *file,
                                unsigned int cmd, struct flock64 *user)
{
        return -EACCES;
}
#endif
/* !CONFIG_FILE_LOCKING stub: every request is rejected with -EINVAL. */
static inline int
fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always reports F_UNLCK. */
static inline int
fcntl_getlease(struct file *filp)
{
        return F_UNLCK;
}

/* !CONFIG_FILE_LOCKING stub: nothing to free, so this is a no-op. */
static inline void locks_free_lock_context(struct inode *inode)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, @fl is left untouched. */
static inline void
locks_init_lock(struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, nothing is copied from @fl to @new. */
static inline void
locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, nothing is copied from @fl to @new. */
static inline void
locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, both arguments are ignored. */
static inline void
locks_remove_posix(struct file *filp, fl_owner_t owner)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, @filp is ignored. */
static inline void
locks_remove_file(struct file *filp)
{
}

/* !CONFIG_FILE_LOCKING stub: no-op, @fl is not modified. */
static inline void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int
posix_lock_file(struct file *filp, struct file_lock *fl,
                struct file_lock *conflock)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns -ENOENT. */
static inline int
locks_delete_block(struct file_lock *waiter)
{
        return -ENOENT;
}

/* !CONFIG_FILE_LOCKING stub: returns 0 without touching @fl. */
static inline int
vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int
vfs_lock_file(struct file *filp, unsigned int cmd,
              struct file_lock *fl, struct file_lock *conf)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: nothing to cancel, returns 0. */
static inline int
vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always reports false. */
static inline bool
vfs_inode_has_locks(struct inode *inode)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int
locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: returns 0 and ignores all arguments. */
static inline int
__break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: no-op, *@time is left unmodified. */
static inline void
lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int
generic_setlease(struct file *filp, long arg,
                 struct file_lock **flp, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int
vfs_setlease(struct file *filp, long arg,
             struct file_lock **lease, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int
lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
{
        return -EINVAL;
}

struct files_struct;
/* !CONFIG_FILE_LOCKING stub: no-op, nothing is written to @f. */
static inline void
show_fd_locks(struct seq_file *f, struct file *filp,
              struct files_struct *files)
{
}
/* !CONFIG_FILE_LOCKING stub: always reports false. */
static inline bool
locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: there is never a lock context, so return NULL. */
static inline struct file_lock_context *locks_inode_context(const struct inode *inode)
{
        return NULL;
}

#endif /* !CONFIG_FILE_LOCKING */

/* Return the inode cached in @f (the f_inode member). */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *cached = f->f_inode;

        return cached;
}

/*
 * Resolve the dentry of @file by handing its path dentry and its inode to
 * d_real(), and return whatever d_real() yields.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        struct inode *inode = file_inode(file);

        return d_real(file->f_path.dentry, inode);
}

/*
 * File-based front end to locks_lock_inode_wait(): look up the inode
 * behind @filp and forward @fl to it.  Returns the callee's result.
 */
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
{
        struct inode *inode = locks_inode(filp);

        return locks_lock_inode_wait(inode, fl);
}

/*
 * One entry in a file's asynchronous-notification list.  Entries are
 * chained through fa_next.
 */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;        /* NOTE(review): presumably FASYNC_MAGIC - confirm */
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers (declarations only): */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

/* file owner management (declarations only) */
extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 *
 * Each flag is a single bit; the bit positions are not contiguous and
 * positions not listed here are not defined in this block.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* VFS does not apply the umask */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define SB_SUBMOUNT     BIT(26)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL        (1 << 0)

/*
 * Non-zero when the SB_ENC_STRICT_MODE_FL bit is set in @sb's
 * s_encoding_flags.  @sb is parenthesized so that any pointer expression
 * (e.g. "base + 1" or "cond ? a : b") can be passed safely.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 *        Umount options
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/* sb->s_iflags: internal SB_I_* flags, one bit each */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040

/* note: 0x00000080 is not assigned in this block */
#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */
#define SB_I_PERSB_BDI        0x00000200        /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */

/*
 * Possible states of 'frozen' field.
 *
 * The values SB_FREEZE_WRITE..SB_FREEZE_FS double as freeze-protection
 * levels: the __sb_start_write()/__sb_end_write() helpers use "level - 1"
 * as the index into sb_writers.rw_sem[].
 */
enum {
        SB_UNFROZEN = 0,                /* FS is unfrozen */
        SB_FREEZE_WRITE        = 1,                /* Writes, dir ops, ioctls frozen */
        SB_FREEZE_PAGEFAULT = 2,        /* Page faults stopped as well */
        SB_FREEZE_FS = 3,                /* For internal FS use (e.g. to stop
                                         * internal threads if needed) */
        SB_FREEZE_COMPLETE = 4,                /* ->freeze_fs finished successfully */
};

/* number of freeze-protection levels (3); sizes sb_writers.rw_sem[] */
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze-protection state of a superblock: the current freeze state plus
 * one per-cpu rw-semaphore for each freeze level (indexed by level - 1).
 */
struct sb_writers {
        int                                frozen;                /* Is sb frozen? */
        wait_queue_head_t                wait_unfrozen;        /* for get_super_thawed() */
        struct percpu_rw_semaphore        rw_sem[SB_FREEZE_LEVELS];
};

/*
 * Per-superblock state.  s_flags holds the SB_* bits and s_iflags the
 * internal SB_I_* bits; s_writers carries the freeze-protection state used
 * by the sb_start_write()/sb_end_write() family; s_user_ns is the namespace
 * used by i_uid_read()/i_gid_read() and friends to translate ids.
 */
struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;                /* SB_ENC_* bits, see sb_has_strict_encoding() */
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        struct sb_writers        s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_marks together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        __u32                        s_fsnotify_mask;
        struct fsnotify_mark_connector __rcu        *s_fsnotify_marks;
#endif

        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */

        unsigned int                s_max_links;
        fmode_t                        s_mode;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        /*
         * Saved pool identifier for cleancache (-1 means none)
         */
        int cleancache_poolid;

        struct shrinker s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /*
         * Number of inode/mount/sb objects that are being watched, note that
         * inodes objects are currently double-accounted.
         */
        atomic_long_t s_fsnotify_connectors;

        /* Being remounted read-only */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

/* Helper functions so that in most cases filesystems will
 * not need to deal directly with kuid_t and kgid_t and can
 * instead deal with the raw numeric values that are stored
 * in the filesystem.
 */
/*
 * Return @inode's owner as a raw uid_t, translated through the user
 * namespace of the inode's superblock.
 */
static inline uid_t i_uid_read(const struct inode *inode)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        return from_kuid(ns, inode->i_uid);
}

/*
 * Return @inode's group as a raw gid_t, translated through the user
 * namespace of the inode's superblock.
 */
static inline gid_t i_gid_read(const struct inode *inode)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        return from_kgid(ns, inode->i_gid);
}

/*
 * Store the raw @uid into @inode->i_uid after converting it to a kuid_t in
 * the user namespace of the inode's superblock.
 */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        inode->i_uid = make_kuid(ns, uid);
}

/*
 * Store the raw @gid into @inode->i_gid after converting it to a kgid_t in
 * the user namespace of the inode's superblock.
 */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        inode->i_gid = make_kgid(ns, gid);
}

/* timestamp helpers (declarations only); both return a timespec64 by value */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);

/**
 * inode_get_ctime - fetch the current ctime from the inode
 * @inode: inode from which to fetch ctime
 *
 * Grab the current ctime from the inode and return it.
 */
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        return inode->i_ctime;
}

/**
 * inode_set_ctime_to_ts - store a timespec64 as the inode's ctime
 * @inode: inode whose ctime is updated
 * @ts: new ctime value
 *
 * Overwrites @inode->i_ctime with @ts.
 *
 * Return: @ts, unchanged, so the call can be chained.
 */
static inline struct timespec64
inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
{
        inode->i_ctime = ts;

        return ts;
}

/**
 * inode_set_ctime - store a (seconds, nanoseconds) pair as the inode's ctime
 * @inode: inode whose ctime is updated
 * @sec: value for tv_sec
 * @nsec: value for tv_nsec
 *
 * Builds a timespec64 from { @sec, @nsec } and stores it through
 * inode_set_ctime_to_ts().
 *
 * Return: the timespec64 that was stored.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode,
                                     (struct timespec64){ .tv_sec  = sec,
                                                          .tv_nsec = nsec });
}

/*
 * Snapshotting support.
 */

/*
 * These are internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
/*
 * Drop the read side of the per-cpu rw-semaphore that belongs to freeze
 * level @level.  Levels are 1-based, the semaphore array is 0-based.
 */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Take the read side of the per-cpu rw-semaphore that belongs to freeze
 * level @level.  Levels are 1-based, the semaphore array is 0-based.
 */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Trylock counterpart of __sb_start_write(): attempt to take the read side
 * of the rw-semaphore for freeze level @level and return the result of
 * percpu_down_read_trylock().
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Lockdep-only annotations for the freeze-level rw-semaphore of @sb at
 * level @lev (1-based): mark it as acquired / released at the current
 * instruction pointer.  See kiocb_start_write() and kiocb_end_write() for
 * the intended use.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)

/**
 * sb_end_write - release write access to a superblock
 * @sb: the superblock that was written to
 *
 * Counterpart of sb_start_write().  Drops one writer from the filesystem
 * and wakes up anybody waiting to freeze it.
 */
static inline void sb_end_write(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_WRITE);
}

/**
 * sb_end_pagefault - release page-fault write access to a superblock
 * @sb: the superblock that was written to
 *
 * Counterpart of sb_start_pagefault().  Drops one write-page-fault handler
 * from the filesystem and wakes up anybody waiting to freeze it.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_end_intwrite - release fs-internal write access to a superblock
 * @sb: the superblock that was written to
 *
 * Counterpart of sb_start_intwrite().  Drops one fs-internal writer from
 * the filesystem and wakes up anybody waiting to freeze it.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_FS);
}

/**
 * sb_start_write - acquire write access to a superblock
 * @sb: the superblock about to be written to
 *
 * Any operation that writes data or metadata to a filesystem (i.e. dirties
 * a page or an inode) should be bracketed by sb_start_write() and
 * sb_end_write() so that it is excluded against filesystem freezing.  The
 * call registers one more writer, which prevents a freeze; if the
 * filesystem is already frozen, it waits until the filesystem is thawed.
 *
 * Freeze protection behaves like a lock, so its ordering against other
 * filesystem locks has to be respected.  As a rule it should be the
 * outermost lock.  In particular:
 *
 * sb_start_write
 *   -> i_mutex                        (write path, truncate, directory ops, ...)
 *   -> s_umount                (freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_WRITE);
}

/*
 * Trylock variant of sb_start_write(): forwards to
 * __sb_start_write_trylock() at the SB_FREEZE_WRITE level and returns its
 * result.
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
}

/**
 * sb_start_pagefault - acquire write access to a superblock from a page fault
 * @sb: the superblock about to be written to
 *
 * A write page fault is going to dirty a page, so its handling should be
 * bracketed by sb_start_pagefault() and sb_end_pagefault() to exclude it
 * against filesystem freezing.  The call registers one more running page
 * fault, which prevents a freeze; if the filesystem is already frozen, it
 * waits until the filesystem is thawed.
 *
 * Page-fault freeze protection behaves like a lock, so its ordering
 * against other filesystem locks has to be respected.  It is advised to
 * keep sb_start_pagefault() close to mmap_lock in the lock ordering.  Page
 * fault handling implies the dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_start_intwrite - acquire write access to a superblock for internal fs purposes
 * @sb: the superblock about to be written to
 *
 * Third level of protection against filesystem freezing.  A filesystem may
 * use it freely; the only requirement is that it ranks below
 * sb_start_pagefault.
 *
 * A filesystem can, for example, call sb_start_intwrite() when it starts a
 * transaction, which somewhat eases freeze handling for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on
 * file close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_FS);
}

/*
 * Trylock variant of sb_start_intwrite(): forwards to
 * __sb_start_write_trylock() at the SB_FREEZE_FS level and returns its
 * result.
 */
static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        return __sb_start_write_trylock(sb, SB_FREEZE_FS);
}

/**
 * kiocb_start_write - acquire write access to a superblock for async file io
 * @iocb: the io context the write is going to be submitted with
 *
 * Variant of sb_start_write() for async io submission.  Every call must be
 * paired with kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        sb_start_write(sb);
        /*
         * The freeze protection stays held, but lockdep is told it was
         * released; otherwise it would complain about a lock still being
         * held on return to userspace.
         */
        __sb_writers_release(sb, SB_FREEZE_WRITE);
}

/**
 * kiocb_end_write - release write access to a superblock after async file io
 * @iocb: the io context the write was submitted with
 *
 * Every call must be paired with an earlier kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        /*
         * Let lockdep know that the freeze protection taken by the
         * submitting thread is now owned here, then really drop it.
         */
        __sb_writers_acquired(sb, SB_FREEZE_WRITE);
        sb_end_write(sb);
}

/* declaration only */
extern bool inode_owner_or_capable(const struct inode *inode);

/*
 * VFS helper functions (declarations only).
 *
 * NOTE(review): parameter names are omitted in these prototypes; by
 * analogy with vfs_whiteout()'s call to vfs_mknod() the leading inode is
 * the parent directory - confirm against the definitions.
 */
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);

/*
 * Argument bundle for vfs_rename(), which takes a single pointer to this
 * structure instead of individual parameters.
 *
 * NOTE(review): the per-field notes below are read off the member names;
 * confirm against the vfs_rename() implementation.
 */
struct renamedata {
        struct inode *old_dir;                /* presumably parent of the source */
        struct dentry *old_dentry;        /* presumably the source entry */
        struct inode *new_dir;                /* presumably parent of the destination */
        struct dentry *new_dentry;        /* presumably the destination entry */
        struct inode **delegated_inode;        /* out-pointer; see try_break_deleg() for the pattern */
        unsigned int flags;                /* presumably rename flags */
} __randomize_layout;

int vfs_rename(struct renamedata *);

static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}

/* Prototype only; the definition is not part of this header section. */
extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
                                  int open_flag);

/*
 * vfs_mkobj() takes a caller-supplied callback @f plus an opaque void *.
 * NOTE(review): presumably the trailing void * is what gets handed to @f as
 * its third argument -- confirm against the definition.
 */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

/* File/path based ownership, mode and timestamp helpers (prototypes only). */
int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

/* Takes the same (file, cmd, arg) triple as ->unlocked_ioctl in struct file_operations. */
extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/*
 * compat_ptr_ioctl has the same signature as ->compat_ioctl in struct
 * file_operations.  Without CONFIG_COMPAT the name expands to NULL, so it
 * can still be written where a function pointer is expected.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions (prototypes only; definitions are not part of
 * this header section).
 *
 * NOTE(review): inode_init_owner() and mode_strip_sgid() both take the
 * parent directory as a const inode plus a mode; mode_strip_sgid() returns
 * a umode_t, presumably the adjusted mode -- confirm against the
 * definitions.
 */
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
                        umode_t mode);
extern bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(const struct inode *dir, umode_t mode);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 *
 * The callback receives the dir_context it was taken from as its first
 * argument.  NOTE(review): the remaining arguments are unnamed; they look
 * like (name, name length, offset, inode number, d_type) -- confirm against
 * an implementation.
 */
struct dir_context;
typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/*
 * Directory iteration state: the filldir callback to invoke and a position.
 * Passed to the ->iterate / ->iterate_shared file operations.
 */
struct dir_context {
        filldir_t actor;
        loff_t pos;
};

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 *
 * COPY and DIRECT have their own bit values; READ, WRITE and EXEC are
 * aliases of the corresponding VM_MAY* flags.
 */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

/* Mask of the three capability bits that alias VM_MAY* flags. */
#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP                (1 << 0)
#define REMAP_FILE_CAN_SHORTEN                (1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 *
 * At present the advisory set consists of REMAP_FILE_CAN_SHORTEN only.
 */
#define REMAP_FILE_ADVISORY                (REMAP_FILE_CAN_SHORTEN)

/*
 * These flags control the behavior of vfs_copy_file_range().
 * They are not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 *
 * Note that COPY_FILE_SPLICE has the same numeric value as REMAP_FILE_DEDUP;
 * the two belong to different flag spaces.
 */
#define COPY_FILE_SPLICE                (1 << 0)

struct iov_iter;

/*
 * Per-file operation table.  Apart from @owner, @mmap_supported_flags and
 * @may_pollfree every member is a function pointer.  The call_read_iter(),
 * call_write_iter() and call_mmap() wrappers invoke ->read_iter,
 * ->write_iter and ->mmap through file->f_op without checking for NULL, so
 * users of those wrappers must already know the member is populated.
 *
 * @owner is the module reference taken and dropped by fops_get()/fops_put().
 */
struct file_operations {
        struct module *owner;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, bool spin);
        /* Directory iteration; both variants receive a struct dir_context. */
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        unsigned long mmap_supported_flags;
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        int (*setlease)(struct file *, long, struct file_lock **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
        /* Only present in !CONFIG_MMU builds. */
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        /* @remap_flags takes the REMAP_FILE_* values; len == 0 means "to end of source file". */
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        /* NOTE(review): plain flag, not a callback; semantics are not visible here -- confirm with its users. */
        bool may_pollfree;
} __randomize_layout;

/*
 * Per-inode operation table; every member is a function pointer.  The
 * structure is declared ____cacheline_aligned.
 *
 * NOTE(review): most parameters are unnamed.  For the namespace operations
 * (create, link, unlink, symlink, mkdir, rmdir, mknod, rename) the argument
 * lists mirror the vfs_*() helpers declared earlier, minus the trailing
 * delegated-inode pointer -- confirm details in
 * Documentation/filesystems/vfs.rst.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);

        int (*readlink) (struct dentry *, char __user *,int);

        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct inode *,struct dentry *,const char *);
        int (*mkdir) (struct inode *,struct dentry *,umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct dentry *, struct iattr *);
        int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        /* Same signature as generic_update_time() and inode_update_time(). */
        int (*update_time)(struct inode *, struct timespec64 *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
        int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
                                     struct iov_iter *iter)
{
        return file->f_op->read_iter(kio, iter);
}

static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
                                      struct iov_iter *iter)
{
        return file->f_op->write_iter(kio, iter);
}

static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
        return file->f_op->mmap(file, vma);
}

/*
 * Read/write and range copy/clone/dedupe entry points (prototypes only).
 *
 * The copy-style functions take a size_t length and return ssize_t; the
 * clone/dedupe ("remap") functions take a loff_t length plus @remap_flags
 * (presumably the REMAP_FILE_* values) and return loff_t.
 * generic_remap_file_range_prep() takes the length by pointer.
 */
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
                                       struct file *file_out, loff_t pos_out,
                                       size_t len, unsigned int flags);
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                         struct file *file_out, loff_t pos_out,
                                         loff_t *count,
                                         unsigned int remap_flags);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t len, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);


/*
 * Per-superblock operation table; every member is a function pointer.
 * The quota hooks exist only when CONFIG_QUOTA is enabled.
 *
 * NOTE(review): individual method contracts are not visible in this header
 * section; see Documentation/filesystems/vfs.rst before relying on them.
 */
struct super_operations {
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_super) (struct super_block *);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        /* seq_file based reporting hooks. */
        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot **(*get_dquots)(struct inode *);
#endif
        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
        /* Both cached-object hooks take a struct shrink_control. */
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
};

/*
 * Inode flags - they have no relation to superblock flags now
 *
 * These are the bits tested in inode->i_flags by the IS_*() macros.
 */
#define S_SYNC                (1 << 0)  /* Writes are synced at once */
#define S_NOATIME        (1 << 1)  /* Do not update access times */
#define S_APPEND        (1 << 2)  /* Append-only file */
#define S_IMMUTABLE        (1 << 3)  /* Immutable file */
#define S_DEAD                (1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA        (1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC        (1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME        (1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE        (1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE        (1 << 9)  /* Inode is fs-internal */
#define S_IMA                (1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT        (1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC                (1 << 12) /* no suid or xattr security attributes */
/*
 * Without CONFIG_FS_DAX the flag is 0, so "i_flags & S_DAX" (IS_DAX()) is a
 * compile-time zero.
 */
#ifdef CONFIG_FS_DAX
#define S_DAX                (1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX                0          /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        (1 << 15) /* Casefolded file */
#define S_VERITY        (1 << 16) /* Verity file (using fs/verity/) */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags while it is
 * mounted with files in use.  This means that not all of the inodes will
 * have their i_flags updated.  Hence, i_flags no longer inherit the
 * superblock mount flags, so these have to be checked separately.
 * -- rmk@arm.uk.linux.org
 *
 * __IS_FLG() tests superblock flag bits @flg via @inode's i_sb.  The result
 * is the masked s_flags value, not a normalised 0/1.
 */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* Report whether SB_RDONLY is set in @sb's s_flags. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/*
 * Tests that consult the superblock flags (through sb_rdonly()/__IS_FLG()).
 * IS_SYNC() and IS_DIRSYNC() are true if either the superblock flag or the
 * matching per-inode i_flags bit is set; IS_NOATIME() as written here looks
 * at the superblock flags only (SB_RDONLY or SB_NOATIME).
 */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

/*
 * Tests on the per-inode i_flags bits (S_*), plus IS_POSIXACL() which is a
 * superblock-flag test.  These evaluate to the masked bit, not to 0/1.
 */
#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * True when @inode is a whiteout: a character device (S_ISCHR on i_mode)
 * whose i_rdev equals WHITEOUT_DEV.  Both uses of @inode are parenthesised
 * so that an expression argument such as "p + 1" expands correctly.  @inode
 * is evaluated twice, so it must be free of side effects.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)

/*
 * True if @inode's i_uid fails uid_valid() or its i_gid fails gid_valid().
 * The gid is only examined when the uid is valid.
 */
static inline bool HAS_UNMAPPED_ID(struct inode *inode)
{
        if (!uid_valid(inode->i_uid))
                return true;

        return !gid_valid(inode->i_gid);
}

/*
 * Pick the write hint for @file: the file's own f_write_hint when one has
 * been set, otherwise the i_write_hint of the backing inode.
 */
static inline enum rw_hint file_write_hint(struct file *file)
{
        if (file->f_write_hint == WRITE_LIFE_NOT_SET)
                return file_inode(file)->i_write_hint;

        return file->f_write_hint;
}

/*
 * Forward declaration: init_sync_kiocb() calls iocb_flags(), whose
 * definition is not in this header section.
 */
static inline int iocb_flags(struct file *file);

static inline u16 ki_hint_validate(enum rw_hint hint)
{
        typeof(((struct kiocb *)0)->ki_hint) max_hint = -1;

        if (hint <= max_hint)
                return hint;
        return 0;
}

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = iocb_flags(filp),
                .ki_hint = ki_hint_validate(file_write_hint(filp)),
                .ki_ioprio = get_current_ioprio(),
        };
}

/*
 * Initialise @kiocb as a copy of @kiocb_src bound to a different file.
 * Flags, hint, ioprio and position are taken from @kiocb_src; ki_filp is
 * set to @filp.  Members not named in the compound literal are zeroed
 * rather than copied.
 */
static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_hint = kiocb_src->ki_hint,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync().  i_atime is the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending. We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        mtime has changed in it.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_DIO_WAKEUP                Never set.  Only used as a key for wait_on_bit().
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                        inode between dirty lists.
 *
 * I_LRU_ISOLATING        Inode is pinned being isolated from LRU without holding
 *                        i_count.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 */
/*
 * The __I_* names are bit numbers; the matching I_* names are the masks
 * built from them.  Bits 12 and 18 are not assigned in this list.
 */
#define I_DIRTY_SYNC                (1 << 0)
#define I_DIRTY_DATASYNC        (1 << 1)
#define I_DIRTY_PAGES                (1 << 2)
#define __I_NEW                        3
#define I_NEW                        (1 << __I_NEW)
#define I_WILL_FREE                (1 << 4)
#define I_FREEING                (1 << 5)
#define I_CLEAR                        (1 << 6)
#define __I_SYNC                7
#define I_SYNC                        (1 << __I_SYNC)
#define I_REFERENCED                (1 << 8)
#define __I_DIO_WAKEUP                9
#define I_DIO_WAKEUP                (1 << __I_DIO_WAKEUP)
#define I_LINKABLE                (1 << 10)
#define I_DIRTY_TIME                (1 << 11)
#define I_WB_SWITCH                (1 << 13)
#define I_OVL_INUSE                (1 << 14)
#define I_CREATING                (1 << 15)
#define I_DONTCACHE                (1 << 16)
#define I_SYNC_QUEUED                (1 << 17)
#define __I_LRU_ISOLATING        19
#define I_LRU_ISOLATING                (1 << __I_LRU_ISOLATING)

/*
 * Composite masks:
 *   I_DIRTY_INODE - the inode itself is dirty (SYNC or DATASYNC)
 *   I_DIRTY       - I_DIRTY_INODE plus dirty pages
 *   I_DIRTY_ALL   - I_DIRTY plus I_DIRTY_TIME
 */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

/* Worker taking the inode and a mask of I_DIRTY_* bits; defined elsewhere. */
extern void __mark_inode_dirty(struct inode *, int);

/*
 * Mark @inode dirty with the full I_DIRTY mask
 * (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES).
 */
static inline void mark_inode_dirty(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY);
}

/* Mark @inode dirty with I_DIRTY_SYNC only. */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY_SYNC);
}

/*
 * Link-count manipulation helpers (prototypes only).  inode_inc_link_count()
 * and inode_dec_link_count() wrap inc_nlink()/drop_nlink() and additionally
 * mark the inode dirty.
 */
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/* Bump @inode's link count via inc_nlink(), then mark the inode dirty. */
static inline void inode_inc_link_count(struct inode *inode)
{
        inc_nlink(inode);
        mark_inode_dirty(inode);
}

/* Drop @inode's link count via drop_nlink(), then mark the inode dirty. */
static inline void inode_dec_link_count(struct inode *inode)
{
        drop_nlink(inode);
        mark_inode_dirty(inode);
}

/*
 * Bit flags naming which timestamps (or the version counter) to touch; the
 * values are distinct single bits so they can be OR-ed together.
 */
enum file_time_flags {
        S_ATIME   = 1 << 0,
        S_MTIME   = 1 << 1,
        S_CTIME   = 1 << 2,
        S_VERSION = 1 << 3,
};

/*
 * Timestamp helpers (prototypes only).  inode_update_time() shares its
 * (inode, time, flags) signature with ->update_time in struct
 * inode_operations; @flags presumably takes enum file_time_flags bits.
 */
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);

/*
 * Record an access to @file: calls touch_atime() on the file's path unless
 * the file was opened with O_NOATIME.
 */
static inline void file_accessed(struct file *file)
{
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

/* Prototypes only; the definitions are not part of this header section. */
extern int file_modified(struct file *file);

int sync_inode(struct inode *inode, struct writeback_control *wbc);
int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Description of a filesystem type, as passed to register_filesystem() and
 * unregister_filesystem().
 *
 * The FS_* bit values for @fs_flags are #defined inline, directly after the
 * member.  The trailing lock_class_key members are lock-class keys; their
 * names (s_lock, s_umount, s_vfs_rename, s_writers[], i_lock, i_mutex,
 * i_mutex_dir) presumably indicate which lock each one classifies.
 */
struct file_system_type {
        const char *name;
        int fs_flags;
#define FS_REQUIRES_DEV                1 
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_THP_SUPPORT                8192        /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        /* NOTE(review): arguments look like (type, flags, dev_name, data), cf. mount_bdev(). */
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
        struct hlist_head fs_supers;

        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        /* One key per freeze level. */
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key i_mutex_dir_key;
};

/*
 * Declare a module alias of the form "fs-<NAME>".  @NAME must be a string
 * literal, since it is pasted after "fs-" by literal concatenation.
 */
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/*
 * Mount helpers (prototypes only).  mount_bdev(), mount_single() and
 * mount_nodev() each take a @fill_super callback with the signature
 * (struct super_block *, void *, int).
 */
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
/* Superblock shutdown / teardown helpers (prototypes only). */
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
/* Anonymous-device helpers (prototypes only). */
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
/*
 * Superblock lookup/creation with caller-supplied @test and @set callbacks;
 * sget_fc() passes them a struct fs_context, sget() an opaque void *.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);

/*
 * Alas, no aliases. Too much hassle with bringing module.h everywhere
 *
 * fops_get(): evaluates to @fops when it is non-NULL and try_module_get()
 * on its owner succeeds, otherwise to NULL.
 * fops_put(): drops the owner's module reference if @fops is non-NULL.
 *
 * Both macros evaluate @fops more than once, so pass an expression without
 * side effects.
 */
#define fops_get(fops) \
        (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
#define fops_put(fops) \
        do { if (fops) module_put((fops)->owner); } while(0)
/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * @f is evaluated once (captured in __file).  The module reference of the
 * old f_op is dropped first, then @fops is installed; the assignment sits
 * inside BUG_ON(), which fires if @fops is NULL, so this relies on BUG_ON()
 * always evaluating its argument.
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

/* Filesystem type registration (prototypes only). */
extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
/* Kernel-internal mount handling (prototypes only). */
extern struct vfsmount *kern_mount(struct file_system_type *);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(const char *, const char __user *,
                     const char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(const struct path *);
extern void drop_collected_mounts(struct vfsmount *);
/* Takes a callback plus an opaque void * argument and a starting vfsmount. */
extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
/* statfs variants keyed by path, user-space pathname and file descriptor. */
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
/* printf-style: @fmt is argument 2, the variadic arguments start at 3. */
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

extern int current_umask(void);

/*
 * Inode reference helpers (prototypes only).  try_break_deleg() takes a
 * reference with ihold() that break_deleg_wait() later drops with iput().
 */
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
/* Same signature as ->update_time in struct inode_operations. */
extern int generic_update_time(struct inode *, struct timespec64 *, int);

/* /sys/fs */
extern struct kobject *fs_kobj;

/*
 * INT_MAX with the bits outside PAGE_MASK cleared.  Assuming PAGE_MASK
 * clears the in-page offset bits, this is the largest page-aligned value
 * that still fits in an int.
 */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

#ifdef CONFIG_MANDATORY_FILE_LOCKING
/*
 * Mandatory-locking workers, used by locks_verify_locked() and
 * locks_verify_truncate().  The two loff_t arguments of
 * locks_mandatory_area() are the inclusive start and end of the range; the
 * final argument is the lock type (callers here pass F_WRLCK).
 */
extern int locks_mandatory_locked(struct file *);
extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);

/*
 * An inode is a candidate for mandatory locking when its mode has the
 * setgid bit set but the group-execute bit clear -- a combination that
 * otherwise carries no meaning.  Returns 1 for a candidate, 0 otherwise.
 */

static inline int __mandatory_lock(struct inode *ino)
{
        if (ino->i_mode & S_IXGRP)
                return 0;

        return (ino->i_mode & S_ISGID) != 0;
}

/*
 * A candidate inode only gets mandatory locking when its filesystem is
 * mounted with SB_MANDLOCK; without that the locks remain advisory.
 * Returns 1 when both conditions hold, 0 otherwise.
 */

static inline int mandatory_lock(struct inode *ino)
{
        if (!IS_MANDLOCK(ino))
                return 0;

        return __mandatory_lock(ino) != 0;
}

/*
 * Returns 0 straight away when mandatory locking does not apply to @file's
 * inode; otherwise returns the result of locks_mandatory_locked().
 */
static inline int locks_verify_locked(struct file *file)
{
        if (!mandatory_lock(locks_inode(file)))
                return 0;

        return locks_mandatory_locked(file);
}

/*
 * Check a size change of @inode to @size against mandatory locks.
 *
 * Returns 0 without further checks when the inode has no lock context or
 * mandatory locking does not apply.  Otherwise the byte range between the
 * old and new size is passed to locks_mandatory_area() as a write lock
 * request:
 *   shrinking (size < i_size):  [size, i_size - 1]
 *   otherwise:                  [i_size, size - 1]
 */
static inline int locks_verify_truncate(struct inode *inode,
                                    struct file *f,
                                    loff_t size)
{
        loff_t first, last;

        if (!inode->i_flctx || !mandatory_lock(inode))
                return 0;

        if (size < inode->i_size) {
                first = size;
                last = inode->i_size - 1;
        } else {
                first = inode->i_size;
                last = size - 1;
        }

        return locks_mandatory_area(inode, f, first, last, F_WRLCK);
}

#else /* !CONFIG_MANDATORY_FILE_LOCKING */

/*
 * Stubs used when CONFIG_MANDATORY_FILE_LOCKING is disabled: every check
 * reports "no mandatory lock / no conflict" by returning 0.
 */
static inline int locks_mandatory_locked(struct file *file)
{
        return 0;
}

static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
                                       loff_t start, loff_t end, unsigned char type)
{
        return 0;
}

static inline int __mandatory_lock(struct inode *inode)
{
        return 0;
}

static inline int mandatory_lock(struct inode *inode)
{
        return 0;
}

static inline int locks_verify_locked(struct file *file)
{
        return 0;
}

/*
 * Stub for kernels without CONFIG_MANDATORY_FILE_LOCKING: a truncate never
 * conflicts with a mandatory lock, so this always returns 0.
 *
 * @size is a loff_t, matching the CONFIG_MANDATORY_FILE_LOCKING variant, so
 * that both configurations share one signature and a file size is not
 * narrowed to size_t on its way in.
 */
static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
                                        loff_t size)
{
        return 0;
}

#endif /* CONFIG_MANDATORY_FILE_LOCKING */


#ifdef CONFIG_FILE_LOCKING
/*
 * Break any lease on @inode.  Returns 0 when the inode has no lock context
 * or its lease list is empty, otherwise the result of __break_lease() with
 * FL_LEASE.
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        /*
         * The check below is lockless.  The barrier makes sure any
         * refcounts taken by the caller are visible before we look at
         * i_flctx->flc_lease; without it we could race with a task that is
         * setting a new lease on this file.
         */
        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_LEASE);
}

/*
 * Break any delegation on @inode.  Returns 0 when the inode has no lock
 * context or its lease list is empty, otherwise the result of
 * __break_lease() with FL_DELEG.
 */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        /*
         * The check below is lockless.  The barrier makes sure any
         * refcounts taken by the caller are visible before we look at
         * i_flctx->flc_lease; without it we could race with a task that is
         * setting a new lease on this file.
         */
        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_DELEG);
}

/*
 * Attempt a non-blocking delegation break on @inode.
 *
 * If the break would block (-EWOULDBLOCK) and the caller supplied
 * @delegated_inode, the inode is stored there with an extra reference taken
 * via ihold().  The break_deleg() result is returned in every case.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        int err = break_deleg(inode, O_WRONLY|O_NONBLOCK);

        if (err != -EWOULDBLOCK || !delegated_inode)
                return err;

        *delegated_inode = inode;
        ihold(inode);
        return err;
}

/*
 * Break delegations on *@delegated_inode with mode O_WRONLY (no
 * O_NONBLOCK), then drop one inode reference with iput() and reset the
 * caller's pointer to NULL.  Returns the break_deleg() result.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        struct inode *inode = *delegated_inode;
        int ret = break_deleg(inode, O_WRONLY);

        iput(inode);
        *delegated_inode = NULL;
        return ret;
}

/*
 * Break FL_LAYOUT leases on @inode.  The mode handed to __break_lease() is
 * O_WRONLY, with O_NONBLOCK added when @wait is false.  Returns 0 when the
 * inode has no lock context or its flc_lease list is empty.
 */
static inline int break_layout(struct inode *inode, bool wait)
{
        /* Full barrier ahead of the lockless look at flc_lease. */
        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;
        if (wait)
                return __break_lease(inode, O_WRONLY, FL_LAYOUT);
        return __break_lease(inode, O_WRONLY | O_NONBLOCK, FL_LAYOUT);
}

#else /* !CONFIG_FILE_LOCKING */
/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        return 0;
}

/*
 * !CONFIG_FILE_LOCKING stub: always returns 0 and never writes to
 * @delegated_inode.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        return 0;
}

/*
 * !CONFIG_FILE_LOCKING stub: hits BUG() if it is ever called.
 * NOTE(review): presumably unreachable in this configuration because the
 * try_break_deleg() stub never hands back an inode to wait on -- confirm
 * against callers.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        BUG();
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int break_layout(struct inode *inode, bool wait)
{
        return 0;
}

#endif /* CONFIG_FILE_LOCKING */

/* fs/open.c */
struct audit_names;
/*
 * Object returned by the getname*() helpers and released with putname()
 * (see the prototypes further down in this header).
 *
 * iname[] is a trailing flexible array member; the static_assert below
 * requires its offset within the structure to be a multiple of
 * sizeof(long).
 */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        int                        refcnt;
        struct audit_names        *aname;
        const char                iname[];
};
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
                       struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
                        umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(struct dentry *, struct vfsmount *,
                                   const char *, int, umode_t);
extern struct file * dentry_open(const struct path *, int, const struct cred *);
extern struct file *dentry_create(const struct path *path, int flags,
                                  umode_t mode, const struct cred *cred);
extern struct file * open_with_fake_path(const struct path *, int,
                                         struct inode*, const struct cred *);
/*
 * Open another struct file through dentry_open(), reusing @file's path
 * (f_path), open flags (f_flags) and credentials (f_cred).  Returns
 * whatever dentry_open() returns.
 */
static inline struct file *file_clone_open(struct file *file)
{
        return dentry_open(&file->f_path, file->f_flags, file->f_cred);
}
extern int filp_close(struct file *, fl_owner_t id);

extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);

extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

extern struct kmem_cache *names_cachep;

/*
 * Allocate / free one object from the names_cachep slab cache.
 * __getname() allocates with GFP_KERNEL; __putname() casts its argument
 * to void * before handing it to kmem_cache_free().
 */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
/*
 * True only when CONFIG_BLOCK is enabled and @sb is blockdev_superblock.
 * The IS_ENABLED() test comes first, so the pointer comparison is
 * short-circuited away when CONFIG_BLOCK is off.
 */
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
}

void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
/*
 * NOTE(review): presumably the exclusive upper bound on char major
 * numbers -- confirm in fs/char_dev.c.
 */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/*
 * Marks the top and bottom of the second segment of free char majors.
 * START is the top (511) and END the bottom (384), so START > END.
 */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/*
 * Convenience form of __register_chrdev(): registers @major under @name
 * with @fops, using base minor 0 and a count of 256 minors.  The return
 * value of __register_chrdev() is passed through.
 */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/*
 * Counterpart of register_chrdev(): calls __unregister_chrdev() for
 * @major/@name with base minor 0 and a count of 256 minors.
 */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

void invalidate_mapping_pagevec(struct address_space *mapping,
                                pgoff_t start, pgoff_t end,
                                unsigned long *nr_pagevec);

/*
 * Call invalidate_mapping_pages() on @inode's mapping with start 0 and
 * end -1, but only for regular files, directories and symlinks; any other
 * inode type is left untouched.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
extern int invalidate_inode_pages2(struct address_space *mapping);
extern int invalidate_inode_pages2_range(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end);
extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);

/*
 * Whole-file form of filemap_fdatawait_range(): covers the byte range
 * 0 .. LLONG_MAX of @mapping and returns that function's result.
 */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
}

extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
                                  loff_t lend);
extern int filemap_write_and_wait_range(struct address_space *mapping,
                                        loff_t lstart, loff_t lend);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end, int sync_mode);
extern int filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
extern void __filemap_set_wb_err(struct address_space *mapping, int err);

/*
 * Whole-file form of filemap_write_and_wait_range(): covers the byte range
 * 0 .. LLONG_MAX of @mapping and returns that function's result.
 */
static inline int filemap_write_and_wait(struct address_space *mapping)
{
        return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
}

extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);

/*
 * Whole-file form of file_write_and_wait_range(): covers the byte range
 * 0 .. LLONG_MAX of @file and returns that function's result.
 */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 *
 * A zero @err is a no-op: only non-zero values are passed on to
 * __filemap_set_wb_err().
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* Fastpath for common case of no error */
        if (unlikely(err))
                __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 *
 * Return: the result of errseq_check() on @mapping->wb_err against @since.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                        errseq_t since)
{
        return errseq_check(&mapping->wb_err, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 *
 * Return: the result of errseq_sample() on @mapping->wb_err.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        return errseq_sample(&mapping->wb_err);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.  The superblock is reached through the file's dentry
 * (f_path.dentry->d_sb).
 *
 * Return: the result of errseq_sample() on that superblock's s_wb_err.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

/*
 * Flush the bytes just written when the kiocb carries IOCB_DSYNC.
 *
 * ki_pos is expected to have been advanced past the write already, so the
 * range handed to vfs_fsync_range() is [ki_pos - count, ki_pos - 1].  The
 * datasync argument is 1 unless IOCB_SYNC is set as well.  Returns @count
 * when no sync was needed or the sync succeeded, otherwise the error from
 * vfs_fsync_range().
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        int datasync;
        int err;

        if (!(iocb->ki_flags & IOCB_DSYNC))
                return count;

        datasync = (iocb->ki_flags & IOCB_SYNC) ? 0 : 1;
        err = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count,
                              iocb->ki_pos - 1, datasync);
        if (err)
                return err;

        return count;
}

extern void emergency_sync(void);
extern void emergency_remount(void);

#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
/* !CONFIG_BLOCK stub: always fails with -EINVAL and leaves @block alone. */
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

extern int notify_change(struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
extern int generic_permission(struct inode *, int);
/* inode_permission() applied to the inode behind @file (file_inode()). */
static inline int file_permission(struct file *file, int mask)
{
        return inode_permission(file_inode(file), mask);
}
/* inode_permission() applied to the inode of @path's dentry (d_inode()). */
static inline int path_permission(const struct path *path, int mask)
{
        return inode_permission(d_inode(path->dentry), mask);
}
extern int __check_sticky(struct inode *dir, struct inode *inode);

static inline bool execute_ok(struct inode *inode)
{
        return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
}

/*
 * True when the file-type bits (S_IFMT) of @inode's mode differ from the
 * file-type bits of @mode.
 */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode & S_IFMT) != (mode & S_IFMT);
}

/*
 * Call sb_start_write() on the superblock of @file's inode.  Only regular
 * files are affected; for every other inode type this does nothing.
 */
static inline void file_start_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                sb_start_write(file_inode(file)->i_sb);
}

/*
 * Trylock form of file_start_write(): for regular files the result of
 * sb_start_write_trylock() is returned; for every other inode type the
 * function returns true without calling it.
 */
static inline bool file_start_write_trylock(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                return sb_start_write_trylock(file_inode(file)->i_sb);
        return true;
}

/*
 * Call __sb_end_write() with SB_FREEZE_WRITE on the superblock of @file's
 * inode.  Only regular files are affected; other inode types are a no-op.
 */
static inline void file_end_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}

/*
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * This is used for regular files.
 * We cannot support write (and maybe mmap read-write shared) accesses and
 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 * can have the following values:
 * 0: no writers, no VM_DENYWRITE mappings
 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 * > 0: (i_writecount) users are writing to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Increment i_writecount unless it is negative.  Returns 0 when the
 * increment happened and -ETXTBSY when it was refused.
 */
static inline int get_write_access(struct inode *inode)
{
        if (!atomic_inc_unless_negative(&inode->i_writecount))
                return -ETXTBSY;
        return 0;
}
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);
        return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
}
/* Unconditionally decrement @inode's i_writecount. */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Increment i_writecount of @file's inode.  A NULL @file is accepted and
 * ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;
        atomic_inc(&file_inode(file)->i_writecount);
}
/* True when @inode's i_writecount is currently greater than zero. */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        return atomic_read(&inode->i_writecount) > 0;
}

#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/* Decrement @inode's i_readcount; BUG if the counter reads as zero. */
static inline void i_readcount_dec(struct inode *inode)
{
        BUG_ON(atomic_read(&inode->i_readcount) == 0);
        atomic_dec(&inode->i_readcount);
}
/* Increment @inode's i_readcount. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
/* No-op when neither CONFIG_IMA nor CONFIG_FILE_LOCKING is enabled. */
static inline void i_readcount_dec(struct inode *inode)
{
        return;
}
/* No-op when neither CONFIG_IMA nor CONFIG_FILE_LOCKING is enabled. */
static inline void i_readcount_inc(struct inode *inode)
{
        return;
}
#endif
extern int do_pipe_flags(int *, int);

extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
 
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

extern char *file_path(struct file *, char *, int);

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int generic_delete_inode(struct inode *inode);
/*
 * Returns 1 when @inode has no links left (i_nlink == 0) or
 * inode_unhashed() reports it as unhashed, and 0 otherwise.
 */
static inline int generic_drop_inode(struct inode *inode)
{
        if (!inode->i_nlink)
                return 1;
        return !!inode_unhashed(inode);
}
extern void d_mark_dontcache(struct inode *inode);

extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
/* No-op when CONFIG_DEBUG_LOCK_ALLOC is not enabled. */
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);

/*
 * Userspace may rely on the inode number being non-zero.  For example,
 * glibc simply ignores files with zero i_ino in unlink() and other places.
 *
 * There is a further complication: userspace compiled with
 * _FILE_OFFSET_BITS=32 on a 64-bit kernel only reads out the lower 32 bits,
 * so those bits are what must be checked for zero.  With
 * _FILE_OFFSET_BITS=64 this may produce some harmless false-negatives, but
 * better safe than sorry.
 */
static inline bool is_zero_ino(ino_t ino)
{
        return !(u32)ino;
}

extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct inode *);
extern int file_remove_privs(struct file *);

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/* __insert_inode_hash() with the inode's own i_ino as the hash value. */
static inline void insert_inode_hash(struct inode *inode)
{
        __insert_inode_hash(inode, inode->i_ino);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Call __remove_inode_hash() on @inode, unless the inode is already
 * unhashed or hlist_fake() reports its i_hash node as a fake entry.
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode) || hlist_fake(&inode->i_hash))
                return;
        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);

extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
                struct iov_iter *to, ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/block_dev.c */
extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                        int datasync);
extern void block_sync_page(struct page *page);

/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);
extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
                struct file *out, loff_t *, size_t len, unsigned int flags);
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                loff_t *opos, size_t len, unsigned int flags);


extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
/*
 * Function type of the submit_io callback accepted by
 * __blockdev_direct_IO(): receives a bio, an inode and a file offset.
 */
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/*
 * Flag bits; blockdev_direct_IO() passes them as the flags argument of
 * __blockdev_direct_IO().
 */
enum {
        /* need locking between buffered and direct access */
        DIO_LOCKING        = 0x01,

        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES        = 0x02,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io, dio_submit_t submit_io,
                             int flags);

/*
 * Convenience form of __blockdev_direct_IO(): targets the block device of
 * the inode's superblock (i_sb->s_bdev), passes NULL for both the end_io
 * and submit_io callbacks, and sets DIO_LOCKING | DIO_SKIP_HOLES.
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        const int dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;

        return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
                                    get_block, NULL, NULL, dio_flags);
}
#endif

void inode_dio_wait(struct inode *inode);

/*
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Increments the inode's i_dio_count.  The matching inode_dio_end()
 * decrements it again and wakes up waiters once it drops to zero.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/*
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Called once a direct I/O request has been processed.  Decrements
 * i_dio_count and, when it reaches zero, wakes up callers waiting on the
 * __I_DIO_WAKEUP bit of i_state for direct I/O to be quiesced.
 */
static inline void inode_dio_end(struct inode *inode)
{
        if (!atomic_dec_and_test(&inode->i_dio_count))
                return;
        wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
void dio_warn_stale_pagecache(struct file *filp);

extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/*
 * True for character devices, block devices, FIFOs and sockets.  The
 * argument is expanded four times, so it must not have side effects.
 */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link_raw(struct dentry *, struct inode *,
                                     struct delayed_call *);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int __page_symlink(struct inode *inode, const char *symname, int len,
                int nofs);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
extern void generic_fillattr(struct inode *, struct kstat *);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

extern int iterate_dir(struct file *, struct dir_context *);

int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/* vfs_fstatat() on @filename relative to AT_FDCWD, with no flags set. */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, filename, stat, 0);
}
/* vfs_fstatat() on @name relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW. */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
}

extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern struct super_block *get_super(struct block_device *);
extern struct super_block *get_super_thawed(struct block_device *);
extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev);
extern struct super_block *get_active_super(struct block_device *bdev);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);

extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct dentry *, struct iattr *);
extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
extern int simple_rename(struct inode *, struct dentry *,
                         struct inode *, struct dentry *, unsigned int);
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern int noop_set_page_dirty(struct page *page);
extern void noop_invalidatepage(struct page *page, unsigned int offset,
                unsigned int length);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_readpage(struct file *file, struct page *page);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata);
extern int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata);
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
extern const struct dentry_operations simple_dentry_operations;

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/*
 * Type pointed to by the last argument of simple_fill_super(): a name, a
 * file_operations pointer and a mode.
 */
struct tree_descr { const char *name; const struct file_operations *ops; int mode; };
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);

extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

#ifdef CONFIG_UNICODE
extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                                const char *str, const struct qstr *name);
#endif

#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *,
                                enum migrate_mode);
extern int buffer_migrate_page_norefs(struct address_space *,
                                struct page *, struct page *,
                                enum migrate_mode);
#else
/* Without CONFIG_MIGRATION both names simply expand to NULL. */
#define buffer_migrate_page NULL
#define buffer_migrate_page_norefs NULL
#endif

extern int setattr_prepare(struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
extern void setattr_copy(struct inode *inode, const struct iattr *attr);

extern int file_update_time(struct file *file);

static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}

/*
 * Report whether @vma is a filesystem-DAX mapping: it must have a backing
 * file, pass vma_is_dax(), and its inode must not be a character device.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* A character-device inode means device-dax rather than fs-dax. */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Build the IOCB_* flag set implied by the state of @file:
 *   O_APPEND                         -> IOCB_APPEND
 *   O_DIRECT                         -> IOCB_DIRECT
 *   O_DSYNC, or IS_SYNC() host inode -> IOCB_DSYNC
 *   __O_SYNC                         -> IOCB_SYNC
 */
static inline int iocb_flags(struct file *file)
{
        int iocb = 0;

        iocb |= (file->f_flags & O_APPEND) ? IOCB_APPEND : 0;
        iocb |= (file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0;

        /* The inode is only consulted when O_DSYNC itself is not set. */
        if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
                iocb |= IOCB_DSYNC;

        iocb |= (file->f_flags & __O_SYNC) ? IOCB_SYNC : 0;

        return iocb;
}

/*
 * Fold per-call RWF_* @flags into @ki->ki_flags.
 *
 * Returns 0 on success (including when @flags is 0, in which case @ki is
 * left untouched), or -EOPNOTSUPP when @flags contains bits outside
 * RWF_SUPPORTED or when RWF_NOWAIT is requested on a file that lacks
 * FMODE_NOWAIT.  On error @ki->ki_flags is not modified.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
        int extra;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;

        /* The supported RWF_* bits are copied into the kiocb flags as-is. */
        extra = (__force int) (flags & RWF_SUPPORTED);

        if (flags & RWF_NOWAIT) {
                if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                        return -EOPNOTSUPP;
                /* RWF_NOWAIT additionally sets IOCB_NOIO. */
                extra |= IOCB_NOIO;
        }

        /* RWF_SYNC additionally sets IOCB_DSYNC. */
        if (flags & RWF_SYNC)
                extra |= IOCB_DSYNC;

        ki->ki_flags |= extra;
        return 0;
}

/*
 * Return the inode number of the parent of @dentry.
 *
 * The d_parent->d_inode->i_ino chain is read while holding
 * @dentry->d_lock, which is taken and released inside this function.
 */
static inline ino_t parent_ino(struct dentry *dentry)
{
        ino_t res;

        /*
         * Don't strictly need d_lock here? If the parent ino could change
         * then surely we'd have a deeper race in the caller?
         */
        spin_lock(&dentry->d_lock);
        res = dentry->d_parent->d_inode->i_ino;
        spin_unlock(&dentry->d_lock);
        return res;
}

/* Transaction based IO helpers */

/*
 * An argresp is stored in an allocated page and holds the
 * size of the argument or response, along with its content
 */
struct simple_transaction_argresp {
        /* size of the argument or response held in data[] */
        ssize_t size;
        /* the argument or response content (flexible array member) */
        char data[];
};

/* Bytes remaining in one page once the argresp header is accounted for. */
#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * simple attribute files
 *
 * These attributes behave similarly to those in sysfs:
 *
 * Writing to an attribute immediately sets a value, an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that is accessed with the get() and set() functions.
 */
/*
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED - generate the file_operations for a
 * simple attribute.
 *
 * Expands to two definitions:
 *   - a static <__fops>_open() that passes __fmt together with a 0ull
 *     argument to __simple_attr_check_format() (compile-time format check
 *     only) and then calls simple_attr_open() with __get, __set and __fmt;
 *   - a static const struct file_operations named __fops that wires up
 *     that open handler together with simple_attr_release,
 *     simple_attr_read and generic_file_llseek.
 *
 * __is_signed selects the write handler: simple_attr_write_signed when
 * true, simple_attr_write otherwise.
 *
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)        \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        __simple_attr_check_format(__fmt, 0ull);                        \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                        \
static const struct file_operations __fops = {                                \
        .owner         = THIS_MODULE,                                                \
        .open         = __fops ## _open,                                        \
        .release = simple_attr_release,                                        \
        .read         = simple_attr_read,                                        \
        .write         = (__is_signed) ? simple_attr_write_signed : simple_attr_write,        \
        .llseek         = generic_file_llseek,                                        \
}

/* Simple attribute whose write handler is simple_attr_write (__is_signed = false). */
#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

/* Simple attribute whose write handler is simple_attr_write_signed (__is_signed = true). */
#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Compile-time helper: the __printf(1, 2) annotation makes the compiler
 * check @fmt against the variadic arguments.  The body is intentionally
 * empty, so a call has no runtime effect.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
        /* don't do anything, just let the compiler check the arguments; */
}

int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
int simple_attr_release(struct inode *inode, struct file *file);
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_dentry(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_inodes(struct ctl_table *table, int write,
                   void *buffer, size_t *lenp, loff_t *ppos);
int __init get_filesystem_list(char *buf);

#define __FMODE_EXEC                ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY        ((__force int) FMODE_NONOTIFY)

#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
/*
 * Convert open flags to an fmode_t value: the access-mode bits are
 * computed as ((flag) + 1) & O_ACCMODE, and the __FMODE_NONOTIFY bit of
 * @flag is carried over unchanged.
 *
 * @flag is parenthesized at every use so that an expression argument
 * (e.g. a conditional expression) keeps its intended precedence.  Note
 * that @flag is evaluated twice, so it must not have side effects.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)((((flag) + 1) & O_ACCMODE) | \
                                            ((flag) & __FMODE_NONOTIFY)))

/* True when @mode has the set-user-ID or the set-group-ID bit set. */
static inline bool is_sxid(umode_t mode)
{
        return (mode & S_ISUID) || (mode & S_ISGID);
}

/*
 * Sticky-bit check for @inode inside directory @dir.
 *
 * Returns 0 straight away when @dir does not have S_ISVTX set; otherwise
 * the decision is delegated to __check_sticky() and its result returned.
 */
static inline int check_sticky(struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(dir, inode);

        return 0;
}

/*
 * Set S_NOSEC in @inode->i_flags, but only when the inode is not
 * setuid/setgid (see is_sxid()) and its superblock has SB_NOSEC set.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

static inline bool is_root_inode(struct inode *inode)
{
        return inode == inode->i_sb->s_root->d_inode;
}

/*
 * Hand one directory entry to @ctx->actor at the current @ctx->pos.
 * Returns true when the actor returned 0, false otherwise.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        return !ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
}
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, ".", 1, ctx->pos,
                          file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0;
}
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, "..", 2, ctx->pos,
                          parent_ino(file->f_path.dentry), DT_DIR) == 0;
}
/*
 * Emit "." and ".." as needed for the current position:
 *   pos == 0: emit ".", then advance pos to 1
 *   pos == 1: emit "..", then advance pos to 2
 * Returns false as soon as one emit fails (pos is then left unchanged for
 * that entry), true otherwise, including when pos is already past both.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos == 0) {
                if (dir_emit_dot(file, ctx))
                        ctx->pos = 1;
                else
                        return false;
        }

        if (ctx->pos == 1) {
                if (dir_emit_dotdot(file, ctx))
                        ctx->pos = 2;
                else
                        return false;
        }

        return true;
}
static inline bool dir_relax(struct inode *inode)
{
        inode_unlock(inode);
        inode_lock(inode);
        return !IS_DEADDIR(inode);
}

static inline bool dir_relax_shared(struct inode *inode)
{
        inode_unlock_shared(inode);
        inode_lock_shared(inode);
        return !IS_DEADDIR(inode);
}

extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
                             unsigned int flags);

int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
                             struct fsxattr *fa);

/*
 * Initialise @fa so that only fsx_xflags carries a value: the whole
 * structure is zeroed first, then fsx_xflags is set to @xflags.
 */
static inline void simple_fill_fsxattr(struct fsxattr *fa, __u32 xflags)
{
        /* zero every byte of *fa before filling in the one field */
        memset(fa, 0, sizeof(*fa));
        fa->fsx_xflags = xflags;
}

/*
 * Flush file data before changing attributes.  Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
/*
 * Calls inode_dio_wait() on @inode, then filemap_write_and_wait() on its
 * mapping, and returns the latter's result.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        int err;

        inode_dio_wait(inode);
        err = filemap_write_and_wait(inode->i_mapping);

        return err;
}

#endif /* _LINUX_FS_H */
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FIB_LOOKUP_H
#define _FIB_LOOKUP_H

#include <linux/types.h>
#include <linux/list.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>

/*
 * One FIB alias entry.
 *
 * NOTE(review): field meanings below are inferred from names unless stated
 * otherwise; confirm against the users of this structure.
 */
struct fib_alias {
        struct hlist_node        fa_list;
        struct fib_info                *fa_info;
        u8                        fa_tos;
        u8                        fa_type;
        /* state bits; FA_S_ACCESSED is set via fib_alias_accessed() */
        u8                        fa_state;
        u8                        fa_slen;
        u32                        tb_id;
        s16                        fa_default;
        /* single-bit flags packed into one u8; 6 bits are spare */
        u8                        offload:1,
                                trap:1,
                                unused:6;
        struct rcu_head                rcu;
};

#define FA_S_ACCESSED        0x01

/* Don't write to fa_state unless needed, so that it stays shared across all CPUs */
/*
 * Mark @fa as accessed.  fa_state is read once and only written back
 * when FA_S_ACCESSED is not already set.
 */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
        u8 state = READ_ONCE(fa->fa_state);

        if (state & FA_S_ACCESSED)
                return;

        WRITE_ONCE(fa->fa_state, state | FA_S_ACCESSED);
}

/* Exported by fib_semantics.c */
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
                                 struct netlink_ext_ack *extack);
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
                 struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                  struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
               u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);

/*
 * Point @res at @fi: res->fi is set to @fi and res->nhc to the result
 * of fib_info_nhc(@fi, 0).
 */
static inline void fib_result_assign(struct fib_result *res,
                                     struct fib_info *fi)
{
        /* we used to play games with refcounts, but we now use RCU */
        res->nhc = fib_info_nhc(fi, 0);
        res->fi = fi;
}

/*
 * Element type of the fib_props[] table, which is declared with
 * RTN_MAX + 1 entries.
 * NOTE(review): presumably indexed by route type (RTN_*) - confirm.
 */
struct fib_prop {
        int        error;
        u8        scope;
};

extern const struct fib_prop fib_props[RTN_MAX + 1];

#endif /* _FIB_LOOKUP_H */



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_EXTEND_H
#define _NF_CONNTRACK_EXTEND_H

#include <linux/slab.h>

#include <net/netfilter/nf_conntrack.h>

/*
 * Identifiers of the conntrack extensions.
 *
 * Several entries only exist when the matching config option is enabled,
 * so the numeric value of the later identifiers depends on the kernel
 * configuration.  NF_CT_EXT_NUM comes last and is therefore the number of
 * identifiers; it sizes nf_ct_ext.offset[].
 */
enum nf_ct_ext_id {
        NF_CT_EXT_HELPER,
#if IS_ENABLED(CONFIG_NF_NAT)
        NF_CT_EXT_NAT,
#endif
        NF_CT_EXT_SEQADJ,
        NF_CT_EXT_ACCT,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        NF_CT_EXT_ECACHE,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        NF_CT_EXT_TSTAMP,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        NF_CT_EXT_TIMEOUT,
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
        NF_CT_EXT_LABELS,
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        NF_CT_EXT_SYNPROXY,
#endif
        NF_CT_EXT_NUM,
};

/*
 * Data type of each extension, named <id>_TYPE so that the
 * nf_ct_ext_find() macro can build the type name from the id by token
 * pasting (id##_TYPE).
 */
#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct
#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy

/* Extensions: optional stuff which isn't permanently in struct. */
struct nf_ct_ext {
        /*
         * Per-id byte offset of the extension's data, measured from the
         * start of this structure; 0 means the extension is not present
         * (see __nf_ct_ext_exist() and __nf_ct_ext_find()).
         */
        u8 offset[NF_CT_EXT_NUM];
        /* NOTE(review): presumably the number of bytes used in data[] - confirm */
        u8 len;
        /* storage for the extensions (flexible array member) */
        char data[];
};

/* True when @ext records a non-zero offset for extension @id. */
static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
{
        return ext->offset[id] != 0;
}

/*
 * True when @ct has an extension area at all and that area contains
 * extension @id.
 */
static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
{
        if (!ct->ext)
                return false;

        return __nf_ct_ext_exist(ct->ext, id);
}

/*
 * Return a pointer to the data of extension @id inside @ct's extension
 * area (area start plus the recorded offset), or NULL when @ct does not
 * carry that extension.
 */
static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
{
        if (nf_ct_ext_exist(ct, id))
                return (void *)ct->ext + ct->ext->offset[id];

        return NULL;
}
/*
 * Typed wrapper around __nf_ct_ext_find(): the result is cast to a
 * pointer to id##_TYPE, so @id must be spelled as one of the
 * NF_CT_EXT_* identifiers that has a matching NF_CT_EXT_*_TYPE macro.
 * The first argument is passed through unchanged, i.e. it is the
 * struct nf_conn pointer that __nf_ct_ext_find() expects.
 */
#define nf_ct_ext_find(ext, id)        \
        ((id##_TYPE *)__nf_ct_ext_find((ext), (id)))

/* Destroy all relationships */
void nf_ct_ext_destroy(struct nf_conn *ct);

/* Add this type, returns pointer to data or NULL. */
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);

/*
 * Description of one extension type, as passed to
 * nf_ct_extend_register() / nf_ct_extend_unregister().
 */
struct nf_ct_ext_type {
        /* Destroys relationships (can be NULL). */
        void (*destroy)(struct nf_conn *ct);

        /* Which extension this type describes. */
        enum nf_ct_ext_id id;

        /* Length and min alignment. */
        u8 len;
        u8 align;
};

int nf_ct_extend_register(const struct nf_ct_ext_type *type);
void nf_ct_extend_unregister(const struct nf_ct_ext_type *type);
#endif /* _NF_CONNTRACK_EXTEND_H */















































    1 




    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filemap

#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILEMAP_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
#include <linux/errseq.h>

/*
 * Event class shared by the page-cache add/delete tracepoints.
 *
 * Recorded per event: the page's pfn, the host inode number, the page
 * index and a device number.  The device number is the superblock's s_dev
 * when the host inode has an i_sb, otherwise the inode's i_rdev.
 *
 * The assignment dereferences page->mapping->host without a NULL check,
 * so the page is assumed to have a mapping with a host inode when the
 * event fires.
 *
 * Printed "ofs" is the page index shifted left by PAGE_SHIFT.
 */
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(unsigned long, pfn)
                __field(unsigned long, i_ino)
                __field(unsigned long, index)
                __field(dev_t, s_dev)
        ),

        TP_fast_assign(
                __entry->pfn = page_to_pfn(page);
                __entry->i_ino = page->mapping->host->i_ino;
                __entry->index = page->index;
                if (page->mapping->host->i_sb)
                        __entry->s_dev = page->mapping->host->i_sb->s_dev;
                else
                        __entry->s_dev = page->mapping->host->i_rdev;
        ),

        TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino,
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->index << PAGE_SHIFT)
);

/* Tracepoint mm_filemap_delete_from_page_cache, an instance of the mm_filemap_op_page_cache class. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
        TP_PROTO(struct page *page),
        TP_ARGS(page)
        );

/* Tracepoint mm_filemap_add_to_page_cache, an instance of the mm_filemap_op_page_cache class. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
        TP_PROTO(struct page *page),
        TP_ARGS(page)
        );

/*
 * Tracepoint filemap_set_wb_err.
 *
 * Records the host inode number of @mapping, the errseq_t value @eseq and
 * a device number (superblock s_dev when the host inode has an i_sb,
 * otherwise the inode's i_rdev).  mapping->host is dereferenced without a
 * NULL check.
 */
TRACE_EVENT(filemap_set_wb_err,
                TP_PROTO(struct address_space *mapping, errseq_t eseq),

                TP_ARGS(mapping, eseq),

                TP_STRUCT__entry(
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, errseq)
                ),

                TP_fast_assign(
                        __entry->i_ino = mapping->host->i_ino;
                        __entry->errseq = eseq;
                        if (mapping->host->i_sb)
                                __entry->s_dev = mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev = mapping->host->i_rdev;
                ),

                TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
                        MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                        __entry->i_ino, __entry->errseq)
);

/*
 * Tracepoint file_check_and_advance_wb_err.
 *
 * Records the file pointer, the host inode number of file->f_mapping, a
 * device number (superblock s_dev when the host inode has an i_sb,
 * otherwise the inode's i_rdev), the caller-supplied @old errseq_t value
 * and, as "new", the value of file->f_wb_err at the time the event fires.
 */
TRACE_EVENT(file_check_and_advance_wb_err,
                TP_PROTO(struct file *file, errseq_t old),

                TP_ARGS(file, old),

                TP_STRUCT__entry(
                        __field(struct file *, file)
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, old)
                        __field(errseq_t, new)
                ),

                TP_fast_assign(
                        __entry->file = file;
                        __entry->i_ino = file->f_mapping->host->i_ino;
                        if (file->f_mapping->host->i_sb)
                                __entry->s_dev =
                                        file->f_mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev =
                                        file->f_mapping->host->i_rdev;
                        __entry->old = old;
                        __entry->new = file->f_wb_err;
                ),

                TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
                        __entry->file, MAJOR(__entry->s_dev),
                        MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
                        __entry->new)
);
#endif /* _TRACE_FILEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.

   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

#ifndef __HCI_CORE_H
#define __HCI_CORE_H

#include <linux/idr.h>
#include <linux/leds.h>
#include <linux/rculist.h>

#include <net/bluetooth/hci.h>
#include <net/bluetooth/hci_sock.h>

/* HCI priority */
#define HCI_PRIO_MAX        7

/* HCI maximum id value */
#define HCI_MAX_ID 10000

/* HCI Core structures */
/*
 * Data kept for one device found during inquiry; embedded as the `data`
 * member of struct inquiry_entry.  clock_offset is stored little-endian.
 */
struct inquiry_data {
        bdaddr_t        bdaddr;
        __u8                pscan_rep_mode;
        __u8                pscan_period_mode;
        __u8                pscan_mode;
        __u8                dev_class[3];
        __le16                clock_offset;
        __s8                rssi;
        __u8                ssp_mode;
};

/*
 * One inquiry cache entry: list linkage, the name resolution state of
 * the device, a timestamp and the inquiry data itself.
 */
struct inquiry_entry {
        struct list_head        all;                /* inq_cache.all */
        struct list_head        list;                /* unknown or resolve */
        /* name resolution state of this entry */
        enum {
                NAME_NOT_KNOWN,
                NAME_NEEDED,
                NAME_PENDING,
                NAME_KNOWN,
        } name_state;
        __u32                        timestamp;
        struct inquiry_data        data;
};

/*
 * Device discovery state: the current discovery phase, the lists of
 * devices found, and related bookkeeping.
 *
 * NOTE(review): the last_adv_* members presumably cache the most recent
 * advertising report, and uuids/uuid_count a UUID filter - confirm
 * against the code that fills them in.
 */
struct discovery_state {
        int                        type;
        /* current phase of the discovery procedure */
        enum {
                DISCOVERY_STOPPED,
                DISCOVERY_STARTING,
                DISCOVERY_FINDING,
                DISCOVERY_RESOLVING,
                DISCOVERY_STOPPING,
        } state;
        struct list_head        all;        /* All devices found during inquiry */
        struct list_head        unknown;        /* Name state not known */
        struct list_head        resolve;        /* Name needs to be resolved */
        __u32                        timestamp;
        bdaddr_t                last_adv_addr;
        u8                        last_adv_addr_type;
        s8                        last_adv_rssi;
        u32                        last_adv_flags;
        /* buffer of HCI_MAX_AD_LENGTH bytes, with its length stored next to it */
        u8                        last_adv_data[HCI_MAX_AD_LENGTH];
        u8                        last_adv_data_len;
        bool                        report_invalid_rssi;
        bool                        result_filtering;
        bool                        limited;
        s8                        rssi;
        u16                        uuid_count;
        /* pointer to an array of 16-byte UUIDs */
        u8                        (*uuids)[16];
        unsigned long                scan_start;
        unsigned long                scan_duration;
};

/* 2000 ms converted to jiffies; the conversion runs at each use of the macro */
#define SUSPEND_NOTIFIER_TIMEOUT        msecs_to_jiffies(2000) /* 2 seconds */

/*
 * Bit indices for the hci_dev->suspend_tasks bitmap.
 * __SUSPEND_NUM_TASKS must stay last: it is the size of that bitmap.
 */
enum suspend_tasks {
        SUSPEND_PAUSE_DISCOVERY,
        SUSPEND_UNPAUSE_DISCOVERY,

        SUSPEND_PAUSE_ADVERTISING,
        SUSPEND_UNPAUSE_ADVERTISING,

        SUSPEND_SCAN_DISABLE,
        SUSPEND_SCAN_ENABLE,
        SUSPEND_DISCONNECTING,

        SUSPEND_POWERING_DOWN,

        SUSPEND_PREPARE_NOTIFIER,
        __SUSPEND_NUM_TASKS
};

/*
 * Type of hci_dev->suspend_state and hci_dev->suspend_state_next.
 * BT_RUNNING is explicitly 0.
 */
enum suspended_state {
        BT_RUNNING = 0,
        BT_SUSPEND_DISCONNECT,
        BT_SUSPEND_CONFIGURE_WAKE,
};

/*
 * List of hci_conn objects plus per-link-type counters. The counters are
 * updated by hci_conn_hash_add() and hci_conn_hash_del().
 */
struct hci_conn_hash {
        struct list_head list;
        unsigned int     acl_num;
        unsigned int     amp_num;
        unsigned int     sco_num;        /* counts both SCO_LINK and ESCO_LINK */
        unsigned int     le_num;
        unsigned int     le_num_slave;        /* LE links whose role is HCI_ROLE_SLAVE */
};

/* List node holding a Bluetooth address together with its address type. */
struct bdaddr_list {
        struct list_head list;
        bdaddr_t bdaddr;
        u8 bdaddr_type;
};

/*
 * Same leading members as struct bdaddr_list, followed by two 16-byte keys.
 * NOTE(review): presumably the peer and local Identity Resolving Keys --
 * confirm against the users of this structure.
 */
struct bdaddr_list_with_irk {
        struct list_head list;
        bdaddr_t bdaddr;
        u8 bdaddr_type;
        u8 peer_irk[16];
        u8 local_irk[16];
};

/*
 * Same leading members as struct bdaddr_list, followed by a 32-bit flag word.
 * NOTE(review): presumably a bitmask indexed by enum hci_conn_flags -- confirm.
 */
struct bdaddr_list_with_flags {
        struct list_head list;
        bdaddr_t bdaddr;
        u8 bdaddr_type;
        u32 current_flags;
};

/*
 * Flag bit numbers. HCI_CONN_FLAG_MAX must stay last; it is checked against
 * the flag-word width by a static_assert further down.
 */
enum hci_conn_flags {
        HCI_CONN_FLAG_REMOTE_WAKEUP,
        HCI_CONN_FLAG_MAX
};

/*
 * hci_conn_test_flag - test bit @nr in the flag word @flags
 * @nr:    bit number (0..31)
 * @flags: unsigned flag word
 *
 * Evaluates to a non-zero value when the bit is set, 0 otherwise. Both
 * arguments are parenthesized so that expression arguments (e.g. "a | b" or
 * a conditional) are evaluated as a unit before the shift/mask is applied.
 */
#define hci_conn_test_flag(nr, flags) ((flags) & (1U << (nr)))

/*
 * Make sure the number of flags fits in the 32 bits of the u32 current_flags
 * members (hci_conn_test_flag() shifts a 32-bit 1U by the flag number).
 */
static_assert(HCI_CONN_FLAG_MAX < 32);

/* List node holding one UUID in a 16-byte buffer plus two per-UUID bytes. */
struct bt_uuid {
        struct list_head list;
        u8 uuid[16];
        u8 size;
        u8 svc_hint;
};

/*
 * List node for one 16-byte key value tagged with a type byte. The embedded
 * rcu_head suggests entries are freed via RCU -- confirm at the free site.
 */
struct blocked_key {
        struct list_head list;
        struct rcu_head rcu;
        u8 type;
        u8 val[16];
};

/*
 * A 16-byte key value associated with an address/address-type pair.
 * Unlike the other key structures here it has no list or RCU linkage.
 */
struct smp_csrk {
        bdaddr_t bdaddr;
        u8 bdaddr_type;
        u8 type;
        u8 val[16];
};

/*
 * List node for one 16-byte key associated with an address/address-type
 * pair. ediv and rand are stored little-endian (__le16/__le64).
 * NOTE(review): presumably linked on hci_dev->long_term_keys -- confirm.
 */
struct smp_ltk {
        struct list_head list;
        struct rcu_head rcu;
        bdaddr_t bdaddr;
        u8 bdaddr_type;
        u8 authenticated;
        u8 type;
        u8 enc_size;
        __le16 ediv;
        __le64 rand;
        u8 val[16];
};

/*
 * List node for one 16-byte key together with two addresses (rpa and bdaddr)
 * and an address type.
 * NOTE(review): presumably linked on hci_dev->identity_resolving_keys -- confirm.
 */
struct smp_irk {
        struct list_head list;
        struct rcu_head rcu;
        bdaddr_t rpa;
        bdaddr_t bdaddr;
        u8 addr_type;
        u8 val[16];
};

/*
 * List node for one link key of HCI_LINK_KEY_SIZE bytes, with the address it
 * belongs to, a type byte and the PIN length.
 */
struct link_key {
        struct list_head list;
        struct rcu_head rcu;
        bdaddr_t bdaddr;
        u8 type;
        u8 val[HCI_LINK_KEY_SIZE];
        u8 pin_len;
};

/*
 * List node holding two hash/rand pairs (suffix 192 and 256, 16 bytes each)
 * for one address/address-type pair.
 * NOTE(review): "present" presumably records which pairs are valid -- confirm.
 */
struct oob_data {
        struct list_head list;
        bdaddr_t bdaddr;
        u8 bdaddr_type;
        u8 present;
        u8 hash192[16];
        u8 rand192[16];
        u8 hash256[16];
        u8 rand256[16];
};

/*
 * List node describing one advertising instance: its instance number, flags,
 * timing values, advertising and scan-response payloads (each up to
 * HCI_MAX_EXT_AD_LENGTH bytes with a separate length), TX power and a random
 * address with its own expiry flag and delayed work item.
 */
struct adv_info {
        struct list_head list;
        bool pending;
        __u8        instance;
        __u32        flags;
        __u16        timeout;
        __u16        remaining_time;
        __u16        duration;
        __u16        adv_data_len;
        __u8        adv_data[HCI_MAX_EXT_AD_LENGTH];
        __u16        scan_rsp_len;
        __u8        scan_rsp_data[HCI_MAX_EXT_AD_LENGTH];
        __s8        tx_power;
        bdaddr_t        random_addr;
        bool                 rpa_expired;
        struct delayed_work        rpa_expired_cb;
};

/*
 * Advertising instance limit and default duration.
 * NOTE(review): the unit of the duration is not visible here -- confirm.
 */
#define HCI_MAX_ADV_INSTANCES                5
#define HCI_DEFAULT_ADV_DURATION        2

/*
 * List node for one pattern: an AD type, an offset, a length and up to
 * HCI_MAX_AD_LENGTH bytes of value.
 * NOTE(review): presumably linked on adv_monitor->patterns -- confirm.
 */
struct adv_pattern {
        struct list_head list;
        __u8 ad_type;
        __u8 offset;
        __u8 length;
        __u8 value[HCI_MAX_AD_LENGTH];
};

/* One advertisement monitor: a list head for patterns, an active flag and a handle. */
struct adv_monitor {
        struct list_head patterns;
        bool                active;
        __u16                handle;
};

/* Advertisement monitor limits: lowest handle, handle count, patterns per monitor */
#define HCI_MIN_ADV_MONITOR_HANDLE                1
#define HCI_MAX_ADV_MONITOR_NUM_HANDLES        32
#define HCI_MAX_ADV_MONITOR_NUM_PATTERNS        16

/* Size of hci_dev->short_name[] */
#define HCI_MAX_SHORT_NAME_LENGTH        10

/* Min encryption key size to match with SMP */
#define HCI_MIN_ENC_KEY_SIZE                7

/* Default LE RPA expiry time, 15 minutes (value is in seconds) */
#define HCI_DEFAULT_RPA_TIMEOUT                (15 * 60)

/* Default min/max age of connection information (1s/3s), in milliseconds */
#define DEFAULT_CONN_INFO_MIN_AGE        1000
#define DEFAULT_CONN_INFO_MAX_AGE        3000
/*
 * Default authenticated payload timeout 30s
 * (0x0bb8 == 3000, i.e. presumably expressed in 10 ms units -- confirm)
 */
#define DEFAULT_AUTH_PAYLOAD_TIMEOUT   0x0bb8

/*
 * Buffer of up to HCI_MAX_AMP_ASSOC_SIZE bytes with four 16-bit
 * length/offset counters. Embedded in struct hci_dev as ->loc_assoc.
 */
struct amp_assoc {
        __u16        len;
        __u16        offset;
        __u16        rem_len;
        __u16        len_so_far;
        __u8        data[HCI_MAX_AMP_ASSOC_SIZE];
};

/* First dimension of the features[][8] arrays in struct hci_dev and struct hci_conn */
#define HCI_MAX_PAGES        3

/*
 * State for one HCI device. Roughly in declaration order the structure holds:
 * list linkage and a mutex, name/bus/address identity, cached feature and
 * parameter values, packet counters and MTUs, workqueues and work items,
 * packet queues, request state, discovery and suspend bookkeeping, the
 * connection hash, assorted list heads (keys, accept/reject lists, ...),
 * advertising state, and finally a set of function-pointer hooks that all
 * take the hci_dev as their first argument.
 */
struct hci_dev {
        struct list_head list;
        struct mutex        lock;

        const char        *name;
        unsigned long        flags;
        __u16                id;
        __u8                bus;
        __u8                dev_type;
        bdaddr_t        bdaddr;
        bdaddr_t        setup_addr;
        bdaddr_t        public_addr;
        bdaddr_t        random_addr;
        bdaddr_t        static_addr;
        __u8                adv_addr_type;
        __u8                dev_name[HCI_MAX_NAME_LENGTH];
        __u8                short_name[HCI_MAX_SHORT_NAME_LENGTH];
        __u8                eir[HCI_MAX_EIR_LENGTH];
        __u16                appearance;
        __u8                dev_class[3];
        __u8                major_class;
        __u8                minor_class;
        __u8                max_page;
        __u8                features[HCI_MAX_PAGES][8];
        __u8                le_features[8];
        __u8                le_accept_list_size;
        __u8                le_resolv_list_size;
        __u8                le_num_of_adv_sets;
        __u8                le_states[8];
        __u8                commands[64];
        __u8                hci_ver;
        __u16                hci_rev;
        __u8                lmp_ver;
        __u16                manufacturer;
        __u16                lmp_subver;
        __u16                voice_setting;
        __u8                num_iac;
        __u8                stored_max_keys;
        __u8                stored_num_keys;
        __u8                io_capability;
        __s8                inq_tx_power;
        __u8                err_data_reporting;
        __u16                page_scan_interval;
        __u16                page_scan_window;
        __u8                page_scan_type;
        __u8                le_adv_channel_map;
        __u16                le_adv_min_interval;
        __u16                le_adv_max_interval;
        __u8                le_scan_type;
        __u16                le_scan_interval;
        __u16                le_scan_window;
        __u16                le_scan_int_suspend;
        __u16                le_scan_window_suspend;
        __u16                le_scan_int_discovery;
        __u16                le_scan_window_discovery;
        __u16                le_scan_int_adv_monitor;
        __u16                le_scan_window_adv_monitor;
        __u16                le_scan_int_connect;
        __u16                le_scan_window_connect;
        __u16                le_conn_min_interval;
        __u16                le_conn_max_interval;
        __u16                le_conn_latency;
        __u16                le_supv_timeout;
        __u16                le_def_tx_len;
        __u16                le_def_tx_time;
        __u16                le_max_tx_len;
        __u16                le_max_tx_time;
        __u16                le_max_rx_len;
        __u16                le_max_rx_time;
        __u8                le_max_key_size;
        __u8                le_min_key_size;
        __u16                discov_interleaved_timeout;
        __u16                conn_info_min_age;
        __u16                conn_info_max_age;
        __u16                auth_payload_timeout;
        __u8                min_enc_key_size;
        __u8                max_enc_key_size;
        __u8                pairing_opts;
        __u8                ssp_debug_mode;
        __u8                hw_error_code;
        __u32                clock;
        __u16                advmon_allowlist_duration;
        __u16                advmon_no_filter_duration;

        __u16                devid_source;
        __u16                devid_vendor;
        __u16                devid_product;
        __u16                devid_version;

        __u8                def_page_scan_type;
        __u16                def_page_scan_int;
        __u16                def_page_scan_window;
        __u8                def_inq_scan_type;
        __u16                def_inq_scan_int;
        __u16                def_inq_scan_window;
        __u16                def_br_lsto;
        __u16                def_page_timeout;
        __u16                def_multi_adv_rotation_duration;
        __u16                def_le_autoconnect_timeout;

        __u16                pkt_type;
        __u16                esco_type;
        __u16                link_policy;
        __u16                link_mode;

        __u32                idle_timeout;
        __u16                sniff_min_interval;
        __u16                sniff_max_interval;

        __u8                amp_status;
        __u32                amp_total_bw;
        __u32                amp_max_bw;
        __u32                amp_min_latency;
        __u32                amp_max_pdu;
        __u8                amp_type;
        __u16                amp_pal_cap;
        __u16                amp_assoc_size;
        __u32                amp_max_flush_to;
        __u32                amp_be_flush_to;

        struct amp_assoc        loc_assoc;

        __u8                flow_ctl_mode;

        unsigned int        auto_accept_delay;

        unsigned long        quirks;

        atomic_t        cmd_cnt;
        unsigned int        acl_cnt;
        unsigned int        sco_cnt;
        unsigned int        le_cnt;

        unsigned int        acl_mtu;
        unsigned int        sco_mtu;
        unsigned int        le_mtu;
        unsigned int        acl_pkts;
        unsigned int        sco_pkts;
        unsigned int        le_pkts;

        __u16                block_len;
        __u16                block_mtu;
        __u16                num_blocks;
        __u16                block_cnt;

        unsigned long        acl_last_tx;
        unsigned long        sco_last_tx;
        unsigned long        le_last_tx;

        __u8                le_tx_def_phys;
        __u8                le_rx_def_phys;

        struct workqueue_struct        *workqueue;
        struct workqueue_struct        *req_workqueue;

        struct work_struct        power_on;
        struct delayed_work        power_off;
        struct work_struct        error_reset;

        __u16                        discov_timeout;
        struct delayed_work        discov_off;

        struct delayed_work        service_cache;

        struct delayed_work        cmd_timer;

        struct work_struct        rx_work;
        struct work_struct        cmd_work;
        struct work_struct        tx_work;

        struct work_struct        discov_update;
        struct work_struct        bg_scan_update;
        struct work_struct        scan_update;
        struct work_struct        connectable_update;
        struct work_struct        discoverable_update;
        struct delayed_work        le_scan_disable;
        struct delayed_work        le_scan_restart;

        struct sk_buff_head        rx_q;
        struct sk_buff_head        raw_q;
        struct sk_buff_head        cmd_q;

        struct sk_buff                *sent_cmd;

        struct mutex                req_lock;
        wait_queue_head_t        req_wait_q;
        __u32                        req_status;
        __u32                        req_result;
        struct sk_buff                *req_skb;

        void                        *smp_data;
        void                        *smp_bredr_data;

        /* Initialised by discovery_init() */
        struct discovery_state        discovery;

        int                        discovery_old_state;
        bool                        discovery_paused;
        int                        advertising_old_state;
        bool                        advertising_paused;

        struct notifier_block        suspend_notifier;
        struct work_struct        suspend_prepare;
        enum suspended_state        suspend_state_next;
        enum suspended_state        suspend_state;
        bool                        scanning_paused;
        bool                        suspended;
        u8                        wake_reason;
        bdaddr_t                wake_addr;
        u8                        wake_addr_type;

        wait_queue_head_t        suspend_wait_q;
        /* One bit per enum suspend_tasks value */
        DECLARE_BITMAP(suspend_tasks, __SUSPEND_NUM_TASKS);

        /* Maintained by hci_conn_hash_add()/hci_conn_hash_del() */
        struct hci_conn_hash        conn_hash;

        struct list_head        mgmt_pending;
        struct list_head        reject_list;
        struct list_head        accept_list;
        struct list_head        uuids;
        struct list_head        link_keys;
        struct list_head        long_term_keys;
        struct list_head        identity_resolving_keys;
        struct list_head        remote_oob_data;
        struct list_head        le_accept_list;
        struct list_head        le_resolv_list;
        struct list_head        le_conn_params;
        struct list_head        pend_le_conns;
        struct list_head        pend_le_reports;
        struct list_head        blocked_keys;

        struct hci_dev_stats        stat;

        atomic_t                promisc;

        const char                *hw_info;
        const char                *fw_info;
        struct dentry                *debugfs;

        struct device                dev;

        struct rfkill                *rfkill;

        /* Accessed through the hci_dev_*_flag() macros */
        DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS);

        __s8                        adv_tx_power;
        __u8                        adv_data[HCI_MAX_EXT_AD_LENGTH];
        __u8                        adv_data_len;
        __u8                        scan_rsp_data[HCI_MAX_EXT_AD_LENGTH];
        __u8                        scan_rsp_data_len;

        struct list_head        adv_instances;
        unsigned int                adv_instance_cnt;
        __u8                        cur_adv_instance;
        __u16                        adv_instance_timeout;
        struct delayed_work        adv_instance_expire;

        struct idr                adv_monitors_idr;
        unsigned int                adv_monitors_cnt;

        __u8                        irk[16];
        __u32                        rpa_timeout;
        struct delayed_work        rpa_expired;
        bdaddr_t                rpa;

        enum {
                INTERLEAVE_SCAN_NONE,
                INTERLEAVE_SCAN_NO_FILTER,
                INTERLEAVE_SCAN_ALLOWLIST
        } interleave_scan_state;

        struct delayed_work        interleave_scan;

        /* Only present when CONFIG_BT_LEDS is enabled */
#if IS_ENABLED(CONFIG_BT_LEDS)
        struct led_trigger        *power_led;
#endif

        /* Only present when CONFIG_BT_MSFTEXT is enabled */
#if IS_ENABLED(CONFIG_BT_MSFTEXT)
        __u16                        msft_opcode;
        void                        *msft_data;
#endif

        /*
         * Function-pointer hooks. NOTE(review): presumably filled in by the
         * transport driver; which ones are mandatory is not visible here.
         */
        int (*open)(struct hci_dev *hdev);
        int (*close)(struct hci_dev *hdev);
        int (*flush)(struct hci_dev *hdev);
        int (*setup)(struct hci_dev *hdev);
        int (*shutdown)(struct hci_dev *hdev);
        int (*send)(struct hci_dev *hdev, struct sk_buff *skb);
        void (*notify)(struct hci_dev *hdev, unsigned int evt);
        void (*hw_error)(struct hci_dev *hdev, u8 code);
        int (*post_init)(struct hci_dev *hdev);
        int (*set_diag)(struct hci_dev *hdev, bool enable);
        int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr);
        void (*cmd_timeout)(struct hci_dev *hdev);
        bool (*prevent_wake)(struct hci_dev *hdev);
};

/*
 * HCI_PHY_HANDLE - keep only the low 8 bits of @handle
 *
 * The argument is parenthesized so that an expression argument (for example
 * "a | b") is fully evaluated before the mask is applied; '&' binds tighter
 * than '|', so without the parentheses only part of the expression would be
 * masked.
 */
#define HCI_PHY_HANDLE(handle)        ((handle) & 0xff)

/*
 * Why a connection was requested. Stored in hci_conn->conn_reason and passed
 * to hci_connect_le_scan() and hci_connect_acl().
 */
enum conn_reasons {
        CONN_REASON_PAIR_DEVICE,
        CONN_REASON_L2CAP_CHAN,
        CONN_REASON_SCO_CONNECT,
};

/*
 * One HCI connection. Instances are linked into hci_dev->conn_hash through
 * ->list and looked up by handle, address, type or state by the
 * hci_conn_hash_lookup_*() helpers. Two independent counts exist: ->refcnt
 * (hci_conn_hold()/hci_conn_drop()) and the reference count of the embedded
 * struct device (hci_conn_get()/hci_conn_put()).
 */
struct hci_conn {
        struct list_head list;

        atomic_t        refcnt;

        bdaddr_t        dst;
        __u8                dst_type;
        bdaddr_t        src;
        __u8                src_type;
        bdaddr_t        init_addr;
        __u8                init_addr_type;
        bdaddr_t        resp_addr;
        __u8                resp_addr_type;
        __u16                handle;
        __u16                state;
        __u8                mode;
        __u8                type;
        __u8                role;
        bool                out;
        __u8                attempt;
        __u8                dev_class[3];
        __u8                features[HCI_MAX_PAGES][8];
        __u16                pkt_type;
        __u16                link_policy;
        __u8                key_type;
        __u8                auth_type;
        __u8                sec_level;
        __u8                pending_sec_level;
        __u8                pin_length;
        __u8                enc_key_size;
        __u8                io_capability;
        __u32                passkey_notify;
        __u8                passkey_entered;
        __u16                disc_timeout;
        __u16                conn_timeout;
        __u16                setting;
        __u16                auth_payload_timeout;
        __u16                le_conn_min_interval;
        __u16                le_conn_max_interval;
        __u16                le_conn_interval;
        __u16                le_conn_latency;
        __u16                le_supv_timeout;
        __u8                le_adv_data[HCI_MAX_AD_LENGTH];
        __u8                le_adv_data_len;
        __u8                le_tx_phy;
        __u8                le_rx_phy;
        __s8                rssi;
        __s8                tx_power;
        __s8                max_tx_power;
        /* Bitmask of HCI_CONN_* bits, accessed with test_bit() and friends */
        unsigned long        flags;

        enum conn_reasons conn_reason;

        __u32                clock;
        __u16                clock_accuracy;

        unsigned long        conn_info_timestamp;

        __u8                remote_cap;
        __u8                remote_auth;
        __u8                remote_id;

        unsigned int        sent;

        struct sk_buff_head data_q;
        struct list_head chan_list;

        /* disc_work is cancelled by hci_conn_hold() */
        struct delayed_work disc_work;
        struct delayed_work auto_accept_work;
        struct delayed_work idle_work;
        struct delayed_work le_conn_timeout;
        struct work_struct  le_scan_cleanup;

        struct device        dev;
        struct dentry        *debugfs;

        struct hci_dev        *hdev;
        void                *l2cap_data;
        void                *sco_data;
        struct amp_mgr        *amp_mgr;

        struct hci_conn        *link;

        void (*connect_cfm_cb)        (struct hci_conn *conn, u8 status);
        void (*security_cfm_cb)        (struct hci_conn *conn, u8 status);
        void (*disconn_cfm_cb)        (struct hci_conn *conn, u8 reason);
};

/*
 * A channel attached to an hci_conn: list linkage, a handle, a back pointer
 * to the connection, its own data queue and a sent counter.
 * NOTE(review): presumably linked on hci_conn->chan_list -- confirm.
 */
struct hci_chan {
        struct list_head list;
        __u16 handle;
        struct hci_conn *conn;
        struct sk_buff_head data_q;
        unsigned int        sent;
        __u8                state;
        bool                amp;
};

/*
 * Connection parameters kept per address/address-type pair, with two list
 * nodes, an auto-connect policy and an optional pointer to a connection.
 * NOTE(review): presumably "list" links into hci_dev->le_conn_params and
 * "action" into pend_le_conns/pend_le_reports -- confirm.
 */
struct hci_conn_params {
        struct list_head list;
        struct list_head action;

        bdaddr_t addr;
        u8 addr_type;

        u16 conn_min_interval;
        u16 conn_max_interval;
        u16 conn_latency;
        u16 supervision_timeout;

        enum {
                HCI_AUTO_CONN_DISABLED,
                HCI_AUTO_CONN_REPORT,
                HCI_AUTO_CONN_DIRECT,
                HCI_AUTO_CONN_ALWAYS,
                HCI_AUTO_CONN_LINK_LOSS,
                HCI_AUTO_CONN_EXPLICIT,
        } auto_connect;

        struct hci_conn *conn;
        bool explicit_connect;
        /* 32-bit flag word; see enum hci_conn_flags / hci_conn_test_flag() */
        u32 current_flags;
};

/*
 * Global lists defined elsewhere, each declared next to a lock of matching
 * name (an rwlock for hci_dev_list, a mutex for hci_cb_list).
 */
extern struct list_head hci_dev_list;
extern struct list_head hci_cb_list;
extern rwlock_t hci_dev_list_lock;
extern struct mutex hci_cb_list_lock;

/*
 * Wrappers that apply the corresponding bit operation to bit @nr of the
 * hdev->dev_flags bitmap.
 */
#define hci_dev_set_flag(hdev, nr)             set_bit((nr), (hdev)->dev_flags)
#define hci_dev_clear_flag(hdev, nr)           clear_bit((nr), (hdev)->dev_flags)
#define hci_dev_change_flag(hdev, nr)          change_bit((nr), (hdev)->dev_flags)
#define hci_dev_test_flag(hdev, nr)            test_bit((nr), (hdev)->dev_flags)
#define hci_dev_test_and_set_flag(hdev, nr)    test_and_set_bit((nr), (hdev)->dev_flags)
#define hci_dev_test_and_clear_flag(hdev, nr)  test_and_clear_bit((nr), (hdev)->dev_flags)
#define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags)

/*
 * Clear HCI_LE_SCAN, HCI_LE_ADV, HCI_LL_RPA_RESOLUTION and HCI_PERIODIC_INQ
 * in hdev->dev_flags. The argument is expanded once per flag, so it should
 * not have side effects.
 */
#define hci_dev_clear_volatile_flags(hdev)                        \
        do {                                                        \
                hci_dev_clear_flag(hdev, HCI_LE_SCAN);                \
                hci_dev_clear_flag(hdev, HCI_LE_ADV);                \
                hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\
                hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);        \
        } while (0)

/* ----- HCI interface to upper protocols ----- */
/* L2CAP entry points; they are only declared here and defined elsewhere. */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
int l2cap_disconn_ind(struct hci_conn *hcon);
void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);

/*
 * SCO entry points. With CONFIG_BT_BREDR enabled they are real functions
 * defined elsewhere; otherwise inline stubs are provided so callers need no
 * #ifdefs: sco_connect_ind() returns 0 and sco_recv_scodata() does nothing.
 */
#if IS_ENABLED(CONFIG_BT_BREDR)
int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb);
#else
static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                  __u8 *flags)
{
        return 0;
}

/* NOTE(review): the stub does not touch @skb (it is not freed here). */
static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
{
}
#endif

/* ----- Inquiry cache ----- */
/* Age limits in jiffies (HZ jiffies per second) */
#define INQUIRY_CACHE_AGE_MAX   (HZ*30)   /* 30 seconds */
#define INQUIRY_ENTRY_AGE_MAX   (HZ*60)   /* 60 seconds */

static inline void discovery_init(struct hci_dev *hdev)
{
        hdev->discovery.state = DISCOVERY_STOPPED;
        INIT_LIST_HEAD(&hdev->discovery.all);
        INIT_LIST_HEAD(&hdev->discovery.unknown);
        INIT_LIST_HEAD(&hdev->discovery.resolve);
        hdev->discovery.report_invalid_rssi = true;
        hdev->discovery.rssi = HCI_RSSI_INVALID;
}

static inline void hci_discovery_filter_clear(struct hci_dev *hdev)
{
        hdev->discovery.result_filtering = false;
        hdev->discovery.report_invalid_rssi = true;
        hdev->discovery.rssi = HCI_RSSI_INVALID;
        hdev->discovery.uuid_count = 0;
        kfree(hdev->discovery.uuids);
        hdev->discovery.uuids = NULL;
        hdev->discovery.scan_start = 0;
        hdev->discovery.scan_duration = 0;
}

/* Discovery state query and setter; both are defined elsewhere. */
bool hci_discovery_active(struct hci_dev *hdev);

void hci_discovery_set_state(struct hci_dev *hdev, int state);

static inline int inquiry_cache_empty(struct hci_dev *hdev)
{
        return list_empty(&hdev->discovery.all);
}

/*
 * inquiry_cache_age - age of the inquiry cache
 * @hdev: device whose discovery timestamp is used
 *
 * Returns jiffies minus hdev->discovery.timestamp.
 */
static inline long inquiry_cache_age(struct hci_dev *hdev)
{
        return jiffies - hdev->discovery.timestamp;
}

/*
 * inquiry_entry_age - age of a single inquiry cache entry
 * @e: entry whose timestamp is used
 *
 * Returns jiffies minus e->timestamp.
 */
static inline long inquiry_entry_age(struct inquiry_entry *e)
{
        long age = jiffies - e->timestamp;

        return age;
}

/*
 * Inquiry cache operations; all are defined elsewhere. The three lookup
 * variants take the device and an address and return an entry pointer.
 */
struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev,
                                               bdaddr_t *bdaddr);
struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev,
                                                       bdaddr_t *bdaddr);
struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev,
                                                       bdaddr_t *bdaddr,
                                                       int state);
void hci_inquiry_cache_update_resolve(struct hci_dev *hdev,
                                      struct inquiry_entry *ie);
u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data,
                             bool name_known);
void hci_inquiry_cache_flush(struct hci_dev *hdev);

/* ----- HCI Connections ----- */
/*
 * Bit numbers for hci_conn->flags; they are used with test_bit() on that
 * unsigned long (see hci_conn_ssp_enabled() and hci_lookup_le_connect()).
 */
enum {
        HCI_CONN_AUTH_PEND,
        HCI_CONN_ENCRYPT_PEND,
        HCI_CONN_RSWITCH_PEND,
        HCI_CONN_MODE_CHANGE_PEND,
        HCI_CONN_SCO_SETUP_PEND,
        HCI_CONN_MGMT_CONNECTED,
        HCI_CONN_SSP_ENABLED,
        HCI_CONN_SC_ENABLED,
        HCI_CONN_AES_CCM,
        HCI_CONN_POWER_SAVE,
        HCI_CONN_FLUSH_KEY,
        HCI_CONN_ENCRYPT,
        HCI_CONN_AUTH,
        HCI_CONN_SECURE,
        HCI_CONN_FIPS,
        HCI_CONN_STK_ENCRYPT,
        HCI_CONN_AUTH_INITIATOR,
        HCI_CONN_DROP,
        HCI_CONN_PARAM_REMOVAL_PEND,
        HCI_CONN_NEW_LINK_KEY,
        HCI_CONN_SCANNING,
        HCI_CONN_AUTH_FAILURE,
};

static inline bool hci_conn_ssp_enabled(struct hci_conn *conn)
{
        struct hci_dev *hdev = conn->hdev;
        return hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
               test_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
}

static inline bool hci_conn_sc_enabled(struct hci_conn *conn)
{
        struct hci_dev *hdev = conn->hdev;
        return hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
               test_bit(HCI_CONN_SC_ENABLED, &conn->flags);
}

/*
 * hci_conn_hash_add - link a connection into the device's connection hash
 * @hdev: device owning the hash
 * @c:    connection to add
 *
 * Adds @c to the list with list_add_rcu() and bumps the counter matching its
 * link type. SCO and eSCO share one counter; LE links in the slave role are
 * additionally counted in le_num_slave. Other types only get linked.
 */
static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;

        list_add_rcu(&c->list, &hash->list);

        if (c->type == ACL_LINK) {
                hash->acl_num++;
        } else if (c->type == AMP_LINK) {
                hash->amp_num++;
        } else if (c->type == LE_LINK) {
                hash->le_num++;
                if (c->role == HCI_ROLE_SLAVE)
                        hash->le_num_slave++;
        } else if (c->type == SCO_LINK || c->type == ESCO_LINK) {
                hash->sco_num++;
        }
}

/*
 * hci_conn_hash_del - unlink a connection from the device's connection hash
 * @hdev: device owning the hash
 * @c:    connection to remove
 *
 * Removes @c with list_del_rcu(), waits in synchronize_rcu() and only then
 * decrements the counter matching its link type (mirror image of
 * hci_conn_hash_add()). Because of synchronize_rcu() this may block.
 */
static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;

        list_del_rcu(&c->list);
        synchronize_rcu();

        if (c->type == ACL_LINK) {
                hash->acl_num--;
        } else if (c->type == AMP_LINK) {
                hash->amp_num--;
        } else if (c->type == LE_LINK) {
                hash->le_num--;
                if (c->role == HCI_ROLE_SLAVE)
                        hash->le_num_slave--;
        } else if (c->type == SCO_LINK || c->type == ESCO_LINK) {
                hash->sco_num--;
        }
}

/*
 * hci_conn_num - number of connections of a given link type
 * @hdev: device owning the connection hash
 * @type: link type (ACL_LINK, AMP_LINK, LE_LINK, SCO_LINK or ESCO_LINK)
 *
 * Returns the matching counter from hdev->conn_hash; SCO and eSCO share one
 * counter. Any other type yields 0.
 */
static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        unsigned int num = 0;

        switch (type) {
        case ACL_LINK:
                num = hash->acl_num;
                break;
        case AMP_LINK:
                num = hash->amp_num;
                break;
        case LE_LINK:
                num = hash->le_num;
                break;
        case SCO_LINK:
        case ESCO_LINK:
                num = hash->sco_num;
                break;
        default:
                break;
        }

        return num;
}

/*
 * hci_conn_count - total number of connections on a device
 * @hdev: device owning the connection hash
 *
 * Returns the sum of the ACL, AMP, SCO and LE counters.
 */
static inline unsigned int hci_conn_count(struct hci_dev *hdev)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        unsigned int total = hash->acl_num;

        total += hash->amp_num;
        total += hash->sco_num;
        total += hash->le_num;

        return total;
}

/*
 * hci_conn_lookup_type - link type of the connection with a given handle
 * @hdev:   device owning the connection hash
 * @handle: connection handle to search for
 *
 * Walks the connection list under rcu_read_lock() and returns the type of
 * the first entry whose handle matches, or INVALID_LINK when none does.
 */
static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        __u8 found = INVALID_LINK;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->handle != handle)
                        continue;

                found = conn->type;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * hci_conn_hash_lookup_handle - find a connection by handle
 * @hdev:   device owning the connection hash
 * @handle: connection handle to search for
 *
 * Returns the first connection whose handle matches, or NULL. The list walk
 * runs under rcu_read_lock(), but the lock is released before returning and
 * no reference is taken on the result.
 * NOTE(review): callers presumably keep the entry alive by other means
 * (e.g. a device lock) -- confirm.
 */
static inline struct hci_conn *hci_conn_hash_lookup_handle(struct hci_dev *hdev,
                                                                __u16 handle)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        struct hci_conn *found = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->handle != handle)
                        continue;

                found = conn;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * hci_conn_hash_lookup_ba - find a connection by link type and address
 * @hdev: device owning the connection hash
 * @type: link type the connection must have
 * @ba:   destination address, compared with bacmp() against conn->dst
 *
 * Returns the first matching connection or NULL. The list walk runs under
 * rcu_read_lock(); the lock is released before returning and no reference
 * is taken on the result.
 */
static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev,
                                                        __u8 type, bdaddr_t *ba)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        struct hci_conn *found = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->type != type || bacmp(&conn->dst, ba))
                        continue;

                found = conn;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * hci_conn_hash_lookup_le - find an LE connection by address and type
 * @hdev:    device owning the connection hash
 * @ba:      destination address, compared with bacmp() against conn->dst
 * @ba_type: address type that must equal conn->dst_type
 *
 * Only LE_LINK entries are considered. Returns the first match or NULL. The
 * list walk runs under rcu_read_lock(); the lock is released before
 * returning and no reference is taken on the result.
 */
static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
                                                       bdaddr_t *ba,
                                                       __u8 ba_type)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        struct hci_conn *found = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->type != LE_LINK)
                        continue;

                if (ba_type != conn->dst_type || bacmp(&conn->dst, ba))
                        continue;

                found = conn;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * hci_conn_hash_lookup_state - find a connection by link type and state
 * @hdev:  device owning the connection hash
 * @type:  link type the connection must have
 * @state: value that conn->state must equal
 *
 * Returns the first matching connection or NULL. The list walk runs under
 * rcu_read_lock(); the lock is released before returning and no reference
 * is taken on the result.
 */
static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
                                                        __u8 type, __u16 state)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        struct hci_conn *found = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->type != type || conn->state != state)
                        continue;

                found = conn;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * hci_lookup_le_connect - find an LE connection in the BT_CONNECT state
 * @hdev: device owning the connection hash
 *
 * Returns the first LE_LINK entry whose state is BT_CONNECT and whose
 * HCI_CONN_SCANNING flag is clear, or NULL when there is none. The list walk
 * runs under rcu_read_lock(); the lock is released before returning and no
 * reference is taken on the result.
 */
static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
{
        struct hci_conn_hash *hash = &hdev->conn_hash;
        struct hci_conn *conn;
        struct hci_conn *found = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(conn, &hash->list, list) {
                if (conn->type != LE_LINK || conn->state != BT_CONNECT)
                        continue;

                /* Entries still flagged as scanning do not count. */
                if (test_bit(HCI_CONN_SCANNING, &conn->flags))
                        continue;

                found = conn;
                break;
        }

        rcu_read_unlock();

        return found;
}

/* Connection teardown and synchronous (SCO) setup; defined elsewhere. */
int hci_disconnect(struct hci_conn *conn, __u8 reason);
bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
void hci_sco_setup(struct hci_conn *conn, __u8 status);

/* Creation/destruction of hci_conn objects and bulk operations on the hash. */
struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
                              u8 role);
int hci_conn_del(struct hci_conn *conn);
void hci_conn_hash_flush(struct hci_dev *hdev);
void hci_conn_check_pending(struct hci_dev *hdev);

/* hci_chan objects attached to a connection. */
struct hci_chan *hci_chan_create(struct hci_conn *conn);
void hci_chan_del(struct hci_chan *chan);
void hci_chan_list_flush(struct hci_conn *conn);
struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle);

/* Outgoing connection requests; each returns a struct hci_conn pointer. */
struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
                                     u8 dst_type, u8 sec_level,
                                     u16 conn_timeout,
                                     enum conn_reasons conn_reason);
struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
                                u8 dst_type, u8 sec_level, u16 conn_timeout,
                                u8 role, bdaddr_t *direct_rpa);
struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
                                 u8 sec_level, u8 auth_type,
                                 enum conn_reasons conn_reason);
struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
                                 __u16 setting);
/* Security checks and role switching on an existing connection. */
int hci_conn_check_link_mode(struct hci_conn *conn);
int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
                      bool initiator);
int hci_conn_switch_role(struct hci_conn *conn, __u8 role);

void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active);

void hci_le_conn_failed(struct hci_conn *conn, u8 status);

/*
 * hci_conn_get() and hci_conn_put() are used to control the life-time of an
 * "hci_conn" object. They do not guarantee that the hci_conn object is running,
 * working or anything else. They just guarantee that the object is available
 * and can be dereferenced. So you can use its locks, local variables and any
 * other constant data.
 * Before accessing runtime data, you _must_ lock the object and then check that
 * it is still running. As soon as you release the locks, the connection might
 * get dropped, though.
 *
 * On the other hand, hci_conn_hold() and hci_conn_drop() are used to control
 * how long the underlying connection is held. So every channel that runs on the
 * hci_conn object calls this to prevent the connection from disappearing. As
 * long as you hold a device, you must also guarantee that you have a valid
 * reference to the device via hci_conn_get() (or the initial reference from
 * hci_conn_add()).
 * The hold()/drop() ref-count is known to drop below 0 sometimes, which doesn't
 * break because nobody cares for that. But this means, we cannot use
 * _get()/_drop() in it, but require the caller to have a valid ref (FIXME).
 */

/*
 * Take a reference on the struct device embedded in @conn (conn->dev) and
 * hand @conn back so the call can be used inline in an expression.
 */
static inline struct hci_conn *hci_conn_get(struct hci_conn *conn)
{
        get_device(&conn->dev);
        return conn;
}

/* Drop a reference on the struct device embedded in @conn (conn->dev). */
static inline void hci_conn_put(struct hci_conn *conn)
{
        put_device(&conn->dev);
}

/*
 * Bump conn->refcnt and cancel a pending disc_work, i.e. the delayed work
 * that hci_conn_drop() queues once the count reaches zero.
 */
static inline void hci_conn_hold(struct hci_conn *conn)
{
        /* Logs the count as it was *before* the increment below. */
        BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt));

        atomic_inc(&conn->refcnt);
        cancel_delayed_work(&conn->disc_work);
}

/*
 * Decrement conn->refcnt.  When the count hits zero, (re)queue disc_work on
 * the owning device's workqueue with a delay that depends on the link type:
 *
 *   ACL_LINK / LE_LINK: idle_work is cancelled first; the delay is
 *                       disc_timeout while in BT_CONNECTED (doubled when
 *                       conn->out is not set), otherwise 0.
 *   AMP_LINK:           disc_timeout.
 *   anything else:      0.
 */
static inline void hci_conn_drop(struct hci_conn *conn)
{
        unsigned long delay = 0;

        BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt));

        /* Only the drop that takes the count to zero arms the work. */
        if (!atomic_dec_and_test(&conn->refcnt))
                return;

        if (conn->type == ACL_LINK || conn->type == LE_LINK) {
                cancel_delayed_work(&conn->idle_work);

                if (conn->state == BT_CONNECTED) {
                        delay = conn->disc_timeout;
                        if (!conn->out)
                                delay *= 2;
                }
        } else if (conn->type == AMP_LINK) {
                delay = conn->disc_timeout;
        }

        /* Replace any previously queued instance with the new delay. */
        cancel_delayed_work(&conn->disc_work);
        queue_delayed_work(conn->hdev->workqueue, &conn->disc_work, delay);
}

/* ----- HCI Devices ----- */
/*
 * Drop a reference on the struct device embedded in @d (d->dev).  The debug
 * line prints the kobject refcount as read before the put.
 */
static inline void hci_dev_put(struct hci_dev *d)
{
        BT_DBG("%s orig refcnt %d", d->name,
               kref_read(&d->dev.kobj.kref));

        put_device(&d->dev);
}

/*
 * Take a reference on the struct device embedded in @d (d->dev) and return
 * @d.  The debug line prints the kobject refcount as read before the get.
 */
static inline struct hci_dev *hci_dev_hold(struct hci_dev *d)
{
        BT_DBG("%s orig refcnt %d", d->name,
               kref_read(&d->dev.kobj.kref));

        get_device(&d->dev);
        return d;
}

/*
 * Lock / unlock the mutex stored in the 'lock' member of the object @d
 * points to.  @d is parenthesised so that any pointer-valued expression
 * (e.g. "base + 1" or "cond ? a : b") binds as a whole before "->lock".
 */
#define hci_dev_lock(d)                mutex_lock(&(d)->lock)
#define hci_dev_unlock(d)        mutex_unlock(&(d)->lock)

/*
 * Map a pointer to the embedded 'dev' member back to its containing
 * struct hci_dev / struct hci_conn.
 */
#define to_hci_dev(d) container_of(d, struct hci_dev, dev)
#define to_hci_conn(c) container_of(c, struct hci_conn, dev)

/* Return the driver-private pointer stored on hdev->dev (dev_get_drvdata). */
static inline void *hci_get_drvdata(struct hci_dev *hdev)
{
        return dev_get_drvdata(&hdev->dev);
}

/* Store @data as the driver-private pointer on hdev->dev (dev_set_drvdata). */
static inline void hci_set_drvdata(struct hci_dev *hdev, void *data)
{
        dev_set_drvdata(&hdev->dev, data);
}

/* Device lookup.  Declarations only. */
struct hci_dev *hci_dev_get(int index);
struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, u8 src_type);

/* Device allocation, registration and power-state transitions. */
struct hci_dev *hci_alloc_dev(void);
void hci_free_dev(struct hci_dev *hdev);
int hci_register_dev(struct hci_dev *hdev);
void hci_unregister_dev(struct hci_dev *hdev);
void hci_cleanup_dev(struct hci_dev *hdev);
int hci_suspend_dev(struct hci_dev *hdev);
int hci_resume_dev(struct hci_dev *hdev);
int hci_reset_dev(struct hci_dev *hdev);
/* Inbound data path: both take an sk_buff for @hdev. */
int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb);
int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb);
/* printf-style setters; __printf(2, 3) enables format checking on @fmt. */
__printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...);
__printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...);

/*
 * Record @opcode in hdev->msft_opcode.  The assignment is compiled in only
 * when CONFIG_BT_MSFTEXT is enabled; otherwise the function is an empty
 * stub.
 */
static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode)
{
#if IS_ENABLED(CONFIG_BT_MSFTEXT)
        hdev->msft_opcode = opcode;
#endif
}

/*
 * Device control entry points addressed by numeric device id (@dev) or
 * taking a __user pointer argument.  Declarations only.
 * NOTE(review): these look like the back-ends of the HCI socket ioctls --
 * confirm against the callers.
 */
int hci_dev_open(__u16 dev);
int hci_dev_close(__u16 dev);
int hci_dev_do_close(struct hci_dev *hdev);
int hci_dev_reset(__u16 dev);
int hci_dev_reset_stat(__u16 dev);
int hci_dev_cmd(unsigned int cmd, void __user *arg);
int hci_get_dev_list(void __user *arg);
int hci_get_dev_info(void __user *arg);
int hci_get_conn_list(void __user *arg);
int hci_get_conn_info(struct hci_dev *hdev, void __user *arg);
int hci_get_auth_info(struct hci_dev *hdev, void __user *arg);
int hci_inquiry(void __user *arg);

/*
 * Address-list helpers.  Each operates on a caller-supplied list_head and
 * identifies entries by (bdaddr, type).  Three entry flavours are
 * declared: plain, "_with_irk" and "_with_flags".  Declarations only.
 */
struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *list,
                                           bdaddr_t *bdaddr, u8 type);
struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
                                    struct list_head *list, bdaddr_t *bdaddr,
                                    u8 type);
struct bdaddr_list_with_flags *
hci_bdaddr_list_lookup_with_flags(struct list_head *list, bdaddr_t *bdaddr,
                                  u8 type);
int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type);
int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
                                 u8 type, u8 *peer_irk, u8 *local_irk);
int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr,
                                   u8 type, u32 flags);
int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type);
int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
                                 u8 type);
int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr,
                                   u8 type);
void hci_bdaddr_list_clear(struct list_head *list);

/* Per-device connection parameters, keyed by (addr, addr_type). */
struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
                                               bdaddr_t *addr, u8 addr_type);
struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
                                            bdaddr_t *addr, u8 addr_type);
void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type);
void hci_conn_params_clear_disabled(struct hci_dev *hdev);

/* Same key, but searched in a caller-supplied list. */
struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
                                                  bdaddr_t *addr,
                                                  u8 addr_type);

/* Key and pairing-data storage attached to a struct hci_dev.  Declarations only. */
void hci_uuids_clear(struct hci_dev *hdev);

/* Link keys and LTKs. */
void hci_link_keys_clear(struct hci_dev *hdev);
struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr);
struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
                                  bdaddr_t *bdaddr, u8 *val, u8 type,
                                  u8 pin_len, bool *persistent);
struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
                            u8 addr_type, u8 type, u8 authenticated,
                            u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand);
struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
                             u8 addr_type, u8 role);
int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type);
void hci_smp_ltks_clear(struct hci_dev *hdev);
int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr);

/* IRKs (looked up either by RPA or by address) and blocked keys. */
struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa);
struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                     u8 addr_type);
struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr,
                            u8 addr_type, u8 val[16], bdaddr_t *rpa);
void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type);
bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]);
void hci_blocked_keys_clear(struct hci_dev *hdev);
void hci_smp_irks_clear(struct hci_dev *hdev);

bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type);

/* Remote OOB data: 192-bit and 256-bit hash/rand pairs per address. */
void hci_remote_oob_data_clear(struct hci_dev *hdev);
struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
                                          bdaddr_t *bdaddr, u8 bdaddr_type);
int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
                            u8 bdaddr_type, u8 *hash192, u8 *rand192,
                            u8 *hash256, u8 *rand256);
int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
                               u8 bdaddr_type);

/* Advertising instances, identified by a u8 instance number.  Declarations only. */
void hci_adv_instances_clear(struct hci_dev *hdev);
struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance);
struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance);
int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
                         u16 adv_data_len, u8 *adv_data,
                         u16 scan_rsp_len, u8 *scan_rsp_data,
                         u16 timeout, u16 duration);
int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance);
void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired);

/* Advertisement monitors, identified by a u16 handle. */
void hci_adv_monitors_clear(struct hci_dev *hdev);
void hci_free_adv_monitor(struct adv_monitor *monitor);
int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle);
bool hci_is_adv_monitoring(struct hci_dev *hdev);

/* Event entry point: takes the sk_buff holding the event for @hdev. */
void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb);

/* sysfs hooks for devices and connections. */
void hci_init_sysfs(struct hci_dev *hdev);
void hci_conn_init_sysfs(struct hci_conn *conn);
void hci_conn_add_sysfs(struct hci_conn *conn);
void hci_conn_del_sysfs(struct hci_conn *conn);

/* Set @pdev as the parent of the struct device embedded in @hdev. */
#define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->dev.parent = (pdev))

/* ----- LMP capabilities ----- */
/*
 * Every macro below tests one mask against a byte of the two-dimensional
 * (dev)->features table; this first group reads row 0.
 *
 * Most of them expand to the raw masked value, i.e. "non-zero" means
 * capable and the result is not normalised to 0/1.  The exceptions are
 * lmp_bredr_capable() (logical negation of the LMP_NO_BREDR bit) and the
 * macros written with "!!", which do yield exactly 0 or 1.
 */
#define lmp_encrypt_capable(dev)   ((dev)->features[0][0] & LMP_ENCRYPT)
#define lmp_rswitch_capable(dev)   ((dev)->features[0][0] & LMP_RSWITCH)
#define lmp_hold_capable(dev)      ((dev)->features[0][0] & LMP_HOLD)
#define lmp_sniff_capable(dev)     ((dev)->features[0][0] & LMP_SNIFF)
#define lmp_park_capable(dev)      ((dev)->features[0][1] & LMP_PARK)
#define lmp_inq_rssi_capable(dev)  ((dev)->features[0][3] & LMP_RSSI_INQ)
#define lmp_esco_capable(dev)      ((dev)->features[0][3] & LMP_ESCO)
#define lmp_bredr_capable(dev)     (!((dev)->features[0][4] & LMP_NO_BREDR))
#define lmp_le_capable(dev)        ((dev)->features[0][4] & LMP_LE)
#define lmp_sniffsubr_capable(dev) ((dev)->features[0][5] & LMP_SNIFF_SUBR)
#define lmp_pause_enc_capable(dev) ((dev)->features[0][5] & LMP_PAUSE_ENC)
#define lmp_ext_inq_capable(dev)   ((dev)->features[0][6] & LMP_EXT_INQ)
#define lmp_le_br_capable(dev)     (!!((dev)->features[0][6] & LMP_SIMUL_LE_BR))
#define lmp_ssp_capable(dev)       ((dev)->features[0][6] & LMP_SIMPLE_PAIR)
#define lmp_no_flush_capable(dev)  ((dev)->features[0][6] & LMP_NO_FLUSH)
#define lmp_lsto_capable(dev)      ((dev)->features[0][7] & LMP_LSTO)
#define lmp_inq_tx_pwr_capable(dev) ((dev)->features[0][7] & LMP_INQ_TX_PWR)
#define lmp_ext_feat_capable(dev)  ((dev)->features[0][7] & LMP_EXTFEATURES)
#define lmp_transp_capable(dev)    ((dev)->features[0][2] & LMP_TRANSPARENT)
#define lmp_edr_2m_capable(dev)    ((dev)->features[0][3] & LMP_EDR_2M)
#define lmp_edr_3m_capable(dev)    ((dev)->features[0][3] & LMP_EDR_3M)
#define lmp_edr_3slot_capable(dev) ((dev)->features[0][4] & LMP_EDR_3SLOT)
#define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT)

/* ----- Extended LMP capabilities ----- */
/* These read row 2 of (dev)->features. */
#define lmp_csb_master_capable(dev) ((dev)->features[2][0] & LMP_CSB_MASTER)
#define lmp_csb_slave_capable(dev)  ((dev)->features[2][0] & LMP_CSB_SLAVE)
#define lmp_sync_train_capable(dev) ((dev)->features[2][0] & LMP_SYNC_TRAIN)
#define lmp_sync_scan_capable(dev)  ((dev)->features[2][0] & LMP_SYNC_SCAN)
#define lmp_sc_capable(dev)         ((dev)->features[2][1] & LMP_SC)
#define lmp_ping_capable(dev)       ((dev)->features[2][1] & LMP_PING)

/* ----- Host capabilities ----- */
/* These read row 1, byte 0 of (dev)->features. */
#define lmp_host_ssp_capable(dev)  ((dev)->features[1][0] & LMP_HOST_SSP)
#define lmp_host_sc_capable(dev)   ((dev)->features[1][0] & LMP_HOST_SC)
#define lmp_host_le_capable(dev)   (!!((dev)->features[1][0] & LMP_HOST_LE))
#define lmp_host_le_br_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE_BREDR))

/* Powered: HCI_UP is set in dev->flags and the HCI_AUTO_OFF flag is clear. */
#define hdev_is_powered(dev)   (test_bit(HCI_UP, &(dev)->flags) && \
                                !hci_dev_test_flag(dev, HCI_AUTO_OFF))
/* BR/EDR SC: controller reports LMP_SC and HCI_SC_ENABLED is set. */
#define bredr_sc_enabled(dev)  (lmp_sc_capable(dev) && \
                                hci_dev_test_flag(dev, HCI_SC_ENABLED))

/*
 * scan_1m / scan_2m / scan_coded: true when the corresponding PHY bit is
 * set in either the default TX or the default RX PHY mask.
 */
#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
                      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))

#define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \
                      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M))

#define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \
                         ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))

/* Use LL Privacy based address resolution if supported */
#define use_ll_privacy(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY)

/* Use ext scanning if set ext scan param and ext scan enable is supported */
/* Both bits 0x20 and 0x40 of commands[37] must be set. */
#define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \
                           ((dev)->commands[37] & 0x40))
/* Use ext create connection if command is supported */
/* Tested via bit 0x80 of commands[37]. */
#define use_ext_conn(dev) ((dev)->commands[37] & 0x80)

/* Extended advertising support */
#define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV))

/* ----- HCI protocols ----- */
/*
 * NOTE(review): presumably a bit reported through the *flags argument of
 * hci_proto_connect_ind() -- confirm against sco_connect_ind().
 */
#define HCI_PROTO_DEFER             0x01

/*
 * Route an incoming-connection indication to the protocol that owns the
 * link type:
 *
 *   ACL_LINK             -> l2cap_connect_ind(hdev, bdaddr)
 *   SCO_LINK / ESCO_LINK -> sco_connect_ind(hdev, bdaddr, flags)
 *
 * Returns whatever the selected handler returns.  Any other @type is
 * logged and rejected with -EINVAL.
 */
static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                        __u8 type, __u8 *flags)
{
        if (type == ACL_LINK)
                return l2cap_connect_ind(hdev, bdaddr);

        if (type == SCO_LINK || type == ESCO_LINK)
                return sco_connect_ind(hdev, bdaddr, flags);

        BT_ERR("unknown link type %d", type);
        return -EINVAL;
}

/*
 * Return the value of l2cap_disconn_ind() for ACL and LE links; every
 * other link type yields HCI_ERROR_REMOTE_USER_TERM without consulting a
 * protocol.
 */
static inline int hci_proto_disconn_ind(struct hci_conn *conn)
{
        if (conn->type == ACL_LINK || conn->type == LE_LINK)
                return l2cap_disconn_ind(conn);

        return HCI_ERROR_REMOTE_USER_TERM;
}

/* ----- HCI callbacks ----- */
/*
 * One set of connection-event callbacks, linked into hci_cb_list through
 * @list.  Every callback pointer may be NULL; the hci_*_cfm() helpers in
 * this header skip NULL entries.
 */
struct hci_cb {
        struct list_head list;

        char *name;

        /* Invoked from hci_connect_cfm(). */
        void (*connect_cfm)        (struct hci_conn *conn, __u8 status);
        /* Invoked from hci_disconn_cfm(); @status receives the reason. */
        void (*disconn_cfm)        (struct hci_conn *conn, __u8 status);
        /* Invoked from hci_auth_cfm() and hci_encrypt_cfm(). */
        void (*security_cfm)        (struct hci_conn *conn, __u8 status,
                                                                __u8 encrypt);
        /* Invoked from hci_key_change_cfm(). */
        void (*key_change_cfm)        (struct hci_conn *conn, __u8 status);
        /* Invoked from hci_role_switch_cfm(). */
        void (*role_switch_cfm)        (struct hci_conn *conn, __u8 status, __u8 role);
};

/*
 * Report a connect result: call connect_cfm on every hci_cb_list entry
 * that provides one (with hci_cb_list_lock held), then call the
 * connection's own connect_cfm_cb, if set, after the lock is released.
 */
static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status)
{
        struct hci_cb *cb;

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(cb, &hci_cb_list, list) {
                if (cb->connect_cfm)
                        cb->connect_cfm(conn, status);
        }
        mutex_unlock(&hci_cb_list_lock);

        /* Per-connection callback runs outside hci_cb_list_lock. */
        if (conn->connect_cfm_cb)
                conn->connect_cfm_cb(conn, status);
}

/*
 * Report a disconnect: call disconn_cfm on every hci_cb_list entry that
 * provides one (with hci_cb_list_lock held), passing @reason, then call
 * the connection's own disconn_cfm_cb, if set, after the lock is released.
 */
static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason)
{
        struct hci_cb *cb;

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(cb, &hci_cb_list, list) {
                if (cb->disconn_cfm)
                        cb->disconn_cfm(conn, reason);
        }
        mutex_unlock(&hci_cb_list_lock);

        /* Per-connection callback runs outside hci_cb_list_lock. */
        if (conn->disconn_cfm_cb)
                conn->disconn_cfm_cb(conn, reason);
}

/*
 * Report an authentication result.
 *
 * Nothing is reported while HCI_CONN_ENCRYPT_PEND is set in conn->flags.
 * Otherwise each security_cfm callback on hci_cb_list is called (under
 * hci_cb_list_lock) with @status and an encrypt value of 0x01 when
 * HCI_CONN_ENCRYPT is set, 0x00 when it is not.  The connection's own
 * security_cfm_cb, if set, is called last, outside the lock.
 */
static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status)
{
        struct hci_cb *hcb;
        __u8 enc_state = 0x00;

        if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags))
                return;

        if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
                enc_state = 0x01;

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(hcb, &hci_cb_list, list) {
                if (!hcb->security_cfm)
                        continue;

                hcb->security_cfm(conn, status, enc_state);
        }
        mutex_unlock(&hci_cb_list_lock);

        if (conn->security_cfm_cb)
                conn->security_cfm_cb(conn, status);
}

/*
 * Report an encryption-change result.
 *
 * While the connection is still in BT_CONFIG the result is treated as the
 * end of connection setup: on success the state moves to BT_CONNECTED,
 * hci_connect_cfm() is called, one hold is dropped and nothing else is
 * done.
 *
 * Otherwise the encrypt value handed to the security_cfm callbacks is
 *   0x00  HCI_CONN_ENCRYPT clear
 *   0x02  HCI_CONN_ENCRYPT and HCI_CONN_AES_CCM set
 *   0x01  HCI_CONN_ENCRYPT set, HCI_CONN_AES_CCM clear
 * and, on success, sec_level is raised from BT_SECURITY_SDP to
 * BT_SECURITY_LOW and then to pending_sec_level if that is higher.
 */
static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status)
{
        struct hci_cb *hcb;
        __u8 enc_state;

        if (conn->state == BT_CONFIG) {
                if (status == 0)
                        conn->state = BT_CONNECTED;

                hci_connect_cfm(conn, status);
                hci_conn_drop(conn);
                return;
        }

        if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) {
                if (test_bit(HCI_CONN_AES_CCM, &conn->flags))
                        enc_state = 0x02;
                else
                        enc_state = 0x01;
        } else {
                enc_state = 0x00;
        }

        if (status == 0) {
                if (conn->sec_level == BT_SECURITY_SDP)
                        conn->sec_level = BT_SECURITY_LOW;

                if (conn->pending_sec_level > conn->sec_level)
                        conn->sec_level = conn->pending_sec_level;
        }

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(hcb, &hci_cb_list, list) {
                if (!hcb->security_cfm)
                        continue;

                hcb->security_cfm(conn, status, enc_state);
        }
        mutex_unlock(&hci_cb_list_lock);

        /* Per-connection callback runs outside hci_cb_list_lock. */
        if (conn->security_cfm_cb)
                conn->security_cfm_cb(conn, status);
}

/*
 * Call key_change_cfm on every hci_cb_list entry that provides one, with
 * hci_cb_list_lock held for the duration of the walk.
 */
static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status)
{
        struct hci_cb *cb;

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(cb, &hci_cb_list, list) {
                if (cb->key_change_cfm)
                        cb->key_change_cfm(conn, status);
        }
        mutex_unlock(&hci_cb_list_lock);
}

/*
 * Call role_switch_cfm on every hci_cb_list entry that provides one,
 * passing @status and @role, with hci_cb_list_lock held for the duration
 * of the walk.
 */
static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status,
                                                                __u8 role)
{
        struct hci_cb *cb;

        mutex_lock(&hci_cb_list_lock);
        list_for_each_entry(cb, &hci_cb_list, list) {
                if (cb->role_switch_cfm)
                        cb->role_switch_cfm(conn, status, role);
        }
        mutex_unlock(&hci_cb_list_lock);
}

/*
 * Find the first field of a given type in a length-prefixed field list.
 *
 * @eir is read as a sequence of fields, each laid out as
 *     [len][type][len - 1 bytes of data]
 * where the leading length byte counts the type byte plus the data.
 *
 * @eir:      start of the buffer
 * @eir_len:  number of valid bytes in @eir
 * @type:     field type to look for
 * @data_len: if non-NULL, receives the data length on success only
 *
 * Returns a pointer to the field's data, or NULL when
 *   - the buffer is shorter than 2 bytes,
 *   - a zero length byte is reached (treated as end of data),
 *   - a field would extend past @eir_len,
 *   - the matching field carries no data (len == 1), or
 *   - no field of @type exists.
 */
static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type,
                                 size_t *data_len)
{
        size_t pos = 0;

        if (eir_len < 2)
                return NULL;

        /* A field needs at least a length byte and a type byte. */
        while (pos < eir_len - 1) {
                u8 *field = eir + pos;
                u8 len = field[0];
                size_t next = pos + len + 1;

                if (len == 0 || next > eir_len)
                        return NULL;

                if (field[1] == type) {
                        /* Zero length data */
                        if (len == 1)
                                return NULL;

                        if (data_len)
                                *data_len = len - 1;

                        return field + 2;
                }

                pos = next;
        }

        return NULL;
}

/*
 * True when @addr_type is ADDR_LE_DEV_RANDOM and the two most significant
 * bits of the address (bits 7:6 of b[5]) are 01.  @bdaddr is not
 * dereferenced for any other address type.
 */
static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type)
{
        return addr_type == ADDR_LE_DEV_RANDOM &&
               (bdaddr->b[5] & 0xc0) == 0x40;
}

/*
 * True when @addr_type is ADDR_LE_DEV_PUBLIC, or when the two most
 * significant bits of the address (bits 7:6 of b[5]) are both set, which
 * is the random static address pattern.  @addr is not dereferenced for a
 * public address.
 */
static inline bool hci_is_identity_address(bdaddr_t *addr, u8 addr_type)
{
        return addr_type == ADDR_LE_DEV_PUBLIC ||
               (addr->b[5] & 0xc0) == 0xc0;
}

/*
 * Return the result of hci_find_irk_by_rpa() for @bdaddr when
 * hci_bdaddr_is_rpa() accepts (@bdaddr, @addr_type); otherwise return
 * NULL without performing a lookup.
 */
static inline struct smp_irk *hci_get_irk(struct hci_dev *hdev,
                                          bdaddr_t *bdaddr, u8 addr_type)
{
        return hci_bdaddr_is_rpa(bdaddr, addr_type) ?
               hci_find_irk_by_rpa(hdev, bdaddr) : NULL;
}

/*
 * Validate a set of connection parameters.
 *
 * @min, @max:     interval bounds; must satisfy 6 <= min <= max <= 3200
 * @latency:       must be <= 499 and <= the derived max_latency
 * @to_multiplier: must be within 10..3200 and satisfy max < to_multiplier * 8
 *
 * Returns 0 when every check passes, -EINVAL on the first failing check.
 * Each failure logs one BT_WARN line; the checks run in the order written,
 * so that order decides which warning is emitted when several constraints
 * are violated at once.
 */
static inline int hci_check_conn_params(u16 min, u16 max, u16 latency,
                                        u16 to_multiplier)
{
        u16 max_latency;

        if (min > max) {
                BT_WARN("min %d > max %d", min, max);
                return -EINVAL;
        }

        if (min < 6) {
                BT_WARN("min %d < 6", min);
                return -EINVAL;
        }

        if (max > 3200) {
                BT_WARN("max %d > 3200", max);
                return -EINVAL;
        }

        if (to_multiplier < 10) {
                BT_WARN("to_multiplier %d < 10", to_multiplier);
                return -EINVAL;
        }

        if (to_multiplier > 3200) {
                BT_WARN("to_multiplier %d > 3200", to_multiplier);
                return -EINVAL;
        }

        if (max >= to_multiplier * 8) {
                BT_WARN("max %d >= to_multiplier %d * 8", max, to_multiplier);
                return -EINVAL;
        }

        /*
         * max >= min >= 6 at this point, so the division cannot be by zero.
         *
         * NOTE(review): when max > to_multiplier * 4 the integer quotient
         * is 0, and "0 - 1" stored into the u16 wraps to 65535.  The
         * "latency > max_latency" test below can then never fire, leaving
         * only the 499 cap in effect.  Confirm whether that is intended.
         */
        max_latency = (to_multiplier * 4 / max) - 1;
        if (latency > 499) {
                BT_WARN("latency %d > 499", latency);
                return -EINVAL;
        }

        if (latency > max_latency) {
                BT_WARN("latency %d > max_latency %d", latency, max_latency);
                return -EINVAL;
        }

        return 0;
}

/* Registration of struct hci_cb callback sets.  Declarations only. */
int hci_register_cb(struct hci_cb *hcb);
int hci_unregister_cb(struct hci_cb *hcb);

/*
 * Command submission.  The *_sync variants return an sk_buff and take a
 * timeout; the _ev variant additionally takes an event code.
 */
struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
                               const void *param, u32 timeout);
struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
                                  const void *param, u8 event, u32 timeout);
int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
                   const void *param);

/* Outbound command and data paths. */
int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
                 const void *param);
void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags);
void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb);

void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode);

struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
                             const void *param, u32 timeout);

u32 hci_conn_get_phy(struct hci_conn *conn);

/* ----- HCI Sockets ----- */
/*
 * Delivery of sk_buffs and events towards HCI sockets, channels and the
 * monitor.  Declarations only.
 */
void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb);
void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
                         int flag, struct sock *skip_sk);
void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb);
void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
                                 void *data, u16 data_len, ktime_t tstamp,
                                 int flag, struct sock *skip_sk);

void hci_sock_dev_event(struct hci_dev *hdev, int event);

/*
 * Single-bit HCI_MGMT_* flags (bits 0..4).
 * NOTE(review): presumably stored in hci_mgmt_handler.flags -- confirm.
 */
#define HCI_MGMT_VAR_LEN        BIT(0)
#define HCI_MGMT_NO_HDEV        BIT(1)
#define HCI_MGMT_UNTRUSTED        BIT(2)
#define HCI_MGMT_UNCONFIGURED        BIT(3)
#define HCI_MGMT_HDEV_OPTIONAL        BIT(4)

/*
 * One handler entry; struct hci_mgmt_chan points at an array of these.
 */
struct hci_mgmt_handler {
        /* Handler callback, given the socket, device and a data buffer. */
        int (*func) (struct sock *sk, struct hci_dev *hdev, void *data,
                     u16 data_len);
        /* NOTE(review): presumably the expected payload size -- confirm. */
        size_t data_len;
        /* NOTE(review): presumably a mask of HCI_MGMT_* bits -- confirm. */
        unsigned long flags;
};

/*
 * A management channel: a channel number plus a table of @handler_count
 * handlers, linked into a list through @list.
 */
struct hci_mgmt_chan {
        struct list_head list;
        unsigned short channel;
        size_t handler_count;
        const struct hci_mgmt_handler *handlers;
        /* Optional per-device init hook taking the socket and device. */
        void (*hdev_init) (struct sock *sk, struct hci_dev *hdev);
};

/* Register / unregister a management channel.  Declarations only. */
int hci_mgmt_chan_register(struct hci_mgmt_chan *c);
void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c);

/* Management interface */
/*
 * Discovery types are bit masks built from the BDADDR_* address-type
 * numbers: LE covers both LE address types, INTERLEAVED covers all three.
 */
#define DISCOV_TYPE_BREDR                (BIT(BDADDR_BREDR))
#define DISCOV_TYPE_LE                        (BIT(BDADDR_LE_PUBLIC) | \
                                         BIT(BDADDR_LE_RANDOM))
#define DISCOV_TYPE_INTERLEAVED                (BIT(BDADDR_BREDR) | \
                                         BIT(BDADDR_LE_PUBLIC) | \
                                         BIT(BDADDR_LE_RANDOM))

/* These LE scan and inquiry parameters were chosen according to LE General
 * Discovery Procedure specification.
 */
#define DISCOV_LE_SCAN_WIN                0x12
#define DISCOV_LE_SCAN_INT                0x12
#define DISCOV_LE_TIMEOUT                10240        /* msec */
#define DISCOV_INTERLEAVED_TIMEOUT        5120        /* msec */
#define DISCOV_INTERLEAVED_INQUIRY_LEN        0x04
#define DISCOV_BREDR_INQUIRY_LEN        0x08
#define DISCOV_LE_RESTART_DELAY                msecs_to_jiffies(200)        /* 200 msec, expressed in jiffies */
#define DISCOV_LE_FAST_ADV_INT_MIN     100     /* msec */
#define DISCOV_LE_FAST_ADV_INT_MAX     150     /* msec */

/*
 * Management-interface (mgmt_*) hooks.  Declarations only; no bodies are
 * provided at this point.
 */
void mgmt_fill_version_info(void *ver);
int mgmt_new_settings(struct hci_dev *hdev);
void mgmt_index_added(struct hci_dev *hdev);
void mgmt_index_removed(struct hci_dev *hdev);
void mgmt_set_powered_failed(struct hci_dev *hdev, int err);
void mgmt_power_on(struct hci_dev *hdev, int err);
void __mgmt_power_off(struct hci_dev *hdev);
void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key,
                       bool persistent);
/* Connection lifecycle notifications. */
void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn,
                           u32 flags, u8 *name, u8 name_len);
void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
                              u8 link_type, u8 addr_type, u8 reason,
                              bool mgmt_connected);
void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
                            u8 link_type, u8 addr_type, u8 status);
void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
                         u8 addr_type, u8 status);
/* PIN code, user confirmation and passkey exchanges. */
void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure);
void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                  u8 status);
void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                      u8 status);
int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
                              u8 link_type, u8 addr_type, u32 value,
                              u8 confirm_hint);
int mgmt_user_confirm_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                     u8 link_type, u8 addr_type, u8 status);
int mgmt_user_confirm_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                         u8 link_type, u8 addr_type, u8 status);
int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
                              u8 link_type, u8 addr_type);
int mgmt_user_passkey_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                     u8 link_type, u8 addr_type, u8 status);
int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
                                         u8 link_type, u8 addr_type, u8 status);
int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr,
                             u8 link_type, u8 addr_type, u32 passkey,
                             u8 entered);
void mgmt_auth_failed(struct hci_conn *conn, u8 status);
/* Completion notifications for setting changes. */
void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status);
void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status);
void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
                                    u8 status);
void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status);
/* Discovery results and state. */
void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status);
void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status);
void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
                       u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
                       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len);
void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
                      u8 addr_type, s8 rssi, u8 *name, u8 name_len);
void mgmt_discovering(struct hci_dev *hdev, u8 discovering);
void mgmt_suspending(struct hci_dev *hdev, u8 state);
void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr,
                   u8 addr_type);
bool mgmt_powering_down(struct hci_dev *hdev);
/* New key material and connection parameters. */
void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent);
void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent);
void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk,
                   bool persistent);
void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr,
                         u8 bdaddr_type, u8 store_hint, u16 min_interval,
                         u16 max_interval, u16 latency, u16 timeout);
void mgmt_smp_complete(struct hci_conn *conn, bool complete);
bool mgmt_get_connectable(struct hci_dev *hdev);
void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status);
void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status);
/* Advertising and PHY configuration. */
u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev);
void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev,
                            u8 instance);
void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
                              u8 instance);
int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);

/*
 * LE connection helpers.  Declarations only.  hci_le_conn_update() takes
 * the same four parameters that hci_check_conn_params() validates.
 */
u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
                      u16 to_multiplier);
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
                      __u8 ltk[16], __u8 key_size);

/* Writes through both @bdaddr and @bdaddr_type (both are out-pointers by type). */
void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
                               u8 *bdaddr_type);

/*
 * Air-mode field: the low two bits (mask 0x0003), with 0 = CVSD and
 * 3 = transparent.
 * NOTE(review): presumably applied to the __u16 'setting' passed to
 * hci_connect_sco() -- confirm.
 */
#define SCO_AIRMODE_MASK       0x0003
#define SCO_AIRMODE_CVSD       0x0000
#define SCO_AIRMODE_TRANSP     0x0003

#endif /* __HCI_CORE_H */



































































































































































































































































































































































































    1 




































































    1 


    1 





    1 
    1 



    1 


    1 

















































































































































































































































































































    1 
    1 
    1 






































































































































































































































































































































































































































    1 





















    1 



























































































    1 





    1 




    1 

    1 







    1 



    1 



    1 
    1 



    1 























































    1 
    1 
    1 

    1 














































































































































































    1 


    1 

    1 

    1 






    1 












    1 

    1 


    1 









    1 











    1 






    1 








    1 




    1 


    1 


    1 
    1 
    1 
    1 
    1 
    1 
    1 
    1 


    1 
    1 

    1 


    1 










    1 
    1 


    1 
    1 











    1 

    1 


    1 








    1 


    1 
    1 




































    1 








    1 





    1 













    1 


    1 
    1 



    1 

    1 
























    1 
    1 

    1 





    1 







    1 








    1 
    1 






    1 



    1 
    1 



    1 



    1 






































    1 








    1 
    1 
    1 


    1 







    1 
    1 


    1 



    1 










    1 
    1 
    1 
    1 


    1 










    1 













    1 







    1 
    1 


    1 


    1 









    1 

























    1 









    1 

    1 

















    1 


    1 













    1 


    1 



    1 




















































    1 




    1 
    1 

    1 



    1 



    1 

    1 


    1 




    1 


    1 


    1 




    1 























    1 








    1 

    1 

    1 


    1 

    1 

    1 





























    1 


















    1 
    1 










    1 











    1 


    1 












































    1 












    1 


    1 
    1 


    1 







    1 








    1 



    1 
    1 




    1 








    1 





    1 
    1 









    1 













    1 

    1 


    1 

    1 



    1 







    1 















    1 






    1 

    1 
    1 



    1 












    1 



























    1 









    1 


    1 













































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 

    1 










    1 




    1 







    1 























    1 









    1 


    1 





    1 








    1 


    1 



    1 


    1 



    1 





































































































































    1 
    1 




    1 




    1 








    1 




























    1 

    1 

    1 

    1 

    1 

    1 

    1 

    1 















    1 
















    1 
    1 


    1 






    1 






    1 






































    1 



















    1 





    1 












    1 
    1 

    1 











    1 















    1 















































    1 
















    1 
    1 
    1 
    1 










    1 































    1 
    1 




















    1 







    1 
    1 



    1 



    1 
    1 



    1 






    1 
    1 















    1 


    1 











    1 



    1 








    1 


















































































    1 






    1 


    1 


























    1 




    1 




    1 










    1 
































    1 






    1 
    1 
    1 
    1 





















































    1 

    1 













    1 







    1 





































































    1 


    1 





























































































































































































































































































































    1 






    1 


    1 

    1 































































































    1 












    1 
    1 




    1 





    1 





    1 












    1 















    1 



    1 
























    1 




























































































































    1 


















    1 















    1 

    1 


    1 















    1 






    1 




    1 




    1 

    1 

    1 

    1 
    1 













































































































































    1 




    1 






    1 




    1 
    1 





    1 





    1 





    1 





    1 






    1 






    1 





    1 


    1 







    1 












    1 
    1 
    1 

    1 
    1 




















    1 







    1 
    1 


    1 
    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * An allocation request asks for multiple blocks located near the
 * specified goal (block) value.
 *
 * During initialization phase of the allocator we decide to use the
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever
 * is larger. If the size is less than sbi->s_mb_stream_request we
 * select to use the group preallocation. The default value of
 * s_mb_stream_request is 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small file use group preallocation is to
 * ensure that we have small files closer together on the disk.
 *
 * First stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
 * spaces for this particular inode. The inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   ->  free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is used looking at the _logical_ start
 * block. If only the logical file block falls within the range of prealloc
 * space we will consume the particular prealloc space. This makes sure that
 * we have contiguous physical blocks representing the file blocks
 *
 * The important thing to be noted in case of inode prealloc space is that
 * we don't modify the values associated to inode prealloc space except
 * pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. These are per CPU prealloc list represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per cpu locality group is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 * mapped to the buddy and bitmap information regarding different
 * groups. The buddy information is attached to buddy cache inode so that
 * we can access them through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy.  This information consists
 * of the block bitmap and the buddy information, and is stored in the
 * inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 * blocksize) blocks.  So it can have information regarding groups_per_page
 * which is blocks_per_page/2
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding rest of the contiguous physical block available
 *
 * Before allocating blocks via the buddy cache we normalize the requested
 * block count. This ensures we ask for more blocks than we need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If the file system is mounted with the
 * -o stripe=<value> option, the group prealloc request is normalized to the
 * smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses buddy scan only if the request len is power of
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 * stripe size (sbi->s_stripe), we try to search for contiguous block in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunables min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicates how long the mballoc __must__ look for a best
 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * Both prealloc spaces are populated as described above. So for the first
 * request we will hit the buddy cache which will result in this prealloc
 * space getting filled. The prealloc space is then later used for the
 * subsequent request.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to specific locality group which does not translate to
 *    permanent set of inodes: inode can join and leave group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 *  to keep it simple, we don't use block numbers, instead we count number of
 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:                        buddy = on-disk + PAs
 *  - new PA:                                buddy += N; PA = N
 *  - use inode PA:                        on-disk += N; PA -= N
 *  - discard inode PA                        buddy -= on-disk - PA; PA = 0
 *  - use locality group PA                on-disk += N; PA -= N
 *  - discard locality group PA                buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 *        is used in real operation because we can't know actual used
 *        bits from PA, only from on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if buddy is referenced, it's already initialized
 *  2) while block is used in buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     a bit set and a PA claims the same block, it's OK. IOW, one can set a
 *     bit in the on-disk bitmap if the buddy has the same bit set and/or a
 *     PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - PA is referenced until block isn't marked in on-disk bitmap
 *  - PA changes only after on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case when we've used PA to emptiness. no need to modify buddy
 * in this case, but we should care about concurrent init
 *
 */

 /*
 * Logic in few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group        (group)
 *  - object (inode/locality)        (object)
 *  - per-pa lock                (pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *        pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *        pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *        pa
 *    group
 *        object
 *
 */
/*
 * Slab caches private to this file.
 * NOTE(review): judging by the names these back preallocation spaces,
 * allocation contexts and free-data records respectively -- confirm
 * against the kmem_cache_create() calls, which are outside this chunk.
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

/*
 * Slab names for block sizes 1k through 128k, NR_GRPINFO_CACHES entries.
 * Presumably indexed in parallel with ext4_groupinfo_caches[] -- verify
 * against the code that creates the caches.
 */
static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

/* Forward declarations of static helpers that are used before they are defined. */
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

/*
 * The algorithm using this percpu seq counter goes below:
 * 1. We sample the percpu discard_pa_seq counter before trying for block
 *    allocation in ext4_mb_new_blocks().
 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 *    or free these blocks i.e. while marking those blocks as used/free in
 *    mb_mark_used()/mb_free_blocks().
 * 3. We also increment this percpu seq counter when we successfully identify
 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 *    of those PAs inside ext4_mb_discard_group_preallocations().
 *
 * Now to make sure that the regular fast path of block allocation is not
 * affected, as a small optimization we only sample the percpu seq counter
 * on that cpu. Only when the block allocation fails and when freed blocks
 * found were 0, that is when we sample percpu seq counter for all cpus using
 * below function ext4_get_discard_pa_seq_sum(). This happens after making
 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
 */
static DEFINE_PER_CPU(u64, discard_pa_seq);
static inline u64 ext4_get_discard_pa_seq_sum(void)
{
        int __cpu;
        u64 __seq = 0;

        for_each_possible_cpu(__cpu)
                __seq += per_cpu(discard_pa_seq, __cpu);
        return __seq;
}

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
        *bit += ((unsigned long) addr & 7UL) << 3;
        addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
        *bit += ((unsigned long) addr & 3UL) << 3;
        addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
        return addr;
}

/*
 * Test bit @bit of the bitmap at @addr, which need not be long-aligned.
 * Returns whatever ext4_test_bit() returns for the corrected pair.
 */
static inline int mb_test_bit(int bit, void *addr)
{
        void *aligned;

        /*
         * ext4_test_bit on architecture like powerpc
         * needs unsigned long aligned address
         */
        aligned = mb_correct_addr_and_bit(&bit, addr);

        return ext4_test_bit(bit, aligned);
}

/* Set bit @bit of the bitmap at @addr after aligning the address. */
static inline void mb_set_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        ext4_set_bit(bit, aligned);
}

/* Clear bit @bit of the bitmap at @addr after aligning the address. */
static inline void mb_clear_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        ext4_clear_bit(bit, aligned);
}

/*
 * Clear bit @bit of the bitmap at @addr after aligning the address.
 * Returns the result of ext4_test_and_clear_bit().
 */
static inline int mb_test_and_clear_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        return ext4_test_and_clear_bit(bit, aligned);
}

/*
 * Search the bitmap at @addr (not necessarily long-aligned) for a zero
 * bit, starting at @start and bounded by @max, via
 * ext4_find_next_zero_bit().
 *
 * The result is expressed relative to the original @addr and is clamped
 * so that it never exceeds @max.
 */
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
        int shift = 0;
        int found;
        void *base = mb_correct_addr_and_bit(&shift, addr);

        /* search in the shifted index space, then translate back */
        found = ext4_find_next_zero_bit(base, max + shift, start + shift) - shift;

        return found > max ? max : found;
}

/*
 * Search the bitmap at @addr (not necessarily long-aligned) for a set
 * bit, starting at @start and bounded by @max, via ext4_find_next_bit().
 *
 * The result is expressed relative to the original @addr and is clamped
 * so that it never exceeds @max.
 */
static inline int mb_find_next_bit(void *addr, int max, int start)
{
        int shift = 0;
        int found;
        void *base = mb_correct_addr_and_bit(&shift, addr);

        /* search in the shifted index space, then translate back */
        found = ext4_find_next_bit(base, max + shift, start + shift) - shift;

        return found > max ? max : found;
}

/*
 * Locate the bitmap describing chunks of the given @order for @e4b.
 *
 * On return *@max holds the number of bits in that bitmap.
 * Returns:
 *  - NULL (and *@max = 0) when @order exceeds bd_blkbits + 1;
 *  - e4b->bd_bitmap when @order is 0;
 *  - otherwise bd_buddy offset by s_mb_offsets[order], with *@max taken
 *    from s_mb_maxs[order].
 */
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(max == NULL);

        /* no map exists above order bd_blkbits + 1 */
        if (order > e4b->bd_blkbits + 1) {
                *max = 0;
                return NULL;
        }

        if (order != 0) {
                *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
                return e4b->bd_buddy +
                        EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
        }

        /* at order 0 we see each particular block */
        *max = 1 << (e4b->bd_blkbits + 3);
        return e4b->bd_bitmap;
}

#ifdef DOUBLE_CHECK
/*
 * DOUBLE_CHECK helper: mirror a free of @count bits starting at @first in
 * the group's shadow bitmap (bd_info->bb_bitmap).
 *
 * A bit that is already clear is reported through
 * ext4_grp_locked_error() and the group's block bitmap is marked
 * corrupted; the bit is cleared in every case.  Does nothing when no
 * shadow bitmap exists.  The group lock must be held (asserted).
 * @inode may be NULL, in which case inode number 0 is reported.
 */
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                           int first, int count)
{
        struct super_block *sb = e4b->bd_sb;
        int i;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));

        for (i = 0; i < count; i++) {
                int bit = first + i;

                if (!mb_test_bit(bit, e4b->bd_info->bb_bitmap)) {
                        /* absolute block number of the offending bit */
                        ext4_fsblk_t blocknr =
                                ext4_group_first_block_no(sb, e4b->bd_group);

                        blocknr += EXT4_C2B(EXT4_SB(sb), bit);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                              inode ? inode->i_ino : 0,
                                              blocknr,
                                              "freeing block already freed "
                                              "(bit %u)",
                                              bit);
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                }
                mb_clear_bit(bit, e4b->bd_info->bb_bitmap);
        }
}

/*
 * DOUBLE_CHECK helper: mirror an allocation of @count bits starting at
 * @first in the group's shadow bitmap.  BUG()s if any of those bits is
 * already set.  Does nothing when no shadow bitmap exists.  The group
 * lock must be held (asserted).
 */
static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
        int i;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));

        for (i = 0; i < count; i++) {
                int bit = first + i;

                BUG_ON(mb_test_bit(bit, e4b->bd_info->bb_bitmap));
                mb_set_bit(bit, e4b->bd_info->bb_bitmap);
        }
}

/*
 * DOUBLE_CHECK helper: compare the group's shadow bitmap with @bitmap
 * over s_blocksize bytes.  On the first differing byte, log its index and
 * both values, then BUG().  Does nothing when no shadow bitmap exists or
 * when the two bitmaps are identical.
 */
static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
        unsigned char *shadow, *actual;
        int i;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        /* fast path: nothing differs */
        if (!memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize))
                return;

        shadow = (unsigned char *) e4b->bd_info->bb_bitmap;
        actual = (unsigned char *) bitmap;
        for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                if (shadow[i] == actual[i])
                        continue;
                ext4_msg(e4b->bd_sb, KERN_ERR,
                         "corruption in group %u "
                         "at byte %u(%u): %x in copy != %x "
                         "on disk/prealloc",
                         e4b->bd_group, i, i * 8, shadow[i], actual[i]);
                BUG();
        }
}

/*
 * DOUBLE_CHECK helper: allocate grp->bb_bitmap (s_blocksize bytes,
 * GFP_NOFS) and fill it with a copy of the block bitmap of @group.
 *
 * Failure is silent: if the allocation fails, or if
 * ext4_read_block_bitmap() yields an error or NULL, grp->bb_bitmap is
 * left NULL.
 */
static void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
        struct buffer_head *bh;

        grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
        if (grp->bb_bitmap == NULL)
                return;

        bh = ext4_read_block_bitmap(sb, group);
        if (!IS_ERR_OR_NULL(bh)) {
                memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
                put_bh(bh);
                return;
        }

        /* could not read the bitmap: drop the shadow copy */
        kfree(grp->bb_bitmap);
        grp->bb_bitmap = NULL;
}

/*
 * DOUBLE_CHECK helper: kfree() the group's shadow bitmap.  The
 * grp->bb_bitmap field itself is not reset.
 */
static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
        void *shadow = grp->bb_bitmap;

        kfree(shadow);
}

#else
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_free_blocks_double(struct inode *inode,
                                struct ext4_buddy *e4b, int first, int count)
{
}
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_mark_used_double(struct ext4_buddy *e4b,
                                                int first, int count)
{
}
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
}

/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
}

/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
}
#endif

#ifdef AGGRESSIVE_CHECK

/*
 * Assertion used by the buddy checker: when @cond is false, print the
 * failing expression text together with the caller's location, then
 * BUG().  The expansion refers to the identifiers `function', `file' and
 * `line', which must be in scope wherever the macro is used.
 */
#define MB_CHECK_ASSERT(cond)                                                \
do {                                                                        \
        if (!(cond)) {                                                        \
                printk(KERN_EMERG                                        \
                        "Assertion failure in %s() at %s:%d: \"%s\"\n",        \
                        function, file, line, #cond);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

/*
 * Perform buddy integrity check with the following steps.  In every map a
 * clear bit (0) means "free at this order"; a set bit (1) means the chunk
 * is not available as a whole at this order.
 *
 * The check only runs when bb_check_counter is a multiple of 10 on entry;
 * the counter is incremented on every call.
 *
 * 1. Top-down validation (each order from bd_blkbits + 1 down to 2 is
 *    compared against the order below it; the order-0 bitmap is not
 *    involved):
 *    - If a bit is set at the higher order, at most one of the two
 *      corresponding lower-order bits may be clear (free).
 *    - The number of clear bits at the higher order must equal
 *      bb_counters[order].
 *
 * 2. Order-0 (bitmap) validation:
 *    - Every clear bit must be at or after bb_first_free, and the number
 *      of runs of clear bits must equal bb_fragments.
 *    - For each pair of bits (i, i + 1) with i even:
 *      - If either bit is set (1, allocated), then none of the
 *        corresponding higher-order bits may be clear (free).
 *      - If both bits are clear (0, free), then exactly one of the
 *        corresponding higher-order bits must be clear (free).
 *    - The group must not be flagged as needing initialization.
 *
 * 3. Preallocation (pa) list validation:
 *    For each pa with a non-zero pa_len on the group's bb_prealloc_list:
 *    - Verify that pa_pstart maps to this block group.
 *    - Ensure the corresponding pa_len bits in the order-0 bitmap are
 *      marked as allocated (1).
 *
 * @file, @function and @line identify the caller and are consumed by
 * MB_CHECK_ASSERT when reporting a failure.
 */
static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
                                const char *function, int line)
{
        struct super_block *sb = e4b->bd_sb;
        int order = e4b->bd_blkbits + 1;
        int max;
        int max2;
        int i;
        int j;
        int k;
        int count;
        struct ext4_group_info *grp;
        int fragments = 0;
        int fstart;
        struct list_head *cur;
        void *buddy;
        void *buddy2;

        /* throttle: do the full check only on every 10th call */
        if (e4b->bd_info->bb_check_counter++ % 10)
                return;

        /* step 1: compare each order with the one directly below it */
        while (order > 1) {
                buddy = mb_find_buddy(e4b, order, &max);
                MB_CHECK_ASSERT(buddy);
                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
                MB_CHECK_ASSERT(buddy2);
                MB_CHECK_ASSERT(buddy != buddy2);
                MB_CHECK_ASSERT(max * 2 == max2);

                count = 0;
                for (i = 0; i < max; i++) {

                        if (mb_test_bit(i, buddy)) {
                                /* at most one of the two buddy2 bits may be 0 */
                                if (!mb_test_bit(i << 1, buddy2)) {
                                        MB_CHECK_ASSERT(
                                                mb_test_bit((i<<1)+1, buddy2));
                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
                                        MB_CHECK_ASSERT(
                                                mb_test_bit(i << 1, buddy2));
                                }
                                continue;
                        }

                        /* clear bit: one free chunk of this order */
                        count++;
                }
                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
                order--;
        }

        /* step 2: walk the order-0 bitmap */
        fstart = -1;
        buddy = mb_find_buddy(e4b, 0, &max);
        for (i = 0; i < max; i++) {
                if (!mb_test_bit(i, buddy)) {
                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
                        /* first clear bit after a set one starts a new fragment */
                        if (fstart == -1) {
                                fragments++;
                                fstart = i;
                        }
                } else {
                        fstart = -1;
                }
                /* on even bits, check the pair (i, i + 1) against all higher orders */
                if (!(i & 1)) {
                        int in_use, zero_bit_count = 0;

                        in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy);
                        for (j = 1; j < e4b->bd_blkbits + 2; j++) {
                                buddy2 = mb_find_buddy(e4b, j, &max2);
                                k = i >> j;
                                MB_CHECK_ASSERT(k < max2);
                                if (!mb_test_bit(k, buddy2))
                                        zero_bit_count++;
                        }
                        /* 0 free ancestors if in use, exactly 1 otherwise */
                        MB_CHECK_ASSERT(zero_bit_count == !in_use);
                }
        }
        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

        /* step 3: every preallocation must be marked used in the bitmap */
        grp = ext4_get_group_info(sb, e4b->bd_group);
        if (!grp)
                return;
        list_for_each(cur, &grp->bb_prealloc_list) {
                ext4_group_t groupnr;
                struct ext4_prealloc_space *pa;
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                if (!pa->pa_len)
                        continue;
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
                for (i = 0; i < pa->pa_len; i++)
                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
        }
}
#undef MB_CHECK_ASSERT
/* With AGGRESSIVE_CHECK: run the buddy checker, passing the call site's location. */
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,        \
                                        __FILE__, __func__, __LINE__)
#else
/* Without AGGRESSIVE_CHECK: mb_check_buddy() expands to nothing. */
#define mb_check_buddy(e4b)
#endif

/*
 * Split the free range [@first, @first + @len) into naturally aligned
 * power-of-two chunks and record each chunk in the buddy data.
 *
 * For every chunk, bb_counters[] of the chunk's order is incremented.
 * Chunks of order > 0 additionally get their bit cleared in the buddy
 * bitmap of that order, which lives at @buddy + s_mb_offsets[order].
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
                                        struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        /* OR-ing this in caps the alignment order at s_blocksize_bits + 1 */
        unsigned int border = 2 << sb->s_blocksize_bits;
        ext4_grpblk_t chunk;

        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));

        for (; len > 0; first += chunk, len -= chunk) {
                /* largest order permitted by the alignment of @first */
                ext4_grpblk_t align_order = ffs(first | border) - 1;
                /* largest order that still fits into the remaining length */
                ext4_grpblk_t order = fls(len) - 1;

                if (align_order < order)
                        order = align_order;
                chunk = 1 << order;

                /* every chunk is counted ... */
                grp->bb_counters[order]++;
                /* ... but only multiblock chunks have a buddy bit to clear */
                if (order > 0)
                        mb_clear_bit(first >> order,
                                     buddy + sbi->s_mb_offsets[order]);
        }
}

/*
 * Recompute grp->bb_largest_free_order: the highest order whose
 * bb_counters[] entry is positive, or -1 when no order has a free chunk.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
        int order;

        grp->bb_largest_free_order = -1; /* uninit */

        /* walk from the highest tracked order down to order 0 */
        for (order = sb->s_blocksize_bits + 1; order >= 0; order--) {
                if (grp->bb_counters[order] <= 0)
                        continue;
                grp->bb_largest_free_order = order;
                return;
        }
}

/*
 * Build the buddy data of @group from @bitmap.
 *
 * Walks every run of zero bits in @bitmap, feeds runs longer than one bit
 * to ext4_mb_mark_free_simple() and counts single-bit runs in
 * bb_counters[0].  Updates bb_first_free, bb_fragments and the largest
 * free order, clears the NEED_INIT state bit, and accounts the time spent
 * in the per-superblock statistics.
 *
 * If the number of free bits found differs from grp->bb_free, an error is
 * reported, bb_free is overwritten with the bitmap-derived value and the
 * group's block bitmap is marked corrupted.
 */
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
                            void *buddy, void *bitmap, ext4_group_t group,
                            struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t nr_clusters = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t pos;
        unsigned nr_free = 0;
        unsigned nr_frags = 0;
        unsigned long long cycles = get_cycles();

        /*
         * @bitmap is the aggregation of the on-disk bitmap and the
         * preallocations; start at its first free bit.
         */
        pos = mb_find_next_zero_bit(bitmap, nr_clusters, 0);
        grp->bb_first_free = pos;

        while (pos < nr_clusters) {
                ext4_grpblk_t run_start = pos;
                ext4_grpblk_t run_len;

                /* the free run ends at the next set bit (or at the end) */
                pos = mb_find_next_bit(bitmap, nr_clusters, pos);
                run_len = pos - run_start;

                nr_frags++;
                nr_free += run_len;

                if (run_len <= 1)
                        grp->bb_counters[0]++;
                else
                        ext4_mb_mark_free_simple(sb, buddy, run_start,
                                                 run_len, grp);

                /* skip the in-use bits up to the start of the next run */
                if (pos < nr_clusters)
                        pos = mb_find_next_zero_bit(bitmap, nr_clusters, pos);
        }
        grp->bb_fragments = nr_frags;

        if (nr_free != grp->bb_free) {
                ext4_grp_locked_error(sb, group, 0, 0,
                                      "block bitmap and bg descriptor "
                                      "inconsistent: %u vs %u free clusters",
                                      nr_free, grp->bb_free);
                /*
                 * If we intend to continue, we consider the group
                 * descriptor corrupt and trust the bitmap value instead.
                 */
                grp->bb_free = nr_free;
                ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        }
        mb_set_largest_free_order(sb, grp);

        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

        cycles = get_cycles() - cycles;
        atomic_inc(&sbi->s_mb_buddies_generated);
        atomic64_add(cycles, &sbi->s_mb_generation_time);
}

static void mb_regenerate_buddy(struct ext4_buddy *e4b)
{
        int count;
        int order = 1;
        void *buddy;

        while ((buddy = mb_find_buddy(e4b, order++, &count)))
                ext4_set_bits(buddy, 0, count);

        e4b->bd_info->bb_fragments = 0;
        memset(e4b->bd_info->bb_counters, 0,
                sizeof(*e4b->bd_info->bb_counters) *
                (e4b->bd_sb->s_blocksize_bits + 2));

        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
                e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
}

/*
 * The buddy information is attached to the buddy cache inode for
 * convenience.  The information for each group is loaded via
 * ext4_mb_load_buddy and consists of the block bitmap and the buddy
 * data, stored in the inode as
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
 * So it can have information regarding groups_per_page which
 * is blocks_per_page/2
 *
 * @page:   buddy cache page to fill
 * @incore: in-core bitmap to generate the buddy from when the first block
 *          handled on @page is a buddy block; NULL when @page starts with
 *          a bitmap block
 * @gfp:    allocation flags for the temporary buffer_head array
 *
 * Returns 0 on success or a negative errno.  On the paths that do not
 * jump to "out" the page is marked uptodate.
 *
 * Locking note:  This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */

static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
        ext4_group_t ngroups;
        int blocksize;
        int blocks_per_page;
        int groups_per_page;
        int err = 0;
        int i;
        ext4_group_t first_group, group;
        int first_block;
        struct super_block *sb;
        /*
         * Single-slot array used when the page covers only one group.
         * It must start out NULL: the read loop below can leave the slot
         * untouched (group beyond ngroups, or missing group info), and the
         * later loops and the brelse() at "out" all inspect it.
         */
        struct buffer_head *bhs = NULL;
        struct buffer_head **bh = NULL;
        struct inode *inode;
        char *data;
        char *bitmap;
        struct ext4_group_info *grinfo;

        inode = page->mapping->host;
        sb = inode->i_sb;
        ngroups = ext4_get_groups_count(sb);
        blocksize = i_blocksize(inode);
        blocks_per_page = PAGE_SIZE / blocksize;

        mb_debug(sb, "init page %lu\n", page->index);

        groups_per_page = blocks_per_page >> 1;
        if (groups_per_page == 0)
                groups_per_page = 1;

        /* allocate buffer_heads to read bitmaps */
        if (groups_per_page > 1) {
                i = sizeof(struct buffer_head *) * groups_per_page;
                bh = kzalloc(i, gfp);
                if (bh == NULL) {
                        err = -ENOMEM;
                        goto out;
                }
        } else
                bh = &bhs;

        first_group = page->index * blocks_per_page / 2;

        /* read all groups the page covers into the cache */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                if (group >= ngroups)
                        break;

                grinfo = ext4_get_group_info(sb, group);
                if (!grinfo)
                        continue;
                /*
                 * If page is uptodate then we came here after online resize
                 * which added some new uninitialized group info structs, so
                 * we must skip all initialized uptodate buddies on the page,
                 * which may be currently in use by an allocating task.
                 */
                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
                        bh[i] = NULL;
                        continue;
                }
                bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
                if (IS_ERR(bh[i])) {
                        err = PTR_ERR(bh[i]);
                        /* never hand an ERR_PTR to brelse() at "out" */
                        bh[i] = NULL;
                        goto out;
                }
                mb_debug(sb, "read bitmap for group %u\n", group);
        }

        /* wait for I/O completion; remember only the first error */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                int err2;

                if (!bh[i])
                        continue;
                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
                if (!err)
                        err = err2;
        }

        first_block = page->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
                group = (first_block + i) >> 1;
                if (group >= ngroups)
                        break;

                if (!bh[group - first_group])
                        /* skip initialized uptodate buddy */
                        continue;

                if (!buffer_verified(bh[group - first_group]))
                        /* Skip faulty bitmaps */
                        continue;
                /* at least one usable bitmap: drop any earlier wait error */
                err = 0;

                /*
                 * data carries the information for this particular
                 * group in the format specified above
                 */
                data = page_address(page) + (i * blocksize);
                bitmap = bh[group - first_group]->b_data;

                /*
                 * We place the buddy block and bitmap block
                 * close together: even blocks hold the bitmap,
                 * odd blocks hold the buddy.
                 */
                if ((first_block + i) & 1) {
                        /* this is block of buddy */
                        BUG_ON(incore == NULL);
                        mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
                        trace_ext4_mb_buddy_bitmap_load(sb, group);
                        grinfo = ext4_get_group_info(sb, group);
                        if (!grinfo) {
                                err = -EFSCORRUPTED;
                                goto out;
                        }
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
                               sizeof(*grinfo->bb_counters) *
                                (sb->s_blocksize_bits+2));
                        /*
                         * incore got set to the group block bitmap below
                         * (or was passed in by the caller)
                         */
                        ext4_lock_group(sb, group);
                        /* init the buddy */
                        memset(data, 0xff, blocksize);
                        ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
                        ext4_unlock_group(sb, group);
                        incore = NULL;
                } else {
                        /* this is block of bitmap */
                        BUG_ON(incore != NULL);
                        mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
                        trace_ext4_mb_bitmap_load(sb, group);

                        /* see comments in ext4_mb_put_pa() */
                        ext4_lock_group(sb, group);
                        memcpy(data, bitmap, blocksize);

                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
                        ext4_mb_generate_from_freelist(sb, data, group);
                        ext4_unlock_group(sb, group);

                        /* set incore so that the buddy information can be
                         * generated using this
                         */
                        incore = data;
                }
        }
        SetPageUptodate(page);

out:
        if (bh) {
                for (i = 0; i < groups_per_page; i++)
                        brelse(bh[i]);
                if (bh != &bhs)
                        kfree(bh);
        }
        return err;
}

/*
 * Get and lock the buddy cache page(s) of @group.  This makes sure no
 * parallel init_group on the same buddy page happens while the page lock
 * is held.
 *
 * The pages are returned through @e4b.  When bitmap and buddy share a
 * page, e4b->bd_buddy_page stays NULL.  Returns 0 on success or -ENOMEM
 * when a page could not be obtained; in the latter case a bitmap page that
 * was already obtained is still recorded in @e4b.
 */
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
        struct inode *cache = EXT4_SB(sb)->s_buddy_cache;
        int per_page = PAGE_SIZE / sb->s_blocksize;
        /*
         * The buddy cache inode stores the block bitmap and the buddy
         * information in consecutive blocks, so each group needs two
         * blocks; this is the first (bitmap) one.
         */
        int blk = group * 2;
        struct page *pg;

        e4b->bd_buddy_page = NULL;
        e4b->bd_bitmap_page = NULL;

        pg = find_or_create_page(cache->i_mapping, blk / per_page, gfp);
        if (!pg)
                return -ENOMEM;
        BUG_ON(pg->mapping != cache->i_mapping);
        e4b->bd_bitmap_page = pg;
        e4b->bd_bitmap = page_address(pg) +
                         ((blk % per_page) * sb->s_blocksize);

        /* buddy and bitmap are on the same page */
        if (per_page >= 2)
                return 0;

        /* one block per page: the buddy block lives on the next page */
        blk++;
        pg = find_or_create_page(cache->i_mapping, blk / per_page, gfp);
        if (!pg)
                return -ENOMEM;
        BUG_ON(pg->mapping != cache->i_mapping);
        e4b->bd_buddy_page = pg;
        return 0;
}

/*
 * Unlock and release the pages recorded in @e4b by
 * ext4_mb_get_buddy_page_lock(): first the bitmap page, then the buddy
 * page.  Either pointer may be NULL and is then skipped.
 */
static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
        struct page *held[] = { e4b->bd_bitmap_page, e4b->bd_buddy_page };
        int idx;

        for (idx = 0; idx < 2; idx++) {
                if (!held[idx])
                        continue;
                unlock_page(held[idx]);
                put_page(held[idx]);
        }
}

/*
 * Initialise the buddy cache of @group if it still needs initialisation.
 *
 * Returns 0 on success (including the case where the group turned out to
 * be initialised already), -EFSCORRUPTED when the group has no group info,
 * -EIO when a page is not uptodate after initialisation, or the error
 * returned by the page lookup / ext4_mb_init_cache().
 *
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
        struct ext4_group_info *this_grp;
        struct ext4_buddy e4b;
        int ret = 0;

        might_sleep();
        mb_debug(sb, "init group %u\n", group);
        this_grp = ext4_get_group_info(sb, group);
        if (!this_grp)
                return -EFSCORRUPTED;

        /*
         * This ensures that we don't reinit the buddy cache
         * page which maps to the group from which we are already
         * allocating. If we are looking at the buddy cache we would
         * have taken a reference using ext4_mb_load_buddy and that
         * would have pinned the buddy page to the page cache.
         * The call to ext4_mb_get_buddy_page_lock will mark the
         * page accessed.
         */
        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
        if (ret)
                goto out_unlock;
        /* somebody initialized the group meanwhile: nothing to do */
        if (!EXT4_MB_GRP_NEED_INIT(this_grp))
                goto out_unlock;

        /* init the page holding the bitmap block */
        ret = ext4_mb_init_cache(e4b.bd_bitmap_page, NULL, gfp);
        if (!ret && !PageUptodate(e4b.bd_bitmap_page))
                ret = -EIO;
        /*
         * Stop on error.  Also stop when bitmap and buddy share a page:
         * the call above has then covered the buddy block as well.
         */
        if (ret || e4b.bd_buddy_page == NULL)
                goto out_unlock;

        /* init the separate page holding the buddy block */
        ret = ext4_mb_init_cache(e4b.bd_buddy_page, e4b.bd_bitmap, gfp);
        if (!ret && !PageUptodate(e4b.bd_buddy_page))
                ret = -EIO;

out_unlock:
        ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
}

/*
 * Look up (and, if needed, initialise) the buddy cache pages of @group and
 * describe them in @e4b.
 *
 * On success returns 0 with e4b->bd_bitmap_page / bd_buddy_page set to the
 * pages obtained here and bd_bitmap / bd_buddy pointing at the group's
 * blocks inside them.  On failure returns -EFSCORRUPTED (no group info),
 * -ENOMEM (page could not be obtained), -EIO (page not uptodate) or the
 * error from ext4_mb_init_group() / ext4_mb_init_cache(); the pages
 * obtained so far are put and bd_bitmap / bd_buddy are reset to NULL.
 *
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
                       struct ext4_buddy *e4b, gfp_t gfp)
{
        int blocks_per_page;
        int block;
        int pnum;
        int poff;
        struct page *page;
        int ret;
        struct ext4_group_info *grp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct inode *inode = sbi->s_buddy_cache;

        might_sleep();
        mb_debug(sb, "load group %u\n", group);

        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        grp = ext4_get_group_info(sb, group);
        if (!grp)
                return -EFSCORRUPTED;

        e4b->bd_blkbits = sb->s_blocksize_bits;
        e4b->bd_info = grp;
        e4b->bd_sb = sb;
        e4b->bd_group = group;
        e4b->bd_buddy_page = NULL;
        e4b->bd_bitmap_page = NULL;

        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                /*
                 * we need full data about the group
                 * to make a good selection
                 */
                ret = ext4_mb_init_group(sb, group, gfp);
                if (ret)
                        return ret;
        }

        /*
         * the buddy cache inode stores the block bitmap
         * and buddy information in consecutive blocks.
         * So for each group we need two blocks.
         */
        block = group * 2;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        /* we could use find_or_create_page(), but it locks page
         * what we'd like to avoid in fast path ... */
        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
        if (page == NULL || !PageUptodate(page)) {
                if (page)
                        /*
                         * drop the page reference and try
                         * to get the page with lock. If we
                         * are not uptodate that implies
                         * somebody just created the page but
                         * is yet to initialize the same. So
                         * wait for it to initialize.
                         */
                        put_page(page);
                page = find_or_create_page(inode->i_mapping, pnum, gfp);
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
                        if (!PageUptodate(page)) {
                                /* page starts with a bitmap block: no incore */
                                ret = ext4_mb_init_cache(page, NULL, gfp);
                                if (ret) {
                                        unlock_page(page);
                                        goto err;
                                }
                                mb_cmp_bitmaps(e4b, page_address(page) +
                                               (poff * sb->s_blocksize));
                        }
                        unlock_page(page);
                }
        }
        if (page == NULL) {
                ret = -ENOMEM;
                goto err;
        }
        if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }

        /* Pages marked accessed already */
        e4b->bd_bitmap_page = page;
        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);

        /* second block of the group: the buddy block */
        block++;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
        if (page == NULL || !PageUptodate(page)) {
                if (page)
                        put_page(page);
                page = find_or_create_page(inode->i_mapping, pnum, gfp);
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
                        if (!PageUptodate(page)) {
                                /* generate the buddy from the bitmap loaded above */
                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
                                                         gfp);
                                if (ret) {
                                        unlock_page(page);
                                        goto err;
                                }
                        }
                        unlock_page(page);
                }
        }
        if (page == NULL) {
                ret = -ENOMEM;
                goto err;
        }
        if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }

        /* Pages marked accessed already */
        e4b->bd_buddy_page = page;
        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);

        return 0;

err:
        /*
         * @page is the page being worked on when the failure happened (it
         * has not been stored in @e4b yet); bd_bitmap_page, if set, is the
         * page recorded by the first half of this function.
         */
        if (page)
                put_page(page);
        if (e4b->bd_bitmap_page)
                put_page(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
                put_page(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
        return ret;
}

/* Load the buddy of @group into @e4b using GFP_NOFS allocations. */
static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                              struct ext4_buddy *e4b)
{
        const gfp_t gfp = GFP_NOFS;

        return ext4_mb_load_buddy_gfp(sb, group, e4b, gfp);
}

/*
 * Put the bitmap and buddy pages recorded in @e4b; a NULL page pointer
 * is skipped.
 */
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
        struct page *bitmap_page = e4b->bd_bitmap_page;
        struct page *buddy_page = e4b->bd_buddy_page;

        if (bitmap_page)
                put_page(bitmap_page);
        if (buddy_page)
                put_page(buddy_page);
}


/*
 * Return the lowest order >= 1 whose buddy bitmap has a clear bit covering
 * @block, or 0 when no such order exists.
 *
 * The per-order bitmaps are laid out back to back starting at
 * e4b->bd_buddy; the first one is 1 << (bd_blkbits - 1) bytes long and
 * each following one is half the size of its predecessor.
 */
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
        const int max_order = e4b->bd_blkbits + 1;
        int stride = 1 << (e4b->bd_blkbits - 1);
        void *level = e4b->bd_buddy;
        int order;

        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

        for (order = 1; order <= max_order; order++) {
                /* index of the chunk containing the block at this order */
                block >>= 1;
                if (!mb_test_bit(block, level)) {
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
                level += stride;
                stride >>= 1;
        }
        return 0;
}

/*
 * Clear @len bits of bitmap @bm starting at bit @cur.  Runs that start on
 * a 32-bit boundary and span at least 32 bits are cleared a word at a
 * time; everything else goes bit by bit.
 */
static void mb_clear_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        __u32 *word;

        while (cur < end) {
                if ((cur & 31) != 0 || (end - cur) < 32) {
                        /* unaligned position or short remainder */
                        mb_clear_bit(cur, bm);
                        cur++;
                        continue;
                }
                /* fast path: clear whole word at once */
                word = bm + (cur >> 3);
                *word = 0;
                cur += 32;
        }
}

/*
 * Clear @len bits of bitmap @bm starting at bit @cur.
 *
 * Returns the position of the first bit in the range that was already
 * zero before clearing, or -1 if every bit in the range was set.
 */
static int mb_test_and_clear_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        int first_zero = -1;
        __u32 *word;

        while (cur < end) {
                if ((cur & 31) != 0 || (end - cur) < 32) {
                        /* unaligned position or short remainder */
                        if (!mb_test_and_clear_bit(cur, bm) && first_zero == -1)
                                first_zero = cur;
                        cur++;
                        continue;
                }
                /* fast path: clear whole word at once */
                word = bm + (cur >> 3);
                if (*word != (__u32)(-1) && first_zero == -1)
                        first_zero = cur + mb_find_next_zero_bit(word, 32, 0);
                *word = 0;
                cur += 32;
        }

        return first_zero;
}

/*
 * Set @len bits of bitmap @bm starting at bit @cur.  Runs that start on a
 * 32-bit boundary and span at least 32 bits are set a word at a time;
 * everything else goes bit by bit.
 */
void ext4_set_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        __u32 *word;

        while (cur < end) {
                if ((cur & 31) != 0 || (end - cur) < 32) {
                        /* unaligned position or short remainder */
                        mb_set_bit(cur, bm);
                        cur++;
                        continue;
                }
                /* fast path: set whole word at once */
                word = bm + (cur >> 3);
                *word = 0xffffffff;
                cur += 32;
        }
}

/*
 * Adjust one border (*@bit) of a range being freed in buddy @bitmap.
 * @side is the direction of the outside neighbour: -1 for the left
 * border, +1 for the right one.
 *
 * If the neighbour bit is set, the border bit is cleared, the border
 * moves one step inwards and 1 is returned.  Otherwise the border moves
 * onto the neighbour, that bit is set and -1 is returned.  The caller adds
 * the return value to the counter of this order.
 */
static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
{
        int pos = *bit;

        if (!mb_test_bit(pos + side, bitmap)) {
                /* neighbour is clear: extend the range over it */
                pos += side;
                mb_set_bit(pos, bitmap);
                *bit = pos;
                return -1;
        }

        /* neighbour is set: leave the border chunk clear, shrink the range */
        mb_clear_bit(pos, bitmap);
        *bit = pos - side;
        return 1;
}

/*
 * Propagate a freed range through the buddy bitmaps of order >= 1.
 *
 * @first and @last are the inclusive bounds of the range expressed as
 * order-1 chunk indices (the caller passes the order-0 bounds shifted
 * right by one).  At each order the unaligned borders are fixed up with
 * mb_buddy_adjust_border(), then the range is halved and the next order
 * is handled, until the range collapses or no higher order exists.
 */
static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
{
        int max;
        int order = 1;
        void *buddy = mb_find_buddy(e4b, order, &max);

        while (buddy) {
                void *buddy2;

                /* Bits in range [first; last] are known to be set since
                 * corresponding blocks were allocated. Bits in range
                 * (first; last) will stay set because they form buddies on
                 * upper layer. We just deal with borders if they don't
                 * align with upper layer and then go up.
                 * Releasing entire group is all about clearing
                 * single bit of highest order buddy.
                 */

                /* Example:
                 * ---------------------------------
                 * |   1   |   1   |   1   |   1   |
                 * ---------------------------------
                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
                 * ---------------------------------
                 *   0   1   2   3   4   5   6   7
                 *      \_____________________/
                 *
                 * Neither [1] nor [6] is aligned to above layer.
                 * Left neighbour [0] is free, so mark it busy,
                 * decrease bb_counters and extend range to
                 * [0; 6]
                 * Right neighbour [7] is busy. It can't be coalesced with [6], so
                 * mark [6] free, increase bb_counters and shrink range to
                 * [0; 5].
                 * Then shift range to [0; 2], go up and do the same.
                 */


                /* an odd left border / even right border is not aligned to the upper layer */
                if (first & 1)
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
                if (!(last & 1))
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
                /* the border fix-ups consumed the whole range */
                if (first > last)
                        break;
                order++;

                /*
                 * Single chunk left, or no higher order to go to: clear the
                 * remaining bits at the current level (order - 1 after the
                 * increment above) and account them there.
                 */
                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
                        mb_clear_bits(buddy, first, last - first + 1);
                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
                        break;
                }
                first >>= 1;
                last >>= 1;
                buddy = buddy2;
        }
}

/*
 * Mark @count blocks starting at group-relative bit @first as free in the
 * in-core bitmap and buddy data of @e4b.
 *
 * The group lock must be held (asserted below).  Nothing is done when
 * @count is 0 (with a warning) or when the group's block bitmap is already
 * marked corrupt.  If any bit in the range was already clear, the group is
 * either regenerated from the bitmap (during fast-commit replay) or
 * reported and marked corrupt; in both cases the normal counter updates
 * are skipped.
 *
 * @inode is only used for the inode number in the error report and may be
 * NULL.
 */
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                           int first, int count)
{
        int left_is_free = 0;
        int right_is_free = 0;
        int block;
        int last = first + count - 1;
        struct super_block *sb = e4b->bd_sb;

        if (WARN_ON(count == 0))
                return;
        BUG_ON(last >= (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        /* Don't bother if the block group is corrupt. */
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                return;

        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);

        /* access memory sequentially: check left neighbour,
         * clear range and then check right neighbour
         */
        if (first != 0)
                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
        /* block is the first already-free bit in the range, or -1 */
        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);

        if (unlikely(block != -1)) {
                struct ext4_sb_info *sbi = EXT4_SB(sb);
                ext4_fsblk_t blocknr;

                /*
                 * Fastcommit replay can free already freed blocks which
                 * corrupts allocation info. Regenerate it.
                 */
                if (sbi->s_mount_state & EXT4_FC_REPLAY) {
                        mb_regenerate_buddy(e4b);
                        goto check;
                }

                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                blocknr += EXT4_C2B(sbi, block);
                ext4_grp_locked_error(sb, e4b->bd_group,
                                      inode ? inode->i_ino : 0, blocknr,
                                      "freeing already freed block (bit %u); block bitmap corrupt.",
                                      block);
                ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return;
        }

        this_cpu_inc(discard_pa_seq);
        e4b->bd_info->bb_free += count;
        if (first < e4b->bd_info->bb_first_free)
                e4b->bd_info->bb_first_free = first;

        /*
         * let's maintain fragments counter: freeing between two free
         * neighbours merges two fragments into one, freeing between two
         * busy neighbours creates a new fragment, otherwise an existing
         * fragment just grows.
         */
        if (left_is_free && right_is_free)
                e4b->bd_info->bb_fragments--;
        else if (!left_is_free && !right_is_free)
                e4b->bd_info->bb_fragments++;

        /* buddy[0] == bd_bitmap is a special case, so handle
         * it right away and let mb_buddy_mark_free stay free of
         * zero order checks.
         * Check if neighbours are to be coalesced,
         * adjust bitmap bb_counters and borders appropriately.
         */
        if (first & 1) {
                first += !left_is_free;
                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
        }
        if (!(last & 1)) {
                last -= !right_is_free;
                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
        }

        /* hand the remaining range to the higher orders as order-1 indices */
        if (first <= last)
                mb_buddy_mark_free(e4b, first >> 1, last >> 1);

        mb_set_largest_free_order(sb, e4b->bd_info);
check:
        mb_check_buddy(e4b);
}

/*
 * Describe in @ex the free extent that starts at group-relative bit
 * @block, extending it over following free buddy chunks while it is
 * shorter than @needed.
 *
 * The group lock must be held (asserted below).  Returns the length of the
 * extent found, which is also stored in ex->fe_len.  When @block itself is
 * in use, or when the computed extent would run past the end of the
 * group, @ex is zeroed (fe_len, fe_start, fe_group) and 0 is returned.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int block,
                                int needed, struct ext4_free_extent *ex)
{
        int next = block;
        int max, order;
        void *buddy;

        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        BUG_ON(ex == NULL);

        buddy = mb_find_buddy(e4b, 0, &max);
        BUG_ON(buddy == NULL);
        BUG_ON(block >= max);
        if (mb_test_bit(block, buddy)) {
                /* the starting block is in use: no extent here */
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
                return 0;
        }

        /* find actual order */
        order = mb_find_order_for_block(e4b, block);
        block = block >> order;

        /* start with the whole buddy chunk containing the block */
        ex->fe_len = 1 << order;
        ex->fe_start = block << order;
        ex->fe_group = e4b->bd_group;

        /* calc difference from given start */
        next = next - ex->fe_start;
        ex->fe_len -= next;
        ex->fe_start += next;

        /* append following free chunks until the request is satisfied */
        while (needed > ex->fe_len &&
               mb_find_buddy(e4b, order, &max)) {

                if (block + 1 >= max)
                        break;

                /* first order-0 bit of the chunk right after the current one */
                next = (block + 1) * (1 << order);
                if (mb_test_bit(next, e4b->bd_bitmap))
                        break;

                order = mb_find_order_for_block(e4b, next);

                block = next >> order;
                ex->fe_len += 1 << order;
        }

        if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
                /* Should never happen! (but apparently sometimes does?!?) */
                WARN_ON(1);
                ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
                        "corruption or bug in mb_find_extent "
                        "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
                        block, order, needed, ex->fe_group, ex->fe_start,
                        ex->fe_len, ex->fe_logical);
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
        }
        return ex->fe_len;
}

/*
 * Mark the extent described by @ex as allocated in the buddy data and
 * block bitmap of @e4b, updating the group's free, first-free, fragment
 * and per-order counters along the way.
 *
 * The group lock must be held (asserted below).
 *
 * Returns 0 if the extent was covered entirely by whole buddies.
 * Otherwise returns the length still to be marked at the moment of the
 * first buddy split, OR-ed with the order of that buddy shifted left
 * by 16.
 */
static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{
        int ord;
        int mlen = 0;
        int max = 0;
        int cur;
        int start = ex->fe_start;
        int len = ex->fe_len;
        unsigned ret = 0;
        int len0 = len; /* original length; len is consumed by the loop */
        void *buddy;

        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
        BUG_ON(e4b->bd_group != ex->fe_group);
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_mark_used_double(e4b, start, len);

        this_cpu_inc(discard_pa_seq);
        e4b->bd_info->bb_free -= len;
        /* extent starts at the first-free hint: move the hint past it */
        if (e4b->bd_info->bb_first_free == start)
                e4b->bd_info->bb_first_free += len;

        /*
         * let's maintain fragments counter
         *
         * mlen and max double as flags here: whether the bit just before
         * and just after the extent is clear in the bitmap.  Both clear:
         * the extent is carved out of the middle of a clear run and leaves
         * two pieces, so there is one more fragment.  Neither clear: the
         * extent was a complete run by itself, so there is one fewer.
         */
        if (start != 0)
                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
                max = !mb_test_bit(start + len, e4b->bd_bitmap);
        if (mlen && max)
                e4b->bd_info->bb_fragments++;
        else if (!mlen && !max)
                e4b->bd_info->bb_fragments--;

        /* let's maintain buddy itself */
        while (len) {
                /* NOTE(review): presumably the order of the buddy covering start */
                ord = mb_find_order_for_block(e4b, start);

                /* start is aligned to 2^ord and the rest covers that buddy */
                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
                        /* the whole chunk may be allocated at once! */
                        mlen = 1 << ord;
                        buddy = mb_find_buddy(e4b, ord, &max);
                        BUG_ON((start >> ord) >= max);
                        mb_set_bit(start >> ord, buddy);
                        e4b->bd_info->bb_counters[ord]--;
                        start += mlen;
                        len -= mlen;
                        BUG_ON(len < 0);
                        continue;
                }

                /* store for history: only the first split is recorded */
                if (ret == 0)
                        ret = len | (ord << 16);

                /*
                 * we have to split large buddy: set its bit at this order,
                 * then clear both halves one order down so the next pass
                 * of the loop can take or split them in turn
                 */
                BUG_ON(ord <= 0);
                buddy = mb_find_buddy(e4b, ord, &max);
                mb_set_bit(start >> ord, buddy);
                e4b->bd_info->bb_counters[ord]--;

                ord--;
                /* index of the even member of the pair at the lower order */
                cur = (start >> ord) & ~1U;
                buddy = mb_find_buddy(e4b, ord, &max);
                mb_clear_bit(cur, buddy);
                mb_clear_bit(cur + 1, buddy);
                e4b->bd_info->bb_counters[ord]++;
                e4b->bd_info->bb_counters[ord]++;
        }
        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

        /* finally set the extent's bits in the block bitmap itself */
        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
        mb_check_buddy(e4b);

        return ret;
}

/*
 * Must be called under group lock!
 */
static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int ret;

        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
        BUG_ON(ac->ac_status == AC_STATUS_FOUND);

        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
        ret = mb_mark_used(e4b, &ac->ac_b_ex);

        /* preallocation can change ac_b_ex, thus we store actually
         * allocated blocks for history */
        ac->ac_f_ex = ac->ac_b_ex;

        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_tail = ret & 0xffff;
        ac->ac_buddy = ret >> 16;

        /*
         * take the page reference. We want the page to be pinned
         * so that we don't get a ext4_mb_init_cache_call for this
         * group until we update the bitmap. That would mean we
         * double allocate blocks. The reference is dropped
         * in ext4_mb_release_context
         */
        ac->ac_bitmap_page = e4b->bd_bitmap_page;
        get_page(ac->ac_bitmap_page);
        ac->ac_buddy_page = e4b->bd_buddy_page;
        get_page(ac->ac_buddy_page);
        /* store last allocated for subsequent stream allocation */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                spin_lock(&sbi->s_md_lock);
                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
                spin_unlock(&sbi->s_md_lock);
        }
        /*
         * As we've just preallocated more space than
         * user requested originally, we store allocated
         * space in a special descriptor.
         */
        if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
                ext4_mb_new_preallocation(ac);

}

static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b,
                                        int finish_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_free_extent *bex = &ac->ac_b_ex;
        struct ext4_free_extent *gex = &ac->ac_g_ex;
        struct ext4_free_extent ex;
        int max;

        if (ac->ac_status == AC_STATUS_FOUND)
                return;
        /*
         * We don't want to scan for a whole year
         */
        if (ac->ac_found > sbi->s_mb_max_to_scan &&
                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
                ac->ac_status = AC_STATUS_BREAK;
                return;
        }

        /*
         * Haven't found good chunk so far, let's continue
         */
        if (bex->fe_len < gex->fe_len)
                return;

        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
                        && bex->fe_group == e4b->bd_group) {
                /* recheck chunk's availability - we don't know
                 * when it was found (within this lock-unlock
                 * period or not) */
                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
                if (max >= gex->fe_len) {
                        ext4_mb_use_best_found(ac, e4b);
                        return;
                }
        }
}

/*
 * Evaluate a newly found free extent @ex against the request in @ac.
 *
 * If the extent is good enough it is marked used right away and the
 * context status stops the scan.  Otherwise it is compared with the best
 * extent remembered so far and replaces it when it is the better choice;
 * the remembered extent is what gets used later if nothing good enough
 * turns up.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
                                        struct ext4_free_extent *ex,
                                        struct ext4_buddy *e4b)
{
        struct ext4_free_extent *best = &ac->ac_b_ex;
        struct ext4_free_extent *goal = &ac->ac_g_ex;
        int better;

        BUG_ON(ex->fe_len <= 0);
        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

        ac->ac_found++;

        /*
         * Take the extent immediately in two cases: the caller asked for
         * whatever comes first, or the chunk has exactly the goal length.
         */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
            ex->fe_len == goal->fe_len) {
                *best = *ex;
                ext4_mb_use_best_found(ac, e4b);
                return;
        }

        /* Nothing remembered yet: the first extent seen becomes the best */
        if (best->fe_len == 0) {
                *best = *ex;
                return;
        }

        if (best->fe_len < goal->fe_len) {
                /* request not satisfied yet: anything larger is better */
                better = ex->fe_len > best->fe_len;
        } else {
                /*
                 * request already satisfied: prefer an extent that still
                 * satisfies it but is smaller than the current best
                 */
                better = ex->fe_len > goal->fe_len &&
                         ex->fe_len < best->fe_len;
        }
        if (better)
                *best = *ex;

        ext4_mb_check_limits(ac, e4b, 0);
}

static noinline_for_stack
int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct ext4_free_extent ex = ac->ac_b_ex;
        ext4_group_t group = ex.fe_group;
        int max;
        int err;

        BUG_ON(ex.fe_len <= 0);
        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
        if (err)
                return err;

        ext4_lock_group(ac->ac_sb, group);
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                goto out;

        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);

        if (max > 0) {
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        }

out:
        ext4_unlock_group(ac->ac_sb, group);
        ext4_mb_unload_buddy(e4b);

        return 0;
}

/*
 * Try to allocate at the goal position recorded in ac->ac_g_ex.
 *
 * Nothing is attempted unless EXT4_MB_HINT_TRY_GOAL or
 * EXT4_MB_HINT_GOAL_ONLY is set and the goal group has free clusters.
 * The extent at the goal is taken when it is at least as long as the
 * goal length (for stripe-sized requests only if it is also
 * stripe-aligned), or, with EXT4_MB_HINT_MERGE, when it has any length
 * at all.
 *
 * Returns -EFSCORRUPTED if the goal group has no group info, the error
 * from ext4_mb_load_buddy() if loading fails (except as noted below),
 * and 0 otherwise; success of the allocation itself is reported through
 * ac->ac_status.
 */
static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                                struct ext4_buddy *e4b)
{
        ext4_group_t group = ac->ac_g_ex.fe_group;
        int max;
        int err;
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        struct ext4_free_extent ex;

        if (!grp)
                return -EFSCORRUPTED;
        if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
                return 0;
        if (grp->bb_free == 0)
                return 0;

        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
        if (err) {
                /*
                 * Test the group info we already hold rather than
                 * e4b->bd_info: the load failed, so *e4b (caller-provided
                 * storage) may not have been filled in.  A corrupt goal
                 * group is not fatal unless the caller insisted on the
                 * goal; returning 0 lets it fall back to other groups.
                 */
                if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp) &&
                    !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                        return 0;
                return err;
        }

        ext4_lock_group(ac->ac_sb, group);
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                goto out;

        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
                             ac->ac_g_ex.fe_len, &ex);
        ex.fe_logical = 0xDEADFA11; /* debug value */

        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
                ext4_fsblk_t start;

                /* stripe-sized request: accept only a stripe-aligned start */
                start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
                /* use do_div to get remainder (would be 64-bit modulo) */
                if (do_div(start, sbi->s_stripe) == 0) {
                        ac->ac_found++;
                        ac->ac_b_ex = ex;
                        ext4_mb_use_best_found(ac, e4b);
                }
        } else if (max >= ac->ac_g_ex.fe_len) {
                /* the goal extent is fully available */
                BUG_ON(ex.fe_len <= 0);
                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
                ac->ac_found++;
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
                /* Sometimes, caller may want to merge even small
                 * number of blocks to an existing extent */
                BUG_ON(ex.fe_len <= 0);
                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
                ac->ac_found++;
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        }
out:
        ext4_unlock_group(ac->ac_sb, group);
        ext4_mb_unload_buddy(e4b);

        return 0;
}

/*
 * Scan the buddy structures (not the bitmap!) of the group in @e4b,
 * starting at order ac->ac_2order and going up, and allocate the first
 * free chunk of the lowest order that has one.
 */
static noinline_for_stack
void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_group_info *grp = e4b->bd_info;
        void *buddy;
        int order;
        int bit;
        int nbits;

        BUG_ON(ac->ac_2order <= 0);

        /* lowest order, from ac_2order up, whose counter is non-zero */
        order = ac->ac_2order;
        while (order <= sb->s_blocksize_bits + 1 &&
               grp->bb_counters[order] == 0)
                order++;
        if (order > sb->s_blocksize_bits + 1)
                return;

        buddy = mb_find_buddy(e4b, order, &nbits);
        BUG_ON(buddy == NULL);

        bit = mb_find_next_zero_bit(buddy, nbits, 0);
        if (bit >= nbits) {
                /* the counter promised a chunk the buddy map doesn't have */
                ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
                        "%d free clusters of order %d. But found 0",
                        grp->bb_counters[order], order);
                ext4_mark_group_bitmap_corrupted(ac->ac_sb,
                                 e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return;
        }
        ac->ac_found++;

        ac->ac_b_ex.fe_len = 1 << order;
        ac->ac_b_ex.fe_start = bit << order;
        ac->ac_b_ex.fe_group = e4b->bd_group;

        ext4_mb_use_best_found(ac, e4b);

        BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);

        if (EXT4_SB(sb)->s_mb_stats)
                atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
}

/*
 * Walk the block bitmap of the group in @e4b and feed every free extent
 * found to ext4_mb_measure_extent().  The group's free-cluster count is
 * used as the upper limit on how much can be found, which lets the walk
 * stop early and also exposes a bitmap that disagrees with the count.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        void *bitmap = e4b->bd_bitmap;
        struct ext4_free_extent ex;
        int pos;
        int remaining;

        remaining = e4b->bd_info->bb_free;
        if (WARN_ON(remaining <= 0))
                return;

        for (pos = e4b->bd_info->bb_first_free;
             remaining && ac->ac_status == AC_STATUS_CONTINUE;
             pos += ex.fe_len, remaining -= ex.fe_len) {
                pos = mb_find_next_zero_bit(bitmap,
                                            EXT4_CLUSTERS_PER_GROUP(sb), pos);
                if (pos >= EXT4_CLUSTERS_PER_GROUP(sb)) {
                        /*
                         * Group info still counts free clusters but the
                         * bitmap has none left: the bitmap is corrupt.
                         */
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                        "%d free clusters as per "
                                        "group info. But bitmap says 0",
                                        remaining);
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        break;
                }

                mb_find_extent(e4b, pos, ac->ac_g_ex.fe_len, &ex);
                if (WARN_ON(ex.fe_len <= 0))
                        break;
                if (remaining < ex.fe_len) {
                        /*
                         * Found more than the group info says is free.
                         * That mostly means a corrupt bitmap, so leave
                         * without claiming the space.
                         */
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                        "%d free clusters as per "
                                        "group info. But got %d blocks",
                                        remaining, ex.fe_len);
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        break;
                }
                ex.fe_logical = 0xDEADC0DE; /* debug value */
                ext4_mb_measure_extent(ac, &ex, e4b);
        }

        /* end of group reached: let the limits check settle on the best */
        ext4_mb_check_limits(ac, e4b, 1);
}

/*
 * Special case for storage such as raid5: for requests that are a
 * multiple of the stripe size, look only at stripe-aligned positions in
 * the group and take the first one with a full stripe available.
 */
static noinline_for_stack
void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                                 struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        void *bitmap = e4b->bd_bitmap;
        struct ext4_free_extent ex;
        ext4_fsblk_t group_base;
        ext4_fsblk_t aligned;
        ext4_grpblk_t offs;
        int avail;

        BUG_ON(sbi->s_stripe == 0);

        /* round the group's first block up to a stripe boundary ... */
        group_base = ext4_group_first_block_no(sb, e4b->bd_group);
        aligned = group_base + sbi->s_stripe - 1;
        do_div(aligned, sbi->s_stripe);
        /* ... and turn that back into an offset inside the group */
        offs = (aligned * sbi->s_stripe) - group_base;

        for (; offs < EXT4_CLUSTERS_PER_GROUP(sb); offs += sbi->s_stripe) {
                if (mb_test_bit(offs, bitmap))
                        continue;

                avail = mb_find_extent(e4b, offs, sbi->s_stripe, &ex);
                if (avail < sbi->s_stripe)
                        continue;

                ac->ac_found++;
                ex.fe_logical = 0xDEADF00D; /* debug value */
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
                break;
        }
}

/*
 * Tell whether @group is worth scanning for the request in @ac under
 * criterion @cr (0..3).  This is also called BEFORE we load the buddy
 * bitmap.  Returns true if the group is suitable, false otherwise.
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                                ext4_group_t group, int cr)
{
        ext4_grpblk_t free, fragments;
        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

        BUG_ON(cr < 0 || cr >= 4);

        if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return false;

        /* a group with nothing free, or no free fragments, is useless */
        free = grp->bb_free;
        if (free == 0)
                return false;

        fragments = grp->bb_fragments;
        if (fragments == 0)
                return false;

        switch (cr) {
        case 0:
                BUG_ON(ac->ac_2order == 0);

                /* Avoid using the first bg of a flexgroup for data files */
                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
                    ((group % flex_size) == 0))
                        return false;

                if (free < ac->ac_g_ex.fe_len)
                        return false;

                /*
                 * Orders above s_blocksize_bits + 1 are accepted as is;
                 * otherwise the group's largest free order must reach
                 * the requested one.
                 */
                return ac->ac_2order > ac->ac_sb->s_blocksize_bits+1 ||
                       grp->bb_largest_free_order >= ac->ac_2order;
        case 1:
                /* average free fragment is at least the goal length */
                return (free / fragments) >= ac->ac_g_ex.fe_len;
        case 2:
                /* total free space is at least the goal length */
                return free >= ac->ac_g_ex.fe_len;
        case 3:
                return true;
        default:
                BUG();
        }

        return false;
}

/*
 * Variant of ext4_mb_good_group() for callers that do not hold
 * ext4_lock_group(); it must not be called with that lock held.  The
 * group lock is taken here only when EXT4_MB_STRICT_CHECK is set.
 *
 * Returns the ext4_mb_good_group() verdict, 0 for groups rejected by the
 * cheap checks, or a negative error code, e.g. when something goes wrong
 * during ext4_mb_init_group().
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
                                     ext4_group_t group, int cr)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
        ext4_grpblk_t free;
        int ret = 0;

        if (!grp)
                return -EFSCORRUPTED;
        if (sbi->s_mb_stats)
                atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);

        if (should_lock)
                ext4_lock_group(sb, group);
        /* cheap rejections; the lock, if taken, is dropped at out: */
        free = grp->bb_free;
        if (free == 0 ||
            (cr <= 2 && free < ac->ac_g_ex.fe_len) ||
            unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                goto out;
        if (should_lock)
                ext4_unlock_group(sb, group);

        /* We only do this if the grp has never been initialized */
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                struct ext4_group_desc *gdp =
                        ext4_get_group_desc(sb, group, NULL);
                int init_err;

                /*
                 * cr=0/1 is a very optimistic search meant to find
                 * large good chunks almost for free, which is pointless
                 * when the buddy data is not ready.  The first block
                 * group of a flex_bg is never skipped though: it is
                 * used for metadata block allocation and we want
                 * metadata blocks placed there whenever possible.
                 */
                if (cr < 2 &&
                    (!sbi->s_log_groups_per_flex ||
                     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
                    !(ext4_has_group_desc_csum(sb) &&
                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
                        return 0;

                init_err = ext4_mb_init_group(sb, group, GFP_NOFS);
                if (init_err)
                        return init_err;
        }

        if (should_lock)
                ext4_lock_group(sb, group);
        ret = ext4_mb_good_group(ac, group, cr);
out:
        if (should_lock)
                ext4_unlock_group(sb, group);
        return ret;
}

/*
 * Start prefetching @nr block bitmaps starting at @group, wrapping to
 * group 0 after the last group.  When @cnt is non-NULL it is incremented
 * for each bitmap buffer returned that is not yet uptodate.
 * Return the next group which needs to be prefetched.
 */
ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
                              unsigned int nr, int *cnt)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct blk_plug plug;

        blk_start_plug(&plug);
        for (; nr > 0; nr--) {
                struct ext4_group_desc *gdp;
                struct ext4_group_info *grp;
                struct buffer_head *bh;

                gdp = ext4_get_group_desc(sb, group, NULL);
                grp = ext4_get_group_info(sb, group);

                /*
                 * Only groups with free blocks are worth prefetching,
                 * and one marked uninitialized on disk needs no I/O to
                 * read.  The test-and-set makes sure each group is
                 * tried only once, which avoids repeating the
                 * potentially expensive getblk() call.
                 */
                if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
                    EXT4_MB_GRP_NEED_INIT(grp) &&
                    ext4_free_group_clusters(sb, gdp) > 0 &&
                    !(ext4_has_group_desc_csum(sb) &&
                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
                        bh = ext4_read_block_bitmap_nowait(sb, group, true);
                        if (bh && !IS_ERR(bh)) {
                                if (!buffer_uptodate(bh) && cnt)
                                        (*cnt)++;
                                brelse(bh);
                        }
                }

                group++;
                if (group >= ngroups)
                        group = 0;
        }
        blk_finish_plug(&plug);

        return group;
}

/*
 * Prefetching reads the block bitmap into the buffer cache; but we
 * need to make sure that the buddy bitmap in the page cache has been
 * initialized.  Note that ext4_mb_init_group() will block if the I/O
 * is not yet completed, or indeed if ext4_mb_prefetch() did not start
 * the I/O at all.
 *
 * Walks backwards over the @nr groups preceding @group (wrapping from
 * group 0 to the last group) and initializes each one that still needs
 * it, has free clusters and is not marked BLOCK_UNINIT.  Stops at the
 * first ext4_mb_init_group() failure.
 *
 * TODO: We should actually kick off the buddy bitmap setup in a work
 * queue when the buffer I/O is completed, so that we don't block
 * waiting for the block allocation bitmap read to finish when
 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
 */
void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                           unsigned int nr)
{
        struct ext4_group_desc *gdp;
        struct ext4_group_info *grp;

        while (nr-- > 0) {
                if (!group)
                        group = ext4_get_groups_count(sb);
                group--;

                /*
                 * Look up the descriptor and the group info only after
                 * stepping back, so that both describe the same group.
                 */
                gdp = ext4_get_group_desc(sb, group, NULL);
                grp = ext4_get_group_info(sb, group);

                if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
                    ext4_free_group_clusters(sb, gdp) > 0 &&
                    !(ext4_has_group_desc_csum(sb) &&
                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
                        if (ext4_mb_init_group(sb, group, GFP_NOFS))
                                break;
                }
        }
}

/*
 * Main group-scanning allocator for the request in @ac.
 *
 * The goal extent is tried first via ext4_mb_find_by_goal().  If that
 * does not settle the request, the block groups are walked starting at
 * the goal group, once per criterion cr (starting at 0 when ac_2order is
 * set, otherwise at 1, and going up to 3), and the scan routine matching
 * the criterion is run on every group the ext4_mb_good_group*() checks
 * accept.  If the walk ends without AC_STATUS_FOUND but a best extent was
 * remembered, that extent is retried; if it is gone, the walk is repeated
 * at cr == 3 with EXT4_MB_HINT_FIRST set.
 *
 * Returns 0 or an error code; the outcome of the search itself is
 * reported through ac->ac_status and ac->ac_b_ex.
 */
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
        ext4_group_t prefetch_grp = 0, ngroups, group, i;
        int cr = -1;
        int err = 0, first_err = 0;
        unsigned int nr = 0, prefetch_ios = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
        int lost;

        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
        ngroups = ext4_get_groups_count(sb);
        /* non-extent files are limited to low blocks/groups */
        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
                ngroups = sbi->s_blockfile_groups;

        BUG_ON(ac->ac_status == AC_STATUS_FOUND);

        /* first, try the goal */
        err = ext4_mb_find_by_goal(ac, &e4b);
        if (err || ac->ac_status == AC_STATUS_FOUND)
                goto out;

        /* the caller accepts nothing but the goal: do not scan further */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                goto out;

        /*
         * ac->ac_2order is set only if the fe_len is a power of 2
         * if ac->ac_2order is set we also set criteria to 0 so that we
         * try exact allocation using buddy.
         */
        i = fls(ac->ac_g_ex.fe_len);
        ac->ac_2order = 0;
        /*
         * We search using buddy data only if the order of the request
         * is greater than or equal to sbi->s_mb_order2_reqs.
         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
         * We also support searching for power-of-two requests only for
         * requests up to the maximum buddy size we have constructed.
         */
        if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
                /*
                 * This should tell if fe_len is exactly power of 2:
                 * clearing the highest set bit must leave nothing behind
                 */
                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
                        ac->ac_2order = array_index_nospec(i - 1,
                                                           sb->s_blocksize_bits + 2);
        }

        /* if stream allocation is enabled, use global goal */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                /* TBD: may be hot point */
                spin_lock(&sbi->s_md_lock);
                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
                spin_unlock(&sbi->s_md_lock);
        }

        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
        /*
         * cr == 0 try to get exact allocation,
         * cr == 3  try to get anything
         */
repeat:
        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
                ac->ac_criteria = cr;
                /*
                 * searching for the right group start
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
                prefetch_grp = group;

                for (i = 0; i < ngroups; group++, i++) {
                        int ret = 0;
                        cond_resched();
                        /*
                         * Artificially restricted ngroups for non-extent
                         * files makes group > ngroups possible on first loop.
                         */
                        if (group >= ngroups)
                                group = 0;

                        /*
                         * Batch reads of the block allocation bitmaps
                         * to get multiple READs in flight; limit
                         * prefetching at cr=0/1, otherwise mballoc can
                         * spend a lot of time loading imperfect groups
                         */
                        if ((prefetch_grp == group) &&
                            (cr > 1 ||
                             prefetch_ios < sbi->s_mb_prefetch_limit)) {
                                unsigned int curr_ios = prefetch_ios;

                                nr = sbi->s_mb_prefetch;
                                /* with flex_bg, stop at the flex group end */
                                if (ext4_has_feature_flex_bg(sb)) {
                                        nr = 1 << sbi->s_log_groups_per_flex;
                                        nr -= group & (nr - 1);
                                        nr = min(nr, sbi->s_mb_prefetch);
                                }
                                prefetch_grp = ext4_mb_prefetch(sb, group,
                                                        nr, &prefetch_ios);
                                /*
                                 * no new I/O was counted, so there is
                                 * nothing for ext4_mb_prefetch_fini() to do
                                 */
                                if (prefetch_ios == curr_ios)
                                        nr = 0;
                        }

                        /* This now checks without needing the buddy page */
                        ret = ext4_mb_good_group_nolock(ac, group, cr);
                        if (ret <= 0) {
                                /* remember only the first non-zero result */
                                if (!first_err)
                                        first_err = ret;
                                continue;
                        }

                        err = ext4_mb_load_buddy(sb, group, &e4b);
                        if (err)
                                goto out;

                        ext4_lock_group(sb, group);

                        /*
                         * We need to check again after locking the
                         * block group
                         */
                        ret = ext4_mb_good_group(ac, group, cr);
                        if (ret == 0) {
                                ext4_unlock_group(sb, group);
                                ext4_mb_unload_buddy(&e4b);
                                continue;
                        }

                        ac->ac_groups_scanned++;
                        /* pick the scan routine matching the criterion */
                        if (cr == 0)
                                ext4_mb_simple_scan_group(ac, &e4b);
                        else if (cr == 1 && sbi->s_stripe &&
                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
                                ext4_mb_scan_aligned(ac, &e4b);
                        else
                                ext4_mb_complex_scan_group(ac, &e4b);

                        ext4_unlock_group(sb, group);
                        ext4_mb_unload_buddy(&e4b);

                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
                }
                /* Processed all groups and haven't found blocks */
                if (sbi->s_mb_stats && i == ngroups)
                        atomic64_inc(&sbi->s_bal_cX_failed[cr]);
        }

        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
                /*
                 * We've been searching too long. Let's try to allocate
                 * the best chunk we've found so far
                 */
                ext4_mb_try_best_found(ac, &e4b);
                if (ac->ac_status != AC_STATUS_FOUND) {
                        /*
                         * Someone more lucky has already allocated it.
                         * The only thing we can do is just take first
                         * found block(s)
                         */
                        lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
                        mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
                                 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
                                 ac->ac_b_ex.fe_len, lost);

                        /* forget the lost extent and rescan at cr == 3 */
                        ac->ac_b_ex.fe_group = 0;
                        ac->ac_b_ex.fe_start = 0;
                        ac->ac_b_ex.fe_len = 0;
                        ac->ac_status = AC_STATUS_CONTINUE;
                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
                        cr = 3;
                        goto repeat;
                }
        }

        if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
                atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
        /* nothing allocated and no hard error: report the first one seen */
        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
                err = first_err;

        mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
                 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
                 ac->ac_flags, cr, err);

        /* finish initializing the groups of the last prefetch batch */
        if (nr)
                ext4_mb_prefetch_fini(sb, prefetch_grp, nr);

        return err;
}

/*
 * seq_file ->start callback for the per-group listing.
 *
 * Positions outside [0, number of groups) yield NULL.  Otherwise the
 * returned iterator token is the group number plus one, so that a valid
 * group (including group 0) is never represented by a NULL pointer.
 */
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
        struct super_block *sb = PDE_DATA(file_inode(seq->file));
        loff_t index = *pos;
        ext4_group_t token;

        if (index < 0 || index >= ext4_get_groups_count(sb))
                return NULL;

        token = index + 1;
        return (void *) ((unsigned long) token);
}

/*
 * seq_file ->next callback for the per-group listing.
 *
 * Advances *pos by one and returns the token for the new position
 * (group number plus one), or NULL once the position is out of range.
 * The previous token @v is not used.
 */
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct super_block *sb = PDE_DATA(file_inode(seq->file));
        loff_t index = ++*pos;
        ext4_group_t token;

        if (index < 0 || index >= ext4_get_groups_count(sb))
                return NULL;

        token = index + 1;
        return (void *) ((unsigned long) token);
}

static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
        struct super_block *sb = PDE_DATA(file_inode(seq->file));
        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
        int i;
        int err, buddy_loaded = 0;
        struct ext4_buddy e4b;
        struct ext4_group_info *grinfo;
        unsigned char blocksize_bits = min_t(unsigned char,
                                             sb->s_blocksize_bits,
                                             EXT4_MAX_BLOCK_LOG_SIZE);
        struct sg {
                struct ext4_group_info info;
                ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
        } sg;

        group--;
        if (group == 0)
                seq_puts(seq, "#group: free  frags first ["
                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
                              " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");

        i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
                sizeof(struct ext4_group_info);

        grinfo = ext4_get_group_info(sb, group);
        if (!grinfo)
                return 0;
        /* Load the group info in memory only if not already loaded. */
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
                        seq_printf(seq, "#%-5u: I/O error\n", group);
                        return 0;
                }
                buddy_loaded = 1;
        }

        memcpy(&sg, grinfo, i);

        if (buddy_loaded)
                ext4_mb_unload_buddy(&e4b);

        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
                        sg.info.bb_fragments, sg.info.bb_first_free);
        for (i = 0; i <= 13; i++)
                seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
                                sg.info.bb_counters[i] : 0);
        seq_puts(seq, " ]");
        if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
                seq_puts(seq, " Block bitmap corrupted!");
        seq_puts(seq, "\n");

        return 0;
}

/*
 * seq_file ->stop callback for the per-group listing.  Intentionally
 * empty: the start/next callbacks above only hand out integer tokens and
 * acquire nothing that would need to be released here.
 */
static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}

/* seq_file operations used to iterate over and print per-group mballoc data. */
const struct seq_operations ext4_mb_seq_groups_ops = {
        .start  = ext4_mb_seq_groups_start,
        .stop   = ext4_mb_seq_groups_stop,
        .next   = ext4_mb_seq_groups_next,
        .show   = ext4_mb_seq_groups_show,
};

/*
 * Print the mballoc statistics counters of the superblock stored in
 * seq->private.  When statistics collection is off (s_mb_stats == 0),
 * only a short hint on how to enable it is printed.  Always returns 0.
 */
int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
{
        /* Section headers, one per allocation criterion (cr 0..3). */
        static const char * const cr_titles[] = {
                "\tcr0_stats:\n",
                "\tcr1_stats:\n",
                "\tcr2_stats:\n",
                "\tcr3_stats:\n",
        };
        struct super_block *sb = (struct super_block *)seq->private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int cr;

        seq_puts(seq, "mballoc:\n");
        if (!sbi->s_mb_stats) {
                seq_puts(seq, "\tmb stats collection turned off.\n");
                seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
                return 0;
        }

        seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
        seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
        seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));

        /* The same three counters are reported for every criterion. */
        for (cr = 0; cr < sizeof(cr_titles) / sizeof(cr_titles[0]); cr++) {
                seq_puts(seq, cr_titles[cr]);
                seq_printf(seq, "\t\thits: %llu\n",
                           atomic64_read(&sbi->s_bal_cX_hits[cr]));
                seq_printf(seq, "\t\tgroups_considered: %llu\n",
                           atomic64_read(&sbi->s_bal_cX_groups_considered[cr]));
                seq_printf(seq, "\t\tuseless_loops: %llu\n",
                           atomic64_read(&sbi->s_bal_cX_failed[cr]));
        }

        seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
        seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
        seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
        seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
        seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));

        seq_printf(seq, "\tbuddies_generated: %u/%u\n",
                   atomic_read(&sbi->s_mb_buddies_generated),
                   ext4_get_groups_count(sb));
        seq_printf(seq, "\tbuddies_time_used: %llu\n",
                   atomic64_read(&sbi->s_mb_generation_time));
        seq_printf(seq, "\tpreallocated: %u\n",
                   atomic_read(&sbi->s_mb_preallocated));
        seq_printf(seq, "\tdiscarded: %u\n",
                   atomic_read(&sbi->s_mb_discarded));
        return 0;
}

static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];

        BUG_ON(!cachep);
        return cachep;
}

/*
 * Make sure the top-level s_group_info array has room for the tables
 * needed by @ngroups groups.
 *
 * Nothing is done if the current array is already large enough.
 * Otherwise a larger zeroed array is allocated, the existing entries are
 * copied into it, the new array is published with rcu_assign_pointer()
 * and the old one is handed to ext4_kvfree_array_rcu().
 *
 * Returns 0 on success or -ENOMEM if the new array cannot be allocated.
 */
int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info ***old_tables, ***new_tables;
        unsigned nr_tables;
        unsigned bytes;

        /* One table of group-info pointers per block of descriptors. */
        nr_tables = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
                EXT4_DESC_PER_BLOCK_BITS(sb);
        if (nr_tables <= sbi->s_group_info_size)
                return 0;

        bytes = roundup_pow_of_two(sizeof(*sbi->s_group_info) * nr_tables);
        new_tables = kvzalloc(bytes, GFP_KERNEL);
        if (new_tables == NULL) {
                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
                return -ENOMEM;
        }

        rcu_read_lock();
        old_tables = rcu_dereference(sbi->s_group_info);
        if (old_tables != NULL)
                memcpy(new_tables, old_tables,
                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
        rcu_read_unlock();

        rcu_assign_pointer(sbi->s_group_info, new_tables);
        /* Record the capacity actually allocated, not just what was asked. */
        sbi->s_group_info_size = bytes / sizeof(*sbi->s_group_info);
        if (old_tables != NULL)
                ext4_kvfree_array_rcu(old_tables);
        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
                   sbi->s_group_info_size);
        return 0;
}

/*
 * Create and initialize the ext4_group_info for @group.
 *
 * When @group is the first group covered by a descriptor block, the
 * table of ext4_group_info pointers for that block is allocated first
 * and hooked into s_group_info.  The new group info is marked as needing
 * initialization and gets its free-cluster count from @desc.
 *
 * Returns 0 on success or -ENOMEM on allocation failure; a pointer table
 * allocated by this call is released again on failure.
 */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
{
        int slot;
        int metalen = 0;
        int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        int first_in_block = (group % EXT4_DESC_PER_BLOCK(sb) == 0);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
        struct ext4_group_info *grp;
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);

        /*
         * The first group of a descriptor block brings a fresh table of
         * pointers to ext4_group_info structures with it.
         */
        if (first_in_block) {
                metalen = sizeof(*meta_group_info) <<
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                meta_group_info = kmalloc(metalen, GFP_NOFS);
                if (!meta_group_info) {
                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
                                 "for a buddy group");
                        goto exit_meta_group_info;
                }
                rcu_read_lock();
                rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
                rcu_read_unlock();
        }

        meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
        slot = group & (EXT4_DESC_PER_BLOCK(sb) - 1);

        grp = kmem_cache_zalloc(cachep, GFP_NOFS);
        meta_group_info[slot] = grp;
        if (!grp) {
                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
                goto exit_group_info;
        }
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

        /*
         * Set bb_free right away so that empty groups can be skipped
         * without being initialized.
         */
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
                grp->bb_free = ext4_free_clusters_after_init(sb, group, desc);
        else
                grp->bb_free = ext4_free_group_clusters(sb, desc);

        INIT_LIST_HEAD(&grp->bb_prealloc_list);
        init_rwsem(&grp->alloc_sem);
        grp->bb_free_root = RB_ROOT;
        grp->bb_largest_free_order = -1;  /* uninit */

        mb_group_bb_bitmap_alloc(sb, grp, group);
        return 0;

exit_group_info:
        /* Undo the pointer-table allocation made earlier in this call. */
        if (first_in_block) {
                struct ext4_group_info ***group_info;

                rcu_read_lock();
                group_info = rcu_dereference(sbi->s_group_info);
                kfree(group_info[idx]);
                group_info[idx] = NULL;
                rcu_read_unlock();
        }
exit_meta_group_info:
        return -ENOMEM;
} /* ext4_mb_add_groupinfo */

/*
 * Set up the per-group allocator state for a superblock:
 *  - size the top-level s_group_info array for all groups,
 *  - create the in-memory buddy cache inode,
 *  - create an ext4_group_info for every block group,
 *  - derive the prefetch tunables (s_mb_prefetch, s_mb_prefetch_limit).
 *
 * Returns 0 on success.  A failure of ext4_mb_alloc_groupinfo() is
 * returned as is; every later failure tears down what was built here
 * and returns -ENOMEM.
 */
static int ext4_mb_init_backend(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err;
        struct ext4_group_desc *desc;
        struct ext4_group_info ***group_info;
        struct kmem_cache *cachep;

        err = ext4_mb_alloc_groupinfo(sb, ngroups);
        if (err)
                return err;

        sbi->s_buddy_cache = new_inode(sb);
        if (sbi->s_buddy_cache == NULL) {
                ext4_msg(sb, KERN_ERR, "can't get new inode");
                goto err_freesgi;
        }
        /* To avoid potentially colliding with a valid on-disk inode number,
         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
         * not in the inode hash, so it should never be found by iget(), but
         * this will avoid confusion if it ever shows up during debugging. */
        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                cond_resched();
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
                        goto err_freebuddy;
                }
                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
                        goto err_freebuddy;
        }

        if (ext4_has_feature_flex_bg(sb)) {
                /* A single flex group is supposed to be read by a single IO.
                 * s_mb_prefetch is an unsigned integer, so the shift count
                 * must stay below 32.
                 */
                if (sbi->s_es->s_log_groups_per_flex >= 32) {
                        ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
                        goto err_freebuddy;
                }
                /*
                 * Shift an unsigned 1: a shift count of 31 passes the check
                 * above, and "1 << 31" would overflow a signed int.
                 */
                sbi->s_mb_prefetch = min_t(uint, 1U << sbi->s_es->s_log_groups_per_flex,
                        BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
        } else {
                sbi->s_mb_prefetch = 32;
        }
        if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
                sbi->s_mb_prefetch = ext4_get_groups_count(sb);
        /* How many real IOs to prefetch within a single allocation at cr=0.
         * Given cr=0 is a CPU-related optimization we shouldn't try to
         * load too many groups; at some point we should start to use what
         * we've got in memory.
         * With an average random access time of 5ms, it'd take a second to
         * get 200 groups (* N with flex_bg), so let's make this limit 4.
         */
        sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
        if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
                sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);

        return 0;

err_freebuddy:
        /* Free the group infos created so far: groups [0, i). */
        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);

                if (grp)
                        kmem_cache_free(cachep, grp);
        }
        /* Then every second-level pointer table (kfree(NULL) is a no-op). */
        i = sbi->s_group_info_size;
        rcu_read_lock();
        group_info = rcu_dereference(sbi->s_group_info);
        while (i-- > 0)
                kfree(group_info[i]);
        rcu_read_unlock();
        iput(sbi->s_buddy_cache);
err_freesgi:
        kvfree(rcu_access_pointer(sbi->s_group_info));
        return -ENOMEM;
}

/* Destroy every group-info slab cache and clear its slot in the table. */
static void ext4_groupinfo_destroy_slabs(void)
{
        int idx = 0;

        while (idx < NR_GRPINFO_CACHES) {
                kmem_cache_destroy(ext4_groupinfo_caches[idx]);
                ext4_groupinfo_caches[idx] = NULL;
                idx++;
        }
}

/*
 * Create, if it does not exist yet, the group-info slab cache for block
 * size @size (in bytes).  Creation is serialized by a function-local
 * mutex so concurrent callers create a given cache only once.
 *
 * Returns 0 if the cache exists or was created, -EINVAL if the block
 * size maps beyond the cache table, or -ENOMEM if creation failed.
 */
static int ext4_groupinfo_create_slab(size_t size)
{
        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
        int blocksize_bits = order_base_2(size);
        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
        struct kmem_cache *cachep;
        int slab_size;
        int ret = 0;

        if (cache_index >= NR_GRPINFO_CACHES)
                return -EINVAL;

        if (unlikely(cache_index < 0))
                cache_index = 0;

        mutex_lock(&ext4_grpinfo_slab_create_mutex);
        if (!ext4_groupinfo_caches[cache_index]) {
                /* Objects hold the struct plus blocksize_bits + 2 counters. */
                slab_size = offsetof(struct ext4_group_info,
                                        bb_counters[blocksize_bits + 2]);

                cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
                                                slab_size, 0, SLAB_RECLAIM_ACCOUNT,
                                                NULL);

                ext4_groupinfo_caches[cache_index] = cachep;
                if (!cachep)
                        ret = -ENOMEM;
        }
        mutex_unlock(&ext4_grpinfo_slab_create_mutex);

        if (ret)
                printk(KERN_EMERG
                       "EXT4-fs: no memory for groupinfo slab cache\n");

        return ret;
}

/*
 * Initialize the multiblock allocator state of a superblock: the
 * per-order offset/max tables, default tunables, the per-CPU locality
 * groups and finally the per-group backend.
 *
 * Returns 0 on success or a negative error code.  On failure the
 * offset/max tables (and the locality groups, if already allocated) are
 * freed and their pointers reset to NULL.
 */
int ext4_mb_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned order, cpu, slot;
        unsigned nbytes;
        unsigned offset, offset_incr;
        unsigned order_max;
        int ret = -ENOMEM;

        nbytes = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
        sbi->s_mb_offsets = kmalloc(nbytes, GFP_KERNEL);
        if (!sbi->s_mb_offsets)
                goto out;

        nbytes = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(nbytes, GFP_KERNEL);
        if (!sbi->s_mb_maxs)
                goto out;

        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
        if (ret < 0)
                goto out;

        /* order 0 is regular bitmap */
        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
        sbi->s_mb_offsets[0] = 0;

        /*
         * Each higher order halves both the number of bits and the
         * space it takes, starting from half a block at order 1.
         */
        offset = 0;
        offset_incr = 1 << (sb->s_blocksize_bits - 1);
        order_max = sb->s_blocksize << 2;
        for (order = 1; order <= sb->s_blocksize_bits + 1; order++) {
                sbi->s_mb_offsets[order] = offset;
                sbi->s_mb_maxs[order] = order_max;
                offset += offset_incr;
                offset_incr = offset_incr >> 1;
                order_max = order_max >> 1;
        }

        spin_lock_init(&sbi->s_md_lock);
        sbi->s_mb_free_pending = 0;
        INIT_LIST_HEAD(&sbi->s_freed_data_list);

        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
        sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  For bigalloc file systems
         * that is probably too big (with a 1 megabyte cluster size the
         * group preallocation would become half a gigabyte!).  So the
         * default keeps a two megabyte group prealloc size for cluster
         * sizes up to 64k and, beyond that, enforces a minimum group
         * preallocation size of 32 clusters.  That is 8 megs for a 256k
         * cluster size and 32 megs for a 1 meg cluster size, which
         * seems reasonable as a default.
         */
        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
                                       sbi->s_cluster_bits, 32);
        /*
         * With s_stripe > 1, round s_mb_group_prealloc up to a multiple
         * of s_stripe.  The preallocation size should be an exact
         * multiple of the RAID stripe size so that preallocations don't
         * fragment the stripes.
         */
        if (sbi->s_stripe > 1)
                sbi->s_mb_group_prealloc = roundup(
                        sbi->s_mb_group_prealloc, sbi->s_stripe);

        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (!sbi->s_locality_groups) {
                ret = -ENOMEM;
                goto out;
        }
        for_each_possible_cpu(cpu) {
                struct ext4_locality_group *lg;

                lg = per_cpu_ptr(sbi->s_locality_groups, cpu);
                mutex_init(&lg->lg_mutex);
                for (slot = 0; slot < PREALLOC_TB_SIZE; slot++)
                        INIT_LIST_HEAD(&lg->lg_prealloc_list[slot]);
                spin_lock_init(&lg->lg_prealloc_lock);
        }

        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret == 0)
                return 0;

        free_percpu(sbi->s_locality_groups);
        sbi->s_locality_groups = NULL;
out:
        kfree(sbi->s_mb_offsets);
        sbi->s_mb_offsets = NULL;
        kfree(sbi->s_mb_maxs);
        sbi->s_mb_maxs = NULL;
        return ret;
}

/* need to called with the ext4 group lock held */
static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
        struct ext4_prealloc_space *pa;
        struct list_head *cur, *tmp;
        int count = 0;

        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                list_del(&pa->pa_group_list);
                count++;
                kmem_cache_free(ext4_pspace_cachep, pa);
        }
        return count;
}

/*
 * Tear down the multiblock allocator state of a superblock: free every
 * group info (and any preallocations still attached to it), the pointer
 * tables, the offset/max tables, the buddy cache inode and the per-CPU
 * locality groups.  If statistics are enabled, a summary of the
 * counters is logged.  Always returns 0.
 */
int ext4_mb_release(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        struct ext4_group_info ***tables;
        struct ext4_group_info *grp;
        ext4_group_t g;
        int nr_tables;
        int leftover;

        tables = rcu_access_pointer(sbi->s_group_info);
        if (tables) {
                for (g = 0; g < ngroups; g++) {
                        cond_resched();
                        grp = ext4_get_group_info(sb, g);
                        if (!grp)
                                continue;
                        mb_group_bb_bitmap_free(grp);
                        ext4_lock_group(sb, g);
                        leftover = ext4_mb_cleanup_pa(grp);
                        if (leftover)
                                mb_debug(sb, "mballoc: %d PAs left\n",
                                         leftover);
                        ext4_unlock_group(sb, g);
                        kmem_cache_free(cachep, grp);
                }

                /* One pointer table per block of group descriptors. */
                nr_tables = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                for (g = 0; g < nr_tables; g++)
                        kfree(tables[g]);
                kvfree(tables);
        }

        kfree(sbi->s_mb_offsets);
        kfree(sbi->s_mb_maxs);
        iput(sbi->s_buddy_cache);

        if (sbi->s_mb_stats) {
                ext4_msg(sb, KERN_INFO,
                         "mballoc: %u blocks %u reqs (%u success)",
                         atomic_read(&sbi->s_bal_allocated),
                         atomic_read(&sbi->s_bal_reqs),
                         atomic_read(&sbi->s_bal_success));
                ext4_msg(sb, KERN_INFO,
                         "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
                         "%u 2^N hits, %u breaks, %u lost",
                         atomic_read(&sbi->s_bal_ex_scanned),
                         atomic_read(&sbi->s_bal_groups_scanned),
                         atomic_read(&sbi->s_bal_goals),
                         atomic_read(&sbi->s_bal_2orders),
                         atomic_read(&sbi->s_bal_breaks),
                         atomic_read(&sbi->s_mb_lost_chunks));
                ext4_msg(sb, KERN_INFO,
                         "mballoc: %u generated and it took %llu",
                         atomic_read(&sbi->s_mb_buddies_generated),
                         atomic64_read(&sbi->s_mb_generation_time));
                ext4_msg(sb, KERN_INFO,
                         "mballoc: %u preallocated, %u discarded",
                         atomic_read(&sbi->s_mb_preallocated),
                         atomic_read(&sbi->s_mb_discarded));
        }

        free_percpu(sbi->s_locality_groups);

        return 0;
}

/*
 * Issue a discard for @count clusters starting at @cluster within
 * @block_group.
 *
 * If @biop is non-NULL, the request goes through
 * __blkdev_issue_discard() with @biop; otherwise sb_issue_discard() is
 * used.  Returns the result of whichever helper was called.
 */
static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t cluster, int count,
                struct bio **biop)
{
        /* Shift that converts filesystem blocks into 512-byte sectors. */
        int sector_shift = sb->s_blocksize_bits - 9;
        ext4_fsblk_t discard_block;

        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
                         ext4_group_first_block_no(sb, block_group));
        /* From here on @count is in blocks rather than clusters. */
        count = EXT4_C2B(EXT4_SB(sb), count);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);

        if (!biop)
                return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);

        return __blkdev_issue_discard(sb->s_bdev,
                        (sector_t)discard_block << sector_shift,
                        (sector_t)count << sector_shift,
                        GFP_NOFS, 0, biop);
}

/*
 * Return the clusters described by @entry to the buddy of their group
 * and free @entry.
 *
 * The buddy for entry->efd_group is loaded (failure is a BUG), the
 * pending-free counter is reduced under s_md_lock, and then, under the
 * group lock, the entry is removed from the group's rb tree and its
 * range is handed to mb_free_blocks().  @entry must not be used by the
 * caller afterwards.
 */
static void ext4_free_data_in_buddy(struct super_block *sb,
                                    struct ext4_free_data *entry)
{
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
        int err, count = 0, count2 = 0;

        mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
                 entry->efd_count, entry->efd_group, entry);

        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
        /* we expect to find existing buddy because it's pinned */
        BUG_ON(err != 0);

        /* These clusters are no longer pending; s_md_lock guards the counter. */
        spin_lock(&EXT4_SB(sb)->s_md_lock);
        EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
        spin_unlock(&EXT4_SB(sb)->s_md_lock);

        db = e4b.bd_info;
        /* there are blocks to put in buddy to make them really free */
        /* Saved now for the final debug message: entry is freed before it. */
        count += entry->efd_count;
        count2++;
        ext4_lock_group(sb, entry->efd_group);
        /* Take it out of per group rb tree */
        rb_erase(&entry->efd_node, &(db->bb_free_root));
        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);

        /*
         * Clear the trimmed flag for the group so that the next
         * ext4_trim_fs can trim it.
         */
        EXT4_MB_GRP_CLEAR_TRIMMED(db);

        if (!db->bb_free_root.rb_node) {
                /* No more items in the per group rb tree
                 * balance refcounts from ext4_mb_free_metadata()
                 */
                put_page(e4b.bd_buddy_page);
                put_page(e4b.bd_bitmap_page);
        }
        ext4_unlock_group(sb, entry->efd_group);
        kmem_cache_free(ext4_free_data_cachep, entry);
        ext4_mb_unload_buddy(&e4b);

        mb_debug(sb, "freed %d blocks in %d structures\n", count,
                 count2);
}

/*
 * Called by the jbd2 layer once the commit @commit_tid has finished, so
 * the blocks released with that commit can now really be freed.
 *
 * The leading run of entries on s_freed_data_list that belong to
 * @commit_tid is moved to a private list under s_md_lock.  With the
 * DISCARD mount option set, a discard is issued for each of them, and
 * finally every entry is given back to the buddy allocator.
 */
void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_free_data *efd, *next;
        struct list_head *last_match = NULL;
        struct bio *discard_bio = NULL;
        struct list_head committed;
        int err;

        INIT_LIST_HEAD(&committed);

        /* Detach the leading entries that carry this commit's tid. */
        spin_lock(&sbi->s_md_lock);
        list_for_each_entry(efd, &sbi->s_freed_data_list, efd_list) {
                if (efd->efd_tid != commit_tid)
                        break;
                last_match = &efd->efd_list;
        }
        if (last_match != NULL)
                list_cut_position(&committed, &sbi->s_freed_data_list,
                                  last_match);
        spin_unlock(&sbi->s_md_lock);

        if (test_opt(sb, DISCARD)) {
                list_for_each_entry(efd, &committed, efd_list) {
                        err = ext4_issue_discard(sb, efd->efd_group,
                                                 efd->efd_start_cluster,
                                                 efd->efd_count,
                                                 &discard_bio);
                        /* Stop issuing discards once one is unsupported. */
                        if (err == -EOPNOTSUPP)
                                break;
                        if (err)
                                ext4_msg(sb, KERN_WARNING, "discard request in"
                                         " group:%d block:%d count:%d failed"
                                         " with %d", efd->efd_group,
                                         efd->efd_start_cluster,
                                         efd->efd_count, err);
                }

                if (discard_bio != NULL) {
                        submit_bio_wait(discard_bio);
                        bio_put(discard_bio);
                }
        }

        list_for_each_entry_safe(efd, next, &committed, efd_list)
                ext4_free_data_in_buddy(sb, efd);
}

/*
 * Create the slab caches for preallocation spaces, allocation contexts
 * and free-data entries.  Returns 0 on success; on failure the caches
 * created so far are destroyed again and -ENOMEM is returned.
 */
int __init ext4_init_mballoc(void)
{
        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
                                        SLAB_RECLAIM_ACCOUNT);
        if (!ext4_pspace_cachep)
                return -ENOMEM;

        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
                                    SLAB_RECLAIM_ACCOUNT);
        if (!ext4_ac_cachep)
                goto destroy_pspace;

        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
                                           SLAB_RECLAIM_ACCOUNT);
        if (ext4_free_data_cachep)
                return 0;

        /* Unwind in reverse order of creation. */
        kmem_cache_destroy(ext4_ac_cachep);
destroy_pspace:
        kmem_cache_destroy(ext4_pspace_cachep);
        return -ENOMEM;
}

/*
 * Destroy the slab caches used by mballoc: the preallocation-space,
 * allocation-context and free-data caches, followed by the group-info
 * caches.
 */
void ext4_exit_mballoc(void)
{
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
         */
        rcu_barrier();
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_data_cachep);
        ext4_groupinfo_destroy_slabs();
}


/*
 * Mark the chosen space (ac->ac_b_ex) non-free in the on-disk block
 * bitmap and account for it in the group descriptor and the free
 * cluster counters.
 *
 * The allocation context must be in AC_STATUS_FOUND state with a
 * non-empty best extent; anything else is a BUG.
 *
 * Returns 0 on success or an error code: a failure to read the bitmap
 * or to get journal write access, -EIO if the group descriptor cannot
 * be obtained, or -EFSCORRUPTED if the chosen range fails
 * ext4_inode_block_valid().
 *
 * NOTE(review): the previous comment said this function checks quota;
 * no quota check is visible in the body below.
 */
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle)
{
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_group_desc *gdp;
        struct buffer_head *gdp_bh;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block;
        int err, len;

        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(ac->ac_b_ex.fe_len <= 0);

        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);

        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
        if (IS_ERR(bitmap_bh)) {
                err = PTR_ERR(bitmap_bh);
                /* Reset so the brelse() at out_err gets NULL, not an ERR_PTR. */
                bitmap_bh = NULL;
                goto out_err;
        }

        BUFFER_TRACE(bitmap_bh, "getting write access");
        err = ext4_journal_get_write_access(handle, bitmap_bh);
        if (err)
                goto out_err;

        /* Preset the error reported when the descriptor lookup fails. */
        err = -EIO;
        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
        if (!gdp)
                goto out_err;

        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
                        ext4_free_group_clusters(sb, gdp));

        BUFFER_TRACE(gdp_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, gdp_bh);
        if (err)
                goto out_err;

        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);

        /* len is in blocks here; it is reused for a cluster count below. */
        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
        if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata", block, block+len);
                /* File system mounted not to panic on error
                 * Fix the bitmap and return EFSCORRUPTED
                 * We leak some of the blocks here.
                 */
                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                              ac->ac_b_ex.fe_len);
                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
                        err = -EFSCORRUPTED;
                goto out_err;
        }

        /* Bitmap and descriptor are updated together under the group lock. */
        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
        {
                int i;
                /* Every bit about to be set must currently be clear. */
                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
                                                bitmap_bh->b_data));
                }
        }
#endif
        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                      ac->ac_b_ex.fe_len);
        /*
         * First allocation from a BLOCK_UNINIT group: drop the flag and
         * seed the descriptor's free count before subtracting from it.
         */
        if (ext4_has_group_desc_csum(sb) &&
            (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                ext4_free_group_clusters_set(sb, gdp,
                                             ext4_free_clusters_after_init(sb,
                                                ac->ac_b_ex.fe_group, gdp));
        }
        /* New free-cluster count for the group after this allocation. */
        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
        ext4_free_group_clusters_set(sb, gdp, len);
        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);

        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);

        /* Keep the flex group's free-cluster counter in step as well. */
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
                atomic64_sub(ac->ac_b_ex.fe_len,
                             &sbi_array_rcu_deref(sbi, s_flex_groups,
                                                  flex_group)->free_clusters);
        }

        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (err)
                goto out_err;
        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);

out_err:
        brelse(bitmap_bh);
        return err;
}

/*
 * Idempotent helper for the ext4 fast commit replay path: force the block
 * bitmap bits covering [@block, @block + @len) to @state and adjust the
 * group descriptor and flex-group free-cluster counts by the number of
 * bits that actually changed.
 *
 * @sb:    superblock of the filesystem
 * @block: first physical block of the range
 * @len:   length of the range in blocks; nothing is done when @len <= 0
 * @state: non-zero marks the range in use, zero marks it free
 *
 * Only clusters whose bit differs from the requested state are counted,
 * so marking the same range twice leaves the counters unchanged.  The
 * function returns void: processing simply stops at the first error.
 */
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                        int len, int state)
{
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_group_desc *gdp;
        struct buffer_head *gdp_bh;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
        ext4_grpblk_t blkoff;
        /*
         * err must start out as 0: when @len <= 0 the loop body never runs
         * and the check after the loop would otherwise read an
         * indeterminate value.
         */
        int i, err = 0;
        int already;
        unsigned int clen, clen_changed, thisgrp_len;

        while (len > 0) {
                ext4_get_group_no_and_offset(sb, block, &group, &blkoff);

                /*
                 * Check to see if the range crosses a group boundary.
                 * In case of flex_bg, (block, len) may span across more
                 * than one group.  Each pass of this loop handles only
                 * the part that lies in the current group and then moves
                 * on to the next group's metadata.
                 */
                thisgrp_len = min_t(unsigned int, (unsigned int)len,
                        EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
                clen = EXT4_NUM_B2C(sbi, thisgrp_len);

                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(bitmap_bh)) {
                        err = PTR_ERR(bitmap_bh);
                        bitmap_bh = NULL;
                        break;
                }

                err = -EIO;
                gdp = ext4_get_group_desc(sb, group, &gdp_bh);
                if (!gdp)
                        break;

                ext4_lock_group(sb, group);
                /* count the clusters that already are in the wanted state */
                already = 0;
                for (i = 0; i < clen; i++)
                        if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
                                         !state)
                                already++;

                clen_changed = clen - already;
                if (state)
                        ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
                else
                        mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
                if (ext4_has_group_desc_csum(sb) &&
                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                             ext4_free_clusters_after_init(sb, group, gdp));
                }
                /* from here on clen holds the group's new free count */
                if (state)
                        clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
                else
                        clen = ext4_free_group_clusters(sb, gdp) + clen_changed;

                ext4_free_group_clusters_set(sb, gdp, clen);
                ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
                ext4_group_desc_csum_set(sb, group, gdp);

                ext4_unlock_group(sb, group);

                if (sbi->s_log_groups_per_flex) {
                        ext4_group_t flex_group = ext4_flex_group(sbi, group);
                        struct flex_groups *fg = sbi_array_rcu_deref(sbi,
                                                   s_flex_groups, flex_group);

                        if (state)
                                atomic64_sub(clen_changed, &fg->free_clusters);
                        else
                                atomic64_add(clen_changed, &fg->free_clusters);

                }

                /* no journal handle is passed; buffers are synced directly */
                err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
                if (err)
                        break;
                sync_dirty_buffer(bitmap_bh);
                err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
                sync_dirty_buffer(gdp_bh);
                if (err)
                        break;

                block += thisgrp_len;
                len -= thisgrp_len;
                brelse(bitmap_bh);
                BUG_ON(len < 0);
        }

        /*
         * On the success path the bitmap buffer was released inside the
         * loop; only an early exit leaves a reference (or NULL) behind.
         */
        if (err)
                brelse(bitmap_bh);
}

/*
 * here we normalize request for locality group
 * Group request are normalized to s_mb_group_prealloc, which goes to
 * s_strip if we set the same via mount option.
 * s_mb_group_prealloc can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_locality_group *lg = ac->ac_lg;

        BUG_ON(lg == NULL);
        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
        mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}

/*
 * Normalization means making request better in terms of
 * size and alignment
 */
static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                                struct ext4_allocation_request *ar)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_super_block *es = sbi->s_es;
        int bsbits, max;
        loff_t size, start_off, end;
        loff_t orig_size __maybe_unused;
        ext4_lblk_t start;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_prealloc_space *pa;

        /* do normalize only data requests, metadata requests
           do not need preallocation */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                return;

        /* sometime caller may want exact blocks */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                return;

        /* caller may indicate that preallocation isn't
         * required (it's a tail, for example) */
        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
                return;

        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
                ext4_mb_normalize_group_request(ac);
                return ;
        }

        bsbits = ac->ac_sb->s_blocksize_bits;

        /* first, let's learn actual file size
         * given current request is allocated */
        size = extent_logical_end(sbi, &ac->ac_o_ex);
        size = size << bsbits;
        if (size < i_size_read(ac->ac_inode))
                size = i_size_read(ac->ac_inode);
        orig_size = size;

        /* max size of free chunks */
        max = 2 << bsbits;

#define NRL_CHECK_SIZE(req, size, max, chunk_size)        \
                (req <= (size) || max <= (chunk_size))

        /* first, try to predict filesize */
        /* XXX: should this table be tunable? */
        start_off = 0;
        if (size <= 16 * 1024) {
                size = 16 * 1024;
        } else if (size <= 32 * 1024) {
                size = 32 * 1024;
        } else if (size <= 64 * 1024) {
                size = 64 * 1024;
        } else if (size <= 128 * 1024) {
                size = 128 * 1024;
        } else if (size <= 256 * 1024) {
                size = 256 * 1024;
        } else if (size <= 512 * 1024) {
                size = 512 * 1024;
        } else if (size <= 1024 * 1024) {
                size = 1024 * 1024;
        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                (21 - bsbits)) << 21;
                size = 2 * 1024 * 1024;
        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                        (22 - bsbits)) << 22;
                size = 4 * 1024 * 1024;
        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
                                        (8<<20)>>bsbits, max, 8 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                        (23 - bsbits)) << 23;
                size = 8 * 1024 * 1024;
        } else {
                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
                size          = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
                                              ac->ac_o_ex.fe_len) << bsbits;
        }
        size = size >> bsbits;
        start = start_off >> bsbits;

        /*
         * For tiny groups (smaller than 8MB) the chosen allocation
         * alignment may be larger than group size. Make sure the
         * alignment does not move allocation to a different group which
         * makes mballoc fail assertions later.
         */
        start = max(start, rounddown(ac->ac_o_ex.fe_logical,
                        (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));

        /* avoid unnecessary preallocation that may trigger assertions */
        if (start + size > EXT_MAX_BLOCKS)
                size = EXT_MAX_BLOCKS - start;

        /* don't cover already allocated blocks in selected range */
        if (ar->pleft && start <= ar->lleft) {
                size -= ar->lleft + 1 - start;
                start = ar->lleft + 1;
        }
        if (ar->pright && start + size - 1 >= ar->lright)
                size -= start + size - ar->lright;

        /*
         * Trim allocation request for filesystems with artificially small
         * groups.
         */
        if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
                size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);

        end = start + size;

        /* check we don't cross already preallocated blocks */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
                loff_t pa_end;

                if (pa->pa_deleted)
                        continue;
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted) {
                        spin_unlock(&pa->pa_lock);
                        continue;
                }

                pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);

                /* PA must not overlap original request */
                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
                        ac->ac_o_ex.fe_logical < pa->pa_lstart));

                /* skip PAs this normalized request doesn't overlap with */
                if (pa->pa_lstart >= end || pa_end <= start) {
                        spin_unlock(&pa->pa_lock);
                        continue;
                }
                BUG_ON(pa->pa_lstart <= start && pa_end >= end);

                /* adjust start or end to be adjacent to this pa */
                if (pa_end <= ac->ac_o_ex.fe_logical) {
                        BUG_ON(pa_end < start);
                        start = pa_end;
                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
                        BUG_ON(pa->pa_lstart > end);
                        end = pa->pa_lstart;
                }
                spin_unlock(&pa->pa_lock);
        }
        rcu_read_unlock();
        size = end - start;

        /* XXX: extra loop to check we really don't overlap preallocations */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
                loff_t pa_end;

                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0) {
                        pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
                }
                spin_unlock(&pa->pa_lock);
        }
        rcu_read_unlock();

        if (start + size <= ac->ac_o_ex.fe_logical &&
                        start > ac->ac_o_ex.fe_logical) {
                ext4_msg(ac->ac_sb, KERN_ERR,
                         "start %lu, size %lu, fe_logical %lu",
                         (unsigned long) start, (unsigned long) size,
                         (unsigned long) ac->ac_o_ex.fe_logical);
                BUG();
        }
        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));

        /* now prepare goal request */

        /* XXX: is it better to align blocks WRT to logical
         * placement or satisfy big request as is */
        ac->ac_g_ex.fe_logical = start;
        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);

        /* define goal start in order to merge */
        if (ar->pright && (ar->lright == (start + size)) &&
            ar->pright >= size &&
            ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
                /* merge to the right */
                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
                                                &ac->ac_g_ex.fe_group,
                                                &ac->ac_g_ex.fe_start);
                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
        }
        if (ar->pleft && (ar->lleft + 1 == start) &&
            ar->pleft + 1 < ext4_blocks_count(es)) {
                /* merge to the left */
                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
                                                &ac->ac_g_ex.fe_group,
                                                &ac->ac_g_ex.fe_start);
                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
        }

        mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
                 orig_size, start);
}

static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
                atomic_inc(&sbi->s_bal_reqs);
                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
                        atomic_inc(&sbi->s_bal_success);
                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
                atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
                        atomic_inc(&sbi->s_bal_goals);
                if (ac->ac_found > sbi->s_mb_max_to_scan)
                        atomic_inc(&sbi->s_bal_breaks);
        }

        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
                trace_ext4_mballoc_alloc(ac);
        else
                trace_ext4_mballoc_prealloc(ac);
}

/*
 * Called on failure; free up any blocks from the inode PA for this
 * context.  We don't need this for MB_GROUP_PA because we only change
 * pa_free in ext4_mb_release_context(), but on failure, we've already
 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
 */
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *pa = ac->ac_pa;
        struct ext4_buddy e4b;
        int err;

        if (pa == NULL) {
                if (ac->ac_f_ex.fe_len == 0)
                        return;
                err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
                if (err) {
                        /*
                         * This should never happen since we pin the
                         * pages in the ext4_allocation_context so
                         * ext4_mb_load_buddy() should never fail.
                         */
                        WARN(1, "mb_load_buddy failed (%d)", err);
                        return;
                }
                ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
                mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
                               ac->ac_f_ex.fe_len);
                ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
                ext4_mb_unload_buddy(&e4b);
                return;
        }
        if (pa->pa_type == MB_INODE_PA)
                pa->pa_free += ac->ac_b_ex.fe_len;
}

/*
 * Serve the original request from an inode preallocation.
 *
 * The physical start is the PA's physical start shifted by the distance
 * between the request's logical start and the PA's logical start; the
 * end is clamped to the end of the PA.  The result is stored in
 * ac->ac_b_ex, the context is marked AC_STATUS_FOUND, @pa is attached to
 * it and the consumed clusters are taken out of pa->pa_free.
 */
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
                                struct ext4_prealloc_space *pa)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        ext4_fsblk_t pstart, pend, pa_pend;
        int nclusters;

        /* found preallocated blocks, use them */
        pstart = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
        pa_pend = pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len);
        pend = min(pa_pend, pstart + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
        nclusters = EXT4_NUM_B2C(sbi, pend - pstart);

        ext4_get_group_no_and_offset(ac->ac_sb, pstart,
                                        &ac->ac_b_ex.fe_group,
                                        &ac->ac_b_ex.fe_start);
        ac->ac_b_ex.fe_len = nclusters;
        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_pa = pa;

        /* the chosen range has to sit inside the PA and be non-empty */
        BUG_ON(pstart < pa->pa_pstart);
        BUG_ON(pend > pa_pend);
        BUG_ON(pa->pa_free < nclusters);
        BUG_ON(ac->ac_b_ex.fe_len <= 0);
        pa->pa_free -= nclusters;

        mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", pstart,
                 nclusters, pa);
}

/*
 * use blocks preallocated to locality group
 */
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
                                struct ext4_prealloc_space *pa)
{
        unsigned int len = ac->ac_o_ex.fe_len;

        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
                                        &ac->ac_b_ex.fe_group,
                                        &ac->ac_b_ex.fe_start);
        ac->ac_b_ex.fe_len = len;
        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_pa = pa;

        /* we don't correct pa_pstart or pa_plen here to avoid
         * possible race when the group is being loaded concurrently
         * instead we correct pa later, after blocks are marked
         * in on-disk bitmap -- see ext4_mb_release_context()
         * Other CPUs are prevented from allocating from this pa by lg_mutex
         */
        mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
                 pa->pa_lstart-len, len, pa);
}

/*
 * Pick whichever of @pa and @cpa starts closer to @goal_block.
 *
 * @cpa is the closest preallocation found so far (NULL if none yet) and
 * already carries a reference.  If @pa wins, the reference moves from
 * @cpa to @pa; on a tie @cpa is kept.  Returns the winner.
 */
static struct ext4_prealloc_space *
ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
                        struct ext4_prealloc_space *pa,
                        struct ext4_prealloc_space *cpa)
{
        if (cpa != NULL) {
                ext4_fsblk_t best_dist = abs(goal_block - cpa->pa_pstart);
                ext4_fsblk_t cand_dist = abs(goal_block - pa->pa_pstart);

                /* the current choice is at least as close: keep it */
                if (best_dist <= cand_dist)
                        return cpa;

                /* drop the previous reference */
                atomic_dec(&cpa->pa_count);
        }

        atomic_inc(&pa->pa_count);
        return pa;
}

/*
 * search goal blocks in preallocated space
 *
 * Tries the inode's own preallocations first and, if the request allows
 * group allocation, the locality group's preallocations second.
 *
 * Returns true when a preallocation was picked (ac->ac_b_ex, ac->ac_pa
 * and ac->ac_status are then filled in by ext4_mb_use_inode_pa() or
 * ext4_mb_use_group_pa(), and ac->ac_criteria is set to 10 or 20
 * respectively), false otherwise.
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int order, i;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_locality_group *lg;
        struct ext4_prealloc_space *pa, *cpa = NULL;
        ext4_fsblk_t goal_block;

        /* only data can be preallocated */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                return false;

        /* first, try per-file preallocation */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {

                /* all fields in this condition don't change,
                 * so we can skip locking for them */
                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
                    ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, pa))
                        continue;

                /* non-extent files can't have physical blocks past 2^32 */
                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
                     EXT4_MAX_BLOCK_FILE_PHYS))
                        continue;

                /* found preallocated blocks, use them */
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0 && pa->pa_free) {
                        /* reference is taken while pa_lock is held */
                        atomic_inc(&pa->pa_count);
                        ext4_mb_use_inode_pa(ac, pa);
                        spin_unlock(&pa->pa_lock);
                        ac->ac_criteria = 10;
                        rcu_read_unlock();
                        return true;
                }
                spin_unlock(&pa->pa_lock);
        }
        rcu_read_unlock();

        /* can we use group allocation? */
        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
                return false;

        /* inode may have no locality group for some reason */
        lg = ac->ac_lg;
        if (lg == NULL)
                return false;
        /* start with the list indexed by the request's highest set bit */
        order  = fls(ac->ac_o_ex.fe_len) - 1;
        if (order > PREALLOC_TB_SIZE - 1)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;

        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
        /*
         * search for the prealloc space that is having
         * minimal distance from the goal block.
         */
        for (i = order; i < PREALLOC_TB_SIZE; i++) {
                rcu_read_lock();
                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
                                        pa_inode_list) {
                        spin_lock(&pa->pa_lock);
                        if (pa->pa_deleted == 0 &&
                                        pa->pa_free >= ac->ac_o_ex.fe_len) {

                                /*
                                 * keeps a reference on the closest pa
                                 * and drops the one on the previous
                                 * candidate, if it was replaced
                                 */
                                cpa = ext4_mb_check_group_pa(goal_block,
                                                                pa, cpa);
                        }
                        spin_unlock(&pa->pa_lock);
                }
                rcu_read_unlock();
        }
        if (cpa) {
                ext4_mb_use_group_pa(ac, cpa);
                ac->ac_criteria = 20;
                return true;
        }
        return false;
}

/*
 * Walk every extent that was freed in @group but is not yet committed
 * and mark it used in the in-core @bitmap.  The buddy must be generated
 * from this bitmap.
 *
 * Needs to be called with the ext4 group lock held.  Does nothing when
 * the group info cannot be obtained.
 */
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct rb_node *node;

        if (!grp)
                return;

        for (node = rb_first(&grp->bb_free_root); node; node = rb_next(node)) {
                struct ext4_free_data *freed;

                freed = rb_entry(node, struct ext4_free_data, efd_node);
                ext4_set_bits(bitmap, freed->efd_start_cluster,
                              freed->efd_count);
        }
}

/*
 * Walk every preallocation attached to @group and mark its range used in
 * the in-core @bitmap.  The buddy must be generated from this bitmap.
 *
 * Needs to be called with the ext4 group lock held.  Does nothing when
 * the group info cannot be obtained.
 */
static noinline_for_stack
void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_prealloc_space *pa;
        int preallocated = 0;

        if (!grp)
                return;

        /*
         * All forms of preallocation discard load the group first, so the
         * only competing code is preallocation use and no list locking is
         * needed here.  Note that preallocations with pa_deleted set are
         * NOT ignored: otherwise used blocks could be left available for
         * allocation in the buddy while a concurrent ext4_mb_put_pa() is
         * dropping the preallocation.
         */
        list_for_each_entry(pa, &grp->bb_prealloc_list, pa_group_list) {
                ext4_group_t pa_group;
                ext4_grpblk_t offset;
                int count;

                /* snapshot position and length under the pa lock */
                spin_lock(&pa->pa_lock);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                             &pa_group, &offset);
                count = pa->pa_len;
                spin_unlock(&pa->pa_lock);

                if (unlikely(count == 0))
                        continue;

                BUG_ON(pa_group != group);
                ext4_set_bits(bitmap, offset, count);
                preallocated += count;
        }
        mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}

/*
 * Flag @pa as deleted.  For an inode PA the owning inode's
 * i_prealloc_active counter is decremented as well.  A pa that is
 * already flagged only triggers a warning and is otherwise left alone.
 */
static void ext4_mb_mark_pa_deleted(struct super_block *sb,
                                    struct ext4_prealloc_space *pa)
{
        if (!pa->pa_deleted) {
                pa->pa_deleted = 1;

                if (pa->pa_type == MB_INODE_PA) {
                        struct ext4_inode_info *ei = EXT4_I(pa->pa_inode);

                        atomic_dec(&ei->i_prealloc_active);
                }
                return;
        }

        ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
                     pa->pa_type, pa->pa_pstart, pa->pa_lstart,
                     pa->pa_len);
}

/*
 * RCU callback that returns a preallocation descriptor to its slab
 * cache.  The descriptor must be unreferenced and marked deleted.
 */
static void ext4_mb_pa_callback(struct rcu_head *head)
{
        struct ext4_prealloc_space *pa =
                container_of(head, struct ext4_prealloc_space, u.pa_rcu);

        BUG_ON(atomic_read(&pa->pa_count) != 0);
        BUG_ON(!pa->pa_deleted);
        kmem_cache_free(ext4_pspace_cachep, pa);
}

/*
 * drops a reference to preallocated space descriptor
 * if this was the last reference and the space is consumed
 * (pa_free == 0), the descriptor is marked deleted, unlinked from its
 * group list and from its owner's list, and freed after an RCU grace
 * period via ext4_mb_pa_callback().
 *
 * @ac is not used by this function.
 */
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                        struct super_block *sb, struct ext4_prealloc_space *pa)
{
        ext4_group_t grp;
        ext4_fsblk_t grp_blk;

        /* in this short window concurrent discard can set pa_deleted */
        spin_lock(&pa->pa_lock);
        /*
         * The reference is always dropped; pa_free is only looked at when
         * that was the last one.
         */
        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
                spin_unlock(&pa->pa_lock);
                return;
        }

        /* already marked deleted: nothing more to do here */
        if (pa->pa_deleted == 1) {
                spin_unlock(&pa->pa_lock);
                return;
        }

        ext4_mb_mark_pa_deleted(sb, pa);
        spin_unlock(&pa->pa_lock);

        grp_blk = pa->pa_pstart;
        /*
         * If doing group-based preallocation, pa_pstart may be in the
         * next group when pa is used up
         */
        if (pa->pa_type == MB_GROUP_PA)
                grp_blk--;

        grp = ext4_get_group_number(sb, grp_blk);

        /*
         * possible race:
         *
         *  P1 (buddy init)                        P2 (regular allocation)
         *                                        find block B in PA
         *  copy on-disk bitmap to buddy
         *                                          mark B in on-disk bitmap
         *                                        drop PA from group
         *  mark all PAs in buddy
         *
         * thus, P1 initializes buddy with B available. to prevent this
         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
         * against that pair
         */
        ext4_lock_group(sb, grp);
        list_del(&pa->pa_group_list);
        ext4_unlock_group(sb, grp);

        /* pa_obj_lock points at the owner's (inode or lg) list lock */
        spin_lock(pa->pa_obj_lock);
        list_del_rcu(&pa->pa_inode_list);
        spin_unlock(pa->pa_obj_lock);

        /* RCU list walkers may still see the pa, so defer the free */
        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

/*
 * creates new preallocated space for given inode
 */
static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_prealloc_space *pa;
        struct ext4_group_info *grp;
        struct ext4_inode_info *ei;

        /* preallocate only when found space is larger then requested */
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
        BUG_ON(ac->ac_pa == NULL);

        pa = ac->ac_pa;

        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
                struct ext4_free_extent ex = {
                        .fe_logical = ac->ac_g_ex.fe_logical,
                        .fe_len = ac->ac_g_ex.fe_len,
                };
                loff_t orig_goal_end = extent_logical_end(sbi, &ex);
                loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);

                /*
                 * We can't allocate as much as normalizer wants, so we try
                 * to get proper lstart to cover the original request, except
                 * when the goal doesn't cover the original request as below:
                 *
                 * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
                 * best_ex:0/200(200) -> adjusted: 1848/2048(200)
                 */
                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);

                /*
                 * Use the below logic for adjusting best extent as it keeps
                 * fragmentation in check while ensuring logical range of best
                 * extent doesn't overflow out of goal extent:
                 *
                 * 1. Check if best ex can be kept at end of goal and still
                 *    cover original start
                 * 2. Else, check if best ex can be kept at start of goal and
                 *    still cover original end
                 * 3. Else, keep the best ex at start of original request.
                 */
                ex.fe_len = ac->ac_b_ex.fe_len;

                ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
                if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
                        goto adjust_bex;

                ex.fe_logical = ac->ac_g_ex.fe_logical;
                if (o_ex_end <= extent_logical_end(sbi, &ex))
                        goto adjust_bex;

                ex.fe_logical = ac->ac_o_ex.fe_logical;
adjust_bex:
                ac->ac_b_ex.fe_logical = ex.fe_logical;

                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
                BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
        }

        /* preallocation can change ac_b_ex, thus we store actually
         * allocated blocks for history */
        ac->ac_f_ex = ac->ac_b_ex;

        pa->pa_lstart = ac->ac_b_ex.fe_logical;
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_type = MB_INODE_PA;

        mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
                 pa->pa_len, pa->pa_lstart);
        trace_ext4_mb_new_inode_pa(ac, pa);

        ext4_mb_use_inode_pa(ac, pa);
        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);

        ei = EXT4_I(ac->ac_inode);
        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
        if (!grp)
                return;

        pa->pa_obj_lock = &ei->i_prealloc_lock;
        pa->pa_inode = ac->ac_inode;

        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
        atomic_inc(&ei->i_prealloc_active);
}

/*
 * Turn the best extent found for this context into a new preallocation
 * for the locality group the inode belongs to.
 *
 * The descriptor is taken from ac->ac_pa, which must already be set.  It
 * is filled from ac->ac_b_ex, the original request is served from it via
 * ext4_mb_use_group_pa(), and it is linked into the group's
 * preallocation list.  If the group info cannot be obtained, the
 * function returns before linking the descriptor.
 */
static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_prealloc_space *pa;
        struct ext4_locality_group *lg;
        struct ext4_group_info *grp;

        /* preallocate only when found space is larger than requested */
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
        BUG_ON(ac->ac_pa == NULL);

        pa = ac->ac_pa;

        /*
         * Using the preallocation below changes ac_b_ex, so remember what
         * was actually allocated for history.
         */
        ac->ac_f_ex = ac->ac_b_ex;

        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_type = MB_GROUP_PA;
        pa->pa_deleted = 0;
        /* a group pa stores its physical start as the logical start too */
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_lstart = pa->pa_pstart;
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;

        mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
                 pa->pa_len, pa->pa_lstart);
        trace_ext4_mb_new_group_pa(ac, pa);

        ext4_mb_use_group_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);

        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
        if (!grp)
                return;

        lg = ac->ac_lg;
        BUG_ON(!lg);

        pa->pa_inode = NULL;
        pa->pa_obj_lock = &lg->lg_prealloc_lock;

        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

        /*
         * The new pa is added to the right locality group bucket later,
         * after pa_free has been updated in ext4_mb_release_context().
         */
}

/*
 * Create the preallocation that matches the policy recorded in @ac:
 * a locality-group PA when EXT4_MB_HINT_GROUP_ALLOC is set in ac_flags,
 * a per-inode PA otherwise.
 */
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) {
                ext4_mb_new_inode_pa(ac);
                return;
        }

        ext4_mb_new_group_pa(ac);
}

/*
 * Finds all unused blocks of an inode preallocation in the on-disk bitmap
 * and frees them in the in-core bitmap and buddy.
 *
 * @e4b:       buddy of the group the PA belongs to
 * @bitmap_bh: buffer holding that group's block bitmap
 * @pa:        preallocation being released; must already be marked deleted
 *
 * @pa must be unlinked from inode and group lists, so that
 * nobody else can find/use it.
 * The caller MUST hold group/inode locks.
 *
 * Always returns 0. A mismatch between the number of bits found free in
 * the bitmap and pa->pa_free is reported but not propagated.
 *
 * TODO: optimize the case when there are no in-core structures yet
 */
static noinline_for_stack int
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        struct ext4_prealloc_space *pa)
{
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int end;
        unsigned int next;
        ext4_group_t group;
        ext4_grpblk_t bit;
        unsigned long long grp_blk_start;
        int free = 0;

        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        /* pa_pstart minus the in-group offset; only used for the tracepoint */
        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;

        /* walk [bit, end): every run of zero bits is an unused chunk to free */
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
                mb_debug(sb, "free preallocated %u/%u in group %u\n",
                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;

                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
                                                    EXT4_C2B(sbi, bit)),
                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                /* presumably 'next' is a set bit (or 'end'), so resume past it */
                bit = next + 1;
        }
        if (free != pa->pa_free) {
                ext4_msg(e4b->bd_sb, KERN_CRIT,
                         "pa %p: logic %lu, phys. %lu, len %d",
                         pa, (unsigned long) pa->pa_lstart,
                         (unsigned long) pa->pa_pstart,
                         pa->pa_len);
                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
                                        free, pa->pa_free);
                /*
                 * pa is already deleted so we use the value obtained
                 * from the bitmap and continue.
                 */
        }
        atomic_add(free, &sbi->s_mb_discarded);

        return 0;
}

/*
 * Give the whole remaining range of a locality-group preallocation back
 * to the buddy of its group.
 *
 * @e4b: buddy the blocks are returned to
 * @pa:  preallocation being released; must already be marked deleted
 *
 * If a non-empty @pa does not belong to the group described by @e4b, a
 * warning is logged and nothing is freed.  Always returns 0.
 */
static noinline_for_stack int
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
                                struct ext4_prealloc_space *pa)
{
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t pa_group;
        ext4_grpblk_t first_bit;

        trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(!pa->pa_deleted);

        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &pa_group, &first_bit);

        if (unlikely(pa_group != e4b->bd_group && pa->pa_len != 0)) {
                /* PA points into a different group than the loaded buddy */
                ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
                             e4b->bd_group, pa_group, pa->pa_pstart);
        } else {
                mb_free_blocks(pa->pa_inode, e4b, first_bit, pa->pa_len);
                atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
                trace_ext4_mballoc_discard(sb, NULL, pa_group, first_bit,
                                           pa->pa_len);
        }

        return 0;
}

/*
 * Releases all preallocations in the given group that are not in use.
 *
 * @sb:    superblock
 * @group: block group whose bb_prealloc_list is scanned
 * @busy:  set to 1 if at least one PA was skipped because its pa_count
 *         was non-zero; never cleared here
 *
 * Returns the sum of pa_free over the PAs that were selected for release,
 * or 0 when there is no group info, the list is empty, or the bitmap or
 * buddy could not be loaded.
 *
 * first, we need to decide discard policy:
 * - when do we discard
 *   1) ENOSPC
 * - how many do we discard
 *   1) how many requested
 */
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
                                     ext4_group_t group, int *busy)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;
        int free = 0;

        if (!grp)
                return 0;
        mb_debug(sb, "discard preallocation for group %u\n", group);
        if (list_empty(&grp->bb_prealloc_list))
                goto out_dbg;

        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (IS_ERR(bitmap_bh)) {
                err = PTR_ERR(bitmap_bh);
                ext4_error_err(sb, -err,
                               "Error %d reading block bitmap for %u",
                               err, group);
                goto out_dbg;
        }

        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
                ext4_warning(sb, "Error %d loading buddy information for %u",
                             err, group);
                put_bh(bitmap_bh);
                goto out_dbg;
        }

        /*
         * Pass 1 (under the group lock): mark every idle, not yet deleted
         * PA as deleted and move it from the group list to a private list.
         */
        INIT_LIST_HEAD(&list);
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
                                &grp->bb_prealloc_list, pa_group_list) {
                spin_lock(&pa->pa_lock);
                if (atomic_read(&pa->pa_count)) {
                        /* still referenced: tell the caller and skip it */
                        spin_unlock(&pa->pa_lock);
                        *busy = 1;
                        continue;
                }
                if (pa->pa_deleted) {
                        spin_unlock(&pa->pa_lock);
                        continue;
                }

                /* seems this one can be freed ... */
                ext4_mb_mark_pa_deleted(sb, pa);

                /* bump the per-cpu sequence once, on the first PA selected */
                if (!free)
                        this_cpu_inc(discard_pa_seq);

                /* we can trust pa_free ... */
                free += pa->pa_free;

                spin_unlock(&pa->pa_lock);

                list_del(&pa->pa_group_list);
                list_add(&pa->u.pa_tmp_list, &list);
        }

        /* now free all selected PAs */
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {

                /* remove from object (inode or locality group) */
                spin_lock(pa->pa_obj_lock);
                list_del_rcu(&pa->pa_inode_list);
                spin_unlock(pa->pa_obj_lock);

                if (pa->pa_type == MB_GROUP_PA)
                        ext4_mb_release_group_pa(&e4b, pa);
                else
                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);

                /* the descriptor itself is handed to ext4_mb_pa_callback via RCU */
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }

        ext4_unlock_group(sb, group);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
out_dbg:
        mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
                 free, group, grp->bb_free);
        return free;
}

/*
 * releases all non-used preallocated blocks for given inode
 *
 * It's important to discard preallocations under i_data_sem
 * We don't want another block to be served from the prealloc
 * space when we are discarding the inode prealloc space.
 *
 * FIXME!! Make sure it is valid at all the call sites
 */
void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
        ext4_group_t group = 0;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;

        if (!S_ISREG(inode->i_mode)) {
                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
                return;
        }

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        mb_debug(sb, "discard preallocation for inode %lu\n",
                 inode->i_ino);
        trace_ext4_discard_preallocations(inode,
                        atomic_read(&ei->i_prealloc_active), needed);

        INIT_LIST_HEAD(&list);

        if (needed == 0)
                needed = UINT_MAX;

repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
        while (!list_empty(&ei->i_prealloc_list) && needed) {
                pa = list_entry(ei->i_prealloc_list.prev,
                                struct ext4_prealloc_space, pa_inode_list);
                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
                spin_lock(&pa->pa_lock);
                if (atomic_read(&pa->pa_count)) {
                        /* this shouldn't happen often - nobody should
                         * use preallocation while we're discarding it */
                        spin_unlock(&pa->pa_lock);
                        spin_unlock(&ei->i_prealloc_lock);
                        ext4_msg(sb, KERN_ERR,
                                 "uh-oh! used pa while discarding");
                        WARN_ON(1);
                        schedule_timeout_uninterruptible(HZ);
                        goto repeat;

                }
                if (pa->pa_deleted == 0) {
                        ext4_mb_mark_pa_deleted(sb, pa);
                        spin_unlock(&pa->pa_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        list_add(&pa->u.pa_tmp_list, &list);
                        needed--;
                        continue;
                }

                /* someone is deleting pa right now */
                spin_unlock(&pa->pa_lock);
                spin_unlock(&ei->i_prealloc_lock);

                /* we have to wait here because pa_deleted
                 * doesn't mean pa is already unlinked from
                 * the list. as we might be called from
                 * ->clear_inode() the inode will get freed
                 * and concurrent thread which is unlinking
                 * pa from inode's list may access already
                 * freed memory, bad-bad-bad */

                /* XXX: if this happens too often, we can
                 * add a flag to force wait only in case
                 * of ->clear_inode(), but not in case of
                 * regular truncate */
                schedule_timeout_uninterruptible(HZ);
                goto repeat;
        }
        spin_unlock(&ei->i_prealloc_lock);

        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
                BUG_ON(pa->pa_type != MB_INODE_PA);
                group = ext4_get_group_number(sb, pa->pa_pstart);

                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                             GFP_NOFS|__GFP_NOFAIL);
                if (err) {
                        ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
                                       err, group);
                        continue;
                }

                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(bitmap_bh)) {
                        err = PTR_ERR(bitmap_bh);
                        ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
                                       err, group);
                        ext4_mb_unload_buddy(&e4b);
                        continue;
                }

                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);

                ext4_mb_unload_buddy(&e4b);
                put_bh(bitmap_bh);

                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
}

/*
 * Allocate a zeroed preallocation descriptor from ext4_pspace_cachep and
 * attach it to @ac with its reference count set to one.
 *
 * Returns 0 on success, -ENOMEM if the slab allocation fails.
 */
static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *new_pa;

        BUG_ON(!ext4_pspace_cachep);

        new_pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
        if (new_pa == NULL)
                return -ENOMEM;

        /* the allocation context holds the initial reference */
        atomic_set(&new_pa->pa_count, 1);
        ac->ac_pa = new_pa;

        return 0;
}

/*
 * Detach the preallocation descriptor from @ac and return it to
 * ext4_pspace_cachep.  A warning is emitted if dropping our reference
 * does not bring pa_count down to zero.
 */
static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *victim = ac->ac_pa;

        BUG_ON(victim == NULL);
        ac->ac_pa = NULL;

        /* we expect to be dropping the last reference */
        WARN_ON(atomic_dec_and_test(&victim->pa_count) == 0);
        kmem_cache_free(ext4_pspace_cachep, victim);
}

#ifdef CONFIG_EXT4_DEBUG
/*
 * Debug helper: for every group that has group info, print (via
 * mb_debug()) each preallocation on its bb_prealloc_list followed by the
 * group's bb_free/bb_fragments counters.
 *
 * Does nothing when the filesystem has the EXT4_MF_FS_ABORTED flag set.
 */
static inline void ext4_mb_show_pa(struct super_block *sb)
{
        ext4_group_t i, ngroups;

        if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                return;

        ngroups = ext4_get_groups_count(sb);
        mb_debug(sb, "groups: ");
        for (i = 0; i < ngroups; i++) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
                struct ext4_prealloc_space *pa;
                ext4_grpblk_t start;
                struct list_head *cur;

                if (!grp)
                        continue;
                /* the group's PA list is walked under the group lock */
                ext4_lock_group(sb, i);
                list_for_each(cur, &grp->bb_prealloc_list) {
                        pa = list_entry(cur, struct ext4_prealloc_space,
                                        pa_group_list);
                        /* only the in-group offset of pa_pstart is wanted */
                        spin_lock(&pa->pa_lock);
                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                                     NULL, &start);
                        spin_unlock(&pa->pa_lock);
                        mb_debug(sb, "PA:%u:%d:%d\n", i, start,
                                 pa->pa_len);
                }
                ext4_unlock_group(sb, i);
                mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
                         grp->bb_fragments);
        }
}

/*
 * Debug helper: print the state of an allocation context (status, flags,
 * the original/goal/best extents, criteria and ac_found) and then dump
 * all preallocations with ext4_mb_show_pa().
 *
 * Does nothing when the filesystem has the EXT4_MF_FS_ABORTED flag set.
 */
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;

        if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                return;

        mb_debug(sb, "Can't allocate:"
                        " Allocation context details:");
        mb_debug(sb, "status %u flags 0x%x",
                        ac->ac_status, ac->ac_flags);
        /* each extent is printed as group/start/len@logical */
        mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
                        "goal %lu/%lu/%lu@%lu, "
                        "best %lu/%lu/%lu@%lu cr %d",
                        (unsigned long)ac->ac_o_ex.fe_group,
                        (unsigned long)ac->ac_o_ex.fe_start,
                        (unsigned long)ac->ac_o_ex.fe_len,
                        (unsigned long)ac->ac_o_ex.fe_logical,
                        (unsigned long)ac->ac_g_ex.fe_group,
                        (unsigned long)ac->ac_g_ex.fe_start,
                        (unsigned long)ac->ac_g_ex.fe_len,
                        (unsigned long)ac->ac_g_ex.fe_logical,
                        (unsigned long)ac->ac_b_ex.fe_group,
                        (unsigned long)ac->ac_b_ex.fe_start,
                        (unsigned long)ac->ac_b_ex.fe_len,
                        (unsigned long)ac->ac_b_ex.fe_logical,
                        (int)ac->ac_criteria);
        mb_debug(sb, "%u found", ac->ac_found);
        ext4_mb_show_pa(sb);
}
#else
/* Stub used when CONFIG_EXT4_DEBUG is not defined: prints nothing. */
static inline void ext4_mb_show_pa(struct super_block *sb)
{
}
/*
 * Variant used when CONFIG_EXT4_DEBUG is not defined: only forwards to
 * ext4_mb_show_pa() with the context's superblock.
 */
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
        ext4_mb_show_pa(ac->ac_sb);
}
#endif

/*
 * Decide whether this allocation is served from locality-group or from
 * per-inode (stream) preallocation.
 *
 * Locality group preallocation is used for small files.  The size of the
 * file is taken as the larger of its current size and the size it will
 * have after this allocation, both expressed in blocks.
 *
 * The threshold can be tuned via /sys/fs/ext4/<partition>/mb_stream_req
 *
 * When group allocation is selected, ac->ac_lg is set,
 * EXT4_MB_HINT_GROUP_ALLOC is added to ac_flags and the locality group's
 * lg_mutex is taken before returning.
 */
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int blkbits = ac->ac_sb->s_blocksize_bits;
        loff_t req_end, file_blocks;

        /* only data requests that are not pinned to their goal qualify */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA) ||
            unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                return;

        req_end = extent_logical_end(sbi, &ac->ac_o_ex);
        /* i_size rounded up to whole blocks */
        file_blocks = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                >> blkbits;

        /*
         * The request ends exactly at the current end of file, the
         * filesystem is not busy and nobody has the file open for write:
         * skip preallocation altogether.
         */
        if (req_end == file_blocks && !ext4_fs_is_busy(sbi) &&
            !inode_is_open_for_write(ac->ac_inode)) {
                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
                return;
        }

        /*
         * Group preallocation disabled, or the file is too large for it:
         * fall back to stream allocation.
         */
        if (sbi->s_mb_group_prealloc <= 0 ||
            max(req_end, file_blocks) > sbi->s_mb_stream_request) {
                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
                return;
        }

        BUG_ON(ac->ac_lg != NULL);

        /*
         * Locality groups are per cpu, which reduces the contention
         * between block requests coming from multiple CPUs.
         */
        ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);

        /* from here on this is a group allocation */
        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

        /* all allocations within one locality group are serialized */
        mutex_lock(&ac->ac_lg->lg_mutex);
}

/*
 * Fill in an allocation context from an allocation request.
 *
 * @ac: context to initialise
 * @ar: request describing inode, logical block, goal, length and flags
 *
 * The request length is clamped to the number of clusters per group, and
 * a goal outside [s_first_data_block, blocks_count) is replaced by the
 * first data block.  The original and goal extents start out identical.
 * Finally ext4_mb_group_or_file() picks the preallocation policy.
 *
 * Always returns 0.
 */
static noinline_for_stack int
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
                                struct ext4_allocation_request *ar)
{
        struct super_block *sb = ar->inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t goal = ar->goal;
        unsigned int len = ar->len;
        ext4_grpblk_t block;
        ext4_group_t group;

        /* a request can never be larger than one group */
        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
                len = EXT4_CLUSTERS_PER_GROUP(sb);

        /* an out-of-range goal falls back to the first data block */
        if (goal < le32_to_cpu(es->s_first_data_block) ||
                        goal >= ext4_blocks_count(es))
                goal = le32_to_cpu(es->s_first_data_block);
        ext4_get_group_no_and_offset(sb, goal, &group, &block);

        /* general state */
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
        ac->ac_flags = ar->flags;
        ac->ac_status = AC_STATUS_CONTINUE;

        /* original request, starting at the goal */
        ac->ac_o_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
        ac->ac_o_ex.fe_group = group;
        ac->ac_o_ex.fe_start = block;
        ac->ac_o_ex.fe_len = len;

        /* goal extent is a copy; best extent only gets the logical start */
        ac->ac_g_ex = ac->ac_o_ex;
        ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical;

        /* policy decision: work with the file or with a locality group */
        ext4_mb_group_or_file(ac);

        mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
                        "left: %u/%u, right %u/%u to %swritable\n",
                        (unsigned) ar->len, (unsigned) ar->logical,
                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
                        (unsigned) ar->lleft, (unsigned) ar->pleft,
                        (unsigned) ar->lright, (unsigned) ar->pright,
                        inode_is_open_for_write(ar->inode) ? "" : "non-");

        return 0;
}

/*
 * Discards unused locality-group preallocations from one bucket of @lg.
 *
 * @sb:            superblock
 * @lg:            locality group whose list is trimmed
 * @order:         index of the lg_prealloc_list[] bucket to trim
 * @total_entries: number of entries the caller counted in that bucket
 *
 * PAs that are referenced (pa_count != 0) or already marked deleted are
 * skipped.  Collection stops as soon as @total_entries has been
 * decremented to 5 or fewer; the collected PAs are then released to
 * their block groups.
 */
static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
                                        struct ext4_locality_group *lg,
                                        int order, int total_entries)
{
        ext4_group_t group = 0;
        struct ext4_buddy e4b;
        struct list_head discard_list;
        struct ext4_prealloc_space *pa, *tmp;

        mb_debug(sb, "discard locality group preallocation\n");

        INIT_LIST_HEAD(&discard_list);

        /* pass 1: pick victims while holding the locality group's lock */
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
                                pa_inode_list,
                                lockdep_is_held(&lg->lg_prealloc_lock)) {
                spin_lock(&pa->pa_lock);
                if (atomic_read(&pa->pa_count)) {
                        /*
                         * This is the pa that we just used
                         * for block allocation. So don't
                         * free that
                         */
                        spin_unlock(&pa->pa_lock);
                        continue;
                }
                if (pa->pa_deleted) {
                        spin_unlock(&pa->pa_lock);
                        continue;
                }
                /* only lg prealloc space */
                BUG_ON(pa->pa_type != MB_GROUP_PA);

                /* seems this one can be freed ... */
                ext4_mb_mark_pa_deleted(sb, pa);
                spin_unlock(&pa->pa_lock);

                list_del_rcu(&pa->pa_inode_list);
                list_add(&pa->u.pa_tmp_list, &discard_list);

                total_entries--;
                if (total_entries <= 5) {
                        /*
                         * we want to keep only 5 entries
                         * allowing it to grow to 8. This
                         * makes sure we don't call discard
                         * soon for this list.
                         */
                        break;
                }
        }
        spin_unlock(&lg->lg_prealloc_lock);

        /* pass 2: give the blocks of every victim back to its group */
        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
                int err;

                group = ext4_get_group_number(sb, pa->pa_pstart);
                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                             GFP_NOFS|__GFP_NOFAIL);
                if (err) {
                        /* NOTE(review): pa stays on discard_list here and is
                         * not released - confirm this is intended */
                        ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
                                       err, group);
                        continue;
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
                ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);

                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
}

/*
 * Put the context's group PA into the locality-group bucket that matches
 * its remaining free space, and trim that bucket if it has grown too long.
 *
 * We have incremented pa_count, so the PA cannot be freed at this point.
 * We also hold lg_mutex, so no parallel allocation is possible from this
 * lg; that means pa_free cannot be updated.
 *
 * A parallel ext4_mb_discard_group_preallocations is possible, which can
 * cause the lg_prealloc_list to be updated.
 */

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *pa = ac->ac_pa, *cur;
        struct ext4_locality_group *lg = ac->ac_lg;
        struct super_block *sb = ac->ac_sb;
        int nr_entries = 1, inserted = 0;
        int order;

        /* bucket index, capped because the table has PREALLOC_TB_SIZE slots */
        order = fls(pa->pa_free) - 1;
        if (order >= PREALLOC_TB_SIZE)
                order = PREALLOC_TB_SIZE - 1;

        /* link the prealloc space into the lg bucket */
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(cur, &lg->lg_prealloc_list[order],
                                pa_inode_list,
                                lockdep_is_held(&lg->lg_prealloc_lock)) {
                spin_lock(&cur->pa_lock);
                if (!cur->pa_deleted) {
                        /*
                         * Insert in front of the first live entry that has
                         * more free space, but keep walking so that every
                         * live entry in the bucket gets counted.
                         */
                        if (!inserted && pa->pa_free < cur->pa_free) {
                                list_add_tail_rcu(&pa->pa_inode_list,
                                                  &cur->pa_inode_list);
                                inserted = 1;
                        }
                        nr_entries++;
                }
                spin_unlock(&cur->pa_lock);
        }
        /* no larger entry found: append at the end of the bucket */
        if (!inserted)
                list_add_tail_rcu(&pa->pa_inode_list,
                                  &lg->lg_prealloc_list[order]);
        spin_unlock(&lg->lg_prealloc_lock);

        /* keep the bucket at no more than 8 elements */
        if (nr_entries > 8)
                ext4_mb_discard_lg_preallocations(sb, lg, order, nr_entries);
}

/*
 * Trim some PAs when the per-inode prealloc list has grown too long.
 *
 * Nothing happens until the number of active PAs exceeds
 * s_mb_max_inode_prealloc by more than a quarter of that limit plus one;
 * then the excess over s_mb_max_inode_prealloc is discarded.
 */
static void ext4_mb_trim_inode_pa(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        int active = atomic_read(&ei->i_prealloc_active);
        int slack = (sbi->s_mb_max_inode_prealloc >> 2) + 1;

        /* still within limit + slack: leave the list alone */
        if (active <= sbi->s_mb_max_inode_prealloc + slack)
                return;

        ext4_discard_preallocations(inode,
                                    active - sbi->s_mb_max_inode_prealloc);
}

/*
 * Release all resources used during an allocation.
 *
 * If the context used a preallocation, its bookkeeping is updated (a
 * group PA is advanced past the blocks just taken and re-bucketed; an
 * inode PA is moved to the head of the inode's list) and the reference
 * is dropped.  The bitmap/buddy pages are released, lg_mutex is unlocked
 * for group allocations, statistics are collected and the inode's PA
 * list is trimmed.
 *
 * Always returns 0.
 */
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
        struct inode *inode = ac->ac_inode;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
                if (pa->pa_type == MB_GROUP_PA) {
                        /* see comment in ext4_mb_use_group_pa() */
                        /* shrink the PA by the extent that was just allocated */
                        spin_lock(&pa->pa_lock);
                        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                        pa->pa_free -= ac->ac_b_ex.fe_len;
                        pa->pa_len -= ac->ac_b_ex.fe_len;
                        spin_unlock(&pa->pa_lock);

                        /*
                         * We want to add the pa to the right bucket.
                         * Remove it from the list and while adding
                         * make sure the list to which we are adding
                         * doesn't grow big.
                         */
                        if (likely(pa->pa_free)) {
                                spin_lock(pa->pa_obj_lock);
                                list_del_rcu(&pa->pa_inode_list);
                                spin_unlock(pa->pa_obj_lock);
                                ext4_mb_add_n_trim(ac);
                        }
                }

                if (pa->pa_type == MB_INODE_PA) {
                        /*
                         * treat per-inode prealloc list as a lru list, then try
                         * to trim the least recently used PA.
                         */
                        spin_lock(pa->pa_obj_lock);
                        list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
                        spin_unlock(pa->pa_obj_lock);
                }

                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->ac_bitmap_page)
                put_page(ac->ac_bitmap_page);
        if (ac->ac_buddy_page)
                put_page(ac->ac_buddy_page);
        /* presumably pairs with the mutex_lock() taken when group allocation was chosen */
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
                mutex_unlock(&ac->ac_lg->lg_mutex);
        ext4_mb_collect_stats(ac);
        ext4_mb_trim_inode_pa(inode);
        return 0;
}

/*
 * Walk the block groups and discard unused preallocations until at least
 * @needed units have been freed.
 *
 * @sb:     superblock
 * @needed: amount to free; 0 is replaced by EXT4_CLUSTERS_PER_GROUP + 1
 *
 * If the target was not reached and some PA was reported busy, the whole
 * scan is repeated, for at most three passes in total.
 *
 * Returns the total reported as freed by the per-group discards.
 */
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t grp;
        int freed = 0, busy = 0, retry = 0;
        int n;

        trace_ext4_mb_discard_preallocations(sb, needed);

        if (!needed)
                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;

        for (;;) {
                for (grp = 0; grp < ngroups && needed > 0; grp++) {
                        n = ext4_mb_discard_group_preallocations(sb, grp,
                                                                 &busy);
                        freed += n;
                        needed -= n;
                        cond_resched();
                }

                /* done, nothing was busy, or out of retries */
                if (needed <= 0 || !busy || ++retry >= 3)
                        break;

                busy = 0;
        }

        return freed;
}

/*
 * Try to discard preallocations for a failed allocation and tell the
 * caller whether the allocation is worth retrying.
 *
 * @sb:  superblock
 * @ac:  allocation context; EXT4_MB_STRICT_CHECK may be added to ac_flags
 * @seq: discard sequence sum seen by the caller; updated when it changed
 *       or when strict checking is switched on
 *
 * Returns true if something was freed, if the discard sequence differs
 * from *@seq, or if EXT4_MB_STRICT_CHECK was not yet set; false otherwise.
 */
static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
                        struct ext4_allocation_context *ac, u64 *seq)
{
        bool retry = false;
        u64 cur_seq;
        int freed;

        freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
        if (freed) {
                retry = true;
        } else {
                /*
                 * Nothing freed by us: retry if the sequence sum differs
                 * from the caller's value, or if strict checking has not
                 * been tried yet.
                 */
                cur_seq = ext4_get_discard_pa_seq_sum();
                if (cur_seq != *seq ||
                    !(ac->ac_flags & EXT4_MB_STRICT_CHECK)) {
                        ac->ac_flags |= EXT4_MB_STRICT_CHECK;
                        *seq = cur_seq;
                        retry = true;
                }
        }

        mb_debug(sb, "freed %d, retry ? %s\n", freed, retry ? "yes" : "no");
        return retry;
}

/*
 * Minimal allocator used on the Ext4 fast commit replay path.
 *
 * Starting at the goal block (or at the first data block when the goal is
 * out of range) it scans the block bitmaps linearly, wrapping around the
 * groups at most once, for a clear bit whose block is not reported by
 * ext4_fc_replay_check_excluded().  Exactly one cluster is allocated.
 *
 * On success the block is marked in use, ar->len is set to 1, *errp to 0 and
 * the block number is returned.  On failure 0 is returned, ar->len is 0 and
 * *errp holds the bitmap read error or -ENOSPC.
 */
static ext4_fsblk_t
ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
{
        struct super_block *sb = ar->inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        ext4_grpblk_t nclusters = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t bit = 0;
        ext4_grpblk_t from;
        ext4_group_t grp, left;
        ext4_fsblk_t start = ar->goal;
        ext4_fsblk_t found;
        struct buffer_head *bh;

        if (start < le32_to_cpu(es->s_first_data_block) ||
            start >= ext4_blocks_count(es))
                start = le32_to_cpu(es->s_first_data_block);

        ar->len = 0;
        ext4_get_group_no_and_offset(sb, start, &grp, &from);

        for (left = ext4_get_groups_count(sb); left > 0; left--) {
                bh = ext4_read_block_bitmap(sb, grp);
                if (IS_ERR(bh)) {
                        *errp = PTR_ERR(bh);
                        pr_warn("Failed to read block bitmap\n");
                        return 0;
                }

                /* Skip over free bits that replay has excluded */
                for (;;) {
                        bit = mb_find_next_zero_bit(bh->b_data, nclusters,
                                                    from);
                        if (bit >= nclusters)
                                break;
                        if (!ext4_fc_replay_check_excluded(sb,
                                        ext4_group_first_block_no(sb, grp) +
                                        EXT4_C2B(sbi, bit)))
                                break;
                        from = bit + 1;
                }
                brelse(bh);

                if (bit < nclusters)
                        break;

                /* Nothing usable here: move on to the next group, wrapping */
                grp++;
                if (grp >= ext4_get_groups_count(sb))
                        grp = 0;
                from = 0;
        }

        if (bit >= nclusters) {
                *errp = -ENOSPC;
                return 0;
        }

        found = ext4_group_first_block_no(sb, grp) + EXT4_C2B(sbi, bit);
        ext4_mb_mark_bb(sb, found, 1, 1);
        ar->len = 1;
        *errp = 0;

        return found;
}

/*
 * Main entry point into mballoc to allocate blocks.
 *
 * It tries to use preallocation first, then falls back to the regular
 * allocator.
 *
 * @handle: journal handle, passed on to ext4_mb_mark_diskspace_used()
 * @ar:     allocation request.  ar->len is updated to the length actually
 *          allocated (0 on failure); ar->flags may gain
 *          EXT4_MB_USE_ROOT_BLOCKS (quota files) or EXT4_MB_HINT_NOPREALLOC
 *          (when the quota charge had to be shrunk).
 * @errp:   receives 0 on success or a negative errno on failure
 *
 * Returns the first allocated physical block, or 0 on failure.
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                                struct ext4_allocation_request *ar, int *errp)
{
        struct ext4_allocation_context *ac = NULL;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block = 0;
        unsigned int inquota = 0;
        unsigned int reserv_clstrs = 0;
        int retries = 0;
        u64 seq;

        might_sleep();
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);

        trace_ext4_request_blocks(ar);
        /* While EXT4_FC_REPLAY is set, use the simple linear allocator */
        if (sbi->s_mount_state & EXT4_FC_REPLAY)
                return ext4_mb_new_blocks_simple(ar, errp);

        /* Allow to use superuser reservation for quota file */
        if (ext4_is_quota_file(ar->inode))
                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;

        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
                /* Without delayed allocation we need to verify
                 * there is enough free blocks to do block allocation
                 * and verify allocation doesn't exceed the quota limits.
                 */
                /* Halve the request until the claim succeeds or it hits 0 */
                while (ar->len &&
                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

                        /* let others to free the space */
                        cond_resched();
                        ar->len = ar->len >> 1;
                }
                if (!ar->len) {
                        ext4_mb_show_pa(sb);
                        *errp = -ENOSPC;
                        return 0;
                }
                /* Remembered so the claim can be dropped again at "out" */
                reserv_clstrs = ar->len;
                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
                        dquot_alloc_block_nofail(ar->inode,
                                                 EXT4_C2B(sbi, ar->len));
                } else {
                        /*
                         * Shrink the request by one each time the quota
                         * charge fails, and set the NOPREALLOC hint.
                         */
                        while (ar->len &&
                                dquot_alloc_block(ar->inode,
                                                  EXT4_C2B(sbi, ar->len))) {

                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                                ar->len--;
                        }
                }
                /* Amount charged to quota; the unused part is refunded at "out" */
                inquota = ar->len;
                if (ar->len == 0) {
                        *errp = -EDQUOT;
                        goto out;
                }
        }

        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
        if (!ac) {
                ar->len = 0;
                *errp = -ENOMEM;
                goto out;
        }

        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
                goto out;
        }

        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
        /* Snapshot used by ext4_mb_discard_preallocations_should_retry() */
        seq = this_cpu_read(discard_pa_seq);
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);

                *errp = ext4_mb_pa_alloc(ac);
                if (*errp)
                        goto errout;
                /*
                 * NOTE: this label is inside the if-block, so a retry from
                 * below re-runs only the regular allocator.
                 */
repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
                /*
                 * pa allocated above is added to grp->bb_prealloc_list only
                 * when we were able to allocate some block i.e. when
                 * ac->ac_status == AC_STATUS_FOUND.
                 * And error from above mean ac->ac_status != AC_STATUS_FOUND
                 * So we have to free this pa here itself.
                 */
                if (*errp) {
                        ext4_mb_pa_free(ac);
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                }
                /*
                 * Found extent is not larger than the original request:
                 * the pa allocated above is released.
                 */
                if (ac->ac_status == AC_STATUS_FOUND &&
                        ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
                        ext4_mb_pa_free(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle);
                if (*errp) {
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                } else {
                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                        ar->len = ac->ac_b_ex.fe_len;
                }
        } else {
                /* Nothing found: retry a bounded number of times */
                if (++retries < 3 &&
                    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
                        goto repeat;
                /*
                 * If block allocation fails then the pa allocated above
                 * needs to be freed here itself.
                 */
                ext4_mb_pa_free(ac);
                *errp = -ENOSPC;
        }

        /* Reached on success too; only requires that ac was initialized */
errout:
        if (*errp) {
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }
        ext4_mb_release_context(ac);
out:
        if (ac)
                kmem_cache_free(ext4_ac_cachep, ac);
        /* Refund the quota charged for blocks that were not allocated */
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
        /* release any reserved blocks */
        if (reserv_clstrs)
                percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs);

        trace_ext4_allocate_blocks(ar, (unsigned long long)block);

        return block;
}

/*
 * We can merge two free data extents only if the physical blocks
 * are contiguous, AND the extents were freed by the same transaction,
 * AND the blocks are associated with the same group.
 */
static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
                                        struct ext4_free_data *entry,
                                        struct ext4_free_data *new_entry,
                                        struct rb_root *entry_rb_root)
{
        if ((entry->efd_tid != new_entry->efd_tid) ||
            (entry->efd_group != new_entry->efd_group))
                return;
        if (entry->efd_start_cluster + entry->efd_count ==
            new_entry->efd_start_cluster) {
                new_entry->efd_start_cluster = entry->efd_start_cluster;
                new_entry->efd_count += entry->efd_count;
        } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
                   entry->efd_start_cluster) {
                new_entry->efd_count += entry->efd_count;
        } else
                return;
        spin_lock(&sbi->s_md_lock);
        list_del(&entry->efd_list);
        spin_unlock(&sbi->s_md_lock);
        rb_erase(&entry->efd_node, entry_rb_root);
        kmem_cache_free(ext4_free_data_cachep, entry);
}

/*
 * Record a freed extent (@new_entry) for the group described by @e4b.
 *
 * The entry is inserted into the group's bb_free_root rbtree, keyed by start
 * cluster, merged with its rbtree neighbours where possible, and appended to
 * sbi->s_freed_data_list; s_mb_free_pending grows by the entry's cluster
 * count.  If the start cluster already lies inside an existing entry, an
 * error is reported against the group and @new_entry is freed instead.
 *
 * The caller in ext4_mb_clear_bb() takes the group lock before calling.
 * Always returns 0.
 */
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                      struct ext4_free_data *new_entry)
{
        ext4_group_t group = e4b->bd_group;
        ext4_grpblk_t cluster;
        ext4_grpblk_t clusters = new_entry->efd_count;
        struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct rb_node **n = &db->bb_free_root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node;

        BUG_ON(!ext4_handle_valid(handle));
        BUG_ON(e4b->bd_bitmap_page == NULL);
        BUG_ON(e4b->bd_buddy_page == NULL);

        new_node = &new_entry->efd_node;
        cluster = new_entry->efd_start_cluster;

        if (!*n) {
                /*
                 * First free block extent. We need to
                 * protect the buddy cache from being freed,
                 * otherwise we'll refresh it from the
                 * on-disk bitmap and lose not-yet-available
                 * blocks.
                 */
                get_page(e4b->bd_buddy_page);
                get_page(e4b->bd_bitmap_page);
        }
        /* Descend to the leaf slot where the new node must be linked */
        while (*n) {
                parent = *n;
                entry = rb_entry(parent, struct ext4_free_data, efd_node);
                if (cluster < entry->efd_start_cluster)
                        n = &(*n)->rb_left;
                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
                        n = &(*n)->rb_right;
                else {
                        /* Start cluster falls inside an existing entry */
                        ext4_grp_locked_error(sb, group, 0,
                                ext4_group_first_block_no(sb, group) +
                                EXT4_C2B(sbi, cluster),
                                "Block already on to-be-freed list");
                        kmem_cache_free(ext4_free_data_cachep, new_entry);
                        return 0;
                }
        }

        rb_link_node(new_node, parent, n);
        rb_insert_color(new_node, &db->bb_free_root);

        /* Now try to see the extent can be merged to left and right */
        node = rb_prev(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
                ext4_try_merge_freed_extent(sbi, entry, new_entry,
                                            &(db->bb_free_root));
        }

        node = rb_next(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
                ext4_try_merge_freed_extent(sbi, entry, new_entry,
                                            &(db->bb_free_root));
        }

        /*
         * "clusters" was sampled before any merge, so only the newly freed
         * amount is added to the pending count.
         */
        spin_lock(&sbi->s_md_lock);
        list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
        sbi->s_mb_free_pending += clusters;
        spin_unlock(&sbi->s_md_lock);
        return 0;
}

/*
 * Simple block free used while EXT4_FC_REPLAY is set (see ext4_free_blocks()).
 *
 * @inode: inode whose superblock identifies the filesystem
 * @block: first physical block of the range; selects the group and start bit
 * @count: number of bitmap bits to clear (the caller passes a cluster count)
 *
 * Clears the bits in the group's block bitmap, adds the number of bits that
 * were actually set to the group descriptor's free cluster count, refreshes
 * both checksums and writes the bitmap and descriptor buffers synchronously.
 *
 * Failures are not reported to the caller; the function just stops early.
 * The bitmap buffer reference is dropped on every path after it was obtained.
 */
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
                                        unsigned long count)
{
        struct buffer_head *bitmap_bh;
        struct super_block *sb = inode->i_sb;
        struct ext4_group_desc *gdp;
        struct buffer_head *gdp_bh;
        ext4_group_t group;
        ext4_grpblk_t blkoff;
        int already_freed = 0, err, i;

        ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (IS_ERR(bitmap_bh)) {
                pr_warn("Failed to read block bitmap\n");
                return;
        }
        gdp = ext4_get_group_desc(sb, group, &gdp_bh);
        if (!gdp)
                goto out_brelse;

        /* Bits that are already clear must not be counted as newly freed */
        for (i = 0; i < count; i++) {
                if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
                        already_freed++;
        }
        mb_clear_bits(bitmap_bh->b_data, blkoff, count);
        err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
        if (err)
                goto out_brelse;
        ext4_free_group_clusters_set(
                sb, gdp, ext4_free_group_clusters(sb, gdp) +
                count - already_freed);
        ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
        ext4_group_desc_csum_set(sb, group, gdp);
        ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
        sync_dirty_buffer(bitmap_bh);
        sync_dirty_buffer(gdp_bh);

out_brelse:
        /* Single exit so the bitmap buffer is never leaked on error */
        brelse(bitmap_bh);
}

/**
 * ext4_mb_clear_bb() -- helper function for freeing blocks.
 *                        Used by ext4_free_blocks()
 * @handle:                handle for this transaction
 * @inode:                inode
 * @block:                starting physical block to be freed
 * @count:                number of blocks to be freed
 * @flags:                flags used by ext4_free_blocks
 *
 * The range is processed one block group at a time: when it crosses a group
 * boundary the part beyond the boundary is kept in "overflow" and handled by
 * jumping back to do_more.  Errors are reported through ext4_std_error();
 * nothing is returned to the caller.
 */
static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
                               ext4_fsblk_t block, unsigned long count,
                               int flags)
{
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
        struct ext4_group_desc *gdp;
        struct ext4_group_info *grp;
        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
        struct ext4_sb_info *sbi;
        struct ext4_buddy e4b;
        unsigned int count_clusters;
        int err = 0;
        int ret;

        sbi = EXT4_SB(sb);

        /* Validate the range unless the caller already did so */
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_return;
        }
        flags |= EXT4_FREE_BLOCKS_VALIDATED;

do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);

        /*
         * Missing group info or a corrupt block bitmap: give up silently.
         * No bitmap buffer reference is held here (none was taken yet on
         * the first pass, and it was dropped before looping back).
         */
        grp = ext4_get_group_info(sb, block_group);
        if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return;

        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
                overflow = EXT4_C2B(sbi, bit) + count -
                        EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }
        count_clusters = EXT4_NUM_B2C(sbi, count);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (IS_ERR(bitmap_bh)) {
                err = PTR_ERR(bitmap_bh);
                /* error_return calls brelse(); it must not see an ERR_PTR */
                bitmap_bh = NULL;
                goto error_return;
        }
        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
        if (!gdp) {
                err = -EIO;
                goto error_return;
        }

        /* Re-validate when the range was trimmed to this group */
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_return;
        }

        BUFFER_TRACE(bitmap_bh, "getting write access");
        err = ext4_journal_get_write_access(handle, bitmap_bh);
        if (err)
                goto error_return;

        /*
         * We are about to modify some metadata.  Call the journal APIs
         * to unshare ->b_data if a currently-committing transaction is
         * using it
         */
        BUFFER_TRACE(gd_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
#ifdef AGGRESSIVE_CHECK
        {
                /* Every cluster being freed must currently be marked in use */
                int i;
                for (i = 0; i < count_clusters; i++)
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
#endif
        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);

        /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
        err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
                                     GFP_NOFS|__GFP_NOFAIL);
        if (err)
                goto error_return;

        /*
         * We need to make sure we don't reuse the freed block until after the
         * transaction is committed. We make an exception if the inode is to be
         * written in writeback mode since writeback mode has weak data
         * consistency guarantees.
         */
        if (ext4_handle_valid(handle) &&
            ((flags & EXT4_FREE_BLOCKS_METADATA) ||
             !ext4_should_writeback_data(inode))) {
                struct ext4_free_data *new_entry;
                /*
                 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
                 * to fail.
                 */
                new_entry = kmem_cache_alloc(ext4_free_data_cachep,
                                GFP_NOFS|__GFP_NOFAIL);
                new_entry->efd_start_cluster = bit;
                new_entry->efd_group = block_group;
                new_entry->efd_count = count_clusters;
                new_entry->efd_tid = handle->h_transaction->t_tid;

                /* Group lock is released further down, after the gdp update */
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
        } else {
                /* need to update group_info->bb_free and bitmap
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
                if (test_opt(sb, DISCARD)) {
                        /* A failed discard is only warned about */
                        err = ext4_issue_discard(sb, block_group, bit,
                                                 count_clusters, NULL);
                        if (err && err != -EOPNOTSUPP)
                                ext4_msg(sb, KERN_WARNING, "discard request in"
                                         " group:%u block:%d count:%lu failed"
                                         " with %d", block_group, bit, count,
                                         err);
                }

                EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);

                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                mb_free_blocks(inode, &e4b, bit, count_clusters);
        }

        /* Still under the group lock: update the descriptor and checksums */
        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
        ext4_free_group_clusters_set(sb, gdp, ret);
        ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
        ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);

        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                atomic64_add(count_clusters,
                             &sbi_array_rcu_deref(sbi, s_flex_groups,
                                                  flex_group)->free_clusters);
        }

        /*
         * on a bigalloc file system, defer the s_freeclusters_counter
         * update to the caller (ext4_remove_space and friends) so they
         * can determine if a cluster freed here should be rereserved
         */
        if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
                if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
                        dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
                percpu_counter_add(&sbi->s_freeclusters_counter,
                                   count_clusters);
        }

        ext4_mb_unload_buddy(&e4b);

        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);

        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
        /* Keep the first error of the two dirty calls */
        if (!err)
                err = ret;

        /* Continue with the part of the range that lies in the next group */
        if (overflow && !err) {
                block += count;
                count = overflow;
                put_bh(bitmap_bh);
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
                goto do_more;
        }
error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        return;
}

/**
 * ext4_free_blocks() -- Free given blocks and update quota
 * @handle:                handle for this transaction
 * @inode:                inode
 * @bh:                        optional buffer of the block to be freed
 * @block:                starting physical block to be freed
 * @count:                number of blocks to be freed
 * @flags:                flags used by ext4_free_blocks
 *
 * When @bh is given and @block is 0, the block number is taken from @bh;
 * when both are given they must agree.  While EXT4_FC_REPLAY is set the work
 * is done by ext4_free_blocks_simple().  Otherwise the range is validated,
 * rounded to cluster boundaries according to @flags, optionally forgotten
 * block by block, and handed to ext4_mb_clear_bb().
 */
void ext4_free_blocks(handle_t *handle, struct inode *inode,
                      struct buffer_head *bh, ext4_fsblk_t block,
                      unsigned long count, int flags)
{
        struct super_block *sb = inode->i_sb;
        unsigned int overflow;
        struct ext4_sb_info *sbi;

        sbi = EXT4_SB(sb);

        if (bh) {
                if (block)
                        BUG_ON(block != bh->b_blocknr);
                else
                        block = bh->b_blocknr;
        }

        /* Replay path: the simple helper is given a cluster count */
        if (sbi->s_mount_state & EXT4_FC_REPLAY) {
                ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
                return;
        }

        might_sleep();

        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks not in datazone - "
                           "block = %llu, count = %lu", block, count);
                return;
        }
        /* Cleared again below whenever the range is adjusted */
        flags |= EXT4_FREE_BLOCKS_VALIDATED;

        ext4_debug("freeing block %llu\n", block);
        trace_ext4_free_blocks(inode, block, count, flags);

        /* With a buffer supplied, only a single block may be forgotten */
        if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
                BUG_ON(count > 1);

                ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                            inode, bh, block);
        }

        /*
         * If the extent to be freed does not begin on a cluster
         * boundary, we need to deal with partial clusters at the
         * beginning and end of the extent.  Normally we will free
         * blocks at the beginning or the end unless we are explicitly
         * requested to avoid doing so.
         */
        overflow = EXT4_PBLK_COFF(sbi, block);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
                        /* Skip ahead to the start of the next cluster */
                        overflow = sbi->s_cluster_ratio - overflow;
                        block += overflow;
                        if (count > overflow)
                                count -= overflow;
                        else
                                return;
                } else {
                        /* Extend backwards to the start of this cluster */
                        block -= overflow;
                        count += overflow;
                }
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }
        overflow = EXT4_LBLK_COFF(sbi, count);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
                        /* Drop the partial cluster at the tail */
                        if (count > overflow)
                                count -= overflow;
                        else
                                return;
                } else
                        /* Extend forwards to the end of the last cluster */
                        count += sbi->s_cluster_ratio - overflow;
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }

        /* No buffer supplied: forget every block of the adjusted range */
        if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
                int i;
                int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;

                for (i = 0; i < count; i++) {
                        cond_resched();
                        /* bh is only looked up for metadata, else stays NULL */
                        if (is_metadata)
                                bh = sb_find_get_block(inode->i_sb, block + i);
                        ext4_forget(handle, is_metadata, inode, bh, block + i);
                }
        }

        ext4_mb_clear_bb(handle, inode, block, count, flags);
        return;
}

/**
 * ext4_group_add_blocks() -- Add given blocks to an existing group
 * @handle:                        handle to this transaction
 * @sb:                                super block
 * @block:                        start physical block to add to the block group
 * @count:                        number of blocks to free
 *
 * This marks the blocks as free in the bitmap and buddy.
 *
 * The whole range must lie within a single block group, otherwise -EINVAL is
 * returned.  Returns 0 on success (including @count == 0) or a negative errno.
 */
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                         ext4_fsblk_t block, unsigned long count)
{
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
        ext4_grpblk_t bit;
        unsigned int i;
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_buddy e4b;
        int err = 0, ret, free_clusters_count;
        ext4_grpblk_t clusters_freed;
        ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
        ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
        unsigned long cluster_count = last_cluster - first_cluster + 1;

        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

        /* The cluster values computed above are not used in this case */
        if (count == 0)
                return 0;

        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
                ext4_warning(sb, "too many blocks added to group %u",
                             block_group);
                err = -EINVAL;
                goto error_return;
        }

        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (IS_ERR(bitmap_bh)) {
                err = PTR_ERR(bitmap_bh);
                /* error_return calls brelse(); it must not see an ERR_PTR */
                bitmap_bh = NULL;
                goto error_return;
        }

        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
        if (!desc) {
                err = -EIO;
                goto error_return;
        }

        if (!ext4_sb_block_valid(sb, NULL, block, count)) {
                ext4_error(sb, "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                err = -EINVAL;
                goto error_return;
        }

        BUFFER_TRACE(bitmap_bh, "getting write access");
        err = ext4_journal_get_write_access(handle, bitmap_bh);
        if (err)
                goto error_return;

        /*
         * We are about to modify some metadata.  Call the journal APIs
         * to unshare ->b_data if a currently-committing transaction is
         * using it
         */
        BUFFER_TRACE(gd_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;

        /*
         * Count only the bits that are currently set; bits found already
         * clear are reported and left out of the free-cluster accounting.
         */
        for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
                BUFFER_TRACE(bitmap_bh, "clear bit");
                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
                        ext4_error(sb, "bit already cleared for block %llu",
                                   (ext4_fsblk_t)(block + i));
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
                        clusters_freed++;
                }
        }

        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                goto error_return;

        /*
         * need to update group_info->bb_free and bitmap
         * with group lock held. generate_buddy look at
         * them with group lock_held
         */
        ext4_lock_group(sb, block_group);
        mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
        mb_free_blocks(NULL, &e4b, bit, cluster_count);
        free_clusters_count = clusters_freed +
                ext4_free_group_clusters(sb, desc);
        ext4_free_group_clusters_set(sb, desc, free_clusters_count);
        ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
        ext4_group_desc_csum_set(sb, block_group, desc);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter,
                           clusters_freed);

        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                atomic64_add(clusters_freed,
                             &sbi_array_rcu_deref(sbi, s_flex_groups,
                                                  flex_group)->free_clusters);
        }

        ext4_mb_unload_buddy(&e4b);

        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);

        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
        /* Keep the first error of the two dirty calls */
        if (!err)
                err = ret;

error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        return err;
}

/**
 * ext4_trim_extent -- function to TRIM one single free extent in the group
 * @sb:                super block for the file system
 * @start:        first cluster of the free extent within the group
 * @count:        length of the extent to TRIM
 * @e4b:        ext4 buddy for the group
 *
 * Discard "count" units starting at "start" in the group of @e4b.  To ensure
 * that nobody allocates the range meanwhile, it is first marked as used in
 * the buddy bitmap and released again afterwards.  This must be called with
 * the group lock held; the lock is dropped around the discard itself and
 * re-acquired before returning.
 *
 * Returns the result of ext4_issue_discard().
 */
static int ext4_trim_extent(struct super_block *sb,
                int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
        ext4_group_t grp = e4b->bd_group;
        struct ext4_free_extent range;
        int err;

        trace_ext4_trim_extent(sb, grp, start, count);

        assert_spin_locked(ext4_group_lock_ptr(sb, grp));

        range.fe_start = start;
        range.fe_group = grp;
        range.fe_len = count;

        /* Reserve the range so it cannot be handed out while unlocked */
        mb_mark_used(e4b, &range);

        ext4_unlock_group(sb, grp);
        err = ext4_issue_discard(sb, grp, start, count, NULL);
        ext4_lock_group(sb, grp);

        /* Give the range back regardless of the discard result */
        mb_free_blocks(NULL, e4b, start, range.fe_len);

        return err;
}

/*
 * Return the index of the last cluster in group @grp.
 *
 * Groups before the final one hold EXT4_CLUSTERS_PER_GROUP() clusters; for
 * the final group the size is derived from the filesystem's block count.
 */
static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
                                           ext4_group_t grp)
{
        unsigned long nclusters = EXT4_CLUSTERS_PER_GROUP(sb);

        if (grp >= (ext4_get_groups_count(sb) - 1)) {
                ext4_fsblk_t nblocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
                                       ext4_group_first_block_no(sb, grp);

                nclusters = nblocks >> EXT4_CLUSTER_BITS(sb);
        }

        return nclusters - 1;
}

static bool ext4_trim_interrupted(void)
{
        return fatal_signal_pending(current) || freezing(current);
}

/*
 * Walk the group bitmap between @start and @max and pass every free extent
 * of at least @minblocks bits to ext4_trim_extent().
 *
 * The caller must hold the group lock: ext4_trim_extent() asserts it, and
 * the rescheduling path below drops and retakes it.  The lock is held on
 * return.
 *
 * Returns the accumulated length of the extents that were handed to
 * ext4_trim_extent() (extents for which it returned -EOPNOTSUPP are counted
 * as well).  Returns 0 without scanning when the group is flagged by
 * EXT4_MB_GRP_BBITMAP_CORRUPT().
 */
static int ext4_try_to_trim_range(struct super_block *sb,
                struct ext4_buddy *e4b, ext4_grpblk_t start,
                ext4_grpblk_t max, ext4_grpblk_t minblocks)
{
        ext4_grpblk_t next, count, free_count, last, origin_start;
        bool set_trimmed = false;
        void *bitmap;

        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                return 0;

        last = ext4_last_grp_cluster(sb, e4b->bd_group);
        bitmap = e4b->bd_bitmap;
        /* The request covers the whole group: it may be flagged as trimmed. */
        if (start == 0 && max >= last)
                set_trimmed = true;
        /* Remember the requested start before skipping to the first free bit. */
        origin_start = start;
        start = max(e4b->bd_info->bb_first_free, start);
        count = 0;
        free_count = 0;

        while (start <= max) {
                /* Find the beginning of the next free extent inside the range. */
                start = mb_find_next_zero_bit(bitmap, max + 1, start);
                if (start > max)
                        break;

                /*
                 * The end of the extent is searched up to the end of the
                 * group, not just up to @max, so an extent may run past @max.
                 */
                next = mb_find_next_bit(bitmap, last + 1, start);
                if (origin_start == 0 && next >= last)
                        set_trimmed = true;

                if ((next - start) >= minblocks) {
                        int ret = ext4_trim_extent(sb, start, next - start, e4b);

                        /*
                         * Any error other than -EOPNOTSUPP ends the scan
                         * without flagging the group as trimmed.
                         */
                        if (ret && ret != -EOPNOTSUPP)
                                return count;
                        count += next - start;
                }
                free_count += next - start;
                /* Bit 'next' is in use (or past the group end): skip it. */
                start = next + 1;

                /* An interrupted scan also leaves the trimmed flag untouched. */
                if (ext4_trim_interrupted())
                        return count;

                if (need_resched()) {
                        ext4_unlock_group(sb, e4b->bd_group);
                        cond_resched();
                        ext4_lock_group(sb, e4b->bd_group);
                }

                /* Too little free space remains to hold another extent. */
                if ((e4b->bd_info->bb_free - free_count) < minblocks)
                        break;
        }

        if (set_trimmed)
                EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);

        return count;
}

/**
 * ext4_trim_all_free -- function to trim all free space in alloc. group
 * @sb:                        super block for file system
 * @group:                group to be trimmed
 * @start:                first group block to examine
 * @max:                last group block to examine
 * @minblocks:                minimum extent block count
 *
 * Loads the buddy for @group, takes the group lock and lets
 * ext4_try_to_trim_range() trim the free extents found between @start and
 * @max.  The scan is skipped when the group is already flagged as trimmed
 * and @minblocks is not smaller than the minimum recorded in
 * s_last_trim_minblks.
 *
 * Returns the value reported by ext4_try_to_trim_range(), 0 when the scan
 * was skipped, or the error returned by ext4_mb_load_buddy().
 */
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
                   ext4_grpblk_t start, ext4_grpblk_t max,
                   ext4_grpblk_t minblocks)
{
        struct ext4_buddy buddy;
        int trimmed = 0;
        int err;

        trace_ext4_trim_all_free(sb, group, start, max);

        err = ext4_mb_load_buddy(sb, group, &buddy);
        if (err) {
                ext4_warning(sb, "Error %d loading buddy information for %u",
                             err, group);
                return err;
        }

        ext4_lock_group(sb, group);

        if (!EXT4_MB_GRP_WAS_TRIMMED(buddy.bd_info) ||
            minblocks < EXT4_SB(sb)->s_last_trim_minblks)
                trimmed = ext4_try_to_trim_range(sb, &buddy, start, max,
                                                 minblocks);

        ext4_unlock_group(sb, group);
        ext4_mb_unload_buddy(&buddy);

        ext4_debug("trimmed %d blocks in the group %d\n",
                trimmed, group);

        return trimmed;
}

/**
 * ext4_trim_fs() -- trim ioctl handle function
 * @sb:                        superblock for filesystem
 * @range:                fstrim_range structure
 *
 * The fields of @range are used as follows:
 * start:        First Byte to trim
 * len:                number of Bytes to trim from start
 * minlen:        minimum extent length in Bytes
 *
 * ext4_trim_fs goes through all allocation groups containing Bytes from
 * start to start+len. For each such a group ext4_trim_all_free function
 * is invoked to trim all free space.
 *
 * On every path except the early -EINVAL return, range->len is overwritten
 * with the number of Bytes that were trimmed.
 *
 * Returns 0 on success, -EINVAL for an invalid @range, or the error reported
 * by ext4_mb_init_group() or ext4_trim_all_free().
 */
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
        struct request_queue *q = bdev_get_queue(sb->s_bdev);
        struct ext4_group_info *grp;
        ext4_group_t group, first_group, last_group;
        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
        uint64_t start, end, minlen, trimmed = 0;
        ext4_fsblk_t first_data_blk =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
        int ret = 0;

        /* Convert the byte based request into block / cluster units. */
        start = range->start >> sb->s_blocksize_bits;
        end = start + (range->len >> sb->s_blocksize_bits) - 1;
        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
                              range->minlen >> sb->s_blocksize_bits);

        /* Note: range->len is left untouched on this error path. */
        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
            start >= max_blks ||
            range->len < sb->s_blocksize)
                return -EINVAL;
        /* No point to try to trim less than discard granularity */
        if (range->minlen < q->limits.discard_granularity) {
                minlen = EXT4_NUM_B2C(EXT4_SB(sb),
                        q->limits.discard_granularity >> sb->s_blocksize_bits);
                if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
                        goto out;
        }
        /* Clamp the request to [first_data_blk, max_blks - 1]. */
        if (end >= max_blks - 1)
                end = max_blks - 1;
        if (end <= first_data_blk)
                goto out;
        if (start < first_data_blk)
                start = first_data_blk;

        /* Determine first and last group to examine based on start and end */
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
                                     &first_group, &first_cluster);
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
                                     &last_group, &last_cluster);

        /* end now represents the last cluster to discard in this group */
        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

        for (group = first_group; group <= last_group; group++) {
                if (ext4_trim_interrupted())
                        break;
                grp = ext4_get_group_info(sb, group);
                /*
                 * NOTE(review): this 'continue' also skips the
                 * 'first_cluster = 0' reset at the bottom of the loop, so if
                 * the first group has no group info the next group is
                 * scanned from the first group's offset -- confirm whether
                 * that is intended.
                 */
                if (!grp)
                        continue;
                /* We only do this if the grp has never been initialized */
                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                        ret = ext4_mb_init_group(sb, group, GFP_NOFS);
                        if (ret)
                                break;
                }

                /*
                 * For all the groups except the last one, last cluster will
                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
                 * change it for the last group, note that last_cluster is
                 * already computed earlier by ext4_get_group_no_and_offset()
                 */
                if (group == last_group)
                        end = last_cluster;
                /* Groups with less free space than minlen are not scanned. */
                if (grp->bb_free >= minlen) {
                        cnt = ext4_trim_all_free(sb, group, first_cluster,
                                                 end, minlen);
                        /* A negative count is an error code. */
                        if (cnt < 0) {
                                ret = cnt;
                                break;
                        }
                        trimmed += cnt;
                }

                /*
                 * For every group except the first one, we are sure
                 * that the first cluster to discard will be cluster #0.
                 */
                first_cluster = 0;
        }

        /* Record the minimum extent length used by this successful pass. */
        if (!ret)
                EXT4_SB(sb)->s_last_trim_minblks = minlen;

out:
        /* Report the amount trimmed back to the caller, in Bytes. */
        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
        return ret;
}

/*
 * Iterate all the free extents in the group.
 *
 * Loads the buddy for @group and walks its bitmap from @first to @end
 * (@end is clamped to the last cluster of a group).  @formatter is called
 * once per free extent with (sb, group, start, length, priv).  When
 * @meta_formatter is non-NULL and the walk begins past @first because of
 * bb_first_free, it is called once for the skipped leading range.
 *
 * The group lock is held while the bitmap is searched and dropped around
 * every callback.  A non-zero return from a callback stops the walk and is
 * returned to the caller; otherwise the result of ext4_mb_load_buddy()
 * (0 on success) is returned.
 */
int
ext4_mballoc_query_range(
        struct super_block                *sb,
        ext4_group_t                        group,
        ext4_grpblk_t                        first,
        ext4_grpblk_t                        end,
        ext4_mballoc_query_range_fn        meta_formatter,
        ext4_mballoc_query_range_fn        formatter,
        void                                *priv)
{
        void                                *bitmap;
        ext4_grpblk_t                        start, next;
        struct ext4_buddy                e4b;
        int                                error;

        error = ext4_mb_load_buddy(sb, group, &e4b);
        if (error)
                return error;
        bitmap = e4b.bd_bitmap;

        ext4_lock_group(sb, group);

        /* Nothing before bb_first_free needs to be searched. */
        start = max(e4b.bd_info->bb_first_free, first);
        if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
                end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
        if (meta_formatter && start != first) {
                if (start > end)
                        start = end;
                /* Callbacks run without the group lock. */
                ext4_unlock_group(sb, group);
                error = meta_formatter(sb, group, first, start - first,
                                       priv);
                /* The lock is already dropped here, so skip the unlock. */
                if (error)
                        goto out_unload;
                ext4_lock_group(sb, group);
        }
        while (start <= end) {
                /* [start, next) is the next run of clear bits. */
                start = mb_find_next_zero_bit(bitmap, end + 1, start);
                if (start > end)
                        break;
                next = mb_find_next_bit(bitmap, end + 1, start);

                ext4_unlock_group(sb, group);
                error = formatter(sb, group, start, next - start, priv);
                /* The lock is already dropped here, so skip the unlock. */
                if (error)
                        goto out_unload;
                ext4_lock_group(sb, group);

                /* Bit 'next' is set (or past @end): resume after it. */
                start = next + 1;
        }

        ext4_unlock_group(sb, group);
out_unload:
        ext4_mb_unload_buddy(&e4b);

        return error;
}






































































    1 


    1 





    1 





    1 






















































































    1 



























































































































































































































    1 


    1 





    1 


    1 
    1 















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SELinux NetLabel Support
 *
 * This file provides the necessary glue to tie NetLabel into the SELinux
 * subsystem.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007, 2008
 */

#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/sock.h>
#include <net/netlabel.h>
#include <net/ip.h>
#include <net/ipv6.h>

#include "objsec.h"
#include "security.h"
#include "netlabel.h"

/**
 * selinux_netlbl_sidlookup_cached - Cache a SID lookup
 * @skb: the packet
 * @family: protocol family
 * @secattr: the NetLabel security attributes
 * @sid: the SID
 *
 * Description:
 * Ask the SELinux security server for the SID matching @secattr.  When the
 * lookup succeeds and @secattr carries both the NETLBL_SECATTR_CACHEABLE and
 * NETLBL_SECATTR_CACHE flags, the attributes are added to the NetLabel cache
 * to speed up future lookups.  Returns zero on success, negative values on
 * failure.
 *
 */
static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
                                           u16 family,
                                           struct netlbl_lsm_secattr *secattr,
                                           u32 *sid)
{
        const u32 cache_flags = NETLBL_SECATTR_CACHEABLE |
                                NETLBL_SECATTR_CACHE;
        int rc;

        rc = security_netlbl_secattr_to_sid(&selinux_state, secattr, sid);
        if (rc != 0)
                return rc;

        /* only cache when both flags are present */
        if ((secattr->flags & cache_flags) == cache_flags)
                netlbl_cache_add(skb, family, secattr);

        return 0;
}

/**
 * selinux_netlbl_sock_genattr - Generate the NetLabel socket secattr
 * @sk: the socket
 *
 * Description:
 * Return the NetLabel security attributes for @sk.  The attributes cached in
 * the socket's security blob are reused when present; otherwise a new set is
 * allocated with GFP_ATOMIC, filled in from the socket's SID and stored in
 * the blob.  Returns a pointer to the security attributes on success, NULL
 * on failure.
 *
 */
static struct netlbl_lsm_secattr *selinux_netlbl_sock_genattr(struct sock *sk)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr *attr = sksec->nlbl_secattr;

        if (attr != NULL)
                return attr;

        attr = netlbl_secattr_alloc(GFP_ATOMIC);
        if (attr == NULL)
                return NULL;

        if (security_netlbl_sid_to_secattr(&selinux_state, sksec->sid,
                                           attr) != 0) {
                netlbl_secattr_free(attr);
                return NULL;
        }

        /* remember the result for the next caller */
        sksec->nlbl_secattr = attr;

        return attr;
}

/**
 * selinux_netlbl_sock_getattr - Get the cached NetLabel secattr
 * @sk: the socket
 * @sid: the SID
 *
 * Description:
 * Look at the secattr cached in the socket's security blob and return it
 * only if it has the NETLBL_SECATTR_SECID flag set and its secid equals
 * @sid; in every other case return NULL.
 *
 */
static struct netlbl_lsm_secattr *selinux_netlbl_sock_getattr(
                                                        const struct sock *sk,
                                                        u32 sid)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr *cached = sksec->nlbl_secattr;

        if (cached == NULL)
                return NULL;
        if (!(cached->flags & NETLBL_SECATTR_SECID))
                return NULL;
        if (cached->attr.secid != sid)
                return NULL;

        return cached;
}

/**
 * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache
 *
 * Description:
 * Invalidate the NetLabel security attribute mapping cache.  This is a thin
 * wrapper around netlbl_cache_invalidate().
 *
 */
void selinux_netlbl_cache_invalidate(void)
{
        netlbl_cache_invalidate();
}

/**
 * selinux_netlbl_err - Handle a NetLabel packet error
 * @skb: the packet
 * @family: protocol family
 * @error: the error code
 * @gateway: true if host is acting as a gateway, false otherwise
 *
 * Description:
 * When a packet is dropped due to a call to avc_has_perm() pass the error
 * code to the NetLabel subsystem so any protocol specific processing can be
 * done.  This is safe to call even if you are unsure if NetLabel labeling is
 * present on the packet, NetLabel is smart enough to only act when it should.
 * All arguments are forwarded unchanged to netlbl_skbuff_err().
 *
 */
void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
        netlbl_skbuff_err(skb, family, error, gateway);
}

/**
 * selinux_netlbl_sk_security_free - Free the NetLabel fields
 * @sksec: the sk_security_struct
 *
 * Description:
 * Release the cached NetLabel security attributes held by @sksec, if there
 * are any.
 *
 */
void selinux_netlbl_sk_security_free(struct sk_security_struct *sksec)
{
        /* nothing was ever cached for this socket */
        if (sksec->nlbl_secattr == NULL)
                return;

        netlbl_secattr_free(sksec->nlbl_secattr);
}

/**
 * selinux_netlbl_sk_security_reset - Reset the NetLabel fields
 * @sksec: the sk_security_struct
 *
 * Description:
 * Called when the NetLabel state of a sk_security_struct needs to be reset;
 * the state is set back to NLBL_UNSET.
 * The caller is responsible for all the NetLabel sk_security_struct locking.
 *
 */
void selinux_netlbl_sk_security_reset(struct sk_security_struct *sksec)
{
        sksec->nlbl_state = NLBL_UNSET;
}

/**
 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
 * @skb: the packet
 * @family: protocol family
 * @type: NetLabel labeling protocol type
 * @sid: the SID
 *
 * Description:
 * Call the NetLabel mechanism to get the security attributes of the given
 * packet and use those attributes to determine the correct context/SID to
 * assign to the packet.  Both @type and @sid are written on every path: when
 * NetLabel is disabled they are set to NETLBL_NLTYPE_NONE and SECSID_NULL,
 * and when the packet carries no usable attributes @sid is set to
 * SECSID_NULL.  Returns zero on success, negative values on failure.
 *
 */
int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
                                 u16 family,
                                 u32 *type,
                                 u32 *sid)
{
        int rc;
        struct netlbl_lsm_secattr secattr;

        if (!netlbl_enabled()) {
                /* never hand an uninitialized labeling type to the caller */
                *type = NETLBL_NLTYPE_NONE;
                *sid = SECSID_NULL;
                return 0;
        }

        netlbl_secattr_init(&secattr);
        rc = netlbl_skbuff_getattr(skb, family, &secattr);
        if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
                rc = selinux_netlbl_sidlookup_cached(skb, family,
                                                     &secattr, sid);
        else
                *sid = SECSID_NULL;
        /* the type has to be read before the attributes are destroyed */
        *type = secattr.type;
        netlbl_secattr_destroy(&secattr);

        return rc;
}

/**
 * selinux_netlbl_skbuff_setsid - Set the NetLabel on a packet given a sid
 * @skb: the packet
 * @family: protocol family
 * @sid: the SID
 *
 * Description:
 * Call the NetLabel mechanism to set the label of a packet using @sid.  The
 * secattr cached on the owning socket is used when it matches @sid,
 * otherwise a temporary one is built from @sid.  Returns zero on success,
 * negative values on failure.
 *
 */
int selinux_netlbl_skbuff_setsid(struct sk_buff *skb,
                                 u16 family,
                                 u32 sid)
{
        int rc;
        struct netlbl_lsm_secattr tmp_attr;
        struct netlbl_lsm_secattr *cached = NULL;
        struct sock *sk = skb_to_full_sk(skb);

        /* a locally generated packet may already be labeled through its
         * parent socket; if the socket is not in the NLBL_REQSKB state there
         * is nothing to do here */
        if (sk != NULL) {
                struct sk_security_struct *sksec = sk->sk_security;

                if (sksec->nlbl_state != NLBL_REQSKB)
                        return 0;
                cached = selinux_netlbl_sock_getattr(sk, sid);
        }

        /* fast path: the socket's cached attributes match @sid */
        if (cached != NULL)
                return netlbl_skbuff_setattr(skb, family, cached);

        /* slow path: build a throw-away secattr from @sid */
        netlbl_secattr_init(&tmp_attr);
        rc = security_netlbl_sid_to_secattr(&selinux_state, sid, &tmp_attr);
        if (rc == 0)
                rc = netlbl_skbuff_setattr(skb, family, &tmp_attr);
        netlbl_secattr_destroy(&tmp_attr);

        return rc;
}

/**
 * selinux_netlbl_sctp_assoc_request - Label an incoming sctp association.
 * @ep: incoming association endpoint.
 * @skb: the packet.
 *
 * Description:
 * A new incoming connection is represented by @ep.  Security attributes are
 * generated from @ep->secid and applied to the endpoint's socket with
 * netlbl_conn_setattr(), using the source address found in the IP header of
 * @skb as the peer address.  Nothing is done for sockets whose family is
 * neither PF_INET nor PF_INET6.  On success the socket's NetLabel state is
 * set to NLBL_LABELED.
 * Returns zero on success, negative values on failure.
 *
 */
int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep,
                                     struct sk_buff *skb)
{
        int rc;
        struct netlbl_lsm_secattr secattr;
        struct sk_security_struct *sksec = ep->base.sk->sk_security;
        struct sockaddr_in addr4;
        struct sockaddr_in6 addr6;

        if (ep->base.sk->sk_family != PF_INET &&
                                ep->base.sk->sk_family != PF_INET6)
                return 0;

        netlbl_secattr_init(&secattr);
        rc = security_netlbl_sid_to_secattr(&selinux_state,
                                            ep->secid, &secattr);
        if (rc != 0)
                goto assoc_request_return;

        /* Move skb hdr address info to a struct sockaddr and then call
         * netlbl_conn_setattr().
         */
        if (ip_hdr(skb)->version == 4) {
                addr4.sin_family = AF_INET;
                addr4.sin_addr.s_addr = ip_hdr(skb)->saddr;
                rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr4, &secattr);
        } else if (IS_ENABLED(CONFIG_IPV6) && ip_hdr(skb)->version == 6) {
                addr6.sin6_family = AF_INET6;
                addr6.sin6_addr = ipv6_hdr(skb)->saddr;
                rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr6, &secattr);
        } else {
                /* neither IPv4 nor (enabled) IPv6 */
                rc = -EAFNOSUPPORT;
        }

        if (rc == 0)
                sksec->nlbl_state = NLBL_LABELED;

assoc_request_return:
        /* secattr was initialized above, so it is destroyed on every path */
        netlbl_secattr_destroy(&secattr);
        return rc;
}

/**
 * selinux_netlbl_inet_conn_request - Label an incoming stream connection
 * @req: incoming connection request socket
 * @family: the request socket's address family
 *
 * Description:
 * A new incoming connection request is represented by @req, we need to label
 * the new request_sock here and the stack will ensure the on-the-wire label
 * will get preserved when a full sock is created once the connection handshake
 * is complete.  Families other than PF_INET and PF_INET6 are left alone.
 * Returns zero on success, negative values on failure.
 *
 */
int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family)
{
        struct netlbl_lsm_secattr secattr;
        int rc;

        if (family != PF_INET && family != PF_INET6)
                return 0;

        netlbl_secattr_init(&secattr);
        rc = security_netlbl_sid_to_secattr(&selinux_state, req->secid,
                                            &secattr);
        /* only label the request when the secattr could be generated */
        if (rc == 0)
                rc = netlbl_req_setattr(req, &secattr);
        netlbl_secattr_destroy(&secattr);

        return rc;
}

/**
 * selinux_netlbl_inet_csk_clone - Initialize the newly created sock
 * @sk: the new sock
 * @family: the sock's address family
 *
 * Description:
 * A new connection has been established using @sk, we've already labeled the
 * socket via the request_sock struct in selinux_netlbl_inet_conn_request() but
 * we need to set the NetLabel state here since we now have a sock structure.
 * The state becomes NLBL_LABELED for PF_INET and NLBL_UNSET otherwise.
 *
 */
void selinux_netlbl_inet_csk_clone(struct sock *sk, u16 family)
{
        struct sk_security_struct *sksec = sk->sk_security;

        sksec->nlbl_state = (family == PF_INET) ? NLBL_LABELED : NLBL_UNSET;
}

/**
 * selinux_netlbl_sctp_sk_clone - Copy state to the newly created sock
 * @sk: current sock
 * @newsk: the new sock
 *
 * Description:
 * Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
 * The NetLabel state of @sk is copied to @newsk.
 */
void selinux_netlbl_sctp_sk_clone(struct sock *sk, struct sock *newsk)
{
        struct sk_security_struct *src = sk->sk_security;
        struct sk_security_struct *dst = newsk->sk_security;

        dst->nlbl_state = src->nlbl_state;
}

/**
 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
 * @sk: the socket to label
 * @family: protocol family
 *
 * Description:
 * Attempt to label a socket using the NetLabel mechanism using the socket's
 * SID.  Families other than PF_INET and PF_INET6 are left alone.  When
 * netlbl_sock_setattr() answers -EDESTADDRREQ the socket is put in the
 * NLBL_REQSKB state and success is reported.  Returns zero values on success,
 * negative values on failure.
 *
 */
int selinux_netlbl_socket_post_create(struct sock *sk, u16 family)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr *secattr;
        int rc;

        if (family != PF_INET && family != PF_INET6)
                return 0;

        secattr = selinux_netlbl_sock_genattr(sk);
        if (secattr == NULL)
                return -ENOMEM;

        rc = netlbl_sock_setattr(sk, family, secattr);
        if (rc == 0) {
                sksec->nlbl_state = NLBL_LABELED;
        } else if (rc == -EDESTADDRREQ) {
                /* not an error for the caller, just a different state */
                sksec->nlbl_state = NLBL_REQSKB;
                rc = 0;
        }

        return rc;
}

/**
 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
 * @sksec: the sock's sk_security_struct
 * @skb: the packet
 * @family: protocol family
 * @ad: the audit data
 *
 * Description:
 * Fetch the NetLabel security attributes from @skb and perform an access check
 * against the receiving socket.  Packets without NetLabel attributes are
 * checked as SECINITSID_UNLABELED.  Returns zero on success, negative values
 * on error.
 *
 */
int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
                                struct sk_buff *skb,
                                u16 family,
                                struct common_audit_data *ad)
{
        int rc;
        u32 peer_sid = SECINITSID_UNLABELED;
        u32 perm;
        struct netlbl_lsm_secattr secattr;

        if (!netlbl_enabled())
                return 0;

        /* resolve the packet's label; the unlabeled SID is kept when the
         * packet carries no attributes */
        netlbl_secattr_init(&secattr);
        rc = netlbl_skbuff_getattr(skb, family, &secattr);
        if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
                rc = selinux_netlbl_sidlookup_cached(skb, family,
                                                     &secattr, &peer_sid);
        netlbl_secattr_destroy(&secattr);
        if (rc != 0)
                return rc;

        /* pick the recvfrom permission for the socket class */
        if (sksec->sclass == SECCLASS_UDP_SOCKET)
                perm = UDP_SOCKET__RECVFROM;
        else if (sksec->sclass == SECCLASS_TCP_SOCKET)
                perm = TCP_SOCKET__RECVFROM;
        else
                perm = RAWIP_SOCKET__RECVFROM;

        rc = avc_has_perm(&selinux_state,
                          sksec->sid, peer_sid, sksec->sclass, perm, ad);

        /* let NetLabel react to the denial of a labeled packet */
        if (rc != 0 && peer_sid != SECINITSID_UNLABELED)
                netlbl_skbuff_err(skb, family, rc, 0);

        return rc;
}

/**
 * selinux_netlbl_option - Is this a NetLabel option
 * @level: the socket level or protocol
 * @optname: the socket option name
 *
 * Description:
 * Returns true (1) if @level and @optname refer to a NetLabel option, that
 * is IP_OPTIONS at the IPPROTO_IP level or IPV6_HOPOPTS at the IPPROTO_IPV6
 * level, and false (0) otherwise.
 * Helper for selinux_netlbl_socket_setsockopt().
 */
static inline int selinux_netlbl_option(int level, int optname)
{
        switch (level) {
        case IPPROTO_IP:
                return optname == IP_OPTIONS;
        case IPPROTO_IPV6:
                return optname == IPV6_HOPOPTS;
        default:
                return 0;
        }
}

/**
 * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
 * @sock: the socket
 * @level: the socket level or protocol
 * @optname: the socket option name
 *
 * Description:
 * Check the setsockopt() call and if the user is trying to replace the IP
 * options on a socket and a NetLabel is in place for the socket deny the
 * access; otherwise allow the access.  Returns zero when the access is
 * allowed, -EACCES when denied, and other negative values on error.
 *
 */
int selinux_netlbl_socket_setsockopt(struct socket *sock,
                                     int level,
                                     int optname)
{
        int rc;
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr secattr;

        /* only the options NetLabel uses are of interest */
        if (!selinux_netlbl_option(level, optname))
                return 0;

        /* ... and only on sockets that carry a label */
        if (sksec->nlbl_state != NLBL_LABELED &&
            sksec->nlbl_state != NLBL_CONNLABELED)
                return 0;

        netlbl_secattr_init(&secattr);
        lock_sock(sk);
        /* query NetLabel directly: the on-the-wire label assigned through
         * the socket's options is what matters here, not the cached
         * netlabel/lsm attributes */
        rc = netlbl_sock_getattr(sk, &secattr);
        release_sock(sk);

        switch (rc) {
        case 0:
                /* a label is present, refuse to let it be replaced */
                rc = -EACCES;
                break;
        case -ENOMSG:
                /* no label present, nothing to protect */
                rc = 0;
                break;
        }
        netlbl_secattr_destroy(&secattr);

        return rc;
}

/**
 * selinux_netlbl_socket_connect_helper - Help label a client-side socket on
 * connect
 * @sk: the socket to label
 * @addr: the destination address
 *
 * Description:
 * Attempt to label a connected socket with NetLabel using the given address.
 * An AF_UNSPEC address removes the label and puts the socket back in the
 * NLBL_REQSKB state.  Returns zero values on success, negative values on
 * failure.
 *
 */
static int selinux_netlbl_socket_connect_helper(struct sock *sk,
                                                struct sockaddr *addr)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr *secattr;
        int rc;

        /* a connected socket may be disconnected by connecting it to an
         * AF_UNSPEC address; in that case the socket is reset */
        if (addr->sa_family == AF_UNSPEC) {
                netlbl_sock_delattr(sk);
                sksec->nlbl_state = NLBL_REQSKB;
                return 0;
        }

        secattr = selinux_netlbl_sock_genattr(sk);
        if (secattr == NULL)
                return -ENOMEM;

        rc = netlbl_conn_setattr(sk, addr, secattr);
        if (rc != 0)
                return rc;

        sksec->nlbl_state = NLBL_CONNLABELED;

        return 0;
}

/**
 * selinux_netlbl_socket_connect_locked - Label a client-side socket on
 * connect
 * @sk: the socket to label
 * @addr: the destination address
 *
 * Description:
 * Attempt to label a connected socket that already has the socket locked
 * with NetLabel using the given address.  Only sockets in the NLBL_REQSKB or
 * NLBL_CONNLABELED state are touched.
 * Returns zero values on success, negative values on failure.
 *
 */
int selinux_netlbl_socket_connect_locked(struct sock *sk,
                                         struct sockaddr *addr)
{
        struct sk_security_struct *sksec = sk->sk_security;

        switch (sksec->nlbl_state) {
        case NLBL_REQSKB:
        case NLBL_CONNLABELED:
                return selinux_netlbl_socket_connect_helper(sk, addr);
        default:
                return 0;
        }
}

/**
 * selinux_netlbl_socket_connect - Label a client-side socket on connect
 * @sk: the socket to label
 * @addr: the destination address
 *
 * Description:
 * Attempt to label a connected socket with NetLabel using the given address.
 * This takes the socket lock, calls selinux_netlbl_socket_connect_locked()
 * and releases the lock again.
 * Returns zero values on success, negative values on failure.
 *
 */
int selinux_netlbl_socket_connect(struct sock *sk, struct sockaddr *addr)
{
        int rc;

        lock_sock(sk);
        rc = selinux_netlbl_socket_connect_locked(sk, addr);
        release_sock(sk);

        return rc;
}















   14 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/compiler.h>
#include <asm/percpu.h>

#ifndef __ASSEMBLY__
struct task_struct;

/* Per-CPU task_struct pointer; get_current() reads it. */
DECLARE_PER_CPU(struct task_struct *, current_task);

/*
 * get_current - return this CPU's current_task pointer
 *
 * The value is fetched with this_cpu_read_stable().
 * NOTE(review): the "stable" accessor presumably allows the compiler to
 * reuse an earlier read instead of reloading -- confirm in asm/percpu.h.
 */
static __always_inline struct task_struct *get_current(void)
{
        return this_cpu_read_stable(current_task);
}

/* "current" is shorthand for a call to get_current(). */
#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_CURRENT_H */







































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  NET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Ethernet handlers.
 *
 * Version:        @(#)eth.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *                Relocated to include/linux where it belongs by Alan Cox 
 *                                                        <gw4pts@gw4pts.ampr.org>
 */
#ifndef _LINUX_ETHERDEVICE_H
#define _LINUX_ETHERDEVICE_H

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <asm/unaligned.h>
#include <asm/bitsperlong.h>

#ifdef __KERNEL__
/* Forward declaration: only pointers to struct device are used here. */
struct device;

/*
 * MAC address lookup helpers, packet parsing entry points and the Ethernet
 * header_ops table.  Only the prototypes live in this header.
 */
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;

/*
 * Ethernet header construction/parsing and MAC address change helpers.
 * NOTE(review): presumably these are the callbacks wired into eth_header_ops
 * and the default net_device ops -- confirm against the implementation.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
               const void *daddr, const void *saddr, unsigned len);
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
                     __be16 type);
void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev,
                             const unsigned char *haddr);
__be16 eth_header_parse_protocol(const struct sk_buff *skb);
int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
void eth_commit_mac_addr_change(struct net_device *dev, void *p);
int eth_mac_addr(struct net_device *dev, void *p);
int eth_validate_addr(struct net_device *dev);

/*
 * Ethernet net_device allocation.  alloc_etherdev_mqs() takes separate TX and
 * RX queue counts; alloc_etherdev_mq() passes the same count for both, and
 * alloc_etherdev() passes a count of 1.
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                            unsigned int rxqs);
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)

/*
 * Variants that additionally take a struct device.  devm_alloc_etherdev()
 * requests one TX and one RX queue.
 */
struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
                                           unsigned int txqs,
                                           unsigned int rxqs);
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)

/* GRO receive/complete hooks for Ethernet frames. */
struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);

/*
 * Reserved Ethernet Addresses per IEEE 802.1Q.
 *
 * This is the base address 01:80:c2:00:00:00 of the reserved range; it is
 * 2-byte aligned so that it can be read through 16-bit pointers.
 * eth_stp_addr is an alias for the same array.
 */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
#define eth_stp_addr eth_reserved_addr_base

/**
 * is_link_local_ether_addr - Determine if given Ethernet address is link-local
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per
 * IEEE 802.1Q 8.6.3 Frame filtering.
 *
 * The address is compared against eth_reserved_addr_base with the low four
 * bits of the last octet ignored.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_link_local_ether_addr(const u8 *addr)
{
        __be16 *a = (__be16 *)addr;
        static const __be16 *b = (const __be16 *)eth_reserved_addr_base;
        /* big-endian 0xfff0: keep octet 4 and the high nibble of octet 5 */
        static const __be16 m = cpu_to_be16(0xfff0);

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* octets 0-3 in one 32-bit compare, octets 4-5 masked with m */
        return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
                (__force int)((a[2] ^ b[2]) & m)) == 0;
#else
        /* three 16-bit compares; only the last one is masked */
        return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
#endif
}

/**
 * is_zero_ether_addr - Determine if given Ethernet address is all zeros.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is all zeroes.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_zero_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* one 32-bit load for octets 0-3, one 16-bit load for octets 4-5 */
        const u32 head = *(const u32 *)addr;
        const u16 tail = *(const u16 *)(addr + 4);

        return (head | tail) == 0;
#else
        /* three aligned 16-bit loads */
        const u16 *words = (const u16 *)addr;

        return (words[0] | words[1] | words[2]) == 0;
#endif
}

/**
 * is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is a multicast address, i.e. the least
 * significant bit of the first octet is set.
 * By definition the broadcast address is also a multicast address.
 *
 * Please note: the first octet is fetched through a u16 (or u32) load, so
 * addr must be aligned to u16 unless unaligned access is efficient.
 */
static inline bool is_multicast_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 word = *(const u32 *)addr;
#else
        u16 word = *(const u16 *)addr;
#endif

#ifdef __BIG_ENDIAN
        /* the first octet is the most significant byte of the loaded word */
        return (word >> ((sizeof(word) * 8) - 8)) & 0x01;
#else
        /* the first octet is the least significant byte of the loaded word */
        return word & 0x01;
#endif
}

/*
 * is_multicast_ether_addr_64bits - multicast test that may use a 64-bit load
 *
 * Same result as is_multicast_ether_addr().  On 64-bit builds with efficient
 * unaligned access a full u64 is read from @addr, so eight bytes must be
 * readable there; all other builds defer to is_multicast_ether_addr().
 */
static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 word = *(const u64 *)addr;

#ifdef __BIG_ENDIAN
        /* first octet is the top byte of the 64-bit word */
        return (word >> 56) & 0x01;
#else
        return word & 0x01;
#endif
#else
        return is_multicast_ether_addr(addr);
#endif
}

/**
 * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is a local address, i.e. bit 0x02 of the first
 * octet is set.
 */
static inline bool is_local_ether_addr(const u8 *addr)
{
        const u8 first_octet = addr[0];

        return (first_octet & 0x02) != 0;
}

/**
 * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is the broadcast address (all six octets 0xff).
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_broadcast_ether_addr(const u8 *addr)
{
        const u16 *words = (const u16 *)addr;

        /* AND-ing the three halves yields 0xffff only if every bit is set */
        return (words[0] & words[1] & words[2]) == 0xffff;
}

/**
 * is_unicast_ether_addr - Determine if the Ethernet address is unicast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is a unicast address, which is defined here as
 * the negation of is_multicast_ether_addr().
 */
static inline bool is_unicast_ether_addr(const u8 *addr)
{
        return !is_multicast_ether_addr(addr);
}

/**
 * is_valid_ether_addr - Determine if the given Ethernet address is valid
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
 * a multicast address, and is not FF:FF:FF:FF:FF:FF.
 *
 * Return true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_valid_ether_addr(const u8 *addr)
{
        /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to
         * explicitly check for it here. */
        return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr);
}

/**
 * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
 * @proto: Ethertype/length value to be tested
 *
 * Check that the value from the Ethertype/length field is a valid Ethertype.
 *
 * Return true if the value is an 802.3 supported Ethertype.
 */
static inline bool eth_proto_is_802_3(__be16 proto)
{
        const u16 min_ethertype = (__force u16)htons(ETH_P_802_3_MIN);

#ifndef __BIG_ENDIAN
        /*
         * On a little endian CPU the wire-order LSB ends up in the upper byte
         * of the native value; mask it off so it cannot affect the compare.
         */
        proto &= htons(0xFF00);
#endif
        /* both sides are compared as plain u16, the LSB can be ignored */
        return (__force u16)proto >= min_ethertype;
}

/**
 * eth_random_addr - Generate software assigned random Ethernet address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Fill @addr with ETH_ALEN random bytes, then force the first octet so that
 * the address is not multicast and has the local assigned bit set.
 */
static inline void eth_random_addr(u8 *addr)
{
        get_random_bytes(addr, ETH_ALEN);

        /* bit 0 cleared: not multicast; bit 1 set: locally assigned (IEEE802) */
        addr[0] = (addr[0] & 0xfe) | 0x02;
}

/* Alternative spelling of eth_random_addr(). */
#define random_ether_addr(addr) eth_random_addr(addr)

/**
 * eth_broadcast_addr - Assign broadcast address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the broadcast address to the given address array by setting all
 * ETH_ALEN bytes to 0xff.
 */
static inline void eth_broadcast_addr(u8 *addr)
{
        memset(addr, 0xff, ETH_ALEN);
}

/**
 * eth_zero_addr - Assign zero address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the zero address to the given address array by clearing all
 * ETH_ALEN bytes.
 */
static inline void eth_zero_addr(u8 *addr)
{
        memset(addr, 0x00, ETH_ALEN);
}

/**
 * eth_hw_addr_random - Generate software assigned random Ethernet and
 * set device flag
 * @dev: pointer to net_device structure
 *
 * Generate a random Ethernet address (MAC) to be used by a net device
 * and set addr_assign_type so the state can be read by sysfs and be
 * used by userspace.
 *
 * The address is written straight into dev->dev_addr by eth_random_addr().
 */
static inline void eth_hw_addr_random(struct net_device *dev)
{
        /* record that the address was randomly generated */
        dev->addr_assign_type = NET_ADDR_RANDOM;
        eth_random_addr(dev->dev_addr);
}

/**
 * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr
 * @ha: pointer to hardware address
 *
 * Calculate CRC from a hardware address as basis for filter hashes.
 *
 * Return the ether_crc() of the ETH_ALEN bytes in @ha->addr.
 */
static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)
{
        return ether_crc(ETH_ALEN, ha->addr);
}

/**
 * ether_addr_copy - Copy an Ethernet address
 * @dst: Pointer to a six-byte array Ethernet address destination
 * @src: Pointer to a six-byte array Ethernet address source
 *
 * Copies exactly six bytes, using word-sized loads and stores.
 *
 * Please note: dst & src must both be aligned to u16.
 */
static inline void ether_addr_copy(u8 *dst, const u8 *src)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 *dst_head = (u32 *)dst;
        u16 *dst_tail = (u16 *)(dst + 4);
        const u32 *src_head = (const u32 *)src;
        const u16 *src_tail = (const u16 *)(src + 4);

        /* octets 0-3 first, then octets 4-5 */
        *dst_head = *src_head;
        *dst_tail = *src_tail;
#else
        /* three 16-bit moves, lowest offset first */
        *(u16 *)(dst + 0) = *(const u16 *)(src + 0);
        *(u16 *)(dst + 2) = *(const u16 *)(src + 2);
        *(u16 *)(dst + 4) = *(const u16 *)(src + 4);
#endif
}

/**
 * eth_hw_addr_set - Assign Ethernet address to a net_device
 * @dev: pointer to net_device structure
 * @addr: address to assign
 *
 * Assign given address to the net_device, addr_assign_type is not changed.
 *
 * The copy into dev->dev_addr is done with ether_addr_copy(), so @addr is
 * subject to that helper's u16 alignment requirement.
 */
static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
{
        ether_addr_copy(dev->dev_addr, addr);
}

/**
 * eth_hw_addr_inherit - Copy dev_addr from another net_device
 * @dst: pointer to net_device to copy dev_addr to
 * @src: pointer to net_device to copy dev_addr from
 *
 * Copy the Ethernet address from one net_device to another along with
 * the address attributes (addr_assign_type).
 */
static inline void eth_hw_addr_inherit(struct net_device *dst,
                                       struct net_device *src)
{
        /* carry over how the address was assigned, then the address itself */
        dst->addr_assign_type = src->addr_assign_type;
        ether_addr_copy(dst->dev_addr, src->dev_addr);
}

/**
 * ether_addr_equal - Compare two Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * Compare two Ethernet addresses, returns true if equal.  The comparison is
 * done by XOR-ing word-sized pieces and testing the OR of the results, so it
 * contains no conditional branches.
 *
 * Please note: addr1 & addr2 must both be aligned to u16.
 */
static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* octets 0-3 as one u32, octets 4-5 as one u16 */
        const u32 head_diff = (*(const u32 *)addr1) ^ (*(const u32 *)addr2);
        const u16 tail_diff = (*(const u16 *)(addr1 + 4)) ^
                              (*(const u16 *)(addr2 + 4));

        return (head_diff | tail_diff) == 0;
#else
        const u16 *lhs = (const u16 *)addr1;
        const u16 *rhs = (const u16 *)addr2;
        const unsigned int diff = (lhs[0] ^ rhs[0]) |
                                  (lhs[1] ^ rhs[1]) |
                                  (lhs[2] ^ rhs[2]);

        return diff == 0;
#endif
}

/**
 * ether_addr_equal_64bits - Compare two Ethernet addresses
 * @addr1: Pointer to an array of 8 bytes
 * @addr2: Pointer to an other array of 8 bytes
 *
 * Compare two Ethernet addresses, returns true if equal, false otherwise.
 *
 * The function doesn't need any conditional branches and possibly uses
 * word memory accesses on CPU allowing cheap unaligned memory reads.
 * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }
 *
 * Only the first six bytes take part in the result; the two pad bytes are
 * read on the 64-bit path but shifted out before the test.
 *
 * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.
 */
static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 lhs = *(const u64 *)addr1;
        const u64 rhs = *(const u64 *)addr2;
        const u64 diff = lhs ^ rhs;

#ifdef __BIG_ENDIAN
        /* pad bytes are the two least significant bytes */
        return (diff >> 16) == 0;
#else
        /* pad bytes are the two most significant bytes */
        return (diff << 16) == 0;
#endif
#else
        return ether_addr_equal(addr1, addr2);
#endif
}

/**
 * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * Compare two Ethernet addresses, returns true if equal
 *
 * Please note: Use only when any Ethernet address may not be u16 aligned.
 */
static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* unaligned word loads are cheap here, so the fast helper is usable */
        return ether_addr_equal(addr1, addr2);
#else
        /* byte-wise compare makes no alignment assumption */
        return memcmp(addr1, addr2, ETH_ALEN) == 0;
#endif
}

/**
 * ether_addr_equal_masked - Compare two Ethernet addresses with a mask
 * @addr1: Pointer to a six-byte array containing the 1st Ethernet address
 * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
 * @mask: Pointer to a six-byte array containing the Ethernet address bitmask
 *
 * Compare two Ethernet addresses with a mask, returns true if for every bit
 * set in the bitmask the equivalent bits in the ethernet addresses are equal.
 * Using a mask with all bits set is a slower ether_addr_equal.
 */
static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
                                           const u8 *mask)
{
        unsigned int idx = 0;

        while (idx < ETH_ALEN) {
                const u8 diff = addr1[idx] ^ addr2[idx];

                /* stop at the first octet that differs in a masked-in bit */
                if (diff & mask[idx])
                        return false;
                idx++;
        }

        return true;
}

/**
 * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return a u64 value of the address, with the first octet ending up as the
 * most significant of the six bytes.
 */
static inline u64 ether_addr_to_u64(const u8 *addr)
{
        const u8 *octet;
        u64 value = 0;

        /* shift in one octet at a time, first octet first */
        for (octet = addr; octet < addr + ETH_ALEN; octet++)
                value = (value << 8) | *octet;

        return value;
}

/**
 * u64_to_ether_addr - Convert a u64 to an Ethernet address.
 * @u: u64 to convert to an Ethernet MAC address
 * @addr: Pointer to a six-byte array to contain the Ethernet address
 *
 * Only the six least significant bytes of @u are stored; the lowest byte
 * goes into the last octet of @addr.
 */
static inline void u64_to_ether_addr(u64 u, u8 *addr)
{
        int pos = ETH_ALEN;

        /* fill from the last octet backwards, peeling off the low byte */
        while (pos-- > 0) {
                addr[pos] = u & 0xff;
                u >>= 8;
        }
}

/**
 * eth_addr_dec - Decrement the given MAC address
 *
 * @addr: Pointer to a six-byte array containing Ethernet address to decrement
 *
 * The address is converted to a u64, decremented by one and stored back.
 */
static inline void eth_addr_dec(u8 *addr)
{
        const u64 prev = ether_addr_to_u64(addr) - 1;

        u64_to_ether_addr(prev, addr);
}

/**
 * eth_addr_inc() - Increment the given MAC address.
 * @addr: Pointer to a six-byte array containing Ethernet address to increment.
 *
 * The address is converted to a u64, incremented by one and stored back.
 */
static inline void eth_addr_inc(u8 *addr)
{
        const u64 next = ether_addr_to_u64(addr) + 1;

        u64_to_ether_addr(next, addr);
}

/**
 * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
 * @dev: Pointer to a device structure
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Compare passed address with all addresses of the device. Return true if the
 * address is one of the device addresses.
 *
 * The device address list is walked under rcu_read_lock().
 *
 * Note that this function calls ether_addr_equal_64bits() so take care of
 * the right padding.
 */
static inline bool is_etherdev_addr(const struct net_device *dev,
                                    const u8 addr[6 + 2])
{
        struct netdev_hw_addr *ha;
        bool found = false;

        rcu_read_lock();
        for_each_dev_addr(dev, ha) {
                if (ether_addr_equal_64bits(addr, ha->addr)) {
                        /* first match wins, no need to look further */
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
#endif        /* __KERNEL__ */

/**
 * compare_ether_header - Compare two Ethernet headers
 * @a: Pointer to Ethernet header
 * @b: Pointer to Ethernet header
 *
 * Compare two Ethernet headers, returns 0 if equal and a nonzero value
 * otherwise; the nonzero value is an OR of XOR differences and carries no
 * ordering information.
 * This assumes that the network header (i.e., IP header) is 4-byte
 * aligned OR the platform can handle unaligned access.  This is the
 * case for all packets coming into netif_receive_skb or similar
 * entry points.
 */

static inline unsigned long compare_ether_header(const void *a, const void *b)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        unsigned long fold;

        /*
         * We want to compare 14 bytes:
         *  [a0 ... a13] ^ [b0 ... b13]
         * Use two long XOR, ORed together, with an overlap of two bytes.
         *  [a0  a1  a2  a3  a4  a5  a6  a7 ] ^ [b0  b1  b2  b3  b4  b5  b6  b7 ] |
         *  [a6  a7  a8  a9  a10 a11 a12 a13] ^ [b6  b7  b8  b9  b10 b11 b12 b13]
         * This means the [a6 a7] ^ [b6 b7] part is done two times.
        */
        fold = *(unsigned long *)a ^ *(unsigned long *)b;
        fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6);
        return fold;
#else
        /*
         * Bytes 0-1 are compared as one u16, bytes 2-13 as three u32 words
         * starting at offset 2.
         */
        u32 *a32 = (u32 *)((u8 *)a + 2);
        u32 *b32 = (u32 *)((u8 *)b + 2);

        return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
               (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
#endif
}

/**
 * eth_skb_pkt_type - Assign packet type if destination address does not match
 * @skb: Assigned a packet type if address does not match @dev address
 * @dev: Network device used to compare packet address against
 *
 * If the destination MAC address of the packet does not match the network
 * device address, assign an appropriate packet type: PACKET_BROADCAST when it
 * equals the device broadcast address, PACKET_MULTICAST for any other
 * multicast destination, PACKET_OTHERHOST otherwise.  skb->pkt_type is left
 * untouched when the destination matches the device address.
 */
static inline void eth_skb_pkt_type(struct sk_buff *skb,
                                    const struct net_device *dev)
{
        const struct ethhdr *eth = eth_hdr(skb);

        /* expected case: the frame is addressed to this device */
        if (likely(ether_addr_equal_64bits(eth->h_dest, dev->dev_addr)))
                return;

        if (likely(!is_multicast_ether_addr_64bits(eth->h_dest))) {
                skb->pkt_type = PACKET_OTHERHOST;
                return;
        }

        /* multicast destination: broadcast is singled out */
        if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
                skb->pkt_type = PACKET_BROADCAST;
        else
                skb->pkt_type = PACKET_MULTICAST;
}

/**
 * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame
 * @skb: Buffer to pad
 *
 * An Ethernet frame should have a minimum size of 60 bytes.  This function
 * takes short frames and pads them with zeros up to the 60 byte limit.
 *
 * Return the result of skb_put_padto() called with ETH_ZLEN.
 */
static inline int eth_skb_pad(struct sk_buff *skb)
{
        return skb_put_padto(skb, ETH_ZLEN);
}

#endif        /* _LINUX_ETHERDEVICE_H */

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Generic associative array implementation.
 *
 * See Documentation/core-api/assoc_array.rst for information.
 *
 * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_ASSOC_ARRAY_H
#define _LINUX_ASSOC_ARRAY_H

#ifdef CONFIG_ASSOCIATIVE_ARRAY

#include <linux/types.h>

#define ASSOC_ARRAY_KEY_CHUNK_SIZE BITS_PER_LONG /* Key data retrieved in chunks of this size */

/*
 * Generic associative array.
 *
 * An empty array has a NULL root and a leaf count of zero (see
 * assoc_array_init()).
 */
struct assoc_array {
        struct assoc_array_ptr        *root;                /* The node at the root of the tree */
        /* NOTE(review): presumably the number of leaves currently stored -- confirm in lib/assoc_array.c */
        unsigned long                nr_leaves_on_tree;
};

/*
 * Operations on objects and index keys for use by array manipulation routines.
 *
 * A table of these callbacks is passed by the caller to the find, insert,
 * delete, clear, destroy and gc functions declared in this header.
 */
struct assoc_array_ops {
        /* Method to get a chunk of an index key from caller-supplied data */
        /* NOTE(review): the unit of @level is not visible in this header -- confirm in the documentation */
        unsigned long (*get_key_chunk)(const void *index_key, int level);

        /* Method to get a piece of an object's index key */
        unsigned long (*get_object_key_chunk)(const void *object, int level);

        /* Is this the object we're looking for? */
        bool (*compare_object)(const void *object, const void *index_key);

        /* How different is an object from an index key, to a bit position in
         * their keys? (or -1 if they're the same)
         */
        int (*diff_objects)(const void *object, const void *index_key);

        /* Method to free an object. */
        void (*free_object)(void *object);
};

/*
 * Access and manipulation functions.
 */
struct assoc_array_edit;

static inline void assoc_array_init(struct assoc_array *array)
{
        array->root = NULL;
        array->nr_leaves_on_tree = 0;
}

/*
 * Read-side access: iterate over the stored objects, or look one up by index
 * key.  The iterator receives each object together with the caller's
 * iterator_data pointer.
 */
extern int assoc_array_iterate(const struct assoc_array *array,
                               int (*iterator)(const void *object,
                                               void *iterator_data),
                               void *iterator_data);
extern void *assoc_array_find(const struct assoc_array *array,
                              const struct assoc_array_ops *ops,
                              const void *index_key);
extern void assoc_array_destroy(struct assoc_array *array,
                                const struct assoc_array_ops *ops);
/*
 * Modifications.  Insert, delete and clear hand back a struct
 * assoc_array_edit, which is then passed to assoc_array_apply_edit() or
 * assoc_array_cancel_edit().
 * NOTE(review): error-return conventions (NULL vs ERR_PTR) are not visible
 * in this header -- confirm in the documentation.
 */
extern struct assoc_array_edit *assoc_array_insert(struct assoc_array *array,
                                                   const struct assoc_array_ops *ops,
                                                   const void *index_key,
                                                   void *object);
extern void assoc_array_insert_set_object(struct assoc_array_edit *edit,
                                          void *object);
extern struct assoc_array_edit *assoc_array_delete(struct assoc_array *array,
                                                   const struct assoc_array_ops *ops,
                                                   const void *index_key);
extern struct assoc_array_edit *assoc_array_clear(struct assoc_array *array,
                                                  const struct assoc_array_ops *ops);
extern void assoc_array_apply_edit(struct assoc_array_edit *edit);
extern void assoc_array_cancel_edit(struct assoc_array_edit *edit);
/*
 * Garbage collection driven by a caller-supplied per-object iterator.
 * NOTE(review): presumably the iterator's bool result selects which objects
 * are kept -- confirm in the documentation.
 */
extern int assoc_array_gc(struct assoc_array *array,
                          const struct assoc_array_ops *ops,
                          bool (*iterator)(void *object, void *iterator_data),
                          void *iterator_data);

#endif /* CONFIG_ASSOCIATIVE_ARRAY */
#endif /* _LINUX_ASSOC_ARRAY_H */















































































































































































































































































































































































































































































































































































































































































    2 
























































































































































    2 







































































































































































































































    4 













    2 





    3 



















    4 











































































































    4 
    4 








    4 











    4 












    2 


    3 
    3 




    4 
    2 
    4 

















































    4 


    4 




    4 
















    2 
















    3 
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>                /* test_thread_flag(), ...        */
#include <linux/sched/task_stack.h>        /* task_stack_*(), ...                */
#include <linux/kdebug.h>                /* oops_begin/end, ...                */
#include <linux/extable.h>                /* search_exception_tables        */
#include <linux/memblock.h>                /* max_low_pfn                        */
#include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...                */
#include <linux/mmiotrace.h>                /* kmmio_handler, ...                */
#include <linux/perf_event.h>                /* perf_sw_event                */
#include <linux/hugetlb.h>                /* hstate_index_to_shift        */
#include <linux/prefetch.h>                /* prefetchw                        */
#include <linux/context_tracking.h>        /* exception_enter(), ...        */
#include <linux/uaccess.h>                /* faulthandler_disabled()        */
#include <linux/efi.h>                        /* efi_recover_from_page_fault()*/
#include <linux/mm_types.h>

#include <asm/cpufeature.h>                /* boot_cpu_has, ...                */
#include <asm/traps.h>                        /* dotraplinkage, ...                */
#include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
#include <asm/vsyscall.h>                /* emulate_vsyscall                */
#include <asm/vm86.h>                        /* struct vm86                        */
#include <asm/mmu_context.h>                /* vma_pkey()                        */
#include <asm/efi.h>                        /* efi_recover_from_page_fault()*/
#include <asm/desc.h>                        /* store_idt(), ...                */
#include <asm/cpu_entry_area.h>                /* exception stack                */
#include <asm/pgtable_areas.h>                /* VMALLOC_START, ...                */
#include <asm/kvm_para.h>                /* kvm_handle_async_pf                */

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Give mmiotrace a chance to claim the fault.
 *
 * Returns -1 when mmiotrace is active and kmmio_handler() returns 1 for
 * this fault.  Returns 0 otherwise, i.e. when mmiotrace is disabled or
 * the handler did not take the fault.
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        if (unlikely(is_kmmio_active()) && kmmio_handler(regs, addr) == 1)
                return -1;

        return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 *
 * check_prefetch_opcode() classifies a single instruction byte @opcode.
 * It returns non-zero when the byte is an instruction prefix, meaning the
 * caller should keep scanning, and 0 when scanning should stop.  For a
 * byte in the 0x00..0x0F range it reads the next byte through @instr and
 * stores in *@prefetch whether the pair is 0x0F 0x0D or 0x0F 0x18.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        unsigned char hi_nibble = opcode & 0xf0;
        unsigned char lo_nibble = opcode & 0x0f;

        switch (hi_nibble) {
        case 0x20:
        case 0x30:
                /*
                 * 0x26, 0x2E, 0x36 and 0x3E are valid x86 prefixes.
                 * In X86_64 long mode the CPU signals invalid opcode
                 * if some of these prefixes are present, so X86_64
                 * never gets here anyway.
                 */
                return (lo_nibble & 7) == 0x6;
#ifdef CONFIG_X86_64
        case 0x40:
                /* 0x40..0x4F are REX prefixes, but only in 64-bit mode. */
                return !user_mode(regs) || user_64bit_mode(regs);
#endif
        case 0x60:
                /* 0x64 through 0x67 are valid prefixes in all modes. */
                return (lo_nibble & 0xC) == 0x4;
        case 0xF0:
                /* 0xF0, 0xF2 and 0xF3 are valid prefixes in all modes. */
                return lo_nibble == 0x0 || (lo_nibble >> 1) == 1;
        case 0x00:
                /* The prefetch instruction is 0x0F0D or 0x0F18. */
                if (get_kernel_nofault(opcode, instr))
                        return 0;

                if (lo_nibble == 0xF && (opcode == 0x0D || opcode == 0x18))
                        *prefetch = 1;
                else
                        *prefetch = 0;
                return 0;
        default:
                break;
        }

        return 0;
}

/*
 * Decide whether the instruction at the faulting IP is a prefetch.
 *
 * Up to 15 bytes starting at the linear IP are scanned; bytes that
 * check_prefetch_opcode() accepts as prefixes are skipped.  The return
 * value is the prefetch flag filled in by check_prefetch_opcode(), and is
 * 0 for instruction-fetch faults or when no prefetch opcode was found.
 *
 * @addr is not used by this function.
 */
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *instr, *max_instr;
        int prefetch = 0;

        /*
         * If it was an exec (instruction fetch) fault on an NX page,
         * then do not ignore the fault:
         */
        if (error_code & X86_PF_INSTR)
                return 0;

        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        /*
         * This code has historically always bailed out if IP points to a
         * not-present page (e.g. due to a race).  No one has ever
         * complained about this.
         */
        pagefault_disable();

        while (instr < max_instr) {
                unsigned char opcode;
                int failed;

                /* User IPs go through get_user(), kernel IPs do not. */
                if (user_mode(regs))
                        failed = get_user(opcode, instr);
                else
                        failed = get_kernel_nofault(opcode, instr);

                if (failed)
                        break;

                /* The checker is handed the address of the *next* byte. */
                if (!check_prefetch_opcode(regs, ++instr, opcode, &prefetch))
                        break;
        }

        pagefault_enable();
        return prefetch;
}

/*
 * pgd_lock is taken around the walk of pgd_list in this file's 32-bit
 * arch_sync_kernel_mappings().  Entries on pgd_list are struct page
 * objects linked through their lru member.
 * NOTE(review): presumably one entry per PGD page -- confirm against the
 * code that adds to the list, which is not in this file section.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
/*
 * Bring the PMD entry covering @address in the page table rooted at @pgd
 * in line with the reference page table (init_mm).
 *
 * The PMD entry is copied from init_mm when the two page tables disagree
 * about whether it is present.  Returns a pointer to the reference PMD
 * entry, or NULL if the reference page table has a non-present entry at
 * any level for @address.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned idx = pgd_index(address);
        pgd_t *ref_pgd = init_mm.pgd + idx;
        p4d_t *p4d, *ref_p4d;
        pud_t *pud, *ref_pud;
        pmd_t *pmd, *ref_pmd;

        pgd += idx;

        if (!pgd_present(*ref_pgd))
                return NULL;

        /*
         * set_pgd(pgd, *ref_pgd); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_p4d/set_pud.
         */
        p4d = p4d_offset(pgd, address);
        ref_p4d = p4d_offset(ref_pgd, address);
        if (!p4d_present(*ref_p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        ref_pud = pud_offset(ref_p4d, address);
        if (!pud_present(*ref_pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        ref_pmd = pmd_offset(ref_pud, address);

        /* Copy the reference entry only when presence differs. */
        if (pmd_present(*pmd) != pmd_present(*ref_pmd))
                set_pmd(pmd, *ref_pmd);

        if (!pmd_present(*ref_pmd))
                return NULL;

        /* Both entries are present now and must map the same frame. */
        BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*ref_pmd));

        return ref_pmd;
}

/*
 *   Handle a fault on the vmalloc or module mapping area
 *
 *   This is needed because there is a race condition between the time
 *   when the vmalloc mapping code updates the PMD and the point in time
 *   where it synchronizes this update with the other page-tables in the
 *   system.
 *
 *   In this race window another thread/CPU can map an area on the same
 *   PMD, find it already present and not synchronize it with the rest
 *   of the system yet. As a result v[mz]alloc might return areas which
 *   are not mapped in every page-table in the system, causing an
 *   unhandled page-fault when they are accessed.
 *
 *   Returns 0 when the mapping for @address is present after syncing,
 *   and -1 when @address is outside the vmalloc range or is not mapped
 *   in the reference page table.
 */
static noinline int vmalloc_fault(unsigned long address)
{
        pmd_t *ref_pmd;
        pte_t *ref_pte;

        /* Only addresses inside the vmalloc area are handled here: */
        if (address < VMALLOC_START || address >= VMALLOC_END)
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        ref_pmd = vmalloc_sync_one(__va(read_cr3_pa()), address);
        if (!ref_pmd)
                return -1;

        /* A large PMD mapping has no PTE level to check. */
        if (pmd_large(*ref_pmd))
                return 0;

        ref_pte = pte_offset_kernel(ref_pmd, address);

        return pte_present(*ref_pte) ? 0 : -1;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Propagate kernel PMD entries from the reference page table into every
 * page table on pgd_list.
 *
 * Walks PMD-sized steps starting at @start rounded down to a PMD boundary
 * and continues while the address lies in [TASK_SIZE_MAX, VMALLOC_END).
 * @end is not consulted by this implementation.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
        unsigned long va = start & PMD_MASK;

        while (va >= TASK_SIZE_MAX && va < VMALLOC_END) {
                struct page *pgd_page;

                spin_lock(&pgd_lock);
                list_for_each_entry(pgd_page, &pgd_list, lru) {
                        /* the page_table_lock is only needed for Xen */
                        spinlock_t *mm_lock =
                                &pgd_page_get_mm(pgd_page)->page_table_lock;

                        spin_lock(mm_lock);
                        vmalloc_sync_one(page_address(pgd_page), va);
                        spin_unlock(mm_lock);
                }
                spin_unlock(&pgd_lock);

                va += PMD_SIZE;
        }
}

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 *
 * If the fault came from vm86 mode and the task has vm86 state, record
 * the faulting page in the task's screen_bitmap.  Bit N stands for the
 * page at 0xA0000 + N * PAGE_SIZE; only the first 32 pages are tracked.
 * Addresses below 0xA0000 wrap to a huge page index in the unsigned
 * subtraction and are therefore ignored by the range check.
 *
 * Does nothing when CONFIG_VM86 is not set.
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
        unsigned long bit;

        if (!v8086_mode(regs) || !tsk->thread.vm86)
                return;

        bit = (address - 0xA0000) >> PAGE_SHIFT;
        /*
         * Use an unsigned constant: bit may be 31, and shifting a signed
         * 1 into the sign bit is undefined behaviour in ISO C.
         */
        if (bit < 32)
                tsk->thread.vm86->screen_bitmap |= 1UL << bit;
#endif
}

/* Returns true when page frame number @pfn is below max_low_pfn. */
static bool low_pfn(unsigned long pfn)
{
        bool below_limit = pfn < max_low_pfn;

        return below_limit;
}

/*
 * Print the page-table entries that translate @address, walking the page
 * table whose root is currently loaded in CR3 (32-bit variant).
 *
 * All output goes on one log line: an optional "*pdpt" value (PAE only),
 * the "*pde" value and, when it is safe to read it, the "*pte" value.
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_X86_PAE
        pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
        /* Stop if the next level is not in low memory or not present. */
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
        /* With PAE the line was already started, so the PDE continues it. */
#define pr_pde pr_cont
#else
        /* Without PAE the PDE is the first thing printed on the line. */
#define pr_pde pr_info
#endif
        p4d = p4d_offset(pgd, address);
        pud = pud_offset(p4d, address);
        pmd = pmd_offset(pud, address);
        /* Field width is two hex digits per byte of the entry. */
        pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        /* Terminate the log line whichever level the walk stopped at. */
        pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message printed (once, via printk_once()) by is_errata93() when the
 * K8 erratum #93 workaround is applied.  The KERN_ERR log level prefix
 * is part of the string.
 */
static const char errata93_warning[] =
KERN_ERR 
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * No vm86 mode in 64-bit mode, so there is nothing to record here.
 * This empty stub keeps the same signature as the 32-bit variant so
 * callers do not need their own #ifdef.
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
}

/*
 * Probe whether an unsigned long can be read from kernel address @p
 * without faulting.  Returns whatever get_kernel_nofault() returns;
 * callers treat a non-zero value as "address is bad".
 */
static int bad_address(void *p)
{
        unsigned long *probe_addr = (unsigned long *)p;
        unsigned long scratch;

        return get_kernel_nofault(scratch, probe_addr);
}

/*
 * Print the page-table entries that translate @address, walking the page
 * table whose root is currently loaded in CR3 (64-bit variant).
 *
 * Each level (PGD, P4D, PUD, PMD, PTE) is printed on the same log line.
 * Before an entry is dereferenced its address is probed with
 * bad_address(); if the probe fails, "BAD" is printed and the walk stops.
 * The walk also stops at the first entry that is not present, or that is
 * a large mapping (checked for P4D, PUD and PMD).
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = base + pgd_index(address);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto bad;

        /* pr_info() starts the line; the lower levels continue it. */
        pr_info("PGD %lx ", pgd_val(*pgd));

        if (!pgd_present(*pgd))
                goto out;

        p4d = p4d_offset(pgd, address);
        if (bad_address(p4d))
                goto bad;

        pr_cont("P4D %lx ", p4d_val(*p4d));
        if (!p4d_present(*p4d) || p4d_large(*p4d))
                goto out;

        pud = pud_offset(p4d, address);
        if (bad_address(pud))
                goto bad;

        pr_cont("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;

        pr_cont("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;

        pr_cont("PTE %lx", pte_val(*pte));
out:
        /* Normal exit: terminate the line built up so far. */
        pr_cont("\n");
        return;
bad:
        /* An entry address could not be read. */
        pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * Many BIOSes that were not tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here: if restoring the upper 32 bits yields an
 * address in kernel text or in the module area, regs->ip is rewritten
 * with the repaired value and 1 is returned.  Otherwise 0 is returned
 * and @regs is left alone.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
        bool in_kernel_text, in_modules;

        /* Only AMD family 0xf CPUs are considered. */
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
                return 0;
        if (boot_cpu_data.x86 != 0xf)
                return 0;

        /*
         * The signature of the erratum: a kernel-mode fault on the
         * instruction pointer itself, with the upper half all zero.
         */
        if (user_mode(regs) || address != regs->ip || (address >> 32) != 0)
                return 0;

        /* Put back the upper 32 bits and see where that lands. */
        address |= 0xffffffffUL << 32;
        in_kernel_text = address >= (u64)_stext && address <= (u64)_etext;
        in_modules = address >= MODULES_VADDR && address <= MODULES_END;
        if (!in_kernel_text && !in_modules)
                return 0;

        printk_once(errata93_warning);
        regs->ip = address;
        return 1;
#else
        return 0;
#endif
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 *
 * Returns 1 when the code segment is __USER32_CS or an LDT selector and
 * @address has bits set above the low 32; returns 0 otherwise, and
 * always on 32-bit.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        /* Bit 2 of a selector is the table indicator: set means LDT. */
        bool compat_cs = regs->cs == __USER32_CS || (regs->cs & (1<<2));

        if (compat_cs && (address >> 32))
                return 1;
#endif
        return 0;
}

/*
 * Pentium F0 0F C7 C8 bug workaround.
 *
 * If the CPU has X86_BUG_F00F and @address is an f00f IDT address, the
 * fault is passed to handle_invalid_op() and 1 is returned.  Returns 0
 * otherwise, and always when CONFIG_X86_F00F_BUG is not set.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        if (!boot_cpu_has_bug(X86_BUG_F00F))
                return 0;

        if (!idt_is_f00f_address(address))
                return 0;

        handle_invalid_op(regs);
        return 1;
#else
        return 0;
#endif
}

/*
 * Print the base and limit of the descriptor that selector @index picks
 * out of the GDT described by @gdt.  @name labels the output line (the
 * callers in this file pass "LDTR" and "TR").
 *
 * A zero selector, an offset outside the GDT, or an unreadable GDT entry
 * each produce their own message instead of the base/limit line.
 */
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
        struct ldttss_desc entry;
        unsigned long base;
        u32 byte_off;

        if (index == 0) {
                pr_alert("%s: NULL\n", name);
                return;
        }

        /* Drop the low three selector bits to get the descriptor slot. */
        byte_off = (index >> 3) * sizeof(struct desc_struct);

        if (byte_off + sizeof(struct ldttss_desc) >= gdt->size) {
                pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
                return;
        }

        /* The GDT itself might not be readable; do not fault on it. */
        if (copy_from_kernel_nofault(&entry,
                                     (void *)(gdt->address + byte_off),
                                     sizeof(struct ldttss_desc))) {
                pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
                         name, index);
                return;
        }

        /* Reassemble the base address from its scattered fields. */
        base = entry.base0 | (entry.base1 << 16) |
               ((unsigned long)entry.base2 << 24);
#ifdef CONFIG_X86_64
        base |= ((u64)entry.base3 << 32);
#endif
        pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
                 name, index, base, (entry.limit0 | (entry.limit1 << 16)));
}

/*
 * Print the diagnostic header of a page-fault oops for the fault at
 * @address with hardware error code @error_code.
 *
 * Nothing is printed unless oops_may_print() allows it.  Otherwise the
 * output consists of: an optional hint for instruction-fetch faults
 * (NX or SMEP), the "BUG:" line, two "#PF:" lines decoding the error
 * code, descriptor-table details for implicit supervisor accesses from
 * user mode, and finally the page-table dump for @address.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        if (!oops_may_print())
                return;

        if (error_code & X86_PF_INSTR) {
                unsigned int level;
                pgd_t *pgd;
                pte_t *pte;

                /* Look the address up in the page table CR3 points at. */
                pgd = __va(read_cr3_pa());
                pgd += pgd_index(address);

                pte = lookup_address_in_pgd(pgd, address, &level);

                /* Present but not executable: an NX violation. */
                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
                                from_kuid(&init_user_ns, current_uid()));
                /* Present, executable, user PGD entry and SMEP enabled in CR4. */
                if (pte && pte_present(*pte) && pte_exec(*pte) &&
                                (pgd_flags(*pgd) & _PAGE_USER) &&
                                (__read_cr4() & X86_CR4_SMEP))
                        pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
                                from_kuid(&init_user_ns, current_uid()));
        }

        /* Kernel-mode faults in the first page are reported as NULL derefs. */
        if (address < PAGE_SIZE && !user_mode(regs))
                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
                        (void *)address);
        else
                pr_alert("BUG: unable to handle page fault for address: %px\n",
                        (void *)address);

        /*
         * First %s: access privilege from the error code; second: access
         * type; third: the CPU mode taken from the saved registers.
         */
        pr_alert("#PF: %s %s in %s mode\n",
                 (error_code & X86_PF_USER)  ? "user" : "supervisor",
                 (error_code & X86_PF_INSTR) ? "instruction fetch" :
                 (error_code & X86_PF_WRITE) ? "write access" :
                                               "read access",
                             user_mode(regs) ? "user" : "kernel");
        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
                 !(error_code & X86_PF_PROT) ? "not-present page" :
                 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
                 (error_code & X86_PF_PK)    ? "protection keys violation" :
                                               "permissions violation");

        /* Supervisor-type access although the CPU was in user mode. */
        if (!(error_code & X86_PF_USER) && user_mode(regs)) {
                struct desc_ptr idt, gdt;
                u16 ldtr, tr;

                /*
                 * This can happen for quite a few reasons.  The more obvious
                 * ones are faults accessing the GDT, or LDT.  Perhaps
                 * surprisingly, if the CPU tries to deliver a benign or
                 * contributory exception from user code and gets a page fault
                 * during delivery, the page fault can be delivered as though
                 * it originated directly from user code.  This could happen
                 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
                 * kernel or IST stack.
                 */
                store_idt(&idt);

                /* Usable even on Xen PV -- it's just slow. */
                native_store_gdt(&gdt);

                pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
                         idt.address, idt.size, gdt.address, gdt.size);

                store_ldt(ldtr);
                show_ldttss(&gdt, "LDTR", ldtr);

                store_tr(tr);
                show_ldttss(&gdt, "TR", tr);
        }

        dump_pagetable(address);
}

/*
 * Report a corrupted page table found while handling a fault at
 * @address, dump the page-table entries and die with "Bad pagetable".
 *
 * oops_end() is handed SIGKILL, or 0 when __die() returns non-zero.
 */
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        unsigned long flags = oops_begin();
        int sig;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);

        sig = __die("Bad pagetable", regs, error_code) ? 0 : SIGKILL;

        oops_end(flags, regs, sig);
}

static void set_signal_archinfo(unsigned long address,
                                unsigned long error_code)
{
        struct task_struct *tsk = current;

        /*
         * To avoid leaking information about the kernel page
         * table layout, pretend that user-mode accesses to
         * kernel addresses are always protection faults.
         *
         * NB: This means that failed vsyscalls with vsyscall=none
         * will have the PROT bit.  This doesn't leak any
         * information and does not appear to cause any problems.
         */
        if (address >= TASK_SIZE_MAX)
                error_code |= X86_PF_PROT;

        tsk->thread.trap_nr = X86_TRAP_PF;
        tsk->thread.error_code = error_code | X86_PF_USER;
        tsk->thread.cr2 = address;
}

/*
 * Handle a page fault for which no user-visible signal can be delivered.
 *
 * For kernel-mode faults the recovery paths are tried in this order:
 * the exception fixup table, the vmap-stack overflow check (when
 * CONFIG_VMAP_STACK is set), the prefetch erratum check, and the EFI
 * recovery hook.  If none of them ends the handling, or if the fault
 * came from user mode, the function oopses.
 *
 * @signal and @si_code are accepted but not used in this body.
 */
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int signal, int si_code)
{
        struct task_struct *tsk = current;
        unsigned long flags;
        int sig;

        if (user_mode(regs)) {
                /*
                 * This is an implicit supervisor-mode access from user
                 * mode.  Bypass all the kernel-mode recovery code and just
                 * OOPS.
                 */
                goto oops;
        }

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
                return;

#ifdef CONFIG_VMAP_STACK
        /*
         * Stack overflow?  During boot, we can fault near the initial
         * stack in the direct map, but that's not an overflow -- check
         * that we're in vmalloc space to avoid this.
         *
         * The two unsigned comparisons match an address within one page
         * below the bottom of the task stack, or within one page starting
         * at its top (tsk->stack + THREAD_SIZE).
         */
        if (is_vmalloc_addr((void *)address) &&
            (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
             address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
                unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
                /*
                 * We're likely to be running with very little stack space
                 * left.  It's plausible that we'd hit this condition but
                 * double-fault even before we get this far, in which case
                 * we're fine: the double-fault handler will deal with it.
                 *
                 * We don't want to make it all the way into the oops code
                 * and then double-fault, though, because we're likely to
                 * break the console driver and lose most of the stack dump.
                 */
                /*
                 * Switch to the stack computed above, then call
                 * handle_stack_overflow() with the message, regs and
                 * address in the registers named by the "D", "S" and "d"
                 * constraints.  The trailing jump-to-self loops forever
                 * should the call ever return.
                 */
                asm volatile ("movq %[stack], %%rsp\n\t"
                              "call handle_stack_overflow\n\t"
                              "1: jmp 1b"
                              : ASM_CALL_CONSTRAINT
                              : "D" ("kernel stack overflow (page fault)"),
                                "S" (regs), "d" (address),
                                [stack] "rm" (stack));
                unreachable();
        }
#endif

        /*
         * 32-bit:
         *
         *   Valid to do another page fault here, because if this fault
         *   had been triggered by is_prefetch fixup_exception would have
         *   handled it.
         *
         * 64-bit:
         *
         *   Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        /*
         * Buggy firmware could access regions which might page fault, try to
         * recover from such faults.
         */
        if (IS_ENABLED(CONFIG_EFI))
                efi_recover_from_page_fault(address);

oops:
        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        if (task_stack_end_corrupted(tsk))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        /* oops_end() gets SIGKILL unless __die() returns non-zero. */
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_DEFAULT "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set (the caller checks the sysctl).
 *
 * Nothing is printed when @tsk handles SIGSEGV itself or when the printk
 * rate limit has been hit.  Tasks with a pid of 1 or below are logged at
 * KERN_EMERG, all others at KERN_INFO.
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        const char *loglvl;

        /* The rate limiter is only consulted for unhandled signals. */
        if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
                return;

        loglvl = (task_pid_nr(tsk) > 1) ? KERN_INFO : KERN_EMERG;

        printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
                loglvl, tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);

        /* Continue the same line with the mapping that contains the IP. */
        print_vma_addr(KERN_CONT " in ", regs->ip);

        printk(KERN_CONT "\n");

        show_opcodes(regs, loglvl);
}

/*
 * Handle a fault on an address that has no valid mapping or permission,
 * for callers that do not hold (or have already dropped) the mmap lock.
 *
 * Explicit user-mode accesses get exactly one SIGSEGV: via
 * force_sig_pkuerr() carrying @pkey when @si_code is SEGV_PKUERR, and
 * via force_sig_fault() with @si_code otherwise.  Everything else goes
 * through the f00f workaround and then no_context().
 */
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, u32 pkey, int si_code)
{
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
        if (user_mode(regs) && (error_code & X86_PF_USER)) {
                /*
                 * It's possible to have interrupts off here:
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space:
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

                /*
                 * To avoid leaking information about the kernel page table
                 * layout, pretend that user-mode accesses to kernel addresses
                 * are always protection faults.  This is done here as well
                 * as in set_signal_archinfo() so that the logged error code
                 * below is sanitized too.
                 */
                if (address >= TASK_SIZE_MAX)
                        error_code |= X86_PF_PROT;

                if (likely(show_unhandled_signals))
                        show_signal_msg(regs, error_code, address, tsk);

                set_signal_archinfo(address, error_code);

                /*
                 * Send one signal only.  Without the else, a pkey fault
                 * would force SIGSEGV twice: once with the pkey
                 * information and once more through force_sig_fault().
                 */
                if (si_code == SEGV_PKUERR)
                        force_sig_pkuerr((void __user *)address, pkey);
                else
                        force_sig_fault(SIGSEGV, si_code, (void __user *)address);

                local_irq_disable();

                return;
        }

        if (is_f00f_bug(regs, address))
                return;

        no_context(regs, error_code, address, SIGSEGV, si_code);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

/*
 * Drop the mmap read lock of the current mm, then report the bad access
 * through __bad_area_nosemaphore() with the given @pkey and @si_code.
 */
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, u32 pkey, int si_code)
{
        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        mmap_read_unlock(current->mm);

        __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}

/*
 * Tell whether an access error should be attributed to protection keys.
 *
 * Returns false when the CPU feature X86_FEATURE_OSPKE is absent.
 * Otherwise returns true when the hardware flagged a protection-key
 * fault (X86_PF_PK) or when arch_vma_access_permitted() refuses the
 * access for @vma, and false in all remaining cases.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
                struct vm_area_struct *vma)
{
        if (!boot_cpu_has(X86_FEATURE_OSPKE))
                return false;

        if (error_code & X86_PF_PK)
                return true;

        /*
         * This checks permission keys on the VMA.  The code is always
         * called on the current mm, hence "foreign" is false.
         */
        return !arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                          (error_code & X86_PF_INSTR), false);
}

/*
 * Report an access that hit a mapping but violated its permissions.
 *
 * Protection-key violations are reported as SEGV_PKUERR together with
 * the pkey of @vma; everything else is reported as SEGV_ACCERR.
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address, struct vm_area_struct *vma)
{
        /*
         * The OSPKE check inside bad_area_access_from_pkeys() is not
         * strictly necessary at runtime.  But, doing it this way allows
         * compiler optimizations if pkeys are compiled out.
         */
        if (!bad_area_access_from_pkeys(error_code, vma)) {
                __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
                return;
        }

        /*
         * A protection key fault means that the PKRU value did not allow
         * access to some PTE.  Userspace can figure out what PKRU was
         * from the XSAVE state.  The pkey is captured from the vma here
         * and passed to userspace so userspace can discover which
         * protection key was set on the PTE.
         *
         * If we get here, we know that the hardware signaled a X86_PF_PK
         * fault and that there was a VMA once we got in the fault
         * handler.  It does *not* guarantee that the VMA we find here
         * was the one that we faulted on.
         *
         * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
         * 2. T1   : set PKRU to deny access to pkey=4, touches page
         * 3. T1   : faults...
         * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
         * 5. T1   : enters fault handler, takes mmap_lock, etc...
         * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
         *             faulted on a pte with its pkey=4.
         */
        __bad_area(regs, error_code, address, vma_pkey(vma), SEGV_PKUERR);
}

/*
 * Turn a failed fault on @address into a SIGBUS.
 *
 * Faults without X86_PF_USER set are handed to no_context().  For user
 * faults the signal is either a machine-check style SIGBUS (when
 * CONFIG_MEMORY_FAILURE is enabled and @fault carries a HWPOISON bit) or
 * a plain BUS_ADRERR.
 */
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          vm_fault_t fault)
{
        const bool from_user = error_code & X86_PF_USER;

        /* Kernel mode? Handle exceptions or die: */
        if (!from_user) {
                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                /* Reported with the signal via force_sig_mceerr(). */
                unsigned lsb = 0;

                pr_err(
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        current->comm, current->pid, address);

                /*
                 * Keep this order: when both bits are set, the
                 * PAGE_SHIFT value from VM_FAULT_HWPOISON wins.
                 */
                if (fault & VM_FAULT_HWPOISON_LARGE)
                        lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
                if (fault & VM_FAULT_HWPOISON)
                        lsb = PAGE_SHIFT;

                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
                return;
        }
#endif

        force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

/*
 * Check whether the page table entry at @pte permits the access described
 * by @error_code.  Returns 0 when a write hit a non-writable entry or an
 * instruction fetch hit a non-executable entry, 1 otherwise.
 */
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
        if (((error_code & X86_PF_WRITE) && !pte_write(*pte)) ||
            ((error_code & X86_PF_INSTR) && !pte_exec(*pte)))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permission than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * @error_code: hardware page fault error code
 * @address:    faulting address; looked up in init_mm's page tables
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /*
         * Only writes to RO or instruction fetches from NX may cause
         * spurious faults.
         *
         * These could be from user or supervisor accesses but the TLB
         * is only lazily flushed after a kernel mapping protection
         * change, so user accesses are not expected to cause spurious
         * faults.
         *
         * Note the exact comparison: any additional error code bit
         * makes the fault non-spurious.
         */
        if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
            error_code != (X86_PF_INSTR | X86_PF_PROT))
                return 0;

        /*
         * Walk the reference page tables top-down.  A non-present entry
         * at any level means the fault is not spurious; a large entry
         * ends the walk and is checked as if it were a PTE.
         */
        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return 0;

        if (p4d_large(*p4d))
                return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_kernel_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        /* The PTE itself must permit the faulting access. */
        ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

/*
 * Global flag, enabled (1) by default.
 * NOTE(review): presumably gates logging of unhandled signals; its
 * readers are outside this part of the file - confirm before relying
 * on that.
 */
int show_unhandled_signals = 1;

/*
 * Decide whether the access described by @error_code is forbidden by
 * @vma.  Returns 1 if the fault must be treated as an access error and
 * 0 if it may be passed on to be resolved.
 */
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
        /* We are only ever called for the current mm. */
        const bool foreign = false;

        /*
         * A read or write blocked by protection keys is always an
         * unconditional error; no follow-up action (such as a COW) can
         * resolve it.
         */
        if (error_code & X86_PF_PK)
                return 1;

        /*
         * Consult the VMA as well, so that we do not fill in a page
         * only to hit a X86_PF_PK fault on it right afterwards.
         */
        if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                       (error_code & X86_PF_INSTR), foreign))
                return 1;

        /* Writes (present or not) are decided by VM_WRITE alone. */
        if (error_code & X86_PF_WRITE)
                return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;

        /* Read of a present page: a protection violation. */
        if (unlikely(error_code & X86_PF_PROT))
                return 1;

        /* Read of a non-present page: the VMA must be accessible. */
        return unlikely(!vma_is_accessible(vma)) ? 1 : 0;
}

/*
 * Return true if @address belongs to the kernel part of the address
 * space, i.e. it is at or above TASK_SIZE_MAX and is not the vsyscall
 * page.
 */
bool fault_in_kernel_space(unsigned long address)
{
        /*
         * On 64-bit systems, the vsyscall page is at an address above
         * TASK_SIZE_MAX, but is not considered part of the kernel
         * address space.
         */
        const bool in_vsyscall = IS_ENABLED(CONFIG_X86_64) &&
                                 is_vsyscall_vaddr(address);

        return !in_vsyscall && address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 *
 * @regs:          register state at the time of the fault
 * @hw_error_code: hardware page fault error code
 * @address:       faulting address
 *
 * The checks run in a fixed order: on 32-bit, the vmalloc demand fault;
 * then the spurious-fault check; then kprobes; and finally
 * bad_area_nosemaphore() for everything that remains.
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
                   unsigned long address)
{
        /*
         * Protection keys exceptions only happen on user pages.  We
         * have no user pages in the kernel portion of the address
         * space, so do not expect them here.
         */
        WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
        /*
         * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * Before doing this on-demand faulting, ensure that the
         * fault is not any of the following:
         * 1. A fault on a PTE with a reserved bit set.
         * 2. A fault caused by a user-mode access.  (Do not demand-
         *    fault kernel memory due to user-mode accesses).
         * 3. A fault caused by a page-level protection violation.
         *    (A demand fault would be on a non-present page which
         *     would have X86_PF_PROT==0).
         *
         * This is only needed to close a race condition on x86-32 in
         * the vmalloc mapping/unmapping code. See the comment above
         * vmalloc_fault() for details. On x86-64 the race does not
         * exist as the vmalloc mappings don't need to be synchronized
         * there.
         */
        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
                /* A non-negative return means the fault was handled. */
                if (vmalloc_fault(address) >= 0)
                        return;
        }
#endif

        /* Was the fault spurious, caused by lazy TLB invalidation? */
        if (spurious_kernel_fault(hw_error_code, address))
                return;

        /* kprobes don't want to hook the spurious faults: */
        if (kprobe_page_fault(regs, X86_TRAP_PF))
                return;

        /*
         * Note, despite being a "bad area", there are quite a few
         * acceptable reasons to get here, such as erratum fixups
         * and handling kernel code that can fault, like get_user().
         *
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.
 *
 * @regs:       register state at the time of the fault
 * @error_code: hardware page fault error code
 * @address:    faulting address
 *
 * Outline:
 *  1. Reject faults that must never reach the VMA lookup (kernel
 *     instruction fetch from user memory, kprobes, SMAP violations,
 *     faults taken with the fault handler disabled or without an mm).
 *  2. Possibly enable interrupts, build the FAULT_FLAG_* set and, on
 *     64-bit, try vsyscall emulation.
 *  3. Take mmap_lock, find (or grow) the VMA and check access rights.
 *  4. Call handle_mm_fault(), retrying if VM_FAULT_RETRY is returned,
 *     and translate any remaining error into a signal or OOM handling.
 *
 * Interrupts may be left enabled on return.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
                        unsigned long error_code,
                        unsigned long address)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        vm_fault_t fault;
        unsigned int flags = FAULT_FLAG_DEFAULT;

        tsk = current;
        mm = tsk->mm;

        /* X86_PF_INSTR set without X86_PF_USER: a kernel-mode fetch. */
        if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
                /*
                 * Whoops, this is kernel mode code trying to execute from
                 * user memory.  Unless this is AMD erratum #93, which
                 * corrupts RIP such that it looks like a user address,
                 * this is unrecoverable.  Don't even try to look up the
                 * VMA.
                 */
                if (is_errata93(regs, address))
                        return;

                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Reserved bits are never expected to be set on
         * entries in the user portion of the page tables.
         */
        if (unlikely(error_code & X86_PF_RSVD))
                pgtable_bad(regs, error_code, address);

        /*
         * If SMAP is on, check for invalid kernel (supervisor) access to user
         * pages in the user address space.  The odd case here is WRUSS,
         * which, according to the preliminary documentation, does not respect
         * SMAP and will have the USER bit set so, in all cases, SMAP
         * enforcement appears to be consistent with the USER bit.
         */
        if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
                     !(error_code & X86_PF_USER) &&
                     !(regs->flags & X86_EFLAGS_AC)))
        {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * If we're in an interrupt, have no user context or are running
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet:
         */
        if (user_mode(regs)) {
                local_irq_enable();
                flags |= FAULT_FLAG_USER;
        } else {
                /* Only re-enable irqs if the faulting context had them on. */
                if (regs->flags & X86_EFLAGS_IF)
                        local_irq_enable();
        }

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        /* Translate the hardware error code bits into FAULT_FLAG_* bits. */
        if (error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
        /*
         * Faults in the vsyscall page might need emulation.  The
         * vsyscall page is at a high address (>PAGE_OFFSET), but is
         * considered to be part of the user address space.
         *
         * The vsyscall page does not have a "real" VMA, so do this
         * emulation before we go searching for VMAs.
         *
         * PKRU never rejects instruction fetches, so we don't need
         * to consider the PF_PK bit.
         */
        if (is_vsyscall_vaddr(address)) {
                if (emulate_vsyscall(error_code, regs, address))
                        return;
        }
#endif

        /*
         * Kernel-mode access to the user address space should only occur
         * on well-defined single instructions listed in the exception
         * tables.  But, an erroneous kernel fault occurring outside one of
         * those areas which also holds mmap_lock might deadlock attempting
         * to validate the fault against the address space.
         *
         * Only do the expensive exception table search when we might be at
         * risk of a deadlock.  This happens if we
         * 1. Failed to acquire mmap_lock, and
         * 2. The access did not originate in userspace.
         */
        if (unlikely(!mmap_read_trylock(mm))) {
                if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
                        /*
                         * Fault from code in kernel from
                         * which we do not expect faults.
                         */
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
                /*
                 * Also the re-entry point after VM_FAULT_RETRY (see the
                 * "goto retry" below): the lock is taken again with the
                 * blocking variant.
                 */
retry:
                mmap_read_lock(mm);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in
                 * which case we'll have missed the might_sleep() from
                 * down_read():
                 */
                might_sleep();
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        /* @address lies below the VMA found: only a VM_GROWSDOWN VMA may be expanded. */
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address, vma);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
         *
         * Note that handle_userfault() may also release and reacquire mmap_lock
         * (and not return with VM_FAULT_RETRY), when returning to userland to
         * repeat the page fault later with a VM_FAULT_NOPAGE retval
         * (potentially after handling any pending signal during the return to
         * userland). The return to userland is identified whenever
         * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
         */
        fault = handle_mm_fault(vma, address, flags, regs);

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        no_context(regs, error_code, address, SIGBUS,
                                   BUS_ADRERR);
                return;
        }

        /*
         * If we need to retry the mmap_lock has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely((fault & VM_FAULT_RETRY) &&
                     (flags & FAULT_FLAG_ALLOW_RETRY))) {
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        mmap_read_unlock(mm);
        /* Common case: the fault was resolved without error. */
        if (likely(!(fault & VM_FAULT_ERROR)))
                return;

        /* Fatal signal pending on a fault without X86_PF_USER set. */
        if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
                no_context(regs, error_code, address, 0, 0);
                return;
        }

        if (fault & VM_FAULT_OOM) {
                /* Kernel mode? Handle exceptions or die: */
                if (!(error_code & X86_PF_USER)) {
                        no_context(regs, error_code, address,
                                   SIGSEGV, SEGV_MAPERR);
                        return;
                }

                /*
                 * We ran out of memory, call the OOM killer, and return the
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed):
                 */
                pagefault_out_of_memory();
        } else {
                /* Any VM_FAULT_ERROR bit not handled below is a bug. */
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
                        bad_area_nosemaphore(regs, error_code, address);
                else
                        BUG();
        }

        check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);

/*
 * Emit the page fault tracepoint matching the faulting context (user or
 * kernel), but only while page fault tracing is enabled.
 */
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address)
{
        if (trace_pagefault_enabled()) {
                if (user_mode(regs))
                        trace_page_fault_user(address, regs, error_code);
                else
                        trace_page_fault_kernel(address, regs, error_code);
        }
}

/*
 * Trace the fault and dispatch it to the kernel-address or user-address
 * handler.  Interrupts are disabled again after the user-address handler
 * returns.
 */
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
{
        trace_page_fault_entries(regs, error_code, address);

        /* Stop here if kmmio_fault() returns non-zero for this fault. */
        if (unlikely(kmmio_fault(regs, address)))
                return;

        /* Was the fault on kernel-controlled part of the address space? */
        if (unlikely(fault_in_kernel_space(address))) {
                do_kern_addr_fault(regs, error_code, address);
                return;
        }

        do_user_addr_fault(regs, error_code, address);

        /*
         * The user address fault path might have reenabled interrupts.
         * Fixing up every potential exit point of do_user_addr_fault()
         * and its leaf functions is not doable without creating an
         * unholy mess or turning the code upside down, so interrupts
         * are unconditionally disabled here instead.
         */
        local_irq_disable();
}

/*
 * #PF exception entry point.  Reads the faulting address from CR2, lets
 * KVM consume asynchronous page fault events, and otherwise runs
 * handle_page_fault() between irqentry_enter() and irqentry_exit().
 */
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
        /* CR2 is read first, before anything else runs in this handler. */
        unsigned long address = read_cr2();
        irqentry_state_t state;

        prefetchw(&current->mm->mmap_lock);

        /*
         * KVM uses #PF vector to deliver 'page not present' events to guests
         * (asynchronous page fault mechanism). The event happens when a
         * userspace task is trying to access some valid (from guest's point of
         * view) memory which is not currently mapped by the host (e.g. the
         * memory is swapped out). Note, the corresponding "page ready" event
         * which is injected when the memory becomes available, is delivered via
         * an interrupt mechanism and not a #PF exception
         * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
         *
         * We are relying on the interrupted context being sane (valid RSP,
         * relevant locks not held, etc.), which is fine as long as the
         * interrupted context had IF=1.  We are also relying on the KVM
         * async pf type field and CR2 being read consistently instead of
         * getting values from real and async page faults mixed up.
         *
         * Fingers crossed.
         *
         * The async #PF handling code takes care of idtentry handling
         * itself.
         */
        if (kvm_handle_async_pf(regs, (u32)address))
                return;

        /*
         * Entry handling for valid #PF from kernel mode is slightly
         * different: RCU is already watching and rcu_irq_enter() must not
         * be invoked because a kernel fault on a user space address might
         * sleep.
         *
         * In case the fault hit a RCU idle region the conditional entry
         * code reenabled RCU to avoid subsequent wreckage which helps
         * debuggability.
         */
        state = irqentry_enter(regs);

        instrumentation_begin();
        handle_page_fault(regs, error_code, address);
        instrumentation_end();

        irqentry_exit(regs, state);
}























    2 



































    2 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:        the requested cpu
 * @pwq:        pointer to struct pool_workqueue
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (i.e. once the delay
 * has been reached).
 *
 * The record stores the work pointer, its callback (work->func), the
 * owning workqueue (pwq->wq), the requested cpu and the cpu of the
 * pool the work was queued on (pwq->pool->cpu).
 */
TRACE_EVENT(workqueue_queue_work,

        TP_PROTO(unsigned int req_cpu, struct pool_workqueue *pwq,
                 struct work_struct *work),

        TP_ARGS(req_cpu, pwq, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __field( void *,        workqueue)
                __field( unsigned int,        req_cpu        )
                __field( unsigned int,        cpu        )
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __entry->workqueue        = pwq->wq;
                __entry->req_cpu        = req_cpu;
                __entry->cpu                = pwq->pool->cpu;
        ),

        TP_printk("work struct=%p function=%ps workqueue=%p req_cpu=%u cpu=%u",
                  __entry->work, __entry->function, __entry->workqueue,
                  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 *
 * Only the work pointer is recorded.
 */
TRACE_EVENT(workqueue_activate_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
        ),

        TP_fast_assign(
                __entry->work                = work;
        ),

        TP_printk("work struct %p", __entry->work)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:        pointer to struct work_struct
 *
 * Allows tracking of workqueue execution.
 *
 * The record stores the work pointer and its callback, read from
 * work->func when the event fires.
 */
TRACE_EVENT(workqueue_execute_start,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:        pointer to struct work_struct
 * @function:   pointer to worker function
 *
 * Allows tracking of workqueue execution.
 *
 * Unlike workqueue_execute_start, the callback is passed in explicitly
 * as @function; @work is only stored as a pointer value and is not
 * dereferenced here.
 */
TRACE_EVENT(workqueue_execute_end,

        TP_PROTO(struct work_struct *work, work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /*  _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGALLLC_TRACK_H
#define _LINUX_PGALLLC_TRACK_H

#if defined(CONFIG_MMU)
/*
 * Return the p4d entry for @address under @pgd, allocating the p4d level
 * first if *@pgd is empty.  When an allocation is performed,
 * PGTBL_PGD_MODIFIED is set in *@mod_mask.  Returns NULL if the
 * allocation fails.
 */
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        /* Fast path: the level below @pgd is already populated. */
        if (likely(!pgd_none(*pgd)))
                return p4d_offset(pgd, address);

        if (__p4d_alloc(mm, pgd, address))
                return NULL;

        *mod_mask |= PGTBL_PGD_MODIFIED;
        return p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address under @p4d, allocating the pud level
 * first if *@p4d is empty.  When an allocation is performed,
 * PGTBL_P4D_MODIFIED is set in *@mod_mask.  Returns NULL if the
 * allocation fails.
 */
static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        /* Fast path: the level below @p4d is already populated. */
        if (likely(!p4d_none(*p4d)))
                return pud_offset(p4d, address);

        if (__pud_alloc(mm, p4d, address))
                return NULL;

        *mod_mask |= PGTBL_P4D_MODIFIED;
        return pud_offset(p4d, address);
}

/*
 * Return the pmd entry for @address under @pud, allocating the pmd level
 * first if *@pud is empty.  When an allocation is performed,
 * PGTBL_PUD_MODIFIED is set in *@mod_mask.  Returns NULL if the
 * allocation fails.
 */
static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        /* Fast path: the level below @pud is already populated. */
        if (likely(!pud_none(*pud)))
                return pmd_offset(pud, address);

        if (__pmd_alloc(mm, pud, address))
                return NULL;

        *mod_mask |= PGTBL_PUD_MODIFIED;
        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/*
 * pte_alloc_kernel_track - pte counterpart of the *_alloc_track() helpers.
 *
 * If *(pmd) is empty, call __pte_alloc_kernel(pmd).  A non-zero return
 * from it makes the whole expression evaluate to NULL; otherwise
 * PGTBL_PMD_MODIFIED is OR-ed into *(mask).  In every non-failing case
 * the result is pte_offset_kernel(pmd, address).
 *
 * This is a macro: @pmd is evaluated more than once, so do not pass an
 * expression with side effects.
 */
#define pte_alloc_kernel_track(pmd, address, mask)                        \
        ((unlikely(pmd_none(*(pmd))) &&                                        \
          (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
                NULL: pte_offset_kernel(pmd, address))

#endif /* _LINUX_PGALLLC_TRACK_H */













































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the TCP protocol.
 *
 * Version:        @(#)tcp.h        1.0.2        04/28/93
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H


#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <uapi/linux/tcp.h>

/* Return @skb's transport header, viewed as a TCP header. */
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
        void *transport = skb_transport_header(skb);

        return (struct tcphdr *)transport;
}

/* Return the header length of @th: its doff field multiplied by 4. */
static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
{
        const unsigned int doff = th->doff;

        return doff * 4;
}

/* Return the length of the TCP header found at @skb's transport header. */
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);

        return __tcp_hdrlen(th);
}

/* Return @skb's inner transport header, viewed as a TCP header. */
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
{
        void *transport = skb_inner_transport_header(skb);

        return (struct tcphdr *)transport;
}

/* Return the length of the TCP header at @skb's inner transport header. */
static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
{
        const struct tcphdr *th = inner_tcp_hdr(skb);

        return __tcp_hdrlen(th);
}

static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
        return (tcp_hdr(skb)->doff - 5) * 4;
}

/* TCP Fast Open */
#define TCP_FASTOPEN_COOKIE_MIN        4        /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX        16        /* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8        /* the size employed by this impl. */

/*
 * TCP Fast Open Cookie as stored in memory.
 *
 * @val is sized in 64-bit words so that it can hold up to
 * TCP_FASTOPEN_COOKIE_MAX bytes.
 */
struct tcp_fastopen_cookie {
        __le64        val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
        /* NOTE(review): presumably the cookie length in bytes; the type is signed - confirm what negative values mean. */
        s8        len;
        bool        exp;        /* In RFC6994 experimental option format */
};

/*
 * A selective acknowledgement block with both sequence numbers held as
 * big-endian (__be32) values.
 */
struct tcp_sack_block_wire {
        __be32        start_seq;
        __be32        end_seq;
};

/*
 * A selective acknowledgement block with both sequence numbers held as
 * plain u32 values (compare struct tcp_sack_block_wire, which uses
 * __be32).
 */
struct tcp_sack_block {
        u32        start_seq;
        u32        end_seq;
};

/* These are used to set the sack_ok field in struct tcp_options_received. */
#define TCP_SACK_SEEN     (1 << 0)   /* 1 = peer is SACK capable */
#define TCP_DSACK_SEEN    (1 << 2)   /* 1 = DSACK was received from peer */

/* State derived from received TCP options; see the per-field notes. */
struct tcp_options_received {
/*        PAWS/RTTM data        */
        int        ts_recent_stamp;/* Time we stored ts_recent (for aging) */
        u32        ts_recent;        /* Time stamp to echo next                */
        u32        rcv_tsval;        /* Time stamp value                     */
        u32        rcv_tsecr;        /* Time stamp echo reply                */
        u16         saw_tstamp : 1,        /* Saw TIMESTAMP on last packet                */
                tstamp_ok : 1,        /* TIMESTAMP seen on SYN packet                */
                dsack : 1,        /* D-SACK is scheduled                        */
                wscale_ok : 1,        /* Wscale seen on SYN packet                */
                sack_ok : 3,        /* SACK seen on SYN packet; holds TCP_*SACK_SEEN bits */
                smc_ok : 1,        /* SMC seen on SYN packet                */
                snd_wscale : 4,        /* Window scaling received from sender        */
                rcv_wscale : 4;        /* Window scaling to send to receiver        */
        u8        saw_unknown:1,        /* Received unknown option                */
                unused:7;        /* Remaining bits of this byte, currently unused */
        u8        num_sacks;        /* Number of SACK blocks                */
        u16        user_mss;        /* mss requested by user in ioctl        */
        u16        mss_clamp;        /* Maximal mss, negotiated at connection setup */
};

static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
        /* Forget the negotiated timestamp, SACK and window-scale state. */
        rx_opt->sack_ok = 0;
        rx_opt->tstamp_ok = 0;
        rx_opt->snd_wscale = 0;
        rx_opt->wscale_ok = 0;
#if IS_ENABLED(CONFIG_SMC)
        /* The SMC flag is only cleared when SMC support is configured in. */
        rx_opt->smc_ok = 0;
#endif
}

/* This is the max number of SACKS that we'll generate and process. It's safe
 * to increase this, although since:
 *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
 * only four options will fit in a standard TCP header
 */
#define TCP_NUM_SACKS 4

/* Only used through a pointer below, so a forward declaration suffices. */
struct tcp_request_sock_ops;

/* TCP request socket.  The struct inet_request_sock is the first member;
 * tcp_rsk() relies on casting a struct request_sock pointer to this type.
 */
struct tcp_request_sock {
        struct inet_request_sock         req;
        const struct tcp_request_sock_ops *af_specific;
        u64                                snt_synack; /* first SYNACK sent time */
        bool                                tfo_listener;
        bool                                is_mptcp;
#if IS_ENABLED(CONFIG_MPTCP)
        bool                                drop_req;        /* only present with CONFIG_MPTCP */
#endif
        u32                                txhash;
        u32                                rcv_isn;
        u32                                snt_isn;
        u32                                ts_off;
        u32                                last_oow_ack_time; /* last SYNACK */
        u32                                rcv_nxt; /* the ack # by SYNACK. For
                                                  * FastOpen it's the seq#
                                                  * after data-in-SYN.
                                                  */
        u8                                syn_tos;
};

static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
        /* Pure pointer reinterpretation; the cast also drops @req's const. */
        struct tcp_request_sock *treq = (struct tcp_request_sock *)req;

        return treq;
}

/*
 * TCP socket state.  The embedded inet_connection_sock must remain the
 * first member (see the note on inet_conn below); tcp_sk() relies on
 * casting a struct sock pointer to this type.
 */
struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock        inet_conn;
        u16        tcp_header_len;        /* Bytes of tcp header to send                */
        u16        gso_segs;        /* Max number of segs per GSO packet        */

/*
 *        Header prediction flags
 *        0x5?10 << 16 + snd_wnd in net byte order
 */
        __be32        pred_flags;

/*
 *        RFC793 variables by their proper names. This means you can
 *        read the code and the spec side by side (and laugh ...)
 *        See RFC793 and RFC1122. The RFC writes these in capitals.
 */
        u64        bytes_received;        /* RFC4898 tcpEStatsAppHCThruOctetsReceived
                                 * sum(delta(rcv_nxt)), or how many bytes
                                 * were acked.
                                 */
        u32        segs_in;        /* RFC4898 tcpEStatsPerfSegsIn
                                 * total number of segments in.
                                 */
        u32        data_segs_in;        /* RFC4898 tcpEStatsPerfDataSegsIn
                                 * total number of data segments in.
                                 */
        u32        rcv_nxt;        /* What we want to receive next         */
        u32        copied_seq;        /* Head of yet unread data                */
        u32        rcv_wup;        /* rcv_nxt on last window update sent        */
        u32        snd_nxt;        /* Next sequence we send                */
        u32        segs_out;        /* RFC4898 tcpEStatsPerfSegsOut
                                 * The total number of segments sent.
                                 */
        u32        data_segs_out;        /* RFC4898 tcpEStatsPerfDataSegsOut
                                 * total number of data segments sent.
                                 */
        u64        bytes_sent;        /* RFC4898 tcpEStatsPerfHCDataOctetsOut
                                 * total number of data bytes sent.
                                 */
        u64        bytes_acked;        /* RFC4898 tcpEStatsAppHCThruOctetsAcked
                                 * sum(delta(snd_una)), or how many bytes
                                 * were acked.
                                 */
        u32        dsack_dups;        /* RFC4898 tcpEStatsStackDSACKDups
                                 * total number of DSACK blocks received
                                 */
        u32        snd_una;        /* First byte we want an ack for        */
        u32        snd_sml;        /* Last byte of the most recently transmitted small packet */
        u32        rcv_tstamp;        /* timestamp of last received ACK (for keepalives) */
        u32        lsndtime;        /* timestamp of last sent data packet (for restart window) */
        u32        last_oow_ack_time;  /* timestamp of last out-of-window ACK */
        u32        compressed_ack_rcv_nxt;

        u32        tsoffset;        /* timestamp offset */

        struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
        struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */

        u32        snd_wl1;        /* Sequence for window update                */
        u32        snd_wnd;        /* The window we expect to receive        */
        u32        max_window;        /* Maximal window ever seen from peer        */
        u32        mss_cache;        /* Cached effective mss, not including SACKS */

        u32        window_clamp;        /* Maximal window to advertise                */
        u32        rcv_ssthresh;        /* Current window clamp                        */

        /* Information of the most recently (s)acked skb */
        struct tcp_rack {
                u64 mstamp; /* (Re)sent time of the skb */
                u32 rtt_us;  /* Associated RTT */
                u32 end_seq; /* Ending TCP sequence of the skb */
                u32 last_delivered; /* tp->delivered at last reo_wnd adj */
                u8 reo_wnd_steps;   /* Allowed reordering window */
#define TCP_RACK_RECOVERY_THRESH 16
                u8 reo_wnd_persist:5, /* No. of recovery since last adj */
                   dsack_seen:1, /* Whether DSACK seen after last adj */
                   advanced:1;         /* mstamp advanced since last lost marking */
        } rack;
        u16        advmss;                /* Advertised MSS                        */
        u8        compressed_ack;
        u8        dup_ack_counter:2,
                tlp_retrans:1,        /* TLP is a retransmission */
                unused:5;
        u32        chrono_start;        /* Start time in jiffies of a TCP chrono */
        u32        chrono_stat[3];        /* Time in jiffies for chrono_stat stats */
        u8        chrono_type:2,        /* current chronograph type */
                rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
                fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
                fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
                is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
                fastopen_client_fail:2; /* reason why fastopen failed */
        u8        nonagle     : 4,/* Disable Nagle algorithm?             */
                thin_lto    : 1,/* Use linear timeouts for thin streams */
                recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
                repair      : 1,
                frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
        u8        repair_queue;
        u8        save_syn:2,        /* Save headers of SYN packet */
                syn_data:1,        /* SYN includes data */
                syn_fastopen:1,        /* SYN includes Fast Open option */
                syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
                is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
        u32        tlp_high_seq;        /* snd_nxt at the time of TLP */

        u32        tcp_tx_delay;        /* delay (in usec) added to TX packets */
        u64        tcp_wstamp_ns;        /* departure time for next sent data packet */
        u64        tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */

/* RTT measurement */
        u64        tcp_mstamp;        /* most recent packet received/sent */
        u32        srtt_us;        /* smoothed round trip time << 3 in usecs */
        u32        mdev_us;        /* medium deviation                        */
        u32        mdev_max_us;        /* maximal mdev for the last rtt period        */
        u32        rttvar_us;        /* smoothed mdev_max                        */
        u32        rtt_seq;        /* sequence number to update rttvar        */
        struct  minmax rtt_min;

        u32        packets_out;        /* Packets which are "in flight"        */
        u32        retrans_out;        /* Retransmitted packets out                */
        u32        max_packets_out;  /* max packets_out in last window */
        u32        cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */

        u16        urg_data;        /* Saved octet of OOB data and control flags */
        u8        ecn_flags;        /* ECN status bits.                        */
        u8        keepalive_probes; /* num of allowed keep alive probes        */
        u32        reordering;        /* Packet reordering metric.                */
        u32        reord_seen;        /* number of data packet reordering events */
        u32        snd_up;                /* Urgent pointer                */

/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
        struct tcp_options_received rx_opt;

/*
 *        Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
        u32        snd_ssthresh;        /* Slow start size threshold                */
        u32        snd_cwnd;        /* Sending congestion window                */
        u32        snd_cwnd_cnt;        /* Linear increase counter                */
        u32        snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
        u32        snd_cwnd_used;
        u32        snd_cwnd_stamp;
        u32        prior_cwnd;        /* cwnd right before starting loss recovery */
        u32        prr_delivered;        /* Number of newly delivered packets to
                                 * receiver in Recovery. */
        u32        prr_out;        /* Total number of pkts sent during Recovery. */
        u32        delivered;        /* Total data packets delivered incl. rexmits */
        u32        delivered_ce;        /* Like the above but only ECE marked packets */
        u32        lost;                /* Total data packets lost incl. rexmits */
        u32        app_limited;        /* limited until "delivered" reaches this val */
        u64        first_tx_mstamp;  /* start of window send phase */
        u64        delivered_mstamp; /* time we reached "delivered" */
        u32        rate_delivered;    /* saved rate sample: packets delivered */
        u32        rate_interval_us;  /* saved rate sample: time elapsed */

        u32        rcv_wnd;        /* Current receiver window                */
        u32        write_seq;        /* Tail(+1) of data held in tcp send buffer */
        u32        notsent_lowat;        /* TCP_NOTSENT_LOWAT */
        u32        pushed_seq;        /* Last pushed seq, required to talk to windows */
        u32        lost_out;        /* Lost packets                        */
        u32        sacked_out;        /* SACK'd packets                        */

        struct hrtimer        pacing_timer;
        struct hrtimer        compressed_ack_timer;

        /* from STCP, retrans queue hinting */
        struct sk_buff *lost_skb_hint;
        struct sk_buff *retransmit_skb_hint;

        /* OOO segments go in this rbtree. Socket lock must be held. */
        struct rb_root        out_of_order_queue;
        struct sk_buff        *ooo_last_skb; /* cache rb_last(out_of_order_queue) */

        /* SACKs data, these 2 need to be together (see tcp_options_write) */
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

        struct tcp_sack_block recv_sack_cache[4];

        struct sk_buff *highest_sack;   /* skb just after the highest
                                         * skb with SACKed bit set
                                         * (validity guaranteed only if
                                         * sacked_out > 0)
                                         */

        int     lost_cnt_hint;

        u32        prior_ssthresh; /* ssthresh saved at recovery start        */
        u32        high_seq;        /* snd_nxt at onset of congestion        */

        u32        retrans_stamp;        /* Timestamp of the last retransmit,
                                 * also used in SYN-SENT to remember stamp of
                                 * the first SYN. */
        u32        undo_marker;        /* snd_una upon a new recovery episode. */
        int        undo_retrans;        /* number of undoable retransmissions. */
        u64        bytes_retrans;        /* RFC4898 tcpEStatsPerfOctetsRetrans
                                 * Total data bytes retransmitted
                                 */
        u32        total_retrans;        /* Total retransmits for entire connection */

        u32        urg_seq;        /* Seq of received urgent pointer */
        unsigned int                keepalive_time;          /* time before keep alive takes place */
        unsigned int                keepalive_intvl;  /* time interval between keep alive probes */

        int                        linger2;


/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
        u8        bpf_sock_ops_cb_flags;  /* Control calling BPF programs
                                         * values defined in uapi/linux/tcp.h
                                         */
/*
 * Test @ARG against the socket's BPF callback flags.  Both arguments are
 * parenthesized so that pointer expressions and OR-ed flag masks expand
 * with the intended precedence.  Without CONFIG_BPF the test is always 0.
 */
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) ((TP)->bpf_sock_ops_cb_flags & (ARG))
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
#endif

        u16 timeout_rehash;        /* Timeout-triggered rehash attempts */

        u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */

/* Receiver side RTT estimation */
        u32 rcv_rtt_last_tsecr;
        struct {
                u32        rtt_us;
                u32        seq;
                u64        time;
        } rcv_rtt_est;

/* Receiver queue space */
        struct {
                u32        space;
                u32        seq;
                u64        time;
        } rcvq_space;

/* TCP-specific MTU probe information. */
        struct {
                u32                  probe_seq_start;
                u32                  probe_seq_end;
        } mtu_probe;
        u32        mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
                           * while socket was owned by user.
                           */
#if IS_ENABLED(CONFIG_MPTCP)
        bool        is_mptcp;
#endif
#if IS_ENABLED(CONFIG_SMC)
        bool        syn_smc;        /* SYN includes SMC */
#endif

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
        const struct tcp_sock_af_ops        *af_specific;

/* TCP MD5 Signature Option information */
        struct tcp_md5sig_info        __rcu *md5sig_info;
#endif

/* TCP fastopen related information */
        struct tcp_fastopen_request *fastopen_req;
        /* fastopen_rsk points to request_sock that resulted in this big
         * socket. Used to retransmit SYNACKs etc.
         */
        struct request_sock __rcu *fastopen_rsk;
        struct saved_syn *saved_syn;
};

/* Bit numbers; enum tsq_flags turns each one into a (1UL << bit) mask. */
enum tsq_enum {
        TSQ_THROTTLED,
        TSQ_QUEUED,
        TCP_TSQ_DEFERRED,           /* tcp_tasklet_func() found socket was owned */
        TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
        TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
        TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
                                    * tcp_v{4|6}_mtu_reduced()
                                    */
};

/* Mask form of enum tsq_enum: one (1UL << bit) value per bit number. */
enum tsq_flags {
        TSQF_THROTTLED                        = (1UL << TSQ_THROTTLED),
        TSQF_QUEUED                        = (1UL << TSQ_QUEUED),
        TCPF_TSQ_DEFERRED                = (1UL << TCP_TSQ_DEFERRED),
        TCPF_WRITE_TIMER_DEFERRED        = (1UL << TCP_WRITE_TIMER_DEFERRED),
        TCPF_DELACK_TIMER_DEFERRED        = (1UL << TCP_DELACK_TIMER_DEFERRED),
        TCPF_MTU_REDUCED_DEFERRED        = (1UL << TCP_MTU_REDUCED_DEFERRED),
};

static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
        /* Pure pointer reinterpretation; the cast also drops @sk's const. */
        struct tcp_sock *tp = (struct tcp_sock *)sk;

        return tp;
}

/* TCP timewait socket.  The struct inet_timewait_sock is the first member;
 * tcp_twsk() relies on casting a struct sock pointer to this type.
 */
struct tcp_timewait_sock {
        struct inet_timewait_sock tw_sk;
/* Shorthands for two fields stored in tw_sk's common part */
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
#define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt
        u32                          tw_rcv_wnd;
        u32                          tw_ts_offset;
        u32                          tw_ts_recent;

        /* The time we sent the last out-of-window ACK: */
        u32                          tw_last_oow_ack_time;

        int                          tw_ts_recent_stamp;
        u32                          tw_tx_delay;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key          *tw_md5_key;        /* only present with CONFIG_TCP_MD5SIG */
#endif
};

static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
        /* Pure pointer reinterpretation; the cast also drops @sk's const. */
        struct tcp_timewait_sock *tw = (struct tcp_timewait_sock *)sk;

        return tw;
}

static inline bool tcp_passive_fastopen(const struct sock *sk)
{
        return sk->sk_state == TCP_SYN_RECV &&
               rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL;
}

static inline void fastopen_queue_tune(struct sock *sk, int backlog)
{
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
        int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn);
        /* Smaller of @backlog and the somaxconn sysctl, compared as unsigned. */
        unsigned int limit = min_t(unsigned int, backlog, somaxconn);

        WRITE_ONCE(queue->fastopenq.max_qlen, limit);
}

static inline void tcp_move_syn(struct tcp_sock *tp,
                                struct request_sock *req)
{
        /* Hand the saved SYN over from @req to @tp; @req keeps no reference. */
        struct saved_syn *syn = req->saved_syn;

        tp->saved_syn = syn;
        req->saved_syn = NULL;
}

static inline void tcp_saved_syn_free(struct tcp_sock *tp)
{
        struct saved_syn **slot = &tp->saved_syn;

        /* Free the saved SYN, then clear the pointer so it is not reused. */
        kfree(*slot);
        *slot = NULL;
}

static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
{
        /* Total of the three recorded header lengths. */
        u32 total = saved_syn->mac_hdrlen;

        total += saved_syn->network_hdrlen;
        total += saved_syn->tcp_hdrlen;

        return total;
}

/* Declaration only; the definition is not part of this header. */
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
                                               const struct sk_buff *orig_skb);

static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
{
        /* We use READ_ONCE() here because socket might not be locked.
         * This happens for listeners.
         */
        u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);

        /* A zero user_mss means "no user limit"; otherwise it can only lower @mss. */
        if (user_mss == 0 || user_mss >= mss)
                return mss;

        return user_mss;
}

/* Declaration only; the definition is not part of this header. */
int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
                  int shiftlen);

/* Per-socket setters, declared here and defined elsewhere.
 * NOTE(review): the int-returning ones presumably report an error code;
 * confirm against the definitions.
 */
void tcp_sock_set_cork(struct sock *sk, bool on);
int tcp_sock_set_keepcnt(struct sock *sk, int val);
int tcp_sock_set_keepidle_locked(struct sock *sk, int val);
int tcp_sock_set_keepidle(struct sock *sk, int val);
int tcp_sock_set_keepintvl(struct sock *sk, int val);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
void tcp_sock_set_user_timeout(struct sock *sk, u32 val);

#endif        /* _LINUX_TCP_H */



















































































































































































































































































    1 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * include/linux/idr.h
 * 
 * 2002-10-18  written by Jim Houston jim.houston@ccur.com
 *        Copyright (C) 2002 by Concurrent Computer Corporation
 *
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#ifndef __IDR_H__
#define __IDR_H__

#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/percpu.h>

/* An IDR: a radix tree root plus the allocator's base and cyclic cursor. */
struct idr {
        struct radix_tree_root        idr_rt;                /* underlying radix tree */
        unsigned int                idr_base;        /* set by IDR_INIT_BASE() / idr_init_base() */
        unsigned int                idr_next;        /* cursor read/written by idr_{get,set}_cursor() */
};

/*
 * The IDR API does not expose the tagging functionality of the radix tree
 * to users.  Use tag 0 to track whether a node has free space below it.
 */
#define IDR_FREE        0

/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER        (ROOT_IS_IDR | (__force gfp_t)                        \
                                        (1 << (ROOT_TAG_SHIFT + IDR_FREE)))

/*
 * Static initialiser for a struct idr called @name: the radix tree root is
 * set up with IDR_RT_MARKER, idr_base is @base and the cursor starts at 0.
 */
#define IDR_INIT_BASE(name, base) {                                        \
        .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER),                        \
        .idr_base = (base),                                                \
        .idr_next = 0,                                                        \
}

/**
 * IDR_INIT() - Initialise an IDR.
 * @name: Name of IDR.
 *
 * A freshly-initialised IDR contains no IDs.  This is IDR_INIT_BASE()
 * with a base of 0.
 */
#define IDR_INIT(name)        IDR_INIT_BASE(name, 0)

/**
 * DEFINE_IDR() - Define a statically-allocated IDR.
 * @name: Name of IDR.
 *
 * An IDR defined using this macro is ready for use with no additional
 * initialisation required.  It contains no IDs.
 */
#define DEFINE_IDR(name)        struct idr name = IDR_INIT(name)

/**
 * idr_get_cursor - Return the current position of the cyclic allocator
 * @idr: idr handle
 *
 * The value returned is the value that will be next returned from
 * idr_alloc_cyclic() if it is free (otherwise the search will start from
 * this position).
 */
static inline unsigned int idr_get_cursor(const struct idr *idr)
{
        /* Single READ_ONCE() load of the cursor. */
        unsigned int cursor = READ_ONCE(idr->idr_next);

        return cursor;
}

/**
 * idr_set_cursor - Set the current position of the cyclic allocator
 * @idr: idr handle
 * @val: new position
 *
 * The next call to idr_alloc_cyclic() will return @val if it is free
 * (otherwise the search will start from this position).
 */
static inline void idr_set_cursor(struct idr *idr, unsigned int val)
{
        unsigned int *cursor = &idr->idr_next;

        /* Single WRITE_ONCE() store of the new cursor. */
        WRITE_ONCE(*cursor, val);
}

/**
 * DOC: idr sync
 * idr synchronization (stolen from radix-tree.h)
 *
 * idr_find() is able to be called locklessly, using RCU. The caller must
 * ensure calls to this function are made within rcu_read_lock() regions.
 * Other readers (lock-free or otherwise) and modifications may be running
 * concurrently.
 *
 * It is still required that the caller manage the synchronization and
 * lifetimes of the items. So if RCU lock-free lookups are used, typically
 * this would mean that the items have their own locks, or are amenable to
 * lock-free access; and that the items are freed by RCU (or only freed after
 * having been deleted from the idr tree *and* a synchronize_rcu() grace
 * period).
 */

/* Locking helpers: each forwards to the same-named xa_*() call on the
 * IDR's idr_rt member.
 */
#define idr_lock(idr)                xa_lock(&(idr)->idr_rt)
#define idr_unlock(idr)                xa_unlock(&(idr)->idr_rt)
#define idr_lock_bh(idr)        xa_lock_bh(&(idr)->idr_rt)
#define idr_unlock_bh(idr)        xa_unlock_bh(&(idr)->idr_rt)
#define idr_lock_irq(idr)        xa_lock_irq(&(idr)->idr_rt)
#define idr_unlock_irq(idr)        xa_unlock_irq(&(idr)->idr_rt)
#define idr_lock_irqsave(idr, flags) \
                                xa_lock_irqsave(&(idr)->idr_rt, flags)
#define idr_unlock_irqrestore(idr, flags) \
                                xa_unlock_irqrestore(&(idr)->idr_rt, flags)

/* Out-of-line IDR API; declarations only. */
void idr_preload(gfp_t gfp_mask);

int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp);
int __must_check idr_alloc_u32(struct idr *idr, void *ptr, u32 *id,
                               unsigned long max, gfp_t gfp);
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end,
                     gfp_t gfp);
void *idr_remove(struct idr *idr, unsigned long id);
void *idr_find(const struct idr *idr, unsigned long id);
int idr_for_each(const struct idr *idr,
                 int (*fn)(int id, void *p, void *data), void *data);
void *idr_get_next(struct idr *idr, int *nextid);
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid);
void *idr_replace(struct idr *idr, void *ptr, unsigned long id);
void idr_destroy(struct idr *idr);

/**
 * idr_init_base() - Initialise an IDR.
 * @idr: IDR handle.
 * @base: The base value for the IDR.
 *
 * This variation of idr_init() creates an IDR which will allocate IDs
 * starting at %base.
 */
static inline void idr_init_base(struct idr *idr, int base)
{
        /* Fresh radix tree root carrying the IDR marker bits. */
        INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);

        /* Cursor starts at 0; @base is recorded as given. */
        idr->idr_next = 0;
        idr->idr_base = base;
}

/**
 * idr_init() - Initialise an IDR.
 * @idr: IDR handle.
 *
 * Initialise a dynamically allocated IDR.  To initialise a
 * statically allocated IDR, use DEFINE_IDR().
 */
static inline void idr_init(struct idr *idr)
{
        /* Same as idr_init_base() with a base of 0. */
        const int default_base = 0;

        idr_init_base(idr, default_base);
}

/**
 * idr_is_empty() - Are there any IDs allocated?
 * @idr: IDR handle.
 *
 * Return: %true if no IDs are currently allocated from this IDR.
 */
static inline bool idr_is_empty(const struct idr *idr)
{
        return radix_tree_empty(&idr->idr_rt) &&
                radix_tree_tagged(&idr->idr_rt, IDR_FREE);
}

/**
 * idr_preload_end - end preload section started with idr_preload()
 *
 * Each idr_preload() should be matched with an invocation of this
 * function.  See idr_preload() for details.
 */
static inline void idr_preload_end(void)
{
        /* Release the radix_tree_preloads lock; per the comment above, this
         * pairs with an earlier idr_preload() call.
         */
        local_unlock(&radix_tree_preloads.lock);
}

/**
 * idr_for_each_entry() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor
 * @id: Entry ID.
 *
 * The loop itself sets @id to 0 and assigns @entry on every pass, so
 * neither needs to be initialized beforehand.  After normal termination
 * @entry is left with the value NULL, which is convenient as a
 * "not found" value.  @id is advanced with an unsigned increment.
 */
#define idr_for_each_entry(idr, entry, id)                                \
        for ((id) = 0;                                                        \
             ((entry) = idr_get_next((idr), &(id))) != NULL;                \
             (id) += 1U)

/**
 * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.
 *
 * The loop itself zeroes @tmp and @id and assigns @entry on every pass,
 * so none of them needs to be initialized beforehand.  @tmp holds the
 * previous @id; once the incremented @id compares below it, @entry is set
 * to NULL and the loop ends.  After normal termination @entry is left
 * with the value NULL, which is convenient as a "not found" value.
 */
#define idr_for_each_entry_ul(idr, entry, tmp, id)                        \
        for ((tmp) = 0, (id) = 0;                                        \
             ((entry) = (tmp) <= (id) ?                                        \
                        idr_get_next_ul((idr), &(id)) : NULL) != NULL;        \
             (tmp) = (id), ++(id))

/**
 * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @id: Entry ID.
 *
 * Continue to iterate over entries, starting the lookup from the current
 * value of @id.  The loop ends when the lookup yields NULL, so @entry is
 * NULL after normal termination.
 */
#define idr_for_each_entry_continue(idr, entry, id)                        \
        for ((entry) = idr_get_next((idr), &(id));                        \
             (entry);                                                        \
             ++(id), (entry) = idr_get_next((idr), &(id)))

/**
 * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.
 *
 * Continue to iterate over entries, starting the lookup from the current
 * value of @id.  @tmp holds the previous @id; once the incremented @id
 * compares below it, @entry is set to NULL and the loop ends.
 * After normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 */
#define idr_for_each_entry_continue_ul(idr, entry, tmp, id)                \
        for ((tmp) = (id);                                                \
             ((entry) = (tmp) <= (id) ?                                        \
                        idr_get_next_ul((idr), &(id)) : NULL) != NULL;        \
             (tmp) = (id), ++(id))

/*
 * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
 *
 * The three sizes below are derived from one another: chunk size in bytes,
 * the number of longs in that chunk, and the number of bits in those longs.
 */
#define IDA_CHUNK_SIZE                128        /* 128 bytes per chunk */
#define IDA_BITMAP_LONGS        (IDA_CHUNK_SIZE / sizeof(long))
#define IDA_BITMAP_BITS         (IDA_BITMAP_LONGS * sizeof(long) * 8)

/* One IDA chunk: IDA_BITMAP_LONGS longs, i.e. IDA_CHUNK_SIZE bytes. */
struct ida_bitmap {
        unsigned long                bitmap[IDA_BITMAP_LONGS];
};

/* An IDA is a wrapper around a single xarray. */
struct ida {
        struct xarray xa;
};

/* xarray flags shared by the static initialiser and ida_init() */
#define IDA_INIT_FLAGS        (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)

/* Static initialiser for a struct ida called @name */
#define IDA_INIT(name)        {                                                \
        .xa = XARRAY_INIT(name, IDA_INIT_FLAGS)                                \
}
/* Define a statically-initialised struct ida called @name */
#define DEFINE_IDA(name)        struct ida name = IDA_INIT(name)

/* Out-of-line IDA API; declarations only. */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                    gfp_t gfp);
void ida_free(struct ida *ida, unsigned int id);
void ida_destroy(struct ida *ida);

/**
 * ida_alloc() - Allocate an unused ID.
 * @ida: IDA handle.
 * @gfp: Memory allocation flags.
 *
 * Thin wrapper around ida_alloc_range() that asks for the widest possible
 * range: the lower bound is 0 and the upper bound is all-ones.  The ID
 * handed back lies between 0 and %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc(struct ida *ida, gfp_t gfp)
{
        const unsigned int lowest = 0;
        const unsigned int highest = ~0;

        return ida_alloc_range(ida, lowest, highest, gfp);
}

/**
 * ida_alloc_min() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Thin wrapper around ida_alloc_range() with a caller-chosen lower bound
 * and an all-ones upper bound.  The ID handed back lies between @min and
 * %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
{
        const unsigned int highest = ~0;

        return ida_alloc_range(ida, min, highest, gfp);
}

/**
 * ida_alloc_max() - Allocate an unused ID.
 * @ida: IDA handle.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Thin wrapper around ida_alloc_range() with a lower bound of 0 and a
 * caller-chosen upper bound.  The ID handed back lies between 0 and @max,
 * inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
{
        const unsigned int lowest = 0;

        return ida_alloc_range(ida, lowest, max, gfp);
}

/*
 * Run-time counterpart of IDA_INIT(): set up the XArray embedded in @ida
 * with IDA_INIT_FLAGS.
 */
static inline void ida_init(struct ida *ida)
{
        struct xarray *xa = &ida->xa;

        xa_init_flags(xa, IDA_INIT_FLAGS);
}

/*
 * ida_simple_get() and ida_simple_remove() are deprecated. Use
 * ida_alloc() and ida_free() instead respectively.
 *
 * ida_simple_get() forwards (end) - 1 as the maximum of ida_alloc_range(),
 * i.e. its @end argument is exclusive.  ida_simple_remove() is a plain
 * alias for ida_free().
 */
#define ida_simple_get(ida, start, end, gfp)        \
                        ida_alloc_range(ida, start, (end) - 1, gfp)
#define ida_simple_remove(ida, id)        ida_free(ida, id)

/* Report whether the XArray backing @ida is empty, as judged by xa_empty(). */
static inline bool ida_is_empty(const struct ida *ida)
{
        const struct xarray *xa = &ida->xa;

        return xa_empty(xa);
}
#endif /* __IDR_H__ */








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * 25-Jul-1998 Major changes to allow for ip chain table
 *
 * 3-Jan-2000 Named tables to allow packet selection for different uses.
 */

/*
 *         Format of an IP firewall descriptor
 *
 *         src, dst, src_mask, dst_mask are always stored in network byte order.
 *         flags are stored in host byte order (of course).
 *         Port numbers are stored in HOST byte order.
 */

#ifndef _UAPI_IPTABLES_H
#define _UAPI_IPTABLES_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/if.h>
#include <linux/netfilter_ipv4.h>

#include <linux/netfilter/x_tables.h>

/*
 * Userspace-only compatibility aliases (this section is skipped when
 * __KERNEL__ is defined): each legacy ipt_ / IPT_ name maps to the
 * xt_ / XT_ name of the same meaning.
 */
#ifndef __KERNEL__
#define IPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
#define IPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
#define ipt_match xt_match
#define ipt_target xt_target
#define ipt_table xt_table
#define ipt_get_revision xt_get_revision
#define ipt_entry_match xt_entry_match
#define ipt_entry_target xt_entry_target
#define ipt_standard_target xt_standard_target
#define ipt_error_target xt_error_target
#define ipt_counters xt_counters
#define IPT_CONTINUE XT_CONTINUE
#define IPT_RETURN XT_RETURN

/* This group is older than old (iptables < v1.4.0-rc1~89) */
#include <linux/netfilter/xt_tcpudp.h>
#define ipt_udp xt_udp
#define ipt_tcp xt_tcp
#define IPT_TCP_INV_SRCPT        XT_TCP_INV_SRCPT
#define IPT_TCP_INV_DSTPT        XT_TCP_INV_DSTPT
#define IPT_TCP_INV_FLAGS        XT_TCP_INV_FLAGS
#define IPT_TCP_INV_OPTION        XT_TCP_INV_OPTION
#define IPT_TCP_INV_MASK        XT_TCP_INV_MASK
#define IPT_UDP_INV_SRCPT        XT_UDP_INV_SRCPT
#define IPT_UDP_INV_DSTPT        XT_UDP_INV_DSTPT
#define IPT_UDP_INV_MASK        XT_UDP_INV_MASK

/* The argument to IPT_SO_ADD_COUNTERS. */
#define ipt_counters_info xt_counters_info
/* Standard return verdict, or do jump. */
#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
/* Error verdict. */
#define IPT_ERROR_TARGET XT_ERROR_TARGET

/* fn returns 0 to continue iteration */
#define IPT_MATCH_ITERATE(e, fn, args...) \
        XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args)

/* fn returns 0 to continue iteration */
#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
        XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args)
#endif

/*
 * General IP-header part of a rule.
 *
 * Yes, Virginia, you have to zero the padding.
 */
struct ipt_ip {
        /* Source and destination IP addr */
        struct in_addr src, dst;
        /* Mask for src and dest IP addr */
        struct in_addr smsk, dmsk;
        /* Input and output interface names, IFNAMSIZ bytes each */
        char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
        /* Byte masks matching the two interface names above */
        unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];

        /* Protocol, 0 = ANY */
        __u16 proto;

        /* Flags word (IPT_F_* values) */
        __u8 flags;
        /* Inverse flags (IPT_INV_* values) */
        __u8 invflags;
};

/* Values for "flags" field in struct ipt_ip (general ip structure). */
#define IPT_F_FRAG                0x01        /* Set if rule is a fragment rule */
#define IPT_F_GOTO                0x02        /* Set if jump is a goto */
#define IPT_F_MASK                0x03        /* All possible flag bits mask. */

/* Values for "invflags" field in struct ipt_ip. */
#define IPT_INV_VIA_IN                0x01        /* Invert the sense of IN IFACE. */
#define IPT_INV_VIA_OUT                0x02        /* Invert the sense of OUT IFACE */
#define IPT_INV_TOS                0x04        /* Invert the sense of TOS. */
#define IPT_INV_SRCIP                0x08        /* Invert the sense of SRC IP. */
#define IPT_INV_DSTIP                0x10        /* Invert the sense of DST IP. */
#define IPT_INV_FRAG                0x20        /* Invert the sense of FRAG. */
#define IPT_INV_PROTO                XT_INV_PROTO
#define IPT_INV_MASK                0x7F        /* All possible flag bits mask. */

/*
 * This structure defines each of the firewall rules.  Consists of 3
 * parts which are 1) general IP header stuff 2) match specific
 * stuff 3) the target to perform if the rule matches.
 *
 * Parts 2) and 3) are variable-sized and follow the fixed header in
 * @elems; @target_offset and @next_offset locate them.
 */
struct ipt_entry {
        struct ipt_ip ip;

        /* Mark with fields that we care about. */
        unsigned int nfcache;

        /*
         * Size of ipt_entry + matches; ipt_get_target() adds this to the
         * entry's address to reach the target.
         */
        __u16 target_offset;
        /* Size of ipt_entry + matches + target */
        __u16 next_offset;

        /* Back pointer */
        unsigned int comefrom;

        /* Packet and byte counters. */
        struct xt_counters counters;

        /* The matches (if any), then the target. */
        unsigned char elems[0];
};

/*
 * New IP firewall options for [gs]etsockopt at the RAW IP level.
 * Unlike BSD Linux inherits IP options so you don't have to use a raw
 * socket for this. Instead we check rights in the calls.
 *
 * ATTENTION: check linux/in.h before adding new number here.
 *
 * The SET and GET option numbers both start at IPT_BASE_CTL, so the same
 * value names a different operation in each direction.
 */
#define IPT_BASE_CTL                64

/* setsockopt() option numbers. */
#define IPT_SO_SET_REPLACE        (IPT_BASE_CTL)
#define IPT_SO_SET_ADD_COUNTERS        (IPT_BASE_CTL + 1)
#define IPT_SO_SET_MAX                IPT_SO_SET_ADD_COUNTERS

/* getsockopt() option numbers. */
#define IPT_SO_GET_INFO                        (IPT_BASE_CTL)
#define IPT_SO_GET_ENTRIES                (IPT_BASE_CTL + 1)
#define IPT_SO_GET_REVISION_MATCH        (IPT_BASE_CTL + 2)
#define IPT_SO_GET_REVISION_TARGET        (IPT_BASE_CTL + 3)
#define IPT_SO_GET_MAX                        IPT_SO_GET_REVISION_TARGET

/* ICMP matching stuff: the ICMP type plus a two-element range of codes. */
struct ipt_icmp {
        __u8 type;                                /* type to match */
        __u8 code[2];                                /* range of code */
        __u8 invflags;                                /* Inverse flags */
};

/* Values for "inv" field for struct ipt_icmp. */
#define IPT_ICMP_INV        0x01        /* Invert the sense of type/code test */

/*
 * The argument to IPT_SO_GET_INFO.  The caller supplies @name; every other
 * field is filled in by the kernel.
 */
struct ipt_getinfo {
        /* Which table: caller fills this in. */
        char name[XT_TABLE_MAXNAMELEN];

        /* Kernel fills these in. */
        /* Which hook entry points are valid: bitmask */
        unsigned int valid_hooks;

        /* Hook entry points: one per netfilter hook. */
        unsigned int hook_entry[NF_INET_NUMHOOKS];

        /* Underflow points. */
        unsigned int underflow[NF_INET_NUMHOOKS];

        /* Number of entries */
        unsigned int num_entries;

        /* Size of entries. */
        unsigned int size;
};

/*
 * The argument to IPT_SO_SET_REPLACE: a fixed header describing the new
 * table, followed directly by the new entries themselves.
 */
struct ipt_replace {
        /* Which table. */
        char name[XT_TABLE_MAXNAMELEN];

        /* Which hook entry points are valid: bitmask.  You can't
           change this. */
        unsigned int valid_hooks;

        /* Number of entries */
        unsigned int num_entries;

        /* Total size of new entries */
        unsigned int size;

        /* Hook entry points. */
        unsigned int hook_entry[NF_INET_NUMHOOKS];

        /* Underflow points. */
        unsigned int underflow[NF_INET_NUMHOOKS];

        /* Information about old entries: */
        /* Number of counters (must be equal to current number of entries). */
        unsigned int num_counters;
        /* The old entries' counters (a user-space pointer, hence __user). */
        struct xt_counters __user *counters;

        /* The entries (hang off end: not really an array). */
        struct ipt_entry entries[0];
};

/*
 * The argument to IPT_SO_GET_ENTRIES: the user names the table and states
 * the total entry size; the entries follow the header.
 */
struct ipt_get_entries {
        /* Which table: user fills this in. */
        char name[XT_TABLE_MAXNAMELEN];

        /* User fills this in: total entry size. */
        unsigned int size;

        /* The entries (zero-length array marking where they start). */
        struct ipt_entry entrytable[0];
};

/*
 * Helper functions
 *
 * ipt_get_target() - locate the target of rule @e.
 *
 * The target sits e->target_offset bytes past the start of the entry, so
 * the address is computed with byte-granular pointer arithmetic.
 */
static __inline__ struct xt_entry_target *
ipt_get_target(struct ipt_entry *e)
{
        char *entry_start = (char *)e;
        char *target_start = entry_start + e->target_offset;

        return (struct xt_entry_target *)target_start;
}

/*
 *        Main firewall chains definitions and global var's definitions.
 */
#endif /* _UAPI_IPTABLES_H */










































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H


#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <uapi/linux/rtnetlink.h>

extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
                        u32 group, struct nlmsghdr *nlh, gfp_t flags);
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
                              u32 id, long expires, u32 error);

void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex);
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned change, u32 event,
                                       gfp_t flags, int *new_nsid,
                                       int new_ifindex);
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
                       gfp_t flags);


/* RTNL is used as a global lock for all changes to network configuration  */
extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);

extern wait_queue_head_t netdev_unregistering_wq;
extern struct rw_semaphore pernet_ops_rwsem;
extern struct rw_semaphore net_rwsem;

#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
#else
/*
 * Fallback used when CONFIG_PROVE_LOCKING is not set: there is no lock
 * state to consult, so RTNL is always reported as held.
 */
static inline bool lockdep_rtnl_is_held(void)
{
        const bool rtnl_held = true;

        return rtnl_held;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/**
 * rcu_dereference_rtnl - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
 * or RTNL.  The RTNL half of the check is lockdep_rtnl_is_held().
 *
 * Note: please prefer rtnl_dereference() or rcu_dereference().
 */
#define rcu_dereference_rtnl(p)                                        \
        rcu_dereference_check(p, lockdep_rtnl_is_held())

/**
 * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereference
 *
 * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
 * or RTNL.  The RTNL half of the check is lockdep_rtnl_is_held().
 *
 * Note: please prefer rtnl_dereference() or rcu_dereference_bh().
 */
#define rcu_dereference_bh_rtnl(p)                                \
        rcu_dereference_bh_check(p, lockdep_rtnl_is_held())

/**
 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
 * @p: The pointer to read, prior to dereferencing
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE(), because caller holds RTNL.  Expands to
 * rcu_dereference_protected() with lockdep_rtnl_is_held() as the condition.
 */
#define rtnl_dereference(p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_is_held())

/*
 * Fetch @dev's ingress queue pointer via rtnl_dereference(), i.e. for
 * callers that hold RTNL.
 */
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
{
        struct netdev_queue *ingress = rtnl_dereference(dev->ingress_queue);

        return ingress;
}

/*
 * Fetch @dev's ingress queue pointer via plain rcu_dereference(); the
 * RCU-reader counterpart of dev_ingress_queue().
 */
static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
{
        struct netdev_queue *ingress = rcu_dereference(dev->ingress_queue);

        return ingress;
}

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);

#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif

#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
#endif

void rtnetlink_init(void);
void __rtnl_unlock(void);
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);

/*
 * Warn once, reporting the file and line of the call site, if
 * rtnl_is_locked() says RTNL is not held.
 */
#define ASSERT_RTNL() \
        WARN_ONCE(!rtnl_is_locked(), \
                  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)

extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct net_device *dev,
                             struct net_device *filter_dev,
                             int *idx);
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid,
                            u16 flags);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid);

extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                   struct net_device *dev, u16 mode,
                                   u32 flags, u32 mask, int nlflags,
                                   u32 filter_mask,
                                   int (*vlan_fill)(struct sk_buff *skb,
                                                    struct net_device *dev,
                                                    u32 filter_mask));
#endif        /* __LINUX_RTNETLINK_H */

































































































































    1 









    1 





































































    1 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
        /*
         * number of callers into module in progress;
         * negative -> it's going away RSN
         */
        atomic_t in_use;
        /* reference count; pde_get() increments it */
        refcount_t refcnt;
        struct list_head pde_openers;        /* who did ->open, but not ->release */
        /* protects ->pde_openers and all struct pde_opener instances */
        spinlock_t pde_unload_lock;
        struct completion *pde_unload_completion;
        /* is_empty_pde() treats a directory with no proc_iops as empty */
        const struct inode_operations *proc_iops;
        union {
                const struct proc_ops *proc_ops;
                const struct file_operations *proc_dir_ops;
        };
        /* pde_force_lookup() points this at proc_net_dentry_ops */
        const struct dentry_operations *proc_dops;
        union {
                const struct seq_operations *seq_ops;
                int (*single_show)(struct seq_file *, void *);
        };
        proc_write_t write;
        /* opaque pointer handed back by __PDE_DATA() */
        void *data;
        unsigned int state_size;
        unsigned int low_ino;
        nlink_t nlink;
        kuid_t uid;
        kgid_t gid;
        loff_t size;
        struct proc_dir_entry *parent;
        struct rb_root subdir;
        struct rb_node subdir_node;
        char *name;
        umode_t mode;
        /* PROC_ENTRY_* bits, tested by the pde_is_*()/pde_has_*() helpers */
        u8 flags;
        u8 namelen;
        /* trailing storage; SIZEOF_PDE_INLINE_NAME is the room left for it */
        char inline_name[];
} __randomize_layout;

/*
 * SIZEOF_PDE rounds sizeof(struct proc_dir_entry) up to the first of
 * 128, 192, 256 or 512 that is strictly larger than it, and evaluates to 0
 * when the structure is already 512 bytes or more.
 * SIZEOF_PDE_INLINE_NAME is the slack between that size and the structure.
 */
#define SIZEOF_PDE        (                                \
        sizeof(struct proc_dir_entry) < 128 ? 128 :        \
        sizeof(struct proc_dir_entry) < 192 ? 192 :        \
        sizeof(struct proc_dir_entry) < 256 ? 256 :        \
        sizeof(struct proc_dir_entry) < 512 ? 512 :        \
        0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

/* True when the PROC_ENTRY_PERMANENT bit is set in @pde's flags. */
static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_PERMANENT) != 0;
}

/* True when the PROC_ENTRY_proc_read_iter bit is set in @pde's flags. */
static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_proc_read_iter) != 0;
}

/*
 * True when the PROC_ENTRY_proc_compat_ioctl bit is set in @pde's flags.
 * Kernels built without CONFIG_COMPAT always answer false.
 */
static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
{
#ifndef CONFIG_COMPAT
        return false;
#else
        return (pde->flags & PROC_ENTRY_proc_compat_ioctl) != 0;
#endif
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

/*
 * Per-inode operation slot stored in struct proc_inode::op.  Only one member
 * is meaningful at a time; which one depends on the kind of entry
 * (NOTE(review): the selection logic is not visible in this header).
 */
union proc_op {
        int (*proc_get_link)(struct dentry *, struct path *);
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
                struct task_struct *task);
        const char *lsm;
};

/*
 * procfs-private data wrapped around a VFS inode.  The generic inode is
 * embedded as @vfs_inode; PROC_I() maps it back to this structure with
 * container_of().
 */
struct proc_inode {
        struct pid *pid;                /* returned by proc_pid() */
        unsigned int fd;
        union proc_op op;
        struct proc_dir_entry *pde;        /* returned by PDE() */
        struct ctl_table_header *sysctl;
        struct ctl_table *sysctl_entry;
        struct hlist_node sibling_inodes;
        const struct proc_ns_operations *ns_ops;
        struct inode vfs_inode;
} __randomize_layout;

/*
 * General functions
 */
/*
 * Map a VFS inode back to the struct proc_inode that embeds it as
 * ->vfs_inode.
 */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
        struct proc_inode *owner;

        owner = container_of(inode, struct proc_inode, vfs_inode);
        return owner;
}

static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
        return PROC_I(inode)->pde;
}

static inline void *__PDE_DATA(const struct inode *inode)
{
        return PDE(inode)->data;
}

static inline struct pid *proc_pid(const struct inode *inode)
{
        return PROC_I(inode)->pid;
}

/*
 * Look up the task for @inode: take the inode's struct pid and pass it to
 * get_pid_task() with PIDTYPE_PID, returning whatever that yields.
 */
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
        struct pid *pid = proc_pid(inode);

        return get_pid_task(pid, PIDTYPE_PID);
}

void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid);

unsigned name_to_int(const struct qstr *qstr);
/*
 * Offset of the first process in the /proc root directory.
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13

/*
 * array.c
 */
extern const struct file_operations proc_tid_children_operations;

extern void proc_task_name(struct seq_file *m, struct task_struct *p,
                           bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
                         struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);
extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
                           struct pid *, struct task_struct *);
extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);

/*
 * base.c
 */
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);

/* Lookups */
typedef struct dentry *instantiate_t(struct dentry *,
                                     struct task_struct *, const void *);
bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
                           instantiate_t, struct task_struct *, const void *);

/*
 * generic.c
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data);
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);

static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
        refcount_inc(&pde->refcnt);
        return pde;
}
extern void pde_put(struct proc_dir_entry *);

/* An entry counts as empty when it is a directory with no ->proc_iops set. */
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
        if (!S_ISDIR(pde->mode))
                return false;

        return !pde->proc_iops;
}
extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);

/*
 * inode.c
 */
/*
 * Bookkeeping for one opener of a proc entry; instances are protected by
 * proc_dir_entry::pde_unload_lock (see the comment on that field).
 * NOTE(review): presumably linked into proc_dir_entry::pde_openers through
 * @lh - confirm against inode.c.
 */
struct pde_opener {
        struct list_head lh;
        struct file *file;
        bool closing;
        struct completion *c;
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);

/*
 * proc_namespaces.c
 */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;

/*
 * proc_net.c
 */
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;

#ifdef CONFIG_NET
extern int proc_net_init(void);
#else
/* Stub for kernels built without CONFIG_NET: nothing to do, report 0. */
static inline int proc_net_init(void)
{
        return 0;
}
#endif

/*
 * proc_self.c
 */
extern int proc_setup_self(struct super_block *);

/*
 * proc_thread_self.c
 */
extern int proc_setup_thread_self(struct super_block *);
extern void proc_thread_self_init(void);

/*
 * proc_sysctl.c
 */
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
extern void proc_sys_evict_inode(struct inode *inode,
                                 struct ctl_table_header *head);
#else
/*
 * Stub for kernels built without CONFIG_PROC_SYSCTL.
 *
 * It returns int so that its type matches the real proc_sys_init()
 * declared in the CONFIG_PROC_SYSCTL branch; with a void stub the
 * function's return type depended on the kernel configuration.  There is
 * nothing to set up here, so it reports 0 the same way the proc_net_init()
 * stub does.
 */
static inline int proc_sys_init(void)
{
        return 0;
}
/* No-op stub used when CONFIG_PROC_SYSCTL is not set. */
static inline void proc_sys_evict_inode(struct  inode *inode,
                                        struct ctl_table_header *head) { }
#endif

/*
 * proc_tty.c
 */
#ifdef CONFIG_TTY
extern void proc_tty_init(void);
#else
/* No-op stub used when CONFIG_TTY is not set. */
static inline void proc_tty_init(void) {}
#endif

/*
 * root.c
 */
extern struct proc_dir_entry proc_root;

extern void proc_self_init(void);

/*
 * task_[no]mmu.c
 */
struct mem_size_stats;
/*
 * State shared by the task_[no]mmu.c files.  @tail_vma exists only with
 * CONFIG_MMU and @task_mempolicy only with CONFIG_NUMA.
 * NOTE(review): presumably the per-open private data of the maps-style
 * files declared below - confirm against task_mmu.c.
 */
struct proc_maps_private {
        struct inode *inode;
        struct task_struct *task;
        struct mm_struct *mm;
#ifdef CONFIG_MMU
        struct vm_area_struct *tail_vma;
#endif
#ifdef CONFIG_NUMA
        struct mempolicy *task_mempolicy;
#endif
} __randomize_layout;

struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);

extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
                                unsigned long *, unsigned long *,
                                unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);

extern const struct dentry_operations proc_net_dentry_ops;
/*
 * Install proc_net_dentry_ops as the dentry operations of @pde.
 *
 * /proc/net/ entries can be changed under us by setns(CLONE_NEWNET).
 */
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
        const struct dentry_operations *net_dops = &proc_net_dentry_ops;

        pde->proc_dops = net_dops;
}































































































    3 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * An extensible bitmap is a bitmap that supports an
 * arbitrary number of bits.  Extensible bitmaps are
 * used to represent sets of values, such as types,
 * roles, categories, and classes.
 *
 * Each extensible bitmap is implemented as a linked
 * list of bitmap nodes, where each bitmap node has
 * an explicitly specified starting bit position within
 * the total bitmap.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SS_EBITMAP_H_
#define _SS_EBITMAP_H_

#include <net/netlabel.h>

/* Size budget, in bytes, that the node sizing below is derived from. */
#ifdef CONFIG_64BIT
#define        EBITMAP_NODE_SIZE        64
#else
#define        EBITMAP_NODE_SIZE        32
#endif

/*
 * Number of unsigned long map words per node: what is left of
 * EBITMAP_NODE_SIZE after one pointer and one u32, divided by the word size.
 */
#define EBITMAP_UNIT_NUMS        ((EBITMAP_NODE_SIZE-sizeof(void *)-sizeof(u32))\
                                        / sizeof(unsigned long))
/* Bits per map word. */
#define EBITMAP_UNIT_SIZE        BITS_PER_LONG
/* Bits covered by one node. */
#define EBITMAP_SIZE                (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE)
/* A single bit, typed unsigned long long, to be shifted into position. */
#define EBITMAP_BIT                1ULL
/*
 * Shift @x right by EBITMAP_UNIT_SIZE bits, done as two half-width shifts
 * (presumably to avoid a single shift by the full width of the type).
 */
#define EBITMAP_SHIFT_UNIT_SIZE(x)                                        \
        (((x) >> EBITMAP_UNIT_SIZE / 2) >> EBITMAP_UNIT_SIZE / 2)

/*
 * One node of an extensible bitmap: EBITMAP_UNIT_NUMS map words holding the
 * bits that start at position @startbit of the total bitmap.
 */
struct ebitmap_node {
        struct ebitmap_node *next;        /* next node in the list, or NULL */
        unsigned long maps[EBITMAP_UNIT_NUMS];
        u32 startbit;        /* position of this node's first bit */
};

/* An extensible bitmap: the head of the node list plus its length. */
struct ebitmap {
        struct ebitmap_node *node;        /* first node in the bitmap */
        u32 highbit;        /* highest position in the total bitmap */
};

/* Length of bitmap @e; also the "no more bits" value of the iterators. */
#define ebitmap_length(e) ((e)->highbit)

/*
 * Find the first set bit of @e, walking the node list from the head.
 *
 * @n is the iteration cursor: on return it points at the node holding the
 * bit found, or is NULL when no node has a bit set.
 *
 * Returns the position of the bit in the total bitmap, or
 * ebitmap_length(e) when nothing is set.
 */
static inline unsigned int ebitmap_start_positive(struct ebitmap *e,
                                                  struct ebitmap_node **n)
{
        struct ebitmap_node *cur = e->node;

        *n = cur;
        while (cur) {
                unsigned int first = find_first_bit(cur->maps, EBITMAP_SIZE);

                if (first < EBITMAP_SIZE)
                        return cur->startbit + first;

                cur = cur->next;
                *n = cur;
        }
        return ebitmap_length(e);
}

/* Reset @e to the empty bitmap by zeroing every byte of the structure. */
static inline void ebitmap_init(struct ebitmap *e)
{
        memset(e, 0, sizeof(struct ebitmap));
}

/*
 * Find the next set bit of @e after position @bit.
 *
 * The rest of the current node (*@n) is searched first; after that the
 * following nodes are scanned from their first bit.  @n is advanced to the
 * node holding the bit found, or set to NULL when the list is exhausted.
 *
 * Returns the position of the bit in the total bitmap, or
 * ebitmap_length(e) when there is none.
 */
static inline unsigned int ebitmap_next_positive(struct ebitmap *e,
                                                 struct ebitmap_node **n,
                                                 unsigned int bit)
{
        struct ebitmap_node *cur = *n;
        unsigned int pos;

        pos = find_next_bit(cur->maps, EBITMAP_SIZE, bit - cur->startbit + 1);
        if (pos < EBITMAP_SIZE)
                return pos + cur->startbit;

        cur = cur->next;
        *n = cur;
        while (cur) {
                pos = find_first_bit(cur->maps, EBITMAP_SIZE);
                if (pos < EBITMAP_SIZE)
                        return pos + cur->startbit;

                cur = cur->next;
                *n = cur;
        }
        return ebitmap_length(e);
}

/* Index of the map word inside @node that holds absolute position @bit. */
#define EBITMAP_NODE_INDEX(node, bit)        \
        (((bit) - (node)->startbit) / EBITMAP_UNIT_SIZE)
/* Bit offset of absolute position @bit inside that map word. */
#define EBITMAP_NODE_OFFSET(node, bit)        \
        (((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE)

/*
 * Test absolute bit position @bit inside node @n.
 *
 * BUGs if @bit maps to a word index outside n->maps[].
 * Returns 1 if the bit is set, 0 otherwise.
 */
static inline int ebitmap_node_get_bit(struct ebitmap_node *n,
                                       unsigned int bit)
{
        const unsigned int word = EBITMAP_NODE_INDEX(n, bit);
        const unsigned int shift = EBITMAP_NODE_OFFSET(n, bit);

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        return (n->maps[word] & (EBITMAP_BIT << shift)) ? 1 : 0;
}

/*
 * Set absolute bit position @bit inside node @n.
 *
 * BUGs if @bit maps to a word index outside n->maps[].
 */
static inline void ebitmap_node_set_bit(struct ebitmap_node *n,
                                        unsigned int bit)
{
        const unsigned int word = EBITMAP_NODE_INDEX(n, bit);
        const unsigned int shift = EBITMAP_NODE_OFFSET(n, bit);

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        n->maps[word] = n->maps[word] | (EBITMAP_BIT << shift);
}

/*
 * Clear absolute bit position @bit inside node @n.
 *
 * BUGs if @bit maps to a word index outside n->maps[].
 */
static inline void ebitmap_node_clr_bit(struct ebitmap_node *n,
                                        unsigned int bit)
{
        const unsigned int word = EBITMAP_NODE_INDEX(n, bit);
        const unsigned int shift = EBITMAP_NODE_OFFSET(n, bit);

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        n->maps[word] = n->maps[word] & ~(EBITMAP_BIT << shift);
}

/*
 * Loop header that visits every set bit of an ebitmap, nodes in list order.
 * @e:   struct ebitmap * to walk
 * @n:   struct ebitmap_node * cursor variable (an lvalue; its address is taken)
 * @bit: unsigned int lvalue that receives each set bit position
 *
 * The loop ends when the iterators return ebitmap_length(e).
 * The macro ends on its last line without a continuation backslash, so a line
 * added directly after it cannot be absorbed into the expansion.
 */
#define ebitmap_for_each_positive_bit(e, n, bit)        \
        for ((bit) = ebitmap_start_positive(e, &(n));        \
             (bit) < ebitmap_length(e);                        \
             (bit) = ebitmap_next_positive(e, &(n), bit))

/*
 * Out-of-line ebitmap operations. Only the declarations live in this header;
 * see the implementation file for each function's contract and return values.
 */
int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2);
int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src);
int ebitmap_and(struct ebitmap *dst, struct ebitmap *e1, struct ebitmap *e2);
int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit);
int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
void ebitmap_destroy(struct ebitmap *e);
int ebitmap_read(struct ebitmap *e, void *fp);
int ebitmap_write(struct ebitmap *e, void *fp);
u32 ebitmap_hash(const struct ebitmap *e, u32 hash);

/*
 * Conversion between an ebitmap and a NetLabel category map.
 * With CONFIG_NETLABEL the real functions are declared here; without it the
 * inline stubs below ignore their arguments and always return -ENOMEM.
 */
#ifdef CONFIG_NETLABEL
int ebitmap_netlbl_export(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap **catmap);
int ebitmap_netlbl_import(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap *catmap);
#else
/* Stub: export is unavailable, always fails with -ENOMEM. */
static inline int ebitmap_netlbl_export(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap **catmap)
{
        return -ENOMEM;
}
/* Stub: import is unavailable, always fails with -ENOMEM. */
static inline int ebitmap_netlbl_import(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap *catmap)
{
        return -ENOMEM;
}
#endif

#endif        /* _SS_EBITMAP_H_ */

































































































































































































































































































































































































    3 




























































































































































































































































































































































































































































    3 


    3 





    3 






























    1 





    1 
    1 









































































































































    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel timekeeping code and accessor functions. Based on code from
 *  timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

/*
 * Single-bit flag values (bits 0, 1 and 2), so they can be OR-ed into one mask.
 * NOTE(review): the code that consumes these flags is not in this part of the
 * file; confirm each flag's meaning at its point of use.
 */
#define TK_CLEAR_NTP                (1 << 0)
#define TK_MIRROR                (1 << 1)
#define TK_CLOCK_WAS_SET        (1 << 2)

/* Reason for which a timekeeper update is being requested. */
enum timekeeping_adv_mode {
        /* Update timekeeper when a tick has passed */
        TK_ADV_TICK,

        /* Update timekeeper on a direct frequency change */
        TK_ADV_FREQ
};

/* Raw spinlock that tk_core.seq is associated with (see its initializer). */
DEFINE_RAW_SPINLOCK(timekeeper_lock);

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
        /* sequence counter, initialised with timekeeper_lock as its lock */
        seqcount_raw_spinlock_t        seq;
        /* the timekeeper state guarded by @seq */
        struct timekeeper        timekeeper;
} tk_core ____cacheline_aligned = {
        .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};

/*
 * Second timekeeper instance.
 * NOTE(review): presumably the working copy that updates are prepared in
 * before being copied to tk_core.timekeeper; its users are not in this part
 * of the file, so confirm there.
 */
static struct timekeeper shadow_timekeeper;

/* Non-zero while timekeeping is suspended; checked by dummy_clock_read(). */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:        Sequence counter for protecting updates. The lowest bit
 *                is the index for the tk_read_base array
 * @base:        tk_read_base array. Access is indexed by the lowest bit of
 *                @seq.
 *
 * Two copies of the readout data are kept so that a reader can use one copy
 * while the other is being rewritten.
 *
 * See @update_fast_timekeeper() below.
 */
struct tk_fast {
        seqcount_latch_t        seq;
        struct tk_read_base        base[2];
};

/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * read() callback of the placeholder clocksource.
 *
 * While timekeeping is suspended it reports the frozen cycles_at_suspend
 * value; otherwise it reports local_clock(). @cs is not used.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
        return timekeeping_suspended ? cycles_at_suspend : local_clock();
}

/*
 * Placeholder clocksource: only the read() callback is set, all other
 * fields keep their zero default. Referenced by FAST_TK_INIT.
 */
static struct clocksource dummy_clock = {
        .read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed, the fast
 * timekeepers are updated with the correct values.
 *
 * Expands to the initializer of one struct tk_read_base that reads through
 * dummy_clock with mask CLOCKSOURCE_MASK(64).
 */
#define FAST_TK_INIT                                                \
        {                                                        \
                .clock                = &dummy_clock,                        \
                .mask                = CLOCKSOURCE_MASK(64),                \
                .mult                = 1,                                \
                .shift                = 0,                                \
        }

/*
 * Fast timekeeper instance; both readout copies start out as FAST_TK_INIT.
 * NOTE(review): by its name this one serves the monotonic clock; its readers
 * are further down the file.
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/*
 * Fast timekeeper instance; both readout copies start out as FAST_TK_INIT.
 * NOTE(review): by its name this one serves the raw monotonic clock; its
 * readers are further down the file.
 */
static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
        while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
                tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
        while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
                tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
                tk->raw_sec++;
        }
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec = ts->tv_sec;
        tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec += ts->tv_sec;
        tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
        struct timespec64 tmp;

        /*
         * Verify consistency of: offset_real = -wall_to_monotonic
         * before modifying anything
         */
        set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
                                        -tk->wall_to_monotonic.tv_nsec);
        WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
        tk->wall_to_monotonic = wtm;
        set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
        tk->offs_real = timespec64_to_ktime(tmp);
        tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}

/*
 * Advance the boot offset by @delta and refresh its cached timespec64 form.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
        const ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

        tk->offs_boot = boot_offset;
        /*
         * Keep a timespec copy as well, so that the VDSO update does not
         * need a 64bit division every time it runs.
         */
        tk->monotonic_to_boot = ktime_to_timespec64(boot_offset);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 * @tkr: readout base whose clocksource is to be read
 *
 * Loads tkr->clock exactly once (READ_ONCE) and uses that single pointer both
 * to pick the read() callback and as the argument passed to it.
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference and the
 * clock reference passed to the read function. This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 *
 * This isn't necessary when holding the timekeeper_lock or when reading the
 * fast-timekeeper tkrs (which are protected by their own locking and update
 * logic).
 *
 * Return: whatever the clocksource's read() callback returns.
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
        struct clocksource *clock = READ_ONCE(tkr->clock);

        return clock->read(clock);
}

#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */

/*
 * timekeeping_check_update - report suspicious clocksource deltas
 * @tk:     timekeeper whose mono clocksource limit and seen-flags are checked
 * @offset: cycle delta to check against that limit
 *
 * Prints a warning when @offset exceeds the clocksource's max_cycles, or an
 * informational note when it exceeds half of that. Then, for each of
 * underflow_seen and overflow_seen that is set, prints a report (at most one
 * per WARNING_FREQ jiffies, shared between both) and clears the flag.
 *
 * The cycle counts are u64 and are printed with %llu so that large values
 * do not show up as negative numbers.
 */
static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
        u64 max_cycles = tk->tkr_mono.clock->max_cycles;
        const char *name = tk->tkr_mono.clock->name;

        if (offset > max_cycles) {
                printk_deferred("WARNING: timekeeping: Cycle offset (%llu) is larger than allowed by the '%s' clock's max_cycles value (%llu): time overflow danger\n",
                                offset, name, max_cycles);
                printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
        } else if (offset > (max_cycles >> 1)) {
                printk_deferred("INFO: timekeeping: Cycle offset (%llu) is larger than the '%s' clock's 50%% safety margin (%llu)\n",
                                offset, name, max_cycles >> 1);
                printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
        }

        if (tk->underflow_seen) {
                /* rate-limit the report, but always clear the flag */
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                tk->underflow_seen = 0;
        }

        if (tk->overflow_seen) {
                /* rate-limit the report, but always clear the flag */
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                tk->overflow_seen = 0;
        }
}

/*
 * timekeeping_get_delta - cycles elapsed since cycle_last, with sanity checks
 * @tkr: readout base to read the clocksource through
 *
 * Debug variant: besides computing the masked delta it flags a suspected
 * underflow (delta forced to 0) or overflow (delta capped at max_cycles) in
 * tk_core.timekeeper for timekeeping_check_update() to report.
 *
 * Return: the possibly corrected cycle delta.
 */
static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 now, last, mask, max, delta;
        unsigned int seq;

        /*
         * Since we're called holding a seqcount, the data may shift
         * under us while we're doing the calculation. This can cause
         * false positives, since we'd note a problem but throw the
         * results away. So nest another seqcount here to atomically
         * grab the points we are checking with.
         */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(tkr);
                last = tkr->cycle_last;
                mask = tkr->mask;
                max = tkr->clock->max_cycles;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        delta = clocksource_delta(now, last, mask);

        /*
         * Try to catch underflows by checking if we are seeing small
         * mask-relative negative values.
         */
        if (unlikely((~delta & mask) < (mask >> 3))) {
                tk->underflow_seen = 1;
                delta = 0;
        }

        /* Cap delta value to the max_cycles values to avoid mult overflows */
        if (unlikely(delta > max)) {
                tk->overflow_seen = 1;
                /*
                 * Cap to the snapshot taken inside the seqcount loop, i.e.
                 * the very value delta was just compared against, instead of
                 * re-reading tkr->clock outside the loop.
                 */
                delta = max;
        }

        return delta;
}
#else
/* CONFIG_DEBUG_TIMEKEEPING is off: no update sanity checks are performed. */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
/*
 * Non-debug variant: read the clocksource and return the masked number of
 * cycles elapsed since tkr->cycle_last, without any plausibility checks.
 */
static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
{
        /* sample the clocksource first, then diff against the last update */
        const u64 now = tk_clock_read(tkr);

        return clocksource_delta(now, tkr->cycle_last, tkr->mask);
}
#endif

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:                The target timekeeper to setup.
 * @clock:                Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
        u64 interval;
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;

        ++tk->cs_was_changed_seq;
        old_clock = tk->tkr_mono.clock;
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.mask = clock->mask;
        tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

        tk->tkr_raw.clock = clock;
        tk->tkr_raw.mask = clock->mask;
        tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        ntpinterval = tmp;
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        if (tmp == 0)
                tmp = 1;

        interval = (u64) tmp;
        tk->cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        tk->xtime_interval = interval * clock->mult;
        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
        tk->raw_interval = interval * clock->mult;

         /* if changing clocks, convert xtime_nsec shift units */
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0) {
                        tk->tkr_mono.xtime_nsec >>= -shift_change;
                        tk->tkr_raw.xtime_nsec >>= -shift_change;
                } else {
                        tk->tkr_mono.xtime_nsec <<= shift_change;
                        tk->tkr_raw.xtime_nsec <<= shift_change;
                }
        }

        tk->tkr_mono.shift = clock->shift;
        tk->tkr_raw.shift = clock->shift;

        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These value will be adjusted via NTP
         * to counteract clock drifting.
         */
        tk->tkr_mono.mult = clock->mult;
        tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
        tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */

/*
 * arch_gettimeoffset(): extra offset that timekeeping_delta_to_ns() adds to
 * its result.
 */
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
/* Default hook: contributes no extra offset. */
static u32 default_arch_gettimeoffset(void) { return 0; }
/* Non-static function pointer; starts out pointing at the default hook. */
u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
#else
/* Not configured: the offset is a constant 0. */
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif

/*
 * Convert a cycle delta to nanoseconds for readout base @tkr:
 * (delta * mult + xtime_nsec) >> shift, plus arch_gettimeoffset().
 */
static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
{
        const u64 shifted_ns = delta * tkr->mult + tkr->xtime_nsec;

        /* drop the shift, then add the arch-provided offset (0 if unused) */
        return (shifted_ns >> tkr->shift) + arch_gettimeoffset();
}

/*
 * Read the clocksource behind @tkr and return the resulting nanosecond
 * value as computed by timekeeping_delta_to_ns().
 */
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
        return timekeeping_delta_to_ns(tkr, timekeeping_get_delta(tkr));
}

/*
 * Same conversion as timekeeping_get_ns(), but for a caller-supplied cycle
 * counter value @cycles instead of a fresh clocksource read.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
        /* masked distance of @cycles from the last update_wall_time */
        const u64 elapsed = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);

        return timekeeping_delta_to_ns(tkr, elapsed);
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Fast timekeeper whose two readout base copies (base[0] and base[1])
 *       are refreshed from @tkr
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @raw_write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
                                   struct tk_fast *tkf)
{
        struct tk_read_base *base = tkf->base;

        /* Force readers off to base[1] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));

        /* Force readers back to base[0] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[1] from the freshly written base[0] */
        memcpy(base + 1, base, sizeof(*base));
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * The readout itself is implemented by __ktime_get_fast_ns() below, which
 * takes the fast timekeeper to read from as its argument.
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *        now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
        struct tk_read_base *tkr;
        unsigned int seq;
        u64 now;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* The low bit of the latch sequence selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                now = ktime_to_ns(tkr->base);

                /* Add the nanoseconds for the cycles elapsed since cycle_last */
                now += timekeeping_delta_to_ns(tkr,
                                clocksource_delta(
                                        tk_clock_read(tkr),
                                        tkr->cycle_last,
                                        tkr->mask));
        } while (read_seqcount_latch_retry(&tkf->seq, seq));

        return now;
}

u64 ktime_get_mono_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

u64 ktime_get_raw_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is
 * updated but before the timekeeper is updated. If this happens, the new
 * boot offset is added to the old timekeeping making the clock appear to
 * update slightly earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update(tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The read of tk->offs_boot is therefore an intentional data race and is
 * annotated with data_race(), matching ktime_get_fast_timestamps().
 *
 * Return: clock monotonic (fast readout) plus the boot offset, in
 * nanoseconds.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* Lockless on purpose, see (1) and (2) above */
        return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/*
 * __ktime_get_real_fast - latch protected readout of realtime and monotonic
 * @tkf:  fast timekeeper to read from
 * @mono: optional; when non-NULL it receives base + delta, i.e. the
 *        monotonic timestamp taken from the same readout
 *
 * Returns base_real + delta in nanoseconds, where delta is the time computed
 * from the cycles elapsed since the selected copy's cycle_last.
 *
 * See comment for __ktime_get_fast_ns() vs. timestamp ordering
 */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
        struct tk_read_base *tkr;
        u64 basem, baser, delta;
        unsigned int seq;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* The low bit of the latch sequence selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                basem = ktime_to_ns(tkr->base);
                baser = ktime_to_ns(tkr->base_real);

                delta = timekeeping_delta_to_ns(tkr,
                                clocksource_delta(tk_clock_read(tkr),
                                tkr->cycle_last, tkr->mask));
        } while (read_seqcount_latch_retry(&tkf->seq, seq));

        /* Both results share one delta, so they describe the same instant */
        if (mono)
                *mono = basem + delta;
        return baser + delta;
}

/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * Return: the realtime timestamp in nanoseconds as read from tk_fast_mono;
 * the monotonic value of the same readout is not requested.
 */
u64 ktime_get_real_fast_ns(void)
{
        u64 real_ns = __ktime_get_real_fast(&tk_fast_mono, NULL);

        return real_ns;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * ktime_get_fast_timestamps: - NMI safe timestamps
 * @snapshot:        Pointer to timestamp storage
 *
 * Stores clock monotonic, boottime and realtime timestamps.
 *
 * Boot time is a racy access on 32bit systems if the sleep time injection
 * happens late during resume and not in timekeeping_resume(). That could
 * be avoided by expanding struct tk_read_base with boot offset for 32bit
 * and adding more overhead to the update. As this is a hard to observe
 * once per resume event which can be filtered with reasonable effort using
 * the accurate mono/real timestamps, it's probably not worth the trouble.
 *
 * Aside of that it might be possible on 32 and 64 bit to observe the
 * following when the sleep time injection happens late:
 *
 * CPU 0                                CPU 1
 * timekeeping_resume()
 * ktime_get_fast_timestamps()
 *        mono, real = __ktime_get_real_fast()
 *                                        inject_sleep_time()
 *                                           update boot offset
 *        boot = mono + bootoffset;
 *
 * That means that boot time already has the sleep time adjustment, but
 * real time does not. On the next readout both are in sync again.
 *
 * Preventing this for 64bit is not really feasible without destroying the
 * careful cache layout of the timekeeper because the sequence count and
 * struct tk_read_base would then need two cache lines instead of one.
 *
 * Access to the time keeper clock source is disabled accross the innermost
 * steps of suspend/resume. The accessors still work, but the timestamps
 * are frozen until time keeping is resumed which happens very early.
 *
 * For regular suspend/resume there is no observable difference vs. sched
 * clock, but it might affect some of the nasty low level debug printks.
 *
 * OTOH, access to sched clock is not guaranteed accross suspend/resume on
 * all systems either so it depends on the hardware in use.
 *
 * If that turns out to be a real problem then this could be mitigated by
 * using sched clock in a similar way as during early boot. But it's not as
 * trivial as on early boot because it needs some careful protection
 * against the clock monotonic timestamp jumping backwards on resume.
 */
void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
        snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
}

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
        /* Scratch copy, reused for the mono and then the raw readout base */
        static struct tk_read_base tkr_dummy;
        const struct tk_read_base *tkr = &tk->tkr_mono;

        /* Monotonic: copy the base, then point it at dummy_clock */
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        /*
         * Record the current counter value; presumably this is what
         * dummy_clock hands out while suspended (defined outside this
         * section - confirm).
         */
        cycles_at_suspend = tk_clock_read(tkr);
        tkr_dummy.clock = &dummy_clock;
        /* Same base_real computation as done in timekeeping_update() */
        tkr_dummy.base_real = tkr->base + tk->offs_real;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

        /* Raw: same procedure; base_real is not set for this copy */
        tkr = &tk->tkr_raw;
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        tkr_dummy.clock = &dummy_clock;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

/* Notifier chain for pvclock timedata update listeners */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/*
 * Invoke every registered pvclock listener. @was_set is handed to the
 * listeners as the notifier value and @tk as the notifier data pointer.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
        raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        int ret;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
        update_pvclock_gtod(tk, true);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: notifier block to remove from the pvclock chain
 *
 * The removal is done under timekeeper_lock.
 *
 * Return: the result of raw_notifier_chain_unregister().
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
        unsigned long irqflags;
        int err;

        raw_spin_lock_irqsave(&timekeeper_lock, irqflags);
        err = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
        raw_spin_unlock_irqrestore(&timekeeper_lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
        tk->next_leap_ktime = ntp_get_next_leap();
        if (tk->next_leap_ktime != KTIME_MAX)
                /* Convert to monotonic time */
                tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
        u64 seconds;
        u32 nsec;

        /*
         * The xtime based monotonic readout is:
         *        nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
         * The ktime based monotonic readout is:
         *        nsec = base_mono + now();
         * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
         */
        seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        nsec = (u32) tk->wall_to_monotonic.tv_nsec;
        tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

        /*
         * The sum of the nanoseconds portions of xtime and
         * wall_to_monotonic can be greater/equal one second. Take
         * this into account before updating tk->ktime_sec.
         */
        nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (nsec >= NSEC_PER_SEC)
                seconds++;
        tk->ktime_sec = seconds;

        /* Update the monotonic raw base */
        tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}

/*
 * timekeeping_update - propagate timekeeper changes to all derived state
 * @tk:     timekeeper which was modified
 * @action: bitmask of TK_* flags:
 *          TK_CLEAR_NTP     - reset tk->ntp_error and call ntp_clear()
 *          TK_CLOCK_WAS_SET - passed on to the pvclock listeners and
 *                             increments tk->clock_was_set_seq
 *          TK_MIRROR        - copy tk_core.timekeeper to shadow_timekeeper
 *
 * Refreshes the leap state and the ktime_t based members, updates the
 * vsyscall data, notifies the pvclock listeners and updates both fast
 * timekeepers.
 *
 * must hold timekeeper_lock
 */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
        if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear();
        }

        tk_update_leap_state(tk);
        tk_update_ktime_data(tk);

        update_vsyscall(tk);
        update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

        /* base_real must be current before the mono fast timekeeper copies it */
        tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
        update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
        update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);

        if (action & TK_CLOCK_WAS_SET)
                tk->clock_was_set_seq++;
        /*
         * The mirroring of the data to the shadow-timekeeper needs
         * to happen last here to ensure we don't over-write the
         * timekeeper structure on the next update with stale data
         */
        if (action & TK_MIRROR)
                memcpy(&shadow_timekeeper, &tk_core.timekeeper,
                       sizeof(tk_core.timekeeper));
}

/**
 * timekeeping_forward_now - update clock to the current time
 * @tk: timekeeper to forward
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        u64 cycle_now, delta;

        /* Cycles elapsed since cycle_last; both readout bases move to "now" */
        cycle_now = tk_clock_read(&tk->tkr_mono);
        delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        /* Accumulate the delta in shifted nanoseconds (monotonic) */
        tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;

        /* If arch requires, add in arch_gettimeoffset() */
        tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;


        /* Same accumulation for the raw clock, using the raw mult/shift */
        tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;

        /* If arch requires, add in arch_gettimeoffset() */
        tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;

        tk_normalize_xtime(tk);
}

/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:                pointer to the timespec to be set
 *
 * Returns the time of day in a timespec64 (WARN if suspended).
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->xtime_sec;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(ktime_get_real_ts64);

/**
 * ktime_get - read the monotonic clock as ktime_t
 *
 * Samples the monotonic base and the nanosecond part computed by
 * timekeeping_get_ns() under the timekeeper sequence count, retrying until
 * both were read consistently. Warns if timekeeping is suspended.
 *
 * Return: monotonic base plus the nanosecond part.
 */
ktime_t ktime_get(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        ktime_t mono_base;
        u64 ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                mono_base = tk->tkr_mono.base;
                ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(mono_base, ns);
}
EXPORT_SYMBOL_GPL(ktime_get);

/**
 * ktime_get_resolution_ns - mult >> shift of the monotonic readout base
 *
 * Reads tkr_mono.mult and tkr_mono.shift under the timekeeper sequence
 * count. Warns if timekeeping is suspended.
 *
 * Return: tkr_mono.mult shifted right by tkr_mono.shift.
 */
u32 ktime_get_resolution_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        u32 res_ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                res_ns = tk->tkr_mono.mult >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return res_ns;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);

/*
 * Lookup table from enum tk_offsets to the matching offset member of the
 * core timekeeper. It is indexed directly by the 'offs' argument of
 * ktime_get_with_offset(), ktime_get_coarse_with_offset() and
 * ktime_mono_to_any(); none of them range-checks the index.
 */
static ktime_t *offsets[TK_OFFS_MAX] = {
        [TK_OFFS_REAL]        = &tk_core.timekeeper.offs_real,
        [TK_OFFS_BOOT]        = &tk_core.timekeeper.offs_boot,
        [TK_OFFS_TAI]        = &tk_core.timekeeper.offs_tai,
};

/**
 * ktime_get_with_offset - monotonic time plus one of the timekeeper offsets
 * @offs: which offset to use
 *
 * Adds the selected offset to the monotonic base and reads the current
 * nanosecond part, all under the timekeeper sequence count. Warns if
 * timekeeping is suspended.
 *
 * Return: monotonic base + offset + nanosecond part.
 */
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t *offset = offsets[offs];
        unsigned int seqno;
        ktime_t shifted_base;
        u64 ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                shifted_base = ktime_add(tk->tkr_mono.base, *offset);
                ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(shifted_base, ns);
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

/**
 * ktime_get_coarse_with_offset - coarse monotonic time plus a timekeeper offset
 * @offs: which offset to use
 *
 * Like ktime_get_with_offset(), but the clocksource is not read: the
 * nanosecond part is only tkr_mono.xtime_nsec >> tkr_mono.shift. Warns if
 * timekeeping is suspended.
 *
 * Return: monotonic base + offset + accumulated nanoseconds.
 */
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t *offset = offsets[offs];
        unsigned int seqno;
        ktime_t shifted_base;
        u64 coarse_ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                shifted_base = ktime_add(tk->tkr_mono.base, *offset);
                coarse_ns = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(shifted_base, coarse_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);

/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:        time to convert.
 * @offs:        which offset to use
 *
 * Return: @tmono plus the selected offset, with the offset read under the
 * timekeeper sequence count.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
        ktime_t *offset = offsets[offs];
        ktime_t converted;
        unsigned int seqno;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                converted = ktime_add(tmono, *offset);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);

/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 *
 * The raw base and the nanosecond part of the raw readout base are sampled
 * under the timekeeper sequence count.
 */
ktime_t ktime_get_raw(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        ktime_t raw_base;
        u64 ns;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                raw_base = tk->tkr_raw.base;
                ns = timekeeping_get_ns(&tk->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(raw_base, ns);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);

/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:                pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 tomono;
        unsigned int seq;
        u64 nsec;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
                nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec += tomono.tv_sec;
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);

/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime which should be enough to prevent
 * premature wrap arounds.
 *
 * Warns if timekeeping is suspended.
 */
time64_t ktime_get_seconds(void)
{
        WARN_ON(timekeeping_suspended);

        return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);

/**
 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
 *
 * Returns the wall clock seconds since 1970. This replaces the
 * get_seconds() interface which is not y2038 safe on 32bit systems.
 *
 * On 64bit systems tk->xtime_sec is read directly. On 32bit systems the
 * read is protected with the sequence counter to provide "atomic" access
 * to the 64bit tk->xtime_sec value.
 */
time64_t ktime_get_real_seconds(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        time64_t seconds;
        unsigned int seqno;

        if (!IS_ENABLED(CONFIG_64BIT)) {
                /* A 64bit value cannot be loaded in one go on 32bit */
                do {
                        seqno = read_seqcount_begin(&tk_core.seq);
                        seconds = tk->xtime_sec;
                } while (read_seqcount_retry(&tk_core.seq, seqno));

                return seconds;
        }

        return tk->xtime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);

/**
 * __ktime_get_real_seconds - The same as ktime_get_real_seconds
 * but without the sequence counter protection. This internal function
 * is only to be called when the timekeeping lock is already held.
 *
 * Return: tk_core.timekeeper.xtime_sec, read without any serialization.
 */
noinstr time64_t __ktime_get_real_seconds(void)
{
        return tk_core.timekeeper.xtime_sec;
}

/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot:        pointer to struct receiving the system time snapshot
 *
 * One clocksource counter readout is converted to both a realtime and a
 * raw monotonic timestamp. The counter value and the two sequence numbers
 * (clocksource changed / clock was set) are stored alongside. Warns once
 * if timekeeping is suspended.
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t real_base, raw_base;
        u64 real_ns, raw_ns;
        u64 cycles_now;
        unsigned int seqno;

        WARN_ON_ONCE(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                cycles_now = tk_clock_read(&tk->tkr_mono);
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                real_base = ktime_add(tk->tkr_mono.base, tk->offs_real);
                raw_base = tk->tkr_raw.base;
                /* Both conversions use the very same counter value */
                real_ns = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles_now);
                raw_ns = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles_now);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        systime_snapshot->cycles = cycles_now;
        systime_snapshot->real = ktime_add_ns(real_base, real_ns);
        systime_snapshot->raw = ktime_add_ns(raw_base, raw_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

/*
 * Scale *base by mult/div checking for overflow.
 *
 * *base is split into quotient and remainder of the division by @div, and
 * both parts are scaled separately. Returns -EOVERFLOW, leaving *base
 * untouched, if multiplying either part by @mult could exceed 64 bits;
 * otherwise stores the scaled value in *base and returns 0.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
        u64 quot, rem;
        int headroom;

        quot = div64_u64_rem(*base, div, &rem);

        /* Bits left in a u64 after accounting for the magnitude of mult */
        headroom = (int)sizeof(u64)*8 - fls64(mult);
        if (headroom < fls64(quot) || headroom < fls64(rem))
                return -EOVERFLOW;

        *base = quot * mult + div64_u64(rem * mult, div);
        return 0;
}

/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history:                        Snapshot representing start of history
 * @partial_history_cycles:        Cycle offset into history (fractional part)
 * @total_history_cycles:        Total history length in cycles
 * @discontinuity:                True indicates clock was set on history period
 * @ts:                                Cross timestamp that should be adjusted using
 *        partial/total ratio
 *
 * Helper function used by get_device_system_crosststamp() to correct the
 * crosstimestamp corresponding to the start of the current interval to the
 * system counter value (timestamp point) provided by the driver. The
 * total_history_* quantities are the total history starting at the provided
 * reference point and ending at the start of the current interval. The cycle
 * count between the driver timestamp point and the start of the current
 * interval is partial_history_cycles.
 *
 * Return: 0 on success or when there is nothing to adjust (either cycle
 * count is zero), otherwise the error returned by scale64_check_overflow().
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                                         u64 partial_history_cycles,
                                         u64 total_history_cycles,
                                         bool discontinuity,
                                         struct system_device_crosststamp *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 corr_raw, corr_real;
        bool interp_forward;
        int ret;

        /* Nothing to interpolate; this also keeps the divisor below non-zero */
        if (total_history_cycles == 0 || partial_history_cycles == 0)
                return 0;

        /* Interpolate shortest distance from beginning or end of history */
        interp_forward = partial_history_cycles > total_history_cycles / 2;
        partial_history_cycles = interp_forward ?
                total_history_cycles - partial_history_cycles :
                partial_history_cycles;

        /*
         * Scale the monotonic raw time delta by:
         *        partial_history_cycles / total_history_cycles
         */
        corr_raw = (u64)ktime_to_ns(
                ktime_sub(ts->sys_monoraw, history->raw));
        ret = scale64_check_overflow(partial_history_cycles,
                                     total_history_cycles, &corr_raw);
        if (ret)
                return ret;

        /*
         * If there is a discontinuity in the history, scale monotonic raw
         *        correction by:
         *        mult(real)/mult(raw) yielding the realtime correction
         * Otherwise, calculate the realtime correction similar to monotonic
         *        raw calculation
         */
        if (discontinuity) {
                corr_real = mul_u64_u32_div
                        (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
        } else {
                corr_real = (u64)ktime_to_ns(
                        ktime_sub(ts->sys_realtime, history->real));
                ret = scale64_check_overflow(partial_history_cycles,
                                             total_history_cycles, &corr_real);
                if (ret)
                        return ret;
        }

        /*
         * Fix up the monotonic raw and realtime values: forward
         * interpolation adds the correction to the history start, backward
         * interpolation subtracts it from the current timestamps.
         */
        if (interp_forward) {
                ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
                ts->sys_realtime = ktime_add_ns(history->real, corr_real);
        } else {
                ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
                ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
        }

        return 0;
}

/*
 * timestamp_in_interval - true if ts is chronologically in [start, end]
 *
 * True if ts occurs chronologically at or after start, and before or at end.
 * An interval with start > end is treated as one that wrapped around the
 * end of the u64 range.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
        /* Regular interval: ts must lie between both bounds */
        if (start <= end)
                return ts >= start && ts <= end;

        /* Wrapped interval: ts is in the upper or in the lower part */
        return ts >= start || ts <= end;
}

/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:        Callback to get simultaneous device time and
 *        system counter from the device driver
 * @ctx:                Context passed to get_time_fn()
 * @history_begin:        Historical reference point used to interpolate system
 *        time when counter provided by the driver is before the current interval
 * @xtstamp:                Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time
 *
 * Return: 0 on success. A non-zero return value of @get_time_fn is passed
 * through. -ENODEV if the clocksource reported by the driver is not the
 * current timekeeper clocksource. -EINVAL if interpolation is required but
 * @history_begin is NULL, does not cover the counter value or spans a
 * clocksource change. Errors from adjust_historical_crosststamp() are
 * passed through as well.
 */
int get_device_system_crosststamp(int (*get_time_fn)
                                  (ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx),
                                  void *ctx,
                                  struct system_time_snapshot *history_begin,
                                  struct system_device_crosststamp *xtstamp)
{
        struct system_counterval_t system_counterval;
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 cycles, now, interval_start;
        unsigned int clock_was_set_seq = 0;
        ktime_t base_real, base_raw;
        u64 nsec_real, nsec_raw;
        /* Only assigned and only consumed when do_interp is true */
        u8 cs_was_changed_seq;
        unsigned int seq;
        bool do_interp;
        int ret;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /*
                 * Try to synchronously capture device time and a system
                 * counter value calling back into the device driver
                 */
                ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
                if (ret)
                        return ret;

                /*
                 * Verify that the clocksource associated with the captured
                 * system counter value is the same as the currently installed
                 * timekeeper clocksource
                 */
                if (tk->tkr_mono.clock != system_counterval.cs)
                        return -ENODEV;
                cycles = system_counterval.cycles;

                /*
                 * Check whether the system counter value provided by the
                 * device driver is on the current timekeeping interval.
                 */
                now = tk_clock_read(&tk->tkr_mono);
                interval_start = tk->tkr_mono.cycle_last;
                if (!timestamp_in_interval(interval_start, now, cycles)) {
                        /*
                         * Not in the current interval: convert the interval
                         * start instead and interpolate back afterwards.
                         */
                        clock_was_set_seq = tk->clock_was_set_seq;
                        cs_was_changed_seq = tk->cs_was_changed_seq;
                        cycles = interval_start;
                        do_interp = true;
                } else {
                        do_interp = false;
                }

                base_real = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                base_raw = tk->tkr_raw.base;

                nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
        xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

        /*
         * Interpolate if necessary, adjusting back from the start of the
         * current interval
         */
        if (do_interp) {
                u64 partial_history_cycles, total_history_cycles;
                bool discontinuity;

                /*
                 * Check that the counter value is not before the provided
                 * history reference and that the history doesn't cross a
                 * clocksource change
                 */
                if (!history_begin ||
                    !timestamp_in_interval(history_begin->cycles,
                                           cycles, system_counterval.cycles) ||
                    history_begin->cs_was_changed_seq != cs_was_changed_seq)
                        return -EINVAL;
                /* 'cycles' holds the interval start at this point */
                partial_history_cycles = cycles - system_counterval.cycles;
                total_history_cycles = cycles - history_begin->cycles;
                /* The clock was set somewhere within the history period */
                discontinuity =
                        history_begin->clock_was_set_seq != clock_was_set_seq;

                ret = adjust_historical_crosststamp(history_begin,
                                                    partial_history_cycles,
                                                    total_history_cycles,
                                                    discontinuity, xtstamp);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);

/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 *
 * Return: 0 on success. -EINVAL if @ts fails timespec64_valid_settod() or
 * if wall_to_monotonic compares greater than the requested time delta.
 */
int do_settimeofday64(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 ts_delta, xt;
        unsigned long flags;
        int ret = 0;

        if (!timespec64_valid_settod(ts))
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        /* Delta between the requested time and the current wall time */
        xt = tk_xtime(tk);
        ts_delta = timespec64_sub(*ts, xt);

        if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
                ret = -EINVAL;
                goto out;
        }

        /* Compensate the wall time jump in wall_to_monotonic */
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));

        tk_set_xtime(tk, ts);
out:
        /* Reached on the error path too, as the time was already forwarded */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* signal hrtimers about time change */
        clock_was_set();

        /* Audit and entropy feeding only when the time was actually set */
        if (!ret) {
                audit_tk_injoffset(ts_delta);
                add_device_randomness(ts, sizeof(*ts));
        }

        return ret;
}
EXPORT_SYMBOL(do_settimeofday64);

/**
 * timekeeping_inject_offset - Adds or subtracts from the current time.
 * @ts:                pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 *
 * Return: 0 on success. -EINVAL if @ts->tv_nsec is outside of
 * [0, NSEC_PER_SEC), if wall_to_monotonic compares greater than @ts or if
 * the resulting time fails timespec64_valid_settod().
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64 tmp;
        int ret = 0;

        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        /* Make sure the proposed value is valid */
        tmp = timespec64_add(tk_xtime(tk), *ts);
        if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
            !timespec64_valid_settod(&tmp)) {
                ret = -EINVAL;
                goto error;
        }

        /* Move wall time by the offset and compensate in wall_to_monotonic */
        tk_xtime_add(tk, ts);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));

error: /* even if we error out, we forwarded the time, so call update */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* signal hrtimers about time change */
        clock_was_set();

        return ret;
}

/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is
 * non-zero.
 */
int persistent_clock_is_local;

/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
 * This is ugly, but preferable to the alternatives.  Otherwise we
 * would either need to write a program to do it in /etc/rc (and risk
 * confusion if the program gets run more than once; it would also be
 * hard to make the program warp the clock precisely n hours)  or
 * compile in the timezone information into the kernel.  Bad, bad....
 *
 *                                                - TYT, 1992-01-01
 *
 * The best thing to do is to keep the CMOS clock in universal time (UTC)
 * as real UNIX machines always do it. This avoids all headaches about
 * daylight saving times and warping kernel clocks.
 *
 * Does nothing when sys_tz.tz_minuteswest is zero. Otherwise marks the
 * persistent clock as local and injects tz_minuteswest (converted to
 * seconds) as an offset into the current time. The result of the
 * injection is not checked.
 */
void timekeeping_warp_clock(void)
{
        struct timespec64 adjust;

        if (sys_tz.tz_minuteswest == 0)
                return;

        persistent_clock_is_local = 1;

        adjust.tv_sec = sys_tz.tz_minuteswest * 60;
        adjust.tv_nsec = 0;
        timekeeping_inject_offset(&adjust);
}

/**
 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
 * @tk:                timekeeper to update
 * @tai_offset:        new TAI offset; used as the seconds argument of ktime_set()
 *
 * Records @tai_offset in the timekeeper and recomputes offs_tai as
 * offs_real plus that offset.
 */
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
        ktime_t tai_delta = ktime_set(tai_offset, 0);

        tk->tai_offset = tai_offset;
        tk->offs_tai = ktime_add(tk->offs_real, tai_delta);
}

/**
 * change_clocksource - Swaps clocksources if a new one is available
 * @data:        the new clocksource, passed as an opaque pointer
 *
 * Accumulates the current time interval and, if the new clocksource can be
 * pinned and enabled, installs it and releases the old one. The timekeeper
 * is updated in every case because time has been forwarded.
 *
 * Return: always 0. Whether the switch happened must be determined by the
 * caller by looking at tk->tkr_mono.clock.
 */
static int change_clocksource(void *data)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *incoming = data;
        struct clocksource *outgoing;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        /*
         * If the cs is in module, get a module reference. Succeeds
         * for built-in code (owner == NULL) as well.
         */
        if (!try_module_get(incoming->owner))
                goto update;

        /* A clocksource without an enable hook counts as enabled */
        if (incoming->enable && incoming->enable(incoming) != 0) {
                module_put(incoming->owner);
                goto update;
        }

        outgoing = tk->tkr_mono.clock;
        tk_setup_internals(tk, incoming);
        if (outgoing->disable)
                outgoing->disable(outgoing);
        module_put(outgoing->owner);

update:
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        return 0;
}

/**
 * timekeeping_notify - Install a new clock source
 * @clock:                pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Return: 0 if @clock is the timekeeper's clocksource on exit (including
 * the case where it already was on entry), -1 if the switch did not happen.
 */
int timekeeping_notify(struct clocksource *clock)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        if (tk->tkr_mono.clock != clock) {
                stop_machine(change_clocksource, clock, NULL);
                tick_clock_notify();

                /* change_clocksource() leaves the old clock in place on failure */
                if (tk->tkr_mono.clock != clock)
                        return -1;
        }

        return 0;
}

/**
 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
 * @ts:                pointer to the timespec64 to be set
 *
 * Returns the raw monotonic time (completely un-modified by ntp).
 * The seconds and the nanosecond remainder are sampled inside a seqcount
 * read section and combined afterwards.
 */
void ktime_get_raw_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int snapshot;
        u64 raw_ns;

        do {
                snapshot = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->raw_sec;
                raw_ns = timekeeping_get_ns(&tk->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, snapshot));

        /* Start from whole seconds and let the helper fold in raw_ns */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, raw_ns);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);


/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Return: the CLOCK_SOURCE_VALID_FOR_HRES bit of the current clocksource's
 * flags (non-zero when set, 0 otherwise), sampled under the timekeeper
 * seqcount.
 */
int timekeeping_valid_for_hres(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int snapshot;
        int hres_flag;

        do {
                snapshot = read_seqcount_begin(&tk_core.seq);
                hres_flag = tk->tkr_mono.clock->flags &
                            CLOCK_SOURCE_VALID_FOR_HRES;
        } while (read_seqcount_retry(&tk_core.seq, snapshot));

        return hres_flag;
}

/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: the max_idle_ns value of the current clocksource, sampled under
 * the timekeeper seqcount.
 */
u64 timekeeping_max_deferment(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int snapshot;
        u64 max_idle;

        do {
                snapshot = read_seqcount_begin(&tk_core.seq);
                max_idle = tk->tkr_mono.clock->max_idle_ns;
        } while (read_seqcount_retry(&tk_core.seq, snapshot));

        return max_idle;
}

/**
 * read_persistent_clock64 -  Return time from the persistent clock.
 * @ts:        pointer to the timespec64 that receives the reading
 *
 * Weak dummy function for arches that do not yet support it.
 * An arch override reads the time from the battery backed persistent
 * clock; this default reports tv_sec=0 and tv_nsec=0, which callers
 * treat as "unsupported".
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
void __weak read_persistent_clock64(struct timespec64 *ts)
{
        *ts = (struct timespec64){ 0 };
}

/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 *                                        from the boot.
 * @wall_time:        current time as returned by persistent clock
 * @boot_offset:        offset that is defined as wall_time - boot_time
 *
 * Weak dummy function for arches that do not yet support it.
 * The default function calculates offset based on the current value of
 * local_clock(). This way architectures that support sched_clock() but don't
 * support dedicated boot time clock will provide the best estimate of the
 * boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
                                     struct timespec64 *boot_offset)
{
        u64 since_boot_ns;

        read_persistent_clock64(wall_time);

        since_boot_ns = local_clock();
        *boot_offset = ns_to_timespec64(since_boot_ns);
}

/*
 * Flag reflecting whether sleep time still has to be injected after a
 * suspend.
 *
 * The flag starts off false and is only set when a suspend reaches
 * timekeeping_suspend(). timekeeping_resume() clears it when it was able
 * to compute the sleep time itself (from a clocksource that keeps running
 * across suspend or from the persistent clock) and has injected it. If
 * that was not possible the flag stays true and is used by the RTC resume
 * code to decide whether sleep time must be injected; in that case
 * timekeeping_inject_sleeptime64() clears the flag.
 *
 * If a suspend attempt fails before timekeeping_suspend() has run, the
 * flag is still false, which prevents erroneous sleep time injection.
 */
static bool suspend_timing_needed;

/*
 * Flag for if there is a persistent clock on this platform.
 *
 * Set by timekeeping_init() when the persistent clock returns a valid,
 * positive time, and by timekeeping_suspend() when a later read returns a
 * non-zero value.
 */
static bool persistent_clock_exists;

/**
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Reads the persistent clock and the boot offset, sanitizes both, and then,
 * under timekeeper_lock and the timekeeper seqcount, initializes NTP,
 * enables and installs the default clocksource, and sets xtime, raw_sec and
 * wall_to_monotonic.
 *
 * A persistent clock reading that is non-zero but invalid is reported and
 * replaced by zero. A boot offset larger than the wall time is also
 * replaced by zero.
 */
void __init timekeeping_init(void)
{
        struct timespec64 wall_time, boot_offset, wall_to_mono;
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock;
        unsigned long flags;

        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
        if (timespec64_valid_settod(&wall_time) &&
            timespec64_to_ns(&wall_time) > 0) {
                persistent_clock_exists = true;
        } else if (timespec64_to_ns(&wall_time) != 0) {
                /* printk messages must be newline terminated */
                pr_warn("Persistent clock returned invalid value\n");
                wall_time = (struct timespec64){0};
        }

        if (timespec64_compare(&wall_time, &boot_offset) < 0)
                boot_offset = (struct timespec64){0};

        /*
         * We want to set wall_to_mono so that the following is true:
         * wall time + wall_to_mono = boot time
         */
        wall_to_mono = timespec64_sub(boot_offset, wall_time);

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        ntp_init();

        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
        tk_setup_internals(tk, clock);

        tk_set_xtime(tk, &wall_time);
        tk->raw_sec = 0;

        tk_set_wall_to_mono(tk, wall_to_mono);

        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}

/*
 * Persistent clock reading taken when suspend began. timekeeping_suspend()
 * may nudge it by a small compensation value; timekeeping_resume() compares
 * it against a fresh persistent clock reading.
 */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @delta: pointer to a timespec delta value
 *
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                           const struct timespec64 *delta)
{
        if (!timespec64_valid_strict(delta)) {
                printk_deferred(KERN_WARNING
                                "__timekeeping_inject_sleeptime: Invalid "
                                "sleep delta value!\n");
                return;
        }
        tk_xtime_add(tk, delta);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
        tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
        tk_debug_account_sleep_time(delta);
}

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/**
 * timekeeping_rtc_skipresume - Tell whether rtc_resume() can be skipped
 *
 * We have three kinds of time sources to use for sleep time
 * injection, the preference order is:
 * 1) non-stop clocksource
 * 2) persistent clock (ie: RTC accessible when irqs are off)
 * 3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
 * If system has neither 1) nor 2), 3) will be used finally.
 *
 * If timekeeping has injected sleeptime via either 1) or 2),
 * 3) becomes needless, so in this case we don't need to call
 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
 * means.
 *
 * Return: true when suspend_timing_needed is clear, false otherwise.
 */
bool timekeeping_rtc_skipresume(void)
{
        return !suspend_timing_needed;
}

/**
 * timekeeping_rtc_skipsuspend - Tell whether rtc_suspend() can be skipped
 *
 * Whether source 1) (non-stop clocksource) can be used is only known
 * when doing timekeeping_resume(), which is invoked after rtc_suspend(),
 * so we can't skip rtc_suspend() just because the system has 1).
 *
 * But if system has 2) (persistent clock), 2) will definitely be used,
 * so in this case we don't need to call rtc_suspend(), and this is what
 * timekeeping_rtc_skipsuspend() means.
 *
 * Return: the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
        return persistent_clock_exists;
}

/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support read_persistent_clock64
 * because their RTC/persistent clock is only accessible when irqs are enabled,
 * and also don't have an effective nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* The RTC path is now taking care of the sleep time */
        suspend_timing_needed = false;

        timekeeping_forward_now(tk);

        __timekeeping_inject_sleeptime(tk, delta);

        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* signal hrtimers about time change */
        clock_was_set();
}
#endif

/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 *
 * Reads the persistent clock, resumes clockevents and clocksources, and
 * then, under timekeeper_lock and the timekeeper seqcount, works out how
 * long the system was suspended, injects that interval if it could be
 * determined, re-bases the cycle counters and clears the suspended state.
 * Afterwards the softlockup watchdog is touched and the tick and hrtimers
 * are resumed.
 */
void timekeeping_resume(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock = tk->tkr_mono.clock;
        unsigned long flags;
        struct timespec64 ts_new, ts_delta;
        u64 cycle_now, nsec;
        bool inject_sleeptime = false;

        read_persistent_clock64(&ts_new);

        clockevents_resume();
        clocksource_resume();

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /*
         * After system resumes, we need to calculate the suspended time and
         * compensate it for the OS time. There are 3 sources that could be
         * used: Nonstop clocksource during suspend, persistent clock and rtc
         * device.
         *
         * One specific platform may have 1 or 2 or all of them, and the
         * preference will be:
         *        suspend-nonstop clocksource -> persistent clock -> rtc
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
        cycle_now = tk_clock_read(&tk->tkr_mono);
        nsec = clocksource_stop_suspend_timing(clock, cycle_now);
        if (nsec > 0) {
                ts_delta = ns_to_timespec64(nsec);
                inject_sleeptime = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
                inject_sleeptime = true;
        }

        /* ts_delta is only valid when one of the branches above set it */
        if (inject_sleeptime) {
                suspend_timing_needed = false;
                __timekeeping_inject_sleeptime(tk, &ts_delta);
        }

        /* Re-base the last cycle value */
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        tk->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        touch_softlockup_watchdog();

        tick_resume();
        hrtimers_resume();
}

/**
 * timekeeping_suspend - Suspends the generic timekeeping subsystem.
 *
 * Records the persistent clock reading, marks sleep time injection as
 * pending, forwards the timekeeper to "now", starts suspend timing on the
 * current clocksource and halts the fast timekeeper. Afterwards the tick,
 * clocksources and clockevents are suspended.
 *
 * Return: always 0.
 */
int timekeeping_suspend(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64                delta, delta_delta;
        /* Wall time minus persistent clock time, kept across suspend cycles */
        static struct timespec64        old_delta;
        struct clocksource *curr_clock;
        u64 cycle_now;

        read_persistent_clock64(&timekeeping_suspend_time);

        /*
         * On some systems the persistent_clock can not be detected at
         * timekeeping_init by its return value, so if we see a valid
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
                persistent_clock_exists = true;

        suspend_timing_needed = true;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        timekeeping_forward_now(tk);
        timekeeping_suspended = 1;

        /*
         * Since we've called forward_now, cycle_last stores the value
         * just read from the current clocksource. Save this to potentially
         * use in suspend timing.
         */
        curr_clock = tk->tkr_mono.clock;
        cycle_now = tk->tkr_mono.cycle_last;
        clocksource_start_suspend_timing(curr_clock, cycle_now);

        if (persistent_clock_exists) {
                /*
                 * To avoid drift caused by repeated suspend/resumes,
                 * which each can add ~1 second drift error,
                 * try to compensate so the difference in system time
                 * and persistent_clock time stays close to constant.
                 */
                delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
                delta_delta = timespec64_sub(delta, old_delta);
                if (abs(delta_delta.tv_sec) >= 2) {
                        /*
                         * if delta_delta is too large, assume time correction
                         * has occurred and set old_delta to the current delta.
                         */
                        old_delta = delta;
                } else {
                        /*
                         * Otherwise try to compensate by shifting the
                         * recorded suspend time by delta_delta.
                         */
                        timekeeping_suspend_time =
                                timespec64_add(timekeeping_suspend_time, delta_delta);
                }
        }

        timekeeping_update(tk, TK_MIRROR);
        halt_fast_timekeeper(tk);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        tick_suspend();
        clocksource_suspend();
        clockevents_suspend();

        return 0;
}

/* syscore resume/suspend callbacks for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
        .resume                = timekeeping_resume,
        .suspend        = timekeeping_suspend,
};

/*
 * Register the timekeeping suspend/resume callbacks with the syscore
 * framework. Runs as a device initcall and always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
        register_syscore_ops(&timekeeping_syscore_ops);
        return 0;
}
device_initcall(timekeeping_init_ops);

/*
 * Apply a multiplier adjustment to the timekeeper
 *
 * @tk:                timekeeper whose mono multiplier is adjusted
 * @offset:        the non-accumulated cycles (see the derivation below)
 * @mult_adj:        signed amount to add to tk->tkr_mono.mult; 0 is a no-op
 *
 * Besides the multiplier, xtime_interval and xtime_nsec are changed so that
 * the current time is the same before and after the adjustment.
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
                                                         s64 offset,
                                                         s32 mult_adj)
{
        s64 interval = tk->cycle_interval;

        /* Scale interval and offset by mult_adj; +/-1 avoid the multiply */
        if (mult_adj == 0) {
                return;
        } else if (mult_adj == -1) {
                interval = -interval;
                offset = -offset;
        } else if (mult_adj != 1) {
                interval *= mult_adj;
                offset *= mult_adj;
        }

        /*
         * So the following can be confusing.
         *
         * To keep things simple, let's assume mult_adj == 1 for now.
         *
         * When mult_adj != 1, remember that the interval and offset values
         * have been appropriately scaled so the math is the same.
         *
         * The basic idea here is that we're increasing the multiplier
         * by one, this causes the xtime_interval to be incremented by
         * one cycle_interval. This is because:
         *        xtime_interval = cycle_interval * mult
         * So if mult is being incremented by one:
         *        xtime_interval = cycle_interval * (mult + 1)
         * It's the same as:
         *        xtime_interval = (cycle_interval * mult) + cycle_interval
         * Which can be shortened to:
         *        xtime_interval += cycle_interval
         *
         * So offset stores the non-accumulated cycles. Thus the current
         * time (in shifted nanoseconds) is:
         *        now = (offset * adj) + xtime_nsec
         * Now, even though we're adjusting the clock frequency, we have
         * to keep time consistent. In other words, we can't jump back
         * in time, and we also want to avoid jumping forward in time.
         *
         * So given the same offset value, we need the time to be the same
         * both before and after the freq adjustment.
         *        now = (offset * adj_1) + xtime_nsec_1
         *        now = (offset * adj_2) + xtime_nsec_2
         * So:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_2) + xtime_nsec_2
         * And we know:
         *        adj_2 = adj_1 + 1
         * So:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * (adj_1+1)) + xtime_nsec_2
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_1) + offset + xtime_nsec_2
         * Canceling the sides:
         *        xtime_nsec_1 = offset + xtime_nsec_2
         * Which gives us:
         *        xtime_nsec_2 = xtime_nsec_1 - offset
         * Which simplifies to:
         *        xtime_nsec -= offset
         */
        if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
                /* NTP adjustment caused clocksource mult overflow */
                WARN_ON_ONCE(1);
                return;
        }

        tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval;
        tk->tkr_mono.xtime_nsec -= offset;
}

/*
 * Adjust the timekeeper's multiplier to the correct frequency
 * and also to reduce the accumulated error value.
 *
 * @tk:                timekeeper to adjust
 * @offset:        passed through unchanged to timekeeping_apply_adjustment()
 *
 * Warns once if the resulting multiplier differs from the clocksource's
 * own multiplier by more than its maxadj, and repairs xtime_nsec if the
 * adjustment made it negative.
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
        u32 mult;

        /*
         * Determine the multiplier from the current NTP tick length.
         * Avoid expensive division when the tick length doesn't change.
         */
        if (likely(tk->ntp_tick == ntp_tick_length())) {
                /* Undo the previous error correction to get the base mult */
                mult = tk->tkr_mono.mult - tk->ntp_err_mult;
        } else {
                tk->ntp_tick = ntp_tick_length();
                mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
                                 tk->xtime_remainder, tk->cycle_interval);
        }

        /*
         * If the clock is behind the NTP time, increase the multiplier by 1
         * to catch up with it. If it's ahead and there was a remainder in the
         * tick division, the clock will slow down. Otherwise it will stay
         * ahead until the tick length changes to a non-divisible value.
         */
        tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
        mult += tk->ntp_err_mult;

        /* Apply only the difference to the multiplier currently in use */
        timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);

        if (unlikely(tk->tkr_mono.clock->maxadj &&
                (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
                        > tk->tkr_mono.clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
                        tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
                        (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
        }

        /*
         * It may be possible that when we entered this function, xtime_nsec
         * was very small.  Further, if we're slightly speeding the clocksource
         * in the code above, it's possible the required corrective factor to
         * xtime_nsec could cause it to underflow.
         *
         * Now, since we have already accumulated the second and the NTP
         * subsystem has been notified via second_overflow(), we need to skip
         * the next update.
         */
        if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
                tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
                                                        tk->tkr_mono.shift;
                tk->xtime_sec--;
                tk->skip_second_overflow = 1;
        }
}

/**
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 * @tk:        timekeeper to operate on
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 *
 * Return: TK_CLOCK_WAS_SET if a leap second was applied, 0 otherwise.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
        /* One second expressed in shifted nanoseconds */
        u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;

        while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;

                tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;

                /*
                 * Skip NTP update if this second was accumulated before,
                 * i.e. xtime_nsec underflowed in timekeeping_adjust()
                 */
                if (unlikely(tk->skip_second_overflow)) {
                        tk->skip_second_overflow = 0;
                        continue;
                }

                /* Figure out if it's a leap sec and apply if needed */
                leap = second_overflow(tk->xtime_sec);
                if (unlikely(leap)) {
                        struct timespec64 ts;

                        tk->xtime_sec += leap;

                        /* Shift wall_to_monotonic by the same leap amount */
                        ts.tv_sec = leap;
                        ts.tv_nsec = 0;
                        tk_set_wall_to_mono(tk,
                                timespec64_sub(tk->wall_to_monotonic, ts));

                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

                        clock_set = TK_CLOCK_WAS_SET;
                }
        }
        return clock_set;
}

/**
 * logarithmic_accumulation - shifted accumulation of cycles
 * @tk:                timekeeper to accumulate into
 * @offset:        cycles that have not been accumulated yet
 * @shift:        log2 of the number of cycle_intervals to accumulate at once
 * @clock_set:        OR-ed with the result of accumulate_nsecs_to_secs()
 *
 * This function accumulates a shifted interval of cycles into
 * a shifted interval of nanoseconds. Allows for O(log) accumulation
 * loop.
 *
 * Returns the unconsumed cycles.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
                                    u32 shift, unsigned int *clock_set)
{
        u64 interval = tk->cycle_interval << shift;
        u64 snsec_per_sec;

        /* If the offset is smaller than a shifted interval, do nothing */
        if (offset < interval)
                return offset;

        /* Accumulate one shifted interval */
        offset -= interval;
        tk->tkr_mono.cycle_last += interval;
        tk->tkr_raw.cycle_last  += interval;

        tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);

        /* Accumulate raw time */
        tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
        snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
        while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
                tk->tkr_raw.xtime_nsec -= snsec_per_sec;
                tk->raw_sec++;
        }

        /* Accumulate error between NTP and clock interval */
        tk->ntp_error += tk->ntp_tick << shift;
        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
                                                (tk->ntp_error_shift + shift);

        return offset;
}

/*
 * timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 *
 * @mode:        advance mode; with TK_ADV_TICK the function returns early when
 *                less than one cycle_interval has elapsed
 *
 * All accumulation is done on the shadow timekeeper, which is copied over
 * the real timekeeper inside the seqcount write section at the end.
 * Does nothing while timekeeping is suspended.
 */
static void timekeeping_advance(enum timekeeping_adv_mode mode)
{
        struct timekeeper *real_tk = &tk_core.timekeeper;
        struct timekeeper *tk = &shadow_timekeeper;
        u64 offset;
        int shift = 0, maxshift;
        unsigned int clock_set = 0;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);

        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                goto out;

#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
        offset = real_tk->cycle_interval;

        if (mode != TK_ADV_TICK)
                goto out;
#else
        offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
                                   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);

        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
                goto out;
#endif

        /* Do some additional sanity checking */
        timekeeping_check_update(tk, offset);

        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
         * we calculate the largest doubling multiple of cycle_intervals
         * that is smaller than the offset.  We then accumulate that
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= tk->cycle_interval) {
                offset = logarithmic_accumulation(tk, offset, shift,
                                                        &clock_set);
                /* Step down once the current chunk no longer fits */
                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }

        /* Adjust the multiplier to correct NTP error */
        timekeeping_adjust(tk, offset);

        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
        clock_set |= accumulate_nsecs_to_secs(tk);

        write_seqcount_begin(&tk_core.seq);
        /*
         * Update the real timekeeper.
         *
         * We could avoid this memcpy by switching pointers, but that
         * requires changes to all other timekeeper usage sites as
         * well, i.e. move the timekeeper pointer getter into the
         * spinlocked/seqcount protected sections. And we trade this
         * memcpy under the tk_core.seq against one before we start
         * updating.
         */
        timekeeping_update(tk, clock_set);
        memcpy(real_tk, tk, sizeof(*tk));
        /* The memcpy must come last. Do not put anything here! */
        write_seqcount_end(&tk_core.seq);
out:
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
        if (clock_set)
                /* Have to call _delayed version, since in irq context*/
                clock_was_set_delayed();
}

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Thin wrapper that runs timekeeping_advance() in TK_ADV_TICK mode.
 */
void update_wall_time(void)
{
        timekeeping_advance(TK_ADV_TICK);
}

/**
 * getboottime64 - Return the real time of system boot.
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the wall-time of boot in @ts, computed as the timekeeper's
 * offs_real minus offs_boot.
 *
 * This is based on the wall_to_monotonic offset and the total suspend
 * time. Calls to settimeofday will affect the value returned (which
 * basically means that however wrong your real time clock is at boot time,
 * you get the right time here).
 */
void getboottime64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        *ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}
EXPORT_SYMBOL_GPL(getboottime64);

/**
 * ktime_get_coarse_real_ts64 - Return the wall time as of the last update
 * @ts:                pointer to the timespec64 to be set
 *
 * Copies the timekeeper's xtime into @ts under the timekeeper seqcount.
 * The clocksource is not read.
 */
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int snapshot;

        do {
                snapshot = read_seqcount_begin(&tk_core.seq);
                *ts = tk_xtime(tk);
        } while (read_seqcount_retry(&tk_core.seq, snapshot));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);

void ktime_get_coarse_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 now, mono;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                now = tk_xtime(tk);
                mono = tk->wall_to_monotonic;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
                                now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);

/*
 * Advance jiffies_64 by @ticks and then call calc_global_load().
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        calc_global_load();
}

/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:        pointer to check and store the clock was set sequence number
 * @offs_real:        pointer to storage for monotonic -> realtime offset
 * @offs_boot:        pointer to storage for monotonic -> boottime offset
 * @offs_tai:        pointer to storage for monotonic -> clock tai offset
 *
 * Computes the current monotonic time.  The three offsets are refreshed
 * only when the value stored in @cwsseq differs from
 * timekeeper.clock_was_set_seq; in that case @cwsseq is brought up to date
 * as well.  Once the computed time has reached next_leap_ktime, @offs_real
 * is additionally written as offs_real minus one second.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 *
 * Return: current monotonic time.
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
                                     ktime_t *offs_boot, ktime_t *offs_tai)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t now;
        u64 delta_ns;

        for (;;) {
                seq = read_seqcount_begin(&tk_core.seq);

                /* Monotonic base plus the nanoseconds elapsed since then */
                now = tk->tkr_mono.base;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);
                now = ktime_add_ns(now, delta_ns);

                /* Caller's cached offsets are stale: hand out fresh ones */
                if (tk->clock_was_set_seq != *cwsseq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }

                /* Handle leapsecond insertion adjustments */
                if (unlikely(now >= tk->next_leap_ktime))
                        *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

                if (!read_seqcount_retry(&tk_core.seq, seq))
                        break;
        }

        return now;
}

/**
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 * @txc:        request to validate; it is only read, never modified
 *
 * Return: 0 if the request is acceptable, -EPERM if a required
 * CAP_SYS_TIME capability is missing, -EINVAL if a field is out of range.
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
        const unsigned int modes = txc->modes;

        if (!(modes & ADJ_ADJTIME)) {
                /* In order to modify anything, you gotta be super-user! */
                if (modes && !capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * A tick length more than 10% away from nominal means the
                 * quartz would be off by that much: something is VERY wrong!
                 */
                if (modes & ADJ_TICK) {
                        if (txc->tick < 900000/USER_HZ)
                                return -EINVAL;
                        if (txc->tick > 1100000/USER_HZ)
                                return -EINVAL;
                }
        } else {
                /* singleshot must not be used with any other mode bits */
                if (!(modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
                /* Only a read-only query is allowed without CAP_SYS_TIME */
                if (!(modes & ADJ_OFFSET_READONLY) &&
                    !capable(CAP_SYS_TIME))
                        return -EPERM;
        }

        if (modes & ADJ_SETOFFSET) {
                /* In order to inject time, you gotta be super-user! */
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * Validate the timespec/timeval used to inject a time
                 * offset.  Offsets can be positive or negative, so tv_sec
                 * is not checked.  The value of the timeval/timespec is
                 * the sum of its fields, but *NOTE*:
                 * The field tv_usec/tv_nsec must always be non-negative and
                 * we can't have more nanoseconds/microseconds than a second.
                 */
                if (txc->time.tv_usec < 0)
                        return -EINVAL;

                /* With ADJ_NANO the field carries nanoseconds */
                if (txc->time.tv_usec >=
                    ((modes & ADJ_NANO) ? NSEC_PER_SEC : USEC_PER_SEC))
                        return -EINVAL;
        }

        /*
         * Check for potential multiplication overflows that can
         * only happen on 64-bit systems:
         */
        if ((BITS_PER_LONG == 64) && (modes & ADJ_FREQUENCY)) {
                if (txc->freq < LLONG_MIN / PPM_SCALE ||
                    txc->freq > LLONG_MAX / PPM_SCALE)
                        return -EINVAL;
        }

        return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 *
 * Return: the value read from the clocksource currently attached to the
 * monotonic read base, or 0 while timekeeping is suspended or before any
 * clocksource has been installed.
 */
unsigned long random_get_entropy_fallback(void)
{
        struct clocksource *cs;

        /* Single load of the pointer; it is tested and then dereferenced */
        cs = READ_ONCE(tk_core.timekeeper.tkr_mono.clock);

        if (likely(!timekeeping_suspended && cs))
                return cs->read(cs);

        return 0;
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 * @txc:        timex request; checked by timekeeping_validate_timex() first
 *
 * Optionally injects a time offset (ADJ_SETOFFSET), then runs
 * __do_adjtimex() under timekeeper_lock inside the tk_core.seq write
 * section.  A changed TAI offset is propagated to the timekeeper.
 * Notifications and the frequency-triggered advance are issued only after
 * the lock has been released.
 *
 * Return: the error from validation or from timekeeping_inject_offset() if
 * either fails, otherwise the value returned by __do_adjtimex().
 */
int do_adjtimex(struct __kernel_timex *txc)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct audit_ntp_data ad;
        unsigned long flags;
        struct timespec64 ts;
        s32 orig_tai, tai;
        int ret;

        /* Validate the data before disabling interrupts */
        ret = timekeeping_validate_timex(txc);
        if (ret)
                return ret;
        /* The request bytes are handed to add_device_randomness() */
        add_device_randomness(txc, sizeof(*txc));

        if (txc->modes & ADJ_SETOFFSET) {
                struct timespec64 delta;
                delta.tv_sec  = txc->time.tv_sec;
                delta.tv_nsec = txc->time.tv_usec;
                /* Without ADJ_NANO the field is in microseconds: scale to ns */
                if (!(txc->modes & ADJ_NANO))
                        delta.tv_nsec *= 1000;
                ret = timekeeping_inject_offset(&delta);
                if (ret)
                        return ret;

                audit_tk_injoffset(delta);
        }

        audit_ntp_init(&ad);

        /* Current wall time is sampled before taking timekeeper_lock */
        ktime_get_real_ts64(&ts);
        add_device_randomness(&ts, sizeof(ts));

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* __do_adjtimex() may modify 'tai'; orig_tai keeps the old value */
        orig_tai = tai = tk->tai_offset;
        ret = __do_adjtimex(txc, &ts, &tai, &ad);

        if (tai != orig_tai) {
                __timekeeping_set_tai_offset(tk, tai);
                timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        }
        tk_update_leap_state(tk);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Audit logging is deferred until the lock has been dropped */
        audit_ntp_log(&ad);

        /* Update the multiplier immediately if frequency was set directly */
        if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
                timekeeping_advance(TK_ADV_FREQ);

        /* The TAI offset changed above: notify via clock_was_set() */
        if (tai != orig_tai)
                clock_was_set();

        ntp_notify_cmos_timer();

        return ret;
}

#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts:        passed through unchanged to __hardpps()
 * @raw_ts:        passed through unchanged to __hardpps()
 *
 * Runs __hardpps() with timekeeper_lock held (interrupts saved and
 * disabled) and inside the tk_core.seq write section.  Only built when
 * CONFIG_NTP_PPS is enabled.
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        __hardpps(phase_ts, raw_ts);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */

/**
 * xtime_update() - advances the timekeeping infrastructure
 * @ticks:        number of ticks, that have elapsed since the last call.
 *
 * Must be called with interrupts disabled.
 *
 * Updates jiffies through do_timer() under jiffies_lock and inside the
 * jiffies_seq write section, then advances the wall time.
 */
void xtime_update(unsigned long ticks)
{
        /* Plain raw_spin_lock(): the caller already has interrupts off */
        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);
        do_timer(ticks);
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);
        /* Wall time is advanced after jiffies_lock has been released */
        update_wall_time();
}



































































    1 


    1 
    1 
































    1 




















































    1 




    1 
    1 














    1 










































    1 


    1 

    1 









































































    1 



    1 






















    1 









    1 
    1 
















    1 






    1 
    1 

    1 


    1 


    1 


















    1 
















    1 




    1 



    1 
























































    1 
















































































































































































































































    1 






































































    1 





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>        /* grr. try_to_release_page,
                                   do_invalidatepage */
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Regular page slots are stabilized by the page lock even without the tree
 * itself locked.  These unlocked entries need verification under the tree
 * lock.
 *
 * Hence the slot at @index is re-read here and only cleared (with
 * nrexceptional decremented) if it still holds @entry; otherwise nothing
 * is changed.  Callers in this file invoke this with the i_pages lock held.
 */
static inline void __clear_shadow_entry(struct address_space *mapping,
                                pgoff_t index, void *entry)
{
        XA_STATE(xas, &mapping->i_pages, index);

        xas_set_update(&xas, workingset_update_node);

        /* Only drop the entry we were asked about, never a replacement */
        if (xas_load(&xas) == entry) {
                xas_store(&xas, NULL);
                mapping->nrexceptional--;
        }
}

/*
 * Locked wrapper around __clear_shadow_entry(): takes the i_pages lock with
 * interrupts disabled, clears the slot at @index if it still holds @entry,
 * and releases the lock again.
 */
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
                               void *entry)
{
        xa_lock_irq(&mapping->i_pages);
        __clear_shadow_entry(mapping, index, entry);
        xa_unlock_irq(&mapping->i_pages);
}

/*
 * Unconditionally remove exceptional entries. Usually called from truncate
 * path. Note that the pagevec may be altered by this function by removing
 * exceptional entries similar to what pagevec_remove_exceptionals does.
 *
 * @mapping: address space the entries belong to
 * @pvec:    pagevec filled by a lookup; compacted in place so that only
 *           non-value entries remain on return
 * @indices: page cache index of each @pvec entry
 * @end:     exclusive upper bound; value entries at or beyond it are
 *           dropped from @pvec but left in the mapping
 *
 * Nothing is done (and @pvec is left untouched) for shmem mappings or when
 * @pvec holds no value entries.
 */
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
                                struct pagevec *pvec, pgoff_t *indices,
                                pgoff_t end)
{
        int i, j;
        bool dax, lock;

        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return;

        /* Find the first value entry; everything before it stays in place */
        for (j = 0; j < pagevec_count(pvec); j++)
                if (xa_is_value(pvec->pages[j]))
                        break;

        /* No value entries at all: nothing to remove */
        if (j == pagevec_count(pvec))
                return;

        dax = dax_mapping(mapping);
        /*
         * The i_pages lock is taken once around the whole loop, and only
         * for non-DAX mappings whose first value entry lies below 'end'.
         */
        lock = !dax && indices[j] < end;
        if (lock)
                xa_lock_irq(&mapping->i_pages);

        for (i = j; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                pgoff_t index = indices[i];

                /* Real pages are kept: move them down to the next free slot */
                if (!xa_is_value(page)) {
                        pvec->pages[j++] = page;
                        continue;
                }

                /* Value entry outside the range: dropped from pvec only */
                if (index >= end)
                        continue;

                if (unlikely(dax)) {
                        dax_delete_mapping_entry(mapping, index);
                        continue;
                }

                __clear_shadow_entry(mapping, index, page);
        }

        if (lock)
                xa_unlock_irq(&mapping->i_pages);
        /* 'j' now counts the non-value entries that were kept */
        pvec->nr = j;
}

/*
 * Invalidate exceptional entry if easily possible. This handles exceptional
 * entries for invalidate_inode_pages().
 *
 * shmem mappings are handled by shmem itself and DAX mappings are left
 * alone; for every other mapping the entry at @index is cleared if it
 * still matches @entry.  Always returns 1.
 */
static int invalidate_exceptional_entry(struct address_space *mapping,
                                        pgoff_t index, void *entry)
{
        if (!shmem_mapping(mapping) && !dax_mapping(mapping))
                clear_shadow_entry(mapping, index, entry);

        return 1;
}

/*
 * Invalidate exceptional entry if clean. This handles exceptional entries for
 * invalidate_inode_pages2() so for DAX it evicts only clean entries.
 *
 * Returns 1 for shmem mappings (handled by shmem itself) and for ordinary
 * mappings after the entry has been cleared; for DAX mappings the result of
 * dax_invalidate_mapping_entry_sync() is returned.
 */
static int invalidate_exceptional_entry2(struct address_space *mapping,
                                         pgoff_t index, void *entry)
{
        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return 1;

        if (!dax_mapping(mapping)) {
                clear_shadow_entry(mapping, index, entry);
                return 1;
        }

        return dax_invalidate_mapping_entry_sync(mapping, index);
}

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 *
 * The mapping's ->invalidatepage operation is used when it is set.  If it
 * is not set, block_invalidatepage() is used on CONFIG_BLOCK kernels and
 * nothing is done otherwise.
 */
void do_invalidatepage(struct page *page, unsigned int offset,
                       unsigned int length)
{
        void (*handler)(struct page *, unsigned int, unsigned int) =
                page->mapping->a_ops->invalidatepage;

#ifdef CONFIG_BLOCK
        if (!handler)
                handler = block_invalidatepage;
#endif
        if (!handler)
                return;

        handler(page, offset, length);
}

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 *
 * Note that this helper does not perform that check itself: its callers,
 * truncate_inode_page() and the first pass of truncate_inode_pages_range(),
 * compare page->mapping against the mapping before calling it.
 *
 * Steps, in order: unmap the page if it is mapped, invalidate the whole
 * page (thp_size() bytes) if it has private data, then cancel dirty state
 * and clear the MappedToDisk flag.
 */
static void truncate_cleanup_page(struct page *page)
{
        if (page_mapped(page))
                unmap_mapping_page(page);

        if (page_has_private(page))
                do_invalidatepage(page, 0, thp_size(page));

        /*
         * Some filesystems seem to re-dirty the page even after
         * the VM has canceled the dirty bit (eg ext3 journaling).
         * Hence dirty accounting check is placed after invalidation.
         */
        cancel_dirty_page(page);
        ClearPageMappedToDisk(page);
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Gives up (returning 0) when the page no longer belongs to @mapping or
 * when its private data cannot be released.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, 0))
                return 0;

        return remove_mapping(mapping, page);
}

/*
 * Remove one page from the page cache of @mapping as part of truncation.
 * Must not be passed a tail page (VM_BUG_ON_PAGE).
 *
 * Returns 0 after the page has been cleaned up and deleted from the page
 * cache, or -EIO if the page no longer belongs to @mapping, in which case
 * it is left untouched.
 */
int truncate_inode_page(struct address_space *mapping, struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);

        if (page->mapping == mapping) {
                truncate_cleanup_page(page);
                delete_from_page_cache(page);
                return 0;
        }

        return -EIO;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 *
 * Returns -EINVAL when @mapping is NULL, -EIO when the mapping's host inode
 * is not a regular file, and otherwise whatever truncate_inode_page()
 * returns for the page.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
        if (!mapping)
                return -EINVAL;

        /*
         * Only punch for normal data pages for now.
         * Handling other types like directories would need more auditing.
         */
        if (S_ISREG(mapping->host->i_mode))
                return truncate_inode_page(mapping, page);

        return -EIO;
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * A page without a mapping, a dirty page, a page under writeback and a
 * page that is still mapped are all left alone.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        if (!mapping || PageDirty(page) || PageWriteback(page) ||
            page_mapped(page))
                return 0;

        return invalidate_complete_page(mapping, page);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidatepage() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 *
 * cleancache_invalidate_inode() is called on every exit path, including
 * the early exit taken when the mapping holds neither pages nor
 * exceptional entries.
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        pgoff_t                start;                /* inclusive */
        pgoff_t                end;                /* exclusive */
        unsigned int        partial_start;        /* inclusive */
        unsigned int        partial_end;        /* exclusive */
        struct pagevec        pvec;
        pgoff_t                indices[PAGEVEC_SIZE];
        pgoff_t                index;
        int                i;

        /* Empty mapping: skip straight to the cleancache notification */
        if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                goto out;

        /* Offsets within partial pages */
        partial_start = lstart & (PAGE_SIZE - 1);
        partial_end = (lend + 1) & (PAGE_SIZE - 1);

        /*
         * 'start' and 'end' always covers the range of pages to be fully
         * truncated. Partial pages are covered with 'partial_start' at the
         * start of the range and 'partial_end' at the end of the range.
         * Note that 'end' is exclusive while 'lend' is inclusive.
         */
        /* Round up: a partially covered first page is not fully truncated */
        start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (lend == -1)
                /*
                 * lend == -1 indicates end-of-file so we have to set 'end'
                 * to the highest possible pgoff_t and since the type is
                 * unsigned we're using -1.
                 */
                end = -1;
        else
                end = (lend + 1) >> PAGE_SHIFT;

        /*
         * First pass: non-blocking.  Pages that cannot be trylocked, are
         * under writeback or have changed mapping are skipped here and
         * left for the second pass.
         */
        pagevec_init(&pvec);
        index = start;
        while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE),
                        indices)) {
                /*
                 * Pagevec array has exceptional entries and we may also fail
                 * to lock some pages. So we store pages that can be deleted
                 * in a new pagevec.
                 */
                struct pagevec locked_pvec;

                pagevec_init(&locked_pvec);
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* We rely upon deletion not changing page->index */
                        index = indices[i];
                        if (index >= end)
                                break;

                        /* Value entries are dealt with after the batch below */
                        if (xa_is_value(page))
                                continue;

                        if (!trylock_page(page))
                                continue;
                        WARN_ON(page_to_index(page) != index);
                        if (PageWriteback(page)) {
                                unlock_page(page);
                                continue;
                        }
                        if (page->mapping != mapping) {
                                unlock_page(page);
                                continue;
                        }
                        pagevec_add(&locked_pvec, page);
                }
                /* Clean up, delete and unlock the locked pages as a batch */
                for (i = 0; i < pagevec_count(&locked_pvec); i++)
                        truncate_cleanup_page(locked_pvec.pages[i]);
                delete_from_page_cache_batch(mapping, &locked_pvec);
                for (i = 0; i < pagevec_count(&locked_pvec); i++)
                        unlock_page(locked_pvec.pages[i]);
                truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
                pagevec_release(&pvec);
                cond_resched();
                /* Continue the lookup after the last index examined */
                index++;
        }
        /*
         * Partial page at the head of the range.  'start' was rounded up,
         * so that page lives at index start - 1.
         */
        if (partial_start) {
                struct page *page = find_lock_page(mapping, start - 1);
                if (page) {
                        unsigned int top = PAGE_SIZE;
                        if (start > end) {
                                /* Truncation within a single page */
                                top = partial_end;
                                /* Keeps the tail block below from running too */
                                partial_end = 0;
                        }
                        wait_on_page_writeback(page);
                        zero_user_segment(page, partial_start, top);
                        cleancache_invalidate_page(mapping, page);
                        if (page_has_private(page))
                                do_invalidatepage(page, partial_start,
                                                  top - partial_start);
                        unlock_page(page);
                        put_page(page);
                }
        }
        /* Partial page at the tail of the range: zero its first bytes */
        if (partial_end) {
                struct page *page = find_lock_page(mapping, end);
                if (page) {
                        wait_on_page_writeback(page);
                        zero_user_segment(page, 0, partial_end);
                        cleancache_invalidate_page(mapping, page);
                        if (page_has_private(page))
                                do_invalidatepage(page, 0,
                                                  partial_end);
                        unlock_page(page);
                        put_page(page);
                }
        }
        /*
         * If the truncation happened within a single page no pages
         * will be released, just zeroed, so we can bail out now.
         */
        if (start >= end)
                goto out;

        /*
         * Second pass: blocking.  Page locks and writeback are waited for,
         * and the scan restarts from 'start' until a lookup beginning
         * there finds nothing left inside the range.
         */
        index = start;
        for ( ; ; ) {
                cond_resched();
                if (!pagevec_lookup_entries(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
                        /* If all gone from start onwards, we're done */
                        if (index == start)
                                break;
                        /* Otherwise restart to make sure all gone */
                        index = start;
                        continue;
                }
                if (index == start && indices[0] >= end) {
                        /* All gone out of hole to be punched, we're done */
                        pagevec_remove_exceptionals(&pvec);
                        pagevec_release(&pvec);
                        break;
                }

                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* We rely upon deletion not changing page->index */
                        index = indices[i];
                        if (index >= end) {
                                /* Restart punch to make sure all gone */
                                /* start - 1 becomes start via the index++ below */
                                index = start - 1;
                                break;
                        }

                        if (xa_is_value(page))
                                continue;

                        lock_page(page);
                        WARN_ON(page_to_index(page) != index);
                        wait_on_page_writeback(page);
                        /* Return value ignored: -EIO means the page already left */
                        truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
                truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
                pagevec_release(&pvec);
                index++;
        }

out:
        cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        /* lend == -1 is treated as end-of-file by truncate_inode_pages_range() */
        truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_mutex.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 *
 * Marks the mapping as exiting, cycles the i_pages lock if the mapping
 * still holds pages or exceptional entries, and then truncates everything
 * from offset 0.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
        unsigned long nrexceptional;
        unsigned long nrpages;

        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
         * inode teardown.  Tell it when the address space is exiting,
         * so that it does not install eviction information after the
         * final truncate has begun.
         */
        mapping_set_exiting(mapping);

        /*
         * When reclaim installs eviction entries, it increases
         * nrexceptional first, then decreases nrpages.  Make sure we see
         * this in the right order or we might miss an entry.
         */
        nrpages = mapping->nrpages;
        /* Read barrier: nrpages must be loaded before nrexceptional */
        smp_rmb();
        nrexceptional = mapping->nrexceptional;

        if (nrpages || nrexceptional) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
                 * modification that does not see AS_EXITING is
                 * completed before starting the final truncate.
                 */
                /* Lock and immediately unlock; nothing is done while held */
                xa_lock_irq(&mapping->i_pages);
                xa_unlock_irq(&mapping->i_pages);
        }

        /*
         * Cleancache needs notification even if there are no pages or shadow
         * entries.
         */
        /* Hence called unconditionally, outside the check above */
        truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/*
 * __invalidate_mapping_pages - common worker behind invalidate_mapping_pages()
 * and invalidate_mapping_pagevec()
 * @mapping: the address_space whose page cache is scanned
 * @start: first page index to consider
 * @end: last page index to consider (inclusive)
 * @nr_pagevec: optional counter; when non-NULL it is incremented once for
 *              every page that invalidate_inode_page() did not invalidate
 *
 * Looks up entries in batches of at most PAGEVEC_SIZE.  Value entries are
 * handed to invalidate_exceptional_entry().  Pages that cannot be locked
 * with trylock_page() are skipped, as are THP tail pages and huge pages
 * that extend beyond @end.  Every other page is passed to
 * invalidate_inode_page(); when that returns 0 the page is handed to
 * deactivate_file_page() instead.
 *
 * Return: the sum of the values returned by invalidate_inode_page().
 */
static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        pgoff_t index = start;
        unsigned long ret;
        unsigned long count = 0;
        int i;

        pagevec_init(&pvec);
        /* Never ask for more entries than remain in [index, end]. */
        while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
                        indices)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* We rely upon deletion not changing page->index */
                        index = indices[i];
                        if (index > end)
                                break;

                        if (xa_is_value(page)) {
                                invalidate_exceptional_entry(mapping, index,
                                                             page);
                                continue;
                        }

                        /* Best effort only: never sleep on a page lock. */
                        if (!trylock_page(page))
                                continue;

                        WARN_ON(page_to_index(page) != index);

                        /* Middle of THP: skip */
                        if (PageTransTail(page)) {
                                unlock_page(page);
                                continue;
                        } else if (PageTransHuge(page)) {
                                /* Step both cursors past the rest of the THP. */
                                index += HPAGE_PMD_NR - 1;
                                i += HPAGE_PMD_NR - 1;
                                /*
                                 * 'end' is in the middle of THP. Don't
                                 * invalidate the page as the part outside of
                                 * 'end' could be still useful.
                                 */
                                if (index > end) {
                                        unlock_page(page);
                                        continue;
                                }

                                /* Take a pin outside pagevec */
                                get_page(page);

                                /*
                                 * Drop extra pins before trying to invalidate
                                 * the huge page.
                                 */
                                pagevec_remove_exceptionals(&pvec);
                                pagevec_release(&pvec);
                        }

                        ret = invalidate_inode_page(page);
                        unlock_page(page);
                        /*
                         * Invalidation is a hint that the page is no longer
                         * of interest and try to speed up its reclaim.
                         */
                        if (!ret) {
                                deactivate_file_page(page);
                                /* It is likely on the pagevec of a remote CPU */
                                if (nr_pagevec)
                                        (*nr_pagevec)++;
                        }

                        /* Drop the pin taken above for the huge page case. */
                        if (PageTransHuge(page))
                                put_page(page);
                        count += ret;
                }
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                cond_resched();
                /* Continue the lookup right after the last index handled. */
                index++;
        }
        return count;
}

/**
 * invalidate_mapping_pages - drop the unlocked pagecache pages of one inode
 * @mapping: the address_space whose pages are to be invalidated
 * @start: first page offset to invalidate
 * @end: last page offset to invalidate (inclusive)
 *
 * Only unlocked pages are removed.  Callers that need every page of the
 * inode gone must use truncate_inode_pages() instead.
 *
 * This function will not block on IO activity, and it leaves alone pages
 * that are dirty, locked, under writeback or mapped into pagetables.
 *
 * Return: the number of the pages that were invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        unsigned long nr_invalidated;

        /* This entry point does not track pages left behind on a pagevec. */
        nr_invalidated = __invalidate_mapping_pages(mapping, start, end, NULL);

        return nr_invalidated;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * invalidate_mapping_pagevec - invalidate unlocked pages and count leftovers
 * @mapping: the address_space whose pages are to be invalidated
 * @start: first page offset to invalidate
 * @end: last page offset to invalidate (inclusive)
 * @nr_pagevec: incremented for every page that could not be invalidated and
 *              is therefore likely sitting on a pagevec; for use by the caller
 *
 * Behaves like invalidate_mapping_pages(), except that the number of
 * invalidated pages is not returned and the leftover pages are accounted
 * in @nr_pagevec instead.
 */
void invalidate_mapping_pagevec(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
        (void)__invalidate_mapping_pages(mapping, start, end, nr_pagevec);
}

/*
 * Variant of invalidate_complete_page() that does not look at the page's
 * refcount.  invalidate_inode_pages2() needs stronger invalidation
 * guarantees: it cannot leave a page behind merely because
 * shrink_page_list() holds a temporary reference on it, or because the
 * page is transiently sitting in the lru_cache_add() pagevecs.
 *
 * Returns 1 when the page was removed from the page cache, 0 when it no
 * longer belongs to @mapping, its private data could not be released, or
 * it turned out to be dirty.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
        unsigned long irqflags;

        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;

        xa_lock_irqsave(&mapping->i_pages, irqflags);
        if (PageDirty(page)) {
                /* Dirty pages stay in the cache; report failure. */
                xa_unlock_irqrestore(&mapping->i_pages, irqflags);
                return 0;
        }

        BUG_ON(page_has_private(page));
        __delete_from_page_cache(page, NULL);
        xa_unlock_irqrestore(&mapping->i_pages, irqflags);

        if (mapping->a_ops->freepage)
                mapping->a_ops->freepage(page);

        /* Drop the reference that the page cache held. */
        put_page(page);
        return 1;
}

/*
 * Hand a dirty page of @mapping to the ->launder_page() address space
 * operation and return its result.  Returns 0 without doing anything when
 * the page is clean, no longer belongs to @mapping, or the operation is
 * not provided.
 */
static int do_launder_page(struct address_space *mapping, struct page *page)
{
        if (!PageDirty(page))
                return 0;
        if (page->mapping != mapping)
                return 0;
        if (!mapping->a_ops->launder_page)
                return 0;

        return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Each page is locked with lock_page() and writeback is waited for, so this
 * function may sleep.  Dirty pages are passed to do_launder_page() before
 * invalidation is attempted.
 *
 * Return: 0 on success, -EBUSY if any pages could not be invalidated, or the
 * negative value returned by do_launder_page() for the last page that failed.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        pgoff_t index;
        int i;
        int ret = 0;
        int ret2 = 0;
        int did_range_unmap = 0;

        /* Nothing cached: skip the scan but still notify cleancache below. */
        if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                goto out;

        pagevec_init(&pvec);
        index = start;
        /* Never ask for more entries than remain in [index, end]. */
        while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
                        indices)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* We rely upon deletion not changing page->index */
                        index = indices[i];
                        if (index > end)
                                break;

                        if (xa_is_value(page)) {
                                if (!invalidate_exceptional_entry2(mapping,
                                                                   index, page))
                                        ret = -EBUSY;
                                continue;
                        }

                        if (!did_range_unmap && page_mapped(page)) {
                                /*
                                 * If page is mapped, before taking its lock,
                                 * zap the rest of the file in one hit.
                                 */
                                unmap_mapping_pages(mapping, index,
                                                (1 + end - index), false);
                                did_range_unmap = 1;
                        }

                        lock_page(page);
                        WARN_ON(page_to_index(page) != index);
                        /* Page left this mapping while we waited for the lock. */
                        if (page->mapping != mapping) {
                                unlock_page(page);
                                continue;
                        }
                        wait_on_page_writeback(page);

                        /* The page may have been mapped again since the range unmap. */
                        if (page_mapped(page))
                                unmap_mapping_page(page);
                        BUG_ON(page_mapped(page));

                        ret2 = do_launder_page(mapping, page);
                        if (ret2 == 0) {
                                if (!invalidate_complete_page2(mapping, page))
                                        ret2 = -EBUSY;
                        }
                        /* Remember the failure but keep processing the range. */
                        if (ret2 < 0)
                                ret = ret2;
                        unlock_page(page);
                }
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                cond_resched();
                index++;
        }
        /*
         * For DAX we invalidate page tables after invalidating page cache.  We
         * could invalidate page tables while invalidating each entry however
         * that would be expensive. And doing range unmapping before doesn't
         * work as we have no cheap way to find whether page cache entry didn't
         * get remapped later.
         */
        if (dax_mapping(mapping)) {
                unmap_mapping_pages(mapping, start, end - start + 1, false);
        }
out:
        cleancache_invalidate_inode(mapping);
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * The inode's new i_size must already have been written by the time this
 * function is called.
 *
 * Filesystems should typically call this before releasing the resources
 * that back the freed range (e.g. before deallocating blocks).  That keeps
 * the pagecache logically coherent with the on-disk format, so the
 * filesystem never sees writepage called for a page whose underlying
 * blocks are already gone.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
        loff_t hole_start = round_up(newsize, PAGE_SIZE);
        struct address_space *as = inode->i_mapping;

        /*
         * The range is unmapped twice.  The first pass is only an
         * optimisation: it saves truncate_inode_pages() from doing many
         * single-page unmaps.  Between that pass and the end of
         * truncate_inode_pages(), private pages can still be COWed and
         * would survive the truncate, so the second pass is required
         * for correctness.
         */
        unmap_mapping_range(as, hole_start, 0, 1);
        truncate_inode_pages(as, newsize);
        unmap_mapping_range(as, hole_start, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * Writes @newsize to i_size and then truncates the pagecache to @newsize
 * (if necessary).  It is typically called from the filesystem's setattr
 * function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_mutex but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        /* Sample the old size before it is overwritten below. */
        const loff_t prev_size = inode->i_size;

        i_size_write(inode, newsize);

        /* Growing the file: let the pagecache know about the extension. */
        if (prev_size < newsize)
                pagecache_isize_extended(inode, prev_size, newsize);

        truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode:        inode for which i_size was extended
 * @from:        original inode size
 * @to:                new inode size
 *
 * Deals with i_size growing, whether through an extending truncate or a
 * write that starts beyond the current i_size.  The page straddling the
 * old i_size is marked read-only so that the next write access to it goes
 * through page_mkwrite().  The filesystem can thus rely on page_mkwrite()
 * being called before user space stores to that page via mmap once i_size
 * has changed.
 *
 * Call this after i_size has been updated, so that a page fault arriving
 * after the page is unlocked already sees the new i_size.  i_mutex must
 * still be held: that keeps i_size stable and prevents user space from
 * observing the new i_size before mmap writes at the new size can be
 * stored.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
        int blocksize = i_blocksize(inode);
        loff_t block_end;
        struct page *straddler;

        WARN_ON(to > inode->i_size);

        /* Nothing to do unless the size grew and blocks are not page sized. */
        if (from >= to)
                return;
        if (blocksize == PAGE_SIZE)
                return;

        /* Page straddling @from will not have any hole block created? */
        block_end = round_up(from, blocksize);
        if (to <= block_end)
                return;
        if ((block_end & (PAGE_SIZE - 1)) == 0)
                return;

        straddler = find_lock_page(inode->i_mapping, from >> PAGE_SHIFT);
        /* Page not cached? Nothing to do */
        if (!straddler)
                return;

        /*
         * See clear_page_dirty_for_io() for details why set_page_dirty()
         * is needed.
         */
        if (page_mkclean(straddler))
                set_page_dirty(straddler);

        unlock_page(straddler);
        put_page(straddler);
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * Filesystems should typically call this before releasing the resources
 * that back the freed range (e.g. before deallocating blocks).  That keeps
 * the pagecache logically coherent with the on-disk format, so the
 * filesystem never sees writepage called for a page whose underlying
 * blocks are already gone.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = inode->i_mapping;
        /*
         * This rounding is currently just for example: unmap_mapping_range
         * expands its hole outwards, whereas we want it to contract the hole
         * inwards.  However, existing callers of truncate_pagecache_range are
         * doing their own page rounding first.  Note that unmap_mapping_range
         * allows holelen 0 for all, and we allow lend -1 for end of file.
         */
        loff_t first_byte = round_up(lstart, PAGE_SIZE);
        loff_t last_byte = round_down(1 + lend, PAGE_SIZE) - 1;

        /*
         * In contrast to truncate_pagecache(), the range is unmapped only
         * once, before the pagecache is truncated, and the "even_cows"
         * flag is left clear: punching a hole must not throw away private
         * COWed pages inside the hole.
         */
        if ((u64)last_byte > (u64)first_byte) {
                loff_t holelen = 1 + last_byte - first_byte;

                unmap_mapping_range(mapping, first_byte, holelen, 0);
        }

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM fib

#if !defined(_TRACE_FIB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FIB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/ip_fib.h>
#include <linux/tracepoint.h>

/*
 * fib_table_lookup - trace event describing one IPv4 FIB table lookup
 *
 * Records the table id, the fields of the flow key @flp copied below, the
 * lookup result @err and, when a nexthop @nhc is supplied, its device name
 * and gateway.  Ports are recorded (in host byte order) only for TCP and
 * UDP flows and are 0 otherwise.  A missing nexthop device is reported as
 * "-".  The gateway that does not apply (or both, when the nexthop has no
 * IPv4/IPv6 gateway or there is no nexthop at all) is reported as zero.
 */
TRACE_EVENT(fib_table_lookup,

        TP_PROTO(u32 tb_id, const struct flowi4 *flp,
                 const struct fib_nh_common *nhc, int err),

        TP_ARGS(tb_id, flp, nhc, err),

        TP_STRUCT__entry(
                __field(        u32,        tb_id                )
                __field(        int,        err                )
                __field(        int,        oif                )
                __field(        int,        iif                )
                __field(        u8,        proto                )
                __field(        __u8,        tos                )
                __field(        __u8,        scope                )
                __field(        __u8,        flags                )
                __array(        __u8,        src,        4        )
                __array(        __u8,        dst,        4        )
                __array(        __u8,        gw4,        4        )
                __array(        __u8,        gw6,        16        )
                __field(        u16,        sport                )
                __field(        u16,        dport                )
                __dynamic_array(char,  name,   IFNAMSIZ )
        ),

        TP_fast_assign(
                struct in6_addr in6_zero = {};
                struct net_device *dev;
                struct in6_addr *in6;
                __be32 *p32;

                __entry->tb_id = tb_id;
                __entry->err = err;
                __entry->oif = flp->flowi4_oif;
                __entry->iif = flp->flowi4_iif;
                __entry->tos = flp->flowi4_tos;
                __entry->scope = flp->flowi4_scope;
                __entry->flags = flp->flowi4_flags;

                p32 = (__be32 *) __entry->src;
                *p32 = flp->saddr;

                p32 = (__be32 *) __entry->dst;
                *p32 = flp->daddr;

                __entry->proto = flp->flowi4_proto;
                if (__entry->proto == IPPROTO_TCP ||
                    __entry->proto == IPPROTO_UDP) {
                        __entry->sport = ntohs(flp->fl4_sport);
                        __entry->dport = ntohs(flp->fl4_dport);
                } else {
                        __entry->sport = 0;
                        __entry->dport = 0;
                }

                dev = nhc ? nhc->nhc_dev : NULL;
                __assign_str(name, dev ? dev->name : "-");

                /*
                 * Clear both gateway fields first.  A nexthop whose
                 * gateway family is neither AF_INET nor AF_INET6 would
                 * otherwise leave them unwritten and the event would
                 * print whatever the record held before.
                 */
                p32 = (__be32 *) __entry->gw4;
                *p32 = 0;

                in6 = (struct in6_addr *)__entry->gw6;
                *in6 = in6_zero;

                if (nhc) {
                        if (nhc->nhc_gw_family == AF_INET)
                                *p32 = nhc->nhc_gw.ipv4;
                        else if (nhc->nhc_gw_family == AF_INET6)
                                *in6 = nhc->nhc_gw.ipv6;
                }
        ),

        TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4/%pI6c err %d",
                  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
                  __entry->src, __entry->sport, __entry->dst, __entry->dport,
                  __entry->tos, __entry->scope, __entry->flags,
                  __get_str(name), __entry->gw4, __entry->gw6, __entry->err)
);
#endif /* _TRACE_FIB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

/*
 * Define a hashtable called @name with 1 << @bits buckets, every bucket
 * initialized to HLIST_HEAD_INIT.
 */
#define DEFINE_HASHTABLE(name, bits)                                                \
        struct hlist_head name[1 << (bits)] =                                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/* Same as DEFINE_HASHTABLE(), with the array annotated __read_mostly. */
#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)                                \
        struct hlist_head name[1 << (bits)] __read_mostly =                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * Declare a hashtable of 1 << @bits buckets without an initializer; use
 * hash_init() to initialize it.
 */
#define DECLARE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)]

/* Number of buckets; @name must be the array itself, not a pointer to it. */
#define HASH_SIZE(name) (ARRAY_SIZE(name))
/* ilog2() of the number of buckets of @name. */
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */
#define hash_min(val, bits)                                                        \
        (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

/* Initialize each of the @sz bucket heads in @ht with INIT_HLIST_HEAD(). */
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bucket = 0;

        while (bucket < sz) {
                INIT_HLIST_HEAD(&ht[bucket]);
                bucket++;
        }
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Calculates the size of the hashtable from the given parameter, otherwise
 * same as __hash_init().
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * derives the number of buckets from the array type via ARRAY_SIZE().
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * The bucket is selected with hash_min(@key, HASH_BITS(@hashtable)) and
 * @node is added at the head of that bucket's list.  @hashtable must be the
 * array itself, because HASH_BITS() does not work on pointers.
 */
#define hash_add(hashtable, node, key)                                                \
        hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * Selects the bucket the same way as hash_add(), but inserts @node with
 * hlist_add_head_rcu().
 */
#define hash_add_rcu(hashtable, node, key)                                        \
        hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 *
 * Return: true unless hlist_unhashed() reports @node as unhashed.
 */
static inline bool hash_hashed(struct hlist_node *node)
{
        return hlist_unhashed(node) == 0;
}

/*
 * Return true when all @sz buckets of @ht are empty, false as soon as one
 * non-empty bucket is found.
 */
static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bucket = 0;

        while (bucket < sz) {
                if (!hlist_empty(&ht[bucket]))
                        return false;
                bucket++;
        }

        return true;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * derives the number of buckets from the array type via ARRAY_SIZE().
 *
 * Evaluates to true when every bucket of @hashtable is empty.
 */
#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))

/**
 * hash_del - remove an object from a hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init().
 * NOTE(review): presumably @node is left unhashed afterwards, so that
 * hash_hashed() returns false for it - confirm against hlist_del_init().
 */
static inline void hash_del(struct hlist_node *node)
{
        hlist_del_init(node);
}

/**
 * hash_del_rcu - remove an object from a rcu enabled hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init_rcu().
 */
static inline void hash_del_rcu(struct hlist_node *node)
{
        hlist_del_init_rcu(node);
}

/**
 * hash_for_each - iterate over a hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Expands to two nested loops: the outer one walks the buckets, the inner
 * one walks the entries of the current bucket.  The outer loop only moves
 * on to the next bucket while @obj is NULL, so leaving the inner loop with
 * a non-NULL @obj (e.g. via break) ends the whole iteration.
 */
#define hash_for_each(name, bkt, obj, member)                                \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry(obj, &name[bkt], member)

/**
 * hash_for_each_rcu - iterate over a rcu enabled hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Walks every bucket in turn and iterates its entries with
 * hlist_for_each_entry_rcu().  The bucket loop stops as soon as @obj is
 * non-NULL after the inner loop.
 */
#define hash_for_each_rcu(name, bkt, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_rcu(obj, &name[bkt], member)

/**
 * hash_for_each_safe - iterate over a hashtable safe against removal of
 * hash entry
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @tmp: a &struct hlist_node used for temporary storage
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Walks every bucket in turn and iterates its entries with
 * hlist_for_each_entry_safe(), which uses @tmp as its scratch cursor.
 */
#define hash_for_each_safe(name, bkt, tmp, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)

/**
 * hash_for_each_possible - iterate over all possible objects hashing to the
 * same bucket
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min(@key, HASH_BITS(@name)) is walked.
 * The macro itself does not compare keys, so the loop body sees every entry
 * stored in that bucket.
 */
#define hash_for_each_possible(name, obj, member, key)                        \
        hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
 * same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 * @cond: optional extra argument(s), passed through unchanged to
 *        hlist_for_each_entry_rcu()
 *        (NOTE(review): presumably a lockdep condition - confirm)
 */
#define hash_for_each_possible_rcu(name, obj, member, key, cond...)        \
        hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\
                member, ## cond)

/**
 * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
 * to the same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * This is the same as hash_for_each_possible_rcu() except that it does
 * not do any RCU debugging or tracing.
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
        hlist_for_each_entry_rcu_notrace(obj, \
                &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min(@key, HASH_BITS(@name)) is walked,
 * using hlist_for_each_entry_safe() with @tmp as its scratch cursor.
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)        \
        hlist_for_each_entry_safe(obj, tmp,\
                &name[hash_min(key, HASH_BITS(name))], member)


#endif













1
2
3
4
5
6
7
8
9
10
11
12
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MSDOS_FS_H
#define _LINUX_MSDOS_FS_H

#include <uapi/linux/msdos_fs.h>

/*
 * Check the media byte of a boot sector.
 * Returns 1 when @media is 0xf0 or lies in the range 0xf8..0xff, else 0.
 */
static inline int fat_valid_media(u8 media)
{
        if (media == 0xf0)
                return 1;

        return media >= 0xf8;
}
#endif /* !_LINUX_MSDOS_FS_H */














































    1 
    1 







































    1 






    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POLL_H
#define _LINUX_POLL_H


#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <uapi/linux/poll.h>
#include <uapi/linux/eventpoll.h>

extern struct ctl_table epoll_table[]; /* for sysctl */
/* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
   additional memory. */
#define MAX_STACK_ALLOC 832
/* Part of that budget named by both SELECT_STACK_ALLOC and POLL_STACK_ALLOC. */
#define FRONTEND_STACK_ALLOC        256
#define SELECT_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC        FRONTEND_STACK_ALLOC
/* What is left of MAX_STACK_ALLOC once the front-end part is subtracted. */
#define WQUEUES_STACK_ALLOC        (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
/* How many struct poll_table_entry fit into WQUEUES_STACK_ALLOC bytes. */
#define N_INLINE_POLL_ENTRIES        (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))

/* Event mask that vfs_poll() reports for files without a ->poll() method. */
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

struct poll_table_struct;

/*
 * structures and helpers for f_op->poll implementations
 */

/*
 * Callback type stored in poll_table::_qproc.  poll_wait() calls it with the
 * file, the wait queue head and the poll table it was given.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
        /* Queueing callback; NULL means poll_wait() does nothing. */
        poll_queue_proc _qproc;
        /* Event mask reported by poll_requested_events(). */
        __poll_t _key;
} poll_table;

/*
 * Register the caller on @wait_address through the poll table's queueing
 * callback.  Does nothing when there is no poll table, no callback, or no
 * wait queue head.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        if (!p || !p->_qproc || !wait_address)
                return;

        p->_qproc(filp, wait_address, p);

        /*
         * Full barrier, paired with the one in wq_has_sleeper().  As the
         * comment above prepare_to_wait() explains, later tests made by
         * this thread must not be reordered before the __add_wait_queue()
         * performed in the _qproc() paths.
         */
        smp_mb();
}

/*
 * Return true if it is guaranteed that poll will not wait. This is the case
 * if the poll() of another file descriptor in the set got an event, so there
 * is no need for waiting.
 */
static inline bool poll_does_not_wait(const poll_table *p)
{
        /* No poll table at all: nothing can be queued. */
        if (!p)
                return true;

        /* A table without a queueing callback does not wait either. */
        return !p->_qproc;
}

/*
 * Return the set of events that the application wants to poll for.
 * This is useful for drivers that need to know whether a DMA transfer has
 * to be started implicitly on poll(). You typically only want to do that
 * if the application is actually polling for POLLIN and/or POLLOUT.
 */
static inline __poll_t poll_requested_events(const poll_table *p)
{
        /* Without a poll table, report every event bit as requested. */
        if (!p)
                return ~(__poll_t)0;

        return p->_key;
}

/*
 * Initialise a poll table: install @qproc as the queueing callback and
 * request all events.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
        /* every event bit set, i.e. all events enabled */
        pt->_key = ~(__poll_t)0;
        pt->_qproc = qproc;
}

/* Return true when the file's operations table provides a ->poll method. */
static inline bool file_can_poll(struct file *file)
{
        return file->f_op->poll != NULL;
}

/*
 * Poll @file through its ->poll method.  Files that have no such method
 * report DEFAULT_POLLMASK instead.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        return unlikely(!file->f_op->poll) ? DEFAULT_POLLMASK
                                           : file->f_op->poll(file, pt);
}

/*
 * Element type of the inline_entries array in struct poll_wqueues; its size
 * also determines N_INLINE_POLL_ENTRIES.
 * NOTE(review): the per-field notes below are inferred from names and types
 * only -- confirm against the code that fills these entries.
 */
struct poll_table_entry {
        struct file *filp;                /* presumably the file being polled */
        __poll_t key;                        /* presumably the event mask of interest */
        wait_queue_entry_t wait;        /* wait queue entry embedded in this record */
        wait_queue_head_t *wait_address;        /* presumably the head @wait is queued on */
};

/*
 * Structures and helpers for select/poll syscall
 */
/*
 * State handed to poll_initwait() and poll_freewait().
 * NOTE(review): field roles other than @pt and @inline_entries are not
 * visible in this header -- confirm against the implementation.
 */
struct poll_wqueues {
        poll_table pt;                /* embedded poll table */
        struct poll_table_page *table;
        struct task_struct *polling_task;
        int triggered;
        int error;
        int inline_index;
        /* Sized so the array fits within WQUEUES_STACK_ALLOC bytes. */
        struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/*
 * select/poll helpers implemented outside this header.
 * NOTE(review): judging by the names, poll_initwait()/poll_freewait() set up
 * and tear down a struct poll_wqueues -- confirm against the definitions.
 */
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern u64 select_estimate_accuracy(struct timespec64 *tv);

/* Largest positive s64 value divided by HZ, minus one. */
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)

/* Takes the three user-space fd sets and an end time; defined elsewhere. */
extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time);

/* Takes a seconds/nanoseconds pair and a timespec64 to fill; defined elsewhere. */
extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
                                   long nsec);

/*
 * __MAP() relocates the bits of @v selected by mask @from to the position of
 * mask @to: the masked value is multiplied by to/from when @to is the larger
 * mask and divided by from/to otherwise.
 * NOTE(review): this is only a clean bit move if @from and @to are
 * single-bit (power-of-two) masks -- confirm against the POLL and EPOLL
 * definitions.  The arguments are not parenthesized, so only simple
 * operands should be passed.  The macro is private to this header and is
 * #undef'd after its two users.
 */
#define __MAP(v, from, to) \
        (from < to ? (v & from) * (to/from) : (v & from) / (from/to))

/*
 * Convert a __poll_t event mask into its __u16 POLL<X> form.  The value is
 * cast to __u16, each listed EPOLL<X> bit is mapped onto the matching
 * POLL<X> bit with __MAP(), and the results are OR-ed together.
 */
static inline __u16 mangle_poll(__poll_t val)
{
        __u16 v = (__force __u16)val;
#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}

/*
 * Opposite direction of mangle_poll(): map each listed POLL<X> bit of @val
 * onto the matching EPOLL<X> bit and return the OR of the results as a
 * __poll_t.
 */
static inline __poll_t demangle_poll(u16 val)
{
#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}
#undef __MAP


#endif /* _LINUX_POLL_H */





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Asymmetric Public-key cryptography key type interface
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _KEYS_ASYMMETRIC_TYPE_H
#define _KEYS_ASYMMETRIC_TYPE_H

#include <linux/key-type.h>
#include <linux/verification.h>

extern struct key_type key_type_asymmetric;

/*
 * The key payload is four words.  The asymmetric-type key uses them as
 * follows:
 */
/* Indices into key->payload.data[]; asymmetric_key_ids() reads slot asym_key_ids. */
enum asymmetric_payload_bits {
        asym_crypto,                /* The data representing the key */
        asym_subtype,                /* Pointer to an asymmetric_key_subtype struct */
        asym_key_ids,                /* Pointer to an asymmetric_key_ids struct */
        asym_auth                /* The key's authorisation (signature, parent key ID) */
};

/*
 * Identifiers for an asymmetric key ID.  We have three ways of looking up a
 * key derived from an X.509 certificate:
 *
 * (1) Serial Number & Issuer.  Non-optional.  This is the only valid way to
 *     map a PKCS#7 signature to an X.509 certificate.
 *
 * (2) Issuer & Subject Unique IDs.  Optional.  These were the original way to
 *     match X.509 certificates, but have fallen into disuse in favour of (3).
 *
 * (3) Auth & Subject Key Identifiers.  Optional.  SKIDs are only provided on
 *     CA keys that are intended to sign other keys, so don't appear in end
 *     user certificates unless forced.
 *
 * We could also support a PGP key identifier, which is just a SHA1 sum of the
 * public key and certain parameters, but since we don't support PGP keys at
 * the moment, we shall ignore those.
 *
 * What we actually do is provide a place where binary identifiers can be
 * stashed and then compare against them when checking for an id match.
 */
/* A binary identifier stored as a length followed by a flexible byte array. */
struct asymmetric_key_id {
        unsigned short        len;        /* presumably the number of bytes in data[] -- confirm */
        unsigned char        data[];        /* identifier bytes (flexible array member) */
};

/*
 * Two untyped identifier slots.
 * NOTE(review): presumably each slot points to a struct asymmetric_key_id --
 * confirm against the code that fills them.
 */
struct asymmetric_key_ids {
        void                *id[2];
};

/*
 * Identifier helpers implemented outside this header.
 * NOTE(review): the names suggest an exact comparison, a partial comparison
 * and construction of an ID from two buffers -- confirm against the
 * definitions.
 */
extern bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1,
                                   const struct asymmetric_key_id *kid2);

extern bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1,
                                      const struct asymmetric_key_id *kid2);

extern struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1,
                                                            size_t len_1,
                                                            const void *val_2,
                                                            size_t len_2);
/* Fetch the identifier set stored in the asym_key_ids slot of @key's payload. */
static inline
const struct asymmetric_key_ids *asymmetric_key_ids(const struct key *key)
{
        const struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids];

        return kids;
}

/*
 * Defined elsewhere.  Takes a keyring, two identifiers and a @partial flag.
 * NOTE(review): presumably @partial selects partial rather than exact ID
 * matching -- confirm against the definition.
 */
extern struct key *find_asymmetric_key(struct key *keyring,
                                       const struct asymmetric_key_id *id_0,
                                       const struct asymmetric_key_id *id_1,
                                       bool partial);

/*
 * The payload is at the discretion of the subtype.
 */

#endif /* _KEYS_ASYMMETRIC_TYPE_H */































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


































































































































































































































































































































    6 



























































































    6 

    6 



    6 


    6 












































    6 




    6 

























































    6 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator.  */
/*
 * MAX_ORDER defaults to 11 and is replaced by CONFIG_FORCE_MAX_ZONEORDER
 * when that option is defined.  MAX_ORDER_NR_PAGES is 2^(MAX_ORDER - 1),
 * which is 1024 with the default.
 */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Migration types.  MIGRATE_TYPES is the number of types and sizes both
 * free_area.free_list[] and migratetype_names[].  MIGRATE_CMA and
 * MIGRATE_ISOLATE only exist when their config options are enabled.
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        /* Shares its value with MIGRATE_PCPTYPES, so it has no pcp list. */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * The MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks, and the page allocator never
         * implicitly changes the migration type of a MIGRATE_CMA pageblock.
         *
         * The way to use it is to change the migratetype of a range of
         * pageblocks to MIGRATE_CMA, which can be done by the
         * __free_pageblock_cma() function.  What is important though
         * is that the range of pageblocks must be aligned to
         * MAX_ORDER_NR_PAGES should the biggest page be bigger than
         * a single pageblock.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA predicates.  With CONFIG_CMA they compare against MIGRATE_CMA (the
 * migratetype variant is wrapped in unlikely()); without it both expand to
 * the constant false and do not evaluate their argument.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#endif

/* True for MIGRATE_MOVABLE and, when CMA is configured, for MIGRATE_CMA. */
static inline bool is_migrate_movable(int mt)
{
        if (is_migrate_cma(mt))
                return true;

        return mt == MIGRATE_MOVABLE;
}

/*
 * Visit every (order, type) pair: @order runs from 0 to MAX_ORDER - 1 in the
 * outer loop and @type from 0 to MIGRATE_TYPES - 1 in the inner loop.  Both
 * arguments must be assignable variables.  The statement following the macro
 * is the inner loop body, so a "break" there only leaves the inner loop.
 */
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < MAX_ORDER; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)

extern int page_group_by_mobility_disabled;

/* Mask with the low PB_migratetype_bits bits set. */
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

/*
 * Look up the pageblock flags of @page, masked with MIGRATETYPE_MASK, via
 * get_pfnblock_flags_mask().  @page appears twice in the expansion, so it
 * must not have side effects.
 */
#define get_pageblock_migratetype(page)                                        \
        get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

/* One list head per migrate type plus a counter. */
struct free_area {
        /* Indexed by enum migratetype; entries are struct page linked via ->lru. */
        struct list_head        free_list[MIGRATE_TYPES];
        /* NOTE(review): presumably the number of free blocks in this area -- confirm. */
        unsigned long                nr_free;
};

/*
 * Return the first page on @area's free list for @migratetype, or NULL when
 * that list is empty.
 */
static inline struct page *get_page_from_free_area(struct free_area *area,
                                            int migratetype)
{
        struct list_head *mt_list = &area->free_list[migratetype];

        return list_first_entry_or_null(mt_list, struct page, lru);
}

/* Tell whether @area's free list for @migratetype has no entries. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        struct list_head *mt_list = &area->free_list[migratetype];

        return list_empty(mt_list);
}

struct pglist_data;

/*
 * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines.  There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 */
#if defined(CONFIG_SMP)
/*
 * Marker type whose only member is a zero-length array.
 * NOTE(review): the separation described above presumably comes from the
 * ____cacheline_internodealigned_in_smp attribute on the type rather than
 * from stored data -- confirm against linux/cache.h.
 */
struct zone_padding {
        char x[0];
} ____cacheline_internodealigned_in_smp;
/* Declares a padding member called @name; the semicolon is part of the macro. */
#define ZONE_PADDING(name)        struct zone_padding name;
#else
/* Without CONFIG_SMP, ZONE_PADDING() expands to nothing. */
#define ZONE_PADDING(name)
#endif

/*
 * NUMA allocation counters.  NR_VM_NUMA_STAT_ITEMS sizes the
 * vm_numa_stat_diff[] array in struct per_cpu_pageset and is defined as 0
 * when CONFIG_NUMA is off.
 */
#ifdef CONFIG_NUMA
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_STAT_ITEMS
};
#else
#define NR_VM_NUMA_STAT_ITEMS 0
#endif

/*
 * Per-zone counters.  NR_VM_ZONE_STAT_ITEMS is the number of counters and
 * sizes the vm_stat_diff[] array in struct per_cpu_pageset.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        NR_PAGETABLE,                /* used for pagetables */
        /* Second 128 byte cacheline */
        NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Per-node counters.  NR_VM_NODE_STAT_ITEMS is the number of counters and
 * sizes the vm_node_stat_diff[] array in struct per_cpu_nodestat.  The first
 * five entries start at NR_LRU_BASE and follow the order of enum lru_list.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        /* The two _B items are the ones vmstat_item_in_bytes() reports as byte-sized. */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_WRITEBACK_TEMP,        /* Writeback using temporary buffers */
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_VM_NODE_STAT_ITEMS
};

/*
 * Returns true if the value is measured in bytes (most vmstat values are
 * measured in pages). This defines the API part, the internal representation
 * might be different.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        switch (idx) {
        /*
         * The global and per-node slab counters track slab pages, and
         * changes are expected to be multiples of PAGE_SIZE; internally
         * the values are kept in pages.
         *
         * The per-memcg and per-lruvec counters track the memory used by
         * individual slab objects and really are byte-precise.
         */
        case NR_SLAB_RECLAIMABLE_B:
        case NR_SLAB_UNRECLAIMABLE_B:
                return true;
        default:
                return false;
        }
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
/* Offsets combined below to form the enum lru_list values. */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2

/*
 * LRU list indices.  With the offsets above the values are 0 (inactive anon),
 * 1 (active anon), 2 (inactive file), 3 (active file) and 4 (unevictable);
 * NR_LRU_LISTS is 5 and sizes the lists[] array in struct lruvec.
 */
enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        LRU_UNEVICTABLE,
        NR_LRU_LISTS
};

/* Iterate @lru over every list index, 0 .. NR_LRU_LISTS - 1. */
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

/* Iterate @lru over 0 .. LRU_ACTIVE_FILE, i.e. every list except LRU_UNEVICTABLE. */
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

/* True for the two file LRU lists (inactive and active). */
static inline bool is_file_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_INACTIVE_FILE:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/* True for the two active LRU lists (anon and file). */
static inline bool is_active_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_ACTIVE_ANON:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

#define ANON_AND_FILE 2

/* State flags kept in the flags member of struct lruvec. */
enum lruvec_flags {
        LRUVEC_CONGESTED,                /* lruvec has many dirty pages
                                         * backed by a congested BDI
                                         */
};

/*
 * A set of LRU lists together with the reclaim bookkeeping that goes with
 * them.
 */
struct lruvec {
        /* One list per enum lru_list value. */
        struct list_head                lists[NR_LRU_LISTS];
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_MEMCG
        /* Only present with CONFIG_MEMCG. */
        struct pglist_data *pgdat;
#endif
};

/*
 * Flag bits for isolate_mode_t.  The macros name the type before its typedef
 * below; that works because a macro body is only parsed where it is used.
 */
/* Isolate unmapped pages */
#define ISOLATE_UNMAPPED        ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/* LRU Isolation modes. */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Indices into the _watermark[] array of struct zone; NR_WMARK is the array
 * size.
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        NR_WMARK
};

/*
 * Watermark accessors.  @z is a pointer to a zone; each macro yields the
 * selected _watermark[] entry plus the zone's current watermark_boost.
 * wmark_pages() takes the index (an enum zone_watermarks value) explicitly.
 *
 * @z is parenthesized in the expansion so that any pointer-valued expression
 * (for example "zones + i" or "&zone") can be passed.  It is still evaluated
 * twice, so avoid arguments with side effects.
 */
#define min_wmark_pages(z) ((z)->_watermark[WMARK_MIN] + (z)->watermark_boost)
#define low_wmark_pages(z) ((z)->_watermark[WMARK_LOW] + (z)->watermark_boost)
#define high_wmark_pages(z) ((z)->_watermark[WMARK_HIGH] + (z)->watermark_boost)
#define wmark_pages(z, i) ((z)->_watermark[i] + (z)->watermark_boost)

/* Page lists and their bookkeeping; embedded as pcp in struct per_cpu_pageset. */
struct per_cpu_pages {
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int batch;                /* chunk size for buddy add/remove */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[MIGRATE_PCPTYPES];
};

/*
 * The page lists plus config-dependent statistics deltas; struct zone
 * refers to it through a __percpu pointer.
 * NOTE(review): the meaning of expire and stat_threshold is not visible in
 * this header -- confirm against the users.
 */
struct per_cpu_pageset {
        struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
        s8 expire;
        /* One slot per enum numa_stat_item counter. */
        u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
        s8 stat_threshold;
        /* One slot per enum zone_stat_item counter. */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

/*
 * Node-level counterpart of the statistics part of struct per_cpu_pageset.
 * NOTE(review): the meaning of stat_threshold is not visible here -- confirm.
 */
struct per_cpu_nodestat {
        s8 stat_threshold;
        /* One slot per enum node_stat_item counter. */
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone identifiers.  ZONE_NORMAL and ZONE_MOVABLE always exist; the others
 * depend on CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32, CONFIG_HIGHMEM and
 * CONFIG_ZONE_DEVICE.  __MAX_NR_ZONES is the number of zones configured.
 * This enum is deliberately outside the !__GENERATING_BOUNDS_H sections of
 * this header.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Memory offlining might
         *    retry a long time.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

#define ASYNC_AND_SYNC 2

struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        unsigned long watermark_boost;

        unsigned long nr_reserved_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable and/or whether it will be released eventually, so to avoid
         * totally wasting several GB of RAM we must reserve some of the lower
         * zone memory (otherwise we risk running OOM on the lower zones
         * despite there being tons of freeable RAM on the higher zones).  This
         * array is recalculated at runtime if the sysctl_lowmem_reserve_ratio
         * sysctl changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NEED_MULTIPLE_NODES
        /* Node id of this zone; accessed via zone_to_nid()/zone_set_nid() */
        int node;
#endif
        /* Owning node; zone_idx() locates the zone in zone_pgdat->node_zones[] */
        struct pglist_data        *zone_pgdat;
        struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/end(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;

        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblocks. It is used to solve the incorrect
         * freepage counting problem caused by racy retrieval of a pageblock's
         * migratetype. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* Non-zero once the zone is set up; read by zone_is_initialized() */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        ZONE_PADDING(_pad1_)

        /* free areas of different sizes */
        struct free_area        free_area[MAX_ORDER];

        /* zone flags, see below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Write-intensive fields used by compaction and vmstats. */
        ZONE_PADDING(_pad2_)

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        /* NOTE(review): presumably the initial scanner positions -- confirm */
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        bool                        contiguous;

        ZONE_PADDING(_pad3_)
        /* Zone statistics */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * Per-node state flags.
 * NOTE(review): presumably bit numbers for pglist_data->flags -- confirm
 * against the users of these constants.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

/*
 * Per-zone state flags.
 * NOTE(review): presumably bit numbers for zone->flags -- confirm against
 * the users of these constants.
 */
enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
};

/* Read the current value of the atomic zone->managed_pages counter. */
static inline unsigned long zone_managed_pages(struct zone *zone)
{
        long nr_managed = atomic_long_read(&zone->managed_pages);

        return (unsigned long)nr_managed;
}

/* First pfn past the zone: zone_start_pfn + spanned_pages (holes included). */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        const unsigned long first_pfn = zone->zone_start_pfn;
        const unsigned long nr_spanned = zone->spanned_pages;

        return first_pfn + nr_spanned;
}

/* True when @pfn lies in the half-open range [zone_start_pfn, zone_end_pfn). */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

/* True when zone->initialized is non-zero. */
static inline bool zone_is_initialized(struct zone *zone)
{
        return zone->initialized != 0;
}

/* A zone is empty when it spans no pages at all. */
static inline bool zone_is_empty(struct zone *zone)
{
        return !zone->spanned_pages;
}

/*
 * Return true if the range [start_pfn, start_pfn + nr_pages) overlaps the
 * pfn range spanned by the given zone. An empty zone never intersects.
 */
static inline bool zone_intersects(struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long zone_first, zone_limit;

        if (zone_is_empty(zone))
                return false;

        zone_first = zone->zone_start_pfn;
        zone_limit = zone_end_pfn(zone);

        /* Overlap: range begins before the zone ends and ends after it begins */
        return start_pfn < zone_limit && start_pfn + nr_pages > zone_first;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/*
 * Maximum number of zones on a zonelist: one entry for every zone type on
 * every possible node.
 */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/*
 * Indices into pglist_data->node_zonelists[]; MAX_ZONELISTS is the size of
 * that array.
 */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. The zone index
 * is stored alongside the pointer to avoid dereferencing large structures and
 * doing table lookups while walking a zonelist.
 */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        /*
         * One slot more than MAX_ZONES_PER_ZONELIST.
         * NOTE(review): presumably room for a terminating entry whose ->zone
         * is NULL, which is what the zonelist iterators stop on -- confirm.
         */
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

#ifndef CONFIG_DISCONTIGMEM
/*
 * The global array of struct pages. With discontigmem this is not declared;
 * use pgdat->lmem_map instead.
 */
extern struct page *mem_map;
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deferred split queue state, embedded in pglist_data as
 * deferred_split_queue.
 * NOTE(review): presumably split_queue_lock guards the list and
 * split_queue_len counts its entries -- confirm against the THP code.
 */
struct deferred_split {
        spinlock_t split_queue_lock;
        struct list_head split_queue;
        unsigned long split_queue_len;
};
#endif

/*
 * On NUMA machines, each NUMA node has a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP        /* means !SPARSEMEM */
        /* Per-node struct page array; indexed by pgdat_page_nr() */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;        /* Protected by
                                           mem_hotplug_begin/end() */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        ZONE_PADDING(_pad1_)
        spinlock_t                lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         *
         * Without CONFIG_MEMCG, lruvec_pgdat() maps a lruvec back to its node
         * via container_of() on this member.
         */
        struct lruvec                __lruvec;

        unsigned long                flags;

        ZONE_PADDING(_pad2_)

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

/* Accessors that resolve a node id through NODE_DATA() */
#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)
/*
 * struct page for the pagenr-th page of a node: an index into node_mem_map
 * when the node has a flat mem_map, otherwise a pfn_to_page() lookup relative
 * to node_start_pfn.
 */
#ifdef CONFIG_FLAT_NODE_MEM_MAP
#define pgdat_page_nr(pgdat, pagenr)        ((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr)        pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
#define nid_page_nr(nid, pagenr)         pgdat_page_nr(NODE_DATA(nid),(pagenr))

/* First pfn of the node and the first pfn past its spanned range */
#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/* First pfn past the node: node_start_pfn + node_spanned_pages. */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        const unsigned long first_pfn = pgdat->node_start_pfn;
        const unsigned long nr_spanned = pgdat->node_spanned_pages;

        return first_pfn + nr_spanned;
}

/* A node is empty when both its start pfn and its spanned page count are 0. */
static inline bool pgdat_is_empty(pg_data_t *pgdat)
{
        if (pgdat->node_start_pfn)
                return false;

        return pgdat->node_spanned_pages == 0;
}

#include <linux/memory_hotplug.h>

/*
 * Zonelist construction, kswapd wakeup and watermark checks. Only the
 * prototypes live in this header; the definitions are elsewhere.
 */
void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, used to differentiate memory added
 * statically by the platform from memory added via the memory hotplug
 * interface.
 */
enum meminit_context {
        MEMINIT_EARLY,
        MEMINIT_HOTPLUG,
};

/* Zone and lruvec initialisation helpers; defined outside this header. */
extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

/*
 * Map a lruvec to the node it belongs to. With CONFIG_MEMCG the lruvec carries
 * its own pgdat pointer; otherwise the lruvec is the one embedded in
 * pglist_data as __lruvec and the node is recovered with container_of().
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
        struct pglist_data *owner;

#ifdef CONFIG_MEMCG
        owner = lruvec->pgdat;
#else
        owner = container_of(lruvec, struct pglist_data, __lruvec);
#endif
        return owner;
}

/* Size of one LRU list of @lruvec; defined outside this header. */
extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
/*
 * Without CONFIG_HAVE_MEMORYLESS_NODES the local memory node of @node_id is
 * @node_id itself.
 */
static inline int local_memory_node(int node_id)
{
        return node_id;
}
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 * It is computed as the zone's offset within its node's node_zones[] array.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

/*
 * Returns true if a zone has pages managed by the buddy allocator.
 * All reclaim decisions have to use this function rather than
 * populated_zone(): if the whole zone is reserved we can easily end up
 * with populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone) != 0;
}

/* Returns true if a zone has memory, i.e. present_pages is non-zero */
static inline bool populated_zone(struct zone *zone)
{
        return zone->present_pages != 0;
}

#ifdef CONFIG_NEED_MULTIPLE_NODES
/* Node id stored in the zone. */
static inline int zone_to_nid(struct zone *zone)
{
        const int nid = zone->node;

        return nid;
}

/* Record @nid as the node id of @zone. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
        zone->node = nid;
}
#else
/* Single-node configuration: every zone is on node 0. */
static inline int zone_to_nid(struct zone *zone)
{
        return 0;
}

/* Single-node configuration: nothing to record. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
}
#endif

extern int movable_zone;

#ifdef CONFIG_HIGHMEM
/*
 * Non-zero when the zone compared against ZONE_HIGHMEM matches: movable_zone
 * with multiple nodes, otherwise the zone type just below ZONE_MOVABLE.
 */
static inline int zone_movable_is_highmem(void)
{
        int movable_src;

#ifdef CONFIG_NEED_MULTIPLE_NODES
        movable_src = movable_zone;
#else
        movable_src = ZONE_MOVABLE - 1;
#endif
        return movable_src == ZONE_HIGHMEM;
}
#endif

/*
 * Returns 1 if @idx is ZONE_HIGHMEM, or is ZONE_MOVABLE while
 * zone_movable_is_highmem() holds; 0 otherwise. Always 0 without
 * CONFIG_HIGHMEM.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && zone_movable_is_highmem();
#else
        return 0;
#endif
}

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
/* Without CONFIG_ZONE_DMA this is a constant false. */
static inline bool has_managed_dma(void)
{
        return false;
}
#endif

/**
 * is_highmem - quickly check whether a struct zone is a highmem zone
 * @zone: pointer to struct zone variable
 *
 * This is an attempt to keep references to ZONE_{DMA/NORMAL/HIGHMEM/etc} in
 * general code to a minimum.
 *
 * Return: the result of is_highmem_idx() for the zone's index; always 0
 * without CONFIG_HIGHMEM.
 */
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
        const enum zone_type idx = zone_idx(zone);

        return is_highmem_idx(idx);
#else
        return 0;
#endif
}

/*
 * sysctl handlers and the tunables they expose. The original note here says
 * the first two functions are used to set up the per-zone pages min values.
 */
struct ctl_table;

int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
                size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
                size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
extern int percpu_pagelist_fraction;
extern char numa_zonelist_order[];
/* NOTE(review): presumably the size of numa_zonelist_order[] -- confirm */
#define NUMA_ZONELIST_ORDER_LEN        16

#ifndef CONFIG_NEED_MULTIPLE_NODES

/*
 * Single-node configuration: every node id resolves to the one global
 * contig_page_data, and the node mem_map is the global mem_map.
 */
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)                (&contig_page_data)
#define NODE_MEM_MAP(nid)        mem_map

#else /* CONFIG_NEED_MULTIPLE_NODES */

/* Multiple nodes: the architecture header is included to supply these */
#include <asm/mmzone.h>

#endif /* !CONFIG_NEED_MULTIPLE_NODES */

/* Iteration primitives behind for_each_online_pgdat() and for_each_zone() */
extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 *
 * Starts at first_online_pgdat() and advances with next_online_pgdat() until
 * that returns NULL.
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in. Iteration starts at the first zone of the first online node
 * and advances with next_zone() until that returns NULL.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/*
 * Like for_each_zone(), but the loop body only runs for zones for which
 * populated_zone() is true. The trailing if/else lets the caller's statement
 * bind to the else branch.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

/* Zone pointer stored in a zonelist entry. */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        struct zone *target = zoneref->zone;

        return target;
}

/* Zone index cached in a zonelist entry. */
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
        const int cached_idx = zoneref->zone_idx;

        return cached_idx;
}

/* Node id of the zone a zonelist entry points at. */
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
        struct zone *target = zoneref->zone;

        return zone_to_nid(target);
}

/*
 * Out-of-line search that next_zones_zonelist() falls back to when its
 * inline fast path does not apply.
 */
struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /*
         * Slow path: a nodemask filter is in effect, or the cursor's zone
         * index is above the limit. Otherwise the cursor itself qualifies.
         */
        if (unlikely(nodes || zonelist_zone_idx(z) > highest_zoneidx))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        return z;
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 *
 * Return: Zoneref pointer for the first suitable zone found. When no eligible
 * zone is found, zoneref->zone is NULL (zoneref itself is never NULL). This
 * may happen either genuinely, or due to concurrent nodemask update due to
 * cpuset modification.
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        struct zoneref *cursor = zonelist->_zonerefs;

        return next_zones_zonelist(cursor, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask. The loop ends when the current zoneref's zone is
 * NULL.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/*
 * Same walk as for_each_zone_zonelist_nodemask(), but it continues from the
 * caller's existing cursor @z instead of starting at the head of a zonelist.
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = z->zone;        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 * It is for_each_zone_zonelist_nodemask() with a NULL nodemask.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/*
 * Whether the 'nodes' are all movable nodes, i.e. none of them offers a zone
 * at or below ZONE_NORMAL. An empty nodemask yields false.
 */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *fallback_list;
        struct zoneref *first_ref;
        int nid;

        if (nodes_empty(*nodes))
                return false;

        /*
         * We can choose an arbitrary node from the nodemask to get a
         * zonelist, as they are interlinked. We just need to find
         * at least one zone that can satisfy kernel allocations.
         */
        nid = first_node(*nodes);
        fallback_list = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
        first_ref = first_zones_zonelist(fallback_list, ZONE_NORMAL, nodes);

        return !first_ref->zone;
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#ifdef CONFIG_FLATMEM
/* With FLATMEM every pfn is reported as belonging to node 0 */
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * SECTION_SHIFT                    #bits space required to store a section #
 *
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

/* Total number of memory sections */
#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

/* Pages in one section, and the mask that clears the in-section pfn bits */
#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

/* Pageblock flag bits per section: pageblocks per section * NR_PAGEBLOCK_BITS */
#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

/* Build-time check relating the largest allocation order to the section size */
#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif

/* Section number containing @pfn: the pfn with its in-section bits dropped. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        const unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

        return section_nr;
}
/* First pfn of section number @sec. */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        const unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

        return first_pfn;
}

/* Round a pfn up / down to a section boundary */
#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

/* A subsection covers 1UL << 21 bytes (2 MiB) of physical address space */
#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

/* Subsection geometry expressed in pages rather than bytes */
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

/* A subsection may not be larger than the section that contains it */
#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

/* Round a pfn up / down to a subsection boundary */
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

struct mem_section_usage {
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};

/* Defined outside this header. */
void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

/* Forward declarations: only pointers to these are needed below */
struct page;
struct page_ext;
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         */
        unsigned long section_mem_map;

        /* Usage data; holds the pageblock flags returned by section_to_usemap() */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the size a power of 2 -- confirm */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

/*
 * Sections are grouped into "roots". With SPARSEMEM_EXTREME a root holds as
 * many mem_sections as fit in one page; otherwise each root holds exactly one.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

/* Root index of a section number, number of roots, and in-root index mask */
#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

/*
 * The section table: a table of root pointers with SPARSEMEM_EXTREME (both
 * the table and individual roots may be NULL, see __nr_to_section()), a
 * static two-dimensional array otherwise.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * Look up the mem_section for section number @nr. Returns NULL when the root
 * index is out of range or, with SPARSEMEM_EXTREME, when the section table
 * or the relevant root is NULL.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root = SECTION_NR_TO_ROOT(nr);
        const unsigned long slot = nr & SECTION_ROOT_MASK;

        if (unlikely(root >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
        if (!mem_section[root])
                return NULL;
#endif
        return &mem_section[root][slot];
}
/* Defined outside this header. */
extern unsigned long __section_nr(struct mem_section *ms);
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available.
 *
 * SECTION_NID_SHIFT is set to 6, the full width of that guaranteed low-bit
 * field, so that a node id encoded in section_mem_map during early boot can
 * never overlap any SECTION_* flag bit. With a shift of 3 the node id
 * overlapped SECTION_IS_EARLY (bit 3).
 */
#define        SECTION_MARKED_PRESENT        (1UL<<0)
#define SECTION_HAS_MEM_MAP        (1UL<<1)
#define SECTION_IS_ONLINE        (1UL<<2)
#define SECTION_IS_EARLY        (1UL<<3)
#define SECTION_MAP_LAST_BIT        (1UL<<4)
#define SECTION_MAP_MASK        (~(SECTION_MAP_LAST_BIT-1))
#define SECTION_NID_SHIFT        6

/* Strip the low flag bits from section_mem_map to obtain the encoded pointer. */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        return (struct page *)(section->section_mem_map & SECTION_MAP_MASK);
}

/* 1 if @section is non-NULL and has SECTION_MARKED_PRESENT set, else 0. */
static inline int present_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_MARKED_PRESENT) != 0;
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return present_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_HAS_MEM_MAP set, else 0. */
static inline int valid_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_HAS_MEM_MAP) != 0;
}

/* 1 if @section is non-NULL and has SECTION_IS_EARLY set, else 0. */
static inline int early_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_EARLY) != 0;
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return valid_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_IS_ONLINE set, else 0. */
static inline int online_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_ONLINE) != 0;
}

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return online_section(ms);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/* Operate on the sections covering a pfn range; defined outside this header */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#ifdef CONFIG_MEMORY_HOTREMOVE
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif
#endif

/* mem_section containing @pfn; may be NULL, see __nr_to_section(). */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        const unsigned long section_nr = pfn_to_section_nr(pfn);

        return __nr_to_section(section_nr);
}

/* Upper bound for the scan in next_present_section_nr() */
extern unsigned long __highest_present_section_nr;

/* Index of the subsection containing @pfn, counted within its own section. */
static inline int subsection_map_index(unsigned long pfn)
{
        const unsigned long offset_in_section = pfn & ~(PAGE_SECTION_MASK);

        return offset_in_section / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Test the subsection bit for @pfn in @ms's usage bitmap.
 *
 * ms->usage is sampled once with READ_ONCE(); when it is NULL the pfn is
 * reported as not valid (0).  Otherwise the result of test_bit() on
 * usage->subsection_map is returned.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        struct mem_section_usage *usage = READ_ONCE(ms->usage);

        if (!usage)
                return 0;

        return test_bit(subsection_map_index(pfn), usage->subsection_map);
}
#else
/*
 * Variant used when CONFIG_SPARSEMEM_VMEMMAP is not set: there is no
 * subsection check, every pfn is reported valid.  Both arguments are
 * ignored.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}
#endif

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/*
 * Report whether @pfn is valid according to its memory section.
 *
 * Returns 0 when the section number is out of range (>= NR_MEM_SECTIONS)
 * or the section fails valid_section().  Otherwise returns non-zero when
 * the section is an early section or pfn_section_valid() accepts the pfn.
 * The section checks run between rcu_read_lock() and rcu_read_unlock().
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *ms;
        int ret = 0;

        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        ms = __pfn_to_section(pfn);

        rcu_read_lock();
        /*
         * Early sections have traditionally been reported valid across
         * the whole section-sized span, so the subsection check is only
         * consulted for the others.
         */
        if (valid_section(ms))
                ret = early_section(ms) || pfn_section_valid(ms, pfn);
        rcu_read_unlock();

        return ret;
}
#endif

/*
 * Return non-zero when @pfn's section number is in range and the section
 * passes present_section(); 0 otherwise.
 */
static inline int pfn_in_present_section(unsigned long pfn)
{
        if (pfn_to_section_nr(pfn) < NR_MEM_SECTIONS)
                return present_section(__pfn_to_section(pfn));

        return 0;
}

/*
 * Find the first present section with a number strictly greater than
 * @section_nr, scanning up to and including __highest_present_section_nr.
 *
 * Returns that section number, or -1 (converted to unsigned long) when no
 * further present section exists.
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        for (section_nr++; section_nr <= __highest_present_section_nr;
             section_nr++) {
                if (present_section_nr(section_nr))
                        return section_nr;
        }

        return -1;
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
#ifdef CONFIG_NUMA
/*
 * Node id of @pfn, obtained via page_to_nid(pfn_to_page(pfn)).  The
 * argument is copied into a local first so it is evaluated exactly once.
 */
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
/* Without CONFIG_NUMA the node id is always 0; @pfn is not evaluated. */
#define pfn_to_nid(pfn)                (0)
#endif

/* Prototype only; defined outside this header. */
void sparse_init(void);
#else
/*
 * Alternative branch of the conditional closed below as CONFIG_SPARSEMEM
 * (its opening is before this part of the file): the init hooks become
 * no-ops and pfn_in_present_section is an alias for pfn_valid.
 */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * may treat start/end as pfns or sections.
 */
/* One cached pfn-to-nid lookup result (most recent range and its node). */
struct mminit_pfnnid_cache {
        unsigned long last_start;        /* start of the cached range (pfn or section, per implementation) */
        unsigned long last_end;                /* end of the cached range; NOTE(review): inclusive/exclusive not visible here */
        int last_nid;                        /* node id found for that range */
};

/*
 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
 * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
 * pfn_valid_within() should be used in this case; we optimise this away
 * when we have no holes within a MAX_ORDER_NR_PAGES block.
 */
#ifdef CONFIG_HOLES_IN_ZONE
/* Holes are possible: perform the real pfn_valid() check. */
#define pfn_valid_within(pfn) pfn_valid(pfn)
#else /* !CONFIG_HOLES_IN_ZONE */
/* No holes: constant 1; @pfn is not evaluated. */
#define pfn_valid_within(pfn) (1)
#endif /* CONFIG_HOLES_IN_ZONE */

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BLOCKGROUP_LOCK_H
#define _LINUX_BLOCKGROUP_LOCK_H
/*
 * Per-blockgroup locking for ext2 and ext3.
 *
 * Simple hashed spinlocking.
 */

#include <linux/spinlock.h>
#include <linux/cache.h>

/*
 * Number of hashed locks.  bgl_lock_ptr() selects a lock with
 * "block_group & (NR_BG_LOCKS-1)", so the value must be a power of two:
 * on SMP it is 4 << ilog2(min(NR_CPUS, 32)), i.e. at most 128; otherwise 1.
 */
#ifdef CONFIG_SMP
#define NR_BG_LOCKS        (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32))
#else
#define NR_BG_LOCKS        1
#endif

/*
 * One hashed lock.  The ____cacheline_aligned_in_smp annotation applies to
 * the whole struct, i.e. to every element of blockgroup_lock.locks[].
 */
struct bgl_lock {
        spinlock_t lock;
} ____cacheline_aligned_in_smp;

/* Table of NR_BG_LOCKS locks, indexed by the hash in bgl_lock_ptr(). */
struct blockgroup_lock {
        struct bgl_lock locks[NR_BG_LOCKS];
};

/*
 * Initialise every spinlock in @bgl's lock table.
 *
 * The argument expression handed to spin_lock_init() is left exactly as
 * "&bgl->locks[i].lock".  NOTE(review): spin_lock_init() may be a macro
 * that uses the argument text (e.g. as a debug lock name) - confirm before
 * rewriting that expression.
 */
static inline void bgl_lock_init(struct blockgroup_lock *bgl)
{
        int i = 0;

        while (i < NR_BG_LOCKS) {
                spin_lock_init(&bgl->locks[i].lock);
                i++;
        }
}

/*
 * Return the spinlock that @block_group hashes to: the low bits of the
 * group number (masked with NR_BG_LOCKS-1) index @bgl's lock table.
 */
static inline spinlock_t *
bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group)
{
        unsigned int slot = block_group & (NR_BG_LOCKS-1);

        return &bgl->locks[slot].lock;
}

#endif


































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM tcp

#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TCP_H

#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/tracepoint.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <linux/sock_diag.h>

/*
 * Store the IPv4 addresses @saddr and @daddr into __entry->saddr_v6 and
 * __entry->daddr_v6 via ipv6_addr_set_v4mapped().  The entry must have
 * saddr_v6/daddr_v6 arrays large enough to be viewed as struct in6_addr.
 */
#define TP_STORE_V4MAPPED(__entry, saddr, daddr)                \
        do {                                                        \
                struct in6_addr *pin6;                                \
                                                                \
                pin6 = (struct in6_addr *)__entry->saddr_v6;        \
                ipv6_addr_set_v4mapped(saddr, pin6);                \
                pin6 = (struct in6_addr *)__entry->daddr_v6;        \
                ipv6_addr_set_v4mapped(daddr, pin6);                \
        } while (0)

/*
 * Fill __entry->saddr_v6/daddr_v6.
 *
 * With CONFIG_IPV6 enabled: when sk->sk_family is AF_INET6 the IPv6
 * addresses @saddr6/@daddr6 are copied as-is, otherwise the IPv4 addresses
 * are stored through TP_STORE_V4MAPPED().  Without CONFIG_IPV6 the
 * v4-mapped form is always used and @saddr6/@daddr6 are not evaluated.
 *
 * Note: the IPv6-enabled variant reads a variable named "sk" that is not
 * a macro parameter, so "sk" must be in scope at the expansion site.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)                \
        do {                                                                \
                if (sk->sk_family == AF_INET6) {                        \
                        struct in6_addr *pin6;                                \
                                                                        \
                        pin6 = (struct in6_addr *)__entry->saddr_v6;        \
                        *pin6 = saddr6;                                        \
                        pin6 = (struct in6_addr *)__entry->daddr_v6;        \
                        *pin6 = daddr6;                                        \
                } else {                                                \
                        TP_STORE_V4MAPPED(__entry, saddr, daddr);        \
                }                                                        \
        } while (0)
#else
#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)        \
        TP_STORE_V4MAPPED(__entry, saddr, daddr)
#endif

/*
 * tcp event with arguments sk and skb
 *
 * Note: this class requires a valid sk pointer; while skb pointer could
 *       be NULL.
 *
 * Recorded per event: the skb and sk pointer values, sk->sk_state, the
 * source/destination ports converted with ntohs(), the IPv4 addresses as
 * raw 4-byte arrays, and the IPv6 (or v4-mapped) addresses as filled in by
 * TP_STORE_ADDRS().  The skb is only stored as a pointer value; it is
 * never dereferenced here.
 */
DECLARE_EVENT_CLASS(tcp_event_sk_skb,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
                __field(int, state)
                __field(__u16, sport)
                __field(__u16, dport)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skbaddr = skb;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* IPv4 addresses are copied without byte swapping. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                              sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
                  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  show_tcp_state_name(__entry->state))
);

/* tcp_retransmit_skb: instance of the tcp_event_sk_skb class (sk, skb). */
DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb)
);

/*
 * tcp_send_reset: instance of the tcp_event_sk_skb class.
 *
 * The skb passed to trace_tcp_send_reset is the skb that caused the RST.
 * For an active reset, skb should be NULL.
 */
DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb)
);

/*
 * tcp event with arguments sk
 *
 * Note: this class requires a valid sk pointer.
 *
 * Recorded per event: the sk pointer value, the source/destination ports
 * converted with ntohs(), the IPv4 addresses as raw 4-byte arrays, the
 * IPv6 (or v4-mapped) addresses as filled in by TP_STORE_ADDRS(), and the
 * value returned by sock_gen_cookie(sk), printed in hex as sock_cookie.
 */
DECLARE_EVENT_CLASS(tcp_event_sk,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, sport)
                __field(__u16, dport)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
                __field(__u64, sock_cookie)
        ),

        TP_fast_assign(
                struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;

                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* IPv4 addresses are copied without byte swapping. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);

                __entry->sock_cookie = sock_gen_cookie(sk);
        ),

        TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx",
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->sock_cookie)
);

/* tcp_receive_reset: instance of the tcp_event_sk class (sk only). */
DEFINE_EVENT(tcp_event_sk, tcp_receive_reset,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/* tcp_destroy_sock: instance of the tcp_event_sk class (sk only). */
DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/* tcp_rcv_space_adjust: instance of the tcp_event_sk class (sk only). */
DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/*
 * tcp_retransmit_synack: event with arguments sk and req.
 *
 * Ports and addresses are taken from the request sock (inet_rsk(req)),
 * not from @sk; @sk and @req themselves are stored as pointer values.
 * TP_STORE_ADDRS() still consults sk->sk_family to choose between the
 * IPv6 and v4-mapped forms.
 */
TRACE_EVENT(tcp_retransmit_synack,

        TP_PROTO(const struct sock *sk, const struct request_sock *req),

        TP_ARGS(sk, req),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(const void *, req)
                __field(__u16, sport)
                __field(__u16, dport)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                struct inet_request_sock *ireq = inet_rsk(req);
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->req = req;

                /*
                 * ir_num is stored as-is while ir_rmt_port goes through
                 * ntohs().  NOTE(review): presumably ir_num is already in
                 * host byte order - confirm.
                 */
                __entry->sport = ireq->ir_num;
                __entry->dport = ntohs(ireq->ir_rmt_port);

                p32 = (__be32 *) __entry->saddr;
                *p32 = ireq->ir_loc_addr;

                p32 = (__be32 *) __entry->daddr;
                *p32 = ireq->ir_rmt_addr;

                TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
                              ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
        ),

        TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6)
);

#include <trace/events/net_probe_common.h>

/*
 * tcp_probe: event with arguments sk and skb.
 *
 * Unlike the event classes above, @skb is dereferenced here (skb->data,
 * skb->mark, skb->len), so it must be a valid pointer.  The address
 * buffers are zeroed and then filled by TP_STORE_ADDR_PORTS() from
 * net_probe_common.h; they are printed with %pISpc.  sport/dport are
 * stored separately but do not appear in the TP_printk() output.
 */
TRACE_EVENT(tcp_probe,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u32, mark)
                __field(__u16, data_len)
                __field(__u32, snd_nxt)
                __field(__u32, snd_una)
                __field(__u32, snd_cwnd)
                __field(__u32, ssthresh)
                __field(__u32, snd_wnd)
                __field(__u32, srtt)
                __field(__u32, rcv_wnd)
                __field(__u64, sock_cookie)
        ),

        TP_fast_assign(
                /* skb->data is interpreted as the TCP header. */
                const struct tcphdr *th = (const struct tcphdr *)skb->data;
                const struct inet_sock *inet = inet_sk(sk);
                const struct tcp_sock *tp = tcp_sk(sk);

                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));

                TP_STORE_ADDR_PORTS(__entry, inet, sk);

                /* For filtering use */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->mark = skb->mark;

                /* Length of the skb minus the TCP header length. */
                __entry->data_len = skb->len - __tcp_hdrlen(th);
                __entry->snd_nxt = tp->snd_nxt;
                __entry->snd_una = tp->snd_una;
                __entry->snd_cwnd = tp->snd_cwnd;
                __entry->snd_wnd = tp->snd_wnd;
                __entry->rcv_wnd = tp->rcv_wnd;
                __entry->ssthresh = tcp_current_ssthresh(sk);
                /* NOTE(review): presumably srtt_us is kept scaled by 8 - confirm. */
                __entry->srtt = tp->srtt_us >> 3;
                __entry->sock_cookie = sock_gen_cookie(sk);
        ),

        TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
                  __entry->saddr, __entry->daddr, __entry->mark,
                  __entry->data_len, __entry->snd_nxt, __entry->snd_una,
                  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
                  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
);

#endif /* _TRACE_TCP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * 25-Jul-1998 Major changes to allow for ip chain table
 *
 * 3-Jan-2000 Named tables to allow packet selection for different uses.
 */

/*
 *         Format of an IP6 firewall descriptor
 *
 *         src, dst, src_mask, dst_mask are always stored in network byte order.
 *         flags are stored in host byte order (of course).
 *         Port numbers are stored in HOST byte order.
 */

#ifndef _UAPI_IP6_TABLES_H
#define _UAPI_IP6_TABLES_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/if.h>
#include <linux/netfilter_ipv6.h>

#include <linux/netfilter/x_tables.h>

/*
 * Userspace-only compatibility aliases: the old ip6t_ and IP6T_ names are
 * mapped one-to-one onto their xt_ and XT_ equivalents.  None of these
 * are defined when __KERNEL__ is set.
 */
#ifndef __KERNEL__
#define IP6T_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
#define IP6T_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
#define ip6t_match xt_match
#define ip6t_target xt_target
#define ip6t_table xt_table
#define ip6t_get_revision xt_get_revision
#define ip6t_entry_match xt_entry_match
#define ip6t_entry_target xt_entry_target
#define ip6t_standard_target xt_standard_target
#define ip6t_error_target xt_error_target
#define ip6t_counters xt_counters
#define IP6T_CONTINUE XT_CONTINUE
#define IP6T_RETURN XT_RETURN

/* Pre-iptables-1.4.0 */
#include <linux/netfilter/xt_tcpudp.h>
#define ip6t_tcp xt_tcp
#define ip6t_udp xt_udp
#define IP6T_TCP_INV_SRCPT        XT_TCP_INV_SRCPT
#define IP6T_TCP_INV_DSTPT        XT_TCP_INV_DSTPT
#define IP6T_TCP_INV_FLAGS        XT_TCP_INV_FLAGS
#define IP6T_TCP_INV_OPTION        XT_TCP_INV_OPTION
#define IP6T_TCP_INV_MASK        XT_TCP_INV_MASK
#define IP6T_UDP_INV_SRCPT        XT_UDP_INV_SRCPT
#define IP6T_UDP_INV_DSTPT        XT_UDP_INV_DSTPT
#define IP6T_UDP_INV_MASK        XT_UDP_INV_MASK

#define ip6t_counters_info xt_counters_info
#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
#define IP6T_ERROR_TARGET XT_ERROR_TARGET
/* Iteration helpers: the generic XT_ iterators bound to struct ip6t_entry. */
#define IP6T_MATCH_ITERATE(e, fn, args...) \
        XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args)
#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \
        XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args)
#endif /* !__KERNEL__ */

/*
 * General IPv6 part of a rule: addresses, masks, interfaces, protocol,
 * TOS and flags.
 *
 * Yes, Virginia, you have to zero the padding.
 */
struct ip6t_ip6 {
        /* Source and destination IP6 addr */
        struct in6_addr src, dst;
        /* Mask for src and dest IP6 addr */
        struct in6_addr smsk, dmsk;
        /* Input/output interface names and their per-byte masks */
        char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
        unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];

        /* Upper protocol number
         * - The allowed value is 0 (any) or the protocol number of the last
         *   parsable header, which is 50 (ESP), 59 (No Next Header),
         *   135 (MH), or one of the non-IPv6-extension headers.
         * - The protocol numbers of IPv6 extension headers other than ESP
         *   and MH do not match any packets.
         * - You also need to set IP6T_F_PROTO in "flags" to check protocol.
         */
        __u16 proto;
        /* TOS to match iff flags & IP6T_F_TOS */
        __u8 tos;

        /* Flags word (IP6T_F_* values) */
        __u8 flags;
        /* Inverse flags (IP6T_INV_* values) */
        __u8 invflags;
};

/* Values for the "flags" field in struct ip6t_ip6 (general ip6 structure). */
#define IP6T_F_PROTO                0x01        /* Set if rule cares about upper protocols */
#define IP6T_F_TOS                0x02        /* Match the TOS. */
#define IP6T_F_GOTO                0x04        /* Set if jump is a goto */
#define IP6T_F_MASK                0x07        /* All possible flag bits mask. */

/* Values for the "invflags" field in struct ip6t_ip6. */
#define IP6T_INV_VIA_IN                0x01        /* Invert the sense of IN IFACE. */
#define IP6T_INV_VIA_OUT                0x02        /* Invert the sense of OUT IFACE */
#define IP6T_INV_TOS                0x04        /* Invert the sense of TOS. */
#define IP6T_INV_SRCIP                0x08        /* Invert the sense of SRC IP. */
#define IP6T_INV_DSTIP                0x10        /* Invert the sense of DST IP. */
#define IP6T_INV_FRAG                0x20        /* Invert the sense of FRAG. */
#define IP6T_INV_PROTO                XT_INV_PROTO        /* Shared value, defined with the XT_ constants */
#define IP6T_INV_MASK                0x7F        /* All possible flag bits mask. */

/* This structure defines each of the firewall rules.  It consists of 3
   parts: 1) general IP header stuff, 2) match-specific stuff, and
   3) the target to perform if the rule matches. */
struct ip6t_entry {
        struct ip6t_ip6 ipv6;

        /* Mark with fields that we care about. */
        unsigned int nfcache;

        /* Size of ip6t_entry + matches; ip6t_get_target() adds this to the
           entry's address to locate the target. */
        __u16 target_offset;
        /* Size of ip6t_entry + matches + target */
        __u16 next_offset;

        /* Back pointer */
        unsigned int comefrom;

        /* Packet and byte counters. */
        struct xt_counters counters;

        /* The matches (if any), then the target. */
        unsigned char elems[0];
};

/* Standard entry: a rule with no matches, followed directly by a standard
   target (this is the layout IP6T_STANDARD_INIT initialises). */
struct ip6t_standard {
        struct ip6t_entry entry;
        struct xt_standard_target target;
};

/* Error entry: a rule with no matches, followed directly by an error
   target (this is the layout IP6T_ERROR_INIT initialises). */
struct ip6t_error {
        struct ip6t_entry entry;
        struct xt_error_target target;
};

/*
 * Initialiser for a struct ip6t_entry without matches: the target starts
 * right after the entry header and the next entry starts at __size.
 */
#define IP6T_ENTRY_INIT(__size)                                                       \
{                                                                               \
        .target_offset        = sizeof(struct ip6t_entry),                               \
        .next_offset        = (__size),                                               \
}

/*
 * Initialiser for a struct ip6t_standard.  The verdict is stored encoded
 * as -(__verdict) - 1.
 */
#define IP6T_STANDARD_INIT(__verdict)                                               \
{                                                                               \
        .entry                = IP6T_ENTRY_INIT(sizeof(struct ip6t_standard)),       \
        .target                = XT_TARGET_INIT(XT_STANDARD_TARGET,                       \
                                         sizeof(struct xt_standard_target)),   \
        .target.verdict        = -(__verdict) - 1,                                       \
}

#define IP6T_ERROR_INIT                                                               \
{                                                                               \
        .entry                = IP6T_ENTRY_INIT(sizeof(struct ip6t_error)),               \
        .target                = XT_TARGET_INIT(XT_ERROR_TARGET,                       \
                                         sizeof(struct xt_error_target)),      \
        .target.errorname = "ERROR",                                               \
}

/*
 * New IP firewall options for [gs]etsockopt at the RAW IP level.
 * Unlike BSD, Linux inherits IP options, so you don't have to use
 * a raw socket for this.  Instead we check rights in the calls.
 *
 * ATTENTION: check linux/in6.h before adding a new number here.
 *
 * The set and get ranges both start at IP6T_BASE_CTL, so the same number
 * means different things for setsockopt and getsockopt.
 */
#define IP6T_BASE_CTL                        64

#define IP6T_SO_SET_REPLACE                (IP6T_BASE_CTL)
#define IP6T_SO_SET_ADD_COUNTERS        (IP6T_BASE_CTL + 1)
#define IP6T_SO_SET_MAX                        IP6T_SO_SET_ADD_COUNTERS

/* Note: IP6T_BASE_CTL + 2 and + 3 are not assigned in the get range. */
#define IP6T_SO_GET_INFO                (IP6T_BASE_CTL)
#define IP6T_SO_GET_ENTRIES                (IP6T_BASE_CTL + 1)
#define IP6T_SO_GET_REVISION_MATCH        (IP6T_BASE_CTL + 4)
#define IP6T_SO_GET_REVISION_TARGET        (IP6T_BASE_CTL + 5)
#define IP6T_SO_GET_MAX                        IP6T_SO_GET_REVISION_TARGET

/* obtain original address if REDIRECT'd connection */
#define IP6T_SO_ORIGINAL_DST            80

/* ICMP matching stuff */
struct ip6t_icmp {
        __u8 type;                                /* type to match */
        __u8 code[2];                                /* range of code */
        __u8 invflags;                                /* Inverse flags */
};

/* Values for the "invflags" field of struct ip6t_icmp. */
#define IP6T_ICMP_INV        0x01        /* Invert the sense of type/code test */

/* The argument to IP6T_SO_GET_INFO: the caller names a table, the kernel
   fills in the remaining fields. */
struct ip6t_getinfo {
        /* Which table: caller fills this in. */
        char name[XT_TABLE_MAXNAMELEN];

        /* Kernel fills these in. */
        /* Which hook entry points are valid: bitmask */
        unsigned int valid_hooks;

        /* Hook entry points: one per netfilter hook. */
        unsigned int hook_entry[NF_INET_NUMHOOKS];

        /* Underflow points: one per netfilter hook. */
        unsigned int underflow[NF_INET_NUMHOOKS];

        /* Number of entries */
        unsigned int num_entries;

        /* Size of entries. */
        unsigned int size;
};

/* The argument to IP6T_SO_SET_REPLACE: a fixed header followed directly
   by the new entries. */
struct ip6t_replace {
        /* Which table. */
        char name[XT_TABLE_MAXNAMELEN];

        /* Which hook entry points are valid: bitmask.  You can't
           change this. */
        unsigned int valid_hooks;

        /* Number of entries */
        unsigned int num_entries;

        /* Total size of new entries */
        unsigned int size;

        /* Hook entry points. */
        unsigned int hook_entry[NF_INET_NUMHOOKS];

        /* Underflow points. */
        unsigned int underflow[NF_INET_NUMHOOKS];

        /* Information about old entries: */
        /* Number of counters (must be equal to current number of entries). */
        unsigned int num_counters;
        /* The old entries' counters (userspace pointer). */
        struct xt_counters __user *counters;

        /* The entries (hang off end: not really an array). */
        struct ip6t_entry entries[0];
};

/* The argument to IP6T_SO_GET_ENTRIES: a fixed header followed directly
   by space for the entries. */
struct ip6t_get_entries {
        /* Which table: user fills this in. */
        char name[XT_TABLE_MAXNAMELEN];

        /* User fills this in: total entry size. */
        unsigned int size;

        /* The entries (hang off the end of the struct). */
        struct ip6t_entry entrytable[0];
};

/*
 * Helper: locate the target of rule @e.
 *
 * The target sits e->target_offset bytes after the start of the entry.
 * No bounds or NULL checking is performed.
 */
static __inline__ struct xt_entry_target *
ip6t_get_target(struct ip6t_entry *e)
{
        char *base = (char *)e;

        return (struct xt_entry_target *)(base + e->target_offset);
}

/*
 *        Main firewall chains definitions and global var's definitions.
 */

#endif /* _UAPI_IP6_TABLES_H */









































































    1 


    1 






























































































































































































































































































    1 




    1 
























































































    1 




    1 











































































































    1 


































































    1 













    1 





    1 









    1 



    1 



    1 










    1 
    1 






























    1 



















    1 







    1 











    1 







    1 



















    1 


    1 





    1 

































































    1 














    1 



















    1 



    1 






























    1 

















    1 












    1 






    1 














    1 





    1 





    1 









































    1 





















































































































































































































































































    1 



    1 













    1 






    1 
























    1 
    1 






















    1 














    1 










    1 


    1 






    1 
















    1 
















    1 



    1 



    1 







    1 











    1 
    1 
















































    1 


















    1 


    1 










    1 





    1 









    1 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/isofs/inode.c
 *
 *  (C) 1991  Linus Torvalds - minix filesystem
 *      1992, 1993, 1994  Eric Youngdale Modified for ISO 9660 filesystem.
 *      1994  Eberhard Mönkeberg - multi session handling.
 *      1995  Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs.
 *        1997  Gordon Chaffee - Joliet CDs
 *        1998  Eric Lammerts - ISO 9660 Level 3
 *        2004  Paul Serice - Inode Support pushed out from 4GB to 128GB
 *        2004  Paul Serice - NFS Export Operations
 */

#include <linux/init.h>
#include <linux/module.h>

#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/nls.h>
#include <linux/ctype.h>
#include <linux/statfs.h>
#include <linux/cdrom.h>
#include <linux/parser.h>
#include <linux/mpage.h>
#include <linux/user_namespace.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>

#include "isofs.h"
#include "zisofs.h"

/* max tz offset is 13 hours */
#define MAX_TZ_OFFSET (52*15*60)

#define BEQUIET

static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name);

#ifdef CONFIG_JOLIET
static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi_ms(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name);
static int isofs_dentry_cmp_ms(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name);
#endif

/*
 * Release the isofs private superblock data: unload the NLS table (only
 * when built with CONFIG_JOLIET), free the info structure and clear the
 * pointer to it in the superblock.
 */
static void isofs_put_super(struct super_block *sb)
{
        struct isofs_sb_info *info = ISOFS_SB(sb);

#ifdef CONFIG_JOLIET
        unload_nls(info->s_nls_iocharset);
#endif

        kfree(info);
        /* Do not leave a dangling pointer behind in the superblock. */
        sb->s_fs_info = NULL;
}

static int isofs_read_inode(struct inode *, int relocated);
static int isofs_statfs (struct dentry *, struct kstatfs *);
static int isofs_show_options(struct seq_file *, struct dentry *);

static struct kmem_cache *isofs_inode_cachep;

/*
 * Allocate an iso_inode_info from the isofs inode cache and hand back the
 * VFS inode embedded in it, or NULL if the allocation fails.
 */
static struct inode *isofs_alloc_inode(struct super_block *sb)
{
        struct iso_inode_info *info;

        info = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);

        return info ? &info->vfs_inode : NULL;
}

/*
 * Return the iso_inode_info that contains @inode to the isofs inode cache.
 */
static void isofs_free_inode(struct inode *inode)
{
        struct iso_inode_info *info = ISOFS_I(inode);

        kmem_cache_free(isofs_inode_cachep, info);
}

/*
 * Slab constructor passed to kmem_cache_create(): @foo is an
 * iso_inode_info object whose embedded VFS inode gets its one-time
 * initialisation here.
 */
static void init_once(void *foo)
{
        struct iso_inode_info *info = foo;
        struct inode *vfs_inode = &info->vfs_inode;

        inode_init_once(vfs_inode);
}

/*
 * Create the slab cache that backs isofs inodes.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
static int __init init_inodecache(void)
{
        isofs_inode_cachep =
                kmem_cache_create("isofs_inode_cache",
                                  sizeof(struct iso_inode_info), 0,
                                  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
                                  SLAB_ACCOUNT,
                                  init_once);

        return isofs_inode_cachep ? 0 : -ENOMEM;
}

/*
 * Destroy the isofs inode slab cache.
 */
static void destroy_inodecache(void)
{
        /*
         * Inodes may still be waiting in delayed RCU frees; wait for
         * those to be flushed before the cache itself goes away.
         */
        rcu_barrier();

        kmem_cache_destroy(isofs_inode_cachep);
}

/*
 * Remount handler.  The filesystem is synced first; the remount is then
 * accepted only if the requested flags keep the mount read-only.
 *
 * Returns 0 when SB_RDONLY is set in *flags, -EROFS otherwise.
 */
static int isofs_remount(struct super_block *sb, int *flags, char *data)
{
        sync_filesystem(sb);

        return (*flags & SB_RDONLY) ? 0 : -EROFS;
}

/*
 * Superblock operations for isofs: inode allocation/freeing through the
 * isofs inode cache, superblock teardown, statfs, remount (read-only
 * only) and mount-option display.
 */
static const struct super_operations isofs_sops = {
        .alloc_inode        = isofs_alloc_inode,
        .free_inode        = isofs_free_inode,
        .put_super        = isofs_put_super,
        .statfs                = isofs_statfs,
        .remount_fs        = isofs_remount,
        .show_options        = isofs_show_options,
};


/*
 * Dentry hash/compare operation sets, selected by array index.
 *
 *   [0] case-insensitive hash and compare, trailing dots significant
 *   [1] (CONFIG_JOLIET) case-sensitive, trailing dots ignored ("_ms")
 *   [2] (CONFIG_JOLIET) case-insensitive, trailing dots ignored ("_ms")
 *
 * NOTE(review): the code that picks an index is outside this part of the
 * file; keep the entry order stable unless that code is updated too.
 */
static const struct dentry_operations isofs_dentry_ops[] = {
        {
                .d_hash                = isofs_hashi,
                .d_compare        = isofs_dentry_cmpi,
        },
#ifdef CONFIG_JOLIET
        {
                .d_hash                = isofs_hash_ms,
                .d_compare        = isofs_dentry_cmp_ms,
        },
        {
                .d_hash                = isofs_hashi_ms,
                .d_compare        = isofs_dentry_cmpi_ms,
        },
#endif
};

/*
 * Mount options as collected by parse_options().  The defaults noted
 * below are the values parse_options() stores before it looks at the
 * option string.
 */
struct iso9660_options{
        unsigned int rock:1;                /* default 1; cleared by "norock" */
        unsigned int joliet:1;                /* default 1; cleared by "nojoliet" */
        unsigned int cruft:1;                /* set by "cruft" */
        unsigned int hide:1;                /* set by "hide" */
        unsigned int showassoc:1;        /* set by "showassoc" and by "unhide" */
        unsigned int nocompress:1;        /* set by "nocompress" */
        unsigned int overriderockperm:1;        /* set by "overriderockperm" */
        unsigned int uid_set:1;                /* a valid "uid=" was given */
        unsigned int gid_set:1;                /* a valid "gid=" was given */
        unsigned char map;                /* 'n' (default), 'a' or 'o' from "map=" */
        unsigned char check;                /* 'u' (unset, default), 'r' or 's' */
        unsigned int blocksize;                /* default 1024; "block=" takes 512/1024/2048 */
        umode_t fmode;                        /* "mode=" value, else ISOFS_INVALID_MODE */
        umode_t dmode;                        /* "dmode=" value, else ISOFS_INVALID_MODE */
        kgid_t gid;                        /* default GLOBAL_ROOT_GID */
        kuid_t uid;                        /* default GLOBAL_ROOT_UID */
        char *iocharset;                /* allocated charset name, or NULL */
        /* LVE */
        s32 session;                        /* "session=" value plus one; -1 if unset */
        s32 sbsector;                        /* "sbsector=" value; -1 if unset */
};

/*
 * Case-insensitive name hash.
 *
 * Every byte of the name is lower-cased before being fed into the hash, and
 * the result is stored in qstr->hash.  When @ms is non-zero, trailing '.'
 * characters are left out of the hash.  Always returns 0.
 */
static int
isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
        const char *cursor = qstr->name;
        int remaining = qstr->len;
        unsigned long acc;

        if (ms) {
                /* Ignore any run of dots at the end of the name. */
                while (remaining && cursor[remaining - 1] == '.')
                        remaining--;
        }

        acc = init_name_hash(dentry);
        for (; remaining; remaining--) {
                char lowered = tolower(*cursor++);

                acc = partial_name_hash(lowered, acc);
        }
        qstr->hash = end_name_hash(acc);

        return 0;
}

/*
 * Compare two isofs names.
 *
 * @len/@str describe one name and @name the other.  With @ms set, trailing
 * '.' characters are stripped from both before comparing.  With @ci set the
 * comparison ignores case.
 *
 * Returns 0 when the names match and 1 when they do not.
 */
static int isofs_dentry_cmp_common(
                unsigned int len, const char *str,
                const struct qstr *name, int ms, int ci)
{
        int name_len = name->len;
        int str_len = len;
        int diff;

        if (ms) {
                /* A name ending in '.' is treated as if the dots were absent */
                while (name_len && name->name[name_len - 1] == '.')
                        name_len--;
                while (str_len && str[str_len - 1] == '.')
                        str_len--;
        }

        if (name_len != str_len)
                return 1;

        if (ci)
                diff = strncasecmp(name->name, str, name_len);
        else
                diff = strncmp(name->name, str, name_len);

        return diff != 0;
}

/* Case-insensitive hash that keeps trailing dots significant. */
static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
        const int ms = 0;

        return isofs_hashi_common(dentry, qstr, ms);
}

/* Case-insensitive compare that keeps trailing dots significant. */
static int
isofs_dentry_cmpi(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        const int ms = 0, ci = 1;

        return isofs_dentry_cmp_common(len, str, name, ms, ci);
}

#ifdef CONFIG_JOLIET
/*
 * Case-sensitive name hash.
 *
 * Stores the full-name hash of the name in qstr->hash.  When @ms is
 * non-zero, trailing '.' characters are left out of the hash.  Always
 * returns 0.
 */
static int
isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
        const char *text = qstr->name;
        int count = qstr->len;

        if (ms) {
                /* Ignore any run of dots at the end of the name. */
                for (; count && text[count - 1] == '.'; count--)
                        ;
        }

        qstr->hash = full_name_hash(dentry, text, count);

        return 0;
}

/* Case-sensitive hash that ignores trailing dots. */
static int
isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
        const int ms = 1;

        return isofs_hash_common(dentry, qstr, ms);
}

/* Case-insensitive hash that ignores trailing dots. */
static int
isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
{
        const int ms = 1;

        return isofs_hashi_common(dentry, qstr, ms);
}

/* Case-sensitive compare that ignores trailing dots. */
static int
isofs_dentry_cmp_ms(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        const int ms = 1, ci = 0;

        return isofs_dentry_cmp_common(len, str, name, ms, ci);
}

/* Case-insensitive compare that ignores trailing dots. */
static int
isofs_dentry_cmpi_ms(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        const int ms = 1, ci = 1;

        return isofs_dentry_cmp_common(len, str, name, ms, ci);
}
#endif

/*
 * Mount option token identifiers used by the tokens[] table and by the
 * switch in parse_options().  Opt_err is the identifier carried by the
 * table's terminating entry; parse_options() has no case for it, so it
 * takes the default branch and parsing fails.
 */
enum {
        Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
        Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
        Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
        Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
};

/*
 * Option-string patterns handed to match_token() by parse_options().
 *
 * Several spellings map to the same token (e.g. "map=acorn" and "map=a").
 * All "conv=" forms map to Opt_ignore, which parse_options() accepts and
 * discards.  The table ends with the {Opt_err, NULL} entry.
 */
static const match_table_t tokens = {
        {Opt_norock, "norock"},
        {Opt_nojoliet, "nojoliet"},
        {Opt_unhide, "unhide"},
        {Opt_hide, "hide"},
        {Opt_showassoc, "showassoc"},
        {Opt_cruft, "cruft"},
        {Opt_utf8, "utf8"},
        {Opt_iocharset, "iocharset=%s"},
        {Opt_map_a, "map=acorn"},
        {Opt_map_a, "map=a"},
        {Opt_map_n, "map=normal"},
        {Opt_map_n, "map=n"},
        {Opt_map_o, "map=off"},
        {Opt_map_o, "map=o"},
        {Opt_session, "session=%u"},
        {Opt_sb, "sbsector=%u"},
        {Opt_check_r, "check=relaxed"},
        {Opt_check_r, "check=r"},
        {Opt_check_s, "check=strict"},
        {Opt_check_s, "check=s"},
        {Opt_uid, "uid=%u"},
        {Opt_gid, "gid=%u"},
        {Opt_mode, "mode=%u"},
        {Opt_dmode, "dmode=%u"},
        {Opt_overriderockperm, "overriderockperm"},
        {Opt_block, "block=%u"},
        {Opt_ignore, "conv=binary"},
        {Opt_ignore, "conv=b"},
        {Opt_ignore, "conv=text"},
        {Opt_ignore, "conv=t"},
        {Opt_ignore, "conv=mtext"},
        {Opt_ignore, "conv=m"},
        {Opt_ignore, "conv=auto"},
        {Opt_ignore, "conv=a"},
        {Opt_nocompress, "nocompress"},
        {Opt_err, NULL}
};

static int parse_options(char *options, struct iso9660_options *popt)
{
        char *p;
        int option;

        popt->map = 'n';
        popt->rock = 1;
        popt->joliet = 1;
        popt->cruft = 0;
        popt->hide = 0;
        popt->showassoc = 0;
        popt->check = 'u';                /* unset */
        popt->nocompress = 0;
        popt->blocksize = 1024;
        popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
        popt->uid_set = 0;
        popt->gid_set = 0;
        popt->gid = GLOBAL_ROOT_GID;
        popt->uid = GLOBAL_ROOT_UID;
        popt->iocharset = NULL;
        popt->overriderockperm = 0;
        popt->session=-1;
        popt->sbsector=-1;
        if (!options)
                return 1;

        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                substring_t args[MAX_OPT_ARGS];
                unsigned n;

                if (!*p)
                        continue;

                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_norock:
                        popt->rock = 0;
                        break;
                case Opt_nojoliet:
                        popt->joliet = 0;
                        break;
                case Opt_hide:
                        popt->hide = 1;
                        break;
                case Opt_unhide:
                case Opt_showassoc:
                        popt->showassoc = 1;
                        break;
                case Opt_cruft:
                        popt->cruft = 1;
                        break;
#ifdef CONFIG_JOLIET
                case Opt_utf8:
                        kfree(popt->iocharset);
                        popt->iocharset = kstrdup("utf8", GFP_KERNEL);
                        if (!popt->iocharset)
                                return 0;
                        break;
                case Opt_iocharset:
                        kfree(popt->iocharset);
                        popt->iocharset = match_strdup(&args[0]);
                        if (!popt->iocharset)
                                return 0;
                        break;
#endif
                case Opt_map_a:
                        popt->map = 'a';
                        break;
                case Opt_map_o:
                        popt->map = 'o';
                        break;
                case Opt_map_n:
                        popt->map = 'n';
                        break;
                case Opt_session:
                        if (match_int(&args[0], &option))
                                return 0;
                        n = option;
                        /*
                         * Track numbers are supposed to be in range 1-99, the
                         * mount option starts indexing at 0.
                         */
                        if (n >= 99)
                                return 0;
                        popt->session = n + 1;
                        break;
                case Opt_sb:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->sbsector = option;
                        break;
                case Opt_check_r:
                        popt->check = 'r';
                        break;
                case Opt_check_s:
                        popt->check = 's';
                        break;
                case Opt_ignore:
                        break;
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->uid = make_kuid(current_user_ns(), option);
                        if (!uid_valid(popt->uid))
                                return 0;
                        popt->uid_set = 1;
                        break;
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->gid = make_kgid(current_user_ns(), option);
                        if (!gid_valid(popt->gid))
                                return 0;
                        popt->gid_set = 1;
                        break;
                case Opt_mode:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->fmode = option;
                        break;
                case Opt_dmode:
                        if (match_int(&args[0], &option))
                                return 0;
                        popt->dmode = option;
                        break;
                case Opt_overriderockperm:
                        popt->overriderockperm = 1;
                        break;
                case Opt_block:
                        if (match_int(&args[0], &option))
                                return 0;
                        n = option;
                        if (n != 512 && n != 1024 && n != 2048)
                                return 0;
                        popt->blocksize = n;
                        break;
                case Opt_nocompress:
                        popt->nocompress = 1;
                        break;
                default:
                        return 0;
                }
        }
        return 1;
}

/*
 * Display the mount options in /proc/mounts.
 */
static int isofs_show_options(struct seq_file *m, struct dentry *root)
{
        struct isofs_sb_info *sbi = ISOFS_SB(root->d_sb);

        if (!sbi->s_rock)                seq_puts(m, ",norock");
        else if (!sbi->s_joliet_level)        seq_puts(m, ",nojoliet");
        if (sbi->s_cruft)                seq_puts(m, ",cruft");
        if (sbi->s_hide)                seq_puts(m, ",hide");
        if (sbi->s_nocompress)                seq_puts(m, ",nocompress");
        if (sbi->s_overriderockperm)        seq_puts(m, ",overriderockperm");
        if (sbi->s_showassoc)                seq_puts(m, ",showassoc");

        if (sbi->s_check)                seq_printf(m, ",check=%c", sbi->s_check);
        if (sbi->s_mapping)                seq_printf(m, ",map=%c", sbi->s_mapping);
        if (sbi->s_session != 255)        seq_printf(m, ",session=%u", sbi->s_session - 1);
        if (sbi->s_sbsector != -1)        seq_printf(m, ",sbsector=%u", sbi->s_sbsector);

        if (root->d_sb->s_blocksize != 1024)
                seq_printf(m, ",blocksize=%lu", root->d_sb->s_blocksize);

        if (sbi->s_uid_set)
                seq_printf(m, ",uid=%u",
                           from_kuid_munged(&init_user_ns, sbi->s_uid));
        if (sbi->s_gid_set)
                seq_printf(m, ",gid=%u",
                           from_kgid_munged(&init_user_ns, sbi->s_gid));

        if (sbi->s_dmode != ISOFS_INVALID_MODE)
                seq_printf(m, ",dmode=%o", sbi->s_dmode);
        if (sbi->s_fmode != ISOFS_INVALID_MODE)
                seq_printf(m, ",fmode=%o", sbi->s_fmode);

#ifdef CONFIG_JOLIET
        if (sbi->s_nls_iocharset)
                seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset);
        else
                seq_puts(m, ",iocharset=utf8");
#endif
        return 0;
}

/*
 * look if the driver can tell the multi session redirection value
 *
 * don't change this if you don't know what you do, please!
 * Multisession is legal only with XA disks.
 * A non-XA disk with more than one volume descriptor may do it right, but
 * usually is written in a nowhere standardized "multi-partition" manner.
 * Multisession uses absolute addressing (solely the first frame of the whole
 * track is #0), multi-partition uses relative addressing (each first frame of
 * each track is #0), and a track is not a session.
 *
 * A broken CDwriter software or drive firmware does not set new standards,
 * at least not if conflicting with the existing ones.
 *
 * emoenke@gwdg.de
 */
#define WE_OBEY_THE_WRITTEN_STANDARDS 1

/*
 * Work out the block at which the volume descriptors should be looked for.
 *
 * @sb:      superblock whose block device is queried.
 * @session: track number to look up when greater than zero.
 *
 * If @session is given and the TOC entry for that track is a data track,
 * its LBA is returned.  Otherwise the drive's multisession information is
 * consulted.  Returns 0 when there is no CD-ROM device info for the disk or
 * when no usable address is reported.
 */
static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
{
        struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
        struct cdrom_multisession ms_info;

        /* Neither query below is possible without CD-ROM device info. */
        if (!cdi)
                return 0;

        if (session > 0) {
                struct cdrom_tocentry te;

                te.cdte_track = session;
                te.cdte_format = CDROM_LBA;
                if (cdrom_read_tocentry(cdi, &te) == 0) {
                        printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
                                session, te.cdte_addr.lba,
                                te.cdte_ctrl & CDROM_DATA_TRACK);
                        if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4)
                                return te.cdte_addr.lba;
                }

                /* Fall back to the multisession query below. */
                printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
        }

        ms_info.addr_format = CDROM_LBA;
        if (cdrom_multisession(cdi, &ms_info) != 0)
                return 0;

#if WE_OBEY_THE_WRITTEN_STANDARDS
        /* necessary for a valid ms_info.addr */
        if (!ms_info.xa_flag)
                return 0;
#endif

        return ms_info.addr.lba;
}

/*
 * Report whether the root directory holds fewer than three records.
 *
 * This detects broken CDs whose ISO root directory is empty while the
 * Joliet root directory is fine.  For such a disc Rock Ridge gets disabled
 * (and Joliet used instead), since otherwise no files would be visible.
 *
 * A block that cannot be read counts as empty.
 */
static bool rootdir_empty(struct super_block *sb, unsigned long block)
{
        struct buffer_head *bh = sb_bread(sb, block);
        int pos = 0, count;

        if (!bh)
                return true;

        for (count = 0; count < 3; count++) {
                /* The first byte of a directory record is its length. */
                int rec_len = *(unsigned char *)(bh->b_data + pos);

                if (rec_len == 0)
                        break;
                pos += rec_len;
        }

        brelse(bh);
        return count < 3;
}

/*
 * Initialize the superblock and read the root inode.
 */
static int isofs_fill_super(struct super_block *s, void *data, int silent)
{
        struct buffer_head *bh = NULL, *pri_bh = NULL;
        struct hs_primary_descriptor *h_pri = NULL;
        struct iso_primary_descriptor *pri = NULL;
        struct iso_supplementary_descriptor *sec = NULL;
        struct iso_directory_record *rootp;
        struct inode *inode;
        struct iso9660_options opt;
        struct isofs_sb_info *sbi;
        unsigned long first_data_zone;
        int joliet_level = 0;
        int iso_blknum, block;
        int orig_zonesize;
        int table, error = -EINVAL;
        unsigned int vol_desc_start;

        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        s->s_fs_info = sbi;

        if (!parse_options((char *)data, &opt))
                goto out_freesbi;

        /*
         * First of all, get the hardware blocksize for this device.
         * If we don't know what it is, or the hardware blocksize is
         * larger than the blocksize the user specified, then use
         * that value.
         */
        /*
         * What if bugger tells us to go beyond page size?
         */
        if (bdev_logical_block_size(s->s_bdev) > 2048) {
                printk(KERN_WARNING
                       "ISOFS: unsupported/invalid hardware sector size %d\n",
                        bdev_logical_block_size(s->s_bdev));
                goto out_freesbi;
        }
        opt.blocksize = sb_min_blocksize(s, opt.blocksize);

        sbi->s_high_sierra = 0; /* default is iso9660 */
        sbi->s_session = opt.session;
        sbi->s_sbsector = opt.sbsector;

        vol_desc_start = (opt.sbsector != -1) ?
                opt.sbsector : isofs_get_last_session(s,opt.session);

        for (iso_blknum = vol_desc_start+16;
                iso_blknum < vol_desc_start+100; iso_blknum++) {
                struct hs_volume_descriptor *hdp;
                struct iso_volume_descriptor  *vdp;

                block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits);
                if (!(bh = sb_bread(s, block)))
                        goto out_no_read;

                vdp = (struct iso_volume_descriptor *)bh->b_data;
                hdp = (struct hs_volume_descriptor *)bh->b_data;

                /*
                 * Due to the overlapping physical location of the descriptors,
                 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure
                 * proper identification in this case, we first check for ISO.
                 */
                if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) {
                        if (isonum_711(vdp->type) == ISO_VD_END)
                                break;
                        if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
                                if (!pri) {
                                        pri = (struct iso_primary_descriptor *)vdp;
                                        /* Save the buffer in case we need it ... */
                                        pri_bh = bh;
                                        bh = NULL;
                                }
                        }
#ifdef CONFIG_JOLIET
                        else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
                                sec = (struct iso_supplementary_descriptor *)vdp;
                                if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
                                        if (opt.joliet) {
                                                if (sec->escape[2] == 0x40)
                                                        joliet_level = 1;
                                                else if (sec->escape[2] == 0x43)
                                                        joliet_level = 2;
                                                else if (sec->escape[2] == 0x45)
                                                        joliet_level = 3;

                                                printk(KERN_DEBUG "ISO 9660 Extensions: "
                                                        "Microsoft Joliet Level %d\n",
                                                        joliet_level);
                                        }
                                        goto root_found;
                                } else {
                                /* Unknown supplementary volume descriptor */
                                sec = NULL;
                                }
                        }
#endif
                } else {
                        if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) {
                                if (isonum_711(hdp->type) != ISO_VD_PRIMARY)
                                        goto out_freebh;

                                sbi->s_high_sierra = 1;
                                opt.rock = 0;
                                h_pri = (struct hs_primary_descriptor *)vdp;
                                goto root_found;
                        }
                }

                /* Just skip any volume descriptors we don't recognize */

                brelse(bh);
                bh = NULL;
        }
        /*
         * If we fall through, either no volume descriptor was found,
         * or else we passed a primary descriptor looking for others.
         */
        if (!pri)
                goto out_unknown_format;
        brelse(bh);
        bh = pri_bh;
        pri_bh = NULL;

root_found:
        /* We don't support read-write mounts */
        if (!sb_rdonly(s)) {
                error = -EACCES;
                goto out_freebh;
        }

        if (joliet_level && (!pri || !opt.rock)) {
                /* This is the case of Joliet with the norock mount flag.
                 * A disc with both Joliet and Rock Ridge is handled later
                 */
                pri = (struct iso_primary_descriptor *) sec;
        }

        if(sbi->s_high_sierra){
                rootp = (struct iso_directory_record *) h_pri->root_directory_record;
                sbi->s_nzones = isonum_733(h_pri->volume_space_size);
                sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size);
                sbi->s_max_size = isonum_733(h_pri->volume_space_size);
        } else {
                if (!pri)
                        goto out_freebh;
                rootp = (struct iso_directory_record *) pri->root_directory_record;
                sbi->s_nzones = isonum_733(pri->volume_space_size);
                sbi->s_log_zone_size = isonum_723(pri->logical_block_size);
                sbi->s_max_size = isonum_733(pri->volume_space_size);
        }

        sbi->s_ninodes = 0; /* No way to figure this out easily */

        orig_zonesize = sbi->s_log_zone_size;
        /*
         * If the zone size is smaller than the hardware sector size,
         * this is a fatal error.  This would occur if the disc drive
         * had sectors that were 2048 bytes, but the filesystem had
         * blocks that were 512 bytes (which should only very rarely
         * happen.)
         */
        if (orig_zonesize < opt.blocksize)
                goto out_bad_size;

        /* RDE: convert log zone size to bit shift */
        switch (sbi->s_log_zone_size) {
        case  512: sbi->s_log_zone_size =  9; break;
        case 1024: sbi->s_log_zone_size = 10; break;
        case 2048: sbi->s_log_zone_size = 11; break;

        default:
                goto out_bad_zone_size;
        }

        s->s_magic = ISOFS_SUPER_MAGIC;

        /*
         * With multi-extent files, file size is only limited by the maximum
         * size of a file system, which is 8 TB.
         */
        s->s_maxbytes = 0x80000000000LL;

        /* ECMA-119 timestamp from 1900/1/1 with tz offset */
        s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET;
        s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET;

        /* Set this for reference. Its not currently used except on write
           which we don't have .. */

        first_data_zone = isonum_733(rootp->extent) +
                          isonum_711(rootp->ext_attr_length);
        sbi->s_firstdatazone = first_data_zone;
#ifndef BEQUIET
        printk(KERN_DEBUG "ISOFS: Max size:%ld   Log zone size:%ld\n",
                sbi->s_max_size, 1UL << sbi->s_log_zone_size);
        printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone);
        if(sbi->s_high_sierra)
                printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n");
#endif

        /*
         * If the Joliet level is set, we _may_ decide to use the
         * secondary descriptor, but can't be sure until after we
         * read the root inode. But before reading the root inode
         * we may need to change the device blocksize, and would
         * rather release the old buffer first. So, we cache the
         * first_data_zone value from the secondary descriptor.
         */
        if (joliet_level) {
                pri = (struct iso_primary_descriptor *) sec;
                rootp = (struct iso_directory_record *)
                        pri->root_directory_record;
                first_data_zone = isonum_733(rootp->extent) +
                                isonum_711(rootp->ext_attr_length);
        }

        /*
         * We're all done using the volume descriptor, and may need
         * to change the device blocksize, so release the buffer now.
         */
        brelse(pri_bh);
        brelse(bh);

        /*
         * Force the blocksize to 512 for 512 byte sectors.  The file
         * read primitives really get it wrong in a bad way if we don't
         * do this.
         *
         * Note - we should never be setting the blocksize to something
         * less than the hardware sector size for the device.  If we
         * do, we would end up having to read larger buffers and split
         * out portions to satisfy requests.
         *
         * Note2- the idea here is that we want to deal with the optimal
         * zonesize in the filesystem.  If we have it set to something less,
         * then we have horrible problems with trying to piece together
         * bits of adjacent blocks in order to properly read directory
         * entries.  By forcing the blocksize in this way, we ensure
         * that we will never be required to do this.
         */
        sb_set_blocksize(s, orig_zonesize);

        sbi->s_nls_iocharset = NULL;

#ifdef CONFIG_JOLIET
        if (joliet_level) {
                char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
                if (strcmp(p, "utf8") != 0) {
                        sbi->s_nls_iocharset = opt.iocharset ?
                                load_nls(opt.iocharset) : load_nls_default();
                        if (!sbi->s_nls_iocharset)
                                goto out_freesbi;
                }
        }
#endif
        s->s_op = &isofs_sops;
        s->s_export_op = &isofs_export_ops;
        sbi->s_mapping = opt.map;
        sbi->s_rock = (opt.rock ? 2 : 0);
        sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
        sbi->s_cruft = opt.cruft;
        sbi->s_hide = opt.hide;
        sbi->s_showassoc = opt.showassoc;
        sbi->s_uid = opt.uid;
        sbi->s_gid = opt.gid;
        sbi->s_uid_set = opt.uid_set;
        sbi->s_gid_set = opt.gid_set;
        sbi->s_nocompress = opt.nocompress;
        sbi->s_overriderockperm = opt.overriderockperm;
        /*
         * It would be incredibly stupid to allow people to mark every file
         * on the disk as suid, so we merely allow them to set the default
         * permissions.
         */
        if (opt.fmode != ISOFS_INVALID_MODE)
                sbi->s_fmode = opt.fmode & 0777;
        else
                sbi->s_fmode = ISOFS_INVALID_MODE;
        if (opt.dmode != ISOFS_INVALID_MODE)
                sbi->s_dmode = opt.dmode & 0777;
        else
                sbi->s_dmode = ISOFS_INVALID_MODE;

        /*
         * Read the root inode, which _may_ result in changing
         * the s_rock flag. Once we have the final s_rock value,
         * we then decide whether to use the Joliet descriptor.
         */
        inode = isofs_iget(s, sbi->s_firstdatazone, 0);

        /*
         * Fix for broken CDs with a corrupt root inode but a correct Joliet
         * root directory.
         */
        if (IS_ERR(inode)) {
                if (joliet_level && sbi->s_firstdatazone != first_data_zone) {
                        printk(KERN_NOTICE
                               "ISOFS: root inode is unusable. "
                               "Disabling Rock Ridge and switching to Joliet.");
                        sbi->s_rock = 0;
                        inode = NULL;
                } else {
                        goto out_no_root;
                }
        }

        /*
         * Fix for broken CDs with Rock Ridge and empty ISO root directory but
         * correct Joliet root directory.
         */
        if (sbi->s_rock == 1 && joliet_level &&
                                rootdir_empty(s, sbi->s_firstdatazone)) {
                printk(KERN_NOTICE
                        "ISOFS: primary root directory is empty. "
                        "Disabling Rock Ridge and switching to Joliet.");
                sbi->s_rock = 0;
        }

        /*
         * If this disk has both Rock Ridge and Joliet on it, then we
         * want to use Rock Ridge by default.  This can be overridden
         * by using the norock mount option.  There is still one other
         * possibility that is not taken into account: a Rock Ridge
         * CD with Unicode names.  Until someone sees such a beast, it
         * will not be supported.
         */
        if (sbi->s_rock == 1) {
                joliet_level = 0;
        } else if (joliet_level) {
                sbi->s_rock = 0;
                if (sbi->s_firstdatazone != first_data_zone) {
                        sbi->s_firstdatazone = first_data_zone;
                        printk(KERN_DEBUG
                                "ISOFS: changing to secondary root\n");
                        iput(inode);
                        inode = isofs_iget(s, sbi->s_firstdatazone, 0);
                        if (IS_ERR(inode))
                                goto out_no_root;
                }
        }

        if (opt.check == 'u') {
                /* Only Joliet is case insensitive by default */
                if (joliet_level)
                        opt.check = 'r';
                else
                        opt.check = 's';
        }
        sbi->s_joliet_level = joliet_level;

        /* Make sure the root inode is a directory */
        if (!S_ISDIR(inode->i_mode)) {
                printk(KERN_WARNING
                        "isofs_fill_super: root inode is not a directory. "
                        "Corrupted media?\n");
                goto out_iput;
        }

        table = 0;
        if (joliet_level)
                table += 2;
        if (opt.check == 'r')
                table++;
        sbi->s_check = opt.check;

        if (table)
                s->s_d_op = &isofs_dentry_ops[table - 1];

        /* get the root dentry */
        s->s_root = d_make_root(inode);
        if (!(s->s_root)) {
                error = -ENOMEM;
                goto out_no_inode;
        }

        kfree(opt.iocharset);

        return 0;

        /*
         * Display error messages and free resources.
         */
out_iput:
        iput(inode);
        goto out_no_inode;
out_no_root:
        error = PTR_ERR(inode);
        if (error != -ENOMEM)
                printk(KERN_WARNING "%s: get root inode failed\n", __func__);
out_no_inode:
#ifdef CONFIG_JOLIET
        unload_nls(sbi->s_nls_iocharset);
#endif
        goto out_freesbi;
out_no_read:
        printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
                __func__, s->s_id, iso_blknum, block);
        goto out_freebh;
out_bad_zone_size:
        printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
                sbi->s_log_zone_size);
        goto out_freebh;
out_bad_size:
        printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
                orig_zonesize, opt.blocksize);
        goto out_freebh;
out_unknown_format:
        if (!silent)
                printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n");

out_freebh:
        brelse(bh);
        brelse(pri_bh);
out_freesbi:
        kfree(opt.iocharset);
        kfree(sbi);
        s->s_fs_info = NULL;
        return error;
}

static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

        buf->f_type = ISOFS_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = (ISOFS_SB(sb)->s_nzones
                << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits));
        buf->f_bfree = 0;
        buf->f_bavail = 0;
        buf->f_files = ISOFS_SB(sb)->s_ninodes;
        buf->f_ffree = 0;
        buf->f_fsid = u64_to_fsid(id);
        buf->f_namelen = NAME_MAX;
        return 0;
}

/*
 * Get a set of blocks; filling in buffer_heads if already allocated
 * or getblk() if they are not.  Returns the number of blocks inserted
 * (-ve == error.)
 *
 * @inode:   file whose logical blocks are being looked up
 * @iblock:  first logical block, in units of the inode's buffer size
 * @bh:      array of @nblocks buffer_head pointers; a non-NULL entry is
 *           mapped in place with map_bh(), a NULL entry is filled in
 *           with a buffer obtained from sb_getblk()
 * @nblocks: number of consecutive blocks to process
 *
 * If at least one block was processed before a failure occurred, the
 * count of processed blocks is returned instead of the error code.
 */
int isofs_get_blocks(struct inode *inode, sector_t iblock,
                     struct buffer_head **bh, unsigned long nblocks)
{
        unsigned long b_off = iblock;
        unsigned offset, sect_size;
        unsigned int firstext;
        unsigned long nextblk, nextoff;
        int section, rv, error;
        struct iso_inode_info *ei = ISOFS_I(inode);

        error = -EIO;
        rv = 0;
        /* b_off is an unsigned long; catch truncation of a wider sector_t */
        if (iblock != b_off) {
                printk(KERN_DEBUG "%s: block number too large\n", __func__);
                goto abort;
        }


        /*
         * Start with the first file section, described by the inode
         * itself.  'offset' is the logical block at which the current
         * section begins; 'sect_size' is that section's length in blocks.
         */
        offset = 0;
        firstext = ei->i_first_extent;
        sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode);
        nextblk = ei->i_next_section_block;
        nextoff = ei->i_next_section_offset;
        section = 0;

        while (nblocks) {
                /* If we are *way* beyond the end of the file, print a message.
                 * Access beyond the end of the file up to the next page boundary
                 * is normal, however because of the way the page cache works.
                 * In this case, we just return 0 so that we can properly fill
                 * the page with useless information without generating any
                 * I/O errors.
                 */
                if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
                        printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
                                __func__, b_off,
                                (unsigned long long)inode->i_size);
                        goto abort;
                }

                /* On the last section, nextblk == 0, section size is likely to
                 * exceed sect_size by a partial block, and access beyond the
                 * end of the file will reach beyond the section size, too.
                 */
                while (nextblk && (b_off >= (offset + sect_size))) {
                        struct inode *ninode;

                        /* Advance to the next section: load its inode just
                         * long enough to copy out its extent description. */
                        offset += sect_size;
                        ninode = isofs_iget(inode->i_sb, nextblk, nextoff);
                        if (IS_ERR(ninode)) {
                                error = PTR_ERR(ninode);
                                goto abort;
                        }
                        firstext  = ISOFS_I(ninode)->i_first_extent;
                        sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode);
                        nextblk   = ISOFS_I(ninode)->i_next_section_block;
                        nextoff   = ISOFS_I(ninode)->i_next_section_offset;
                        iput(ninode);

                        /* Bound the walk so a looping section chain cannot
                         * keep us here forever. */
                        if (++section > 100) {
                                printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
                                        " aborting...\n", __func__);
                                printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
                                        "nextblk=%lu nextoff=%lu\n", __func__,
                                        b_off, firstext, (unsigned) sect_size,
                                        nextblk, nextoff);
                                goto abort;
                        }
                }

                /* On-disk block = section start + offset within the section */
                if (*bh) {
                        map_bh(*bh, inode->i_sb, firstext + b_off - offset);
                } else {
                        *bh = sb_getblk(inode->i_sb, firstext+b_off-offset);
                        if (!*bh)
                                goto abort;
                }
                bh++;        /* Next buffer head */
                b_off++;        /* Next buffer offset */
                nblocks--;
                rv++;
        }

        error = 0;
abort:
        /* Partial progress wins over the error code */
        return rv != 0 ? rv : error;
}

/*
 * Single-block lookup used by the standard interfaces.
 *
 * Requests to create a block are refused with -EROFS.  Otherwise one
 * block is looked up through isofs_get_blocks(); a negative result is
 * passed through and any non-negative result is reported as 0.
 */
static int isofs_get_block(struct inode *inode, sector_t iblock,
                    struct buffer_head *bh_result, int create)
{
        int mapped;

        if (!create) {
                mapped = isofs_get_blocks(inode, iblock, &bh_result, 1);
                if (mapped < 0)
                        return mapped;
                return 0;
        }

        printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__);
        return -EROFS;
}

/*
 * Translate logical block @block of @inode into a device block number.
 *
 * A throw-away buffer_head on the stack receives the mapping from
 * isofs_get_block().  Returns the mapped block number, or 0 if the
 * lookup failed.
 */
static int isofs_bmap(struct inode *inode, sector_t block)
{
        struct buffer_head scratch;

        scratch.b_state = 0;
        scratch.b_blocknr = -1000;

        if (isofs_get_block(inode, block, &scratch, 0) != 0)
                return 0;

        return scratch.b_blocknr;
}

/*
 * Read logical block @block of @inode.
 *
 * Returns the buffer from sb_bread() for the mapped device block, or
 * NULL when isofs_bmap() yields 0 for the block.
 */
struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
{
        sector_t phys = isofs_bmap(inode, block);

        return phys ? sb_bread(inode->i_sb, phys) : NULL;
}

/*
 * ->readpage handler: hands @page to mpage_readpage() with
 * isofs_get_block() as the block lookup callback.  @file is unused.
 */
static int isofs_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, isofs_get_block);
}

/*
 * ->readahead handler: hands @rac to mpage_readahead() with
 * isofs_get_block() as the block lookup callback.
 */
static void isofs_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, isofs_get_block);
}

/*
 * ->bmap handler: delegates to generic_block_bmap() with
 * isofs_get_block() as the block lookup callback.
 */
static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping,block,isofs_get_block);
}

/*
 * Address space operations installed by isofs_read_inode() on regular
 * files that do not use another file format's operations.  Only read
 * paths and bmap are provided.
 */
static const struct address_space_operations isofs_aops = {
        .readpage = isofs_readpage,
        .readahead = isofs_readahead,
        .bmap = _isofs_bmap
};

/*
 * Compute the total size of a file that is stored as several sections.
 *
 * Starting at the directory record identified by the inode's
 * i_iget5_block/i_iget5_offset, walk consecutive directory records while
 * bit 0x80 of the record flags is set, adding each record's size field to
 * inode->i_size.  The location of the second record visited is stored in
 * i_next_section_block/i_next_section_offset; if there is no second
 * record these stay 0.
 *
 * Returns 0 on success (and also after giving up past 100 sections),
 * -ENOMEM if the bounce buffer cannot be allocated, or -EIO if a block
 * cannot be read.
 */
static int isofs_read_level3_size(struct inode *inode)
{
        unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
        int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra;
        struct buffer_head *bh = NULL;
        unsigned long block, offset, block_saved, offset_saved;
        int i = 0;
        int more_entries = 0;
        struct iso_directory_record *tmpde = NULL;
        struct iso_inode_info *ei = ISOFS_I(inode);

        inode->i_size = 0;

        /* The first 16 blocks are reserved as the System Area.  Thus,
         * no inodes can appear in block 0.  We use this to flag that
         * this is the last section. */
        ei->i_next_section_block = 0;
        ei->i_next_section_offset = 0;

        block = ei->i_iget5_block;
        offset = ei->i_iget5_offset;

        do {
                struct iso_directory_record *de;
                unsigned int de_len;

                /* bh is NULL on the first pass and after moving to a new block */
                if (!bh) {
                        bh = sb_bread(inode->i_sb, block);
                        if (!bh)
                                goto out_noread;
                }
                de = (struct iso_directory_record *) (bh->b_data + offset);
                /* The first byte of a record is its length */
                de_len = *(unsigned char *) de;

                /* A zero length means no more records in this block:
                 * continue at the start of the next one. */
                if (de_len == 0) {
                        brelse(bh);
                        bh = NULL;
                        ++block;
                        offset = 0;
                        continue;
                }

                /* Remember where this record starts before advancing */
                block_saved = block;
                offset_saved = offset;
                offset += de_len;

                /* Make sure we have a full directory entry */
                if (offset >= bufsize) {
                        /* slop = bytes of this record that lie in the
                         * current block */
                        int slop = bufsize - offset + de_len;
                        /* 256 bytes covers any record: de_len is one byte */
                        if (!tmpde) {
                                tmpde = kmalloc(256, GFP_KERNEL);
                                if (!tmpde)
                                        goto out_nomem;
                        }
                        memcpy(tmpde, de, slop);
                        /* Remainder that spills into the next block
                         * (assumes bufsize is a power of two) */
                        offset &= bufsize - 1;
                        block++;
                        brelse(bh);
                        bh = NULL;
                        if (offset) {
                                bh = sb_bread(inode->i_sb, block);
                                if (!bh)
                                        goto out_noread;
                                memcpy((void *)tmpde+slop, bh->b_data, offset);
                        }
                        de = tmpde;
                }

                inode->i_size += isonum_733(de->size);
                /* i == 1: this is the second record, i.e. the next section */
                if (i == 1) {
                        ei->i_next_section_block = block_saved;
                        ei->i_next_section_offset = offset_saved;
                }

                /* Flag 0x80 set: another section follows.  The flags byte
                 * is read one byte earlier when high_sierra is set. */
                more_entries = de->flags[-high_sierra] & 0x80;

                i++;
                if (i > 100)
                        goto out_toomany;
        } while (more_entries);
out:
        kfree(tmpde);
        if (bh)
                brelse(bh);
        return 0;

out_nomem:
        /* tmpde is NULL here, so only the buffer needs releasing */
        if (bh)
                brelse(bh);
        return -ENOMEM;

out_noread:
        /* bh is NULL here, so only the bounce buffer needs freeing */
        printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block);
        kfree(tmpde);
        return -EIO;

out_toomany:
        /* Gives up but still returns 0 through the common exit */
        printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n"
                "isofs_read_level3_size: inode=%lu\n",
                __func__, inode->i_ino);
        goto out;
}

/*
 * Fill in a freshly allocated inode from its on-disk directory record.
 *
 * The record is located by the inode's i_iget5_block/i_iget5_offset.  If
 * it straddles a block boundary it is first assembled in a temporary
 * buffer.  Mode, ownership, size, timestamps, first extent and the
 * inode/file/address-space operation vectors are then set up, with Rock
 * Ridge parsing applied unless the volume is in High Sierra format.
 *
 * @inode:     inode to initialise
 * @relocated: passed through unchanged to parse_rock_ridge_inode()
 *
 * Returns 0 on success, -ENOMEM if the temporary record buffer cannot be
 * allocated, a negative error from isofs_read_level3_size(), or -EIO for
 * read failures, a too-short record, or an unsupported file type.
 */
static int isofs_read_inode(struct inode *inode, int relocated)
{
        struct super_block *sb = inode->i_sb;
        struct isofs_sb_info *sbi = ISOFS_SB(sb);
        unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
        unsigned long block;
        int high_sierra = sbi->s_high_sierra;
        struct buffer_head *bh;
        struct iso_directory_record *de;
        struct iso_directory_record *tmpde = NULL;
        unsigned int de_len;
        unsigned long offset;
        struct iso_inode_info *ei = ISOFS_I(inode);
        int ret = -EIO;

        block = ei->i_iget5_block;
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                goto out_badread;

        offset = ei->i_iget5_offset;

        de = (struct iso_directory_record *) (bh->b_data + offset);
        /* The first byte of a record is its length */
        de_len = *(unsigned char *) de;
        /* Reject records too short to hold the fixed-size part */
        if (de_len < sizeof(struct iso_directory_record))
                goto fail;

        /* Record crosses into the next block: stitch both pieces together */
        if (offset + de_len > bufsize) {
                int frag1 = bufsize - offset;

                tmpde = kmalloc(de_len, GFP_KERNEL);
                if (!tmpde) {
                        ret = -ENOMEM;
                        goto fail;
                }
                memcpy(tmpde, bh->b_data + offset, frag1);
                brelse(bh);
                bh = sb_bread(inode->i_sb, ++block);
                if (!bh)
                        goto out_badread;
                memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1);
                de = tmpde;
        }

        inode->i_ino = isofs_get_ino(ei->i_iget5_block,
                                        ei->i_iget5_offset,
                                        ISOFS_BUFFER_BITS(inode));

        /* Assume it is a normal-format file unless told otherwise */
        ei->i_file_format = isofs_file_normal;

        /* Flag bit 2 marks a directory; the flags byte is read one byte
         * earlier when high_sierra is set. */
        if (de->flags[-high_sierra] & 2) {
                if (sbi->s_dmode != ISOFS_INVALID_MODE)
                        inode->i_mode = S_IFDIR | sbi->s_dmode;
                else
                        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
                set_nlink(inode, 1);        /*
                                         * Set to 1.  We know there are 2, but
                                         * the find utility tries to optimize
                                         * if it is 2, and it screws up.  It is
                                         * easier to give 1 which tells find to
                                         * do it the hard way.
                                         */
        } else {
                if (sbi->s_fmode != ISOFS_INVALID_MODE) {
                        inode->i_mode = S_IFREG | sbi->s_fmode;
                } else {
                        /*
                         * Set default permissions: r-x for all.  The disc
                         * could be shared with DOS machines so virtually
                         * anything could be a valid executable.
                         */
                        inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
                }
                set_nlink(inode, 1);
        }
        inode->i_uid = sbi->s_uid;
        inode->i_gid = sbi->s_gid;
        inode->i_blocks = 0;

        ei->i_format_parm[0] = 0;
        ei->i_format_parm[1] = 0;
        ei->i_format_parm[2] = 0;

        ei->i_section_size = isonum_733(de->size);
        /* Flag 0x80: the file continues in further sections, so the total
         * size has to be summed over all of them. */
        if (de->flags[-high_sierra] & 0x80) {
                ret = isofs_read_level3_size(inode);
                if (ret < 0)
                        goto fail;
                /* Restore the default error for the failure paths below */
                ret = -EIO;
        } else {
                ei->i_next_section_block = 0;
                ei->i_next_section_offset = 0;
                inode->i_size = isonum_733(de->size);
        }

        /*
         * Some dipshit decided to store some other bit of information
         * in the high byte of the file length.  Truncate size in case
         * this CDROM was mounted with the cruft option.
         */

        if (sbi->s_cruft)
                inode->i_size &= 0x00ffffff;

        /* Interleaved files are exposed as empty */
        if (de->interleave[0]) {
                printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n");
                inode->i_size = 0;
        }

        /* I have no idea what file_unit_size is used for, so
           we will flag it for now */
        if (de->file_unit_size[0] != 0) {
                printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n",
                        inode->i_ino);
        }

        /* I have no idea what other flag bits are used for, so
           we will flag it for now */
#ifdef DEBUG
        if((de->flags[-high_sierra] & ~2)!= 0){
                printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file "
                                "(%ld %x).\n",
                        inode->i_ino, de->flags[-high_sierra]);
        }
#endif

        /* The record carries a single date; use it for all three times */
        inode->i_mtime.tv_sec =
        inode->i_atime.tv_sec =
        inode->i_ctime.tv_sec = iso_date(de->date, high_sierra);
        inode->i_mtime.tv_nsec =
        inode->i_atime.tv_nsec =
        inode->i_ctime.tv_nsec = 0;

        /* Data starts after the extended attribute record, if any */
        ei->i_first_extent = (isonum_733(de->extent) +
                        isonum_711(de->ext_attr_length));

        /* Set the number of blocks for stat() - should be done before RR */
        inode->i_blocks = (inode->i_size + 511) >> 9;

        /*
         * Now test for possible Rock Ridge extensions which will override
         * some of these numbers in the inode structure.
         */

        if (!high_sierra) {
                parse_rock_ridge_inode(de, inode, relocated);
                /* if we want uid/gid set, override the rock ridge setting */
                if (sbi->s_uid_set)
                        inode->i_uid = sbi->s_uid;
                if (sbi->s_gid_set)
                        inode->i_gid = sbi->s_gid;
        }
        /* Now set final access rights if overriding rock ridge setting */
        if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
            sbi->s_dmode != ISOFS_INVALID_MODE)
                inode->i_mode = S_IFDIR | sbi->s_dmode;
        if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
            sbi->s_fmode != ISOFS_INVALID_MODE)
                inode->i_mode = S_IFREG | sbi->s_fmode;

        /* Install the inode operations vector */
        if (S_ISREG(inode->i_mode)) {
                inode->i_fop = &generic_ro_fops;
                switch (ei->i_file_format) {
#ifdef CONFIG_ZISOFS
                case isofs_file_compressed:
                        inode->i_data.a_ops = &zisofs_aops;
                        break;
#endif
                default:
                        inode->i_data.a_ops = &isofs_aops;
                        break;
                }
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &isofs_dir_inode_operations;
                inode->i_fop = &isofs_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                inode->i_op = &page_symlink_inode_operations;
                inode_nohighmem(inode);
                inode->i_data.a_ops = &isofs_symlink_aops;
        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
                   S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                /* XXX - parse_rock_ridge_inode() had already set i_rdev. */
                init_special_inode(inode, inode->i_mode, inode->i_rdev);
        } else {
                printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n",
                        inode->i_mode, inode->i_ino);
                ret = -EIO;
                goto fail;
        }

        ret = 0;
out:
        /* Common exit: tmpde may be NULL, bh may be NULL after a failed read */
        kfree(tmpde);
        if (bh)
                brelse(bh);
        return ret;

out_badread:
        printk(KERN_WARNING "ISOFS: unable to read i-node block\n");
fail:
        goto out;
}

/*
 * Lookup key handed to iget5_locked() by __isofs_iget() and consumed by
 * isofs_iget5_test()/isofs_iget5_set().
 */
struct isofs_iget5_callback_data {
        unsigned long block;    /* block holding the inode's directory record */
        unsigned long offset;   /* byte offset of the record within that block */
};

static int isofs_iget5_test(struct inode *ino, void *data)
{
        struct iso_inode_info *i = ISOFS_I(ino);
        struct isofs_iget5_callback_data *d =
                (struct isofs_iget5_callback_data*)data;
        return (i->i_iget5_block == d->block)
                && (i->i_iget5_offset == d->offset);
}

static int isofs_iget5_set(struct inode *ino, void *data)
{
        struct iso_inode_info *i = ISOFS_I(ino);
        struct isofs_iget5_callback_data *d =
                (struct isofs_iget5_callback_data*)data;
        i->i_iget5_block = d->block;
        i->i_iget5_offset = d->offset;
        return 0;
}

/* Store, in the inode's containing structure, the block and block
 * offset that point to the underlying meta-data for the inode.  The
 * code below is otherwise similar to the iget() code in
 * include/linux/fs.h */
struct inode *__isofs_iget(struct super_block *sb,
                           unsigned long block,
                           unsigned long offset,
                           int relocated)
{
        unsigned long hashval;
        struct inode *inode;
        struct isofs_iget5_callback_data data;
        long ret;

        if (offset >= 1ul << sb->s_blocksize_bits)
                return ERR_PTR(-EINVAL);

        data.block = block;
        data.offset = offset;

        hashval = (block << sb->s_blocksize_bits) | offset;

        inode = iget5_locked(sb, hashval, &isofs_iget5_test,
                                &isofs_iget5_set, &data);

        if (!inode)
                return ERR_PTR(-ENOMEM);

        if (inode->i_state & I_NEW) {
                ret = isofs_read_inode(inode, relocated);
                if (ret < 0) {
                        iget_failed(inode);
                        inode = ERR_PTR(ret);
                } else {
                        unlock_new_inode(inode);
                }
        }

        return inode;
}

/*
 * ->mount handler: mounts from a block device via mount_bdev(), using
 * isofs_fill_super() to set up the superblock.
 */
static struct dentry *isofs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
{
        return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
}

/*
 * Filesystem type descriptor registered by init_iso9660_fs() under the
 * name "iso9660".
 */
static struct file_system_type iso9660_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "iso9660",
        .mount                = isofs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV,
};
/* Module aliases matching the filesystem name */
MODULE_ALIAS_FS("iso9660");
MODULE_ALIAS("iso9660");

/*
 * Module init: set up the inode cache, then zisofs support when
 * CONFIG_ZISOFS is enabled, then register the filesystem type.
 *
 * On failure the steps that already succeeded are undone in reverse
 * order and the error from the failing step is returned.
 */
static int __init init_iso9660_fs(void)
{
        int err = init_inodecache();
        if (err)
                goto out;
#ifdef CONFIG_ZISOFS
        err = zisofs_init();
        if (err)
                goto out1;
#endif
        err = register_filesystem(&iso9660_fs_type);
        if (err)
                goto out2;
        return 0;
out2:
        /* Registration failed: fall through the whole unwind chain */
#ifdef CONFIG_ZISOFS
        zisofs_cleanup();
out1:
#endif
        destroy_inodecache();
out:
        return err;
}

/*
 * Module exit: unregister the filesystem type, then release zisofs
 * support (when CONFIG_ZISOFS is enabled) and the inode cache.
 */
static void __exit exit_iso9660_fs(void)
{
        unregister_filesystem(&iso9660_fs_type);
#ifdef CONFIG_ZISOFS
        zisofs_cleanup();
#endif
        destroy_inodecache();
}

/* Module entry/exit points and license declaration */
module_init(init_iso9660_fs)
module_exit(exit_iso9660_fs)
MODULE_LICENSE("GPL");

























































    8 














    1 




    1 




    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM x86_fpu

#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FPU_H

#include <linux/tracepoint.h>

/*
 * Event class shared by all x86_fpu tracepoints.
 *
 * Each event takes the struct fpu being operated on and records:
 *   fpu       - the pointer itself
 *   load_fpu  - whether TIF_NEED_FPU_LOAD is set for the current thread
 *   xfeatures - xsave header xfeatures, or 0 without OSXSAVE
 *   xcomp_bv  - xsave header xcomp_bv, or 0 without OSXSAVE
 */
DECLARE_EVENT_CLASS(x86_fpu,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu),

        TP_STRUCT__entry(
                __field(struct fpu *, fpu)
                __field(bool, load_fpu)
                __field(u64, xfeatures)
                __field(u64, xcomp_bv)
                ),

        TP_fast_assign(
                __entry->fpu                = fpu;
                __entry->load_fpu        = test_thread_flag(TIF_NEED_FPU_LOAD);
                if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
                        __entry->xfeatures = fpu->state.xsave.header.xfeatures;
                        __entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
                } else {
                        /*
                         * TP_printk() always prints these two fields, so
                         * give them a defined value instead of whatever
                         * the event buffer previously held.
                         */
                        __entry->xfeatures = 0;
                        __entry->xcomp_bv  = 0;
                }
        ),
        TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
                        __entry->fpu,
                        __entry->load_fpu,
                        __entry->xfeatures,
                        __entry->xcomp_bv
        )
);

/*
 * Individual tracepoints built on the x86_fpu event class.  Every event
 * takes a single struct fpu * argument and records the fields defined by
 * the class; the events differ only in name.
 */
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_src,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/trace/
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE fpu
#endif /* _TRACE_FPU_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#ifndef _CRYPTO_BLAKE2S_H
#define _CRYPTO_BLAKE2S_H

#include <linux/bug.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>

/* Sizes, in bytes, used by the BLAKE2s helpers in this header. */
enum blake2s_lengths {
        BLAKE2S_BLOCK_SIZE = 64,        /* size of blake2s_state.buf */
        BLAKE2S_HASH_SIZE = 32,         /* largest outlen accepted by the debug checks */
        BLAKE2S_KEY_SIZE = 32,          /* largest keylen accepted by the debug checks */

        /* digest sizes, named after their length in bits */
        BLAKE2S_128_HASH_SIZE = 16,
        BLAKE2S_160_HASH_SIZE = 20,
        BLAKE2S_224_HASH_SIZE = 28,
        BLAKE2S_256_HASH_SIZE = 32,
};

/*
 * Running BLAKE2s context.  It is (re)initialised by __blake2s_init() and
 * then handed to blake2s_update() / blake2s_final().
 */
struct blake2s_state {
        /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
        u32 h[8];       /* seeded from BLAKE2S_IV0..BLAKE2S_IV7 */
        u32 t[2];       /* zeroed on init; NOTE(review): presumably the input byte counter - confirm */
        u32 f[2];       /* zeroed on init; NOTE(review): presumably the finalization flags - confirm */
        u8 buf[BLAKE2S_BLOCK_SIZE];     /* holds the zero-padded key right after a keyed init */
        unsigned int buflen;            /* 0 after an unkeyed init, BLAKE2S_BLOCK_SIZE after a keyed one */
        unsigned int outlen;            /* digest length requested at init time */
};

/*
 * Initial values loaded into blake2s_state.h[0..7] by __blake2s_init().
 * h[0] is additionally XORed with a word built from the key and digest
 * lengths.
 */
enum blake2s_iv {
        BLAKE2S_IV0 = 0x6A09E667UL,
        BLAKE2S_IV1 = 0xBB67AE85UL,
        BLAKE2S_IV2 = 0x3C6EF372UL,
        BLAKE2S_IV3 = 0xA54FF53AUL,
        BLAKE2S_IV4 = 0x510E527FUL,
        BLAKE2S_IV5 = 0x9B05688CUL,
        BLAKE2S_IV6 = 0x1F83D9ABUL,
        BLAKE2S_IV7 = 0x5BE0CD19UL,
};

/*
 * Reset @state for a new hash that will produce @outlen bytes.
 *
 * When @keylen is non-zero, @key is copied to the start of the block
 * buffer, the remainder of the buffer is zeroed and the buffer is marked
 * full.  When @keylen is zero, @key is not touched and the buffer is
 * marked empty.  No argument validation is done here.
 */
static inline void __blake2s_init(struct blake2s_state *state, size_t outlen,
                                  const void *key, size_t keylen)
{
        /* word mixed into the first IV: constant | key length | digest length */
        const size_t param = 0x01010000 | keylen << 8 | outlen;

        state->h[0] = BLAKE2S_IV0 ^ param;
        state->h[1] = BLAKE2S_IV1;
        state->h[2] = BLAKE2S_IV2;
        state->h[3] = BLAKE2S_IV3;
        state->h[4] = BLAKE2S_IV4;
        state->h[5] = BLAKE2S_IV5;
        state->h[6] = BLAKE2S_IV6;
        state->h[7] = BLAKE2S_IV7;

        state->t[0] = state->t[1] = 0;
        state->f[0] = state->f[1] = 0;
        state->outlen = outlen;

        if (!keylen) {
                state->buflen = 0;
                return;
        }

        /* keyed mode: the zero-padded key becomes the first block */
        memcpy(state->buf, key, keylen);
        memset(state->buf + keylen, 0, BLAKE2S_BLOCK_SIZE - keylen);
        state->buflen = BLAKE2S_BLOCK_SIZE;
}

/*
 * Start an unkeyed hash producing @outlen bytes.  @outlen is passed through
 * to __blake2s_init() without being checked here.
 */
static inline void blake2s_init(struct blake2s_state *state,
                                const size_t outlen)
{
        __blake2s_init(state, outlen, NULL, 0);
}

/*
 * Start a keyed hash producing @outlen bytes using the @keylen bytes at @key.
 *
 * The arguments are only sanity-checked (with a warning, not an error
 * return) when DEBUG is enabled; initialisation proceeds either way.
 */
static inline void blake2s_init_key(struct blake2s_state *state,
                                    const size_t outlen, const void *key,
                                    const size_t keylen)
{
        WARN_ON(IS_ENABLED(DEBUG) &&
                (outlen == 0 || outlen > BLAKE2S_HASH_SIZE ||
                 key == NULL ||
                 keylen == 0 || keylen > BLAKE2S_KEY_SIZE));

        __blake2s_init(state, outlen, key, keylen);
}

void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
void blake2s_final(struct blake2s_state *state, u8 *out);

/*
 * One-shot helper: hash @inlen bytes at @in, optionally keyed with the
 * @keylen bytes at @key, and write @outlen bytes of digest to @out.
 *
 * The arguments are only sanity-checked (with a warning) when DEBUG is
 * enabled; the hash is computed either way.
 */
static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
                           const size_t outlen, const size_t inlen,
                           const size_t keylen)
{
        struct blake2s_state ctx;

        WARN_ON(IS_ENABLED(DEBUG) &&
                ((in == NULL && inlen != 0) ||
                 out == NULL || outlen == 0 ||
                 outlen > BLAKE2S_HASH_SIZE ||
                 keylen > BLAKE2S_KEY_SIZE ||
                 (key == NULL && keylen != 0)));

        __blake2s_init(&ctx, outlen, key, keylen);
        blake2s_update(&ctx, in, inlen);
        blake2s_final(&ctx, out);
}

#endif /* _CRYPTO_BLAKE2S_H */


































































































    1 












    1 









    1 

    1 















































    1 













    1 























    1 


    1 
































    1 





































    1 


















    1 
    1 


    1 











    1 

    1 

    1 



































    1 





    1 


















    1 






    1 

    1 



















































    1 
    1 



    1 



















    1 










    1 





















    1 










    1 

    1 













    1 






    1 


    1 











































    1 
    1 




    1 
    1 
    1 









































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
// SPDX-License-Identifier: GPL-2.0
/*
 *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 *  for the blk-mq scheduling framework
 *
 *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"

/*
 * See Documentation/block/deadline-iosched.rst
 */
/*
 * Default tunables.  dd_init_queue() copies them into each queue's
 * deadline_data; the expire values are expressed in jiffies.
 */
static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2;    /* max times reads can starve a write */
static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                     by the above parameters. For throughput. */

/*
 * Per-queue scheduler state, allocated in dd_init_queue() and stored in
 * elevator_queue->elevator_data.  The two-element arrays are indexed by
 * data direction (READ / WRITE).
 */
struct deadline_data {
        /*
         * run time data
         */

        /*
         * requests (deadline_rq s) are present on both sort_list and fifo_list
         */
        struct rb_root sort_list[2];
        struct list_head fifo_list[2];

        /*
         * next in sort order. read, write or both are NULL
         */
        struct request *next_rq[2];
        unsigned int batching;                /* number of sequential requests made */
        unsigned int starved;                /* times reads have starved writes */

        /*
         * settings that change how the i/o scheduler behaves
         */
        int fifo_expire[2];
        int fifo_batch;
        int writes_starved;
        int front_merges;

        /* taken around dispatch, insertion, bio merging and the debugfs walkers */
        spinlock_t lock;
        /* taken while checking or releasing zone write locks */
        spinlock_t zone_lock;
        /* head-inserted and passthrough requests; served before the fifos */
        struct list_head dispatch;
};

/* Pick the sector-sorted tree that matches the data direction of @rq. */
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
        const int dir = rq_data_dir(rq);

        return &dd->sort_list[dir];
}

/*
 * Return the request that follows `rq' in sector-sorted order, or NULL when
 * `rq' has no successor in its tree.
 */
static inline struct request *
deadline_latter_request(struct request *rq)
{
        struct rb_node *succ = rb_next(&rq->rb_node);

        return succ ? rb_entry_rq(succ) : NULL;
}

/* Insert @rq into the sort tree of its data direction. */
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
        elv_rb_add(deadline_rb_root(dd, rq), rq);
}

/*
 * Remove @rq from its sort tree.  If @rq is the cached "next" request for
 * its direction, the cache is advanced to the following request first.
 */
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
        struct request **cached = &dd->next_rq[rq_data_dir(rq)];

        /* must happen while rq is still linked into the tree */
        if (*cached == rq)
                *cached = deadline_latter_request(rq);

        elv_rb_del(deadline_rb_root(dd, rq), rq);
}

/*
 * Unlink rq from the fifo, the rbtree and the merge hash, and forget it as
 * the queue's last merge candidate.
 */
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
        struct deadline_data *dd = q->elevator->elevator_data;

        list_del_init(&rq->queuelist);

        /* rq may be absent from the rbtree when we are doing an insert merge */
        if (!RB_EMPTY_NODE(&rq->rb_node))
                deadline_del_rq_rb(dd, rq);

        elv_rqhash_del(q, rq);

        if (q->last_merge != rq)
                return;
        q->last_merge = NULL;
}

/*
 * Merge notification for @req.  Only front merges need work here: the
 * request is taken out of its sort tree and inserted again so that it is
 * repositioned.
 */
static void dd_request_merged(struct request_queue *q, struct request *req,
                              enum elv_merge type)
{
        struct deadline_data *dd = q->elevator->elevator_data;

        if (type != ELEVATOR_FRONT_MERGE)
                return;

        elv_rb_del(deadline_rb_root(dd, req), req);
        deadline_add_rq_rb(dd, req);
}

/*
 * @next has been merged into @req and is about to go away.  When both are
 * on a fifo and @next expires first, @req inherits that expire time and
 * takes @next's position in the fifo.  @next is then removed from the
 * scheduler.
 */
static void dd_merged_requests(struct request_queue *q, struct request *req,
                               struct request *next)
{
        if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist) &&
            time_before((unsigned long)next->fifo_time,
                        (unsigned long)req->fifo_time)) {
                list_move(&req->queuelist, &next->queuelist);
                req->fifo_time = next->fifo_time;
        }

        /* drop every reference the scheduler holds to next */
        deadline_remove_request(q, next);
}

/*
 * Take rq out of the scheduler for dispatch.  The cached next request is
 * cleared for both directions and then set, for rq's direction only, to the
 * request that follows rq in sort order.
 */
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
        struct request *following = deadline_latter_request(rq);
        const int data_dir = rq_data_dir(rq);

        dd->next_rq[READ] = NULL;
        dd->next_rq[WRITE] = NULL;
        dd->next_rq[data_dir] = following;

        /* off the sort tree and the fifo list */
        deadline_remove_request(rq->q, rq);
}

/*
 * deadline_check_fifo returns 1 if the request at the head of the fifo for
 * ddir has reached its expire time, 0 otherwise.
 * Requires !list_empty(&dd->fifo_list[ddir])
 */
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
        struct request *head = rq_entry_fifo(dd->fifo_list[ddir].next);

        return time_after_eq(jiffies, (unsigned long)head->fifo_time) ? 1 : 0;
}

/*
 * For the specified data direction, return the next request to dispatch
 * in arrival order, or NULL if there is none.
 *
 * Reads, and writes on non-zoned queues, simply get the fifo head.  Writes
 * on a zoned queue get the first fifo entry that can currently be
 * dispatched to its zone; the scan runs under dd->zone_lock.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
        struct request *rq, *found = NULL;
        unsigned long flags;

        if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
                return NULL;

        if (list_empty(&dd->fifo_list[data_dir]))
                return NULL;

        rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
        if (data_dir == READ || !blk_queue_is_zoned(rq->q))
                return rq;

        spin_lock_irqsave(&dd->zone_lock, flags);
        list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
                if (blk_req_can_dispatch_to_zone(rq)) {
                        found = rq;
                        break;
                }
        }
        spin_unlock_irqrestore(&dd->zone_lock, flags);

        return found;
}

/*
 * For the specified data direction, return the next request to dispatch
 * in sector order, or NULL if there is none.
 *
 * The search starts at the cached dd->next_rq[] entry.  Reads, and writes
 * on non-zoned queues, get that entry as is.  Writes on a zoned queue walk
 * forward in sort order, under dd->zone_lock, to the first request that can
 * currently be dispatched to its zone.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
        struct request *rq;
        unsigned long flags;

        if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
                return NULL;

        rq = dd->next_rq[data_dir];
        if (!rq)
                return NULL;

        if (data_dir == READ || !blk_queue_is_zoned(rq->q))
                return rq;

        spin_lock_irqsave(&dd->zone_lock, flags);
        for (; rq; rq = deadline_latter_request(rq)) {
                if (blk_req_can_dispatch_to_zone(rq))
                        break;
        }
        spin_unlock_irqrestore(&dd->zone_lock, flags);

        return rq;
}

/*
 * __dd_dispatch_request selects the best request according to
 * read/write expire, fifo_batch, etc
 *
 * Selection order:
 *  1. anything on dd->dispatch (head-inserted / passthrough requests);
 *  2. the next request in sort order while the current batch is below
 *     dd->fifo_batch;
 *  3. otherwise a new batch is started: reads are preferred unless writes
 *     have been passed over dd->writes_starved times.
 *
 * Returns the chosen request, already removed from the scheduler lists and
 * flagged RQF_STARTED, or NULL if nothing can be dispatched.  The only
 * caller in this file, dd_dispatch_request(), holds dd->lock.
 */
static struct request *__dd_dispatch_request(struct deadline_data *dd)
{
        struct request *rq, *next_rq;
        bool reads, writes;
        int data_dir;

        /* requests on the dispatch list bypass the sorting and the fifos */
        if (!list_empty(&dd->dispatch)) {
                rq = list_first_entry(&dd->dispatch, struct request, queuelist);
                list_del_init(&rq->queuelist);
                goto done;
        }

        reads = !list_empty(&dd->fifo_list[READ]);
        writes = !list_empty(&dd->fifo_list[WRITE]);

        /*
         * batches are currently reads XOR writes
         */
        rq = deadline_next_request(dd, WRITE);
        if (!rq)
                rq = deadline_next_request(dd, READ);

        if (rq && dd->batching < dd->fifo_batch)
                /* we have a next request and are still entitled to batch */
                goto dispatch_request;

        /*
         * at this point we are not running a batch. select the appropriate
         * data direction (read / write)
         */

        if (reads) {
                BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));

                /*
                 * starved is only incremented when a dispatchable write is
                 * actually waiting
                 */
                if (deadline_fifo_request(dd, WRITE) &&
                    (dd->starved++ >= dd->writes_starved))
                        goto dispatch_writes;

                data_dir = READ;

                goto dispatch_find_request;
        }

        /*
         * there are either no reads or writes have been starved
         */

        if (writes) {
dispatch_writes:
                BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));

                dd->starved = 0;

                data_dir = WRITE;

                goto dispatch_find_request;
        }

        return NULL;

dispatch_find_request:
        /*
         * we are not running a batch, find best request for selected data_dir
         */
        next_rq = deadline_next_request(dd, data_dir);
        if (deadline_check_fifo(dd, data_dir) || !next_rq) {
                /*
                 * A deadline has expired, the last request was in the other
                 * direction, or we have run out of higher-sectored requests.
                 * Start again from the request with the earliest expiry time.
                 */
                rq = deadline_fifo_request(dd, data_dir);
        } else {
                /*
                 * The last req was the same dir and we have a next request in
                 * sort order. No expired requests so continue on from here.
                 */
                rq = next_rq;
        }

        /*
         * For a zoned block device, if we only have writes queued and none of
         * them can be dispatched, rq will be NULL.
         */
        if (!rq)
                return NULL;

        /* a new batch starts with this request */
        dd->batching = 0;

dispatch_request:
        /*
         * rq is the selected appropriate request.
         */
        dd->batching++;
        deadline_move_request(dd, rq);
done:
        /*
         * If the request needs its target zone locked, do it.
         */
        blk_req_zone_write_lock(rq);
        rq->rq_flags |= RQF_STARTED;
        return rq;
}

/*
 * One confusing aspect here is that we get called for a specific
 * hardware queue, but we may return a request that is for a
 * different hardware queue. This is because mq-deadline has shared
 * state for all hardware queues, in terms of sorting, FIFOs, etc.
 *
 * The selection runs under dd->lock.  The elevator_queued counter that is
 * decremented afterwards belongs to the hardware queue of the returned
 * request, not necessarily to @hctx.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        struct request *picked;

        spin_lock(&dd->lock);
        picked = __dd_dispatch_request(dd);
        spin_unlock(&dd->lock);

        if (!picked)
                return NULL;

        atomic_dec(&picked->mq_hctx->elevator_queued);
        return picked;
}

/*
 * Free the scheduler private data.  Both fifos must already be empty;
 * a request left on either one is treated as a fatal bug.
 */
static void dd_exit_queue(struct elevator_queue *e)
{
        struct deadline_data *dd = e->elevator_data;

        BUG_ON(!list_empty(&dd->fifo_list[READ]));
        BUG_ON(!list_empty(&dd->fifo_list[WRITE]));

        kfree(dd);
}

/*
 * Allocate and initialize the elevator private data (deadline_data) for @q
 * and attach the new elevator queue to it.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; on failure
 * nothing is left attached to @q.
 */
static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
{
        struct elevator_queue *eq = elevator_alloc(q, e);
        struct deadline_data *dd;

        if (!eq)
                return -ENOMEM;

        dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
        if (!dd) {
                /* drop the reference obtained by elevator_alloc() */
                kobject_put(&eq->kobj);
                return -ENOMEM;
        }
        eq->elevator_data = dd;

        /* per-direction state */
        INIT_LIST_HEAD(&dd->fifo_list[READ]);
        dd->sort_list[READ] = RB_ROOT;
        dd->fifo_expire[READ] = read_expire;

        INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
        dd->sort_list[WRITE] = RB_ROOT;
        dd->fifo_expire[WRITE] = write_expire;

        /* tunables start out at their defaults */
        dd->writes_starved = writes_starved;
        dd->front_merges = 1;
        dd->fifo_batch = fifo_batch;

        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);
        INIT_LIST_HEAD(&dd->dispatch);

        q->elevator = eq;
        return 0;
}

/*
 * Look for a queued request that @bio could be merged in front of, i.e. one
 * that starts at the sector where @bio ends.
 *
 * On a hit, *@rq is set and ELEVATOR_DISCARD_MERGE or ELEVATOR_FRONT_MERGE
 * is returned.  ELEVATOR_NO_MERGE is returned when front merges are
 * disabled, no request starts at that sector, or the merge is not allowed.
 */
static int dd_request_merge(struct request_queue *q, struct request **rq,
                            struct bio *bio)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        sector_t sector = bio_end_sector(bio);
        struct request *cand;

        if (!dd->front_merges)
                return ELEVATOR_NO_MERGE;

        cand = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
        if (!cand)
                return ELEVATOR_NO_MERGE;

        BUG_ON(sector != blk_rq_pos(cand));

        if (!elv_bio_merge_ok(cand, bio))
                return ELEVATOR_NO_MERGE;

        *rq = cand;
        if (blk_discard_mergable(cand))
                return ELEVATOR_DISCARD_MERGE;
        return ELEVATOR_FRONT_MERGE;
}

/*
 * Try to merge @bio into an already queued request.  The attempt runs
 * under dd->lock; a request handed back through the last argument of
 * blk_mq_sched_try_merge() is freed only after the lock has been dropped.
 *
 * Returns the result of blk_mq_sched_try_merge().
 */
static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        struct request *to_free = NULL;
        bool merged;

        spin_lock(&dd->lock);
        merged = blk_mq_sched_try_merge(q, bio, nr_segs, &to_free);
        spin_unlock(&dd->lock);

        if (to_free)
                blk_mq_free_request(to_free);

        return merged;
}

/*
 * Queue rq with the scheduler.  Head insertions and passthrough requests go
 * straight onto the dispatch list; everything else is added to the rbtree
 * and the fifo of its data direction.  Called from dd_insert_requests()
 * with dd->lock held.
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                              bool at_head)
{
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
        const int data_dir = rq_data_dir(rq);

        /*
         * This may be a requeue of a write request that has locked its
         * target zone. If it is the case, this releases the zone lock.
         */
        blk_req_zone_write_unlock(rq);

        /* nothing left to queue if rq was merged into an existing request */
        if (blk_mq_sched_try_insert_merge(q, rq))
                return;

        blk_mq_sched_request_inserted(rq);

        if (at_head) {
                list_add(&rq->queuelist, &dd->dispatch);
                return;
        }

        if (blk_rq_is_passthrough(rq)) {
                list_add_tail(&rq->queuelist, &dd->dispatch);
                return;
        }

        deadline_add_rq_rb(dd, rq);

        if (rq_mergeable(rq)) {
                elv_rqhash_add(q, rq);
                if (!q->last_merge)
                        q->last_merge = rq;
        }

        /* stamp the expire time and append to the fifo */
        rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
        list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}

/*
 * Drain @list into the scheduler under dd->lock.  Each request is unlinked
 * from @list, handed to dd_insert_request(), and counted in
 * @hctx->elevator_queued.
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
                               struct list_head *list, bool at_head)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        struct request *rq;

        spin_lock(&dd->lock);
        for (;;) {
                if (list_empty(list))
                        break;

                rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);
                dd_insert_request(hctx, rq, at_head);
                atomic_inc(&hctx->elevator_queued);
        }
        spin_unlock(&dd->lock);
}

/*
 * Nothing to do here. This is defined only to ensure that .finish_request
 * method is called upon request completion.  The body is intentionally
 * empty.
 */
static void dd_prepare_request(struct request *rq)
{
}

/*
 * For zoned block devices, write unlock the target zone of
 * completed write requests. Do this while holding the zone lock
 * spinlock so that the zone is never unlocked while deadline_fifo_request()
 * or deadline_next_request() are executing. This function is called for
 * all requests, whether or not these requests complete successfully.
 *
 * For a zoned block device, __dd_dispatch_request() may have stopped
 * dispatching requests if all the queued requests are write requests directed
 * at zones that are already locked due to on-going write requests. To ensure
 * write request dispatch progress in this case, mark the queue as needing a
 * restart to ensure that the queue is run again after completion of the
 * request and zones being unlocked.
 *
 * Non-zoned queues need no work here.
 */
static void dd_finish_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct deadline_data *dd;
        unsigned long flags;

        if (!blk_queue_is_zoned(q))
                return;

        dd = q->elevator->elevator_data;

        spin_lock_irqsave(&dd->zone_lock, flags);
        blk_req_zone_write_unlock(rq);
        /* writes are still waiting: make sure the queue gets run again */
        if (!list_empty(&dd->fifo_list[WRITE]))
                blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
        spin_unlock_irqrestore(&dd->zone_lock, flags);
}

/*
 * Report whether there may be something to dispatch for @hctx.  This is
 * false when nothing has been queued through @hctx, and otherwise true if
 * the dispatch list or either fifo is non-empty.  The lists are checked
 * without taking dd->lock.
 */
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;

        if (!atomic_read(&hctx->elevator_queued))
                return false;

        if (!list_empty_careful(&dd->dispatch))
                return true;

        if (!list_empty_careful(&dd->fifo_list[0]))
                return true;

        return !list_empty_careful(&dd->fifo_list[1]);
}

/*
 * sysfs parts below
 */
/*
 * Format @var as a decimal number followed by a newline into @page and
 * return the number of characters written, as reported by sprintf().
 */
static ssize_t
deadline_var_show(int var, char *page)
{
        return sprintf(page, "%d\n", var);
}

/*
 * Parse the leading base-10 number in @page into *@var.  The end-of-number
 * position is not examined, so parse errors and trailing characters are
 * not reported to the caller.
 */
static void
deadline_var_store(int *var, const char *page)
{
        char *end;

        *var = simple_strtol(page, &end, 10);
}

/*
 * Generate a sysfs "show" handler named __FUNC that prints __VAR (an
 * expression that may refer to the local 'dd').  When __CONV is non-zero
 * the value is converted from jiffies to milliseconds before printing.
 */
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                                \
static ssize_t __FUNC(struct elevator_queue *e, char *page)                \
{                                                                        \
        struct deadline_data *dd = e->elevator_data;                        \
        int __data = __VAR;                                                \
        if (__CONV)                                                        \
                __data = jiffies_to_msecs(__data);                        \
        return deadline_var_show(__data, (page));                        \
}
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
#undef SHOW_FUNCTION

/*
 * Generate a sysfs "store" handler named __FUNC.  The number parsed from
 * the input is clamped to [MIN, MAX] and written through __PTR (an
 * expression that may refer to the local 'dd').  When __CONV is non-zero
 * the clamped value is converted from milliseconds to jiffies first.  The
 * handler always returns 'count'; malformed input is not rejected.
 */
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
{                                                                        \
        struct deadline_data *dd = e->elevator_data;                        \
        int __data;                                                        \
        deadline_var_store(&__data, (page));                                \
        if (__data < (MIN))                                                \
                __data = (MIN);                                                \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                                \
        if (__CONV)                                                        \
                *(__PTR) = msecs_to_jiffies(__data);                        \
        else                                                                \
                *(__PTR) = __data;                                        \
        return count;                                                        \
}
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION

/*
 * Build one attribute entry with mode 0644, wired to the
 * deadline_<name>_show / deadline_<name>_store handlers generated above.
 */
#define DD_ATTR(name) \
        __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

/* Tunables exposed by the scheduler; the table ends with __ATTR_NULL. */
static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(read_expire),
        DD_ATTR(write_expire),
        DD_ATTR(writes_starved),
        DD_ATTR(front_merges),
        DD_ATTR(fifo_batch),
        __ATTR_NULL
};

#ifdef CONFIG_BLK_DEBUG_FS
/*
 * Generate the debugfs helpers for one data direction (ddir), using 'name'
 * as the identifier infix:
 *  - deadline_<name>_fifo_{start,next,stop} and the matching seq_operations
 *    walk dd->fifo_list[ddir]; start takes dd->lock and stop releases it;
 *  - deadline_<name>_next_rq_show prints dd->next_rq[ddir] if it is set.
 *    It reads the pointer without taking dd->lock.
 */
#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name)                                \
static void *deadline_##name##_fifo_start(struct seq_file *m,                \
                                          loff_t *pos)                        \
        __acquires(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
                                                                        \
        spin_lock(&dd->lock);                                                \
        return seq_list_start(&dd->fifo_list[ddir], *pos);                \
}                                                                        \
                                                                        \
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,        \
                                         loff_t *pos)                        \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
                                                                        \
        return seq_list_next(v, &dd->fifo_list[ddir], pos);                \
}                                                                        \
                                                                        \
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)        \
        __releases(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
                                                                        \
        spin_unlock(&dd->lock);                                                \
}                                                                        \
                                                                        \
static const struct seq_operations deadline_##name##_fifo_seq_ops = {        \
        .start        = deadline_##name##_fifo_start,                                \
        .next        = deadline_##name##_fifo_next,                                \
        .stop        = deadline_##name##_fifo_stop,                                \
        .show        = blk_mq_debugfs_rq_show,                                \
};                                                                        \
                                                                        \
static int deadline_##name##_next_rq_show(void *data,                        \
                                          struct seq_file *m)                \
{                                                                        \
        struct request_queue *q = data;                                        \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct request *rq = dd->next_rq[ddir];                                \
                                                                        \
        if (rq)                                                                \
                __blk_mq_debugfs_rq_show(m, rq);                        \
        return 0;                                                        \
}
DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read)
DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write)
#undef DEADLINE_DEBUGFS_DDIR_ATTRS

static int deadline_batching_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", dd->batching);
        return 0;
}

static int deadline_starved_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", dd->starved);
        return 0;
}

/*
 * seq_file ->start for the dispatch list.
 *
 * Takes dd->lock (the matching unlock is in deadline_dispatch_stop()) and
 * returns whatever seq_list_start() yields for position *pos in dd->dispatch.
 */
static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
        __acquires(&dd->lock)
{
        struct request_queue *queue = m->private;
        struct deadline_data *dd = queue->elevator->elevator_data;
        void *first;

        spin_lock(&dd->lock);
        first = seq_list_start(&dd->dispatch, *pos);

        return first;
}

/*
 * seq_file ->next for the dispatch list: advance from element @v within
 * dd->dispatch via seq_list_next(), which also updates *pos.
 */
static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct request_queue *queue = m->private;
        struct deadline_data *sched = queue->elevator->elevator_data;
        struct list_head *dispatch_list = &sched->dispatch;

        return seq_list_next(v, dispatch_list, pos);
}

/*
 * seq_file ->stop for the dispatch list: drop dd->lock, which was taken in
 * deadline_dispatch_start().  @v is not used.
 */
static void deadline_dispatch_stop(struct seq_file *m, void *v)
        __releases(&dd->lock)
{
        struct request_queue *queue = m->private;
        struct deadline_data *dd = queue->elevator->elevator_data;

        spin_unlock(&dd->lock);
}

/*
 * seq_file operations backing the "dispatch" debugfs attribute: iterate over
 * dd->dispatch under dd->lock and print each entry with
 * blk_mq_debugfs_rq_show().
 */
static const struct seq_operations deadline_dispatch_seq_ops = {
        .start = deadline_dispatch_start,
        .show = blk_mq_debugfs_rq_show,
        .next = deadline_dispatch_next,
        .stop = deadline_dispatch_stop,
};

/*
 * Per-queue debugfs attribute table.
 *
 * DEADLINE_QUEUE_DDIR_ATTRS(dir) expands to the two entries "<dir>_fifo_list"
 * and "<dir>_next_rq" for one data direction.  Every entry uses mode 0400 and
 * the table ends with an empty {} entry.
 */
#define DEADLINE_QUEUE_DDIR_ATTRS(dir)                                        \
        { #dir "_fifo_list", 0400,                                        \
          .seq_ops = &deadline_##dir##_fifo_seq_ops },                        \
        { #dir "_next_rq", 0400, deadline_##dir##_next_rq_show }
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
        DEADLINE_QUEUE_DDIR_ATTRS(read),
        DEADLINE_QUEUE_DDIR_ATTRS(write),
        { "batching", 0400, deadline_batching_show },
        { "starved", 0400, deadline_starved_show },
        { "dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops },
        { },
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
#endif

/*
 * Elevator type descriptor for the "mq-deadline" I/O scheduler (alias
 * "deadline").  It is registered by deadline_init() and unregistered by
 * deadline_exit().
 */
static struct elevator_type mq_deadline = {
        .elevator_name = "mq-deadline",
        .elevator_alias = "deadline",
        .elevator_owner = THIS_MODULE,
        .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
        .elevator_attrs = deadline_attrs,
#ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif

        .ops = {
                /* Scheduler set-up and tear-down */
                .init_sched = dd_init_queue,
                .exit_sched = dd_exit_queue,

                /* Request life cycle */
                .prepare_request = dd_prepare_request,
                .insert_requests = dd_insert_requests,
                .has_work = dd_has_work,
                .dispatch_request = dd_dispatch_request,
                .finish_request = dd_finish_request,

                /* Merging */
                .bio_merge = dd_bio_merge,
                .request_merge = dd_request_merge,
                .request_merged = dd_request_merged,
                .requests_merged = dd_merged_requests,

                /* Neighbour lookup */
                .next_request = elv_rb_latter_request,
                .former_request = elv_rb_former_request,
        },
};
MODULE_ALIAS("mq-deadline-iosched");

/*
 * Module init: register the mq_deadline elevator type and hand back
 * elv_register()'s result unchanged.
 */
static int __init deadline_init(void)
{
        int err = elv_register(&mq_deadline);

        return err;
}

/* Module exit: unregister the mq_deadline elevator type. */
static void __exit deadline_exit(void)
{
        elv_unregister(&mq_deadline);
}

/* Wire deadline_init()/deadline_exit() up as the module entry and exit points. */
module_init(deadline_init);
module_exit(deadline_exit);

/* Module metadata. */
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");


























































































































































































    1 












    1 


















    1 



    1 








    1 


















































































    1 

    1 

    1 




    1 
    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
// SPDX-License-Identifier: GPL-2.0-or-later
/* Generic associative array implementation.
 *
 * See Documentation/core-api/assoc_array.rst for information.
 *
 * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
//#define DEBUG
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/assoc_array_priv.h>

/*
 * Iterate over an associative array.  The caller must hold the RCU read lock
 * or better.
 *
 * @root: pointer to the node or shortcut at which the walk starts.
 * @stop: back pointer value that ends the walk; 0 is returned as soon as the
 *        parent of a finished node (or of the shortcut above it) equals this.
 * @iterator: callback invoked on every leaf found; a non-zero return value
 *            aborts the walk.
 * @iterator_data: opaque value handed through to @iterator.
 *
 * Returns the first non-zero value returned by @iterator, or 0 once the walk
 * has ascended to @stop.
 *
 * The walk is iterative: descent follows metadata pointers in the slots and
 * ascent uses each node's/shortcut's back_pointer and parent_slot.
 */
static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root,
                                       const struct assoc_array_ptr *stop,
                                       int (*iterator)(const void *leaf,
                                                       void *iterator_data),
                                       void *iterator_data)
{
        const struct assoc_array_shortcut *shortcut;
        const struct assoc_array_node *node;
        const struct assoc_array_ptr *cursor, *ptr, *parent;
        unsigned long has_meta;
        int slot, ret;

        cursor = root;

begin_node:
        /* cursor is a node or a shortcut we are entering from above. */
        if (assoc_array_ptr_is_shortcut(cursor)) {
                /* Descend through a shortcut */
                shortcut = assoc_array_ptr_to_shortcut(cursor);
                cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */
        }

        node = assoc_array_ptr_to_node(cursor);
        slot = 0;

        /* We perform two passes of each node.
         *
         * The first pass does all the leaves in this node.  This means we
         * don't miss any leaves if the node is split up by insertion whilst
         * we're iterating over the branches rooted here (we may, however, see
         * some leaves twice).
         */
        has_meta = 0;
        for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */
                /* Accumulate the pointer bits of every slot so that the
                 * metadata type bit can be tested once after the loop.
                 */
                has_meta |= (unsigned long)ptr;
                if (ptr && assoc_array_ptr_is_leaf(ptr)) {
                        /* The barrier needed between reading the pointer and
                         * dereferencing it is supplied by the READ_ONCE()
                         * above.
                         */
                        /* Invoke the callback */
                        ret = iterator(assoc_array_ptr_to_leaf(ptr),
                                       iterator_data);
                        if (ret)
                                return ret;
                }
        }

        /* The second pass attends to all the metadata pointers.  If we follow
         * one of these we may find that we don't come back here, but rather go
         * back to a replacement node with the leaves in a different layout.
         *
         * We are guaranteed to make progress, however, as the slot number for
         * a particular portion of the key space cannot change - and we
         * continue at the back pointer + 1.
         */
        /* No slot seen in the first pass had the metadata type bit set, so
         * there is nothing to descend into from this node.
         */
        if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE))
                goto finished_node;
        slot = 0;

continue_node:
        /* Also reached from the ascent below, where cursor has been moved to
         * the parent, so node must be re-derived from cursor here.
         */
        node = assoc_array_ptr_to_node(cursor);
        for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */
                if (assoc_array_ptr_is_meta(ptr)) {
                        cursor = ptr;
                        goto begin_node;
                }
        }

finished_node:
        /* Move up to the parent (may need to skip back over a shortcut) */
        parent = READ_ONCE(node->back_pointer); /* Address dependency. */
        slot = node->parent_slot;
        if (parent == stop)
                return 0;

        if (assoc_array_ptr_is_shortcut(parent)) {
                /* The parent is a shortcut: step over it and use its own
                 * back pointer and parent slot instead.
                 */
                shortcut = assoc_array_ptr_to_shortcut(parent);
                cursor = parent;
                parent = READ_ONCE(shortcut->back_pointer); /* Address dependency. */
                slot = shortcut->parent_slot;
                if (parent == stop)
                        return 0;
        }

        /* Ascend to next slot in parent node */
        cursor = parent;
        slot++;
        goto continue_node;
}

/**
 * assoc_array_iterate - Pass all objects in the array to a callback
 * @array: The array to iterate over.
 * @iterator: The callback function.
 * @iterator_data: Private data for the callback function.
 *
 * Iterate over all the objects in an associative array.  Each one will be
 * presented to the iterator function.
 *
 * If the array is being modified concurrently with the iteration then it is
 * possible that some objects in the array will be passed to the iterator
 * callback more than once - though every object should be passed at least
 * once.  If this is undesirable then the caller must lock against modification
 * for the duration of this function.
 *
 * The function will return 0 if no objects were in the array or else it will
 * return the result of the last iterator function called.  Iteration stops
 * immediately if any call to the iteration function results in a non-zero
 * return.
 *
 * The caller should hold the RCU read lock or better if concurrent
 * modification is possible.
 */
int assoc_array_iterate(const struct assoc_array *array,
                        int (*iterator)(const void *object,
                                        void *iterator_data),
                        void *iterator_data)
{
        struct assoc_array_ptr *top;

        /* Sample the root pointer exactly once (address dependency). */
        top = READ_ONCE(array->root);
        if (top == NULL)
                return 0;        /* Empty array: nothing to visit */

        /* Walk the whole tree; a NULL stop pointer ends the walk at the root. */
        return assoc_array_subtree_iterate(top, NULL, iterator, iterator_data);
}

/* Outcome of assoc_array_walk(). */
enum assoc_array_walk_status {
        /* The array's root pointer was NULL. */
        assoc_array_walk_tree_empty,
        /* Reached a node whose slot for the key holds no metadata pointer. */
        assoc_array_walk_found_terminal_node,
        /* Reached a shortcut whose stored index key differs from the key. */
        assoc_array_walk_found_wrong_shortcut,
};

/*
 * Details filled in by assoc_array_walk().  Which member is valid depends on
 * the status returned: terminal_node for
 * assoc_array_walk_found_terminal_node, wrong_shortcut for
 * assoc_array_walk_found_wrong_shortcut.
 */
struct assoc_array_walk_result {
        struct {
                struct assoc_array_node        *node;        /* Node in which leaf might be found */
                /* Key level at which the node was considered */
                int                level;
                /* Slot in the node selected by the key at that level */
                int                slot;
        } terminal_node;
        struct {
                /* The shortcut whose index key did not match */
                struct assoc_array_shortcut *shortcut;
                /* Key level at which the shortcut was entered */
                int                level;
                /* Key level being compared when the mismatch was found */
                int                sc_level;
                /* The shortcut's index key word at sc_level */
                unsigned long        sc_segments;
                /* XOR of the key's word and the shortcut's word, with bits
                 * beyond the shortcut's skip_to_level masked off in the
                 * final word
                 */
                unsigned long        dissimilarity;
        } wrong_shortcut;
};

/*
 * Navigate through the internal tree looking for the closest node to the key.
 *
 * @array: the array to walk; its root pointer is sampled once.
 * @ops: supplies get_key_chunk(), used to fetch successive words of the key.
 * @index_key: the key being looked up; passed through to get_key_chunk().
 * @result: filled in according to the status returned:
 *   assoc_array_walk_tree_empty           - nothing is written;
 *   assoc_array_walk_found_terminal_node  - result->terminal_node is set;
 *   assoc_array_walk_found_wrong_shortcut - result->wrong_shortcut is set.
 */
static enum assoc_array_walk_status
assoc_array_walk(const struct assoc_array *array,
                 const struct assoc_array_ops *ops,
                 const void *index_key,
                 struct assoc_array_walk_result *result)
{
        struct assoc_array_shortcut *shortcut;
        struct assoc_array_node *node;
        struct assoc_array_ptr *cursor, *ptr;
        unsigned long sc_segments, dissimilarity;
        unsigned long segments;
        int level, sc_level, next_sc_level;
        int slot;

        pr_devel("-->%s()\n", __func__);

        cursor = READ_ONCE(array->root);  /* Address dependency. */
        if (!cursor)
                return assoc_array_walk_tree_empty;

        level = 0;

        /* Use segments from the key for the new leaf to navigate through the
         * internal tree, skipping through nodes and shortcuts that are on
         * route to the destination.  Eventually we'll come to a slot that is
         * either empty or contains a leaf at which point we've found a node in
         * which the leaf we're looking for might be found or into which it
         * should be inserted.
         */
jumped:
        /* (Re)load the word of the key that covers the current level. */
        segments = ops->get_key_chunk(index_key, level);
        pr_devel("segments[%d]: %lx\n", level, segments);

        if (assoc_array_ptr_is_shortcut(cursor))
                goto follow_shortcut;

consider_node:
        node = assoc_array_ptr_to_node(cursor);
        /* Pick the slot from the bits of the key word at this level's offset
         * within the word.
         */
        slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK);
        slot &= ASSOC_ARRAY_FAN_MASK;
        ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */

        pr_devel("consider slot %x [ix=%d type=%lu]\n",
                 slot, level, (unsigned long)ptr & 3);

        if (!assoc_array_ptr_is_meta(ptr)) {
                /* The node doesn't have a node/shortcut pointer in the slot
                 * corresponding to the index key that we have to follow.
                 */
                result->terminal_node.node = node;
                result->terminal_node.level = level;
                result->terminal_node.slot = slot;
                pr_devel("<--%s() = terminal_node\n", __func__);
                return assoc_array_walk_found_terminal_node;
        }

        if (assoc_array_ptr_is_node(ptr)) {
                /* There is a pointer to a node in the slot corresponding to
                 * this index key segment, so we need to follow it.
                 */
                cursor = ptr;
                level += ASSOC_ARRAY_LEVEL_STEP;
                /* Still inside the same key word: the cached segments remain
                 * valid.  Otherwise go via jumped to fetch the next word.
                 */
                if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0)
                        goto consider_node;
                goto jumped;
        }

        /* There is a shortcut in the slot corresponding to the index key
         * segment.  We follow the shortcut if its partial index key matches
         * this leaf's.  Otherwise we need to split the shortcut.
         */
        cursor = ptr;
follow_shortcut:
        shortcut = assoc_array_ptr_to_shortcut(cursor);
        pr_devel("shortcut to %d\n", shortcut->skip_to_level);
        sc_level = level + ASSOC_ARRAY_LEVEL_STEP;
        /* The shortcut's target level must be at least one step beyond the
         * level we are currently at.
         */
        BUG_ON(sc_level > shortcut->skip_to_level);

        do {
                /* Check the leaf against the shortcut's index key a word at a
                 * time, trimming the final word (the shortcut stores the index
                 * key completely from the root to the shortcut's target).
                 */
                if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0)
                        segments = ops->get_key_chunk(index_key, sc_level);

                sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT];
                dissimilarity = segments ^ sc_segments;

                if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) {
                        /* Trim segments that are beyond the shortcut */
                        int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK;
                        dissimilarity &= ~(ULONG_MAX << shift);
                        next_sc_level = shortcut->skip_to_level;
                } else {
                        /* Advance to the start of the next key word. */
                        next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE;
                        next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
                }

                if (dissimilarity != 0) {
                        /* This shortcut points elsewhere */
                        result->wrong_shortcut.shortcut = shortcut;
                        result->wrong_shortcut.level = level;
                        result->wrong_shortcut.sc_level = sc_level;
                        result->wrong_shortcut.sc_segments = sc_segments;
                        result->wrong_shortcut.dissimilarity = dissimilarity;
                        return assoc_array_walk_found_wrong_shortcut;
                }

                sc_level = next_sc_level;
        } while (sc_level < shortcut->skip_to_level);

        /* The shortcut matches the leaf's index to this point. */
        cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */
        /* If the old and new levels lie in different key words, go via jumped
         * so that segments is loaded for the new level; otherwise the cached
         * word can be reused directly.
         */
        if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) {
                level = sc_level;
                goto jumped;
        } else {
                level = sc_level;
                goto consider_node;
        }
}

/**
 * assoc_array_find - Find an object by index key
 * @array: The associative array to search.
 * @ops: The operations to use.
 * @index_key: The key to the object.
 *
 * Find an object in an associative array by walking through the internal tree
 * to the node that should contain the object and then searching the leaves
 * there.  NULL is returned if the requested object was not found in the array.
 *
 * The caller must hold the RCU read lock or better.
 */
void *assoc_array_find(const struct assoc_array *array,
                       const struct assoc_array_ops *ops,
                       const void *index_key)
{
        struct assoc_array_walk_result result;
        const struct assoc_array_node *node;
        const struct assoc_array_ptr *ptr;
        const void *leaf;
        int slot;

        if (assoc_array_walk(array, ops, index_key, &result) !=
            assoc_array_walk_found_terminal_node)
                return NULL;

        node = result.terminal_node.node;

        /* If the target key is available to us, it's has to be pointed to by
         * the terminal node.
         */
        for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */
                if (ptr && assoc_array_ptr_is_leaf(ptr)) {
                        /* We need a barrier between the read of the pointer
                         * and dereferencing the pointer - but only if we are
                         * actually going to dereference it.
                         */
                        leaf = assoc_array_ptr_to_leaf(ptr);
                        if (ops->compare_object(leaf, index_key))
                                return (void *)leaf;
                }
        }

        return NULL;
}

/*
 * Destructively iterate over an associative array.  The caller must prevent
 * other simultaneous accesses.
 *
 * @root: top of the tree to free; NULL is accepted and nothing is done.  The
 *        top node or shortcut must have a NULL back pointer (checked by the
 *        BUG_ON()s below, since parent starts out as NULL).
 * @ops: if non-NULL, ops->free_object() is called on every leaf; if NULL the
 *       leaves are left alone and only nodes and shortcuts are freed.
 *
 * The walk is iterative: it descends through metadata pointers, and once all
 * the slots of a node have been processed the node is freed and the walk
 * resumes at the next slot of its parent, found through the back_pointer and
 * parent_slot values read before the free.
 */
static void assoc_array_destroy_subtree(struct assoc_array_ptr *root,
                                        const struct assoc_array_ops *ops)
{
        struct assoc_array_shortcut *shortcut;
        struct assoc_array_node *node;
        struct assoc_array_ptr *cursor, *parent = NULL;
        /* -1 means "slot in the parent not known"; the parent_slot checks
         * below are skipped in that case.
         */
        int slot = -1;

        pr_devel("-->%s()\n", __func__);

        cursor = root;
        if (!cursor) {
                pr_devel("empty\n");
                return;
        }

move_to_meta:
        if (assoc_array_ptr_is_shortcut(cursor)) {
                /* Descend through a shortcut */
                pr_devel("[%d] shortcut\n", slot);
                BUG_ON(!assoc_array_ptr_is_shortcut(cursor));
                shortcut = assoc_array_ptr_to_shortcut(cursor);
                BUG_ON(shortcut->back_pointer != parent);
                BUG_ON(slot != -1 && shortcut->parent_slot != slot);
                parent = cursor;
                cursor = shortcut->next_node;
                slot = -1;
                BUG_ON(!assoc_array_ptr_is_node(cursor));
        }

        pr_devel("[%d] node\n", slot);
        node = assoc_array_ptr_to_node(cursor);
        /* Sanity-check the node's links against the path we came down. */
        BUG_ON(node->back_pointer != parent);
        BUG_ON(slot != -1 && node->parent_slot != slot);
        slot = 0;

continue_node:
        pr_devel("Node %p [back=%p]\n", node, node->back_pointer);
        for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                struct assoc_array_ptr *ptr = node->slots[slot];
                if (!ptr)
                        continue;
                if (assoc_array_ptr_is_meta(ptr)) {
                        /* Descend; we come back to slot + 1 of this node
                         * after the child has been freed.
                         */
                        parent = cursor;
                        cursor = ptr;
                        goto move_to_meta;
                }

                if (ops) {
                        pr_devel("[%d] free leaf\n", slot);
                        ops->free_object(assoc_array_ptr_to_leaf(ptr));
                }
        }

        /* All slots done: read the way back up before freeing the node. */
        parent = node->back_pointer;
        slot = node->parent_slot;
        pr_devel("free node\n");
        kfree(node);
        if (!parent)
                return; /* Done */

        /* Move back up to the parent (may need to free a shortcut on
         * the way up) */
        if (assoc_array_ptr_is_shortcut(parent)) {
                shortcut = assoc_array_ptr_to_shortcut(parent);
                /* cursor is only compared here, not dereferenced. */
                BUG_ON(shortcut->next_node != cursor);
                cursor = parent;
                parent = shortcut->back_pointer;
                slot = shortcut->parent_slot;
                pr_devel("free shortcut\n");
                kfree(shortcut);
                if (!parent)
                        return;

                BUG_ON(!assoc_array_ptr_is_node(parent));
        }

        /* Ascend to next slot in parent node */
        pr_devel("ascend to %p[%d]\n", parent, slot);
        cursor = parent;
        node = assoc_array_ptr_to_node(cursor);
        slot++;
        goto continue_node;
}

/**
 * assoc_array_destroy - Destroy an associative array
 * @array: The array to destroy.
 * @ops: The operations to use.
 *
 * Discard all metadata and free all objects in an associative array.  The
 * array will be empty and ready to use again upon completion.  This function
 * cannot fail.
 *
 * The caller must prevent all other accesses whilst this takes place as no
 * attempt is made to adjust pointers gracefully to permit RCU readlock-holding
 * accesses to continue.  On the other hand, no memory allocation is required.
 */
void assoc_array_destroy(struct assoc_array *array,
                         const struct assoc_array_ops *ops)
{
        struct assoc_array_ptr *old_root = array->root;

        /* Free the whole tree, then leave the array empty. */
        assoc_array_destroy_subtree(old_root, ops);
        array->root = NULL;
}

/*
 * Handle insertion into an empty tree.
 */
static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit)
{
        struct assoc_array_node *new_n0;

        pr_devel("-->%s()\n", __func__);

        new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
        if (!new_n0)
                return false;

        edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
        edit->leaf_p = &new_n0->slots[0];
        edit->adjust_count_on = new_n0;
        edit->set[0].ptr = &edit->array->root;
        edit->set[0].to = assoc_array_node_to_ptr(new_n0);

        pr_devel("<--%s() = ok [no root]\n", __func__);
        return true;
}

/*
 * Handle insertion into a terminal node.
 *
 * @edit: The edit script being built.  The slots to set, the new and excised
 *        metadata and the location for the new leaf are all recorded in it.
 * @ops: The operations used to compare objects and to extract key chunks.
 * @index_key: The key of the object being inserted.
 * @result: The walk result; its terminal_node member supplies the node that
 *          was reached, the level reached and the slot the new leaf maps to.
 *
 * One of four things happens:
 *  - a leaf matching @index_key is found and marked for replacement in place;
 *  - the new leaf is given a free slot in the node;
 *  - the full node is replaced by two new nodes (n0 and n1), with the leaves
 *    that share a slot moved out into n1; or
 *  - the old leaves and the new leaf all cluster in one slot, in which case a
 *    new shortcut (s0) is placed ahead of n0 and the split is then done at the
 *    level at which the keys first differ.
 *
 * Returns true if the script was prepared and false if an allocation failed.
 * Anything allocated before a failure has already been recorded in
 * edit->new_meta[] and is not freed here.
 */
static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit,
                                                  const struct assoc_array_ops *ops,
                                                  const void *index_key,
                                                  struct assoc_array_walk_result *result)
{
        struct assoc_array_shortcut *shortcut, *new_s0;
        struct assoc_array_node *node, *new_n0, *new_n1, *side;
        struct assoc_array_ptr *ptr;
        unsigned long dissimilarity, base_seg, blank;
        size_t keylen;
        bool have_meta;
        int level, diff;
        int slot, next_slot, free_slot, i, j;

        node        = result->terminal_node.node;
        level        = result->terminal_node.level;
        /* The extra entry at the end of the segment cache describes the slot
         * wanted by the new leaf; entries 0..FAN_OUT-1 describe the node's
         * current occupants.
         */
        edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot;

        pr_devel("-->%s()\n", __func__);

        /* We arrived at a node which doesn't have an onward node or shortcut
         * pointer that we have to follow.  This means that (a) the leaf we
         * want must go here (either by insertion or replacement) or (b) we
         * need to split this node and insert in one of the fragments.
         */
        free_slot = -1;

        /* Firstly, we have to check the leaves in this node to see if there's
         * a matching one we should replace in place.
         */
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                ptr = node->slots[i];
                if (!ptr) {
                        /* Remember an empty slot in case nothing matches */
                        free_slot = i;
                        continue;
                }
                if (assoc_array_ptr_is_leaf(ptr) &&
                    ops->compare_object(assoc_array_ptr_to_leaf(ptr),
                                        index_key)) {
                        pr_devel("replace in slot %d\n", i);
                        edit->leaf_p = &node->slots[i];
                        edit->dead_leaf = node->slots[i];
                        pr_devel("<--%s() = ok [replace]\n", __func__);
                        return true;
                }
        }

        /* If there is a free slot in this node then we can just insert the
         * leaf here.
         */
        if (free_slot >= 0) {
                pr_devel("insert in free slot %d\n", free_slot);
                edit->leaf_p = &node->slots[free_slot];
                edit->adjust_count_on = node;
                pr_devel("<--%s() = ok [insert]\n", __func__);
                return true;
        }

        /* The node has no spare slots - so we're either going to have to split
         * it or insert another node before it.
         *
         * Whatever, we're going to need at least two new nodes - so allocate
         * those now.  We may also need a new shortcut, but we deal with that
         * when we need it.
         */
        new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
        if (!new_n0)
                return false;
        edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
        new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
        if (!new_n1)
                return false;
        edit->new_meta[1] = assoc_array_node_to_ptr(new_n1);

        /* We need to find out how similar the leaves are. */
        pr_devel("no spare slots\n");
        have_meta = false;
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                ptr = node->slots[i];
                if (assoc_array_ptr_is_meta(ptr)) {
                        /* 0xff marks a slot that holds a metadata pointer */
                        edit->segment_cache[i] = 0xff;
                        have_meta = true;
                        continue;
                }
                /* Cache the slot number this leaf's key selects at this level */
                base_seg = ops->get_object_key_chunk(
                        assoc_array_ptr_to_leaf(ptr), level);
                base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
                edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK;
        }

        if (have_meta) {
                pr_devel("have meta\n");
                goto split_node;
        }

        /* The node contains only leaves */
        dissimilarity = 0;
        base_seg = edit->segment_cache[0];
        for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++)
                dissimilarity |= edit->segment_cache[i] ^ base_seg;

        pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity);

        if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) {
                /* The old leaves all cluster in the same slot.  We will need
                 * to insert a shortcut if the new node wants to cluster with them.
                 */
                if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0)
                        goto all_leaves_cluster_together;

                /* Otherwise all the old leaves cluster in the same slot, but
                 * the new leaf wants to go into a different slot - so we
                 * create a new node (n0) to hold the new leaf and a pointer to
                 * a new node (n1) holding all the old leaves.
                 *
                 * This can be done by falling through to the node splitting
                 * path.
                 */
                pr_devel("present leaves cluster but not new leaf\n");
        }

split_node:
        pr_devel("split node\n");

        /* We need to split the current node.  The node must contain anything
         * from a single leaf (in the one leaf case, this leaf will cluster
         * with the new leaf) and the rest meta-pointers, to all leaves, some
         * of which may cluster.
         *
         * It won't contain the case in which all the current leaves plus the
         * new leaves want to cluster in the same slot.
         *
         * We need to expel at least two leaves out of a set consisting of the
         * leaves in the node and the new leaf.  The current meta pointers can
         * just be copied as they shouldn't cluster with any of the leaves.
         *
         * We need a new node (n0) to replace the current one and a new node to
         * take the expelled nodes (n1).
         */
        edit->set[0].to = assoc_array_node_to_ptr(new_n0);
        new_n0->back_pointer = node->back_pointer;
        new_n0->parent_slot = node->parent_slot;
        new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
        new_n1->parent_slot = -1; /* Need to calculate this */

do_split_node:
        /* Also entered from all_leaves_cluster_together once the shortcut has
         * been set up and the segment cache regenerated for the new level.
         */
        pr_devel("do_split_node\n");

        new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
        new_n1->nr_leaves_on_branch = 0;

        /* Begin by finding two matching leaves.  There have to be at least two
         * that match - even if there are meta pointers - because any leaf that
         * would match a slot with a meta pointer in it must be somewhere
         * behind that meta pointer and cannot be here.  Further, given N
         * remaining leaf slots, we now have N+1 leaves to go in them.
         */
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                slot = edit->segment_cache[i];
                if (slot != 0xff)
                        /* j runs up to and including the new leaf's entry */
                        for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++)
                                if (edit->segment_cache[j] == slot)
                                        goto found_slot_for_multiple_occupancy;
        }
found_slot_for_multiple_occupancy:
        pr_devel("same slot: %x %x [%02x]\n", i, j, slot);
        /* If the search above ran off the end, no pair was found */
        BUG_ON(i >= ASSOC_ARRAY_FAN_OUT);
        BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1);
        BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT);

        /* n1 hangs off n0 in the slot that the clustering leaves share */
        new_n1->parent_slot = slot;

        /* Metadata pointers cannot change slot */
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++)
                if (assoc_array_ptr_is_meta(node->slots[i]))
                        new_n0->slots[i] = node->slots[i];
                else
                        new_n0->slots[i] = NULL;
        BUG_ON(new_n0->slots[slot] != NULL);
        new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1);

        /* Filter the leaf pointers between the new nodes */
        free_slot = -1;
        next_slot = 0;
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                if (assoc_array_ptr_is_meta(node->slots[i]))
                        continue;
                if (edit->segment_cache[i] == slot) {
                        /* Clustering leaves are packed into n1 in order */
                        new_n1->slots[next_slot++] = node->slots[i];
                        new_n1->nr_leaves_on_branch++;
                } else {
                        /* Other leaves take the next slot of n0 that isn't
                         * already holding a metadata pointer or n1.
                         */
                        do {
                                free_slot++;
                        } while (new_n0->slots[free_slot] != NULL);
                        new_n0->slots[free_slot] = node->slots[i];
                }
        }

        pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot);

        /* Reserve the place for the new leaf in whichever node it belongs */
        if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) {
                do {
                        free_slot++;
                } while (new_n0->slots[free_slot] != NULL);
                edit->leaf_p = &new_n0->slots[free_slot];
                edit->adjust_count_on = new_n0;
        } else {
                edit->leaf_p = &new_n1->slots[next_slot++];
                edit->adjust_count_on = new_n1;
        }

        /* n1 must have been given at least two occupants */
        BUG_ON(next_slot <= 1);

        /* Record the back pointers of the metadata blocks that were carried
         * over to n0 so that they can be redirected to n0.
         */
        edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0);
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                if (edit->segment_cache[i] == 0xff) {
                        ptr = node->slots[i];
                        BUG_ON(assoc_array_ptr_is_leaf(ptr));
                        if (assoc_array_ptr_is_node(ptr)) {
                                side = assoc_array_ptr_to_node(ptr);
                                edit->set_backpointers[i] = &side->back_pointer;
                        } else {
                                shortcut = assoc_array_ptr_to_shortcut(ptr);
                                edit->set_backpointers[i] = &shortcut->back_pointer;
                        }
                }
        }

        /* Work out which pointer currently refers to the old node; that is
         * the one to be switched over to the replacement.
         */
        ptr = node->back_pointer;
        if (!ptr)
                edit->set[0].ptr = &edit->array->root;
        else if (assoc_array_ptr_is_node(ptr))
                edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot];
        else
                edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node;
        edit->excised_meta[0] = assoc_array_node_to_ptr(node);
        pr_devel("<--%s() = ok [split node]\n", __func__);
        return true;

all_leaves_cluster_together:
        /* All the leaves, new and old, want to cluster together in this node
         * in the same slot, so we have to replace this node with a shortcut to
         * skip over the identical parts of the key and then place a pair of
         * nodes, one inside the other, at the end of the shortcut and
         * distribute the keys between them.
         *
         * Firstly we need to work out where the leaves start diverging as a
         * bit position into their keys so that we know how big the shortcut
         * needs to be.
         *
         * We only need to make a single pass of N of the N+1 leaves because if
         * any keys differ between themselves at bit X then at least one of
         * them must also differ with the base key at bit X or before.
         */
        pr_devel("all leaves cluster together\n");
        diff = INT_MAX;
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]),
                                          index_key);
                if (x < diff) {
                        BUG_ON(x < 0);
                        diff = x;
                }
        }
        BUG_ON(diff == INT_MAX);
        BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP);

        /* Number of key words needed to hold the first diff bits of the key */
        keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE);
        keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;

        new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) +
                         keylen * sizeof(unsigned long), GFP_KERNEL);
        if (!new_s0)
                return false;
        edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0);

        /* The shortcut takes the old node's place; n0 hangs off the shortcut
         * and n1 will hang off n0.
         */
        edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0);
        new_s0->back_pointer = node->back_pointer;
        new_s0->parent_slot = node->parent_slot;
        new_s0->next_node = assoc_array_node_to_ptr(new_n0);
        new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0);
        new_n0->parent_slot = 0;
        new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
        new_n1->parent_slot = -1; /* Need to calculate this */

        /* From here on, level is the level at which the keys diverge */
        new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK;
        pr_devel("skip_to_level = %d [diff %d]\n", level, diff);
        BUG_ON(level <= 0);

        for (i = 0; i < keylen; i++)
                new_s0->index_key[i] =
                        ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE);

        /* Clear the bits of the last word that lie at or beyond the level the
         * shortcut skips to.  Nothing needs clearing if that level falls on a
         * word boundary.
         */
        if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) {
                blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK);
                pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank);
                new_s0->index_key[keylen - 1] &= ~blank;
        }

        /* This now reduces to a node splitting exercise for which we'll need
         * to regenerate the disparity table.
         */
        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                ptr = node->slots[i];
                base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr),
                                                     level);
                base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
                edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK;
        }

        /* ... and the entry for the new leaf, taken from the index key */
        base_seg = ops->get_key_chunk(index_key, level);
        base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
        edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK;
        goto do_split_node;
}

/*
 * Handle insertion into the middle of a shortcut.
 */
static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit,
                                            const struct assoc_array_ops *ops,
                                            struct assoc_array_walk_result *result)
{
        struct assoc_array_shortcut *shortcut, *new_s0, *new_s1;
        struct assoc_array_node *node, *new_n0, *side;
        unsigned long sc_segments, dissimilarity, blank;
        size_t keylen;
        int level, sc_level, diff;
        int sc_slot;

        shortcut        = result->wrong_shortcut.shortcut;
        level                = result->wrong_shortcut.level;
        sc_level        = result->wrong_shortcut.sc_level;
        sc_segments        = result->wrong_shortcut.sc_segments;
        dissimilarity        = result->wrong_shortcut.dissimilarity;

        pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n",
                 __func__, level, dissimilarity, sc_level);

        /* We need to split a shortcut and insert a node between the two
         * pieces.  Zero-length pieces will be dispensed with entirely.
         *
         * First of all, we need to find out in which level the first
         * difference was.
         */
        diff = __ffs(dissimilarity);
        diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK;
        diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK;
        pr_devel("diff=%d\n", diff);

        if (!shortcut->back_pointer) {
                edit->set[0].ptr = &edit->array->root;
        } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) {
                node = assoc_array_ptr_to_node(shortcut->back_pointer);
                edit->set[0].ptr = &node->slots[shortcut->parent_slot];
        } else {
                BUG();
        }

        edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut);

        /* Create a new node now since we're going to need it anyway */
        new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
        if (!new_n0)
                return false;
        edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
        edit->adjust_count_on = new_n0;

        /* Insert a new shortcut before the new node if this segment isn't of
         * zero length - otherwise we just connect the new node directly to the
         * parent.
         */
        level += ASSOC_ARRAY_LEVEL_STEP;
        if (diff > level) {
                pr_devel("pre-shortcut %d...%d\n", level, diff);
                keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE);
                keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;

                new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) +
                                 keylen * sizeof(unsigned long), GFP_KERNEL);
                if (!new_s0)
                        return false;
                edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0);
                edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0);
                new_s0->back_pointer = shortcut->back_pointer;
                new_s0->parent_slot = shortcut->parent_slot;
                new_s0->next_node = assoc_array_node_to_ptr(new_n0);
                new_s0->skip_to_level = diff;

                new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0);
                new_n0->parent_slot = 0;

                memcpy(new_s0->index_key, shortcut->index_key,
                       keylen * sizeof(unsigned long));

                blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK);
                pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank);
                new_s0->index_key[keylen - 1] &= ~blank;
        } else {
                pr_devel("no pre-shortcut\n");
                edit->set[0].to = assoc_array_node_to_ptr(new_n0);
                new_n0->back_pointer = shortcut->back_pointer;
                new_n0->parent_slot = shortcut->parent_slot;
        }

        side = assoc_array_ptr_to_node(shortcut->next_node);
        new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch;

        /* We need to know which slot in the new node is going to take a
         * metadata pointer.
         */
        sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK);
        sc_slot &= ASSOC_ARRAY_FAN_MASK;

        pr_devel("new slot %lx >> %d -> %d\n",
                 sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot);

        /* Determine whether we need to follow the new node with a replacement
         * for the current shortcut.  We could in theory reuse the current
         * shortcut if its parent slot number doesn't change - but that's a
         * 1-in-16 chance so not worth expending the code upon.
         */
        level = diff + ASSOC_ARRAY_LEVEL_STEP;
        if (level < shortcut->skip_to_level) {
                pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level);
                keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
                keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;

                new_s1 = kzalloc(sizeof(struct assoc_array_shortcut) +
                                 keylen * sizeof(unsigned long), GFP_KERNEL);
                if (!new_s1)
                        return false;
                edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1);

                new_s1->back_pointer = assoc_array_node_to_ptr(new_n0);
                new_s1->parent_slot = sc_slot;
                new_s1->next_node = shortcut->next_node;
                new_s1->skip_to_level = shortcut->skip_to_level;

                new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1);

                memcpy(new_s1->index_key, shortcut->index_key,
                       keylen * sizeof(unsigned long));

                edit->set[1].ptr = &side->back_pointer;
                edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1);
        } else {
                pr_devel("no post-shortcut\n");

                /* We don't have to replace the pointed-to node as long as we
                 * use memory barriers to make sure the parent slot number is
                 * changed before the back pointer (the parent slot number is
                 * irrelevant to the old parent shortcut).
                 */
                new_n0->slots[sc_slot] = shortcut->next_node;
                edit->set_parent_slot[0].p = &side->parent_slot;
                edit->set_parent_slot[0].to = sc_slot;
                edit->set[1].ptr = &side->back_pointer;
                edit->set[1].to = assoc_array_node_to_ptr(new_n0);
        }

        /* Install the new leaf in a spare slot in the new node. */
        if (sc_slot == 0)
                edit->leaf_p = &new_n0->slots[1];
        else
                edit->leaf_p = &new_n0->slots[0];

        pr_devel("<--%s() = ok [split shortcut]\n", __func__);
        return edit;
}

/**
 * assoc_array_insert - Script insertion of an object into an associative array
 * @array: The array to insert into.
 * @ops: The operations to use.
 * @index_key: The key to insert at.
 * @object: The object to insert.
 *
 * Precalculate and preallocate a script for the insertion or replacement of an
 * object in an associative array.  This results in an edit script that can
 * either be applied or cancelled.
 *
 * The function returns a pointer to an edit script or -ENOMEM.
 *
 * The caller should lock against other modifications and must continue to hold
 * the lock until assoc_array_apply_edit() has been called.
 *
 * Accesses to the tree may take place concurrently with this function,
 * provided they hold the RCU read lock.
 */
struct assoc_array_edit *assoc_array_insert(struct assoc_array *array,
                                            const struct assoc_array_ops *ops,
                                            const void *index_key,
                                            void *object)
{
        struct assoc_array_walk_result result;
        struct assoc_array_edit *edit;

        pr_devel("-->%s()\n", __func__);

        /* The leaf pointer we're given must not have the bottom bit set as we
         * use those for type-marking the pointer.  NULL pointers are also not
         * allowed as they indicate an empty slot but we have to allow them
         * here as they can be updated later.
         */
        BUG_ON(assoc_array_ptr_is_meta(object));

        edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
        if (!edit)
                return ERR_PTR(-ENOMEM);
        edit->array = array;
        edit->ops = ops;
        edit->leaf = assoc_array_leaf_to_ptr(object);
        edit->adjust_count_by = 1;

        switch (assoc_array_walk(array, ops, index_key, &result)) {
        case assoc_array_walk_tree_empty:
                /* Allocate a root node if there isn't one yet */
                if (!assoc_array_insert_in_empty_tree(edit))
                        goto enomem;
                return edit;

        case assoc_array_walk_found_terminal_node:
                /* We found a node that doesn't have a node/shortcut pointer in
                 * the slot corresponding to the index key that we have to
                 * follow.
                 */
                if (!assoc_array_insert_into_terminal_node(edit, ops, index_key,
                                                           &result))
                        goto enomem;
                return edit;

        case assoc_array_walk_found_wrong_shortcut:
                /* We found a shortcut that didn't match our key in a slot we
                 * needed to follow.
                 */
                if (!assoc_array_insert_mid_shortcut(edit, ops, &result))
                        goto enomem;
                return edit;
        }

enomem:
        /* Clean up after an out of memory error */
        pr_devel("enomem\n");
        assoc_array_cancel_edit(edit);
        return ERR_PTR(-ENOMEM);
}

/**
 * assoc_array_insert_set_object - Set the new object pointer in an edit script
 * @edit: The edit script to modify.
 * @object: The object pointer to set.
 *
 * Change the object to be inserted in an edit script.  The object pointed to
 * by the old object is not freed.  This must be done prior to applying the
 * script.
 */
void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object)
{
        BUG_ON(!object);
        edit->leaf = assoc_array_leaf_to_ptr(object);
}

/*
 * State passed to assoc_array_delete_collapse_iterator() while the leaves of
 * a subtree are being gathered into a single replacement node.
 */
struct assoc_array_delete_collapse_context {
        /* The node whose slots are being filled with the gathered leaves */
        struct assoc_array_node        *node;
        /* The leaf that must be left out (the one being deleted) */
        const void                *skip_leaf;
        /* Index of the next slot in @node to be filled */
        int                        slot;
};

/*
 * Subtree collapse to node iterator.
 *
 * Called for each leaf in the subtree being collapsed.  Every leaf other than
 * the one marked to be skipped is stored in the next unused slot of the node
 * named in the collapse context.  Always returns 0.
 */
static int assoc_array_delete_collapse_iterator(const void *leaf,
                                                void *iterator_data)
{
        struct assoc_array_delete_collapse_context *ctx = iterator_data;

        if (leaf != ctx->skip_leaf) {
                /* The target node must still have room for this leaf */
                BUG_ON(ctx->slot >= ASSOC_ARRAY_FAN_OUT);

                ctx->node->slots[ctx->slot] = assoc_array_leaf_to_ptr(leaf);
                ctx->slot++;
        }

        return 0;
}

/**
 * assoc_array_delete - Script deletion of an object from an associative array
 * @array: The array to search.
 * @ops: The operations to use.
 * @index_key: The key to the object.
 *
 * Precalculate and preallocate a script for the deletion of an object from an
 * associative array.  This results in an edit script that can either be
 * applied or cancelled.
 *
 * The function returns a pointer to an edit script if the object was found,
 * NULL if the object was not found or -ENOMEM.
 *
 * The caller should lock against other modifications and must continue to hold
 * the lock until assoc_array_apply_edit() has been called.
 *
 * Accesses to the tree may take place concurrently with this function,
 * provided they hold the RCU read lock.
 */
struct assoc_array_edit *assoc_array_delete(struct assoc_array *array,
                                            const struct assoc_array_ops *ops,
                                            const void *index_key)
{
        struct assoc_array_delete_collapse_context collapse;
        struct assoc_array_walk_result result;
        struct assoc_array_node *node, *new_n0;
        struct assoc_array_edit *edit;
        struct assoc_array_ptr *ptr;
        bool has_meta;
        int slot, i;

        pr_devel("-->%s()\n", __func__);

        edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
        if (!edit)
                return ERR_PTR(-ENOMEM);
        edit->array = array;
        edit->ops = ops;
        edit->adjust_count_by = -1;

        switch (assoc_array_walk(array, ops, index_key, &result)) {
        case assoc_array_walk_found_terminal_node:
                /* We found a node that should contain the leaf we've been
                 * asked to remove - *if* it's in the tree.
                 */
                pr_devel("terminal_node\n");
                node = result.terminal_node.node;

                for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                        ptr = node->slots[slot];
                        if (ptr &&
                            assoc_array_ptr_is_leaf(ptr) &&
                            ops->compare_object(assoc_array_ptr_to_leaf(ptr),
                                                index_key))
                                goto found_leaf;
                }
                /* fall through */
        case assoc_array_walk_tree_empty:
        case assoc_array_walk_found_wrong_shortcut:
        default:
                /* The object isn't in the tree, so there's nothing to do */
                assoc_array_cancel_edit(edit);
                pr_devel("not found\n");
                return NULL;
        }

found_leaf:
        BUG_ON(array->nr_leaves_on_tree <= 0);

        /* In the simplest form of deletion we just clear the slot and release
         * the leaf after a suitable interval.
         */
        edit->dead_leaf = node->slots[slot];
        edit->set[0].ptr = &node->slots[slot];
        edit->set[0].to = NULL;
        edit->adjust_count_on = node;

        /* If that concludes erasure of the last leaf, then delete the entire
         * internal array.
         */
        if (array->nr_leaves_on_tree == 1) {
                edit->set[1].ptr = &array->root;
                edit->set[1].to = NULL;
                edit->adjust_count_on = NULL;
                edit->excised_subtree = array->root;
                pr_devel("all gone\n");
                return edit;
        }

        /* However, we'd also like to clear up some metadata blocks if we
         * possibly can.
         *
         * We go for a simple algorithm of: if this node will have FAN_OUT or
         * fewer leaves on its branch once the deletion is done, then attempt
         * to collapse it - and attempt to recursively collapse up the tree.
         * The leaf counts still include the leaf being deleted at this point,
         * hence the comparison against FAN_OUT + 1.
         *
         * We could also try and collapse in partially filled subtrees to take
         * up space in this node.
         */
        if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) {
                struct assoc_array_node *parent, *grandparent;

                /* First of all, we need to know if this node has metadata so
                 * that we don't try collapsing if all the leaves are already
                 * here.
                 */
                has_meta = false;
                for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                        ptr = node->slots[i];
                        if (assoc_array_ptr_is_meta(ptr)) {
                                has_meta = true;
                                break;
                        }
                }

                pr_devel("leaves: %ld [m=%d]\n",
                         node->nr_leaves_on_branch - 1, has_meta);

                /* Look further up the tree to see if we can collapse this node
                 * into a more proximal node too.
                 */
                parent = node;
        collapse_up:
                pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch);

                ptr = parent->back_pointer;
                if (!ptr)
                        goto do_collapse;
                if (assoc_array_ptr_is_shortcut(ptr)) {
                        /* Step over a shortcut to reach the node beyond it */
                        struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr);
                        ptr = s->back_pointer;
                        if (!ptr)
                                goto do_collapse;
                }

                grandparent = assoc_array_ptr_to_node(ptr);
                if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) {
                        parent = grandparent;
                        goto collapse_up;
                }

        do_collapse:
                /* There's no point collapsing if the original node has no meta
                 * pointers to discard and if we didn't merge into one of that
                 * node's ancestry.
                 */
                if (has_meta || parent != node) {
                        node = parent;

                        /* Create a new node to collapse into */
                        new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
                        if (!new_n0)
                                goto enomem;
                        edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);

                        new_n0->back_pointer = node->back_pointer;
                        new_n0->parent_slot = node->parent_slot;
                        new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
                        edit->adjust_count_on = new_n0;

                        /* Gather every leaf in the subtree, bar the one being
                         * deleted, into the new node.
                         */
                        collapse.node = new_n0;
                        collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf);
                        collapse.slot = 0;
                        assoc_array_subtree_iterate(assoc_array_node_to_ptr(node),
                                                    node->back_pointer,
                                                    assoc_array_delete_collapse_iterator,
                                                    &collapse);
                        pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch);
                        BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1);

                        /* Find the pointer that refers to the subtree being
                         * replaced so that it can be switched to the new node.
                         */
                        if (!node->back_pointer) {
                                edit->set[1].ptr = &array->root;
                        } else if (assoc_array_ptr_is_leaf(node->back_pointer)) {
                                BUG();
                        } else if (assoc_array_ptr_is_node(node->back_pointer)) {
                                struct assoc_array_node *p =
                                        assoc_array_ptr_to_node(node->back_pointer);
                                edit->set[1].ptr = &p->slots[node->parent_slot];
                        } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) {
                                struct assoc_array_shortcut *s =
                                        assoc_array_ptr_to_shortcut(node->back_pointer);
                                edit->set[1].ptr = &s->next_node;
                        }
                        edit->set[1].to = assoc_array_node_to_ptr(new_n0);
                        edit->excised_subtree = assoc_array_node_to_ptr(node);
                }
        }

        return edit;

enomem:
        /* Clean up after an out of memory error */
        pr_devel("enomem\n");
        assoc_array_cancel_edit(edit);
        return ERR_PTR(-ENOMEM);
}

/**
 * assoc_array_clear - Script deletion of all objects from an associative array
 * @array: The array to clear.
 * @ops: The operations to use.
 *
 * Work out in advance, and allocate everything needed for, the removal of
 * every object from an associative array.  The outcome is an edit script that
 * can subsequently be either applied or cancelled.
 *
 * The function returns a pointer to an edit script if there are objects to be
 * deleted, NULL if there are no objects in the array or -ENOMEM.
 *
 * The caller should lock against other modifications and must continue to hold
 * the lock until assoc_array_apply_edit() has been called.
 *
 * Accesses to the tree may take place concurrently with this function,
 * provided they hold the RCU read lock.
 */
struct assoc_array_edit *assoc_array_clear(struct assoc_array *array,
                                           const struct assoc_array_ops *ops)
{
        struct assoc_array_edit *script;

        pr_devel("-->%s()\n", __func__);

        /* An array without a root has nothing in it to clear */
        if (array->root == NULL)
                return NULL;

        script = kzalloc(sizeof(*script), GFP_KERNEL);
        if (script == NULL)
                return ERR_PTR(-ENOMEM);

        script->ops = ops;
        script->array = array;

        /* Detach the whole tree from the root and hand it over for disposal */
        script->set[1].to = NULL;
        script->set[1].ptr = &array->root;
        script->ops_for_excised_subtree = ops;
        script->excised_subtree = array->root;

        pr_devel("all gone\n");
        return script;
}

/*
 * RCU callback queued by assoc_array_apply_edit(): release the dead leaf,
 * the excised metadata, the excised subtree and finally the edit script
 * itself.
 */
static void assoc_array_rcu_cleanup(struct rcu_head *head)
{
        struct assoc_array_edit *edit =
                container_of(head, struct assoc_array_edit, rcu);
        struct assoc_array_ptr *subtree;
        int i;

        pr_devel("-->%s()\n", __func__);

        if (edit->dead_leaf)
                edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf));

        for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) {
                if (!edit->excised_meta[i])
                        continue;
                kfree(assoc_array_ptr_to_node(edit->excised_meta[i]));
        }

        subtree = edit->excised_subtree;
        if (subtree) {
                /* A bare leaf is never a valid subtree root */
                BUG_ON(assoc_array_ptr_is_leaf(subtree));

                /* Clear the top-level back pointer before tearing down */
                if (assoc_array_ptr_is_node(subtree)) {
                        struct assoc_array_node *top_node =
                                assoc_array_ptr_to_node(subtree);

                        top_node->back_pointer = NULL;
                } else {
                        struct assoc_array_shortcut *top_shortcut =
                                assoc_array_ptr_to_shortcut(subtree);

                        top_shortcut->back_pointer = NULL;
                }

                assoc_array_destroy_subtree(subtree,
                                            edit->ops_for_excised_subtree);
        }

        kfree(edit);
}

/**
 * assoc_array_apply_edit - Apply an edit script to an associative array
 * @edit: The script to apply.
 *
 * Apply an edit script to an associative array to effect an insertion,
 * deletion or clearance.  As the edit script includes preallocated memory,
 * this is guaranteed not to fail.
 *
 * The edit script, dead objects and dead metadata will be scheduled for
 * destruction after an RCU grace period to permit those doing read-only
 * accesses on the array to continue to do so under the RCU read lock whilst
 * the edit is taking place.
 */
void assoc_array_apply_edit(struct assoc_array_edit *edit)
{
        struct assoc_array_shortcut *shortcut;
        struct assoc_array_node *node;
        struct assoc_array_ptr *ptr;
        int i;

        pr_devel("-->%s()\n", __func__);

        smp_wmb();
        if (edit->leaf_p)
                *edit->leaf_p = edit->leaf;

        smp_wmb();
        for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++)
                if (edit->set_parent_slot[i].p)
                        *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to;

        smp_wmb();
        for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++)
                if (edit->set_backpointers[i])
                        *edit->set_backpointers[i] = edit->set_backpointers_to;

        smp_wmb();
        for (i = 0; i < ARRAY_SIZE(edit->set); i++)
                if (edit->set[i].ptr)
                        *edit->set[i].ptr = edit->set[i].to;

        if (edit->array->root == NULL) {
                edit->array->nr_leaves_on_tree = 0;
        } else if (edit->adjust_count_on) {
                node = edit->adjust_count_on;
                for (;;) {
                        node->nr_leaves_on_branch += edit->adjust_count_by;

                        ptr = node->back_pointer;
                        if (!ptr)
                                break;
                        if (assoc_array_ptr_is_shortcut(ptr)) {
                                shortcut = assoc_array_ptr_to_shortcut(ptr);
                                ptr = shortcut->back_pointer;
                                if (!ptr)
                                        break;
                        }
                        BUG_ON(!assoc_array_ptr_is_node(ptr));
                        node = assoc_array_ptr_to_node(ptr);
                }

                edit->array->nr_leaves_on_tree += edit->adjust_count_by;
        }

        call_rcu(&edit->rcu, assoc_array_rcu_cleanup);
}

/**
 * assoc_array_cancel_edit - Discard an edit script.
 * @edit: The script to discard.
 *
 * Release every preallocated node or shortcut recorded in the script's
 * new_meta[] table and then the script itself.  The associative array the
 * script was built for is left untouched.
 *
 * NOTE!  In the case of an insertion script, this does _not_ release the leaf
 * that was to be inserted.  That is left to the caller.
 */
void assoc_array_cancel_edit(struct assoc_array_edit *edit)
{
        int i;

        pr_devel("-->%s()\n", __func__);

        /* Release whatever metadata was preallocated for this edit */
        for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) {
                struct assoc_array_ptr *meta = edit->new_meta[i];

                if (!meta)
                        continue;

                if (assoc_array_ptr_is_node(meta))
                        kfree(assoc_array_ptr_to_node(meta));
                else
                        kfree(assoc_array_ptr_to_shortcut(meta));
        }

        kfree(edit);
}

/**
 * assoc_array_gc - Garbage collect an associative array.
 * @array: The array to clean.
 * @ops: The operations to use.
 * @iterator: A callback function to pass judgement on each object.
 * @iterator_data: Private data for the callback function.
 *
 * Collect garbage from an associative array and pack down the internal tree to
 * save memory.
 *
 * The iterator function is asked to pass judgement upon each object in the
 * array.  If it returns false, the object is discarded and if it returns true,
 * the object is kept.  If it returns true, it must increment the object's
 * usage count (or whatever it needs to do to retain it) before returning.
 *
 * The tree is rebuilt as a fresh copy alongside the old one; the copy is
 * installed by applying an internal edit script before this function returns.
 *
 * This function returns 0 if successful or -ENOMEM if out of memory.  In the
 * latter case, the array is not changed.
 *
 * The caller should lock against other modifications and must continue to hold
 * the lock until assoc_array_apply_edit() has been called.
 *
 * Accesses to the tree may take place concurrently with this function,
 * provided they hold the RCU read lock.
 */
int assoc_array_gc(struct assoc_array *array,
                   const struct assoc_array_ops *ops,
                   bool (*iterator)(void *object, void *iterator_data),
                   void *iterator_data)
{
        struct assoc_array_shortcut *shortcut, *new_s;
        struct assoc_array_node *node, *new_n;
        struct assoc_array_edit *edit;
        struct assoc_array_ptr *cursor, *ptr;
        struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp;
        unsigned long nr_leaves_on_tree;
        bool retained;
        int keylen, slot, nr_free, next_slot, i;

        pr_devel("-->%s()\n", __func__);

        /* Nothing to collect in an empty array */
        if (!array->root)
                return 0;

        /* The script swaps the root pointer over to the rebuilt tree (the
         * target is filled in at gc_complete) and queues the whole old tree
         * as the excised subtree.
         */
        edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
        if (!edit)
                return -ENOMEM;
        edit->array = array;
        edit->ops = ops;
        edit->ops_for_excised_subtree = ops;
        edit->set[0].ptr = &array->root;
        edit->excised_subtree = array->root;

        /* cursor walks the old tree; new_ptr_pp is where the next duplicated
         * shortcut or node gets linked into the new tree.
         */
        new_root = new_parent = NULL;
        new_ptr_pp = &new_root;
        cursor = array->root;

descend:
        /* If this point is a shortcut, then we need to duplicate it and
         * advance the target cursor.
         */
        if (assoc_array_ptr_is_shortcut(cursor)) {
                shortcut = assoc_array_ptr_to_shortcut(cursor);
                /* keylen ends up as the number of unsigned longs allocated
                 * and copied in addition to the shortcut struct itself.
                 */
                keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
                keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;
                new_s = kmalloc(sizeof(struct assoc_array_shortcut) +
                                keylen * sizeof(unsigned long), GFP_KERNEL);
                if (!new_s)
                        goto enomem;
                pr_devel("dup shortcut %p -> %p\n", shortcut, new_s);
                memcpy(new_s, shortcut, (sizeof(struct assoc_array_shortcut) +
                                         keylen * sizeof(unsigned long)));
                /* Re-link the copy into the new tree rather than the old */
                new_s->back_pointer = new_parent;
                new_s->parent_slot = shortcut->parent_slot;
                *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s);
                new_ptr_pp = &new_s->next_node;
                cursor = shortcut->next_node;
        }

        /* Duplicate the node at this position */
        node = assoc_array_ptr_to_node(cursor);
        new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
        if (!new_n)
                goto enomem;
        pr_devel("dup node %p -> %p\n", node, new_n);
        new_n->back_pointer = new_parent;
        new_n->parent_slot = node->parent_slot;
        *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n);
        new_ptr_pp = NULL;
        slot = 0;

continue_node:
        /* Filter across any leaves and gc any subtrees */
        for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = node->slots[slot];
                if (!ptr)
                        continue;

                if (assoc_array_ptr_is_leaf(ptr)) {
                        if (iterator(assoc_array_ptr_to_leaf(ptr),
                                     iterator_data))
                                /* The iterator will have done any reference
                                 * counting on the object for us.
                                 */
                                new_n->slots[slot] = ptr;
                        continue;
                }

                /* Not a leaf: duplicate whatever hangs beneath this slot and
                 * link the copy into the matching slot of the new node.
                 */
                new_ptr_pp = &new_n->slots[slot];
                cursor = ptr;
                goto descend;
        }

retry_compress:
        pr_devel("-- compress node %p --\n", new_n);

        /* Count up the number of empty slots in this node and work out the
         * subtree leaf count.
         */
        new_n->nr_leaves_on_branch = 0;
        nr_free = 0;
        for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = new_n->slots[slot];
                if (!ptr)
                        nr_free++;
                else if (assoc_array_ptr_is_leaf(ptr))
                        new_n->nr_leaves_on_branch++;
        }
        pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch);

        /* See what we can fold in */
        retained = false;
        next_slot = 0;
        for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                struct assoc_array_shortcut *s;
                struct assoc_array_node *child;

                ptr = new_n->slots[slot];
                if (!ptr || assoc_array_ptr_is_leaf(ptr))
                        continue;

                s = NULL;
                if (assoc_array_ptr_is_shortcut(ptr)) {
                        s = assoc_array_ptr_to_shortcut(ptr);
                        ptr = s->next_node;
                }

                child = assoc_array_ptr_to_node(ptr);
                new_n->nr_leaves_on_branch += child->nr_leaves_on_branch;

                /* Folding vacates the child's own slot as well, hence the
                 * "+ 1" on top of the currently free slots.
                 */
                if (child->nr_leaves_on_branch <= nr_free + 1) {
                        /* Fold the child node into this one */
                        pr_devel("[%d] fold node %lu/%d [nx %d]\n",
                                 slot, child->nr_leaves_on_branch, nr_free + 1,
                                 next_slot);

                        /* We would already have reaped an intervening shortcut
                         * on the way back up the tree.
                         */
                        BUG_ON(s);

                        new_n->slots[slot] = NULL;
                        nr_free++;
                        /* The slot just vacated may lie below the fill
                         * position; rewind so that it can be reused.
                         */
                        if (slot < next_slot)
                                next_slot = slot;
                        for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
                                struct assoc_array_ptr *p = child->slots[i];
                                if (!p)
                                        continue;
                                BUG_ON(assoc_array_ptr_is_meta(p));
                                while (new_n->slots[next_slot])
                                        next_slot++;
                                BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT);
                                new_n->slots[next_slot++] = p;
                                nr_free--;
                        }
                        kfree(child);
                } else {
                        pr_devel("[%d] retain node %lu/%d [nx %d]\n",
                                 slot, child->nr_leaves_on_branch, nr_free + 1,
                                 next_slot);
                        retained = true;
                }
        }

        if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) {
                pr_devel("internal nodes remain despite enough space, retrying\n");
                goto retry_compress;
        }
        pr_devel("after: %lu\n", new_n->nr_leaves_on_branch);

        /* Recorded on every pass; the value from the last node processed is
         * what gets stored in the array at gc_complete.
         */
        nr_leaves_on_tree = new_n->nr_leaves_on_branch;

        /* Excise this node if it is singly occupied by a shortcut */
        if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) {
                /* Locate the single occupied slot */
                for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++)
                        if ((ptr = new_n->slots[slot]))
                                break;

                if (assoc_array_ptr_is_meta(ptr) &&
                    assoc_array_ptr_is_shortcut(ptr)) {
                        pr_devel("excise node %p with 1 shortcut\n", new_n);
                        new_s = assoc_array_ptr_to_shortcut(ptr);
                        new_parent = new_n->back_pointer;
                        slot = new_n->parent_slot;
                        kfree(new_n);
                        if (!new_parent) {
                                /* The excised node was the root of the new
                                 * tree; its shortcut takes over as root.
                                 */
                                new_s->back_pointer = NULL;
                                new_s->parent_slot = 0;
                                new_root = ptr;
                                goto gc_complete;
                        }

                        if (assoc_array_ptr_is_shortcut(new_parent)) {
                                /* We can discard any preceding shortcut also */
                                struct assoc_array_shortcut *s =
                                        assoc_array_ptr_to_shortcut(new_parent);

                                pr_devel("excise preceding shortcut\n");

                                new_parent = new_s->back_pointer = s->back_pointer;
                                slot = new_s->parent_slot = s->parent_slot;
                                kfree(s);
                                if (!new_parent) {
                                        new_s->back_pointer = NULL;
                                        new_s->parent_slot = 0;
                                        new_root = ptr;
                                        goto gc_complete;
                                }
                        }

                        /* Hook the surviving shortcut directly onto the
                         * parent node in place of what was removed.
                         */
                        new_s->back_pointer = new_parent;
                        new_s->parent_slot = slot;
                        new_n = assoc_array_ptr_to_node(new_parent);
                        new_n->slots[slot] = ptr;
                        goto ascend_old_tree;
                }
        }

        /* Excise any shortcuts we might encounter that point to nodes that
         * only contain leaves.
         */
        ptr = new_n->back_pointer;
        if (!ptr)
                goto gc_complete;

        if (assoc_array_ptr_is_shortcut(ptr)) {
                new_s = assoc_array_ptr_to_shortcut(ptr);
                new_parent = new_s->back_pointer;
                slot = new_s->parent_slot;

                if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) {
                        struct assoc_array_node *n;

                        pr_devel("excise shortcut\n");
                        new_n->back_pointer = new_parent;
                        new_n->parent_slot = slot;
                        kfree(new_s);
                        if (!new_parent) {
                                new_root = assoc_array_node_to_ptr(new_n);
                                goto gc_complete;
                        }

                        n = assoc_array_ptr_to_node(new_parent);
                        n->slots[slot] = assoc_array_node_to_ptr(new_n);
                }
        } else {
                new_parent = ptr;
        }
        /* Step up to the parent node in the new tree */
        new_n = assoc_array_ptr_to_node(new_parent);

ascend_old_tree:
        /* Step up to the parent node in the old tree (passing over a shortcut
         * if there is one) and resume scanning at the slot after the one
         * that was descended through.
         */
        ptr = node->back_pointer;
        if (assoc_array_ptr_is_shortcut(ptr)) {
                shortcut = assoc_array_ptr_to_shortcut(ptr);
                slot = shortcut->parent_slot;
                cursor = shortcut->back_pointer;
                if (!cursor)
                        goto gc_complete;
        } else {
                slot = node->parent_slot;
                cursor = ptr;
        }
        BUG_ON(!cursor);
        node = assoc_array_ptr_to_node(cursor);
        slot++;
        goto continue_node;

gc_complete:
        /* Install the rebuilt tree as the new root, then overwrite the leaf
         * count with the one computed for the rebuilt tree.
         */
        edit->set[0].to = new_root;
        assoc_array_apply_edit(edit);
        array->nr_leaves_on_tree = nr_leaves_on_tree;
        return 0;

enomem:
        /* Dispose of the partially built copy and the unapplied script; no
         * store to the array itself has been made at this point.
         */
        pr_devel("enomem\n");
        assoc_array_destroy_subtree(new_root, edit->ops);
        kfree(edit);
        return -ENOMEM;
}































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H

#ifndef __ASSEMBLY__

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/types.h>

struct device;

/*
 * bitmaps provide bit arrays that consume one or more unsigned
 * longs.  The bitmap interface and available operations are listed
 * here, in bitmap.h
 *
 * Function implementations generic to all architectures are in
 * lib/bitmap.c.  Function implementations that are architecture
 * specific are in various include/asm-<arch>/bitops.h headers
 * and other arch/<arch> specific files.
 *
 * See lib/bitmap.c for more details.
 */

/**
 * DOC: bitmap overview
 *
 * The available bitmap operations and their rough meaning in the
 * case that the bitmap is a single unsigned long are thus:
 *
 * The generated code is more efficient when nbits is known at
 * compile-time and at most BITS_PER_LONG.
 *
 * ::
 *
 *  bitmap_zero(dst, nbits)                     *dst = 0UL
 *  bitmap_fill(dst, nbits)                     *dst = ~0UL
 *  bitmap_copy(dst, src, nbits)                *dst = *src
 *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
 *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
 *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
 *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
 *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
 *  bitmap_equal(src1, src2, nbits)             Are *src1 and *src2 equal?
 *  bitmap_intersects(src1, src2, nbits)        Do *src1 and *src2 overlap?
 *  bitmap_subset(src1, src2, nbits)            Is *src1 a subset of *src2?
 *  bitmap_empty(src, nbits)                    Are all bits zero in *src?
 *  bitmap_full(src, nbits)                     Are all bits set in *src?
 *  bitmap_weight(src, nbits)                   Hamming Weight: number set bits
 *  bitmap_set(dst, pos, nbits)                 Set specified bit area
 *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
 *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
 *  bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off)  as above
 *  bitmap_next_clear_region(map, &start, &end, nbits)  Find next clear region
 *  bitmap_next_set_region(map, &start, &end, nbits)  Find next set region
 *  bitmap_for_each_clear_region(map, rs, re, start, end)
 *                                                  Iterate over all clear regions
 *  bitmap_for_each_set_region(map, rs, re, start, end)
 *                                                  Iterate over all set regions
 *  bitmap_shift_right(dst, src, n, nbits)      *dst = *src >> n
 *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
 *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
 *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
 *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
 *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
 *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
 *  bitmap_fold(dst, orig, sz, nbits)           dst bits = orig bits mod sz
 *  bitmap_parse(buf, buflen, dst, nbits)       Parse bitmap dst from kernel buf
 *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
 *  bitmap_parselist(buf, dst, nbits)           Parse bitmap dst from kernel buf
 *  bitmap_parselist_user(buf, dst, nbits)      Parse bitmap dst from user buf
 *  bitmap_find_free_region(bitmap, bits, order)  Find and allocate bit region
 *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
 *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
 *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
 *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from src to u32[] buf
 *  bitmap_get_value8(map, start)               Get 8bit value from map at start
 *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
 *
 * Note, bitmap_zero() and bitmap_fill() operate over the region of
 * unsigned longs, that is, bits beyond the end of the bitmap up to the
 * unsigned long boundary will be zeroed or filled as well. Consider using
 * bitmap_clear() or bitmap_set() for explicit zeroing or filling,
 * respectively.
 */

/**
 * DOC: bitmap bitops
 *
 * Also the following operations in asm/bitops.h apply to bitmaps.::
 *
 *  set_bit(bit, addr)                  *addr |= bit
 *  clear_bit(bit, addr)                *addr &= ~bit
 *  change_bit(bit, addr)               *addr ^= bit
 *  test_bit(bit, addr)                 Is bit set in *addr?
 *  test_and_set_bit(bit, addr)         Set bit and return old value
 *  test_and_clear_bit(bit, addr)       Clear bit and return old value
 *  test_and_change_bit(bit, addr)      Change bit and return old value
 *  find_first_zero_bit(addr, nbits)    Position first zero bit in *addr
 *  find_first_bit(addr, nbits)         Position first set bit in *addr
 *  find_next_zero_bit(addr, nbits, bit)
 *                                      Position next zero bit in *addr >= bit
 *  find_next_bit(addr, nbits, bit)     Position next set bit in *addr >= bit
 *  find_next_and_bit(addr1, addr2, nbits, bit)
 *                                      Same as find_next_bit, but in
 *                                      (*addr1 & *addr2)
 *
 */

/**
 * DOC: declare bitmap
 * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
 * to declare an array named 'name' of just enough unsigned longs to
 * contain all bit positions from 0 to 'bits' - 1.
 */

/*
 * Allocation and deallocation of bitmap.
 * Provided in lib/bitmap.c to avoid circular dependency.
 */
extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
extern void bitmap_free(const unsigned long *bitmap);

/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags);

/*
 * lib/bitmap.c provides these functions:
 */

extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits);
extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits);
extern int __bitmap_equal(const unsigned long *bitmap1,
                          const unsigned long *bitmap2, unsigned int nbits);
extern bool __pure __bitmap_or_equal(const unsigned long *src1,
                                     const unsigned long *src2,
                                     const unsigned long *src3,
                                     unsigned int nbits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits);
extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits);
extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits);
extern void bitmap_cut(unsigned long *dst, const unsigned long *src,
                       unsigned int first, unsigned int cut,
                       unsigned int nbits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_replace(unsigned long *dst,
                        const unsigned long *old, const unsigned long *new,
                        const unsigned long *mask, unsigned int nbits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_subset(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
extern void __bitmap_set(unsigned long *map, unsigned int start, int len);
extern void __bitmap_clear(unsigned long *map, unsigned int start, int len);

extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                                    unsigned long size,
                                                    unsigned long start,
                                                    unsigned int nr,
                                                    unsigned long align_mask,
                                                    unsigned long align_offset);

/**
 * bitmap_find_next_zero_area - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 *
 * Forwards to bitmap_find_next_zero_area_off() with an alignment offset of
 * zero and returns its result unchanged.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds is multiples of that
 * power of 2. A @align_mask of 0 means no alignment is required.
 */
static inline unsigned long
bitmap_find_next_zero_area(unsigned long *map,
                           unsigned long size,
                           unsigned long start,
                           unsigned int nr,
                           unsigned long align_mask)
{
        const unsigned long no_align_offset = 0;

        return bitmap_find_next_zero_area_off(map, size, start, nr,
                                              align_mask, no_align_offset);
}

/*
 * Out-of-line bitmap helpers.  Only the prototypes are visible in this
 * header; the definitions live elsewhere (presumably lib/bitmap.c --
 * confirm).
 */
extern int bitmap_parse(const char *buf, unsigned int buflen,
                        unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
                        int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new, unsigned int nbits);
extern int bitmap_bitremap(int oldbit,
                const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                const unsigned long *relmap, unsigned int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                unsigned int sz, unsigned int nbits);
extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);

/*
 * bitmap_copy_le is a real out-of-line function only when __BIG_ENDIAN is
 * defined; otherwise it is just another name for bitmap_copy.
 */
#ifdef __BIG_ENDIAN
extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
#else
#define bitmap_copy_le bitmap_copy
#endif
extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
extern int bitmap_print_to_pagebuf(bool list, char *buf,
                                   const unsigned long *maskp, int nmaskbits);

/*
 * Single-word masks for partial-word operations.  Both rely on
 * BITS_PER_LONG being a power of two (the "& (BITS_PER_LONG - 1)" is used
 * as a modulo).
 *
 * BITMAP_FIRST_WORD_MASK(start): every bit from position
 *   (start % BITS_PER_LONG) upwards is set.
 * BITMAP_LAST_WORD_MASK(nbits): the low (nbits % BITS_PER_LONG) bits are
 *   set; when nbits is a multiple of BITS_PER_LONG (including 0) the shift
 *   count is 0 and all bits are set.
 */
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/*
 * The static inlines below do not handle constant nbits==0 correctly,
 * so make such users (should any ever turn up) call the out-of-line
 * versions.
 *
 * small_const_nbits(nbits) is true only when nbits is a compile-time
 * constant in the range 1..BITS_PER_LONG, i.e. the whole bitmap fits in
 * one unsigned long.  Note that nbits is evaluated more than once.
 */
#define small_const_nbits(nbits) \
        (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

/*
 * Size in bytes used by the helpers below for a bitmap of nbits bits:
 * ALIGN(nbits, BITS_PER_LONG) divided by BITS_PER_BYTE.  ALIGN is defined
 * elsewhere; presumably it rounds nbits up to a multiple of BITS_PER_LONG.
 */
#define bitmap_size(nbits)        (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)

/*
 * bitmap_zero - clear a bitmap
 * @dst:   bitmap to clear
 * @nbits: number of bits in @dst
 *
 * Writes zero bytes over the first bitmap_size(@nbits) bytes of @dst.
 */
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
        const unsigned int nbytes = bitmap_size(nbits);

        memset(dst, 0, nbytes);
}

/*
 * bitmap_fill - set every bit of a bitmap
 * @dst:   bitmap to fill
 * @nbits: number of bits in @dst
 *
 * Writes 0xff over the first bitmap_size(@nbits) bytes of @dst; bits of
 * the last word beyond @nbits are set as well.
 */
static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
        const unsigned int nbytes = bitmap_size(nbits);

        memset(dst, 0xff, nbytes);
}

/*
 * bitmap_copy - copy one bitmap to another
 * @dst:   destination bitmap
 * @src:   source bitmap
 * @nbits: number of bits in each bitmap
 *
 * Copies bitmap_size(@nbits) bytes with memcpy(), so the two bitmaps must
 * not overlap.
 */
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits)
{
        const unsigned int nbytes = bitmap_size(nbits);

        memcpy(dst, src, nbytes);
}

/*
 * Copy @nbits bits from @src to @dst with bitmap_copy(), then zero the
 * bits of the last destination word that lie at or above @nbits.  Nothing
 * needs clearing when @nbits is a multiple of BITS_PER_LONG.
 */
static inline void bitmap_copy_clear_tail(unsigned long *dst,
                const unsigned long *src, unsigned int nbits)
{
        const unsigned int tail_bits = nbits % BITS_PER_LONG;

        bitmap_copy(dst, src, nbits);

        if (tail_bits == 0)
                return;

        dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
}

/*
 * bitmap_copy_and_extend - copy a bitmap and zero-extend the destination
 * @to:    destination bitmap
 * @from:  source bitmap
 * @count: number of bits taken from @from
 * @size:  number of bits used to compute how many bytes of @to are written
 *
 * Copies the BITS_TO_LONGS(@count) words that hold @count bits, clears the
 * bits above @count in the last copied word, and zeroes the remainder of
 * @to up to bitmap_size(@size) bytes.
 */
static inline void bitmap_copy_and_extend(unsigned long *to,
                                          const unsigned long *from,
                                          unsigned int count, unsigned int size)
{
        const unsigned int nwords = BITS_TO_LONGS(count);
        const size_t copied_bytes = nwords * sizeof(long);

        memcpy(to, from, copied_bytes);

        /* Partial last word: drop whatever came along above bit @count. */
        if (count % BITS_PER_LONG)
                to[nwords - 1] &= BITMAP_LAST_WORD_MASK(count);

        memset(to + nwords, 0, bitmap_size(size) - copied_bytes);
}

/*
 * On 32-bit systems bitmaps are represented as u32 arrays internally, and
 * therefore conversion is not needed when copying data from/to arrays of u32.
 *
 * When BITS_PER_LONG == 64 the two conversions are out-of-line functions.
 * Otherwise both are macros that cast their pointer arguments to
 * unsigned long pointers and call bitmap_copy_clear_tail().
 */
#if BITS_PER_LONG == 64
extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
                                                        unsigned int nbits);
extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
                                                        unsigned int nbits);
#else
#define bitmap_from_arr32(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (bitmap),        \
                        (const unsigned long *) (buf), (nbits))
#define bitmap_to_arr32(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (buf),                \
                        (const unsigned long *) (bitmap), (nbits))
#endif

/*
 * bitmap_and - *dst = *src1 & *src2
 *
 * For a small constant @nbits the operation is done on a single word,
 * masked to @nbits, and the return value is non-zero iff the result has
 * any bit set.  Otherwise the work and the return value are delegated to
 * __bitmap_and().
 */
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        unsigned long word;

        if (!small_const_nbits(nbits))
                return __bitmap_and(dst, src1, src2, nbits);

        word = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits);
        *dst = word;
        return word != 0;
}

/*
 * bitmap_or - *dst = *src1 | *src2
 *
 * A small constant @nbits is handled as one whole-word OR (the tail bits
 * are not masked); everything else goes to __bitmap_or().
 */
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_or(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 | *src2;
}

/*
 * bitmap_xor - *dst = *src1 ^ *src2
 *
 * A small constant @nbits is handled as one whole-word XOR (the tail bits
 * are not masked); everything else goes to __bitmap_xor().
 */
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_xor(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 ^ *src2;
}

/*
 * bitmap_andnot - *dst = *src1 & ~(*src2)
 *
 * For a small constant @nbits the result is computed on a single word,
 * masked to @nbits, and the return value is non-zero iff any bit is set.
 * Otherwise the work and the return value come from __bitmap_andnot().
 */
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        unsigned long word;

        if (!small_const_nbits(nbits))
                return __bitmap_andnot(dst, src1, src2, nbits);

        word = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits);
        *dst = word;
        return word != 0;
}

/*
 * bitmap_complement - *dst = ~(*src)
 *
 * A small constant @nbits is handled by inverting one whole word (the
 * tail bits are not masked); everything else goes to
 * __bitmap_complement().
 */
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_complement(dst, src, nbits);
                return;
        }

        *dst = ~(*src);
}

/*
 * Bit granularity at which bitmap_equal(), bitmap_set() and bitmap_clear()
 * below switch to byte-wise memcmp()/memset(): 8 bits when __LITTLE_ENDIAN
 * is defined, otherwise the number of bits in an unsigned long.
 * BITMAP_MEM_MASK is the matching low-bit mask.
 */
#ifdef __LITTLE_ENDIAN
#define BITMAP_MEM_ALIGNMENT 8
#else
#define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
#endif
#define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)

/*
 * bitmap_equal - test whether two bitmaps hold the same first @nbits bits
 *
 * Three paths, tried in order:
 *  - small constant @nbits: single-word XOR masked to @nbits;
 *  - @nbits known at compile time to be BITMAP_MEM_ALIGNMENT-aligned:
 *    memcmp() over @nbits / 8 bytes;
 *  - anything else: __bitmap_equal().
 */
static inline int bitmap_equal(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                const unsigned long diff =
                        (*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits);

                return diff == 0;
        }

        if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
                return memcmp(src1, src2, nbits / 8) == 0;

        return __bitmap_equal(src1, src2, nbits);
}

/**
 * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third
 * @src1:        Pointer to bitmap 1
 * @src2:        Pointer to bitmap 2, or'ed with bitmap 1
 * @src3:        Pointer to bitmap 3, compared against *@src1 | *@src2
 * @nbits:        number of bits in each of these bitmaps
 *
 * A small constant @nbits is evaluated on a single word masked to @nbits;
 * all other cases are handed to __bitmap_or_equal().
 *
 * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
 */
static inline bool bitmap_or_equal(const unsigned long *src1,
                                   const unsigned long *src2,
                                   const unsigned long *src3,
                                   unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                const unsigned long diff = (*src1 | *src2) ^ *src3;

                return (diff & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        return __bitmap_or_equal(src1, src2, src3, nbits);
}

/*
 * bitmap_intersects - do two bitmaps share at least one set bit?
 *
 * Returns non-zero when some bit below @nbits is set in both @src1 and
 * @src2 (single-word path), or whatever __bitmap_intersects() returns for
 * the general case.
 */
static inline int bitmap_intersects(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_intersects(src1, src2, nbits);

        return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
}

/*
 * bitmap_subset - is every set bit of @src1 also set in @src2?
 *
 * Single-word path: returns 1 when no bit below @nbits is set in @src1
 * but clear in @src2, else 0.  The general case returns the result of
 * __bitmap_subset().
 */
static inline int bitmap_subset(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_subset(src1, src2, nbits);

        return ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)) == 0;
}

/*
 * bitmap_empty - are the first @nbits bits of @src all clear?
 *
 * Single-word path masks *@src to @nbits.  The general path reports
 * "empty" when find_first_bit() returns @nbits.
 */
static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
{
        if (small_const_nbits(nbits)) {
                const unsigned long live = *src & BITMAP_LAST_WORD_MASK(nbits);

                return live == 0;
        }

        return find_first_bit(src, nbits) == nbits;
}

/*
 * bitmap_full - are the first @nbits bits of @src all set?
 *
 * Single-word path checks that the inverted word has no bit left below
 * @nbits.  The general path reports "full" when find_first_zero_bit()
 * returns @nbits.
 */
static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                const unsigned long holes = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);

                return holes == 0;
        }

        return find_first_zero_bit(src, nbits) == nbits;
}

/*
 * bitmap_weight - weight of the first @nbits bits of @src
 *
 * Single-word path: hweight_long() of the word masked to @nbits.
 * General path: __bitmap_weight().
 */
static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight(src, nbits);

        return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
}

/*
 * bitmap_set - set @nbits bits of @map starting at bit @start
 *
 * Fast paths, tried in order:
 *  - constant @nbits == 1: __set_bit();
 *  - @start and @nbits both known at compile time to be
 *    BITMAP_MEM_ALIGNMENT-aligned: memset() of @nbits / 8 bytes at byte
 *    offset @start / 8;
 *  - otherwise: __bitmap_set().
 */
static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
                unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __set_bit(start, map);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0xff, nbits / 8);
                return;
        }

        __bitmap_set(map, start, nbits);
}

/*
 * bitmap_clear - clear @nbits bits of @map starting at bit @start
 *
 * Fast paths, tried in order:
 *  - constant @nbits == 1: __clear_bit();
 *  - @start and @nbits both known at compile time to be
 *    BITMAP_MEM_ALIGNMENT-aligned: memset() of @nbits / 8 zero bytes at
 *    byte offset @start / 8;
 *  - otherwise: __bitmap_clear().
 */
static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
                unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __clear_bit(start, map);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0, nbits / 8);
                return;
        }

        __bitmap_clear(map, start, nbits);
}

/*
 * bitmap_shift_right - *dst = *src >> shift
 *
 * Single-word path: the source is first masked to @nbits, then shifted
 * right by @shift.  General path: __bitmap_shift_right().
 */
static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_right(dst, src, shift, nbits);
                return;
        }

        *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
}

/*
 * bitmap_shift_left - *dst = *src << shift
 *
 * Single-word path: the source is shifted left by @shift and the result
 * masked to @nbits.  General path: __bitmap_shift_left().
 */
static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_left(dst, src, shift, nbits);
                return;
        }

        *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
}

/*
 * bitmap_replace - merge two bitmaps under a mask
 * @dst:   result
 * @old:   supplies the bits where @mask is clear
 * @new:   supplies the bits where @mask is set
 * @mask:  selects between @old and @new, bit by bit
 * @nbits: number of bits in each bitmap
 *
 * Single-word path computes (*old & ~*mask) | (*new & *mask) on a whole
 * word; the general case is handled by __bitmap_replace().
 */
static inline void bitmap_replace(unsigned long *dst,
                                  const unsigned long *old,
                                  const unsigned long *new,
                                  const unsigned long *mask,
                                  unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_replace(dst, old, new, mask, nbits);
                return;
        }

        *dst = (*old & ~(*mask)) | (*new & *mask);
}

/*
 * bitmap_next_clear_region - locate the next run of clear bits
 * @bitmap: bitmap to scan
 * @rs:     in: bit to start scanning from; out: result of
 *          find_next_zero_bit() from that position
 * @re:     out: result of find_next_bit() starting one past *@rs
 * @end:    size limit passed to both searches
 */
static inline void bitmap_next_clear_region(unsigned long *bitmap,
                                            unsigned int *rs, unsigned int *re,
                                            unsigned int end)
{
        const unsigned int region_start = find_next_zero_bit(bitmap, end, *rs);

        *rs = region_start;
        *re = find_next_bit(bitmap, end, region_start + 1);
}

/*
 * bitmap_next_set_region - locate the next run of set bits
 * @bitmap: bitmap to scan
 * @rs:     in: bit to start scanning from; out: result of
 *          find_next_bit() from that position
 * @re:     out: result of find_next_zero_bit() starting one past *@rs
 * @end:    size limit passed to both searches
 */
static inline void bitmap_next_set_region(unsigned long *bitmap,
                                          unsigned int *rs, unsigned int *re,
                                          unsigned int end)
{
        const unsigned int region_start = find_next_bit(bitmap, end, *rs);

        *rs = region_start;
        *re = find_next_zero_bit(bitmap, end, region_start + 1);
}

/*
 * Bitmap region iterators.  Iterates over the bitmap between [@start, @end).
 * @rs and @re should be integer variables and will be set to start and end
 * index of the current clear or set region.
 *
 * Each step calls bitmap_next_clear_region()/bitmap_next_set_region() and
 * the loop runs while @rs < @re; the next search resumes at @re + 1.
 * The macro arguments are expanded more than once, so they must be free
 * of side effects, and @rs/@re must be lvalues (their addresses are taken).
 */
#define bitmap_for_each_clear_region(bitmap, rs, re, start, end)             \
        for ((rs) = (start),                                                     \
             bitmap_next_clear_region((bitmap), &(rs), &(re), (end));             \
             (rs) < (re);                                                     \
             (rs) = (re) + 1,                                                     \
             bitmap_next_clear_region((bitmap), &(rs), &(re), (end)))

#define bitmap_for_each_set_region(bitmap, rs, re, start, end)                     \
        for ((rs) = (start),                                                     \
             bitmap_next_set_region((bitmap), &(rs), &(re), (end));             \
             (rs) < (re);                                                     \
             (rs) = (re) + 1,                                                     \
             bitmap_next_set_region((bitmap), &(rs), &(re), (end)))

/**
 * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
 * @n: u64 value
 *
 * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
 * integers in 32-bit environment, and 64-bit integers in 64-bit one.
 *
 * There are four combinations of endianness and length of the word in linux
 * ABIs: LE64, BE64, LE32 and BE32.
 *
 * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
 * bitmaps and therefore don't require any special handling.
 *
 * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
 * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
 * other hand is represented as an array of 32-bit words and the position of
 * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
 * word.  For example, bit #42 is located at 10th position of 2nd word.
 * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
 * values in memory as it usually does. But for BE we need to swap hi and lo
 * words manually.
 *
 * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
 * lo parts of u64.  For LE32 it does nothing, and for BE environment it swaps
 * hi and lo words, as is expected by bitmap.
 */
#if __BITS_PER_LONG == 64
/* One unsigned long holds the whole value: pass it through unchanged. */
#define BITMAP_FROM_U64(n) (n)
#else
/*
 * Expands to TWO comma-separated unsigned long expressions (low part of
 * @n first, then @n >> 32), so it is only usable where a comma-separated
 * list is valid, e.g. inside an array initializer.  @n is evaluated twice.
 */
#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
                                ((unsigned long) ((u64)(n) >> 32))
#endif

/**
 * bitmap_from_u64 - Store a u64 value into a bitmap, low word first.
 * @dst:  destination bitmap
 * @mask: source u64 value
 *
 * dst[0] always receives @mask & ULONG_MAX.  When unsigned long is
 * narrower than u64, dst[1] additionally receives @mask >> 32.
 *
 * In 32-bit Big Endian kernel, reading a u64 mask through
 * ``(u32 *)(&val)[*]`` yields the wrong word: ``(u32 *)(&val)[0]`` gets
 * the upper 32 bits, but the lower 32 bits are expected.  Splitting the
 * value arithmetically avoids that.
 */
static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
{
        dst[0] = mask & ULONG_MAX;

        /* A single unsigned long already holds the whole value. */
        if (sizeof(mask) <= sizeof(unsigned long))
                return;

        dst[1] = mask >> 32;
}

/**
 * bitmap_get_value8 - get an 8-bit value within a memory region
 * @map: address to the bitmap memory region
 * @start: bit offset of the 8-bit value; must be a multiple of 8
 *
 * Returns the 8-bit value located at the @start bit offset within the @map
 * memory region.
 */
static inline unsigned long bitmap_get_value8(const unsigned long *map,
                                              unsigned long start)
{
        const unsigned long shift = start % BITS_PER_LONG;
        const unsigned long word = map[BIT_WORD(start)];

        return (word >> shift) & 0xFF;
}

/**
 * bitmap_set_value8 - set an 8-bit value within a memory region
 * @map: address to the bitmap memory region
 * @value: the 8-bit value; values wider than 8 bits may clobber bitmap
 * @start: bit offset of the 8-bit value; must be a multiple of 8
 *
 * The target byte is cleared first and @value is then OR'ed in at the same
 * bit position; @value itself is not masked.
 */
static inline void bitmap_set_value8(unsigned long *map, unsigned long value,
                                     unsigned long start)
{
        unsigned long *word = &map[BIT_WORD(start)];
        const unsigned long shift = start % BITS_PER_LONG;

        *word = (*word & ~(0xFFUL << shift)) | (value << shift);
}

#endif /* __ASSEMBLY__ */

#endif /* __LINUX_BITMAP_H */














































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_APIC_H
#define _ASM_X86_APIC_H

#include <linux/cpumask.h>

#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/apicdef.h>
#include <linux/atomic.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/hardirq.h>
#include <asm/io.h>

/* NOTE(review): consumers of this flag are outside this header -- confirm meaning there. */
#define ARCH_APICTIMER_STOPS_ON_C3        1

/*
 * Debugging macros
 *
 * Verbosity levels; in this header they are compared against
 * apic_verbosity by apic_printk() and default_inquire_remote_apic().
 */
#define APIC_QUIET   0
#define APIC_VERBOSE 1
#define APIC_DEBUG   2

/* Macros for apic_extnmi which controls external NMI masking */
#define APIC_EXTNMI_BSP                0 /* Default */
#define APIC_EXTNMI_ALL                1
#define APIC_EXTNMI_NONE        2

/*
 * Define the default level of output to be very little.
 * This can be turned up by using apic=verbose for more
 * information and apic=debug for _lots_ of information.
 * apic_verbosity is defined in apic.c
 *
 * apic_printk(v, s, a...) passes the format @s and the variadic arguments
 * to printk() only when @v <= apic_verbosity; otherwise it does nothing.
 */
#define apic_printk(v, s, a...) do {       \
                if ((v) <= apic_verbosity) \
                        printk(s, ##a);    \
        } while (0)


/*
 * generic_apic_probe() is a real out-of-line function only when both
 * CONFIG_X86_LOCAL_APIC and CONFIG_X86_32 are set; every other
 * configuration gets an empty inline stub so callers need no #ifdefs.
 */
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
extern void generic_apic_probe(void);
#else
static inline void generic_apic_probe(void)
{
}
#endif

#ifdef CONFIG_X86_LOCAL_APIC

/* Local APIC state defined outside this header. */
extern int apic_verbosity;
extern int local_apic_timer_c2_ok;

extern int disable_apic;
extern unsigned int lapic_timer_period;

/*
 * Note: apic_intr_mode is declared before the enum it uses is defined
 * just below.
 */
extern enum apic_intr_mode_id apic_intr_mode;
/* Possible values of apic_intr_mode. */
enum apic_intr_mode_id {
        APIC_PIC,
        APIC_VIRTUAL_WIRE,
        APIC_VIRTUAL_WIRE_NO_CONFIG,
        APIC_SYMMETRIC_IO,
        APIC_SYMMETRIC_IO_NO_ROUTING
};

/*
 * __inquire_remote_apic() exists out of line only with CONFIG_SMP; without
 * it the call compiles to an empty inline stub.
 */
#ifdef CONFIG_SMP
extern void __inquire_remote_apic(int apicid);
#else /* CONFIG_SMP */
static inline void __inquire_remote_apic(int apicid)
{
}
#endif /* CONFIG_SMP */

/*
 * Call __inquire_remote_apic() for @apicid, but only when apic_verbosity
 * is at least APIC_DEBUG; at lower verbosity this is a no-op.
 */
static inline void default_inquire_remote_apic(int apicid)
{
        if (apic_verbosity < APIC_DEBUG)
                return;

        __inquire_remote_apic(apicid);
}

/*
 * With 82489DX the apic feature bit retrieved via cpuid cannot be relied
 * on, yet such an apic chip still has to be handled.  So assume that the
 * SMP configuration is found from the MP table (the 64bit case mostly
 * uses ACPI, which sets the smp presence flag as well, so this helper is
 * safe to use there too).
 *
 * Returns true when an SMP configuration was found and the APIC has not
 * been disabled.
 */
static inline bool apic_from_smp_config(void)
{
        if (!smp_found_config)
                return false;

        return !disable_apic;
}

/*
 * Basic functions accessing APICs.
 *
 * With CONFIG_PARAVIRT, <asm/paravirt.h> is pulled in here, ahead of the
 * native accessors below.
 */
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

/* Defined outside this header. */
extern int setup_profiling_timer(unsigned int);

/*
 * Write @v to the memory-mapped APIC register at APIC_BASE + @reg.
 *
 * The store goes through alternative_io(): the default instruction is
 * "movl", with "xchgl" given as the alternative keyed on X86_BUG_11AP
 * (presumably patched in on CPUs that have that erratum -- the patching
 * mechanism lives outside this header).
 */
static inline void native_apic_mem_write(u32 reg, u32 v)
{
        /* volatile: the target is a device register, not ordinary memory. */
        volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);

        alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
                       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
                       ASM_OUTPUT2("0" (v), "m" (*addr)));
}

/*
 * Read the memory-mapped APIC register at APIC_BASE + @reg with readl()
 * and return its 32-bit value.
 */
static inline u32 native_apic_mem_read(u32 reg)
{
        void __iomem *addr = (void __iomem *)(APIC_BASE + reg);

        return readl(addr);
}

/* Native ICR helpers; only the prototypes are visible in this header. */
extern void native_apic_wait_icr_idle(void);
extern u32 native_safe_apic_wait_icr_idle(void);
extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);

/*
 * Report whether the X2APIC_ENABLE bit is set in MSR_IA32_APICBASE.
 * Returns false if rdmsrl_safe() reports a failure reading the MSR.
 */
static inline bool apic_is_x2apic_enabled(void)
{
        u64 apicbase;

        if (rdmsrl_safe(MSR_IA32_APICBASE, &apicbase) != 0)
                return false;

        return (apicbase & X2APIC_ENABLE) != 0;
}

/*
 * Local APIC setup, teardown and timer entry points.  All are defined
 * outside this header; only the prototypes are visible here.
 */
extern void enable_IR_x2apic(void);

extern int get_physical_broadcast(void);

extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
extern void disconnect_bsp_APIC(int virt_wire_setup);
extern void disable_local_APIC(void);
extern void apic_soft_disable(void);
extern void lapic_shutdown(void);
extern void sync_Arb_IDs(void);
extern void init_bsp_APIC(void);
extern void apic_intr_mode_select(void);
extern void apic_intr_mode_init(void);
extern void init_apic_mappings(void);
void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
extern void lapic_update_tsc_freq(void);

/*
 * apic_force_enable(): with CONFIG_X86_64 this is an inline stub that
 * ignores @addr and always returns -1; otherwise it is an out-of-line
 * function defined elsewhere.
 */
#ifdef CONFIG_X86_64
static inline int apic_force_enable(unsigned long addr)
{
        return -1;
}
#else
extern int apic_force_enable(unsigned long addr);
#endif

extern void apic_ap_setup(void);

/*
 * On 32bit this is mach-xxx local
 *
 * With CONFIG_X86_64 apic_is_clustered_box() is an out-of-line function;
 * otherwise it is an inline stub that always returns 0.
 */
#ifdef CONFIG_X86_64
extern int apic_is_clustered_box(void);
#else
static inline int apic_is_clustered_box(void)
{
        return 0;
}
#endif

/*
 * Further local APIC entry points (vector assignment, online/offline,
 * IPI); defined outside this header.
 */
extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
extern void lapic_assign_system_vectors(void);
extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
extern void lapic_update_legacy_vectors(void);
extern void lapic_online(void);
extern void lapic_offline(void);
extern bool apic_needs_pit(void);

extern void apic_send_IPI_allbutself(unsigned int vector);

#else /* !CONFIG_X86_LOCAL_APIC */
/*
 * Fallbacks used when CONFIG_X86_LOCAL_APIC is not set: the functions
 * become empty inlines, local_apic_timer_c2_ok is the constant 1, the two
 * clock setup hooks alias x86_init_noop, and apic_needs_pit() always
 * returns true.
 */
static inline void lapic_shutdown(void) { }
#define local_apic_timer_c2_ok                1
static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
# define setup_boot_APIC_clock x86_init_noop
# define setup_secondary_APIC_clock x86_init_noop
static inline void lapic_update_tsc_freq(void) { }
static inline void init_bsp_APIC(void) { }
static inline void apic_intr_mode_select(void) { }
static inline void apic_intr_mode_init(void) { }
static inline void lapic_assign_system_vectors(void) { }
static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
static inline bool apic_needs_pit(void) { return true; }
#endif /* !CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_X2APIC
/*
 * Write APIC register @reg through its MSR.
 *
 * Writes to APIC_DFR, APIC_ID, APIC_LDR and APIC_LVR are silently dropped.
 * Every other register is written with wrmsr() at MSR index
 * APIC_BASE_MSR + (@reg >> 4), passing @v and 0 as the two value words.
 */
static inline void native_apic_msr_write(u32 reg, u32 v)
{
        switch (reg) {
        case APIC_DFR:
        case APIC_ID:
        case APIC_LDR:
        case APIC_LVR:
                return;
        default:
                break;
        }

        wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
}

/*
 * Signal EOI through the MSR interface.  Both parameters are ignored: the
 * MSR index is always derived from APIC_EOI and the value written is
 * always APIC_EOI_ACK.
 */
static inline void native_apic_msr_eoi_write(u32 reg, u32 v)
{
        const u32 eoi_msr = APIC_BASE_MSR + (APIC_EOI >> 4);

        __wrmsr(eoi_msr, APIC_EOI_ACK, 0);
}

/*
 * Read APIC register @reg through its MSR and return the low 32 bits.
 *
 * APIC_DFR is never read: the function returns (u32)-1 for it.  Every
 * other register is read with rdmsrl() from MSR index
 * APIC_BASE_MSR + (@reg >> 4).
 */
static inline u32 native_apic_msr_read(u32 reg)
{
        u64 raw;

        if (reg == APIC_DFR)
                return (u32)-1;

        rdmsrl(APIC_BASE_MSR + (reg >> 4), raw);

        return (u32)raw;
}

/* Intentionally empty: no need to wait for icr idle in x2apic. */
static inline void native_x2apic_wait_icr_idle(void)
{
}

/*
 * No need to wait for icr idle in x2apic, so this never waits and
 * unconditionally returns 0.
 */
static inline u32 native_safe_x2apic_wait_icr_idle(void)
{
        return 0;
}

/*
 * Write the ICR through its MSR in one 64-bit wrmsrl(): @id goes into
 * bits 63:32 and @low into bits 31:0.
 */
static inline void native_x2apic_icr_write(u32 low, u32 id)
{
        const __u64 icr = (((__u64) id) << 32) | low;

        wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), icr);
}

/*
 * Read the ICR through its MSR (index APIC_BASE_MSR + (APIC_ICR >> 4))
 * with rdmsrl() and return the value.
 */
static inline u64 native_x2apic_icr_read(void)
{
        unsigned long icr;

        rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), icr);

        return icr;
}

/* x2APIC state and setup hooks, defined outside this header. */
extern int x2apic_mode;
extern int x2apic_phys;
extern void __init x2apic_set_max_apicid(u32 apicid);
extern void __init check_x2apic(void);
extern void x2apic_setup(void);
/*
 * Returns 1 when the boot CPU advertises X86_FEATURE_X2APIC and
 * apic_is_x2apic_enabled() reports the mode as enabled, else 0.  The
 * second check is skipped when the feature bit is absent.
 */
static inline int x2apic_enabled(void)
{
        if (!boot_cpu_has(X86_FEATURE_X2APIC))
                return 0;

        return apic_is_x2apic_enabled();
}

/* True when the boot CPU has the X86_FEATURE_X2APIC feature bit. */
#define x2apic_supported()        (boot_cpu_has(X86_FEATURE_X2APIC))
#else /* !CONFIG_X86_X2APIC */
/*
 * Fallbacks used when CONFIG_X86_X2APIC is not set: the setup hooks are
 * empty, and x2apic_enabled(), x2apic_mode and x2apic_supported() all
 * evaluate to 0.
 */
static inline void check_x2apic(void) { }
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }

#define x2apic_mode                (0)
#define        x2apic_supported()        (0)
#endif /* !CONFIG_X86_X2APIC */

/* Forward declaration; only pointers to it are used in this header. */
struct irq_data;

/*
 * Copyright 2004 James Cleverdon, IBM.
 *
 * Generic APIC sub-arch data struct.
 *
 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
 * James Cleverdon.
 */
/*
 * Table of callbacks and settings describing one local APIC driver.  The
 * inline wrappers further down in this header (apic_read(), apic_write(),
 * apic_eoi(), apic_icr_read(), ...) dispatch through the global 'apic'
 * pointer to the members of this structure.
 */
struct apic {
        /* Hotpath functions first */
        void        (*eoi_write)(u32 reg, u32 v);
        void        (*native_eoi_write)(u32 reg, u32 v);
        void        (*write)(u32 reg, u32 v);
        u32        (*read)(u32 reg);

        /* IPI related functions */
        void        (*wait_icr_idle)(void);
        u32        (*safe_wait_icr_idle)(void);

        void        (*send_IPI)(int cpu, int vector);
        void        (*send_IPI_mask)(const struct cpumask *mask, int vector);
        void        (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
        void        (*send_IPI_allbutself)(int vector);
        void        (*send_IPI_all)(int vector);
        void        (*send_IPI_self)(int vector);

        /* dest_logical is used by the IPI functions */
        u32        dest_logical;
        u32        disable_esr;
        u32        irq_delivery_mode;
        u32        irq_dest_mode;

        u32        (*calc_dest_apicid)(unsigned int cpu);

        /* ICR related functions */
        u64        (*icr_read)(void);
        void        (*icr_write)(u32 low, u32 high);

        /* Probe, setup and smpboot functions */
        int        (*probe)(void);
        int        (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
        int        (*apic_id_valid)(u32 apicid);
        int        (*apic_id_registered)(void);

        bool        (*check_apicid_used)(physid_mask_t *map, int apicid);
        void        (*init_apic_ldr)(void);
        void        (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
        void        (*setup_apic_routing)(void);
        int        (*cpu_present_to_apicid)(int mps_cpu);
        void        (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
        int        (*check_phys_apicid_present)(int phys_apicid);
        int        (*phys_pkg_id)(int cpuid_apic, int index_msb);

        /* get_apic_id is applied to the raw APIC_ID register value by read_apic_id() */
        u32        (*get_apic_id)(unsigned long x);
        u32        (*set_apic_id)(unsigned int id);

        /* wakeup_secondary_cpu */
        int        (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);

        void        (*inquire_remote_apic)(int apicid);

#ifdef CONFIG_X86_32
        /*
         * Called very early during boot from get_smp_config().  It should
         * return the logical apicid.  x86_[bios]_cpu_to_apicid is
         * initialized before this function is called.
         *
         * If logical apicid can't be determined that early, the function
         * may return BAD_APICID.  Logical apicid will be configured after
         * init_apic_ldr() while bringing up CPUs.  Note that NUMA affinity
         * won't be applied properly during early boot in this case.
         */
        int (*x86_32_early_logical_apicid)(int cpu);
#endif
        /* presumably a human-readable driver name -- confirm against the drivers */
        char        *name;
};

/*
 * Pointer to the local APIC driver in use on this system (there's
 * always just one such driver in use - the kernel decides via an
 * early probing process which one it picks - and then sticks to it):
 */
extern struct apic *apic;

/*
 * APIC drivers are probed based on how they are listed in the .apicdrivers
 * section. So the order is important and enforced by the ordering
 * of different apic driver files in the Makefile.
 *
 * For the files having two apic drivers, we use apic_drivers()
 * to enforce the order within them.
 *
 * apic_driver(sym) emits one pointer to @sym into the ".apicdrivers"
 * section; apic_drivers(sym1, sym2) emits a two-element pointer array
 * holding &sym1 followed by &sym2.  Both are pointer-size aligned and
 * marked __used.
 */
#define apic_driver(sym)                                        \
        static const struct apic *__apicdrivers_##sym __used                \
        __aligned(sizeof(struct apic *))                        \
        __section(".apicdrivers") = { &sym }

#define apic_drivers(sym1, sym2)                                        \
        static struct apic *__apicdrivers_##sym1##sym2[2] __used        \
        __aligned(sizeof(struct apic *))                                \
        __section(".apicdrivers") = { &sym1, &sym2 }

/* presumably the bounds of the .apicdrivers section (linker-provided) -- confirm */
extern struct apic *__apicdrivers[], *__apicdrivers_end[];

/*
 * APIC functionality to boot other CPUs - only used on SMP:
 *
 * Both prototypes are visible only when CONFIG_SMP is set; no stubs are
 * provided for the !CONFIG_SMP case.
 */
#ifdef CONFIG_SMP
extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
extern int lapic_can_unplug_cpu(void);
#endif

#ifdef CONFIG_X86_LOCAL_APIC

/* Read APIC register @reg via the active driver's ->read() callback. */
static inline u32 apic_read(u32 reg)
{
        struct apic *drv = apic;

        return drv->read(reg);
}

/* Write @val to APIC register @reg via the active driver's ->write() callback. */
static inline void apic_write(u32 reg, u32 val)
{
        struct apic *drv = apic;

        drv->write(reg, val);
}

static inline void apic_eoi(void)
{
        apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
}

static inline u64 apic_icr_read(void)
{
        return apic->icr_read();
}

/* Pass @low and @high to the active driver's ->icr_write() callback. */
static inline void apic_icr_write(u32 low, u32 high)
{
        struct apic *drv = apic;

        drv->icr_write(low, high);
}

static inline void apic_wait_icr_idle(void)
{
        apic->wait_icr_idle();
}

static inline u32 safe_apic_wait_icr_idle(void)
{
        return apic->safe_wait_icr_idle();
}

/* Defined outside this header; takes a function with the same signature as struct apic's ->eoi_write. */
extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));

#else /* CONFIG_X86_LOCAL_APIC */

/*
 * Fallbacks used when CONFIG_X86_LOCAL_APIC is not set: reads return 0 and
 * every write, EOI, wait or setter is a no-op.
 */
static inline u32 apic_read(u32 reg) { return 0; }
static inline void apic_write(u32 reg, u32 val) { }
static inline void apic_eoi(void) { }
static inline u64 apic_icr_read(void) { return 0; }
static inline void apic_icr_write(u32 low, u32 high) { }
static inline void apic_wait_icr_idle(void) { }
static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}

#endif /* CONFIG_X86_LOCAL_APIC */

/* Defined outside this header; declared regardless of CONFIG_X86_LOCAL_APIC. */
extern void apic_ack_irq(struct irq_data *data);

/*
 * Acknowledge an interrupt at the local APIC.  This is just apic_eoi()
 * under another name.
 *
 * ack_APIC_irq() actually gets compiled as a single instruction
 * ... yummie.
 */
static inline void ack_APIC_irq(void)
{
        apic_eoi();
}


/*
 * Test whether @vector is flagged in the local APIC's IRR.
 *
 * The IRR is read as 32-bit registers spaced 0x10 apart starting at
 * APIC_IRR: register index is @vector / 32, bit index is @vector % 32.
 */
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
        const u32 irr_reg = APIC_IRR + (vector / 32 * 0x10);
        const u32 vector_bit = 1U << (vector % 32);

        return (apic_read(irr_reg) & vector_bit) != 0;
}

/*
 * Extract the APIC ID from the raw register value @x.
 *
 * The ID starts at bit 24.  It is 8 bits wide when the version read from
 * APIC_LVR satisfies APIC_XAPIC() or the boot CPU has
 * X86_FEATURE_EXTD_APICID, and 4 bits wide otherwise.
 */
static inline unsigned default_get_apic_id(unsigned long x)
{
        unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
        unsigned long id_mask = 0x0F;

        if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
                id_mask = 0xFF;

        return (x >> 24) & id_mask;
}

/*
 * Warm reset vector position:
 * (two addresses, 0x467 and 0x469; their users are outside this header)
 */
#define TRAMPOLINE_PHYS_LOW                0x467
#define TRAMPOLINE_PHYS_HIGH                0x469

/* Defined outside this header. */
extern void generic_bigsmp_probe(void);

#ifdef CONFIG_X86_LOCAL_APIC

#include <asm/smp.h>

/* Value used for the DFR register: the flat model constant. */
#define APIC_DFR_VALUE        (APIC_DFR_FLAT)

/* Per-CPU u16 declared via the early per-cpu helper macro (defined elsewhere). */
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);

/* presumably a do-nothing driver instance -- defined outside this header; confirm */
extern struct apic apic_noop;

static inline unsigned int read_apic_id(void)
{
        unsigned int reg = apic_read(APIC_ID);

        return apic->get_apic_id(reg);
}

/*
 * Default/flat implementations whose signatures match the corresponding
 * struct apic callbacks; defined outside this header.
 */
extern int default_apic_id_valid(u32 apicid);
extern int default_acpi_madt_oem_check(char *, char *);
extern void default_setup_apic_routing(void);

extern u32 apic_default_calc_apicid(unsigned int cpu);
extern u32 apic_flat_calc_apicid(unsigned int cpu);

extern bool default_check_apicid_used(physid_mask_t *map, int apicid);
extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
extern int default_cpu_present_to_apicid(int mps_cpu);
extern int default_check_phys_apicid_present(int phys_apicid);

#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_SMP
bool apic_id_is_primary_thread(unsigned int id);
void apic_smt_update(void);
#else
/*
 * !CONFIG_SMP stubs: no APIC ID is reported as a primary thread and the
 * SMT update hook does nothing.
 */
static inline bool apic_id_is_primary_thread(unsigned int id)
{
        return false;
}

static inline void apic_smt_update(void)
{
}
#endif

struct msi_msg;

/*
 * With CONFIG_PCI_MSI this is a real function; without it the name expands
 * to NULL (presumably so it can still be stored as a callback pointer -
 * verify against the users).
 */
#ifdef CONFIG_PCI_MSI
void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg);
#else
# define x86_vector_msi_compose_msg NULL
#endif

extern void ioapic_zap_locks(void);

#endif /* _ASM_X86_APIC_H */







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SEQ_FILE_NET_H__
#define __SEQ_FILE_NET_H__

#include <linux/seq_file.h>

struct net;
extern struct net init_net;

/*
 * Layout that seq_file_net() expects behind seq_file->private.  The
 * namespace pointer only exists when CONFIG_NET_NS is enabled; otherwise
 * the struct has no members.
 */
struct seq_net_private {
#ifdef CONFIG_NET_NS
        struct net *net;
#endif
};

/*
 * Return the network namespace associated with @seq.
 *
 * With CONFIG_NET_NS, seq->private is interpreted as a struct
 * seq_net_private and its ->net member is returned; otherwise the answer
 * is always &init_net.
 */
static inline struct net *seq_file_net(struct seq_file *seq)
{
#ifdef CONFIG_NET_NS
        struct seq_net_private *priv = (struct seq_net_private *)seq->private;

        return priv->net;
#else
        return &init_net;
#endif
}

/*
 * Variant of seq_file_net() for proc_create_net_single: there the struct net
 * pointer is stored directly in seq->private rather than inside a
 * struct seq_net_private, so seq_file_net() cannot be used.
 */
static inline struct net *seq_file_single_net(struct seq_file *seq)
{
#ifdef CONFIG_NET_NS
        struct net *ns = (struct net *)seq->private;

        return ns;
#else
        return &init_net;
#endif
}

#endif




































    1 






















    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
 *
 * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
 * security.  Here they share the same key size, tfm context, and setkey
 * function; only their IV size and encrypt/decrypt function differ.
 *
 * The ChaCha paper specifies 20, 12, and 8-round variants.  In general, it is
 * recommended to use the 20-round variant ChaCha20.  However, the other
 * variants can be needed in some performance-sensitive scenarios.  The generic
 * ChaCha code currently allows only the 20 and 12-round variants.
 */

#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H

#include <asm/unaligned.h>
#include <linux/types.h>

/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
#define CHACHA_IV_SIZE                16

#define CHACHA_KEY_SIZE                32
#define CHACHA_BLOCK_SIZE        64
#define CHACHAPOLY_IV_SIZE        12

#define CHACHA_STATE_WORDS        (CHACHA_BLOCK_SIZE / sizeof(u32))

/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE                32

void chacha_block_generic(u32 *state, u8 *stream, int nrounds);

/* Convenience wrapper: chacha_block_generic() fixed to 20 rounds. */
static inline void chacha20_block(u32 *state, u8 *stream)
{
        const int nrounds = 20;

        chacha_block_generic(state, stream, nrounds);
}

void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);

/*
 * Dispatch to the architecture implementation when
 * CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is enabled, else to the generic one.
 */
static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
{
        if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) {
                hchacha_block_generic(state, out, nrounds);
                return;
        }

        hchacha_block_arch(state, out, nrounds);
}

/*
 * The four constant state words.  Each value is four consecutive ASCII
 * bytes of the string "expand 32-byte k" packed as a little-endian 32-bit
 * word ("expa", "nd 3", "2-by", "te k"), which is what the enumerator
 * suffixes spell out.
 */
enum chacha_constants { /* expand 32-byte k */
        CHACHA_CONSTANT_EXPA = 0x61707865U,
        CHACHA_CONSTANT_ND_3 = 0x3320646eU,
        CHACHA_CONSTANT_2_BY = 0x79622d32U,
        CHACHA_CONSTANT_TE_K = 0x6b206574U
};

/* Store the four CHACHA_CONSTANT_* words into state[0..3]. */
static inline void chacha_init_consts(u32 *state)
{
        u32 *word = state;

        *word++ = CHACHA_CONSTANT_EXPA;
        *word++ = CHACHA_CONSTANT_ND_3;
        *word++ = CHACHA_CONSTANT_2_BY;
        *word   = CHACHA_CONSTANT_TE_K;
}

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);

/*
 * Fill the 16-word ChaCha state:
 *   words  0..3   constants (chacha_init_consts())
 *   words  4..11  the eight words of @key, copied as-is
 *   words 12..15  the 16 bytes of @iv, read with get_unaligned_le32()
 */
static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
{
        int i;

        chacha_init_consts(state);

        for (i = 0; i < 8; i++)
                state[4 + i] = key[i];

        for (i = 0; i < 4; i++)
                state[12 + i] = get_unaligned_le32(iv + 4 * i);
}

/*
 * Initialise the ChaCha state, using the architecture implementation when
 * CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is enabled and the generic one
 * otherwise.
 */
static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv)
{
        if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) {
                chacha_init_generic(state, key, iv);
                return;
        }

        chacha_init_arch(state, key, iv);
}

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
                       unsigned int bytes, int nrounds);
void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
                          unsigned int bytes, int nrounds);

/*
 * Process @bytes bytes from @src into @dst with @nrounds rounds, choosing
 * the architecture implementation when CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA
 * is enabled and the generic one otherwise.
 */
static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src,
                                unsigned int bytes, int nrounds)
{
        if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) {
                chacha_crypt_generic(state, dst, src, bytes, nrounds);
                return;
        }

        chacha_crypt_arch(state, dst, src, bytes, nrounds);
}

/* Convenience wrapper: chacha_crypt() fixed to 20 rounds. */
static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src,
                                  unsigned int bytes)
{
        const int nrounds = 20;

        chacha_crypt(state, dst, src, bytes, nrounds);
}

#endif /* _CRYPTO_CHACHA_H */












































































































    1 








    1 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Argument bundle taken by kernel_clone() and sched_post_fork(), both
 * declared further down in this header.
 */
struct kernel_clone_args {
        u64 flags;
        /* the next three are user-space pointers (__user) */
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        int exit_signal;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        int io_thread;
        struct cgroup *cgrp;
        struct css_set *cset;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

#ifdef CONFIG_PROVE_RCU
extern int lockdep_tasklist_lock_is_held(void);
#endif /* #ifdef CONFIG_PROVE_RCU */

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_post_fork(struct task_struct *p,
                            struct kernel_clone_args *kargs);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(unsigned long, unsigned long, unsigned long,
                       struct task_struct *, unsigned long);

extern void flush_thread(void);

#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
/* Without CONFIG_HAVE_EXIT_THREAD the hook is an empty stub. */
static inline void exit_thread(struct task_struct *tsk) { }
#endif
extern void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
/*
 * No-op on !CONFIG_SMP builds.  Expands to do { } while (0) rather than a
 * bare {} so that "sched_exec();" is exactly one statement and remains
 * valid as the body of an unbraced if that is followed by an else.
 */
#define sched_exec()   do { } while (0)
#endif

/*
 * Take a reference on @t by incrementing t->usage.
 *
 * Returns @t itself so the call can be used inline in an expression.
 * The reference is dropped with put_task_struct().
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * Drop one reference on @t; when t->usage reaches zero the task struct is
 * released, either directly or deferred through RCU (see below).
 */
static inline void put_task_struct(struct task_struct *t)
{
        if (refcount_dec_and_test(&t->usage)) {
                /*
                 * Last reference gone.
                 *
                 * Under PREEMPT_RT, __put_task_struct() must not be called
                 * from atomic context because it indirectly acquires
                 * sleeping locks.  In that case the release is handed to
                 * call_rcu(), which will invoke __put_task_struct_rcu_cb()
                 * later.
                 *
                 * Using ->rcu here cannot "conflict" with
                 * put_task_struct_rcu_user(), which abuses ->rcu the same
                 * way: rcu_users holds a reference, so task->usage can't
                 * be zero after the rcu_users 1 -> 0 transition.
                 *
                 * delayed_free_task() also uses ->rcu, but it is only
                 * called when forking a process fails, so it cannot
                 * conflict with put_task_struct() either.
                 */
                if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
                        __put_task_struct(t);
                else
                        call_rcu(&t->rcu, __put_task_struct_rcu_cb);
        }
}

DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))

/*
 * Drop @nr references on @t at once; if that brings t->usage to zero,
 * release the task struct via __put_task_struct().
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (!refcount_sub_and_test(nr, &t->usage))
                return;

        __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

/*
 * Size of struct task_struct: a variable when the architecture selects
 * CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT, otherwise simply
 * sizeof(struct task_struct).
 */
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * Default for architectures that have not declared a thread_struct
 * whitelist: assume anything in there may need to be copied to userspace,
 * i.e. report everything from the start of ->thread to the end of the
 * task struct.
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        const size_t thread_off = offsetof(struct task_struct, thread);

        *offset = 0;
        /* arch_task_struct_size covers a dynamically sized thread_struct. */
        *size = arch_task_struct_size - thread_off;
}
#endif

/*
 * Return t->stack_vm_area when CONFIG_VMAP_STACK is enabled, NULL
 * otherwise.
 */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
#ifdef CONFIG_VMAP_STACK
        return t->stack_vm_area;
#else
        return NULL;
#endif
}

/*
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
/* Acquire p->alloc_lock (a spinlock) for the purposes listed above. */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* Release p->alloc_lock taken by task_lock(). */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

#endif /* _LINUX_SCHED_TASK_H */
































    5 
    5 

    5 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Lock-less NULL terminated single linked list
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>


/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head for your lock-less list
 *
 * Return whether list is empty before adding.
 */
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
                     struct llist_head *head)
{
        struct llist_node *first;

        do {
                new_last->next = first = READ_ONCE(head->first);
        } while (cmpxchg(&head->first, first, new_first) != first);

        return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);

/**
 * llist_del_first - delete the first entry of lock-less list
 * @head:        the head for your lock-less list
 *
 * If list is empty, return NULL, otherwise, return the first entry
 * deleted, this is the newest added one.
 *
 * Only one llist_del_first user can be used simultaneously with
 * multiple llist_add users without lock.  Because otherwise
 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
 * llist_add) sequence in another user may change @head->first->next,
 * but keep @head->first.  If multiple consumers are needed, please
 * use llist_del_all or use lock between consumers.
 */
struct llist_node *llist_del_first(struct llist_head *head)
{
        struct llist_node *entry, *seen, *next;

        entry = smp_load_acquire(&head->first);
        while (entry != NULL) {
                next = READ_ONCE(entry->next);
                /* Try to swing head past @entry; retry with whatever we lost to. */
                seen = cmpxchg(&head->first, entry, next);
                if (seen == entry)
                        return entry;
                entry = seen;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(llist_del_first);

/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:        first item of the list to be reversed
 *
 * Walks the chain once, relinking every node in place so that it points to
 * its former predecessor.  Returns the new first entry (the former last
 * one), or NULL when @head is NULL.
 */
struct llist_node *llist_reverse_order(struct llist_node *head)
{
        struct llist_node *reversed = NULL;
        struct llist_node *next;

        for (; head; head = next) {
                next = head->next;
                head->next = reversed;
                reversed = head;
        }

        return reversed;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);










































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Directory notifications for Linux.
 *
 * Copyright (C) 2000,2001,2002 Stephen Rothwell
 *
 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 * dnotify was largely rewritten to use the new fsnotify infrastructure
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/dnotify.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>

/* When 0, fcntl_dirnotify() rejects every request with -EINVAL. */
int dir_notify_enable __read_mostly = 1;

/* Slab caches and the fsnotify group; all three are set up in dnotify_init(). */
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
static struct fsnotify_group *dnotify_group __read_mostly;

/*
 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
 * is being watched by dnotify.  If multiple userspace applications are watching
 * the same directory with dnotify their information is chained in dn
 */
struct dnotify_mark {
        /* embedded fsnotify mark; container_of() maps it back to this struct */
        struct fsnotify_mark fsn_mark;
        /* head of the singly linked list of watchers (linked via ->dn_next) */
        struct dnotify_struct *dn;
};

/*
 * Recompute the event mask of a dnotify mark after a watcher was added or
 * removed.  The new mask is the union of the dn_mask of every
 * dnotify_struct chained on the mark, with FS_DN_MULTISHOT stripped.  If the
 * result differs from the current mark mask, store it and let fsnotify
 * recalculate the mask of the connector the mark hangs off.
 *
 * The caller must hold fsn_mark->lock.
 */
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *cur;
        __u32 wanted = 0;

        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        assert_spin_locked(&fsn_mark->lock);

        cur = dn_mark->dn;
        while (cur) {
                wanted |= cur->dn_mask & ~FS_DN_MULTISHOT;
                cur = cur->dn_next;
        }

        if (fsn_mark->mask != wanted) {
                fsn_mark->mask = wanted;
                fsnotify_recalc_mask(fsn_mark->connector);
        }
}

/*
 * Main fsnotify callback through which events are delivered to dnotify.
 * Walk the dnotify structs chained on the mark and, for each one whose mask
 * matches the event, send SIGIO-style notification to the owner of its
 * file.  A struct that was not registered with FS_DN_MULTISHOT is unlinked
 * and freed after its first delivery, and the mark's mask is recalculated.
 *
 * Always returns 0.
 */
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
                                struct inode *inode, struct inode *dir,
                                const struct qstr *name, u32 cookie)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct **link;
        struct dnotify_struct *dn;
        __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;

        /* not a dir, dnotify doesn't care */
        if (!dir && !(mask & FS_ISDIR))
                return 0;

        dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);

        spin_lock(&inode_mark->lock);
        for (link = &dn_mark->dn; (dn = *link) != NULL; ) {
                if (dn->dn_mask & test_mask) {
                        send_sigio(&dn->dn_filp->f_owner, dn->dn_fd, POLL_MSG);
                        if (!(dn->dn_mask & FS_DN_MULTISHOT)) {
                                /* one-shot watch: drop it, *link now names the successor */
                                *link = dn->dn_next;
                                kmem_cache_free(dnotify_struct_cache, dn);
                                dnotify_recalc_inode_mask(inode_mark);
                                continue;
                        }
                }
                link = &dn->dn_next;
        }
        spin_unlock(&inode_mark->lock);

        return 0;
}

static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *dn_mark = container_of(fsn_mark,
                                                    struct dnotify_mark,
                                                    fsn_mark);

        BUG_ON(dn_mark->dn);

        kmem_cache_free(dnotify_mark_cache, dn_mark);
}

/* Callbacks handed to fsnotify_alloc_group() in dnotify_init(). */
static const struct fsnotify_ops dnotify_fsnotify_ops = {
        .handle_inode_event = dnotify_handle_event,
        .free_mark = dnotify_free_mark,
};

/*
 * Called every time a file is closed (and from fcntl_dirnotify() to remove
 * a watch explicitly).  Looks for a dnotify mark on the inode; if there is
 * one, removes the dnotify_struct registered for this (@id, @filp) pair.
 * When that was the last dnotify_struct on the mark, the fsnotify_mark is
 * detached and freed as well.  Non-directories are ignored.
 */
void dnotify_flush(struct file *filp, fl_owner_t id)
{
        struct inode *inode = file_inode(filp);
        struct fsnotify_mark *fsn_mark;
        struct dnotify_mark *dn_mark;
        struct dnotify_struct **link;
        struct dnotify_struct *dn;
        bool detached = false;

        if (!S_ISDIR(inode->i_mode))
                return;

        fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
        if (!fsn_mark)
                return;
        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        fsnotify_group_lock(dnotify_group);

        spin_lock(&fsn_mark->lock);
        for (link = &dn_mark->dn; (dn = *link) != NULL; link = &dn->dn_next) {
                if (dn->dn_owner != id || dn->dn_filp != filp)
                        continue;

                *link = dn->dn_next;
                kmem_cache_free(dnotify_struct_cache, dn);
                dnotify_recalc_inode_mask(fsn_mark);
                break;
        }
        spin_unlock(&fsn_mark->lock);

        /*
         * nothing else could have found us thanks to the dnotify_groups
         * mark_mutex
         */
        if (!dn_mark->dn) {
                fsnotify_detach_mark(fsn_mark);
                detached = true;
        }

        fsnotify_group_unlock(dnotify_group);

        if (detached)
                fsnotify_free_mark(fsn_mark);
        /* drop the reference obtained from fsnotify_find_mark() */
        fsnotify_put_mark(fsn_mark);
}

/*
 * Translate the userspace DN_* bits in @arg into the internal FS_* event
 * mask.  FS_EVENT_ON_CHILD is always included.  This conversion is done only
 * at watch creation.
 */
static __u32 convert_arg(unsigned long arg)
{
        __u32 new_mask = FS_EVENT_ON_CHILD;

        new_mask |= (arg & DN_MULTISHOT) ? FS_DN_MULTISHOT : 0;
        new_mask |= (arg & DN_DELETE) ? (FS_DELETE | FS_MOVED_FROM) : 0;
        new_mask |= (arg & DN_MODIFY) ? FS_MODIFY : 0;
        new_mask |= (arg & DN_ACCESS) ? FS_ACCESS : 0;
        new_mask |= (arg & DN_ATTRIB) ? FS_ATTRIB : 0;
        new_mask |= (arg & DN_RENAME) ? FS_RENAME : 0;
        new_mask |= (arg & DN_CREATE) ? (FS_CREATE | FS_MOVED_TO) : 0;

        return new_mask;
}

/*
 * If multiple processes watch the same inode with dnotify there is only one
 * dnotify mark in inode->i_fsnotify_marks, with a dnotify_struct chained
 * onto it per watcher.  This function either ORs @mask into the existing
 * dnotify_struct for (@id, @filp), also updating its fd, and returns
 * -EEXIST with @dn left unused, or fills in @dn, pushes it on the front of
 * the chain and returns 0.
 */
static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
                     fl_owner_t id, int fd, struct file *filp, __u32 mask)
{
        struct dnotify_struct *cur;

        for (cur = dn_mark->dn; cur; cur = cur->dn_next) {
                if (cur->dn_owner != id || cur->dn_filp != filp)
                        continue;

                /* adding more events to an existing dnotify_struct */
                cur->dn_fd = fd;
                cur->dn_mask |= mask;
                return -EEXIST;
        }

        dn->dn_next = dn_mark->dn;
        dn->dn_owner = id;
        dn->dn_filp = filp;
        dn->dn_fd = fd;
        dn->dn_mask = mask;
        dn_mark->dn = dn;

        return 0;
}

/*
 * When a process calls fcntl to attach a dnotify watch to a directory it ends
 * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
 * attached to the fsnotify_mark.
 *
 * @fd:   descriptor the request was made on; looked up again below to detect
 *        a close that raced with this call
 * @filp: the open file the watch is attached to
 * @arg:  userspace DN_* bits; no bits besides DN_MULTISHOT means "remove the
 *        watch"
 *
 * Returns 0 on success (this includes watch removal and the lost-race case),
 * -EINVAL when dir_notify_enable is 0, -ENOTDIR when @filp is not a
 * directory, -ENOMEM on allocation failure, or the error reported by
 * security_path_notify() or fsnotify_add_inode_mark_locked().
 */
int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
{
        struct dnotify_mark *new_dn_mark, *dn_mark;
        struct fsnotify_mark *new_fsn_mark, *fsn_mark;
        struct dnotify_struct *dn;
        struct inode *inode;
        fl_owner_t id = current->files;
        struct file *f;
        int destroy = 0, error = 0;
        __u32 mask;

        /* we use these to tell if we need to kfree */
        new_fsn_mark = NULL;
        dn = NULL;

        if (!dir_notify_enable) {
                error = -EINVAL;
                goto out_err;
        }

        /* a 0 mask means we are explicitly removing the watch */
        if ((arg & ~DN_MULTISHOT) == 0) {
                dnotify_flush(filp, id);
                error = 0;
                goto out_err;
        }

        /* dnotify only works on directories */
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode)) {
                error = -ENOTDIR;
                goto out_err;
        }

        /*
         * convert the userspace DN_* "arg" to the internal FS_*
         * defined in fsnotify
         */
        mask = convert_arg(arg);

        error = security_path_notify(&filp->f_path, mask,
                        FSNOTIFY_OBJ_TYPE_INODE);
        if (error)
                goto out_err;

        /* expect most fcntl to add new rather than augment old */
        dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
        if (!dn) {
                error = -ENOMEM;
                goto out_err;
        }

        /* new fsnotify mark, we expect most fcntl calls to add a new mark */
        new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
        if (!new_dn_mark) {
                error = -ENOMEM;
                goto out_err;
        }

        /* set up the new_fsn_mark and new_dn_mark */
        new_fsn_mark = &new_dn_mark->fsn_mark;
        fsnotify_init_mark(new_fsn_mark, dnotify_group);
        new_fsn_mark->mask = mask;
        new_dn_mark->dn = NULL;

        /* this is needed to prevent the fcntl/close race described below */
        fsnotify_group_lock(dnotify_group);

        /* add the new_fsn_mark or find an old one. */
        fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
        if (fsn_mark) {
                dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
                spin_lock(&fsn_mark->lock);
        } else {
                error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
                if (error) {
                        fsnotify_group_unlock(dnotify_group);
                        goto out_err;
                }
                spin_lock(&new_fsn_mark->lock);
                fsn_mark = new_fsn_mark;
                dn_mark = new_dn_mark;
                /* we used new_fsn_mark, so don't free it */
                new_fsn_mark = NULL;
        }

        /* re-resolve fd; the result is only compared against filp, never dereferenced */
        rcu_read_lock();
        f = lookup_fd_rcu(fd);
        rcu_read_unlock();

        /* if (f != filp) means that we lost a race and another task/thread
         * actually closed the fd we are still playing with before we grabbed
         * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
         * fd is the only time we clean up the marks we need to get our mark
         * off the list. */
        if (f != filp) {
                /* if we added ourselves, shoot ourselves, it's possible that
                 * the flush actually did shoot this fsn_mark.  That's fine too
                 * since multiple calls to destroy_mark is perfectly safe, if
                 * we found a dn_mark already attached to the inode, just sod
                 * off silently as the flush at close time dealt with it.
                 */
                if (dn_mark == new_dn_mark)
                        destroy = 1;
                error = 0;
                goto out;
        }

        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);

        error = attach_dn(dn, dn_mark, id, fd, filp, mask);
        /* !error means that we attached the dn to the dn_mark, so don't free it */
        if (!error)
                dn = NULL;
        /* -EEXIST means that we didn't add this new dn and used an old one.
         * that isn't an error (and the unused dn should be freed) */
        else if (error == -EEXIST)
                error = 0;

        dnotify_recalc_inode_mask(fsn_mark);
out:
        /* reached with fsn_mark->lock and the group lock held */
        spin_unlock(&fsn_mark->lock);

        if (destroy)
                fsnotify_detach_mark(fsn_mark);
        fsnotify_group_unlock(dnotify_group);
        if (destroy)
                fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
out_err:
        /* release whatever was allocated above but ended up unused */
        if (new_fsn_mark)
                fsnotify_put_mark(new_fsn_mark);
        if (dn)
                kmem_cache_free(dnotify_struct_cache, dn);
        return error;
}

/*
 * Boot-time setup: create the slab caches for dnotify_struct and
 * dnotify_mark and allocate the fsnotify group used by dnotify.  Both caches
 * are created with SLAB_PANIC, and a failed group allocation panics
 * explicitly, so the function only ever returns 0.
 */
static int __init dnotify_init(void)
{
        dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
                                          SLAB_PANIC|SLAB_ACCOUNT);
        dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);

        dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
                                             FSNOTIFY_GROUP_NOFS);
        if (IS_ERR(dnotify_group))
                panic("unable to allocate fsnotify group for dnotify\n");
        return 0;
}

module_init(dnotify_init)
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BARRIER_H
#define _ASM_X86_BARRIER_H

#include <asm/alternative.h>
#include <asm/nops.h>

/*
 * Force strict CPU ordering.
 * And yes, this might be required on UP too when we're talking
 * to devices.
 */

/*
 * mb()/rmb()/wmb(): full, read and write barriers.
 *
 * 32-bit: each macro names a locked add of zero to the stack slot at
 * -4(%esp) as the default instruction and mfence/lfence/sfence as the
 * ALTERNATIVE() replacement keyed on X86_FEATURE_XMM2.  Both "memory" and
 * "cc" are clobbered because the add modifies the flags.
 *
 * 64-bit: the fence instructions are emitted directly; only "memory" is
 * clobbered.
 */
#ifdef CONFIG_X86_32
#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \
                                      X86_FEATURE_XMM2) ::: "memory", "cc")
#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \
                                       X86_FEATURE_XMM2) ::: "memory", "cc")
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
                                       X86_FEATURE_XMM2) ::: "memory", "cc")
#else
#define mb()         asm volatile("mfence":::"memory")
#define rmb()        asm volatile("lfence":::"memory")
#define wmb()        asm volatile("sfence" ::: "memory")
#endif

/**
 * array_index_mask_nospec() - generate a mask that is ~0UL when the
 *         bounds check succeeds and 0 otherwise
 * @index: array element index
 * @size: number of elements in array
 *
 * The mask is produced by a compare followed by a subtract-with-borrow of
 * the result register from itself; the sequence contains no conditional
 * branch.
 *
 * Returns:
 *     0 - (index < size)
 *     i.e. all bits set when @index is below @size, zero otherwise.
 */
static inline unsigned long array_index_mask_nospec(unsigned long index,
                unsigned long size)
{
        unsigned long mask;

        /*
         * %1 is @size ("g": register, memory or immediate), %2 is @index
         * (register).  cmp sets the carry flag when index < size as an
         * unsigned comparison; sbb %0,%0 then leaves 0 - carry in mask.
         * The flags are modified, hence the "cc" clobber.
         */
        asm volatile ("cmp %1,%2; sbb %0,%0;"
                        :"=r" (mask)
                        :"g"(size),"r" (index)
                        :"cc");
        return mask;
}

/*
 * Override the default implementation from linux/nospec.h.  Defining the
 * name as a macro that expands to itself lets generic code detect, with
 * #ifndef, that this architecture supplies its own version.
 */
#define array_index_mask_nospec array_index_mask_nospec

/*
 * Prevent speculative execution past this barrier.  Expands to nothing by
 * default, with lfence as the alternative keyed on X86_FEATURE_LFENCE_RDTSC.
 */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)

/*
 * DMA read/write barriers both expand to barrier() (defined elsewhere;
 * presumably a compiler-only barrier -- confirm in the compiler headers).
 */
#define dma_rmb()        barrier()
#define dma_wmb()        barrier()

/*
 * SMP barrier primitives picked up by asm-generic/barrier.h.
 *
 * __smp_mb() is a locked add of zero to the stack slot just below the stack
 * pointer (%esp on 32-bit, %rsp on 64-bit) rather than an mfence.
 * __smp_rmb() reuses dma_rmb() and __smp_wmb() is barrier().
 * __smp_store_mb() performs the store as an xchg() whose result is
 * discarded.
 */
#ifdef CONFIG_X86_32
#define __smp_mb()        asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
#else
#define __smp_mb()        asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
#endif
#define __smp_rmb()        dma_rmb()
#define __smp_wmb()        barrier()
#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)

/*
 * __smp_store_release(p, v): store @v to *@p with release semantics.
 *
 * Expands to a compile-time check on the type of *@p, a barrier(), and then
 * a single WRITE_ONCE() store; the macro itself emits no fence instruction.
 * @p is expanded more than once, so it must not have side effects.
 */
#define __smp_store_release(p, v)                                        \
do {                                                                        \
        compiletime_assert_atomic_type(*p);                                \
        barrier();                                                        \
        WRITE_ONCE(*p, v);                                                \
} while (0)

/*
 * __smp_load_acquire(p): load *@p with acquire semantics.
 *
 * A statement expression that reads *@p once with READ_ONCE(), performs a
 * compile-time check on the type of *@p, issues barrier(), and evaluates to
 * the value that was read.  @p is expanded more than once, so it must not
 * have side effects.
 */
#define __smp_load_acquire(p)                                                \
({                                                                        \
        typeof(*p) ___p1 = READ_ONCE(*p);                                \
        compiletime_assert_atomic_type(*p);                                \
        barrier();                                                        \
        ___p1;                                                                \
})

/* Atomic operations are already serializing on x86 */
#define __smp_mb__before_atomic()        do { } while (0)
#define __smp_mb__after_atomic()        do { } while (0)

#include <asm-generic/barrier.h>

#endif /* _ASM_X86_BARRIER_H */

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __IPC_NAMESPACE_H__
#define __IPC_NAMESPACE_H__

#include <linux/err.h>
#include <linux/idr.h>
#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>

struct user_namespace;

/*
 * Bookkeeping for one family of IPC identifiers.  struct ipc_namespace
 * embeds an array of three of these (ids[3]).
 *
 * NOTE(review): the member notes below are inferred from names and types
 * only; this header does not show how the fields are used -- confirm
 * against the IPC core before relying on them.
 */
struct ipc_ids {
        int in_use;                /* presumably the number of ids currently allocated */
        unsigned short seq;        /* presumably a sequence counter for id generation */
        struct rw_semaphore rwsem; /* presumably serialises updates to this set */
        struct idr ipcs_idr;       /* IDR holding the objects of this family */
        int max_idx;
        int last_idx;        /* For wrap around detection */
#ifdef CONFIG_CHECKPOINT_RESTORE
        int next_id;               /* only present with CONFIG_CHECKPOINT_RESTORE */
#endif
        struct rhashtable key_ht;  /* presumably a lookup table keyed by IPC key */
};

/*
 * Per-namespace IPC state: one struct ipc_ids per identifier family, the
 * limits/counters whose prefixes are sem_, msg_ and shm_, and the POSIX
 * message queue (mq_) settings.
 *
 * The layout is subject to __randomize_layout, so code must not depend on
 * member order.
 */
struct ipc_namespace {
        /* Reference count; taken by get_ipc_ns() and get_ipc_ns_not_zero(). */
        refcount_t        count;
        struct ipc_ids        ids[3];

        int                sem_ctls[4];
        int                used_sems;

        unsigned int        msg_ctlmax;
        unsigned int        msg_ctlmnb;
        unsigned int        msg_ctlmni;
        atomic_t        msg_bytes;
        atomic_t        msg_hdrs;

        size_t                shm_ctlmax;
        size_t                shm_ctlall;
        unsigned long        shm_tot;
        int                shm_ctlmni;
        /*
         * Defines whether IPC_RMID is forced for _all_ shm segments regardless
         * of shmctl()
         */
        int                shm_rmid_forced;

        struct notifier_block ipcns_nb;

        /* The kern_mount of the mqueuefs sb.  We take a ref on it */
        struct vfsmount        *mq_mnt;

        /* # queues in this ns, protected by mq_lock */
        unsigned int    mq_queues_count;

        /* next fields are set through sysctl */
        unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
        unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
        unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
        unsigned int    mq_msg_default;
        unsigned int    mq_msgsize_default;

        /* user_ns which owns the ipc ns */
        struct user_namespace *user_ns;
        struct ucounts *ucounts;

        /* NOTE(review): list linkage whose user is not visible in this header. */
        struct llist_node mnt_llist;

        struct ns_common ns;
} __randomize_layout;

extern struct ipc_namespace init_ipc_ns;
extern spinlock_t mq_lock;

#ifdef CONFIG_SYSVIPC
extern void shm_destroy_orphaned(struct ipc_namespace *ns);
#else /* CONFIG_SYSVIPC */
/* Without CONFIG_SYSVIPC this hook is an empty stub. */
static inline void shm_destroy_orphaned(struct ipc_namespace *ns)
{
}
#endif /* CONFIG_SYSVIPC */

#ifdef CONFIG_POSIX_MQUEUE
extern int mq_init_ns(struct ipc_namespace *ns);
/*
 * POSIX Message Queue default values:
 *
 * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
 * DFLT_*MAX: Default values for the maximum unprivileged limits
 * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
 *   an attribute to the open call and the queue must be created
 * HARD_*: Highest value the maximums can be set to.  These are enforced
 *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
 *   suitably high)
 *
 * POSIX Requirements:
 *   Per app minimum openable message queues - 8.  This does not map well
 *     to the fact that we limit the number of queues on a per namespace
 *     basis instead of a per app basis.  So, make the default high enough
 *     that no given app should have a hard time opening 8 queues.
 *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
 *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
 *     we have run into a situation where running applications in the wild
 *     require this to be at least 5MB, and preferably 10MB, so I set the
 *     value to 16MB in hopes that this user is the worst of the bunch and
 *     the new maximum will handle anyone else.  I may have to revisit this
 *     in the future.
 */
#define DFLT_QUEUESMAX                      256
#define MIN_MSGMAX                        1
#define DFLT_MSG                       10U
#define DFLT_MSGMAX                       10
#define HARD_MSGMAX                    65536
#define MIN_MSGSIZEMAX                      128
#define DFLT_MSGSIZE                     8192U
#define DFLT_MSGSIZEMAX                     8192
#define HARD_MSGSIZEMAX            (16*1024*1024)
#else
/*
 * Without CONFIG_POSIX_MQUEUE there is nothing to set up for @ns; report
 * success (0) unconditionally.
 */
static inline int mq_init_ns(struct ipc_namespace *ns)
{
        return 0;
}
#endif

#if defined(CONFIG_IPC_NS)
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns);

/*
 * Take an additional reference on @ns.
 *
 * A NULL @ns is tolerated and handed back unchanged; otherwise ns->count is
 * incremented and @ns is returned.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        if (!ns)
                return NULL;

        refcount_inc(&ns->count);
        return ns;
}

/*
 * Take a reference on @ns only when refcount_inc_not_zero() succeeds on
 * ns->count.
 *
 * Returns @ns on success, or NULL when @ns is NULL or the increment was
 * refused.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        if (ns && refcount_inc_not_zero(&ns->count))
                return ns;

        return NULL;
}

extern void put_ipc_ns(struct ipc_namespace *ns);
#else
/*
 * Stub used when CONFIG_IPC_NS is not defined: a CLONE_NEWIPC request is
 * rejected with ERR_PTR(-EINVAL); any other @flags value gets @ns back
 * unchanged.  @user_ns is unused.
 */
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns)
{
        return (flags & CLONE_NEWIPC) ? ERR_PTR(-EINVAL) : ns;
}

/*
 * Stub used when CONFIG_IPC_NS is not defined: no reference counting is
 * performed, @ns is simply returned.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        return ns;
}

/*
 * Stub used when CONFIG_IPC_NS is not defined: there is no count to test,
 * so @ns is returned as-is.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        return ns;
}

/* Stub used when CONFIG_IPC_NS is not defined: there is nothing to release. */
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
        /* intentionally empty */
}
#endif

#ifdef CONFIG_POSIX_MQUEUE_SYSCTL

struct ctl_table_header;
extern struct ctl_table_header *mq_register_sysctl_table(void);

#else /* CONFIG_POSIX_MQUEUE_SYSCTL */

/*
 * Stub used when CONFIG_POSIX_MQUEUE_SYSCTL is not defined: no table is
 * registered and NULL is returned.
 */
static inline struct ctl_table_header *mq_register_sysctl_table(void)
{
        struct ctl_table_header *none = NULL;

        return none;
}

#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
#endif
































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NET                Generic infrastructure for INET connection oriented protocols.
 *
 *                Definitions for inet_connection_sock 
 *
 * Authors:        Many people, see the TCP sources
 *
 *                 From code originally in TCP
 */
#ifndef _INET_CONNECTION_SOCK_H
#define _INET_CONNECTION_SOCK_H

#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/poll.h>
#include <linux/kernel.h>
#include <linux/sockptr.h>

#include <net/inet_sock.h>
#include <net/request_sock.h>

/* Cancel timers, when they are not required. */
#undef INET_CSK_CLEAR_TIMERS

struct inet_bind_bucket;
struct tcp_congestion_ops;

/*
 * Pointers to address related TCP functions
 * (i.e. things that depend on the address family)
 */
/*
 * Table of address-family specific callbacks and header-size constants.
 * A connection socket refers to its table through
 * inet_connection_sock.icsk_af_ops.
 *
 * NOTE(review): the callbacks' contracts are defined by their implementers
 * and callers, which are not visible in this header.
 */
struct inet_connection_sock_af_ops {
        int            (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
        void            (*send_check)(struct sock *sk, struct sk_buff *skb);
        int            (*rebuild_header)(struct sock *sk);
        void            (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb);
        int            (*conn_request)(struct sock *sk, struct sk_buff *skb);
        /* Returns a struct sock pointer; *own_req is an output flag. */
        struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb,
                                      struct request_sock *req,
                                      struct dst_entry *dst,
                                      struct request_sock *req_unhash,
                                      bool *own_req);
        /* Size constants; presumably byte counts for this family -- confirm. */
        u16            net_header_len;
        u16            net_frag_header_len;
        u16            sockaddr_len;
        /* setsockopt takes a sockptr_t; getsockopt takes raw __user pointers. */
        int            (*setsockopt)(struct sock *sk, int level, int optname,
                                  sockptr_t optval, unsigned int optlen);
        int            (*getsockopt)(struct sock *sk, int level, int optname,
                                  char __user *optval, int __user *optlen);
        void            (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
        void            (*mtu_reduced)(struct sock *sk);
};

/**
 * struct inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:           FIFO of established children
 * @icsk_bind_hash:           Bind node
 * @icsk_timeout:           Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:                   Retransmit timeout
 * @icsk_pmtu_cookie:           Last pmtu seen by socket
 * @icsk_ca_ops:           Pluggable congestion control hook
 * @icsk_af_ops:           Operations which are AF_INET{4,6} specific
 * @icsk_ulp_ops:           Pluggable ULP control hook
 * @icsk_ulp_data:           ULP private data
 * @icsk_clean_acked:           Clean acked data hook
 * @icsk_listen_portaddr_node:        hash to the portaddr listener hashtable
 * @icsk_ca_state:           Congestion control state
 * @icsk_retransmits:           Number of unrecovered [RTO] timeouts
 * @icsk_pending:           Scheduled timer event
 * @icsk_backoff:           Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:           unanswered 0 window probes
 * @icsk_ext_hdr_len:           Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:                   Delayed ACK control data
 * @icsk_mtup:                   MTU probing control data
 * @icsk_probes_tstamp:    Probe timestamp (cleared by non-zero window ack)
 * @icsk_user_timeout:           TCP_USER_TIMEOUT value
 */
struct inet_connection_sock {
        /* inet_sock has to be the first member! */
        struct inet_sock          icsk_inet;
        struct request_sock_queue icsk_accept_queue;
        struct inet_bind_bucket          *icsk_bind_hash;
        /* Expiry (jiffies + delay) last programmed by inet_csk_reset_xmit_timer(). */
        unsigned long                  icsk_timeout;
        /* Armed for every ICSK_TIME_* event except ICSK_TIME_DACK. */
         struct timer_list          icsk_retransmit_timer;
        /* Armed for ICSK_TIME_DACK. */
         struct timer_list          icsk_delack_timer;
        __u32                          icsk_rto;
        __u32                     icsk_rto_min;
        __u32                     icsk_delack_max;
        __u32                          icsk_pmtu_cookie;
        const struct tcp_congestion_ops *icsk_ca_ops;
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        /* Non-NULL means a ULP is attached; see inet_csk_has_ulp(). */
        const struct tcp_ulp_ops  *icsk_ulp_ops;
        void __rcu                  *icsk_ulp_data;
        void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
        struct hlist_node         icsk_listen_portaddr_node;
        unsigned int                  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
        /* Four bitfields packed into one byte (5 + 1 + 1 + 1 bits). */
        __u8                          icsk_ca_state:5,
                                  icsk_ca_initialized:1,
                                  icsk_ca_setsockopt:1,
                                  icsk_ca_dst_locked:1;
        __u8                          icsk_retransmits;
        /*
         * ICSK_TIME_* id stored by inet_csk_reset_xmit_timer(); reset to 0 by
         * inet_csk_clear_xmit_timer().
         */
        __u8                          icsk_pending;
        /* Left-shift applied to icsk_rto in inet_csk_rto_backoff(). */
        __u8                          icsk_backoff;
        __u8                          icsk_syn_retries;
        __u8                          icsk_probes_out;
        __u16                          icsk_ext_hdr_len;
        /*
         * Delayed-ACK state, zeroed as a whole by inet_csk_delack_init().
         * pending carries ICSK_ACK_* bits; pingpong is compared against
         * TCP_PINGPONG_THRESH by inet_csk_in_pingpong_mode().
         */
        struct {
                __u8                  pending;         /* ACK is pending                           */
                __u8                  quick;         /* Scheduled number of quick acks           */
                __u8                  pingpong;         /* The session is interactive                   */
                __u8                  retry;         /* Number of attempts                           */
                __u32                  ato;                 /* Predicted tick of soft clock           */
                unsigned long          timeout;         /* Currently scheduled timeout                   */
                __u32                  lrcvtime;         /* timestamp of last received data packet */
                __u16                  last_seg_size; /* Size of last incoming segment           */
                __u16                  rcv_mss;         /* MSS used for delayed ACK decisions           */
        } icsk_ack;
        struct {
                int                  enabled;

                /* Range of MTUs to search */
                int                  search_high;
                int                  search_low;

                /* Information on the current probe. */
                int                  probe_size;

                u32                  probe_timestamp;
        } icsk_mtup;
        u32                          icsk_probes_tstamp;
        u32                          icsk_user_timeout;

        /*
         * Opaque area returned by inet_csk_ca().  The array spans 104 bytes,
         * which equals ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) for an 8-byte u64;
         * the two expressions must be kept in step.
         */
        u64                          icsk_ca_priv[104 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (13 * sizeof(u64))
};

#define ICSK_TIME_RETRANS        1        /* Retransmit timer */
#define ICSK_TIME_DACK                2        /* Delayed ack timer */
#define ICSK_TIME_PROBE0        3        /* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4        /* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE        5        /* Tail loss probe timer */
#define ICSK_TIME_REO_TIMEOUT        6        /* Reordering timer */

/*
 * View @sk as the inet_connection_sock it is embedded in.
 *
 * This is a plain pointer cast with no run-time check, and it drops the
 * const qualifier of @sk.  It relies on the containing structure starting
 * with the socket (icsk_inet "has to be the first member").
 */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
        struct inet_connection_sock *icsk = (struct inet_connection_sock *)sk;

        return icsk;
}

static inline void *inet_csk_ca(const struct sock *sk)
{
        return (void *)inet_csk(sk)->icsk_ca_priv;
}

struct sock *inet_csk_clone_lock(const struct sock *sk,
                                 const struct request_sock *req,
                                 const gfp_t priority);

/*
 * Single-bit flags.  ICSK_ACK_SCHED and ICSK_ACK_TIMER are OR-ed into
 * icsk_ack.pending by the helpers in this header.
 */
enum inet_csk_ack_state_t {
        ICSK_ACK_SCHED   = 1 << 0,
        ICSK_ACK_TIMER   = 1 << 1,
        ICSK_ACK_PUSHED  = 1 << 2,
        ICSK_ACK_PUSHED2 = 1 << 3,
        ICSK_ACK_NOW     = 1 << 4        /* Send the next ACK immediately (once) */
};

void inet_csk_init_xmit_timers(struct sock *sk,
                               void (*retransmit_handler)(struct timer_list *),
                               void (*delack_handler)(struct timer_list *),
                               void (*keepalive_handler)(struct timer_list *));
void inet_csk_clear_xmit_timers(struct sock *sk);
void inet_csk_clear_xmit_timers_sync(struct sock *sk);

static inline void inet_csk_schedule_ack(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
}

static inline int inet_csk_ack_scheduled(const struct sock *sk)
{
        return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED;
}

static inline void inet_csk_delack_init(struct sock *sk)
{
        memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}

void inet_csk_delete_keepalive_timer(struct sock *sk);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout);

/*
 * Clear the bookkeeping for one transmit-side timer.
 *
 * @what selects the timer:
 *   ICSK_TIME_RETRANS, ICSK_TIME_PROBE0 - icsk_pending is reset to 0;
 *   ICSK_TIME_DACK                      - icsk_ack.pending and
 *                                         icsk_ack.retry are reset to 0.
 * Any other value only produces a debug message.
 *
 * The timer itself is stopped only when INET_CSK_CLEAR_TIMERS is defined.
 */
static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        switch (what) {
        case ICSK_TIME_RETRANS:
        case ICSK_TIME_PROBE0:
                icsk->icsk_pending = 0;
#ifdef INET_CSK_CLEAR_TIMERS
                sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
#endif
                break;
        case ICSK_TIME_DACK:
                icsk->icsk_ack.pending = 0;
                icsk->icsk_ack.retry = 0;
#ifdef INET_CSK_CLEAR_TIMERS
                sk_stop_timer(sk, &icsk->icsk_delack_timer);
#endif
                break;
        default:
                pr_debug("inet_csk BUG: unknown timer value\n");
                break;
        }
}

/*
 * Arm one of the socket's transmit-side timers.
 *
 * @what:     ICSK_TIME_* id of the event to schedule
 * @when:     delay in jiffies; clamped to @max_when (with a debug message)
 *            when it is larger
 * @max_when: upper bound for @when
 *
 * ICSK_TIME_DACK goes to icsk_delack_timer and sets ICSK_ACK_TIMER in
 * icsk_ack.pending.  The other five known ids go to icsk_retransmit_timer
 * and are recorded in icsk_pending.  An unknown id only produces a debug
 * message.
 */
static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
                                             unsigned long when,
                                             const unsigned long max_when)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (when > max_when) {
                pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n",
                         sk, what, when, (void *)_THIS_IP_);
                when = max_when;
        }

        switch (what) {
        case ICSK_TIME_RETRANS:
        case ICSK_TIME_PROBE0:
        case ICSK_TIME_EARLY_RETRANS:
        case ICSK_TIME_LOSS_PROBE:
        case ICSK_TIME_REO_TIMEOUT:
                icsk->icsk_pending = what;
                icsk->icsk_timeout = jiffies + when;
                sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
                               icsk->icsk_timeout);
                break;
        case ICSK_TIME_DACK:
                icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
                icsk->icsk_ack.timeout = jiffies + when;
                sk_reset_timer(sk, &icsk->icsk_delack_timer,
                               icsk->icsk_ack.timeout);
                break;
        default:
                pr_debug("inet_csk BUG: unknown timer value\n");
                break;
        }
}

/*
 * Compute icsk_rto << icsk_backoff, capped at @max_when.
 *
 * The shift is carried out in 64 bits so that it cannot be truncated before
 * the cap is applied.
 */
static inline unsigned long
inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
                     unsigned long max_when)
{
        u64 backed_off = (u64)icsk->icsk_rto << icsk->icsk_backoff;

        if (backed_off > max_when)
                backed_off = max_when;

        return (unsigned long)backed_off;
}

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern);

int inet_csk_get_port(struct sock *sk, unsigned short snum);

struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
                                     const struct request_sock *req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
                                            struct sock *newsk,
                                            const struct request_sock *req);

struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
                                      struct request_sock *req,
                                      struct sock *child);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout);
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
                                         struct request_sock *req,
                                         bool own_req);

/* Forward to reqsk_queue_added() on the socket's accept queue. */
static inline void inet_csk_reqsk_queue_added(struct sock *sk)
{
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

        reqsk_queue_added(queue);
}

/* Return reqsk_queue_len() of the socket's accept queue. */
static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
{
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

        return reqsk_queue_len(queue);
}

/*
 * Report whether the request queue length is strictly greater than
 * sk->sk_max_ack_backlog.  The limit is read with READ_ONCE().
 *
 * Returns 1 when the length exceeds the limit, 0 otherwise.
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
        const int queued = inet_csk_reqsk_queue_len(sk);

        return queued > READ_ONCE(sk->sk_max_ack_backlog);
}

bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);

/*
 * Put @sk into the state expected before inet_csk_destroy_sock() is called:
 * set SOCK_DEAD on the socket and increment the per-CPU counter reached
 * through sk->sk_prot->orphan_count.
 */
static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
{
        /* The below has to be done to allow calling inet_csk_destroy_sock */
        sock_set_flag(sk, SOCK_DEAD);
        this_cpu_inc(*sk->sk_prot->orphan_count);
}

void inet_csk_destroy_sock(struct sock *sk);
void inet_csk_prepare_forced_close(struct sock *sk);

/*
 * Poll mask for a listening socket: readable (EPOLLIN | EPOLLRDNORM) when
 * the accept queue is not empty, 0 when it is empty.
 */
static inline __poll_t inet_csk_listen_poll(const struct sock *sk)
{
        if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
                return 0;

        return EPOLLIN | EPOLLRDNORM;
}

int inet_csk_listen_start(struct sock *sk, int backlog);
void inet_csk_listen_stop(struct sock *sk);

void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);

/* update the fast reuse flag when adding a socket */
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
                               struct sock *sk);

struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);

#define TCP_PINGPONG_THRESH        1

static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
}

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
        return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}

/*
 * True when @sk is flagged is_icsk and has a non-NULL icsk_ulp_ops pointer.
 * The ULP pointer is only examined once the is_icsk flag has been confirmed.
 */
static inline bool inet_csk_has_ulp(struct sock *sk)
{
        if (!inet_sk(sk)->is_icsk)
                return false;

        return inet_csk(sk)->icsk_ulp_ops != NULL;
}

/*
 * Initialise the two spinlocks embedded in the socket's accept queue:
 * rskq_lock and fastopenq.lock.
 */
static inline void inet_init_csk_locks(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        /*
         * NOTE(review): spin_lock_init() may record the spelling of its
         * argument (e.g. for lock debugging); keep these expressions as
         * written unless that has been checked.
         */
        spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
        spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
}

#endif /* _INET_CONNECTION_SOCK_H */






























































    3 







    1 







    1 


























    1 



















    1 














































    1 









    1 















































    3 

    3 






    1 
    1 
    3 













































    3 












    3 




























    3 











    3 
    3 














    3 




    1 
    1 
    1 












    3 


    3 










    3 






    3 
    1 


    1 

    1 



    1 







    3 





















    3 












    3 















    3 
















    1 
    1 
    3 


    3 
















    3 








    3 

































































































































































































    3 




    3 





















    3 





    3 

    3 













































































































































































    1 




    1 
























    3 





















    3 
















    3 





























    3 
    3 
    1 







    3 








    3 






    3 












    3 




    2 












    2 
    2 





























    2 

    1 






    1 









    2 












    3 



    3 











    3 
















    3 



    3 

    3 
    2 










    3 




























    3 


    3 





    3 
























    1 








    1 

















    1 

    1 








    1 





































    1 



















































































































































    3 


    3 


















    2 






    3 

    3 






























    3 











    3 










    3 


























    3 

    2 


























    3 












































































































































































































    3 








    3 



    3 







    3 








    3 





































































    3 























    3 





    3 
























    3 
    3 



    3 


















    3 
    3 

    3 

    3 




















    3 
    3 

    3 

    3 
    3 
    3 

    3 














    3 






























































































































































































































































































































































































































































































    3 
    3 

    3 
    3 


    3 


    3 















    3 
    3 

    3 
    3 







    3 









    3 




    3 







































































































    1 
















    1 




    1 
    1 







    1 
    1 


    1 







    1 
    1 




    1 




    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/sched/mm.h>

#include <trace/events/jbd2.h>

/* Forward declarations of file-local (static) buffer-list helpers. */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);

/*
 * Slab cache that transaction_t objects are allocated from.  Set up by
 * jbd2_journal_init_transaction_cache() and reset to NULL again by
 * jbd2_journal_destroy_transaction_cache().
 */
static struct kmem_cache *transaction_cache;
/*
 * Create the slab cache that transaction_t objects are allocated from.
 *
 * Asserts that no cache has been set up yet.  Returns 0 on success, or
 * -ENOMEM (after logging an emergency message) if the cache could not be
 * created.
 */
int __init jbd2_journal_init_transaction_cache(void)
{
        J_ASSERT(!transaction_cache);

        transaction_cache = kmem_cache_create("jbd2_transaction_s",
                                              sizeof(transaction_t), 0,
                                              SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
                                              NULL);
        if (transaction_cache)
                return 0;

        pr_emerg("JBD2: failed to create transaction cache\n");
        return -ENOMEM;
}

/*
 * Tear down the transaction slab cache and forget the pointer so that
 * jbd2_journal_init_transaction_cache() may be called again.
 */
void jbd2_journal_destroy_transaction_cache(void)
{
        struct kmem_cache *cache = transaction_cache;

        kmem_cache_destroy(cache);
        transaction_cache = NULL;
}

/*
 * Give a transaction_t back to the transaction slab cache.
 *
 * NULL (and ZERO_SIZE_PTR) arguments are silently ignored, so callers may
 * pass a pointer that was never allocated.
 */
void jbd2_journal_free_transaction(transaction_t *transaction)
{
        if (likely(!ZERO_OR_NULL_PTR(transaction)))
                kmem_cache_free(transaction_cache, transaction);
}

/*
 * Base amount of descriptor blocks we reserve for each transaction.
 *
 * Works out how many block tags fit into a single descriptor block and,
 * from that, how many descriptor blocks are needed to describe
 * j_max_transaction_buffers buffers, plus one more block.
 */
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
        /* Room left in a block after the journal header and the 16-byte UUID */
        int avail = journal->j_blocksize - sizeof(journal_header_t) - 16;
        int per_block;

        /* Journals with v2/v3 checksums also give up space for a block tail */
        if (jbd2_journal_has_csum_v2or3(journal))
                avail -= sizeof(struct jbd2_journal_block_tail);

        /* Commit code leaves a slack space of 16 bytes at the end of block */
        avail -= 16;
        per_block = avail / journal_tag_bytes(journal);

        /*
         * Revoke descriptors are accounted separately so we need to reserve
         * space for commit block and normal transaction descriptor blocks.
         */
        return DIV_ROUND_UP(journal->j_max_transaction_buffers, per_block) + 1;
}

/*
 * jbd2_get_transaction: obtain a new transaction_t object.
 *
 * Simply initialise a new transaction. Initialize it in
 * RUNNING state and add it to the current journal (which should not
 * have an existing running transaction: we only make a new transaction
 * once we have started to commit the old one).
 *
 * @journal: journal the new transaction is attached to
 * @transaction: caller-allocated transaction_t to initialise and install
 *        as journal->j_running_transaction
 *
 * Preconditions:
 *        The journal MUST be locked.  We don't perform atomic mallocs on the
 *        new transaction and we can't block without protecting against other
 *        processes trying to touch the journal while it is in transition.
 *
 */

static void jbd2_get_transaction(journal_t *journal,
                                transaction_t *transaction)
{
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
        transaction->t_start_time = ktime_get();
        /* Take the next transaction ID and advance the journal's counter. */
        transaction->t_tid = journal->j_transaction_sequence++;
        /* Commit deadline: one commit interval from now. */
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
        atomic_set(&transaction->t_updates, 0);
        /*
         * Outstanding credits start at the per-transaction descriptor block
         * overhead plus whatever is currently reserved in the journal.
         */
        atomic_set(&transaction->t_outstanding_credits,
                   jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
        atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);

        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
        add_timer(&journal->j_commit_timer);

        /* There must be no running transaction yet; install the new one. */
        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
        transaction->t_max_wait = 0;
        transaction->t_start = jiffies;
        transaction->t_requested = 0;
}

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */

/*
 * Update transaction's maximum wait time, if debugging is enabled.
 *
 * @transaction: transaction the handle is joining
 * @ts: jiffies value sampled when the handle started trying to join
 *
 * Keeping t_max_wait reliable requires a lock, and taking one on every
 * start_this_handle() would stop that path from running in parallel on
 * SMP systems and hurt scalability.  The update is therefore compiled in
 * only with CONFIG_JBD2_DEBUG and performed only when
 * jbd2_journal_enable_debug is set; otherwise t_max_wait is left alone and
 * the maximum wait time reported by the jbd2_run_stats tracepoint will
 * always be zero.
 */
static inline void update_t_max_wait(transaction_t *transaction,
                                     unsigned long ts)
{
#ifdef CONFIG_JBD2_DEBUG
        unsigned long waited;

        if (!jbd2_journal_enable_debug)
                return;
        /* Only handles that began waiting before the transaction started count */
        if (!time_after(transaction->t_start, ts))
                return;

        waited = jbd2_time_diff(ts, transaction->t_start);
        spin_lock(&transaction->t_handle_lock);
        if (transaction->t_max_wait < waited)
                transaction->t_max_wait = waited;
        spin_unlock(&transaction->t_handle_lock);
#endif
}

/*
 * Wait until running transaction passes to T_FLUSH state and new transaction
 * can thus be started. Also starts the commit if needed. The function expects
 * running transaction to exist and releases j_state_lock.
 *
 * @journal: journal whose running transaction we wait on; j_state_lock is
 *        dropped here with read_unlock(), so it must be held for reading
 *        on entry
 */
static void wait_transaction_locked(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(wait);
        int need_to_start;
        tid_t tid = journal->j_running_transaction->t_tid;

        /* Queue ourselves on the wait queue before j_state_lock is dropped. */
        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
                        TASK_UNINTERRUPTIBLE);
        /*
         * A commit only has to be kicked off if the recorded commit request
         * is not already at or past this tid (per tid_geq()).
         */
        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (need_to_start)
                jbd2_log_start_commit(journal, tid);
        jbd2_might_wait_for_commit(journal);
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &wait);
}

/*
 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
 * state and new transaction can thus be started. The function releases
 * j_state_lock.
 *
 * @journal: journal whose running transaction is being switched;
 *        j_state_lock is dropped here with read_unlock(), so it must be
 *        held for reading on entry
 *
 * If there is no running transaction, or it is not in T_SWITCH state, a
 * warning is emitted, the lock is released and the function returns
 * without sleeping.
 */
static void wait_transaction_switching(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(wait);

        if (WARN_ON(!journal->j_running_transaction ||
                    journal->j_running_transaction->t_state != T_SWITCH)) {
                read_unlock(&journal->j_state_lock);
                return;
        }
        /* Queue ourselves on the wait queue before j_state_lock is dropped. */
        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
                        TASK_UNINTERRUPTIBLE);
        read_unlock(&journal->j_state_lock);
        /*
         * We don't call jbd2_might_wait_for_commit() here as there's no
         * waiting for outstanding handles happening anymore in T_SWITCH state
         * and handling of reserved handles actually relies on that for
         * correctness.
         */
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &wait);
}

/*
 * Subtract @blocks from the journal-wide count of reserved credits
 * (j_reserved_credits) and wake up everyone sleeping on j_wait_reserved so
 * that they can re-evaluate their wait condition.
 */
static void sub_reserved_credits(journal_t *journal, int blocks)
{
        atomic_sub(blocks, &journal->j_reserved_credits);
        wake_up(&journal->j_wait_reserved);
}

/*
 * Wait until we can add credits for handle to the running transaction.  Called
 * with j_state_lock held for reading. Returns 0 if handle joined the running
 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 * caller must retry.
 *
 * @journal: journal with a running transaction (it is dereferenced
 *        unconditionally)
 * @blocks: credits requested for the handle itself
 * @rsv_blocks: credits requested for the handle's reserved handle, or 0
 *
 * On a 0 return, @blocks + @rsv_blocks have been added to the running
 * transaction's t_outstanding_credits and @rsv_blocks to the journal's
 * j_reserved_credits; j_state_lock is still held.  On every path that
 * returns 1 after credits were charged, they are subtracted again first.
 */
static int add_transaction_credits(journal_t *journal, int blocks,
                                   int rsv_blocks)
{
        transaction_t *t = journal->j_running_transaction;
        int needed;
        int total = blocks + rsv_blocks;

        /*
         * If the current transaction is locked down for commit, wait
         * for the lock to be released.
         */
        if (t->t_state != T_RUNNING) {
                WARN_ON_ONCE(t->t_state >= T_FLUSH);
                wait_transaction_locked(journal);
                return 1;
        }

        /*
         * If there is not enough space left in the log to write all
         * potential buffers requested by this operation, we need to
         * stall pending a log checkpoint to free some more log space.
         */
        /* Charge the credits up front; the checks below undo this on failure. */
        needed = atomic_add_return(total, &t->t_outstanding_credits);
        if (needed > journal->j_max_transaction_buffers) {
                /*
                 * If the current transaction is already too large,
                 * then start to commit it: we can then go back and
                 * attach this handle to a new transaction.
                 */
                atomic_sub(total, &t->t_outstanding_credits);

                /*
                 * Is the number of reserved credits in the current transaction too
                 * big to fit this handle? Wait until reserved credits are freed.
                 */
                if (atomic_read(&journal->j_reserved_credits) + total >
                    journal->j_max_transaction_buffers) {
                        read_unlock(&journal->j_state_lock);
                        jbd2_might_wait_for_commit(journal);
                        wait_event(journal->j_wait_reserved,
                                   atomic_read(&journal->j_reserved_credits) + total <=
                                   journal->j_max_transaction_buffers);
                        return 1;
                }

                /* Drops j_state_lock and sleeps until the transaction moves on. */
                wait_transaction_locked(journal);
                return 1;
        }

        /*
         * The commit code assumes that it can get enough log space
         * without forcing a checkpoint.  This is *critical* for
         * correctness: a checkpoint of a buffer which is also
         * associated with a committing transaction creates a deadlock,
         * so commit simply cannot force through checkpoints.
         *
         * We must therefore ensure the necessary space in the journal
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
        if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                /*
                 * Retake the lock for writing and re-check the free space,
                 * since the lock was dropped in between.
                 */
                write_lock(&journal->j_state_lock);
                if (jbd2_log_space_left(journal) <
                                        journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                return 1;
        }

        /* No reservation? We are done... */
        if (!rsv_blocks)
                return 0;

        needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
        /* We allow at most half of a transaction to be reserved */
        if (needed > journal->j_max_transaction_buffers / 2) {
                /* Back out both the reservation and the transaction credits. */
                sub_reserved_credits(journal, rsv_blocks);
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                wait_event(journal->j_wait_reserved,
                         atomic_read(&journal->j_reserved_credits) + rsv_blocks
                         <= journal->j_max_transaction_buffers / 2);
                return 1;
        }
        return 0;
}

/*
 * start_this_handle: Given a handle, deal with any locking or stalling
 * needed to make sure that there is enough journal space for the handle
 * to begin.  Attach the handle to a transaction and set up the
 * transaction's buffer credits.
 *
 * @journal: journal to attach the handle to
 * @handle: handle to start; h_total_credits (and that of h_rsv_handle, if
 *        set) gives the number of credits requested
 * @gfp_mask: allocation flags used if a new transaction_t is needed
 *
 * Returns 0 on success, with the handle attached to the running
 * transaction and installed as current->journal_info.  Returns -ENOSPC if
 * the credit request exceeds the per-transaction limits, -ENOMEM if a new
 * transaction could not be allocated, and -EROFS if the journal is aborted
 * or carries an error that has not been acknowledged.
 */

static int start_this_handle(journal_t *journal, handle_t *handle,
                             gfp_t gfp_mask)
{
        transaction_t        *transaction, *new_transaction = NULL;
        int                blocks = handle->h_total_credits;
        int                rsv_blocks = 0;
        /* Time at which we started trying to join; used by update_t_max_wait(). */
        unsigned long ts = jiffies;

        if (handle->h_rsv_handle)
                rsv_blocks = handle->h_rsv_handle->h_total_credits;

        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
         * size and limit the number of total credits to not exceed maximum
         * transaction size per operation.
         */
        if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
            (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
                printk(KERN_ERR "JBD2: %s wants too many credits "
                       "credits:%d rsv_credits:%d max:%d\n",
                       current->comm, blocks, rsv_blocks,
                       journal->j_max_transaction_buffers);
                WARN_ON(1);
                return -ENOSPC;
        }

alloc_transaction:
        /*
         * This check is racy but it is just an optimization of allocating new
         * transaction early if there are high chances we'll need it. If we
         * guess wrong, we'll retry or free unused transaction.
         */
        if (!data_race(journal->j_running_transaction)) {
                /*
                 * If __GFP_FS is not present, then we may be being called from
                 * inside the fs writeback layer, so we MUST NOT fail.
                 */
                if ((gfp_mask & __GFP_FS) == 0)
                        gfp_mask |= __GFP_NOFAIL;
                new_transaction = kmem_cache_zalloc(transaction_cache,
                                                    gfp_mask);
                if (!new_transaction)
                        return -ENOMEM;
        }

        jbd_debug(3, "New handle %p going live.\n", handle);

        /*
         * We need to hold j_state_lock until t_updates has been incremented,
         * for proper journal barrier handling
         */
repeat:
        read_lock(&journal->j_state_lock);
        BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
                /* Safe even if nothing was preallocated: NULL is ignored. */
                jbd2_journal_free_transaction(new_transaction);
                return -EROFS;
        }

        /*
         * Wait on the journal's transaction barrier if necessary. Specifically
         * we allow reserved handles to proceed because otherwise commit could
         * deadlock on page writeback not being able to complete.
         */
        if (!handle->h_reserved && journal->j_barrier_count) {
                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_transaction_locked,
                                journal->j_barrier_count == 0);
                goto repeat;
        }

        if (!journal->j_running_transaction) {
                read_unlock(&journal->j_state_lock);
                /* Nothing preallocated: go back and allocate a transaction. */
                if (!new_transaction)
                        goto alloc_transaction;
                /*
                 * Re-check both conditions with the lock held for writing,
                 * since j_state_lock was dropped in between.
                 */
                write_lock(&journal->j_state_lock);
                if (!journal->j_running_transaction &&
                    (handle->h_reserved || !journal->j_barrier_count)) {
                        jbd2_get_transaction(journal, new_transaction);
                        /* Now owned by the journal; must not be freed below. */
                        new_transaction = NULL;
                }
                write_unlock(&journal->j_state_lock);
                goto repeat;
        }

        transaction = journal->j_running_transaction;

        if (!handle->h_reserved) {
                /* We may have dropped j_state_lock - restart in that case */
                if (add_transaction_credits(journal, blocks, rsv_blocks))
                        goto repeat;
        } else {
                /*
                 * We have handle reserved so we are allowed to join T_LOCKED
                 * transaction and we don't have to check for transaction size
                 * and journal space. But we still have to wait while running
                 * transaction is being switched to a committing one as it
                 * won't wait for any handles anymore.
                 */
                if (transaction->t_state == T_SWITCH) {
                        wait_transaction_switching(journal);
                        goto repeat;
                }
                /* The handle stops being a reservation from here on. */
                sub_reserved_credits(journal, blocks);
                handle->h_reserved = 0;
        }

        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction.
         */
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
        handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
        jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
                  handle, blocks,
                  atomic_read(&transaction->t_outstanding_credits),
                  jbd2_log_space_left(journal));
        read_unlock(&journal->j_state_lock);
        current->journal_info = handle;

        rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_);
        /* Drop the preallocated transaction if it turned out to be unused. */
        jbd2_journal_free_transaction(new_transaction);
        /*
         * Ensure that no allocations done while the transaction is open are
         * going to recurse back to the fs layer.
         */
        handle->saved_alloc_context = memalloc_nofs_save();
        return 0;
}

/* Allocate a new handle.  This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
        handle_t *h = jbd2_alloc_handle(GFP_NOFS);

        /*
         * On success the handle starts with @nblocks total credits and a
         * single reference; on allocation failure NULL is passed back.
         */
        if (h) {
                h->h_total_credits = nblocks;
                h->h_ref = 1;
        }

        return h;
}

/**
 * jbd2__journal_start() - start a new handle or nest into the current one
 * @journal: journal to start the handle on
 * @nblocks: number of credits requested for modified buffers
 * @rsv_blocks: if non-zero, number of credits for an extra reserved handle
 *        that is allocated here and stored in h_rsv_handle
 * @revoke_records: number of revoke records the handle may need; rounded up
 *        to whole blocks and added to @nblocks
 * @gfp_mask: allocation flags handed to start_this_handle()
 * @type: stored in the handle's h_type and passed to the tracepoint
 * @line_no: stored in the handle's h_line_no and passed to the tracepoint
 *
 * If the current task already has a handle, that handle's reference count
 * is incremented and it is returned; it must belong to @journal.
 *
 * Return a pointer to the handle on success.  On failure an ERR_PTR() is
 * returned: -EROFS if @journal is NULL, -ENOMEM if a handle could not be
 * allocated, or the error reported by start_this_handle().
 */
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
                              int revoke_records, gfp_t gfp_mask,
                              unsigned int type, unsigned int line_no)
{
        handle_t *handle = journal_current_handle();
        handle_t *rsv_handle;
        int err;

        if (!journal)
                return ERR_PTR(-EROFS);

        /* Nested start: just take another reference on the existing handle */
        if (handle) {
                J_ASSERT(handle->h_transaction->t_journal == journal);
                handle->h_ref++;
                return handle;
        }

        /* Revoke records are paid for in whole blocks */
        nblocks += DIV_ROUND_UP(revoke_records,
                                journal->j_revoke_records_per_block);

        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);

        if (rsv_blocks) {
                rsv_handle = new_handle(rsv_blocks);
                if (!rsv_handle) {
                        err = -ENOMEM;
                        goto out_free_handle;
                }
                rsv_handle->h_reserved = 1;
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
        handle->h_revoke_credits = revoke_records;

        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0)
                goto out_free_rsv;

        handle->h_type = type;
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, nblocks);

        return handle;

out_free_rsv:
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
out_free_handle:
        jbd2_free_handle(handle);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(jbd2__journal_start);


/**
 * jbd2_journal_start() - Obtain a new handle.
 * @journal: Journal to start transaction on.
 * @nblocks: number of block buffer we might modify
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
 * that much space.
 *
 * This is a wrapper around jbd2__journal_start() that asks for no reserved
 * blocks and no revoke records, allocates with GFP_NOFS and passes 0 for
 * the handle type and line number.
 *
 * When jbd2__journal_start() is called directly with rsv_blocks > 0, it
 * additionally creates another handle with rsv_blocks reserved blocks in
 * the journal. This handle is stored in h_rsv_handle. It is not attached to
 * any particular transaction and thus doesn't block transaction commit. If
 * the caller uses this reserved handle, it has to set h_rsv_handle to NULL
 * as otherwise jbd2_journal_stop() on the parent handle will dispose the
 * reserved one. Reserved handle has to be converted to a normal handle
 * using jbd2_journal_start_reserved() before it can be used.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
 */
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
        return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);

/*
 * Give back the credits held by a reserved handle.
 *
 * The handle's credits are removed from the journal-wide reserved count
 * and, when @t is not NULL, from that transaction's outstanding credits as
 * well.  Warns if @handle is not actually a reserved handle.
 */
static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
        int credits = handle->h_total_credits;

        WARN_ON(!handle->h_reserved);
        sub_reserved_credits(handle->h_journal, credits);
        if (!t)
                return;
        atomic_sub(credits, &t->t_outstanding_credits);
}

/*
 * Dispose of a reserved handle that is not going to be started: give its
 * credits back and free the handle itself.
 */
void jbd2_journal_free_reserved(handle_t *handle)
{
        journal_t *journal = handle->h_journal;
        transaction_t *running;

        /*
         * Holding j_state_lock pins the running transaction (if there is
         * one) while the credits are being returned to it.
         */
        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        __jbd2_journal_unreserve_handle(handle, running);
        read_unlock(&journal->j_state_lock);

        jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);

/**
 * jbd2_journal_start_reserved() - start reserved handle
 * @handle: handle to start
 * @type: for handle statistics
 * @line_no: for handle statistics
 *
 * Start handle that has been previously reserved with jbd2_journal_reserve().
 * This attaches @handle to the running transaction (or creates one if there's
 * no transaction running). Unlike jbd2_journal_start() this function cannot
 * block on journal commit, checkpointing, or similar stuff. It can block on
 * memory allocation or frozen journal though.
 *
 * Return 0 on success, non-zero on error - handle is freed in that case.
 */
int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
                                unsigned int line_no)
{
        journal_t *journal = handle->h_journal;
        int ret;

        /* A normal (non-reserved) handle was passed in: just stop it. */
        if (WARN_ON(!handle->h_reserved)) {
                jbd2_journal_stop(handle);
                return -EIO;
        }

        /*
         * Starting a reserved handle while the task already has a handle
         * running is of questionable use and nobody needs it so far, so
         * refuse and release the reservation.
         */
        if (WARN_ON(current->journal_info)) {
                jbd2_journal_free_reserved(handle);
                return -EIO;
        }

        handle->h_journal = NULL;

        /*
         * Callers are likely in writeback or a similarly constrained
         * context, hence GFP_NOFS.
         */
        ret = start_this_handle(journal, handle, GFP_NOFS);
        if (ret >= 0) {
                handle->h_type = type;
                handle->h_line_no = line_no;
                trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                        handle->h_transaction->t_tid, type,
                                        line_no, handle->h_total_credits);
                return 0;
        }

        /*
         * Failure: put the journal pointer back, since
         * jbd2_journal_free_reserved() reads it from the handle.
         */
        handle->h_journal = journal;
        jbd2_journal_free_reserved(handle);
        return ret;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);

/**
 * jbd2_journal_extend() - extend buffer credits.
 * @handle:  handle to 'extend'
 * @nblocks: nr blocks to try to extend by.
 * @revoke_records: number of revoke records to try to extend by.
 *
 * Some transactions, such as large extends and truncates, can be done
 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
 * extend its credit if it needs more.
 *
 * jbd2_journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee that allocation - this is a best-effort only.
 * The calling process MUST be able to deal cleanly with a failure to
 * extend here.
 *
 * Return 0 on success, non-zero on failure.
 *
 * return code < 0 implies an error
 * return code > 0 implies normal transaction-full status.
 */
int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int total;
        /* Report "transaction full" (> 0) unless the extension is granted. */
        int result = 1;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        read_lock(&journal->j_state_lock);

        /* Only a transaction that is still running may be extended. */
        if (transaction->t_state != T_RUNNING) {
                jbd_debug(3, "denied handle %p %d blocks: "
                          "transaction not running\n", handle, nblocks);
                goto error_out;
        }

        /*
         * Add the number of extra revoke descriptor blocks that the
         * additional revoke records require to the block request.
         */
        nblocks += DIV_ROUND_UP(
                        handle->h_revoke_credits_requested + revoke_records,
                        journal->j_revoke_records_per_block) -
                DIV_ROUND_UP(
                        handle->h_revoke_credits_requested,
                        journal->j_revoke_records_per_block);

        spin_lock(&transaction->t_handle_lock);
        total = atomic_add_return(nblocks,
                                  &transaction->t_outstanding_credits);

        if (total <= journal->j_max_transaction_buffers) {
                trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                         transaction->t_tid,
                                         handle->h_type, handle->h_line_no,
                                         handle->h_total_credits,
                                         nblocks);

                handle->h_total_credits += nblocks;
                handle->h_requested_credits += nblocks;
                handle->h_revoke_credits += revoke_records;
                handle->h_revoke_credits_requested += revoke_records;
                result = 0;

                jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
        } else {
                /* Over the limit: take the speculative credits back. */
                jbd_debug(3, "denied handle %p %d blocks: "
                          "transaction too large\n", handle, nblocks);
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
        }
        spin_unlock(&transaction->t_handle_lock);
error_out:
        read_unlock(&journal->j_state_lock);
        return result;
}

/*
 * Detach @handle from its transaction: clear current->journal_info, return
 * the handle's remaining credits (and those of an attached reserved handle)
 * to the transaction, drop the transaction's update count and wake up
 * waiters on j_wait_updates once that count reaches zero.
 */
static void stop_this_handle(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int nr_revokes;

        J_ASSERT(journal_current_handle() == handle);
        J_ASSERT(atomic_read(&transaction->t_updates) > 0);
        current->journal_info = NULL;

        /*
         * Work out how many revoke descriptor blocks this handle's revoke
         * records add to the transaction and keep only those charged.
         * Sequences of transactions with just a few revokes are common,
         * so only the blocks really needed are accounted.
         */
        nr_revokes = handle->h_revoke_credits_requested -
                     handle->h_revoke_credits;
        if (nr_revokes) {
                int per_block = journal->j_revoke_records_per_block;
                int total_revokes, descriptors;

                WARN_ON_ONCE(DIV_ROUND_UP(nr_revokes, per_block)
                                > handle->h_total_credits);
                total_revokes = atomic_add_return(nr_revokes,
                                &transaction->t_outstanding_revokes);
                /* Descriptor blocks needed now minus those needed before. */
                descriptors =
                        DIV_ROUND_UP(total_revokes, per_block) -
                        DIV_ROUND_UP(total_revokes - nr_revokes, per_block);
                handle->h_total_credits -= descriptors;
        }

        atomic_sub(handle->h_total_credits,
                   &transaction->t_outstanding_credits);

        if (handle->h_rsv_handle)
                __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
                                                transaction);

        if (atomic_dec_and_test(&transaction->t_updates))
                wake_up(&journal->j_wait_updates);

        rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);

        /*
         * The GFP_NOFS scope of the handle ends here; restore the
         * allocation context that was saved for it.
         */
        memalloc_nofs_restore(handle->saved_alloc_context);
}

/**
 * jbd2__journal_restart() - restart a handle.
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 * @revoke_records: number of revoke record credits requested
 * @gfp_mask: memory allocation flags (for start_this_handle)
 *
 * Restart a handle for a multi-transaction filesystem
 * operation.
 *
 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
 * credits. We preserve reserved handle if there's any attached to the
 * passed in handle.
 */
int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
                          gfp_t gfp_mask)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        tid_t tid;
        int commit_requested;
        int ret;

        /* An aborted handle is never restarted; report success. */
        if (is_handle_aborted(handle))
                return 0;

        journal = transaction->t_journal;
        tid = transaction->t_tid;

        /* Detach the handle from the transaction it is running in. */
        jbd_debug(2, "restarting handle %p\n", handle);
        stop_this_handle(handle);
        handle->h_transaction = NULL;

        /*
         * Ask for a commit of the old transaction unless one has been
         * requested already.
         *
         * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
         * get rid of pointless j_state_lock traffic like this.
         */
        read_lock(&journal->j_state_lock);
        commit_requested = tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (!commit_requested)
                jbd2_log_start_commit(journal, tid);

        /* Set up the new credit request and attach to a new transaction. */
        handle->h_revoke_credits = revoke_records;
        handle->h_total_credits = nblocks +
                DIV_ROUND_UP(revoke_records,
                             journal->j_revoke_records_per_block);
        ret = start_this_handle(journal, handle, gfp_mask);

        /* On failure a tid of 0 is traced instead of reading the handle. */
        trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
                                 ret ? 0 : handle->h_transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
                                 handle->h_total_credits);
        return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);


/*
 * Convenience form of jbd2__journal_restart(): no revoke record credits
 * are requested and GFP_NOFS is used as the allocation mask.
 */
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
        int ret;

        ret = jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);

        return ret;
}
EXPORT_SYMBOL(jbd2_journal_restart);

/**
 * jbd2_journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
 * This locks out any further updates from being started, and blocks
 * until all existing updates have completed, returning only once the
 * journal is in a quiescent state with no updates running.
 *
 * The journal lock should not be held on entry.
 */
void jbd2_journal_lock_updates(journal_t *journal)
{
        DEFINE_WAIT(wait);

        jbd2_might_wait_for_commit(journal);

        /*
         * Raise the barrier count under the state lock.  It stays raised
         * until jbd2_journal_unlock_updates() decrements it again.
         */
        write_lock(&journal->j_state_lock);
        ++journal->j_barrier_count;

        /*
         * Wait until there are no reserved handles.  wait_event() may
         * sleep, so j_state_lock is dropped around it and retaken after.
         */
        if (atomic_read(&journal->j_reserved_credits)) {
                write_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_reserved,
                           atomic_read(&journal->j_reserved_credits) == 0);
                write_lock(&journal->j_state_lock);
        }

        /* Wait until there are no running updates */
        while (1) {
                /*
                 * Re-read the running transaction on every pass:
                 * j_state_lock is released while sleeping below.
                 */
                transaction_t *transaction = journal->j_running_transaction;

                if (!transaction)
                        break;

                spin_lock(&transaction->t_handle_lock);
                /*
                 * Queue on j_wait_updates before testing t_updates so that
                 * the wake_up() issued when the last update finishes cannot
                 * be missed between the test and schedule().
                 */
                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (!atomic_read(&transaction->t_updates)) {
                        spin_unlock(&transaction->t_handle_lock);
                        finish_wait(&journal->j_wait_updates, &wait);
                        break;
                }
                /* Updates still running: drop both locks and sleep. */
                spin_unlock(&transaction->t_handle_lock);
                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_updates, &wait);
                write_lock(&journal->j_state_lock);
        }
        write_unlock(&journal->j_state_lock);

        /*
         * We have now established a barrier against other normal updates, but
         * we also need to barrier against other jbd2_journal_lock_updates() calls
         * to make sure that we serialise special journal-locked operations
         * too.  The mutex is still held on return and is released by
         * jbd2_journal_unlock_updates().
         */
        mutex_lock(&journal->j_barrier);
}

/**
 * jbd2_journal_unlock_updates () - release barrier
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 *
 * Should be called without the journal lock held.
 */
void jbd2_journal_unlock_updates(journal_t *journal)
{
        /* A barrier must have been established by a lock_updates call. */
        J_ASSERT(journal->j_barrier_count != 0);

        /* Let the next jbd2_journal_lock_updates() caller proceed ... */
        mutex_unlock(&journal->j_barrier);

        /* ... and lower the barrier count under the state lock. */
        write_lock(&journal->j_state_lock);
        --journal->j_barrier_count;
        write_unlock(&journal->j_state_lock);

        /* Wake everybody sleeping on j_wait_transaction_locked. */
        wake_up_all(&journal->j_wait_transaction_locked);
}

static void warn_dirty_buffer(struct buffer_head *bh)
{
        printk(KERN_WARNING
               "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
               "There's a risk of filesystem corruption in case of system "
               "crash.\n",
               bh->b_bdev, (unsigned long long)bh->b_blocknr);
}

/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        char *mapped;
        char *data;

        J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");

        /* Map the page backing the buffer and locate the buffer within it. */
        mapped = kmap_atomic(bh->b_page);
        data = mapped + offset_in_page(bh->b_data);

        /* The frozen trigger fires right before the data is copied out. */
        jbd2_buffer_frozen_trigger(jh, data, jh->b_triggers);
        memcpy(jh->b_frozen_data, data, bh->b_size);
        kunmap_atomic(mapped);

        /*
         * The frozen copy is saved; record the triggers that belong to
         * it alongside.
         */
        jh->b_frozen_triggers = jh->b_triggers;
}

/*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 */
/*
 * @handle:     handle whose transaction (handle->h_transaction) the buffer
 *              is to be attached to
 * @jh:         journal head of the buffer
 * @force_copy: when non-zero, a frozen copy of the buffer is made even if
 *              the buffer is not on the BJ_Metadata list
 *
 * Returns 0 on success or -EROFS if the handle is aborted.
 *
 * Takes the buffer lock, jh->b_state_lock and, when filing the buffer,
 * journal->j_list_lock; all of them are released again before returning.
 * The function jumps back to "repeat" after it had to drop b_state_lock
 * to sleep on BH_Shadow or to allocate the frozen buffer.
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
                        int force_copy)
{
        struct buffer_head *bh;
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int error;
        /* Preallocated copy-out buffer; freed at "out" if left unused. */
        char *frozen_buffer = NULL;
        unsigned long start_lock, time_lock;

        journal = transaction->t_journal;

        jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);

        JBUFFER_TRACE(jh, "entry");
repeat:
        bh = jh2bh(jh);

        /* @@@ Need to check for errors here at some point. */

         start_lock = jiffies;
        lock_buffer(bh);
        spin_lock(&jh->b_state_lock);

        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
        if (time_lock > HZ/10)
                trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
                        jiffies_to_msecs(time_lock));

        /* We now hold the buffer lock so it is safe to query the buffer
         * state.  Is the buffer dirty?
         *
         * If so, there are two possibilities.  The buffer may be
         * non-journaled, and undergoing a quite legitimate writeback.
         * Otherwise, it is journaled, and we don't expect dirty buffers
         * in that state (the buffers should be marked JBD_Dirty
         * instead.)  So either the IO is being done under our own
         * control and this is a bug, or it's a third party IO such as
         * dump(8) (which may leave the buffer scheduled for read ---
         * ie. locked but not dirty) or tune2fs (which may actually have
         * the buffer dirtied, ugh.)  */

        if (buffer_dirty(bh) && jh->b_transaction) {
                warn_dirty_buffer(bh);
                /*
                 * We need to clean the dirty flag and we must do it under the
                 * buffer lock to be sure we don't race with running write-out.
                 */
                JBUFFER_TRACE(jh, "Journalling dirty buffer");
                clear_buffer_dirty(bh);
                /*
                 * The buffer is going to be added to BJ_Reserved list now and
                 * nothing guarantees jbd2_journal_dirty_metadata() will be
                 * ever called for it. So we need to set jbddirty bit here to
                 * make sure the buffer is dirtied and written out when the
                 * journaling machinery is done with it.
                 */
                set_buffer_jbddirty(bh);
        }

        /* An aborted handle gets -EROFS; both locks are dropped first. */
        error = -EROFS;
        if (is_handle_aborted(handle)) {
                spin_unlock(&jh->b_state_lock);
                unlock_buffer(bh);
                goto out;
        }
        error = 0;

        /*
         * The buffer is already part of this transaction if b_transaction or
         * b_next_transaction points to it
         */
        if (jh->b_transaction == transaction ||
            jh->b_next_transaction == transaction) {
                unlock_buffer(bh);
                goto done;
        }

        /*
         * this is the first time this transaction is touching this buffer,
         * reset the modified flag
         */
        jh->b_modified = 0;

        /*
         * If the buffer is not journaled right now, we need to make sure it
         * doesn't get written to disk before the caller actually commits the
         * new data
         */
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                /*
                 * Make sure all stores to jh (b_modified, b_frozen_data) are
                 * visible before attaching it to the running transaction.
                 * Paired with barrier in jbd2_write_access_granted()
                 */
                smp_wmb();
                spin_lock(&journal->j_list_lock);
                if (test_clear_buffer_dirty(bh)) {
                        /*
                         * Execute buffer dirty clearing and jh->b_transaction
                         * assignment under journal->j_list_lock locked to
                         * prevent bh being removed from checkpoint list if
                         * the buffer is in an intermediate state (not dirty
                         * and jh->b_transaction is NULL).
                         */
                        JBUFFER_TRACE(jh, "Journalling dirty buffer");
                        set_buffer_jbddirty(bh);
                }
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
                unlock_buffer(bh);
                goto done;
        }
        /* The buffer lock is not needed past this point. */
        unlock_buffer(bh);

        /*
         * If there is already a copy-out version of this buffer, then we don't
         * need to make another one
         */
        if (jh->b_frozen_data) {
                JBUFFER_TRACE(jh, "has frozen data");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                goto attach_next;
        }

        JBUFFER_TRACE(jh, "owned by older transaction");
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);

        /*
         * There is one case we have to be very careful about.  If the
         * committing transaction is currently writing this buffer out to disk
         * and has NOT made a copy-out, then we cannot modify the buffer
         * contents at all right now.  The essence of copy-out is that it is
         * the extra copy, not the primary copy, which gets journaled.  If the
         * primary copy is already going to disk then we cannot do copy-out
         * here.
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
                spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /*
         * Only do the copy if the currently-owning transaction still needs it.
         * If buffer isn't on BJ_Metadata list, the committing transaction is
         * past that stage (here we use the fact that BH_Shadow is set under
         * bh_state lock together with refiling to BJ_Shadow list and at this
         * point we know the buffer doesn't have BH_Shadow set).
         *
         * Subtle point, though: if this is a get_undo_access, then we will be
         * relying on the frozen_data to contain the new value of the
         * committed_data record after the transaction, so we HAVE to force the
         * frozen_data copy in that case.
         */
        if (jh->b_jlist == BJ_Metadata || force_copy) {
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        /*
                         * Allocate with b_state_lock dropped, then restart
                         * from the top since the buffer state may have
                         * changed in the meantime.
                         */
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
                        spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
                }
                /* Hand the preallocated buffer over to the journal head. */
                jh->b_frozen_data = frozen_buffer;
                frozen_buffer = NULL;
                jbd2_freeze_jh_data(jh);
        }
attach_next:
        /*
         * Make sure all stores to jh (b_modified, b_frozen_data) are visible
         * before attaching it to the running transaction. Paired with barrier
         * in jbd2_write_access_granted()
         */
        smp_wmb();
        jh->b_next_transaction = transaction;

done:
        spin_unlock(&jh->b_state_lock);

        /*
         * If we are about to journal a buffer, then any revoke pending on it is
         * no longer valid
         */
        jbd2_journal_cancel_revoke(handle, jh);

out:
        /*
         * A buffer allocated on an earlier pass but not consumed (because
         * the state changed before the retry reached the copy) is freed.
         */
        if (unlikely(frozen_buffer))        /* It's usually NULL */
                jbd2_free(frozen_buffer, bh->b_size);

        JBUFFER_TRACE(jh, "exit");
        return error;
}

/* Fast check whether buffer is already attached to the required transaction */
/*
 * @handle: handle whose transaction the buffer must already belong to
 * @bh:     buffer to check
 * @undo:   true when checking on behalf of undo access; the journal head
 *          must then already have b_committed_data
 *
 * Lockless check done under rcu_read_lock() only.  Returns true when @bh is
 * not dirty, has a journal head whose b_transaction or b_next_transaction
 * is handle->h_transaction, and that journal head still points back to @bh.
 * Returns false in every other case; callers then take the slow path
 * through do_get_write_access().
 */
static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
                                                        bool undo)
{
        struct journal_head *jh;
        bool ret = false;

        /* Dirty buffers require special handling... */
        if (buffer_dirty(bh))
                return false;

        /*
         * RCU protects us from dereferencing freed pages. So the checks we do
         * are guaranteed not to oops. However the jh slab object can get freed
         * & reallocated while we work with it. So we have to be careful. When
         * we see jh attached to the running transaction, we know it must stay
         * so until the transaction is committed. Thus jh won't be freed and
         * will be attached to the same bh while we run.  However it can
         * happen jh gets freed, reallocated, and attached to the transaction
         * just after we get pointer to it from bh. So we have to be careful
         * and recheck jh still belongs to our bh before we return success.
         */
        rcu_read_lock();
        if (!buffer_jbd(bh))
                goto out;
        /* This should be bh2jh() but that doesn't work with inline functions */
        jh = READ_ONCE(bh->b_private);
        if (!jh)
                goto out;
        /* For undo access buffer must have data copied */
        if (undo && !jh->b_committed_data)
                goto out;
        /* Not attached to our transaction in either slot: slow path. */
        if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
            READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
                goto out;
        /*
         * There are two reasons for the barrier here:
         * 1) Make sure to fetch b_bh after we did previous checks so that we
         * detect when jh went through free, realloc, attach to transaction
         * while we were checking. Paired with implicit barrier in that path.
         * 2) So that access to bh done after jbd2_write_access_granted()
         * doesn't get reordered and see inconsistent state of concurrent
         * do_get_write_access().
         */
        smp_mb();
        /* Recheck that the journal head still belongs to this buffer. */
        if (unlikely(jh->b_bh != bh))
                goto out;
        ret = true;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * jbd2_journal_get_write_access() - notify intent to modify a buffer
 *                                     for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
 *
 * Returns: error code or 0 on success.
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
 */

int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
        struct journal_head *jh;
        int err = 0;

        if (is_handle_aborted(handle))
                return -EROFS;

        /*
         * Take the slow path only when the lockless check cannot confirm
         * that the buffer already belongs to the handle's transaction.
         */
        if (!jbd2_write_access_granted(handle, bh, false)) {
                jh = jbd2_journal_add_journal_head(bh);
                /*
                 * We do not want to get caught playing with fields which
                 * the log thread also manipulates.  Make sure that the
                 * buffer completes any outstanding IO before proceeding.
                 */
                err = do_get_write_access(handle, jh, 0);
                jbd2_journal_put_journal_head(jh);
        }

        return err;
}


/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * jbd2_journal_get_create_access () - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        /* Paired with jbd2_journal_put_journal_head() at "out". */
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        int err;

        jbd_debug(5, "journal_head %p\n", jh);
        err = -EROFS;
        /*
         * For an aborted handle "journal" is never assigned; the "out"
         * path does not use it.
         */
        if (is_handle_aborted(handle))
                goto out;
        journal = transaction->t_journal;
        err = 0;

        JBUFFER_TRACE(jh, "entry");
        /*
         * The buffer may already belong to this transaction due to pre-zeroing
         * in the filesystem's new_block code.  It may also be on the previous,
         * committing transaction's lists, but it HAS to be in Forget state in
         * that case: the transaction must have deleted the buffer for it to be
         * reused here.
         * In the case of file system data inconsistency, for example, if the
         * block bitmap of a referenced block is not set, it can lead to the
         * situation where a block being committed is allocated and used again.
         * As a result, the following condition will not be satisfied, so here
         * we directly trigger a JBD abort instead of immediately invoking
         * bugon.
         */
        spin_lock(&jh->b_state_lock);
        if (!(jh->b_transaction == transaction || jh->b_transaction == NULL ||
              (jh->b_transaction == journal->j_committing_transaction &&
               jh->b_jlist == BJ_Forget)) || jh->b_next_transaction != NULL) {
                /* Unexpected buffer state: abort the journal with -EROFS. */
                err = -EROFS;
                spin_unlock(&jh->b_state_lock);
                jbd2_journal_abort(journal, err);
                goto out;
        }

        /* The caller must pass the buffer in locked. */
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

        if (jh->b_transaction == NULL) {
                /*
                 * Previous jbd2_journal_forget() could have left the buffer
                 * with jbddirty bit set because it was being committed. When
                 * the commit finished, we've filed the buffer for
                 * checkpointing and marked it dirty. Now we are reallocating
                 * the buffer so the transaction freeing it must have
                 * committed and so it's safe to clear the dirty bit.
                 */
                clear_buffer_dirty(jh2bh(jh));
                /* first access by this transaction */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction == journal->j_committing_transaction) {
                /* first access by this transaction */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "set next transaction");
                spin_lock(&journal->j_list_lock);
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
        /*
         * Remaining case: the buffer already belongs to this transaction
         * and nothing needs to be filed.
         */
        spin_unlock(&jh->b_state_lock);

        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
         * blocks which contain freed but then revoked metadata.  We need
         * to cancel the revoke in case we end up freeing it yet again
         * and then reallocating as data - this would cause a second revoke,
         * which hits an assertion error.
         */
        JBUFFER_TRACE(jh, "cancelling revoke");
        jbd2_journal_cancel_revoke(handle, jh);
out:
        jbd2_journal_put_journal_head(jh);
        return err;
}

/**
 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Some metadata must be tracked in two states: what has already been
 * committed to disk and what has not.  Space allocation is the classic
 * user: freed space must not be handed out again before the deallocation
 * itself is committed, because overwriting it would make the delete
 * impossible to rewind after a crash.
 *
 * jbd2_journal_get_undo_access therefore requests write access to a buffer
 * taking part in such a non-rewindable operation (e.g. a delete touching
 * the bitmaps) and preserves a copy of the buffer contents as they were
 * before the call in jh->b_committed_data.  That copy is kept until the
 * buffer is known to have been committed to disk.
 *
 * There is never a need to know which transaction the committed data
 * belongs to: buffers passed here are guaranteed to be dirtied later and
 * so end up in a new transaction, at which point the old committed data
 * pointer can be discarded.
 *
 * Returns error number or 0 on success.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
        struct journal_head *jh;
        char *spare_copy = NULL;
        int err;

        if (is_handle_aborted(handle))
                return -EROFS;

        if (jbd2_write_access_granted(handle, bh, true))
                return 0;

        jh = jbd2_journal_add_journal_head(bh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * Write access has to be obtained before anything else: it can
         * drop the journal lock, and the committed_data snapshot must be
         * taken atomically wrt. completion of any outstanding commits.
         */
        err = do_get_write_access(handle, jh, 1);
        if (err)
                goto out;

        for (;;) {
                /*
                 * The allocation is done before b_state_lock is taken;
                 * the check here is only a hint and is repeated under
                 * the lock below.
                 */
                if (!jh->b_committed_data)
                        spare_copy = jbd2_alloc(jh2bh(jh)->b_size,
                                                GFP_NOFS|__GFP_NOFAIL);

                spin_lock(&jh->b_state_lock);
                if (jh->b_committed_data)
                        break;

                /* Copy out the current buffer contents into the
                 * preserved, committed copy. */
                JBUFFER_TRACE(jh, "generate b_committed data");
                if (spare_copy) {
                        jh->b_committed_data = spare_copy;
                        spare_copy = NULL;
                        memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
                        break;
                }

                /*
                 * The unlocked check saw a committed copy that is gone
                 * now, so nothing was allocated: drop the lock and retry.
                 */
                spin_unlock(&jh->b_state_lock);
        }
        spin_unlock(&jh->b_state_lock);
out:
        jbd2_journal_put_journal_head(jh);
        /* Free the allocation if it turned out not to be needed. */
        if (unlikely(spare_copy))
                jbd2_free(spare_copy, bh->b_size);
        return err;
}

/**
 * jbd2_journal_set_triggers() - Add triggers for commit writeout
 * @bh: buffer to trigger on
 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
 *
 * Install @type as the trigger set of the journal_head attached to @bh.
 * Doing so is always safe: triggers for a committing buffer will be saved
 * off, and triggers for a running transaction will match the buffer in
 * that transaction.
 *
 * Pass NULL as @type to clear the triggers.  If @bh has no journal_head
 * a warning is emitted and nothing is changed.
 */
void jbd2_journal_set_triggers(struct buffer_head *bh,
                               struct jbd2_buffer_trigger_type *type)
{
        struct journal_head *jh;

        /* Take a temporary reference for the duration of the update. */
        jh = jbd2_journal_grab_journal_head(bh);
        if (WARN_ON(!jh))
                return;

        jh->b_triggers = type;

        jbd2_journal_put_journal_head(jh);
}

/*
 * Invoke the t_frozen callback of @triggers, if there is one, for the
 * buffer behind @jh.  The callback receives the trigger set, the
 * buffer_head, @mapped_data and the buffer size.
 */
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
{
        struct buffer_head *bh = jh2bh(jh);

        if (triggers && triggers->t_frozen)
                triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}

/*
 * Invoke the t_abort callback of @triggers, if there is one, for the
 * buffer behind @jh.
 */
void jbd2_buffer_abort_trigger(struct journal_head *jh,
                               struct jbd2_buffer_trigger_type *triggers)
{
        if (triggers && triggers->t_abort)
                triggers->t_abort(triggers, jh2bh(jh));
}

/**
 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
 * mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer must have previously had jbd2_journal_get_write_access()
 * called so that it has a valid journal_head attached to the buffer
 * head.
 *
 * The buffer is placed on the transaction's metadata list and is marked
 * as belonging to the transaction.
 *
 * Returns error number or 0 on success:
 *   -EUCLEAN if @bh has no journal_head attached (!buffer_jbd),
 *   -EROFS   if the handle is aborted,
 *   -ENOSPC  if the handle has no buffer credits left for a first-time
 *            modification,
 *   -EINVAL  if the journal_head's transaction pointers are inconsistent
 *            with @handle's transaction.
 *
 * Special care needs to be taken if the buffer already belongs to the
 * current committing transaction (in which case we should have frozen
 * data present for that commit).  In that case, we don't relink the
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        int ret = 0;

        if (!buffer_jbd(bh))
                return -EUCLEAN;

        /*
         * We don't grab jh reference here since the buffer must be part
         * of the running transaction.
         */
        jh = bh2jh(bh);
        jbd_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * This and the following assertions are unreliable since we may see jh
         * in inconsistent state unless we grab bh_state lock. But this is
         * crucial to catch bugs so let's do a reliable check until the
         * lockless handling is fully proven.
         *
         * Pattern: peek without the lock first (annotated with data_race()),
         * and only if the peek looks wrong re-check under b_state_lock
         * before asserting.
         */
        if (data_race(jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction)) {
                spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
                spin_unlock(&jh->b_state_lock);
        }
        if (data_race(jh->b_modified == 1)) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (data_race(jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata)) {
                        spin_lock(&jh->b_state_lock);
                        /* Log details first; the assertion below then fires. */
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
                                       "h_line_no=%u block_no=%llu jlist=%u\n",
                                       handle->h_type, handle->h_line_no,
                                       (unsigned long long) bh->b_blocknr,
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
                        spin_unlock(&jh->b_state_lock);
                }
                /*
                 * b_modified is already set: skip the locked path below
                 * entirely and return 0.
                 */
                goto out;
        }

        spin_lock(&jh->b_state_lock);

        if (is_handle_aborted(handle)) {
                /*
                 * Check journal aborting with @jh->b_state_lock locked,
                 * since 'jh->b_transaction' could be replaced with
                 * 'jh->b_next_transaction' during old transaction
                 * committing if journal aborted, which may fail
                 * assertion on 'jh->b_frozen_data == NULL'.
                 */
                ret = -EROFS;
                goto out_unlock_bh;
        }

        /* Only dereference transaction after the abort check above. */
        journal = transaction->t_journal;

        if (jh->b_modified == 0) {
                /*
                 * This buffer's got modified and becoming part
                 * of the transaction. This needs to be done
                 * once a transaction -bzzz
                 *
                 * The first modification consumes one credit of the handle.
                 */
                if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                jh->b_modified = 1;
                handle->h_total_credits--;
        }

        /*
         * fastpath, to avoid expensive locking.  If this buffer is already
         * on the running transaction's metadata list there is nothing to do.
         * Nobody can take it off again because there is a handle open.
         * I _think_ we're OK here with SMP barriers - a mistaken decision will
         * result in this test being false, so we go in and take the locks.
         */
        if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
                JBUFFER_TRACE(jh, "fastpath");
                /* Sanity check: our transaction must be the running one. */
                if (unlikely(jh->b_transaction !=
                             journal->j_running_transaction)) {
                        printk(KERN_ERR "JBD2: %s: "
                               "jh->b_transaction (%llu, %p, %u) != "
                               "journal->j_running_transaction (%p, %u)\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               jh->b_transaction,
                               jh->b_transaction ? jh->b_transaction->t_tid : 0,
                               journal->j_running_transaction,
                               journal->j_running_transaction ?
                               journal->j_running_transaction->t_tid : 0);
                        ret = -EINVAL;
                }
                goto out_unlock_bh;
        }

        set_buffer_jbddirty(bh);

        /*
         * Metadata already on the current transaction list doesn't
         * need to be filed.  Metadata on another transaction's list must
         * be committing, and will be refiled once the commit completes:
         * leave it alone for now.
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
                /*
                 * The only legal state here: owned by the committing
                 * transaction with b_next_transaction pointing at ours.
                 */
                if (unlikely(((jh->b_transaction !=
                               journal->j_committing_transaction)) ||
                             (jh->b_next_transaction != transaction))) {
                        printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
                               "bad jh for block %llu: "
                               "transaction (%p, %u), "
                               "jh->b_transaction (%p, %u), "
                               "jh->b_next_transaction (%p, %u), jlist %u\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               transaction, transaction->t_tid,
                               jh->b_transaction,
                               jh->b_transaction ?
                               jh->b_transaction->t_tid : 0,
                               jh->b_next_transaction,
                               jh->b_next_transaction ?
                               jh->b_next_transaction->t_tid : 0,
                               jh->b_jlist);
                        WARN_ON(1);
                        ret = -EINVAL;
                }
                /* And this case is illegal: we can't reuse another
                 * transaction's data buffer, ever. */
                goto out_unlock_bh;
        }

        /* That test should have eliminated the following case: */
        J_ASSERT_JH(jh, jh->b_frozen_data == NULL);

        JBUFFER_TRACE(jh, "file as BJ_Metadata");
        /* Lock order: b_state_lock (held) then j_list_lock. */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
out_unlock_bh:
        spin_unlock(&jh->b_state_lock);
out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
}

/**
 * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
 * @handle: transaction handle
 * @bh:     bh to 'forget'
 *
 * We can only do the bforget if there are no commits pending against the
 * buffer.  If the buffer is dirty in the current running transaction we
 * can safely unlink it.
 *
 * bh may not be a journalled buffer at all - it may be a non-JBD
 * buffer which came off the hashtable.  Check for this.
 *
 * Decrements bh->b_count by one (via __bforget() or __brelse()) on every
 * path except the early aborted-handle return described below.
 *
 * NOTE: the historic intent was to allow this call even if the handle has
 * aborted, as part of the caller's cleanup after an abort.  The current
 * code instead returns -EROFS up front for an aborted handle, before
 * touching @bh or its reference count.
 *
 * Returns 0 on success, -EROFS if the handle is aborted, or -EIO if the
 * buffer unexpectedly carries b_committed_data.
 */
int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        int drop_reserve = 0;
        int err = 0;
        int was_modified = 0;
        int wait_for_writeback = 0;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        BUFFER_TRACE(bh, "entry");

        /* No journal_head: plain buffer, forget it directly. */
        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh) {
                __bforget(bh);
                return 0;
        }

        spin_lock(&jh->b_state_lock);

        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
                goto drop;
        }

        /* keep track of whether or not this transaction modified us */
        was_modified = jh->b_modified;

        /*
         * The buffer's going from the transaction, we must drop
         * all references -bzzz
         */
        jh->b_modified = 0;

        if (jh->b_transaction == transaction) {
                J_ASSERT_JH(jh, !jh->b_frozen_data);

                /* If we are forgetting a buffer which is already part
                 * of this transaction, then we can just drop it from
                 * the transaction immediately. */
                clear_buffer_dirty(bh);
                clear_buffer_jbddirty(bh);

                JBUFFER_TRACE(jh, "belongs to current transaction: unfile");

                /*
                 * we only want to give the credit back if this transaction
                 * modified the buffer
                 */
                if (was_modified)
                        drop_reserve = 1;

                /*
                 * We are no longer going to journal this buffer.
                 * However, the commit of this transaction is still
                 * important to the buffer: the delete that we are now
                 * processing might obsolete an old log entry, so by
                 * committing, we can satisfy the buffer's checkpoint.
                 *
                 * So, if we have a checkpoint on the buffer, we should
                 * now refile the buffer on our BJ_Forget list so that
                 * we know to remove the checkpoint after we commit.
                 */

                spin_lock(&journal->j_list_lock);
                if (jh->b_cp_transaction) {
                        __jbd2_journal_temp_unlink_buffer(jh);
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
                        /*
                         * Drop the reference that belonged to the
                         * transaction; our own reference from the grab
                         * above is dropped at the end of the function.
                         */
                        jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                J_ASSERT_JH(jh, (jh->b_transaction ==
                                 journal->j_committing_transaction));
                /* However, if the buffer is still owned by a prior
                 * (committing) transaction, we can't drop it yet... */
                JBUFFER_TRACE(jh, "belongs to older transaction");
                /* ... but we CAN drop it from the new transaction through
                 * marking the buffer as freed and set j_next_transaction to
                 * the new transaction, so that not only the commit code
                 * knows it should clear dirty bits when it is done with the
                 * buffer, but also the buffer can be checkpointed only
                 * after the new transaction commits. */

                set_buffer_freed(bh);

                if (!jh->b_next_transaction) {
                        spin_lock(&journal->j_list_lock);
                        jh->b_next_transaction = transaction;
                        spin_unlock(&journal->j_list_lock);
                } else {
                        J_ASSERT(jh->b_next_transaction == transaction);

                        /*
                         * only give the credit back if this transaction
                         * modified the buffer
                         */
                        if (was_modified)
                                drop_reserve = 1;
                }
        } else {
                /*
                 * Finally, if the buffer does not belong to any
                 * transaction, we can just drop it now if it has no
                 * checkpoint.
                 */
                spin_lock(&journal->j_list_lock);
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * Otherwise, if the buffer has been written to disk,
                 * it is safe to remove the checkpoint and drop it.
                 */
                if (!buffer_dirty(bh)) {
                        __jbd2_journal_remove_checkpoint(jh);
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * The buffer has not yet been written to disk. We should
                 * either clear the buffer or ensure that the ongoing I/O
                 * is completed, and attach this buffer to current
                 * transaction so that the buffer can be checkpointed only
                 * after the current transaction commits.
                 */
                clear_buffer_dirty(bh);
                wait_for_writeback = 1;
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
drop:
        __brelse(bh);
        spin_unlock(&jh->b_state_lock);
        /*
         * The wait happens after b_state_lock is released.
         * NOTE(review): bh is used after __brelse(); presumably the
         * journal_head reference held until the put below keeps bh
         * alive - confirm.
         */
        if (wait_for_writeback)
                wait_on_buffer(bh);
        jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
                handle->h_total_credits++;
        }
        return err;
}

/**
 * jbd2_journal_stop() - complete a transaction
 * @handle: transaction to complete.
 *
 * All done for a particular handle.
 *
 * There is not much action needed here.  We just return any remaining
 * buffer credits to the transaction and remove the handle.  The only
 * complication is that we need to start a commit operation if the
 * filesystem is marked for synchronous update.
 *
 * jbd2_journal_stop itself will not usually return an error, but it may
 * do so in unusual circumstances.  In particular, expect it to
 * return -EIO if a jbd2_journal_abort has been executed since the
 * transaction began.  When the handle is synchronous and the commit is
 * waited for, the return value is the result of jbd2_log_wait_commit().
 *
 * If the handle is nested (h_ref still positive after the decrement) only
 * the reference count is dropped and the handle is not freed.
 */
int jbd2_journal_stop(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int err = 0, wait_for_commit = 0;
        tid_t tid;
        pid_t pid;

        /* Nested use of the handle: just drop one reference. */
        if (--handle->h_ref > 0) {
                jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
                                                 handle->h_ref);
                if (is_handle_aborted(handle))
                        return -EIO;
                return 0;
        }
        if (!transaction) {
                /*
                 * Handle is already detached from the transaction so there is
                 * nothing to do other than free the handle.
                 */
                memalloc_nofs_restore(handle->saved_alloc_context);
                goto free_and_exit;
        }
        /*
         * Cache journal and tid now: transaction must not be dereferenced
         * once stop_this_handle() has run (see below).
         */
        journal = transaction->t_journal;
        tid = transaction->t_tid;

        if (is_handle_aborted(handle))
                err = -EIO;

        jbd_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
                                tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
                                 handle->h_total_credits));

        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
         * yield and let another thread piggyback onto this
         * transaction.  Keep doing that while new threads continue to
         * arrive.  It doesn't cost much - we're about to run a commit
         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
         * operations by 30x or more...
         *
         * We try and optimize the sleep time against what the
         * underlying disk can do, instead of having a static sleep
         * time.  This is useful for the case where our storage is so
         * fast that it is more optimal to go ahead and force a flush
         * and wait for the transaction to be committed than it is to
         * wait for an arbitrary amount of time for new writers to
         * join the transaction.  We achieve this by measuring how
         * long it takes to commit a transaction, and compare it with
         * how long this transaction has been running, and if run time
         * < commit time then we sleep and commit.  This
         * greatly helps super fast disks that would see slowdowns as
         * more threads started doing fsyncs.
         *
         * (Note: the sleep below lasts for the clamped commit time
         * measured from now, not for the difference between commit time
         * and run time.)
         *
         * But don't do this if this process was the most recent one
         * to perform a synchronous write.  We do this to detect the
         * case where a single process is doing a stream of sync
         * writes.  No point in waiting for joiners in that case.
         *
         * Setting max_batch_time to 0 disables this completely.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid &&
            journal->j_max_batch_time) {
                u64 commit_time, trans_time;

                journal->j_last_sync_writer = pid;

                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
                read_unlock(&journal->j_state_lock);

                /* How long this transaction has been running, in ns. */
                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));

                /*
                 * Clamp commit_time to [min, max] batch time.  The batch
                 * times are multiplied by 1000 to compare against ns
                 * values - presumably they are stored in microseconds;
                 * TODO confirm.
                 */
                commit_time = max_t(u64, commit_time,
                                    1000*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
                                    1000*journal->j_max_batch_time);

                if (trans_time < commit_time) {
                        /* Absolute expiry: now + clamped commit time. */
                        ktime_t expires = ktime_add_ns(ktime_get(),
                                                       commit_time);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }
        }

        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;

        /*
         * If the handle is marked SYNC, we need to set another commit
         * going!  We also want to force a commit if the transaction is too
         * old now.
         */
        if (handle->h_sync ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                 * anything to disk. */

                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
                jbd2_log_start_commit(journal, tid);

                /*
                 * Special case: JBD2_SYNC synchronous updates require us
                 * to wait for the commit to complete.
                 * The wait is skipped when the task has PF_MEMALLOC set.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
                        wait_for_commit = 1;
        }

        /*
         * Once stop_this_handle() drops t_updates, the transaction could start
         * committing on us and eventually disappear.  So we must not
         * dereference transaction pointer again after calling
         * stop_this_handle().
         */
        stop_this_handle(handle);

        /* Overwrites any -EIO set above with the result of the wait. */
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);

free_and_exit:
        /* A reserved handle attached to this one is freed along with it. */
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
}

/*
 *
 * List management code snippets: various functions for manipulating the
 * transaction buffer lists.
 *
 */

/*
 * Link a buffer onto the tail of a transaction list.  @list points at the
 * transaction's list head pointer; the list is circular and doubly linked
 * through b_tnext/b_tprev, and the head pointer is NULL when it is empty.
 *
 * j_list_lock is held.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
{
        struct journal_head *head = *list;
        struct journal_head *tail;

        if (!head) {
                /* Empty list: jh becomes a ring of one and the new head. */
                jh->b_tprev = jh;
                jh->b_tnext = jh;
                *list = jh;
                return;
        }

        /* Insert at the tail of the list to preserve order */
        tail = head->b_tprev;
        jh->b_tprev = tail;
        jh->b_tnext = head;
        head->b_tprev = jh;
        tail->b_tnext = jh;
}

/*
 * Unlink a buffer from a transaction list.  @list points at the
 * transaction's list head pointer, which is advanced to the next entry
 * (or cleared when jh was the only entry) if jh is currently the head.
 * jh's own b_tnext/b_tprev are left untouched.
 *
 * Called with j_list_lock held, and the journal may not be locked.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
{
        struct journal_head *after = jh->b_tnext;
        struct journal_head *before = jh->b_tprev;

        if (*list == jh)
                *list = (after == jh) ? NULL : after;

        /* Close the ring around the removed entry. */
        before->b_tnext = after;
        after->b_tprev = before;
}

/*
 * Take a buffer off whichever transaction list it currently sits on and
 * mark it BJ_None.  jh->b_transaction itself is left alone.
 *
 * Beware that this can *change* the value of
 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
 * t_reserved_list.  A caller holding a copy of one of those pointers may
 * find it stale afterwards and generally has to re-read it from the
 * transaction_t.
 *
 * On the way out the jbddirty bit is converted: it is simply cleared when
 * the journal is aborted, otherwise a set jbddirty bit is turned into an
 * ordinary dirty buffer.
 *
 * Called under j_list_lock.
 */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        struct journal_head **head = NULL;
        transaction_t *transaction;

        lockdep_assert_held(&jh->b_state_lock);
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        /* Being on any list implies being owned by a transaction. */
        if (jh->b_jlist != BJ_None)
                J_ASSERT_JH(jh, transaction != NULL);

        /* Pick the list head matching the buffer's current list. */
        switch (jh->b_jlist) {
        case BJ_None:
                /* Not filed anywhere: nothing to unlink. */
                return;
        case BJ_Metadata:
                transaction->t_nr_buffers--;
                J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
                head = &transaction->t_buffers;
                break;
        case BJ_Forget:
                head = &transaction->t_forget;
                break;
        case BJ_Shadow:
                head = &transaction->t_shadow_list;
                break;
        case BJ_Reserved:
                head = &transaction->t_reserved_list;
                break;
        }

        __blist_del_buffer(head, jh);
        jh->b_jlist = BJ_None;

        if (transaction && is_journal_aborted(transaction->t_journal)) {
                clear_buffer_jbddirty(bh);
                return;
        }

        if (test_clear_buffer_jbddirty(bh))
                mark_buffer_dirty(bh);        /* Expose it to the VM */
}

/*
 * Remove buffer from all transactions. The caller is responsible for dropping
 * the jh reference that belonged to the transaction.
 *
 * The buffer must currently belong to a transaction (b_transaction set)
 * and must not be queued for a following one (b_next_transaction NULL);
 * both conditions are asserted.  On return the buffer is off its
 * transaction list and jh->b_transaction is NULL.
 *
 * Called with bh_state lock and j_list_lock
 */
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
        J_ASSERT_JH(jh, jh->b_transaction != NULL);
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);

        /* Unlink first: the unlink helper reads jh->b_transaction. */
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
}

/*
 * Locked wrapper around __jbd2_journal_unfile_buffer(): takes
 * jh->b_state_lock and then journal->j_list_lock, unfiles the buffer, and
 * afterwards drops one journal_head reference (the one that belonged to
 * the transaction, which the unlocked helper leaves to its caller).
 */
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);

        /* Get reference so that buffer cannot be freed before we unlock it */
        get_bh(bh);
        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
        jbd2_journal_put_journal_head(jh);
        /* Release the temporary buffer reference taken above. */
        __brelse(bh);
}

/*
 * Helper for jbd2_journal_try_to_free_buffers().
 *
 * Leaves the buffer alone if it is locked or dirty, or if it still belongs
 * to (or is queued for) a transaction.  Otherwise, if it is on a
 * checkpoint list, it is removed from that list under j_list_lock.
 *
 * Called under jh->b_state_lock
 */
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
{
        struct journal_head *jh = bh2jh(bh);

        if (buffer_locked(bh) || buffer_dirty(bh))
                return;

        if (jh->b_next_transaction || jh->b_transaction)
                return;

        spin_lock(&journal->j_list_lock);
        if (jh->b_cp_transaction) {
                /* written-back checkpointed metadata buffer */
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                __jbd2_journal_remove_checkpoint(jh);
        }
        spin_unlock(&journal->j_list_lock);
}

/**
 * jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
 *
 * The page must be locked by the caller (asserted).
 *
 * Every buffer on the page that has a journal_head is offered to
 * __journal_try_to_free_buffer() under jh->b_state_lock, which detaches
 * it from its checkpoint list when it is clean, unlocked and no longer
 * part of any transaction.  If a buffer still has a journal_head after
 * that, the page is considered busy: the walk stops and 0 is returned
 * without calling try_to_free_buffers().  Otherwise the result of
 * try_to_free_buffers() on the page is returned.
 *
 * If any buffer examined before the walk ended carries a write I/O error,
 * the journal is aborted with -EIO before returning (on the busy path as
 * well): freeing such a metadata buffer would hide the failure from the
 * checkpoint code and could leave the filesystem inconsistent once the
 * journal tail is cleaned up.
 *
 * Return 0 on failure, otherwise the value of try_to_free_buffers().
 */
int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
{
        struct buffer_head *head;
        struct buffer_head *bh;
        bool has_write_io_error = false;
        int ret = 0;

        J_ASSERT(PageLocked(page));

        head = page_buffers(page);
        bh = head;
        do {
                struct journal_head *jh;

                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
                 * jbd2_journal_put_journal_head().
                 */
                jh = jbd2_journal_grab_journal_head(bh);
                if (!jh)
                        continue;       /* jumps to the loop condition */

                spin_lock(&jh->b_state_lock);
                __journal_try_to_free_buffer(journal, bh);
                spin_unlock(&jh->b_state_lock);
                jbd2_journal_put_journal_head(jh);
                /* Still has a journal_head: the page cannot be freed. */
                if (buffer_jbd(bh))
                        goto busy;

                /*
                 * If we free a metadata buffer which has been failed to
                 * write out, the jbd2 checkpoint procedure will not detect
                 * this failure and may lead to filesystem inconsistency
                 * after cleanup journal tail.
                 */
                if (buffer_write_io_error(bh)) {
                        /* Newline-terminate so the log record is complete. */
                        pr_err("JBD2: Error while async write back metadata bh %llu.\n",
                               (unsigned long long)bh->b_blocknr);
                        has_write_io_error = true;
                }
        } while ((bh = bh->b_this_page) != head);

        ret = try_to_free_buffers(page);

busy:
        if (has_write_io_error)
                jbd2_journal_abort(journal, -EIO);

        return ret;
}

/*
 * This buffer is no longer needed.  If it is on an older transaction's
 * checkpoint list we need to record it on this transaction's forget list
 * to pin this buffer (and hence its checkpointing transaction) down until
 * this transaction commits.  If the buffer isn't on a checkpoint list, we
 * release it.
 * Returns non-zero if JBD no longer has an interest in the buffer.
 *
 * Called under j_list_lock.
 *
 * Called under jh->b_state_lock.
 */
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
        struct buffer_head *bh = jh2bh(jh);

        /*
         * No checkpoint link: take the buffer off its transaction list and
         * drop a journal_head reference.  Report that it may be freed.
         */
        if (!jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
                jbd2_journal_put_journal_head(jh);
                return 1;
        }

        /*
         * Still on a checkpoint list: move it to @transaction's BJ_Forget
         * list and report that it must not be freed.
         */
        JBUFFER_TRACE(jh, "on running+cp transaction");
        __jbd2_journal_temp_unlink_buffer(jh);
        /*
         * The buffer is not going to be written any more, so drop the
         * dirty bit before refiling; __journal_file_buffer checks it.
         */
        clear_buffer_dirty(bh);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
        return 0;
}

/*
 * jbd2_journal_invalidatepage
 *
 * This code is tricky.  It has a number of cases to deal with.
 *
 * There are two invariants which this code relies on:
 *
 * i_size must be updated on disk before we start calling invalidatepage on the
 * data.
 *
 *  This is done in ext3 by defining an ext3_setattr method which
 *  updates i_size before truncate gets going.  By maintaining this
 *  invariant, we can be sure that it is safe to throw away any buffers
 *  attached to the current transaction: once the transaction commits,
 *  we know that the data will not be needed.
 *
 *  Note however that we can *not* throw away data belonging to the
 *  previous, committing transaction!
 *
 * Any disk blocks which *are* part of the previous, committing
 * transaction (and which therefore cannot be discarded immediately) are
 * not going to be reused in the new running transaction
 *
 *  The bitmap committed_data images guarantee this: any block which is
 *  allocated in one transaction and removed in the next will be marked
 *  as in-use in the committed_data bitmap, so cannot be reused until
 *  the next transaction to delete the block commits.  This means that
 *  leaving committing buffers dirty is quite safe: the disk blocks
 *  cannot be reallocated to a different file and so buffer aliasing is
 *  not possible.
 *
 *
 * The above applies mainly to ordered data mode.  In writeback mode we
 * don't make guarantees about the order in which data hits disk --- in
 * particular we don't guarantee that new dirty data is flushed before
 * transaction commit --- so it is always safe just to discard data
 * immediately in that mode.  --sct
 */

/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                                int partial_page)
{
        /*
         * @journal:      journal whose transactions the buffer may be on
         * @bh:           buffer being invalidated
         * @partial_page: non-zero when the caller invalidates only part of
         *                the page
         *
         * Returns -EBUSY when the buffer is on the committing transaction,
         * @partial_page is set and bh->b_bdev is still set.  Returns 0 when
         * the buffer is left attached to a transaction (or was already
         * zapped in the partial-page case), and otherwise the value of
         * may_free, which starts at 1 and is overwritten by the result of
         * __dispose_buffer() on the paths that call it.
         */
        transaction_t *transaction;
        struct journal_head *jh;
        int may_free = 1;

        BUFFER_TRACE(bh, "entry");

        /*
         * It is safe to proceed here without the j_list_lock because the
         * buffers cannot be stolen by try_to_free_buffers as long as we are
         * holding the page lock. --sct
         */

        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh)
                goto zap_buffer_unlocked;

        /* OK, we have data buffer in journaled mode */
        /*
         * Lock order used here: j_state_lock, then b_state_lock, then
         * j_list_lock.  Every exit path below releases them in the reverse
         * order before dropping the jh reference taken above.
         */
        write_lock(&journal->j_state_lock);
        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
         * is committed.  Otherwise if the transaction changing the
         * buffer would be cleaned from the journal before T is
         * committed, a crash will cause that the correct contents of
         * the buffer will be lost.  On the other hand we have to
         * clear the buffer dirty bit at latest at the moment when the
         * transaction marking the buffer as freed in the filesystem
         * structures is committed because from that moment on the
         * block can be reallocated and used by a different page.
         * Since the block hasn't been freed yet but the inode has
         * already been added to orphan list, it is safe for us to add
         * the buffer to BJ_Forget list of the newest transaction.
         *
         * Also we have to clear buffer_mapped flag of a truncated buffer
         * because the buffer_head may be attached to the page straddling
         * i_size (can happen only when blocksize < pagesize) and thus the
         * buffer_head can be reused when the file is extended again. So we end
         * up keeping around invalidated buffers attached to transactions'
         * BJ_Forget list just to stop checkpointing code from cleaning up
         * the transaction this buffer was modified in.
         */
        transaction = jh->b_transaction;
        if (transaction == NULL) {
                /* First case: not on any transaction.  If it
                 * has no checkpoint link, then we can zap it:
                 * it's a writeback-mode buffer so we don't care
                 * if it hits disk safely. */
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "not on any transaction: zap");
                        goto zap_buffer;
                }

                if (!buffer_dirty(bh)) {
                        /* bdflush has written it.  We can drop it now */
                        __jbd2_journal_remove_checkpoint(jh);
                        goto zap_buffer;
                }

                /* OK, it must be in the journal but still not
                 * written fully to disk: it's metadata or
                 * journaled data... */

                if (journal->j_running_transaction) {
                        /* ... and once the current transaction has
                         * committed, the buffer won't be needed any
                         * longer. */
                        JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
                        may_free = __dispose_buffer(jh,
                                        journal->j_running_transaction);
                        goto zap_buffer;
                } else {
                        /* There is no currently-running transaction. So the
                         * orphan record which we wrote for this file must have
                         * passed into commit.  We must attach this buffer to
                         * the committing transaction, if it exists. */
                        if (journal->j_committing_transaction) {
                                JBUFFER_TRACE(jh, "give to committing trans");
                                may_free = __dispose_buffer(jh,
                                        journal->j_committing_transaction);
                                goto zap_buffer;
                        } else {
                                /* The orphan record's transaction has
                                 * committed.  We can cleanse this buffer */
                                clear_buffer_jbddirty(bh);
                                __jbd2_journal_remove_checkpoint(jh);
                                goto zap_buffer;
                        }
                }
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
                /*
                 * The buffer is committing, we simply cannot touch
                 * it. If the page is straddling i_size we have to wait
                 * for commit and try again.
                 */
                if (partial_page) {
                        spin_unlock(&journal->j_list_lock);
                        spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
                        jbd2_journal_put_journal_head(jh);
                        /* Already zapped buffer? Nothing to do... */
                        if (!bh->b_bdev)
                                return 0;
                        return -EBUSY;
                }
                /*
                 * OK, buffer won't be reachable after truncate. We just clear
                 * b_modified to not confuse transaction credit accounting, and
                 * set b_next_transaction to the running transaction (if there
                 * is one) and mark buffer as freed so that commit code knows
                 * it should clear dirty bits when it is done with the buffer.
                 */
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
                jh->b_modified = 0;
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
                jbd2_journal_put_journal_head(jh);
                /* Buffer stays on the committing transaction: not freeable. */
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
                 * We are writing our own transaction's data, not any
                 * previous one's, so it is safe to throw it away
                 * (remember that we expect the filesystem to have set
                 * i_size already for this truncate so recovery will not
                 * expose the disk blocks we are discarding here.) */
                J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
                JBUFFER_TRACE(jh, "on running transaction");
                may_free = __dispose_buffer(jh, transaction);
        }

zap_buffer:
        /*
         * This is tricky. Although the buffer is truncated, it may be reused
         * if blocksize < pagesize and it is attached to the page straddling
         * EOF. Since the buffer might have been added to BJ_Forget list of the
         * running transaction, journal_get_write_access() won't clear
         * b_modified and credit accounting gets confused. So clear b_modified
         * here.
         */
        jh->b_modified = 0;
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
        jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
        /*
         * Also reached directly when bh had no journal_head; in that case no
         * locks were taken and no jh reference is held.  Wipe the buffer
         * state bits and detach it from its block device.
         */
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
        clear_buffer_mapped(bh);
        clear_buffer_req(bh);
        clear_buffer_new(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
        bh->b_bdev = NULL;
        return may_free;
}

/**
 * jbd2_journal_invalidatepage()
 * @journal: journal to use for flush...
 * @page:    page to flush
 * @offset:  start of the range to invalidate
 * @length:  length of the range to invalidate
 *
 * Reap page buffers containing data in the specified range in page.
 * Can return -EBUSY if buffers are part of the committing transaction and
 * the page is straddling i_size. Caller then has to wait for current commit
 * and try again.
 */
int jbd2_journal_invalidatepage(journal_t *journal,
                                struct page *page,
                                unsigned int offset,
                                unsigned int length)
{
        struct buffer_head *head, *bh;
        unsigned int range_end = offset + length;
        unsigned int blk_start = 0;
        int partial_page = (offset || length < PAGE_SIZE);
        int may_free = 1;
        int ret = 0;

        if (!PageLocked(page))
                BUG();
        if (!page_has_buffers(page))
                return 0;

        /* The range must lie within the page and must not have wrapped. */
        BUG_ON(range_end > PAGE_SIZE || range_end < length);

        /*
         * Lists other than the data lists may be touched below (notably in
         * journaled data mode), so each buffer is handled under its own
         * buffer lock.
         */
        head = page_buffers(page);
        bh = head;
        do {
                unsigned int blk_end = blk_start + bh->b_size;
                /* Fetch the successor before the buffer is unmapped. */
                struct buffer_head *following = bh->b_this_page;

                /* This buffer extends past the range: stop here. */
                if (blk_end > range_end)
                        return 0;

                /* Only buffers that start at or after @offset are unmapped. */
                if (blk_start >= offset) {
                        lock_buffer(bh);
                        ret = journal_unmap_buffer(journal, bh, partial_page);
                        unlock_buffer(bh);
                        if (ret < 0)
                                return ret;
                        may_free &= ret;
                }

                blk_start = blk_end;
                bh = following;
        } while (bh != head);

        /* Try to release the buffers only when the whole page was covered. */
        if (!partial_page && may_free && try_to_free_buffers(page))
                J_ASSERT(!page_has_buffers(page));

        return 0;
}

/*
 * File a buffer on the given transaction list.
 */
void __jbd2_journal_file_buffer(struct journal_head *jh,
                        transaction_t *transaction, int jlist)
{
        struct buffer_head *bh = jh2bh(jh);
        struct journal_head **target = NULL;
        int redirty = 0;

        lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_transaction == NULL);

        /* Already filed on the requested list: nothing to do. */
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;

        switch (jlist) {
        case BJ_Metadata:
        case BJ_Reserved:
        case BJ_Shadow:
        case BJ_Forget:
                /*
                 * On these lists dirtiness is tracked in buffer_jbddirty
                 * rather than buffer_dirty.  The dirty bit is normally
                 * cleared in do_get_write_access, but e.g. tune2fs can
                 * modify the sb and set it at any time, so handle a set
                 * bit gracefully: warn, clear it, and remember to set
                 * jbddirty once the buffer has been filed.
                 */
                if (buffer_dirty(bh))
                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        redirty = 1;
                break;
        }

        /*
         * A jh that is not yet on a transaction gets a new journal_head
         * reference; otherwise it is only unlinked from its current list.
         */
        if (!jh->b_transaction)
                jbd2_journal_grab_journal_head(bh);
        else
                __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = transaction;

        switch (jlist) {
        case BJ_None:
                J_ASSERT_JH(jh, !jh->b_committed_data);
                J_ASSERT_JH(jh, !jh->b_frozen_data);
                return;
        case BJ_Reserved:
                target = &transaction->t_reserved_list;
                break;
        case BJ_Shadow:
                target = &transaction->t_shadow_list;
                break;
        case BJ_Forget:
                target = &transaction->t_forget;
                break;
        case BJ_Metadata:
                target = &transaction->t_buffers;
                transaction->t_nr_buffers++;
                break;
        }

        __blist_add_buffer(target, jh);
        jh->b_jlist = jlist;

        if (redirty)
                set_buffer_jbddirty(bh);
}

/*
 * Locked wrapper around __jbd2_journal_file_buffer(): takes jh->b_state_lock
 * and then the journal's j_list_lock, files @jh on list @jlist of
 * @transaction, and releases both locks in reverse order.
 */
void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
{
        spin_lock(&jh->b_state_lock);
        spin_lock(&transaction->t_journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, jlist);
        spin_unlock(&transaction->t_journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
}

/*
 * Remove a buffer from its current buffer list in preparation for
 * dropping it from its current transaction entirely.  If the buffer has
 * already started to be used by a subsequent transaction, refile the
 * buffer on that transaction's metadata list.
 *
 * Called under j_list_lock
 * Called under jh->b_state_lock
 *
 * When this function returns true, there's no next transaction to refile to
 * and the caller has to drop jh reference through
 * jbd2_journal_put_journal_head().
 */
bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        int redirty;
        int target;

        lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

        /* No later transaction wants the buffer: unfile it and tell the caller. */
        if (!jh->b_next_transaction) {
                __jbd2_journal_unfile_buffer(jh);
                return true;
        }

        /*
         * A later transaction has claimed the buffer.  Take it off its
         * current list, remembering the jbddirty state so it can be
         * restored once the buffer has been refiled.
         */
        redirty = test_clear_buffer_jbddirty(bh);
        __jbd2_journal_temp_unlink_buffer(jh);

        /*
         * b_transaction has to be set here; otherwise the new b_transaction
         * would not be holding a jh reference.
         */
        J_ASSERT_JH(jh, jh->b_transaction != NULL);

        /*
         * Hand the existing jh reference over to b_next_transaction by
         * switching b_transaction first, so that
         * __jbd2_journal_file_buffer() does not take a new one.
         */
        WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
        WRITE_ONCE(jh->b_next_transaction, NULL);

        /* Pick the list: freed -> Forget, modified -> Metadata, else Reserved. */
        target = BJ_Reserved;
        if (buffer_freed(bh))
                target = BJ_Forget;
        else if (jh->b_modified)
                target = BJ_Metadata;

        __jbd2_journal_file_buffer(jh, jh->b_transaction, target);
        J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

        if (redirty)
                set_buffer_jbddirty(bh);
        return false;
}

/*
 * __jbd2_journal_refile_buffer() with necessary locking added. We take our
 * bh reference so that we can safely unlock bh.
 *
 * The jh and bh may be freed by this call.
 */
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
        bool unfiled;

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        unfiled = __jbd2_journal_refile_buffer(jh);
        spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);

        /* The buffer was refiled to its next transaction: nothing to drop. */
        if (!unfiled)
                return;

        /* There was no next transaction, so the jh reference is dropped here. */
        jbd2_journal_put_journal_head(jh);
}

/*
 * File inode in the inode list of the handle's transaction
 */
static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
                unsigned long flags, loff_t start_byte, loff_t end_byte)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
                        transaction->t_tid);

        spin_lock(&journal->j_list_lock);
        jinode->i_flags |= flags;

        /* Start a new dirty range, or widen the existing one. */
        if (!jinode->i_dirty_end) {
                jinode->i_dirty_start = start_byte;
                jinode->i_dirty_end = end_byte;
        } else {
                jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
                jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
        }

        /* Nothing more to do if the inode is already attached where needed. */
        if (jinode->i_transaction != transaction &&
            jinode->i_next_transaction != transaction) {
                /*
                 * The flag is only ever set to 1, so testing it first is
                 * safe; it is likely already set, and skipping the store
                 * saves some cacheline bouncing.
                 */
                if (!transaction->t_need_data_flush)
                        transaction->t_need_data_flush = 1;

                if (jinode->i_transaction) {
                        /*
                         * On some other transaction's list, which should be
                         * the committing one: queue ours as the next one.
                         */
                        J_ASSERT(jinode->i_next_transaction == NULL);
                        J_ASSERT(jinode->i_transaction ==
                                        journal->j_committing_transaction);
                        jinode->i_next_transaction = transaction;
                } else {
                        /* Not on any transaction list yet: attach it to ours. */
                        J_ASSERT(!jinode->i_next_transaction);
                        jinode->i_transaction = transaction;
                        list_add(&jinode->i_list, &transaction->t_inode_list);
                }
        }

        spin_unlock(&journal->j_list_lock);

        return 0;
}

int jbd2_journal_inode_ranged_write(handle_t *handle,
                struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
        /* The helper takes an inclusive end offset. */
        loff_t end_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode,
                        JI_WRITE_DATA | JI_WAIT_DATA, start_byte, end_byte);
}

int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
                loff_t start_byte, loff_t length)
{
        /* The helper takes an inclusive end offset. */
        loff_t end_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
                        start_byte, end_byte);
}

/*
 * File truncate and transaction commit interact with each other in a
 * non-trivial way.  If a transaction writing data block A is
 * committing, we cannot discard the data by truncate until we have
 * written them.  Otherwise if we crashed after the transaction with
 * write has committed but before the transaction with truncate has
 * committed, we could see stale data in block A.  This function is a
 * helper to solve this problem.  It starts writeout of the truncated
 * part in case it is in the committing transaction.
 *
 * Filesystem code must call this function when inode is journaled in
 * ordered mode before truncation happens and after the inode has been
 * placed on orphan list with the new inode size. The second condition
 * avoids the race that someone writes new data and we start
 * committing the transaction after this function has been called but
 * before a transaction for truncate is started (and furthermore it
 * allows us to optimize the case where the addition to orphan list
 * happens in the same transaction as write --- we don't have to write
 * any data in such case).
 */
int jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                        struct jbd2_inode *jinode,
                                        loff_t new_size)
{
        transaction_t *committing, *owner;
        int err;

        /* Quick unlocked check: the inode is on no transaction at all. */
        if (!jinode->i_transaction)
                return 0;

        /*
         * The locks only force recent values to be read; it is enough that
         * the transaction was not committing before we started a transaction
         * adding the inode to the orphan list.
         */
        read_lock(&journal->j_state_lock);
        committing = journal->j_committing_transaction;
        read_unlock(&journal->j_state_lock);

        spin_lock(&journal->j_list_lock);
        owner = jinode->i_transaction;
        spin_unlock(&journal->j_list_lock);

        /* Writeout is needed only if the inode is on the committing transaction. */
        if (owner != committing)
                return 0;

        /* Start writeout from new_size to the end of the file. */
        err = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
                new_size, LLONG_MAX);
        if (err)
                jbd2_journal_abort(journal, err);

        return err;
}











    3 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        /* True exactly when the VM_HUGETLB bit is set in the VMA's flags. */
        return (vma->vm_flags & VM_HUGETLB) != 0;
}

#else

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        /* Stub used when CONFIG_HUGETLB_PAGE is not defined: always false. */
        return false;
}

#endif

#endif




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include "cgroup-internal.h"
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

/*
 * Static keys for the cpuset fast paths.  Both are defined in the
 * disabled (false) state.
 * NOTE(review): presumably toggled as cpusets come and go and tested by
 * the inline helpers in <linux/cpuset.h> - confirm against that header.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * Abnormal cpuset configurations for cpu or memory node binding are
 * possible; this key provides a quick, low-cost test for that situation.
 * It starts out disabled and is switched on by check_insane_mems_config().
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */

/*
 * Frequency meter state.  One instance is embedded in every struct cpuset
 * as its memory_pressure filter; all fields are guarded by @lock.
 */
struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
        time64_t time;                /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
};

/*
 * Per-cgroup cpuset state.  The embedded @css links the cpuset into the
 * cgroup hierarchy; css_cs() converts a css pointer back to its cpuset.
 */
struct cpuset {
        struct cgroup_subsys_state css;

        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * On default hierarchy:
         *
         * The user-configured masks can only be changed by writing to
         * cpuset.cpus and cpuset.mems, and won't be limited by the
         * parent masks.
         *
         * The effective masks are the real masks that apply to the tasks
         * in the cpuset. They may be changed if the configured masks are
         * changed or hotplug happens.
         *
         * effective_mask == configured_mask & parent's effective_mask,
         * and if it ends up empty, it will inherit the parent's mask.
         *
         *
         * On legacy hierarchy:
         *
         * The user-configured masks are always the same as the effective
         * masks.
         */

        /* user-configured CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t cpus_allowed;
        nodemask_t mems_allowed;

        /* effective CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t effective_cpus;
        nodemask_t effective_mems;

        /*
         * CPUs allocated to child sub-partitions (default hierarchy only)
         * - CPUs granted by the parent = effective_cpus U subparts_cpus
         * - effective_cpus and subparts_cpus are mutually exclusive.
         *
         * effective_cpus contains only onlined CPUs, but subparts_cpus
         * may have offlined ones.
         */
        cpumask_var_t subparts_cpus;

        /*
         * These are the old Memory Nodes that tasks took on.
         *
         * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
         * - A new cpuset's old_mems_allowed is initialized when some
         *   task is moved into it.
         * - old_mems_allowed is used in cpuset_migrate_mm() when we change
         *   cpuset.mems_allowed and have tasks' nodemask updated, and
         *   then old_mems_allowed is updated to mems_allowed.
         */
        nodemask_t old_mems_allowed;

        struct fmeter fmeter;                /* memory_pressure filter */

        /*
         * Tasks are being attached to this cpuset.  Used to prevent
         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
         */
        int attach_in_progress;

        /* partition number for rebuild_sched_domains() */
        int pn;

        /* for custom sched domain */
        int relax_domain_level;

        /* number of CPUs in subparts_cpus */
        int nr_subparts_cpus;

        /* partition root state; one of the PRS_* values */
        int partition_root_state;

        /*
         * Default hierarchy only:
         * use_parent_ecpus - set if using parent's effective_cpus
         * child_ecpus_count - # of children with use_parent_ecpus set
         */
        int use_parent_ecpus;
        int child_ecpus_count;

        /*
         * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
         * know when to rebuild associated root domain bandwidth information.
         */
        int nr_deadline_tasks;
        int nr_migrate_dl_tasks;
        u64 sum_migrate_dl_bw;
};

/*
 * Partition root states (values of cpuset->partition_root_state):
 *
 *   0 - not a partition root
 *
 *   1 - partition root
 *
 *  -1 - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus. In this case, the cpuset is not a real partition
 *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 *       and the cpuset can be restored back to a partition root if the
 *       parent cpuset can give more CPUs back to this child cpuset.
 *
 * is_partition_root() reports true only for a positive state, so both
 * PRS_DISABLED and PRS_ERROR count as "not a partition root" there.
 */
#define PRS_DISABLED                0
#define PRS_ENABLED                1
#define PRS_ERROR                -1

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 *
 * All three masks are allocated together by alloc_cpumasks(NULL, tmp) and
 * released together by free_cpumasks(NULL, tmp).
 */
struct tmpmasks {
        cpumask_var_t addmask, delmask;        /* For partition root */
        cpumask_var_t new_cpus;                /* For update_cpumasks_hier() */
};

/* Map a css to the struct cpuset that embeds it; a NULL css yields NULL. */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct cpuset, css);
}

/*
 * Retrieve the cpuset for a task: look up the task's css for the cpuset
 * controller and convert it to the enclosing struct cpuset.
 */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        struct cgroup_subsys_state *task_cpuset_css;

        task_cpuset_css = task_css(task, cpuset_cgrp_id);
        return css_cs(task_cpuset_css);
}

/*
 * Return the cpuset owning the parent css of @cs.  The result is NULL when
 * @cs has no parent css, because css_cs() maps a NULL css to NULL.
 */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
        struct cgroup_subsys_state *parent_css = cs->css.parent;

        return css_cs(parent_css);
}

void inc_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks--;
}

/*
 * Bits in the struct cpuset flags field.
 *
 * The enumerators are bit numbers, not masks: they are passed to
 * test_bit() by the is_*() helpers, and shifted into place when
 * top_cpuset.flags is initialized.
 */
typedef enum {
        CS_ONLINE,
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
        CS_MEM_HARDWALL,
        CS_MEMORY_MIGRATE,
        CS_SCHED_LOAD_BALANCE,
        CS_SPREAD_PAGE,
        CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
        return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

/* Nonzero if CS_CPU_EXCLUSIVE is set in @cs->flags. */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_CPU_EXCLUSIVE, flagword);
}

/* Nonzero if CS_MEM_EXCLUSIVE is set in @cs->flags. */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_MEM_EXCLUSIVE, flagword);
}

/* Nonzero if CS_MEM_HARDWALL is set in @cs->flags. */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_MEM_HARDWALL, flagword);
}

/* Nonzero if CS_SCHED_LOAD_BALANCE is set in @cs->flags. */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_SCHED_LOAD_BALANCE, flagword);
}

/* Nonzero if CS_MEMORY_MIGRATE is set in @cs->flags. */
static inline int is_memory_migrate(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_MEMORY_MIGRATE, flagword);
}

/* Nonzero if CS_SPREAD_PAGE is set in @cs->flags. */
static inline int is_spread_page(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_SPREAD_PAGE, flagword);
}

/* Nonzero if CS_SPREAD_SLAB is set in @cs->flags. */
static inline int is_spread_slab(const struct cpuset *cs)
{
        const unsigned long *flagword = &cs->flags;

        return test_bit(CS_SPREAD_SLAB, flagword);
}

/*
 * Nonzero only when partition_root_state is positive, so an invalid
 * partition root (negative state) is reported as "not a partition root".
 */
static inline int is_partition_root(const struct cpuset *cs)
{
        const int state = cs->partition_root_state;

        return state > 0;
}

/*
 * The root cpuset.  It is statically marked online, CPU-exclusive and
 * memory-exclusive, and starts out as an enabled partition root.
 */
static struct cpuset top_cpuset = {
        .flags                  = (1UL << CS_ONLINE) |
                                  (1UL << CS_CPU_EXCLUSIVE) |
                                  (1UL << CS_MEM_EXCLUSIVE),
        .partition_root_state   = PRS_ENABLED,
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The expansion ends in a bare "if" that filters out cpusets for which
 * is_cpuset_online() is false, so the loop body must not be followed by
 * an "else": it would bind to that hidden "if".
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)                \
        css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The expansion ends in a bare "if" that filters out cpusets for which
 * is_cpuset_online() is false, so the loop body must not be followed by
 * an "else": it would bind to that hidden "if".
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
        css_for_each_descendant_pre((pos_css), &(root_cs)->css)                \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Because the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, they are protected by the alloc_lock in the
 * task_struct.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

/* The mutex half of the two global cpuset locks described above. */
static DEFINE_MUTEX(cpuset_mutex);

/*
 * cpuset_lock - acquire cpuset_mutex.
 *
 * Non-static wrapper, so code outside this file can take the mutex without
 * seeing it directly.  Pair with cpuset_unlock().
 */
void cpuset_lock(void)
{
        mutex_lock(&cpuset_mutex);
}

/* cpuset_unlock - release cpuset_mutex taken by cpuset_lock(). */
void cpuset_unlock(void)
{
        mutex_unlock(&cpuset_mutex);
}

/*
 * The spinlock half of the two global cpuset locks described above: held
 * alone it gives read-only access to cpusets, and it must be held together
 * with cpuset_mutex to modify them.
 */
static DEFINE_SPINLOCK(callback_lock);

/* NOTE(review): presumably the workqueue for mm migration work - confirm at its users. */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously: cpuset_hotplug_work is
 * a work item whose handler is cpuset_hotplug_workfn().
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/* NOTE(review): wait queue presumably tied to attach_in_progress - confirm at its users. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Flag a "movable nodes only" memory configuration.
 *
 * Does nothing if cpusets_insane_config() already reports the condition or
 * if @nodes is not a movable-only nodemask.  Otherwise enables
 * cpusets_insane_config_key and logs the offending nodemask once.
 */
static inline void check_insane_mems_config(nodemask_t *nodes)
{
        if (cpusets_insane_config())
                return;

        if (!movable_only_nodes(nodes))
                return;

        static_branch_enable_cpuslocked(&cpusets_insane_config_key);
        pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
                "Cpuset allocations might fail even with a lot of memory available.\n",
                nodemask_pr_args(nodes));
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
        return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
              (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/*
 * Store in @pmask the online portion of a cpuset's effective_cpus.  If
 * @cs has no online CPU, climb the cpuset hierarchy until an ancestor
 * with at least one online CPU is found and use that ancestor instead.
 *
 * The result is always a non-empty subset of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
        for (;;) {
                if (cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
                        cpumask_and(pmask, cs->effective_cpus,
                                    cpu_online_mask);
                        return;
                }

                cs = parent_cs(cs);
                if (unlikely(!cs))
                        break;
        }

        /*
         * We walked off the top: even the top cpuset has no online cpu,
         * as a consequence of a race between cpuset_hotplug_work and the
         * cpu hotplug notifier.  The top cpuset's effective_cpus is on
         * its way to becoming identical to cpu_online_mask, so hand that
         * back directly.
         */
        cpumask_copy(pmask, cpu_online_mask);
}

/*
 * Store in *pmask the portion of a cpuset's effective_mems that is online
 * with memory.  If @cs has no such node, climb the cpuset hierarchy until
 * an ancestor that has one is found.  The walk has no NULL check and
 * relies on the top cpuset always having some mems online.
 *
 * The result is always a non-empty subset of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
        struct cpuset *ancestor;

        for (ancestor = cs;
             !nodes_intersects(ancestor->effective_mems, node_states[N_MEMORY]);
             ancestor = parent_cs(ancestor))
                ;

        nodes_and(*pmask, ancestor->effective_mems, node_states[N_MEMORY]);
}

/*
 * Make @tsk's page-spread and slab-spread flags mirror those of @cs: each
 * task flag is set when the matching cpuset flag is set and cleared
 * otherwise.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (!is_spread_page(cs)) {
                task_clear_spread_page(tsk);
        } else {
                task_set_spread_page(tsk);
        }

        if (!is_spread_slab(cs)) {
                task_clear_spread_slab(tsk);
        } else {
                task_set_spread_slab(tsk);
        }
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * p is a subset of q when p's cpus_allowed and mems_allowed are subsets of
 * q's, and p has an exclusive flag (cpu or mem) set only if q has the same
 * flag set.  Returns 1 if so, 0 otherwise.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        if (!cpumask_subset(p->cpus_allowed, q->cpus_allowed))
                return 0;

        if (!nodes_subset(p->mems_allowed, q->mems_allowed))
                return 0;

        if (is_cpu_exclusive(p) > is_cpu_exclusive(q))
                return 0;

        return is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs:  the cpuset whose cpumasks are to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.  When @cs is
 * non-NULL its cpus_allowed, effective_cpus and subparts_cpus are
 * allocated; otherwise @tmp's new_cpus, addmask and delmask are.  All
 * masks are zero-filled.  On failure, masks already allocated by this
 * call are freed again before returning.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        /* Listed in allocation order; unwinding runs in reverse. */
        cpumask_var_t *masks[] = {
                cs ? &cs->cpus_allowed   : &tmp->new_cpus,
                cs ? &cs->effective_cpus : &tmp->addmask,
                cs ? &cs->subparts_cpus  : &tmp->delmask,
        };
        const int nr_masks = sizeof(masks) / sizeof(masks[0]);
        int i;

        for (i = 0; i < nr_masks; i++) {
                if (!zalloc_cpumask_var(masks[i], GFP_KERNEL))
                        goto unwind;
        }

        return 0;

unwind:
        while (--i >= 0)
                free_cpumask_var(*masks[i]);
        return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be freed, or NULL.
 * @tmp: the tmpmasks structure pointer, or NULL.
 *
 * Each non-NULL argument has all three of its cpumasks freed; a NULL
 * argument is skipped.
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        if (cs) {
                free_cpumask_var(cs->cpus_allowed);
                free_cpumask_var(cs->effective_cpus);
                free_cpumask_var(cs->subparts_cpus);
        }

        if (!tmp)
                return;

        free_cpumask_var(tmp->new_cpus);
        free_cpumask_var(tmp->addmask);
        free_cpumask_var(tmp->delmask);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * The trial cpuset starts as a byte-wise copy of @cs and then gets its own
 * set of cpumasks.  cpus_allowed and effective_cpus are copied over from
 * @cs; subparts_cpus is left zero-filled as alloc_cpumasks() returned it.
 * Release the result with free_cpuset().
 *
 * Return: the trial cpuset, or NULL if any allocation failed.
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
        struct cpuset *copy = kmemdup(cs, sizeof(*cs), GFP_KERNEL);

        if (!copy)
                goto fail;

        /* Replace the mask fields duplicated from @cs with fresh masks. */
        if (alloc_cpumasks(copy, NULL) != 0)
                goto free_copy;

        cpumask_copy(copy->cpus_allowed, cs->cpus_allowed);
        cpumask_copy(copy->effective_cpus, cs->effective_cpus);
        return copy;

free_copy:
        kfree(copy);
fail:
        return NULL;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 *
 * Frees the three cpumasks owned by @cs via free_cpumasks(), then the
 * cpuset structure itself.
 */
static inline void free_cpuset(struct cpuset *cs)
{
        free_cpumasks(cs, NULL);
        kfree(cs);
}

/*
 * validate_change() - check a proposed cpuset change against the
 *                     structural rules for cpusets.
 *
 * Answers the question: if the flag and mask values of the in-use cpuset
 * (cur) were replaced by those of the trial cpuset (trial), would the
 * various subset and exclusive rules still hold?  Presumes cpuset_mutex
 * held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Anything that depends
 * on the real address of the cpuset (such as list traversal) must use cur
 * below, never trial.
 *
 * 'trial' is a bulk structure copy of cur, with perhaps one or more of the
 * fields cpus_allowed, mems_allowed, or flags changed to new, trial values.
 *
 * Return 0 if valid, otherwise:
 *   -EBUSY   a child would no longer be a subset of us, or an exclusive
 *            cpuset cannot shrink its cpumask
 *   -EACCES  on the legacy hierarchy, trial is not a subset of its parent
 *   -EINVAL  trial overlaps a sibling while either of them is exclusive
 *   -ENOSPC  a cpuset with tasks would end up with empty cpus or mems
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *child, *sibling, *parent;
        int err = 0;

        rcu_read_lock();

        /* Every child cpuset has to remain a subset of us */
        cpuset_for_each_child(child, pos, cur) {
                if (!is_cpuset_subset(child, trial)) {
                        err = -EBUSY;
                        goto unlock;
                }
        }

        /* The root cpuset is exempt from all remaining checks */
        if (cur == &top_cpuset)
                goto unlock;

        parent = parent_cs(cur);

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        if (!is_in_v2_mode() && !is_cpuset_subset(trial, parent)) {
                err = -EACCES;
                goto unlock;
        }

        /*
         * No overlap is allowed with a sibling (!= me) when either of us
         * is exclusive, for CPUs and memory nodes alike.
         */
        cpuset_for_each_child(sibling, pos, parent) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) &&
                    sibling != cur &&
                    cpumask_intersects(trial->cpus_allowed,
                                       sibling->cpus_allowed)) {
                        err = -EINVAL;
                        goto unlock;
                }
                if ((is_mem_exclusive(trial) || is_mem_exclusive(sibling)) &&
                    sibling != cur &&
                    nodes_intersects(trial->mems_allowed,
                                     sibling->mems_allowed)) {
                        err = -EINVAL;
                        goto unlock;
                }
        }

        /*
         * Cpusets with tasks - existing or newly being attached - can't
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        if (cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress) {
                if ((!cpumask_empty(cur->cpus_allowed) &&
                     cpumask_empty(trial->cpus_allowed)) ||
                    (!nodes_empty(cur->mems_allowed) &&
                     nodes_empty(trial->mems_allowed))) {
                        err = -ENOSPC;
                        goto unlock;
                }
        }

        /*
         * Shrinking is refused when it would not leave enough room for
         * SCHED_DEADLINE tasks.
         */
        if (is_cpu_exclusive(cur) &&
            !cpuset_cpumask_can_shrink(cur->cpus_allowed,
                                       trial->cpus_allowed))
                err = -EBUSY;

unlock:
        rcu_read_unlock();
        return err;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Returns the result of intersecting the effective_cpus masks of
 * cpusets a and b, i.e. non-zero when they overlap.
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
        const struct cpumask *a_cpus = a->effective_cpus;
        const struct cpumask *b_cpus = b->effective_cpus;

        return cpumask_intersects(a_cpus, b_cpus);
}

/* Raise @dattr's relax_domain_level to that of @c when @c's is larger. */
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
        int level = c->relax_domain_level;

        if (level > dattr->relax_domain_level)
                dattr->relax_domain_level = level;
}

/*
 * Fold the relax_domain_level of every load-balanced cpuset in the subtree
 * rooted at @root_cs into @dattr.  Subtrees whose root has an empty
 * cpus_allowed are not visited.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
                                    struct cpuset *root_cs)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *desc;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(desc, pos, root_cs) {
                if (cpumask_empty(desc->cpus_allowed)) {
                        /* @desc has no CPU: jump past its whole subtree */
                        pos = css_rightmost_descendant(pos);
                } else if (is_sched_load_balance(desc)) {
                        update_domain_attr(dattr, desc);
                }
        }
        rcu_read_unlock();
}

/*
 * Upper bound on the number of cpusets, used to size the cpuset array in
 * generate_sched_domains().  Must be called with cpuset_mutex held.
 */
static inline int nr_cpusets(void)
{
        int key_refs = static_key_count(&cpusets_enabled_key.key);

        /* the jump label reference count, plus one for the top-level cpuset */
        return key_refs + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * On return, *domains holds the array of domain cpumasks (NULL if its
 * allocation failed, in which case 1 is returned so that the caller falls
 * back to the default domain) and *attributes holds the matching attribute
 * array (NULL if its allocation failed).  The return value is the number
 * of sched domains.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *           top-down scan of all cpusets. For our purposes, rebuilding
 *           the scheduler's sched domains, we can ignore !is_sched_load_
 *           balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *           that need to be load balanced, for convenient iterative
 *           access by the subsequent code that finds the best partition,
 *           i.e. the set of domains (subsets) of CPUs such that the
 *           cpus_allowed of every cpuset marked is_sched_load_balance
 *           is a subset of one of these domains, while there are as
 *           many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *           the kernel/sched/core.c routine partition_sched_domains() in a
 *           convenient format, that can be easily compared to the prior
 *           value to determine what partition elements (sched domains)
 *           were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *        The triple nested loops below over i, j, k scan over the
 *        load balanced cpusets (using the array of cpuset pointers in
 *        csa[]) looking for pairs of cpusets that have overlapping
 *        cpus_allowed, but which don't have the same 'pn' partition
 *        number, and puts them in the same partition number.  It keeps
 *        looping on the 'restart' label until it can no longer find
 *        any such pairs.
 *
 *        The union of the cpus_allowed masks from the set of
 *        all cpusets having the same 'pn' value then forms one
 *        element of the partition (one sched domain) to be passed to
 *        partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
{
        struct cpuset *cp;        /* top-down scan of cpusets */
        struct cpuset **csa;        /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;                /* indices for partition finding loops */
        cpumask_var_t *doms;        /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;                /* number of sched domains in result */
        int nslot;                /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
        bool root_load_balance = is_sched_load_balance(&top_cpuset);

        doms = NULL;
        dattr = NULL;
        csa = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;

                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                cpumask_and(doms[0], top_cpuset.effective_cpus,
                            housekeeping_cpumask(HK_FLAG_DOMAIN));

                goto done;
        }

        csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        rcu_read_lock();
        if (root_load_balance)
                csa[csn++] = &top_cpuset;
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;
                /*
                 * Continue traversing beyond @cp iff @cp has some CPUs and
                 * isn't load balancing.  The former is obvious.  The
                 * latter: All child cpusets contain a subset of the
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
                 *
                 * If root is load-balancing, we can skip @cp if it
                 * is a subset of the root's effective_cpus.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
                      cpumask_intersects(cp->cpus_allowed,
                                         housekeeping_cpumask(HK_FLAG_DOMAIN))))
                        continue;

                if (root_load_balance &&
                    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
                        continue;

                if (is_sched_load_balance(cp) &&
                    !cpumask_empty(cp->effective_cpus))
                        csa[csn++] = cp;

                /* skip @cp's subtree if not a partition root */
                if (!is_partition_root(cp))
                        pos_css = css_rightmost_descendant(pos_css);
        }
        rcu_read_unlock();

        /* Start with every collected cpuset in a partition of its own */
        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
        ndoms = csn;

restart:
        /* Find the best partition (set of sched domains) */
        for (i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                for (j = 0; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        int bpn = b->pn;

                        if (apn != bpn && cpusets_overlap(a, b)) {
                                /* Merge b's whole partition into a's */
                                for (k = 0; k < csn; k++) {
                                        struct cpuset *c = csa[k];

                                        if (c->pn == bpn)
                                                c->pn = apn;
                                }
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /*
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;

        /*
         * The rest of the code, including the scheduler, can deal with
         * dattr==NULL case. No need to abort if alloc fails.
         */
        dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
                              GFP_KERNEL);

        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                struct cpumask *dp;
                int apn = a->pn;

                if (apn < 0) {
                        /* Skip completed partitions */
                        continue;
                }

                if (nslot == ndoms) {
                        static int warnings = 10;
                        if (warnings) {
                                pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
                                        nslot, ndoms, csn, i, apn);
                                warnings--;
                        }
                        continue;
                }

                /*
                 * Only index doms[] once nslot is known to be in range;
                 * doms[ndoms] lies one element past the allocation.
                 */
                dp = doms[nslot];

                cpumask_clear(dp);
                if (dattr)
                        *(dattr + nslot) = SD_ATTR_INIT;
                for (j = i; j < csn; j++) {
                        struct cpuset *b = csa[j];

                        if (apn == b->pn) {
                                cpumask_or(dp, dp, b->effective_cpus);
                                cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);

                                /* Done with this partition */
                                b->pn = -1;
                        }
                }
                nslot++;
        }
        BUG_ON(nslot != ndoms);

done:
        kfree(csa);

        /*
         * Fallback to the default domain if kmalloc() failed.
         * See comments in partition_sched_domains().
         */
        if (doms == NULL)
                ndoms = 1;

        *domains    = doms;
        *attributes = dattr;
        return ndoms;
}

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        if (cs->nr_deadline_tasks == 0)
                return;

        css_task_iter_start(&cs->css, 0, &it);

        while ((task = css_task_iter_next(&it)))
                dl_add_task_root_domain(task);

        css_task_iter_end(&it);
}

/*
 * Recompute the deadline accounting of the root domains: clear the default
 * root domain, then walk all cpusets below top_cpuset and re-add the tasks
 * of each one that has effective CPUs.
 *
 * Must be called with cpuset_mutex, the CPU hotplug lock and
 * sched_domains_mutex held.
 */
static void dl_rebuild_rd_accounting(void)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *cs = NULL;

        lockdep_assert_held(&cpuset_mutex);
        lockdep_assert_cpus_held();
        lockdep_assert_held(&sched_domains_mutex);

        rcu_read_lock();

        /*
         * Clear default root domain DL accounting, it will be computed again
         * if a task belongs to it.
         */
        dl_clear_root_domain(&def_root_domain);

        cpuset_for_each_descendant_pre(cs, pos, &top_cpuset) {
                if (!cpumask_empty(cs->effective_cpus)) {
                        /*
                         * Hold a reference on the css across the window in
                         * which the RCU read lock is dropped for the task
                         * walk.
                         */
                        css_get(&cs->css);
                        rcu_read_unlock();

                        dl_update_tasks_root_domain(cs);

                        rcu_read_lock();
                        css_put(&cs->css);
                } else {
                        /* No effective CPUs: skip the whole subtree */
                        pos = css_rightmost_descendant(pos);
                }
        }
        rcu_read_unlock();
}

/*
 * Pass the new set of sched domains (@ndoms_new entries in @doms_new with
 * attributes @dattr_new) to partition_sched_domains_locked() and then
 * rebuild the deadline root-domain accounting, both under
 * sched_domains_mutex.
 */
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);
        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        /* Runs under the same mutex hold; it asserts sched_domains_mutex. */
        dl_rebuild_rd_accounting();
        mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
        struct cgroup_subsys_state *pos_css;
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
        struct cpuset *cs;
        int ndoms;

        lockdep_assert_cpus_held();
        lockdep_assert_held(&cpuset_mutex);

        /*
         * If we have raced with CPU hotplug, return early to avoid
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
         *
         * With no CPUs in any subpartitions, top_cpuset's effective CPUs
         * should be the same as the active CPUs, so checking only top_cpuset
         * is enough to detect racing CPU offlines.
         */
        if (!top_cpuset.nr_subparts_cpus &&
            !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                return;

        /*
         * With subpartition CPUs, however, the effective CPUs of a partition
         * root should be only a subset of the active CPUs.  Since a CPU in any
         * partition root could be offlined, all must be checked.
         */
        if (top_cpuset.nr_subparts_cpus) {
                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (!is_partition_root(cs)) {
                                pos_css = css_rightmost_descendant(pos_css);
                                continue;
                        }
                        if (!cpumask_subset(cs->effective_cpus,
                                            cpu_active_mask)) {
                                rcu_read_unlock();
                                return;
                        }
                }
                rcu_read_unlock();
        }

        /* Generate domain masks and attrs */
        ndoms = generate_sched_domains(&doms, &attr);

        /* Have scheduler rebuild the domains */
        partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
/* Stub used for !CONFIG_SMP builds: intentionally does nothing. */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

/*
 * rebuild_sched_domains - rebuild the scheduler domains
 *
 * Wrapper around rebuild_sched_domains_locked() that takes
 * get_online_cpus() and then cpuset_mutex around the call, releasing
 * them in the reverse order afterwards.
 */
void rebuild_sched_domains(void)
{
        get_online_cpus();
        mutex_lock(&cpuset_mutex);
        rebuild_sched_domains_locked();
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;
        bool top_cs = cs == &top_cpuset;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                /*
                 * Percpu kthreads in top_cpuset are ignored
                 */
                if (top_cs && (task->flags & PF_KTHREAD) &&
                    kthread_is_per_cpu(task))
                        continue;
                set_cpus_allowed_ptr(task, cs->effective_cpus);
        }
        css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: output mask receiving the new effective_cpus value
 * @cs: the cpuset whose new effective_cpus mask is to be computed
 * @parent: the parent cpuset
 *
 * Without subpartition CPUs in the parent, the result is simply
 * cs->cpus_allowed restricted to the parent's effective_cpus.
 *
 * If the parent has subpartition CPUs, they are included in the list of
 * allowable CPUs. Since offlined CPUs are not removed from subparts_cpus,
 * cpu_active_mask has to be used to mask those out.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
                                      struct cpuset *cs, struct cpuset *parent)
{
        if (!parent->nr_subparts_cpus) {
                cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
                return;
        }

        /* (parent effective | parent subparts) & cpus_allowed & active */
        cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus);
        cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
        cpumask_and(new_cpus, new_cpus, cpu_active_mask);
}

/*
 * Commands accepted by update_parent_subparts_cpumask()
 */
enum subparts_cmd {
        /* Enable partition root */
        partcmd_enable,
        /* Disable partition root */
        partcmd_disable,
        /* Update parent's subparts_cpus */
        partcmd_update,
};

/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root. The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus. The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus. 0 should always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu
 * list is to be changed from cpus_allowed to newmask. Otherwise,
 * cpus_allowed is assumed to remain the same. The cpuset should either
 * be a partition root or an invalid partition root. The partition root
 * state may change if newmask is NULL and none of the requested CPUs can
 * be granted by the parent. The function will return 1 if changes to
 * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
 * Error code should only be returned when newmask is non-NULL.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate(). The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
 *
 * The checking is more strict when enabling partition root than the
 * other two commands.
 *
 * Because of the implicit cpu exclusive nature of a partition root,
 * cpumask changes that violate the cpu exclusivity rule will not be
 * permitted when checked by validate_change(). The validate_change()
 * function will also prevent any changes to the cpu list if it is not
 * a superset of children's cpu lists.
 *
 * Must be called with cpuset_mutex held; the parent's masks are modified
 * under callback_lock.
 */
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                                          struct cpumask *newmask,
                                          struct tmpmasks *tmp)
{
        struct cpuset *parent = parent_cs(cpuset);
        int adding;        /* Moving cpus from effective_cpus to subparts_cpus */
        int deleting;        /* Moving cpus from subparts_cpus to effective_cpus */
        int new_prs;
        bool part_error = false;        /* Partition error? */

        lockdep_assert_held(&cpuset_mutex);

        /*
         * The parent must be a partition root.
         * The new cpumask, if present, or the current cpus_allowed must
         * not be empty.
         */
        if (!is_partition_root(parent) ||
           (newmask && cpumask_empty(newmask)) ||
           (!newmask && cpumask_empty(cpuset->cpus_allowed)))
                return -EINVAL;

        /*
         * Enabling/disabling partition root is not allowed if there are
         * online children.
         */
        if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
                return -EBUSY;

        /*
         * Enabling partition root is only allowed if all the CPUs can be
         * granted from parent's effective_cpus and at least one CPU is
         * left in parent's effective_cpus after that.
         */
        if ((cmd == partcmd_enable) &&
           (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
             cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
                return -EINVAL;

        /*
         * Compute the CPUs to add to (addmask) and delete from (delmask)
         * the parent's subparts_cpus for the given command.
         * A cpumask update cannot make parent's effective_cpus become empty.
         */
        adding = deleting = false;
        new_prs = cpuset->partition_root_state;
        if (cmd == partcmd_enable) {
                cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
                adding = true;
        } else if (cmd == partcmd_disable) {
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        } else if (newmask) {
                /*
                 * partcmd_update with newmask:
                 *
                 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
                 * addmask = newmask & parent->effective_cpus
                 *                     & ~parent->subparts_cpus
                 */
                cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
                deleting = cpumask_and(tmp->delmask, tmp->delmask,
                                       parent->subparts_cpus);

                cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
                adding = cpumask_andnot(tmp->addmask, tmp->addmask,
                                        parent->subparts_cpus);
                /*
                 * Return error if the new effective_cpus could become empty.
                 */
                if (adding &&
                    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
                        if (!deleting)
                                return -EINVAL;
                        /*
                         * As some of the CPUs in subparts_cpus might have
                         * been offlined, we need to compute the real delmask
                         * to confirm that.
                         */
                        if (!cpumask_and(tmp->addmask, tmp->delmask,
                                         cpu_active_mask))
                                return -EINVAL;
                        /* addmask was used as scratch above; restore it */
                        cpumask_copy(tmp->addmask, parent->effective_cpus);
                }
        } else {
                /*
                 * partcmd_update w/o newmask:
                 *
                 * addmask = cpus_allowed & parent->effective_cpus
                 *
                 * Note that parent's subparts_cpus may have been
                 * pre-shrunk in case there is a change in the cpu list.
                 * So no deletion is needed.
                 */
                adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
                                     parent->effective_cpus);
                /* Taking every effective CPU of the parent is an error */
                part_error = cpumask_equal(tmp->addmask,
                                           parent->effective_cpus);
        }

        if (cmd == partcmd_update) {
                int prev_prs = cpuset->partition_root_state;

                /*
                 * Check for possible transition between PRS_ENABLED
                 * and PRS_ERROR.
                 */
                switch (cpuset->partition_root_state) {
                case PRS_ENABLED:
                        if (part_error)
                                new_prs = PRS_ERROR;
                        break;
                case PRS_ERROR:
                        if (!part_error)
                                new_prs = PRS_ENABLED;
                        break;
                }
                /*
                 * Set part_error if previously in invalid state.
                 */
                part_error = (prev_prs == PRS_ERROR);
        }

        /*
         * From here on part_error may have been overwritten by the
         * partcmd_update block above to mean "was already PRS_ERROR".
         */
        if (!part_error && (new_prs == PRS_ERROR))
                return 0;        /* Nothing need to be done */

        if (new_prs == PRS_ERROR) {
                /*
                 * Remove all its cpus from parent's subparts_cpus.
                 */
                adding = false;
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        }

        /* No mask change and no state change: nothing to commit */
        if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
                return 0;

        /*
         * Change the parent's subparts_cpus.
         * Newly added CPUs will be removed from effective_cpus and
         * newly deleted ones will be added back to effective_cpus.
         */
        spin_lock_irq(&callback_lock);
        if (adding) {
                cpumask_or(parent->subparts_cpus,
                           parent->subparts_cpus, tmp->addmask);
                cpumask_andnot(parent->effective_cpus,
                               parent->effective_cpus, tmp->addmask);
        }
        if (deleting) {
                cpumask_andnot(parent->subparts_cpus,
                               parent->subparts_cpus, tmp->delmask);
                /*
                 * Some of the CPUs in subparts_cpus might have been offlined.
                 */
                cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
                cpumask_or(parent->effective_cpus,
                           parent->effective_cpus, tmp->delmask);
        }

        parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

        if (cpuset->partition_root_state != new_prs)
                cpuset->partition_root_state = new_prs;
        spin_unlock_irq(&callback_lock);

        /* 1 for partcmd_update (masks/state changed), 0 for enable/disable */
        return cmd == partcmd_update;
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
        int new_prs;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);

                compute_effective_cpumask(tmp->new_cpus, cp, parent);

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
                if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
                        cpumask_copy(tmp->new_cpus, parent->effective_cpus);
                        if (!cp->use_parent_ecpus) {
                                cp->use_parent_ecpus = true;
                                parent->child_ecpus_count++;
                        }
                } else if (cp->use_parent_ecpus) {
                        cp->use_parent_ecpus = false;
                        WARN_ON_ONCE(!parent->child_ecpus_count);
                        parent->child_ecpus_count--;
                }

                /*
                 * Skip the whole subtree if the cpumask remains the same
                 * and has no partition root state.
                 */
                if (!cp->partition_root_state &&
                    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * update_parent_subparts_cpumask() should have been called
                 * for cs already in update_cpumask(). We should also call
                 * update_tasks_cpumask() again for tasks in the parent
                 * cpuset if the parent's subparts_cpus changes.
                 */
                new_prs = cp->partition_root_state;
                if ((cp != cs) && new_prs) {
                        switch (parent->partition_root_state) {
                        case PRS_DISABLED:
                                /*
                                 * If parent is not a partition root or an
                                 * invalid partition root, clear its state
                                 * and its CS_CPU_EXCLUSIVE flag.
                                 */
                                WARN_ON_ONCE(cp->partition_root_state
                                             != PRS_ERROR);
                                new_prs = PRS_DISABLED;

                                /*
                                 * clear_bit() is an atomic operation and
                                 * readers aren't interested in the state
                                 * of CS_CPU_EXCLUSIVE anyway. So we can
                                 * just update the flag without holding
                                 * the callback_lock.
                                 */
                                clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
                                break;

                        case PRS_ENABLED:
                                if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
                                        update_tasks_cpumask(parent);
                                break;

                        case PRS_ERROR:
                                /*
                                 * When parent is invalid, it has to be too.
                                 */
                                new_prs = PRS_ERROR;
                                break;
                        }
                }

                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);

                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
                        cp->nr_subparts_cpus = 0;
                        cpumask_clear(cp->subparts_cpus);
                } else if (cp->nr_subparts_cpus) {
                        /*
                         * Make sure that effective_cpus & subparts_cpus
                         * are mutually exclusive.
                         *
                         * In the unlikely event that effective_cpus
                         * becomes empty. we clear cp->nr_subparts_cpus and
                         * let its child partition roots to compete for
                         * CPUs again.
                         */
                        cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
                                       cp->subparts_cpus);
                        if (cpumask_empty(cp->effective_cpus)) {
                                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                                cpumask_clear(cp->subparts_cpus);
                                cp->nr_subparts_cpus = 0;
                        } else if (!cpumask_subset(cp->subparts_cpus,
                                                   tmp->new_cpus)) {
                                cpumask_andnot(cp->subparts_cpus,
                                        cp->subparts_cpus, tmp->new_cpus);
                                cp->nr_subparts_cpus
                                        = cpumask_weight(cp->subparts_cpus);
                        }
                }

                if (new_prs != cp->partition_root_state)
                        cp->partition_root_state = new_prs;

                spin_unlock_irq(&callback_lock);

                WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

                update_tasks_cpumask(cp);

                /*
                 * On legacy hierarchy, if the effective cpumask of any non-
                 * empty cpuset is changed, we need to rebuild sched domains.
                 * On default hierarchy, the cpuset needs to be a partition
                 * root as well.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    is_sched_load_balance(cp) &&
                   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    is_partition_root(cp)))
                        need_rebuild_sched_domains = true;

                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();

        if (need_rebuild_sched_domains)
                rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 *
 * Walk the children of @parent and re-run update_cpumasks_hier() on each
 * one, other than @cs, that has its use_parent_ecpus flag set, so that it
 * picks up the right effective_cpus value.
 *
 * Must be called with cpuset_mutex held.
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *sibling;

        lockdep_assert_held(&cpuset_mutex);

        rcu_read_lock();
        cpuset_for_each_child(sibling, pos_css, parent) {
                /*
                 * Only siblings that use the parent's effective_cpus are
                 * of interest, and only if a reference can be taken.
                 */
                if (sibling == cs || !sibling->use_parent_ecpus ||
                    !css_tryget_online(&sibling->css))
                        continue;

                /*
                 * update_cpumasks_hier() may sleep, so the RCU read lock
                 * is released around the call.
                 */
                rcu_read_unlock();
                update_cpumasks_hier(sibling, tmp);
                rcu_read_lock();

                css_put(&sibling->css);
        }
        rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
{
        int retval;
        struct tmpmasks tmp;

        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * An empty cpus_allowed is ok only if the cpuset has no tasks.
         * Since cpulist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
        } else {
                retval = cpulist_parse(buf, trialcs->cpus_allowed);
                if (retval < 0)
                        return retval;

                if (!cpumask_subset(trialcs->cpus_allowed,
                                    top_cpuset.cpus_allowed))
                        return -EINVAL;
        }

        /* Nothing to do if the cpus didn't change */
        if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
                return 0;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;

#ifdef CONFIG_CPUMASK_OFFSTACK
        /*
         * Use the cpumasks in trialcs for tmpmasks when they are pointers
         * to allocated cpumasks.
         */
        tmp.addmask  = trialcs->subparts_cpus;
        tmp.delmask  = trialcs->effective_cpus;
        tmp.new_cpus = trialcs->cpus_allowed;
#endif

        if (cs->partition_root_state) {
                /* Cpumask of a partition root cannot be empty */
                if (cpumask_empty(trialcs->cpus_allowed))
                        return -EINVAL;
                if (update_parent_subparts_cpumask(cs, partcmd_update,
                                        trialcs->cpus_allowed, &tmp) < 0)
                        return -EINVAL;
        }

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

        /*
         * Make sure that subparts_cpus is a subset of cpus_allowed.
         */
        if (cs->nr_subparts_cpus) {
                cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
                cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
        }
        spin_unlock_irq(&callback_lock);

        update_cpumasks_hier(cs, &tmp);

        if (cs->partition_root_state) {
                struct cpuset *parent = parent_cs(cs);

                /*
                 * For partition root, update the cpumasks of sibling
                 * cpusets if they use parent's effective_cpus.
                 */
                if (parent->child_ecpus_count)
                        update_sibling_cpumasks(parent, cs, &tmp);
        }
        return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

/* One queued request to migrate the pages of @mm from nodes @from to nodes @to. */
struct cpuset_migrate_mm_work {
        /* work item; the work function recovers this struct via container_of() */
        struct work_struct        work;
        /* mm to migrate; the work function calls mmput() on it when done */
        struct mm_struct        *mm;
        /* source nodes (copied by value when the request is queued) */
        nodemask_t                from;
        /* destination nodes (copied by value when the request is queued) */
        nodemask_t                to;
};

/*
 * Work function for a queued cpuset_migrate_mm_work: migrate the pages of
 * the request's mm, then release the mm reference and free the request.
 */
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
        struct cpuset_migrate_mm_work *req;

        req = container_of(work, struct cpuset_migrate_mm_work, work);

        /* on a wq worker, no need to worry about %current's mems_allowed */
        do_migrate_pages(req->mm, &req->from, &req->to, MPOL_MF_MOVE_ALL);

        mmput(req->mm);
        kfree(req);
}

/*
 * Queue an asynchronous migration of @mm's pages from @from to @to on
 * cpuset_migrate_mm_wq.  The mm reference passed in is consumed: it is
 * released by the work function, or right here if the request cannot be
 * allocated.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
{
        struct cpuset_migrate_mm_work *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req) {
                /* no memory for the request: skip migration, drop the mm */
                mmput(mm);
                return;
        }

        req->mm = mm;
        req->from = *from;
        req->to = *to;
        INIT_WORK(&req->work, cpuset_migrate_mm_workfn);
        queue_work(cpuset_migrate_mm_wq, &req->work);
}

/* Wait for all mm migrations queued on cpuset_migrate_mm_wq to finish. */
static void cpuset_post_attach(void)
{
        flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 *
 * The whole update runs under task_lock(@tsk) with local interrupts
 * disabled around the seqcount write section.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
{
        task_lock(tsk);

        local_irq_disable();
        write_seqcount_begin(&tsk->mems_allowed_seq);

        /* First widen mems_allowed to the union of the old and new nodes... */
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        /* ...then rebind the task's mempolicy to the new nodes... */
        mpol_rebind_task(tsk, newmems);
        /* ...and finally narrow mems_allowed to exactly the new nodes. */
        tsk->mems_allowed = *newmems;

        write_seqcount_end(&tsk->mems_allowed_seq);
        local_irq_enable();

        task_unlock(tsk);
}

/*
 * The cpuset currently being processed by update_tasks_nodemask(), or NULL
 * when no rebind is in progress.  current_cpuset_is_being_rebound() compares
 * it against the current task's cpuset.
 */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
        static nodemask_t newmems;        /* protected by cpuset_mutex */
        struct css_task_iter it;
        struct task_struct *task;

        cpuset_being_rebound = cs;                /* causes mpol_dup() rebind */

        guarantee_online_mems(cs, &newmems);

        /*
         * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
         * take while holding tasklist_lock.  Forks can happen - the
         * mpol_dup() cpuset_being_rebound check will catch such forks,
         * and rebind their vma mempolicies too.  Because we still hold
         * the global cpuset_mutex, we know that no other rebind effort
         * will be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;

                cpuset_change_task_nodemask(task, &newmems);

                mm = get_task_mm(task);
                if (!mm)
                        continue;

                migrate = is_memory_migrate(cs);

                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
                else
                        mmput(mm);
        }
        css_task_iter_end(&it);

        /*
         * All the tasks' nodemasks have been updated, update
         * cs->old_mems_allowed.
         */
        cs->old_mems_allowed = newmems;

        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
        cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);

                /* Effective mems: configured mems limited by the parent's. */
                nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
                if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;

                /* Skip the whole subtree if the nodemask remains the same. */
                if (nodes_equal(*new_mems, cp->effective_mems)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Take a reference on cp before dropping the RCU read
                 * lock; it is released by css_put() below once the RCU
                 * read lock has been re-taken.
                 */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);

                WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));

                update_tasks_nodemask(cp);

                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
}

/*
 * Handle a user request to change the 'mems' memory placement of a
 * cpuset: validate the request, install the new mems_allowed and let
 * update_nodemasks_hier() propagate it, which updates each task's
 * mems_allowed, rebinds task and vma mempolicies and, if the cpuset is
 * marked 'memory_migrate', migrates the tasks' pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 *
 * Returns 0 when nothing changed, -EACCES for the top cpuset, -EINVAL
 * for nodes outside top_cpuset.mems_allowed, a negative error from
 * nodelist_parse() or validate_change(), and otherwise the (non-negative)
 * result of validate_change().
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
{
        int retval;

        /* top_cpuset.mems_allowed is read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * nodelist_parse() fails on an empty mask, so an empty string is
         * special-cased.  An empty mems_allowed is ok iff there are no
         * tasks in the cpuset; validate_change() ensures that cpusets
         * with tasks have memory.
         */
        if (*buf) {
                retval = nodelist_parse(buf, trialcs->mems_allowed);
                if (retval < 0)
                        return retval;

                if (!nodes_subset(trialcs->mems_allowed,
                                  top_cpuset.mems_allowed))
                        return -EINVAL;
        } else {
                nodes_clear(trialcs->mems_allowed);
        }

        /* Too easy - nothing to do */
        if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
                return 0;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;

        check_insane_mems_config(&trialcs->mems_allowed);

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = trialcs->mems_allowed;
        spin_unlock_irq(&callback_lock);

        /* use trialcs->mems_allowed as a temp variable */
        update_nodemasks_hier(cs, &trialcs->mems_allowed);

        return retval;
}

/*
 * Return true if the cpuset of the current task is the one recorded in
 * cpuset_being_rebound.  The lookup is done under the RCU read lock.
 */
bool current_cpuset_is_being_rebound(void)
{
        bool rebinding;

        rcu_read_lock();
        rebinding = (task_cs(current) == cpuset_being_rebound);
        rcu_read_unlock();

        return rebinding;
}

/*
 * Set the relax_domain_level of @cs to @val.  On SMP builds, values
 * outside [-1, sched_domain_level_max + 1] are rejected with -EINVAL.
 * When the level actually changes on a load-balanced cpuset with a
 * non-empty cpus_allowed, the sched domains are rebuilt.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
        if (val < -1 || val > sched_domain_level_max + 1)
                return -EINVAL;
#endif

        /* unchanged level: nothing to do */
        if (val == cs->relax_domain_level)
                return 0;

        cs->relax_domain_level = val;
        if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
                rebuild_sched_domains_locked();

        return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flag(cs, task);
        css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:                the bit to update (see cpuset_flagbits_t)
 * cs:                the cpuset to update
 * turning_on:         whether the flag is being set or cleared
 *
 * The change is first applied to a trial copy of @cs and checked with
 * validate_change(); only then are the flags copied into @cs.
 *
 * Returns -ENOMEM if the trial cpuset cannot be allocated, otherwise the
 * result of validate_change().
 *
 * Call with cpuset_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
{
        struct cpuset *trialcs = alloc_trial_cpuset(cs);
        int balance_changed;
        int spread_changed;
        int err;

        if (!trialcs)
                return -ENOMEM;

        if (!turning_on)
                clear_bit(bit, &trialcs->flags);
        else
                set_bit(bit, &trialcs->flags);

        err = validate_change(cs, trialcs);
        if (err < 0)
                goto out;

        /* Work out what changed before cs->flags is overwritten below. */
        balance_changed = is_sched_load_balance(cs) !=
                          is_sched_load_balance(trialcs);

        spread_changed = (is_spread_slab(cs) != is_spread_slab(trialcs)) ||
                         (is_spread_page(cs) != is_spread_page(trialcs));

        spin_lock_irq(&callback_lock);
        cs->flags = trialcs->flags;
        spin_unlock_irq(&callback_lock);

        if (!cpumask_empty(trialcs->cpus_allowed) && balance_changed)
                rebuild_sched_domains_locked();

        if (spread_changed)
                update_tasks_flags(cs);
out:
        free_cpuset(trialcs);
        return err;
}

/*
 * update_prstate - update partition_root_state
 * cs: the cpuset to update
 * new_prs: new partition root state
 *
 * Returns 0 on success (including when the state is unchanged), -EINVAL
 * when the transition is not allowed, -ENOMEM when the temporary cpumasks
 * cannot be allocated, or the error returned by update_flag() /
 * update_parent_subparts_cpumask().  cs->partition_root_state is only
 * changed when 0 is returned.
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
        int err, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
        struct tmpmasks tmpmask;

        if (old_prs == new_prs)
                return 0;

        /*
         * Cannot force a partial or invalid partition root to a full
         * partition root.
         */
        if (new_prs && (old_prs == PRS_ERROR))
                return -EINVAL;

        if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;

        err = -EINVAL;
        if (!old_prs) {
                /*
                 * Turning on partition root requires setting the
                 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
                 * cannot be empty.
                 */
                if (cpumask_empty(cs->cpus_allowed))
                        goto out;

                err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
                if (err)
                        goto out;

                err = update_parent_subparts_cpumask(cs, partcmd_enable,
                                                     NULL, &tmpmask);
                if (err) {
                        /* Roll back the CS_CPU_EXCLUSIVE bit set above. */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        goto out;
                }
        } else {
                /*
                 * Turning off partition root will clear the
                 * CS_CPU_EXCLUSIVE bit.
                 */
                if (old_prs == PRS_ERROR) {
                        /*
                         * An invalid partition root only needs its flag
                         * cleared; the parent and sibling updates below
                         * are skipped.
                         */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        err = 0;
                        goto out;
                }

                err = update_parent_subparts_cpumask(cs, partcmd_disable,
                                                     NULL, &tmpmask);
                if (err)
                        goto out;

                /* Turning off CS_CPU_EXCLUSIVE will not return error */
                update_flag(CS_CPU_EXCLUSIVE, cs, 0);
        }

        update_tasks_cpumask(parent);

        if (parent->child_ecpus_count)
                update_sibling_cpumasks(parent, cs, &tmpmask);

        rebuild_sched_domains_locked();
out:
        /* Commit the new state only if every step above succeeded. */
        if (!err) {
                spin_lock_irq(&callback_lock);
                cs->partition_root_state = new_prs;
                spin_unlock_irq(&callback_lock);
        }

        free_cpumasks(NULL, &tmpmask);
        return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

/* Tuning constants of the frequency meter; values are scaled by FM_SCALE. */
#define FM_COEF 933                /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000        /* limit cnt to avoid overflow */
#define FM_SCALE 1000                /* faux fixed point scale */

/* Initialize a frequency meter: set up its lock and zero its state */
static void fmeter_init(struct fmeter *fmp)
{
        spin_lock_init(&fmp->lock);

        fmp->time = 0;
        fmp->val = 0;
        fmp->cnt = 0;
}

/*
 * Internal meter update - process cnt events and update value.
 *
 * Does nothing if no whole second has elapsed since the last update.
 * Otherwise the value is decayed once per elapsed second (at most
 * FM_MAXTICKS times) and the pending event count is folded in.
 */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now = ktime_get_seconds();
        u32 ticks = now - fmp->time;

        if (!ticks)
                return;

        /* one decay step per elapsed tick, capped at FM_MAXTICKS */
        for (ticks = min(FM_MAXTICKS, ticks); ticks > 0; ticks--)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        /* fold the events marked since the last update into the value */
        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/*
 * Process any previous ticks, then bump cnt by one (times scale).
 * The count is capped at FM_MAXCNT.  Runs under fmp->lock.
 */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        /* one event is worth FM_SCALE in the scaled count */
        fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
        spin_unlock(&fmp->lock);
}

/*
 * Process any previous ticks, then return current value.
 * The meter is updated and sampled under fmp->lock.
 */
static int fmeter_getrate(struct fmeter *fmp)
{
        int rate;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        rate = fmp->val;
        spin_unlock(&fmp->lock);

        return rate;
}

/*
 * Cpuset of the first task in the taskset being attached; recorded by
 * cpuset_can_attach() and read later by cpuset_attach().
 */
static struct cpuset *cpuset_attach_old_cs;

/* Clear the pending deadline-task migration accounting of @cs. */
static void reset_migrate_dl_data(struct cpuset *cs)
{
        cs->sum_migrate_dl_bw = 0;
        cs->nr_migrate_dl_tasks = 0;
}

/*
 * cpuset_can_attach - called by cgroups to determine if a cpuset is usable
 * @tset: the set of tasks about to be attached
 *
 * Checks every task in @tset and accounts the deadline tasks among them
 * in the destination cpuset (nr_migrate_dl_tasks / sum_migrate_dl_bw).
 * If there are such tasks and the old and new effective cpumasks do not
 * intersect, deadline bandwidth is reserved with dl_bw_alloc().
 *
 * Takes cpuset_mutex internally.
 *
 * Returns 0 on success, in which case attach_in_progress of the
 * destination cpuset has been incremented.  Returns -ENOSPC when the
 * destination has no cpus or mems outside v2 mode, -EINVAL when no
 * active CPU is available for the bandwidth reservation, or the error
 * from task_can_attach(), security_task_setscheduler() or dl_bw_alloc().
 *
 * On any failure after deadline accounting may have started, the
 * accounting is reset so that it cannot leak into a later attach.
 * NOTE(review): this relies on the cgroup core not calling
 * ->cancel_attach() for the subsystem whose ->can_attach() failed -
 * confirm against cgroup_migrate_execute().
 */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct cpuset *cs, *oldcs;
        struct task_struct *task;
        int ret;

        /* used later by cpuset_attach() */
        cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
        oldcs = cpuset_attach_old_cs;
        cs = css_cs(css);

        mutex_lock(&cpuset_mutex);

        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
        if (!is_in_v2_mode() &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;

        cgroup_taskset_for_each(task, css, tset) {
                /*
                 * Earlier iterations may already have counted deadline
                 * tasks, so failures here must undo that accounting.
                 */
                ret = task_can_attach(task);
                if (ret)
                        goto out_reset_dl;
                ret = security_task_setscheduler(task);
                if (ret)
                        goto out_reset_dl;

                if (dl_task(task)) {
                        cs->nr_migrate_dl_tasks++;
                        cs->sum_migrate_dl_bw += task->dl.dl_bw;
                }
        }

        if (!cs->nr_migrate_dl_tasks)
                goto out_success;

        /*
         * Bandwidth is only reserved when the tasks move to a disjoint
         * set of CPUs.
         */
        if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
                int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);

                if (unlikely(cpu >= nr_cpu_ids)) {
                        ret = -EINVAL;
                        goto out_reset_dl;
                }

                ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
                if (ret)
                        goto out_reset_dl;
        }

out_success:
        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        cs->attach_in_progress++;
        ret = 0;
        goto out_unlock;

out_reset_dl:
        reset_migrate_dl_data(cs);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        return ret;
}

/*
 * cpuset_cancel_attach - undo the effects of a successful
 * cpuset_can_attach() on the destination cpuset of @tset.
 *
 * Drops attach_in_progress (waking cpuset_attach_wq waiters when it
 * reaches zero) and releases the pending deadline-task accounting.
 *
 * NOTE(review): cpuset_can_attach() only calls dl_bw_alloc() when the old
 * and new effective cpumasks do not intersect, whereas dl_bw_free() is
 * called here whenever deadline tasks were counted - confirm that this
 * asymmetry is intended.
 */
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct cpuset *cs;

        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

        mutex_lock(&cpuset_mutex);

        if (--cs->attach_in_progress == 0)
                wake_up(&cpuset_attach_wq);

        if (cs->nr_migrate_dl_tasks) {
                dl_bw_free(cpumask_any(cs->effective_cpus),
                           cs->sum_migrate_dl_bw);
                reset_migrate_dl_data(cs);
        }

        mutex_unlock(&cpuset_mutex);
}

/*
 * Scratch cpumask for cpuset_attach(), protected by cpuset_mutex.  It is
 * used only by cpuset_attach(), but it can't be allocated dynamically
 * there, so it is defined globally and allocated from cpuset_init().
 */
static cpumask_var_t cpus_attach;

/*
 * cpuset_attach - apply the destination cpuset's CPU and memory placement
 * @tset: the taskset being migrated
 *
 * For every task in @tset: set its allowed CPUs, update its nodemask and
 * refresh its spread flags.  For every threadgroup leader: rebind the mm's
 * memory policy and, if memory_migrate is set on the destination, migrate
 * the mm's pages.  Finally commit the deadline-task accounting prepared by
 * cpuset_can_attach() and drop attach_in_progress.
 *
 * Asserts that the CPU hotplug read lock is held and takes cpuset_mutex.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
        struct task_struct *task;
        struct task_struct *leader;
        struct cgroup_subsys_state *css;
        struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;

        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

        lockdep_assert_cpus_held();        /* see cgroup_attach_lock() */
        mutex_lock(&cpuset_mutex);

        /* prepare for attach */
        if (cs == &top_cpuset)
                cpumask_copy(cpus_attach, cpu_possible_mask);
        else
                guarantee_online_cpus(cs, cpus_attach);

        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

        cgroup_taskset_for_each(task, css, tset) {
                /*
                 * can_attach beforehand should guarantee that this doesn't
                 * fail.  TODO: have a better way to handle failure here
                 */
                WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

                cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
                cpuset_update_task_spread_flag(cs, task);
        }

        /*
         * Change mm for all threadgroup leaders. This is expensive and may
         * sleep and should be moved outside migration path proper.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
        cgroup_taskset_for_each_leader(leader, css, tset) {
                struct mm_struct *mm = get_task_mm(leader);

                if (mm) {
                        mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

                        /*
                         * old_mems_allowed is the same with mems_allowed
                         * here, except if this task is being moved
                         * automatically due to hotplug.  In that case
                         * @mems_allowed has been updated and is empty, so
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         *
                         * NOTE(review): the migrate branch does not call
                         * mmput() here; presumably cpuset_migrate_mm()
                         * takes over the mm reference - confirm.
                         */
                        if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
                        else
                                mmput(mm);
                }
        }

        cs->old_mems_allowed = cpuset_attach_nodemask_to;

        /* Move the deadline-task count from the old cpuset to the new one. */
        if (cs->nr_migrate_dl_tasks) {
                cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
                oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
                reset_migrate_dl_data(cs);
        }

        /* Attach finished: let waiters on cpuset_attach_wq proceed. */
        cs->attach_in_progress--;
        if (!cs->attach_in_progress)
                wake_up(&cpuset_attach_wq);

        mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

/*
 * Write handler for the u64 control files; cft->private selects the file.
 *
 * Returns 0 on success, -ENODEV if the cpuset is not online, -EINVAL for
 * a file type not served here, or the result of update_flag().
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        cpuset_filetype_t which = cft->private;
        struct cpuset *cs = css_cs(css);
        int err = -ENODEV;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs)) {
                err = 0;

                switch (which) {
                case FILE_CPU_EXCLUSIVE:
                        err = update_flag(CS_CPU_EXCLUSIVE, cs, val);
                        break;
                case FILE_MEM_EXCLUSIVE:
                        err = update_flag(CS_MEM_EXCLUSIVE, cs, val);
                        break;
                case FILE_MEM_HARDWALL:
                        err = update_flag(CS_MEM_HARDWALL, cs, val);
                        break;
                case FILE_SCHED_LOAD_BALANCE:
                        err = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
                        break;
                case FILE_MEMORY_MIGRATE:
                        err = update_flag(CS_MEMORY_MIGRATE, cs, val);
                        break;
                case FILE_MEMORY_PRESSURE_ENABLED:
                        /* global switch, not a per-cpuset flag */
                        cpuset_memory_pressure_enabled = !!val;
                        break;
                case FILE_SPREAD_PAGE:
                        err = update_flag(CS_SPREAD_PAGE, cs, val);
                        break;
                case FILE_SPREAD_SLAB:
                        err = update_flag(CS_SPREAD_SLAB, cs, val);
                        break;
                default:
                        err = -EINVAL;
                        break;
                }
        }

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return err;
}

/*
 * Write handler for the s64 control files; only the relax-domain-level
 * file is served.
 *
 * Returns -ENODEV if the cpuset is not online, -EINVAL for any other file
 * type, otherwise the result of update_relax_domain_level().
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        cpuset_filetype_t which = cft->private;
        struct cpuset *cs = css_cs(css);
        int err = -ENODEV;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs)) {
                if (which == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                        err = update_relax_domain_level(cs, val);
                else
                        err = -EINVAL;
        }

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return err;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        struct cpuset *trial;
        int err = -ENODEV;

        buf = strstrip(buf);

        /*
         * Hotunplug of CPUs or memory can strip @cs of all execution
         * resources; the hotplug code then asynchronously reconfigures
         * it and moves its tasks to the nearest ancestor that can run
         * them.
         *
         * A write to "cpus" or "mems" may give @cs its resources back,
         * so let any already-scheduled hotplug work finish first.
         * Otherwise tasks added after the capability is restored could
         * keep being removed.
         *
         * cpuset_hotplug_work re-enters cgroup core through
         * cgroup_transfer_tasks(), and waiting for it from a cgroupfs
         * operation such as this one can deadlock on kernfs active_ref
         * protection.  Break the protection; that is fine because
         * @cs's online state is rechecked under cpuset_mutex below.
         * This only happens on the legacy hierarchies.
         */
        css_get(&cs->css);
        kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs)) {
                trial = alloc_trial_cpuset(cs);
                if (trial) {
                        switch (of_cft(of)->private) {
                        case FILE_CPULIST:
                                err = update_cpumask(cs, trial, buf);
                                break;
                        case FILE_MEMLIST:
                                err = update_nodemask(cs, trial, buf);
                                break;
                        default:
                                err = -EINVAL;
                                break;
                        }

                        free_cpuset(trial);
                } else {
                        err = -ENOMEM;
                }
        }

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
        flush_workqueue(cpuset_migrate_mm_wq);

        /* on success report the whole write as consumed */
        return err ? err : nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
        struct cpuset *cs = css_cs(seq_css(sf));
        cpuset_filetype_t type = seq_cft(sf)->private;
        int ret = 0;

        spin_lock_irq(&callback_lock);

        switch (type) {
        case FILE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
                break;
        case FILE_EFFECTIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
                break;
        case FILE_EFFECTIVE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                break;
        case FILE_SUBPARTS_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
                break;
        default:
                ret = -EINVAL;
        }

        spin_unlock_irq(&callback_lock);
        return ret;
}

/*
 * Read handler for the u64 control files; cft->private selects which
 * flag or value is reported.  An unknown file type is a BUG().
 */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        cpuset_filetype_t which = cft->private;
        struct cpuset *cs = css_cs(css);
        u64 val = 0;

        switch (which) {
        case FILE_CPU_EXCLUSIVE:
                val = is_cpu_exclusive(cs);
                break;
        case FILE_MEM_EXCLUSIVE:
                val = is_mem_exclusive(cs);
                break;
        case FILE_MEM_HARDWALL:
                val = is_mem_hardwall(cs);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                val = is_sched_load_balance(cs);
                break;
        case FILE_MEMORY_MIGRATE:
                val = is_memory_migrate(cs);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                val = cpuset_memory_pressure_enabled;
                break;
        case FILE_MEMORY_PRESSURE:
                val = fmeter_getrate(&cs->fmeter);
                break;
        case FILE_SPREAD_PAGE:
                val = is_spread_page(cs);
                break;
        case FILE_SPREAD_SLAB:
                val = is_spread_slab(cs);
                break;
        default:
                BUG();
        }

        return val;
}

/*
 * Read handler for the s64 control files; only the relax-domain-level
 * file is served.  Any other file type is a BUG().
 */
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        cpuset_filetype_t which = cft->private;
        struct cpuset *cs = css_cs(css);

        if (which == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                return cs->relax_domain_level;

        BUG();

        /* Unreachable but makes gcc happy */
        return 0;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
        struct cpuset *cs = css_cs(seq_css(seq));

        switch (cs->partition_root_state) {
        case PRS_ENABLED:
                seq_puts(seq, "root\n");
                break;
        case PRS_DISABLED:
                seq_puts(seq, "member\n");
                break;
        case PRS_ERROR:
                seq_puts(seq, "root invalid\n");
                break;
        }
        return 0;
}

/*
 * Write handler for the partition file.  Accepts "root" or "member"
 * (surrounding whitespace is stripped); anything else is -EINVAL.
 *
 * Returns @nbytes on success, -ENODEV if the cpuset is not online, or
 * the error from update_prstate().
 */
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        int err = -ENODEV;
        int new_state;

        buf = strstrip(buf);

        /* "root" maps to ENABLED, "member" to DISABLED. */
        if (strcmp(buf, "root") == 0)
                new_state = PRS_ENABLED;
        else if (strcmp(buf, "member") == 0)
                new_state = PRS_DISABLED;
        else
                return -EINVAL;

        css_get(&cs->css);
        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs))
                err = update_prstate(cs, new_state);

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        css_put(&cs->css);

        return err ? err : nbytes;
}

/*
 * for the common functions, 'private' gives the type of file
 */

/*
 * Control files registered as .legacy_cftypes of cpuset_cgrp_subsys.
 * Entries without a write handler are read-only.
 */
static struct cftype legacy_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        {
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        /* read-only: reports fmeter_getrate() via cpuset_read_u64() */
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        /* global toggle, flagged CFTYPE_ONLY_ON_ROOT */
        {
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }        /* terminate */
};

/*
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 */
/*
 * Control files registered as .dfl_cftypes of cpuset_cgrp_subsys.  The
 * writable entries are all flagged CFTYPE_NOT_ON_ROOT; the ".effective"
 * entries have no write handler.
 */
static struct cftype dfl_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        {
                .name = "cpus.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "mems.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        /* accepts "root" or "member", see sched_partition_write() */
        {
                .name = "cpus.partition",
                .seq_show = sched_partition_show,
                .write = sched_partition_write,
                .private = FILE_PARTITION_ROOT,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        /* read-only view of subparts_cpus, flagged CFTYPE_DEBUG */
        {
                .name = "cpus.subpartitions",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_SUBPARTS_CPULIST,
                .flags = CFTYPE_DEBUG,
        },

        { }        /* terminate */
};


/*
 *        cpuset_css_alloc - allocate a cpuset css
 *        @parent_css: css of the parent cpuset, or NULL when the root
 *                     cpuset (top_cpuset) is being requested
 */

static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct cpuset *new_cs;

        /* No parent means the root: hand back top_cpuset's embedded css. */
        if (!parent_css)
                return &top_cpuset.css;

        new_cs = kzalloc(sizeof(*new_cs), GFP_KERNEL);
        if (!new_cs)
                goto err_nomem;

        if (alloc_cpumasks(new_cs, NULL))
                goto err_free;

        /* Defaults: load balancing on, no memory nodes, no relax level. */
        set_bit(CS_SCHED_LOAD_BALANCE, &new_cs->flags);
        nodes_clear(new_cs->mems_allowed);
        nodes_clear(new_cs->effective_mems);
        fmeter_init(&new_cs->fmeter);
        new_cs->relax_domain_level = -1;

        return &new_cs->css;

err_free:
        kfree(new_cs);
err_nomem:
        return ERR_PTR(-ENOMEM);
}

/*
 * cpuset_css_online - bring a newly created cpuset online
 * @css: css of the cpuset going online
 *
 * Does nothing for a cpuset without a parent.  Otherwise marks the cpuset
 * online, inherits the parent's spread flags and, in v2 mode, starts out
 * with the parent's effective CPUs and memory nodes.  When
 * CGRP_CPUSET_CLONE_CHILDREN is set on the cgroup and no sibling is
 * CPU- or memory-exclusive, the parent's configured masks are copied too.
 *
 * Always returns 0.
 */
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);
        struct cpuset *parent = parent_cs(cs);
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;

        if (!parent)
                return 0;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        set_bit(CS_ONLINE, &cs->flags);
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);

        cpuset_inc();

        spin_lock_irq(&callback_lock);
        if (is_in_v2_mode()) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
                /* undone in cpuset_css_offline() */
                cs->use_parent_ecpus = true;
                parent->child_ecpus_count++;
        }
        spin_unlock_irq(&callback_lock);

        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;

        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
         * set.  This flag handling is implemented in cgroup core for
         * historical reasons - the flag may be specified during mount.
         *
         * Currently, if any sibling cpusets have exclusive cpus or mem, we
         * refuse to clone the configuration - thereby refusing the task to
         * be entered, and as a result refusing the sys_unshare() or
         * clone() which initiated it.  If this becomes a problem for some
         * users who wish to allow that scenario, then this could be
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
        rcu_read_lock();
        cpuset_for_each_child(tmp_cs, pos_css, parent) {
                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
                        rcu_read_unlock();
                        goto out_unlock;
                }
        }
        rcu_read_unlock();

        /* Both the configured and the effective masks take the parent's configured values. */
        spin_lock_irq(&callback_lock);
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset is a partition root (see the 'cpus.partition' file),
 * simulate turning the partition off.
 */

static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_partition_root(cs))
                update_prstate(cs, 0);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            is_sched_load_balance(cs))
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }

        cpuset_dec();
        clear_bit(CS_ONLINE, &cs->flags);

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
}

/* Release the cpuset that embeds @css. */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
        free_cpuset(css_cs(css));
}

/*
 * Reset top_cpuset's configured masks when the controller is bound to a
 * hierarchy: in v2 mode they cover every possible CPU and node, otherwise
 * they are set to the current effective masks.
 */
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);

        if (!is_in_v2_mode()) {
                cpumask_copy(top_cpuset.cpus_allowed,
                             top_cpuset.effective_cpus);
                top_cpuset.mems_allowed = top_cpuset.effective_mems;
        } else {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        }

        spin_unlock_irq(&callback_lock);
        mutex_unlock(&cpuset_mutex);
}

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
        /* Tasks in the root cpuset are left untouched. */
        if (!task_css_is_root(task, cpuset_cgrp_id)) {
                set_cpus_allowed_ptr(task, current->cpus_ptr);
                task->mems_allowed = current->mems_allowed;
        }
}

/*
 * cgroup subsystem descriptor for cpusets: the css lifecycle callbacks,
 * the task migration callbacks (can_attach / cancel_attach / attach /
 * post_attach), the bind and fork hooks, and the control file tables for
 * the legacy and default hierarchies.
 */
struct cgroup_subsys cpuset_cgrp_subsys = {
        .css_alloc        = cpuset_css_alloc,
        .css_online        = cpuset_css_online,
        .css_offline        = cpuset_css_offline,
        .css_free        = cpuset_css_free,
        .can_attach        = cpuset_can_attach,
        .cancel_attach        = cpuset_cancel_attach,
        .attach                = cpuset_attach,
        .post_attach        = cpuset_post_attach,
        .bind                = cpuset_bind,
        .fork                = cpuset_fork,
        .legacy_cftypes        = legacy_files,
        .dfl_cftypes        = dfl_files,
        .early_init        = true,
        .threaded        = true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/

int __init cpuset_init(void)
{
        /* The root cpuset's masks are mandatory: failure here is fatal. */
        BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));

        /* The root starts out owning every CPU ... */
        cpumask_setall(top_cpuset.cpus_allowed);
        cpumask_setall(top_cpuset.effective_cpus);

        /* ... and every memory node. */
        nodes_setall(top_cpuset.mems_allowed);
        nodes_setall(top_cpuset.effective_mems);

        top_cpuset.relax_domain_level = -1;
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
        fmeter_init(&top_cpuset.fmeter);

        /* Scratch mask used by cpuset_attach(). */
        BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

        return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *dest;

        /*
         * Walk up to the closest ancestor that still has both CPUs and
         * memory nodes.  The top cpuset has online cpus, so it can't be
         * empty.
         */
        for (dest = parent_cs(cs);
             cpumask_empty(dest->cpus_allowed) ||
             nodes_empty(dest->mems_allowed);
             dest = parent_cs(dest))
                ;

        if (!cgroup_transfer_tasks(dest->css.cgroup, cs->css.cgroup))
                return;

        pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
        pr_cont_cgroup_name(cs->css.cgroup);
        pr_cont("\n");
}

/*
 * Legacy-hierarchy hotplug update for @cs: both the configured and the
 * effective masks are overwritten with @new_cpus / @new_mems.  If that
 * leaves @cs without CPUs or without memory nodes, its tasks are moved to
 * an ancestor.
 *
 * Called with cpuset_mutex held; the mutex is dropped around the task
 * transfer and re-taken before returning.
 */
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        bool is_empty;

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        spin_unlock_irq(&callback_lock);

        /*
         * Don't call update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                update_tasks_cpumask(cs);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                update_tasks_nodemask(cs);

        /* Sample emptiness while still holding cpuset_mutex. */
        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);

        mutex_unlock(&cpuset_mutex);

        /*
         * Move tasks to the nearest ancestor with execution resources.
         * This is a full cgroup operation which will also call back into
         * cpuset. Should be done outside any lock.
         */
        if (is_empty)
                remove_tasks_in_empty_cpuset(cs);

        mutex_lock(&cpuset_mutex);
}

/*
 * v2-mode hotplug update for @cs: only the effective masks are changed.
 * An empty @new_cpus or @new_mems is first replaced, in place, by the
 * parent's effective mask.
 */
static void
hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
{
        struct cpuset *parent = parent_cs(cs);

        /* Nothing left of our own: fall back to what the parent has. */
        if (cpumask_empty(new_cpus))
                cpumask_copy(new_cpus, parent->effective_cpus);

        if (nodes_empty(*new_mems))
                *new_mems = parent->effective_mems;

        spin_lock_irq(&callback_lock);
        cs->effective_mems = *new_mems;
        cpumask_copy(cs->effective_cpus, new_cpus);
        spin_unlock_irq(&callback_lock);

        if (cpus_updated)
                update_tasks_cpumask(cs);

        if (mems_updated)
                update_tasks_nodemask(cs);
}

/*
 * Set by cpuset_force_rebuild().
 * NOTE(review): presumably read by the hotplug work function to force a
 * scheduler-domain rebuild - the consumer is not in this part of the file.
 */
static bool force_rebuild;

/* Set the force_rebuild flag. */
void cpuset_force_rebuild(void)
{
        force_rebuild = true;
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset of interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        /*
         * Scratch masks kept static rather than on the stack.
         * NOTE(review): presumably safe because callers are serialized -
         * confirm against the call sites.
         */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
        struct cpuset *parent;
retry:
        /* Sleep without the mutex until no attach targets @cs. */
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

        mutex_lock(&cpuset_mutex);

        /*
         * We have raced with task attaching. We wait until attaching
         * is finished, so we won't attach a task to an empty cpuset.
         */
        if (cs->attach_in_progress) {
                mutex_unlock(&cpuset_mutex);
                goto retry;
        }

        /* Compute the new effective masks from @cs and its parent. */
        parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

        if (cs->nr_subparts_cpus)
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus.
                 */
                cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);

        /* No tmpmasks or no partition state: skip the partition handling. */
        if (!tmp || !cs->partition_root_state)
                goto update_tasks;

        /*
         * In the unlikely event that a partition root has empty
         * effective_cpus or its parent becomes erroneous, we have to
         * transition it to the erroneous state.
         */
        if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
           (parent->partition_root_state == PRS_ERROR))) {
                if (cs->nr_subparts_cpus) {
                        /* Take the CPUs back from child partitions and recompute. */
                        spin_lock_irq(&callback_lock);
                        cs->nr_subparts_cpus = 0;
                        cpumask_clear(cs->subparts_cpus);
                        spin_unlock_irq(&callback_lock);
                        compute_effective_cpumask(&new_cpus, cs, parent);
                }

                /*
                 * If the effective_cpus is empty because the child
                 * partitions take away all the CPUs, we can keep
                 * the current partition and let the child partitions
                 * fight for available CPUs.
                 */
                if ((parent->partition_root_state == PRS_ERROR) ||
                     cpumask_empty(&new_cpus)) {
                        update_parent_subparts_cpumask(cs, partcmd_disable,
                                                       NULL, tmp);
                        spin_lock_irq(&callback_lock);
                        cs->partition_root_state = PRS_ERROR;
                        spin_unlock_irq(&callback_lock);
                }
                cpuset_force_rebuild();
        }

        /*
         * On the other hand, an erroneous partition root may be transitioned
         * back to a regular one or a partition root with no CPU allocated
         * from the parent may change to erroneous.
         */
        if (is_partition_root(parent) &&
           ((cs->partition_root_state == PRS_ERROR) ||
            !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
             update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
                cpuset_force_rebuild();

update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);

        if (mems_updated)
                check_insane_mems_config(&new_mems);

        /*
         * Note that the legacy variant temporarily drops and re-takes
         * cpuset_mutex while it moves tasks out of an empty cpuset.
         */
        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
                hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                            cpus_updated, mems_updated);

        mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: the work item being run; not referenced by the function body
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
        struct tmpmasks tmp, *ptmp = NULL;

        /*
         * Temporary masks are only set up in v2 mode.  ptmp stays NULL
         * unless alloc_cpumasks() returned 0, and is handed on as-is to
         * cpuset_hotplug_update_tasks() and free_cpumasks() below.
         */
        if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                ptmp = &tmp;

        mutex_lock(&cpuset_mutex);

        /* fetch the available cpus/mems and find out which changed how */
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];

        /*
         * If subparts_cpus is populated, it is likely that the check below
         * will produce a false positive on cpus_updated when the cpu list
         * isn't changed. It is extra work, but it is better to be safe.
         */
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

        /*
         * In the rare case that hotplug removes all the cpus in subparts_cpus,
         * we assume that cpus are updated.
         */
        if (!cpus_updated && top_cpuset.nr_subparts_cpus)
                cpus_updated = true;

        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);
                /* cpus_allowed of the top cpuset is only rewritten outside v2 mode */
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus. If no CPU is left,
                 * we clear the subparts_cpus & let the child partitions
                 * fight for the CPUs again.
                 */
                if (top_cpuset.nr_subparts_cpus) {
                        if (cpumask_subset(&new_cpus,
                                           top_cpuset.subparts_cpus)) {
                                top_cpuset.nr_subparts_cpus = 0;
                                cpumask_clear(top_cpuset.subparts_cpus);
                        } else {
                                cpumask_andnot(&new_cpus, &new_cpus,
                                               top_cpuset.subparts_cpus);
                        }
                }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }

        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                spin_lock_irq(&callback_lock);
                /* as with cpus_allowed, mems_allowed is left alone in v2 mode */
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
                spin_unlock_irq(&callback_lock);
                update_tasks_nodemask(&top_cpuset);
        }

        /* cpuset_mutex is released before the descendants are walked */
        mutex_unlock(&cpuset_mutex);

        /* if cpus or mems changed, we need to propagate to descendants */
        if (cpus_updated || mems_updated) {
                struct cpuset *cs;
                struct cgroup_subsys_state *pos_css;

                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        /*
                         * Skip the root itself and any cpuset on which a
                         * reference cannot be taken.
                         */
                        if (cs == &top_cpuset || !css_tryget_online(&cs->css))
                                continue;
                        /*
                         * The reference taken above is held across the
                         * update, which runs with the RCU read lock dropped;
                         * it is put only after the lock is re-taken.
                         */
                        rcu_read_unlock();

                        cpuset_hotplug_update_tasks(cs, ptmp);

                        rcu_read_lock();
                        css_put(&cs->css);
                }
                rcu_read_unlock();
        }

        /* rebuild sched domains if cpus_allowed has changed */
        if (cpus_updated || force_rebuild) {
                force_rebuild = false;
                rebuild_sched_domains();
        }

        free_cpumasks(NULL, ptmp);
}

/**
 * cpuset_update_active_cpus - queue cpuset hotplug processing
 *
 * Does no cpuset work itself; it only schedules cpuset_hotplug_work.
 */
void cpuset_update_active_cpus(void)
{
        /*
         * We're inside cpu hotplug critical region which usually nests
         * inside cgroup synchronization.  Bounce actual hotplug processing
         * to a work item to avoid reverse locking order.
         */
        schedule_work(&cpuset_hotplug_work);
}

/**
 * cpuset_wait_for_hotplug - flush the cpuset hotplug work item
 *
 * Calls flush_work() on cpuset_hotplug_work, the same work item that
 * cpuset_update_active_cpus() schedules.
 */
void cpuset_wait_for_hotplug(void)
{
        flush_work(&cpuset_hotplug_work);
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 *
 * Notifier callback: @self, @action and @arg are not examined.  Every
 * invocation schedules cpuset_hotplug_work and returns NOTIFY_OK.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        schedule_work(&cpuset_hotplug_work);
        return NOTIFY_OK;
}

/*
 * Notifier block that dispatches to cpuset_track_online_nodes(); it is
 * passed to register_hotmemory_notifier() by cpuset_init_smp().
 */
static struct notifier_block cpuset_track_online_nodes_nb = {
        .notifier_call = cpuset_track_online_nodes,
        .priority = 10,                /* ??! NOTE(review): rationale for 10 is not recorded here */
};

/**
 * cpuset_init_smp - finish top_cpuset setup once cpu and node maps exist
 *
 * Description: Finish top cpuset after cpu, node maps are initialized.
 * Snapshots top_cpuset.mems_allowed into old_mems_allowed, seeds
 * effective_cpus from cpu_active_mask and effective_mems from
 * node_states[N_MEMORY], registers the memory hotplug notifier and
 * allocates the "cpuset_migrate_mm" ordered workqueue.  Failure to
 * allocate the workqueue hits BUG_ON().
 */
void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed set to v2 values in the initial
         * cpuset_bind() call will be reset to v1 values in another
         * cpuset_bind() call when v1 cpuset is mounted.
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
        BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Fills @pmask via guarantee_online_cpus() for the cpuset
 * attached to @tsk.  Guaranteed to return some non-empty subset of
 * cpu_online_mask, even if this means going outside the task's cpuset.
 *
 * The lookup runs with callback_lock held (interrupts saved and disabled)
 * and inside an RCU read-side critical section.
 **/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
        unsigned long irqflags;
        struct cpuset *cs;

        spin_lock_irqsave(&callback_lock, irqflags);
        rcu_read_lock();

        cs = task_cs(tsk);
        guarantee_online_cpus(cs, pmask);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irqflags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging,
 * so cpu_possible_mask is used instead when not in v2 mode.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 **/
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
        const struct cpumask *fallback;

        rcu_read_lock();
        if (is_in_v2_mode())
                fallback = task_cs(tsk)->cpus_allowed;
        else
                fallback = cpu_possible_mask;
        do_set_cpus_allowed(tsk, fallback);
        rcu_read_unlock();

        /*
         * We own tsk->cpus_allowed, nobody can change it under us.
         *
         * But we used cs && cs->cpus_allowed lockless and thus can
         * race with cgroup_attach_task() or update_cpumask() and get
         * the wrong tsk->cpus_allowed. However, both cases imply the
         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
         * which takes task_rq_lock().
         *
         * If we are called after it dropped the lock we must see all
         * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
         * set any mask even if it is not right from task_cs() pov,
         * the pending set_cpus_allowed_ptr() will fix things.
         *
         * select_fallback_rq() will fix things up and set cpu_possible_mask
         * if required.
         */
}

/**
 * cpuset_init_current_mems_allowed - allow every node for the current task
 *
 * Sets all bits in current->mems_allowed.
 */
void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns, by value, the nodemask_t produced by
 * guarantee_online_mems() for the cpuset attached to @tsk.  Guaranteed to
 * return some non-empty subset of node_states[N_MEMORY], even if this
 * means going outside the task's cpuset.
 *
 * The lookup runs with callback_lock held (interrupts saved and disabled)
 * and inside an RCU read-side critical section.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
        unsigned long irqflags;
        nodemask_t result;

        spin_lock_irqsave(&callback_lock, irqflags);
        rcu_read_lock();

        guarantee_online_mems(task_cs(tsk), &result);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irqflags);

        return result;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 *
 * Return: the result of nodes_intersects() on *@nodemask and
 * current->mems_allowed.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 *
 * The starting cpuset itself is tested first, so @cs is returned
 * unchanged when it is already mem_exclusive or mem_hardwall.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
        for (;;) {
                struct cpuset *parent;

                if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
                        break;

                /* No parent left: this is as far up as we can go. */
                parent = parent_cs(cs);
                if (!parent)
                        break;

                cs = parent;
        }

        return cs;
}

/**
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If current has access to memory reserves
 * as an oom victim, yes.  If it's a __GFP_HARDWALL request, no.  If
 * current is exiting, yes.  Otherwise the answer is whether @node is set
 * in the nearest hardwalled cpuset ancestor to current's cpuset.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *        in_interrupt - any node ok (current task context irrelevant)
 *        GFP_ATOMIC   - any node ok
 *        tsk_is_oom_victim   - any node ok
 *        GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *        GFP_USER     - only nodes in current task's mems allowed ok.
 */
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        struct cpuset *hardwall_cs;        /* nearest hardwalled ancestor */
        unsigned long irqflags;
        bool allowed;                        /* verdict from the ancestor scan */

        /* Interrupt context, or a node the task may already use. */
        if (in_interrupt() || node_isset(node, current->mems_allowed))
                return true;

        /*
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
         */
        if (unlikely(tsk_is_oom_victim(current)))
                return true;

        /* If hardwall request, stop here */
        if (gfp_mask & __GFP_HARDWALL)
                return false;

        /* Let dying task have memory */
        if (current->flags & PF_EXITING)
                return true;

        /* Not hardwall and node outside mems_allowed: scan up cpusets */
        spin_lock_irqsave(&callback_lock, irqflags);
        rcu_read_lock();

        hardwall_cs = nearest_hardwall_ancestor(task_cs(current));
        allowed = node_isset(node, hardwall_cs->mems_allowed);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irqflags);

        return allowed;
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */

/*
 * Shared helper for both spread routines: step *@rotor to the node that
 * next_node_in() yields for current->mems_allowed, store it back through
 * @rotor and return it.
 */
static int cpuset_spread_node(int *rotor)
{
        int next = next_node_in(*rotor, current->mems_allowed);

        *rotor = next;
        return next;
}

/*
 * Pick the node on which to start looking for a file page.  The per-task
 * rotor is seeded with node_random() over current->mems_allowed the first
 * time (while it is still NUMA_NO_NODE), then advanced on every call.
 */
int cpuset_mem_spread_node(void)
{
        int *rotor = &current->cpuset_mem_spread_rotor;

        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

/*
 * Pick the node on which to start looking for a slab page.  The per-task
 * rotor is seeded with node_random() over current->mems_allowed the first
 * time (while it is still NUMA_NO_NODE), then advanced on every call.
 */
int cpuset_slab_spread_node(void)
{
        int *rotor = &current->cpuset_slab_spread_rotor;

        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 *
 * Only the mems_allowed fields stored in the two task_structs are
 * compared; no lock is taken here.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                   const struct task_struct *tsk2)
{
        return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Appends ",cpuset=<cgroup name>,mems_allowed=<node list>" to
 * the kernel log line in progress (pr_cont), using current's cpuset cgroup
 * and the cached copy of its mems_allowed.  The whole sequence runs under
 * rcu_read_lock().
 */
void cpuset_print_current_mems_allowed(void)
{
        rcu_read_lock();

        pr_cont(",cpuset=");
        pr_cont_cgroup_name(task_cs(current)->css.cgroup);
        pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));

        rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless this flag is
 * enabled by writing "1" to the special cpuset file
 * 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print a task's cpuset path into seq_file, followed by a newline.
 *  - Used for /proc/<pid>/cpuset.
 *  - The path is resolved relative to the caller's cgroup namespace
 *    (current->nsproxy->cgroup_ns) into a PATH_MAX-sized scratch buffer.
 *  - The task's cpuset css is looked up and turned into a path while
 *    holding css_set_lock inside an RCU read-side section.
 *  - Returns 0 on success, -ENOMEM if the scratch buffer cannot be
 *    allocated, -ENAMETOOLONG if the path does not fit, or the negative
 *    value returned by cgroup_path_ns_locked().
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        struct cgroup_subsys_state *cpuset_css;
        char *path;
        int retval;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        rcu_read_lock();
        spin_lock_irq(&css_set_lock);
        cpuset_css = task_css(tsk, cpuset_cgrp_id);
        retval = cgroup_path_ns_locked(cpuset_css->cgroup, path, PATH_MAX,
                                       current->nsproxy->cgroup_ns);
        spin_unlock_irq(&css_set_lock);
        rcu_read_unlock();

        /* A length that reaches PATH_MAX means the path was cut short. */
        if (retval >= PATH_MAX)
                retval = -ENAMETOOLONG;

        if (retval >= 0) {
                seq_puts(m, path);
                seq_putc(m, '\n');
                retval = 0;
        }

        kfree(path);
        return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/*
 * Display task mems_allowed in /proc/<pid>/status file.
 *
 * Emits two lines into @m from @task->mems_allowed: "Mems_allowed:" using
 * the %*pb bitmap format and "Mems_allowed_list:" using the %*pbl list
 * format.
 */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
        seq_printf(m, "Mems_allowed:\t%*pb\n",
                   nodemask_pr_args(&task->mems_allowed));
        seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
                   nodemask_pr_args(&task->mems_allowed));
}











































    1 


    1 













    1 
    1 


    1 

























































































































































































































    3 

    3 














    3 


    3 



























    3 

    3 
    3 





































































    1 


    1 







































    1 

























    1 






    1 



















































































    1 
























    1 


    1 
    1 









































    1 
    1 























    1 
    1 
    1 



































































    1 









    1 



    1 













































































































































































































    1 













    1 

    1 








    1 








    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "mballoc.h"

#include <trace/events/ext4.h>

static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
                                            ext4_group_t block_group);
/*
 * balloc.c contains the blocks allocation and deallocation routines
 */

/*
 * Calculate block group number for a given block number.
 *
 * When the STD_GROUP_SIZE mount flag is set the group is derived with a
 * single shift of the block number (relative to s_first_data_block);
 * otherwise the work is delegated to ext4_get_group_no_and_offset().
 */
ext4_group_t ext4_get_group_number(struct super_block *sb,
                                   ext4_fsblk_t block)
{
        ext4_group_t group;

        if (!test_opt2(sb, STD_GROUP_SIZE)) {
                ext4_get_group_no_and_offset(sb, block, &group, NULL);
                return group;
        }

        group = (block -
                 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
                (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
        return group;
}

/*
 * Calculate the block group number and offset into the block/cluster
 * allocation bitmap, given a block number.
 *
 * Either output pointer may be NULL, in which case that result is simply
 * not stored.
 */
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t cluster_off;

        /* Work relative to the first data block. */
        blocknr -= le32_to_cpu(sbi->s_es->s_first_data_block);

        /*
         * do_div() divides blocknr in place (leaving the quotient, i.e. the
         * group number) and yields the remainder, which is then scaled
         * from blocks to clusters.
         */
        cluster_off = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
                sbi->s_cluster_bits;

        if (blockgrpp)
                *blockgrpp = blocknr;
        if (offsetp)
                *offsetp = cluster_off;
}

/*
 * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
 * and 0 otherwise.  Membership is decided by comparing 'block_group' with
 * the group number ext4_get_group_number() computes for 'block'.
 */
static inline int ext4_block_in_group(struct super_block *sb,
                                      ext4_fsblk_t block,
                                      ext4_group_t block_group)
{
        return ext4_get_group_number(sb, block) == block_group;
}

/* Return the number of clusters used for file system metadata; this
 * represents the overhead needed by the file system.
 *
 * The count starts from ext4_num_base_meta_clusters() and is then grown
 * for the block bitmap, inode bitmap and inode table blocks of @gdp that
 * fall inside @block_group and are not already covered.
 */
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
                                           ext4_group_t block_group,
                                           struct ext4_group_desc *gdp)
{
        unsigned num_clusters;
        /* -1 means "nothing to account separately" for that item */
        int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
        ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
        ext4_fsblk_t itbl_blk;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* This is the number of clusters used by the superblock,
         * block group descriptors, and reserved block group
         * descriptor blocks */
        num_clusters = ext4_num_base_meta_clusters(sb, block_group);

        /*
         * For the allocation bitmaps and inode table, we first need
         * to check to see if the block is in the block group.  If it
         * is, then check to see if the cluster is already accounted
         * for in the clusters used for the base metadata cluster, or
         * if we can increment the base metadata cluster to include
         * that block.  Otherwise, we will have to track the cluster
         * used for the allocation bitmap or inode table explicitly.
         * Normally all of these blocks are contiguous, so the special
         * case handling shouldn't be necessary except for *very*
         * unusual file system layouts.
         */
        if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
                block_cluster = EXT4_B2C(sbi,
                                         ext4_block_bitmap(sb, gdp) - start);
                /* already inside the counted prefix: nothing to add */
                if (block_cluster < num_clusters)
                        block_cluster = -1;
                /* directly after the counted prefix: extend the prefix */
                else if (block_cluster == num_clusters) {
                        num_clusters++;
                        block_cluster = -1;
                }
                /* otherwise block_cluster stays set and is added at the end */
        }

        /* Same three-way handling for the inode bitmap. */
        if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
                inode_cluster = EXT4_B2C(sbi,
                                         ext4_inode_bitmap(sb, gdp) - start);
                if (inode_cluster < num_clusters)
                        inode_cluster = -1;
                else if (inode_cluster == num_clusters) {
                        num_clusters++;
                        inode_cluster = -1;
                }
        }

        /* Walk every inode table block of the group. */
        itbl_blk = ext4_inode_table(sb, gdp);
        for (i = 0; i < sbi->s_itb_per_group; i++) {
                if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
                        c = EXT4_B2C(sbi, itbl_blk + i - start);
                        /*
                         * Skip clusters inside the counted prefix, the ones
                         * still tracked for the two bitmaps, and the inode
                         * table cluster counted most recently.
                         */
                        if ((c < num_clusters) || (c == inode_cluster) ||
                            (c == block_cluster) || (c == itbl_cluster))
                                continue;
                        /* adjacent to the prefix: just extend it */
                        if (c == num_clusters) {
                                num_clusters++;
                                continue;
                        }
                        /*
                         * A detached cluster: count it and remember it.  Only
                         * the latest such cluster is remembered in
                         * itbl_cluster.
                         */
                        num_clusters++;
                        itbl_cluster = c;
                }
        }

        /*
         * Add the bitmap clusters that could not be folded into the prefix.
         * NOTE(review): block_cluster and inode_cluster are never compared
         * with each other, so two bitmaps sharing one detached cluster look
         * like they would be counted twice here -- confirm.
         */
        if (block_cluster != -1)
                num_clusters++;
        if (inode_cluster != -1)
                num_clusters++;

        return num_clusters;
}

/*
 * Return the number of clusters in @block_group.  Every group but the
 * last spans EXT4_BLOCKS_PER_GROUP() blocks; the last group spans whatever
 * remains between its first block and the file system's block count.
 */
static unsigned int num_clusters_in_group(struct super_block *sb,
                                          ext4_group_t block_group)
{
        unsigned int nblocks = EXT4_BLOCKS_PER_GROUP(sb);

        /*
         * Even though mke2fs always initializes the first and
         * last group, just in case some other tool was used,
         * we need to make sure we calculate the right free
         * blocks.
         */
        if (block_group == ext4_get_groups_count(sb) - 1)
                nblocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
                        ext4_group_first_block_no(sb, block_group);

        return EXT4_NUM_B2C(EXT4_SB(sb), nblocks);
}

/*
 * Initializes an uninitialized block bitmap.
 *
 * @bh must be locked by the caller (asserted).  Returns 0 on success,
 * -EFSBADCRC when the group descriptor checksum does not verify (the
 * group's block and inode bitmaps are marked corrupted in that case), or
 * -EFSCORRUPTED when the base metadata cluster count cannot fit in @bh.
 */
static int ext4_init_block_bitmap(struct super_block *sb,
                                   struct buffer_head *bh,
                                   ext4_group_t block_group,
                                   struct ext4_group_desc *gdp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t group_start, blk, itbl_end;
        unsigned int nr_base, bit;

        J_ASSERT_BH(bh, buffer_locked(bh));

        /* A bad descriptor checksum flags the group and aborts the init. */
        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT |
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return -EFSBADCRC;
        }

        /* Start from an all-free bitmap. */
        memset(bh->b_data, 0, sb->s_blocksize);

        nr_base = ext4_num_base_meta_clusters(sb, block_group);
        if ((nr_base >> 3) >= bh->b_size)
                return -EFSCORRUPTED;

        /* Base metadata occupies the leading clusters of the group. */
        for (bit = 0; bit < nr_base; bit++)
                ext4_set_bit(bit, bh->b_data);

        group_start = ext4_group_first_block_no(sb, block_group);

        /* Set bits for block and inode bitmaps, and inode table */
        blk = ext4_block_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, blk, block_group))
                ext4_set_bit(EXT4_B2C(sbi, blk - group_start), bh->b_data);

        blk = ext4_inode_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, blk, block_group))
                ext4_set_bit(EXT4_B2C(sbi, blk - group_start), bh->b_data);

        blk = ext4_inode_table(sb, gdp);
        itbl_end = blk + sbi->s_itb_per_group;
        while (blk < itbl_end) {
                if (ext4_block_in_group(sb, blk, block_group))
                        ext4_set_bit(EXT4_B2C(sbi, blk - group_start),
                                     bh->b_data);
                blk++;
        }

        /*
         * Also if the number of blocks within the group is less than
         * the blocksize * 8 ( which is the size of bitmap ), set rest
         * of the block bitmap to 1
         */
        ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
                             sb->s_blocksize * 8, bh->b_data);
        return 0;
}

/*
 * Return the number of free clusters in a block group.  It is used when
 * the block bitmap is uninitialized, so we can't just count the bits
 * in the bitmap: the result is the group's cluster count minus its
 * metadata overhead clusters.
 */
unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                       ext4_group_t block_group,
                                       struct ext4_group_desc *gdp)
{
        unsigned int total = num_clusters_in_group(sb, block_group);
        unsigned int overhead = ext4_num_overhead_clusters(sb, block_group,
                                                           gdp);

        return total - overhead;
}

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.  The descriptors are loaded in memory
 * when a file system is mounted (see ext4_fill_super).
 */

/**
 * ext4_get_group_desc() -- look up the descriptor of a block group
 * @sb:                 super block
 * @block_group:        given block group
 * @bh:                 optional out-pointer; when non-NULL it receives the
 *                      buffer head that holds the descriptor block
 *
 * Return: pointer to the descriptor inside the descriptor block found in
 * s_group_desc, or NULL (after reporting through ext4_error()) when
 * @block_group is out of range or the descriptor block is not loaded.
 */
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
{
        ext4_group_t total_groups = ext4_get_groups_count(sb);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *gd_bh;
        unsigned int gdb_index;
        unsigned int slot;

        if (block_group >= total_groups) {
                ext4_error(sb, "block_group >= groups_count - block_group = %u,"
                           " groups_count = %u", block_group, total_groups);

                return NULL;
        }

        /* Which descriptor block, and which entry inside that block. */
        gdb_index = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        slot = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);

        /*
         * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
         * the pointer being dereferenced won't be dereferenced again. By
         * looking at the usage in add_new_gdb() the value isn't modified,
         * just the pointer, and so it remains valid.
         */
        gd_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_index);
        if (!gd_bh) {
                ext4_error(sb, "Group descriptor not loaded - "
                           "block_group = %u, group_desc = %u, desc = %u",
                           block_group, gdb_index, slot);
                return NULL;
        }

        if (bh)
                *bh = gd_bh;

        return (struct ext4_group_desc *)((__u8 *)gd_bh->b_data +
                                          slot * EXT4_DESC_SIZE(sb));
}

/*
 * Verify that every bit past the last cluster of the group is set in the
 * block bitmap.  Returns the index of the first clear padding bit, or 0
 * when the padding is fully set (or when the group fills the whole bitmap
 * and there is no padding to check).
 */
static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head *bh)
{
        unsigned long nbits = sb->s_blocksize * 8;
        unsigned int pad_start = num_clusters_in_group(sb, block_group);
        ext4_grpblk_t zero_at;

        /* No padding bits at all. */
        if (pad_start >= nbits)
                return 0;

        zero_at = ext4_find_next_zero_bit(bh->b_data, nbits, pad_start);
        if (zero_at < nbits)
                return zero_at;

        return 0;
}

/*
 * Look up the in-memory group info for @group in the two-level
 * s_group_info table.  Returns NULL when @group is not below
 * s_groups_count; otherwise returns the table entry as stored.
 */
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                            ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **row;
        long row_idx, col_idx;

        if (unlikely(group >= sbi->s_groups_count))
                return NULL;

        /* Upper bits pick the row, lower bits the entry within the row. */
        row_idx = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
        col_idx = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);

        row = sbi_array_rcu_deref(sbi, s_group_info, row_idx);
        return row[col_idx];
}

/*
 * Check that the metadata blocks named by a group descriptor (block
 * bitmap, inode bitmap and inode table) lie inside the group and are
 * marked in-use in the group's block bitmap @bh.
 *
 * Return the block number which was discovered to be invalid, or 0 if
 * the block bitmap is valid.  With FLEX_BG the check is skipped and 0 is
 * always returned.
 */
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
                                            struct ext4_group_desc *desc,
                                            ext4_group_t block_group,
                                            struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
        ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_fsblk_t blk;
        ext4_fsblk_t group_first_block;

        if (ext4_has_feature_flex_bg(sb)) {
                /* with FLEX_BG, the inode/block bitmaps and itable
                 * blocks may not be in the group at all
                 * so the bitmap validation will be skipped for those groups
                 * or it has to also read the block group where the bitmaps
                 * are located to verify they are set.
                 */
                return 0;
        }
        group_first_block = ext4_group_first_block_no(sb, block_group);

        /* check whether block bitmap block number is set */
        blk = ext4_block_bitmap(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap */
                return blk;

        /* check whether the inode bitmap block number is set */
        blk = ext4_inode_bitmap(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap */
                return blk;

        /*
         * check whether the inode table block numbers are set
         *
         * The table occupies blocks offset .. offset + s_itb_per_group - 1,
         * so the range check must use the LAST table block; using the
         * one-past-the-end block rejects a table that ends exactly on the
         * group's final block.  Likewise the bit scan must stop at
         * (cluster of the last table block) + 1 so that a partially used
         * final cluster is still examined.
         * NOTE(review): relies on s_itb_per_group being non-zero.
         */
        blk = ext4_inode_table(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit)
                return blk;
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
                        EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1,
                        EXT4_B2C(sbi, offset));
        if (next_zero_bit <
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1)
                /* bad bitmap for inode tables */
                return blk;
        return 0;
}

/*
 * Validate a block bitmap buffer once and remember the result in the
 * buffer's "verified" flag.
 *
 * Returns 0 when validation is skipped (EXT4_FC_REPLAY set in
 * s_mount_state), when the buffer is already verified, or when all checks
 * pass.  Returns -EFSCORRUPTED when the group info is missing, the group
 * is already flagged as having a corrupt block bitmap, or the bitmap
 * contents/padding are wrong; returns -EFSBADCRC on a checksum mismatch.
 * On every failure detected here the group is marked
 * EXT4_GROUP_INFO_BBITMAP_CORRUPT.
 */
static int ext4_validate_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *desc,
                                      ext4_group_t block_group,
                                      struct buffer_head *bh)
{
        ext4_fsblk_t        blk;
        struct ext4_group_info *grp;

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        grp = ext4_get_group_info(sb, block_group);

        /* Unlocked fast path; re-checked under the group lock below. */
        if (buffer_verified(bh))
                return 0;
        if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                return -EFSCORRUPTED;

        ext4_lock_group(sb, block_group);
        /* Another thread may have verified the buffer while we waited. */
        if (buffer_verified(bh))
                goto verified;
        if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
                                                    desc, bh) ||
                     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
                /* The group lock is dropped before reporting the error. */
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        /* Non-zero blk is the metadata block that failed the check. */
        blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        /* Non-zero blk is the first clear bit found in the padding area. */
        blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        set_buffer_verified(bh);
verified:
        ext4_unlock_group(sb, block_group);
        return 0;
}

/**
 * ext4_read_block_bitmap_nowait()
 * @sb:                 super block
 * @block_group:        given block group
 * @ignore_locked:      when true and the bitmap buffer is currently locked
 *                      (already under IO), give up instead of waiting;
 *                      used by prefetching callers
 *
 * Read the bitmap for a given block_group, and validate that the
 * bits for block/inode/inode tables are set in the bitmaps.  If the
 * buffer has to be read from disk the read is only submitted here; the
 * buffer is returned with the "new" flag set and the caller is expected
 * to finish with ext4_wait_block_bitmap().
 *
 * Return buffer_head on success, NULL if @ignore_locked is set and the
 * buffer is locked, or an ERR_PTR in case of failure.
 */
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
                              bool ignore_locked)
{
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        ext4_fsblk_t bitmap_blk;
        int err;

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);
        bitmap_blk = ext4_block_bitmap(sb, desc);
        /* The bitmap must live strictly inside the filesystem's data area. */
        if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                ext4_error(sb, "Invalid block bitmap block %llu in "
                           "block_group %u", bitmap_blk, block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return ERR_PTR(-EFSCORRUPTED);
        }
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_warning(sb, "Cannot get buffer for block bitmap - "
                             "block_group = %u, block_bitmap = %llu",
                             block_group, bitmap_blk);
                return ERR_PTR(-ENOMEM);
        }

        if (ignore_locked && buffer_locked(bh)) {
                /* buffer under IO already, return if called for prefetching */
                put_bh(bh);
                return NULL;
        }

        if (bitmap_uptodate(bh))
                goto verify;

        lock_buffer(bh);
        /* Re-check now that we hold the buffer lock. */
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }
        ext4_lock_group(sb, block_group);
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                if (block_group == 0) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Block bitmap for bg 0 marked "
                                   "uninitialized");
                        err = -EFSCORRUPTED;
                        goto out;
                }
                err = ext4_init_block_bitmap(sb, bh, block_group, desc);
                if (err) {
                        /*
                         * The bitmap was not built, so the buffer must not
                         * be flagged uptodate/verified: later readers would
                         * otherwise accept its contents without checking.
                         */
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Failed to init block bitmap for group "
                                   "%u: %d", block_group, err);
                        goto out;
                }
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                goto verify;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
                 * bitmap is also uptodate
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * submit the buffer_head for reading
         */
        set_buffer_new(bh);
        trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
                            (ignore_locked ? REQ_RAHEAD : 0),
                            ext4_end_bitmap_read);
        return bh;
verify:
        err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
        if (err)
                goto out;
        return bh;
out:
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Finish a bitmap read started by ext4_read_block_bitmap_nowait(): wait
 * for the buffer, then validate it.  Buffers without the "new" flag need
 * no waiting and are accepted as they are.
 *
 * Returns 0 on success, -errno on error.
 */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
                           struct buffer_head *bh)
{
        struct ext4_group_desc *gdp;

        if (!buffer_new(bh))
                return 0;

        gdp = ext4_get_group_desc(sb, block_group, NULL);
        if (gdp == NULL)
                return -EFSCORRUPTED;

        wait_on_buffer(bh);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);

        if (buffer_uptodate(bh)) {
                clear_buffer_new(bh);
                /* Panic or remount fs read-only if block bitmap is invalid */
                return ext4_validate_block_bitmap(sb, gdp, block_group, bh);
        }

        /* The read did not leave the buffer uptodate. */
        ext4_error_err(sb, EIO, "Cannot read block bitmap - "
                       "block_group = %u, block_bitmap = %llu",
                       block_group, (unsigned long long) bh->b_blocknr);
        ext4_mark_group_bitmap_corrupted(sb, block_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        return -EIO;
}

/*
 * Synchronous bitmap read: start the read with
 * ext4_read_block_bitmap_nowait() and then wait for and validate the
 * buffer.  Returns the buffer head, or an ERR_PTR on failure (the buffer
 * reference is dropped if the wait/validation step fails).
 */
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct buffer_head *bitmap_bh;
        int rc;

        bitmap_bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
        if (!IS_ERR(bitmap_bh)) {
                rc = ext4_wait_block_bitmap(sb, block_group, bitmap_bh);
                if (rc) {
                        put_bh(bitmap_bh);
                        bitmap_bh = ERR_PTR(rc);
                }
        }
        return bitmap_bh;
}

/**
 * ext4_has_free_clusters()
 * @sbi:        in-core super block structure.
 * @nclusters:  number of needed clusters
 * @flags:      flags from ext4_mb_new_blocks()
 *
 * Check if filesystem has nclusters free & available for allocation,
 * taking dirty (already promised) clusters and the reserved pools into
 * account.
 * On success return 1, return 0 on failure.
 */
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                                  s64 nclusters, unsigned int flags)
{
        struct percpu_counter *free_cnt = &sbi->s_freeclusters_counter;
        struct percpu_counter *dirty_cnt = &sbi->s_dirtyclusters_counter;
        s64 avail, dirty, all_rsv, fs_rsv;

        /* Cheap, approximate reads first. */
        avail = percpu_counter_read_positive(free_cnt);
        dirty = percpu_counter_read_positive(dirty_cnt);
        fs_rsv = atomic64_read(&sbi->s_resv_clusters);

        /*
         * r_blocks_count should always be multiple of the cluster ratio so
         * we are safe to do a plain bit shift only.
         */
        all_rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
                  fs_rsv;

        /* Close to the limit: pay for exact counter sums. */
        if (avail - (nclusters + all_rsv + dirty) <
                                        EXT4_FREECLUSTERS_WATERMARK) {
                avail = percpu_counter_sum_positive(free_cnt);
                dirty = percpu_counter_sum_positive(dirty_cnt);
        }

        /*
         * Enough space even after setting aside the dirty clusters and
         * every reserved cluster?
         */
        if (avail >= (all_rsv + nclusters + dirty))
                return 1;

        /* Hm, nope.  Are (enough) root reserved clusters available? */
        if ((uid_eq(sbi->s_resuid, current_fsuid()) ||
             (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
             (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
             capable(CAP_SYS_RESOURCE)) &&
            avail >= (nclusters + dirty + fs_rsv))
                return 1;

        /* No free blocks. Let's see if we can dip into reserved pool */
        if ((flags & EXT4_MB_USE_RESERVED) && avail >= (nclusters + dirty))
                return 1;

        return 0;
}

/*
 * Reserve @nclusters clusters by adding them to the dirty-clusters
 * counter, provided ext4_has_free_clusters() says they are available.
 * Returns 0 on success and -ENOSPC otherwise.
 */
int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                             s64 nclusters, unsigned int flags)
{
        if (!ext4_has_free_clusters(sbi, nclusters, flags))
                return -ENOSPC;

        percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
        return 0;
}

/**
 * ext4_should_retry_alloc() - check if a block allocation should be retried
 * @sb:                 superblock
 * @retries:            number of retry attempts made so far; incremented
 *                      here whenever a journal is present
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned while
 * attempting to allocate blocks.  If there's an indication that a pending
 * journal transaction might free some space and allow another attempt to
 * succeed, this function will wait for the current or committing transaction
 * to complete and then return TRUE.  Without a journal, or after more than
 * three attempts, it returns 0.
 */
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!sbi->s_journal)
                return 0;

        *retries += 1;
        if (*retries > 3) {
                percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
                return 0;
        }

        smp_mb();
        if (sbi->s_mb_free_pending != 0) {
                /*
                 * it's possible we've just missed a transaction commit here,
                 * so ignore the returned status
                 */
                jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
                (void) jbd2_journal_force_commit_nested(sbi->s_journal);
                return 1;
        }

        /*
         * if there's no indication that blocks are about to be freed it's
         * possible we just missed a transaction commit that did so
         */
        return ext4_has_free_clusters(sbi, 1, 0);
}

/*
 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
 *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block (filesystem wide)
 * @flags:              allocation flags, copied into the request
 * @count:              optional pointer to the number of clusters needed;
 *                      one is requested when NULL, and on return it holds
 *                      the length stored in the request by the allocator
 * @errp:               error code
 *
 * Return 1st allocated block number on success; the error, if any, is
 * stored through @errp.
 */
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                  ext4_fsblk_t goal, unsigned int flags,
                                  unsigned long *count, int *errp)
{
        struct ext4_allocation_request ar;
        ext4_fsblk_t first_blk;

        memset(&ar, 0, sizeof(ar));
        /* Fill with neighbour allocated blocks */
        ar.inode = inode;
        ar.goal = goal;
        ar.flags = flags;
        if (count)
                ar.len = *count;
        else
                ar.len = 1;

        first_blk = ext4_mb_new_blocks(handle, &ar, errp);
        if (count)
                *count = ar.len;

        /*
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metadata, but we do account for it.
         */
        if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
                dquot_alloc_block_nofail(inode,
                                EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
        }
        return first_blk;
}

/**
 * ext4_count_free_clusters() -- count filesystem free clusters
 * @sb:                superblock
 *
 * Adds up the number of free clusters recorded in each group descriptor,
 * skipping groups whose descriptor cannot be fetched and groups whose
 * group info is flagged as having a corrupt block bitmap.
 *
 * Return: the descriptor-based total.  When built with EXT4FS_DEBUG the
 * bitmaps are also read and counted, both totals are printed, and the
 * bitmap-based total is returned instead.
 */
ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
        unsigned int x;
        struct buffer_head *bitmap_bh = NULL;

        es = EXT4_SB(sb)->s_es;
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;

        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                /* Group info is only consulted when the table exists. */
                grp = NULL;
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
                /* Drop the previous group's bitmap before reading this one. */
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (IS_ERR(bitmap_bh)) {
                        /* Reset so the next brelse() is not given an ERR_PTR. */
                        bitmap_bh = NULL;
                        continue;
                }

                x = ext4_count_free(bitmap_bh->b_data,
                                    EXT4_CLUSTERS_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
                        i, ext4_free_group_clusters(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
               ", computed = %llu, %llu\n",
               EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
               desc_count, bitmap_count);
        return bitmap_count;
#else
        desc_count = 0;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                /* Group info is only consulted when the table exists. */
                grp = NULL;
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
        }

        return desc_count;
#endif
}

/*
 * Return 1 if @a can be reduced to exactly @b by repeated exact division
 * by @b (i.e. @a is @b, @b * @b, @b * @b * @b, ...), and 0 otherwise.
 */
static inline int test_root(ext4_group_t a, int b)
{
        /* Strip factors of b for as long as a is still larger than b. */
        while (a > b) {
                if (a % b)
                        return 0;
                a /= b;
        }
        return a == b;
}

/**
 *        ext4_bg_has_super - number of blocks used by the superblock in group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return the number of blocks used by the superblock (primary or backup)
 *        in this group.  Currently this will be only 0 or 1.
 */
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        /* Group 0 always counts one superblock. */
        if (group == 0)
                return 1;

        /* sparse_super2: only the two groups listed in s_backup_bgs. */
        if (ext4_has_feature_sparse_super2(sb))
                return group == le32_to_cpu(es->s_backup_bgs[0]) ||
                       group == le32_to_cpu(es->s_backup_bgs[1]);

        /* Group 1, or any group when sparse_super is off. */
        if ((group <= 1) || !ext4_has_feature_sparse_super(sb))
                return 1;

        /* Even groups never qualify; odd ones must pass test_root(). */
        if (!(group & 1))
                return 0;

        return test_root(group, 3) || test_root(group, 5) ||
               test_root(group, 7);
}

/*
 * Within a run of EXT4_DESC_PER_BLOCK(sb) consecutive groups, return 1
 * for the first, second and last group of the run, and 0 for the others.
 */
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
                                        ext4_group_t group)
{
        unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t run_first = metagroup * EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t run_last = run_first + EXT4_DESC_PER_BLOCK(sb) - 1;

        return group == run_first || group == run_first + 1 ||
               group == run_last;
}

/*
 * Descriptor-block count for a group outside the meta_bg scheme: 0 when
 * ext4_bg_has_super() reports no superblock for the group, otherwise
 * s_first_meta_bg (with the meta_bg feature) or s_gdb_count (without it).
 */
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
{
        if (!ext4_bg_has_super(sb, group))
                return 0;

        return ext4_has_feature_meta_bg(sb) ?
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg) :
                EXT4_SB(sb)->s_gdb_count;
}

/**
 *        ext4_bg_num_gdb - number of blocks used by the group table in group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return the number of blocks used by the group descriptor table
 *        (primary or backup) in this group.  In the future there may be a
 *        different number of descriptor blocks in each group.
 */
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
        unsigned long first_meta_bg =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
        unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);

        /* Only groups at or beyond s_first_meta_bg use the meta_bg rule. */
        if (ext4_has_feature_meta_bg(sb) && metagroup >= first_meta_bg)
                return ext4_bg_num_gdb_meta(sb, group);

        return ext4_bg_num_gdb_nometa(sb, group);
}

/*
 * This function returns the number of file system metadata blocks at
 * the beginning of a block group, including the reserved gdt blocks.
 */
unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                       ext4_group_t block_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        /* Check for superblock and gdt backups in this group */
        unsigned int count = ext4_bg_has_super(sb, block_group);
        bool meta_bg_group;

        meta_bg_group = ext4_has_feature_meta_bg(sb) &&
                        block_group >=
                                le32_to_cpu(sbi->s_es->s_first_meta_bg) *
                                sbi->s_desc_per_block;

        /* For META_BG_BLOCK_GROUPS */
        if (meta_bg_group)
                return count + ext4_bg_num_gdb(sb, block_group);

        /*
         * Otherwise descriptor and reserved gdt blocks are only counted
         * when the group has a superblock.
         */
        if (count)
                count += ext4_bg_num_gdb(sb, block_group) +
                         le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);

        return count;
}

/*
 * Same as ext4_num_base_meta_blocks(), but with the block count converted
 * to a cluster count through EXT4_NUM_B2C().
 */
static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
                                                ext4_group_t block_group)
{
        unsigned int meta_blocks = ext4_num_base_meta_blocks(sb, block_group);

        return EXT4_NUM_B2C(EXT4_SB(sb), meta_blocks);
}

/**
 *        ext4_inode_to_goal_block - return a hint for block allocation
 *        @inode: inode for block allocation
 *
 *        Return the ideal location to start allocating blocks for a
 *        newly created inode.
 */
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_group_t block_group;
        ext4_grpblk_t colour;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;

        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                /*
                 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
                 * block groups per flexgroup, reserve the first block
                 * group for directories and special files.  Regular
                 * files will start at the second block group.  This
                 * tends to speed up directory access and improves
                 * fsck times.
                 */
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (task_pid_nr(current) % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (task_pid_nr(current) % 16) *
                        ((last_block - bg_start) / 16);
        return bg_start + colour;
}



















    2 












    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/preempt.h>

/*
 * With CONFIG_TRACE_IRQFLAGS the function is provided out of line;
 * otherwise it is the inline below, which adds @cnt to the preempt count
 * and then issues a compiler barrier.  @ip is unused in the inline variant.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        barrier();
}
#endif

/*
 * Calls __local_bh_disable_ip() with the current instruction pointer
 * (_THIS_IP_) and SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_disable(void)
{
        __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

/*
 * Calls __local_bh_enable_ip() with the caller-supplied @ip and
 * SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_enable_ip(unsigned long ip)
{
        __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Counterpart of local_bh_disable(): calls __local_bh_enable_ip() with the
 * current instruction pointer (_THIS_IP_) and SOFTIRQ_DISABLE_OFFSET.
 */
static inline void local_bh_enable(void)
{
        __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

#endif /* _LINUX_BH_H */





















    4 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_STACK_H
#define _LINUX_SCHED_TASK_STACK_H

/*
 * task->stack (kernel stack) handling interfaces:
 */

#include <linux/sched.h>
#include <linux/magic.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK

/*
 * Return @task's ->stack pointer.
 *
 * When accessing the stack of a non-current task that might exit, use
 * try_get_task_stack() instead.  task_stack_page will return a pointer
 * that could get freed out from under you.
 */
static __always_inline void *task_stack_page(const struct task_struct *task)
{
        return task->stack;
}

#define setup_thread_stack(new,old)        do { } while(0)

/*
 * Return a pointer to the long at the far end of @task's stack: the last
 * long inside the THREAD_SIZE area when the stack grows up, otherwise the
 * start of the area (task->stack itself).
 */
static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
{
#ifdef CONFIG_STACK_GROWSUP
        return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
#else
        return task->stack;
#endif
}

#elif !defined(__HAVE_THREAD_FUNCTIONS)

#define task_stack_page(task)        ((void *)(task)->stack)

/*
 * Copy the thread_info of @org into the thread_info of @p, then point the
 * copy's ->task field back at @p.
 */
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
        *task_thread_info(p) = *task_thread_info(org);
        /* The struct copy above left ->task referring to @org. */
        task_thread_info(p)->task = p;
}

/*
 * Return the address of the last usable long on the stack.
 *
 * When the stack grows down, this is just above the thread
 * info struct. Going any lower will corrupt the threadinfo.
 *
 * When the stack grows up, this is the highest address.
 * Beyond that position, we corrupt data on the next page.
 */
static inline unsigned long *end_of_stack(struct task_struct *p)
{
#ifdef CONFIG_STACK_GROWSUP
        /* Last long of the THREAD_SIZE area that starts at the thread_info. */
        return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
#else
        /* First address past the thread_info structure. */
        return (unsigned long *)(task_thread_info(p) + 1);
#endif
}

#endif

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Take a reference on @tsk's stack unless its refcount has already reached
 * zero.  Returns the stack page on success and NULL when no reference
 * could be taken.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        if (!refcount_inc_not_zero(&tsk->stack_refcount))
                return NULL;

        return task_stack_page(tsk);
}

extern void put_task_stack(struct task_struct *tsk);
#else
/*
 * Variant used without CONFIG_THREAD_INFO_IN_TASK: no reference counting
 * is done here, the stack page is returned directly.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        return task_stack_page(tsk);
}

/* No-op counterpart of the non-refcounted try_get_task_stack() above. */
static inline void put_task_stack(struct task_struct *tsk) {}
#endif

#define task_stack_end_corrupted(task) \
                (*(end_of_stack(task)) != STACK_END_MAGIC)

/*
 * Return non-zero when @obj points inside the current task's stack area,
 * i.e. within THREAD_SIZE bytes starting at task_stack_page(current).
 */
static inline int object_is_on_stack(const void *obj)
{
        void *base = task_stack_page(current);

        if (obj < base)
                return 0;

        return obj < (base + THREAD_SIZE);
}

extern void thread_stack_cache_init(void);

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Return the number of bytes between end_of_stack(@p) and the first
 * non-zero long found when walking from there towards the stack's
 * start.
 */
static inline unsigned long stack_not_used(struct task_struct *p)
{
        unsigned long *limit = end_of_stack(p);
        unsigned long *cursor = limit;

        /* The first step moves off the canary; continue while words are zero. */
        do {
# ifdef CONFIG_STACK_GROWSUP
                cursor--;
# else
                cursor++;
# endif
        } while (*cursor == 0);

# ifdef CONFIG_STACK_GROWSUP
        return (unsigned long)limit - (unsigned long)cursor;
# else
        return (unsigned long)cursor - (unsigned long)limit;
# endif
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);

#ifndef __HAVE_ARCH_KSTACK_END
/*
 * Reliable end-of-stack detection (some APM bios versions misalign the
 * stack): round @addr up by one pointer less one byte, then test it
 * against the mask THREAD_SIZE - sizeof(void *). Returns 1 when no mask
 * bit is set, 0 otherwise.
 */
static inline int kstack_end(void *addr)
{
        unsigned long rounded = (unsigned long)addr + sizeof(void *) - 1;
        unsigned long mask = THREAD_SIZE - sizeof(void *);

        return (rounded & mask) == 0;
}
#endif

#endif /* _LINUX_SCHED_TASK_STACK_H */



































































    4 

































    1 
































































    4 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

/*
 * Out-of-line tracing hooks, declared only when CONFIG_DEBUG_PAGE_REF is
 * set. The inline helpers in this header call them after the matching
 * atomic operation, and only when the corresponding tracepoint is active.
 */
extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

/* Without CONFIG_DEBUG_PAGE_REF the tracepoint check is a constant false. */
#define page_ref_tracepoint_active(t) false

/*
 * Empty stand-ins for the tracing hooks, used when CONFIG_DEBUG_PAGE_REF
 * is not set. Every one of them ignores its arguments.
 */
static inline void __page_ref_set(struct page *page, int v) { }
static inline void __page_ref_mod(struct page *page, int v) { }
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_unless(struct page *page, int v, int u) { }
static inline void __page_ref_freeze(struct page *page, int v, int ret) { }
static inline void __page_ref_unfreeze(struct page *page, int v) { }

#endif

/* Read the reference count stored in @page itself. */
static inline int page_ref_count(struct page *page)
{
        int refs = atomic_read(&page->_refcount);

        return refs;
}

static inline int page_count(struct page *page)
{
        return atomic_read(&compound_head(page)->_refcount);
}

/*
 * Store @v as @page's reference count, then report it through
 * __page_ref_set() when the page_ref_set tracepoint is active.
 */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);

        if (!page_ref_tracepoint_active(page_ref_set))
                return;
        __page_ref_set(page, v);
}

/*
 * Give @page a reference count of 1. Used to set up the count before
 * the page is freed into the page allocator for the first time (boot
 * or memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
        const int initial_refs = 1;

        set_page_count(page, initial_refs);
}

/*
 * Atomically add @nr to @page's reference count; report +@nr through
 * __page_ref_mod() when the page_ref_mod tracepoint is active.
 */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, nr);
}

/*
 * Atomically subtract @nr from @page's reference count; report -@nr
 * through __page_ref_mod() when the page_ref_mod tracepoint is active.
 */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, -nr);
}

/*
 * Atomically subtract @nr from @page's reference count and return what
 * atomic_sub_return() yields. The change (-@nr) and that value are
 * traced when the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_sub_return(struct page *page, int nr)
{
        int remaining = atomic_sub_return(nr, &page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -nr, remaining);

        return remaining;
}

/*
 * Atomically increment @page's reference count; report +1 through
 * __page_ref_mod() when the page_ref_mod tracepoint is active.
 */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, 1);
}

/*
 * Atomically decrement @page's reference count; report -1 through
 * __page_ref_mod() when the page_ref_mod tracepoint is active.
 */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, -1);
}

/*
 * Atomically subtract @nr from @page's reference count and return the
 * result of atomic_sub_and_test(). The change (-@nr) and that result
 * are traced when the page_ref_mod_and_test tracepoint is active.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        int outcome = atomic_sub_and_test(nr, &page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -nr, outcome);

        return outcome;
}

/*
 * Atomically increment @page's reference count and return what
 * atomic_inc_return() yields. The change (+1) and that value are
 * traced when the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_inc_return(struct page *page)
{
        int updated = atomic_inc_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, 1, updated);

        return updated;
}

/*
 * Atomically decrement @page's reference count and return the result
 * of atomic_dec_and_test(). The change (-1) and that result are traced
 * when the page_ref_mod_and_test tracepoint is active.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        int outcome = atomic_dec_and_test(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -1, outcome);

        return outcome;
}

/*
 * Atomically decrement @page's reference count and return what
 * atomic_dec_return() yields. The change (-1) and that value are
 * traced when the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_dec_return(struct page *page)
{
        int updated = atomic_dec_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -1, updated);

        return updated;
}

/*
 * Run atomic_add_unless() on @page's reference count with @nr and @u
 * and return its result. When the page_ref_mod_unless tracepoint is
 * active, @nr and that result (not @u) are passed to the hook.
 */
static inline int page_ref_add_unless(struct page *page, int nr, int u)
{
        int added = atomic_add_unless(&page->_refcount, nr, u);

        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, added);

        return added;
}

/*
 * Try to swap a reference count of exactly @count for 0 with
 * atomic_cmpxchg(). Returns non-zero when the old value was @count
 * (the swap took place) and 0 otherwise. The attempt is traced when
 * the page_ref_freeze tracepoint is active.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        int previous = atomic_cmpxchg(&page->_refcount, count, 0);
        int frozen = likely(previous == count);

        if (page_ref_tracepoint_active(page_ref_freeze))
                __page_ref_freeze(page, count, frozen);

        return frozen;
}

/*
 * Store @count as @page's reference count with release ordering.
 * The debug assertions require the current page_count() to be 0 and
 * @count to be non-zero. The store is traced when the
 * page_ref_unfreeze tracepoint is active.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);

        if (!page_ref_tracepoint_active(page_ref_unfreeze))
                return;
        __page_ref_unfreeze(page, count);
}

#endif





































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * The proc filesystem constants/structures
 */
#ifndef _LINUX_PROC_FS_H
#define _LINUX_PROC_FS_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/fs.h>

struct proc_dir_entry;
struct seq_file;
struct seq_operations;

/* Bit values for proc entry flags; each enumerator is a distinct bit. */
enum {
        /*
         * All /proc entries using this ->proc_ops instance are never removed.
         *
         * If in doubt, ignore this flag.
         *
         * When MODULE is defined the value is 0, so the flag cannot be set
         * from that code.
         */
#ifdef MODULE
        PROC_ENTRY_PERMANENT                = 0U,
#else
        PROC_ENTRY_PERMANENT                = 1U << 0,
#endif

        /*
         * NOTE(review): named after the proc_ops members proc_read_iter and
         * proc_compat_ioctl; presumably record that the member is provided.
         * Confirm against the code that sets and tests them.
         */
        PROC_ENTRY_proc_read_iter        = 1U << 1,
        PROC_ENTRY_proc_compat_ioctl        = 1U << 2,
};

/*
 * Callback table for /proc entries. Each member other than proc_flags is
 * a function pointer. proc_compat_ioctl exists only with CONFIG_COMPAT.
 * The layout is subject to __randomize_layout, so initialise it by member
 * name rather than by position.
 */
struct proc_ops {
        /* presumably a mask of PROC_ENTRY_* values — confirm against users */
        unsigned int proc_flags;
        int        (*proc_open)(struct inode *, struct file *);
        ssize_t        (*proc_read)(struct file *, char __user *, size_t, loff_t *);
        ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *);
        ssize_t        (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
        loff_t        (*proc_lseek)(struct file *, loff_t, int);
        int        (*proc_release)(struct inode *, struct file *);
        __poll_t (*proc_poll)(struct file *, struct poll_table_struct *);
        long        (*proc_ioctl)(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
        long        (*proc_compat_ioctl)(struct file *, unsigned int, unsigned long);
#endif
        int        (*proc_mmap)(struct file *, struct vm_area_struct *);
        unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
} __randomize_layout;

/*
 * Definitions for the hide_pid field of struct proc_fs_info.
 * Note that the values are not consecutive: the last one is 4, not 3.
 */
enum proc_hidepid {
        HIDEPID_OFF          = 0,
        HIDEPID_NO_ACCESS = 1,
        HIDEPID_INVISIBLE = 2,
        HIDEPID_NOT_PTRACEABLE = 4, /* Limit pids to only ptraceable pids */
};

/*
 * Definitions for the proc mount option pidonly; stored in the pidonly
 * field of struct proc_fs_info.
 */
enum proc_pidonly {
        PROC_PIDONLY_OFF = 0,
        PROC_PIDONLY_ON  = 1,
};

/*
 * Per-superblock procfs state; proc_sb_info() returns it from
 * sb->s_fs_info.
 */
struct proc_fs_info {
        struct pid_namespace *pid_ns;    /* returned by proc_pid_ns() */
        struct dentry *proc_self;        /* For /proc/self */
        struct dentry *proc_thread_self; /* For /proc/thread-self */
        kgid_t pid_gid;
        enum proc_hidepid hide_pid;      /* one of the HIDEPID_* values */
        enum proc_pidonly pidonly;       /* PROC_PIDONLY_OFF or PROC_PIDONLY_ON */
};

/* Return the proc_fs_info stored in @sb's ->s_fs_info. */
static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)
{
        struct proc_fs_info *fs_info = sb->s_fs_info;

        return fs_info;
}

#ifdef CONFIG_PROC_FS

typedef int (*proc_write_t)(struct file *, char *, size_t);

extern void proc_root_init(void);
extern void proc_flush_pid(struct pid *);

extern struct proc_dir_entry *proc_symlink(const char *,
                struct proc_dir_entry *, const char *);
struct proc_dir_entry *_proc_mkdir(const char *, umode_t, struct proc_dir_entry *, void *, bool);
extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *);
extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
                                              struct proc_dir_entry *, void *);
extern struct proc_dir_entry *proc_mkdir_mode(const char *, umode_t,
                                              struct proc_dir_entry *);
struct proc_dir_entry *proc_create_mount_point(const char *name);

struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data);
/*
 * Shorthands for proc_create_seq_private() with a state_size of 0;
 * proc_create_seq() additionally passes NULL as the data pointer.
 */
#define proc_create_seq_data(name, mode, parent, ops, data) \
        proc_create_seq_private(name, mode, parent, ops, 0, data)
#define proc_create_seq(name, mode, parent, ops) \
        proc_create_seq_private(name, mode, parent, ops, 0, NULL)
struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data);
/* Shorthand for proc_create_single_data() with a NULL data pointer. */
#define proc_create_single(name, mode, parent, show) \
        proc_create_single_data(name, mode, parent, show, NULL)
 
extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
                                               struct proc_dir_entry *,
                                               const struct proc_ops *,
                                               void *);

struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops);
extern void proc_set_size(struct proc_dir_entry *, loff_t);
extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
extern void *PDE_DATA(const struct inode *);
extern void *proc_get_parent_data(const struct inode *);
extern void proc_remove(struct proc_dir_entry *);
extern void remove_proc_entry(const char *, struct proc_dir_entry *);
extern int remove_proc_subtree(const char *, struct proc_dir_entry *);

struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data);
/* Shorthand for proc_create_net_data() with a NULL data pointer. */
#define proc_create_net(name, mode, parent, ops, state_size) \
        proc_create_net_data(name, mode, parent, ops, state_size, NULL)
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data);
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
                                                  struct proc_dir_entry *parent,
                                                  const struct seq_operations *ops,
                                                  proc_write_t write,
                                                  unsigned int state_size, void *data);
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
                                                    struct proc_dir_entry *parent,
                                                    int (*show)(struct seq_file *, void *),
                                                    proc_write_t write,
                                                    void *data);
extern struct pid *tgid_pidfd_to_pid(const struct file *file);

struct bpf_iter_aux_info;
extern int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux);
extern void bpf_iter_fini_seq_net(void *priv_data);

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
 * provide proc_pid_arch_status() definition.
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task);
#endif /* CONFIG_PROC_PID_ARCH_STATUS */

#else /* CONFIG_PROC_FS */

/* !CONFIG_PROC_FS stand-in: nothing to initialise. */
static inline void proc_root_init(void) { }

/* !CONFIG_PROC_FS stand-in: @pid is ignored. */
static inline void proc_flush_pid(struct pid *pid) { }

/*
 * !CONFIG_PROC_FS stand-ins: each ignores its arguments and returns NULL.
 */
static inline struct proc_dir_entry *proc_symlink(const char *name,
                struct proc_dir_entry *parent, const char *dest)
{
        return NULL;
}

static inline struct proc_dir_entry *proc_mkdir(const char *name,
                struct proc_dir_entry *parent)
{
        return NULL;
}

static inline struct proc_dir_entry *proc_create_mount_point(const char *name)
{
        return NULL;
}
/*
 * !CONFIG_PROC_FS stand-ins for the mkdir variants: each ignores its
 * arguments and returns NULL.
 */
static inline struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
                struct proc_dir_entry *parent, void *data, bool force_lookup)
{
        return NULL;
}

static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
                umode_t mode, struct proc_dir_entry *parent, void *data)
{
        return NULL;
}

static inline struct proc_dir_entry *proc_mkdir_mode(const char *name,
                umode_t mode, struct proc_dir_entry *parent)
{
        return NULL;
}
/*
 * !CONFIG_PROC_FS stand-ins for the proc_create*() family. Each one
 * ignores its arguments (they are not evaluated) and yields NULL.
 */
#define proc_create_seq_private(name, mode, parent, ops, size, data) ({NULL;})
#define proc_create_seq_data(name, mode, parent, ops, data) ({NULL;})
#define proc_create_seq(name, mode, parent, ops) ({NULL;})
#define proc_create_single(name, mode, parent, show) ({NULL;})
#define proc_create_single_data(name, mode, parent, show, data) ({NULL;})
#define proc_create(name, mode, parent, proc_ops) ({NULL;})
#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;})

/* !CONFIG_PROC_FS stand-ins: the setters do nothing. */
static inline void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
}

static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
{
}

/*
 * !CONFIG_PROC_FS stand-ins: these two invoke BUG() when called; the
 * return statement after it only satisfies the return type.
 */
static inline void *PDE_DATA(const struct inode *inode)
{
        BUG();
        return NULL;
}

static inline void *proc_get_parent_data(const struct inode *inode)
{
        BUG();
        return NULL;
}

/*
 * !CONFIG_PROC_FS stand-ins for entry removal: nothing is removed and
 * remove_proc_subtree() always returns 0.
 */
static inline void proc_remove(struct proc_dir_entry *de)
{
}
#define remove_proc_entry(name, parent) do {} while (0)
static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
{
        return 0;
}

/*
 * !CONFIG_PROC_FS stand-ins for the proc_create_net*() family. Each one
 * ignores its arguments (they are not evaluated) and yields NULL.
 * The parameter lists use the same names and order as the CONFIG_PROC_FS
 * declarations, so both variants read identically at a call site.
 */
#define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;})
#define proc_create_net_data_write(name, mode, parent, ops, write, state_size, data) ({NULL;})
#define proc_create_net(name, mode, parent, ops, state_size) ({NULL;})
#define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
#define proc_create_net_single_write(name, mode, parent, show, write, data) ({NULL;})

/* !CONFIG_PROC_FS stand-in: always returns ERR_PTR(-EBADF). */
static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
{
        struct pid *err = ERR_PTR(-EBADF);

        return err;
}

#endif /* CONFIG_PROC_FS */

struct net;

/*
 * Call _proc_mkdir() for @name under @parent with mode 0, @net as the
 * data pointer and force_lookup set to true; return its result.
 */
static inline struct proc_dir_entry *proc_net_mkdir(
        struct net *net, const char *name, struct proc_dir_entry *parent)
{
        struct proc_dir_entry *dir;

        dir = _proc_mkdir(name, 0, parent, net, true);
        return dir;
}

struct ns_common;
int open_related_ns(struct ns_common *ns,
                   struct ns_common *(*get_ns)(struct ns_common *ns));

/* get the associated pid namespace for a file in procfs */
static inline struct pid_namespace *proc_pid_ns(struct super_block *sb)
{
        return proc_sb_info(sb)->pid_ns;
}

bool proc_ns_file(const struct file *file);

#endif /* _LINUX_PROC_FS_H */






































    2 

    2 
    2 


    2 























































































































    2 

    2 




    2 

    2 






    2 



    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/tracehook.h>

/*
 * Sentinel: task_work_run() installs its address in ->task_works once an
 * exiting task's list is empty, and task_work_add() returns -ESRCH when
 * it finds it there.
 */
static struct callback_head work_exited; /* all we need is ->next == NULL */

/**
 * task_work_add - ask the @task to execute @work->func()
 * @task: the task which should run the callback
 * @work: the callback to run
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
 * is @TWA_RESUME or @TWA_SIGNAL; with @TWA_NONE no notification is sent.
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
 * task and run the task_work. @TWA_RESUME work is run only when the task
 * exits the kernel and returns to user mode, or before entering guest
 * mode. Fails if the @task is exiting/exited and thus it can't process
 * this @work. Otherwise @work->func() will be called when the @task goes
 * through one of the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
 * in that case.
 *
 * Note: there is no ordering guarantee on works queued here. The task_work
 * list is LIFO.
 *
 * RETURNS:
 * 0 if succeeds or -ESRCH.
 */
int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
{
        struct callback_head *head;

        /*
         * Lockless push: link @work in front of the current head and
         * retry if ->task_works changed before the cmpxchg() landed.
         */
        do {
                head = READ_ONCE(task->task_works);
                /* task_work_run() has already closed the list. */
                if (unlikely(head == &work_exited))
                        return -ESRCH;
                work->next = head;
        } while (cmpxchg(&task->task_works, head, work) != head);

        switch (notify) {
        case TWA_NONE:
                break;
        case TWA_RESUME:
                set_notify_resume(task);
                break;
        case TWA_SIGNAL:
                set_notify_signal(task);
                break;
        default:
                /* Unknown mode: the work stays queued, only a warning is raised. */
                WARN_ON_ONCE(1);
                break;
        }

        return 0;
}

/**
 * task_work_cancel_match - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @match: match function to call
 * @data: opaque pointer passed as the second argument to @match
 *
 * Walk @task's pending list from the head and unlink the first entry for
 * which @match returns true.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_match(struct task_struct *task,
                       bool (*match)(struct callback_head *, void *data),
                       void *data)
{
        struct callback_head **pprev = &task->task_works;
        struct callback_head *work;
        unsigned long flags;

        /* Fast path: nothing is queued. */
        if (likely(!task->task_works))
                return NULL;
        /*
         * If cmpxchg() fails we continue without updating pprev.
         * Either we raced with task_work_add() which added the
         * new entry before this work, we will find it again. Or
         * we raced with task_work_run(), *pprev == NULL/exited.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        while ((work = READ_ONCE(*pprev))) {
                if (!match(work, data))
                        pprev = &work->next;
                else if (cmpxchg(pprev, work, work->next) == work)
                        break;
        }
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        /* NULL here means the walk reached the end without a match. */
        return work;
}

/* Match callback: true when @cb's ->func equals the pointer in @data. */
static bool task_work_func_match(struct callback_head *cb, void *data)
{
        if (cb->func == data)
                return true;

        return false;
}

/**
 * task_work_cancel_func - cancel a pending work matching a function added by task_work_add()
 * @task: the task which should execute the func's work
 * @func: identifies the func to match with a work to remove
 *
 * Search @task's pending works for one whose ->func == @func, via
 * task_work_cancel_match(), and remove it from the queue.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
        struct callback_head *found;

        found = task_work_cancel_match(task, task_work_func_match, func);
        return found;
}

/* Match callback: true when @cb is the very pointer passed in @data. */
static bool task_work_match(struct callback_head *cb, void *data)
{
        if (cb != data)
                return false;

        return true;
}

/**
 * task_work_cancel - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @cb: the callback to remove if queued
 *
 * Look for @cb itself on @task's queue and unlink it when present.
 *
 * RETURNS:
 * True if the callback was queued and got cancelled, false otherwise.
 */
bool task_work_cancel(struct task_struct *task, struct callback_head *cb)
{
        return task_work_cancel_match(task, task_work_match, cb) == cb;
}

/**
 * task_work_run - execute the works added by task_work_add()
 *
 * Flush the pending works. Should be used by the core kernel code.
 * Called before the task returns to the user-mode or stops, or when
 * it exits. In the latter case task_work_add() can no longer add the
 * new work after task_work_run() returns.
 *
 * Operates on current. The outer loop repeats until the list is found
 * empty, so works queued by a callback are run as well.
 */
void task_work_run(void)
{
        struct task_struct *task = current;
        struct callback_head *work, *head, *next;

        for (;;) {
                /*
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
                do {
                        /*
                         * Detach the whole list by swapping in NULL, or, when
                         * the list is empty and the task has PF_EXITING set,
                         * swap in &work_exited to close it.
                         */
                        head = NULL;
                        work = READ_ONCE(task->task_works);
                        if (!work) {
                                if (task->flags & PF_EXITING)
                                        head = &work_exited;
                                else
                                        break;
                        }
                } while (cmpxchg(&task->task_works, work, head) != work);

                /* Nothing was detached: done. */
                if (!work)
                        break;
                /*
                 * Synchronize with task_work_cancel_match(). It can not remove
                 * the first entry == work, cmpxchg(task_works) must fail.
                 * But it can remove another entry from the ->next list.
                 */
                raw_spin_lock_irq(&task->pi_lock);
                raw_spin_unlock_irq(&task->pi_lock);

                do {
                        /* Read ->next before the call: it is not touched afterwards. */
                        next = work->next;
                        work->func(work);
                        work = next;
                        cond_resched();
                } while (work);
        }
}













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>

struct ctl_table;
struct user_struct;
struct mmu_gather;

/*
 * Fallback used when is_hugepd is not already defined: hugepd_t wraps a
 * single unsigned long, is_hugepd() is the constant 0 and __hugepd()
 * builds a hugepd_t from a value.
 */
#ifndef is_hugepd
typedef struct { unsigned long pd; } hugepd_t;
#define is_hugepd(hugepd) (0)
#define __hugepd(x) ((hugepd_t) { (x) })
#endif

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mempolicy.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>

/*
 * Accounting state for a huge page subpool tied to one hstate, with
 * optional minimum and maximum sizes.
 */
struct hugepage_subpool {
        spinlock_t lock;
        long count;
        long max_hpages;        /* Maximum huge pages or -1 if no maximum. */
        long used_hpages;        /* Used count against maximum, includes */
                                /* both alloced and reserved pages. */
        struct hstate *hstate;
        long min_hpages;        /* Minimum huge pages or -1 if no minimum. */
        long rsv_hpages;        /* Pages reserved against global pool to */
                                /* satisfy minimum size. */
};

/*
 * Reservation map: a kref-counted container holding a list of regions
 * (struct file_region) together with a spinlock. With
 * CONFIG_CGROUP_HUGETLB it also carries the data needed to uncharge
 * reservations of private mappings.
 */
struct resv_map {
        struct kref refs;
        spinlock_t lock;
        struct list_head regions;
        long adds_in_progress;
        struct list_head region_cache;
        long region_cache_count;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On private mappings, the counter to uncharge reservations is stored
         * here. If these fields are 0, then either the mapping is shared, or
         * cgroup accounting is disabled for this resv_map.
         */
        struct page_counter *reservation_counter;
        unsigned long pages_per_hpage;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
/* One [from, to) range of huge page indices tracked in a resv_map. */
struct file_region {
        struct list_head link;
        long from;      /* first huge page index of the region (inclusive) */
        long to;        /* first huge page index past the region (exclusive) */
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On shared mappings, each reserved region appears as a struct
         * file_region in resv_map. These fields hold the info needed to
         * uncharge each reservation.
         */
        struct page_counter *reservation_counter;
        struct cgroup_subsys_state *css;
#endif
};

extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);

extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
/* Iterate @h over hstates[0] .. hstates[hugetlb_max_hstate - 1]. */
#define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         struct page **, struct vm_area_struct **,
                         unsigned long *, unsigned long *, long, unsigned int,
                         int *);
void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *);
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page);
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct page *ref_page);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo(void);
unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
                                struct vm_area_struct *dst_vma,
                                unsigned long dst_addr,
                                unsigned long src_addr,
                                struct page **pagep);
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
int isolate_hugetlb(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);

struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);

extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages;

/* arch callbacks */

pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz);
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long *addr, pte_t *ptep);
void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
                            unsigned long address, hugepd_t hpd,
                            int flags, int pdshift);
struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
                                 int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
                                pud_t *pud, int flags);
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
                             pgd_t *pgd, int flags);

int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot);

bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);

#else /* !CONFIG_HUGETLB_PAGE */

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always reports 0 pages. */
static inline unsigned long hugetlb_total_pages(void)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline struct address_space *hugetlb_page_mapping_lock_write(
                                                        struct page *hpage)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0. */
static inline int huge_pmd_unshare(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
                struct vm_area_struct *vma)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: leaves @start and @end untouched. */
static inline void adjust_range_if_pmd_sharing_possible(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called;
 * the trailing return 0 follows the BUG().
 */
static inline long follow_hugetlb_page(struct mm_struct *mm,
                        struct vm_area_struct *vma, struct page **pages,
                        struct vm_area_struct **vmas, unsigned long *position,
                        unsigned long *nr_pages, long i, unsigned int flags,
                        int *nonblocking)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns ERR_PTR(-EINVAL). */
static inline struct page *follow_huge_addr(struct mm_struct *mm,
                                        unsigned long address, int write)
{
        return ERR_PTR(-EINVAL);
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called;
 * the trailing return 0 follows the BUG().
 */
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
                        struct mm_struct *src, struct vm_area_struct *vma)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: writes nothing to @m. */
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: leaves @buf untouched, returns 0. */
static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void hugetlb_show_meminfo(void)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
                                unsigned long address, hugepd_t hpd, int flags,
                                int pdshift)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
                                unsigned long address, int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline struct page *follow_huge_pud(struct mm_struct *mm,
                                unsigned long address, pud_t *pud, int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline struct page *follow_huge_pgd(struct mm_struct *mm,
                                unsigned long address, pgd_t *pgd, int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always fails with -EINVAL. */
static inline int prepare_hugepage_range(struct file *file,
                                unsigned long addr, unsigned long len)
{
        return -EINVAL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0. */
static inline int pmd_huge(pmd_t pmd)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0. */
static inline int pud_huge(pud_t pud)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0. */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called. */
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        BUG();
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called;
 * the trailing return 0 follows the BUG().
 */
static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                                                pte_t *dst_pte,
                                                struct vm_area_struct *dst_vma,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
                                                struct page **pagep)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns NULL. */
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always fails with -EBUSY. */
static inline int isolate_hugetlb(struct page *page, struct list_head *list)
{
        return -EBUSY;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void putback_active_hugepage(struct page *page)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void move_hugetlb_state(struct page *oldpage,
                                        struct page *newpage, int reason)
{
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: changes nothing and returns 0. */
static inline unsigned long hugetlb_change_protection(
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned long end, pgprot_t newprot)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called. */
static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, struct page *ref_page)
{
        BUG();
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called. */
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, struct page *ref_page)
{
        BUG();
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: hits BUG() if it is ever called;
 * the trailing return 0 follows the BUG().
 */
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned int flags)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}

#endif /* !CONFIG_HUGETLB_PAGE */
/*
 * Huge pages at the page global directory (pgd) or p4d level. An
 * architecture that supports huge pages at these levels needs to
 * define pgd_huge()/p4d_huge() itself; the defaults below evaluate to 0.
 */
#ifndef pgd_huge
#define pgd_huge(x)        0
#endif
#ifndef p4d_huge
#define p4d_huge(x)        0
#endif

/*
 * Default pgd_write(), used only when pgd_write is not already defined:
 * hits BUG() if it is ever called; the trailing return 0 follows the BUG().
 */
#ifndef pgd_write
static inline int pgd_write(pgd_t pgd)
{
        BUG();
        return 0;
}
#endif

#define HUGETLB_ANON_FILE "anon_hugepage"

/*
 * Kinds of hugetlb inode, distinguished by which accounting rules apply.
 * NOTE(review): presumably the values passed as creat_flags to
 * hugetlb_file_setup() - confirm against callers.
 */
enum {
        /*
         * The file will be used as an shm file so shmfs accounting rules
         * apply
         */
        HUGETLB_SHMFS_INODE     = 1,
        /*
         * The file is being created on the internal vfs mount and shmfs
         * accounting rules do not apply
         */
        HUGETLB_ANONHUGE_INODE  = 2,
};

#ifdef CONFIG_HUGETLBFS
/*
 * Per-superblock hugetlbfs state; HUGETLBFS_SB() retrieves it from
 * sb->s_fs_info.
 */
struct hugetlbfs_sb_info {
        long        max_inodes;   /* inodes allowed */
        long        free_inodes;  /* inodes free */
        spinlock_t        stat_lock;
        struct hstate *hstate;  /* returned by hstate_inode() for inodes on this sb */
        struct hugepage_subpool *spool;
        kuid_t        uid;
        kgid_t        gid;
        umode_t mode;
};

/* Return the hugetlbfs private data stored in @sb->s_fs_info. */
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbinfo = sb->s_fs_info;

        return sbinfo;
}

/*
 * hugetlbfs inode wrapper: embeds the VFS inode, so HUGETLBFS_I() can map
 * a struct inode pointer back to this structure with container_of().
 */
struct hugetlbfs_inode_info {
        struct shared_policy policy;
        struct inode vfs_inode;
        unsigned int seals;
};

/*
 * Map @inode back to the hugetlbfs_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        struct hugetlbfs_inode_info *info;

        info = container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
        return info;
}

extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                struct user_struct **user, int creat_flags,
                                int page_size_log);

/*
 * True when @file uses hugetlbfs_file_operations, or otherwise when
 * is_file_shm_hugepages() says so.
 */
static inline bool is_file_hugepages(struct file *file)
{
        return file->f_op == &hugetlbfs_file_operations ||
               is_file_shm_hugepages(file);
}

static inline struct hstate *hstate_inode(struct inode *i)
{
        return HUGETLBFS_SB(i->i_sb)->hstate;
}
#else /* !CONFIG_HUGETLBFS */

#define is_file_hugepages(file)                        false
/* Stub for !CONFIG_HUGETLBFS builds: always fails with ERR_PTR(-ENOSYS). */
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
                struct user_struct **user, int creat_flags,
                int page_size_log)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for !CONFIG_HUGETLBFS builds: always returns NULL. */
static inline struct hstate *hstate_inode(struct inode *i)
{
        return NULL;
}
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags);
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

#ifdef CONFIG_HUGETLB_PAGE

#define HSTATE_NAME_LEN 32
/*
 * Defines one hugetlb page size.
 *
 * The page size is PAGE_SIZE << order (see huge_page_size()); mask is the
 * value returned by huge_page_mask().
 */
struct hstate {
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;
        unsigned long mask;
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
        unsigned long free_huge_pages;
        unsigned long resv_huge_pages;
        unsigned long surplus_huge_pages;
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        /* The arrays below hold one entry per NUMA node id (MAX_NUMNODES). */
        struct list_head hugepage_freelists[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_CGROUP_HUGETLB
        /* cgroup control files */
        struct cftype cgroup_files_dfl[7];
        struct cftype cgroup_files_legacy[9];
#endif
        char name[HSTATE_NAME_LEN];
};

/*
 * List node pairing a boot-time huge page with its hstate.
 * NOTE(review): presumably linked on huge_boot_pages - confirm.
 */
struct huge_bootmem_page {
        struct list_head list;
        struct hstate *hstate;
};

struct page *alloc_huge_page(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask, gfp_t gfp_mask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                        pgoff_t idx);

/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h);
int __init alloc_bootmem_huge_page(struct hstate *h);

void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
struct hstate *size_to_hstate(unsigned long size);

#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif

extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;

#define default_hstate (hstates[default_hstate_idx])

/* Return the hstate of the inode backing @f. */
static inline struct hstate *hstate_file(struct file *f)
{
        struct inode *inode = file_inode(f);

        return hstate_inode(inode);
}

/*
 * Look up the hstate for a page size given as log2(bytes).
 *
 * @page_size_log: 0 selects the default hstate; otherwise the hstate is
 *                 looked up with size_to_hstate(1UL << page_size_log).
 *
 * Returns NULL when @page_size_log is not a valid shift count for an
 * unsigned long (negative, or >= BITS_PER_LONG); otherwise returns whatever
 * size_to_hstate() returns for that size.
 */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        if (!page_size_log)
                return &default_hstate;

        /*
         * Shifting by a negative or too-large count is undefined behaviour,
         * so reject both ends of the range before shifting.
         */
        if (page_size_log < 0 || page_size_log >= BITS_PER_LONG)
                return NULL;

        return size_to_hstate(1UL << page_size_log);
}

/* Return the hstate of the file mapped by @vma (vma->vm_file). */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        return hstate_file(file);
}

/* Size in bytes of one huge page of @h: PAGE_SIZE shifted left by h->order. */
static inline unsigned long huge_page_size(struct hstate *h)
{
        unsigned long base = PAGE_SIZE;

        return base << h->order;
}

extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);

extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);

/* Return the mask field stored in @h. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return h->mask;
}

/* Return the order field stored in @h. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return h->order;
}

/* Shift of a huge page of @h: PAGE_SHIFT plus the hstate's order. */
static inline unsigned int huge_page_shift(struct hstate *h)
{
        unsigned int order = h->order;

        return order + PAGE_SHIFT;
}

/* A hstate is "gigantic" when its order is at least MAX_ORDER. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        unsigned int order = huge_page_order(h);

        return order >= MAX_ORDER;
}

/*
 * Number of base pages in one huge page of @h, i.e. 2^order.
 *
 * The shift is done on an unsigned constant so that it matches the unsigned
 * return type and cannot overflow a signed int for large orders.
 */
static inline unsigned int pages_per_huge_page(struct hstate *h)
{
        return 1U << h->order;
}

/* Number of 512-byte units in one huge page of @h. */
static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
        unsigned long bytes = huge_page_size(h);

        return bytes / 512;
}

#include <asm/hugetlb.h>

/*
 * Default is_hugepage_only_range(), used when <asm/hugetlb.h> (included
 * just above) did not define one: always returns 0.  The self-referential
 * #define marks the name as provided.
 */
#ifndef is_hugepage_only_range
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}
#define is_hugepage_only_range is_hugepage_only_range
#endif

/*
 * Default arch_clear_hugepage_flags(), used when no definition exists yet:
 * does nothing.
 */
#ifndef arch_clear_hugepage_flags
static inline void arch_clear_hugepage_flags(struct page *page) { }
#define arch_clear_hugepage_flags arch_clear_hugepage_flags
#endif

/*
 * Default arch_make_huge_pte(), used when no definition exists yet:
 * returns @entry unchanged and ignores the other arguments.
 */
#ifndef arch_make_huge_pte
static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
                                       struct page *page, int writable)
{
        return entry;
}
#endif

/*
 * Return the hstate matching the size of @page.  VM_BUG_ON_PAGE() is applied
 * first with the condition that @page is not a huge page.
 */
static inline struct hstate *page_hstate(struct page *page)
{
        unsigned long bytes;

        VM_BUG_ON_PAGE(!PageHuge(page), page);
        bytes = page_size(page);
        return size_to_hstate(bytes);
}

/* Page shift of hstates[@index]: that hstate's order plus PAGE_SHIFT. */
static inline unsigned int hstate_index_to_shift(unsigned int index)
{
        unsigned int order = hstates[index].order;

        return order + PAGE_SHIFT;
}

/* Index of @h within the global hstates[] array. */
static inline int hstate_index(struct hstate *h)
{
        return (int)(h - hstates);
}

extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
                                    unsigned long end_pfn);

/*
 * arch_hugetlb_migration_supported() - can huge pages of @h be migrated?
 *
 * With CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION, and unless a definition
 * already exists, the default accepts huge pages whose shift equals
 * PMD_SHIFT, PUD_SHIFT or PGDIR_SHIFT.  Without the config option the
 * answer is always false.
 */
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
#ifndef arch_hugetlb_migration_supported
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        unsigned int shift = huge_page_shift(h);

        return shift == PMD_SHIFT ||
               shift == PUD_SHIFT ||
               shift == PGDIR_SHIFT;
}
#endif
#else
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        return false;
}
#endif

/* Forwards to arch_hugetlb_migration_supported() for @h. */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return arch_hugetlb_migration_supported(h);
}

/*
 * Movability check is different as compared to migration check.
 * It determines whether or not a huge page should be placed in
 * the movable zone. Movability of any huge page should be
 * required only if huge page size is supported for migration.
 * There won't be any reason for the huge page to be movable if
 * it is not migratable to start with. Also the size of the huge
 * page should be large enough to be placed under a movable zone
 * and still feasible enough to be migratable. Just the presence
 * in movable zone does not make the migration feasible.
 *
 * So even though large huge page sizes like the gigantic ones
 * are migratable they should not be movable because it's not
 * feasible to migrate them from movable zone.
 */
/* Movable means: migration is supported for @h and @h is not gigantic. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return hugepage_migration_supported(h) && !hstate_is_gigantic(h);
}

/*
 * Base allocation mask for @h: GFP_HIGHUSER_MOVABLE when huge pages of this
 * size are movable, GFP_HIGHUSER otherwise.
 */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE
                                             : GFP_HIGHUSER;
}

/*
 * Start from htlb_alloc_mask(@h) and carry over only two bits from the
 * caller's @gfp_mask: __GFP_THISNODE (callers that want to enforce a node)
 * and __GFP_NOWARN.  Every other bit of @gfp_mask is ignored.
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        gfp_t inherited = gfp_mask & (__GFP_THISNODE | __GFP_NOWARN);

        return htlb_alloc_mask(h) | inherited;
}

/*
 * Pick the spinlock for a huge PTE: PMD-sized huge pages use the lock from
 * pmd_lockptr(), every other size uses mm->page_table_lock.  A huge page
 * size equal to PAGE_SIZE trips VM_BUG_ON().
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        unsigned long size = huge_page_size(h);

        if (size == PMD_SIZE)
                return pmd_lockptr(mm, (pmd_t *) pte);

        VM_BUG_ON(size == PAGE_SIZE);
        return &mm->page_table_lock;
}

#ifndef hugepages_supported
/*
 * Some platforms decide whether they support huge pages at boot
 * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
 * when there is no such support, so the default check is simply
 * whether HPAGE_SHIFT is non-zero.
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);

/* Set mm->hugetlb_usage to 0. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->hugetlb_usage, 0);
}

/* Atomically add @l to mm->hugetlb_usage. */
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
{
        atomic_long_add(l, &mm->hugetlb_usage);
}

/* Atomically subtract @l from mm->hugetlb_usage. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
        atomic_long_sub(l, &mm->hugetlb_usage);
}

/*
 * Default set_huge_swap_pte_at(), used when no definition exists yet:
 * ignores @sz and forwards to set_huge_pte_at().
 */
#ifndef set_huge_swap_pte_at
static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, pte_t pte, unsigned long sz)
{
        set_huge_pte_at(mm, addr, ptep, pte);
}
#endif

/*
 * Default huge_ptep_modify_prot_start(), used when no definition exists yet:
 * returns the result of huge_ptep_get_and_clear() on @ptep.
 */
#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep)
{
        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
#endif

/*
 * Default huge_ptep_modify_prot_commit(), used when no definition exists
 * yet: installs @pte with set_huge_pte_at(); @old_pte is unused.
 */
#ifndef huge_ptep_modify_prot_commit
#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep,
                                                pte_t old_pte, pte_t pte)
{
        set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
}
#endif

void set_page_huge_active(struct page *page);

#else        /* !CONFIG_HUGETLB_PAGE */
/* Empty definition used when CONFIG_HUGETLB_PAGE is not set. */
struct hstate {};

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct page *
alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                        nodemask_t *nmask, gfp_t gfp_mask)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct page *alloc_huge_page_vma(struct hstate *h,
                                               struct vm_area_struct *vma,
                                               unsigned long address)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline int __alloc_bootmem_huge_page(struct hstate *h)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct hstate *hstate_file(struct file *f)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns NULL. */
static inline struct hstate *page_hstate(struct page *page)
{
        return NULL;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns PAGE_SIZE. */
static inline unsigned long huge_page_size(struct hstate *h)
{
        return PAGE_SIZE;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns PAGE_MASK. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return PAGE_MASK;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns PAGE_SIZE. */
static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns PAGE_SIZE. */
static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns PAGE_SHIFT. */
static inline unsigned int huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns false. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return false;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 1. */
static inline unsigned int pages_per_huge_page(struct hstate *h)
{
        return 1;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline int hstate_index(struct hstate *h)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline int dissolve_free_huge_page(struct page *page)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
static inline int dissolve_free_huge_pages(unsigned long start_pfn,
                                           unsigned long end_pfn)
{
        return 0;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns false. */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return false;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns false. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return false;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: returns an empty mask (0). */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return 0;
}

/*
 * Stub used when CONFIG_HUGETLB_PAGE is not set: ignores @gfp_mask and
 * returns an empty mask (0).
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return 0;
}

/*
 * Stub used when CONFIG_HUGETLB_PAGE is not set: always returns
 * &mm->page_table_lock.
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: does nothing. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: writes nothing to @f. */
static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
{
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: does nothing. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
}

/* Stub used when CONFIG_HUGETLB_PAGE is not set: does nothing. */
static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, pte_t pte, unsigned long sz)
{
}
#endif        /* CONFIG_HUGETLB_PAGE */

/*
 * Take the spinlock selected by huge_pte_lockptr() for this huge PTE and
 * return it, so that the caller can unlock it later.
 */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
                                        struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *ptl = huge_pte_lockptr(h, mm, pte);

        spin_lock(ptl);
        return ptl;
}

/*
 * hugetlb CMA hooks: real implementations are declared only when both
 * CONFIG_HUGETLB_PAGE and CONFIG_CMA are set; otherwise both are empty
 * inline stubs.
 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
extern void __init hugetlb_cma_check(void);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
static inline __init void hugetlb_cma_check(void)
{
}
#endif

/*
 * hugetlb_pmd_shared() - is the page table page holding @pte shared?
 *
 * With CONFIG_ARCH_WANT_HUGE_PMD_SHARE this is true when pt_share_count of
 * the struct page for @pte's address is non-zero; without it the answer is
 * always false.
 */
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        struct page *pt_page = virt_to_page(pte);

        return atomic_read(&pt_page->pt_share_count) != 0;
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return false;
}
#endif

#endif /* _LINUX_HUGETLB_H */





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 1999-2002 Vojtech Pavlik
 */
#ifndef _SERIO_H
#define _SERIO_H


#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/device.h>
#include <linux/mod_devicetable.h>
#include <uapi/linux/serio.h>

extern struct bus_type serio_bus;

/*
 * A serio port.  Embeds a struct device; to_serio_port() maps that device
 * back to its struct serio.
 */
struct serio {
        void *port_data;

        char name[32];
        char phys[32];
        char firmware_id[128];

        bool manual_bind;

        struct serio_device_id id;

        /* Protects critical sections from port's interrupt handler */
        spinlock_t lock;

        /* write may be NULL, in which case serio_write() returns -1 */
        int (*write)(struct serio *, unsigned char);
        int (*open)(struct serio *);
        void (*close)(struct serio *);
        int (*start)(struct serio *);
        void (*stop)(struct serio *);

        struct serio *parent;
        /* Entry in parent->children list */
        struct list_head child_node;
        struct list_head children;
        /* Level of nesting in serio hierarchy */
        unsigned int depth;

        /*
         * serio->drv is accessed from interrupt handlers; when modifying
         * caller should acquire serio->drv_mutex and serio->lock.
         */
        struct serio_driver *drv;
        /* Protects serio->drv so attributes can pin current driver */
        struct mutex drv_mutex;

        struct device dev;

        struct list_head node;

        /*
         * For use by PS/2 layer when several ports share hardware and
         * may get indigestion when exposed to concurrent access (i8042).
         */
        struct mutex *ps2_cmd_mutex;
};
/* Map an embedded struct device pointer back to its struct serio. */
#define to_serio_port(d)        container_of(d, struct serio, dev)

/*
 * A serio driver.  Embeds a struct device_driver; to_serio_driver() maps
 * that back to its struct serio_driver.
 */
struct serio_driver {
        const char *description;

        const struct serio_device_id *id_table;
        bool manual_bind;

        /* write_wakeup may be NULL; serio_drv_write_wakeup() checks it */
        void (*write_wakeup)(struct serio *);
        irqreturn_t (*interrupt)(struct serio *, unsigned char, unsigned int);
        int  (*connect)(struct serio *, struct serio_driver *drv);
        int  (*reconnect)(struct serio *);
        int  (*fast_reconnect)(struct serio *);
        void (*disconnect)(struct serio *);
        void (*cleanup)(struct serio *);

        struct device_driver driver;
};
/* Map an embedded struct device_driver pointer back to its serio_driver. */
#define to_serio_driver(d)        container_of(d, struct serio_driver, driver)

int serio_open(struct serio *serio, struct serio_driver *drv);
void serio_close(struct serio *serio);
void serio_rescan(struct serio *serio);
void serio_reconnect(struct serio *serio);
irqreturn_t serio_interrupt(struct serio *serio, unsigned char data, unsigned int flags);

void __serio_register_port(struct serio *serio, struct module *owner);

/* use a define to avoid include chaining to get THIS_MODULE */
#define serio_register_port(serio) \
        __serio_register_port(serio, THIS_MODULE)

void serio_unregister_port(struct serio *serio);
void serio_unregister_child_port(struct serio *serio);

int __must_check __serio_register_driver(struct serio_driver *drv,
                                struct module *owner, const char *mod_name);

/* use a define to avoid include chaining to get THIS_MODULE & friends */
#define serio_register_driver(drv) \
        __serio_register_driver(drv, THIS_MODULE, KBUILD_MODNAME)

void serio_unregister_driver(struct serio_driver *drv);

/**
 * module_serio_driver() - Helper macro for registering a serio driver
 * @__serio_driver: serio_driver struct
 *
 * Helper macro for serio drivers which do not do anything special in
 * module init/exit. This eliminates a lot of boilerplate. Each module
 * may only use this macro once, and calling it replaces module_init()
 * and module_exit().
 */
#define module_serio_driver(__serio_driver) \
        module_driver(__serio_driver, serio_register_driver, \
                       serio_unregister_driver)

/*
 * Hand one byte to the port's ->write() method.
 *
 * Returns the value produced by ->write(), or -1 when the port does not
 * provide a ->write() method.
 */
static inline int serio_write(struct serio *serio, unsigned char data)
{
        if (!serio->write)
                return -1;

        return serio->write(serio, data);
}

/*
 * Invoke the bound driver's ->write_wakeup() callback for this port.
 * Does nothing when no driver is attached or the driver has no such
 * callback.
 */
static inline void serio_drv_write_wakeup(struct serio *serio)
{
        if (!serio->drv || !serio->drv->write_wakeup)
                return;

        serio->drv->write_wakeup(serio);
}

/*
 * Use the following functions to manipulate serio's per-port
 * driver-specific data.
 */
/* Return the driver data pointer stored on the port's 'dev' member. */
static inline void *serio_get_drvdata(struct serio *serio)
{
        void *drvdata = dev_get_drvdata(&serio->dev);

        return drvdata;
}

/* Store @data as the driver data pointer on the port's 'dev' member. */
static inline void serio_set_drvdata(struct serio *serio, void *data)
{
        dev_set_drvdata(&serio->dev, data);
}

/*
 * Use the following functions to protect critical sections in
 * driver code from port's interrupt handler
 */
/*
 * Enter a section protected from the port's interrupt handler by taking
 * serio->lock with spin_lock_irq().  Pair with serio_continue_rx().
 */
static inline void serio_pause_rx(struct serio *serio)
{
        spin_lock_irq(&serio->lock);
}

/*
 * Leave the section opened by serio_pause_rx(): releases serio->lock with
 * spin_unlock_irq().
 */
static inline void serio_continue_rx(struct serio *serio)
{
        spin_unlock_irq(&serio->lock);
}

#endif







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_COMPAT_H
#define _LINUX_COMPAT_H
/*
 * These are the type definitions for the architecture specific
 * syscall compatibility layer.
 */

#include <linux/types.h>
#include <linux/time.h>

#include <linux/stat.h>
#include <linux/param.h>        /* for HZ */
#include <linux/sem.h>
#include <linux/socket.h>
#include <linux/if.h>
#include <linux/fs.h>
#include <linux/aio_abi.h>        /* for aio_context_t */
#include <linux/uaccess.h>
#include <linux/unistd.h>

#include <asm/compat.h>

#ifdef CONFIG_COMPAT
#include <asm/siginfo.h>
#include <asm/signal.h>
#endif

#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
/*
 * It may be useful for an architecture to override the definitions of the
 * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular
 * to use a different calling convention for syscalls. To allow for that,
 * the prototypes for the compat_sys_*() functions below will *not* be included
 * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
 */
#include <asm/syscall_wrapper.h>
#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */

/* Fall back to 0 when nothing included earlier has defined COMPAT_USE_64BIT_TIME. */
#ifndef COMPAT_USE_64BIT_TIME
#define COMPAT_USE_64BIT_TIME 0
#endif

/*
 * Default argument conversion used by COMPAT_SYSCALL_DEFINEx(): cast @v to
 * unsigned long and then force-cast the result to type @t.  A definition
 * made before this point takes precedence.
 */
#ifndef __SC_DELOUSE
#define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v))
#endif

/*
 * Default definition of a compat syscall that takes no arguments.
 * Declares compat_sys_<name>(void), registers it with
 * ALLOW_ERROR_INJECTION(..., ERRNO), and ends with the function header so
 * that the body follows the macro invocation.  Skipped if
 * COMPAT_SYSCALL_DEFINE0 is already defined.
 */
#ifndef COMPAT_SYSCALL_DEFINE0
#define COMPAT_SYSCALL_DEFINE0(name) \
        asmlinkage long compat_sys_##name(void); \
        ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \
        asmlinkage long compat_sys_##name(void)
#endif /* COMPAT_SYSCALL_DEFINE0 */

/*
 * Fixed-arity front ends for COMPAT_SYSCALL_DEFINEx().  Each one forwards
 * its argument count and the name prefixed with an underscore, so that
 * DEFINEx's compat_sys##name pastes to compat_sys_<name>.
 */
#define COMPAT_SYSCALL_DEFINE1(name, ...) \
        COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE2(name, ...) \
        COMPAT_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE3(name, ...) \
        COMPAT_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE4(name, ...) \
        COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE5(name, ...) \
        COMPAT_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE6(name, ...) \
        COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

/*
 * The asmlinkage stub is aliased to a function named __se_compat_sys_*() which
 * sign-extends 32-bit ints to longs whenever needed. The actual work is
 * done within __do_compat_sys_*().
 *
 * In this default definition, __se_compat_sys_*() declares its parameters
 * through __SC_LONG, converts each one with __SC_DELOUSE when calling
 * __do_compat_sys_*(), and then applies __SC_TEST to the argument list.
 * The -Wattribute-alias diagnostic is ignored via __diag_ignore() between
 * __diag_push() and __diag_pop().  The macro ends with the header of
 * __do_compat_sys_*(), so the syscall body follows the macro invocation.
 * Skipped if COMPAT_SYSCALL_DEFINEx is already defined.
 */
#ifndef COMPAT_SYSCALL_DEFINEx
#define COMPAT_SYSCALL_DEFINEx(x, name, ...)                                        \
        __diag_push();                                                                \
        __diag_ignore(GCC, 8, "-Wattribute-alias",                                \
                      "Type aliasing is used to sanitize syscall arguments");\
        asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));        \
        asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))        \
                __attribute__((alias(__stringify(__se_compat_sys##name))));        \
        ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);                                \
        static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
        asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));        \
        asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))        \
        {                                                                        \
                long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
                __MAP(x,__SC_TEST,__VA_ARGS__);                                        \
                return ret;                                                        \
        }                                                                        \
        __diag_pop();                                                                \
        static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#endif /* COMPAT_SYSCALL_DEFINEx */

/*
 * Compat-layer I/O vector entry: the buffer address is carried as a
 * compat_uptr_t and the length as a compat_size_t.
 */
struct compat_iovec {
        compat_uptr_t        iov_base;
        compat_size_t        iov_len;
};

#ifdef CONFIG_COMPAT

/* Default: use current_user_stack_pointer() unless an earlier header supplied its own version. */
#ifndef compat_user_stack_pointer
#define compat_user_stack_pointer() current_user_stack_pointer()
#endif
/*
 * Default compat alternate-signal-stack descriptor (compat_stack_t): stack
 * address as a compat_uptr_t, flags, and size as a compat_size_t.  Only
 * provided when compat_sigaltstack is not already defined as a macro.
 */
#ifndef compat_sigaltstack        /* we'll need that for MIPS */
typedef struct compat_sigaltstack {
        compat_uptr_t                        ss_sp;
        int                                ss_flags;
        compat_size_t                        ss_size;
} compat_stack_t;
#endif
/* Default the compat minimum signal stack size to MINSIGSTKSZ unless already defined. */
#ifndef COMPAT_MINSIGSTKSZ
#define COMPAT_MINSIGSTKSZ        MINSIGSTKSZ
#endif

/*
 * Scale @x by COMPAT_USER_HZ / HZ.  The multiplication is done first, in
 * unsigned long, and the division truncates.
 */
#define compat_jiffies_to_clock_t(x)        \
                (((unsigned long)(x) * COMPAT_USER_HZ) / HZ)

typedef __compat_uid32_t        compat_uid_t;
typedef __compat_gid32_t        compat_gid_t;

struct compat_sel_arg_struct;
struct rusage;

struct old_itimerval32;

/*
 * Compat-layer process times record; every field is a compat_clock_t.
 * Used as the buffer type of compat_sys_times().
 */
struct compat_tms {
        compat_clock_t                tms_utime;
        compat_clock_t                tms_stime;
        compat_clock_t                tms_cutime;
        compat_clock_t                tms_cstime;
};

/* Number of compat_sigset_word elements needed to hold _COMPAT_NSIG bits. */
#define _COMPAT_NSIG_WORDS        (_COMPAT_NSIG / _COMPAT_NSIG_BPW)

/* Compat signal set: an array of _COMPAT_NSIG_WORDS compat_sigset_word values. */
typedef struct {
        compat_sigset_word        sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;

int set_compat_user_sigmask(const compat_sigset_t __user *umask,
                            size_t sigsetsize);

/*
 * Compat-layer signal action.
 *
 * The order and type of sa_handler/sa_flags depend on
 * __ARCH_HAS_IRIX_SIGACTION; sa_restorer exists only with
 * __ARCH_HAS_SA_RESTORER.  sa_mask is marked __packed.
 */
struct compat_sigaction {
#ifndef __ARCH_HAS_IRIX_SIGACTION
        compat_uptr_t                        sa_handler;
        compat_ulong_t                        sa_flags;
#else
        compat_uint_t                        sa_flags;
        compat_uptr_t                        sa_handler;
#endif
#ifdef __ARCH_HAS_SA_RESTORER
        compat_uptr_t                        sa_restorer;
#endif
        compat_sigset_t                        sa_mask __packed;
};

/* Compat signal value: either a compat_int_t or a user pointer held as compat_uptr_t. */
typedef union compat_sigval {
        compat_int_t        sival_int;
        compat_uptr_t        sival_ptr;
} compat_sigval_t;

/*
 * Compat-layer siginfo.
 *
 * Three leading ints (si_signo, then si_errno/si_code in an order chosen by
 * __ARCH_HAS_SWAPPED_SIGINFO) are followed by the _sifields union, whose
 * members describe the payload for the different signal sources.
 */
typedef struct compat_siginfo {
        int si_signo;
#ifndef __ARCH_HAS_SWAPPED_SIGINFO
        int si_errno;
        int si_code;
#else
        int si_code;
        int si_errno;
#endif

        union {
                /* 128/sizeof(int) ints minus the three header ints above */
                int _pad[128/sizeof(int) - 3];

                /* kill() */
                struct {
                        compat_pid_t _pid;        /* sender's pid */
                        __compat_uid32_t _uid;        /* sender's uid */
                } _kill;

                /* POSIX.1b timers */
                struct {
                        compat_timer_t _tid;        /* timer id */
                        int _overrun;                /* overrun count */
                        compat_sigval_t _sigval;        /* same as below */
                } _timer;

                /* POSIX.1b signals */
                struct {
                        compat_pid_t _pid;        /* sender's pid */
                        __compat_uid32_t _uid;        /* sender's uid */
                        compat_sigval_t _sigval;
                } _rt;

                /* SIGCHLD */
                struct {
                        compat_pid_t _pid;        /* which child */
                        __compat_uid32_t _uid;        /* sender's uid */
                        int _status;                /* exit code */
                        compat_clock_t _utime;
                        compat_clock_t _stime;
                } _sigchld;

#ifdef CONFIG_X86_X32_ABI
                /* SIGCHLD (x32 version) */
                struct {
                        compat_pid_t _pid;        /* which child */
                        __compat_uid32_t _uid;        /* sender's uid */
                        int _status;                /* exit code */
                        compat_s64 _utime;
                        compat_s64 _stime;
                } _sigchld_x32;
#endif

                /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */
                struct {
                        compat_uptr_t _addr;        /* faulting insn/memory ref. */
#ifdef __ARCH_SI_TRAPNO
                        int _trapno;        /* TRAP # which caused the signal */
#endif
/*
 * Size of the leading dummy arrays below: the larger of sizeof(short) and
 * the alignment of compat_uptr_t.
 */
#define __COMPAT_ADDR_BND_PKEY_PAD  (__alignof__(compat_uptr_t) < sizeof(short) ? \
                                     sizeof(short) : __alignof__(compat_uptr_t))
                        union {
                                /*
                                 * used when si_code=BUS_MCEERR_AR or
                                 * used when si_code=BUS_MCEERR_AO
                                 */
                                short int _addr_lsb;        /* Valid LSB of the reported address. */
                                /* used when si_code=SEGV_BNDERR */
                                struct {
                                        char _dummy_bnd[__COMPAT_ADDR_BND_PKEY_PAD];
                                        compat_uptr_t _lower;
                                        compat_uptr_t _upper;
                                } _addr_bnd;
                                /* used when si_code=SEGV_PKUERR */
                                struct {
                                        char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
                                        u32 _pkey;
                                } _addr_pkey;
                        };
                } _sigfault;

                /* SIGPOLL */
                struct {
                        compat_long_t _band;        /* POLL_IN, POLL_OUT, POLL_MSG */
                        int _fd;
                } _sigpoll;

                /* presumably SIGSYS, going by the member name */
                struct {
                        compat_uptr_t _call_addr; /* calling user insn */
                        int _syscall;        /* triggering system call number */
                        unsigned int _arch;        /* AUDIT_ARCH_* of syscall */
                } _sigsys;
        } _sifields;
} compat_siginfo_t;

/*
 * Compat-layer resource limit pair, both values as compat_ulong_t.
 * Used by compat_sys_getrlimit() and compat_sys_setrlimit().
 */
struct compat_rlimit {
        compat_ulong_t        rlim_cur;
        compat_ulong_t        rlim_max;
};

/*
 * Compat-layer resource usage record: two struct old_timeval32 time values
 * followed by compat_long_t counters.  Filled for user space by
 * put_compat_rusage().
 */
struct compat_rusage {
        struct old_timeval32 ru_utime;
        struct old_timeval32 ru_stime;
        compat_long_t        ru_maxrss;
        compat_long_t        ru_ixrss;
        compat_long_t        ru_idrss;
        compat_long_t        ru_isrss;
        compat_long_t        ru_minflt;
        compat_long_t        ru_majflt;
        compat_long_t        ru_nswap;
        compat_long_t        ru_inblock;
        compat_long_t        ru_oublock;
        compat_long_t        ru_msgsnd;
        compat_long_t        ru_msgrcv;
        compat_long_t        ru_nsignals;
        compat_long_t        ru_nvcsw;
        compat_long_t        ru_nivcsw;
};

extern int put_compat_rusage(const struct rusage *,
                             struct compat_rusage __user *);

struct compat_siginfo;
struct __compat_aio_sigset;

/*
 * Compat-layer directory entry with a fixed 256-byte name buffer.
 * NOTE(review): field meanings follow the usual dirent naming; confirm
 * against the users of this struct.
 */
struct compat_dirent {
        u32                d_ino;
        compat_off_t        d_off;
        u16                d_reclen;
        char                d_name[256];
};

/*
 * Compat-layer ustat record: two counters (compat_daddr_t, compat_ino_t)
 * followed by two fixed 6-byte character arrays.
 */
struct compat_ustat {
        compat_daddr_t                f_tfree;
        compat_ino_t                f_tinode;
        char                        f_fname[6];
        char                        f_fpack[6];
};

/* Number of ints in the _pad member below: SIGEV_MAX_SIZE in ints, minus 3. */
#define COMPAT_SIGEV_PAD_SIZE        ((SIGEV_MAX_SIZE/sizeof(int)) - 3)

/*
 * Compat-layer sigevent: value, signal number and notification type,
 * followed by a union that is sized by _pad and also holds either a thread
 * id or a pair of user pointers (function, attribute) as compat_uptr_t.
 * Read from user space by get_compat_sigevent().
 */
typedef struct compat_sigevent {
        compat_sigval_t sigev_value;
        compat_int_t sigev_signo;
        compat_int_t sigev_notify;
        union {
                compat_int_t _pad[COMPAT_SIGEV_PAD_SIZE];
                compat_int_t _tid;

                struct {
                        compat_uptr_t _function;
                        compat_uptr_t _attribute;
                } _sigev_thread;
        } _sigev_un;
} compat_sigevent_t;

/*
 * Compat-layer interface map; the memory range is carried as two
 * compat_ulong_t values.  Embedded in struct compat_ifreq as ifru_map.
 */
struct compat_ifmap {
        compat_ulong_t mem_start;
        compat_ulong_t mem_end;
        unsigned short base_addr;
        unsigned char irq;
        unsigned char dma;
        unsigned char port;
};

/*
 * Compat-layer interface settings; the pointer union is collapsed into a
 * single compat_uptr_t.  Embedded in struct compat_ifreq as ifru_settings.
 */
struct compat_if_settings {
        unsigned int type;        /* Type of physical device or protocol */
        unsigned int size;        /* Size of the data allocated by the caller */
        compat_uptr_t ifs_ifsu;        /* union of pointers */
};

/*
 * Compat-layer interface request: an interface name of IFNAMSIZ bytes
 * followed by a union of the possible request payloads.  Pointer-like
 * payloads use compat types (compat_caddr_t, compat_ifmap,
 * compat_if_settings).
 */
struct compat_ifreq {
        union {
                char        ifrn_name[IFNAMSIZ];    /* if name, e.g. "en0" */
        } ifr_ifrn;
        union {
                struct        sockaddr ifru_addr;
                struct        sockaddr ifru_dstaddr;
                struct        sockaddr ifru_broadaddr;
                struct        sockaddr ifru_netmask;
                struct        sockaddr ifru_hwaddr;
                short        ifru_flags;
                compat_int_t        ifru_ivalue;
                compat_int_t        ifru_mtu;
                struct        compat_ifmap ifru_map;
                char        ifru_slave[IFNAMSIZ];   /* Just fits the size */
                char        ifru_newname[IFNAMSIZ];
                compat_caddr_t        ifru_data;
                struct        compat_if_settings ifru_settings;
        } ifr_ifru;
};

/* Compat-layer interface configuration request: a buffer length and the buffer address as compat_caddr_t. */
struct compat_ifconf {
        compat_int_t        ifc_len;                /* size of buffer */
        compat_caddr_t  ifcbuf;
};

/* Compat-layer robust list link: the next pointer is carried as a compat_uptr_t. */
struct compat_robust_list {
        compat_uptr_t                        next;
};

/*
 * Compat-layer robust list head: the embedded list link, a futex offset as
 * compat_long_t, and the pending-operation pointer as compat_uptr_t.
 * Passed from user space to compat_sys_set_robust_list().
 */
struct compat_robust_list_head {
        struct compat_robust_list        list;
        compat_long_t                        futex_offset;
        compat_uptr_t                        list_op_pending;
};

/*
 * Old-style compat signal action, available only with
 * CONFIG_COMPAT_OLD_SIGACTION.  Unlike struct compat_sigaction, the mask
 * is a single compat_old_sigset_t and sa_restorer is always present.
 */
#ifdef CONFIG_COMPAT_OLD_SIGACTION
struct compat_old_sigaction {
        compat_uptr_t                        sa_handler;
        compat_old_sigset_t                sa_mask;
        compat_ulong_t                        sa_flags;
        compat_uptr_t                        sa_restorer;
};
#endif

/*
 * Compat-layer keyctl KDF parameters: two user pointers carried as
 * compat_uptr_t, the length of otherinfo, and eight spare __u32 words.
 */
struct compat_keyctl_kdf_params {
        compat_uptr_t hashname;
        compat_uptr_t otherinfo;
        __u32 otherinfolen;
        __u32 __spare[8];
};

struct compat_statfs;
struct compat_statfs64;
struct compat_old_linux_dirent;
struct compat_linux_dirent;
struct linux_dirent64;
struct compat_msghdr;
struct compat_mmsghdr;
struct compat_sysinfo;
struct compat_sysctl_args;
struct compat_kexec_segment;
struct compat_mq_attr;
struct compat_msgbuf;

/* Number of bits in one compat_long_t. */
#define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))

/* Number of compat_long_t words needed to hold @bits bits, rounded up. */
#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG)

long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
                       unsigned long bitmap_size);
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
                       unsigned long bitmap_size);
void copy_siginfo_to_external32(struct compat_siginfo *to,
                const struct kernel_siginfo *from);
int copy_siginfo_from_user32(kernel_siginfo_t *to,
                const struct compat_siginfo __user *from);
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                const kernel_siginfo_t *from);
#ifndef copy_siginfo_to_user32
#define copy_siginfo_to_user32 __copy_siginfo_to_user32
#endif
int get_compat_sigevent(struct sigevent *event,
                const struct compat_sigevent __user *u_event);

extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);

/*
 * Defined inline such that size can be compile time constant, which avoids
 * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct
 */
static inline int
put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
                  unsigned int size)
{
        /*
         * Invariant relied on here:
         * size <= sizeof(compat_sigset_t) <= sizeof(sigset_t)
         *
         * Copies @size bytes of the signal set to @compat and returns 0,
         * or -EFAULT when copy_to_user() reports uncopied bytes.
         */
#ifdef __BIG_ENDIAN
        /*
         * Split every native word into two compat words, low half at the
         * even index and high half at the odd index, before copying out.
         */
        compat_sigset_t v;

        switch (_NSIG_WORDS) {
        case 4:
                v.sig[7] = (set->sig[3] >> 32);
                v.sig[6] = set->sig[3];
                fallthrough;
        case 3:
                v.sig[5] = (set->sig[2] >> 32);
                v.sig[4] = set->sig[2];
                fallthrough;
        case 2:
                v.sig[3] = (set->sig[1] >> 32);
                v.sig[2] = set->sig[1];
                fallthrough;
        case 1:
                v.sig[1] = (set->sig[0] >> 32);
                v.sig[0] = set->sig[0];
        }

        if (copy_to_user(compat, &v, size))
                return -EFAULT;
        return 0;
#else
        if (copy_to_user(compat, set, size))
                return -EFAULT;
        return 0;
#endif
}

extern int compat_ptrace_request(struct task_struct *child,
                                 compat_long_t request,
                                 compat_ulong_t addr, compat_ulong_t data);

extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
                               compat_ulong_t addr, compat_ulong_t data);

struct epoll_event;        /* fortunately, this one is fixed-layout */

extern void __user *compat_alloc_user_space(unsigned long len);

int compat_restore_altstack(const compat_stack_t __user *uss);
int __compat_save_altstack(compat_stack_t __user *, unsigned long);
/*
 * unsafe_compat_save_altstack() - write current's alternate signal stack
 * settings to a compat_stack_t in user memory.
 * @uss:   user pointer to the compat_stack_t to fill
 * @sp:    not used by this macro
 * @label: label handed to each unsafe_put_user() call
 *
 * Stores sas_ss_sp (converted with ptr_to_compat()), sas_ss_flags and
 * sas_ss_size of current, then calls sas_ss_reset() when SS_AUTODISARM is
 * set in the flags.
 *
 * The expansion is a single statement without a trailing semicolon, so the
 * caller's own ';' completes it and the macro is safe in an unbraced
 * if/else.
 */
#define unsafe_compat_save_altstack(uss, sp, label) do { \
        compat_stack_t __user *__uss = (uss); \
        struct task_struct *t = current; \
        unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \
                        &__uss->ss_sp, label); \
        unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
        if (t->sas_ss_flags & SS_AUTODISARM) \
                sas_ss_reset(t); \
} while (0)

/*
 * These syscall function prototypes are kept in the same order as
 * include/uapi/asm-generic/unistd.h. Deprecated or obsolete system calls
 * go below.
 *
 * Please note that these prototypes here are only provided for information
 * purposes, for static analysis, and for linking from the syscall table.
 * These functions should not be called elsewhere from kernel code.
 *
 * As the syscall calling convention may be different from the default
 * for architectures overriding the syscall calling convention, do not
 * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
 */
#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p);
asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr,
                                     u32 __user *iocb);
asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
                                        compat_long_t min_nr,
                                        compat_long_t nr,
                                        struct io_event __user *events,
                                        struct old_timespec32 __user *timeout,
                                        const struct __compat_aio_sigset __user *usig);
asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id,
                                        compat_long_t min_nr,
                                        compat_long_t nr,
                                        struct io_event __user *events,
                                        struct __kernel_timespec __user *timeout,
                                        const struct __compat_aio_sigset __user *usig);

/* fs/cookies.c */
asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);

/* fs/eventpoll.c */
asmlinkage long compat_sys_epoll_pwait(int epfd,
                        struct epoll_event __user *events,
                        int maxevents, int timeout,
                        const compat_sigset_t __user *sigmask,
                        compat_size_t sigsetsize);

/* fs/fcntl.c */
asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
                                 compat_ulong_t arg);
asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
                                   compat_ulong_t arg);

/* fs/ioctl.c */
asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
                                 compat_ulong_t arg);

/* fs/open.c */
asmlinkage long compat_sys_statfs(const char __user *pathname,
                                  struct compat_statfs __user *buf);
asmlinkage long compat_sys_statfs64(const char __user *pathname,
                                    compat_size_t sz,
                                    struct compat_statfs64 __user *buf);
asmlinkage long compat_sys_fstatfs(unsigned int fd,
                                   struct compat_statfs __user *buf);
asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
                                     struct compat_statfs64 __user *buf);
asmlinkage long compat_sys_truncate(const char __user *, compat_off_t);
asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t);
/* No generic prototype for truncate64, ftruncate64, fallocate */
asmlinkage long compat_sys_openat(int dfd, const char __user *filename,
                                  int flags, umode_t mode);

/* fs/readdir.c */
asmlinkage long compat_sys_getdents(unsigned int fd,
                                    struct compat_linux_dirent __user *dirent,
                                    unsigned int count);

/* fs/read_write.c */
asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
/* No generic prototype for pread64 and pwrite64 */
asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd,
                const struct iovec __user *vec,
                compat_ulong_t vlen, u32 pos_low, u32 pos_high);
asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd,
                const struct iovec __user *vec,
                compat_ulong_t vlen, u32 pos_low, u32 pos_high);
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
asmlinkage long compat_sys_preadv64(unsigned long fd,
                const struct iovec __user *vec,
                unsigned long vlen, loff_t pos);
#endif

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
asmlinkage long compat_sys_pwritev64(unsigned long fd,
                const struct iovec __user *vec,
                unsigned long vlen, loff_t pos);
#endif

/* fs/sendfile.c */
asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
                                    compat_off_t __user *offset, compat_size_t count);
asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
                                    compat_loff_t __user *offset, compat_size_t count);

/* fs/select.c */
asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp,
                                    compat_ulong_t __user *outp,
                                    compat_ulong_t __user *exp,
                                    struct old_timespec32 __user *tsp,
                                    void __user *sig);
asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp,
                                    compat_ulong_t __user *outp,
                                    compat_ulong_t __user *exp,
                                    struct __kernel_timespec __user *tsp,
                                    void __user *sig);
asmlinkage long compat_sys_ppoll_time32(struct pollfd __user *ufds,
                                 unsigned int nfds,
                                 struct old_timespec32 __user *tsp,
                                 const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize);
asmlinkage long compat_sys_ppoll_time64(struct pollfd __user *ufds,
                                 unsigned int nfds,
                                 struct __kernel_timespec __user *tsp,
                                 const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize);

/* fs/signalfd.c */
asmlinkage long compat_sys_signalfd4(int ufd,
                                     const compat_sigset_t __user *sigmask,
                                     compat_size_t sigsetsize, int flags);

/* fs/stat.c */
asmlinkage long compat_sys_newfstatat(unsigned int dfd,
                                      const char __user *filename,
                                      struct compat_stat __user *statbuf,
                                      int flag);
asmlinkage long compat_sys_newfstat(unsigned int fd,
                                    struct compat_stat __user *statbuf);

/* fs/sync.c: No generic prototype for sync_file_range and sync_file_range2 */

/* kernel/exit.c */
asmlinkage long compat_sys_waitid(int, compat_pid_t,
                struct compat_siginfo __user *, int,
                struct compat_rusage __user *);



/* kernel/futex.c */
asmlinkage long
compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
                           compat_size_t len);
asmlinkage long
compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
                           compat_size_t __user *len_ptr);

/* kernel/itimer.c */
asmlinkage long compat_sys_getitimer(int which,
                                     struct old_itimerval32 __user *it);
asmlinkage long compat_sys_setitimer(int which,
                                     struct old_itimerval32 __user *in,
                                     struct old_itimerval32 __user *out);

/* kernel/kexec.c */
asmlinkage long compat_sys_kexec_load(compat_ulong_t entry,
                                      compat_ulong_t nr_segments,
                                      struct compat_kexec_segment __user *,
                                      compat_ulong_t flags);

/* kernel/posix-timers.c */
asmlinkage long compat_sys_timer_create(clockid_t which_clock,
                        struct compat_sigevent __user *timer_event_spec,
                        timer_t __user *created_timer_id);

/* kernel/ptrace.c */
asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
                                  compat_long_t addr, compat_long_t data);

/* kernel/sched/core.c */
asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
                                     unsigned int len,
                                     compat_ulong_t __user *user_mask_ptr);
asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid,
                                     unsigned int len,
                                     compat_ulong_t __user *user_mask_ptr);

/* kernel/signal.c */
asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
                                       compat_stack_t __user *uoss_ptr);
asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset,
                                         compat_size_t sigsetsize);
#ifndef CONFIG_ODD_RT_SIGACTION
asmlinkage long compat_sys_rt_sigaction(int,
                                 const struct compat_sigaction __user *,
                                 struct compat_sigaction __user *,
                                 compat_size_t);
#endif
asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set,
                                          compat_sigset_t __user *oset,
                                          compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset,
                                         compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese,
                struct compat_siginfo __user *uinfo,
                struct old_timespec32 __user *uts, compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese,
                struct compat_siginfo __user *uinfo,
                struct __kernel_timespec __user *uts, compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig,
                                struct compat_siginfo __user *uinfo);
/* No generic prototype for rt_sigreturn */

/* kernel/sys.c */
asmlinkage long compat_sys_times(struct compat_tms __user *tbuf);
asmlinkage long compat_sys_getrlimit(unsigned int resource,
                                     struct compat_rlimit __user *rlim);
asmlinkage long compat_sys_setrlimit(unsigned int resource,
                                     struct compat_rlimit __user *rlim);
asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru);

/* kernel/time.c */
asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv,
                struct timezone __user *tz);
asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv,
                struct timezone __user *tz);

/* kernel/timer.c */
asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info);

/* ipc/mqueue.c */
asmlinkage long compat_sys_mq_open(const char __user *u_name,
                        int oflag, compat_mode_t mode,
                        struct compat_mq_attr __user *u_attr);
asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
                        const struct compat_sigevent __user *u_notification);
asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
                        const struct compat_mq_attr __user *u_mqstat,
                        struct compat_mq_attr __user *u_omqstat);

/* ipc/msg.c */
asmlinkage long compat_sys_msgctl(int first, int second, void __user *uptr);
asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp,
                compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg);
asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp,
                compat_ssize_t msgsz, int msgflg);

/* ipc/sem.c */
asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg);

/* ipc/shm.c */
asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr);
asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg);

/* net/socket.c */
asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len,
                            unsigned flags, struct sockaddr __user *addr,
                            int __user *addrlen);
asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg,
                                   unsigned flags);
asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg,
                                   unsigned int flags);

/* mm/filemap.c: No generic prototype for readahead */

/* security/keys/keyctl.c */
asmlinkage long compat_sys_keyctl(u32 option,
                              u32 arg2, u32 arg3, u32 arg4, u32 arg5);

/* arch/example/kernel/sys_example.c */
asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
                     const compat_uptr_t __user *envp);

/* mm/fadvise.c: No generic prototype for fadvise64_64 */

/* mm/, CONFIG_MMU only */
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                                 compat_ulong_t mode,
                                 compat_ulong_t __user *nmask,
                                 compat_ulong_t maxnode, compat_ulong_t flags);
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                         compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode,
                                         compat_ulong_t addr,
                                         compat_ulong_t flags);
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode);
asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
                compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
                const compat_ulong_t __user *new_nodes);
asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages,
                                      __u32 __user *pages,
                                      const int __user *nodes,
                                      int __user *status,
                                      int flags);

asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid,
                                        compat_pid_t pid, int sig,
                                        struct compat_siginfo __user *uinfo);
asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg,
                                    unsigned vlen, unsigned int flags,
                                    struct __kernel_timespec __user *timeout);
asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg,
                                    unsigned vlen, unsigned int flags,
                                    struct old_timespec32 __user *timeout);
asmlinkage long compat_sys_wait4(compat_pid_t pid,
                                 compat_uint_t __user *stat_addr, int options,
                                 struct compat_rusage __user *ru);
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
                                            int, const char __user *);
asmlinkage long compat_sys_open_by_handle_at(int mountdirfd,
                                             struct file_handle __user *handle,
                                             int flags);
asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
                                    unsigned vlen, unsigned int flags);
asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
                     const compat_uptr_t __user *argv,
                     const compat_uptr_t __user *envp, int flags);
asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
                const struct iovec __user *vec,
                compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
                const struct iovec __user *vec,
                compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
asmlinkage long  compat_sys_preadv64v2(unsigned long fd,
                const struct iovec __user *vec,
                unsigned long vlen, loff_t pos, rwf_t flags);
#endif

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
asmlinkage long compat_sys_pwritev64v2(unsigned long fd,
                const struct iovec __user *vec,
                unsigned long vlen, loff_t pos, rwf_t flags);
#endif


/*
 * Deprecated system calls which are still defined in
 * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
 */

/* __ARCH_WANT_SYSCALL_NO_AT */
asmlinkage long compat_sys_open(const char __user *filename, int flags,
                                umode_t mode);

/* __ARCH_WANT_SYSCALL_NO_FLAGS */
asmlinkage long compat_sys_signalfd(int ufd,
                                    const compat_sigset_t __user *sigmask,
                                    compat_size_t sigsetsize);

/* __ARCH_WANT_SYSCALL_OFF_T */
asmlinkage long compat_sys_newstat(const char __user *filename,
                                   struct compat_stat __user *statbuf);
asmlinkage long compat_sys_newlstat(const char __user *filename,
                                    struct compat_stat __user *statbuf);

/* __ARCH_WANT_SYSCALL_DEPRECATED */
asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
                compat_ulong_t __user *outp, compat_ulong_t __user *exp,
                struct old_timeval32 __user *tvp);
asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32);
asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len,
                                unsigned flags);

/* obsolete: fs/readdir.c */
asmlinkage long compat_sys_old_readdir(unsigned int fd,
                                       struct compat_old_linux_dirent __user *,
                                       unsigned int count);

/* obsolete: fs/select.c */
asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg);

/* obsolete: ipc */
asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32);

/* obsolete: kernel/signal.c */
#ifdef __ARCH_WANT_SYS_SIGPENDING
asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set);
#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset,
                                       compat_old_sigset_t __user *oset);
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
asmlinkage long compat_sys_sigaction(int sig,
                                   const struct compat_old_sigaction __user *act,
                                   struct compat_old_sigaction __user *oact);
#endif

/* obsolete: net/socket.c */
asmlinkage long compat_sys_socketcall(int call, u32 __user *args);

#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */


/*
 * For most but not all architectures, "am I in a compat syscall?" and
 * "am I a compat task?" are the same question.  For architectures on which
 * they aren't the same question, arch code can override in_compat_syscall.
 */

#ifndef in_compat_syscall
/* Generic fallback: report whether the current task is a compat task. */
static inline bool in_compat_syscall(void)
{
        return is_compat_task();
}
#endif

/**
 * ns_to_old_timeval32 - Convert nanoseconds to a struct old_timeval32
 * @nsec:        the nanoseconds value to be converted
 *
 * The value is first converted with ns_to_kernel_old_timeval(); the
 * resulting seconds and microseconds are then copied into the compat
 * structure.
 *
 * Returns the old_timeval32 representation of the nsec parameter.
 */
static inline struct old_timeval32 ns_to_old_timeval32(s64 nsec)
{
        const struct __kernel_old_timeval ktv = ns_to_kernel_old_timeval(nsec);
        struct old_timeval32 result = {
                .tv_sec  = ktv.tv_sec,
                .tv_usec = ktv.tv_usec,
        };

        return result;
}

/*
 * Kernel code should not call compat syscalls (i.e., compat_sys_xyzyyz())
 * directly.  Instead, use one of the functions which work equivalently, such
 * as the kcompat_sys_xyzyyz() functions prototyped below.
 */

int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz,
                     struct compat_statfs64 __user * buf);
int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
                          struct compat_statfs64 __user * buf);

#else /* !CONFIG_COMPAT */

#define is_compat_task() (0)
/* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */
#define in_compat_syscall in_compat_syscall
/* Without CONFIG_COMPAT there are no compat syscalls, so always false. */
static inline bool in_compat_syscall(void)
{
        return false;
}

#endif /* CONFIG_COMPAT */

/*
 * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit
 * types, and will need special compat treatment for that.  Most architectures
 * don't need that special handling even for compat syscalls.
 */
#ifndef compat_need_64bit_alignment_fixup
#define compat_need_64bit_alignment_fixup()                false
#endif

/*
 * A pointer passed in from user mode.  This should not be used for
 * syscall parameters: just declare those as pointers, because the
 * syscall entry code will already have converted them appropriately.
 */
#ifndef compat_ptr
static inline void __user *compat_ptr(compat_uptr_t uptr)
{
        /* Widen the compat value to a native word before forming the pointer. */
        unsigned long addr = (unsigned long)uptr;

        return (void __user *)addr;
}
#endif

/* Convert a native user pointer to its compat_uptr_t form via a u32 cast. */
static inline compat_uptr_t ptr_to_compat(void __user *uptr)
{
        unsigned long addr = (unsigned long)uptr;

        return (u32)addr;
}

#endif /* _LINUX_COMPAT_H */































































    3 











    3 



















































































































    4 

















    4 




    4 




    1 










































































































































































































































































    1 








































































































































































































































































































































































    1 


    1 




    1 
    1 



    1 











    1 
























    1 

    1 










    1 











    1 


    1 



    1 





    1 


    1 






    1 


































    1 

























    1 

    1 
















    1 


    1 






















    1 














    1 



    1 
    1 





































































































    4 





























































































    4 

    4 
    4 
    4 





















    4 
    4 
    4 

    3 
    3 
    3 

    3 



    3 






    4 
















    4 




    4 















    4 



    1 
    3 















































    1 














    1 
    1 

    1 




    1 



    1 




    1 



    1 


































































    1 








    1 

    1 


    1 











    1 



    1 






    1 
    1 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














    1 





    1 






    1 





















































































































































    1 






    1 






    1 



    1 




    1 
    1 



    1 




    1 







    1 

























    1 



    1 


    1 



































































    1 

    1 




    1 

    1 





    1 
    1 



    1 


























    1 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>

#include "internal.h"

/* Forward declarations of static helpers used by code in this file. */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         enum rw_hint hint, struct writeback_control *wbc);

/*
 * Map a list node that is embedded in a buffer_head as b_assoc_buffers
 * back to the buffer_head that contains it.
 */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

/*
 * Record an access to @bh: emit the block_touch_buffer trace event, then
 * mark the page backing the buffer as accessed.
 */
inline void touch_buffer(struct buffer_head *bh)
{
        trace_block_touch_buffer(bh);
        mark_page_accessed(bh->b_page);
}
EXPORT_SYMBOL(touch_buffer);

/*
 * Take the BH_Lock bit in bh->b_state through wait_on_bit_lock_io(),
 * using TASK_UNINTERRUPTIBLE as the wait state.
 */
void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

/*
 * Clear the BH_Lock bit in bh->b_state and wake up anyone waiting on
 * that bit.
 */
void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        /* Memory barrier between clearing the bit and issuing the wake-up. */
        smp_mb__after_atomic();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Reports whether the page has dirty or writeback buffers. If all the
 * buffers are unlocked and clean then the PageDirty information is stale.
 * If any of the buffers are locked, it is assumed they are locked for IO.
 */
void buffer_check_dirty_writeback(struct page *page,
                                     bool *dirty, bool *writeback)
{
        struct buffer_head *first, *cur;

        /* Both outputs start out false and are only ever raised below. */
        *dirty = false;
        *writeback = false;

        /* The caller must pass in a locked page. */
        BUG_ON(!PageLocked(page));

        /* A page without buffers leaves both outputs false. */
        if (!page_has_buffers(page))
                return;

        if (PageWriteback(page))
                *writeback = true;

        /* Visit every buffer on the circular b_this_page ring once. */
        first = page_buffers(page);
        cur = first;
        do {
                /* A locked buffer is reported as writeback. */
                if (buffer_locked(cur))
                        *writeback = true;

                if (buffer_dirty(cur))
                        *dirty = true;

                cur = cur->b_this_page;
        } while (cur != first);
}
EXPORT_SYMBOL(buffer_check_dirty_writeback);

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
        /* Wait on the BH_Lock bit of b_state; the bit itself is not taken. */
        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

/*
 * Log a rate-limited I/O error message for @bh, naming its device and
 * logical block, with @msg appended.  Nothing is logged when the buffer
 * has BH_Quiet set.
 */
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
        if (test_bit(BH_Quiet, &bh->b_state))
                return;

        printk_ratelimited(KERN_ERR
                "Buffer I/O error on dev %pg, logical block %llu%s\n",
                bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }

        /* Unlocking is the final step; nothing follows it here. */
        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        /*
         * Update the uptodate state and unlock the buffer while we still
         * hold our reference, and only then drop that reference.  Putting
         * the bh first would leave the state update and unlock operating
         * on a buffer we no longer hold a reference to.  This matches the
         * unlock-then-put order used by end_buffer_write_sync().
         */
        __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

/*
 * Synchronous end-of-IO handler for writes.  On success the buffer is
 * marked uptodate; on failure the error is logged, the write error is
 * recorded on the buffer and its uptodate state is cleared.  In both
 * cases the buffer is then unlocked and a reference on it is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                buffer_io_error(bh, ", lost sync page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, private_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
        struct inode *bd_inode = bdev->bd_inode;
        struct address_space *bd_mapping = bd_inode->i_mapping;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct page *page;
        int all_mapped = 1;
        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

        /* Page index in the blockdev mapping that would hold @block. */
        index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
        page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
        if (!page)
                goto out;

        /* private_lock is held while the page's buffer ring is walked. */
        spin_lock(&bd_mapping->private_lock);
        if (!page_has_buffers(page))
                goto out_unlock;
        head = page_buffers(page);
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        /* Found it: return it with an extra reference. */
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped.  This is due to various races between
         * file io on the block device and getblk.  It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         */
        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
        if (all_mapped && __ratelimit(&last_warned)) {
                /* The loop ended with bh == head, so head's fields are shown. */
                printk("__find_get_block_slow() failed. block=%llu, "
                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                       "device %pg blocksize: %d\n",
                       (unsigned long long)block,
                       (unsigned long long)bh->b_blocknr,
                       bh->b_state, bh->b_size, bdev,
                       1 << bd_inode->i_blkbits);
        }
out_unlock:
        spin_unlock(&bd_mapping->private_lock);
        /* Drop the page reference taken by find_get_page_flags(). */
        put_page(page);
out:
        return ret;
}

/*
 * Finish an async read on @bh.  The buffer's uptodate state is updated
 * (an error is also logged and recorded on the page), its async-read flag
 * is cleared and it is unlocked.  If no other buffer on the page still has
 * the async-read flag set, the page is marked uptodate when every buffer
 * is uptodate and the page has no error, and the page is unlocked.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;
        int page_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                buffer_io_error(bh, ", async page read");
                SetPageError(page);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = page_buffers(page);
        /* The first buffer's b_uptodate_lock serialises this scan. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        /* Another buffer is still under async read. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

        /*
         * If none of the buffers had errors and they are all
         * uptodate then we can set the page uptodate.
         */
        if (page_uptodate && !PageError(page))
                SetPageUptodate(page);
        unlock_page(page);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * Context for a deferred decryption of one buffer: allocated in
 * end_buffer_async_read_io() and freed by decrypt_bh().
 */
struct decrypt_bh_ctx {
        struct work_struct work;        /* work item that runs decrypt_bh() */
        struct buffer_head *bh;         /* buffer whose data gets decrypted */
};

/*
 * Work handler: decrypt the data of the buffer recorded in the enclosing
 * decrypt_bh_ctx, complete the async read (uptodate only if decryption
 * returned 0) and free the context.
 */
static void decrypt_bh(struct work_struct *work)
{
        struct decrypt_bh_ctx *ctx;
        struct buffer_head *bh;
        int err;

        ctx = container_of(work, struct decrypt_bh_ctx, work);
        bh = ctx->bh;

        err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
                                               bh_offset(bh));
        end_buffer_async_read(bh, !err);
        kfree(ctx);
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
        struct decrypt_bh_ctx *ctx;

        /* Failed reads and inodes without fs-layer crypto complete directly. */
        if (!uptodate ||
            !fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
                end_buffer_async_read(bh, uptodate);
                return;
        }

        /* Decrypt if needed */
        ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
        if (!ctx) {
                /* No context for the decrypt work: complete as not uptodate. */
                end_buffer_async_read(bh, 0);
                return;
        }

        /* decrypt_bh() finishes the read and frees ctx. */
        INIT_WORK(&ctx->work, decrypt_bh);
        ctx->bh = bh;
        fscrypt_enqueue_decrypt_work(&ctx->work);
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                /* Log the failure and record it on both buffer and page. */
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        /* The first buffer's b_uptodate_lock serialises this scan. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        /* Look at every other buffer on the page's ring. */
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        /* Another buffer is still under async write. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        /* No buffer is left under async write: end writeback on the page. */
        end_page_writeback(page);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone starting new async I/O reads against any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        /* Completion goes through end_buffer_async_read_io(). */
        bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
}

/*
 * Install @handler as the end-of-IO callback of @bh and set the buffer's
 * async-write flag.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

/*
 * Mark @bh for async write with end_buffer_async_write() as its
 * completion handler.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space 
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        /* Unlink from the association list, leaving the node re-initialised. */
        list_del_init(&bh->b_assoc_buffers);
        /* Warn if the buffer was not recorded as associated with a mapping. */
        WARN_ON(!bh->b_assoc_map);
        bh->b_assoc_map = NULL;
}

/*
 * Return non-zero when the private_list of the inode's embedded i_data
 * mapping is not empty.
 */
int inode_has_buffers(struct inode *inode)
{
        struct list_head *assoc = &inode->i_data.private_list;

        return !list_empty(assoc);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        /* Scan from the tail of the list looking for a locked buffer. */
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        /*
                         * Hold a reference across the wait; the list lock
                         * is dropped while waiting.
                         */
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        /* The lock was dropped, so restart the scan. */
                        goto repeat;
                }
        }
        spin_unlock(lock);
        /* 0, or -EIO if any waited-on buffer ended up not uptodate. */
        return err;
}

/*
 * Call thaw_bdev() on sb->s_bdev for as long as it keeps returning 0,
 * printing a warning after each such call.  Does nothing when sb->s_bdev
 * is NULL.
 */
void emergency_thaw_bdev(struct super_block *sb)
{
        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *buffer_mapping = mapping->private_data;

        /* No backing mapping recorded: nothing to sync. */
        if (buffer_mapping == NULL)
                return 0;

        /* No associated buffers queued: nothing to sync. */
        if (list_empty(&mapping->private_list))
                return 0;

        /* The list is protected by the backing mapping's private_lock. */
        return fsync_buffers_list(&buffer_mapping->private_lock,
                                        &mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize)
{
        struct buffer_head *bh;

        /* Look up the buffer for the block right after the boundary block. */
        bh = __find_get_block(bdev, bblock + 1, blocksize);
        if (!bh)
                return;

        if (buffer_dirty(bh))
                ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);

        /* Drop the reference obtained from the lookup. */
        put_bh(bh);
}

/*
 * Mark @bh dirty and, unless it is already associated with a mapping,
 * queue it on the private_list of @inode's mapping.  The list update is
 * done under the private_lock of the mapping that owns the buffer's page.
 */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct address_space *buffer_mapping = bh->b_page->mapping;

        mark_buffer_dirty(bh);

        /* Record the backing mapping once; afterwards it must not change. */
        if (mapping->private_data) {
                BUG_ON(mapping->private_data != buffer_mapping);
        } else {
                mapping->private_data = buffer_mapping;
        }

        /* Lockless check: already associated, nothing more to do. */
        if (bh->b_assoc_map)
                return;

        spin_lock(&buffer_mapping->private_lock);
        list_move_tail(&bh->b_assoc_buffers,
                        &mapping->private_list);
        bh->b_assoc_map = mapping;
        spin_unlock(&buffer_mapping->private_lock);
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Mark the page dirty, and set it dirty in the page cache, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 *
 * The caller must hold lock_page_memcg().
 */
void __set_page_dirty(struct page *page, struct address_space *mapping,
                             int warn)
{
        unsigned long flags;

        /* The i_pages xarray lock is held, with interrupts saved. */
        xa_lock_irqsave(&mapping->i_pages, flags);
        if (page->mapping) {        /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
                account_page_dirtied(page, mapping);
                /* Tag the page's slot in the mapping as dirty. */
                __xa_set_mark(&mapping->i_pages, page_index(page),
                                PAGECACHE_TAG_DIRTY);
        }
        xa_unlock_irqrestore(&mapping->i_pages, flags);
}
EXPORT_SYMBOL_GPL(__set_page_dirty);

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
        int newly_dirty;
        struct address_space *mapping = page_mapping(page);

        /* No mapping: only the page flag is set; report if it was clear. */
        if (unlikely(!mapping))
                return !TestSetPageDirty(page);

        spin_lock(&mapping->private_lock);
        if (page_has_buffers(page)) {
                struct buffer_head *head = page_buffers(page);
                struct buffer_head *bh = head;

                /* Dirty every buffer on the page before the page itself. */
                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        /*
         * Lock out page->mem_cgroup migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        lock_page_memcg(page);
        newly_dirty = !TestSetPageDirty(page);
        spin_unlock(&mapping->private_lock);

        /* Only a clean -> dirty transition updates the page cache state. */
        if (newly_dirty)
                __set_page_dirty(page, mapping, 1);

        unlock_page_memcg(page);

        /* The inode is marked dirty after the memcg page lock is released. */
        if (newly_dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        /* Non-zero when this call made the page dirty. */
        return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 * 
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;

        INIT_LIST_HEAD(&tmp);
        blk_start_plug(&plug);

        /*
         * Stage one: drain @list.  Buffers that are dirty or locked move
         * to the private tmp list; dirty ones are also submitted for write.
         */
        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                /* Saved because __remove_assoc_queue() clears b_assoc_map. */
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                /* Keep a reference while the lock is dropped. */
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, REQ_SYNC);

                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        /* The plug is finished with the list lock released. */
        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        /*
         * Stage two: wait on everything parked on tmp, taking entries
         * from its tail.
         */
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh)) {
                        /* Dirty again: put it back on its mapping's list. */
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        /* Final pass: wait for buffers on @list that are still locked. */
        err2 = osync_buffers_list(lock, list);
        /* An error from the wait stage takes precedence over the osync one. */
        if (err)
                return err;
        else
                return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        struct address_space *mapping;
        struct address_space *buffer_mapping;
        struct list_head *list;

        if (!inode_has_buffers(inode))
                return;

        mapping = &inode->i_data;
        list = &mapping->private_list;
        buffer_mapping = mapping->private_data;

        /* Detach every queued buffer under the backing mapping's lock. */
        spin_lock(&buffer_mapping->private_lock);
        while (!list_empty(list))
                __remove_assoc_queue(BH_ENTRY(list->next));
        spin_unlock(&buffer_mapping->private_lock);
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
        struct address_space *mapping;
        struct address_space *buffer_mapping;
        struct list_head *list;
        int all_removed = 1;

        /* An inode with no queued buffers trivially succeeds. */
        if (!inode_has_buffers(inode))
                return 1;

        mapping = &inode->i_data;
        list = &mapping->private_list;
        buffer_mapping = mapping->private_data;

        spin_lock(&buffer_mapping->private_lock);
        while (!list_empty(list)) {
                struct buffer_head *bh = BH_ENTRY(list->next);

                /* Stop at the first dirty buffer; it stays on the list. */
                if (buffer_dirty(bh)) {
                        all_removed = 0;
                        break;
                }
                __remove_assoc_queue(bh);
        }
        spin_unlock(&buffer_mapping->private_lock);

        return all_removed;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.  Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                bool retry)
{
        struct buffer_head *bh, *head = NULL;
        struct mem_cgroup *memcg, *old_memcg;
        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
        long offset = PAGE_SIZE;

        if (retry)
                gfp |= __GFP_NOFAIL;

        /* Make the page's memcg the active one during the allocations. */
        memcg = get_mem_cgroup_from_page(page);
        old_memcg = set_active_memcg(memcg);

        /* Walk from the end of the page down, prepending each new buffer. */
        while ((offset -= size) >= 0) {
                bh = alloc_buffer_head(gfp);
                if (!bh) {
                        /* On failure, free everything allocated so far. */
                        while (head) {
                                bh = head;
                                head = head->b_this_page;
                                free_buffer_head(bh);
                        }
                        break;
                }

                bh->b_this_page = head;
                bh->b_blocknr = -1;
                bh->b_size = size;
                head = bh;

                /* Link the buffer to its page */
                set_bh_page(bh, page, offset);
        }

        /* Restore the previously active memcg and drop our reference. */
        set_active_memcg(old_memcg);
        mem_cgroup_put(memcg);
        return head;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

/*
 * Close the NULL-terminated b_this_page chain starting at @head into a
 * ring (last buffer points back to @head) and attach @head to @page as
 * its private data.
 */
static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
        struct buffer_head *tail = head;

        /* Find the last buffer of the chain. */
        while (tail->b_this_page)
                tail = tail->b_this_page;

        tail->b_this_page = head;
        attach_page_private(page, head);
}

/*
 * Return the size of @bdev's inode shifted right by the block-size bits
 * of @size, i.e. the device size expressed in blocks of @size bytes.
 * When the inode size reads as 0, an all-ones sector_t is returned.
 */
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
        loff_t sz = i_size_read(bdev->bd_inode);

        if (!sz)
                return ~((sector_t)0);

        return sz >> blksize_bits(size);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */ 
static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
                        sector_t block, int size)
{
        struct buffer_head *first = page_buffers(page);
        struct buffer_head *cur = first;
        int page_is_uptodate = PageUptodate(page);
        sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);

        /*
         * Go round the page's buffer ring once; consecutive buffers
         * correspond to consecutive block numbers starting at @block.
         */
        do {
                /* Buffers that are already mapped are left untouched. */
                if (!buffer_mapped(cur)) {
                        cur->b_end_io = NULL;
                        cur->b_private = NULL;
                        cur->b_bdev = bdev;
                        cur->b_blocknr = block;
                        if (page_is_uptodate)
                                set_buffer_uptodate(cur);
                        /* Only blocks below end_block get mapped. */
                        if (block < end_block)
                                set_buffer_mapped(cur);
                }
                block++;
                cur = cur->b_this_page;
        } while (cur != first);

        /*
         * Caller needs to validate requested block against end of device.
         */
        return end_block;
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static int
grow_dev_page(struct block_device *bdev, sector_t block,
              pgoff_t index, int size, int sizebits, gfp_t gfp)
{
        struct inode *bd_inode = bdev->bd_inode;
        /* Number of the first block that lives in page @index. */
        sector_t first_block = (sector_t)index << sizebits;
        sector_t end_block;
        struct buffer_head *bufs;
        struct page *page;
        gfp_t mask;
        int ret = 0;

        mask = mapping_gfp_constraint(bd_inode->i_mapping, ~__GFP_FS) | gfp;

        /*
         * XXX: __getblk_slow() can not really deal with failure and
         * will endlessly loop on improvised global reclaim.  Prefer
         * looping in the allocator rather than here, at least that
         * code knows what it's doing.
         */
        mask |= __GFP_NOFAIL;

        page = find_or_create_page(bd_inode->i_mapping, index, mask);

        BUG_ON(!PageLocked(page));

        if (page_has_buffers(page)) {
                bufs = page_buffers(page);
                if (bufs->b_size == size) {
                        /* Existing buffers already have the right size. */
                        end_block = init_page_buffers(page, bdev,
                                                      first_block, size);
                        goto check_range;
                }
                /* Wrong size: the old buffers must go before we continue. */
                if (!try_to_free_buffers(page))
                        goto release;
        }

        /* Allocate a fresh set of buffers for this page. */
        bufs = alloc_page_buffers(page, size, true);

        /*
         * Link the page to the buffers and initialise them.  Take the
         * lock to be atomic wrt __find_get_block(), which does not
         * run under the page lock.
         */
        spin_lock(&bd_inode->i_mapping->private_lock);
        link_dev_buffers(page, bufs);
        end_block = init_page_buffers(page, bdev, first_block, size);
        spin_unlock(&bd_inode->i_mapping->private_lock);

check_range:
        /* 1 when @block is below the device end, -ENXIO otherwise. */
        if (block < end_block)
                ret = 1;
        else
                ret = -ENXIO;
release:
        unlock_page(page);
        put_page(page);
        return ret;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
        int sizebits = 0;
        pgoff_t index;

        /* Smallest shift for which (size << sizebits) reaches PAGE_SIZE. */
        while ((size << sizebits) < PAGE_SIZE)
                sizebits++;

        index = block >> sizebits;

        /*
         * Check for a block which wants to lie outside our maximum possible
         * pagecache index.  (this comparison is done using sector_t types).
         */
        if (unlikely(index != block >> sizebits)) {
                printk(KERN_ERR "%s: requested out-of-range block %llu for "
                        "device %pg\n",
                        __func__, (unsigned long long)block,
                        bdev);
                return -EIO;
        }

        /* Create a page with the proper size buffers.. */
        return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}

/*
 * Slow path of __getblk_gfp(): validate @size, then alternate between
 * looking the buffer up and growing the page's buffers until the lookup
 * succeeds.
 *
 * Returns the buffer_head found by __find_get_block(), or NULL when @size
 * is invalid or grow_buffers() reports an error.
 */
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                /*
                 * Both values are unsigned; print them with %u so that an
                 * oversized bogus request is not shown as a negative number.
                 */
                printk(KERN_ERR "getblk(): invalid block size %u requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %u\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;
                int ret;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                /* Not cached yet: create the buffers, then look again. */
                ret = grow_buffers(bdev, block, size, gfp);
                if (ret < 0)
                        return NULL;
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in the page cache.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
 * its backing page dirty, then tag the page as dirty in the page cache
 * and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * i_pages lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        /* Dirtying a buffer that is not uptodate is unexpected: warn once. */
        WARN_ON_ONCE(!buffer_uptodate(bh));

        trace_block_dirty_buffer(bh);

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                /* Full barrier between the two dirty checks. */
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        /* Slow path: taken only when test_set_buffer_dirty() returns 0. */
        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
                /*
                 * Stays NULL unless the TestSetPageDirty() branch below is
                 * taken, so the inode is only marked dirty in that case.
                 */
                struct address_space *mapping = NULL;

                lock_page_memcg(page);
                if (!TestSetPageDirty(page)) {
                        mapping = page_mapping(page);
                        if (mapping)
                                __set_page_dirty(page, mapping, 0);
                }
                unlock_page_memcg(page);
                /* Done after unlock_page_memcg(), outside that section. */
                if (mapping)
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty);

void mark_buffer_write_io_error(struct buffer_head *bh)
{
        struct super_block *sb;

        set_buffer_write_io_error(bh);
        /* FIXME: do we need to set this in both places? */
        if (bh->b_page && bh->b_page->mapping)
                mapping_set_error(bh->b_page->mapping, -EIO);
        if (bh->b_assoc_map)
                mapping_set_error(bh->b_assoc_map, -EIO);
        rcu_read_lock();
        sb = READ_ONCE(bh->b_bdev->bd_super);
        if (sb)
                errseq_set(&sb->s_wb_err, -EIO);
        rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head *buf)
{
        /* A zero count means the caller holds no reference to drop. */
        if (!atomic_read(&buf->b_count)) {
                WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
                return;
        }

        put_bh(buf);
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
        struct address_space *mapping;

        clear_buffer_dirty(bh);
        if (!bh->b_assoc_map)
                goto release;

        /*
         * Unhook the buffer from its association list under the
         * private_lock of the page's mapping.
         */
        mapping = bh->b_page->mapping;
        spin_lock(&mapping->private_lock);
        list_del_init(&bh->b_assoc_buffers);
        bh->b_assoc_map = NULL;
        spin_unlock(&mapping->private_lock);

release:
        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

/*
 * Read @bh synchronously unless it turns out to be uptodate once locked.
 * Returns @bh when it is uptodate; otherwise releases it with brelse() and
 * returns NULL.
 */
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);

        /* Someone else may have filled the buffer before we got the lock. */
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return bh;
        }

        /* Extra reference taken for the submitted read. */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, 0, bh);
        wait_on_buffer(bh);

        if (!buffer_uptodate(bh)) {
                brelse(bh);
                return NULL;
        }
        return bh;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU array. */
#define BH_LRU_SIZE        16

/* One LRU: a fixed-size array of buffer_head pointers; empty slots are NULL. */
struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

/* One bh_lru per CPU, statically initialised with NULL slots. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to the local CPU's LRU.
 * With CONFIG_SMP they disable/enable local interrupts; otherwise they
 * only disable/enable preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()        local_irq_disable()
#define bh_lru_unlock()        local_irq_enable()
#else
#define bh_lru_lock()        preempt_disable()
#define bh_lru_unlock()        preempt_enable()
#endif

/*
 * Sanity check made by the LRU helpers before they take bh_lru_lock():
 * BUG if interrupts are disabled on entry.  Compiles to nothing when
 * irqs_disabled is not defined as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
 * inserted at the front, and the buffer_head at the back if any is evicted.
 * Or, if already in the LRU it is moved to the front.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        struct buffer_head *displaced = bh;
        struct bh_lru *lru;
        int slot;

        check_irqs_on();
        bh_lru_lock();

        lru = this_cpu_ptr(&bh_lrus);

        /*
         * Ripple @bh in at slot 0, pushing each existing entry one slot
         * further back.  If @bh itself pops out it was already cached
         * and has now simply moved to the front.
         */
        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                swap(displaced, lru->bhs[slot]);
                if (displaced == bh)
                        goto already_cached;
        }

        /* New entry: the LRU holds its own reference on @bh. */
        get_bh(bh);
        bh_lru_unlock();
        /* Drop the reference of whatever fell off the end (may be NULL). */
        brelse(displaced);
        return;

already_cached:
        bh_lru_unlock();
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *found = NULL;
        unsigned int slot;

        check_irqs_on();
        bh_lru_lock();
        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[slot]);
                unsigned int pos;

                if (!bh || bh->b_blocknr != block || bh->b_bdev != bdev ||
                    bh->b_size != size)
                        continue;

                /*
                 * Hit.  Unless it already sits at the front, slide the
                 * entries ahead of it back by one and put it in slot 0.
                 */
                if (slot) {
                        for (pos = slot; pos; pos--)
                                __this_cpu_write(bh_lrus.bhs[pos],
                                        __this_cpu_read(bh_lrus.bhs[pos - 1]));
                        __this_cpu_write(bh_lrus.bhs[0], bh);
                }

                /* The caller gets its own reference. */
                get_bh(bh);
                found = bh;
                break;
        }
        bh_lru_unlock();
        return found;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        /* LRU hit: just note the access. */
        if (bh) {
                touch_buffer(bh);
                return bh;
        }

        /* __find_get_block_slow will mark the page accessed */
        bh = __find_get_block_slow(bdev, block);
        if (bh)
                bh_lru_install(bh);

        return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk_gfp() will lock up the machine if grow_dev_page's
 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
 */
struct buffer_head *
__getblk_gfp(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        struct buffer_head *bh;

        bh = __find_get_block(bdev, block, size);

        /* The slow path may sleep; assert that even when the lookup hits. */
        might_sleep();
        if (bh)
                return bh;

        return __getblk_slow(bdev, block, size, gfp);
}
EXPORT_SYMBOL(__getblk_gfp);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = __getblk(bdev, block, size);

        if (unlikely(!bh))
                return;

        /* Queue a read-ahead request and drop our reference right away. */
        ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
        brelse(bh);
}
EXPORT_SYMBOL(__breadahead);

/*
 * Same as __breadahead(), except that the buffer is obtained through
 * __getblk_gfp() with the caller-supplied @gfp.
 */
void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
                      gfp_t gfp)
{
        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);

        if (unlikely(!bh))
                return;

        /* Queue a read-ahead request and drop our reference right away. */
        ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
        brelse(bh);
}
EXPORT_SYMBOL(__breadahead_gfp);

/**
 *  __bread_gfp() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
 *  @block: number of block
 *  @size: size (in bytes) to read
 *  @gfp: page allocation flag
 *
 *  Reads a specified block, and returns buffer head that contains it.
 *  The page cache can be allocated from non-movable area
 *  not to prevent page migration if you set gfp to zero.
 *  It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
                   unsigned size, gfp_t gfp)
{
        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);

        /* No buffer at all: hand the NULL straight back. */
        if (unlikely(!bh))
                return bh;

        /* Already uptodate: no I/O needed. */
        if (buffer_uptodate(bh))
                return bh;

        return __bread_slow(bh);
}
EXPORT_SYMBOL(__bread_gfp);

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
        struct bh_lru *lru = &get_cpu_var(bh_lrus);
        int slot = 0;

        /* Release every cached buffer and empty its slot. */
        while (slot < BH_LRU_SIZE) {
                brelse(lru->bhs[slot]);
                lru->bhs[slot] = NULL;
                slot++;
        }
        put_cpu_var(bh_lrus);
}

static bool has_bh_in_lru(int cpu, void *dummy)
{
        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
        int i;
        
        for (i = 0; i < BH_LRU_SIZE; i++) {
                if (b->bhs[i])
                        return true;
        }

        return false;
}

/*
 * Empty the per-CPU buffer LRUs: invalidate_bh_lru() is run through
 * on_each_cpu_cond() with has_bh_in_lru() as the per-CPU condition, so
 * presumably only CPUs that actually cache buffers are disturbed.
 * NOTE(review): the final argument (1) is assumed to mean "wait for
 * completion" - confirm against on_each_cpu_cond().
 */
void invalidate_bh_lrus(void)
{
        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

/*
 * Point @bh at @page and set b_data for the given byte @offset within the
 * page.  @offset must be smaller than PAGE_SIZE.
 */
void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset)
{
        bh->b_page = page;
        BUG_ON(offset >= PAGE_SIZE);

        if (!PageHighMem(page)) {
                bh->b_data = page_address(page) + offset;
                return;
        }

        /*
         * This catches illegal uses and preserves the offset:
         */
        bh->b_data = (char *)(0 + offset);
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */

/* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
         1 << BH_Delay | 1 << BH_Unwritten)

/*
 * Under the buffer lock: clear the dirty bit, forget the block device and
 * atomically strip every BUFFER_FLAGS_DISCARD bit from b_state.
 */
static void discard_buffer(struct buffer_head *bh)
{
        unsigned long seen, expected;

        lock_buffer(bh);
        clear_buffer_dirty(bh);
        bh->b_bdev = NULL;

        /* Retry the cmpxchg until b_state did not change underneath us. */
        seen = bh->b_state;
        do {
                expected = seen;
                seen = cmpxchg(&bh->b_state, expected,
                               (expected & ~BUFFER_FLAGS_DISCARD));
        } while (seen != expected);

        unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned int offset,
                          unsigned int length)
{
        struct buffer_head *first, *cur, *following;
        unsigned int pos = 0;
        unsigned int stop = length + offset;

        BUG_ON(!PageLocked(page));
        if (!page_has_buffers(page))
                return;

        /*
         * Check for overflow
         */
        BUG_ON(stop > PAGE_SIZE || stop < length);

        first = page_buffers(page);
        cur = first;
        do {
                unsigned int end = pos + cur->b_size;

                following = cur->b_this_page;

                /* This buffer extends past the range: nothing more to do. */
                if (end > stop)
                        return;

                /* Discard only buffers that start at or after @offset. */
                if (offset <= pos)
                        discard_buffer(cur);

                pos = end;
                cur = following;
        } while (cur != first);

        /*
         * We release buffers only if the entire page is being invalidated.
         * The get_block cached value has been unconditionally invalidated,
         * so real IO is not possible anymore.
         */
        if (length == PAGE_SIZE)
                try_to_release_page(page, 0);
}
EXPORT_SYMBOL(block_invalidatepage);


/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
 * is already excluded via the page lock.
 */
void create_empty_buffers(struct page *page,
                        unsigned long blocksize, unsigned long b_state)
{
        struct buffer_head *first, *last, *cur;

        first = alloc_page_buffers(page, blocksize, true);

        /* OR @b_state into every buffer while looking for the chain's end. */
        last = first;
        last->b_state |= b_state;
        while (last->b_this_page) {
                last = last->b_this_page;
                last->b_state |= b_state;
        }
        /* Close the chain into a ring. */
        last->b_this_page = first;

        spin_lock(&page->mapping->private_lock);
        if (PageUptodate(page) || PageDirty(page)) {
                /* Propagate the page's dirty/uptodate state to each buffer. */
                cur = first;
                do {
                        if (PageDirty(page))
                                set_buffer_dirty(cur);
                        if (PageUptodate(page))
                                set_buffer_uptodate(cur);
                        cur = cur->b_this_page;
                } while (cur != first);
        }
        attach_page_private(page, first);
        spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);

/**
 * clean_bdev_aliases: clean a range of buffers in block device
 * @bdev: Block device to clean buffers in
 * @block: Start of a range of blocks to clean
 * @len: Number of blocks to clean
 *
 * We are taking a range of blocks for data and we don't want writeback of any
 * buffer-cache aliases starting from return from this function and until the
 * moment when something will explicitly mark the buffer dirty (hopefully that
 * will not happen until we will free that block ;-) We don't even need to mark
 * it not-uptodate - nobody can expect anything from a newly allocated buffer
 * anyway. We used to use unmap_buffer() for such invalidation, but that was
 * wrong. We definitely don't want to mark the alias unmapped, for example - it
 * would confuse anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
 * writeout I/O going on against recently-freed buffers.  We don't wait on that
 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
 * need to.  That happens here.
 */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
        struct inode *bd_inode = bdev->bd_inode;
        struct address_space *bd_mapping = bd_inode->i_mapping;
        struct pagevec pvec;
        /* Page-cache index of the page holding the first block of the range. */
        pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
        pgoff_t end;
        int i, count;
        struct buffer_head *bh;
        struct buffer_head *head;

        /* Page-cache index of the page holding the last block of the range. */
        end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
        pagevec_init(&pvec);
        /* Process the pages in [index, end] one pagevec batch at a time. */
        while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
                count = pagevec_count(&pvec);
                for (i = 0; i < count; i++) {
                        struct page *page = pvec.pages[i];

                        /* Unlocked peek: skip pages that have no buffers. */
                        if (!page_has_buffers(page))
                                continue;
                        /*
                         * We use page lock instead of bd_mapping->private_lock
                         * to pin buffers here since we can afford to sleep and
                         * it scales better than a global spinlock lock.
                         */
                        lock_page(page);
                        /* Recheck when the page is locked which pins bhs */
                        if (!page_has_buffers(page))
                                goto unlock_page;
                        head = page_buffers(page);
                        bh = head;
                        do {
                                /* Unmapped or below the range: skip this bh. */
                                if (!buffer_mapped(bh) || (bh->b_blocknr < block))
                                        goto next;
                                /* At or past the range end: done with this page. */
                                if (bh->b_blocknr >= block + len)
                                        break;
                                /*
                                 * In range: clear dirty, wait on the buffer,
                                 * then clear the req flag.
                                 */
                                clear_buffer_dirty(bh);
                                wait_on_buffer(bh);
                                clear_buffer_req(bh);
next:
                                bh = bh->b_this_page;
                        } while (bh != head);
unlock_page:
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                cond_resched();
                /* End of range already reached? */
                /*
                 * NOTE(review): !index presumably catches the lookup index
                 * wrapping around to zero - confirm against
                 * pagevec_lookup_range().
                 */
                if (index > end || !index)
                        break;
        }
}
EXPORT_SYMBOL(clean_bdev_aliases);

/*
 * Return ilog2() of @blocksize; callers in this file use the result as a
 * shift count for converting between byte offsets and block numbers.
 *
 * Size is a power-of-two in the range 512..PAGE_SIZE,
 * and the case we care about most is PAGE_SIZE.
 *
 * So this *could* possibly be written with those
 * constraints in mind (relevant mostly if some
 * architecture has a slow bit-scan instruction)
 */
static inline int block_size_bits(unsigned int blocksize)
{
        return ilog2(blocksize);
}

/*
 * Return the buffers attached to the locked @page.  If the page has none,
 * empty buffers of the inode's block size are created first, each with
 * @b_state OR-ed into its state.
 */
static struct buffer_head *
create_page_buffers(struct page *page, struct inode *inode,
                    unsigned int b_state)
{
        BUG_ON(!PageLocked(page));

        if (page_has_buffers(page))
                return page_buffers(page);

        create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), b_state);
        return page_buffers(page);
}

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *        Mapped        Uptodate        Meaning
 *
 *        No        No                "unknown" - must do get_block()
 *        No        Yes                "hole" - zero-filled
 *        Yes        No                "allocated" - allocated on disk, not read in
 *        Yes        Yes                "valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
/*
 * Write out the dirty buffers of the locked @page.  Dirty buffers that are
 * unmapped or delayed are mapped with @get_block first; the buffers are
 * then submitted with @handler as their completion callback.  Returns 0,
 * or the error from @get_block after submitting whatever was already
 * mapped (see the recover path).  The page is unlocked on both paths.
 */
int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        /* Number of buffers handed to submit_bh_wbc() by this call. */
        int nr_underway = 0;
        int write_flags = wbc_to_write_flags(wbc);

        /* Buffers created here start out dirty and uptodate. */
        head = create_page_buffers(page, inode,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;
        bbits = block_size_bits(blocksize);

        /* First block of this page and last block inside i_size. */
        block = (sector_t)page->index << (PAGE_SHIFT - bbits);
        last_block = (i_size_read(inode) - 1) >> bbits;

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_page()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                clean_bdev_bh_alias(bh);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        /*
         * The loop above ended with bh == head, so this pass starts at the
         * first buffer again.  It locks each mapped buffer and marks the
         * dirty ones for async write.
         */
        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The page and its buffers are protected by PageWriteback(), so we can
         * drop the bh refcounts early.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        /*
         * Submit every buffer marked for async write.  The next pointer is
         * read before submitting; presumably bh must not be touched once
         * its I/O may have completed - confirm.
         */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
                                        inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The page was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * ll_rw_block/submit_bh.  A rare case.
                 */
                end_page_writeback(page);

                /*
                 * The page and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The page is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty page.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        /* Record the get_block() failure on both the page and its mapping. */
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        mapping_set_error(page->mapping, err);
        set_page_writeback(page);
        /* bh == head again here; submit what was marked for async write. */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
                                        inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);
        /* err still holds the get_block() error and is returned via done. */
        goto done;
}
EXPORT_SYMBOL(__block_write_full_page);

/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 */
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
        struct buffer_head *first, *cur;
        unsigned int blk_off, blk_end;

        /* The caller must hold the page lock. */
        BUG_ON(!PageLocked(page));

        /* Nothing to do for a page that carries no buffer_heads. */
        if (!page_has_buffers(page))
                return;

        first = page_buffers(page);
        cur = first;
        blk_off = 0;
        do {
                blk_end = blk_off + cur->b_size;

                /* Only "new" buffers overlapping [from, to) are handled. */
                if (buffer_new(cur) && blk_end > from && blk_off < to) {
                        if (!PageUptodate(page)) {
                                /* Zero just the overlap with [from, to). */
                                unsigned zero_from = max(from, blk_off);
                                unsigned zero_len = min(to, blk_end) - zero_from;

                                zero_user(page, zero_from, zero_len);
                                set_buffer_uptodate(cur);
                        }

                        clear_buffer_new(cur);
                        mark_buffer_dirty(cur);
                }

                blk_off = blk_end;
                cur = cur->b_this_page;
        } while (cur != first);
}
EXPORT_SYMBOL(page_zero_new_buffers);

/*
 * Transfer the mapping described by @iomap onto the buffer_head @bh that
 * backs file block @block of @inode.
 *
 * The buffer's block device is always taken from the iomap.  Depending on
 * the iomap type the buffer is additionally flagged new, uptodate, mapped,
 * delayed and/or unwritten, and for mapped/unwritten extents b_blocknr is
 * computed from the iomap's address.  Other iomap types leave the buffer
 * flags untouched.
 */
static void
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
                struct iomap *iomap)
{
        /* Byte position in the file of the block being mapped. */
        loff_t pos = block << inode->i_blkbits;

        bh->b_bdev = iomap->bdev;

        /*
         * @block is the file block to map and @iomap describes the extent
         * starting at iomap->offset.  The block must start before the end
         * of that extent; anything else is treated as a bug.
         */
        BUG_ON(pos >= iomap->offset + iomap->length);

        switch (iomap->type) {
        case IOMAP_HOLE:
        case IOMAP_DELALLOC:
                /*
                 * A buffer that is not uptodate, or that lies at or beyond
                 * the current EOF, is marked new so that sub-block zeroing
                 * is executed if necessary.
                 */
                if (!buffer_uptodate(bh) || pos >= i_size_read(inode))
                        set_buffer_new(bh);
                if (iomap->type == IOMAP_DELALLOC) {
                        set_buffer_uptodate(bh);
                        set_buffer_mapped(bh);
                        set_buffer_delay(bh);
                }
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_MAPPED:
                if (iomap->type == IOMAP_UNWRITTEN) {
                        /*
                         * Unwritten extents: the parts of the block that
                         * are not being written must end up zeroed, so the
                         * buffer is always marked new.
                         */
                        set_buffer_new(bh);
                        set_buffer_unwritten(bh);
                }
                if ((iomap->flags & IOMAP_F_NEW) || pos >= i_size_read(inode))
                        set_buffer_new(bh);
                bh->b_blocknr = (iomap->addr + pos - iomap->offset) >>
                                inode->i_blkbits;
                set_buffer_mapped(bh);
                break;
        }
}

/*
 * Prepare the buffers of a locked page for a write of @len bytes starting
 * at file position @pos.
 *
 * Buffers overlapping the write range that are not yet mapped are mapped
 * with @get_block (called with create == 1) or, when @get_block is NULL,
 * from @iomap via iomap_to_bh().  Newly allocated blocks get the part that
 * lies outside the write range zeroed; blocks that are only partially
 * overwritten and not uptodate are read in.
 *
 * Returns 0 on success, the error returned by @get_block, or -EIO if a
 * read did not leave its buffer uptodate.  On error, new buffers in the
 * write range are handed to page_zero_new_buffers().
 */
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block, struct iomap *iomap)
{
        /* [from, to) is the write range expressed as offsets within the page */
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize, bbits;
        /* wait[] collects the buffers for which a read was issued below */
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!PageLocked(page));
        /* The write range must lie within this single page. */
        BUG_ON(from > PAGE_SIZE);
        BUG_ON(to > PAGE_SIZE);
        BUG_ON(from > to);

        head = create_page_buffers(page, inode, 0);
        blocksize = head->b_size;
        bbits = block_size_bits(blocksize);

        /* File block number of the first buffer in the page */
        block = (sector_t)page->index << (PAGE_SHIFT - bbits);

        /*
         * Walk the circular buffer list once: "!block_start" lets the loop
         * start at head, "bh != head" stops it when the walk wraps around.
         */
        for(bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        /*
                         * Buffer entirely outside the write range: only
                         * propagate the page's uptodate state to it.
                         */
                        if (PageUptodate(page)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                /* Clear "new"; the mapping step below may set it again. */
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        if (get_block) {
                                err = get_block(inode, block, bh, 1);
                                if (err)
                                        break;
                        } else {
                                iomap_to_bh(inode, block, bh, iomap);
                        }

                        if (buffer_new(bh)) {
                                clean_bdev_bh_alias(bh);
                                if (PageUptodate(page)) {
                                        /*
                                         * The page already holds valid
                                         * data for this block.
                                         */
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Zero the parts of the new block that the
                                 * write will not cover.
                                 */
                                if (block_end > to || block_start < from)
                                        zero_user_segments(page,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (PageUptodate(page)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue; 
                }
                /*
                 * Partially overwritten block that is not uptodate and is
                 * neither delayed nor unwritten: read it in.  Within the
                 * write range only the first and the last block can be
                 * partial, which is what wait[2] is sized for.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
                        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        /* On failure, zero the new buffers in the write range. */
        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
        return err;
}

/*
 * get_block-based front end to __block_write_begin_int(): prepares the
 * buffers of @page for a write of @len bytes at @pos, passing NULL as the
 * iomap so that blocks are mapped through @get_block.  Returns whatever
 * __block_write_begin_int() returns.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        return __block_write_begin_int(page, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);

/*
 * Commit a write to the page range [from, to): every buffer overlapping
 * the range is marked uptodate and dirty, and the "new" flag is cleared
 * on all buffers of the page.  If no buffer outside the range was found
 * non-uptodate, the whole page is marked uptodate.  Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
{
        struct buffer_head *first = page_buffers(page);
        struct buffer_head *cur = first;
        unsigned blocksize = first->b_size;
        unsigned blk_off = 0;
        int all_uptodate = 1;

        do {
                unsigned blk_end = blk_off + blocksize;

                if (blk_end > from && blk_off < to) {
                        /* Buffer was (at least partly) written. */
                        set_buffer_uptodate(cur);
                        mark_buffer_dirty(cur);
                } else if (!buffer_uptodate(cur)) {
                        /* Untouched buffer that still lacks valid data. */
                        all_uptodate = 0;
                }
                clear_buffer_new(cur);

                blk_off = blk_end;
                cur = cur->b_this_page;
        } while (cur != first);

        /*
         * A partial write may have completed the last missing piece of
         * the page.  If every buffer is now uptodate, mark the page
         * uptodate so the next read() does not need a readpage().
         */
        if (all_uptodate)
                SetPageUptodate(page);
        return 0;
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                unsigned flags, struct page **pagep, get_block_t *get_block)
{
        struct page *page;
        int status;

        /* Find or create the locked page-cache page covering @pos. */
        page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, flags);
        if (!page)
                return -ENOMEM;

        status = __block_write_begin(page, pos, len, get_block);
        if (unlikely(status)) {
                /* Preparation failed: release the page, hand back none. */
                unlock_page(page);
                put_page(page);
                *pagep = NULL;
                return status;
        }

        /* Success: the caller receives the page as obtained above. */
        *pagep = page;
        return 0;
}
EXPORT_SYMBOL(block_write_begin);

/*
 * Finish a write of @copied (out of @len requested) bytes at @pos into
 * @page: handle short copies, flush the dcache for the page and commit
 * the written range to the page's buffers.  Returns the number of bytes
 * actually committed, which may be smaller than @copied (see below).
 */
int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        /* Offset of the write within the page */
        unsigned offset = pos & (PAGE_SIZE - 1);

        if (unlikely(copied < len)) {
                /*
                 * Short copy.  Fully written buffers become uptodate, but
                 * a buffer that was only partially filled will not, and a
                 * later readpage could then overwrite the partial data.
                 * Keep it simple: on a page that is not uptodate, treat
                 * the short write as zero-length so that the caller
                 * retries the whole thing.
                 */
                if (!PageUptodate(page))
                        copied = 0;

                /* Zero the new buffers in the part that was not copied. */
                page_zero_new_buffers(page, offset + copied, offset + len);
        }
        flush_dcache_page(page);

        /* The committed range may be short, or even empty. */
        __block_commit_write(mapping->host, page, offset, offset + copied);

        return copied;
}
EXPORT_SYMBOL(block_write_end);

/*
 * Complete a buffered write: commit the data through block_write_end(),
 * extend i_size if the write went past it, then release the page.
 * Returns the number of bytes committed.
 */
int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        loff_t size_before = inode->i_size;
        loff_t write_end;
        bool grew = false;

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
        write_end = pos + copied;

        /*
         * i_size is read directly rather than via i_size_read(): it cannot
         * change under us because we hold i_rwsem.
         *
         * The update has to happen before the page lock is dropped,
         * otherwise page writeout could come in and zero beyond i_size.
         */
        if (write_end > inode->i_size) {
                i_size_write(inode, write_end);
                grew = true;
        }

        unlock_page(page);
        put_page(page);

        if (size_before < pos)
                pagecache_isize_extended(inode, size_before, pos);

        /*
         * The inode is dirtied only after the page lock is gone: doing it
         * earlier would lengthen the page lock hold time for no reason and
         * would force a page lock -> transaction start lock ordering on
         * journaling filesystems.
         */
        if (grew)
                mark_inode_dirty(inode);
        return copied;
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate checks whether buffers within a page are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to a file portion
 * we want to read are uptodate.
 */
int block_is_partially_uptodate(struct page *page, unsigned long from,
                                        unsigned long count)
{
        struct buffer_head *first, *cur;
        unsigned blocksize, blk_off, blk_end;
        unsigned to;

        /* Without buffers there is no per-block state to consult. */
        if (!page_has_buffers(page))
                return 0;

        first = page_buffers(page);
        blocksize = first->b_size;

        /* Clamp the requested range to the end of the page. */
        to = from + min_t(unsigned, PAGE_SIZE - from, count);

        /*
         * A range that starts in the first block and reaches into the
         * last block is answered with 0 without looking at the buffers.
         */
        if (from < blocksize && to > PAGE_SIZE - blocksize)
                return 0;

        cur = first;
        blk_off = 0;
        do {
                blk_end = blk_off + blocksize;
                if (blk_end > from && blk_off < to) {
                        /* One stale buffer in the range decides it. */
                        if (!buffer_uptodate(cur))
                                return 0;
                        /* Reached the last buffer of the range. */
                        if (blk_end >= to)
                                return 1;
                }
                blk_off = blk_end;
                cur = cur->b_this_page;
        } while (cur != first);

        return 1;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
        struct inode *inode = page->mapping->host;
        sector_t iblock, lblock;
        /* arr[] collects the buffers that need to be read from disk */
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        unsigned int blocksize, bbits;
        int nr, i;
        int fully_mapped = 1;

        head = create_page_buffers(page, inode, 0);
        blocksize = head->b_size;
        bbits = block_size_bits(blocksize);

        /*
         * iblock: file block number of the first buffer in the page.
         * lblock: i_size rounded up to whole blocks, i.e. the first block
         * number that lies completely beyond EOF.
         */
        iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
        lblock = (i_size_read(inode)+blocksize-1) >> bbits;
        bh = head;
        nr = 0;
        i = 0;

        /*
         * Stage one: decide for each buffer whether it needs I/O.  Note
         * that "continue" in a do-while jumps to the loop condition, so
         * i, iblock and bh are still advanced for skipped buffers.
         */
        do {
                if (buffer_uptodate(bh))
                        continue;

                if (!buffer_mapped(bh)) {
                        int err = 0;

                        fully_mapped = 0;
                        /* Only blocks before lblock are looked up. */
                        if (iblock < lblock) {
                                WARN_ON(bh->b_size != blocksize);
                                err = get_block(inode, iblock, bh, 0);
                                if (err)
                                        SetPageError(page);
                        }
                        if (!buffer_mapped(bh)) {
                                /*
                                 * Still unmapped: fill the block with
                                 * zeroes.  It only counts as uptodate if
                                 * get_block() did not fail.
                                 */
                                zero_user(page, i * blocksize, blocksize);
                                if (!err)
                                        set_buffer_uptodate(bh);
                                continue;
                        }
                        /*
                         * get_block() might have updated the buffer
                         * synchronously
                         */
                        if (buffer_uptodate(bh))
                                continue;
                }
                arr[nr++] = bh;
        } while (i++, iblock++, (bh = bh->b_this_page) != head);

        /* Every buffer was already mapped when the loop saw it. */
        if (fully_mapped)
                SetPageMappedToDisk(page);

        if (!nr) {
                /*
                 * All buffers are uptodate - we can set the page uptodate
                 * as well. But not if get_block() returned an error.
                 */
                if (!PageError(page))
                        SetPageUptodate(page);
                unlock_page(page);
                return 0;
        }

        /* Stage two: lock the buffers */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                lock_buffer(bh);
                mark_buffer_async_read(bh);
        }

        /*
         * Stage 3: start the IO.  Check for uptodateness
         * inside the buffer lock in case another process reading
         * the underlying blockdev brought it uptodate (the sct fix).
         */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                if (buffer_uptodate(bh))
                        end_buffer_async_read(bh, 1);
                else
                        submit_bh(REQ_OP_READ, 0, bh);
        }
        /*
         * 0 is returned on every path of this function; a get_block()
         * failure is only recorded on the page via SetPageError().
         */
        return 0;
}
EXPORT_SYMBOL(block_read_full_page);

/* utility function for filesystems that need to do work on expanding
 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
 * deal with the hole.  
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        void *fsdata = NULL;
        int err;

        /* Refuse sizes the inode is not allowed to grow to. */
        err = inode_newsize_ok(inode, size);
        if (err)
                return err;

        /* A zero-length pagecache write at @size. */
        err = pagecache_write_begin(NULL, mapping, size, 0,
                                    AOP_FLAG_CONT_EXPAND, &page, &fsdata);
        if (err)
                return err;

        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
        /* Nothing was copied, so a positive byte count would be a bug. */
        BUG_ON(err > 0);

        return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);

/*
 * Zero-fill the file from *@bytes up to @pos through pagecache writes.
 *
 * Whole pages below the page containing @pos are zeroed from the current
 * *@bytes position to the end of the page; in the page containing @pos
 * the range from *@bytes up to @pos is zeroed.  Whenever the position
 * being zeroed from is not block aligned, *@bytes is first rounded up to
 * the next block boundary.
 *
 * Returns 0 on success, the error from pagecache_write_begin() or
 * pagecache_write_end(), or -EINTR if a fatal signal is pending.
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
                            loff_t pos, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        unsigned int blocksize = i_blocksize(inode);
        const pgoff_t index = pos >> PAGE_SHIFT;
        const unsigned offset = pos & ~PAGE_MASK;
        struct page *page;
        void *fsdata = NULL;
        pgoff_t curidx;
        loff_t curpos;
        unsigned zerofrom, len;
        int err;

        /* Zero out every page that lies wholly before the target page. */
        for (;;) {
                curpos = *bytes;
                curidx = curpos >> PAGE_SHIFT;
                if (curidx >= index)
                        break;

                zerofrom = curpos & ~PAGE_MASK;
                if (zerofrom & (blocksize-1)) {
                        /* Round *bytes up to the next block boundary. */
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                len = PAGE_SIZE - zerofrom;

                err = pagecache_write_begin(file, mapping, curpos, len, 0,
                                            &page, &fsdata);
                if (err)
                        return err;
                zero_user(page, zerofrom, len);
                err = pagecache_write_end(file, mapping, curpos, len, len,
                                                page, fsdata);
                if (err < 0)
                        return err;
                BUG_ON(err != len);

                balance_dirty_pages_ratelimited(mapping);

                if (fatal_signal_pending(current))
                        return -EINTR;
        }

        /* Only a page that covers the boundary needs more work. */
        if (curidx != index)
                return 0;

        zerofrom = curpos & ~PAGE_MASK;
        /* if we will expand the thing last block will be filled */
        if (offset <= zerofrom)
                return 0;

        if (zerofrom & (blocksize-1)) {
                *bytes |= (blocksize-1);
                (*bytes)++;
        }
        len = offset - zerofrom;

        err = pagecache_write_begin(file, mapping, curpos, len, 0,
                                    &page, &fsdata);
        if (err)
                return err;
        zero_user(page, zerofrom, len);
        err = pagecache_write_end(file, mapping, curpos, len, len,
                                        page, fsdata);
        if (err < 0)
                return err;
        BUG_ON(err != len);

        return 0;
}

/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        unsigned int blockmask = i_blocksize(inode) - 1;
        unsigned int tail;
        int err;

        /* First zero-fill everything between *bytes and @pos. */
        err = cont_expand_zero(file, mapping, pos, bytes);
        if (err)
                return err;

        /*
         * If this write ends beyond *bytes and *bytes is not block
         * aligned, round *bytes up to the next block boundary.
         */
        tail = *bytes & ~PAGE_MASK;
        if (pos+len > *bytes && (tail & blockmask)) {
                *bytes |= blockmask;
                (*bytes)++;
        }

        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);

/*
 * Exported wrapper around __block_commit_write() that looks up the inode
 * from the page's mapping.  Always returns 0.
 */
int block_commit_write(struct page *page, unsigned from, unsigned to)
{
        __block_commit_write(page->mapping->host, page, from, to);
        return 0;
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_rwsem here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                         get_block_t get_block)
{
        struct page *page = vmf->page;
        struct inode *inode = file_inode(vma->vm_file);
        unsigned long end;
        loff_t size;
        int ret;

        /*
         * The page lock is taken first; i_size is only sampled once the
         * lock is held.
         */
        lock_page(page);
        size = i_size_read(inode);
        if ((page->mapping != inode->i_mapping) ||
            (page_offset(page) > size)) {
                /* We overload EFAULT to mean page got truncated */
                ret = -EFAULT;
                goto out_unlock;
        }

        /*
         * page is wholly or partially inside EOF.
         *
         * The end of the page is computed in loff_t (page_offset() plus
         * PAGE_SIZE).  Shifting page->index directly would be done in
         * unsigned long, which wraps on 32-bit for pages at or above
         * 4GiB and makes a page straddling EOF look wholly inside it.
         */
        if (page_offset(page) + PAGE_SIZE > size)
                end = size & ~PAGE_MASK;
        else
                end = PAGE_SIZE;

        /* Set up and commit the buffers for the in-EOF part of the page. */
        ret = __block_write_begin(page, 0, end, get_block);
        if (!ret)
                ret = block_commit_write(page, 0, end);

        if (unlikely(ret < 0))
                goto out_unlock;
        set_page_dirty(page);
        wait_for_stable_page(page);
        /* Success: the page is returned to the caller still locked. */
        return 0;
out_unlock:
        unlock_page(page);
        return ret;
}
EXPORT_SYMBOL(block_page_mkwrite);

/*
 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
}

/*
 * Attach the singly-linked list of buffers created by nobh_write_begin, to
 * the page (converting it to circular linked list and taking care of page
 * dirty races).
 */
static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
{
        struct buffer_head *cur = head;

        /* Caller must hold the page lock. */
        BUG_ON(!PageLocked(page));

        spin_lock(&page->mapping->private_lock);
        for (;;) {
                /* Buffers of a dirty page are marked dirty as well. */
                if (PageDirty(page))
                        set_buffer_dirty(cur);
                /* A NULL link is the list tail: close the ring at head. */
                if (!cur->b_this_page)
                        cur->b_this_page = head;
                cur = cur->b_this_page;
                if (cur == head)
                        break;
        }
        attach_page_private(page, head);
        spin_unlock(&page->mapping->private_lock);
}

/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 */
int nobh_write_begin(struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
{
        struct inode *inode = mapping->host;
        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocksize = 1 << blkbits;
        struct buffer_head *head, *bh;
        struct page *page;
        pgoff_t index;
        /* [from, to) is the write range as offsets within the page */
        unsigned from, to;
        unsigned block_in_page;
        unsigned block_start, block_end;
        sector_t block_in_file;
        int nr_reads = 0;
        int ret = 0;
        int is_mapped_to_disk = 1;

        index = pos >> PAGE_SHIFT;
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
        *fsdata = NULL;

        /*
         * The page already carries buffers: use the regular buffer-based
         * path.  *fsdata stays NULL in that case.
         */
        if (page_has_buffers(page)) {
                ret = __block_write_begin(page, pos, len, get_block);
                if (unlikely(ret))
                        goto out_release;
                return ret;
        }

        /* Page is flagged mapped-to-disk: nothing is prepared here. */
        if (PageMappedToDisk(page))
                return 0;

        /*
         * Allocate buffers so that we can keep track of state, and potentially
         * attach them to the page if an error occurs. In the common case of
         * no error, they will just be freed again without ever being attached
         * to the page (which is all OK, because we're under the page lock).
         *
         * Be careful: the buffer linked list is a NULL terminated one, rather
         * than the circular one we're used to.
         */
        head = alloc_page_buffers(page, blocksize, false);
        if (!head) {
                ret = -ENOMEM;
                goto out_release;
        }

        /* File block number of the first block in this page */
        block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);

        /*
         * We loop across all blocks in the page, whether or not they are
         * part of the affected region.  This is so we can discover if the
         * page is fully mapped-to-disk.
         */
        for (block_start = 0, block_in_page = 0, bh = head;
                  block_start < PAGE_SIZE;
                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
                int create;

                block_end = block_start + blocksize;
                bh->b_state = 0;
                /*
                 * Blocks starting at or after the end of the write are
                 * looked up with create == 0, all others with create == 1.
                 */
                create = 1;
                if (block_start >= to)
                        create = 0;
                ret = get_block(inode, block_in_file + block_in_page,
                                        bh, create);
                if (ret)
                        goto failed;
                if (!buffer_mapped(bh))
                        is_mapped_to_disk = 0;
                if (buffer_new(bh))
                        clean_bdev_bh_alias(bh);
                if (PageUptodate(page)) {
                        set_buffer_uptodate(bh);
                        continue;
                }
                /*
                 * New or unmapped block: zero the parts of it that lie
                 * outside the write range.
                 */
                if (buffer_new(bh) || !buffer_mapped(bh)) {
                        zero_user_segments(page, block_start, from,
                                                        to, block_end);
                        continue;
                }
                if (buffer_uptodate(bh))
                        continue;        /* reiserfs does this */
                /* Mapped block that is only partially overwritten: read it */
                if (block_start < from || block_end > to) {
                        lock_buffer(bh);
                        bh->b_end_io = end_buffer_read_nobh;
                        submit_bh(REQ_OP_READ, 0, bh);
                        nr_reads++;
                }
        }

        if (nr_reads) {
                /*
                 * The page is locked, so these buffers are protected from
                 * any VM or truncate activity.  Hence we don't need to care
                 * for the buffer_head refcounts.
                 */
                for (bh = head; bh; bh = bh->b_this_page) {
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                ret = -EIO;
                }
                if (ret)
                        goto failed;
        }

        if (is_mapped_to_disk)
                SetPageMappedToDisk(page);

        *fsdata = head; /* to be released by nobh_write_end */

        return 0;

failed:
        BUG_ON(!ret);
        /*
         * Error recovery is a bit difficult. We need to zero out blocks that
         * were newly allocated, and dirty them to ensure they get written out.
         * Buffers need to be attached to the page at this point, otherwise
         * the handling of potential IO errors during writeout would be hard
         * (could try doing synchronous writeout, but what if that fails too?)
         */
        attach_nobh_buffers(page, head);
        page_zero_new_buffers(page, from, to);

out_release:
        /* Failure: drop the page lock and reference, hand back no page. */
        unlock_page(page);
        put_page(page);
        *pagep = NULL;

        return ret;
}
EXPORT_SYMBOL(nobh_write_begin);

/*
 * Counterpart of nobh_write_begin().  @fsdata is the NULL-terminated list
 * of buffer_heads handed out by nobh_write_begin(), or NULL.
 *
 * After a short copy the buffers are attached to the page.  Whenever the
 * page has buffers, the write is finished by generic_write_end().
 * Otherwise the page is marked uptodate and dirty, i_size is extended if
 * needed, the page is released and the detached buffer_heads are freed.
 * Returns the number of bytes committed.
 */
int nobh_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = page->mapping->host;
        struct buffer_head *next = fsdata;

        /* Detached buffers and attached buffers are mutually exclusive. */
        BUG_ON(fsdata != NULL && page_has_buffers(page));

        if (unlikely(copied < len) && next)
                attach_nobh_buffers(page, next);
        if (page_has_buffers(page))
                return generic_write_end(file, mapping, pos, len,
                                        copied, page, fsdata);

        SetPageUptodate(page);
        set_page_dirty(page);
        if (pos+copied > inode->i_size) {
                i_size_write(inode, pos+copied);
                mark_inode_dirty(inode);
        }

        unlock_page(page);
        put_page(page);

        /* Free the buffer_heads that were never attached to the page. */
        while (next) {
                struct buffer_head *victim = next;

                next = victim->b_this_page;
                free_buffer_head(victim);
        }

        return copied;
}
EXPORT_SYMBOL(nobh_write_end);

/*
 * nobh_writepage() - based on block_write_full_page() except
 * that it tries to operate without attaching bufferheads to
 * the page.
 */
int nobh_writepage(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        struct inode * const inode = page->mapping->host;
        const loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = i_size >> PAGE_SHIFT;
        int ret;

        /* Pages fully inside i_size go straight to writeout. */
        if (page->index >= end_index) {
                const unsigned offset = i_size & (PAGE_SIZE-1);

                /* Fully outside i_size (truncate in progress): ignore. */
                if (page->index >= end_index+1 || !offset) {
                        unlock_page(page);
                        return 0; /* don't care */
                }

                /*
                 * The page straddles i_size.  Because it may be mmapped,
                 * the part beyond i_size has to be zeroed on every
                 * writepage call: a file is mapped in multiples of the
                 * page size, the remainder of the last page is zeroed
                 * when mapped, and writes to that region are not written
                 * out to the file.
                 */
                zero_user_segment(page, offset, PAGE_SIZE);
        }

        /* Try mpage_writepage() first; on -EAGAIN use the buffer path. */
        ret = mpage_writepage(page, get_block, wbc);
        if (ret == -EAGAIN)
                ret = __block_write_full_page(inode, page, get_block, wbc,
                                              end_buffer_async_write);
        return ret;
}
EXPORT_SYMBOL(nobh_writepage);

/*
 * nobh_truncate_page() - zero the tail of the block containing @from
 * without attaching buffer_heads to the page.
 * @mapping:   address_space whose page cache is being adjusted
 * @from:      file offset from which the rest of the block is zeroed
 * @get_block: filesystem callback used to look up the block mapping
 *
 * Zeroes the page cache from @from up to the end of the filesystem block
 * containing it and marks the page dirty.  If the page already has
 * buffers, or gains them while being read, the work is handed over to
 * block_truncate_page() and its result is returned.
 *
 * Returns 0 on success, when @from is block aligned, or when the block
 * is a hole; -ENOMEM if the page cannot be grabbed; -EIO if the page is
 * still not uptodate after ->readpage(); otherwise the error returned by
 * @get_block or ->readpage().
 */
int nobh_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
{
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize;
        sector_t iblock;
        unsigned length, pos;
        struct inode *inode = mapping->host;
        struct page *page;
        /* On-stack buffer_head used only to receive the get_block() result. */
        struct buffer_head map_bh;
        int err;

        blocksize = i_blocksize(inode);
        /* Offset of @from within its block. */
        length = offset & (blocksize - 1);

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;

        /* Number of bytes from @from to the end of its block. */
        length = blocksize - length;
        /* File block number of the first block in this page. */
        iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);

        page = grab_cache_page(mapping, index);
        err = -ENOMEM;
        if (!page)
                goto out;

        if (page_has_buffers(page)) {
                /* Also entered by goto from below once ->readpage() attached buffers. */
has_buffers:
                unlock_page(page);
                put_page(page);
                return block_truncate_page(mapping, from, get_block);
        }

        /* Find the buffer that contains "offset" */
        pos = blocksize;
        while (offset >= pos) {
                iblock++;
                pos += blocksize;
        }

        map_bh.b_size = blocksize;
        map_bh.b_state = 0;
        /* Last argument 0: pass 0 as the create flag. */
        err = get_block(inode, iblock, &map_bh, 0);
        if (err)
                goto unlock;
        /* unmapped? It's a hole - nothing to do */
        if (!buffer_mapped(&map_bh))
                goto unlock;

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (!PageUptodate(page)) {
                err = mapping->a_ops->readpage(NULL, page);
                if (err) {
                        /*
                         * NOTE(review): the reference is dropped without
                         * unlock_page(); presumably ->readpage() returns
                         * with the page already unlocked - confirm.
                         */
                        put_page(page);
                        goto out;
                }
                /* Re-take the page lock before inspecting or modifying the page. */
                lock_page(page);
                if (!PageUptodate(page)) {
                        err = -EIO;
                        goto unlock;
                }
                if (page_has_buffers(page))
                        goto has_buffers;
        }
        zero_user(page, offset, length);
        set_page_dirty(page);
        err = 0;

unlock:
        unlock_page(page);
        put_page(page);
out:
        return err;
}
EXPORT_SYMBOL(nobh_truncate_page);

/*
 * block_truncate_page() - zero the tail of the block containing @from,
 * using the page's buffer_heads.
 * @mapping:   address_space whose page cache is being adjusted
 * @from:      file offset from which the rest of the block is zeroed
 * @get_block: filesystem callback used to map the buffer if necessary
 *
 * Zeroes the page cache from @from up to the end of the filesystem block
 * containing it and marks that block's buffer dirty.  Buffers are
 * attached to the page if it has none.
 *
 * Returns 0 on success, when @from is block aligned, or when the block
 * is a hole; -ENOMEM if the page cannot be grabbed; -EIO if the buffer
 * is not uptodate after the read; otherwise the error returned by
 * @get_block.
 */
int block_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
{
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize;
        sector_t iblock;
        unsigned length, pos;
        struct inode *inode = mapping->host;
        struct page *page;
        struct buffer_head *bh;
        int err;

        blocksize = i_blocksize(inode);
        /* Offset of @from within its block. */
        length = offset & (blocksize - 1);

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;

        /* Number of bytes from @from to the end of its block. */
        length = blocksize - length;
        /* File block number of the first block in this page. */
        iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
        
        page = grab_cache_page(mapping, index);
        err = -ENOMEM;
        if (!page)
                goto out;

        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);

        /* Find the buffer that contains "offset" */
        bh = page_buffers(page);
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }

        err = 0;
        if (!buffer_mapped(bh)) {
                WARN_ON(bh->b_size != blocksize);
                /* Last argument 0: pass 0 as the create flag. */
                err = get_block(inode, iblock, bh, 0);
                if (err)
                        goto unlock;
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh))
                        goto unlock;
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (PageUptodate(page))
                set_buffer_uptodate(bh);

        /* Delayed and unwritten buffers are not read in before zeroing. */
        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                /* Assume failure; cleared again below once the read succeeded. */
                err = -EIO;
                ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                wait_on_buffer(bh);
                /* Uhhuh. Read error. Complain and punt. */
                if (!buffer_uptodate(bh))
                        goto unlock;
        }

        zero_user(page, offset, length);
        mark_buffer_dirty(bh);
        err = 0;

unlock:
        unlock_page(page);
        put_page(page);
out:
        return err;
}
EXPORT_SYMBOL(block_truncate_page);

/*
 * The generic ->writepage function for buffer-backed address_spaces.
 *
 * Pages entirely inside i_size go straight to __block_write_full_page().
 * A page entirely beyond i_size is unlocked and 0 is returned.  A page
 * that straddles i_size has its tail zeroed before being written.
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        struct inode * const inode = page->mapping->host;
        const loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = i_size >> PAGE_SHIFT;

        /* Only the last page (or anything past it) needs special care. */
        if (page->index >= end_index) {
                const unsigned offset = i_size & (PAGE_SIZE-1);

                /* Fully outside i_size (truncate in progress): don't care. */
                if (page->index >= end_index+1 || !offset) {
                        unlock_page(page);
                        return 0;
                }

                /*
                 * The page straddles i_size.  It may be mmapped, so the
                 * region past EOF is cleared on each and every writepage
                 * invocation: writes made there through a mapping are not
                 * written out to the file.
                 */
                zero_user_segment(page, offset, PAGE_SIZE);
        }

        return __block_write_full_page(inode, page, get_block, wbc,
                                       end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);

/*
 * generic_block_bmap() - look up the block number backing file block @block.
 *
 * A zero-initialised on-stack buffer_head sized to one filesystem block
 * is passed to @get_block, and whatever b_blocknr holds afterwards is
 * returned.  The return value of @get_block itself is not examined.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
                            get_block_t *get_block)
{
        struct inode *host = mapping->host;
        struct buffer_head map = {
                .b_size = i_blocksize(host),
        };

        (void)get_block(host, block, &map, 0);

        return map.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);

/*
 * bio completion handler installed by submit_bh_wbc().
 *
 * Copies the BIO_QUIET flag onto the buffer as BH_Quiet, calls the
 * buffer's own ->b_end_io with the success state of the bio, and then
 * drops the bio reference.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
        struct buffer_head *bh = bio->bi_private;
        int uptodate;

        if (unlikely(bio_flagged(bio, BIO_QUIET)))
                set_bit(BH_Quiet, &bh->b_state);

        /* A zero bi_status means the I/O succeeded. */
        uptodate = !bio->bi_status;
        bh->b_end_io(bh, uptodate);

        bio_put(bio);
}

/*
 * submit_bh_wbc() - build a one-segment bio for @bh and submit it.
 * @op:         operation handed to bio_set_op_attrs()
 * @op_flags:   request flags; REQ_META / REQ_PRIO are added from bh state
 * @bh:         buffer to transfer; must be locked, mapped, have ->b_end_io
 *              set, and be neither delayed nor unwritten (BUG otherwise)
 * @write_hint: value stored in bio->bi_write_hint
 * @wbc:        optional writeback control; when non-NULL the bio is
 *              initialised from it and the I/O is accounted to its owner
 *
 * Completion arrives through end_bio_bh_io_sync(), which invokes
 * bh->b_end_io.  Always returns 0.
 */
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         enum rw_hint write_hint, struct writeback_control *wbc)
{
        struct bio *bio;

        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
        BUG_ON(buffer_delay(bh));
        BUG_ON(buffer_unwritten(bh));

        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);

        /*
         * NOTE(review): the result is used without a NULL check;
         * presumably bio_alloc() cannot fail with GFP_NOIO - confirm.
         */
        bio = bio_alloc(GFP_NOIO, 1);

        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);

        /* Convert the block number into 512-byte sector units. */
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_write_hint = write_hint;

        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
        /* The whole buffer must have been added to the bio. */
        BUG_ON(bio->bi_iter.bi_size != bh->b_size);

        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;

        if (buffer_meta(bh))
                op_flags |= REQ_META;
        if (buffer_prio(bh))
                op_flags |= REQ_PRIO;
        bio_set_op_attrs(bio, op, op_flags);

        /* Take care of bh's that straddle the end of the device */
        guard_bio_eod(bio);

        if (wbc) {
                wbc_init_bio(wbc, bio);
                wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
        }

        submit_bio(bio);
        return 0;
}

/*
 * submit_bh() - submit @bh for I/O with no write hint and no
 * writeback_control.
 *
 * Thin wrapper around submit_bh_wbc(); see that function for the
 * requirements on @bh.  Returns the value of submit_bh_wbc().
 */
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
        return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);

/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @op: the operation to perform, a read or a write
 * @op_flags: req_flag_bits
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() walks an array of pointers to &struct buffer_heads and
 * requests an I/O operation on them, either a %REQ_OP_READ or a
 * %REQ_OP_WRITE.  @op_flags contains flags modifying the detailed I/O
 * behavior, most notably %REQ_RAHEAD.
 *
 * A buffer is skipped when its lock (the BH_Lock state bit) cannot be
 * taken without waiting, when it appears clean for a write request, or
 * when it appears up-to-date for a read request.  Buffers processed for
 * writing are marked clean (the buffer cache won't assume that they are
 * actually clean until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
{
        int idx;

        for (idx = 0; idx < nr; idx++) {
                struct buffer_head *bh = bhs[idx];
                int need_io;

                if (!trylock_buffer(bh))
                        continue;

                /* Decide whether I/O is needed and pick the completion. */
                if (op == WRITE) {
                        need_io = test_clear_buffer_dirty(bh);
                        if (need_io)
                                bh->b_end_io = end_buffer_write_sync;
                } else {
                        need_io = !buffer_uptodate(bh);
                        if (need_io)
                                bh->b_end_io = end_buffer_read_sync;
                }

                if (!need_io) {
                        unlock_buffer(bh);
                        continue;
                }

                /* Extra reference taken for the duration of the I/O. */
                get_bh(bh);
                submit_bh(op, op_flags, bh);
        }
}
EXPORT_SYMBOL(ll_rw_block);

/*
 * write_dirty_buffer() - start a write of @bh if it is dirty.
 *
 * Locks the buffer and clears its dirty bit.  If the buffer was dirty
 * it is submitted as a REQ_OP_WRITE with @op_flags and
 * end_buffer_write_sync as the completion; the function does not wait
 * for the I/O.  A clean buffer is simply unlocked again.
 */
void write_dirty_buffer(struct buffer_head *bh, int op_flags)
{
        lock_buffer(bh);

        if (test_clear_buffer_dirty(bh)) {
                bh->b_end_io = end_buffer_write_sync;
                get_bh(bh);
                submit_bh(REQ_OP_WRITE, op_flags, bh);
        } else {
                unlock_buffer(bh);
        }
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 *
 * Returns 0 if the buffer was clean or was written successfully, and
 * -EIO if the dirty buffer is unmapped or is not uptodate after the
 * write; a non-zero result from submit_bh() is passed through.
 */
int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
{
        int ret;

        WARN_ON(atomic_read(&bh->b_count) < 1);
        lock_buffer(bh);

        /* Nothing to write back. */
        if (!test_clear_buffer_dirty(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        /*
         * The bh should be mapped, but it might not be if the
         * device was hot-removed. Not much we can do but fail the I/O.
         */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
        wait_on_buffer(bh);
        if (!ret && !buffer_uptodate(bh))
                ret = -EIO;

        return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

/*
 * sync_dirty_buffer() - write @bh out with REQ_SYNC and wait for it.
 *
 * Wrapper around __sync_dirty_buffer(); returns its result.
 */
int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
/*
 * A buffer is busy when it has a non-zero reference count or has the
 * BH_Dirty or BH_Lock state bit set.  Returns non-zero in that case.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
        const unsigned long busy_bits = (1 << BH_Dirty) | (1 << BH_Lock);

        return atomic_read(&bh->b_count) | (bh->b_state & busy_bits);
}

/*
 * drop_buffers() - detach all buffers from @page if none of them is busy.
 *
 * Returns 0 without changing anything as soon as one busy buffer is
 * found.  Otherwise every buffer is taken off its association queue,
 * the ring head is stored in *@buffers_to_free, the page private data
 * is detached, and 1 is returned.  Freeing the buffers is left to the
 * caller.
 */
static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;

        /* First pass: refuse if any buffer in the ring is still in use. */
        do {
                if (buffer_busy(bh))
                        return 0;
                bh = bh->b_this_page;
        } while (bh != head);

        /* Second pass (bh is back at head): unhook associated buffers. */
        do {
                struct buffer_head *following = bh->b_this_page;

                if (bh->b_assoc_map)
                        __remove_assoc_queue(bh);
                bh = following;
        } while (bh != head);

        *buffers_to_free = head;
        detach_page_private(page);
        return 1;
}

/*
 * try_to_free_buffers() - release the buffers of @page if all are unused.
 *
 * The page must be locked (BUG otherwise).  Returns 1 when the buffers
 * were detached and freed, 0 when the page is under writeback or at
 * least one buffer is busy.
 */
int try_to_free_buffers(struct page *page)
{
        struct address_space * const mapping = page->mapping;
        struct buffer_head *buffers_to_free = NULL;
        int ret = 0;

        BUG_ON(!PageLocked(page));
        if (PageWriteback(page))
                return 0;

        /* No mapping means no private_lock to take and no dirty state to fix up. */
        if (mapping == NULL) {                /* can this still happen? */
                ret = drop_buffers(page, &buffers_to_free);
                goto out;
        }

        spin_lock(&mapping->private_lock);
        ret = drop_buffers(page, &buffers_to_free);

        /*
         * If the filesystem writes its buffers by hand (eg ext3)
         * then we can have clean buffers against a dirty page.  We
         * clean the page here; otherwise the VM will never notice
         * that the filesystem did any IO at all.
         *
         * Also, during truncate, discard_buffer will have marked all
         * the page's buffers clean.  We discover that here and clean
         * the page also.
         *
         * private_lock must be held over this entire operation in order
         * to synchronise against __set_page_dirty_buffers and prevent the
         * dirty bit from being lost.
         */
        if (ret)
                cancel_dirty_page(page);
        spin_unlock(&mapping->private_lock);
out:
        /* Free the detached ring only after private_lock has been dropped. */
        if (buffers_to_free) {
                struct buffer_head *bh = buffers_to_free;

                do {
                        /* Fetch the successor before bh is freed. */
                        struct buffer_head *next = bh->b_this_page;
                        free_buffer_head(bh);
                        bh = next;
                } while (bh != buffers_to_free);
        }
        return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
 *
 * Returns -EPERM without CAP_SYS_ADMIN.  With @func == 1 the calling
 * process exits with status 0; otherwise 0 is returned.  @data is unused.
 */
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
        /* Number of deprecation warnings printed so far (capped at 5). */
        static int msg_count;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (msg_count < 5) {
                msg_count++;
                printk(KERN_INFO
                        "warning: process `%s' used the obsolete bdflush"
                        " system call\n", current->comm);
                printk(KERN_INFO "Fix your initscripts?\n");
        }

        /* func 1: terminate the caller (the obsolete flush daemon). */
        if (func == 1)
                do_exit(0);
        return 0;
}

/*
 * Buffer-head allocation
 */
/* Slab cache from which every buffer_head is allocated; set up in buffer_init(). */
static struct kmem_cache *bh_cachep __read_mostly;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads;

/* Non-zero when the summed per-CPU count exceeds max_buffer_heads; see recalc_bh_state(). */
int buffer_heads_over_limit;

/* Per-CPU bookkeeping for live buffer_heads. */
struct bh_accounting {
        int nr;                        /* Number of live bh's */
        int ratelimit;                /* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
        int i;
        int tot = 0;

        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
}

/*
 * alloc_buffer_head() - allocate a zeroed buffer_head from bh_cachep.
 * @gfp_flags: allocation flags passed to the slab allocator
 *
 * On success the association list and uptodate lock are initialised
 * and this CPU's live-bh count is incremented.  Returns the new
 * buffer_head, or NULL if the allocation failed.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
        struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

        if (!bh)
                return NULL;

        INIT_LIST_HEAD(&bh->b_assoc_buffers);
        spin_lock_init(&bh->b_uptodate_lock);

        /* The per-CPU accounting must not migrate mid-update. */
        preempt_disable();
        __this_cpu_inc(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();

        return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);

/*
 * free_buffer_head() - return @bh to bh_cachep and update accounting.
 *
 * The buffer must no longer be on an association list (BUG otherwise).
 * Decrements this CPU's live-bh count, the counterpart of the increment
 * in alloc_buffer_head().
 */
void free_buffer_head(struct buffer_head *bh)
{
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
        /* Keep the __this_cpu_*() accounting on one CPU. */
        preempt_disable();
        __this_cpu_dec(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

/*
 * CPU hotplug callback registered by buffer_init() for a dead CPU.
 *
 * Releases every buffer cached in @cpu's bh LRU and folds that CPU's
 * live-bh count into the count of the CPU running the callback.
 * Always returns 0.
 */
static int buffer_exit_cpu_dead(unsigned int cpu)
{
        struct bh_lru *lru = &per_cpu(bh_lrus, cpu);
        int slot;

        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                brelse(lru->bhs[slot]);
                lru->bhs[slot] = NULL;
        }

        /* Move the dead CPU's accounting over so the total stays right. */
        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;

        return 0;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return: 1 if the buffer is up-to-date (the buffer is left unlocked),
 * 0 if it is not, in which case the buffer is returned locked.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
        /* Fast path: no locking needed for an uptodate buffer. */
        if (buffer_uptodate(bh))
                return 1;

        lock_buffer(bh);
        /* Re-check under the lock; on failure the lock is kept. */
        if (!buffer_uptodate(bh))
                return 0;

        unlock_buffer(bh);
        return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * The buffer must be locked on entry (BUG otherwise).  An already
 * up-to-date buffer is simply unlocked; otherwise a read is submitted
 * and waited for.
 *
 * Return: zero on success and -EIO on error.
 */
int bh_submit_read(struct buffer_head *bh)
{
        BUG_ON(!buffer_locked(bh));

        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, 0, bh);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(bh_submit_read);

/*
 * buffer_init() - boot-time setup of the buffer_head allocator.
 *
 * Creates the "buffer_head" slab cache, derives max_buffer_heads from
 * the number of free buffer pages, and registers buffer_exit_cpu_dead()
 * as the CPUHP_FS_BUFF_DEAD hotplug callback.
 */
void __init buffer_init(void)
{
        unsigned long nrpages;
        int ret;

        /* SLAB_PANIC is passed, so the result is not NULL-checked here. */
        bh_cachep = kmem_cache_create("buffer_head",
                        sizeof(struct buffer_head), 0,
                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                SLAB_MEM_SPREAD),
                                NULL);

        /*
         * Limit the bh occupancy to 10% of ZONE_NORMAL
         */
        nrpages = (nr_free_buffer_pages() * 10) / 100;
        /* Number of buffer_heads that fit into that many pages. */
        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
        /* No startup callback (NULL); only the teardown side is needed. */
        ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
                                        NULL, buffer_exit_cpu_dead);
        WARN_ON(ret < 0);
}






































































    1 

































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        /* mm_count reference; dropped again by the decrement in mmdrop(). */
        atomic_inc(&mm->mm_count);
}

extern void __mmdrop(struct mm_struct *mm);

/**
 * mmdrop() - Release a reference acquired by mmgrab().
 * @mm: The &struct mm_struct to unpin.
 *
 * Decrements &mm_struct.mm_count and calls __mmdrop() once the count
 * reaches zero.
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * The implicit full barrier implied by atomic_dec_and_test() is
         * required by the membarrier system call before returning to
         * user-space, after storing to rq->curr.
         */
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                __mmdrop(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        /* Unconditional increment; mmget_not_zero() is the variant that refuses a zero count. */
        atomic_inc(&mm->mm_users);
}

/**
 * mmget_not_zero() - Pin the address space unless it is already gone.
 * @mm: The address space to pin.
 *
 * Like mmget(), but the &mm_struct.mm_users reference is only taken
 * when the count is not already zero.
 *
 * Return: the result of atomic_inc_not_zero(): true when a reference
 * was taken, false when mm_users was zero.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/*
 * mmput() releases a reference acquired by mmget(); it gets rid of the
 * mappings and all user-space.
 */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/*
 * Same as mmput(), but performs the slow path from an async context.
 * Can be called from atomic context as well.
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away. */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access() with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current task's stale references to the old mm_struct on exit(). */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current task's stale references to the old mm_struct on exec(). */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_MEMCG
/* Implemented out of line when memory cgroups are configured. */
extern void mm_update_next_owner(struct mm_struct *mm);
#else
/* Without CONFIG_MEMCG this is a no-op. */
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
/*
 * Defaults for architectures that do not provide their own versions:
 * the mmap end is TASK_SIZE regardless of @addr, and the mmap base is
 * returned unchanged.
 */
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

/* Architecture hooks for address-space layout; defined out of line. */
extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
                       unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);
#else
/* Without CONFIG_MMU picking a layout is a no-op. */
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

/**
 * in_vfork() - Test whether @tsk has a pending vfork and shares its
 * parent's mm.
 * @tsk: The task to examine.
 *
 * Return: true when @tsk->vfork_done is set and @tsk->real_parent uses
 * the same mm as @tsk; false otherwise.  As explained in the body, the
 * answer can be a false negative.
 */
static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        /* ->real_parent is only dereferenced when vfork_done is set. */
        ret = tsk->vfork_done &&
                        rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies the per-task gfp context of the current task to the given
 * allocation flags and returns the adjusted flags.
 * PF_MEMALLOC_NOIO implies GFP_NOIO
 * PF_MEMALLOC_NOFS implies GFP_NOFS
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        unsigned int pflags = READ_ONCE(current->flags);

        /* Common case: no allocation scope is active. */
        if (likely(!(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))))
                return flags;

        /*
         * NOIO takes precedence: it is the more restrictive scope and
         * strips both __GFP_IO and __GFP_FS.
         */
        if (pflags & PF_MEMALLOC_NOIO)
                return flags & ~(__GFP_IO | __GFP_FS);

        /* Only PF_MEMALLOC_NOFS can be left at this point. */
        return flags & ~__GFP_FS;
}

/*
 * fs_reclaim annotations.  With CONFIG_LOCKDEP they are implemented out
 * of line; without it every one of them is an empty inline stub.
 */
#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(void);
extern void __fs_reclaim_release(void);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(void) { }
static inline void __fs_reclaim_release(void) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Similar to might_sleep() and other annotations, this can be used in functions
 * that might allocate, but often don't. The fs_reclaim annotations compile to
 * nothing without CONFIG_LOCKDEP. Includes a conditional might_sleep() if
 * @gfp_mask allows blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        /* Acquire and immediately release the fs_reclaim annotation. */
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope.
 * All further allocations will implicitly drop the __GFP_IO flag and so
 * they are safe for the IO critical section from the allocation recursion
 * point of view. Use memalloc_noio_restore() to end the scope, passing
 * the flags returned by this function.
 *
 * This function is safe to be used from any context.
 *
 * Return: the previous state of PF_MEMALLOC_NOIO in current->flags.
 */
static inline unsigned int memalloc_noio_save(void)
{
        unsigned int prev_noio = current->flags & PF_MEMALLOC_NOIO;

        current->flags |= PF_MEMALLOC_NOIO;

        return prev_noio;
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save().
 * Always make sure that @flags is the return value of the pairing
 * memalloc_noio_save() call.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        /* Clear the scope bit, then put back whatever the caller saved. */
        unsigned int cleared = current->flags & ~PF_MEMALLOC_NOIO;

        current->flags = cleared | flags;
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope.
 * All further allocations will implicitly drop the __GFP_FS flag and so
 * they are safe for the FS critical section from the allocation recursion
 * point of view. Use memalloc_nofs_restore() to end the scope, passing
 * the flags returned by this function.
 *
 * This function is safe to be used from any context.
 *
 * Return: the previous state of PF_MEMALLOC_NOFS in current->flags.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        unsigned int prev_nofs = current->flags & PF_MEMALLOC_NOFS;

        current->flags |= PF_MEMALLOC_NOFS;

        return prev_nofs;
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save().
 * Always make sure that @flags is the return value of the pairing
 * memalloc_nofs_save() call.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        /* Clear the scope bit, then put back whatever the caller saved. */
        unsigned int cleared = current->flags & ~PF_MEMALLOC_NOFS;

        current->flags = cleared | flags;
}

/*
 * Set PF_MEMALLOC on the current task and return its previous state,
 * to be handed to memalloc_noreclaim_restore() later.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        unsigned int prev_memalloc = current->flags & PF_MEMALLOC;

        current->flags |= PF_MEMALLOC;

        return prev_memalloc;
}

/*
 * Put PF_MEMALLOC on the current task back to the state in @flags,
 * which should be the return value of memalloc_noreclaim_save().
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        unsigned int cleared = current->flags & ~PF_MEMALLOC;

        current->flags = cleared | flags;
}

#ifdef CONFIG_CMA
/*
 * Set PF_MEMALLOC_NOCMA on the current task and return its previous
 * state, to be handed to memalloc_nocma_restore() later.
 */
static inline unsigned int memalloc_nocma_save(void)
{
        unsigned int prev_nocma = current->flags & PF_MEMALLOC_NOCMA;

        current->flags |= PF_MEMALLOC_NOCMA;

        return prev_nocma;
}

/*
 * Put PF_MEMALLOC_NOCMA back to the state in @flags, which should be
 * the return value of memalloc_nocma_save().
 */
static inline void memalloc_nocma_restore(unsigned int flags)
{
        unsigned int cleared = current->flags & ~PF_MEMALLOC_NOCMA;

        current->flags = cleared | flags;
}
#else
/* Without CONFIG_CMA there is no state to save: always reports 0. */
static inline unsigned int memalloc_nocma_save(void)
{
        return 0;
}

/* Without CONFIG_CMA restoring is a no-op; @flags is ignored. */
static inline void memalloc_nocma_restore(unsigned int flags)
{
}
#endif

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * This function marks the beginning of the remote memcg charging scope. All the
 * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
 * given memcg.
 *
 * In interrupt context the active memcg is kept in the per-CPU
 * int_active_memcg variable, otherwise in current->active_memcg.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 *
 * Return: the memcg that was active before the call.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        if (!in_interrupt()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
                return prev;
        }

        prev = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);
        return prev;
}
#else
/* Without CONFIG_MEMCG nothing is recorded; always returns NULL. */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Bit flags kept in mm->membarrier_state (tested with atomic_read() in
 * membarrier_mm_sync_core_before_usermode()).  Each command has a state bit
 * and a matching *_READY bit, one bit position apart.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY           = 1U << 0,
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                 = 1U << 1,
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY            = 1U << 2,
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                  = 1U << 3,
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = 1U << 4,
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE       = 1U << 5,
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY      = 1U << 6,
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ            = 1U << 7,
};

/* Membarrier flag bits; NOTE(review): consumers are not visible in this header. */
enum {
        MEMBARRIER_FLAG_SYNC_CORE = 1U << 0,
        MEMBARRIER_FLAG_RSEQ      = 1U << 1,
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Call sync_core_before_usermode() when @mm is the current task's mm and it
 * has MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE set in its
 * membarrier_state; otherwise do nothing.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        int state;

        if (current->mm != mm)
                return;

        state = atomic_read(&mm->membarrier_state);
        /* The SYNC_CORE bit is expected to be clear in the common case. */
        if (unlikely(state & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

#else
/* !CONFIG_MEMBARRIER: every membarrier hook below is an empty stub. */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk) { }
#endif /* CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS */

static inline void membarrier_exec_mmap(struct mm_struct *mm) { }

static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { }
#endif

#endif /* _LINUX_SCHED_MM_H */






























































































































































    2 















    2 

    2 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0
/*
  File: fs/ext4/xattr.h

  On-disk format of extended attributes for the ext4 filesystem.

  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/

#include <linux/xattr.h>

/* Magic number identifying an attribute block */
#define EXT4_XATTR_MAGIC                        0xEA020000

/* Upper bound on the number of references to a single attribute block */
#define EXT4_XATTR_REFCOUNT_MAX                 1024

/* Name indexes (stored in ext4_xattr_entry.e_name_index) */
#define EXT4_XATTR_INDEX_USER                   1
#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS       2
#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT      3
#define EXT4_XATTR_INDEX_TRUSTED                4
#define EXT4_XATTR_INDEX_LUSTRE                 5
#define EXT4_XATTR_INDEX_SECURITY               6
#define EXT4_XATTR_INDEX_SYSTEM                 7
#define EXT4_XATTR_INDEX_RICHACL                8
#define EXT4_XATTR_INDEX_ENCRYPTION             9
#define EXT4_XATTR_INDEX_HURD                   10 /* Reserved for Hurd */

/*
 * On-disk header of an extended attribute block.  BHDR() overlays this
 * structure on the start of a buffer's b_data, and BFIRST() treats the bytes
 * immediately after it as the first entry.  The __le32 fields are
 * little-endian on disk.
 */
struct ext4_xattr_header {
        __le32        h_magic;        /* magic number for identification */
        __le32        h_refcount;        /* reference count */
        __le32        h_blocks;        /* number of disk blocks used */
        __le32        h_hash;                /* hash value of all attributes */
        __le32        h_checksum;        /* crc32c(uuid+id+xattrblock) */
                                /* id = inum if refcount=1, blknum otherwise */
        __u32        h_reserved[3];        /* zero right now */
};

/*
 * Header of the in-inode xattr area.  IHDR() locates it at
 * EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize bytes into the raw inode, and
 * IFIRST() treats the bytes immediately after it as the first entry.
 */
struct ext4_xattr_ibody_header {
        __le32        h_magic;        /* magic number for identification */
};

/*
 * On-disk descriptor of one extended attribute.  The name follows the fixed
 * fields in the flexible array e_name[]; EXT4_XATTR_NEXT() steps to the next
 * entry by EXT4_XATTR_LEN(e_name_len), i.e. this structure plus the name,
 * rounded up to EXT4_XATTR_PAD.  IS_LAST_ENTRY() treats an entry whose first
 * four bytes are zero as the end of the list.
 */
struct ext4_xattr_entry {
        __u8        e_name_len;        /* length of name */
        __u8        e_name_index;        /* attribute name index */
        __le16        e_value_offs;        /* offset in disk block of value */
        __le32        e_value_inum;        /* inode in which the value is stored */
        __le32        e_value_size;        /* size of attribute value */
        __le32        e_hash;                /* hash value of name and value */
        char        e_name[];        /* attribute name */
};

/* Entries and values are padded to a multiple of EXT4_XATTR_PAD bytes. */
#define EXT4_XATTR_PAD_BITS        2
#define EXT4_XATTR_PAD                (1 << EXT4_XATTR_PAD_BITS)
#define EXT4_XATTR_ROUND        (EXT4_XATTR_PAD - 1)

/* Size of an entry whose name is @name_len bytes long, padding included. */
#define EXT4_XATTR_LEN(name_len)                                        \
        (((name_len) + EXT4_XATTR_ROUND +                                \
          sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)

/* Entry that immediately follows @entry. */
#define EXT4_XATTR_NEXT(entry)                                                \
        ((struct ext4_xattr_entry *)(                                        \
         (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))

/* @size rounded up to the next multiple of EXT4_XATTR_PAD. */
#define EXT4_XATTR_SIZE(size)                                                \
        (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

/*
 * IHDR - locate the in-inode xattr header.
 * @inode:     inode whose EXT4_I(inode)->i_extra_isize gives the size of the
 *             extra fields that follow the classic inode.
 * @raw_inode: pointer to the start of the raw on-disk inode.
 *
 * Evaluates to a struct ext4_xattr_ibody_header pointer placed
 * EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize bytes after @raw_inode.
 * @raw_inode is parenthesised before the cast so that an expression argument
 * (e.g. "p + 1") is evaluated as a whole before byte offsets are added,
 * matching what ITAIL() does.
 */
#define IHDR(inode, raw_inode) \
        ((struct ext4_xattr_ibody_header *) \
                ((void *)(raw_inode) + \
                EXT4_GOOD_OLD_INODE_SIZE + \
                EXT4_I(inode)->i_extra_isize))
/* First byte past the raw inode: @raw_inode plus the superblock's s_inode_size. */
#define ITAIL(inode, raw_inode)                                                \
        ((void *)(raw_inode) + EXT4_SB((inode)->i_sb)->s_inode_size)

/* First xattr entry, located directly after the header @hdr. */
#define IFIRST(hdr)        ((struct ext4_xattr_entry *)((hdr) + 1))

/*
 * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking
 * for file system consistency errors, we use a somewhat bigger value
 * (1 << 24 bytes).  This allows XATTR_SIZE_MAX to grow in the future,
 * but by using this instead of INT_MAX for certain consistency checks,
 * we don't need to worry about arithmetic overflows.  (Actually
 * XATTR_SIZE_MAX is defined in include/uapi/linux/limits.h, so changing
 * it is not going to be trivial....)
 */
#define EXT4_XATTR_SIZE_MAX (1 << 24)

/*
 * The minimum size of an EA value at which it starts being stored in an
 * external inode, for a block of @b bytes:
 *
 *   size of block - size of 1 entry - size of header - 4 null bytes
 *
 * The "1 entry" term is EXT4_XATTR_LEN(3), i.e. an entry with a 3-byte name.
 */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)                                        \
        ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)

/* Xattr block header overlaid on the start of buffer head @bh's data. */
#define BHDR(bh)        ((struct ext4_xattr_header *)((bh)->b_data))
/* Reinterpret an arbitrary pointer as an xattr entry. */
#define ENTRY(ptr)        ((struct ext4_xattr_entry *)(ptr))
/* First entry of an xattr block: directly after the block header. */
#define BFIRST(bh)        ENTRY(BHDR(bh) + 1)
/* True when the first four bytes of @entry are zero (end of the list). */
#define IS_LAST_ENTRY(entry)        (*(__u32 *)(entry) == 0)

/* Sentinel value pointer; NOTE(review): users are not visible in this header. */
#define EXT4_ZERO_XATTR_VALUE        ((void *)-1)

/*
 * If we want to add an xattr to the inode, we should make sure that
 * i_extra_isize is not 0 and that the inode size is not less than
 * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + header + pad.
 *   EXT4_GOOD_OLD_INODE_SIZE   extra_isize header   entry   pad  data
 * |--------------------------|------------|------|---------|---|-------|
 *
 * Evaluates to non-zero only when both conditions hold; the size check
 * counts sizeof(struct ext4_xattr_ibody_header) plus EXT4_XATTR_PAD on top
 * of the classic inode and its extra fields.
 */
#define EXT4_INODE_HAS_XATTR_SPACE(inode)                                \
        ((EXT4_I(inode)->i_extra_isize != 0) &&                                \
         (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize +        \
          sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <=        \
          EXT4_INODE_SIZE((inode)->i_sb)))

/*
 * Description of one attribute as passed to the xattr helpers declared in
 * this header (e.g. ext4_xattr_ibody_find() and ext4_xattr_ibody_set()).
 */
struct ext4_xattr_info {
        const char *name;        /* attribute name */
        const void *value;        /* attribute value */
        size_t value_len;        /* presumably the length of @value in bytes -- confirm */
        int name_index;                /* presumably one of EXT4_XATTR_INDEX_* -- confirm */
        int in_inode;                /* NOTE(review): looks like a "store value in its own inode" flag -- confirm */
};

/*
 * State of a search through a list of xattr entries; embedded in
 * struct ext4_xattr_ibody_find.
 * NOTE(review): the field meanings below are inferred from the names only --
 * confirm against the search code, which is not visible in this header.
 */
struct ext4_xattr_search {
        struct ext4_xattr_entry *first;        /* presumably the first entry of the list */
        void *base;                        /* presumably the base that value offsets are relative to */
        void *end;                        /* presumably the end of the searched area */
        struct ext4_xattr_entry *here;        /* presumably the entry the search stopped at */
        int not_found;                        /* presumably the search result status */
};

/*
 * Search context handed to ext4_xattr_ibody_find() and
 * ext4_xattr_ibody_set(): the generic search state plus an inode location.
 */
struct ext4_xattr_ibody_find {
        struct ext4_xattr_search s;
        struct ext4_iloc iloc;
};

/*
 * Counted array of inode pointers.  It is filled in through the @array
 * argument of ext4_xattr_delete_inode() and released with
 * ext4_xattr_inode_array_free().
 */
struct ext4_xattr_inode_array {
        unsigned int count;                /* # of used items in the array */
        struct inode *inodes[];
};

/* Per-namespace xattr handler objects; they are defined outside this header. */
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern const struct xattr_handler ext4_xattr_hurd_handler;

/* Name of the encryption context attribute. */
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"

/*
 * The EXT4_STATE_NO_EXPAND flag is overloaded and used for two purposes.
 * The first is to signal that the inline xattrs and data are taking up
 * so much space that we might as well not keep trying to expand the
 * inode.  The second is that xattr_sem is taken for writing, so we
 * shouldn't try to recurse into the inode expansion.  For this second
 * case, we need to make sure that we save and restore the NO_EXPAND
 * state flag appropriately.
 *
 * ext4_write_lock_xattr() takes xattr_sem for writing, stores the previous
 * NO_EXPAND state in *@save and then sets the flag.
 */
static inline void ext4_write_lock_xattr(struct inode *inode, int *save)
{
        int was_no_expand;

        down_write(&EXT4_I(inode)->xattr_sem);

        was_no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        *save = was_no_expand;
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
}

/*
 * Non-blocking variant of ext4_write_lock_xattr().
 *
 * Returns 0 without touching *@save when xattr_sem could not be taken.
 * Otherwise stores the previous NO_EXPAND state in *@save, sets the flag and
 * returns 1.
 */
static inline int ext4_write_trylock_xattr(struct inode *inode, int *save)
{
        int was_no_expand;

        if (!down_write_trylock(&EXT4_I(inode)->xattr_sem))
                return 0;

        was_no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        *save = was_no_expand;
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
        return 1;
}

/*
 * Release xattr_sem taken by ext4_write_lock_xattr() or a successful
 * ext4_write_trylock_xattr().  EXT4_STATE_NO_EXPAND is cleared first, but
 * only if *@save says it was clear when the lock was taken.
 */
static inline void ext4_write_unlock_xattr(struct inode *inode, int *save)
{
        const int was_no_expand = *save;

        if (!was_no_expand)
                ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);

        up_write(&EXT4_I(inode)->xattr_sem);
}

/*
 * Prototypes of the xattr entry points.  The implementations are not part of
 * this header.
 */
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);

/* Get/set interfaces and journal credit estimation. */
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
                                  bool is_create, int *credits);
extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
                                struct buffer_head *block_bh, size_t value_len,
                                bool is_create);

/* Inode deletion support; @array is released with ext4_xattr_inode_array_free(). */
extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                                   struct ext4_xattr_inode_array **array,
                                   int extra_credits);
extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);

extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);

extern const struct xattr_handler *ext4_xattr_handlers[];

/* Operations on the xattrs stored in the inode body. */
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
                                 struct ext4_xattr_ibody_find *is);
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
                                const char *name,
                                void *buffer, size_t buffer_size);
extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
                                struct ext4_xattr_info *i,
                                struct ext4_xattr_ibody_find *is);

extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);

extern int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
                    void *end, const char *function, unsigned int line);

/* Wrapper that passes the caller's function name and line number along. */
#define xattr_check_inode(inode, header, end) \
        __xattr_check_inode((inode), (header), (end), __func__, __LINE__)

#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
                              struct inode *dir, const struct qstr *qstr);
#else /* !CONFIG_EXT4_FS_SECURITY */
/* Without CONFIG_EXT4_FS_SECURITY this does nothing and reports success (0). */
static inline int
ext4_init_security(handle_t *handle, struct inode *inode,
                   struct inode *dir, const struct qstr *qstr)
{
        return 0;
}
#endif /* CONFIG_EXT4_FS_SECURITY */

#ifdef CONFIG_LOCKDEP
extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
#else /* !CONFIG_LOCKDEP */
/* Without CONFIG_LOCKDEP this is an empty stub. */
static inline void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
}
#endif /* CONFIG_LOCKDEP */

/* Reports a usage figure for @inode through *@usage; defined outside this header. */
extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);



































































































    2 




























































































































































    2 




































































































































































    1 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H

#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/* Max future timer expiry for timeouts (five times HZ; HZ is defined elsewhere) */
#define BLK_MAX_TIMEOUT                (5 * HZ)

/* Block layer debugfs root dentry; defined outside this header. */
extern struct dentry *blk_debugfs_root;

/*
 * Flush state; blk_get_flush_queue() fetches it from the ->fq member of the
 * hardware queue that blk_mq_map_queue() selects for REQ_OP_FLUSH.
 * NOTE(review): the field roles below are inferred from the names only --
 * confirm against the flush implementation.
 */
struct blk_flush_queue {
        /* One-bit indices; presumably they select an element of flush_queue[2]. */
        unsigned int                flush_pending_idx:1;
        unsigned int                flush_running_idx:1;
        blk_status_t                 rq_status;
        unsigned long                flush_pending_since;
        struct list_head        flush_queue[2];
        struct list_head        flush_data_in_flight;
        struct request                *flush_rq;

        struct lock_class_key        key;
        spinlock_t                mq_flush_lock;
};

/* Request-queue globals; all three are defined outside this header. */
extern struct kmem_cache *blk_requestq_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;

/*
 * Return the flush queue (->fq) of the hardware queue that
 * blk_mq_map_queue() selects for a REQ_OP_FLUSH operation on @q and @ctx.
 */
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
        return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}

/* Take a reference on @q by calling kobject_get() on its embedded kobject. */
static inline void __blk_get_queue(struct request_queue *q)
{
        kobject_get(&q->kobj);
}

/* Flush queue helpers and queue freezing; implemented outside this header. */
bool is_flush_rq(struct request *req);

struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);

void blk_freeze_queue(struct request_queue *q);

/*
 * Decide whether @vec1 and @vec2 may be merged into one physical segment.
 *
 * They are mergeable only when @vec2 starts at the physical address right
 * after the end of @vec1, the Xen check (when running as a Xen domain) does
 * not object, and the start of @vec1 and the last byte of @vec2 are equal
 * once the bits covered by queue_segment_boundary(@q) are ignored.
 */
static inline bool biovec_phys_mergeable(struct request_queue *q,
                struct bio_vec *vec1, struct bio_vec *vec2)
{
        const unsigned long boundary = queue_segment_boundary(q);
        const phys_addr_t first = page_to_phys(vec1->bv_page) + vec1->bv_offset;
        const phys_addr_t second = page_to_phys(vec2->bv_page) + vec2->bv_offset;

        /* @vec2 has to begin exactly where @vec1 ends. */
        if (first + vec1->bv_len != second)
                return false;

        if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
                return false;

        /* Both ends must fall inside the same segment boundary window. */
        return (first | boundary) == ((second + vec2->bv_len - 1) | boundary);
}

/*
 * True when either @offset or the end of @bprv (bv_offset + bv_len) has bits
 * set within queue_virt_boundary(@q).
 */
static inline bool __bvec_gap_to_prev(struct request_queue *q,
                struct bio_vec *bprv, unsigned int offset)
{
        if (offset & queue_virt_boundary(q))
                return true;

        return ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)) != 0;
}

/*
 * Check if adding a bio_vec after bprv with offset would create a gap in
 * the SG list. Most drivers don't care about this, but some do: when the
 * queue has no virt boundary set, the answer is always "no gap".
 */
static inline bool bvec_gap_to_prev(struct request_queue *q,
                struct bio_vec *bprv, unsigned int offset)
{
        return queue_virt_boundary(q) && __bvec_gap_to_prev(q, bprv, offset);
}

/*
 * Attach @bio to @rq as its only bio: record the segment count, take the
 * data length and I/O priority from the bio, and take over the bio's disk
 * when it has one.
 */
static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
                unsigned int nr_segs)
{
        rq->nr_phys_segments = nr_segs;
        rq->__data_len = bio->bi_iter.bi_size;

        /* A single bio is both the head and the tail of the request. */
        rq->biotail = bio;
        rq->bio = bio;

        rq->ioprio = bio_prio(bio);

        /* rq_disk is left untouched when the bio carries no disk. */
        if (bio->bi_disk)
                rq->rq_disk = bio->bi_disk;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);

/*
 * Returns true right away for a bio without an integrity payload; otherwise
 * the result comes from __bio_integrity_endio().
 */
static inline bool bio_integrity_endio(struct bio *bio)
{
        return bio_integrity(bio) ? __bio_integrity_endio(bio) : true;
}

/* Integrity merge checks; implemented outside this header. */
bool blk_integrity_merge_rq(struct request_queue *, struct request *,
                struct request *);
bool blk_integrity_merge_bio(struct request_queue *, struct request *,
                struct bio *);

/*
 * Back merge: would the first integrity vector of @next leave a gap after
 * the last integrity vector of @req's bio?
 */
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        struct bio_integrity_payload *tail_bip = bio_integrity(req->bio);
        struct bio_integrity_payload *head_bip = bio_integrity(next);
        struct bio_vec *last_vec = &tail_bip->bip_vec[tail_bip->bip_vcnt - 1];

        return bvec_gap_to_prev(req->q, last_vec,
                                head_bip->bip_vec[0].bv_offset);
}

/*
 * Front merge: would the first integrity vector of @req's bio leave a gap
 * after the last integrity vector of @bio?
 */
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        struct bio_integrity_payload *tail_bip = bio_integrity(bio);
        struct bio_integrity_payload *head_bip = bio_integrity(req->bio);
        struct bio_vec *last_vec = &tail_bip->bip_vec[tail_bip->bip_vcnt - 1];

        return bvec_gap_to_prev(req->q, last_vec,
                                head_bip->bip_vec[0].bv_offset);
}

/* Integrity add/remove hooks for a gendisk; implemented outside this header. */
void blk_integrity_add(struct gendisk *);
void blk_integrity_del(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
/*
 * !CONFIG_BLK_DEV_INTEGRITY stubs: merges are always allowed, no integrity
 * gaps are ever reported, endio always says "continue", and the remaining
 * hooks do nothing.
 */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
                struct request *r1, struct request *r2) { return true; }
static inline bool blk_integrity_merge_bio(struct request_queue *rq,
                struct request *r, struct bio *b) { return true; }
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next) { return false; }
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio) { return false; }

static inline void blk_flush_integrity(void) { }
static inline bool bio_integrity_endio(struct bio *bio) { return true; }
static inline void bio_integrity_free(struct bio *bio) { }
static inline void blk_integrity_add(struct gendisk *disk) { }
static inline void blk_integrity_del(struct gendisk *disk) { }
#endif /* CONFIG_BLK_DEV_INTEGRITY */

/* Request timeout handling; implemented outside this header. */
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);

/* Bio merge attempts against the plug list or an arbitrary request list. */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **same_queue_rq);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);

/* I/O accounting hooks for the start and completion of a request. */
void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);

/*
 * Plug flush limits
 */
#define BLK_MAX_REQUEST_COUNT        32
#define BLK_PLUG_FLUSH_SIZE        (128 * 1024)

/*
 * Internal elevator interface
 */
/* Non-zero when RQF_HASHED is set in @rq's rq_flags. */
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)

void blk_insert_flush(struct request *rq);

void elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
                              struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);

/*
 * elevator_exit - tear down elevator @e of queue @q.
 *
 * The caller must hold q->sysfs_lock (checked through lockdep).  The
 * scheduler's requests are freed with blk_mq_sched_free_requests() before
 * __elevator_exit() is called.
 */
static inline void elevator_exit(struct request_queue *q,
                struct elevator_queue *e)
{
        lockdep_assert_held(&q->sysfs_lock);

        blk_mq_sched_free_requests(q);
        __elevator_exit(q, e);
}

/* Partition lookup by number; implemented outside this header. */
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);

/* Partition device attribute show/store handlers. */
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
                const char *buf, size_t count);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);

/* Bio splitting and request/bio merge helpers. */
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                                struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);

int blk_dev_init(void);

/*
 * A request contributes to IO statistics IFF:
 *
 *        a) it's attached to a gendisk, and
 *        b) the queue had IO stats enabled when this request was started
 *           (recorded as RQF_IO_STAT in rq->rq_flags)
 */
static inline bool blk_do_io_stat(struct request *rq)
{
        if (!rq->rq_disk)
                return false;

        return (rq->rq_flags & RQF_IO_STAT) != 0;
}

/*
 * Flag @req as REQ_NOMERGE and, if it is @q's last_merge request, reset
 * last_merge to NULL.
 */
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
        req->cmd_flags |= REQ_NOMERGE;

        if (q->last_merge != req)
                return;
        q->last_merge = NULL;
}

/*
 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
 * is defined as 'unsigned int'; in addition it has to be aligned to the
 * logical block size, which is the minimum unit accepted by hardware.
 */
static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
{
        const unsigned int max_bytes =
                round_down(UINT_MAX, queue_logical_block_size(q));

        /* Byte count shifted right by 9 to express it in sectors. */
        return max_bytes >> 9;
}

/*
 * The max bio size which is aligned to q->limits.discard_granularity. The
 * generic block layer uses it as a hint when splitting a large discard bio,
 * so that if the device driver has to split the discard bio further, the
 * bi_size of the pieces can very likely be aligned to the
 * discard_granularity of the device's queue.
 */
static inline unsigned int bio_aligned_discard_max_sectors(
                                        struct request_queue *q)
{
        const unsigned int max_bytes =
                round_down(UINT_MAX, q->limits.discard_granularity);

        return max_bytes >> SECTOR_SHIFT;
}

/*
 * Internal io_context interface (implemented outside this header)
 */
void get_io_context(struct io_context *ioc);
/* Look up / create the io_cq associated with the (@ioc, @q) pair. */
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
                             gfp_t gfp_mask);
void ioc_clear_queue(struct request_queue *q);

int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);

/*
 * Internal throttling interface
 */
#ifdef CONFIG_BLK_DEV_THROTTLING
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
/* Throttling compiled out: init reports success, no bio is ever throttled. */
static inline int blk_throtl_init(struct request_queue *q)
{
        return 0;
}

static inline void blk_throtl_exit(struct request_queue *q)
{
}

static inline void blk_throtl_register_queue(struct request_queue *q)
{
}

static inline void blk_throtl_charge_bio_split(struct bio *bio)
{
}

static inline bool blk_throtl_bio(struct bio *bio)
{
        return false;
}
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
        const char *page, size_t count);
extern void blk_throtl_bio_endio(struct bio *bio);
extern void blk_throtl_stat_add(struct request *rq, u64 time);
#else /* !CONFIG_BLK_DEV_THROTTLING_LOW */
/* Low-limit throttling compiled out: both hooks are empty. */
static inline void blk_throtl_bio_endio(struct bio *bio)
{
}

static inline void blk_throtl_stat_add(struct request *rq, u64 time)
{
}
#endif /* CONFIG_BLK_DEV_THROTTLING_LOW */

#ifdef CONFIG_BOUNCE
extern int init_emergency_isa_pool(void);
extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
#else /* !CONFIG_BOUNCE */
/* Bouncing compiled out: pool init reports success, bios are left alone. */
static inline int init_emergency_isa_pool(void) { return 0; }
static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { }
#endif /* CONFIG_BOUNCE */

#ifdef CONFIG_BLK_CGROUP_IOLATENCY
extern int blk_iolatency_init(struct request_queue *q);
#else /* !CONFIG_BLK_CGROUP_IOLATENCY */
/* iolatency compiled out: initialisation trivially succeeds. */
static inline int blk_iolatency_init(struct request_queue *q)
{
        return 0;
}
#endif /* CONFIG_BLK_CGROUP_IOLATENCY */

/* NOTE(review): semantics not visible here; implemented outside this header. */
struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);

#ifdef CONFIG_BLK_DEV_ZONED
void blk_queue_free_zone_bitmaps(struct request_queue *q);
#else /* !CONFIG_BLK_DEV_ZONED */
/* Zoned block device support compiled out: nothing to free. */
static inline void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_DEV_ZONED */

/* Partition lookup by sector; implemented outside this header. */
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);

/* Device number allocation and release. */
int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
void blk_free_devt(dev_t devt);
void blk_invalidate_devt(dev_t devt);
char *disk_name(struct gendisk *hd, int partno, char *buf);
/* ADDPART_FLAG_* values; NOTE(review): their consumers are not visible here. */
#define ADDPART_FLAG_NONE        0
#define ADDPART_FLAG_RAID        1
#define ADDPART_FLAG_WHOLEDISK        2
/* Partition add/delete/resize operations and partition table growth. */
void delete_partition(struct hd_struct *part);
int bdev_add_partition(struct block_device *bdev, int partno,
                sector_t start, sector_t length);
int bdev_del_partition(struct block_device *bdev, int partno);
int bdev_resize_partition(struct block_device *bdev, int partno,
                sector_t start, sector_t length);
int disk_expand_part_tbl(struct gendisk *disk, int target);
int hd_ref_init(struct hd_struct *part);

/*
 * no need to get/put refcount of part0: partition number 0 always reports
 * success (1) without touching the percpu ref; any other partition returns
 * the result of percpu_ref_tryget_live().
 */
static inline int hd_struct_try_get(struct hd_struct *part)
{
        if (!part->partno)
                return 1;

        return percpu_ref_tryget_live(&part->ref);
}

/* Counterpart of hd_struct_try_get(): drops the ref, except for partition 0. */
static inline void hd_struct_put(struct hd_struct *part)
{
        if (!part->partno)
                return;

        percpu_ref_put(&part->ref);
}

/*
 * Release what hangs off @part: the per-cpu dkstats, the info allocation and
 * the percpu ref.  @part itself is not freed here.
 */
static inline void hd_free_part(struct hd_struct *part)
{
        free_percpu(part->dkstats);
        kfree(part->info);
        percpu_ref_exit(&part->ref);
}

/*
 * Any access of part->nr_sects which is not protected by partition
 * bd_mutex or gendisk bdev bd_mutex, should be done using this
 * accessor function.
 *
 * Code written along the lines of i_size_read() and i_size_write().
 * The 32-bit SMP build reads under the nr_sects_seq sequence counter and
 * retries if a writer interfered; the 32-bit CONFIG_PREEMPTION build (UP
 * kernel with preemption on) reads with preemption disabled; every other
 * build reads the field directly.
 */
static inline sector_t part_nr_sects_read(struct hd_struct *part)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        sector_t sectors;
        unsigned int seqno;

        do {
                seqno = read_seqcount_begin(&part->nr_sects_seq);
                sectors = part->nr_sects;
        } while (read_seqcount_retry(&part->nr_sects_seq, seqno));

        return sectors;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        sector_t sectors;

        preempt_disable();
        sectors = part->nr_sects;
        preempt_enable();

        return sectors;
#else
        return part->nr_sects;
#endif
}

/*
 * Should be called with mutex lock held (typically bd_mutex) of partition
 * to provide mutual exclusion among writers; otherwise the seqcount might
 * be left in a wrong state, leaving the readers spinning infinitely.
 *
 * Mirrors part_nr_sects_read(): sequence counter write section on 32-bit
 * SMP, preemption disabled on 32-bit CONFIG_PREEMPTION, plain store
 * otherwise.
 */
static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* Preemption stays off for the whole seqcount write section. */
        preempt_disable();
        write_seqcount_begin(&part->nr_sects_seq);
        part->nr_sects = size;
        write_seqcount_end(&part->nr_sects_seq);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        preempt_disable();
        part->nr_sects = size;
        preempt_enable();
#else
        part->nr_sects = size;
#endif
}

/*
 * Implemented outside this header.
 * NOTE(review): @same_page looks like an output flag -- confirm against the
 * implementation.
 */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page);

#endif /* BLK_INTERNAL_H */























































































































    1 









    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>

/* Number of extents stored inline in struct uid_gid_map (its extent[] array). */
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
/* NOTE(review): presumably the overall cap on extents per map -- confirm. */
#define UID_GID_MAP_MAX_EXTENTS 340

/*
 * One extent of an ID map: three 32-bit values.
 * NOTE(review): presumably @count consecutive IDs starting at @first map to
 * IDs starting at @lower_first -- confirm against the mapping code.
 */
struct uid_gid_extent {
        u32 first;
        u32 lower_first;
        u32 count;
};

/*
 * An ID map: an extent count plus storage for the extents.  The storage
 * is a union of an inline array of UID_GID_MAP_MAX_BASE_EXTENTS extents
 * and a pair of extent pointers (forward/reverse) that overlay it.
 * NOTE(review): presumably the pointer form is used once nr_extents
 * exceeds the inline capacity -- confirm in the map writer.
 */
struct uid_gid_map { /* 64 bytes -- 1 cache line */
        u32 nr_extents;
        union {
                struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                struct {
                        struct uid_gid_extent *forward;
                        struct uid_gid_extent *reverse;
                };
        };
};

#define USERNS_SETGROUPS_ALLOWED 1UL

#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED

struct ucounts;

/*
 * Index of each per-user counter.  The inotify and fanotify entries only
 * exist when the corresponding CONFIG option is set, so the numeric values
 * after UCOUNT_TIME_NAMESPACES depend on the configuration.
 * UCOUNT_COUNTS must stay last: it is the number of entries and sizes the
 * ucount_max[] and ucount[] arrays.
 */
enum ucount_type {
        UCOUNT_USER_NAMESPACES,
        UCOUNT_PID_NAMESPACES,
        UCOUNT_UTS_NAMESPACES,
        UCOUNT_IPC_NAMESPACES,
        UCOUNT_NET_NAMESPACES,
        UCOUNT_MNT_NAMESPACES,
        UCOUNT_CGROUP_NAMESPACES,
        UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
#endif
        UCOUNT_COUNTS,
};

/*
 * A user namespace: three ID maps (uid, gid, projid), a count that
 * get_user_ns()/put_user_ns() adjust, a link to the parent namespace and
 * optional keyring, sysctl and per-user counter state.  The layout is
 * marked __randomize_layout, so code must not rely on field order.
 */
struct user_namespace {
        struct uid_gid_map        uid_map;
        struct uid_gid_map        gid_map;
        struct uid_gid_map        projid_map;
        atomic_t                count;        /* see get_user_ns()/put_user_ns() */
        struct user_namespace        *parent;
        int                        level;
        kuid_t                        owner;
        kgid_t                        group;
        struct ns_common        ns;
        unsigned long                flags;
        /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
         * in its effective capability set at the child ns creation time. */
        bool                        parent_could_setfcap;

#ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
         * these pointers is controlled by keyring_sem.  Once
         * user_keyring_register is set, it won't be changed, so it can be
         * accessed directly with READ_ONCE().
         */
        struct list_head        keyring_name_list;
        struct key                *user_keyring_register;
        struct rw_semaphore        keyring_sem;
#endif

        /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
        struct key                *persistent_keyring_register;
#endif
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#endif
        struct ucounts                *ucounts;
        /* One slot per enum ucount_type value. */
        int ucount_max[UCOUNT_COUNTS];
} __randomize_layout;

/*
 * Per-(namespace, uid) counter record: hash-list linkage, the namespace
 * and uid it belongs to, a plain int count, and one atomic counter per
 * enum ucount_type value.
 * NOTE(review): presumably @count is the record's own reference count and
 * is protected by a lock in the implementation -- confirm in kernel/ucount.c.
 */
struct ucounts {
        struct hlist_node node;
        struct user_namespace *ns;
        kuid_t uid;
        int count;
        atomic_t ucount[UCOUNT_COUNTS];
};

extern struct user_namespace init_user_ns;

bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);

#ifdef CONFIG_USER_NS

/*
 * Increment ns->count for a non-NULL @ns.  A NULL @ns is accepted and
 * simply handed back.
 *
 * Return: the @ns pointer that was passed in.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return NULL;

        atomic_inc(&ns->count);
        return ns;
}

extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);

/*
 * Decrement ns->count; when atomic_dec_and_test() reports that it reached
 * zero, pass the namespace to __put_user_ns().  A NULL @ns is ignored.
 */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return;

        if (atomic_dec_and_test(&ns->count))
                __put_user_ns(ns);
}

struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
                       const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

/*
 * !CONFIG_USER_NS variant: @ns is not examined and no count is touched;
 * the address of init_user_ns is always returned.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        struct user_namespace *only_ns = &init_user_ns;

        return only_ns;
}

/*
 * !CONFIG_USER_NS variant: creating a user namespace is not supported.
 * @new is not used.
 *
 * Return: always -EINVAL.
 */
static inline int create_user_ns(struct cred *new)
{
        return -EINVAL;
}

/*
 * !CONFIG_USER_NS variant: a request that contains CLONE_NEWUSER is
 * rejected, anything else succeeds without doing work.  @new_cred is
 * never written.
 *
 * Return: -EINVAL if CLONE_NEWUSER is set in @unshare_flags, otherwise 0.
 */
static inline int unshare_userns(unsigned long unshare_flags,
                                 struct cred **new_cred)
{
        return (unshare_flags & CLONE_NEWUSER) ? -EINVAL : 0;
}

/* !CONFIG_USER_NS variant: nothing to release, so this is a no-op. */
static inline void put_user_ns(struct user_namespace *ns)
{
}

/*
 * !CONFIG_USER_NS variant: @ns is not consulted.
 *
 * Return: always true.
 */
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
        return true;
}

/*
 * !CONFIG_USER_NS variant: neither @ancestor nor @child is examined.
 *
 * Return: always true.
 */
static inline bool in_userns(const struct user_namespace *ancestor,
                             const struct user_namespace *child)
{
        return true;
}

/*
 * !CONFIG_USER_NS variant: @target_ns is not examined.
 *
 * Return: always true.
 */
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
        return true;
}

/*
 * !CONFIG_USER_NS variant: @ns is not examined.
 *
 * Return: always ERR_PTR(-EPERM), never a valid pointer.
 */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct ns_common *denied = ERR_PTR(-EPERM);

        return denied;
}
#endif

#endif /* _LINUX_USER_NAMESPACE_H */


























































































































































    3 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void update_process_times(int user);
extern void xtime_update(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family: read the current time in a multitude of ways,
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t based interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

/*
 * Selector passed as @offs to ktime_get_with_offset(),
 * ktime_get_coarse_with_offset() and ktime_mono_to_any().
 * TK_OFFS_MAX is the number of selectors and must stay last.
 */
enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        TK_OFFS_MAX,
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Return: the value of ktime_get_with_offset() for TK_OFFS_REAL.
 */
static inline ktime_t ktime_get_real(void)
{
        const ktime_t wall = ktime_get_with_offset(TK_OFFS_REAL);

        return wall;
}

/**
 * ktime_get_coarse_real - coarse variant of ktime_get_real()
 *
 * Return: the value of ktime_get_coarse_with_offset() for TK_OFFS_REAL.
 */
static inline ktime_t ktime_get_coarse_real(void)
{
        const ktime_t wall = ktime_get_coarse_with_offset(TK_OFFS_REAL);

        return wall;
}

/**
 * ktime_get_boottime - Returns monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONOTONIC/ktime_get, but also includes the
 * time spent in suspend.
 *
 * Return: the value of ktime_get_with_offset() for TK_OFFS_BOOT.
 */
static inline ktime_t ktime_get_boottime(void)
{
        const ktime_t since_boot = ktime_get_with_offset(TK_OFFS_BOOT);

        return since_boot;
}

/**
 * ktime_get_coarse_boottime - coarse variant of ktime_get_boottime()
 *
 * Return: the value of ktime_get_coarse_with_offset() for TK_OFFS_BOOT.
 */
static inline ktime_t ktime_get_coarse_boottime(void)
{
        const ktime_t since_boot = ktime_get_coarse_with_offset(TK_OFFS_BOOT);

        return since_boot;
}

/**
 * ktime_get_clocktai - Returns the TAI time of day in ktime_t format
 *
 * Return: the value of ktime_get_with_offset() for TK_OFFS_TAI.
 */
static inline ktime_t ktime_get_clocktai(void)
{
        const ktime_t tai = ktime_get_with_offset(TK_OFFS_TAI);

        return tai;
}

/**
 * ktime_get_coarse_clocktai - coarse variant of ktime_get_clocktai()
 *
 * Return: the value of ktime_get_coarse_with_offset() for TK_OFFS_TAI.
 */
static inline ktime_t ktime_get_coarse_clocktai(void)
{
        const ktime_t tai = ktime_get_coarse_with_offset(TK_OFFS_TAI);

        return tai;
}

static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - Convert monotonic time to clock realtime
 * @mono: monotonic timestamp to convert
 *
 * Return: the value of ktime_mono_to_any() for @mono and TK_OFFS_REAL.
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        const ktime_t real = ktime_mono_to_any(mono, TK_OFFS_REAL);

        return real;
}

static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces utilizing the ktime based ones
 * for API completeness, these could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/**
 * struct ktime_timestamps - Simultaneous mono/boot/real timestamps
 * @mono:        Monotonic timestamp
 * @boot:        Boottime timestamp
 * @real:        Realtime timestamp
 *
 * Passed as @snap to ktime_get_fast_timestamps().
 */
struct ktime_timestamps {
        u64                mono;
        u64                boot;
        u64                real;
};

/**
 * struct system_time_snapshot - simultaneous raw/real time capture with
 *                                 counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @raw:        Monotonic raw system time
 * @clock_was_set_seq:        The sequence number of clock was set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 *
 * Passed to ktime_get_snapshot() and as @history to
 * get_device_system_crosststamp().
 */
struct system_time_snapshot {
        u64                cycles;
        ktime_t                real;
        ktime_t                raw;
        unsigned int        clock_was_set_seq;
        u8                cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 *
 * Passed as @xtstamp to get_device_system_crosststamp().
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the pointer to the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs:                Clocksource corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values
 */
struct system_counterval_t {
        u64                        cycles;
        struct clocksource        *cs;
};

/*
 * Get cross timestamp between system clock and device clock
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/* NMI safe mono/boot/realtime timestamps */
extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap);

/*
 * Persistent clock related interfaces
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
extern int update_persistent_clock64(struct timespec64 now);

#endif


















































    1 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SHMEM_FS_H
#define __SHMEM_FS_H

#include <linux/file.h>
#include <linux/swap.h>
#include <linux/mempolicy.h>
#include <linux/pagemap.h>
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>

/* inode in-kernel data */

/*
 * Per-inode shmem state.  The VFS inode is embedded as @vfs_inode, which
 * is what lets SHMEM_I() get back to this structure with container_of().
 */
struct shmem_inode_info {
        spinlock_t                lock;
        unsigned int                seals;                /* shmem seals */
        unsigned long                flags;
        unsigned long                alloced;        /* data pages alloced to file */
        unsigned long                swapped;        /* subtotal assigned to swap */
        struct list_head        shrinklist;     /* shrinkable hpage inodes */
        struct list_head        swaplist;        /* chain of maybes on swap */
        struct shared_policy        policy;                /* NUMA memory alloc policy */
        struct simple_xattrs        xattrs;                /* list of xattrs */
        atomic_t                stop_eviction;        /* hold when working on inode */
        struct inode                vfs_inode;        /* embedded VFS inode, see SHMEM_I() */
};

/* Per-superblock shmem state: block/inode limits, mount options, shrinklist. */
struct shmem_sb_info {
        unsigned long max_blocks;   /* How many blocks are allowed */
        struct percpu_counter used_blocks;  /* How many are allocated */
        unsigned long max_inodes;   /* How many inodes are allowed */
        unsigned long free_inodes;  /* How many are left for allocation */
        spinlock_t stat_lock;            /* Serialize shmem_sb_info changes */
        umode_t mode;                    /* Mount mode for root directory */
        unsigned char huge;            /* Whether to try for hugepages */
        kuid_t uid;                    /* Mount uid for root directory */
        kgid_t gid;                    /* Mount gid for root directory */
        bool full_inums;            /* If i_ino should be uint or ino_t */
        ino_t next_ino;                    /* The next per-sb inode number to use */
        ino_t __percpu *ino_batch;  /* The next per-cpu inode number to use */
        struct mempolicy *mpol;     /* default memory policy for mappings */
        spinlock_t shrinklist_lock;   /* Protects shrinklist */
        struct list_head shrinklist;  /* List of shrinkable inodes */
        unsigned long shrinklist_len; /* Length of shrinklist */
};

/*
 * Map a VFS inode to the shmem_inode_info that embeds it as vfs_inode.
 * This is pure pointer arithmetic via container_of(); @inode is not
 * dereferenced or validated here.
 */
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
{
        struct shmem_inode_info *info;

        info = container_of(inode, struct shmem_inode_info, vfs_inode);
        return info;
}

/*
 * Functions in mm/shmem.c called directly from elsewhere:
 */
extern const struct fs_parameter_spec shmem_fs_parameters[];
extern int shmem_init(void);
extern int shmem_init_fs_context(struct fs_context *fc);
extern struct file *shmem_file_setup(const char *name,
                                        loff_t size, unsigned long flags);
extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
                                            unsigned long flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
                const char *name, loff_t size, unsigned long flags);
extern int shmem_zero_setup(struct vm_area_struct *);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
#ifdef CONFIG_SHMEM
extern bool shmem_mapping(struct address_space *mapping);
#else
/*
 * !CONFIG_SHMEM variant: @mapping is not examined.
 *
 * Return: always false.
 */
static inline bool shmem_mapping(struct address_space *mapping)
{
        return false;
}
#endif /* CONFIG_SHMEM */
extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(unsigned int type, bool frontswap,
                       unsigned long *fs_pages_to_unuse);

extern bool shmem_huge_enabled(struct vm_area_struct *vma);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end);

/*
 * Flag allocation requirements to shmem_getpage (passed as its @sgp
 * argument).
 */
enum sgp_type {
        SGP_READ,        /* don't exceed i_size, don't allocate page */
        SGP_CACHE,        /* don't exceed i_size, may allocate page */
        SGP_NOHUGE,        /* like SGP_CACHE, but no huge pages */
        SGP_HUGE,        /* like SGP_CACHE, huge pages preferred */
        SGP_WRITE,        /* may exceed i_size, may allocate !Uptodate page */
        SGP_FALLOC,        /* like SGP_WRITE, but make existing page Uptodate */
};

extern int shmem_getpage(struct inode *inode, pgoff_t index,
                struct page **pagep, enum sgp_type sgp);

/*
 * Convenience wrapper around shmem_read_mapping_page_gfp() that uses the
 * mapping's own gfp mask, as reported by mapping_gfp_mask().
 *
 * Return: whatever shmem_read_mapping_page_gfp() returns for @mapping and
 * @index.
 */
static inline struct page *shmem_read_mapping_page(
                                struct address_space *mapping, pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return shmem_read_mapping_page_gfp(mapping, index, gfp);
}

/*
 * Tell whether @file is backed by a shmem mapping.
 *
 * Return: false when CONFIG_SHMEM is disabled, when @file is NULL or when
 * it has no f_mapping; otherwise the result of shmem_mapping() on
 * file->f_mapping.
 */
static inline bool shmem_file(struct file *file)
{
        /* && short-circuits, so the checks run in the order written. */
        return IS_ENABLED(CONFIG_SHMEM) &&
               file && file->f_mapping &&
               shmem_mapping(file->f_mapping);
}

extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);

#ifdef CONFIG_SHMEM
extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
                                  struct vm_area_struct *dst_vma,
                                  unsigned long dst_addr,
                                  unsigned long src_addr,
                                  struct page **pagep);
extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
                                    pmd_t *dst_pmd,
                                    struct vm_area_struct *dst_vma,
                                    unsigned long dst_addr);
#else
/*
 * !CONFIG_SHMEM stub.  The arguments are not evaluated; the expansion
 * calls BUG() and then yields 0.  Parameter names follow the
 * CONFIG_SHMEM prototype (second argument is dst_pmd).
 */
#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
                               src_addr, pagep)        ({ BUG(); 0; })
/*
 * !CONFIG_SHMEM stub.  The arguments are not evaluated; the expansion
 * calls BUG() and then yields 0.
 */
#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
                                 dst_addr)      ({ BUG(); 0; })
#endif

#endif






































    4 
    4 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Type definitions for the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *
 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 */

#ifndef _SS_MLS_TYPES_H_
#define _SS_MLS_TYPES_H_

#include "security.h"
#include "ebitmap.h"

/* One MLS level: a sensitivity value plus a category bitmap. */
struct mls_level {
        u32 sens;                /* sensitivity */
        struct ebitmap cat;        /* category set */
};

/* An MLS range: a pair of levels, low first and high second. */
struct mls_range {
        struct mls_level level[2]; /* low == level[0], high == level[1] */
};

/*
 * Compare two levels for equality.
 *
 * Return: 1 when the sensitivities are equal and ebitmap_cmp() returns
 * non-zero for the two category sets, 0 otherwise.  ebitmap_cmp() is not
 * called when the sensitivities differ.
 */
static inline int mls_level_eq(struct mls_level *l1, struct mls_level *l2)
{
        if (l1->sens != l2->sens)
                return 0;

        return ebitmap_cmp(&l1->cat, &l2->cat) != 0;
}

/*
 * Test whether level @l1 dominates level @l2.
 *
 * Return: 1 when l1->sens >= l2->sens and ebitmap_contains() returns
 * non-zero for (l1's categories, l2's categories, 0), 0 otherwise.
 * ebitmap_contains() is not called when the sensitivity test fails.
 */
static inline int mls_level_dom(struct mls_level *l1, struct mls_level *l2)
{
        if (l1->sens < l2->sens)
                return 0;

        return ebitmap_contains(&l1->cat, &l2->cat, 0) != 0;
}

/* True when neither level dominates the other (arguments may be evaluated twice). */
#define mls_level_incomp(l1, l2) \
(!(mls_level_dom((l1), (l2)) || mls_level_dom((l2), (l1))))

/*
 * True when l1 dominates l2 and l3 dominates l1, i.e. l1 lies between
 * l2 (lower bound) and l3 (upper bound).  l1 may be evaluated twice.
 */
#define mls_level_between(l1, l2, l3) \
(mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1)))

/*
 * True when range r1 contains range r2: r2's low level dominates r1's low
 * level and r1's high level dominates r2's high level.  r1 and r2 are
 * struct mls_range lvalues (not pointers) and are each evaluated twice.
 */
#define mls_range_contains(r1, r2) \
(mls_level_dom(&(r2).level[0], &(r1).level[0]) && \
 mls_level_dom(&(r1).level[1], &(r2).level[1]))

#endif        /* _SS_MLS_TYPES_H_ */































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_MEMFD_H
#define __LINUX_MEMFD_H

#include <linux/file.h>

#ifdef CONFIG_MEMFD_CREATE
extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
unsigned int *memfd_file_seals_ptr(struct file *file);
#else
/*
 * !CONFIG_MEMFD_CREATE variant: no memfd command is supported, so none of
 * the arguments is examined.
 *
 * Return: always -EINVAL.
 */
static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
{
        return -EINVAL;
}

/*
 * !CONFIG_MEMFD_CREATE variant: @file is not examined.
 *
 * Return: always NULL.
 */
static inline unsigned int *memfd_file_seals_ptr(struct file *file)
{
        return NULL;
}
#endif

/*
 * Retrieve memfd seals associated with the file, if any.
 *
 * Return: the value that memfd_file_seals_ptr() points at, or 0 when that
 * helper returns NULL.
 */
static inline unsigned int memfd_file_seals(struct file *file)
{
        unsigned int *seals = memfd_file_seals_ptr(file);

        if (!seals)
                return 0;

        return *seals;
}

#endif /* __LINUX_MEMFD_H */























































































































































































































































































































































































































































































































































































































































































































































    1 
























































































































































































































    1 































































































































































































































































































































































































































































































































































    1 




































































































































    1 
    1 









    1 





    1 




































































































































































































































































    1 

































    1 






















































































































































































































































































































































    1 





































































































































































































































    1 




























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the AF_INET socket handler.
 *
 * Version:        @(#)sock.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *                Alan Cox        :        Volatiles in skbuff pointers. See
 *                                        skbuff comments. May be overdone,
 *                                        better to prove they can be removed
 *                                        than the reverse.
 *                Alan Cox        :        Added a zapped field for tcp to note
 *                                        a socket is reset and must stay shut up
 *                Alan Cox        :        New fields for options
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Eliminate low level recv/recvfrom
 *                David S. Miller        :        New socket lookup architecture.
 *              Steve Whitehouse:       Default routines for sock_ops
 *              Arnaldo C. Melo :        removed net_pinfo, tp_pinfo and made
 *                                      protinfo be just a void pointer, as the
 *                                      protocol specific parts were moved to
 *                                      respective headers and ipv4/v6, etc now
 *                                      use private slabcaches for its socks
 *              Pedro Hortas        :        New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>        /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/*
 * SOCK_DEBUG - per-socket debug logging.
 *
 * Define SOCK_DEBUGGING to get the SOCK_DBG debugging facility.  When it is
 * defined, SOCK_DEBUG() is a macro that hands @msg (format string plus
 * arguments) to printk() at KERN_DEBUG level, but only if @sk is non-NULL
 * and sock_flag() reports SOCK_DBG for it.
 *
 * Note that the macro form evaluates @sk twice, so do not pass an
 * expression with side effects.
 */
#define SOCK_DEBUGGING
#ifdef SOCK_DEBUGGING
#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
                                        printk(KERN_DEBUG msg); } while (0)
#else
/*
 * Validate arguments and do nothing: the function has an empty body, but
 * its __printf(2, 3) annotation still lets the compiler check the format
 * string against the arguments at every call site.
 */
static inline __printf(2, 3)
void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
{
}
#endif

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 *
 * Members:
 *   slock   - the spinlock mentioned above
 *   owned   - NOTE(review): presumably the "mini-semaphore" state, i.e.
 *             non-zero while a user context owns the socket; confirm
 *             against the lock_sock()/release_sock() implementation
 *   wq      - wait queue head; presumably where contending users sleep
 *             until the owner releases the socket (TODO confirm)
 *   dep_map - lockdep map, only present with CONFIG_DEBUG_LOCK_ALLOC
 */
typedef struct {
        spinlock_t                slock;
        int                        owned;
        wait_queue_head_t        wq;
        /*
         * We express the mutex-alike socket_lock semantics
         * to the lock validator by explicitly managing
         * the slock as a lock variant (in addition to
         * the slock itself):
         */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;
#endif
} socket_lock_t;

/*
 * Forward declarations, so these types can be referenced (e.g. through
 * pointers) before their full definitions are visible.
 */
struct sock;
struct proto;
struct net;

/*
 * __bitwise-annotated integer types for the combined fields of
 * struct sock_common:
 *   __portpair - type of skc_portpair (overlays skc_dport and skc_num)
 *   __addrpair - type of skc_addrpair (overlays skc_daddr and skc_rcv_saddr)
 */
typedef __u32 __bitwise __portpair;
typedef __u64 __bitwise __addrpair;

/**
 *        struct sock_common - minimal network layer representation of sockets
 *        @skc_daddr: Foreign IPv4 addr
 *        @skc_rcv_saddr: Bound local IPv4 addr
 *        @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *        @skc_hash: hash value used with various protocol lookup tables
 *        @skc_u16hashes: two u16 hash values used by UDP lookup tables
 *        @skc_dport: placeholder for inet_dport/tw_dport
 *        @skc_num: placeholder for inet_num/tw_num
 *        @skc_portpair: __u32 union of @skc_dport & @skc_num
 *        @skc_family: network address family
 *        @skc_state: Connection state
 *        @skc_reuse: %SO_REUSEADDR setting
 *        @skc_reuseport: %SO_REUSEPORT setting
 *        @skc_ipv6only: socket is IPV6 only
 *        @skc_net_refcnt: socket is using net ref counting
 *        @skc_bound_dev_if: bound device index if != 0
 *        @skc_bind_node: bind hash linkage for various protocol lookup tables
 *        @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *        @skc_prot: protocol handlers inside a network family
 *        @skc_net: reference to the network namespace of this socket
 *        @skc_v6_daddr: IPV6 destination address
 *        @skc_v6_rcv_saddr: IPV6 source address
 *        @skc_cookie: socket's cookie value
 *        @skc_node: main hash linkage for various protocol lookup tables
 *        @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *        @skc_tx_queue_mapping: tx queue number for this connection
 *        @skc_rx_queue_mapping: rx queue number for this connection
 *        @skc_flags: place holder for sk_flags
 *                %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *        @skc_listener: connection request listener socket (aka rsk_listener)
 *                [union with @skc_flags]
 *        @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *                [union with @skc_flags]
 *        @skc_incoming_cpu: record/match cpu processing incoming packets
 *        @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *                [union with @skc_incoming_cpu]
 *        @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *                [union with @skc_incoming_cpu]
 *        @skc_refcnt: reference count
 *
 *        This is the minimal network layer representation of sockets, the header
 *        for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
        /* skc_daddr and skc_rcv_saddr are grouped in an anonymous struct
         * so that both can also be accessed at once as skc_addrpair
         */
        union {
                __addrpair        skc_addrpair;
                struct {
                        __be32        skc_daddr;
                        __be32        skc_rcv_saddr;
                };
        };
        /* one unsigned int hash, or two u16 hashes sharing its storage */
        union  {
                unsigned int        skc_hash;
                __u16                skc_u16hashes[2];
        };
        /* skc_dport && skc_num must be grouped as well */
        union {
                __portpair        skc_portpair;
                struct {
                        __be16        skc_dport;
                        __u16        skc_num;
                };
        };

        unsigned short                skc_family;
        volatile unsigned char        skc_state;
        /* the bitfields below use 4 + 1 + 1 + 1 = 7 bits in total */
        unsigned char                skc_reuse:4;
        unsigned char                skc_reuseport:1;
        unsigned char                skc_ipv6only:1;
        unsigned char                skc_net_refcnt:1;
        int                        skc_bound_dev_if;
        /* the two bind-side hash linkages share storage */
        union {
                struct hlist_node        skc_bind_node;
                struct hlist_node        skc_portaddr_node;
        };
        struct proto                *skc_prot;
        possible_net_t                skc_net;

        /* IPv6 addresses are only present when IPv6 support is enabled */
#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                skc_v6_daddr;
        struct in6_addr                skc_v6_rcv_saddr;
#endif

        atomic64_t                skc_cookie;

        /* following fields are padding to force
         * offset(struct sock, sk_refcnt) == 128 on 64bit arches
         * assuming IPV6 is enabled. We use this padding differently
         * for different kind of 'sockets'
         */
        union {
                unsigned long        skc_flags;
                struct sock        *skc_listener; /* request_sock */
                struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
        };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
         */
        /* private: */
        /* zero-length array, used only to mark the start of that region */
        int                        skc_dontcopy_begin[0];
        /* public: */
        /* the two main hash linkages share storage */
        union {
                struct hlist_node        skc_node;
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short                skc_tx_queue_mapping;
#ifdef CONFIG_XPS
        unsigned short                skc_rx_queue_mapping;
#endif
        /* one 32-bit slot, used differently per kind of socket */
        union {
                int                skc_incoming_cpu;
                u32                skc_rcv_wnd;
                u32                skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
        };

        refcount_t                skc_refcnt;
        /* private: */
        /* zero-length array, used only to mark the end of that region */
        int                     skc_dontcopy_end[0];
        /* one 32-bit slot, used differently per kind of socket */
        union {
                u32                skc_rxhash;
                u32                skc_window_clamp;
                u32                skc_tw_snd_nxt; /* struct tcp_timewait_sock */
        };
        /* public: */
};

/* Forward declaration: struct sock only stores a pointer to it (sk_bpf_storage). */
struct bpf_local_storage;

/**
  *        struct sock - network layer representation of sockets
  *        @__sk_common: shared layout with inet_timewait_sock
  *        @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *        @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *        @sk_lock:        synchronizer
  *        @sk_kern_sock: True if sock is using kernel lock classes
  *        @sk_rcvbuf: size of receive buffer in bytes
  *        @sk_wq: sock wait queue and async head
  *        @sk_rx_dst: receive input route used by early demux
  *        @sk_dst_cache: destination cache
  *        @sk_dst_pending_confirm: need to confirm neighbour
  *        @sk_policy: flow policy
  *        @sk_rx_skb_cache: cache copy of recently accessed RX skb
  *        @sk_receive_queue: incoming packets
  *        @sk_wmem_alloc: transmit queue bytes committed
  *        @sk_tsq_flags: TCP Small Queues flags
  *        @sk_write_queue: Packet sending queue
  *        @sk_omem_alloc: "o" is "option" or "other"
  *        @sk_wmem_queued: persistent queue size
  *        @sk_forward_alloc: space allocated forward
  *        @sk_napi_id: id of the last napi context to receive data for sk
  *        @sk_ll_usec: usecs to busypoll when there is no data
  *        @sk_allocation: allocation mode
  *        @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *        @sk_pacing_status: Pacing status (requested, handled by sch_fq)
  *        @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *        @sk_sndbuf: size of send buffer in bytes
  *        @sk_padding: unused element for alignment
  *        @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *        @sk_no_check_rx: allow zero checksum in RX packets
  *        @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *        @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
  *        @sk_route_forced_caps: static, forced route capabilities
  *                (set in tcp_init_sock())
  *        @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *        @sk_gso_max_size: Maximum GSO segment size to build
  *        @sk_gso_max_segs: Maximum number of GSO segments
  *        @sk_pacing_shift: scaling factor for TCP Small Queues
  *        @sk_lingertime: %SO_LINGER l_linger setting
  *        @sk_backlog: always used with the per-socket spinlock held
  *        @sk_callback_lock: used with the callbacks in the end of this struct
  *        @sk_error_queue: rarely used
  *        @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *                          IPV6_ADDRFORM for instance)
  *        @sk_err: last error
  *        @sk_err_soft: errors that don't cause failure but are the cause of a
  *                      persistent failure not just 'timed out'
  *        @sk_drops: raw/udp drops counter
  *        @sk_ack_backlog: current listen backlog
  *        @sk_max_ack_backlog: listen backlog set in listen()
  *        @sk_uid: user id of owner
  *        @sk_priority: %SO_PRIORITY setting
  *        @sk_type: socket type (%SOCK_STREAM, etc)
  *        @sk_protocol: which protocol this socket belongs in this network family
  *        @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
  *        @sk_peer_pid: &struct pid for this socket's peer
  *        @sk_peer_cred: %SO_PEERCRED setting
  *        @sk_rcvlowat: %SO_RCVLOWAT setting
  *        @sk_rcvtimeo: %SO_RCVTIMEO setting
  *        @sk_sndtimeo: %SO_SNDTIMEO setting
  *        @sk_txhash: computed flow hash for use on transmit
  *        @sk_filter: socket filtering instructions
  *        @sk_timer: sock cleanup timer
  *        @sk_stamp: time stamp of last packet received
  *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
  *        @sk_tsflags: SO_TIMESTAMPING socket options
  *        @sk_tskey: counter to disambiguate concurrent tstamp requests
  *        @sk_zckey: counter to order MSG_ZEROCOPY notifications
  *        @sk_socket: Identd and reporting IO signals
  *        @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
  *        @sk_frag: cached page frag
  *        @sk_peek_off: current peek_offset value
  *        @sk_send_head: front of stuff to transmit
  *        @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
  *        @sk_tx_skb_cache: cache copy of recently accessed TX skb
  *        @sk_security: used by security modules
  *        @sk_mark: generic packet mark
  *        @sk_cgrp_data: cgroup data for this cgroup
  *        @sk_memcg: this socket's memory cgroup association
  *        @sk_write_pending: a write to stream socket waits to start
  *        @sk_wait_pending: number of threads blocked on this socket
  *        @sk_state_change: callback to indicate change in the state of the sock
  *        @sk_data_ready: callback to indicate there is data to be processed
  *        @sk_write_space: callback to indicate there is buffer space available for sending
  *        @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *        @sk_backlog_rcv: callback to process the backlog
  *        @sk_validate_xmit_skb: ptr to an optional validate function
  *        @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
  *        @sk_reuseport_cb: reuseport group container
  *        @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
  *        @sk_rcu: used during RCU grace period
  *        @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
  *        @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
  *        @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *        @sk_txtime_unused: unused txtime flags
  *        @sk_owner: reference to the real owner of the socket that calls
  *                   sock_lock_init_class_and_name().
  */
struct sock {
        /*
         * Now struct inet_timewait_sock also uses sock_common, so please just
         * don't add nothing before this first member (__sk_common) --acme
         */
        struct sock_common        __sk_common;
        /*
         * Shorthand names: each sk_xxx macro below expands to the matching
         * skc_xxx member of the embedded __sk_common.
         */
#define sk_node                        __sk_common.skc_node
#define sk_nulls_node                __sk_common.skc_nulls_node
#define sk_refcnt                __sk_common.skc_refcnt
#define sk_tx_queue_mapping        __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_XPS
#define sk_rx_queue_mapping        __sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin        __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end                __sk_common.skc_dontcopy_end
#define sk_hash                        __sk_common.skc_hash
#define sk_portpair                __sk_common.skc_portpair
#define sk_num                        __sk_common.skc_num
#define sk_dport                __sk_common.skc_dport
#define sk_addrpair                __sk_common.skc_addrpair
#define sk_daddr                __sk_common.skc_daddr
#define sk_rcv_saddr                __sk_common.skc_rcv_saddr
#define sk_family                __sk_common.skc_family
#define sk_state                __sk_common.skc_state
#define sk_reuse                __sk_common.skc_reuse
#define sk_reuseport                __sk_common.skc_reuseport
#define sk_ipv6only                __sk_common.skc_ipv6only
#define sk_net_refcnt                __sk_common.skc_net_refcnt
#define sk_bound_dev_if                __sk_common.skc_bound_dev_if
#define sk_bind_node                __sk_common.skc_bind_node
#define sk_prot                        __sk_common.skc_prot
#define sk_net                        __sk_common.skc_net
#define sk_v6_daddr                __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
#define sk_cookie                __sk_common.skc_cookie
#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
#define sk_flags                __sk_common.skc_flags
#define sk_rxhash                __sk_common.skc_rxhash

        socket_lock_t                sk_lock;
        atomic_t                sk_drops;
        int                        sk_rcvlowat;
        struct sk_buff_head        sk_error_queue;
        struct sk_buff                *sk_rx_skb_cache;
        struct sk_buff_head        sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
         * the per-socket spinlock held and requires low latency
         * access. Therefore we special case its implementation.
         * Note : rmem_alloc is in this structure to fill a hole
         * on 64bit arches, not because it's logically part of
         * backlog.
         */
        struct {
                atomic_t        rmem_alloc;
                int                len;
                struct sk_buff        *head;
                struct sk_buff        *tail;
        } sk_backlog;
        /* sk_rmem_alloc is the rmem_alloc counter stored inside sk_backlog */
#define sk_rmem_alloc sk_backlog.rmem_alloc

        int                        sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int                sk_ll_usec;
        /* ===== mostly read cache line ===== */
        unsigned int                sk_napi_id;
#endif
        int                        sk_rcvbuf;
        int                        sk_wait_pending;

        struct sk_filter __rcu        *sk_filter;
        /*
         * sk_wq and sk_wq_raw share storage: the same pointer, declared
         * with and without the __rcu annotation.
         */
        union {
                struct socket_wq __rcu        *sk_wq;
                /* private: */
                struct socket_wq        *sk_wq_raw;
                /* public: */
        };
#ifdef CONFIG_XFRM
        struct xfrm_policy __rcu *sk_policy[2];
#endif
        struct dst_entry __rcu        *sk_rx_dst;
        struct dst_entry __rcu        *sk_dst_cache;
        atomic_t                sk_omem_alloc;
        int                        sk_sndbuf;

        /* ===== cache line for TX ===== */
        int                        sk_wmem_queued;
        refcount_t                sk_wmem_alloc;
        unsigned long                sk_tsq_flags;
        /* sk_send_head and the TCP retransmit queue root share storage */
        union {
                struct sk_buff        *sk_send_head;
                struct rb_root        tcp_rtx_queue;
        };
        struct sk_buff                *sk_tx_skb_cache;
        struct sk_buff_head        sk_write_queue;
        __s32                        sk_peek_off;
        int                        sk_write_pending;
        __u32                        sk_dst_pending_confirm;
        u32                        sk_pacing_status; /* see enum sk_pacing */
        long                        sk_sndtimeo;
        struct timer_list        sk_timer;
        __u32                        sk_priority;
        __u32                        sk_mark;
        unsigned long                sk_pacing_rate; /* bytes per second */
        unsigned long                sk_max_pacing_rate;
        struct page_frag        sk_frag;
        netdev_features_t        sk_route_caps;
        netdev_features_t        sk_route_nocaps;
        netdev_features_t        sk_route_forced_caps;
        int                        sk_gso_type;
        unsigned int                sk_gso_max_size;
        gfp_t                        sk_allocation;
        __u32                        sk_txhash;

        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
        /* 1 + 1 + 1 + 1 + 4 = 8 bits: these bitfields fill one u8 */
        u8                        sk_padding : 1,
                                sk_kern_sock : 1,
                                sk_no_check_tx : 1,
                                sk_no_check_rx : 1,
                                sk_userlocks : 4;
        u8                        sk_pacing_shift;
        u16                        sk_type;
        u16                        sk_protocol;
        u16                        sk_gso_max_segs;
        unsigned long                sk_lingertime;
        struct proto                *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                        sk_err,
                                sk_err_soft;
        u32                        sk_ack_backlog;
        u32                        sk_max_ack_backlog;
        kuid_t                        sk_uid;
        spinlock_t                sk_peer_lock;
        struct pid                *sk_peer_pid;
        const struct cred        *sk_peer_cred;

        long                        sk_rcvtimeo;
        ktime_t                        sk_stamp;
        /* sk_stamp_seq only exists when long is 32 bits wide */
#if BITS_PER_LONG==32
        seqlock_t                sk_stamp_seq;
#endif
        u16                        sk_tsflags;
        u8                        sk_shutdown;
        u32                        sk_tskey;
        atomic_t                sk_zckey;

        u8                        sk_clockid;
        /* 1 + 1 + 6 = 8 bits: the SO_TXTIME flag bitfields fill one u8 */
        u8                        sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;

        struct socket                *sk_socket;
        void                        *sk_user_data;
#ifdef CONFIG_SECURITY
        void                        *sk_security;
#endif
        struct sock_cgroup_data        sk_cgrp_data;
        struct mem_cgroup        *sk_memcg;
        /* per-socket callbacks; each one receives the socket it belongs to */
        void                        (*sk_state_change)(struct sock *sk);
        void                        (*sk_data_ready)(struct sock *sk);
        void                        (*sk_write_space)(struct sock *sk);
        void                        (*sk_error_report)(struct sock *sk);
        int                        (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff*                (*sk_validate_xmit_skb)(struct sock *sk,
                                                        struct net_device *dev,
                                                        struct sk_buff *skb);
#endif
        void                    (*sk_destruct)(struct sock *sk);
        struct sock_reuseport __rcu        *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu        *sk_bpf_storage;
#endif
        struct rcu_head                sk_rcu;

        /* sk_owner is only built with both lock proving and module support */
#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
        struct module                *sk_owner;
#endif
};

/* Socket pacing values; each enumerator's number is spelled out explicitly. */
enum sk_pacing {
        SK_PACING_NONE = 0,
        SK_PACING_NEEDED = 1,
        SK_PACING_FQ = 2,
};

/* flag bits in sk_user_data
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
 *   not be suitable for copying when cloning the socket. For instance,
 *   it can point to a reference counted object. sk_user_data bottom
 *   bit is set if pointer must not be copied.
 *
 * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
 *   managed/owned by a BPF reuseport array. This bit should be set
 *   when sk_user_data's sk is added to the bpf's reuseport_array.
 *
 * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
 *   sk_user_data points to psock type. This bit should be set
 *   when sk_user_data is assigned to a psock object.
 */
/* Flag bits stored in the three lowest bits of sk_user_data. */
#define SK_USER_DATA_NOCOPY        1UL
#define SK_USER_DATA_BPF        2UL
#define SK_USER_DATA_PSOCK        4UL
/*
 * Mask that keeps the pointer part of sk_user_data, i.e. everything except
 * the flag bits above.  The whole expansion is parenthesized so the macro
 * always behaves as a single operand at its point of use.
 */
#define SK_USER_DATA_PTRMASK        (~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
                                    SK_USER_DATA_PSOCK))

/**
 * sk_user_data_is_nocopy - Check the SK_USER_DATA_NOCOPY bit of sk_user_data
 * @sk: socket
 *
 * Return: true when the SK_USER_DATA_NOCOPY bit is set in the raw
 * sk_user_data value, false otherwise.
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
        uintptr_t raw = (uintptr_t)sk->sk_user_data;

        return raw & SK_USER_DATA_NOCOPY;
}

/*
 * Lvalue view of (sk)->sk_user_data through a pointer cast to
 * 'void __rcu **', so the field can be passed to rcu_dereference() and
 * rcu_assign_pointer() as done by the helpers in this file.
 */
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

/**
 * __rcu_dereference_sk_user_data_with_flags - fetch the sk_user_data pointer
 * if every requested flag bit is set
 *
 * @sk: socket
 * @flags: flag bits that must all be present in sk_user_data
 *
 * Warns once if @flags contains any bit that belongs to the pointer part
 * (SK_USER_DATA_PTRMASK).
 *
 * Return: sk_user_data with the flag bits masked off when all bits of
 * @flags are set in it, NULL otherwise.
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
                                          uintptr_t flags)
{
        uintptr_t raw = (uintptr_t)rcu_dereference(__sk_user_data(sk));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/* Fetch the sk_user_data pointer without requiring any flag bit (flags == 0). */
#define rcu_dereference_sk_user_data(sk)                                \
        __rcu_dereference_sk_user_data_with_flags(sk, 0)
/*
 * Store (ptr | flags) into sk_user_data via rcu_assign_pointer().
 * Warns once if @ptr has any of the flag bits set, or if @flags has any bit
 * outside the flag bits.  @ptr and @flags are each evaluated once.
 */
#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)                \
({                                                                        \
        uintptr_t __tmp1 = (uintptr_t)(ptr),                                \
                  __tmp2 = (uintptr_t)(flags);                                \
        WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);                        \
        WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);                        \
        rcu_assign_pointer(__sk_user_data((sk)),                        \
                           __tmp1 | __tmp2);                                \
})
/* Store @ptr into sk_user_data with no flag bits set. */
#define rcu_assign_sk_user_data(sk, ptr)                                \
        __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)

/*
 * Socket port-reuse values:
 *  - SK_NO_REUSE / SK_CAN_REUSE: the socket does not / does accept having
 *    its port reused by someone else.
 *  - SK_FORCE_REUSE: the socket reuses everybody else's port without
 *    looking at the other socket's sk_reuse value.
 */

#define SK_NO_REUSE 0
#define SK_CAN_REUSE 1
#define SK_FORCE_REUSE 2

/* Prototype only; the definition is not in this part of the file. */
int sk_set_peek_off(struct sock *sk, int val);

/*
 * Return sk->sk_peek_off (read with READ_ONCE()) when MSG_PEEK is set in
 * @flags, and 0 otherwise.
 */
static inline int sk_peek_offset(struct sock *sk, int flags)
{
        return unlikely(flags & MSG_PEEK) ? READ_ONCE(sk->sk_peek_off) : 0;
}

/*
 * Move sk->sk_peek_off back by @val, clamping the result at 0.
 * Nothing is written when the current offset is negative.
 */
static inline void sk_peek_offset_bwd(struct sock *sk, int val)
{
        s32 cur = READ_ONCE(sk->sk_peek_off);

        if (unlikely(cur >= 0))
                WRITE_ONCE(sk->sk_peek_off, max_t(s32, cur - val, 0));
}

/* Advance the peek offset by @val: a backward move by the negated amount. */
static inline void sk_peek_offset_fwd(struct sock *sk, int val)
{
        int delta = -val;

        sk_peek_offset_bwd(sk, delta);
}

/*
 * Hashed lists helper routines
 */
/* Map an hlist node embedded as sk_node back to its containing struct sock. */
static inline struct sock *sk_entry(const struct hlist_node *node)
{
        struct sock *sk = hlist_entry(node, struct sock, sk_node);

        return sk;
}

/*
 * Return the sock containing the first node of @head.
 * No emptiness check is done here; sk_head() is the checked variant.
 */
static inline struct sock *__sk_head(const struct hlist_head *head)
{
        struct sock *first = hlist_entry(head->first, struct sock, sk_node);

        return first;
}

/* Return the first sock on @head, or NULL when the list is empty. */
static inline struct sock *sk_head(const struct hlist_head *head)
{
        if (hlist_empty(head))
                return NULL;

        return __sk_head(head);
}

/*
 * Return the sock containing the first node of a nulls list.
 * No emptiness check is done here; sk_nulls_head() is the checked variant.
 */
static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
{
        struct sock *first = hlist_nulls_entry(head->first, struct sock,
                                               sk_nulls_node);

        return first;
}

/* Return the first sock on a nulls list, or NULL when the list is empty. */
static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
{
        if (hlist_nulls_empty(head))
                return NULL;

        return __sk_nulls_head(head);
}

/*
 * Return the sock that follows @sk on its sk_node list, as computed by
 * hlist_entry_safe() from sk->sk_node.next.
 */
static inline struct sock *sk_next(const struct sock *sk)
{
        struct sock *next = hlist_entry_safe(sk->sk_node.next, struct sock,
                                             sk_node);

        return next;
}

/*
 * Return the sock that follows @sk on its nulls list, or NULL when the next
 * pointer is a nulls marker.
 */
static inline struct sock *sk_nulls_next(const struct sock *sk)
{
        if (is_a_nulls(sk->sk_nulls_node.next))
                return NULL;

        return hlist_nulls_entry(sk->sk_nulls_node.next,
                                 struct sock, sk_nulls_node);
}

/* Report whether sk->sk_node is unhashed according to hlist_unhashed(). */
static inline bool sk_unhashed(const struct sock *sk)
{
        bool unhashed = hlist_unhashed(&sk->sk_node);

        return unhashed;
}

/* Logical inverse of sk_unhashed(). */
static inline bool sk_hashed(const struct sock *sk)
{
        bool unhashed = sk_unhashed(sk);

        return !unhashed;
}

/* Reset @node's pprev pointer to NULL; node->next is left untouched. */
static inline void sk_node_init(struct hlist_node *node)
{
        node->pprev = NULL;
}

/* Reset a nulls node's pprev pointer to NULL; node->next is left untouched. */
static inline void sk_nulls_node_init(struct hlist_nulls_node *node)
{
        node->pprev = NULL;
}

/*
 * Unlink sk->sk_node with __hlist_del().  The socket reference count is
 * not changed and the node is not re-initialised here.
 */
static inline void __sk_del_node(struct sock *sk)
{
        __hlist_del(&sk->sk_node);
}

/*
 * NB: equivalent to hlist_del_init_rcu
 *
 * Unlink @sk from its sk_node list and re-initialise the node.
 * Returns true if the socket was hashed (and got removed), false otherwise.
 */
static inline bool __sk_del_node_init(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        __sk_del_node(sk);
        sk_node_init(&sk->sk_node);
        return true;
}

/* Grab socket reference count. This operation is valid only
   when sk is ALREADY grabbed, e.g. it is found in a hash table
   or a list and the lookup is made under a lock preventing hash table
   modifications.
 */

static __always_inline void sock_hold(struct sock *sk)
{
        refcount_inc(&sk->sk_refcnt);
}

/* Ungrab socket in a context which assumes that the socket refcnt
   cannot hit zero, e.g. in the context of any socketcall.
   Only decrements sk->sk_refcnt; nothing is freed here.
 */
static __always_inline void __sock_put(struct sock *sk)
{
        refcount_dec(&sk->sk_refcnt);
}

/*
 * Remove @sk from its sk_node list and, if it was actually hashed, drop one
 * reference with __sock_put().  Warns if the refcount reads 1 just before
 * that drop.
 *
 * Returns true when the socket was removed, false when it was not hashed.
 */
static inline bool sk_del_node_init(struct sock *sk)
{
        bool removed = __sk_del_node_init(sk);

        if (!removed)
                return removed;

        /* paranoid for a while -acme */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return removed;
}
/* The _rcu name is a plain alias for sk_del_node_init(). */
#define sk_del_node_init_rcu(sk)        sk_del_node_init(sk)

/*
 * Remove @sk from its nulls list via hlist_nulls_del_init_rcu() if it is
 * hashed.  The reference count is not touched.
 *
 * Returns true when the socket was removed, false when it was not hashed.
 */
static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
        return true;
}

/*
 * Remove @sk from its nulls list and, if it was actually hashed, drop one
 * reference with __sock_put().  Warns if the refcount reads 1 just before
 * that drop.
 *
 * Returns true when the socket was removed, false when it was not hashed.
 */
static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
        bool removed = __sk_nulls_del_node_init_rcu(sk);

        if (!removed)
                return removed;

        /* paranoid for a while -acme */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return removed;
}

/*
 * If @old is hashed, replace its nulls node with @new's via
 * hlist_nulls_replace_init_rcu() and drop one reference on @old.
 * No reference is taken on @new here.
 *
 * Returns true when the replacement happened, false when @old was not hashed.
 */
static inline bool sk_nulls_replace_node_init_rcu(struct sock *old,
                                                  struct sock *new)
{
        if (!sk_hashed(old))
                return false;

        hlist_nulls_replace_init_rcu(&old->sk_nulls_node,
                                     &new->sk_nulls_node);
        __sock_put(old);
        return true;
}

/* Insert sk->sk_node at the head of @list; no reference is taken. */
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
        hlist_add_head(&sk->sk_node, list);
}

/* Take a reference on @sk, then insert it at the head of @list. */
static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        __sk_add_node(sk, list);
}

/*
 * Take a reference on @sk and add it to @list with the RCU list helpers.
 * The socket goes to the tail only when CONFIG_IPV6 is enabled, sk_reuseport
 * is set and the family is AF_INET6; in every other case it goes to the head.
 */
static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);

        if (!IS_ENABLED(CONFIG_IPV6) || !sk->sk_reuseport ||
            sk->sk_family != AF_INET6) {
                hlist_add_head_rcu(&sk->sk_node, list);
                return;
        }

        hlist_add_tail_rcu(&sk->sk_node, list);
}

/* Take a reference on @sk, then append it to the tail of @list (RCU variant). */
static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        hlist_add_tail_rcu(&sk->sk_node, list);
}

/* Insert sk->sk_nulls_node at the head of a nulls list; no reference is taken. */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
}

/* Append sk->sk_nulls_node to the tail of a nulls list; no reference is taken. */
static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
}

/* Take a reference on @sk, then insert it at the head of the nulls list. */
static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        sock_hold(sk);
        __sk_nulls_add_node_rcu(sk, list);
}

/* Unlink sk->sk_bind_node with __hlist_del(); the refcount is not changed. */
static inline void __sk_del_bind_node(struct sock *sk)
{
        __hlist_del(&sk->sk_bind_node);
}

/*
 * Insert sk->sk_bind_node at the head of @list.  Unlike sk_add_node(),
 * no reference is taken on @sk here.
 */
static inline void sk_add_bind_node(struct sock *sk,
                                        struct hlist_head *list)
{
        hlist_add_head(&sk->sk_bind_node, list);
}

/*
 * Iteration helpers over socket lists.  Each one forwards to the matching
 * hlist / hlist_nulls iterator with the member name filled in:
 * sk_node, sk_nulls_node or sk_bind_node (the *_bound variants).
 */
#define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, sk_node)
#define sk_nulls_for_each(__sk, node, list) \
        hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
#define sk_nulls_for_each_rcu(__sk, node, list) \
        hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
#define sk_for_each_from(__sk) \
        hlist_for_each_entry_from(__sk, sk_node)
/*
 * Continue a nulls-list walk from __sk.  The leading 'if' skips the loop
 * when __sk is NULL and otherwise seeds 'node' from __sk before iterating.
 * NOTE(review): because this expands to an unbraced 'if', an 'else' placed
 * right after the loop body would bind to it.
 */
#define sk_nulls_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
                hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
#define sk_for_each_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 * @offset:        offset of hlist_node within the struct.
 *
 * On every iteration @tpos is set to @pos minus @offset bytes, cast to the
 * type of *@tpos.  The first and next nodes are fetched with
 * rcu_dereference(); the loop ends when @pos becomes NULL.
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)                       \
        for (pos = rcu_dereference(hlist_first_rcu(head));                       \
             pos != NULL &&                                                       \
                ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/*
 * Return the user namespace reached through
 * sk->sk_socket->file->f_cred->user_ns.  None of the intermediate pointers
 * is checked for NULL.
 */
static inline struct user_namespace *sk_user_ns(struct sock *sk)
{
        /* Careful only use this in a context where these parameters
         * can not change and must all be valid, such as recvmsg from
         * userspace.
         */
        return sk->sk_socket->file->f_cred->user_ns;
}

/* Sock flags
 *
 * Each enumerator is a bit number within sk->sk_flags; it is used as such by
 * sock_set_flag(), sock_reset_flag() and sock_flag() in this file.  Values
 * are assigned implicitly, so the order of the entries determines them.
 */
enum sock_flags {
        SOCK_DEAD,
        SOCK_DONE,
        SOCK_URGINLINE,
        SOCK_KEEPOPEN,
        SOCK_LINGER,
        SOCK_DESTROY,
        SOCK_BROADCAST,
        SOCK_TIMESTAMP,
        SOCK_ZAPPED,
        SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
        SOCK_DBG, /* %SO_DEBUG setting */
        SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
        SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
        SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
        SOCK_MEMALLOC, /* VM depends on this socket for swapping */
        SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
        SOCK_FASYNC, /* fasync() active */
        SOCK_RXQ_OVFL,
        SOCK_ZEROCOPY, /* buffers from userspace */
        SOCK_WIFI_STATUS, /* push wifi status to userspace */
        SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
                     * Will use last 4 bytes of packet sent from
                     * user-space instead.
                     */
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
        SOCK_TXTIME,
        SOCK_XDP, /* XDP is attached */
        SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
};

/* Bit mask with the SOCK_TIMESTAMP and SOCK_TIMESTAMPING_RX_SOFTWARE bits set. */
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

/* Overwrite @nsk's whole sk_flags word with the value from @osk. */
static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
{
        nsk->sk_flags = osk->sk_flags;
}

/*
 * Set bit @flag in sk->sk_flags using __set_bit().
 * NOTE(review): presumably the non-atomic bit helper, so callers need their
 * own serialisation - confirm against the bitops documentation.
 */
static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        __set_bit(flag, &sk->sk_flags);
}

/*
 * Clear bit @flag in sk->sk_flags using __clear_bit().
 * NOTE(review): presumably the non-atomic bit helper, so callers need their
 * own serialisation - confirm against the bitops documentation.
 */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        __clear_bit(flag, &sk->sk_flags);
}

/* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
                                     int valbool)
{
        if (!valbool) {
                sock_reset_flag(sk, bit);
                return;
        }

        sock_set_flag(sk, bit);
}

/* Return whether bit @flag is currently set in sk->sk_flags. */
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
        return test_bit(flag, &sk->sk_flags);
}

/*
 * With CONFIG_NET, sk_memalloc_socks() tests the memalloc_socks_key static
 * key and __receive_sock() is only declared here.  Without CONFIG_NET both
 * are stubs: sk_memalloc_socks() returns 0 and __receive_sock() does nothing.
 */
#ifdef CONFIG_NET
DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
static inline int sk_memalloc_socks(void)
{
        return static_branch_unlikely(&memalloc_socks_key);
}

void __receive_sock(struct file *file);
#else

static inline int sk_memalloc_socks(void)
{
        return 0;
}

static inline void __receive_sock(struct file *file)
{ }
#endif

/*
 * Return @gfp_mask with __GFP_MEMALLOC added when that bit is set in
 * sk->sk_allocation; no other bit of sk_allocation is propagated.
 */
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
{
        gfp_t memalloc = sk->sk_allocation & __GFP_MEMALLOC;

        return gfp_mask | memalloc;
}

/*
 * Decrement sk->sk_ack_backlog by one.  Only the store uses WRITE_ONCE();
 * the read of the old value is a plain access.
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

/*
 * Increment sk->sk_ack_backlog by one.  Only the store uses WRITE_ONCE();
 * the read of the old value is a plain access.
 */
static inline void sk_acceptq_added(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/*
 * Return true when sk_ack_backlog is strictly greater than
 * sk_max_ack_backlog.  Both fields are read with READ_ONCE().
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

/*
 * Compute minimal free write space needed to queue new packets:
 * half of the current sk_wmem_queued value (right shift by one).
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
        int queued = READ_ONCE(sk->sk_wmem_queued);

        return queued >> 1;
}

/* Return sk_sndbuf minus sk_wmem_queued, each read with READ_ONCE(). */
static inline int sk_stream_wspace(const struct sock *sk)
{
        int sndbuf = READ_ONCE(sk->sk_sndbuf);
        int queued = READ_ONCE(sk->sk_wmem_queued);

        return sndbuf - queued;
}

/*
 * Add @val (which may be negative) to sk->sk_wmem_queued.  The old value is
 * read with a plain access; only the store uses WRITE_ONCE().
 */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
        int queued = sk->sk_wmem_queued + val;

        WRITE_ONCE(sk->sk_wmem_queued, queued);
}

/* Prototype only; the definition is not in this part of the file. */
void sk_stream_write_space(struct sock *sk);

/*
 * OOB backlog add
 *
 * Append @skb to the tail of sk->sk_backlog.  Updates head/tail and
 * terminates the chain; sk_backlog.len is not adjusted here (sk_add_backlog()
 * does that).
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /* we are about to leave the rcu lock, so the skb dst must be refcounted */
        skb_dst_force(skb);

        if (sk->sk_backlog.tail)
                sk->sk_backlog.tail->next = skb;
        else
                WRITE_ONCE(sk->sk_backlog.head, skb);

        WRITE_ONCE(sk->sk_backlog.tail, skb);
        skb->next = NULL;
}

/*
 * Take into account size of receive queue and backlog queue
 * Do not take into account this skb truesize,
 * to allow even a single big packet to come.
 *
 * Returns true when sk_backlog.len plus sk_rmem_alloc is strictly greater
 * than @limit.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
        unsigned int used = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

        return limit < used;
}

/*
 * The per-socket spinlock must be held here.
 *
 * Queue @skb on the socket backlog and account its truesize.
 * Returns 0 on success, -ENOBUFS when the receive queues already exceed
 * @limit, or -ENOMEM when @skb is a pfmemalloc skb and the socket does not
 * have SOCK_MEMALLOC set.
 */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
                                              unsigned int limit)
{
        int pfmemalloc_denied;

        if (sk_rcvqueues_full(sk, limit))
                return -ENOBUFS;

        /*
         * An skb taken from the pfmemalloc reserves may only be used by
         * SOCK_MEMALLOC sockets, since those help to free memory.
         */
        pfmemalloc_denied = skb_pfmemalloc(skb) &&
                            !sock_flag(sk, SOCK_MEMALLOC);
        if (pfmemalloc_denied)
                return -ENOMEM;

        __sk_add_backlog(sk, skb);
        sk->sk_backlog.len += skb->truesize;
        return 0;
}

/* Prototype only; used by sk_backlog_rcv() for pfmemalloc skbs. */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);

/*
 * Hand @skb to the socket's backlog receive handler.  The
 * __sk_backlog_rcv() path is taken only when memalloc sockets exist and
 * @skb is a pfmemalloc skb; otherwise sk->sk_backlog_rcv is called directly.
 */
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return sk->sk_backlog_rcv(sk, skb);

        return __sk_backlog_rcv(sk, skb);
}

/*
 * Record the current CPU id in sk->sk_incoming_cpu.  The store is skipped
 * when the field already holds that id.
 */
static inline void sk_incoming_cpu_update(struct sock *sk)
{
        int this_cpu = raw_smp_processor_id();
        int recorded = READ_ONCE(sk->sk_incoming_cpu);

        if (unlikely(recorded != this_cpu))
                WRITE_ONCE(sk->sk_incoming_cpu, this_cpu);
}

/*
 * Record @hash in the global rps_sock_flow_table.  The table pointer is
 * fetched and used inside an rcu_read_lock() section.  Compiles to an empty
 * function without CONFIG_RPS.
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
        struct rps_sock_flow_table *sock_flow_table;

        rcu_read_lock();
        sock_flow_table = rcu_dereference(rps_sock_flow_table);
        rps_record_sock_flow(sock_flow_table, hash);
        rcu_read_unlock();
#endif
}

/*
 * Record the socket's rx hash for RFS.  This happens only when the
 * rfs_needed static key is enabled and the socket is in TCP_ESTABLISHED
 * state.  Compiles to an empty function without CONFIG_RPS.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
        /* Reading sk->sk_rxhash might incur an expensive cache line
         * miss.
         *
         * TCP_ESTABLISHED does cover almost all states where RFS
         * might be useful, and is cheaper [1] than testing :
         *        IPv4: inet_sk(sk)->inet_daddr
         *         IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
         * OR        an additional socket flag
         * [1] : sk_state and sk_prot are in the same cache line.
         *
         * The READ_ONCE() below is paired with the WRITE_ONCE()
         * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
         */
        if (static_branch_unlikely(&rfs_needed) &&
            sk->sk_state == TCP_ESTABLISHED)
                sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
#endif
}

/*
 * Copy skb->hash into sk->sk_rxhash.  The store is skipped when the two
 * values are already equal.  Compiles to an empty function without
 * CONFIG_RPS.
 */
static inline void sock_rps_save_rxhash(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        /* The following WRITE_ONCE() is paired with the READ_ONCE()
         * here, and another one in sock_rps_record_flow().
         */
        if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
                WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

/* Set sk->sk_rxhash to 0.  Compiles to an empty function without CONFIG_RPS. */
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
        /* Paired with READ_ONCE() in sock_rps_record_flow() */
        WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}

/*
 * sk_wait_event - release the socket lock and wait for a condition
 * @__sk:        socket; expected to be locked by the caller, since the macro
 *                calls release_sock() first and lock_sock() before returning
 * @__timeo:        pointer to the remaining timeout; overwritten with the value
 *                returned by wait_woken() when a wait actually takes place
 * @__condition: expression evaluated once after the lock is dropped and once
 *                more after it has been re-taken
 * @__wait:        wait entry passed to wait_woken()
 *
 * sk_wait_pending is incremented for the duration of the call.  The wait
 * (TASK_INTERRUPTIBLE) is skipped when @__condition is already true right
 * after release_sock().  The value of the macro is the second evaluation of
 * @__condition.
 *
 * Note that @__sk and @__condition are expanded several times and without
 * parentheses, so they should be simple, side-effect-free expressions.
 */
#define sk_wait_event(__sk, __timeo, __condition, __wait)                \
        ({        int __rc;                                                \
                __sk->sk_wait_pending++;                                \
                release_sock(__sk);                                        \
                __rc = __condition;                                        \
                if (!__rc) {                                                \
                        *(__timeo) = wait_woken(__wait,                        \
                                                TASK_INTERRUPTIBLE,        \
                                                *(__timeo));                \
                }                                                        \
                sched_annotate_sleep();                                        \
                lock_sock(__sk);                                        \
                __sk->sk_wait_pending--;                                \
                __rc = __condition;                                        \
                __rc;                                                        \
        })

/*
 * Stream socket and memalloc helpers.  Prototypes only; the definitions are
 * not in this part of the file.
 */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

/* Prototype only; called by sk_flush_backlog() when the backlog is non-empty. */
void __sk_flush_backlog(struct sock *sk);

/*
 * Flush the socket backlog if it has a tail entry.
 * Returns true when __sk_flush_backlog() was called, false when the backlog
 * was empty.
 */
static inline bool sk_flush_backlog(struct sock *sk)
{
        if (!unlikely(READ_ONCE(sk->sk_backlog.tail)))
                return false;

        __sk_flush_backlog(sk);
        return true;
}

/* Prototype only; the definition is not in this part of the file. */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

/* Forward declarations for types referenced by pointer in struct proto. */
struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;

/*
 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
 * un-modified. Special care is taken when initializing object to zero.
 *
 * Zeroes the first @size bytes of @sk except for sk_node.next: the bytes
 * before that field (if any) are cleared first, then everything from
 * sk_node.pprev up to @size.
 * NOTE(review): this relies on sk_node.pprev directly following sk_node.next
 * in memory - confirm against the hlist_node layout.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
        if (offsetof(struct sock, sk_node.next) != 0)
                memset(sk, 0, offsetof(struct sock, sk_node.next));
        memset(&sk->sk_node.pprev, 0,
               size - offsetof(struct sock, sk_node.pprev));
}

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 *
 * Mostly a table of per-protocol callbacks (close, connect, sendmsg, ...),
 * followed by memory accounting pointers, sysctl limits, slab cache
 * parameters, hash table handles (union h) and bookkeeping (owner, name,
 * list node).  Some members only exist under CONFIG_COMPAT, CONFIG_PROC_FS
 * or SOCK_REFCNT_DEBUG.
 */
struct proto {
        void                        (*close)(struct sock *sk,
                                        long timeout);
        int                        (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*disconnect)(struct sock *sk, int flags);

        struct sock *                (*accept)(struct sock *sk, int flags, int *err,
                                          bool kern);

        int                        (*ioctl)(struct sock *sk, int cmd,
                                         unsigned long arg);
        int                        (*init)(struct sock *sk);
        void                        (*destroy)(struct sock *sk);
        void                        (*shutdown)(struct sock *sk, int how);
        int                        (*setsockopt)(struct sock *sk, int level,
                                        int optname, sockptr_t optval,
                                        unsigned int optlen);
        int                        (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);
        void                        (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
        int                        (*compat_ioctl)(struct sock *sk,
                                        unsigned int cmd, unsigned long arg);
#endif
        int                        (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                        (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int noblock, int flags,
                                           int *addr_len);
        int                        (*sendpage)(struct sock *sk, struct page *page,
                                        int offset, size_t size, int flags);
        int                        (*bind)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);
        int                        (*bind_add)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);

        int                        (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
        bool                        (*bpf_bypass_getsockopt)(int level,
                                                         int optname);

        void                (*release_cb)(struct sock *sk);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        int                        (*hash)(struct sock *sk);
        void                        (*unhash)(struct sock *sk);
        void                        (*rehash)(struct sock *sk);
        int                        (*get_port)(struct sock *sk, unsigned short snum);

        /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
        unsigned int                inuse_idx;
#endif

        bool                        (*stream_memory_free)(const struct sock *sk, int wake);
        bool                        (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                        (*enter_memory_pressure)(struct sock *sk);
        void                        (*leave_memory_pressure)(struct sock *sk);
        atomic_long_t                *memory_allocated;        /* Current allocated memory. */
        struct percpu_counter        *sockets_allocated;        /* Current number of sockets. */
        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
         * All the __sk_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         */
        unsigned long                *memory_pressure;
        long                        *sysctl_mem;

        int                        *sysctl_wmem;
        int                        *sysctl_rmem;
        u32                        sysctl_wmem_offset;
        u32                        sysctl_rmem_offset;

        int                        max_header;
        bool                        no_autobind;

        struct kmem_cache        *slab;
        unsigned int                obj_size;
        slab_flags_t                slab_flags;
        unsigned int                useroffset;        /* Usercopy region offset */
        unsigned int                usersize;        /* Usercopy region size */

        unsigned int __percpu        *orphan_count;

        struct request_sock_ops        *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        union {
                struct inet_hashinfo        *hashinfo;
                struct udp_table        *udp_table;
                struct raw_hashinfo        *raw_hash;
                struct smc_hashinfo        *smc_hash;
        } h;

        struct module                *owner;

        char                        name[32];

        struct list_head        node;
#ifdef SOCK_REFCNT_DEBUG
        atomic_t                socks;
#endif
        int                        (*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

/*
 * Protocol registration and diag module loading.  Prototypes only; the
 * definitions are not in this part of the file.
 */
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

/*
 * Optional per-protocol socket counting.  With SOCK_REFCNT_DEBUG defined
 * these helpers maintain sk_prot->socks and print KERN_DEBUG messages;
 * otherwise all three expand to empty statements.
 */
#ifdef SOCK_REFCNT_DEBUG
/* Count one more live socket for @sk's protocol. */
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
        atomic_inc(&sk->sk_prot->socks);
}

/* Count one socket less for @sk's protocol and log the remaining number. */
static inline void sk_refcnt_debug_dec(struct sock *sk)
{
        atomic_dec(&sk->sk_prot->socks);
        printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
               sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
}

/* Log a message when @sk's reference count is anything other than 1. */
static inline void sk_refcnt_debug_release(const struct sock *sk)
{
        if (refcount_read(&sk->sk_refcnt) != 1)
                printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
                       sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt));
}
#else /* SOCK_REFCNT_DEBUG */
#define sk_refcnt_debug_inc(sk) do { } while (0)
#define sk_refcnt_debug_dec(sk) do { } while (0)
#define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */

/*
 * Decide whether the stream socket still has send memory available.
 * Returns false as soon as sk_wmem_queued has reached sk_sndbuf.  Otherwise
 * the protocol's stream_memory_free() callback decides, and a protocol
 * without that callback counts as free (true).
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
        if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
                return false;

        if (!sk->sk_prot->stream_memory_free)
                return true;

        return sk->sk_prot->stream_memory_free(sk, wake);
}

/* __sk_stream_memory_free() with wake == 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
        return __sk_stream_memory_free(sk, 0);
}

/*
 * A stream socket is writeable when its free write space is at least the
 * minimum required and __sk_stream_memory_free() agrees.  The memory check
 * is evaluated only when the space check passes.
 */
static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
        bool has_space = sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);

        return has_space && __sk_stream_memory_free(sk, wake);
}

/* __sk_stream_is_writeable() with wake == 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
        return __sk_stream_is_writeable(sk, 0);
}

/*
 * With CONFIG_SOCK_CGROUP_DATA, return the result of cgroup_is_descendant()
 * for the socket's cgroup and @ancestor.  Without it, return -ENOTSUPP.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
                                            struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
                                    ancestor);
#else
        return -ENOTSUPP;
#endif
}

/* Return true when the socket's protocol provides a memory_pressure pointer. */
static inline bool sk_has_memory_pressure(const struct sock *sk)
{
        return !!sk->sk_prot->memory_pressure;
}

/*
 * Return true when the protocol has a memory_pressure flag and that flag is
 * currently non-zero.  Memory cgroup pressure is not consulted here.
 */
static inline bool sk_under_global_memory_pressure(const struct sock *sk)
{
        if (!sk->sk_prot->memory_pressure)
                return false;

        return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}

/*
 * Return true when the socket is under memory pressure.  Protocols without
 * a memory_pressure flag never report pressure.  Otherwise pressure is
 * reported if the socket's memory cgroup is under socket pressure (when
 * memcg socket accounting is enabled), or if the protocol flag is non-zero.
 */
static inline bool sk_under_memory_pressure(const struct sock *sk)
{
        if (!sk->sk_prot->memory_pressure)
                return false;

        return (mem_cgroup_sockets_enabled && sk->sk_memcg &&
                mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
               !!READ_ONCE(*sk->sk_prot->memory_pressure);
}

/* Read the protocol-wide memory_allocated counter of @sk's protocol. */
static inline long
sk_memory_allocated(const struct sock *sk)
{
        return atomic_long_read(sk->sk_prot->memory_allocated);
}

/*
 * Atomically add @amt to the protocol's memory_allocated counter and return
 * the value produced by atomic_long_add_return().
 */
static inline long
sk_memory_allocated_add(struct sock *sk, int amt)
{
        return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
}

/* Atomically subtract @amt from the protocol's memory_allocated counter. */
static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
        atomic_long_sub(amt, sk->sk_prot->memory_allocated);
}

/* Decrement the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
        percpu_counter_dec(sk->sk_prot->sockets_allocated);
}

/* Increment the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
        percpu_counter_inc(sk->sk_prot->sockets_allocated);
}

/*
 * Return the protocol's sockets_allocated counter as read by
 * percpu_counter_read_positive().
 */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
        return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
}

/*
 * Return @prot's sockets_allocated counter as computed by
 * percpu_counter_sum_positive().  The result is converted to int by the
 * return type.
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
        return percpu_counter_sum_positive(prot->sockets_allocated);
}

/* Read @prot's memory_allocated counter. */
static inline long
proto_memory_allocated(struct proto *prot)
{
        return atomic_long_read(prot->memory_allocated);
}

/*
 * Return true when @prot has a memory_pressure flag and that flag is
 * currently non-zero; false when the protocol has no such flag.
 */
static inline bool
proto_memory_pressure(struct proto *prot)
{
        return prot->memory_pressure ?
                !!READ_ONCE(*prot->memory_pressure) : false;
}


#ifdef CONFIG_PROC_FS
/* With CONFIG_PROC_FS the following are implemented out of line
 * (not visible in this header).
 */
/* Called with local bh disabled */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
/* Without CONFIG_PROC_FS, sock_prot_inuse_add() is a no-op.
 * Note that only the "add" side gets a stub here; the two getters
 * above are not declared at all in this configuration.
 */
static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
                int inc)
{
}
#endif


/* Re-hash @sk by calling the protocol's unhash() and then its hash().
 *
 * With per-bucket locks this operation is not atomic, so
 * this version is not worse.
 *
 * Returns whatever the protocol's hash() callback returns.
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
        int ret;

        sk->sk_prot->unhash(sk);
        ret = sk->sk_prot->hash(sk);

        return ret;
}

/* About 10 seconds (expressed in jiffies via HZ) */
#define SOCK_DESTROY_TIME (10*HZ)

/* Ports 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK        1024

/* Shutdown bits; SHUTDOWN_MASK is RCV_SHUTDOWN | SEND_SHUTDOWN.
 * SEND_SHUTDOWN is tested against sk->sk_shutdown in sock_wspace().
 */
#define SHUTDOWN_MASK        3
#define RCV_SHUTDOWN        1
#define SEND_SHUTDOWN        2

/* Bit flags; SOCK_SNDBUF_LOCK is tested against sk->sk_userlocks in
 * sk_stream_moderate_sndbuf().  Presumably the others live in the same
 * field - confirm against their users.
 */
#define SOCK_SNDBUF_LOCK        1
#define SOCK_RCVBUF_LOCK        2
#define SOCK_BINDADDR_LOCK        4
#define SOCK_BINDPORT_LOCK        8

/* A struct socket and a VFS inode laid out in one object, so that each
 * can be recovered from the other with container_of(); see SOCKET_I()
 * and SOCK_INODE().
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

static inline struct inode *SOCK_INODE(struct socket *socket)
{
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* We used to have PAGE_SIZE here, but systems with 64KB pages
 * do not necessarily have 16 times more memory than 4KB ones.
 *
 * SK_MEM_QUANTUM is the accounting granule in bytes and
 * SK_MEM_QUANTUM_SHIFT its log2; SK_MEM_SEND / SK_MEM_RECV are the
 * "kind" values passed to __sk_mem_schedule() by sk_wmem_schedule()
 * and __sk_rmem_schedule() respectively.
 */
#define SK_MEM_QUANTUM 4096
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND        0
#define SK_MEM_RECV        1

/* Fetch entry @index of the protocol's sysctl_mem[] for @sk.
 *
 * sysctl_mem values are in pages; the value returned here is rescaled
 * to SK_MEM_QUANTUM units whenever PAGE_SIZE differs from SK_MEM_QUANTUM.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
        long limit = READ_ONCE(sk->sk_prot->sysctl_mem[index]);

#if PAGE_SIZE > SK_MEM_QUANTUM
        limit = limit << (PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT);
#elif PAGE_SIZE < SK_MEM_QUANTUM
        limit = limit >> (SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT);
#endif
        return limit;
}

/*
 * Convert a byte count into SK_MEM_QUANTUM units, rounding up.
 * Despite the name, the unit is SK_MEM_QUANTUM, not PAGE_SIZE.
 */
static inline int sk_mem_pages(int amt)
{
        int rounded = amt + SK_MEM_QUANTUM - 1;

        return rounded >> SK_MEM_QUANTUM_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
{
        /* return true if protocol supports memory accounting */
        return !!sk->sk_prot->memory_allocated;
}

/*
 * Check whether @size bytes of send memory can be charged to @sk.
 *
 * Succeeds immediately for protocols without accounting or when
 * sk_forward_alloc already covers @size; otherwise the shortfall is
 * requested through __sk_mem_schedule(..., SK_MEM_SEND).
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        return __sk_mem_schedule(sk, shortfall, SK_MEM_SEND);
}

/*
 * Check whether @size bytes of receive memory can be charged to @sk.
 *
 * Same logic as sk_wmem_schedule() with SK_MEM_RECV, except that a
 * failed __sk_mem_schedule() is still reported as success when
 * @pfmemalloc is true.  __sk_mem_schedule() is always attempted before
 * @pfmemalloc is consulted.
 */
static inline bool
__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        if (__sk_mem_schedule(sk, shortfall, SK_MEM_RECV))
                return true;

        return pfmemalloc;
}

/*
 * __sk_rmem_schedule() wrapper taking the pfmemalloc hint from @skb
 * via skb_pfmemalloc().
 */
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
        bool pfmemalloc = skb_pfmemalloc(skb);

        return __sk_rmem_schedule(sk, size, pfmemalloc);
}

/*
 * Hand the whole of sk_forward_alloc to __sk_mem_reclaim() once it
 * holds at least one SK_MEM_QUANTUM.  No-op without accounting.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
        if (sk_has_account(sk) && sk->sk_forward_alloc >= SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc);
}

/*
 * Like sk_mem_reclaim(), but only acts when sk_forward_alloc exceeds
 * one SK_MEM_QUANTUM and passes sk_forward_alloc - 1 to
 * __sk_mem_reclaim().  No-op without accounting.
 */
static inline void sk_mem_reclaim_partial(struct sock *sk)
{
        if (sk_has_account(sk) && sk->sk_forward_alloc > SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
}

/* Consume @size bytes of @sk's forward allocation (accounting protocols only). */
static inline void sk_mem_charge(struct sock *sk, int size)
{
        if (sk_has_account(sk))
                sk->sk_forward_alloc -= size;
}

/*
 * Give @size bytes back to @sk's forward allocation (accounting
 * protocols only), trimming the forward allocation if it grows large.
 */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
        if (!sk_has_account(sk))
                return;

        sk->sk_forward_alloc += size;

        /* Avoid a possible overflow.
         * TCP send queues can make this happen, if sk_mem_reclaim()
         * is not called and more than 2 GBytes are released at once.
         *
         * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
         * no need to hold that much forward allocation anyway.
         */
        if (likely(sk->sk_forward_alloc < (1 << 21)))
                return;

        __sk_mem_reclaim(sk, 1 << 20);
}

DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
/*
 * Release a write-queue skb of @sk: undo its accounting and then either
 * keep it as the socket's cached tx skb or free it.
 */
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
        /* Drop the skb's truesize from the queued count and uncharge it. */
        sk_wmem_queued_add(sk, -skb->truesize);
        sk_mem_uncharge(sk, skb->truesize);
        /* When the tcp_tx_skb_cache_key static branch is enabled, the
         * socket has no cached skb yet and this skb is not cloned, keep
         * it in sk->sk_tx_skb_cache instead of freeing it.
         */
        if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
            !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
                skb_ext_reset(skb);
                skb_zcopy_clear(skb, true);
                sk->sk_tx_skb_cache = skb;
                return;
        }
        __kfree_skb(skb);
}

/*
 * Clear the "owned" state of @sk's lock and tell lockdep about the
 * release.  Does nothing when the lock is not currently owned.
 */
static inline void sock_release_ownership(struct sock *sk)
{
        if (!sk->sk_lock.owned)
                return;

        sk->sk_lock.owned = 0;

        /* The sk_lock has mutex_unlock() semantics: */
        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/*
 * Module ownership tracking for a socket.  Only active when both
 * CONFIG_PROVE_LOCKING and CONFIG_MODULES are enabled; otherwise all
 * three helpers compile to nothing.
 */
#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
/* Take a reference on @owner and record it in sk->sk_owner. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
        __module_get(owner);
        sk->sk_owner = owner;
}

/* Forget the recorded owner; the module reference is NOT dropped here. */
static inline void sk_owner_clear(struct sock *sk)
{
        sk->sk_owner = NULL;
}

/* Drop the module reference held on sk->sk_owner. */
static inline void sk_owner_put(struct sock *sk)
{
        module_put(sk->sk_owner);
}
#else
/* Stubs: no owner tracking in this configuration. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
}

static inline void sk_owner_clear(struct sock *sk)
{
}

static inline void sk_owner_put(struct sock *sk)
{
}
#endif
/*
 * Initialise the lock of @sk (owner, wait queue, spinlock) and register
 * both parts with lockdep.
 *
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
 *
 * @sk:    socket whose sk_lock is initialised (evaluated several times,
 *         so it must be free of side effects)
 * @sname: lockdep name for sk_lock.slock
 * @skey:  lockdep class key for sk_lock.slock
 * @name:  lockdep name for sk_lock
 * @key:   lockdep class key for sk_lock
 *
 * Every dereference of @sk is parenthesised so that any pointer-valued
 * expression can be passed as the argument.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)        \
do {                                                                        \
        sk_owner_set(sk, THIS_MODULE);                                        \
        (sk)->sk_lock.owned = 0;                                        \
        init_waitqueue_head(&(sk)->sk_lock.wq);                                \
        spin_lock_init(&(sk)->sk_lock.slock);                                \
        debug_check_no_locks_freed((void *)&(sk)->sk_lock,                \
                                   sizeof((sk)->sk_lock));                \
        lockdep_set_class_and_name(&(sk)->sk_lock.slock,                \
                                   (skey), (sname));                        \
        lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);        \
} while (0)

#ifdef CONFIG_LOCKDEP
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
        return lockdep_is_held(&sk->sk_lock) ||
               lockdep_is_held(&sk->sk_lock.slock);
}
#endif

void lock_sock_nested(struct sock *sk, int subclass);

/* Lock @sk through lock_sock_nested() with subclass 0. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}

void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/* BH context may only use the following locking interface.
 *
 * All three operate directly on the sk_lock.slock spinlock of @__sk;
 * the _nested variant passes SINGLE_DEPTH_NESTING to spin_lock_nested().
 */
#define bh_lock_sock(__sk)        spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
                                spin_lock_nested(&((__sk)->sk_lock.slock), \
                                SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)        spin_unlock(&((__sk)->sk_lock.slock))

bool lock_sock_fast(struct sock *sk);
/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode
 *
 * fast unlock socket for user context.
 * If slow mode is on, we call regular release_sock()
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
{
        /* Fast mode: only the slock spinlock has to be dropped. */
        if (!slow) {
                spin_unlock_bh(&sk->sk_lock.slock);
                return;
        }

        /* Slow mode: go through the regular release path. */
        release_sock(sk);
}

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also an exclusive sleep lock serializing
 * accesses from user process context.
 */

/*
 * Debug assertion: warn once if lockdep says the socket lock of @sk is
 * not held (only while debug_locks is set).  Compiles to nothing
 * without CONFIG_LOCKDEP.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        bool held = lockdep_sock_is_held(sk);

        WARN_ON_ONCE(!held && debug_locks);
#endif
}

/*
 * Debug assertion: warn once if lockdep says the socket lock of @sk is
 * held (only while debug_locks is set).  Compiles to nothing without
 * CONFIG_LOCKDEP.
 */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        bool held = lockdep_sock_is_held(sk);

        WARN_ON_ONCE(held && debug_locks);
#endif
}

/*
 * Report the "owned" state of @sk's lock, after asserting through
 * sock_owned_by_me() that the caller holds the socket lock.
 */
static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);

        return sk->sk_lock.owned != 0;
}

/* Same as sock_owned_by_user() but without the lockdep assertion. */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
        return sk->sk_lock.owned != 0;
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
        struct sock *sk = (struct sock *)csk;

        return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
}

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);

struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif

int sock_setsockopt(struct socket *sock, int level, int op,
                    sockptr_t optval, unsigned int optlen);

int sock_getsockopt(struct socket *sock, int level, int op,
                    char __user *optval, int __user *optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order);
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

/* Per-call control-message state; see sockcm_init() for the defaults
 * and the sock_cmsg_send() prototypes that take it.
 */
struct sockcm_cookie {
        u64 transmit_time;        /* zeroed by sockcm_init() */
        u32 mark;                /* zeroed by sockcm_init() */
        u16 tsflags;                /* seeded from sk->sk_tsflags by sockcm_init() */
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
                               const struct sock *sk)
{
        *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags };
}

int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, int, bool);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
                         size_t size, int flags);
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
                                int offset, size_t size, int flags);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen);

void sk_common_release(struct sock *sk);

/*
 *        Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/* Ungrab socket and destroy it, if it was the last reference:
 * drops one sk_refcnt reference and calls sk_free() when it hits zero.
 */
static inline void sock_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 */
void sock_gen_put(struct sock *sk);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
                     unsigned int trim_cap, bool refcounted);
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                                 const int nested)
{
        return __sk_receive_skb(sk, skb, nested, 1, true);
}

/*
 * Record @tx_queue as the tx queue mapping of @sk.
 *
 * Values that cannot be stored are rejected with a one-time warning and
 * leave the current mapping untouched.
 */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
        /* sk_tx_queue_mapping accepts only up to a 16-bit value, and
         * USHRT_MAX itself is the "no mapping" marker.  Compare as
         * unsigned int rather than unsigned short: a truncating cast
         * would let values above 16 bits through and store them
         * truncated.  Negative values are rejected by the same test.
         */
        if (WARN_ON_ONCE((unsigned int)tx_queue >= USHRT_MAX))
                return;
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/* Marker stored in sk_tx_queue_mapping / sk_rx_queue_mapping when no
 * queue is recorded; the sk_*_queue_get() helpers translate it to -1.
 */
#define NO_QUEUE_MAPPING        USHRT_MAX

/* Forget the tx queue mapping of @sk by storing NO_QUEUE_MAPPING. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
        const unsigned short unmapped = NO_QUEUE_MAPPING;

        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, unmapped);
}

/*
 * Return the tx queue mapping recorded on @sk, or -1 when @sk is NULL
 * or no mapping is recorded.
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
        int mapping;

        if (!sk)
                return -1;

        /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
         * and sk_tx_queue_set().
         */
        mapping = READ_ONCE(sk->sk_tx_queue_mapping);

        return mapping == NO_QUEUE_MAPPING ? -1 : mapping;
}

/*
 * Copy the rx queue recorded on @skb, if any, into @sk's rx queue
 * mapping.  A recorded queue equal to NO_QUEUE_MAPPING triggers a
 * one-time warning and is ignored.  No-op without CONFIG_XPS.
 */
static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        u16 rx_queue;

        if (!skb_rx_queue_recorded(skb))
                return;

        rx_queue = skb_get_rx_queue(skb);
        if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
                return;

        sk->sk_rx_queue_mapping = rx_queue;
#endif
}

/*
 * Forget the rx queue mapping of @sk by storing NO_QUEUE_MAPPING.
 * No-op without CONFIG_XPS.
 */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_XPS
        const unsigned short unmapped = NO_QUEUE_MAPPING;

        sk->sk_rx_queue_mapping = unmapped;
#endif
}

#ifdef CONFIG_XPS
/*
 * Return the rx queue mapping recorded on @sk, or -1 when @sk is NULL
 * or no mapping is recorded.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
        if (sk) {
                /* Take a single snapshot of the field: reading it once
                 * for the comparison and again for the return value
                 * would allow a concurrent update in between to make
                 * this function return NO_QUEUE_MAPPING instead of -1.
                 */
                int val = READ_ONCE(sk->sk_rx_queue_mapping);

                if (val != NO_QUEUE_MAPPING)
                        return val;
        }

        return -1;
}
#endif

/* Point @sk->sk_socket at @sock (which may be NULL, see sock_orphan()). */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
        sk->sk_socket = sock;
}

/* Return the wait queue head embedded in @sk's socket_wq. */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
        /* "wait" must be the first member of struct socket_wq, so the
         * address computed below equals the sk_wq pointer itself.
         */
        BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
        return &rcu_dereference_raw(sk->sk_wq)->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 */
static inline void sock_orphan(struct sock *sk)
{
        /* All three updates happen under sk_callback_lock, write side,
         * with bottom halves disabled.
         */
        write_lock_bh(&sk->sk_callback_lock);
        sock_set_flag(sk, SOCK_DEAD);
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

/*
 * Attach @sk to the struct socket @parent: publish the parent's wait
 * queue, link both objects to each other, take the uid from the
 * parent's inode and notify the security layer.  Warns if @parent
 * already has a sock attached.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
        WARN_ON(parent->sk);
        /* Everything below runs under sk_callback_lock, write side. */
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        parent->sk = sk;
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
}

kuid_t sock_i_uid(struct sock *sk);
unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);

/*
 * Return the uid of @sk, or, when @sk is NULL, uid 0 mapped through
 * the user namespace of @net.
 */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
        if (sk)
                return sk->sk_uid;

        return make_kuid(net->user_ns, 0);
}

/* Return a random 32-bit value that is never zero (zero is mapped to 1). */
static inline u32 net_tx_rndhash(void)
{
        u32 hash = prandom_u32();

        if (!hash)
                hash = 1;

        return hash;
}

static inline void sk_set_txhash(struct sock *sk)
{
        /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
        WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/*
 * Pick a new tx hash for @sk, but only if it already has one.
 * Returns true when the hash was replaced, false when sk_txhash is 0.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
        if (!sk->sk_txhash)
                return false;

        sk_set_txhash(sk);
        return true;
}

/*
 * Return the cached dst of @sk without taking a reference.
 * The RCU dereference is validated against lockdep_sock_is_held(), i.e.
 * the caller is expected to hold the socket lock or be in an RCU
 * read-side section.
 */
static inline struct dst_entry *
__sk_dst_get(struct sock *sk)
{
        return rcu_dereference_check(sk->sk_dst_cache,
                                     lockdep_sock_is_held(sk));
}

/*
 * Return the cached dst of @sk with a reference taken on it, or NULL
 * when there is no cached dst or its refcount has already dropped to 0.
 */
static inline struct dst_entry *
sk_dst_get(struct sock *sk)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(sk->sk_dst_cache);
        /* Only hand out the entry if its refcount could be raised from non-zero. */
        if (dst && !atomic_inc_not_zero(&dst->__refcnt))
                dst = NULL;
        rcu_read_unlock();
        return dst;
}

/*
 * Invoke the negative_advice() callback of @sk's cached dst, when both
 * the dst and the callback exist.
 */
static inline void __dst_negative_advice(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (!dst || !dst->ops->negative_advice)
                return;

        dst->ops->negative_advice(sk, dst);
}

/*
 * Re-roll the tx hash of @sk (if it has one) and then pass negative
 * advice to its cached dst.  The result of sk_rethink_txhash() is
 * ignored.
 */
static inline void dst_negative_advice(struct sock *sk)
{
        sk_rethink_txhash(sk);
        __dst_negative_advice(sk);
}

/*
 * Replace the cached dst of @sk with @dst and release the old one.
 * Also clears the tx queue mapping and the pending-confirm flag.
 * The old pointer is fetched with rcu_dereference_protected() against
 * lockdep_sock_is_held(), so the caller is expected to hold the socket
 * lock; see sk_dst_set() for the xchg()-based variant.
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = rcu_dereference_protected(sk->sk_dst_cache,
                                            lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/*
 * Replace the cached dst of @sk with @dst and release the old one.
 * Also clears the tx queue mapping and the pending-confirm flag.
 * Unlike __sk_dst_set(), the pointer swap is done with xchg() and no
 * lockdep condition is checked.
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/* Drop the cached dst of @sk via __sk_dst_set(sk, NULL). */
static inline void
__sk_dst_reset(struct sock *sk)
{
        __sk_dst_set(sk, NULL);
}

/* Drop the cached dst of @sk via sk_dst_set(sk, NULL). */
static inline void
sk_dst_reset(struct sock *sk)
{
        sk_dst_set(sk, NULL);
}

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/*
 * Set the pending-confirm flag on @sk.  The flag is only written when
 * it is currently clear.
 */
static inline void sk_dst_confirm(struct sock *sk)
{
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                return;

        WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/*
 * If @skb carries a pending dst confirmation, stamp neighbour @n as
 * confirmed at the current jiffies and clear the pending-confirm flag
 * of the skb's socket, if it has one.
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
        struct sock *sk;
        unsigned long now;

        if (!skb_get_dst_pending_confirm(skb))
                return;

        sk = skb->sk;
        now = jiffies;

        /* avoid dirtying neighbour */
        if (READ_ONCE(n->confirmed) != now)
                WRITE_ONCE(n->confirmed, now);

        /* Likewise, only write the socket flag when it is actually set. */
        if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
                WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
}

bool sk_mc_loop(struct sock *sk);

/* Ask net_gso_ok() whether @sk's route capabilities cover its GSO type. */
static inline bool sk_can_gso(const struct sock *sk)
{
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/*
 * Add @flags to the set of features @sk must not use and remove them
 * from its current route capabilities.
 */
static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
{
        sk->sk_route_nocaps = sk->sk_route_nocaps | flags;
        sk->sk_route_caps = sk->sk_route_caps & ~flags;
}

/*
 * Copy @copy bytes from the iterator @from to @to on behalf of @skb.
 *
 * Three strategies, tried in this order:
 *  - skb->ip_summed == CHECKSUM_NONE: copy and checksum in one pass,
 *    folding the partial sum into skb->csum at @offset;
 *  - route has NETIF_F_NOCACHE_COPY: cache-bypassing copy;
 *  - otherwise: plain copy.
 *
 * Returns 0 on success or -EFAULT when the full copy could not be done.
 */
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
                                           struct iov_iter *from, char *to,
                                           int copy, int offset)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum partial = 0;

                if (!csum_and_copy_from_iter_full(to, copy, &partial, from))
                        return -EFAULT;

                skb->csum = csum_block_add(skb->csum, partial, offset);
                return 0;
        }

        if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
                return copy_from_iter_full_nocache(to, copy, from) ? 0 : -EFAULT;

        return copy_from_iter_full(to, copy, from) ? 0 : -EFAULT;
}

/*
 * Append @copy bytes from @from to the linear area of @skb.
 * On failure the skb is trimmed back to its previous length.
 * Returns 0 or the error from skb_do_copy_data_nocache().
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
                                       struct iov_iter *from, int copy)
{
        /* Length before skb_put() extends the skb; also the csum offset. */
        const int start = skb->len;
        int err;

        err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy),
                                       copy, start);
        if (err)
                __skb_trim(skb, start);

        return err;
}

/*
 * Copy @copy bytes from @from into @page at offset @off and, on
 * success, account them on @skb (len, data_len, truesize) and on @sk
 * (queued write memory and forward allocation).
 * Returns 0 or the error from skb_do_copy_data_nocache(); nothing is
 * accounted on failure.
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
                                           struct sk_buff *skb,
                                           struct page *page,
                                           int off, int copy)
{
        int err = skb_do_copy_data_nocache(sk, skb, from,
                                           page_address(page) + off,
                                           copy, skb->len);

        if (!err) {
                skb->len += copy;
                skb->data_len += copy;
                skb->truesize += copy;
                sk_wmem_queued_add(sk, copy);
                sk_mem_charge(sk, copy);
        }

        return err;
}

/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
 *
 * Return: sk_wmem_alloc minus initial offset of one
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        /* sk_wmem_alloc carries an initial offset of one; strip it. */
        unsigned int biased = refcount_read(&sk->sk_wmem_alloc);

        return biased - 1;
}

/**
 * sk_rmem_alloc_get - returns read allocations
 * @sk: socket
 *
 * Return: sk_rmem_alloc
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
        int allocated = atomic_read(&sk->sk_rmem_alloc);

        return allocated;
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
        return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq
 *
 * Return: true if socket_wq has waiting processes
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1
 * could then end up calling schedule and sleep forever if there is no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
        return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - place memory barrier behind the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
        /* Nothing to register, hence no barrier needed either. */
        if (poll_does_not_wait(p))
                return;

        poll_wait(filp, &sock->wq.wait, p);

        /* We need to be sure we are in sync with the
         * socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         */
        smp_mb();
}

/*
 * Copy @sk's tx hash into @skb and mark it as an L4 hash.
 * Leaves @skb untouched when the socket has no tx hash (0).
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
        /* This pairs with WRITE_ONCE() in sk_set_txhash() */
        u32 hash = READ_ONCE(sk->sk_txhash);

        if (!hash)
                return;

        skb->l4_hash = 1;
        skb->hash = hash;
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *        Queue a received datagram if it will fit. Stream and sequenced
 *        protocols can't normally use this as they need to fit buffers in
 *        and play with them.
 *
 *        Inlined as it's very short and called for pretty much every
 *        packet ever received.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        /* Make @sk the receive-side owner of @skb: detach any previous
         * owner, install sock_rfree() as destructor, then account the
         * skb's truesize in sk_rmem_alloc and against the forward
         * allocation.
         *
         * NOTE(review): the comment block above this function talks
         * about queueing a datagram, which this function does not do;
         * it looks like it belongs to a queueing helper - confirm.
         */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/*
 * Try to make @sk the owner of @skb, holding a socket reference that
 * sock_efree() is installed to drop.
 * Returns false, leaving @skb untouched, when @sk is NULL or its
 * refcount has already reached zero.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return false;

        skb_orphan(skb);
        skb->destructor = sock_efree;
        skb->sk = sk;

        return true;
}

/*
 * Clone @skb (atomic allocation, adjusted by sk_gfp_mask()) and charge
 * the clone to the receive memory of @sk.
 * Returns the clone owned by @sk, or NULL when either the clone or the
 * memory scheduling failed; the original @skb is not consumed.
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *clone = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));

        if (!clone)
                return NULL;

        if (!sk_rmem_schedule(sk, clone, clone->truesize)) {
                __kfree_skb(clone);
                return NULL;
        }

        skb_set_owner_r(clone, sk);
        return clone;
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *        Recover an error report and clear atomically
 */

static inline int sock_error(struct sock *sk)
{
        /* Avoid an atomic operation for the common case.
         * This is racy since another cpu/thread can change sk_err under us.
         */
        if (likely(data_race(!sk->sk_err)))
                return 0;

        /* Fetch and clear in one step; the stored value is returned negated. */
        return -xchg(&sk->sk_err, 0);
}

/*
 * Return sk_sndbuf minus sk_wmem_alloc, clamped at zero.
 * Always 0 once the send side has been shut down.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
        int room;

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                return 0;

        room = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);

        return room < 0 ? 0 : room;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        set_bit(nr, &sk->sk_wq_raw->flags);
}

static inline void sk_clear_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        clear_bit(nr, &sk->sk_wq_raw->flags);
}

/*
 * Forward an async wakeup (@how, @band) to sock_wake_async() for
 * sockets with SOCK_FASYNC set; other sockets are left alone.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
        if (!sock_flag(sk, SOCK_FASYNC))
                return;

        /* sk_wq is RCU protected, hence the read-side section. */
        rcu_read_lock();
        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
        rcu_read_unlock();
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum.
 *
 * Hence SOCK_MIN_SNDBUF is twice TCP_SKB_MIN_TRUESIZE while
 * SOCK_MIN_RCVBUF is exactly one.
 */
#define TCP_SKB_MIN_TRUESIZE        (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

#define SOCK_MIN_SNDBUF                (TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF                 TCP_SKB_MIN_TRUESIZE

/*
 * Shrink sk_sndbuf to half of the currently queued write memory, but
 * never below SOCK_MIN_SNDBUF and never above its current value.
 * Does nothing when SOCK_SNDBUF_LOCK is set in sk_userlocks.
 */
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
        u32 target;

        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return;

        target = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
        target = max_t(u32, target, SOCK_MIN_SNDBUF);

        WRITE_ONCE(sk->sk_sndbuf, target);
}

/* Out-of-line helper; the definition is not part of this header.
 * NOTE(review): presumably allocates an skb of @size for a stream socket
 * using @gfp - confirm against the implementation.
 */
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
                                    bool force_schedule);

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
 * Both direct reclaim and page faults can nest inside other
 * socket operations and end up recursing into sk_page_frag()
 * while it's already in use: explicitly avoid task page_frag
 * usage if the caller is potentially doing any of them.
 * This assumes that page fault handlers use the GFP_NOFS flags.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
            (__GFP_DIRECT_RECLAIM | __GFP_FS))
                return &current->task_frag;

        return &sk->sk_frag;
}

/* Out-of-line helper; the definition is not part of this header.
 * NOTE(review): presumably makes sure @pfrag has usable space and returns
 * false on failure - confirm against the implementation.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *        Default write policy as shown to user space via poll/select/SIGIO:
 *        writeable while sk_wmem_alloc is below half of sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
        return (READ_ONCE(sk->sk_sndbuf) >> 1) >
               refcount_read(&sk->sk_wmem_alloc);
}

/* Allocation flags for the current context: GFP_ATOMIC when in_softirq()
 * reports true, GFP_KERNEL otherwise.
 */
static inline gfp_t gfp_any(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Receive timeout to use: 0 for a non-blocking call, sk_rcvtimeo otherwise. */
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_rcvtimeo;
}

/* Send timeout to use: 0 for a non-blocking call, sk_sndtimeo otherwise. */
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_sndtimeo;
}

/* Number of bytes a receive should wait for: @len when @waitall is set,
 * otherwise the smaller of sk_rcvlowat and @len.  A result of zero is
 * replaced by 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
        int target = len;

        if (!waitall)
                target = min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

        return target ? target : 1;
}

/* Alas, socket operations with a timeout are not restartable (compare this
 * to poll()): only an unlimited wait (MAX_SCHEDULE_TIMEOUT) maps to
 * -ERESTARTSYS, every other timeout maps to -EINTR.
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

/* Socket-layer data kept in skb->cb[]; accessed through SOCK_SKB_CB(). */
struct sock_skb_cb {
        /* written by sock_skb_set_dropcount(): sk_drops or 0 */
        u32 dropcount;
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
/* Byte offset of struct sock_skb_cb inside skb->cb[]. */
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
                            sizeof(struct sock_skb_cb)))

/* Pointer to the struct sock_skb_cb stored in the tail of @__skb's cb[]. */
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
                            SOCK_SKB_CB_OFFSET))

/* Build-time check that a cb[] user of @size bytes stays below the offset. */
#define sock_skb_cb_check_size(size) \
        BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
        SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
                                                atomic_read(&sk->sk_drops) : 0;
}

static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
{
        int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        atomic_add(segs, &sk->sk_drops);
}

/* Read sk->sk_stamp.  When BITS_PER_LONG is 32 the value is read under the
 * sk_stamp_seq seqlock and the read is retried until it is consistent;
 * otherwise a single READ_ONCE() is used.
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG != 32
        return READ_ONCE(sk->sk_stamp);
#else
        ktime_t stamp;
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&sk->sk_stamp_seq);
                stamp = sk->sk_stamp;
                if (!read_seqretry(&sk->sk_stamp_seq, seq))
                        break;
        }

        return stamp;
#endif
}

/* Store @kt in sk->sk_stamp.  When BITS_PER_LONG is 32 the store is done
 * under the sk_stamp_seq write seqlock; otherwise a single WRITE_ONCE() is
 * used.
 */
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG != 32
        WRITE_ONCE(sk->sk_stamp, kt);
#else
        write_seqlock(&sk->sk_stamp_seq);
        sk->sk_stamp = kt;
        write_sequnlock(&sk->sk_stamp_seq);
#endif
}

/* Out-of-line slow paths of sock_recv_timestamp(); defined elsewhere.
 * __sock_recv_timestamp() is called when a timestamp control message is
 * wanted, __sock_recv_wifi_status() when SOCK_WIFI_STATUS is set and the
 * skb has wifi_acked_valid.
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb);

static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
        ktime_t kt = skb->tstamp;
        struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);

        /*
         * generate control messages if
         * - receive time stamping in software requested
         * - software time stamp available and wanted
         * - hardware time stamps available and wanted
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
            (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
            (hwtstamps->hwtstamp &&
             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else
                sock_write_timestamp(sk, kt);

        if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
                __sock_recv_wifi_status(msg, sk, skb);
}

/* Out-of-line slow path of sock_recv_ts_and_drops(); defined elsewhere. */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
                              struct sk_buff *skb);

/* sk_stamp value that sock_recv_ts_and_drops() treats as "default" and
 * resets to 0.
 */
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)

/* Receive-path helper.  Takes the out-of-line slow path when any of the
 * FLAGS_TS_OR_DROPS socket flags or TSFLAGS_ANY timestamping flags is set.
 * Otherwise it stores the skb timestamp when SOCK_TIMESTAMP is set, or
 * resets a still-default sk_stamp to 0.
 */
static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
                                          struct sk_buff *skb)
{
#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
                           (1UL << SOCK_RCVTSTAMP))
#define TSFLAGS_ANY       (SOF_TIMESTAMPING_SOFTWARE | \
                           SOF_TIMESTAMPING_RAW_HARDWARE)

        if ((sk->sk_flags & FLAGS_TS_OR_DROPS) ||
            (sk->sk_tsflags & TSFLAGS_ANY)) {
                __sock_recv_ts_and_drops(msg, sk, skb);
                return;
        }

        if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) {
                sock_write_timestamp(sk, skb->tstamp);
                return;
        }

        if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
                sock_write_timestamp(sk, 0);
}

/* Out-of-line part of _sock_tx_timestamp(), called only when @tsflags is
 * non-zero; defined elsewhere.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk:         socket sending this packet
 * @tsflags:    timestamping flags to use
 * @tx_flags:   completed with instructions for time stamping
 * @tskey:      filled in with next sk_tskey (not for TCP, which uses seqno);
 *              may be NULL, in which case no key is assigned
 *
 * A key is taken from sk->sk_tskey (post-incrementing it) only when
 * @tsflags has SOF_TIMESTAMPING_OPT_ID and at least one bit of
 * SOF_TIMESTAMPING_TX_RECORD_MASK set.  SKBTX_WIFI_STATUS is OR-ed into
 * ``*tx_flags`` when the socket has SOCK_WIFI_STATUS set.
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                      __u8 *tx_flags, __u32 *tskey)
{
        if (unlikely(tsflags)) {
                bool wants_id = (tsflags & SOF_TIMESTAMPING_OPT_ID) &&
                                (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK);

                __sock_tx_timestamp(tsflags, tx_flags);
                if (wants_id && tskey)
                        *tskey = sk->sk_tskey++;
        }

        if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
                *tx_flags |= SKBTX_WIFI_STATUS;
}

/* Variant of _sock_tx_timestamp() for callers that do not want a timestamp
 * key: passes NULL as @tskey.
 */
static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                     __u8 *tx_flags)
{
        _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
}

/* Apply _sock_tx_timestamp() to @skb: uses skb->sk as the socket and the
 * tx_flags / tskey fields of the skb's shared info as outputs.
 */
static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
{
        _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
                           &skb_shinfo(skb)->tskey);
}

DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
 *
 * The skb is unlinked from sk_receive_queue.  When the
 * tcp_rx_skb_cache_key static branch is enabled and the socket's
 * sk_rx_skb_cache slot is empty, the skb is orphaned and parked in that
 * slot; otherwise it is freed.
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
 */
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
        __skb_unlink(skb, &sk->sk_receive_queue);

        if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
            !sk->sk_rx_skb_cache) {
                sk->sk_rx_skb_cache = skb;
                skb_orphan(skb);
        } else {
                __kfree_skb(skb);
        }
}

/* Network namespace of @sk, read from sk->sk_net via read_pnet(). */
static inline
struct net *sock_net(const struct sock *sk)
{
        struct net *net = read_pnet(&sk->sk_net);

        return net;
}

/* Store @net as the network namespace of @sk via write_pnet(). */
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
        write_pnet(&sk->sk_net, net);
}

/* True when @skb's destructor is sock_pfree.  Always false when the kernel
 * is built without CONFIG_INET.
 */
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
        bool prefetched = false;

#ifdef CONFIG_INET
        prefetched = (skb->destructor == sock_pfree);
#endif /* CONFIG_INET */

        return prefetched;
}

/* This helper checks if a socket is a full socket,
 * ie _not_ a timewait or request socket: the state bit of sk_state must
 * be outside TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV.
 */
static inline bool sk_fullsock(const struct sock *sk)
{
        const int state_bit = 1 << sk->sk_state;
        const int mini_states = TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV;

        return (state_bit & ~mini_states) != 0;
}

static inline bool
sk_is_refcounted(struct sock *sk)
{
        /* Only full sockets have sk->sk_flags. */
        return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}

/**
 * skb_steal_sock - steal a socket from an sk_buff
 * @skb: sk_buff to steal the socket from
 * @refcounted: is set to true if the socket is reference-counted
 *
 * Detaches the socket from @skb by clearing skb->sk and skb->destructor.
 * For a prefetched socket @refcounted comes from sk_is_refcounted();
 * for any other attached socket it is true.
 *
 * Return: the socket that was attached, or NULL (with @refcounted set to
 * false) when @skb had none.
 */
static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
        struct sock *stolen = skb->sk;

        if (!stolen) {
                *refcounted = false;
                return NULL;
        }

        *refcounted = skb_sk_is_prefetched(skb) ? sk_is_refcounted(stolen)
                                                : true;
        skb->destructor = NULL;
        skb->sk = NULL;

        return stolen;
}

/* Checks if this SKB belongs to an HW offloaded socket
 * and whether any SW fallbacks are required based on dev.
 * Check decrypted mark in case skb_orphan() cleared socket.
 *
 * Returns the skb to transmit: @skb itself, whatever the socket's
 * sk_validate_xmit_skb() callback returns, or NULL when the skb was
 * dropped here.  Without CONFIG_SOCK_VALIDATE_XMIT this just returns @skb.
 */
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
                                                   struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sock *sk = skb->sk;

        /* only full sockets with a callback installed get to validate */
        if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
                skb = sk->sk_validate_xmit_skb(sk, dev, skb);
#ifdef CONFIG_TLS_DEVICE
        /* no usable callback, but the skb is marked decrypted: drop it */
        } else if (unlikely(skb->decrypted)) {
                pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
                kfree_skb(skb);
                skb = NULL;
#endif
        }
#endif

        return skb;
}

/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
 * SYNACK messages can be attached to either ones (depending on SYNCOOKIE)
 */
static inline bool sk_listener(const struct sock *sk)
{
        const int state_bit = 1 << sk->sk_state;
        const int listen_states = TCPF_LISTEN | TCPF_NEW_SYN_RECV;

        return (state_bit & listen_states) != 0;
}

/* Out-of-line socket helpers; the definitions are not part of this header. */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
                       int type);

/* Capability checks made on behalf of a socket.
 * NOTE(review): exact semantics live in the definitions - confirm there.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);

/* NOTE(review): presumably fills the @meminfo array with the socket's memory
 * accounting values - confirm the array layout against the definition.
 */
void sk_get_meminfo(const struct sock *sk, u32 *meminfo);

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both maxima are the truesize of a 256-byte skb times 256 packets.
 */
#define _SK_MEM_PACKETS                256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Socket buffer sysctl variables; defined elsewhere.
 * NOTE(review): names suggest the maximum/default send and receive buffer
 * sizes - confirm units and users at the definitions.
 */
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern int sysctl_tstamp_allow_data;
extern int sysctl_optmem_max;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;

/* Page order needed for a 32768-byte fragment allocation. */
#define SKB_FRAG_PAGE_ORDER        get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/* First sysctl_wmem value for @proto.  A non-zero sysctl_wmem_offset means
 * the value lives at that byte offset inside the socket's struct net;
 * otherwise it is read through proto->sysctl_wmem.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
        void *netns;

        /* No per netns sysctl_wmem for this proto: use the global one. */
        if (!proto->sysctl_wmem_offset)
                return READ_ONCE(*proto->sysctl_wmem);

        netns = sock_net(sk);
        return READ_ONCE(*(int *)(netns + proto->sysctl_wmem_offset));
}

/* First sysctl_rmem value for @proto.  A non-zero sysctl_rmem_offset means
 * the value lives at that byte offset inside the socket's struct net;
 * otherwise it is read through proto->sysctl_rmem.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
        void *netns;

        /* No per netns sysctl_rmem for this proto: use the global one. */
        if (!proto->sysctl_rmem_offset)
                return READ_ONCE(*proto->sysctl_rmem);

        netns = sock_net(sk);
        return READ_ONCE(*(int *)(netns + proto->sysctl_rmem_offset));
}

/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
 * Some wifi drivers need to tweak it to get more chunks.
 * They can use this helper from their ndo_start_xmit()
 *
 * Nothing is written for a NULL socket, a non-full socket, or when
 * sk_pacing_shift already equals @val.
 */
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
        if (sk && sk_fullsock(sk) && READ_ONCE(sk->sk_pacing_shift) != val)
                WRITE_ONCE(sk->sk_pacing_shift, val);
}

/* If a socket is bound to a device, check that the given device
 * index is either the same, or that the socket is bound to an L3
 * master device and the given device index is also enslaved to
 * that L3 master.  An unbound socket matches every device.
 */
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
        int master;

        if (!sk->sk_bound_dev_if)
                return true;
        if (sk->sk_bound_dev_if == dif)
                return true;

        master = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
        return master && master == sk->sk_bound_dev_if;
}

/* Out-of-line socket helpers; the definitions are not part of this header. */
void sock_def_readable(struct sock *sk);

/* In-kernel setters for socket options.
 * NOTE(review): behaviour is inferred from the names only - confirm each
 * against its definition before relying on it.
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_enable_timestamps(struct sock *sk);
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

/* Ask the socket's protocol whether it is readable.  Returns false when the
 * protocol provides no sock_is_readable callback.
 */
static inline bool sk_is_readable(struct sock *sk)
{
        const struct proto *ops = READ_ONCE(sk->sk_prot);

        return ops->sock_is_readable ? ops->sock_is_readable(sk) : false;
}
#endif        /* _SOCK_H */













































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *        Routines to manage notifier chains for passing status changes to any
 *        interested routines. We need this instead of hard coded call lists so
 *        that modules can poke their nose into the innards. The network devices
 *        needed them so here they are for the rest of you.
 *
 *                                Alan Cox <Alan.Cox@linux.org>
 */
 
#ifndef _LINUX_NOTIFIER_H
#define _LINUX_NOTIFIER_H
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/srcu.h>

/*
 * Notifier chains are of four types:
 *
 *        Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *                context. Callouts are not allowed to block.
 *        Blocking notifier chains: Chain callbacks run in process context.
 *                Callouts are allowed to block.
 *        Raw notifier chains: There are no restrictions on callbacks,
 *                registration, or unregistration.  All locking and protection
 *                must be provided by the caller.
 *        SRCU notifier chains: A variant of blocking notifier chains, with
 *                the same restrictions.
 *
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context.  Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links.  This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed.
 */

struct notifier_block;

/* Callback type stored in a notifier_block: receives the block itself, an
 * action code and an opaque data pointer, and returns an int (see the
 * NOTIFY_* values below).
 */
typedef        int (*notifier_fn_t)(struct notifier_block *nb,
                        unsigned long action, void *data);

/* One entry of a notifier chain. */
struct notifier_block {
        notifier_fn_t notifier_call;        /* callback for this entry */
        struct notifier_block __rcu *next;        /* next entry in the chain */
        int priority;        /* NOTE(review): presumably orders entries - confirm */
};

/* Head of an atomic notifier chain: a spinlock plus the first entry. */
struct atomic_notifier_head {
        spinlock_t lock;
        struct notifier_block __rcu *head;
};

/* Head of a blocking notifier chain: an rw-semaphore plus the first entry. */
struct blocking_notifier_head {
        struct rw_semaphore rwsem;
        struct notifier_block __rcu *head;
};

/* Head of a raw notifier chain: only the first entry, no lock of its own. */
struct raw_notifier_head {
        struct notifier_block __rcu *head;
};

/* Head of an SRCU notifier chain: a mutex, an srcu_struct and the first
 * entry.
 */
struct srcu_notifier_head {
        struct mutex mutex;
        struct srcu_struct srcu;
        struct notifier_block __rcu *head;
};

/* Run-time initialisers: @name is a pointer to the head.  Each one sets up
 * the head's lock (if it has one) and empties the chain.
 */
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {        \
                spin_lock_init(&(name)->lock);        \
                (name)->head = NULL;                \
        } while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {        \
                init_rwsem(&(name)->rwsem);        \
                (name)->head = NULL;                \
        } while (0)
#define RAW_INIT_NOTIFIER_HEAD(name) do {        \
                (name)->head = NULL;                \
        } while (0)

/* srcu_notifier_heads must be cleaned up dynamically:
 * srcu_cleanup_notifier_head() calls cleanup_srcu_struct() on the
 * srcu_struct embedded in the head that @name points to.
 */
extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
/* The expansion deliberately carries no trailing semicolon; the caller
 * supplies it, which keeps the macro usable as the only statement of an
 * unbraced if/else.
 */
#define srcu_cleanup_notifier_head(name)        \
                cleanup_srcu_struct(&(name)->srcu)

/* Static initialisers: @name is the head variable itself (not a pointer).
 * Each expands to a brace-enclosed initialiser with an empty chain.
 */
#define ATOMIC_NOTIFIER_INIT(name) {                                \
                .lock = __SPIN_LOCK_UNLOCKED(name.lock),        \
                .head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) {                                \
                .rwsem = __RWSEM_INITIALIZER((name).rwsem),        \
                .head = NULL }
#define RAW_NOTIFIER_INIT(name)        {                                \
                .head = NULL }

/* @pcpu is handed through to __SRCU_STRUCT_INIT() for the embedded
 * srcu_struct.
 */
#define SRCU_NOTIFIER_INIT(name, pcpu)                                \
        {                                                        \
                .mutex = __MUTEX_INITIALIZER(name.mutex),        \
                .head = NULL,                                        \
                .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),        \
        }

/* Define a notifier head variable called @name, statically initialised with
 * the matching *_NOTIFIER_INIT() macro.
 */
#define ATOMIC_NOTIFIER_HEAD(name)                                \
        struct atomic_notifier_head name =                        \
                ATOMIC_NOTIFIER_INIT(name)
#define BLOCKING_NOTIFIER_HEAD(name)                                \
        struct blocking_notifier_head name =                        \
                BLOCKING_NOTIFIER_INIT(name)
#define RAW_NOTIFIER_HEAD(name)                                        \
        struct raw_notifier_head name =                                \
                RAW_NOTIFIER_INIT(name)

/* Define an SRCU notifier head variable called @name; @mod is an optional
 * storage-class prefix.  With CONFIG_TREE_SRCU a per-CPU srcu_data named
 * <name>_head_srcu_data is also defined and passed to the initialiser.
 */
#ifdef CONFIG_TREE_SRCU
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)

#else
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name)

#endif

/* Head with external linkage. */
#define SRCU_NOTIFIER_HEAD(name)                                \
        _SRCU_NOTIFIER_HEAD(name, /* not static */)

/* Head with internal (static) linkage. */
#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
        _SRCU_NOTIFIER_HEAD(name, static)

#ifdef __KERNEL__

/* Add @nb to the chain headed by @nh; one function per chain type.
 * Calling-context rules are given in the comment at the top of this file.
 */
extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

/* Remove @nb from the chain headed by @nh.  Per the comment at the top of
 * this file, the atomic, blocking and SRCU variants must not be called from
 * within the call chain.
 */
extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

/* Invoke the chain headed by @nh with action @val and data pointer @v. */
extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                unsigned long val, void *v);
extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v);
extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v);
extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v);

/* "Robust" variants taking two action values.
 * NOTE(review): presumably @val_up is sent first and @val_down is used to
 * roll back entries already notified if one fails - confirm against the
 * definitions.
 */
extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);
extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);
extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);

/* Return values of notifier callbacks. */
#define NOTIFY_DONE                0x0000                /* Don't care */
#define NOTIFY_OK                0x0001                /* Suits me */
#define NOTIFY_STOP_MASK        0x8000                /* Don't call further */
/* Bad/Veto action */
#define NOTIFY_BAD                (NOTIFY_STOP_MASK|0x0002)
/*
 * Clean way to return from the notifier and stop further calls.
 */
#define NOTIFY_STOP                (NOTIFY_OK|NOTIFY_STOP_MASK)

/*
 * Wrap a (negative) errno value into a notifier return value.
 * Zero maps to NOTIFY_OK; any other value yields NOTIFY_STOP_MASK combined
 * with NOTIFY_OK - err, so in particular -EPERM <=> NOTIFY_BAD.
 */
static inline int notifier_from_errno(int err)
{
        return err ? (NOTIFY_STOP_MASK | (NOTIFY_OK - err)) : NOTIFY_OK;
}

/*
 * Recover the (negative) errno value from a notifier return value.
 * The stop bit is ignored; anything that does not exceed NOTIFY_OK carries
 * no error and yields 0.
 */
static inline int notifier_to_errno(int ret)
{
        int code = ret & ~NOTIFY_STOP_MASK;

        if (code <= NOTIFY_OK)
                return 0;

        return NOTIFY_OK - code;
}

/*
 *        Declared notifiers so far. I can imagine quite a few more chains
 *        over time (eg laptop power reset chains, reboot chain (to clean 
 *        device units up), device [un]mount chain, module load/unload chain,
 *        low memory chain, screenblank chain (for plug in modular screenblankers) 
 *        VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
 */
 
/* CPU notifiers are defined in include/linux/cpu.h. */

/* netdevice notifiers are defined in include/linux/netdevice.h */

/* reboot notifiers are defined in include/linux/reboot.h. */

/* Hibernation and suspend events are defined in include/linux/suspend.h. */

/* Virtual Terminal events are defined in include/linux/vt.h. */

#define NETLINK_URELEASE        0x0001        /* Unicast netlink socket released */

/* Console keyboard events.
 * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and
 * KBD_KEYSYM. */
#define KBD_KEYCODE                0x0001 /* Keyboard keycode, called before any other */
#define KBD_UNBOUND_KEYCODE        0x0002 /* Keyboard keycode which is not bound to any other */
#define KBD_UNICODE                0x0003 /* Keyboard unicode */
#define KBD_KEYSYM                0x0004 /* Keyboard keysym */
#define KBD_POST_KEYSYM                0x0005 /* Called after keyboard keysym interpretation */

/* Blocking notifier chain for reboot events; defined elsewhere. */
extern struct blocking_notifier_head reboot_notifier_list;

#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ICMPV6_H
#define _LINUX_ICMPV6_H

#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <uapi/linux/icmpv6.h>

/* View @skb's transport header as an ICMPv6 header.  No check is made that
 * the header really is ICMPv6.
 */
static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
{
        return (struct icmp6hdr *)skb_transport_header(skb);
}

#include <linux/netdevice.h>

#if IS_ENABLED(CONFIG_IPV6)

/* Function type of an ICMPv6 sender; icmp6_send() below has this type. */
typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
                             const struct in6_addr *force_saddr,
                             const struct inet6_skb_parm *parm);
/* The ICMPv6 sender itself; defined elsewhere. */
void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
                const struct in6_addr *force_saddr,
                const struct inet6_skb_parm *parm);
#if IS_BUILTIN(CONFIG_IPV6)
/* Built-in IPv6: call icmp6_send() directly, with no forced source address. */
static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
                                 const struct inet6_skb_parm *parm)
{
        icmp6_send(skb, type, code, info, NULL, parm);
}
/* Built-in IPv6: there is nothing to register.  The only accepted sender is
 * icmp6_send, which BUILD_BUG_ON() enforces; always returns 0.
 */
static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
{
        BUILD_BUG_ON(fn != icmp6_send);
        return 0;
}
/* Built-in IPv6: there is nothing to unregister.  The only accepted sender
 * is icmp6_send, which BUILD_BUG_ON() enforces; always returns 0.
 */
static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
{
        BUILD_BUG_ON(fn != icmp6_send);
        return 0;
}
#else
/* IPv6 enabled but not built in: the same three entry points are provided
 * out of line; defined elsewhere.
 */
extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
                          const struct inet6_skb_parm *parm);
extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
#endif

/* Send an ICMPv6 message about @skb, passing IP6CB(skb) as the parm
 * argument of __icmpv6_send().
 */
static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
        __icmpv6_send(skb, type, code, info, IP6CB(skb));
}

/* Out-of-line helper; the definition is not part of this header.
 * NOTE(review): meaning of @nhs and @data_len is not visible here - confirm
 * against the definition.
 */
int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
                               unsigned int data_len);

#if IS_ENABLED(CONFIG_NF_NAT)
/* With CONFIG_NF_NAT enabled this is provided out of line; defined elsewhere. */
void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
#else
/* Without CONFIG_NF_NAT: send via __icmpv6_send() using a zero-initialised
 * inet6_skb_parm rather than the skb's own control block.
 */
static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
{
        struct inet6_skb_parm parm = { 0 };
        __icmpv6_send(skb_in, type, code, info, &parm);
}
#endif

#else

/* IPv6 disabled: sending an ICMPv6 message is a no-op. */
static inline void icmpv6_send(struct sk_buff *skb,
                               u8 type, u8 code, __u32 info)
{
}

/* IPv6 disabled: sending an ICMPv6 message is a no-op. */
static inline void icmpv6_ndo_send(struct sk_buff *skb,
                                   u8 type, u8 code, __u32 info)
{
}
#endif

/* ICMPv6 entry points; the definitions are not part of this header.
 * NOTE(review): icmpv6_err_convert() presumably maps an ICMPv6 type/code to
 * an errno stored in *@err - confirm the return value's meaning at the
 * definition.
 */
extern int                                icmpv6_init(void);
extern int                                icmpv6_err_convert(u8 type, u8 code,
                                                           int *err);
extern void                                icmpv6_cleanup(void);
extern void                                icmpv6_param_prob(struct sk_buff *skb,
                                                          u8 code, int pos);

/* Forward declarations so the prototype below needs no further includes. */
struct flowi6;
struct in6_addr;
/* Out-of-line helper; the definition is not part of this header.
 * NOTE(review): presumably fills @fl6 for an ICMPv6 message of @type between
 * @saddr and @daddr on interface @oif - confirm against the definition.
 */
extern void                                icmpv6_flow_init(struct sock *sk,
                                                         struct flowi6 *fl6,
                                                         u8 type,
                                                         const struct in6_addr *saddr,
                                                         const struct in6_addr *daddr,
                                                         int oif);

/* True when @type is one of the ICMPv6 error message types: destination
 * unreachable, packet too big, time exceeded or parameter problem.
 */
static inline bool icmpv6_is_err(int type)
{
        return type == ICMPV6_DEST_UNREACH ||
               type == ICMPV6_PKT_TOOBIG ||
               type == ICMPV6_TIME_EXCEED ||
               type == ICMPV6_PARAMPROB;
}

#endif































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *        Srikar Dronamraju
 *        Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>        /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h>                /* anon_vma_prepare */
#include <linux/mmu_notifier.h>        /* set_pte_at_notify */
#include <linux/swap.h>                /* try_to_free_swap */
#include <linux/ptrace.h>        /* user_enable_single_step */
#include <linux/kdebug.h>        /* notifier mechanism */
#include "../../mm/internal.h"        /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>

#include <linux/uprobes.h>

/*
 * Number of slots of UPROBE_XOL_SLOT_BYTES bytes each that fit in one page;
 * also used as the maximum number of XOL slots.
 */
#define UINSNS_PER_PAGE                        (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS                UINSNS_PER_PAGE

/*
 * Tree of all uprobes.  Nodes are ordered by match_uprobe(): first by inode
 * pointer, then by offset.
 */
static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()        RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_SPINLOCK(uprobes_treelock);        /* serialize rbtree access */

/*
 * Size of the uprobes_mmap_mutex[] array.  uprobes_mmap_hash() selects one
 * of these mutexes by reducing its argument modulo this value.
 */
#define UPROBES_HASH_SZ        13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v)        (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/*
 * Have a copy of original instruction
 * NOTE(review): presumably a bit number within uprobe->flags - confirm
 * against the users of this constant.
 */
#define UPROBE_COPY_INSN        0

/*
 * One probe location.  The rbtree helpers (__find_uprobe(),
 * __insert_uprobe()) key an uprobe on its (inode, offset) pair through
 * match_uprobe().
 */
struct uprobe {
        struct rb_node                rb_node;        /* node in the rb tree */
        /* taken by get_uprobe(); put_uprobe() frees the uprobe at zero */
        refcount_t                ref;
        struct rw_semaphore        register_rwsem;
        struct rw_semaphore        consumer_rwsem;
        /* serialized by the uprobes_mmap_mutex[] hash locks */
        struct list_head        pending_list;
        struct uprobe_consumer        *consumers;
        struct inode                *inode;                /* Also hold a ref to inode */
        /* file offset of the probed instruction */
        loff_t                        offset;
        /* file offset of the reference counter; 0 means there is none */
        loff_t                        ref_ctr_offset;
        unsigned long                flags;

        /*
         * The generic code assumes that it has two members of unknown type
         * owned by the arch-specific code:
         *
         *         insn -        copy_insn() saves the original instruction here for
         *                arch_uprobe_analyze_insn().
         *
         *        ixol -        potentially modified instruction to execute out of
         *                line, copied to xol_area by xol_get_insn_slot().
         *
         * uprobe_write_opcode() maps a struct arch_uprobe pointer back to
         * its enclosing uprobe with container_of() on this member.
         */
        struct arch_uprobe        arch;
};

/*
 * Entry of delayed_uprobe_list: an (uprobe, mm) pair.  update_ref_ctr()
 * adds one when it is asked to increment the reference counter but
 * find_ref_ctr_vma() finds no suitable vma in @mm, and removes matching
 * entries when asked to decrement.
 */
struct delayed_uprobe {
        struct list_head list;
        struct uprobe *uprobe;
        struct mm_struct *mm;
};

/*
 * delayed_uprobe_lock is held around the delayed_uprobe_add() and
 * delayed_uprobe_remove() calls made by update_ref_ctr() and put_uprobe().
 */
static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
/* Per-area bookkeeping for the execute-out-of-line instruction slots. */
struct xol_area {
        wait_queue_head_t                 wq;                /* if all slots are busy */
        atomic_t                         slot_count;        /* number of in-use slots */
        unsigned long                         *bitmap;        /* 0 = free slot */

        struct vm_special_mapping        xol_mapping;
        /*
         * NOTE(review): two entries; presumably the slot page followed by a
         * NULL terminator for xol_mapping - confirm against the code that
         * creates the area.
         */
        struct page                         *pages[2];
        /*
         * We keep the vma's vm_start rather than a pointer to the vma
         * itself.  The probed process or a naughty kernel module could make
         * the vma go away, and we must handle that reasonably gracefully.
         */
        unsigned long                         vaddr;                /* Page(s) of instruction slots */
};

/*
 * valid_vma: decide whether @vma is a file-backed executable mapping that
 * may carry a uprobe.
 *        - is_register: true when called while registering a probe; a vma
 *          with VM_WRITE set is then rejected as well.  Unregistering is
 *          more lenient because vm_flags might have changed after the
 *          breakpoint was inserted.
 *        - Return true if @vma has a backing file and, of the flags that
 *          are examined (VM_HUGETLB, VM_MAYEXEC, VM_MAYSHARE and, when
 *          registering, VM_WRITE), only VM_MAYEXEC is set.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
        vm_flags_t mask = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;

        if (!vma->vm_file)
                return false;

        if (is_register)
                mask |= VM_WRITE;

        return (vma->vm_flags & mask) == VM_MAYEXEC;
}

/* Translate file offset @offset into the user address it has within @vma. */
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
        /* File offset that corresponds to vma->vm_start. */
        loff_t map_start = (loff_t)vma->vm_pgoff << PAGE_SHIFT;

        return vma->vm_start + offset - map_start;
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * __replace_page - replace page in vma by new page.
 * based on replace_page in mm/ksm.c
 *
 * @vma:      vma that holds the pte pointing to page
 * @addr:     address the old @page is mapped at
 * @old_page: the page we are replacing by new_page
 * @new_page: the modified page we replace page by
 *
 * If @new_page is NULL, only unmap @old_page.
 *
 * On success one reference on @old_page is dropped and, if @new_page is
 * given, one reference on it is taken for the new mapping.
 *
 * Returns 0 on success, negative error code otherwise: the error from
 * mem_cgroup_charge(), or -EAGAIN when @old_page is not found mapped at
 * @addr (uprobe_write_opcode() retries in that case).
 */
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                struct page *old_page, struct page *new_page)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page_vma_mapped_walk pvmw = {
                .page = compound_head(old_page),
                .vma = vma,
                .address = addr,
        };
        int err;
        struct mmu_notifier_range range;

        /* The notifier range covers exactly the one page at @addr. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
                                addr + PAGE_SIZE);

        /* Charge the replacement page before any lock is taken. */
        if (new_page) {
                err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
                if (err)
                        return err;
        }

        /* For try_to_free_swap() and munlock_vma_page() below */
        lock_page(old_page);

        mmu_notifier_invalidate_range_start(&range);
        /* Reported to the caller if the walk below does not find the page. */
        err = -EAGAIN;
        if (!page_vma_mapped_walk(&pvmw))
                goto unlock;
        VM_BUG_ON_PAGE(addr != pvmw.address, old_page);

        if (new_page) {
                /* Reference for the mapping that is installed below. */
                get_page(new_page);
                page_add_new_anon_rmap(new_page, vma, addr, false);
                lru_cache_add_inactive_or_unevictable(new_page, vma);
        } else
                /* no new page, just dec_mm_counter for old_page */
                dec_mm_counter(mm, MM_ANONPAGES);

        /* A non-anonymous old page was accounted as a file page; move it. */
        if (!PageAnon(old_page)) {
                dec_mm_counter(mm, mm_counter_file(old_page));
                inc_mm_counter(mm, MM_ANONPAGES);
        }

        /* Clear the old pte and, if there is a new page, map it instead. */
        flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
        ptep_clear_flush_notify(vma, addr, pvmw.pte);
        if (new_page)
                set_pte_at_notify(mm, addr, pvmw.pte,
                                  mk_pte(new_page, vma->vm_page_prot));

        page_remove_rmap(old_page, false);
        if (!page_mapped(old_page))
                try_to_free_swap(old_page);
        page_vma_mapped_walk_done(&pvmw);

        /* munlock_vma_page() is only called for non-compound pages. */
        if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
                munlock_vma_page(old_page);
        put_page(old_page);

        err = 0;
 unlock:
        mmu_notifier_invalidate_range_end(&range);
        unlock_page(old_page);
        return err;
}

/**
 * is_swbp_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 *
 * Default (weak) implementation of is_swbp_insn: compares the opcode that
 * @insn points to with UPROBE_SWBP_INSN.
 *
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
        return *insn == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is a trap instruction.
 * @insn: instruction to be checked.
 *
 * Default (weak) implementation of is_trap_insn: it simply defers to
 * is_swbp_insn().
 *
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
        return is_swbp_insn(insn);
}

/*
 * Copy @len bytes into @dst from @page, starting at the in-page offset of
 * @vaddr.  The page is mapped with kmap_atomic() for the copy.
 */
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
        unsigned long in_page = vaddr & ~PAGE_MASK;
        void *kaddr;

        kaddr = kmap_atomic(page);
        memcpy(dst, kaddr + in_page, len);
        kunmap_atomic(kaddr);
}

/*
 * Copy @len bytes from @src into @page, starting at the in-page offset of
 * @vaddr.  The page is mapped with kmap_atomic() for the copy.
 */
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
        unsigned long in_page = vaddr & ~PAGE_MASK;
        void *kaddr;

        kaddr = kmap_atomic(page);
        memcpy(kaddr + in_page, src, len);
        kunmap_atomic(kaddr);
}

/*
 * Compare the instruction currently stored in @page at @vaddr with the one
 * that is about to be written.
 *
 * Returns 0 when nothing has to be written: either a breakpoint is to be
 * installed and one is already there, or the original instruction is to be
 * restored and no breakpoint is present.  Returns 1 otherwise.
 */
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
        uprobe_opcode_t cur_opcode;
        bool have_swbp, want_swbp;

        /*
         * Only UPROBE_SWBP_INSN is looked for in the current instruction.
         * Other 'trap variants', such as the conditional traps powerpc
         * supports, are deliberately not considered: whether the underlying
         * instruction is such a variant does not matter, because uprobes
         * always wins over any other (gdb) breakpoint.
         */
        copy_from_page(page, vaddr, &cur_opcode, UPROBE_SWBP_INSN_SIZE);
        have_swbp = is_swbp_insn(&cur_opcode);
        want_swbp = is_swbp_insn(new_opcode);

        /* Work is needed only if the current and wanted states differ. */
        return have_swbp != want_swbp;
}

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        list_for_each_entry(du, &delayed_uprobe_list, list)
                if (du->uprobe == uprobe && du->mm == mm)
                        return du;
        return NULL;
}

/*
 * Record the (@uprobe, @mm) pair on delayed_uprobe_list unless it is
 * already there.  Returns 0 on success (including the already-present
 * case) or -ENOMEM if the entry cannot be allocated.
 */
static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *entry;

        /* One entry per pair is enough. */
        if (delayed_uprobe_check(uprobe, mm))
                return 0;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->mm = mm;
        entry->uprobe = uprobe;
        list_add(&entry->list, &delayed_uprobe_list);

        return 0;
}

/* Unlink @du from its list and free it; warns and does nothing if NULL. */
static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
        if (!WARN_ON(!du)) {
                list_del(&du->list);
                kfree(du);
        }
}

static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;

        if (!uprobe && !mm)
                return;

        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (uprobe && du->uprobe != uprobe)
                        continue;
                if (mm && du->mm != mm)
                        continue;

                delayed_uprobe_delete(du);
        }
}

static bool valid_ref_ctr_vma(struct uprobe *uprobe,
                              struct vm_area_struct *vma)
{
        unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);

        return uprobe->ref_ctr_offset &&
                vma->vm_file &&
                file_inode(vma->vm_file) == uprobe->inode &&
                (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
                vma->vm_start <= vaddr &&
                vma->vm_end > vaddr;
}

/*
 * Walk the vma list of @mm and return the first vma accepted by
 * valid_ref_ctr_vma() for @uprobe, or NULL if there is none.
 */
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma && !valid_ref_ctr_vma(uprobe, vma))
                vma = vma->vm_next;

        return vma;
}

/*
 * __update_ref_ctr - add @d to the reference counter stored at @vaddr in @mm.
 *
 * The page holding the counter is looked up with FOLL_WRITE and the counter
 * is accessed as a short through a temporary kernel mapping of that page.
 * The page reference is dropped again before returning.
 *
 * Returns 0 on success, -EINVAL if @vaddr or @d is zero or if the update
 * would make the counter negative, -EBUSY if get_user_pages_remote()
 * returned no page, or the negative error it returned.
 */
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
        void *kaddr;
        struct page *page;
        int ret;
        short *ptr;

        if (!vaddr || !d)
                return -EINVAL;

        /* Only the page is used below, so the vma is not requested. */
        ret = get_user_pages_remote(mm, vaddr, 1,
                        FOLL_WRITE, &page, NULL, NULL);
        if (unlikely(ret <= 0)) {
                /*
                 * We are asking for 1 page. If get_user_pages_remote() fails,
                 * it may return 0, in that case we have to return error.
                 */
                return ret == 0 ? -EBUSY : ret;
        }

        kaddr = kmap_atomic(page);
        ptr = kaddr + (vaddr & ~PAGE_MASK);

        /* Refuse an update that would take the counter below zero. */
        if (unlikely(*ptr + d < 0)) {
                pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
                        "curr val: %d, delta: %d\n", vaddr, *ptr, d);
                ret = -EINVAL;
                goto out;
        }

        *ptr += d;
        ret = 0;
out:
        kunmap_atomic(kaddr);
        put_page(page);
        return ret;
}

/*
 * Log that changing the reference counter of @uprobe in @mm failed.  The
 * sign of @d selects whether the message says "increment" or "decrement".
 */
static void update_ref_ctr_warn(struct uprobe *uprobe,
                                struct mm_struct *mm, short d)
{
        const char *action = d > 0 ? "increment" : "decrement";
        unsigned long long probe_off = uprobe->offset;
        unsigned long long ctr_off = uprobe->ref_ctr_offset;

        pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
                "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
                action, uprobe->inode->i_ino, probe_off, ctr_off, mm);
}

static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
                          short d)
{
        struct vm_area_struct *rc_vma;
        unsigned long rc_vaddr;
        int ret = 0;

        rc_vma = find_ref_ctr_vma(uprobe, mm);

        if (rc_vma) {
                rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(mm, rc_vaddr, d);
                if (ret)
                        update_ref_ctr_warn(uprobe, mm, d);

                if (d > 0)
                        return ret;
        }

        mutex_lock(&delayed_uprobe_lock);
        if (d > 0)
                ret = delayed_uprobe_add(uprobe, mm);
        else
                delayed_uprobe_remove(uprobe, mm);
        mutex_unlock(&delayed_uprobe_lock);

        return ret;
}

/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @auprobe: arch specific probepoint information; it must be the arch
 *           member of a struct uprobe, which is recovered via container_of().
 * @mm: the probed process address space.
 * @vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @vaddr.
 *
 * Writing a breakpoint opcode is treated as "register", anything else as
 * "unregister".  The opcode is never written into the existing page: a
 * private copy of the page is modified and swapped in with
 * __replace_page().
 *
 * Called with mm->mmap_lock held for write.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
                        unsigned long vaddr, uprobe_opcode_t opcode)
{
        struct uprobe *uprobe;
        struct page *old_page, *new_page;
        struct vm_area_struct *vma;
        int ret, is_register, ref_ctr_updated = 0;
        bool orig_page_huge = false;
        unsigned int gup_flags = FOLL_FORCE;

        is_register = is_swbp_insn(&opcode);
        uprobe = container_of(auprobe, struct uprobe, arch);

retry:
        if (is_register)
                gup_flags |= FOLL_SPLIT_PMD;
        /* Read the page with vaddr into memory */
        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
                                    &old_page, &vma, NULL);
        if (ret <= 0)
                return ret;

        /* 0 from verify_opcode() means the wanted state is already there. */
        ret = verify_opcode(old_page, vaddr, &opcode);
        if (ret <= 0)
                goto put_old;

        if (WARN(!is_register && PageCompound(old_page),
                 "uprobe unregister should never work on compound page\n")) {
                ret = -EINVAL;
                goto put_old;
        }

        /* We are going to replace instruction, update ref_ctr. */
        /* ref_ctr_updated keeps a retry from updating the counter twice. */
        if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
                ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
                if (ret)
                        goto put_old;

                ref_ctr_updated = 1;
        }

        /* Unregister on a non-anonymous page: report success, write nothing. */
        ret = 0;
        if (!is_register && !PageAnon(old_page))
                goto put_old;

        ret = anon_vma_prepare(vma);
        if (ret)
                goto put_old;

        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
                goto put_old;

        /* new_page = copy of old_page with @opcode stored at @vaddr. */
        __SetPageUptodate(new_page);
        copy_highpage(new_page, old_page);
        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);

        if (!is_register) {
                struct page *orig_page;
                pgoff_t index;

                VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

                /*
                 * If the restored page would be identical to the page cache
                 * page of the file, drop new_page; __replace_page() is then
                 * called with a NULL new page and only unmaps old_page.
                 */
                index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
                orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
                                          index);

                if (orig_page) {
                        if (PageUptodate(orig_page) &&
                            pages_identical(new_page, orig_page)) {
                                /* let go new_page */
                                put_page(new_page);
                                new_page = NULL;

                                if (PageCompound(orig_page))
                                        orig_page_huge = true;
                        }
                        put_page(orig_page);
                }
        }

        ret = __replace_page(vma, vaddr, old_page, new_page);
        /* Drop the allocation reference on new_page. */
        if (new_page)
                put_page(new_page);
put_old:
        put_page(old_page);

        /* __replace_page() returns -EAGAIN if old_page was no longer mapped. */
        if (unlikely(ret == -EAGAIN))
                goto retry;

        /* Revert back reference counter if instruction update failed. */
        if (ret && is_register && ref_ctr_updated)
                update_ref_ctr(uprobe, mm, -1);

        /* try collapse pmd for compound page */
        if (!ret && orig_page_huge)
                collapse_pte_mapped_thp(mm, vaddr);

        return ret;
}

/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, store the breakpoint instruction at @vaddr.
 * Weak default: writes UPROBE_SWBP_INSN through uprobe_write_opcode().
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
}

/**
 * set_orig_insn - Restore the original instruction.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, restore the original opcode at @vaddr.  The opcode is read
 * from the start of @auprobe->insn and written through
 * uprobe_write_opcode().
 * Return 0 (success) or a negative errno.
 */
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        return uprobe_write_opcode(auprobe, mm, vaddr,
                        *(uprobe_opcode_t *)&auprobe->insn);
}

/*
 * Take an additional reference on @uprobe.  Returns @uprobe so that the
 * call can be used directly in a return statement.
 */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
        refcount_inc(&uprobe->ref);
        return uprobe;
}

/*
 * Drop one reference on @uprobe.  When the last reference goes away, purge
 * the uprobe from delayed_uprobe_list and free it.
 */
static void put_uprobe(struct uprobe *uprobe)
{
        if (!refcount_dec_and_test(&uprobe->ref))
                return;

        /*
         * An application may munmap(exec_vma) before uprobe_unregister() is
         * called.  remove_breakpoint() then never gets the chance to take
         * the uprobe off delayed_uprobe_list, so it is done here.
         */
        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);

        kfree(uprobe);
}

/*
 * Three-way comparison used to order the uprobes rbtree: compare the inode
 * pointers first and, if they are equal, the offsets.  Returns -1 if @l
 * sorts before @r, 1 if it sorts after, and 0 if both keys are equal.
 */
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
        if (l->inode != r->inode)
                return l->inode < r->inode ? -1 : 1;

        if (l->offset != r->offset)
                return l->offset < r->offset ? -1 : 1;

        return 0;
}

/*
 * Search uprobes_tree for the uprobe keyed by @inode and @offset.  Takes no
 * lock itself; find_uprobe() calls it with uprobes_treelock held.
 *
 * Returns the uprobe with an additional reference, or NULL if not found.
 */
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe key = { .inode = inode, .offset = offset };
        struct rb_node *node;

        for (node = uprobes_tree.rb_node; node; ) {
                struct uprobe *cur = rb_entry(node, struct uprobe, rb_node);
                int cmp = match_uprobe(&key, cur);

                if (cmp == 0)
                        return get_uprobe(cur);

                node = cmp < 0 ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Find a uprobe corresponding to a given inode:offset
 * Acquires uprobes_treelock
 *
 * Returns the matching uprobe, on which __find_uprobe() has taken an
 * additional reference, or NULL if the tree has no such uprobe.
 */
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *uprobe;

        spin_lock(&uprobes_treelock);
        uprobe = __find_uprobe(inode, offset);
        spin_unlock(&uprobes_treelock);

        return uprobe;
}

/*
 * Insert @uprobe into uprobes_tree unless an entry with the same
 * inode:offset is already there.
 *
 * Returns the existing entry (with an extra reference) when one is
 * found, in which case @uprobe is left untouched. Returns NULL after a
 * successful insertion, with @uprobe's refcount set to 2.
 */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
        struct rb_node **link = &uprobes_tree.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct uprobe *cur;
                int cmp;

                parent = *link;
                cur = rb_entry(parent, struct uprobe, rb_node);
                cmp = match_uprobe(uprobe, cur);
                if (cmp == 0)
                        return get_uprobe(cur);

                link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&uprobe->rb_node, parent, link);
        rb_insert_color(&uprobe->rb_node, &uprobes_tree);
        /* get access + creation ref */
        refcount_set(&uprobe->ref, 2);

        return NULL;
}

/*
 * Locked wrapper around __insert_uprobe(); acquires uprobes_treelock.
 *
 * Matching uprobe already exists in rbtree:
 *        increment (access refcount) and return the matching uprobe.
 *
 * No matching uprobe: insert the uprobe in rb_tree;
 *        get a double refcount (access + creation) and return NULL.
 */
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
        struct uprobe *u;

        spin_lock(&uprobes_treelock);
        u = __insert_uprobe(uprobe);
        spin_unlock(&uprobes_treelock);

        return u;
}

/*
 * Log a warning that @uprobe asks for a different ref_ctr_offset than
 * the already-registered @cur_uprobe at the same inode:offset.
 */
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
        unsigned long long off = uprobe->offset;
        unsigned long long old_ctr = cur_uprobe->ref_ctr_offset;
        unsigned long long new_ctr = uprobe->ref_ctr_offset;

        pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
                "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
                uprobe->inode->i_ino, off, old_ctr, new_ctr);
}

/*
 * Allocate a uprobe for @inode:@offset and try to insert it into
 * uprobes_tree.
 *
 * Returns:
 *  - the freshly inserted uprobe when no entry existed;
 *  - the pre-existing uprobe (referenced by insert_uprobe()) when one
 *    exists with the same ref_ctr_offset; the new allocation is freed;
 *  - ERR_PTR(-EINVAL) when one exists with a different ref_ctr_offset;
 *  - NULL when the allocation fails.
 */
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
                                   loff_t ref_ctr_offset)
{
        struct uprobe *uprobe, *existing;

        uprobe = kzalloc(sizeof(*uprobe), GFP_KERNEL);
        if (!uprobe)
                return NULL;

        uprobe->inode = inode;
        uprobe->offset = offset;
        uprobe->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);

        /* add to uprobes_tree, sorted on inode:offset */
        existing = insert_uprobe(uprobe);
        if (!existing)
                return uprobe;

        /* a uprobe exists for this inode:offset combination */
        if (existing->ref_ctr_offset != uprobe->ref_ctr_offset) {
                ref_ctr_mismatch_warn(existing, uprobe);
                put_uprobe(existing);
                existing = ERR_PTR(-EINVAL);
        }

        /* Either way the new allocation never made it into the tree. */
        kfree(uprobe);
        return existing;
}

/*
 * Push @uc onto the head of @uprobe's singly linked consumer list,
 * holding consumer_rwsem for writing.
 */
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        down_write(&uprobe->consumer_rwsem);
        uc->next = uprobe->consumers;
        uprobe->consumers = uc;
        up_write(&uprobe->consumer_rwsem);
}

/*
 * Unlink consumer @uc from @uprobe's consumer list, holding
 * consumer_rwsem for writing.
 * Returns true if @uc was found and removed, false otherwise.
 */
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        struct uprobe_consumer **link;
        bool found = false;

        down_write(&uprobe->consumer_rwsem);
        link = &uprobe->consumers;
        while (*link) {
                if (*link == uc) {
                        /* Splice @uc out by redirecting the incoming link. */
                        *link = uc->next;
                        found = true;
                        break;
                }
                link = &(*link)->next;
        }
        up_write(&uprobe->consumer_rwsem);

        return found;
}

/*
 * Copy @nbytes bytes at file @offset of @mapping into @insn.
 * Returns 0 on success, or the error encoded in the page pointer when
 * the page cannot be read.
 */
static int __copy_insn(struct address_space *mapping, struct file *filp,
                        void *insn, int nbytes, loff_t offset)
{
        struct page *page;

        /*
         * Ensure that the page that has the original instruction is populated
         * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
         * see uprobe_register().
         */
        page = mapping->a_ops->readpage ?
                read_mapping_page(mapping, offset >> PAGE_SHIFT, filp) :
                shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
        if (IS_ERR(page))
                return PTR_ERR(page);

        copy_from_page(page, offset, insn, nbytes);
        put_page(page);

        return 0;
}

static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
        struct address_space *mapping = uprobe->inode->i_mapping;
        loff_t offs = uprobe->offset;
        void *insn = &uprobe->arch.insn;
        int size = sizeof(uprobe->arch.insn);
        int len, err = -EIO;

        /* Copy only available bytes, -EIO if nothing was read */
        do {
                if (offs >= i_size_read(uprobe->inode))
                        break;

                len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
                err = __copy_insn(mapping, filp, insn, len, offs);
                if (err)
                        break;

                insn += len;
                offs += len;
                size -= len;
        } while (size);

        return err;
}

/*
 * One-time preparation of @uprobe: copy the probed instruction from
 * @file and have the architecture analyze it. Completion is recorded in
 * the UPROBE_COPY_INSN flag so later calls return immediately.
 *
 * Returns 0 when the uprobe is (or already was) prepared, -ENOTSUPP when
 * the copied instruction is itself a trap instruction, or the error from
 * copy_insn() / arch_uprobe_analyze_insn().
 */
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                                struct mm_struct *mm, unsigned long vaddr)
{
        int ret = 0;

        /* Unlocked fast path: already prepared. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return ret;

        /* TODO: move this into _register, until then we abuse this sem. */
        down_write(&uprobe->consumer_rwsem);
        /* Re-check under the lock; another caller may have finished first. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto out;

        ret = copy_insn(uprobe, file);
        if (ret)
                goto out;

        ret = -ENOTSUPP;
        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                goto out;

        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
        if (ret)
                goto out;

        /* Make the analyzed insn visible before the flag that advertises it. */
        smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);

 out:
        up_write(&uprobe->consumer_rwsem);

        return ret;
}

static inline bool consumer_filter(struct uprobe_consumer *uc,
                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        return !uc->filter || uc->filter(uc, ctx, mm);
}

/*
 * Return true as soon as any consumer of @uprobe accepts @mm for @ctx,
 * false if none does (or there are no consumers). The consumer list is
 * walked with consumer_rwsem held for reading.
 */
static bool filter_chain(struct uprobe *uprobe,
                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        struct uprobe_consumer *uc;
        bool wanted = false;

        down_read(&uprobe->consumer_rwsem);
        uc = uprobe->consumers;
        while (uc) {
                wanted = consumer_filter(uc, ctx, mm);
                if (wanted)
                        break;
                uc = uc->next;
        }
        up_read(&uprobe->consumer_rwsem);

        return wanted;
}

/*
 * Prepare @uprobe if necessary and plant its breakpoint at @vaddr in
 * @mm. Returns 0 on success, or the error from prepare_uprobe() or
 * set_swbp().
 */
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long vaddr)
{
        bool first_uprobe;
        int ret;

        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (ret)
                return ret;

        /*
         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
         * the task can hit this breakpoint right after __replace_page().
         */
        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
        if (first_uprobe)
                set_bit(MMF_HAS_UPROBES, &mm->flags);

        ret = set_swbp(&uprobe->arch, mm, vaddr);
        if (ret) {
                /* Undo the early flag only if this call was the one to set it. */
                if (first_uprobe)
                        clear_bit(MMF_HAS_UPROBES, &mm->flags);
                return ret;
        }

        clear_bit(MMF_RECALC_UPROBES, &mm->flags);
        return 0;
}

/*
 * Mark @mm with MMF_RECALC_UPROBES, then call set_orig_insn() for
 * @vaddr and return its result.
 */
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
}

/*
 * A uprobe is "active" while its rb_node is not marked empty;
 * delete_uprobe() clears the node after erasing it from the tree.
 */
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
        return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
 * Erase @uprobe from uprobes_tree, mark it inactive and drop one
 * reference. Warns and does nothing if the uprobe is already inactive.
 *
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
 * See find_active_uprobe().
 */
static void delete_uprobe(struct uprobe *uprobe)
{
        if (WARN_ON(!uprobe_is_active(uprobe)))
                return;

        spin_lock(&uprobes_treelock);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        spin_unlock(&uprobes_treelock);
        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
        put_uprobe(uprobe);
}

/*
 * One entry per address space that maps the probed page; produced by
 * build_map_info() and consumed by register_for_each_vma().
 */
struct map_info {
        struct map_info *next;  /* singly linked list */
        struct mm_struct *mm;   /* pinned with mmget_not_zero() by build_map_info() */
        unsigned long vaddr;    /* offset_to_vaddr() of the probed offset in that mm */
};

/*
 * Free @info and return the entry that followed it, so callers can
 * advance and free in one step. Does not touch info->mm.
 */
static inline struct map_info *free_map_info(struct map_info *info)
{
        struct map_info *next = info->next;
        kfree(info);
        return next;
}

/*
 * Build a list of map_info entries, one for every valid vma in @mapping
 * that maps the page containing @offset and whose mm could be pinned.
 *
 * The walk runs under i_mmap_lock_read(), where only GFP_NOWAIT
 * allocations are attempted. If entries run out, the shortfall is
 * counted in @more, everything gathered so far is released, enough
 * entries are preallocated with GFP_KERNEL outside the lock, and the
 * walk is restarted.
 *
 * Returns the list (possibly NULL when there is nothing to report), or
 * ERR_PTR(-ENOMEM) if preallocation fails. Every mm in the returned list
 * holds a reference taken by mmget_not_zero().
 */
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
        unsigned long pgoff = offset >> PAGE_SHIFT;
        struct vm_area_struct *vma;
        struct map_info *curr = NULL;   /* result list being built */
        struct map_info *prev = NULL;   /* pool of unused, preallocated entries */
        struct map_info *info;
        int more = 0;                   /* entries missing in this pass */

 again:
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;

                if (!prev && !more) {
                        /*
                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
                         * reclaim. This is optimistic, no harm done if it fails.
                         */
                        prev = kmalloc(sizeof(struct map_info),
                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
                        if (prev)
                                prev->next = NULL;
                }
                if (!prev) {
                        /* Pool is empty: just count what is still needed. */
                        more++;
                        continue;
                }

                /* Skip address spaces whose user count already hit zero. */
                if (!mmget_not_zero(vma->vm_mm))
                        continue;

                /* Move one entry from the pool to the head of the result. */
                info = prev;
                prev = prev->next;
                info->next = curr;
                curr = info;

                info->mm = vma->vm_mm;
                info->vaddr = offset_to_vaddr(vma, offset);
        }
        i_mmap_unlock_read(mapping);

        if (!more)
                goto out;

        /*
         * Not enough entries: recycle the partial result as the new pool
         * and drop the mm references it was holding.
         */
        prev = curr;
        while (curr) {
                mmput(curr->mm);
                curr = curr->next;
        }

        /* Top the pool up by the counted shortfall; sleeping is fine here. */
        do {
                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
                if (!info) {
                        curr = ERR_PTR(-ENOMEM);
                        goto out;
                }
                info->next = prev;
                prev = info;
        } while (--more);

        goto again;
 out:
        /* Free whatever is left in the pool. */
        while (prev)
                prev = free_map_info(prev);
        return curr;
}

/*
 * Apply (@new != NULL) or remove (@new == NULL) the breakpoint of
 * @uprobe in every address space reported by build_map_info().
 *
 * Runs with dup_mmap_sem held for writing. When registering, the first
 * failure stops further installs; the remaining entries are only
 * released. When unregistering, errors are OR-ed together and all
 * entries are still processed.
 *
 * Returns 0 on success, the build_map_info() error, or the accumulated
 * install/remove error.
 */
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
        bool is_register = !!new;
        struct map_info *info;
        int err = 0;

        percpu_down_write(&dup_mmap_sem);
        info = build_map_info(uprobe->inode->i_mapping,
                                        uprobe->offset, is_register);
        if (IS_ERR(info)) {
                err = PTR_ERR(info);
                goto out;
        }

        while (info) {
                struct mm_struct *mm = info->mm;
                struct vm_area_struct *vma;

                /* After a failed install, just drain the rest of the list. */
                if (err && is_register)
                        goto free;

                mmap_write_lock(mm);
                /*
                 * The mapping was sampled without mmap_lock; look the vma up
                 * again and re-validate it before touching anything.
                 */
                vma = find_vma(mm, info->vaddr);
                if (!vma || !valid_vma(vma, is_register) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        goto unlock;

                if (vma->vm_start > info->vaddr ||
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;

                if (is_register) {
                        /* consult only the "caller", new consumer. */
                        if (consumer_filter(new,
                                        UPROBE_FILTER_REGISTER, mm))
                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
                        /* Remove only if no remaining consumer wants this mm. */
                        if (!filter_chain(uprobe,
                                        UPROBE_FILTER_UNREGISTER, mm))
                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
                }

 unlock:
                mmap_write_unlock(mm);
 free:
                /* Drop the reference taken in build_map_info(). */
                mmput(mm);
                info = free_map_info(info);
        }
 out:
        percpu_up_write(&dup_mmap_sem);
        return err;
}

/*
 * Detach consumer @uc from @uprobe and remove breakpoints that are no
 * longer wanted. If that succeeds and no consumer remains, the uprobe
 * is deleted from the tree. Warns and bails out if @uc was not
 * registered on @uprobe.
 */
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        if (WARN_ON(!consumer_del(uprobe, uc)))
                return;

        /* TODO : cant unregister? schedule a worker thread */
        if (register_for_each_vma(uprobe, NULL))
                return;

        if (!uprobe->consumers)
                delete_uprobe(uprobe);
}

/*
 * uprobe_unregister - unregister an already registered probe.
 * @inode: the file in which the probe has to be removed.
 * @offset: offset from the start of the file.
 * @uc: identify which probe if multiple probes are colocated.
 *
 * Warns and returns if no uprobe exists for @inode:@offset.
 */
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return;

        down_write(&uprobe->register_rwsem);
        __uprobe_unregister(uprobe, uc);
        up_write(&uprobe->register_rwsem);
        /* Drop the reference taken by find_uprobe(). */
        put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);

/*
 * __uprobe_register - register a probe
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @ref_ctr_offset: offset of the reference counter in the file
 *                  (uprobe_register() passes 0).
 * @uc: information on how to handle the probe.
 *
 * Apart from the access refcount, __uprobe_register() takes a creation
 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
 * tuple).  Creation refcount stops uprobe_unregister from freeing the
 * @uprobe even before the register operation is complete. Creation
 * refcount is released when the last @uc for the @uprobe
 * unregisters. Caller of __uprobe_register() is required to keep @inode
 * (and the containing mount) referenced.
 *
 * Return errno if it cannot successfully install probes
 * else return 0 (success)
 */
static int __uprobe_register(struct inode *inode, loff_t offset,
                             loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;
        int ret;

        /* Uprobe must have at least one set consumer */
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;

        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
        if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
                return -EIO;

        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;

        /*
         * This ensures that copy_from_page(), copy_to_page() and
         * __update_ref_ctr() can't cross page boundary.
         */
        if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
                return -EINVAL;
        if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
                return -EINVAL;

        do {
                uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
                if (!uprobe)
                        return -ENOMEM;
                if (IS_ERR(uprobe))
                        return PTR_ERR(uprobe);

                /*
                 * We can race with uprobe_unregister()->delete_uprobe().
                 * Check uprobe_is_active() and retry if it is false.
                 */
                down_write(&uprobe->register_rwsem);
                ret = -EAGAIN;
                if (likely(uprobe_is_active(uprobe))) {
                        consumer_add(uprobe, uc);
                        ret = register_for_each_vma(uprobe, uc);
                        /* Roll the consumer back out if installation failed. */
                        if (ret)
                                __uprobe_unregister(uprobe, uc);
                }
                up_write(&uprobe->register_rwsem);
                put_uprobe(uprobe);
        } while (unlikely(ret == -EAGAIN));

        return ret;
}

/*
 * Register consumer @uc for the probe at @inode:@offset with a
 * ref_ctr_offset of 0. See __uprobe_register() for the return value.
 */
int uprobe_register(struct inode *inode, loff_t offset,
                    struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, 0, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);

/*
 * Same as uprobe_register(), but with a caller-supplied
 * @ref_ctr_offset. See __uprobe_register() for the return value.
 */
int uprobe_register_refctr(struct inode *inode, loff_t offset,
                           loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);

/*
 * uprobe_apply - add or remove breakpoints for an already registered probe.
 * @inode: the file in which the probe is placed.
 * @offset: offset from the start of the file.
 * @uc: consumer which wants to add more or remove some breakpoints
 * @add: add or remove the breakpoints
 *
 * Returns -ENOENT if there is no uprobe at @inode:@offset or @uc is not
 * one of its consumers; otherwise the result of register_for_each_vma().
 */
int uprobe_apply(struct inode *inode, loff_t offset,
                        struct uprobe_consumer *uc, bool add)
{
        struct uprobe_consumer *con;
        struct uprobe *uprobe;
        int ret = -ENOENT;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return ret;

        down_write(&uprobe->register_rwsem);

        /* Only act when @uc really is attached to this uprobe. */
        con = uprobe->consumers;
        while (con && con != uc)
                con = con->next;
        if (con)
                ret = register_for_each_vma(uprobe, add ? uc : NULL);

        up_write(&uprobe->register_rwsem);
        put_uprobe(uprobe);

        return ret;
}

/*
 * Remove @uprobe's breakpoint from every vma of @mm that maps the
 * probed offset of the probed inode. Walks the vma list under
 * mmap_read_lock() and returns the OR of all remove_breakpoint()
 * results.
 */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        int err = 0;

        mmap_read_lock(mm);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                loff_t base;

                if (!valid_vma(vma, false))
                        continue;
                if (file_inode(vma->vm_file) != uprobe->inode)
                        continue;

                /* File offset covered by this vma: [base, base + length). */
                base = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
                if (uprobe->offset < base)
                        continue;
                if (uprobe->offset >= base + vma->vm_end - vma->vm_start)
                        continue;

                err |= remove_breakpoint(uprobe, mm,
                                         offset_to_vaddr(vma, uprobe->offset));
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * Find any node in uprobes_tree that belongs to @inode and whose offset
 * lies in [@min, @max]. Returns that node, or NULL if there is none.
 * Takes no lock itself.
 */
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
        struct rb_node *n = uprobes_tree.rb_node;

        while (n) {
                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);

                if (inode < u->inode ||
                    (inode == u->inode && max < u->offset))
                        n = n->rb_left;
                else if (inode > u->inode || min > u->offset)
                        n = n->rb_right;
                else
                        break;  /* same inode, offset within range */
        }

        return n;
}

/*
 * For a given range in vma, build a list of probes that need to be inserted.
 *
 * @head is initialised here. Every uprobe of @inode whose offset falls
 * in the file range backing [@start, @end) is linked in through its
 * pending_list member and gets an extra reference.
 */
static void build_probe_list(struct inode *inode,
                                struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct list_head *head)
{
        struct rb_node *hit, *t;
        struct uprobe *u;
        loff_t min, max;

        INIT_LIST_HEAD(head);
        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        spin_lock(&uprobes_treelock);
        hit = find_node_in_range(inode, min, max);
        if (hit) {
                /* Walk backwards from the hit (inclusive) to the lower bound. */
                t = hit;
                while (t) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset < min)
                                break;
                        list_add(&u->pending_list, head);
                        get_uprobe(u);
                        t = rb_prev(t);
                }
                /* Then forwards from the node after the hit to the upper bound. */
                for (t = rb_next(hit); t; t = rb_next(t)) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset > max)
                                break;
                        list_add(&u->pending_list, head);
                        get_uprobe(u);
                }
        }
        spin_unlock(&uprobes_treelock);
}

/* @vma contains reference counter, not the probed instruction. */
static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;
        unsigned long vaddr;
        int ret = 0, err = 0;

        mutex_lock(&delayed_uprobe_lock);
        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (du->mm != vma->vm_mm ||
                    !valid_ref_ctr_vma(du->uprobe, vma))
                        continue;

                vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
                if (ret) {
                        update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
                        if (!err)
                                err = ret;
                }
                delayed_uprobe_delete(du);
        }
        mutex_unlock(&delayed_uprobe_lock);
        return err;
}

/*
 * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
 *
 * Installs breakpoints for all uprobes that fall inside the new @vma
 * and that at least one consumer wants in this mm. Also applies pending
 * reference-counter increments when @vma is a private writable file
 * mapping of an mm that already has uprobes.
 *
 * Currently we ignore all errors and always return 0, the callers
 * can't handle the failure anyway.
 */
int uprobe_mmap(struct vm_area_struct *vma)
{
        struct list_head tmp_list;
        struct uprobe *uprobe, *u;
        struct inode *inode;

        if (no_uprobe_events())
                return 0;

        /* File-backed, writable and not shared: may hold a reference counter. */
        if (vma->vm_file &&
            (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
            test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                delayed_ref_ctr_inc(vma);

        if (!valid_vma(vma, true))
                return 0;

        inode = file_inode(vma->vm_file);
        if (!inode)
                return 0;

        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
        /*
         * We can race with uprobe_unregister(), this uprobe can be already
         * removed. But in this case filter_chain() must return false, all
         * consumers have gone away.
         */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
                /* Once a fatal signal is pending, only drop the references. */
                if (!fatal_signal_pending(current) &&
                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
                /* Drop the reference taken in build_probe_list(). */
                put_uprobe(uprobe);
        }
        mutex_unlock(uprobes_mmap_hash(inode));

        return 0;
}

static bool
vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        loff_t min, max;
        struct inode *inode;
        struct rb_node *n;

        inode = file_inode(vma->vm_file);

        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        spin_lock(&uprobes_treelock);
        n = find_node_in_range(inode, min, max);
        spin_unlock(&uprobes_treelock);

        return !!n;
}

/*
 * Called in context of a munmap of a vma.
 */
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        if (no_uprobe_events() || !valid_vma(vma, false))
                return;

        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
                return;

        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
                return;

        if (vma_has_uprobes(vma, start, end))
                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}

/* Slot allocation for XOL */

/*
 * Install @area as the XOL area of @mm: pick an address if the caller
 * did not supply one, install the one-page special mapping and publish
 * the area in mm->uprobes_state.xol_area.
 *
 * Returns 0 on success, -EINTR if taking mmap_lock was interrupted,
 * -EALREADY if @mm already has an XOL area, or the error from
 * get_unmapped_area() / _install_special_mapping().
 */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
        struct vm_area_struct *vma;
        int ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* An area was installed before this caller got the lock. */
        if (mm->uprobes_state.xol_area) {
                ret = -EALREADY;
                goto fail;
        }

        if (!area->vaddr) {
                /* Try to map as high as possible, this is only a hint. */
                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
                                                PAGE_SIZE, 0, 0);
                if (IS_ERR_VALUE(area->vaddr)) {
                        ret = area->vaddr;
                        goto fail;
                }
        }

        vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
                                &area->xol_mapping);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto fail;
        }

        ret = 0;
        /* pairs with get_xol_area() */
        smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
 fail:
        mmap_write_unlock(mm);

        return ret;
}

/*
 * Allocate and initialise an XOL area for current->mm and try to
 * install it at @vaddr (0 lets xol_add_vma() choose the address).
 *
 * Returns the new area on success. Returns NULL if an allocation fails
 * or xol_add_vma() returns an error; in that case everything allocated
 * here is freed again.
 */
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
        struct mm_struct *mm = current->mm;
        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
        struct xol_area *area;

        area = kzalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
                goto out;

        /* One bit per instruction slot in the page. */
        area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
                               GFP_KERNEL);
        if (!area->bitmap)
                goto free_area;

        area->xol_mapping.name = "[uprobes]";
        area->xol_mapping.pages = area->pages;
        area->pages[0] = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
        if (!area->pages[0])
                goto free_bitmap;
        area->pages[1] = NULL;

        area->vaddr = vaddr;
        init_waitqueue_head(&area->wq);
        /* Reserve the 1st slot for get_trampoline_vaddr() */
        set_bit(0, area->bitmap);
        atomic_set(&area->slot_count, 1);
        /* The reserved slot holds a breakpoint instruction. */
        arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);

        if (!xol_add_vma(mm, area))
                return area;

        /* Installation failed: unwind in reverse order of allocation. */
        __free_page(area->pages[0]);
 free_bitmap:
        kfree(area->bitmap);
 free_area:
        kfree(area);
 out:
        return NULL;
}

/*
 * get_xol_area - Allocate process's xol_area if necessary.
 * This area will be used for storing instructions for execution out of line.
 *
 * The return value of __create_xol_area() is deliberately not used:
 * whatever is published in mm->uprobes_state.xol_area afterwards is
 * what gets returned, no matter which caller installed it.
 *
 * Returns the allocated area or NULL.
 */
static struct xol_area *get_xol_area(void)
{
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        if (!mm->uprobes_state.xol_area)
                __create_xol_area(0);

        /* Pairs with xol_add_vma() smp_store_release() */
        area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
        return area;
}

/*
 * uprobe_clear_state - Free the area allocated for slots.
 *
 * Also drops every delayed-uprobe entry that refers to @mm.
 */
void uprobe_clear_state(struct mm_struct *mm)
{
        struct xol_area *area = mm->uprobes_state.xol_area;

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(NULL, mm);
        mutex_unlock(&delayed_uprobe_lock);

        if (area) {
                put_page(area->pages[0]);
                kfree(area->bitmap);
                kfree(area);
        }
}

/*
 * Take dup_mmap_sem for reading; register_for_each_vma() takes the
 * same semaphore for writing. Paired with uprobe_end_dup_mmap().
 */
void uprobe_start_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
}

/* Release the read side of dup_mmap_sem taken by uprobe_start_dup_mmap(). */
void uprobe_end_dup_mmap(void)
{
        percpu_up_read(&dup_mmap_sem);
}

/*
 * Propagate the uprobe flags from @oldmm to @newmm: if the old mm has
 * uprobes, mark the new one as having them and as needing a
 * recalculation.
 */
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
        if (!test_bit(MMF_HAS_UPROBES, &oldmm->flags))
                return;

        set_bit(MMF_HAS_UPROBES, &newmm->flags);
        /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
        set_bit(MMF_RECALC_UPROBES, &newmm->flags);
}

/*
 * xol_take_insn_slot - search for a free slot and claim it.
 *
 * Loops until a slot bit in area->bitmap has been claimed. When the
 * bitmap is full, waits on area->wq until slot_count drops below
 * UINSNS_PER_PAGE and searches again.
 *
 * Returns the user virtual address of the claimed slot.
 */
static unsigned long xol_take_insn_slot(struct xol_area *area)
{
        unsigned long slot_addr;
        int slot_nr;

        do {
                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
                if (slot_nr < UINSNS_PER_PAGE) {
                        /* The bit looked free; claim it atomically. */
                        if (!test_and_set_bit(slot_nr, area->bitmap))
                                break;

                        /* Another task claimed it first: force another pass. */
                        slot_nr = UINSNS_PER_PAGE;
                        continue;
                }
                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
        } while (slot_nr >= UINSNS_PER_PAGE);

        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
        atomic_inc(&area->slot_count);

        return slot_addr;
}

/*
 * xol_get_insn_slot - allocate a slot for xol.
 *
 * On success the out-of-line copy of @uprobe's instruction
 * (uprobe->arch.ixol) is written into the slot.
 * Returns the allocated slot address or 0.
 */
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
        struct xol_area *area = get_xol_area();
        unsigned long slot;

        if (!area)
                return 0;

        slot = xol_take_insn_slot(area);
        if (likely(slot))
                arch_uprobe_copy_ixol(area->pages[0], slot,
                                      &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));

        return slot;
}

/*
 * xol_free_insn_slot - If slot was earlier allocated by
 * @xol_get_insn_slot(), make the slot available for
 * subsequent requests.
 *
 * Does nothing if @tsk has no mm, no XOL area, no utask or no slot, or
 * if the recorded slot address lies outside the XOL page.
 */
static void xol_free_insn_slot(struct task_struct *tsk)
{
        struct xol_area *area;
        unsigned long slot_addr;
        unsigned long area_end;
        int slot_nr;

        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
                return;

        slot_addr = tsk->utask->xol_vaddr;
        if (unlikely(!slot_addr))
                return;

        area = tsk->mm->uprobes_state.xol_area;
        area_end = area->vaddr + PAGE_SIZE;
        if (slot_addr < area->vaddr || slot_addr >= area_end)
                return;

        slot_nr = (slot_addr - area->vaddr) / UPROBE_XOL_SLOT_BYTES;
        if (slot_nr >= UINSNS_PER_PAGE)
                return;

        clear_bit(slot_nr, area->bitmap);
        atomic_dec(&area->slot_count);
        smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
        if (waitqueue_active(&area->wq))
                wake_up(&area->wq);

        tsk->utask->xol_vaddr = 0;
}

/*
 * Default (weak) implementation: copy @len bytes from @src into @page
 * at the location given by @vaddr, then flush the data cache for the
 * page. Architectures may provide their own version.
 */
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                                  void *src, unsigned long len)
{
        /* Initialize the slot */
        copy_to_page(page, vaddr, src, len);

        /*
         * We probably need flush_icache_user_page() but it needs vma.
         * This should work on most of architectures by default. If
         * architecture needs to do something different it can define
         * its own version of the function.
         */
        flush_dcache_page(page);
}

/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 *
 * This weak default subtracts UPROBE_SWBP_INSN_SIZE from the saved
 * instruction pointer.
 *
 * Return: the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}

/*
 * Return the address to report for a trap taken by current: the
 * recorded utask->vaddr while a uprobe is active on this task,
 * otherwise the instruction pointer saved in @regs.
 */
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        if (likely(!utask || !utask->active_uprobe))
                return instruction_pointer(regs);

        return utask->vaddr;
}

/*
 * Drop the uprobe reference held by @ri, free @ri, and hand back the
 * instance that followed it in the list.
 */
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
        struct return_instance *successor;

        /* Grab the link before the node goes away. */
        successor = ri->next;

        put_uprobe(ri->uprobe);
        kfree(ri);

        return successor;
}

/*
 * Release everything hanging off @t's uprobe_task: the reference on an
 * active uprobe (if any), every queued return instance, the XOL slot,
 * and finally the uprobe_task itself.
 *
 * Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
 */
void uprobe_free_utask(struct task_struct *t)
{
        struct uprobe_task *utask = t->utask;
        struct return_instance *ri;

        if (!utask)
                return;

        if (utask->active_uprobe)
                put_uprobe(utask->active_uprobe);

        /* free_ret_instance() returns the next node, so it drives the walk */
        for (ri = utask->return_instances; ri; )
                ri = free_ret_instance(ri);

        xol_free_insn_slot(t);
        kfree(utask);
        t->utask = NULL;
}

/*
 * Return current's uprobe_task, allocating a zeroed one on first use.
 * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to the (possibly new) uprobe_task on success
 * - NULL if the allocation failed
 */
static struct uprobe_task *get_utask(void)
{
        struct uprobe_task *utask = current->utask;

        if (utask)
                return utask;

        utask = kzalloc(sizeof(*utask), GFP_KERNEL);
        current->utask = utask;
        return utask;
}

/*
 * Allocate a fresh uprobe_task for @t and copy @o_utask's list of
 * return instances into it, preserving their order.
 *
 * Every copied instance takes its own reference on the uprobe and bumps
 * the new utask's depth.  Returns 0 on success or -ENOMEM when an
 * allocation fails; if that happens part-way through the list, the
 * partially populated utask stays attached to @t.
 */
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
        struct uprobe_task *n_utask;
        struct return_instance **p, *o, *n;

        n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
        if (!n_utask)
                return -ENOMEM;
        t->utask = n_utask;

        /* p always points at the link where the next copy gets attached */
        p = &n_utask->return_instances;
        for (o = o_utask->return_instances; o; o = o->next) {
                n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
                if (!n)
                        return -ENOMEM;

                /* shallow copy, then fix up the refcount and the link */
                *n = *o;
                get_uprobe(n->uprobe);
                n->next = NULL;

                *p = n;
                p = &n->next;
                n_utask->depth++;
        }

        return 0;
}

/*
 * Emit a "failed to <msg>" warning.
 *
 * The message is tagged with current's comm and pid; the @t argument is
 * not used by the body.
 */
static void uprobe_warn(struct task_struct *t, const char *msg)
{
        pr_warn("uprobe: %s:%d failed to %s\n",
                        current->comm, current->pid, msg);
}

/*
 * Task-work callback queued by uprobe_copy_process(): create the XOL
 * area for current at the address saved in utask->dup_xol_addr.
 * Does nothing for an exiting task, and stays quiet about a failure
 * when a fatal signal is pending.
 */
static void dup_xol_work(struct callback_head *work)
{
        if (current->flags & PF_EXITING)
                return;

        if (__create_xol_area(current->utask->dup_xol_addr))
                return;

        if (!fatal_signal_pending(current))
                uprobe_warn(current, "dup xol area");
}

/*
 * Called in context of a new clone/fork from copy_process.
 *
 * When the parent has pending return instances, duplicate them into the
 * child @t and, if the child got its own mm, queue task work that
 * recreates the XOL area at the parent's address.
 */
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
        struct uprobe_task *utask = current->utask;
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        t->utask = NULL;

        /* Nothing to inherit without pending return instances. */
        if (!utask || !utask->return_instances)
                return;

        /* Same mm and not vfork: no duplication is done. */
        if (mm == t->mm && !(flags & CLONE_VFORK))
                return;

        if (dup_utask(t, utask)) {
                uprobe_warn(t, "dup ret instances");
                return;
        }

        /* The task can fork() after dup_xol_work() fails */
        area = mm->uprobes_state.xol_area;
        if (!area) {
                uprobe_warn(t, "dup xol area");
                return;
        }

        /* Shared mm (the vfork case): the existing area is reused as is. */
        if (mm == t->mm)
                return;

        t->utask->dup_xol_addr = area->vaddr;
        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
        task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}

/*
 * The current notion of area->vaddr assumes that the trampoline address
 * is always equal to area->vaddr.
 *
 * Returns -1 (as unsigned long) if the xol_area is not allocated.
 */
static unsigned long get_trampoline_vaddr(void)
{
        struct xol_area *area;

        /* Pairs with xol_add_vma() smp_store_release() */
        area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
        if (!area)
                return -1;

        return area->vaddr;
}

/*
 * Pop and free return instances from the head of @utask's list for as
 * long as arch_uretprobe_is_alive() reports them dead, keeping
 * utask->depth in sync.  @chained selects which rp_check context is
 * passed to the arch hook.
 */
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
                                        struct pt_regs *regs)
{
        struct return_instance *ri;
        enum rp_check ctx;

        if (chained)
                ctx = RP_CHECK_CHAIN_CALL;
        else
                ctx = RP_CHECK_CALL;

        ri = utask->return_instances;
        while (ri) {
                if (arch_uretprobe_is_alive(ri, ctx, regs))
                        break;

                ri = free_ret_instance(ri);
                utask->depth--;
        }
        utask->return_instances = ri;
}

/*
 * Arm a return probe for the function current has just entered.
 *
 * Replaces the return address with the trampoline address through
 * arch_uretprobe_hijack_return_addr() and pushes a new return_instance
 * (holding a reference on @uprobe) onto current's utask.  Every failure
 * path returns silently, except for the ratelimited message printed when
 * MAX_URETPROBE_DEPTH is reached and the warning for an impossible
 * chained call.
 */
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct return_instance *ri;
        struct uprobe_task *utask;
        unsigned long orig_ret_vaddr, trampoline_vaddr;
        bool chained;

        if (!get_xol_area())
                return;

        utask = get_utask();
        if (!utask)
                return;

        if (utask->depth >= MAX_URETPROBE_DEPTH) {
                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
                                " nestedness limit pid/tgid=%d/%d\n",
                                current->pid, current->tgid);
                return;
        }

        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
                return;

        trampoline_vaddr = get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
        if (orig_ret_vaddr == -1)
                goto fail;

        /* drop the entries invalidated by longjmp() */
        chained = (orig_ret_vaddr == trampoline_vaddr);
        cleanup_return_instances(utask, chained, regs);

        /*
         * We don't want to keep the trampoline address on the stack; rather
         * keep the original return address of the first caller through all
         * the subsequent instances. This also makes breakpoint unwrapping
         * easier.
         */
        if (chained) {
                if (!utask->return_instances) {
                        /*
                         * This situation is not possible. Likely we have an
                         * attack from user-space.
                         */
                        uprobe_warn(current, "handle tail call");
                        goto fail;
                }
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }

        ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
        ri->chained = chained;

        /* push the new instance at the head of the list */
        utask->depth++;
        ri->next = utask->return_instances;
        utask->return_instances = ri;

        return;
 fail:
        kfree(ri);
}

/*
 * Prepare to single-step the probed instruction out of line.
 *
 * Reserves an XOL slot, records it together with @bp_vaddr in current's
 * utask and lets the arch code set up the step.  Returns 0 once the
 * utask is in UTASK_SSTEP, -ENOMEM if no utask or slot is available, or
 * the error reported by arch_uprobe_pre_xol() (the slot is released
 * again in that case).
 */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
        struct uprobe_task *utask = get_utask();
        unsigned long slot;
        int ret;

        if (!utask)
                return -ENOMEM;

        slot = xol_get_insn_slot(uprobe);
        if (!slot)
                return -ENOMEM;

        utask->xol_vaddr = slot;
        utask->vaddr = bp_vaddr;

        ret = arch_uprobe_pre_xol(&uprobe->arch, regs);
        if (likely(!ret)) {
                utask->active_uprobe = uprobe;
                utask->state = UTASK_SSTEP;
                return 0;
        }

        xol_free_insn_slot(current);
        return ret;
}

/*
 * If we are singlestepping, then ensure this thread is not disturbed by
 * non-fatal signals until completion of singlestep.  When xol insn itself
 * triggers the signal, restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip).  This
 * is even more important if the task has a handler for SIGSEGV/etc. The
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 *
 * Returns false when current has no active uprobe, true otherwise.
 */
bool uprobe_deny_signal(void)
{
        struct task_struct *t = current;
        struct uprobe_task *utask = t->utask;

        if (likely(!utask || !utask->active_uprobe))
                return false;

        WARN_ON_ONCE(utask->state != UTASK_SSTEP);

        if (task_sigpending(t)) {
                /* hide the pending signal; handle_singlestep() recalculates */
                spin_lock_irq(&t->sighand->siglock);
                clear_tsk_thread_flag(t, TIF_SIGPENDING);
                spin_unlock_irq(&t->sighand->siglock);

                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                        utask->state = UTASK_SSTEP_TRAPPED;
                        set_tsk_thread_flag(t, TIF_UPROBE);
                }
        }

        return true;
}

/*
 * Walk every vma of @mm and clear MMF_HAS_UPROBES unless some valid vma
 * still has a uprobe in its range.
 */
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                /*
                 * This is not strictly accurate, we can race with
                 * uprobe_unregister() and see the already removed
                 * uprobe if delete_uprobe() was not yet called.
                 * Or this uprobe can be filtered out.
                 */
                if (valid_vma(vma, false) &&
                    vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;

                vma = vma->vm_next;
        }

        clear_bit(MMF_HAS_UPROBES, &mm->flags);
}

/*
 * Check whether the instruction at @vaddr in @mm is a trap instruction.
 *
 * Returns the result of is_trap_insn() on the opcode read from @vaddr,
 * -EINVAL if @vaddr is not aligned to UPROBE_SWBP_INSN_SIZE, or the
 * negative error from get_user_pages_remote() if the page cannot be
 * obtained.
 */
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
        struct page *page;
        uprobe_opcode_t opcode;
        int result;

        if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
                return -EINVAL;

        /* fast path: read the opcode directly with page faults disabled */
        pagefault_disable();
        result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
        pagefault_enable();

        if (likely(result == 0))
                goto out;

        /*
         * The NULL 'tsk' here ensures that any faults that occur here
         * will not be accounted to the task.  'mm' *is* current->mm,
         * but we treat this as a 'remote' access since it is
         * essentially a kernel access to the memory.
         */
        result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
                        NULL, NULL);
        if (result < 0)
                return result;

        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
        put_page(page);
 out:
        /* This needs to return true for any variant of the trap insn */
        return is_trap_insn(&opcode);
}

/*
 * Look up the uprobe registered at @bp_vaddr in current's mm.
 *
 * Returns the uprobe found by find_uprobe(), or NULL.  When no uprobe is
 * found, *@is_swbp is set to the result of is_trap_at_addr(), or to
 * -EFAULT if no vma covers @bp_vaddr; it is left untouched when a uprobe
 * is found.  The whole lookup runs under mmap_read_lock().
 */
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = find_vma(mm, bp_vaddr);
        if (vma && vma->vm_start <= bp_vaddr) {
                if (valid_vma(vma, false)) {
                        struct inode *inode = file_inode(vma->vm_file);
                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);

                        uprobe = find_uprobe(inode, offset);
                }

                if (!uprobe)
                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
        } else {
                *is_swbp = -EFAULT;
        }

        /* a miss is the chance to recompute MMF_HAS_UPROBES if requested */
        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
                mmf_recalc_uprobes(mm);
        mmap_read_unlock(mm);

        return uprobe;
}

/*
 * Run the ->handler() of every consumer of @uprobe under the read side
 * of register_rwsem.
 *
 * 'remove' starts as UPROBE_HANDLER_REMOVE and is AND-ed with each
 * consumer's return code (0 for consumers without a handler), so it stays
 * set only if every consumer returned UPROBE_HANDLER_REMOVE; in that case
 * the probe is unapplied from current's mm.  Otherwise, if any consumer
 * has a ret_handler, a return probe is prepared.
 */
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct uprobe_consumer *uc;
        int remove = UPROBE_HANDLER_REMOVE;
        bool need_prep = false; /* prepare return uprobe, when needed */

        down_read(&uprobe->register_rwsem);
        for (uc = uprobe->consumers; uc; uc = uc->next) {
                int rc = 0;

                if (uc->handler) {
                        rc = uc->handler(uc, regs);
                        WARN(rc & ~UPROBE_HANDLER_MASK,
                                "bad rc=0x%x from %ps()\n", rc, uc->handler);
                }

                if (uc->ret_handler)
                        need_prep = true;

                remove &= rc;
        }

        if (need_prep && !remove)
                prepare_uretprobe(uprobe, regs); /* put bp at return */

        if (remove && uprobe->consumers) {
                WARN_ON(!uprobe_is_active(uprobe));
                unapply_uprobe(uprobe, current->mm);
        }
        up_read(&uprobe->register_rwsem);
}

/*
 * Invoke the ->ret_handler() of every consumer of @ri's uprobe that has
 * one, holding register_rwsem for reading during the walk.
 */
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
        struct uprobe *uprobe = ri->uprobe;
        struct uprobe_consumer *uc;

        down_read(&uprobe->register_rwsem);
        uc = uprobe->consumers;
        while (uc) {
                if (uc->ret_handler)
                        uc->ret_handler(uc, ri->func, regs);
                uc = uc->next;
        }
        up_read(&uprobe->register_rwsem);
}

/*
 * Skip over the chain that starts at @ri: advance past every instance
 * marked ->chained plus the first one that is not, and return whatever
 * follows (possibly NULL).
 */
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
        for (;;) {
                bool chained = ri->chained;

                ri = ri->next;        /* can't be NULL if chained */
                if (!chained)
                        return ri;
        }
}

/*
 * Handle a hit on the return trampoline: run the return handlers for the
 * chain at the head of current's return_instances list, free the
 * consumed instances and redirect the instruction pointer to the
 * original return address.
 *
 * Sends SIGILL to current if there is no utask or no pending return
 * instance.
 */
static void handle_trampoline(struct pt_regs *regs)
{
        struct uprobe_task *utask;
        struct return_instance *ri, *next;
        bool valid;

        utask = current->utask;
        if (!utask)
                goto sigill;

        ri = utask->return_instances;
        if (!ri)
                goto sigill;

        do {
                /*
                 * We should throw out the frames invalidated by longjmp().
                 * If this chain is valid, then the next one should be alive
                 * or NULL; the latter case means that nobody but ri->func
                 * could hit this trampoline on return. TODO: sigaltstack().
                 */
                next = find_next_ret_chain(ri);
                valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);

                instruction_pointer_set(regs, ri->orig_ret_vaddr);
                /* consume the whole chain; handlers run only if it is valid */
                do {
                        if (valid)
                                handle_uretprobe_chain(ri, regs);
                        ri = free_ret_instance(ri);
                        utask->depth--;
                } while (ri != next);
        } while (!valid);

        utask->return_instances = ri;
        return;

 sigill:
        uprobe_warn(current, "handle uretprobe, sending SIGILL.");
        force_sig(SIGILL);

}

/*
 * Default (weak) implementation: never ignore a probe hit.  handle_swbp()
 * skips the handlers and the single-step when this returns true.
 */
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
        return false;
}

/*
 * Default (weak) implementation: every return instance is considered
 * alive, so callers in this file never discard instances as stale.
 */
bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                        struct pt_regs *regs)
{
        return true;
}

/*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 *
 * A hit on the return trampoline is forwarded to handle_trampoline().
 * If no uprobe is found at the breakpoint address, SIGTRAP is sent when
 * a trap instruction is really there; otherwise the instruction is
 * restarted.
 */
static void handle_swbp(struct pt_regs *regs)
{
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int is_swbp;

        bp_vaddr = uprobe_get_swbp_addr(regs);
        if (bp_vaddr == get_trampoline_vaddr())
                return handle_trampoline(regs);

        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
                if (is_swbp > 0) {
                        /* No matching uprobe; signal SIGTRAP. */
                        force_sig(SIGTRAP);
                } else {
                        /*
                         * Either we raced with uprobe_unregister() or we can't
                         * access this memory. The latter is only possible if
                         * another thread plays with our ->mm. In both cases
                         * we can simply restart. If this vma was unmapped we
                         * can pretend this insn was not executed yet and get
                         * the (correct) SIGSEGV after restart.
                         */
                        instruction_pointer_set(regs, bp_vaddr);
                }
                return;
        }

        /* change it in advance for ->handler() and restart */
        instruction_pointer_set(regs, bp_vaddr);

        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
         * new and not-yet-analyzed uprobe at the same address, restart.
         */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto out;

        /*
         * Pairs with the smp_wmb() in prepare_uprobe().
         *
         * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
         * we must also see the stores to &uprobe->arch performed by the
         * prepare_uprobe() call.
         */
        smp_rmb();

        /* Tracing handlers use ->utask to communicate with fetch methods */
        if (!get_utask())
                goto out;

        if (arch_uprobe_ignore(&uprobe->arch, regs))
                goto out;

        handler_chain(uprobe, regs);

        /*
         * If user decided to take execution elsewhere, it makes little sense
         * to execute the original instruction, so let's skip it.
         */
        if (instruction_pointer(regs) != bp_vaddr)
                goto out;

        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;

        /* on success the uprobe stays referenced via utask->active_uprobe */
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;

        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
        put_uprobe(uprobe);
}

/*
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
 *
 * Drops the reference on the active uprobe, returns the utask to
 * UTASK_RUNNING and releases the XOL slot.  If the arch post-xol fix-up
 * reported an error, current is sent SIGILL.
 */
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
        struct uprobe *uprobe = utask->active_uprobe;
        int err = 0;

        switch (utask->state) {
        case UTASK_SSTEP_ACK:
                err = arch_uprobe_post_xol(&uprobe->arch, regs);
                break;
        case UTASK_SSTEP_TRAPPED:
                arch_uprobe_abort_xol(&uprobe->arch, regs);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        put_uprobe(uprobe);
        utask->active_uprobe = NULL;
        utask->state = UTASK_RUNNING;
        xol_free_insn_slot(current);

        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending(); /* see uprobe_deny_signal() */
        spin_unlock_irq(&current->sighand->siglock);

        if (likely(!err))
                return;

        uprobe_warn(current, "execute the probed insn, sending SIGILL.");
        force_sig(SIGILL);
}

/*
 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
 * allows the thread to return from interrupt. After that handle_swbp()
 * sets utask->active_uprobe.
 *
 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
 * and allows the thread to return from interrupt.
 *
 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
 * uprobe_notify_resume(), which dispatches on whether a uprobe is
 * currently active for this thread.
 */
void uprobe_notify_resume(struct pt_regs *regs)
{
        struct uprobe_task *utask;

        clear_thread_flag(TIF_UPROBE);

        utask = current->utask;
        if (!utask || !utask->active_uprobe) {
                /* no single-step in flight: this is a breakpoint hit */
                handle_swbp(regs);
                return;
        }

        handle_singlestep(utask, regs);
}

/*
 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
 */
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
        if (!current->mm)
                return 0;

        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
            (!current->utask || !current->utask->return_instances))
                return 0;

        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
 */
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        if (!current->mm || !utask || !utask->active_uprobe)
                /* task is currently not uprobed */
                return 0;

        utask->state = UTASK_SSTEP_ACK;
        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * Notifier block that routes exceptions to arch_uprobe_exception_notify();
 * registered as a die notifier by uprobes_init().
 */
static struct notifier_block uprobe_exception_nb = {
        .notifier_call                = arch_uprobe_exception_notify,
        .priority                = INT_MAX-1,        /* notified after kprobes, kgdb */
};

/*
 * Boot-time setup: initialise every mutex in uprobes_mmap_mutex[] and
 * register the exception notifier.  Failure to register is fatal.
 */
void __init uprobes_init(void)
{
        int idx = 0;

        while (idx < UPROBES_HASH_SZ) {
                mutex_init(&uprobes_mmap_mutex[idx]);
                idx++;
        }

        BUG_ON(register_die_notifier(&uprobe_exception_nb));
}























    9 


    9 








    9 

    9 



    9 












    9 













































    9 















    9 












    9 







    9 

















    9 
    9 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

/*
 * Recompute how much of @c's @usage is covered by its min and low
 * settings and push the change up to the parent.
 *
 * For each of min and low, the protected amount is min(usage, setting).
 * It replaces the previously recorded value in c->{min,low}_usage, and
 * the difference is added to the parent's children_{min,low}_usage.
 * Nothing is done for a counter without a parent, and the atomic
 * exchange is skipped while both the setting and the recorded value are
 * zero.
 */
static void propagate_protected_usage(struct page_counter *c,
                                      unsigned long usage)
{
        unsigned long protected, old_protected;
        unsigned long low, min;
        long delta;

        if (!c->parent)
                return;

        min = READ_ONCE(c->min);
        if (min || atomic_long_read(&c->min_usage)) {
                protected = min(usage, min);
                old_protected = atomic_long_xchg(&c->min_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_min_usage);
        }

        low = READ_ONCE(c->low);
        if (low || atomic_long_read(&c->low_usage)) {
                protected = min(usage, low);
                old_protected = atomic_long_xchg(&c->low_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_low_usage);
        }
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 *
 * Only @counter itself is adjusted; ancestors are not walked.
 *
 * If more pages are cancelled than were charged, a one-time warning is
 * printed and the counter is reset to zero, so that a negative value is
 * neither left in the counter nor handed to the protection propagation
 * (which takes an unsigned usage).
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
        long new;

        new = atomic_long_sub_return(nr_pages, &counter->usage);
        /* More uncharges than charges? */
        if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
                      new, nr_pages)) {
                /* contain the damage instead of carrying a bogus usage */
                new = 0;
                atomic_long_set(&counter->usage, new);
        }
        propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * Adds @nr_pages to @counter and to every ancestor, refreshing the
 * protected usage and the watermark at each level.
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                long usage = atomic_long_add_return(nr_pages, &c->usage);

                propagate_protected_usage(c, usage);
                /*
                 * This is indeed racy, but we can live with some
                 * inaccuracy in the watermark.
                 */
                if (usage > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, usage);

                c = c->parent;
        }
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points to the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.  On failure, the
 * charges already applied to the levels below @fail are cancelled.
 */
bool page_counter_try_charge(struct page_counter *counter,
                             unsigned long nr_pages,
                             struct page_counter **fail)
{
        struct page_counter *c;

        for (c = counter; c; c = c->parent) {
                long new;
                /*
                 * Charge speculatively to avoid an expensive CAS.  If
                 * a bigger charge fails, it might falsely lock out a
                 * racing smaller charge and send it into reclaim
                 * early, but the error is limited to the difference
                 * between the two sizes, which is less than 2M/4M in
                 * case of a THP locking out a regular page charge.
                 *
                 * The atomic_long_add_return() implies a full memory
                 * barrier between incrementing the count and reading
                 * the limit.  When racing with page_counter_set_max(),
                 * we either see the new limit or the setter sees the
                 * counter has changed and retries.
                 */
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        atomic_long_sub(nr_pages, &c->usage);
                        /*
                         * NOTE(review): 'new' still includes the charge
                         * that was just rolled back above - confirm that
                         * propagating this value is intended.
                         */
                        propagate_protected_usage(c, new);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt which is only used
                         * to report stats.
                         */
                        data_race(c->failcnt++);
                        *fail = c;
                        goto failed;
                }
                propagate_protected_usage(c, new);
                /*
                 * Just like with failcnt, we can live with some
                 * inaccuracy in the watermark.
                 */
                if (new > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, new);
        }
        return true;

failed:
        /* undo the levels charged before the one that hit its limit */
        for (c = counter; c != *fail; c = c->parent)
                page_counter_cancel(c, nr_pages);

        return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 *
 * Cancels @nr_pages on @counter and on each of its ancestors in turn.
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                page_counter_cancel(c, nr_pages);
                c = c->parent;
        }
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
        for (;;) {
                unsigned long old;
                long usage;

                /*
                 * Update the limit while making sure that it's not
                 * below the concurrently-changing counter value.
                 *
                 * The xchg implies two full memory barriers before
                 * and after, so the read-swap-read is ordered and
                 * ensures coherency with page_counter_try_charge():
                 * that function modifies the count before checking
                 * the limit, so if it sees the old limit, we see the
                 * modified counter and retry.
                 */
                usage = atomic_long_read(&counter->usage);

                if (usage > nr_pages)
                        return -EBUSY;

                old = xchg(&counter->max, nr_pages);

                /* usage did not grow across the swap: the new limit holds */
                if (atomic_long_read(&counter->usage) <= usage)
                        return 0;

                /* raced with a charge: restore the old limit and retry */
                counter->max = old;
                cond_resched();
        }
}

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new min value and re-propagates the protected usage from
 * @counter up through all of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        WRITE_ONCE(counter->min, nr_pages);

        while (c) {
                propagate_protected_usage(c, atomic_long_read(&c->usage));
                c = c->parent;
        }
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new low value and re-propagates the protected usage from
 * @counter up through all of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        WRITE_ONCE(counter->low, nr_pages);

        while (c) {
                propagate_protected_usage(c, atomic_long_read(&c->usage));
                c = c->parent;
        }
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * A @buf equal to @max yields %PAGE_COUNTER_MAX.  Anything else is run
 * through memparse() and converted from bytes to pages; trailing
 * characters after the parsed value are rejected.
 *
 * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
                          unsigned long *nr_pages)
{
        char *end;
        u64 bytes;

        if (strcmp(buf, max) == 0) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        bytes = memparse(buf, &end);
        if (*end)
                return -EINVAL;

        *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
        return 0;
}













































































    1 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/iso_fs.h>
#include <asm/unaligned.h>

/*
 * File format tags.
 * NOTE(review): presumably the values stored in
 * iso_inode_info.i_file_format - confirm against the users.
 */
enum isofs_file_format {
        isofs_file_normal = 0,
        isofs_file_sparse = 1,
        isofs_file_compressed = 2,
};
        
/*
 * iso fs inode data in memory
 *
 * The generic VFS inode is embedded as the last member; ISOFS_I() maps a
 * struct inode pointer back to its containing iso_inode_info.
 */
struct iso_inode_info {
        unsigned long i_iget5_block;
        unsigned long i_iget5_offset;
        unsigned int i_first_extent;
        unsigned char i_file_format;
        unsigned char i_format_parm[3];
        unsigned long i_next_section_block;
        unsigned long i_next_section_offset;
        off_t i_section_size;
        struct inode vfs_inode;        /* must stay embedded: see ISOFS_I() */
};

/*
 * iso9660 super-block data in memory
 *
 * Reached from a struct super_block through ISOFS_SB() (sb->s_fs_info).
 */
struct isofs_sb_info {
        unsigned long s_ninodes;
        unsigned long s_nzones;
        unsigned long s_firstdatazone;
        unsigned long s_log_zone_size;
        unsigned long s_max_size;
        
        int           s_rock_offset; /* offset of SUSP fields within SU area */
        s32           s_sbsector;
        unsigned char s_joliet_level;
        unsigned char s_mapping;
        unsigned char s_check;
        unsigned char s_session;
        unsigned int  s_high_sierra:1;
        unsigned int  s_rock:2;
        unsigned int  s_cruft:1; /* Broken disks with high byte of length
                                  * containing junk */
        unsigned int  s_nocompress:1;
        unsigned int  s_hide:1;
        unsigned int  s_showassoc:1;
        unsigned int  s_overriderockperm:1;
        unsigned int  s_uid_set:1;
        unsigned int  s_gid_set:1;

        umode_t s_fmode;
        umode_t s_dmode;
        kgid_t s_gid;
        kuid_t s_uid;
        struct nls_table *s_nls_iocharset; /* Native language support table */
};

/*
 * -1 converted to umode_t.  Presumably a sentinel for "no mode given" in
 * s_fmode / s_dmode -- TODO confirm against the users of this macro.
 */
#define ISOFS_INVALID_MODE ((umode_t) -1)

/* Return the isofs private super-block data stored in @sb->s_fs_info. */
static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
{
        struct isofs_sb_info *sbi = sb->s_fs_info;

        return sbi;
}

/*
 * Map a VFS inode back to the iso_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct iso_inode_info *ISOFS_I(struct inode *inode)
{
        struct iso_inode_info *ei;

        ei = container_of(inode, struct iso_inode_info, vfs_inode);
        return ei;
}

/* Return the single unsigned byte at @p, widened to int. */
static inline int isonum_711(u8 *p)
{
        return p[0];
}
/* Return the single signed byte at @p, widened to int. */
static inline int isonum_712(s8 *p)
{
        return p[0];
}
/* Read a 16-bit little-endian value from @p via get_unaligned_le16(). */
static inline unsigned int isonum_721(u8 *p)
{
        unsigned int val = get_unaligned_le16(p);

        return val;
}
/* Read a 16-bit big-endian value from @p via get_unaligned_be16(). */
static inline unsigned int isonum_722(u8 *p)
{
        unsigned int val = get_unaligned_be16(p);

        return val;
}
/*
 * Read a 16-bit value from @p using only its little-endian form; the
 * big-endian datum is ignored because broken mastering programs get it
 * wrong.
 */
static inline unsigned int isonum_723(u8 *p)
{
        unsigned int val = get_unaligned_le16(p);

        return val;
}
/* Read a 32-bit little-endian value from @p via get_unaligned_le32(). */
static inline unsigned int isonum_731(u8 *p)
{
        unsigned int val = get_unaligned_le32(p);

        return val;
}
/* Read a 32-bit big-endian value from @p via get_unaligned_be32(). */
static inline unsigned int isonum_732(u8 *p)
{
        unsigned int val = get_unaligned_be32(p);

        return val;
}
/*
 * Read a 32-bit value from @p using only its little-endian form; the
 * big-endian datum is ignored because broken mastering programs get it
 * wrong.
 */
static inline unsigned int isonum_733(u8 *p)
{
        unsigned int val = get_unaligned_le32(p);

        return val;
}
/* Declaration only; the definition is not part of this header. */
extern int iso_date(u8 *, int);

struct inode;                /* To make gcc happy */

/*
 * Rock Ridge parsing and name translation helpers (declarations only).
 * The last argument of parse_rock_ridge_inode() is the "relocated" flag.
 */
extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);

/* Joliet and Acorn file name helpers (declarations only). */
int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *);
int get_acorn_filename(struct iso_directory_record *, char *, struct inode *);

/* Lookup and block access entry points (declarations only). */
extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int flags);
extern struct buffer_head *isofs_bread(struct inode *, sector_t);
extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);

/*
 * Obtain the inode identified by (@block, @offset) on @sb.  Declared here
 * only; the inline wrappers isofs_iget() and isofs_iget_reloc() in this
 * header call it with @relocated set to 0 and 1 respectively.
 */
struct inode *__isofs_iget(struct super_block *sb,
                           unsigned long block,
                           unsigned long offset,
                           int relocated);

/* Convenience wrapper: __isofs_iget() with the relocated flag cleared. */
static inline struct inode *isofs_iget(struct super_block *sb,
                                       unsigned long block,
                                       unsigned long offset)
{
        const int relocated = 0;

        return __isofs_iget(sb, block, offset, relocated);
}

/* Convenience wrapper: __isofs_iget() with the relocated flag set. */
static inline struct inode *isofs_iget_reloc(struct super_block *sb,
                                             unsigned long block,
                                             unsigned long offset)
{
        const int relocated = 1;

        return __isofs_iget(sb, block, offset, relocated);
}

/* Because the inode number is no longer relevant to finding the
 * underlying meta-data for an inode, we are free to choose a more
 * convenient 32-bit number as the inode number.  The inode numbering
 * scheme was recommended by Sergey Vlasov and Eric Lammerts. */
static inline unsigned long isofs_get_ino(unsigned long block,
                                          unsigned long offset,
                                          unsigned long bufbits)
{
        /* The block number occupies the bits above (bufbits - 5). */
        unsigned long block_part = block << (bufbits - 5);
        /* The offset contributes with its low five bits dropped. */
        unsigned long offset_part = offset >> 5;

        return block_part | offset_part;
}

/* Every directory can have many redundant directory entries scattered
 * throughout the directory tree.  First there is the directory entry
 * with the name of the directory stored in the parent directory.
 * Then, there is the "." directory entry stored in the directory
 * itself.  Finally, there are possibly many ".." directory entries
 * stored in all the subdirectories.
 *
 * In order for the NFS get_parent() method to work and for the
 * general consistency of the dcache, we need to make sure the
 * "i_iget5_block" and "i_iget5_offset" all point to exactly one of
 * the many redundant entries for each directory.  We normalize the
 * block and offset by always making them point to the "."  directory.
 *
 * Notice that we do not use the entry for the directory with the name
 * that is located in the parent directory.  Even though choosing this
 * first directory is more natural, it is much easier to find the "."
 * entry in the NFS get_parent() method because it is implicitly
 * encoded in the "extent + ext_attr_length" fields of _all_ the
 * redundant entries for the directory.  Thus, it can always be
 * reached regardless of which directory entry you have in hand.
 *
 * This works because the "." entry is simply the first directory
 * record when you start reading the file that holds all the directory
 * records, and this file starts at "extent + ext_attr_length" blocks.
 * Because the "." entry is always the first entry listed in the
 * directories file, the normalized "offset" value is always 0.
 *
 * You should pass the directory entry in "de".  On return, "block"
 * and "offset" will hold normalized values.  Only directories are
 * affected making it safe to call even for non-directory file
 * types. */
static inline void
isofs_normalize_block_and_offset(struct iso_directory_record* de,
                                 unsigned long *block,
                                 unsigned long *offset)
{
        unsigned long extent_start;
        unsigned long ext_attr_blocks;

        /* Bit value 2 in the first flags byte marks a directory record. */
        if (!(de->flags[0] & 2))
                return;

        /* The "." record is the first one in the directory's own data. */
        *offset = 0;
        extent_start = (unsigned long)isonum_733(de->extent);
        ext_attr_blocks = (unsigned long)isonum_711(de->ext_attr_length);
        *block = extent_start + ext_attr_blocks;
}

/*
 * Operation tables declared here as extern const objects; their
 * definitions are not visible in this header.
 */
extern const struct inode_operations isofs_dir_inode_operations;
extern const struct file_operations isofs_dir_operations;
extern const struct address_space_operations isofs_symlink_aops;
extern const struct export_operations isofs_export_ops;





















































































































































































































































































    3 




















































































































































    1 




















    3 

    1 

















    3 




    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
// SPDX-License-Identifier: GPL-2.0+
/*
 * ext4_jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
 *
 * Ext4-specific journaling extensions.
 */

#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H

#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"

/*
 * The s_journal pointer of the ext4 super-block info for the filesystem
 * that owns @inode.  ext4_journal_blocks_per_page() compares it against
 * NULL, so it may be unset.
 */
#define EXT4_JOURNAL(inode)        (EXT4_SB((inode)->i_sb)->s_journal)

/* Define the number of blocks we need to account to a transaction to
 * modify one block of data.
 *
 * We may have to touch one inode, one bitmap buffer, up to three
 * indirection blocks, the group and superblock summaries, and the data
 * block to complete the transaction.
 *
 * For extents-enabled fs we may have to allocate and modify up to
 * 5 levels of tree, data block (for each of these we need bitmap + group
 * summaries), root which is stored in the inode, sb
 *
 * The macro evaluates to 20U when ext4_has_feature_extents(sb) is true
 * and to 8U otherwise.
 */

#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
        (ext4_has_feature_extents(sb) ? 20U : 8U)

/* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries (2 + 2 + 2 = 6), in
 * addition to the inode and the superblock, which are already accounted
 * for. */

#define EXT4_XATTR_TRANS_BLOCKS                6U

/* Define the minimum size for a transaction which modifies data.  This
 * needs to take into account the fact that we may end up modifying two
 * quota files too (one for the group, one for the user quota).  The
 * superblock only gets updated once, of course, so don't bother
 * counting that again for the quota updates.
 *
 * NOTE(review): the reason for the "- 2" term is not stated here --
 * confirm what it removes before changing any of the constants. */

#define EXT4_DATA_TRANS_BLOCKS(sb)        (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/*
 * Define the number of metadata blocks we need to account to modify data.
 *
 * This includes the super block, inode block, quota blocks and xattr
 * blocks.  It is computed as EXT4_XATTR_TRANS_BLOCKS plus
 * EXT4_MAXQUOTAS_TRANS_BLOCKS(sb).
 */
#define EXT4_META_TRANS_BLOCKS(sb)        (EXT4_XATTR_TRANS_BLOCKS + \
                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/* Define an arbitrary limit for the amount of data we will anticipate
 * writing to any given transaction.  For unbounded transactions such as
 * write(2) and truncate(2) we can write more than this, but we always
 * start off at the maximum transaction size and grow the transaction
 * optimistically as we go. */

#define EXT4_MAX_TRANS_DATA                64U

/* We break up a large truncate or write transaction once the handle's
 * buffer credits get this low; at that point we need either to extend
 * the transaction or to start a new one.  Reserve enough space here for
 * inode, bitmap, superblock, group and indirection updates for at least
 * one block, plus two quota updates.  Quota allocations are not
 * needed. */

#define EXT4_RESERVE_TRANS_BLOCKS        12U

/*
 * Number of credits needed if we need to insert an entry into a
 * directory.  For each new index block, we need 4 blocks (old index
 * block, new index block, bitmap block, bg summary).  For normal
 * htree directories there are 2 levels; if the largedir feature is
 * enabled it's 3 levels (3 * 4 = 12).
 */
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS        12U

#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
 * allocated so we need to update only data block.  Evaluates to 0 unless
 * the QUOTA mount option or the quota feature is set on @sb. */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)

/* Delete counterpart of EXT4_QUOTA_INIT_BLOCKS: same formula with the
 * DQUOT_DEL_* constants. */
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
#else
/* Without CONFIG_QUOTA no credits are ever needed for quota. */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
/* Per-quota amounts above multiplied by EXT4_MAXQUOTAS. */
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

/*
 * Ext4 handle operation types -- for logging purposes
 *
 * The values are consecutive from 0; EXT4_HT_MAX is one past the last
 * real type and so equals the number of types.  These are passed as the
 * "type" argument of the ext4_journal_start*() macros in this header.
 */
#define EXT4_HT_MISC             0
#define EXT4_HT_INODE            1
#define EXT4_HT_WRITE_PAGE       2
#define EXT4_HT_MAP_BLOCKS       3
#define EXT4_HT_DIR              4
#define EXT4_HT_TRUNCATE         5
#define EXT4_HT_QUOTA            6
#define EXT4_HT_RESIZE           7
#define EXT4_HT_MIGRATE          8
#define EXT4_HT_MOVE_EXTENTS     9
#define EXT4_HT_XATTR           10
#define EXT4_HT_EXT_CONVERT     11
#define EXT4_HT_MAX             12

/**
 *   struct ext4_journal_cb_entry - Base structure for callback information.
 *
 *   This struct is a 'seed' structure for use with your own callback
 *   structs. If you are using callbacks you must allocate one of these
 *   or another struct of your own definition which has this struct
 *   as its first element and pass it to ext4_journal_callback_add().
 */
struct ext4_journal_cb_entry {
        /* list information for other callbacks attached to the same handle */
        struct list_head jce_list;

        /*  Function to call with this callback structure */
        void (*jce_func)(struct super_block *sb,
                         struct ext4_journal_cb_entry *jce, int error);

        /* user data goes here */
};

/**
 * ext4_journal_callback_add: add a function to call after transaction commit
 * @handle: active journal transaction handle to register callback on
 * @func: callback function to call after the transaction has committed:
 *        @sb: superblock of current filesystem for transaction
 *        @jce: returned journal callback data
 *        @rc: journal state at commit (0 = transaction committed properly)
 * @jce: journal callback data (internal and function private data struct)
 *
 * The registered function will be called in the context of the journal thread
 * after the transaction for which the handle was created has completed.
 *
 * No locks are held when the callback function is called, so it is safe to
 * call blocking functions from within the callback, but the callback should
 * not block or run for too long, or the filesystem will be blocked waiting for
 * the next transaction to commit. No journaling functions can be used, or
 * there is a risk of deadlock.
 *
 * There is no guaranteed calling order of multiple registered callbacks on
 * the same transaction.
 */
static inline void _ext4_journal_callback_add(handle_t *handle,
                        struct ext4_journal_cb_entry *jce)
{
        /*
         * Queue @jce at the tail of the private list of the transaction
         * that @handle belongs to.  No locking is done here;
         * ext4_journal_callback_add() wraps this call in s_md_lock.
         */
        struct list_head *cb_list = &handle->h_transaction->t_private_list;

        list_add_tail(&jce->jce_list, cb_list);
}

static inline void ext4_journal_callback_add(handle_t *handle,
                        void (*func)(struct super_block *sb,
                                     struct ext4_journal_cb_entry *jce,
                                     int rc),
                        struct ext4_journal_cb_entry *jce)
{
        struct ext4_sb_info *sbi;

        /* The journal's j_private leads back to the ext4 super-block info. */
        sbi = EXT4_SB(handle->h_transaction->t_journal->j_private);

        /* Fill in the callback before the entry is linked onto the list. */
        jce->jce_func = func;

        /* The list insertion itself is done under s_md_lock. */
        spin_lock(&sbi->s_md_lock);
        _ext4_journal_callback_add(handle, jce);
        spin_unlock(&sbi->s_md_lock);
}


/**
 * ext4_journal_callback_try_del: delete a registered callback
 * @handle: active journal transaction handle on which callback was registered
 * @jce: registered journal callback entry to unregister
 *
 * Return true if @jce was still linked on a list when this was called,
 * false if its list head was already empty.  Either way the entry is
 * passed to list_del_init() under s_md_lock.
 */
static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
{
        struct ext4_sb_info *sbi =
                        EXT4_SB(handle->h_transaction->t_journal->j_private);
        bool was_linked;

        spin_lock(&sbi->s_md_lock);
        /* Sample the linked state before unlinking, inside the lock. */
        was_linked = !list_empty(&jce->jce_list);
        list_del_init(&jce->jce_list);
        spin_unlock(&sbi->s_md_lock);

        return was_linked;
}

/* Inode dirtying helpers: declarations only, defined outside this header. */
int
ext4_mark_iloc_dirty(handle_t *handle,
                     struct inode *inode,
                     struct ext4_iloc *iloc);

/*
 * On success, We end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                        struct ext4_iloc *iloc);

/*
 * ext4_mark_inode_dirty() forwards to __ext4_mark_inode_dirty(), adding
 * the caller's __func__ and __LINE__ as the func/line arguments.
 */
#define ext4_mark_inode_dirty(__h, __i)                                        \
                __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line);

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc);
/*
 * Wrapper functions with which ext4 calls into JBD.
 *
 * Each __ext4_*() function takes the call site as its first two
 * arguments (where, line).  The ext4_*() macros further down supply
 * __func__ and __LINE__ for those and pass the remaining arguments
 * through unchanged.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh);

int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr);

int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct buffer_head *bh);

int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh);

int __ext4_handle_dirty_super(const char *where, unsigned int line,
                              handle_t *handle, struct super_block *sb);

/* Call-site capturing front ends for the functions declared above. */
#define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
                      (bh), (block_nr))
#define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))
#define ext4_handle_dirty_super(handle, sb) \
        __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))

/*
 * Back ends of the handle start/stop macros in this header (declarations
 * only).  __ext4_journal_start_sb() is reached through
 * ext4_journal_start_sb() and __ext4_journal_start();
 * __ext4_journal_stop() is reached through ext4_journal_stop().
 */
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
                                  int type, int blocks, int rsv_blocks,
                                  int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);

/*
 * Threshold used by ext4_handle_valid(): a handle whose pointer value is
 * below this is not treated as a real journal handle.
 * NOTE(review): the name suggests such values are reference counts used
 * when there is no journal -- confirm against the implementation.
 */
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

/*
 * Tell whether a properly allocated @handle is using a journal: returns 0
 * when its pointer value is below EXT4_NOJOURNAL_MAX_REF_COUNT and 1
 * otherwise.  Note: do not use this for NULL handles.
 */
static inline int ext4_handle_valid(handle_t *handle)
{
        return (unsigned long)handle >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

/* Set h_sync on @handle; does nothing for handles that are not valid. */
static inline void ext4_handle_sync(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return;

        handle->h_sync = 1;
}

/*
 * Return is_handle_aborted(@handle) for a valid handle, and 0 for a
 * handle that ext4_handle_valid() rejects.
 */
static inline int ext4_handle_is_aborted(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return is_handle_aborted(handle);
}

/*
 * Revoke credits needed to free @blocks metadata blocks on @sb: the block
 * count scaled by the filesystem's cluster ratio.
 */
static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
                                                    int blocks)
{
        /* Each freed metadata block can end up freeing a whole cluster. */
        return EXT4_SB(sb)->s_cluster_ratio * blocks;
}

/*
 * Default revoke credit request used by the ext4_journal_start*() macros:
 * the amount needed for freeing eight metadata blocks on @sb.
 */
static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
{
        const int nr_metadata_blocks = 8;

        return ext4_free_metadata_revoke_credits(sb, nr_metadata_blocks);
}

/*
 * Handle-start front ends.  All of them pass __LINE__ of the call site.
 *
 * ext4_journal_start_sb():           super-block based, no reserved
 *                                    blocks, default revoke credits.
 * ext4_journal_start():              inode based, no reserved blocks,
 *                                    default revoke credits.
 * ext4_journal_start_with_reserve(): inode based, caller-supplied
 *                                    reserved blocks, default revoke
 *                                    credits.
 * ext4_journal_start_with_revoke():  inode based, no reserved blocks,
 *                                    caller-supplied revoke credits.
 *
 * "Default revoke credits" is ext4_trans_default_revoke_credits() of the
 * relevant super block.
 */
#define ext4_journal_start_sb(sb, type, nblocks)                        \
        __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0,        \
                                ext4_trans_default_revoke_credits(sb))

#define ext4_journal_start(inode, type, nblocks)                        \
        __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,        \
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
        __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
        __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,        \
                             (revoke_creds))

/*
 * Inode flavour of __ext4_journal_start_sb(): starts the handle on the
 * super block that owns @inode and forwards every other argument as is.
 */
static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
                                             int blocks, int rsv_blocks,
                                             int revoke_creds)
{
        struct super_block *sb = inode->i_sb;

        return __ext4_journal_start_sb(sb, line, type, blocks, rsv_blocks,
                                       revoke_creds);
}

/* Stop @handle, recording the caller's __func__ and __LINE__. */
#define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))

/* Front end of __ext4_journal_start_reserved() that adds __LINE__. */
#define ext4_journal_start_reserved(handle, type) \
        __ext4_journal_start_reserved((handle), __LINE__, (type))

/* Declaration only; the definition is not part of this header. */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type);

/* Thin ext4 name for journal_current_handle(); returns its result as is. */
static inline handle_t *ext4_journal_current_handle(void)
{
        handle_t *handle = journal_current_handle();

        return handle;
}

/*
 * Extend @handle by @nblocks credits and @revoke revoke credits through
 * jbd2_journal_extend().  A handle that is not valid is left alone and 0
 * is returned.
 */
static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_extend(handle, nblocks, revoke);
}

/*
 * Restart @handle with @nblocks credits and @revoke revoke credits through
 * jbd2__journal_restart(), allocating with GFP_NOFS.  A handle that is not
 * valid is left alone and 0 is returned.
 */
static inline int ext4_journal_restart(handle_t *handle, int nblocks,
                                       int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
}

/*
 * First step of ext4_journal_ensure_credits_fn() below (declaration only).
 * The macro treats a result <= 0 as final and a result > 0 as "the
 * transaction must be restarted".
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred);


/*
 * Ensure @handle has at least @check_cred credits available. If not,
 * transaction will be extended or restarted to contain at least @extend_cred
 * credits. Before restarting transaction @fn is executed to allow for cleanup
 * before the transaction is restarted.
 *
 * The return value is < 0 in case of error, 0 in case the handle has enough
 * credits or transaction extension succeeded, 1 in case transaction had to be
 * restarted.
 *
 * Implementation notes:
 *  - This is a statement expression with a local label, so it can be used
 *    wherever an int expression is expected.
 *  - @fn is an expression, evaluated only when a restart is needed; a
 *    negative value from it is returned without restarting.
 *  - A 0 result from ext4_journal_restart() is turned into 1 so that the
 *    caller can tell a restart happened.
 *  - The macro declares a local named "err"; an @fn expression that
 *    mentions a caller variable of that name would see the macro's local
 *    instead.
 */
#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
                                       revoke_cred, fn) \
({                                                                        \
        __label__ __ensure_end;                                                \
        int err = __ext4_journal_ensure_credits((handle), (check_cred),        \
                                        (extend_cred), (revoke_cred));        \
                                                                        \
        if (err <= 0)                                                        \
                goto __ensure_end;                                        \
        err = (fn);                                                        \
        if (err < 0)                                                        \
                goto __ensure_end;                                        \
        err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
        if (err == 0)                                                        \
                err = 1;                                                \
__ensure_end:                                                                \
        err;                                                                \
})

/*
 * Ensure given handle has at least requested amount of credits available,
 * possibly restarting transaction if needed.  @credits is used both as the
 * amount to check for and as the amount to extend or restart with, and
 * @revoke_creds is passed through as the revoke credit request.  No cleanup
 * step runs before a restart (the fn argument is the constant 0).
 *
 * NOTE(review): callers wanting the usual revoke headroom presumably pass
 * ext4_trans_default_revoke_credits(sb) as @revoke_creds -- confirm.
 *
 * Returns the value of ext4_journal_ensure_credits_fn().
 */
static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
                                              int revoke_creds)
{
        int ret;

        ret = ext4_journal_ensure_credits_fn(handle, credits, credits,
                                             revoke_creds, 0);
        return ret;
}

/*
 * Return jbd2_journal_blocks_per_page(@inode), or 0 when the filesystem
 * owning @inode has no journal.
 */
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
        if (EXT4_JOURNAL(inode) == NULL)
                return 0;

        return jbd2_journal_blocks_per_page(inode);
}

/*
 * Return jbd2_journal_force_commit(@journal), or 0 when @journal is NULL.
 */
static inline int ext4_journal_force_commit(journal_t *journal)
{
        if (!journal)
                return 0;

        return jbd2_journal_force_commit(journal);
}

/*
 * Pass the byte range (@start_byte, @length) of @inode to
 * jbd2_journal_inode_ranged_write() together with the inode's jinode.
 * Returns 0 without doing anything when @handle is not valid.
 */
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode,
                                               start_byte, length);
}

/*
 * Pass the byte range (@start_byte, @length) of @inode to
 * jbd2_journal_inode_ranged_wait() together with the inode's jinode.
 * Returns 0 without doing anything when @handle is not valid.
 */
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode,
                                              start_byte, length);
}

/*
 * Record the transaction id of @handle in @inode's i_sync_tid and, when
 * @datasync is non-zero, in i_datasync_tid as well.  Nothing is recorded
 * for a handle that is not valid or that has been aborted.
 */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
                                                 struct inode *inode,
                                                 int datasync)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!ext4_handle_valid(handle) || is_handle_aborted(handle))
                return;

        ei->i_sync_tid = handle->h_transaction->t_tid;
        if (datasync)
                ei->i_datasync_tid = handle->h_transaction->t_tid;
}

/* super.c */
int ext4_force_commit(struct super_block *sb);

/*
 * Ext4 inode journal modes
 *
 * Distinct single-bit values; ext4_should_journal_data(),
 * ext4_should_order_data() and ext4_should_writeback_data() test them
 * against the value returned by ext4_inode_journal_mode().
 */
#define EXT4_INODE_JOURNAL_DATA_MODE        0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE        0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE        0x04 /* writeback data mode */

/* Declaration only; returns a value built from the mode bits above. */
int ext4_inode_journal_mode(struct inode *inode);

/*
 * Non-zero when @inode's journal mode has the JOURNAL_DATA bit set.  The
 * result is the masked bit itself, not a normalised 0/1.
 */
static inline int ext4_should_journal_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_JOURNAL_DATA_MODE;
}

/*
 * Non-zero when @inode's journal mode has the ORDERED_DATA bit set.  The
 * result is the masked bit itself, not a normalised 0/1.
 */
static inline int ext4_should_order_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_ORDERED_DATA_MODE;
}

/*
 * Non-zero when @inode's journal mode has the WRITEBACK_DATA bit set.  The
 * result is the masked bit itself, not a normalised 0/1.
 */
static inline int ext4_should_writeback_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
}

/*
 * Revoke credits needed to free @blocks data blocks of @inode.
 *
 * Returns 0 when the filesystem is mounted with the JOURNAL_DATA data
 * mode, and also when @inode does not journal its data.  Otherwise the
 * result is @blocks plus room for a partial cluster at each end of the
 * extent.
 */
static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
{
        /* The mount-wide check comes first; the per-inode check second. */
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            !ext4_should_journal_data(inode))
                return 0;

        /*
         * Data blocks in one extent are contiguous, just account for partial
         * clusters at extent boundaries
         */
        return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}

/*
 * This function controls whether or not we should try to go down the
 * dioread_nolock code paths, which makes it safe to avoid taking
 * i_mutex for direct I/O reads.  This only works for extent-based
 * files, and it doesn't work if data journaling is enabled, since the
 * dioread_nolock code uses b_private to pass information back to the
 * I/O completion handler, and this conflicts with the jbd's use of
 * b_private.
 */
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
        /*
         * Every condition must hold, checked in this order:
         *  - the DIOREAD_NOLOCK mount option is set,
         *  - the inode is a regular file,
         *  - the inode uses extents,
         *  - the inode's data is not journalled,
         *  - the DELALLOC mount option is set (temporary fix to prevent
         *    generic/422 test failures).
         * The result is 1 when all of them hold and 0 otherwise.
         */
        return test_opt(inode->i_sb, DIOREAD_NOLOCK) &&
               S_ISREG(inode->i_mode) &&
               ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
               !ext4_should_journal_data(inode) &&
               test_opt(inode->i_sb, DELALLOC);
}

#endif        /* _EXT4_JBD2_H */






















































    3 













    3 








    3 


    3 




























    3 




    3 


    3 
    3 







































































































































































































































































































    3 

    3 
    3 

    3 





































































































































    3 










    3 



    3 
    3 



    3 






    1 



















    1 

    1 



    1 




















    1 


    3 
    3 





    3 






















    1 














    1 
















    1 














    1 














    1 




    1 






    1 




    1 

    1 








    1 
    1 




    1 








    1 



    3 


    1 



























































































    2 


    2 


    2 





    2 


    2 


    2 









































    2 


    2 
















    2 
    2 







    2 

    2 


    2 















    2 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 














    1 








    1 
    1 

    1 
    1 

    1 

















    1 


    1 
    1 











    1 








    1 








    1 

    1 





    1 








    1 




    1 









    1 












    1 





    1 




    1 

















    1 


    1 


    1 






























    1 


    1 

































































































































































































    1 












    1 


    1 











































































































































































































































































































































































































































    1 





    1 



    1 











    1 





















































































































































    3 




    3 



















    3 





































































































    3 
    3 




















    3 










    3 











    2 



    2 









    1 









    2 








    2 

    2 

    2 





    2 


    2 

    2 

    2 

    2 










































































    3 








































































































































































































































































































































































































    3 







    3 





































































































    3 


























    3 





    1 




    3 









    3 

    3 
    3 
    3 
    3 

    3 



    3 
    3 

    3 


    3 





    3 

    1 











    3 


    3 


    3 

    3 

    3 
    3 




    3 


    3 
    3 
    3 

    3 






    3 















    3 


















































































































































































































































































































































































































































































    1 
















    1 
















    1 




    1 


    1 


    1 






































    1 











    3 



    3 










    3 



    3 














    3 


    3 


    3 





    3 
































































































































































    3 




    3 



    3 



    3 




    3 
























    1 

    1 



    1 






























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;
        __u16 dummy_csum = 0;
        int offset = offsetof(struct ext4_inode, i_checksum_lo);
        unsigned int csum_size = sizeof(dummy_csum);

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
        offset += csum_size;
        csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                           EXT4_GOOD_OLD_INODE_SIZE - offset);

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                offset = offsetof(struct ext4_inode, i_checksum_hi);
                csum = ext4_chksum(sbi, csum, (__u8 *)raw +
                                   EXT4_GOOD_OLD_INODE_SIZE,
                                   offset - EXT4_GOOD_OLD_INODE_SIZE);
                if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
                                           csum_size);
                        offset += csum_size;
                }
                csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                                   EXT4_INODE_SIZE(inode->i_sb) - offset);
        }

        return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
                                  struct ext4_inode_info *ei)
{
        __u32 provided, calculated;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX) ||
            !ext4_has_metadata_csum(inode->i_sb))
                return 1;

        provided = le16_to_cpu(raw->i_checksum_lo);
        calculated = ext4_inode_csum(inode, raw, ei);
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
        else
                calculated &= 0xFFFF;

        return provided == calculated;
}

void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei)
{
        __u32 csum;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX) ||
            !ext4_has_metadata_csum(inode->i_sb))
                return;

        csum = ext4_inode_csum(inode, raw, ei);
        raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}

/*
 * Start an ordered-mode truncate of @inode down to @new_size.
 *
 * Returns 0 when the inode has no jbd2 inode attached, otherwise the result
 * of jbd2_journal_begin_ordered_truncate().
 */
static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode);

        trace_ext4_begin_ordered_truncate(inode, new_size);

        /*
         * A NULL jinode means the file was never opened for writing, so
         * there are no outstanding writes to flush and nothing for
         * jbd2_journal_begin_ordered_truncate() to do.
         */
        if (jinode)
                return jbd2_journal_begin_ordered_truncate(
                                EXT4_JOURNAL(inode), jinode, new_size);

        return 0;
}

static void ext4_invalidatepage(struct page *page, unsigned int offset,
                                unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents);

/*
 * Test whether an inode is a fast symlink, i.e. a symlink whose target is
 * stored in ext4_inode_info->i_data.
 *
 * With the ea_inode feature the test is based on i_size; without it the
 * test is based on the inode owning no blocks other than an xattr block.
 * Returns non-zero for a fast symlink, 0 otherwise.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks;

        if (ext4_has_feature_ea_inode(inode->i_sb))
                return S_ISLNK(inode->i_mode) && inode->i_size &&
                       (inode->i_size < EXT4_N_BLOCKS * 4);

        if (ext4_has_inline_data(inode))
                return 0;

        /* Sectors (512-byte units) taken by the xattr block, if any. */
        ea_blocks = 0;
        if (EXT4_I(inode)->i_file_acl)
                ea_blocks = EXT4_CLUSTER_SIZE(inode->i_sb) >> 9;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * Called at the last iput() to evict an inode.
 *
 * If the inode still has links (i_nlink != 0), only its page cache is torn
 * down and the in-core inode is cleared.  Otherwise a journal handle is
 * started, the inode is truncated to zero, its xattr references are removed
 * and the on-disk inode is freed.  Any failure on the deletion path ends up
 * at the no_delete label, which only clears the in-core inode.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;
        /*
         * Credits for final inode cleanup and freeing:
         * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
         * (xattr block freeing), bitmap, group descriptor (inode freeing)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
        /* Set when we took sb_start_intwrite() and must drop it on exit. */
        bool freeze_protected = false;

        trace_ext4_evict_inode(inode);

        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                ext4_evict_ea_inode(inode);
        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
                 * journal. So although mm thinks everything is clean and
                 * ready for reaping the inode might still have some pages to
                 * write in the running transaction or waiting to be
                 * checkpointed. Thus calling jbd2_journal_invalidatepage()
                 * (via truncate_inode_pages()) to discard these buffers can
                 * cause data loss. Also even if we did not discard these
                 * buffers, we would have no way to find them after the inode
                 * is reaped and thus user could see stale data if he tries to
                 * read them before the transaction is checkpointed. So be
                 * careful and force everything to disk here... We use
                 * ei->i_datasync_tid to store the newest transaction
                 * containing inode's data.
                 *
                 * Note that directories do not have this problem because they
                 * don't use page cache.
                 */
                if (inode->i_ino != EXT4_JOURNAL_INO &&
                    ext4_should_journal_data(inode) &&
                    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
                    inode->i_data.nrpages) {
                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

                        jbd2_complete_transaction(journal, commit_tid);
                        filemap_write_and_wait(&inode->i_data);
                }
                truncate_inode_pages_final(&inode->i_data);

                goto no_delete;
        }

        /* Bad inodes skip the deletion path entirely. */
        if (is_bad_inode(inode))
                goto no_delete;
        dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);

        /*
         * For inodes with journalled data, transaction commit could have
         * dirtied the inode. And for inodes with dioread_nolock, unwritten
         * extents converting worker could merge extents and also have dirtied
         * the inode. Flush worker is ignoring it because of I_FREEING flag but
         * we still need to remove the inode from the writeback lists.
         */
        if (!list_empty_careful(&inode->i_io_list))
                inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
         * protection against it. When we are in a running transaction though,
         * we are already protected against freezing and we cannot grab further
         * protection due to lock ordering constraints.
         */
        if (!ext4_journal_current_handle()) {
                sb_start_intwrite(inode->i_sb);
                freeze_protected = true;
        }

        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

        /*
         * Block bitmap, group descriptor, and inode are accounted in both
         * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
         */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
                         ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        /*
         * Set inode->i_size to 0 before calling ext4_truncate(). We need
         * special handling of symlinks here because i_size is used to
         * determine whether ext4_inode_info->i_data contains symlink data or
         * block mappings. Setting i_size to 0 will remove its fast symlink
         * status. Erase i_data so that it becomes a valid empty block map.
         */
        if (ext4_inode_is_fast_symlink(inode))
                memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
                        ext4_error_err(inode->i_sb, -err,
                                       "couldn't truncate inode %lu (err %d)",
                                       inode->i_ino, err);
                        goto stop_handle;
                }
        }

        /* Remove xattr references. */
        err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
                                      extra_credits);
        if (err) {
                ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
                /*
                 * Shared error exit: also reached by goto from the
                 * mark-dirty and truncate failures above.  Stops the handle,
                 * drops the in-core orphan entry and releases everything
                 * taken so far before falling back to no_delete.
                 */
stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime        = (__u32)ktime_get_real_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        if (freeze_protected)
                sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
no_delete:
        /*
         * Check whether something else accidentally dirtied the inode being
         * evicted, which would probably cause inode use-after-free issues
         * later.
         */
        WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

        /* The inode is still on a fast-commit list: mark fc ineligible. */
        if (!list_empty(&EXT4_I(inode)->i_fc_list))
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
        return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Account for @used reserved data clusters having been consumed: drop them
 * from the per-inode reservation and the dirty-clusters counter, then either
 * claim (@quota_claim != 0) or release the matching quota reservation.
 *
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);

        /* Never release more than what is actually reserved. */
        if (unlikely(used > ei->i_reserved_data_blocks)) {
                ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                used = ei->i_reserved_data_blocks;
        }

        /* Per-inode reservation and the filesystem-wide dirty counter. */
        ei->i_reserved_data_blocks -= used;
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

        spin_unlock(&ei->i_block_reservation_lock);

        /*
         * Quota side.  When the caller does not want the quota claimed, we
         * did fallocate with an offset that is already delayed allocated,
         * so on delayed allocated writeback the quota for the fallocated
         * blocks must not be claimed again; just drop the reservation.
         */
        if (!quota_claim)
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
        else
                dquot_claim_block(inode, EXT4_C2B(sbi, used));

        /*
         * Preallocations can be discarded only once every pending block
         * allocation is done and nobody has the inode open for writing.
         */
        if (ei->i_reserved_data_blocks != 0)
                return;
        if (inode_is_open_for_write(inode))
                return;

        ext4_discard_preallocations(inode, 0);
}

/*
 * Validate the physical block range described by @map for @inode.
 *
 * The journal's own inode is exempt from the check.  Returns 0 when the
 * range is acceptable, or -EFSCORRUPTED after reporting the error with the
 * caller-supplied @func and @line.
 */
static int __check_block_validity(struct inode *inode, const char *func,
                                unsigned int line,
                                struct ext4_map_blocks *map)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

        if (journal && inode == journal->j_inode)
                return 0;

        if (ext4_inode_block_valid(inode, map->m_pblk, map->m_len))
                return 0;

        ext4_error_inode(inode, func, line, map->m_pblk,
                         "lblock %lu mapped to illegal pblock %llu "
                         "(length %d)", (unsigned long) map->m_lblk,
                         map->m_pblk, map->m_len);
        return -EFSCORRUPTED;
}

/*
 * Zero out @len blocks starting at physical block @pblk (logical block
 * @lblk of @inode).  Encrypted regular files go through
 * fscrypt_zeroout_range(); everything else uses sb_issue_zeroout(), whose
 * positive return values are folded to 0.
 *
 * Returns 0 on success or a negative error.
 */
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                       ext4_lblk_t len)
{
        int err;

        if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        err = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);

        return err > 0 ? 0 : err;
}

/*
 * Convenience wrapper around __check_block_validity() that passes the
 * calling function's name and line number for the error report.
 */
#define check_block_validity(inode, map)        \
        __check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
/*
 * Debug helper (ES_AGGRESSIVE_TEST only): redo the block lookup for @map
 * under i_data_sem and print a message if the result disagrees with the
 * mapping in @es_map.
 */
static void ext4_map_blocks_es_recheck(handle_t *handle,
                                       struct inode *inode,
                                       struct ext4_map_blocks *es_map,
                                       struct ext4_map_blocks *map,
                                       int flags)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int retval;

        map->m_flags = 0;

        /*
         * The two results may legitimately differ: the extent status tree
         * is looked up without holding i_data_sem, so an unwritten extent
         * could have been converted in between (e.g. xfstests #223 with
         * dioread_nolock enabled).
         */
        down_read(&ei->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                retval = ext4_ext_map_blocks(handle, inode, map, 0);
        else
                retval = ext4_ind_map_blocks(handle, inode, map, 0);
        up_read(&ei->i_data_sem);

        /*
         * m_len is deliberately not compared: extents get collapsed in the
         * status tree, so the lengths need not be equal.
         */
        if (es_map->m_lblk == map->m_lblk &&
            es_map->m_flags == map->m_flags &&
            es_map->m_pblk == map->m_pblk)
                return;

        printk("ES cache assertion failed for inode: %lu "
               "es_cached ex [%d/%d/%llu/%x] != "
               "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
               inode->i_ino, es_map->m_lblk, es_map->m_len,
               es_map->m_pblk, es_map->m_flags, map->m_lblk,
               map->m_len, map->m_pblk, map->m_flags,
               retval, flags);
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * Look up (never allocate) the mapping for @map and, when blocks are found,
 * record the result in the extent status tree.
 *
 * Returns the number of mapped blocks, 0 if nothing is mapped, or a
 * negative error from the underlying lookup.
 */
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
                                 struct ext4_map_blocks *map)
{
        unsigned int es_status = EXTENT_STATUS_WRITTEN;
        int nr_mapped;

        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                nr_mapped = ext4_ext_map_blocks(handle, inode, map, 0);
        else
                nr_mapped = ext4_ind_map_blocks(handle, inode, map, 0);

        if (nr_mapped <= 0)
                return nr_mapped;

        /* The lookup is expected to have set m_len to what it returned. */
        if (unlikely(nr_mapped != map->m_len)) {
                ext4_warning(inode->i_sb,
                             "ES len assertion failed for inode "
                             "%lu: retval %d != map->m_len %d",
                             inode->i_ino, nr_mapped, map->m_len);
                WARN_ON(1);
        }

        if (map->m_flags & EXT4_MAP_UNWRITTEN)
                es_status = EXTENT_STATUS_UNWRITTEN;
        ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                              map->m_pblk, es_status);

        return nr_mapped;
}

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in @map and marks it mapped.
 *
 * If the file is extent based, it calls ext4_ext_map_blocks(); otherwise it
 * calls ext4_ind_map_blocks() to handle indirect-mapping based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.  If
 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
 * is marked as unwritten. If create == 1, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns a negative error in case of failure; in particular
 * -EFSCORRUPTED when map->m_lblk is not below EXT_MAX_BLOCKS.  A request
 * longer than INT_MAX blocks is clamped to INT_MAX.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
{
        struct extent_status es;
        int retval;
        int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        /* Keep a copy of the request for the recheck after a cache hit. */
        memcpy(&orig_map, map, sizeof(*map));
#endif

        map->m_flags = 0;
        ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
                  flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
         */
        if (unlikely(map->m_len > INT_MAX))
                map->m_len = INT_MAX;

        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        /* Cached mapping: translate and trim to the request. */
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
                        map->m_flags |= ext4_es_is_written(&es) ?
                                        EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
                        /* No physical block: report the length, return 0. */
                        map->m_pblk = 0;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                        retval = 0;
                } else {
                        BUG();
                }
#ifdef ES_AGGRESSIVE_TEST
                /*
                 * The cached result in @map is compared against a fresh
                 * lookup performed into orig_map.
                 */
                ext4_map_blocks_es_recheck(handle, inode, map,
                                           &orig_map, flags);
#endif
                goto found;
        }

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /* Record the looked-up mapping in the extent status tree. */
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

        /* Both the cache hit and the lookup under i_data_sem end up here. */
found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Return if the blocks have already been allocated.
         *
         * Note that if blocks have been preallocated
         * ext4_ext_get_block() returns the create = 0
         * with buffer head unmapped.
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                /*
                 * If we need to convert extent to unwritten
                 * we continue and do the actual work in
                 * ext4_ext_map_blocks()
                 */
                if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
                        return retval;

        /*
         * Here we clear m_flags because after allocating a new extent,
         * it will be set again.
         */
        map->m_flags &= ~EXT4_MAP_FLAGS;

        /*
         * Allocating new blocks and/or writing to an unwritten extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_block()
         * with create == 1 flag.
         */
        down_write(&EXT4_I(inode)->i_data_sem);

        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, flags);

                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
        }

        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /*
                 * We have to zeroout blocks before inserting them into extent
                 * status tree. Otherwise someone could look them up there and
                 * use them before they are really zeroed. We also have to
                 * unmap metadata before zeroing as otherwise writeback can
                 * overwrite zeros with stale data from block device.
                 */
                if (flags & EXT4_GET_BLOCKS_ZERO &&
                    map->m_flags & EXT4_MAP_MAPPED &&
                    map->m_flags & EXT4_MAP_NEW) {
                        ret = ext4_issue_zeroout(inode, map->m_lblk,
                                                 map->m_pblk, map->m_len);
                        if (ret) {
                                /* Propagate the zeroout error to the caller. */
                                retval = ret;
                                goto out_sem;
                        }
                }

                /*
                 * If the extent has been zeroed out, we don't need to update
                 * extent status tree.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
                    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                        if (ext4_es_is_written(&es))
                                goto out_sem;
                }
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }

out_sem:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;

                /*
                 * Inodes with freshly allocated blocks where contents will be
                 * visible after transaction commit must be on transaction's
                 * ordered data list.
                 */
                if (map->m_flags & EXT4_MAP_NEW &&
                    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
                    !(flags & EXT4_GET_BLOCKS_ZERO) &&
                    !ext4_is_quota_file(inode) &&
                    ext4_should_order_data(inode)) {
                        /* Byte range covered by the newly mapped blocks. */
                        loff_t start_byte =
                                (loff_t)map->m_lblk << inode->i_blkbits;
                        loff_t length = (loff_t)map->m_len << inode->i_blkbits;

                        if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
                                ret = ext4_jbd2_inode_add_wait(handle, inode,
                                                start_byte, length);
                        else
                                ret = ext4_jbd2_inode_add_write(handle, inode,
                                                start_byte, length);
                        if (ret)
                                return ret;
                }
        }
        /* Pass any mapped or unwritten range to ext4_fc_track_range(). */
        if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
                                map->m_flags & EXT4_MAP_MAPPED))
                ext4_fc_track_range(handle, inode, map->m_lblk,
                                        map->m_lblk + map->m_len - 1);
        if (retval < 0)
                ext_debug(inode, "failed with err %d\n", retval);
        return retval;
}

/*
 * Replace the EXT4_MAP_FLAGS bits of bh->b_state with those in @flags,
 * leaving every other bit alone.  Buffer heads attached to pages may have
 * b_state manipulated by someone else at the same time, so those are
 * updated with a cmpxchg retry loop.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
        unsigned long map_bits = flags & EXT4_MAP_FLAGS;
        unsigned long seen;
        unsigned long wanted;

        /* Dummy buffer_head (no page)? A plain store is enough. */
        if (!bh->b_page) {
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map_bits;
                return;
        }

        /*
         * Someone else may be modifying b_state, so retry until our update
         * lands on an unchanged value.  Ugly, but it can go away once bh is
         * no longer used as a container for passing mapping information to
         * and from the get_block functions.
         */
        for (;;) {
                seen = READ_ONCE(bh->b_state);
                wanted = (seen & ~EXT4_MAP_FLAGS) | map_bits;
                if (likely(cmpxchg(&bh->b_state, seen, wanted) == seen))
                        break;
        }
}

/*
 * Common get_block helper: map up to bh->b_size worth of blocks starting at
 * logical block @iblock using ext4_map_blocks() with the given @flags.
 *
 * Returns -ERANGE for inodes with inline data, a negative error from
 * ext4_map_blocks(), or 0.  On a 0 return bh->b_size is set to the length
 * reported in map.m_len; if blocks were mapped, @bh is also mapped to the
 * physical block and its mapping state bits are updated.
 */
static int _ext4_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int flags)
{
        struct ext4_map_blocks map;
        int nr_mapped;

        if (ext4_has_inline_data(inode))
                return -ERANGE;

        map.m_len = bh->b_size >> inode->i_blkbits;
        map.m_lblk = iblock;

        nr_mapped = ext4_map_blocks(ext4_journal_current_handle(), inode,
                                    &map, flags);
        if (nr_mapped < 0)
                return nr_mapped;

        if (nr_mapped > 0) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);
        }
        /* Mapped extent or hole: report its length back through b_size. */
        bh->b_size = inode->i_sb->s_blocksize * map.m_len;
        return 0;
}

/*
 * get_block callback: look up the mapping for @iblock, allocating blocks
 * (EXT4_GET_BLOCKS_CREATE) when @create is non-zero.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh, int create)
{
        int map_flags = 0;

        if (create)
                map_flags = EXT4_GET_BLOCKS_CREATE;

        return _ext4_get_block(inode, iblock, bh, map_flags);
}

/*
 * Get block function used when preparing for buffered write if we require
 * creating an unwritten extent if blocks haven't been allocated.  The extent
 * will be converted to written after the IO is complete.
 *
 * Note that @create is only logged; the mapping is always requested with
 * EXT4_GET_BLOCKS_IO_CREATE_EXT.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
{
        const int map_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;

        ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
                   inode->i_ino, create);

        return _ext4_get_block(inode, iblock, bh_result, map_flags);
}

/*
 * Maximum number of blocks we map for direct IO at once.
 * NOTE(review): no user of this macro is visible in this part of the file;
 * confirm it is still referenced before relying on it.
 */
#define DIO_MAX_BLOCKS 4096

/*
 * ext4_getblk - get the buffer head backing logical block @block of @inode
 *
 * Maps a single block with ext4_map_blocks() using @map_flags and returns
 * the corresponding buffer head obtained from sb_getblk().
 *
 * `handle' can be NULL if create is zero (i.e. EXT4_GET_BLOCKS_CREATE is not
 * set in @map_flags); this is asserted below unless the filesystem is in
 * fast-commit replay.
 *
 * Return values:
 *  - a buffer head on success;
 *  - NULL if nothing is mapped at @block and creation was not requested;
 *  - ERR_PTR(-ENOSPC) if nothing was mapped although creation was requested;
 *  - ERR_PTR(err) for a negative ext4_map_blocks() result or a journalling
 *    failure, ERR_PTR(-ENOMEM) if sb_getblk() fails.
 *
 * For a newly allocated block (EXT4_MAP_NEW) the buffer is zero-filled if it
 * is not already uptodate, marked uptodate, and dirtied as metadata through
 * the journal before it is returned.
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int map_flags)
{
        struct ext4_map_blocks map;
        struct buffer_head *bh;
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        int err;

        J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                 || handle != NULL || create == 0);

        /* Look up (or allocate) exactly one block. */
        map.m_lblk = block;
        map.m_len = 1;
        err = ext4_map_blocks(handle, inode, &map, map_flags);

        /* Zero blocks mapped: a hole, or an allocation that yielded nothing. */
        if (err == 0)
                return create ? ERR_PTR(-ENOSPC) : NULL;
        if (err < 0)
                return ERR_PTR(err);

        bh = sb_getblk(inode->i_sb, map.m_pblk);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);
        if (map.m_flags & EXT4_MAP_NEW) {
                /* A new block can only result from a create request. */
                J_ASSERT(create != 0);
                J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                         || (handle != NULL));

                /*
                 * Now that we do not always journal data, we should
                 * keep in mind whether this should always journal the
                 * new buffer as metadata.  For now, regular file
                 * writes use ext4_get_block instead, so it's not a
                 * problem.
                 */
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (unlikely(err)) {
                        unlock_buffer(bh);
                        goto errout;
                }
                /* Give the fresh block defined (zeroed) contents. */
                if (!buffer_uptodate(bh)) {
                        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                        set_buffer_uptodate(bh);
                }
                unlock_buffer(bh);
                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (unlikely(err))
                        goto errout;
        } else
                BUFFER_TRACE(bh, "not a new buffer");
        return bh;
errout:
        /* Drop the reference taken by sb_getblk() before reporting failure. */
        brelse(bh);
        return ERR_PTR(err);
}

/*
 * Like ext4_getblk(), but additionally make sure the returned buffer has
 * valid contents by reading it (REQ_META | REQ_PRIO) when it is not already
 * uptodate.
 *
 * Returns whatever ext4_getblk() returned if that is an error pointer, NULL,
 * or an uptodate buffer.  If the read fails, the buffer reference is dropped
 * and ERR_PTR() of the read error is returned.
 */
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int map_flags)
{
        struct buffer_head *bh = ext4_getblk(handle, inode, block, map_flags);
        int err;

        /* Error pointer, NULL, or already valid contents: nothing to read. */
        if (IS_ERR(bh) || !bh || ext4_buffer_uptodate(bh))
                return bh;

        err = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
        if (!err)
                return bh;

        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Read a contiguous batch of blocks.
 *
 * Looks up @bh_count buffer heads for logical blocks @block .. @block +
 * @bh_count - 1 of @inode (without allocating) and stores them in @bhs.
 * A NULL entry is valid and denotes a hole.  Reads are then issued for every
 * buffer that is not uptodate.
 *
 * If @wait is false the function returns 0 right after submitting the reads.
 * Otherwise it waits for all buffers and returns -EIO if any of them is
 * still not uptodate.
 *
 * Returns 0 on success or a negative error.  On error every buffer obtained
 * so far has been released and every slot of @bhs that was written holds
 * NULL, so the array never contains an error pointer or a stale reference.
 */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs)
{
        int i, err;

        for (i = 0; i < bh_count; i++) {
                bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
                if (IS_ERR(bhs[i])) {
                        err = PTR_ERR(bhs[i]);
                        /*
                         * Do not leave an ERR_PTR in the caller's array: the
                         * cleanup below only covers slots [0, i), and a
                         * caller releasing the array would otherwise pass an
                         * error pointer to brelse().
                         */
                        bhs[i] = NULL;
                        bh_count = i;
                        goto out_brelse;
                }
        }

        for (i = 0; i < bh_count; i++)
                /* Note that NULL bhs[i] is valid because of holes. */
                if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
                        ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);

        if (!wait)
                return 0;

        for (i = 0; i < bh_count; i++)
                if (bhs[i])
                        wait_on_buffer(bhs[i]);

        /* Any buffer that is still not uptodate means its read failed. */
        for (i = 0; i < bh_count; i++) {
                if (bhs[i] && !buffer_uptodate(bhs[i])) {
                        err = -EIO;
                        goto out_brelse;
                }
        }
        return 0;

out_brelse:
        /* brelse() accepts NULL, so holes need no special casing here. */
        for (i = 0; i < bh_count; i++) {
                brelse(bhs[i]);
                bhs[i] = NULL;
        }
        return err;
}

/*
 * Walk the circular list of buffers starting at @head and call @fn on every
 * buffer that overlaps the byte range [@from, @to) within the page.
 *
 * Buffers outside the range are not passed to @fn; if @partial is non-NULL
 * and such a buffer is not uptodate, *@partial is set to 1.
 *
 * The walk stops at the first non-zero return value of @fn, which is then
 * returned.  Returns 0 if every call succeeded.
 */
int ext4_walk_page_buffers(handle_t *handle,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle,
                                     struct buffer_head *bh))
{
        struct buffer_head *bh = head;
        unsigned blocksize = head->b_size;
        unsigned block_start = 0;
        int ret = 0;

        while (bh != head || !block_start) {
                /* Sample the successor before @fn gets to see the buffer. */
                struct buffer_head *next = bh->b_this_page;
                unsigned block_end = block_start + blocksize;

                if (block_end > from && block_start < to) {
                        ret = (*fn)(handle, bh);
                        if (ret)
                                break;
                } else if (partial && !buffer_uptodate(bh)) {
                        *partial = 1;
                }

                block_start = block_end;
                bh = next;
        }
        return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage().  In that case, we
 * *know* that ext4_writepage() has generated enough buffer credits to do the
 * whole page.  So we won't block on the journal in that case, which is good,
 * because the caller may be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
/*
 * Per-buffer callback: obtain journal write access for @bh.
 *
 * Buffers that are unmapped or freed are skipped (returns 0).  If the buffer
 * was dirty on entry, the dirty bit is cleared before asking for write
 * access and the buffer is then re-dirtied through the journal as metadata.
 */
int do_journal_get_write_access(handle_t *handle,
                                struct buffer_head *bh)
{
        int was_dirty = buffer_dirty(bh);
        int err;

        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;

        /*
         * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
         * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
        if (was_dirty)
                clear_buffer_dirty(bh);

        BUFFER_TRACE(bh, "get write access");
        err = ext4_journal_get_write_access(handle, bh);
        if (err || !was_dirty)
                return err;

        return ext4_handle_dirty_metadata(handle, NULL, bh);
}

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Prepare the buffers of a locked @page for a write of @len bytes at file
 * position @pos, mapping blocks through @get_block.  Only built when
 * CONFIG_FS_ENCRYPTION is enabled; unlike the generic path it decrypts
 * blocks that had to be read in when the inode uses fs-layer encryption.
 *
 * For every buffer overlapping the written range [from, to):
 *  - unmapped buffers are mapped via @get_block with create == 1;
 *  - newly allocated buffers are either marked uptodate and dirty (page
 *    already uptodate) or have the part outside the write range zeroed;
 *  - buffers that are only partially overwritten and whose contents are
 *    not yet valid are read from disk.
 *
 * Returns 0 on success, the error from @get_block, -EIO if a read failed, or
 * the error from decryption.
 */
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
                                  get_block_t *get_block)
{
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
        /*
         * wait[] holds buffers with reads in flight.  Only a block that
         * straddles 'from' or 'to' can satisfy the read condition below, so
         * at most two entries are needed.
         */
        struct buffer_head *bh, *head, *wait[2];
        int nr_wait = 0;
        int i;

        BUG_ON(!PageLocked(page));
        BUG_ON(from > PAGE_SIZE);
        BUG_ON(to > PAGE_SIZE);
        BUG_ON(from > to);

        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
        head = page_buffers(page);
        bbits = ilog2(blocksize);
        /* Logical block number of the first buffer in the page. */
        block = (sector_t)page->index << (PAGE_SHIFT - bbits);

        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the range being written. */
                if (block_end <= from || block_start >= to) {
                        if (PageUptodate(page)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Freshly allocated block: zero the parts
                                 * the write will not cover instead of
                                 * reading them from disk.
                                 */
                                if (block_end > to || block_start < from)
                                        zero_user_segments(page, to, block_end,
                                                           block_start, from);
                                continue;
                        }
                }
                if (PageUptodate(page)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue;
                }
                /*
                 * Partially overwritten buffer without valid contents that
                 * is neither delayed nor unwritten: start a read for it.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
                        ext4_read_bh_lock(bh, 0, false);
                        wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
        for (i = 0; i < nr_wait; i++) {
                wait_on_buffer(wait[i]);
                if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
        if (unlikely(err)) {
                page_zero_new_buffers(page, from, to);
        } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                /* Decrypt in place every block that was just read. */
                for (i = 0; i < nr_wait; i++) {
                        int err2;

                        err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
                                                                bh_offset(wait[i]));
                        if (err2) {
                                /* Contents are not usable plaintext. */
                                clear_buffer_uptodate(wait[i]);
                                err = err2;
                        }
                }
        }

        return err;
}
#endif

/*
 * ext4_write_begin - prepare a page cache page for a buffered write
 *
 * Grabs (and returns through @pagep) the locked page covering @pos, starts a
 * journal handle, and maps/allocates the blocks backing the byte range
 * [@pos, @pos + @len).  In data=journal mode journal write access is also
 * obtained for the affected buffers.
 *
 * Returns 0 on success with *@pagep set to the locked page.  On success the
 * journal handle started here is left running.
 * NOTE(review): presumably the matching write_end callback stops it via
 * ext4_journal_current_handle() - confirm against the aops tables.
 *
 * Returns a negative error otherwise: -EIO after a forced shutdown, -ENOMEM
 * if no page could be obtained, or the error from handle start / block
 * mapping.  On failure blocks instantiated beyond i_size are truncated away
 * again, and -ENOSPC is retried while ext4_should_retry_alloc() allows it.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        trace_ext4_write_begin(inode, pos, len, flags);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_SHIFT;
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    flags, pagep);
                if (ret < 0)
                        return ret;
                /* 1 means the inline-data path fully prepared the write. */
                if (ret == 1)
                        return 0;
        }

        /*
         * grab_cache_page_write_begin() can take a long time if the
         * system is thrashing due to memory pressure, or if the page
         * is being written back.  So grab it first before we start
         * the transaction handle.  This also allows us to allocate
         * the page (if needed) without using GFP_NOFS.
         */
retry_grab:
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        /*
         * The same as page allocation, we prealloc buffer heads before
         * starting the handle.
         */
        if (!page_has_buffers(page))
                create_empty_buffers(page, inode->i_sb->s_blocksize, 0);

        /* Drop the page lock (but keep the reference) across handle start. */
        unlock_page(page);

retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                put_page(page);
                return PTR_ERR(handle);
        }

        lock_page(page);
        if (page->mapping != mapping) {
                /* The page got truncated from under us */
                unlock_page(page);
                put_page(page);
                ext4_journal_stop(handle);
                goto retry_grab;
        }
        /* In case writeback began while the page was unlocked */
        wait_for_stable_page(page);

        /*
         * With dioread_nolock, blocks are allocated as unwritten extents;
         * otherwise plain allocation is used.  The encryption-aware helper
         * replaces __block_write_begin() when CONFIG_FS_ENCRYPTION is set.
         */
#ifdef CONFIG_FS_ENCRYPTION
        if (ext4_should_dioread_nolock(inode))
                ret = ext4_block_write_begin(page, pos, len,
                                             ext4_get_block_unwritten);
        else
                ret = ext4_block_write_begin(page, pos, len,
                                             ext4_get_block);
#else
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(page, pos, len,
                                          ext4_get_block_unwritten);
        else
                ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
        if (!ret && ext4_should_journal_data(inode)) {
                ret = ext4_walk_page_buffers(handle, page_buffers(page),
                                             from, to, NULL,
                                             do_journal_get_write_access);
        }

        if (ret) {
                /* Would the write have grown the file? */
                bool extended = (pos + len > inode->i_size) &&
                                !ext4_verity_in_progress(inode);

                unlock_page(page);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
                if (extended && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);

                ext4_journal_stop(handle);
                if (extended) {
                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
                         */
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }

                /* The page reference is still held for the retry. */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;
                put_page(page);
                return ret;
        }
        *pagep = page;
        return ret;
}

/*
 * Per-buffer callback for write_end() in data=journal mode.
 *
 * Mapped, non-freed buffers are marked uptodate and dirtied through the
 * journal as metadata, after which their meta and prio bits are cleared.
 * Other buffers are skipped.  Returns the result of dirtying the buffer, or
 * 0 for a skipped buffer.
 */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
        int err = 0;

        if (buffer_mapped(bh) && !buffer_freed(bh)) {
                set_buffer_uptodate(bh);
                err = ext4_handle_dirty_metadata(handle, NULL, bh);
                clear_buffer_meta(bh);
                clear_buffer_prio(bh);
        }
        return err;
}

/*
 * ext4_write_end - finish a buffered write
 *
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 *
 * Commits @copied bytes written at @pos into @page, updates i_size if the
 * write extended the file (unless a verity build is in progress), unlocks
 * and releases the page, and stops the current journal handle.  If fewer
 * bytes were committed than blocks were prepared for, so that
 * @pos + @len lies beyond i_size, the surplus blocks are truncated.
 *
 * Returns the number of bytes committed, or a negative error.
 */
static int ext4_write_end(struct file *file,
                          struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int i_size_changed = 0;
        int inline_data = ext4_has_inline_data(inode);
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_write_end(inode, pos, len, copied);
        if (inline_data &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
                if (ret < 0) {
                        unlock_page(page);
                        put_page(page);
                        goto errout;
                }
                /* A non-negative result is the number of bytes written. */
                copied = ret;
                ret = 0;
        } else
                copied = block_write_end(file, mapping, pos,
                                         len, copied, page, fsdata);
        /*
         * it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
         *
         * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
         * blocks are being written past EOF, so skip the i_size update.
         */
        if (!verity)
                i_size_changed = ext4_update_inode_size(inode, pos + copied);
        unlock_page(page);
        put_page(page);

        /* The write started beyond the old EOF. */
        if (old_size < pos && !verity)
                pagecache_isize_extended(inode, old_size, pos);
        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
         * ordering of page lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed || inline_data)
                ret = ext4_mark_inode_dirty(handle, inode);

errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);

        /* Keep the first error; a journal stop failure only counts if none. */
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * This is a private version of page_zero_new_buffers() which doesn't
 * set the buffer to be dirty, since in data=journalled mode we need
 * to call ext4_handle_dirty_metadata() instead.
 *
 * Every buffer of @page that is flagged new and overlaps [@from, @to) has
 * its new flag cleared.  If the page is not uptodate, the overlapping part
 * of such a buffer is zeroed first and the buffer is handed to
 * write_end_fn().
 */
static void ext4_journalled_zero_new_buffers(handle_t *handle,
                                            struct page *page,
                                            unsigned from, unsigned to)
{
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;
        unsigned int block_start = 0;

        do {
                unsigned int block_end = block_start + bh->b_size;

                if (buffer_new(bh) && block_end > from && block_start < to) {
                        if (!PageUptodate(page)) {
                                /* Clamp the zeroed span to this buffer. */
                                unsigned start = max(from, block_start);
                                unsigned size = min(to, block_end) - start;

                                zero_user(page, start, size);
                                write_end_fn(handle, bh);
                        }
                        clear_buffer_new(bh);
                }

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);
}

/*
 * ext4_journalled_write_end - finish a buffered write in data=journal mode
 *
 * Commits @copied bytes written at @pos into @page by dirtying the affected
 * buffers through the journal (write_end_fn), updates i_size if the write
 * extended the file (unless a verity build is in progress), flags the inode
 * as having journalled data and stops the current journal handle.  Requires
 * a valid handle (BUG otherwise).
 *
 * On a short copy into a page that is not uptodate nothing is committed
 * (copied is forced to 0) and new buffers in the range are zeroed.
 *
 * Returns the number of bytes committed, or a negative error.
 */
static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                                     struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        int size_changed = 0;
        int inline_data = ext4_has_inline_data(inode);
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_journalled_write_end(inode, pos, len, copied);
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        BUG_ON(!ext4_handle_valid(handle));

        if (inline_data) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
                if (ret < 0) {
                        unlock_page(page);
                        put_page(page);
                        goto errout;
                }
                /* A non-negative result is the number of bytes written. */
                copied = ret;
                ret = 0;
        } else if (unlikely(copied < len) && !PageUptodate(page)) {
                /* Short copy into a non-uptodate page: commit nothing. */
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, page, from, to);
        } else {
                /* Zero new buffers in the tail the copy did not reach. */
                if (unlikely(copied < len))
                        ext4_journalled_zero_new_buffers(handle, page,
                                                         from + copied, to);
                ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
                                             from + copied, &partial,
                                             write_end_fn);
                /* No non-uptodate buffer was seen outside the range. */
                if (!partial)
                        SetPageUptodate(page);
        }
        if (!verity)
                size_changed = ext4_update_inode_size(inode, pos + copied);
        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        unlock_page(page);
        put_page(page);

        /* The write started beyond the old EOF. */
        if (old_size < pos && !verity)
                pagecache_isize_extended(inode, old_size, pos);

        if (size_changed || inline_data) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }

errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);

        /* Keep the first error; a journal stop failure only counts if none. */
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Reserve space for a single cluster.
 *
 * Reserves quota for one cluster's worth of blocks, then claims one free
 * cluster under i_block_reservation_lock and bumps the inode's reserved data
 * block count.  Returns 0 on success, the quota error if the quota
 * reservation fails, or -ENOSPC (after undoing the quota reservation) if no
 * free cluster could be claimed.
 */
static int ext4_da_reserve_space(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err;
        int no_space;

        /*
         * We will charge metadata quota at writeout time; this saves
         * us from metadata over-estimation, though we may go over by
         * a small amount in the end.  Here we just reserve for data.
         */
        err = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
        if (err)
                return err;

        spin_lock(&ei->i_block_reservation_lock);
        no_space = ext4_claim_free_clusters(sbi, 1, 0);
        if (!no_space) {
                ei->i_reserved_data_blocks++;
                trace_ext4_da_reserve_space(inode);
        }
        spin_unlock(&ei->i_block_reservation_lock);

        if (!no_space)
                return 0;       /* success */

        /* Give back the quota reservation taken above. */
        dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
        return -ENOSPC;
}

/*
 * Release @to_free reserved data clusters of @inode.
 *
 * Decrements the inode's reserved data block count and the filesystem's
 * dirty cluster counter under i_block_reservation_lock, then releases the
 * matching quota reservation.  A request larger than what is reserved is
 * reported and clamped to the reserved amount.  Does nothing for
 * @to_free == 0.
 */
void ext4_da_release_space(struct inode *inode, int to_free)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (to_free == 0)
                return;                /* Nothing to release, exit */

        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_release_space(inode, to_free);

        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
                 * if there aren't enough reserved blocks, then the
                 * counter is messed up somewhere.  Since this
                 * function is called from invalidate page, it's
                 * harmless to return without any action.
                 */
                ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %d reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                to_free = ei->i_reserved_data_blocks;
        }

        ei->i_reserved_data_blocks -= to_free;
        /* update fs dirty data blocks counter */
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
        spin_unlock(&ei->i_block_reservation_lock);

        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}

/*
 * Delayed allocation stuff
 */

/*
 * State carried through a delayed-allocation writeback pass over a range of
 * pages of one inode.
 */
struct mpage_da_data {
        struct inode *inode;        /* Inode whose pages are processed */
        /* NOTE(review): presumably the caller's writeback request - confirm */
        struct writeback_control *wbc;

        pgoff_t first_page;        /* The first page to write */
        pgoff_t next_page;        /* Current page to examine */
        pgoff_t last_page;        /* Last page to examine */
        /*
         * Extent to map - this can be after first_page because that can be
         * fully mapped. We somewhat abuse m_flags to store whether the extent
         * is delalloc or unwritten.
         */
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        /* NOTE(review): looks like "mapping of blocks allowed" - confirm */
        unsigned int do_map:1;
        /*
         * Cleared by mpage_release_unused_pages() when it releases pages.
         * NOTE(review): presumably set once the scan reached last_page -
         * confirm against the code that sets it.
         */
        unsigned int scanned_until_end:1;
};

/*
 * mpage_release_unused_pages - unlock the pages in [first_page, next_page)
 *
 * @mpd:        writeback state describing the page range
 * @invalidate: also throw the pages' contents away
 *
 * Every page found in the range is expected to be locked and not under
 * writeback (BUG otherwise) and is unlocked here.  With @invalidate set, the
 * corresponding range is first removed from the extent status tree and each
 * page is additionally invalidated and marked not uptodate.
 *
 * Does nothing if the range is empty.
 */
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
                                       bool invalidate)
{
        int nr_pages, i;
        pgoff_t index, end;
        struct pagevec pvec;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;

        /* This is necessary when next_page == 0. */
        if (mpd->first_page >= mpd->next_page)
                return;

        mpd->scanned_until_end = 0;
        index = mpd->first_page;
        end   = mpd->next_page - 1;
        if (invalidate) {
                ext4_lblk_t start, last;
                /* Convert page indices to logical block numbers. */
                start = index << (PAGE_SHIFT - inode->i_blkbits);
                /*
                 * NOTE(review): 'last' is the first block of page 'end', so
                 * with a block size smaller than the page size the removed
                 * range stops after that block rather than at the end of the
                 * page - confirm this is intended.
                 */
                last = end << (PAGE_SHIFT - inode->i_blkbits);

                /*
                 * avoid racing with extent status tree scans made by
                 * ext4_insert_delayed_block()
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, start, last - start + 1);
                up_write(&EXT4_I(inode)->i_data_sem);
        }

        pagevec_init(&pvec);
        while (index <= end) {
                /* Looks up the next batch of pages and advances 'index'. */
                nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];

                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
                        if (invalidate) {
                                if (page_mapped(page))
                                        clear_page_dirty_for_io(page);
                                block_invalidatepage(page, 0, PAGE_SIZE);
                                ClearPageUptodate(page);
                        }
                        unlock_page(page);
                }
                /* Drop the references taken by the lookup. */
                pagevec_release(&pvec);
        }
}

/*
 * ext4_print_free_blocks - log block accounting details at KERN_CRIT
 *
 * @inode - inode whose superblock counters and reservation are reported
 *
 * Prints the free block count, the summed free/dirty cluster counters
 * (converted to blocks) and the inode's reserved data block count.
 */
static void ext4_print_free_blocks(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
                 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));

        ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
        ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_freeclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_dirtyclusters_counter)));

        ext4_msg(sb, KERN_CRIT, "Block reservation details");
        ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
                 ei->i_reserved_data_blocks);
}

/*
 * Callback for ext4_walk_page_buffers(): returns 1 for a dirty buffer that
 * is delayed or unwritten, 0 otherwise.  @handle is not used.
 */
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
        if (!buffer_dirty(bh))
                return 0;
        return buffer_delay(bh) || buffer_unwritten(bh);
}

/*
 * ext4_insert_delayed_block - adds a delayed block to the extents status
 *                             tree, incrementing the reserved cluster/block
 *                             count or making a pending reservation
 *                             where needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        bool need_reserve = false;
        bool allocated = false;
        int ret;

        /*
         * Without bigalloc every delayed block needs its own reservation.
         *
         * With bigalloc, a cluster that already holds a delayed, written or
         * unwritten extent has been accounted for and needs no reservation.
         * If it is shared with a written or unwritten extent, the insert
         * below is told so (allocated == true) so that a pending
         * reservation can be made.  Written and unwritten extents can be
         * purged from the extents status tree under memory pressure, so
         * when the status tree has no match the extent tree itself is
         * consulted through ext4_clu_mapped().
         */
        if (sbi->s_cluster_ratio == 1) {
                need_reserve = true;
        } else if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
                if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) {
                        allocated = true;
                } else {
                        ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
                        if (ret < 0)
                                return ret;
                        if (ret > 0)
                                allocated = true;
                        else
                                need_reserve = true;
                }
        }

        if (need_reserve) {
                ret = ext4_da_reserve_space(inode);
                if (ret)        /* ENOSPC */
                        return ret;
        }

        ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
        /* Give back the reservation taken above if the insert failed. */
        if (ret && need_reserve)
                ext4_da_release_space(inode, 1);

        return ret;
}

/*
 * ext4_da_map_blocks - look up a block at delayed-write time
 *
 * @inode - file being written
 * @iblock - logical block to look up
 * @map - request (m_lblk, m_len); on a mapped/unwritten hit m_pblk, m_len
 *        and m_flags are filled in
 * @bh - buffer head that is marked mapped, new and delayed when the block
 *       is (or becomes) a delayed block
 *
 * This function grabs code from the very beginning of ext4_map_blocks, but
 * assumes that the caller is from delayed write time.  It looks up the
 * requested block and, if nothing is found, inserts a delayed block into
 * the extent status tree while holding i_data_sem for writing.
 *
 * Returns 0 when @bh has been set up as a delayed buffer.  When an existing
 * written or unwritten extent is found in the extent status tree, returns
 * the (positive) number of blocks available, clamped to map->m_len.  Any
 * non-zero result of ext4_map_query_blocks() or ext4_insert_delayed_block()
 * is passed back unchanged (presumably a block count or a negative errno -
 * confirm against those helpers).
 */
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
{
        struct extent_status es;
        int retval;
        /* Placeholder block number stored in delayed buffers. */
        sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        memcpy(&orig_map, map, sizeof(*map));
#endif

        /* If the fs is large enough to contain that block, use ~0 instead. */
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;

        map->m_flags = 0;
        ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                /* A cached hole: go straight to inserting a delayed block. */
                if (ext4_es_is_hole(&es))
                        goto add_delayed;

found:
                /*
                 * Delayed extent could be allocated by fallocate.
                 * So we need to check it.
                 */
                if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
                        map_bh(bh, inode->i_sb, invalid_block);
                        set_buffer_new(bh);
                        set_buffer_delay(bh);
                        return 0;
                }

                /* Translate the cached extent into a mapping for @iblock. */
                map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
                retval = es.es_len - (iblock - es.es_lblk);
                if (retval > map->m_len)
                        retval = map->m_len;
                map->m_len = retval;
                if (ext4_es_is_written(&es))
                        map->m_flags |= EXT4_MAP_MAPPED;
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
                        BUG();

#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
                return retval;
        }

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.  Inodes with inline data skip the query.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_has_inline_data(inode))
                retval = 0;
        else
                retval = ext4_map_query_blocks(NULL, inode, map);
        up_read(&EXT4_I(inode)->i_data_sem);
        if (retval)
                return retval;

add_delayed:
        down_write(&EXT4_I(inode)->i_data_sem);
        /*
         * Page fault path (ext4_page_mkwrite does not take i_rwsem)
         * and fallocate path (no folio lock) can race. Make sure we
         * lookup the extent status tree here again while i_data_sem
         * is held in write mode, before inserting a new da entry in
         * the extent status tree.
         */
        if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                if (!ext4_es_is_hole(&es)) {
                        /* Someone mapped the block meanwhile; reuse it. */
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto found;
                }
        } else if (!ext4_has_inline_data(inode)) {
                retval = ext4_map_query_blocks(NULL, inode, map);
                if (retval) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        return retval;
                }
        }

        retval = ext4_insert_delayed_block(inode, map->m_lblk);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (retval)
                return retval;

        /* retval is 0 here: report the block as a new delayed buffer. */
        map_bh(bh, inode->i_sb, invalid_block);
        set_buffer_new(bh);
        set_buffer_delay(bh);
        return retval;
}

/*
 * This is a special get_block_t callback which is used by
 * ext4_da_write_begin().  It will either return a mapped block or
 * reserve space for a single block.
 *
 * For a delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
 * We also have b_blocknr = -1 and b_bdev initialized properly.
 *
 * For an unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
 * We also have b_blocknr = physical block mapping the unwritten extent and
 * b_bdev initialized properly.
 *
 * @create must be non-zero and @bh must describe exactly one filesystem
 * block; both are enforced with BUG_ON.  Returns 0 on success or the
 * negative value returned by ext4_da_map_blocks().
 */
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create)
{
        struct ext4_map_blocks map;
        int nr;

        BUG_ON(create == 0);
        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

        /* Ask for exactly one block at @iblock. */
        map.m_len = 1;
        map.m_lblk = iblock;

        /*
         * First find out whether the block is already allocated.
         * Preallocated blocks are unmapped but should be treated the same
         * as allocated blocks.  A result <= 0 means either an error or
         * that @bh has already been set up as a delayed buffer.
         */
        nr = ext4_da_map_blocks(inode, iblock, &map, bh);
        if (nr <= 0)
                return nr;

        map_bh(bh, inode->i_sb, map.m_pblk);
        ext4_update_bh_state(bh, map.m_flags);

        if (!buffer_unwritten(bh))
                return 0;

        /*
         * A delayed write to an unwritten bh should be marked new and
         * mapped.  Mapped ensures that we don't do get_block multiple
         * times when we write to the same offset, and new ensures that we
         * do proper zero out for a partial write.
         */
        set_buffer_new(bh);
        set_buffer_mapped(bh);
        return 0;
}

/*
 * Callback for ext4_walk_page_buffers(): take a reference on @bh.
 * @handle is not used.  Always returns 0.
 */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
        get_bh(bh);
        return 0;
}

/*
 * Callback for ext4_walk_page_buffers(): drop a reference on @bh.
 * @handle is not used.  Always returns 0.
 */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
        put_bh(bh);
        return 0;
}

/*
 * __ext4_journalled_writepage - write a page back through the journal
 *
 * @page - page to write; it is unlocked here without being locked first,
 *         so the caller must pass it in locked, and it is unlocked on
 *         return
 * @len - number of bytes of the page to process
 *
 * The page lock has to be dropped before a journal handle is started, so
 * the page (and, for non-inline data, its buffers) are pinned with extra
 * references across that window and the page is re-validated once the lock
 * has been retaken.
 *
 * Returns 0 on success or the first error encountered.
 */
static int __ext4_journalled_writepage(struct page *page,
                                       unsigned int len)
{
        struct address_space *mapping = page->mapping;
        struct inode *inode = mapping->host;
        struct buffer_head *page_bufs = NULL;
        handle_t *handle = NULL;
        int ret = 0, err = 0;
        int inline_data = ext4_has_inline_data(inode);
        struct buffer_head *inode_bh = NULL;

        ClearPageChecked(page);

        if (inline_data) {
                /* Inline data can only live in page 0 and must fit. */
                BUG_ON(page->index != 0);
                BUG_ON(len > ext4_get_max_inline_size(inode));
                inode_bh = ext4_journalled_write_inline_data(inode, len, page);
                if (inode_bh == NULL)
                        goto out;
        } else {
                page_bufs = page_buffers(page);
                if (!page_bufs) {
                        BUG();
                        goto out;
                }
                /* Pin every buffer; released by the bput_one walk below. */
                ext4_walk_page_buffers(handle, page_bufs, 0, len,
                                       NULL, bget_one);
        }
        /*
         * We need to release the page lock before we start the
         * journal, so grab a reference so the page won't disappear
         * out from under us.
         */
        get_page(page);
        unlock_page(page);

        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                put_page(page);
                /* The page is already unlocked on this path. */
                goto out_no_pagelock;
        }
        BUG_ON(!ext4_handle_valid(handle));

        lock_page(page);
        put_page(page);
        if (page->mapping != mapping) {
                /* The page got truncated from under us */
                ext4_journal_stop(handle);
                ret = 0;
                goto out;
        }

        if (inline_data) {
                ret = ext4_mark_inode_dirty(handle, inode);
        } else {
                ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
                                             do_journal_get_write_access);

                err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
                                             write_end_fn);
        }
        /* Keep the first error seen; later steps still run. */
        if (ret == 0)
                ret = err;
        err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
        if (ret == 0)
                ret = err;
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        err = ext4_journal_stop(handle);
        if (!ret)
                ret = err;

        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
        unlock_page(page);
out_no_pagelock:
        /* Drop the buffer references taken by the bget_one walk above. */
        if (!inline_data && page_bufs)
                ext4_walk_page_buffers(NULL, page_bufs, 0, len,
                                       NULL, bput_one);
        brelse(inode_bh);
        return ret;
}

/*
 * Note that we don't need to start a transaction unless we're journaling data
 * because we should have holes filled from ext4_page_mkwrite(). We even don't
 * need to file the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
 * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start transaction directly because transaction start ranks above page
 * lock so we have to do some magic.
 *
 * This function can get called via...
 *   - ext4_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
 *
 * We don't do any block allocation in this function. If we have page with
 * multiple blocks we need to write those buffer_heads that are mapped. This
 * is important for mmaped based write. So if we do with blocksize 1K
 * truncate(f, 1024);
 * a = mmap(f, 0, 4096);
 * a[0] = 'a';
 * truncate(f, 4096);
 * we have in the page first buffer_head mapped via page_mkwrite call back
 * but other buffer_heads would be unmapped but dirty (dirty done via the
 * do_wp_page). So writepage should write the first block. If we modify
 * the mmap area beyond 1024 we will again get a page_fault and the
 * page_mkwrite callback will do the block allocation and mark the
 * buffer_heads mapped.
 *
 * We redirty the page if we have any buffer_heads that is either delay or
 * unwritten in the page.
 *
 * We can get recursively called as show below.
 *
 *        ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *                ext4_writepage()
 *
 * But since we don't do any block allocation we should not deadlock.
 * The page also has its dirty flag cleared, so we don't get a recursive
 * page_lock.
 */
static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
{
        struct inode *inode = page->mapping->host;
        struct buffer_head *bufs;
        struct ext4_io_submit io_submit;
        bool keep_towrite = false;
        unsigned int len = PAGE_SIZE;
        loff_t isize;
        int rc;

        /* After a forced shutdown nothing is written: drop the page. */
        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
                inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
                unlock_page(page);
                return -EIO;
        }

        trace_ext4_writepage(page);

        /*
         * For the page containing i_size only the bytes below i_size are
         * written, unless verity is in progress on the inode.
         */
        isize = i_size_read(inode);
        if (page->index == isize >> PAGE_SHIFT &&
            !ext4_verity_in_progress(inode))
                len = isize & ~PAGE_MASK;

        /* Should never happen but for bugs in other kernel subsystems */
        if (!page_has_buffers(page)) {
                ext4_warning_inode(inode,
                   "page %lu does not have buffers attached", page->index);
                ClearPageDirty(page);
                unlock_page(page);
                return 0;
        }
        bufs = page_buffers(page);

        /*
         * No block allocation or other extent handling may be done here.
         * If some buffer needs that, the page has to be redirtied.  We may
         * still get here from a journal commit via
         * journal_submit_inode_data_buffers(), and then the allocated
         * buffers must be written to keep the data=ordered guarantees.
         *
         * With one buffer per page (fs block size == page size), a buffer
         * that needs allocation or an unwritten-flag update means the page
         * cannot be written at all, so the write is refused right away.
         * With block size != page size that case is not as easily detected
         * with ext4_walk_page_buffers(), but for the extremely common case
         * this skips a useless round trip through ext4_bio_write_page().
         */
        if (ext4_walk_page_buffers(NULL, bufs, 0, len, NULL,
                                   ext4_bh_delay_or_unwritten)) {
                redirty_page_for_writepage(wbc, page);
                if ((current->flags & PF_MEMALLOC) ||
                    (inode->i_sb->s_blocksize == PAGE_SIZE)) {
                        /*
                         * Writing only some of the buffers does not help
                         * memory cleaning, so bail out.  Warn when called
                         * from direct reclaim.
                         */
                        WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
                                                        == PF_MEMALLOC);
                        unlock_page(page);
                        return 0;
                }
                keep_towrite = true;
        }

        /*
         * It's mmapped pagecache.  Add buffers and journal it.  There
         * doesn't seem much point in redirtying the page here.
         */
        if (PageChecked(page) && ext4_should_journal_data(inode))
                return __ext4_journalled_writepage(page, len);

        ext4_io_submit_init(&io_submit, wbc);
        io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
        if (!io_submit.io_end) {
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return -ENOMEM;
        }

        rc = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
        ext4_io_submit(&io_submit);
        /* Drop io_end reference we got from init */
        ext4_put_io_end_defer(io_submit.io_end);
        return rc;
}

/*
 * mpage_submit_page - hand one page to ext4_bio_write_page()
 *
 * @mpd - writeback state; @page must be the page at mpd->first_page
 * @page - page to submit
 *
 * Advances mpd->first_page unconditionally and decrements
 * mpd->wbc->nr_to_write when the submission succeeded.  Returns the result
 * of ext4_bio_write_page().
 */
static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
{
        struct inode *inode = mpd->inode;
        int len = PAGE_SIZE;
        loff_t isize;
        int err;

        BUG_ON(page->index != mpd->first_page);
        clear_page_dirty_for_io(page);
        /*
         * We have to be very careful here!  Nothing protects the writeback
         * path against i_size changes, and the page can be writeably mapped
         * into page tables, so an application can be growing i_size and
         * writing data through mmap while writeback runs.
         * clear_page_dirty_for_io() write-protects our page in page tables
         * and the page cannot get written to again until we release the
         * page lock.  Only after clear_page_dirty_for_io() is it safe to
         * sample i_size for ext4_bio_write_page() to zero out the tail of
         * the written page.  We rely on the barrier provided by
         * TestClearPageDirty in clear_page_dirty_for_io() to make sure
         * i_size is really sampled only after page tables are updated.
         */
        isize = i_size_read(inode);
        if (page->index == isize >> PAGE_SHIFT &&
            !ext4_verity_in_progress(inode))
                len = isize & ~PAGE_MASK;

        err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
        mpd->first_page++;
        if (!err)
                mpd->wbc->nr_to_write--;

        return err;
}

/*
 * Buffer state bits (unwritten, delayed) that are copied into map->m_flags
 * when an extent is started and that must match for a buffer to be merged
 * into it (see mpage_add_bh_to_extent()).
 */
#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

/*
 * mballoc gives us at most this number of blocks...
 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 * The rest of mballoc seems to handle chunks up to full group size.
 */
#define MAX_WRITEPAGES_EXTENT_LEN 2048

/*
 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
 *
 * @mpd - extent of blocks
 * @lblk - logical number of the block in the file
 * @bh - buffer head we want to add to the extent
 *
 * Collects contiguous blocks that are in the same state.  A buffer that
 * needs no mapping for writeback yields 'true' while no extent has been
 * started (the caller can write it right away) and 'false' once one has.
 * For a buffer that does need mapping the function returns true if the
 * block has been added to the extent, false if it could not be added.
 */
static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
                                   struct buffer_head *bh)
{
        struct ext4_map_blocks *map = &mpd->map;
        bool needs_mapping;

        /* Only dirty, mapped, delayed-or-unwritten buffers need mapping. */
        needs_mapping = buffer_dirty(bh) && buffer_mapped(bh) &&
                        (buffer_delay(bh) || buffer_unwritten(bh));
        if (!needs_mapping)
                return map->m_len == 0;

        if (map->m_len == 0) {
                /* First block of a new extent; needs a started handle. */
                if (!mpd->do_map)
                        return false;
                map->m_lblk = lblk;
                map->m_len = 1;
                map->m_flags = bh->b_state & BH_FLAGS;
                return true;
        }

        /* Don't go larger than mballoc is willing to allocate */
        if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
                return false;

        /* Merge only a directly following block in the same state. */
        if (lblk != map->m_lblk + map->m_len ||
            (bh->b_state & BH_FLAGS) != map->m_flags)
                return false;

        map->m_len++;
        return true;
}

/*
 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 *
 * @mpd - extent of blocks for mapping
 * @head - the first buffer in the page
 * @bh - buffer we should start processing from
 * @lblk - logical number of the block in the file corresponding to @bh
 *
 * Walk through page buffers from @bh up to @head (exclusive) and either
 * submit the page for IO if all buffers in this page were mapped and there's
 * no accumulated extent of buffers to map, or add buffers in the page to the
 * extent of buffers to map. The function returns 1 if the caller can continue
 * by processing the next page, 0 if it should stop adding buffers to the
 * extent to map because we cannot extend it anymore. It can also return a
 * value < 0 in case of error during IO submission.
 *
 * mpd->scanned_until_end is set when the walk reaches the last block
 * covered by i_size.
 */
static int mpage_process_page_bufs(struct mpage_da_data *mpd,
                                   struct buffer_head *head,
                                   struct buffer_head *bh,
                                   ext4_lblk_t lblk)
{
        struct inode *inode = mpd->inode;
        int err;
        /* Number of blocks needed to cover i_size, rounded up. */
        ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
                                                        >> inode->i_blkbits;

        /* While verity is in progress, do not stop at i_size. */
        if (ext4_verity_in_progress(inode))
                blocks = EXT_MAX_BLOCKS;

        do {
                BUG_ON(buffer_locked(bh));

                if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
                        /* Found extent to map? */
                        if (mpd->map.m_len)
                                return 0;
                        /* Buffer needs mapping and handle is not started? */
                        if (!mpd->do_map)
                                return 0;
                        /* Everything mapped so far and we hit EOF */
                        break;
                }
        } while (lblk++, (bh = bh->b_this_page) != head);
        /* So far everything mapped? Submit the page for IO. */
        if (mpd->map.m_len == 0) {
                err = mpage_submit_page(mpd, head->b_page);
                if (err < 0)
                        return err;
        }
        if (lblk >= blocks) {
                mpd->scanned_until_end = 1;
                return 0;
        }
        return 1;
}

/*
 * mpage_process_page - update page buffers corresponding to changed extent
 *                      and may submit fully mapped page for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 * @page - page whose buffers are scanned
 * @m_lblk - in: logical block of the page's first buffer; out: logical
 *           block at which the scan stopped
 * @m_pblk - in: next physical block to assign; out: updated value
 * @map_bh - set on return: true if this page may require further mapping,
 *           false if the whole page was processed
 *
 * Scan given page buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given page is not fully mapped, we update @mpd->map to the next
 * extent in the given page that needs mapping & return @map_bh as true.
 *
 * Returns 0 on success or a negative error from
 * mpage_process_page_bufs() / ext4_alloc_io_end_vec().
 */
static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
                              ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
                              bool *map_bh)
{
        struct buffer_head *head, *bh;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        ext4_lblk_t lblk = *m_lblk;
        ext4_fsblk_t pblock = *m_pblk;
        int err = 0;
        int blkbits = mpd->inode->i_blkbits;
        /* Bytes mapped in this page, not yet added to io_end_vec->size. */
        ssize_t io_end_size = 0;
        struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

        bh = head = page_buffers(page);
        do {
                /*
                 * Buffer before the mapped extent: skip it.  'continue' in
                 * a do/while jumps to the loop condition, so lblk and bh
                 * still advance.
                 */
                if (lblk < mpd->map.m_lblk)
                        continue;
                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
                        /*
                         * Buffer after end of mapped extent.
                         * Find next buffer in the page to map.
                         */
                        mpd->map.m_len = 0;
                        mpd->map.m_flags = 0;
                        io_end_vec->size += io_end_size;
                        io_end_size = 0;

                        err = mpage_process_page_bufs(mpd, head, bh, lblk);
                        /* Positive means "continue"; only errors matter. */
                        if (err > 0)
                                err = 0;
                        /*
                         * The next extent to map starts past lblk: begin
                         * a new io_end vector at its file offset.
                         */
                        if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
                                io_end_vec = ext4_alloc_io_end_vec(io_end);
                                if (IS_ERR(io_end_vec)) {
                                        err = PTR_ERR(io_end_vec);
                                        goto out;
                                }
                                io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
                        }
                        *map_bh = true;
                        goto out;
                }
                /* Buffer inside the mapped extent: give it its block. */
                if (buffer_delay(bh)) {
                        clear_buffer_delay(bh);
                        bh->b_blocknr = pblock++;
                }
                clear_buffer_unwritten(bh);
                io_end_size += (1 << blkbits);
        } while (lblk++, (bh = bh->b_this_page) != head);

        io_end_vec->size += io_end_size;
        io_end_size = 0;
        *map_bh = false;
out:
        *m_lblk = lblk;
        *m_pblk = pblock;
        return err;
}

/*
 * mpage_map_and_submit_buffers - update buffers corresponding to changed
 *                                extent and submit fully mapped pages for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 *
 * Scan buffers corresponding to changed extent (we expect corresponding pages
 * to be already locked) and update buffer state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits,
 * and mark buffers as uninit when we perform writes to unwritten extents
 * and do extent conversion after IO is finished. If the last page is not fully
 * mapped, we update @mpd->map to the next extent in the last page that needs
 * mapping. Otherwise we submit the page for IO.
 *
 * Returns 0 on success or a negative error from mpage_process_page() or
 * mpage_submit_page().
 */
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
        struct inode *inode = mpd->inode;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        struct pagevec pvec;
        pgoff_t first_pg, last_pg;
        ext4_lblk_t lblk;
        ext4_fsblk_t pblock;
        bool map_bh = false;
        int found, n;
        int err;

        /* Page range covered by the extent, and the block it starts at. */
        first_pg = mpd->map.m_lblk >> bpp_bits;
        last_pg = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
        lblk = first_pg << bpp_bits;
        pblock = mpd->map.m_pblk;

        pagevec_init(&pvec);
        while (first_pg <= last_pg) {
                found = pagevec_lookup_range(&pvec, inode->i_mapping,
                                             &first_pg, last_pg);
                if (!found)
                        break;

                for (n = 0; n < found; n++) {
                        struct page *pg = pvec.pages[n];

                        err = mpage_process_page(mpd, pg, &lblk, &pblock,
                                                 &map_bh);
                        /* Page fully mapped - let IO run! */
                        if (err >= 0 && !map_bh)
                                err = mpage_submit_page(mpd, pg);
                        /*
                         * map_bh set means the page may require further bh
                         * mapping, or maybe it was already submitted for
                         * IO, so return to let the caller map the next
                         * extent.  Errors end the scan as well.
                         */
                        if (err < 0 || map_bh)
                                goto release;
                }
                pagevec_release(&pvec);
        }

        /* Extent fully mapped and matches with page boundary. We are done. */
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;

release:
        pagevec_release(&pvec);
        return err;
}

/*
 * mpage_map_one_extent - map the extent described by mpd->map
 *
 * @handle - handle for journal operations
 * @mpd - writeback state holding the extent to map
 *
 * Returns 0 on success or the negative error from ext4_map_blocks().
 */
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
        struct ext4_map_blocks *map = &mpd->map;
        struct inode *inode = mpd->inode;
        int flags = EXT4_GET_BLOCKS_CREATE |
                    EXT4_GET_BLOCKS_METADATA_NOFAIL |
                    EXT4_GET_BLOCKS_IO_SUBMIT;
        int nolock;
        int rc;

        trace_ext4_da_write_pages_extent(inode, map);

        /*
         * ext4_map_blocks() is called to allocate any delayed allocation
         * blocks, or to convert an unwritten extent to be initialized (when
         * we have written into one or more preallocated blocks).  We may
         * need more metadata blocks than previously reserved, but we must
         * not fail: we are in writeback, nothing can be done about it and
         * it might result in data loss.  So reserved blocks are used to
         * allocate metadata if possible.
         *
         * The magic EXT4_GET_BLOCKS_DELALLOC_RESERVE is passed when the
         * blocks in question are delalloc blocks.  It indicates that the
         * blocks and quotas have already been checked when the data was
         * copied into the page cache.
         */
        nolock = ext4_should_dioread_nolock(inode);
        if (nolock)
                flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (map->m_flags & BIT(BH_Delay))
                flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        rc = ext4_map_blocks(handle, inode, map, flags);
        if (rc < 0)
                return rc;

        if (nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
                ext4_io_end_t *io_end = mpd->io_submit.io_end;

                /* Move the reserved handle over to the io_end, once. */
                if (!io_end->handle && ext4_handle_valid(handle)) {
                        io_end->handle = handle->h_rsv_handle;
                        handle->h_rsv_handle = NULL;
                }
                ext4_set_io_unwritten_flag(inode, io_end);
        }

        BUG_ON(map->m_len == 0);
        return 0;
}

/*
 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
 *                                 mpd->len and submit pages underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
 * delayed, blocks are allocated, if it is unwritten, we may need to convert
 * them to initialized or split the described range from larger unwritten
 * extent. Note that we need not map all the described range since allocation
 * can return fewer blocks or the range is covered by more unwritten extents. We
 * cannot map more because we are limited by reserved transaction credits. On
 * the other hand we always make sure that the last touched page is fully
 * mapped so that it can be written out (and thus forward progress is
 * guaranteed). After mapping we submit all mapped pages for IO.
 *
 * Returns 0 on success or a negative errno. If a transient mapping error
 * (-ENOMEM, or -ENOSPC while free clusters remain) or a submission error
 * occurs after at least one extent was mapped, the on-disk size is still
 * updated before that error is returned.
 */
static int mpage_map_and_submit_extent(handle_t *handle,
                                       struct mpage_da_data *mpd,
                                       bool *give_up_on_write)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        int err;
        loff_t disksize;
        /* Becomes 1 once at least one extent has been mapped successfully */
        int progress = 0;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        struct ext4_io_end_vec *io_end_vec;

        /* New io_end vector starting at the byte offset of the extent */
        io_end_vec = ext4_alloc_io_end_vec(io_end);
        if (IS_ERR(io_end_vec))
                return PTR_ERR(io_end_vec);
        io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        /* Shut down or aborted fs: give up without logging */
                        if (ext4_forced_shutdown(EXT4_SB(sb)) ||
                            ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
                         * In the case of ENOSPC, if ext4_count_free_clusters()
                         * is non-zero, a commit should free up blocks.
                         */
                        if ((err == -ENOMEM) ||
                            (err == -ENOSPC && ext4_count_free_clusters(sb))) {
                                /*
                                 * Part of the extent was already mapped and
                                 * submitted, so i_disksize must still be
                                 * brought up to date before returning.
                                 */
                                if (progress)
                                        goto update_disksize;
                                return err;
                        }
                        ext4_msg(sb, KERN_CRIT,
                                 "Delayed block allocation failed for "
                                 "inode %lu at logical offset %llu with"
                                 " max blocks %u with error %d",
                                 inode->i_ino,
                                 (unsigned long long)map->m_lblk,
                                 (unsigned)map->m_len, -err);
                        ext4_msg(sb, KERN_CRIT,
                                 "This should not happen!! Data will "
                                 "be lost\n");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(inode);
                invalidate_dirty_pages:
                        /* Fatal: tell the caller to discard the dirty pages */
                        *give_up_on_write = true;
                        return err;
                }
                progress = 1;
                /*
                 * Update buffer state, submit mapped pages, and get us new
                 * extent to map
                 */
                err = mpage_map_and_submit_buffers(mpd);
                if (err < 0)
                        goto update_disksize;
        } while (map->m_len);

update_disksize:
        /*
         * Update on-disk size after IO is submitted.  Races with
         * truncate are avoided by checking i_size under i_data_sem.
         */
        /* Candidate size: byte offset of mpd->first_page */
        disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
        /* Lockless pre-check; the comparison is repeated under i_data_sem */
        if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
                int err2;
                loff_t i_size;

                down_write(&EXT4_I(inode)->i_data_sem);
                i_size = i_size_read(inode);
                /* Never let the on-disk size exceed the in-core i_size */
                if (disksize > i_size)
                        disksize = i_size;
                if (disksize > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = disksize;
                up_write(&EXT4_I(inode)->i_data_sem);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (err2) {
                        ext4_error_err(inode->i_sb, -err2,
                                       "Failed to mark inode %lu dirty",
                                       inode->i_ino);
                }
                /* An earlier error takes precedence over err2 */
                if (!err)
                        err = err2;
        }
        return err;
}

/*
 * Work out how many journal credits a single ext4_writepages() iteration
 * must reserve. One iteration maps an extent of at most
 * MAX_WRITEPAGES_EXTENT_LEN blocks and then completes the mapping of the
 * final, partially covered page. With bpp blocks per page that is at most
 * MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 blocks spread over bpp extents.
 */
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
        int blocks_per_page = ext4_journal_blocks_per_page(inode);
        int max_blocks = MAX_WRITEPAGES_EXTENT_LEN + blocks_per_page - 1;

        return ext4_meta_trans_blocks(inode, max_blocks, blocks_per_page);
}

/*
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *                                  and underlying extent to map
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. When we find a page which isn't mapped we start accumulating
 * extent of buffers underlying these pages that needs mapping (formed by
 * either delayed or unwritten buffers). We also lock the pages containing
 * these buffers. The extent found is returned in @mpd structure (starting at
 * mpd->lblk with length mpd->len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem like an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
 *
 * Returns 0 when the scan stopped without error, or the negative value
 * returned by mpage_process_page_bufs(). mpd->scanned_until_end is set only
 * when the lookup loop ran to completion.
 */
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
        struct address_space *mapping = mpd->inode->i_mapping;
        struct pagevec pvec;
        unsigned int nr_pages;
        /* Local page budget; only enforced for WB_SYNC_NONE below */
        long left = mpd->wbc->nr_to_write;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
        xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
        struct buffer_head *head;

        /*
         * Integrity sync and tagged writeback look up pages by the TOWRITE
         * tag, everything else by the DIRTY tag.
         */
        if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;

        pagevec_init(&pvec);
        /* Start with an empty extent */
        mpd->map.m_len = 0;
        mpd->next_page = index;
        while (index <= end) {
                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
                                tag);
                if (nr_pages == 0)
                        break;

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];

                        /*
                         * Accumulated enough dirty pages? This doesn't apply
                         * to WB_SYNC_ALL mode. For integrity sync we have to
                         * keep going because someone may be concurrently
                         * dirtying pages, and we might have synced a lot of
                         * newly appeared dirty pages, but have not synced all
                         * of the old dirty pages.
                         */
                        if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
                                goto out;

                        /* If we can't merge this page, we are done. */
                        if (mpd->map.m_len > 0 && mpd->next_page != page->index)
                                goto out;

                        lock_page(page);
                        /*
                         * If the page is no longer dirty, or its mapping no
                         * longer corresponds to inode we are writing (which
                         * means it has been truncated or invalidated), or the
                         * page is already under writeback and we are not doing
                         * a data integrity writeback, skip the page
                         */
                        if (!PageDirty(page) ||
                            (PageWriteback(page) &&
                             (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
                            unlikely(page->mapping != mapping)) {
                                unlock_page(page);
                                continue;
                        }

                        wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));

                        /*
                         * Should never happen but for buggy code in
                         * other subsystems that call
                         * set_page_dirty() without properly warning
                         * the file system first.  See [1] for more
                         * information.
                         *
                         * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
                         */
                        if (!page_has_buffers(page)) {
                                ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
                                ClearPageDirty(page);
                                unlock_page(page);
                                continue;
                        }

                        /* First page of a new extent fixes mpd->first_page */
                        if (mpd->map.m_len == 0)
                                mpd->first_page = page->index;
                        mpd->next_page = page->index + 1;
                        /* Add all dirty buffers to mpd */
                        /* Logical block number of the first block in the page */
                        lblk = ((ext4_lblk_t)page->index) <<
                                (PAGE_SHIFT - blkbits);
                        head = page_buffers(page);
                        err = mpage_process_page_bufs(mpd, head, head, lblk);
                        /*
                         * A non-positive result ends the scan (0 without
                         * error, negative with one); a positive result lets
                         * the walk continue with the next page.
                         */
                        if (err <= 0)
                                goto out;
                        err = 0;
                        left--;
                }
                pagevec_release(&pvec);
                cond_resched();
        }
        /* The lookup loop ran to completion without an early exit */
        mpd->scanned_until_end = 1;
        return 0;
out:
        /* Early exit from inside the batch: drop the pagevec references */
        pagevec_release(&pvec);
        return err;
}

/*
 * ext4_writepages - write back dirty pages of an address space
 *
 * @mapping - address space whose pages are to be written
 * @wbc - writeback control giving the range, sync mode and page budget
 *
 * Holds sbi->s_writepages_rwsem for reading for the duration of the call.
 * Inodes that journal their data are passed to generic_writepages(). For all
 * other inodes the range is first scanned with mpd.do_map == 0 so that pages
 * needing no block mapping are submitted without a transaction. After that,
 * each loop iteration starts a journal handle, collects one extent with
 * mpage_prepare_extent_to_map() and maps/submits it with
 * mpage_map_and_submit_extent().
 *
 * Returns 0 on success or a negative errno: -EIO when the filesystem was
 * forcibly shut down on entry, -EROFS when it is shut down or aborted later,
 * -ENOMEM when an io_end cannot be allocated, or an error propagated from the
 * helpers called.
 */
static int ext4_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        pgoff_t        writeback_index = 0;
        /* Starting budget, used only for the result tracepoint */
        long nr_to_write = wbc->nr_to_write;
        int range_whole = 0;
        /* Cleared when a cyclic scan starts from a non-zero index */
        int cycled = 1;
        handle_t *handle = NULL;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        struct blk_plug plug;
        bool give_up_on_write = false;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        percpu_down_read(&sbi->s_writepages_rwsem);
        trace_ext4_writepages(inode, wbc);

        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                goto out_writepages;

        if (ext4_should_journal_data(inode)) {
                ret = generic_writepages(mapping, wbc);
                goto out_writepages;
        }

        /*
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
         * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
        if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
                     ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
                ret = -EROFS;
                goto out_writepages;
        }

        /*
         * If we have inline data and arrive here, it means that
         * we will soon create the block for the 1st page, so
         * we'd better clear the inline data here.
         */
        if (ext4_has_inline_data(inode)) {
                /* Just inode will be modified... */
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out_writepages;
                }
                BUG_ON(ext4_test_inode_state(inode,
                                EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
        }

        if (ext4_should_dioread_nolock(inode)) {
                /*
                 * We may need to convert up to one extent per block in
                 * the page and we may dirty the inode.
                 */
                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
                                                PAGE_SIZE >> inode->i_blkbits);
        }

        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;

        if (wbc->range_cyclic) {
                /*
                 * Resume from where the previous cyclic pass stopped. If
                 * that is not page 0, a second pass over the pages before
                 * writeback_index may follow (see the code after "unplug").
                 */
                writeback_index = mapping->writeback_index;
                if (writeback_index)
                        cycled = 0;
                mpd.first_page = writeback_index;
                /* NOTE(review): -1 presumably means "no upper bound" - confirm */
                mpd.last_page = -1;
        } else {
                mpd.first_page = wbc->range_start >> PAGE_SHIFT;
                mpd.last_page = wbc->range_end >> PAGE_SHIFT;
        }

        mpd.inode = inode;
        mpd.wbc = wbc;
        ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
        blk_start_plug(&plug);

        /*
         * First writeback pages that don't need mapping - we can avoid
         * starting a transaction unnecessarily and also avoid being blocked
         * in the block layer on device congestion while having transaction
         * started.
         */
        mpd.do_map = 0;
        mpd.scanned_until_end = 0;
        mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd.io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(&mpd);
        /* Unlock pages we didn't use */
        mpage_release_unused_pages(&mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd.io_submit);
        ext4_put_io_end_defer(mpd.io_submit.io_end);
        mpd.io_submit.io_end = NULL;
        if (ret < 0)
                goto unplug;

        /* Mapping passes: one extent and one journal handle per iteration */
        while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd.io_submit.io_end) {
                        ret = -ENOMEM;
                        break;
                }

                /*
                 * We have two constraints: We find one extent to map and we
                 * must always write out whole page (makes a difference when
                 * blocksize < pagesize) so that we don't block on IO when we
                 * try to write out the rest of the page. Journalled mode is
                 * not supported by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);

                /* start a new transaction */
                handle = ext4_journal_start_with_reserve(inode,
                                EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        /* Release allocated io_end */
                        ext4_put_io_end(mpd.io_submit.io_end);
                        mpd.io_submit.io_end = NULL;
                        break;
                }
                mpd.do_map = 1;

                trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
                ret = mpage_prepare_extent_to_map(&mpd);
                /* Only map when an extent was actually accumulated */
                if (!ret && mpd.map.m_len)
                        ret = mpage_map_and_submit_extent(handle, &mpd,
                                        &give_up_on_write);
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released.  In that
                 * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
                 */
                if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
                        ext4_journal_stop(handle);
                        handle = NULL;
                        mpd.do_map = 0;
                }
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(&mpd, give_up_on_write);
                /* Submit prepared bio */
                ext4_io_submit(&mpd.io_submit);

                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
                 * we are still holding the transaction as we can
                 * release the last reference to io_end which may end
                 * up doing unwritten extent conversion.
                 */
                if (handle) {
                        ext4_put_io_end_defer(mpd.io_submit.io_end);
                        ext4_journal_stop(handle);
                } else
                        ext4_put_io_end(mpd.io_submit.io_end);
                mpd.io_submit.io_end = NULL;

                if (ret == -ENOSPC && sbi->s_journal) {
                        /*
                         * Commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
                        ret = 0;
                        continue;
                }
                /* Fatal error - ENOMEM, EIO... */
                if (ret)
                        break;
        }
unplug:
        blk_finish_plug(&plug);
        /*
         * A cyclic scan that started in the middle of the file and still has
         * budget left makes one more pass over pages
         * [0, writeback_index - 1].
         */
        if (!ret && !cycled && wbc->nr_to_write > 0) {
                cycled = 1;
                mpd.last_page = writeback_index - 1;
                mpd.first_page = 0;
                goto retry;
        }

        /* Update index */
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
                mapping->writeback_index = mpd.first_page;

out_writepages:
        /* Report how much of the budget this call consumed */
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        percpu_up_read(&sbi->s_writepages_rwsem);
        return ret;
}

/*
 * Writeback entry point for DAX mappings: delegates the whole range to
 * dax_writeback_mapping_range() while holding s_writepages_rwsem for reading.
 * Returns -EIO if the filesystem has been forcibly shut down, otherwise the
 * result of the DAX writeback call.
 */
static int ext4_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        long budget_at_entry = wbc->nr_to_write;
        int ret;

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

        percpu_down_read(&sbi->s_writepages_rwsem);
        trace_ext4_writepages(inode, wbc);

        ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);

        /* Trace the number of pages consumed from the budget */
        trace_ext4_writepages_result(inode, wbc, ret,
                                     budget_at_entry - wbc->nr_to_write);
        percpu_up_read(&sbi->s_writepages_rwsem);

        return ret;
}

/*
 * Decide whether a write should bypass delayed allocation.
 *
 * Delalloc needs accurate free-space accounting, but the per-CPU counters
 * can drift by up to percpu_counter_batch per CPU before the global value is
 * updated. When free space gets close to that error range we therefore
 * switch to non-delalloc mode.
 *
 * Returns 1 when delalloc should not be used, 0 otherwise. As a side effect,
 * writeback is kicked once at least half of the free clusters are dirty.
 */
static int ext4_nonda_switch(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        s64 nr_free, nr_dirty;

        nr_free = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        nr_dirty = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);

        /* Start pushing delalloc when 1/2 of free blocks are dirty. */
        if (nr_dirty && (nr_free < 2 * nr_dirty))
                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);

        /* Free count is below 150% of the dirty count */
        if (2 * nr_free < 3 * nr_dirty)
                return 1;

        /* Free count is below the dirty count plus the watermark */
        if (nr_free < (nr_dirty + EXT4_FREECLUSTERS_WATERMARK))
                return 1;

        return 0;
}

/*
 * Journal credits for a delalloc write: one for the inode update, plus one
 * more when the superblock might have to be updated to set LARGE_FILE (the
 * feature is not yet enabled and the write ends beyond 0x7fffffff).
 */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
        if (likely(ext4_has_feature_large_file(inode->i_sb)))
                return 1;

        return (pos + len <= 0x7fffffffULL) ? 1 : 2;
}

/*
 * ext4_da_write_begin - prepare a page for a buffered delayed-allocation write
 *
 * @file, @mapping, @pos, @len, @flags - passed through to the helpers below
 * @pagep - on success, set to the locked page covering @pos
 * @fsdata - set to FALL_BACK_TO_NONDELALLOC when the write is handed to
 *           ext4_write_begin(), otherwise to 0 (the inline-data helper also
 *           receives this pointer)
 *
 * Falls back to ext4_write_begin() when ext4_nonda_switch() asks for it, for
 * symlinks, and while verity is in progress. Otherwise the page is grabbed,
 * a journal handle is started and the block-prepare helper is run with
 * ext4_da_get_block_prep. On success the page is left locked and the handle
 * is not stopped in this function.
 *
 * Returns the non-negative result of the block-prepare helper on success, or
 * a negative errno.
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
{
        int ret, retries = 0;
        struct page *page;
        pgoff_t index;
        struct inode *inode = mapping->host;
        handle_t *handle;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        /* Page cache index of the page containing @pos */
        index = pos >> PAGE_SHIFT;

        if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
            ext4_verity_in_progress(inode)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
                return ext4_write_begin(file, mapping, pos,
                                        len, flags, pagep, fsdata);
        }
        *fsdata = (void *)0;
        trace_ext4_da_write_begin(inode, pos, len, flags);

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_da_write_inline_data_begin(mapping, inode,
                                                      pos, len, flags,
                                                      pagep, fsdata);
                if (ret < 0)
                        return ret;
                /*
                 * NOTE(review): ret == 1 presumably means the helper fully
                 * prepared the write as inline data - confirm against
                 * ext4_da_write_inline_data_begin().
                 */
                if (ret == 1)
                        return 0;
        }

        /*
         * grab_cache_page_write_begin() can take a long time if the
         * system is thrashing due to memory pressure, or if the page
         * is being written back.  So grab it first before we start
         * the transaction handle.  This also allows us to allocate
         * the page (if needed) without using GFP_NOFS.
         */
retry_grab:
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        /* Keep the reference but drop the lock while starting the handle */
        unlock_page(page);

        /*
         * With delayed allocation, we don't log the i_disksize update
         * if there is delayed block allocation. But we still need
         * to journal the i_disksize update when a write to the end
         * of the file lands in an already mapped buffer.
         */
retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                ext4_da_write_credits(inode, pos, len));
        if (IS_ERR(handle)) {
                put_page(page);
                return PTR_ERR(handle);
        }

        lock_page(page);
        if (page->mapping != mapping) {
                /* The page got truncated from under us */
                unlock_page(page);
                put_page(page);
                ext4_journal_stop(handle);
                goto retry_grab;
        }
        /* In case writeback began while the page was unlocked */
        wait_for_stable_page(page);

#ifdef CONFIG_FS_ENCRYPTION
        ret = ext4_block_write_begin(page, pos, len,
                                     ext4_da_get_block_prep);
#else
        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
#endif
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);

                /*
                 * On ENOSPC retry with a fresh handle; the page reference
                 * is still held, so only the journal start is repeated.
                 */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;

                put_page(page);
                return ret;
        }

        /* Success: hand the still-locked page to the caller */
        *pagep = page;
        return ret;
}

/*
 * Decide whether i_disksize should be updated for a write that reaches the
 * end of the file without needing block allocation.
 *
 * @offset is a byte offset inside @page. Returns 1 when the buffer covering
 * that offset is mapped and is neither delayed nor unwritten, 0 otherwise.
 */
static int ext4_da_should_update_i_disksize(struct page *page,
                                            unsigned long offset)
{
        struct inode *inode = page->mapping->host;
        struct buffer_head *bh = page_buffers(page);
        unsigned int steps = offset >> inode->i_blkbits;

        /* Advance along the page's buffer ring to the one covering @offset */
        while (steps--)
                bh = bh->b_this_page;

        return buffer_mapped(bh) && !buffer_delay(bh) &&
               !buffer_unwritten(bh);
}

/*
 * ext4_da_write_end - complete a buffered delayed-allocation write
 *
 * @fsdata encodes the write mode: FALL_BACK_TO_NONDELALLOC hands the whole
 * operation to ext4_write_end(). Otherwise i_disksize is updated if needed,
 * the write is finished through either ext4_da_write_inline_data_end() or
 * generic_write_end(), and the current journal handle is stopped.
 *
 * Returns the value produced by the write-end helper, or the error from
 * ext4_journal_stop() when that helper returned 0.
 */
static int ext4_da_write_end(struct file *file,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned copied,
                             struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
        /* The handle is not passed in; use the task's current one */
        handle_t *handle = ext4_journal_current_handle();
        loff_t new_i_size;
        unsigned long start, end;
        int write_mode = (int)(unsigned long)fsdata;

        if (write_mode == FALL_BACK_TO_NONDELALLOC)
                return ext4_write_end(file, mapping, pos,
                                      len, copied, page, fsdata);

        trace_ext4_da_write_end(inode, pos, len, copied);
        /* In-page offsets of the first and last byte copied */
        start = pos & (PAGE_SIZE - 1);
        end = start + copied - 1;

        /*
         * Since we are holding inode lock, we are sure i_disksize <=
         * i_size. We also know that if i_disksize < i_size, there are
         * delalloc writes pending in the range upto i_size. If the end of
         * the current write is <= i_size, there's no need to touch
         * i_disksize since writeback will push i_disksize upto i_size
         * eventually. If the end of the current write is > i_size and
         * inside an allocated block (ext4_da_should_update_i_disksize()
         * check), we need to update i_disksize here as neither
         * ext4_writepage() nor certain ext4_writepages() paths not
         * allocating blocks update i_disksize.
         *
         * Note that we defer inode dirtying to generic_write_end() /
         * ext4_da_write_inline_data_end().
         */
        new_i_size = pos + copied;
        /* "end" is only meaningful, and only used, when copied != 0 */
        if (copied && new_i_size > inode->i_size) {
                if (ext4_has_inline_data(inode) ||
                    ext4_da_should_update_i_disksize(page, end))
                        ext4_update_i_disksize(inode, new_i_size);
        }

        if (write_mode != CONVERT_INLINE_DATA &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
            ext4_has_inline_data(inode))
                ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
                                                     page);
        else
                ret = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);

        /*
         * NOTE(review): copied is unsigned and is overwritten with ret here,
         * so the final "ret ? ret : copied" can only yield ret (or ret2) -
         * confirm whether a separate error check was intended.
         */
        copied = ret;
        ret2 = ext4_journal_stop(handle);
        /* A journal-stop error is reported only if ret is still 0 */
        if (unlikely(ret2 && !ret))
                ret = ret2;

        return ret ? ret : copied;
}

/*
 * Force every delayed-allocation block of @inode to be allocated.
 *
 * Returns 0 straight away when the inode has no reserved data blocks,
 * otherwise the result of filemap_flush() on the inode's mapping.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
        trace_ext4_alloc_da_blocks(inode);

        /*
         * Keep it simple for now and call filemap_flush(). Besides mapping
         * the blocks it also starts writing the data, which is not strictly
         * necessary (and not even desirable for laptop_mode users), but it
         * does not wait for that I/O to complete.
         *
         * Avoiding the I/O would mean replicating the code paths in:
         *
         * ext4_writepages() ->
         *    write_cache_pages() ---> (via passed in callback function)
         *        __mpage_da_writepage() -->
         *           mpage_add_bh_to_extent()
         *           mpage_da_map_blocks()
         *
         * write_cache_pages(), located in mm/page-writeback.c, marks pages
         * clean in preparation for I/O, which is unwanted when no I/O is
         * planned. Calling it and then redirtying every page with
         * redirty_page_for_writepage() would be ugly in the extreme. The
         * alternative is a simplified copy of those functions that only
         * collects contiguous logical block extents, calls the multi-block
         * allocator and updates the buffer heads with the allocations.
         */
        if (EXT4_I(inode)->i_reserved_data_blocks)
                return filemap_flush(inode->i_mapping);

        return 0;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;
        sector_t result = 0;

        inode_lock_shared(inode);

        /*
         * The FIBMAP ioctl can bring us here for an inline-data file;
         * such files are answered with block 0.
         */
        if (ext4_has_inline_data(inode))
                goto unlock;

        /*
         * With delalloc, sync the file first so that blocks have been
         * allocated for it before we look them up.
         */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
            test_opt(inode->i_sb, DELALLOC))
                filemap_write_and_wait(mapping);

        if (EXT4_JOURNAL(inode) &&
            ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
                journal_t *jnl;
                int flush_err;

                /*
                 * Flushing the whole journal is REALLY heavyweight, but
                 * bmap on a dirty file is expected to be extremely rare:
                 * it should only happen when lilo or swapon is run on a
                 * freshly made file.
                 *
                 * (bmap requires CAP_SYS_RAWIO, so this is not an
                 * unprivileged-user DoS vector --- we would be in trouble
                 * if mortal users could trigger this path at will.)
                 *
                 * NB. EXT4_STATE_JDATA is only ever set on regular files.
                 * Anyone who bmaps a directory or symlink and is confused
                 * because the buffer has not reached the disk yet deserves
                 * everything they get.
                 */
                ext4_clear_inode_state(inode, EXT4_STATE_JDATA);

                jnl = EXT4_JOURNAL(inode);
                jbd2_journal_lock_updates(jnl);
                flush_err = jbd2_journal_flush(jnl);
                jbd2_journal_unlock_updates(jnl);

                /* A failed flush is reported to the caller as block 0. */
                if (flush_err)
                        goto unlock;
        }

        result = iomap_bmap(mapping, block, &ext4_iomap_ops);

unlock:
        inode_unlock_shared(inode);
        return result;
}

/*
 * Read a single page.  Inodes with inline data are first handed to
 * ext4_readpage_inline(); any result other than -EAGAIN is returned as is.
 * Everything else (no inline data, or -EAGAIN from the inline reader) goes
 * through ext4_mpage_readpages().  @file is not used.
 */
static int ext4_readpage(struct file *file, struct page *page)
{
        struct inode *inode = page->mapping->host;
        int rc;

        trace_ext4_readpage(page);

        if (ext4_has_inline_data(inode)) {
                rc = ext4_readpage_inline(inode, page);
                if (rc != -EAGAIN)
                        return rc;
        }

        return ext4_mpage_readpages(inode, NULL, page);
}

/*
 * Readahead entry point: forwards the request to ext4_mpage_readpages()
 * unless the inode carries inline data, in which case nothing is issued.
 */
static void ext4_readahead(struct readahead_control *rac)
{
        struct inode *inode = rac->mapping->host;

        /* Inline-data files need no readahead. */
        if (!ext4_has_inline_data(inode))
                ext4_mpage_readpages(inode, rac, NULL);
}

/*
 * Invalidate @length bytes of @page starting at @offset by delegating to
 * block_invalidatepage().
 */
static void ext4_invalidatepage(struct page *page, unsigned int offset,
                                unsigned int length)
{
        trace_ext4_invalidatepage(page, offset, length);

        /*
         * No journalling happens on data buffers when this function is
         * used, so a jbd-managed first buffer is unexpected: warn about it.
         */
        if (page_has_buffers(page))
                WARN_ON(buffer_jbd(page_buffers(page)));

        block_invalidatepage(page, offset, length);
}

/*
 * Invalidate part or all of a page for the journalled-data case and
 * return whatever jbd2_journal_invalidatepage() reports.
 */
static int __ext4_journalled_invalidatepage(struct page *page,
                                            unsigned int offset,
                                            unsigned int length)
{
        journal_t *jnl = EXT4_JOURNAL(page->mapping->host);
        int rc;

        trace_ext4_journalled_invalidatepage(page, offset, length);

        /*
         * When the whole page goes away (a full truncate) the pending
         * dirtying recorded in the Checked bit is simply forgotten.
         */
        if (!offset && length == PAGE_SIZE)
                ClearPageChecked(page);

        rc = jbd2_journal_invalidatepage(jnl, page, offset, length);
        return rc;
}

/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
                                           unsigned int offset,
                                           unsigned int length)
{
        int err = __ext4_journalled_invalidatepage(page, offset, length);

        /* This hook returns void, so a failure can only be warned about. */
        WARN_ON(err < 0);
}

/*
 * Try to release the buffers attached to @page.  Returns 0 without trying
 * when the page is marked Checked; otherwise returns the result of the
 * journal-aware or the plain buffer release helper.  @wait is not used.
 */
static int ext4_releasepage(struct page *page, gfp_t wait)
{
        journal_t *jnl = EXT4_JOURNAL(page->mapping->host);

        trace_ext4_releasepage(page);

        /* Page has dirty journalled data -> cannot release */
        if (PageChecked(page))
                return 0;

        /* No journal on this filesystem: plain buffer release. */
        if (!jnl)
                return try_to_free_buffers(page);

        return jbd2_journal_try_to_free_buffers(jnl, page);
}

/*
 * Decide whether @inode counts as dirty for datasync purposes.
 *
 * Without a journal: true if the mapping's private_list is non-empty,
 * otherwise the I_DIRTY_DATASYNC bit of i_state decides.
 *
 * With a journal: false once the transaction recorded in i_datasync_tid
 * has committed.  Otherwise true, except that with the JOURNAL_FAST_COMMIT
 * mount option the answer is whether the inode sits on a fast-commit list.
 */
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
        journal_t *jnl = EXT4_SB(inode->i_sb)->s_journal;

        if (!jnl) {
                /* Any metadata buffers to write? */
                if (!list_empty(&inode->i_mapping->private_list))
                        return true;
                return inode->i_state & I_DIRTY_DATASYNC;
        }

        if (jbd2_transaction_committed(jnl, EXT4_I(inode)->i_datasync_tid))
                return false;

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
                return true;

        return !list_empty(&EXT4_I(inode)->i_fc_list);
}

/*
 * Translate the result of a block mapping (@map) into @iomap.
 *
 * @offset and @length are the byte range originally requested; they are
 * only used to decide whether the range reaches past i_size.
 */
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
                           struct ext4_map_blocks *map, loff_t offset,
                           loff_t length)
{
        u8 blkbits = inode->i_blkbits;

        /* Devices and the byte range covered by the mapping. */
        iomap->bdev = inode->i_sb->s_bdev;
        iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
        iomap->offset = (u64) map->m_lblk << blkbits;
        iomap->length = (u64) map->m_len << blkbits;

        /*
         * Writes that span EOF might trigger an i_size update on
         * completion, so treat them as dirty for the purpose of O_DSYNC
         * even when no other metadata change is being made or is pending.
         */
        iomap->flags = 0;
        if (ext4_inode_datasync_dirty(inode) ||
            offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;

        if (map->m_flags & EXT4_MAP_NEW)
                iomap->flags |= IOMAP_F_NEW;

        /* Mapped blocks of a non-extent inode are flagged as merged. */
        if ((map->m_flags & EXT4_MAP_MAPPED) &&
            !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                iomap->flags |= IOMAP_F_MERGED;

        /* Neither mapped nor unwritten: report a hole. */
        if (!(map->m_flags & (EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED))) {
                iomap->type = IOMAP_HOLE;
                iomap->addr = IOMAP_NULL_ADDR;
                return;
        }

        /*
         * The flags passed to ext4_map_blocks() for direct I/O writes can
         * leave both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN set in m_flags.
         * For the ->end_io() handler to convert allocated unwritten extents
         * into written ones, the iomap type must then say "unwritten", so
         * EXT4_MAP_UNWRITTEN takes precedence here.
         */
        iomap->type = (map->m_flags & EXT4_MAP_UNWRITTEN) ?
                        IOMAP_UNWRITTEN : IOMAP_MAPPED;
        iomap->addr = (u64) map->m_pblk << blkbits;
}

/*
 * Map, allocating where needed, the blocks described by @map on behalf of
 * an iomap write.  Runs inside its own journal handle and retries while
 * the mapping fails with -ENOSPC and ext4_should_retry_alloc() agrees.
 *
 * Returns the ext4_map_blocks() result, -ENOTBLK when no create flags
 * were chosen and nothing was mapped, or the error from starting the
 * journal handle.
 */
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
                            unsigned int flags)
{
        u8 blkbits = inode->i_blkbits;
        int ret, credits, create_flags = 0, retries = 0;
        handle_t *handle;

        /*
         * Trim the mapping request to the maximum value that we can map at
         * once for direct I/O.
         */
        if (map->m_len > DIO_MAX_BLOCKS)
                map->m_len = DIO_MAX_BLOCKS;
        credits = ext4_chunk_trans_blocks(inode, map->m_len);

        do {
                /*
                 * Either we allocate blocks and then don't get an unwritten
                 * extent, in which case enough credits are reserved.  Or
                 * the blocks are already allocated and unwritten, and the
                 * extent conversion fits into the credits as well.
                 */
                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            credits);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                /*
                 * DAX and direct I/O are the only two operations that are
                 * currently supported with IOMAP_WRITE.
                 */
                WARN_ON(!(IS_DAX(inode) || (flags & IOMAP_DIRECT)));

                if (IS_DAX(inode)) {
                        create_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
                } else if (((loff_t)map->m_lblk << blkbits) >=
                           i_size_read(inode)) {
                        /*
                         * i_size rather than i_disksize is used because
                         * delalloc writeback can complete at any point
                         * during the I/O and push i_disksize out to i_size.
                         * That could be beyond where direct I/O is
                         * happening and thus expose allocated blocks to
                         * direct I/O reads.
                         */
                        create_flags = EXT4_GET_BLOCKS_CREATE;
                } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                        create_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
                }

                ret = ext4_map_blocks(handle, inode, map, create_flags);

                /*
                 * We cannot fill holes in indirect tree based inodes as
                 * that could expose stale data in the case of a crash.
                 * Use the magic error code to fall back to buffered I/O.
                 */
                if (!create_flags && !ret)
                        ret = -ENOTBLK;

                ext4_journal_stop(handle);
        } while (ret == -ENOSPC &&
                 ext4_should_retry_alloc(inode->i_sb, &retries));

        return ret;
}


/*
 * ->iomap_begin for ext4_iomap_ops: map the byte range
 * [@offset, @offset + @length) and describe it in @iomap.
 *
 * Returns -EINVAL when the starting block lies beyond
 * EXT4_MAX_LOGICAL_BLOCK, -ERANGE (with a one-time warning) for
 * inline-data inodes, a negative error from the mapping or allocation
 * step, and 0 on success.  @srcmap is not used.
 */
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;
        int ret;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
                return -ERANGE;

        /*
         * First logical block and number of blocks; the last block is
         * capped at EXT4_MAX_LOGICAL_BLOCK.
         */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        if (!(flags & IOMAP_WRITE)) {
                ret = ext4_map_blocks(NULL, inode, &map, 0);
        } else {
                int need_alloc = 1;

                /*
                 * For a write that stays within i_size, first check
                 * whether the blocks are already allocated.  If so there
                 * is no need to start a journal transaction and the
                 * mapping can be returned directly, which can help
                 * multi-threaded overwrite workloads in particular.
                 */
                if (offset + length <= i_size_read(inode)) {
                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
                                need_alloc = 0;
                }
                if (need_alloc)
                        ret = ext4_iomap_alloc(inode, &map, flags);
        }

        if (ret < 0)
                return ret;

        ext4_set_iomap(inode, iomap, &map, offset, length);

        return 0;
}

/*
 * ->iomap_begin for ext4_iomap_overwrite_ops: same as ext4_iomap_begin()
 * but with IOMAP_WRITE masked out of @flags.  Warns once if a successful
 * lookup yields anything other than an IOMAP_MAPPED mapping.
 */
static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
                loff_t length, unsigned flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        int ret;

        /*
         * Even for writes we don't need to allocate blocks, so pretend we
         * are reading; that saves the overhead of starting a transaction.
         */
        ret = ext4_iomap_begin(inode, offset, length, flags & ~IOMAP_WRITE,
                               iomap, srcmap);

        WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
        return ret;
}

/*
 * ->iomap_end hook.  Returns -ENOTBLK when nothing was written
 * (@written == 0) and @flags has IOMAP_WRITE or IOMAP_DIRECT set;
 * returns 0 otherwise.  @inode, @offset, @length and @iomap are unused.
 */
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
                          ssize_t written, unsigned flags, struct iomap *iomap)
{
        /* Some progress was made (or an error is reported): nothing to do. */
        if (written != 0)
                return 0;

        /*
         * Nothing reached the allocated blocks.  Return the magic error
         * code so that we fall back to buffered I/O and attempt to complete
         * the remainder of the I/O.  Any blocks that may have been
         * allocated in preparation for the direct I/O will be reused during
         * buffered I/O.
         *
         * NOTE(review): this mask test is true when *either* IOMAP_WRITE or
         * IOMAP_DIRECT is set, not only when both are -- confirm that this
         * is the intended condition.
         */
        if (flags & (IOMAP_WRITE | IOMAP_DIRECT))
                return -ENOTBLK;

        return 0;
}

/* iomap callbacks using the mapping path that may allocate on writes. */
const struct iomap_ops ext4_iomap_ops = {
        .iomap_begin    = ext4_iomap_begin,
        .iomap_end      = ext4_iomap_end,
};

/* iomap callbacks for overwrites: lookup only, never allocates. */
const struct iomap_ops ext4_iomap_overwrite_ops = {
        .iomap_begin    = ext4_iomap_overwrite_begin,
        .iomap_end      = ext4_iomap_end,
};

/*
 * Look for an extent matching ext4_es_is_delayed inside the block range
 * described by @map and trim @map->m_len accordingly.
 *
 * Returns true when the range starts inside such an extent; m_len is then
 * the number of blocks from m_lblk to the end of that extent.  Returns
 * false otherwise; if a matching extent begins later within the range,
 * m_len is shortened to stop just before it.
 */
static bool ext4_iomap_is_delalloc(struct inode *inode,
                                   struct ext4_map_blocks *map)
{
        struct extent_status es;
        ext4_lblk_t first = map->m_lblk;
        ext4_lblk_t last = first + map->m_len - 1;
        ext4_lblk_t skipped;

        ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
                                  first, last, &es);

        /* Nothing found, or what was found starts past the range. */
        if (es.es_len == 0 || es.es_lblk > last)
                return false;

        /* The extent starts after 'first': report only the gap before it. */
        if (es.es_lblk > first) {
                map->m_len = es.es_lblk - first;
                return false;
        }

        /* 'first' lies inside the extent: report the rest of the extent. */
        skipped = first - es.es_lblk;
        map->m_len = es.es_len - skipped;

        return true;
}

/*
 * ->iomap_begin for ext4_iomap_report_ops: describe the mapping of
 * [@offset, @offset + @length) without allocating anything.
 *
 * Returns -EINVAL when the starting block lies beyond
 * EXT4_MAX_LOGICAL_BLOCK, -ENOENT when an inline-data mapping ends at or
 * before @offset, other negative errors from the lookup helpers, and 0
 * on success.  @flags and @srcmap are not used.
 */
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
                                   loff_t length, unsigned int flags,
                                   struct iomap *iomap, struct iomap *srcmap)
{
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;
        bool delalloc = false;
        int ret;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (ext4_has_inline_data(inode)) {
                ret = ext4_inline_data_iomap(inode, iomap);
                if (ret == 0 && offset >= iomap->length)
                        return -ENOENT;
                /* -EAGAIN means: carry on with the regular lookup below. */
                if (ret != -EAGAIN)
                        return ret;
        }

        /*
         * First logical block and number of blocks; the last block is
         * capped at EXT4_MAX_LOGICAL_BLOCK.
         */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        /*
         * Fiemap callers may ask about offsets beyond s_bitmap_maxbytes.
         * Handle that here for non-extent inodes instead of querying
         * ext4_map_blocks(), which would warn about it and return -EIO.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            offset >= EXT4_SB(inode->i_sb)->s_bitmap_maxbytes) {
                map.m_flags = 0;
        } else {
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return ret;
                if (ret == 0)
                        delalloc = ext4_iomap_is_delalloc(inode, &map);
        }

        ext4_set_iomap(inode, iomap, &map, offset, length);

        /* A hole that starts inside a delayed extent is reported as such. */
        if (delalloc && iomap->type == IOMAP_HOLE)
                iomap->type = IOMAP_DELALLOC;

        return 0;
}

/* Reporting-only iomap callbacks; no ->iomap_end is provided. */
const struct iomap_ops ext4_iomap_report_ops = {
        .iomap_begin    = ext4_iomap_begin_report,
};

/*
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
        int rc;

        /* Leave the "pending dirty" mark on the page itself. */
        SetPageChecked(page);

        /* Dirty the page only; the attached buffers are not touched here. */
        rc = __set_page_dirty_nobuffers(page);

        return rc;
}

/*
 * ->set_page_dirty for the non-journalled address space operations:
 * sanity-check the page state, then defer to __set_page_dirty_buffers().
 */
static int ext4_set_page_dirty(struct page *page)
{
        /* The page is expected to be locked or already dirty ... */
        WARN_ON_ONCE(!(PageLocked(page) || PageDirty(page)));
        /* ... and to have buffers attached. */
        WARN_ON_ONCE(!page_has_buffers(page));

        return __set_page_dirty_buffers(page);
}

/*
 * ->swap_activate: hand the swap file to iomap_swapfile_activate() using
 * the reporting-only iomap callbacks.
 */
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
                                    struct file *file, sector_t *span)
{
        const struct iomap_ops *ops = &ext4_iomap_report_ops;

        return iomap_swapfile_activate(sis, file, span, ops);
}

static const struct address_space_operations ext4_aops = {
        .readpage                = ext4_readpage,
        .readahead                = ext4_readahead,
        .writepage                = ext4_writepage,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_write_end,
        .set_page_dirty                = ext4_set_page_dirty,
        .bmap                        = ext4_bmap,
        .invalidatepage                = ext4_invalidatepage,
        .releasepage                = ext4_releasepage,
        .direct_IO                = noop_direct_IO,
        .migratepage                = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page        = generic_error_remove_page,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations installed by ext4_set_aops() for inodes in
 * journalled data mode.  No ->migratepage is set in this table.
 */
static const struct address_space_operations ext4_journalled_aops = {
        /* read side */
        .readpage               = ext4_readpage,
        .readahead              = ext4_readahead,

        /* write side */
        .writepage              = ext4_writepage,
        .writepages             = ext4_writepages,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_journalled_write_end,
        .set_page_dirty         = ext4_journalled_set_page_dirty,

        /* page lifetime */
        .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,

        /* block lookup, direct I/O and swap */
        .bmap                   = ext4_bmap,
        .direct_IO              = noop_direct_IO,
        .swap_activate          = ext4_iomap_swap_activate,
};

static const struct address_space_operations ext4_da_aops = {
        .readpage                = ext4_readpage,
        .readahead                = ext4_readahead,
        .writepage                = ext4_writepage,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_da_write_begin,
        .write_end                = ext4_da_write_end,
        .set_page_dirty                = ext4_set_page_dirty,
        .bmap                        = ext4_bmap,
        .invalidatepage                = ext4_invalidatepage,
        .releasepage                = ext4_releasepage,
        .direct_IO                = noop_direct_IO,
        .migratepage                = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page        = generic_error_remove_page,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations installed by ext4_set_aops() for DAX inodes
 * (ordered and writeback data modes).
 */
static const struct address_space_operations ext4_dax_aops = {
        .writepages             = ext4_dax_writepages,
        .set_page_dirty         = noop_set_page_dirty,
        .invalidatepage         = noop_invalidatepage,
        .bmap                   = ext4_bmap,
        .direct_IO              = noop_direct_IO,
        .swap_activate          = ext4_iomap_swap_activate,
};

/*
 * Install the address space operations that match the inode's data
 * journalling mode: journalled data always gets ext4_journalled_aops;
 * ordered and writeback modes get the DAX, delalloc or plain table, in
 * that order of preference.  Any other mode is a BUG().
 */
void ext4_set_aops(struct inode *inode)
{
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
                return;
        case EXT4_INODE_ORDERED_DATA_MODE:
        case EXT4_INODE_WRITEBACK_DATA_MODE:
                if (IS_DAX(inode))
                        inode->i_mapping->a_ops = &ext4_dax_aops;
                else if (test_opt(inode->i_sb, DELALLOC))
                        inode->i_mapping->a_ops = &ext4_da_aops;
                else
                        inode->i_mapping->a_ops = &ext4_aops;
                return;
        default:
                BUG();
        }
}

/*
 * Zero @length bytes starting at file offset @from through the page cache.
 *
 * The page covering @from is looked up or created, the buffer covering
 * @from within that page is located, mapped and brought up to date if
 * necessary, and the bytes are then zeroed and the buffer dirtied.
 *
 * Returns 0 on success and also when there is nothing to zero (the buffer
 * is marked freed, or the block turns out to be unmapped); -ENOMEM when no
 * page could be obtained; otherwise the error from reading, decrypting or
 * journalling the buffer.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        ext4_fsblk_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        struct page *page;
        int err = 0;

        /* The allocation mask has __GFP_FS cleared. */
        page = find_or_create_page(mapping, from >> PAGE_SHIFT,
                                   mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (!page)
                return -ENOMEM;

        blocksize = inode->i_sb->s_blocksize;

        /* Logical block number of the first block in this page. */
        iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);

        /* Find the buffer that contains "offset" */
        bh = page_buffers(page);
        pos = blocksize;
        while (offset >= pos) {
                /* Step to the next buffer; iblock tracks its block number. */
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }
        if (buffer_freed(bh)) {
                BUFFER_TRACE(bh, "freed: skip");
                goto unlock;
        }
        if (!buffer_mapped(bh)) {
                BUFFER_TRACE(bh, "unmapped");
                /*
                 * Look the block up without creating it (last argument 0).
                 * The return value is not checked; the buffer_mapped()
                 * test below decides what happens next.
                 */
                ext4_get_block(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh)) {
                        BUFFER_TRACE(bh, "still unmapped");
                        goto unlock;
                }
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (PageUptodate(page))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh)) {
                /* Read the block so the bytes we do not zero are valid. */
                err = ext4_read_bh_lock(bh, 0, true);
                if (err)
                        goto unlock;
                if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                        /* We expect the key to be set. */
                        BUG_ON(!fscrypt_has_encryption_key(inode));
                        err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
                                                               bh_offset(bh));
                        if (err) {
                                /* Decryption failed: contents not usable. */
                                clear_buffer_uptodate(bh);
                                goto unlock;
                        }
                }
        }
        if (ext4_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, bh);
                if (err)
                        goto unlock;
        }
        /* Zero the requested bytes in the page. */
        zero_user(page, offset, length);
        BUFFER_TRACE(bh, "zeroed end of block");

        if (ext4_should_journal_data(inode)) {
                /* Data journalling: the buffer goes through the journal. */
                err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
                err = 0;
                mark_buffer_dirty(bh);
                /* Ordered mode: register the range with the jbd2 inode. */
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_inode_add_write(handle, inode, from,
                                        length);
        }

unlock:
        /* Every exit after a successful page lookup unlocks and drops it. */
        unlock_page(page);
        put_page(page);
        return err;
}

/*
 * ext4_block_zero_page_range() zeroes out a mapping of length 'length'
 * starting from file offset 'from'.  The range to be zeroed must
 * be contained within one block.  If the specified range exceeds
 * the end of the block, it will be shortened to the end of the block
 * that corresponds to 'from'.
 */
static int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        struct inode *inode = mapping->host;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned in_page = from & (PAGE_SIZE-1);
        /* Bytes left from 'from' up to the end of its block. */
        unsigned to_block_end = blocksize - (in_page & (blocksize - 1));

        /*
         * A negative length, or one that runs past the end of the block,
         * is clamped to the remainder of the block.
         */
        if (length < 0 || length > to_block_end)
                length = to_block_end;

        /* Non-DAX inodes are zeroed through the page cache. */
        if (!IS_DAX(inode))
                return __ext4_block_zero_page_range(handle, mapping,
                                                    from, length);

        return iomap_zero_range(inode, from, length, NULL,
                                &ext4_iomap_ops);
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 */
static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
{
        struct inode *inode = mapping->host;
        unsigned blocksize, in_page, to_block_end;

        /*
         * An encrypted inode may be processed without its key during
         * orphan list handling; there is nothing we can zero in that case.
         */
        if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
                return 0;

        blocksize = inode->i_sb->s_blocksize;
        in_page = from & (PAGE_SIZE-1);
        to_block_end = blocksize - (in_page & (blocksize - 1));

        return ext4_block_zero_page_range(handle, mapping, from,
                                          to_block_end);
}

/*
 * Zero the partial blocks at the edges of the byte range
 * [@lstart, @lstart + @length).  Blocks fully covered by the range are
 * not touched here.
 *
 * Returns 0 on success or the first error reported by
 * ext4_block_zero_page_range().
 */
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t length)
{
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        loff_t byte_end = (lstart + length - 1);
        /* Offsets of the first and the last byte within their blocks. */
        unsigned head_off = lstart & (sb->s_blocksize - 1);
        unsigned tail_off = byte_end & (sb->s_blocksize - 1);
        ext4_fsblk_t first_blk = lstart >> sb->s_blocksize_bits;
        ext4_fsblk_t last_blk = byte_end >> sb->s_blocksize_bits;
        int err;

        /* The range sits inside one block and does not cover all of it. */
        if (first_blk == last_blk &&
            (head_off || tail_off != sb->s_blocksize - 1))
                return ext4_block_zero_page_range(handle, mapping,
                                                  lstart, length);

        /*
         * Partial block at the start of the range.  A full block size is
         * passed as the length and the callee shortens it to the end of
         * the block.
         */
        if (head_off) {
                err = ext4_block_zero_page_range(handle, mapping,
                                                 lstart, sb->s_blocksize);
                if (err)
                        return err;
        }

        /* The range ends exactly on a block boundary: no tail to zero. */
        if (tail_off == sb->s_blocksize - 1)
                return 0;

        /* Partial block at the end of the range. */
        return ext4_block_zero_page_range(handle, mapping,
                                          byte_end - tail_off,
                                          tail_off + 1);
}

/*
 * Returns 1 for regular files and directories, and for symlinks that are
 * not fast symlinks; returns 0 for everything else.
 */
int ext4_can_truncate(struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                return 1;

        if (!S_ISLNK(inode->i_mode))
                return 0;

        return !ext4_inode_is_fast_symlink(inode);
}

/*
 * We have to make sure i_disksize gets properly updated before we truncate
 * page cache due to hole punching or zero range. Otherwise i_disksize update
 * can get lost as it may have been postponed to submission of writeback but
 * that will never happen if we remove the folio containing i_size from the
 * page cache. Also if we punch hole within i_size but above i_disksize,
 * following ext4_page_mkwrite() may mistakenly allocate written blocks over
 * the hole and thus introduce allocated blocks beyond i_disksize which is
 * not allowed (e2fsck would complain in case of crash).
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len)
{
        loff_t size = i_size_read(inode);
        loff_t range_end;
        handle_t *handle;
        int ret;

        WARN_ON(!inode_is_locked(inode));

        /* The range starts beyond i_size: nothing to update. */
        if (offset > size)
                return 0;

        /* Only the part of the range below i_size matters. */
        range_end = offset + len;
        if (range_end < size)
                size = range_end;

        /* i_disksize already covers it. */
        if (EXT4_I(inode)->i_disksize >= size)
                return 0;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_update_i_disksize(inode, size);
        ret = ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return ret;
}

/*
 * Wait step used by ext4_break_layouts(): release the write-held
 * i_mmap_sem, call schedule(), then take i_mmap_sem for writing again
 * before returning.
 */
static void ext4_wait_dax_page(struct ext4_inode_info *ei)
{
        up_write(&ei->i_mmap_sem);
        schedule();
        down_write(&ei->i_mmap_sem);
}

/*
 * Wait until dax_layout_busy_page() no longer reports a busy page for
 * the inode's mapping.
 *
 * Returns -EINVAL (after a one-time warning) if i_mmap_sem is not held,
 * 0 once no busy page is found, or the non-zero result of the wait.
 */
int ext4_break_layouts(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct page *page;
        int error;

        /* The caller is expected to hold i_mmap_sem. */
        if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
                return -EINVAL;

        do {
                page = dax_layout_busy_page(inode->i_mapping);
                if (!page)
                        return 0;

                /*
                 * Wait for the page's reference count to reach 1.
                 * ext4_wait_dax_page() is the wait command, so i_mmap_sem
                 * is released around each schedule().  The wait uses
                 * TASK_INTERRUPTIBLE -- presumably a signal ends it with
                 * an error; confirm against ___wait_var_event().
                 */
                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
                                ext4_wait_dax_page(ei));
        } while (error == 0);

        return error;
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file:   File whose inode is having the hole punched
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * Returns: 0 on success or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_block, stop_block;
        struct address_space *mapping = inode->i_mapping;
        loff_t first_block_offset, last_block_offset, max_length;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        handle_t *handle;
        unsigned int credits;
        int ret = 0, ret2 = 0;

        trace_ext4_punch_hole(inode, offset, length, 0);

        /*
         * Write out all dirty pages to avoid race conditions
         * Then release them.
         */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                ret = filemap_write_and_wait_range(mapping, offset,
                                                   offset + length - 1);
                if (ret)
                        return ret;
        }

        inode_lock(inode);

        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
                goto out_mutex;

        /*
         * If the hole extends beyond i_size, set the hole
         * to end after the page that contains i_size
         */
        if (offset + length > inode->i_size) {
                length = inode->i_size +
                   PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
                   offset;
        }

        /*
         * For punch hole the length + offset needs to be within one block
         * before last range. Adjust the length if it goes beyond that limit.
         */
        max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
        if (offset + length > max_length)
                length = max_length - offset;

        if (offset & (sb->s_blocksize - 1) ||
            (offset + length) & (sb->s_blocksize - 1)) {
                /*
                 * Attach jinode to inode for jbd2 if we do any zeroing of
                 * partial block
                 */
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        goto out_mutex;

        }

        /* Wait all existing dio workers, newcomers will block on i_mutex */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        down_write(&EXT4_I(inode)->i_mmap_sem);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_dio;

        first_block_offset = round_up(offset, sb->s_blocksize);
        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

        /* Now release the pages and zero block aligned part of pages*/
        if (last_block_offset > first_block_offset) {
                ret = ext4_update_disksize_before_punch(inode, offset, length);
                if (ret)
                        goto out_dio;
                truncate_pagecache_range(inode, first_block_offset,
                                         last_block_offset);
        }

        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(sb, ret);
                goto out_dio;
        }

        ret = ext4_zero_partial_blocks(handle, inode, offset,
                                       length);
        if (ret)
                goto out_stop;

        first_block = (offset + sb->s_blocksize - 1) >>
                EXT4_BLOCK_SIZE_BITS(sb);
        stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);

        /* If there are blocks to remove, do it */
        if (stop_block > first_block) {

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode, 0);

                ret = ext4_es_remove_extent(inode, first_block,
                                            stop_block - first_block);
                if (ret) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto out_stop;
                }

                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        ret = ext4_ext_remove_space(inode, first_block,
                                                    stop_block - 1);
                else
                        ret = ext4_ind_remove_space(handle, inode, first_block,
                                                    stop_block);

                up_write(&EXT4_I(inode)->i_data_sem);
        }
        ext4_fc_track_range(handle, inode, first_block, stop_block);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        inode->i_mtime = inode->i_ctime = current_time(inode);
        ret2 = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret2))
                ret = ret2;
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
        ext4_journal_stop(handle);
out_dio:
        up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * Make sure the inode has a jbd2_inode attached.  Returns 0 if one is
 * attached on return or the filesystem has no journal, and -ENOMEM if a
 * new one was needed but could not be allocated.
 */
int ext4_inode_attach_jinode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct jbd2_inode *new_jinode;

        /* Already attached, or nothing to attach to. */
        if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        /* The GFP_KERNEL allocation is done before taking i_lock. */
        new_jinode = jbd2_alloc_inode(GFP_KERNEL);

        spin_lock(&inode->i_lock);
        if (ei->jinode) {
                /* Someone else attached one meanwhile; drop our copy. */
                spin_unlock(&inode->i_lock);
                if (unlikely(new_jinode != NULL))
                        jbd2_free_inode(new_jinode);
                return 0;
        }
        if (!new_jinode) {
                spin_unlock(&inode->i_lock);
                return -ENOMEM;
        }

        jbd2_journal_init_jbd_inode(new_jinode, inode);
        /*
         * Publish ->jinode only after it is fully initialized so that
         * readers never observe a partially initialized jbd2_inode.
         */
        smp_wmb();
        WRITE_ONCE(ei->jinode, new_jinode);
        spin_unlock(&inode->i_lock);

        return 0;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 *
 * Returns 0 on success (including when ext4_can_truncate() refuses and
 * nothing is done) or a negative error code.
 */
int ext4_truncate(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int credits;
        int err = 0, err2;
        handle_t *handle;
        struct address_space *mapping = inode->i_mapping;

        /*
         * There is a possibility that we're either freeing the inode
         * or it's a completely new inode. In those cases we might not
         * have i_mutex locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
                WARN_ON(!inode_is_locked(inode));
        trace_ext4_truncate_enter(inode);

        if (!ext4_can_truncate(inode))
                goto out_trace;

        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

        if (ext4_has_inline_data(inode)) {
                int has_inline = 1;

                /*
                 * Done if the inline truncate failed or reported (via
                 * has_inline) that the inode still has inline data.
                 */
                err = ext4_inline_data_truncate(inode, &has_inline);
                if (err || has_inline)
                        goto out_trace;
        }

        /* If we zero-out tail of the page, we have to create jinode for jbd2 */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
                err = ext4_inode_attach_jinode(inode);
                if (err)
                        goto out_trace;
        }

        /* Credit estimate depends on the block mapping format in use. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);

        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                goto out_trace;
        }

        /*
         * i_size is not block aligned: handle the tail of the last block.
         * NOTE(review): the return value of ext4_block_truncate_page() is
         * not checked here.
         */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1))
                ext4_block_truncate_page(handle, mapping, inode->i_size);

        /*
         * We add the inode to the orphan list, so that if this
         * truncate spans multiple transactions, and we crash, we will
         * resume the truncate when the filesystem recovers.  It also
         * marks the inode dirty, to catch the new size.
         *
         * Implication: the file must always be in a sane, consistent
         * truncatable state while each transaction commits.
         */
        err = ext4_orphan_add(handle, inode);
        if (err)
                goto out_stop;

        /* EXT4_I(inode) is the same object as ei, released below via ei. */
        down_write(&EXT4_I(inode)->i_data_sem);

        ext4_discard_preallocations(inode, 0);

        /*
         * Only the extent path reports an error; on the indirect path err
         * keeps the 0 left by ext4_orphan_add() above.
         */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
        else
                ext4_ind_truncate(handle, inode);

        up_write(&ei->i_data_sem);
        if (err)
                goto out_stop;

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

out_stop:
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_evict_inode(), and we allow that function to clean up the
         * orphan info for us.
         *
         * This label is reached on both the success and the error paths
         * once the handle is started, so the timestamp update and inode
         * dirtying below run in either case.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        inode->i_mtime = inode->i_ctime = current_time(inode);
        err2 = ext4_mark_inode_dirty(handle, inode);
        /* Report the dirtying failure only if nothing failed earlier. */
        if (unlikely(err2 && !err))
                err = err2;
        ext4_journal_stop(handle);

out_trace:
        trace_ext4_truncate_exit(inode);
        return err;
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If 'in_mem' is true, we have all
 * data in memory that is needed to recreate the on-disk version of this
 * inode.
 *
 * @sb:        superblock of the filesystem
 * @ino:       inode number to locate
 * @iloc:      filled in with block_group, offset and, on success, bh
 *             (bh is NULL on every failure path)
 * @in_mem:    non-zero allows the disk read to be skipped when the cached
 *             inode bitmap shows no other inode in use in the same inode
 *             table block; the buffer is then zero-filled instead
 * @ret_block: if non-NULL, receives the block number when reading the
 *             inode table block fails (or failure is simulated)
 *
 * Returns 0 on success, -EFSCORRUPTED for an out-of-range inode number or
 * inode table block, -EIO if the group descriptor cannot be obtained or
 * the block cannot be read, and -ENOMEM if sb_getblk() fails.
 */
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
                                struct ext4_iloc *iloc, int in_mem,
                                ext4_fsblk_t *ret_block)
{
        struct ext4_group_desc        *gdp;
        struct buffer_head        *bh;
        ext4_fsblk_t                block;
        struct blk_plug                plug;
        int                        inodes_per_block, inode_offset;

        iloc->bh = NULL;
        if (ino < EXT4_ROOT_INO ||
            ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
                return -EFSCORRUPTED;

        /* Inode numbers are 1-based, hence the "ino - 1". */
        iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
        if (!gdp)
                return -EIO;

        /*
         * Figure out the offset within the block group inode table
         */
        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        /* Byte offset of this inode inside its inode table block. */
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

        /* Reject an inode table start outside (first_data_block, blocks_count). */
        block = ext4_inode_table(sb, gdp);
        if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
            (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
                ext4_error(sb, "Invalid inode table block %llu in "
                           "block_group %u", block, iloc->block_group);
                return -EFSCORRUPTED;
        }
        /* Advance from the table start to the block holding this inode. */
        block += (inode_offset / inodes_per_block);

        bh = sb_getblk(sb, block);
        if (unlikely(!bh))
                return -ENOMEM;
        /* Jumps into the failure branch below; bh is not locked here. */
        if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
                goto simulate_eio;
        if (!buffer_uptodate(bh)) {
                lock_buffer(bh);

                if (ext4_buffer_uptodate(bh)) {
                        /* someone brought it uptodate while we waited */
                        unlock_buffer(bh);
                        goto has_buffer;
                }

                /*
                 * If we have all information of the inode in memory and this
                 * is the only valid inode in the block, we need not read the
                 * block.
                 */
                if (in_mem) {
                        struct buffer_head *bitmap_bh;
                        int i, start;

                        /*
                         * Index of the first inode sharing this table block.
                         * NOTE(review): the mask form presumably relies on
                         * inodes_per_block being a power of two - confirm.
                         */
                        start = inode_offset & ~(inodes_per_block - 1);

                        /* Is the inode bitmap in cache? */
                        bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
                        if (unlikely(!bitmap_bh))
                                goto make_io;

                        /*
                         * If the inode bitmap isn't in cache then the
                         * optimisation may end up performing two reads instead
                         * of one, so skip it.
                         */
                        if (!buffer_uptodate(bitmap_bh)) {
                                brelse(bitmap_bh);
                                goto make_io;
                        }
                        /* Stop early at the first other inode marked in use. */
                        for (i = start; i < start + inodes_per_block; i++) {
                                if (i == inode_offset)
                                        continue;
                                if (ext4_test_bit(i, bitmap_bh->b_data))
                                        break;
                        }
                        brelse(bitmap_bh);
                        if (i == start + inodes_per_block) {
                                /* all other inodes are free, so skip I/O */
                                memset(bh->b_data, 0, bh->b_size);
                                set_buffer_uptodate(bh);
                                unlock_buffer(bh);
                                goto has_buffer;
                        }
                }

make_io:
                /*
                 * If we need to do any I/O, try to pre-readahead extra
                 * blocks from the inode table.
                 */
                blk_start_plug(&plug);
                if (EXT4_SB(sb)->s_inode_readahead_blks) {
                        ext4_fsblk_t b, end, table;
                        unsigned num;
                        __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;

                        table = ext4_inode_table(sb, gdp);
                        /* s_inode_readahead_blks is always a power of 2 */
                        b = block & ~((ext4_fsblk_t) ra_blks - 1);
                        /* Do not start before the group's inode table. */
                        if (table > b)
                                b = table;
                        end = b + ra_blks;
                        /*
                         * Clamp the end using the number of inodes in the
                         * group, minus the unused count when group
                         * descriptor checksums are enabled.
                         */
                        num = EXT4_INODES_PER_GROUP(sb);
                        if (ext4_has_group_desc_csum(sb))
                                num -= ext4_itable_unused_count(sb, gdp);
                        table += num / inodes_per_block;
                        if (end > table)
                                end = table;
                        /* Readahead covers b..end inclusive. */
                        while (b <= end)
                                ext4_sb_breadahead_unmovable(sb, b++);
                }

                /*
                 * There are other valid inodes in the buffer, this inode
                 * has in-inode xattrs, or we don't have this inode in memory.
                 * Read the block from disk.
                 */
                trace_ext4_load_inode(sb, ino);
                ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
                blk_finish_plug(&plug);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        /* Also the target of the simulated-failure jump. */
                simulate_eio:
                        if (ret_block)
                                *ret_block = block;
                        brelse(bh);
                        return -EIO;
                }
        }
has_buffer:
        /* Hand the buffer reference obtained from sb_getblk() to the caller. */
        iloc->bh = bh;
        return 0;
}

/*
 * Locate the inode's on-disk block with in_mem == 0, and report an inode
 * error naming the failing block when the lookup returns -EIO.
 */
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
                                        struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_block = 0;
        int err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
                                       &bad_block);

        if (err != -EIO)
                return err;

        ext4_error_inode_block(inode, bad_block, EIO,
                               "unable to read itable block");
        return err;
}

/*
 * Locate the inode's on-disk block, allowing the read to be skipped
 * (in_mem) unless the inode has EXT4_STATE_XATTR set.  An -EIO result is
 * additionally reported as an inode error naming the failing block.
 */
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_block = 0;
        int in_mem, err;

        /* We have all inode data except xattrs in memory here. */
        in_mem = !ext4_test_inode_state(inode, EXT4_STATE_XATTR);

        err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, in_mem,
                                   &bad_block);
        if (err == -EIO)
                ext4_error_inode_block(inode, bad_block, EIO,
                                        "unable to read itable block");

        return err;
}


/*
 * Locate an inode by superblock and inode number, with in_mem == 0 and
 * without collecting the failing block number.
 */
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc)
{
        const int in_mem = 0;

        return __ext4_get_inode_loc(sb, ino, iloc, in_mem, NULL);
}

/*
 * Decide whether S_DAX should be set for this inode.  Any one of the veto
 * conditions below rules DAX out; otherwise it is enabled by the DAX_ALWAYS
 * mount option or by the inode's own EXT4_INODE_DAX flag.
 */
static bool ext4_should_enable_dax(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (test_opt2(inode->i_sb, DAX_NEVER) ||
            !S_ISREG(inode->i_mode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT) ||
            ext4_test_inode_flag(inode, EXT4_INODE_VERITY) ||
            !test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
                return false;

        return test_opt(inode->i_sb, DAX_ALWAYS) ||
               ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}

/*
 * Translate the ext4 on-disk style flags in EXT4_I(inode)->i_flags into the
 * matching VFS S_* bits and install them with inode_set_flags().  S_DAX is
 * only newly set when @init is true; an already-set S_DAX is kept.
 */
void ext4_set_inode_flags(struct inode *inode, bool init)
{
        unsigned int ext4_fl = EXT4_I(inode)->i_flags;
        unsigned int vfs_fl;

        WARN_ON_ONCE(IS_DAX(inode) && init);

        /*
         * Because of the way inode_set_flags() works we must preserve S_DAX
         * here if already set.
         */
        vfs_fl = inode->i_flags & S_DAX;
        if (init && ext4_should_enable_dax(inode))
                vfs_fl |= S_DAX;

        vfs_fl |= (ext4_fl & EXT4_SYNC_FL) ? S_SYNC : 0;
        vfs_fl |= (ext4_fl & EXT4_APPEND_FL) ? S_APPEND : 0;
        vfs_fl |= (ext4_fl & EXT4_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
        vfs_fl |= (ext4_fl & EXT4_NOATIME_FL) ? S_NOATIME : 0;
        vfs_fl |= (ext4_fl & EXT4_DIRSYNC_FL) ? S_DIRSYNC : 0;
        vfs_fl |= (ext4_fl & EXT4_ENCRYPT_FL) ? S_ENCRYPTED : 0;
        vfs_fl |= (ext4_fl & EXT4_CASEFOLD_FL) ? S_CASEFOLD : 0;
        vfs_fl |= (ext4_fl & EXT4_VERITY_FL) ? S_VERITY : 0;

        /* The mask lists every S_* bit this function manages. */
        inode_set_flags(inode, vfs_fl,
                        S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
                        S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}

/*
 * Decode the block count stored in the raw inode.  Without the huge_file
 * feature only i_blocks_lo is used.  With it, the 48-bit hi/lo value is
 * returned, shifted by (i_blkbits - 9) when the inode carries
 * EXT4_INODE_HUGE_FILE.
 */
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                                  struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        blkcnt_t nblocks;

        if (!ext4_has_feature_huge_file(inode->i_sb))
                return le32_to_cpu(raw_inode->i_blocks_lo);

        /* we are using combined 48 bit field */
        nblocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                le32_to_cpu(raw_inode->i_blocks_lo);

        /* i_blocks represent file system block size */
        if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
                nblocks <<= inode->i_blkbits - 9;

        return nblocks;
}

/*
 * Look for an in-inode xattr area following the extra inode fields.  When
 * the xattr magic is present the area is validated, EXT4_STATE_XATTR is set
 * and inline data is looked up; otherwise i_inline_off is cleared.
 * Returns 0 or the error from the validation / inline data lookup.
 */
static inline int ext4_iget_extra_inode(struct inode *inode,
                                         struct ext4_inode *raw_inode,
                                         struct ext4_inode_info *ei)
{
        __le32 *magic = (void *)raw_inode +
                        EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        int err;

        /* No room for xattrs, or no xattr header present. */
        if (!EXT4_INODE_HAS_XATTR_SPACE(inode) ||
            *magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                EXT4_I(inode)->i_inline_off = 0;
                return 0;
        }

        err = xattr_check_inode(inode, IHDR(inode, raw_inode),
                                ITAIL(inode, raw_inode));
        if (err)
                return err;

        ext4_set_inode_state(inode, EXT4_STATE_XATTR);

        err = ext4_find_inline_data_nolock(inode);
        if (err)
                return err;

        if (ext4_has_inline_data(inode))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        return 0;
}

/*
 * Store the inode's project id in *projid and return 0, or return
 * -EOPNOTSUPP (leaving *projid untouched) when the filesystem lacks the
 * project feature.
 */
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
        if (ext4_has_feature_project(inode->i_sb)) {
                *projid = EXT4_I(inode)->i_projid;
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of
 * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag
 * set.
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
        /* EA inodes keep a raw value in i_version. */
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                inode_set_iversion_raw(inode, val);
                return;
        }

        inode_set_iversion_queried(inode, val);
}
/* Read i_version, using the raw value for inodes with EXT4_EA_INODE_FL. */
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
                return inode_peek_iversion_raw(inode);

        return inode_peek_iversion(inode);
}

/*
 * Sanity-check an inode that was found already set up (not I_NEW) against
 * the iget flags.  Returns 0 if it is acceptable, -ESTALE for a
 * handle-based lookup that hit an EA inode, and otherwise reports an inode
 * error and returns -EFSCORRUPTED.
 */
static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
                            const char *function, unsigned int line)
{
        const char *err_str = NULL;

        if (flags & EXT4_IGET_EA_INODE) {
                if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
                        err_str = "missing EA_INODE flag";
                else if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
                         EXT4_I(inode)->i_file_acl)
                        err_str = "ea_inode with extended attributes";
        } else if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                /*
                 * open_by_handle_at() could provide an old inode number
                 * that has since been reused for an ea_inode; this does
                 * not indicate filesystem corruption
                 */
                if (flags & EXT4_IGET_HANDLE)
                        return -ESTALE;
                err_str = "unexpected EA_INODE flag";
        }

        /* Only look at the bad-inode state if nothing failed so far. */
        if (!err_str && is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
                err_str = "unexpected bad inode w/o EXT4_IGET_BAD";

        if (!err_str)
                return 0;

        ext4_error_inode(inode, function, line, 0, err_str);
        return -EFSCORRUPTED;
}

struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                          ext4_iget_flags flags, const char *function,
                          unsigned int line)
{
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
        struct inode *inode;
        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        loff_t size;
        int block;
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;

        if ((!(flags & EXT4_IGET_SPECIAL) &&
             (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
            (ino < EXT4_ROOT_INO) ||
            (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
                __ext4_error(sb, function, line, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
        }

        inode = iget_locked(sb, ino);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        if (!(inode->i_state & I_NEW)) {
                ret = check_igot_inode(inode, flags, function, line);
                if (ret) {
                        iput(inode);
                        return ERR_PTR(ret);
                }
                return inode;
        }

        ei = EXT4_I(inode);
        iloc.bh = NULL;

        ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
        if (ret < 0)
                goto bad_inode;
        raw_inode = ext4_raw_inode(&iloc);

        if ((flags & EXT4_IGET_HANDLE) &&
            (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
                ret = -ESTALE;
                goto bad_inode;
        }

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                        EXT4_INODE_SIZE(inode->i_sb) ||
                    (ei->i_extra_isize & 3)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: bad extra_isize %u "
                                         "(inode size %u)",
                                         ei->i_extra_isize,
                                         EXT4_INODE_SIZE(inode->i_sb));
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
                ei->i_extra_isize = 0;

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_metadata_csum(sb)) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = raw_inode->i_generation;
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
            ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
             (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
                ext4_error_inode_err(inode, function, line, 0,
                                EFSBADCRC, "iget: checksum invalid");
                ret = -EFSBADCRC;
                goto bad_inode;
        }

        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
        if (ext4_has_feature_project(sb) &&
            EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
        else
                i_projid = EXT4_DEF_PROJID;

        if (!(test_opt(inode->i_sb, NO_UID32))) {
                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
        ei->i_projid = make_kprojid(&init_user_ns, i_projid);
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));

        ext4_clear_state_flags(ei);        /* Only relevant on 32-bit archs */
        ei->i_inline_off = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
         * This is needed because nfsd might try to access dead inodes
         * the test is that same one that e2fsck uses
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
                if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
                    ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted or unallocated */
                        if (flags & EXT4_IGET_SPECIAL) {
                                ext4_error_inode(inode, function, line, 0,
                                                 "iget: special inode unallocated");
                                ret = -EFSCORRUPTED;
                        } else
                                ret = -ESTALE;
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those.
                 * OR it is the EXT4_BOOT_LOADER_INO which is
                 * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        ext4_set_inode_flags(inode, true);
        /* Detect invalid flag combination - can't have both inline data and extents */
        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_error_inode(inode, function, line, 0,
                        "inode has both inline data and extents flags");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(sb, raw_inode);
        size = i_size_read(inode);
        if (size < 0 || size > ext4_get_maxbytes(inode)) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad i_size value: %lld", size);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /*
         * If dir_index is not enabled but there's dir with INDEX flag set,
         * we'd normally treat htree data as empty space. But with metadata
         * checksumming that corrupts checksums so forbid that.
         */
        if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
            ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
                ext4_error_inode(inode, function, line, 0,
                         "iget: Dir with htree data on filesystem without dir_index feature.");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
         */
        for (block = 0; block < EXT4_N_BLOCKS; block++)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
        ext4_fc_init_inode(&ei->vfs_inode);

        /*
         * Set transaction id's of transactions that have to be committed
         * to finish f[data]sync. We set them to currently running transaction
         * as we cannot be sure that the inode or some of its metadata isn't
         * part of the transaction - the inode could have been reclaimed and
         * now it is reread from disk.
         */
        if (journal) {
                transaction_t *transaction;
                tid_t tid;

                read_lock(&journal->j_state_lock);
                if (journal->j_running_transaction)
                        transaction = journal->j_running_transaction;
                else
                        transaction = journal->j_committing_transaction;
                if (transaction)
                        tid = transaction->t_tid;
                else
                        tid = journal->j_commit_sequence;
                read_unlock(&journal->j_state_lock);
                ei->i_sync_tid = tid;
                ei->i_datasync_tid = tid;
        }

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                if (ei->i_extra_isize == 0) {
                        /* The extra space is currently unused. Use it. */
                        BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
                                            EXT4_GOOD_OLD_INODE_SIZE;
                } else {
                        ret = ext4_iget_extra_inode(inode, raw_inode, ei);
                        if (ret)
                                goto bad_inode;
                }
        }

        EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
        EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
        EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = le32_to_cpu(raw_inode->i_disk_version);

                if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                ivers |=
                    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
                }
                ext4_inode_set_iversion_queried(inode, ivers);
        }

        ret = 0;
        if (ei->i_file_acl &&
            !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad extended attribute block %llu",
                                 ei->i_file_acl);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                /* validate the block references in the inode */
                if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
                        (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                        (S_ISLNK(inode->i_mode) &&
                        !ext4_inode_is_fast_symlink(inode)))) {
                        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                                ret = ext4_ext_check_inode(inode);
                        else
                                ret = ext4_ind_check_inode(inode);
                }
        }
        if (ret)
                goto bad_inode;

        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext4_dir_inode_operations;
                inode->i_fop = &ext4_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                /* VFS does not allow setting these so must be corruption */
                if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: immutable or append flags "
                                         "not allowed on symlinks");
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
                if (IS_ENCRYPTED(inode)) {
                        inode->i_op = &ext4_encrypted_symlink_inode_operations;
                        ext4_set_aops(inode);
                } else if (ext4_inode_is_fast_symlink(inode)) {
                        inode->i_link = (char *)ei->i_data;
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        nd_terminate_link(ei->i_data, inode->i_size,
                                sizeof(ei->i_data) - 1);
                } else {
                        inode->i_op = &ext4_symlink_inode_operations;
                        ext4_set_aops(inode);
                }
                inode_nohighmem(inode);
        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                make_bad_inode(inode);
        } else {
                ret = -EFSCORRUPTED;
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
        if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
                ext4_error_inode(inode, function, line, 0,
                                 "casefold flag without casefold feature");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ret = check_igot_inode(inode, flags, function, line);
        /*
         * -ESTALE here means there is nothing inherently wrong with the inode,
         * it's just not an inode we can return for an fhandle lookup.
         */
        if (ret == -ESTALE) {
                brelse(iloc.bh);
                unlock_new_inode(inode);
                iput(inode);
                return ERR_PTR(-ESTALE);
        }
        if (ret)
                goto bad_inode;
        brelse(iloc.bh);

        unlock_new_inode(inode);
        return inode;

bad_inode:
        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
}

/*
 * Encode the in-core block count (inode->i_blocks) into the on-disk
 * i_blocks_lo / i_blocks_high fields of @raw_inode.
 *
 * A count that fits in 48 bits is stored unchanged and
 * EXT4_INODE_HUGE_FILE is cleared.  A larger count is shifted right by
 * (i_blkbits - 9) before being stored and EXT4_INODE_HUGE_FILE is set.
 * A count that does not fit in 32 bits is rejected with -EFBIG when the
 * huge_file feature is not enabled on the superblock.
 *
 * @handle is not used by this function.
 *
 * Returns 0 on success, -EFBIG as described above.
 */
static int ext4_inode_blocks_set(handle_t *handle,
                                struct ext4_inode *raw_inode,
                                struct ext4_inode_info *ei)
{
        struct inode *inode = &ei->vfs_inode;
        struct super_block *sb = inode->i_sb;
        u64 nr_units = READ_ONCE(inode->i_blocks);
        bool fits_in_32 = (nr_units <= ~0U);

        /* Anything wider than 32 bits needs the huge_file feature. */
        if (!fits_in_32 && !ext4_has_feature_huge_file(sb))
                return -EFBIG;

        if (fits_in_32 || nr_units <= 0xffffffffffffULL) {
                /* Representable in at most 48 bits: store the count as is. */
                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        } else {
                /* Too wide: switch to file system block size units. */
                ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                nr_units >>= (inode->i_blkbits - 9);
        }

        /* For a 32-bit count the high half written here is simply zero. */
        raw_inode->i_blocks_lo   = cpu_to_le32(nr_units);
        raw_inode->i_blocks_high = cpu_to_le16(nr_units >> 32);
        return 0;
}

/*
 * If inode number @ino is found by find_inode_by_ino_rcu() and its only
 * relevant dirtiness is I_DIRTY_TIME, clear that bit and copy its
 * c/m/atime into @raw_inode, then recompute the raw inode checksum.
 *
 * Inodes that are being freed, are new, or carry I_DIRTY_INODE bits are
 * left alone.  @orig_ino is only forwarded to the tracepoint.
 */
static void __ext4_update_other_inode_time(struct super_block *sb,
                                           unsigned long orig_ino,
                                           unsigned long ino,
                                           struct ext4_inode *raw_inode)
{
        const unsigned long skip_mask = I_FREEING | I_WILL_FREE | I_NEW |
                                        I_DIRTY_INODE;
        struct ext4_inode_info *other_ei;
        struct inode *other;

        other = find_inode_by_ino_rcu(sb, ino);
        if (!other)
                return;

        /* Cheap unlocked peek first; it is repeated under i_lock below. */
        if (other->i_state & skip_mask)
                return;
        if (!(other->i_state & I_DIRTY_TIME))
                return;

        spin_lock(&other->i_lock);
        if ((other->i_state & skip_mask) ||
            !(other->i_state & I_DIRTY_TIME)) {
                /* State changed between the peek and taking the lock. */
                spin_unlock(&other->i_lock);
                return;
        }
        other_ei = EXT4_I(other);
        other->i_state &= ~I_DIRTY_TIME;
        spin_unlock(&other->i_lock);

        /* The raw inode is written under i_raw_lock, not i_lock. */
        spin_lock(&other_ei->i_raw_lock);
        EXT4_INODE_SET_XTIME(i_ctime, other, raw_inode);
        EXT4_INODE_SET_XTIME(i_mtime, other, raw_inode);
        EXT4_INODE_SET_XTIME(i_atime, other, raw_inode);
        ext4_inode_csum_set(other, raw_inode, other_ei);
        spin_unlock(&other_ei->i_raw_lock);

        trace_ext4_other_inode_update_time(other, orig_ino);
}

/*
 * Opportunistically update the other time fields for other inodes in
 * the same inode table block.
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
                                          unsigned long orig_ino, char *buf)
{
        int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        int inode_size = EXT4_INODE_SIZE(sb);
        unsigned long first_ino;
        int slot;

        /*
         * Inode numbers are one-based, so the first inode of the inode
         * table block holding orig_ino is found by rounding (orig_ino - 1)
         * down to a multiple of inodes_per_block and adding one back
         * (e.g. n*16 + 1 with 4k blocks and 256 byte inodes).
         */
        first_ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

        /* The per-inode helper looks inodes up with an _rcu lookup. */
        rcu_read_lock();
        for (slot = 0; slot < inodes_per_block; slot++) {
                unsigned long ino = first_ino + slot;
                char *raw = buf + slot * inode_size;

                /* The caller has already written orig_ino itself. */
                if (ino == orig_ino)
                        continue;
                __ext4_update_other_inode_time(sb, orig_ino, ino,
                                               (struct ext4_inode *)raw);
        }
        rcu_read_unlock();
}

/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 *
 * The raw inode is filled in while holding ei->i_raw_lock; the lock is
 * dropped before the buffer is handed to ext4_handle_dirty_metadata().
 * If i_disksize exceeds 0x7fffffff and the large_file feature is not yet
 * set (or the superblock revision is EXT4_GOOD_OLD_REV), the feature is
 * enabled and the superblock is dirtied as well.
 *
 * Returns 0 on success, otherwise the error of the step that failed.  On
 * every path iloc->bh is released with brelse() and the result is passed
 * to ext4_std_error().
 */
static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
                                struct ext4_iloc *iloc)
{
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
        struct super_block *sb = inode->i_sb;
        int err = 0, block;
        int need_datasync = 0, set_large_file = 0;
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;

        spin_lock(&ei->i_raw_lock);

        /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

        /* Encode i_blocks first: this is the only step that can fail
         * while i_raw_lock is held. */
        err = ext4_inode_blocks_set(handle, raw_inode, ei);
        if (err) {
                spin_unlock(&ei->i_raw_lock);
                goto out_brelse;
        }

        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        i_uid = i_uid_read(inode);
        i_gid = i_gid_read(inode);
        i_projid = from_kprojid(&init_user_ns, ei->i_projid);
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                /* 32-bit ids: split into low and high 16-bit halves. */
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
                /*
                 * Fix up interoperability with old kernels. Otherwise, old
                 * inodes get re-used with the upper 16 bits of the uid/gid
                 * intact
                 */
                if (ei->i_dtime && list_empty(&ei->i_orphan)) {
                        raw_inode->i_uid_high = 0;
                        raw_inode->i_gid_high = 0;
                } else {
                        raw_inode->i_uid_high =
                                cpu_to_le16(high_16_bits(i_uid));
                        raw_inode->i_gid_high =
                                cpu_to_le16(high_16_bits(i_gid));
                }
        } else {
                /* NO_UID32: only the low fields are used, high is zeroed. */
                raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
                raw_inode->i_uid_high = 0;
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

        EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        /* i_file_acl_high is only written when HURD_COMPAT is not set. */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
        /* A change of the on-disk size is reported to
         * ext4_update_inode_fsync_trans() below as need_datasync. */
        if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) {
                ext4_isize_set(raw_inode, ei->i_disksize);
                need_datasync = 1;
        }
        /* Remember that the large_file feature must be turned on; the
         * superblock itself is updated after i_raw_lock is dropped. */
        if (ei->i_disksize > 0x7fffffffULL) {
                if (!ext4_has_feature_large_file(sb) ||
                                EXT4_SB(sb)->s_es->s_rev_level ==
                    cpu_to_le32(EXT4_GOOD_OLD_REV))
                        set_large_file = 1;
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /* Device nodes keep the device number in i_block[]:
                 * old encoding in slot 0, new encoding in slot 1. */
                if (old_valid_dev(inode->i_rdev)) {
                        raw_inode->i_block[0] =
                                cpu_to_le32(old_encode_dev(inode->i_rdev));
                        raw_inode->i_block[1] = 0;
                } else {
                        raw_inode->i_block[0] = 0;
                        raw_inode->i_block[1] =
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
        } else if (!ext4_has_inline_data(inode)) {
                /* Copied verbatim, without any byte swapping. */
                for (block = 0; block < EXT4_N_BLOCKS; block++)
                        raw_inode->i_block[block] = ei->i_data[block];
        }

        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = ext4_inode_peek_iversion(inode);

                /* Low 32 bits always; high 32 bits only if the field fits
                 * in the inode's extra space. */
                raw_inode->i_disk_version = cpu_to_le32(ivers);
                if (ei->i_extra_isize) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                raw_inode->i_version_hi =
                                        cpu_to_le32(ivers >> 32);
                        raw_inode->i_extra_isize =
                                cpu_to_le16(ei->i_extra_isize);
                }
        }

        /* A non-default project id without the project feature is
         * treated as a kernel bug. */
        BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
               i_projid != EXT4_DEF_PROJID);

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                raw_inode->i_projid = cpu_to_le32(i_projid);

        /* Checksum last, once every field above has been written. */
        ext4_inode_csum_set(inode, raw_inode, ei);
        spin_unlock(&ei->i_raw_lock);
        /* With lazytime, also flush timestamps of the other inodes that
         * share this inode table block (bh->b_data). */
        if (inode->i_sb->s_flags & SB_LAZYTIME)
                ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
                                              bh->b_data);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (err)
                goto out_brelse;
        /* The inode has now been written once; it is no longer "new". */
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        if (set_large_file) {
                BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
                err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
                if (err)
                        goto out_brelse;
                ext4_set_feature_large_file(sb);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_super(handle, sb);
        }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
        /* Drops the caller's buffer_head reference on all paths. */
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
        return err;
}

/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within flush work (sys_sync(), kupdate and such).
 *   We wait on commit, if told to.
 *
 * - Within iput_final() -> write_inode_now()
 *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
 * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *        mark_inode_dirty(inode)
 *        stuff();
 *        inode->i_size = expr;
 *
 * is in error because write_inode() could occur while `stuff()' is running,
 * and the new i_size will be lost.  Plus the inode will no longer be on the
 * superblock's dirty inode list.
 */
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err;

        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
            sb_rdonly(inode->i_sb))
                return 0;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        if (EXT4_SB(inode->i_sb)->s_journal) {
                if (ext4_journal_current_handle()) {
                        jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
                        dump_stack();
                        return -EIO;
                }

                /*
                 * No need to force transaction in WB_SYNC_NONE mode. Also
                 * ext4_sync_fs() will force the commit after everything is
                 * written.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                        return 0;

                err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
                                                EXT4_I(inode)->i_sync_tid);
        } else {
                struct ext4_iloc iloc;

                err = __ext4_get_inode_loc_noinmem(inode, &iloc);
                if (err)
                        return err;
                /*
                 * sync(2) will flush the whole buffer cache. No need to do
                 * it here separately for each inode.
                 */
                if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
                        ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
                                               "IO error syncing inode");
                        err = -EIO;
                }
                brelse(iloc.bh);
        }
        return err;
}

/*
 * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
 * buffers that are attached to a page straddling i_size and are undergoing
 * commit. In that case we have to wait for commit to finish and try again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        unsigned tail_off = inode->i_size & (PAGE_SIZE - 1);

        /*
         * Nothing to wait for when the page is fully truncated (and we
         * even should not wait, as __ext4_journalled_invalidatepage() may
         * strip all buffers from the page but keep the page dirty, which
         * can then confuse e.g. a concurrent ext4_writepage() seeing a
         * dirty page without buffers).
         */
        if (!tail_off)
                return;
        /*
         * Likewise when all buffers in the page remain valid.  This is
         * most beneficial for the common case of blocksize == PAGESIZE.
         */
        if (tail_off > (PAGE_SIZE - i_blocksize(inode)))
                return;

        for (;;) {
                struct page *page;
                tid_t commit_tid = 0;
                bool committing = false;
                int ret;

                page = find_lock_page(inode->i_mapping,
                                      inode->i_size >> PAGE_SHIFT);
                if (!page)
                        return;
                ret = __ext4_journalled_invalidatepage(page, tail_off,
                                                PAGE_SIZE - tail_off);
                unlock_page(page);
                put_page(page);

                /* Anything other than -EBUSY ends the retry loop. */
                if (ret != -EBUSY)
                        return;

                /*
                 * Sample the committing transaction's tid under the state
                 * lock.  A separate flag records whether one was found
                 * instead of relying on the tid value itself.
                 */
                read_lock(&journal->j_state_lock);
                if (journal->j_committing_transaction) {
                        commit_tid = journal->j_committing_transaction->t_tid;
                        committing = true;
                }
                read_unlock(&journal->j_state_lock);

                if (committing)
                        jbd2_log_wait_commit(journal, commit_tid);
        }
}

/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to assure is that if we are in ordered mode
 * and inode is still attached to the committing transaction, we must
 * start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_mutex down.
 *
 * Returns 0 on success or a negative errno.  Two result variables are
 * used: "error" holds the primary failure and "rc" a secondary one; at
 * err_out "rc" is returned only when "error" is still 0.
 */
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int error, rc = 0;
        int orphan = 0;         /* set once ext4_orphan_add() was attempted */
        const unsigned int ia_valid = attr->ia_valid;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        /* Append-only inodes refuse mode, owner and explicit time changes. */
        if (unlikely(IS_APPEND(inode) &&
                     (ia_valid & (ATTR_MODE | ATTR_UID |
                                  ATTR_GID | ATTR_TIMES_SET))))
                return -EPERM;

        error = setattr_prepare(dentry, attr);
        if (error)
                return error;

        error = fscrypt_prepare_setattr(dentry, attr);
        if (error)
                return error;

        error = fsverity_prepare_setattr(dentry, attr);
        if (error)
                return error;

        if (is_quota_modification(inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        /* Ownership change: transfer quota and update the inode within a
         * single journal handle. */
        if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
            (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
                handle_t *handle;

                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
                handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
                        (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }

                /* dquot_transfer() calls back ext4_get_inode_usage() which
                 * counts xattr inode references.
                 */
                down_read(&EXT4_I(inode)->xattr_sem);
                error = dquot_transfer(inode, attr);
                up_read(&EXT4_I(inode)->xattr_sem);

                if (error) {
                        ext4_journal_stop(handle);
                        return error;
                }
                /* Update corresponding info in inode so that everything is in
                 * one transaction */
                if (attr->ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (attr->ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                error = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
                if (unlikely(error)) {
                        return error;
                }
        }

        if (attr->ia_valid & ATTR_SIZE) {
                handle_t *handle;
                loff_t oldsize = inode->i_size;
                loff_t old_disksize;    /* restored if the update fails */
                int shrink = (attr->ia_size < inode->i_size);

                /* Files without the extents flag are limited to
                 * s_bitmap_maxbytes. */
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
                                return -EFBIG;
                        }
                }
                /* Only regular files may have their size changed here. */
                if (!S_ISREG(inode->i_mode)) {
                        return -EINVAL;
                }

                if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
                        inode_inc_iversion(inode);

                /*
                 * If file has inline data but new size exceeds inline capacity,
                 * convert to extent-based storage first to prevent inconsistent
                 * state (inline flag set but size exceeds inline capacity).
                 */
                if (ext4_has_inline_data(inode) &&
                    attr->ia_size > EXT4_I(inode)->i_inline_size) {
                        error = ext4_convert_inline_data(inode);
                        if (error)
                                goto err_out;
                }

                if (shrink) {
                        if (ext4_should_order_data(inode)) {
                                error = ext4_begin_ordered_truncate(inode,
                                                            attr->ia_size);
                                if (error)
                                        goto err_out;
                        }
                        /*
                         * Blocks are going to be removed from the inode. Wait
                         * for dio in flight.
                         */
                        inode_dio_wait(inode);
                }

                /* i_mmap_sem is held for write until out_mmap_sem below. */
                down_write(&EXT4_I(inode)->i_mmap_sem);

                /* On failure "error" is still 0 here, so err_out returns
                 * rc. */
                rc = ext4_break_layouts(inode);
                if (rc) {
                        up_write(&EXT4_I(inode)->i_mmap_sem);
                        goto err_out;
                }

                if (attr->ia_size != inode->i_size) {
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                        if (IS_ERR(handle)) {
                                error = PTR_ERR(handle);
                                goto out_mmap_sem;
                        }
                        /* A failure of ext4_orphan_add() is kept in "error";
                         * the code still proceeds to the i_disksize update
                         * below, which is then rolled back. */
                        if (ext4_handle_valid(handle) && shrink) {
                                error = ext4_orphan_add(handle, inode);
                                orphan = 1;
                        }
                        /*
                         * Update c/mtime on truncate up, ext4_truncate() will
                         * update c/mtime in shrink case below
                         */
                        if (!shrink) {
                                inode->i_mtime = current_time(inode);
                                inode->i_ctime = inode->i_mtime;
                        }

                        /* Report the affected logical block range to
                         * fast-commit tracking: from the block of the new
                         * last byte up to EXT_MAX_BLOCKS - 1 when shrinking,
                         * or from the block of the old last byte to that of
                         * the new last byte when growing. */
                        if (shrink)
                                ext4_fc_track_range(handle, inode,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits,
                                        EXT_MAX_BLOCKS - 1);
                        else
                                ext4_fc_track_range(
                                        handle, inode,
                                        (oldsize > 0 ? oldsize - 1 : oldsize) >>
                                        inode->i_sb->s_blocksize_bits,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits);

                        down_write(&EXT4_I(inode)->i_data_sem);
                        old_disksize = EXT4_I(inode)->i_disksize;
                        EXT4_I(inode)->i_disksize = attr->ia_size;
                        rc = ext4_mark_inode_dirty(handle, inode);
                        /* Keep an earlier error (from ext4_orphan_add())
                         * in preference to rc. */
                        if (!error)
                                error = rc;
                        /*
                         * We have to update i_size under i_data_sem together
                         * with i_disksize to avoid races with writeback code
                         * running ext4_wb_update_i_disksize().
                         */
                        if (!error)
                                i_size_write(inode, attr->ia_size);
                        else
                                EXT4_I(inode)->i_disksize = old_disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        ext4_journal_stop(handle);
                        if (error)
                                goto out_mmap_sem;
                        if (!shrink) {
                                pagecache_isize_extended(inode, oldsize,
                                                         inode->i_size);
                        } else if (ext4_should_journal_data(inode)) {
                                ext4_wait_for_tail_page_commit(inode);
                        }
                }

                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
                 */
                truncate_pagecache(inode, inode->i_size);
                /*
                 * Call ext4_truncate() even if i_size didn't change to
                 * truncate possible preallocated blocks.
                 */
                if (attr->ia_size <= oldsize) {
                        rc = ext4_truncate(inode);
                        if (rc)
                                error = rc;
                }
out_mmap_sem:
                up_write(&EXT4_I(inode)->i_mmap_sem);
        }

        /* Copy the remaining attributes only if nothing above failed. */
        if (!error) {
                setattr_copy(inode, attr);
                mark_inode_dirty(inode);
        }

        /*
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);

        /* A posix_acl_chmod() failure lands in rc and is returned below. */
        if (!error && (ia_valid & ATTR_MODE))
                rc = posix_acl_chmod(inode, inode->i_mode);

err_out:
        if  (error)
                ext4_std_error(inode->i_sb, error);
        if (!error)
                error = rc;
        return error;
}

/*
 * Fill @stat for an ext4 inode.
 *
 * Reports the creation time when STATX_BTIME is requested and the i_crtime
 * field fits in the inode, translates the user-visible inode flags into
 * STATX_ATTR_* bits, advertises which attribute bits ext4 can report, and
 * finally lets generic_fillattr() fill in the generic fields.
 *
 * @query_flags is not used here.  Always returns 0.
 */
int ext4_getattr(const struct path *path, struct kstat *stat,
                 u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct ext4_inode_info *ei = EXT4_I(inode);
        /*
         * Deliberately left unassigned: it is only handed to
         * EXT4_FITS_IN_INODE(), which presumably needs it for type/offset
         * information only -- NOTE(review): confirm against the macro.
         */
        struct ext4_inode *raw_inode;
        unsigned int visible = ei->i_flags & EXT4_FL_USER_VISIBLE;
        u64 attrs = 0;

        if ((request_mask & STATX_BTIME) &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
                stat->btime.tv_sec = ei->i_crtime.tv_sec;
                stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
                stat->result_mask |= STATX_BTIME;
        }

        /* Map each user-visible ext4 flag onto its statx attribute bit. */
        if (visible & EXT4_APPEND_FL)
                attrs |= STATX_ATTR_APPEND;
        if (visible & EXT4_COMPR_FL)
                attrs |= STATX_ATTR_COMPRESSED;
        if (visible & EXT4_ENCRYPT_FL)
                attrs |= STATX_ATTR_ENCRYPTED;
        if (visible & EXT4_IMMUTABLE_FL)
                attrs |= STATX_ATTR_IMMUTABLE;
        if (visible & EXT4_NODUMP_FL)
                attrs |= STATX_ATTR_NODUMP;
        if (visible & EXT4_VERITY_FL)
                attrs |= STATX_ATTR_VERITY;
        stat->attributes |= attrs;

        /* The set of attribute bits this function is able to report. */
        stat->attributes_mask |= STATX_ATTR_APPEND |
                                 STATX_ATTR_COMPRESSED |
                                 STATX_ATTR_ENCRYPTED |
                                 STATX_ATTR_IMMUTABLE |
                                 STATX_ATTR_NODUMP |
                                 STATX_ATTR_VERITY;

        generic_fillattr(inode, stat);
        return 0;
}

/*
 * getattr for regular files: ext4_getattr() plus block-count adjustments
 * for inline data and for delayed-allocation reservations.  Always
 * returns 0.
 */
int ext4_file_getattr(const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct super_block *sb = inode->i_sb;
        u64 reserved;

        ext4_getattr(path, stat, request_mask, query_flags);

        /*
         * An inode carrying inline data normally has no data blocks
         * allocated (it may have an external xattr block).  Add the size
         * rounded up to 512-byte sectors so that tools such as tar or rsync
         * do not mistake the file for a completely sparse one.
         */
        if (unlikely(ext4_has_inline_data(inode)))
                stat->blocks += (stat->size + 511) >> 9;

        /*
         * i_blocks is only updated together with the real allocation;
         * updating it for delayed allocation would leave it inconsistent
         * with the on-disk blocks after a crash.  To avoid confusing the
         * user, the blocks reported by stat nevertheless include the
         * blocks reserved for delayed allocation, converted from clusters
         * to 512-byte sectors.
         */
        reserved = EXT4_C2B(EXT4_SB(sb),
                            EXT4_I(inode)->i_reserved_data_blocks);
        stat->blocks += reserved << (sb->s_blocksize_bits - 9);

        return 0;
}

/*
 * Number of index blocks to account for: extent-mapped inodes are sized by
 * @pextents, all other inodes by @lblocks.
 */
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
                                   int pextents)
{
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_index_trans_blocks(inode, pextents);

        return ext4_ind_trans_blocks(inode, lblocks);
}

/*
 * Account for index blocks, block groups bitmaps and block group
 * descriptor blocks if modify datablocks and index blocks
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
 * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents)
{
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
        int idxblocks;
        int ret = 0;

        /*
         * How many index blocks need to touch to map @lblocks logical blocks
         * to @pextents physical extents?
         */
        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

        ret = idxblocks;

        /*
         * Now let's see how many group bitmaps and group descriptors need
         * to account
         */
        groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

        /* bitmaps and block group descriptor blocks */
        ret += groups + gdpblocks;

        /* Blocks for super block, inode, quota and xattr blocks */
        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

        return ret;
}

/*
 * Total number of credits to reserve so that the modification of a single
 * page, possibly made of several chunks of block allocation, fits into one
 * transaction.
 *
 * This could be called via ext4_write_begin().
 *
 * The worst case is assumed: every block of the page needs its own extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, bpp, bpp);

        /* In journalled-data mode the data blocks need credits as well. */
        if (ext4_should_journal_data(inode))
                credits += bpp;

        return credits;
}

/*
 * Journal credits needed to modify one chunk of data.
 *
 * Used by DIO, fallocate and any other caller of ext4_map_blocks() that
 * maps or allocates a chunk of contiguous disk blocks.
 *
 * Journal buffers for the data blocks themselves are not included, since
 * DIO and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* A contiguous chunk is accounted as a single physical extent. */
        const int pextents = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, pextents);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 *
 * Returns -EIO (after dropping one reference on iloc->bh) when the
 * filesystem has been forcibly shut down, otherwise the result of
 * ext4_do_update_inode().
 */
int ext4_mark_iloc_dirty(handle_t *handle,
                         struct inode *inode, struct ext4_iloc *iloc)
{
        int ret = 0;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
                put_bh(iloc->bh);
                return -EIO;
        }

        ext4_fc_track_inode(handle, inode);

        /*
         * EA inodes store a reference count in i_version, so leave it
         * alone for them.
         */
        if (IS_I_VERSION(inode)) {
                if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
                        inode_inc_iversion(inode);
        }

        /* ext4_do_update_inode() consumes one bh->b_count. */
        get_bh(iloc->bh);

        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata. */
        ret = ext4_do_update_inode(handle, inode, iloc);
        put_bh(iloc->bh);

        return ret;
}

/*
 * Look up the on-disk location of @inode and obtain journal write access
 * to the buffer holding it.
 *
 * On success we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.  If write access cannot be
 * obtained the buffer is released and iloc->bh is set to NULL.
 *
 * Returns 0 on success, -EIO after a forced shutdown, or the error from the
 * lookup / journal access (also reported through ext4_std_error()).
 */

int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                         struct ext4_iloc *iloc)
{
        int ret;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        ret = ext4_get_inode_loc(inode, iloc);
        if (ret)
                goto report;

        BUFFER_TRACE(iloc->bh, "get_write_access");
        ret = ext4_journal_get_write_access(handle, iloc->bh);
        if (ret) {
                brelse(iloc->bh);
                iloc->bh = NULL;
        }

report:
        ext4_std_error(inode->i_sb, ret);
        return ret;
}

/*
 * Grow the extra space of the inode located by @iloc to @new_extra_isize.
 *
 * Without in-inode extended attributes the newly covered bytes are simply
 * zeroed and i_extra_isize is updated.  Otherwise the work is delegated to
 * ext4_expand_extra_isize_ea(); if that fails, *@no_expand is set to 1.
 *
 * Returns 0 on success, -EFSCORRUPTED for an inconsistent current
 * i_extra_isize, -EINVAL for an unacceptable @new_extra_isize, -EAGAIN when
 * quotas still need initializing, or the error from the EA expansion.
 */
static int __ext4_expand_extra_isize(struct inode *inode,
                                     unsigned int new_extra_isize,
                                     struct ext4_iloc *iloc,
                                     handle_t *handle, int *no_expand)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        int ret;

        /* This was checked at iget time, but double check for good measure. */
        if ((ei->i_extra_isize & 3) ||
            (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size)) {
                EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
                                 ei->i_extra_isize,
                                 EXT4_INODE_SIZE(inode->i_sb));
                return -EFSCORRUPTED;
        }

        /* Should never happen. */
        if ((new_extra_isize < 4) ||
            (new_extra_isize < ei->i_extra_isize) ||
            (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
                return -EINVAL;

        raw_inode = ext4_raw_inode(iloc);
        header = IHDR(inode, raw_inode);

        /* No extended attributes present: just zero the added bytes. */
        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
            header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                void *grow_from = (void *)raw_inode +
                                  EXT4_GOOD_OLD_INODE_SIZE +
                                  ei->i_extra_isize;

                memset(grow_from, 0, new_extra_isize - ei->i_extra_isize);
                ei->i_extra_isize = new_extra_isize;
                return 0;
        }

        /*
         * We may need to allocate an external xattr block, so we need
         * quotas initialized.  Here we can be called with various locks
         * held, so we cannot afford to initialize quotas ourselves.  So
         * just bail.
         */
        if (dquot_initialize_needed(inode))
                return -EAGAIN;

        /* Try to expand with EAs present. */
        ret = ext4_expand_extra_isize_ea(inode, new_extra_isize,
                                         raw_inode, handle);
        /* Inode size expansion failed; don't try again. */
        if (ret)
                *no_expand = 1;

        return ret;
}

/*
 * Best-effort attempt to expand the extra space of @inode to
 * @new_extra_isize bytes using the caller's @handle.
 *
 * Returns 0 on success or a negative error number on failure:
 * -EOVERFLOW when the inode is flagged EXT4_STATE_NO_EXPAND, -ENOSPC when
 * ext4_journal_extend() fails, -EBUSY when ext4_write_trylock_xattr()
 * returns 0, otherwise whatever __ext4_expand_extra_isize() returns.
 */
static int ext4_try_to_expand_extra_isize(struct inode *inode,
                                          unsigned int new_extra_isize,
                                          struct ext4_iloc iloc,
                                          handle_t *handle)
{
        int no_expand;
        int ret;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
                return -EOVERFLOW;

        /*
         * In nojournal mode, we can immediately attempt to expand the
         * inode.  When journaled, we first need to obtain extra buffer
         * credits since we may write into the EA block with this same
         * handle.  If journal_extend fails, then it will only result in a
         * minor loss of functionality for that inode.  If this is felt to
         * be critical, then e2fsck should be run to force a large enough
         * s_min_extra_isize.
         */
        if (ext4_journal_extend(handle,
                                EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0))
                return -ENOSPC;

        if (!ext4_write_trylock_xattr(inode, &no_expand))
                return -EBUSY;

        ret = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
                                        handle, &no_expand);
        ext4_write_unlock_xattr(inode, &no_expand);

        return ret;
}

/*
 * Expand the extra space of @inode to @new_extra_isize inside a journal
 * handle started here.
 *
 * Every early error path visible below releases iloc->bh with brelse();
 * once write access has been obtained, @iloc is handed to
 * ext4_mark_iloc_dirty() regardless of whether the expansion succeeded.
 *
 * Returns 0 on success or a negative errno.  An error from
 * __ext4_expand_extra_isize() takes precedence over one from
 * ext4_mark_iloc_dirty().
 */
int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc)
{
        handle_t *handle;
        int no_expand;
        int error, rc;

        /* Expansion is disabled for this inode: drop the buffer and bail. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                brelse(iloc->bh);
                return -EOVERFLOW;
        }

        handle = ext4_journal_start(inode, EXT4_HT_INODE,
                                    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                brelse(iloc->bh);
                return error;
        }

        /* The xattr lock is taken after the handle has been started. */
        ext4_write_lock_xattr(inode, &no_expand);

        BUFFER_TRACE(iloc->bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, iloc->bh);
        if (error) {
                brelse(iloc->bh);
                goto out_unlock;
        }

        error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
                                          handle, &no_expand);

        /* Mark the inode dirty even on failure; keep the first error. */
        rc = ext4_mark_iloc_dirty(handle, inode, iloc);
        if (!error)
                error = rc;

out_unlock:
        ext4_write_unlock_xattr(inode, &no_expand);
        ext4_journal_stop(handle);
        return error;
}

/*
 * What we do here is to mark the in-core inode as clean with respect to
 * inode dirtiness (it may still be data-dirty).  This means that the
 * in-core inode may be reaped by prune_icache without having to perform any
 * I/O.  This is a very good thing, because *any* task may call
 * prune_icache - even ones which have a transaction open against a
 * different journal.
 *
 * Is this cheating?  Not really.  Sure, we haven't written the inode out,
 * but prune_icache isn't a user-visible syncing function.  Whenever the
 * user wants stuff synced (sys_sync, sys_msync, sys_fsync) we start and
 * wait on commits.
 *
 * @func and @line identify the caller and are only used when reporting an
 * error.  Returns 0 on success or a negative errno.
 */
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_iloc iloc;
        int ret;

        might_sleep();
        trace_ext4_mark_inode_dirty(inode, _RET_IP_);

        ret = ext4_reserve_inode_write(handle, inode, &iloc);
        if (!ret) {
                /*
                 * Opportunistically grow the extra inode space; the result
                 * of the attempt is deliberately ignored.
                 */
                if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
                        ext4_try_to_expand_extra_isize(inode,
                                                       sbi->s_want_extra_isize,
                                                       iloc, handle);

                ret = ext4_mark_iloc_dirty(handle, inode, &iloc);
        }

        if (unlikely(ret))
                ext4_error_inode_err(inode, func, line, 0, ret,
                                        "mark_inode_dirty error");
        return ret;
}

/*
 * ext4_dirty_inode() is called from __mark_inode_dirty()
 *
 * We're really interested in the case where a file is being extended:
 * i_size has been changed and we thus need to include the updated inode in
 * the current transaction.
 *
 * Also, dquot_alloc_block() will always dirty the inode when blocks are
 * allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing so
 * would cause a commit on atime updates, which we don't bother doing.  We
 * handle synchronous inodes at the highest possible level.
 *
 * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need to
 * copy into the on-disk inode structure are the timestamp fields.
 *
 * Failure to start a handle is silently ignored.
 */
void ext4_dirty_inode(struct inode *inode, int flags)
{
        handle_t *handle;

        if (flags == I_DIRTY_TIME)
                return;

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                return;

        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
}

/*
 * Switch the data-journalling state of @inode: a non-zero @val sets
 * EXT4_INODE_JOURNAL_DATA, zero clears it (after flushing the journal).
 * The address space operations are re-selected with ext4_set_aops() and the
 * inode is finally marked dirty in a synchronous handle.
 *
 * Returns 0 when there is no journal or on success, -EROFS when the journal
 * is aborted, or a negative errno from flushing data/journal, starting the
 * handle or dirtying the inode.
 */
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT4_JOURNAL(inode);
        if (!journal)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;

        /* Wait for all existing dio workers */
        inode_dio_wait(inode);

        /*
         * Before flushing the journal and switching inode's aops, we have
         * to flush all dirty data the inode has. There can be outstanding
         * delayed allocations, there can be unwritten extents created by
         * fallocate or buffered writes in dioread_nolock mode covered by
         * dirty data which can be converted only after flushing the dirty
         * data (and journalled aops don't know how to handle these cases).
         */
        if (val) {
                /* Held until after the aops switch below (enable path only). */
                down_write(&EXT4_I(inode)->i_mmap_sem);
                err = filemap_write_and_wait(inode->i_mapping);
                if (err < 0) {
                        up_write(&EXT4_I(inode)->i_mmap_sem);
                        return err;
                }
        }

        percpu_down_write(&sbi->s_writepages_rwsem);
        jbd2_journal_lock_updates(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else {
                err = jbd2_journal_flush(journal);
                if (err < 0) {
                        /*
                         * val is zero on this path, so i_mmap_sem was never
                         * taken and only the two locks above are dropped.
                         */
                        jbd2_journal_unlock_updates(journal);
                        percpu_up_write(&sbi->s_writepages_rwsem);
                        return err;
                }
                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        }
        ext4_set_aops(inode);

        /* Release in the reverse order of acquisition. */
        jbd2_journal_unlock_updates(journal);
        percpu_up_write(&sbi->s_writepages_rwsem);

        if (val)
                up_write(&EXT4_I(inode)->i_mmap_sem);

        /* Finally we can mark the inode as dirty. */

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_fc_mark_ineligible(inode->i_sb,
                EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);

        return err;
}

/*
 * Buffer walker callback: returns 1 for a buffer that is not mapped and 0
 * for a mapped one.  @handle is unused.
 */
static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
{
        if (buffer_mapped(bh))
                return 0;

        return 1;
}

/*
 * Handle a write fault on a page of a file mapping.
 *
 * Three strategies are visible below:
 *  - delalloc enabled (and no non-delalloc switch): block_page_mkwrite()
 *    with ext4_da_get_block_prep, retried on -ENOSPC;
 *  - data journalling: skip straight to a journal handle and prepare the
 *    page buffers by hand;
 *  - otherwise: return early if every buffer up to the end of the data is
 *    already mapped, else allocate under a journal handle via
 *    block_page_mkwrite().
 *
 * i_mmap_sem is held for reading and the page-fault freeze protection is
 * held from sb_start_pagefault() until the "out" label.
 */
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = vmf->page;
        loff_t size;
        unsigned long len;
        int err;
        vm_fault_t ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        /*
         * Only assigned on the non-journalled path and only read when
         * !ext4_should_journal_data() below.
         */
        get_block_t *get_block;
        int retries = 0;

        if (unlikely(IS_IMMUTABLE(inode)))
                return VM_FAULT_SIGBUS;

        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);

        down_read(&EXT4_I(inode)->i_mmap_sem);

        err = ext4_convert_inline_data(inode);
        if (err)
                goto out_ret;

        /*
         * On data journalling we skip straight to the transaction handle:
         * there's no delalloc; page truncated will be checked later; the
         * early return w/ all buffers mapped (calculates size/len) can't
         * be used; and there's no dioread_nolock, so only ext4_get_block.
         */
        if (ext4_should_journal_data(inode))
                goto retry_alloc;

        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_nonda_switch(inode->i_sb)) {
                do {
                        err = block_page_mkwrite(vma, vmf,
                                                   ext4_da_get_block_prep);
                } while (err == -ENOSPC &&
                       ext4_should_retry_alloc(inode->i_sb, &retries));
                goto out_ret;
        }

        lock_page(page);
        size = i_size_read(inode);
        /* Page got truncated from under us? */
        if (page->mapping != mapping || page_offset(page) > size) {
                unlock_page(page);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* Only the part of the last page that lies within i_size counts. */
        if (page->index == size >> PAGE_SHIFT)
                len = size & ~PAGE_MASK;
        else
                len = PAGE_SIZE;
        /*
         * Return if we have all the buffers mapped. This avoids the need to do
         * journal_start/journal_stop which can block and take a long time
         *
         * This cannot be done for data journalling, as we have to add the
         * inode to the transaction's list to writeprotect pages on commit.
         */
        if (page_has_buffers(page)) {
                if (!ext4_walk_page_buffers(NULL, page_buffers(page),
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
                        wait_for_stable_page(page);
                        /* The page is returned still locked. */
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        }
        unlock_page(page);
        /* OK, we need to fill the hole... */
        if (ext4_should_dioread_nolock(inode))
                get_block = ext4_get_block_unwritten;
        else
                get_block = ext4_get_block;
retry_alloc:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
        /*
         * Data journalling can't use block_page_mkwrite() because it
         * will set_buffer_dirty() before do_journal_get_write_access()
         * thus might hit warning messages for dirty metadata buffers.
         */
        if (!ext4_should_journal_data(inode)) {
                err = block_page_mkwrite(vma, vmf, get_block);
        } else {
                lock_page(page);
                size = i_size_read(inode);
                /* Page got truncated from under us? */
                if (page->mapping != mapping || page_offset(page) > size) {
                        ret = VM_FAULT_NOPAGE;
                        goto out_error;
                }

                if (page->index == size >> PAGE_SHIFT)
                        len = size & ~PAGE_MASK;
                else
                        len = PAGE_SIZE;

                err = __block_write_begin(page, 0, len, ext4_get_block);
                if (!err) {
                        /* Any failure from here on is reported as SIGBUS. */
                        ret = VM_FAULT_SIGBUS;
                        if (ext4_walk_page_buffers(handle, page_buffers(page),
                                        0, len, NULL, do_journal_get_write_access))
                                goto out_error;
                        if (ext4_walk_page_buffers(handle, page_buffers(page),
                                        0, len, NULL, write_end_fn))
                                goto out_error;
                        if (ext4_jbd2_inode_add_write(handle, inode,
                                                      page_offset(page), len))
                                goto out_error;
                        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
                } else {
                        unlock_page(page);
                }
        }
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry_alloc;
out_ret:
        /* Translate the errno-style result into a vm_fault_t code. */
        ret = block_page_mkwrite_return(err);
out:
        up_read(&EXT4_I(inode)->i_mmap_sem);
        sb_end_pagefault(inode->i_sb);
        return ret;
out_error:
        /* Journalled-data failure: the page is locked and a handle is open. */
        unlock_page(page);
        ext4_journal_stop(handle);
        goto out;
}

vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        vm_fault_t ret;

        down_read(&EXT4_I(inode)->i_mmap_sem);
        ret = filemap_fault(vmf);
        up_read(&EXT4_I(inode)->i_mmap_sem);

        return ret;
}


































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _linux_POSIX_TIMERS_H
#define _linux_POSIX_TIMERS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/alarmtimer.h>
#include <linux/timerqueue.h>
#include <linux/task_work.h>

struct kernel_siginfo;
struct task_struct;

/*
 * Bit fields within a clockid:
 *
 * The most significant 29 bits hold either a pid or a file descriptor.
 *
 * Bit 2 indicates whether a cpu clock refers to a thread or a process.
 *
 * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
 *
 * A clockid is invalid if bits 2, 1, and 0 are all set.
 */
/* Recover the id stored above the three low bits (stored bitwise-inverted). */
#define CPUCLOCK_PID(clock)                ((pid_t) ~((clock) >> 3))
/* Non-zero when bit 2 (CPUCLOCK_PERTHREAD_MASK) is set in @clock. */
#define CPUCLOCK_PERTHREAD(clock) \
        (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)

#define CPUCLOCK_PERTHREAD_MASK        4
/* Extract the clock type held in bits 1 and 0. */
#define CPUCLOCK_WHICH(clock)        ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
#define CPUCLOCK_CLOCK_MASK        3
/* Clock type values; CPUCLOCK_MAX doubles as the FD type (CLOCKFD). */
#define CPUCLOCK_PROF                0
#define CPUCLOCK_VIRT                1
#define CPUCLOCK_SCHED                2
#define CPUCLOCK_MAX                3
#define CLOCKFD                        CPUCLOCK_MAX
/* All three low bits: per-thread bit plus the two type bits. */
#define CLOCKFD_MASK                (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)

/*
 * Build a clockid from @pid and the low-bit value @clock: the bitwise
 * inverse of @pid is placed above the three low bits and @clock is OR-ed in.
 */
static inline clockid_t make_process_cpuclock(const unsigned int pid,
                const clockid_t clock)
{
        const unsigned int upper = ~pid << 3;

        return upper | clock;
}
/*
 * Same as make_process_cpuclock(), with the per-thread bit
 * (CPUCLOCK_PERTHREAD_MASK) additionally set in the low bits.
 */
static inline clockid_t make_thread_cpuclock(const unsigned int tid,
                const clockid_t clock)
{
        const clockid_t thread_clock = clock | CPUCLOCK_PERTHREAD_MASK;

        return make_process_cpuclock(tid, thread_clock);
}

/* Encode file descriptor @fd as a clockid whose type bits are CLOCKFD. */
static inline clockid_t fd_to_clockid(const int fd)
{
        const unsigned int id = (unsigned int) fd;

        return make_process_cpuclock(id, CLOCKFD);
}

/*
 * Recover the value stored above the three low bits of @clk: drop the low
 * bits, then undo the bitwise inversion.
 */
static inline int clockid_to_fd(const clockid_t clk)
{
        const clockid_t inverted = clk >> 3;

        return ~inverted;
}

#ifdef CONFIG_POSIX_TIMERS

/**
 * struct cpu_timer - Posix CPU timer representation for k_itimer
 * @node:        timerqueue node to queue in the task/sig
 * @head:        timerqueue head on which this timer is queued
 *                (NULL while the timer is not queued)
 * @pid:        Pointer to target task PID
 * @elist:        List head for the expiry list
 * @firing:        Timer is currently firing
 * @handling:        Pointer to the task which handles expiry
 */
struct cpu_timer {
        struct timerqueue_node                node;
        struct timerqueue_head                *head;
        struct pid                        *pid;
        struct list_head                elist;
        int                                firing;
        struct task_struct __rcu        *handling;
};

/*
 * Record @head as the queue of @ctmr and add the timer's node to it.
 * Returns whatever timerqueue_add() returns.
 */
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
                                     struct cpu_timer *ctmr)
{
        struct timerqueue_node *node = &ctmr->node;

        ctmr->head = head;

        return timerqueue_add(head, node);
}

/*
 * Remove @ctmr from the queue it is on, if any, and clear its head pointer.
 * Does nothing when the timer is not queued.
 */
static inline void cpu_timer_dequeue(struct cpu_timer *ctmr)
{
        if (!ctmr->head)
                return;

        timerqueue_del(ctmr->head, &ctmr->node);
        ctmr->head = NULL;
}

static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr)
{
        return ctmr->node.expires;
}

/* Store @exp as the expiry value in the timerqueue node of @ctmr. */
static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp)
{
        struct timerqueue_node *node = &ctmr->node;

        node->expires = exp;
}

/**
 * struct posix_cputimer_base - Container per posix CPU clock
 * @nextevt:                Earliest-expiration cache
 * @tqhead:                timerqueue head for cpu_timers
 */
struct posix_cputimer_base {
        u64                        nextevt;
        struct timerqueue_head        tqhead;
};

/**
 * struct posix_cputimers - Container for posix CPU timer related data
 * @bases:                Base container for posix CPU clocks, one entry
 *                        per clock type below CPUCLOCK_MAX
 * @timers_active:        Timers are queued.
 * @expiry_active:        Timer expiry is active. Used for
 *                        process wide timers to avoid multiple
 *                        task trying to handle expiry concurrently
 *
 * Used in task_struct and signal_struct
 */
struct posix_cputimers {
        struct posix_cputimer_base        bases[CPUCLOCK_MAX];
        unsigned int                        timers_active;
        unsigned int                        expiry_active;
};

/**
 * struct posix_cputimers_work - Container for task work based posix CPU
 *                                 timer expiry
 * @work:        The task work to be scheduled
 * @mutex:        Mutex held around expiry in context of this task work
 * @scheduled:  @work has been scheduled already, no further processing
 */
struct posix_cputimers_work {
        struct callback_head        work;
        struct mutex                mutex;
        unsigned int                scheduled;
};

static inline void posix_cputimers_init(struct posix_cputimers *pct)
{
        memset(pct, 0, sizeof(*pct));
        pct->bases[0].nextevt = U64_MAX;
        pct->bases[1].nextevt = U64_MAX;
        pct->bases[2].nextevt = U64_MAX;
}

void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit);

/* Set the next-event cache of the CPUCLOCK_SCHED base to @runtime. */
static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
                                               u64 runtime)
{
        struct posix_cputimer_base *sched = &pct->bases[CPUCLOCK_SCHED];

        sched->nextevt = runtime;
}

/* Init task static initializer */
/* One base: only .nextevt is set (to U64_MAX); the argument @b is unused. */
#define INIT_CPU_TIMERBASE(b) {                                                \
        .nextevt        = U64_MAX,                                        \
}

/* Initializer for the three clock bases, in index order 0..2. */
#define INIT_CPU_TIMERBASES(b) {                                        \
        INIT_CPU_TIMERBASE(b[0]),                                        \
        INIT_CPU_TIMERBASE(b[1]),                                        \
        INIT_CPU_TIMERBASE(b[2]),                                        \
}

/*
 * Designated initializer for a .posix_cputimers member; note the trailing
 * comma, so it is meant to be used inside a larger initializer list.
 */
#define INIT_CPU_TIMERS(s)                                                \
        .posix_cputimers = {                                                \
                .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases),        \
        },
#else
/*
 * CONFIG_POSIX_TIMERS is not set: provide empty types, an empty
 * initializer macro and no-op init helpers under the same names.
 */
struct posix_cputimers { };
struct cpu_timer { };
#define INIT_CPU_TIMERS(s)
static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
                                              u64 cpu_limit) { }
#endif

/*
 * Task work helpers: real implementations are declared only with
 * CONFIG_POSIX_CPU_TIMERS_TASK_WORK; otherwise both calls are empty inlines.
 */
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
void clear_posix_cputimers_work(struct task_struct *p);
void posix_cputimers_init_work(void);
#else
static inline void clear_posix_cputimers_work(struct task_struct *p) { }
static inline void posix_cputimers_init_work(void) { }
#endif

#define REQUEUE_PENDING 1

/**
 * struct k_itimer - POSIX.1b interval timer structure.
 * @list:                List head for binding the timer to signals->posix_timers
 * @t_hash:                Entry in the posix timer hash table
 * @it_lock:                Lock protecting the timer
 * @kclock:                Pointer to the k_clock struct handling this timer
 * @it_clock:                The posix timer clock id
 * @it_id:                The posix timer id for identifying the timer
 * @it_active:                Marker that timer is active
 * @it_overrun:                The overrun counter for pending signals
 * @it_overrun_last:        The overrun at the time of the last delivered signal
 * @it_requeue_pending:        Indicator that timer waits for being requeued on
 *                        signal delivery
 * @it_sigev_notify:        The notify word of sigevent struct for signal delivery
 * @it_interval:        The interval for periodic timers
 * @it_signal:                Pointer to the creators signal struct
 * @it_pid:                The pid of the process/task targeted by the signal
 * @it_process:                The task to wakeup on clock_nanosleep (CPU timers)
 * @sigq:                Pointer to preallocated sigqueue
 * @it:                        Union representing the various posix timer type
 *                        internals.
 * @rcu:                RCU head for freeing the timer.
 */
struct k_itimer {
        struct list_head        list;
        struct hlist_node        t_hash;
        spinlock_t                it_lock;
        const struct k_clock        *kclock;
        clockid_t                it_clock;
        timer_t                        it_id;
        int                        it_active;
        s64                        it_overrun;
        s64                        it_overrun_last;
        int                        it_requeue_pending;
        int                        it_sigev_notify;
        ktime_t                        it_interval;
        struct signal_struct        *it_signal;
        /* @it_pid and @it_process share storage; only one is meaningful. */
        union {
                struct pid                *it_pid;
                struct task_struct        *it_process;
        };
        struct sigqueue                *sigq;
        /* Per timer type state: hrtimer based, CPU timer or alarm timer. */
        union {
                struct {
                        struct hrtimer        timer;
                } real;
                struct cpu_timer        cpu;
                struct {
                        struct alarm        alarmtimer;
                } alarm;
        } it;
        struct rcu_head                rcu;
};

void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
void posix_cpu_timers_exit_group(struct task_struct *task);
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
                           u64 *newval, u64 *oldval);

void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);

void posixtimer_rearm(struct kernel_siginfo *info);
#endif










    1 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>

/*
 * percpu_alloc_percpu: records the reserved and is_atomic flags, the size
 * and alignment, the base address, the offset and the resulting percpu
 * pointer exactly as passed in, and prints them in that order.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_free_percpu: records the base address, the offset and the percpu
 * pointer as passed in and prints them.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail: records only the request parameters (reserved,
 * is_atomic, size, align); unlike percpu_alloc_percpu it carries no address,
 * offset or pointer fields.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/* percpu_create_chunk: records and prints the base address passed in. */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/* percpu_destroy_chunk: records and prints the base address passed in. */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>



























































































































































































































    1 


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PTRACE_H
#define _LINUX_PTRACE_H

#include <linux/compiler.h>                /* For unlikely.  */
#include <linux/sched.h>                /* For struct task_struct.  */
#include <linux/sched/signal.h>                /* For send_sig(), same_thread_group(), etc. */
#include <linux/err.h>                        /* for IS_ERR_VALUE */
#include <linux/bug.h>                        /* For BUG_ON.  */
#include <linux/pid_namespace.h>        /* For task_active_pid_ns.  */
#include <uapi/linux/ptrace.h>
#include <linux/seccomp.h>

/*
 * struct seccomp_data plus the stack pointer.  seccomp_data is user API and
 * must not be modified, so @sp is carried in this wrapper instead.
 */
struct syscall_info {
        __u64                        sp;
        struct seccomp_data        data;
};

extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
                            void *buf, int len, unsigned int gup_flags);

/*
 * Ptrace flags
 *
 * The ownership rules for task->ptrace, which holds the ptrace
 * flags, are simple.  When a task is running it owns its task->ptrace
 * flags.  When a task is stopped the ptracer owns task->ptrace.
 */

#define PT_SEIZED        0x00010000        /* SEIZE used, enable new behavior */
#define PT_PTRACED        0x00000001

/* Bit offset added to an event number to form its bit in the ptrace flags */
#define PT_OPT_FLAG_SHIFT        3
/* PT_TRACE_* event enable flags */
/* Flag bit for @event: bit number PT_OPT_FLAG_SHIFT + @event */
#define PT_EVENT_FLAG(event)        (1 << (PT_OPT_FLAG_SHIFT + (event)))
#define PT_TRACESYSGOOD                PT_EVENT_FLAG(0)
#define PT_TRACE_FORK                PT_EVENT_FLAG(PTRACE_EVENT_FORK)
#define PT_TRACE_VFORK                PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
#define PT_TRACE_CLONE                PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
#define PT_TRACE_EXEC                PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
#define PT_TRACE_VFORK_DONE        PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
#define PT_TRACE_EXIT                PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
#define PT_TRACE_SECCOMP        PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)

/* PTRACE_O_* option bits shifted left by PT_OPT_FLAG_SHIFT */
#define PT_EXITKILL                (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
#define PT_SUSPEND_SECCOMP        (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)

extern long arch_ptrace(struct task_struct *child, long request,
                        unsigned long addr, unsigned long data);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
extern void ptrace_disable(struct task_struct *);
extern int ptrace_request(struct task_struct *child, long request,
                          unsigned long addr, unsigned long data);
extern void ptrace_notify(int exit_code);
extern void __ptrace_link(struct task_struct *child,
                          struct task_struct *new_parent,
                          const struct cred *ptracer_cred);
extern void __ptrace_unlink(struct task_struct *child);
extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
/* Bits for the @mode argument of ptrace_may_access() */
#define PTRACE_MODE_READ        0x01
#define PTRACE_MODE_ATTACH        0x02
#define PTRACE_MODE_NOAUDIT        0x04
#define PTRACE_MODE_FSCREDS        0x08
#define PTRACE_MODE_REALCREDS        0x10

/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)

/**
 * ptrace_may_access - check whether the caller is permitted to access
 * a target task.
 * @task: target task
 * @mode: selects type of access and caller credentials
 *
 * Returns true on success, false on denial.
 *
 * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must
 * be set in @mode to specify whether the access was requested through
 * a filesystem syscall (should use effective capabilities and fsuid
 * of the caller) or through an explicit syscall such as
 * process_vm_writev or ptrace (and should use the real credentials).
 */
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);

/*
 * Returns 1 when @child's ->parent is not in the same thread group as its
 * ->real_parent, 0 otherwise.
 */
static inline int ptrace_reparented(struct task_struct *child)
{
        if (same_thread_group(child->real_parent, child->parent))
                return 0;

        return 1;
}

/* Call __ptrace_unlink() on @child, but only if ->ptrace is set. */
static inline void ptrace_unlink(struct task_struct *child)
{
        if (!unlikely(child->ptrace))
                return;

        __ptrace_unlink(child);
}

int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data);
int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data);

/**
 * ptrace_parent - return the task that is tracing the given task
 * @task: task to consider
 *
 * Returns the &struct task_struct pointer of @task's tracer, or %NULL when
 * nobody is tracing @task.
 *
 * Must be called under rcu_read_lock().  The returned pointer might be kept
 * live only by RCU.  During exec, this may be called with task_lock() held
 * on @task, still held from when check_unsafe_exec() was called.
 */
static inline struct task_struct *ptrace_parent(struct task_struct *task)
{
        struct task_struct *tracer = NULL;

        if (unlikely(task->ptrace))
                tracer = rcu_dereference(task->parent);

        return tracer;
}

/**
 * ptrace_event_enabled - test whether a ptrace event is enabled
 * @task: ptracee of interest
 * @event: %PTRACE_EVENT_* to test
 *
 * Checks the PT_EVENT_FLAG() bit of @event in @task->ptrace.
 *
 * Returns %true if the bit is set, %false otherwise.
 */
static inline bool ptrace_event_enabled(struct task_struct *task, int event)
{
        if (task->ptrace & PT_EVENT_FLAG(event))
                return true;

        return false;
}

/**
 * ptrace_event - possibly stop for a ptrace event notification
 * @event:        %PTRACE_EVENT_* value to report
 * @message:        value for %PTRACE_GETEVENTMSG to return
 *
 * If @event is enabled for the current task, store @message and notify the
 * ptrace parent.  Otherwise, for %PTRACE_EVENT_EXEC only, a task that is
 * traced but not seized gets the legacy SIGTRAP.
 *
 * Called without locks.
 */
static inline void ptrace_event(int event, unsigned long message)
{
        if (unlikely(ptrace_event_enabled(current, event))) {
                current->ptrace_message = message;
                ptrace_notify((event << 8) | SIGTRAP);
                return;
        }

        if (event != PTRACE_EVENT_EXEC)
                return;

        /* legacy EXEC report via SIGTRAP */
        if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
                send_sig(SIGTRAP, current, 0);
}

/**
 * ptrace_event_pid - possibly stop for a ptrace event notification
 * @event:        %PTRACE_EVENT_* value to report
 * @pid:        process identifier for %PTRACE_GETEVENTMSG to return
 *
 * Check whether @event is enabled and, if so, report @event and @pid
 * to the ptrace parent.  @pid is reported as the pid_t seen from the
 * ptrace parent's pid namespace.
 *
 * Called without locks.
 */
static inline void ptrace_event_pid(int event, struct pid *pid)
{
        struct pid_namespace *parent_ns;
        unsigned long msg = 0;

        /*
         * FIXME: There's a potential race if a ptracer in a different pid
         * namespace than parent attaches between computing message below and
         * when we acquire tasklist_lock in ptrace_stop().  If this happens,
         * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG.
         */
        rcu_read_lock();
        parent_ns = task_active_pid_ns(rcu_dereference(current->parent));
        if (parent_ns)
                msg = pid_nr_ns(pid, parent_ns);
        rcu_read_unlock();

        ptrace_event(event, msg);
}

/**
 * ptrace_init_task - initialize ptrace state for a new child
 * @child:                new child task
 * @ptrace:                true if child should be ptrace'd by parent's tracer
 *
 * This is called immediately after adding @child to its parent's children
 * list.  @ptrace is false in the normal case, and true to ptrace @child.
 *
 * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
 */
static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
{
        INIT_LIST_HEAD(&child->ptrace_entry);
        INIT_LIST_HEAD(&child->ptraced);
        child->jobctl = 0;
        child->ptrace = 0;
        child->parent = child->real_parent;

        /* Not inheriting a tracer: only the tracer credentials need clearing. */
        if (!(unlikely(ptrace) && current->ptrace)) {
                child->ptracer_cred = NULL;
                return;
        }

        /* Inherit current's ptrace flags and link @child to current's parent. */
        child->ptrace = current->ptrace;
        __ptrace_link(child, current->parent, current->ptracer_cred);

        if (child->ptrace & PT_SEIZED)
                task_set_jobctl_pending(child, JOBCTL_TRAP_STOP);
        else
                sigaddset(&child->pending.signal, SIGSTOP);
}

/**
 * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped
 * @task:        task in %EXIT_DEAD state
 *
 * Called with write_lock(&tasklist_lock) held.
 */
static inline void ptrace_release_task(struct task_struct *task)
{
        /* @task's ->ptraced list must already be empty. */
        BUG_ON(!list_empty(&task->ptraced));
        ptrace_unlink(task);
        /* After unlinking, @task's ->ptrace_entry must be empty. */
        BUG_ON(!list_empty(&task->ptrace_entry));
}

#ifndef force_successful_syscall_return
/*
 * System call handlers that, upon successful completion, need to return a
 * negative value should call force_successful_syscall_return() right before
 * returning.  On architectures where the syscall convention provides for a
 * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly
 * others), this macro can be used to ensure that the error flag will not get
 * set.  On architectures which do not support a separate error flag, the macro
 * is a no-op and the spurious error condition needs to be filtered out by some
 * other means (e.g., in user-level, by passing an extra argument to the
 * syscall handler, or something along those lines).
 */
#define force_successful_syscall_return() do { } while (0)
#endif

#ifndef is_syscall_success
/*
 * On most systems we can tell if a syscall is a success based on if the retval
 * is an error value.  On some systems like ia64 and powerpc they have different
 * indicators of success/failure and must define their own.
 */
#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
#endif

/*
 * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
 *
 * These do-nothing inlines are used when the arch does not
 * implement single-step.  The kerneldoc comments are here
 * to document the interface for all arch definitions.
 */

#ifndef arch_has_single_step
/**
 * arch_has_single_step - does this CPU support user-mode single-step?
 *
 * If this is defined, then there must be function declarations or
 * inlines for user_enable_single_step() and user_disable_single_step().
 * arch_has_single_step() should evaluate to nonzero iff the machine
 * supports instruction single-step for user mode.
 * It can be a constant or it can test a CPU feature bit.
 */
#define arch_has_single_step()                (0)

/**
 * user_enable_single_step - single-step in user-mode task
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * This can only be called when arch_has_single_step() has returned nonzero.
 * Set @task so that when it returns to user mode, it will trap after the
 * next single instruction executes.  If arch_has_block_step() is defined,
 * this must clear the effects of user_enable_block_step() too.
 */
static inline void user_enable_single_step(struct task_struct *task)
{
        /*
         * This can never be called: callers must first see a nonzero
         * arch_has_single_step(), and this fallback defines it as (0).
         */
        BUG();
}

/**
 * user_disable_single_step - cancel user-mode single-step
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * Clear @task of the effects of user_enable_single_step() and
 * user_enable_block_step().  This can be called whether or not either
 * of those was ever called on @task, and even if arch_has_single_step()
 * returned zero.
 */
static inline void user_disable_single_step(struct task_struct *task)
{
        /* Do-nothing fallback for arches that do not implement single-step. */
}
#else
extern void user_enable_single_step(struct task_struct *);
extern void user_disable_single_step(struct task_struct *);
#endif        /* arch_has_single_step */

#ifndef arch_has_block_step
/**
 * arch_has_block_step - does this CPU support user-mode block-step?
 *
 * If this is defined, then there must be a function declaration or inline
 * for user_enable_block_step(), and arch_has_single_step() must be defined
 * too.  arch_has_block_step() should evaluate to nonzero iff the machine
 * supports step-until-branch for user mode.  It can be a constant or it
 * can test a CPU feature bit.
 */
#define arch_has_block_step()                (0)

/**
 * user_enable_block_step - step until branch in user-mode task
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * This can only be called when arch_has_block_step() has returned nonzero,
 * and will never be called when single-instruction stepping is being used.
 * Set @task so that when it returns to user mode, it will trap after the
 * next branch or trap taken.
 */
static inline void user_enable_block_step(struct task_struct *task)
{
        /*
         * This can never be called: callers must first see a nonzero
         * arch_has_block_step(), and this fallback defines it as (0).
         */
        BUG();
}
#else
extern void user_enable_block_step(struct task_struct *);
#endif        /* arch_has_block_step */

#ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT
extern void user_single_step_report(struct pt_regs *regs);
#else
/*
 * Generic single-step report: force a SIGTRAP with si_code SI_USER and
 * zeroed errno/pid/uid via force_sig_info().  @regs is not used here.
 */
static inline void user_single_step_report(struct pt_regs *regs)
{
        kernel_siginfo_t si;

        clear_siginfo(&si);

        si.si_uid = 0;
        si.si_pid = 0;
        si.si_code = SI_USER;
        si.si_errno = 0;
        si.si_signo = SIGTRAP;

        force_sig_info(&si);
}
#endif

#ifndef arch_ptrace_stop_needed
/**
 * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
 * @code:        current->exit_code value ptrace will stop with
 * @info:        siginfo_t pointer (or %NULL) for signal ptrace will stop with
 *
 * This is called with the siglock held, to decide whether or not it's
 * necessary to release the siglock and call arch_ptrace_stop() with the
 * same @code and @info arguments.  It can be defined to a constant if
 * arch_ptrace_stop() is never required, or always is.  On machines where
 * this makes sense, it should be defined to a quick test to optimize out
 * calling arch_ptrace_stop() when it would be superfluous.  For example,
 * if the thread has not been back to user mode since the last stop, the
 * thread state might indicate that nothing needs to be done.
 *
 * This is guaranteed to be invoked once before a task stops for ptrace and
 * may include arch-specific operations necessary prior to a ptrace stop.
 */
#define arch_ptrace_stop_needed(code, info)        (0)
#endif

#ifndef arch_ptrace_stop
/**
 * arch_ptrace_stop - Do machine-specific work before stopping for ptrace
 * @code:        current->exit_code value ptrace will stop with
 * @info:        siginfo_t pointer (or %NULL) for signal ptrace will stop with
 *
 * This is called with no locks held when arch_ptrace_stop_needed() has
 * just returned nonzero.  It is allowed to block, e.g. for user memory
 * access.  The arch can have machine-specific work to be done before
 * ptrace stops.  On ia64, register backing store gets written back to user
 * memory here.  Since this can be costly (requires dropping the siglock),
 * we only do it when the arch requires it for this particular stop, as
 * indicated by arch_ptrace_stop_needed().
 */
#define arch_ptrace_stop(code, info)                do { } while (0)
#endif

#ifndef current_pt_regs
#define current_pt_regs() task_pt_regs(current)
#endif

/*
 * unlike current_pt_regs(), this one is equal to task_pt_regs(current)
 * on *all* architectures; the only reason to have a per-arch definition
 * is optimisation.
 */
#ifndef signal_pt_regs
#define signal_pt_regs() task_pt_regs(current)
#endif

#ifndef current_user_stack_pointer
#define current_user_stack_pointer() user_stack_pointer(current_pt_regs())
#endif

extern int task_current_syscall(struct task_struct *target, struct syscall_info *info);

extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact);
#endif


































    4 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VSYSCALL_H
#define _ASM_X86_VSYSCALL_H

#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>
#include <asm/page_types.h>

#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
extern void set_vsyscall_pgtable_user_bits(pgd_t *root);

/*
 * Called on instruction fetch fault in vsyscall page.
 * Returns true if handled.
 */
extern bool emulate_vsyscall(unsigned long error_code,
                             struct pt_regs *regs, unsigned long address);
#else
/*
 * Without CONFIG_X86_VSYSCALL_EMULATION: mapping is a no-op and no fault is
 * ever reported as handled.
 */
static inline void map_vsyscall(void) {}
static inline bool emulate_vsyscall(unsigned long error_code,
                                    struct pt_regs *regs, unsigned long address)
{
        return false;
}
#endif

/*
 * The (legacy) vsyscall page is the only page in the kernel portion
 * of the address space that has user-accessible permissions.  Returns
 * true when @vaddr, masked with PAGE_MASK, equals VSYSCALL_ADDR.
 */
static inline bool is_vsyscall_vaddr(unsigned long vaddr)
{
        if (unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR))
                return true;

        return false;
}

#endif /* _ASM_X86_VSYSCALL_H */




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
/* linux/net/inet/arp.h */
#ifndef _ARP_H
#define _ARP_H

#include <linux/if_arp.h>
#include <linux/hash.h>
#include <net/neighbour.h>


extern struct neigh_table arp_tbl;

/*
 * Hash for ARP table entries: XOR the 32-bit key at @pkey with
 * hash32_ptr(@dev), then multiply by the first word of @hash_rnd.
 */
static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd)
{
        const u32 *addr = pkey;
        u32 mixed = *addr ^ hash32_ptr(dev);

        return mixed * *hash_rnd;
}

#ifdef CONFIG_INET
/*
 * Look up @key on @dev in arp_tbl via ___neigh_lookup_noref().  On loopback
 * and point-to-point devices INADDR_ANY is looked up instead of @key.
 */
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
        u32 lookup_key = key;

        if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
                lookup_key = INADDR_ANY;

        return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &lookup_key, dev);
}
#else
/* Without CONFIG_INET the lookup always returns NULL. */
static inline
struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
        return NULL;
}
#endif

/*
 * Look up @key on @dev inside an rcu_read_lock_bh() section and take a
 * reference on the result.  Returns NULL if nothing was found or if the
 * refcount had already dropped to zero.
 */
static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
{
        struct neighbour *neigh;

        rcu_read_lock_bh();
        neigh = __ipv4_neigh_lookup_noref(dev, key);
        if (neigh) {
                if (!refcount_inc_not_zero(&neigh->refcnt))
                        neigh = NULL;
        }
        rcu_read_unlock_bh();

        return neigh;
}

/*
 * Set ->confirmed of the neighbour for @key on @dev to the current jiffies,
 * if such a neighbour exists.  Runs inside an rcu_read_lock_bh() section.
 */
static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
{
        struct neighbour *neigh;
        unsigned long stamp;

        rcu_read_lock_bh();
        neigh = __ipv4_neigh_lookup_noref(dev, key);
        if (!neigh)
                goto unlock;

        stamp = jiffies;

        /* avoid dirtying neighbour */
        if (READ_ONCE(neigh->confirmed) != stamp)
                WRITE_ONCE(neigh->confirmed, stamp);
unlock:
        rcu_read_unlock_bh();
}

void arp_init(void);
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
void arp_send(int type, int ptype, __be32 dest_ip,
              struct net_device *dev, __be32 src_ip,
              const unsigned char *dest_hw,
              const unsigned char *src_hw, const unsigned char *th);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
void arp_ifdown(struct net_device *dev);
int arp_invalidate(struct net_device *dev, __be32 ip, bool force);

struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
                           struct net_device *dev, __be32 src_ip,
                           const unsigned char *dest_hw,
                           const unsigned char *src_hw,
                           const unsigned char *target_hw);
void arp_xmit(struct sk_buff *skb);

#endif        /* _ARP_H */


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H

#include <linux/poison.h>
#include <linux/const.h>

/*
 * Special version of lists, where end of list is not a NULL pointer,
 * but a 'nulls' marker, which can have many different values.
 * (up to 2^31 different values guaranteed on all platforms)
 *
 * In the standard hlist, termination of a list is the NULL pointer.
 * In this special 'nulls' variant, we use the fact that objects stored in
 * a list are aligned on a word (4 or 8 bytes alignment).
 * We therefore use the least significant bit of 'ptr' :
 * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
 * Set to 0 : This is a pointer to some object (ptr)
 */

/* List head: @first is either the first node or a 'nulls' marker. */
struct hlist_nulls_head {
        struct hlist_nulls_node *first;
};

/*
 * List node: @next is the following node or a 'nulls' marker; @pprev points
 * at the pointer that refers to this node (the head's ->first or the
 * previous node's ->next).
 */
struct hlist_nulls_node {
        struct hlist_nulls_node *next, **pprev;
};
/*
 * NULLS_MARKER - encode @value as a 'nulls' end-of-list marker
 *
 * The value is shifted left by one and bit 0 is set, which is the bit
 * is_a_nulls() tests.  @value is parenthesized so that an expression
 * argument is cast to long as a whole rather than only its first operand.
 */
#define NULLS_MARKER(value) (1UL | (((long)(value)) << 1))
/* INIT_HLIST_NULLS_HEAD - make @ptr an empty list ending in marker @nulls */
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
        ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))

/* Get the containing @type object from node pointer @ptr (container_of). */
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)

/*
 * Like hlist_nulls_entry(), but evaluates @ptr once and yields NULL when
 * @ptr is a 'nulls' marker instead of a real node.
 */
#define hlist_nulls_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
        })
/**
 * is_a_nulls - Test if a ptr is a nulls
 * @ptr: ptr to be tested
 *
 * Returns 1 when bit 0 of @ptr is set, 0 otherwise.
 */
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
        const unsigned long raw = (unsigned long)ptr;

        return (int)(raw & 1UL);
}

/**
 * get_nulls_value - extract the value encoded in a 'nulls' marker
 * @ptr: end-of-chain marker
 *
 * Only meaningful when is_a_nulls(@ptr) is true: the marker bit is
 * shifted out and the remaining bits are returned.
 */
static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return raw >> 1;
}

/**
 * hlist_nulls_unhashed - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * A node counts as unhashed when its @pprev pointer is NULL.
 *
 * Note that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.
 */
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Same test as hlist_nulls_unhashed(), but @pprev is read with
 * READ_ONCE() so this function may be used locklessly.
 *
 * Note that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.
 */
static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
{
        struct hlist_nulls_node **pprev = READ_ONCE(h->pprev);

        return pprev == NULL;
}

/**
 * hlist_nulls_empty - is the list empty?
 * @h: list head to check
 *
 * The list is empty when the head's first pointer is a nulls marker
 * rather than a node.  The pointer is sampled with READ_ONCE().
 */
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
        const struct hlist_nulls_node *first = READ_ONCE(h->first);

        return is_a_nulls(first);
}

/**
 * hlist_nulls_add_head - insert a node at the front of a nulls list
 * @n: node to insert
 * @h: list head to insert it on
 *
 * The previous first element (or the nulls marker, if the list was empty)
 * becomes @n's successor.
 */
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        h->first = n;
        /* Only a real node has a pprev to fix up; a nulls marker does not. */
        if (!is_a_nulls(first))
                WRITE_ONCE(first->pprev, &n->next);
}

/*
 * __hlist_nulls_del - unlink @n from its list.
 *
 * Makes whatever pointed at @n point at @n's successor instead.  @n's own
 * next/pprev fields are left untouched.
 */
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node *next = n->next;
        struct hlist_nulls_node **pprev = n->pprev;

        WRITE_ONCE(*pprev, next);
        /* A nulls marker is not a node, so it has no pprev to update. */
        if (!is_a_nulls(next))
                WRITE_ONCE(next->pprev, pprev);
}

/*
 * hlist_nulls_del - unlink @n and poison its pprev pointer.
 *
 * After this call @n->pprev is LIST_POISON2 (not NULL), so the node does
 * not test as unhashed via hlist_nulls_unhashed().
 */
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * Iteration stops when @pos becomes a nulls marker; @pos then holds that
 * marker.  @tpos is only assigned while @pos is a real node.
 */
#define hlist_nulls_for_each_entry(tpos, pos, head, member)                       \
        for (pos = (head)->first;                                               \
             (!is_a_nulls(pos)) &&                                               \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos must already be set by the caller; iteration starts at @pos itself
 * and stops when @pos becomes a nulls marker.
 */
#define hlist_nulls_for_each_entry_from(tpos, pos, member)        \
        for (; (!is_a_nulls(pos)) &&                                 \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

#endif



























































































































































    1 
    1 












    1 












































    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Netlink message type permission tables, for user generated messages.
 *
 * Author: James Morris <jmorris@redhat.com>
 *
 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if.h>
#include <linux/inet_diag.h>
#include <linux/xfrm.h>
#include <linux/audit.h>
#include <linux/sock_diag.h>

#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * One table entry: associates a netlink message type with the permission
 * bit that selinux_nlmsg_lookup() reports for it.
 */
struct nlmsg_perm {
        u16        nlmsg_type;
        u32        perm;
};

/*
 * Permission table for SECCLASS_NETLINK_ROUTE_SOCKET.  Message types that
 * are not listed here make the lookup fail with -EINVAL.  The
 * BUILD_BUG_ON() in selinux_nlmsg_lookup() is a reminder to extend this
 * table when RTM_MAX grows.
 */
static const struct nlmsg_perm nlmsg_route_perms[] =
{
        { RTM_NEWLINK,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELLINK,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETLINK,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_SETLINK,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWADDR,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELADDR,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETADDR,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWROUTE,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELROUTE,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETROUTE,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWNEIGH,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNEIGH,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNEIGH,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWRULE,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELRULE,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETRULE,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWQDISC,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELQDISC,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETQDISC,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWTCLASS,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELTCLASS,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETTCLASS,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWTFILTER,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELTFILTER,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETTFILTER,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWACTION,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELACTION,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETACTION,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWPREFIX,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETMULTICAST,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_GETANYCAST,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_GETNEIGHTBL,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_SETNEIGHTBL,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWADDRLABEL,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELADDRLABEL,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETADDRLABEL,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_GETDCB,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_SETDCB,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWNETCONF,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNETCONF,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNETCONF,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWMDB,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELMDB,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE  },
        { RTM_GETMDB,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWNSID,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        /* NOTE(review): a DEL message mapped to READ looks unusual - confirm intent. */
        { RTM_DELNSID,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_GETNSID,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWSTATS,                NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETSTATS,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWCACHEREPORT,        NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWCHAIN,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELCHAIN,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETCHAIN,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWNEXTHOP,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNEXTHOP,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNEXTHOP,        NETLINK_ROUTE_SOCKET__NLMSG_READ  },
        { RTM_NEWLINKPROP,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELLINKPROP,        NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWVLAN,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELVLAN,                NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETVLAN,                NETLINK_ROUTE_SOCKET__NLMSG_READ  },
};

/*
 * Permission table for SECCLASS_NETLINK_TCPDIAG_SOCKET: the query
 * messages map to READ, SOCK_DESTROY maps to WRITE.
 */
static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
{
        { TCPDIAG_GETSOCK,        NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { DCCPDIAG_GETSOCK,        NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { SOCK_DIAG_BY_FAMILY,        NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { SOCK_DESTROY,                NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
};

/*
 * Permission table for SECCLASS_NETLINK_XFRM_SOCKET.  The BUILD_BUG_ON()
 * in selinux_nlmsg_lookup() is a reminder to extend this table when
 * XFRM_MSG_MAX grows.
 */
static const struct nlmsg_perm nlmsg_xfrm_perms[] =
{
        { XFRM_MSG_NEWSA,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_DELSA,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETSA,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_NEWPOLICY,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_DELPOLICY,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETPOLICY,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_ALLOCSPI,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_ACQUIRE,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_EXPIRE,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_UPDPOLICY,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_UPDSA,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_POLEXPIRE,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_FLUSHSA,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_FLUSHPOLICY,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_NEWAE,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETAE,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_REPORT,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_MIGRATE,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_NEWSADINFO,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_GETSADINFO,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_NEWSPDINFO,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETSPDINFO,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_MAPPING,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
        { XFRM_MSG_SETDEFAULT,        NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETDEFAULT,        NETLINK_XFRM_SOCKET__NLMSG_READ  },
};

/*
 * Permission table for SECCLASS_NETLINK_AUDIT_SOCKET.  The user message
 * ranges (AUDIT_FIRST_USER_MSG..AUDIT_LAST_USER_MSG and the *_MSG2 range)
 * are handled separately in selinux_nlmsg_lookup() and are not listed.
 */
static const struct nlmsg_perm nlmsg_audit_perms[] =
{
        { AUDIT_GET,                NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_SET,                NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_LIST,                NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
        { AUDIT_ADD,                NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_DEL,                NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_LIST_RULES,        NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
        { AUDIT_ADD_RULE,        NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_DEL_RULE,        NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_USER,                NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
        { AUDIT_SIGNAL_INFO,        NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_TRIM,                NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_MAKE_EQUIV,        NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_TTY_GET,        NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_TTY_SET,        NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT        },
        { AUDIT_GET_FEATURE,        NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_SET_FEATURE,        NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
};


/*
 * nlmsg_perm - look up the permission for a message type in one table.
 * @nlmsg_type: netlink message type to search for
 * @perm: receives the matching entry's permission; untouched on failure
 * @tab: table to search
 * @tabsize: size of @tab in bytes (callers pass sizeof(table))
 *
 * Returns 0 when @nlmsg_type is found, -EINVAL when it is not.
 */
static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab, size_t tabsize)
{
        /* The bound is a size_t, so index with one: no signed/unsigned compare. */
        const size_t count = tabsize / sizeof(*tab);
        size_t i;

        for (i = 0; i < count; i++) {
                if (nlmsg_type == tab[i].nlmsg_type) {
                        *perm = tab[i].perm;
                        return 0;
                }
        }

        return -EINVAL;
}

/*
 * selinux_nlmsg_lookup - map a netlink message to a permission.
 * @sclass: security class of the sending socket
 * @nlmsg_type: netlink message type
 * @perm: receives the permission on success
 *
 * Returns 0 on success, -EINVAL when the message type is not known for
 * @sclass, and -ENOENT when @sclass has no table here.
 */
int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
{
        switch (sclass) {
        case SECCLASS_NETLINK_ROUTE_SOCKET:
                /*
                 * RTM_MAX always points to RTM_SETxxxx, i.e. RTM_NEWxxx + 3.
                 * If this BUILD_BUG_ON() fires, add the new mappings to the
                 * tables at the top of this file first, and only then
                 * update the BUILD_BUG_ON() itself.
                 */
                BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3));
                return nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
                                  sizeof(nlmsg_route_perms));

        case SECCLASS_NETLINK_TCPDIAG_SOCKET:
                return nlmsg_perm(nlmsg_type, perm, nlmsg_tcpdiag_perms,
                                  sizeof(nlmsg_tcpdiag_perms));

        case SECCLASS_NETLINK_XFRM_SOCKET:
                /*
                 * If this BUILD_BUG_ON() fires, add the new mappings to the
                 * tables at the top of this file first, and only then
                 * update the BUILD_BUG_ON() itself.
                 */
                BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT);
                return nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms,
                                  sizeof(nlmsg_xfrm_perms));

        case SECCLASS_NETLINK_AUDIT_SOCKET:
                /* Both user-message ranges are relayed without a table lookup. */
                if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
                     nlmsg_type <= AUDIT_LAST_USER_MSG) ||
                    (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
                     nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
                        *perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
                        return 0;
                }
                return nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
                                  sizeof(nlmsg_audit_perms));

        default:
                /* No messaging from userspace, or class unknown/unhandled */
                return -ENOENT;
        }
}





















































    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>


/* Forward declarations to avoid compiler errors */
struct file;


#ifdef CONFIG_EPOLL

#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

/*
 * Set up the two epoll list heads embedded in "struct file"
 * (f_ep_links and f_tfile_llink) as empty lists.
 */
static inline void eventpoll_init_file(struct file *file)
{
        INIT_LIST_HEAD(&file->f_tfile_llink);
        INIT_LIST_HEAD(&file->f_ep_links);
}


/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

/*
 * This is called from inside fs/file_table.c:__fput() to unlink files
 * from the eventpoll interface. We need to have this facility to cleanup
 * correctly files that are closed without being removed from the eventpoll
 * interface.
 *
 * Does nothing when file->f_ep_links is empty; otherwise hands the file
 * to eventpoll_release_file().
 */
static inline void eventpoll_release(struct file *file)
{

        /*
         * Fast check to avoid the get/release of the semaphore. Since
         * we're doing this outside the semaphore lock, it might return
         * false negatives, but we don't care. It'll help in 99.99% of cases
         * to avoid the semaphore lock. False positives simply cannot happen
         * because the file is on the way to be removed and nobody ( but
         * eventpoll ) has still a reference to this file.
         */
        if (likely(list_empty(&file->f_ep_links)))
                return;

        /*
         * The file is being closed while it is still linked to an epoll
         * descriptor. We need to handle this by correctly unlinking it
         * from its containers.
         */
        eventpoll_release_file(file);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock);

/*
 * Tells whether an epoll_ctl(2) operation needs an event copied from
 * userspace: every operation except EPOLL_CTL_DEL does.
 * Returns 1 if an event is needed, 0 otherwise.
 */
static inline int ep_op_has_event(int op)
{
        if (op == EPOLL_CTL_DEL)
                return 0;

        return 1;
}

#else

/* !CONFIG_EPOLL: the init and release hooks compile to empty stubs. */
static inline void eventpoll_init_file(struct file *file) {}
static inline void eventpoll_release(struct file *file) {}

#endif

#endif /* #ifndef _LINUX_EVENTPOLL_H */



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cgroup

#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CGROUP_H

#include <linux/cgroup.h>
#include <linux/tracepoint.h>

/*
 * Event class for events about a cgroup root.  Records the root's
 * hierarchy_id, its subsys_mask (stored as u16) and its name.
 */
DECLARE_EVENT_CLASS(cgroup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        u16,                ss_mask                        )
                __string(        name,                root->name                )
        ),

        TP_fast_assign(
                __entry->root = root->hierarchy_id;
                __entry->ss_mask = root->subsys_mask;
                __assign_str(name, root->name);
        ),

        TP_printk("root=%d ss_mask=%#x name=%s",
                  __entry->root, __entry->ss_mask, __get_str(name))
);

/* Tracepoint cgroup_setup_root, an instance of the cgroup_root class. */
DEFINE_EVENT(cgroup_root, cgroup_setup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* Tracepoint cgroup_destroy_root, an instance of the cgroup_root class. */
DEFINE_EVENT(cgroup_root, cgroup_destroy_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* Tracepoint cgroup_remount, an instance of the cgroup_root class. */
DEFINE_EVENT(cgroup_root, cgroup_remount,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/*
 * Event class for events about a single cgroup.  Records the hierarchy_id
 * of the cgroup's root, the cgroup's id and level, and the path string
 * supplied by the caller.
 */
DECLARE_EVENT_CLASS(cgroup,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path, path);
        ),

        TP_printk("root=%d id=%llu level=%d path=%s",
                  __entry->root, __entry->id, __entry->level, __get_str(path))
);

/* Tracepoint cgroup_mkdir, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_mkdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_rmdir, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_rmdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_release, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_release,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_rename, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_rename,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_freeze, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_freeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_unfreeze, an instance of the cgroup class. */
DEFINE_EVENT(cgroup, cgroup_unfreeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/*
 * Event class for task migration events.  Records the destination
 * cgroup's root hierarchy_id, id, level and path, plus the task's pid
 * and comm.  The @threadgroup argument is accepted but not recorded.
 */
DECLARE_EVENT_CLASS(cgroup_migrate,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup),

        TP_STRUCT__entry(
                __field(        int,                dst_root                )
                __field(        int,                dst_level                )
                __field(        u64,                dst_id                        )
                __field(        int,                pid                        )
                __string(        dst_path,        path                        )
                __string(        comm,                task->comm                )
        ),

        TP_fast_assign(
                __entry->dst_root = dst_cgrp->root->hierarchy_id;
                __entry->dst_id = cgroup_id(dst_cgrp);
                __entry->dst_level = dst_cgrp->level;
                __assign_str(dst_path, path);
                __entry->pid = task->pid;
                __assign_str(comm, task->comm);
        ),

        TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
                  __entry->dst_root, __entry->dst_id, __entry->dst_level,
                  __get_str(dst_path), __entry->pid, __get_str(comm))
);

/* Tracepoint cgroup_attach_task, an instance of the cgroup_migrate class. */
DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/* Tracepoint cgroup_transfer_tasks, an instance of the cgroup_migrate class. */
DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/*
 * Event class for cgroup notifications that carry an integer value.
 * Records the same fields as the cgroup class (root hierarchy_id, id,
 * level, path) plus @val.
 */
DECLARE_EVENT_CLASS(cgroup_event,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
                __field(        int,                val                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path, path);
                __entry->val = val;
        ),

        TP_printk("root=%d id=%llu level=%d path=%s val=%d",
                  __entry->root, __entry->id, __entry->level, __get_str(path),
                  __entry->val)
);

/* Tracepoint cgroup_notify_populated, an instance of the cgroup_event class. */
DEFINE_EVENT(cgroup_event, cgroup_notify_populated,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/* Tracepoint cgroup_notify_frozen, an instance of the cgroup_event class. */
DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

#endif /* _TRACE_CGROUP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













































    1 


    1 


    1 




    1 


    1 


    1 



    1 
































































































    1 







    1 


    1 


    1 




    1 







    1 




    1 
    1 



    1 


    1 






















































































































































































































































































































































































































































































































































































    3 











    3 



    2 




    2 



    3 
    3 

    3 





    3 



    3 




    3 

    3 

    2 



    3 


    3 
    3 
    3 
    3 
    2 
    2 

    3 
































    3 



































































    3 





































































































    3 




    3 


    3 


















    3 



    3 



    3 
    1 

    1 

    2 

    2 








    3 





    3 












    3 
    3 









    3 


    3 

    2 

    3 
    1 
    1 





    3 

    3 
    3 

    3 

    3 

    3 

    3 

    3 

    3 






    3 
    3 































































    3 



    3 

    3 



    3 

    3 

    1 



    3 


    3 

















    3 












































































    1 




    1 


    1 
    1 


    1 
    1 









    1 




    1 





    1 








































    1 

    1 






























    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>

#include "internal.h"

/*
 * do_truncate - change the size of @dentry's inode via notify_change()
 * @dentry: dentry whose inode is resized
 * @length: new size; negative values are rejected with -EINVAL
 * @time_attrs: extra ATTR_* bits OR-ed into the attribute mask
 * @filp: optional open file; when non-NULL it is passed on with ATTR_FILE
 *
 * Returns -EINVAL for a negative @length, a negative value reported by
 * dentry_needs_remove_privs(), or otherwise the result of notify_change().
 */
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
        struct file *filp)
{
        struct iattr attr;
        struct inode *inode;
        int killpriv;
        int err;

        /* i_size is a signed type, so a negative length cannot be stored. */
        if (length < 0)
                return -EINVAL;

        attr.ia_valid = ATTR_SIZE | time_attrs;
        attr.ia_size = length;
        if (filp != NULL) {
                attr.ia_valid |= ATTR_FILE;
                attr.ia_file = filp;
        }

        /* Truncation also strips suid, sgid and file capabilities. */
        killpriv = dentry_needs_remove_privs(dentry);
        if (killpriv < 0)
                return killpriv;
        if (killpriv > 0)
                attr.ia_valid |= killpriv | ATTR_FORCE;

        inode = dentry->d_inode;
        inode_lock(inode);
        /* Note any delegations or leases have already been broken: */
        err = notify_change(dentry, &attr, NULL);
        inode_unlock(inode);

        return err;
}

/*
 * vfs_truncate - truncate the regular file at @path to @length bytes
 * @path: path of the file to truncate
 * @length: new file size
 *
 * Directories yield -EISDIR and any other non-regular file -EINVAL.  An
 * append-only inode yields -EPERM.  Otherwise the first failing step's
 * error is returned, or the result of do_truncate().
 */
long vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode = path->dentry->d_inode;
        long err;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        err = mnt_want_write(path->mnt);
        if (err)
                return err;

        err = inode_permission(inode, MAY_WRITE);
        if (err)
                goto drop_mnt_write;

        if (IS_APPEND(inode)) {
                err = -EPERM;
                goto drop_mnt_write;
        }

        err = get_write_access(inode);
        if (err)
                goto drop_mnt_write;

        /*
         * Make sure that there are no leases.  get_write_access() protects
         * against the truncate racing with a lease-granting setlease().
         */
        err = break_lease(inode, O_WRONLY);
        if (err)
                goto drop_write_access;

        err = locks_verify_truncate(inode, NULL, length);
        if (err)
                goto drop_write_access;

        err = security_path_truncate(path);
        if (err)
                goto drop_write_access;

        err = do_truncate(path->dentry, length, 0, NULL);

drop_write_access:
        put_write_access(inode);
drop_mnt_write:
        mnt_drop_write(path->mnt);
        return err;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * do_sys_truncate - look up a user-supplied path and truncate it
 * @pathname: user-space pointer to the path
 * @length: new size; negative values are rejected with -EINVAL
 *
 * The lookup follows symlinks and is relative to AT_FDCWD.  When
 * retry_estale() asks for it, the whole lookup + truncate sequence is
 * repeated with LOOKUP_REVAL added to the lookup flags.
 */
long do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        /* loff_t is signed; a negative size is meaningless. */
        if (length < 0)
                return -EINVAL;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error == 0) {
                        error = vfs_truncate(&path, length);
                        path_put(&path);
                }
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/* truncate syscall entry: forwards @path and @length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        return do_sys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/*
 * Compat truncate entry (only built with CONFIG_COMPAT): takes the length
 * as a compat_off_t and forwards to do_sys_truncate().
 */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        return do_sys_truncate(path, length);
}
#endif

/*
 * do_sys_ftruncate - truncate the file behind an open descriptor
 * @fd: file descriptor
 * @length: new size; negative values are rejected with -EINVAL
 * @small: non-zero if the caller is limited to MAX_NON_LFS; ignored when
 *         the file was opened with O_LARGEFILE
 *
 * Returns -EBADF if @fd does not resolve to a file, -EINVAL if the file is
 * not a regular file, was not opened for writing, or @length exceeds the
 * non-LFS limit, -EPERM for append-only inodes, and otherwise the first
 * error from the lock/security checks or the result of do_truncate().
 */
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        struct fd f;
        struct file *file;
        struct dentry *dentry;
        struct inode *inode;
        int error;

        if (length < 0)
                return -EINVAL;

        f = fdget(fd);
        file = f.file;
        if (!file)
                return -EBADF;

        /* explicitly opened as large or we are on 64-bit box */
        if (file->f_flags & O_LARGEFILE)
                small = 0;

        dentry = file->f_path.dentry;
        inode = dentry->d_inode;

        if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) {
                error = -EINVAL;
        } else if (small && length > MAX_NON_LFS) {
                /* Cannot ftruncate over 2^31 bytes without large file support */
                error = -EINVAL;
        } else if (IS_APPEND(file_inode(file))) {
                /* Check IS_APPEND on real upper inode */
                error = -EPERM;
        } else {
                sb_start_write(inode->i_sb);
                error = locks_verify_truncate(inode, file, length);
                if (!error)
                        error = security_path_truncate(&file->f_path);
                if (!error)
                        error = do_truncate(dentry, length,
                                            ATTR_MTIME | ATTR_CTIME, file);
                sb_end_write(inode->i_sb);
        }

        fdput(f);
        return error;
}

/*
 * ftruncate syscall entry: calls do_sys_ftruncate() with small=1, i.e. the
 * MAX_NON_LFS limit applies unless the file was opened with O_LARGEFILE.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
        return do_sys_ftruncate(fd, length, 1);
}

#ifdef CONFIG_COMPAT
/*
 * Compat ftruncate entry (only built with CONFIG_COMPAT): takes the length
 * as a compat_off_t and calls do_sys_ftruncate() with small=1.
 */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
        return do_sys_ftruncate(fd, length, 1);
}
#endif

/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
/* truncate64 syscall entry: passes a full loff_t length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
        return do_sys_truncate(path, length);
}

/*
 * ftruncate64 syscall entry: calls do_sys_ftruncate() with small=0, so the
 * MAX_NON_LFS limit is not applied.
 */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
        return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */


/**
 * vfs_fallocate - validate a fallocate request and pass it to the filesystem
 * @file: open file to operate on; must have FMODE_WRITE set
 * @mode: FALLOC_FL_* flags selecting the operation
 * @offset: start of the byte range; must be >= 0
 * @len: length of the byte range; must be > 0
 *
 * Performs the flag-combination, permission, file-type and range checks
 * below and, if they all pass, calls the file's ->fallocate() method inside
 * a file_start_write()/file_end_write() section.  On success a modify
 * notification is generated via fsnotify_modify().
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL      bad @offset/@len or an exclusive flag combined with others
 *   -EOPNOTSUPP  unsupported flag/flag combination or no ->fallocate()
 *   -EBADF       @file not opened for writing
 *   -EPERM       append-only (non-plain allocation) or immutable inode
 *   -ETXTBSY     inode is an active swapfile
 *   -ESPIPE      FIFO
 *   -EISDIR      directory
 *   -ENODEV      neither a regular file nor a block device
 *   -EFBIG       range ends beyond the superblock's s_maxbytes
 * or whatever security_file_permission() / ->fallocate() report.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        long ret;

        if (offset < 0 || len <= 0)
                return -EINVAL;

        /* Return error if mode is not supported */
        if (mode & ~FALLOC_FL_SUPPORTED_MASK)
                return -EOPNOTSUPP;

        /* Punch hole and zero range are mutually exclusive */
        if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
            (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;

        /* Punch hole must have keep size set */
        if ((mode & FALLOC_FL_PUNCH_HOLE) &&
            !(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        /* Collapse range should only be used exclusively. */
        if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
            (mode & ~FALLOC_FL_COLLAPSE_RANGE))
                return -EINVAL;

        /* Insert range should only be used exclusively. */
        if ((mode & FALLOC_FL_INSERT_RANGE) &&
            (mode & ~FALLOC_FL_INSERT_RANGE))
                return -EINVAL;

        /* Unshare range should only be used with allocate mode. */
        if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
            (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
                return -EINVAL;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;

        /*
         * We can only allow pure fallocate on append only files
         */
        if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
                return -EPERM;

        if (IS_IMMUTABLE(inode))
                return -EPERM;

        /*
         * We cannot allow any fallocate operation on an active swapfile
         */
        if (IS_SWAPFILE(inode))
                return -ETXTBSY;

        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
         */
        ret = security_file_permission(file, MAY_WRITE);
        if (ret)
                return ret;

        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;

        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                return -ENODEV;

        /*
         * Reject ranges whose end (offset + len) lies beyond s_maxbytes.
         * offset >= 0 and len > 0 are guaranteed by the first check, so the
         * comparison is done by subtraction: once offset <= s_maxbytes is
         * known, "s_maxbytes - offset" cannot overflow, whereas computing
         * "offset + len" directly could exceed the range of loff_t.
         */
        if (offset > inode->i_sb->s_maxbytes ||
            len > inode->i_sb->s_maxbytes - offset)
                return -EFBIG;

        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;

        file_start_write(file);
        ret = file->f_op->fallocate(file, mode, offset, len);

        /*
         * Create inotify and fanotify events.
         *
         * To keep the logic simple always create events if fallocate succeeds.
         * This implies that events are even created if the file size remains
         * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
         */
        if (ret == 0)
                fsnotify_modify(file);

        file_end_write(file);
        return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/*
 * ksys_fallocate - resolve @fd and run vfs_fallocate() on the file
 *
 * Returns -EBADF if @fd does not resolve to a file, otherwise the result
 * of vfs_fallocate().
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fallocate(f.file, mode, offset, len);
        fdput(f);

        return error;
}

/* fallocate syscall entry: forwards all arguments to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
        return ksys_fallocate(fd, mode, offset, len);
}

/*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 *
 * Returns the previous credentials as handed back by override_creds(), or
 * NULL if prepare_creds() failed.
 */
static const struct cred *access_override_creds(void)
{
        const struct cred *old_cred;
        struct cred *override_cred;

        override_cred = prepare_creds();
        if (!override_cred)
                return NULL;

        /* Filesystem checks should see the real ids, not the fs ids. */
        override_cred->fsuid = override_cred->uid;
        override_cred->fsgid = override_cred->gid;

        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                /* Clear the capabilities if we switch to a non-root user */
                kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
                if (!uid_eq(override_cred->uid, root_uid))
                        cap_clear(override_cred->cap_effective);
                else
                        override_cred->cap_effective =
                                override_cred->cap_permitted;
        }

        /*
         * The new set of credentials can *only* be used in
         * task-synchronous circumstances, and does not need
         * RCU freeing, unless somebody then takes a separate
         * reference to it.
         *
         * NOTE! This is _only_ true because this credential
         * is used purely for override_creds() that installs
         * it as the subjective cred. Other threads will be
         * accessing ->real_cred, not the subjective cred.
         *
         * If somebody _does_ make a copy of this (using the
         * 'get_current_cred()' function), that will clear the
         * non_rcu field, because now that other user may be
         * expecting RCU freeing. But normal thread-synchronous
         * cred accesses will keep things non-RCU.
         */
        override_cred->non_rcu = 1;

        old_cred = override_creds(override_cred);

        /* override_creds() gets its own ref */
        put_cred(override_cred);

        return old_cred;
}

/*
 * do_faccessat - common implementation of access(), faccessat(), faccessat2()
 * @dfd: directory file descriptor the lookup is relative to
 * @filename: user-space pointer to the path
 * @mode: access bits to test; anything outside S_IRWXO yields -EINVAL
 * @flags: AT_EACCESS, AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH; any other
 *         bit yields -EINVAL
 *
 * Unless AT_EACCESS is given, the check runs under the credentials
 * installed by access_override_creds(); they are reverted before returning.
 * Returns 0 if access is allowed, -ENOMEM if the override credentials
 * could not be prepared, or the error from lookup / permission checking.
 */
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
        struct path path;
        struct inode *inode;
        int res;
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        const struct cred *old_cred = NULL;

        if (mode & ~S_IRWXO)        /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;

        if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (flags & AT_SYMLINK_NOFOLLOW)
                lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        /* old_cred stays NULL for AT_EACCESS, which skips the revert below. */
        if (!(flags & AT_EACCESS)) {
                old_cred = access_override_creds();
                if (!old_cred)
                        return -ENOMEM;
        }

retry:
        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;

        inode = d_backing_inode(path.dentry);

        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                /*
                 * MAY_EXEC on regular files is denied if the fs is mounted
                 * with the "noexec" flag.
                 */
                res = -EACCES;
                if (path_noexec(&path))
                        goto out_path_release;
        }

        res = inode_permission(inode, mode | MAY_ACCESS);
        /* SuS v2 requires we report a read only fs too */
        if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
                goto out_path_release;
        /*
         * This is a rare case where using __mnt_is_readonly()
         * is OK without a mnt_want/drop_write() pair.  Since
         * no actual write to the fs is performed here, we do
         * not need to telegraph that to anyone.
         *
         * By doing this, we accept that this access is
         * inherently racy and know that the fs may change
         * state before we even see this result.
         */
        if (__mnt_is_readonly(path.mnt))
                res = -EROFS;

out_path_release:
        path_put(&path);
        /* Repeat the lookup with LOOKUP_REVAL when retry_estale() asks for it. */
        if (retry_estale(res, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        if (old_cred)
                revert_creds(old_cred);

        return res;
}

/* faccessat syscall entry: do_faccessat() with no flags. */
SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
        return do_faccessat(dfd, filename, mode, 0);
}

/* faccessat2 syscall entry: like faccessat but passes @flags through. */
SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
                int, flags)
{
        return do_faccessat(dfd, filename, mode, flags);
}

/* access syscall entry: do_faccessat() relative to AT_FDCWD with no flags. */
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
        return do_faccessat(AT_FDCWD, filename, mode, 0);
}

/*
 * chdir syscall entry: look up @filename as a directory (following
 * symlinks), check MAY_EXEC | MAY_CHDIR on it and, if permitted, make it
 * the pwd of current->fs.  The lookup is repeated with LOOKUP_REVAL when
 * retry_estale() asks for it.
 */
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
                if (error)
                        break;

                error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &path);

                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * fchdir syscall entry: make the directory behind @fd the pwd of
 * current->fs.  Returns -EBADF if @fd does not resolve to a file,
 * -ENOTDIR if d_can_lookup() rejects its dentry, or the result of the
 * MAY_EXEC | MAY_CHDIR permission check.
 */
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
        struct fd f = fdget_raw(fd);
        int error;

        if (!f.file)
                return -EBADF;

        if (!d_can_lookup(f.file->f_path.dentry)) {
                error = -ENOTDIR;
        } else {
                error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &f.file->f_path);
        }

        fdput(f);
        return error;
}

/*
 * chroot syscall entry: look up @filename as a directory and make it the
 * root of current->fs.  The caller needs MAY_EXEC | MAY_CHDIR on the
 * directory and CAP_SYS_CHROOT in its user namespace (else -EPERM), and
 * security_path_chroot() must agree.  The lookup is repeated with
 * LOOKUP_REVAL when retry_estale() asks for it.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
                if (error)
                        break;

                error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
                if (!error && !ns_capable(current_user_ns(), CAP_SYS_CHROOT))
                        error = -EPERM;
                if (!error)
                        error = security_path_chroot(&path);
                if (!error)
                        set_fs_root(current->fs, &path);

                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * chmod_common - change the permission bits of the inode at @path
 * @path: path of the object to modify
 * @mode: new mode; only the S_IALLUGO bits are taken from it
 *
 * Runs with write access to the mount held.  If notify_change() reports a
 * delegated inode, waits in break_deleg_wait() and, when that succeeds,
 * repeats the attempt.  Returns 0 or a negative errno.
 */
int chmod_common(const struct path *path, umode_t mode)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct iattr attrs;
        int error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        for (;;) {
                inode_lock(inode);
                error = security_path_chmod(path, mode);
                if (!error) {
                        /* Keep every non-permission bit of the old mode. */
                        attrs.ia_mode = (mode & S_IALLUGO) |
                                        (inode->i_mode & ~S_IALLUGO);
                        attrs.ia_valid = ATTR_MODE | ATTR_CTIME;
                        error = notify_change(path->dentry, &attrs,
                                              &delegated_inode);
                }
                inode_unlock(inode);

                if (!delegated_inode)
                        break;
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        mnt_drop_write(path->mnt);
        return error;
}

/*
 * vfs_fchmod - chmod an already-open file
 * @file: open file whose path is modified
 * @mode: new mode, handed to chmod_common()
 *
 * Calls audit_file() on @file and then applies chmod_common() to
 * @file->f_path; returns chmod_common()'s result.
 */
int vfs_fchmod(struct file *file, umode_t mode)
{
        audit_file(file);
        return chmod_common(&file->f_path, mode);
}

/*
 * fchmod syscall entry: resolve @fd and apply vfs_fchmod().  Returns
 * -EBADF if @fd does not resolve to a file.
 */
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
        struct fd f = fdget(fd);
        int err;

        if (!f.file)
                return -EBADF;

        err = vfs_fchmod(f.file, mode);
        fdput(f);

        return err;
}

/*
 * do_fchmodat - look up @filename relative to @dfd and chmod it
 *
 * Symlinks are followed.  A failed lookup is returned as-is; after a
 * successful lookup the chmod is repeated with LOOKUP_REVAL when
 * retry_estale() asks for it.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        break;

                error = chmod_common(&path, mode);
                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/* fchmodat syscall entry: forwards to do_fchmodat(). */
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
                umode_t, mode)
{
        return do_fchmodat(dfd, filename, mode);
}

/* chmod syscall entry: do_fchmodat() relative to AT_FDCWD. */
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
        return do_fchmodat(AT_FDCWD, filename, mode);
}

/*
 * chown_common - change owner and/or group of the inode at @path
 * @path: path of the object to modify
 * @user: new owner id in the caller's user namespace, or (uid_t)-1 to
 *        leave the owner alone
 * @group: new group id in the caller's user namespace, or (gid_t)-1 to
 *         leave the group alone
 *
 * Returns -EINVAL if a requested id does not map to a valid kuid/kgid.
 * If notify_change() reports a delegated inode, waits in
 * break_deleg_wait() and, when that succeeds, repeats the attempt.
 * This function does not take mount write access itself.
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct iattr attrs;
        kuid_t uid = make_kuid(current_user_ns(), user);
        kgid_t gid = make_kgid(current_user_ns(), group);
        int error;

        do {
                attrs.ia_valid = ATTR_CTIME;

                if (user != (uid_t) -1) {
                        if (!uid_valid(uid))
                                return -EINVAL;
                        attrs.ia_uid = uid;
                        attrs.ia_valid |= ATTR_UID;
                }
                if (group != (gid_t) -1) {
                        if (!gid_valid(gid))
                                return -EINVAL;
                        attrs.ia_gid = gid;
                        attrs.ia_valid |= ATTR_GID;
                }

                inode_lock(inode);
                /* Privilege-dropping bits are requested for non-directories only. */
                if (!S_ISDIR(inode->i_mode))
                        attrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
                                          setattr_should_drop_sgid(inode);
                error = security_path_chown(path, uid, gid);
                if (!error)
                        error = notify_change(path->dentry, &attrs,
                                              &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;
                error = break_deleg_wait(&delegated_inode);
        } while (!error);

        return error;
}

/*
 * do_fchownat - look up @filename relative to @dfd and chown it
 * @flag: AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH; any other bit yields
 *        -EINVAL
 *
 * Takes mount write access around chown_common().  The lookup is repeated
 * with LOOKUP_REVAL when retry_estale() asks for it.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag)
{
        struct path path;
        int lookup_flags;
        int error;

        if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        lookup_flags = 0;
        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags = LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        for (;;) {
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        break;

                error = mnt_want_write(path.mnt);
                if (!error) {
                        error = chown_common(&path, user, group);
                        mnt_drop_write(path.mnt);
                }

                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/* fchownat syscall entry: forwards all arguments to do_fchownat(). */
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
                gid_t, group, int, flag)
{
        return do_fchownat(dfd, filename, user, group, flag);
}

/* chown syscall entry: do_fchownat() relative to AT_FDCWD with no flags. */
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
        return do_fchownat(AT_FDCWD, filename, user, group, 0);
}

/*
 * lchown syscall entry: like chown but passes AT_SYMLINK_NOFOLLOW, so
 * do_fchownat() does not set LOOKUP_FOLLOW for the lookup.
 */
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
        return do_fchownat(AT_FDCWD, filename, user, group,
                           AT_SYMLINK_NOFOLLOW);
}

/*
 * vfs_fchown - chown an already-open file
 *
 * Takes write access via mnt_want_write_file(), calls audit_file() and
 * applies chown_common() to @file->f_path.  Returns the error from
 * mnt_want_write_file() or the result of chown_common().
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
        int error = mnt_want_write_file(file);

        if (!error) {
                audit_file(file);
                error = chown_common(&file->f_path, user, group);
                mnt_drop_write_file(file);
        }

        return error;
}

/*
 * ksys_fchown - resolve @fd and run vfs_fchown() on the file
 *
 * Returns -EBADF if @fd does not resolve to a file, otherwise the result
 * of vfs_fchown().
 */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fchown(f.file, user, group);
        fdput(f);

        return error;
}

/* fchown syscall entry: forwards to ksys_fchown(). */
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
        return ksys_fchown(fd, user, group);
}

/*
 * do_dentry_open - common back end that turns a prepared struct file into
 * an opened one
 * @f: file with f_path, f_flags and f_mode already filled in by the caller
 * @inode: inode the file will refer to
 * @open: open callback to use, or NULL to fall back to f->f_op->open
 *
 * Takes a reference on f->f_path, fills in the inode/mapping/error-sample
 * fields, acquires write access where needed, installs the file
 * operations, runs the security and lease checks and finally the open
 * callback.  On the failure paths that go through the cleanup labels, the
 * path reference is dropped again and f_path/f_inode are cleared.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_dentry_open(struct file *f,
                          struct inode *inode,
                          int (*open)(struct inode *, struct file *))
{
        static const struct file_operations empty_fops = {};
        int error;

        path_get(&f->f_path);
        f->f_inode = inode;
        f->f_mapping = inode->i_mapping;
        f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
        f->f_sb_err = file_sample_sb_err(f);

        /* O_PATH files get empty file operations and skip everything below. */
        if (unlikely(f->f_flags & O_PATH)) {
                f->f_mode = FMODE_PATH | FMODE_OPENED;
                f->f_op = &empty_fops;
                return 0;
        }

        /*
         * Writers of non-special files take inode write access and a mount
         * write reference; FMODE_WRITER records that both must be dropped
         * again in cleanup_all.
         */
        if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
                error = get_write_access(inode);
                if (unlikely(error))
                        goto cleanup_file;
                error = __mnt_want_write(f->f_path.mnt);
                if (unlikely(error)) {
                        put_write_access(inode);
                        goto cleanup_file;
                }
                f->f_mode |= FMODE_WRITER;
        }

        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                f->f_mode |= FMODE_ATOMIC_POS;

        f->f_op = fops_get(inode->i_fop);
        if (WARN_ON(!f->f_op)) {
                error = -ENODEV;
                goto cleanup_all;
        }

        error = security_file_open(f);
        if (error)
                goto cleanup_all;

        error = break_lease(locks_inode(f), f->f_flags);
        if (error)
                goto cleanup_all;

        /* normally all 3 are set; ->open() can clear them if needed */
        f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        if (!open)
                open = f->f_op->open;
        if (open) {
                error = open(inode, f);
                if (error)
                        goto cleanup_all;
        }
        f->f_mode |= FMODE_OPENED;
        /* Read-only opens are counted on the inode. */
        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(inode);
        /* FMODE_CAN_* is only set when a matching method exists. */
        if ((f->f_mode & FMODE_READ) &&
             likely(f->f_op->read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
             likely(f->f_op->write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;

        f->f_write_hint = WRITE_LIFE_NOT_SET;
        /* These flags only matter at open time; drop them from f_flags. */
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

        /* NB: we're sure to have correct a_ops only after f_op->open */
        if (f->f_flags & O_DIRECT) {
                /*
                 * This returns without going through the cleanup labels;
                 * FMODE_OPENED is already set at this point.
                 * NOTE(review): presumably the caller's fput() performs the
                 * teardown for an opened file - confirm against callers.
                 */
                if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
                        return -EINVAL;
        }

        /*
         * XXX: Huge page cache doesn't support writing yet. Drop all page
         * cache for this file before processing writes.
         */
        if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
                truncate_pagecache(inode, 0);

        return 0;

cleanup_all:
        /* A positive value is not a valid errno; warn once and map it. */
        if (WARN_ON_ONCE(error > 0))
                error = -EINVAL;
        fops_put(f->f_op);
        if (f->f_mode & FMODE_WRITER) {
                put_write_access(inode);
                __mnt_drop_write(f->f_path.mnt);
        }
cleanup_file:
        path_put(&f->f_path);
        f->f_path.mnt = NULL;
        f->f_path.dentry = NULL;
        f->f_inode = NULL;
        return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * This can be used to finish opening a file passed to i_op->atomic_open().
 *
 * If the open callback is set to NULL, then the standard f_op->open()
 * filesystem callback is substituted.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
                int (*open)(struct inode *, struct file *))
{
        BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */

        /* Only the dentry is set here; f_path.mnt is left as the caller had it. */
        file->f_path.dentry = dentry;
        return do_dentry_open(file, d_backing_inode(dentry), open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup())
 *
 * Records the outcome of a lookup performed inside ->atomic_open() by
 * storing @dentry in @file->f_path.dentry.
 *
 * NB: in contrast to finish_open(), the dentry reference is consumed here,
 * so the caller must not dput() it.
 *
 * Returns 0, or the error encoded in @dentry; ->atomic_open() must return
 * that same value after calling this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
        if (!IS_ERR(dentry)) {
                file->f_path.dentry = dentry;
                return 0;
        }

        return PTR_ERR(dentry);
}
EXPORT_SYMBOL(finish_no_open);

/*
 * file_path - d_path() wrapper for an open file
 * @filp: file whose f_path is passed to d_path()
 * @buf: buffer handed to d_path()
 * @buflen: size of @buf
 *
 * Returns whatever d_path() returns for @filp->f_path.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
        return d_path(&filp->f_path, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flags initialized
 *
 * Copies @path into @file->f_path and opens the backing inode of
 * @path->dentry through do_dentry_open() with the default open callback.
 *
 * Returns the result of do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
        file->f_path = *path;
        return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}

struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *cred)
{
        int error;
        struct file *f;

        validate_creds(cred);

        /* We must always pass in a valid mount pointer. */
        BUG_ON(!path->mnt);

        f = alloc_empty_file(flags, cred);
        if (!IS_ERR(f)) {
                error = vfs_open(path, f);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(dentry_open);

/**
 * dentry_create - Create and open a file
 * @path: path to create
 * @flags: O_ flags
 * @mode: mode bits for new file
 * @cred: credentials to use
 *
 * The caller must hold the lock of the parent directory and must have
 * placed a prepared negative dentry for the new file in @path->dentry.
 *
 * @path->mnt must be set by the caller to the vfsmount of the filesystem
 * in which the file is created; the parent directory and the negative
 * dentry have to live on that same filesystem instance.
 *
 * Returns a "struct file *" on success and an ERR_PTR otherwise.
 */
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred)
{
        struct inode *dir;
        struct file *f;
        int error;

        validate_creds(cred);

        f = alloc_empty_file(flags, cred);
        if (IS_ERR(f))
                return f;

        dir = d_inode(path->dentry->d_parent);
        error = vfs_create(dir, path->dentry, mode, true);
        if (error == 0)
                error = vfs_open(path, f);

        if (likely(!error))
                return f;

        /* Creation or open failed: release the file allocated above. */
        fput(f);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(dentry_create);

struct file *open_with_fake_path(const struct path *path, int flags,
                                struct inode *inode, const struct cred *cred)
{
        struct file *f = alloc_empty_file_noaccount(flags, cred);
        if (!IS_ERR(f)) {
                int error;

                f->f_path = *path;
                error = do_dentry_open(f, inode, NULL);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(open_with_fake_path);

/*
 * WILL_CREATE() is non-zero when @flags contains O_CREAT or __O_TMPFILE.
 * The argument is parenthesized so that an expression such as
 * WILL_CREATE(a | b) tests the whole expression; without the parentheses
 * '&' would bind tighter than '|' and only 'b' would be masked.
 *
 * O_PATH_FLAGS is the set of flags that are kept when O_PATH is given.
 */
#define WILL_CREATE(flags)        ((flags) & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS                (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from classic open(2)-style @flags and @mode.
 *
 * Flag bits outside VALID_OPEN_FLAGS are dropped.  If O_PATH is set, only
 * the bits in O_PATH_FLAGS are kept.  The mode (masked with S_IALLUGO) is
 * recorded only when the resulting flags may create a file; otherwise it
 * is left at zero, as is every other member of the result.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
        struct open_how how = {
                .flags = flags & VALID_OPEN_FLAGS,
        };

        /* O_PATH beats everything else. */
        if (how.flags & O_PATH)
                how.flags &= O_PATH_FLAGS;

        /* Modes should only be set for create-like flags. */
        if (WILL_CREATE(how.flags))
                how.mode = mode & S_IALLUGO;

        return how;
}

/*
 * build_open_flags - validate an open_how and translate it into open_flags
 * @how: requested flags, mode and resolve bits
 * @op:  output; receives mode, open_flag, acc_mode, intent and lookup_flags
 *
 * Returns 0 on success.  Returns -EINVAL when @how carries unknown flag or
 * resolve bits, both RESOLVE_BENEATH and RESOLVE_IN_ROOT, a mode that does
 * not fit the requested flags, or an invalid __O_TMPFILE / O_PATH
 * combination.  Returns -EAGAIN when RESOLVE_CACHED is combined with
 * O_TRUNC, O_CREAT or __O_TMPFILE.  On failure @op may have been partially
 * written.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
        u64 flags = how->flags;
        u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
        int lookup_flags = 0;
        /* Derived from the caller's flags before anything is stripped. */
        int acc_mode = ACC_MODE(flags);

        BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
                         "struct open_flags doesn't yet handle flags > 32 bits");

        /*
         * Strip flags that either shouldn't be set by userspace like
         * FMODE_NONOTIFY or that aren't relevant in determining struct
         * open_flags like O_CLOEXEC.
         */
        flags &= ~strip;

        /*
         * Older syscalls implicitly clear all of the invalid flags or argument
         * values before calling build_open_flags(), but openat2(2) checks all
         * of its arguments.
         */
        if (flags & ~VALID_OPEN_FLAGS)
                return -EINVAL;
        if (how->resolve & ~VALID_RESOLVE_FLAGS)
                return -EINVAL;

        /* Scoping flags are mutually exclusive. */
        if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
                return -EINVAL;

        /* Deal with the mode. */
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                op->mode = how->mode | S_IFREG;
        } else {
                /* A non-zero mode without a create-like flag is rejected. */
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }

        /*
         * In order to ensure programs get explicit errors when trying to use
         * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
         * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
         * have to require userspace to explicitly set it.
         */
        if (flags & __O_TMPFILE) {
                if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
                        return -EINVAL;
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
        }
        if (flags & O_PATH) {
                /* O_PATH only permits certain other flags to be set. */
                if (flags & ~O_PATH_FLAGS)
                        return -EINVAL;
                acc_mode = 0;
        }

        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if they need any syncing at all we enforce it's
         * always set instead of having to deal with possibly weird behaviour
         * for malicious applications setting only __O_SYNC.
         */
        if (flags & __O_SYNC)
                flags |= O_DSYNC;

        /* Stored here; later changes to the local 'flags' do not reach it. */
        op->open_flag = flags;

        /* O_TRUNC implies we need access checks for write permissions */
        if (flags & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        op->acc_mode = acc_mode;

        /* O_PATH opens carry no LOOKUP_OPEN intent. */
        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

        if (flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
                if (flags & O_EXCL) {
                        op->intent |= LOOKUP_EXCL;
                        /*
                         * Only the local copy changes (op->open_flag was
                         * stored above); this keeps LOOKUP_FOLLOW from
                         * being set below.
                         */
                        flags |= O_NOFOLLOW;
                }
        }

        if (flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
        if (!(flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        /* Map each RESOLVE_* bit onto its LOOKUP_* counterpart. */
        if (how->resolve & RESOLVE_NO_XDEV)
                lookup_flags |= LOOKUP_NO_XDEV;
        if (how->resolve & RESOLVE_NO_MAGICLINKS)
                lookup_flags |= LOOKUP_NO_MAGICLINKS;
        if (how->resolve & RESOLVE_NO_SYMLINKS)
                lookup_flags |= LOOKUP_NO_SYMLINKS;
        if (how->resolve & RESOLVE_BENEATH)
                lookup_flags |= LOOKUP_BENEATH;
        if (how->resolve & RESOLVE_IN_ROOT)
                lookup_flags |= LOOKUP_IN_ROOT;
        if (how->resolve & RESOLVE_CACHED) {
                /* Don't bother even trying for create/truncate/tmpfile open */
                if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                lookup_flags |= LOOKUP_CACHED;
        }

        op->lookup_flags = lookup_flags;
        return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:        struct filename containing path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  In general you should not do this, so please move
 * along, nothing to see here..
 *
 * The path is resolved relative to AT_FDCWD.  Returns the opened file, or
 * an ERR_PTR if the flags are rejected or the open fails.
 */
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (!err)
                return do_filp_open(AT_FDCWD, name, &op);

        return ERR_PTR(err);
}

/**
 * filp_open - open file and return file pointer
 *
 * @filename:        path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  In general you should not do this, so please move
 * along, nothing to see here..
 *
 * @filename is wrapped with getname_kernel() and handed to
 * file_open_name(); the name is released again before returning.
 */
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
        struct filename *name;
        struct file *filp;

        name = getname_kernel(filename);
        if (IS_ERR(name))
                return ERR_CAST(name);

        filp = file_open_name(name, flags, mode);
        putname(name);
        return filp;
}
EXPORT_SYMBOL(filp_open);

/*
 * Open @filename via do_file_open_root(), passing @dentry and @mnt through
 * as the root to open against, using open(2)-style @flags and @mode.
 *
 * Returns the opened file, or an ERR_PTR if the flags are rejected or the
 * open fails.
 */
struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                            const char *filename, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (!err)
                return do_file_open_root(dentry, mnt, filename, &op);

        return ERR_PTR(err);
}
EXPORT_SYMBOL(file_open_root);

/*
 * Common implementation behind the open-family syscalls.
 *
 * Validates @how, copies the user-supplied @filename, reserves a file
 * descriptor and opens the file relative to @dfd.  On success the file is
 * installed in the descriptor and the descriptor is returned; on failure
 * a negative errno is returned and the reserved descriptor, if any, is
 * given back.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
                           struct open_how *how)
{
        struct open_flags op;
        struct filename *name;
        struct file *filp;
        int err, fd;

        err = build_open_flags(how, &op);
        if (err)
                return err;

        name = getname(filename);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd = get_unused_fd_flags(how->flags);
        if (fd < 0)
                goto out_putname;

        filp = do_filp_open(dfd, name, &op);
        if (IS_ERR(filp)) {
                /* Open failed: hand the descriptor back and report why. */
                put_unused_fd(fd);
                fd = PTR_ERR(filp);
                goto out_putname;
        }

        fsnotify_open(filp);
        fd_install(fd, filp);

out_putname:
        putname(name);
        return fd;
}

/*
 * open(2)-style front end: converts @flags and @mode into a struct
 * open_how with build_open_how() and defers to do_sys_openat2().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_how how;

        how = build_open_how(flags, mode);
        return do_sys_openat2(dfd, filename, &how);
}


/*
 * open(2): open @filename relative to the current working directory.
 * O_LARGEFILE is added when force_o_largefile() says so.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, filename, flags | lfs, mode);
}

/*
 * openat(2): open @filename relative to @dfd.
 * O_LARGEFILE is added when force_o_largefile() says so.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(dfd, filename, flags | lfs, mode);
}

/*
 * openat2(2): open @filename relative to @dfd as described by the
 * user-supplied struct open_how at @how, which is @usize bytes long.
 *
 * Returns -EINVAL if @usize is smaller than OPEN_HOW_SIZE_VER0, -E2BIG if
 * it exceeds PAGE_SIZE, the error from copy_struct_from_user() if the copy
 * fails, and otherwise whatever do_sys_openat2() returns.
 */
SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
                struct open_how __user *, how, size_t, usize)
{
        struct open_how khow;
        int ret;

        BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);

        if (unlikely(usize < OPEN_HOW_SIZE_VER0))
                return -EINVAL;
        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;

        ret = copy_struct_from_user(&khow, sizeof(khow), how, usize);
        if (ret)
                return ret;

        /* O_LARGEFILE is only allowed for non-O_PATH. */
        if (!(khow.flags & O_PATH)) {
                if (force_o_largefile())
                        khow.flags |= O_LARGEFILE;
        }

        return do_sys_openat2(dfd, filename, &khow);
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for open(2).
 *
 * Exactly like sys_open(), except that it doesn't set the
 * O_LARGEFILE flag: @flags is passed to do_sys_open() as supplied, and
 * the path is resolved relative to AT_FDCWD.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * Compat entry point for openat(2).
 *
 * Exactly like sys_openat(), except that it doesn't set the
 * O_LARGEFILE flag: @flags is passed to do_sys_open() as supplied, and
 * the path is resolved relative to @dfd.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
        return do_sys_open(dfd, filename, flags, mode);
}
#endif

#ifndef __alpha__

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 *
 * creat(2): same as opening @pathname relative to AT_FDCWD with
 * O_CREAT | O_WRONLY | O_TRUNC, plus O_LARGEFILE when force_o_largefile()
 * says so.
 */
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, pathname,
                           O_CREAT | O_WRONLY | O_TRUNC | lfs, mode);
}
#endif

/*
 * Close @filp on behalf of owner @id.  "id" is the POSIX thread ID; the
 * files pointer is used for this.
 *
 * The file's ->flush() method is called when one is provided and its
 * result becomes the return value (0 when there is none).  Unless the
 * file has FMODE_PATH set, dnotify_flush() and locks_remove_posix() are
 * called for @id.  The reference is then dropped with fput().
 *
 * A file whose count is already zero is only reported through printk and
 * 0 is returned without touching it further.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        int flush_err;

        if (file_count(filp) == 0) {
                printk(KERN_ERR "VFS: Close: file count is 0\n");
                return 0;
        }

        flush_err = filp->f_op->flush ? filp->f_op->flush(filp, id) : 0;

        if (likely((filp->f_mode & FMODE_PATH) == 0)) {
                dnotify_flush(filp, id);
                locks_remove_posix(filp, id);
        }

        fput(filp);
        return flush_err;
}

EXPORT_SYMBOL(filp_close);

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 *
 * close(2): returns the result of close_fd(), except that every restart
 * code is turned into -EINTR.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        int retval = close_fd(fd);

        switch (retval) {
        /* can't restart close syscall because file table entry was cleared */
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                return -EINTR;
        default:
                return retval;
        }
}

/**
 * close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  reserved for future extensions
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors to close a given file descriptor are ignored.
 *
 * The work is delegated to __close_range(): all three arguments are
 * forwarded unchanged and its return value is returned as-is.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        return __close_range(fd, max_fd, flags);
}

/*
 * This routine simulates a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 *
 * Requires CAP_SYS_TTY_CONFIG; returns -EPERM without it, 0 otherwise.
 */
SYSCALL_DEFINE0(vhangup)
{
        if (!capable(CAP_SYS_TTY_CONFIG))
                return -EPERM;

        tty_vhangup_self();
        return 0;
}

/*
 * Called when an inode is about to be opened.
 * We use this to disallow opening large files on 32bit systems if
 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
 * on this flag in sys_open.
 *
 * Returns -EOVERFLOW when O_LARGEFILE is not set in @filp->f_flags and
 * the inode's size exceeds MAX_NON_LFS; returns 0 otherwise.
 */
int generic_file_open(struct inode *inode, struct file *filp)
{
        /* With O_LARGEFILE any size is acceptable. */
        if (filp->f_flags & O_LARGEFILE)
                return 0;

        if (i_size_read(inode) > MAX_NON_LFS)
                return -EOVERFLOW;

        return 0;
}

EXPORT_SYMBOL(generic_file_open);

/*
 * This is used by subsystems that don't want seekable
 * file descriptors. The function is not supposed to ever fail, the only
 * reason it returns an 'int' and not 'void' is so that it can be plugged
 * directly into file_operations structure.
 *
 * Clears FMODE_LSEEK, FMODE_PREAD and FMODE_PWRITE in @filp->f_mode.
 * @inode is not used.  Always returns 0.
 */
int nonseekable_open(struct inode *inode, struct file *filp)
{
        filp->f_mode &= ~FMODE_LSEEK;
        filp->f_mode &= ~FMODE_PREAD;
        filp->f_mode &= ~FMODE_PWRITE;

        return 0;
}

EXPORT_SYMBOL(nonseekable_open);

/*
 * stream_open is used by subsystems that want stream-like file descriptors.
 * Such file descriptors are not seekable and don't have notion of position
 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
 * Contrary to file descriptors of other regular files, .read() and .write()
 * can run simultaneously.
 *
 * stream_open never fails and is marked to return int so that it could be
 * directly used as file_operations.open .
 *
 * Clears FMODE_LSEEK, FMODE_PREAD, FMODE_PWRITE and FMODE_ATOMIC_POS in
 * @filp->f_mode and sets FMODE_STREAM.  @inode is not used.
 */
int stream_open(struct inode *inode, struct file *filp)
{
        filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
        filp->f_mode &= ~FMODE_ATOMIC_POS;
        filp->f_mode |= FMODE_STREAM;

        return 0;
}

EXPORT_SYMBOL(stream_open);











































































































































































































































































































































































































































































































































































































































































































































































    1 





















































































































































































































































































































































































































































































































































































    1 


    1 













    1 
    1 















































































































































































































































































































































    1 


























































































































































































































    1 
    1 







    1 

    1 
    1 






    1 

























    1 
































































    1 








    1 






    1 










    1 


    1 



















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
/*
 *  linux/drivers/block/loop.c
 *
 *  Written by Theodore Ts'o, 3/29/93
 *
 * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
 * permitted under the GNU General Public License.
 *
 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
 *
 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
 *
 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
 *
 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
 *
 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
 *
 * Loadable modules and other fixes by AK, 1998
 *
 * Make real block number available to downstream transfer functions, enables
 * CBC (and relatives) mode encryption requiring unique IVs per data block.
 * Reed H. Petty, rhp@draper.net
 *
 * Maximum number of loop devices now dynamic via max_loop module parameter.
 * Russell Kroll <rkroll@exploits.org> 19990701
 *
 * Maximum number of loop devices when compiled-in now selectable by passing
 * max_loop=<1-255> to the kernel on boot.
 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
 *
 * Completely rewrite request handling to be make_request_fn style and
 * non blocking, pushing work to a helper thread. Lots of fixes from
 * Al Viro too.
 * Jens Axboe <axboe@suse.de>, Nov 2000
 *
 * Support up to 256 loop devices
 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
 *
 * Support for falling back on the write file operation when the address space
 * operations write_begin is not available on the backing filesystem.
 * Anton Altaparmakov, 16 Feb 2005
 *
 * Still To Fix:
 * - Advisory locking is ignored here.
 * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
 *
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/blk-cgroup.h>

#include "loop.h"

#include <linux/uaccess.h>

static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_ctl_mutex);

static int max_part;
static int part_shift;

/*
 * XOR transfer function.
 *
 * Maps both pages and copies @size bytes from the source buffer to the
 * destination buffer, XORing every byte with a byte of the encryption key.
 * For READ the raw page is the source and the loop page the destination;
 * for any other @cmd the direction is reversed.  The key index restarts
 * every 512 bytes and wraps at the key size.  @real_block is not used.
 *
 * Always returns 0.
 */
static int transfer_xor(struct loop_device *lo, int cmd,
                        struct page *raw_page, unsigned raw_off,
                        struct page *loop_page, unsigned loop_off,
                        int size, sector_t real_block)
{
        char *raw_buf = kmap_atomic(raw_page) + raw_off;
        char *loop_buf = kmap_atomic(loop_page) + loop_off;
        char *src, *dst, *key;
        int pos, keysize;

        src = (cmd == READ) ? raw_buf : loop_buf;
        dst = (cmd == READ) ? loop_buf : raw_buf;

        key = lo->lo_encrypt_key;
        keysize = lo->lo_encrypt_key_size;
        for (pos = 0; pos < size; pos++)
                dst[pos] = src[pos] ^ key[(pos & 511) % keysize];

        /* Unmap in the reverse order of mapping. */
        kunmap_atomic(loop_buf);
        kunmap_atomic(raw_buf);
        cond_resched();
        return 0;
}

/*
 * Init hook for the XOR transfer: the configuration is accepted only when
 * @info carries an encryption key size greater than zero.  @lo is not used.
 *
 * Returns 0 on success, -EINVAL otherwise.
 */
static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
{
        return unlikely(info->lo_encrypt_key_size <= 0) ? -EINVAL : 0;
}

/* Transfer table for LO_CRYPT_NONE: only the number is set, no callbacks. */
static struct loop_func_table none_funcs = {
        .number = LO_CRYPT_NONE,
}; 

/* Transfer table for LO_CRYPT_XOR: transfer_xor() as transform, xor_init() as init. */
static struct loop_func_table xor_funcs = {
        .number = LO_CRYPT_XOR,
        .transfer = transfer_xor,
        .init = xor_init
}; 

/*
 * Transfer tables known at build time.  Only the first two slots are
 * initialised here; any remaining slots up to MAX_LO_CRYPT start out NULL.
 */
/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
        &none_funcs,
        &xor_funcs
};

/*
 * Compute how many 512-byte sectors of @file are usable, given a starting
 * @offset and an optional @sizelimit (both in bytes).
 *
 * A positive @offset is subtracted from the file's i_size; if that leaves
 * a negative size, 0 is returned.  A positive @sizelimit smaller than the
 * remaining size caps it.
 */
static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
{
        loff_t bytes = i_size_read(file->f_mapping->host);

        if (offset > 0)
                bytes -= offset;

        /* offset is beyond i_size, weird but possible */
        if (bytes < 0)
                return 0;

        if (sizelimit > 0 && bytes > sizelimit)
                bytes = sizelimit;

        /*
         * Unfortunately, if we want to do I/O on the device,
         * the number of 512-byte sectors has to fit into a sector_t.
         */
        return bytes >> 9;
}

/*
 * Size in 512-byte sectors that @file would provide to @lo, using the
 * offset and size limit currently configured on the loop device.
 */
static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
        loff_t start = lo->lo_offset;
        loff_t limit = lo->lo_sizelimit;

        return get_size(start, limit, file);
}

/*
 * Re-evaluate whether the loop device may use direct I/O and, if the answer
 * differs from lo->use_dio, switch over.
 *
 * @lo:  loop device with a backing file attached
 * @dio: whether direct I/O is wanted at all
 *
 * On a change the backing file is synced first, and the queue is frozen
 * around the update when the device is in the Lo_bound state.
 */
static void __loop_update_dio(struct loop_device *lo, bool dio)
{
        struct file *file = lo->lo_backing_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        unsigned short sb_bsize = 0;
        unsigned dio_align = 0;
        bool use_dio;

        if (inode->i_sb->s_bdev) {
                sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
                dio_align = sb_bsize - 1;
        }

        /*
         * Direct I/O is only used when it was requested and all of the
         * following hold: the loop queue's logical block size is at least
         * that of the backing device, lo_offset is aligned to the backing
         * device's logical block size, the mapping provides direct_IO, and
         * no transfer function is installed.
         *
         * TODO: these conditions may be relaxed in the future, and direct
         * I/O could then be switched at runtime, since most requests from
         * sane applications should be PAGE_SIZE aligned.
         */
        use_dio = dio &&
                  queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
                  !(lo->lo_offset & dio_align) &&
                  mapping->a_ops->direct_IO &&
                  !lo->transfer;

        if (use_dio == lo->use_dio)
                return;

        /* Write out dirty pages before the I/O mode changes. */
        vfs_fsync(file, 0);

        /*
         * LO_FLAGS_DIRECT_IO is treated like LO_FLAGS_READ_ONLY: both are
         * set by the kernel, and losetup picks up the new value through
         * ioctl(LOOP_GET_STATUS).
         */
        if (lo->lo_state == Lo_bound)
                blk_mq_freeze_queue(lo->lo_queue);

        lo->use_dio = use_dio;
        if (!use_dio) {
                blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
                lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
        } else {
                blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue);
                lo->lo_flags |= LO_FLAGS_DIRECT_IO;
        }

        if (lo->lo_state == Lo_bound)
                blk_mq_unfreeze_queue(lo->lo_queue);
}

/**
 * loop_set_size() - sets device size and notifies userspace
 * @lo: struct loop_device to set the size for
 * @size: new size of the loop device
 *
 * A KOBJ_CHANGE uevent is emitted from here whenever
 * set_capacity_and_notify() returns false.
 *
 * Callers must validate that the size passed into this function fits into
 * a sector_t, eg using loop_validate_size()
 */
static void loop_set_size(struct loop_device *lo, loff_t size)
{
        struct gendisk *disk = lo->lo_disk;

        if (set_capacity_and_notify(disk, size))
                return;

        kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
}

/*
 * Run the device's transfer function on one chunk of data and log a
 * rate-limited error if it fails.
 *
 * Returns 0 on success, otherwise the non-zero value returned by
 * lo->transfer().
 */
static inline int
lo_do_transfer(struct loop_device *lo, int cmd,
               struct page *rpage, unsigned roffs,
               struct page *lpage, unsigned loffs,
               int size, sector_t rblock)
{
        int err = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size,
                               rblock);

        if (unlikely(err)) {
                printk_ratelimited(KERN_ERR
                        "loop: Transfer error at byte offset %llu, length %i.\n",
                        (unsigned long long)rblock << 9, size);
        }

        return err;
}

/*
 * Write a single bio_vec to @file at *@ppos through vfs_iter_write().
 *
 * Returns 0 when exactly bvec->bv_len bytes were written.  Otherwise an
 * error is logged (rate-limited) and the negative result of the write is
 * returned, or -EIO when the write was merely short.
 */
static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
{
        struct iov_iter iter;
        ssize_t written;

        iov_iter_bvec(&iter, WRITE, bvec, 1, bvec->bv_len);

        file_start_write(file);
        written = vfs_iter_write(file, &iter, ppos, 0);
        file_end_write(file);

        if (unlikely(written != bvec->bv_len)) {
                printk_ratelimited(KERN_ERR
                        "loop: Write error at byte offset %llu, length %i.\n",
                        (unsigned long long)*ppos, bvec->bv_len);
                return written < 0 ? written : -EIO;
        }

        return 0;
}

/*
 * Write every segment of @rq to the backing file, starting at @pos, without
 * any data transformation.
 *
 * Returns 0 on success or the first negative error reported by
 * lo_write_bvec().
 */
static int lo_write_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec;
        struct req_iterator iter;
        int ret = 0;

        rq_for_each_segment(bvec, rq, iter) {
                ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
                /*
                 * Return instead of break: rq_for_each_segment() is built
                 * from nested loops in the block layer headers (one over
                 * the bios, one over the segments of each bio), so a break
                 * would only leave the inner one and a later bio could
                 * overwrite the error with success.
                 */
                if (ret < 0)
                        return ret;
                cond_resched();
        }

        return ret;
}

/*
 * This is the slow, transforming version that needs to double buffer the
 * data as it cannot do the transformations in place without having direct
 * access to the destination pages of the backing file.
 *
 * Each segment of @rq is transformed into a bounce page with
 * lo_do_transfer() and then written from that page at @pos.
 *
 * Returns 0 on success, -ENOMEM if the bounce page cannot be allocated, or
 * the first error reported by lo_do_transfer() or lo_write_bvec().
 */
static int lo_write_transfer(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec, b;
        struct req_iterator iter;
        struct page *page;
        int ret = 0;

        page = alloc_page(GFP_NOIO);
        if (unlikely(!page))
                return -ENOMEM;

        /*
         * Errors leave via goto rather than break: rq_for_each_segment() is
         * built from nested loops in the block layer headers, so a break
         * would only end the current bio and a later bio could overwrite
         * the error with success.
         */
        rq_for_each_segment(bvec, rq, iter) {
                ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
                        bvec.bv_offset, bvec.bv_len, pos >> 9);
                if (unlikely(ret))
                        goto out_free_page;

                b.bv_page = page;
                b.bv_offset = 0;
                b.bv_len = bvec.bv_len;
                ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
                if (ret < 0)
                        goto out_free_page;
        }

out_free_page:
        __free_page(page);
        return ret;
}

/*
 * Read every segment of @rq from the backing file, starting at @pos,
 * without any data transformation.
 *
 * Returns a negative error from vfs_iter_read(), or 0.  A short read is not
 * an error: all bios of the request are zero-filled and processing stops.
 */
static int lo_read_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec;
        struct req_iterator iter;
        struct iov_iter i;
        ssize_t len;

        rq_for_each_segment(bvec, rq, iter) {
                iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len);
                len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
                if (len < 0)
                        return len;

                flush_dcache_page(bvec.bv_page);

                if (len != bvec.bv_len) {
                        struct bio *bio;

                        __rq_for_each_bio(bio, rq)
                                zero_fill_bio(bio);
                        /*
                         * Return instead of break: rq_for_each_segment()
                         * is built from nested loops in the block layer
                         * headers, so a break would carry on with the next
                         * bio at a file position that no longer matches
                         * the request.
                         */
                        return 0;
                }
                cond_resched();
        }

        return 0;
}

/*
 * Transforming read path: each segment of @rq is read from the backing file
 * into a bounce page and then passed through lo_do_transfer() into the
 * request's own page.
 *
 * Returns 0 on success (including a short read, after which all bios of the
 * request are zero-filled and processing stops), -ENOMEM if the bounce page
 * cannot be allocated, or the error from vfs_iter_read() / lo_do_transfer().
 */
static int lo_read_transfer(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec, b;
        struct req_iterator iter;
        struct iov_iter i;
        struct page *page;
        ssize_t len;
        int ret = 0;

        page = alloc_page(GFP_NOIO);
        if (unlikely(!page))
                return -ENOMEM;

        rq_for_each_segment(bvec, rq, iter) {
                /* vfs_iter_read() advances pos; keep the start for the transfer */
                loff_t offset = pos;

                b.bv_page = page;
                b.bv_offset = 0;
                b.bv_len = bvec.bv_len;

                iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
                len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
                if (len < 0) {
                        ret = len;
                        goto out_free_page;
                }

                ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
                        bvec.bv_offset, len, offset >> 9);
                if (ret)
                        goto out_free_page;

                flush_dcache_page(bvec.bv_page);

                if (len != bvec.bv_len) {
                        struct bio *bio;

                        __rq_for_each_bio(bio, rq)
                                zero_fill_bio(bio);
                        /*
                         * Leave via goto rather than break (ret is 0
                         * here): rq_for_each_segment() is built from
                         * nested loops in the block layer headers, so a
                         * break would carry on with the next bio at a file
                         * position that no longer matches the request.
                         */
                        goto out_free_page;
                }
        }

        ret = 0;
out_free_page:
        __free_page(page);
        return ret;
}

/*
 * Apply a discard / zero-range request to the backing file via fallocate.
 *
 * fallocate is used to manipulate the space mappings of the image.  This is
 * not supported when encryption is enabled, because it may give an attacker
 * useful information.
 *
 * @mode is always combined with FALLOC_FL_KEEP_SIZE.  Returns -EOPNOTSUPP
 * when the loop queue does not have discard enabled; otherwise 0, -EINVAL
 * or -EOPNOTSUPP are passed through from ->fallocate() and every other
 * failure is reported as -EIO.
 */
static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
                        int mode)
{
        struct file *file = lo->lo_backing_file;
        int err;

        if (!blk_queue_discard(lo->lo_queue))
                return -EOPNOTSUPP;

        err = file->f_op->fallocate(file, mode | FALLOC_FL_KEEP_SIZE, pos,
                                    blk_rq_bytes(rq));
        if (likely(!err || err == -EINVAL || err == -EOPNOTSUPP))
                return err;

        return -EIO;
}

/*
 * Handle a flush request by syncing the backing file.
 *
 * Returns 0 or -EINVAL as reported by vfs_fsync(); any other failure is
 * reported as -EIO.  @rq is not used.
 */
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
        int err = vfs_fsync(lo->lo_backing_file, 0);

        if (likely(!err || err == -EINVAL))
                return err;

        return -EIO;
}

/*
 * Finish a loop request based on the result stored in its loop_cmd.
 *
 * Everything except a short AIO read is ended immediately, with cmd->ret
 * translated to a block status when it is negative.  A short AIO read that
 * returned some data is advanced by that amount and requeued; one that
 * returned no data is zero-filled and ended with BLK_STS_IOERR.
 */
static void lo_complete_rq(struct request *rq)
{
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        struct bio *bio;

        if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
            req_op(rq) != REQ_OP_READ) {
                blk_status_t status = BLK_STS_OK;

                if (cmd->ret < 0)
                        status = errno_to_blk_status(cmd->ret);
                blk_mq_end_request(rq, status);
                return;
        }

        /* Short READ with some data: account for it and retry the rest. */
        if (cmd->ret) {
                blk_update_request(rq, BLK_STS_OK, cmd->ret);
                cmd->ret = 0;
                blk_mq_requeue_request(rq, true);
                return;
        }

        /* Short READ with no data at all: end the rest with EIO. */
        if (cmd->use_aio) {
                for (bio = rq->bio; bio; bio = bio->bi_next)
                        zero_fill_bio(bio);
        }
        blk_mq_end_request(rq, BLK_STS_IOERR);
}

/*
 * Drop one reference on @cmd.  When the last reference goes away, free the
 * temporary bio_vec array and complete the request, unless
 * blk_should_fake_timeout() asks for the completion to be withheld.
 */
static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);

        if (atomic_dec_and_test(&cmd->ref)) {
                kfree(cmd->bvec);
                cmd->bvec = NULL;

                if (likely(!blk_should_fake_timeout(rq->q)))
                        blk_mq_complete_request(rq);
        }
}

/*
 * kiocb completion callback for lo_rw_aio(): drops the css reference if one
 * is attached, records @ret in the owning loop_cmd and drops one command
 * reference.  @ret2 is not used.
 */
static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
        struct loop_cmd *owner = container_of(iocb, struct loop_cmd, iocb);

        if (owner->css)
                css_put(owner->css);

        owner->ret = ret;
        lo_rw_aio_do_completion(owner);
}

/*
 * Submit the whole request behind @cmd to the backing file as a single
 * kiocb with IOCB_DIRECT set; lo_rw_aio_complete() is installed as the
 * completion callback.
 *
 * @lo:  loop device whose backing file is the I/O target
 * @cmd: per-request loop command (the request's pdu)
 * @pos: byte position in the backing file
 * @rw:  WRITE selects call_write_iter(), anything else call_read_iter()
 *
 * Returns -EIO when the temporary bio_vec array cannot be allocated and 0
 * otherwise; the result of the I/O itself is stored in cmd->ret by
 * lo_rw_aio_complete().
 */
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
                     loff_t pos, bool rw)
{
        struct iov_iter iter;
        struct req_iterator rq_iter;
        struct bio_vec *bvec;
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        struct bio *bio = rq->bio;
        struct file *file = lo->lo_backing_file;
        struct bio_vec tmp;
        unsigned int offset;
        int nr_bvec = 0;
        int ret;

        /* Count the bvecs of the request; needed for both branches below. */
        rq_for_each_bvec(tmp, rq, rq_iter)
                nr_bvec++;

        /* More than one bio: flatten all bvecs into one allocated array. */
        if (rq->bio != rq->biotail) {

                bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
                                     GFP_NOIO);
                if (!bvec)
                        return -EIO;
                cmd->bvec = bvec;

                /*
                 * The bios of the request may be started from the middle of
                 * the 'bvec' because of bio splitting, so we can't directly
                 * copy bio->bi_io_vec to new bvec. The rq_for_each_bvec
                 * API will take care of all details for us.
                 */
                rq_for_each_bvec(tmp, rq, rq_iter) {
                        *bvec = tmp;
                        bvec++;
                }
                bvec = cmd->bvec;
                offset = 0;
        } else {
                /*
                 * Same here, this bio may be started from the middle of the
                 * 'bvec' because of bio splitting, so offset from the bvec
                 * must be passed to iov iterator
                 */
                offset = bio->bi_iter.bi_bvec_done;
                bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
        }
        /*
         * Two references: one is dropped by this function after submission,
         * the other by lo_rw_aio_complete().
         */
        atomic_set(&cmd->ref, 2);

        iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
        iter.iov_offset = offset;

        cmd->iocb.ki_pos = pos;
        cmd->iocb.ki_filp = file;
        cmd->iocb.ki_complete = lo_rw_aio_complete;
        cmd->iocb.ki_flags = IOCB_DIRECT;
        cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
        if (cmd->css)
                kthread_associate_blkcg(cmd->css);

        if (rw == WRITE)
                ret = call_write_iter(file, &cmd->iocb, &iter);
        else
                ret = call_read_iter(file, &cmd->iocb, &iter);

        /* Drop the submitter's reference and the blkcg association. */
        lo_rw_aio_do_completion(cmd);
        kthread_associate_blkcg(NULL);

        /*
         * Anything other than -EIOCBQUEUED is fed to the completion
         * callback from here (presumably because the callback will not be
         * invoked by the file's iter method in that case - TODO confirm).
         */
        if (ret != -EIOCBQUEUED)
                cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
        return 0;
}

/*
 * Dispatch one request to the handler matching its operation.
 *
 * The file position is the request's start sector converted to bytes plus
 * the configured lo_offset.  Reads and writes go through the transfer path
 * when a transfer function is installed, else through lo_rw_aio() when the
 * command uses AIO, else through the simple per-segment path.  Unknown
 * operations trigger a one-time warning and return -EIO.
 */
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
        int falloc_mode;

        /*
         * Ideally lo_write_simple() and lo_read_simple() would be replaced
         * by an io-submit style function like lo_rw_aio().  One blocker is
         * that lo_read_simple() has to call flush_dcache_page() after the
         * kernel has written the page, which is awkward in a function that
         * submits all segments of the request at once.  Direct read I/O
         * does not need flush_dcache_page().
         */
        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
                return lo_req_flush(lo, rq);

        case REQ_OP_WRITE_ZEROES:
                /*
                 * Zero the range in place when the caller does not want
                 * deallocation; otherwise punch a hole.
                 */
                if (rq->cmd_flags & REQ_NOUNMAP)
                        falloc_mode = FALLOC_FL_ZERO_RANGE;
                else
                        falloc_mode = FALLOC_FL_PUNCH_HOLE;
                return lo_fallocate(lo, rq, pos, falloc_mode);

        case REQ_OP_DISCARD:
                return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);

        case REQ_OP_WRITE:
                if (lo->transfer)
                        return lo_write_transfer(lo, rq, pos);
                if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, WRITE);
                return lo_write_simple(lo, rq, pos);

        case REQ_OP_READ:
                if (lo->transfer)
                        return lo_read_transfer(lo, rq, pos);
                if (cmd->use_aio)
                        return lo_rw_aio(lo, cmd, pos, READ);
                return lo_read_simple(lo, rq, pos);

        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }
}

static inline void loop_update_dio(struct loop_device *lo)
{
        __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) |
                                lo->use_dio);
}

/*
 * Rescan the partition table of @bdev under bd_mutex and emit a warning
 * naming the loop device and its file name if the scan fails.
 */
static void loop_reread_partitions(struct loop_device *lo,
                                   struct block_device *bdev)
{
        int rc;

        mutex_lock(&bdev->bd_mutex);
        rc = bdev_disk_changed(bdev, false);
        mutex_unlock(&bdev->bd_mutex);

        if (!rc)
                return;

        pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
                __func__, lo->lo_number, lo->lo_file_name, rc);
}

/*
 * Returns 1 when @file's mapping host is a block device inode whose major
 * number is LOOP_MAJOR, and 0 otherwise (including when there is no host
 * inode).
 */
static inline int is_loop_device(struct file *file)
{
        struct inode *host = file->f_mapping->host;

        if (!host)
                return 0;
        if (!S_ISBLK(host->i_mode))
                return 0;

        return MAJOR(host->i_rdev) == LOOP_MAJOR;
}

static int loop_validate_file(struct file *file, struct block_device *bdev)
{
        struct inode        *inode = file->f_mapping->host;
        struct file        *f = file;

        /* Avoid recursion */
        while (is_loop_device(f)) {
                struct loop_device *l;

                if (f->f_mapping->host->i_bdev == bdev)
                        return -EBADF;

                l = f->f_mapping->host->i_bdev->bd_disk->private_data;
                if (l->lo_state != Lo_bound) {
                        return -EINVAL;
                }
                f = l->lo_backing_file;
        }
        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                return -EINVAL;
        return 0;
}

/*
 * loop_change_fd switches the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
 *
 * @lo:   loop device to switch; must be in the Lo_bound state
 * @bdev: block device of the loop device itself
 * @arg:  file descriptor of the new backing file
 *
 * Returns 0 on success.  Errors: the result of mutex_lock_killable(),
 * -ENXIO if the device is not bound, -EINVAL if it is not read-only or the
 * sizes differ, -EBADF if @arg is not a valid descriptor, or the error
 * from loop_validate_file().
 */
static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
                          unsigned int arg)
{
        struct file        *file = NULL, *old_file;
        int                error;
        bool                partscan;

        error = mutex_lock_killable(&loop_ctl_mutex);
        if (error)
                return error;
        error = -ENXIO;
        if (lo->lo_state != Lo_bound)
                goto out_err;

        /* the loop device has to be read-only */
        error = -EINVAL;
        if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
                goto out_err;

        error = -EBADF;
        file = fget(arg);
        if (!file)
                goto out_err;

        error = loop_validate_file(file, bdev);
        if (error)
                goto out_err;

        old_file = lo->lo_backing_file;

        error = -EINVAL;

        /* size of the new backing store needs to be the same */
        if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
                goto out_err;

        /*
         * and ... switch: with the queue frozen, restore the old file's gfp
         * mask, install the new file, and clear __GFP_IO and __GFP_FS from
         * the new mapping's mask (the previous mask is kept in old_gfp_mask).
         */
        blk_mq_freeze_queue(lo->lo_queue);
        mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
        lo->lo_backing_file = file;
        lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
        mapping_set_gfp_mask(file->f_mapping,
                             lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
        loop_update_dio(lo);
        blk_mq_unfreeze_queue(lo->lo_queue);
        /* sample the flag while still holding loop_ctl_mutex */
        partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        mutex_unlock(&loop_ctl_mutex);
        /*
         * We must drop file reference outside of loop_ctl_mutex as dropping
         * the file ref can take bd_mutex which creates circular locking
         * dependency.
         */
        fput(old_file);
        if (partscan)
                loop_reread_partitions(lo, bdev);
        return 0;

out_err:
        mutex_unlock(&loop_ctl_mutex);
        /* file is still NULL when the failure happened before fget() */
        if (file)
                fput(file);
        return error;
}

/* loop sysfs attributes */

/*
 * Common sysfs show helper: resolves @dev to its loop device (the disk's
 * private_data) and lets @callback format the attribute into @page.
 * Returns whatever @callback returns.
 */
static ssize_t loop_attr_show(struct device *dev, char *page,
                              ssize_t (*callback)(struct loop_device *, char *))
{
        struct loop_device *lo = dev_to_disk(dev)->private_data;

        return callback(lo, page);
}

/*
 * LOOP_ATTR_RO(name) declares loop_attr_<name>_show(), defines the
 * device-attribute show wrapper loop_attr_do_show_<name>() that forwards
 * to it through loop_attr_show(), and defines the 0444 device attribute
 * loop_attr_<name> with no store method.
 */
#define LOOP_ATTR_RO(_name)                                                \
static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);        \
static ssize_t loop_attr_do_show_##_name(struct device *d,                \
                                struct device_attribute *attr, char *b)        \
{                                                                        \
        return loop_attr_show(d, b, loop_attr_##_name##_show);                \
}                                                                        \
static struct device_attribute loop_attr_##_name =                        \
        __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);

/*
 * sysfs "backing_file": writes the path of the backing file followed by a
 * newline into @buf.
 *
 * The backing file pointer is sampled under lo_lock.  Returns the number of
 * bytes written including the newline; when file_path() yields an error
 * pointer or no backing file is set, PTR_ERR() of that pointer is returned
 * instead (which is 0 for the NULL case).
 */
static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
{
        char *path = NULL;
        ssize_t len;

        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_backing_file)
                path = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
        spin_unlock_irq(&lo->lo_lock);

        if (IS_ERR_OR_NULL(path))
                return PTR_ERR(path);

        /* The path may sit anywhere inside buf; move it to the front. */
        len = strlen(path);
        memmove(buf, path, len);
        buf[len] = '\n';
        buf[len + 1] = 0;

        return len + 1;
}

/* sysfs "offset": prints lo_offset as an unsigned decimal number. */
static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
{
        unsigned long long offset = (unsigned long long)lo->lo_offset;

        return sysfs_emit(buf, "%llu\n", offset);
}

/* sysfs "sizelimit": prints lo_sizelimit as an unsigned decimal number. */
static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
{
        unsigned long long limit = (unsigned long long)lo->lo_sizelimit;

        return sysfs_emit(buf, "%llu\n", limit);
}

/* sysfs "autoclear": prints "1" when LO_FLAGS_AUTOCLEAR is set, else "0". */
static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
{
        const char *state = "0";

        if (lo->lo_flags & LO_FLAGS_AUTOCLEAR)
                state = "1";

        return sysfs_emit(buf, "%s\n", state);
}

/* sysfs "partscan": prints "1" when LO_FLAGS_PARTSCAN is set, else "0". */
static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
{
        const char *state = "0";

        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
                state = "1";

        return sysfs_emit(buf, "%s\n", state);
}

/* sysfs "dio": prints "1" when LO_FLAGS_DIRECT_IO is set, else "0". */
static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
{
        const char *state = "0";

        if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
                state = "1";

        return sysfs_emit(buf, "%s\n", state);
}

/*
 * Instantiate the read-only attributes; each line defines loop_attr_<name>
 * backed by the matching loop_attr_<name>_show() function.
 */
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);

/* NULL-terminated list of the attributes exposed in the "loop" sysfs group. */
static struct attribute *loop_attrs[] = {
        &loop_attr_backing_file.attr,
        &loop_attr_offset.attr,
        &loop_attr_sizelimit.attr,
        &loop_attr_autoclear.attr,
        &loop_attr_partscan.attr,
        &loop_attr_dio.attr,
        NULL,
};

/*
 * Attribute group named "loop"; created and removed on the disk's device
 * kobject by loop_sysfs_init() and loop_sysfs_exit().
 */
static struct attribute_group loop_attribute_group = {
        .name = "loop",
        .attrs= loop_attrs,
};

/*
 * Create the "loop" attribute group on the disk's device kobject and record
 * in lo->sysfs_inited whether that succeeded.
 */
static void loop_sysfs_init(struct loop_device *lo)
{
        int err = sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
                                     &loop_attribute_group);

        lo->sysfs_inited = !err;
}

/*
 * Remove the "loop" attribute group again, but only if loop_sysfs_init()
 * recorded that it was created.
 */
static void loop_sysfs_exit(struct loop_device *lo)
{
        if (!lo->sysfs_inited)
                return;

        sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
                           &loop_attribute_group);
}

/*
 * Configure the discard and write-zeroes limits of the loop queue from the
 * capabilities of the backing file.
 *
 * Discard stays disabled when an encryption key is configured or when a
 * non-block backing file has no ->fallocate method.
 */
static void loop_config_discard(struct loop_device *lo)
{
        struct file *file = lo->lo_backing_file;
        struct inode *inode = file->f_mapping->host;
        struct request_queue *q = lo->lo_queue;
        u32 granularity = 0, max_discard_sectors = 0;

        /*
         * Discard is not supported when encryption is enabled, because it
         * may give an attacker useful information.
         */
        if (!lo->lo_encrypt_key_size) {
                if (S_ISBLK(inode->i_mode)) {
                        /*
                         * If the backing device is a block device, mirror
                         * its zeroing capability.  The discard sectors are
                         * set from its zeroing capabilities because loop
                         * discards result in blkdev_issue_zeroout(), not
                         * blkdev_issue_discard().  This keeps behavior
                         * consistent with file-backed loop devices:
                         * discarded regions read back as zero.
                         */
                        struct request_queue *backingq =
                                bdev_get_queue(inode->i_bdev);

                        max_discard_sectors =
                                backingq->limits.max_write_zeroes_sectors;
                        granularity = backingq->limits.discard_granularity ?:
                                queue_physical_block_size(backingq);
                } else if (file->f_op->fallocate) {
                        /*
                         * Punch hole is used to reclaim the free space used
                         * by the image, a.k.a. discard.
                         */
                        max_discard_sectors = UINT_MAX >> 9;
                        granularity = inode->i_sb->s_blocksize;
                }
        }

        q->limits.discard_granularity = max_discard_sectors ? granularity : 0;
        blk_queue_max_discard_sectors(q, max_discard_sectors);
        blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
        if (max_discard_sectors)
                blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
        else
                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);

        q->limits.discard_alignment = 0;
}

/* Flush the device's kthread worker, then stop its worker thread. */
static void loop_unprepare_queue(struct loop_device *lo)
{
        struct kthread_worker *worker = &lo->worker;

        kthread_flush_worker(worker);
        kthread_stop(lo->worker_task);
}

static int loop_kthread_worker_fn(void *worker_ptr)
{
        current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
        return kthread_worker_fn(worker_ptr);
}

/*
 * Initialize the device's kthread worker and start its thread, named
 * "loop<N>" after the device number and set to MIN_NICE.
 *
 * Returns 0 on success, or -ENOMEM when the thread could not be started
 * (lo->worker_task then holds the error pointer).
 */
static int loop_prepare_queue(struct loop_device *lo)
{
        struct task_struct *task;

        kthread_init_worker(&lo->worker);

        task = kthread_run(loop_kthread_worker_fn,
                        &lo->worker, "loop%d", lo->lo_number);
        lo->worker_task = task;
        if (IS_ERR(task))
                return -ENOMEM;

        set_user_nice(task, MIN_NICE);
        return 0;
}

/*
 * Propagate the rotational property of the backing store to the loop queue.
 * The queue is marked non-rotational when the backing file's superblock has
 * no block device, or when that block device's queue is non-rotational.
 */
static void loop_update_rotational(struct loop_device *lo)
{
        struct inode *backing_inode = lo->lo_backing_file->f_mapping->host;
        struct block_device *backing_bdev = backing_inode->i_sb->s_bdev;
        struct request_queue *q = lo->lo_queue;
        bool nonrot;

        /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
        nonrot = !backing_bdev ||
                 blk_queue_nonrot(bdev_get_queue(backing_bdev));

        if (!nonrot)
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
        else
                blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
}

/*
 * Detach the current transfer implementation from @lo, if any: call its
 * release hook when present, clear lo->transfer and lo->lo_encryption, and
 * drop the module reference.
 *
 * Returns the result of the release hook, or 0 when there is no transfer
 * implementation or no release hook.
 */
static int
loop_release_xfer(struct loop_device *lo)
{
        struct loop_func_table *xfer = lo->lo_encryption;
        int err = 0;

        if (!xfer)
                return 0;

        if (xfer->release)
                err = xfer->release(lo);

        lo->transfer = NULL;
        lo->lo_encryption = NULL;
        module_put(xfer->owner);

        return err;
}

/*
 * Attach transfer implementation @xfer to @lo.  A NULL @xfer is a no-op.
 *
 * Takes a reference on the owning module, runs the init hook when present
 * and, on success, stores @xfer in lo->lo_encryption.  If init fails the
 * module reference is dropped again.
 *
 * Returns 0 on success, -EINVAL when the module reference cannot be taken,
 * or the error from the init hook.
 */
static int
loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
               const struct loop_info64 *i)
{
        struct module *owner;
        int err;

        if (!xfer)
                return 0;

        owner = xfer->owner;
        if (!try_module_get(owner))
                return -EINVAL;

        err = xfer->init ? xfer->init(lo, i) : 0;
        if (err) {
                module_put(owner);
                return err;
        }

        lo->lo_encryption = xfer;
        return 0;
}

/**
 * loop_set_status_from_info - configure device from loop_info
 * @lo: struct loop_device to configure
 * @info: struct loop_info64 to configure the device with
 *
 * Configures the loop device parameters according to the passed
 * in loop_info64 configuration: the transfer implementation, offset and
 * size limit, file and crypt names, flags, init words and encryption key.
 *
 * The key size and the offset/sizelimit range are validated before any
 * state of @lo is touched, so these failures leave the previously
 * configured transfer in place.
 *
 * Return: 0 on success; -EINVAL for an oversized key or an unknown or
 * unavailable encryption type; -EOVERFLOW when lo_offset or lo_sizelimit
 * exceeds LLONG_MAX; or the error from releasing the old or initializing
 * the new transfer.
 */
static int
loop_set_status_from_info(struct loop_device *lo,
                          const struct loop_info64 *info)
{
        int err;
        struct loop_func_table *xfer;
        kuid_t uid = current_uid();

        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;

        /*
         * Avoid assigning overflow values.  Checked up front, before the
         * old transfer is released and a new one is initialized, so that a
         * rejected request does not leave a half-switched transfer (with
         * its module reference) behind.
         */
        if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX)
                return -EOVERFLOW;

        err = loop_release_xfer(lo);
        if (err)
                return err;

        if (info->lo_encrypt_type) {
                unsigned int type = info->lo_encrypt_type;

                if (type >= MAX_LO_CRYPT)
                        return -EINVAL;
                xfer = xfer_funcs[type];
                if (xfer == NULL)
                        return -EINVAL;
        } else
                xfer = NULL;

        err = loop_init_xfer(lo, xfer, info);
        if (err)
                return err;

        lo->lo_offset = info->lo_offset;
        lo->lo_sizelimit = info->lo_sizelimit;

        /* Copy the names and force NUL termination. */
        memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
        memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
        lo->lo_file_name[LO_NAME_SIZE-1] = 0;
        lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;

        /* Without an encryption type the hooks come from none_funcs. */
        if (!xfer)
                xfer = &none_funcs;
        lo->transfer = xfer->transfer;
        lo->ioctl = xfer->ioctl;

        lo->lo_flags = info->lo_flags;

        lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
        lo->lo_init[0] = info->lo_init[0];
        lo->lo_init[1] = info->lo_init[1];
        if (info->lo_encrypt_key_size) {
                memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
                       info->lo_encrypt_key_size);
                lo->lo_key_owner = uid;
        }

        return 0;
}

/*
 * Bind the backing file referred to by config->fd to @lo and move the loop
 * device to the Lo_bound state.
 *
 * @lo:     loop device; must be Lo_unbound, otherwise -EBUSY is returned
 * @mode:   open mode of the loop block device (FMODE_EXCL and FMODE_WRITE
 *          are examined)
 * @bdev:   block device of the loop device itself
 * @config: backing fd, optional block size and loop_info64 settings
 *
 * Returns 0 on success or a negative errno.  On failure the file reference,
 * the temporary claim on @bdev (if one was taken) and the module reference
 * taken at the top are dropped again.
 */
static int loop_configure(struct loop_device *lo, fmode_t mode,
                          struct block_device *bdev,
                          const struct loop_config *config)
{
        struct file        *file;
        struct inode        *inode;
        struct address_space *mapping;
        struct block_device *claimed_bdev = NULL;
        int                error;
        loff_t                size;
        bool                partscan;
        unsigned short  bsize;

        /* This is safe, since we have a reference from open(). */
        __module_get(THIS_MODULE);

        error = -EBADF;
        file = fget(config->fd);
        if (!file)
                goto out;

        /*
         * If we don't hold exclusive handle for the device, upgrade to it
         * here to avoid changing device under exclusive owner.
         */
        if (!(mode & FMODE_EXCL)) {
                claimed_bdev = bdev->bd_contains;
                error = bd_prepare_to_claim(bdev, claimed_bdev, loop_configure);
                if (error)
                        goto out_putf;
        }

        error = mutex_lock_killable(&loop_ctl_mutex);
        if (error)
                goto out_bdev;

        error = -EBUSY;
        if (lo->lo_state != Lo_unbound)
                goto out_unlock;

        error = loop_validate_file(file, bdev);
        if (error)
                goto out_unlock;

        mapping = file->f_mapping;
        inode = mapping->host;

        /* Reject flags outside LOOP_CONFIGURE_SETTABLE_FLAGS. */
        if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) {
                error = -EINVAL;
                goto out_unlock;
        }

        /* A block size of 0 means "not specified" and is chosen below. */
        if (config->block_size) {
                error = blk_validate_block_size(config->block_size);
                if (error)
                        goto out_unlock;
        }

        error = loop_set_status_from_info(lo, &config->info);
        if (error)
                goto out_unlock;

        /*
         * Force read-only when the backing file or the loop device was not
         * opened for writing, or the backing file has no write_iter method.
         */
        if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
            !file->f_op->write_iter)
                lo->lo_flags |= LO_FLAGS_READ_ONLY;

        error = loop_prepare_queue(lo);
        if (error)
                goto out_unlock;

        set_device_ro(bdev, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);

        lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
        lo->lo_device = bdev;
        lo->lo_backing_file = file;
        /* Remember the original gfp mask; __GFP_IO and __GFP_FS are cleared. */
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

        if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
                blk_queue_write_cache(lo->lo_queue, true, false);

        if (config->block_size)
                bsize = config->block_size;
        else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev)
                /* In case of direct I/O, match underlying block size */
                bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
        else
                bsize = 512;

        blk_queue_logical_block_size(lo->lo_queue, bsize);
        blk_queue_physical_block_size(lo->lo_queue, bsize);
        blk_queue_io_min(lo->lo_queue, bsize);

        loop_config_discard(lo);
        loop_update_rotational(lo);
        loop_update_dio(lo);
        loop_sysfs_init(lo);

        size = get_loop_size(lo, file);
        loop_set_size(lo, size);

        set_blocksize(bdev, S_ISBLK(inode->i_mode) ?
                      block_size(inode->i_bdev) : PAGE_SIZE);

        lo->lo_state = Lo_bound;
        /* A non-zero part_shift forces partition scanning on. */
        if (part_shift)
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
        partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        if (partscan)
                lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;

        /* Grab the block_device to prevent its destruction after we
         * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
         */
        bdgrab(bdev);
        mutex_unlock(&loop_ctl_mutex);
        /* The partition rescan runs after loop_ctl_mutex has been dropped. */
        if (partscan)
                loop_reread_partitions(lo, bdev);
        /* Drop the temporary claim taken above for non-exclusive opens. */
        if (claimed_bdev)
                bd_abort_claiming(bdev, claimed_bdev, loop_configure);
        return 0;

out_unlock:
        mutex_unlock(&loop_ctl_mutex);
out_bdev:
        if (claimed_bdev)
                bd_abort_claiming(bdev, claimed_bdev, loop_configure);
out_putf:
        fput(file);
out:
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        return error;
}

/*
 * __loop_clr_fd - tear down the binding of a loop device to its backing file
 * @lo:      loop device; expected to already be in the Lo_rundown state
 * @release: true when called from lo_release().  In that case bd_mutex is
 *           not taken around the partition rescan below.
 *
 * Returns 0 on success, -ENXIO if @lo is not in Lo_rundown, or -EINVAL if
 * no backing file is attached.  A failing partition rescan is only logged.
 *
 * Note that the two early error exits jump to out_unlock and therefore
 * still run the tail of the function, which clears lo_flags and sets the
 * state to Lo_unbound.
 */
static int __loop_clr_fd(struct loop_device *lo, bool release)
{
        struct file *filp = NULL;
        gfp_t gfp = lo->old_gfp_mask;
        struct block_device *bdev = lo->lo_device;
        int err = 0;
        bool partscan = false;
        int lo_number;

        mutex_lock(&loop_ctl_mutex);
        if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
                err = -ENXIO;
                goto out_unlock;
        }

        filp = lo->lo_backing_file;
        if (filp == NULL) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
                blk_queue_write_cache(lo->lo_queue, false, false);

        /* freeze request queue during the transition */
        blk_mq_freeze_queue(lo->lo_queue);

        spin_lock_irq(&lo->lo_lock);
        lo->lo_backing_file = NULL;
        spin_unlock_irq(&lo->lo_lock);

        /* Reset every per-binding field back to its unbound default. */
        loop_release_xfer(lo);
        lo->transfer = NULL;
        lo->ioctl = NULL;
        lo->lo_device = NULL;
        lo->lo_encryption = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
        lo->lo_encrypt_key_size = 0;
        memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
        memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
        memset(lo->lo_file_name, 0, LO_NAME_SIZE);
        blk_queue_logical_block_size(lo->lo_queue, 512);
        blk_queue_physical_block_size(lo->lo_queue, 512);
        blk_queue_io_min(lo->lo_queue, 512);
        if (bdev) {
                /*
                 * NOTE(review): bdev is still dereferenced after this
                 * bdput(), here and further down.  Presumably the
                 * reference held by the opener keeps it alive - confirm.
                 */
                bdput(bdev);
                invalidate_bdev(bdev);
                bdev->bd_inode->i_mapping->wb_err = 0;
        }
        set_capacity(lo->lo_disk, 0);
        loop_sysfs_exit(lo);
        if (bdev) {
                bd_set_nr_sectors(bdev, 0);
                /* let user-space know about this change */
                kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
        }
        /* Restore the gfp mask that was saved in lo->old_gfp_mask. */
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        blk_mq_unfreeze_queue(lo->lo_queue);

        /*
         * Sample what the rescan below needs while the mutex is still
         * held; lo_flags is cleared later in this function.
         */
        partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
        lo_number = lo->lo_number;
        loop_unprepare_queue(lo);
out_unlock:
        mutex_unlock(&loop_ctl_mutex);
        if (partscan) {
                /*
                 * bd_mutex has been held already in release path, so don't
                 * acquire it if this function is called in such case.
                 *
                 * If the reread partition isn't from release path, lo_refcnt
                 * must be at least one and it can only become zero when the
                 * current holder is released.
                 */
                if (!release)
                        mutex_lock(&bdev->bd_mutex);
                err = bdev_disk_changed(bdev, false);
                if (!release)
                        mutex_unlock(&bdev->bd_mutex);
                if (err)
                        pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
                                __func__, lo_number, err);
                /* Device is gone, no point in returning error */
                err = 0;
        }

        /*
         * lo->lo_state is set to Lo_unbound here after above partscan has
         * finished.
         *
         * There cannot be anybody else entering __loop_clr_fd() as
         * lo->lo_backing_file is already cleared and Lo_rundown state
         * protects us from all the other places trying to change the 'lo'
         * device.
         */
        mutex_lock(&loop_ctl_mutex);
        lo->lo_flags = 0;
        if (!part_shift)
                lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
        lo->lo_state = Lo_unbound;
        mutex_unlock(&loop_ctl_mutex);

        /*
         * Need not hold loop_ctl_mutex to fput backing file.
         * Calling fput holding loop_ctl_mutex triggers a circular
         * lock dependency possibility warning as fput can take
         * bd_mutex which is usually taken before loop_ctl_mutex.
         */
        if (filp)
                fput(filp);
        return err;
}

/*
 * loop_clr_fd - handle an explicit request to detach the backing file
 * @lo: loop device
 *
 * Returns the mutex_lock_killable() error if the lock could not be taken,
 * -ENXIO if the device is not bound, 0 if teardown was deferred, or the
 * result of __loop_clr_fd() otherwise.
 */
static int loop_clr_fd(struct loop_device *lo)
{
        int ret = mutex_lock_killable(&loop_ctl_mutex);

        if (ret)
                return ret;

        if (lo->lo_state != Lo_bound) {
                ret = -ENXIO;
                goto unlock;
        }

        /*
         * Somebody else still has the device open.  Rather than failing,
         * mark it for automatic teardown on the last close.  This avoids
         * spurious EBUSY failures of "losetup -d" when udev-triggered
         * blkid runs happen to hold the device open at the same time
         * (xfstests hits this race regularly).
         */
        if (atomic_read(&lo->lo_refcnt) > 1) {
                lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
                goto unlock;
        }

        /* We are the only user: move to rundown and tear down right away. */
        lo->lo_state = Lo_rundown;
        mutex_unlock(&loop_ctl_mutex);

        return __loop_clr_fd(lo, false);

unlock:
        mutex_unlock(&loop_ctl_mutex);
        return ret;
}

/*
 * loop_set_status - apply a new loop_info64 configuration to a bound device
 * @lo:   loop device, must be in Lo_bound
 * @info: requested settings
 *
 * Runs under loop_ctl_mutex, with the request queue frozen while the
 * settings are changed.  If the offset or size limit changes, the block
 * device is synced and invalidated first, and the call fails with -EAGAIN
 * when the page cache still holds pages once the queue is frozen.
 *
 * Returns 0 on success, -EPERM if an encryption key is set and the caller
 * is neither the key owner nor CAP_SYS_ADMIN, -ENXIO if the device is not
 * bound, -EAGAIN as described above, the error from
 * loop_set_status_from_info(), or the error from mutex_lock_killable().
 */
static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
        int err;
        struct block_device *bdev;
        kuid_t uid = current_uid();
        int prev_lo_flags;
        bool partscan = false;
        bool size_changed = false;

        err = mutex_lock_killable(&loop_ctl_mutex);
        if (err)
                return err;
        if (lo->lo_encrypt_key_size &&
            !uid_eq(lo->lo_key_owner, uid) &&
            !capable(CAP_SYS_ADMIN)) {
                err = -EPERM;
                goto out_unlock;
        }
        if (lo->lo_state != Lo_bound) {
                err = -ENXIO;
                goto out_unlock;
        }

        if (lo->lo_offset != info->lo_offset ||
            lo->lo_sizelimit != info->lo_sizelimit) {
                size_changed = true;
                sync_blockdev(lo->lo_device);
                invalidate_bdev(lo->lo_device);
        }

        /* I/O need to be drained during transfer transition */
        blk_mq_freeze_queue(lo->lo_queue);

        if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
                /* If any pages were dirtied after invalidate_bdev(), try again */
                err = -EAGAIN;
                pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
                        __func__, lo->lo_number, lo->lo_file_name,
                        lo->lo_device->bd_inode->i_mapping->nrpages);
                goto out_unfreeze;
        }

        /* Remember the old flags so the non-settable ones can be restored. */
        prev_lo_flags = lo->lo_flags;

        err = loop_set_status_from_info(lo, info);
        if (err)
                goto out_unfreeze;

        /* Mask out flags that can't be set using LOOP_SET_STATUS. */
        lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
        /* For those flags, use the previous values instead */
        lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS;
        /* For flags that can't be cleared, use previous values too */
        lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS;

        if (size_changed) {
                loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
                                           lo->lo_backing_file);
                loop_set_size(lo, new_size);
        }

        loop_config_discard(lo);

        /* update dio if lo_offset or transfer is changed */
        __loop_update_dio(lo, lo->use_dio);

out_unfreeze:
        blk_mq_unfreeze_queue(lo->lo_queue);

        /*
         * Request a partition scan only if PARTSCAN was newly enabled by
         * this call.  prev_lo_flags is read only when err == 0, which is
         * only possible on the path where it was assigned above.
         */
        if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) &&
             !(prev_lo_flags & LO_FLAGS_PARTSCAN)) {
                lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
                bdev = lo->lo_device;
                partscan = true;
        }
out_unlock:
        mutex_unlock(&loop_ctl_mutex);
        /* The rescan runs after loop_ctl_mutex has been dropped. */
        if (partscan)
                loop_reread_partitions(lo, bdev);

        return err;
}

/*
 * loop_get_status - report the current configuration of a bound device
 * @lo:   loop device
 * @info: output; zeroed and then filled in
 *
 * The encryption key is only copied out for CAP_SYS_ADMIN callers.  The
 * device/inode numbers come from vfs_getattr() on the backing file, which
 * is called after loop_ctl_mutex has been dropped.
 *
 * Returns 0 on success, -ENXIO if the device is not bound, or the error
 * from mutex_lock_killable() or vfs_getattr().
 */
static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
        struct kstat stat;
        struct path path;
        int err;

        err = mutex_lock_killable(&loop_ctl_mutex);
        if (err)
                return err;

        if (lo->lo_state != Lo_bound) {
                mutex_unlock(&loop_ctl_mutex);
                return -ENXIO;
        }

        memset(info, 0, sizeof(*info));

        info->lo_number = lo->lo_number;
        info->lo_offset = lo->lo_offset;
        info->lo_sizelimit = lo->lo_sizelimit;
        info->lo_flags = lo->lo_flags;

        memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
        memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);

        if (lo->lo_encryption)
                info->lo_encrypt_type = lo->lo_encryption->number;
        else
                info->lo_encrypt_type = 0;

        if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
                info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
                memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
                       lo->lo_encrypt_key_size);
        }

        /*
         * Take our own reference on the backing file's path so that the
         * filesystem can be queried without holding loop_ctl_mutex.
         */
        path = lo->lo_backing_file->f_path;
        path_get(&path);
        mutex_unlock(&loop_ctl_mutex);

        err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
        path_put(&path);
        if (err)
                return err;

        info->lo_device = huge_encode_dev(stat.dev);
        info->lo_inode = stat.ino;
        info->lo_rdevice = huge_encode_dev(stat.rdev);
        return 0;
}

static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
        memset(info64, 0, sizeof(*info64));
        info64->lo_number = info->lo_number;
        info64->lo_device = info->lo_device;
        info64->lo_inode = info->lo_inode;
        info64->lo_rdevice = info->lo_rdevice;
        info64->lo_offset = info->lo_offset;
        info64->lo_sizelimit = 0;
        info64->lo_encrypt_type = info->lo_encrypt_type;
        info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
        info64->lo_flags = info->lo_flags;
        info64->lo_init[0] = info->lo_init[0];
        info64->lo_init[1] = info->lo_init[1];
        if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
        else
                memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
        memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
}

static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
        memset(info, 0, sizeof(*info));
        info->lo_number = info64->lo_number;
        info->lo_device = info64->lo_device;
        info->lo_inode = info64->lo_inode;
        info->lo_rdevice = info64->lo_rdevice;
        info->lo_offset = info64->lo_offset;
        info->lo_encrypt_type = info64->lo_encrypt_type;
        info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
        info->lo_flags = info64->lo_flags;
        info->lo_init[0] = info64->lo_init[0];
        info->lo_init[1] = info64->lo_init[1];
        if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
        else
                memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
        memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);

        /* error in case values were truncated */
        if (info->lo_device != info64->lo_device ||
            info->lo_rdevice != info64->lo_rdevice ||
            info->lo_inode != info64->lo_inode ||
            info->lo_offset != info64->lo_offset)
                return -EOVERFLOW;

        return 0;
}

/*
 * LOOP_SET_STATUS backend: fetch a legacy struct loop_info from user space,
 * widen it and apply it.  Returns -EFAULT if the copy fails, otherwise the
 * result of loop_set_status().
 */
static int
loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
{
        struct loop_info64 wide;
        struct loop_info legacy;

        if (copy_from_user(&legacy, arg, sizeof(legacy)))
                return -EFAULT;

        loop_info64_from_old(&legacy, &wide);

        return loop_set_status(lo, &wide);
}

/*
 * LOOP_SET_STATUS64 backend: fetch a struct loop_info64 from user space and
 * apply it.  Returns -EFAULT if the copy fails, otherwise the result of
 * loop_set_status().
 */
static int
loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
{
        struct loop_info64 request;

        if (copy_from_user(&request, arg, sizeof(request)))
                return -EFAULT;

        return loop_set_status(lo, &request);
}

/*
 * LOOP_GET_STATUS backend: query the device, narrow the result to the legacy
 * struct loop_info and copy it to user space.
 *
 * Returns -EINVAL for a NULL @arg, the error from loop_get_status() or
 * loop_info64_to_old(), -EFAULT if the copy to user space fails, else 0.
 */
static int
loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg)
{
        struct loop_info64 wide;
        struct loop_info legacy;
        int ret;

        if (!arg)
                return -EINVAL;

        ret = loop_get_status(lo, &wide);
        if (ret)
                return ret;

        ret = loop_info64_to_old(&wide, &legacy);
        if (ret)
                return ret;

        if (copy_to_user(arg, &legacy, sizeof(legacy)))
                return -EFAULT;

        return 0;
}

/*
 * LOOP_GET_STATUS64 backend: query the device and copy the loop_info64 to
 * user space.
 *
 * Returns -EINVAL for a NULL @arg, the error from loop_get_status(),
 * -EFAULT if the copy to user space fails, else 0.
 */
static int
loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg)
{
        struct loop_info64 status;
        int ret;

        if (!arg)
                return -EINVAL;

        ret = loop_get_status(lo, &status);
        if (ret)
                return ret;

        if (copy_to_user(arg, &status, sizeof(status)))
                return -EFAULT;

        return 0;
}

/*
 * LOOP_SET_CAPACITY backend: recompute the device size from the current
 * backing file and apply it.  Returns -ENXIO if the device is not bound.
 */
static int loop_set_capacity(struct loop_device *lo)
{
        if (unlikely(lo->lo_state != Lo_bound))
                return -ENXIO;

        loop_set_size(lo, get_loop_size(lo, lo->lo_backing_file));
        return 0;
}

/*
 * LOOP_SET_DIRECT_IO backend: request direct I/O on (@arg != 0) or off.
 *
 * Returns -ENXIO if the device is not bound, -EINVAL if the device did not
 * end up in the requested mode after __loop_update_dio(), else 0.
 */
static int loop_set_dio(struct loop_device *lo, unsigned long arg)
{
        if (lo->lo_state != Lo_bound)
                return -ENXIO;

        __loop_update_dio(lo, !!arg);

        /* Report failure if the request was not honoured. */
        if (lo->use_dio != !!arg)
                return -EINVAL;

        return 0;
}

/*
 * LOOP_SET_BLOCK_SIZE backend: change the logical/physical block size and
 * minimum I/O size of the loop queue to @arg.
 *
 * Returns -ENXIO if the device is not bound, the error from
 * blk_validate_block_size(), -EAGAIN if pages are still cached after the
 * invalidate, or 0 (including when the size is already @arg).
 */
static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
{
        int ret;

        if (lo->lo_state != Lo_bound)
                return -ENXIO;

        ret = blk_validate_block_size(arg);
        if (ret)
                return ret;

        /* Nothing to do when the requested size is already in effect. */
        if (lo->lo_queue->limits.logical_block_size == arg)
                return 0;

        sync_blockdev(lo->lo_device);
        invalidate_bdev(lo->lo_device);

        blk_mq_freeze_queue(lo->lo_queue);

        if (lo->lo_device->bd_inode->i_mapping->nrpages) {
                /* invalidate_bdev should have truncated all the pages */
                ret = -EAGAIN;
                pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
                        __func__, lo->lo_number, lo->lo_file_name,
                        lo->lo_device->bd_inode->i_mapping->nrpages);
        } else {
                blk_queue_logical_block_size(lo->lo_queue, arg);
                blk_queue_physical_block_size(lo->lo_queue, arg);
                blk_queue_io_min(lo->lo_queue, arg);
                loop_update_dio(lo);
        }

        blk_mq_unfreeze_queue(lo->lo_queue);

        return ret;
}

/*
 * lo_simple_ioctl - run an ioctl that only needs loop_ctl_mutex held
 *
 * Handles LOOP_SET_CAPACITY, LOOP_SET_DIRECT_IO and LOOP_SET_BLOCK_SIZE.
 * Any other command goes to the lo->ioctl hook if one is set, and fails
 * with -EINVAL if none is.
 */
static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
                           unsigned long arg)
{
        int ret = mutex_lock_killable(&loop_ctl_mutex);

        if (ret)
                return ret;

        if (cmd == LOOP_SET_CAPACITY)
                ret = loop_set_capacity(lo);
        else if (cmd == LOOP_SET_DIRECT_IO)
                ret = loop_set_dio(lo, arg);
        else if (cmd == LOOP_SET_BLOCK_SIZE)
                ret = loop_set_block_size(lo, arg);
        else if (lo->ioctl)
                ret = lo->ioctl(lo, cmd, arg);
        else
                ret = -EINVAL;

        mutex_unlock(&loop_ctl_mutex);
        return ret;
}

/*
 * lo_ioctl - ioctl entry point of the loop block device
 *
 * Dispatches each command to its handler.  The SET_STATUS variants and the
 * capacity/direct-io/block-size commands require either a writable open
 * mode or CAP_SYS_ADMIN and fail with -EPERM otherwise.  Commands not
 * listed here are passed on to lo_simple_ioctl().
 */
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
        unsigned int cmd, unsigned long arg)
{
        struct loop_device *lo = bdev->bd_disk->private_data;
        void __user *argp = (void __user *) arg;

        switch (cmd) {
        case LOOP_SET_FD: {
                /*
                 * Legacy interface: behaves like LOOP_CONFIGURE with an
                 * all-zero loop_config in which only the fd is filled in.
                 */
                struct loop_config config;

                memset(&config, 0, sizeof(config));
                config.fd = arg;

                return loop_configure(lo, mode, bdev, &config);
        }
        case LOOP_CONFIGURE: {
                struct loop_config config;

                if (copy_from_user(&config, argp, sizeof(config)))
                        return -EFAULT;

                return loop_configure(lo, mode, bdev, &config);
        }
        case LOOP_CHANGE_FD:
                return loop_change_fd(lo, bdev, arg);
        case LOOP_CLR_FD:
                return loop_clr_fd(lo);
        case LOOP_SET_STATUS:
                if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return loop_set_status_old(lo, argp);
        case LOOP_GET_STATUS:
                return loop_get_status_old(lo, argp);
        case LOOP_SET_STATUS64:
                if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return loop_set_status64(lo, argp);
        case LOOP_GET_STATUS64:
                return loop_get_status64(lo, argp);
        case LOOP_SET_CAPACITY:
        case LOOP_SET_DIRECT_IO:
        case LOOP_SET_BLOCK_SIZE:
                if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                fallthrough;
        default:
                return lo_simple_ioctl(lo, cmd, arg);
        }
}

#ifdef CONFIG_COMPAT
/*
 * Layout of the legacy loop info structure as seen by compat (32-bit)
 * callers.  It is converted to and from struct loop_info64 by
 * loop_info64_from_compat() and loop_info64_to_compat().  The single
 * lo_name field carries either the file name or the crypt name, depending
 * on lo_encrypt_type (see those helpers).
 */
struct compat_loop_info {
        compat_int_t        lo_number;      /* ioctl r/o */
        compat_dev_t        lo_device;      /* ioctl r/o */
        compat_ulong_t        lo_inode;       /* ioctl r/o */
        compat_dev_t        lo_rdevice;     /* ioctl r/o */
        compat_int_t        lo_offset;
        compat_int_t        lo_encrypt_type;
        compat_int_t        lo_encrypt_key_size;    /* ioctl w/o */
        compat_int_t        lo_flags;       /* ioctl r/o */
        char                lo_name[LO_NAME_SIZE];
        unsigned char        lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
        compat_ulong_t        lo_init[2];
        char                reserved[4];
};

/*
 * Read a 32-bit compat loop info structure from user space and widen it
 * into a struct loop_info64.  Returns -EFAULT if the user copy fails.
 * - kept noinline to reduce stack space usage in the main part of the driver
 */
static noinline int
loop_info64_from_compat(const struct compat_loop_info __user *arg,
                        struct loop_info64 *info64)
{
        struct compat_loop_info narrow;
        void *name_dst;

        if (copy_from_user(&narrow, arg, sizeof(narrow)))
                return -EFAULT;

        memset(info64, 0, sizeof(*info64));

        /* Scalars that map one to one. */
        info64->lo_number = narrow.lo_number;
        info64->lo_device = narrow.lo_device;
        info64->lo_inode = narrow.lo_inode;
        info64->lo_rdevice = narrow.lo_rdevice;
        info64->lo_offset = narrow.lo_offset;
        info64->lo_sizelimit = 0;
        info64->lo_flags = narrow.lo_flags;
        info64->lo_init[0] = narrow.lo_init[0];
        info64->lo_init[1] = narrow.lo_init[1];

        /* Encryption settings and key material. */
        info64->lo_encrypt_type = narrow.lo_encrypt_type;
        info64->lo_encrypt_key_size = narrow.lo_encrypt_key_size;
        memcpy(info64->lo_encrypt_key, narrow.lo_encrypt_key, LO_KEY_SIZE);

        /* The single compat name goes to the crypt or the file name. */
        if (narrow.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                name_dst = info64->lo_crypt_name;
        else
                name_dst = info64->lo_file_name;
        memcpy(name_dst, narrow.lo_name, LO_NAME_SIZE);

        return 0;
}

/*
 * Narrow a struct loop_info64 into the 32-bit compat structure and copy it
 * to user space.  Returns -EOVERFLOW if lo_device, lo_rdevice, lo_inode,
 * lo_offset or either lo_init word was truncated, -EFAULT if the user copy
 * fails, else 0.
 * - kept noinline to reduce stack space usage in the main part of the driver
 */
static noinline int
loop_info64_to_compat(const struct loop_info64 *info64,
                      struct compat_loop_info __user *arg)
{
        struct compat_loop_info narrow;
        const void *name_src;

        memset(&narrow, 0, sizeof(narrow));

        narrow.lo_number = info64->lo_number;
        narrow.lo_device = info64->lo_device;
        narrow.lo_inode = info64->lo_inode;
        narrow.lo_rdevice = info64->lo_rdevice;
        narrow.lo_offset = info64->lo_offset;
        narrow.lo_encrypt_type = info64->lo_encrypt_type;
        narrow.lo_encrypt_key_size = info64->lo_encrypt_key_size;
        narrow.lo_flags = info64->lo_flags;
        narrow.lo_init[0] = info64->lo_init[0];
        narrow.lo_init[1] = info64->lo_init[1];

        /* The compat layout has one name field; pick the matching source. */
        if (narrow.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
                name_src = info64->lo_crypt_name;
        else
                name_src = info64->lo_file_name;
        memcpy(narrow.lo_name, name_src, LO_NAME_SIZE);
        memcpy(narrow.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);

        /* Reject the conversion if any of these values got truncated. */
        if (narrow.lo_device != info64->lo_device)
                return -EOVERFLOW;
        if (narrow.lo_rdevice != info64->lo_rdevice)
                return -EOVERFLOW;
        if (narrow.lo_inode != info64->lo_inode)
                return -EOVERFLOW;
        if (narrow.lo_offset != info64->lo_offset)
                return -EOVERFLOW;
        if (narrow.lo_init[0] != info64->lo_init[0])
                return -EOVERFLOW;
        if (narrow.lo_init[1] != info64->lo_init[1])
                return -EOVERFLOW;

        if (copy_to_user(arg, &narrow, sizeof(narrow)))
                return -EFAULT;

        return 0;
}

/*
 * Compat LOOP_SET_STATUS backend: convert the 32-bit structure from user
 * space and apply it.  Returns the conversion error if that step fails,
 * otherwise the result of loop_set_status().
 */
static int
loop_set_status_compat(struct loop_device *lo,
                       const struct compat_loop_info __user *arg)
{
        struct loop_info64 wide;
        int err = loop_info64_from_compat(arg, &wide);

        if (err < 0)
                return err;

        return loop_set_status(lo, &wide);
}

/*
 * Compat LOOP_GET_STATUS backend: query the device and hand the result to
 * user space in the 32-bit layout.  Returns -EINVAL for a NULL @arg, the
 * error from loop_get_status(), or the result of loop_info64_to_compat().
 */
static int
loop_get_status_compat(struct loop_device *lo,
                       struct compat_loop_info __user *arg)
{
        struct loop_info64 wide;
        int ret;

        if (!arg)
                return -EINVAL;

        ret = loop_get_status(lo, &wide);
        if (ret)
                return ret;

        return loop_info64_to_compat(&wide, arg);
}

/*
 * lo_compat_ioctl - ioctl entry point for compat (32-bit) callers
 *
 * LOOP_SET_STATUS and LOOP_GET_STATUS use a different structure layout for
 * compat callers and are handled by dedicated helpers.  The other known
 * commands are forwarded to lo_ioctl(); for the first group below the
 * argument is first passed through compat_ptr().  Unknown commands return
 * -ENOIOCTLCMD.
 */
static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        struct loop_device *lo = bdev->bd_disk->private_data;
        int err;

        switch(cmd) {
        case LOOP_SET_STATUS:
                err = loop_set_status_compat(lo,
                             (const struct compat_loop_info __user *)arg);
                break;
        case LOOP_GET_STATUS:
                err = loop_get_status_compat(lo,
                                     (struct compat_loop_info __user *)arg);
                break;
        /* Commands whose argument is converted with compat_ptr() first. */
        case LOOP_SET_CAPACITY:
        case LOOP_CLR_FD:
        case LOOP_GET_STATUS64:
        case LOOP_SET_STATUS64:
        case LOOP_CONFIGURE:
                arg = (unsigned long) compat_ptr(arg);
                fallthrough;
        /* Commands whose argument is forwarded unchanged. */
        case LOOP_SET_FD:
        case LOOP_CHANGE_FD:
        case LOOP_SET_BLOCK_SIZE:
        case LOOP_SET_DIRECT_IO:
                err = lo_ioctl(bdev, mode, cmd, arg);
                break;
        default:
                err = -ENOIOCTLCMD;
                break;
        }
        return err;
}
#endif

/*
 * lo_open - block device open hook
 *
 * Takes a reference on the loop device by incrementing lo_refcnt under
 * loop_ctl_mutex.  Returns -ENXIO if the disk has no loop device attached,
 * or the error from mutex_lock_killable().
 */
static int lo_open(struct block_device *bdev, fmode_t mode)
{
        struct loop_device *lo;
        int ret;

        ret = mutex_lock_killable(&loop_ctl_mutex);
        if (ret)
                return ret;

        lo = bdev->bd_disk->private_data;
        if (lo)
                atomic_inc(&lo->lo_refcnt);
        else
                ret = -ENXIO;

        mutex_unlock(&loop_ctl_mutex);
        return ret;
}

/*
 * lo_release - block device release hook
 *
 * Drops one lo_refcnt reference.  On the last close of a bound device:
 *  - with LO_FLAGS_AUTOCLEAR set, the device is moved to Lo_rundown and
 *    torn down through __loop_clr_fd(), called without loop_ctl_mutex;
 *  - otherwise the configuration is kept and the queue is frozen and
 *    unfrozen once to flush bios that may still be in flight.
 */
static void lo_release(struct gendisk *disk, fmode_t mode)
{
        struct loop_device *lo;

        mutex_lock(&loop_ctl_mutex);
        lo = disk->private_data;

        if (atomic_dec_return(&lo->lo_refcnt) == 0) {
                if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
                        if (lo->lo_state == Lo_bound) {
                                lo->lo_state = Lo_rundown;
                                mutex_unlock(&loop_ctl_mutex);
                                /*
                                 * Autoclear: stop the loop thread and
                                 * remove the configuration after the
                                 * last close.
                                 */
                                __loop_clr_fd(lo, true);
                                return;
                        }
                } else if (lo->lo_state == Lo_bound) {
                        /*
                         * Keep thread (if running) and configuration,
                         * but flush possible ongoing bios in the thread.
                         */
                        blk_mq_freeze_queue(lo->lo_queue);
                        blk_mq_unfreeze_queue(lo->lo_queue);
                }
        }

        mutex_unlock(&loop_ctl_mutex);
}

/*
 * Block device operations of a loop disk.  The compat ioctl handler is
 * only wired up when CONFIG_COMPAT is enabled, matching the #ifdef around
 * lo_compat_ioctl() itself.
 */
static const struct block_device_operations lo_fops = {
        .owner =        THIS_MODULE,
        .open =                lo_open,
        .release =        lo_release,
        .ioctl =        lo_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl =        lo_compat_ioctl,
#endif
};

/*
 * And now the modules code and kernel interface.
 */
/*
 * Module parameters.  Both are registered with mode 0444, so their sysfs
 * entries are read-only.  max_part is not defined in this part of the
 * file.
 */
static int max_loop;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

/*
 * loop_register_transfer - install a transfer function table
 * @funcs: table to register; funcs->number selects the slot
 *
 * Returns 0 on success, or -EINVAL if the slot number is out of range or
 * the slot is already occupied.
 */
int loop_register_transfer(struct loop_func_table *funcs)
{
        unsigned int slot = funcs->number;

        if (slot >= MAX_LO_CRYPT)
                return -EINVAL;
        if (xfer_funcs[slot])
                return -EINVAL;

        xfer_funcs[slot] = funcs;
        return 0;
}

/*
 * idr_for_each() callback used by loop_unregister_transfer(): if the loop
 * device in @ptr currently uses the transfer table in @data, release it.
 * Always returns 0 so that the iteration visits every device.
 */
static int unregister_transfer_cb(int id, void *ptr, void *data)
{
        struct loop_func_table *going_away = data;
        struct loop_device *dev = ptr;

        mutex_lock(&loop_ctl_mutex);
        if (dev->lo_encryption == going_away)
                loop_release_xfer(dev);
        mutex_unlock(&loop_ctl_mutex);

        return 0;
}

/*
 * loop_unregister_transfer - remove a transfer function table
 * @number: slot to clear
 *
 * Slot 0 cannot be unregistered.  Returns -EINVAL for slot 0, for an
 * out-of-range slot or for an empty slot, else 0 after the table has been
 * released from every loop device that was using it.
 */
int loop_unregister_transfer(int number)
{
        struct loop_func_table *xfer;
        unsigned int slot = number;

        if (slot == 0 || slot >= MAX_LO_CRYPT)
                return -EINVAL;

        xfer = xfer_funcs[slot];
        if (xfer == NULL)
                return -EINVAL;

        xfer_funcs[slot] = NULL;
        idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
        return 0;
}

EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

/*
 * loop_queue_rq - blk-mq ->queue_rq handler (see loop_mq_ops)
 *
 * Starts the request, decides whether it is handled with AIO, records the
 * blkcg css to use, and hands the command to the device's kthread worker.
 * Returns BLK_STS_IOERR if the device is not bound, BLK_STS_OK otherwise.
 */
static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        struct loop_device *lo = rq->q->queuedata;

        blk_mq_start_request(rq);

        if (lo->lo_state != Lo_bound)
                return BLK_STS_IOERR;

        /* Flush, discard and write-zeroes never use AIO; the rest follow use_dio. */
        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
                cmd->use_aio = false;
                break;
        default:
                cmd->use_aio = lo->use_dio;
                break;
        }

        /* always use the first bio's css */
        /*
         * The "else" inside the #ifdef pairs with the assignment after the
         * #endif, so without CONFIG_BLK_CGROUP cmd->css is always NULL.
         */
#ifdef CONFIG_BLK_CGROUP
        if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
                cmd->css = &bio_blkcg(rq->bio)->css;
                css_get(cmd->css);
        } else
#endif
                cmd->css = NULL;
        kthread_queue_work(&lo->worker, &cmd->work);

        return BLK_STS_OK;
}

/*
 * loop_handle_cmd - execute one loop command from the worker thread
 *
 * Writes to a read-only device fail with -EIO without touching the backing
 * file.  The request is completed here unless it was issued as AIO and
 * do_req_filebacked() returned 0.  -EOPNOTSUPP is passed through in
 * cmd->ret unchanged; every other error is reported as -EIO.
 */
static void loop_handle_cmd(struct loop_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        const bool write = op_is_write(req_op(rq));
        struct loop_device *lo = rq->q->queuedata;
        int ret;

        if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
                ret = -EIO;
        else
                ret = do_req_filebacked(lo, rq);

        /* A successfully issued AIO request is not completed here. */
        if (cmd->use_aio && !ret)
                return;

        /* complete non-aio request */
        if (ret && ret != -EOPNOTSUPP)
                cmd->ret = -EIO;
        else
                cmd->ret = ret;

        if (likely(!blk_should_fake_timeout(rq->q)))
                blk_mq_complete_request(rq);
}

/*
 * kthread work function: recover the loop_cmd that embeds @work and
 * process it.
 */
static void loop_queue_work(struct kthread_work *work)
{
        loop_handle_cmd(container_of(work, struct loop_cmd, work));
}

/*
 * blk-mq ->init_request handler: set up the kthread work item embedded in
 * the request's loop_cmd so that it runs loop_queue_work().  Always
 * returns 0.
 */
static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
                unsigned int hctx_idx, unsigned int numa_node)
{
        struct loop_cmd *pdu = blk_mq_rq_to_pdu(rq);

        kthread_init_work(&pdu->work, loop_queue_work);

        return 0;
}

/*
 * blk-mq operations of the loop queue; installed as tag_set.ops in
 * loop_add().  lo_complete_rq is defined outside this part of the file.
 */
static const struct blk_mq_ops loop_mq_ops = {
        .queue_rq       = loop_queue_rq,
        .init_request        = loop_init_request,
        .complete        = lo_complete_rq,
};

/*
 * loop_add - allocate and register a new, unbound loop device
 * @l: on success, receives the new device
 * @i: requested index, or a negative value to take any free index
 *
 * Allocates the loop_device, reserves an index in loop_index_idr, sets up
 * the blk-mq tag set and queue, allocates the gendisk and adds it.
 *
 * Returns the device number on success or a negative errno: -ENOMEM on
 * allocation failure, -EEXIST if the specific index @i is already taken,
 * or the error from idr_alloc(), blk_mq_alloc_tag_set() or
 * blk_mq_init_queue().  On failure everything set up so far is undone
 * through the out_* labels, in reverse order.
 */
static int loop_add(struct loop_device **l, int i)
{
        struct loop_device *lo;
        struct gendisk *disk;
        int err;

        err = -ENOMEM;
        lo = kzalloc(sizeof(*lo), GFP_KERNEL);
        if (!lo)
                goto out;

        lo->lo_state = Lo_unbound;

        /* allocate id, if @i >= 0, we're requesting that specific id */
        if (i >= 0) {
                err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
                if (err == -ENOSPC)
                        err = -EEXIST;
        } else {
                err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
        }
        if (err < 0)
                goto out_free_dev;
        i = err;

        /*
         * Note: this value is overwritten by blk_mq_alloc_tag_set() below
         * before any exit path can return it.
         */
        err = -ENOMEM;
        lo->tag_set.ops = &loop_mq_ops;
        lo->tag_set.nr_hw_queues = 1;
        lo->tag_set.queue_depth = 128;
        lo->tag_set.numa_node = NUMA_NO_NODE;
        lo->tag_set.cmd_size = sizeof(struct loop_cmd);
        lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
                BLK_MQ_F_NO_SCHED;
        lo->tag_set.driver_data = lo;

        err = blk_mq_alloc_tag_set(&lo->tag_set);
        if (err)
                goto out_free_idr;

        lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
        if (IS_ERR(lo->lo_queue)) {
                err = PTR_ERR(lo->lo_queue);
                goto out_cleanup_tags;
        }
        lo->lo_queue->queuedata = lo;

        blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);

        /*
         * By default, we do buffer IO, so it doesn't make sense to enable
         * merge because the I/O submitted to backing file is handled page by
         * page. For directio mode, merge does help to dispatch bigger request
         * to underlayer disk. We will enable merge once directio is enabled.
         */
        blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);

        err = -ENOMEM;
        disk = lo->lo_disk = alloc_disk(1 << part_shift);
        if (!disk)
                goto out_free_queue;

        /*
         * Disable partition scanning by default. The in-kernel partition
         * scanning can be requested individually per-device during its
         * setup. Userspace can always add and remove partitions from all
         * devices. The needed partition minors are allocated from the
         * extended minor space, the main loop device numbers will continue
         * to match the loop minors, regardless of the number of partitions
         * used.
         *
         * If max_part is given, partition scanning is globally enabled for
         * all loop devices. The minors for the main loop devices will be
         * multiples of max_part.
         *
         * Note: Global-for-all-devices, set-only-at-init, read-only module
         * parameters like 'max_loop' and 'max_part' make things needlessly
         * complicated, are too static, inflexible and may surprise
         * userspace tools. Parameters like this in general should be avoided.
         */
        if (!part_shift)
                disk->flags |= GENHD_FL_NO_PART_SCAN;
        disk->flags |= GENHD_FL_EXT_DEVT;
        atomic_set(&lo->lo_refcnt, 0);
        lo->lo_number                = i;
        spin_lock_init(&lo->lo_lock);
        disk->major                = LOOP_MAJOR;
        disk->first_minor        = i << part_shift;
        disk->fops                = &lo_fops;
        disk->private_data        = lo;
        disk->queue                = lo->lo_queue;
        sprintf(disk->disk_name, "loop%d", i);
        add_disk(disk);
        *l = lo;
        return lo->lo_number;

        /* Error unwinding: each label undoes one successful setup step. */
out_free_queue:
        blk_cleanup_queue(lo->lo_queue);
out_cleanup_tags:
        blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
        idr_remove(&loop_index_idr, i);
out_free_dev:
        kfree(lo);
out:
        return err;
}

/*
 * Tear down a single loop device.
 *
 * The gendisk is deleted first, then the request queue and its blk-mq tag
 * set are released, the disk reference is put and finally the loop_device
 * structure itself is freed.  @lo must not be used after this returns.
 */
static void loop_remove(struct loop_device *lo)
{
        struct gendisk *disk = lo->lo_disk;

        del_gendisk(disk);
        blk_cleanup_queue(lo->lo_queue);
        blk_mq_free_tag_set(&lo->tag_set);
        put_disk(disk);
        kfree(lo);
}

/*
 * idr_for_each() callback used by loop_lookup() to pick an unbound device.
 *
 * @id:   idr index of the entry (unused)
 * @ptr:  the loop_device stored at that index
 * @data: address of the caller's "struct loop_device *" result slot
 *
 * Returns 1 after storing @ptr in the result slot when the device is in
 * state Lo_unbound, and 0 (leaving the slot untouched) otherwise.
 */
static int find_free_cb(int id, void *ptr, void *data)
{
        struct loop_device *candidate = ptr;
        struct loop_device **result = data;

        if (candidate->lo_state != Lo_unbound)
                return 0;

        *result = candidate;
        return 1;
}

/*
 * Look up a loop device in loop_index_idr.
 *
 * @l: where to store the device that was found
 * @i: index to look up; a negative value asks for any device that
 *     find_free_cb() reports as unbound
 *
 * Returns the device's lo_number and sets *@l on success.  Returns -ENODEV
 * and leaves *@l untouched when nothing matches.
 */
static int loop_lookup(struct loop_device **l, int i)
{
        struct loop_device *found;

        if (i >= 0) {
                /* lookup and return a specific i */
                found = idr_find(&loop_index_idr, i);
                if (!found)
                        return -ENODEV;
        } else {
                /* any unbound device will do */
                if (idr_for_each(&loop_index_idr, &find_free_cb, &found) != 1)
                        return -ENODEV;
        }

        *l = found;
        return found->lo_number;
}

/*
 * Probe callback registered with blk_register_region() in loop_init().
 *
 * Maps @dev to a loop index (minor shifted right by part_shift), looks the
 * device up and, if it does not exist yet, tries to create it with
 * loop_add().  Both steps run under loop_ctl_mutex.
 *
 * *@part is always set to 0.  Returns the result of get_disk_and_module()
 * for the device's disk, or NULL when the device could neither be found
 * nor created.
 */
static struct kobject *loop_probe(dev_t dev, int *part, void *data)
{
        int idx = MINOR(dev) >> part_shift;
        struct kobject *kobj = NULL;
        struct loop_device *lo;

        mutex_lock(&loop_ctl_mutex);
        if (loop_lookup(&lo, idx) >= 0 || loop_add(&lo, idx) >= 0)
                kobj = get_disk_and_module(lo->lo_disk);
        mutex_unlock(&loop_ctl_mutex);

        *part = 0;
        return kobj;
}

/*
 * ioctl handler for the loop-control misc device.
 *
 * All commands run under loop_ctl_mutex, which is taken killably; a
 * failure to take it is returned to the caller as-is.
 *
 * LOOP_CTL_ADD:      create device @parm; -EEXIST if it already exists,
 *                    otherwise the result of loop_add().
 * LOOP_CTL_REMOVE:   remove device @parm; the lookup error if it does not
 *                    exist, -EBUSY if it is not Lo_unbound or still has
 *                    references, otherwise the device number that was
 *                    returned by the lookup.
 * LOOP_CTL_GET_FREE: return the number of an unbound device, creating a
 *                    new one with loop_add(-1) when none is found.
 * anything else:     -ENOSYS.
 */
static long loop_control_ioctl(struct file *file, unsigned int cmd,
                               unsigned long parm)
{
        struct loop_device *lo;
        int ret;

        ret = mutex_lock_killable(&loop_ctl_mutex);
        if (ret)
                return ret;

        switch (cmd) {
        case LOOP_CTL_ADD:
                if (loop_lookup(&lo, parm) >= 0)
                        ret = -EEXIST;
                else
                        ret = loop_add(&lo, parm);
                break;
        case LOOP_CTL_REMOVE:
                ret = loop_lookup(&lo, parm);
                if (ret < 0)
                        break;
                /* only idle, unreferenced devices may go away */
                if (lo->lo_state != Lo_unbound ||
                    atomic_read(&lo->lo_refcnt) > 0) {
                        ret = -EBUSY;
                        break;
                }
                lo->lo_disk->private_data = NULL;
                idr_remove(&loop_index_idr, lo->lo_number);
                loop_remove(lo);
                break;
        case LOOP_CTL_GET_FREE:
                ret = loop_lookup(&lo, -1);
                if (ret < 0)
                        ret = loop_add(&lo, -1);
                break;
        default:
                ret = -ENOSYS;
                break;
        }
        mutex_unlock(&loop_ctl_mutex);

        return ret;
}

/*
 * File operations for the loop-control device (referenced from loop_misc).
 * Native and compat ioctls share the same handler, loop_control_ioctl().
 */
static const struct file_operations loop_ctl_fops = {
        .open                = nonseekable_open,
        .unlocked_ioctl        = loop_control_ioctl,
        .compat_ioctl        = loop_control_ioctl,
        .owner                = THIS_MODULE,
        .llseek                = noop_llseek,
};

/*
 * Misc device "loop-control", registered in loop_init() and deregistered
 * in loop_exit().  It uses the fixed minor LOOP_CTRL_MINOR and the
 * loop_ctl_fops operations.
 */
static struct miscdevice loop_misc = {
        .minor                = LOOP_CTRL_MINOR,
        .name                = "loop-control",
        .fops                = &loop_ctl_fops,
};

/* Module aliases for the control device's minor number and device name. */
MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
MODULE_ALIAS("devname:loop-control");

/*
 * Module initialisation for the loop driver.
 *
 * Derives part_shift from max_part, validates max_part/max_loop against
 * DISK_MAX_PARTS and MINORBITS, registers the loop-control misc device,
 * the LOOP_MAJOR block major and the probe region, and pre-creates the
 * initial set of devices.
 *
 * Returns 0 on success, -EINVAL for out-of-range parameters, the
 * misc_register() error, or -EIO when register_blkdev() fails (the misc
 * device is deregistered again in that case).
 */
static int __init loop_init(void)
{
        struct loop_device *lo;
        unsigned long range;
        int nr, err, i;

        part_shift = 0;
        if (max_part > 0) {
                part_shift = fls(max_part);

                /*
                 * max_part is exported to user space, so round it to what
                 * part_shift really provides; users rely on it to work out
                 * the correct minor number when creating more devices.
                 *
                 * The -1 accounts for partition 0, which is reserved for
                 * the whole disk.
                 */
                max_part = (1UL << part_shift) - 1;
        }

        if ((1UL << part_shift) > DISK_MAX_PARTS)
                return -EINVAL;

        if (max_loop > 1UL << (MINORBITS - part_shift))
                return -EINVAL;

        /*
         * With max_loop given, exactly that many devices are created up
         * front and it also acts as a hard limit.  Without it,
         * CONFIG_BLK_DEV_LOOP_MIN_COUNT devices are created at init time
         * and further ones can be requested on demand through
         * /dev/loop-control or by accessing a 'dead' device node.
         */
        if (!max_loop) {
                nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
                range = 1UL << MINORBITS;
        } else {
                nr = max_loop;
                range = max_loop << part_shift;
        }

        err = misc_register(&loop_misc);
        if (err < 0)
                return err;

        if (register_blkdev(LOOP_MAJOR, "loop")) {
                misc_deregister(&loop_misc);
                return -EIO;
        }

        blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
                                  THIS_MODULE, loop_probe, NULL, NULL);

        /* pre-create number of devices given by config or max_loop */
        mutex_lock(&loop_ctl_mutex);
        for (i = 0; i < nr; i++)
                loop_add(&lo, i);
        mutex_unlock(&loop_ctl_mutex);

        printk(KERN_INFO "loop: module loaded\n");
        return 0;
}

/*
 * idr_for_each() callback used by loop_exit(): removes the loop device
 * stored in @ptr.  Always returns 0.
 */
static int loop_exit_cb(int id, void *ptr, void *data)
{
        loop_remove(ptr);
        return 0;
}

/*
 * Module exit: with loop_ctl_mutex held, remove every device recorded in
 * loop_index_idr, destroy the idr, then unregister the probe region (same
 * range computation as loop_init()), the block major and the misc device.
 */
static void __exit loop_exit(void)
{
        unsigned long range = 1UL << MINORBITS;

        if (max_loop)
                range = max_loop << part_shift;

        mutex_lock(&loop_ctl_mutex);

        idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
        idr_destroy(&loop_index_idr);

        blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
        unregister_blkdev(LOOP_MAJOR, "loop");

        misc_deregister(&loop_misc);

        mutex_unlock(&loop_ctl_mutex);
}

/* Register loop_init()/loop_exit() as the module's entry and exit points. */
module_init(loop_init);
module_exit(loop_exit);

#ifndef MODULE
/*
 * Handler for the "max_loop=" parameter registered via __setup() below;
 * only compiled when MODULE is not defined.  The value is parsed with
 * base auto-detection (base 0) and stored in max_loop.  Always returns 1.
 */
static int __init max_loop_setup(char *str)
{
        long parsed = simple_strtol(str, NULL, 0);

        max_loop = parsed;
        return 1;
}

__setup("max_loop=", max_loop_setup);
#endif


































































































    1 


    2 













































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_BIT_H
#define _LINUX_WAIT_BIT_H

/*
 * Linux wait-bit related types and methods:
 */
#include <linux/wait.h>

/*
 * Key identifying what a waiter is waiting for.
 *
 * @flags:   the word being waited on (set from "word" by
 *           __WAIT_BIT_KEY_INITIALIZER())
 * @bit_nr:  the bit within that word
 * @timeout: not set by __WAIT_BIT_KEY_INITIALIZER(); presumably the
 *           deadline used by the *_timeout variants - confirm against
 *           the implementation
 */
struct wait_bit_key {
        void                        *flags;
        int                        bit_nr;
        unsigned long                timeout;
};

/*
 * A wait queue entry paired with the wait_bit_key it is waiting on.
 * DEFINE_WAIT_BIT() declares and initialises one of these.
 */
struct wait_bit_queue_entry {
        struct wait_bit_key        key;
        struct wait_queue_entry        wq_entry;
};

/*
 * Initialiser for a struct wait_bit_key: sets .flags to @word and .bit_nr
 * to @bit.  .timeout is not named and therefore zero-initialised.
 */
#define __WAIT_BIT_KEY_INITIALIZER(word, bit)                                        \
        { .flags = word, .bit_nr = bit, }

/*
 * Type of the "action" callback handed to the wait-on-bit helpers below.
 * It receives the key being waited on and the sleep mode; bit_wait(),
 * bit_wait_io(), bit_wait_timeout() and bit_wait_io_timeout() are the
 * implementations declared in this header.
 */
typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);

/*
 * Out-of-line wait/wake primitives.  The static inline wait_on_bit*()
 * wrappers in this header fall back to the out_of_line_*() functions when
 * their inline test of the bit does not succeed.  Every parameter is
 * named so the prototypes document themselves.
 */
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(void *word, int bit);
int out_of_line_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(void *word, int bit, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(void *word, int bit, wait_bit_action_f *action, unsigned int mode);
struct wait_queue_head *bit_waitqueue(void *word, int bit);
extern void __init wait_bit_init(void);

/* Wake function installed in wq_entry.func by DEFINE_WAIT_BIT(). */
int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key);

/*
 * Declare a struct wait_bit_queue_entry called @name that waits on @bit of
 * @word.  The embedded wait queue entry belongs to "current", uses
 * wake_bit_function() as its wake function and starts with a
 * self-initialised list head.
 */
#define DEFINE_WAIT_BIT(name, word, bit)                                        \
        struct wait_bit_queue_entry name = {                                        \
                .key = __WAIT_BIT_KEY_INITIALIZER(word, bit),                        \
                .wq_entry = {                                                        \
                        .private        = current,                                \
                        .func                = wake_bit_function,                        \
                        .entry                =                                        \
                                LIST_HEAD_INIT((name).wq_entry.entry),                \
                },                                                                \
        }

/*
 * Stock wait_bit_action_f implementations.  In this header wait_on_bit()
 * and wait_on_bit_lock() pass bit_wait, the *_io() variants pass
 * bit_wait_io, and wait_on_bit_timeout() passes bit_wait_timeout.
 */
extern int bit_wait(struct wait_bit_key *key, int mode);
extern int bit_wait_io(struct wait_bit_key *key, int mode);
extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);

/**
 * wait_on_bit - sleep until a bit becomes clear
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Part of the accessor API of the standard hashed waitqueue table.
 * Threads that only want to observe the bit going clear, and do not
 * intend to set it themselves, call wait_on_bit().
 *
 * If the bit is already clear the function returns immediately;
 * otherwise it waits out of line using bit_wait().
 *
 * Return: zero if the bit was cleared, or non-zero if the process
 * received a signal and @mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();
        if (test_bit(bit, word))
                return out_of_line_wait_on_bit(word, bit, bit_wait, mode);
        return 0;
}

/**
 * wait_on_bit_io - sleep until a bit becomes clear, accounted as I/O wait
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Same as wait_on_bit() except that the actual waiting is done with
 * io_schedule() rather than schedule(), by way of bit_wait_io().
 *
 * Return: zero if the bit was cleared, or non-zero if the process
 * received a signal and @mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();
        if (test_bit(bit, word))
                return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode);
        return 0;
}

/**
 * wait_on_bit_timeout - sleep until a bit becomes clear or a timeout elapses
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 * @timeout: timeout, in jiffies
 *
 * Like wait_on_bit(), but gives up once @timeout has elapsed.  The
 * waiting itself is performed by bit_wait_timeout().
 *
 * Return: zero if the bit was cleared before @timeout elapsed, or
 * non-zero if @timeout elapsed or the process received a signal and
 * @mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
                    unsigned long timeout)
{
        might_sleep();
        if (test_bit(bit, word))
                return out_of_line_wait_on_bit_timeout(word, bit,
                                                       bit_wait_timeout,
                                                       mode, timeout);
        return 0;
}

/**
 * wait_on_bit_action - sleep until a bit becomes clear, with a custom action
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Like wait_on_bit(), but the caller supplies the function that does the
 * waiting, which gives fine control over how the wait is carried out.
 *
 * Return: zero if the bit was cleared, or non-zero if the process
 * received a signal and @mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
                   unsigned mode)
{
        might_sleep();
        if (test_bit(bit, word))
                return out_of_line_wait_on_bit(word, bit, action, mode);
        return 0;
}

/**
 * wait_on_bit_lock - wait for a bit to be clear, then set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Part of the accessor API of the standard hashed waitqueue table, for
 * callers that use a bit as a lock: they wait for it to be clear with
 * the intention of setting it, and clear it again when done.  Threads
 * waiting to be able to set the bit call wait_on_bit_lock().
 *
 * The bit is first tried with test_and_set_bit(); only when it was
 * already set does the caller wait out of line using bit_wait().
 *
 * Return: zero if the bit was (eventually) found to be clear and was
 * set, or non-zero if a signal was delivered to the process and @mode
 * allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
        return 0;
}

/**
 * wait_on_bit_lock_io - wait for a bit to be clear, then set it (I/O wait)
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Same as wait_on_bit_lock() except that the actual waiting is done
 * with io_schedule() rather than schedule(), by way of bit_wait_io().
 *
 * Return: zero if the bit was (eventually) found to be clear and was
 * set, or non-zero if a signal was delivered to the process and @mode
 * allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io,
                                                    mode);
        return 0;
}

/**
 * wait_on_bit_lock_action - wait for a bit to be clear, then set it,
 *                           with a custom action
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Like wait_on_bit_lock(), but the caller supplies the function that
 * does the waiting, which gives fine control over how the wait is
 * carried out.
 *
 * Return: zero if the bit was (eventually) found to be clear and was
 * set, or non-zero if a signal was delivered to the process and @mode
 * allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
                        unsigned mode)
{
        might_sleep();
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, action, mode);
        return 0;
}

/*
 * Helpers behind the wait_var_event*() macros below.
 * ___wait_var_event() obtains its wait queue head from __var_waitqueue()
 * and prepares its entry with init_wait_var_entry().  wake_up_var() is not
 * used in this header; presumably it wakes waiters on @var - confirm
 * against its definition.
 */
extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
extern void wake_up_var(void *var);
extern wait_queue_head_t *__var_waitqueue(void *p);

/*
 * Core of the wait_var_event*() family, written as a statement expression
 * whose value is __ret.
 *
 * @var:       passed to __var_waitqueue() and init_wait_var_entry()
 * @condition: expression re-evaluated after every prepare_to_wait_event()
 * @state:     task state handed to prepare_to_wait_event()
 * @exclusive: non-zero queues the entry with WQ_FLAG_EXCLUSIVE
 * @ret:       initial value of __ret
 * @cmd:       statement run on each iteration while @condition is false
 *             (schedule() or schedule_timeout() in the callers below)
 *
 * The loop ends normally when @condition is true, after which
 * finish_wait() runs.  If @state is interruptible according to
 * ___wait_is_interruptible() and prepare_to_wait_event() returned
 * non-zero, that value becomes __ret and control jumps to __out,
 * bypassing finish_wait().
 * NOTE(review): presumably prepare_to_wait_event() has already dequeued
 * the entry on that path - confirm in its implementation.
 */
#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)        \
({                                                                        \
        __label__ __out;                                                \
        struct wait_queue_head *__wq_head = __var_waitqueue(var);        \
        struct wait_bit_queue_entry __wbq_entry;                        \
        long __ret = ret; /* explicit shadow */                                \
                                                                        \
        init_wait_var_entry(&__wbq_entry, var,                                \
                            exclusive ? WQ_FLAG_EXCLUSIVE : 0);                \
        for (;;) {                                                        \
                long __int = prepare_to_wait_event(__wq_head,                \
                                                   &__wbq_entry.wq_entry, \
                                                   state);                \
                if (condition)                                                \
                        break;                                                \
                                                                        \
                if (___wait_is_interruptible(state) && __int) {                \
                        __ret = __int;                                        \
                        goto __out;                                        \
                }                                                        \
                                                                        \
                cmd;                                                        \
        }                                                                \
        finish_wait(__wq_head, &__wbq_entry.wq_entry);                        \
__out:        __ret;                                                                \
})

/*
 * Slow path of wait_var_event(): non-exclusive wait in
 * TASK_UNINTERRUPTIBLE, calling schedule() between checks of @condition.
 */
#define __wait_var_event(var, condition)                                \
        ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * wait_var_event - wait on @var until @condition becomes true
 *
 * Checks @condition once up front and only enters the slow path when it
 * is false.  May sleep (might_sleep()).  Produces no value.
 */
#define wait_var_event(var, condition)                                        \
do {                                                                        \
        might_sleep();                                                        \
        if (condition)                                                        \
                break;                                                        \
        __wait_var_event(var, condition);                                \
} while (0)

/*
 * Slow path of wait_var_event_killable(): non-exclusive wait in
 * TASK_KILLABLE, calling schedule() between checks of @condition.
 */
#define __wait_var_event_killable(var, condition)                        \
        ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,                \
                          schedule())

/*
 * wait_var_event_killable - wait on @var until @condition becomes true,
 * sleeping in TASK_KILLABLE
 *
 * Evaluates to 0 when @condition was already true or became true, or to
 * the value produced by ___wait_var_event() when the wait ended early.
 * May sleep (might_sleep()).
 */
#define wait_var_event_killable(var, condition)                                \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_killable(var, condition);        \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_timeout(): non-exclusive wait in
 * TASK_UNINTERRUPTIBLE with __ret starting at @timeout and being
 * replaced by the return value of schedule_timeout(__ret) on every
 * iteration.  @condition is wrapped in ___wait_cond_timeout(), which is
 * defined outside this header; presumably it also ends the wait once
 * __ret reaches zero - confirm in <linux/wait.h>.
 */
#define __wait_var_event_timeout(var, condition, timeout)                \
        ___wait_var_event(var, ___wait_cond_timeout(condition),                \
                          TASK_UNINTERRUPTIBLE, 0, timeout,                \
                          __ret = schedule_timeout(__ret))

/*
 * wait_var_event_timeout - wait on @var until @condition becomes true or
 * @timeout expires
 *
 * Evaluates to __ret, which starts out as @timeout and is only replaced
 * when the slow path is taken.  May sleep (might_sleep()).
 */
#define wait_var_event_timeout(var, condition, timeout)                        \
({                                                                        \
        long __ret = timeout;                                                \
        might_sleep();                                                        \
        if (!___wait_cond_timeout(condition))                                \
                __ret = __wait_var_event_timeout(var, condition, timeout); \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_interruptible(): non-exclusive wait in
 * TASK_INTERRUPTIBLE, calling schedule() between checks of @condition.
 */
#define __wait_var_event_interruptible(var, condition)                        \
        ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * wait_var_event_interruptible - wait on @var until @condition becomes
 * true, sleeping in TASK_INTERRUPTIBLE
 *
 * Evaluates to 0 when @condition was already true or became true, or to
 * the value produced by ___wait_var_event() when the wait ended early.
 * May sleep (might_sleep()).
 */
#define wait_var_event_interruptible(var, condition)                        \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_interruptible(var, condition);        \
        __ret;                                                                \
})

/**
 * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
 *
 * @bit: the bit of the word being waited on
 * @word: the word being waited on, a kernel virtual address
 *
 * You can use this helper if bitflags are manipulated atomically rather than
 * non-atomically under a lock.
 *
 * Note the argument order: @bit comes first here, whereas wake_up_bit()
 * and the wait_on_bit*() helpers take the word first.
 */
static inline void clear_and_wake_up_bit(int bit, void *word)
{
        /* Clear the bit first, so woken waiters observe it as clear. */
        clear_bit_unlock(bit, word);
        /* See wake_up_bit() for which memory barrier you need to use. */
        smp_mb__after_atomic();
        wake_up_bit(word, bit);
}

#endif /* _LINUX_WAIT_BIT_H */


























































    1 




    1 
















    1 































































    1 

















    1 





    1 



















    4 

    1 






    4 
    4 




    4 


    4 


























    4 










    4 
























    1 





























    1 











    1 









































    1 






    1 





    1 




    1 


    1 






    1 
    1 






    1 





    1 










































    1 











































    1 





    1 


    1 





    1 




    1 






    1 


















































    1 





    1 

    1 
    1 
    1 






    1 



    1 



    1 


    1 












    1 

    1 














    1 


    1 


















    1 




    1 







    1 

    1 





    1 
    1 










    1 
    1 

    1 




























    1 






    1 



    1 


    1 




















    1 








    1 


    1 




    1 



















    1 







    1 
    1 

    1 





    1 





    1 
    1 

    1 



    1 










    1 


    1 

    1 


    1 




    1 


    1 

    1 





































































    1 
    1 




    1 






























    1 


    1 


    1 

































































































































































































































































































    1 

    1 
    1 

    1 


    1 
    1 
    1 

    1 
    1 





    1 
    1 
    1 
    1 



    1 





    1 








































    2 


    2 


    2 

    2 
    2 
    1 


    1 




    2 



    1 
    2 
    1 
    1 







    2 






    2 
    2 

    2 



    1 














    1 



    1 




















    1 


    1 
    1 






    1 



    1 




    1 



















    1 
    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

static inline unsigned int xa_lock_type(const struct xarray *xa)
{
        return (__force unsigned int)xa->xa_flags & 3;
}

/*
 * Take the lock for @xas using the variant selected by @lock_type:
 * xas_lock_irq() for XA_LOCK_IRQ, xas_lock_bh() for XA_LOCK_BH and
 * plain xas_lock() for anything else.
 */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
        if (lock_type == XA_LOCK_IRQ) {
                xas_lock_irq(xas);
                return;
        }
        if (lock_type == XA_LOCK_BH) {
                xas_lock_bh(xas);
                return;
        }
        xas_lock(xas);
}

/*
 * Release the lock for @xas using the variant selected by @lock_type:
 * xas_unlock_irq() for XA_LOCK_IRQ, xas_unlock_bh() for XA_LOCK_BH and
 * plain xas_unlock() for anything else.
 */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
        if (lock_type == XA_LOCK_IRQ) {
                xas_unlock_irq(xas);
                return;
        }
        if (lock_type == XA_LOCK_BH) {
                xas_unlock_bh(xas);
                return;
        }
        xas_unlock(xas);
}

/* True if @xa has the XA_FLAGS_TRACK_FREE flag set. */
static inline bool xa_track_free(const struct xarray *xa)
{
        return (xa->xa_flags & XA_FLAGS_TRACK_FREE) != 0;
}

/* True if @xa has the XA_FLAGS_ZERO_BUSY flag set. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
        return (xa->xa_flags & XA_FLAGS_ZERO_BUSY) != 0;
}

/*
 * Set the array-level flag bit for @mark in @xa.  The flags word is
 * only written when the bit is not already set.
 */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
        if (xa->xa_flags & XA_FLAGS_MARK(mark))
                return;

        xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/*
 * Clear the array-level flag bit for @mark in @xa.  The flags word is
 * only written when the bit is currently set.
 */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
        if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
                return;

        xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/* Return the bitmap inside @node that holds the per-slot bits for @mark. */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
        unsigned int which = (__force unsigned)mark;

        return node->marks[which];
}

/* True if slot @offset of @node currently has @mark set. */
static inline bool node_get_mark(struct xa_node *node,
                unsigned int offset, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return test_bit(offset, bits);
}

/* Sets the mark bit for a slot; returns true if the bit was already set */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        /* The result of __test_and_set_bit() is handed straight back */
        return __test_and_set_bit(offset, bits);
}

/* Clears the mark bit for a slot; returns true if the bit was previously set */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        /* The result of __test_and_clear_bit() is handed straight back */
        return __test_and_clear_bit(offset, bits);
}

/* True if at least one of the XA_CHUNK_SIZE slots in @node has @mark set. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
        if (bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE))
                return false;

        return true;
}

/* Set @mark on every one of the XA_CHUNK_SIZE slots in @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        bitmap_fill(bits, XA_CHUNK_SIZE);
}

/*
 * Advance @mark to the next mark number.  The value is cast to a plain
 * unsigned for the addition and cast back to xa_mark_t for the store.
 * @mark is both read and assigned, so it must be a simple lvalue.
 */
#define mark_inc(mark) do { \
        mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Merge all marks to the first entry
 * @xas: Array operation state.
 *
 * Set a mark on the first entry if any entry has it set.  Clear marks on
 * all sibling entries.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
        unsigned int mark = 0;
        unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;

        if (!xas->xa_sibs)
                return;

        do {
                unsigned long *marks = xas->xa_node->marks[mark];
                if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
                        continue;
                __set_bit(xas->xa_offset, marks);
                bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
        } while (mark++ != (__force unsigned)XA_MARK_MAX);
}

/* extracts the offset within this node from the index */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
        /* Drop the bits handled below this node, keep this node's chunk */
        unsigned long shifted = index >> node->shift;

        return shifted & XA_CHUNK_MASK;
}

/* Recompute @xas->xa_offset from @xas->xa_index for the current node. */
static void xas_set_offset(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        xas->xa_offset = get_offset(xas->xa_index, node);
}

/* move the index either forwards (find) or backwards (sibling slot) */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
        const unsigned int shift = xas->xa_node->shift;
        unsigned long index = xas->xa_index;

        /* Apply the chunk mask at this node's level, then add the new offset */
        index &= ~XA_CHUNK_MASK << shift;
        index += offset << shift;
        xas->xa_index = index;
}

/* Step @xas to the next slot of the current node and update the index to match. */
static void xas_advance(struct xa_state *xas)
{
        xas_move_index(xas, ++xas->xa_offset);
}

/*
 * Put @xas into the XAS_BOUNDS state.  Always returns NULL, so a caller
 * can write "return set_bounds(xas);".
 */
static void *set_bounds(struct xa_state *xas)
{
        xas->xa_node = XAS_BOUNDS;
        return NULL;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just return where we've got to.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, set @xas->xa_node to XAS_BOUNDS and return NULL.
 * Otherwise set @xas->xa_node to NULL and return the current head of
 * the array.
 */
static void *xas_start(struct xa_state *xas)
{
        void *entry;

        if (xas_valid(xas))
                return xas_reload(xas);
        if (xas_error(xas))
                return NULL;

        entry = xa_head(xas->xa);
        if (!xa_is_node(entry)) {
                if (xas->xa_index)
                        return set_bounds(xas);
        } else {
                if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
                        return set_bounds(xas);
        }

        xas->xa_node = NULL;
        return entry;
}

/*
 * Move @xas into @node: pick the slot that covers @xas->xa_index, follow
 * a sibling entry to the slot it refers to, record node and offset in
 * @xas and return the entry found there.
 */
static void *xas_descend(struct xa_state *xas, struct xa_node *node)
{
        unsigned int slot = get_offset(xas->xa_index, node);
        void *found = xa_entry(xas->xa, node, slot);

        xas->xa_node = node;
        if (!xa_is_sibling(found)) {
                xas->xa_offset = slot;
                return found;
        }

        /* A sibling entry names the slot that holds the real entry */
        slot = xa_to_sibling(found);
        found = xa_entry(xas->xa, node, slot);
        xas->xa_offset = slot;
        return found;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Usually walks the @xas to the appropriate state to load the entry
 * stored at xa_index.  However, it will do nothing and return %NULL if
 * @xas is in an error state.  xas_load() will never expand the tree.
 *
 * If the xa_state is set up to operate on a multi-index entry, xas_load()
 * may return %NULL or an internal entry, even if there are entries
 * present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
        void *cur;

        for (cur = xas_start(xas); xa_is_node(cur); ) {
                struct xa_node *child = xa_to_node(cur);

                /* Stop above nodes smaller than the requested order */
                if (child->shift < xas->xa_shift)
                        return cur;

                cur = xas_descend(xas, child);

                /* Stop once a shift-0 node has been read */
                if (!child->shift)
                        return cur;
        }
        return cur;
}
EXPORT_SYMBOL_GPL(xas_load);

/*
 * TODO: Move the radix tree node cache here.  Until then, the node cache
 * and its RCU free callback are reached through these extern declarations.
 */
extern struct kmem_cache *radix_tree_node_cachep;
extern void radix_tree_node_rcu_free(struct rcu_head *head);

/* Marker stored in node->array by xa_node_free() before the node is handed to call_rcu() */
#define XA_RCU_FREE        ((struct xarray *)1)

/*
 * Hand @node to RCU for freeing.  The node's array pointer is replaced
 * with the XA_RCU_FREE marker first, then radix_tree_node_rcu_free() is
 * queued with call_rcu().
 */
static void xa_node_free(struct xa_node *node)
{
        /* A node still linked on a private list must not be freed */
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->array = XA_RCU_FREE;
        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * This function is now internal-only.
 */
static void xas_destroy(struct xa_state *xas)
{
        struct xa_node *node, *next;

        /* Preallocated nodes are chained through their parent pointer */
        for (node = xas->xa_alloc; node; node = next) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                next = rcu_dereference_raw(node->parent);
                radix_tree_node_rcu_free(&node->rcu_head);
                xas->xa_alloc = next;
        }
}

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
        struct xa_node *spare;

        /* Not an -ENOMEM failure: just release anything preallocated */
        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        spare = kmem_cache_alloc(radix_tree_node_cachep, gfp);
        xas->xa_alloc = spare;
        if (!spare)
                return false;

        spare->parent = NULL;
        XA_NODE_BUG_ON(spare, !list_empty(&spare->private_list));
        /* Clear the error so the caller can retry the operation */
        xas->xa_node = XAS_RESTART;
        return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem().
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
        __must_hold(xas->xa->xa_lock)
{
        unsigned int lock_type = xa_lock_type(xas->xa);
        struct xa_node *spare;

        /* Not an -ENOMEM failure: just release anything preallocated */
        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        if (!gfpflags_allow_blocking(gfp)) {
                spare = kmem_cache_alloc(radix_tree_node_cachep, gfp);
        } else {
                /* The allocation may block, so drop the lock around it */
                xas_unlock_type(xas, lock_type);
                spare = kmem_cache_alloc(radix_tree_node_cachep, gfp);
                xas_lock_type(xas, lock_type);
        }
        xas->xa_alloc = spare;
        if (!spare)
                return false;

        spare->parent = NULL;
        XA_NODE_BUG_ON(spare, !list_empty(&spare->private_list));
        /* Clear the error so the caller can retry the operation */
        xas->xa_node = XAS_RESTART;
        return true;
}

/*
 * Report a change to @node through the @xas->xa_update callback, if one
 * is installed.  Without a callback, the node is expected not to be on
 * any private list.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
        if (!xas->xa_update) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                return;
        }

        xas->xa_update(node);
}

/*
 * Obtain and initialise a node with the given @shift whose parent is the
 * node @xas currently points at (if any).  A node stashed in
 * @xas->xa_alloc is used first; otherwise a GFP_NOWAIT allocation is
 * attempted.  Returns the node, or NULL if @xas is invalid or the
 * allocation failed (in which case -ENOMEM is recorded in @xas).
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
        struct xa_node *parent = xas->xa_node;
        struct xa_node *node;

        if (xas_invalid(xas))
                return NULL;

        node = xas->xa_alloc;
        if (!node) {
                /* Nothing preallocated: try without blocking or warning */
                gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

                if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                        gfp |= __GFP_ACCOUNT;

                node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
                if (!node) {
                        xas_set_err(xas, -ENOMEM);
                        return NULL;
                }
        } else {
                /* Take ownership of the preallocated node */
                xas->xa_alloc = NULL;
        }

        if (parent) {
                /* Account for the new child in its parent */
                node->offset = xas->xa_offset;
                parent->count++;
                XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
                xas_update(xas, parent);
        }

        XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->shift = shift;
        node->count = 0;
        node->nr_values = 0;
        RCU_INIT_POINTER(node->parent, xas->xa_node);
        node->array = xas->xa;

        return node;
}

#ifdef CONFIG_XARRAY_MULTI
/*
 * Number of indices covered by @xas: one slot plus its siblings, each
 * slot spanning 2^xa_shift indices.
 */
static unsigned long xas_size(const struct xa_state *xas)
{
        unsigned long slots = xas->xa_sibs + 1UL;

        return slots << xas->xa_shift;
}
#endif

/*
 * Use this to calculate the maximum index that will need to be created
 * in order to add the entry described by @xas.  Because we cannot store a
 * multi-index entry at index 0, the calculation is a little more complex
 * than you might expect.
 */
static unsigned long xas_max(struct xa_state *xas)
{
        unsigned long max = xas->xa_index;
#ifdef CONFIG_XARRAY_MULTI
        unsigned long mask;

        /* Single-index entry: the index itself is the maximum */
        if (!xas->xa_shift && !xas->xa_sibs)
                return max;

        mask = xas_size(xas) - 1;
        max |= mask;
        /* The range starts at index 0: ask for one index more */
        if (max == mask)
                max++;
#endif

        return max;
}

/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
        unsigned int shift;

        /* A head that is not a node can only hold index 0 */
        if (!xa_is_node(entry))
                return 0;

        shift = xa_to_node(entry)->shift;
        return (XA_CHUNK_SIZE << shift) - 1;
}

/*
 * xas_shrink() - Remove redundant levels from the top of the tree.
 * @xas: Array operation state; @xas->xa_node is the node to start from.
 *
 * While the node being examined holds exactly one entry and that entry
 * is in slot 0, the entry replaces the node as the head of the array and
 * the node is freed.  If the new head is itself a node, the loop repeats
 * on it.  When at least one level is removed, @xas->xa_node is left as
 * XAS_BOUNDS.
 *
 * NOTE(review): the head pointer is overwritten unconditionally, so the
 * starting node is assumed to be the current head; the caller visible in
 * this file only calls this when the node has no parent.
 */
static void xas_shrink(struct xa_state *xas)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = xas->xa_node;

        for (;;) {
                void *entry;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Only a node with a single entry can be collapsed */
                if (node->count != 1)
                        break;
                /* ... and that single entry has to be the one in slot 0 */
                entry = xa_entry_locked(xa, node, 0);
                if (!entry)
                        break;
                /* A non-node entry may only be hoisted out of a shift-0 node */
                if (!xa_is_node(entry) && node->shift)
                        break;
                /* With XA_FLAGS_ZERO_BUSY, a zero entry at the head is stored as NULL */
                if (xa_is_zero(entry) && xa_zero_busy(xa))
                        entry = NULL;
                xas->xa_node = XAS_BOUNDS;

                RCU_INIT_POINTER(xa->xa_head, entry);
                /* Carry slot 0's lack of XA_FREE_MARK over to the array-level flag */
                if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
                        xa_mark_clear(xa, XA_FREE_MARK);

                node->count = 0;
                node->nr_values = 0;
                /*
                 * Leave a retry marker in the old slot when it held a
                 * non-node entry (presumably for the benefit of readers
                 * still looking at this node -- confirm).
                 */
                if (!xa_is_node(entry))
                        RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
                xas_update(xas, node);
                xa_node_free(node);
                if (!xa_is_node(entry))
                        break;
                /* The child is the new head: detach it and try to shrink again */
                node = xa_to_node(entry);
                node->parent = NULL;
        }
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete the @xas->xa_node.  This will fail if @xas->xa_node
 * still has a non-zero entry count.
 */
static void xas_delete_node(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        for (;;) {
                struct xa_node *parent;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* A node that still holds entries cannot be deleted */
                if (node->count)
                        break;

                /* Move @xas up to the parent slot that referenced this node */
                parent = xa_parent_locked(xas->xa, node);
                xas->xa_node = parent;
                xas->xa_offset = node->offset;
                xa_node_free(node);

                /* That was the top node: the array is now empty */
                if (!parent) {
                        xas->xa->xa_head = NULL;
                        xas->xa_node = XAS_BOUNDS;
                        return;
                }

                /* Unhook the freed node, then check whether the parent emptied too */
                parent->slots[xas->xa_offset] = NULL;
                parent->count--;
                XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
                node = parent;
                xas_update(xas, node);
        }

        /* Stopped at a non-empty node with no parent: it may be collapsible */
        if (!node->parent)
                xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
 */
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
        unsigned int offset = 0;
        struct xa_node *node = top;

        /* Iterative walk of the subtree rooted at @top; no recursion is used */
        for (;;) {
                void *entry = xa_entry_locked(xas->xa, node, offset);

                /*
                 * Descend into a child node; the walk resumes at the slot
                 * following it once the child has been freed.
                 */
                if (node->shift && xa_is_node(entry)) {
                        node = xa_to_node(entry);
                        offset = 0;
                        continue;
                }
                /* Any other non-NULL entry is replaced by a retry marker */
                if (entry)
                        RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
                offset++;
                /* Ran off the end of this node: free it and continue in its parent */
                while (offset == XA_CHUNK_SIZE) {
                        struct xa_node *parent;

                        /* Read everything needed from @node before it is freed */
                        parent = xa_parent_locked(xas->xa, node);
                        offset = node->offset + 1;
                        node->count = 0;
                        node->nr_values = 0;
                        xas_update(xas, node);
                        xa_node_free(node);
                        /* @top itself has been freed: the whole subtree is done */
                        if (node == top)
                                return;
                        node = parent;
                }
        }
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain @xas->xa_index
 */
static int xas_expand(struct xa_state *xas, void *head)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = NULL;
        unsigned int shift = 0;
        unsigned long max = xas_max(xas);

        if (!head) {
                /*
                 * Empty array: nothing needs re-parenting, so only report
                 * the shift required (0 if just index 0 is needed).
                 */
                if (max == 0)
                        return 0;
                while ((max >> shift) >= XA_CHUNK_SIZE)
                        shift += XA_CHUNK_SHIFT;
                return shift + XA_CHUNK_SHIFT;
        } else if (xa_is_node(head)) {
                /* Any new top node sits one level above the current head node */
                node = xa_to_node(head);
                shift = node->shift + XA_CHUNK_SHIFT;
        }
        xas->xa_node = NULL;

        /* Add one level per iteration until the tree can hold index @max */
        while (max > max_index(head)) {
                xa_mark_t mark = 0;

                XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
                node = xas_alloc(xas, shift);
                if (!node)
                        return -ENOMEM;

                /* The old head becomes the new node's only entry, in slot 0 */
                node->count = 1;
                if (xa_is_value(head))
                        node->nr_values = 1;
                RCU_INIT_POINTER(node->slots[0], head);

                /* Propagate the aggregated mark info to the new top node */
                for (;;) {
                        if (xa_track_free(xa) && mark == XA_FREE_MARK) {
                                /*
                                 * Every slot of the new node starts out
                                 * marked free.  Slot 0 keeps that mark only
                                 * if the array was already marked as having
                                 * free entries; otherwise it is cleared and
                                 * the array-level mark is set.
                                 */
                                node_mark_all(node, XA_FREE_MARK);
                                if (!xa_marked(xa, XA_FREE_MARK)) {
                                        node_clear_mark(node, 0, XA_FREE_MARK);
                                        xa_mark_set(xa, XA_FREE_MARK);
                                }
                        } else if (xa_marked(xa, mark)) {
                                /* The old head carried this mark; so does slot 0 */
                                node_set_mark(node, 0, mark);
                        }
                        if (mark == XA_MARK_MAX)
                                break;
                        mark_inc(mark);
                }

                /*
                 * Now that the new node is fully initialised, we can add
                 * it to the tree
                 */
                if (xa_is_node(head)) {
                        xa_to_node(head)->offset = 0;
                        rcu_assign_pointer(xa_to_node(head)->parent, node);
                }
                head = xa_mk_node(node);
                rcu_assign_pointer(xa->xa_head, head);
                xas_update(xas, node);

                shift += XA_CHUNK_SHIFT;
        }

        /*
         * Leave @xas on the top node (NULL if the head is not a node) and
         * return the shift one level above the top of the tree.
         */
        xas->xa_node = node;
        return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store().  It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL.  If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
        struct xarray *xa = xas->xa;
        void *entry;
        void __rcu **slot;
        struct xa_node *node = xas->xa_node;
        int shift;
        unsigned int order = xas->xa_shift;

        if (xas_top(node)) {
                /*
                 * xas_top(): presumably @xas has not been walked into the
                 * tree yet (confirm against the header).  Start from the
                 * head and grow the tree first if it is too small.
                 */
                entry = xa_head_locked(xa);
                xas->xa_node = NULL;
                /* With XA_FLAGS_ZERO_BUSY an empty head is treated as a zero entry */
                if (!entry && xa_zero_busy(xa))
                        entry = XA_ZERO_ENTRY;
                shift = xas_expand(xas, entry);
                if (shift < 0)
                        return NULL;
                /* The entry may not live in the head itself: force one node level */
                if (!shift && !allow_root)
                        shift = XA_CHUNK_SHIFT;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        } else if (xas_error(xas)) {
                return NULL;
        } else if (node) {
                /* Already positioned on a node: continue from the current slot */
                unsigned int offset = xas->xa_offset;

                shift = node->shift;
                entry = xa_entry_locked(xa, node, offset);
                slot = &node->slots[offset];
        } else {
                /* Positioned on the head of the array */
                shift = 0;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        }

        /* Walk down, allocating missing nodes, until the requested order is reached */
        while (shift > order) {
                shift -= XA_CHUNK_SHIFT;
                if (!entry) {
                        node = xas_alloc(xas, shift);
                        /* Allocation failed; xas_alloc() has recorded the error in @xas */
                        if (!node)
                                break;
                        /* Every slot of a freshly created node is free */
                        if (xa_track_free(xa))
                                node_mark_all(node, XA_FREE_MARK);
                        rcu_assign_pointer(*slot, xa_mk_node(node));
                } else if (xa_is_node(entry)) {
                        node = xa_to_node(entry);
                } else {
                        /* A non-node entry already occupies this position: stop here */
                        break;
                }
                entry = xas_descend(xas, node);
                slot = &node->slots[xas->xa_offset];
        }

        return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
 *
 * Creates all of the slots in the range covered by @xas.  Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range.  This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 */
void xas_create_range(struct xa_state *xas)
{
        /* Remember the caller's range so that it can be restored afterwards */
        unsigned long index = xas->xa_index;
        unsigned char shift = xas->xa_shift;
        unsigned char sibs = xas->xa_sibs;

        /*
         * Switch @xas to single-index mode, positioned on the last index of
         * the range; the loop below then works backwards towards @index.
         */
        xas->xa_index |= ((sibs + 1UL) << shift) - 1;
        if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
                xas->xa_offset |= sibs;
        xas->xa_shift = 0;
        xas->xa_sibs = 0;

        for (;;) {
                xas_create(xas, true);
                if (xas_error(xas))
                        goto restore;
                /* Finished once the chunk containing the first index exists */
                if (xas->xa_index <= (index | XA_CHUNK_MASK))
                        goto success;
                /* Step back by one chunk of XA_CHUNK_SIZE indices */
                xas->xa_index -= XA_CHUNK_SIZE;

                /*
                 * Reposition @xas on the slot before the node just left:
                 * move up to the parent, and keep climbing while the node
                 * left behind was in slot 0 of its parent.  Climbing also
                 * stops at a node whose shift is at least the original
                 * shift of the range.
                 */
                for (;;) {
                        struct xa_node *node = xas->xa_node;
                        if (node->shift >= shift)
                                break;
                        xas->xa_node = xa_parent_locked(xas->xa, node);
                        xas->xa_offset = node->offset - 1;
                        if (node->offset != 0)
                                break;
                }
        }

restore:
        /* Failure: put back the caller's shift, sibs and index; @xas->xa_node is not touched */
        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
        xas->xa_index = index;
        return;
success:
        /* Success: back on the first index; shift and sibs stay at zero */
        xas->xa_index = index;
        if (xas->xa_node)
                xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

/*
 * Apply a net change of @count entries and @values value entries to
 * @node, report the change through xas_update(), and attempt to delete
 * the node if entries were removed.  Does nothing for a NULL @node or
 * when both deltas are zero.
 */
static void update_node(struct xa_state *xas, struct xa_node *node,
                int count, int values)
{
        if (!node)
                return;
        if (count == 0 && values == 0)
                return;

        node->count += count;
        node->nr_values += values;
        XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
        XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
        xas_update(xas, node);

        /* Entries went away, so the node may now be empty */
        if (count < 0)
                xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range).  This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Return: The old entry at this index.
 */
void *xas_store(struct xa_state *xas, void *entry)
{
        struct xa_node *node;
        /* Default to the head slot; replaced below if the walk ended in a node */
        void __rcu **slot = &xas->xa->xa_head;
        unsigned int offset, max;
        /* Net change in occupied slots and in value entries for @node */
        int count = 0;
        int values = 0;
        void *first, *next;
        bool value = xa_is_value(entry);

        if (entry) {
                /* Node and zero entries must not be stored directly in the head */
                bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
                first = xas_create(xas, allow_root);
        } else {
                /* Storing NULL never needs new nodes: just find the slot */
                first = xas_load(xas);
        }

        if (xas_invalid(xas))
                return first;
        node = xas->xa_node;
        /* The walk stopped in a node above the requested order: no sibling slots to fill */
        if (node && (xas->xa_shift < node->shift))
                xas->xa_sibs = 0;
        /* Nothing to do when the same single entry is already stored */
        if ((first == entry) && !xas->xa_sibs)
                return first;

        next = first;
        offset = xas->xa_offset;
        max = xas->xa_offset + xas->xa_sibs;
        if (node) {
                slot = &node->slots[offset];
                if (xas->xa_sibs)
                        xas_squash_marks(xas);
        }
        if (!entry)
                xas_init_marks(xas);

        for (;;) {
                /*
                 * Must clear the marks before setting the entry to NULL,
                 * otherwise xas_for_each_marked may find a NULL entry and
                 * stop early.  rcu_assign_pointer contains a release barrier
                 * so the mark clearing will appear to happen before the
                 * entry is set to NULL.
                 */
                rcu_assign_pointer(*slot, entry);
                /* The slot previously pointed at a subtree: free all of it */
                if (xa_is_node(next) && (!node || node->shift))
                        xas_free_nodes(xas, xa_to_node(next));
                /* Stored in the head: there is no node to account against */
                if (!node)
                        break;
                /* +1 when an empty slot is filled, -1 when an occupied slot is emptied */
                count += !next - !entry;
                /* +1 when a value replaces a non-value, -1 for the reverse */
                values += !xa_is_value(first) - !value;
                if (entry) {
                        if (offset == max)
                                break;
                        /* Slots after the first one hold a sibling entry naming the first slot */
                        if (!xa_is_sibling(entry))
                                entry = xa_mk_sibling(xas->xa_offset);
                } else {
                        if (offset == XA_CHUNK_MASK)
                                break;
                }
                next = xa_entry_locked(xas->xa, node, ++offset);
                if (!xa_is_sibling(next)) {
                        /* When erasing, stop at the first non-sibling slot past the range */
                        if (!entry && (offset > max))
                                break;
                        first = next;
                }
                slot++;
        }

        update_node(xas, node, count, values);
        return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
 */
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *leaf;

        /* An invalid operation state never reports a mark. */
        if (xas_invalid(xas))
                return false;

        leaf = xas->xa_node;
        if (leaf)
                return node_get_mark(leaf, xas->xa_offset, mark);

        /* No node: the answer comes from the array-wide mark state. */
        return xa_marked(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);

/**
 * xas_set_mark() - Sets the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Sets the specified mark on this entry, and walks up the tree setting it
 * on all the ancestor entries.  Does nothing if @xas has not been walked to
 * an entry, or is in an error state.
 */
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *cursor;
        unsigned int slot;

        if (xas_invalid(xas))
                return;

        /*
         * Climb from the current node towards the head, marking the slot
         * that leads back down to the entry at each level.  The walk stops
         * as soon as node_set_mark() returns nonzero (presumably meaning the
         * mark was already set, so the ancestors are marked too).
         */
        slot = xas->xa_offset;
        for (cursor = xas->xa_node; cursor;
             cursor = xa_parent_locked(xas->xa, cursor)) {
                if (node_set_mark(cursor, slot, mark))
                        return;
                slot = cursor->offset;
        }

        /* Reached the top: record the mark on the array itself. */
        if (!xa_marked(xas->xa, mark))
                xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);

/**
 * xas_clear_mark() - Clears the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Clears the specified mark on this entry, and walks back to the head
 * attempting to clear it on all the ancestor entries.  Does nothing if
 * @xas has not been walked to an entry, or is in an error state.
 */
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *cursor;
        unsigned int slot;

        if (xas_invalid(xas))
                return;

        /*
         * Climb towards the head, clearing the mark on the slot leading to
         * the entry.  Stop when node_clear_mark() returns zero or when the
         * node still has this mark on some other slot: in either case the
         * ancestors are left untouched.
         */
        slot = xas->xa_offset;
        for (cursor = xas->xa_node; cursor;
             cursor = xa_parent_locked(xas->xa, cursor)) {
                if (!node_clear_mark(cursor, slot, mark) ||
                    node_any_mark(cursor, mark))
                        return;
                slot = cursor->offset;
        }

        /* Nothing below carries the mark any more; drop it from the array. */
        if (xa_marked(xas->xa, mark))
                xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);

/**
 * xas_init_marks() - Initialise all marks for the entry
 * @xas: XArray operation state.
 *
 * Initialise all marks for the entry specified by @xas.  If we're tracking
 * free entries with a mark, we need to set it on all entries.  All other
 * marks are cleared.
 *
 * This implementation is not as efficient as it could be; we may walk
 * up the tree multiple times.
 */
void xas_init_marks(const struct xa_state *xas)
{
        xa_mark_t mark = 0;

        for (;;) {
                if (xa_track_free(xas->xa) && mark == XA_FREE_MARK)
                        xas_set_mark(xas, mark);
                else
                        xas_clear_mark(xas, mark);
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
        unsigned int marks = 0;
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (node_get_mark(node, offset, mark))
                        marks |= 1 << (__force unsigned int)mark;
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }

        return marks;
}

static void node_set_marks(struct xa_node *node, unsigned int offset,
                        struct xa_node *child, unsigned int marks)
{
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (marks & (1 << (__force unsigned int)mark)) {
                        node_set_mark(node, offset, mark);
                        if (child)
                                node_mark_all(child, mark);
                }
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: New entry order.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.
 *
 * Context: May sleep if @gfp flags permit.
 */
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                gfp_t gfp)
{
        unsigned int remaining = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int sib_mask = xas->xa_sibs;

        /* XXX: no support for splitting really large entries yet */
        if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order))
                goto nomem;
        /* No new nodes are allocated in this case. */
        if (xas->xa_shift + XA_CHUNK_SHIFT > order)
                return;

        /* Allocate remaining + 1 nodes, each pre-filled with @entry. */
        do {
                struct xa_node *fresh;
                void *sib = NULL;
                unsigned int slot;

                fresh = kmem_cache_alloc(radix_tree_node_cachep, gfp);
                if (!fresh)
                        goto nomem;
                fresh->array = xas->xa;

                for (slot = 0; slot < XA_CHUNK_SIZE; slot++) {
                        if (slot & sib_mask) {
                                /* Points back at the preceding canonical slot. */
                                RCU_INIT_POINTER(fresh->slots[slot], sib);
                                continue;
                        }
                        /* Canonical slot: holds the entry itself. */
                        RCU_INIT_POINTER(fresh->slots[slot], entry);
                        sib = xa_mk_sibling(slot);
                }

                /* Push onto the xa_alloc list, chained through ->parent. */
                RCU_INIT_POINTER(fresh->parent, xas->xa_alloc);
                xas->xa_alloc = fresh;
        } while (remaining-- > 0);

        return;
nomem:
        xas_destroy(xas);
        xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);

/**
 * xas_split() - Split a multi-index entry into smaller entries.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: New entry order.
 *
 * The value in the entry is copied to all the replacement entries.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
        /* Number of slots beyond the first that the old entry occupies. */
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        /* Walk to the entry being split; also positions xas on its node. */
        void *curr = xas_load(xas);
        /* Net change to node->nr_values, applied once after the loop. */
        int values = 0;

        node = xas->xa_node;
        if (xas_top(node))
                return;

        /* Snapshot the old entry's marks so every replacement inherits them. */
        marks = node_get_marks(node, xas->xa_offset);

        /* Work downwards from the last slot of the old entry to the first. */
        offset = xas->xa_offset + sibs;
        do {
                if (xas->xa_shift < node->shift) {
                        /*
                         * The new entries live one level below this node:
                         * pop a node from the xa_alloc list (chained through
                         * ->parent) and install it in this slot.
                         */
                        struct xa_node *child = xas->xa_alloc;

                        xas->xa_alloc = rcu_dereference_raw(child->parent);
                        child->shift = node->shift - XA_CHUNK_SHIFT;
                        child->offset = offset;
                        child->count = XA_CHUNK_SIZE;
                        child->nr_values = xa_is_value(entry) ?
                                        XA_CHUNK_SIZE : 0;
                        RCU_INIT_POINTER(child->parent, node);
                        node_set_marks(node, offset, child, marks);
                        /* Publish the child only after it is fully set up. */
                        rcu_assign_pointer(node->slots[offset],
                                        xa_mk_node(child));
                        /* This slot now holds a node pointer, not a value. */
                        if (xa_is_value(curr))
                                values--;
                        xas_update(xas, child);
                } else {
                        /*
                         * The new entries live in this node: write @entry at
                         * the canonical slot and sibling entries above it.
                         */
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                }
        } while (offset-- > xas->xa_offset);

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);
#endif

/**
 * xas_pause() - Pause a walk to drop a lock.
 * @xas: XArray operation state.
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @xas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call xas_pause(), the xa_for_each()
 * iterator may be more appropriate.
 *
 * Note that xas_pause() only works for forward iteration.  If a user needs
 * to pause a reverse iteration, we will need a xas_pause_rev().
 */
void xas_pause(struct xa_state *xas)
{
        struct xa_node *node;
        unsigned long next;

        if (xas_invalid(xas))
                return;

        /* Whatever happens below, the next operation re-walks from the top. */
        node = xas->xa_node;
        xas->xa_node = XAS_RESTART;

        if (!node) {
                xas->xa_index++;
                return;
        }

        /* Step past the current slot and any sibling slots that follow it. */
        next = xas->xa_offset;
        do {
                next++;
        } while (next < XA_CHUNK_SIZE &&
                 xa_is_sibling(xa_entry(xas->xa, node, next)));

        xas->xa_index += (next - xas->xa_offset) << node->shift;
        /* The index wrapped around to zero: nothing is left to iterate. */
        if (xas->xa_index == 0)
                xas->xa_node = XAS_BOUNDS;
}
EXPORT_SYMBOL_GPL(xas_pause);

/*
 * __xas_prev() - Find the previous entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_prev() which handles all the complex cases
 * out of line.
 */
void *__xas_prev(struct xa_state *xas)
{
        void *entry;

        /* Move the index back one, unless the state is frozen. */
        if (!xas_frozen(xas->xa_node))
                xas->xa_index--;
        if (!xas->xa_node)
                return set_bounds(xas);
        /* Not positioned on a real node: do a fresh walk from the head. */
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        /* Step the slot back if it no longer matches the new index. */
        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset--;

        /*
         * 255 is the result of decrementing offset 0.
         * NOTE(review): this relies on xa_offset being an 8-bit unsigned
         * field; confirm against struct xa_state.
         * Climb to the parent until there is a slot to the left.
         */
        while (xas->xa_offset == 255) {
                xas->xa_offset = xas->xa_node->offset - 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through node entries until a non-node entry is found. */
        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_next() which handles all the complex cases
 * out of line.
 */
void *__xas_next(struct xa_state *xas)
{
        void *entry;

        /* Move the index forward one, unless the state is frozen. */
        if (!xas_frozen(xas->xa_node))
                xas->xa_index++;
        if (!xas->xa_node)
                return set_bounds(xas);
        /* Not positioned on a real node: do a fresh walk from the head. */
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        /* Step the slot forward if it no longer matches the new index. */
        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset++;

        /* Ran off the end of this node: climb until a slot to the right exists. */
        while (xas->xa_offset == XA_CHUNK_SIZE) {
                xas->xa_offset = xas->xa_node->offset + 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through node entries until a non-node entry is found. */
        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.  This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
        void *entry;

        /* Errors and an already out-of-bounds iterator yield nothing. */
        if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
                return NULL;
        if (xas->xa_index > max)
                return set_bounds(xas);

        if (!xas->xa_node) {
                /* A NULL node here is handled by moving to index 1 and stopping. */
                xas->xa_index = 1;
                return set_bounds(xas);
        } else if (xas->xa_node == XAS_RESTART) {
                /* First call (or after a pause): walk to xa_index. */
                entry = xas_load(xas);
                if (entry || xas_not_node(xas->xa_node))
                        return entry;
        } else if (!xas->xa_node->shift &&
                    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
                /*
                 * In a shift-0 node whose offset disagrees with the index:
                 * recompute the offset from xa_index - 1 so that the
                 * xas_advance() below moves on from there.
                 */
                xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
        }

        xas_advance(xas);

        while (xas->xa_node && (xas->xa_index <= max)) {
                /* Past the last slot of this node: continue in the parent. */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /* Node entry: descend and start at its first slot. */
                if (xa_is_node(entry)) {
                        xas->xa_node = xa_to_node(entry);
                        xas->xa_offset = 0;
                        continue;
                }
                /* Present entries are returned; sibling slots are skipped. */
                if (entry && !xa_is_sibling(entry))
                        return entry;

                xas_advance(xas);
        }

        /* Climbed past the top of the tree without finding anything. */
        if (!xas->xa_node)
                xas->xa_node = XAS_BOUNDS;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array.  This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
        /* Passed to xas_find_chunk(); cleared once xas has been repositioned. */
        bool advance = true;
        unsigned int offset;
        void *entry;

        if (xas_error(xas))
                return NULL;
        if (xas->xa_index > max)
                goto max;

        if (!xas->xa_node) {
                /* A NULL node here is handled by moving to index 1 and stopping. */
                xas->xa_index = 1;
                goto out;
        } else if (xas_top(xas->xa_node)) {
                /* Not yet walked: start from the head of the array. */
                advance = false;
                entry = xa_head(xas->xa);
                xas->xa_node = NULL;
                if (xas->xa_index > max_index(entry))
                        goto out;
                if (!xa_is_node(entry)) {
                        /* Head is not a node: only the array-wide mark applies. */
                        if (xa_marked(xas->xa, mark))
                                return entry;
                        xas->xa_index = 1;
                        goto out;
                }
                xas->xa_node = xa_to_node(entry);
                xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
        }

        while (xas->xa_index <= max) {
                /* Past the last slot of this node: continue in the parent. */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        advance = false;
                        continue;
                }

                if (!advance) {
                        /* Landed on a sibling slot: move to its canonical slot. */
                        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                        if (xa_is_sibling(entry)) {
                                xas->xa_offset = xa_to_sibling(entry);
                                xas_move_index(xas, xas->xa_offset);
                        }
                }

                /* Offset returned by the mark search within this node. */
                offset = xas_find_chunk(xas, advance, mark);
                if (offset > xas->xa_offset) {
                        advance = false;
                        xas_move_index(xas, offset);
                        /* Mind the wrap */
                        if ((xas->xa_index - 1) >= max)
                                goto max;
                        xas->xa_offset = offset;
                        /* Nothing marked in the rest of this node. */
                        if (offset == XA_CHUNK_SIZE)
                                continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /* NULL entries are only returned when searching for free slots. */
                if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
                        continue;
                if (!xa_is_node(entry))
                        return entry;
                /* Marked node entry: descend into it. */
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }

out:
        /* Past @max means restart state; otherwise report bounds. */
        if (xas->xa_index > max)
                goto max;
        return set_bounds(xas);
max:
        xas->xa_node = XAS_RESTART;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * Context: Any context.  Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
 */
void *xas_find_conflict(struct xa_state *xas)
{
        void *curr;

        if (xas_error(xas))
                return NULL;

        if (!xas->xa_node)
                return NULL;

        if (xas_top(xas->xa_node)) {
                /* Not yet walked: start at the head and descend to xa_index. */
                curr = xas_start(xas);
                if (!curr)
                        return NULL;
                while (xa_is_node(curr)) {
                        struct xa_node *node = xa_to_node(curr);
                        curr = xas_descend(xas, node);
                }
                if (curr)
                        return curr;
        }

        /* The current node sits above the level of the range: nothing more. */
        if (xas->xa_node->shift > xas->xa_shift)
                return NULL;

        for (;;) {
                if (xas->xa_node->shift == xas->xa_shift) {
                        /* At the range's own level: stop after its last slot. */
                        if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
                                break;
                } else if (xas->xa_offset == XA_CHUNK_MASK) {
                        /* Finished a lower-level node: go back up to its parent. */
                        xas->xa_offset = xas->xa_node->offset;
                        xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        continue;
                }
                /* Advance to the next slot and inspect it. */
                curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
                if (xa_is_sibling(curr))
                        continue;
                /* Descend along slot 0 until a non-node entry is reached. */
                while (xa_is_node(curr)) {
                        xas->xa_node = xa_to_node(curr);
                        xas->xa_offset = 0;
                        curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
                }
                if (curr)
                        return curr;
        }
        /* No conflict: step the offset back by xa_sibs before returning. */
        xas->xa_offset -= xas->xa_sibs;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *found;

        rcu_read_lock();
        for (;;) {
                found = xas_load(&xas);
                /* Zero entries are reported to the caller as NULL. */
                if (xa_is_zero(found))
                        found = NULL;
                /* Repeat the lookup for as long as xas_retry() asks for it. */
                if (!xas_retry(&xas, found))
                        break;
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(xa_load);

/*
 * Convert the outcome of an operation into the value handed back to the
 * caller: a zero entry becomes NULL, an error state returns xas->xa_node
 * (which holds the error), and anything else is returned unchanged.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
        if (xa_is_zero(curr))
                return NULL;

        return xas_error(xas) ? xas->xa_node : curr;
}

/**
 * __xa_erase() - Erase this entry from the XArray while locked.
 * @xa: XArray.
 * @index: Index into array.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 * Return: The entry which used to be at this index.
 */
void *__xa_erase(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        /* Erasing is storing NULL; the store returns the previous entry. */
        void *previous = xas_store(&xas, NULL);

        return xas_result(&xas, previous);
}
EXPORT_SYMBOL(__xa_erase);

/**
 * xa_erase() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * Return: The entry which used to be at this index.
 */
void *xa_erase(struct xarray *xa, unsigned long index)
{
        void *previous;

        /* Locked wrapper around __xa_erase(). */
        xa_lock(xa);
        previous = __xa_erase(xa, index);
        xa_unlock(xa);

        return previous;
}
EXPORT_SYMBOL(xa_erase);

/**
 * __xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *previous;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        /* In a free-tracking array a NULL store becomes a zero entry. */
        if (xa_track_free(xa) && !entry)
                entry = XA_ZERO_ENTRY;

        /* Repeat the store while __xas_nomem() asks for another attempt. */
        for (;;) {
                previous = xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, previous);
}
EXPORT_SYMBOL(__xa_store);

/**
 * xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from this index will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with @index are unaffected unless @entry is %NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
 * failed.
 */
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *previous;

        /* Locked wrapper around __xa_store(). */
        xa_lock(xa);
        previous = __xa_store(xa, index, entry, gfp);
        xa_unlock(xa);

        return previous;
}
EXPORT_SYMBOL(xa_store);

/**
 * __xa_cmpxchg() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *present;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        for (;;) {
                present = xas_load(&xas);
                /* Only replace the entry when it still equals @old. */
                if (present == old) {
                        xas_store(&xas, entry);
                        /* A NULL slot that just became occupied is no longer free. */
                        if (xa_track_free(xa) && entry && !present)
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, present);
}
EXPORT_SYMBOL(__xa_cmpxchg);

/**
 * __xa_insert() - Store this entry in the XArray if no entry is present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *present;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        /* Inserting NULL stores a zero entry instead. */
        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                present = xas_load(&xas);
                if (present) {
                        /* Something (possibly a zero entry) is already here. */
                        xas_set_err(&xas, -EBUSY);
                } else {
                        xas_store(&xas, entry);
                        if (xa_track_free(xa))
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Point @xas at @first and choose an xa_shift / xa_sibs pair for it, based
 * on @first and @last.  The caller (xa_store_range()) stores one such piece,
 * advances @first by xas_size() and calls this again until @last is passed.
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
                unsigned long last)
{
        unsigned int shift = 0;
        /* Number of indices after @first still to cover, at the current level. */
        unsigned long sibs = last - first;
        /* XA_CHUNK_MASK doubles as "not yet recorded" until the first level-up. */
        unsigned int offset = XA_CHUNK_MASK;

        xas_set(xas, first);

        /* Move up a level for as long as @first is aligned to a whole chunk. */
        while ((first & XA_CHUNK_MASK) == 0) {
                /* Fewer than a full chunk remains: stay at this level. */
                if (sibs < XA_CHUNK_MASK)
                        break;
                if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
                        break;
                shift += XA_CHUNK_SHIFT;
                /* Record the low bits of sibs once, on the first level-up. */
                if (offset == XA_CHUNK_MASK)
                        offset = sibs & XA_CHUNK_MASK;
                sibs >>= XA_CHUNK_SHIFT;
                first >>= XA_CHUNK_SHIFT;
        }

        /* Clamp the sibling count so the piece stays within a single node. */
        offset = first & XA_CHUNK_MASK;
        if (offset + sibs > XA_CHUNK_MASK)
                sibs = XA_CHUNK_MASK - offset;
        /* Drop one slot if the piece would otherwise extend beyond @last. */
        if ((((first + sibs + 1) << shift) - 1) > last)
                sibs -= 1;

        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with the affected indices are unaffected unless
 * @entry is %NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_internal(entry)))
                return XA_ERROR(-EINVAL);
        if (last < first)
                return XA_ERROR(-EINVAL);

        /* Outer loop: retry the whole operation while xas_nomem() asks for it. */
        do {
                xas_lock(&xas);
                if (entry) {
                        /*
                         * Before storing a non-NULL entry, call xas_create()
                         * at @last with an order taken from the lowest set
                         * bit of last + 1 (BITS_PER_LONG when last + 1 wraps
                         * to zero).
                         */
                        unsigned int order = BITS_PER_LONG;
                        if (last + 1)
                                order = __ffs(last + 1);
                        xas_set_order(&xas, last, order);
                        xas_create(&xas, true);
                        if (xas_error(&xas))
                                goto unlock;
                }
                /* Inner loop: store one piece at a time until @last is passed. */
                do {
                        xas_set_range(&xas, first, last);
                        xas_store(&xas, entry);
                        if (xas_error(&xas))
                                goto unlock;
                        first += xas_size(&xas);
                } while (first <= last);
unlock:
                xas_unlock(&xas);
        } while (xas_nomem(&xas, gfp));

        /* With curr == NULL this returns NULL, or the error in xas->xa_node. */
        return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xa_get_order() - Get the order of an entry.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xa_get_order(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *entry;
        int order = 0;

        rcu_read_lock();
        entry = xas_load(&xas);

        /* No entry, or no node to inspect: the order is 0. */
        if (!entry || !xas.xa_node)
                goto unlock;

        /*
         * Probe the slots at power-of-two distances from the entry's slot;
         * each one that holds a sibling entry adds one to the order.
         */
        for (;;) {
                unsigned int slot = xas.xa_offset + (1 << order);

                if (slot >= XA_CHUNK_SIZE)
                        break;
                /*
                 * Read the slot through xa_entry(), as the other lockless
                 * readers in this file do, rather than dereferencing the
                 * slots array directly while only the RCU read lock is held.
                 */
                if (!xa_is_sibling(xa_entry(xa, xas.xa_node, slot)))
                        break;
                order++;
        }

        /* Add the order contributed by the node's position in the tree. */
        order += xas.xa_node->shift;
unlock:
        rcu_read_unlock();

        return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @limit: Range for allocated ID.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        /* Allocation needs an array that tracks free entries. */
        if (WARN_ON_ONCE(!xa_track_free(xa)))
                return -EINVAL;

        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                /* Search for a slot carrying the free mark within the limit. */
                xas.xa_index = limit.min;
                xas_find_marked(&xas, limit.max, XA_FREE_MARK);
                if (xas.xa_node != XAS_RESTART)
                        *id = xas.xa_index;
                else
                        xas_set_err(&xas, -EBUSY);

                /* Store at the index found and clear its free mark. */
                xas_store(&xas, entry);
                xas_clear_mark(&xas, XA_FREE_MARK);

                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa whose index lies between @limit.min and
 * @limit.max, writes that index through @id and then stores @entry there,
 * so a concurrent lookup will not see an uninitialised @id.
 * The search starts at @next (or @limit.min if that is larger).  If that
 * attempt fails and it started above @limit.min, it is repeated from
 * @limit.min.  On success @next is set to one past the allocated ID.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        const u32 lowest = limit.min;
        int err;

        /* First attempt: start at the cursor, but never below the limit. */
        if (*next > lowest)
                limit.min = *next;
        err = __xa_alloc(xa, id, entry, limit, gfp);
        if (err == 0 && (xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED)) {
                /* The cursor overflowed earlier; report that wrap now. */
                xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
                err = 1;
        }

        /* Second attempt: only if the first one skipped part of the range. */
        if (err < 0 && limit.min > lowest) {
                limit.min = lowest;
                err = __xa_alloc(xa, id, entry, limit, gfp);
                if (err == 0)
                        err = 1;
        }

        if (err < 0)
                return err;

        /* Advance the cursor; remember if it overflowed back to zero. */
        *next = *id + 1;
        if (*next == 0)
                xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
        return err;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);

/**
 * __xa_set_mark() - Set this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * The mark is only set if the entry loaded from @index is not %NULL;
 * attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Nothing stored at @index: leave the marks alone. */
        if (!xas_load(&xas))
                return;

        xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);

/**
 * __xa_clear_mark() - Clear this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * The mark is only touched if the entry loaded from @index is not %NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Nothing stored at @index: there is no mark to clear. */
        if (!xas_load(&xas))
                return;

        xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);

/**
 * xa_get_mark() - Inquire whether this mark is set on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * The lookup runs under the RCU read lock only, so the answer may already
 * be out of date by the time it is returned.  If you need the result to be
 * stable, use a lock.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: True if the entry at @index has this mark set, false if it doesn't.
 */
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);
        bool marked = false;
        void *entry;

        rcu_read_lock();
        /*
         * Descend from the head for as long as the mark is reported set.
         * Reaching something that is not a node with the mark still set
         * means the entry itself is marked.
         */
        for (entry = xas_start(&xas); xas_get_mark(&xas, mark);
             entry = xas_descend(&xas, xa_to_node(entry))) {
                if (!xa_is_node(entry)) {
                        marked = true;
                        break;
                }
        }
        rcu_read_unlock();

        return marked;
}
EXPORT_SYMBOL(xa_get_mark);

/**
 * xa_set_mark() - Set this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper around __xa_set_mark().
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* __xa_set_mark() expects xa_lock to be held by its caller. */
        xa_lock(xa);
        __xa_set_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);

/**
 * xa_clear_mark() - Clear this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper around __xa_clear_mark().
 * Clearing a mark always succeeds.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* __xa_clear_mark() expects xa_lock to be held by its caller. */
        xa_lock(xa);
        __xa_clear_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);

/**
 * xa_find() - Search the XArray for an entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Returns the entry in @xa that matches @filter and has the lowest index
 * which is at least @indexp and no more than @max.  When an entry is
 * found, @indexp is updated to its index; otherwise @indexp is untouched.
 * The search is protected by the RCU read lock only, so it may not find
 * entries which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry, if found, otherwise %NULL.
 */
void *xa_find(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        /* A filter below XA_MAX_MARKS names a mark; anything else does not. */
        const bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *found;

        rcu_read_lock();
        do {
                found = by_mark ? xas_find_marked(&xas, max, filter)
                                : xas_find(&xas, max);
        } while (xas_retry(&xas, found));
        rcu_read_unlock();

        if (!found)
                return NULL;

        *indexp = xas.xa_index;
        return found;
}
EXPORT_SYMBOL(xa_find);

/*
 * Report whether xas->xa_index lies strictly beyond the first index covered
 * by slot xas->xa_offset of the current node (both taken relative to the
 * range the node spans).  Always false when CONFIG_XARRAY_MULTI is disabled
 * or when the state has no node.
 */
static bool xas_sibling(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long span_mask, slot_start;

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!node)
                return false;

        span_mask = (XA_CHUNK_SIZE << node->shift) - 1;
        slot_start = (unsigned long)xas->xa_offset << node->shift;

        return (xas->xa_index & span_mask) > slot_start;
}

/**
 * xa_find_after() - Search the XArray for a present entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter and has the lowest
 * index that is above @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may miss entries
 * which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The pointer, if found, otherwise %NULL.
 */
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp + 1);
        void *entry;

        /* *indexp + 1 wrapped to 0: there is no index above *indexp. */
        if (xas.xa_index == 0)
                return NULL;

        rcu_read_lock();
        for (;;) {
                /* A filter below XA_MAX_MARKS selects by mark. */
                if ((__force unsigned int)filter < XA_MAX_MARKS)
                        entry = xas_find_marked(&xas, max, filter);
                else
                        entry = xas_find(&xas, max);

                /* Stop if the search left the state invalid. */
                if (xas_invalid(&xas))
                        break;
                /*
                 * NOTE(review): presumably this skips a hit that is only a
                 * later part of a multi-index entry which begins at or
                 * before *indexp -- confirm against xas_sibling().
                 */
                if (xas_sibling(&xas))
                        continue;
                /* Anything other than a retry entry ends the search. */
                if (!xas_retry(&xas, entry))
                        break;
        }
        rcu_read_unlock();

        if (entry)
                *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Copy up to @n entries, walking @xas with xas_for_each() up to index @max,
 * into @dst under the RCU read lock.  Retry entries are skipped.
 * Returns the number of entries written to @dst.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each(xas, entry, max) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/*
 * Copy up to @n entries, walking @xas with xas_for_each_marked() for @mark
 * up to index @max, into @dst under the RCU read lock.  Retry entries are
 * skipped.  Returns the number of entries written to @dst.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n, xa_mark_t mark)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(xas, entry, max, mark) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * Copies up to @n entries that match @filter from the XArray.  The
 * copied entries will have indices between @start and @max, inclusive.
 * Nothing is copied, and 0 is returned, when @n is 0.
 *
 * The @filter may be an XArray mark value, in which case entries which are
 * marked with that mark will be copied.  It may also be %XA_PRESENT, in
 * which case all entries which are not %NULL will be copied.
 *
 * The entries returned may not represent a snapshot of the XArray at a
 * moment in time.  For example, if another thread stores to index 5, then
 * index 10, calling xa_extract() may return the old contents of index 5
 * and the new contents of index 10.  Indices not modified while this
 * function is running will not be skipped.
 *
 * If you need stronger guarantees, holding the xa_lock across calls to this
 * function will prevent concurrent modification.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The number of entries copied.
 */
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
                        unsigned long max, unsigned int n, xa_mark_t filter)
{
        XA_STATE(xas, xa, start);

        if (n == 0)
                return 0;

        /* A filter at or above XA_MAX_MARKS is not a mark: copy by presence. */
        if ((__force unsigned int)filter >= XA_MAX_MARKS)
                return xas_extract_present(&xas, dst, max, n);

        return xas_extract_marked(&xas, dst, max, n, filter);
}
EXPORT_SYMBOL(xa_extract);

/**
 * xa_delete_node() - Private interface for workingset code.
 * @node: Node to be removed from the tree.
 * @update: Function to call to update ancestor nodes.
 *
 * Builds an xa_state by hand that addresses the slot @node occupies in its
 * parent, then stores %NULL there.
 *
 * Context: xa_lock must be held on entry and will not be released.
 */
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
        /*
         * Address @node's own slot: the state's node is the parent, the
         * offset is @node's offset within it, and index/shift are derived
         * from @node's shift plus one chunk level.
         */
        struct xa_state xas = {
                .xa = node->array,
                .xa_index = (unsigned long)node->offset <<
                                (node->shift + XA_CHUNK_SHIFT),
                .xa_shift = node->shift + XA_CHUNK_SHIFT,
                .xa_offset = node->offset,
                .xa_node = xa_parent_locked(node->array, node),
                .xa_update = update,
        };

        /* Storing NULL over that slot is what removes the node. */
        xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node);        /* For the benefit of the test suite */

/**
 * xa_destroy() - Free all internal data structures.
 * @xa: XArray.
 *
 * After calling this function, the XArray is empty and has freed all memory
 * allocated for its internal data structures.  You are responsible for
 * freeing the objects referenced by the XArray.
 *
 * Context: Any context.  Takes and releases the xa_lock, interrupt-safe.
 */
void xa_destroy(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        unsigned long flags;
        void *entry;

        /* Operate on the array head rather than on any node. */
        xas.xa_node = NULL;
        xas_lock_irqsave(&xas, flags);
        /* Detach the old head first, then reset the array's marks. */
        entry = xa_head_locked(xa);
        RCU_INIT_POINTER(xa->xa_head, NULL);
        xas_init_marks(&xas);
        /*
         * NOTE(review): presumably the free mark has a different meaning
         * for arrays where xa_zero_busy() is true and must not remain set
         * on an empty array -- confirm against xa_zero_busy().
         */
        if (xa_zero_busy(xa))
                xa_mark_clear(xa, XA_FREE_MARK);
        /* lockdep checks we're still holding the lock in xas_free_nodes() */
        if (xa_is_node(entry))
                xas_free_nodes(&xas, xa_to_node(entry));
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);

#ifdef XA_DEBUG
/*
 * Debug helper: print a description of @node with pr_cont() -- its header
 * fields followed by every word of every mark bitmap.  A NULL @node prints
 * nothing; a pointer with either of its two low bits set is printed as a
 * bare pointer value only.
 */
void xa_dump_node(const struct xa_node *node)
{
        unsigned int mark, word;
        const char *kind;

        if (!node)
                return;

        if ((unsigned long)node & 3) {
                pr_cont("node %px\n", node);
                return;
        }

        kind = node->parent ? "offset" : "max";
        pr_cont("node %px %s %d parent %px shift %d count %d values %d "
                "array %px list %px %px marks",
                node, kind, node->offset,
                node->parent, node->shift, node->count, node->nr_values,
                node->array, node->private_list.prev, node->private_list.next);

        for (mark = 0; mark < XA_MAX_MARKS; mark++) {
                for (word = 0; word < XA_MARK_LONGS; word++)
                        pr_cont(" %lx", node->marks[mark][word]);
        }
        pr_cont("\n");
}

/*
 * Debug helper: print the index, or index range, covered by an entry at
 * @index with the given @shift.  A zero shift prints a single index, a
 * shift of BITS_PER_LONG or more prints the whole unsigned long range, and
 * anything in between prints @index up to @index with the low @shift bits
 * set.
 */
void xa_dump_index(unsigned long index, unsigned int shift)
{
        if (!shift) {
                pr_info("%lu: ", index);
                return;
        }

        if (shift >= BITS_PER_LONG) {
                pr_info("0-%lu: ", ~0UL);
                return;
        }

        pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}

/*
 * Debug helper: print @entry, found at @index with the given @shift.
 * A NULL entry prints nothing.  Otherwise the index range is printed first
 * and then the entry according to its kind.  A node entry with a non-zero
 * shift is dumped and then each of its slots is dumped recursively.
 */
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
        if (!entry)
                return;

        xa_dump_index(index, shift);

        if (xa_is_node(entry)) {
                struct xa_node *node;
                unsigned long slot;

                if (shift == 0) {
                        pr_cont("%px\n", entry);
                        return;
                }

                node = xa_to_node(entry);
                xa_dump_node(node);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        xa_dump_entry(node->slots[slot],
                                      index + (slot << node->shift),
                                      node->shift);
                return;
        }

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
                                                xa_to_value(entry), entry);
                return;
        }

        if (!xa_is_internal(entry)) {
                pr_cont("%px\n", entry);
                return;
        }

        /* Only internal entries remain; name the ones we recognise. */
        if (xa_is_retry(entry))
                pr_cont("retry (%ld)\n", xa_to_internal(entry));
        else if (xa_is_sibling(entry))
                pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
        else if (xa_is_zero(entry))
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
        else
                pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}

/*
 * Debug helper: print a summary line for @xa (head pointer, flags and
 * whether each of XA_MARK_0..2 is set on the array), then dump everything
 * reachable from the head.
 */
void xa_dump(const struct xarray *xa)
{
        void *head = xa->xa_head;
        unsigned int shift;

        pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, head,
                        xa->xa_flags, xa_marked(xa, XA_MARK_0),
                        xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));

        /* A node head covers its own shift plus one chunk level. */
        shift = xa_is_node(head) ?
                xa_to_node(head)->shift + XA_CHUNK_SHIFT : 0;
        xa_dump_entry(head, 0, shift);
}
#endif



















































































    1 















































































































































































































































    1 



    1 






































    1 






    1 
    1 




    1 
    1 



























































































    1 

































    1 







































    1 





    1 



    1 

















    1 






    1 




































































































































































































































    1 

    1 












    1 




    1 


    1 
















    1 
    1 















































    1 
















































































































































































































































    1 






    1 





























































    1 







    1 
































    1 




    1 


























    1 
    1 

    1 






















    1 
    1 


    1 

    1 















    1 

    1 
















    1 


    1 

































































































































































































































































































































































































































































































    1 












    1 





    1 


    1 














    1 




    1 
    1 








    1 


    1 



    1 










    1 





    1 



    1 


















































































































































































































































































































































































































































































































    1 










    1 



    1 






    1 


    1 






    1 



    1 



































    1 






    1 



    1 



    1 


    1 




    1 
    1 


    1 


    1 













































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NETLINK      Kernel-user communication protocol.
 *
 *                 Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                                 Patrick McHardy <kaber@trash.net>
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 *                                  use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 *                                  - inc module use count of module that owns
 *                                    the kernel socket in case userspace opens
 *                                    socket of same protocol
 *                                  - remove all module support, since netlink is
 *                                    mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/genetlink.h>
#include <linux/net_namespace.h>
#include <linux/nospec.h>
#include <linux/btf_ids.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>

#include "af_netlink.h"

/* Per-protocol listener bitmap; one instance hangs off nl_table[].listeners. */
struct listeners {
        /* Used by kfree_rcu() when the bitmap is released. */
        struct rcu_head                rcu;
        /* masks[i] is the OR of groups[i] over every socket on the table's mc_list. */
        unsigned long                masks[];
};

/* state bits: bit numbers within netlink_sock->state, manipulated with
 * test_and_set_bit(), clear_bit() and test_bit().
 */
#define NETLINK_S_CONGESTED                0x0

static inline int netlink_is_kernel(struct sock *sk)
{
        return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
}

/* Per-protocol netlink state, indexed by protocol number. */
struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);

/* Slept on in netlink_table_grab(); woken by netlink_table_ungrab() and
 * netlink_unlock_table().
 */
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

/* One lockdep class key per protocol, used for the default per-socket
 * cb_mutex set up in __netlink_create().
 */
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];

/* Lockdep names for the default cb_mutex, indexed by protocol number in
 * __netlink_create(). Protocols without a symbolic name are listed by
 * number; the final entry sits at index MAX_LINKS.
 */
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
        "nlk_cb_mutex-ROUTE",
        "nlk_cb_mutex-1",
        "nlk_cb_mutex-USERSOCK",
        "nlk_cb_mutex-FIREWALL",
        "nlk_cb_mutex-SOCK_DIAG",
        "nlk_cb_mutex-NFLOG",
        "nlk_cb_mutex-XFRM",
        "nlk_cb_mutex-SELINUX",
        "nlk_cb_mutex-ISCSI",
        "nlk_cb_mutex-AUDIT",
        "nlk_cb_mutex-FIB_LOOKUP",
        "nlk_cb_mutex-CONNECTOR",
        "nlk_cb_mutex-NETFILTER",
        "nlk_cb_mutex-IP6_FW",
        "nlk_cb_mutex-DNRTMSG",
        "nlk_cb_mutex-KOBJECT_UEVENT",
        "nlk_cb_mutex-GENERIC",
        "nlk_cb_mutex-17",
        "nlk_cb_mutex-SCSITRANSPORT",
        "nlk_cb_mutex-ECRYPTFS",
        "nlk_cb_mutex-RDMA",
        "nlk_cb_mutex-CRYPTO",
        "nlk_cb_mutex-SMC",
        "nlk_cb_mutex-23",
        "nlk_cb_mutex-24",
        "nlk_cb_mutex-25",
        "nlk_cb_mutex-26",
        "nlk_cb_mutex-27",
        "nlk_cb_mutex-28",
        "nlk_cb_mutex-29",
        "nlk_cb_mutex-30",
        "nlk_cb_mutex-31",
        "nlk_cb_mutex-MAX_LINKS"
};

static int netlink_dump(struct sock *sk, bool lock_taken);

/* nl_table locking explained:
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
 * and removal are protected with per bucket lock while using RCU list
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired, either during or after the socket has been removed from
 * the list, and after an RCU grace period.
 */
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
/* Incremented by netlink_lock_table(), decremented by netlink_unlock_table();
 * netlink_table_grab() waits for it to reach zero.
 */
static atomic_t nl_table_users = ATOMIC_INIT(0);

/* Update-side dereference of an RCU pointer; lockdep verifies that
 * nl_table_lock is held. This expands to an expression, so the caller
 * supplies the terminating semicolon.
 */
#define nl_deref_protected(X) \
        rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

/* Notifier chain; netlink_release() calls it with NETLINK_URELEASE when a
 * bound socket with a non-zero portid goes away.
 */
static BLOCKING_NOTIFIER_HEAD(netlink_chain);


static const struct rhashtable_params netlink_rhashtable_params;

/*
 * Convert a 1-based multicast group number into its bit in a 32-bit group
 * mask. Group 0 (no group) and groups above 32 have no bit and yield 0.
 */
static inline u32 netlink_group_mask(u32 group)
{
        if (group == 0 || group > 32)
                return 0;

        /* Shift an unsigned one: group 32 targets bit 31, which a signed
         * int shift cannot represent.
         */
        return 1U << (group - 1);
}

/*
 * Build a freshly allocated copy of @skb: allocate skb->len bytes with
 * alloc_skb(), carry over the portid, dst_group and creds fields of the
 * netlink control block, and copy skb->len bytes starting at skb->data.
 *
 * Returns the new skb, or NULL if the allocation failed.
 */
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
                                           gfp_t gfp_mask)
{
        struct sk_buff *copy = alloc_skb(skb->len, gfp_mask);

        if (!copy)
                return NULL;

        NETLINK_CB(copy).portid = NETLINK_CB(skb).portid;
        NETLINK_CB(copy).dst_group = NETLINK_CB(skb).dst_group;
        NETLINK_CB(copy).creds = NETLINK_CB(skb).creds;

        skb_put_data(copy, skb->data, skb->len);

        return copy;
}

/* Per-net id handed to net_generic() to find this net's netlink_tap_net;
 * its address is published through netlink_tap_net_ops.id.
 */
static unsigned int netlink_tap_net_id;

/* Per network namespace tap state. */
struct netlink_tap_net {
        /* Registered taps; modified with RCU list primitives, walked under RCU. */
        struct list_head netlink_tap_all;
        /* Held around additions to and removals from netlink_tap_all. */
        struct mutex netlink_tap_lock;
};

/*
 * Register @nt on the tap list of the network namespace its device belongs
 * to. Devices whose type is not ARPHRD_NETLINK are rejected with -EINVAL.
 * On success a reference on nt->module is taken and 0 is returned.
 */
int netlink_add_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *nn;

        nn = net_generic(dev_net(nt->dev), netlink_tap_net_id);

        if (unlikely(nt->dev->type != ARPHRD_NETLINK))
                return -EINVAL;

        mutex_lock(&nn->netlink_tap_lock);
        list_add_rcu(&nt->list, &nn->netlink_tap_all);
        mutex_unlock(&nn->netlink_tap_lock);

        __module_get(nt->module);
        return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

/*
 * Unlink @nt from its namespace's tap list if it is present. Returns 0 and
 * drops the module reference when the tap was found; otherwise logs a
 * warning and returns -ENODEV.
 */
static int __netlink_remove_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *nn;
        struct netlink_tap *pos;
        int ret = -ENODEV;

        nn = net_generic(dev_net(nt->dev), netlink_tap_net_id);

        mutex_lock(&nn->netlink_tap_lock);

        list_for_each_entry(pos, &nn->netlink_tap_all, list) {
                if (pos != nt)
                        continue;

                list_del_rcu(&nt->list);
                ret = 0;
                break;
        }

        if (ret)
                pr_warn("__netlink_remove_tap: %p not found\n", nt);

        mutex_unlock(&nn->netlink_tap_lock);

        if (!ret)
                module_put(nt->module);

        return ret;
}

/*
 * Remove @nt from the tap list, then call synchronize_net() before
 * returning, whether or not the tap was found. Returns the result of
 * __netlink_remove_tap().
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
        int err = __netlink_remove_tap(nt);

        synchronize_net();
        return err;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

/* Per-net init hook: set up the tap mutex and the empty tap list for @net. */
static __net_init int netlink_tap_init_net(struct net *net)
{
        struct netlink_tap_net *nn;

        nn = net_generic(net, netlink_tap_net_id);
        mutex_init(&nn->netlink_tap_lock);
        INIT_LIST_HEAD(&nn->netlink_tap_all);

        return 0;
}

/* Pernet operations for the tap state: each net gets a struct
 * netlink_tap_net of .size bytes, initialised by netlink_tap_init_net().
 */
static struct pernet_operations netlink_tap_net_ops = {
        .init = netlink_tap_init_net,
        .id   = &netlink_tap_net_id,
        .size = sizeof(struct netlink_tap_net),
};

/*
 * Decide whether @skb may be mirrored to taps, based on the protocol of the
 * socket that owns it. Only the protocols listed below pass; everything
 * else is filtered out.
 */
static bool netlink_filter_tap(const struct sk_buff *skb)
{
        /* Conservative approach: an explicit allow-list of protocols. */
        switch (skb->sk->sk_protocol) {
        case NETLINK_ROUTE:
        case NETLINK_USERSOCK:
        case NETLINK_SOCK_DIAG:
        case NETLINK_NFLOG:
        case NETLINK_XFRM:
        case NETLINK_FIB_LOOKUP:
        case NETLINK_NETFILTER:
        case NETLINK_GENERIC:
                return true;
        default:
                return false;
        }
}

/*
 * Queue a copy of @skb on tap device @dev.
 *
 * Nothing is done (0 is returned) when @dev lives in a different network
 * namespace than the socket owning @skb. Otherwise the skb is duplicated
 * (a full copy when its head is vmalloc'ed, a clone if not), tagged with
 * the device, protocol and packet type, and handed to dev_queue_xmit().
 *
 * Returns -ENOMEM when no copy could be allocated, else the transmit
 * result with positive codes mapped through net_xmit_errno().
 */
static int __netlink_deliver_tap_skb(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct sock *sk = skb->sk;
        struct sk_buff *nskb;
        int ret;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return 0;

        /* Hold the device for the duration of the transmit attempt. */
        dev_hold(dev);

        nskb = is_vmalloc_addr(skb->head) ?
               netlink_to_full_skb(skb, GFP_ATOMIC) :
               skb_clone(skb, GFP_ATOMIC);
        if (!nskb) {
                ret = -ENOMEM;
                goto out_put;
        }

        nskb->dev = dev;
        nskb->protocol = htons((u16) sk->sk_protocol);
        if (netlink_is_kernel(sk))
                nskb->pkt_type = PACKET_KERNEL;
        else
                nskb->pkt_type = PACKET_USER;
        skb_reset_network_header(nskb);

        ret = dev_queue_xmit(nskb);
        if (unlikely(ret > 0))
                ret = net_xmit_errno(ret);

out_put:
        dev_put(dev);
        return ret;
}

/*
 * Offer @skb to every tap registered in @nn, provided the owning socket's
 * protocol passes netlink_filter_tap(). The walk stops at the first tap for
 * which delivery returns non-zero.
 */
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
        struct netlink_tap *tap;

        if (!netlink_filter_tap(skb))
                return;

        list_for_each_entry_rcu(tap, &nn->netlink_tap_all, list) {
                if (unlikely(__netlink_deliver_tap_skb(skb, tap->dev)))
                        break;
        }
}

/*
 * Mirror @skb to the taps of @net, if there are any. The tap list is
 * checked and walked inside an RCU read-side critical section.
 */
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
        struct netlink_tap_net *nn;

        nn = net_generic(net, netlink_tap_net_id);

        rcu_read_lock();
        /* Taps are not expected to be present in the common case. */
        if (unlikely(!list_empty(&nn->netlink_tap_all)))
                __netlink_deliver_tap(skb, nn);
        rcu_read_unlock();
}

/*
 * Mirror @skb to the taps of @dst's namespace unless both @dst and @src are
 * kernel sockets.
 */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
                                       struct sk_buff *skb)
{
        if (netlink_is_kernel(dst) && netlink_is_kernel(src))
                return;

        netlink_deliver_tap(sock_net(dst), skb);
}

/*
 * Account for a dropped message on @sk. Unless the socket has
 * NETLINK_F_RECV_NO_ENOBUFS set, the first overrun that sets the
 * NETLINK_S_CONGESTED bit also raises ENOBUFS on the socket. sk_drops is
 * incremented in every case.
 */
static void netlink_overrun(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) &&
            !test_and_set_bit(NETLINK_S_CONGESTED, &nlk->state)) {
                sk->sk_err = ENOBUFS;
                sk->sk_error_report(sk);
        }

        atomic_inc(&sk->sk_drops);
}

/*
 * Clear the congested bit once the receive queue of @sk is empty, and wake
 * the sleepers on nlk->wait when the socket is not congested.
 */
static void netlink_rcv_wake(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (skb_queue_empty_lockless(&sk->sk_receive_queue))
                clear_bit(NETLINK_S_CONGESTED, &nlk->state);

        if (test_bit(NETLINK_S_CONGESTED, &nlk->state))
                return;

        wake_up_interruptible(&nlk->wait);
}

static void netlink_skb_destructor(struct sk_buff *skb)
{
        if (is_vmalloc_addr(skb->head)) {
                if (!skb->cloned ||
                    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
                        vfree_atomic(skb->head);

                skb->head = NULL;
        }
        if (skb->sk != NULL)
                sock_rfree(skb);
}

/*
 * Make @sk the owner of @skb: install the netlink destructor and charge the
 * skb's truesize to the socket. Warns if the skb already had an owner.
 */
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        WARN_ON(skb->sk);

        skb->destructor = netlink_skb_destructor;
        skb->sk = sk;
        sk_mem_charge(sk, skb->truesize);
}

/*
 * sk_destruct callback for netlink sockets. Purges the receive queue,
 * complains if the socket is not marked SOCK_DEAD, and otherwise warns
 * about leftover receive/send memory accounting or a groups array that is
 * still set.
 */
static void netlink_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_receive_queue);

        if (sock_flag(sk, SOCK_DEAD)) {
                WARN_ON(atomic_read(&sk->sk_rmem_alloc));
                WARN_ON(refcount_read(&sk->sk_wmem_alloc));
                WARN_ON(nlk_sk(sk)->groups);
                return;
        }

        printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

/*
 * Take nl_table_lock for writing (with interrupts disabled) and wait until
 * nl_table_users has dropped to zero. May sleep. The write lock is held on
 * return; netlink_table_ungrab() releases it.
 */
void netlink_table_grab(void)
        __acquires(nl_table_lock)
{
        might_sleep();

        write_lock_irq(&nl_table_lock);

        if (atomic_read(&nl_table_users)) {
                DECLARE_WAITQUEUE(wait, current);

                /* Queue as an exclusive waiter on nl_table_wait. */
                add_wait_queue_exclusive(&nl_table_wait, &wait);
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&nl_table_users) == 0)
                                break;
                        /* Drop the lock while asleep and retake it before
                         * looking at the user count again.
                         */
                        write_unlock_irq(&nl_table_lock);
                        schedule();
                        write_lock_irq(&nl_table_lock);
                }

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nl_table_wait, &wait);
        }
}

/*
 * Release the write lock taken by netlink_table_grab() and wake a waiter on
 * nl_table_wait.
 */
void netlink_table_ungrab(void)
        __releases(nl_table_lock)
{
        write_unlock_irq(&nl_table_lock);
        wake_up(&nl_table_wait);
}

/*
 * Register the caller as a table user by incrementing nl_table_users. The
 * read lock is held only around the increment, not while the table is in
 * use; netlink_unlock_table() undoes the registration.
 */
static inline void
netlink_lock_table(void)
{
        unsigned long flags;

        /* read_lock() synchronizes us to netlink_table_grab */

        read_lock_irqsave(&nl_table_lock, flags);
        atomic_inc(&nl_table_users);
        read_unlock_irqrestore(&nl_table_lock, flags);
}

/*
 * Drop one table user. The caller that brings nl_table_users to zero wakes
 * a waiter on nl_table_wait.
 */
static inline void
netlink_unlock_table(void)
{
        if (!atomic_dec_and_test(&nl_table_users))
                return;

        wake_up(&nl_table_wait);
}

/* Lookup key for the per-protocol socket hash: a (net, portid) pair. */
struct netlink_compare_arg
{
        /* Network namespace; written with write_pnet(), read with read_pnet(). */
        possible_net_t pnet;
        /* Port id to match against netlink_sock->portid. */
        u32 portid;
};

/* Doing sizeof directly may yield 4 extra bytes on 64-bit, so the length is
 * computed as the offset of the last member plus its size.
 */
#define netlink_compare_arg_len \
        (offsetof(struct netlink_compare_arg, portid) + sizeof(u32))

/*
 * rhashtable-style comparison: returns 0 when the socket at @ptr has the
 * port id and network namespace stored in arg->key, non-zero otherwise.
 */
static inline int netlink_compare(struct rhashtable_compare_arg *arg,
                                  const void *ptr)
{
        const struct netlink_compare_arg *key = arg->key;
        const struct netlink_sock *nlk = ptr;

        if (nlk->portid != key->portid)
                return 1;

        return !net_eq(sock_net(&nlk->sk), read_pnet(&key->pnet));
}

/*
 * Fill @arg with the (@net, @portid) lookup key. The whole structure is
 * zeroed first so that bytes not covered by the two fields are zero too.
 */
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
                                     struct net *net, u32 portid)
{
        memset(arg, 0, sizeof(*arg));

        arg->portid = portid;
        write_pnet(&arg->pnet, net);
}

/*
 * Look up the socket bound to @portid in namespace @net within @table's
 * hash. Returns whatever rhashtable_lookup_fast() yields for that key;
 * no reference is taken here.
 */
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
                                     struct net *net)
{
        struct netlink_compare_arg key;

        netlink_compare_arg_init(&key, net, portid);

        return rhashtable_lookup_fast(&table->hash, &key,
                                      netlink_rhashtable_params);
}

static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
        struct netlink_compare_arg arg;

        netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
        return rhashtable_lookup_insert_key(&table->hash, &arg,
                                            &nlk_sk(sk)->node,
                                            netlink_rhashtable_params);
}

/*
 * Find the socket bound to @portid for @protocol in @net. On a hit a
 * reference is taken with sock_hold() before the RCU read-side section
 * ends. Returns the socket, or NULL when there is none.
 */
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
        struct sock *found;

        rcu_read_lock();
        found = __netlink_lookup(&nl_table[protocol], portid, net);
        if (found)
                sock_hold(found);
        rcu_read_unlock();

        return found;
}

static const struct proto_ops netlink_ops;

/*
 * Recompute the listener bitmap of the protocol @sk belongs to: each word
 * becomes the OR of the matching groups word of every socket on the
 * protocol's mc_list. Does nothing when the protocol has no bitmap.
 *
 * This function is only called with the netlink table "grabbed", which
 * makes sure updates are visible before bind or setsockopt return.
 */
static void
netlink_update_listeners(struct sock *sk)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *listeners;
        struct sock *member;
        unsigned int word;

        listeners = nl_deref_protected(tbl->listeners);
        if (!listeners)
                return;

        for (word = 0; word < NLGRPLONGS(tbl->groups); word++) {
                unsigned long mask = 0;

                sk_for_each_bound(member, &tbl->mc_list) {
                        /* Skip sockets whose groups array is shorter. */
                        if (word < NLGRPLONGS(nlk_sk(member)->ngroups))
                                mask |= nlk_sk(member)->groups[word];
                }
                listeners->masks[word] = mask;
        }
}

/*
 * Bind @sk to @portid and hash it into its protocol's table, under the
 * socket lock.
 *
 * If the socket is already bound, nothing is changed: the result is 0 when
 * it already uses @portid and -EBUSY otherwise. On a fresh bind, a hash
 * failure of -EEXIST is reported as -EADDRINUSE and -EBUSY as -EOVERFLOW;
 * other errors are passed through. Returns 0 on success.
 */
static int netlink_insert(struct sock *sk, u32 portid)
{
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        int err;

        lock_sock(sk);

        /* Pre-compute the result for the already-bound case. */
        err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
        if (nlk_sk(sk)->bound)
                goto err;

        /* portid can be read locklessly from netlink_getname(). */
        WRITE_ONCE(nlk_sk(sk)->portid, portid);

        /* Reference taken before inserting; dropped again if that fails. */
        sock_hold(sk);

        err = __netlink_insert(table, sk);
        if (err) {
                /* In case the hashtable backend returns with -EBUSY
                 * from here, it must not escape to the caller.
                 */
                if (unlikely(err == -EBUSY))
                        err = -EOVERFLOW;
                if (err == -EEXIST)
                        err = -EADDRINUSE;
                sock_put(sk);
                goto err;
        }

        /* We need to ensure that the socket is hashed and visible. */
        smp_wmb();
        /* Paired with lockless reads from netlink_bind(),
         * netlink_connect() and netlink_sendmsg().
         */
        WRITE_ONCE(nlk_sk(sk)->bound, portid);

err:
        release_sock(sk);
        return err;
}

/*
 * Unhash @sk from its protocol table and, with the table grabbed, take it
 * off the multicast list if it has subscriptions and refresh the listener
 * bitmap. For NETLINK_GENERIC sockets genl_sk_destructing_cnt is bumped;
 * netlink_release() decrements it again.
 */
static void netlink_remove(struct sock *sk)
{
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        int rc;

        rc = rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
                                    netlink_rhashtable_params);
        if (rc == 0) {
                /* Drop the reference taken when the socket was hashed. */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }

        netlink_table_grab();

        if (nlk_sk(sk)->subscriptions) {
                __sk_del_bind_node(sk);
                netlink_update_listeners(sk);
        }

        if (sk->sk_protocol == NETLINK_GENERIC)
                atomic_inc(&genl_sk_destructing_cnt);

        netlink_table_ungrab();
}

/* Protocol descriptor passed to sk_alloc(); sockets are allocated with room
 * for a struct netlink_sock.
 */
static struct proto netlink_proto = {
        .name          = "NETLINK",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct netlink_sock),
};

/*
 * Allocate and initialise the struct sock behind @sock.
 *
 * @cb_mutex, when non-NULL, becomes the socket's callback mutex; otherwise
 * the socket's own cb_def_mutex is initialised and given the lockdep class
 * and name for @protocol.
 *
 * Returns 0 on success or -ENOMEM when the socket cannot be allocated.
 */
static int __netlink_create(struct net *net, struct socket *sock,
                            struct mutex *cb_mutex, int protocol,
                            int kern)
{
        struct netlink_sock *nlk;
        struct sock *sk;

        sock->ops = &netlink_ops;

        sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);
        nlk = nlk_sk(sk);

        if (!cb_mutex) {
                /* No shared mutex supplied: fall back to the per-socket one. */
                nlk->cb_mutex = &nlk->cb_def_mutex;
                mutex_init(nlk->cb_mutex);
                lockdep_set_class_and_name(nlk->cb_mutex,
                                           nlk_cb_mutex_keys + protocol,
                                           nlk_cb_mutex_key_strings[protocol]);
        } else {
                nlk->cb_mutex = cb_mutex;
        }

        init_waitqueue_head(&nlk->wait);

        sk->sk_destruct = netlink_sock_destruct;
        sk->sk_protocol = protocol;

        return 0;
}

/*
 * Create a netlink socket for @protocol.
 *
 * Only SOCK_RAW and SOCK_DGRAM are accepted (-ESOCKTNOSUPPORT otherwise),
 * and @protocol must lie in [0, MAX_LINKS) (-EPROTONOSUPPORT otherwise).
 * With CONFIG_MODULES, an unregistered protocol triggers a module request
 * before registration is checked again. -EPROTONOSUPPORT is also returned
 * when the protocol is still unregistered or a reference on its module
 * cannot be taken. On success the socket records the module and the
 * protocol's bind/unbind callbacks, and 0 is returned.
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct module *module = NULL;
        struct mutex *cb_mutex;
        struct netlink_sock *nlk;
        int (*bind)(struct net *net, int group);
        void (*unbind)(struct net *net, int group);
        int err = 0;

        sock->state = SS_UNCONNECTED;

        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
                return -ESOCKTNOSUPPORT;

        if (protocol < 0 || protocol >= MAX_LINKS)
                return -EPROTONOSUPPORT;
        /* Clamp the index used for the nl_table[] accesses below. */
        protocol = array_index_nospec(protocol, MAX_LINKS);

        netlink_lock_table();
#ifdef CONFIG_MODULES
        if (!nl_table[protocol].registered) {
                /* The table is unlocked around the module request. */
                netlink_unlock_table();
                request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
                netlink_lock_table();
        }
#endif
        if (nl_table[protocol].registered &&
            try_module_get(nl_table[protocol].module))
                module = nl_table[protocol].module;
        else
                err = -EPROTONOSUPPORT;
        /* Snapshot the per-protocol settings while the table is locked. */
        cb_mutex = nl_table[protocol].cb_mutex;
        bind = nl_table[protocol].bind;
        unbind = nl_table[protocol].unbind;
        netlink_unlock_table();

        if (err < 0)
                goto out;

        err = __netlink_create(net, sock, cb_mutex, protocol, kern);
        if (err < 0)
                goto out_module;

        local_bh_disable();
        sock_prot_inuse_add(net, &netlink_proto, 1);
        local_bh_enable();

        nlk = nlk_sk(sock->sk);
        nlk->module = module;
        nlk->netlink_bind = bind;
        nlk->netlink_unbind = unbind;
out:
        return err;

out_module:
        /* Socket creation failed: give back the module reference. */
        module_put(module);
        goto out;
}

/*
 * RCU callback queued by netlink_release(): free the socket's groups array
 * and drop one reference on the socket, freeing it when that was the last.
 */
static void deferred_put_nlk_sk(struct rcu_head *head)
{
        struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
        struct sock *sk = &nlk->sk;

        kfree(nlk->groups);
        nlk->groups = NULL;

        if (refcount_dec_and_test(&sk->sk_refcnt))
                sk_free(sk);
}

/*
 * Release handler for netlink sockets.
 *
 * Unhashes and orphans the socket, runs the protocol's unbind callback for
 * every group bit still set, fires the NETLINK_URELEASE notifier for bound
 * sockets with a non-zero portid, finishes any dump that is still running,
 * and drops the module reference. When the socket is a kernel socket, the
 * protocol's registration count is decremented and, once it reaches zero,
 * the protocol's table entry is cleared. The final socket reference is
 * dropped from an RCU callback. Always returns 0.
 */
static int netlink_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk;

        if (!sk)
                return 0;

        netlink_remove(sk);
        sock_orphan(sk);
        nlk = nlk_sk(sk);

        /*
         * OK. Socket is unlinked, any packets that arrive now
         * will be purged.
         */

        /* must not acquire netlink_table_lock in any way again before unbind
         * and notifying genetlink is done as otherwise it might deadlock
         */
        if (nlk->netlink_unbind) {
                int i;

                /* Bit i of the groups bitmap corresponds to group i + 1. */
                for (i = 0; i < nlk->ngroups; i++)
                        if (test_bit(i, nlk->groups))
                                nlk->netlink_unbind(sock_net(sk), i + 1);
        }
        /* Balances the increment done in netlink_remove(). */
        if (sk->sk_protocol == NETLINK_GENERIC &&
            atomic_dec_return(&genl_sk_destructing_cnt) == 0)
                wake_up(&genl_sk_destructing_waitq);

        sock->sk = NULL;
        wake_up_interruptible_all(&nlk->wait);

        skb_queue_purge(&sk->sk_write_queue);

        if (nlk->portid && nlk->bound) {
                struct netlink_notify n = {
                                                .net = sock_net(sk),
                                                .protocol = sk->sk_protocol,
                                                .portid = nlk->portid,
                                          };
                blocking_notifier_call_chain(&netlink_chain,
                                NETLINK_URELEASE, &n);
        }

        /* Terminate any outstanding dump */
        if (nlk->cb_running) {
                if (nlk->cb.done)
                        nlk->cb.done(&nlk->cb);
                module_put(nlk->cb.module);
                kfree_skb(nlk->cb.skb);
        }

        module_put(nlk->module);

        if (netlink_is_kernel(sk)) {
                netlink_table_grab();
                BUG_ON(nl_table[sk->sk_protocol].registered == 0);
                if (--nl_table[sk->sk_protocol].registered == 0) {
                        struct listeners *old;

                        /* Last registration gone: reset the table entry and
                         * free the listener bitmap via kfree_rcu().
                         */
                        old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
                        RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
                        kfree_rcu(old, rcu);
                        nl_table[sk->sk_protocol].module = NULL;
                        nl_table[sk->sk_protocol].bind = NULL;
                        nl_table[sk->sk_protocol].unbind = NULL;
                        nl_table[sk->sk_protocol].flags = 0;
                        nl_table[sk->sk_protocol].registered = 0;
                }
                netlink_table_ungrab();
        }

        local_bh_disable();
        sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
        local_bh_enable();
        /* Groups array and the final reference go away in the RCU callback. */
        call_rcu(&nlk->rcu, deferred_put_nlk_sk);
        return 0;
}

/*
 * Pick a free port id for the socket and bind it.
 *
 * The first candidate is the caller's thread-group id as returned by
 * task_tgid_vnr(). If that is taken, negative port ids are tried: the
 * first fallback starts at a random value in [S32_MIN, -4097] and later
 * ones count downwards from there. The search repeats until
 * netlink_insert() stops reporting -EADDRINUSE. An -EBUSY result is
 * treated as success. Returns 0 or the error from netlink_insert().
 */
static int netlink_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        s32 portid = task_tgid_vnr(current);
        int err;
        /* -4096 doubles as the "no fallback chosen yet" marker. */
        s32 rover = -4096;
        bool ok;

retry:
        cond_resched();
        rcu_read_lock();
        ok = !__netlink_lookup(table, portid, net);
        rcu_read_unlock();
        if (!ok) {
                /* Bind collision, search negative portid values. */
                if (rover == -4096)
                        /* rover will be in range [S32_MIN, -4097] */
                        rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN);
                else if (rover >= -4096)
                        rover = -4097;
                portid = rover--;
                goto retry;
        }

        /* The lookup above is only a hint; the insert can still collide. */
        err = netlink_insert(sk, portid);
        if (err == -EADDRINUSE)
                goto retry;

        /* If 2 threads race to autobind, that is fine.  */
        if (err == -EBUSY)
                err = 0;

        return err;
}

/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                        struct user_namespace *user_ns, int cap)
{
        return ((nsp->flags & NETLINK_SKB_DST) ||
                file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap in the user namespace @user_ns when the netlink
 * socket was created, and whether the sender of the message has it too.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *user_ns, int cap)
{
        const struct netlink_skb_parms *parms = &NETLINK_CB(skb);

        return __netlink_ns_capable(parms, user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap in all user namespaces when the netlink socket was
 * created, and whether the sender of the message has it too.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *ns = &init_user_ns;

        return netlink_ns_capable(skb, ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap over the network namespace of that socket when the
 * netlink socket was created, and whether the sender of the message has
 * it too.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *ns = sock_net(skb->sk)->user_ns;

        return netlink_ns_capable(skb, ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
        return (nl_table[sock->sk->sk_protocol].flags & flag) ||
                ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (nlk->subscriptions && !subscriptions)
                __sk_del_bind_node(sk);
        else if (!nlk->subscriptions && subscriptions)
                sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
        nlk->subscriptions = subscriptions;
}

/*
 * Grow the groups bitmap of @sk to the number of groups its protocol
 * currently has, zeroing the newly added part. Runs with the netlink table
 * grabbed.
 *
 * Returns 0 on success or when the bitmap is already large enough,
 * -ENOENT when the protocol is not registered, -ENOMEM on allocation
 * failure.
 */
static int netlink_realloc_groups(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned long *grown;
        unsigned int wanted;
        int err = 0;

        netlink_table_grab();

        wanted = nl_table[sk->sk_protocol].groups;
        if (!nl_table[sk->sk_protocol].registered) {
                err = -ENOENT;
        } else if (nlk->ngroups < wanted) {
                grown = krealloc(nlk->groups, NLGRPSZ(wanted), GFP_ATOMIC);
                if (!grown) {
                        err = -ENOMEM;
                } else {
                        /* Clear only the tail that krealloc() added. */
                        memset((char *)grown + NLGRPSZ(nlk->ngroups), 0,
                               NLGRPSZ(wanted) - NLGRPSZ(nlk->ngroups));

                        nlk->groups = grown;
                        nlk->ngroups = wanted;
                }
        }

        netlink_table_ungrab();
        return err;
}

/*
 * Call the socket's unbind callback for every bit below @group that is set
 * in @groups; bit n stands for group n + 1. Does nothing when the socket
 * has no unbind callback.
 */
static void netlink_undo_bind(int group, long unsigned int groups,
                              struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int bit;

        if (!nlk->netlink_unbind)
                return;

        for (bit = 0; bit < group; bit++) {
                if (!test_bit(bit, &groups))
                        continue;

                nlk->netlink_unbind(sock_net(sk), bit + 1);
        }
}

/*
 * bind() handler for netlink sockets.
 *
 * Validates the address (-EINVAL when it is too short or not AF_NETLINK),
 * checks permission to listen to multicast groups (-EPERM), and grows the
 * socket's groups bitmap if groups were requested. If the socket is
 * already bound, the requested port id must match the bound one (-EINVAL).
 * The protocol's bind callback is run for each requested group, with the
 * groups bound so far undone on failure. An unbound socket is then bound
 * to nl_pid, or autobound when nl_pid is zero. Finally the subscription
 * count, the low 32 bits of the groups bitmap and the listener bitmap are
 * updated with the table grabbed. Returns 0 on success.
 */
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
                        int addr_len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        int err = 0;
        unsigned long groups;
        bool bound;

        if (addr_len < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if (nladdr->nl_family != AF_NETLINK)
                return -EINVAL;
        groups = nladdr->nl_groups;

        /* Only superuser is allowed to listen multicasts */
        if (groups) {
                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
        }

        /* Drop requested bits beyond the groups the socket has room for. */
        if (nlk->ngroups < BITS_PER_LONG)
                groups &= (1UL << nlk->ngroups) - 1;

        /* Paired with WRITE_ONCE() in netlink_insert() */
        bound = READ_ONCE(nlk->bound);
        if (bound) {
                /* Ensure nlk->portid is up-to-date. */
                smp_rmb();

                if (nladdr->nl_pid != nlk->portid)
                        return -EINVAL;
        }

        if (nlk->netlink_bind && groups) {
                int group;

                /* nl_groups is a u32, so cap the maximum groups we can bind */
                for (group = 0; group < BITS_PER_TYPE(u32); group++) {
                        if (!test_bit(group, &groups))
                                continue;
                        err = nlk->netlink_bind(net, group + 1);
                        if (!err)
                                continue;
                        /* Roll back the groups bound before the failing one. */
                        netlink_undo_bind(group, groups, sk);
                        return err;
                }
        }

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         */
        netlink_lock_table();
        if (!bound) {
                err = nladdr->nl_pid ?
                        netlink_insert(sk, nladdr->nl_pid) :
                        netlink_autobind(sock);
                if (err) {
                        netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
                        goto unlock;
                }
        }

        /* Nothing requested and nothing subscribed in the low 32 bits: done. */
        if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
                goto unlock;
        netlink_unlock_table();

        netlink_table_grab();
        netlink_update_subscriptions(sk, nlk->subscriptions +
                                         hweight32(groups) -
                                         hweight32(nlk->groups[0]));
        /* Replace only the low 32 bits of the first bitmap word. */
        nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
        netlink_update_listeners(sk);
        netlink_table_ungrab();

        return 0;

unlock:
        netlink_unlock_table();
        return err;
}

/*
 * connect() handler for netlink sockets: record, or with AF_UNSPEC clear,
 * the default destination (portid and multicast group) of the socket.
 *
 * @sock:  socket being connected
 * @addr:  AF_UNSPEC to disconnect, otherwise a struct sockaddr_nl
 * @alen:  length of @addr in bytes
 * @flags: not used by this function
 *
 * Returns 0 on success; -EINVAL if @addr is too short or belongs to another
 * address family; -EPERM if a non-zero destination is requested and
 * netlink_allowed() refuses NL_CFG_F_NONROOT_SEND; otherwise the error
 * returned by netlink_autobind().
 */
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
                           int alen, int flags)
{
        int err = 0;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;

        /* The family field must be readable before it is inspected. */
        if (alen < sizeof(addr->sa_family))
                return -EINVAL;

        /* AF_UNSPEC: drop the association and reset the destination. */
        if (addr->sa_family == AF_UNSPEC) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, 0);
                WRITE_ONCE(nlk->dst_group, 0);
                return 0;
        }
        if (addr->sa_family != AF_NETLINK)
                return -EINVAL;

        /* From here on the full sockaddr_nl is dereferenced. */
        if (alen < sizeof(struct sockaddr_nl))
                return -EINVAL;

        /* Only a non-zero destination needs the NONROOT_SEND check. */
        if ((nladdr->nl_groups || nladdr->nl_pid) &&
            !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                return -EPERM;

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         * Paired with WRITE_ONCE() in netlink_insert().
         */
        if (!READ_ONCE(nlk->bound))
                err = netlink_autobind(sock);

        if (err == 0) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid);
                /* ffs() turns the group mask into its lowest set bit's 1-based index. */
                WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups));
        }

        return err;
}

/*
 * Fill @addr with a netlink address and return its size.
 *
 * With @peer set, the address is the destination stored by
 * netlink_connect(); otherwise it is the socket's own portid together with
 * the first word of its group bitmap (0 when no bitmap is allocated).
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
                           int peer)
{
        struct netlink_sock *nlk = nlk_sk(sock->sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

        nladdr->nl_family = AF_NETLINK;
        nladdr->nl_pad = 0;

        if (!peer) {
                /* Paired with WRITE_ONCE() in netlink_insert() */
                nladdr->nl_pid = READ_ONCE(nlk->portid);

                /* The group bitmap is only read with the table locked. */
                netlink_lock_table();
                if (nlk->groups)
                        nladdr->nl_groups = nlk->groups[0];
                else
                        nladdr->nl_groups = 0;
                netlink_unlock_table();

                return sizeof(*nladdr);
        }

        /* Paired with WRITE_ONCE() in netlink_connect() */
        nladdr->nl_pid = READ_ONCE(nlk->dst_portid);
        nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group));

        return sizeof(*nladdr);
}

/*
 * ioctl handler for netlink sockets.  No command is handled here: every
 * request is answered with -ENOIOCTLCMD, whatever @cmd and @arg are.
 */
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
                         unsigned long arg)
{
        /* NOTE(review): presumably the caller falls back to the device
         * (NIC driver) ioctl path on -ENOIOCTLCMD - confirm in the socket
         * core.
         */
        return -ENOIOCTLCMD;
}

/*
 * Look up the socket bound to @portid in the namespace and protocol of the
 * sending socket @ssk.
 *
 * A destination that is connected to a peer other than @ssk refuses the
 * message.  Returns the socket on success (released later with sock_put()
 * by the callers in this file) or ERR_PTR(-ECONNREFUSED).
 */
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
        struct sock *dst;

        dst = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
        if (!dst)
                return ERR_PTR(-ECONNREFUSED);

        /* dst_portid and sk_state can be changed in netlink_connect() */
        if (READ_ONCE(dst->sk_state) != NETLINK_CONNECTED)
                return dst;
        if (READ_ONCE(nlk_sk(dst)->dst_portid) == nlk_sk(ssk)->portid)
                return dst;

        /* Connected elsewhere: give the socket back and refuse. */
        sock_put(dst);
        return ERR_PTR(-ECONNREFUSED);
}

/*
 * Map an open file to the netlink socket behind it.
 *
 * Returns the socket with an extra reference taken by sock_hold(),
 * ERR_PTR(-ENOTSOCK) if @filp is not a socket, or ERR_PTR(-EINVAL) if the
 * socket is not of family AF_NETLINK.
 */
struct sock *netlink_getsockbyfilp(struct file *filp)
{
        struct inode *inode = file_inode(filp);
        struct sock *sk;

        if (!S_ISSOCK(inode->i_mode))
                return ERR_PTR(-ENOTSOCK);

        sk = SOCKET_I(inode)->sk;
        if (sk->sk_family == AF_NETLINK) {
                sock_hold(sk);
                return sk;
        }

        return ERR_PTR(-EINVAL);
}

/*
 * Allocate an skb able to hold @size bytes of payload.
 *
 * Requests up to NLMSG_GOODSIZE, and every request with @broadcast set, use
 * alloc_skb().  Larger unicast requests get a vmalloc()ed data area wrapped
 * by __build_skb(), with netlink_skb_destructor installed as destructor.
 *
 * Returns the skb or NULL if an allocation fails.
 */
static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
                                               int broadcast)
{
        struct sk_buff *skb;
        void *buf;

        if (broadcast || size <= NLMSG_GOODSIZE)
                return alloc_skb(size, GFP_KERNEL);

        /* Leave room for the shared info that trails the data area. */
        size = SKB_DATA_ALIGN(size) +
               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        buf = vmalloc(size);
        if (!buf)
                return NULL;

        skb = __build_skb(buf, size);
        if (!skb) {
                vfree(buf);
                return NULL;
        }

        skb->destructor = netlink_skb_destructor;
        return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination; only the
 * error checks are performed and memory in the queue is reserved.
 *
 * @sk:    destination socket
 * @skb:   message to account against @sk's receive buffer
 * @timeo: remaining time the caller may sleep; updated after a sleep,
 *         0 means do not wait
 * @ssk:   sending socket, may be NULL
 *
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk)
{
        DECLARE_WAITQUEUE(wait, current);
        struct netlink_sock *nlk;
        unsigned int rmem;

        nlk = nlk_sk(sk);
        /* Charge the skb first, then decide whether the charge may stay. */
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);

        /* rmem == truesize means nothing was charged before this skb, so a
         * single message is accepted regardless of sk_rcvbuf.
         */
        if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) &&
            !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
                netlink_skb_set_owner_r(skb, sk);
                return 0;
        }

        /* No room: take the provisional charge back. */
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);

        if (!*timeo) {
                /* Overrun is only flagged for a missing or kernel sender. */
                if (!ssk || netlink_is_kernel(ssk))
                        netlink_overrun(sk);
                sock_put(sk);
                kfree_skb(skb);
                return -EAGAIN;
        }

        /* Queue on the wait list before re-checking the condition. */
        __set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(&nlk->wait, &wait);
        rmem = atomic_read(&sk->sk_rmem_alloc);

        if (((rmem && rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf)) ||
             test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
            !sock_flag(sk, SOCK_DEAD))
                *timeo = schedule_timeout(*timeo);

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&nlk->wait, &wait);
        /* The reference is dropped here; the caller must look @sk up again. */
        sock_put(sk);

        if (signal_pending(current)) {
                kfree_skb(skb);
                return sock_intr_errno(*timeo);
        }

        return 1;
}

/*
 * Queue @skb on @sk's receive queue and signal the receiver through
 * sk->sk_data_ready().  The skb is first passed to netlink_deliver_tap().
 *
 * Returns the length of the queued message.
 */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        /* Sampled up front: the skb is handed to the queue below. */
        const int queued = skb->len;

        netlink_deliver_tap(sock_net(sk), skb);
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);

        return queued;
}

/*
 * Queue @skb on @sk via __netlink_sendskb(), then drop one reference on
 * @sk.  Returns the length of the queued message.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        const int sent = __netlink_sendskb(sk, skb);

        sock_put(sk);

        return sent;
}

/*
 * Give up on delivering @skb to @sk: free the skb and drop one reference
 * on the socket.
 */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
        /* NOTE(review): the skb is freed before the socket reference is
         * dropped, presumably because an attached skb still points at @sk -
         * confirm before reordering.
         */
        kfree_skb(skb);
        sock_put(sk);
}

/*
 * Try to shrink the unused tail room of @skb before it is delivered.
 *
 * @skb:        message that is not yet owned by a socket
 * @allocation: gfp flags for the clone and the head reallocation
 *
 * Returns the skb to use from now on: @skb itself, or a clone that replaces
 * it when @skb was shared.  Trimming is best effort; failures leave the skb
 * untrimmed and are not reported.
 */
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
        int delta;

        WARN_ON(skb->sk != NULL);
        /* Space between the end of the data and the end of the buffer. */
        delta = skb->end - skb->tail;
        /* Skip vmalloc()ed heads and skbs whose slack is under half of truesize. */
        if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
                return skb;

        /* A shared skb is replaced by a clone before its head is resized. */
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, allocation);
                if (!nskb)
                        return skb;
                consume_skb(skb);
                skb = nskb;
        }

        /* Negative tail growth shrinks the head.  The result is ignored, and
         * the allocation is made without direct reclaim, warnings or retries.
         */
        pskb_expand_head(skb, 0, -delta,
                         (allocation & ~__GFP_DIRECT_RECLAIM) |
                         __GFP_NOWARN | __GFP_NORETRY);
        return skb;
}

/*
 * Hand @skb directly to the kernel-side socket @sk on behalf of sender
 * @ssk.  Both @skb and one reference on @sk are consumed on every path.
 *
 * Returns the message length when @sk has a netlink_rcv handler, otherwise
 * -ECONNREFUSED.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
                                  struct sock *ssk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret;

        if (nlk->netlink_rcv == NULL) {
                kfree_skb(skb);
                ret = -ECONNREFUSED;
        } else {
                /* Sample the length now: the skb is released below. */
                ret = skb->len;
                atomic_add(skb->truesize, &sk->sk_rmem_alloc);
                netlink_skb_set_owner_r(skb, sk);
                NETLINK_CB(skb).sk = ssk;
                netlink_deliver_tap_kernel(sk, ssk, skb);
                nlk->netlink_rcv(skb);
                consume_skb(skb);
        }

        sock_put(sk);
        return ret;
}

/*
 * Send @skb from @ssk to the single socket bound to @portid.
 *
 * @ssk:      sending socket
 * @skb:      message; consumed on every path
 * @portid:   destination port id
 * @nonblock: non-zero to fail instead of waiting for receive buffer space
 *
 * Returns the message length on success (also when the destination's socket
 * filter dropped the message), or a negative error: the lookup error from
 * netlink_getsockbyportid(), or the error from netlink_attachskb().
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
                    u32 portid, int nonblock)
{
        struct sock *sk;
        int err;
        long timeo;

        skb = netlink_trim(skb, gfp_any());

        timeo = sock_sndtimeo(ssk, nonblock);
retry:
        sk = netlink_getsockbyportid(ssk, portid);
        if (IS_ERR(sk)) {
                kfree_skb(skb);
                return PTR_ERR(sk);
        }
        /* Kernel sockets are delivered to synchronously, without queueing. */
        if (netlink_is_kernel(sk))
                return netlink_unicast_kernel(sk, skb, ssk);

        /* A filtered message is dropped but reported to the sender as sent. */
        if (sk_filter(sk, skb)) {
                err = skb->len;
                kfree_skb(skb);
                sock_put(sk);
                return err;
        }

        err = netlink_attachskb(sk, skb, &timeo, ssk);
        /* 1: the reference on sk was dropped while waiting; look it up again. */
        if (err == 1)
                goto retry;
        /* < 0: netlink_attachskb() already freed the skb and dropped sk. */
        if (err)
                return err;

        return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

/*
 * Test whether the bit for multicast group @group is set in the listeners
 * bitmap of @sk's protocol.  @sk must be a kernel socket (BUG otherwise).
 *
 * Returns the bit's value, or 0 when there is no bitmap or @group - 1 is
 * not below the protocol's group count.
 */
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
        struct listeners *listeners;
        int found = 0;

        BUG_ON(!netlink_is_kernel(sk));

        rcu_read_lock();
        listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
        if (listeners) {
                /* group 0 wraps to a huge index and is rejected here. */
                if (group - 1 < nl_table[sk->sk_protocol].groups)
                        found = test_bit(group - 1, listeners->masks);
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

bool netlink_strict_get_check(struct sk_buff *skb)
{
        const struct netlink_sock *nlk = nlk_sk(NETLINK_CB(skb).sk);

        return nlk->flags & NETLINK_F_STRICT_CHK;
}
EXPORT_SYMBOL_GPL(netlink_strict_get_check);

/*
 * Try to queue broadcast message @skb on listener @sk.
 *
 * Returns -1 if the receive buffer has no room or the socket is marked
 * congested (the skb is then left to the caller).  Otherwise the skb is
 * queued and the result is 1 when the charged memory exceeds half of
 * sk_rcvbuf, else 0.
 */
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int rcvbuf, rmem;

        /* Charge first; the charge is undone below if it cannot stay. */
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
        rcvbuf = READ_ONCE(sk->sk_rcvbuf);

        /* rmem == truesize: nothing else is charged, always accept one skb. */
        if ((rmem != skb->truesize && rmem > rcvbuf) ||
            test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
                atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
                return -1;
        }

        netlink_skb_set_owner_r(skb, sk);
        __netlink_sendskb(sk, skb);

        return rmem > (rcvbuf >> 1);
}

/* State shared by the do_one_broadcast() calls of one broadcast. */
struct netlink_broadcast_data {
        /* socket that never receives the message (the sender) */
        struct sock *exclude_sk;
        /* network namespace of the sender */
        struct net *net;
        /* listeners bound to this portid are skipped */
        u32 portid;
        /* 1-based multicast group; bit group - 1 is tested in the bitmaps */
        u32 group;
        /* set once cloning failed; later listeners only get an overrun */
        int failure;
        /* set when a listener with NETLINK_F_BROADCAST_SEND_ERROR missed the message */
        int delivery_failure;
        /* OR of the non-negative netlink_broadcast_deliver() results */
        int congested;
        /* set once at least one listener got the message */
        int delivered;
        /* gfp flags used for cloning */
        gfp_t allocation;
        /* skb: the original message; skb2: copy prepared for the next delivery */
        struct sk_buff *skb, *skb2;
        /* optional filter; a non-zero return skips the listener */
        int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
        /* opaque argument passed to tx_filter */
        void *tx_data;
};

/*
 * Deliver the broadcast described by @p to one candidate listener @sk.
 *
 * Nothing is returned; the outcome is recorded in @p (failure,
 * delivery_failure, congested, delivered, and skb2, which holds the copy
 * prepared for delivery until a listener accepts it).
 */
static void do_one_broadcast(struct sock *sk,
                                    struct netlink_broadcast_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int val;

        if (p->exclude_sk == sk)
                return;

        /* Skip the excluded portid and sockets not subscribed to the group. */
        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                return;

        /* A listener in another namespace must have opted in, must have an
         * id for the sender's namespace, and its file must carry
         * CAP_NET_BROADCAST in the sender namespace's user namespace.
         */
        if (!net_eq(sock_net(sk), p->net)) {
                if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
                        return;

                if (!peernet_has_id(sock_net(sk), p->net))
                        return;

                if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
                                     CAP_NET_BROADCAST))
                        return;
        }

        /* An earlier clone failed: this listener only gets an overrun. */
        if (p->failure) {
                netlink_overrun(sk);
                return;
        }

        sock_hold(sk);
        /* Prepare a copy unless one is left over from an earlier listener. */
        if (p->skb2 == NULL) {
                if (skb_shared(p->skb)) {
                        p->skb2 = skb_clone(p->skb, p->allocation);
                } else {
                        p->skb2 = skb_get(p->skb);
                        /*
                         * skb ownership may have been set when
                         * delivered to a previous socket.
                         */
                        skb_orphan(p->skb2);
                }
        }
        if (p->skb2 == NULL) {
                netlink_overrun(sk);
                /* Clone failed. Notify ALL listeners. */
                p->failure = 1;
                if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
                        p->delivery_failure = 1;
                goto out;
        }
        /* Caller-supplied filter: non-zero means skip this listener. */
        if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }
        /* The listener's own socket filter rejected the message. */
        if (sk_filter(sk, p->skb2)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }
        /* Record the sender namespace's id as seen from the listener. */
        NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
        if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
                NETLINK_CB(p->skb2).nsid_is_set = true;
        val = netlink_broadcast_deliver(sk, p->skb2);
        if (val < 0) {
                /* Not queued: skb2 stays set for the next listener. */
                netlink_overrun(sk);
                if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
                        p->delivery_failure = 1;
        } else {
                p->congested |= val;
                p->delivered = 1;
                /* The copy now belongs to the listener's receive queue. */
                p->skb2 = NULL;
        }
out:
        sock_put(sk);
}

/*
 * Send @skb to the sockets on the multicast list of @ssk's protocol that
 * do_one_broadcast() accepts for @group.
 *
 * @ssk:         sending socket; never receives the message itself
 * @skb:         message; the caller's reference is consumed
 * @portid:      listeners bound to this portid are skipped
 * @group:       1-based multicast group number
 * @allocation:  gfp flags for trimming and cloning
 * @filter:      optional per-listener filter; non-zero return skips it
 * @filter_data: opaque argument passed to @filter
 *
 * Returns 0 if at least one listener got the message, -ENOBUFS if a
 * listener that set NETLINK_F_BROADCAST_SEND_ERROR missed it, and -ESRCH
 * if nobody received it.
 */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
        u32 group, gfp_t allocation,
        int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
        void *filter_data)
{
        struct net *net = sock_net(ssk);
        struct netlink_broadcast_data info;
        struct sock *sk;

        skb = netlink_trim(skb, allocation);

        info.exclude_sk = ssk;
        info.net = net;
        info.portid = portid;
        info.group = group;
        info.failure = 0;
        info.delivery_failure = 0;
        info.congested = 0;
        info.delivered = 0;
        info.allocation = allocation;
        info.skb = skb;
        info.skb2 = NULL;
        info.tx_filter = filter;
        info.tx_data = filter_data;

        /* While we sleep in clone, do not allow to change socket list */

        netlink_lock_table();

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                do_one_broadcast(sk, &info);

        /* Drop the reference handed in by the caller. */
        consume_skb(skb);

        netlink_unlock_table();

        /* info.skb2 is a copy no listener accepted (or NULL). */
        if (info.delivery_failure) {
                kfree_skb(info.skb2);
                return -ENOBUFS;
        }
        consume_skb(info.skb2);

        if (info.delivered) {
                /* Yield to congested receivers if the caller may block. */
                if (info.congested && gfpflags_allow_blocking(allocation))
                        yield();
                return 0;
        }
        return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

/*
 * Broadcast @skb to multicast group @group without a per-listener filter.
 * Calls netlink_broadcast_filtered() with a NULL filter and NULL filter
 * data, and returns its result.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
                      u32 group, gfp_t allocation)
{
        return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
                NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);

/* Parameters shared by the do_one_set_err() calls of one netlink_set_err(). */
struct netlink_set_err_data {
        /* socket that is skipped; its namespace limits the listeners reached */
        struct sock *exclude_sk;
        /* listeners bound to this portid are skipped */
        u32 portid;
        /* 1-based multicast group; bit group - 1 is tested in the bitmaps */
        u32 group;
        /* positive errno value stored in sk->sk_err */
        int code;
};

/*
 * Report error p->code on one multicast listener.
 *
 * The listener is left alone when it is the excluded socket, lives in a
 * different network namespace than the excluded socket, is bound to
 * p->portid, or is not subscribed to p->group.
 *
 * Returns 1 when the error is ENOBUFS and the listener set
 * NETLINK_F_RECV_NO_ENOBUFS (the error is then not delivered), 0 in every
 * other case.
 */
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret = 0;

        if (sk == p->exclude_sk)
                goto out;

        if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
                goto out;

        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                goto out;

        if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) {
                ret = 1;
                goto out;
        }

        /* The socket lock is not taken here, so the store races with other
         * accesses to sk_err; mark it instead of using a plain write.
         */
        WRITE_ONCE(sk->sk_err, p->code);
        sk->sk_error_report(sk);
out:
        return ret;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_NO_ENOBUFS socket option. Such listeners are only counted when
 * @code is -ENOBUFS, in which case the error is not delivered to them.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
        struct netlink_set_err_data info;
        unsigned long flags;
        struct sock *sk;
        int ret = 0;

        info.exclude_sk = ssk;
        info.portid = portid;
        info.group = group;
        /* sk->sk_err wants a positive error value */
        info.code = -code;

        /* Walk the multicast list under the table read lock, IRQs disabled. */
        read_lock_irqsave(&nl_table_lock, flags);

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                ret += do_one_set_err(sk, &info);

        read_unlock_irqrestore(&nl_table_lock, flags);
        return ret;
}
EXPORT_SYMBOL(netlink_set_err);

/*
 * Subscribe (@is_new != 0) or unsubscribe (@is_new == 0) @nlk to multicast
 * group @group, then refresh its subscription count and the listeners.
 *
 * Must be called with the netlink table grabbed.
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
                                     unsigned int group,
                                     int is_new)
{
        const unsigned int bit = group - 1;
        const int want = !!is_new;
        int had, subscriptions;

        /* Sample the old membership before the bit is rewritten. */
        had = test_bit(bit, nlk->groups);
        subscriptions = nlk->subscriptions - had + want;

        if (want)
                __set_bit(bit, nlk->groups);
        else
                __clear_bit(bit, nlk->groups);

        netlink_update_subscriptions(&nlk->sk, subscriptions);
        netlink_update_listeners(&nlk->sk);
}

/*
 * setsockopt() handler for SOL_NETLINK options.
 *
 * @sock:    socket being configured
 * @level:   must be SOL_NETLINK
 * @optname: NETLINK_* option
 * @optval:  option value; read as an unsigned int
 * @optlen:  length of @optval; below sizeof(int) the value is taken as 0
 *
 * Returns 0 on success; -ENOPROTOOPT for another level or an unknown
 * option; -EFAULT if @optval cannot be copied; -EPERM or -EINVAL from the
 * membership and LISTEN_ALL_NSID checks; or the error returned by
 * netlink_realloc_groups() or the protocol's bind callback.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
                              sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int val = 0;
        int err;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        /* A short optlen is not an error here: val simply stays 0. */
        if (optlen >= sizeof(int) &&
            copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        switch (optname) {
        case NETLINK_PKTINFO:
                if (val)
                        nlk->flags |= NETLINK_F_RECV_PKTINFO;
                else
                        nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
                err = 0;
                break;
        case NETLINK_ADD_MEMBERSHIP:
        case NETLINK_DROP_MEMBERSHIP: {
                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
                /* val is the 1-based group number and must be in range. */
                if (!val || val - 1 >= nlk->ngroups)
                        return -EINVAL;
                /* The protocol's bind callback may veto joining the group. */
                if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
                        err = nlk->netlink_bind(sock_net(sk), val);
                        if (err)
                                return err;
                }
                netlink_table_grab();
                netlink_update_socket_mc(nlk, val,
                                         optname == NETLINK_ADD_MEMBERSHIP);
                netlink_table_ungrab();
                /* The unbind callback runs after the membership is gone. */
                if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
                        nlk->netlink_unbind(sock_net(sk), val);

                err = 0;
                break;
        }
        case NETLINK_BROADCAST_ERROR:
                if (val)
                        nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
                else
                        nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
                err = 0;
                break;
        case NETLINK_NO_ENOBUFS:
                if (val) {
                        nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS;
                        /* Clear the congested state and wake waiting senders. */
                        clear_bit(NETLINK_S_CONGESTED, &nlk->state);
                        wake_up_interruptible(&nlk->wait);
                } else {
                        nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
                }
                err = 0;
                break;
        case NETLINK_LISTEN_ALL_NSID:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
                        return -EPERM;

                if (val)
                        nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
                else
                        nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
                err = 0;
                break;
        case NETLINK_CAP_ACK:
                if (val)
                        nlk->flags |= NETLINK_F_CAP_ACK;
                else
                        nlk->flags &= ~NETLINK_F_CAP_ACK;
                err = 0;
                break;
        case NETLINK_EXT_ACK:
                if (val)
                        nlk->flags |= NETLINK_F_EXT_ACK;
                else
                        nlk->flags &= ~NETLINK_F_EXT_ACK;
                err = 0;
                break;
        case NETLINK_GET_STRICT_CHK:
                if (val)
                        nlk->flags |= NETLINK_F_STRICT_CHK;
                else
                        nlk->flags &= ~NETLINK_F_STRICT_CHK;
                err = 0;
                break;
        default:
                err = -ENOPROTOOPT;
        }
        return err;
}

/*
 * getsockopt() handler for SOL_NETLINK options.
 *
 * Flag options are returned as an int holding 0 or 1, with *@optlen set to
 * sizeof(int).  NETLINK_LIST_MEMBERSHIPS copies the socket's group bitmap
 * in u32 units, as many as fit in the buffer, and always stores the size
 * needed for the whole bitmap in *@optlen.
 *
 * Returns 0 on success; -ENOPROTOOPT for another level or an unknown
 * option; -EFAULT on a failed user access; -EINVAL for a negative length
 * or a flag buffer shorter than an int.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int flag;
        int len, val;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Most options only select the flag bit reported after the switch. */
        switch (optname) {
        case NETLINK_PKTINFO:
                flag = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_BROADCAST_ERROR:
                flag = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                flag = NETLINK_F_RECV_NO_ENOBUFS;
                break;
        case NETLINK_LIST_MEMBERSHIPS: {
                int pos, idx, shift, err = 0;

                netlink_lock_table();
                /* pos is a byte offset into the bitmap, advanced one u32 at a time. */
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        /* Stop once the user buffer cannot take another u32. */
                        if (len - pos < sizeof(u32))
                                break;

                        /* Locate the u32 inside the unsigned long bitmap words. */
                        idx = pos / sizeof(unsigned long);
                        shift = (pos % sizeof(unsigned long)) * 8;
                        if (put_user((u32)(nlk->groups[idx] >> shift),
                                     (u32 __user *)(optval + pos))) {
                                err = -EFAULT;
                                break;
                        }
                }
                /* Report the full bitmap size even if less was copied. */
                if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen))
                        err = -EFAULT;
                netlink_unlock_table();
                return err;
        }
        case NETLINK_CAP_ACK:
                flag = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                flag = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                flag = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        val = nlk->flags & flag ? 1 : 0;

        if (put_user(len, optlen) ||
            copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/*
 * Attach a NETLINK_PKTINFO control message to @msg that carries the
 * destination group stored in @skb's netlink control block.
 */
static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
        struct nl_pktinfo info;

        info.group = NETLINK_CB(skb).dst_group;
        put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

/*
 * Attach a NETLINK_LISTEN_ALL_NSID control message carrying the namespace
 * id stored in @skb's control block.  Nothing is attached when no id was
 * recorded for the skb.
 */
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
                                         struct sk_buff *skb)
{
        if (NETLINK_CB(skb).nsid_is_set)
                put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
                         sizeof(int), &NETLINK_CB(skb).nsid);
}

/*
 * sendmsg() handler for netlink sockets.
 *
 * The destination comes from msg->msg_name when one is supplied, otherwise
 * from the destination stored by netlink_connect().  The payload is copied
 * into a new skb that is broadcast to the destination group (if any) and
 * then unicast to the destination portid.
 *
 * Returns the result of netlink_unicast() (the broadcast result is not
 * reported), or a negative error: -EOPNOTSUPP for MSG_OOB, -ENODATA for an
 * empty message, -EINVAL or -EPERM for a bad or forbidden address,
 * -EMSGSIZE, -ENOBUFS, -EFAULT, or an error from scm_send(),
 * netlink_autobind() or security_netlink_send().
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
        u32 dst_portid;
        u32 dst_group;
        struct sk_buff *skb;
        int err;
        struct scm_cookie scm;
        u32 netlink_skb_flags = 0;

        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        if (len == 0) {
                pr_warn_once("Zero length message leads to an empty skb\n");
                return -ENODATA;
        }

        /* From here on, every exit goes through 'out' to release scm. */
        err = scm_send(sock, msg, &scm, true);
        if (err < 0)
                return err;

        if (msg->msg_namelen) {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_nl))
                        goto out;
                if (addr->nl_family != AF_NETLINK)
                        goto out;
                dst_portid = addr->nl_pid;
                /* ffs(): 1-based index of the lowest set bit in the mask. */
                dst_group = ffs(addr->nl_groups);
                err =  -EPERM;
                if ((dst_group || dst_portid) &&
                    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                        goto out;
                /* Remember that an explicit destination was given. */
                netlink_skb_flags |= NETLINK_SKB_DST;
        } else {
                /* Paired with WRITE_ONCE() in netlink_connect() */
                dst_portid = READ_ONCE(nlk->dst_portid);
                dst_group = READ_ONCE(nlk->dst_group);
        }

        /* Paired with WRITE_ONCE() in netlink_insert() */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        goto out;
        } else {
                /* Ensure nlk is hashed and visible. */
                smp_rmb();
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;
        err = -ENOBUFS;
        /* A non-zero dst_group makes the allocator use plain alloc_skb(). */
        skb = netlink_alloc_large_skb(len, dst_group);
        if (skb == NULL)
                goto out;

        NETLINK_CB(skb).portid        = nlk->portid;
        NETLINK_CB(skb).dst_group = dst_group;
        NETLINK_CB(skb).creds        = scm.creds;
        NETLINK_CB(skb).flags        = netlink_skb_flags;

        err = -EFAULT;
        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
                kfree_skb(skb);
                goto out;
        }

        err = security_netlink_send(sk, skb);
        if (err) {
                kfree_skb(skb);
                goto out;
        }

        if (dst_group) {
                /* Extra reference: the broadcast consumes one, the unicast
                 * below consumes the other.
                 */
                refcount_inc(&skb->users);
                netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
        }
        err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);

out:
        scm_destroy(&scm);
        return err;
}

/*
 * recvmsg() handler for netlink sockets.
 *
 * Dequeues one message, copies up to @len bytes of it into @msg, fills in
 * the sender address and the requested control messages, and continues a
 * running dump once the receive queue has drained to half of sk_rcvbuf.
 *
 * Returns the number of bytes copied (the full message length when the
 * caller passed MSG_TRUNC), -EOPNOTSUPP for MSG_OOB, or the error from
 * skb_recv_datagram() or skb_copy_datagram_msg().
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                           int flags)
{
        struct scm_cookie scm;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        int noblock = flags & MSG_DONTWAIT;
        size_t copied, max_recvmsg_len;
        struct sk_buff *skb, *data_skb;
        int err, ret;

        if (flags & MSG_OOB)
                return -EOPNOTSUPP;

        copied = 0;

        /* On failure skb is NULL and err holds the reason. */
        skb = skb_recv_datagram(sk, flags, noblock, &err);
        if (skb == NULL)
                goto out;

        data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                /*
                 * If this skb has a frag_list, then here that means that we
                 * will have to use the frag_list skb's data for compat tasks
                 * and the regular skb's data for normal (non-compat) tasks.
                 *
                 * If we need to send the compat skb, assign it to the
                 * 'data_skb' variable so that it will be used below for data
                 * copying. We keep 'skb' for everything else, including
                 * freeing both later.
                 */
                if (flags & MSG_CMSG_COMPAT)
                        data_skb = skb_shinfo(skb)->frag_list;
        }
#endif

        /* Record the max length of recvmsg() calls for future allocations */
        max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
        max_recvmsg_len = min_t(size_t, max_recvmsg_len,
                                SKB_WITH_OVERHEAD(32768));
        WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);

        /* Copy at most len bytes; flag truncation when the message is longer. */
        copied = data_skb->len;
        if (len < copied) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(data_skb, 0, msg, copied);

        if (msg->msg_name) {
                DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
                addr->nl_family = AF_NETLINK;
                addr->nl_pad    = 0;
                addr->nl_pid        = NETLINK_CB(skb).portid;
                addr->nl_groups        = netlink_group_mask(NETLINK_CB(skb).dst_group);
                msg->msg_namelen = sizeof(*addr);
        }

        if (nlk->flags & NETLINK_F_RECV_PKTINFO)
                netlink_cmsg_recv_pktinfo(msg, skb);
        if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
                netlink_cmsg_listen_all_nsid(sk, msg, skb);

        /* Take the credentials out of the skb before it is freed. */
        memset(&scm, 0, sizeof(scm));
        scm.creds = *NETLINK_CREDS(skb);
        /* A caller passing MSG_TRUNC gets the real message length back. */
        if (flags & MSG_TRUNC)
                copied = data_skb->len;

        skb_free_datagram(sk, skb);

        /* Continue a running dump once half the receive buffer is free. */
        if (READ_ONCE(nlk->cb_running) &&
            atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
                ret = netlink_dump(sk, false);
                if (ret) {
                        /* The socket lock is not held here, so the store
                         * races with other accesses to sk_err; mark it.
                         */
                        WRITE_ONCE(sk->sk_err, -ret);
                        sk->sk_error_report(sk);
                }
        }

        scm_recv(sock, msg, &scm, flags);
out:
        netlink_rcv_wake(sk);
        return err ? : copied;
}

/*
 * sk_data_ready callback installed on kernel sockets by
 * __netlink_kernel_create().  It is a bug for it to run at all: it does
 * nothing but BUG().
 */
static void netlink_data_ready(struct sock *sk)
{
        BUG();
}

/*
 *        We export these functions to other modules. They provide a
 *        complete set of kernel non-blocking support for message
 *        queueing.
 */

/*
 * Create the kernel-side socket for netlink protocol @unit in @net.
 *
 * @net:    network namespace of the socket
 * @unit:   protocol number, 0 <= @unit < MAX_LINKS
 * @module: stored in the protocol's table entry on first registration
 * @cfg:    optional configuration (groups, input, cb_mutex, bind, unbind,
 *          flags, compare); may be NULL
 *
 * The first socket created for a protocol fills in that protocol's
 * nl_table entry; later ones only bump its registration count.
 *
 * Returns the new socket or NULL on failure.
 */
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
                        struct netlink_kernel_cfg *cfg)
{
        struct socket *sock;
        struct sock *sk;
        struct netlink_sock *nlk;
        struct listeners *listeners = NULL;
        struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
        unsigned int groups;

        BUG_ON(!nl_table);

        if (unit < 0 || unit >= MAX_LINKS)
                return NULL;

        if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
                return NULL;

        if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
                goto out_sock_release_nosk;

        sk = sock->sk;

        /* At least 32 groups are always provided. */
        if (!cfg || cfg->groups < 32)
                groups = 32;
        else
                groups = cfg->groups;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                goto out_sock_release;

        sk->sk_data_ready = netlink_data_ready;
        if (cfg && cfg->input)
                nlk_sk(sk)->netlink_rcv = cfg->input;

        /* The kernel socket is inserted with portid 0. */
        if (netlink_insert(sk, 0))
                goto out_sock_release;

        nlk = nlk_sk(sk);
        nlk->flags |= NETLINK_F_KERNEL_SOCKET;

        netlink_table_grab();
        if (!nl_table[unit].registered) {
                /* First registration: publish the protocol's parameters. */
                nl_table[unit].groups = groups;
                rcu_assign_pointer(nl_table[unit].listeners, listeners);
                nl_table[unit].cb_mutex = cb_mutex;
                nl_table[unit].module = module;
                if (cfg) {
                        nl_table[unit].bind = cfg->bind;
                        nl_table[unit].unbind = cfg->unbind;
                        nl_table[unit].flags = cfg->flags;
                        if (cfg->compare)
                                nl_table[unit].compare = cfg->compare;
                }
                nl_table[unit].registered = 1;
        } else {
                /* Already registered: the existing listeners bitmap stays. */
                kfree(listeners);
                nl_table[unit].registered++;
        }
        netlink_table_ungrab();
        return sk;

out_sock_release:
        kfree(listeners);
        netlink_kernel_release(sk);
        return NULL;

out_sock_release_nosk:
        sock_release(sock);
        return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

/*
 * Release a kernel netlink socket by releasing its struct socket.
 * A NULL @sk, or one without an attached socket, is ignored.
 */
void
netlink_kernel_release(struct sock *sk)
{
        if (sk && sk->sk_socket)
                sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

/*
 * Set the number of multicast groups of the family @sk belongs to.
 *
 * @groups is raised to 32 if smaller. When the listeners bitmap needed for
 * @groups is larger than the current one, a bigger zeroed bitmap is
 * allocated (GFP_ATOMIC), the existing masks are copied into it, it is
 * published with rcu_assign_pointer() and the previous one is freed via
 * kfree_rcu(). netlink_change_ngroups() calls this between
 * netlink_table_grab() and netlink_table_ungrab().
 *
 * Returns 0 on success, -ENOMEM if the larger bitmap cannot be allocated.
 */
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];

        if (groups < 32)
                groups = 32;

        /* Reallocate only when the bitmap really has to grow. */
        if (NLGRPSZ(groups) > NLGRPSZ(tbl->groups)) {
                struct listeners *grown, *cur;

                grown = kzalloc(sizeof(*grown) + NLGRPSZ(groups), GFP_ATOMIC);
                if (!grown)
                        return -ENOMEM;

                cur = nl_deref_protected(tbl->listeners);
                memcpy(grown->masks, cur->masks, NLGRPSZ(tbl->groups));
                rcu_assign_pointer(tbl->listeners, grown);

                kfree_rcu(cur, rcu);
        }

        tbl->groups = groups;
        return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * Return: the result of __netlink_change_ngroups(), which is run with the
 * netlink table grabbed.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        int ret;

        netlink_table_grab();
        ret = __netlink_change_ngroups(sk, groups);
        netlink_table_ungrab();

        return ret;
}

/*
 * Walk every socket on the multicast list of the family that @ksk belongs
 * to and call netlink_update_socket_mc() on it for @group with a new value
 * of 0.
 */
void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
        struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
        struct hlist_node *next;
        struct sock *member;

        sk_for_each_bound_safe(member, next, &tbl->mc_list)
                netlink_update_socket_mc(nlk_sk(member), group, 0);
}

/*
 * Append a netlink message header plus room for @len payload bytes to @skb.
 *
 * The header fields are filled from @type, @flags, @portid and @seq, and
 * nlmsg_len is set to the unaligned message size. The alignment padding
 * that follows the payload is zeroed. Returns the new header.
 */
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
        int size = nlmsg_msg_size(len);
        struct nlmsghdr *hdr = skb_put(skb, NLMSG_ALIGN(size));

        hdr->nlmsg_len = size;
        hdr->nlmsg_type = type;
        hdr->nlmsg_flags = flags;
        hdr->nlmsg_seq = seq;
        hdr->nlmsg_pid = portid;

        /* Clear the pad bytes between the payload end and the aligned end. */
        if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
                memset(nlmsg_data(hdr) + len, 0, NLMSG_ALIGN(size) - size);

        return hdr;
}
EXPORT_SYMBOL(__nlmsg_put);

/*
 * This looks a bit ugly.
 * It would be better to create a kernel thread.
 */

/*
 * Append the terminating NLMSG_DONE message of a dump to @skb.
 *
 * The payload is nlk->dump_done_errno. If @extack carries a message and
 * the socket has NETLINK_F_EXT_ACK set, NLM_F_ACK_TLVS is flagged and the
 * message string is appended as NLMSGERR_ATTR_MSG; the message length is
 * only finalised when that attribute fits.
 *
 * Returns 0, or -ENOBUFS (with a warning) if the header does not fit.
 */
static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct netlink_ext_ack *extack)
{
        struct nlmsghdr *done;

        done = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno),
                                NLM_F_MULTI | cb->answer_flags);
        if (WARN_ON(!done))
                return -ENOBUFS;

        nl_dump_check_consistent(cb, done);
        memcpy(nlmsg_data(done), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno));

        /* Nothing more to add unless there is a message the user asked for. */
        if (!extack->_msg || !(nlk->flags & NETLINK_F_EXT_ACK))
                return 0;

        done->nlmsg_flags |= NLM_F_ACK_TLVS;
        if (nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg) == 0)
                nlmsg_end(skb, done);

        return 0;
}

/*
 * Produce the next chunk of an in-progress dump for @sk and queue it on the
 * socket.
 *
 * @sk:         socket whose dump callback (nlk->cb) is to be advanced
 * @lock_taken: true when the caller already holds nlk->cb_mutex; otherwise
 *              the mutex is taken here
 *
 * nlk->cb_mutex is released on every return path, whoever took it.
 *
 * Returns 0 when an skb was handed to the socket (or dropped by the socket
 * filter), -EINVAL when no dump is running, and -ENOBUFS when no skb could
 * be allocated, the receive buffer limit is exceeded, or the final
 * NLMSG_DONE message could not be added.
 */
static int netlink_dump(struct sock *sk, bool lock_taken)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        struct netlink_ext_ack extack = {};
        struct netlink_callback *cb;
        struct sk_buff *skb = NULL;
        unsigned int rmem, rcvbuf;
        size_t max_recvmsg_len;
        struct module *module;
        int err = -ENOBUFS;
        int alloc_min_size;
        int alloc_size;

        if (!lock_taken)
                mutex_lock(nlk->cb_mutex);
        if (!nlk->cb_running) {
                err = -EINVAL;
                goto errout_skb;
        }

        /* NLMSG_GOODSIZE is small to avoid high order allocations being
         * required, but it makes sense to _attempt_ a 16K bytes allocation
         * to reduce number of system calls on dump operations, if user
         * ever provided a big enough buffer.
         */
        cb = &nlk->cb;
        alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

        max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
        if (alloc_min_size < max_recvmsg_len) {
                /* Opportunistic large allocation: no direct reclaim, no
                 * retry and no warning, so a failure is cheap and falls
                 * through to the minimum-size allocation below.
                 */
                alloc_size = max_recvmsg_len;
                skb = alloc_skb(alloc_size,
                                (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
                                __GFP_NOWARN | __GFP_NORETRY);
        }
        if (!skb) {
                alloc_size = alloc_min_size;
                skb = alloc_skb(alloc_size, GFP_KERNEL);
        }
        if (!skb)
                goto errout_skb;

        /* Charge the skb to the receive buffer; back out if that pushes a
         * non-empty queue to or past sk_rcvbuf.
         */
        rcvbuf = READ_ONCE(sk->sk_rcvbuf);
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
        if (rmem != skb->truesize && rmem >= rcvbuf) {
                atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
                goto errout_skb;
        }

        /* Trim skb to allocated size. User is expected to provide buffer as
         * large as max(min_dump_alloc, 16KiB (max_recvmsg_len capped at
         * netlink_recvmsg())). dump will pack as many smaller messages as
         * could fit within the allocated skb. skb is typically allocated
         * with larger space than required (could be as much as near 2x the
         * requested size with align to next power of 2 approach). Allowing
         * dump to use the excess space makes it difficult for a user to have a
         * reasonable static buffer based on the expected largest dump of a
         * single netdev. The outcome is MSG_TRUNC error.
         */
        skb_reserve(skb, skb_tailroom(skb) - alloc_size);

        /* Make sure malicious BPF programs can not read uninitialized memory
         * from skb->head -> skb->data
         */
        skb_reset_network_header(skb);
        skb_reset_mac_header(skb);

        netlink_skb_set_owner_r(skb, sk);

        /* Only call the dump callback while it still reports more data;
         * extack is exposed to it just for the duration of the call.
         */
        if (nlk->dump_done_errno > 0) {
                cb->extack = &extack;
                nlk->dump_done_errno = cb->dump(skb, cb);
                cb->extack = NULL;
        }

        /* More data to come, or no room left for the NLMSG_DONE message:
         * deliver what we have and leave the dump running.
         */
        if (nlk->dump_done_errno > 0 ||
            skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
                mutex_unlock(nlk->cb_mutex);

                if (sk_filter(sk, skb))
                        kfree_skb(skb);
                else
                        __netlink_sendskb(sk, skb);
                return 0;
        }

        if (netlink_dump_done(nlk, skb, cb, &extack))
                goto errout_skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        /* frag_list skb's data is used for compat tasks
         * and the regular skb's data for normal (non-compat) tasks.
         * See netlink_recvmsg().
         */
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
                        goto errout_skb;
        }
#endif

        if (sk_filter(sk, skb))
                kfree_skb(skb);
        else
                __netlink_sendskb(sk, skb);

        if (cb->done)
                cb->done(cb);

        /* Dump finished: clear the running flag, then drop the module
         * reference and the request skb recorded in the callback after the
         * mutex has been released.
         */
        WRITE_ONCE(nlk->cb_running, false);
        module = cb->module;
        skb = cb->skb;
        mutex_unlock(nlk->cb_mutex);
        module_put(module);
        consume_skb(skb);
        return 0;

errout_skb:
        mutex_unlock(nlk->cb_mutex);
        kfree_skb(skb);
        return err;
}

/*
 * Start a dump on the socket that sent @skb.
 *
 * @ssk:     kernel socket the request arrived on (supplies net and protocol)
 * @skb:     request skb; an extra reference is taken here and stored in the
 *           callback state, or dropped again on failure
 * @nlh:     request header, stored in the callback state
 * @control: dump/start/done callbacks, private data, owning module and
 *           minimum dump allocation size
 *
 * The requesting socket is looked up by the sender portid of @skb. Under its
 * cb_mutex the callback state is initialised, control->start() (if any) is
 * run, the dump is marked running and the first chunk is produced with
 * netlink_dump(), which releases the mutex.
 *
 * Returns -EINTR when the dump was started (so the caller does not send an
 * ACK), -ECONNREFUSED if the socket is not found, -EBUSY if a dump is
 * already running, -EPROTONOSUPPORT if the module reference cannot be taken,
 * or the error from control->start() or netlink_dump().
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         struct netlink_dump_control *control)
{
        struct netlink_sock *nlk, *nlk2;
        struct netlink_callback *cb;
        struct sock *sk;
        int ret;

        refcount_inc(&skb->users);

        sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
        if (sk == NULL) {
                ret = -ECONNREFUSED;
                goto error_free;
        }

        nlk = nlk_sk(sk);
        mutex_lock(nlk->cb_mutex);
        /* A dump is in progress... */
        if (nlk->cb_running) {
                ret = -EBUSY;
                goto error_unlock;
        }
        /* add reference of module which cb->dump belongs to */
        if (!try_module_get(control->module)) {
                ret = -EPROTONOSUPPORT;
                goto error_unlock;
        }

        cb = &nlk->cb;
        memset(cb, 0, sizeof(*cb));
        cb->dump = control->dump;
        cb->done = control->done;
        cb->nlh = nlh;
        cb->data = control->data;
        cb->module = control->module;
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->skb = skb;

        /* Strict checking is a property of the socket attached to the skb. */
        nlk2 = nlk_sk(NETLINK_CB(skb).sk);
        cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK);

        if (control->start) {
                ret = control->start(cb);
                if (ret)
                        goto error_put;
        }

        WRITE_ONCE(nlk->cb_running, true);
        nlk->dump_done_errno = INT_MAX;

        /* netlink_dump() drops cb_mutex for us (lock_taken == true). */
        ret = netlink_dump(sk, true);

        sock_put(sk);

        if (ret)
                return ret;

        /* We successfully started a dump, by returning -EINTR we
         * signal not to send ACK even if it was requested.
         */
        return -EINTR;

error_put:
        module_put(control->module);
error_unlock:
        /* Unlock while our reference still pins the socket: nlk lives
         * inside sk, so it must not be touched after sock_put().
         */
        mutex_unlock(nlk->cb_mutex);
        sock_put(sk);
error_free:
        kfree_skb(skb);
        return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

/*
 * Send an NLMSG_ERROR reply for @nlh back to the sender of @in_skb.
 *
 * @in_skb: skb containing the request
 * @nlh:    header of the request being answered
 * @err:    error code to report; 0 produces a plain acknowledgement
 * @extack: optional extended ACK data (message, bad attribute, cookie,
 *          policy); only used when the sender set NETLINK_F_EXT_ACK
 *
 * On error the original request is appended unless the sender set
 * NETLINK_F_CAP_ACK; otherwise only the request header is echoed and
 * NLM_F_CAPPED is set. The bad-attribute offset and policy attributes are
 * error-only: room for them is reserved, and they are written, only when
 * @err is non-zero. If the reply cannot be allocated, ENOBUFS is recorded
 * on the sender's socket and reported via sk_error_report().
 */
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack)
{
        struct sk_buff *skb;
        struct nlmsghdr *rep;
        struct nlmsgerr *errmsg;
        size_t payload = sizeof(*errmsg);
        size_t tlvlen = 0;
        struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
        unsigned int flags = 0;
        bool nlk_has_extack = nlk->flags & NETLINK_F_EXT_ACK;

        /* Error messages get the original request appended, unless the user
         * requests to cap the error message, and get extra error data if
         * requested.
         */
        if (nlk_has_extack && extack && extack->_msg)
                tlvlen += nla_total_size(strlen(extack->_msg) + 1);

        if (err && !(nlk->flags & NETLINK_F_CAP_ACK))
                payload += nlmsg_len(nlh);
        else
                flags |= NLM_F_CAPPED;
        if (err && nlk_has_extack && extack && extack->bad_attr)
                tlvlen += nla_total_size(sizeof(u32));
        if (nlk_has_extack && extack && extack->cookie_len)
                tlvlen += nla_total_size(extack->cookie_len);
        if (err && nlk_has_extack && extack && extack->policy)
                tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy);

        if (tlvlen)
                flags |= NLM_F_ACK_TLVS;

        skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
        if (!skb) {
                NETLINK_CB(in_skb).sk->sk_err = ENOBUFS;
                NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk);
                return;
        }

        rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                          NLMSG_ERROR, payload, flags);
        errmsg = nlmsg_data(rep);
        errmsg->error = err;
        /* Echo the whole request when it was accounted for above, else
         * just its header.
         */
        memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh));

        if (nlk_has_extack && extack) {
                if (extack->_msg) {
                        WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG,
                                               extack->_msg));
                }
                if (err && extack->bad_attr &&
                    !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
                             (u8 *)extack->bad_attr >= in_skb->data +
                                                       in_skb->len))
                        WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
                                            (u8 *)extack->bad_attr -
                                            (u8 *)nlh));
                if (extack->cookie_len)
                        WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
                                        extack->cookie_len, extack->cookie));
                /* Space for the policy attribute is only reserved when
                 * @err is set, so only write it in that case.
                 */
                if (err && extack->policy)
                        netlink_policy_dump_write_attr(skb, extack->policy,
                                                       NLMSGERR_ATTR_POLICY);
        }

        nlmsg_end(skb, rep);

        nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
}
EXPORT_SYMBOL(netlink_ack);

/*
 * Walk all netlink messages contained in @skb and hand each request to @cb.
 *
 * Processing stops silently at the first message whose length field is
 * smaller than a header or larger than what is left in @skb. Messages that
 * are not requests, or whose type is below NLMSG_MIN_TYPE, are not passed
 * to @cb. An ACK is sent when the message asked for one (NLM_F_ACK) or @cb
 * returned an error; a return of -EINTR from @cb suppresses the ACK.
 *
 * Always returns 0.
 */
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
                                                   struct nlmsghdr *,
                                                   struct netlink_ext_ack *))
{
        struct netlink_ext_ack extack;
        struct nlmsghdr *nlh;
        int err;

        while (skb->len >= nlmsg_total_size(0)) {
                int msglen;

                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);

                if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
                        return 0;

                /* Only requests that are not control messages reach cb. */
                if ((nlh->nlmsg_flags & NLM_F_REQUEST) &&
                    nlh->nlmsg_type >= NLMSG_MIN_TYPE)
                        err = cb(skb, nlh, &extack);
                else
                        err = 0;

                /* -EINTR means "no ACK, even if one was requested". */
                if (err != -EINTR &&
                    ((nlh->nlmsg_flags & NLM_F_ACK) || err))
                        netlink_ack(skb, nlh, err, &extack);

                /* Advance past this message, never beyond the skb's end. */
                msglen = NLMSG_ALIGN(nlh->nlmsg_len);
                if (msglen > skb->len)
                        msglen = skb->len;
                skb_pull(skb, msglen);
        }

        return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 *
 * Return: the multicast error if there is one (-ESRCH is treated as
 * success), otherwise the result of the unicast report, otherwise 0.
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags)
{
        int err = 0;
        int report_err;

        if (group) {
                int exclude_portid = 0;

                /* The skb is sent twice when reporting: take an extra
                 * reference and keep the requester out of the multicast.
                 */
                if (report) {
                        exclude_portid = portid;
                        refcount_inc(&skb->users);
                }

                /* errors reported via destination sk->sk_err, but propagate
                 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
                err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
                if (err == -ESRCH)
                        err = 0;
        }

        if (!report)
                return err;

        report_err = nlmsg_unicast(sk, skb, portid);

        return err ? err : report_err;
}
EXPORT_SYMBOL(nlmsg_notify);

#ifdef CONFIG_PROC_FS
/*
 * Private state of the seq_file iterator over netlink sockets (its size is
 * what netlink_net_init() passes to proc_create_net()).
 */
struct nl_seq_iter {
        struct seq_net_private p;
        /* rhashtable walker over nl_table[link].hash */
        struct rhashtable_iter hti;
        /* index into nl_table of the hash currently being walked */
        int link;
};

/* Begin walking the socket hash of the table entry selected by iter->link. */
static void netlink_walk_start(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_enter(&nl_table[iter->link].hash, walker);
        rhashtable_walk_start(walker);
}

/* Counterpart of netlink_walk_start(): stop the walk, then exit it. */
static void netlink_walk_stop(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_stop(walker);
        rhashtable_walk_exit(walker);
}

/*
 * Return the next netlink socket that belongs to the network namespace of
 * @seq, moving on to the following nl_table entry whenever the current hash
 * is exhausted.
 *
 * Returns the socket, NULL once all MAX_LINKS tables have been walked (the
 * walk is then already stopped), or an ERR_PTR from the rhashtable walker.
 * -EAGAIN from the walker is retried.
 */
static void *__netlink_seq_next(struct seq_file *seq)
{
        struct nl_seq_iter *iter = seq->private;
        struct netlink_sock *nlk;

        for (;;) {
                nlk = rhashtable_walk_next(&iter->hti);

                if (IS_ERR(nlk)) {
                        if (PTR_ERR(nlk) != -EAGAIN)
                                return nlk;
                        continue;
                }

                if (!nlk) {
                        /* This table is done; switch to the next one. */
                        netlink_walk_stop(iter);
                        if (++iter->link >= MAX_LINKS)
                                return NULL;

                        netlink_walk_start(iter);
                        continue;
                }

                if (sock_net(&nlk->sk) == seq_file_net(seq))
                        return nlk;
        }
}

/*
 * seq_file ->start: restart the walk at the first table and skip forward
 * to position *@posp. Position 0 yields SEQ_START_TOKEN; the skip stops
 * early on NULL or an error pointer, which is then returned.
 */
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
        __acquires(RCU)
{
        struct nl_seq_iter *iter = seq->private;
        void *cursor = SEQ_START_TOKEN;
        loff_t remaining = *posp;

        iter->link = 0;

        netlink_walk_start(iter);

        while (remaining && cursor && !IS_ERR(cursor)) {
                cursor = __netlink_seq_next(seq);
                remaining--;
        }

        return cursor;
}

/* seq_file ->next: bump the position and fetch the following socket. */
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        return __netlink_seq_next(seq);
}

/*
 * End the hash walk unless __netlink_seq_next() already did so when it ran
 * past the last table (iter->link >= MAX_LINKS).
 */
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
        struct nl_seq_iter *iter = seq->private;

        if (iter->link < MAX_LINKS)
                netlink_walk_stop(iter);
}


/*
 * Print one line of the netlink seq file: the column header for
 * SEQ_START_TOKEN, otherwise the fields of the socket @v. Always returns 0.
 */
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
        struct netlink_sock *nlk;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "sk               Eth Pid        Groups   "
                         "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
                return 0;
        }

        s = v;
        nlk = nlk_sk(s);

        seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n",
                   s,
                   s->sk_protocol,
                   nlk->portid,
                   nlk->groups ? (u32)nlk->groups[0] : 0,
                   sk_rmem_alloc_get(s),
                   sk_wmem_alloc_get(s),
                   READ_ONCE(nlk->cb_running),
                   refcount_read(&s->sk_refcnt),
                   atomic_read(&s->sk_drops),
                   sock_i_ino(s));

        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Context handed to a BPF iterator program for the "netlink" target; it is
 * filled in by netlink_prog_seq_show().
 */
struct bpf_iter__netlink {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct netlink_sock *, sk);
};

/* Declares the iterator entry point for the netlink target with this signature. */
DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)

/*
 * Run the BPF iterator program @prog on the socket @v.
 *
 * meta->seq_num is decremented first so that SEQ_START_TOKEN is not counted.
 * Returns whatever bpf_iter_run_prog() returns.
 */
static int netlink_prog_seq_show(struct bpf_prog *prog,
                                  struct bpf_iter_meta *meta,
                                  void *v)
{
        struct sock *sk = v;
        struct bpf_iter__netlink ctx;

        meta->seq_num--;  /* skip SEQ_START_TOKEN */

        ctx.sk = nlk_sk(sk);
        ctx.meta = meta;

        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show when BPF iterators are built in: with no program attached
 * fall back to the native text output; otherwise run the program on every
 * real entry and emit nothing for SEQ_START_TOKEN.
 */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);

        if (!prog)
                return netlink_native_seq_show(seq, v);

        if (v == SEQ_START_TOKEN)
                return 0;

        return netlink_prog_seq_show(prog, &meta, v);
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        if (!v) {
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)netlink_prog_seq_show(prog, &meta, v);
        }

        netlink_native_seq_stop(seq, v);
}
#else
/* Without CONFIG_BPF_SYSCALL, ->show is just the native text output. */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        return netlink_native_seq_show(seq, v);
}

/* Without CONFIG_BPF_SYSCALL, ->stop only has to end the native walk. */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        netlink_native_seq_stop(seq, v);
}
#endif

/*
 * seq_file callbacks for the netlink socket listing; registered for the
 * "netlink" proc entry in netlink_net_init().
 */
static const struct seq_operations netlink_seq_ops = {
        .start  = netlink_seq_start,
        .next   = netlink_seq_next,
        .stop   = netlink_seq_stop,
        .show   = netlink_seq_show,
};
#endif

/*
 * Add @nb to the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_register().
 */
int netlink_register_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

/*
 * Remove @nb from the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_unregister().
 */
int netlink_unregister_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

/*
 * Socket operations for PF_NETLINK sockets. Operations netlink does not
 * implement (socketpair, accept, listen, shutdown, mmap, sendpage) are
 * wired to the generic sock_no_*() handlers.
 */
static const struct proto_ops netlink_ops = {
        .family =        PF_NETLINK,
        .owner =        THIS_MODULE,
        .release =        netlink_release,
        .bind =                netlink_bind,
        .connect =        netlink_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        netlink_getname,
        .poll =                datagram_poll,
        .ioctl =        netlink_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        netlink_setsockopt,
        .getsockopt =        netlink_getsockopt,
        .sendmsg =        netlink_sendmsg,
        .recvmsg =        netlink_recvmsg,
        .mmap =                sock_no_mmap,
        .sendpage =        sock_no_sendpage,
};

/*
 * Address-family descriptor for PF_NETLINK; passed to sock_register() in
 * netlink_proto_init() so that socket creation reaches netlink_create().
 */
static const struct net_proto_family netlink_family_ops = {
        .family = PF_NETLINK,
        .create = netlink_create,
        .owner        = THIS_MODULE,        /* for consistency 8) */
};

/*
 * Per-namespace init: with CONFIG_PROC_FS, create the "netlink" entry under
 * the namespace's proc_net directory, backed by netlink_seq_ops.
 *
 * Returns 0, or -ENOMEM if the proc entry cannot be created.
 */
static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
        if (proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
                            sizeof(struct nl_seq_iter)) == NULL)
                return -ENOMEM;
#endif
        return 0;
}

/*
 * Per-namespace exit: with CONFIG_PROC_FS, remove the "netlink" proc entry
 * created by netlink_net_init().
 */
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("netlink", net->proc_net);
#endif
}

/*
 * Register the NETLINK_USERSOCK table entry at boot: 32 multicast groups,
 * a zeroed listeners bitmap, owned by this module and flagged
 * NL_CFG_F_NONROOT_SEND. Panics if the bitmap cannot be allocated.
 */
static void __init netlink_add_usersock_entry(void)
{
        struct netlink_table *usersock;
        struct listeners *listeners;
        int groups = 32;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

        netlink_table_grab();

        usersock = &nl_table[NETLINK_USERSOCK];
        usersock->groups = groups;
        rcu_assign_pointer(usersock->listeners, listeners);
        usersock->module = THIS_MODULE;
        usersock->registered = 1;
        usersock->flags = NL_CFG_F_NONROOT_SEND;

        netlink_table_ungrab();
}

/* Per-network-namespace hooks, registered in netlink_proto_init(). */
static struct pernet_operations __net_initdata netlink_net_ops = {
        .init = netlink_net_init,
        .exit = netlink_net_exit,
};

/*
 * rhashtable object hash: build the compare key from the socket's network
 * namespace and portid, then hash it as an array of u32 words with @seed.
 * @len is not used.
 */
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
        const struct netlink_sock *sock_entry = data;
        struct netlink_compare_arg key;

        netlink_compare_arg_init(&key, sock_net(&sock_entry->sk),
                                 sock_entry->portid);

        return jhash2((u32 *)&key, netlink_compare_arg_len / sizeof(u32), seed);
}

/*
 * Parameters shared by every per-protocol socket hash in nl_table (see
 * rhashtable_init() in netlink_proto_init()): objects are netlink_sock
 * entries linked through ->node, hashed with netlink_hash() and compared
 * with netlink_compare(); the table may shrink automatically.
 */
static const struct rhashtable_params netlink_rhashtable_params = {
        .head_offset = offsetof(struct netlink_sock, node),
        .key_len = netlink_compare_arg_len,
        .obj_hashfn = netlink_hash,
        .obj_cmpfn = netlink_compare,
        .automatic_shrinking = true,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* BTF id of struct netlink_sock; read by bpf_iter_register(). */
BTF_ID_LIST(btf_netlink_sock_id)
BTF_ID(struct, netlink_sock)

/*
 * seq_file description used by the BPF "netlink" iterator target: the same
 * seq operations and private-state size as the proc file, with the
 * bpf_iter net-namespace init/fini helpers for the private data.
 */
static const struct bpf_iter_seq_info netlink_seq_info = {
        .seq_ops                = &netlink_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct nl_seq_iter),
};

/*
 * Registration record for the BPF iterator target "netlink". It describes
 * one context argument: the sk member of struct bpf_iter__netlink, typed
 * PTR_TO_BTF_ID_OR_NULL. Its btf_id is filled in by bpf_iter_register(),
 * which is why this object is not const.
 */
static struct bpf_iter_reg netlink_reg_info = {
        .target                        = "netlink",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__netlink, sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &netlink_seq_info,
};

/*
 * Fill in the BTF id of struct netlink_sock for the iterator's context
 * argument and register the "netlink" BPF iterator target.
 * Returns the result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        netlink_reg_info.ctx_arg_info[0].btf_id = btf_netlink_sock_id[0];

        return bpf_iter_reg_target(&netlink_reg_info);
}
#endif

/*
 * Boot-time initialisation of the netlink protocol family.
 *
 * Registers the protocol (and, when built in, the BPF iterator target),
 * allocates nl_table with one socket hash per protocol, sets up the
 * NETLINK_USERSOCK entry, registers the socket family and the per-namespace
 * operations, and finally initialises rtnetlink.
 *
 * Returns 0 on success or the error from proto_register() or
 * bpf_iter_register(). Failure to allocate nl_table or to initialise any
 * of its hashes is fatal and ends in panic().
 */
static int __init netlink_proto_init(void)
{
        int i;
        int err = proto_register(&netlink_proto, 0);

        if (err != 0)
                goto out;

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        err = bpf_iter_register();
        if (err)
                goto out;
#endif

        BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));

        nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
        if (!nl_table)
                goto panic;

        for (i = 0; i < MAX_LINKS; i++) {
                if (rhashtable_init(&nl_table[i].hash,
                                    &netlink_rhashtable_params) < 0) {
                        /* Undo every hash set up so far, slot 0 included. */
                        while (--i >= 0)
                                rhashtable_destroy(&nl_table[i].hash);
                        kfree(nl_table);
                        goto panic;
                }
        }

        netlink_add_usersock_entry();

        sock_register(&netlink_family_ops);
        register_pernet_subsys(&netlink_net_ops);
        register_pernet_subsys(&netlink_tap_net_ops);
        /* The netlink device handler may be needed early. */
        rtnetlink_init();
out:
        return err;
panic:
        panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);



































































































































































































































































































































































































































    2 





























































    2 
































    1 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt.h: declarations for per-file encryption
 *
 * Filesystems that implement per-file encryption must include this header
 * file.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Written by Michael Halcrow, 2015.
 * Modified by Jaegeuk Kim, 2015.
 */
#ifndef _LINUX_FSCRYPT_H
#define _LINUX_FSCRYPT_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <uapi/linux/fscrypt.h>

#define FS_CRYPTO_BLOCK_SIZE                16

union fscrypt_policy;
struct fscrypt_info;
struct seq_file;

/* A byte buffer with an explicit length (see FSTR_INIT / FSTR_TO_QSTR). */
struct fscrypt_str {
        unsigned char *name;
        u32 len;
};

/*
 * A filename as handled by fscrypt. fname_name() and fname_len() access
 * disk_name.
 * NOTE(review): the field notes below are inferred from the names only;
 * confirm against the fscrypt filename code.
 */
struct fscrypt_name {
        /* presumably the name as supplied by the user */
        const struct qstr *usr_fname;
        /* the name accessed through fname_name()/fname_len() */
        struct fscrypt_str disk_name;
        u32 hash;
        u32 minor_hash;
        /* presumably scratch space backing disk_name; TODO confirm */
        struct fscrypt_str crypto_buf;
        /* presumably set when the name is a no-key name; TODO confirm */
        bool is_nokey_name;
};

#define FSTR_INIT(n, l)                { .name = n, .len = l }
#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
#define fname_name(p)                ((p)->disk_name.name)
#define fname_len(p)                ((p)->disk_name.len)

/* Maximum value for the third parameter of fscrypt_operations.set_context(). */
#define FSCRYPT_SET_CONTEXT_MAX_SIZE        40

#ifdef CONFIG_FS_ENCRYPTION
/*
 * fscrypt superblock flags
 */
#define FS_CFLG_OWN_PAGES (1U << 1)

/*
 * crypto operations for filesystems
 */
/*
 * Operations and parameters a filesystem provides to fscrypt.
 * NOTE(review): the callbacks' contracts are not visible in this header
 * excerpt; the notes below only restate what the declarations show.
 */
struct fscrypt_operations {
        /* presumably FS_CFLG_* bits such as FS_CFLG_OWN_PAGES; TODO confirm */
        unsigned int flags;
        const char *key_prefix;
        int (*get_context)(struct inode *inode, void *ctx, size_t len);
        /* @len is bounded by FSCRYPT_SET_CONTEXT_MAX_SIZE (see its comment) */
        int (*set_context)(struct inode *inode, const void *ctx, size_t len,
                           void *fs_data);
        const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb);
        bool (*empty_dir)(struct inode *inode);
        unsigned int max_namelen;
        bool (*has_stable_inodes)(struct super_block *sb);
        /* results are returned through the two int pointers */
        void (*get_ino_and_lblk_bits)(struct super_block *sb,
                                      int *ino_bits_ret, int *lblk_bits_ret);
        int (*get_num_devices)(struct super_block *sb);
        /* fills the caller-provided @devs array */
        void (*get_devices)(struct super_block *sb,
                            struct request_queue **devs);
};

/*
 * Fetch the inode's fscrypt info pointer (NULL if none has been published).
 */
static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
{
        struct fscrypt_info *ci;

        /*
         * ->i_crypt_info may be published concurrently by another task via
         * the cmpxchg_release() in fscrypt_get_encryption_info(), which is a
         * RELEASE operation.  The matching ACQUIRE load here is what makes it
         * safe to read the memory that task published.
         */
        ci = smp_load_acquire(&inode->i_crypt_info);

        return ci;
}

/**
 * fscrypt_needs_contents_encryption() - check whether an inode needs
 *                                         contents encryption
 * @inode: the inode to check
 *
 * Only encrypted regular files need contents encryption, and only when the
 * kernel was built with fscrypt support (this is the variant used in that
 * case).
 *
 * Callers that need to know whether the encrypt bit is set regardless of
 * whether fscrypt support was built in must use IS_ENCRYPTED() directly.
 *
 * Return: %true iff the inode is an encrypted regular file.
 */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        if (!IS_ENCRYPTED(inode))
                return false;

        return S_ISREG(inode->i_mode);
}

/*
 * Clear DCACHE_NOKEY_NAME on @dentry.
 *
 * This is needed when d_splice_alias() moves a directory's no-key alias to its
 * plaintext alias after the encryption key has been added.  Arbitrary moves of
 * the flag need not be supported, since fscrypt never allows a no-key name to
 * be the source or target of a rename().
 */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
        dentry->d_flags = dentry->d_flags & ~DCACHE_NOKEY_NAME;
}

/**
 * fscrypt_is_nokey_name() - test whether a dentry is a no-key name
 * @dentry: the dentry to check
 *
 * A no-key dentry is one that was created in an encrypted directory whose
 * encryption key had not been added yet.  It may be positive or negative.
 *
 * Filesystems must fail with ENOKEY any attempt to create a new filename in an
 * encrypted directory through a no-key dentry.  That covers ->create(),
 * ->mkdir(), ->mknod(), ->symlink(), ->rename(), and ->link(); the last two
 * are already taken care of by fscrypt_prepare_rename() and
 * fscrypt_prepare_link().
 *
 * The check is required because creating a filename needs the directory's key,
 * and finding the key present on the directory inode at the time of the final
 * filesystem operation says nothing about whether it was present during the
 * earlier dentry lookup.  The key has to have been available at lookup time,
 * both so that the existence of the filename in the directory was actually
 * checked and so that the new file's dentry is not later invalidated for
 * wrongly carrying the no-key flag.
 *
 * Return: %true if the dentry is a no-key name
 */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_NOKEY_NAME) != 0;
}

/* crypto.c */
/*
 * Contents encryption/decryption entry points.  The *_pagecache_blocks()
 * variants take only a page, length and offset; the *_block_inplace()
 * variants additionally take the inode and a logical block number.
 */
void fscrypt_enqueue_decrypt_work(struct work_struct *);

struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                              unsigned int len,
                                              unsigned int offs,
                                              gfp_t gfp_flags);
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num, gfp_t gfp_flags);

int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
                                     unsigned int offs);
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num);

/* A page is treated as a bounce page iff it has no ->mapping. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return !page->mapping;
}

/*
 * Recover the pagecache page from a bounce page: the bounce page's private
 * field holds the pagecache page pointer.
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        unsigned long priv = page_private(bounce_page);

        return (struct page *)priv;
}

/*
 * Called by fscrypt_finalize_bounce_page() once the pagecache page has been
 * recovered from the bounce page.
 */
void fscrypt_free_bounce_page(struct page *bounce_page);

/* policy.c */
/* ioctl handlers; @arg is the userspace argument pointer. */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg);
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
int fscrypt_set_context(struct inode *inode, void *fs_data);

/*
 * Holder for a test dummy encryption policy.  ->policy is kfree()'d and reset
 * to NULL by fscrypt_free_dummy_policy().
 */
struct fscrypt_dummy_policy {
        const union fscrypt_policy *policy;
};

int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
                                struct fscrypt_dummy_policy *dummy_policy);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                        struct super_block *sb);
/*
 * Free the policy held by @dummy_policy and reset the pointer to NULL, so that
 * calling this again on the same holder is harmless.
 */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
        const union fscrypt_policy *old_policy = dummy_policy->policy;

        kfree(old_policy);
        dummy_policy->policy = NULL;
}

/* keyring.c */
/* Keyring teardown for a superblock, and the key-management ioctl handlers. */
void fscrypt_destroy_keyring(struct super_block *sb);
int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg);

/* keysetup.c */
/*
 * fscrypt_get_encryption_info() is what fscrypt_require_key() calls to set up
 * an encrypted inode's key; it publishes ->i_crypt_info (see
 * fscrypt_get_info()).
 */
int fscrypt_get_encryption_info(struct inode *inode);
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                              bool *encrypt_ret);
void fscrypt_put_encryption_info(struct inode *inode);
void fscrypt_free_inode(struct inode *inode);
int fscrypt_drop_inode(struct inode *inode);

/* fname.c */
/* Fills in @fname; release it with fscrypt_free_filename() when done. */
int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
                           int lookup, struct fscrypt_name *fname);

/* Release the crypto buffer attached to @fname (kfree() accepts NULL). */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        unsigned char *buf = fname->crypto_buf.name;

        kfree(buf);
}

/* Filename buffer management, conversion and matching (fname.c, continued). */
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str);
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str);
int fscrypt_fname_disk_to_usr(const struct inode *inode,
                              u32 hash, u32 minor_hash,
                              const struct fscrypt_str *iname,
                              struct fscrypt_str *oname);
bool fscrypt_match_name(const struct fscrypt_name *fname,
                        const u8 *de_name, u32 de_name_len);
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);

/* bio.c */
void fscrypt_decrypt_bio(struct bio *bio);
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                          sector_t pblk, unsigned int len);

/* hooks.c */
/*
 * The double-underscore functions are the out-of-line halves of the inline
 * fscrypt_prepare_*() / fscrypt_encrypt_symlink() wrappers defined near the
 * end of this header, which call them only when an involved inode is
 * encrypted.
 */
int fscrypt_file_open(struct inode *inode, struct file *filp);
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry);
int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags);
int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname);
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags);
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link);
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done);
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat);
/* Register @s_cop as the fscrypt operations table of superblock @sb. */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
        sb->s_cop = s_cop;
}
#else  /* !CONFIG_FS_ENCRYPTION */

/*
 * Stubs used when the kernel is built without CONFIG_FS_ENCRYPTION: no inode
 * ever has fscrypt info, nothing needs contents encryption, and no dentry is a
 * no-key name.
 */
static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
{
        return NULL;
}

static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return false;
}

/* Nothing to clear: DCACHE_NOKEY_NAME handling is compiled out. */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
}

static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return false;
}

/* crypto.c */
/*
 * Without fscrypt support the encrypt/decrypt entry points fail with
 * -EOPNOTSUPP (as an ERR_PTR where a pointer is returned).
 */
static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
}

static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                                            unsigned int len,
                                                            unsigned int offs,
                                                            gfp_t gfp_flags)
{
        return ERR_PTR(-EOPNOTSUPP);
}

static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num,
                                                gfp_t gfp_flags)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_decrypt_pagecache_blocks(struct page *page,
                                                   unsigned int len,
                                                   unsigned int offs)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_decrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num)
{
        return -EOPNOTSUPP;
}

/* No page is ever a bounce page in this configuration. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return false;
}

/*
 * Warns (once) and returns an error pointer.  fscrypt_finalize_bounce_page()
 * only calls this after fscrypt_is_bounce_page() returned true, which cannot
 * happen here; presumably no other caller should reach it either.
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        WARN_ON_ONCE(1);
        return ERR_PTR(-EINVAL);
}

static inline void fscrypt_free_bounce_page(struct page *bounce_page)
{
}

/* policy.c */
/* Without fscrypt support every policy ioctl fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_set_policy(struct file *filp,
                                           const void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_get_policy_ex(struct file *filp,
                                              void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* Note: unlike its neighbours, this stub returns 0 rather than an error. */
static inline int fscrypt_has_permitted_context(struct inode *parent,
                                                struct inode *child)
{
        return 0;
}

static inline int fscrypt_set_context(struct inode *inode, void *fs_data)
{
        return -EOPNOTSUPP;
}

/* Empty placeholder so that filesystems can still embed the type. */
struct fscrypt_dummy_policy {
};

static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq,
                                                      char sep,
                                                      struct super_block *sb)
{
}

static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
}

/* keyring.c */
/* No keyring exists without fscrypt support; key ioctls fail with -EOPNOTSUPP. */
static inline void fscrypt_destroy_keyring(struct super_block *sb)
{
}

static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp,
                                                     void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int fscrypt_ioctl_get_key_status(struct file *filp,
                                               void __user *arg)
{
        return -EOPNOTSUPP;
}

/* keysetup.c */
/*
 * Always fails.  As a consequence fscrypt_require_key() returns -EOPNOTSUPP
 * for any inode with the encrypt bit set in this configuration.
 */
static inline int fscrypt_get_encryption_info(struct inode *inode)
{
        return -EOPNOTSUPP;
}

/*
 * Without fscrypt support, creating an inode inside an encrypted directory is
 * refused with -EOPNOTSUPP; anywhere else it succeeds.  @inode and
 * @encrypt_ret are not touched.
 */
static inline int fscrypt_prepare_new_inode(struct inode *dir,
                                            struct inode *inode,
                                            bool *encrypt_ret)
{
        return IS_ENCRYPTED(dir) ? -EOPNOTSUPP : 0;
}

/* No per-inode fscrypt state exists in this configuration, so these are no-ops. */
static inline void fscrypt_put_encryption_info(struct inode *inode)
{
        return;
}

static inline void fscrypt_free_inode(struct inode *inode)
{
}

/* Always returns 0. */
static inline int fscrypt_drop_inode(struct inode *inode)
{
        return 0;
}

/* fname.c */
static inline int fscrypt_setup_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         int lookup, struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        memset(fname, 0, sizeof(*fname));
        fname->usr_fname = iname;
        fname->disk_name.name = (unsigned char *)iname->name;
        fname->disk_name.len = iname->len;
        return 0;
}

/* Nothing to free: the stub fscrypt_setup_filename() never allocates. */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        return;
}

static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                                             struct fscrypt_str *crypto_str)
{
        return -EOPNOTSUPP;
}

static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
        return;
}

static inline int fscrypt_fname_disk_to_usr(const struct inode *inode,
                                            u32 hash, u32 minor_hash,
                                            const struct fscrypt_str *iname,
                                            struct fscrypt_str *oname)
{
        return -EOPNOTSUPP;
}

/*
 * Encryption support disabled: a directory entry matches iff it has the same
 * length as, and the same bytes as, the on-disk name in @fname.
 */
static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
                                      const u8 *de_name, u32 de_name_len)
{
        const struct fscrypt_str *disk = &fname->disk_name;

        if (disk->len == de_name_len)
                return memcmp(de_name, disk->name, disk->len) == 0;

        return false;
}

/*
 * Warns (once) and returns 0.
 * NOTE(review): the warning suggests this is not expected to be called when
 * fscrypt support is compiled out -- confirm against callers.
 */
static inline u64 fscrypt_fname_siphash(const struct inode *dir,
                                        const struct qstr *name)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* Always returns 1. */
static inline int fscrypt_d_revalidate(struct dentry *dentry,
                                       unsigned int flags)
{
        return 1;
}

/* bio.c */
static inline void fscrypt_decrypt_bio(struct bio *bio)
{
}

static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                                        sector_t pblk, unsigned int len)
{
        return -EOPNOTSUPP;
}

/* hooks.c */

/*
 * Without fscrypt support, opening an encrypted inode is refused with
 * -EOPNOTSUPP; any other inode opens normally.  @filp is not used.
 */
static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        return IS_ENCRYPTED(inode) ? -EOPNOTSUPP : 0;
}

/*
 * The double-underscore helpers are only reached from the inline
 * fscrypt_prepare_*() wrappers when an involved directory is encrypted, which
 * cannot be supported here, so they fail with -EOPNOTSUPP.
 */
static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                                         struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

static inline int __fscrypt_prepare_rename(struct inode *old_dir,
                                           struct dentry *old_dentry,
                                           struct inode *new_dir,
                                           struct dentry *new_dentry,
                                           unsigned int flags)
{
        return -EOPNOTSUPP;
}

static inline int __fscrypt_prepare_lookup(struct inode *dir,
                                           struct dentry *dentry,
                                           struct fscrypt_name *fname)
{
        return -EOPNOTSUPP;
}

/* Always succeeds: no flag change is vetoed when fscrypt is compiled out. */
static inline int fscrypt_prepare_setflags(struct inode *inode,
                                           unsigned int oldflags,
                                           unsigned int flags)
{
        return 0;
}

/*
 * Without fscrypt support, symlinks cannot be created in encrypted directories
 * (-EOPNOTSUPP).  Otherwise the on-disk target is @target itself (not copied)
 * with an on-disk length of @len + 1 -- presumably to account for the
 * terminating NUL.  Fails with -ENAMETOOLONG if that length exceeds @max_len;
 * note that @disk_link has already been filled in when this happens.
 */
static inline int fscrypt_prepare_symlink(struct inode *dir,
                                          const char *target,
                                          unsigned int len,
                                          unsigned int max_len,
                                          struct fscrypt_str *disk_link)
{
        unsigned int disk_len = len + 1;

        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        disk_link->name = (unsigned char *)target;
        disk_link->len = disk_len;

        return disk_len > max_len ? -ENAMETOOLONG : 0;
}

/* Only reached from fscrypt_encrypt_symlink() for an encrypted inode. */
static inline int __fscrypt_encrypt_symlink(struct inode *inode,
                                            const char *target,
                                            unsigned int len,
                                            struct fscrypt_str *disk_link)
{
        return -EOPNOTSUPP;
}

static inline const char *fscrypt_get_symlink(struct inode *inode,
                                              const void *caddr,
                                              unsigned int max_size,
                                              struct delayed_call *done)
{
        return ERR_PTR(-EOPNOTSUPP);
}

static inline int fscrypt_symlink_getattr(const struct path *path,
                                          struct kstat *stat)
{
        return -EOPNOTSUPP;
}

/* No-op: the operations table is ignored when fscrypt is compiled out. */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
}

#endif        /* !CONFIG_FS_ENCRYPTION */

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT

/*
 * Out-of-line half of fscrypt_inode_uses_inline_crypto() and
 * fscrypt_inode_uses_fs_layer_crypto(), which are defined further down.
 */
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode);

/* Attach an encryption context to @bio, located by inode + logical block. */
void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                               const struct inode *inode, u64 first_lblk,
                               gfp_t gfp_mask);

/* As above, but located by the bio's first buffer_head. */
void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                                  const struct buffer_head *first_bh,
                                  gfp_t gfp_mask);

/*
 * NOTE(review): presumably reports whether the next block may be added to
 * @bio without breaking its encryption context -- confirm in inline_crypt.c.
 */
bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
                           u64 next_lblk);

bool fscrypt_mergeable_bio_bh(struct bio *bio,
                              const struct buffer_head *next_bh);

#else /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/*
 * Stubs used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is not set: no inode uses
 * inline crypto, setting a bio crypt context is a no-op, and bios are always
 * reported as mergeable.
 */
static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        return false;
}

static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                                             const struct inode *inode,
                                             u64 first_lblk, gfp_t gfp_mask) { }

static inline void fscrypt_set_bio_crypt_ctx_bh(
                                         struct bio *bio,
                                         const struct buffer_head *first_bh,
                                         gfp_t gfp_mask) { }

static inline bool fscrypt_mergeable_bio(struct bio *bio,
                                         const struct inode *inode,
                                         u64 next_lblk)
{
        return true;
}

static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
                                            const struct buffer_head *next_bh)
{
        return true;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/**
 * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline
 *                                        encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and that
 *           encryption is to be performed by the block layer via blk-crypto
 *           instead of by the filesystem layer.
 */
static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        return __fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer
 *                                          encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and that
 *           encryption is to be performed by the filesystem layer instead of
 *           by the block layer via blk-crypto.
 */
static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
{
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        return !__fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_has_encryption_key() - check whether an inode has had its key set up
 * @inode: the inode to check
 *
 * The key counts as set up once the inode has fscrypt info attached.  Callers
 * normally invoke fscrypt_get_encryption_info() beforehand so that setting up
 * the key has at least been attempted.
 *
 * Return: %true if the inode has had its encryption key set up, else %false.
 */
static inline bool fscrypt_has_encryption_key(const struct inode *inode)
{
        const struct fscrypt_info *ci = fscrypt_get_info(inode);

        return ci != NULL;
}

/**
 * fscrypt_require_key() - require an inode's encryption key
 * @inode: the inode we need the key for
 *
 * For an unencrypted inode this succeeds immediately.  For an encrypted inode
 * the key is set up if that has not happened yet, after which the key must be
 * present or -ENOKEY is returned.
 *
 * No locks are needed, and the key will live as long as the struct inode --- so
 * it won't go away from under you.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_require_key(struct inode *inode)
{
        int err;

        if (!IS_ENCRYPTED(inode))
                return 0;

        err = fscrypt_get_encryption_info(inode);
        if (err)
                return err;

        /* Setup can succeed without a key having been found. */
        return fscrypt_has_encryption_key(inode) ? 0 : -ENOKEY;
}

/**
 * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted
 *                            directory
 * @old_dentry: an existing dentry for the inode being linked
 * @dir: the target directory
 * @dentry: negative dentry for the target filename
 *
 * Adding a link to an encrypted directory requires the directory's encryption
 * key, because the new filename could not be encrypted without it.  So for an
 * encrypted @dir the key is set up first (if not already done) and an error is
 * returned when it is unavailable.  It is also verified that the link would
 * not break the rule that every file in an encrypted directory tree uses the
 * same encryption policy.
 *
 * Nothing is checked when @dir is not encrypted.
 *
 * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
 * -EXDEV if the link would result in an inconsistent encryption policy, or
 * another -errno code.
 */
static inline int fscrypt_prepare_link(struct dentry *old_dentry,
                                       struct inode *dir,
                                       struct dentry *dentry)
{
        if (!IS_ENCRYPTED(dir))
                return 0;

        return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry);
}

/**
 * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted
 *                              directories
 * @old_dir: source directory
 * @old_dentry: dentry for source file
 * @new_dir: target directory
 * @new_dentry: dentry for target location (may be negative unless exchanging)
 * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
 *
 * Prepare for ->rename() where the source and/or target directories may be
 * encrypted.  Adding a new link to an encrypted directory needs the
 * directory's key, since the filename could not be encrypted otherwise.
 * Renaming onto an existing name *is* cryptographically possible without the
 * key, but the more conservative approach is taken and all no-key renames are
 * forbidden.
 *
 * It is also verified that the rename would not break the rule that every file
 * in an encrypted directory tree uses the same encryption policy.
 *
 * Nothing is checked when neither directory is encrypted.
 *
 * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the
 * rename would cause inconsistent encryption policies, or another -errno code.
 */
static inline int fscrypt_prepare_rename(struct inode *old_dir,
                                         struct dentry *old_dentry,
                                         struct inode *new_dir,
                                         struct dentry *new_dentry,
                                         unsigned int flags)
{
        if (!IS_ENCRYPTED(old_dir) && !IS_ENCRYPTED(new_dir))
                return 0;

        return __fscrypt_prepare_rename(old_dir, old_dentry,
                                        new_dir, new_dentry, flags);
}

/**
 * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted
 *                              directory
 * @dir: directory being searched
 * @dentry: filename being looked up
 * @fname: (output) the name to use to search the on-disk directory
 *
 * Prepare for ->lookup() in a directory which may be encrypted by determining
 * the name that will actually be used to search the directory on-disk.  If the
 * directory's encryption key is available, then the lookup is assumed to be by
 * plaintext name; otherwise, it is assumed to be by no-key name.
 *
 * This also installs a custom ->d_revalidate() method which will invalidate the
 * dentry if it was created without the key and the key is later added.
 *
 * Return: 0 on success; -ENOENT if the directory's key is unavailable but the
 * filename isn't a valid no-key name, so a negative dentry should be created;
 * or another -errno code.
 */
static inline int fscrypt_prepare_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return __fscrypt_prepare_lookup(dir, dentry, fname);

        memset(fname, 0, sizeof(*fname));
        fname->usr_fname = &dentry->d_name;
        fname->disk_name.name = (unsigned char *)dentry->d_name.name;
        fname->disk_name.len = dentry->d_name.len;
        return 0;
}

/**
 * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's
 *                               attributes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Prepare for ->setattr() on a possibly-encrypted inode.  Most attribute
 * changes on an encrypted file are permitted even without the encryption key;
 * truncates (%ATTR_SIZE) are the exception.  The new size may not be a
 * multiple of the filesystem block size, in which case the final block would
 * have to be decrypted, zeroed past i_size, and re-encrypted.  (Truncating to
 * a filesystem block boundary *could* be allowed, but forbidding all truncates
 * is simpler --- and all other contents modifications without the key are
 * already forbidden.)
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_prepare_setattr(struct dentry *dentry,
                                          struct iattr *attr)
{
        /* Only a size change needs the key. */
        if (!(attr->ia_valid & ATTR_SIZE))
                return 0;

        return fscrypt_require_key(d_inode(dentry));
}

/**
 * fscrypt_encrypt_symlink() - encrypt the symlink target if needed
 * @inode: symlink inode
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @disk_link: (in/out) the on-disk symlink target being prepared
 *
 * If the symlink target needs to be encrypted, then this function encrypts it
 * into @disk_link->name.  fscrypt_prepare_symlink() must have been called
 * previously to compute @disk_link->len.  If the filesystem did not allocate a
 * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then one
 * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
 *
 * If @inode is not encrypted, this is a no-op: 0 is returned and @disk_link is
 * left untouched.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fscrypt_encrypt_symlink(struct inode *inode,
                                          const char *target,
                                          unsigned int len,
                                          struct fscrypt_str *disk_link)
{
        if (IS_ENCRYPTED(inode))
                return __fscrypt_encrypt_symlink(inode, target, len, disk_link);
        return 0;
}

/*
 * If *pagep is a bounce page, replace it with the pagecache page it stands in
 * for and free the bounce page.  Otherwise *pagep is left alone.
 */
static inline void fscrypt_finalize_bounce_page(struct page **pagep)
{
        struct page *bounce = *pagep;

        if (!fscrypt_is_bounce_page(bounce))
                return;

        /* Recover the pagecache page before the bounce page goes away. */
        *pagep = fscrypt_pagecache_page(bounce);
        fscrypt_free_bounce_page(bounce);
}

#endif        /* _LINUX_FSCRYPT_H */









































































    1 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Filesystem parameter description and parser
 *
 * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_FS_PARSER_H
#define _LINUX_FS_PARSER_H

#include <linux/fs_context.h>

struct path;

/*
 * One entry of a name -> integer lookup table, as searched by
 * lookup_constant() and checked by validate_constant_table().
 */
struct constant_table {
        const char        *name;
        int                value;
};

struct fs_parameter_spec;
struct fs_parse_result;
/*
 * Signature shared by all parameter-type handlers.  Note that this typedef
 * names a function type, not a pointer-to-function type: it is used below to
 * declare the handlers themselves, and as "fs_param_type *" in
 * struct fs_parameter_spec.
 */
typedef int fs_param_type(struct p_log *,
                          const struct fs_parameter_spec *,
                          struct fs_parameter *,
                          struct fs_parse_result *);
/*
 * The type of parameter expected.
 *
 * Each name below declares one function with the fs_param_type signature.
 */
fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64,
        fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev,
        fs_param_is_path, fs_param_is_fd;

/*
 * Specification of the type of value a parameter wants.
 *
 * Note that the fsparam_flag(), fsparam_string(), fsparam_u32(), ... macros
 * should be used to generate elements of this type.
 *
 * @type is NULL for plain flag parameters (see fsparam_flag()), and @data is
 * an opaque per-type argument supplied through __fsparam().
 */
struct fs_parameter_spec {
        const char                *name;
        fs_param_type                *type;        /* The desired parameter type */
        u8                        opt;        /* Option number (returned by fs_parse()) */
        unsigned short                flags;
#define fs_param_neg_with_no        0x0002        /* "noxxx" is negative param */
#define fs_param_neg_with_empty        0x0004        /* "xxx=" is negative param */
#define fs_param_deprecated        0x0008        /* The param is deprecated */
        const void                *data;
};

/*
 * Result of parse.
 *
 * NOTE(review): the "spec_*" names in the member comments do not match any
 * identifier visible in this header; they presumably correspond to the
 * fs_param_is_*() handlers of the same suffix -- confirm.
 */
struct fs_parse_result {
        bool                        negated;        /* T if param was "noxxx" */
        union {
                bool                boolean;        /* For spec_bool */
                int                int_32;                /* For spec_s32/spec_enum */
                unsigned int        uint_32;        /* For spec_u32{,_octal,_hex}/spec_enum */
                u64                uint_64;        /* For spec_u64 */
        };
};

/*
 * Parse @value against the parameter table @desc, logging through @log and
 * storing the outcome in @result.  fs_parse() is the usual entry point and
 * supplies the filesystem context's own log.
 */
extern int __fs_parse(struct p_log *log,
                    const struct fs_parameter_spec *desc,
                    struct fs_parameter *value,
                    struct fs_parse_result *result);

/*
 * fs_parse - parse one parameter using the log of a filesystem context
 * @fc:     filesystem context; only its ->log member is used here
 * @desc:   parameter specification(s), forwarded unchanged
 * @param:  parameter to parse, forwarded unchanged
 * @result: where the parse result goes, forwarded unchanged
 *
 * Thin wrapper: returns whatever __fs_parse() returns.
 */
static inline int fs_parse(struct fs_context *fc,
                           const struct fs_parameter_spec *desc,
                           struct fs_parameter *param,
                           struct fs_parse_result *result)
{
        struct p_log *log = &fc->log;

        return __fs_parse(log, desc, param, result);
}

/*
 * Helpers declared here and defined elsewhere.
 * NOTE(review): judging by the names, fs_lookup_param() resolves @param to
 * a path (optionally requiring a block device) and lookup_constant() maps
 * @name through @tbl, yielding @not_found on a miss -- confirm against the
 * definitions.
 */
extern int fs_lookup_param(struct fs_context *fc,
                           struct fs_parameter *param,
                           bool want_bdev,
                           struct path *_path);

extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found);

#ifndef CONFIG_VALIDATE_FS_PARSER
/*
 * Parser validation is compiled out: both checks are no-ops that always
 * report success, so callers need no #ifdef of their own.
 */
static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
                                           int low, int high, int special)
{
        return true;
}

static inline bool fs_validate_description(const char *name,
                                           const struct fs_parameter_spec *desc)
{
        return true;
}
#else
/* Real validators, defined elsewhere when CONFIG_VALIDATE_FS_PARSER is set. */
extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
                                    int low, int high, int special);
extern bool fs_validate_description(const char *name,
                                    const struct fs_parameter_spec *desc);
#endif

/*
 * Parameter type, name, index and flags element constructors.  Use as:
 *
 *  fsparam_xxxx("foo", Opt_foo)
 *
 * Each helper expands to a brace-enclosed designated initializer for one
 * struct fs_parameter_spec element.
 *
 * If existing helpers are not enough, direct use of __fsparam() would
 * work, but any such case is probably a sign that new helper is needed.
 * Helpers will remain stable; low-level implementation may change.
 */

/*
 * __fsparam - build one struct fs_parameter_spec initializer
 * @TYPE:  fs_param_is_*() function for the value, or NULL for a plain flag
 * @NAME:  parameter name string (stored in .name)
 * @OPT:   option number (stored in .opt)
 * @FLAGS: fs_param_* flag bits (stored in .flags)
 * @DATA:  type-specific extra data (stored in .data), NULL when unused
 */
#define __fsparam(TYPE, NAME, OPT, FLAGS, DATA) \
        { \
                .name = NAME, \
                .opt = OPT, \
                .type = TYPE, \
                .flags = FLAGS, \
                .data = DATA \
        }

/* Valueless flags: no type function.  The _no form sets fs_param_neg_with_no. */
#define fsparam_flag(NAME, OPT)        __fsparam(NULL, NAME, OPT, 0, NULL)
#define fsparam_flag_no(NAME, OPT) \
                        __fsparam(NULL, NAME, OPT, fs_param_neg_with_no, NULL)
#define fsparam_bool(NAME, OPT)        __fsparam(fs_param_is_bool, NAME, OPT, 0, NULL)
#define fsparam_u32(NAME, OPT)        __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL)
/*
 * Octal and hex u32 parameters use the same type function as fsparam_u32()
 * and differ only in .data (8 or 16), which is presumably read as the
 * numeric base by fs_param_is_u32() -- confirm against its definition.
 * This header declares no separate fs_param_is_u32_hex function, so the
 * hex helper must not reference one.
 */
#define fsparam_u32oct(NAME, OPT) \
                        __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8)
#define fsparam_u32hex(NAME, OPT) \
                        __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)16)
#define fsparam_s32(NAME, OPT)        __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL)
#define fsparam_u64(NAME, OPT)        __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL)
/* @array is stored in .data for fs_param_is_enum to consult. */
#define fsparam_enum(NAME, OPT, array)        __fsparam(fs_param_is_enum, NAME, OPT, 0, array)
#define fsparam_string(NAME, OPT) \
                                __fsparam(fs_param_is_string, NAME, OPT, 0, NULL)
#define fsparam_blob(NAME, OPT)        __fsparam(fs_param_is_blob, NAME, OPT, 0, NULL)
#define fsparam_bdev(NAME, OPT)        __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL)
#define fsparam_path(NAME, OPT)        __fsparam(fs_param_is_path, NAME, OPT, 0, NULL)
#define fsparam_fd(NAME, OPT)        __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL)

#endif /* _LINUX_FS_PARSER_H */
































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 *  Driver for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *
 *  Copyright (C) 2001 Russell King.
 */

#include <linux/serial_8250.h>
#include <linux/serial_reg.h>
#include <linux/dmaengine.h>

#include "../serial_mctrl_gpio.h"

/*
 * DMA state attached to an 8250 port (reached through the port's ->dma
 * pointer, see serial8250_tx_dma_running()).  Fields come in rx/tx pairs.
 */
struct uart_8250_dma {
        /* Per-direction DMA hooks; NOTE(review): presumably optional overrides -- confirm */
        int (*tx_dma)(struct uart_8250_port *p);
        int (*rx_dma)(struct uart_8250_port *p);

        /* Filter function */
        dma_filter_fn                fn;
        /* Parameter to the filter function */
        void                        *rx_param;
        void                        *tx_param;

        /* dmaengine slave configuration, one per direction */
        struct dma_slave_config        rxconf;
        struct dma_slave_config        txconf;

        /* dmaengine channels, one per direction */
        struct dma_chan                *rxchan;
        struct dma_chan                *txchan;

        /* Device address base for DMA operations */
        phys_addr_t                rx_dma_addr;
        phys_addr_t                tx_dma_addr;

        /* DMA address of the buffer in memory */
        dma_addr_t                rx_addr;
        dma_addr_t                tx_addr;

        /* dmaengine cookies, one per direction */
        dma_cookie_t                rx_cookie;
        dma_cookie_t                tx_cookie;

        /* Receive buffer; NOTE(review): presumably the CPU-side view of rx_addr -- confirm */
        void                        *rx_buf;

        size_t                        rx_size;
        size_t                        tx_size;

        /* Non-zero tx_running is what serial8250_tx_dma_running() reports */
        unsigned char                tx_running;
        unsigned char                tx_err;
        unsigned char                rx_running;
};

/*
 * Description of a serial port as a flat record of integers plus an
 * optional MMIO base.
 * NOTE(review): presumably used for legacy, statically described ports --
 * confirm at the users.
 */
struct old_serial_port {
        unsigned int uart;
        unsigned int baud_base;
        unsigned int port;
        unsigned int irq;
        upf_t        flags;
        unsigned char io_type;
        unsigned char __iomem *iomem_base;
        unsigned short iomem_reg_shift;
};

/*
 * Static description of one UART type: name, FIFO geometry, FCR value,
 * RX trigger levels and flags.
 * NOTE(review): flags presumably holds the UART_CAP_* bits defined below
 * -- confirm.
 */
struct serial8250_config {
        const char        *name;
        unsigned short        fifo_size;
        unsigned short        tx_loadsz;
        unsigned char        fcr;
        /* One entry per RX trigger state (UART_FCR_R_TRIG_MAX_STATE entries) */
        unsigned char        rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE];
        unsigned int        flags;
};

/* UART capability bits; these occupy bits 8 through 17. */
#define UART_CAP_FIFO        (1 << 8)        /* UART has FIFO */
#define UART_CAP_EFR        (1 << 9)        /* UART has EFR */
#define UART_CAP_SLEEP        (1 << 10)        /* UART has IER sleep */
#define UART_CAP_AFE        (1 << 11)        /* MCR-based hw flow control */
#define UART_CAP_UUE        (1 << 12)        /* UART needs IER bit 6 set (Xscale) */
#define UART_CAP_RTOIE        (1 << 13)        /* UART needs IER bit 4 set (Xscale, Tegra) */
#define UART_CAP_HFIFO        (1 << 14)        /* UART has a "hidden" FIFO */
#define UART_CAP_RPM        (1 << 15)        /* Runtime PM is active while idle */
#define UART_CAP_IRDA        (1 << 16)        /* UART supports IrDA line discipline */
#define UART_CAP_MINI        (1 << 17)        /* Mini UART on BCM283X family lacks:
                                         * STOP PARITY EPAR SPAR WLEN5 WLEN6
                                         */

/* UART bug/quirk bits; bit 4 is not assigned in this list. */
#define UART_BUG_QUOT        (1 << 0)        /* UART has buggy quot LSB */
#define UART_BUG_TXEN        (1 << 1)        /* UART has buggy TX IIR status */
#define UART_BUG_NOMSR        (1 << 2)        /* UART has buggy MSR status bits (Au1x00) */
#define UART_BUG_THRE        (1 << 3)        /* UART has buggy THRE reassertion */
#define UART_BUG_TXRACE        (1 << 5)        /* UART Tx fails to set remote DR */


/*
 * Turn the CONFIG_SERIAL_8250_SHARE_IRQ build option into a constant that
 * is always defined (1 when set, 0 otherwise), so it can be used in
 * ordinary C expressions.
 */
#ifdef CONFIG_SERIAL_8250_SHARE_IRQ
#define SERIAL8250_SHARE_IRQS 1
#else
#define SERIAL8250_SHARE_IRQS 0
#endif

/*
 * Designated-initializer helper for a port-I/O (UPIO_PORT) UART with a
 * 1843200 Hz clock.  UPF_BOOT_AUTOCONF is always set; @_flags is OR-ed in.
 */
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags)                \
        {                                                        \
                .iobase                = _base,                        \
                .irq                = _irq,                                \
                .uartclk        = 1843200,                        \
                .iotype                = UPIO_PORT,                        \
                .flags                = UPF_BOOT_AUTOCONF | (_flags),        \
        }

/* Same as SERIAL8250_PORT_FLAGS() with no extra flags. */
#define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0)


/*
 * Read the UART register at @offset by calling the port's ->serial_in()
 * hook, and return the value read.
 */
static inline int serial_in(struct uart_8250_port *up, int offset)
{
        int regval = up->port.serial_in(&up->port, offset);

        return regval;
}

/*
 * Write @value to the UART register at @offset by calling the port's
 * ->serial_out() hook.
 */
static inline void serial_out(struct uart_8250_port *up, int offset, int value)
{
        up->port.serial_out(&up->port, offset, value);
}

/*
 * For the 16C950
 *
 * Indexed register write: the register index @offset is written to
 * UART_SCR first, then @value is written to UART_ICR.  The order of the
 * two writes matters.
 */
static void serial_icr_write(struct uart_8250_port *up, int offset, int value)
{
        serial_out(up, UART_SCR, offset);
        serial_out(up, UART_ICR, value);
}

/*
 * Indexed register read, counterpart of serial_icr_write().
 *
 * Sets UART_ACR_ICRRD in the ACR (on top of the cached up->acr), selects
 * the register through UART_SCR, reads it from UART_ICR, then restores the
 * ACR to the cached up->acr.  Returns the value read.
 */
static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up,
                                                   int offset)
{
        unsigned int value;

        serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD);
        serial_out(up, UART_SCR, offset);
        value = serial_in(up, UART_ICR);
        /* Restore ACR without the ICRRD bit */
        serial_icr_write(up, UART_ACR, up->acr);

        return value;
}

/* Declared here, defined elsewhere. */
void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p);

/* Return the value produced by the port's ->dl_read() hook. */
static inline int serial_dl_read(struct uart_8250_port *up)
{
        int dl = up->dl_read(up);

        return dl;
}

/* Pass @value to the port's ->dl_write() hook. */
static inline void serial_dl_write(struct uart_8250_port *up, int value)
{
        up->dl_write(up, value);
}

/*
 * Set UART_IER_THRI in the cached IER and write the IER register, unless
 * the bit is already set in the cache.
 *
 * Returns true if the register was written, false if nothing changed.
 */
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
        bool was_clear = !(up->ier & UART_IER_THRI);

        if (was_clear) {
                up->ier |= UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }
        return was_clear;
}

/*
 * Clear UART_IER_THRI in the cached IER and write the IER register, unless
 * the bit is already clear in the cache.
 *
 * Returns true if the register was written, false if nothing changed.
 */
static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
{
        bool was_set = (up->ier & UART_IER_THRI) != 0;

        if (was_set) {
                up->ier &= ~UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }
        return was_set;
}

struct uart_8250_port *serial8250_get_port(int line);

void serial8250_rpm_get(struct uart_8250_port *p);
void serial8250_rpm_put(struct uart_8250_port *p);

void serial8250_rpm_get_tx(struct uart_8250_port *p);
void serial8250_rpm_put_tx(struct uart_8250_port *p);

int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485);
void serial8250_em485_start_tx(struct uart_8250_port *p);
void serial8250_em485_stop_tx(struct uart_8250_port *p);
void serial8250_em485_destroy(struct uart_8250_port *p);

/* MCR <-> TIOCM conversion */

/*
 * Translate TIOCM_{RTS,DTR,OUT1,OUT2,LOOP} bits in @tiocm to the matching
 * UART_MCR_* bits.  Any other bit in @tiocm is ignored.
 */
static inline int serial8250_TIOCM_to_MCR(int tiocm)
{
        return ((tiocm & TIOCM_RTS)  ? UART_MCR_RTS  : 0) |
               ((tiocm & TIOCM_DTR)  ? UART_MCR_DTR  : 0) |
               ((tiocm & TIOCM_OUT1) ? UART_MCR_OUT1 : 0) |
               ((tiocm & TIOCM_OUT2) ? UART_MCR_OUT2 : 0) |
               ((tiocm & TIOCM_LOOP) ? UART_MCR_LOOP : 0);
}

/*
 * Translate UART_MCR_{RTS,DTR,OUT1,OUT2,LOOP} bits in @mcr to the matching
 * TIOCM_* bits.  Any other bit in @mcr is ignored.
 */
static inline int serial8250_MCR_to_TIOCM(int mcr)
{
        return ((mcr & UART_MCR_RTS)  ? TIOCM_RTS  : 0) |
               ((mcr & UART_MCR_DTR)  ? TIOCM_DTR  : 0) |
               ((mcr & UART_MCR_OUT1) ? TIOCM_OUT1 : 0) |
               ((mcr & UART_MCR_OUT2) ? TIOCM_OUT2 : 0) |
               ((mcr & UART_MCR_LOOP) ? TIOCM_LOOP : 0);
}

/* MSR <-> TIOCM conversion */

/*
 * Translate the UART_MSR_{DCD,RI,DSR,CTS} bits in @msr to
 * TIOCM_{CAR,RNG,DSR,CTS} respectively.  Any other bit in @msr is ignored.
 */
static inline int serial8250_MSR_to_TIOCM(int msr)
{
        return ((msr & UART_MSR_DCD) ? TIOCM_CAR : 0) |
               ((msr & UART_MSR_RI)  ? TIOCM_RNG : 0) |
               ((msr & UART_MSR_DSR) ? TIOCM_DSR : 0) |
               ((msr & UART_MSR_CTS) ? TIOCM_CTS : 0);
}

/*
 * Write @value to the MCR register.  If the port has GPIO-backed modem
 * control lines (up->gpios non-NULL), also pass the TIOCM equivalent of
 * @value to mctrl_gpio_set().
 */
static inline void serial8250_out_MCR(struct uart_8250_port *up, int value)
{
        serial_out(up, UART_MCR, value);

        if (!up->gpios)
                return;

        mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value));
}

/*
 * Read the MCR register.  If the port has GPIO-backed modem control lines
 * (up->gpios non-NULL), OR in the MCR equivalent of what
 * mctrl_gpio_get_outputs() returns.
 */
static inline int serial8250_in_MCR(struct uart_8250_port *up)
{
        int mcr = serial_in(up, UART_MCR);
        unsigned int gpio_tiocm = 0;

        if (!up->gpios)
                return mcr;

        gpio_tiocm = mctrl_gpio_get_outputs(up->gpios, &gpio_tiocm);
        return mcr | serial8250_TIOCM_to_MCR(gpio_tiocm);
}

#if defined(__alpha__) && !defined(CONFIG_PCI)
/*
 * Digital did something really horribly wrong with the OUT1 and OUT2
 * lines on at least some ALPHA's.  The failure mode is that if either
 * is cleared, the machine locks up with endless interrupts.
 *
 * So on Alpha builds without PCI this mask names both OUT bits.
 */
#define ALPHA_KLUDGE_MCR  (UART_MCR_OUT2 | UART_MCR_OUT1)
#else
/* Everywhere else the mask is empty. */
#define ALPHA_KLUDGE_MCR 0
#endif

#ifndef CONFIG_SERIAL_8250_PNP
/* PNP support compiled out: init reports success (0), exit does nothing. */
static inline int serial8250_pnp_init(void)
{
        return 0;
}

static inline void serial8250_pnp_exit(void)
{
}
#else
/* Defined elsewhere when CONFIG_SERIAL_8250_PNP is set. */
int serial8250_pnp_init(void);
void serial8250_pnp_exit(void);
#endif

#ifndef CONFIG_SERIAL_8250_FINTEK
/* Fintek support compiled out: the probe always returns 0. */
static inline int fintek_8250_probe(struct uart_8250_port *uart)
{
        return 0;
}
#else
/* Defined elsewhere when CONFIG_SERIAL_8250_FINTEK is set. */
int fintek_8250_probe(struct uart_8250_port *uart);
#endif

#ifndef CONFIG_ARCH_OMAP1
/* Not an OMAP1 build: no port is ever an OMAP1 UART. */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        return 0;
}

static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        return 0;
}
#else
/*
 * Returns 1 if the port's mapbase equals one of the three OMAP1 UART base
 * addresses, 0 otherwise.
 */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        switch (pt->port.mapbase) {
        case OMAP1_UART1_BASE:
        case OMAP1_UART2_BASE:
        case OMAP1_UART3_BASE:
                return 1;
        default:
                return 0;
        }
}

/* Like is_omap1_8250(), but only ever non-zero when cpu_is_omap1510(). */
static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        if (cpu_is_omap1510())
                return is_omap1_8250(pt);

        return 0;
}
#endif

#ifndef CONFIG_SERIAL_8250_DMA
/*
 * DMA support compiled out: the int-returning entry points return -1, the
 * flush/release hooks do nothing, and TX DMA is never reported running.
 */
static inline int serial8250_tx_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline void serial8250_tx_dma_flush(struct uart_8250_port *p)
{
}

static inline int serial8250_rx_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline void serial8250_rx_dma_flush(struct uart_8250_port *p)
{
}

static inline int serial8250_request_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline void serial8250_release_dma(struct uart_8250_port *p)
{
}

static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        return false;
}
#else
/* Defined elsewhere when CONFIG_SERIAL_8250_DMA is set. */
extern int serial8250_tx_dma(struct uart_8250_port *);
extern void serial8250_tx_dma_flush(struct uart_8250_port *);
extern int serial8250_rx_dma(struct uart_8250_port *);
extern void serial8250_rx_dma_flush(struct uart_8250_port *);
extern int serial8250_request_dma(struct uart_8250_port *);
extern void serial8250_release_dma(struct uart_8250_port *);

/* True only if the port has DMA state and its tx_running is non-zero. */
static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        const struct uart_8250_dma *state = p->dma;

        if (!state)
                return false;

        return state->tx_running;
}
#endif

/*
 * Switch the UART's prescaler to its high-speed setting via the register
 * at offset 0x04 (EXCR2).
 *
 * Returns 0 if the PRESL field already reads 0x10 (nothing written), or 1
 * after rewriting the register with the lock bit cleared and PRESL = 0x10.
 */
static inline int ns16550a_goto_highspeed(struct uart_8250_port *up)
{
        unsigned char excr2 = serial_in(up, 0x04); /* EXCR2 */

#define PRESL(x) ((x) & 0x30)
        if (PRESL(excr2) == 0x10)
                return 0;        /* already in high speed mode */

        excr2 &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */
        excr2 |= 0x10;  /* 1.625 divisor for baud_base --> 921600 */
        serial_out(up, 0x04, excr2);

        return 1;
}

/*
 * Return the port's minor number minus 64.
 * NOTE(review): 64 is presumably the first minor assigned to these ports
 * -- confirm against the driver registration.
 */
static inline int serial_index(struct uart_port *port)
{
        int index = port->minor - 64;

        return index;
}



































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct ptr_ring' datastructure.
 *
 *        Author:
 *                Michael S. Tsirkin <mst@redhat.com>
 *
 *        Copyright (C) 2016 Red Hat, Inc.
 *
 *        This is a limited-size FIFO maintaining pointers in FIFO order, with
 *        one CPU producing entries and another consuming entries from a FIFO.
 *
 *        This implementation tries to minimize cache-contention when there is a
 *        single producer and a single consumer CPU.
 */

#ifndef _LINUX_PTR_RING_H
#define _LINUX_PTR_RING_H 1

#ifdef __KERNEL__
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/errno.h>
#endif

/*
 * Fixed-size FIFO of pointers.  An empty slot holds NULL, which is how the
 * helpers below detect full (slot at producer is non-NULL) and empty (slot
 * at consumer_head is NULL).  Producer-side, consumer-side and shared
 * fields each start on their own cache line on SMP.
 */
struct ptr_ring {
        /* Index of the next slot the producer will write */
        int producer ____cacheline_aligned_in_smp;
        spinlock_t producer_lock;
        int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
        int consumer_tail; /* next entry to invalidate */
        spinlock_t consumer_lock;
        /* Shared consumer/producer data */
        /* Read-only by both the producer and the consumer */
        int size ____cacheline_aligned_in_smp; /* max entries in queue */
        int batch; /* number of entries to consume in a batch */
        /* The slot array itself */
        void **queue;
};

/*
 * Unlocked fullness test: the ring is full when the slot at the producer
 * index is still occupied (non-NULL).
 *
 * Callers polling this in a loop need a compiler barrier between calls,
 * e.g. cpu_relax().
 *
 * NB: unlike __ptr_ring_empty, callers must hold producer_lock; see e.g.
 * ptr_ring_full.
 */
static inline bool __ptr_ring_full(struct ptr_ring *r)
{
        return r->queue[r->producer] != NULL;
}

/* __ptr_ring_full() under producer_lock, taken with plain spin_lock(). */
static inline bool ptr_ring_full(struct ptr_ring *r)
{
        bool is_full;

        spin_lock(&r->producer_lock);
        is_full = __ptr_ring_full(r);
        spin_unlock(&r->producer_lock);
        return is_full;
}

/* __ptr_ring_full() under producer_lock, taken with spin_lock_irq(). */
static inline bool ptr_ring_full_irq(struct ptr_ring *r)
{
        bool is_full;

        spin_lock_irq(&r->producer_lock);
        is_full = __ptr_ring_full(r);
        spin_unlock_irq(&r->producer_lock);
        return is_full;
}

/*
 * __ptr_ring_full() under producer_lock, taken with spin_lock_irqsave()
 * so the previous interrupt state is restored on unlock.
 */
static inline bool ptr_ring_full_any(struct ptr_ring *r)
{
        unsigned long irq_state;
        bool is_full;

        spin_lock_irqsave(&r->producer_lock, irq_state);
        is_full = __ptr_ring_full(r);
        spin_unlock_irqrestore(&r->producer_lock, irq_state);
        return is_full;
}

/* __ptr_ring_full() under producer_lock, taken with spin_lock_bh(). */
static inline bool ptr_ring_full_bh(struct ptr_ring *r)
{
        bool is_full;

        spin_lock_bh(&r->producer_lock);
        is_full = __ptr_ring_full(r);
        spin_unlock_bh(&r->producer_lock);
        return is_full;
}

/* Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax(). Callers must hold producer_lock.
 * Callers are responsible for making sure pointer that is being queued
 * points to a valid data.
 *
 * Stores @ptr in the slot at the producer index and advances the index,
 * wrapping to 0 at r->size.
 *
 * Returns 0 on success, or -ENOSPC if the ring has zero size or the slot
 * at the producer index is still occupied.
 */
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        /* A non-NULL slot has not been released by the consumer yet */
        if (unlikely(!r->size) || r->queue[r->producer])
                return -ENOSPC;

        /* Make sure the pointer we are storing points to a valid data. */
        /* Pairs with the dependency ordering in __ptr_ring_consume. */
        smp_wmb();

        WRITE_ONCE(r->queue[r->producer++], ptr);
        /* Wrap the producer index at the end of the array */
        if (unlikely(r->producer >= r->size))
                r->producer = 0;
        return 0;
}

/*
 * __ptr_ring_produce() under producer_lock, taken with plain spin_lock().
 * Returns what __ptr_ring_produce() returns.
 *
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * consume in interrupt or BH context, you must disable interrupts/BH when
 * calling this.
 */
static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock(&r->producer_lock);
        return err;
}

/* __ptr_ring_produce() under producer_lock, taken with spin_lock_irq(). */
static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_irq(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irq(&r->producer_lock);
        return err;
}

/*
 * __ptr_ring_produce() under producer_lock, taken with
 * spin_lock_irqsave() so the previous interrupt state is restored on
 * unlock.
 */
static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr)
{
        unsigned long irq_state;
        int err;

        spin_lock_irqsave(&r->producer_lock, irq_state);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irqrestore(&r->producer_lock, irq_state);
        return err;
}

/* __ptr_ring_produce() under producer_lock, taken with spin_lock_bh(). */
static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_bh(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_bh(&r->producer_lock);
        return err;
}

/*
 * Return the entry at consumer_head without removing it, or NULL if the
 * ring has zero size.  An empty slot also reads as NULL.
 */
static inline void *__ptr_ring_peek(struct ptr_ring *r)
{
        if (unlikely(!r->size))
                return NULL;

        return READ_ONCE(r->queue[r->consumer_head]);
}

/*
 * Test ring empty status without taking any locks.
 *
 * NB: This is only safe to call if ring is never resized.
 *
 * However, if some other CPU consumes ring entries at the same time, the value
 * returned is not guaranteed to be correct.
 *
 * In this case - to avoid incorrectly detecting the ring
 * as empty - the CPU consuming the ring entries is responsible
 * for either consuming all ring entries until the ring is empty,
 * or synchronizing with some other CPU and causing it to
 * re-test __ptr_ring_empty and/or consume the ring entries
 * after the synchronization point.
 *
 * Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax().
 */
static inline bool __ptr_ring_empty(struct ptr_ring *r)
{
        /* A zero-sized ring is always empty */
        if (unlikely(!r->size))
                return true;

        /* Empty when the slot at consumer_head holds NULL */
        return !r->queue[READ_ONCE(r->consumer_head)];
}

/* __ptr_ring_empty() under consumer_lock, taken with plain spin_lock(). */
static inline bool ptr_ring_empty(struct ptr_ring *r)
{
        bool is_empty;

        spin_lock(&r->consumer_lock);
        is_empty = __ptr_ring_empty(r);
        spin_unlock(&r->consumer_lock);
        return is_empty;
}

/* __ptr_ring_empty() under consumer_lock, taken with spin_lock_irq(). */
static inline bool ptr_ring_empty_irq(struct ptr_ring *r)
{
        bool is_empty;

        spin_lock_irq(&r->consumer_lock);
        is_empty = __ptr_ring_empty(r);
        spin_unlock_irq(&r->consumer_lock);
        return is_empty;
}

/*
 * __ptr_ring_empty() under consumer_lock, taken with spin_lock_irqsave()
 * so the previous interrupt state is restored on unlock.
 */
static inline bool ptr_ring_empty_any(struct ptr_ring *r)
{
        unsigned long irq_state;
        bool is_empty;

        spin_lock_irqsave(&r->consumer_lock, irq_state);
        is_empty = __ptr_ring_empty(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_state);
        return is_empty;
}

/* __ptr_ring_empty() under consumer_lock, taken with spin_lock_bh(). */
static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
{
        bool is_empty;

        spin_lock_bh(&r->consumer_lock);
        is_empty = __ptr_ring_empty(r);
        spin_unlock_bh(&r->consumer_lock);
        return is_empty;
}

/* Must only be called after __ptr_ring_peek returned !NULL */
/*
 * Advances consumer_head past the entry just peeked.  Slots are not
 * cleared one by one: once r->batch entries have accumulated since
 * consumer_tail, or the end of the ring is reached, every slot from the
 * current one back down to consumer_tail is set to NULL in one pass.
 */
static inline void __ptr_ring_discard_one(struct ptr_ring *r)
{
        /* Fundamentally, what we want to do is update consumer
         * index and zero out the entry so producer can reuse it.
         * Doing it naively at each consume would be as simple as:
         *       consumer = r->consumer;
         *       r->queue[consumer++] = NULL;
         *       if (unlikely(consumer >= r->size))
         *               consumer = 0;
         *       r->consumer = consumer;
         * but that is suboptimal when the ring is full as producer is writing
         * out new entries in the same cache line.  Defer these updates until a
         * batch of entries has been consumed.
         */
        /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty
         * to work correctly.
         */
        int consumer_head = r->consumer_head;
        /* head = slot being discarded; consumer_head = the slot after it */
        int head = consumer_head++;

        /* Once we have processed enough entries invalidate them in
         * the ring all at once so producer can reuse their space in the ring.
         * We also do this when we reach end of the ring - not mandatory
         * but helps keep the implementation simple.
         */
        if (unlikely(consumer_head - r->consumer_tail >= r->batch ||
                     consumer_head >= r->size)) {
                /* Zero out entries in the reverse order: this way we touch the
                 * cache line that producer might currently be reading the last;
                 * producer won't make progress and touch other cache lines
                 * besides the first one until we write out all entries.
                 */
                while (likely(head >= r->consumer_tail))
                        r->queue[head--] = NULL;
                r->consumer_tail = consumer_head;
        }
        /* Wrap both consumer indices back to the start of the array */
        if (unlikely(consumer_head >= r->size)) {
                consumer_head = 0;
                r->consumer_tail = 0;
        }
        /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
        WRITE_ONCE(r->consumer_head, consumer_head);
}

/*
 * Remove and return the entry at the head of the ring, or return NULL if
 * there is none.  Callers are expected to hold consumer_lock, as the
 * locked wrappers below do.
 */
static inline void *__ptr_ring_consume(struct ptr_ring *r)
{
        /* The READ_ONCE in __ptr_ring_peek guarantees that anyone
         * accessing data through the pointer is up to date. Pairs
         * with smp_wmb in __ptr_ring_produce.
         */
        void *entry = __ptr_ring_peek(r);

        if (!entry)
                return NULL;

        __ptr_ring_discard_one(r);
        return entry;
}

/*
 * Consume up to @n entries into @array, stopping early at the first empty
 * slot.  Returns the number of entries stored.
 */
static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
                                             void **array, int n)
{
        int count = 0;

        while (count < n) {
                void *entry = __ptr_ring_consume(r);

                if (!entry)
                        break;
                array[count++] = entry;
        }

        return count;
}

/*
 * __ptr_ring_consume() under consumer_lock, taken with plain spin_lock().
 * Returns the consumed entry or NULL.
 *
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * call this in interrupt or BH context, you must disable interrupts/BH when
 * producing.
 */
static inline void *ptr_ring_consume(struct ptr_ring *r)
{
        void *entry;

        spin_lock(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock(&r->consumer_lock);
        return entry;
}

/* __ptr_ring_consume() under consumer_lock, taken with spin_lock_irq(). */
static inline void *ptr_ring_consume_irq(struct ptr_ring *r)
{
        void *entry;

        spin_lock_irq(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_irq(&r->consumer_lock);
        return entry;
}

/*
 * __ptr_ring_consume() under consumer_lock, taken with
 * spin_lock_irqsave() so the previous interrupt state is restored on
 * unlock.
 */
static inline void *ptr_ring_consume_any(struct ptr_ring *r)
{
        unsigned long irq_state;
        void *entry;

        spin_lock_irqsave(&r->consumer_lock, irq_state);
        entry = __ptr_ring_consume(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_state);
        return entry;
}

/* __ptr_ring_consume() under consumer_lock, taken with spin_lock_bh(). */
static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
{
        void *entry;

        spin_lock_bh(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_bh(&r->consumer_lock);
        return entry;
}

/*
 * __ptr_ring_consume_batched() under consumer_lock, taken with plain
 * spin_lock().  Returns the number of entries stored in @array.
 */
static inline int ptr_ring_consume_batched(struct ptr_ring *r,
                                           void **array, int n)
{
        int consumed;

        spin_lock(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock(&r->consumer_lock);
        return consumed;
}

/*
 * __ptr_ring_consume_batched() under consumer_lock, taken with
 * spin_lock_irq().  Returns the number of entries stored in @array.
 */
static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
                                               void **array, int n)
{
        int consumed;

        spin_lock_irq(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irq(&r->consumer_lock);
        return consumed;
}

/*
 * __ptr_ring_consume_batched() under consumer_lock, taken with
 * spin_lock_irqsave() so the previous interrupt state is restored on
 * unlock.  Returns the number of entries stored in @array.
 */
static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
                                               void **array, int n)
{
        unsigned long irq_state;
        int consumed;

        spin_lock_irqsave(&r->consumer_lock, irq_state);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irqrestore(&r->consumer_lock, irq_state);
        return consumed;
}

/*
 * __ptr_ring_consume_batched() under consumer_lock, taken with
 * spin_lock_bh().  Returns the number of entries stored in @array.
 */
static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
                                              void **array, int n)
{
        int consumed;

        spin_lock_bh(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_bh(&r->consumer_lock);
        return consumed;
}

/* Cast to structure type and call a function without discarding from FIFO.
 * Function must return a value.
 * Callers must take consumer_lock.
 */
#define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r)))

#define PTR_RING_PEEK_CALL(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL(): takes (r)->consumer_lock with
 * spin_lock_irq()/spin_unlock_irq() around the call and evaluates to the
 * value @f returned.
 *
 * The temporary holding the result is declared with typeof((f)(NULL)),
 * i.e. the return type of @f.
 * @r is expanded more than once, so it should be free of side effects.
 */
#define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL(): takes (r)->consumer_lock with
 * spin_lock_bh()/spin_unlock_bh() around the call and evaluates to the
 * value @f returned.
 *
 * The temporary holding the result is declared with typeof((f)(NULL)),
 * i.e. the return type of @f.
 * @r is expanded more than once, so it should be free of side effects.
 */
#define PTR_RING_PEEK_CALL_BH(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL(): takes (r)->consumer_lock with
 * spin_lock_irqsave() around the call, restores the saved interrupt flags
 * with spin_unlock_irqrestore(), and evaluates to the value @f returned.
 *
 * The temporary holding the result is declared with typeof((f)(NULL)),
 * i.e. the return type of @f; the saved flags live in a second macro-local
 * variable.
 * @r is expanded more than once, so it should be free of side effects.
 */
#define PTR_RING_PEEK_CALL_ANY(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        unsigned long __PTR_RING_PEEK_CALL_f;\
        \
        spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v; \
})

/*
 * Allocate a zero-filled array of @size pointer slots to back a ring.
 *
 * Only some gfp_t flags (GFP_KERNEL among them) may be passed in @gfp; the
 * vmalloc documentation lists which ones are legal.
 *
 * Returns NULL when @size slots would need more than KMALLOC_MAX_SIZE bytes
 * or when kvmalloc_array() fails.
 */
static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
{
        if (size <= KMALLOC_MAX_SIZE / sizeof(void *))
                return kvmalloc_array(size, sizeof(void *),
                                      gfp | __GFP_ZERO);

        return NULL;
}

/*
 * Record the ring size and derive the batch value from it.
 *
 * The batch starts out as the number of queue slots that fit into two
 * cache lines (SMP_CACHE_BYTES * 2 bytes).
 */
static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
{
        r->size = size;
        r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue));

        /*
         * The logic in __ptr_ring_discard_one needs batch to be at least 1.
         * On a small ring a large batch would make things very bursty; until
         * this is tuned, batching is simply switched off (batch = 1) whenever
         * the batch would cover more than half of the ring.
         */
        if (!r->batch || r->batch > r->size / 2)
                r->batch = 1;
}

/*
 * Set up @r as an empty ring with @size slots.
 *
 * The queue is allocated with @gfp, all three indices start at 0 and both
 * spinlocks are initialised.
 *
 * Returns 0 on success, or -ENOMEM if the queue could not be allocated
 * (r->queue is left NULL in that case).
 */
static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
{
        r->queue = __ptr_ring_init_queue_alloc(size, gfp);
        if (r->queue == NULL)
                return -ENOMEM;

        __ptr_ring_set_size(r, size);

        r->consumer_tail = 0;
        r->consumer_head = 0;
        r->producer = 0;

        spin_lock_init(&r->producer_lock);
        spin_lock_init(&r->consumer_lock);

        return 0;
}

/*
 * Return entries into ring. Destroy entries that don't fit.
 *
 * @r:       ring the entries are pushed back into
 * @batch:   array holding the entries to return
 * @n:       number of entries in @batch
 * @destroy: invoked on every @batch entry that could not be put back; it is
 *           called without a NULL check and while both ring locks are held
 *
 * Entries are taken from the end of @batch and stored at decreasing ring
 * positions just in front of the consumer head (wrapping to the last slot
 * when the head is at index 0), so the last entry of @batch ends up furthest
 * from the new head.
 *
 * Note: this is expected to be a rare slow path operation.
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 */
static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
                                      void (*destroy)(void *))
{
        unsigned long flags;
        int head;

        spin_lock_irqsave(&r->consumer_lock, flags);
        spin_lock(&r->producer_lock);

        /* A zero-sized ring has no slot to use: the whole batch is destroyed. */
        if (!r->size)
                goto done;

        /*
         * Clean out buffered entries (for simplicity). This way following code
         * can test entries for NULL and if not assume they are valid.
         */
        head = r->consumer_head - 1;
        while (likely(head >= r->consumer_tail))
                r->queue[head--] = NULL;
        r->consumer_tail = r->consumer_head;

        /*
         * Go over entries in batch, start moving head back and copy entries.
         * Stop when we run into previously unconsumed entries.
         */
        while (n) {
                head = r->consumer_head - 1;
                /* Stepping back from slot 0 wraps to the last slot. */
                if (head < 0)
                        head = r->size - 1;
                if (r->queue[head]) {
                        /* This batch entry will have to be destroyed. */
                        goto done;
                }
                r->queue[head] = batch[--n];
                r->consumer_tail = head;
                /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
                WRITE_ONCE(r->consumer_head, head);
        }

done:
        /* Destroy all entries left in the batch. */
        while (n)
                destroy(batch[--n]);
        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, flags);
}

/*
 * Move the contents of @r into the freshly allocated @queue of @size slots
 * and make @queue the ring's backing array.
 *
 * Entries are drained with __ptr_ring_consume() and copied to the start of
 * @queue in the order they come out. Entries beyond @size are passed to
 * @destroy when it is non-NULL and are otherwise just dropped from the ring.
 * @gfp is not used here.
 *
 * No locking is done in this function; the callers in this file hold both
 * consumer_lock and producer_lock around it.
 *
 * Returns the previous backing array; freeing it is left to the caller.
 */
static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
                                           int size, gfp_t gfp,
                                           void (*destroy)(void *))
{
        int kept = 0;
        void **previous;
        void *entry;

        for (entry = __ptr_ring_consume(r); entry;
             entry = __ptr_ring_consume(r)) {
                if (kept < size) {
                        queue[kept] = entry;
                        kept++;
                        continue;
                }
                if (destroy)
                        destroy(entry);
        }

        /* A completely filled queue puts the producer index back at slot 0. */
        if (kept >= size)
                kept = 0;

        __ptr_ring_set_size(r, size);
        r->producer = kept;
        r->consumer_head = 0;
        r->consumer_tail = 0;

        previous = r->queue;
        r->queue = queue;

        return previous;
}

/*
 * Resize @r to @size slots.
 *
 * The replacement queue is allocated before any lock is taken; the swap
 * itself runs under consumer_lock (taken with interrupts saved/disabled) and
 * producer_lock, and the old queue is freed after both locks are dropped.
 * Entries that do not fit into the new queue are handed to @destroy by
 * __ptr_ring_swap_queue().
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 *
 * Returns 0 on success, or -ENOMEM (ring untouched) if the new queue could
 * not be allocated.
 */
static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
                                  void (*destroy)(void *))
{
        unsigned long irq_state;
        void **fresh;
        void **retired;

        fresh = __ptr_ring_init_queue_alloc(size, gfp);
        if (!fresh)
                return -ENOMEM;

        spin_lock_irqsave(&r->consumer_lock, irq_state);
        spin_lock(&r->producer_lock);

        retired = __ptr_ring_swap_queue(r, fresh, size, gfp, destroy);

        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, irq_state);

        kvfree(retired);

        return 0;
}

/*
 * Resize each of the @nrings rings in @rings to @size slots.
 *
 * All replacement queues are allocated first; only when every allocation
 * succeeded are the rings switched over, one at a time, each under its own
 * consumer_lock (interrupts saved/disabled) and producer_lock. A failed
 * allocation therefore leaves every ring untouched.
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 *
 * Returns 0 on success or -ENOMEM if any allocation failed.
 */
static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
                                           unsigned int nrings,
                                           int size,
                                           gfp_t gfp, void (*destroy)(void *))
{
        unsigned long flags;
        void ***queues;
        int i;

        queues = kmalloc_array(nrings, sizeof(*queues), gfp);
        if (!queues)
                return -ENOMEM;

        /* Step 1: obtain every new queue before any ring is modified. */
        for (i = 0; i < nrings; ++i) {
                queues[i] = __ptr_ring_init_queue_alloc(size, gfp);
                if (!queues[i])
                        goto free_allocated;
        }

        /* Step 2: swap; afterwards queues[i] holds the ring's old queue. */
        for (i = 0; i < nrings; ++i) {
                struct ptr_ring *ring = rings[i];

                spin_lock_irqsave(&ring->consumer_lock, flags);
                spin_lock(&ring->producer_lock);
                queues[i] = __ptr_ring_swap_queue(ring, queues[i],
                                                  size, gfp, destroy);
                spin_unlock(&ring->producer_lock);
                spin_unlock_irqrestore(&ring->consumer_lock, flags);
        }

        /* Step 3: release the old queues with no lock held. */
        for (i = 0; i < nrings; ++i)
                kvfree(queues[i]);

        kfree(queues);

        return 0;

free_allocated:
        /* Slot i is the failed one; release only the slots before it. */
        while (i-- > 0)
                kvfree(queues[i]);

        kfree(queues);

        return -ENOMEM;
}

/*
 * Tear down @r: release its backing queue with kvfree().
 *
 * When @destroy is non-NULL, every entry that ptr_ring_consume() still
 * returns is passed to it first. With a NULL @destroy the ring is not
 * drained at all before the queue is freed.
 */
static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{
        void *entry;

        if (destroy) {
                for (entry = ptr_ring_consume(r); entry;
                     entry = ptr_ring_consume(r))
                        destroy(entry);
        }

        kvfree(r->queue);
}

#endif /* _LINUX_PTR_RING_H  */
























































































































































































    2 












    3 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 












    3 







    3 









    1 













    2 



    2 















    2 






    2 










    2 



    3 




    3 













































































































































































































































































































































































































































































































































































































































































































































































































    3 










    3 


















    3 






















































    3 














    3 
    3 












    3 






























    3 

























































    3 







    3 































    3 


    3 













    3 







    3 













































































































































































































































































































































































































































   14 


   14 


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * Global page-map bookkeeping, defined only when CONFIG_NEED_MULTIPLE_NODES
 * is not set.  Original note: use the per-pgdat data instead for
 * discontigmem - mbligh
 *
 * Both symbols are exported; neither is assigned in this part of the file.
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

/* NOTE(review): presumably the base of the flat struct page array - confirm. */
struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 *
 * The default is 1 with CONFIG_COMPAT_BRK and 2 otherwise; the "norandmaps"
 * boot parameter (see disable_randmaps()) forces the value to 0.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
                                        1;
#else
                                        2;
#endif

#ifndef arch_faults_on_old_pte
/*
 * Generic fallback, compiled only when nothing has already defined
 * arch_faults_on_old_pte.  Always returns true.
 */
static inline bool arch_faults_on_old_pte(void)
{
        /*
         * Those arches which don't have hw access flag feature need to
         * implement their own helper. By default, "true" means pagefault
         * will be hit on old pte.
         */
        return true;
}
#endif

/*
 * Handler for the "norandmaps" boot parameter: switches address space
 * randomization off by setting randomize_va_space to 0.  The argument
 * string @s is not looked at.  Always returns 1.
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

/* Page frame number of ZERO_PAGE(0); filled in by init_zero_pfn(). */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

/*
 * Upper bound checked by vm_normal_page() and vm_normal_page_pmd(): a pfn
 * above this value is never turned into a struct page there.  It is not
 * assigned in this part of the file.
 */
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 *
 * Record the page frame number of ZERO_PAGE(0) in zero_pfn.  Registered as
 * an early_initcall; always returns 0.
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Non-inline wrapper around the rss_stat tracepoint: forwards @mm, the
 * counter index @member and the value @count to trace_rss_stat().
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
        trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

/*
 * Flush the rss deltas cached in the current task into @mm's counters,
 * then reset the task's cached counts and its event counter to zero.
 */
void sync_mm_rss(struct mm_struct *mm)
{
        int member;

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                /* Nothing pending for this counter: leave it alone. */
                if (!current->rss_stat.count[member])
                        continue;

                add_mm_counter(mm, member, current->rss_stat.count[member]);
                current->rss_stat.count[member] = 0;
        }

        current->rss_stat.events = 0;
}

/*
 * Add @val to rss counter @member.  When @mm belongs to the current task
 * the delta is accumulated in the task's private rss_stat cache; for any
 * other mm it is applied straight away through add_mm_counter().
 */
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
        if (unlikely(current->mm != mm)) {
                add_mm_counter(mm, member, val);
                return;
        }

        current->rss_stat.count[member] += val;
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/*
 * Count one event against @task and, once the per-task event counter has
 * grown past TASK_RSS_EVENTS_THRESH, fold the task's cached rss deltas
 * into its mm.  Tasks other than current are ignored.
 */
#define TASK_RSS_EVENTS_THRESH        (64)
static void check_sync_rss_stat(struct task_struct *task)
{
        if (likely(task == current) &&
            unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
                sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

/*
 * Detach the pte table referenced by @pmd and hand it to @tlb for freeing.
 *
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        /* Remember the table before the pmd entry is wiped. */
        pgtable_t pte_table = pmd_pgtable(*pmd);

        pmd_clear(pmd);
        pte_free_tlb(tlb, pte_table, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Free the pte tables referenced by the pmd entries covering [addr, end).
 * Afterwards the pmd table itself is released as well, unless the
 * PUD-aligned start lies below @floor or @end lies beyond the PUD-aligned
 * @ceiling.
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long table_start = addr & PUD_MASK;
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
                next = pmd_addr_end(addr, end);
                if (!pmd_none_or_clear_bad(pmd))
                        free_pte_range(tlb, pmd, addr);
                pmd++;
                addr = next;
        } while (addr != end);

        if (table_start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        /* "- 1" on both sides so that a ceiling of 0 compares as the top. */
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, table_start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, table_start);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Walk the pud entries covering [addr, end) and free the pmd level below
 * each populated one.  Afterwards the pud table itself is released as
 * well, unless the P4D-aligned start lies below @floor or @end lies beyond
 * the P4D-aligned @ceiling.
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long table_start = addr & P4D_MASK;
        pud_t *pud = pud_offset(p4d, addr);
        unsigned long next;

        do {
                next = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud))
                        free_pmd_range(tlb, pud, addr, next, floor, ceiling);
                pud++;
                addr = next;
        } while (addr != end);

        if (table_start < floor)
                return;
        if (ceiling) {
                ceiling &= P4D_MASK;
                if (!ceiling)
                        return;
        }
        /* "- 1" on both sides so that a ceiling of 0 compares as the top. */
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(p4d, table_start);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, table_start);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Walk the p4d entries covering [addr, end) and free the pud level below
 * each populated one.  Afterwards the p4d table itself is released as
 * well, unless the PGDIR-aligned start lies below @floor or @end lies
 * beyond the PGDIR-aligned @ceiling.
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long table_start = addr & PGDIR_MASK;
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long next;

        do {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d))
                        free_pud_range(tlb, p4d, addr, next, floor, ceiling);
                p4d++;
                addr = next;
        } while (addr != end);

        if (table_start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        /* "- 1" on both sides so that a ceiling of 0 compares as the top. */
        if (end - 1 > ceiling - 1)
                return;

        p4d = p4d_offset(pgd, table_start);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, table_start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * @tlb:     mmu_gather collecting the freed page-table pages; its mm
 *           supplies the pgd that is walked.
 * @addr:    start of the range whose page tables may be freed.
 * @end:     end of that range.
 * @floor:   lower limit; 0 refers to the bottom of the address space.
 * @ceiling: upper limit; 0 refers to the top of the address space.
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        /* Round addr down to a PMD boundary; step up one PMD if that fell below floor. */
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                if (!addr)
                        return;
        }
        /* Round a non-zero ceiling down; give up if it collapses to 0. */
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        /* Pull end down by one PMD when it reaches past the ceiling. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* Nothing left to free once the adjusted range is empty. */
        if (addr > end - 1)
                return;
        /*
         * We add page table cache pages with PAGE_SIZE,
         * (see pte_free_tlb()), flush the tlb if we need
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * Walk the vma list starting at @vma, unlink every vma from its anon_vma
 * and file mappings, and free the page tables backing it.  @floor and
 * @ceiling are passed through to the range-freeing helpers; the start of
 * the following vma, when there is one, is used as the upper limit instead
 * of @ceiling.
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long floor, unsigned long ceiling)
{
        struct vm_area_struct *next;

        for (; vma; vma = next) {
                unsigned long addr = vma->vm_start;

                next = vma->vm_next;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);

                if (is_vm_hugetlb_page(vma)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                        continue;
                }

                /*
                 * Batch up following non-hugetlb vmas that start within
                 * PMD_SIZE of the current end, so one call covers them all.
                 */
                while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                       && !is_vm_hugetlb_page(next)) {
                        vma = next;
                        next = vma->vm_next;
                        unlink_anon_vmas(vma);
                        unlink_file_vma(vma);
                }
                free_pgd_range(tlb, addr, vma->vm_end,
                        floor, next ? next->vm_start : ceiling);
        }
}

/*
 * Allocate a pte table for @mm and install it in @pmd if the entry is still
 * empty once the pmd lock is held; if another table got there first, the
 * new one is freed again.  Returns -ENOMEM when the allocation fails,
 * 0 otherwise.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table = pte_alloc_one(mm);
        spinlock_t *ptl;

        if (!table)
                return -ENOMEM;

        /*
         * Everything done to set the pte table up (eg. pte page lock and
         * page clearing) has to be visible before the table itself becomes
         * visible to other CPUs through the page tables.
         *
         * The matching read side is the pointer chasing done by page table
         * walkers, which mostly run without locking. Those accesses form a
         * chain of data-dependent loads, which most CPUs (alpha being the
         * notable exception) already observe in order. See the alpha page
         * table accessors for the smp_rmb() barriers in page table walking
         * code.
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

        ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                /* Someone else populated the pmd first: drop our table. */
                spin_unlock(ptl);
                pte_free(mm, table);
                return 0;
        }

        mm_inc_nr_ptes(mm);
        pmd_populate(mm, pmd, table);
        spin_unlock(ptl);
        return 0;
}

/*
 * Kernel counterpart of __pte_alloc(): allocate a pte table for init_mm
 * and install it in @pmd under init_mm.page_table_lock if the entry is
 * still empty; otherwise free the new table again.  Returns -ENOMEM when
 * the allocation fails, 0 otherwise.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table = pte_alloc_one_kernel(&init_mm);

        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&init_mm.page_table_lock);
        if (unlikely(!pmd_none(*pmd))) {
                /* Someone else populated the pmd first: drop our table. */
                spin_unlock(&init_mm.page_table_lock);
                pte_free_kernel(&init_mm, table);
                return 0;
        }

        pmd_populate_kernel(&init_mm, pmd, table);
        spin_unlock(&init_mm.page_table_lock);
        return 0;
}

/* Zero all NR_MM_COUNTERS entries of the rss delta array @rss. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/*
 * Apply every non-zero delta in @rss to the matching counter of @mm.  When
 * @mm is the current task's mm, sync_mm_rss() is called first.
 */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member;

        if (current->mm == mm)
                sync_mm_rss(mm);

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                if (!rss[member])
                        continue;
                add_mm_counter(mm, member, rss[member]);
        }
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * @vma:  vma in which the bad pte was found.
 * @addr: virtual address the pte maps.
 * @pte:  the offending pte value.
 * @page: page to dump along with the report, or NULL for none.
 *
 * Output is rate limited through the function-local static counters. A
 * report that is not suppressed also dumps the stack and taints the kernel
 * with TAINT_BAD_PAGE.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                          pte_t pte, struct page *page)
{
        /* Walk down to the pmd so its value can be printed next to the pte. */
        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        struct address_space *mapping;
        pgoff_t index;
        /* Rate-limit state shared by all callers. */
        static unsigned long resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (nr_shown == 60) {
                if (time_before(jiffies, resume)) {
                        nr_unshown++;
                        return;
                }
                /* The quiet period is over: say how much was dropped. */
                if (nr_unshown) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        /* The first report of a burst starts the 60 second window. */
        if (nr_shown++ == 0)
                resume = jiffies + 60 * HZ;

        mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
        index = linear_page_index(vma, addr);

        pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
        if (page)
                dump_page(page, "bad pte");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 mapping ? mapping->a_ops->readpage : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);

        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                if (unlikely(pte_special(pte))) {
                        /* Let the vma resolve its own special pages. */
                        if (vma->vm_ops && vma->vm_ops->find_special_page)
                                return vma->vm_ops->find_special_page(vma, addr);

                        /* Special ptes that are expected here: no page. */
                        if ((vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) ||
                            is_zero_pfn(pfn) ||
                            pte_devmap(pte))
                                return NULL;

                        /* Any other special pte is a bug worth reporting. */
                        print_bad_pte(vma, addr, pte, NULL);
                        return NULL;
                }
                /* Non-special pte: only the pfn range check remains. */
        } else {
                /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
                if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                        if (vma->vm_flags & VM_MIXEDMAP) {
                                /* Mixed map: normal only if the pfn is valid. */
                                if (!pfn_valid(pfn))
                                        return NULL;
                                return pfn_to_page(pfn);
                        }

                        /* PFNMAP: pfns on the linear rule are special. */
                        if (pfn == vma->vm_pgoff +
                                   ((addr - vma->vm_start) >> PAGE_SHIFT))
                                return NULL;
                        if (!is_cow_mapping(vma->vm_flags))
                                return NULL;
                }

                if (is_zero_pfn(pfn))
                        return NULL;
        }

        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmd-level variant of vm_normal_page(): return the struct page mapped by
 * @pmd at @addr in @vma, or NULL when the mapping is not a normal one.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        /*
         * There is no pmd_special() but there may be special pmds, e.g.
         * in a direct-access (dax) mapping, so let's just replicate the
         * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
         */
        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                if (vma->vm_flags & VM_MIXEDMAP) {
                        /* Mixed map: normal only if the pfn is valid. */
                        if (!pfn_valid(pfn))
                                return NULL;
                        return pfn_to_page(pfn);
                }

                /* PFNMAP: pfns on the linear rule are special. */
                if (pfn == vma->vm_pgoff +
                           ((addr - vma->vm_start) >> PAGE_SHIFT))
                        return NULL;
                if (!is_cow_mapping(vma->vm_flags))
                        return NULL;
        }

        if (pmd_devmap(pmd) ||
            is_huge_zero_pmd(pmd) ||
            unlikely(pfn > highest_memmap_pfn))
                return NULL;

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}
#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

/*
 * Copy one non-present pte (a swap, migration or device-private entry)
 * from src_mm to dst_mm at @addr.
 *
 * Returns 0 once the entry has been written to @dst_pte.  If
 * swap_duplicate() fails for a genuine swap entry, nothing is written and
 * the raw entry value is returned instead; copy_pte_range() treats a
 * non-zero return as a request to call add_swap_count_continuation().
 *
 * @rss accumulates per-counter deltas that the caller later adds to the
 * destination mm.  Writable migration and device-private entries in a COW
 * mapping are downgraded to read entries, and that downgraded entry is
 * also written back to @src_pte.
 */
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        unsigned long vm_flags = dst_vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(pte);

        if (likely(!non_swap_entry(entry))) {
                /* Genuine swap entry: take another reference on the slot. */
                if (swap_duplicate(entry) < 0)
                        return entry.val;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Re-check now that mmlist_lock is held. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                page = migration_entry_to_page(entry);

                rss[mm_counter(page)]++;

                if (is_write_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both
                         * parent and child to be set to read.
                         */
                        make_migration_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
                        /* Carry the soft-dirty and uffd-wp bits over to the new entry. */
                        if (pte_swp_soft_dirty(*src_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = device_private_entry_to_page(entry);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should be treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                get_page(page);
                rss[mm_counter(page)]++;
                page_dup_rmap(page, false);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_write_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        make_device_private_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        }
        /* The child only keeps the uffd-wp bit if its vma is uffd-wp armed. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct page **prealloc, pte_t pte, struct page *page)
{
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct page *new_page;

        /* Non-COW mappings never need an early copy: let the caller share. */
        if (!is_cow_mapping(src_vma->vm_flags))
                return 1;

        /*
         * What we want to do is to check whether this page may
         * have been pinned by the parent process.  If so,
         * instead of wrprotect the pte on both sides, we copy
         * the page immediately so that we'll always guarantee
         * the pinned page won't be randomly replaced in the
         * future.
         *
         * The page pinning checks are just "has this mm ever
         * seen pinning", along with the (inexact) check of
         * the page count. That might give false positives
         * for pinning, but it will work correctly.
         */
        if (likely(!atomic_read(&src_mm->has_pinned)))
                return 1;
        if (likely(!page_maybe_dma_pinned(page)))
                return 1;

        /*
         * The vma->anon_vma of the child process may be NULL
         * because the entire vma does not contain anonymous pages.
         * A BUG will occur when the copy_present_page() passes
         * a copy of a non-anonymous page of that vma to the
         * page_add_new_anon_rmap() to set up new anonymous rmap.
         * Return 1 if the page is not an anonymous page.
         */
        if (!PageAnon(page))
                return 1;

        /*
         * A copy is needed.  Without a preallocated page we cannot
         * proceed: report -EAGAIN so the caller can allocate one.
         */
        new_page = *prealloc;
        if (!new_page)
                return -EAGAIN;

        /*
         * We have a prealloc page, all good!  Take it
         * over and copy the page & arm it.
         */
        *prealloc = NULL;
        copy_user_highpage(new_page, page, addr, src_vma);
        __SetPageUptodate(new_page);
        page_add_new_anon_rmap(new_page, dst_vma, addr, false);
        lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
        rss[mm_counter(new_page)]++;

        /*
         * All done, just insert the new page copy in the child.
         * The incoming @pte value is not read; the variable is reused
         * to build the child's pte from scratch.
         */
        pte = mk_pte(new_page, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
        if (userfaultfd_pte_wp(dst_vma, *src_pte))
                /* Uffd-wp needs to be delivered to dest pte as well */
                pte = pte_wrprotect(pte_mkuffd_wp(pte));
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                 struct page **prealloc)
{
        struct mm_struct *src_mm = src_vma->vm_mm;
        unsigned long vm_flags = src_vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;

        /*
         * page is NULL when vm_normal_page() finds no normal page behind
         * this pte; in that case the pte is copied without touching any
         * refcount, rmap or rss counter.
         */
        page = vm_normal_page(src_vma, addr, pte);
        if (page) {
                int retval;

                /*
                 * retval == 0: the child already got its own copy.
                 * retval  < 0: a preallocated page is needed (-EAGAIN).
                 * retval  > 0: share the page, handled below.
                 */
                retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                           addr, rss, prealloc, pte, page);
                if (retval <= 0)
                        return retval;

                get_page(page);
                page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
        }

        /*
         * If it's a COW mapping, write protect it both
         * in the parent and the child
         */
        if (is_cow_mapping(vm_flags) && pte_write(pte)) {
                ptep_set_wrprotect(src_mm, addr, src_pte);
                pte = pte_wrprotect(pte);
        }

        /*
         * If it's a shared mapping, mark it clean in
         * the child
         */
        if (vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);
        /* The child's pte always starts out old (not accessed). */
        pte = pte_mkold(pte);

        /* Drop the uffd-wp bit unless the child vma is uffd-wp armed. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);

        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Allocate a page for @addr in @vma and charge it to @src_mm's memcg so
 * that it can later serve as the preallocated copy target handed to
 * copy_present_pte().  Returns NULL if either the allocation or the
 * charge fails.
 */
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
                   unsigned long addr)
{
        struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);

        if (page) {
                if (!mem_cgroup_charge(page, src_mm, GFP_KERNEL)) {
                        cgroup_throttle_swaprate(page, GFP_KERNEL);
                        return page;
                }
                /* Charging failed: release the freshly allocated page. */
                put_page(page);
        }

        return NULL;
}

/*
 * Copy the ptes covering [addr, end) below @src_pmd into the pte table
 * below @dst_pmd (which is allocated here if necessary).
 *
 * The copy loop runs with both the destination and the source pte locks
 * held.  It is left early - and, if part of the range is still pending,
 * restarted from the `again' label once the locks have been dropped - when
 * rescheduling or lock contention is detected, when a swap count
 * continuation has to be added, or when copy_present_pte() returns -EAGAIN
 * to request a preallocated page.
 *
 * Returns 0 on success, or -ENOMEM if the pte table, the swap count
 * continuation or the preallocated page cannot be obtained.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct page *prealloc = NULL;

again:
        progress = 0;
        init_rss_vec(rss);

        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
        src_pte = pte_offset_map(src_pmd, addr);
        src_ptl = pte_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases: the cursors advance, unmap needs these. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                /* Empty ptes add 1 to progress, copied ptes add 8. */
                if (pte_none(*src_pte)) {
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(*src_pte))) {
                        /*
                         * A non-zero return is the swap entry that needs
                         * a count continuation; handle it after unlocking.
                         */
                        entry.val = copy_nonpresent_pte(dst_mm, src_mm,
                                                        dst_pte, src_pte,
                                                        dst_vma, src_vma,
                                                        addr, rss);
                        if (entry.val)
                                break;
                        progress += 8;
                        continue;
                }
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
                                       addr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 */
                if (unlikely(ret == -EAGAIN))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        put_page(prealloc);
                        prealloc = NULL;
                }
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
        pte_unmap(orig_src_pte);
        /* Fold the counters gathered in this pass into the destination mm. */
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        if (entry.val) {
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret) {
                WARN_ON_ONCE(ret != -EAGAIN);
                prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                /* prealloc is NULL here, so there is nothing to release. */
                if (!prealloc)
                        return -ENOMEM;
                /* We've captured and resolved the error. Reset, try again. */
                ret = 0;
        }
        if (addr != end)
                goto again;
out:
        /* Release a preallocated page that ended up unused. */
        if (unlikely(prealloc))
                put_page(prealloc);
        return ret;
}

/*
 * Copy the pmd entries covering [addr, end) below @src_pud into the
 * destination, allocating the destination pmd table as needed.
 *
 * Swap, transparent-huge and devmap pmds are handed to copy_huge_pmd();
 * everything else that is populated goes through copy_pte_range().
 *
 * Returns 0 on success or -ENOMEM on failure.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pmd_t *src_pmd, *dst_pmd;
        unsigned long next;

        dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
        if (!dst_pmd)
                return -ENOMEM;
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
                        || pmd_devmap(*src_pmd)) {
                        int err;
                        /* A huge pmd must span the whole PMD-sized step. */
                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
                        err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
                                            addr, dst_vma, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        /* 0 means the huge pmd was fully handled. */
                        if (!err)
                                continue;
                        /* fall through */
                }
                if (pmd_none_or_clear_bad(src_pmd))
                        continue;
                /* Any non-zero result from the pte level is reported as -ENOMEM. */
                if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pmd++, src_pmd++, addr = next, addr != end);
        return 0;
}

/*
 * Copy the pud entries covering [addr, end) below @src_p4d into the
 * destination, allocating the destination pud table as needed.
 *
 * Transparent-huge and devmap puds are handed to copy_huge_pud();
 * everything else that is populated goes through copy_pmd_range().
 *
 * Returns 0 on success or -ENOMEM on failure.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pud_t *src_pud, *dst_pud;
        unsigned long next;

        dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
        if (!dst_pud)
                return -ENOMEM;
        src_pud = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
                        int err;

                        /* A huge pud must span the whole PUD-sized step. */
                        VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
                        err = copy_huge_pud(dst_mm, src_mm,
                                            dst_pud, src_pud, addr, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        /* 0 means the huge pud was fully handled. */
                        if (!err)
                                continue;
                        /* fall through */
                }
                if (pud_none_or_clear_bad(src_pud))
                        continue;
                /* Any non-zero result from the pmd level is reported as -ENOMEM. */
                if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pud++, src_pud++, addr = next, addr != end);
        return 0;
}

/*
 * Walk the p4d entries covering [addr, end) below @src_pgd and copy every
 * populated one via copy_pud_range(), allocating the destination p4d
 * table first.  Returns 0 on success or -ENOMEM on failure.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        p4d_t *to_p4d, *from_p4d;
        unsigned long boundary;

        to_p4d = p4d_alloc(dst_vma->vm_mm, dst_pgd, addr);
        if (!to_p4d)
                return -ENOMEM;

        from_p4d = p4d_offset(src_pgd, addr);
        do {
                boundary = p4d_addr_end(addr, end);
                /* Empty or bad source entries are simply stepped over. */
                if (!p4d_none_or_clear_bad(from_p4d) &&
                    copy_pud_range(dst_vma, src_vma, to_p4d, from_p4d,
                                   addr, boundary))
                        return -ENOMEM;
        } while (to_p4d++, from_p4d++, addr = boundary, addr != end);

        return 0;
}

/*
 * Copy the page table entries of @src_vma (from vm_start to vm_end) into
 * the mm of @dst_vma.
 *
 * Returns 0 on success (including when the copy is skipped altogether),
 * the result of copy_hugetlb_page_range() for hugetlb vmas, the error
 * from track_pfn_copy() for VM_PFNMAP vmas, or -ENOMEM when copying a
 * lower page-table level fails.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
        bool is_cow;
        int ret;

        /*
         * Don't copy ptes where a page fault will fill them correctly.
         * Fork becomes much lighter when there are big shared or private
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
        if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
            !src_vma->anon_vma)
                return 0;

        /* hugetlb vmas have their own copy routine. */
        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
                ret = track_pfn_copy(src_vma);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);

        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_vma, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * Disabling preemption is not needed for the write side, as
                 * the read side doesn't spin, but goes to the mmap_lock.
                 *
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
                mmap_assert_write_locked(src_mm);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                /* Stop at the first failure; the cleanup below still runs. */
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Undo the COW bracketing in the reverse order it was set up. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        return ret;
}

/*
 * Tell whether COWed (private) pages should be zapped as well.
 *
 * With no zap_details everything is zapped.  Otherwise private pages are
 * zapped only when the caller did not restrict the zap to one mapping
 * through details->check_mapping.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        return !details || !details->check_mapping;
}

/*
 * Zap the ptes in [addr, end) that are mapped through @pmd.
 *
 * Present ptes are cleared and their pages handed to the mmu_gather;
 * device-private, swap and migration entries are cleared as well.  When
 * @details carries a check_mapping, entries whose page belongs to a
 * different mapping are left alone, and genuine swap entries are left
 * alone unless should_zap_cows() allows zapping them.
 *
 * The pte lock is dropped and the walk restarted from the `again' label
 * whenever need_resched() is set or __tlb_remove_page() returns non-zero.
 *
 * Returns the address reached, which equals @end by the time the function
 * returns.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        struct mm_struct *mm = tlb->mm;
        int force_flush = 0;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;

        tlb_change_page_size(tlb, PAGE_SIZE);
again:
        init_rss_vec(rss);
        start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        pte = start_pte;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
                if (pte_none(ptent))
                        continue;

                /* Leave with this pte untouched; the walk resumes at addr. */
                if (need_resched())
                        break;

                if (pte_present(ptent)) {
                        struct page *page;

                        page = vm_normal_page(vma, addr, ptent);
                        if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
                                 * invalidate cache without truncating:
                                 * unmap shared but keep private pages.
                                 */
                                if (details->check_mapping &&
                                    details->check_mapping != page_rmapping(page))
                                        continue;
                        }
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        /* No normal page behind the pte: nothing to account. */
                        if (unlikely(!page))
                                continue;

                        if (!PageAnon(page)) {
                                /* Propagate dirty/young state from the pte to the page. */
                                if (pte_dirty(ptent)) {
                                        force_flush = 1;
                                        set_page_dirty(page);
                                }
                                if (pte_young(ptent) &&
                                    likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
                        page_remove_rmap(page, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(__tlb_remove_page(tlb, page))) {
                                force_flush = 1;
                                /*
                                 * This pte is done, but `break' skips the
                                 * loop's own increment: advance addr here.
                                 */
                                addr += PAGE_SIZE;
                                break;
                        }
                        continue;
                }

                entry = pte_to_swp_entry(ptent);
                if (is_device_private_entry(entry)) {
                        struct page *page = device_private_entry_to_page(entry);

                        if (unlikely(details && details->check_mapping)) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
                                 * invalidate cache without truncating:
                                 * unmap shared but keep private pages.
                                 */
                                if (details->check_mapping !=
                                    page_rmapping(page))
                                        continue;
                        }

                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
                        page_remove_rmap(page, false);
                        put_page(page);
                        continue;
                }

                if (!non_swap_entry(entry)) {
                        /* Genuine swap entry, hence a private anon page */
                        if (!should_zap_cows(details))
                                continue;
                        rss[MM_SWAPENTS]--;
                } else if (is_migration_entry(entry)) {
                        struct page *page;

                        page = migration_entry_to_page(entry);
                        if (details && details->check_mapping &&
                            details->check_mapping != page_rmapping(page))
                                continue;
                        rss[mm_counter(page)]--;
                }
                if (unlikely(!free_swap_and_cache(entry)))
                        print_bad_pte(vma, addr, ptent, NULL);
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);

        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush)
                tlb_flush_mmu_tlbonly(tlb);
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Restart if we didn't do everything.
         */
        if (force_flush) {
                force_flush = 0;
                tlb_flush_mmu(tlb);
        }

        if (addr != end) {
                cond_resched();
                goto again;
        }

        return addr;
}

/*
 * Zap the pmd entries covering [addr, end) below @pud.
 *
 * Swap, transparent-huge and devmap pmds are either split (when only part
 * of the huge range is being zapped) or zapped whole via zap_huge_pmd();
 * remaining populated pmds are handed to zap_pte_range().
 *
 * Returns the address reached, which equals @end when the loop finishes.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        /* Partial range: split first, then zap at pte level. */
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
                } else if (details && details->single_page &&
                           PageTransCompound(details->single_page) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }

                /*
                 * Here there can be other concurrent MADV_DONTNEED or
                 * trans huge page faults running, and if the pmd is
                 * none or trans huge it can change under us. This is
                 * because MADV_DONTNEED holds the mmap_lock in read
                 * mode.
                 */
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        goto next;
                /* zap_pte_range() returns the address it reached. */
                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);

        return addr;
}

/*
 * Zap the pud entries covering [addr, end) below @p4d.
 *
 * Transparent-huge and devmap puds are either split (when only part of
 * the huge range is being zapped) or zapped whole via zap_huge_pud();
 * remaining populated puds are handed to zap_pmd_range().
 *
 * Returns the address reached, which equals @end when the loop finishes.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                        if (next - addr != HPAGE_PUD_SIZE) {
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        } else if (zap_huge_pud(tlb, vma, pud, addr))
                                goto next;
                        /* fall through */
                }
                /* Note: `continue' goes straight to the loop test, skipping cond_resched(). */
                if (pud_none_or_clear_bad(pud))
                        continue;
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * Walk the p4d entries covering [addr, end) below @pgd and zap every
 * populated one through zap_pud_range().  Returns the address reached,
 * which equals @end when the loop finishes.
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long boundary;

        do {
                boundary = p4d_addr_end(addr, end);
                /* Empty or bad entries are stepped over without descending. */
                if (!p4d_none_or_clear_bad(p4d))
                        boundary = zap_pud_range(tlb, vma, p4d, addr,
                                                 boundary, details);
        } while (p4d++, addr = boundary, addr != end);

        return addr;
}

/*
 * Zap the page table entries of @vma in [addr, end), walking from the pgd
 * level down.  The walk is bracketed by tlb_start_vma()/tlb_end_vma().
 * An empty or inverted range (addr >= end) triggers BUG_ON().
 */
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        pgd_t *pgd;
        unsigned long boundary;

        BUG_ON(addr >= end);

        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                boundary = pgd_addr_end(addr, end);
                /* Empty or bad entries are stepped over without descending. */
                if (!pgd_none_or_clear_bad(pgd))
                        boundary = zap_p4d_range(tlb, vma, pgd, addr,
                                                 boundary, details);
        } while (pgd++, addr = boundary, addr != end);
        tlb_end_vma(tlb, vma);
}


/*
 * Unmap the part of @vma that intersects [start_addr, end_addr).
 *
 * Does nothing when the two ranges do not overlap.  Otherwise uprobes are
 * notified for file-backed vmas, PFN tracking is dropped for VM_PFNMAP
 * vmas, and the clamped range is unmapped through the hugetlb or the
 * regular page-table path.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long zap_start = max(vma->vm_start, start_addr);
        unsigned long zap_end;

        if (zap_start >= vma->vm_end)
                return;

        zap_end = min(vma->vm_end, end_addr);
        if (zap_end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, zap_start, zap_end);

        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn(vma, 0, 0);

        if (zap_start == zap_end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, zap_start, zap_end, details);
                return;
        }

        /*
         * Testing vma->vm_file here is unfortunate, since a valid hugetlb
         * area should always have one.  It can nevertheless be NULL on
         * the error cleanup path of mmap_region(): when the hugetlbfs
         * ->mmap method fails, mmap_region() nullifies vma->vm_file
         * before calling this function to clean up.  No pte has been set
         * up at that point, so doing nothing is safe.
         */
        if (vma->vm_file) {
                i_mmap_lock_write(vma->vm_file->f_mapping);
                __unmap_hugepage_range_final(tlb, vma, zap_start, zap_end, NULL);
                i_mmap_unlock_write(vma->vm_file->f_mapping);
        }
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr)
{
        struct mmu_notifier_range range;

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
                unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
        mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Starting at @vma, every vma on the list that begins below @start + @size
 * is passed to unmap_single_vma().  The mmu_gather used for the operation
 * is set up and finished locally.
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
                unsigned long size)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                start, start + size);
        tlb_gather_mmu(&tlb, mm, start, range.end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        while (vma && vma->vm_start < range.end) {
                unmap_single_vma(&tlb, vma, start, range.end, NULL);
                vma = vma->vm_next;
        }
        mmu_notifier_invalidate_range_end(&range);

        tlb_finish_mmu(&tlb, start, range.end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Single-vma counterpart of zap_page_range(): only @vma itself is handed
 * to unmap_single_vma(), together with @details.
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                address, address + size);
        tlb_gather_mmu(&tlb, mm, address, range.end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        unmap_single_vma(&tlb, vma, address, range.end, details);
        mmu_notifier_invalidate_range_end(&range);

        tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.  If it
 * is not, or if the vma is not VM_PFNMAP, the call silently does nothing.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        if (!(vma->vm_flags & VM_PFNMAP))
                return;

        if (address < vma->vm_start || address > vma->vm_end)
                return;

        /*
         * Compare @size against the room left in the vma rather than
         * computing address + size, which could wrap around for a huge
         * @size and slip past the containment check.
         */
        if (size > vma->vm_end - address)
                return;

        zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Walk (allocating intermediate levels as needed) from the pgd of @mm down
 * to the pmd entry covering @addr.  Returns NULL if any level could not be
 * allocated.  The resulting pmd must not be a transparent huge pmd.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd_ent = pgd_offset(mm, addr);
        p4d_t *p4d_ent;
        pud_t *pud_ent;
        pmd_t *pmd_ent;

        /* Each level is only attempted if the one above it succeeded. */
        p4d_ent = p4d_alloc(mm, pgd_ent, addr);
        pud_ent = p4d_ent ? pud_alloc(mm, p4d_ent, addr) : NULL;
        pmd_ent = pud_ent ? pmd_alloc(mm, pud_ent, addr) : NULL;
        if (!pmd_ent)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd_ent));
        return pmd_ent;
}

/*
 * Return the pte for @addr in @mm, mapped and locked, with the lock stored
 * through @ptl.  Returns NULL if the pmd walk fails or if
 * pte_alloc_map_lock() returns NULL.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd_ent = walk_to_pmd(mm, addr);

        return pmd_ent ? pte_alloc_map_lock(mm, pmd_ent, addr, ptl) : NULL;
}

static int validate_page_before_insert(struct page *page)
{
        if (PageAnon(page) || PageSlab(page) || page_has_type(page))
                return -EINVAL;
        flush_dcache_page(page);
        return 0;
}

/*
 * Install @page at @pte (covering @addr) with protection @prot.  The name
 * indicates that the caller already holds the pte lock.  Returns -EBUSY if
 * the slot is already populated; otherwise a page reference is taken, the
 * file counter and rmap are updated, the pte is written and 0 is returned.
 */
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        if (pte_none(*pte)) {
                pte_t entry;

                /* Slot is free: account for the page, then publish it. */
                get_page(page);
                inc_mm_counter_fast(mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
                entry = mk_pte(page, prot);
                set_pte_at(mm, addr, pte, entry);
                return 0;
        }

        return -EBUSY;
}

/*
 * This is the old fallback for page remapping.
 *
 * The page is first checked by validate_page_before_insert(); then the pte
 * for @addr is looked up and locked, and the page is installed there with
 * protection @prot.
 *
 * Returns 0 on success, the validation error, -ENOMEM if no pte could be
 * obtained, or the error from insert_page_into_pte_locked().
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte;
        int retval;

        retval = validate_page_before_insert(page);
        if (retval)
                return retval;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
        pte_unmap_unlock(pte, ptl);

        return retval;
}

#ifdef pte_index
/*
 * Per-page step of insert_pages(): refuse pages with a zero refcount
 * (-EINVAL), validate the page, then install it at @pte.
 */
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err;

        err = page_count(page) ? validate_page_before_insert(page) : -EINVAL;
        if (!err)
                err = insert_page_into_pte_locked(mm, pte, addr, page, prot);

        return err;
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 *
 * On entry *@num is the number of entries in @pages to map starting at
 * @addr; on return it holds the number of pages that were not mapped.
 *
 * Returns 0 when everything was mapped, -EFAULT if the pmd walk fails,
 * -ENOMEM if the pte table cannot be allocated, or the error reported for
 * the first page that could not be inserted.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        /* One pass through here per pmd that the range touches. */
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Never run past the end of the pte table that covers addr. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                /* The pte lock is dropped and retaken every 8 pages. */
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(mm, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /* Pages already done in this batch count as mapped. */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report back how many pages were left unmapped. */
        *num = remaining_pages_total;
        return ret;
}
#endif  /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 *
 * When the architecture defines pte_index the pages are inserted in
 * batches via insert_pages(); otherwise this falls back to calling
 * vm_insert_page() once per page.
 *
 * Return: %0 on success, negative error code otherwise.  In the batched
 * variant -EFAULT is returned without touching @num if the requested range
 * does not lie inside @vma.
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
#ifdef pte_index
        /* Address of the last byte that would be mapped. */
        const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

        if (addr < vma->vm_start || end_addr >= vma->vm_end)
                return -EFAULT;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                /*
                 * vm_flags is about to be modified: insist that the read
                 * trylock fails (presumably because the caller holds
                 * mmap_lock for writing) and that the vma is not a raw
                 * PFN mapping.
                 */
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vma->vm_flags |= VM_MIXEDMAP;
        }
        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
        unsigned long idx = 0, pgcount = *num;
        /* Returned unchanged if *num is 0 and the loop never runs. */
        int err = -EINVAL;

        for (; idx < pgcount; ++idx) {
                err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
                if (err)
                        break;
        }
        /* idx pages were mapped; the rest were not. */
        *num = pgcount - idx;
        return err;
#endif  /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
        if (!page_count(page))
                return -EINVAL;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vma->vm_flags |= VM_MIXEDMAP;
        }
        return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * Pages @pages[@offset] onwards are inserted one per page of @vma,
 * starting at vma->vm_start.
 *
 * Return: 0 on success and error code otherwise (-ENXIO when @offset or
 * the vma size does not fit the @num pages provided, or the error from
 * vm_insert_page()).
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /* Same width as count, so the index cannot overflow or go negative. */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - map a range of kernel pages, starting at the vma's page offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        const unsigned long first_page = vma->vm_pgoff;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages, starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        const unsigned long first_page = 0;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * insert_pfn - install a special or devmap pte for @pfn at @addr
 *
 * Takes the pte lock for @addr via get_locked_pte() and returns
 * VM_FAULT_OOM if that fails.  If the slot is empty, a new entry is built
 * from @pfn and @prot (devmap when pfn_t_devmap() says so, special
 * otherwise) and installed.  If the slot is already populated nothing is
 * changed unless @mkwrite is set and the mapped PFN equals @pfn, in which
 * case the existing entry is marked young and dirty and passed through
 * maybe_mkwrite().
 *
 * Returns VM_FAULT_NOPAGE in every case where the pte lock was obtained.
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, entry;
        spinlock_t *ptl;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;
        /* Something is already mapped here: at most upgrade it in place. */
        if (!pte_none(*pte)) {
                if (mkwrite) {
                        /*
                         * For read faults on private mappings the PFN passed
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
                         * mapping and we expect the PFNs to match. If they
                         * don't match, we are likely racing with block
                         * allocation and mapping invalidation so just skip the
                         * update.
                         */
                        if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
                                goto out_unlock;
                        }
                        entry = pte_mkyoung(*pte);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1))
                                update_mmu_cache(vma, addr, pte);
                }
                goto out_unlock;
        }

        /* Ok, finally just insert the thing.. */
        if (pfn_t_devmap(pfn))
                entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));

        /* A mkwrite insertion starts out young and dirty. */
        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
        pte_unmap_unlock(pte, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value (%VM_FAULT_SIGBUS when @addr is outside @vma
 * or pfn_modify_allowed() rejects the mapping).
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        const unsigned long map_kinds = VM_PFNMAP | VM_MIXEDMAP;
        const pfn_t dev_pfn = __pfn_to_pfn_t(pfn, PFN_DEV);

        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
         * consistency in testing and feature parity among all, so we should
         * try to keep these invariants in place for everybody.
         *
         * The vma must be exactly one of PFNMAP or MIXEDMAP, a PFNMAP vma
         * must not be COW, and a MIXEDMAP vma must not get a valid pfn here.
         */
        BUG_ON(!(vma->vm_flags & map_kinds));
        BUG_ON((vma->vm_flags & map_kinds) == map_kinds);
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end ||
            !pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, dev_pfn);

        return insert_pfn(vma, addr, dev_pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * Decide whether @pfn may be inserted into @vma by the mixed-map helpers:
 * true for a VM_MIXEDMAP vma, or for a devmap, special or zero pfn.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
        /* these checks mirror the abort conditions in vm_normal_page */
        return (vma->vm_flags & VM_MIXEDMAP) ||
               pfn_t_devmap(pfn) ||
               pfn_t_special(pfn) ||
               is_zero_pfn(pfn_t_to_pfn(pfn));
}

/*
 * Common implementation of the vmf_insert_mixed*() family.
 *
 * After the sanity checks the pfn is inserted either as a raw pfn
 * (insert_pfn(), honouring @mkwrite) or, on configurations without pte
 * special where the pfn is valid and not devmap, as a refcounted page
 * (insert_page()).  Errors from insert_page() are translated into
 * vm_fault_t codes; -EBUSY is treated as success.
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn, pgprot_t pgprot,
                bool mkwrite)
{
        struct page *page;
        int err;

        BUG_ON(!vm_mixed_ok(vma, pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
                return VM_FAULT_SIGBUS;

        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
         * refcount the page if pfn_valid is true (hence insert_page rather
         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
         * without pte special, it would there be refcounted as a normal page.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) ||
            pfn_t_devmap(pfn) || !pfn_t_valid(pfn))
                return insert_pfn(vma, addr, pfn, pgprot, mkwrite);

        /*
         * At this point we are committed to insert_page()
         * regardless of whether the caller specified flags that
         * result in pfn_t_has_page() == false.
         */
        page = pfn_to_page(pfn_t_to_pfn(pfn));
        err = insert_page(vma, addr, page, pgprot);

        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err < 0 && err != -EBUSY)
                return VM_FAULT_SIGBUS;

        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * Typically this function should be used by drivers to set caching- and
 * encryption bits different than those of @vma->vm_page_prot, because
 * the caching- or encryption mode may not be known at mmap() time.
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
                                 pfn_t pfn, pgprot_t pgprot)
{
        const bool mkwrite = false;

        return __vm_insert_mixed(vma, addr, pfn, pgprot, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

/*
 * Insert @pfn at @addr using the vma's own vm_page_prot, without
 * requesting a writable entry.
 */
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
{
        const pgprot_t vma_prot = vma->vm_page_prot;

        return __vm_insert_mixed(vma, addr, pfn, vma_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  Like vmf_insert_mixed(), but asks for the entry to be made writable.
 *
 *  If the insertion of the PTE failed because someone else already added a
 *  different entry in the meantime, we treat that as success, as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
{
        const pgprot_t vma_prot = vma->vm_page_prot;

        return __vm_insert_mixed(vma, addr, pfn, vma_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * Map a run of consecutive physical page frames, starting at @pfn, into
 * the ptes covering [@addr, @end) below @pmd.  Each pte is written as a
 * special pte with protection @prot.
 *
 * Every pte in the range must be empty on entry (enforced with BUG_ON).
 *
 * Returns 0 on success, -ENOMEM if the pte table cannot be allocated and
 * mapped, or -EACCES as soon as pfn_modify_allowed() rejects a pfn; ptes
 * written before that point are left in place.
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;
        int err = 0;

        /* Remember the first pte so the unlock below does not depend on pte. */
        mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(*pte));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Remap [@addr, @end) at the pmd level: allocate the pmd under @pud and
 * hand each pmd-sized piece of the range to remap_pte_range().  @pfn is
 * the frame that corresponds to @addr.  Returns 0, -ENOMEM if the pmd
 * cannot be allocated, or the first error from the level below.
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Adding (address >> PAGE_SHIFT) to this yields that address's pfn. */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pmd_t *pmd;
        int err;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        for (;;) {
                next = pmd_addr_end(addr, end);
                err = remap_pte_range(mm, pmd, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
                pmd++;
                addr = next;
        }

        return 0;
}

/*
 * Remap [@addr, @end) at the pud level: allocate the pud under @p4d and
 * hand each pud-sized piece of the range to remap_pmd_range().  @pfn is
 * the frame that corresponds to @addr.  Returns 0, -ENOMEM if the pud
 * cannot be allocated, or the first error from the level below.
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Adding (address >> PAGE_SHIFT) to this yields that address's pfn. */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pud_t *pud;
        int err;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return -ENOMEM;

        for (;;) {
                next = pud_addr_end(addr, end);
                err = remap_pmd_range(mm, pud, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
                pud++;
                addr = next;
        }

        return 0;
}

/*
 * Remap [@addr, @end) at the p4d level: allocate the p4d under @pgd and
 * hand each p4d-sized piece of the range to remap_pud_range().  @pfn is
 * the frame that corresponds to @addr.  Returns 0, -ENOMEM if the p4d
 * cannot be allocated, or the first error from the level below.
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Adding (address >> PAGE_SHIFT) to this yields that address's pfn. */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        p4d_t *p4d;
        int err;

        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return -ENOMEM;

        for (;;) {
                next = p4d_addr_end(addr, end);
                err = remap_pud_range(mm, p4d, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
                p4d++;
                addr = next;
        }

        return 0;
}

/*
 * Core of remap_pfn_range(): map @size bytes (rounded up to whole pages)
 * of physical memory starting at frame @pfn to user address @addr in @vma,
 * using protection @prot.
 *
 * Marks the vma VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP and, for a
 * COW mapping, records @pfn in vma->vm_pgoff.
 *
 * Returns 0 on success, -EINVAL if @addr is not page aligned or a COW
 * mapping is not remapped in full, or the error from the page-table walk.
 * On a walk error the ptes written so far are NOT removed here, and the
 * vma flags have already been updated.
 */
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
        int err;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *        (accesses can have side effects).
         *   VM_PFNMAP tells the core MM that the base pages are just
         *        raw PFN mappings, and do not have a "struct page" associated
         *        with them.
         *   VM_DONTEXPAND
         *      Disable vma merging and expanding with mremap().
         *   VM_DONTDUMP
         *      Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         * See vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                /* A COW mapping may only be remapped as a whole. */
                if (addr != vma->vm_start || end != vma->vm_end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

        /* An empty or wrapped range is a caller bug. */
        BUG_ON(addr >= end);
        /*
         * Bias pfn so that pfn + (address >> PAGE_SHIFT) is the frame for
         * any address in the range.
         */
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
        } while (pgd++, addr = next, addr != end);

        return 0;
}

/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
 *
 * Returns 0 on success.  On failure the range is zapped again before the
 * error is returned.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        int error;

        error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
        if (error) {
                /*
                 * A partial pfn range mapping is dangerous: it does not
                 * maintain page reference counts, and callers may free
                 * pages due to the error. So zap it early.
                 */
                zap_page_range_single(vma, addr, size, NULL);
        }

        return error;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.  Any failure of
 * track_pfn_remap() is reported as -EINVAL.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const unsigned long aligned_size = PAGE_ALIGN(size);
        int err;

        if (track_pfn_remap(vma, &prot, pfn, addr, aligned_size))
                return -EINVAL;

        err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
        if (err) {
                /* Undo the tracking set up above. */
                untrack_pfn(vma, pfn, aligned_size);
        }

        return err;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise (-EINVAL when the
 * physical range wraps, or when the vma's offset or size does not fit
 * inside it).
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        const unsigned long vma_bytes = vma->vm_end - vma->vm_start;
        unsigned long first_pfn, nr_pages;

        /* Check that the physical memory area passed in looks valid */
        if (start + len < start)
                return -EINVAL;

        /*
         * You *really* shouldn't map things that aren't page-aligned,
         * but we've historically allowed it because IO memory might
         * just have smaller alignment.
         */
        len += start & ~PAGE_MASK;
        first_pfn = start >> PAGE_SHIFT;
        nr_pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        if (first_pfn + nr_pages < first_pfn)
                return -EINVAL;

        /* We start the mapping 'vm_pgoff' pages into the area */
        if (vma->vm_pgoff > nr_pages)
                return -EINVAL;
        first_pfn += vma->vm_pgoff;
        nr_pages -= vma->vm_pgoff;

        /* Can we fit all of the mapping? */
        if (vma_bytes >> PAGE_SHIFT > nr_pages)
                return -EINVAL;

        /* Ok, let it rip */
        return io_remap_pfn_range(vma, vma->vm_start, first_pfn, vma_bytes,
                                  vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * Call @fn on each PTE slot covering [@addr, @end) below @pmd.
 *
 * When @create is true the PTE table is allocated if needed and @fn is
 * called for every slot; otherwise @fn is only called for slots that are
 * not pte_none().  For init_mm the kernel PTE accessors are used and no
 * lock is taken; for any other mm the table is mapped and locked for the
 * duration of the walk.
 *
 * Returns 0, -ENOMEM if the table could not be allocated, or the first
 * non-zero value returned by @fn (which stops the walk).
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *pte, *mapped_pte;
        int err = 0;
        spinlock_t *ptl;

        if (create) {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;
        } else {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
        }

        BUG_ON(pmd_huge(*pmd));

        arch_enter_lazy_mmu_mode();

        if (fn) {
                do {
                        if (create || !pte_none(*pte)) {
                                err = fn(pte, addr, data);
                                if (err)
                                        break;
                        }
                } while (pte++, addr += PAGE_SIZE, addr != end);
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /*
         * Unmap through the pointer that was originally mapped.  'pte' is
         * not advanced when fn is NULL or when fn fails on the first entry,
         * so 'pte - 1' could point in front of the mapped table.
         */
        if (mm != &init_mm)
                pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Walk the PMD entries covering [@addr, @end) below @pud and descend into
 * apply_to_pte_range() for each one.  With @create the PMD table is
 * allocated if needed and every entry is visited; without it, entries for
 * which pmd_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error reported by
 * the lower level.
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pmd_t *pmd;
        int err = 0;

        BUG_ON(pud_huge(*pud));

        if (!create) {
                pmd = pmd_offset(pud, addr);
        } else {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        }

        for (;; pmd++, addr = stop) {
                stop = pmd_addr_end(addr, end);
                if (create || !pmd_none_or_clear_bad(pmd)) {
                        err = apply_to_pte_range(mm, pmd, addr, stop, fn,
                                                 data, create, mask);
                        if (err)
                                break;
                }
                /* This entry reached the end of the requested range */
                if (stop == end)
                        break;
        }
        return err;
}

/*
 * Walk the PUD entries covering [@addr, @end) below @p4d and descend into
 * apply_to_pmd_range() for each one.  With @create the PUD table is
 * allocated if needed and every entry is visited; without it, entries for
 * which pud_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error reported by
 * the lower level.
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pud_t *pud;
        int err = 0;

        if (!create) {
                pud = pud_offset(p4d, addr);
        } else {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        }

        for (;; pud++, addr = stop) {
                stop = pud_addr_end(addr, end);
                if (create || !pud_none_or_clear_bad(pud)) {
                        err = apply_to_pmd_range(mm, pud, addr, stop, fn,
                                                 data, create, mask);
                        if (err)
                                break;
                }
                /* This entry reached the end of the requested range */
                if (stop == end)
                        break;
        }
        return err;
}

/*
 * Walk the P4D entries covering [@addr, @end) below @pgd and descend into
 * apply_to_pud_range() for each one.  With @create the P4D table is
 * allocated if needed and every entry is visited; without it, entries for
 * which p4d_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error reported by
 * the lower level.
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long stop;
        p4d_t *p4d;
        int err = 0;

        if (!create) {
                p4d = p4d_offset(pgd, addr);
        } else {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        }

        for (;; p4d++, addr = stop) {
                stop = p4d_addr_end(addr, end);
                if (create || !p4d_none_or_clear_bad(p4d)) {
                        err = apply_to_pud_range(mm, p4d, addr, stop, fn,
                                                 data, create, mask);
                        if (err)
                                break;
                }
                /* This entry reached the end of the requested range */
                if (stop == end)
                        break;
        }
        return err;
}

/*
 * Common implementation of apply_to_page_range() and
 * apply_to_existing_page_range(): walk the page tables of @mm over
 * [@addr, @addr + @size) from the PGD level down.  @create selects whether
 * missing levels are allocated or skipped.
 *
 * Returns -EINVAL (with a warning) if the range is empty or wraps,
 * otherwise 0 or the first error reported by a lower level.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        const unsigned long start = addr;
        const unsigned long end = addr + size;
        unsigned long stop;
        pgtbl_mod_mask mask = 0;
        pgd_t *pgd;
        int err = 0;

        if (WARN_ON(addr >= end))
                return -EINVAL;

        for (pgd = pgd_offset(mm, addr);; pgd++, addr = stop) {
                stop = pgd_addr_end(addr, end);
                if (create || !pgd_none_or_clear_bad(pgd)) {
                        err = apply_to_p4d_range(mm, pgd, addr, stop, fn,
                                                 data, create, &mask);
                        if (err)
                                break;
                }
                /* This entry reached the end of the requested range */
                if (stop == end)
                        break;
        }

        /* Sync even after an error: the levels above may have been modified */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * Walk the range [addr, addr + size) of @mm, allocating any page table
 * level that is missing, and call @fn on every PTE slot in the range.
 * Returns the result of __apply_to_page_range().
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = true;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Walk the range [addr, addr + size) of @mm and call @fn on the PTEs that
 * are already populated.
 *
 * In contrast to apply_to_page_range(), page tables that are absent are
 * skipped rather than allocated.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = false;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

/*
 * handle_pte_fault picks the fault handler based on a pte value that was
 * read non-atomically.  On architectures or configurations where the pte is
 * wider than a long (e.g. i386 with PAE) that read may have combined
 * unmatched halves, so do_swap_page must compare again under the lock
 * before it unmaps the pte and commits to anything.  (do_wp_page is only
 * reached after such a check, and do_anonymous_page can check later.)
 *
 * Always unmaps @page_table.  Returns non-zero if the pte still equals
 * @orig_pte, or if no re-check was necessary.
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                pte_t *page_table, pte_t orig_pte)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spinlock_t *lock = pte_lockptr(mm, pmd);

                spin_lock(lock);
                unchanged = pte_same(*page_table, orig_pte);
                spin_unlock(lock);
        }
#endif
        pte_unmap(page_table);

        return unchanged;
}

/*
 * Fill @dst with the data to be copied for the COW fault described by @vmf.
 *
 * When @src is available its contents are copied with copy_user_highpage().
 * Otherwise the data is read through the faulting user address, with the
 * PTE re-validated against vmf->orig_pte under the PTE lock where needed.
 *
 * Returns true when @dst has been filled (with zeroes if the user copy
 * could not be performed), false when the PTE no longer matches
 * vmf->orig_pte and the caller should give up on this fault.
 */
static inline bool cow_user_page(struct page *dst, struct page *src,
                                 struct vm_fault *vmf)
{
        bool ret;
        void *kaddr;
        void __user *uaddr;
        bool locked = false;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        if (likely(src)) {
                copy_user_highpage(dst, src, addr, vma);
                return true;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_atomic(dst);
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                locked = true;
                if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
                         */
                        update_mmu_tlb(vma, addr, vmf->pte);
                        ret = false;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache(vma, addr, vmf->pte);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /*
                 * The PTL is already held and the PTE was validated above,
                 * so there is nothing left to retry: warn and zero-fill.
                 */
                if (locked)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                locked = true;
                if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        update_mmu_tlb(vma, addr, vmf->pte);
                        ret = false;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Give a warn in case there can be some obscure
                         * use-case
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = true;

        /*
         * Common exit: drop the PTL if it was taken, then undo the atomic
         * mapping of @dst that was set up before the lock was acquired.
         */
pte_unlock:
        if (locked)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        kunmap_atomic(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask to use while handling a fault in @vma: the mapping's
 * own mask with __GFP_FS and __GFP_IO added for file-backed vmas, plain
 * GFP_KERNEL otherwise.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        struct file *backing = vma->vm_file;

        /*
         * Special mappings (e.g. VDSO) have no file behind them, so
         * pretend they asked for the default GFP_KERNEL.
         */
        if (!backing)
                return GFP_KERNEL;

        return mapping_gfp_mask(backing->f_mapping) | __GFP_FS | __GFP_IO;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * vmf->flags is replaced by FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE only for the
 * duration of the ->page_mkwrite() call and is left untouched on every
 * return path.
 *
 * Returns VM_FAULT_SIGBUS for a vma backed by a swapfile, the error or
 * VM_FAULT_NOPAGE code from ->page_mkwrite(), 0 if the page lost its mapping
 * and the fault should be retried, or a value with VM_FAULT_LOCKED set, in
 * which case the page is locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
        vm_fault_t ret;
        struct page *page = vmf->page;
        unsigned int old_flags = vmf->flags;

        /*
         * Refuse swapfile-backed vmas before vmf->flags is modified, so
         * that this early return does not hand clobbered flags back to
         * the caller.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler left the page unlocked: lock it ourselves */
                lock_page(page);
                if (!page->mapping) {
                        unlock_page(page);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
                VM_BUG_ON_PAGE(!PageLocked(page), page);
        return ret;
}

/*
 * Dirty vmf->page after a write fault on a shared file mapping.
 *
 * The page must be locked on entry; it is unlocked here.  Returns
 * VM_FAULT_RETRY if the mmap_lock was dropped while throttling, 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = vmf->page;
        bool has_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
        struct address_space *mapping;
        struct file *fpin;
        bool newly_dirty;

        newly_dirty = set_page_dirty(page);
        VM_BUG_ON_PAGE(PageAnon(page), page);
        /*
         * Grab the address_space before unlocking: truncate may zero
         * page.mapping once the page lock is gone.  The address_space
         * itself stays pinned through vma->vm_file's reference, and the
         * release semantics of unlock_page() keep the compiler from
         * moving this load past the unlock.
         */
        mapping = page_rmapping(page);
        unlock_page(page);

        if (!has_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Nothing to throttle if the page was already dirty and there is
         * no ->page_mkwrite, or if there is no mapping at all (some device
         * drivers dirty their pages without setting page.mapping).
         */
        if (!mapping || !(newly_dirty || has_mkwrite))
                return 0;

        /*
         * Slow page dirtying down to writeback speed, dropping the
         * mmap_lock before waiting on IO if that is possible.  The file
         * keeps the mapping pinned, as explained above.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
        balance_dirty_pages_ratelimited(mapping);
        if (!fpin)
                return 0;

        fput(fpin);
        return VM_FAULT_RETRY;
}

/*
 * Resolve a write fault on a page that can simply be reused in this vma.
 *
 * That is the case either for VM_SHARED mappings or when we hold the last
 * reference to the page.  All that is required is to make the pte young,
 * dirty and (if the vma allows it) writable, and to do the associated
 * book-keeping.  The pte is unmapped and its lock released on return.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = vmf->page;
        unsigned long address = vmf->address;
        pte_t entry;

        /*
         * Reset the page's cpupid information: what is recorded there may
         * belong to a process that is by now completely unrelated.
         */
        if (page)
                page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

        flush_cache_page(vma, address, pte_pfn(vmf->orig_pte));

        entry = maybe_mkwrite(pte_mkdirty(pte_mkyoung(vmf->orig_pte)), vma);
        if (ptep_set_access_flags(vma, address, vmf->pte, entry, 1))
                update_mmu_cache(vma, address, vmf->pte);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct page *old_page = vmf->page;
        struct page *new_page = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;

        if (unlikely(anon_vma_prepare(vma)))
                goto oom;

        /* A zero-page source needs no copy: a freshly zeroed page will do */
        if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
                new_page = alloc_zeroed_user_highpage_movable(vma,
                                                              vmf->address);
                if (!new_page)
                        goto oom;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                vmf->address);
                if (!new_page)
                        goto oom;

                if (!cow_user_page(new_page, old_page, vmf)) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         */
                        put_page(new_page);
                        if (old_page)
                                put_page(old_page);
                        return 0;
                }
        }

        if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
        cgroup_throttle_swaprate(new_page, GFP_KERNEL);

        __SetPageUptodate(new_page);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                /*
                 * Account the new anonymous page; if it replaces a
                 * non-anonymous page, drop that page's file counter too.
                 */
                if (old_page) {
                        if (!PageAnon(old_page)) {
                                dec_mm_counter_fast(mm,
                                                mm_counter_file(old_page));
                                inc_mm_counter_fast(mm, MM_ANONPAGES);
                        }
                } else {
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
                page_add_new_anon_rmap(new_page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(new_page, vma);
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
                 * new page to be mapped directly into the secondary page table.
                 */
                set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache(vma, vmf->address, vmf->pte);
                if (old_page) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * page_remove_rmap with the ptep_clear_flush above.
                         * Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in page_remove_rmap.
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        page_remove_rmap(old_page, false);
                }

                /* Free the old page.. */
                new_page = old_page;
                page_copied = 1;
        } else {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
        }

        /*
         * After a successful copy new_page aliases old_page (possibly NULL),
         * so this drops a reference on the old page; otherwise it releases
         * the copy that ended up unused.
         */
        if (new_page)
                put_page(new_page);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above ptep_clear_flush_notify() did already call it.
         */
        mmu_notifier_invalidate_range_only_end(&range);
        if (old_page) {
                /*
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);        /* LRU manipulation */
                        if (PageMlocked(old_page))
                                munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                put_page(old_page);
        }
        return page_copied ? VM_FAULT_WRITE : 0;
        /* Error exits: release the new page (if allocated) and the old one */
oom_free_new:
        put_page(new_page);
oom:
        if (old_page)
                put_page(old_page);
        return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(*vmf->pte, vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf);
        return 0;
}

/*
 * Write fault on a VM_SHARED mapping of a pfn that has no struct page
 * (VM_MIXEDMAP or VM_PFNMAP).  Without a ->pfn_mkwrite handler the pte is
 * made writable right away; otherwise the handler is called with the pte
 * lock dropped and the fault is finished by finish_mkwrite_fault().
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf);
                return VM_FAULT_WRITE;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        vmf->flags |= FAULT_FLAG_MKWRITE;
        ret = vma->vm_ops->pfn_mkwrite(vmf);
        if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return ret;

        return finish_mkwrite_fault(vmf);
}

/*
 * Write fault on a page of a shared writable mapping.  A page reference is
 * held across the operation.  If the vma has a ->page_mkwrite handler it is
 * called (via do_page_mkwrite()) with the pte lock dropped before the pte is
 * made writable; otherwise the pte is reused directly.  In both cases the
 * page is then dirtied by fault_dirty_shared_page().
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        get_page(vmf->page);

        if (!vma->vm_ops || !vma->vm_ops->page_mkwrite) {
                wp_page_reuse(vmf);
                /* fault_dirty_shared_page() expects a locked page */
                lock_page(vmf->page);
        } else {
                pte_unmap_unlock(vmf->pte, vmf->ptl);

                ret = do_page_mkwrite(vmf);
                if (unlikely(!ret ||
                             (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        put_page(vmf->page);
                        return ret;
                }

                ret = finish_mkwrite_fault(vmf);
                if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                        unlock_page(vmf->page);
                        put_page(vmf->page);
                        return ret;
                }
        }

        ret = VM_FAULT_WRITE | fault_dirty_shared_page(vmf);
        put_page(vmf->page);

        return ret;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;

        if (userfaultfd_pte_wp(vma, *vmf->pte)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return handle_userfault(vmf, VM_UFFD_WP);
        }

        /*
         * Userfaultfd write-protect can defer flushes. Ensure the TLB
         * is flushed in this case before copying.
         */
        if (unlikely(userfaultfd_wp(vmf->vma) &&
                     mm_tlb_flush_pending(vmf->vma->vm_mm)))
                flush_tlb_page(vmf->vma, vmf->address);

        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                     (VM_WRITE|VM_SHARED))
                        return wp_pfn_shared(vmf);

                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return wp_page_copy(vmf);
        }

        /*
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
        if (PageAnon(vmf->page)) {
                struct page *page = vmf->page;

                /* PageKsm() doesn't necessarily raise the page refcount */
                if (PageKsm(page) || page_count(page) != 1)
                        goto copy;
                if (!trylock_page(page))
                        goto copy;
                if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
                        unlock_page(page);
                        goto copy;
                }
                /*
                 * Ok, we've got the only map reference, and the only
                 * page count reference, and the page is locked,
                 * it's dark out, and we're wearing sunglasses. Hit it.
                 */
                unlock_page(page);
                wp_page_reuse(vmf);
                return VM_FAULT_WRITE;
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
                return wp_page_shared(vmf);
        }
copy:
        /*
         * Ok, we need to copy. Oh, well..
         */
        get_page(vmf->page);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return wp_page_copy(vmf);
}

/*
 * Zap the user address range [start_addr, end_addr) of @vma, passing
 * @details through to zap_page_range_single().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long nbytes = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, nbytes, details);
}

/*
 * For every vma in the interval tree @root that overlaps the page index
 * range [details->first_index, details->last_index], clamp that range to
 * the part the vma actually maps, convert it to user addresses and unmap it.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root,
                        details->first_index, details->last_index) {
                /* First and last file page index mapped by this vma */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Requested range, to be clamped to the vma below */
                pgoff_t zap_first = details->first_index;
                pgoff_t zap_last = details->last_index;
                unsigned long from, to;

                if (zap_first < vma_first)
                        zap_first = vma_first;
                if (zap_last > vma_last)
                        zap_last = vma_last;

                from = ((zap_first - vma_first) << PAGE_SHIFT) + vma->vm_start;
                to = ((zap_last - vma_first + 1) << PAGE_SHIFT) + vma->vm_start;

                unmap_mapping_range_vma(vma, from, to, details);
        }
}

/**
 * unmap_mapping_page() - Unmap single page from processes.
 * @page: The locked page to be unmapped.
 *
 * Remove @page from every userspace process that still has it mmaped.
 * For efficiency the surrounding range has normally been unmapped already
 * by unmap_mapping_pages() or unmap_mapping_range().  However, once
 * truncation or invalidation holds the page lock it may discover that the
 * page has been mapped again in the meantime, and it then calls
 * unmap_mapping_page() to get rid of that mapping for good.
 */
void unmap_mapping_page(struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct zap_details details = { };

        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageTail(page));

        /* Restrict the zap to this one (possibly huge) page of @mapping */
        details.single_page = page;
        details.check_mapping = mapping;
        details.first_index = page->index;
        details.last_index = page->index + thp_nr_pages(page) - 1;

        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details zap = { };
        pgoff_t last = start + nr - 1;

        /*
         * nr == 0 (or a start + nr that overflows) makes the computed end
         * wrap below start; in that case unmap up to the largest index.
         */
        if (last < start)
                last = ULONG_MAX;

        zap.first_index = start;
        zap.last_index = last;
        /* check_mapping stays NULL when private COWed pages go as well. */
        if (!even_cows)
                zap.check_mapping = mapping;

        i_mmap_lock_write(mapping);
        /* Nothing to walk unless at least one vma maps this file. */
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, &zap);
        i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /*
         * Convert the byte range to page units.  The shift is done on an
         * unsigned 64-bit value and only the result is narrowed to pgoff_t:
         *  - unsigned, so a holebegin with the top bit set is not
         *    sign-extended by the right shift;
         *  - 64-bit, so that on configurations where pgoff_t is narrower
         *    than loff_t the upper bits of the byte offset still contribute
         *    to the page index instead of being discarded before the shift.
         * Where pgoff_t is itself 64 bits wide this is the same arithmetic
         * as casting to pgoff_t first.
         */
        pgoff_t hba = (pgoff_t)((unsigned long long)holebegin >> PAGE_SHIFT);
        pgoff_t hlen = (pgoff_t)(((unsigned long long)holelen +
                                  PAGE_SIZE - 1) >> PAGE_SHIFT);

        /*
         * Check for overflow: when loff_t is wider than pgoff_t the end of
         * the hole, in pages, may not fit in a pgoff_t.  In that case clamp
         * the length so the range runs from hba up to index ULONG_MAX.
         */
        if (sizeof(holelen) > sizeof(hlen)) {
                long long holeend =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (holeend & ~(long long)ULONG_MAX)
                        hlen = ULONG_MAX - hba + 1;
        }

        /* hlen == 0 is passed through and means "to the end of the file". */
        unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        /*
         * page is the page that ends up being mapped.  swapcache remembers
         * the page obtained through the swap cache (lookup or readahead); it
         * stays NULL on the "skip swapcache" path, which several checks
         * below rely on.
         */
        struct page *page = NULL, *swapcache;
        swp_entry_t entry;
        pte_t pte;
        int locked;
        /* rmap flags handed to do_page_add_anon_rmap(); see reuse below. */
        int exclusive = 0;
        vm_fault_t ret = 0;
        void *shadow = NULL;

        /*
         * Nothing to do (ret stays 0) when pte_unmap_same() reports a
         * mismatch -- presumably the pte changed since orig_pte was read.
         */
        if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /*
         * Entries that are not genuine swap entries are dispatched by kind
         * and the function returns without going near the swap cache.
         */
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_private_entry(entry)) {
                        vmf->page = device_private_entry_to_page(entry);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }


        /*
         * The DELAYACCT_PF_SWAPIN flag is set for the duration of the
         * lookup/read/lock sequence; every exit from that sequence below
         * clears it again.
         */
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;

        if (!page) {
                struct swap_info_struct *si = swp_swap_info(entry);

                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                                        vmf->address);
                        if (page) {
                                int err;

                                __SetPageLocked(page);
                                __SetPageSwapBacked(page);
                                set_page_private(page, entry.val);

                                /* Tell memcg to use swap ownership records */
                                SetPageSwapCache(page);
                                err = mem_cgroup_charge(page, vma->vm_mm,
                                                        GFP_KERNEL);
                                ClearPageSwapCache(page);
                                if (err) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }

                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(page, shadow);

                                lru_cache_add(page);
                                swap_readpage(page, true);
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        swapcache = page;
                }

                if (!page) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
        } else if (PageHWPoison(page)) {
                /*
                 * hwpoisoned dirty swapcache pages are kept for killing
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                goto out_release;
        }

        /*
         * A zero return means the page lock was not obtained and the fault
         * must be retried; only the page reference is dropped in that case.
         */
        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
        }

        /*
         * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
         * release the swapcache from under us.  The page pin, and pte_same
         * test below, are not enough to exclude that.  Even if it is still
         * swapcache, we need to check that the page's swap has not changed.
         */
        if (unlikely((!PageSwapCache(page) ||
                        page_private(page) != entry.val)) && swapcache)
                goto out_page;

        /*
         * ksm_might_need_to_copy() may hand back a different page than the
         * swapcache one; on failure restore page so the common cleanup at
         * out_page operates on the swapcache page.
         */
        page = ksm_might_need_to_copy(page, vma, vmf->address);
        if (unlikely(!page)) {
                ret = VM_FAULT_OOM;
                page = swapcache;
                goto out_page;
        }

        cgroup_throttle_swaprate(page, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!PageUptodate(page))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /*
         * The page isn't present yet, go ahead with the fault.
         *
         * Be careful about the sequence of operations here.
         * To get its accounting right, reuse_swap_page() must be called
         * while the page is counted on swap but not yet in mapcount i.e.
         * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
         * must be called after the swap_free(), or it will never succeed.
         */

        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
        /*
         * Write fault on a page that reuse_swap_page() lets us reuse: map it
         * dirty (and writable if the vma allows) right away and clear
         * FAULT_FLAG_WRITE so the do_wp_page() call further down is skipped.
         */
        if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                vmf->flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
                exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        /* Carry the soft-dirty and uffd-wp markers over from the swap pte. */
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte)) {
                pte = pte_mkuffd_wp(pte);
                pte = pte_wrprotect(pte);
        }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
        vmf->orig_pte = pte;

        /* ksm created a completely new copy */
        if (unlikely(page != swapcache && swapcache)) {
                page_add_new_anon_rmap(page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
                do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
        }

        swap_free(entry);
        if (mem_cgroup_swap_full(page) ||
            (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
        if (page != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                unlock_page(swapcache);
                put_page(swapcache);
        }

        /*
         * Still a write fault (the page could not be reused above): let
         * do_wp_page() handle it.  If it reports an error, only the error
         * bits are returned.  Note that this path leaves via "out", i.e.
         * without the pte_unmap_unlock() at "unlock".
         */
        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        return ret;
        /*
         * Error unwinding, from most to least state held:
         *   out_nomap   - pte mapped and locked, page locked, page referenced
         *   out_page    - page locked, page referenced
         *   out_release - page referenced only
         * A swapcache page distinct from page is unlocked and released too.
         */
out_nomap:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        unlock_page(page);
out_release:
        put_page(page);
        if (page != swapcache && swapcache) {
                unlock_page(swapcache);
                put_page(swapcache);
        }
        return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        vm_fault_t ret = 0;
        pte_t entry;

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map().  We can't run
         * pte_offset_map() on pmds where a huge pmd might be created
         * from a different thread.
         *
         * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
         * parallel threads are excluded by other means.
         *
         * Here we only have mmap_read_lock(mm).
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* See the comment in pte_alloc_one_map() */
        if (unlikely(pmd_trans_unstable(vmf->pmd)))
                return 0;

        /*
         * Use the zero-page for reads: no page is allocated, the pte is a
         * special mapping of the zero pfn, and no rmap/counter accounting is
         * done (the code jumps straight to setpte).
         */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                /* Somebody else populated the pte first: nothing to do. */
                if (!pte_none(*vmf->pte)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                goto setpte;
        }

        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
        page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        if (!page)
                goto oom;

        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
        cgroup_throttle_swaprate(page, GFP_KERNEL);

        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __SetPageUptodate(page);

        entry = mk_pte(page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (!pte_none(*vmf->pte)) {
                /*
                 * Lost the race: another thread already installed a pte
                 * here.  Nothing was mapped by us, so use update_mmu_tlb()
                 * exactly like the zero-page path above and like
                 * alloc_set_pte() do in the same situation, rather than
                 * update_mmu_cache(), which is what gets called after
                 * actually writing a pte.
                 */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                put_page(page);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address, false);
        lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
        /* release: drop the freshly allocated page, then unlock the pte. */
release:
        put_page(page);
        goto unlock;
        /* oom_free_page: the page exists but could not be charged. */
oom_free_page:
        put_page(page);
oom:
        return VM_FAULT_OOM;
}

/*
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
/*
 * Call the vma's ->fault() handler for @vmf.  When the handler result is
 * none of ERROR/NOPAGE/RETRY/DONE_COW and the page is not hwpoisoned, the
 * function returns with vmf->page locked (locking it here if the handler
 * did not report VM_FAULT_LOCKED).
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_page_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        ret = vma->vm_ops->fault(vmf);
        /* In all of these cases the result is passed straight to the caller. */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;

        /*
         * The handler produced a hwpoisoned page: never map it.  If we hold
         * its lock, unmap it where it is mapped and try to invalidate it;
         * in every case drop our reference and clear vmf->page.
         */
        if (unlikely(PageHWPoison(vmf->page))) {
                struct page *page = vmf->page;
                vm_fault_t poisonret = VM_FAULT_HWPOISON;
                if (ret & VM_FAULT_LOCKED) {
                        if (page_mapped(page))
                                unmap_mapping_pages(page_mapping(page),
                                                    page->index, 1, false);
                        /* Retry if a clean page was removed from the cache. */
                        if (invalidate_inode_page(page))
                                poisonret = VM_FAULT_NOPAGE;
                        unlock_page(page);
                }
                put_page(page);
                vmf->page = NULL;
                return poisonret;
        }

        /* Callers expect a locked page; take the lock if ->fault() did not. */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                lock_page(vmf->page);
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

        return ret;
}

/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
 * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
 */
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
        /* Test devmap first so pmd_trans_unstable() never sees such a pmd. */
        if (pmd_devmap(*pmd))
                return 1;

        return !!pmd_trans_unstable(pmd);
}

/*
 * Make sure vmf->pmd refers to a page table (installing vmf->prealloc_pte or
 * allocating one if the pmd is empty), then map and lock the pte for
 * vmf->address.
 *
 * Returns 0 with vmf->pte mapped and vmf->ptl held, VM_FAULT_OOM when a page
 * table could not be allocated, or VM_FAULT_NOPAGE when the pmd turned out
 * to be devmap or unstable (no pte is mapped in that case).
 */
static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (!pmd_none(*vmf->pmd))
                goto map_pte;
        if (vmf->prealloc_pte) {
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                /* Re-check under the pmd lock: someone may have populated it. */
                if (unlikely(!pmd_none(*vmf->pmd))) {
                        spin_unlock(vmf->ptl);
                        goto map_pte;
                }

                mm_inc_nr_ptes(vma->vm_mm);
                pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                spin_unlock(vmf->ptl);
                /* The preallocated table now belongs to the page tables. */
                vmf->prealloc_pte = NULL;
        } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
                return VM_FAULT_OOM;
        }
map_pte:
        /*
         * If a huge pmd materialized under us just retry later.  Use
         * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
         * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
         * under us and then back to pmd_none, as a result of MADV_DONTNEED
         * running immediately after a huge pmd fault in a different thread of
         * this mm, in turn leading to a misleading pmd_trans_huge() retval.
         * All we have to ensure is that it is a regular pmd that we can walk
         * with pte_offset_map() and we can do that through an atomic read in
         * C, which is what pmd_trans_unstable() provides.
         */
        if (pmd_devmap_trans_unstable(vmf->pmd))
                return VM_FAULT_NOPAGE;

        /*
         * At this point we know that our vmf->pmd points to a page of ptes
         * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
         * for the duration of the fault.  If a racing MADV_DONTNEED runs and
         * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
         * be valid and we will re-check to make sure the vmf->pte isn't
         * pte_none() under vmf->ptl protection when we return to
         * alloc_set_pte().
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        /* Hand the preallocated page table over to the huge-pmd deposit. */
        pgtable_trans_huge_deposit(mm, vmf->pmd, vmf->prealloc_pte);

        /* The table is consumed from here on, so count it in nr_ptes ... */
        mm_inc_nr_ptes(mm);
        /* ... and forget our pointer to it. */
        vmf->prealloc_pte = NULL;
}

/*
 * Try to map the compound page containing @page with a single huge pmd
 * entry at vmf->pmd.
 *
 * Returns 0 when the pmd was installed, VM_FAULT_OOM when a needed page
 * table could not be allocated, and VM_FAULT_FALLBACK when a huge mapping is
 * not possible here (unsuitable vma/address, compound page not of
 * HPAGE_PMD_ORDER, or the pmd was found populated under the pmd lock).
 */
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        /* Fault address rounded down to the huge page boundary. */
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        int i;
        vm_fault_t ret = VM_FAULT_FALLBACK;

        if (!transhuge_vma_suitable(vma, haddr))
                return ret;

        page = compound_head(page);
        if (compound_order(page) != HPAGE_PMD_ORDER)
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* Somebody populated the pmd meanwhile: report FALLBACK. */
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;

        for (i = 0; i < HPAGE_PMD_NR; i++)
                flush_icache_page(vma, page + i);

        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

        add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
        page_add_file_rmap(page, true);
        /*
         * deposit and withdraw with pmd lock held
         */
        if (arch_needs_pgtable_deposit())
                deposit_prealloc_pte(vmf);

        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

        update_mmu_cache_pmd(vma, haddr, vmf->pmd);

        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
out:
        spin_unlock(vmf->ptl);
        return ret;
}
#else
/*
 * Variant used when CONFIG_TRANSPARENT_HUGEPAGE is not set.  The body is a
 * BUILD_BUG(), so any call that survives to code generation presumably
 * breaks the build; the return value only satisfies the signature.
 */
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        BUILD_BUG();
        return 0;
}
#endif

/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates page table or use pre-allocated.
 *
 * @vmf: fault environment
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        pte_t entry;
        vm_fault_t ret;

        /*
         * Empty pmd and a compound page: try a huge pmd mapping first.  Any
         * result other than VM_FAULT_FALLBACK (success or error) is final.
         */
        if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
                ret = do_set_pmd(vmf, page);
                if (ret != VM_FAULT_FALLBACK)
                        return ret;
        }

        /*
         * No pte mapped yet: set up the page table if needed, then map and
         * lock the pte.  When vmf->pte is already set it is used as is.
         */
        if (!vmf->pte) {
                ret = pte_alloc_one_map(vmf);
                if (ret)
                        return ret;
        }

        /* Re-check under ptl */
        if (unlikely(!pte_none(*vmf->pte))) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                return VM_FAULT_NOPAGE;
        }

        flush_icache_page(vma, page);
        entry = mk_pte(page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                /* Accounted and rmapped as a new anonymous page. */
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
                /* Everything else is accounted as a file mapping. */
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
        }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache(vma, vmf->address, vmf->pte);

        return 0;
}


/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        const int private_vma = !(vmf->vma->vm_flags & VM_SHARED);
        struct page *page = vmf->page;
        vm_fault_t ret = 0;

        /* A write fault on a private mapping installs the COWed copy. */
        if (private_vma && (vmf->flags & FAULT_FLAG_WRITE))
                page = vmf->cow_page;

        /*
         * Private mappings are checked even for read faults because the
         * CoWed page might have been lost.
         */
        if (private_vma)
                ret = check_stable_address_space(vmf->vma->vm_mm);

        if (ret == 0)
                ret = alloc_set_pte(vmf, page);

        /* A non-NULL vmf->pte means the pte is still mapped and locked. */
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);

        return ret;
}

/*
 * Size in bytes of the window do_fault_around() tries to map.  The initial
 * value is 65536 rounded down to a power of two.
 */
static unsigned long fault_around_bytes __read_mostly =
        rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
/* debugfs read hook: report the current fault_around_bytes; @data is unused. */
static int fault_around_bytes_get(void *data, u64 *val)
{
        u64 current_bytes = fault_around_bytes;

        *val = current_bytes;
        return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        /* Floor value; also avoids rounddown_pow_of_two(0), which is undefined. */
        unsigned long bytes = PAGE_SIZE;

        /* Reject windows spanning more pages than one page table holds. */
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        if (val > PAGE_SIZE)
                bytes = rounddown_pow_of_two(val);

        fault_around_bytes = bytes;
        return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Register the "fault_around_bytes" debugfs file (mode 0644, NULL parent)
 * backed by fault_around_bytes_fops.  Always returns 0; the result of the
 * file creation is not checked.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock protects only those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        /* address keeps the real fault address; vmf->address is restored at out. */
        unsigned long address = vmf->address, nr_pages, mask;
        pgoff_t start_pgoff = vmf->pgoff;
        pgoff_t end_pgoff;
        int off;
        vm_fault_t ret = 0;

        /* Window size in pages and the mask that aligns down to that size. */
        nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
        mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

        /*
         * Move vmf->address to the start of the window, but never below the
         * start of the vma, and shift start_pgoff back by the same number of
         * pages (off).
         */
        vmf->address = max(address & mask, vmf->vma->vm_start);
        off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
        start_pgoff -= off;

        /*
         *  end_pgoff is either the end of the page table, the end of
         *  the vma or nr_pages from start_pgoff, depending what is nearest.
         */
        end_pgoff = start_pgoff -
                ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                PTRS_PER_PTE - 1;
        end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
                        start_pgoff + nr_pages - 1);

        /*
         * Preallocate a page table when the pmd is empty.  If that fails the
         * fault-around attempt is simply abandoned (ret stays 0).
         */
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        goto out;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

        /* Huge page is mapped? Page fault is solved */
        if (pmd_trans_huge(*vmf->pmd)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* ->map_pages() hasn't done anything useful. Cold page cache? */
        if (!vmf->pte)
                goto out;

        /*
         * check if the page fault is solved
         *
         * vmf->pte is stepped by the page distance between the current
         * vmf->address and the original fault address, so that it refers to
         * the entry for the faulting address.
         * NOTE(review): relies on ->map_pages() leaving vmf->address and
         * vmf->pte in step with each other -- confirm against the handler.
         */
        vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
        if (!pte_none(*vmf->pte))
                ret = VM_FAULT_NOPAGE;
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        /* Hand the vmf back with the original address and no pte mapped. */
        vmf->address = address;
        vmf->pte = NULL;
        return ret;
}

static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask =
                VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /*
         * Try ->map_pages() first when fault-around covers more than one
         * page; ->fault() below is the fallback for the case where the page
         * at the offset is not ready to be mapped (cold cache or something).
         */
        if (vma->vm_ops->map_pages && (fault_around_bytes >> PAGE_SHIFT) > 1) {
                ret = do_fault_around(vmf);
                if (ret != 0)
                        return ret;
        }

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask))
                return ret;

        ret |= finish_fault(vmf);
        unlock_page(vmf->page);

        /* The page was not mapped, so the reference is ours to drop. */
        if (unlikely(ret & bail_mask))
                put_page(vmf->page);

        return ret;
}

static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask =
                VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;

        /* Allocate and charge the private copy before calling ->fault(). */
        vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
        if (!vmf->cow_page)
                return VM_FAULT_OOM;

        if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
        cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask)) {
                /* No source page to copy from: release the unused copy. */
                put_page(vmf->cow_page);
                return ret;
        }
        /* VM_FAULT_DONE_COW: return as is, without copying or mapping here. */
        if (ret & VM_FAULT_DONE_COW)
                return ret;

        copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
        __SetPageUptodate(vmf->cow_page);

        ret |= finish_fault(vmf);

        /* The source page is no longer needed whatever finish_fault() said. */
        unlock_page(vmf->page);
        put_page(vmf->page);

        /* The copy was not mapped: release it as well. */
        if (unlikely(ret & bail_mask))
                put_page(vmf->cow_page);

        return ret;
}

/*
 * Write fault on a VM_SHARED mapping (see the dispatch in do_fault()):
 * obtain the page via __do_fault(), give ->page_mkwrite() a chance to react
 * to the page becoming writable, map it with finish_fault() and finally run
 * fault_dirty_shared_page().
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        const vm_fault_t abort_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                      VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & abort_mask))
                return ret;

        /*
         * The backing address space may want to be told that the page is
         * about to become writable.
         */
        if (vma->vm_ops->page_mkwrite) {
                vm_fault_t mkwrite_ret;

                unlock_page(vmf->page);
                mkwrite_ret = do_page_mkwrite(vmf);
                if (unlikely(!mkwrite_ret ||
                             (mkwrite_ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        put_page(vmf->page);
                        return mkwrite_ret;
                }
        }

        ret |= finish_fault(vmf);
        if (unlikely(ret & abort_mask)) {
                /* Could not map the page: unlock it and drop our reference */
                unlock_page(vmf->page);
                put_page(vmf->page);
                return ret;
        }

        return ret | fault_dirty_shared_page(vmf);
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        /* Sampled up front: it is still needed for pte_free() at the end */
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        if (vma->vm_ops->fault) {
                /* Pick the handler from the access type and sharing mode */
                if (!(vmf->flags & FAULT_FLAG_WRITE))
                        ret = do_read_fault(vmf);
                else if (vma->vm_flags & VM_SHARED)
                        ret = do_shared_fault(vmf);
                else
                        ret = do_cow_fault(vmf);
        } else if (unlikely(!pmd_present(*vmf->pmd))) {
                /*
                 * No ->fault handler: the VMA was not fully populated on
                 * mmap() or is missing VM_DONTEXPAND.  A migration pmd entry
                 * or a none pmd entry should never be found here, so return
                 * SIGBUS.
                 */
                ret = VM_FAULT_SIGBUS;
        } else {
                vmf->pte = pte_offset_map_lock(vm_mm, vmf->pmd, vmf->address,
                                               &vmf->ptl);
                /*
                 * Look at the pte again with ptl held so that a temporary
                 * clearing is not mistaken for a missing entry.  A R/M/W
                 * update of a pte takes ptl, clears the pte so that hardware
                 * cannot modify it concurrently, and then writes the update.
                 */
                if (unlikely(pte_none(*vmf->pte)))
                        ret = VM_FAULT_SIGBUS;
                else
                        ret = VM_FAULT_NOPAGE;

                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        /* A page table that was preallocated but never used is freed here */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }

        return ret;
}

/*
 * Take a reference on @page, account the NUMA hinting fault (flagging it as
 * local in *@flags when @page_nid is the current node) and return whatever
 * mpol_misplaced() reports for the page.
 */
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
{
        get_page(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);

        if (page_nid != numa_node_id())
                return mpol_misplaced(page, vma, addr);

        /* The fault hit a page that already sits on the local node */
        count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
        *flags |= TNF_FAULT_LOCAL;

        return mpol_misplaced(page, vma, addr);
}

/*
 * Handle a fault on a pte that handle_pte_fault() classified as a NUMA
 * hinting entry (pte_protnone() in an accessible VMA).
 *
 * Under the pte lock the entry is revalidated against vmf->orig_pte, rebuilt
 * with the VMA's page protection (restoring write permission if
 * pte_savedwrite() was set) and written back.  If the pte maps a normal,
 * non-compound page, the fault is accounted and the page may be handed to
 * migrate_misplaced_page().
 *
 * Always returns 0.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL;
        int page_nid = NUMA_NO_NODE;
        int last_cpupid;
        int target_nid;
        bool migrated = false;
        pte_t pte, old_pte;
        bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;

        /*
         * The "pte" at this point cannot be used safely without
         * validation through pte_unmap_same(). It's of NUMA type but
         * the pfn may be screwed if the read is non atomic.
         */
        vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
                /*
                 * The pte changed since it was sampled: nothing to do.
                 * page_nid is still NUMA_NO_NODE here, so the "out" path
                 * skips task_numa_fault().
                 */
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }

        /*
         * Make it present again. Depending on how the arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
        old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
        update_mmu_cache(vma, vmf->address, vmf->pte);

        /* Without a normal struct page there is nothing to account or migrate */
        page = vm_normal_page(vma, vmf->address, pte);
        if (!page) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        /* TODO: handle PTE-mapped THP */
        if (PageCompound(page)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!pte_write(pte))
                flags |= TNF_NO_GROUP;

        /*
         * Flag if the page is shared between multiple address spaces. This
         * is later used when determining whether to group tasks together
         */
        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;

        last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
        /* numa_migrate_prep() takes a page reference before the lock is dropped */
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        if (target_nid == NUMA_NO_NODE) {
                /* No migration target: drop the reference taken above */
                put_page(page);
                goto out;
        }

        /*
         * Migrate to the requested node.
         * NOTE(review): no put_page() follows on this path, so
         * migrate_misplaced_page() presumably consumes the reference taken
         * by numa_migrate_prep() - confirm against its definition.
         */
        migrated = migrate_misplaced_page(page, vma, target_nid);
        if (migrated) {
                page_nid = target_nid;
                flags |= TNF_MIGRATED;
        } else
                flags |= TNF_MIGRATE_FAIL;

out:
        /* Only report the fault when a page (and thus a node) was identified */
        if (page_nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
}

static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_anonymous_page(vmf);
        if (vmf->vma->vm_ops->huge_fault)
                return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
        return VM_FAULT_FALLBACK;
}

/*
 * Write-protect fault on a huge PMD.  Anonymous VMAs are passed to
 * handle_userfault() or do_huge_pmd_wp_page(); other VMAs get a chance via
 * ->huge_fault().  If nothing handled the fault, the PMD is split and
 * VM_FAULT_FALLBACK is returned.
 *
 * `inline' is required to avoid gcc 4.1.2 build error
 */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma)) {
                if (userfaultfd_huge_pmd_wp(vma, orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);

                return do_huge_pmd_wp_page(vmf, orig_pmd);
        }

        if (vma->vm_ops->huge_fault) {
                vm_fault_t handled = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

                if (!(handled & VM_FAULT_FALLBACK))
                        return handled;
        }

        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);

        return VM_FAULT_FALLBACK;
}

/*
 * Try to service the fault with a PUD-sized mapping through the VMA's
 * ->huge_fault() handler.  Returns VM_FAULT_FALLBACK for anonymous VMAs, for
 * VMAs without such a handler, and whenever PUD-sized transparent huge pages
 * are not configured.
 */
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        /*
         * No support for anonymous transparent PUD pages yet, so only
         * non-anonymous VMAs that supply ->huge_fault() are tried.
         */
        if (!vma_is_anonymous(vmf->vma) && vmf->vma->vm_ops->huge_fault)
                return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * Write-protect fault on a huge PUD.  A non-anonymous VMA with a
 * ->huge_fault() handler gets to handle it first; if that does not settle
 * the fault (or the VMA is anonymous), the PUD is split and
 * VM_FAULT_FALLBACK is returned.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vmf->vma) && vmf->vma->vm_ops->huge_fault) {
                vm_fault_t handled;

                handled = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
                if (!(handled & VM_FAULT_FALLBACK))
                        return handled;
        }

        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        /*
         * Step 1: sample the pte without taking the lock.  vmf->pte is left
         * NULL when there is no page table yet or the entry is none.
         */
        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
        } else {
                /* See comment in pte_alloc_one_map() */
                if (pmd_devmap_trans_unstable(vmf->pmd))
                        return 0;
                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd from under us anymore at this point because we hold the
                 * mmap_lock read mode and khugepaged takes it in write mode.
                 * So now it's safe to run pte_offset_map().
                 */
                vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
                vmf->orig_pte = *vmf->pte;

                /*
                 * some architectures can have larger ptes than wordsize,
                 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
                 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
                 * accesses.  The code below just needs a consistent view
                 * for the ifs and we later double check anyway with the
                 * ptl lock held. So here a barrier will do.
                 */
                barrier();
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        /* Step 2: dispatch on the sampled entry. */

        /* Nothing mapped yet: anonymous fault or ->fault()-driven fault */
        if (!vmf->pte) {
                if (vma_is_anonymous(vmf->vma))
                        return do_anonymous_page(vmf);
                else
                        return do_fault(vmf);
        }

        /* A non-none, non-present entry is handed to the swap path */
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        /* protnone entry in an accessible VMA: NUMA hinting fault */
        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        /*
         * Step 3: the entry is present.  Take the pte lock and make sure the
         * entry still matches what was sampled above before touching it.
         */
        vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        if (unlikely(!pte_same(*vmf->pte, entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & FAULT_FLAG_WRITE) {
                /*
                 * NOTE(review): this returns with ptl still held, so
                 * do_wp_page() is presumably responsible for releasing it -
                 * confirm against its definition.
                 */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        /* Fault descriptor handed down to every lower-level handler */
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        /* Non-zero when the faulting access is a write */
        unsigned int dirty = flags & FAULT_FLAG_WRITE;
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        /* Walk down the page tables, allocating missing levels on the way */
        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /* PUD level: try to create, or handle a fault on, a huge PUD */
        if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                pud_t orig_pud = *vmf.pud;

                /* Work on the snapshot taken above, not on a re-read value */
                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

                        /* NUMA case for anonymous PUDs would go here */

                        if (dirty && !pud_write(orig_pud)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* PMD level: same pattern as for the PUD above */
        if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                pmd_t orig_pmd = *vmf.pmd;

                /* Work on the snapshot taken above, not on a re-read value */
                barrier();
                if (unlikely(is_swap_pmd(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(orig_pmd));
                        /* Wait for the migration entry, then return 0 */
                        if (is_pmd_migration_entry(orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        /* protnone huge pmd in an accessible VMA: NUMA hinting */
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);

                        if (dirty && !pmd_write(orig_pmd)) {
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pmd_set_accessed(&vmf, orig_pmd);
                                return 0;
                        }
                }
        }

        /* No huge mapping handled the fault: resolve it at pte granularity */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        bool is_major;

        /*
         * Two kinds of faults are deliberately left out of the accounting:
         *
         * - Unsuccessful faults (e.g. when the address wasn't valid).  That
         *   includes arch_vma_access_permitted() failing before reaching here.
         *   So this is not a "this many hardware page faults" counter.  We
         *   should use the hw profiling for that.
         *
         * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
         *   once they're completed.
         */
        if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
                return;

        /*
         * A fault counts as major when the final successful attempt reports
         * VM_FAULT_MAJOR, or when it had to be retried (which implies that it
         * could not be handled immediately the first time around).
         */
        is_major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

        /*
         * The per-task counter is always bumped.  The perf event is only
         * emitted when @regs is available: for GUP-triggered faults regs is
         * NULL and the perf update is skipped.  The two perf_sw_event()
         * calls are kept separate so that each passes a constant event id.
         */
        if (is_major) {
                current->maj_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
        } else {
                current->min_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
        }
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        vm_fault_t ret;

        __set_current_state(TASK_RUNNING);

        count_vm_event(PGFAULT);
        count_memcg_event_mm(vma->vm_mm, PGFAULT);

        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);

        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE))
                return VM_FAULT_SIGSEGV;

        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();

        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);

        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation but
                 * if the allocation error was handled gracefully (no
                 * VM_FAULT_OOM), there is no need to kill anything.
                 * Just clean up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }

        mm_account_fault(regs, address, flags, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate a p4d page table and hook it into @pgd.
 * We've already handled the fast-path in-line.
 *
 * Returns -ENOMEM if the table cannot be allocated, 0 otherwise (including
 * when the pgd turned out to be populated already and the new table was
 * freed again).
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *table;

        table = p4d_alloc_one(mm, address);
        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd)) {
                pgd_populate(mm, pgd, table);
        } else {
                /* Another has populated it */
                p4d_free(mm, table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate a page upper directory and hook it into @p4d.
 * We've already handled the fast-path in-line.
 *
 * Returns -ENOMEM if the table cannot be allocated, 0 otherwise (including
 * when the p4d turned out to be populated already and the new table was
 * freed again).
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *table;

        table = pud_alloc_one(mm, address);
        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Another has populated it */
                pud_free(mm, table);
        } else {
                mm_inc_nr_puds(mm);
                p4d_populate(mm, p4d, table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate a page middle directory and hook it into @pud.
 * We've already handled the fast-path in-line.
 *
 * Returns -ENOMEM if the table cannot be allocated, 0 otherwise (including
 * when the pud turned out to be populated already and the new table was
 * freed again).
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        spinlock_t *lock;
        pmd_t *table;

        table = pmd_alloc_one(mm, address);
        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        lock = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Another has populated it */
                pmd_free(mm, table);
        } else {
                mm_inc_nr_pmds(mm);
                pud_populate(mm, pud, table);
        }
        spin_unlock(lock);

        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/*
 * Walk the page tables of @mm down to @address and return the locked entry.
 *
 * @range:  optional; when non-NULL, an MMU notifier range covering the found
 *          entry is initialised and invalidate_range_start is issued before
 *          the lock is taken.
 * @ptepp:  receives the pte pointer when a present pte is found.
 * @pmdpp:  optional; when non-NULL and the pmd is huge, receives the pmd
 *          pointer instead.  With a NULL @pmdpp a huge pmd yields -EINVAL.
 * @ptlp:   receives the lock protecting the returned entry.
 *
 * Returns 0 with *@ptlp held (and, if @range was given, with the notifier
 * range started but not yet ended) - the matching unlock and
 * invalidate_range_end are not performed here on success.  Returns -EINVAL
 * with nothing held when no suitable entry exists.
 */
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
                          struct mmu_notifier_range *range, pte_t **ptepp,
                          pmd_t **pmdpp, spinlock_t **ptlp)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep;

        /* Descend level by level, giving up on a none or bad entry */
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                goto out;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
                goto out;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
                goto out;

        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));

        if (pmd_huge(*pmd)) {
                /* The caller did not ask for pmd-level results */
                if (!pmdpp)
                        goto out;

                if (range) {
                        mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
                                                NULL, mm, address & PMD_MASK,
                                                (address & PMD_MASK) + PMD_SIZE);
                        mmu_notifier_invalidate_range_start(range);
                }
                *ptlp = pmd_lock(mm, pmd);
                /* Recheck under the lock: the pmd may have changed meanwhile */
                if (pmd_huge(*pmd)) {
                        *pmdpp = pmd;
                        return 0;
                }
                /* No longer huge: undo and fall through to the pte lookup */
                spin_unlock(*ptlp);
                if (range)
                        mmu_notifier_invalidate_range_end(range);
        }

        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;

        if (range) {
                mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
                                        address & PAGE_MASK,
                                        (address & PAGE_MASK) + PAGE_SIZE);
                mmu_notifier_invalidate_range_start(range);
        }
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        /* Only a present pte is reported to the caller */
        if (!pte_present(*ptep))
                goto unlock;
        *ptepp = ptep;
        return 0;
unlock:
        pte_unmap_unlock(ptep, *ptlp);
        if (range)
                mmu_notifier_invalidate_range_end(range);
out:
        return -EINVAL;
}

/**
 * follow_pte - look up PTE at a user virtual address
 * @mm: the mm_struct of the target address space
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 * The contents of the PTE are only stable until @ptlp is released;
 * any further use, if any, must be protected against invalidation
 * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 * it is not a good general-purpose API.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct mm_struct *mm, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp)
{
        int err;

        /* Plain lookup: no notifier range and no pmd-level result wanted */
        err = follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);

        return err;
}
EXPORT_SYMBOL_GPL(follow_pte);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * This function does not allow the caller to read the permissions
 * of the PTE.  Do not use it.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        unsigned long *pfn)
{
        spinlock_t *lock;
        pte_t *pte;
        int err;

        /* Only IO mappings and raw PFN mappings are accepted */
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return -EINVAL;

        err = follow_pte(vma->vm_mm, address, &pte, &lock);
        if (err)
                return err;

        /* Read the pfn while the pte is still locked, then let go of it */
        *pfn = pte_pfn(*pte);
        pte_unmap_unlock(pte, lock);

        return 0;
}
EXPORT_SYMBOL(follow_pfn);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/*
 * Look up the physical address and protection bits behind @address in a
 * VM_IO or VM_PFNMAP mapping.
 *
 * On success stores the pte's protection value in *@prot and the physical
 * address of the page frame in *@phys and returns 0.  Returns -EINVAL when
 * the VMA is of the wrong kind, no pte is found, the pte maps a normal page,
 * or FOLL_WRITE is set in @flags but the pte is not writable.
 */
int follow_phys(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags,
                unsigned long *prot, resource_size_t *phys)
{
        spinlock_t *lock;
        pte_t *ptep;
        pte_t entry;
        int ret = -EINVAL;

        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return -EINVAL;

        if (follow_pte(vma->vm_mm, address, &ptep, &lock))
                return -EINVAL;

        entry = *ptep;

        /*
         * Never return PFNs of anon folios in COW mappings, and honour a
         * write request only when the pte is writable.
         */
        if (!vm_normal_page(vma, address, entry) &&
            (!(flags & FOLL_WRITE) || pte_write(entry))) {
                *prot = pgprot_val(pte_pgprot(entry));
                *phys = (resource_size_t)pte_pfn(entry) << PAGE_SHIFT;
                ret = 0;
        }

        pte_unmap_unlock(ptep, lock);

        return ret;
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        unsigned long prot = 0;
        void __iomem *maddr;
        int offset = addr & (PAGE_SIZE-1);

        if (follow_phys(vma, addr, write, &prot, &phys_addr))
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        iounmap(maddr);

        return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm, copying between it
 * and the kernel buffer @buf one page at a time.  FOLL_WRITE in @gup_flags
 * selects the direction: set copies from @buf into the target, clear copies
 * from the target into @buf.
 *
 * NOTE(review): @tsk is not referenced anywhere in this function body, so it
 * currently has no effect on page fault accounting.
 *
 * Returns the number of bytes transferred, which may be less than @len when
 * a page cannot be accessed; returns 0 if the mmap lock could not be taken.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
        struct vm_area_struct *vma;
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;

        if (mmap_read_lock_killable(mm))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, ret, offset;
                void *maddr;
                struct page *page = NULL;

                /* Pin exactly one page containing @addr */
                ret = get_user_pages_remote(mm, addr, 1,
                                gup_flags, &page, &vma, NULL);
                if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
                        break;
#else
                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        vma = find_vma(mm, addr);
                        if (!vma || vma->vm_start > addr)
                                break;
                        if (vma->vm_ops && vma->vm_ops->access)
                                ret = vma->vm_ops->access(vma, addr, buf,
                                                          len, write);
                        /*
                         * ret still holds the failed GUP result when the VMA
                         * has no ->access handler, so that case stops here
                         * as well.
                         */
                        if (ret <= 0)
                                break;
                        bytes = ret;
#endif
                } else {
                        /* Clamp the copy so it does not cross the page end */
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap(page);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                set_page_dirty_lock(page);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        kunmap(page);
                        /* Drop the reference obtained by the GUP call */
                        put_page(page);
                }
                /* Advance past the chunk that was just transferred */
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        /* buf has advanced by exactly the number of bytes transferred */
        return buf - old_buf;
}

/**
 * access_remote_vm - transfer data to or from another address space
 * @mm:                the mm_struct of the address space to access
 * @addr:        first address inside @mm to transfer
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * Forwards to __access_remote_vm() with a NULL task pointer.
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        int copied;

        copied = __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);

        return copied;
}

/*
 * Access the address space of the process @tsk belongs to.
 * The source/target buffer must be kernel space.
 * Do not walk the page table directly, use get_user_pages.
 *
 * The mm obtained from get_task_mm() is dropped with mmput() before
 * returning.  Returns 0 when the task has no mm, otherwise the value
 * returned by __access_remote_vm().
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied = 0;

        if (mm) {
                copied = __access_remote_vm(tsk, mm, addr, buf, len,
                                            gup_flags);
                mmput(mm);
        }

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

/*
 * Print the name of a VMA.
 *
 * Looks up the VMA that find_vma() returns for @ip in the current mm and,
 * if it is file backed, prints "<prefix><file basename>[<start>+<size>]".
 * Nothing is printed when the mmap lock cannot be taken without blocking,
 * when there is no file-backed VMA, or when no scratch page is available.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        char *page_buf;
        char *name;

        /*
         * we might be running from an atomic context so we cannot sleep
         */
        if (!mmap_read_trylock(mm))
                return;

        vma = find_vma(mm, ip);
        if (!vma || !vma->vm_file)
                goto unlock;

        /* Scratch page for the path string; GFP_NOWAIT so we do not block */
        page_buf = (char *)__get_free_page(GFP_NOWAIT);
        if (!page_buf)
                goto unlock;

        name = file_path(vma->vm_file, page_buf, PAGE_SIZE);
        if (IS_ERR(name))
                name = "?";
        printk("%s%s[%lx+%lx]", prefix, kbasename(name),
               vma->vm_start, vma->vm_end - vma->vm_start);
        free_page((unsigned long)page_buf);

unlock:
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
/*
 * Debug annotation, only built with CONFIG_PROVE_LOCKING or
 * CONFIG_DEBUG_ATOMIC_SLEEP.  Calls __might_sleep() for @file:@line and,
 * with CONFIG_DEBUG_ATOMIC_SLEEP, annotates a possible read acquisition
 * of the current mm's mmap_lock.
 */
void __might_fault(const char *file, int line)
{
        /*
         * No annotation in two situations:
         *
         * - uaccess_kernel(): some code (nfs/sunrpc) uses socket ops on
         *   kernel memory while holding the mmap_lock.  This is safe
         *   because kernel memory doesn't get paged out, so we'll never
         *   actually fault, and the annotations below would generate
         *   false positives.
         * - pagefault_disabled(): page faults are disabled.
         */
        if (uaccess_kernel() || pagefault_disabled())
                return;

        __might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
        if (current->mm)
                might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 *
 * @addr_hint: address inside the huge page; the subpage that contains it
 *             is the target subpage.
 * @pages_per_huge_page: number of subpages in the huge page.
 * @process_subpage: callback invoked once per subpage with the subpage
 *                   address, its index within the huge page, and @arg.
 * @arg: opaque pointer handed unchanged to @process_subpage.
 *
 * Calls might_sleep() once and cond_resched() before every subpage.
 */
static inline void process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
        void (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        int i, n, base, l;
        /*
         * Start address of the huge page: mask off the offset bits
         * (assumes pages_per_huge_page is a power of two).
         */
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        /* n is the index of the target subpage within the huge page */
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= pages_per_huge_page) {
                /* If target subpage in first half of huge page */
                /* Converging window below covers indices [0, 2 * n) */
                base = 0;
                l = n;
                /* Process subpages at the end of huge page */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
                        process_subpage(addr + i * PAGE_SIZE, i, arg);
                }
        } else {
                /* If target subpage in second half of huge page */
                /*
                 * Converging window below covers indices
                 * [base, pages_per_huge_page), which is 2 * l subpages.
                 */
                base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
                l = pages_per_huge_page - n;
                /* Process subpages at the beginning of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        process_subpage(addr + i * PAGE_SIZE, i, arg);
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage.  In the final iteration (i == l - 1)
         * left_idx is n - 1 and right_idx is n, so the target subpage is
         * the last one processed.
         */
        for (i = 0; i < l; i++) {
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                cond_resched();
                process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
        }
}

/*
 * Clear every subpage of a huge page one after another, in ascending
 * order.  Used by clear_huge_page() when the page has more than
 * MAX_ORDER_NR_PAGES subpages.  The struct page cursor is advanced with
 * mem_map_next() rather than plain pointer arithmetic.
 */
static void clear_gigantic_page(struct page *page,
                                unsigned long addr,
                                unsigned int pages_per_huge_page)
{
        struct page *cur = page;
        int idx = 0;

        might_sleep();
        while (idx < pages_per_huge_page) {
                cond_resched();
                clear_user_highpage(cur, addr + idx * PAGE_SIZE);

                idx++;
                cur = mem_map_next(cur, page, idx);
        }
}

/*
 * process_huge_page() callback: clear the subpage at index @idx of the
 * huge page whose first struct page is passed in @arg.
 */
static void clear_subpage(unsigned long addr, int idx, void *arg)
{
        struct page *head = arg;
        struct page *target = head + idx;

        clear_user_highpage(target, addr);
}

/*
 * Clear all subpages of a huge page.
 *
 * Pages with more than MAX_ORDER_NR_PAGES subpages are cleared linearly
 * by clear_gigantic_page(), starting at the huge-page-aligned address.
 * All other pages go through process_huge_page(), which handles the
 * subpage containing @addr_hint last.
 */
void clear_huge_page(struct page *page,
                     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
        if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
                unsigned long huge_size =
                        (unsigned long)pages_per_huge_page << PAGE_SHIFT;

                clear_gigantic_page(page, addr_hint & ~(huge_size - 1),
                                    pages_per_huge_page);
        } else {
                process_huge_page(addr_hint, pages_per_huge_page,
                                  clear_subpage, page);
        }
}

/*
 * Copy a huge page subpage by subpage, in ascending order, from @src to
 * @dst.  Used by copy_user_huge_page() when the page has more than
 * MAX_ORDER_NR_PAGES subpages.  Both struct page cursors are advanced
 * with mem_map_next().
 */
static void copy_user_gigantic_page(struct page *dst, struct page *src,
                                    unsigned long addr,
                                    struct vm_area_struct *vma,
                                    unsigned int pages_per_huge_page)
{
        struct page *to = dst;
        struct page *from = src;
        int idx;

        for (idx = 0; idx < pages_per_huge_page;
             idx++,
             to = mem_map_next(to, dst, idx),
             from = mem_map_next(from, src, idx)) {
                cond_resched();
                copy_user_highpage(to, from, addr + idx * PAGE_SIZE, vma);
        }
}

/*
 * Arguments for copy_subpage(), passed through the opaque @arg pointer
 * of process_huge_page().
 */
struct copy_subpage_arg {
        /* First subpage of the destination huge page (indexed by idx) */
        struct page *dst;
        /* First subpage of the source huge page (indexed by idx) */
        struct page *src;
        /* VMA forwarded to copy_user_highpage() */
        struct vm_area_struct *vma;
};

/*
 * process_huge_page() callback: copy the subpage at index @idx from the
 * source to the destination huge page described by the
 * struct copy_subpage_arg in @arg.
 */
static void copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;
        struct page *to = ctx->dst + idx;
        struct page *from = ctx->src + idx;

        copy_user_highpage(to, from, addr, ctx->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
                         unsigned long addr_hint, struct vm_area_struct *vma,
                         unsigned int pages_per_huge_page)
{
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
                .dst = dst,
                .src = src,
                .vma = vma,
        };

        if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
                copy_user_gigantic_page(dst, src, addr, vma,
                                        pages_per_huge_page);
                return;
        }

        process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

/*
 * Fill the huge page starting at @dst_page with data from the user
 * buffer @usr_src, one PAGE_SIZE subpage at a time.
 *
 * With @allow_pagefault each subpage is mapped with kmap(), otherwise
 * with kmap_atomic().  Copying stops at the first subpage for which
 * copy_from_user() reports a non-zero remainder.
 *
 * Return: the number of bytes of the huge page that were not copied,
 * 0 if every subpage was copied completely.
 */
long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
                                unsigned int pages_per_huge_page,
                                bool allow_pagefault)
{
        unsigned long remaining = pages_per_huge_page * PAGE_SIZE;
        unsigned long uncopied;
        unsigned long idx = 0;
        struct page *cur = dst_page;

        while (idx < pages_per_huge_page) {
                const void __user *from = usr_src + idx * PAGE_SIZE;
                void *kaddr;

                if (allow_pagefault) {
                        kaddr = kmap(cur);
                        uncopied = copy_from_user(kaddr, from, PAGE_SIZE);
                        kunmap(cur);
                } else {
                        kaddr = kmap_atomic(cur);
                        uncopied = copy_from_user(kaddr, from, PAGE_SIZE);
                        kunmap_atomic(kaddr);
                }

                /* Account for what this subpage managed to copy */
                remaining -= PAGE_SIZE - uncopied;
                if (uncopied)
                        break;

                flush_dcache_page(cur);

                cond_resched();

                idx++;
                cur = mem_map_next(cur, dst_page, idx);
        }

        return remaining;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

/* Slab cache from which ptlock_alloc() allocates page->ptl spinlocks */
static struct kmem_cache *page_ptl_cachep;

/*
 * Create the "page->ptl" slab cache, sized for one spinlock_t per object.
 * The cache is created with SLAB_PANIC.
 */
void __init ptlock_cache_init(void)
{
        struct kmem_cache *cache;

        cache = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                                  SLAB_PANIC, NULL);
        page_ptl_cachep = cache;
}

bool ptlock_alloc(struct page *page)
{
        spinlock_t *ptl;

        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
        page->ptl = ptl;
        return true;
}

void ptlock_free(struct page *page)
{
        kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Because linux/module.h has tracepoints in the header, and ftrace.h
 * used to include this file, define_trace.h includes linux/module.h.
 * But we do not want module.h to override the TRACE_SYSTEM macro
 * variable that define_trace.h is processing, so we only set it
 * when module events are being processed, which happens when
 * CREATE_TRACE_POINTS is defined.
 */
#ifdef CREATE_TRACE_POINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM module
#endif

#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MODULE_H

#include <linux/tracepoint.h>

#ifdef CONFIG_MODULES

struct module;

/*
 * Render a module taint bitmask for trace output via __print_flags():
 * each TAINT_* bit listed below is mapped to its one-letter tag, and ""
 * is passed as the delimiter argument.
 */
#define show_module_flags(flags) __print_flags(flags, "",        \
        { (1UL << TAINT_PROPRIETARY_MODULE),        "P" },                \
        { (1UL << TAINT_OOT_MODULE),                "O" },                \
        { (1UL << TAINT_FORCED_MODULE),                "F" },                \
        { (1UL << TAINT_CRAP),                        "C" },                \
        { (1UL << TAINT_UNSIGNED_MODULE),        "E" })

/*
 * module_load tracepoint.
 *
 * Records the module's name and its taints value, and prints them as
 * "<name> <taint letters>" using show_module_flags().
 */
TRACE_EVENT(module_load,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __field(        unsigned int,        taints                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->taints = mod->taints;
                __assign_str(name, mod->name);
        ),

        TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
);

/*
 * module_free tracepoint.
 *
 * Records and prints only the module's name.
 */
TRACE_EVENT(module_free,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __assign_str(name, mod->name);
        ),

        TP_printk("%s", __get_str(name))
);

#ifdef CONFIG_MODULE_UNLOAD
/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

/*
 * Event class shared by the module_get and module_put tracepoints.
 *
 * Records the module's name, the @ip value passed by the caller (printed
 * as call_site with %ps) and the module reference count sampled with
 * atomic_read() at the time the event is recorded.
 */
DECLARE_EVENT_CLASS(module_refcnt,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        int,                refcnt                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->refcnt        = atomic_read(&mod->refcnt);
                __assign_str(name, mod->name);
        ),

        TP_printk("%s call_site=%ps refcnt=%d",
                  __get_str(name), (void *)__entry->ip, __entry->refcnt)
);

/* module_get tracepoint: an instance of the module_refcnt event class. */
DEFINE_EVENT(module_refcnt, module_get,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);

/* module_put tracepoint: an instance of the module_refcnt event class. */
DEFINE_EVENT(module_refcnt, module_put,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * module_request tracepoint.
 *
 * Records the requested module name, the @wait flag (printed as an
 * integer) and the @ip value passed by the caller (printed as call_site
 * with %ps).
 */
TRACE_EVENT(module_request,

        TP_PROTO(char *name, bool wait, unsigned long ip),

        TP_ARGS(name, wait, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        bool,                wait                )
                __string(        name,                name                )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->wait        = wait;
                __assign_str(name, name);
        ),

        TP_printk("%s wait=%d call_site=%ps",
                  __get_str(name), (int)__entry->wait, (void *)__entry->ip)
);

#endif /* CONFIG_MODULES */

#endif /* _TRACE_MODULE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































































































































































































    1 
    1 

    1 





    1 



    1 




    1 
































    1 



    1 







    1 




    1 


    1 



    1 









    1 



































    1 
    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                PF_INET protocol family socket handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Changes (see also sock.c)
 *
 *                piggy,
 *                Karl Knutson        :        Socket protocol table
 *                A.N.Kuznetsov        :        Socket death error in accept().
 *                John Richardson :        Fix non blocking error in connect()
 *                                        so sockets that fail to connect
 *                                        don't return -EINPROGRESS.
 *                Alan Cox        :        Asynchronous I/O support
 *                Alan Cox        :        Keep correct socket pointer on sock
 *                                        structures
 *                                        when accept() ed
 *                Alan Cox        :        Semantics of SO_LINGER aren't state
 *                                        moved to close when you look carefully.
 *                                        With this fixed and the accept bug fixed
 *                                        some RPC stuff seems happier.
 *                Niibe Yutaka        :        4.4BSD style write async I/O
 *                Alan Cox,
 *                Tony Gale         :        Fixed reuse semantics.
 *                Alan Cox        :        bind() shouldn't abort existing but dead
 *                                        sockets. Stops FTP netin:.. I hope.
 *                Alan Cox        :        bind() works correctly for RAW sockets.
 *                                        Note that FreeBSD at least was broken
 *                                        in this respect so be careful with
 *                                        compatibility tests...
 *                Alan Cox        :        routing cache support
 *                Alan Cox        :        memzero the socket structure for
 *                                        compactness.
 *                Matt Day        :        nonblock connect error handler
 *                Alan Cox        :        Allow large numbers of pending sockets
 *                                        (eg for big web sites), but only if
 *                                        specifically application requested.
 *                Alan Cox        :        New buffering throughout IP. Used
 *                                        dumbly.
 *                Alan Cox        :        New buffering now used smartly.
 *                Alan Cox        :        BSD rather than common sense
 *                                        interpretation of listen.
 *                Germano Caronni        :        Assorted small races.
 *                Alan Cox        :        sendmsg/recvmsg basic support.
 *                Alan Cox        :        Only sendmsg/recvmsg now supported.
 *                Alan Cox        :        Locked down bind (see security list).
 *                Alan Cox        :        Loosened bind a little.
 *                Mike McLagan        :        ADD/DEL DLCI Ioctls
 *        Willy Konynenberg        :        Transparent proxying support.
 *                David S. Miller        :        New socket lookup architecture.
 *                                        Some other random speedups.
 *                Cyrus Durgin        :        Cleaned up file for kmod hacks.
 *                Andi Kleen        :        Fix inet_stream_connect TCP race.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/slab.h>

#include <linux/uaccess.h>

#include <linux/inet.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/ip_tunnels.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
#include <net/l3mdev.h>
#include <net/compat.h>

#include <trace/events/sock.h>

/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * It is an array of SOCK_MAX list heads.
 * NOTE(review): inetsw_lock presumably serialises updates to these
 * lists - confirm against the code that registers protocol switches.
 */
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);

/*
 * Destruction routine for inet sockets.
 *
 * Purges the receive and error queues, frees the cached rx skb if there
 * is one, and reclaims socket memory.  If the socket is a SOCK_STREAM
 * socket that is not in TCP_CLOSE, or is not flagged SOCK_DEAD, an error
 * is logged and the function returns early without releasing the
 * remaining resources.  Otherwise it warns about any leftover memory
 * accounting and releases the IP options and the cached dst entries.
 */

void inet_sock_destruct(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);

        /* Drop everything still queued on the socket */
        __skb_queue_purge(&sk->sk_receive_queue);
        if (sk->sk_rx_skb_cache) {
                __kfree_skb(sk->sk_rx_skb_cache);
                sk->sk_rx_skb_cache = NULL;
        }
        __skb_queue_purge(&sk->sk_error_queue);

        sk_mem_reclaim(sk);

        /* Refuse to tear down a socket that is still in use */
        if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
                pr_err("Attempt to release TCP socket in state %d %p\n",
                       sk->sk_state, sk);
                return;
        }
        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive inet socket %p\n", sk);
                return;
        }

        /* All memory accounting is expected to be zero at this point */
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(refcount_read(&sk->sk_wmem_alloc));
        WARN_ON(sk->sk_wmem_queued);
        WARN_ON(sk->sk_forward_alloc);

        /*
         * The RCU pointers are read with a constant-true condition;
         * presumably nothing else can reach the socket any more - confirm.
         */
        kfree(rcu_dereference_protected(inet->inet_opt, 1));
        dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
        dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
        sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);

/*
 *        The routines beyond this point handle the behaviour of an AF_INET
 *        socket object. Mostly it punts to the subprotocols of IP to do
 *        the work.
 */

/*
 *        Automatically bind an unbound socket.
 *
 *        Under the socket lock: if the socket has no local port number
 *        yet, ask the protocol's get_port() for one (passing 0) and store
 *        it in network byte order in inet_sport.  Returns -EAGAIN when
 *        get_port() fails, 0 otherwise.
 */

static int inet_autobind(struct sock *sk)
{
        struct inet_sock *inet;
        int err = 0;

        /* We may need to bind the socket. */
        lock_sock(sk);
        inet = inet_sk(sk);
        if (inet->inet_num)
                goto out;

        if (sk->sk_prot->get_port(sk, 0)) {
                err = -EAGAIN;
                goto out;
        }
        inet->inet_sport = htons(inet->inet_num);

out:
        release_sock(sk);
        return err;
}

/*
 *        Move a socket into listening state.
 *
 *        Only an unconnected SOCK_STREAM socket whose sk_state is
 *        TCP_CLOSE or TCP_LISTEN is accepted; anything else yields
 *        -EINVAL.  For a socket that is already listening only the
 *        backlog is updated.  Returns 0 on success or the error from
 *        inet_csk_listen_start().
 */
int inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char prev_state;
        int tfo_flags;
        int ret = -EINVAL;

        lock_sock(sk);

        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
                goto unlock;

        prev_state = sk->sk_state;
        if (!((1 << prev_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                goto unlock;

        WRITE_ONCE(sk->sk_max_ack_backlog, backlog);

        /* Already listening: adjusting the backlog is all we allow. */
        if (prev_state == TCP_LISTEN) {
                ret = 0;
                goto unlock;
        }

        /* Enable TFO without requiring the TCP_FASTOPEN socket option.
         * Only TCP sockets (SOCK_STREAM) get this far.  The fastopen
         * backlog may already have been set through the option if the
         * socket was listening before and was shutdown() rather than
         * close()d, hence the max_qlen test.
         */
        tfo_flags = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
        if ((tfo_flags & TFO_SERVER_WO_SOCKOPT1) &&
            (tfo_flags & TFO_SERVER_ENABLE) &&
            !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
                fastopen_queue_tune(sk, backlog);
                tcp_fastopen_init_key_once(sock_net(sk));
        }

        ret = inet_csk_listen_start(sk, backlog);
        if (ret)
                goto unlock;

        tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
        ret = 0;

unlock:
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(inet_listen);

/*
 *        Create an inet socket.
 *
 *        Looks up the inetsw entry matching sock->type and @protocol
 *        (requesting a protocol module up to two times when nothing
 *        matches), allocates the struct sock, fills in the inet
 *        defaults, hashes the socket if it already has a local number,
 *        calls the protocol's init() if it has one and, for sockets that
 *        are not kernel sockets (@kern == 0), runs the cgroup BPF socket
 *        hook.
 *
 *        Returns 0 on success or a negative errno: -EINVAL for an
 *        out-of-range @protocol, -ESOCKTNOSUPPORT or -EPROTONOSUPPORT
 *        when the lookup fails, -EPERM for a raw socket requested
 *        without CAP_NET_RAW, -ENOBUFS when sk_alloc() fails, or the
 *        error returned by hash(), init() or the BPF hook.
 */

static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        /* Number of request_module() attempts made so far (at most 2). */
        int try_loading_module = 0;
        int err;

        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;

        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
lookup_protocol:
        /* Stays -ESOCKTNOSUPPORT only if the list for this type is empty. */
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

                /* Assume a match; every "break" below leaves err == 0. */
                err = 0;
                /* Check the non-wild match. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Check for the two wild cases. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                /* This entry did not fit; try the next one. */
                err = -EPROTONOSUPPORT;
        }

        /* We are still inside the RCU read-side section here; every path
         * below leaves it before returning or retrying.
         */
        if (unlikely(err)) {
                if (try_loading_module < 2) {
                        rcu_read_unlock();
                        /*
                         * Be more specific, e.g. net-pf-2-proto-132-type-1
                         * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
                         */
                        if (++try_loading_module == 1)
                                request_module("net-pf-%d-proto-%d-type-%d",
                                               PF_INET, protocol, sock->type);
                        /*
                         * Fall back to generic, e.g. net-pf-2-proto-132
                         * (net-pf-PF_INET-proto-IPPROTO_SCTP)
                         */
                        else
                                request_module("net-pf-%d-proto-%d",
                                               PF_INET, protocol);
                        goto lookup_protocol;
                } else
                        goto out_rcu_unlock;
        }

        /* Raw sockets from user space need CAP_NET_RAW in the netns owner. */
        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        /* Copy what is needed out of the entry before leaving the RCU
         * section; "answer" is not used after this point.
         */
        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
                goto out;

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = SK_CAN_REUSE;

        if (INET_PROTOSW_ICSK & answer_flags)
                inet_init_csk_locks(sk);

        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        inet->nodefrag = 0;

        /* A raw socket stores the protocol number in inet_num; IPPROTO_RAW
         * additionally turns on hdrincl.
         */
        if (SOCK_RAW == sock->type) {
                inet->inet_num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet->hdrincl = 1;
        }

        /* Initial path-MTU discovery mode follows the per-netns sysctl. */
        if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->inet_id = 0;

        sock_init_data(sock, sk);

        sk->sk_destruct           = inet_sock_destruct;
        sk->sk_protocol           = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        /* Unicast TTL of -1 and the multicast defaults. */
        inet->uc_ttl        = -1;
        inet->mc_loop        = 1;
        inet->mc_ttl        = 1;
        inet->mc_all        = 1;
        inet->mc_index        = 0;
        inet->mc_list        = NULL;
        inet->rcv_tos        = 0;

        sk_refcnt_debug_inc(sk);

        if (inet->inet_num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically
                 * shares.
                 */
                inet->inet_sport = htons(inet->inet_num);
                /* Add to protocol hash chains. */
                err = sk->sk_prot->hash(sk);
                if (err)
                        goto out_sk_release;
        }

        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err)
                        goto out_sk_release;
        }

        if (!kern) {
                err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
                if (err)
                        goto out_sk_release;
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
out_sk_release:
        /* Release the new sock and detach it from the struct socket. */
        sk_common_release(sk);
        sock->sk = NULL;
        goto out;
}


/*
 *        The peer socket should always be NULL (or else). When we call this
 *        function we are destroying the object and from then on nobody
 *        should refer to it.
 *
 *        Runs the cgroup BPF release hook for non-kernel sockets, drops
 *        multicast memberships, calls the protocol's close() and clears
 *        sock->sk.  Always returns 0.
 */
int inet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        long linger = 0;

        if (!sk)
                return 0;

        if (!sk->sk_kern_sock)
                BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);

        /* Applications forget to leave groups before exiting */
        ip_mc_drop_socket(sk);

        /* With linger set we do not return until the close is complete,
         * otherwise we return immediately; the actual closing is the
         * same either way.  A close caused by the process exiting never
         * lingers.
         */
        if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
                linger = sk->sk_lingertime;

        sk->sk_prot->close(sk, linger);
        sock->sk = NULL;
        return 0;
}
EXPORT_SYMBOL(inet_release);

/*
 * bind() entry point for inet sockets.
 *
 * A protocol that supplies its own bind() (RAW) handles the request
 * entirely.  Otherwise the address length is checked, the cgroup BPF
 * bind hook runs, and __inet_bind() does the work with the socket lock
 * taken (BIND_WITH_LOCK).  Returns 0 or a negative errno.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        int rc;

        /* If the socket has its own bind function then use it. (RAW) */
        if (sk->sk_prot->bind)
                return sk->sk_prot->bind(sk, uaddr, addr_len);

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        /* The BPF prog runs before any other check so that a prog which
         * changes the context in a wrong way gets caught by those checks.
         */
        rc = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
        if (!rc)
                rc = __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);

        return rc;
}
EXPORT_SYMBOL(inet_bind);

/*
 * Bind @sk to the IPv4 address and port found in @uaddr.
 *
 * @addr_len is not examined here; inet_bind() checks it before calling.
 * @flags:
 *   BIND_WITH_LOCK              - take and release the socket lock here.
 *   BIND_FORCE_ADDRESS_NO_PORT  - when no port was requested, skip the
 *                                 get_port() call (same effect as the
 *                                 socket's bind_address_no_port setting).
 *   BIND_FROM_BPF               - skip the cgroup post-bind BPF hook.
 *
 * Returns 0 on success, otherwise -EAFNOSUPPORT, -EADDRNOTAVAIL, -EACCES,
 * -EINVAL, -EADDRINUSE or the error returned by the post-bind hook.
 */
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
                u32 flags)
{
        struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        unsigned short snum;
        int chk_addr_ret;
        u32 tb_id = RT_TABLE_LOCAL;
        int err;

        if (addr->sin_family != AF_INET) {
                /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
                 * only if s_addr is INADDR_ANY.
                 */
                err = -EAFNOSUPPORT;
                if (addr->sin_family != AF_UNSPEC ||
                    addr->sin_addr.s_addr != htonl(INADDR_ANY))
                        goto out;
        }

        /* Use the l3mdev FIB table of the bound device if there is one,
         * otherwise keep RT_TABLE_LOCAL, and classify the address.
         */
        tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
        chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

        /* Not specified by any standard per-se, however it breaks too
         * many applications when removed.  It is unfortunate since
         * allowing applications to make a non-local bind solves
         * several problems with systems using dynamic addressing.
         * (ie. your servers still start up even if your ISDN link
         *  is temporarily down)
         */
        err = -EADDRNOTAVAIL;
        if (!inet_can_nonlocal_bind(net, inet) &&
            addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
            chk_addr_ret != RTN_LOCAL &&
            chk_addr_ret != RTN_MULTICAST &&
            chk_addr_ret != RTN_BROADCAST)
                goto out;

        /* Ports for which inet_port_requires_bind_service() says so need
         * CAP_NET_BIND_SERVICE.
         */
        snum = ntohs(addr->sin_port);
        err = -EACCES;
        if (snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                goto out;

        /*      We keep a pair of addresses. rcv_saddr is the one
         *      used by hash lookups, and saddr is used for transmit.
         *
         *      In the BSD API these are the same except where it
         *      would be illegal to use them (multicast/broadcast) in
         *      which case the sending device address is used.
         */
        if (flags & BIND_WITH_LOCK)
                lock_sock(sk);

        /* Check these errors (active socket, double bind). */
        err = -EINVAL;
        if (sk->sk_state != TCP_CLOSE || inet->inet_num)
                goto out_release_sock;

        inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
        if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
                inet->inet_saddr = 0;  /* Use device */

        /* Make sure we are allowed to bind here. */
        if (snum || !(inet->bind_address_no_port ||
                      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
                if (sk->sk_prot->get_port(sk, snum)) {
                        /* Undo the address assignment made above. */
                        inet->inet_saddr = inet->inet_rcv_saddr = 0;
                        err = -EADDRINUSE;
                        goto out_release_sock;
                }
                if (!(flags & BIND_FROM_BPF)) {
                        err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
                        if (err) {
                                /* NOTE(review): only the addresses are
                                 * reset here; the port obtained from
                                 * get_port() above is not visibly given
                                 * back on this path - confirm whether a
                                 * release is needed.
                                 */
                                inet->inet_saddr = inet->inet_rcv_saddr = 0;
                                goto out_release_sock;
                        }
                }
        }

        /* Record which parts of the binding were chosen by the user. */
        if (inet->inet_rcv_saddr)
                sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
        if (snum)
                sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
        inet->inet_sport = htons(inet->inet_num);
        inet->inet_daddr = 0;
        inet->inet_dport = 0;
        sk_dst_reset(sk);
        err = 0;
out_release_sock:
        if (flags & BIND_WITH_LOCK)
                release_sock(sk);
out:
        return err;
}

/*
 * connect() for datagram-style inet sockets.
 *
 * An AF_UNSPEC address disconnects the socket.  Otherwise the optional
 * pre-connect BPF path runs, an unbound socket is autobound, and the
 * protocol's connect() is called.  Returns 0 or a negative errno
 * (-EINVAL for a too-short address, -EAGAIN if autobind fails).
 */
int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;
        int rc;

        if (addr_len < sizeof(uaddr->sa_family))
                return -EINVAL;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);

        if (uaddr->sa_family == AF_UNSPEC)
                return prot->disconnect(sk, flags);

        if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
                rc = prot->pre_connect(sk, uaddr, addr_len);
                if (rc)
                        return rc;
        }

        /* Pick a local port first if the socket does not have one yet. */
        if (data_race(!inet_sk(sk)->inet_num)) {
                if (inet_autobind(sk))
                        return -EAGAIN;
        }

        return prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);

/*
 * Sleep until the socket leaves TCP_SYN_SENT/TCP_SYN_RECV, a signal is
 * pending for the current task, or the timeout runs out.
 *
 * The loop drops the socket lock around the sleep and retakes it
 * afterwards, so the caller is expected to hold the lock on entry and
 * still holds it on return.  @writebias is added to sk_write_pending for
 * the duration of the wait.
 *
 * Returns the remaining timeout as last reported by wait_woken()
 * (@timeo unchanged if the loop body never ran).
 */
static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(sk_sleep(sk), &wait);
        sk->sk_write_pending += writebias;
        sk->sk_wait_pending++;

        /* Basic assumption: if someone sets sk->sk_err, he _must_
         * change state of the socket from TCP_SYN_*.
         * Connect() does not allow to get error notifications
         * without closing the socket.
         */
        while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                release_sock(sk);
                timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
                lock_sock(sk);
                if (signal_pending(current) || !timeo)
                        break;
        }
        /* Undo the bookkeeping done before the wait. */
        remove_wait_queue(sk_sleep(sk), &wait);
        sk->sk_write_pending -= writebias;
        sk->sk_wait_pending--;
        return timeo;
}

/*
 *        Connect to a remote host. There is regrettably still a little
 *        TCP 'magic' in here.
 *
 *        This function does not take the socket lock itself;
 *        inet_stream_connect() is the wrapper that does.  It may sleep
 *        in inet_wait_for_connect(), which drops and retakes the lock.
 *
 *        @is_sendmsg only changes the error reported while a deferred
 *        connect is pending in SS_CONNECTING: -EINPROGRESS when set,
 *        -EISCONN otherwise.
 *
 *        Returns 0 once the socket is connected, or a negative errno.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                          int addr_len, int flags, int is_sendmsg)
{
        struct sock *sk = sock->sk;
        int err;
        long timeo;

        /*
         * uaddr can be NULL and addr_len can be 0 if:
         * sk is a TCP fastopen active socket and
         * TCP_FASTOPEN_CONNECT sockopt is set and
         * we already have a valid cookie for this socket.
         * In this case, user can call write() after connect().
         * write() will invoke tcp_sendmsg_fastopen() which calls
         * __inet_stream_connect().
         */
        if (uaddr) {
                if (addr_len < sizeof(uaddr->sa_family))
                        return -EINVAL;

                /* AF_UNSPEC means "disconnect". */
                if (uaddr->sa_family == AF_UNSPEC) {
                        err = sk->sk_prot->disconnect(sk, flags);
                        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                        goto out;
                }
        }

        switch (sock->state) {
        default:
                err = -EINVAL;
                goto out;
        case SS_CONNECTED:
                err = -EISCONN;
                goto out;
        case SS_CONNECTING:
                if (inet_sk(sk)->defer_connect)
                        err = is_sendmsg ? -EINPROGRESS : -EISCONN;
                else
                        err = -EALREADY;
                /* Fall out of switch with err, set for this state */
                break;
        case SS_UNCONNECTED:
                err = -EISCONN;
                if (sk->sk_state != TCP_CLOSE)
                        goto out;

                if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
                        err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
                        if (err)
                                goto out;
                }

                err = sk->sk_prot->connect(sk, uaddr, addr_len);
                if (err < 0)
                        goto out;

                sock->state = SS_CONNECTING;

                /* Deferred connect: report success without waiting. */
                if (!err && inet_sk(sk)->defer_connect)
                        goto out;

                /* Just entered SS_CONNECTING state; the only
                 * difference is that return value in non-blocking
                 * case is EINPROGRESS, rather than EALREADY.
                 */
                err = -EINPROGRESS;
                break;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                /* writebias is 1 only for a TCP socket that has a fastopen
                 * request with data attached.
                 */
                int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                                tcp_sk(sk)->fastopen_req &&
                                tcp_sk(sk)->fastopen_req->data ? 1 : 0;

                /* Error code is set above */
                /* No time to wait, or the wait used up the whole timeout:
                 * return the err chosen in the switch.
                 */
                if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
                        goto out;

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
        }

        /* Connection was closed by RST, timeout, ICMP error
         * or another process disconnected us.
         */
        if (sk->sk_state == TCP_CLOSE)
                goto sock_error;

        /* sk->sk_err may be not zero now, if RECVERR was ordered by user
         * and error was received after socket entered established state.
         * Hence, it is handled normally after connect() return successfully.
         */

        sock->state = SS_CONNECTED;
        err = 0;
out:
        return err;

sock_error:
        /* Report the pending socket error (or -ECONNABORTED if there is
         * none) and bring the socket back to an unconnected state.
         */
        err = sock_error(sk) ? : -ECONNABORTED;
        sock->state = SS_UNCONNECTED;
        if (sk->sk_prot->disconnect(sk, flags))
                sock->state = SS_DISCONNECTING;
        goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);

/*
 * Locked wrapper around __inet_stream_connect(): holds the socket lock
 * for the duration of the call and passes is_sendmsg == 0.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        int rc;

        lock_sock(sk);
        rc = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
        release_sock(sk);

        return rc;
}
EXPORT_SYMBOL(inet_stream_connect);

/*
 *        Accept a pending connection. The TCP layer now gives BSD semantics.
 *
 *        Asks the protocol for a new sock, grafts it onto @newsock and
 *        marks @newsock SS_CONNECTED.  Returns 0 on success; when the
 *        protocol hands back no sock, the error it stored (initially
 *        -EINVAL) is returned.
 */

int inet_accept(struct socket *sock, struct socket *newsock, int flags,
                bool kern)
{
        struct sock *listener = sock->sk;
        struct sock *child;
        int err = -EINVAL;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        child = READ_ONCE(listener->sk_prot)->accept(listener, flags, &err,
                                                     kern);
        if (!child)
                return err;

        lock_sock(child);

        sock_rps_record_flow(child);
        WARN_ON(!((1 << child->sk_state) &
                  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
                   TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
                   TCPF_CLOSING | TCPF_CLOSE_WAIT |
                   TCPF_CLOSE)));

        sock_graft(child, newsock);
        newsock->state = SS_CONNECTED;

        release_sock(child);
        return 0;
}
EXPORT_SYMBOL(inet_accept);

/*
 *        This does both peername and sockname.
 *
 *        With @peer == 0 the local address is reported (inet_rcv_saddr,
 *        or inet_saddr when that is zero).  With @peer != 0 the remote
 *        address is reported; -ENOTCONN is returned when there is no
 *        destination port, or when @peer == 1 and the socket is in
 *        TCP_CLOSE or TCP_SYN_SENT.  On success the size of the
 *        sockaddr_in that was filled in is returned.
 */
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                 int peer)
{
        struct sock *sk = sock->sk;
        struct inet_sock *isk = inet_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);

        sin->sin_family = AF_INET;

        if (!peer) {
                __be32 local = isk->inet_rcv_saddr;

                if (!local)
                        local = isk->inet_saddr;
                sin->sin_port = isk->inet_sport;
                sin->sin_addr.s_addr = local;
        } else {
                if (!isk->inet_dport)
                        return -ENOTCONN;
                if (peer == 1 &&
                    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
                        return -ENOTCONN;
                sin->sin_port = isk->inet_dport;
                sin->sin_addr.s_addr = isk->inet_daddr;
        }

        /* Run the matching cgroup BPF getname hook on the result. */
        if (cgroup_bpf_enabled)
                BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
                                            peer ? BPF_CGROUP_INET4_GETPEERNAME :
                                                   BPF_CGROUP_INET4_GETSOCKNAME,
                                            NULL);

        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
        return sizeof(*sin);
}
EXPORT_SYMBOL(inet_getname);

/*
 * Common preparation before sending: records the RPS flow and autobinds
 * the socket when it has no local port and the protocol does not set
 * no_autobind.  Returns 0, or -EAGAIN if autobind fails.
 */
int inet_send_prepare(struct sock *sk)
{
        sock_rps_record_flow(sk);

        /* Already bound: nothing more to do. */
        if (!data_race(!inet_sk(sk)->inet_num))
                return 0;

        /* The protocol opted out of automatic binding. */
        if (sk->sk_prot->no_autobind)
                return 0;

        /* We may need to bind the socket. */
        return inet_autobind(sk) ? -EAGAIN : 0;
}
EXPORT_SYMBOL_GPL(inet_send_prepare);

/*
 * sendmsg() for inet sockets: runs inet_send_prepare() and then hands
 * the message to the protocol's sendmsg().  Returns -EAGAIN when the
 * preparation fails, otherwise the protocol's return value.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        int prep_err = inet_send_prepare(sk);

        if (unlikely(prep_err))
                return -EAGAIN;

        return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
                               sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);

/*
 * sendpage() for inet sockets: after inet_send_prepare() the request
 * goes to the protocol's sendpage() when it has one and to
 * sock_no_sendpage() otherwise.  Returns -EAGAIN when the preparation
 * fails.
 */
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
                      size_t size, int flags)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;

        if (unlikely(inet_send_prepare(sk)))
                return -EAGAIN;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);
        if (!prot->sendpage)
                return sock_no_sendpage(sock, page, offset, size, flags);

        return prot->sendpage(sk, page, offset, size, flags);
}
EXPORT_SYMBOL(inet_sendpage);

INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
                                          size_t, int, int, int *));
/*
 * recvmsg() for inet sockets.
 *
 * MSG_DONTWAIT is split out of @flags and passed to the protocol's
 * recvmsg() as a separate argument.  The RPS flow is recorded unless
 * the error queue is being read.  On a non-negative result the address
 * length reported by the protocol is stored in msg->msg_namelen.
 */
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags)
{
        struct sock *sk = sock->sk;
        int nonblock = flags & MSG_DONTWAIT;
        int name_len = 0;
        int ret;

        if (likely(!(flags & MSG_ERRQUEUE)))
                sock_rps_record_flow(sk);

        ret = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                              sk, msg, size, nonblock,
                              flags & ~MSG_DONTWAIT, &name_len);
        if (ret < 0)
                return ret;

        msg->msg_namelen = name_len;
        return ret;
}
EXPORT_SYMBOL(inet_recvmsg);

/*
 * shutdown() for inet sockets.
 *
 * @how arrives as 0, 1 or 2 and is incremented so that it can be tested
 * as a bit mask against RCV_SHUTDOWN / SHUTDOWN_MASK.
 *
 * Returns 0 on success, -EINVAL for an invalid @how, -ENOTCONN when the
 * socket is in TCP_CLOSE, or the result of the protocol's disconnect()
 * for the TCP_LISTEN / TCP_SYN_SENT cases.
 */
int inet_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        int err = 0;

        /* This should really check to make sure
         * the socket is a TCP socket. (WHY AC...)
         */
        how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
                       1->2 bit 2 snds.
                       2->3 */
        if ((how & ~SHUTDOWN_MASK) || !how)        /* MAXINT->0 */
                return -EINVAL;

        lock_sock(sk);
        /* Resolve an in-progress connect into a definite socket state. */
        if (sock->state == SS_CONNECTING) {
                if ((1 << sk->sk_state) &
                    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
                        sock->state = SS_DISCONNECTING;
                else
                        sock->state = SS_CONNECTED;
        }

        switch (sk->sk_state) {
        case TCP_CLOSE:
                err = -ENOTCONN;
                /* Hack to wake up other listeners, who can poll for
                   EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
                fallthrough;
        default:
                /* Record the shutdown bits and let the protocol act on them. */
                WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
                if (sk->sk_prot->shutdown)
                        sk->sk_prot->shutdown(sk, how);
                break;

        /* Remaining two branches are temporary solution for missing
         * close() in multithreaded environment. It is _not_ a good idea,
         * but we have no choice until close() is repaired at VFS level.
         */
        case TCP_LISTEN:
                /* A listener is only torn down when receive is shut down. */
                if (!(how & RCV_SHUTDOWN))
                        break;
                fallthrough;
        case TCP_SYN_SENT:
                err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
                sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                break;
        }

        /* Wake up anyone sleeping in poll. */
        sk->sk_state_change(sk);
        release_sock(sk);
        return err;
}
EXPORT_SYMBOL(inet_shutdown);

/*
 *        ioctl() calls you can issue on an INET socket. Most of these are
 *        device configuration and stuff and very rarely used. Some ioctls
 *        pass on to the socket itself.
 *
 *        NOTE: I like the idea of a module for the config stuff. ie ifconfig
 *        loads the devconfigure module does its configuring and unloads it.
 *        There's a good 20K of config code hanging around the kernel.
 *
 *        Routing requests go to ip_rt_ioctl(), ARP requests to
 *        arp_ioctl() and interface address requests to devinet_ioctl();
 *        anything else is handed to the protocol's ioctl() if it has
 *        one.  Returns the handler's result, -EFAULT when copying the
 *        argument from/to user space fails, -EINVAL for SIOCRTMSG, or
 *        -ENOIOCTLCMD when nobody handles @cmd.
 */

int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;
        int err = 0;
        struct net *net = sock_net(sk);
        void __user *p = (void __user *)arg;
        struct ifreq ifr;
        struct rtentry rt;

        switch (cmd) {
        case SIOCADDRT:
        case SIOCDELRT:
                if (copy_from_user(&rt, p, sizeof(struct rtentry)))
                        return -EFAULT;
                err = ip_rt_ioctl(net, cmd, &rt);
                break;
        case SIOCRTMSG:
                err = -EINVAL;
                break;
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
                err = arp_ioctl(net, cmd, (void __user *)arg);
                break;
        /* "Get" requests: the filled-in ifreq is copied back on success. */
        case SIOCGIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCGIFPFLAGS:
                if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
                        return -EFAULT;
                err = devinet_ioctl(net, cmd, &ifr);
                if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq)))
                        err = -EFAULT;
                break;

        /* "Set" requests: nothing is copied back to user space. */
        case SIOCSIFADDR:
        case SIOCSIFBRDADDR:
        case SIOCSIFNETMASK:
        case SIOCSIFDSTADDR:
        case SIOCSIFPFLAGS:
        case SIOCSIFFLAGS:
                if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
                        return -EFAULT;
                err = devinet_ioctl(net, cmd, &ifr);
                break;
        default:
                if (sk->sk_prot->ioctl)
                        err = sk->sk_prot->ioctl(sk, cmd, arg);
                else
                        err = -ENOIOCTLCMD;
                break;
        }
        return err;
}
EXPORT_SYMBOL(inet_ioctl);

#ifdef CONFIG_COMPAT
/*
 * Build a native struct rtentry from the compat layout at @ur and pass
 * it to ip_rt_ioctl().
 *
 * Three struct sockaddr worth of data are copied in one go starting at
 * rt_dst; the remaining fields are fetched one by one and the device
 * pointer is converted with compat_ptr().  Returns -EFAULT if any user
 * access fails, otherwise the result of ip_rt_ioctl().
 */
static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
                struct compat_rtentry __user *ur)
{
        compat_uptr_t rtdev;
        struct rtentry rt;

        if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
                        3 * sizeof(struct sockaddr)) ||
            get_user(rt.rt_flags, &ur->rt_flags) ||
            get_user(rt.rt_metric, &ur->rt_metric) ||
            get_user(rt.rt_mtu, &ur->rt_mtu) ||
            get_user(rt.rt_window, &ur->rt_window) ||
            get_user(rt.rt_irtt, &ur->rt_irtt) ||
            get_user(rtdev, &ur->rt_dev))
                return -EFAULT;

        rt.rt_dev = compat_ptr(rtdev);
        return ip_rt_ioctl(sock_net(sk), cmd, &rt);
}

/*
 * Compat ioctl entry point: the two routing commands are translated by
 * inet_compat_routing_ioctl(), everything else goes to the protocol's
 * compat_ioctl() or yields -ENOIOCTLCMD when it has none.
 */
static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        void __user *argp = compat_ptr(arg);
        struct sock *sk = sock->sk;

        if (cmd == SIOCADDRT || cmd == SIOCDELRT)
                return inet_compat_routing_ioctl(sk, cmd, argp);

        if (sk->sk_prot->compat_ioctl)
                return sk->sk_prot->compat_ioctl(sk, cmd, arg);

        return -ENOIOCTLCMD;
}
#endif /* CONFIG_COMPAT */

/*
 * Socket operations for stream inet sockets; inetsw_array[] attaches
 * them to the SOCK_STREAM / IPPROTO_TCP entry.  The generic inet_*
 * handlers are combined with TCP-specific poll, mmap, splice and
 * locked-send helpers.
 */
const struct proto_ops inet_stream_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_stream_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = inet_accept,
        .getname           = inet_getname,
        .poll                   = tcp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = inet_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
#ifdef CONFIG_MMU
        .mmap                   = tcp_mmap,
#endif
        .sendpage           = inet_sendpage,
        .splice_read           = tcp_splice_read,
        .read_sock           = tcp_read_sock,
        .sendmsg_locked    = tcp_sendmsg_locked,
        .sendpage_locked   = tcp_sendpage_locked,
        .peek_len           = tcp_peek_len,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
        .set_rcvlowat           = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);

/*
 * Socket operations for datagram inet sockets; inetsw_array[] attaches
 * them to the SOCK_DGRAM / IPPROTO_UDP entry.  accept() and listen()
 * are wired to the sock_no_* stubs and polling is done by udp_poll().
 */
const struct proto_ops inet_dgram_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_dgram_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = sock_no_accept,
        .getname           = inet_getname,
        .poll                   = udp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
        .sendpage           = inet_sendpage,
        .set_peek_off           = sk_set_peek_off,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);

/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll (datagram_poll is used instead, and there is no set_peek_off
 * handler).  inetsw_array[] uses these ops for both the SOCK_RAW
 * wildcard entry and the SOCK_DGRAM / IPPROTO_ICMP (ping) entry.
 */
static const struct proto_ops inet_sockraw_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_dgram_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = sock_no_accept,
        .getname           = inet_getname,
        .poll                   = datagram_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
        .sendpage           = inet_sendpage,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
};

/*
 * Address-family descriptor for PF_INET.  inet_init() passes it to
 * sock_register(); its .create hook is inet_create().
 */
static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner        = THIS_MODULE,
};

/*
 * Built-in (socket type, protocol) pairings.  inet_init() walks this array
 * once at start-up and hands every element to inet_register_protosw(),
 * which links it into the inetsw[] list of its socket type.
 */
static struct inet_protosw inetsw_array[] = {
        {        /* TCP */
                .type     = SOCK_STREAM,
                .protocol = IPPROTO_TCP,
                .prot     = &tcp_prot,
                .ops      = &inet_stream_ops,
                .flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
        },
        {        /* UDP */
                .type     = SOCK_DGRAM,
                .protocol = IPPROTO_UDP,
                .prot     = &udp_prot,
                .ops      = &inet_dgram_ops,
                .flags    = INET_PROTOSW_PERMANENT,
        },
        {        /* ICMP datagram ("ping") sockets */
                .type     = SOCK_DGRAM,
                .protocol = IPPROTO_ICMP,
                .prot     = &ping_prot,
                .ops      = &inet_sockraw_ops,
                .flags    = INET_PROTOSW_REUSE,
        },
        {        /* raw sockets; IPPROTO_IP acts as the wild card */
                .type     = SOCK_RAW,
                .protocol = IPPROTO_IP,
                .prot     = &raw_prot,
                .ops      = &inet_sockraw_ops,
                .flags    = INET_PROTOSW_REUSE,
        },
};

/* Number of entries in inetsw_array[]. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

/*
 * inet_register_protosw - add a socket type/protocol entry to inetsw[]
 * @p: entry to link into the list selected by p->type
 *
 * The entry is refused (with a pr_err() message) when p->type is not below
 * SOCK_MAX or when a permanent entry with the same protocol number is
 * already on the list.  Otherwise it is inserted directly behind the
 * leading run of permanent entries, so it can never shadow a permanent
 * entry on a wild-card match but does take precedence over older
 * non-permanent entries.  Removing it later restores the old behaviour.
 *
 * All list manipulation happens under inetsw_lock.
 */
void inet_register_protosw(struct inet_protosw *p)
{
        const int protocol = p->protocol;
        struct list_head *insert_after;
        struct list_head *pos;

        spin_lock_bh(&inetsw_lock);

        if (p->type >= SOCK_MAX) {
                pr_err("Ignoring attempt to register invalid socket type %d\n",
                       p->type);
                goto unlock;
        }

        /* Walk the permanent entries, which sit at the head of the list. */
        insert_after = &inetsw[p->type];
        list_for_each(pos, &inetsw[p->type]) {
                struct inet_protosw *cur;

                cur = list_entry(pos, struct inet_protosw, list);

                /* First non-permanent entry ends the scan. */
                if (!(cur->flags & INET_PROTOSW_PERMANENT))
                        break;

                /* Exact protocol match only; the wild card is not compared. */
                if (cur->protocol == protocol) {
                        pr_err("Attempt to override permanent protocol %d\n",
                               protocol);
                        goto unlock;
                }

                insert_after = pos;
        }

        list_add_rcu(&p->list, insert_after);

unlock:
        spin_unlock_bh(&inetsw_lock);
}
EXPORT_SYMBOL(inet_register_protosw);

/*
 * inet_unregister_protosw - remove an entry added by inet_register_protosw()
 * @p: entry to unlink
 *
 * Entries flagged INET_PROTOSW_PERMANENT are never removed; the attempt is
 * only logged.  For any other entry the function unlinks it under
 * inetsw_lock and then calls synchronize_net() before returning.
 */
void inet_unregister_protosw(struct inet_protosw *p)
{
        if (p->flags & INET_PROTOSW_PERMANENT) {
                pr_err("Attempt to unregister permanent protocol %d\n",
                       p->protocol);
                return;
        }

        spin_lock_bh(&inetsw_lock);
        list_del_rcu(&p->list);
        spin_unlock_bh(&inetsw_lock);

        synchronize_net();
}
EXPORT_SYMBOL(inet_unregister_protosw);

/*
 * Re-route a socket and, if routing now yields a different source address,
 * switch the socket over to it and rehash the socket.
 *
 * Returns 0 when the source address is unchanged, the PTR_ERR() value of
 * the failed route lookup, or otherwise the result of __sk_prot_rehash().
 */
static int inet_sk_reselect_saddr(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        __be32 prev_saddr = inet->inet_saddr;
        __be32 dst = inet->inet_daddr;
        struct ip_options_rcu *opts;
        struct flowi4 *fl4;
        struct rtable *rt;
        __be32 new_saddr;

        /* With a source-route option the first hop is the routing target. */
        opts = rcu_dereference_protected(inet->inet_opt,
                                         lockdep_sock_is_held(sk));
        if (opts && opts->opt.srr)
                dst = opts->opt.faddr;

        /* Look up a fresh route; the flow key lives in the socket's cork. */
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, dst, 0, RT_CONN_FLAGS(sk),
                              sk->sk_bound_dev_if, sk->sk_protocol,
                              inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        sk_setup_caps(sk, &rt->dst);

        new_saddr = fl4->saddr;
        if (new_saddr == prev_saddr)
                return 0;

        /* ip_dynaddr values above 1 additionally log the address change. */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1)
                pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
                        __func__, &prev_saddr, &new_saddr);

        inet->inet_rcv_saddr = new_saddr;
        inet->inet_saddr = new_saddr;

        /*
         * XXX This is the one ugly place where a socket's identity is
         * XXX changed after it has already entered the hashes. -DaveM
         *
         * Connection uniqueness is not re-checked here either.
         */
        return __sk_prot_rehash(sk);
}

/*
 * inet_sk_rebuild_header - make sure the socket has a usable route
 * @sk: socket to check
 *
 * Returns 0 if the cached route is still valid or a new one was found.
 * When the lookup fails, the route capabilities are cleared and, if
 * ip_dynaddr is enabled, the socket is in TCP_SYN_SENT and its local
 * address is not locked by a bind, a new source address is tried through
 * inet_sk_reselect_saddr().  Any remaining error is returned and its
 * positive value stored in sk->sk_err_soft.
 */
int inet_sk_rebuild_header(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *opts;
        struct flowi4 *fl4;
        struct rtable *rt;
        __be32 dst;
        int err;

        /* A still-valid cached route means there is nothing to do. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt)
                return 0;

        /* Pick the routing target: first hop of a source route, if any. */
        rcu_read_lock();
        opts = rcu_dereference(inet->inet_opt);
        dst = inet->inet_daddr;
        if (opts && opts->opt.srr)
                dst = opts->opt.faddr;
        rcu_read_unlock();

        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_output_ports(sock_net(sk), fl4, sk, dst, inet->inet_saddr,
                                   inet->inet_dport, inet->inet_sport,
                                   sk->sk_protocol, RT_CONN_FLAGS(sk),
                                   sk->sk_bound_dev_if);
        if (!IS_ERR(rt)) {
                sk_setup_caps(sk, &rt->dst);
                return 0;
        }

        /* Routing failed. */
        err = PTR_ERR(rt);
        sk->sk_route_caps = 0;

        /*
         * Other protocols must map their equivalent state to TCP_SYN_SENT;
         * DCCP maps DCCP_REQUESTING onto it. -acme
         */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) &&
            sk->sk_state == TCP_SYN_SENT &&
            !(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                err = inet_sk_reselect_saddr(sk);

        if (err)
                sk->sk_err_soft = -err;

        return err;
}
EXPORT_SYMBOL(inet_sk_rebuild_header);

/*
 * Change sk->sk_state with a plain store, reporting the old and new state
 * to the inet_sock_set_state tracepoint first.
 */
void inet_sk_set_state(struct sock *sk, int state)
{
        const int prev_state = sk->sk_state;

        trace_inet_sock_set_state(sk, prev_state, state);
        sk->sk_state = state;
}
EXPORT_SYMBOL(inet_sk_set_state);

/*
 * Same as inet_sk_set_state(), except that the new state is published
 * with smp_store_release() rather than a plain store.
 */
void inet_sk_state_store(struct sock *sk, int newstate)
{
        const int prev_state = sk->sk_state;

        trace_inet_sock_set_state(sk, prev_state, newstate);
        smp_store_release(&sk->sk_state, newstate);
}

/*
 * inet_gso_segment - GSO segmentation callback for IPv4
 * @skb: packet to segment; the network header is reset to the current
 *       data pointer, so the IPv4 header is expected there
 * @features: device features passed on to the next-level segmenter
 *
 * Pulls the IPv4 header, hands the payload to the gso_segment callback
 * registered in inet_offloads[] for the header's protocol, and afterwards
 * rewrites frag_off / id / tot_len / checksum in the IPv4 header of every
 * returned segment.
 *
 * Return: whatever the callback returned (a segment list, NULL or an
 * ERR_PTR), ERR_PTR(-EINVAL) if the IPv4 header cannot be pulled or its
 * ihl is too small, or ERR_PTR(-EPROTONOSUPPORT) if no callback exists
 * for the protocol or a fixed-ID request arrives without the DF bit.
 */
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                 netdev_features_t features)
{
        bool udpfrag = false, fixedid = false, gso_partial, encap;
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        const struct net_offload *ops;
        unsigned int offset = 0;
        struct iphdr *iph;
        int proto, tot_len;
        int nhoff;
        int ihl;
        int id;

        skb_reset_network_header(skb);
        /* Distance from the MAC header to the IPv4 header; used to find
         * the IPv4 header again in each output segment.
         */
        nhoff = skb_network_header(skb) - skb_mac_header(skb);
        if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
                goto out;

        iph = ip_hdr(skb);
        ihl = iph->ihl * 4;
        if (ihl < sizeof(*iph))
                goto out;

        /* Cache what is needed later; iph may be invalidated below. */
        id = ntohs(iph->id);
        proto = iph->protocol;

        /* Warning: after this point, iph might be no longer valid */
        if (unlikely(!pskb_may_pull(skb, ihl)))
                goto out;
        __skb_pull(skb, ihl);

        /* A non-zero encap_level means an outer layer was already seen. */
        encap = SKB_GSO_CB(skb)->encap_level > 0;
        if (encap)
                features &= skb->dev->hw_enc_features;
        SKB_GSO_CB(skb)->encap_level += ihl;

        skb_reset_transport_header(skb);

        /* From here on, the default failure is "no handler". */
        segs = ERR_PTR(-EPROTONOSUPPORT);

        if (!skb->encapsulation || encap) {
                udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
                fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);

                /* fixed ID is invalid if DF bit is not set */
                if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
                        goto out;
        }

        ops = rcu_dereference(inet_offloads[proto]);
        if (likely(ops && ops->callbacks.gso_segment)) {
                segs = ops->callbacks.gso_segment(skb, features);
                /* NULL result: put the network header offset back. */
                if (!segs)
                        skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
        }

        if (IS_ERR_OR_NULL(segs))
                goto out;

        gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

        /* Fix up the IPv4 header of every segment in the returned list. */
        skb = segs;
        do {
                iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
                if (udpfrag) {
                        /* UDP fragmentation: fragment offset in 8-byte
                         * units, MF on all but the last segment.
                         */
                        iph->frag_off = htons(offset >> 3);
                        if (skb->next)
                                iph->frag_off |= htons(IP_MF);
                        offset += skb->len - nhoff - ihl;
                        tot_len = skb->len - nhoff;
                } else if (skb_is_gso(skb)) {
                        /* Segment is itself still GSO: advance the ID by
                         * the number of segments it stands for.
                         */
                        if (!fixedid) {
                                iph->id = htons(id);
                                id += skb_shinfo(skb)->gso_segs;
                        }

                        if (gso_partial)
                                tot_len = skb_shinfo(skb)->gso_size +
                                          SKB_GSO_CB(skb)->data_offset +
                                          skb->head - (unsigned char *)iph;
                        else
                                tot_len = skb->len - nhoff;
                } else {
                        if (!fixedid)
                                iph->id = htons(id++);
                        tot_len = skb->len - nhoff;
                }
                iph->tot_len = htons(tot_len);
                ip_send_check(iph);
                if (encap)
                        skb_reset_inner_headers(skb);
                skb->network_header = (u8 *)iph - skb->head;
                skb_reset_mac_len(skb);
        } while ((skb = skb->next));

out:
        return segs;
}
EXPORT_SYMBOL(inet_gso_segment);

/*
 * GSO callback for IPIP: only packets whose gso_type carries
 * SKB_GSO_IPXIP4 are passed on to inet_gso_segment(); anything else is
 * rejected with ERR_PTR(-EINVAL).
 */
static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
                                        netdev_features_t features)
{
        if (skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)
                return inet_gso_segment(skb, features);

        return ERR_PTR(-EINVAL);
}

/*
 * inet_gro_receive - GRO receive callback for IPv4
 * @head: list of packets currently held for aggregation
 * @skb: newly received packet
 *
 * Validates the IPv4 header of @skb (no options, not a fragment, good
 * header checksum), compares it against every same-flow packet on @head
 * to update their same_flow / flush / flush_id state, and then passes
 * @skb to the gro_receive callback registered in inet_offloads[] for the
 * header's protocol.
 *
 * Return: the value returned by the protocol callback, or NULL when the
 * packet is rejected before the callback is reached (in which case the
 * flush value handed to skb_gro_flush_final() is still 1).
 */
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
        const struct net_offload *ops;
        struct sk_buff *pp = NULL;
        const struct iphdr *iph;
        struct sk_buff *p;
        unsigned int hlen;
        unsigned int off;
        unsigned int id;
        int flush = 1;
        int proto;

        /* Locate the IPv4 header, using the slow path if required. */
        off = skb_gro_offset(skb);
        hlen = off + sizeof(*iph);
        iph = skb_gro_header_fast(skb, off);
        if (skb_gro_header_hard(skb, hlen)) {
                iph = skb_gro_header_slow(skb, hlen, off);
                if (unlikely(!iph))
                        goto out;
        }

        proto = iph->protocol;

        ops = rcu_dereference(inet_offloads[proto]);
        if (!ops || !ops->callbacks.gro_receive)
                goto out;

        /* First byte must be 0x45: version 4, header length 5 words. */
        if (*(u8 *)iph != 0x45)
                goto out;

        if (ip_is_fragment(iph))
                goto out;

        if (unlikely(ip_fast_csum((u8 *)iph, 5)))
                goto out;

        /* Load id and frag_off as one 32-bit word.  The low 16 bits of
         * flush become non-zero if tot_len differs from the GRO length
         * or if any frag_off bit other than DF is set.
         */
        id = ntohl(*(__be32 *)&iph->id);
        flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
        id >>= 16;

        list_for_each_entry(p, head, list) {
                struct iphdr *iph2;
                u16 flush_id;

                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                iph2 = (struct iphdr *)(p->data + off);
                /* The above works because, with the exception of the top
                 * (inner most) layer, we only aggregate pkts with the same
                 * hdr length so all the hdrs we'll need to verify will start
                 * at the same offset.
                 */
                if ((iph->protocol ^ iph2->protocol) |
                    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
                    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }

                /* All fields must match except length and checksum. */
                NAPI_GRO_CB(p)->flush |=
                        (iph->ttl ^ iph2->ttl) |
                        (iph->tos ^ iph2->tos) |
                        ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));

                NAPI_GRO_CB(p)->flush |= flush;

                /* Store the result of the IP ID check so that it can be
                 * included later, once we can verify that this packet
                 * does in fact belong to a given flow.
                 */
                flush_id = (u16)(id - ntohs(iph2->id));

                /* This bit of code makes it much easier for us to identify
                 * the cases where we are doing atomic vs non-atomic IP ID
                 * checks.  Specifically an atomic check can return IP ID
                 * values 0 - 0xFFFF, while a non-atomic check can only
                 * return 0 or 0xFFFF.
                 */
                if (!NAPI_GRO_CB(p)->is_atomic ||
                    !(iph->frag_off & htons(IP_DF))) {
                        flush_id ^= NAPI_GRO_CB(p)->count;
                        flush_id = flush_id ? 0xFFFF : 0;
                }

                /* If the previous IP ID value was based on an atomic
                 * datagram we can overwrite the value and ignore it.
                 */
                if (NAPI_GRO_CB(skb)->is_atomic)
                        NAPI_GRO_CB(p)->flush_id = flush_id;
                else
                        NAPI_GRO_CB(p)->flush_id |= flush_id;
        }

        NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
        NAPI_GRO_CB(skb)->flush |= flush;
        skb_set_network_header(skb, off);
        /* The above will be needed by the transport layer if there is one
         * immediately following this IP hdr.
         */

        /* Note : No need to call skb_gro_postpull_rcsum() here,
         * as we already checked checksum over ipv4 header was 0
         */
        skb_gro_pull(skb, sizeof(*iph));
        skb_set_transport_header(skb, skb_gro_offset(skb));

        pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
                                       ops->callbacks.gro_receive, head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}
EXPORT_SYMBOL(inet_gro_receive);

/*
 * GRO receive callback for IPIP.  A packet whose encap_mark is already set
 * is marked for flushing and not aggregated; otherwise the mark is set and
 * the inner header is processed by inet_gro_receive().
 */
static struct sk_buff *ipip_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        if (!NAPI_GRO_CB(skb)->encap_mark) {
                NAPI_GRO_CB(skb)->encap_mark = 1;
                return inet_gro_receive(head, skb);
        }

        NAPI_GRO_CB(skb)->flush = 1;
        return NULL;
}

#define SECONDS_PER_DAY        86400

/* inet_current_timestamp - Return IP network timestamp
 *
 * Return milliseconds since midnight in network byte order.
 */
__be32 inet_current_timestamp(void)
{
        u32 secs;
        u32 msecs;
        struct timespec64 ts;

        ktime_get_real_ts64(&ts);

        /* Get secs since midnight. */
        (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
        /* Convert to msecs. */
        msecs = secs * MSEC_PER_SEC;
        /* Convert nsec to msec. */
        msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;

        /* Convert to network byte order. */
        return htonl(msecs);
}
EXPORT_SYMBOL(inet_current_timestamp);

/*
 * Dispatch an error-queue receive to the handler for the socket's address
 * family: ip_recv_error() for AF_INET and, when IPv6 is enabled, the
 * ipv6_recv_error hook in pingv6_ops for AF_INET6.  Any other family
 * yields -EINVAL.
 */
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
        unsigned int family = READ_ONCE(sk->sk_family);

        switch (family) {
        case AF_INET:
                return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
#endif
        default:
                return -EINVAL;
        }
}

/*
 * inet_gro_complete - GRO completion callback for IPv4
 * @skb: aggregated packet
 * @nhoff: offset of the IPv4 header from skb->data
 *
 * Rewrites tot_len (and patches the header checksum accordingly) to cover
 * the aggregated length, then calls the gro_complete callback registered
 * in inet_offloads[] for the header's protocol.
 *
 * Return: the callback's result, or -ENOSYS (with a WARN) when no such
 * callback is registered.
 */
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
        const __be16 total_len = htons(skb->len - nhoff);
        const int proto = iph->protocol;
        const struct net_offload *ops;

        if (skb->encapsulation) {
                skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
                skb_set_inner_network_header(skb, nhoff);
        }

        /* Update the checksum incrementally for the new total length. */
        csum_replace2(&iph->check, iph->tot_len, total_len);
        iph->tot_len = total_len;

        ops = rcu_dereference(inet_offloads[proto]);
        if (WARN_ON(!ops || !ops->callbacks.gro_complete))
                return -ENOSYS;

        /* Only need to add sizeof(*iph) to get to the next hdr below
         * because any hdr with option will have been flushed in
         * inet_gro_receive().
         */
        return INDIRECT_CALL_2(ops->callbacks.gro_complete,
                               tcp4_gro_complete, udp4_gro_complete,
                               skb, nhoff + sizeof(*iph));
}
EXPORT_SYMBOL(inet_gro_complete);

/*
 * GRO completion callback for IPIP: flag the packet as encapsulated, add
 * SKB_GSO_IPXIP4 to its gso_type, then let inet_gro_complete() finish the
 * IPv4 header at @nhoff.
 */
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
        skb->encapsulation = 1;
        skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
        return inet_gro_complete(skb, nhoff);
}

/*
 * inet_ctl_sock_create - create a kernel-internal control socket
 * @sk: where to store the new socket's struct sock
 * @family: address family
 * @type: socket type
 * @protocol: protocol number
 * @net: network namespace the socket belongs to
 *
 * On success the socket uses GFP_ATOMIC allocations and is unhashed.
 *
 * Return: 0 on success, otherwise the error from sock_create_kern() (in
 * which case *@sk is left untouched).
 */
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
                         unsigned short type, unsigned char protocol,
                         struct net *net)
{
        struct socket *sock;
        int rc;

        rc = sock_create_kern(net, family, type, protocol, &sock);
        if (rc)
                return rc;

        *sk = sock->sk;
        (*sk)->sk_allocation = GFP_ATOMIC;

        /*
         * Unhash the socket so that IP input processing never sees it;
         * this socket is not meant to receive incoming packets.
         */
        (*sk)->sk_prot->unhash(*sk);

        return 0;
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);

/*
 * Read one counter from a per-CPU MIB.  The MIB is treated as an array of
 * unsigned long and @offt is the index of the wanted counter in @cpu's
 * copy.
 */
u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
{
        unsigned long *counters = (unsigned long *)per_cpu_ptr(mib, cpu);

        return counters[offt];
}
EXPORT_SYMBOL_GPL(snmp_get_cpu_field);

/*
 * Sum counter @offt of a per-CPU MIB over all possible CPUs and return the
 * total.
 */
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
        unsigned long total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                total += snmp_get_cpu_field(mib, cpu, offt);
        }

        return total;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);

#if BITS_PER_LONG==32

/*
 * 32-bit variant for 64-bit counters: read counter @offt from @cpu's copy
 * of the MIB inside a u64_stats fetch/retry loop.  @syncp_offset is the
 * byte offset of the struct u64_stats_sync within the per-CPU MIB.
 */
u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
                         size_t syncp_offset)
{
        void *base = per_cpu_ptr(mib, cpu);
        struct u64_stats_sync *sync;
        unsigned int seq;
        u64 val;

        sync = (struct u64_stats_sync *)(base + syncp_offset);

        /* Re-read until the sequence shows no concurrent update. */
        do {
                seq = u64_stats_fetch_begin_irq(sync);
                val = ((u64 *)base)[offt];
        } while (u64_stats_fetch_retry_irq(sync, seq));

        return val;
}
EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);

/*
 * Sum 64-bit counter @offt of a per-CPU MIB over all possible CPUs, reading
 * each CPU's value through snmp_get_cpu_field64().
 */
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
        u64 total = 0;
        int i;

        for_each_possible_cpu(i)
                total += snmp_get_cpu_field64(mib, i, offt, syncp_offset);

        return total;
}
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif

#ifdef CONFIG_IP_MULTICAST
/* Receive handler for IGMP; inet_init() registers it under IPPROTO_IGMP. */
static const struct net_protocol igmp_protocol = {
        .handler =        igmp_rcv,
        .netns_ok =        1,
};
#endif

/*
 * Receive and error handlers for TCP; inet_init() registers this under
 * IPPROTO_TCP.  Unlike the UDP and ICMP entries it also sets
 * icmp_strict_tag_validation.
 */
static const struct net_protocol tcp_protocol = {
        .handler        =        tcp_v4_rcv,
        .err_handler        =        tcp_v4_err,
        .no_policy        =        1,
        .netns_ok        =        1,
        .icmp_strict_tag_validation = 1,
};

/*
 * Receive and error handlers for UDP; inet_init() registers this under
 * IPPROTO_UDP.
 */
static const struct net_protocol udp_protocol = {
        .handler =        udp_rcv,
        .err_handler =        udp_err,
        .no_policy =        1,
        .netns_ok =        1,
};

/*
 * Receive and error handlers for ICMP; inet_init() registers this under
 * IPPROTO_ICMP.
 */
static const struct net_protocol icmp_protocol = {
        .handler =        icmp_rcv,
        .err_handler =        icmp_err,
        .no_policy =        1,
        .netns_ok =        1,
};

/*
 * Per-namespace constructor for the IPv4 statistics (MIB) blocks.
 *
 * Allocates the TCP, IP, Linux, UDP, UDP-Lite and ICMP per-CPU MIBs and
 * the ICMP message MIB, initialises the u64_stats sync object of every
 * CPU's IP MIB, and finally calls tcp_mib_init().
 *
 * Returns 0 on success.  If any allocation fails, everything allocated so
 * far is released in reverse order and -ENOMEM is returned.
 */
static __net_init int ipv4_mib_init_net(struct net *net)
{
        int cpu;

        net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
        if (!net->mib.tcp_statistics)
                goto fail;

        net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
        if (!net->mib.ip_statistics)
                goto free_tcp;

        for_each_possible_cpu(cpu) {
                struct ipstats_mib *ipstats;

                ipstats = per_cpu_ptr(net->mib.ip_statistics, cpu);
                u64_stats_init(&ipstats->syncp);
        }

        net->mib.net_statistics = alloc_percpu(struct linux_mib);
        if (!net->mib.net_statistics)
                goto free_ip;

        net->mib.udp_statistics = alloc_percpu(struct udp_mib);
        if (!net->mib.udp_statistics)
                goto free_net;

        net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
        if (!net->mib.udplite_statistics)
                goto free_udp;

        net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
        if (!net->mib.icmp_statistics)
                goto free_udplite;

        /* The ICMP message MIB is a plain allocation, not a per-CPU one. */
        net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
                                              GFP_KERNEL);
        if (!net->mib.icmpmsg_statistics)
                goto free_icmp;

        tcp_mib_init(net);
        return 0;

free_icmp:
        free_percpu(net->mib.icmp_statistics);
free_udplite:
        free_percpu(net->mib.udplite_statistics);
free_udp:
        free_percpu(net->mib.udp_statistics);
free_net:
        free_percpu(net->mib.net_statistics);
free_ip:
        free_percpu(net->mib.ip_statistics);
free_tcp:
        free_percpu(net->mib.tcp_statistics);
fail:
        return -ENOMEM;
}

/*
 * Per-namespace destructor for the IPv4 statistics blocks: releases
 * everything ipv4_mib_init_net() allocated, in reverse allocation order,
 * plus the MPTCP MIB when CONFIG_MPTCP is set.
 */
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
        /* Allocated with kzalloc(), hence kfree() rather than free_percpu(). */
        kfree(net->mib.icmpmsg_statistics);
        free_percpu(net->mib.icmp_statistics);
        free_percpu(net->mib.udplite_statistics);
        free_percpu(net->mib.udp_statistics);
        free_percpu(net->mib.net_statistics);
        free_percpu(net->mib.ip_statistics);
        free_percpu(net->mib.tcp_statistics);
#ifdef CONFIG_MPTCP
        /* allocated on demand, see mptcp_init_sock() */
        free_percpu(net->mib.mptcp_statistics);
#endif
}

/* Per-namespace init/exit hooks for the IPv4 MIBs; see init_ipv4_mibs(). */
static __net_initdata struct pernet_operations ipv4_mib_ops = {
        .init = ipv4_mib_init_net,
        .exit = ipv4_mib_exit_net,
};

/*
 * Register ipv4_mib_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys(); inet_init() panics on a non-zero value.
 */
static int __init init_ipv4_mibs(void)
{
        return register_pernet_subsys(&ipv4_mib_ops);
}

/*
 * Per-namespace constructor that fills in IPv4 defaults: the local port
 * range, the ping group range and a set of sysctl-backed parameters.
 * Always returns 0.
 */
static __net_init int inet_init_net(struct net *net)
{
        /*
         * Set defaults for local port range
         */
        seqlock_init(&net->ipv4.ip_local_ports.lock);
        net->ipv4.ip_local_ports.range[0] =  32768;
        net->ipv4.ip_local_ports.range[1] =  60999;

        seqlock_init(&net->ipv4.ping_group_range.lock);
        /*
         * Sane defaults - nobody may create ping sockets.
         * Boot scripts should set this to distro-specific group.
         *
         * The range is stored with its lower bound (gid 1) above its upper
         * bound (gid 0).
         */
        net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
        net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);

        /* Default values for sysctl-controlled parameters.
         * We set them here, in case sysctl is not compiled.
         */
        net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
        net->ipv4.sysctl_ip_fwd_update_priority = 1;
        net->ipv4.sysctl_ip_dynaddr = 0;
        net->ipv4.sysctl_ip_early_demux = 1;
        net->ipv4.sysctl_udp_early_demux = 1;
        net->ipv4.sysctl_tcp_early_demux = 1;
        net->ipv4.sysctl_nexthop_compat_mode = 1;
#ifdef CONFIG_SYSCTL
        net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif

        /* Some igmp sysctl, whose values are always used */
        net->ipv4.sysctl_igmp_max_memberships = 20;
        net->ipv4.sysctl_igmp_max_msf = 10;
        /* IGMP reports for link-local multicast groups are enabled by default */
        net->ipv4.sysctl_igmp_llm_reports = 1;
        net->ipv4.sysctl_igmp_qrv = 2;

        return 0;
}

/* Per-namespace hook that applies the IPv4 defaults; it has no exit hook. */
static __net_initdata struct pernet_operations af_inet_ops = {
        .init = inet_init_net,
};

/*
 * Register af_inet_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys(); inet_init() only logs a failure.
 */
static int __init init_inet_pernet_ops(void)
{
        return register_pernet_subsys(&af_inet_ops);
}

static int ipv4_proc_init(void);

/*
 *        IP protocol layer initialiser
 */

/*
 * GSO/GRO callbacks for ETH_P_IP packets; added with dev_add_offload() in
 * ipv4_offload_init().
 */
static struct packet_offload ip_packet_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .callbacks = {
                .gso_segment = inet_gso_segment,
                .gro_receive = inet_gro_receive,
                .gro_complete = inet_gro_complete,
        },
};

/*
 * GSO/GRO callbacks for IPv4-in-IPv4; registered under IPPROTO_IPIP by
 * ipip_offload_init().  Each callback is a thin wrapper around its inet_*
 * counterpart.
 */
static const struct net_offload ipip_offload = {
        .callbacks = {
                .gso_segment        = ipip_gso_segment,
                .gro_receive        = ipip_gro_receive,
                .gro_complete        = ipip_gro_complete,
        },
};

/* Register ipip_offload for IPPROTO_IPIP; returns inet_add_offload()'s result. */
static int __init ipip_offload_init(void)
{
        return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
}

/*
 * Register the IPv4 offload handlers at fs_initcall time: the UDP, TCP and
 * IPIP protocol offloads, followed by the ETH_P_IP packet offload.
 *
 * A failing protocol offload is reported with pr_crit() but does not stop
 * the remaining registrations; the function always returns 0.
 */
static int __init ipv4_offload_init(void)
{
        /*
         * Add offloads
         */
        if (udpv4_offload_init() < 0)
                pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
        if (tcpv4_offload_init() < 0)
                pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
        if (ipip_offload_init() < 0)
                pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);

        dev_add_offload(&ip_packet_offload);
        return 0;
}

fs_initcall(ipv4_offload_init);

/*
 * Packet-type handler for ETH_P_IP: ip_rcv() for single packets and
 * ip_list_rcv() for lists.  Installed by inet_init() via dev_add_pack().
 */
static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,
        .list_func = ip_list_rcv,
};

/*
 * inet_init - bring up the IPv4 stack (run as an fs_initcall)
 *
 * Registers the TCP, UDP, raw and ping protos, the PF_INET socket family,
 * the base network protocols and the built-in inetsw entries, then
 * initialises the individual IPv4 subsystems and finally installs the
 * ETH_P_IP packet handler.
 *
 * Only a proto_register() failure is returned to the caller; in that case
 * the protos registered so far are unregistered again.  Later failures
 * are either logged with pr_crit() and ignored, or fatal (panic) for the
 * MIBs and the ICMP control socket.
 *
 * Return: 0 on success or the proto_register() error code.
 */
static int __init inet_init(void)
{
        struct inet_protosw *q;
        struct list_head *r;
        int rc;

        sock_skb_cb_check_size(sizeof(struct inet_skb_parm));

        rc = proto_register(&tcp_prot, 1);
        if (rc)
                goto out;

        rc = proto_register(&udp_prot, 1);
        if (rc)
                goto out_unregister_tcp_proto;

        rc = proto_register(&raw_prot, 1);
        if (rc)
                goto out_unregister_udp_proto;

        rc = proto_register(&ping_prot, 1);
        if (rc)
                goto out_unregister_raw_proto;

        /*
         *        Tell SOCKET that we are alive...
         */

        /* The return value is deliberately discarded. */
        (void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL
        ip_static_sysctl_init();
#endif

        /*
         *        Add all the base protocols.
         */

        if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
                pr_crit("%s: Cannot add ICMP protocol\n", __func__);
        if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
                pr_crit("%s: Cannot add UDP protocol\n", __func__);
        if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
                pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
        if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
                pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

        /* Register the socket-side information for inet_create. */
        /* Every per-type list head must be initialised before use. */
        for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
                INIT_LIST_HEAD(r);

        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
                inet_register_protosw(q);

        /*
         *        Set the ARP module up
         */

        arp_init();

        /*
         *        Set the IP module up
         */

        ip_init();

        /* Initialise per-cpu ipv4 mibs */
        if (init_ipv4_mibs())
                panic("%s: Cannot init ipv4 mibs\n", __func__);

        /* Setup TCP slab cache for open requests. */
        tcp_init();

        /* Setup UDP memory threshold */
        udp_init();

        /* Add UDP-Lite (RFC 3828) */
        udplite4_register();

        raw_init();

        ping_init();

        /*
         *        Set the ICMP layer up
         */

        if (icmp_init() < 0)
                panic("Failed to create the ICMP control socket.\n");

        /*
         *        Initialise the multicast router
         */
#if defined(CONFIG_IP_MROUTE)
        if (ip_mr_init())
                pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif

        if (init_inet_pernet_ops())
                pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);

        /* The /proc setup result is not checked. */
        ipv4_proc_init();

        ipfrag_init();

        dev_add_pack(&ip_packet_type);

        ip_tunnel_core_init();

        rc = 0;
out:
        return rc;
        /* Error unwinding: undo the proto registrations in reverse order. */
out_unregister_raw_proto:
        proto_unregister(&raw_prot);
out_unregister_udp_proto:
        proto_unregister(&udp_prot);
out_unregister_tcp_proto:
        proto_unregister(&tcp_prot);
        goto out;
}

fs_initcall(inet_init);

/* ------------------------------------------------------------------------ */

#ifdef CONFIG_PROC_FS
/*
 * Create the IPv4 /proc entries (raw, tcp4, udp4, ping, misc) in that
 * order.  If one step fails, the steps that already succeeded are undone
 * in reverse order and -ENOMEM is returned; otherwise the result is 0.
 */
static int __init ipv4_proc_init(void)
{
        if (raw_proc_init())
                goto fail;
        if (tcp4_proc_init())
                goto undo_raw;
        if (udp4_proc_init())
                goto undo_tcp;
        if (ping_proc_init())
                goto undo_udp;
        if (ip_misc_proc_init())
                goto undo_ping;

        return 0;

undo_ping:
        ping_proc_exit();
undo_udp:
        udp4_proc_exit();
undo_tcp:
        tcp4_proc_exit();
undo_raw:
        raw_proc_exit();
fail:
        return -ENOMEM;
}

#else /* CONFIG_PROC_FS */
/* Without procfs there is nothing to set up. */
static int __init ipv4_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */



















































































    1 





    1 
    1 













































































































    1 













    1 











    1 

    1 









    1 


















































































    1 









































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format.  copy_init_fpstate_to_fpregs() loads
 * the registers from this image.
 */
union fpregs_state init_fpstate __read_mostly;

/*
 * Track in-kernel FPU usage: set by kernel_fpu_begin_mask(), cleared by
 * kernel_fpu_end() and tested by irq_fpu_usable().
 */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Report whether the caller may use the FPU in kernel mode right now, i.e.
 * whether a kernel_fpu_begin()/kernel_fpu_end() section is permitted from
 * the present context.
 *
 * Returns false in NMI context (with a one-time warning) and while another
 * in-kernel FPU section is already active on this CPU.  Otherwise:
 *
 * - Outside hard interrupt context the FPU is usable.  That is either task
 *   context outside of a fpregs_lock()'ed critical region, or soft
 *   interrupt processing, which cannot happen inside such a region.
 *
 * - In hard interrupt context the FPU is usable only when soft interrupts
 *   are enabled, which means the interrupt did not hit in a
 *   fpregs_lock()'ed critical region.
 */
bool irq_fpu_usable(void)
{
        /* Never from NMI, never nested inside an active kernel FPU section. */
        if (WARN_ON_ONCE(in_nmi()) || this_cpu_read(in_kernel_fpu))
                return false;

        return !in_irq() || !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Save the live FPU registers into @fpu's in-memory state, trying XSAVE
 * first, then FXSAVE, and finally the legacy FNSAVE.
 *
 * Must be called with preemption disabled.
 *
 * Returns 1 if the FPU state is still intact in the registers afterwards
 * and they can stay active, 0 if the save destroyed them.  Only the legacy
 * FNSAVE path returns 0: FNSAVE clears all FPU state unconditionally.
 * Modern FPU state can be kept in registers, if there are no pending FP
 * exceptions.
 */
int copy_fpregs_to_fpstate(struct fpu *fpu)
{
        int regs_intact = 1;

        if (likely(use_xsave())) {
                copy_xregs_to_kernel(&fpu->state.xsave);

                /*
                 * Remember when AVX512 state was last seen in use: its use
                 * is known to slow the max clock speed of the core.
                 */
                if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
                        fpu->avx512_timestamp = jiffies;
        } else if (likely(use_fxsr())) {
                copy_fxregs_to_kernel(fpu);
        } else {
                /*
                 * Legacy save: FNSAVE always clears the FPU registers, so
                 * the caller has to treat them as inactive.
                 */
                asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
                regs_intact = 0;
        }

        return regs_intact;
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);

/*
 * Begin a section of kernel code that uses the FPU registers.
 *
 * @kfpu_mask: KFPU_MXCSR and/or KFPU_387, selecting which control state is
 *             reset to sane initial values before the caller runs.
 *
 * Disables preemption and marks this CPU as being inside a kernel FPU
 * section; kernel_fpu_end() undoes both.  Warns (via WARN_ON_FPU) if the
 * context does not allow FPU use or if a section is already active.
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, true);

        /*
         * For a user task (not a kernel thread) that is not already flagged
         * for an FPU reload: flag it, then save its registers to memory so
         * they can be restored later.
         */
        if (!(current->flags & PF_KTHREAD) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                /*
                 * Ignore return value -- we don't care if reg state
                 * is clobbered.
                 */
                copy_fpregs_to_fpstate(&current->thread.fpu);
        }
        /* NOTE(review): presumably drops this CPU's fpregs owner record -- confirm in asm/fpu/internal.h */
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * End a kernel FPU section started by kernel_fpu_begin_mask(): clear the
 * per-CPU in_kernel_fpu flag and re-enable preemption.  Warns (via
 * WARN_ON_FPU) if no section was active.
 */
void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

        /* Clear the flag before preemption is re-enabled. */
        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Save the current task's FPU registers into @fpu's in-memory state,
 * reloading them afterwards if the save itself destroyed them.
 *
 * This only ever gets called for the current task; anything else trips
 * WARN_ON_FPU.  The work is done under fpregs_lock() and bracketed by the
 * before/after save tracepoints.
 */
void fpu__save(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /*
         * Nothing is saved while TIF_NEED_FPU_LOAD is set.  Otherwise save,
         * and when copy_fpregs_to_fpstate() reports that the registers did
         * not survive (legacy FNSAVE), load them back from memory.
         */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD) &&
            !copy_fpregs_to_fpstate(fpu))
                copy_kernel_to_fpregs(&fpu->state);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Legacy x87 fpstate state init: fill in the control, status, tag and
 * operand-selector words of an FNSAVE-format image.  Every other field is
 * left as the caller prepared it.
 *
 * NOTE(review): the low 16 bits presumably mirror what FNINIT leaves in
 * the x87 unit (cwd 0x037f, swd 0, all tags empty) -- confirm against the
 * Intel SDM.
 */
static inline void fpstate_init_fstate(struct fregs_state *fp)
{
        fp->cwd = 0xffff037fu;
        fp->swd = 0xffff0000u;
        fp->twd = 0xffffffffu;
        fp->fos = 0xffff0000u;
}

/*
 * Initialise an in-memory FPU state image.
 *
 * Without a hardware FPU only the soft (emulation) state is set up.  With
 * one, the first fpu_kernel_xstate_size bytes are zeroed and the
 * format-specific fields are filled in: the XSAVE part when XSAVES is
 * available, followed by either the FXSAVE or the legacy FNSAVE part.
 */
void fpstate_init(union fpregs_state *state)
{
        if (static_cpu_has(X86_FEATURE_FPU)) {
                memset(state, 0, fpu_kernel_xstate_size);

                if (static_cpu_has(X86_FEATURE_XSAVES))
                        fpstate_init_xstate(&state->xsave);

                /* The XSAVES step above does not replace this one. */
                if (static_cpu_has(X86_FEATURE_FXSR))
                        fpstate_init_fxstate(&state->fxsave);
                else
                        fpstate_init_fstate(&state->fsave);
        } else {
                fpstate_init_soft(&state->soft);
        }
}
EXPORT_SYMBOL_GPL(fpstate_init);

/*
 * Give the task @dst a copy of the FPU state of @src.
 *
 * @src is expected to be the current task (checked with WARN_ON_FPU).
 * @dst is marked as not loaded on any CPU (last_cpu = -1) and flagged with
 * TIF_NEED_FPU_LOAD.
 *
 * Always returns 0.
 */
int fpu__copy(struct task_struct *dst, struct task_struct *src)
{
        struct fpu *dst_fpu = &dst->thread.fpu;
        struct fpu *src_fpu = &src->thread.fpu;

        dst_fpu->last_cpu = -1;

        /* No hardware FPU: nothing to copy. */
        if (!static_cpu_has(X86_FEATURE_FPU))
                return 0;

        WARN_ON_FPU(src_fpu != &current->thread.fpu);

        /*
         * Don't let 'init optimized' areas of the XSAVE area
         * leak into the child task:
         */
        memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);

        /*
         * If the FPU registers are not current just memcpy() the state.
         * Otherwise save current FPU registers directly into the child's FPU
         * context, without any memory-to-memory copying.
         *
         * ( The function 'fails' in the FNSAVE case, which destroys
         *   register contents so we have to load them back. )
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);

        else if (!copy_fpregs_to_fpstate(dst_fpu))
                copy_kernel_to_fpregs(&dst_fpu->state);

        fpregs_unlock();

        /* The copy exists only in memory, so flag @dst for a reload. */
        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        trace_x86_fpu_copy_src(src_fpu);
        trace_x86_fpu_copy_dst(dst_fpu);

        return 0;
}

/*
 * Activate the current task's in-memory FPU context,
 * if it has not been used before.
 *
 * Sets TIF_NEED_FPU_LOAD, resets @fpu's state with fpstate_init() and emits
 * the init_state tracepoint.  @fpu must belong to the current task (checked
 * with WARN_ON_FPU).
 */
static void fpu__initialize(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpstate_init(&fpu->state);
        trace_x86_fpu_init_state(fpu);
}

/*
 * Must be called before a task's in-memory fpstate is read.
 *
 * There are two kinds of callers:
 *
 * - the current task (when coredumping): the latest FPU registers have to
 *   be saved into the fpstate first, which is done here via fpu__save();
 *
 * - a stopped task (ptrace): its registers were already saved by the
 *   context-switch code when the task scheduled out, so nothing is done.
 */
void fpu__prepare_read(struct fpu *fpu)
{
        /* Not the current task: nothing to do. */
        if (fpu != &current->thread.fpu)
                return;

        fpu__save(fpu);
}

/*
 * This function must be called before we write a task's fpstate.
 *
 * Invalidate any cached FPU registers.
 *
 * After this function call, after registers in the fpstate are
 * modified and the child task has woken up, the child task will
 * restore the modified FPU state from the modified context. If we
 * didn't clear its cached status here then the cached in-registers
 * state pending on its former CPU could be restored, corrupting
 * the modifications.
 *
 * @fpu must not belong to the current task (checked with WARN_ON_FPU).
 */
void fpu__prepare_write(struct fpu *fpu)
{
        /*
         * Only stopped child tasks can be used to modify the FPU
         * state in the fpstate buffer:
         */
        WARN_ON_FPU(fpu == &current->thread.fpu);

        /* Invalidate any cached state: */
        __fpu_invalidate_fpregs_state(fpu);
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * The registers are only touched when @fpu belongs to the current task; the
 * "dropped" tracepoint fires in every case.  Runs with preemption disabled.
 */
void fpu__drop(struct fpu *fpu)
{
        preempt_disable();

        if (fpu == &current->thread.fpu) {
                /*
                 * Ignore delayed exceptions from user space: the extable
                 * entry resumes at label 2 if the fwait at label 1 faults.
                 */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Clear FPU registers by setting them up from the init fpstate.
 * Caller must do fpregs_[un]lock() around it.
 *
 * @features_mask: xfeatures to restore; only consulted on the XSAVE path.
 *
 * The restore uses XSAVE, FXSAVE or legacy format depending on the CPU.  On
 * CPUs with OSPKE the init PKRU value is loaded as well.
 */
static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
{
        if (use_xsave())
                copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
        else if (static_cpu_has(X86_FEATURE_FXSR))
                copy_kernel_to_fxregs(&init_fpstate.fxsave);
        else
                copy_kernel_to_fregs(&init_fpstate.fsave);

        if (boot_cpu_has(X86_FEATURE_OSPKE))
                copy_init_pkru_to_fpregs();
}

/*
 * Clear the FPU state back to init state.
 *
 * Called by sys_execve(), by the signal handler code and by various
 * error paths.
 *
 * @fpu:       FPU context of the current task (checked with WARN_ON_FPU).
 * @user_only: true resets only the user xfeatures and keeps the supervisor
 *             ones; false resets every feature in xfeatures_mask_all.
 *
 * Without a hardware FPU the state is dropped and re-initialised in memory.
 * Otherwise the registers are reloaded under fpregs_lock() and marked as
 * active for the current task.
 */
static void fpu__clear(struct fpu *fpu, bool user_only)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        if (!static_cpu_has(X86_FEATURE_FPU)) {
                fpu__drop(fpu);
                fpu__initialize(fpu);
                return;
        }

        fpregs_lock();

        if (user_only) {
                /*
                 * If the registers do not hold this task's state, bring the
                 * supervisor states in from memory before the user states
                 * are reset to their init values.
                 */
                if (!fpregs_state_valid(fpu, smp_processor_id()) &&
                    xfeatures_mask_supervisor())
                        copy_kernel_to_xregs(&fpu->state.xsave,
                                             xfeatures_mask_supervisor());
                copy_init_fpstate_to_fpregs(xfeatures_mask_user());
        } else {
                copy_init_fpstate_to_fpregs(xfeatures_mask_all);
        }

        fpregs_mark_activate();
        fpregs_unlock();
}

/* Reset only the user xfeatures of @fpu to init state: fpu__clear() with user_only = true. */
void fpu__clear_user_states(struct fpu *fpu)
{
        fpu__clear(fpu, true);
}

/* Reset all xfeatures of @fpu to init state: fpu__clear() with user_only = false. */
void fpu__clear_all(struct fpu *fpu)
{
        fpu__clear(fpu, false);
}

/*
 * Load the FPU context before returning to userspace.  Does nothing on
 * CPUs without a hardware FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                __fpregs_load_activate();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Debug check: if the tracking says the current task's FPU context is not
 * the one loaded on this CPU, TIF_NEED_FPU_LOAD must be set so that the
 * context gets loaded on return to userland.  Warns (via WARN_ON_FPU) when
 * neither holds.
 */
void fpregs_assert_state_consistent(void)
{
        struct fpu *cur_fpu = &current->thread.fpu;

        /* With a reload pending there is nothing to verify. */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                WARN_ON_FPU(!fpregs_state_valid(cur_fpu, smp_processor_id()));
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Record that the FPU registers of this CPU now hold the current task's
 * state: activate the context, remember this CPU in last_cpu and clear
 * TIF_NEED_FPU_LOAD so that no reload is attempted.
 *
 * fpu__clear() calls this with fpregs_lock() held.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}
EXPORT_SYMBOL_GPL(fpregs_mark_activate);

/*
 * x87 math exception handling:
 */

/*
 * Translate the pending, unmasked FPU exception recorded in @fpu's saved
 * state into an FPE_* code.
 *
 * @fpu:     FPU context whose saved control/status words are examined.
 * @trap_nr: X86_TRAP_MF selects the x87 control/status words; any other
 *           value selects the SIMD MXCSR register.
 *
 * Returns FPE_FLTINV, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTUND or FPE_FLTRES for
 * the first matching exception bit, or 0 if no unmasked exception is
 * pending.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int pending;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * The SIMD FPU exceptions are handled a little differently,
                 * as there is only a single status/control register.  To
                 * find the unmasked exception that was caught, the mask
                 * bits at 0x1f80 are shifted down onto the exception bits
                 * at 0x3f and used to filter them.
                 */
                unsigned short csr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        csr = fpu->state.fxsave.mxcsr;

                pending = ~(csr >> 7) & csr;
        } else {
                unsigned short control, status;

                /*
                 * (~cwd & swd) masks out exceptions that are not set to
                 * unmasked status.  0x3f is the exception bits in these
                 * regs, 0x200 is the C1 reg you need in case of a stack
                 * fault, 0x040 is the stack fault bit.  Only one exception
                 * should be taken at a time, so if this combination does
                 * not produce any single exception, the program is not
                 * synchronizing its FPU usage and the context of the
                 * exception cannot be fully reproduced.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        control = fpu->state.fxsave.cwd;
                        status = fpu->state.fxsave.swd;
                } else {
                        control = (unsigned short)fpu->state.fsave.cwd;
                        status = (unsigned short)fpu->state.fsave.swd;
                }

                pending = status & ~control;
        }

        /*
         * Invalid op:
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (pending & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (pending & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (pending & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (pending & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (pending & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}











































































































































































































































































































































    1 








































































































































































    1 
    1 




    1 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_DEVICE_H
#define _SCSI_SCSI_DEVICE_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/blkdev.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>

struct device;
struct request_queue;
struct scsi_cmnd;
struct scsi_lun;
struct scsi_sense_hdr;

typedef __u64 __bitwise blist_flags_t;

#define SCSI_SENSE_BUFFERSIZE        96

/*
 * Header fields of a SCSI mode parameter list.
 * NOTE(review): presumably filled in from a MODE SENSE response -- confirm
 * against the code that populates it.
 */
struct scsi_mode_data {
        __u32        length;
        __u16        block_descriptor_length;
        __u8        medium_type;
        __u8        device_specific;
        __u8        header_length;
        __u8        longlba:1;        /* single-bit flag */
};

/*
 * sdev state.  If you alter this, you also need to alter scsi_sysfs.c
 * (for the ascii descriptions) and the state model enforcer:
 * scsi_lib:scsi_device_set_state().
 */
enum scsi_device_state {
        /* Device created but not added to sysfs; only internal commands
         * allowed (for inq). */
        SDEV_CREATED = 1,
        /* Device properly configured; all commands allowed. */
        SDEV_RUNNING = 2,
        /* Beginning to delete device; only error handler commands allowed. */
        SDEV_CANCEL = 3,
        /* Device deleted; no commands allowed. */
        SDEV_DEL = 4,
        /* Device quiescent.  No block commands will be accepted, only
         * specials (which originate in the mid-layer). */
        SDEV_QUIESCE = 5,
        /* Device offlined (by error handling or user request). */
        SDEV_OFFLINE = 6,
        /* Offlined by transport class error handler. */
        SDEV_TRANSPORT_OFFLINE = 7,
        /* Device blocked by scsi lld.  No scsi commands from user or
         * midlayer should be issued to the scsi lld. */
        SDEV_BLOCK = 8,
        /* Same as SDEV_BLOCK but for created devices. */
        SDEV_CREATED_BLOCK = 9,
};

/* Scan modes; numbering starts at zero with SCSI_SCAN_INITIAL. */
enum scsi_scan_mode {
        SCSI_SCAN_INITIAL = 0,
        SCSI_SCAN_RESCAN = 1,
        SCSI_SCAN_MANUAL = 2,
};

/*
 * Events that can be raised for a SCSI device.  Except for media change,
 * each entry is annotated with the pair of hex codes of the unit attention
 * (UA) that reports it.
 */
enum scsi_device_event {
        /* media has changed */
        SDEV_EVT_MEDIA_CHANGE = 1,
        /* 3F 03  UA reported */
        SDEV_EVT_INQUIRY_CHANGE_REPORTED = 2,
        /* 2A 09  UA reported */
        SDEV_EVT_CAPACITY_CHANGE_REPORTED = 3,
        /* 38 07  UA reported */
        SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED = 4,
        /* 2A 01  UA reported */
        SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED = 5,
        /* 3F 0E  UA reported */
        SDEV_EVT_LUN_CHANGE_REPORTED = 6,
        /* 2A 06  UA reported */
        SDEV_EVT_ALUA_STATE_CHANGE_REPORTED = 7,
        /* 29 00  UA reported */
        SDEV_EVT_POWER_ON_RESET_OCCURRED = 8,

        /* Bounds of the event range. */
        SDEV_EVT_FIRST = SDEV_EVT_MEDIA_CHANGE,
        SDEV_EVT_LAST = SDEV_EVT_POWER_ON_RESET_OCCURRED,

        /* Number of bits needed for a bitmap indexed by event value. */
        SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1
};

/*
 * One event instance: the event type plus a list node for queueing.
 * NOTE(review): presumably linked on scsi_device.event_list -- confirm.
 */
struct scsi_event {
        enum scsi_device_event        evt_type;
        struct list_head        node;

        /* put union of data structures, for non-simple event types,
         * here
         */
};

/**
 * struct scsi_vpd - SCSI Vital Product Data
 * @rcu: For kfree_rcu().
 * @len: Length in bytes of @data.
 * @data: VPD data as defined in various T10 SCSI standard documents.
 *        Flexible array member; it has to stay the last field.
 */
struct scsi_vpd {
        struct rcu_head        rcu;
        int                len;
        unsigned char        data[];
};

/*
 * struct scsi_device - state kept for one SCSI device.
 *
 * The device is addressed by @host, @channel, @id and @lun.  Two
 * struct device objects are embedded (@sdev_gendev and @sdev_dev); the
 * to_scsi_device() and class_to_sdev() macros map them back to this
 * structure.  @sdev_data is a flexible array and has to stay the last
 * member.
 */
struct scsi_device {
        struct Scsi_Host *host;
        struct request_queue *request_queue;

        /* the next two are protected by the host->host_lock */
        struct list_head    siblings;   /* list of all devices on this host */
        struct list_head    same_target_siblings; /* just the devices sharing same target id */

        atomic_t device_busy;                /* commands actually active on LLDD */
        atomic_t device_blocked;        /* Device returned QUEUE_FULL. */

        atomic_t restarts;
        spinlock_t list_lock;
        struct list_head starved_entry;
        unsigned short queue_depth;        /* How deep of a queue we want */
        unsigned short max_queue_depth;        /* max queue depth */
        unsigned short last_queue_full_depth; /* These two are used by */
        unsigned short last_queue_full_count; /* scsi_track_queue_full() */
        unsigned long last_queue_full_time;        /* last queue full time */
        unsigned long queue_ramp_up_period;        /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD        (120 * HZ)

        unsigned long last_queue_ramp_up;        /* last queue ramp up time */

        unsigned int id, channel;
        u64 lun;
        unsigned int manufacturer;        /* Manufacturer of device, for using
                                         * vendor-specific cmd's */
        unsigned sector_size;        /* size in bytes */

        void *hostdata;                /* available to low-level driver */
        unsigned char type;
        char scsi_level;
        char inq_periph_qual;        /* PQ from INQUIRY data */
        struct mutex inquiry_mutex;
        unsigned char inquiry_len;        /* valid bytes in 'inquiry' */
        unsigned char * inquiry;        /* INQUIRY response data */
        const char * vendor;                /* [back_compat] point into 'inquiry' ... */
        const char * model;                /* ... after scan; point to static string */
        const char * rev;                /* ... "nullnullnullnull" before scan */

#define SCSI_VPD_PG_LEN                255
        /* RCU-protected VPD pages; the suffix is the page code */
        struct scsi_vpd __rcu *vpd_pg0;
        struct scsi_vpd __rcu *vpd_pg83;
        struct scsi_vpd __rcu *vpd_pg80;
        struct scsi_vpd __rcu *vpd_pg89;
        unsigned char current_tag;        /* current tag */
        struct scsi_target      *sdev_target;   /* used only for single_lun */

        blist_flags_t                sdev_bflags; /* black/white flags as also found in
                                 * scsi_devinfo.[hc]. For now used only to
                                 * pass settings from slave_alloc to scsi
                                 * core. */
        unsigned int eh_timeout; /* Error handling timeout */
        unsigned removable:1;
        unsigned changed:1;        /* Data invalid due to media change */
        unsigned busy:1;        /* Used to prevent races */
        unsigned lockable:1;        /* Able to prevent media removal */
        unsigned locked:1;      /* Media removal disabled */
        unsigned borken:1;        /* Tell the Seagate driver to be
                                 * painfully slow on this device */
        unsigned disconnect:1;        /* can disconnect */
        unsigned soft_reset:1;        /* Uses soft reset option */
        unsigned sdtr:1;        /* Device supports SDTR messages */
        unsigned wdtr:1;        /* Device supports WDTR messages */
        unsigned ppr:1;                /* Device supports PPR messages */
        unsigned tagged_supported:1;        /* Supports SCSI-II tagged queuing */
        unsigned simple_tags:1;        /* simple queue tag messages are enabled */
        unsigned was_reset:1;        /* There was a bus reset on the bus for
                                 * this device */
        unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
                                     * because we did a bus reset. */
        unsigned use_10_for_rw:1; /* first try 10-byte read / write */
        unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
        unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */
        unsigned no_report_opcodes:1;        /* no REPORT SUPPORTED OPERATION CODES */
        unsigned no_write_same:1;        /* no WRITE SAME command */
        unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
        unsigned skip_ms_page_8:1;        /* do not use MODE SENSE page 0x08 */
        unsigned skip_ms_page_3f:1;        /* do not use MODE SENSE page 0x3f */
        unsigned skip_vpd_pages:1;        /* do not read VPD pages */
        unsigned try_vpd_pages:1;        /* attempt to read VPD pages */
        unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
        unsigned no_start_on_add:1;        /* do not issue start on add */
        unsigned allow_restart:1; /* issue START_UNIT in error handler */
        unsigned manage_start_stop:1;        /* Let HLD (sd) manage start/stop */
        unsigned start_stop_pwr_cond:1;        /* Set power cond. in START_STOP_UNIT */
        unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
        unsigned select_no_atn:1;
        unsigned fix_capacity:1;        /* READ_CAPACITY is too high by 1 */
        unsigned guess_capacity:1;        /* READ_CAPACITY might be too high by 1 */
        unsigned retry_hwerror:1;        /* Retry HARDWARE_ERROR */
        unsigned last_sector_bug:1;        /* do not use multisector accesses on
                                           SD_LAST_BUGGY_SECTORS */
        unsigned no_read_disc_info:1;        /* Avoid READ_DISC_INFO cmds */
        unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
        unsigned try_rc_10_first:1;        /* Try READ_CAPACITY_10 first */
        unsigned security_supported:1;        /* Supports Security Protocols */
        unsigned is_visible:1;        /* is the device visible in sysfs */
        unsigned wce_default_on:1;        /* Cache is ON by default */
        unsigned no_dif:1;        /* T10 PI (DIF) should be disabled */
        unsigned broken_fua:1;                /* Don't set FUA bit */
        unsigned lun_in_cdb:1;                /* Store LUN bits in CDB[1] */
        unsigned unmap_limit_for_ws:1;        /* Use the UNMAP limit for WRITE SAME */
        unsigned rpm_autosuspend:1;        /* Enable runtime autosuspend at device
                                         * creation time */

        bool offline_already;                /* Device offline message logged */

        atomic_t disk_events_disable_depth; /* disable depth for disk events */

        DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
        DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
        struct list_head event_list;        /* asserted events */
        struct work_struct event_work;

        unsigned int max_device_blocked; /* what device_blocked counts down from  */
#define SCSI_DEFAULT_DEVICE_BLOCKED        3

        atomic_t iorequest_cnt;
        atomic_t iodone_cnt;
        atomic_t ioerr_cnt;

        /* sdev_gendev is what to_scsi_device() expects, sdev_dev what class_to_sdev() expects */
        struct device                sdev_gendev,
                                sdev_dev;

        struct execute_work        ew; /* used to get process context on put */
        struct work_struct        requeue_work;

        struct scsi_device_handler *handler;
        void                        *handler_data;

        size_t                        dma_drain_len;
        void                        *dma_drain_buf;

        unsigned char                access_state;
        struct mutex                state_mutex;
        enum scsi_device_state sdev_state;
        struct task_struct        *quiesced_by;
        unsigned long                sdev_data[];        /* flexible array; must be the last member */
} __attribute__((aligned(sizeof(unsigned long))));

/*
 * Map an embedded struct device back to the scsi_device that contains it:
 * to_scsi_device() takes a pointer to the sdev_gendev member,
 * class_to_sdev() a pointer to the sdev_dev member.
 */
#define        to_scsi_device(d)        \
        container_of(d, struct scsi_device, sdev_gendev)
#define        class_to_sdev(d)        \
        container_of(d, struct scsi_device, sdev_dev)
/*
 * Resolve the scsi_device whose sdev_gendev is the parent of @class_dev.
 * The argument is parenthesized so that expressions such as &dev expand
 * correctly.
 */
#define transport_class_to_sdev(class_dev) \
        to_scsi_device((class_dev)->parent)

/* dev_dbg() on the sdev_gendev device of @sdev. */
#define sdev_dbg(sdev, fmt, a...) \
        dev_dbg(&(sdev)->sdev_gendev, fmt, ##a)

/*
 * like scmd_printk, but the device name is passed in
 * as a string pointer
 *
 * __printf(4, 5): the format string is argument 4 and the variadic
 * arguments start at argument 5.
 */
__printf(4, 5) void
sdev_prefix_printk(const char *, const struct scsi_device *, const char *,
                const char *, ...);

/* sdev_prefix_printk() with NULL passed as the third (name) argument. */
#define sdev_printk(l, sdev, fmt, a...)                                \
        sdev_prefix_printk(l, sdev, NULL, fmt, ##a)

/* printf-style logging for a command; the format string is argument 3. */
__printf(3, 4) void
scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...);

/*
 * Debug message for a command, emitted through sdev_dbg() on the command's
 * device.  When the request has an rq_disk attached, the message is
 * prefixed with "[<disk_name>] ".  @scmd is evaluated more than once.
 */
#define scmd_dbg(scmd, fmt, a...)                                           \
        do {                                                                   \
                if ((scmd)->request->rq_disk)                                   \
                        sdev_dbg((scmd)->device, "[%s] " fmt,                   \
                                 (scmd)->request->rq_disk->disk_name, ##a);\
                else                                                           \
                        sdev_dbg((scmd)->device, fmt, ##a);                   \
        } while (0)

/* States of a scsi_target; numbering starts at one. */
enum scsi_target_state {
        STARGET_CREATED = 1,
        STARGET_RUNNING = 2,
        STARGET_REMOVE = 3,
        STARGET_CREATED_REMOVE = 4,
        STARGET_DEL = 5,
};

/*
 * scsi_target: representation of a scsi target, for now, this is only
 * used for single_lun devices. If no one has active IO to the target,
 * starget_sdev_user is NULL, else it points to the active sdev.
 *
 * The embedded struct device (dev) is what to_scsi_target() maps back to
 * this structure.
 */
struct scsi_target {
        struct scsi_device        *starget_sdev_user;
        struct list_head        siblings;
        struct list_head        devices;
        struct device                dev;
        struct kref                reap_ref; /* last put renders target invisible */
        unsigned int                channel;
        unsigned int                id; /* target id ... replace
                                     * scsi_device.id eventually */
        unsigned int                create:1; /* signal that it needs to be added */
        unsigned int                single_lun:1;        /* Indicates we should only
                                                 * allow I/O to one of the luns
                                                 * for the device at a time. */
        unsigned int                pdt_1f_for_no_lun:1;        /* PDT = 0x1f
                                                 * means no lun present. */
        unsigned int                no_report_luns:1;        /* Don't use
                                                 * REPORT LUNS for scanning. */
        unsigned int                expecting_lun_change:1;        /* A device has reported
                                                 * a 3F/0E UA, other devices on
                                                 * the same target will also. */
        /* commands actually active on LLD. */
        atomic_t                target_busy;
        atomic_t                target_blocked;

        /*
         * LLDs should set this in the slave_alloc host template callout.
         * If set to zero then there is no limit.
         */
        unsigned int                can_queue;
        unsigned int                max_target_blocked;
#define SCSI_DEFAULT_TARGET_BLOCKED        3

        char                        scsi_level;
        enum scsi_target_state        state;
        void                         *hostdata; /* available to low-level driver */
        unsigned long                starget_data[]; /* for the transport */
        /* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));

/* Map a pointer to the embedded 'dev' member back to its struct scsi_target. */
#define to_scsi_target(d)        container_of(d, struct scsi_target, dev)
/*
 * scsi_target - return the scsi_target that owns the parent device of
 * @sdev's generic device (sdev_gendev).
 */
static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
{
        struct device *parent = sdev->sdev_gendev.parent;

        return to_scsi_target(parent);
}
/*
 * Map a transport class device to its scsi_target through the class
 * device's parent pointer.  The argument is parenthesized so that any
 * pointer-valued expression (for example "base + i" or "&array[i]"), and
 * not only a plain identifier, expands to the intended member access.
 */
#define transport_class_to_starget(class_dev) \
        to_scsi_target((class_dev)->parent)

/*
 * dev_printk() wrapper that logs against the target's embedded struct
 * device; @prefix, @fmt and the variadic arguments are passed through.
 */
#define starget_printk(prefix, starget, fmt, a...)        \
        dev_printk(prefix, &(starget)->dev, fmt, ##a)

/*
 * Device creation/removal and device-handler registration entry points.
 * Prototypes only: the definitions are not part of this header.
 */
extern struct scsi_device *__scsi_add_device(struct Scsi_Host *,
                uint, uint, u64, void *hostdata);
extern int scsi_add_device(struct Scsi_Host *host, uint channel,
                           uint target, u64 lun);
extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh);
extern void scsi_remove_device(struct scsi_device *);
extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh);
void scsi_attach_vpd(struct scsi_device *sdev);

/*
 * Reference get/put, lookup and per-target iteration entry points.
 * scsi_device_get() is __must_check, so its result may not be ignored.
 * NOTE(review): the double-underscore variants are presumably the
 * caller-locked forms of their plain counterparts - confirm against the
 * definitions before relying on that.
 */
extern struct scsi_device *scsi_device_from_queue(struct request_queue *q);
extern int __must_check scsi_device_get(struct scsi_device *);
extern void scsi_device_put(struct scsi_device *);
extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *,
                                              uint, uint, u64);
extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *,
                                                uint, uint, u64);
extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *,
                                                        u64);
extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *,
                                                          u64);
extern void starget_for_each_device(struct scsi_target *, void *,
                     void (*fn)(struct scsi_device *, void *));
extern void __starget_for_each_device(struct scsi_target *, void *,
                                      void (*fn)(struct scsi_device *,
                                                 void *));

/*
 * only exposed to implement shost_for_each_device: called with a NULL
 * cursor to start the walk and with the previous device to advance it.
 */
extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *,
                                                  struct scsi_device *);

/**
 * shost_for_each_device - iterate over all devices of a host
 * @sdev: the &struct scsi_device to use as a cursor
 * @shost: the &struct Scsi_Host to iterate over
 *
 * Iterator that returns each device attached to @shost.  This loop
 * takes a reference on each device and releases it at the end.  If
 * you break out of the loop, you must call scsi_device_put(sdev).
 *
 * The loop ends when __scsi_iterate_devices() returns NULL, so @sdev is
 * NULL after a complete (non-broken) iteration.
 */
#define shost_for_each_device(sdev, shost) \
        for ((sdev) = __scsi_iterate_devices((shost), NULL); \
             (sdev); \
             (sdev) = __scsi_iterate_devices((shost), (sdev)))

/**
 * __shost_for_each_device - iterate over all devices of a host (UNLOCKED)
 * @sdev: the &struct scsi_device to use as a cursor
 * @shost: the &struct Scsi_Host to iterate over
 *
 * Iterator that returns each device attached to @shost.  It does _not_
 * take a reference on the scsi_device, so the whole loop must be
 * protected by shost->host_lock.
 *
 * Note: The only reason to use this is because you need to access the
 * device list in interrupt context.  Otherwise you really want to use
 * shost_for_each_device instead.
 *
 * Expands to a plain list_for_each_entry() over shost->__devices, linked
 * through the 'siblings' member of each device.
 */
#define __shost_for_each_device(sdev, shost) \
        list_for_each_entry((sdev), &((shost)->__devices), siblings)

/* Queue depth management (prototypes only; defined outside this header). */
extern int scsi_change_queue_depth(struct scsi_device *, int);
extern int scsi_track_queue_full(struct scsi_device *, int);

extern int scsi_set_medium_removal(struct scsi_device *, char);

/*
 * Helpers that take a device plus timeout/retries style arguments.
 * NOTE(review): each presumably issues the SCSI command its name refers
 * to - confirm against the definitions.
 */
extern int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage,
                           unsigned char *buffer, int len, int timeout,
                           int retries, struct scsi_mode_data *data,
                           struct scsi_sense_hdr *);
extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp,
                            int modepage, unsigned char *buffer, int len,
                            int timeout, int retries,
                            struct scsi_mode_data *data,
                            struct scsi_sense_hdr *);
extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout,
                                int retries, struct scsi_sense_hdr *sshdr);
extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf,
                             int buf_len);
extern int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer,
                              unsigned int len, unsigned char opcode);
/* Device state changes and event notification. */
extern int scsi_device_set_state(struct scsi_device *sdev,
                                 enum scsi_device_state state);
extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
                                          gfp_t gfpflags);
extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
extern void sdev_evt_send_simple(struct scsi_device *sdev,
                          enum scsi_device_event evt_type, gfp_t gfpflags);
/* Quiesce/resume, scanning, blocking and removal at device/target level. */
extern int scsi_device_quiesce(struct scsi_device *sdev);
extern void scsi_device_resume(struct scsi_device *sdev);
extern void scsi_target_quiesce(struct scsi_target *);
extern void scsi_target_resume(struct scsi_target *);
extern void scsi_scan_target(struct device *parent, unsigned int channel,
                             unsigned int id, u64 lun,
                             enum scsi_scan_mode rescan);
extern void scsi_target_reap(struct scsi_target *);
extern void scsi_target_block(struct device *);
extern void scsi_target_unblock(struct device *, enum scsi_device_state);
extern void scsi_remove_target(struct device *);
extern const char *scsi_device_state_name(enum scsi_device_state);
extern int scsi_is_sdev_device(const struct device *);
extern int scsi_is_target_device(const struct device *);
extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
/*
 * Backend of the scsi_execute() macro; callers in this header go through
 * the macro rather than calling this directly.
 */
extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                        int data_direction, void *buffer, unsigned bufflen,
                        unsigned char *sense, struct scsi_sense_hdr *sshdr,
                        int timeout, int retries, u64 flags,
                        req_flags_t rq_flags, int *resid);
/*
 * Make sure any sense buffer is the correct size.
 *
 * Wrapper around __scsi_execute() that adds a build-time check: unless
 * @sense is NULL, sizeof(sense) must equal SCSI_SENSE_BUFFERSIZE.  The
 * sizeof is applied to the argument exactly as written at the call site,
 * so a non-NULL @sense has to be an object of that size (an array), not a
 * pointer to one.  The statement expression evaluates to the value
 * returned by __scsi_execute().
 */
#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,        \
                     sshdr, timeout, retries, flags, rq_flags, resid)        \
({                                                                        \
        BUILD_BUG_ON((sense) != NULL &&                                        \
                     sizeof(sense) != SCSI_SENSE_BUFFERSIZE);                \
        __scsi_execute(sdev, cmd, data_direction, buffer, bufflen,        \
                       sense, sshdr, timeout, retries, flags, rq_flags,        \
                       resid);                                                \
})
/*
 * scsi_execute_req - scsi_execute() without a raw sense buffer
 *
 * Forwards to scsi_execute() with a NULL sense buffer and with both the
 * flags and rq_flags arguments set to 0; all other arguments are passed
 * through unchanged.  Returns the value scsi_execute() evaluates to.
 */
static inline int scsi_execute_req(struct scsi_device *sdev,
        const unsigned char *cmd, int data_direction, void *buffer,
        unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
        int retries, int *resid)
{
        int result;

        result = scsi_execute(sdev, cmd, data_direction, buffer, bufflen,
                              NULL, sshdr, timeout, retries, 0, 0, resid);
        return result;
}
/* Disk event control and VPD identifier helpers (prototypes only). */
extern void sdev_disable_disk_events(struct scsi_device *sdev);
extern void sdev_enable_disk_events(struct scsi_device *sdev);
extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t);
extern int scsi_vpd_tpg_id(struct scsi_device *, int *);

#ifdef CONFIG_PM
extern int scsi_autopm_get_device(struct scsi_device *);
extern void scsi_autopm_put_device(struct scsi_device *);
#else
/*
 * Without CONFIG_PM the autopm helpers are stubs: "get" always reports
 * success (0) and "put" does nothing.
 */
static inline int scsi_autopm_get_device(struct scsi_device *d)
{
        return 0;
}

static inline void scsi_autopm_put_device(struct scsi_device *d)
{
}
#endif /* CONFIG_PM */

/*
 * scsi_device_reprobe - run device_reprobe() on @sdev's generic device
 *
 * Returns the result of device_reprobe(); the caller must check it.
 */
static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev)
{
        struct device *gendev = &sdev->sdev_gendev;

        return device_reprobe(gendev);
}

/* sdev_channel - return the channel number stored in @sdev. */
static inline unsigned int sdev_channel(struct scsi_device *sdev)
{
        const unsigned int channel = sdev->channel;

        return channel;
}

/* sdev_id - return the id stored in @sdev. */
static inline unsigned int sdev_id(struct scsi_device *sdev)
{
        const unsigned int id = sdev->id;

        return id;
}

/* Same accessors as sdev_id()/sdev_channel(), applied to a command's device. */
#define scmd_id(scmd) sdev_id((scmd)->device)
#define scmd_channel(scmd) sdev_channel((scmd)->device)

/*
 * checks for positions of the SCSI state machine
 */
/*
 * scsi_device_online - 0 when @sdev is in SDEV_OFFLINE,
 * SDEV_TRANSPORT_OFFLINE or SDEV_DEL state, 1 for every other state.
 */
static inline int scsi_device_online(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
        case SDEV_DEL:
                return 0;
        default:
                return 1;
        }
}
/*
 * scsi_device_blocked - 1 when @sdev is in SDEV_BLOCK or
 * SDEV_CREATED_BLOCK state, 0 otherwise.
 */
static inline int scsi_device_blocked(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_BLOCK:
        case SDEV_CREATED_BLOCK:
                return 1;
        default:
                return 0;
        }
}
/*
 * scsi_device_created - 1 when @sdev is in SDEV_CREATED or
 * SDEV_CREATED_BLOCK state, 0 otherwise.
 */
static inline int scsi_device_created(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_CREATED:
        case SDEV_CREATED_BLOCK:
                return 1;
        default:
                return 0;
        }
}

/*
 * Block/unblock a single device (prototypes only).  The unblock variant
 * takes the state the device should be moved to.
 */
int scsi_internal_device_block_nowait(struct scsi_device *sdev);
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
                                        enum scsi_device_state new_state);

/* accessor functions for the SCSI parameters */

/* scsi_device_sync - return the value of @sdev's 'sdtr' field. */
static inline int scsi_device_sync(struct scsi_device *sdev)
{
        int sdtr = sdev->sdtr;

        return sdtr;
}
/* scsi_device_wide - return the value of @sdev's 'wdtr' field. */
static inline int scsi_device_wide(struct scsi_device *sdev)
{
        int wdtr = sdev->wdtr;

        return wdtr;
}
/* scsi_device_dt - return the value of @sdev's 'ppr' field. */
static inline int scsi_device_dt(struct scsi_device *sdev)
{
        int ppr = sdev->ppr;

        return ppr;
}
/*
 * scsi_device_dt_only - 1 when the cached INQUIRY data holds at least 57
 * bytes and bits 2-3 of byte 56 (mask 0x0c) equal 0x04, otherwise 0.
 */
static inline int scsi_device_dt_only(struct scsi_device *sdev)
{
        return sdev->inquiry_len >= 57 &&
                (sdev->inquiry[56] & 0x0c) == 0x04;
}
/*
 * scsi_device_ius - bit 0 of INQUIRY byte 56 (mask 0x01), or 0 when the
 * cached INQUIRY data is shorter than 57 bytes.
 */
static inline int scsi_device_ius(struct scsi_device *sdev)
{
        return sdev->inquiry_len < 57 ? 0 : (sdev->inquiry[56] & 0x01);
}
/*
 * scsi_device_qas - INQUIRY byte 56 masked with 0x02 (so the result is
 * 0 or 2), or 0 when the cached INQUIRY data is shorter than 57 bytes.
 */
static inline int scsi_device_qas(struct scsi_device *sdev)
{
        return sdev->inquiry_len < 57 ? 0 : (sdev->inquiry[56] & 0x02);
}
/*
 * scsi_device_enclosure - bit 6 of INQUIRY byte 6 (non-zero when set).
 * When no INQUIRY data is cached the function returns 1.
 */
static inline int scsi_device_enclosure(struct scsi_device *sdev)
{
        if (!sdev->inquiry)
                return 1;

        return sdev->inquiry[6] & (1<<6);
}

/*
 * scsi_device_protection - test bit 0 of INQUIRY byte 5
 * @sdev: device to query
 *
 * Returns 0 when the 'no_dif' flag is set, when no INQUIRY data is
 * cached, or when scsi_level is not above SCSI_2; otherwise returns 1 if
 * bit 0 of INQUIRY byte 5 is set and 0 if it is clear.
 *
 * The NULL check on sdev->inquiry matches scsi_device_enclosure() and
 * scsi_device_tpgs(), which also tolerate a device without cached
 * INQUIRY data instead of dereferencing a NULL pointer.
 */
static inline int scsi_device_protection(struct scsi_device *sdev)
{
        if (sdev->no_dif)
                return 0;

        /* Nothing to inspect without INQUIRY data: report no protection. */
        if (!sdev->inquiry)
                return 0;

        return sdev->scsi_level > SCSI_2 && (sdev->inquiry[5] & (1<<0));
}

/*
 * scsi_device_tpgs - bits 4-5 of INQUIRY byte 5 as a value 0..3, or 0
 * when no INQUIRY data is cached.
 */
static inline int scsi_device_tpgs(struct scsi_device *sdev)
{
        if (!sdev->inquiry)
                return 0;

        return (sdev->inquiry[5] >> 4) & 0x3;
}

/**
 * scsi_device_supports_vpd - test if a device supports VPD pages
 * @sdev: the &struct scsi_device to test
 *
 * If the 'try_vpd_pages' flag is set it takes precedence and the result
 * is 1.  Otherwise VPD pages are assumed to be supported when scsi_level
 * is at least SCSI_SPC_2 and 'skip_vpd_pages' is not set.
 *
 * Return: 1 if VPD inquiries may be attempted, 0 otherwise.
 */
static inline int scsi_device_supports_vpd(struct scsi_device *sdev)
{
        /* An explicit blacklist request to try VPD pages always wins. */
        if (sdev->try_vpd_pages)
                return 1;

        /*
         * VPD inquiries could be sent to SCSI-2 type devices as well, but
         * some USB ones crash on receiving them, and the pages currently
         * requested are mandatory for SPC-2 and beyond.
         */
        return sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages;
}

/*
 * Module alias helpers.  MODULE_ALIAS_SCSI_DEVICE() builds the alias
 * string "scsi:t-<type>*" from the stringified @type argument;
 * SCSI_DEVICE_MODALIAS_FMT is the matching printf-style format, which
 * prints the type as 0x followed by two hex digits.
 */
#define MODULE_ALIAS_SCSI_DEVICE(type) \
        MODULE_ALIAS("scsi:t-" __stringify(type) "*")
#define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x"

#endif /* _SCSI_SCSI_DEVICE_H */












































































































































































    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * tsacct.c - System accounting over taskstats interface
 *
 * Copyright (C) Jay Lan,        <jlan@sgi.com>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include <linux/acct.h>
#include <linux/jiffies.h>
#include <linux/mm.h>

/*
 * bacct_add_tsk - fill in the basic accounting fields of @stats from @tsk
 *
 * ids are translated into @user_ns / @pid_ns.  Elapsed and cpu times are
 * stored in microseconds; the begin time is stored both clamped to 32
 * bits (ac_btime) and in full (ac_btime64).
 */
void bacct_add_tsk(struct user_namespace *user_ns,
                   struct pid_namespace *pid_ns,
                   struct taskstats *stats, struct task_struct *tsk)
{
        const struct cred *cred;
        u64 ut, st, ut_scaled, st_scaled;
        u64 elapsed;
        time64_t begin;

        BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);

        /* elapsed run time: nanoseconds first, then scaled to usec */
        elapsed = ktime_get_ns() - tsk->start_time;
        do_div(elapsed, NSEC_PER_USEC);
        stats->ac_etime = elapsed;

        /* begin time in seconds; the 32-bit field has a y2106 limit */
        begin = ktime_get_real_seconds() - div_u64(elapsed, USEC_PER_SEC);
        stats->ac_btime = clamp_t(time64_t, begin, 0, U32_MAX);
        stats->ac_btime64 = begin;

        if (tsk->flags & PF_EXITING)
                stats->ac_exitcode = tsk->exit_code;
        if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
                stats->ac_flag |= AFORK;
        if (tsk->flags & PF_SUPERPRIV)
                stats->ac_flag |= ASU;
        if (tsk->flags & PF_DUMPCORE)
                stats->ac_flag |= ACORE;
        if (tsk->flags & PF_SIGNALED)
                stats->ac_flag |= AXSIG;

        stats->ac_nice = task_nice(tsk);
        stats->ac_sched = tsk->policy;
        stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);

        /* credentials and the parent pointer are read under RCU */
        rcu_read_lock();
        cred = __task_cred(tsk);
        stats->ac_uid = from_kuid_munged(user_ns, cred->uid);
        stats->ac_gid = from_kgid_munged(user_ns, cred->gid);
        if (pid_alive(tsk))
                stats->ac_ppid =
                        task_tgid_nr_ns(rcu_dereference(tsk->real_parent),
                                        pid_ns);
        else
                stats->ac_ppid = 0;
        rcu_read_unlock();

        task_cputime(tsk, &ut, &st);
        stats->ac_utime = div_u64(ut, NSEC_PER_USEC);
        stats->ac_stime = div_u64(st, NSEC_PER_USEC);

        task_cputime_scaled(tsk, &ut_scaled, &st_scaled);
        stats->ac_utimescaled = div_u64(ut_scaled, NSEC_PER_USEC);
        stats->ac_stimescaled = div_u64(st_scaled, NSEC_PER_USEC);

        stats->ac_minflt = tsk->min_flt;
        stats->ac_majflt = tsk->maj_flt;

        /* the BUILD_BUG_ON above guarantees ac_comm can hold tsk->comm */
        strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
}


#ifdef CONFIG_TASK_XACCT

/* Unit helpers private to xacct_add_tsk(); all are undefined again below. */
#define KB 1024
#define MB (1024*KB)
#define KB_MASK (~(KB-1))
/*
 * xacct_add_tsk - fill in the extended accounting fields of @stats from @p
 *
 * Memory integrals are converted to Mbyte-usec, the high-water marks to
 * KB, and the I/O counters are rounded down to a multiple of 1024 with
 * KB_MASK.  The high-water fields are only written when @p still has an
 * mm; otherwise they are left as the caller provided them.
 */
void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
        struct mm_struct *mm;

        /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
        stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
        do_div(stats->coremem, 1000 * KB);
        stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
        do_div(stats->virtmem, 1000 * KB);
        /* get_task_mm() takes a reference that mmput() drops again */
        mm = get_task_mm(p);
        if (mm) {
                /* adjust to KB unit */
                stats->hiwater_rss   = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
                stats->hiwater_vm    = get_mm_hiwater_vm(mm)  * PAGE_SIZE / KB;
                mmput(mm);
        }
        stats->read_char        = p->ioac.rchar & KB_MASK;
        stats->write_char        = p->ioac.wchar & KB_MASK;
        /*
         * NOTE(review): the syscall counts are masked with the same
         * KB_MASK as the byte counts, i.e. rounded down to a multiple of
         * 1024 calls - confirm this coarsening is intended.
         */
        stats->read_syscalls        = p->ioac.syscr & KB_MASK;
        stats->write_syscalls        = p->ioac.syscw & KB_MASK;
#ifdef CONFIG_TASK_IO_ACCOUNTING
        stats->read_bytes        = p->ioac.read_bytes & KB_MASK;
        stats->write_bytes        = p->ioac.write_bytes & KB_MASK;
        stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
#else
        stats->read_bytes        = 0;
        stats->write_bytes        = 0;
        stats->cancelled_write_bytes = 0;
#endif
}
/* KB_MASK expands to KB, so it must not outlive KB. */
#undef KB_MASK
#undef KB
#undef MB

/*
 * Accumulate the rss and vm memory integrals of @tsk for the cpu time
 * consumed since the last update.  Does nothing for tasks without an mm
 * or when less than TICK_NSEC has been consumed since then.
 */
static void __acct_update_integrals(struct task_struct *tsk,
                                    u64 utime, u64 stime)
{
        u64 total, elapsed;

        if (!likely(tsk->mm))
                return;

        total = utime + stime;
        elapsed = total - tsk->acct_timexpd;
        if (elapsed < TICK_NSEC)
                return;

        tsk->acct_timexpd = total;

        /*
         * The products are scaled down by 1024 with a shift, which keeps
         * the running sums smaller and avoids a division.  Userspace is
         * finally given Mbyte-usecs; the remaining conversion happens in
         * xacct_add_tsk.
         */
        tsk->acct_rss_mem1 += (elapsed * get_mm_rss(tsk->mm)) >> 10;
        tsk->acct_vm_mem1 += (elapsed * tsk->mm->total_vm) >> 10;
}

/**
 * acct_update_integrals - update mm integral fields in task_struct
 * @tsk: task_struct for accounting
 *
 * Samples the task's cpu times and folds them into the integrals with
 * local interrupts disabled for the duration.
 */
void acct_update_integrals(struct task_struct *tsk)
{
        unsigned long irqflags;
        u64 ut, st;

        local_irq_save(irqflags);
        task_cputime(tsk, &ut, &st);
        __acct_update_integrals(tsk, ut, st);
        local_irq_restore(irqflags);
}

/**
 * acct_account_cputime - update mm integral after cputime update
 * @tsk: task_struct for accounting
 */
void acct_account_cputime(struct task_struct *tsk)
{
        __acct_update_integrals(tsk, tsk->utime, tsk->stime);
}

/**
 * acct_clear_integrals - clear the mm integral fields in task_struct
 * @tsk: task_struct whose accounting fields are cleared
 *
 * Resets both memory integrals and the cpu time recorded at the last
 * integral update.
 */
void acct_clear_integrals(struct task_struct *tsk)
{
        tsk->acct_rss_mem1 = 0;
        tsk->acct_vm_mem1 = 0;
        tsk->acct_timexpd = 0;
}
#endif

















































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation 
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>        
 */

#ifndef _IP6_FIB_H
#define _IP6_FIB_H

#include <linux/ipv6_route.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/inetpeer.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>

/* Table hash size: 256 with multiple-table support, 1 without it. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_HASHSZ 256
#else
#define FIB6_TABLE_HASHSZ 1
#endif

/*
 * Debug level.  RT6_TRACE() only expands to pr_debug() at level 3 or
 * above, so with the value 2 set here it compiles to an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RT6_TRACE(x...) pr_debug(x)
#else
#define RT6_TRACE(x...) do { ; } while (0)
#endif

struct rt6_info;
struct fib6_info;

/*
 * Route configuration parameters.
 * NOTE(review): presumably filled in from a userspace route request
 * before the route is added or deleted - confirm against the callers.
 */
struct fib6_config {
        u32                fc_table;
        u32                fc_metric;
        int                fc_dst_len;
        int                fc_src_len;
        int                fc_ifindex;
        u32                fc_flags;
        u32                fc_protocol;
        u16                fc_type;        /* only 8 bits are used */
        /* one-bit flags packed into a single u16; 14 bits are spare */
        u16                fc_delete_all_nh : 1,
                        fc_ignore_dev_down:1,
                        __unused : 14;
        u32                fc_nh_id;

        struct in6_addr        fc_dst;
        struct in6_addr        fc_src;
        struct in6_addr        fc_prefsrc;
        struct in6_addr        fc_gateway;

        unsigned long        fc_expires;
        /* netlink attribute pointers, each paired with a length field */
        struct nlattr        *fc_mx;
        int                fc_mx_len;
        int                fc_mp_len;
        struct nlattr        *fc_mp;

        struct nl_info        fc_nlinfo;
        struct nlattr        *fc_encap;
        u16                fc_encap_type;
        bool                fc_is_fdb;
};

/*
 * Node of the fib6 tree.  Links to the parent, both children, the
 * optional subtree and the leaf route list are __rcu pointers; the
 * 'rcu' head is embedded for deferred handling of the node itself.
 */
struct fib6_node {
        struct fib6_node __rcu        *parent;
        struct fib6_node __rcu        *left;
        struct fib6_node __rcu        *right;
#ifdef CONFIG_IPV6_SUBTREES
        struct fib6_node __rcu        *subtree;
#endif
        struct fib6_info __rcu        *leaf;

        __u16                        fn_bit;                /* bit key */
        __u16                        fn_flags;        /* NOTE(review): presumably RTN_* bits - confirm */
        int                        fn_sernum;        /* read by fib6_get_cookie_safe() */
        struct fib6_info __rcu        *rr_ptr;
        struct rcu_head                rcu;
};

/*
 * Argument bundle for fib6 garbage collection.
 * NOTE(review): field semantics are not visible in this header - see the
 * gc implementation for how 'timeout' and 'more' are used.
 */
struct fib6_gc_args {
        int                        timeout;
        int                        more;
};

#ifndef CONFIG_IPV6_SUBTREES
#define FIB6_SUBTREE(fn)        NULL

/*
 * Without subtree support no route can require a source match: the
 * query is constant false and the counter helpers are empty.
 */
static inline bool fib6_routes_require_src(const struct net *net)
{
        return false;
}

static inline void fib6_routes_require_src_inc(struct net *net)
{
}

static inline void fib6_routes_require_src_dec(struct net *net)
{
}

#else

/* True while the per-netns count of source-specific routes is positive. */
static inline bool fib6_routes_require_src(const struct net *net)
{
        return net->ipv6.fib6_routes_require_src > 0;
}

/* Plain (non-atomic) increment of the per-netns counter. */
static inline void fib6_routes_require_src_inc(struct net *net)
{
        net->ipv6.fib6_routes_require_src += 1;
}

/* Plain (non-atomic) decrement of the per-netns counter. */
static inline void fib6_routes_require_src_dec(struct net *net)
{
        net->ipv6.fib6_routes_require_src -= 1;
}

#define FIB6_SUBTREE(fn)        (rcu_dereference_protected((fn)->subtree, 1))
#endif

/*
 *        routing information
 *
 */

/* Address plus prefix length, used for a route's dst, src and prefsrc keys. */
struct rt6key {
        struct in6_addr        addr;
        int                plen;
};

struct fib6_table;

/*
 * One hash bucket of the route exception table.
 * NOTE(review): 'depth' presumably counts the entries on 'chain' and is
 * bounded by FIB6_MAX_DEPTH - confirm in the exception table code.
 */
struct rt6_exception_bucket {
        struct hlist_head        chain;
        int                        depth;
};

/*
 * Route exception entry: hash-list linkage, the rt6_info it refers to, a
 * timestamp and an embedded rcu head.
 */
struct rt6_exception {
        struct hlist_node        hlist;
        struct rt6_info                *rt6i;
        unsigned long                stamp;
        struct rcu_head                rcu;
};

/* Exception table sizing: 1 << 10 = 1024 buckets; depth limit of 5. */
#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
#define FIB6_MAX_DEPTH 5

/*
 * IPv6 nexthop: the common nexthop part plus per-cpu rt6_info pointers
 * and the RCU-managed exception bucket array.
 */
struct fib6_nh {
        struct fib_nh_common        nh_common;

#ifdef CONFIG_IPV6_ROUTER_PREF
        unsigned long                last_probe;
#endif

        struct rt6_info * __percpu *rt6i_pcpu;
        struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
};

/*
 * A FIB entry.  Reference counted through fib6_ref (see fib6_info_hold()
 * and fib6_info_release()) and freed via the embedded rcu head.  Ends in
 * a flexible array of fib6_nh, so nothing may follow that member.
 */
struct fib6_info {
        struct fib6_table                *fib6_table;
        struct fib6_info __rcu                *fib6_next;
        struct fib6_node __rcu                *fib6_node;

        /* Multipath routes:
         * siblings is a list of fib6_info that have the same metric/weight,
         * destination, but not the same gateway. nsiblings is just a cache
         * to speed up lookup.
         */
        union {
                struct list_head        fib6_siblings;
                struct list_head        nh_list;
        };
        unsigned int                        fib6_nsiblings;

        refcount_t                        fib6_ref;
        /* only meaningful while RTF_EXPIRES is set in fib6_flags */
        unsigned long                        expires;
        struct dst_metrics                *fib6_metrics;
#define fib6_pmtu                fib6_metrics->metrics[RTAX_MTU-1]

        struct rt6key                        fib6_dst;
        u32                                fib6_flags;
        struct rt6key                        fib6_src;
        struct rt6key                        fib6_prefsrc;

        u32                                fib6_metric;
        u8                                fib6_protocol;
        u8                                fib6_type;
        /* one-bit flags; offload/trap are set by fib6_info_hw_flags_set() */
        u8                                should_flush:1,
                                        dst_nocount:1,
                                        dst_nopolicy:1,
                                        fib6_destroying:1,
                                        offload:1,
                                        trap:1,
                                        unused:2;

        struct rcu_head                        rcu;
        struct nexthop                        *nh;
        struct fib6_nh                        fib6_nh[];
};

/*
 * IPv6 route (dst) entry.  'dst' must stay the first member: ip6_rt_put()
 * enforces this with a BUILD_BUG_ON and ip6_dst_idev() casts a dst_entry
 * pointer straight to struct rt6_info.
 */
struct rt6_info {
        struct dst_entry                dst;
        struct fib6_info __rcu                *from;
        /* when non-zero, returned directly by rt6_get_cookie() */
        int                                sernum;

        struct rt6key                        rt6i_dst;
        struct rt6key                        rt6i_src;
        struct in6_addr                        rt6i_gateway;
        struct inet6_dev                *rt6i_idev;
        u32                                rt6i_flags;

        struct list_head                rt6i_uncached;
        struct uncached_list                *rt6i_uncached_list;

        /* more non-fragment space at head required */
        unsigned short                        rt6i_nfheader_len;
};

/*
 * Lookup result bundle: nexthop, FIB entry, a flags/type pair and an
 * optional rt6_info.
 * NOTE(review): fib6_flags/fib6_type presumably mirror the fields of the
 * same name in struct fib6_info - confirm in the lookup code.
 */
struct fib6_result {
        struct fib6_nh                *nh;
        struct fib6_info        *f6i;
        u32                        fib6_flags;
        u8                        fib6_type;
        struct rt6_info                *rt6;
};

/*
 * Route list iterators.  Both assign to a variable named 'rt' that the
 * caller must have declared, and follow the fib6_next chain until NULL.
 *
 * for_each_fib6_node_rt_rcu() starts at (fn)->leaf and uses
 * rcu_dereference() for every step; for_each_fib6_walker_rt() starts at
 * (w)->leaf and uses rcu_dereference_protected() with a constant-true
 * condition, so the caller is responsible for holding whatever
 * protection makes that access safe.
 */
#define for_each_fib6_node_rt_rcu(fn)                                        \
        for (rt = rcu_dereference((fn)->leaf); rt;                        \
             rt = rcu_dereference(rt->fib6_next))

#define for_each_fib6_walker_rt(w)                                        \
        for (rt = (w)->leaf; rt;                                        \
             rt = rcu_dereference_protected(rt->fib6_next, 1))

/*
 * ip6_dst_idev - return the inet6_dev of the rt6_info that @dst is cast
 * to; this relies on @dst actually being the dst of an rt6_info.
 */
static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *)dst;

        return rt->rt6i_idev;
}

static inline bool fib6_requires_src(const struct fib6_info *rt)
{
        return rt->fib6_src.plen > 0;
}

/*
 * fib6_clean_expires - drop the expiry of @f6i
 *
 * Clears RTF_EXPIRES first and only then zeroes the expiry time.
 * NOTE(review): fib6_check_expired() tests the flag before reading
 * 'expires'; keep this store order unless the locking rules are checked.
 */
static inline void fib6_clean_expires(struct fib6_info *f6i)
{
        f6i->fib6_flags &= ~RTF_EXPIRES;
        f6i->expires = 0;
}

/*
 * fib6_set_expires - give @f6i an expiry time
 * @expires: value stored in f6i->expires; fib6_check_expired() compares
 *           it against jiffies
 *
 * Stores the time first and only then sets RTF_EXPIRES.
 * NOTE(review): fib6_check_expired() tests the flag before reading
 * 'expires'; keep this store order unless the locking rules are checked.
 */
static inline void fib6_set_expires(struct fib6_info *f6i,
                                    unsigned long expires)
{
        f6i->expires = expires;
        f6i->fib6_flags |= RTF_EXPIRES;
}

/*
 * fib6_check_expired - true when @f6i has RTF_EXPIRES set and jiffies is
 * past its expiry time; entries without the flag never expire.
 */
static inline bool fib6_check_expired(const struct fib6_info *f6i)
{
        if (!(f6i->fib6_flags & RTF_EXPIRES))
                return false;

        return time_after(jiffies, f6i->expires);
}

/* Read the fn_sernum of the node @f6i is attached to into @cookie.
 * Return true if a node was found and @cookie was written.
 * Return false, leaving @cookie untouched, if @f6i has no node.
 */
static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
                                        u32 *cookie)
{
        struct fib6_node *node = rcu_dereference(f6i->fib6_node);

        if (!node)
                return false;

        *cookie = READ_ONCE(node->fn_sernum);
        /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */
        smp_rmb();

        return true;
}

/*
 * rt6_get_cookie - cookie for @rt
 *
 * A non-zero rt->sernum is returned as is.  Otherwise the cookie is
 * taken, under rcu_read_lock(), from the node of the fib6_info that @rt
 * came from; 0 is returned when there is no such entry or node.
 */
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
        struct fib6_info *origin;
        u32 sernum = 0;

        if (rt->sernum)
                return rt->sernum;

        rcu_read_lock();
        origin = rcu_dereference(rt->from);
        if (origin)
                fib6_get_cookie_safe(origin, &sernum);
        rcu_read_unlock();

        return sernum;
}

/*
 * ip6_rt_put - release the dst embedded in @rt
 *
 * @rt may be NULL: dst is the first member of struct rt6_info (checked
 * at build time below), so &rt->dst is then NULL as well, and
 * dst_release() accepts a NULL parameter.
 */
static inline void ip6_rt_put(struct rt6_info *rt)
{
        struct dst_entry *dst;

        BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);
        dst = &rt->dst;
        dst_release(dst);
}

/*
 * Allocation and RCU destruction of fib6_info (prototypes only).
 * fib6_info_destroy_rcu() is the callback fib6_info_release() hands to
 * call_rcu() when the last reference is dropped.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
void fib6_info_destroy_rcu(struct rcu_head *head);

static inline void fib6_info_hold(struct fib6_info *f6i)
{
        refcount_inc(&f6i->fib6_ref);
}

static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
{
        return refcount_inc_not_zero(&f6i->fib6_ref);
}

/*
 * fib6_info_release - drop a reference on @f6i (NULL is allowed)
 *
 * When the last reference goes away the entry is handed to call_rcu()
 * with fib6_info_destroy_rcu() as the callback.
 */
static inline void fib6_info_release(struct fib6_info *f6i)
{
        if (!f6i)
                return;

        if (refcount_dec_and_test(&f6i->fib6_ref))
                call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
}

/* fib6_info_hw_flags_set - store @offload and @trap in @f6i's flag bits. */
static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload,
                                          bool trap)
{
        f6i->trap = trap;
        f6i->offload = offload;
}

/*
 * States of a fib6 tree walk; FWS_S only exists with subtree support.
 * NOTE(review): the letters presumably stand for subtree, left, right,
 * current and up - confirm in the walker implementation.
 */
enum fib6_walk_state {
#ifdef CONFIG_IPV6_SUBTREES
        FWS_S,
#endif
        FWS_L,
        FWS_R,
        FWS_C,
        FWS_U
};

/*
 * State of one tree walk: list linkage, root and current node, current
 * leaf (the starting point of for_each_fib6_walker_rt()), the walk state,
 * skip/count bookkeeping, and a callback with its opaque argument.
 */
struct fib6_walker {
        struct list_head lh;
        struct fib6_node *root, *node;
        struct fib6_info *leaf;
        enum fib6_walk_state state;
        unsigned int skip;
        unsigned int count;
        unsigned int skip_in_node;
        /* callback invoked with the walker itself */
        int (*func)(struct fib6_walker *);
        void *args;
};

/* Counters describing the IPv6 FIB and its route objects. */
struct rt6_statistics {
        __u32                fib_nodes;                /* all fib6 nodes */
        __u32                fib_route_nodes;        /* intermediate nodes */
        __u32                fib_rt_entries;                /* rt entries in fib table */
        __u32                fib_rt_cache;                /* cached rt entries in exception table */
        __u32                fib_discarded_routes;        /* total number of routes deleted */

        /* The following stats are not protected by any lock */
        atomic_t        fib_rt_alloc;                /* total number of routes allocated */
        atomic_t        fib_rt_uncache;                /* rt entries in uncached list */
};

/*
 * RTN_* flag bits; each value is a distinct single bit.
 * NOTE(review): presumably kept in a flags member of struct fib6_node, with
 * RTN_TL_ROOT marking a top-level root - confirm against the users.
 */
#define RTN_TL_ROOT        0x0001
#define RTN_ROOT        0x0002                /* tree root node                */
#define RTN_RTINFO        0x0004                /* node with valid routing info        */

/*
 *        priority levels (or metrics)
 *
 */


/*
 * One IPv6 FIB table.
 * NOTE(review): the member notes are inferred from names and types only;
 * confirm against the users of this structure.
 */
struct fib6_table {
        struct hlist_node        tb6_hlist;        /* hash-list linkage */
        u32                        tb6_id;                /* table id, cf. fib6_get_table() */
        spinlock_t                tb6_lock;
        struct fib6_node        tb6_root;        /* root node, embedded by value */
        struct inet_peer_base        tb6_peers;
        unsigned int                flags;
        unsigned int                fib_seq;
#define RT6_TABLE_HAS_DFLT_ROUTER        BIT(0)        /* presumably a bit for ->flags - confirm */
};

/*
 * IPv6 table ids expressed through the generic RT_TABLE_* values.
 * The default, info and prefix tables are all aliases of the main table.
 */
#define RT6_TABLE_UNSPEC        RT_TABLE_UNSPEC
#define RT6_TABLE_MAIN                RT_TABLE_MAIN
#define RT6_TABLE_DFLT                RT6_TABLE_MAIN
#define RT6_TABLE_INFO                RT6_TABLE_MAIN
#define RT6_TABLE_PREFIX        RT6_TABLE_MAIN

/*
 * With CONFIG_IPV6_MULTIPLE_TABLES the valid id range is 1..RT_TABLE_MAX and
 * the local table has its own id.  Without it the range collapses to the
 * main table (FIB6_TABLE_MIN == FIB6_TABLE_MAX) and the local table is an
 * alias of the main one.
 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_MIN                1
#define FIB6_TABLE_MAX                RT_TABLE_MAX
#define RT6_TABLE_LOCAL                RT_TABLE_LOCAL
#else
#define FIB6_TABLE_MIN                RT_TABLE_MAIN
#define FIB6_TABLE_MAX                FIB6_TABLE_MIN
#define RT6_TABLE_LOCAL                RT6_TABLE_MAIN
#endif

/*
 * Signature of a route lookup routine as accepted by fib6_rule_lookup() and
 * pol_lookup_func(): it receives the netns, the table, the flow, a const skb
 * and an int (the value pol_lookup_func() forwards as "flags"), and returns
 * a struct rt6_info pointer.
 */
typedef struct rt6_info *(*pol_lookup_t)(struct net *,
                                         struct fib6_table *,
                                         struct flowi6 *,
                                         const struct sk_buff *, int);

/*
 * fib6 entry notification data: the generic fib_notifier_info, which must
 * remain the first member, followed by the route and a sibling count.
 * NOTE(review): presumably filled from the rt / nsiblings arguments of the
 * call_fib6_*entry_notifiers() helpers - confirm in their implementation.
 */
struct fib6_entry_notifier_info {
        struct fib_notifier_info info; /* must be first */
        struct fib6_info *rt;
        unsigned int nsiblings;
};

/*
 *        exported functions
 */

struct fib6_table *fib6_get_table(struct net *net, u32 id);
struct fib6_table *fib6_new_table(struct net *net, u32 id);
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup);

/* called with rcu lock held; can return error pointer
 * caller needs to select path
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags);

/* called with rcu lock held; caller needs to select path */
int fib6_table_lookup(struct net *net, struct fib6_table *table,
                      int oif, struct flowi6 *fl6, struct fib6_result *res,
                      int strict);

void fib6_select_path(const struct net *net, struct fib6_result *res,
                      struct flowi6 *fl6, int oif, bool have_oif_match,
                      const struct sk_buff *skb, int strict);
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr);

struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match);

void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
                    void *arg);
void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *arg),
                                void *arg);

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack);
int fib6_del(struct fib6_info *rt, struct nl_info *info);

static inline
void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
{
        const struct fib6_info *from;

        rcu_read_lock();

        from = rcu_dereference(rt->from);
        if (from) {
                *addr = from->fib6_prefsrc.addr;
        } else {
                struct in6_addr in6_zero = {};

                *addr = in6_zero;
        }

        rcu_read_unlock();
}

int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack);
void fib6_nh_release(struct fib6_nh *fib6_nh);
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh);

int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack);
int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack);
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt);
void fib6_rt_update(struct net *net, struct fib6_info *rt,
                    struct nl_info *info);
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int flags);

void fib6_run_gc(unsigned long expires, struct net *net, bool force);

void fib6_gc_cleanup(void);

int fib6_init(void);

/*
 * Iterator state bundling a seq_net_private, an embedded fib6_walker, a
 * skip offset, the table being visited and a serial number.
 * NOTE(review): presumably the private data behind ipv6_route_seq_ops -
 * confirm in the seq_file implementation.
 */
struct ipv6_route_iter {
        struct seq_net_private p;
        struct fib6_walker w;
        loff_t skip;
        struct fib6_table *tbl;
        int sernum;
};

extern const struct seq_operations ipv6_route_seq_ops;

int call_fib6_notifier(struct notifier_block *nb,
                       enum fib_event_type event_type,
                       struct fib_notifier_info *info);
int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
                        struct fib_notifier_info *info);

int __net_init fib6_notifier_init(struct net *net);
void __net_exit fib6_notifier_exit(struct net *net);

unsigned int fib6_tables_seq_read(struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack);

void fib6_update_sernum(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i);

void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
{
        return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
}

/*
 * BPF iterator context carrying a bpf_iter_meta pointer and a fib6_info
 * pointer.  Only defined when IPv6 is built in and CONFIG_BPF_SYSCALL is
 * enabled.
 */
#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
struct bpf_iter__ipv6_route {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct fib6_info *, rt);
};
#endif

/*
 * Declarations of the four lookup routines that pol_lookup_func() lists as
 * candidates in its INDIRECT_CALL_4() invocation.  All of them share the
 * pol_lookup_t signature.
 */
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
/*
 * pol_lookup_func - invoke @lookup with the given arguments
 *
 * The call goes through INDIRECT_CALL_4(), which is handed the four known
 * lookup routines as candidates for @lookup.  Returns whatever the lookup
 * routine returns.
 */
static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup,
                                                struct net *net,
                                                struct fib6_table *table,
                                                struct flowi6 *fl6,
                                                const struct sk_buff *skb,
                                                int flags)
{
        struct rt6_info *rt;

        rt = INDIRECT_CALL_4(lookup,
                             ip6_pol_route_output,
                             ip6_pol_route_input,
                             ip6_pol_route_lookup,
                             __ip6_route_redirect,
                             net, table, fl6, skb, flags);

        return rt;
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Return the fib6_has_custom_rules flag kept in @net's ipv6 state. */
static inline bool fib6_has_custom_rules(const struct net *net)
{
        bool custom = net->ipv6.fib6_has_custom_rules;

        return custom;
}

int fib6_rules_init(void);
void fib6_rules_cleanup(void);
bool fib6_rule_default(const struct fib_rule *rule);
int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                    struct netlink_ext_ack *extack);
unsigned int fib6_rules_seq_read(struct net *net);

/*
 * fib6_rules_early_flow_dissect - dissect @skb early if the rules need it
 *
 * Does nothing and returns false unless @net has
 * fib6_rules_require_fldissect set.  Otherwise @skb is dissected into
 * @flkeys with FLOW_DISSECTOR_F_STOP_AT_ENCAP, the protocol and the source
 * and destination ports are copied from @flkeys into @fl6, and true is
 * returned.
 */
static inline bool fib6_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi6 *fl6,
                                                 struct flow_keys *flkeys)
{
        if (net->ipv6.fib6_rules_require_fldissect) {
                skb_flow_dissect_flow_keys(skb, flkeys,
                                           FLOW_DISSECTOR_F_STOP_AT_ENCAP);

                fl6->flowi6_proto = flkeys->basic.ip_proto;
                fl6->fl6_sport = flkeys->ports.src;
                fl6->fl6_dport = flkeys->ports.dst;

                return true;
        }

        return false;
}
#else
/*
 * Fallbacks for builds without CONFIG_IPV6_MULTIPLE_TABLES.  None of them
 * looks at its arguments; each one just returns a constant.
 */

/* Always false. */
static inline bool fib6_has_custom_rules(const struct net *net)
{
        return false;
}

/* Nothing to set up; always returns 0. */
static inline int fib6_rules_init(void)
{
        return 0;
}

/* Nothing to tear down. */
static inline void fib6_rules_cleanup(void)
{
}

/* Always true. */
static inline bool fib6_rule_default(const struct fib_rule *rule)
{
        return true;
}

/* Nothing to dump; always returns 0. */
static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                                  struct netlink_ext_ack *extack)
{
        return 0;
}

/* Always 0. */
static inline unsigned int fib6_rules_seq_read(struct net *net)
{
        return 0;
}

/* No dissection is performed; @fl6 and @flkeys are left untouched. */
static inline bool fib6_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi6 *fl6,
                                                 struct flow_keys *flkeys)
{
        return false;
}
#endif
#endif





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cfg80211

#if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __RDEV_OPS_TRACE

#include <linux/tracepoint.h>

#include <linux/rtnetlink.h>
#include <linux/etherdevice.h>
#include <net/cfg80211.h>
#include "core.h"

/*
 * Helpers for carrying a MAC address in a trace entry.
 * MAC_ENTRY declares an ETH_ALEN-byte u8 array named entry_mac.
 * MAC_ASSIGN copies ETH_ALEN bytes from given_mac into it, or zeroes the
 * array when given_mac is NULL; given_mac is evaluated twice in the
 * non-NULL case.  MAC_PR_FMT / MAC_PR_ARG print the stored address with %pM.
 */
#define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN)
#define MAC_ASSIGN(entry_mac, given_mac) do {                             \
        if (given_mac)                                                     \
                memcpy(__entry->entry_mac, given_mac, ETH_ALEN);     \
        else                                                             \
                eth_zero_addr(__entry->entry_mac);                     \
        } while (0)
#define MAC_PR_FMT "%pM"
#define MAC_PR_ARG(entry_mac) (__entry->entry_mac)

/*
 * Wiphy name helpers for trace entries.
 *
 * MAXNAME is the size of the wiphy_name field; WIPHY_ENTRY and
 * WIPHY_ASSIGN both use it so the array size and the copy bound cannot
 * drift apart.  WIPHY_ASSIGN expects a variable named "wiphy" in scope
 * and uses strscpy(), which always NUL-terminates the destination and
 * never reads more than MAXNAME bytes of the source.
 */
#define MAXNAME                32
#define WIPHY_ENTRY        __array(char, wiphy_name, MAXNAME)
#define WIPHY_ASSIGN        strscpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME)
#define WIPHY_PR_FMT        "%s"
#define WIPHY_PR_ARG        __entry->wiphy_name

/*
 * Wireless-device helpers for trace entries.
 *
 * WDEV_ENTRY declares a u32 "id" field.  WDEV_ASSIGN expects a variable
 * named "wdev" in scope and stores wdev->identifier, or 0 when wdev is
 * NULL or an error pointer (IS_ERR_OR_NULL).
 */
#define WDEV_ENTRY        __field(u32, id)
#define WDEV_ASSIGN        (__entry->id) = (!IS_ERR_OR_NULL(wdev)        \
                                         ? wdev->identifier : 0)
#define WDEV_PR_FMT        "wdev(%u)"
#define WDEV_PR_ARG        (__entry->id)

/*
 * Net-device helpers for trace entries.
 *
 * NETDEV_ENTRY declares an IFNAMSIZ-byte "name" array and an int
 * "ifindex".  NETDEV_ASSIGN expects a variable named "netdev" in scope
 * and dereferences it unconditionally (no NULL check), copying IFNAMSIZ
 * bytes of netdev->name and the interface index.
 */
#define NETDEV_ENTRY        __array(char, name, IFNAMSIZ) \
                        __field(int, ifindex)
#define NETDEV_ASSIGN                                               \
        do {                                                       \
                memcpy(__entry->name, netdev->name, IFNAMSIZ); \
                (__entry->ifindex) = (netdev->ifindex);               \
        } while (0)
#define NETDEV_PR_FMT        "netdev:%s(%d)"
#define NETDEV_PR_ARG        __entry->name, __entry->ifindex

/*
 * Mesh configuration helpers for trace entries.
 *
 * MESH_CFG_ENTRY declares one entry field per mesh configuration
 * parameter listed below.  MESH_CFG_ASSIGN expects a variable named
 * "conf" in scope, dereferences it unconditionally, and copies each
 * parameter into the entry field of the same name.  The two lists must
 * be kept in sync when a parameter is added.
 */
#define MESH_CFG_ENTRY __field(u16, dot11MeshRetryTimeout)                   \
                       __field(u16, dot11MeshConfirmTimeout)                   \
                       __field(u16, dot11MeshHoldingTimeout)                   \
                       __field(u16, dot11MeshMaxPeerLinks)                   \
                       __field(u8, dot11MeshMaxRetries)                           \
                       __field(u8, dot11MeshTTL)                           \
                       __field(u8, element_ttl)                                   \
                       __field(bool, auto_open_plinks)                           \
                       __field(u32, dot11MeshNbrOffsetMaxNeighbor)           \
                       __field(u8, dot11MeshHWMPmaxPREQretries)                   \
                       __field(u32, path_refresh_time)                           \
                       __field(u32, dot11MeshHWMPactivePathTimeout)           \
                       __field(u16, min_discovery_timeout)                   \
                       __field(u16, dot11MeshHWMPpreqMinInterval)           \
                       __field(u16, dot11MeshHWMPperrMinInterval)           \
                       __field(u16, dot11MeshHWMPnetDiameterTraversalTime) \
                       __field(u8, dot11MeshHWMPRootMode)                   \
                       __field(u16, dot11MeshHWMPRannInterval)                   \
                       __field(bool, dot11MeshGateAnnouncementProtocol)           \
                       __field(bool, dot11MeshForwarding)                   \
                       __field(s32, rssi_threshold)                           \
                       __field(u16, ht_opmode)                                   \
                       __field(u32, dot11MeshHWMPactivePathToRootTimeout)  \
                       __field(u16, dot11MeshHWMProotInterval)                   \
                       __field(u16, dot11MeshHWMPconfirmationInterval)           \
                       __field(bool, dot11MeshNolearn)
#define MESH_CFG_ASSIGN                                                              \
        do {                                                                      \
                __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \
                __entry->dot11MeshConfirmTimeout =                              \
                                conf->dot11MeshConfirmTimeout;                      \
                __entry->dot11MeshHoldingTimeout =                              \
                                conf->dot11MeshHoldingTimeout;                      \
                __entry->dot11MeshMaxPeerLinks = conf->dot11MeshMaxPeerLinks; \
                __entry->dot11MeshMaxRetries = conf->dot11MeshMaxRetries;     \
                __entry->dot11MeshTTL = conf->dot11MeshTTL;                      \
                __entry->element_ttl = conf->element_ttl;                      \
                __entry->auto_open_plinks = conf->auto_open_plinks;              \
                __entry->dot11MeshNbrOffsetMaxNeighbor =                      \
                                conf->dot11MeshNbrOffsetMaxNeighbor;              \
                __entry->dot11MeshHWMPmaxPREQretries =                              \
                                conf->dot11MeshHWMPmaxPREQretries;              \
                __entry->path_refresh_time = conf->path_refresh_time;              \
                __entry->dot11MeshHWMPactivePathTimeout =                      \
                                conf->dot11MeshHWMPactivePathTimeout;              \
                __entry->min_discovery_timeout = conf->min_discovery_timeout; \
                __entry->dot11MeshHWMPpreqMinInterval =                              \
                                conf->dot11MeshHWMPpreqMinInterval;              \
                __entry->dot11MeshHWMPperrMinInterval =                              \
                                conf->dot11MeshHWMPperrMinInterval;              \
                __entry->dot11MeshHWMPnetDiameterTraversalTime =              \
                                conf->dot11MeshHWMPnetDiameterTraversalTime;  \
                __entry->dot11MeshHWMPRootMode = conf->dot11MeshHWMPRootMode; \
                __entry->dot11MeshHWMPRannInterval =                              \
                                conf->dot11MeshHWMPRannInterval;              \
                __entry->dot11MeshGateAnnouncementProtocol =                      \
                                conf->dot11MeshGateAnnouncementProtocol;      \
                __entry->dot11MeshForwarding = conf->dot11MeshForwarding;     \
                __entry->rssi_threshold = conf->rssi_threshold;                      \
                __entry->ht_opmode = conf->ht_opmode;                              \
                __entry->dot11MeshHWMPactivePathToRootTimeout =                      \
                                conf->dot11MeshHWMPactivePathToRootTimeout;   \
                __entry->dot11MeshHWMProotInterval =                              \
                                conf->dot11MeshHWMProotInterval;              \
                __entry->dot11MeshHWMPconfirmationInterval =                      \
                                conf->dot11MeshHWMPconfirmationInterval;      \
                __entry->dot11MeshNolearn = conf->dot11MeshNolearn;              \
        } while (0)

/*
 * Single-channel helpers for trace entries.
 *
 * CHAN_ENTRY declares the band, center frequency and frequency offset
 * fields.  CHAN_ASSIGN(chan) copies them from @chan, or stores zeros
 * when @chan is NULL.  @chan is parenthesized at every use so that any
 * pointer expression (e.g. "&local" or "a ? b : c") expands correctly,
 * matching CHAN_DEF_ASSIGN.  Note that @chan is evaluated several times.
 * CHAN_PR_FMT prints the center frequency and its offset as
 * "<center_freq>.<3-digit freq_offset>".
 */
#define CHAN_ENTRY __field(enum nl80211_band, band) \
                   __field(u32, center_freq)                \
                   __field(u16, freq_offset)
#define CHAN_ASSIGN(chan)                                          \
        do {                                                          \
                if ((chan)) {                                          \
                        __entry->band = (chan)->band;                  \
                        __entry->center_freq = (chan)->center_freq; \
                        __entry->freq_offset = (chan)->freq_offset; \
                } else {                                          \
                        __entry->band = 0;                          \
                        __entry->center_freq = 0;                  \
                        __entry->freq_offset = 0;                  \
                }                                                  \
        } while (0)
#define CHAN_PR_FMT "band: %d, freq: %u.%03u"
#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset

/*
 * Channel-definition helpers for trace entries.
 *
 * CHAN_DEF_ENTRY declares the fields describing a channel definition:
 * band, control frequency and its offset, width, first center frequency
 * and its offset, and second center frequency.
 * CHAN_DEF_ASSIGN(chandef) fills them from @chandef and @chandef->chan;
 * when either pointer is NULL every field is set to 0.  @chandef is
 * evaluated several times.
 * CHAN_DEF_PR_FMT prints the two frequency/offset pairs as
 * "<freq>.<3-digit offset>"; the argument order in CHAN_DEF_PR_ARG
 * must match the format.
 */
#define CHAN_DEF_ENTRY __field(enum nl80211_band, band)                \
                       __field(u32, control_freq)                        \
                       __field(u32, freq_offset)                        \
                       __field(u32, width)                                \
                       __field(u32, center_freq1)                        \
                       __field(u32, freq1_offset)                        \
                       __field(u32, center_freq2)
#define CHAN_DEF_ASSIGN(chandef)                                        \
        do {                                                                \
                if ((chandef) && (chandef)->chan) {                        \
                        __entry->band = (chandef)->chan->band;                \
                        __entry->control_freq =                                \
                                (chandef)->chan->center_freq;                \
                        __entry->freq_offset =                                \
                                (chandef)->chan->freq_offset;                \
                        __entry->width = (chandef)->width;                \
                        __entry->center_freq1 = (chandef)->center_freq1;\
                        __entry->freq1_offset = (chandef)->freq1_offset;\
                        __entry->center_freq2 = (chandef)->center_freq2;\
                } else {                                                \
                        __entry->band = 0;                                \
                        __entry->control_freq = 0;                        \
                        __entry->freq_offset = 0;                        \
                        __entry->width = 0;                                \
                        __entry->center_freq1 = 0;                        \
                        __entry->freq1_offset = 0;                        \
                        __entry->center_freq2 = 0;                        \
                }                                                        \
        } while (0)
#define CHAN_DEF_PR_FMT                                                        \
        "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u"
#define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq,                \
                        __entry->freq_offset, __entry->width,                \
                        __entry->center_freq1, __entry->freq1_offset,        \
                        __entry->center_freq2

/*
 * Station-info helpers for trace entries.
 *
 * SINFO_ENTRY declares one entry field per recorded station statistic.
 * SINFO_ASSIGN expects a variable named "sinfo" in scope, dereferences
 * it unconditionally, and copies each statistic into the field of the
 * same name.
 *
 * NOTE(review): the counters are stored as u32 here; if the
 * corresponding sinfo members are wider (e.g. 64-bit byte counters) the
 * values are truncated on assignment - confirm against the definition
 * of the sinfo structure.
 */
#define SINFO_ENTRY __field(int, generation)            \
                    __field(u32, connected_time)    \
                    __field(u32, inactive_time)            \
                    __field(u32, rx_bytes)            \
                    __field(u32, tx_bytes)            \
                    __field(u32, rx_packets)            \
                    __field(u32, tx_packets)            \
                    __field(u32, tx_retries)            \
                    __field(u32, tx_failed)            \
                    __field(u32, rx_dropped_misc)   \
                    __field(u32, beacon_loss_count) \
                    __field(u16, llid)                    \
                    __field(u16, plid)                    \
                    __field(u8, plink_state)
#define SINFO_ASSIGN                                                       \
        do {                                                               \
                __entry->generation = sinfo->generation;               \
                __entry->connected_time = sinfo->connected_time;       \
                __entry->inactive_time = sinfo->inactive_time;               \
                __entry->rx_bytes = sinfo->rx_bytes;                       \
                __entry->tx_bytes = sinfo->tx_bytes;                       \
                __entry->rx_packets = sinfo->rx_packets;               \
                __entry->tx_packets = sinfo->tx_packets;               \
                __entry->tx_retries = sinfo->tx_retries;               \
                __entry->tx_failed = sinfo->tx_failed;                       \
                __entry->rx_dropped_misc = sinfo->rx_dropped_misc;     \
                __entry->beacon_loss_count = sinfo->beacon_loss_count; \
                __entry->llid = sinfo->llid;                               \
                __entry->plid = sinfo->plid;                               \
                __entry->plink_state = sinfo->plink_state;               \
        } while (0)

/*
 * Map a truth value to the string "true" or "false" for printing.
 * The whole expansion is parenthesized so the macro can be used inside
 * larger expressions without operator-precedence surprises.
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

/*
 * QoS map helpers for trace entries.
 *
 * QOS_MAP_ENTRY declares the exception count (num_des), a byte array of
 * 2 * IEEE80211_QOS_MAP_MAX_EX bytes for the DSCP exceptions and a byte
 * array of IEEE80211_QOS_MAP_LEN_MIN bytes for the "up" table.
 * QOS_MAP_ASSIGN(qos_map) copies those from @qos_map, or zeroes all
 * three fields when @qos_map is NULL.  @qos_map is evaluated several
 * times.
 */
#define QOS_MAP_ENTRY __field(u8, num_des)                        \
                      __array(u8, dscp_exception,                \
                              2 * IEEE80211_QOS_MAP_MAX_EX)        \
                      __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN)
#define QOS_MAP_ASSIGN(qos_map)                                        \
        do {                                                        \
                if ((qos_map)) {                                \
                        __entry->num_des = (qos_map)->num_des;        \
                        memcpy(__entry->dscp_exception,                \
                               &(qos_map)->dscp_exception,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memcpy(__entry->up, &(qos_map)->up,        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                } else {                                        \
                        __entry->num_des = 0;                        \
                        memset(__entry->dscp_exception, 0,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memset(__entry->up, 0,                        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                }                                                \
        } while (0)

/*************************************************************
 *                        rdev->ops traces                     *
 *************************************************************/

/*
 * rdev_suspend - trace event recording a suspend request.
 *
 * @wiphy: device whose name is recorded
 * @wow:   wake-on-WLAN configuration, may be NULL
 *
 * Records the individual WoWLAN trigger flags from @wow.  valid_wow
 * tells whether a configuration was supplied; when it was not, the
 * output is tagged "(Not configured!)" and every flag is recorded as
 * false so the printed values are well defined.
 */
TRACE_EVENT(rdev_suspend,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_wowlan *wow),
        TP_ARGS(wiphy, wow),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, any)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(bool, valid_wow)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (wow) {
                        __entry->any = wow->any;
                        __entry->disconnect = wow->disconnect;
                        __entry->magic_pkt = wow->magic_pkt;
                        __entry->gtk_rekey_failure = wow->gtk_rekey_failure;
                        __entry->eap_identity_req = wow->eap_identity_req;
                        __entry->four_way_handshake = wow->four_way_handshake;
                        __entry->rfkill_release = wow->rfkill_release;
                        __entry->valid_wow = true;
                } else {
                        /*
                         * TP_printk prints every flag unconditionally, so
                         * none of them may be left unassigned.
                         */
                        __entry->any = false;
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        __entry->valid_wow = false;
                }
        ),
        TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, "
                  "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, "
                  "four way handshake: %d, rfkill release: %d.",
                  WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)",
                  __entry->any, __entry->disconnect, __entry->magic_pkt,
                  __entry->gtk_rekey_failure, __entry->eap_identity_req,
                  __entry->four_way_handshake, __entry->rfkill_release)
);

/*
 * rdev_return_int - trace event recording an integer return value.
 *
 * @wiphy: device whose name is recorded
 * @ret:   value recorded and printed as "returned: <ret>"
 */
TRACE_EVENT(rdev_return_int,
        TP_PROTO(struct wiphy *wiphy, int ret),
        TP_ARGS(wiphy, ret),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret)
);

/*
 * rdev_scan - trace event for a scan request.
 *
 * @wiphy:   device whose name is recorded
 * @request: scan request; accepted by the prototype but none of its
 *           contents are stored in the entry
 */
TRACE_EVENT(rdev_scan,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request),
        TP_ARGS(wiphy, request),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*
 * wiphy_only_evt - event class for events that carry only the wiphy.
 *
 * @wiphy: device whose name is recorded and printed
 */
DECLARE_EVENT_CLASS(wiphy_only_evt,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*
 * Events of class wiphy_only_evt: each takes only the wiphy and logs
 * its name.
 */
DEFINE_EVENT(wiphy_only_evt, rdev_resume,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_return_void,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

/*
 * wiphy_enabled_evt - event class carrying the wiphy and an on/off flag.
 *
 * @wiphy:   device whose name is recorded
 * @enabled: printed as "enabled " when true and "not enabled " when
 *           false (the format string ends with a space)
 */
DECLARE_EVENT_CLASS(wiphy_enabled_evt,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WIPHY_PR_FMT ", %senabled ",
                  WIPHY_PR_ARG, __entry->enabled ? "" : "not ")
);

/* rdev_set_wakeup - event of class wiphy_enabled_evt (wiphy + enabled flag). */
DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled)
);

/*
 * rdev_add_virtual_intf - trace event for adding a virtual interface.
 *
 * @wiphy: device whose name is recorded
 * @name:  requested interface name; "<noname>" is recorded when NULL.
 *         The same NULL fallback must be used in both __string() and
 *         __assign_str() so the reserved length matches the copied text.
 * @type:  interface type, printed as a number
 */
TRACE_EVENT(rdev_add_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type),
        TP_ARGS(wiphy, name, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __string(vir_intf_name, name ? name : "<noname>")
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __assign_str(vir_intf_name, name ? name : "<noname>");
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d",
                  WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type)
);

/*
 * wiphy_wdev_evt - event class carrying the wiphy and a wireless device.
 *
 * @wiphy: device whose name is recorded
 * @wdev:  wireless device; its identifier is recorded, or 0 when @wdev
 *         is NULL or an error pointer (see WDEV_ASSIGN)
 */
DECLARE_EVENT_CLASS(wiphy_wdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * wiphy_wdev_cookie_evt - event class carrying wiphy, wdev and a cookie.
 *
 * @wiphy:  device whose name is recorded
 * @wdev:   wireless device; its identifier is recorded, or 0 when @wdev
 *          is NULL or an error pointer (see WDEV_ASSIGN)
 * @cookie: unsigned 64-bit cookie; printed with %llu so values with the
 *          top bit set are not shown as negative numbers
 */
DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/*
 * Events of class wiphy_wdev_evt: each logs the wiphy name and the
 * wireless device identifier.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_change_virtual_intf - trace event for changing an interface type.
 *
 * @wiphy:  device whose name is recorded
 * @netdev: interface whose name and ifindex are recorded; dereferenced
 *          unconditionally by NETDEV_ASSIGN
 * @type:   interface type, printed as a number
 */
TRACE_EVENT(rdev_change_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 enum nl80211_iftype type),
        TP_ARGS(wiphy, netdev, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type)
);

/*
 * key_handle - event class for operations that address a single key.
 *
 * @wiphy:     device whose name is recorded
 * @netdev:    interface whose name and ifindex are recorded
 * @key_index: index of the key
 * @pairwise:  printed as "true"/"false"
 * @mac_addr:  MAC address associated with the key; may be NULL, in
 *             which case an all-zero address is recorded (MAC_ASSIGN)
 */
DECLARE_EVENT_CLASS(key_handle,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
                 bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(u8, key_index)
                __field(bool, pairwise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index,
                  BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr))
);

/*
 * Events of class key_handle: each logs wiphy, netdev, key index,
 * pairwise flag and MAC address.
 */
DEFINE_EVENT(key_handle, rdev_get_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
                 bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr)
);

DEFINE_EVENT(key_handle, rdev_del_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
                 bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr)
);

/*
 * rdev_add_key - trace event for adding a key.
 *
 * Records the same data as the key_handle class plus @mode.
 *
 * @wiphy:     device whose name is recorded
 * @netdev:    interface whose name and ifindex are recorded
 * @key_index: index of the key
 * @pairwise:  printed as "true"/"false"
 * @mac_addr:  MAC address associated with the key; may be NULL, in
 *             which case an all-zero address is recorded (MAC_ASSIGN)
 * @mode:      key mode, printed as an unsigned number
 */
TRACE_EVENT(rdev_add_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
                 bool pairwise, const u8 *mac_addr, u8 mode),
        TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(u8, key_index)
                __field(bool, pairwise)
                __field(u8, mode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
                __entry->mode = mode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, "
                  "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index,
                  __entry->mode, BOOL_TO_STR(__entry->pairwise),
                  MAC_PR_ARG(mac_addr))
);

/*
 * rdev_set_default_key - trace event for selecting the default key.
 *
 * @wiphy:     device whose name is recorded
 * @netdev:    interface whose name and ifindex are recorded
 * @key_index: index of the key
 * @unicast:   printed as "true"/"false"
 * @multicast: printed as "true"/"false"
 */
TRACE_EVENT(rdev_set_default_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
                 bool unicast, bool multicast),
        TP_ARGS(wiphy, netdev, key_index, unicast, multicast),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u8, key_index)
                __field(bool, unicast)
                __field(bool, multicast)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->key_index = key_index;
                __entry->unicast = unicast;
                __entry->multicast = multicast;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u, unicast: %s, multicast: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index,
                  BOOL_TO_STR(__entry->unicast),
                  BOOL_TO_STR(__entry->multicast))
);

/*
 * rdev_set_default_mgmt_key - trace event for selecting the default
 * management key.
 *
 * @wiphy:     device whose name is recorded
 * @netdev:    interface whose name and ifindex are recorded
 * @key_index: index of the key
 */
TRACE_EVENT(rdev_set_default_mgmt_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index),
        TP_ARGS(wiphy, netdev, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index)
);

/*
 * rdev_set_default_beacon_key - trace event for selecting the default
 * beacon key.
 *
 * @wiphy:     device whose name is recorded
 * @netdev:    interface whose name and ifindex are recorded
 * @key_index: index of the key
 */
TRACE_EVENT(rdev_set_default_beacon_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index),
        TP_ARGS(wiphy, netdev, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index)
);

/*
 * rdev_start_ap - trace event recording the settings used to start an AP.
 *
 * @wiphy:    device whose name is recorded
 * @netdev:   interface whose name and ifindex are recorded
 * @settings: AP settings; dereferenced unconditionally
 *
 * Records the channel definition, beacon interval, DTIM period, SSID,
 * hidden-SSID mode, WPA versions, privacy flag, auth type and
 * inactivity timeout.  The ssid field is one byte larger than
 * IEEE80211_MAX_SSID_LEN and is zeroed before the copy, so an SSID of
 * up to IEEE80211_MAX_SSID_LEN bytes stays NUL-terminated for "%s".
 *
 * NOTE(review): settings->ssid_len is not clamped here; presumably the
 * caller has already validated it against IEEE80211_MAX_SSID_LEN -
 * confirm.
 */
TRACE_EVENT(rdev_start_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ap_settings *settings),
        TP_ARGS(wiphy, netdev, settings),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(int, beacon_interval)
                __field(int, dtim_period)
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_hidden_ssid, hidden_ssid)
                __field(u32, wpa_ver)
                __field(bool, privacy)
                __field(enum nl80211_auth_type, auth_type)
                __field(int, inactivity_timeout)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&settings->chandef);
                __entry->beacon_interval = settings->beacon_interval;
                __entry->dtim_period = settings->dtim_period;
                __entry->hidden_ssid = settings->hidden_ssid;
                __entry->wpa_ver = settings->crypto.wpa_versions;
                __entry->privacy = settings->privacy;
                __entry->auth_type = settings->auth_type;
                __entry->inactivity_timeout = settings->inactivity_timeout;
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, settings->ssid, settings->ssid_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, "
                  CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, "
                  "hidden ssid: %d, wpa versions: %u, privacy: %s, "
                  "auth type: %d, inactivity timeout: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG,
                  __entry->beacon_interval, __entry->dtim_period,
                  __entry->hidden_ssid, __entry->wpa_ver,
                  BOOL_TO_STR(__entry->privacy), __entry->auth_type,
                  __entry->inactivity_timeout)
);

/*
 * rdev_change_beacon - trace event recording new beacon data.
 *
 * @wiphy:  device whose name is recorded
 * @netdev: interface whose name and ifindex are recorded
 * @info:   beacon data, may be NULL
 *
 * Each beacon component (head, tail, beacon IEs, probe response IEs,
 * association response IEs, probe response) gets a dynamic array sized
 * by its length, or of length 0 when @info is NULL.  A component is
 * copied only when its pointer is non-NULL.  The printed line contains
 * only the wiphy and netdev; the arrays are stored in the record but
 * not formatted by TP_printk.
 */
TRACE_EVENT(rdev_change_beacon,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_beacon_data *info),
        TP_ARGS(wiphy, netdev, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __dynamic_array(u8, head, info ? info->head_len : 0)
                __dynamic_array(u8, tail, info ? info->tail_len : 0)
                __dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0)
                __dynamic_array(u8, proberesp_ies,
                                info ? info->proberesp_ies_len : 0)
                __dynamic_array(u8, assocresp_ies,
                                info ? info->assocresp_ies_len : 0)
                __dynamic_array(u8, probe_resp, info ? info->probe_resp_len : 0)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                if (info) {
                        if (info->head)
                                memcpy(__get_dynamic_array(head), info->head,
                                       info->head_len);
                        if (info->tail)
                                memcpy(__get_dynamic_array(tail), info->tail,
                                       info->tail_len);
                        if (info->beacon_ies)
                                memcpy(__get_dynamic_array(beacon_ies),
                                       info->beacon_ies, info->beacon_ies_len);
                        if (info->proberesp_ies)
                                memcpy(__get_dynamic_array(proberesp_ies),
                                       info->proberesp_ies,
                                       info->proberesp_ies_len);
                        if (info->assocresp_ies)
                                memcpy(__get_dynamic_array(assocresp_ies),
                                       info->assocresp_ies,
                                       info->assocresp_ies_len);
                        if (info->probe_resp)
                                memcpy(__get_dynamic_array(probe_resp),
                                       info->probe_resp, info->probe_resp_len);
                }
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * wiphy_netdev_evt - event class carrying the wiphy and a net device.
 *
 * @wiphy:  device whose name is recorded
 * @netdev: interface whose name and ifindex are recorded; dereferenced
 *          unconditionally by NETDEV_ASSIGN
 */
DECLARE_EVENT_CLASS(wiphy_netdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * Events of class wiphy_netdev_evt: each logs the wiphy name and the
 * net device name/ifindex.
 */
DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac,
             TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
             TP_ARGS(wiphy, netdev)
);

/*
 * station_add_change - event class instantiated in this section by
 * rdev_add_station and rdev_change_station.
 *
 * Records the wiphy, the netdev and the station MAC address together with
 * a snapshot of the fields of @params listed in TP_STRUCT__entry.  Only a
 * subset is rendered by TP_printk (flags mask/set, modify mask, listen
 * interval, aid, plink action/state, uapsd queues and vlan name); the
 * remaining fields are stored in the record but are not part of the
 * printed text.
 *
 * WIPHY_*, NETDEV_* and MAC_* are helper macros defined outside this
 * section.
 */
DECLARE_EVENT_CLASS(station_add_change,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u32, sta_flags_mask)
                __field(u32, sta_flags_set)
                __field(u32, sta_modify_mask)
                __field(int, listen_interval)
                __field(u16, capability)
                __field(u16, aid)
                __field(u8, plink_action)
                __field(u8, plink_state)
                __field(u8, uapsd_queues)
                __field(u8, max_sp)
                __field(u8, opmode_notif)
                __field(bool, opmode_notif_used)
                /* fixed-size byte copies of the optional HT/VHT capability structs */
                __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
                __array(char, vlan, IFNAMSIZ)
                /* variable-length fields, sized from the lengths carried in params */
                __dynamic_array(u8, supported_rates,
                                params->supported_rates_len)
                __dynamic_array(u8, ext_capab, params->ext_capab_len)
                __dynamic_array(u8, supported_channels,
                                params->supported_channels_len)
                __dynamic_array(u8, supported_oper_classes,
                                params->supported_oper_classes_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->sta_flags_mask = params->sta_flags_mask;
                __entry->sta_flags_set = params->sta_flags_set;
                __entry->sta_modify_mask = params->sta_modify_mask;
                __entry->listen_interval = params->listen_interval;
                __entry->aid = params->aid;
                __entry->plink_action = params->plink_action;
                __entry->plink_state = params->plink_state;
                __entry->uapsd_queues = params->uapsd_queues;
                /*
                 * Optional pointers: zero the slot first so the record is
                 * fully initialised, then copy only if the pointer is set.
                 */
                memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
                if (params->ht_capa)
                        memcpy(__entry->ht_capa, params->ht_capa,
                               sizeof(struct ieee80211_ht_cap));
                memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
                if (params->vht_capa)
                        memcpy(__entry->vht_capa, params->vht_capa,
                               sizeof(struct ieee80211_vht_cap));
                memset(__entry->vlan, 0, sizeof(__entry->vlan));
                if (params->vlan)
                        memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
                /*
                 * Dynamic arrays: copy only when both the pointer and the
                 * length are non-zero.
                 */
                if (params->supported_rates && params->supported_rates_len)
                        memcpy(__get_dynamic_array(supported_rates),
                               params->supported_rates,
                               params->supported_rates_len);
                if (params->ext_capab && params->ext_capab_len)
                        memcpy(__get_dynamic_array(ext_capab),
                               params->ext_capab,
                               params->ext_capab_len);
                if (params->supported_channels &&
                    params->supported_channels_len)
                        memcpy(__get_dynamic_array(supported_channels),
                               params->supported_channels,
                               params->supported_channels_len);
                if (params->supported_oper_classes &&
                    params->supported_oper_classes_len)
                        memcpy(__get_dynamic_array(supported_oper_classes),
                               params->supported_oper_classes,
                               params->supported_oper_classes_len);
                __entry->max_sp = params->max_sp;
                __entry->capability = params->capability;
                __entry->opmode_notif = params->opmode_notif;
                __entry->opmode_notif_used = params->opmode_notif_used;
        ),
        /* Prints only a subset of the recorded fields (see header comment). */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
                  ", station flags mask: %u, station flags set: %u, "
                  "station modify mask: %u, listen interval: %d, aid: %u, "
                  "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
                  __entry->sta_flags_mask, __entry->sta_flags_set,
                  __entry->sta_modify_mask, __entry->listen_interval,
                  __entry->aid, __entry->plink_action, __entry->plink_state,
                  __entry->uapsd_queues, __entry->vlan)
);

/*
 * rdev_add_station - instance of the station_add_change event class.
 * Tracepoint arguments: the wiphy, the net_device, the station MAC address
 * and the station parameters.
 */
DEFINE_EVENT(station_add_change, rdev_add_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/*
 * rdev_change_station - instance of the station_add_change event class.
 * Tracepoint arguments: the wiphy, the net_device, the station MAC address
 * and the station parameters.
 */
DEFINE_EVENT(station_add_change, rdev_change_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/*
 * wiphy_netdev_mac_evt - event class recording a wiphy, a netdev and one
 * MAC address (stored as sta_mac, printed as "mac: ").
 *
 * Instantiated in this section by rdev_get_station, rdev_del_mpath and
 * rdev_set_wds_peer.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac))
);

/*
 * station_del - event class recording a station removal request.
 *
 * Stores the wiphy, the netdev and three fields taken from @params:
 * the station MAC address (params->mac), the subtype and the reason code.
 * Instantiated in this section by rdev_del_station.
 */
DECLARE_EVENT_CLASS(station_del,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u8, subtype)
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, params->mac);
                __entry->subtype = params->subtype;
                __entry->reason_code = params->reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
                  ", subtype: %u, reason_code: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
                  __entry->subtype, __entry->reason_code)
);

/*
 * rdev_del_station - instance of the station_del event class.
 * Tracepoint arguments: the wiphy, the net_device and the station
 * deletion parameters.
 */
DEFINE_EVENT(station_del, rdev_del_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/*
 * rdev_get_station - instance of the wiphy_netdev_mac_evt event class.
 * Tracepoint arguments: the wiphy, the net_device and a MAC address.
 */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/*
 * rdev_del_mpath - instance of the wiphy_netdev_mac_evt event class.
 * Tracepoint arguments: the wiphy, the net_device and a MAC address.
 */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/*
 * rdev_set_wds_peer - instance of the wiphy_netdev_mac_evt event class.
 * Tracepoint arguments: the wiphy, the net_device and a MAC address.
 */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/*
 * rdev_dump_station - records the wiphy, the netdev, a station MAC address
 * and the index passed as @_idx (stored and printed as "idx").
 */
TRACE_EVENT(rdev_dump_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *mac),
        TP_ARGS(wiphy, netdev, _idx, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", idx: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
                  __entry->idx)
);

/*
 * rdev_return_int_station_info - records the wiphy, an integer return
 * value and the station info captured by SINFO_ENTRY / SINFO_ASSIGN
 * (helper macros defined outside this section).
 *
 * Only the wiphy and the return value appear in the printed text.
 */
TRACE_EVENT(rdev_return_int_station_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct station_info *sinfo),
        TP_ARGS(wiphy, ret, sinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                SINFO_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                SINFO_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d",
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * mpath_evt - event class recording a wiphy, a netdev and two MAC
 * addresses: the destination and the next hop.
 *
 * Instantiated in this section by rdev_add_mpath, rdev_change_mpath and
 * rdev_get_mpath.
 */
DECLARE_EVENT_CLASS(mpath_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT ", next hop: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dst),
                  MAC_PR_ARG(next_hop))
);

/*
 * rdev_add_mpath - instance of the mpath_evt event class.
 * Tracepoint arguments: the wiphy, the net_device, the destination
 * address and the next-hop address.
 */
DEFINE_EVENT(mpath_evt, rdev_add_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/*
 * rdev_change_mpath - instance of the mpath_evt event class.
 * Tracepoint arguments: the wiphy, the net_device, the destination
 * address and the next-hop address.
 */
DEFINE_EVENT(mpath_evt, rdev_change_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/*
 * rdev_get_mpath - instance of the mpath_evt event class.
 * Tracepoint arguments: the wiphy, the net_device, the destination
 * address and the next-hop address.
 */
DEFINE_EVENT(mpath_evt, rdev_get_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/*
 * rdev_dump_mpath - records the wiphy, the netdev, the index passed as
 * @_idx and two MAC addresses (destination and next hop).
 */
TRACE_EVENT(rdev_dump_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *next_hop),
        TP_ARGS(wiphy, netdev, _idx, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
                  MAC_PR_FMT ", next hop: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst),
                  MAC_PR_ARG(next_hop))
);

/*
 * rdev_get_mpp - records the wiphy, the netdev and two MAC addresses:
 * the destination and the mpp address.
 */
TRACE_EVENT(rdev_get_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT
                  ", mpp: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG,
                  MAC_PR_ARG(dst), MAC_PR_ARG(mpp))
);

/*
 * rdev_dump_mpp - records the wiphy, the netdev, the index passed as
 * @_idx and two MAC addresses (destination and mpp).
 */
TRACE_EVENT(rdev_dump_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, _idx, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
                  MAC_PR_FMT ", mpp: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst),
                  MAC_PR_ARG(mpp))
);

/*
 * rdev_return_int_mpath_info - records the wiphy, an integer return value
 * and a field-by-field copy of the mpath_info passed as @pinfo.
 *
 * NOTE(review): @pinfo is dereferenced unconditionally, whatever the value
 * of @ret, so callers must always pass a valid pointer - confirm at the
 * call site.
 */
TRACE_EVENT(rdev_return_int_mpath_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo),
        TP_ARGS(wiphy, ret, pinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(int, generation)
                __field(u32, filled)
                __field(u32, frame_qlen)
                __field(u32, sn)
                __field(u32, metric)
                __field(u32, exptime)
                __field(u32, discovery_timeout)
                __field(u8, discovery_retries)
                __field(u8, flags)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->generation = pinfo->generation;
                __entry->filled = pinfo->filled;
                __entry->frame_qlen = pinfo->frame_qlen;
                __entry->sn = pinfo->sn;
                __entry->metric = pinfo->metric;
                __entry->exptime = pinfo->exptime;
                __entry->discovery_timeout = pinfo->discovery_timeout;
                __entry->discovery_retries = pinfo->discovery_retries;
                __entry->flags = pinfo->flags;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d. mpath info - generation: %d, "
                  "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u,"
                  " discovery timeout: %u, discovery retries: %u, flags: %u",
                  WIPHY_PR_ARG, __entry->ret, __entry->generation,
                  __entry->filled, __entry->frame_qlen, __entry->sn,
                  __entry->metric, __entry->exptime, __entry->discovery_timeout,
                  __entry->discovery_retries, __entry->flags)
);

/*
 * rdev_return_int_mesh_config - records the wiphy, an integer return value
 * and the mesh configuration captured by MESH_CFG_ENTRY / MESH_CFG_ASSIGN
 * (helper macros defined outside this section).
 *
 * Only the wiphy and the return value appear in the printed text.
 */
TRACE_EVENT(rdev_return_int_mesh_config,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mesh_config *conf),
        TP_ARGS(wiphy, ret, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                MESH_CFG_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d",
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * rdev_update_mesh_config - records the wiphy, the netdev, the update
 * mask and the mesh configuration captured by MESH_CFG_ENTRY /
 * MESH_CFG_ASSIGN (helper macros defined outside this section).
 *
 * The printed text contains the wiphy, the netdev and the mask only.
 */
TRACE_EVENT(rdev_update_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 mask,
                 const struct mesh_config *conf),
        TP_ARGS(wiphy, netdev, mask, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
                __field(u32, mask)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->mask = mask;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mask: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mask)
);

/*
 * rdev_join_mesh - records the wiphy, the netdev and the mesh
 * configuration captured by MESH_CFG_ENTRY / MESH_CFG_ASSIGN (helper
 * macros defined outside this section).
 *
 * @setup is part of the prototype but is not referenced by any line
 * visible here; presumably MESH_CFG_ASSIGN reads @conf only - confirm
 * against the macro definition.
 */
TRACE_EVENT(rdev_join_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct mesh_config *conf,
                 const struct mesh_setup *setup),
        TP_ARGS(wiphy, netdev, conf, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_change_bss - records the wiphy, the netdev and five fields of
 * @params (use_cts_prot, use_short_preamble, use_short_slot_time,
 * ap_isolate, ht_opmode), each stored as an int and printed with %d.
 */
TRACE_EVENT(rdev_change_bss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct bss_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, use_cts_prot)
                __field(int, use_short_preamble)
                __field(int, use_short_slot_time)
                __field(int, ap_isolate)
                __field(int, ht_opmode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->use_cts_prot = params->use_cts_prot;
                __entry->use_short_preamble = params->use_short_preamble;
                __entry->use_short_slot_time = params->use_short_slot_time;
                __entry->ap_isolate = params->ap_isolate;
                __entry->ht_opmode = params->ht_opmode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", use cts prot: %d, "
                  "use short preamble: %d, use short slot time: %d, "
                  "ap isolate: %d, ht opmode: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->use_cts_prot,
                  __entry->use_short_preamble, __entry->use_short_slot_time,
                  __entry->ap_isolate, __entry->ht_opmode)
);

/*
 * rdev_set_txq_params - records the wiphy, the netdev and the ac, txop,
 * cwmin, cwmax and aifs fields of @params.
 */
TRACE_EVENT(rdev_set_txq_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_txq_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_ac, ac)
                __field(u16, txop)
                __field(u16, cwmin)
                __field(u16, cwmax)
                __field(u8, aifs)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->ac = params->ac;
                __entry->txop = params->txop;
                __entry->cwmin = params->cwmin;
                __entry->cwmax = params->cwmax;
                __entry->aifs = params->aifs;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", ac: %d, txop: %u, cwmin: %u, cwmax: %u, aifs: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ac, __entry->txop,
                  __entry->cwmin, __entry->cwmax, __entry->aifs)
);

/*
 * rdev_libertas_set_mesh_channel - records the wiphy, the netdev and the
 * channel captured by CHAN_ENTRY / CHAN_ASSIGN (helper macros defined
 * outside this section).
 */
TRACE_EVENT(rdev_libertas_set_mesh_channel,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_channel *chan),
        TP_ARGS(wiphy, netdev, chan),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_PR_FMT, WIPHY_PR_ARG,
                  NETDEV_PR_ARG, CHAN_PR_ARG)
);

/*
 * rdev_set_monitor_channel - records the wiphy and the channel definition
 * captured by CHAN_DEF_ENTRY / CHAN_DEF_ASSIGN (helper macros defined
 * outside this section).
 */
TRACE_EVENT(rdev_set_monitor_channel,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_auth - records the wiphy, the netdev, the BSSID and the auth type
 * of an authentication request.
 */
TRACE_EVENT(rdev_auth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_auth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(enum nl80211_auth_type, auth_type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; record an all-zero BSSID in that case */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                __entry->auth_type = req->auth_type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type,
                  MAC_PR_ARG(bssid))
);

/*
 * rdev_assoc - records the wiphy, the netdev, the BSSID, the previous
 * BSSID, the use_mfp flag and the flags word of an association request.
 */
TRACE_EVENT(rdev_assoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_assoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                MAC_ENTRY(prev_bssid)
                __field(bool, use_mfp)
                __field(u32, flags)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; record an all-zero BSSID in that case */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                MAC_ASSIGN(prev_bssid, req->prev_bssid);
                __entry->use_mfp = req->use_mfp;
                __entry->flags = req->flags;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
                  ", previous bssid: " MAC_PR_FMT ", use mfp: %s, flags: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
                  MAC_PR_ARG(prev_bssid), BOOL_TO_STR(__entry->use_mfp),
                  __entry->flags)
);

/*
 * rdev_deauth - records the wiphy, the netdev, the BSSID (req->bssid) and
 * the reason code of a deauthentication request.
 */
TRACE_EVENT(rdev_deauth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_deauth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, req->bssid);
                __entry->reason_code = req->reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", reason: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
                  __entry->reason_code)
);

/*
 * rdev_disassoc - records the wiphy, the netdev, the BSSID, the reason
 * code and the local_state_change flag of a disassociation request.
 */
TRACE_EVENT(rdev_disassoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_disassoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
                __field(bool, local_state_change)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; record an all-zero BSSID in that case */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                __entry->reason_code = req->reason_code;
                __entry->local_state_change = req->local_state_change;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
                  ", reason: %u, local state change: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
                  __entry->reason_code,
                  BOOL_TO_STR(__entry->local_state_change))
);

/*
 * rdev_mgmt_tx_cancel_wait - records the wiphy, the wireless_dev and the
 * 64-bit cookie passed by the caller.
 */
TRACE_EVENT(rdev_mgmt_tx_cancel_wait,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu ",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_power_mgmt - records the wiphy, the netdev, the enabled flag
 * and the timeout value.
 *
 * The flag is printed as "enabled" when true and "not enabled" when false.
 */
TRACE_EVENT(rdev_set_power_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 bool enabled, int timeout),
        TP_ARGS(wiphy, netdev, enabled, timeout),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
                __field(int, timeout)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
                __entry->timeout = timeout;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %senabled, timeout: %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->enabled ? "" : "not ", __entry->timeout)
);

/*
 * rdev_connect - records the wiphy, the netdev and selected connection
 * parameters from @sme: BSSID, SSID, auth type, privacy flag, WPA
 * versions, flags and previous BSSID.
 */
TRACE_EVENT(rdev_connect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme),
        TP_ARGS(wiphy, netdev, sme),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                /* one byte larger than the maximum SSID so a NUL always fits */
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_auth_type, auth_type)
                __field(bool, privacy)
                __field(u32, wpa_versions)
                __field(u32, flags)
                MAC_ENTRY(prev_bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, sme->bssid);
                /*
                 * Zero the whole buffer, then copy ssid_len bytes, so the
                 * stored SSID is NUL-terminated for the %s below.
                 * NOTE(review): the copy length is not clamped here; this
                 * assumes callers guarantee
                 * ssid_len <= IEEE80211_MAX_SSID_LEN - confirm.
                 */
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, sme->ssid, sme->ssid_len);
                __entry->auth_type = sme->auth_type;
                __entry->privacy = sme->privacy;
                __entry->wpa_versions = sme->crypto.wpa_versions;
                __entry->flags = sme->flags;
                MAC_ASSIGN(prev_bssid, sme->prev_bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
                  ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
                  "flags: %u, previous bssid: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid,
                  __entry->auth_type, BOOL_TO_STR(__entry->privacy),
                  __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid))
);

/*
 * rdev_update_connect_params - records the wiphy, the netdev and the
 * "changed" bitmask passed by the caller.
 *
 * @sme is part of the prototype but is not referenced by any line of this
 * event.
 */
TRACE_EVENT(rdev_update_connect_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme, u32 changed),
        TP_ARGS(wiphy, netdev, sme, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed)
);

/*
 * rdev_set_cqm_rssi_config - records the wiphy, the netdev, the signed
 * RSSI threshold and the unsigned RSSI hysteresis passed by the caller.
 */
TRACE_EVENT(rdev_set_cqm_rssi_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 s32 rssi_thold, u32 rssi_hyst),
        TP_ARGS(wiphy, netdev, rssi_thold, rssi_hyst),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_thold)
                __field(u32, rssi_hyst)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_thold = rssi_thold;
                __entry->rssi_hyst = rssi_hyst;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", rssi_thold: %d, rssi_hyst: %u ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->rssi_thold, __entry->rssi_hyst)
);

/*
 * rdev_set_cqm_rssi_range_config - records the wiphy, the netdev and the
 * @low / @high values (stored as rssi_low / rssi_high, printed as a range).
 */
TRACE_EVENT(rdev_set_cqm_rssi_range_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev, s32 low, s32 high),
        TP_ARGS(wiphy, netdev, low, high),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_low)
                __field(s32, rssi_high)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_low = low;
                __entry->rssi_high = high;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", range: %d - %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->rssi_low, __entry->rssi_high)
);

/*
 * rdev_set_cqm_txe_config - records the wiphy, the netdev and the three
 * u32 values passed by the caller: rate, pkts (printed as "packets") and
 * intvl (printed as "interval").
 */
TRACE_EVENT(rdev_set_cqm_txe_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate,
                 u32 pkts, u32 intvl),
        TP_ARGS(wiphy, netdev, rate, pkts, intvl),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, rate)
                __field(u32, pkts)
                __field(u32, intvl)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rate = rate;
                __entry->pkts = pkts;
                __entry->intvl = intvl;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", rate: %u, packets: %u, interval: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->rate, __entry->pkts,
                  __entry->intvl)
);

/*
 * rdev_disconnect - records the wiphy, the netdev and the 16-bit reason
 * code passed by the caller.
 */
TRACE_EVENT(rdev_disconnect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 reason_code),
        TP_ARGS(wiphy, netdev, reason_code),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->reason_code = reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", reason code: %u", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, __entry->reason_code)
);

/*
 * rdev_join_ibss - records the wiphy, the netdev, the BSSID and the SSID
 * taken from @params.
 */
TRACE_EVENT(rdev_join_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ibss_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                /* one byte larger than the maximum SSID so a NUL always fits */
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, params->bssid);
                /*
                 * Zero the whole buffer, then copy ssid_len bytes, so the
                 * stored SSID is NUL-terminated for the %s below.
                 * NOTE(review): the copy length is not clamped here; this
                 * assumes callers guarantee
                 * ssid_len <= IEEE80211_MAX_SSID_LEN - confirm.
                 */
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, params->ssid, params->ssid_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", ssid: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid)
);

/*
 * rdev_join_ocb - records the wiphy and the netdev only.
 *
 * @setup is part of the prototype but is not referenced by any line of
 * this event.
 */
TRACE_EVENT(rdev_join_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct ocb_setup *setup),
        TP_ARGS(wiphy, netdev, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_set_wiphy_params - records the wiphy and the "changed" bitmask
 * passed by the caller.
 */
TRACE_EVENT(rdev_set_wiphy_params,
        TP_PROTO(struct wiphy *wiphy, u32 changed),
        TP_ARGS(wiphy, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", changed: %u",
                  WIPHY_PR_ARG, __entry->changed)
);

/*
 * rdev_get_tx_power - instance of the wiphy_wdev_evt event class
 * (the class itself is declared outside this section).
 * Tracepoint arguments: the wiphy and the wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_set_tx_power - records the wiphy, the wireless_dev, the TX power
 * setting type and the mbm value passed by the caller.
 */
TRACE_EVENT(rdev_set_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 enum nl80211_tx_power_setting type, int mbm),
        TP_ARGS(wiphy, wdev, type, mbm),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(enum nl80211_tx_power_setting, type)
                __field(int, mbm)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->type = type;
                __entry->mbm = mbm;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->type, __entry->mbm)
);

/*
 * rdev_return_int_int - records the wiphy and two integers: @func_ret
 * (printed as "function returns") and @func_fill (printed as
 * "function filled").
 */
TRACE_EVENT(rdev_return_int_int,
        TP_PROTO(struct wiphy *wiphy, int func_ret, int func_fill),
        TP_ARGS(wiphy, func_ret, func_fill),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, func_ret)
                __field(int, func_fill)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->func_ret = func_ret;
                __entry->func_fill = func_fill;
        ),
        TP_printk(WIPHY_PR_FMT ", function returns: %d, function filled: %d",
                  WIPHY_PR_ARG, __entry->func_ret, __entry->func_fill)
);

#ifdef CONFIG_NL80211_TESTMODE
/*
 * rdev_testmode_cmd - records the wiphy and the wdev only.
 *
 * The wiphy and wdev parts of the output are separated by ", " like in
 * every other wiphy+wdev event; without the separator the two fields run
 * together in the trace line.
 */
TRACE_EVENT(rdev_testmode_cmd,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/* rdev_testmode_dump - records and prints the wiphy only. */
TRACE_EVENT(rdev_testmode_dump,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);
#endif /* CONFIG_NL80211_TESTMODE */

/*
 * rdev_set_bitrate_mask - records the wiphy, the netdev and the peer MAC
 * address.  The 'mask' argument is part of the prototype but is neither
 * stored in the entry nor printed.
 */
TRACE_EVENT(rdev_set_bitrate_mask,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer, const struct cfg80211_bitrate_mask *mask),
        TP_ARGS(wiphy, netdev, peer, mask),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
);

/*
 * rdev_update_mgmt_frame_registrations - records the wiphy, the wdev and
 * the global_stypes / interface_stypes members of 'upd' as u16 values,
 * printed in hexadecimal.  'upd' is dereferenced unconditionally.
 */
TRACE_EVENT(rdev_update_mgmt_frame_registrations,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct mgmt_frame_regs *upd),
        TP_ARGS(wiphy, wdev, upd),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, global_stypes)
                __field(u16, interface_stypes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->global_stypes = upd->global_stypes;
                __entry->interface_stypes = upd->interface_stypes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  __entry->global_stypes, __entry->interface_stypes)
);

/*
 * rdev_return_int_tx_rx - records the wiphy, an int return value and two
 * u32 values named tx and rx.
 */
TRACE_EVENT(rdev_return_int_tx_rx,
        TP_PROTO(struct wiphy *wiphy, int ret, u32 tx, u32 rx),
        TP_ARGS(wiphy, ret, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, tx: %u, rx: %u",
                  WIPHY_PR_ARG, __entry->ret, __entry->tx, __entry->rx)
);

/*
 * rdev_return_void_tx_rx - records the wiphy and four u32 values: tx,
 * tx_max, rx and rx_max.  No return code is recorded.
 */
TRACE_EVENT(rdev_return_void_tx_rx,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 tx_max,
                 u32 rx, u32 rx_max),
        TP_ARGS(wiphy, tx, tx_max, rx, rx_max),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, tx)
                __field(u32, tx_max)
                __field(u32, rx)
                __field(u32, rx_max)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->tx = tx;
                __entry->tx_max = tx_max;
                __entry->rx = rx;
                __entry->rx_max = rx_max;
        ),
        TP_printk(WIPHY_PR_FMT ", tx: %u, tx_max: %u, rx: %u, rx_max: %u ",
                  WIPHY_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx,
                  __entry->rx_max)
);

/*
 * tx_rx_evt - event class recording the wiphy and two u32 values named tx
 * and rx.  Instantiated below by rdev_set_antenna.
 */
DECLARE_EVENT_CLASS(tx_rx_evt,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
        TP_ARGS(wiphy, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ",
                  WIPHY_PR_ARG, __entry->tx, __entry->rx)
);

/* rdev_set_antenna - instance of the tx_rx_evt class (wiphy, tx, rx). */
DEFINE_EVENT(tx_rx_evt, rdev_set_antenna,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
        TP_ARGS(wiphy, tx, rx)
);

/*
 * wiphy_netdev_id_evt - event class recording the wiphy, the netdev and a
 * u64 id, printed as an unsigned decimal.  Instantiated below by the
 * rdev_sched_scan_start and rdev_sched_scan_stop events.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_id_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->id = id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id)
);

/* rdev_sched_scan_start - instance of wiphy_netdev_id_evt (wiphy, netdev, id). */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/* rdev_sched_scan_stop - instance of wiphy_netdev_id_evt (wiphy, netdev, id). */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_stop,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/*
 * rdev_tdls_mgmt - records the wiphy, the netdev, the peer MAC address,
 * the action code, dialog token, status code, peer capability, the
 * initiator flag and a copy of the 'len' bytes at 'buf'.
 *
 * Only the first byte of the copied buffer is printed.  The dynamic array
 * may be empty (len == 0); in that case 0 is printed instead of reading
 * one byte past the zero-length array.
 */
TRACE_EVENT(rdev_tdls_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, u8 action_code, u8 dialog_token,
                 u16 status_code, u32 peer_capability,
                 bool initiator, const u8 *buf, size_t len),
        TP_ARGS(wiphy, netdev, peer, action_code, dialog_token, status_code,
                peer_capability, initiator, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, action_code)
                __field(u8, dialog_token)
                __field(u16, status_code)
                __field(u32, peer_capability)
                __field(bool, initiator)
                __dynamic_array(u8, buf, len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->action_code = action_code;
                __entry->dialog_token = dialog_token;
                __entry->status_code = status_code;
                __entry->peer_capability = peer_capability;
                __entry->initiator = initiator;
                memcpy(__get_dynamic_array(buf), buf, len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", action_code: %u, "
                  "dialog_token: %u, status_code: %u, peer_capability: %u "
                  "initiator: %s buf: %#.2x ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer),
                  __entry->action_code, __entry->dialog_token,
                  __entry->status_code, __entry->peer_capability,
                  BOOL_TO_STR(__entry->initiator),
                  /* guard against an empty buffer before reading byte 0 */
                  __get_dynamic_array_len(buf) ?
                  ((u8 *)__get_dynamic_array(buf))[0] : 0)
);

/*
 * rdev_dump_survey - records the wiphy, the netdev and an int index.  The
 * prototype parameter is named '_idx' while the stored field is 'idx'.
 */
TRACE_EVENT(rdev_dump_survey,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx),
        TP_ARGS(wiphy, netdev, _idx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
);

/*
 * rdev_return_int_survey_info - records the wiphy, an int return value and
 * a snapshot of 'info': its channel, the time counters (time, time_busy,
 * time_ext_busy, time_rx, time_tx, time_scan), the 'filled' value and the
 * noise value.
 *
 * 'info' is dereferenced unconditionally, whatever the value of 'ret'.
 */
TRACE_EVENT(rdev_return_int_survey_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct survey_info *info),
        TP_ARGS(wiphy, ret, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __field(int, ret)
                __field(u64, time)
                __field(u64, time_busy)
                __field(u64, time_ext_busy)
                __field(u64, time_rx)
                __field(u64, time_tx)
                __field(u64, time_scan)
                __field(u32, filled)
                __field(s8, noise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(info->channel);
                __entry->ret = ret;
                __entry->time = info->time;
                __entry->time_busy = info->time_busy;
                __entry->time_ext_busy = info->time_ext_busy;
                __entry->time_rx = info->time_rx;
                __entry->time_tx = info->time_tx;
                __entry->time_scan = info->time_scan;
                __entry->filled = info->filled;
                __entry->noise = info->noise;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d, " CHAN_PR_FMT
                  ", channel time: %llu, channel time busy: %llu, "
                  "channel time extension busy: %llu, channel time rx: %llu, "
                  "channel time tx: %llu, scan time: %llu, filled: %u, noise: %d",
                  WIPHY_PR_ARG, __entry->ret, CHAN_PR_ARG,
                  __entry->time, __entry->time_busy,
                  __entry->time_ext_busy, __entry->time_rx,
                  __entry->time_tx, __entry->time_scan,
                  __entry->filled, __entry->noise)
);

/*
 * rdev_tdls_oper - records the wiphy, the netdev, the peer MAC address and
 * the TDLS operation, which is printed as a signed decimal.
 */
TRACE_EVENT(rdev_tdls_oper,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, enum nl80211_tdls_operation oper),
        TP_ARGS(wiphy, netdev, peer, oper),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", oper: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper)
);

/*
 * rdev_pmksa - event class recording the wiphy, the netdev and the bssid
 * member of 'pmksa'.  No other member of 'pmksa' is stored.  Instantiated
 * below by rdev_set_pmksa and rdev_del_pmksa.
 */
DECLARE_EVENT_CLASS(rdev_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, pmksa->bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid))
);

/*
 * rdev_probe_client - records the wiphy, the netdev and the peer MAC
 * address.
 */
TRACE_EVENT(rdev_probe_client,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer),
        TP_ARGS(wiphy, netdev, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
);

/* rdev_set_pmksa - instance of the rdev_pmksa class (wiphy, netdev, pmksa). */
DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/* rdev_del_pmksa - instance of the rdev_pmksa class (wiphy, netdev, pmksa). */
DEFINE_EVENT(rdev_pmksa, rdev_del_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/*
 * rdev_remain_on_channel - records the wiphy, the wdev, the channel and an
 * unsigned duration (the unit is not established in this section).
 */
TRACE_EVENT(rdev_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct ieee80211_channel *chan,
                 unsigned int duration),
        TP_ARGS(wiphy, wdev, chan, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", duration: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG, __entry->duration)
);

/*
 * rdev_return_int_cookie - records the wiphy, an int return value and a
 * u64 cookie, printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_return_int_cookie,
        TP_PROTO(struct wiphy *wiphy, int ret, u64 cookie),
        TP_ARGS(wiphy, ret, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, cookie: %llu",
                  WIPHY_PR_ARG, __entry->ret, __entry->cookie)
);

/*
 * rdev_cancel_remain_on_channel - records the wiphy, the wdev and a u64
 * cookie.
 */
TRACE_EVENT(rdev_cancel_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_mgmt_tx - records the wiphy, the wdev and, from 'params', the
 * channel plus the offchan, wait, no_cck and dont_wait_for_ack members.
 * Nothing else from 'params' is stored.  Booleans are printed through
 * BOOL_TO_STR.
 */
TRACE_EVENT(rdev_mgmt_tx,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_mgmt_tx_params *params),
        TP_ARGS(wiphy, wdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(bool, offchan)
                __field(unsigned int, wait)
                __field(bool, no_cck)
                __field(bool, dont_wait_for_ack)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(params->chan);
                __entry->offchan = params->offchan;
                __entry->wait = params->wait;
                __entry->no_cck = params->no_cck;
                __entry->dont_wait_for_ack = params->dont_wait_for_ack;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", offchan: %s,"
                  " wait: %u, no cck: %s, dont wait for ack: %s",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG,
                  BOOL_TO_STR(__entry->offchan), __entry->wait,
                  BOOL_TO_STR(__entry->no_cck),
                  BOOL_TO_STR(__entry->dont_wait_for_ack))
);

/*
 * rdev_tx_control_port - records the wiphy, the netdev, the destination
 * MAC address, the protocol and the 'unencrypted' flag.  'buf' and 'len'
 * are part of the prototype but are not stored.
 *
 * The protocol is stored as __be16 and converted with be16_to_cpu() only
 * when the event is printed.
 */
TRACE_EVENT(rdev_tx_control_port,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
                 bool unencrypted),
        TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
                __field(__be16, proto)
                __field(bool, unencrypted)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
                __entry->proto = proto;
                __entry->unencrypted = unencrypted;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ","
                  " proto: 0x%x, unencrypted: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest),
                  be16_to_cpu(__entry->proto),
                  BOOL_TO_STR(__entry->unencrypted))
);

/*
 * rdev_set_noack_map - records the wiphy, the netdev and the u16
 * noack_map value, printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_set_noack_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 noack_map),
        TP_ARGS(wiphy, netdev, noack_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, noack_map)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->noack_map = noack_map;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
);

/*
 * rdev_get_channel - instance of the wiphy_wdev_evt class (declared
 * outside this section); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_return_chandef - records the wiphy, an int return value and a
 * channel definition.
 *
 * 'chandef' is only handed to CHAN_DEF_ASSIGN when ret == 0; for any other
 * return value a NULL pointer is passed instead, so the caller's chandef
 * is not read on failure.  (CHAN_DEF_ASSIGN is defined outside this
 * section -- presumably it tolerates NULL; confirm against the macro.)
 */
TRACE_EVENT(rdev_return_chandef,
        TP_PROTO(struct wiphy *wiphy, int ret,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, ret, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (ret == 0)
                        CHAN_DEF_ASSIGN(chandef);
                else
                        CHAN_DEF_ASSIGN((struct cfg80211_chan_def *)NULL);
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", ret: %d",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->ret)
);

/*
 * rdev_start_p2p_device - instance of the wiphy_wdev_evt class (declared
 * outside this section); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_start_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_stop_p2p_device - instance of the wiphy_wdev_evt class (declared
 * outside this section); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_start_nan - records the wiphy, the wdev and the master_pref and
 * bands members of 'conf', both stored as u8.  bands is printed in
 * hexadecimal.
 */
TRACE_EVENT(rdev_start_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf),
        TP_ARGS(wiphy, wdev, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands)
);

/*
 * rdev_nan_change_conf - records the wiphy, the wdev, the master_pref and
 * bands members of 'conf' (stored as u8) and the u32 'changes' value,
 * which is printed in hexadecimal without a "0x" prefix.
 */
TRACE_EVENT(rdev_nan_change_conf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf, u32 changes),
        TP_ARGS(wiphy, wdev, conf, changes),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
                __field(u32, changes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
                __entry->changes = changes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x, changes: %x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands, __entry->changes)
);

/*
 * rdev_stop_nan - instance of the wiphy_wdev_evt class (declared outside
 * this section); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_add_nan_func - records the wiphy, the wdev and the type and cookie
 * members of 'func'.  The type is stored as u8, the cookie as u64.
 */
TRACE_EVENT(rdev_add_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 const struct cfg80211_nan_func *func),
        TP_ARGS(wiphy, wdev, func),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, func_type)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->func_type = func->type;
                /*
                 * Terminate the statement explicitly like every other
                 * event does, rather than relying on how TP_fast_assign
                 * happens to expand its argument.
                 */
                __entry->cookie = func->cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
                  __entry->cookie)
);

/* rdev_del_nan_func - records the wiphy, the wdev and a u64 cookie. */
TRACE_EVENT(rdev_del_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_mac_acl - records the wiphy, the netdev and the acl_policy
 * member of 'params'.  No other member of 'params' is stored.
 *
 * acl_policy is stored as u32, so it is printed with %u to match the
 * field type.
 */
TRACE_EVENT(rdev_set_mac_acl,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_acl_data *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, acl_policy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->acl_policy = params->acl_policy;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy)
);

/*
 * rdev_update_ft_ies - records the wiphy, the netdev, the md member of
 * 'ftie' and a copy of the ftie->ie_len bytes at ftie->ie.
 *
 * The copied bytes are stored in the record but not printed; only md
 * appears in the formatted output.
 */
TRACE_EVENT(rdev_update_ft_ies,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_ft_ies_params *ftie),
        TP_ARGS(wiphy, netdev, ftie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, md)
                __dynamic_array(u8, ie, ftie->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->md = ftie->md;
                memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md)
);

/*
 * rdev_crit_proto_start - records the wiphy, the wdev, the protocol id and
 * a u16 duration.  The enum protocol id is stored in a u16 field named
 * 'proto' and printed in hexadecimal.
 */
TRACE_EVENT(rdev_crit_proto_start,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 enum nl80211_crit_proto_id protocol, u16 duration),
        TP_ARGS(wiphy, wdev, protocol, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, proto)
                __field(u16, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->proto = protocol;
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration)
);

/* rdev_crit_proto_stop - records and prints the wiphy and the wdev only. */
TRACE_EVENT(rdev_crit_proto_stop,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * rdev_channel_switch - records the wiphy, the netdev and, from 'params',
 * the channel definition, the radar_required and block_tx flags, the
 * count, and copies of the beacon and probe-response counter offset
 * arrays.
 *
 * The two offset arrays are stored in the record but are not part of the
 * formatted output.
 */
TRACE_EVENT(rdev_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_csa_settings *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(bool, radar_required)
                __field(bool, block_tx)
                __field(u8, count)
                __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
                __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&params->chandef);
                __entry->radar_required = params->radar_required;
                __entry->block_tx = params->block_tx;
                __entry->count = params->count;
                /* beacon offsets are copied unconditionally */
                memcpy(__get_dynamic_array(bcn_ofs),
                       params->counter_offsets_beacon,
                       params->n_counter_offsets_beacon * sizeof(u16));

                /* probe response offsets are optional */
                if (params->n_counter_offsets_presp)
                        memcpy(__get_dynamic_array(pres_ofs),
                               params->counter_offsets_presp,
                               params->n_counter_offsets_presp * sizeof(u16));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", block_tx: %d, count: %u, radar_required: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->block_tx, __entry->count, __entry->radar_required)
);

/*
 * rdev_set_qos_map - records the wiphy, the netdev and the QoS map via the
 * QOS_MAP_ENTRY / QOS_MAP_ASSIGN helpers (defined outside this section).
 * Only the num_des field is printed -- presumably one of the fields that
 * QOS_MAP_ENTRY declares; confirm against the macro.
 */
TRACE_EVENT(rdev_set_qos_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_qos_map *qos_map),
        TP_ARGS(wiphy, netdev, qos_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                QOS_MAP_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                QOS_MAP_ASSIGN(qos_map);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des)
);

/*
 * rdev_set_ap_chanwidth - records the wiphy, the netdev and the channel
 * definition.
 */
TRACE_EVENT(rdev_set_ap_chanwidth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_add_tx_ts - records the wiphy, the netdev, the peer MAC address,
 * the tsid, the user priority (printed as "UP") and the admitted time
 * (printed as "time").
 */
TRACE_EVENT(rdev_add_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time),
        TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
                __field(u8, user_prio)
                __field(u16, admitted_time)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
                __entry->user_prio = user_prio;
                __entry->admitted_time = admitted_time;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer),
                  __entry->tsid, __entry->user_prio, __entry->admitted_time)
);

/*
 * rdev_del_tx_ts - records the wiphy, the netdev, the peer MAC address and
 * the tsid.
 */
TRACE_EVENT(rdev_del_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer),
        TP_ARGS(wiphy, netdev, tsid, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid)
);

/*
 * rdev_tdls_channel_switch - records the wiphy, the netdev, the peer
 * address, the operating class and the channel definition.
 *
 * oper_class must be assigned explicitly: the entry storage is not
 * initialized by the field declaration, so a missing assignment makes the
 * "oper class" value in the output meaningless.
 */
TRACE_EVENT(rdev_tdls_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr, u8 oper_class,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, addr, oper_class, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u8, oper_class)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->oper_class = oper_class;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT
                  " oper class %d, " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr),
                  __entry->oper_class, CHAN_DEF_PR_ARG)
);

/*
 * rdev_tdls_cancel_channel_switch - records the wiphy, the netdev and the
 * peer address.
 */
TRACE_EVENT(rdev_tdls_cancel_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr),
        TP_ARGS(wiphy, netdev, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr))
);

/*
 * rdev_set_pmk - records the wiphy, the netdev, the 'aa' address from
 * pmk_conf, the PMK length, a copy of the PMK bytes and, when present, a
 * copy of the PMK-R0 name.
 *
 * pmk_conf->pmk_r0_name may be NULL.  The pmk_r0_name array is always
 * reserved with WLAN_PMK_NAME_LEN bytes; pmk_r0_name_len records whether
 * it holds a name (WLAN_PMK_NAME_LEN) or not (0), and the name is printed
 * only in the former case.
 */
TRACE_EVENT(rdev_set_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmk_conf *pmk_conf),

        TP_ARGS(wiphy, netdev, pmk_conf),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
                __field(u8, pmk_len)
                __field(u8, pmk_r0_name_len)
                __dynamic_array(u8, pmk, pmk_conf->pmk_len)
                __dynamic_array(u8, pmk_r0_name, WLAN_PMK_NAME_LEN)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, pmk_conf->aa);
                __entry->pmk_len = pmk_conf->pmk_len;
                __entry->pmk_r0_name_len =
                        pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0;
                memcpy(__get_dynamic_array(pmk), pmk_conf->pmk,
                       pmk_conf->pmk_len);
                /*
                 * Never hand a NULL source to memcpy(), not even with a
                 * zero length; clear the reserved bytes instead so they
                 * are not left uninitialized in the record.
                 */
                if (pmk_conf->pmk_r0_name)
                        memcpy(__get_dynamic_array(pmk_r0_name),
                               pmk_conf->pmk_r0_name, WLAN_PMK_NAME_LEN);
                else
                        memset(__get_dynamic_array(pmk_r0_name), 0,
                               WLAN_PMK_NAME_LEN);
        ),

        /* ", " after the MAC keeps it from running into "pmk_len=" */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT
                  ", pmk_len=%u, pmk: %s pmk_r0_name: %s", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, MAC_PR_ARG(aa), __entry->pmk_len,
                  __print_array(__get_dynamic_array(pmk),
                                __get_dynamic_array_len(pmk), 1),
                  __entry->pmk_r0_name_len ?
                  __print_array(__get_dynamic_array(pmk_r0_name),
                                __get_dynamic_array_len(pmk_r0_name), 1) : "")
);

/* rdev_del_pmk - records the wiphy, the netdev and the 'aa' address. */
TRACE_EVENT(rdev_del_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *aa),

        TP_ARGS(wiphy, netdev, aa),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, aa);
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa))
);

/*
 * rdev_external_auth - records the wiphy, the netdev and, from 'params',
 * the BSSID, the SSID and the status.
 *
 * The SSID buffer is one byte larger than IEEE80211_MAX_SSID_LEN and is
 * zeroed before the copy, so the stored SSID stays NUL-terminated for "%s"
 * as long as params->ssid.ssid_len does not exceed IEEE80211_MAX_SSID_LEN
 * (not checked here -- assumed to be validated by the caller).
 *
 * The BSSID is printed through MAC_PR_ARG() so that the argument always
 * matches MAC_PR_FMT, as in every other event that prints a MAC address.
 */
TRACE_EVENT(rdev_external_auth,
            TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                     struct cfg80211_external_auth_params *params),
            TP_ARGS(wiphy, netdev, params),
            TP_STRUCT__entry(WIPHY_ENTRY
                             NETDEV_ENTRY
                             MAC_ENTRY(bssid)
                             __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1)
                             __field(u16, status)
            ),
            TP_fast_assign(WIPHY_ASSIGN;
                           NETDEV_ASSIGN;
                           MAC_ASSIGN(bssid, params->bssid);
                           memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                           memcpy(__entry->ssid, params->ssid.ssid,
                                  params->ssid.ssid_len);
                           __entry->status = params->status;
            ),
            TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
                      ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
                      MAC_PR_ARG(bssid), __entry->ssid, __entry->status)
);

/*
 * rdev_start_radar_detection: records the wiphy, netdev, channel
 * definition and the @cac_time_ms value supplied by the caller.
 */
TRACE_EVENT(rdev_start_radar_detection,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 u32 cac_time_ms),

        TP_ARGS(wiphy, netdev, chandef, cac_time_ms),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(u32, cac_time_ms)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->cac_time_ms = cac_time_ms;
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", cac_time_ms=%u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->cac_time_ms)
);

/*
 * rdev_set_mcast_rate: copies the whole per-band @mcast_rate table
 * (NUM_NL80211_BANDS ints) into the record; only the 2.4, 5, 6 and 60 GHz
 * slots are printed.
 */
TRACE_EVENT(rdev_set_mcast_rate,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 int *mcast_rate),

        TP_ARGS(wiphy, netdev, mcast_rate),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(int, mcast_rate, NUM_NL80211_BANDS)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* entry array is int[NUM_NL80211_BANDS], so this is the
                 * same byte count as sizeof(int) * NUM_NL80211_BANDS
                 */
                memcpy(__entry->mcast_rate, mcast_rate,
                       sizeof(__entry->mcast_rate));
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
                  "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->mcast_rate[NL80211_BAND_2GHZ],
                  __entry->mcast_rate[NL80211_BAND_5GHZ],
                  __entry->mcast_rate[NL80211_BAND_6GHZ],
                  __entry->mcast_rate[NL80211_BAND_60GHZ])
);

/*
 * rdev_set_coalesce: records the wiphy and the number of coalesce rules.
 * A NULL @coalesce is recorded as n_rules == 0.
 */
TRACE_EVENT(rdev_set_coalesce,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_coalesce *coalesce),

        TP_ARGS(wiphy, coalesce),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, n_rules)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->n_rules = coalesce ? coalesce->n_rules : 0;
        ),

        TP_printk(WIPHY_PR_FMT ", n_rules=%d",
                  WIPHY_PR_ARG, __entry->n_rules)
);

/* rdev_abort_scan: wiphy_wdev_evt instance taking a wiphy and a wdev. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_set_multicast_to_unicast: records the wiphy, netdev and the
 * @enabled flag, printed as "unicast: <bool>".
 */
TRACE_EVENT(rdev_set_multicast_to_unicast,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 const bool enabled),

        TP_ARGS(wiphy, netdev, enabled),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  BOOL_TO_STR(__entry->enabled))
);

/* rdev_get_txq_stats: wiphy_wdev_evt instance taking a wiphy and a wdev. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_get_ftm_responder_stats: records the wiphy, netdev and the counters
 * read from @ftm_stats.  Only the wiphy and the counters are printed.
 */
TRACE_EVENT(rdev_get_ftm_responder_stats,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ftm_responder_stats *ftm_stats),

        TP_ARGS(wiphy, netdev, ftm_stats),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, timestamp)
                __field(u32, success_num)
                __field(u32, partial_num)
                __field(u32, failed_num)
                __field(u32, asap_num)
                __field(u32, non_asap_num)
                __field(u64, duration)
                __field(u32, unknown_triggers)
                __field(u32, reschedule)
                __field(u32, out_of_window)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /*
                 * No value for this field is taken from @ftm_stats here.
                 * Store 0 instead of leaving whatever bytes were already in
                 * the record.  NOTE(review): confirm whether a real
                 * timestamp source was intended.
                 */
                __entry->timestamp = 0;
                __entry->success_num = ftm_stats->success_num;
                __entry->partial_num = ftm_stats->partial_num;
                __entry->failed_num = ftm_stats->failed_num;
                __entry->asap_num = ftm_stats->asap_num;
                __entry->non_asap_num = ftm_stats->non_asap_num;
                __entry->duration = ftm_stats->total_duration_ms;
                __entry->unknown_triggers = ftm_stats->unknown_triggers_num;
                __entry->reschedule = ftm_stats->reschedule_requests_num;
                __entry->out_of_window = ftm_stats->out_of_window_triggers_num;
        ),

        /* ", " separates the wiphy from the text, as in the other events */
        TP_printk(WIPHY_PR_FMT ", "
                "Ftm responder stats: success %u, partial %u, "
                "failed %u, asap %u, non asap %u, total duration %llu, unknown "
                "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG,
                __entry->success_num, __entry->partial_num, __entry->failed_num,
                __entry->asap_num, __entry->non_asap_num, __entry->duration,
                __entry->unknown_triggers, __entry->reschedule,
                __entry->out_of_window)
);

/* rdev_start_pmsr: wiphy_wdev_cookie_evt instance (wiphy, wdev, cookie). */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/* rdev_abort_pmsr: wiphy_wdev_cookie_evt instance (wiphy, wdev, cookie). */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/*************************************************************
 *            cfg80211 exported functions traces             *
 *************************************************************/

/* cfg80211_return_bool: records a boolean return value and prints it. */
TRACE_EVENT(cfg80211_return_bool,
        TP_PROTO(bool ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(bool, ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("returned %s",
                  BOOL_TO_STR(__entry->ret))
);

/*
 * cfg80211_netdev_mac_evt: event class that records a net device and one
 * MAC address (@macaddr), printed as "<netdev>, mac: <addr>".
 */
DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),

        TP_ARGS(netdev, macaddr),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(macaddr)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(macaddr, macaddr);
        ),

        TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(macaddr))
);

/* cfg80211_notify_new_peer_candidate: netdev + MAC address event. */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/* netdev_evt_only: event class that records and prints only the netdev. */
DECLARE_EVENT_CLASS(netdev_evt_only,
        TP_PROTO(struct net_device *netdev),

        TP_ARGS(netdev),

        TP_STRUCT__entry(
                NETDEV_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
        ),

        TP_printk(NETDEV_PR_FMT, NETDEV_PR_ARG)
);

/* cfg80211_send_rx_auth: netdev-only event. */
DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
        TP_PROTO(
                struct net_device *netdev
        ),
        TP_ARGS(netdev)
);

/*
 * cfg80211_send_rx_assoc: records the netdev plus the BSSID and channel
 * read from @bss.
 */
TRACE_EVENT(cfg80211_send_rx_assoc,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_bss *bss),

        TP_ARGS(netdev, bss),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, bss->bssid);
                CHAN_ASSIGN(bss->channel);
        ),

        TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", " CHAN_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG)
);

/*
 * netdev_frame_event: event class that stores a copy of the @len-byte
 * frame in @buf together with the netdev.  The output shows the first two
 * bytes of the stored frame, read as a little-endian 16-bit value.
 */
DECLARE_EVENT_CLASS(netdev_frame_event,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),

        TP_ARGS(netdev, buf, len),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                __dynamic_array(u8, frame, len)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                memcpy(__get_dynamic_array(frame), buf, len);
        ),

        TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
                  NETDEV_PR_ARG,
                  le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
);

/* cfg80211_rx_unprot_mlme_mgmt: netdev + frame copy event. */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_unprot_mlme_mgmt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),
        TP_ARGS(netdev, buf, len)
);

/* cfg80211_rx_mlme_mgmt: netdev + frame copy event. */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),
        TP_ARGS(netdev, buf, len)
);

/*
 * cfg80211_tx_mlme_mgmt: netdev + frame copy event.
 *
 * Its prototype, record layout, assignment and output are identical to
 * the netdev_frame_event class declared above, so it is defined as an
 * instance of that class instead of a stand-alone TRACE_EVENT that
 * duplicates the generated code.
 */
DEFINE_EVENT(netdev_frame_event, cfg80211_tx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
        TP_ARGS(netdev, buf, len)
);

/*
 * netdev_mac_evt: event class that records a net device and one MAC
 * address (@mac), printed as "<netdev>, mac: <addr>".
 */
DECLARE_EVENT_CLASS(netdev_mac_evt,
        TP_PROTO(struct net_device *netdev, const u8 *mac),
        TP_ARGS(netdev, mac),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                /* terminate the statement explicitly, as every other
                 * MAC_ASSIGN() user in this file does
                 */
                MAC_ASSIGN(mac, mac);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(mac))
);

/* cfg80211_send_auth_timeout: netdev + MAC address event. */
DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
        TP_PROTO(struct net_device *netdev,
                 const u8 *mac),
        TP_ARGS(netdev, mac)
);

/* cfg80211_send_assoc_timeout: netdev + MAC address event. */
DEFINE_EVENT(netdev_mac_evt, cfg80211_send_assoc_timeout,
        TP_PROTO(struct net_device *netdev,
                 const u8 *mac),
        TP_ARGS(netdev, mac)
);

/*
 * cfg80211_michael_mic_failure: records the netdev, the address @addr, the
 * key type and id, and a 6-byte copy of @tsc.
 *
 * @tsc may be NULL.  The stored copy is always printed, so it is zeroed in
 * that case instead of being left with whatever was in the record.
 */
TRACE_EVENT(cfg80211_michael_mic_failure,
        TP_PROTO(struct net_device *netdev, const u8 *addr,
                 enum nl80211_key_type key_type, int key_id, const u8 *tsc),
        TP_ARGS(netdev, addr, key_type, key_id, tsc),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(enum nl80211_key_type, key_type)
                __field(int, key_id)
                __array(u8, tsc, 6)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->key_type = key_type;
                __entry->key_id = key_id;
                if (tsc)
                        memcpy(__entry->tsc, tsc, sizeof(__entry->tsc));
                else
                        memset(__entry->tsc, 0, sizeof(__entry->tsc));
        ),
        TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", key type: %d, key id: %d, tsc: %pm",
                  NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->key_type,
                  __entry->key_id, __entry->tsc)
);

/*
 * cfg80211_ready_on_channel: records the wdev, the cookie, the channel and
 * the duration passed by the caller.
 */
TRACE_EVENT(cfg80211_ready_on_channel,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan,
                 unsigned int duration),

        TP_ARGS(wdev, cookie, chan, duration),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
                __entry->duration = duration;
        ),

        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT ", duration: %u",
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG,
                  __entry->duration)
);

/*
 * cfg80211_ready_on_channel_expired: records the wdev, the cookie and the
 * channel.
 */
TRACE_EVENT(cfg80211_ready_on_channel_expired,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan),

        TP_ARGS(wdev, cookie, chan),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
        ),

        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);

/*
 * cfg80211_tx_mgmt_expired: records the wdev, the cookie and the channel.
 * Same layout and output as cfg80211_ready_on_channel_expired.
 */
TRACE_EVENT(cfg80211_tx_mgmt_expired,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan),

        TP_ARGS(wdev, cookie, chan),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
        ),

        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);

/*
 * cfg80211_new_sta: records the netdev, the station address and the
 * SINFO_ENTRY fields.  Only the netdev and the address are printed.
 */
TRACE_EVENT(cfg80211_new_sta,
        TP_PROTO(struct net_device *netdev,
                 const u8 *mac_addr,
                 struct station_info *sinfo),

        TP_ARGS(netdev, mac_addr, sinfo),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                SINFO_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                SINFO_ASSIGN;
        ),

        TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(mac_addr))
);

/* cfg80211_del_sta: netdev + MAC address event. */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_rx_mgmt: records the wdev, the frequency (formatted through
 * KHZ_F/PR_KHZ) and the signal strength value @sig_dbm.
 */
TRACE_EVENT(cfg80211_rx_mgmt,
        TP_PROTO(struct wireless_dev *wdev,
                 int freq,
                 int sig_dbm),

        TP_ARGS(wdev, freq, sig_dbm),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->freq = freq;
                __entry->sig_dbm = sig_dbm;
        ),

        TP_printk(WDEV_PR_FMT ", freq: " KHZ_F ", sig dbm: %d",
                  WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/* cfg80211_mgmt_tx_status: records the wdev, the cookie and the ack flag. */
TRACE_EVENT(cfg80211_mgmt_tx_status,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 bool ack),

        TP_ARGS(wdev, cookie, ack),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                __entry->ack = ack;
        ),

        TP_printk(WDEV_PR_FMT ", cookie: %llu, ack: %s",
                  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_control_port_tx_status: records the wdev, the cookie and the
 * ack flag.  Same layout and output as cfg80211_mgmt_tx_status.
 */
TRACE_EVENT(cfg80211_control_port_tx_status,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 bool ack),

        TP_ARGS(wdev, cookie, ack),

        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),

        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                __entry->ack = ack;
        ),

        TP_printk(WDEV_PR_FMT ", cookie: %llu, ack: %s",
                  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_rx_control_port: records the netdev, the skb length, the
 * Ethernet source address of the skb, its protocol (converted from
 * big-endian) and the @unencrypted flag.
 */
TRACE_EVENT(cfg80211_rx_control_port,
        TP_PROTO(struct net_device *netdev,
                 struct sk_buff *skb,
                 bool unencrypted),

        TP_ARGS(netdev, skb, unencrypted),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, len)
                MAC_ENTRY(from)
                __field(u16, proto)
                __field(bool, unencrypted)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->len = skb->len;
                MAC_ASSIGN(from, eth_hdr(skb)->h_source);
                __entry->proto = be16_to_cpu(skb->protocol);
                __entry->unencrypted = unencrypted;
        ),

        TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s",
                  NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from),
                  __entry->proto, BOOL_TO_STR(__entry->unencrypted))
);

/*
 * cfg80211_cqm_rssi_notify: records the netdev, the RSSI threshold event
 * and the RSSI level; both values are printed as integers.
 */
TRACE_EVENT(cfg80211_cqm_rssi_notify,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_cqm_rssi_threshold_event rssi_event,
                 s32 rssi_level),

        TP_ARGS(netdev, rssi_event, rssi_level),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
                __field(s32, rssi_level)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->rssi_event = rssi_event;
                __entry->rssi_level = rssi_level;
        ),

        TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
                  NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
);

/*
 * cfg80211_reg_can_beacon: records the wiphy, the channel definition, the
 * interface type and the @check_no_ir flag.
 */
TRACE_EVENT(cfg80211_reg_can_beacon,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef,
                 enum nl80211_iftype iftype,
                 bool check_no_ir),

        TP_ARGS(wiphy, chandef, iftype, check_no_ir),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
                __field(enum nl80211_iftype, iftype)
                __field(bool, check_no_ir)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->iftype = iftype;
                __entry->check_no_ir = check_no_ir;
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
                  BOOL_TO_STR(__entry->check_no_ir))
);

/*
 * cfg80211_chandef_dfs_required: records the wiphy and the channel
 * definition.
 */
TRACE_EVENT(cfg80211_chandef_dfs_required,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(wiphy, chandef),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * cfg80211_ch_switch_notify: records the netdev and the channel
 * definition.
 */
TRACE_EVENT(cfg80211_ch_switch_notify,
        TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef),

        TP_ARGS(netdev, chandef),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * cfg80211_ch_switch_started_notify: records the netdev and the channel
 * definition.  Same layout and output as cfg80211_ch_switch_notify.
 */
TRACE_EVENT(cfg80211_ch_switch_started_notify,
        TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef),

        TP_ARGS(netdev, chandef),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
);

/* cfg80211_radar_event: records the wiphy and the channel definition. */
TRACE_EVENT(cfg80211_radar_event,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(wiphy, chandef),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * cfg80211_cac_event: records the netdev and the radar event value, which
 * is printed as an integer.
 */
TRACE_EVENT(cfg80211_cac_event,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_radar_event evt),

        TP_ARGS(netdev, evt),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_radar_event, evt)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->evt = evt;
        ),

        TP_printk(NETDEV_PR_FMT ",  event: %d",
                  NETDEV_PR_ARG, __entry->evt)
);

/*
 * cfg80211_rx_evt: event class that records a net device and one MAC
 * address (@addr), printed as "<netdev>, <addr>".
 */
DECLARE_EVENT_CLASS(cfg80211_rx_evt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr),

        TP_ARGS(netdev, addr),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
        ),

        TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(addr))
);

/* cfg80211_rx_spurious_frame: netdev + MAC address event. */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr),
        TP_ARGS(netdev, addr)
);

/* cfg80211_rx_unexpected_4addr_frame: netdev + MAC address event. */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr),
        TP_ARGS(netdev, addr)
);

/* cfg80211_ibss_joined: records the netdev, the BSSID and the channel. */
TRACE_EVENT(cfg80211_ibss_joined,
        TP_PROTO(struct net_device *netdev,
                 const u8 *bssid,
                 struct ieee80211_channel *channel),

        TP_ARGS(netdev, bssid, channel),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, bssid);
                CHAN_ASSIGN(channel);
        ),

        TP_printk(NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", " CHAN_PR_FMT,
                  NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG)
);

/*
 * cfg80211_probe_status: records the netdev, the address @addr, the cookie
 * and whether the probe was acked.
 */
TRACE_EVENT(cfg80211_probe_status,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr,
                 u64 cookie,
                 bool acked),

        TP_ARGS(netdev, addr, cookie, acked),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u64, cookie)
                __field(bool, acked)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->cookie = cookie;
                __entry->acked = acked;
        ),

        TP_printk(NETDEV_PR_FMT " addr:" MAC_PR_FMT ", cookie: %llu, acked: %s",
                  NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->cookie,
                  BOOL_TO_STR(__entry->acked))
);

/*
 * cfg80211_cqm_pktloss_notify: records the netdev, the peer address and
 * the @num_packets count, printed as the number of lost packets.
 */
TRACE_EVENT(cfg80211_cqm_pktloss_notify,
        TP_PROTO(struct net_device *netdev,
                 const u8 *peer,
                 u32 num_packets),

        TP_ARGS(netdev, peer, num_packets),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u32, num_packets)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->num_packets = num_packets;
        ),

        TP_printk(NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", num of lost packets: %u",
                  NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->num_packets)
);

/* cfg80211_gtk_rekey_notify: netdev + MAC address event. */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_pmksa_candidate_notify: records the netdev, the candidate
 * index, the BSSID and the @preauth flag.
 */
TRACE_EVENT(cfg80211_pmksa_candidate_notify,
        TP_PROTO(struct net_device *netdev,
                 int index,
                 const u8 *bssid,
                 bool preauth),

        TP_ARGS(netdev, index, bssid, preauth),

        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, index)
                MAC_ENTRY(bssid)
                __field(bool, preauth)
        ),

        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->index = index;
                MAC_ASSIGN(bssid, bssid);
                __entry->preauth = preauth;
        ),

        TP_printk(NETDEV_PR_FMT ", index:%d, bssid: " MAC_PR_FMT ", pre auth: %s",
                  NETDEV_PR_ARG, __entry->index, MAC_PR_ARG(bssid),
                  BOOL_TO_STR(__entry->preauth))
);

/*
 * cfg80211_report_obss_beacon: records the wiphy, the frequency and the
 * signal value.  @frame and @len are part of the prototype but are not
 * stored in the record.
 */
TRACE_EVENT(cfg80211_report_obss_beacon,
        TP_PROTO(struct wiphy *wiphy,
                 const u8 *frame,
                 size_t len,
                 int freq,
                 int sig_dbm),

        TP_ARGS(wiphy, frame, len, freq, sig_dbm),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->freq = freq;
                __entry->sig_dbm = sig_dbm;
        ),

        TP_printk(WIPHY_PR_FMT ", freq: " KHZ_F ", sig_dbm: %d",
                  WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/*
 * cfg80211_tdls_oper_request: records the wiphy, netdev, peer address,
 * the TDLS operation (printed as an integer) and the reason code.
 */
TRACE_EVENT(cfg80211_tdls_oper_request,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 const u8 *peer,
                 enum nl80211_tdls_operation oper,
                 u16 reason_code),

        TP_ARGS(wiphy, netdev, peer, oper, reason_code),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
                __field(u16, reason_code)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
                __entry->reason_code = reason_code;
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", oper: %d, reason_code %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper,
                  __entry->reason_code)
);

/*
 * cfg80211_scan_done: records details of a finished scan taken from
 * @request and @info; the output shows the aborted flag, the scan start
 * TSF and the TSF BSSID.
 *
 * Either pointer may be NULL (the code checks both), so every fixed-size
 * field gets a zero default first.  Otherwise a NULL @request or @info
 * would leave unwritten bytes in the record, including fields that are
 * printed.
 */
TRACE_EVENT(cfg80211_scan_done,
        TP_PROTO(struct cfg80211_scan_request *request,
                 struct cfg80211_scan_info *info),
        TP_ARGS(request, info),
        TP_STRUCT__entry(
                __field(u32, n_channels)
                __dynamic_array(u8, ie, request ? request->ie_len : 0)
                __array(u32, rates, NUM_NL80211_BANDS)
                __field(u32, wdev_id)
                MAC_ENTRY(wiphy_mac)
                __field(bool, no_cck)
                __field(bool, aborted)
                __field(u64, scan_start_tsf)
                MAC_ENTRY(tsf_bssid)
        ),
        TP_fast_assign(
                /* defaults for the NULL request/info cases */
                __entry->n_channels = 0;
                memset(__entry->rates, 0, sizeof(__entry->rates));
                __entry->wdev_id = 0;
                memset(__entry->wiphy_mac, 0, sizeof(__entry->wiphy_mac));
                __entry->no_cck = false;
                __entry->aborted = false;
                __entry->scan_start_tsf = 0;
                memset(__entry->tsf_bssid, 0, sizeof(__entry->tsf_bssid));

                if (request) {
                        /* this field was declared but never filled in */
                        __entry->n_channels = request->n_channels;
                        if (request->ie && request->ie_len)
                                memcpy(__get_dynamic_array(ie), request->ie,
                                       request->ie_len);
                        /*
                         * Copy the whole u32-per-band table.  The previous
                         * length, NUM_NL80211_BANDS, was a byte count and
                         * only covered a quarter of the array.
                         */
                        memcpy(__entry->rates, request->rates,
                               sizeof(__entry->rates));
                        __entry->wdev_id = request->wdev ?
                                        request->wdev->identifier : 0;
                        if (request->wiphy)
                                MAC_ASSIGN(wiphy_mac,
                                           request->wiphy->perm_addr);
                        __entry->no_cck = request->no_cck;
                }
                if (info) {
                        __entry->aborted = info->aborted;
                        __entry->scan_start_tsf = info->scan_start_tsf;
                        MAC_ASSIGN(tsf_bssid, info->tsf_bssid);
                }
        ),
        TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT,
                  BOOL_TO_STR(__entry->aborted),
                  (unsigned long long)__entry->scan_start_tsf,
                  MAC_PR_ARG(tsf_bssid))
);

/* wiphy_id_evt: event class that records a wiphy and a 64-bit id. */
DECLARE_EVENT_CLASS(wiphy_id_evt,
        TP_PROTO(struct wiphy *wiphy,
                 u64 id),

        TP_ARGS(wiphy, id),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u64, id)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->id = id;
        ),

        TP_printk(WIPHY_PR_FMT ", id: %llu",
                  WIPHY_PR_ARG, __entry->id)
);

/* cfg80211_sched_scan_stopped: wiphy + id event. */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_stopped,
        TP_PROTO(struct wiphy *wiphy,
                 u64 id),
        TP_ARGS(wiphy, id)
);

/* cfg80211_sched_scan_results: wiphy + id event. */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_results,
        TP_PROTO(struct wiphy *wiphy,
                 u64 id),
        TP_ARGS(wiphy, id)
);

/*
 * cfg80211_get_bss: records the wiphy, channel, BSSID, a copy of the
 * @ssid_len-byte SSID, the BSS type and the privacy setting.
 *
 * The "buf:" value in the output is the first byte of the stored SSID.
 * When the SSID is empty the dynamic array has no bytes, so 0 is printed
 * instead of reading past the end of the array.
 */
TRACE_EVENT(cfg80211_get_bss,
        TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
                 const u8 *bssid, const u8 *ssid, size_t ssid_len,
                 enum ieee80211_bss_type bss_type,
                 enum ieee80211_privacy privacy),
        TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                MAC_ENTRY(bssid)
                __dynamic_array(u8, ssid, ssid_len)
                __field(enum ieee80211_bss_type, bss_type)
                __field(enum ieee80211_privacy, privacy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(channel);
                MAC_ASSIGN(bssid, bssid);
                /* nothing to copy for a NULL or empty SSID */
                if (ssid && ssid_len)
                        memcpy(__get_dynamic_array(ssid), ssid, ssid_len);
                __entry->bss_type = bss_type;
                __entry->privacy = privacy;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", " MAC_PR_FMT
                  ", buf: %#.2x, bss_type: %d, privacy: %d",
                  WIPHY_PR_ARG, CHAN_PR_ARG, MAC_PR_ARG(bssid),
                  __get_dynamic_array_len(ssid) ?
                  ((u8 *)__get_dynamic_array(ssid))[0] : 0,
                  __entry->bss_type, __entry->privacy)
);

/*
 * cfg80211_inform_bss_frame: records the wiphy, the channel, scan width,
 * signal, boottime and parent TSF/BSSID values from @data, plus a copy of
 * the @len-byte management frame when @mgmt is non-NULL.
 */
TRACE_EVENT(cfg80211_inform_bss_frame,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_inform_bss *data,
                 struct ieee80211_mgmt *mgmt,
                 size_t len),

        TP_ARGS(wiphy, data, mgmt, len),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __field(enum nl80211_bss_scan_width, scan_width)
                __dynamic_array(u8, mgmt, len)
                __field(s32, signal)
                __field(u64, ts_boottime)
                __field(u64, parent_tsf)
                MAC_ENTRY(parent_bssid)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(data->chan);
                __entry->scan_width = data->scan_width;
                if (mgmt)
                        memcpy(__get_dynamic_array(mgmt), mgmt, len);
                __entry->signal = data->signal;
                __entry->ts_boottime = data->boottime_ns;
                __entry->parent_tsf = data->parent_tsf;
                MAC_ASSIGN(parent_bssid, data->parent_bssid);
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
                  "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: "
                  MAC_PR_FMT,
                  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
                  __entry->signal, (unsigned long long)__entry->ts_boottime,
                  (unsigned long long)__entry->parent_tsf,
                  MAC_PR_ARG(parent_bssid))
);

/*
 * cfg80211_bss_evt: event class that records the BSSID and channel read
 * from @pub.
 */
DECLARE_EVENT_CLASS(cfg80211_bss_evt,
        TP_PROTO(struct cfg80211_bss *pub),

        TP_ARGS(pub),

        TP_STRUCT__entry(
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),

        TP_fast_assign(
                MAC_ASSIGN(bssid, pub->bssid);
                CHAN_ASSIGN(pub->channel);
        ),

        TP_printk(MAC_PR_FMT ", " CHAN_PR_FMT,
                  MAC_PR_ARG(bssid), CHAN_PR_ARG)
);

/* cfg80211_return_bss: BSSID + channel event for the BSS in @pub. */
DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss,
        TP_PROTO(
                struct cfg80211_bss *pub
        ),
        TP_ARGS(pub)
);

/*
 * cfg80211_return_uint: records an unsigned int return value.
 *
 * The value is printed with %u to match its unsigned type; %d would show
 * values above INT_MAX as negative numbers.
 */
TRACE_EVENT(cfg80211_return_uint,
        TP_PROTO(unsigned int ret),
        TP_ARGS(ret),
        TP_STRUCT__entry(
                __field(unsigned int, ret)
        ),
        TP_fast_assign(
                __entry->ret = ret;
        ),
        TP_printk("ret: %u", __entry->ret)
);

/* cfg80211_return_u32: records a u32 return value and prints it. */
TRACE_EVENT(cfg80211_return_u32,
        TP_PROTO(u32 ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(u32, ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("ret: %u",
                  __entry->ret)
);

/*
 * cfg80211_report_wowlan_wakeup: records the wiphy, the wdev and the
 * wakeup reason flags, pattern index, packet length and packet bytes from
 * @wakeup.
 *
 * A NULL @wakeup is recorded as non_wireless = true with every other
 * field cleared.  Only the wiphy and wdev are printed.
 */
TRACE_EVENT(cfg80211_report_wowlan_wakeup,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 struct cfg80211_wowlan_wakeup *wakeup),

        TP_ARGS(wiphy, wdev, wakeup),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(bool, non_wireless)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(s32, pattern_idx)
                __field(u32, packet_len)
                __dynamic_array(u8, packet,
                                wakeup ? wakeup->packet_present_len : 0)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                if (wakeup) {
                        __entry->non_wireless = false;
                        __entry->disconnect = wakeup->disconnect;
                        __entry->magic_pkt = wakeup->magic_pkt;
                        __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure;
                        __entry->eap_identity_req = wakeup->eap_identity_req;
                        __entry->four_way_handshake = wakeup->four_way_handshake;
                        __entry->rfkill_release = wakeup->rfkill_release;
                        __entry->pattern_idx = wakeup->pattern_idx;
                        __entry->packet_len = wakeup->packet_len;
                        /* copy only the bytes that are actually present */
                        if (wakeup->packet && wakeup->packet_present_len)
                                memcpy(__get_dynamic_array(packet),
                                       wakeup->packet,
                                       wakeup->packet_present_len);
                } else {
                        __entry->non_wireless = true;
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        __entry->pattern_idx = 0;
                        __entry->packet_len = 0;
                }
        ),

        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * Records a fast-transition event: the target AP address plus copies of
 * the IE and RIC IE buffers.  Space for each buffer is reserved from its
 * length field; the bytes are only copied when the corresponding pointer
 * is non-NULL.  Only wiphy, netdev and target_ap are printed.
 */
TRACE_EVENT(cfg80211_ft_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ft_event_params *ft_event),
        TP_ARGS(wiphy, netdev, ft_event),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __dynamic_array(u8, ies, ft_event->ies_len)
                MAC_ENTRY(target_ap)
                __dynamic_array(u8, ric_ies, ft_event->ric_ies_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                if (ft_event->ies)
                        memcpy(__get_dynamic_array(ies), ft_event->ies,
                               ft_event->ies_len);
                MAC_ASSIGN(target_ap, ft_event->target_ap);
                if (ft_event->ric_ies)
                        memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies,
                               ft_event->ric_ies_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(target_ap))
);

/* Records and prints only the wiphy and wdev the event refers to. */
TRACE_EVENT(cfg80211_stop_iface,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * Records a peer-measurement report: wiphy, wdev, the request cookie and
 * the address passed in @addr.  The cookie is a u64 and is cast to
 * unsigned long long for printing, so the matching specifier is %llu.
 */
TRACE_EVENT(cfg80211_pmsr_report,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie, const u8 *addr),
        TP_ARGS(wiphy, wdev, cookie, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu, " MAC_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie,
                  MAC_PR_ARG(addr))
);

/*
 * Records completion of a peer-measurement request: wiphy, wdev and the
 * request cookie.  The cookie is a u64 and is cast to unsigned long long
 * for printing, so the matching specifier is %llu.
 */
TRACE_EVENT(cfg80211_pmsr_complete,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/*
 * Records an OWE info update handed to the driver: peer address, status
 * and a copy of the ie_len bytes at owe_info->ie.  The IE bytes are stored
 * in the record but not printed.
 */
TRACE_EVENT(rdev_update_owe_info,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_owe_info *owe_info),
        TP_ARGS(wiphy, netdev, owe_info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u16, status)
                __dynamic_array(u8, ie, owe_info->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, owe_info->peer);
                __entry->status = owe_info->status;
                memcpy(__get_dynamic_array(ie), owe_info->ie,
                       owe_info->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT
                  " status %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer),
                  __entry->status)
);

/*
 * Records an OWE info update event: peer address and a copy of the ie_len
 * bytes at owe_info->ie.  The IE bytes are stored in the record but not
 * printed.
 */
TRACE_EVENT(cfg80211_update_owe_info_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_owe_info *owe_info),
        TP_ARGS(wiphy, netdev, owe_info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __dynamic_array(u8, ie, owe_info->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, owe_info->peer);
                memcpy(__get_dynamic_array(ie), owe_info->ie,
                       owe_info->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
);

/*
 * Records a mesh link probe request.  Only the destination address is
 * stored and printed; @buf and @len are accepted but not recorded.
 */
TRACE_EVENT(rdev_probe_mesh_link,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *dest, const u8 *buf, size_t len),
        TP_ARGS(wiphy, netdev, dest, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest))
);

/*
 * Records a TID configuration request.  Only the peer address from
 * tid_conf->peer is stored and printed; no other part of @tid_conf is
 * recorded.
 */
TRACE_EVENT(rdev_set_tid_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_tid_config *tid_conf),
        TP_ARGS(wiphy, netdev, tid_conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, tid_conf->peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
);

/*
 * Records a TID configuration reset: the peer address and the @tids value,
 * which is printed in hexadecimal.
 */
TRACE_EVENT(rdev_reset_tid_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer, u8 tids),
        TP_ARGS(wiphy, netdev, peer, tids),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tids)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tids = tids;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids)
);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>














































































































































































































































































































































































































    1 

    1 


    1 






    1 








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018-2020 Christoph Hellwig.
 *
 * DMA operations that map physical memory directly without using an IOMMU.
 */
#include <linux/memblock.h> /* for max_pfn */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/dma-map-ops.h>
#include <linux/scatterlist.h>
#include <linux/pfn.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/slab.h>
#include "direct.h"

/*
 * Number of address bits covered by ZONE_DMA.  The default of 24 bits
 * corresponds to the first 16 Megabytes, which is what most architectures
 * use ZONE_DMA for.  Some use it for entirely different regions; in that
 * case the arch code needs to override the variable below for dma-direct
 * to work properly.
 */
unsigned int zone_dma_bits __ro_after_init = 24;

/*
 * Translate a CPU physical address into the DMA address for @dev, using
 * the unencrypted translation when the device is forced to use unencrypted
 * DMA memory and the regular translation otherwise.
 */
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
                phys_addr_t phys)
{
        return force_dma_unencrypted(dev) ?
                phys_to_dma_unencrypted(dev, phys) :
                phys_to_dma(dev, phys);
}

/* Map a DMA address of @dev back to the struct page that backs it. */
static inline struct page *dma_direct_to_page(struct device *dev,
                dma_addr_t dma_addr)
{
        phys_addr_t paddr = dma_to_phys(dev, dma_addr);

        return pfn_to_page(PHYS_PFN(paddr));
}

/*
 * Return the smallest all-ones mask that covers the DMA address of the
 * last page frame below max_pfn.
 */
u64 dma_direct_get_required_mask(struct device *dev)
{
        phys_addr_t last_phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
        u64 highest_dma = phys_to_dma_direct(dev, last_phys);
        int top_bit = fls64(highest_dma) - 1;

        /* everything up to and including the highest set bit */
        return (1ULL << top_bit) * 2 - 1;
}

/*
 * Pick the zone modifier to try first for an allocation limited by
 * @dma_mask and the device's bus_dma_limit.  The physical address limit
 * derived from those is stored in @phys_limit.
 */
static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
                                  u64 *phys_limit)
{
        u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
        gfp_t zone_gfp = 0;

        *phys_limit = dma_to_phys(dev, dma_limit);

        /*
         * Start optimistically with the zone the physical limit falls into.
         * If the memory returned from it turns out not to be addressable,
         * the caller falls back to the next lower zone and retries.
         *
         * GFP_DMA32 and GFP_DMA are no-ops when the corresponding zone does
         * not exist.
         */
        if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
                zone_gfp = GFP_DMA;
        else if (*phys_limit <= DMA_BIT_MASK(32))
                zone_gfp = GFP_DMA32;

        return zone_gfp;
}

/*
 * Check that the @size bytes starting at physical address @phys translate
 * to DMA addresses within the device's coherent mask and bus limit.
 */
static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
        dma_addr_t first = phys_to_dma_direct(dev, phys);

        if (first != DMA_MAPPING_ERROR)
                return first + size - 1 <=
                        min_not_zero(dev->coherent_dma_mask,
                                     dev->bus_dma_limit);
        return false;
}

/*
 * Allocate @size bytes (expected to be page aligned) of physically
 * contiguous memory that passes dma_coherent_ok() for @dev.
 *
 * dma_alloc_contiguous() is tried first, then alloc_pages_node() on the
 * device's node.  A result that is not addressable is freed again and the
 * page allocator is retried with progressively more restrictive zone
 * modifiers (GFP_DMA32, then GFP_DMA).  Returns NULL when nothing suitable
 * could be allocated.
 */
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
                gfp_t gfp)
{
        int node = dev_to_node(dev);
        struct page *page = NULL;
        u64 phys_limit;

        WARN_ON_ONCE(!PAGE_ALIGNED(size));

        /* add the zone modifier matching the coherent mask */
        gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
                                           &phys_limit);
        page = dma_alloc_contiguous(dev, size, gfp);
        if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
                dma_free_contiguous(dev, page, size);
                page = NULL;
        }
again:
        if (!page)
                page = alloc_pages_node(node, gfp, get_order(size));
        if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
                dma_free_contiguous(dev, page, size);
                page = NULL;

                /* first fallback: no zone modifier was set yet, try DMA32 */
                if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
                    phys_limit < DMA_BIT_MASK(64) &&
                    !(gfp & (GFP_DMA32 | GFP_DMA))) {
                        gfp |= GFP_DMA32;
                        goto again;
                }

                /* last fallback: replace any DMA32 modifier with GFP_DMA */
                if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
                        gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
                        goto again;
                }
        }

        return page;
}

/*
 * Allocate from the atomic pools.  On success the DMA address is stored in
 * @dma_handle and the CPU address reported by dma_alloc_from_pool() is
 * returned; on failure NULL is returned and @dma_handle is left untouched.
 */
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
                dma_addr_t *dma_handle, gfp_t gfp)
{
        struct page *pool_page;
        void *vaddr;
        u64 limit;      /* filled in by the helper, not used further here */

        gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
                                           &limit);
        pool_page = dma_alloc_from_pool(dev, size, &vaddr, gfp,
                                        dma_coherent_ok);
        if (pool_page) {
                *dma_handle = phys_to_dma_direct(dev,
                                                 page_to_phys(pool_page));
                return vaddr;
        }
        return NULL;
}

/*
 * Allocate coherent memory for @dev without going through an IOMMU.
 *
 * @size is rounded up to a page multiple.  On success the DMA address is
 * stored in @dma_handle and a CPU cookie is returned: normally a kernel
 * virtual address, but for DMA_ATTR_NO_KERNEL_MAPPING (on devices that do
 * not force unencrypted memory) the struct page pointer itself.  Returns
 * NULL on failure.  Memory handed out with a kernel address is zeroed here.
 */
void *dma_direct_alloc(struct device *dev, size_t size,
                dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
        struct page *page;
        void *ret;
        int err;

        size = PAGE_ALIGN(size);
        if (attrs & DMA_ATTR_NO_WARN)
                gfp |= __GFP_NOWARN;

        if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
            !force_dma_unencrypted(dev)) {
                page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
                if (!page)
                        return NULL;
                /* remove any dirty cache lines on the kernel alias */
                if (!PageHighMem(page))
                        arch_dma_prep_coherent(page, size);
                *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
                /* return the page pointer as the opaque cookie */
                return page;
        }

        /*
         * Non-coherent device and neither generic mechanism (uncached
         * alias, remapping) is configured: defer to the architecture.
         */
        if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
            !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
            !dev_is_dma_coherent(dev))
                return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);

        /*
         * Remapping or decrypting memory may block. If either is required and
         * we can't block, allocate the memory from the atomic pools.
         */
        if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
            !gfpflags_allow_blocking(gfp) &&
            (force_dma_unencrypted(dev) ||
             (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
                return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);

        /* we always manually zero the memory once we are done */
        page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
        if (!page)
                return NULL;

        if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
             !dev_is_dma_coherent(dev)) ||
            (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
                /* remove any dirty cache lines on the kernel alias */
                arch_dma_prep_coherent(page, size);

                /* create a coherent mapping */
                ret = dma_common_contiguous_remap(page, size,
                                dma_pgprot(dev, PAGE_KERNEL, attrs),
                                __builtin_return_address(0));
                if (!ret)
                        goto out_free_pages;
                if (force_dma_unencrypted(dev)) {
                        err = set_memory_decrypted((unsigned long)ret,
                                                   PFN_UP(size));
                        /*
                         * NOTE(review): this path only frees the pages; the
                         * mapping created by dma_common_contiguous_remap()
                         * above does not appear to be torn down here -
                         * confirm whether that is intended.
                         */
                        if (err)
                                goto out_free_pages;
                }
                memset(ret, 0, size);
                goto done;
        }

        if (PageHighMem(page)) {
                /*
                 * Depending on the cma= arguments and per-arch setup
                 * dma_alloc_contiguous could return highmem pages.
                 * Without remapping there is no way to return them here,
                 * so log an error and fail.
                 */
                dev_info(dev, "Rejecting highmem page from CMA.\n");
                goto out_free_pages;
        }

        /* lowmem page: use its existing kernel address */
        ret = page_address(page);
        if (force_dma_unencrypted(dev)) {
                err = set_memory_decrypted((unsigned long)ret,
                                           PFN_UP(size));
                if (err)
                        goto out_free_pages;
        }

        memset(ret, 0, size);

        if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
            !dev_is_dma_coherent(dev)) {
                arch_dma_prep_coherent(page, size);
                /* ret is replaced by whatever the arch hook returns */
                ret = arch_dma_set_uncached(ret, size);
                if (IS_ERR(ret))
                        goto out_encrypt_pages;
        }
done:
        *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
        return ret;

out_encrypt_pages:
        /* undo the decryption done on the lowmem path before freeing */
        if (force_dma_unencrypted(dev)) {
                err = set_memory_encrypted((unsigned long)page_address(page),
                                           PFN_UP(size));
                /* If memory cannot be re-encrypted, it must be leaked */
                if (err)
                        return NULL;
        }
out_free_pages:
        dma_free_contiguous(dev, page, size);
        return NULL;
}

/*
 * Free memory obtained from dma_direct_alloc().
 *
 * @cpu_addr is the cookie dma_direct_alloc() returned and @dma_addr the
 * DMA address it stored; @size and @attrs must match the allocation.
 *
 * If the device uses unencrypted DMA memory and the buffer cannot be
 * re-encrypted, the memory is intentionally leaked rather than returned to
 * the allocator, matching the policy dma_direct_alloc() applies on its
 * error path.
 */
void dma_direct_free(struct device *dev, size_t size,
                void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
        if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
            !force_dma_unencrypted(dev)) {
                /* cpu_addr is a struct page cookie, not a kernel address */
                dma_free_contiguous(dev, cpu_addr, size);
                return;
        }

        /* mirrors the arch_dma_alloc() fallback in dma_direct_alloc() */
        if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
            !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
            !dev_is_dma_coherent(dev)) {
                arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
                return;
        }

        /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
        if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
            dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
                return;

        /* If memory cannot be re-encrypted, it must be leaked */
        if (force_dma_unencrypted(dev) &&
            set_memory_encrypted((unsigned long)cpu_addr, PFN_UP(size)))
                return;

        if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
                vunmap(cpu_addr);
        else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
                arch_dma_clear_uncached(cpu_addr, size);

        dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}

/*
 * Allocate zeroed pages for @dev and return the struct page.
 *
 * On success the DMA address is stored in @dma_handle.  Returns NULL on
 * failure, including when the allocator hands back a highmem page, which
 * cannot be used here because no remapping is done.
 */
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
        struct page *page;
        void *ret;

        if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
            force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) {
                /*
                 * dma_direct_alloc_from_pool() returns the kernel virtual
                 * address of the buffer, not its struct page.  Derive the
                 * page from the DMA address it stored instead of handing
                 * the virtual address back as a page pointer.
                 */
                if (!dma_direct_alloc_from_pool(dev, size, dma_handle, gfp))
                        return NULL;
                return dma_direct_to_page(dev, *dma_handle);
        }

        page = __dma_direct_alloc_pages(dev, size, gfp);
        if (!page)
                return NULL;
        if (PageHighMem(page)) {
                /*
                 * Depending on the cma= arguments and per-arch setup
                 * dma_alloc_contiguous could return highmem pages.
                 * Without remapping there is no way to return them here,
                 * so log an error and fail.
                 */
                dev_info(dev, "Rejecting highmem page from CMA.\n");
                goto out_free_pages;
        }

        ret = page_address(page);
        if (force_dma_unencrypted(dev)) {
                if (set_memory_decrypted((unsigned long)ret, PFN_UP(size)))
                        goto out_free_pages;
        }
        memset(ret, 0, size);
        *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
        return page;
out_free_pages:
        dma_free_contiguous(dev, page, size);
        return NULL;
}

/*
 * Free pages obtained from dma_direct_alloc_pages().
 *
 * If the device uses unencrypted DMA memory and the pages cannot be
 * re-encrypted, they are intentionally leaked rather than returned to the
 * allocator, matching the policy dma_direct_alloc() applies on its error
 * path.
 */
void dma_direct_free_pages(struct device *dev, size_t size,
                struct page *page, dma_addr_t dma_addr,
                enum dma_data_direction dir)
{
        void *vaddr = page_address(page);

        /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
        if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
            dma_free_from_pool(dev, vaddr, size))
                return;

        /* If memory cannot be re-encrypted, it must be leaked */
        if (force_dma_unencrypted(dev) &&
            set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)))
                return;

        dma_free_contiguous(dev, page, size);
}

#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_SWIOTLB)
/*
 * Make every entry of a mapped scatterlist visible to the device: entries
 * that live in a swiotlb buffer are synced through swiotlb, and for
 * non-coherent devices the arch sync hook is called on each entry.
 */
void dma_direct_sync_sg_for_device(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
        struct scatterlist *ent;
        int idx;

        for_each_sg(sgl, ent, nents, idx) {
                dma_addr_t dma = sg_dma_address(ent);
                phys_addr_t phys = dma_to_phys(dev, dma);

                if (unlikely(is_swiotlb_buffer(phys))) {
                        swiotlb_tbl_sync_single(dev, phys, ent->length,
                                        dir, SYNC_FOR_DEVICE);
                }

                if (!dev_is_dma_coherent(dev))
                        arch_sync_dma_for_device(phys, ent->length, dir);
        }
}
#endif

#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
    defined(CONFIG_SWIOTLB)
/*
 * Make every entry of a mapped scatterlist visible to the CPU.  Per entry:
 * arch sync for non-coherent devices, swiotlb sync for bounced entries and
 * arch_dma_mark_clean() for DMA_FROM_DEVICE transfers.  A final
 * arch_sync_dma_for_cpu_all() is issued for non-coherent devices.
 */
void dma_direct_sync_sg_for_cpu(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
        struct scatterlist *ent;
        int idx;

        for_each_sg(sgl, ent, nents, idx) {
                dma_addr_t dma = sg_dma_address(ent);
                phys_addr_t phys = dma_to_phys(dev, dma);

                if (!dev_is_dma_coherent(dev))
                        arch_sync_dma_for_cpu(phys, ent->length, dir);

                if (unlikely(is_swiotlb_buffer(phys))) {
                        swiotlb_tbl_sync_single(dev, phys, ent->length, dir,
                                        SYNC_FOR_CPU);
                }

                if (dir == DMA_FROM_DEVICE)
                        arch_dma_mark_clean(phys, ent->length);
        }

        if (!dev_is_dma_coherent(dev))
                arch_sync_dma_for_cpu_all();
}

void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
                int nents, enum dma_data_direction dir, unsigned long attrs)
{
        struct scatterlist *sg;
        int i;

        for_each_sg(sgl, sg, nents, i)
                dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
                             attrs);
}
#endif

/*
 * Map every entry of @sgl with dma_direct_map_page().
 *
 * Returns @nents on success.  If an entry fails to map, the entries mapped
 * so far are unmapped again (skipping CPU syncs) and 0 is returned.
 */
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
                enum dma_data_direction dir, unsigned long attrs)
{
        struct scatterlist *ent;
        int idx;

        for_each_sg(sgl, ent, nents, idx) {
                ent->dma_address = dma_direct_map_page(dev, sg_page(ent),
                                ent->offset, ent->length, dir, attrs);
                if (ent->dma_address == DMA_MAPPING_ERROR) {
                        /* roll back the idx entries already mapped */
                        dma_direct_unmap_sg(dev, sgl, idx, dir,
                                        attrs | DMA_ATTR_SKIP_CPU_SYNC);
                        return 0;
                }
                sg_dma_len(ent) = ent->length;
        }

        return nents;
}

/*
 * Map a resource at physical address @paddr for DMA.  The DMA address is
 * the physical address itself; it is only checked with dma_capable().
 * Returns DMA_MAPPING_ERROR (after logging and warning once) when the
 * range is not addressable by @dev.
 */
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
                size_t size, enum dma_data_direction dir, unsigned long attrs)
{
        dma_addr_t dma_addr = paddr;

        if (likely(dma_capable(dev, dma_addr, size, false)))
                return dma_addr;

        dev_err_once(dev,
                     "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
                     &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
        WARN_ON_ONCE(1);
        return DMA_MAPPING_ERROR;
}

/*
 * Build a single-entry sg_table describing the buffer at @dma_addr, with
 * the length rounded up to a page multiple.  Returns 0 on success or the
 * error from sg_alloc_table().
 */
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs)
{
        struct page *first_page = dma_direct_to_page(dev, dma_addr);
        int err = sg_alloc_table(sgt, 1, GFP_KERNEL);

        if (err)
                return err;

        sg_set_page(sgt->sgl, first_page, PAGE_ALIGN(size), 0);
        return 0;
}

bool dma_direct_can_mmap(struct device *dev)
{
        return dev_is_dma_coherent(dev) ||
                IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP);
}

/*
 * Map a buffer from dma_direct_alloc() into the user VMA @vma.
 *
 * The VMA's page protection is adjusted with dma_pgprot().  If the
 * per-device coherent area handles the request, its result is returned.
 * Otherwise the requested window (vm_pgoff plus the VMA length) must lie
 * within the page-aligned buffer, or -ENXIO is returned.
 */
int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs)
{
        unsigned long vma_nr_pages = vma_pages(vma);
        unsigned long buf_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
        unsigned long first_pfn = PHYS_PFN(dma_to_phys(dev, dma_addr));
        int ret = -ENXIO;

        vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);

        if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
                return ret;

        /* the offset must be inside the buffer and the window must fit */
        if (vma->vm_pgoff < buf_nr_pages &&
            vma_nr_pages <= buf_nr_pages - vma->vm_pgoff)
                return remap_pfn_range(vma, vma->vm_start,
                                first_pfn + vma->vm_pgoff,
                                vma_nr_pages << PAGE_SHIFT,
                                vma->vm_page_prot);

        return -ENXIO;
}

/*
 * Report whether @dev can use direct mapping with DMA mask @mask.
 * Returns 1 if the mask is usable, 0 otherwise.
 */
int dma_direct_supported(struct device *dev, u64 mask)
{
        /*
         * Widen before shifting: max_pfn is an unsigned long, so on 32-bit
         * configurations with more than 4GB of memory the shift would
         * otherwise wrap before the result is stored in the u64.
         */
        u64 min_mask = (u64)(max_pfn - 1) << PAGE_SHIFT;

        /*
         * Because 32-bit DMA masks are so common we expect every architecture
         * to be able to satisfy them - either by not supporting more physical
         * memory, or by providing a ZONE_DMA32.  If neither is the case, the
         * architecture needs to use an IOMMU instead of the direct mapping.
         */
        if (mask >= DMA_BIT_MASK(32))
                return 1;

        /*
         * This check needs to be against the actual bit mask value, so use
         * phys_to_dma_unencrypted() here so that the SME encryption mask isn't
         * part of the check.
         */
        if (IS_ENABLED(CONFIG_ZONE_DMA))
                min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
        return mask >= phys_to_dma_unencrypted(dev, min_mask);
}

/*
 * Largest single mapping supported for @dev: the swiotlb limit when
 * swiotlb is active and either forced or needed because the device has
 * limited addressing, otherwise SIZE_MAX.
 */
size_t dma_direct_max_mapping_size(struct device *dev)
{
        if (!is_swiotlb_active())
                return SIZE_MAX;

        if (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)
                return swiotlb_max_mapping_size(dev);

        return SIZE_MAX;
}

/*
 * A mapping needs explicit syncing when the device is not DMA coherent or
 * when the address falls inside a swiotlb buffer.
 */
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
        if (!dev_is_dma_coherent(dev))
                return true;
        return is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
}

/**
 * dma_direct_set_offset - Assign scalar offset for a single DMA range.
 * @dev:        device pointer; needed to "own" the alloced memory.
 * @cpu_start:  beginning of memory region covered by this offset.
 * @dma_start:  beginning of DMA/PCI region covered by this offset.
 * @size:        size of the region.
 *
 * Handles the simple case of one uniform offset that cannot be
 * discovered through "dma-ranges".  A zero offset needs no map and is
 * accepted without allocating anything.
 *
 * Return: 0 on success, -EINVAL if the device already has a range map,
 * -ENOMEM if the map cannot be allocated.
 *
 * Note: any call to this from a driver is a bug.  The mapping needs
 * to be described by the device tree or other firmware interfaces.
 */
int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
                         dma_addr_t dma_start, u64 size)
{
        const u64 offset = (u64)cpu_start - (u64)dma_start;
        struct bus_dma_region *ranges;

        if (dev->dma_range_map) {
                dev_err(dev, "attempt to add DMA range to existing map\n");
                return -EINVAL;
        }

        if (offset == 0)
                return 0;

        /* two zeroed entries are allocated; only the first is filled in */
        ranges = kcalloc(2, sizeof(*ranges), GFP_KERNEL);
        if (!ranges)
                return -ENOMEM;

        ranges->cpu_start = cpu_start;
        ranges->dma_start = dma_start;
        ranges->offset = offset;
        ranges->size = size;

        dev->dma_range_map = ranges;
        return 0;
}
EXPORT_SYMBOL_GPL(dma_direct_set_offset);













































































































































































































































    1 


    1 






















































































































































































































































































    1 












    1 





































































    1 






























    1 




    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include "blk.h"

/*
 * Validate one BLKPG request and carry it out on @bdev.
 *
 * @upart points at the user-space struct blkpg_partition describing the
 * partition and @op is the BLKPG_*_PARTITION opcode.  The byte-based start
 * and length from the request are shifted right by SECTOR_SHIFT before
 * being handed to the partition helpers.
 *
 * Returns -EACCES without CAP_SYS_ADMIN, -EFAULT if the request cannot be
 * copied in, -EINVAL for a malformed request or an unknown @op, otherwise
 * whatever the partition helper that was invoked returns.
 */
static int blkpg_do_ioctl(struct block_device *bdev,
                          struct blkpg_partition __user *upart, int op)
{
        struct blkpg_partition part;
        sector_t first_sect, nr_sects;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        if (copy_from_user(&part, upart, sizeof(part)))
                return -EFAULT;

        /* Refuse requests issued on a partition and non-positive numbers. */
        if (bdev_is_partition(bdev) || part.pno <= 0)
                return -EINVAL;

        /* Deletion only needs the partition number. */
        if (op == BLKPG_DEL_PARTITION)
                return bdev_del_partition(bdev, part.pno);

        /* Reject negative/empty ranges and start + length overflow. */
        if (part.start < 0 || part.length <= 0)
                return -EINVAL;
        if (LLONG_MAX - part.length < part.start)
                return -EINVAL;

        /* Both ends of the partition must sit on a logical block boundary. */
        if (!IS_ALIGNED(part.start | part.length,
                        bdev_logical_block_size(bdev)))
                return -EINVAL;

        first_sect = part.start >> SECTOR_SHIFT;
        nr_sects = part.length >> SECTOR_SHIFT;

        /*
         * When sector_t is narrower than long long, make sure the sector
         * values survive a round trip through 'long' and that the
         * partition number stays within 16 bits.
         */
        if (sizeof(sector_t) < sizeof(long long)) {
                long lstart = first_sect;
                long llength = nr_sects;

                if (lstart != first_sect || llength != nr_sects)
                        return -EINVAL;
                if (lstart < 0 || llength < 0 || part.pno > 65535)
                        return -EINVAL;
        }

        if (op == BLKPG_ADD_PARTITION)
                return bdev_add_partition(bdev, part.pno, first_sect,
                                          nr_sects);
        if (op == BLKPG_RESIZE_PARTITION)
                return bdev_resize_partition(bdev, part.pno, first_sect,
                                             nr_sects);

        return -EINVAL;
}

/*
 * Native BLKPG entry point: fetch the opcode and the user pointer to the
 * partition description from @arg, then let blkpg_do_ioctl() do the work.
 * Returns -EFAULT if either field cannot be read from user space.
 */
static int blkpg_ioctl(struct block_device *bdev,
                       struct blkpg_ioctl_arg __user *arg)
{
        struct blkpg_partition __user *upart;
        int opcode;

        if (get_user(opcode, &arg->op))
                return -EFAULT;
        if (get_user(upart, &arg->data))
                return -EFAULT;

        return blkpg_do_ioctl(bdev, upart, opcode);
}

#ifdef CONFIG_COMPAT
/*
 * BLKPG argument block as laid out by compat callers.  Every member uses a
 * compat_* type; compat_blkpg_ioctl() reads only @op and @data from it.
 */
struct compat_blkpg_ioctl_arg {
        compat_int_t op;        /* BLKPG opcode, passed on to blkpg_do_ioctl() */
        compat_int_t flags;     /* not read by the handlers in this file */
        compat_int_t datalen;   /* not read by the handlers in this file */
        compat_caddr_t data;    /* compat user pointer, converted with compat_ptr() */
};

/*
 * Compat BLKPG entry point: read the opcode and the compat data pointer
 * from @arg, widen the pointer with compat_ptr() and hand the request to
 * blkpg_do_ioctl().  Returns -EFAULT if either field cannot be read.
 */
static int compat_blkpg_ioctl(struct block_device *bdev,
                              struct compat_blkpg_ioctl_arg __user *arg)
{
        compat_caddr_t data32;
        int opcode;

        if (get_user(opcode, &arg->op))
                return -EFAULT;
        if (get_user(data32, &arg->data))
                return -EFAULT;

        return blkpg_do_ioctl(bdev, compat_ptr(data32), opcode);
}
#endif

/*
 * BLKRRPART: request a partition rescan of the disk behind @bdev.
 *
 * Returns -EINVAL if partition scanning is disabled for the disk or @bdev
 * is itself a partition, -EACCES without CAP_SYS_ADMIN, -EBUSY while
 * bd_part_count is non-zero, or the error from reopening the device.
 */
static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
{
        struct block_device *reopened;

        if (!disk_part_scan_enabled(bdev->bd_disk))
                return -EINVAL;
        if (bdev_is_partition(bdev))
                return -EINVAL;
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        if (bdev->bd_part_count != 0)
                return -EBUSY;

        /*
         * Flag the disk as needing a scan, then open and immediately close
         * it again (never exclusively) so the driver state is revalidated
         * and the rescan is forced.
         */
        mode &= ~FMODE_EXCL;
        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);

        reopened = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
        if (IS_ERR(reopened))
                return PTR_ERR(reopened);

        blkdev_put(reopened, mode);
        return 0;
}

/*
 * BLKDISCARD / BLKSECDISCARD: discard a byte range of @bdev.
 *
 * @arg is a user pointer to two u64 values: the byte offset and the byte
 * length of the range.  @flags is passed through to blkdev_issue_discard().
 *
 * Returns -EBADF if the device was not opened for writing, -EOPNOTSUPP if
 * the queue does not support discard, -EFAULT if the range cannot be
 * copied in, -EINVAL if the range is empty, not 512-byte aligned or does
 * not lie inside the device, otherwise the result of invalidating the
 * range and issuing the discard.
 */
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
                unsigned long arg, unsigned long flags)
{
        uint64_t range[2];
        uint64_t start, len, size;
        struct request_queue *q = bdev_get_queue(bdev);
        int err;

        if (!(mode & FMODE_WRITE))
                return -EBADF;

        if (!blk_queue_discard(q))
                return -EOPNOTSUPP;

        if (copy_from_user(range, (void __user *)arg, sizeof(range)))
                return -EFAULT;

        start = range[0];
        len = range[1];

        if (start & 511)
                return -EINVAL;
        if (len & 511)
                return -EINVAL;

        /*
         * An empty range has nothing to discard, and the inclusive end
         * offset computed below (start + len - 1) would land before
         * @start, or wrap to the maximum value when @start is 0.
         */
        if (!len)
                return -EINVAL;

        /*
         * Bounds check written so it cannot wrap: "start + len > size"
         * overflows for large user-supplied values and would let an
         * out-of-range request through.
         */
        size = i_size_read(bdev->bd_inode);
        if (start > size || len > size - start)
                return -EINVAL;

        /* Drop cached pages for the range before discarding it. */
        err = truncate_bdev_range(bdev, mode, start, start + len - 1);
        if (err)
                return err;

        return blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                    GFP_KERNEL, flags);
}

/*
 * BLKZEROOUT: write zeroes over a byte range of @bdev.
 *
 * @arg is a user pointer to two u64 values: byte offset and byte length.
 * Returns -EBADF if the device was not opened for writing, -EFAULT if the
 * range cannot be copied in, -EINVAL if the range is misaligned, wraps
 * around or reaches past the device size, otherwise the result of
 * invalidating the range and issuing the zero-out.
 */
static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
                unsigned long arg)
{
        uint64_t req[2];
        uint64_t first, nbytes, last;
        int ret;

        if (!(mode & FMODE_WRITE))
                return -EBADF;

        if (copy_from_user(req, (void __user *)arg, sizeof(req)))
                return -EFAULT;

        first = req[0];
        nbytes = req[1];
        last = first + nbytes - 1;      /* inclusive end offset */

        /* Offset and length must both be multiples of 512 bytes. */
        if ((first & 511) || (nbytes & 511))
                return -EINVAL;
        /* The last byte must be inside the device ... */
        if (last >= (uint64_t)i_size_read(bdev->bd_inode))
                return -EINVAL;
        /* ... and must not precede the first (wrap-around or empty range). */
        if (last < first)
                return -EINVAL;

        /* Invalidate the page cache, including dirty pages */
        ret = truncate_bdev_range(bdev, mode, first, last);
        if (ret)
                return ret;

        return blkdev_issue_zeroout(bdev, first >> 9, nbytes >> 9, GFP_KERNEL,
                        BLKDEV_ZERO_NOUNMAP);
}

/* Store @val at the user-space address @argp; returns put_user()'s result. */
static int put_ushort(unsigned short __user *argp, unsigned short val)
{
        return put_user(val, argp);
}

/* Store the int @val at the user-space address @argp via put_user(). */
static int put_int(int __user *argp, int val)
{
        return put_user(val, argp);
}

/* Store the unsigned int @val at the user-space address @argp via put_user(). */
static int put_uint(unsigned int __user *argp, unsigned int val)
{
        return put_user(val, argp);
}

/* Store the long @val at the user-space address @argp via put_user(). */
static int put_long(long __user *argp, long val)
{
        return put_user(val, argp);
}

/* Store the unsigned long @val at the user-space address @argp via put_user(). */
static int put_ulong(unsigned long __user *argp, unsigned long val)
{
        return put_user(val, argp);
}

/* Store the u64 @val at the user-space address @argp via put_user(). */
static int put_u64(u64 __user *argp, u64 val)
{
        return put_user(val, argp);
}

#ifdef CONFIG_COMPAT
/*
 * Store @val through a compat_long_t user pointer.  Note that @val is a
 * native long while the destination is compat_long_t.
 */
static int compat_put_long(compat_long_t __user *argp, long val)
{
        return put_user(val, argp);
}

/* Store the compat_ulong_t @val at the user-space address @argp via put_user(). */
static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val)
{
        return put_user(val, argp);
}
#endif

/*
 * Forward an ioctl to the ->ioctl method of the driver owning @bdev.
 * Returns -ENOTTY when the driver does not provide that method.
 */
int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned cmd, unsigned long arg)
{
        struct gendisk *disk = bdev->bd_disk;

        if (!disk->fops->ioctl)
                return -ENOTTY;

        return disk->fops->ioctl(bdev, mode, cmd, arg);
}
/*
 * For the record: _GPL here is only because somebody decided to slap it
 * on the previous export.  Sheer idiocy, since it wasn't copyrightable
 * at all and could be open-coded without any exports by anybody who cares.
 */
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);

#ifdef CONFIG_COMPAT
/*
 * Block-layer counterpart of compat_ptr_ioctl(): meant for block drivers
 * whose commands are all fully compatible between 32-bit and 64-bit user
 * space.  @arg is converted with compat_ptr() and passed to the driver's
 * regular ->ioctl method; -ENOIOCTLCMD is returned if the driver has none.
 */
int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned cmd, unsigned long arg)
{
        struct gendisk *disk = bdev->bd_disk;

        if (!disk->fops->ioctl)
                return -ENOIOCTLCMD;

        return disk->fops->ioctl(bdev, mode, cmd,
                                 (unsigned long)compat_ptr(arg));
}
EXPORT_SYMBOL(blkdev_compat_ptr_ioctl);
#endif

/*
 * IOC_PR_REGISTER: copy a struct pr_registration from user space and pass
 * it to the driver's ->pr_register method.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP if the driver lacks
 * the method or a flag other than PR_FL_IGNORE_KEY is set, -EFAULT if the
 * copy fails, otherwise the driver's result.
 */
static int blkdev_pr_register(struct block_device *bdev,
                struct pr_registration __user *arg)
{
        const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
        struct pr_registration req;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!(pr && pr->pr_register))
                return -EOPNOTSUPP;
        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        /* PR_FL_IGNORE_KEY is the only flag accepted here. */
        if ((req.flags & ~PR_FL_IGNORE_KEY) != 0)
                return -EOPNOTSUPP;

        return pr->pr_register(bdev, req.old_key, req.new_key, req.flags);
}

/*
 * IOC_PR_RESERVE: copy a struct pr_reservation from user space and pass it
 * to the driver's ->pr_reserve method.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP if the driver lacks
 * the method or a flag other than PR_FL_IGNORE_KEY is set, -EFAULT if the
 * copy fails, otherwise the driver's result.
 */
static int blkdev_pr_reserve(struct block_device *bdev,
                struct pr_reservation __user *arg)
{
        const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
        struct pr_reservation req;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!(pr && pr->pr_reserve))
                return -EOPNOTSUPP;
        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        /* PR_FL_IGNORE_KEY is the only flag accepted here. */
        if ((req.flags & ~PR_FL_IGNORE_KEY) != 0)
                return -EOPNOTSUPP;

        return pr->pr_reserve(bdev, req.key, req.type, req.flags);
}

/*
 * IOC_PR_RELEASE: copy a struct pr_reservation from user space and pass
 * its key and type to the driver's ->pr_release method.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP if the driver lacks
 * the method or any flag is set, -EFAULT if the copy fails, otherwise the
 * driver's result.
 */
static int blkdev_pr_release(struct block_device *bdev,
                struct pr_reservation __user *arg)
{
        const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
        struct pr_reservation req;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!(pr && pr->pr_release))
                return -EOPNOTSUPP;
        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        /* No flags are accepted for a release. */
        if (req.flags != 0)
                return -EOPNOTSUPP;

        return pr->pr_release(bdev, req.key, req.type);
}

/*
 * IOC_PR_PREEMPT / IOC_PR_PREEMPT_ABORT: copy a struct pr_preempt from
 * user space and pass it, together with @abort, to the driver's
 * ->pr_preempt method.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP if the driver lacks
 * the method or any flag is set, -EFAULT if the copy fails, otherwise the
 * driver's result.
 */
static int blkdev_pr_preempt(struct block_device *bdev,
                struct pr_preempt __user *arg, bool abort)
{
        const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
        struct pr_preempt req;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!(pr && pr->pr_preempt))
                return -EOPNOTSUPP;
        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        /* No flags are accepted for a preempt. */
        if (req.flags != 0)
                return -EOPNOTSUPP;

        return pr->pr_preempt(bdev, req.old_key, req.new_key, req.type,
                              abort);
}

/*
 * IOC_PR_CLEAR: copy a struct pr_clear from user space and pass its key to
 * the driver's ->pr_clear method.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP if the driver lacks
 * the method or any flag is set, -EFAULT if the copy fails, otherwise the
 * driver's result.
 */
static int blkdev_pr_clear(struct block_device *bdev,
                struct pr_clear __user *arg)
{
        const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
        struct pr_clear req;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!(pr && pr->pr_clear))
                return -EOPNOTSUPP;
        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        /* No flags are accepted for a clear. */
        if (req.flags != 0)
                return -EOPNOTSUPP;

        return pr->pr_clear(bdev, req.key);
}

/*
 * Does @ret mean "the driver did not recognize this ioctl"?
 *
 * The proper answers from a driver are ENOTTY (final) or ENOIOCTLCMD
 * ("I don't know this one, try a fallback"); the ioctl code turns
 * ENOIOCTLCMD into ENOTTY before returning to user space.
 *
 * Confused drivers sometimes return EINVAL instead, which is wrong: it
 * means "I understood the command, but its parameters were bad".  It is
 * still treated as unrecognized here.  The broken drivers should be
 * fixed so that the EINVAL case can go away.
 */
static inline int is_unrecognized_ioctl(int ret)
{
        switch (ret) {
        case -EINVAL:
        case -ENOTTY:
        case -ENOIOCTLCMD:
                return 1;
        default:
                return 0;
        }
}

/*
 * BLKFLSBUF: give the driver the first chance to handle the command; if
 * it does not recognize it, sync and invalidate the block device here.
 * Returns -EACCES without CAP_SYS_ADMIN.
 */
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
                unsigned cmd, unsigned long arg)
{
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
        if (is_unrecognized_ioctl(ret)) {
                /* Generic fallback. */
                fsync_bdev(bdev);
                invalidate_bdev(bdev);
                ret = 0;
        }

        return ret;
}

/*
 * BLKROSET: set or clear the read-only state of @bdev.
 *
 * The driver's ->ioctl gets the first chance; only when it does not
 * recognize the command is the int at user address @arg read, offered to
 * the optional ->set_read_only method and then applied with
 * set_device_ro().  Returns -EACCES without CAP_SYS_ADMIN and -EFAULT if
 * the value cannot be read.
 */
static int blkdev_roset(struct block_device *bdev, fmode_t mode,
                unsigned cmd, unsigned long arg)
{
        int err, ro;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        err = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
        if (!is_unrecognized_ioctl(err))
                return err;

        if (get_user(ro, (int __user *)arg))
                return -EFAULT;

        /* The driver may refuse the change. */
        if (bdev->bd_disk->fops->set_read_only) {
                err = bdev->bd_disk->fops->set_read_only(bdev, ro);
                if (err)
                        return err;
        }

        set_device_ro(bdev, ro);
        return 0;
}

/*
 * HDIO_GETGEO: ask the driver for the disk geometry and copy it to @argp.
 *
 * Returns -EINVAL for a NULL @argp, -ENOTTY if the driver has no ->getgeo
 * method, the driver's error if ->getgeo fails, or -EFAULT if the result
 * cannot be copied out.
 */
static int blkdev_getgeo(struct block_device *bdev,
                struct hd_geometry __user *argp)
{
        struct gendisk *disk = bdev->bd_disk;
        struct hd_geometry geometry;
        int err;

        if (!argp)
                return -EINVAL;
        if (!disk->fops->getgeo)
                return -ENOTTY;

        /*
         * Start from a zeroed structure with the start sector filled in
         * up front; the driver may want to override it.
         */
        memset(&geometry, 0, sizeof(geometry));
        geometry.start = get_start_sect(bdev);

        err = disk->fops->getgeo(bdev, &geometry);
        if (err)
                return err;

        return copy_to_user(argp, &geometry, sizeof(geometry)) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Geometry structure as seen by compat callers of HDIO_GETGEO.
 * compat_hdio_getgeo() fills the first four bytes by copying from a native
 * struct hd_geometry and writes @start separately as a u32.
 */
struct compat_hd_geometry {
        unsigned char heads;
        unsigned char sectors;
        unsigned short cylinders;
        u32 start;
};

/*
 * Compat HDIO_GETGEO: ask the driver for the disk geometry and store it in
 * the compat layout at @ugeo.
 *
 * Returns -EINVAL for a NULL @ugeo, -ENOTTY if the driver has no ->getgeo
 * method, the driver's error if ->getgeo fails, or -EFAULT if any part of
 * the result cannot be written to user space.
 */
static int compat_hdio_getgeo(struct block_device *bdev,
                              struct compat_hd_geometry __user *ugeo)
{
        struct gendisk *disk = bdev->bd_disk;
        struct hd_geometry geometry;
        int err;

        if (!ugeo)
                return -EINVAL;
        if (!disk->fops->getgeo)
                return -ENOTTY;

        /*
         * Start from a zeroed structure with the start sector filled in
         * up front; the driver may want to override it.
         */
        memset(&geometry, 0, sizeof(geometry));
        geometry.start = get_start_sect(bdev);

        err = disk->fops->getgeo(bdev, &geometry);
        if (err)
                return err;

        /*
         * The first four bytes are copied verbatim; the start sector is
         * stored separately through the u32 member of the compat struct.
         * Both stores are always attempted.
         */
        err = copy_to_user(ugeo, &geometry, 4);
        err |= put_user(geometry.start, &ugeo->start);

        return err ? -EFAULT : 0;
}
#endif

/*
 * BLKBSZSET: read an int from @argp and apply it with set_blocksize().
 *
 * If the caller did not open the device with FMODE_EXCL, the device is
 * additionally opened with FMODE_EXCL for the duration of the change;
 * failure to do so is reported as -EBUSY.  Returns -EACCES without
 * CAP_SYS_ADMIN, -EINVAL for a NULL @argp and -EFAULT if the value cannot
 * be read.
 */
static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
                int __user *argp)
{
        int err, size;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        if (!argp)
                return -EINVAL;
        if (get_user(size, argp))
                return -EFAULT;

        if (!(mode & FMODE_EXCL)) {
                /* Take a temporary exclusive open around the change. */
                if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL,
                                             &bdev)))
                        return -EBUSY;
                err = set_blocksize(bdev, size);
                blkdev_put(bdev, mode | FMODE_EXCL);
                return err;
        }

        /* The caller already holds the device exclusively. */
        return set_blocksize(bdev, size);
}

/*
 * Common commands that are handled the same way on native and compat
 * user space. Note the separate arg/argp parameters that are needed
 * to deal with the compat_ptr() conversion.
 *
 * @arg is the raw ioctl argument and @argp the same argument as a user
 * pointer, as prepared by the caller.  Returns the result of the handler
 * for @cmd, or -ENOIOCTLCMD when @cmd is not one of the commands listed
 * here so that the caller can fall back to the driver.
 */
static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
                                unsigned cmd, unsigned long arg, void __user *argp)
{
        unsigned int max_sectors;

        switch (cmd) {
        case BLKFLSBUF:
                return blkdev_flushbuf(bdev, mode, cmd, arg);
        case BLKROSET:
                return blkdev_roset(bdev, mode, cmd, arg);
        case BLKDISCARD:
                return blk_ioctl_discard(bdev, mode, arg, 0);
        case BLKSECDISCARD:
                return blk_ioctl_discard(bdev, mode, arg,
                                BLKDEV_DISCARD_SECURE);
        case BLKZEROOUT:
                return blk_ioctl_zeroout(bdev, mode, arg);
        /* Zoned block device commands */
        case BLKREPORTZONE:
                return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
        case BLKRESETZONE:
        case BLKOPENZONE:
        case BLKCLOSEZONE:
        case BLKFINISHZONE:
                return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg);
        case BLKGETZONESZ:
                return put_uint(argp, bdev_zone_sectors(bdev));
        case BLKGETNRZONES:
                return put_uint(argp, blkdev_nr_zones(bdev->bd_disk));
        /* Simple queries: each stores one value at argp */
        case BLKROGET:
                return put_int(argp, bdev_read_only(bdev) != 0);
        case BLKSSZGET: /* get block device logical block size */
                return put_int(argp, bdev_logical_block_size(bdev));
        case BLKPBSZGET: /* get block device physical block size */
                return put_uint(argp, bdev_physical_block_size(bdev));
        case BLKIOMIN:
                return put_uint(argp, bdev_io_min(bdev));
        case BLKIOOPT:
                return put_uint(argp, bdev_io_opt(bdev));
        case BLKALIGNOFF:
                return put_int(argp, bdev_alignment_offset(bdev));
        case BLKDISCARDZEROES:
                /* always reported as 0 */
                return put_uint(argp, 0);
        case BLKSECTGET:
                /* clamp to USHRT_MAX: the result is stored as an unsigned short */
                max_sectors = min_t(unsigned int, USHRT_MAX,
                                    queue_max_sectors(bdev_get_queue(bdev)));
                return put_ushort(argp, max_sectors);
        case BLKROTATIONAL:
                return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev)));
        case BLKRASET:
        case BLKFRASET:
                if(!capable(CAP_SYS_ADMIN))
                        return -EACCES;
                /* arg counts 512-byte units; ra_pages is kept in pages */
                bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
                return 0;
        case BLKRRPART:
                return blkdev_reread_part(bdev, mode);
        case BLKTRACESTART:
        case BLKTRACESTOP:
        case BLKTRACETEARDOWN:
                return blk_trace_ioctl(bdev, cmd, argp);
        /* Persistent reservation commands */
        case IOC_PR_REGISTER:
                return blkdev_pr_register(bdev, argp);
        case IOC_PR_RESERVE:
                return blkdev_pr_reserve(bdev, argp);
        case IOC_PR_RELEASE:
                return blkdev_pr_release(bdev, argp);
        case IOC_PR_PREEMPT:
                return blkdev_pr_preempt(bdev, argp, false);
        case IOC_PR_PREEMPT_ABORT:
                return blkdev_pr_preempt(bdev, argp, true);
        case IOC_PR_CLEAR:
                return blkdev_pr_clear(bdev, argp);
        default:
                /* not a common command: let the caller try the driver */
                return -ENOIOCTLCMD;
        }
}

/*
 * Native ioctl entry point for block devices.
 *
 * Always keep this in sync with compat_blkdev_ioctl()
 * to handle all incompatible commands in both functions.
 *
 * New commands must be compatible and go into blkdev_common_ioctl
 *
 * Commands whose data layout or command number differs between native and
 * compat callers are handled in the switch below.  Everything else goes to
 * blkdev_common_ioctl(), and if that returns -ENOIOCTLCMD the command is
 * passed to the driver through __blkdev_driver_ioctl().
 */
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                        unsigned long arg)
{
        int ret;
        loff_t size;
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        /* These need separate implementations for the data structure */
        case HDIO_GETGEO:
                return blkdev_getgeo(bdev, argp);
        case BLKPG:
                return blkpg_ioctl(bdev, argp);

        /* Compat mode returns 32-bit data instead of 'long' */
        case BLKRAGET:
        case BLKFRAGET:
                if (!argp)
                        return -EINVAL;
                /* ra_pages is kept in pages; report it in 512-byte units */
                return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512);
        case BLKGETSIZE:
                size = i_size_read(bdev->bd_inode);
                /* the count of 512-byte units must fit in an unsigned long */
                if ((size >> 9) > ~0UL)
                        return -EFBIG;
                return put_ulong(argp, size >> 9);

        /* The data is compatible, but the command number is different */
        case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
                return put_int(argp, block_size(bdev));
        case BLKBSZSET:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64:
                return put_u64(argp, i_size_read(bdev->bd_inode));

        /* Incompatible alignment on i386 */
        case BLKTRACESETUP:
                return blk_trace_ioctl(bdev, cmd, argp);
        default:
                break;
        }

        ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
        /* not a generic command: give the driver a chance */
        if (ret == -ENOIOCTLCMD)
                return __blkdev_driver_ioctl(bdev, mode, cmd, arg);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */

#ifdef CONFIG_COMPAT

#define BLKBSZGET_32                _IOR(0x12, 112, int)
#define BLKBSZSET_32                _IOW(0x12, 113, int)
#define BLKGETSIZE64_32                _IOR(0x12, 114, int)

/*
 * Compat ioctl entry point for block devices.
 *
 * Most of the generic ioctls are handled in the normal fallback path.
 * This assumes the blkdev's low level compat_ioctl always returns
 * ENOIOCTLCMD for unknown ioctls.
 *
 * Commands whose data layout or command number differs for compat callers
 * are handled in the switch below.  Everything else goes to
 * blkdev_common_ioctl(); if that returns -ENOIOCTLCMD and the driver has a
 * ->compat_ioctl method, the command is passed on to it.
 */
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        int ret;
        void __user *argp = compat_ptr(arg);
        struct inode *inode = file->f_mapping->host;
        struct block_device *bdev = inode->i_bdev;
        struct gendisk *disk = bdev->bd_disk;
        fmode_t mode = file->f_mode;
        loff_t size;

        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
         * to update it before every ioctl.
         */
        if (file->f_flags & O_NDELAY)
                mode |= FMODE_NDELAY;
        else
                mode &= ~FMODE_NDELAY;

        switch (cmd) {
        /* These need separate implementations for the data structure */
        case HDIO_GETGEO:
                return compat_hdio_getgeo(bdev, argp);
        case BLKPG:
                return compat_blkpg_ioctl(bdev, argp);

        /* Compat mode returns 32-bit data instead of 'long' */
        case BLKRAGET:
        case BLKFRAGET:
                if (!argp)
                        return -EINVAL;
                /* ra_pages is kept in pages; report it in 512-byte units */
                return compat_put_long(argp,
                               (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
        case BLKGETSIZE:
                size = i_size_read(bdev->bd_inode);
                /* the count of 512-byte units must fit in a compat_ulong_t */
                if ((size >> 9) > ~(compat_ulong_t)0)
                        return -EFBIG;
                return compat_put_ulong(argp, size >> 9);

        /* The data is compatible, but the command number is different */
        /*
         * NOTE(review): the native BLKBSZGET handler in blkdev_ioctl()
         * reports block_size(bdev), whereas this compat variant reports
         * the logical block size -- confirm the difference is intended.
         */
        case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
                return put_int(argp, bdev_logical_block_size(bdev));
        case BLKBSZSET_32:
                return blkdev_bszset(bdev, mode, argp);
        case BLKGETSIZE64_32:
                return put_u64(argp, i_size_read(bdev->bd_inode));

        /* Incompatible alignment on i386 */
        case BLKTRACESETUP32:
                return blk_trace_ioctl(bdev, cmd, argp);
        default:
                break;
        }

        ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
        /* not a generic command: try the driver's compat handler, if any */
        if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
                ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);

        return ret;
}
#endif



































































































































































































































































































































































































































































































































































































































































































    1 

    1 














































































    1 
    1 















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.c - library routines for handling generic kernel objects
 *
 * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2007 Novell Inc.
 *
 * Please see the file Documentation/core-api/kobject.rst for critical information
 * about using the kobject interface.
 */

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/random.h>

/**
 * kobject_namespace() - Return @kobj's namespace tag.
 * @kobj: kobject in question
 *
 * If kobj_ns_ops() yields namespace operations for @kobj whose type is
 * not %KOBJ_NS_TYPE_NONE, the tag is obtained from the ->namespace()
 * callback of @kobj's ktype.  Returns %NULL otherwise.
 */
const void *kobject_namespace(struct kobject *kobj)
{
        const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);

        if (ns_ops && ns_ops->type != KOBJ_NS_TYPE_NONE)
                return kobj->ktype->namespace(kobj);

        return NULL;
}

/**
 * kobject_get_ownership() - Get sysfs ownership data for @kobj.
 * @kobj: kobject in question
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Produces the initial uid/gid pair to use when creating the sysfs
 * representation of @kobj.  The pair defaults to the global root IDs; a
 * ktype that provides ->get_ownership() may overwrite either value.
 * Normally used to adjust ownership of objects in a container.
 */
void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        /* Defaults, in place before the ktype gets a say. */
        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;

        if (!kobj->ktype->get_ownership)
                return;

        kobj->ktype->get_ownership(kobj, uid, gid);
}

/*
 * populate_dir - populate directory with attributes.
 * @kobj: object we're working on.
 *
 * Helper used while registering an object: walks the NULL-terminated
 * default_attrs array of the object's ktype and creates a sysfs file for
 * each entry.  Stops at, and returns, the first error; returns 0 when
 * every file was created or there is nothing to create.
 */
static int populate_dir(struct kobject *kobj)
{
        struct kobj_type *ktype = get_ktype(kobj);
        int idx;
        int err;

        if (!ktype || !ktype->default_attrs)
                return 0;

        for (idx = 0; ktype->default_attrs[idx] != NULL; idx++) {
                err = sysfs_create_file(kobj, ktype->default_attrs[idx]);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Create the sysfs directory for @kobj and fill it with the default
 * attributes and default groups of its ktype.  If populating fails, the
 * directory is removed again and the error is returned.
 */
static int create_dir(struct kobject *kobj)
{
        const struct kobj_type *ktype = get_ktype(kobj);
        const struct kobj_ns_type_operations *child_ops;
        int err;

        err = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
        if (err)
                return err;

        err = populate_dir(kobj);
        if (err)
                goto remove_dir;

        if (ktype) {
                err = sysfs_create_groups(kobj, ktype->default_groups);
                if (err)
                        goto remove_dir;
        }

        /*
         * An ancestor going away may delete @kobj->sd.  Take an extra
         * reference so that it stays around until @kobj is gone.
         */
        sysfs_get(kobj->sd);

        /*
         * When @kobj has ns_ops, its children have to be filtered by
         * their namespace tags, so namespace support is enabled on
         * @kobj->sd.
         */
        child_ops = kobj_child_ns_ops(kobj);
        if (child_ops) {
                BUG_ON(child_ops->type <= KOBJ_NS_TYPE_NONE);
                BUG_ON(child_ops->type >= KOBJ_NS_TYPES);
                BUG_ON(!kobj_ns_type_registered(child_ops->type));

                sysfs_enable_ns(kobj->sd);
        }

        return 0;

remove_dir:
        sysfs_remove_dir(kobj);
        return err;
}

/*
 * Compute the buffer size needed for the path of @kobj: for every level
 * from @kobj up to the topmost ancestor, the name length plus one byte
 * for the leading '/', plus one extra byte on top.  Returns 0 when any
 * kobject on the way has no name.
 */
static int get_kobj_path_length(struct kobject *kobj)
{
        struct kobject *node = kobj;
        int total = 1;

        do {
                const char *name = kobject_name(node);

                if (name == NULL)
                        return 0;
                /* name plus its leading '/' */
                total += strlen(name) + 1;
                node = node->parent;
        } while (node);

        return total;
}

/*
 * Write the path of @kobj into @path, working backwards from the end of
 * the @length byte buffer: each level's name is copied in front of what
 * is already there and prefixed with '/'.  Returns -EINVAL when the
 * names do not fit into @length bytes, 0 otherwise.
 */
static int fill_kobj_path(struct kobject *kobj, char *path, int length)
{
        struct kobject *node = kobj;
        int pos = length - 1;

        while (node) {
                int name_len = strlen(kobject_name(node));

                /* step back far enough to hold this name and its '/' */
                pos -= name_len;
                if (pos <= 0)
                        return -EINVAL;
                memcpy(path + pos, kobject_name(node), name_len);
                pos--;
                path[pos] = '/';

                node = node->parent;
        }

        pr_debug("kobject: '%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
                 kobj, __func__, path);

        return 0;
}

/**
 * kobject_get_path() - Allocate memory and fill in the path for @kobj.
 * @kobj:        kobject in question, with which to build the path
 * @gfp_mask:        the allocation type used to allocate the path
 *
 * Return: The newly allocated memory, caller must free with kfree().
 */
char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
{
        for (;;) {
                char *buf;
                int size = get_kobj_path_length(kobj);

                if (size == 0)
                        return NULL;

                buf = kzalloc(size, gfp_mask);
                if (!buf)
                        return NULL;

                if (!fill_kobj_path(kobj, buf, size))
                        return buf;

                /*
                 * The names no longer fit into the size computed above;
                 * throw the buffer away and measure again.
                 */
                kfree(buf);
        }
}
EXPORT_SYMBOL_GPL(kobject_get_path);

/*
 * Link @kobj onto the tail of its kset's list, taking a reference on the
 * kset first.  Does nothing for a kobject without a kset.
 */
static void kobj_kset_join(struct kobject *kobj)
{
        if (kobj->kset) {
                kset_get(kobj->kset);
                spin_lock(&kobj->kset->list_lock);
                list_add_tail(&kobj->entry, &kobj->kset->list);
                spin_unlock(&kobj->kset->list_lock);
        }
}

/*
 * Unlink @kobj from its kset's list and drop the kset reference.  Does
 * nothing for a kobject without a kset.
 */
static void kobj_kset_leave(struct kobject *kobj)
{
        if (kobj->kset) {
                spin_lock(&kobj->kset->list_lock);
                list_del_init(&kobj->entry);
                spin_unlock(&kobj->kset->list_lock);
                kset_put(kobj->kset);
        }
}

/*
 * Reset the bookkeeping of @kobj: fresh refcount, empty list entry, all
 * sysfs/uevent state flags cleared, and mark it initialized.  A NULL
 * @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
        if (kobj) {
                kref_init(&kobj->kref);
                INIT_LIST_HEAD(&kobj->entry);

                kobj->state_in_sysfs = 0;
                kobj->state_add_uevent_sent = 0;
                kobj->state_remove_uevent_sent = 0;
                kobj->state_initialized = 1;
        }
}


/*
 * Hook an already named @kobj into the hierarchy: take a reference on
 * its parent (falling back to the kset's kobject), join the kset and
 * create the sysfs directory.  Returns 0 on success or a negative errno,
 * in which case the kset membership and parent reference are undone.
 */
static int kobject_add_internal(struct kobject *kobj)
{
        struct kobject *parent;
        int error;

        if (!kobj)
                return -ENOENT;

        if (!kobj->name || !kobj->name[0]) {
                WARN(1,
                     "kobject: (%p): attempted to be registered with empty name!\n",
                     kobj);
                return -EINVAL;
        }

        parent = kobject_get(kobj->parent);

        /* a kset member joins the set and, lacking a parent, hangs below it */
        if (kobj->kset) {
                if (!parent)
                        parent = kobject_get(&kobj->kset->kobj);
                kobj_kset_join(kobj);
                kobj->parent = parent;
        }

        pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
                 kobject_name(kobj), kobj, __func__,
                 parent ? kobject_name(parent) : "<NULL>",
                 kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

        error = create_dir(kobj);
        if (!error) {
                kobj->state_in_sysfs = 1;
                return 0;
        }

        /* undo the kset join and the parent reference taken above */
        kobj_kset_leave(kobj);
        kobject_put(parent);
        kobj->parent = NULL;

        /* failures are reported loudly */
        if (error == -EEXIST)
                pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
                       __func__, kobject_name(kobj));
        else
                pr_err("%s failed for %s (error: %d parent: %s)\n",
                       __func__, kobject_name(kobj), error,
                       parent ? kobject_name(parent) : "'none'");

        return error;
}

/**
 * kobject_set_name_vargs() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: vargs to format the string.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
{
        const char *new_name;

        /* keep the current name when no format is supplied */
        if (kobj->name && !fmt)
                return 0;

        new_name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (!new_name)
                return -ENOMEM;

        /*
         * Some callers pass names containing '/'.  Those get rewritten
         * to '!', which requires a writable private copy because
         * kvasprintf_const() may have handed back a .rodata string.
         */
        if (strchr(new_name, '/')) {
                char *copy = kstrdup(new_name, GFP_KERNEL);

                kfree_const(new_name);
                if (!copy)
                        return -ENOMEM;
                strreplace(copy, '/', '!');
                new_name = copy;
        }

        /* release the previous name and install the new one */
        kfree_const(kobj->name);
        kobj->name = new_name;

        return 0;
}

/**
 * kobject_set_name() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 *
 * This sets the name of the kobject.  If you have already added the
 * kobject to the system, you must call kobject_rename() in order to
 * change the name of the kobject.
 */
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
        va_list ap;
        int ret;

        /* thin variadic front end for kobject_set_name_vargs() */
        va_start(ap, fmt);
        ret = kobject_set_name_vargs(kobj, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(kobject_set_name);

/**
 * kobject_init() - Initialize a kobject structure.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
{
        const char *err_str;

        if (!kobj) {
                err_str = "invalid kobject pointer!";
                goto error;
        }
        if (!ktype) {
                /*
                 * No trailing newline here: the pr_err() at the error
                 * label already terminates the line, so embedding one in
                 * the message would emit a stray empty log line.
                 */
                err_str = "must have a ktype to be initialized properly!";
                goto error;
        }
        if (kobj->state_initialized) {
                /* do not error out as sometimes we can recover */
                pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n",
                       kobj);
                dump_stack();
        }

        /* reset refcount, list entry and state flags, then bind the type */
        kobject_init_internal(kobj);
        kobj->ktype = ktype;
        return;

error:
        /* invalid arguments: report and dump the stack; @kobj is left untouched */
        pr_err("kobject (%p): %s\n", kobj, err_str);
        dump_stack();
}
EXPORT_SYMBOL(kobject_init);

static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                                           struct kobject *parent,
                                           const char *fmt, va_list vargs)
{
        /* name the kobject first; adding it requires a valid name */
        int err = kobject_set_name_vargs(kobj, fmt, vargs);

        if (err) {
                pr_err("kobject: can not set name properly!\n");
                return err;
        }

        kobj->parent = parent;
        return kobject_add_internal(kobj);
}

/**
 * kobject_add() - The main kobject add function.
 * @kobj: the kobject to add
 * @parent: pointer to the parent of the kobject.
 * @fmt: format to name the kobject with.
 *
 * The kobject name is set and added to the kobject hierarchy in this
 * function.
 *
 * If @parent is set, then the parent of the @kobj will be set to it.
 * If @parent is NULL, then the parent of the @kobj will be set to the
 * kobject associated with the kset assigned to this kobject.  If no kset
 * is assigned to the kobject, then the kobject will be located in the
 * root of the sysfs tree.
 *
 * Note, no "add" uevent will be created with this call, the caller should set
 * up all of the necessary sysfs files for the object and then call
 * kobject_uevent() with the UEVENT_ADD parameter to ensure that
 * userspace is properly notified of this kobject's creation.
 *
 * Return: If this function returns an error, kobject_put() must be
 *         called to properly clean up the memory associated with the
 *         object.  Under no instance should the kobject that is passed
 *         to this function be directly freed with a call to kfree(),
 *         that can leak memory.
 *
 *         If this function returns success, kobject_put() must also be called
 *         in order to properly clean up the memory associated with the object.
 *
 *         In short, once this function is called, kobject_put() MUST be called
 *         when the use of the object is finished in order to properly free
 *         everything.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...)
{
        va_list ap;
        int ret;

        if (!kobj)
                return -EINVAL;

        /* only a kobject that went through kobject_init() may be added */
        if (kobj->state_initialized) {
                va_start(ap, fmt);
                ret = kobject_add_varg(kobj, parent, fmt, ap);
                va_end(ap);
                return ret;
        }

        pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack();
        return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

/**
 * kobject_init_and_add() - Initialize a kobject structure and add it to
 *                          the kobject hierarchy.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 * @parent: pointer to the parent of this kobject.
 * @fmt: the name of the kobject.
 *
 * This function combines the call to kobject_init() and kobject_add().
 *
 * If this function returns an error, kobject_put() must be called to
 * properly clean up the memory associated with the object.  This is the
 * same type of error handling after a call to kobject_add() and kobject
 * lifetime rules are the same here.
 */
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
                         struct kobject *parent, const char *fmt, ...)
{
        va_list ap;
        int ret;

        /* step one: initialize; step two: name and add */
        kobject_init(kobj, ktype);

        va_start(ap, fmt);
        ret = kobject_add_varg(kobj, parent, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL_GPL(kobject_init_and_add);

/**
 * kobject_rename() - Change the name of an object.
 * @kobj: object in question.
 * @new_name: object's new name
 *
 * It is the responsibility of the caller to provide mutual
 * exclusion between two different calls of kobject_rename
 * on the same kobject and to ensure that new_name is valid and
 * won't conflict with other kobjects.
 */
int kobject_rename(struct kobject *kobj, const char *new_name)
{
        int error = 0;
        const char *devpath = NULL;
        const char *dup_name = NULL, *name;
        char *devpath_string = NULL;
        char *envp[2];

        /* hold a reference on @kobj for the duration of the rename */
        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;
        /* a kobject without a parent is rejected */
        if (!kobj->parent) {
                kobject_put(kobj);
                return -EINVAL;
        }

        /* record the path under the old name for the uevent environment */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /* "DEVPATH_OLD=" is 12 characters; +15 leaves room for it and the NUL */
        devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
        envp[0] = devpath_string;
        envp[1] = NULL;

        /*
         * dup_name is whatever the "out" label must free.  Until the sysfs
         * rename succeeds that is the copy of the new name made here.
         */
        name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
        }

        error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
        if (error)
                goto out;

        /*
         * Install the new kobject name; dup_name now refers to the old
         * name so that it is the one released at "out".
         */
        dup_name = kobj->name;
        kobj->name = name;

        /* This function is mostly/only used for network interface.
         * Some hotplug package track interfaces by their name and
         * therefore want to know when the name is changed by the user. */
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);

out:
        /* common exit: free temporaries and drop the reference taken above */
        kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);

        return error;
}
EXPORT_SYMBOL_GPL(kobject_rename);

/**
 * kobject_move() - Move object to another parent.
 * @kobj: object in question.
 * @new_parent: object's new parent (can be NULL)
 */
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
        int error;
        struct kobject *old_parent;
        const char *devpath = NULL;
        char *devpath_string = NULL;
        char *envp[2];

        /* hold a reference on @kobj for the duration of the move */
        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;
        /*
         * Take a reference on the new parent; when none was given, fall
         * back to the kobject of @kobj's kset (if it has one).
         */
        new_parent = kobject_get(new_parent);
        if (!new_parent) {
                if (kobj->kset)
                        new_parent = kobject_get(&kobj->kset->kobj);
        }

        /* old object path */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /* "DEVPATH_OLD=" is 12 characters; +15 leaves room for it and the NUL */
        devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
        envp[0] = devpath_string;
        envp[1] = NULL;
        error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
        if (error)
                goto out;
        /*
         * The reference held in new_parent is handed over to kobj->parent;
         * clearing the local keeps the "out" path from dropping it, and
         * the reference on the previous parent is released instead.
         */
        old_parent = kobj->parent;
        kobj->parent = new_parent;
        new_parent = NULL;
        kobject_put(old_parent);
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
        /* kobject_put() ignores NULL, so this is a no-op after a successful move */
        kobject_put(new_parent);
        kobject_put(kobj);
        kfree(devpath_string);
        kfree(devpath);
        return error;
}
EXPORT_SYMBOL_GPL(kobject_move);

/*
 * Take @kobj out of sysfs and out of its kset: remove the default groups,
 * send a pending "remove" uevent, remove the directory, then clear the
 * sysfs state and the parent pointer.  The parent reference itself is not
 * dropped here; callers do that.
 */
static void __kobject_del(struct kobject *kobj)
{
        struct kernfs_node *sd;
        const struct kobj_type *ktype;

        /*
         * Keep a local copy of the sysfs node for the sysfs_put() below.
         * NOTE(review): presumably sysfs_remove_dir() clears kobj->sd,
         * which is why it is sampled up front - confirm.
         */
        sd = kobj->sd;
        ktype = get_ktype(kobj);

        if (ktype)
                sysfs_remove_groups(kobj, ktype->default_groups);

        /* send "remove" if the caller did not do it but sent "add" */
        if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
                pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
                         kobject_name(kobj), kobj);
                kobject_uevent(kobj, KOBJ_REMOVE);
        }

        sysfs_remove_dir(kobj);
        /* pairs with the sysfs_get(kobj->sd) done in create_dir() */
        sysfs_put(sd);

        kobj->state_in_sysfs = 0;
        kobj_kset_leave(kobj);
        kobj->parent = NULL;
}

/**
 * kobject_del() - Unlink kobject from hierarchy.
 * @kobj: object.
 *
 * This is the function that should be called to delete an object
 * successfully added via kobject_add().
 */
void kobject_del(struct kobject *kobj)
{
        struct kobject *old_parent;

        if (kobj) {
                /* __kobject_del() clears kobj->parent, so remember it first */
                old_parent = kobj->parent;
                __kobject_del(kobj);
                kobject_put(old_parent);
        }
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_get() - Increment refcount for object.
 * @kobj: object.
 */
struct kobject *kobject_get(struct kobject *kobj)
{
        if (kobj) {
                if (!kobj->state_initialized)
                        WARN(1, KERN_WARNING
                                "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
                             kobject_name(kobj), kobj);
                kref_get(&kobj->kref);
        }
        return kobj;
}
EXPORT_SYMBOL(kobject_get);

struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
        /* hand back @kobj only when a reference could actually be taken */
        if (kobj && kref_get_unless_zero(&kobj->kref))
                return kobj;

        return NULL;
}
EXPORT_SYMBOL(kobject_get_unless_zero);

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 */
static void kobject_cleanup(struct kobject *kobj)
{
        /*
         * Sample parent and name up front: __kobject_del() resets
         * kobj->parent to NULL, and the ktype release callback may free
         * @kobj itself (dynamic_kobj_release() does), after which the
         * name pointer could no longer be read from it.
         */
        struct kobject *parent = kobj->parent;
        struct kobj_type *t = get_ktype(kobj);
        const char *name = kobj->name;

        pr_debug("kobject: '%s' (%p): %s, parent %p\n",
                 kobject_name(kobj), kobj, __func__, kobj->parent);

        if (t && !t->release)
                pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                         kobject_name(kobj), kobj);

        /* remove from sysfs if the caller did not do it */
        if (kobj->state_in_sysfs) {
                pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
                         kobject_name(kobj), kobj);
                __kobject_del(kobj);
        } else {
                /* avoid dropping the parent reference unnecessarily */
                parent = NULL;
        }

        /* hand the object to its type's release callback, if it has one */
        if (t && t->release) {
                pr_debug("kobject: '%s' (%p): calling ktype release\n",
                         kobject_name(kobj), kobj);
                t->release(kobj);
        }

        /* free name if we allocated it */
        if (name) {
                pr_debug("kobject: '%s': free name\n", name);
                kfree_const(name);
        }

        /* NULL (object was not in sysfs) makes this a no-op */
        kobject_put(parent);
}

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/* Work handler: map the delayed work item back to its kobject and clean up. */
static void kobject_delayed_cleanup(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct kobject *kobj = container_of(dwork, struct kobject, release);

        kobject_cleanup(kobj);
}
#endif

/*
 * kref release callback handed to kref_put() by kobject_put().  Recovers
 * the kobject from its embedded kref and cleans it up, either right away
 * or, with CONFIG_DEBUG_KOBJECT_RELEASE, from delayed work.
 */
static void kobject_release(struct kref *kref)
{
        struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        /* randomized delay of HZ, 2*HZ, 3*HZ or 4*HZ jiffies */
        unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
        pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
                 kobject_name(kobj), kobj, __func__, kobj->parent, delay);
        INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);

        schedule_delayed_work(&kobj->release, delay);
#else
        kobject_cleanup(kobj);
#endif
}

/**
 * kobject_put() - Decrement refcount for object.
 * @kobj: object.
 *
 * Decrement the refcount, and if 0, call kobject_cleanup().
 */
void kobject_put(struct kobject *kobj)
{
        /* NULL is silently accepted */
        if (!kobj)
                return;

        /* dropping a reference on an uninitialized kobject is only warned about */
        if (!kobj->state_initialized)
                WARN(1, KERN_WARNING
                        "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n",
                     kobject_name(kobj), kobj);

        kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/*
 * Release callback of dynamic_kobj_ktype: the kobject was allocated on
 * its own by kobject_create(), so freeing the pointer is all that is left.
 */
static void dynamic_kobj_release(struct kobject *kobj)
{
        pr_debug("kobject: (%p): %s\n", kobj, __func__);
        kfree(kobj);
}

/*
 * Type used for kobjects allocated by kobject_create(): freed with
 * kfree() on release, attributes dispatched through kobj_sysfs_ops.
 */
static struct kobj_type dynamic_kobj_ktype = {
        .release        = dynamic_kobj_release,
        .sysfs_ops        = &kobj_sysfs_ops,
};

/**
 * kobject_create() - Create a struct kobject dynamically.
 *
 * This function creates a kobject structure dynamically and sets it up
 * to be a "dynamic" kobject with a default release function set up.
 *
 * If the kobject was not able to be created, NULL will be returned.
 * The kobject structure returned from here must be cleaned up with a
 * call to kobject_put() and not kfree(), as kobject_init() has
 * already been called on this structure.
 */
struct kobject *kobject_create(void)
{
        struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);

        /* a successfully allocated object becomes a "dynamic" kobject */
        if (kobj)
                kobject_init(kobj, &dynamic_kobj_ktype);

        return kobj;
}

/**
 * kobject_create_and_add() - Create a struct kobject dynamically and
 *                            register it with sysfs.
 * @name: the name for the kobject
 * @parent: the parent kobject of this kobject, if any.
 *
 * This function creates a kobject structure dynamically and registers it
 * with sysfs.  When you are finished with this structure, call
 * kobject_put() and the structure will be dynamically freed when
 * it is no longer being used.
 *
 * If the kobject was not able to be created, NULL will be returned.
 */
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
        struct kobject *kobj;
        int retval;

        kobj = kobject_create();
        if (!kobj)
                return NULL;

        retval = kobject_add(kobj, parent, "%s", name);
        if (retval) {
                pr_warn("%s: kobject_add error: %d\n", __func__, retval);
                kobject_put(kobj);
                kobj = NULL;
        }
        return kobj;
}
EXPORT_SYMBOL_GPL(kobject_create_and_add);

/**
 * kset_init() - Initialize a kset for use.
 * @k: kset
 */
void kset_init(struct kset *k)
{
        /* member list and its lock */
        spin_lock_init(&k->list_lock);
        INIT_LIST_HEAD(&k->list);

        /* embedded kobject: refcount, list entry and state flags */
        kobject_init_internal(&k->kobj);
}

/*
 * Default kobject attribute operations: forward sysfs reads to the
 * show() callback of the enclosing kobj_attribute, or fail with -EIO
 * when the attribute has none.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->show)
                return -EIO;

        return kattr->show(kobj, kattr, buf);
}

/*
 * Forward sysfs writes to the store() callback of the enclosing
 * kobj_attribute, or fail with -EIO when the attribute has none.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                               const char *buf, size_t count)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->store)
                return -EIO;

        return kattr->store(kobj, kattr, buf, count);
}

/* sysfs operations that dispatch to struct kobj_attribute show()/store() */
const struct sysfs_ops kobj_sysfs_ops = {
        .show        = kobj_attr_show,
        .store        = kobj_attr_store,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

/**
 * kset_register() - Initialize and add a kset.
 * @k: kset.
 */
int kset_register(struct kset *k)
{
        int ret;

        if (!k)
                return -EINVAL;

        /* the embedded kobject needs a ktype before it can be added */
        if (!k->kobj.ktype) {
                pr_err("must have a ktype to be initialized properly!\n");
                return -EINVAL;
        }

        kset_init(k);

        /* announce the kset to userspace only once it is in the hierarchy */
        ret = kobject_add_internal(&k->kobj);
        if (!ret)
                kobject_uevent(&k->kobj, KOBJ_ADD);

        return ret;
}
EXPORT_SYMBOL(kset_register);

/**
 * kset_unregister() - Remove a kset.
 * @k: kset.
 */
void kset_unregister(struct kset *k)
{
        if (k) {
                /* unlink from the hierarchy, then drop our reference */
                kobject_del(&k->kobj);
                kobject_put(&k->kobj);
        }
}
EXPORT_SYMBOL(kset_unregister);

/**
 * kset_find_obj() - Search for object in kset.
 * @kset: kset we're looking in.
 * @name: object's name.
 *
 * Lock kset via @kset->list_lock, and iterate over @kset->list,
 * looking for a matching kobject. If a matching object is found,
 * take a reference and return the object.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
        struct kobject *found = NULL;
        struct kobject *pos;

        spin_lock(&kset->list_lock);

        list_for_each_entry(pos, &kset->list, entry) {
                /* skip unnamed members and names that differ */
                if (!kobject_name(pos) || strcmp(kobject_name(pos), name))
                        continue;

                /* first name match ends the search, even if no reference could be taken */
                found = kobject_get_unless_zero(pos);
                break;
        }

        spin_unlock(&kset->list_lock);
        return found;
}
EXPORT_SYMBOL_GPL(kset_find_obj);

/*
 * Release callback of kset_ktype: @kobj is the kobject embedded in a
 * kset, so free the containing kset.
 */
static void kset_release(struct kobject *kobj)
{
        struct kset *kset = container_of(kobj, struct kset, kobj);
        pr_debug("kobject: '%s' (%p): %s\n",
                 kobject_name(kobj), kobj, __func__);
        kfree(kset);
}

/*
 * Ownership of a kset follows its parent: delegate to the parent when
 * there is one, otherwise leave @uid and @gid untouched.
 */
static void kset_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        if (kobj->parent)
                kobject_get_ownership(kobj->parent, uid, gid);
}

/* Type assigned by kset_create() to the kobject embedded in a kset. */
static struct kobj_type kset_ktype = {
        .sysfs_ops        = &kobj_sysfs_ops,
        .release        = kset_release,
        .get_ownership        = kset_get_ownership,
};

/**
 * kset_create() - Create a struct kset dynamically.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * This function creates a kset structure dynamically.  This structure can
 * then be registered with the system and show up in sysfs with a call to
 * kset_register().  When you are finished with this structure, if
 * kset_register() has been called, call kset_unregister() and the
 * structure will be dynamically freed when it is no longer being used.
 *
 * If the kset was not able to be created, NULL will be returned.
 */
static struct kset *kset_create(const char *name,
                                const struct kset_uevent_ops *uevent_ops,
                                struct kobject *parent_kobj)
{
        struct kset *kset = kzalloc(sizeof(*kset), GFP_KERNEL);

        if (!kset)
                return NULL;

        if (kobject_set_name(&kset->kobj, "%s", name)) {
                kfree(kset);
                return NULL;
        }

        kset->uevent_ops = uevent_ops;
        kset->kobj.parent = parent_kobj;

        /*
         * The embedded kobject gets kset_ktype as its type and is not a
         * member of any kset itself, which lets it be freed properly
         * once it is no longer in use.
         */
        kset->kobj.ktype = &kset_ktype;
        kset->kobj.kset = NULL;

        return kset;
}

/**
 * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * This function creates a kset structure dynamically and registers it
 * with sysfs.  When you are finished with this structure, call
 * kset_unregister() and the structure will be dynamically freed when it
 * is no longer being used.
 *
 * If the kset was not able to be created, NULL will be returned.
 */
struct kset *kset_create_and_add(const char *name,
                                 const struct kset_uevent_ops *uevent_ops,
                                 struct kobject *parent_kobj)
{
        struct kset *kset;
        int error;

        kset = kset_create(name, uevent_ops, parent_kobj);
        if (!kset)
                return NULL;
        error = kset_register(kset);
        if (error) {
                kfree(kset);
                return NULL;
        }
        return kset;
}
EXPORT_SYMBOL_GPL(kset_create_and_add);


/*
 * Registered namespace-type operations, indexed by enum kobj_ns_type.
 * The accessors below take kobj_ns_type_lock around every table access.
 */
static DEFINE_SPINLOCK(kobj_ns_type_lock);
static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];

/*
 * Register @ops for its namespace type.  Returns -EINVAL for a type
 * outside (KOBJ_NS_TYPE_NONE, KOBJ_NS_TYPES), -EBUSY when the slot is
 * already taken, and 0 on success.
 */
int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
{
        enum kobj_ns_type type = ops->type;
        int ret;

        spin_lock(&kobj_ns_type_lock);

        if (type >= KOBJ_NS_TYPES || type <= KOBJ_NS_TYPE_NONE) {
                ret = -EINVAL;
        } else if (kobj_ns_ops_tbl[type]) {
                ret = -EBUSY;
        } else {
                kobj_ns_ops_tbl[type] = ops;
                ret = 0;
        }

        spin_unlock(&kobj_ns_type_lock);
        return ret;
}

/* Return 1 when operations are registered for the valid type @type, else 0. */
int kobj_ns_type_registered(enum kobj_ns_type type)
{
        int found = 0;

        spin_lock(&kobj_ns_type_lock);
        if (type > KOBJ_NS_TYPE_NONE && type < KOBJ_NS_TYPES &&
            kobj_ns_ops_tbl[type])
                found = 1;
        spin_unlock(&kobj_ns_type_lock);

        return found;
}

/*
 * Ask @parent's ktype which namespace operations apply to its children.
 * Returns NULL when there is no parent, no ktype or no child_ns_type hook.
 */
const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent)
{
        if (!parent || !parent->ktype || !parent->ktype->child_ns_type)
                return NULL;

        return parent->ktype->child_ns_type(parent);
}

/* Namespace operations governing @kobj itself, as decided by its parent. */
const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj)
{
        return kobj_child_ns_ops(kobj->parent);
}

bool kobj_ns_current_may_mount(enum kobj_ns_type type)
{
        bool may_mount = true;

        spin_lock(&kobj_ns_type_lock);
        if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
            kobj_ns_ops_tbl[type])
                may_mount = kobj_ns_ops_tbl[type]->current_may_mount();
        spin_unlock(&kobj_ns_type_lock);

        return may_mount;
}

void *kobj_ns_grab_current(enum kobj_ns_type type)
{
        void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
            kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->grab_current_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}
EXPORT_SYMBOL_GPL(kobj_ns_grab_current);

const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk)
{
        const void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
            kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->netlink_ns(sk);
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

const void *kobj_ns_initial(enum kobj_ns_type type)
{
        const void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
            kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->initial_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

void kobj_ns_drop(enum kobj_ns_type type, void *ns)
{
        spin_lock(&kobj_ns_type_lock);
        if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
            kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns)
                kobj_ns_ops_tbl[type]->drop_ns(ns);
        spin_unlock(&kobj_ns_type_lock);
}
EXPORT_SYMBOL_GPL(kobj_ns_drop);




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_FUTEX_H
#define _ASM_X86_FUTEX_H

#ifdef __KERNEL__

#include <linux/futex.h>
#include <linux/uaccess.h>

#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/processor.h>
#include <asm/smap.h>

/*
 * unsafe_atomic_op1 - run the single instruction @insn against the user word
 * at @uaddr.
 *
 * Operands as seen by @insn: %0 is a register that enters holding @oparg
 * (the "0" matching constraint) and is read back as the old value; %2 is
 * *@uaddr as a read-write memory operand.
 *
 * A fault on the instruction at label 1 is routed by the exception table
 * entry to the fixup at label 3, which stores -EFAULT (%3) in ret (%1) and
 * resumes at label 2.  ret enters as 0 (the "1" matching constraint), so a
 * non-zero ret means the access faulted and control jumps to @label;
 * otherwise the value left in %0 is stored to *@oval.
 *
 * NOTE(review): the "unsafe_" prefix suggests the caller must already be
 * inside a user_access_begin()/user_access_end() section, as the caller in
 * this file is - confirm before adding new users.
 */
#define unsafe_atomic_op1(insn, oval, uaddr, oparg, label)        \
do {                                                                \
        int oldval = 0, ret;                                        \
        asm volatile("1:\t" insn "\n"                                \
                     "2:\n"                                        \
                     "\t.section .fixup,\"ax\"\n"                \
                     "3:\tmov\t%3, %1\n"                        \
                     "\tjmp\t2b\n"                                \
                     "\t.previous\n"                                \
                     _ASM_EXTABLE_UA(1b, 3b)                        \
                     : "=r" (oldval), "=r" (ret), "+m" (*uaddr)        \
                     : "i" (-EFAULT), "0" (oparg), "1" (0));        \
        if (ret)                                                \
                goto label;                                        \
        *oval = oldval;                                                \
} while(0)


/*
 * unsafe_atomic_op2 - read-modify-write the user word at @uaddr using a
 * cmpxchg retry loop.
 *
 * Sequence:
 *   1: load *@uaddr (%2) into oldval (%0, pinned to the "a" register)
 *   2: copy oldval into the scratch register tem (%3)
 *      run @insn, which may use %3 (tem) and %4 (@oparg)
 *   3: locked cmpxchgl of tem into *@uaddr; jnz loops back to 2 when the
 *      compare failed
 *   4: done
 *
 * cmpxchgl implicitly compares against the accumulator and reloads it with
 * the current memory value on failure, which is why oldval uses the "a"
 * constraint and the retry restarts at label 2 rather than label 1.
 *
 * Faults at label 1 or label 3 are routed by the exception table entries to
 * the fixup at label 5, which stores -EFAULT (%5) in ret (%1) and resumes at
 * label 4.  ret enters as 0 (the "1" matching constraint); a non-zero ret
 * jumps to @label, otherwise oldval is stored to *@oval.
 *
 * NOTE(review): like unsafe_atomic_op1, this presumably must be used inside
 * a user_access_begin()/user_access_end() section - confirm for new users.
 */
#define unsafe_atomic_op2(insn, oval, uaddr, oparg, label)        \
do {                                                                \
        int oldval = 0, ret, tem;                                \
        asm volatile("1:\tmovl        %2, %0\n"                        \
                     "2:\tmovl\t%0, %3\n"                        \
                     "\t" insn "\n"                                \
                     "3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"        \
                     "\tjnz\t2b\n"                                \
                     "4:\n"                                        \
                     "\t.section .fixup,\"ax\"\n"                \
                     "5:\tmov\t%5, %1\n"                        \
                     "\tjmp\t4b\n"                                \
                     "\t.previous\n"                                \
                     _ASM_EXTABLE_UA(1b, 5b)                        \
                     _ASM_EXTABLE_UA(3b, 5b)                        \
                     : "=&a" (oldval), "=&r" (ret),                \
                       "+m" (*uaddr), "=&r" (tem)                \
                     : "r" (oparg), "i" (-EFAULT), "1" (0));        \
        if (ret)                                                \
                goto label;                                        \
        *oval = oldval;                                                \
} while(0)

/*
 * Atomically apply futex operation @op with operand @oparg to the user word
 * at @uaddr; the value the word held beforehand is stored through @oval.
 *
 * Returns 0 on success, -EFAULT when user_access_begin() refuses the address
 * or the access faults, and -ENOSYS for an @op this function does not
 * implement.  Every path that opened the user-access window closes it again
 * before returning.
 */
static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
                u32 __user *uaddr)
{
        int err = 0;

        if (!user_access_begin(uaddr, sizeof(u32)))
                return -EFAULT;

        /* The unsafe_atomic_op*() helpers jump to Efault when the access faults. */
        switch (op) {
        case FUTEX_OP_SET:
                unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
                break;
        case FUTEX_OP_ADD:
                unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval,
                                   uaddr, oparg, Efault);
                break;
        case FUTEX_OP_OR:
                unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
                break;
        case FUTEX_OP_ANDN:
                /* AND-NOT is expressed as a plain AND with the inverted operand. */
                unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
                break;
        case FUTEX_OP_XOR:
                unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
                break;
        default:
                err = -ENOSYS;
                break;
        }

        user_access_end();
        return err;

Efault:
        user_access_end();
        return -EFAULT;
}

/*
 * futex_atomic_cmpxchg_inatomic - locked compare-and-exchange on a user word.
 * @uval:   receives the accumulator value left behind by the instruction
 * @uaddr:  user address of the 32-bit word
 * @oldval: value the word is expected to hold
 * @newval: value to store when the word matches @oldval
 *
 * Returns 0 when the instruction executed, or -EFAULT when
 * user_access_begin() refuses @uaddr or the access faults.  A 0 return does
 * not say whether the exchange happened; the caller has to compare *@uval
 * with @oldval for that.
 *
 * *@uval is stored unconditionally after the asm, including on the fault
 * path where ret is -EFAULT, so it must not be trusted in that case.
 * NOTE(review): on a fault it presumably still holds the caller's @oldval -
 * confirm.
 */
static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                                                u32 oldval, u32 newval)
{
        int ret = 0;

        if (!user_access_begin(uaddr, sizeof(u32)))
                return -EFAULT;
        /*
         * %0 = ret, %1 = oldval in the "a" register (input and output),
         * %2 = *uaddr, %3 = -EFAULT, %4 = newval.  A fault at label 1 is
         * redirected to the fixup at label 3, which sets ret to -EFAULT and
         * resumes at label 2.
         */
        asm volatile("\n"
                "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
                "2:\n"
                "\t.section .fixup, \"ax\"\n"
                "3:\tmov     %3, %0\n"
                "\tjmp     2b\n"
                "\t.previous\n"
                _ASM_EXTABLE_UA(1b, 3b)
                : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
                : "i" (-EFAULT), "r" (newval), "1" (oldval)
                : "memory"
        );
        user_access_end();
        *uval = oldval;
        return ret;
}

#endif
#endif /* _ASM_X86_FUTEX_H */








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * A policy database (policydb) specifies the
 * configuration data for the security policy.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
 */

#ifndef _SS_POLICYDB_H_
#define _SS_POLICYDB_H_

#include "symtab.h"
#include "avtab.h"
#include "sidtab.h"
#include "ebitmap.h"
#include "mls_types.h"
#include "context.h"
#include "constraint.h"

/*
 * A datum type is defined for each kind of symbol
 * in the configuration data:  individual permissions,
 * common prefixes for access vectors, classes,
 * users, roles, types, sensitivities, categories, etc.
 */

/* Permission attributes: the datum kept for one individual permission symbol. */
struct perm_datum {
        u32 value;                /* permission bit + 1 */
};

/*
 * Attributes of a common prefix for access vectors.  A class refers to its
 * common through class_datum.comdatum.
 */
struct common_datum {
        u32 value;                        /* internal common value */
        struct symtab permissions;        /* common permissions */
};

/*
 * Class attributes.
 *
 * The DEFAULT_* macros below are defined inside the struct body purely for
 * proximity to the fields they belong to; being preprocessor definitions
 * they are visible to everything that includes this header.
 */
struct class_datum {
        u32 value;                        /* class value */
        char *comkey;                        /* common name */
        struct common_datum *comdatum;        /* common datum */
        struct symtab permissions;        /* class-specific permission symbol table */
        struct constraint_node *constraints;        /* constraints on class permissions */
        struct constraint_node *validatetrans;        /* special transition rules */
/* Options how a new object user, role, and type should be decided */
#define DEFAULT_SOURCE         1
#define DEFAULT_TARGET         2
        /*
         * Each holds one of DEFAULT_SOURCE / DEFAULT_TARGET.
         * NOTE(review): 0 presumably means "no default specified" - confirm.
         */
        char default_user;
        char default_role;
        char default_type;
/* Options how a new object range should be decided */
#define DEFAULT_SOURCE_LOW     1
#define DEFAULT_SOURCE_HIGH    2
#define DEFAULT_SOURCE_LOW_HIGH        3
#define DEFAULT_TARGET_LOW     4
#define DEFAULT_TARGET_HIGH    5
#define DEFAULT_TARGET_LOW_HIGH        6
#define DEFAULT_GLBLUB                7
        /* Holds one of the DEFAULT_*_LOW/HIGH/LOW_HIGH values or DEFAULT_GLBLUB. */
        char default_range;
};

/* Role attributes: the datum kept for one role symbol. */
struct role_datum {
        u32 value;                        /* internal role value */
        u32 bounds;                        /* boundary of role */
        struct ebitmap dominates;        /* set of roles dominated by this role */
        struct ebitmap types;                /* set of authorized types for role */
};

/*
 * Lookup key for a role transition; policydb_roletr_search() takes one of
 * these and returns the matching struct role_trans_datum.
 */
struct role_trans_key {
        u32 role;                /* current role */
        u32 type;                /* program executable type, or new object type */
        u32 tclass;                /* process class, or new object class */
};

/* Result of a role transition lookup (see policydb_roletr_search()). */
struct role_trans_datum {
        u32 new_role;                /* new role */
};

/*
 * Lookup key for a filename transition; policydb_filenametr_search() takes
 * one of these and returns the matching struct filename_trans_datum.
 */
struct filename_trans_key {
        u32 ttype;                /* parent dir context */
        u16 tclass;                /* class of new object */
        const char *name;        /* last path component */
};

/*
 * Result of a filename transition lookup.  Records for the same key form a
 * singly linked list through @next, one record per resulting type (otype).
 */
struct filename_trans_datum {
        struct ebitmap stypes;        /* bitmap of source types for this otype */
        u32 otype;                /* resulting type of new object */
        struct filename_trans_datum *next;        /* record for next otype */
};

/*
 * One "role allow" rule; rules are chained through @next, and
 * policydb.role_allow points at the head of the list.
 */
struct role_allow {
        u32 role;                /* current role */
        u32 new_role;                /* new role */
        struct role_allow *next;        /* next rule in the list */
};

/* Type attributes: the datum kept for one type (or attribute) symbol. */
struct type_datum {
        u32 value;                /* internal type value */
        u32 bounds;                /* boundary of type */
        unsigned char primary;        /* primary name? */
        unsigned char attribute;/* attribute? */
};

/* User attributes: the datum kept for one user symbol. */
struct user_datum {
        u32 value;                        /* internal user value */
        u32 bounds;                        /* bounds of user */
        struct ebitmap roles;                /* set of authorized roles for user */
        struct mls_range range;                /* MLS range (min - max) for user */
        struct mls_level dfltlevel;        /* default login MLS level for user */
};


/* Sensitivity attributes: the datum kept for one sensitivity symbol. */
struct level_datum {
        struct mls_level *level;        /* sensitivity and associated categories */
        unsigned char isalias;        /* is this sensitivity an alias for another? */
};

/* Category attributes: the datum kept for one category symbol. */
struct cat_datum {
        u32 value;                /* internal category bit + 1 */
        unsigned char isalias;  /* is this category an alias for another? */
};

/*
 * Lookup key for a range transition; policydb_rangetr_search() takes one of
 * these and returns the matching struct mls_range.
 */
struct range_trans {
        u32 source_type;
        u32 target_type;
        u32 target_class;
};

/*
 * Boolean data type: the datum kept for one conditional boolean symbol.
 *
 * NOTE(review): @value is declared __u32 while every other datum in this
 * header uses u32 - presumably the same underlying type; consider unifying.
 */
struct cond_bool_datum {
        __u32 value;                /* internal type value */
        int state;                /* NOTE(review): presumably the boolean's current state - confirm */
};

struct cond_node;

/*
 * A type set preserves the data needed to determine constraint info from
 * the policy source.  It is not used by the kernel policy but allows
 * utilities such as audit2allow to determine constraint denials.
 */
struct type_set {
        struct ebitmap types;
        struct ebitmap negset;
        u32 flags;
};

/*
 * The configuration data includes security contexts for
 * initial SIDs, unlabeled file systems, TCP and UDP port numbers,
 * network interfaces, and nodes.  This structure stores the
 * relevant data for one such entry.  Entries of the same kind
 * (e.g. all initial SIDs) are linked together into a list through @next.
 *
 * NOTE(review): which member of @u and @v is meaningful presumably depends
 * on the list the entry belongs to (policydb.ocontexts[] is indexed by the
 * OCON_* constants, genfs entries hang off struct genfs) - confirm against
 * the code that builds these lists.
 */
struct ocontext {
        union {
                char *name;        /* name of initial SID, fs, netif, fstype, path */
                struct {
                        u8 protocol;
                        u16 low_port;
                        u16 high_port;
                } port;                /* TCP or UDP port information */
                struct {
                        u32 addr;
                        u32 mask;
                } node;                /* node information */
                struct {
                        u32 addr[4];
                        u32 mask[4];
                } node6;        /* IPv6 node information */
                struct {
                        u64 subnet_prefix;
                        u16 low_pkey;
                        u16 high_pkey;
                } ibpkey;        /* Infiniband PKey range information */
                struct {
                        char *dev_name;
                        u8 port;
                } ibendport;        /* Infiniband end port information */
        } u;
        union {
                u32 sclass;  /* security class for genfs */
                u32 behavior;  /* labeling behavior for fs_use */
        } v;
        struct context context[2];        /* security context(s) */
        u32 sid[2];        /* SID(s) */
        struct ocontext *next;
};

/*
 * Per-filesystem-type list of genfs entries: @head starts the ocontext list
 * for @fstype, and @next links to the record for the next filesystem type.
 * policydb.genfs points at the first record.
 */
struct genfs {
        char *fstype;
        struct ocontext *head;
        struct genfs *next;
};

/* symbol table array indices */
#define SYM_COMMONS 0
#define SYM_CLASSES 1
#define SYM_ROLES   2
#define SYM_TYPES   3
#define SYM_USERS   4
#define SYM_BOOLS   5
#define SYM_LEVELS  6
#define SYM_CATS    7
#define SYM_NUM     8

/* object context array indices */
#define OCON_ISID        0 /* initial SIDs */
#define OCON_FS                1 /* unlabeled file systems */
#define OCON_PORT        2 /* TCP and UDP port numbers */
#define OCON_NETIF        3 /* network interfaces */
#define OCON_NODE        4 /* nodes */
#define OCON_FSUSE        5 /* fs_use */
#define OCON_NODE6        6 /* IPv6 nodes */
#define OCON_IBPKEY        7 /* Infiniband PKeys */
#define OCON_IBENDPORT        8 /* Infiniband end ports */
#define OCON_NUM        9

/*
 * The policy database.
 *
 * Top-level container for one loaded policy: the symbol tables, the
 * value-to-name and value-to-struct reverse maps, the access vector tables,
 * the role/filename/range transition tables, the object contexts, and
 * per-policy settings such as the policy version and unknown-handling flags.
 */
struct policydb {
        int mls_enabled;

        /* symbol tables */
        struct symtab symtab[SYM_NUM];
/* Shorthands for the individual symtab[] slots, e.g. p->p_types. */
#define p_commons symtab[SYM_COMMONS]
#define p_classes symtab[SYM_CLASSES]
#define p_roles symtab[SYM_ROLES]
#define p_types symtab[SYM_TYPES]
#define p_users symtab[SYM_USERS]
#define p_bools symtab[SYM_BOOLS]
#define p_levels symtab[SYM_LEVELS]
#define p_cats symtab[SYM_CATS]

        /* symbol names indexed by (value - 1) */
        char                **sym_val_to_name[SYM_NUM];

        /* class, role, user, and type attributes indexed by (value - 1) */
        struct class_datum **class_val_to_struct;
        struct role_datum **role_val_to_struct;
        struct user_datum **user_val_to_struct;
        struct type_datum **type_val_to_struct;

        /* type enforcement access vectors and transitions */
        struct avtab te_avtab;

        /* role transitions */
        struct hashtab role_tr;

        /* file transitions with the last path component */
        /* quickly exclude lookups when parent ttype has no rules */
        struct ebitmap filename_trans_ttypes;
        /* actual set of filename_trans rules */
        struct hashtab filename_trans;
        /* only used if policyvers < POLICYDB_VERSION_COMP_FTRANS */
        u32 compat_filename_trans_count;

        /* bools indexed by (value - 1) */
        struct cond_bool_datum **bool_val_to_struct;
        /* type enforcement conditional access vectors and transitions */
        struct avtab te_cond_avtab;
        /* array indexing te_cond_avtab by conditional */
        struct cond_node *cond_list;
        u32 cond_list_len;

        /* role allows */
        struct role_allow *role_allow;

        /* security contexts of initial SIDs, unlabeled file systems,
           TCP or UDP port numbers, network interfaces and nodes */
        struct ocontext *ocontexts[OCON_NUM];

        /* security contexts for files in filesystems that cannot support
           a persistent label mapping or use another
           fixed labeling behavior. */
        struct genfs *genfs;

        /* range transitions table (range_trans_key -> mls_range) */
        struct hashtab range_tr;

        /* type -> attribute reverse mapping */
        struct ebitmap *type_attr_map_array;

        struct ebitmap policycaps;

        struct ebitmap permissive_map;

        /* length of this policy when it was loaded */
        size_t len;

        unsigned int policyvers;

        /*
         * NOTE(review): presumably mirror the REJECT_UNKNOWN / ALLOW_UNKNOWN
         * config flags defined further down in this header - confirm.
         */
        unsigned int reject_unknown : 1;
        unsigned int allow_unknown : 1;

        u16 process_class;
        u32 process_trans_perms;
} __randomize_layout;

extern void policydb_destroy(struct policydb *p);
extern int policydb_load_isids(struct policydb *p, struct sidtab *s);
extern int policydb_context_isvalid(struct policydb *p, struct context *c);
extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
extern int policydb_read(struct policydb *p, void *fp);
extern int policydb_write(struct policydb *p, void *fp);

extern struct filename_trans_datum *policydb_filenametr_search(
        struct policydb *p, struct filename_trans_key *key);

extern struct mls_range *policydb_rangetr_search(
        struct policydb *p, struct range_trans *key);

extern struct role_trans_datum *policydb_roletr_search(
        struct policydb *p, struct role_trans_key *key);

#define POLICYDB_CONFIG_MLS    1

/* the config flags related to unknown classes/perms are bits 2 and 3 */
#define REJECT_UNKNOWN        0x00000002
#define ALLOW_UNKNOWN        0x00000004

#define OBJECT_R "object_r"
#define OBJECT_R_VAL 1

#define POLICYDB_MAGIC SELINUX_MAGIC
#define POLICYDB_STRING "SE Linux"

/*
 * Cursor over an in-memory policy image: @data points at the next byte to be
 * read or written and @len is the number of bytes left from there.
 */
struct policy_file {
        char *data;
        size_t len;
};

/* Pairs a policy database with an opaque file handle. */
struct policy_data {
        struct policydb *p;
        void *fp;
};

/*
 * next_entry - copy the next @bytes bytes of @fp into @buf.
 *
 * On success the cursor is advanced by @bytes and 0 is returned.  When fewer
 * than @bytes bytes remain, nothing is copied, @fp is left untouched and
 * -EINVAL is returned.
 */
static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
{
        if (bytes > fp->len)
                return -EINVAL;

        memcpy(buf, fp->data, bytes);
        fp->data += bytes;
        fp->len -= bytes;
        return 0;
}

/*
 * put_entry - write @num elements of @bytes bytes each from @buf into @fp.
 *
 * On success the cursor is advanced by @bytes * @num and 0 is returned.
 * -EINVAL is returned, with @fp left untouched, when @num is negative or
 * when the requested amount does not fit into the space left in @fp.
 */
static inline int put_entry(const void *buf, size_t bytes, int num, struct policy_file *fp)
{
        size_t len;

        /* A negative count can never describe a valid write. */
        if (num < 0)
                return -EINVAL;

        /*
         * Bound the element count by division instead of multiplying first:
         * bytes * num may wrap around size_t, and a wrapped product could
         * slip past a plain "len > fp->len" comparison.  For bytes > 0,
         * bytes * num <= fp->len holds exactly when num <= fp->len / bytes.
         */
        if (bytes && (size_t)num > fp->len / bytes)
                return -EINVAL;

        len = bytes * (size_t)num;
        memcpy(fp->data, buf, len);
        fp->data += len;
        fp->len -= len;

        return 0;
}

/*
 * Look up a symbol name in @p's value-to-name tables: @sym_num selects the
 * table (one of the SYM_* indices) and @element_nr the entry inside it.
 * Neither index is range-checked here.
 */
static inline char *sym_name(struct policydb *p, unsigned int sym_num, unsigned int element_nr)
{
        char **names = p->sym_val_to_name[sym_num];

        return names[element_nr];
}

extern u16 string_to_security_class(struct policydb *p, const char *name);
extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);

#endif        /* _SS_POLICYDB_H_ */




































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/* SPDX-License-Identifier: GPL-2.0 */
/*  linux/include/linux/clockchips.h
 *
 *  This file contains the structure definitions for clockchips.
 *
 *  If you are not a clockchip, or the time of day code, you should
 *  not be including this file!
 */
#ifndef _LINUX_CLOCKCHIPS_H
#define _LINUX_CLOCKCHIPS_H

#ifdef CONFIG_GENERIC_CLOCKEVENTS

# include <linux/clocksource.h>
# include <linux/cpumask.h>
# include <linux/ktime.h>
# include <linux/notifier.h>

struct clock_event_device;
struct module;

/*
 * Possible states of a clock event device.
 *
 * DETACHED:        Device is not used by clockevents core. Initial state or can be
 *                reached from SHUTDOWN.
 * SHUTDOWN:        Device is powered-off. Can be reached from PERIODIC or ONESHOT.
 * PERIODIC:        Device is programmed to generate events periodically. Can be
 *                reached from DETACHED or SHUTDOWN.
 * ONESHOT:        Device is programmed to generate event only once. Can be reached
 *                from DETACHED or SHUTDOWN.
 * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily
 *                    stopped.
 *
 * DETACHED is the first enumerator and therefore has the value 0, so a
 * zero-initialized device descriptor starts out DETACHED.
 */
enum clock_event_state {
        CLOCK_EVT_STATE_DETACHED,
        CLOCK_EVT_STATE_SHUTDOWN,
        CLOCK_EVT_STATE_PERIODIC,
        CLOCK_EVT_STATE_ONESHOT,
        CLOCK_EVT_STATE_ONESHOT_STOPPED,
};

/*
 * Clock event features
 */
# define CLOCK_EVT_FEAT_PERIODIC        0x000001
# define CLOCK_EVT_FEAT_ONESHOT                0x000002
# define CLOCK_EVT_FEAT_KTIME                0x000004

/*
 * x86(64) specific (mis)features:
 *
 * - Clockevent source stops in C3 State and needs broadcast support.
 * - Local APIC timer is used as a dummy device.
 */
# define CLOCK_EVT_FEAT_C3STOP                0x000008
# define CLOCK_EVT_FEAT_DUMMY                0x000010

/*
 * Core shall set the interrupt affinity dynamically in broadcast mode
 */
# define CLOCK_EVT_FEAT_DYNIRQ                0x000020
# define CLOCK_EVT_FEAT_PERCPU                0x000040

/*
 * Clockevent device is based on a hrtimer for broadcast
 */
# define CLOCK_EVT_FEAT_HRTIMER                0x000080

/**
 * struct clock_event_device - clock event device descriptor
 * @event_handler:        Assigned by the framework to be called by the low
 *                        level handler of the event source
 * @set_next_event:        set next event function using a clocksource delta
 * @set_next_ktime:        set next event function using a direct ktime value
 * @next_event:                local storage for the next event in oneshot mode
 * @max_delta_ns:        maximum delta value in ns
 * @min_delta_ns:        minimum delta value in ns
 * @mult:                nanosecond to cycles multiplier
 * @shift:                nanoseconds to cycles divisor (power of two)
 * @state_use_accessors:current state of the device, assigned by the core code;
 *                        read it through the clockevent_state_*() helpers
 * @features:                features (CLOCK_EVT_FEAT_* bits)
 * @retries:                number of forced programming retries
 * @set_state_periodic:        switch state to periodic
 * @set_state_oneshot:        switch state to oneshot
 * @set_state_oneshot_stopped: switch state to oneshot_stopped
 * @set_state_shutdown:        switch state to shutdown
 * @tick_resume:        resume clkevt device
 * @broadcast:                function to broadcast events
 * @suspend:                device suspend callback; NOTE(review): presumably run
 *                        from clockevents_suspend() - confirm
 * @resume:                device resume callback; NOTE(review): presumably run
 *                        from clockevents_resume() - confirm
 * @min_delta_ticks:        minimum delta value in ticks stored for reconfiguration
 * @max_delta_ticks:        maximum delta value in ticks stored for reconfiguration
 * @name:                ptr to clock event name
 * @rating:                variable to rate clock event devices
 * @irq:                IRQ number (only for non CPU local devices)
 * @bound_on:                Bound on CPU
 * @cpumask:                cpumask to indicate for which CPUs this device works
 * @list:                list head for the management code
 * @owner:                module reference
 */
struct clock_event_device {
        void                        (*event_handler)(struct clock_event_device *);
        int                        (*set_next_event)(unsigned long evt, struct clock_event_device *);
        int                        (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
        ktime_t                        next_event;
        u64                        max_delta_ns;
        u64                        min_delta_ns;
        u32                        mult;
        u32                        shift;
        enum clock_event_state        state_use_accessors;
        unsigned int                features;
        unsigned long                retries;

        int                        (*set_state_periodic)(struct clock_event_device *);
        int                        (*set_state_oneshot)(struct clock_event_device *);
        int                        (*set_state_oneshot_stopped)(struct clock_event_device *);
        int                        (*set_state_shutdown)(struct clock_event_device *);
        int                        (*tick_resume)(struct clock_event_device *);

        void                        (*broadcast)(const struct cpumask *mask);
        void                        (*suspend)(struct clock_event_device *);
        void                        (*resume)(struct clock_event_device *);
        unsigned long                min_delta_ticks;
        unsigned long                max_delta_ticks;

        const char                *name;
        int                        rating;
        int                        irq;
        int                        bound_on;
        const struct cpumask        *cpumask;
        struct list_head        list;
        struct module                *owner;
} ____cacheline_aligned;

/* Helpers to verify state of a clockevent device */
static inline bool clockevent_state_detached(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED;
}

static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN;
}

static inline bool clockevent_state_periodic(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC;
}

static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT;
}

static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
}

/*
 * Calculate a multiplication factor for scaled math, which is used to convert
 * nanoseconds based values to clock ticks:
 *
 * clock_ticks = (nanoseconds * factor) >> shift.
 *
 * div_sc is the rearranged equation to calculate a factor from a given clock
 * ticks / nanoseconds ratio:
 *
 * factor = (clock_ticks << shift) / nanoseconds
 */
static inline unsigned long
div_sc(unsigned long ticks, unsigned long nsec, int shift)
{
        u64 tmp = ((u64)ticks) << shift;

        do_div(tmp, nsec);

        return (unsigned long) tmp;
}

/* Clock event layer functions */
extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
extern void clockevents_register_device(struct clock_event_device *dev);
extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);

extern void clockevents_config_and_register(struct clock_event_device *dev,
                                            u32 freq, unsigned long min_delta,
                                            unsigned long max_delta);

extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);

/*
 * clockevents_calc_mult_shift - fill in @ce's mult/shift pair.
 *
 * Thin wrapper around clocks_calc_mult_shift(): @ce->mult and @ce->shift are
 * passed as the outputs, NSEC_PER_SEC and @freq as the two rates, and
 * @maxsec is handed through unchanged.
 *
 * The helper is called as a plain statement: this function returns void, and
 * ISO C does not allow "return <expression>;" in a void function.
 */
static inline void
clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec)
{
        clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec);
}

extern void clockevents_suspend(void);
extern void clockevents_resume(void);

# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
#  ifdef CONFIG_ARCH_HAS_TICK_BROADCAST
extern void tick_broadcast(const struct cpumask *mask);
#  else
#   define tick_broadcast        NULL
#  endif
extern int tick_receive_broadcast(void);
# endif

# if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_setup_hrtimer_broadcast(void);
extern int tick_check_broadcast_expired(void);
# else
/*
 * Stubs for builds that lack CONFIG_GENERIC_CLOCKEVENTS_BROADCAST or
 * CONFIG_TICK_ONESHOT: the expiry check always reports 0 and the hrtimer
 * broadcast setup does nothing.
 */
static inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }
# endif

#else /* !CONFIG_GENERIC_CLOCKEVENTS: */

/*
 * Stubs for builds without CONFIG_GENERIC_CLOCKEVENTS, so callers need no
 * #ifdefs: suspend, resume and broadcast setup do nothing, and the broadcast
 * expiry check always reports 0.
 */
static inline void clockevents_suspend(void) { }
static inline void clockevents_resume(void) { }
static inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }

#endif /* !CONFIG_GENERIC_CLOCKEVENTS */

#endif /* _LINUX_CLOCKCHIPS_H */
























































    2 























    1 


    1 












































    3 


    3 
















    1 


    1 


    1 


















    1 



    1 














































    1 




    1 


    1 




    1 

    1 






























































































































































































































































































































































    2 









    2 






































    1 


    1 
















































































































    1 









































    1 



    1 











































































































































































    1 

    1 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include "internal.h"

/**
 * kfree_const - free memory unless it lives in kernel read-only data
 * @x: pointer to the memory
 *
 * kfree() is called on @x only when is_kernel_rodata() reports that the
 * address is not in the .rodata section; otherwise nothing is done.
 */
void kfree_const(const void *x)
{
        /* .rodata objects were never allocated, so there is nothing to free. */
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - duplicate a NUL-terminated string into newly allocated memory
 * @s: the string to duplicate, may be %NULL
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
char *kstrdup(const char *s, gfp_t gfp)
{
        size_t size;
        char *copy;

        if (s == NULL)
                return NULL;

        /* The size includes the terminator, so one memcpy() copies it too. */
        size = strlen(s) + 1;
        copy = kmalloc_track_caller(size, gfp);
        if (!copy)
                return NULL;

        memcpy(copy, s, size);
        return copy;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - duplicate a string unless it is already in .rodata
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings obtained from kstrdup_const() should be released with
 * kfree_const() and must not be passed to krealloc().
 *
 * Return: @s itself when is_kernel_rodata() says it lives in the .rodata
 * section, otherwise whatever kstrdup() returns for it.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - duplicate at most @max characters of a string
 * @s: the string to duplicate, may be %NULL
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * The copy is always NUL-terminated; one extra byte is allocated for that.
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        size_t n;
        char *copy;

        if (s == NULL)
                return NULL;

        n = strnlen(s, max);
        copy = kmalloc_track_caller(n + 1, gfp);
        if (!copy)
                return NULL;

        memcpy(copy, s, n);
        copy[n] = '\0';
        return copy;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - copy a region of memory into a new allocation
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *dup = kmalloc_track_caller(len, gfp);

        if (!dup)
                return NULL;

        memcpy(dup, src, len);
        return dup;
}
EXPORT_SYMBOL(kmemdup);

/**
 * kmemdup_nul - turn unterminated data into a NUL-terminated string
 * @s: The data to stringify, may be %NULL
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Allocates @len + 1 bytes, copies @len bytes from @s and appends a NUL.
 *
 * Return: newly allocated copy of @s with NUL-termination, or %NULL if @s
 * is %NULL or the allocation fails
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        char *str;

        if (s == NULL)
                return NULL;

        str = kmalloc_track_caller(len + 1, gfp);
        if (!str)
                return NULL;

        memcpy(str, s, len);
        str[len] = '\0';
        return str;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - copy a user space memory region into a new kernel buffer
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the new buffer, or an ERR_PTR() on failure: -ENOMEM if the
 * allocation fails, -EFAULT if copy_from_user() returns nonzero.  Result
 * is physically contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *kbuf = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);

        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(kbuf, src, len))
                return kbuf;

        /* Copy failed: do not hand a half-filled buffer to the caller. */
        kfree(kbuf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - copy a user space memory region into a kvmalloc() buffer
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the new buffer, or an ERR_PTR() on failure: -ENOMEM if the
 * allocation fails, -EFAULT if copy_from_user() returns nonzero.  Result
 * may be not physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *kbuf = kvmalloc(len, GFP_USER);

        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(kbuf, src, len))
                return kbuf;

        /* Copy failed: release the buffer with the matching kvfree(). */
        kvfree(kbuf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * The length is taken from strnlen_user(): a result of zero is reported as
 * -EFAULT and a result larger than @n as -EINVAL.  Errors from
 * memdup_user() are passed through unchanged.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
        long size = strnlen_user(s, n);
        char *copy;

        if (size == 0)
                return ERR_PTR(-EFAULT);
        if (size > n)
                return ERR_PTR(-EINVAL);

        copy = memdup_user(s, size);
        if (!IS_ERR(copy))
                /* Force termination of the kernel copy at its last byte. */
                copy[size - 1] = '\0';

        return copy;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - copy a user space memory region and NUL-terminate it
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Allocates @len + 1 bytes so that a terminating NUL can follow the data.
 *
 * Return: the new buffer, or an ERR_PTR() on failure: -ENOMEM if the
 * allocation fails, -EFAULT if copy_from_user() returns nonzero.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *kbuf;

        /*
         * GFP_KERNEL is used unconditionally: copy_from_user() can sleep
         * and take a page fault anyway, so GFP_NOFS or GFP_ATOMIC would
         * buy nothing here.
         */
        kbuf = kmalloc_track_caller(len + 1, GFP_KERNEL);
        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(kbuf, src, len) != 0) {
                kfree(kbuf);
                return ERR_PTR(-EFAULT);
        }

        kbuf[len] = '\0';
        return kbuf;
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Insert @vma into @mm's doubly linked vma list right after @prev.  A NULL
 * @prev makes @vma the new list head (mm->mmap).
 */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev)
{
        /* The forward link to rewrite: @prev's vm_next or the list head. */
        struct vm_area_struct **link;
        struct vm_area_struct *succ;

        vma->vm_prev = prev;

        link = prev ? &prev->vm_next : &mm->mmap;
        succ = *link;
        *link = vma;

        vma->vm_next = succ;
        if (succ)
                succ->vm_prev = vma;
}

/*
 * Remove @vma from @mm's doubly linked vma list by joining its neighbours.
 * @vma's own vm_prev/vm_next fields are left untouched.
 */
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct vm_area_struct *succ = vma->vm_next;
        struct vm_area_struct *pred = vma->vm_prev;

        /* Without a predecessor @vma was the head, so the head moves on. */
        *(pred ? &pred->vm_next : &mm->mmap) = succ;

        if (succ)
                succ->vm_prev = pred;
}

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
        struct task_struct * __maybe_unused t = current;

        return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Page-align @stack_top and, for tasks with PF_RANDOMIZE set, displace it
 * by a random number of pages bounded by STACK_RND_MASK.  The displacement
 * is added when CONFIG_STACK_GROWSUP is set and subtracted otherwise.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long offset = 0;

        if (current->flags & PF_RANDOMIZE)
                offset = (get_random_long() & STACK_RND_MASK) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + offset;
#else
        return PAGE_ALIGN(stack_top) - offset;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.  The bytes
 * skipped by that alignment are taken out of @range; if @range is not larger
 * than the skipped amount, nothing is left to randomize over.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * or when the window cannot hold a whole page after alignment, the page
 * aligned @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        if (!PAGE_ALIGNED(start)) {
                unsigned long slack = PAGE_ALIGN(start) - start;

                /*
                 * Rounding @start up consumes @slack bytes of the window.
                 * Clamp rather than subtract blindly: an unsigned underflow
                 * here would turn a tiny @range into a nearly unbounded one
                 * and let the result escape [start, start + range).
                 */
                range = (range > slack) ? range - slack : 0;
                start = PAGE_ALIGN(start);
        }

        /* Cap the window so that start + range cannot wrap. */
        if (start > ULONG_MAX - range)
                range = ULONG_MAX - start;

        /* From here on @range counts whole pages. */
        range >>= PAGE_SHIFT;

        if (range == 0)
                return start;

        return start + ((get_random_long() % range) << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Pick a randomized, page aligned brk via randomize_page(), starting at
 * mm->brk.  The window is SZ_32M for 32bit tasks (non-64BIT kernels or
 * compat tasks) and SZ_1G otherwise.
 */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long window = SZ_1G;

        /* Is the current task 32bit ? */
        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                window = SZ_32M;

        return randomize_page(mm->brk, window);
}

/*
 * Return a random mmap offset: a random number of pages with
 * mmap_rnd_bits bits of entropy (mmap_rnd_compat_bits for compat tasks
 * when CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS is set), shifted by PAGE_SHIFT.
 */
unsigned long arch_mmap_rnd(void)
{
        int bits = mmap_rnd_bits;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                bits = mmap_rnd_compat_bits;
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */

        return (get_random_long() & ((1UL << bits) - 1)) << PAGE_SHIFT;
}

/*
 * Decide whether the legacy mmap layout applies.  It is forced by the
 * ADDR_COMPAT_LAYOUT personality bit or by a stack rlimit of
 * RLIM_INFINITY; otherwise the sysctl_legacy_va_layout setting decides.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
        if ((current->personality & ADDR_COMPAT_LAYOUT) ||
            rlim_stack->rlim_cur == RLIM_INFINITY)
                return 1;

        return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the base of the top-down mmap area: STACK_TOP minus the gap
 * reserved for the stack minus the random offset @rnd, page aligned.
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
        unsigned long pad = stack_guard_gap;
        unsigned long gap = rlim_stack->rlim_cur;
        unsigned long padded;

        /* Account for stack randomization if necessary */
        if (current->flags & PF_RANDOMIZE)
                pad += (STACK_RND_MASK << PAGE_SHIFT);

        /*
         * Values close to RLIM_INFINITY can overflow, so the padding is
         * only applied when the sum actually grew.
         */
        padded = gap + pad;
        if (padded > gap)
                gap = padded;

        /* Clamp the gap; the lower bound only counts if below MAX_GAP. */
        if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return PAGE_ALIGN(STACK_TOP - gap - rnd);
}

/*
 * Set @mm's mmap_base and get_unmapped_area callback.  The legacy layout
 * places the base at TASK_UNMAPPED_BASE plus the random offset and
 * searches bottom-up; otherwise the base comes from mmap_base() and the
 * top-down search is used.  The random offset is zero unless the current
 * task has PF_RANDOMIZE set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        unsigned long rnd = 0UL;

        if (current->flags & PF_RANDOMIZE)
                rnd = arch_mmap_rnd();

        if (!mmap_is_legacy(rlim_stack)) {
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
                return;
        }

        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        mm->get_unmapped_area = arch_get_unmapped_area;
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback layout: fixed base at TASK_UNMAPPED_BASE with the
 * arch_get_unmapped_area() search; @rlim_stack is not consulted.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->get_unmapped_area = arch_get_unmapped_area;
        mm->mmap_base = TASK_UNMAPPED_BASE;
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * When decrementing, a one-time warning is raised if @pages exceeds the
 * current locked_vm, and the subtraction is performed regardless.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim)
{
        unsigned long locked_vm;
        int ret = 0;

        mmap_assert_write_locked(mm);

        locked_vm = mm->locked_vm;
        if (!inc) {
                WARN_ON_ONCE(pages > locked_vm);
                mm->locked_vm = locked_vm - pages;
        } else if (!bypass_rlim &&
                   locked_vm + pages >
                   (task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT)) {
                /* Over the limit (in pages): leave mm->locked_vm unchanged. */
                ret = -ENOMEM;
        } else {
                mm->locked_vm = locked_vm + pages;
        }

        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Takes @mm's mmap_lock for writing around __account_locked_vm(), using
 * the current task for the limit and skipping the RLIMIT_MEMLOCK check
 * when capable(CAP_IPC_LOCK) is true.
 *
 * Return:
 * * 0       on success, or if mm is NULL or @pages is 0
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        bool bypass;
        int ret;

        if (!mm || !pages)
                return 0;

        mmap_write_lock(mm);
        bypass = capable(CAP_IPC_LOCK);
        ret = __account_locked_vm(mm, pages, inc, current, bypass);
        mmap_write_unlock(mm);

        return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Map @file into the current task's mm at page offset @pgoff.  The
 * security hook is consulted first, then do_mmap() runs under the mmap
 * write lock.  Returns a nonzero security_mmap_file() result, -EINTR if
 * mmap_write_lock_killable() fails, or otherwise the do_mmap() result.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        struct mm_struct *mm = current->mm;
        unsigned long populate;
        unsigned long ret;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (ret)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;
        ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
                      &uf);
        mmap_write_unlock(mm);

        /* Both follow-up steps run only after the lock has been dropped. */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(ret, populate);

        return ret;
}

/*
 * Byte-offset front end for vm_mmap_pgoff().  Returns -EINVAL if
 * @offset + PAGE_ALIGN(@len) wraps around or @offset is not page aligned;
 * otherwise the offset is converted to pages and passed on.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * The request is first handed to kmalloc_node().  Only if that fails, and
 * only for requests larger than a page, is __vmalloc_node() tried.  Use
 * kvfree() for freeing the memory.
 *
 * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * If @flags does not contain all of GFP_KERNEL, the request goes to
 * kmalloc_node() alone and never falls back to vmalloc.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
        const bool multi_page = size > PAGE_SIZE;
        gfp_t contig_flags = flags;
        void *mem;

        /*
         * vmalloc uses GFP_KERNEL for some internal allocations (e.g page
         * tables), so a mask that is not a superset of it is kmalloc-only.
         */
        if ((flags & GFP_KERNEL) != GFP_KERNEL)
                return kmalloc_node(size, flags, node);

        /*
         * A physically contiguous block is tried first because it is less
         * likely to fragment multiple larger blocks and so contributes less
         * to long term fragmentation than the vmalloc fallback.  Larger
         * requests must not be too disruptive though - no OOM killer and no
         * allocation failure warnings, since a fallback exists.
         */
        if (multi_page) {
                contig_flags |= __GFP_NOWARN;

                if (!(contig_flags & __GFP_RETRY_MAYFAIL))
                        contig_flags |= __GFP_NORETRY;
        }

        mem = kmalloc_node(size, contig_flags, node);

        /* Falling back to vmalloc makes no sense for sub page requests. */
        if (mem || !multi_page)
                return mem;

        /* Don't even allow crazy sizes */
        if (unlikely(size > INT_MAX)) {
                WARN_ON_ONCE(!(flags & __GFP_NOWARN));
                return NULL;
        }

        return __vmalloc_node(size, 1, flags, node,
                        __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * is_vmalloc_addr() selects between vfree() and kfree(), so calling kfree()
 * or vfree() directly is slightly more efficient if you are certain which
 * one applies.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
        if (!is_vmalloc_addr(addr)) {
                kfree(addr);
                return;
        }

        vfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * The object is wiped with memzero_explicit(), which the compiler will not
 * optimize away, before being handed to kvfree().  Nothing is done when
 * ZERO_OR_NULL_PTR() is true for @addr.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
        if (unlikely(ZERO_OR_NULL_PTR(addr)))
                return;

        memzero_explicit((void *)addr, len);
        kvfree(addr);
}
EXPORT_SYMBOL(kvfree_sensitive);

/**
 * kvrealloc - grow a kvmalloc()ed object
 * @p: object to grow
 * @oldsize: current size of the object
 * @newsize: requested size
 * @flags: gfp mask passed to kvmalloc()
 *
 * If @newsize does not exceed @oldsize, @p is returned unchanged.
 * Otherwise a new object is allocated, @oldsize bytes are copied over and
 * @p is released with kvfree().
 *
 * Return: the object to use from now on, or %NULL if the allocation failed,
 * in which case @p has not been freed.
 */
void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
        void *grown;

        if (newsize <= oldsize)
                return (void *)p;

        grown = kvmalloc(newsize, flags);
        if (grown) {
                memcpy(grown, p, oldsize);
                kvfree(p);
        }

        return grown;
}
EXPORT_SYMBOL(kvrealloc);

/* Return page->mapping with the PAGE_MAPPING_FLAGS bits masked off. */
static inline void *__page_rmapping(struct page *page)
{
        return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
}

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the result of __vmalloc() for @n * @size bytes, or %NULL if that
 * product overflows.
 */
void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        if (likely(!check_mul_overflow(n, size, &total)))
                return __vmalloc(total, flags);

        return NULL;
}
EXPORT_SYMBOL(__vmalloc_array);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Shorthand for __vmalloc_array() with %GFP_KERNEL.
 *
 * Return: whatever __vmalloc_array() returns, which is %NULL when
 * @n * @size overflows.
 */
void *vmalloc_array(size_t n, size_t size)
{
        return __vmalloc_array(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with %__GFP_ZERO added to @flags.
 */
void *__vcalloc(size_t n, size_t size, gfp_t flags)
{
        flags |= __GFP_ZERO;

        return __vmalloc_array(n, size, flags);
}
EXPORT_SYMBOL(__vcalloc);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Shorthand for __vcalloc() with %GFP_KERNEL.
 */
void *vcalloc(size_t n, size_t size)
{
        return __vcalloc(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vcalloc);

/*
 * Neutral page->mapping pointer to address_space or anon_vma or other.
 * The lookup is always done on the compound head of @page.
 */
void *page_rmapping(struct page *page)
{
        return __page_rmapping(compound_head(page));
}

/*
 * Tell whether @page is mapped into pagetables.
 * A compound page counts as mapped if the compound mapcount says so or,
 * unless PageHuge() is true for it, if any of its subpages is mapped.
 */
bool page_mapped(struct page *page)
{
        struct page *head;
        int idx;

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) >= 0;

        head = compound_head(page);
        if (atomic_read(compound_mapcount_ptr(head)) >= 0)
                return true;

        /* For PageHuge() pages only the compound mapcount is consulted. */
        if (PageHuge(head))
                return false;

        for (idx = 0; idx < compound_nr(head); idx++) {
                if (atomic_read(&head[idx]._mapcount) < 0)
                        continue;
                return true;
        }

        return false;
}
EXPORT_SYMBOL(page_mapped);

/*
 * Return the compound head's mapping with the flag bits stripped if its
 * PAGE_MAPPING_FLAGS bits equal exactly PAGE_MAPPING_ANON, else NULL.
 */
struct anon_vma *page_anon_vma(struct page *page)
{
        struct page *head = compound_head(page);
        unsigned long raw = (unsigned long)head->mapping;

        if ((raw & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
                return __page_rmapping(head);

        return NULL;
}

/*
 * Return the address_space for @page, evaluated on its compound head:
 * NULL for slab pages and for mappings with PAGE_MAPPING_ANON set, the
 * swap address space for swap cache pages, and otherwise page->mapping
 * with the PAGE_MAPPING_FLAGS bits cleared.
 */
struct address_space *page_mapping(struct page *page)
{
        unsigned long raw;

        page = compound_head(page);

        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(PageSlab(page)))
                return NULL;

        if (unlikely(PageSwapCache(page))) {
                swp_entry_t entry;

                /* The swap entry is kept in the page's private field. */
                entry.val = page_private(page);
                return swap_address_space(entry);
        }

        raw = (unsigned long)page->mapping;
        if (raw & PAGE_MAPPING_ANON)
                return NULL;

        return (void *)(raw & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);

/*
 * Like page_mapping(), except that swap cache pages yield NULL instead of
 * the swap address space.
 */
struct address_space *page_mapping_file(struct page *page)
{
        return unlikely(PageSwapCache(page)) ? NULL : page_mapping(page);
}

/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
        int count = atomic_read(&page->_mapcount) + 1;

        /*
         * Only anon and hugetlb pages need the compound mapcount.  For
         * file THP, page->_mapcount already holds the total number of
         * mappings of the page.
         */
        if (PageAnon(page) || PageHuge(page)) {
                struct page *head = compound_head(page);

                count += atomic_read(compound_mapcount_ptr(head)) + 1;
                if (PageDoubleMap(head))
                        count--;
        }

        return count;
}
EXPORT_SYMBOL_GPL(__page_mapcount);

/*
 * Overcommit and reserve tunables with their default values.
 *
 * sysctl_overcommit_ratio and sysctl_overcommit_kbytes are alternatives:
 * a successful write through overcommit_ratio_handler() resets the kbytes
 * value to 0 and a successful write through overcommit_kbytes_handler()
 * resets the ratio to 0.  vm_commit_limit() uses the kbytes value when it
 * is nonzero and the ratio otherwise.
 */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

/*
 * sysctl handler built on proc_dointvec().  A successful write
 * additionally resets sysctl_overcommit_kbytes to 0.
 */
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (write && !ret)
                sysctl_overcommit_kbytes = 0;

        return ret;
}

/*
 * Work callback passed to schedule_on_each_cpu() by
 * overcommit_policy_handler(): calls percpu_counter_sync() on
 * vm_committed_as.  @dummy is not used.
 */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for the overcommit policy.  Reads go straight to
 * proc_dointvec_minmax() on @table.  Writes are parsed into a local
 * variable first, so sysctl_overcommit_memory changes only at the end.
 */
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        struct ctl_table t;
        int new_policy = -1;
        int ret;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /*
         * The deviation of sync_overcommit_as could be big with loose policy
         * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
         * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
         * with the strict "NEVER", and to avoid a possible race condition
         * (even though users usually won't switch to policy OVERCOMMIT_NEVER
         * too frequently), the switch is done in the following order:
         *        1. changing the batch
         *        2. sync percpu count on each CPU
         *        3. switch the policy
         */
        t = *table;
        t.data = &new_policy;
        ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
        if (ret || new_policy == -1)
                return ret;

        mm_compute_batch(new_policy);
        if (new_policy == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = new_policy;

        return ret;
}

/*
 * sysctl handler built on proc_doulongvec_minmax().  A successful write
 * additionally resets sysctl_overcommit_ratio to 0.
 */
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (write && !ret)
                sysctl_overcommit_ratio = 0;

        return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used.
 *
 * The limit, in pages, is total_swap_pages plus either
 * sysctl_overcommit_kbytes converted to pages (when nonzero) or
 * sysctl_overcommit_ratio percent of the RAM pages left after subtracting
 * hugetlb_total_pages().
 */
unsigned long vm_commit_limit(void)
{
        unsigned long base;

        if (sysctl_overcommit_kbytes)
                base = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
                base = ((totalram_pages() - hugetlb_total_pages())
                        * sysctl_overcommit_ratio / 100);

        return base + total_swap_pages;
}

/*
 * Make sure vm_committed_as sits in a cacheline of its own, one that is
 * not shared with other variables. It can be updated frequently by
 * several CPUs.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 *
 * Returns the value of percpu_counter_sum_positive() for vm_committed_as.
 */
unsigned long vm_memory_committed(void)
{
        return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 *
 * @pages is accounted up front with vm_acct_memory(). The accounting is
 * kept when 0 is returned and undone with vm_unacct_memory() before
 * -ENOMEM is returned.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        long allowed;

        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /*
         * Heuristic policy: only refuse a single request that is larger
         * than all of RAM plus swap.
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages > totalram_pages() + total_swap_pages)
                        goto error;
                return 0;
        }

        /* Any other policy value: enforce the limit from vm_commit_limit(). */
        allowed = vm_commit_limit();
        /*
         * Reserve some for root
         * (sysctl_admin_reserve_kbytes is converted from kbytes to pages)
         */
        if (!cap_sys_admin)
                allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /*
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
                long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                /* Hold back the smaller of total_vm / 32 and the user reserve. */
                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }

        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
error:
        /* Undo the accounting done at the top of the function. */
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        unsigned long arg_start, arg_end, env_start, env_end;
        struct mm_struct *mm;
        unsigned int len;
        int res = 0;

        mm = get_task_mm(task);
        if (!mm)
                return res;

        /* arg_end is still zero: nothing to report yet. */
        if (!mm->arg_end)
                goto put;

        /* Take a consistent snapshot of the arg/env boundaries. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        /* Copy the argument area, truncated to the caller's buffer. */
        len = arg_end - arg_start;
        if (len > buflen)
                len = buflen;

        res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

        /*
         * Done unless the copy is non-empty, its last byte is not a NUL,
         * and the argument area was not truncated. Otherwise the
         * terminating NUL was overwritten, which is taken to mean the
         * application uses setproctitle(3).
         */
        if (res <= 0 || buffer[res-1] == '\0' || len >= buflen)
                goto put;

        len = strnlen(buffer, res);
        if (len < res) {
                /* A NUL exists earlier in the copy: stop there. */
                res = len;
                goto put;
        }

        /* No NUL at all: continue into the environment area. */
        len = env_end - env_start;
        if (len > buflen - res)
                len = buflen - res;
        res += access_process_vm(task, env_start,
                                 buffer+res, len,
                                 FOLL_FORCE);
        res = strnlen(buffer, res);

put:
        mmput(mm);
        return res;
}

/*
 * Compare the full PAGE_SIZE contents of two pages with memcmp() and
 * return its result. Declared __weak.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *first = kmap_atomic(page1);
        char *second = kmap_atomic(page2);
        int diff = memcmp(first, second, PAGE_SIZE);

        /* Unmap in the reverse order of mapping. */
        kunmap_atomic(second);
        kunmap_atomic(first);

        return diff;
}

/*
 * Invoke the file's mmap hook on @vma via call_mmap() and return its result.
 * On failure the VMA's vm_ops are replaced with an empty operations table.
 */
int mmap_file(struct file *file, struct vm_area_struct *vma)
{
        static const struct vm_operations_struct dummy_vm_ops = {};
        int err;

        err = call_mmap(file, vma);
        if (unlikely(err)) {
                /*
                 * The mmap hook failed, so the mapping is in an
                 * inconsistent state and we must not invoke any further
                 * hooks on it.
                 */
                vma->vm_ops = &dummy_vm_ops;
        }

        return err;
}

/*
 * Call the VMA's ->close() hook, if it has one, and then replace vm_ops
 * with an empty operations table.
 */
void vma_close(struct vm_area_struct *vma)
{
        static const struct vm_operations_struct dummy_vm_ops = {};

        /* Nothing to do without a close hook. */
        if (!vma->vm_ops || !vma->vm_ops->close)
                return;

        vma->vm_ops->close(vma);

        /*
         * After close the mapping is in an inconsistent state; make sure
         * no further hooks can be invoked upon it.
         */
        vma->vm_ops = &dummy_vm_ops;
}























































































    2 












































    6 






























    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 */
enum {
        BAD_STACK  = -1,
        NOT_STACK  = 0,
        GOOD_FRAME = 1,
        GOOD_STACK = 2,
};

#include <asm/thread_info.h>

#ifdef __KERNEL__

#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Store @fn as the restart handler in @restart, run the
 * arch_set_restart_data() hook on it, and return -ERESTART_RESTARTBLOCK.
 */
static inline long set_restart_fn(struct restart_block *restart,
                                        long (*fn)(struct restart_block *))
{
        restart->fn = fn;
        arch_set_restart_data(restart);
        return -ERESTART_RESTARTBLOCK;
}

#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit @flag in @ti->flags via set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        set_bit(flag, (unsigned long *)&ti->flags);
}

/* Clear bit @flag in @ti->flags via clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        clear_bit(flag, (unsigned long *)&ti->flags);
}

/* Set bit @flag in @ti->flags when @value is true, clear it otherwise. */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/* Set bit @flag in @ti->flags and return the result of test_and_set_bit(). */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        return test_and_set_bit(flag, (unsigned long *)&ti->flags);
}

/* Clear bit @flag in @ti->flags and return the result of test_and_clear_bit(). */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        return test_and_clear_bit(flag, (unsigned long *)&ti->flags);
}

/* Return the result of test_bit() for bit @flag in @ti->flags; nothing is modified here. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        return test_bit(flag, (unsigned long *)&ti->flags);
}

/*
 * Convenience forms of the *_ti_thread_flag() helpers that operate on
 * current_thread_info() instead of an explicit thread_info pointer.
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)

/* True when TIF_NEED_RESCHED is set for the current thread. */
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/*
 * Generic fallback: performs no inspection of the stack range or object
 * and always reports 0.
 */
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
        const int verdict = 0;

        return verdict;
}
#endif

#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

/*
 * Forward to __check_object_size() unless @n is a compile-time constant,
 * in which case nothing is done here.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        if (!__builtin_constant_p(n))
                __check_object_size(ptr, n, to_user);
}
#else
/* No-op variant used when CONFIG_HARDENED_USERCOPY is not enabled. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

/*
 * Unconditionally WARN that a copy of @count bytes was requested on an
 * object of only @size bytes.
 */
static inline void copy_overflow(int size, unsigned long count)
{
        WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}

/*
 * Validate a copy of @bytes bytes to or from the object at @addr.
 *
 * @addr:      kernel object being copied from or to
 * @bytes:     number of bytes to be copied
 * @is_source: true when @addr is the source of the copy
 *
 * Returns true when the copy may proceed, false when it must be refused.
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /* The size comparison below is skipped when sz is negative. */
        int sz = __compiletime_object_size(addr);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /*
                 * Object is too small: a run-time WARN when @bytes is not
                 * a compile-time constant, otherwise a reference to one of
                 * the __compiletime_error() functions.
                 */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        /* Refuse (and warn once) on lengths above INT_MAX. */
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#ifndef arch_setup_new_exec
/* Empty default used when no arch_setup_new_exec macro is already defined. */
static inline void arch_setup_new_exec(void) { }
#endif

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* user-type.h: User-defined key type
 *
 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _KEYS_USER_TYPE_H
#define _KEYS_USER_TYPE_H

#include <linux/key.h>
#include <linux/rcupdate.h>

#ifdef CONFIG_KEYS

/*****************************************************************************/
/*
 * the payload for a key of type "user" or "logon"
 * - once filled in and attached to a key:
 *   - the payload struct is invariant: it may not be changed, only replaced
 *   - the payload must be read with RCU procedures or with the key semaphore
 *     held
 *   - the payload may only be replaced with the key semaphore write-locked
 * - the key's data length is the size of the actual data, not including the
 *   payload wrapper
 */
/* Payload wrapper: RCU head and length, followed by the data itself. */
struct user_key_payload {
        struct rcu_head        rcu;                /* RCU destructor */
        unsigned short        datalen;        /* length of data[] */
        char                data[] __aligned(__alignof__(u64)); /* actual data, u64-aligned flexible array */
};

extern struct key_type key_type_user;
extern struct key_type key_type_logon;

struct key_preparsed_payload;

extern int user_preparse(struct key_preparsed_payload *prep);
extern void user_free_preparse(struct key_preparsed_payload *prep);
extern int user_update(struct key *key, struct key_preparsed_payload *prep);
extern void user_revoke(struct key *key);
extern void user_destroy(struct key *key);
extern void user_describe(const struct key *user, struct seq_file *m);
extern long user_read(const struct key *key, char *buffer, size_t buflen);

/*
 * Return @key's payload, obtained through dereference_key_rcu(), as a
 * read-only user_key_payload.
 * NOTE(review): presumably the caller must be in an RCU read-side critical
 * section - confirm against dereference_key_rcu().
 */
static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key)
{
        return (struct user_key_payload *)dereference_key_rcu(key);
}

/*
 * Return @key's payload, obtained through dereference_key_locked(), as a
 * user_key_payload. The const qualifier on @key is cast away for that call.
 * NOTE(review): presumably the caller must hold the key semaphore - confirm
 * against dereference_key_locked().
 */
static inline struct user_key_payload *user_key_payload_locked(const struct key *key)
{
        return (struct user_key_payload *)dereference_key_locked((struct key *)key);
}

#endif /* CONFIG_KEYS */

#endif /* _KEYS_USER_TYPE_H */
















































































    1 














    1 

















































































































    1 























































































































































































































































































































































































































































    1 













    1 
    1 






    1 










    1 
    1 
    1 




    1 



    1 































































    1 






    1 




    1 













    1 

    1 





    1 
















    1 















    1 



















































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally.  This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/*
 * Take pipe->mutex with the given lockdep @subclass, but only when
 * pipe->files is non-zero; otherwise do nothing.
 */
static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
        if (pipe->files)
                mutex_lock_nested(&pipe->mutex, subclass);
}

/* Lock @pipe using the I_MUTEX_PARENT subclass (no-op when pipe->files is zero). */
void pipe_lock(struct pipe_inode_info *pipe)
{
        /*
         * pipe_lock() nests non-pipe inode locks (for writing to a file)
         */
        pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

/* Release pipe->mutex, but only when pipe->files is non-zero. */
void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (pipe->files)
                mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

/* Unconditionally take pipe->mutex (I_MUTEX_PARENT subclass); pipe->files is not checked. */
static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
        mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

/* Unconditionally release pipe->mutex; pipe->files is not checked. */
static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
        mutex_unlock(&pipe->mutex);
}

/*
 * Lock two distinct pipes. The pipe at the lower address is always locked
 * first (I_MUTEX_PARENT) and the other second (I_MUTEX_CHILD), giving the
 * same lock order regardless of argument order. Passing the same pipe
 * twice is a BUG.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        struct pipe_inode_info *outer, *inner;

        BUG_ON(pipe1 == pipe2);

        if (pipe1 < pipe2) {
                outer = pipe1;
                inner = pipe2;
        } else {
                outer = pipe2;
                inner = pipe1;
        }

        pipe_lock_nested(outer, I_MUTEX_PARENT);
        pipe_lock_nested(inner, I_MUTEX_CHILD);
}

/*
 * Release the page behind an anonymous pipe buffer. If ours is the only
 * reference and the pipe has no cached temporary page yet, the page is
 * kept in pipe->tmp_page as a one-deep allocation cache instead of being
 * freed.
 */
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        /* Shared page, or the cache slot is already taken: drop our ref. */
        if (page_count(pg) != 1 || pipe->tmp_page) {
                put_page(pg);
                return;
        }

        pipe->tmp_page = pg;
}

/*
 * Try to take ownership of an anonymous pipe buffer's page. This succeeds
 * only when the page has a single reference; the page is then uncharged
 * via memcg_kmem_uncharge_page() and marked locked.
 */
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        if (page_count(pg) == 1) {
                memcg_kmem_uncharge_page(pg, 0);
                __SetPageLocked(pg);
                return true;
        }

        return false;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *        This function attempts to steal the &struct page attached to
 *        @buf. If successful, this function returns true and returns with
 *        the page locked. The caller may then reuse the page for whatever
 *        he wishes; the typical use is insertion into a different file
 *        page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        /* More than one reference: someone else uses the page, so refuse. */
        if (page_count(pg) != 1)
                return false;

        /*
         * We hold the only reference, so the page can be stolen: lock it
         * and report success.
         */
        lock_page(pg);
        return true;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *        This function grabs an extra reference to @buf. It's used in
 *        the tee() system call, when we duplicate the buffers in one
 *        pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        /*
         * The try_get_page() result is returned unchanged; presumably false
         * means no reference was taken - confirm against try_get_page().
         */
        return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *        This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        /* Drop one reference on the page backing the buffer. */
        put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/*
 * Buffer operations for anonymous pipe buffers. Only release, try_steal
 * and get are provided; no other hook is set.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .release        = anon_pipe_buf_release,
        .try_steal        = anon_pipe_buf_try_steal,
        .get                = generic_pipe_buf_get,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int writers = READ_ONCE(pipe->writers);

        /* Data is queued: a reader can make progress. */
        if (!pipe_empty(head, tail))
                return true;

        /* Empty pipe: only "readable" once no writers remain. */
        return !writers;
}

/*
 * Read from the pipe behind @iocb->ki_filp into @to.
 *
 * Returns the number of bytes copied, or 0 for a zero-length request or
 * when the pipe is empty and has no writers. When nothing was copied, one
 * of these is returned instead: -ENOBUFS, -EFAULT, the error from
 * pipe_buf_confirm(), -EAGAIN (O_NONBLOCK and nothing to read) or
 * -ERESTARTSYS (interrupted while waiting).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        size_t total_len = iov_iter_count(to);
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        bool was_full, wake_next_reader = false;
        ssize_t ret;

        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        ret = 0;
        __pipe_lock(pipe);

        /*
         * We only wake up writers if the pipe was full when we started
         * reading in order to avoid unnecessary wakeups.
         *
         * But when we do wake up writers, we do so using a sync wakeup
         * (WF_SYNC), because we want them to get going and generate more
         * data for us.
         */
        was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
        for (;;) {
                /* Read ->head with a barrier vs post_one_notification() */
                unsigned int head = smp_load_acquire(&pipe->head);
                unsigned int tail = pipe->tail;
                unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
                /*
                 * A loss was recorded: deliver a synthetic loss
                 * notification to the reader before any further data.
                 */
                if (pipe->note_loss) {
                        struct watch_notification n;

                        if (total_len < 8) {
                                if (ret == 0)
                                        ret = -ENOBUFS;
                                break;
                        }

                        n.type = WATCH_TYPE_META;
                        n.subtype = WATCH_META_LOSS_NOTIFICATION;
                        n.info = watch_sizeof(n);
                        if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += sizeof(n);
                        total_len -= sizeof(n);
                        pipe->note_loss = false;
                }
#endif

                if (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t chars = buf->len;
                        size_t written;
                        int error;

                        /*
                         * Buffer holds more than was asked for: take a
                         * partial read, unless the buffer is flagged
                         * PIPE_BUF_FLAG_WHOLE and so cannot be split.
                         */
                        if (chars > total_len) {
                                if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
                                        if (ret == 0)
                                                ret = -ENOBUFS;
                                        break;
                                }
                                chars = total_len;
                        }

                        error = pipe_buf_confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                        if (unlikely(written < chars)) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /* Was it a packet buffer? Clean up and exit */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                total_len = chars;
                                buf->len = 0;
                        }

                        /* Buffer fully consumed: release it and advance the tail. */
                        if (!buf->len) {
                                pipe_buf_release(pipe, buf);
                                spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
                                if (buf->flags & PIPE_BUF_FLAG_LOSS)
                                        pipe->note_loss = true;
#endif
                                tail++;
                                pipe->tail = tail;
                                spin_unlock_irq(&pipe->rd_wait.lock);
                        }
                        total_len -= chars;
                        if (!total_len)
                                break;        /* common path: read succeeded */
                        if (!pipe_empty(head, tail))        /* More to do? */
                                continue;
                }

                /* Nothing (more) to read: decide whether to stop or wait. */
                if (!pipe->writers)
                        break;
                if (ret)
                        break;
                if (filp->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                        break;
                }
                __pipe_unlock(pipe);

                /*
                 * We only get here if we didn't actually read anything.
                 *
                 * However, we could have seen (and removed) a zero-sized
                 * pipe buffer, and might have made space in the buffers
                 * that way.
                 *
                 * You can't make zero-sized pipe buffers by doing an empty
                 * write (not even in packet mode), but they can happen if
                 * the writer gets an EFAULT when trying to fill a buffer
                 * that already got allocated and inserted in the buffer
                 * array.
                 *
                 * So we still need to wake up any pending writers in the
                 * _very_ unlikely case that the pipe was full, but we got
                 * no data.
                 */
                if (unlikely(was_full))
                        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

                /*
                 * But because we didn't read anything, at this point we can
                 * just return directly with -ERESTARTSYS if we're interrupted,
                 * since we've done any required wakeups and there's no need
                 * to mark anything accessed. And we've dropped the lock.
                 */
                if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
                        return -ERESTARTSYS;

                /* Woken up: retake the lock and re-evaluate from the top. */
                __pipe_lock(pipe);
                was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
                wake_next_reader = true;
        }
        /* No point waking another reader if we left the pipe empty. */
        if (pipe_empty(pipe->head, pipe->tail))
                wake_next_reader = false;
        __pipe_unlock(pipe);

        if (was_full)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        if (wake_next_reader)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

/*
 * Tell whether @file has O_DIRECT set in its f_flags.  pipe_write() uses
 * this to decide between PIPE_BUF_FLAG_PACKET and PIPE_BUF_FLAG_CAN_MERGE
 * for newly inserted buffers.  Returns 1 when set, 0 otherwise.
 */
static inline int is_packetized(struct file *file)
{
        return !!(file->f_flags & O_DIRECT);
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        return !pipe_full(head, tail, max_usage) ||
                !READ_ONCE(pipe->readers);
}

/*
 * pipe_write - write_iter handler installed in pipefifo_fops.
 * @iocb: carries the file being written (iocb->ki_filp)
 * @from: iterator describing the data to copy into the pipe
 *
 * The sub-page remainder of the request (total_len & (PAGE_SIZE-1)) is
 * first merged into the most recently inserted buffer when that buffer is
 * flagged PIPE_BUF_FLAG_CAN_MERGE and has room.  Whatever is left is
 * copied one page at a time into buffers newly inserted at pipe->head.
 * When the ring is full the writer stops with -EAGAIN (O_NONBLOCK), stops
 * on a pending signal, or sleeps on wr_wait with the pipe lock dropped.
 *
 * Returns the number of bytes written.  If nothing was written, returns a
 * negative errno: -EPIPE (after sending SIGPIPE) when there are no
 * readers, -EXDEV for a pipe with a watch queue, or -EFAULT, -ENOMEM,
 * -EAGAIN, -ERESTARTSYS.  A zero-length write returns 0 immediately.  A
 * failure from file_update_time() replaces a positive byte count.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head;
        ssize_t ret = 0;
        size_t total_len = iov_iter_count(from);
        ssize_t chars;
        bool was_empty = false;
        bool wake_next_writer = false;

        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        __pipe_lock(pipe);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /* Pipes that carry a watch queue refuse ordinary writes. */
        if (pipe_has_watch_queue(pipe)) {
                ret = -EXDEV;
                goto out;
        }

        /*
         * If it wasn't empty we try to merge new data into
         * the last buffer.
         *
         * That naturally merges small writes, but it also
         * page-aligns the rest of the writes for large writes
         * spanning multiple pages.
         */
        head = pipe->head;
        was_empty = pipe_empty(head, pipe->tail);
        chars = total_len & (PAGE_SIZE-1);
        if (chars && !was_empty) {
                unsigned int mask = pipe->ring_size - 1;
                /* (head - 1) is the slot most recently inserted. */
                struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
                int offset = buf->offset + buf->len;

                if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
                    offset + chars <= PAGE_SIZE) {
                        ret = pipe_buf_confirm(pipe, buf);
                        if (ret)
                                goto out;

                        ret = copy_page_from_iter(buf->page, offset, chars, from);
                        if (unlikely(ret < chars)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        buf->len += ret;
                        /* The whole request fitted into the merge. */
                        if (!iov_iter_count(from))
                                goto out;
                }
        }

        for (;;) {
                /* Readers may have gone away while we slept below. */
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                head = pipe->head;
                if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
                        unsigned int mask = pipe->ring_size - 1;
                        struct pipe_buffer *buf = &pipe->bufs[head & mask];
                        struct page *page = pipe->tmp_page;
                        int copied;

                        /* Reuse the cached page if there is one, else allocate. */
                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }

                        /* Allocate a slot in the ring in advance and attach an
                         * empty buffer.  If we fault or otherwise fail to use
                         * it, either the reader will consume it or it'll still
                         * be there for the next write.
                         */
                        spin_lock_irq(&pipe->rd_wait.lock);

                        /* Re-read head and re-test fullness under the spinlock. */
                        head = pipe->head;
                        if (pipe_full(head, pipe->tail, pipe->max_usage)) {
                                spin_unlock_irq(&pipe->rd_wait.lock);
                                continue;
                        }

                        pipe->head = head + 1;
                        spin_unlock_irq(&pipe->rd_wait.lock);

                        /* Insert it into the buffer array */
                        buf = &pipe->bufs[head & mask];
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = 0;
                        if (is_packetized(filp))
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        else
                                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
                        /* The page now belongs to the buffer, not the cache. */
                        pipe->tmp_page = NULL;

                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
                        /* A short copy with data still pending is treated as a fault. */
                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += copied;
                        buf->offset = 0;
                        buf->len = copied;

                        if (!iov_iter_count(from))
                                break;
                }

                if (!pipe_full(head, pipe->tail, pipe->max_usage))
                        continue;

                /* Wait for buffer space to become available. */
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /*
                 * We're going to release the pipe lock and wait for more
                 * space. We wake up any readers if necessary, and then
                 * after waiting we need to re-check whether the pipe
                 * has become empty while we dropped the lock.
                 */
                __pipe_unlock(pipe);
                if (was_empty)
                        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
                __pipe_lock(pipe);
                was_empty = pipe_empty(pipe->head, pipe->tail);
                /* We waited exclusively, so we may need to wake the next writer on exit. */
                wake_next_writer = true;
        }
out:
        /* No point waking another writer if the ring is full. */
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                wake_next_writer = false;
        __pipe_unlock(pipe);

        /*
         * If we do do a wakeup event, we do a 'sync' wakeup, because we
         * want the reader to start processing things asap, rather than
         * leave the data pending.
         *
         * This is particularly important for small writes, because of
         * how (for example) the GNU make jobserver uses small writes to
         * wake up pending jobs
         *
         * Epoll nonsensically wants a wakeup whether the pipe
         * was already empty or not.
         */
        if (was_empty || pipe->poll_usage)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        /* Timestamps are only updated when data was written and the trylock succeeds. */
        if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
                int err = file_update_time(filp);
                if (err)
                        ret = err;
                sb_end_write(file_inode(filp)->i_sb);
        }
        return ret;
}

/*
 * pipe_ioctl - unlocked_ioctl handler installed in pipefifo_fops.
 *
 * FIONREAD sums the ->len of every buffer between tail and head under the
 * pipe lock and stores the total as an int at user address @arg.  With
 * CONFIG_WATCH_QUEUE, IOC_WATCH_QUEUE_SET_SIZE and
 * IOC_WATCH_QUEUE_SET_FILTER are forwarded to the watch-queue helpers.
 * Any other command yields -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int nbytes, end, idx, mask;

        switch (cmd) {
        case FIONREAD:
                __pipe_lock(pipe);
                nbytes = 0;
                end = pipe->head;
                mask = pipe->ring_size - 1;

                for (idx = pipe->tail; idx != end; idx++)
                        nbytes += pipe->bufs[idx & mask].len;
                __pipe_unlock(pipe);

                return put_user(nbytes, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
        case IOC_WATCH_QUEUE_SET_SIZE: {
                int rc;

                /* The resize runs with the pipe lock held. */
                __pipe_lock(pipe);
                rc = watch_queue_set_size(pipe, arg);
                __pipe_unlock(pipe);
                return rc;
        }

        case IOC_WATCH_QUEUE_SET_FILTER:
                return watch_queue_set_filter(
                        pipe, (struct watch_notification_filter __user *)arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * pipe_poll - poll handler installed in pipefifo_fops.
 *
 * No kernel lock held - fine.  The handler registers on rd_wait and/or
 * wr_wait according to the file's open mode, then samples head and tail
 * without the pipe lock and reports:
 *
 *   EPOLLIN | EPOLLRDNORM   read mode, pipe not empty
 *   EPOLLHUP                read mode, no writers and filp->f_version
 *                           differs from pipe->w_counter
 *   EPOLLOUT | EPOLLWRNORM  write mode, pipe not full
 *   EPOLLERR                write mode, no readers
 *
 * It also sets pipe->poll_usage, which pipe_write() tests when deciding
 * whether to wake readers.
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
        __poll_t mask;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head, tail;

        /* Epoll has some historical nasty semantics, this enables them */
        WRITE_ONCE(pipe->poll_usage, true);

        /*
         * Reading pipe state only -- no need for acquiring the semaphore.
         *
         * But because this is racy, the code has to add the
         * entry to the poll table _first_ ..
         */
        if (filp->f_mode & FMODE_READ)
                poll_wait(filp, &pipe->rd_wait, wait);
        if (filp->f_mode & FMODE_WRITE)
                poll_wait(filp, &pipe->wr_wait, wait);

        /*
         * .. and only then can you do the racy tests. That way,
         * if something changes and you got it wrong, the poll
         * table entry will wake you up and fix it.
         */
        head = READ_ONCE(pipe->head);
        tail = READ_ONCE(pipe->tail);

        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                if (!pipe_empty(head, tail))
                        mask |= EPOLLIN | EPOLLRDNORM;
                /*
                 * fifo_open() stores w_counter in f_version for a
                 * non-blocking reader that has not seen a writer yet,
                 * which suppresses EPOLLHUP here.
                 */
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        mask |= EPOLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                if (!pipe_full(head, tail, pipe->max_usage))
                        mask |= EPOLLOUT | EPOLLWRNORM;
                /*
                 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= EPOLLERR;
        }

        return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
        int kill = 0;

        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);

        if (kill)
                free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = file->private_data;

        __pipe_lock(pipe);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        /* Was that the last reader or writer, but not the other side? */
        if (!pipe->readers != !pipe->writers) {
                wake_up_interruptible_all(&pipe->rd_wait);
                wake_up_interruptible_all(&pipe->wr_wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        __pipe_unlock(pipe);

        put_pipe_info(inode, pipe);
        return 0;
}

/*
 * pipe_fasync - fasync handler installed in pipefifo_fops.
 *
 * Under the pipe lock, registers or unregisters @filp on fasync_readers
 * and/or fasync_writers according to its open mode.  If the reader side
 * fails, the writer side is not attempted.  If the writer side fails on a
 * file that is also open for reading, the reader registration is removed
 * again.  Returns the result of the last fasync_helper() call whose value
 * is kept, or 0 if neither mode bit is set.
 */
static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        __pipe_lock(pipe);

        if (filp->f_mode & FMODE_READ) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
                if (retval < 0)
                        goto unlock;
        }

        if (filp->f_mode & FMODE_WRITE) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                /* this can happen only if on == T */
                if (retval < 0 && (filp->f_mode & FMODE_READ))
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }

unlock:
        __pipe_unlock(pipe);
        return retval;
}

/*
 * Adjust @user's pipe_bufs counter by (@new - @old) and return the counter
 * value after the adjustment.
 */
unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new)
{
        unsigned long delta = new - old;

        return atomic_long_add_return(delta, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
        unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

        return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
        unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

        return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
        return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;
        unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
        struct user_struct *user = get_current_user();
        unsigned long user_bufs;
        unsigned int max_size = READ_ONCE(pipe_max_size);

        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
        if (pipe == NULL)
                goto out_free_uid;

        if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
                pipe_bufs = max_size >> PAGE_SHIFT;

        user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

        if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
                user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
                pipe_bufs = PIPE_MIN_DEF_BUFFERS;
        }

        if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
                goto out_revert_acct;

        pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
                             GFP_KERNEL_ACCOUNT);

        if (pipe->bufs) {
                init_waitqueue_head(&pipe->rd_wait);
                init_waitqueue_head(&pipe->wr_wait);
                pipe->r_counter = pipe->w_counter = 1;
                pipe->max_usage = pipe_bufs;
                pipe->ring_size = pipe_bufs;
                pipe->nr_accounted = pipe_bufs;
                pipe->user = user;
                mutex_init(&pipe->mutex);
                return pipe;
        }

out_revert_acct:
        (void) account_pipe_buffers(user, pipe_bufs, 0);
        kfree(pipe);
out_free_uid:
        free_uid(user);
        return NULL;
}

/*
 * free_pipe_info - tear down a pipe_inode_info.
 *
 * Clears the watch queue if there is one, removes the pipe's accounted
 * buffers from its user and drops the user reference.  Then releases
 * every ring slot that still has ->ops set, puts the watch queue, frees
 * the cached tmp_page, and frees the buffer array and the pipe itself.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
        int slot;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                watch_queue_clear(pipe->watch_queue);
#endif

        (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
        free_uid(pipe->user);

        for (slot = 0; slot < pipe->ring_size; slot++) {
                struct pipe_buffer *buf = &pipe->bufs[slot];

                /* Slots without ops hold nothing to release. */
                if (!buf->ops)
                        continue;
                pipe_buf_release(pipe, buf);
        }

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                put_watch_queue(pipe->watch_queue);
#endif

        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);
        kfree(pipe->bufs);
        kfree(pipe);
}

/*
 * Kernel-internal mount of pipefs.  Set once by init_pipe_fs() through
 * kern_mount(); get_pipe_inode() and create_pipe_files() use it to create
 * the inodes and files of anonymous pipes.
 */
static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
                                d_inode(dentry)->i_ino);
}

/*
 * Dentry operations for pipefs; installed as ctx->dops by
 * pipefs_init_fs_context().  Only name generation is customised.
 */
static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

/*
 * get_pipe_inode - create a pipefs inode with a fresh pipe attached.
 *
 * The pipe starts with files == 2 and one reader and one writer, matching
 * the two struct files create_pipe_files() builds on top of it.  Returns
 * the inode, or NULL if either the inode or the pipe cannot be allocated.
 */
static struct inode * get_pipe_inode(void)
{
        struct pipe_inode_info *pipe;
        struct inode *inode;

        inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe) {
                iput(inode);
                return NULL;
        }

        inode->i_pipe = pipe;
        pipe->files = 2;
        pipe->readers = 1;
        pipe->writers = 1;
        inode->i_fop = &pipefifo_fops;

        /*
         * Start out dirty so that mark_inode_dirty() considers the inode
         * to be on the dirty list already and never actually moves it
         * there.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

        return inode;
}

/*
 * create_pipe_files - build the two struct files of a new anonymous pipe.
 * @res:   output array; on success res[0] is the O_RDONLY file and res[1]
 *         the O_WRONLY file
 * @flags: O_NONBLOCK is applied to both files and O_DIRECT to the write
 *         side only; O_NOTIFICATION_PIPE additionally initialises a watch
 *         queue on the pipe
 *
 * Returns 0 on success, -ENFILE when no pipe inode could be obtained, or
 * the error reported by watch_queue_init(), alloc_file_pseudo() or
 * alloc_file_clone().
 */
int create_pipe_files(struct file **res, int flags)
{
        struct inode *inode = get_pipe_inode();
        struct file *f;
        int error;

        if (!inode)
                return -ENFILE;

        if (flags & O_NOTIFICATION_PIPE) {
                error = watch_queue_init(inode->i_pipe);
                if (error) {
                        /* No struct file exists yet: free pipe and inode directly. */
                        free_pipe_info(inode->i_pipe);
                        iput(inode);
                        return error;
                }
        }

        /* Write side first; the read side is cloned from it below. */
        f = alloc_file_pseudo(inode, pipe_mnt, "",
                                O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
                                &pipefifo_fops);
        if (IS_ERR(f)) {
                free_pipe_info(inode->i_pipe);
                iput(inode);
                return PTR_ERR(f);
        }

        f->private_data = inode->i_pipe;

        res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
                                  &pipefifo_fops);
        if (IS_ERR(res[0])) {
                /*
                 * Drop one of the two pipe->files references set up by
                 * get_pipe_inode().  NOTE(review): the other is presumably
                 * released through pipe_release() when fput(f) drops the
                 * write-side file - confirm.
                 */
                put_pipe_info(inode, inode->i_pipe);
                fput(f);
                return PTR_ERR(res[0]);
        }
        res[0]->private_data = inode->i_pipe;
        res[1] = f;
        stream_open(inode, res[0]);
        stream_open(inode, res[1]);
        return 0;
}

/*
 * __do_pipe_flags - create a pipe and reserve two descriptors for it.
 * @fd:    on success, fd[0] is the read descriptor and fd[1] the write one
 * @files: on success, the matching struct files (not yet installed)
 * @flags: any of O_CLOEXEC, O_NONBLOCK, O_DIRECT, O_NOTIFICATION_PIPE
 *
 * The descriptors are only reserved; installing the files is left to the
 * caller.  Returns 0, -EINVAL for unknown flag bits, or the error from
 * create_pipe_files() / get_unused_fd_flags().  On failure nothing stays
 * reserved and both files are released.
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        int error;
        int fdw, fdr;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        fdr = get_unused_fd_flags(flags);
        if (fdr < 0) {
                error = fdr;
                goto err_read_pipe;
        }

        fdw = get_unused_fd_flags(flags);
        if (fdw < 0) {
                error = fdw;
                goto err_fdr;
        }

        audit_fd_pair(fdr, fdw);
        fd[0] = fdr;
        fd[1] = fdw;
        return 0;

 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
        fput(files[0]);
        fput(files[1]);
        return error;
}

/*
 * do_pipe_flags - create a pipe and install both ends in the fd table.
 * @fd:    receives the read (fd[0]) and write (fd[1]) descriptors
 * @flags: passed through to __do_pipe_flags()
 *
 * Returns 0 on success or the error from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 *
 * do_pipe2() backs both pipe() and pipe2().  It creates the pipe, copies
 * the two descriptor numbers to @fildes and only then installs the files.
 * If the copy to user space fails, the files and the reserved descriptors
 * are released and -EFAULT is returned.
 */
static int do_pipe2(int __user *fildes, int flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        if (likely(!copy_to_user(fildes, fd, sizeof(fd)))) {
                fd_install(fd[0], files[0]);
                fd_install(fd[1], files[1]);
                return 0;
        }

        fput(files[0]);
        fput(files[1]);
        put_unused_fd(fd[0]);
        put_unused_fd(fd[1]);
        return -EFAULT;
}

/* pipe2 system call: hands @fildes and @flags straight to do_pipe2(). */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}

/* pipe system call: same as pipe2 with no flags set. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}

/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
        /* Sleep on rd_wait with the pipe lock dropped. */
        pipe_unlock(pipe);
        /*
         * The wait is interruptible and its result is discarded, so
         * pipe_readable() is not guaranteed to hold on return.
         */
        wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
        /* Return with the pipe lock held again. */
        pipe_lock(pipe);
}

void pipe_wait_writable(struct pipe_inode_info *pipe)
{
        /* Sleep on wr_wait with the pipe lock dropped. */
        pipe_unlock(pipe);
        /*
         * The wait is interruptible and its result is discarded, so
         * pipe_writable() is not guaranteed to hold on return.
         */
        wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
        /* Return with the pipe lock held again. */
        pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 *
 * @cnt points at the counter to watch (fifo_open() passes &pipe->w_counter
 * or &pipe->r_counter).  Returns 0 once the counter differs from its value
 * on entry, or -ERESTARTSYS if a signal ended the wait while the counter
 * was still unchanged.  Called and returns with the pipe lock held.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        DEFINE_WAIT(rdwait);
        /* Snapshot of the counter; a partner showing up changes *cnt. */
        int cur = *cnt;

        while (cur == *cnt) {
                prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
                /* Sleep without the pipe lock, retake it before re-checking. */
                pipe_unlock(pipe);
                schedule();
                finish_wait(&pipe->rd_wait, &rdwait);
                pipe_lock(pipe);
                if (signal_pending(current))
                        break;
        }
        /* If the counter moved, that wins even when a signal is pending. */
        return cur == *cnt ? -ERESTARTSYS : 0;
}

/*
 * Wake every interruptible sleeper on rd_wait, the queue that
 * wait_for_partner() sleeps on.
 */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
}

/*
 * fifo_open - open handler installed in pipefifo_fops.
 *
 * Attaches @filp to the inode's pipe_inode_info, allocating one when the
 * inode has none yet, then bumps the reader/writer counts and the
 * r_counter/w_counter generation counters according to the open mode.
 *
 * For inodes that are not on pipefs (is_pipe false):
 *  - a read-only open with no writers either records w_counter in
 *    f_version (O_NONBLOCK) or waits for a writer;
 *  - a write-only O_NONBLOCK open with no readers fails with -ENXIO;
 *  - any other write-only open with no readers waits for a reader.
 * A read-write open never waits.
 *
 * Returns 0 on success, or -ENOMEM, -ENXIO, -EINVAL or -ERESTARTSYS.  On
 * every failure after the pipe was pinned, the reference taken here is
 * dropped again through put_pipe_info().
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int ret;

        filp->f_version = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                /* Allocate outside i_lock, then re-check for a racing opener. */
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        /* Someone else installed a pipe meanwhile: use theirs. */
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        __pipe_lock(pipe);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                /* First reader: wake writers waiting in wait_for_partner(). */
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                /* First writer: wake readers waiting in wait_for_partner(). */
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        __pipe_unlock(pipe);
        return 0;

        /* Interrupted while waiting for a writer: undo the reader count. */
err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

        /* Interrupted while waiting for a reader: undo the writer count. */
err_wr:
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        __pipe_unlock(pipe);

        put_pipe_info(inode, pipe);
        return ret;
}

/*
 * File operations shared by anonymous pipes and FIFOs.  get_pipe_info()
 * compares a file's f_op against this table to recognise a pipe.
 */
const struct file_operations pipefifo_fops = {
        .open                = fifo_open,
        .llseek                = no_llseek,
        .read_iter        = pipe_read,
        .write_iter        = pipe_write,
        .poll                = pipe_poll,
        .unlocked_ioctl        = pipe_ioctl,
        .release        = pipe_release,
        .fasync                = pipe_fasync,
        .splice_write        = iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 *
 * Sizes above 2^31 are rejected, sizes below one page are raised to
 * PAGE_SIZE, and everything else is rounded up to a power of two.
 */
unsigned int round_pipe_size(unsigned long size)
{
        if (size > (1U << 31))
                return 0;

        /* Minimum pipe size, as required by POSIX */
        return size < PAGE_SIZE ? PAGE_SIZE : roundup_pow_of_two(size);
}

/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 *
 * The new array is allocated before rd_wait.lock is taken.  Under that
 * lock the occupied buffers are copied to the start of the new array,
 * tail becomes 0 and head becomes the occupancy, and the old array is
 * freed.  max_usage is clamped to @nr_slots; for pipes without a watch
 * queue, max_usage and nr_accounted are both set to @nr_slots.
 *
 * Returns 0 on success, -ENOMEM if the new array cannot be allocated, or
 * -EBUSY as described above.
 *
 * NOTE(review): indexing uses "ring_size - 1" as a mask, so @nr_slots is
 * presumably expected to be a power of two - confirm against callers.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
        struct pipe_buffer *bufs;
        unsigned int head, tail, mask, n;

        bufs = kcalloc(nr_slots, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        spin_lock_irq(&pipe->rd_wait.lock);
        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;

        /* Refuse to shrink below the number of buffers currently in use. */
        n = pipe_occupancy(head, tail);
        if (nr_slots < n) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                kfree(bufs);
                return -EBUSY;
        }

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indices.
         */
        if (n > 0) {
                unsigned int h = head & mask;
                unsigned int t = tail & mask;
                if (h > t) {
                        /* Occupied region is contiguous: one copy suffices. */
                        memcpy(bufs, pipe->bufs + t,
                               n * sizeof(struct pipe_buffer));
                } else {
                        /*
                         * Occupied region wraps: [t, ring_size) goes first
                         * in the new array, followed by [0, h).
                         */
                        unsigned int tsize = pipe->ring_size - t;
                        if (h > 0)
                                memcpy(bufs + tsize, pipe->bufs,
                                       h * sizeof(struct pipe_buffer));
                        memcpy(bufs, pipe->bufs + t,
                               tsize * sizeof(struct pipe_buffer));
                }
        }

        head = n;
        tail = 0;

        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->ring_size = nr_slots;
        if (pipe->max_usage > nr_slots)
                pipe->max_usage = nr_slots;
        pipe->tail = tail;
        pipe->head = head;

        /* Watch-queue pipes keep their own max_usage and accounting. */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->max_usage = nr_slots;
                pipe->nr_accounted = nr_slots;
        }

        spin_unlock_irq(&pipe->rd_wait.lock);

        /* This might have made more room for writers */
        wake_up_interruptible(&pipe->wr_wait);
        return 0;
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
        unsigned long user_bufs;
        unsigned int nr_slots, size;
        long ret = 0;

        if (pipe_has_watch_queue(pipe))
                return -EBUSY;

        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;

        if (!nr_slots)
                return -EINVAL;

        /*
         * If trying to increase the pipe capacity, check that an
         * unprivileged user is not trying to exceed various limits
         * (soft limit check here, hard limit check just below).
         * Decreasing the pipe capacity is always permitted, even
         * if the user is currently over a limit.
         */
        if (nr_slots > pipe->max_usage &&
                        size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

        if (nr_slots > pipe->max_usage &&
                        (too_many_pipe_buffers_hard(user_bufs) ||
                         too_many_pipe_buffers_soft(user_bufs)) &&
                        pipe_is_unprivileged_user()) {
                ret = -EPERM;
                goto out_revert_acct;
        }

        ret = pipe_resize_ring(pipe, nr_slots);
        if (ret < 0)
                goto out_revert_acct;

        return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
        (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
        return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 *
 * A file counts as a pipe when its f_op is pipefifo_fops and it carries
 * private data.  With @for_splice set, pipes that have a watch queue are
 * rejected as well.  Returns the pipe, or NULL.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
        struct pipe_inode_info *pipe = file->private_data;

        if (file->f_op != &pipefifo_fops)
                return NULL;
        if (!pipe)
                return NULL;
        if (for_splice && pipe_has_watch_queue(pipe))
                return NULL;

        return pipe;
}

/*
 * pipe_fcntl - handle the pipe-size fcntl commands.
 *
 * F_SETPIPE_SZ resizes the pipe through pipe_set_size().  F_GETPIPE_SZ
 * reports max_usage * PAGE_SIZE.  Both run under the pipe lock.  Returns
 * -EBADF when @file is not a pipe and -EINVAL for any other command.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, false);
        long ret;

        if (!pipe)
                return -EBADF;

        __pipe_lock(pipe);

        if (cmd == F_SETPIPE_SZ)
                ret = pipe_set_size(pipe, arg);
        else if (cmd == F_GETPIPE_SZ)
                ret = pipe->max_usage * PAGE_SIZE;
        else
                ret = -EINVAL;

        __pipe_unlock(pipe);
        return ret;
}

/*
 * Superblock operations for pipefs: inodes are destroyed through
 * free_inode_nonrcu() and statfs requests are answered by simple_statfs().
 * Installed on the pseudo filesystem context by pipefs_init_fs_context().
 */
static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole thing mounted. So we don't
 * need any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->ops = &pipefs_ops;
        ctx->dops = &pipefs_dentry_operations;
        return 0;
}

/*
 * Filesystem type descriptor for "pipefs".  Contexts are initialised by
 * pipefs_init_fs_context() and superblocks are torn down with
 * kill_anon_super().  Registered and kern_mount()ed from init_pipe_fs().
 */
static struct file_system_type pipe_fs_type = {
        .name                = "pipefs",
        .init_fs_context = pipefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Register pipefs and create its internal kernel mount (pipe_mnt).
 *
 * Returns 0 on success.  If registration fails its error is returned
 * unchanged; if kern_mount() fails the filesystem is unregistered again
 * and the mount error is returned.
 */
static int __init init_pipe_fs(void)
{
        int rc;

        rc = register_filesystem(&pipe_fs_type);
        if (rc)
                return rc;

        pipe_mnt = kern_mount(&pipe_fs_type);
        if (!IS_ERR(pipe_mnt))
                return 0;

        /* Mount failed: undo the registration before reporting the error. */
        rc = PTR_ERR(pipe_mnt);
        unregister_filesystem(&pipe_fs_type);

        return rc;
}

fs_initcall(init_pipe_fs);


































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CFG802154_RDEV_OPS
#define __CFG802154_RDEV_OPS

#include <net/cfg802154.h>

#include "core.h"
#include "trace.h"

/* Invoke rdev->ops->add_virtual_intf_deprecated() on the phy of @rdev. */
static inline struct net_device *
rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
                                 const char *name,
                                 unsigned char name_assign_type,
                                 int type)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->add_virtual_intf_deprecated(phy, name,
                                                      name_assign_type, type);
}

/* Invoke rdev->ops->del_virtual_intf_deprecated() for @dev on the phy of @rdev. */
static inline void
rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
                                 struct net_device *dev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        rdev->ops->del_virtual_intf_deprecated(phy, dev);
}

/* Call rdev->ops->suspend(), tracing the call and its return value. */
static inline int
rdev_suspend(struct cfg802154_registered_device *rdev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_suspend(phy);
        rc = rdev->ops->suspend(phy);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->resume(), tracing the call and its return value. */
static inline int
rdev_resume(struct cfg802154_registered_device *rdev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_resume(phy);
        rc = rdev->ops->resume(phy);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->add_virtual_intf(), tracing the call and its return value. */
static inline int
rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
                      unsigned char name_assign_type,
                      enum nl802154_iftype type, __le64 extended_addr)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_add_virtual_intf(phy, name, type, extended_addr);
        rc = rdev->ops->add_virtual_intf(phy, name, name_assign_type, type,
                                         extended_addr);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->del_virtual_intf(), tracing the call and its return value. */
static inline int
rdev_del_virtual_intf(struct cfg802154_registered_device *rdev,
                      struct wpan_dev *wpan_dev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_del_virtual_intf(phy, wpan_dev);
        rc = rdev->ops->del_virtual_intf(phy, wpan_dev);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_channel(), tracing the call and its return value. */
static inline int
rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_channel(phy, page, channel);
        rc = rdev->ops->set_channel(phy, page, channel);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_cca_mode(), tracing the call and its return value. */
static inline int
rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
                  const struct wpan_phy_cca *cca)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_cca_mode(phy, cca);
        rc = rdev->ops->set_cca_mode(phy, cca);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_cca_ed_level(), tracing the call and its return value. */
static inline int
rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_cca_ed_level(phy, ed_level);
        rc = rdev->ops->set_cca_ed_level(phy, ed_level);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_tx_power(), tracing the call and its return value. */
static inline int
rdev_set_tx_power(struct cfg802154_registered_device *rdev,
                  s32 power)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_tx_power(phy, power);
        rc = rdev->ops->set_tx_power(phy, power);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_pan_id(), tracing the call and its return value. */
static inline int
rdev_set_pan_id(struct cfg802154_registered_device *rdev,
                struct wpan_dev *wpan_dev, __le16 pan_id)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_pan_id(phy, wpan_dev, pan_id);
        rc = rdev->ops->set_pan_id(phy, wpan_dev, pan_id);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_short_addr(), tracing the call and its return value. */
static inline int
rdev_set_short_addr(struct cfg802154_registered_device *rdev,
                    struct wpan_dev *wpan_dev, __le16 short_addr)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_short_addr(phy, wpan_dev, short_addr);
        rc = rdev->ops->set_short_addr(phy, wpan_dev, short_addr);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_backoff_exponent(), tracing the call and its return value. */
static inline int
rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev,
                          struct wpan_dev *wpan_dev, u8 min_be, u8 max_be)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_backoff_exponent(phy, wpan_dev, min_be, max_be);
        rc = rdev->ops->set_backoff_exponent(phy, wpan_dev, min_be, max_be);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/*
 * Call rdev->ops->set_max_csma_backoffs(), tracing the call (through the
 * set_csma_backoffs trace event) and its return value.
 */
static inline int
rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev,
                           struct wpan_dev *wpan_dev, u8 max_csma_backoffs)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_csma_backoffs(phy, wpan_dev, max_csma_backoffs);
        rc = rdev->ops->set_max_csma_backoffs(phy, wpan_dev,
                                              max_csma_backoffs);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_max_frame_retries(), tracing the call and its return value. */
static inline int
rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev,
                           struct wpan_dev *wpan_dev, s8 max_frame_retries)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_max_frame_retries(phy, wpan_dev,
                                                max_frame_retries);
        rc = rdev->ops->set_max_frame_retries(phy, wpan_dev,
                                              max_frame_retries);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_lbt_mode(), tracing the call and its return value. */
static inline int
rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
                  struct wpan_dev *wpan_dev, bool mode)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_lbt_mode(phy, wpan_dev, mode);
        rc = rdev->ops->set_lbt_mode(phy, wpan_dev, mode);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

/* Call rdev->ops->set_ackreq_default(), tracing the call and its return value. */
static inline int
rdev_set_ackreq_default(struct cfg802154_registered_device *rdev,
                        struct wpan_dev *wpan_dev, bool ackreq)
{
        struct wpan_phy *phy = &rdev->wpan_phy;
        int rc;

        trace_802154_rdev_set_ackreq_default(phy, wpan_dev, ackreq);
        rc = rdev->ops->set_ackreq_default(phy, wpan_dev, ackreq);
        trace_802154_rdev_return_int(phy, rc);

        return rc;
}

#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
/* TODO this is already a nl802154, so move into ieee802154 */
/* Invoke rdev->ops->get_llsec_table(); the op receives @table to fill in. */
static inline void
rdev_get_llsec_table(struct cfg802154_registered_device *rdev,
                     struct wpan_dev *wpan_dev,
                     struct ieee802154_llsec_table **table)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        rdev->ops->get_llsec_table(phy, wpan_dev, table);
}

/* Invoke rdev->ops->lock_llsec_table() for @wpan_dev. */
static inline void
rdev_lock_llsec_table(struct cfg802154_registered_device *rdev,
                      struct wpan_dev *wpan_dev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        rdev->ops->lock_llsec_table(phy, wpan_dev);
}

/* Invoke rdev->ops->unlock_llsec_table() for @wpan_dev. */
static inline void
rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev,
                        struct wpan_dev *wpan_dev)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        rdev->ops->unlock_llsec_table(phy, wpan_dev);
}

/* Return the result of rdev->ops->get_llsec_params() for @wpan_dev. */
static inline int
rdev_get_llsec_params(struct cfg802154_registered_device *rdev,
                      struct wpan_dev *wpan_dev,
                      struct ieee802154_llsec_params *params)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->get_llsec_params(phy, wpan_dev, params);
}

/* Return the result of rdev->ops->set_llsec_params() for @wpan_dev. */
static inline int
rdev_set_llsec_params(struct cfg802154_registered_device *rdev,
                      struct wpan_dev *wpan_dev,
                      const struct ieee802154_llsec_params *params,
                      u32 changed)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->set_llsec_params(phy, wpan_dev, params, changed);
}

/* Return the result of rdev->ops->add_llsec_key() for @wpan_dev. */
static inline int
rdev_add_llsec_key(struct cfg802154_registered_device *rdev,
                   struct wpan_dev *wpan_dev,
                   const struct ieee802154_llsec_key_id *id,
                   const struct ieee802154_llsec_key *key)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->add_llsec_key(phy, wpan_dev, id, key);
}

/* Return the result of rdev->ops->del_llsec_key() for @wpan_dev. */
static inline int
rdev_del_llsec_key(struct cfg802154_registered_device *rdev,
                   struct wpan_dev *wpan_dev,
                   const struct ieee802154_llsec_key_id *id)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->del_llsec_key(phy, wpan_dev, id);
}

/* Return the result of rdev->ops->add_seclevel() for @wpan_dev. */
static inline int
rdev_add_seclevel(struct cfg802154_registered_device *rdev,
                  struct wpan_dev *wpan_dev,
                  const struct ieee802154_llsec_seclevel *sl)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->add_seclevel(phy, wpan_dev, sl);
}

/* Return the result of rdev->ops->del_seclevel() for @wpan_dev. */
static inline int
rdev_del_seclevel(struct cfg802154_registered_device *rdev,
                  struct wpan_dev *wpan_dev,
                  const struct ieee802154_llsec_seclevel *sl)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->del_seclevel(phy, wpan_dev, sl);
}

/* Return the result of rdev->ops->add_device() for @wpan_dev. */
static inline int
rdev_add_device(struct cfg802154_registered_device *rdev,
                struct wpan_dev *wpan_dev,
                const struct ieee802154_llsec_device *dev_desc)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->add_device(phy, wpan_dev, dev_desc);
}

/* Return the result of rdev->ops->del_device() for @extended_addr. */
static inline int
rdev_del_device(struct cfg802154_registered_device *rdev,
                struct wpan_dev *wpan_dev, __le64 extended_addr)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->del_device(phy, wpan_dev, extended_addr);
}

/* Return the result of rdev->ops->add_devkey() for @extended_addr. */
static inline int
rdev_add_devkey(struct cfg802154_registered_device *rdev,
                struct wpan_dev *wpan_dev, __le64 extended_addr,
                const struct ieee802154_llsec_device_key *devkey)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->add_devkey(phy, wpan_dev, extended_addr, devkey);
}

/* Return the result of rdev->ops->del_devkey() for @extended_addr. */
static inline int
rdev_del_devkey(struct cfg802154_registered_device *rdev,
                struct wpan_dev *wpan_dev, __le64 extended_addr,
                const struct ieee802154_llsec_device_key *devkey)
{
        struct wpan_phy *phy = &rdev->wpan_phy;

        return rdev->ops->del_devkey(phy, wpan_dev, extended_addr, devkey);
}
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */

#endif /* __CFG802154_RDEV_OPS */

























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 1999-2002 Vojtech Pavlik
 */
#ifndef _INPUT_H
#define _INPUT_H

#include <linux/time.h>
#include <linux/list.h>
#include <uapi/linux/input.h>
/*
 * Implementation details, userspace should not care about these.
 * ABS_MT_FIRST and ABS_MT_LAST are aliases for ABS_MT_TOUCH_MAJOR and
 * ABS_MT_TOOL_Y; presumably they delimit the range of ABS_MT_* codes -
 * TODO confirm against the uapi event code definitions.
 */
#define ABS_MT_FIRST                ABS_MT_TOUCH_MAJOR
#define ABS_MT_LAST                ABS_MT_TOOL_Y

/*
 * In-kernel definitions.
 */

#include <linux/device.h>
#include <linux/fs.h>
#include <linux/timer.h>
#include <linux/mod_devicetable.h>

struct input_dev_poller;

/**
 * struct input_value - input value representation
 * @type: type of value (EV_KEY, EV_ABS, etc)
 * @code: the value code
 * @value: the value
 *
 * Arrays of these are passed to the events() method of
 * &struct input_handler and are referenced by the vals member of
 * &struct input_dev.
 */
struct input_value {
        __u16 type;
        __u16 code;
        __s32 value;
};

/*
 * Clock identifiers for input timestamps.  INPUT_CLK_MAX is not a clock:
 * it is the number of entries and sizes the timestamp[] array in
 * struct input_dev.  REAL/MONO/BOOT presumably correspond to the realtime,
 * monotonic and boottime clocks - TODO confirm against the users of
 * input_set_timestamp()/input_get_timestamp().
 */
enum input_clock_type {
        INPUT_CLK_REAL = 0,
        INPUT_CLK_MONO,
        INPUT_CLK_BOOT,
        INPUT_CLK_MAX
};

/**
 * struct input_dev - represents an input device
 * @name: name of the device
 * @phys: physical path to the device in the system hierarchy
 * @uniq: unique identification code for the device (if device has it)
 * @id: id of the device (struct input_id)
 * @propbit: bitmap of device properties and quirks
 * @evbit: bitmap of types of events supported by the device (EV_KEY,
 *        EV_REL, etc.)
 * @keybit: bitmap of keys/buttons this device has
 * @relbit: bitmap of relative axes for the device
 * @absbit: bitmap of absolute axes for the device
 * @mscbit: bitmap of miscellaneous events supported by the device
 * @ledbit: bitmap of leds present on the device
 * @sndbit: bitmap of sound effects supported by the device
 * @ffbit: bitmap of force feedback effects supported by the device
 * @swbit: bitmap of switches present on the device
 * @hint_events_per_packet: average number of events generated by the
 *        device in a packet (between EV_SYN/SYN_REPORT events). Used by
 *        event handlers to estimate size of the buffer needed to hold
 *        events.
 * @keycodemax: size of keycode table
 * @keycodesize: size of elements in keycode table
 * @keycode: map of scancodes to keycodes for this device
 * @getkeycode: optional legacy method to retrieve current keymap.
 * @setkeycode: optional method to alter current keymap, used to implement
 *        sparse keymaps. If not supplied default mechanism will be used.
 *        The method is being called while holding event_lock and thus must
 *        not sleep
 * @ff: force feedback structure associated with the device if device
 *        supports force feedback effects
 * @poller: poller structure associated with the device if device is
 *        set up to use polling mode
 * @repeat_key: stores key code of the last key pressed; used to implement
 *        software autorepeat
 * @timer: timer for software autorepeat
 * @rep: current values for autorepeat parameters (delay, rate)
 * @mt: pointer to multitouch state
 * @absinfo: array of &struct input_absinfo elements holding information
 *        about absolute axes (current value, min, max, flat, fuzz,
 *        resolution)
 * @key: reflects current state of device's keys/buttons
 * @led: reflects current state of device's LEDs
 * @snd: reflects current state of sound effects
 * @sw: reflects current state of device's switches
 * @open: this method is called when the very first user calls
 *        input_open_device(). The driver must prepare the device
 *        to start generating events (start polling thread,
 *        request an IRQ, submit URB, etc.)
 * @close: this method is called when the very last user calls
 *        input_close_device().
 * @flush: purges the device. Most commonly used to get rid of force
 *        feedback effects loaded into the device when disconnecting
 *        from it
 * @event: event handler for events sent _to_ the device, like EV_LED
 *        or EV_SND. The device is expected to carry out the requested
 *        action (turn on a LED, play sound, etc.) The call is protected
 *        by @event_lock and must not sleep
 * @grab: input handle that currently has the device grabbed (via
 *        EVIOCGRAB ioctl). When a handle grabs a device it becomes the
 *        sole recipient for all input events coming from the device
 * @event_lock: this spinlock is taken when input core receives
 *        and processes a new event for the device (in input_event()).
 *        Code that accesses and/or modifies parameters of a device
 *        (such as keymap or absmin, absmax, absfuzz, etc.) after device
 *        has been registered with input core must take this lock.
 * @mutex: serializes calls to open(), close() and flush() methods
 * @users: stores number of users (input handlers) that opened this
 *        device. It is used by input_open_device() and input_close_device()
 *        to make sure that dev->open() is only called when the first
 *        user opens the device and dev->close() is called when the very
 *        last user closes the device
 * @going_away: marks devices that are in the middle of unregistering and
 *        causes input_open_device*() to fail with -ENODEV.
 * @dev: driver model's view of this device
 * @h_list: list of input handles associated with the device. When
 *        accessing the list dev->mutex must be held
 * @node: used to place the device onto input_dev_list
 * @num_vals: number of values queued in the current frame
 * @max_vals: maximum number of values queued in a frame
 * @vals: array of values queued in the current frame
 * @devres_managed: indicates that the device is managed with the devres
 *        framework and need not be explicitly unregistered or freed.
 * @timestamp: storage for a timestamp set by input_set_timestamp() called
 *        by a driver; the array holds INPUT_CLK_MAX entries
 */
struct input_dev {
        const char *name;
        const char *phys;
        const char *uniq;
        struct input_id id;

        unsigned long propbit[BITS_TO_LONGS(INPUT_PROP_CNT)];

        unsigned long evbit[BITS_TO_LONGS(EV_CNT)];
        unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
        unsigned long relbit[BITS_TO_LONGS(REL_CNT)];
        unsigned long absbit[BITS_TO_LONGS(ABS_CNT)];
        unsigned long mscbit[BITS_TO_LONGS(MSC_CNT)];
        unsigned long ledbit[BITS_TO_LONGS(LED_CNT)];
        unsigned long sndbit[BITS_TO_LONGS(SND_CNT)];
        unsigned long ffbit[BITS_TO_LONGS(FF_CNT)];
        unsigned long swbit[BITS_TO_LONGS(SW_CNT)];

        unsigned int hint_events_per_packet;

        unsigned int keycodemax;
        unsigned int keycodesize;
        void *keycode;

        int (*setkeycode)(struct input_dev *dev,
                          const struct input_keymap_entry *ke,
                          unsigned int *old_keycode);
        int (*getkeycode)(struct input_dev *dev,
                          struct input_keymap_entry *ke);

        struct ff_device *ff;

        struct input_dev_poller *poller;

        unsigned int repeat_key;
        struct timer_list timer;

        int rep[REP_CNT];

        struct input_mt *mt;

        struct input_absinfo *absinfo;

        unsigned long key[BITS_TO_LONGS(KEY_CNT)];
        unsigned long led[BITS_TO_LONGS(LED_CNT)];
        unsigned long snd[BITS_TO_LONGS(SND_CNT)];
        unsigned long sw[BITS_TO_LONGS(SW_CNT)];

        int (*open)(struct input_dev *dev);
        void (*close)(struct input_dev *dev);
        int (*flush)(struct input_dev *dev, struct file *file);
        int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);

        struct input_handle __rcu *grab;

        spinlock_t event_lock;
        struct mutex mutex;

        unsigned int users;
        bool going_away;

        struct device dev;

        struct list_head        h_list;
        struct list_head        node;

        unsigned int num_vals;
        unsigned int max_vals;
        struct input_value *vals;

        bool devres_managed;

        ktime_t timestamp[INPUT_CLK_MAX];
};
/* Map a pointer to the embedded dev member back to its struct input_dev. */
#define to_input_dev(d) container_of(d, struct input_dev, dev)

/*
 * Verify that we are in sync with input_device_id mod_devicetable.h #defines
 */

#if EV_MAX != INPUT_DEVICE_ID_EV_MAX
#error "EV_MAX and INPUT_DEVICE_ID_EV_MAX do not match"
#endif

#if KEY_MIN_INTERESTING != INPUT_DEVICE_ID_KEY_MIN_INTERESTING
#error "KEY_MIN_INTERESTING and INPUT_DEVICE_ID_KEY_MIN_INTERESTING do not match"
#endif

#if KEY_MAX != INPUT_DEVICE_ID_KEY_MAX
#error "KEY_MAX and INPUT_DEVICE_ID_KEY_MAX do not match"
#endif

#if REL_MAX != INPUT_DEVICE_ID_REL_MAX
#error "REL_MAX and INPUT_DEVICE_ID_REL_MAX do not match"
#endif

#if ABS_MAX != INPUT_DEVICE_ID_ABS_MAX
#error "ABS_MAX and INPUT_DEVICE_ID_ABS_MAX do not match"
#endif

#if MSC_MAX != INPUT_DEVICE_ID_MSC_MAX
#error "MSC_MAX and INPUT_DEVICE_ID_MSC_MAX do not match"
#endif

#if LED_MAX != INPUT_DEVICE_ID_LED_MAX
#error "LED_MAX and INPUT_DEVICE_ID_LED_MAX do not match"
#endif

#if SND_MAX != INPUT_DEVICE_ID_SND_MAX
#error "SND_MAX and INPUT_DEVICE_ID_SND_MAX do not match"
#endif

#if FF_MAX != INPUT_DEVICE_ID_FF_MAX
#error "FF_MAX and INPUT_DEVICE_ID_FF_MAX do not match"
#endif

#if SW_MAX != INPUT_DEVICE_ID_SW_MAX
#error "SW_MAX and INPUT_DEVICE_ID_SW_MAX do not match"
#endif

#if INPUT_PROP_MAX != INPUT_DEVICE_ID_PROP_MAX
#error "INPUT_PROP_MAX and INPUT_DEVICE_ID_PROP_MAX do not match"
#endif

/*
 * Convenience combinations of the INPUT_DEVICE_ID_MATCH_* flags:
 * _DEVICE ORs together the bus, vendor and product flags, and
 * _DEVICE_AND_VERSION additionally ORs in the version flag.
 */
#define INPUT_DEVICE_ID_MATCH_DEVICE \
        (INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT)
#define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION \
        (INPUT_DEVICE_ID_MATCH_DEVICE | INPUT_DEVICE_ID_MATCH_VERSION)

struct input_handle;

/**
 * struct input_handler - implements one of the interfaces for input devices
 * @private: driver-specific data
 * @event: event handler. This method is being called by input core with
 *        interrupts disabled and dev->event_lock spinlock held and so
 *        it may not sleep
 * @events: event sequence handler. This method is being called by
 *        input core with interrupts disabled and dev->event_lock
 *        spinlock held and so it may not sleep. It receives an array of
 *        &struct input_value together with its element count
 * @filter: similar to @event; separates normal event handlers from
 *        "filters". Returns %true when the event should be filtered
 * @match: called after comparing device's id with handler's id_table
 *        to perform fine-grained matching between device and handler
 * @connect: called when attaching a handler to an input device
 * @disconnect: disconnects a handler from input device
 * @start: starts handler for given handle. This function is called by
 *        input core right after connect() method and also when a process
 *        that "grabbed" a device releases it
 * @legacy_minors: set to %true by drivers using legacy minor ranges
 * @minor: beginning of range of 32 legacy minors for devices this driver
 *        can provide
 * @name: name of the handler, to be shown in /proc/bus/input/handlers
 * @id_table: pointer to a table of input_device_ids this driver can
 *        handle
 * @h_list: list of input handles associated with the handler
 * @node: for placing the driver onto input_handler_list
 *
 * Input handlers attach to input devices and create input handles. There
 * are likely several handlers attached to any given input device at the
 * same time. All of them will get their copy of the input event generated
 * by the device.
 *
 * The very same structure is used to implement input filters. Input core
 * allows filters to run first and will not pass the event to regular
 * handlers if any of the filters indicate that the event should be
 * filtered (by returning %true from their filter() method).
 *
 * Note that input core serializes calls to connect() and disconnect()
 * methods.
 */
struct input_handler {

        void *private;

        void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value);
        void (*events)(struct input_handle *handle,
                       const struct input_value *vals, unsigned int count);
        bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value);
        bool (*match)(struct input_handler *handler, struct input_dev *dev);
        int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id);
        void (*disconnect)(struct input_handle *handle);
        void (*start)(struct input_handle *handle);

        bool legacy_minors;
        int minor;
        const char *name;

        const struct input_device_id *id_table;

        struct list_head        h_list;
        struct list_head        node;
};

/**
 * struct input_handle - links input device with an input handler
 * @private: handler-specific data
 * @open: counter showing whether the handle is 'open', i.e. should deliver
 *        events from its device
 * @name: name given to the handle by handler that created it
 * @dev: input device the handle is attached to
 * @handler: handler that works with the device through this handle
 * @d_node: used to put the handle on device's list of attached handles
 * @h_node: used to put the handle on handler's list of handles from which
 *        it gets events
 *
 * A handle pairs one @dev with one @handler and is linked into both of
 * their handle lists through @d_node and @h_node respectively.
 */
struct input_handle {

        void *private;

        int open;
        const char *name;

        struct input_dev *dev;
        struct input_handler *handler;

        struct list_head        d_node;
        struct list_head        h_node;
};

struct input_dev __must_check *input_allocate_device(void);
struct input_dev __must_check *devm_input_allocate_device(struct device *);
void input_free_device(struct input_dev *dev);

/*
 * Call get_device() on the struct device embedded in @dev and return the
 * input device that contains the result.  A NULL @dev yields NULL.
 */
static inline struct input_dev *input_get_device(struct input_dev *dev)
{
        if (!dev)
                return NULL;

        return to_input_dev(get_device(&dev->dev));
}

/* Call put_device() on the struct device embedded in @dev; NULL is ignored. */
static inline void input_put_device(struct input_dev *dev)
{
        if (!dev)
                return;

        put_device(&dev->dev);
}

/* Return the driver data stored on the struct device embedded in @dev. */
static inline void *input_get_drvdata(struct input_dev *dev)
{
        struct device *device = &dev->dev;

        return dev_get_drvdata(device);
}

/* Store @data as driver data on the struct device embedded in @dev. */
static inline void input_set_drvdata(struct input_dev *dev, void *data)
{
        struct device *device = &dev->dev;

        dev_set_drvdata(device, data);
}

/*
 * Device registration. The register call is marked __must_check, so its
 * return value may not be ignored by callers.
 */
int __must_check input_register_device(struct input_dev *);
void input_unregister_device(struct input_dev *);

void input_reset_device(struct input_dev *);

/*
 * Polling helpers: install a poll callback and set or query the poll
 * interval. The interval's unit is not visible from these prototypes.
 */
int input_setup_polling(struct input_dev *dev,
                        void (*poll_fn)(struct input_dev *dev));
void input_set_poll_interval(struct input_dev *dev, unsigned int interval);
void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval);
void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval);
int input_get_poll_interval(struct input_dev *dev);

/* Handler registration; the result must be checked (__must_check). */
int __must_check input_register_handler(struct input_handler *);
void input_unregister_handler(struct input_handler *);

/* Minor number allocation and release; the result must be checked. */
int __must_check input_get_new_minor(int legacy_base, unsigned int legacy_num,
                                     bool allow_dynamic);
void input_free_minor(unsigned int minor);

/* Invoke @fn with @data for handles of a handler (see the definition). */
int input_handler_for_each_handle(struct input_handler *, void *data,
                                  int (*fn)(struct input_handle *, void *));

/* Handle registration. */
int input_register_handle(struct input_handle *);
void input_unregister_handle(struct input_handle *);

/* Grab/release of a device through a handle. */
int input_grab_device(struct input_handle *);
void input_release_device(struct input_handle *);

/* Open/close of a device through a handle. */
int input_open_device(struct input_handle *);
void input_close_device(struct input_handle *);

int input_flush_device(struct input_handle *handle, struct file *file);

/* Timestamp accessors; the getter returns a pointer, not a value. */
void input_set_timestamp(struct input_dev *dev, ktime_t timestamp);
ktime_t *input_get_timestamp(struct input_dev *dev);

/*
 * Event submission. input_event() is what the input_report_*() and
 * input_sync() inline helpers in this header call.
 */
void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value);
void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value);

/*
 * Emit an EV_KEY event for @code on @dev.
 * @value is collapsed to 0 or 1 before being passed to input_event().
 */
static inline void input_report_key(struct input_dev *dev, unsigned int code, int value)
{
        const int pressed = value ? 1 : 0;

        input_event(dev, EV_KEY, code, pressed);
}

/* Emit an EV_REL event for @code on @dev; @value is passed through as-is. */
static inline void input_report_rel(struct input_dev *dev, unsigned int code, int value)
{
        const unsigned int type = EV_REL;

        input_event(dev, type, code, value);
}

/* Emit an EV_ABS event for @code on @dev; @value is passed through as-is. */
static inline void input_report_abs(struct input_dev *dev, unsigned int code, int value)
{
        const unsigned int type = EV_ABS;

        input_event(dev, type, code, value);
}

/* Emit an EV_FF_STATUS event for @code on @dev; @value is passed as-is. */
static inline void input_report_ff_status(struct input_dev *dev, unsigned int code, int value)
{
        const unsigned int type = EV_FF_STATUS;

        input_event(dev, type, code, value);
}

/*
 * Emit an EV_SW event for @code on @dev.
 * @value is collapsed to 0 or 1 before being passed to input_event().
 */
static inline void input_report_switch(struct input_dev *dev, unsigned int code, int value)
{
        const int state = value ? 1 : 0;

        input_event(dev, EV_SW, code, state);
}

/* Emit an EV_SYN / SYN_REPORT event with value 0 on @dev. */
static inline void input_sync(struct input_dev *dev)
{
        const unsigned int type = EV_SYN;
        const unsigned int code = SYN_REPORT;

        input_event(dev, type, code, 0);
}

/* Emit an EV_SYN / SYN_MT_REPORT event with value 0 on @dev. */
static inline void input_mt_sync(struct input_dev *dev)
{
        const unsigned int type = EV_SYN;
        const unsigned int code = SYN_MT_REPORT;

        input_event(dev, type, code, 0);
}

/* Declaration only: takes an event @type and @code for @dev. */
void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code);

/**
 * input_set_events_per_packet - hint the driver's event rate to handlers
 * @dev: input device the hint is recorded on
 * @n_events: average number of events between calls to input_sync()
 *
 * Drivers whose devices send an unusually large number of events per
 * packet should call this so that handlers can size their event buffers
 * accordingly and minimize information loss.
 *
 * The value is stored in @dev->hint_events_per_packet without validation.
 */
static inline void input_set_events_per_packet(struct input_dev *dev, int n_events)
{
        struct input_dev *target = dev;

        target->hint_events_per_packet = n_events;
}

/*
 * input_alloc_absinfo() is called by the generated input_abs_set_*()
 * helpers before they touch dev->absinfo; those helpers re-check
 * dev->absinfo afterwards, so it may still be NULL on return.
 */
void input_alloc_absinfo(struct input_dev *dev);
void input_set_abs_params(struct input_dev *dev, unsigned int axis,
                          int min, int max, int fuzz, int flat);

/*
 * INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item) emits a getter/setter pair
 * for one field of the per-axis dev->absinfo[] entries:
 *
 *   input_abs_get_<_suffix>() returns absinfo[axis].<_item>, or 0 while
 *   dev->absinfo is NULL.
 *
 *   input_abs_set_<_suffix>() first calls input_alloc_absinfo() and stores
 *   the value only if dev->absinfo is non-NULL afterwards.
 *
 * @axis is used as an array index without any range check.
 */
#define INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item)                        \
static inline int input_abs_get_##_suffix(struct input_dev *dev,        \
                                          unsigned int axis)                \
{                                                                        \
        if (!dev->absinfo)                                                \
                return 0;                                                \
        return dev->absinfo[axis]._item;                                \
}                                                                        \
                                                                        \
static inline void input_abs_set_##_suffix(struct input_dev *dev,        \
                                           unsigned int axis, int val)        \
{                                                                        \
        input_alloc_absinfo(dev);                                        \
        if (!dev->absinfo)                                                \
                return;                                                        \
        dev->absinfo[axis]._item = val;                                        \
}

/* One accessor pair per absinfo field: (function suffix, field name). */
INPUT_GENERATE_ABS_ACCESSORS(val, value)
INPUT_GENERATE_ABS_ACCESSORS(min, minimum)
INPUT_GENERATE_ABS_ACCESSORS(max, maximum)
INPUT_GENERATE_ABS_ACCESSORS(fuzz, fuzz)
INPUT_GENERATE_ABS_ACCESSORS(flat, flat)
INPUT_GENERATE_ABS_ACCESSORS(res, resolution)

/* Keymap helpers operating on struct input_keymap_entry. */
int input_scancode_to_scalar(const struct input_keymap_entry *ke,
                             unsigned int *scancode);

int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke);
int input_set_keycode(struct input_dev *dev,
                      const struct input_keymap_entry *ke);

/* Neither argument is modified (both are const-qualified). */
bool input_match_device_id(const struct input_dev *dev,
                           const struct input_device_id *id);

void input_enable_softrepeat(struct input_dev *dev, int delay, int period);

/* Declared here; the definition is not in this header. */
extern struct class input_class;

/**
 * struct ff_device - force-feedback part of an input device
 * @upload: Called to upload a new effect into device
 * @erase: Called to erase an effect from device
 * @playback: Called to request device to start playing specified effect
 * @set_gain: Called to set specified gain
 * @set_autocenter: Called to auto-center device
 * @destroy: called by input core when parent input device is being
 *        destroyed
 * @private: driver-specific data, will be freed automatically
 * @ffbit: bitmap of force feedback capabilities truly supported by
 *        device (not emulated like ones in input_dev->ffbit)
 * @mutex: mutex for serializing access to the device
 * @max_effects: maximum number of effects supported by device
 * @effects: pointer to an array of effects currently loaded into device
 * @effect_owners: array of effect owners; when file handle owning
 *        an effect gets closed the effect is automatically erased
 *
 * Every force-feedback device must implement upload() and playback()
 * methods; erase() is optional. set_gain() and set_autocenter() need
 * only be implemented if driver sets up FF_GAIN and FF_AUTOCENTER
 * bits.
 *
 * Note that playback(), set_gain() and set_autocenter() are called with
 * dev->event_lock spinlock held and interrupts off and thus may not
 * sleep.
 */
struct ff_device {
        int (*upload)(struct input_dev *dev, struct ff_effect *effect,
                      struct ff_effect *old);
        int (*erase)(struct input_dev *dev, int effect_id);

        int (*playback)(struct input_dev *dev, int effect_id, int value);
        void (*set_gain)(struct input_dev *dev, u16 gain);
        void (*set_autocenter)(struct input_dev *dev, u16 magnitude);

        void (*destroy)(struct ff_device *);

        void *private;

        unsigned long ffbit[BITS_TO_LONGS(FF_CNT)];

        struct mutex mutex;

        int max_effects;
        struct ff_effect *effects;
        /*
         * Flexible array member, so it must stay the last field.
         * Presumably sized from max_effects at allocation time -- confirm
         * against input_ff_create().
         */
        struct file *effect_owners[];
};

/* Creation/teardown of the force-feedback part of an input device. */
int input_ff_create(struct input_dev *dev, unsigned int max_effects);
void input_ff_destroy(struct input_dev *dev);

int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value);

/* Per-file effect management; @file identifies the caller's file handle. */
int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file);
int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
int input_ff_flush(struct input_dev *dev, struct file *file);

/* Variant taking a driver @data pointer and a play_effect() callback. */
int input_ff_create_memless(struct input_dev *dev, void *data,
                int (*play_effect)(struct input_dev *, void *, struct ff_effect *));

#endif
















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FAT_H
#define _FAT_H

#include <linux/buffer_head.h>
#include <linux/nls.h>
#include <linux/hash.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>

/*
 * vfat shortname flags
 * (presumably the values kept in fat_mount_options.shortname, which is
 * documented as "flags for shortname display/create rule")
 */
#define VFAT_SFN_DISPLAY_LOWER        0x0001 /* convert to lowercase for display */
#define VFAT_SFN_DISPLAY_WIN95        0x0002 /* emulate win95 rule for display */
#define VFAT_SFN_DISPLAY_WINNT        0x0004 /* emulate winnt rule for display */
#define VFAT_SFN_CREATE_WIN95        0x0100 /* emulate win95 rule for create */
#define VFAT_SFN_CREATE_WINNT        0x0200 /* emulate winnt rule for create */

/* Error policies (cf. fat_mount_options.errors) */
#define FAT_ERRORS_CONT                1      /* ignore error and continue */
#define FAT_ERRORS_PANIC        2      /* panic on error */
#define FAT_ERRORS_RO                3      /* remount r/o on error */

/* NFS export modes (cf. fat_mount_options.nfs) */
#define FAT_NFS_STALE_RW        1      /* NFS RW support, can cause ESTALE */
#define FAT_NFS_NOSTALE_RO        2      /* NFS RO support, no ESTALE issue */

/*
 * Mount options of a FAT file system. An instance is embedded in
 * struct msdos_sb_info as ->options.
 */
struct fat_mount_options {
        kuid_t fs_uid;
        kgid_t fs_gid;
        unsigned short fs_fmask;   /* masked out of file modes (see fat_make_mode()) */
        unsigned short fs_dmask;   /* masked out of directory modes (see fat_make_mode()) */
        unsigned short codepage;   /* Codepage for shortname conversions */
        int time_offset;           /* Offset of timestamps from UTC (in minutes) */
        char *iocharset;           /* Charset used for filename input/display */
        unsigned short shortname;  /* flags for shortname display/create rule */
        unsigned char name_check;  /* r = relaxed, n = normal, s = strict */
        unsigned char errors;           /* On error: continue, panic, remount-ro */
        unsigned char nfs;          /* NFS support: nostale_ro, stale_rw */
        unsigned short allow_utime;/* permission for setting the [am]time */
        unsigned quiet:1,          /* set = fake successful chmods and chowns */
                 showexec:1,       /* set = only set x bit for com/exe/bat */
                 sys_immutable:1,  /* set = system files are immutable */
                 dotsOK:1,         /* set = hidden and system files are named '.filename' */
                 isvfat:1,         /* 0=no vfat long filename support, 1=vfat support */
                 utf8:1,           /* Use of UTF-8 character set (Default) */
                 unicode_xlate:1,  /* create escape sequences for unhandled Unicode */
                 numtail:1,        /* Does first alias have a numeric '~1' type tail? */
                 flush:1,           /* write things quickly */
                 nocase:1,           /* Does this need case conversion? 0=need case conversion*/
                 usefree:1,           /* Use free_clusters for FAT32 */
                 tz_set:1,           /* Filesystem timestamps' offset set */
                 rodir:1,           /* allow ATTR_RO for directory */
                 discard:1,           /* Issue discard requests on deletions */
                 dos1xfloppy:1;           /* Assume default BPB for DOS 1.x floppies */
};

/*
 * The inode and directory hash tables in struct msdos_sb_info each have
 * FAT_HASH_SIZE (1 << FAT_HASH_BITS) buckets; fat_dir_hash() hashes down
 * to FAT_HASH_BITS bits.
 */
#define FAT_HASH_BITS        8
#define FAT_HASH_SIZE        (1UL << FAT_HASH_BITS)

/*
 * MS-DOS file system in-core superblock data
 *
 * MSDOS_SB() obtains a pointer to this structure from sb->s_fs_info.
 */
struct msdos_sb_info {
        unsigned short sec_per_clus;  /* sectors/cluster */
        unsigned short cluster_bits;  /* log2(cluster_size) */
        unsigned int cluster_size;    /* cluster size */
        unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */
        unsigned short fat_start;     /* FAT start (sec.) */
        unsigned long fat_length;     /* FAT length (sec.) */
        unsigned long dir_start;      /* root dir start */
        unsigned short dir_entries;   /* root dir entries */
        unsigned long data_start;     /* first data sector */
        unsigned long max_cluster;    /* maximum cluster number */
        unsigned long root_cluster;   /* first cluster of the root directory */
        unsigned long fsinfo_sector;  /* sector number of FAT32 fsinfo */
        struct mutex fat_lock;
        struct mutex nfs_build_inode_lock;
        struct mutex s_lock;
        unsigned int prev_free;      /* previously allocated cluster number */
        unsigned int free_clusters;  /* -1 if undefined */
        unsigned int free_clus_valid; /* is free_clusters valid? */
        struct fat_mount_options options;
        struct nls_table *nls_disk;   /* Codepage used on disk */
        struct nls_table *nls_io;     /* Charset used for input and display */
        const void *dir_ops;              /* Opaque; default directory operations */
        int dir_per_block;              /* dir entries per block */
        int dir_per_block_bits;              /* log2(dir_per_block) */
        unsigned int vol_id;                /* volume ID */

        int fatent_shift;
        const struct fatent_operations *fatent_ops;
        struct inode *fat_inode;
        struct inode *fsinfo_inode;

        /* state passed to __ratelimit() by the *_ratelimit() macros below */
        struct ratelimit_state ratelimit;

        /* also taken by fat_i_pos_read() when BITS_PER_LONG == 32 */
        spinlock_t inode_hash_lock;
        struct hlist_head inode_hashtable[FAT_HASH_SIZE];

        spinlock_t dir_hash_lock;
        struct hlist_head dir_hashtable[FAT_HASH_SIZE];

        unsigned int dirty;           /* fs state before mount */
        struct rcu_head rcu;
};

/* NOTE(review): presumably compared against cache_valid_id below -- confirm */
#define FAT_CACHE_VALID        0        /* special case for valid cache */

/*
 * MS-DOS file system inode data in memory
 *
 * The VFS inode is embedded as ->vfs_inode; MSDOS_I() maps a struct inode
 * pointer back to its containing msdos_inode_info.
 */
struct msdos_inode_info {
        spinlock_t cache_lru_lock;
        struct list_head cache_lru;
        int nr_caches;
        /* for avoiding the race between fat_free() and fat_get_cluster() */
        unsigned int cache_valid_id;

        /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
        loff_t mmu_private;        /* physically allocated size */

        int i_start;                /* first cluster or 0 */
        int i_logstart;                /* logical first cluster */
        int i_attrs;                /* unused attribute bits */
        loff_t i_pos;                /* on-disk position of directory entry or 0 */
        struct hlist_node i_fat_hash;        /* hash by i_location */
        struct hlist_node i_dir_hash;        /* hash by i_logstart */
        struct rw_semaphore truncate_lock; /* protect bmap against truncate */
        struct inode vfs_inode;        /* embedded VFS inode, see MSDOS_I() */
};

/*
 * Describes the directory entry (and its slots) that a lookup or add
 * operation refers to; taken as the sinfo argument by the fat/dir.c
 * functions declared below.
 */
struct fat_slot_info {
        loff_t i_pos;                /* on-disk position of directory entry */
        loff_t slot_off;        /* offset for slot or de start */
        int nr_slots;                /* number of slots + 1(de) in filename */
        struct msdos_dir_entry *de;
        struct buffer_head *bh;
};

/* Return the FAT superblock info stored in @sb->s_fs_info. */
static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
{
        struct msdos_sb_info *sbi = sb->s_fs_info;

        return sbi;
}

/*
 * Functions that determine the variant of the FAT file system (i.e.,
 * whether this is FAT12, FAT16 or FAT32).
 */

/* True when @sbi->fat_bits is 12. */
static inline bool is_fat12(const struct msdos_sb_info *sbi)
{
        const unsigned int bits = sbi->fat_bits;

        return bits == 12;
}

/* True when @sbi->fat_bits is 16. */
static inline bool is_fat16(const struct msdos_sb_info *sbi)
{
        const unsigned int bits = sbi->fat_bits;

        return bits == 16;
}

/* True when @sbi->fat_bits is 32. */
static inline bool is_fat32(const struct msdos_sb_info *sbi)
{
        const unsigned int bits = sbi->fat_bits;

        return bits == 32;
}

/*
 * Maximum number of clusters for the FAT variant of @sb:
 * MAX_FAT32 for FAT32, MAX_FAT16 for FAT16, MAX_FAT12 otherwise.
 */
static inline u32 max_fat(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);

        if (is_fat32(sbi))
                return MAX_FAT32;
        if (is_fat16(sbi))
                return MAX_FAT16;
        return MAX_FAT12;
}

/*
 * Map a VFS inode to the msdos_inode_info that embeds it as ->vfs_inode.
 */
static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
{
        struct msdos_inode_info *info;

        info = container_of(inode, struct msdos_inode_info, vfs_inode);
        return info;
}

/*
 * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
 * save ATTR_RO instead of ->i_mode.
 *
 * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
 * bit, it's just used as flag for app.
 */
static inline int fat_mode_can_hold_ro(struct inode *inode)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        umode_t mask;

        if (S_ISDIR(inode->i_mode)) {
                if (!sbi->options.rodir)
                        return 0;
                mask = ~sbi->options.fs_dmask;
        } else
                mask = ~sbi->options.fs_fmask;

        if (!(mask & S_IWUGO))
                return 0;
        return 1;
}

/*
 * Convert attribute bits and a mask to the UNIX mode.
 *
 * The write bits are dropped when ATTR_RO is set, unless the entry is a
 * directory and the rodir option is off. The result then has fs_dmask
 * (directories) or fs_fmask (files) masked out and S_IFDIR/S_IFREG set.
 */
static inline umode_t fat_make_mode(struct msdos_sb_info *sbi,
                                   u8 attrs, umode_t mode)
{
        const bool is_dir = attrs & ATTR_DIR;
        const bool read_only = attrs & ATTR_RO;

        /* !(dir && !rodir) rewritten as (!dir || rodir) */
        if (read_only && (!is_dir || sbi->options.rodir))
                mode &= ~S_IWUGO;

        if (is_dir) {
                mode &= ~sbi->options.fs_dmask;
                return mode | S_IFDIR;
        }

        mode &= ~sbi->options.fs_fmask;
        return mode | S_IFREG;
}

/*
 * Return the FAT attribute byte for this inode: the saved ->i_attrs bits,
 * plus ATTR_DIR for directories, plus ATTR_RO when the mode can hold the
 * read-only state and no write bit is set in ->i_mode.
 */
static inline u8 fat_make_attrs(struct inode *inode)
{
        u8 result = MSDOS_I(inode)->i_attrs;

        if (S_ISDIR(inode->i_mode))
                result |= ATTR_DIR;

        if (fat_mode_can_hold_ro(inode)) {
                if (!(inode->i_mode & S_IWUGO))
                        result |= ATTR_RO;
        }

        return result;
}

/*
 * Store the attribute bits of @attrs that ->i_mode does not represent
 * into ->i_attrs. ATTR_RO is kept there only when the mode cannot hold it
 * (see fat_mode_can_hold_ro()).
 */
static inline void fat_save_attrs(struct inode *inode, u8 attrs)
{
        int keep = ATTR_UNUSED;

        if (!fat_mode_can_hold_ro(inode))
                keep |= ATTR_RO;

        MSDOS_I(inode)->i_attrs = attrs & keep;
}

/*
 * Checksum over the 11 bytes name[0..10]: starting from name[0], each step
 * rotates the 8-bit sum right by one bit and adds the next byte, with the
 * result truncated to unsigned char.
 */
static inline unsigned char fat_checksum(const __u8 *name)
{
        unsigned char sum = name[0];
        int i;

        for (i = 1; i <= 10; i++)
                sum = (sum << 7) + (sum >> 1) + name[i];

        return sum;
}

static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
{
        return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
                + sbi->data_start;
}

/*
 * Split @i_pos into a block number (*@blknr, the bits above
 * dir_per_block_bits) and an entry index inside that block (*@offset,
 * @i_pos masked with dir_per_block - 1).
 */
static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi,
                                loff_t i_pos, sector_t *blknr, int *offset)
{
        const int shift = sbi->dir_per_block_bits;
        const int mask = sbi->dir_per_block - 1;

        *blknr = i_pos >> shift;
        *offset = i_pos & mask;
}

/*
 * Read ->i_pos of @inode. When BITS_PER_LONG == 32 the read is done under
 * sbi->inode_hash_lock; otherwise it is a plain load.
 * NOTE(review): presumably because a 64-bit loff_t load is not a single
 * access on 32-bit builds -- confirm.
 */
static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
                                        struct inode *inode)
{
        loff_t pos;

#if BITS_PER_LONG == 32
        spin_lock(&sbi->inode_hash_lock);
        pos = MSDOS_I(inode)->i_pos;
        spin_unlock(&sbi->inode_hash_lock);
#else
        pos = MSDOS_I(inode)->i_pos;
#endif
        return pos;
}

/*
 * Copy @len 16-bit units from the byte stream @src into @dst.
 * With __BIG_ENDIAN each unit is assembled from two bytes, low byte
 * first; otherwise the bytes are copied verbatim with memcpy().
 */
static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
{
#ifdef __BIG_ENDIAN
        size_t i;

        for (i = 0; i < len; i++)
                dst[i] = src[2 * i] | (src[2 * i + 1] << 8);
#else
        memcpy(dst, src, len * 2);
#endif
}

/*
 * Return the start cluster recorded in directory entry @de: the 16-bit
 * ->start field, combined with ->starthi as the upper 16 bits on FAT32
 * only.
 */
static inline int fat_get_start(const struct msdos_sb_info *sbi,
                                const struct msdos_dir_entry *de)
{
        const int low = le16_to_cpu(de->start);

        if (!is_fat32(sbi))
                return low;

        return low | (le16_to_cpu(de->starthi) << 16);
}

static inline void fat_set_start(struct msdos_dir_entry *de, int cluster)
{
        de->start   = cpu_to_le16(cluster);
        de->starthi = cpu_to_le16(cluster >> 16);
}

/*
 * Write @len wide characters from @src to the byte stream @dst as 16-bit
 * units. With __BIG_ENDIAN each unit is split into low byte then high
 * byte; otherwise the bytes are copied verbatim with memcpy().
 */
static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
{
#ifdef __BIG_ENDIAN
        size_t i;

        for (i = 0; i < len; i++) {
                dst[2 * i] = src[i] & 0x00FF;
                dst[2 * i + 1] = (src[i] & 0xFF00) >> 8;
        }
#else
        memcpy(dst, src, len * 2);
#endif
}

/* fat/cache.c -- cluster cache and block mapping entry points */
extern void fat_cache_inval_inode(struct inode *inode);
/* results are returned through @fclus and @dclus */
extern int fat_get_cluster(struct inode *inode, int cluster,
                           int *fclus, int *dclus);
/* results are returned through @mapped_blocks and @bmap */
extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
                                  sector_t last_block,
                                  unsigned long *mapped_blocks, sector_t *bmap);
/* results are returned through @phys and @mapped_blocks */
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
                    unsigned long *mapped_blocks, int create, bool from_bmap);

/*
 * fat/dir.c -- directory operations. Functions taking a struct
 * fat_slot_info pointer use it to describe the directory entry involved.
 */
extern const struct file_operations fat_dir_operations;
extern int fat_search_long(struct inode *inode, const unsigned char *name,
                           int name_len, struct fat_slot_info *sinfo);
extern int fat_dir_empty(struct inode *dir);
extern int fat_subdirs(struct inode *dir);
extern int fat_scan(struct inode *dir, const unsigned char *name,
                    struct fat_slot_info *sinfo);
extern int fat_scan_logstart(struct inode *dir, int i_logstart,
                             struct fat_slot_info *sinfo);
/* results are returned through @bh and @de */
extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
                                struct msdos_dir_entry **de);
extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts);
extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
                           struct fat_slot_info *sinfo);
extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);

/* fat/fatent.c */
/*
 * Cursor for one FAT entry. Initialise with fatent_init(), select an entry
 * with fatent_set_entry(), and drop the buffer heads with fatent_brelse().
 */
struct fat_entry {
        int entry;                        /* entry number this cursor refers to */
        union {
                u8 *ent12_p[2];                /* used for 12-bit entries */
                __le16 *ent16_p;        /* used for 16-bit entries */
                __le32 *ent32_p;        /* used for 32-bit entries */
        } u;
        int nr_bhs;                        /* number of valid pointers in bhs[] */
        struct buffer_head *bhs[2];
        struct inode *fat_inode;
};

/*
 * Reset @fatent to its empty state: entry 0, no buffer heads, NULL entry
 * pointer (via u.ent32_p) and NULL fat_inode.
 */
static inline void fatent_init(struct fat_entry *fatent)
{
        fatent->entry = 0;
        fatent->u.ent32_p = NULL;

        fatent->nr_bhs = 0;
        fatent->bhs[1] = NULL;
        fatent->bhs[0] = NULL;

        fatent->fat_inode = NULL;
}

/*
 * Point @fatent at entry number @entry and clear u.ent32_p. Buffer heads
 * already held by @fatent are left untouched.
 */
static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
{
        fatent->u.ent32_p = NULL;
        fatent->entry = entry;
}

/*
 * Call brelse() on each of the first nr_bhs buffer heads of @fatent, then
 * clear the entry pointer, the buffer head slots and fat_inode. The
 * ->entry number is not changed.
 */
static inline void fatent_brelse(struct fat_entry *fatent)
{
        int idx = 0;

        fatent->u.ent32_p = NULL;

        while (idx < fatent->nr_bhs) {
                brelse(fatent->bhs[idx]);
                idx++;
        }

        fatent->nr_bhs = 0;
        fatent->bhs[1] = NULL;
        fatent->bhs[0] = NULL;
        fatent->fat_inode = NULL;
}

/*
 * True when @entry lies in the range [FAT_START_ENT, sbi->max_cluster).
 */
static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry)
{
        if (entry < FAT_START_ENT)
                return false;

        return entry < sbi->max_cluster;
}

/*
 * FAT entry access, cluster allocation and freeing (these follow the
 * fat/fatent.c section comment above).
 */
extern void fat_ent_access_init(struct super_block *sb);
extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
                        int entry);
extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
                         int new, int wait);
/* the allocated cluster(s) are returned through @cluster */
extern int fat_alloc_clusters(struct inode *inode, int *cluster,
                              int nr_cluster);
extern int fat_free_clusters(struct inode *inode, int cluster);
extern int fat_count_free_clusters(struct super_block *sb);
extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range);

/* fat/file.c -- regular file operations and attribute handling */
extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
                              unsigned long arg);
extern const struct file_operations fat_file_operations;
extern const struct inode_operations fat_file_inode_operations;
extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
/* the attributes are returned through @stat */
extern int fat_getattr(const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int flags);
extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
                          int datasync);

/* fat/inode.c -- inode lookup/creation and superblock setup */
extern int fat_block_truncate_page(struct inode *inode, loff_t from);
/* attach/detach take or drop an inode's association with an i_pos */
extern void fat_attach(struct inode *inode, loff_t i_pos);
extern void fat_detach(struct inode *inode);
extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
extern struct inode *fat_build_inode(struct super_block *sb,
                        struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
/* @setup is a caller-supplied callback taking the superblock */
extern int fat_fill_super(struct super_block *sb, void *data, int silent,
                          int isvfat, void (*setup)(struct super_block *));
extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);

extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
/* Hash @logstart down to FAT_HASH_BITS bits with hash_32(). */
static inline unsigned long fat_dir_hash(int logstart)
{
        const unsigned long bucket = hash_32(logstart, FAT_HASH_BITS);

        return bucket;
}
extern int fat_add_cluster(struct inode *inode);

/* fat/misc.c */
/* printf-style: the format string is argument 3, its arguments start at 4 */
extern __printf(3, 4) __cold
void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
/* fat_fs_error(): calls __fat_fs_error() with report == 1 */
#define fat_fs_error(sb, fmt, args...)                \
        __fat_fs_error(sb, 1, fmt , ## args)
/*
 * fat_fs_error_ratelimit(): passes the result of __ratelimit() on the
 * per-superblock ratelimit state as the report argument.
 */
#define fat_fs_error_ratelimit(sb, fmt, args...) \
        __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
/* printf-style: the format string is argument 3, its arguments start at 4 */
__printf(3, 4) __cold
void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
/*
 * fat_msg_ratelimit(): calls fat_msg() only when __ratelimit() on the
 * per-superblock ratelimit state returns nonzero.
 */
#define fat_msg_ratelimit(sb, level, fmt, args...)        \
        do {        \
                        if (__ratelimit(&MSDOS_SB(sb)->ratelimit))        \
                                fat_msg(sb, level, fmt, ## args);        \
         } while (0)
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
/* converts on-disk time/date fields into @ts */
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
                              __le16 __time, __le16 __date, u8 time_cs);
/* converts @ts into on-disk fields, returned through @time, @date, @time_cs */
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
                              __le16 *time, __le16 *date, u8 *time_cs);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
                             int flags);
extern int fat_update_time(struct inode *inode, struct timespec64 *now,
                           int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);

/* Setup and teardown of the FAT cache (declarations only). */
int fat_cache_init(void);
void fat_cache_destroy(void);

/* fat/nfs.c -- two export_operations tables (cf. the FAT_NFS_* modes) */
extern const struct export_operations fat_export_ops;
extern const struct export_operations fat_export_ops_nostale;

/* helper for printk: short name for casting values to unsigned long long */
typedef unsigned long long        llu;

#endif /* !_FAT_H */







































































































    1 


















































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_ATOMIC)

/*
 * The GFP flags allowed during early boot: every bit of __GFP_BITS_MASK
 * except __GFP_RECLAIM, __GFP_IO and __GFP_FS.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: __GFP_DMA32, __GFP_HIGHMEM and
 * any bit outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/* Declaration only; the definition is outside this header. */
void page_writeback_init(void);

/*
 * This is a file-backed mapping, and is about to be memory mapped - invoke its
 * mmap hook and safely handle error conditions. On error, VMA hooks will be
 * mutated.
 *
 * @file: File which backs the mapping.
 * @vma:  VMA which we are mapping.
 *
 * Returns: 0 on success, an error otherwise.
 */
int mmap_file(struct file *file, struct vm_area_struct *vma);

/*
 * If the VMA has a close hook then close it, and since closing it might leave
 * it in an inconsistent state which makes the use of any hooks suspect, clear
 * them down by installing dummy empty hooks.
 */
void vma_close(struct vm_area_struct *vma);

/* Fault-handling entry point; takes and returns fault state via @vmf. */
vm_fault_t do_swap_page(struct vm_fault *vmf);

/* Declaration only; @floor and @ceiling bound the operation. */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);

/*
 * True unless @vma has any of VM_LOCKED, VM_HUGETLB or VM_PFNMAP set in
 * its vm_flags.
 */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return false;

        return true;
}

void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);

void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
                unsigned long lookahead_size);
void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
                unsigned long nr);
/*
 * Convenience wrapper around force_page_cache_ra(): declare a readahead
 * control named ractl from @file, @mapping and @index, then pass it on
 * together with the file's own readahead state (&file->f_ra) and
 * @nr_to_read, unchanged.
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(ractl, file, mapping, index);
        force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}

struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);

/**
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Decide whether @page should sit on the active/inactive lists (evictable)
 * or on the unevictable list.
 *
 * A page is reported as not evictable when either:
 * (1) its mapping is marked unevictable, or
 * (2) it is part of an mlocked VMA (PageMlocked() is set).
 *
 */
static inline bool page_evictable(struct page *page)
{
        bool evictable;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        evictable = !(mapping_unevictable(page_mapping(page)) ||
                      PageMlocked(page));
        rcu_read_unlock();

        return evictable;
}

/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one.
 */
static inline void set_page_refcounted(struct page *page)
{
        /* Debug check: @page must not be a tail page ... */
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* ... and its reference count must currently read as zero. */
        VM_BUG_ON_PAGE(page_ref_count(page), page);
        /* Hand out the first reference. */
        set_page_count(page, 1);
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);

/*
 * in mm/rmap.c:
 */
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages_nodemask() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages_nodemask() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory on lower zone than the highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone since higher zone than this index cannot be
         * usable for this allocation request.
         */
        enum zone_type highest_zoneidx;
        bool spread_dirty_pages;
};

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 *
 * @page_pfn: pfn of the block whose buddy is wanted.
 * @order:    order of that block.
 *
 * Returns the pfn of the order-@order buddy, i.e. @page_pfn with bit
 * @order flipped.
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        /*
         * The shifted constant must be as wide as the pfn: a plain
         * "1 << order" is computed in int, so order 31 would sign-extend
         * into the upper pfn bits on 64-bit and larger orders would be
         * undefined behaviour.
         */
        return page_pfn ^ (1UL << order);
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

/*
 * Resolve @start_pfn to its struct page.  A zone flagged contiguous is
 * translated directly; any other zone is handed to
 * __pageblock_pfn_to_page() along with the full pfn range.
 */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        return pfn_to_page(start_pfn);
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
                                        gfp_t gfp_flags);
extern int user_min_free_kbytes;

extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
        struct list_head freepages;        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        unsigned long migrate_pfn;        /* isolate_migratepages search base */
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        struct zone *zone;
        unsigned long total_migrate_scanned;
        unsigned long total_free_scanned;
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock or sched contention */
        bool rescan;                        /* Rescanning the same pageblock */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        struct compact_control *cc;
        struct page *page;
};

unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
unsigned long
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);
int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool only_stealable, bool *can_steal);

#endif

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        /*
         * The order is whatever page_private() holds for @page.
         * PageBuddy() must be checked by the caller.
         */
        unsigned int order = page_private(page);

        return order;
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/* A mapping counts as COW when VM_MAYWRITE is set and VM_SHARED is not. */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        if (flags & VM_SHARED)
                return false;

        return (flags & VM_MAYWRITE) == VM_MAYWRITE;
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        /* Writable or stack mappings are never accounted as code. */
        if (flags & (VM_WRITE | VM_STACK))
                return false;

        return (flags & VM_EXEC) == VM_EXEC;
}

/*
 * Stack area - automatically grows in one direction
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        /* Every bit of VM_STACK has to be present in @flags. */
        vm_flags_t stack_bits = flags & VM_STACK;

        return stack_bits == VM_STACK;
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        /* Shared or stack mappings are never accounted as data. */
        if (flags & (VM_SHARED | VM_STACK))
                return false;

        return (flags & VM_WRITE) == VM_WRITE;
}

/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev);
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);

#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
/* Munlock the whole of @vma, from its vm_start to its vm_end. */
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
        unsigned long start = vma->vm_start;
        unsigned long end = vma->vm_end;

        munlock_vma_pages_range(vma, start, end);
}

/*
 * must be called with vma's mmap_lock held for read or write, and page locked.
 */
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);

/*
 * Clear the page's PageMlocked().  This can be useful in a situation where
 * we want to unconditionally remove a page from the pagecache -- e.g.,
 * on truncation or freeing.
 *
 * It is legal to call this function for any page, mlocked or not.
 * If called for a page that is still mapped by mlocked vmas, all we do
 * is revert to lazy LRU behaviour -- semantics are not broken.
 */
extern void clear_page_mlock(struct page *page);

/*
 * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
 * (because that does not go through the full procedure of migration ptes):
 * to migrate the Mlocked page flag; update statistics.
 */
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
        int nr_pages;

        /* Nothing to carry over unless the old page had Mlocked set. */
        if (!TestClearPageMlocked(page))
                return;

        nr_pages = thp_nr_pages(page);

        /* Holding pmd lock, no change in irq context: __mod is safe */
        __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
        SetPageMlocked(newpage);
        __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
}

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/*
 * At what user virtual address is page expected in vma?
 * Returns -EFAULT if all of the page is outside the range of vma.
 * If page is a compound head, the entire compound page is considered.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t first;
        unsigned long uaddr;

        VM_BUG_ON_PAGE(PageKsm(page), page);        /* KSM page->index unusable */
        first = page_to_pgoff(page);

        if (first < vma->vm_pgoff) {
                /*
                 * The page begins before the vma.  Only a compound head
                 * whose last subpage reaches vm_pgoff still overlaps it.
                 * Being in this branch avoids possibility of wrap to 0 on
                 * 32-bit in the sum below.
                 */
                if (PageHead(page) &&
                    first + compound_nr(page) - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        uaddr = vma->vm_start + ((first - vma->vm_pgoff) << PAGE_SHIFT);
        /* Check for address beyond vma (or wrapped through 0?) */
        if (uaddr < vma->vm_start || uaddr >= vma->vm_end)
                return -EFAULT;

        return uaddr;
}

/*
 * Then at what user virtual address will none of the page be found in vma?
 * Assumes that vma_address() already returned a good starting address.
 * If page is a compound head, the entire compound page is considered.
 */
static inline unsigned long
vma_address_end(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t end_pgoff;
        unsigned long end_addr;

        VM_BUG_ON_PAGE(PageKsm(page), page);        /* KSM page->index unusable */
        end_pgoff = page_to_pgoff(page) + compound_nr(page);
        end_addr = vma->vm_start +
                   ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Inside the vma (and not wrapped through 0): use it as computed. */
        if (end_addr >= vma->vm_start && end_addr <= vma->vm_end)
                return end_addr;

        /* Otherwise clamp to the end of the vma. */
        return vma->vm_end;
}

/*
 * Returns @fpin unchanged when it is already set or when the fault flags do
 * not permit dropping the lock.  Otherwise takes a reference on the vma's
 * file, releases mmap_lock for read and returns that file.
 */
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        /* A file is already pinned: hand it straight back. */
        if (fpin)
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
         * anything, so we only pin the file and drop the mmap_lock if only
         * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
         */
        if (!fault_flag_allow_retry_first(flags))
                return fpin;
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return fpin;

        fpin = get_file(vmf->vma->vm_file);
        mmap_read_unlock(vmf->vma->vm_mm);

        return fpin;
}

#else /* !CONFIG_MMU */
/* Empty stand-ins for the mlock helpers, used when CONFIG_MMU is not set. */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }

#endif /* !CONFIG_MMU */

/*
 * Return the mem_map entry representing the 'offset' subpage within
 * the maximally aligned gigantic page 'base'.  Handle any discontiguity
 * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
 */
static inline struct page *mem_map_offset(struct page *base, int offset)
{
        /* Offsets below MAX_ORDER_NR_PAGES use plain pointer arithmetic. */
        if (!unlikely(offset >= MAX_ORDER_NR_PAGES))
                return base + offset;

        /* Larger offsets go through nth_page(). */
        return nth_page(base, offset);
}

/*
 * Iterator over all subpages within the maximally aligned gigantic
 * page 'base'.  Handle any discontiguity in the mem_map.
 */
static inline struct page *mem_map_next(struct page *iter,
                                                struct page *base, int offset)
{
        unsigned long pfn;

        /* Not on a MAX_ORDER_NR_PAGES boundary: simply step past @iter. */
        if (!unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0))
                return iter + 1;

        /* On a boundary: recompute from @base and validate the pfn. */
        pfn = page_to_pfn(base) + offset;
        if (pfn_valid(pfn))
                return pfn_to_page(pfn);

        return NULL;
}

/* Memory initialisation debug and verification */
enum mminit_level {
        MMINIT_WARNING,
        MMINIT_VERIFY,
        MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * Print an "mminit::<prefix> " message when @level is below
 * mminit_loglevel: through pr_warn() for MMINIT_WARNING and below, through
 * printk(KERN_DEBUG ...) otherwise.
 *
 * @level is parenthesised so that an expression argument cannot
 * re-associate with the comparisons.  Note that it is still evaluated
 * twice, so it must not have side effects.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if ((level) < mminit_loglevel) { \
                if ((level) <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg);        \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

/*
 * CONFIG_DEBUG_MEMORY_INIT is not set: the debug print and both
 * verification hooks are empty inline functions.
 */
static inline void mminit_dprintk(enum mminit_level level,
                                const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
#if defined(CONFIG_SPARSEMEM)
extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
                                unsigned long *end_pfn);
#else
/* Without CONFIG_SPARSEMEM this does nothing; both pfns are left untouched. */
static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
                                unsigned long *end_pfn)
{
}
#endif /* CONFIG_SPARSEMEM */

#define NODE_RECLAIM_NOSCAN        -2
#define NODE_RECLAIM_FULL        -1
#define NODE_RECLAIM_SOME        0
#define NODE_RECLAIM_SUCCESS        1

#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
/*
 * Without CONFIG_NUMA this stand-in ignores its arguments and always
 * reports NODE_RECLAIM_NOSCAN.
 */
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}
#endif

extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS        0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

#define ALLOC_HARDER                 0x10 /* try to alloc harder */
#define ALLOC_HIGH                 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET                 0x40 /* check for correct cpuset */
#define ALLOC_CMA                 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT          0x0
#endif
#define ALLOC_KSWAPD                0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/*
 * CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set: the batched-flush
 * entry points are empty inline functions.
 */
static inline void try_to_unmap_flush(void)
{
}
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

/* True only for the MIGRATE_HIGHATOMIC migratetype. */
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
        switch (migratetype) {
        case MIGRATE_HIGHATOMIC:
                return true;
        default:
                return false;
        }
}

/* True when the pageblock migratetype of @page is MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic_page(struct page *page)
{
        if (get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
                return false;

        return true;
}

void setup_zone_pageset(struct zone *zone);

struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;
        gfp_t gfp_mask;
};

#endif        /* __MM_INTERNAL_H */








































































































































































































































































































































































































































































































































































































































































































































































    2 









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_H
#define _ASM_X86_PROCESSOR_H

#include <asm/processor-flags.h>

/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct io_bitmap;
struct vm86;

#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>

#include <linux/personality.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/mem_encrypt.h>

/*
 * We handle most unaligned accesses in hardware.  On the other hand
 * unaligned DMA can be quite expensive on some Nehalem processors.
 *
 * Based on this we disable the IP header alignment in network drivers.
 */
#define NET_IP_ALIGN        0

#define HBP_NUM 4

/*
 * These alignment constraints are for performance in the vSMP case,
 * but in the task_struct case we must also meet hardware imposed
 * alignment requirements of the FPU state:
 */
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN                (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
#else
# define ARCH_MIN_TASKALIGN                __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN        0
#endif

enum tlb_infos {
        ENTRIES,
        NR_INFO
};

extern u16 __read_mostly tlb_lli_4k[NR_INFO];
extern u16 __read_mostly tlb_lli_2m[NR_INFO];
extern u16 __read_mostly tlb_lli_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];

/*
 *  CPU type and hardware bug flags. Kept separately for each CPU.
 *  Members of this structure are referenced in head_32.S, so think twice
 *  before touching them. [mj]
 */

struct cpuinfo_x86 {
        __u8                        x86;                /* CPU family */
        __u8                        x86_vendor;        /* CPU vendor */
        __u8                        x86_model;
        __u8                        x86_stepping;
#ifdef CONFIG_X86_64
        /* Number of 4K pages in DTLB/ITLB combined(in pages): */
        int                        x86_tlbsize;
#endif
#ifdef CONFIG_X86_VMX_FEATURE_NAMES
        __u32                        vmx_capability[NVMXINTS];
#endif
        __u8                        x86_virt_bits;
        __u8                        x86_phys_bits;
        /* CPUID returned core id bits: */
        __u8                        x86_coreid_bits;
        __u8                        cu_id;
        /* Max extended CPUID function supported: */
        __u32                        extended_cpuid_level;
        /* Maximum supported CPUID level, -1=no CPUID: */
        int                        cpuid_level;
        /*
         * Align to size of unsigned long because the x86_capability array
         * is passed to bitops which require the alignment. Use unnamed
         * union to enforce the array is aligned to size of unsigned long.
         */
        union {
                __u32                x86_capability[NCAPINTS + NBUGINTS];
                unsigned long        x86_capability_alignment;
        };
        char                        x86_vendor_id[16];
        char                        x86_model_id[64];
        /* in KB - valid for CPUS which support this call: */
        unsigned int                x86_cache_size;
        int                        x86_cache_alignment;        /* In bytes */
        /* Cache QoS architectural values, valid only on the BSP: */
        int                        x86_cache_max_rmid;        /* max index */
        int                        x86_cache_occ_scale;        /* scale to bytes */
        int                        x86_cache_mbm_width_offset;
        int                        x86_power;
        unsigned long                loops_per_jiffy;
        /* cpuid returned max cores value: */
        u16                        x86_max_cores;
        u16                        apicid;
        u16                        initial_apicid;
        u16                        x86_clflush_size;
        /* number of cores as seen by the OS: */
        u16                        booted_cores;
        /* Physical processor id: */
        u16                        phys_proc_id;
        /* Logical processor id: */
        u16                        logical_proc_id;
        /* Core id: */
        u16                        cpu_core_id;
        u16                        cpu_die_id;
        u16                        logical_die_id;
        /* Index into per_cpu list: */
        u16                        cpu_index;
        u32                        microcode;
        /* Address space bits used by the cache internally */
        u8                        x86_cache_bits;
        unsigned                initialized : 1;
} __randomize_layout;

struct cpuid_regs {
        u32 eax, ebx, ecx, edx;
};

enum cpuid_regs_idx {
        CPUID_EAX = 0,
        CPUID_EBX,
        CPUID_ECX,
        CPUID_EDX,
};

#define X86_VENDOR_INTEL        0
#define X86_VENDOR_CYRIX        1
#define X86_VENDOR_AMD                2
#define X86_VENDOR_UMC                3
#define X86_VENDOR_CENTAUR        5
#define X86_VENDOR_TRANSMETA        7
#define X86_VENDOR_NSC                8
#define X86_VENDOR_HYGON        9
#define X86_VENDOR_ZHAOXIN        10
#define X86_VENDOR_NUM                11

#define X86_VENDOR_UNKNOWN        0xff

/*
 * capabilities of CPUs
 */
extern struct cpuinfo_x86        boot_cpu_data;
extern struct cpuinfo_x86        new_cpu_data;

extern __u32                        cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32                        cpu_caps_set[NCAPINTS + NBUGINTS];

/*
 * cpu_data(cpu) yields the struct cpuinfo_x86 for @cpu.  With CONFIG_SMP it
 * is the per-CPU variable cpu_info; without it, both cpu_info and
 * cpu_data() are aliases for boot_cpu_data and the @cpu argument is ignored.
 */
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu)                per_cpu(cpu_info, cpu)
#else
#define cpu_info                boot_cpu_data
#define cpu_data(cpu)                boot_cpu_data
#endif

extern const struct seq_operations cpuinfo_op;

#define cache_line_size()        (boot_cpu_data.x86_cache_alignment)

extern void cpu_detect(struct cpuinfo_x86 *c);

/*
 * Return a 64-bit value with exactly one bit set, at position
 * (boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT).
 */
static inline unsigned long long l1tf_pfn_limit(void)
{
        int limit_bit = boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT;

        return BIT_ULL(limit_bit);
}

void init_cpu_devs(void);
void get_cpu_vendor(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);

/*
 * have_cpuid_p(): on CONFIG_X86_32 builds this is an external function
 * defined elsewhere; on all other builds it is an inline that
 * unconditionally returns 1.
 */
#ifdef CONFIG_X86_32
extern int have_cpuid_p(void);
#else
static inline int have_cpuid_p(void)
{
        return 1;
}
#endif
/*
 * Execute the CPUID instruction directly.
 *
 * @eax: in: value loaded into the "a" register; out: "a" register result
 * @ebx: out only: "b" register result (the pointed-to value is not read)
 * @ecx: in: value loaded into the "c" register; out: "c" register result
 * @edx: out only: "d" register result (the pointed-to value is not read)
 *
 * Callers must initialize both *eax and *ecx before the call, since both
 * are passed to the instruction as inputs.
 */
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
                                unsigned int *ecx, unsigned int *edx)
{
        /* ecx is often an input as well as an output. */
        asm volatile("cpuid"
            : "=a" (*eax),
              "=b" (*ebx),
              "=c" (*ecx),
              "=d" (*edx)
            /* "0" and "2" tie the inputs to output operands 0 (eax) and 2 (ecx). */
            : "0" (*eax), "2" (*ecx)
            : "memory");
}

/*
 * native_cpuid_reg(reg) - generate native_cpuid_<reg>(op).
 *
 * The generated function calls native_cpuid() with eax = @op and ecx = 0
 * and returns the local variable named by @reg, so @reg must be one of
 * eax, ebx, ecx or edx.
 */
#define native_cpuid_reg(reg)                                        \
static inline unsigned int native_cpuid_##reg(unsigned int op)        \
{                                                                \
        unsigned int eax = op, ebx, ecx = 0, edx;                \
                                                                \
        native_cpuid(&eax, &ebx, &ecx, &edx);                        \
                                                                \
        return reg;                                                \
}

/*
 * Native CPUID functions returning a single datum.
 */
native_cpuid_reg(eax)
native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)

/*
 * Friendlier CR3 helpers.
 */
/* Return the value of __read_cr3() masked with CR3_ADDR_MASK. */
static inline unsigned long read_cr3_pa(void)
{
        unsigned long cr3 = __read_cr3();

        return cr3 & CR3_ADDR_MASK;
}

/* Return the value of __native_read_cr3() masked with CR3_ADDR_MASK. */
static inline unsigned long native_read_cr3_pa(void)
{
        unsigned long cr3 = __native_read_cr3();

        return cr3 & CR3_ADDR_MASK;
}

/* Write the __sme_pa() translation of @pgdir to CR3 via write_cr3(). */
static inline void load_cr3(pgd_t *pgdir)
{
        unsigned long pgd_pa = __sme_pa(pgdir);

        write_cr3(pgd_pa);
}

/*
 * Note that while the legacy 'TSS' name comes from 'Task State Segment',
 * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
 * unrelated to the task-switch mechanism:
 */
#ifdef CONFIG_X86_32
/*
 * This is the TSS defined by the hardware (CONFIG_X86_32 variant).
 *
 * The structure is packed, so the member order below is the exact memory
 * layout.  NOTE(review): the __-prefixed 16-bit members (__blh, __ss0h, ...)
 * look like the unused upper halves of 32-bit slots that hold 16-bit
 * values -- confirm against the architecture manual.
 */
struct x86_hw_tss {
        unsigned short                back_link, __blh;
        unsigned long                sp0;
        unsigned short                ss0, __ss0h;
        unsigned long                sp1;

        /*
         * We don't use ring 1, so ss1 is a convenient scratch space in
         * the same cacheline as sp0.  We use ss1 to cache the value in
         * MSR_IA32_SYSENTER_CS.  When we context switch
         * MSR_IA32_SYSENTER_CS, we first check if the new value being
         * written matches ss1, and, if it's not, then we wrmsr the new
         * value and update ss1.
         *
         * The only reason we context switch MSR_IA32_SYSENTER_CS is
         * that we set it to zero in vm86 tasks to avoid corrupting the
         * stack if we were to go through the sysenter path from vm86
         * mode.
         */
        unsigned short                ss1;        /* MSR_IA32_SYSENTER_CS */

        unsigned short                __ss1h;
        unsigned long                sp2;
        unsigned short                ss2, __ss2h;
        unsigned long                __cr3;
        unsigned long                ip;
        unsigned long                flags;
        unsigned long                ax;
        unsigned long                cx;
        unsigned long                dx;
        unsigned long                bx;
        unsigned long                sp;
        unsigned long                bp;
        unsigned long                si;
        unsigned long                di;
        unsigned short                es, __esh;
        unsigned short                cs, __csh;
        unsigned short                ss, __ssh;
        unsigned short                ds, __dsh;
        unsigned short                fs, __fsh;
        unsigned short                gs, __gsh;
        unsigned short                ldt, __ldth;
        unsigned short                trace;
        /* Compared against the IO_BITMAP_OFFSET_* values defined below. */
        unsigned short                io_bitmap_base;

} __attribute__((packed));
#else
/*
 * Hardware TSS layout used when CONFIG_X86_32 is not set.  The structure
 * is packed, so the member order below is the exact memory layout.
 */
struct x86_hw_tss {
        u32                        reserved1;
        u64                        sp0;

        /*
         * We store cpu_current_top_of_stack in sp1 so it's always accessible.
         * Linux does not use ring 1, so sp1 is not otherwise needed.
         */
        u64                        sp1;

        /*
         * Since Linux does not use ring 2, the 'sp2' slot is unused by
         * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
         * the user RSP value.
         */
        u64                        sp2;

        u64                        reserved2;
        /* Seven 64-bit slots; presumably the interrupt stack table -- TODO confirm. */
        u64                        ist[7];
        u32                        reserved3;
        u32                        reserved4;
        u16                        reserved5;
        /* Compared against the IO_BITMAP_OFFSET_* values defined below. */
        u16                        io_bitmap_base;

} __attribute__((packed));
#endif

/*
 * IO-bitmap sizes: the bitmap holds IO_BITMAP_BITS bits, expressed below
 * in bytes and in units of sizeof(long).
 */
#define IO_BITMAP_BITS                        65536
#define IO_BITMAP_BYTES                        (IO_BITMAP_BITS / BITS_PER_BYTE)
#define IO_BITMAP_LONGS                        (IO_BITMAP_BYTES / sizeof(long))

/* Byte distance from tss_struct.x86_tss to tss_struct.io_bitmap.bitmap. */
#define IO_BITMAP_OFFSET_VALID_MAP                                \
        (offsetof(struct tss_struct, io_bitmap.bitmap) -        \
         offsetof(struct tss_struct, x86_tss))

/* Byte distance from tss_struct.x86_tss to tss_struct.io_bitmap.mapall. */
#define IO_BITMAP_OFFSET_VALID_ALL                                \
        (offsetof(struct tss_struct, io_bitmap.mapall) -        \
         offsetof(struct tss_struct, x86_tss))

#ifdef CONFIG_X86_IOPL_IOPERM
/*
 * sizeof(unsigned long) coming from an extra "long" at the end of the
 * iobitmap. The limit is inclusive, i.e. the last valid byte.
 */
# define __KERNEL_TSS_LIMIT        \
        (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \
         sizeof(unsigned long) - 1)
#else
/* Without CONFIG_X86_IOPL_IOPERM the limit ends at the last byte of x86_tss. */
# define __KERNEL_TSS_LIMIT        \
        (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1)
#endif

/* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */
#define IO_BITMAP_OFFSET_INVALID        (__KERNEL_TSS_LIMIT + 1)

/* A stack occupying exactly PAGE_SIZE bytes. */
struct entry_stack {
        char        stack[PAGE_SIZE];
};

/* Wrapper that forces PAGE_SIZE alignment on a struct entry_stack. */
struct entry_stack_page {
        struct entry_stack stack;
} __aligned(PAGE_SIZE);

/*
 * All IO bitmap related data stored in the TSS.  Embedded in
 * struct tss_struct as the 'io_bitmap' member.
 */
struct x86_io_bitmap {
        /* The sequence number of the last active bitmap. */
        u64                        prev_sequence;

        /*
         * Store the dirty size of the last io bitmap offender. The next
         * one will have to do the cleanup as the switch out to a non io
         * bitmap user will just set x86_tss.io_bitmap_base to a value
         * outside of the TSS limit. So for sane tasks there is no need to
         * actually touch the io_bitmap at all.
         */
        unsigned int                prev_max;

        /*
         * The extra 1 is there because the CPU will access an
         * additional byte beyond the end of the IO permission
         * bitmap. The extra byte must be all 1 bits, and must
         * be within the limit.
         *
         * Note that the array is sized in longs, so the "+ 1" adds one
         * whole unsigned long after the IO_BITMAP_LONGS bitmap words.
         */
        unsigned long                bitmap[IO_BITMAP_LONGS + 1];

        /*
         * Special I/O bitmap to emulate IOPL(3). All bytes zero,
         * except the additional byte at the end.
         */
        unsigned long                mapall[IO_BITMAP_LONGS + 1];
};

/*
 * Page-aligned container for the hardware TSS followed by the IO bitmap
 * data.  The IO_BITMAP_OFFSET_* macros above are computed from the member
 * offsets of this structure.
 */
struct tss_struct {
        /*
         * The fixed hardware portion.  This must not cross a page boundary
         * at risk of violating the SDM's advice and potentially triggering
         * errata.
         */
        struct x86_hw_tss        x86_tss;

        struct x86_io_bitmap        io_bitmap;
} __aligned(PAGE_SIZE);

DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);

/*
 * Per CPU interrupt stacks.  Each stack is IRQ_STACK_SIZE bytes and is
 * aligned to its own size.
 */
struct irq_stack {
        char                stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);

DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);

#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif

#ifdef CONFIG_X86_64
/*
 * Layout of the first bytes of per-CPU data on CONFIG_X86_64: 40 bytes
 * of gs_base[] followed by the stack canary, which therefore sits at
 * byte offset 40 of this structure.
 */
struct fixed_percpu_data {
        /*
         * GCC hardcodes the stack canary as %gs:40.  Since the
         * irq_stack is the object at %gs:0, we reserve the bottom
         * 48 bytes of the irq stack for the canary.
         *
         * Once we are willing to require -mstack-protector-guard-symbol=
         * support for x86_64 stackprotector, we can get rid of this.
         */
        char                gs_base[40];
        unsigned long        stack_canary;
};

DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
DECLARE_INIT_PER_CPU(fixed_percpu_data);

/*
 * Return, as an integer, the address of @cpu's copy of
 * fixed_percpu_data.gs_base.
 */
static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{
        char *gs_base = per_cpu(fixed_percpu_data.gs_base, cpu);

        return (unsigned long)gs_base;
}

DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);

/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);
#else        /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
/* Per CPU softirq stack pointer */
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif        /* X86_64 */

extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;

struct perf_event;

/*
 * Per-thread architecture state.  The member set differs between
 * CONFIG_X86_32 and CONFIG_X86_64 builds; 'fpu' must remain the final
 * member (see the warning at the end of the structure).
 */
struct thread_struct {
        /* Cached TLS descriptors: */
        struct desc_struct        tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
        unsigned long                sp0;
#endif
        unsigned long                sp;
#ifdef CONFIG_X86_32
        unsigned long                sysenter_cs;
#else
        unsigned short                es;
        unsigned short                ds;
        unsigned short                fsindex;
        unsigned short                gsindex;
#endif

#ifdef CONFIG_X86_64
        unsigned long                fsbase;
        unsigned long                gsbase;
#else
        /*
         * XXX: this could presumably be unsigned short.  Alternatively,
         * 32-bit kernels could be taught to use fsindex instead.
         */
        unsigned long fs;
        unsigned long gs;
#endif

        /* Save middle states of ptrace breakpoints */
        struct perf_event        *ptrace_bps[HBP_NUM];
        /* Debug status used for traps, single steps, etc... */
        unsigned long           virtual_dr6;
        /* Keep track of the exact dr7 value set by the user */
        unsigned long           ptrace_dr7;
        /* Fault info: */
        unsigned long                cr2;
        unsigned long                trap_nr;
        unsigned long                error_code;
#ifdef CONFIG_VM86
        /* Virtual 86 mode info */
        struct vm86                *vm86;
#endif
        /* IO permissions: */
        struct io_bitmap        *io_bitmap;

        /*
         * IOPL. Privilege level dependent I/O permission which is
         * emulated via the I/O bitmap to prevent user space from disabling
         * interrupts.
         */
        unsigned long                iopl_emul;

        /* Single-bit flag. */
        unsigned int                iopl_warn:1;

        /* Floating point and extended processor state */
        struct fpu                fpu;
        /*
         * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
         * the end.
         */
};

/*
 * Whitelist the FPU state from the task_struct for hardened usercopy.
 *
 * @offset: receives the byte offset of fpu.state within struct thread_struct
 * @size:   receives fpu_kernel_xstate_size
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        const unsigned long fpu_state_off =
                offsetof(struct thread_struct, fpu.state);

        *offset = fpu_state_off;
        *size = fpu_kernel_xstate_size;
}

/* Store @sp0 into the current CPU's cpu_tss_rw.x86_tss.sp0 field. */
static inline void native_load_sp0(unsigned long sp0)
{
        this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}

/*
 * Execute the SWAPGS instruction.  The instruction is only emitted on
 * CONFIG_X86_64 builds; otherwise this function is empty.
 */
static __always_inline void native_swapgs(void)
{
#ifdef CONFIG_X86_64
        asm volatile("swapgs" ::: "memory");
#endif
}

/* Return this CPU's cpu_current_top_of_stack value. */
static inline unsigned long current_top_of_stack(void)
{
        unsigned long top;

        /*
         *  We can't read directly from tss.sp0: sp0 on x86_32 is special in
         *  and around vm86 mode and sp0 on x86_64 is special because of the
         *  entry trampoline.
         */
        top = this_cpu_read_stable(cpu_current_top_of_stack);

        return top;
}

/*
 * Return true when the current stack pointer lies less than THREAD_SIZE
 * bytes below current_top_of_stack().  The subtraction is unsigned, so a
 * stack pointer above the top wraps to a large value and yields false.
 */
static inline bool on_thread_stack(void)
{
        unsigned long below_top = current_top_of_stack() - current_stack_pointer;

        return below_top < THREAD_SIZE;
}

/*
 * With CONFIG_PARAVIRT_XXL, <asm/paravirt.h> is included instead of the
 * definitions below.  Without it, __cpuid maps straight to native_cpuid
 * and load_sp0() simply calls native_load_sp0().
 */
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define __cpuid                        native_cpuid

static inline void load_sp0(unsigned long sp0)
{
        native_load_sp0(sp0);
}

#endif /* CONFIG_PARAVIRT_XXL */

/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);

unsigned long get_wchan(struct task_struct *p);

/*
 * Generic CPUID function
 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
 * resulting in stale register contents being returned.
 *
 * @op:  value stored into *eax before __cpuid() is invoked
 * @eax, @ebx, @ecx, @edx: receive the register values produced by __cpuid()
 *
 * Any value the caller left in *eax or *ecx is overwritten (with @op and
 * 0 respectively) before the call.
 */
static inline void cpuid(unsigned int op,
                         unsigned int *eax, unsigned int *ebx,
                         unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = 0;
        __cpuid(eax, ebx, ecx, edx);
}

/*
 * Some CPUID calls want 'count' to be placed in ecx.
 *
 * @op:    value stored into *eax before __cpuid() is invoked
 * @count: value stored into *ecx before __cpuid() is invoked
 * @eax, @ebx, @ecx, @edx: receive the register values produced by __cpuid()
 */
static inline void cpuid_count(unsigned int op, int count,
                               unsigned int *eax, unsigned int *ebx,
                               unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = count;
        __cpuid(eax, ebx, ecx, edx);
}

/*
 * CPUID functions returning a single datum
 */
/* Run cpuid() for @op and return only the eax result. */
static inline unsigned int cpuid_eax(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);
        return a;
}

/* Run cpuid() for @op and return only the ebx result. */
static inline unsigned int cpuid_ebx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);
        return b;
}

/* Run cpuid() for @op and return only the ecx result. */
static inline unsigned int cpuid_ecx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);
        return c;
}

/* Run cpuid() for @op and return only the edx result. */
static inline unsigned int cpuid_edx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);
        return d;
}

extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);

extern unsigned long                boot_option_idle_override;

/* Idle override selections; IDLE_NO_OVERRIDE is 0. */
enum idle_boot_override {
        IDLE_NO_OVERRIDE = 0,
        IDLE_HALT,
        IDLE_NOMWAIT,
        IDLE_POLL,
};

extern void enable_sep_cpu(void);
extern int sysenter_setup(void);


/* Defined in head.S */
extern struct desc_ptr                early_gdt_descr;

extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);

/*
 * Read MSR_IA32_DEBUGCTLMSR and return its value.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, the MSR is not read on CPUs
 * whose boot_cpu_data.x86 is below 6; 0 is returned instead.
 */
static inline unsigned long get_debugctlmsr(void)
{
        unsigned long debugctlmsr = 0;

#ifndef CONFIG_X86_DEBUGCTLMSR
        if (boot_cpu_data.x86 < 6)
                return 0;
#endif
        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);

        return debugctlmsr;
}

/*
 * Write @debugctlmsr to MSR_IA32_DEBUGCTLMSR.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, the write is skipped on CPUs
 * whose boot_cpu_data.x86 is below 6.
 */
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
        if (boot_cpu_data.x86 < 6)
                return;
#endif
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}

extern void set_task_blockstep(struct task_struct *task, bool on);

/* Boot loader type from the setup header: */
extern int                        bootloader_type;
extern int                        bootloader_version;

extern char                        ignore_fpu_irq;

#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH

/*
 * BASE_PREFETCH is the first instruction string handed to
 * alternative_input() by prefetch() and prefetchw() below: empty on
 * CONFIG_X86_32, "prefetcht0" otherwise.  ARCH_HAS_PREFETCH is only
 * defined for CONFIG_X86_32.
 */
#ifdef CONFIG_X86_32
# define BASE_PREFETCH                ""
# define ARCH_HAS_PREFETCH
#else
# define BASE_PREFETCH                "prefetcht0 %P1"
#endif

/*
 * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
 *
 * It's not worth to care about 3dnow prefetches for the K6
 * because they are microcoded there and very slow.
 *
 * @x: address to prefetch; it is only passed to the instruction as an
 *     "m" operand.
 *
 * NOTE(review): alternative_input() presumably selects "prefetchnta" when
 * X86_FEATURE_XMM is present and BASE_PREFETCH otherwise -- confirm
 * against the alternative_input() definition.
 */
static inline void prefetch(const void *x)
{
        alternative_input(BASE_PREFETCH, "prefetchnta %P1",
                          X86_FEATURE_XMM,
                          "m" (*(const char *)x));
}

/*
 * 3dnow prefetch to get an exclusive cache line.
 * Useful for spinlocks to avoid one state transition in the
 * cache coherency protocol:
 *
 * @x: address to prefetch; it is only passed to the instruction as an
 *     "m" operand.
 *
 * NOTE(review): alternative_input() presumably selects "prefetchw" when
 * X86_FEATURE_3DNOWPREFETCH is present and BASE_PREFETCH otherwise --
 * confirm against the alternative_input() definition.
 */
static __always_inline void prefetchw(const void *x)
{
        alternative_input(BASE_PREFETCH, "prefetchw %P1",
                          X86_FEATURE_3DNOWPREFETCH,
                          "m" (*(const char *)x));
}

/* Prefetch @x via prefetchw(). */
static inline void spin_lock_prefetch(const void *x)
{
        prefetchw(x);
}

/*
 * Address TOP_OF_KERNEL_STACK_PADDING bytes below the end of init_stack.
 */
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
                           TOP_OF_KERNEL_STACK_PADDING)

/* Address just past the struct pt_regs returned by task_pt_regs(task). */
#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))

/*
 * Pointer to the struct pt_regs that ends TOP_OF_KERNEL_STACK_PADDING
 * bytes below (task_stack_page(task) + THREAD_SIZE).
 */
#define task_pt_regs(task) \
({                                                                        \
        unsigned long __ptr = (unsigned long)task_stack_page(task);        \
        __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;                \
        ((struct pt_regs *)__ptr) - 1;                                        \
})

#ifdef CONFIG_X86_32
#define INIT_THREAD  {                                                          \
        .sp0                        = TOP_OF_INIT_STACK,                          \
        .sysenter_cs                = __KERNEL_CS,                                  \
}

#define KSTK_ESP(task)                (task_pt_regs(task)->sp)

#else
#define INIT_THREAD { }

extern unsigned long KSTK_ESP(struct task_struct *task);

#endif /* CONFIG_X86_64 */

extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
                                               unsigned long new_sp);

/*
 * This decides where the kernel will search for a free chunk of vm
 * space during mmap's.
 *
 * __TASK_UNMAPPED_BASE() divides @task_size by three and passes the result
 * through PAGE_ALIGN().  The argument is parenthesized so that a compound
 * expression such as "a + b" is divided as a whole rather than having only
 * its last operand divided.
 */
#define __TASK_UNMAPPED_BASE(task_size)        (PAGE_ALIGN((task_size) / 3))
#define TASK_UNMAPPED_BASE                __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)

#define KSTK_EIP(task)                (task_pt_regs(task)->ip)

/* Get/set a process' ability to use the timestamp counter instruction */
#define GET_TSC_CTL(adr)        get_tsc_mode((adr))
#define SET_TSC_CTL(val)        set_tsc_mode((val))

extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);

DECLARE_PER_CPU(u64, msr_misc_features_shadow);

/*
 * AMD helpers.  With CONFIG_CPU_SUP_AMD they are external functions;
 * otherwise the inline stubs below return 0 / false or do nothing.
 */
#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
extern bool cpu_has_ibpb_brtype_microcode(void);
extern void amd_clear_divider(void);
#else
static inline u16 amd_get_nb_id(int cpu)                { return 0; }
static inline u32 amd_get_nodes_per_socket(void)        { return 0; }
static inline bool cpu_has_ibpb_brtype_microcode(void)        { return false; }
static inline void amd_clear_divider(void)                { }
#endif

/*
 * Scan CPUID leaves 0x40000000 up to (but not including) 0x40010000 in
 * steps of 0x100 for one whose ebx/ecx/edx output matches @sig.
 *
 * @sig:    signature to match; its first 12 bytes are compared
 * @leaves: when non-zero, a match additionally requires that the eax
 *          output minus the leaf number be at least this value
 *
 * Returns the first matching leaf number, or 0 when none matches.
 */
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
        uint32_t leaf, leaf_eax, sig_words[3];

        for (leaf = 0x40000000; leaf < 0x40010000; leaf += 0x100) {
                cpuid(leaf, &leaf_eax, &sig_words[0], &sig_words[1],
                      &sig_words[2]);

                /* Signature mismatch: try the next candidate leaf. */
                if (memcmp(sig, sig_words, 12))
                        continue;

                /* Caller asked for a minimum and this leaf falls short. */
                if (leaves != 0 && (leaf_eax - leaf) < leaves)
                        continue;

                return leaf;
        }

        return 0;
}

extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(const char *what, void *begin, void *end);

void default_idle(void);
/*
 * With CONFIG_XEN this is a real function; without it the name expands to
 * the constant 0, so it can only be used as a value, not called.
 */
#ifdef        CONFIG_XEN
bool xen_set_default_idle(void);
#else
#define xen_set_default_idle 0
#endif

void __noreturn stop_this_cpu(void *dummy);
void microcode_check(struct cpuinfo_x86 *prev_info);
void store_cpu_caps(struct cpuinfo_x86 *info);

/* Selectable L1TF mitigation settings; type of l1tf_mitigation. */
enum l1tf_mitigations {
        L1TF_MITIGATION_OFF = 0,
        L1TF_MITIGATION_FLUSH_NOWARN,
        L1TF_MITIGATION_FLUSH,
        L1TF_MITIGATION_FLUSH_NOSMT,
        L1TF_MITIGATION_FULL,
        L1TF_MITIGATION_FULL_FORCE,
};

extern enum l1tf_mitigations l1tf_mitigation;

/* Selectable MDS mitigation settings. */
enum mds_mitigations {
        MDS_MITIGATION_OFF = 0,
        MDS_MITIGATION_FULL = 1,
        MDS_MITIGATION_VMWERV = 2,
};

extern bool gds_ucode_mitigated(void);

/*
 * Make previous memory operations globally visible before
 * a WRMSR.
 *
 * MFENCE makes writes visible, but only affects load/store
 * instructions.  WRMSR is unfortunately not a load/store
 * instruction and is unaffected by MFENCE.  The LFENCE ensures
 * that the WRMSR is not reordered.
 *
 * Most WRMSRs are full serializing instructions themselves and
 * do not require this barrier.  This is only required for the
 * IA32_TSC_DEADLINE and X2APIC MSRs.
 */
static inline void weak_wrmsr_fence(void)
{
        /*
         * NOTE(review): presumably the "mfence; lfence" pair is replaced by
         * nothing on CPUs that lack X86_FEATURE_APIC_MSRS_FENCE -- confirm
         * against the alternative()/ALT_NOT() definitions.
         */
        alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
}

#endif /* _ASM_X86_PROCESSOR_H */








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef RQ_QOS_H
#define RQ_QOS_H

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/blk-mq.h>

#include "blk-mq-debugfs.h"

struct blk_mq_debugfs_attr;

/* Identifies an rq_qos policy; stored in struct rq_qos::id. */
enum rq_qos_id {
        RQ_QOS_WBT = 0,
        RQ_QOS_LATENCY = 1,
        RQ_QOS_COST = 2,
};

/* A wait queue paired with an atomic in-flight counter (see rq_wait_init()). */
struct rq_wait {
        wait_queue_head_t wait;
        atomic_t inflight;
};

/*
 * One rq_qos policy instance attached to a request queue.  Instances are
 * chained through ->next, with the list head stored in q->rq_qos.
 */
struct rq_qos {
        struct rq_qos_ops *ops;                /* callback table for this policy */
        struct request_queue *q;
        enum rq_qos_id id;                /* matched by rq_qos_id() lookups */
        struct rq_qos *next;                /* next policy in the list, or NULL */
#ifdef CONFIG_BLK_DEBUG_FS
        struct dentry *debugfs_dir;
#endif
};

/*
 * Callback table referenced from struct rq_qos::ops.  Each callback
 * receives the owning rq_qos plus the request and/or bio it concerns.
 * rq_qos_add() checks ->debugfs_attrs to decide whether to register
 * debugfs entries.
 */
struct rq_qos_ops {
        void (*throttle)(struct rq_qos *, struct bio *);
        void (*track)(struct rq_qos *, struct request *, struct bio *);
        void (*merge)(struct rq_qos *, struct request *, struct bio *);
        void (*issue)(struct rq_qos *, struct request *);
        void (*requeue)(struct rq_qos *, struct request *);
        void (*done)(struct rq_qos *, struct request *);
        void (*done_bio)(struct rq_qos *, struct bio *);
        void (*cleanup)(struct rq_qos *, struct bio *);
        void (*queue_depth_changed)(struct rq_qos *);
        void (*exit)(struct rq_qos *);
        const struct blk_mq_debugfs_attr *debugfs_attrs;
};

/*
 * Depth-scaling state operated on by rq_depth_scale_up(),
 * rq_depth_scale_down() and rq_depth_calc_max_depth().
 */
struct rq_depth {
        unsigned int max_depth;

        int scale_step;
        bool scaled_max;

        unsigned int queue_depth;
        unsigned int default_depth;
};

/*
 * Walk @q's rq_qos list and return the first entry whose id equals @id,
 * or NULL when the list holds no such entry.
 */
static inline struct rq_qos *rq_qos_id(struct request_queue *q,
                                       enum rq_qos_id id)
{
        struct rq_qos *cur = q->rq_qos;

        while (cur && cur->id != id)
                cur = cur->next;

        return cur;
}

/* Return @q's RQ_QOS_WBT entry, or NULL if it has none. */
static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
{
        return rq_qos_id(q, RQ_QOS_WBT);
}

/* Return @q's RQ_QOS_LATENCY entry, or NULL if it has none. */
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
{
        return rq_qos_id(q, RQ_QOS_LATENCY);
}

/*
 * Map @id to its short name: "wbt", "latency" or "cost".  Any other
 * value yields "unknown".
 */
static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
{
        if (id == RQ_QOS_WBT)
                return "wbt";
        if (id == RQ_QOS_LATENCY)
                return "latency";
        if (id == RQ_QOS_COST)
                return "cost";

        return "unknown";
}

/* Initialize @rq_wait: zero the in-flight counter and set up the wait queue. */
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
        atomic_set(&rq_wait->inflight, 0);
        init_waitqueue_head(&rq_wait->wait);
}

/*
 * Insert @rqos at the head of @q's rq_qos list.
 *
 * The insertion happens with the queue frozen and q->queue_lock held.
 * After the queue is unfrozen, debugfs entries are registered if the
 * policy's ops provide debugfs_attrs.
 */
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
        /*
         * No IO can be in-flight when adding rqos, so freeze queue, which
         * is fine since we only support rq_qos for blk-mq queue.
         *
         * Reuse ->queue_lock for protecting against other concurrent
         * rq_qos adding/deleting
         */
        blk_mq_freeze_queue(q);

        spin_lock_irq(&q->queue_lock);
        rqos->next = q->rq_qos;
        q->rq_qos = rqos;
        spin_unlock_irq(&q->queue_lock);

        blk_mq_unfreeze_queue(q);

        if (rqos->ops->debugfs_attrs)
                blk_mq_debugfs_register_rqos(rqos);
}

/*
 * Unlink @rqos from @q's rq_qos list, if it is on the list, with the queue
 * frozen and q->queue_lock held.  The debugfs unregister call is made
 * unconditionally afterwards, whether or not @rqos was found.
 */
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
{
        struct rq_qos **cur;

        /*
         * See comment in rq_qos_add() about freezing queue & using
         * ->queue_lock.
         */
        blk_mq_freeze_queue(q);

        spin_lock_irq(&q->queue_lock);
        /* @cur points at the link (list head or a ->next field) under test. */
        for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
                if (*cur == rqos) {
                        *cur = rqos->next;
                        break;
                }
        }
        spin_unlock_irq(&q->queue_lock);

        blk_mq_unfreeze_queue(q);

        blk_mq_debugfs_unregister_rqos(rqos);
}

typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);

void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                 acquire_inflight_cb_t *acquire_inflight_cb,
                 cleanup_cb_t *cleanup_cb);
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
bool rq_depth_scale_up(struct rq_depth *rqd);
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);

void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_queue_depth_changed(struct rq_qos *rqos);

/* Pass @bio to __rq_qos_cleanup() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_cleanup(q->rq_qos, bio);
}

/* Pass @rq to __rq_qos_done() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos)
                return;

        __rq_qos_done(q->rq_qos, rq);
}

/* Pass @rq to __rq_qos_issue() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos)
                return;

        __rq_qos_issue(q->rq_qos, rq);
}

/* Pass @rq to __rq_qos_requeue() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos)
                return;

        __rq_qos_requeue(q->rq_qos, rq);
}

/* Pass @bio to __rq_qos_done_bio() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_done_bio(q->rq_qos, bio);
}

/*
 * Mark @bio with BIO_TRACKED, then pass it to __rq_qos_throttle() when @q
 * has a non-NULL rq_qos list.  The flag is set even when the list is empty.
 */
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
        /*
         * BIO_TRACKED lets controllers know that a bio went through the
         * normal rq_qos path.
         */
        bio_set_flag(bio, BIO_TRACKED);

        if (!q->rq_qos)
                return;

        __rq_qos_throttle(q->rq_qos, bio);
}

/* Pass @rq and @bio to __rq_qos_track() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
                                struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_track(q->rq_qos, rq, bio);
}

/* Pass @rq and @bio to __rq_qos_merge() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
                                struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_merge(q->rq_qos, rq, bio);
}

/* Call __rq_qos_queue_depth_changed() when @q has a non-NULL rq_qos list. */
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
        if (!q->rq_qos)
                return;

        __rq_qos_queue_depth_changed(q->rq_qos);
}

void rq_qos_exit(struct request_queue *);

#endif
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM migrate

#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MIGRATE_H

#include <linux/tracepoint.h>

#define MIGRATE_MODE                                                \
        EM( MIGRATE_ASYNC,        "MIGRATE_ASYNC")                \
        EM( MIGRATE_SYNC_LIGHT,        "MIGRATE_SYNC_LIGHT")                \
        EMe(MIGRATE_SYNC,        "MIGRATE_SYNC")


#define MIGRATE_REASON                                                \
        EM( MR_COMPACTION,        "compaction")                        \
        EM( MR_MEMORY_FAILURE,        "memory_failure")                \
        EM( MR_MEMORY_HOTPLUG,        "memory_hotplug")                \
        EM( MR_SYSCALL,                "syscall_or_cpuset")                \
        EM( MR_MEMPOLICY_MBIND,        "mempolicy_mbind")                \
        EM( MR_NUMA_MISPLACED,        "numa_misplaced")                \
        EMe(MR_CONTIG_RANGE,        "contig_range")

/*
 * First define the enums in the above macros to be exported to userspace
 * via TRACE_DEFINE_ENUM().
 */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)        TRACE_DEFINE_ENUM(a);

MIGRATE_MODE
MIGRATE_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a, b)        {a, b},
#define EMe(a, b)        {a, b}

/*
 * mm_migrate_pages trace event.
 *
 * Records five unsigned long counters (succeeded, failed, thp_succeeded,
 * thp_failed, thp_split) together with the migrate mode and reason.  When
 * printed, mode and reason are rendered as strings through the
 * MIGRATE_MODE and MIGRATE_REASON tables defined above.
 */
TRACE_EVENT(mm_migrate_pages,

        TP_PROTO(unsigned long succeeded, unsigned long failed,
                 unsigned long thp_succeeded, unsigned long thp_failed,
                 unsigned long thp_split, enum migrate_mode mode, int reason),

        TP_ARGS(succeeded, failed, thp_succeeded, thp_failed,
                thp_split, mode, reason),

        TP_STRUCT__entry(
                __field(        unsigned long,                succeeded)
                __field(        unsigned long,                failed)
                __field(        unsigned long,                thp_succeeded)
                __field(        unsigned long,                thp_failed)
                __field(        unsigned long,                thp_split)
                __field(        enum migrate_mode,        mode)
                __field(        int,                        reason)
        ),

        TP_fast_assign(
                __entry->succeeded        = succeeded;
                __entry->failed                = failed;
                __entry->thp_succeeded        = thp_succeeded;
                __entry->thp_failed        = thp_failed;
                __entry->thp_split        = thp_split;
                __entry->mode                = mode;
                __entry->reason                = reason;
        ),

        TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu mode=%s reason=%s",
                __entry->succeeded,
                __entry->failed,
                __entry->thp_succeeded,
                __entry->thp_failed,
                __entry->thp_split,
                __print_symbolic(__entry->mode, MIGRATE_MODE),
                __print_symbolic(__entry->reason, MIGRATE_REASON))
);
#endif /* _TRACE_MIGRATE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



























































    1 
    1 













    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H

/*
 * Tag address space map.
 *
 * Holds the tag counts, the sbitmap queues used for tag allocation and the
 * request arrays associated with the tags.
 */
struct blk_mq_tags {
        /* NOTE(review): presumably the total number of tags - confirm */
        unsigned int nr_tags;
        /* Tags below this value are reserved, see blk_mq_tag_is_reserved() */
        unsigned int nr_reserved_tags;

        atomic_t active_queues;

        /*
         * NOTE(review): presumably these point either at the embedded
         * __bitmap_tags/__breserved_tags below or at a bitmap shared by the
         * tag set - confirm against blk_mq_init_tags().
         */
        struct sbitmap_queue *bitmap_tags;
        struct sbitmap_queue *breserved_tags;

        struct sbitmap_queue __bitmap_tags;
        struct sbitmap_queue __breserved_tags;

        struct request **rqs;
        struct request **static_rqs;
        struct list_head page_list;

        /*
         * Used to clear request references in rqs[] before freeing one
         * request pool.
         */
        spinlock_t lock;
};

extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
                                        unsigned int reserved_tags,
                                        int node, unsigned int flags);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);

extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set,
                                      unsigned int flags);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);

extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                           unsigned int tag);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_tags **tags,
                                        unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
                                             unsigned int size);

extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                void *priv);

/*
 * Pick the wait state to sleep on for @bt.  Without a hardware context the
 * first wait state is used; otherwise the choice is delegated to
 * sbq_wait_ptr() using the context's wait_index.
 */
static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
                                                 struct blk_mq_hw_ctx *hctx)
{
        return hctx ? sbq_wait_ptr(bt, &hctx->wait_index) : &bt->ws[0];
}

/*
 * Tag value constants.  BLK_MQ_NO_TAG is the all-ones unsigned value and
 * BLK_MQ_TAG_MAX is the value immediately below it.
 */
enum {
        BLK_MQ_NO_TAG                = -1U,
        BLK_MQ_TAG_MIN                = 1,
        BLK_MQ_TAG_MAX                = BLK_MQ_NO_TAG - 1,
};

extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);

/*
 * Forward to __blk_mq_tag_busy() only for hardware contexts that have
 * BLK_MQ_F_TAG_QUEUE_SHARED set; every other context reports false.
 */
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
                return __blk_mq_tag_busy(hctx);

        return false;
}

/*
 * Forward to __blk_mq_tag_idle() only for hardware contexts that have
 * BLK_MQ_F_TAG_QUEUE_SHARED set; a no-op for every other context.
 */
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
                __blk_mq_tag_idle(hctx);
}

/* A tag is reserved when it lies below the map's nr_reserved_tags. */
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
                                          unsigned int tag)
{
        unsigned int nr_reserved = tags->nr_reserved_tags;

        return nr_reserved > tag;
}

#endif












































































































































































































































































































   11 
   11 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




































    3 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes (KProbes)
 *  kernel/kprobes.c
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation (includes suggestions from
 *                Rusty Russell).
 * 2004-Aug        Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
 *                hlists and exceptions notifier as suggested by Andi Kleen.
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2004-Sep        Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
 *                exceptions notifier to be first on the priority list.
 * 2005-May        Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *                <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/perf_event.h>
#include <linux/static_call.h>

#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>

/* Number of hash bits, giving 1 << 6 = 64 buckets per table. */
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)


static int kprobes_initialized;
/*
 * kprobe_table can be accessed by either:
 * - normal hlist traversal and RCU add/del while kprobe_mutex is held,
 * or
 * - RCU hlist traversal with preemption disabled (breakpoint handlers).
 */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];

/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;

/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
/* Per-CPU kprobe pointer, see set_kprobe_instance()/reset_kprobe_instance() */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
/* One raw spinlock per hash bucket, indexed by hash value */
static struct {
        raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];

/*
 * Default (weak) symbol lookup: resolve @name through kallsyms and hand the
 * address back as an opcode pointer.  The second argument is ignored here.
 */
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
                                        unsigned int __unused)
{
        unsigned long sym_addr = kallsyms_lookup_name(name);

        return (kprobe_opcode_t *)sym_addr;
}

/* Return the spinlock that belongs to bucket @hash of the kretprobe table. */
static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
        raw_spinlock_t *bucket_lock = &kretprobe_table_locks[hash].lock;

        return bucket_lock;
}

/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
 * kprobe->ainsn.insn points to the copy of the instruction to be
 * single-stepped. x86_64, POWER4 and above have no-exec support and
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster
 *
 * struct kprobe_insn_page is the bookkeeping record for one page of
 * instruction slots belonging to a kprobe_insn_cache.
 */
struct kprobe_insn_page {
        struct list_head list;                /* Link in cache->pages */
        kprobe_opcode_t *insns;                /* Page of instruction slots */
        struct kprobe_insn_cache *cache;        /* Owning cache */
        int nused;                        /* Slots not SLOT_CLEAN (used + dirty) */
        int ngarbage;                        /* Slots marked SLOT_DIRTY, not yet collected */
        char slot_used[];                /* Per-slot enum kprobe_slot_state */
};

/*
 * Bytes needed for a struct kprobe_insn_page whose trailing slot_used[]
 * array holds one state byte for each of @slots slots.
 */
#define KPROBE_INSN_PAGE_SIZE(slots)                        \
        (offsetof(struct kprobe_insn_page, slot_used) +        \
         (sizeof(char) * (slots)))

/*
 * Number of whole instruction slots that fit into one page for cache @c.
 * A slot occupies insn_size opcode units.
 */
static int slots_per_page(struct kprobe_insn_cache *c)
{
        size_t slot_bytes = c->insn_size * sizeof(kprobe_opcode_t);

        return PAGE_SIZE / slot_bytes;
}

/* State of one instruction slot, stored in kprobe_insn_page.slot_used[]. */
enum kprobe_slot_state {
        /* Free; may be handed out by __get_insn_slot() */
        SLOT_CLEAN = 0,
        /* Released as dirty; waits for collect_garbage_slots() */
        SLOT_DIRTY = 1,
        /* Currently handed out */
        SLOT_USED = 2,
};

/*
 * Default (weak) allocator for a page of instruction slots: one PAGE_SIZE
 * chunk obtained from module_alloc().  Returns whatever module_alloc()
 * returns.
 */
void __weak *alloc_insn_page(void)
{
        return module_alloc(PAGE_SIZE);
}

/*
 * Default (weak) counterpart of alloc_insn_page(): hands @page to
 * module_memfree().
 */
void __weak free_insn_page(void *page)
{
        module_memfree(page);
}

/*
 * Slot cache for regular kprobes.  It starts with an empty page list and no
 * garbage; each slot is MAX_INSN_SIZE opcode units wide and pages come from
 * alloc_insn_page()/free_insn_page().
 */
struct kprobe_insn_cache kprobe_insn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
        .sym = KPROBE_INSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
};
/* Defined below; needed by __get_insn_slot() before its definition. */
static int collect_garbage_slots(struct kprobe_insn_cache *c);

/**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * @c: Instruction slot cache to take the slot from.
 *
 * Scans the pages of @c for a clean slot.  If none is found and the cache
 * has garbage slots, they are collected and the scan is repeated.  We
 * allocate an executable page if there's no room on existing ones.
 *
 * Return: Pointer to the reserved slot, or NULL if allocating the page
 * descriptor or the page itself failed.
 */
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *kip;
        kprobe_opcode_t *slot = NULL;

        /* Since the slot array is not protected by rcu, we need a mutex */
        mutex_lock(&c->mutex);
 retry:
        rcu_read_lock();
        list_for_each_entry_rcu(kip, &c->pages, list) {
                if (kip->nused < slots_per_page(c)) {
                        int i;
                        for (i = 0; i < slots_per_page(c); i++) {
                                if (kip->slot_used[i] == SLOT_CLEAN) {
                                        /* Claim the first clean slot on this page. */
                                        kip->slot_used[i] = SLOT_USED;
                                        kip->nused++;
                                        slot = kip->insns + (i * c->insn_size);
                                        rcu_read_unlock();
                                        goto out;
                                }
                        }
                        /* kip->nused is broken. Fix it. */
                        kip->nused = slots_per_page(c);
                        WARN_ON(1);
                }
        }
        rcu_read_unlock();

        /*
         * If there are any garbage slots, collect them and try again.
         * collect_garbage_slots() resets c->nr_garbage, so the retry
         * happens at most once per call.
         */
        if (c->nr_garbage && collect_garbage_slots(c) == 0)
                goto retry;

        /* All out of space.  Need to allocate a new page. */
        kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
        if (!kip)
                goto out;

        /*
         * Use module_alloc so this page is within +/- 2GB of where the
         * kernel image and loaded module images reside. This is required
         * so x86_64 can correctly handle the %rip-relative fixups.
         */
        kip->insns = c->alloc();
        if (!kip->insns) {
                kfree(kip);
                goto out;
        }
        INIT_LIST_HEAD(&kip->list);
        memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
        /* The caller gets the first slot of the fresh page. */
        kip->slot_used[0] = SLOT_USED;
        kip->nused = 1;
        kip->ngarbage = 0;
        kip->cache = c;
        list_add_rcu(&kip->list, &c->pages);
        slot = kip->insns;

        /* Record the perf ksymbol register event after adding the page */
        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
                           PAGE_SIZE, false, c->sym);
out:
        mutex_unlock(&c->mutex);
        return slot;
}

/*
 * Mark slot @idx of @kip clean and drop the page's use count.
 *
 * Return 1 if the page has no slots in use afterwards (in which case the
 * page has been freed unless it is the last one on the list), otherwise 0.
 * After a return of 1 the caller must not touch @kip again, because it may
 * have been freed.
 */
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
        kip->slot_used[idx] = SLOT_CLEAN;
        kip->nused--;
        if (kip->nused == 0) {
                /*
                 * Page is no longer in use.  Free it unless
                 * it's the last one.  We keep the last one
                 * so as not to have to set it up again the
                 * next time somebody inserts a probe.
                 */
                if (!list_is_singular(&kip->list)) {
                        /*
                         * Record perf ksymbol unregister event before removing
                         * the page.
                         */
                        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
                                           (unsigned long)kip->insns, PAGE_SIZE, true,
                                           kip->cache->sym);
                        list_del_rcu(&kip->list);
                        /* Wait for RCU readers of the page list before freeing. */
                        synchronize_rcu();
                        kip->cache->free(kip->insns);
                        kfree(kip);
                }
                return 1;
        }
        return 0;
}

/*
 * Reclaim every SLOT_DIRTY slot on every page of cache @c and reset the
 * garbage counters.  Always returns 0.
 */
static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *kip, *next;

        /* Ensure no-one is interrupted on the garbages */
        synchronize_rcu();

        /* _safe iteration: collect_one_slot() may unlink and free kip. */
        list_for_each_entry_safe(kip, next, &c->pages, list) {
                int i;
                if (kip->ngarbage == 0)
                        continue;
                kip->ngarbage = 0;        /* we will collect all garbages */
                for (i = 0; i < slots_per_page(c); i++) {
                        /*
                         * A return of 1 means the page became empty and may
                         * have been freed, so stop looking at it.
                         */
                        if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
                                break;
                }
        }
        c->nr_garbage = 0;
        return 0;
}

/*
 * Release @slot back to cache @c.
 *
 * The owning page is found by checking, for each page, whether the slot's
 * offset from the page start maps to a valid slot index.  A non-zero @dirty
 * only marks the slot SLOT_DIRTY and defers reclaiming until the number of
 * garbage slots in the cache exceeds slots_per_page(); otherwise the slot is
 * reclaimed immediately.  Warns if the slot belongs to no page or is not
 * currently SLOT_USED.
 */
void __free_insn_slot(struct kprobe_insn_cache *c,
                      kprobe_opcode_t *slot, int dirty)
{
        struct kprobe_insn_page *kip;
        long idx;

        mutex_lock(&c->mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(kip, &c->pages, list) {
                idx = ((long)slot - (long)kip->insns) /
                        (c->insn_size * sizeof(kprobe_opcode_t));
                if (idx >= 0 && idx < slots_per_page(c))
                        goto out;
        }
        /* Could not find this slot. */
        WARN_ON(1);
        kip = NULL;
out:
        rcu_read_unlock();
        /* Mark and sweep: this may sleep */
        if (kip) {
                /* Check double free */
                WARN_ON(kip->slot_used[idx] != SLOT_USED);
                if (dirty) {
                        kip->slot_used[idx] = SLOT_DIRTY;
                        kip->ngarbage++;
                        if (++c->nr_garbage > slots_per_page(c))
                                collect_garbage_slots(c);
                } else {
                        collect_one_slot(kip, idx);
                }
        }
        mutex_unlock(&c->mutex);
}

/*
 * Tell whether @addr falls inside any instruction-slot page owned by
 * cache @c.  Used to decide whether an address found on a stack points
 * into a text area.
 */
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
        struct kprobe_insn_page *page;
        bool found = false;

        rcu_read_lock();
        list_for_each_entry_rcu(page, &c->pages, list) {
                unsigned long start = (unsigned long)page->insns;

                /* Outside [start, start + PAGE_SIZE): try the next page. */
                if (addr < start || addr >= start + PAGE_SIZE)
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Report the page of cache @c selected by *@symnum as a kallsyms entry.
 *
 * *@symnum is decremented once for every page visited, including the one
 * that matches.  On a match @sym receives the cache's symbol name, @type is
 * set to 't' and @value to the page's start address, and 0 is returned.
 * Returns -ERANGE when the cache has no page for that index.
 */
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym)
{
        struct kprobe_insn_page *page;
        int err = -ERANGE;

        rcu_read_lock();
        list_for_each_entry_rcu(page, &c->pages, list) {
                /* Post-decrement: the counter is consumed even on a match. */
                unsigned int skip = (*symnum)--;

                if (skip == 0) {
                        strlcpy(sym, c->sym, KSYM_NAME_LEN);
                        *type = 't';
                        *value = (unsigned long)page->insns;
                        err = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return err;
}

#ifdef CONFIG_OPTPROBES
/*
 * For optimized_kprobe buffer.  Same setup as kprobe_insn_slots, except that
 * the slot size is not known here and is filled in later.
 */
struct kprobe_insn_cache kprobe_optinsn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
        .sym = KPROBE_OPTINSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
        /* .insn_size is initialized later */
        .nr_garbage = 0,
};
#endif
#endif

/*
 * Record @kp in this CPU's kprobe_instance.
 * We have preemption disabled, so it is safe to use the __ versions of the
 * per-CPU accessors.
 */
static inline void set_kprobe_instance(struct kprobe *kp)
{
        __this_cpu_write(kprobe_instance, kp);
}

/* Clear this CPU's kprobe_instance slot (set it back to NULL). */
static inline void reset_kprobe_instance(void)
{
        __this_cpu_write(kprobe_instance, NULL);
}

/*
 * Find the kprobe registered at exactly @addr in kprobe_table, or return
 * NULL when there is none.
 *
 * This routine is called either:
 *         - under the kprobe_mutex - during kprobe_[un]register()
 *                                 OR
 *         - with preemption disabled - from arch/xxx/kernel/kprobes.c
 */
struct kprobe *get_kprobe(void *addr)
{
        struct hlist_head *bucket = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
        struct kprobe *cand;

        hlist_for_each_entry_rcu(cand, bucket, hlist,
                                 lockdep_is_held(&kprobe_mutex)) {
                if (cand->addr != addr)
                        continue;
                return cand;
        }

        return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);

static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);

/*
 * Return true if the kprobe is an aggregator, i.e. its pre_handler is
 * aggr_pre_handler.
 */
static inline int kprobe_aggrprobe(struct kprobe *p)
{
        return p->pre_handler == aggr_pre_handler;
}

/*
 * Return true(!0) if the kprobe is unused: it must be an aggregator,
 * be disabled, and have no probes left on its list.
 */
static inline int kprobe_unused(struct kprobe *p)
{
        if (!kprobe_aggrprobe(p))
                return 0;
        if (!kprobe_disabled(p))
                return 0;

        return !!list_empty(&p->list);
}

/*
 * Keep all fields in the kprobe consistent
 *
 * Copies the saved opcode and the arch-specific instruction state FROM
 * @ap INTO @p.  Note the direction: the first argument is the source and
 * the second is the destination.
 */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
        memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
        memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}

#ifdef CONFIG_OPTPROBES
/*
 * Global switch for jump optimization: while it is false, the optimizer
 * paths in this file return without optimizing anything.
 * NOTE: change this value only with kprobe_mutex held
 */
static bool kprobes_allow_optimization;

/*
 * Run the pre_handler of every enabled probe on @p's list, ignoring the
 * handlers' return values.  kprobe_instance is set around each call and
 * reset after every list entry.
 * This must be called from arch-dep optimized caller.
 */
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *child;

        list_for_each_entry_rcu(child, &p->list, list) {
                if (child->pre_handler && likely(!kprobe_disabled(child))) {
                        set_kprobe_instance(child);
                        child->pre_handler(child, regs);
                }
                /* Reset unconditionally, even when the entry was skipped */
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(opt_pre_handler);

/*
 * Free optimized instructions and optimized_kprobe
 *
 * @p must be the kprobe embedded in an optimized_kprobe; the containing
 * object is what gets kfree()d.
 */
static void free_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = container_of(p, struct optimized_kprobe, kp);

        /* Release the arch-side state before the memory itself */
        arch_remove_optimized_kprobe(op);
        arch_remove_kprobe(p);

        kfree(op);
}

/*
 * Return true(!0) if the kprobe is ready for optimization: it must be an
 * aggregator whose optimized instructions have been prepared.
 */
static inline int kprobe_optready(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return 0;

        op = container_of(p, struct optimized_kprobe, kp);

        return arch_prepared_optinsn(&op->optinsn);
}

/*
 * Return true if the kprobe is disarmed. Note: p must be on hash list
 *
 * A plain kprobe is disarmed when it is disabled.  An aggr/opt probe
 * must additionally not be queued on any (un)optimizing list.
 */
bool kprobe_disarmed(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (kprobe_aggrprobe(p)) {
                op = container_of(p, struct optimized_kprobe, kp);
                return kprobe_disabled(p) && list_empty(&op->list);
        }

        /* Not an aggr/opt probe: disabled is all that matters */
        return kprobe_disabled(p);
}

/*
 * Return true(!0) if the probe is queued on (un)optimizing lists.
 * Only aggregator probes can be queued.
 */
static int kprobe_queued(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return 0;

        op = container_of(p, struct optimized_kprobe, kp);

        return list_empty(&op->list) ? 0 : 1;
}

/*
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (exclude breakpoint).
 *
 * Searches backwards from @addr - 1 for the nearest registered kprobe
 * within MAX_OPTIMIZED_LENGTH bytes, and returns it only if it is ready
 * for optimization and the arch code confirms that @addr is inside its
 * optimized region.  Returns NULL otherwise.
 */
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
        struct optimized_kprobe *op;
        struct kprobe *p = NULL;
        int off;

        /* Offset 0 is deliberately skipped: that is the breakpoint case. */
        for (off = 1; off < MAX_OPTIMIZED_LENGTH; off++) {
                p = get_kprobe((void *)(addr - off));
                if (p)
                        break;
        }

        if (!p || !kprobe_optready(p))
                return NULL;

        op = container_of(p, struct optimized_kprobe, kp);

        return arch_within_optimized_kprobe(op, addr) ? p : NULL;
}

/*
 * Optimization staging list, protected by kprobe_mutex
 *
 * optimizing_list:   probes waiting to be optimized
 * unoptimizing_list: probes waiting to be unoptimized
 * freeing_list:      probes to be reclaimed by do_free_cleaned_kprobes()
 */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* Delay handed to schedule_delayed_work() by kick_kprobe_optimizer() */
#define OPTIMIZE_DELAY 5

/*
 * Optimize (replace a breakpoint with a jump) kprobes listed on
 * optimizing_list.
 *
 * The caller must hold text_mutex and the cpu hotplug read lock.
 */
static void do_optimize_kprobes(void)
{
        lockdep_assert_held(&text_mutex);
        /*
         * The optimization/unoptimization refers to online_cpus via
         * stop_machine(), and cpu-hotplug modifies online_cpus.
         * At the same time, text_mutex is held both in cpu-hotplug and
         * here.  This combination can cause a deadlock (cpu-hotplug tries
         * to lock text_mutex but stop_machine() can not complete because
         * online_cpus has been changed).
         * To avoid this deadlock, the caller must have locked cpu hotplug,
         * preventing cpu-hotplug outside of text_mutex locking.
         */
        lockdep_assert_cpus_held();

        /* Optimization is never done while probes are globally disarmed */
        if (kprobes_all_disarmed)
                return;
        if (!kprobes_allow_optimization)
                return;
        if (list_empty(&optimizing_list))
                return;

        arch_optimize_kprobes(&optimizing_list);
}

/*
 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
 * if need) kprobes listed on unoptimizing_list.
 *
 * The arch code is handed both unoptimizing_list and freeing_list; every
 * entry found on freeing_list afterwards is processed below.  The caller
 * must hold text_mutex and the cpu hotplug read lock.
 */
static void do_unoptimize_kprobes(void)
{
        struct optimized_kprobe *op, *tmp;

        lockdep_assert_held(&text_mutex);
        /* See comment in do_optimize_kprobes() */
        lockdep_assert_cpus_held();

        if (!list_empty(&unoptimizing_list))
                arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);

        /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
        list_for_each_entry_safe(op, tmp, &freeing_list, list) {
                /* Switching from detour code to origin */
                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
                /* Disarm probes if marked disabled and not gone */
                if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
                        arch_disarm_kprobe(&op->kp);
                if (kprobe_unused(&op->kp)) {
                        /*
                         * Remove unused probes from hash list. After waiting
                         * for synchronization, these probes are reclaimed.
                         * (reclaiming is done by do_free_cleaned_kprobes.)
                         */
                        hlist_del_rcu(&op->kp.hlist);
                } else
                        /* Still in use: take it off freeing_list again */
                        list_del_init(&op->list);
        }
}

/*
 * Reclaim all kprobes on freeing_list.
 *
 * Every entry is unlinked from the list; entries that are unused are
 * then freed with free_aggr_kprobe().
 */
static void do_free_cleaned_kprobes(void)
{
        struct optimized_kprobe *victim, *next;

        list_for_each_entry_safe(victim, next, &freeing_list, list) {
                list_del_init(&victim->list);
                /*
                 * A probe that is still in use here must not happen; if it
                 * does, warn once and keep it on kprobes hash list instead
                 * of freeing it.
                 */
                if (!WARN_ON_ONCE(!kprobe_unused(&victim->kp)))
                        free_aggr_kprobe(&victim->kp);
        }
}

/*
 * Start optimizer after OPTIMIZE_DELAY passed
 *
 * Queues optimizing_work (which runs kprobe_optimizer()) as delayed work.
 */
static void kick_kprobe_optimizer(void)
{
        schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}

/*
 * Kprobe jump optimizer
 *
 * Work function of optimizing_work.  Locks are taken in the order
 * kprobe_mutex -> cpu hotplug read lock -> text_mutex; the latter two are
 * dropped before the optimizer is re-kicked, kprobe_mutex last.
 */
static void kprobe_optimizer(struct work_struct *work)
{
        mutex_lock(&kprobe_mutex);
        cpus_read_lock();
        mutex_lock(&text_mutex);

        /*
         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
         * kprobes before waiting for quiescence period.
         */
        do_unoptimize_kprobes();

        /*
         * Step 2: Wait for quiescence period to ensure all potentially
         * preempted tasks to have normally scheduled. Because optprobe
         * may modify multiple instructions, there is a chance that Nth
         * instruction is preempted. In that case, such tasks can return
         * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
         * Note that on non-preemptive kernel, this is transparently converted
         * to synchronize_sched() to wait for all interrupts to have completed.
         */
        synchronize_rcu_tasks();

        /* Step 3: Optimize kprobes after quiescence period */
        do_optimize_kprobes();

        /* Step 4: Free cleaned kprobes after quiescence period */
        do_free_cleaned_kprobes();

        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        /* Step 5: Kick optimizer again if needed */
        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
                kick_kprobe_optimizer();

        mutex_unlock(&kprobe_mutex);
}

/*
 * Wait for completing optimization and unoptimization
 *
 * Loops until both optimizing_list and unoptimizing_list are empty.
 * kprobe_mutex is held only while the lists are inspected and is dropped
 * around the flush, since kprobe_optimizer() takes kprobe_mutex itself.
 */
void wait_for_kprobe_optimizer(void)
{
        mutex_lock(&kprobe_mutex);

        while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
                mutex_unlock(&kprobe_mutex);

                /* this will also make optimizing_work execute immediately */
                flush_delayed_work(&optimizing_work);
                /* @optimizing_work might not have been queued yet, relax */
                cpu_relax();

                mutex_lock(&kprobe_mutex);
        }

        mutex_unlock(&kprobe_mutex);
}

/*
 * Return true if @op is currently linked on unoptimizing_list.
 * The list is scanned linearly and compared by pointer identity.
 */
bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
        struct optimized_kprobe *pos;

        list_for_each_entry(pos, &unoptimizing_list, list) {
                if (pos == op)
                        return true;
        }

        return false;
}

/*
 * Optimize kprobe if p is ready to be optimized
 *
 * This does not patch any code itself: on success the probe is flagged
 * KPROBE_FLAG_OPTIMIZED, queued on optimizing_list and the optimizer
 * work is kicked.  If the probe is already flagged and is waiting on
 * unoptimizing_list, that pending unoptimization is cancelled instead.
 */
static void optimize_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        /* Check if the kprobe is disabled or not ready for optimization. */
        if (!kprobe_optready(p) || !kprobes_allow_optimization ||
            (kprobe_disabled(p) || kprobes_all_disarmed))
                return;

        /* kprobes with post_handler can not be optimized */
        if (p->post_handler)
                return;

        op = container_of(p, struct optimized_kprobe, kp);

        /* Check there is no other kprobes at the optimized instructions */
        if (arch_check_optimized_kprobe(op) < 0)
                return;

        /* Check if it is already optimized. */
        if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
                if (optprobe_queued_unopt(op)) {
                        /* This is under unoptimizing. Just dequeue the probe */
                        list_del_init(&op->list);
                }
                return;
        }
        op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

        /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
        if (WARN_ON_ONCE(!list_empty(&op->list)))
                return;

        list_add(&op->list, &optimizing_list);
        kick_kprobe_optimizer();
}

/*
 * Short cut to direct unoptimizing
 *
 * Unoptimizes @op through the arch code right away instead of queueing
 * it, then clears KPROBE_FLAG_OPTIMIZED.  The caller must hold the cpu
 * hotplug read lock.
 */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
        lockdep_assert_cpus_held();
        arch_unoptimize_kprobe(op);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}

/*
 * Unoptimize a kprobe if p is optimized
 *
 * @force selects between immediate and deferred unoptimization:
 *  - probe queued on unoptimizing_list: with @force it is unoptimized now
 *    and moved to freeing_list; without @force it is left queued.
 *  - probe queued on optimizing_list: it is simply dequeued and its
 *    OPTIMIZED flag cleared, regardless of @force.
 *  - probe already optimized and not queued: with @force it is
 *    unoptimized now; otherwise it is queued on unoptimizing_list and the
 *    optimizer is kicked.
 */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
                return; /* This is not an optprobe nor optimized */

        op = container_of(p, struct optimized_kprobe, kp);
        if (!kprobe_optimized(p))
                return;

        if (!list_empty(&op->list)) {
                if (optprobe_queued_unopt(op)) {
                        /* Queued in unoptimizing queue */
                        if (force) {
                                /*
                                 * Forcibly unoptimize the kprobe here, and queue it
                                 * in the freeing list for release afterwards.
                                 */
                                force_unoptimize_kprobe(op);
                                list_move(&op->list, &freeing_list);
                        }
                } else {
                        /* Dequeue from the optimizing queue */
                        list_del_init(&op->list);
                        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
                }
                return;
        }

        /* Optimized kprobe case */
        if (force) {
                /* Forcibly update the code: this is a special case */
                force_unoptimize_kprobe(op);
        } else {
                list_add(&op->list, &unoptimizing_list);
                kick_kprobe_optimizer();
        }
}

/*
 * Cancel unoptimizing for reusing
 *
 * Re-enables the unused aggregator @ap and tries to optimize it again.
 * Returns 0 on success, or -EINVAL when @ap is not ready for
 * optimization (the DISABLED flag has already been cleared by then).
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        struct optimized_kprobe *op = container_of(ap, struct optimized_kprobe, kp);

        /*
         * Unused kprobe MUST be on the way of delayed unoptimizing (means
         * there is still a relative jump) and disabled.
         */
        WARN_ON_ONCE(list_empty(&op->list));

        /* Enable the probe again */
        ap->flags &= ~KPROBE_FLAG_DISABLED;

        /* Optimize it again (remove from op->list) */
        if (kprobe_optready(ap)) {
                optimize_kprobe(ap);
                return 0;
        }

        return -EINVAL;
}

/*
 * Remove optimized instructions
 *
 * Dequeues @p's optimized_kprobe from whichever list it is on, clears
 * KPROBE_FLAG_OPTIMIZED and releases the arch-side optimized-instruction
 * state with arch_remove_optimized_kprobe().
 */
static void kill_optimized_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        op = container_of(p, struct optimized_kprobe, kp);
        if (!list_empty(&op->list))
                /* Dequeue from the (un)optimization queue */
                list_del_init(&op->list);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;

        if (kprobe_unused(p)) {
                /*
                 * Unused kprobe is on unoptimizing or freeing list. We move it
                 * to freeing_list and let the kprobe_optimizer() remove it from
                 * the kprobe hash list and free it.
                 *
                 * NOTE(review): op->list has already been unlinked by the
                 * list_del_init() above (or was empty to begin with), so
                 * optprobe_queued_unopt() looks unable to find it at this
                 * point -- confirm whether this branch can ever be taken.
                 */
                if (optprobe_queued_unopt(op))
                        list_move(&op->list, &freeing_list);
        }

        /* Don't touch the code, because it is already freed. */
        arch_remove_optimized_kprobe(op);
}

/*
 * Ask the arch code to prepare optimized instructions for @op, based on
 * @p.  ftrace-based kprobes are skipped.  The result of the arch call is
 * not checked here.
 */
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
        if (kprobe_ftrace(p))
                return;

        arch_prepare_optimized_kprobe(op, p);
}

/*
 * Try to prepare optimized instructions
 *
 * @p must be the kprobe embedded in an optimized_kprobe.
 */
static void prepare_optimized_kprobe(struct kprobe *p)
{
        __prepare_optimized_kprobe(container_of(p, struct optimized_kprobe, kp),
                                   p);
}

/*
 * Allocate new optimized_kprobe and try to prepare optimized instructions
 *
 * Returns the kprobe embedded in the new, zeroed optimized_kprobe, or
 * NULL when the allocation fails.  Whether the preparation succeeded is
 * not reported here.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = kzalloc(sizeof(*op), GFP_KERNEL);

        if (!op)
                return NULL;

        INIT_LIST_HEAD(&op->list);
        op->kp.addr = p->addr;
        __prepare_optimized_kprobe(op, p);

        return &op->kp;
}

static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);

/*
 * Prepare an optimized_kprobe and optimize it
 * NOTE: p must be a normal registered kprobe
 *
 * Failure is silent: if the aggregator can not be allocated or its
 * optimized instructions can not be prepared, @p is left as it is.
 */
static void try_to_optimize_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;
        struct kprobe *ap;

        /* Impossible to optimize ftrace-based kprobe */
        if (kprobe_ftrace(p))
                return;

        /* For preparing optimization, jump_label_text_reserved() is called */
        cpus_read_lock();
        jump_label_lock();
        mutex_lock(&text_mutex);

        ap = alloc_aggr_kprobe(p);
        if (ap) {
                op = container_of(ap, struct optimized_kprobe, kp);
                if (arch_prepared_optinsn(&op->optinsn)) {
                        init_aggr_kprobe(ap, p);
                        /* This just kicks optimizer thread */
                        optimize_kprobe(ap);
                } else {
                        /* If failed to setup optimizing, fallback to kprobe */
                        arch_remove_optimized_kprobe(op);
                        kfree(op);
                }
        }

        mutex_unlock(&text_mutex);
        jump_label_unlock();
        cpus_read_unlock();
}

static void optimize_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;

        mutex_lock(&kprobe_mutex);
        /* If optimization is already allowed, just return */
        if (kprobes_allow_optimization)
                goto out;

        cpus_read_lock();
        kprobes_allow_optimization = true;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist)
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
        cpus_read_unlock();
        printk(KERN_INFO "Kprobes globally optimized\n");
out:
        mutex_unlock(&kprobe_mutex);
}

#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;

        mutex_lock(&kprobe_mutex);
        /* If optimization is already prohibited, just return */
        if (!kprobes_allow_optimization) {
                mutex_unlock(&kprobe_mutex);
                return;
        }

        cpus_read_lock();
        kprobes_allow_optimization = false;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist) {
                        if (!kprobe_disabled(p))
                                unoptimize_kprobe(p, false);
                }
        }
        cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);

        /* Wait for unoptimizing completion */
        wait_for_kprobe_optimizer();
        printk(KERN_INFO "Kprobes globally unoptimized\n");
}

/* Serializes proc_kprobes_optimization_handler() */
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;

/*
 * sysctl handler for the kprobes optimization switch.
 *
 * sysctl_kprobes_optimization is first synchronized with the current
 * optimizer state and then handed to proc_dointvec_minmax().  Afterwards
 * the resulting value is applied: non-zero enables optimization for all
 * kprobes, zero disables it.  Returns the proc_dointvec_minmax() result.
 */
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
                                      void *buffer, size_t *length,
                                      loff_t *ppos)
{
        int ret;

        mutex_lock(&kprobe_sysctl_mutex);

        if (kprobes_allow_optimization)
                sysctl_kprobes_optimization = 1;
        else
                sysctl_kprobes_optimization = 0;

        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

        if (!sysctl_kprobes_optimization)
                unoptimize_all_kprobes();
        else
                optimize_all_kprobes();

        mutex_unlock(&kprobe_sysctl_mutex);

        return ret;
}
#endif /* CONFIG_SYSCTL */

/*
 * Put a breakpoint for a probe. Must be called with text_mutex locked
 *
 * If @p's address is covered by another probe's optimized region, that
 * probe is forcibly unoptimized first.  After arming, @p itself is
 * offered to the optimizer.
 */
static void __arm_kprobe(struct kprobe *p)
{
        struct kprobe *covering;

        /* Check collision with other optimized kprobes */
        covering = get_optimized_kprobe((unsigned long)p->addr);
        if (unlikely(covering)) {
                /* Fallback to unoptimized kprobe */
                unoptimize_kprobe(covering, true);
        }

        arch_arm_kprobe(p);

        /* Try to optimize (add kprobe to a list) */
        optimize_kprobe(p);
}

/*
 * Remove the breakpoint of a probe. Must be called with text_mutex locked
 *
 * The probe is unoptimized first (forcibly when all kprobes are being
 * disarmed).  If it ends up queued on an (un)optimizing list, the
 * breakpoint is left in place here.  With @reopt set, a probe whose
 * optimized region covers @p's address is offered to the optimizer
 * again.
 */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
        struct kprobe *blocked;

        /* Try to unoptimize */
        unoptimize_kprobe(p, kprobes_all_disarmed);

        /* TODO: reoptimize others after unoptimized this probe */
        if (kprobe_queued(p))
                return;

        arch_disarm_kprobe(p);

        /* If another kprobe was blocked, optimize it. */
        blocked = get_optimized_kprobe((unsigned long)p->addr);
        if (unlikely(blocked) && reopt)
                optimize_kprobe(blocked);
}

#else /* !CONFIG_OPTPROBES */

/*
 * Without CONFIG_OPTPROBES the optimizer entry points compile to nothing,
 * arming/disarming maps straight to the arch breakpoint helpers, and
 * "disarmed" is the same as "disabled".
 */
#define optimize_kprobe(p)                        do {} while (0)
#define unoptimize_kprobe(p, f)                        do {} while (0)
#define kill_optimized_kprobe(p)                do {} while (0)
#define prepare_optimized_kprobe(p)                do {} while (0)
#define try_to_optimize_kprobe(p)                do {} while (0)
#define __arm_kprobe(p)                                arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)                        arch_disarm_kprobe(p)
#define kprobe_disarmed(p)                        kprobe_disabled(p)
#define wait_for_kprobe_optimizer()                do {} while (0)

/*
 * !CONFIG_OPTPROBES variant: reuse is never expected, so this only logs
 * an error and returns -EINVAL.  @ap is not touched.
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        /*
         * If the optimized kprobe is NOT supported, the aggr kprobe is
         * released at the same time that the last aggregated kprobe is
         * unregistered.
         * Thus there should be no chance to reuse unused kprobe.
         */
        printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
        return -EINVAL;
}

/*
 * !CONFIG_OPTPROBES variant: the aggregator is a bare struct kprobe, so
 * release its arch state and free @p itself.
 */
static void free_aggr_kprobe(struct kprobe *p)
{
        arch_remove_kprobe(p);
        kfree(p);
}

/*
 * !CONFIG_OPTPROBES variant: allocate a zeroed, bare struct kprobe to act
 * as the aggregator.  @p is unused.  Returns NULL on allocation failure.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
/* ftrace ops used for kprobes that have no post_handler */
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS,
};

/* ftrace ops used for kprobes that have a post_handler (adds IPMODIFY) */
static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};

/*
 * Number of armed probes per ops; the ops is registered when its count
 * goes 0 -> 1 and unregistered when it goes 1 -> 0.
 */
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;

/*
 * Prepare @p through the arch code, using the ftrace flavour when @p is
 * an ftrace-based kprobe.
 * Must ensure p->addr is really on ftrace
 */
static int prepare_kprobe(struct kprobe *p)
{
        if (kprobe_ftrace(p))
                return arch_prepare_kprobe_ftrace(p);

        return arch_prepare_kprobe(p);
}

/*
 * Add @p's address to the filter of @ops and, if this is the first user
 * counted in *@cnt, register @ops with ftrace.  *@cnt is incremented on
 * success only.  Returns 0 or the error from the failing ftrace call.
 * Caller must lock kprobe_mutex
 */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                               int *cnt)
{
        int ret;

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
        if (ret) {
                pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
                         p->addr, ret);
                return ret;
        }

        if (*cnt == 0) {
                ret = register_ftrace_function(ops);
                if (ret) {
                        pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
                        /*
                         * At this point, since ops is not registered, we
                         * should be safe from registering empty filter.
                         */
                        ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
                        return ret;
                }
        }

        ++*cnt;
        return ret;
}

/*
 * Arm an ftrace-based kprobe.  Probes with a post_handler go through the
 * IPMODIFY ops and its counter, all others through the plain ops.
 */
static int arm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __arm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                           &kprobe_ipmodify_enabled);

        return __arm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                   &kprobe_ftrace_enabled);
}

/*
 * Unregister @ops if @p is the last user counted in *@cnt, decrement the
 * count, and remove @p's address from the filter of @ops.  If the
 * unregistration fails, the count and the filter are left untouched.
 * Returns the result of the last ftrace call made.
 * Caller must lock kprobe_mutex
 */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                                  int *cnt)
{
        int ret;

        if (*cnt == 1) {
                ret = unregister_ftrace_function(ops);
                if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
                        return ret;
        }

        --*cnt;

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
        WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
                  p->addr, ret);

        return ret;
}

/*
 * Disarm an ftrace-based kprobe, using the same ops/counter pair that
 * arm_kprobe_ftrace() selects: IPMODIFY for probes with a post_handler,
 * the plain ops otherwise.
 */
static int disarm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __disarm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                              &kprobe_ipmodify_enabled);

        return __disarm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                      &kprobe_ftrace_enabled);
}
#else        /* !CONFIG_KPROBES_ON_FTRACE */
/* !CONFIG_KPROBES_ON_FTRACE: always use the regular arch preparation. */
static inline int prepare_kprobe(struct kprobe *p)
{
        return arch_prepare_kprobe(p);
}

/* !CONFIG_KPROBES_ON_FTRACE: ftrace-based arming is unavailable. */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}

/* !CONFIG_KPROBES_ON_FTRACE: ftrace-based disarming is unavailable. */
static inline int disarm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}
#endif

/*
 * Arm a kprobe with text_mutex
 *
 * ftrace-based kprobes are armed through ftrace and return its result.
 * All others are armed with __arm_kprobe() under the cpu hotplug read
 * lock and text_mutex, and this path always returns 0.
 */
static int arm_kprobe(struct kprobe *kp)
{
        if (unlikely(kprobe_ftrace(kp)))
                return arm_kprobe_ftrace(kp);

        cpus_read_lock();
        mutex_lock(&text_mutex);
        __arm_kprobe(kp);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * Disarm a kprobe with text_mutex
 *
 * ftrace-based kprobes are disarmed through ftrace and return its result.
 * All others are disarmed with __disarm_kprobe() under the cpu hotplug
 * read lock and text_mutex, and this path always returns 0.  @reopt is
 * passed through to __disarm_kprobe().
 */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
        if (unlikely(kprobe_ftrace(kp)))
                return disarm_kprobe_ftrace(kp);

        cpus_read_lock();
        mutex_lock(&text_mutex);
        __disarm_kprobe(kp, reopt);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * Aggregate handlers for multiple kprobes support - these handlers
 * take care of invoking the individual kprobe handlers on p->list
 *
 * Runs each enabled probe's pre_handler in list order.  As soon as one of
 * them returns non-zero, 1 is returned immediately and kprobe_instance is
 * left pointing at that probe.  Otherwise kprobe_instance is reset after
 * every entry and 0 is returned.
 */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *child;
        int handled;

        list_for_each_entry_rcu(child, &p->list, list) {
                if (child->pre_handler && likely(!kprobe_disabled(child))) {
                        set_kprobe_instance(child);
                        handled = child->pre_handler(child, regs);
                        if (handled)
                                return 1;
                }
                reset_kprobe_instance();
        }

        return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);

/*
 * Run the post_handler of every enabled probe on @p's list, setting
 * kprobe_instance around each call.
 */
static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
                              unsigned long flags)
{
        struct kprobe *child;

        list_for_each_entry_rcu(child, &p->list, list) {
                if (!child->post_handler || unlikely(kprobe_disabled(child)))
                        continue;

                set_kprobe_instance(child);
                child->post_handler(child, regs, flags);
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(aggr_post_handler);

/*
 * Fault handler of an aggregator.  If we faulted "during" the execution
 * of a user specified probe handler (i.e. kprobe_instance is set on this
 * CPU), invoke just that probe's fault handler.
 *
 * Returns 1 when that handler exists and returns non-zero, 0 otherwise.
 */
static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
                              int trapnr)
{
        struct kprobe *active = __this_cpu_read(kprobe_instance);

        if (!active || !active->fault_handler)
                return 0;

        return active->fault_handler(active, regs, trapnr) ? 1 : 0;
}
NOKPROBE_SYMBOL(aggr_fault_handler);

/* Walks the list and increments nmissed count for multiprobe case */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
        struct kprobe *kp;
        if (!kprobe_aggrprobe(p)) {
                p->nmissed++;
        } else {
                list_for_each_entry_rcu(kp, &p->list, list)
                        kp->nmissed++;
        }
        return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);

/*
 * Take @ri off the instance hash table.  If it still has an owning
 * kretprobe, hand it back to that kretprobe's free_instances list under
 * the kretprobe's lock; otherwise free it via kfree_rcu().
 */
static void recycle_rp_inst(struct kretprobe_instance *ri)
{
        struct kretprobe *owner = ri->rp;

        /* remove rp inst off the rprobe_inst_table */
        hlist_del(&ri->hlist);
        INIT_HLIST_NODE(&ri->hlist);

        if (unlikely(!owner)) {
                kfree_rcu(ri, rcu);
                return;
        }

        raw_spin_lock(&owner->lock);
        hlist_add_head(&ri->hlist, &owner->free_instances);
        raw_spin_unlock(&owner->lock);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

/*
 * Lock the kretprobe instance bucket that @tsk hashes to, saving the irq
 * state in *@flags, and return the bucket head through @head.
 */
static void kretprobe_hash_lock(struct task_struct *tsk,
                         struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
        unsigned long bucket = hash_ptr(tsk, KPROBE_HASH_BITS);
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(bucket);

        *head = &kretprobe_inst_table[bucket];

        /*
         * Nested is a workaround that will soon not be needed.
         * There's other protections that make sure the same lock
         * is not taken on the same CPU that lockdep is unaware of.
         * Differentiate when it is taken in NMI context.
         */
        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);

/*
 * Lock the kretprobe instance bucket with index @hash, saving the irq
 * state in *@flags.
 */
static void kretprobe_table_lock(unsigned long hash,
                                 unsigned long *flags)
__acquires(hlist_lock)
{
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        /*
         * Nested is a workaround that will soon not be needed.
         * There's other protections that make sure the same lock
         * is not taken on the same CPU that lockdep is unaware of.
         * Differentiate when it is taken in NMI context.
         */
        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_table_lock);

/*
 * Unlock the kretprobe instance bucket that @tsk hashes to and restore
 * the irq state from *@flags.
 */
static void kretprobe_hash_unlock(struct task_struct *tsk,
                           unsigned long *flags)
__releases(hlist_lock)
{
        raw_spinlock_t *hlist_lock =
                kretprobe_table_lock_ptr(hash_ptr(tsk, KPROBE_HASH_BITS));

        raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_unlock);

/*
 * Unlock the kretprobe instance bucket with index @hash and restore the
 * irq state from *@flags.
 */
static void kretprobe_table_unlock(unsigned long hash,
                                   unsigned long *flags)
__releases(hlist_lock)
{
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);

/*
 * Placeholder kprobe that kprobe_busy_begin() installs as the current
 * kprobe; its address is that of get_kprobe().
 */
static struct kprobe kprobe_busy = {
        .addr = (void *) get_kprobe,
};

void kprobe_busy_begin(void)
{
        struct kprobe_ctlblk *kcb;

        preempt_disable();
        __this_cpu_write(current_kprobe, &kprobe_busy);
        kcb = get_kprobe_ctlblk();
        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
}

/*
 * Undo kprobe_busy_begin(): clear this CPU's current_kprobe and enable
 * preemption again.
 */
void kprobe_busy_end(void)
{
        __this_cpu_write(current_kprobe, NULL);
        preempt_enable();
}

/*
 * This function is called from finish_task_switch when task tk becomes dead,
 * so that we can recycle any function-return probe instances associated
 * with this task. These left over instances represent probed functions
 * that have been called but will never return.
 *
 * Only the hash bucket of @tk is scanned, under that bucket's lock and
 * inside a kprobe_busy_begin()/kprobe_busy_end() section.
 */
void kprobe_flush_task(struct task_struct *tk)
{
        struct kretprobe_instance *inst;
        struct hlist_node *next;
        struct hlist_head *bucket;
        unsigned long hash, flags = 0;

        /* Early boot.  kretprobe_table_locks not yet initialized. */
        if (unlikely(!kprobes_initialized))
                return;

        kprobe_busy_begin();

        hash = hash_ptr(tk, KPROBE_HASH_BITS);
        bucket = &kretprobe_inst_table[hash];

        kretprobe_table_lock(hash, &flags);
        hlist_for_each_entry_safe(inst, next, bucket, hlist) {
                if (inst->task != tk)
                        continue;
                recycle_rp_inst(inst);
        }
        kretprobe_table_unlock(hash, &flags);

        kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);

/* Unlink and kfree() every instance on @rp's free_instances list. */
static inline void free_rp_inst(struct kretprobe *rp)
{
        struct kretprobe_instance *inst;
        struct hlist_node *tmp;

        hlist_for_each_entry_safe(inst, tmp, &rp->free_instances, hlist) {
                hlist_del(&inst->hlist);
                kfree(inst);
        }
}

/*
 * Detach @rp from every instance still linked in kretprobe_inst_table by
 * clearing the instance's ->rp back-pointer, then free the instances that
 * sit on @rp's free list.
 */
static void cleanup_rp_inst(struct kretprobe *rp)
{
        struct kretprobe_instance *inst;
        struct hlist_node *tmp;
        unsigned long irqflags, bucket = 0;

        /* To avoid recursive kretprobe by NMI, set kprobe busy here */
        kprobe_busy_begin();
        while (bucket < KPROBE_TABLE_SIZE) {
                kretprobe_table_lock(bucket, &irqflags);
                hlist_for_each_entry_safe(inst, tmp, &kretprobe_inst_table[bucket], hlist) {
                        if (inst->rp == rp)
                                inst->rp = NULL;
                }
                kretprobe_table_unlock(bucket, &irqflags);
                bucket++;
        }
        kprobe_busy_end();

        free_rp_inst(rp);
}
NOKPROBE_SYMBOL(cleanup_rp_inst);

/* Add the new probe to ap->list */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
        if (p->post_handler)
                unoptimize_kprobe(ap, true);        /* Fall back to normal kprobe */

        list_add_rcu(&p->list, &ap->list);
        if (p->post_handler && !ap->post_handler)
                ap->post_handler = aggr_post_handler;

        return 0;
}

/*
 * Set up the "manager kprobe" @ap from the existing probe @p and put @ap
 * in @p's place in the hash list; @p becomes the first entry on ap->list.
 */
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
        /* Copy p's insn slot to ap */
        copy_kprobe(p, ap);
        flush_insn_slot(ap);

        INIT_HLIST_NODE(&ap->hlist);
        INIT_LIST_HEAD(&ap->list);

        /* The aggregator dispatches through the aggr_* handlers. */
        ap->pre_handler = aggr_pre_handler;
        ap->fault_handler = aggr_fault_handler;
        /* A probe that is already gone does not contribute a post handler. */
        if (!kprobe_gone(p) && p->post_handler)
                ap->post_handler = aggr_post_handler;

        ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
        ap->addr = p->addr;

        /* Link the child before swapping the aggregator into the hlist. */
        list_add_rcu(&p->list, &ap->list);
        hlist_replace_rcu(&p->hlist, &ap->hlist);
}

/*
 * This is the second or subsequent kprobe at the address - handle
 * the intricacies.
 *
 * @orig_p is the probe already present at the address, @p the one being
 * added. Depending on @orig_p's state an aggregator is allocated and
 * initialized from it, an unused aggregator is reused, and a gone
 * aggregator gets a freshly prepared instruction slot. @p is then chained
 * onto the aggregator. After the locks are dropped, a disabled aggregator
 * is re-enabled (and armed unless all kprobes are disarmed) when @p itself
 * is enabled.
 *
 * Returns 0 on success or a negative errno. If arming fails, @p is removed
 * from the aggregator's list again and the aggregator is left disabled.
 */
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
        int ret = 0;
        struct kprobe *ap = orig_p;

        cpus_read_lock();

        /* For preparing optimization, jump_label_text_reserved() is called */
        jump_label_lock();
        mutex_lock(&text_mutex);

        if (!kprobe_aggrprobe(orig_p)) {
                /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
                ap = alloc_aggr_kprobe(orig_p);
                if (!ap) {
                        ret = -ENOMEM;
                        goto out;
                }
                init_aggr_kprobe(ap, orig_p);
        } else if (kprobe_unused(ap)) {
                /* This probe is going to die. Rescue it */
                ret = reuse_unused_kprobe(ap);
                if (ret)
                        goto out;
        }

        if (kprobe_gone(ap)) {
                /*
                 * Attempting to insert a new probe at the same location
                 * that had a probe in the module vaddr area which has
                 * already been freed. So, the instruction slot has already
                 * been released. We need a new slot for the new probe.
                 */
                ret = arch_prepare_kprobe(ap);
                if (ret)
                        /*
                         * Even if we fail to allocate a new slot, there is
                         * no need to free the aggr_probe. It will be used
                         * next time, or freed by unregister_kprobe.
                         */
                        goto out;

                /* Prepare optimized instructions if possible. */
                prepare_optimized_kprobe(ap);

                /*
                 * Clear gone flag to prevent allocating new slot again, and
                 * set disabled flag because it is not armed yet.
                 */
                ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
                            | KPROBE_FLAG_DISABLED;
        }

        /* Copy ap's insn slot to p */
        copy_kprobe(ap, p);
        ret = add_new_kprobe(ap, p);

out:
        /* Release in the reverse order of acquisition. */
        mutex_unlock(&text_mutex);
        jump_label_unlock();
        cpus_read_unlock();

        /* ret is checked first, so ap is not dereferenced on the error paths. */
        if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
                ap->flags &= ~KPROBE_FLAG_DISABLED;
                if (!kprobes_all_disarmed) {
                        /* Arm the breakpoint again. */
                        ret = arm_kprobe(ap);
                        if (ret) {
                                /* Roll back: disable ap and unlink p again. */
                                ap->flags |= KPROBE_FLAG_DISABLED;
                                list_del_rcu(&p->list);
                                synchronize_rcu();
                        }
                }
        }
        return ret;
}

/*
 * Weak default: report whether @addr lies in the half-open range
 * [__kprobes_text_start, __kprobes_text_end).
 */
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
        unsigned long start = (unsigned long)__kprobes_text_start;
        unsigned long end = (unsigned long)__kprobes_text_end;

        /* The __kprobes marked functions and entry code must not be probed */
        if (addr < start)
                return false;
        return addr < end;
}

/*
 * Return true when @addr is rejected by the arch-specific check or falls
 * inside [start_addr, end_addr) of any entry on kprobe_blacklist.
 */
static bool __within_kprobe_blacklist(unsigned long addr)
{
        struct kprobe_blacklist_entry *entry;

        if (arch_within_kprobe_blacklist(addr))
                return true;

        /*
         * Walk the registered blacklist (it may be empty) so that probe
         * registration in a prohibited area is refused.
         */
        list_for_each_entry(entry, &kprobe_blacklist, list) {
                if (addr < entry->start_addr)
                        continue;
                if (addr < entry->end_addr)
                        return true;
        }
        return false;
}

/*
 * Check whether @addr is blacklisted for probing.
 *
 * Besides the direct check, a symbol whose name contains a '.' suffix is
 * truncated at the first '.' and the address of the resulting base symbol
 * is checked as well.
 */
bool within_kprobe_blacklist(unsigned long addr)
{
        char symname[KSYM_NAME_LEN];
        unsigned long base;
        char *suffix;

        if (__within_kprobe_blacklist(addr))
                return true;

        /* Check if the address is on a suffixed-symbol */
        if (lookup_symbol_name(addr, symname))
                return false;

        suffix = strchr(symname, '.');
        if (!suffix)
                return false;

        /* Cut the suffix off and look the base symbol up instead. */
        *suffix = '\0';
        base = (unsigned long)kprobe_lookup_name(symname, 0);
        if (!base)
                return false;

        return __within_kprobe_blacklist(base);
}

/*
 * Compute the probe address from either a raw @addr or a @symbol_name,
 * plus @offset. This lets callers specify an address relative to a symbol.
 *
 * Exactly one of @addr and @symbol_name must be given. Returns the
 * resulting address, ERR_PTR(-ENOENT) if the symbol lookup fails, or
 * ERR_PTR(-EINVAL) for an invalid parameter combination or a NULL result.
 */
static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
                        const char *symbol_name, unsigned int offset)
{
        /* Both set or both unset is an invalid combination. */
        if (!symbol_name == !addr)
                return ERR_PTR(-EINVAL);

        if (symbol_name) {
                addr = kprobe_lookup_name(symbol_name, offset);
                if (!addr)
                        return ERR_PTR(-ENOENT);
        }

        addr = (kprobe_opcode_t *)((char *)addr + offset);
        if (!addr)
                return ERR_PTR(-EINVAL);

        return addr;
}

/*
 * Resolve the probe address of @p from its addr, symbol_name and offset
 * fields. Returns whatever _kprobe_addr() returns: the address or an
 * ERR_PTR-encoded error.
 */
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
        return _kprobe_addr(p->addr, p->symbol_name, p->offset);
}

/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        lockdep_assert_held(&kprobe_mutex);

        ap = get_kprobe(p->addr);
        if (unlikely(!ap))
                return NULL;

        if (p != ap) {
                list_for_each_entry(list_p, &ap->list, list)
                        if (list_p == p)
                        /* kprobe p is a valid probe */
                                goto valid;
                return NULL;
        }
valid:
        return ap;
}

/*
 * Return -EINVAL if @p is already registered (i.e. this would be a
 * re-registration), 0 otherwise. Takes kprobe_mutex for the lookup.
 */
static inline int check_kprobe_rereg(struct kprobe *p)
{
        struct kprobe *found;

        mutex_lock(&kprobe_mutex);
        found = __get_valid_kprobe(p);
        mutex_unlock(&kprobe_mutex);

        return found ? -EINVAL : 0;
}

/*
 * Weak default check for a probe placed exactly on an ftrace location.
 *
 * If ftrace_location() maps p->addr to itself, the probe is flagged
 * KPROBE_FLAG_FTRACE when CONFIG_KPROBES_ON_FTRACE is set and rejected
 * with -EINVAL otherwise. Any other address returns 0 untouched.
 */
int __weak arch_check_ftrace_location(struct kprobe *p)
{
        unsigned long ip = (unsigned long)p->addr;

        if (ftrace_location(ip) != ip)
                return 0;

#ifdef CONFIG_KPROBES_ON_FTRACE
        p->flags |= KPROBE_FLAG_FTRACE;
        return 0;
#else        /* !CONFIG_KPROBES_ON_FTRACE */
        return -EINVAL;
#endif
}

static bool is_cfi_preamble_symbol(unsigned long addr)
{
        char symbuf[KSYM_NAME_LEN];

        if (lookup_symbol_name(addr, symbuf))
                return false;

        return str_has_prefix(symbuf, "__cfi_") ||
                str_has_prefix(symbuf, "__pfx_");
}

/*
 * Validate that p->addr may be probed.
 *
 * The address must pass arch_check_ftrace_location(), be core kernel text
 * or module text, and must not be in any of the reserved areas checked
 * below (gate area, kprobe blacklist, jump label / static call text, a BUG
 * entry, or a __cfi_/__pfx_ preamble symbol).
 *
 * *@probed_mod is set to the owning module, or NULL for core kernel text.
 * On success with a module, a reference has been taken with
 * try_module_get(); register_kprobe() drops it with module_put().
 *
 * Returns 0 if the address is acceptable, -EINVAL for a rejected address,
 * or -ENOENT if the module reference cannot be taken or the address is in
 * module init text while the module is not in MODULE_STATE_COMING.
 */
static int check_kprobe_address_safe(struct kprobe *p,
                                     struct module **probed_mod)
{
        int ret;

        ret = arch_check_ftrace_location(p);
        if (ret)
                return ret;
        jump_label_lock();
        preempt_disable();

        /* Ensure the address is in a text area, and find a module if exists. */
        *probed_mod = NULL;
        if (!core_kernel_text((unsigned long) p->addr)) {
                *probed_mod = __module_text_address((unsigned long) p->addr);
                if (!(*probed_mod)) {
                        ret = -EINVAL;
                        goto out;
                }
        }
        /* Ensure it is not in reserved area. */
        if (in_gate_area_no_mm((unsigned long) p->addr) ||
            within_kprobe_blacklist((unsigned long) p->addr) ||
            jump_label_text_reserved(p->addr, p->addr) ||
            static_call_text_reserved(p->addr, p->addr) ||
            find_bug((unsigned long)p->addr) ||
            is_cfi_preamble_symbol((unsigned long)p->addr)) {
                ret = -EINVAL;
                goto out;
        }

        /* Get module refcount and reject __init functions for loaded modules. */
        if (*probed_mod) {
                /*
                 * We must hold a refcount of the probed module while updating
                 * its code to prohibit unexpected unloading.
                 */
                if (unlikely(!try_module_get(*probed_mod))) {
                        ret = -ENOENT;
                        goto out;
                }

                /*
                 * If the module freed .init.text, we couldn't insert
                 * kprobes in there.
                 */
                if (within_module_init((unsigned long)p->addr, *probed_mod) &&
                    (*probed_mod)->state != MODULE_STATE_COMING) {
                        /* Drop the reference taken just above. */
                        module_put(*probed_mod);
                        *probed_mod = NULL;
                        ret = -ENOENT;
                }
        }
out:
        preempt_enable();
        jump_label_unlock();

        return ret;
}

/*
 * Register the kprobe @p.
 *
 * The probe address is resolved from p->addr / p->symbol_name / p->offset
 * and stored in p->addr. Re-registration of an already registered probe
 * and unsafe addresses are rejected. If another probe already exists at
 * the address, @p is attached through register_aggr_kprobe(); otherwise
 * @p is prepared, added to kprobe_table, armed (unless disabled or all
 * kprobes are disarmed) and offered for optimization.
 *
 * Returns 0 on success or a negative errno.
 */
int register_kprobe(struct kprobe *p)
{
        int ret;
        struct kprobe *old_p;
        struct module *probed_mod;
        kprobe_opcode_t *addr;

        /* Adjust probe address from symbol */
        addr = kprobe_addr(p);
        if (IS_ERR(addr))
                return PTR_ERR(addr);
        p->addr = addr;

        ret = check_kprobe_rereg(p);
        if (ret)
                return ret;

        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
        p->flags &= KPROBE_FLAG_DISABLED;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);

        /* On success this may leave a module reference in probed_mod. */
        ret = check_kprobe_address_safe(p, &probed_mod);
        if (ret)
                return ret;

        mutex_lock(&kprobe_mutex);

        old_p = get_kprobe(p->addr);
        if (old_p) {
                /* Since this may unoptimize old_p, locking text_mutex. */
                ret = register_aggr_kprobe(old_p, p);
                goto out;
        }

        cpus_read_lock();
        /* Prevent text modification */
        mutex_lock(&text_mutex);
        ret = prepare_kprobe(p);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
        if (ret)
                goto out;

        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);

        if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
                ret = arm_kprobe(p);
                if (ret) {
                        /* Arming failed: take p back out of the table. */
                        hlist_del_rcu(&p->hlist);
                        synchronize_rcu();
                        goto out;
                }
        }

        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);
out:
        mutex_unlock(&kprobe_mutex);

        /* Drop the module reference taken by check_kprobe_address_safe(). */
        if (probed_mod)
                module_put(probed_mod);

        return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);

/* Check if all probes on the aggrprobe are disabled */
static int aggr_kprobe_disabled(struct kprobe *ap)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        list_for_each_entry(kp, &ap->list, list)
                if (!kprobe_disabled(kp))
                        /*
                         * There is an active probe on the list.
                         * We can't disable this ap.
                         */
                        return 0;

        return 1;
}

/*
 * Disable one kprobe. Must be called with kprobe_mutex held.
 *
 * Returns the kprobe registered in the table for @p (either @p itself or
 * its aggregator), ERR_PTR(-EINVAL) if @p is not a registered probe, or
 * an ERR_PTR carrying the disarm_kprobe() error. On a disarm failure the
 * DISABLED flag is cleared from @p again.
 */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
        struct kprobe *orig_p;
        int ret;

        /* Get an original kprobe for return */
        orig_p = __get_valid_kprobe(p);
        if (unlikely(orig_p == NULL))
                return ERR_PTR(-EINVAL);

        if (!kprobe_disabled(p)) {
                /* Disable probe if it is a child probe */
                if (p != orig_p)
                        p->flags |= KPROBE_FLAG_DISABLED;

                /* Try to disarm and disable this/parent probe */
                if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
                        /*
                         * Don't be lazy here.  Even if 'kprobes_all_disarmed'
                         * is false, 'orig_p' might not have been armed yet.
                         * Note arm_all_kprobes() __tries__ to arm all kprobes
                         * on the best effort basis.
                         */
                        if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
                                ret = disarm_kprobe(orig_p, true);
                                if (ret) {
                                        /* Undo the child disable done above. */
                                        p->flags &= ~KPROBE_FLAG_DISABLED;
                                        return ERR_PTR(ret);
                                }
                        }
                        orig_p->flags |= KPROBE_FLAG_DISABLED;
                }
        }

        return orig_p;
}

/*
 * Unregister a kprobe without a scheduler synchronization.
 *
 * First half of unregistration: disables @p and then either removes the
 * table entry from the hash list (standalone probe, or last child of a
 * disarmed aggrprobe) or just unlinks @p from its aggrprobe's list.
 * Returns 0 on success or the error from __disable_kprobe().
 */
static int __unregister_kprobe_top(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        /* Disable kprobe. This will disarm it if needed. */
        ap = __disable_kprobe(p);
        if (IS_ERR(ap))
                return PTR_ERR(ap);

        if (ap == p)
                /*
                 * This probe is an independent(and non-optimized) kprobe
                 * (not an aggrprobe). Remove from the hash list.
                 */
                goto disarmed;

        /* Following process expects this probe is an aggrprobe */
        WARN_ON(!kprobe_aggrprobe(ap));

        if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
                /*
                 * !disarmed could happen if the probe is under delayed
                 * unoptimizing.
                 */
                goto disarmed;
        else {
                /* If disabling probe has special handlers, update aggrprobe */
                if (p->post_handler && !kprobe_gone(p)) {
                        /* Keep ap's post_handler if another child still has one. */
                        list_for_each_entry(list_p, &ap->list, list) {
                                if ((list_p != p) && (list_p->post_handler))
                                        goto noclean;
                        }
                        /*
                         * For the kprobe-on-ftrace case, we keep the
                         * post_handler setting to identify this aggrprobe
                         * armed with kprobe_ipmodify_ops.
                         */
                        if (!kprobe_ftrace(ap))
                                ap->post_handler = NULL;
                }
noclean:
                /*
                 * Remove from the aggrprobe: this path will do nothing in
                 * __unregister_kprobe_bottom().
                 */
                list_del_rcu(&p->list);
                if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
                        /*
                         * Try to optimize this probe again, because post
                         * handler may have been changed.
                         */
                        optimize_kprobe(ap);
        }
        return 0;

disarmed:
        hlist_del_rcu(&ap->hlist);
        return 0;
}

/*
 * Second half of unregistration. Depending on the state of p->list:
 *  - empty:    @p was a standalone probe, release its arch resources;
 *  - singular: @p was the last child, unlink it and free the aggrprobe;
 *  - other:    nothing is left to do.
 */
static void __unregister_kprobe_bottom(struct kprobe *p)
{
        struct kprobe *aggr;

        /* This is an independent kprobe */
        if (list_empty(&p->list)) {
                arch_remove_kprobe(p);
                return;
        }

        if (!list_is_singular(&p->list))
                return;

        /* This is the last child of an aggrprobe */
        aggr = list_entry(p->list.next, struct kprobe, list);
        list_del(&p->list);
        free_aggr_kprobe(aggr);
}

/*
 * Register @num kprobes from the array @kps, all or nothing.
 *
 * If one registration fails, the probes registered so far are
 * unregistered and that error is returned. Returns -EINVAL for
 * @num <= 0, otherwise the result of the last register_kprobe() call.
 */
int register_kprobes(struct kprobe **kps, int num)
{
        int idx, err = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                err = register_kprobe(kps[idx]);
                if (err >= 0)
                        continue;

                /* Roll back everything registered before the failure. */
                if (idx > 0)
                        unregister_kprobes(kps, idx);
                return err;
        }
        return err;
}
EXPORT_SYMBOL_GPL(register_kprobes);

/* Unregister the single kprobe @p via unregister_kprobes(). */
void unregister_kprobe(struct kprobe *p)
{
        unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);

/*
 * Unregister @num kprobes from the array @kps.
 *
 * Phase one runs under kprobe_mutex; a probe whose top half fails gets
 * its ->addr cleared so that phase two skips it. After synchronize_rcu()
 * the bottom half runs for every probe that still has an address.
 */
void unregister_kprobes(struct kprobe **kps, int num)
{
        int idx;

        if (num <= 0)
                return;

        mutex_lock(&kprobe_mutex);
        for (idx = 0; idx < num; idx++) {
                struct kprobe *kp = kps[idx];

                if (__unregister_kprobe_top(kp) < 0)
                        kp->addr = NULL;
        }
        mutex_unlock(&kprobe_mutex);

        synchronize_rcu();
        for (idx = 0; idx < num; idx++) {
                struct kprobe *kp = kps[idx];

                if (!kp->addr)
                        continue;
                __unregister_kprobe_bottom(kp);
        }
}
EXPORT_SYMBOL_GPL(unregister_kprobes);

/*
 * Weak default exception notifier callback: ignores all arguments and
 * returns NOTIFY_DONE.
 * NOTE(review): presumably overridden by architectures that need it.
 */
int __weak kprobe_exceptions_notify(struct notifier_block *self,
                                        unsigned long val, void *data)
{
        return NOTIFY_DONE;
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);

/*
 * Notifier block that dispatches to kprobe_exceptions_notify(), using the
 * largest positive 32-bit priority value.
 */
static struct notifier_block kprobe_exceptions_nb = {
        .notifier_call = kprobe_exceptions_notify,
        .priority = 0x7fffffff /* we need to be notified first */
};

/* Weak default: the entry point is simply the pointer value of @entry. */
unsigned long __weak arch_deref_entry_point(void *entry)
{
        unsigned long ip = (unsigned long)entry;

        return ip;
}

#ifdef CONFIG_KRETPROBES

/*
 * Core of the kretprobe trampoline.
 *
 * Scans current's bucket of the kretprobe instance table for the instances
 * whose saved frame pointer equals @frame_pointer, determines the real
 * return address (the first ret_addr that is not @trampoline_address),
 * runs the handler of each matching instance with ret_addr fixed up, and
 * recycles those instances.
 *
 * Returns the real return address as an unsigned long. BUGs if no real
 * return address can be found.
 */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                             void *trampoline_address,
                                             void *frame_pointer)
{
        struct kretprobe_instance *ri = NULL, *last = NULL;
        struct hlist_head *head;
        struct hlist_node *tmp;
        unsigned long flags;
        kprobe_opcode_t *correct_ret_addr = NULL;
        bool skipped = false;

        kretprobe_hash_lock(current, &head, &flags);

        /*
         * It is possible to have multiple instances associated with a given
         * task either because multiple functions in the call path have
         * return probes installed on them, and/or more than one
         * return probe was registered for a target function.
         *
         * We can handle this because:
         *     - instances are always pushed into the head of the list
         *     - when multiple return probes are registered for the same
         *         function, the (chronologically) first instance's ret_addr
         *         will be the real return address, and all the rest will
         *         point to kretprobe_trampoline.
         */
        hlist_for_each_entry(ri, head, hlist) {
                if (ri->task != current)
                        /* another task is sharing our hash bucket */
                        continue;
                /*
                 * Return probes must be pushed on this hash list in the
                 * correct order (same as return order) so that they can be
                 * popped correctly. However, if we find one pushed in the
                 * wrong order, this means we found a function which should
                 * not be probed, because the wrong order entry is pushed on
                 * the path of processing another kretprobe itself.
                 */
                if (ri->fp != frame_pointer) {
                        if (!skipped)
                                pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
                        skipped = true;
                        continue;
                }

                correct_ret_addr = ri->ret_addr;
                /*
                 * ri->rp is cleared by cleanup_rp_inst() when the kretprobe
                 * is unregistered while the instance is still in flight, so
                 * it must be checked before being dereferenced here, just
                 * like in the handler loop below.
                 */
                if (skipped)
                        pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
                                ri->rp ? ri->rp->kp.addr : NULL);

                if (correct_ret_addr != trampoline_address)
                        /*
                         * This is the real return address. Any other
                         * instances associated with this task are for
                         * other calls deeper on the call stack
                         */
                        break;
        }

        BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address));
        /* Remember where the first pass stopped; the second pass ends there. */
        last = ri;

        hlist_for_each_entry_safe(ri, tmp, head, hlist) {
                if (ri->task != current)
                        /* another task is sharing our hash bucket */
                        continue;
                if (ri->fp != frame_pointer)
                        continue;

                if (ri->rp && ri->rp->handler) {
                        struct kprobe *prev = kprobe_running();

                        /* Run the user handler with this kretprobe as current. */
                        __this_cpu_write(current_kprobe, &ri->rp->kp);
                        ri->ret_addr = correct_ret_addr;
                        ri->rp->handler(ri, regs);
                        __this_cpu_write(current_kprobe, prev);
                }

                recycle_rp_inst(ri);

                if (ri == last)
                        break;
        }

        kretprobe_hash_unlock(current, &flags);

        return (unsigned long)correct_ret_addr;
}
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)

/*
 * This kprobe pre_handler is registered with every kretprobe. When the
 * probe hits, it sets up the return probe.
 *
 * A free instance is taken from rp->free_instances under rp->lock. If the
 * optional entry_handler returns non-zero, the instance is put back and no
 * return probe is set up. Otherwise arch_prepare_kretprobe() is called and
 * the instance is added to current's bucket of kretprobe_inst_table. With
 * no free instance available, rp->nmissed is incremented. Always returns 0.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
        unsigned long hash, flags = 0;
        struct kretprobe_instance *ri;

        /* TODO: consider to only swap the RA after the last pre_handler fired */
        hash = hash_ptr(current, KPROBE_HASH_BITS);
        /*
         * Nested is a workaround that will soon not be needed.
         * There's other protections that make sure the same lock
         * is not taken on the same CPU that lockdep is unaware of.
         */
        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
        if (!hlist_empty(&rp->free_instances)) {
                ri = hlist_entry(rp->free_instances.first,
                                struct kretprobe_instance, hlist);
                hlist_del(&ri->hlist);
                raw_spin_unlock_irqrestore(&rp->lock, flags);

                ri->rp = rp;
                ri->task = current;

                if (rp->entry_handler && rp->entry_handler(ri, regs)) {
                        /* Entry handler declined: return the instance to the pool. */
                        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
                        hlist_add_head(&ri->hlist, &rp->free_instances);
                        raw_spin_unlock_irqrestore(&rp->lock, flags);
                        return 0;
                }

                arch_prepare_kretprobe(ri, regs);

                /* XXX(hch): why is there no hlist_move_head? */
                INIT_HLIST_NODE(&ri->hlist);
                kretprobe_table_lock(hash, &flags);
                hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
                kretprobe_table_unlock(hash, &flags);
        } else {
                /* No free instance: count the miss. */
                rp->nmissed++;
                raw_spin_unlock_irqrestore(&rp->lock, flags);
        }
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

/* Weak default: only a zero @offset counts as the function entry. */
bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
        return offset == 0;
}

/**
 * kprobe_on_func_entry() -- check whether given address is function entry
 * @addr: Target address
 * @sym:  Target symbol name
 * @offset: The offset from the symbol or the address
 *
 * Checks whether @addr+@offset or @sym+@offset is a function entry
 * address.
 * Returns 0 if it is the function entry and -EINVAL if it is not.
 * Returns -ENOENT if the symbol or address lookup fails.
 * The caller must pass exactly one of @addr and @sym (the other must be
 * NULL); otherwise -EINVAL is returned.
 */
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
        kprobe_opcode_t *target = _kprobe_addr(addr, sym, offset);
        unsigned long sym_off = offset;

        if (IS_ERR(target))
                return PTR_ERR(target);

        /* Ask kallsyms how far the target is from its symbol's start. */
        if (!kallsyms_lookup_size_offset((unsigned long)target, NULL, &sym_off))
                return -ENOENT;

        return arch_kprobe_on_func_entry(sym_off) ? 0 : -EINVAL;
}

/*
 * Register the function-return probe @rp.
 *
 * The target must be a function entry, must not already be registered and
 * must not appear in kretprobe_blacklist. rp->maxactive instances (a
 * default is chosen when it is <= 0) are pre-allocated on
 * rp->free_instances before the entry kprobe is registered.
 *
 * Returns 0 on success or a negative errno: -E2BIG when rp->data_size
 * exceeds KRETPROBE_MAX_DATA_SIZE, -ENOMEM on allocation failure, or the
 * error of the failing check / register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
        struct kretprobe_instance *ri;
        void *probe_addr;
        int err;
        int n;

        err = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
        if (err)
                return err;

        /* If only rp->kp.addr is specified, check reregistering kprobes */
        if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
                return -EINVAL;

        if (kretprobe_blacklist_size) {
                probe_addr = kprobe_addr(&rp->kp);
                if (IS_ERR(probe_addr))
                        return PTR_ERR(probe_addr);

                /* The blacklist is terminated by an entry with a NULL name. */
                for (n = 0; kretprobe_blacklist[n].name; n++) {
                        if (kretprobe_blacklist[n].addr == probe_addr)
                                return -EINVAL;
                }
        }

        if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
                return -E2BIG;

        rp->kp.pre_handler = pre_handler_kretprobe;
        rp->kp.post_handler = NULL;
        rp->kp.fault_handler = NULL;

        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
                rp->maxactive = num_possible_cpus();
#endif
        }
        raw_spin_lock_init(&rp->lock);
        INIT_HLIST_HEAD(&rp->free_instances);
        for (n = 0; n < rp->maxactive; n++) {
                ri = kmalloc(sizeof(struct kretprobe_instance) +
                             rp->data_size, GFP_KERNEL);
                if (!ri) {
                        /* Release whatever was allocated so far. */
                        free_rp_inst(rp);
                        return -ENOMEM;
                }
                INIT_HLIST_NODE(&ri->hlist);
                hlist_add_head(&ri->hlist, &rp->free_instances);
        }

        rp->nmissed = 0;
        /* Establish function entry probe point */
        err = register_kprobe(&rp->kp);
        if (err)
                free_rp_inst(rp);
        return err;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/*
 * Register @num kretprobes from the array @rps, all or nothing.
 *
 * On the first failure the probes registered so far are unregistered and
 * that error is returned. Returns -EINVAL for @num <= 0, otherwise the
 * result of the last register_kretprobe() call.
 */
int register_kretprobes(struct kretprobe **rps, int num)
{
        int idx, err = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                err = register_kretprobe(rps[idx]);
                if (err >= 0)
                        continue;

                /* Roll back everything registered before the failure. */
                if (idx > 0)
                        unregister_kretprobes(rps, idx);
                return err;
        }
        return err;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Unregister the single kretprobe @rp via unregister_kretprobes(). */
void unregister_kretprobe(struct kretprobe *rp)
{
        unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

void unregister_kretprobes(struct kretprobe **rps, int num)
{
        int i;

        if (num <= 0)
                return;
        mutex_lock(&kprobe_mutex);
        for (i = 0; i < num; i++)
                if (__unregister_kprobe_top(&rps[i]->kp) < 0)
                        rps[i]->kp.addr = NULL;
        mutex_unlock(&kprobe_mutex);

        synchronize_rcu();
        for (i = 0; i < num; i++) {
                if (rps[i]->kp.addr) {
                        __unregister_kprobe_bottom(&rps[i]->kp);
                        cleanup_rp_inst(rps[i]);
                }
        }
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

#else /* CONFIG_KRETPROBES */
/* Stub for !CONFIG_KRETPROBES: registration always fails with -ENOSYS. */
int register_kretprobe(struct kretprobe *rp)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/* Stub for !CONFIG_KRETPROBES: batch registration always fails with -ENOSYS. */
int register_kretprobes(struct kretprobe **rps, int num)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Stub for !CONFIG_KRETPROBES: nothing can be registered, so this is a no-op. */
void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

/* Stub for !CONFIG_KRETPROBES: nothing can be registered, so this is a no-op. */
void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

/* Stub for !CONFIG_KRETPROBES: does nothing and returns 0. */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

#endif /* CONFIG_KRETPROBES */

/*
 * Set the kprobe gone and remove its instruction buffer.
 *
 * Marks @p (and, for an aggrprobe, every chained probe) with
 * KPROBE_FLAG_GONE, releases the arch resources of @p and, for an enabled
 * ftrace-based probe, disarms it. Warns once and returns if @p is already
 * gone. Caller holds kprobe_mutex.
 */
static void kill_kprobe(struct kprobe *p)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        if (WARN_ON_ONCE(kprobe_gone(p)))
                return;

        p->flags |= KPROBE_FLAG_GONE;
        if (kprobe_aggrprobe(p)) {
                /*
                 * If this is an aggr_kprobe, we have to list all the
                 * chained probes and mark them GONE.
                 */
                list_for_each_entry(kp, &p->list, list)
                        kp->flags |= KPROBE_FLAG_GONE;
                p->post_handler = NULL;
                kill_optimized_kprobe(p);
        }
        /*
         * Here, we can remove insn_slot safely, because no thread calls
         * the original probed function (which will be freed soon) any more.
         */
        arch_remove_kprobe(p);

        /*
         * The module is going away. We should disarm the kprobe which
         * is using ftrace, because ftrace framework is still available at
         * MODULE_STATE_GOING notification.
         */
        if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
                disarm_kprobe_ftrace(p);
}

/*
 * Disable the kprobe @kp under kprobe_mutex.
 * Returns 0 on success or the negative errno reported by
 * __disable_kprobe().
 */
int disable_kprobe(struct kprobe *kp)
{
        struct kprobe *orig;
        int err;

        mutex_lock(&kprobe_mutex);
        orig = __disable_kprobe(kp);
        err = IS_ERR(orig) ? PTR_ERR(orig) : 0;
        mutex_unlock(&kprobe_mutex);

        return err;
}
EXPORT_SYMBOL_GPL(disable_kprobe);

/*
 * Enable one kprobe.
 *
 * Clears KPROBE_FLAG_DISABLED on @kp (when it is a child probe) and, unless
 * all kprobes are disarmed, on its disabled table entry, which is then
 * armed. If arming fails, the DISABLED flags are set again.
 *
 * Returns 0 on success, -EINVAL if @kp is not registered or is gone, or
 * the error from arm_kprobe().
 */
int enable_kprobe(struct kprobe *kp)
{
        int ret = 0;
        struct kprobe *p;

        mutex_lock(&kprobe_mutex);

        /* Check whether specified probe is valid. */
        p = __get_valid_kprobe(kp);
        if (unlikely(p == NULL)) {
                ret = -EINVAL;
                goto out;
        }

        if (kprobe_gone(kp)) {
                /* This kprobe has gone, we couldn't enable it. */
                ret = -EINVAL;
                goto out;
        }

        /* kp is a child of the table entry p: enable the child itself. */
        if (p != kp)
                kp->flags &= ~KPROBE_FLAG_DISABLED;

        if (!kprobes_all_disarmed && kprobe_disabled(p)) {
                p->flags &= ~KPROBE_FLAG_DISABLED;
                ret = arm_kprobe(p);
                if (ret) {
                        /* Arming failed: restore the DISABLED flags. */
                        p->flags |= KPROBE_FLAG_DISABLED;
                        if (p != kp)
                                kp->flags |= KPROBE_FLAG_DISABLED;
                }
        }
out:
        mutex_unlock(&kprobe_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);

/*
 * Print the symbol name, offset and address of @kp at error level.
 * Callers must NOT use this on a usual path; it is only for critical cases.
 */
void dump_kprobe(struct kprobe *kp)
{
        pr_err("Dumping kprobe:\n");
        pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
               kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);

/*
 * Blacklist the symbol range that begins at @entry.
 *
 * The range is [@entry, @entry + size), where size is what
 * kallsyms_lookup_size_offset() reports for @entry.
 *
 * Returns the symbol size (cast to int) on success, -EINVAL when @entry is
 * not a kernel text address or the kallsyms lookup fails, and -ENOMEM when
 * the list entry cannot be allocated.
 */
int kprobe_add_ksym_blacklist(unsigned long entry)
{
        unsigned long sym_size = 0, sym_offset = 0;
        struct kprobe_blacklist_entry *node;

        if (!kernel_text_address(entry))
                return -EINVAL;
        if (!kallsyms_lookup_size_offset(entry, &sym_size, &sym_offset))
                return -EINVAL;

        node = kmalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;

        node->start_addr = entry;
        node->end_addr = entry + sym_size;
        INIT_LIST_HEAD(&node->list);
        list_add_tail(&node->list, &kprobe_blacklist);

        return (int)sym_size;
}

/*
 * Add all symbols in the area [@start, @end) to the kprobe blacklist.
 *
 * Walks the area symbol by symbol, advancing by the size that
 * kprobe_add_ksym_blacklist() reports. Returns 0 on success or the first
 * negative error code encountered.
 */
int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
{
        unsigned long addr = start;
        int step;

        while (addr < end) {
                step = kprobe_add_ksym_blacklist(addr);
                if (step < 0)
                        return step;
                /* A zero size (alias symbol) must still make progress. */
                addr += step ? step : 1;
        }

        return 0;
}

/*
 * Remove from the kprobe blacklist, and free, every entry whose start
 * address lies in [@start, @end).
 */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
        struct kprobe_blacklist_entry *pos, *tmp;

        list_for_each_entry_safe(pos, tmp, &kprobe_blacklist, list) {
                if (pos->start_addr >= start && pos->start_addr < end) {
                        list_del(&pos->list);
                        kfree(pos);
                }
        }
}

/*
 * Remove the blacklist entries that start exactly at @entry, expressed as
 * the one-byte area [@entry, @entry + 1).
 */
static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
        const unsigned long area_end = entry + 1;

        kprobe_remove_area_blacklist(entry, area_end);
}

/*
 * Weak default for the arch-specific kallsym lookup: it provides no
 * symbols and always returns -ERANGE. An architecture may override it.
 */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                                   char *type, char *sym)
{
        return -ERANGE;
}

/*
 * Look up kallsym number @symnum among the kprobe-owned symbol sources.
 *
 * The sources are tried in order: the instruction slot cache, the
 * optimized instruction slot cache (each only when configured), and last
 * the arch hook. Each source receives &symnum, so it may adjust the index
 * seen by the next one. Returns 0 as soon as one source succeeds,
 * otherwise -ERANGE.
 */
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym)
{
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
        if (kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym) == 0)
                return 0;
#ifdef CONFIG_OPTPROBES
        if (kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym) == 0)
                return 0;
#endif
#endif
        return arch_kprobe_get_kallsym(&symnum, value, type, sym) ? -ERANGE : 0;
}

/*
 * Weak default for the arch-specific blacklist hook: adds nothing and
 * reports success. An architecture may override it.
 */
int __init __weak arch_populate_kprobe_blacklist(void)
{
        return 0;
}

/*
 * Lookup and populate the kprobe_blacklist.
 *
 * Unlike the kretprobe blacklist, the whole address range of each listed
 * function has to be determined, because a kprobe does not have to sit at
 * the beginning of a function.
 *
 * @start/@end delimit an array of entry addresses. Entries rejected with
 * -EINVAL are skipped; any other error aborts. After that the
 * __kprobes_text and noinstr areas are blacklisted, and finally the arch
 * hook runs. Returns 0 on success or a negative error code.
 */
static int __init populate_kprobe_blacklist(unsigned long *start,
                                             unsigned long *end)
{
        unsigned long *slot = start;
        unsigned long addr;
        int err;

        while (slot < end) {
                addr = arch_deref_entry_point((void *)*slot);
                slot++;

                err = kprobe_add_ksym_blacklist(addr);
                if (err < 0 && err != -EINVAL)
                        return err;
        }

        /* Symbols in __kprobes_text are blacklisted */
        err = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
                                        (unsigned long)__kprobes_text_end);
        if (err)
                return err;

        /* Symbols in noinstr section are blacklisted */
        err = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
                                        (unsigned long)__noinstr_text_end);
        if (err)
                return err;

        return arch_populate_kprobe_blacklist();
}

/*
 * Blacklist everything @mod asks for: its explicit kprobe_blacklist
 * entries, its kprobes text area and its noinstr text area. An area is
 * skipped when its start address is 0. Errors from the add helpers are
 * not reported.
 */
static void add_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist) {
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_add_ksym_blacklist(mod->kprobe_blacklist[idx]);
        }

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->noinstr_text_size);
}

/*
 * Remove the blacklist entries for @mod: its explicit kprobe_blacklist
 * entries, its kprobes text area and its noinstr text area. An area is
 * skipped when its start address is 0.
 */
static void remove_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist) {
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[idx]);
        }

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->noinstr_text_size);
}

/* Module notifier call back, checking kprobes on the module */
static int kprobes_module_callback(struct notifier_block *nb,
                                   unsigned long val, void *data)
{
        struct module *mod = data;
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;
        int checkcore = (val == MODULE_STATE_GOING);

        if (val == MODULE_STATE_COMING) {
                mutex_lock(&kprobe_mutex);
                add_module_kprobe_blacklist(mod);
                mutex_unlock(&kprobe_mutex);
        }
        if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
                return NOTIFY_DONE;

        /*
         * When MODULE_STATE_GOING was notified, both of module .text and
         * .init.text sections would be freed. When MODULE_STATE_LIVE was
         * notified, only .init.text section would be freed. We need to
         * disable kprobes which have been inserted in the sections.
         */
        mutex_lock(&kprobe_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist) {
                        if (kprobe_gone(p))
                                continue;

                        if (within_module_init((unsigned long)p->addr, mod) ||
                            (checkcore &&
                             within_module_core((unsigned long)p->addr, mod))) {
                                /*
                                 * The vaddr this probe is installed will soon
                                 * be vfreed buy not synced to disk. Hence,
                                 * disarming the breakpoint isn't needed.
                                 *
                                 * Note, this will also move any optimized probes
                                 * that are pending to be removed from their
                                 * corresponding lists to the freeing_list and
                                 * will not be touched by the delayed
                                 * kprobe_optimizer work handler.
                                 */
                                kill_kprobe(p);
                        }
                }
        }
        if (val == MODULE_STATE_GOING)
                remove_module_kprobe_blacklist(mod);
        mutex_unlock(&kprobe_mutex);
        return NOTIFY_DONE;
}

/* Notifier block that routes module state changes to kprobes_module_callback(). */
static struct notifier_block kprobe_module_nb = {
        .notifier_call = kprobes_module_callback,
        .priority = 0
};

/* Markers of _kprobe_blacklist section */
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];

/*
 * Kill every registered kprobe whose address lies inside the init memory
 * range [__init_begin, __init_end). Takes kprobe_mutex for the walk.
 */
void kprobe_free_init_mem(void)
{
        void *init_lo = (void *)(&__init_begin);
        void *init_hi = (void *)(&__init_end);
        struct kprobe *p;
        void *where;
        int bucket;

        mutex_lock(&kprobe_mutex);

        /* Kill all kprobes on initmem */
        for (bucket = 0; bucket < KPROBE_TABLE_SIZE; bucket++) {
                hlist_for_each_entry(p, &kprobe_table[bucket], hlist) {
                        where = (void *)p->addr;
                        if (where < init_lo || where >= init_hi)
                                continue;
                        kill_kprobe(p);
                }
        }

        mutex_unlock(&kprobe_mutex);
}

/*
 * One-time kprobes setup, registered as an early initcall.
 *
 * Initializes the probe hash tables and the kretprobe table locks, fills
 * the kprobe blacklist, resolves the kretprobe blacklist names to
 * addresses, and then registers the arch hooks, the die notifier and the
 * module notifier. A failure to populate the blacklist is only reported;
 * it does not abort initialization.
 *
 * Returns 0 on success, or the first error from arch_init_kprobes() or
 * from the notifier registrations.
 */
static int __init init_kprobes(void)
{
        int i, err = 0;

        /* FIXME allocate the probe table, currently defined statically */
        /* initialize all list heads */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                INIT_HLIST_HEAD(&kprobe_table[i]);
                INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
                raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
        }

        err = populate_kprobe_blacklist(__start_kprobe_blacklist,
                                        __stop_kprobe_blacklist);
        if (err) {
                pr_err("kprobes: failed to populate blacklist: %d\n", err);
                pr_err("Please take care of using kprobes.\n");
        }

        if (kretprobe_blacklist_size) {
                /* lookup the function address from its name */
                for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
                        kretprobe_blacklist[i].addr =
                                kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
                        /*
                         * Report at an explicit log level, like the other
                         * failure messages in this function, instead of a
                         * bare printk() that falls back to the default level.
                         */
                        if (!kretprobe_blacklist[i].addr)
                                pr_err("kretprobe: lookup failed: %s\n",
                                       kretprobe_blacklist[i].name);
                }
        }

        /* By default, kprobes are armed */
        kprobes_all_disarmed = false;

#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
        /* Init kprobe_optinsn_slots for allocation */
        kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif

        /* Each step runs only if all the previous ones succeeded. */
        err = arch_init_kprobes();
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
        if (!err)
                err = register_module_notifier(&kprobe_module_nb);

        kprobes_initialized = (err == 0);

        if (!err)
                init_test_probes();
        return err;
}
early_initcall(init_kprobes);

#if defined(CONFIG_OPTPROBES)
/*
 * Enable kprobe optimization.
 *
 * This kicks the optimizer, which depends on synchronize_rcu_tasks() and
 * ksoftirqd; the latter is not spawned yet at early initcall time, so the
 * optimization is delayed to this later initcall. Always returns 0.
 */
static int __init init_optprobes(void)
{
        optimize_all_kprobes();
        return 0;
}
subsys_initcall(init_optprobes);
#endif

#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
                const char *sym, int offset, char *modname, struct kprobe *pp)
{
        char *kprobe_type;
        void *addr = p->addr;

        if (p->pre_handler == pre_handler_kretprobe)
                kprobe_type = "r";
        else
                kprobe_type = "k";

        if (!kallsyms_show_value(pi->file->f_cred))
                addr = NULL;

        if (sym)
                seq_printf(pi, "%px  %s  %s+0x%x  %s ",
                        addr, kprobe_type, sym, offset,
                        (modname ? modname : " "));
        else        /* try to use %pS */
                seq_printf(pi, "%px  %s  %pS ",
                        addr, kprobe_type, p->addr);

        if (!pp)
                pp = p;
        seq_printf(pi, "%s%s%s%s\n",
                (kprobe_gone(p) ? "[GONE]" : ""),
                ((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
                (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
                (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}

/*
 * seq_file start: the iterator is the position itself, used as a hash
 * bucket index. Returns NULL once *pos is past the last bucket.
 */
static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
        if (*pos >= KPROBE_TABLE_SIZE)
                return NULL;

        return pos;
}

/*
 * seq_file next: advance to the following hash bucket. Returns NULL once
 * the incremented *pos is past the last bucket.
 */
static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
        *pos += 1;

        return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}

/* seq_file stop: kprobe_seq_start() acquires nothing, so nothing is released. */
static void kprobe_seq_stop(struct seq_file *f, void *v)
{
        /* Nothing to do */
}

/*
 * seq_file show: report every probe hashed into the bucket selected by @v.
 *
 * For an aggregated probe each child on its list is reported with the
 * aggregator passed as the state source; a plain probe is reported alone.
 * The walk runs with preemption disabled. Always returns 0.
 */
static int show_kprobe_addr(struct seq_file *pi, void *v)
{
        const unsigned int bucket = *(loff_t *) v;
        char namebuf[KSYM_NAME_LEN], *modname;
        struct kprobe *p, *child;
        unsigned long offset = 0;
        const char *sym = NULL;

        preempt_disable();
        hlist_for_each_entry_rcu(p, &kprobe_table[bucket], hlist) {
                sym = kallsyms_lookup((unsigned long)p->addr, NULL,
                                        &offset, &modname, namebuf);
                if (!kprobe_aggrprobe(p)) {
                        report_probe(pi, p, sym, offset, modname, NULL);
                        continue;
                }
                list_for_each_entry_rcu(child, &p->list, list)
                        report_probe(pi, child, sym, offset, modname, p);
        }
        preempt_enable();

        return 0;
}

/* seq_file operations for the probe list: one iteration step per hash bucket. */
static const struct seq_operations kprobes_sops = {
        .start = kprobe_seq_start,
        .next  = kprobe_seq_next,
        .stop  = kprobe_seq_stop,
        .show  = show_kprobe_addr
};

DEFINE_SEQ_ATTRIBUTE(kprobes);

/* kprobes/blacklist -- shows which functions can not be probed */

/*
 * seq_file start: take kprobe_mutex (released in the matching stop
 * callback) and position the iterator at entry *pos of kprobe_blacklist.
 */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
        void *first;

        mutex_lock(&kprobe_mutex);
        first = seq_list_start(&kprobe_blacklist, *pos);

        return first;
}

/* seq_file next: step to the following kprobe_blacklist entry via seq_list_next(). */
static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        return seq_list_next(v, &kprobe_blacklist, pos);
}

/*
 * seq_file show: print one blacklist entry as "start-end<TAB>symbol".
 *
 * If /proc/kallsyms is not showing kernel addresses to this reader, both
 * range bounds are printed as NULL here as well; the symbol name is
 * printed either way. Always returns 0.
 */
static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
{
        struct kprobe_blacklist_entry *ent =
                list_entry(v, struct kprobe_blacklist_entry, list);
        void *lo = NULL;
        void *hi = NULL;

        if (kallsyms_show_value(m->file->f_cred)) {
                lo = (void *)ent->start_addr;
                hi = (void *)ent->end_addr;
        }

        seq_printf(m, "0x%px-0x%px\t%ps\n", lo, hi,
                   (void *)ent->start_addr);
        return 0;
}

/* seq_file stop: release the kprobe_mutex taken by kprobe_blacklist_seq_start(). */
static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
{
        mutex_unlock(&kprobe_mutex);
}

/* seq_file operations for the blacklist; start/stop bracket the walk with kprobe_mutex. */
static const struct seq_operations kprobe_blacklist_sops = {
        .start = kprobe_blacklist_seq_start,
        .next  = kprobe_blacklist_seq_next,
        .stop  = kprobe_blacklist_seq_stop,
        .show  = kprobe_blacklist_seq_show,
};
DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);

/*
 * Globally enable kprobes: clear kprobes_all_disarmed and arm every
 * registered probe that is not individually disabled.
 *
 * Arming is best-effort: a failure on one probe does not stop the walk.
 * Returns 0 if kprobes were already armed or every probe was armed,
 * otherwise the error from the last probe that failed to arm.
 */
static int arm_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i, total = 0, errors = 0;
        int err, ret = 0;

        mutex_lock(&kprobe_mutex);

        /* If kprobes are armed, just return */
        if (!kprobes_all_disarmed)
                goto already_enabled;

        /*
         * optimize_kprobe() called by arm_kprobe() checks
         * kprobes_all_disarmed, so set kprobes_all_disarmed before
         * arm_kprobe.
         */
        kprobes_all_disarmed = false;
        /* Arming kprobes doesn't optimize kprobe itself */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                /* Arm all kprobes on a best-effort basis */
                hlist_for_each_entry(p, head, hlist) {
                        if (!kprobe_disabled(p)) {
                                err = arm_kprobe(p);
                                if (err)  {
                                        errors++;
                                        ret = err;
                                }
                                total++;
                        }
                }
        }

        /* The counters are unsigned int, so print them with %u. */
        if (errors)
                pr_warn("Kprobes globally enabled, but failed to arm %u out of %u probes\n",
                        errors, total);
        else
                pr_info("Kprobes globally enabled\n");

already_enabled:
        mutex_unlock(&kprobe_mutex);
        return ret;
}

/*
 * Globally disable kprobes: set kprobes_all_disarmed and disarm every
 * registered probe that is neither an arch trampoline probe nor
 * individually disabled.
 *
 * Disarming is best-effort: a failure on one probe does not stop the
 * walk. After dropping kprobe_mutex the function waits for the kprobe
 * optimizer. Returns 0 if kprobes were already disarmed or every probe
 * was disarmed, otherwise the error from the last probe that failed.
 */
static int disarm_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i, total = 0, errors = 0;
        int err, ret = 0;

        mutex_lock(&kprobe_mutex);

        /* If kprobes are already disarmed, just return */
        if (kprobes_all_disarmed) {
                mutex_unlock(&kprobe_mutex);
                return 0;
        }

        kprobes_all_disarmed = true;

        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                /* Disarm all kprobes on a best-effort basis */
                hlist_for_each_entry(p, head, hlist) {
                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
                                err = disarm_kprobe(p, false);
                                if (err) {
                                        errors++;
                                        ret = err;
                                }
                                total++;
                        }
                }
        }

        /* The counters are unsigned int, so print them with %u. */
        if (errors)
                pr_warn("Kprobes globally disabled, but failed to disarm %u out of %u probes\n",
                        errors, total);
        else
                pr_info("Kprobes globally disabled\n");

        mutex_unlock(&kprobe_mutex);

        /* Wait for disarming all kprobes by optimizer */
        wait_for_kprobe_optimizer();

        return ret;
}

/*
 * XXX: The debugfs bool file interface doesn't allow for callbacks
 * when the bool state is switched. We can reuse that facility when
 * available
 */
/*
 * debugfs read handler for the "enabled" file: exposes the two bytes
 * "1\n" while kprobes are armed and "0\n" while they are globally
 * disarmed.
 */
static ssize_t read_enabled_file_bool(struct file *file,
               char __user *user_buf, size_t count, loff_t *ppos)
{
        char buf[3] = { kprobes_all_disarmed ? '0' : '1', '\n', 0x00 };

        return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}

/*
 * debugfs write handler for the "enabled" file.
 *
 * Only the first written byte is interpreted: 'y', 'Y' or '1' arms all
 * kprobes; 'n', 'N' or '0' disarms them. Returns @count on success,
 * -EFAULT when the user buffer cannot be copied, -EINVAL for any other
 * first byte, or the error from the arm/disarm helper.
 */
static ssize_t write_enabled_file_bool(struct file *file,
               const char __user *user_buf, size_t count, loff_t *ppos)
{
        char kbuf[32];
        size_t len = min(count, (sizeof(kbuf)-1));
        int err;

        if (copy_from_user(kbuf, user_buf, len))
                return -EFAULT;
        kbuf[len] = '\0';

        switch (kbuf[0]) {
        case '1':
        case 'y':
        case 'Y':
                err = arm_all_kprobes();
                break;
        case '0':
        case 'n':
        case 'N':
                err = disarm_all_kprobes();
                break;
        default:
                return -EINVAL;
        }

        if (err)
                return err;

        return count;
}

/* File operations for the debugfs "enabled" file (global arm/disarm switch). */
static const struct file_operations fops_kp = {
        .read =         read_enabled_file_bool,
        .write =        write_enabled_file_bool,
        .llseek =        default_llseek,
};

/*
 * Create the debugfs directory "kprobes" with three files: "list" (0400),
 * "enabled" (0600) and "blacklist" (0400). Return values of the debugfs
 * helpers are not checked. Always returns 0.
 */
static int __init debugfs_kprobe_init(void)
{
        struct dentry *kprobes_dir = debugfs_create_dir("kprobes", NULL);

        debugfs_create_file("list", 0400, kprobes_dir, NULL, &kprobes_fops);
        debugfs_create_file("enabled", 0600, kprobes_dir, NULL, &fops_kp);
        debugfs_create_file("blacklist", 0400, kprobes_dir, NULL,
                            &kprobe_blacklist_fops);

        return 0;
}

late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */

































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SMP_H
#define _ASM_X86_SMP_H
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
#include <asm/percpu.h>

#include <asm/thread_info.h>
#include <asm/cpumask.h>

extern int smp_num_siblings;
extern unsigned int num_processors;

DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);

/* Return @cpu's per-CPU cpu_llc_shared_map, the mask of CPUs sharing its last level cache. */
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
        return per_cpu(cpu_llc_shared_map, cpu);
}

DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
#endif

struct task_struct;

/*
 * Table of SMP operation callbacks. With CONFIG_SMP, the inline wrappers
 * in this header (smp_send_stop(), __cpu_up(), play_dead(), ...) dispatch
 * through the global smp_ops instance.
 */
struct smp_ops {
        /* Boot-time bring-up hooks. */
        void (*smp_prepare_boot_cpu)(void);
        void (*smp_prepare_cpus)(unsigned max_cpus);
        void (*smp_cpus_done)(unsigned max_cpus);

        /* Stopping other CPUs and sending reschedule requests. */
        void (*stop_other_cpus)(int wait);
        void (*crash_stop_other_cpus)(void);
        void (*smp_send_reschedule)(int cpu);

        /* Bringing CPUs up and taking them down. */
        int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
        int (*cpu_disable)(void);
        void (*cpu_die)(unsigned int cpu);
        void (*play_dead)(void);

        /* Function-call IPIs, to a mask of CPUs or to a single CPU. */
        void (*send_call_func_ipi)(const struct cpumask *mask);
        void (*send_call_func_single_ipi)(int cpu);
};

/* Globals due to paravirt */
extern void set_cpu_sibling_map(int cpu);

#ifdef CONFIG_SMP
extern struct smp_ops smp_ops;

/* Invoke smp_ops.stop_other_cpus() with wait == 0. */
static inline void smp_send_stop(void)
{
        smp_ops.stop_other_cpus(0);
}

/* Invoke smp_ops.stop_other_cpus() with wait == 1. */
static inline void stop_other_cpus(void)
{
        smp_ops.stop_other_cpus(1);
}

/* Dispatch to smp_ops.smp_prepare_boot_cpu(). */
static inline void smp_prepare_boot_cpu(void)
{
        smp_ops.smp_prepare_boot_cpu();
}

/* Dispatch to smp_ops.smp_prepare_cpus(), passing @max_cpus through. */
static inline void smp_prepare_cpus(unsigned int max_cpus)
{
        smp_ops.smp_prepare_cpus(max_cpus);
}

/* Dispatch to smp_ops.smp_cpus_done(), passing @max_cpus through. */
static inline void smp_cpus_done(unsigned int max_cpus)
{
        smp_ops.smp_cpus_done(max_cpus);
}

/* Dispatch to smp_ops.cpu_up() for @cpu with idle task @tidle; returns its result. */
static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
        return smp_ops.cpu_up(cpu, tidle);
}

/* Dispatch to smp_ops.cpu_disable(); returns its result. */
static inline int __cpu_disable(void)
{
        return smp_ops.cpu_disable();
}

/* Dispatch to smp_ops.cpu_die() for @cpu. */
static inline void __cpu_die(unsigned int cpu)
{
        smp_ops.cpu_die(cpu);
}

/* Dispatch to smp_ops.play_dead(). */
static inline void play_dead(void)
{
        smp_ops.play_dead();
}

/* Dispatch to smp_ops.smp_send_reschedule() for @cpu. */
static inline void smp_send_reschedule(int cpu)
{
        smp_ops.smp_send_reschedule(cpu);
}

/* Dispatch to smp_ops.send_call_func_single_ipi() for @cpu. */
static inline void arch_send_call_function_single_ipi(int cpu)
{
        smp_ops.send_call_func_single_ipi(cpu);
}

/* Dispatch to smp_ops.send_call_func_ipi() for the CPUs in @mask. */
static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
        smp_ops.send_call_func_ipi(mask);
}

void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus);
int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
void hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);

void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);

void smp_store_boot_cpu_info(void);
void smp_store_cpu_info(int id);

asmlinkage __visible void smp_reboot_interrupt(void);
__visible void smp_reschedule_interrupt(struct pt_regs *regs);
__visible void smp_call_function_interrupt(struct pt_regs *regs);
__visible void smp_call_function_single_interrupt(struct pt_regs *r);

#define cpu_physical_id(cpu)        per_cpu(x86_cpu_to_apicid, cpu)
#define cpu_acpi_id(cpu)        per_cpu(x86_cpu_to_acpiid, cpu)

/*
 * This function is needed by all SMP systems. It must _always_ be valid
 * from the initial startup. We map APIC_BASE very early in page_setup(),
 * so this is correct in the x86 case.
 */
#define raw_smp_processor_id()  this_cpu_read(cpu_number)
#define __smp_processor_id() __this_cpu_read(cpu_number)

#ifdef CONFIG_X86_32
extern int safe_smp_processor_id(void);
#else
# define safe_smp_processor_id()        smp_processor_id()
#endif

#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu)     wbinvd()
/* !CONFIG_SMP variant: run wbinvd() on the only CPU and return 0. */
static inline int wbinvd_on_all_cpus(void)
{
        wbinvd();
        return 0;
}
#endif /* CONFIG_SMP */

extern unsigned disabled_cpus;

#ifdef CONFIG_X86_LOCAL_APIC
extern int hard_smp_processor_id(void);

#else /* CONFIG_X86_LOCAL_APIC */
#define hard_smp_processor_id()        0
#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_DEBUG_NMI_SELFTEST
extern void nmi_selftest(void);
#else
#define nmi_selftest() do { } while (0)
#endif

#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H

#include <asm/cpufeatures.h>

#ifdef CONFIG_64BIT
#define REG_IN "D"
#define REG_OUT "a"
#else
#define REG_IN "a"
#define REG_OUT "a"
#endif

/*
 * Hamming weight (number of set bits) of the 32-bit value @w.
 *
 * The ALTERNATIVE() pairs a call to __sw_hweight32 with the "popcntl"
 * instruction, keyed on X86_FEATURE_POPCNT. The input is pinned to the
 * REG_IN register and the result to REG_OUT -- presumably the registers
 * __sw_hweight32 uses; confirm against its implementation.
 */
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
        unsigned int res;

        asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT)
                         : "="REG_OUT (res)
                         : REG_IN (w));

        return res;
}

/* Hamming weight of the low 16 bits of @w; the upper bits are masked off. */
static inline unsigned int __arch_hweight16(unsigned int w)
{
        return __arch_hweight32(w & 0xffff);
}

/* Hamming weight of the low 8 bits of @w; the upper bits are masked off. */
static inline unsigned int __arch_hweight8(unsigned int w)
{
        return __arch_hweight32(w & 0xff);
}

#ifdef CONFIG_X86_32
/*
 * 32-bit build: Hamming weight of the 64-bit value @w, computed as the sum
 * of the weights of its low and high 32-bit halves.
 */
static inline unsigned long __arch_hweight64(__u64 w)
{
        unsigned int lo_bits = __arch_hweight32((u32)w);
        unsigned int hi_bits = __arch_hweight32((u32)(w >> 32));

        return lo_bits + hi_bits;
}
#else
/*
 * 64-bit build: Hamming weight of the 64-bit value @w.
 *
 * The ALTERNATIVE() pairs a call to __sw_hweight64 with the "popcntq"
 * instruction, keyed on X86_FEATURE_POPCNT. The input is pinned to REG_IN
 * and the result to REG_OUT.
 */
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
        unsigned long res;

        asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT)
                         : "="REG_OUT (res)
                         : REG_IN (w));

        return res;
}
#endif /* CONFIG_X86_32 */

#endif


























   14 











































    2 








   12 





    2 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>

DECLARE_PER_CPU(int, __preempt_count);

/* We use the MSB mostly because it's available */
#define PREEMPT_NEED_RESCHED        0x80000000

/*
 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
 * that a decrement hitting 0 means we can and should reschedule.
 */
#define PREEMPT_ENABLED        (0 + PREEMPT_NEED_RESCHED)

/*
 * Return this CPU's preempt count with the PREEMPT_NEED_RESCHED bit
 * masked out, so that callers which treat any non-zero value as
 * "cannot preempt" are not confused by the folded-in flag.
 */
static __always_inline int preempt_count(void)
{
        int raw = raw_cpu_read_4(__preempt_count);

        return raw & ~PREEMPT_NEED_RESCHED;
}

/*
 * Set this CPU's preempt count to @pc while keeping the current state of
 * the PREEMPT_NEED_RESCHED bit. The update is retried with a cmpxchg
 * until the stored value did not change between the read and the write.
 */
static __always_inline void preempt_count_set(int pc)
{
        int seen, wanted;

        for (;;) {
                seen = raw_cpu_read_4(__preempt_count);
                wanted = (seen & PREEMPT_NEED_RESCHED) |
                        (pc & ~PREEMPT_NEED_RESCHED);
                if (raw_cpu_cmpxchg_4(__preempt_count, seen, wanted) == seen)
                        break;
        }
}

/*
 * must be macros to avoid header recursion hell
 */
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
        per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
 * We fold the NEED_RESCHED bit into the preempt count such that
 * preempt_enable() can decrement and test for needing to reschedule with a
 * single instruction.
 *
 * We invert the actual bit, so that when the decrement hits 0 we know we both
 * need to resched (the bit is cleared) and can resched (no preempt count).
 */

/*
 * Flag that a reschedule is needed. The bit is stored inverted, so this
 * clears PREEMPT_NEED_RESCHED in this CPU's __preempt_count.
 */
static __always_inline void set_preempt_need_resched(void)
{
        raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

/*
 * Flag that no reschedule is needed. The bit is stored inverted, so this
 * sets PREEMPT_NEED_RESCHED in this CPU's __preempt_count.
 */
static __always_inline void clear_preempt_need_resched(void)
{
        raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}

/*
 * Return true when a reschedule is flagged, i.e. when the inverted
 * PREEMPT_NEED_RESCHED bit is clear in this CPU's __preempt_count.
 */
static __always_inline bool test_preempt_need_resched(void)
{
        return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
}

/*
 * The various preempt_count add/sub methods
 */

/* Add @val to this CPU's __preempt_count. */
static __always_inline void __preempt_count_add(int val)
{
        raw_cpu_add_4(__preempt_count, val);
}

/* Subtract @val from this CPU's __preempt_count (implemented as adding -@val). */
static __always_inline void __preempt_count_sub(int val)
{
        raw_cpu_add_4(__preempt_count, -val);
}

/*
 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
 * a decrement which hits zero means we have no preempt_count and should
 * reschedule.
 *
 * Decrements this CPU's __preempt_count with a single "decl" and returns
 * the "e" (equal/zero) condition of that decrement.
 */
static __always_inline bool __preempt_count_dec_and_test(void)
{
        return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}

/*
 * Returns true when we need to resched and can (barring IRQ state).
 *
 * The raw __preempt_count, including the inverted PREEMPT_NEED_RESCHED
 * bit, is compared with @preempt_offset; the two are only equal when the
 * bit is clear (resched needed) and the count matches the offset.
 */
static __always_inline bool should_resched(int preempt_offset)
{
        return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}

#ifdef CONFIG_PREEMPTION
  extern asmlinkage void preempt_schedule_thunk(void);
# define __preempt_schedule() \
        asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule(void);
  extern asmlinkage void preempt_schedule_notrace_thunk(void);
# define __preempt_schedule_notrace() \
        asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule_notrace(void);
#endif

#endif /* __ASM_PREEMPT_H */






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * This file is part of the Linux kernel.
 *
 * Copyright (c) 2011-2014, Intel Corporation
 * Authors: Fenghua Yu <fenghua.yu@intel.com>,
 *          H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef ASM_X86_ARCHRANDOM_H
#define ASM_X86_ARCHRANDOM_H

#include <asm/processor.h>
#include <asm/cpufeature.h>

#define RDRAND_RETRY_LOOPS        10

/* Unconditional execution of RDRAND and RDSEED */

/*
 * Execute RDRAND into *v, retrying until the instruction reports success or
 * the RDRAND_RETRY_LOOPS attempts are used up.  No CPU feature check is done
 * here.
 *
 * Returns true as soon as one attempt succeeds, false if every attempt failed.
 */
static inline bool __must_check rdrand_long(unsigned long *v)
{
        unsigned int attempts_left;
        bool ok;

        for (attempts_left = RDRAND_RETRY_LOOPS; attempts_left; attempts_left--) {
                /* "ok" receives the "c" condition produced by the instruction */
                asm volatile("rdrand %[out]"
                             CC_SET(c)
                             : CC_OUT(c) (ok), [out] "=r" (*v));
                if (ok)
                        return true;
        }

        return false;
}

/*
 * Execute RDRAND into the unsigned int at *v, retrying until the instruction
 * reports success or the RDRAND_RETRY_LOOPS attempts are used up.  No CPU
 * feature check is done here.
 *
 * Returns true as soon as one attempt succeeds, false if every attempt failed.
 */
static inline bool __must_check rdrand_int(unsigned int *v)
{
        unsigned int attempts_left;
        bool ok;

        for (attempts_left = RDRAND_RETRY_LOOPS; attempts_left; attempts_left--) {
                /* "ok" receives the "c" condition produced by the instruction */
                asm volatile("rdrand %[out]"
                             CC_SET(c)
                             : CC_OUT(c) (ok), [out] "=r" (*v));
                if (ok)
                        return true;
        }

        return false;
}

/*
 * Execute RDSEED once into *v.  There is no retry loop and no CPU feature
 * check here.
 *
 * Returns the "c" condition captured from the instruction via
 * CC_SET(c)/CC_OUT(c).
 * NOTE(review): "c" is presumably the carry flag, which RDSEED sets on
 * success - confirm against the instruction reference.
 */
static inline bool __must_check rdseed_long(unsigned long *v)
{
        bool ok;
        asm volatile("rdseed %[out]"
                     CC_SET(c)
                     : CC_OUT(c) (ok), [out] "=r" (*v));
        return ok;
}

/*
 * Execute RDSEED once into the unsigned int at *v.  There is no retry loop
 * and no CPU feature check here.
 *
 * Returns the "c" condition captured from the instruction via
 * CC_SET(c)/CC_OUT(c).
 * NOTE(review): "c" is presumably the carry flag, which RDSEED sets on
 * success - confirm against the instruction reference.
 */
static inline bool __must_check rdseed_int(unsigned int *v)
{
        bool ok;
        asm volatile("rdseed %[out]"
                     CC_SET(c)
                     : CC_OUT(c) (ok), [out] "=r" (*v));
        return ok;
}

/*
 * These are the generic interfaces; they must not be declared if the
 * stubs in <linux/random.h> are to be invoked,
 * i.e. CONFIG_ARCH_RANDOM is not defined.
 */
#ifdef CONFIG_ARCH_RANDOM

/*
 * Generic arch hook: obtain one random unsigned long via RDRAND.
 * Returns false, leaving *v untouched, when the CPU does not advertise
 * X86_FEATURE_RDRAND; otherwise returns the result of rdrand_long().
 */
static inline bool __must_check arch_get_random_long(unsigned long *v)
{
        if (!static_cpu_has(X86_FEATURE_RDRAND))
                return false;

        return rdrand_long(v);
}

/*
 * Generic arch hook: obtain one random unsigned int via RDRAND.
 * Returns false, leaving *v untouched, when the CPU does not advertise
 * X86_FEATURE_RDRAND; otherwise returns the result of rdrand_int().
 */
static inline bool __must_check arch_get_random_int(unsigned int *v)
{
        if (!static_cpu_has(X86_FEATURE_RDRAND))
                return false;

        return rdrand_int(v);
}

/*
 * Generic arch hook: obtain one seed-grade unsigned long via RDSEED.
 * Returns false, leaving *v untouched, when the CPU does not advertise
 * X86_FEATURE_RDSEED; otherwise returns the result of rdseed_long().
 */
static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
{
        if (!static_cpu_has(X86_FEATURE_RDSEED))
                return false;

        return rdseed_long(v);
}

/*
 * Generic arch hook: obtain one seed-grade unsigned int via RDSEED.
 * Returns false, leaving *v untouched, when the CPU does not advertise
 * X86_FEATURE_RDSEED; otherwise returns the result of rdseed_int().
 */
static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
{
        if (!static_cpu_has(X86_FEATURE_RDSEED))
                return false;

        return rdseed_int(v);
}

extern void x86_init_rdrand(struct cpuinfo_x86 *c);

#else  /* !CONFIG_ARCH_RANDOM */

/* Empty stub so callers need no #ifdef when CONFIG_ARCH_RANDOM is not set. */
static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { }

#endif  /* !CONFIG_ARCH_RANDOM */

#endif /* ASM_X86_ARCHRANDOM_H */
































































































































    3 
































    2 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>
#include <uapi/linux/wait.h>

/* Short alias for struct wait_queue_entry, which is defined further down. */
typedef struct wait_queue_entry wait_queue_entry_t;

/*
 * Type of the callback stored in wait_queue_entry::func.  It is handed the
 * entry itself together with the mode, flags and key arguments.
 * NOTE(review): the meaning of the int return value is defined by the wake-up
 * code, which is not part of this header - confirm there.
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
/* Callback installed by init_waitqueue_entry() and __WAITQUEUE_INITIALIZER(). */
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_BOOKMARK        0x04
#define WQ_FLAG_CUSTOM                0x08
#define WQ_FLAG_DONE                0x10

/*
 * A single wait-queue entry structure:
 */
struct wait_queue_entry {
        /* bitmask of WQ_FLAG_* values */
        unsigned int                flags;
        /*
         * opaque pointer; init_waitqueue_entry() and DECLARE_WAITQUEUE()
         * store the waiting task here, init_waitqueue_func_entry() stores NULL
         */
        void                        *private;
        /* callback of type wait_queue_func_t attached to this entry */
        wait_queue_func_t        func;
        /* list node linked into wait_queue_head::head */
        struct list_head        entry;
};

/*
 * Head of a wait queue: a spinlock plus the list that wait_queue_entry::entry
 * nodes are linked into.
 */
struct wait_queue_head {
        /* NOTE(review): presumably serialises changes to @head - confirm in the wait-queue implementation */
        spinlock_t                lock;
        /* list of queued wait_queue_entry structures */
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a wait_queue_entry: @tsk goes into ->private, the
 * callback is default_wake_function and the list node starts out as
 * { NULL, NULL }.  @name is not used by the expansion.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a wait_queue_entry variable @name initialised for task @tsk. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/*
 * Static initializer for a wait_queue_head: unlocked spinlock and an empty
 * list (both list pointers refer back to the head itself).
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = { &(name).head, &(name).head } }

/* Define a statically initialised wait_queue_head variable @name. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Run-time initialisation of a wait_queue_head.  Each expansion declares its
 * own static struct lock_class_key and passes it, together with the
 * stringified @wq_head expression as the name, to __init_waitqueue_head().
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * Declare a wait_queue_head as a local (on-stack) variable.
 *
 * With CONFIG_LOCKDEP the variable is initialised by running
 * init_waitqueue_head() on it inside a statement expression, so it goes
 * through the lock_class_key registration done by that macro.  Without
 * lockdep this is the same as DECLARE_WAIT_QUEUE_HEAD().
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Set up @wq_entry on behalf of task @p: default_wake_function as the
 * callback, @p as the private pointer and no flags.  The list node
 * (->entry) is left as it was.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func                = default_wake_function;
        wq_entry->private        = p;
        wq_entry->flags                = 0;
}

/*
 * Set up @wq_entry with a caller-supplied callback @func: no flags and a
 * NULL private pointer.  The list node (->entry) is left as it was.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func                = func;
        wq_entry->private        = NULL;
        wq_entry->flags                = 0;
}

/**
 * waitqueue_active -- locklessly test for waiters on the queue
 * @wq_head: the waitqueue to test for waiters
 *
 * returns true if the wait list is not empty
 *
 * NOTE: this function is lockless and requires care, incorrect usage _will_
 * lead to sporadic and non-obvious failure.
 *
 * Use either while holding wait_queue_head::lock or when used for wakeups
 * with an extra smp_mb() like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Because without the explicit smp_mb() it's possible for the
 * waitqueue_active() load to get hoisted over the @cond store such that we'll
 * observe an empty wait list while the waiter might not observe @cond.
 *
 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
 * which (when the lock is uncontended) are of roughly equal cost.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        /* lockless peek at the list; 0 means no waiter is queued */
        if (list_empty(&wq_head->head))
                return 0;

        return 1;
}

/**
 * wq_has_single_sleeper - check if there is only one sleeper
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has only one sleeper on the list.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        return list_is_singular(&wq_head->head);
}

/**
 * wq_has_sleeper - check if there are any waiting processes
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has waiting processes
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        bool has_waiters;

        /*
         * Full barrier before looking at the list, so that this check is
         * ordered against the add_wait_queue modifications to the wait
         * queue.  It must be paired with a barrier on the waiting side.
         */
        smp_mb();
        has_waiters = waitqueue_active(wq_head);

        return has_waiters;
}

extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

/*
 * Link @wq_entry into @wq_head's list with list_add().  No locking is done
 * here.
 * NOTE(review): callers presumably hold wq_head->lock - confirm at call sites.
 */
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_add(&wq_entry->entry, &wq_head->head);
}

/*
 * Used for wake-one threads:
 *
 * Sets WQ_FLAG_EXCLUSIVE on @wq_entry, then queues it via __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq_head, wq_entry);
}

/*
 * Link @wq_entry into @wq_head's list with list_add_tail().  No locking is
 * done here.
 * NOTE(review): callers presumably hold wq_head->lock - confirm at call sites.
 */
static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_add_tail(&wq_entry->entry, &wq_head->head);
}

/*
 * Sets WQ_FLAG_EXCLUSIVE on @wq_entry, then queues it via
 * __add_wait_queue_entry_tail().
 */
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/*
 * Unlink @wq_entry from whatever list it is on using list_del().  @wq_head is
 * not used by the body, and no locking is done here.
 */
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_del(&wq_entry->entry);
}

void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
                unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * Convenience wrappers around the __wake_up*() functions; @x is the
 * struct wait_queue_head pointer.
 *
 * The first group passes TASK_NORMAL as the mode, the second group
 * TASK_INTERRUPTIBLE.  The count handed to __wake_up()/__wake_up_locked() is
 * 1 for the plain variants, the caller's @nr for the _nr variants and 0 for
 * the _all variants.
 * NOTE(review): 0 presumably means "no limit on exclusive waiters" - confirm
 * in the __wake_up() implementation.
 */
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                        __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)
#define wake_up_sync(x)                        __wake_up_sync(x, TASK_NORMAL)

#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)        __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() converts a __poll_t event mask @m into the void * key that is
 * passed to the __wake_up*() functions (going through uintptr_t);
 * key_to_poll() performs the reverse conversion.  The wake_up_*poll()
 * wrappers below are wake-ups on queue @x that pass poll_to_key(@m) as the
 * key.
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m)                                                        \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * In the very rare cases where a ->poll() implementation uses a waitqueue whose
 * lifetime is tied to a task rather than to the 'struct file' being polled,
 * this function must be called before the waitqueue is freed so that
 * non-blocking polls (e.g. epoll) are notified that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is deliberately not always taken here, for
         * performance.  That means this check can race with someone removing
         * the last entry from the queue, and we may carry on while they
         * still hold the queue lock.  rcu_read_lock() is required to be held
         * in such cases, though, so an RCU-delayed free lets us proceed
         * safely.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Condition wrapper for the timeout variants.  It relies on a variable named
 * __ret being in scope where the macro is expanded.
 *
 * @condition is evaluated exactly once.  If it is true while __ret is 0, __ret
 * is set to 1.  The value of the expression is true when @condition is true
 * or when __ret is 0.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
        if (__cond && !__ret)                                                        \
                __ret = 1;                                                        \
        __cond || !__ret;                                                        \
})

/*
 * ___wait_is_interruptible - does a wait in @state need the interrupted check?
 *
 * True when @state is not a compile-time constant, or when it equals
 * TASK_INTERRUPTIBLE or TASK_KILLABLE.  For any other constant state the
 * expression is a constant false.
 *
 * @state is parenthesised so that an argument containing a low-precedence
 * operator (for example a conditional expression) is compared as a whole.
 */
#define ___wait_is_interruptible(state)                                                \
        (!__builtin_constant_p(state) ||                                        \
                (state) == TASK_INTERRUPTIBLE || (state) == TASK_KILLABLE)

extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 *
 * Arguments:
 *   wq_head   - wait queue head (the macro takes its address itself)
 *   condition - expression tested after every prepare_to_wait_event()
 *   state     - task state passed to prepare_to_wait_event()
 *   exclusive - non-zero to initialise the entry with WQ_FLAG_EXCLUSIVE
 *   ret       - initial value of the local __ret
 *   cmd       - statement(s) run on each iteration where the wait continues
 *
 * The loop ends normally when @condition is true; finish_wait() is then
 * called and the expression evaluates to __ret.  If the state is
 * interruptible and prepare_to_wait_event() returned non-zero, that value
 * becomes the result and control jumps straight to __out, skipping
 * finish_wait().
 * NOTE(review): presumably prepare_to_wait_event() has already dequeued the
 * entry in that case - confirm in its implementation.
 */

#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Slow path of wait_event(): a non-exclusive ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE that calls schedule() between condition checks.  The
 * result is discarded.
 */
#define __wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            schedule())

/**
 * wait_event - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * @condition is expanded more than once (a first check before queueing, then
 * again inside the wait loop), so it may be evaluated several times.
 */
#define wait_event(wq_head, condition)                                                \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event(wq_head, condition);                                        \
} while (0)

/*
 * Slow path of io_wait_event(): same loop as __wait_event(), except that
 * io_schedule() is called between condition checks instead of schedule().
 */
#define __io_wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            io_schedule())

/*
 * io_wait_event() -- like wait_event() but with io_schedule()
 *
 * @condition is tested once up front; the wait loop is entered only when
 * that first test is false.
 */
#define io_wait_event(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        __io_wait_event(wq_head, condition);                                        \
} while (0)

/*
 * Slow path of wait_event_freezable(): a non-exclusive ___wait_event() loop
 * in TASK_INTERRUPTIBLE that calls freezable_schedule() between checks.
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                            freezable_schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns 0 once @condition is true, or the non-zero value handed back by
 * prepare_to_wait_event() if the wait was cut short.
 */
#define wait_event_freezable(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_timeout().  The ___wait_event() local __ret starts
 * at @timeout and is replaced by the return value of schedule_timeout() on
 * every iteration; the condition is wrapped in ___wait_cond_timeout() so the
 * loop also ends once __ret reaches 0.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_freezable_timeout(): a TASK_INTERRUPTIBLE loop
 * whose local __ret starts at @timeout and is replaced by the return value of
 * freezable_schedule_timeout() on every iteration.
 */
#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = freezable_schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 *
 * Because the state is TASK_INTERRUPTIBLE, the result can also be the
 * non-zero value handed back by prepare_to_wait_event() when the wait is cut
 * short.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_exclusive_cmd(): an exclusive, TASK_UNINTERRUPTIBLE
 * ___wait_event() loop that runs @cmd1, schedule() and then @cmd2 on each
 * iteration.  The result is discarded.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0,        \
                            cmd1; schedule(); cmd2)
/*
 * Just like wait_event_cmd(), except it sets exclusive flag
 *
 * NOTE(review): unlike wait_event(), there is no might_sleep() annotation
 * here - confirm whether that is intentional.
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2);                \
} while (0)

/*
 * Slow path of wait_event_cmd(): a non-exclusive, TASK_UNINTERRUPTIBLE
 * ___wait_event() loop that runs @cmd1, schedule() and then @cmd2 on each
 * iteration.  The result is discarded.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: the command will be executed before sleep
 * @cmd2: the command will be executed after sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Neither @cmd1 nor @cmd2 runs if @condition is already true on entry.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_cmd(wq_head, condition, cmd1, cmd2);                        \
} while (0)

/*
 * Slow path of wait_event_interruptible(): a non-exclusive ___wait_event()
 * loop in TASK_INTERRUPTIBLE that calls schedule() between condition checks.
 * Its value is the ___wait_event() result.
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_timeout(): a TASK_INTERRUPTIBLE
 * wait that sleeps with schedule_timeout().
 *
 * The sleep command reads and writes a variable called __ret that is
 * not declared here; it has to exist in the scope ___wait_event()
 * expands the command in.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout) \
        ___wait_event(wq_head, \
                      ___wait_cond_timeout(condition), \
                      TASK_INTERRUPTIBLE, 0, timeout, \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * re-checked each time the waitqueue @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout) \
({ \
        /* NOTE(review): ___wait_cond_timeout() is expected to use this */ \
        /* variable by name - do not rename it. */ \
        long __ret = timeout; \
        might_sleep(); \
        if (!___wait_cond_timeout(condition)) { \
                __ret = __wait_event_interruptible_timeout(wq_head, \
                                                condition, timeout); \
        } \
        __ret; \
})

/*
 * Common body of the *_hrtimeout() waiters: wait in @state for
 * @condition, bounded by an on-stack hrtimer sleeper (CLOCK_MONOTONIC,
 * relative mode).
 *
 * The timer is only armed when @timeout is not KTIME_MAX.  Before each
 * schedule() the sleep command checks __t.task; once that is NULL the
 * wait is abandoned with -ETIME (presumably the sleeper's expiry path
 * clears the field - confirm against the hrtimer code).  After the wait
 * the timer is cancelled and the on-stack object destroyed.
 *
 * NOTE(review): the __ret assigned inside the sleep command resolves to
 * whatever __ret is in scope where ___wait_event() expands the command,
 * which need not be the int declared below; the value reaches this
 * macro's result through the assignment from ___wait_event().
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                      HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * Puts the calling process to sleep in TASK_UNINTERRUPTIBLE state until
 * @condition evaluates to true or @timeout elapses.  @condition is
 * tested once up front and again each time the waitqueue @wq_head is
 * woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Return: 0 if @condition became true, or -ETIME if the timeout elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout) \
({ \
        int __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_hrtimeout(wq_head, condition, timeout, \
                                       TASK_UNINTERRUPTIBLE); \
        __ret; \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true, a signal is received or @timeout
 * elapses.  @condition is tested once up front and again each time the
 * waitqueue @wq is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Return: 0 if @condition became true, -ERESTARTSYS if the wait was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
({ \
        long __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_hrtimeout(wq, condition, timeout, \
                                       TASK_INTERRUPTIBLE); \
        __ret; \
})

/*
 * Slow path of wait_event_interruptible_exclusive(): a
 * TASK_INTERRUPTIBLE wait through ___wait_event() with the exclusive
 * argument set to 1 and schedule() as the sleep command.
 */
#define __wait_event_interruptible_exclusive(wq, condition) \
        ___wait_event(wq, condition, \
                      TASK_INTERRUPTIBLE, 1, 0, \
                      schedule())

/*
 * wait_event_interruptible_exclusive - interruptible wait as an
 * exclusive waiter.
 *
 * Tests @condition once after might_sleep(); only when it is false is
 * the wait handed to __wait_event_interruptible_exclusive().
 *
 * Evaluates to 0 if @condition was already true, otherwise to the
 * result of the slow path.
 */
#define wait_event_interruptible_exclusive(wq, condition) \
({ \
        int __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_interruptible_exclusive(wq, condition); \
        __ret; \
})

/*
 * Slow path of wait_event_killable_exclusive(): a TASK_KILLABLE wait
 * through ___wait_event() with the exclusive argument set to 1 and
 * schedule() as the sleep command.
 */
#define __wait_event_killable_exclusive(wq, condition) \
        ___wait_event(wq, condition, \
                      TASK_KILLABLE, 1, 0, \
                      schedule())

/*
 * wait_event_killable_exclusive - killable wait as an exclusive waiter.
 *
 * Tests @condition once after might_sleep(); only when it is false is
 * the wait handed to __wait_event_killable_exclusive().
 *
 * Evaluates to 0 if @condition was already true, otherwise to the
 * result of the slow path.
 */
#define wait_event_killable_exclusive(wq, condition) \
({ \
        int __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_killable_exclusive(wq, condition); \
        __ret; \
})


/*
 * Slow path of wait_event_freezable_exclusive(): a TASK_INTERRUPTIBLE
 * wait through ___wait_event() with the exclusive argument set to 1,
 * sleeping via freezable_schedule() instead of schedule().
 */
#define __wait_event_freezable_exclusive(wq, condition) \
        ___wait_event(wq, condition, \
                      TASK_INTERRUPTIBLE, 1, 0, \
                      freezable_schedule())

/*
 * wait_event_freezable_exclusive - freezable, interruptible wait as an
 * exclusive waiter.
 *
 * Tests @condition once after might_sleep(); only when it is false is
 * the wait handed to __wait_event_freezable_exclusive().
 *
 * Evaluates to 0 if @condition was already true, otherwise to the
 * result of the slow path.
 */
#define wait_event_freezable_exclusive(wq, condition) \
({ \
        int __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_freezable_exclusive(wq, condition); \
        __ret; \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_IDLE state until @condition
 * evaluates to true.  @condition is tested once up front and again each
 * time the waitqueue @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 */
#define wait_event_idle(wq_head, condition) \
do { \
        might_sleep(); \
        if (condition) \
                break; \
        (void)___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \
                            schedule()); \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_IDLE state until @condition
 * evaluates to true.  @condition is tested once up front and again each
 * time the waitqueue @wq_head is woken up.
 *
 * The process is queued with the WQ_FLAG_EXCLUSIVE flag set, so if
 * other processes wait on the same list, further processes are not
 * considered once this one has been woken.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 */
#define wait_event_idle_exclusive(wq_head, condition) \
do { \
        might_sleep(); \
        if (condition) \
                break; \
        (void)___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \
                            schedule()); \
} while (0)

/*
 * Slow path of wait_event_idle_timeout(): a TASK_IDLE wait that sleeps
 * with schedule_timeout().
 *
 * The sleep command reads and writes a variable called __ret that is
 * not declared here; it has to exist in the scope ___wait_event()
 * expands the command in.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout) \
        ___wait_event(wq_head, \
                      ___wait_cond_timeout(condition), \
                      TASK_IDLE, 0, timeout, \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * Puts the calling process to sleep in TASK_IDLE state until @condition
 * evaluates to true.  @condition is re-checked each time the waitqueue
 * @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout) \
({ \
        /* NOTE(review): ___wait_cond_timeout() is expected to use this */ \
        /* variable by name - do not rename it. */ \
        long __ret = timeout; \
        might_sleep(); \
        if (!___wait_cond_timeout(condition)) { \
                __ret = __wait_event_idle_timeout(wq_head, condition, \
                                                  timeout); \
        } \
        __ret; \
})

/*
 * Slow path of wait_event_idle_exclusive_timeout(): a TASK_IDLE wait
 * with the exclusive argument set to 1 that sleeps with
 * schedule_timeout().
 *
 * The sleep command reads and writes a variable called __ret that is
 * not declared here; it has to exist in the scope ___wait_event()
 * expands the command in.
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
        ___wait_event(wq_head, \
                      ___wait_cond_timeout(condition), \
                      TASK_IDLE, 1, timeout, \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * Puts the calling process to sleep in TASK_IDLE state until @condition
 * evaluates to true.  @condition is re-checked each time the waitqueue
 * @wq_head is woken up.
 *
 * The process is queued with the WQ_FLAG_EXCLUSIVE flag set, so if
 * other processes wait on the same list, further processes are not
 * considered once this one has been woken.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
({ \
        /* NOTE(review): ___wait_cond_timeout() is expected to use this */ \
        /* variable by name - do not rename it. */ \
        long __ret = timeout; \
        might_sleep(); \
        if (!___wait_cond_timeout(condition)) { \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, \
                                                condition, timeout); \
        } \
        __ret; \
})

/*
 * Sleep helpers passed to __wait_event_interruptible_locked() as its
 * @fn argument; each receives the waitqueue head and the caller's wait
 * entry.  Per the kerneldoc of the macros that pass them, the plain
 * variant pairs with spin_lock()/spin_unlock() and the _irq variant
 * with spin_lock_irq()/spin_unlock_irq().
 */
int do_wait_intr(wait_queue_head_t *wq_head, wait_queue_entry_t *wq_entry);
int do_wait_intr_irq(wait_queue_head_t *wq_head, wait_queue_entry_t *wq_entry);

/*
 * Shared body of the wait_event_interruptible*_locked*() macros.
 *
 * Declares an on-stack wait entry with DEFINE_WAIT(), marks it
 * WQ_FLAG_EXCLUSIVE when @exclusive is non-zero, and then keeps calling
 * @fn(&wq, &entry) until either @fn returns non-zero or @condition is
 * true.  Afterwards the entry is removed from @wq and the task state is
 * set back to TASK_RUNNING.
 *
 * Evaluates to the last value returned by @fn: non-zero when @fn ended
 * the wait, 0 when @condition became true.
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn)                \
({                                                                                \
        int __ret;                                                                \
        DEFINE_WAIT(__wait);                                                        \
        if (exclusive)                                                                \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                                \
        do {                                                                        \
                __ret = fn(&(wq), &__wait);                                        \
                if (__ret)                                                        \
                        break;                                                        \
        } while (!(condition));                                                        \
        __remove_wait_queue(&(wq), &__wait);                                        \
        __set_current_state(TASK_RUNNING);                                        \
        __ret;                                                                        \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * re-checked each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is released while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock()/spin_unlock(), which
 * must match the way it is locked/unlocked outside of this macro.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up_locked() afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_locked(wq, condition) \
        (!(condition) \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr) \
         : 0)

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * re-checked each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is released while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock_irq()/spin_unlock_irq(),
 * which must match the way it is locked/unlocked outside of this macro.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up_locked() afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_locked_irq(wq, condition) \
        (!(condition) \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq) \
         : 0)

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * re-checked each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is released while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock()/spin_unlock(), which
 * must match the way it is locked/unlocked outside of this macro.
 *
 * The process is queued with the WQ_FLAG_EXCLUSIVE flag set, so if
 * other processes wait on the same list, further processes are not
 * considered once this one has been woken.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up_locked() afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition) \
        (!(condition) \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr) \
         : 0)

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * re-checked each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is released while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock_irq()/spin_unlock_irq(),
 * which must match the way it is locked/unlocked outside of this macro.
 *
 * The process is queued with the WQ_FLAG_EXCLUSIVE flag set, so if
 * other processes wait on the same list, further processes are not
 * considered once this one has been woken.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up_locked() afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
        (!(condition) \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq) \
         : 0)


/*
 * Slow path of wait_event_killable(): a TASK_KILLABLE wait through
 * ___wait_event() with schedule() as the sleep command.
 */
#define __wait_event_killable(wq, condition) \
        ___wait_event(wq, condition, \
                      TASK_KILLABLE, 0, 0, \
                      schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * Puts the calling process to sleep in TASK_KILLABLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * tested once up front and again each time the waitqueue @wq_head is
 * woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_killable(wq_head, condition) \
({ \
        int __ret; \
        might_sleep(); \
        __ret = (condition) ? 0 : \
                __wait_event_killable(wq_head, condition); \
        __ret; \
})

/*
 * Slow path of wait_event_killable_timeout(): a TASK_KILLABLE wait that
 * sleeps with schedule_timeout().
 *
 * The sleep command reads and writes a variable called __ret that is
 * not declared here; it has to exist in the scope ___wait_event()
 * expands the command in.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout) \
        ___wait_event(wq_head, \
                      ___wait_cond_timeout(condition), \
                      TASK_KILLABLE, 0, timeout, \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * Puts the calling process to sleep in TASK_KILLABLE state until
 * @condition evaluates to true or a kill signal is received.
 * @condition is re-checked each time the waitqueue @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout) \
({ \
        /* NOTE(review): ___wait_cond_timeout() is expected to use this */ \
        /* variable by name - do not rename it. */ \
        long __ret = timeout; \
        might_sleep(); \
        if (!___wait_cond_timeout(condition)) { \
                __ret = __wait_event_killable_timeout(wq_head, \
                                                condition, timeout); \
        } \
        __ret; \
})


/*
 * Slow path shared by wait_event_lock_irq() and wait_event_lock_irq_cmd():
 * a TASK_UNINTERRUPTIBLE wait whose sleep command drops @lock with
 * spin_unlock_irq(), runs @cmd, calls schedule() and then retakes @lock
 * with spin_lock_irq().  The result of ___wait_event() is discarded.
 *
 * @lock is pasted directly after '&', so callers pass the spinlock
 * object itself rather than a pointer to it.  @cmd may be empty.
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * Puts the calling process to sleep in TASK_UNINTERRUPTIBLE state until
 * @condition evaluates to true.  @condition is tested once up front and
 * again each time the waitqueue @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * The caller is expected to hold @lock.  It is dropped before @cmd is
 * invoked and the process goes to sleep, and is reacquired afterwards.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \
do { \
        /* Fast path: nothing to do when @condition already holds. */ \
        if (!(condition)) { \
                __wait_event_lock_irq(wq_head, condition, lock, cmd); \
        } \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * Puts the calling process to sleep in TASK_UNINTERRUPTIBLE state until
 * @condition evaluates to true.  @condition is tested once up front and
 * again each time the waitqueue @wq_head is woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * The caller is expected to hold @lock.  It is dropped before the
 * process goes to sleep and is reacquired afterwards.
 */
#define wait_event_lock_irq(wq_head, condition, lock) \
do { \
        /* Same slow path as the _cmd variant, with an empty command. */ \
        if (!(condition)) { \
                __wait_event_lock_irq(wq_head, condition, lock, ); \
        } \
} while (0)


/*
 * Slow path shared by wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd(): a TASK_INTERRUPTIBLE wait
 * whose sleep command drops @lock with spin_unlock_irq(), runs @cmd,
 * calls schedule() and then retakes @lock with spin_lock_irq().
 * Evaluates to whatever ___wait_event() yields.
 *
 * @lock is pasted directly after '&', so callers pass the spinlock
 * object itself rather than a pointer to it.  @cmd may be empty.
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * tested once up front and again each time the waitqueue @wq_head is
 * woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * The caller is expected to hold @lock.  It is dropped before @cmd is
 * invoked and the process goes to sleep, and is reacquired afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \
({ \
        int __ret = (condition) ? 0 : \
                __wait_event_interruptible_lock_irq(wq_head, \
                                                condition, lock, cmd); \
        __ret; \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * Puts the calling process to sleep in TASK_INTERRUPTIBLE state until
 * @condition evaluates to true or a signal is received.  @condition is
 * tested once up front and again each time the waitqueue @wq_head is
 * woken up.
 *
 * Whoever changes a variable that can affect the result of @condition
 * has to call wake_up() afterwards.
 *
 * The caller is expected to hold @lock.  It is dropped before the
 * process goes to sleep and is reacquired afterwards.
 *
 * Return: 0 if @condition evaluated to true, -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock) \
({ \
        int __ret = (condition) ? 0 : \
                __wait_event_interruptible_lock_irq(wq_head, \
                                                condition, lock,); \
        __ret; \
})

/*
 * Slow path shared by wait_event_interruptible_lock_irq_timeout() and
 * wait_event_lock_irq_timeout().
 *
 * Hands ___wait_event() a three-step command: release @lock with
 * spin_unlock_irq(), sleep in schedule_timeout(), then retake @lock with
 * spin_lock_irq().  @lock is used as "&lock", so callers pass the spinlock
 * object itself, not a pointer to it.  @state is the task state to sleep in
 * and @timeout is in jiffies.
 *
 * The command reads and writes a variable named __ret that this macro does
 * not declare.
 * NOTE(review): presumably that is the __ret declared inside ___wait_event()
 * (defined outside this view), holding the remaining timeout -- confirm.
 *
 * NOTE(review): the definition ends with its own ';', so a caller's
 * trailing ';' yields an extra empty statement.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock));

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true, a signal is received, or @timeout
 * elapses. The @condition is checked each time the waitqueue @wq_head
 * is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The result variable starts out as @timeout and is only replaced when the
 * initial ___wait_cond_timeout() check fails and the slow path runs.
 * NOTE(review): ___wait_cond_timeout() is defined outside this view; it
 * appears to consult the local named __ret, so that name must not be
 * changed -- confirm against its definition.
 *
 * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies otherwise
 * if the condition evaluated to true before the timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * wait_event_lock_irq_timeout - like
 * wait_event_interruptible_lock_irq_timeout(), but the slow path sleeps in
 * TASK_UNINTERRUPTIBLE instead of TASK_INTERRUPTIBLE.
 *
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: the spinlock handed to __wait_event_lock_irq_timeout(), which
 *        unlocks it around the sleep and relocks it afterwards
 * @timeout: timeout, in jiffies
 *
 * Evaluates to a long that starts out as @timeout and is replaced by the
 * slow path's result when the initial ___wait_cond_timeout() check fails.
 * NOTE(review): the return values presumably match the interruptible
 * variant minus the -ERESTARTSYS case (0 on timeout, remaining jiffies
 * otherwise) -- confirm against ___wait_event().
 */
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 *
 * Declarations only; the definitions live outside this header.
 *
 * prepare_to_wait*() and finish_wait() take the queue head plus the
 * caller's entry; the prepare_to_wait*() variants also take the task state
 * to sleep in.
 * NOTE(review): the meaning of the bool/long return values of
 * prepare_to_wait_exclusive()/prepare_to_wait_event() is not visible here
 * -- check the definitions before relying on them.
 *
 * woken_wake_function() and autoremove_wake_function() share the wake
 * callback signature stored in wait_queue_entry.func;
 * autoremove_wake_function() is the callback installed by DEFINE_WAIT()
 * and init_wait().
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * DEFINE_WAIT_FUNC - define a wait queue entry variable called @name
 *
 * The entry belongs to the current task (.private = current), uses
 * @function as its wake callback, and starts with an empty, self-linked
 * list node.  Fields not named in the initializer are zero-initialized by
 * the usual C rules.  The expansion has no trailing ';' -- the user
 * supplies it.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* DEFINE_WAIT - DEFINE_WAIT_FUNC() with autoremove_wake_function() as the wake callback. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * init_wait - set up the wait queue entry that @wait points to
 *
 * Run-time counterpart of DEFINE_WAIT(): clears the flags, installs
 * autoremove_wake_function() as the wake callback, records the current
 * task as the owner and resets the list node to an empty list.
 *
 * @wait is expanded several times, so pass an expression without side
 * effects.
 */
#define init_wait(wait)                                                                \
        do {                                                                        \
                (wait)->flags = 0;                                                \
                (wait)->func = autoremove_wake_function;                        \
                (wait)->private = current;                                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
        } while (0)

/*
 * Declaration only; defined outside this header.  Takes a task @p, a
 * callback @func that receives a task pointer and @arg, and the opaque
 * @arg itself.
 * NOTE(review): judging by the name, @func is presumably run on @p only
 * while @p is held in a stable ("locked down") state, with the bool result
 * reporting whether that happened -- confirm against the definition.
 */
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg);

#endif /* _LINUX_WAIT_H */








































































































































































    5 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
    2 















    2 
    2 
















    2 
























































    5 




    5 





    5 
    5 









    5 
























































    5 



































































































































































































































    2 
















































    2 




    2 






























    5 
    5 

    5 



    5 
























































































































































    1 








    1 

































































































































































































































    1 
































    1 































    1 







































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred       Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

/*
 * Format wrapper that prepends "<KBUILD_MODNAME>: " to a format string;
 * defined ahead of the includes so the pr_*() helpers used in this file
 * (e.g. pr_debug() in mpol_new()) pick it up.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags, built on top of MPOL_MF_INTERNAL */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)                /* Invert check for nodemask */

/* Slab cache that mpol_new() allocates struct mempolicy from and __mpol_put() frees to. */
static struct kmem_cache *policy_cache;
/* NOTE(review): second slab cache; its users are outside this view. */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 *
 * Statically allocated with a reference count of 1; get_task_policy()
 * returns it when neither the task nor the per-node table provides a
 * policy.
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_PREFERRED,
        .flags = MPOL_F_LOCAL,
};

/*
 * One policy per node id, indexed by node.  get_task_policy() uses an
 * entry only once its mode is non-zero; until then it falls back to
 * default_policy.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - map a node id onto an online node
 * @node: node id to map
 *
 * Return: @node unchanged when it is NUMA_NO_NODE or already online.
 * Otherwise the online node with the smallest node_distance() from @node;
 * on equal distances the node visited first by for_each_online_node()
 * wins.  If the walk visits no node at all, @node is returned as-is.
 */
int numa_map_to_online_node(int node)
{
        int best_node = node;
        int best_dist = INT_MAX;
        int candidate;

        /* Nothing to map: no node requested, or the node is usable as-is. */
        if (node == NUMA_NO_NODE)
                return node;
        if (node_online(node))
                return node;

        for_each_online_node(candidate) {
                int d = node_distance(node, candidate);

                /* Strictly closer only, so earlier candidates win ties. */
                if (d >= best_dist)
                        continue;

                best_dist = d;
                best_node = candidate;
        }

        return best_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

/*
 * get_task_policy - pick the memory policy that applies to task @p
 *
 * Preference order:
 *   1. the task's own policy, if it has one;
 *   2. the preferred_node_policy[] entry for the node numa_node_id()
 *      reports, provided that entry has been set up;
 *   3. the system-wide default_policy.
 *
 * No reference is taken on the returned policy.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *task_pol = p->mempolicy;
        struct mempolicy *node_pol;
        int nid;

        if (task_pol)
                return task_pol;

        nid = numa_node_id();
        if (nid == NUMA_NO_NODE)
                return &default_policy;

        /*
         * preferred_node_policy is not initialised early in boot; a zero
         * mode marks an entry that cannot be used yet.
         */
        node_pol = &preferred_node_policy[nid];
        if (!node_pol->mode)
                return &default_policy;

        return node_pol;
}

/*
 * Per-mode operations table, indexed by policy mode (see the
 * mpol_ops[pol->mode].create() calls in mpol_set_nodemask()).  This is
 * only the declaration; NOTE(review): the initialised definition is
 * presumably further down the file, outside this view.
 *
 * @create: set up @pol from @nodes, returning 0 or a negative errno.
 *          mpol_set_nodemask() passes NULL for @nodes to request explicit
 *          local allocation.  The mpol_new_*() helpers have this signature.
 * @rebind: NOTE(review): callers are not visible here; presumably updates
 *          @pol for a changed set of allowed nodes -- confirm.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Return the MPOL_MODE_FLAGS bits that are set in @pol->flags.  The value
 * is the raw masked flags, not a normalised 0/1; mpol_set_nodemask() treats
 * non-zero as "remember the user's nodemask in the policy".
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        const int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Compute @ret from the user mask @orig relative to the node set @rel:
 * nodes_fold() first reduces @orig to as many positions as @rel has nodes
 * set, then nodes_onto() maps that folded mask onto @rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t folded;

        nodes_fold(folded, *orig, nodes_weight(*rel));
        nodes_onto(*ret, folded, *rel);
}

/*
 * Set up an interleave policy: copy @nodes into @pol->v.nodes.
 * Returns 0 on success, or -EINVAL (leaving @pol untouched) when @nodes is
 * empty.
 */
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                pol->v.nodes = *nodes;
                return 0;
        }

        return -EINVAL;
}

/*
 * Set up a preferred-node policy.
 *
 *   @nodes == NULL : explicit local allocation, flagged with MPOL_F_LOCAL
 *   @nodes empty   : -EINVAL, there is no node to prefer
 *   otherwise      : prefer the first node set in @nodes
 *
 * Returns 0 unless stated otherwise above.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes) {
                /* local allocation */
                pol->flags |= MPOL_F_LOCAL;
                return 0;
        }

        /* no allowed nodes */
        if (nodes_empty(*nodes))
                return -EINVAL;

        pol->v.preferred_node = first_node(*nodes);
        return 0;
}

/*
 * Set up a bind policy: copy @nodes into @pol->v.nodes.
 * Returns 0 on success, or -EINVAL (leaving @pol untouched) when @nodes is
 * empty.
 */
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
        const int err = nodes_empty(*nodes) ? -EINVAL : 0;

        if (!err)
                pol->v.nodes = *nodes;

        return err;
}

/*
 * mpol_set_nodemask - second construction step, run after mpol_new()
 *
 * Installs the nodemask, if any, into the freshly created policy @pol.
 * mpol_new() has already checked @nodes against the mode and flags; the
 * one case left for this function is MPOL_PREFERRED with an empty mask,
 * which means explicit local allocation and is passed to the mode's
 * create() hook as a NULL mask.
 *
 * @nsc provides scratch masks: mask1 receives the current cpuset's allowed
 * nodes restricted to node_states[N_MEMORY], mask2 the effective mask that
 * is handed to create().
 *
 * Returns 0 for a NULL @pol (MPOL_DEFAULT), otherwise whatever the mode's
 * create() hook returns.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        /* MPOL_DEFAULT is represented by a NULL policy: nothing to set up. */
        if (!pol)
                return 0;

        /* Check N_MEMORY */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        /* explicit local allocation */
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
                return mpol_ops[pol->mode].create(pol, NULL);

        /* Effective mask: relative remap, or plain intersection. */
        if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
        else
                nodes_and(nsc->mask2, *nodes, nsc->mask1);

        /* Record either the user's mask or the cpuset it was built against. */
        if (mpol_store_user_nodemask(pol))
                pol->w.user_nodemask = *nodes;
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

        return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        const bool has_node_flags =
                flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
        struct mempolicy *policy;

        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

        /* MPOL_DEFAULT has no policy object; a non-empty nodemask is an error. */
        if (mode == MPOL_DEFAULT)
                return (nodes && !nodes_empty(*nodes)) ?
                        ERR_PTR(-EINVAL) : NULL;

        VM_BUG_ON(!nodes);

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * An empty nodemask means local allocation, which cannot be
                 * combined with MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.
                 */
                if (nodes_empty(*nodes) && has_node_flags)
                        return ERR_PTR(-EINVAL);
                break;
        case MPOL_LOCAL:
                /* MPOL_LOCAL takes neither nodes nor node flags ... */
                if (!nodes_empty(*nodes) || has_node_flags)
                        return ERR_PTR(-EINVAL);
                /* ... and is stored as MPOL_PREFERRED from here on. */
                mode = MPOL_PREFERRED;
                break;
        default:
                /* Every other mode requires a non-empty nodemask. */
                if (nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
                break;
        }

        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);

        /* The caller owns the single initial reference. */
        atomic_set(&policy->refcnt, 1);
        policy->mode = mode;
        policy->flags = flags;

        return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
        /* Only the caller that drops the last reference frees the object. */
        if (atomic_dec_and_test(&p->refcnt))
                kmem_cache_free(policy_cache, p);
}

/* Rebind callback for MPOL_DEFAULT in mpol_ops[]: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                /* Static: keep the user's nodes that are in the new mask. */
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                /* Relative: recompute the user's mask against the new mask. */
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                /* Neither flag: remap from the old cpuset mask to the new one. */
                nodes_remap(rebound, pol->v.nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        /* Never leave the policy with an empty node set. */
        if (nodes_empty(rebound))
                rebound = *nodes;

        pol->v.nodes = rebound;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
                                                const nodemask_t *nodes)
{
        if (pol->flags & MPOL_F_STATIC_NODES) {
                int node = first_node(pol->w.user_nodemask);

                /* Fall back to local allocation while the node is not allowed. */
                if (!node_isset(node, *nodes)) {
                        pol->flags |= MPOL_F_LOCAL;
                        return;
                }

                pol->v.preferred_node = node;
                pol->flags &= ~MPOL_F_LOCAL;
                return;
        }

        if (pol->flags & MPOL_F_RELATIVE_NODES) {
                nodemask_t relative;

                mpol_relative_nodemask(&relative, &pol->w.user_nodemask, nodes);
                pol->v.preferred_node = first_node(relative);
                return;
        }

        /* Local allocation has no preferred node to remap. */
        if (pol->flags & MPOL_F_LOCAL)
                return;

        pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                           pol->w.cpuset_mems_allowed,
                                           *nodes);
        pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        if (!pol || pol->mode == MPOL_LOCAL)
                return;
        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                return;

        mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        struct mempolicy *task_pol = tsk->mempolicy;

        /* mpol_rebind_policy() copes with a NULL policy itself. */
        mpol_rebind_policy(task_pol, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        struct vm_area_struct *vma;

        mmap_write_lock(mm);

        /* Walk the vma list and rebind whatever policy each vma carries. */
        vma = mm->mmap;
        while (vma) {
                mpol_rebind_policy(vma->vm_policy, new);
                vma = vma->vm_next;
        }

        mmap_write_unlock(mm);
}

/*
 * Per-mode operations, indexed by policy mode.
 *
 * .create is invoked from mpol_set_nodemask() to install the nodemask of a
 * new policy; .rebind is invoked from mpol_rebind_policy() when the set of
 * allowed nodes changes.
 *
 * MPOL_DEFAULT has no .create: mpol_new() returns NULL for that mode and
 * mpol_set_nodemask() returns early for a NULL policy.  No entry is listed
 * for MPOL_LOCAL: mpol_new() stores such policies as MPOL_PREFERRED and
 * mpol_rebind_policy() returns before indexing this table for MPOL_LOCAL.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};

/*
 * Forward declaration: the queue_pages_*() walkers below call this before
 * its definition, which appears later in the file in two variants selected
 * by CONFIG_MIGRATION.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);

/* Walk state handed to the queue_pages_*() callbacks through walk->private. */
struct queue_pages {
        struct list_head *pagelist;     /* list that isolated pages are added to */
        unsigned long flags;            /* MPOL_MF_* flags controlling the walk */
        nodemask_t *nmask;              /* nodes tested by queue_pages_required() */
        unsigned long start;            /* start of the range; used for hole checks */
        unsigned long end;              /* end of the range; used for hole checks */
        struct vm_area_struct *first;   /* first vma seen; NULL until test_walk runs */
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
                                        struct queue_pages *qp)
{
        /* Without MPOL_MF_INVERT we look for pages whose node IS in nmask. */
        const bool want_in_mask = !(qp->flags & MPOL_MF_INVERT);

        return node_isset(page_to_nid(page), *qp->nmask) == want_in_mask;
}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
        __releases(ptl)
{
        struct queue_pages *qp = walk->private;
        struct page *page;
        unsigned long flags;
        int ret = 0;

        /* A pmd migration entry is reported as -EIO. */
        if (unlikely(is_pmd_migration_entry(*pmd))) {
                spin_unlock(ptl);
                return -EIO;
        }

        page = pmd_page(*pmd);
        if (is_huge_zero_page(page)) {
                /* Drop the lock, split, and tell the caller to walk the ptes. */
                spin_unlock(ptl);
                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
                return 2;
        }

        if (queue_pages_required(page, qp)) {
                flags = qp->flags;
                if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))) {
                        /* No move requested: the qualifying page is an error. */
                        ret = -EIO;
                } else if (!vma_migratable(walk->vma) ||
                           migrate_page_add(page, qp->pagelist, flags)) {
                        /* Move requested but the page could not be queued. */
                        ret = 1;
                }
        }

        spin_unlock(ptl);
        return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        int ret;
        bool has_unmovable = false;
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;

        /*
         * If pmd_trans_huge_lock() hands back a lock, queue_pages_pmd()
         * handles the entry and releases that lock on every path.  Only its
         * return value 2 continues to the pte walk below.
         */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
                if (ret != 2)
                        return ret;
        }
        /* THP was split, fall through to pte walk */

        if (pmd_trans_unstable(pmd))
                return 0;

        /*
         * Keep the originally mapped pte pointer: pte is advanced by the
         * loop, and the unmap/unlock below is given mapped_pte.
         */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                /*
                 * vm_normal_page() filters out zero pages, but there might
                 * still be PageReserved pages to skip, perhaps in a VDSO.
                 */
                if (PageReserved(page))
                        continue;
                if (!queue_pages_required(page, qp))
                        continue;
                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                        /* MPOL_MF_STRICT must be specified if we get here */
                        if (!vma_migratable(vma)) {
                                has_unmovable = true;
                                break;
                        }

                        /*
                         * Do not abort immediately since there may be
                         * temporary off LRU pages in the range.  We still
                         * need to migrate the other LRU pages.
                         */
                        if (migrate_page_add(page, qp->pagelist, flags))
                                has_unmovable = true;
                } else
                        /*
                         * No move flag: stop at the first qualifying page so
                         * that addr != end below turns into -EIO.
                         */
                        break;
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();

        /* has_unmovable takes precedence over an early exit from the loop. */
        if (has_unmovable)
                return 1;

        return addr != end ? -EIO : 0;
}

/*
 * .hugetlb_entry callback of queue_pages_walk_ops.
 *
 * Returns:
 * 0 - entry not present, page does not qualify, or page handled.
 * 1 - a qualifying page could not be moved (non-migratable vma, or
 *     isolation failed with MPOL_MF_STRICT set).
 * -EIO - MPOL_MF_STRICT is the only MPOL_MF_VALID flag set and a qualifying
 *        page was found.
 *
 * Without CONFIG_HUGETLB_PAGE the body is just BUG().
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
        int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        /* Only MPOL_MF_VALID bits take part in the flag tests below. */
        unsigned long flags = (qp->flags & MPOL_MF_VALID);
        struct page *page;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
        if (!pte_present(entry))
                goto unlock;
        page = pte_page(entry);
        if (!queue_pages_required(page, qp))
                goto unlock;

        if (flags == MPOL_MF_STRICT) {
                /*
                 * STRICT alone means only detecting misplaced page and no
                 * need to further check other vma.
                 */
                ret = -EIO;
                goto unlock;
        }

        if (!vma_migratable(walk->vma)) {
                /*
                 * Must be STRICT with MOVE*, otherwise .test_walk() have
                 * stopped walking current vma.
                 * Detecting misplaced page but allow migrating pages which
                 * have been queued.
                 */
                ret = 1;
                goto unlock;
        }

        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
        if (flags & (MPOL_MF_MOVE_ALL) ||
            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
             !hugetlb_pmd_shared(pte))) {
                if (isolate_hugetlb(page, qp->pagelist) &&
                        (flags & MPOL_MF_STRICT))
                        /*
                         * Failed to isolate page but allow migrating pages
                         * which have been queued.
                         */
                        ret = 1;
        }
unlock:
        spin_unlock(ptl);
#else
        BUG();
#endif
        return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark a range of virtual addresses inaccessible so that a later NUMA
 * hinting fault clears the marking again; depending on those faults, pages
 * may be migrated for better NUMA placement.
 *
 * This assumes NUMA faults are implemented with PROT_NONE.  An architecture
 * that chooses differently needs further changes to the core.
 *
 * Returns the value reported by change_protection().
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        int nr_updated = change_protection(vma, addr, end, PAGE_NONE,
                                           MM_CP_PROT_NUMA);

        /* Only account the event when something was actually updated. */
        if (nr_updated != 0)
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

        return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING nothing is changed and 0 is returned. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long endvma = vma->vm_end;
        unsigned long flags = qp->flags;

        /* range check first */
        VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);

        if (!qp->first) {
                qp->first = vma;
                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                        (qp->start < vma->vm_start))
                        /* hole at head side of range */
                        return -EFAULT;
        }
        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                ((vma->vm_end < qp->end) &&
                (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
                /* hole at middle or tail of range */
                return -EFAULT;

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) &&
            !(flags & MPOL_MF_STRICT))
                return 1;

        if (endvma > end)
                endvma = end;

        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
                if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
                        !(vma->vm_flags & VM_MIXEDMAP))
                        change_prot_numa(vma, start, endvma);
                return 1;
        }

        /* queue pages from current vma */
        if (flags & MPOL_MF_VALID)
                return 0;
        return 1;
}

/* Page-walk callbacks that queue_pages_range() passes to walk_page_range(). */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .hugetlb_entry                = queue_pages_hugetlb,
        .pmd_entry                = queue_pages_pte_range,
        .test_walk                = queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        struct queue_pages qp = {
                .pagelist = pagelist,
                .flags = flags,
                .nmask = nodes,
                .start = start,
                .end = end,
                .first = NULL,
        };
        int err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

        /* No vma was ever visited: the whole range lies in a hole. */
        return qp.first ? err : -EFAULT;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                                struct mempolicy *pol)
{
        struct mempolicy *new, *old;
        int err;

        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        /* The vma gets its own copy of the policy. */
        new = mpol_dup(pol);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /* Give the vma's own set_policy hook, if any, a chance to refuse. */
        if (vma->vm_ops && vma->vm_ops->set_policy) {
                err = vma->vm_ops->set_policy(vma, new);
                if (err) {
                        mpol_put(new);
                        return err;
                }
        }

        /* Swap in the new policy (protected by mmap_lock), then drop the old. */
        old = vma->vm_policy;
        vma->vm_policy = new;
        mpol_put(old);

        return 0;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
                       unsigned long end, struct mempolicy *new_pol)
{
        struct vm_area_struct *prev;
        struct vm_area_struct *vma;
        int err = 0;
        pgoff_t pgoff;
        unsigned long vmstart;
        unsigned long vmend;

        vma = find_vma(mm, start);
        VM_BUG_ON(!vma);

        /*
         * prev is the merge predecessor handed to vma_merge(): normally the
         * vma before the first one, but the first vma itself when the range
         * begins inside it.
         */
        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
                /* Part of this vma that falls inside [start, end). */
                vmstart = max(start, vma->vm_start);
                vmend   = min(end, vma->vm_end);

                /* Nothing to do if the vma already has an equal policy. */
                if (mpol_equal(vma_policy(vma), new_pol))
                        continue;

                /* Page offset that corresponds to vmstart within this vma. */
                pgoff = vma->vm_pgoff +
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                 vma->anon_vma, vma->vm_file, pgoff,
                                 new_pol, vma->vm_userfaultfd_ctx);
                if (prev) {
                        /* Merged: continue with the vma that vma_merge() returned. */
                        vma = prev;
                        goto replace;
                }
                /*
                 * No merge: split at vmstart and/or vmend.  Each split is
                 * skipped when the vma boundary already matches.
                 */
                if (vma->vm_start != vmstart) {
                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
                        if (err)
                                goto out;
                }
                if (vma->vm_end != vmend) {
                        err = split_vma(vma->vm_mm, vma, vmend, 0);
                        if (err)
                                goto out;
                }
 replace:
                err = vma_replace_policy(vma, new_pol);
                if (err)
                        goto out;
        }

 out:
        return err;
}

/*
 * Set the process memory policy.
 *
 * @mode:  MPOL_* mode of the new policy
 * @flags: mode flags stored in the new policy
 * @nodes: nodemask supplied by the caller
 *
 * Builds a new policy with mpol_new(), sets up its nodemask, and installs
 * it as current->mempolicy, dropping the reference on the previous one.
 *
 * Returns 0 on success, -ENOMEM if the scratch nodemasks cannot be
 * allocated, or the error from mpol_new()/mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *new, *old;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new)) {
                ret = PTR_ERR(new);
                goto out;
        }

        /*
         * mpol_set_nodemask() reads cpuset_current_mems_allowed and must be
         * called with the task's alloc_lock held.  Take the lock before
         * building the nodemask and keep it until the policy is installed,
         * so both happen against the same mems_allowed.
         */
        task_lock(current);
        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                mpol_put(new);
                goto out;
        }

        old = current->mempolicy;
        current->mempolicy = new;
        /* Reset the interleave cursor for a fresh MPOL_INTERLEAVE policy. */
        if (new && new->mode == MPOL_INTERLEAVE)
                current->il_prev = MAX_NUMNODES-1;
        task_unlock(current);
        mpol_put(old);
        ret = 0;
out:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
        /* Start from an empty mask; the default policy reports exactly that. */
        nodes_clear(*nodes);
        if (p == &default_policy)
                return;

        if (p->mode == MPOL_BIND || p->mode == MPOL_INTERLEAVE) {
                *nodes = p->v.nodes;
        } else if (p->mode == MPOL_PREFERRED) {
                /* Local allocation keeps the mask empty. */
                if (!(p->flags & MPOL_F_LOCAL))
                        node_set(p->v.preferred_node, *nodes);
        } else {
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p = NULL;
        int err;

        int locked = 1;
        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
        if (err > 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        if (locked)
                mmap_read_unlock(mm);
        return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        /* Reject any flag other than the three handled below. */
        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        /* MPOL_F_MEMS_ALLOWED: report the allowed nodes, nothing else. */
        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                /* An address without MPOL_F_ADDR is invalid. */
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, lookup_node()
                         * will drop the mmap_lock, so after calling
                         * lookup_node() only "pol" remains valid, "vma"
                         * is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        /* Report the node that follows current->il_prev. */
                        *policy = next_node_in(current->il_prev, pol->v.nodes);
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        mpol_cond_put(pol);
        /*
         * vma is still non-NULL only when mmap_lock was taken above and
         * lookup_node() was not called, so the lock is ours to release.
         */
        if (vma)
                mmap_read_unlock(mm);
        /* Drop the extra reference taken before lookup_node(). */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        struct page *head = compound_head(page);

        /*
         * Without MPOL_MF_MOVE_ALL, a page that is shared with others is
         * left alone.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && page_mapcount(head) != 1)
                return 0;

        if (isolate_lru_page(head)) {
                /*
                 * Non-movable page may reach here.  And, there may be
                 * temporary off LRU pages or non-LRU movable pages.
                 * Treat them as unmovable pages since they can't be
                 * isolated, so they can't be moved at the moment.  With
                 * MPOL_MF_STRICT this is reported as -EIO.
                 */
                return (flags & MPOL_MF_STRICT) ? -EIO : 0;
        }

        /* Isolated: queue the head page and account it as isolated. */
        list_add_tail(&head->lru, pagelist);
        mod_node_page_state(page_pgdat(head),
                NR_ISOLATED_ANON + page_is_file_lru(head),
                thp_nr_pages(head));

        return 0;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
                           int flags)
{
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
        };
        LIST_HEAD(pagelist);
        nodemask_t nmask;
        int err;

        /* Build a mask that contains only the source node. */
        nodes_clear(nmask);
        node_set(source, nmask);

        /*
         * This does not "check" the range but isolates all pages that
         * need migration.  Between passing in the full user address
         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
         */
        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);

        /* Nothing was queued, so nothing to migrate. */
        if (list_empty(&pagelist))
                return 0;

        err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
        /* On any non-zero result, put the pages left on the list back. */
        if (err)
                putback_movable_pages(&pagelist);

        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved, or a negative
 * errno on failure.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        int busy = 0;   /* running total of positive migrate_to_node() results */
        int err;
        nodemask_t tmp; /* source nodes that still have to be processed */

        err = migrate_prep();
        if (err)
                return err;

        mmap_read_lock(mm);

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and return that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fallback to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', return false
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory from that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning 'tmp', we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s,d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* The scan found no node that moves: we are done. */
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* Positive results accumulate; a negative one aborts the loop. */
                if (err > 0)
                        busy += err;
                if (err < 0)
                        break;
        }
        mmap_read_unlock(mm);
        if (err < 0)
                return err;
        return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
        struct vm_area_struct *vma;
        /*
         * Begin with the "not mapped" marker so that @address holds a defined
         * value even when find_vma() returns NULL and the walk below never
         * runs.  This is the same value the walk leaves behind when it runs
         * off the end of the VMA list without finding a mapping.
         */
        unsigned long address = -EFAULT;

        /* Look for the first VMA, from @start onwards, that maps @page. */
        for (vma = find_vma(current->mm, start); vma; vma = vma->vm_next) {
                address = page_address_in_vma(page, vma);
                if (address != -EFAULT)
                        break;
        }

        /* hugetlb pages are replaced from the hstate of their head page */
        if (PageHuge(page))
                return alloc_huge_page_vma(page_hstate(compound_head(page)),
                                vma, address);

        if (PageTransHuge(page)) {
                struct page *thp;

                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
                                         HPAGE_PMD_ORDER);
                if (!thp)
                        return NULL;
                prep_transhuge_page(thp);
                return thp;
        }

        /*
         * if !vma, alloc_page_vma() will use task or system default policy
         */
        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
                        vma, address);
}
#else

/*
 * Stub for builds where the #if branch above is compiled out: no page is
 * ever queued, every call reports -EIO.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        return -EIO;
}

/*
 * Stub for builds where the #if branch above is compiled out: the request
 * is always rejected with -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/*
 * Stub for builds where the #if branch above is compiled out: never
 * provides a replacement page.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
        return NULL;
}
#endif

/*
 * Apply a memory policy to the address range [start, start + len) of the
 * current task's mm.
 *
 * @start:      start of the range; must be page aligned
 * @len:        length in bytes; rounded up to a whole number of pages
 * @mode:       policy mode handed to mpol_new()
 * @mode_flags: policy mode flags handed to mpol_new()
 * @nmask:      node mask for the new policy
 * @flags:      MPOL_MF_* flags controlling strictness and page movement
 *
 * Returns 0 on success (including an empty range) or a negative errno:
 * -EINVAL for bad flags, a misaligned @start or a wrapping range, -EPERM
 * when MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE, -ENOMEM when the
 * scratch nodemask cannot be allocated, -EIO when queue_pages_range()
 * reported a positive count or when MPOL_MF_STRICT is set and pages failed
 * to migrate, or whatever error mpol_new(), migrate_prep(),
 * mpol_set_nodemask(), queue_pages_range() or mbind_range() returned.
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        int err;
        int ret;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * @new is NULL for the default policy (see the check right below),
         * so it must not be dereferenced unconditionally here.
         */
        if (new && (flags & MPOL_MF_LAZY))
                new->flags |= MPOL_F_MOF;

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
                 start, start + len, mode, mode_flags,
                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

                err = migrate_prep();
                if (err)
                        goto mpol_out;
        }
        {
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        /*
                         * On success the write lock stays held; it is
                         * released at the end of the function.
                         */
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        /* Collect the pages in the range onto @pagelist. */
        ret = queue_pages_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);

        if (ret < 0) {
                err = ret;
                goto up_out;
        }

        err = mbind_range(mm, start, end, new);

        if (!err) {
                int nr_failed = 0;

                if (!list_empty(&pagelist)) {
                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
                        /* Give back whatever could not be migrated. */
                        if (nr_failed)
                                putback_movable_pages(&pagelist);
                }

                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
                        err = -EIO;
        } else {
up_out:
                if (!list_empty(&pagelist))
                        putback_movable_pages(&pagelist);
        }

        mmap_write_unlock(mm);
mpol_out:
        mpol_put(new);
        return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long t;
        unsigned long nlongs;
        unsigned long endmask;

        /*
         * Only maxnode - 1 bits of the user mask are consumed.  @nodes is
         * always cleared first, so a NULL @nmask or a zero bit count yields
         * an empty mask and success.
         */
        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
        /* Refuse masks larger than one page worth of bits. */
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /*
         * nlongs: number of user longs covering maxnode bits.
         * endmask: the bits of the last such long that lie below maxnode.
         */
        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /*
         * When the user specified more nodes than supported, just check
         * that the unsupported part is all zero.
         *
         * If maxnode spans more longs than MAX_NUMNODES does, check the
         * bits in those extra longs first, then go on to check the
         * remaining bits at or above MAX_NUMNODES.
         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
         */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        /* Only the bits below maxnode count in the last long. */
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                /* From here on only the longs that fit in a nodemask matter. */
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        /*
         * MAX_NUMNODES does not end on a long boundary: the last long that
         * will be copied also holds bits at or above MAX_NUMNODES, and those
         * must be zero too.
         */
        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
                unsigned long valid_mask = endmask;

                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                if (get_user(t, nmask + nlongs - 1))
                        return -EFAULT;
                if (t & valid_mask)
                        return -EINVAL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        /* Drop any bits in the last copied long that lie beyond maxnode. */
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        /* Bytes the caller asked for, rounded up to a multiple of 64 bits. */
        unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
        /* Bytes needed to hold nr_node_ids bits. */
        unsigned int kernel_bytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (user_bytes > kernel_bytes) {
                if (user_bytes > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the part of the user buffer beyond the kernel mask. */
                if (clear_user((char __user *)mask + kernel_bytes,
                               user_bytes - kernel_bytes))
                        return -EFAULT;
                user_bytes = kernel_bytes;
        }

        if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
                return -EFAULT;
        return 0;
}

/*
 * Shared body of the native and compat mbind entry points: splits @mode into
 * the policy mode and its MPOL_MODE_FLAGS bits, validates both, reads the
 * node mask from user space and calls do_mbind().
 *
 * Returns -EINVAL for an out-of-range mode or when both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the get_nodes()
 * error if the mask cannot be read, otherwise the do_mbind() result.
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
        nodemask_t nodes;
        int err;

        start = untagged_addr(start);
        mode &= ~MPOL_MODE_FLAGS;

        if (mode >= MPOL_MAX)
                return -EINVAL;

        /* Static and relative node interpretation are mutually exclusive. */
        if (mode_flags & MPOL_F_STATIC_NODES) {
                if (mode_flags & MPOL_F_RELATIVE_NODES)
                        return -EINVAL;
        }

        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* mbind system call entry: passes its arguments straight to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        unsigned short flags = mode & MPOL_MODE_FLAGS;
        nodemask_t nodes;
        int err;

        /* What is left after stripping the flag bits is the policy mode. */
        mode &= ~MPOL_MODE_FLAGS;
        if ((unsigned int)mode >= MPOL_MAX)
                return -EINVAL;

        /* Static and relative node interpretation cannot be combined. */
        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;

        err = get_nodes(&nodes, nmask, maxnode);
        if (!err)
                return do_set_mempolicy(mode, flags, &nodes);

        return err;
}

/*
 * set_mempolicy system call entry: passes its arguments straight to
 * kernel_set_mempolicy().
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * Shared body of the native and compat migrate_pages entry points.
 *
 * Reads the @old_nodes and @new_nodes masks from user space, looks up the
 * task named by @pid (the current task when @pid is 0), performs the
 * permission checks below and then calls do_migrate_pages() on that task's
 * mm.
 *
 * Returns the do_migrate_pages() result, or a negative errno: -ENOMEM if no
 * scratch nodemask is available, the get_nodes() error, -ESRCH if the task
 * does not exist, -EPERM if a permission check fails, -EINVAL if the
 * destination mask is empty after restriction or the task has no mm, or the
 * security_task_movememory() error.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both node masks live in the scratch area. */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Take a task reference; every exit from here on must drop it. */
        get_task_struct(task);

        /* Default result for the "goto out_put" below that sets no error. */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the destination to nodes the caller itself may use. */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /* The mm reference replaces the task reference from here on. */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        /* Callers with CAP_SYS_NICE get MPOL_MF_MOVE_ALL, others MPOL_MF_MOVE. */
        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

        /* Error exit for paths that still hold the task reference. */
out_put:
        put_task_struct(task);
        goto out;

}

/*
 * migrate_pages system call entry: passes its arguments straight to
 * kernel_migrate_pages().
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}


/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int err;

        /* A supplied mask buffer must have room for at least nr_node_ids. */
        if (nmask != NULL && maxnode < nr_node_ids)
                return -EINVAL;

        addr = untagged_addr(addr);

        err = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (err)
                return err;

        /* Both outputs are optional; a NULL pointer skips that output. */
        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (!nmask)
                return err;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/*
 * get_mempolicy system call entry: passes its arguments straight to
 * kernel_get_mempolicy().
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

#ifdef CONFIG_COMPAT

/*
 * Compat get_mempolicy entry.
 *
 * The native implementation writes the node mask as an array of unsigned
 * long, so it is pointed at a temporary area obtained from
 * compat_alloc_user_space(); the result is then read back and rewritten
 * into the caller's buffer with compat_put_bitmap().
 *
 * Returns the kernel_get_mempolicy() result, or -EFAULT if converting the
 * mask back into the caller's buffer faults.
 */
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                       compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode,
                       compat_ulong_t, addr, compat_ulong_t, flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        unsigned long copy_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
        if (err || !nmask)
                return err;

        /*
         * copy_from_user() and clear_user() return the number of bytes left
         * unprocessed, not an errno, so their results must not be OR-ed into
         * the return value; any failure here is reported as -EFAULT.
         */
        copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
        if (copy_from_user(bm, nm, copy_size))
                return -EFAULT;
        /* ensure entire bitmap is zeroed */
        if (clear_user(nmask, ALIGN(maxnode-1, 8) / 8))
                return -EFAULT;
        if (compat_put_bitmap(nmask, bm, nr_bits))
                return -EFAULT;

        return 0;
}

/*
 * Compat set_mempolicy entry: converts the caller's compat bitmap into an
 * unsigned long bitmap in a compat_alloc_user_space() area and hands that to
 * kernel_set_mempolicy().  A NULL @nmask is passed through as NULL.
 */
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode)
{
        unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        unsigned long alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        unsigned long __user *nm;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        if (!nmask)
                return kernel_set_mempolicy(mode, NULL, nr_bits+1);

        if (compat_get_bitmap(bm, nmask, nr_bits))
                return -EFAULT;

        nm = compat_alloc_user_space(alloc_size);
        if (copy_to_user(nm, bm, alloc_size))
                return -EFAULT;

        return kernel_set_mempolicy(mode, nm, nr_bits+1);
}

/*
 * Compat mbind entry: converts the caller's compat bitmap into an unsigned
 * long bitmap in a compat_alloc_user_space() area and hands that to
 * kernel_mbind().  A NULL @nmask is passed through as NULL.
 */
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
        unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        unsigned long alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        unsigned long __user *nm = NULL;
        nodemask_t bm;

        if (!nmask)
                goto bind;

        if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
                return -EFAULT;

        nm = compat_alloc_user_space(alloc_size);
        if (copy_to_user(nm, nodes_addr(bm), alloc_size))
                return -EFAULT;

bind:
        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
}

/*
 * Compat migrate_pages entry: converts each supplied compat bitmap into an
 * unsigned long bitmap in a compat_alloc_user_space() area and hands the
 * converted masks to kernel_migrate_pages().  A NULL mask is passed through
 * as NULL.
 */
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
                       compat_ulong_t, maxnode,
                       const compat_ulong_t __user *, old_nodes,
                       const compat_ulong_t __user *, new_nodes)
{
        unsigned long __user *old = NULL;
        unsigned long __user *new = NULL;
        nodemask_t tmp_mask;
        unsigned long nr_bits;
        unsigned long size;

        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
        /* Bytes per converted mask, rounded up to whole longs. */
        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        if (old_nodes) {
                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
                        return -EFAULT;
                /*
                 * When both masks are given, one allocation holds both:
                 * the new mask is placed directly after the old one.
                 */
                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
                if (new_nodes)
                        new = old + size / sizeof(unsigned long);
                if (copy_to_user(old, nodes_addr(tmp_mask), size))
                        return -EFAULT;
        }
        if (new_nodes) {
                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
                        return -EFAULT;
                /* No old mask was given, so nothing has been allocated yet. */
                if (new == NULL)
                        new = compat_alloc_user_space(size);
                if (copy_to_user(new, nodes_addr(tmp_mask), size))
                        return -EFAULT;
        }
        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}

#endif /* CONFIG_COMPAT */

/*
 * Report whether pages mapped by @vma may be migrated.  Returns false for
 * VM_IO/VM_PFNMAP mappings, DAX mappings, hugetlb mappings whose hstate does
 * not support migration, and file mappings whose allocation zone is below
 * policy_zone; true otherwise.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings require predictable access latency, so avoid
         * incurring periodic faults.
         */
        if (vma_is_dax(vma))
                return false;

        if (is_vm_hugetlb_page(vma) &&
            !hugepage_migration_supported(hstate_vma(vma)))
                return false;

        /* Anonymous memory has no mapping gfp mask to restrict it. */
        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates pages in the highest zone. If we cannot
         * do so then migration (at least from node to node) is not
         * possible.
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
                        >= policy_zone;
}

/*
 * Look up the policy attached to @vma for @addr, without falling back to
 * the task policy.  Returns NULL when @vma is NULL or carries no policy.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
{
        struct mempolicy *pol;

        if (!vma)
                return NULL;

        /* A get_policy() vm_op, when present, takes precedence. */
        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr);

        pol = vma->vm_policy;

        /*
         * shmem_alloc_page() passes MPOL_F_SHARED policy with
         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
         * count on these policies which will be dropped by
         * mpol_cond_put() later
         */
        if (pol && mpol_needs_cond_ref(pol))
                mpol_get(pol);

        return pol;
}

/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
{
        struct mempolicy *vma_pol = __get_vma_policy(vma, addr);

        /* No VMA-level policy: use whatever applies to the current task. */
        return vma_pol ? vma_pol : get_task_policy(current);
}

/*
 * Report whether the policy that applies to @vma has MPOL_F_MOF set.  The
 * policy comes from the get_policy() vm_op when one exists (queried at
 * vma->vm_start), otherwise from vma->vm_policy, otherwise from the current
 * task.
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;
        bool mof;

        if (!vma->vm_ops || !vma->vm_ops->get_policy) {
                pol = vma->vm_policy;
                if (!pol)
                        pol = get_task_policy(current);
                return pol->flags & MPOL_F_MOF;
        }

        pol = vma->vm_ops->get_policy(vma, vma->vm_start);
        mof = pol && (pol->flags & MPOL_F_MOF);
        /* Release the policy obtained from the vm_op once it has been read. */
        mpol_cond_put(pol);

        return mof;
}

static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->v.nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->v.nodes is intersect with node_states[N_MEMORY].
         * so if the following test faile, it implies
         * policy->v.nodes has movable memory only.
         */
        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
        /* Only MPOL_BIND restricts the allocation to a nodemask. */
        if (!unlikely(policy->mode == MPOL_BIND))
                return NULL;

        /* Lower zones don't get a nodemask applied for MPOL_BIND */
        if (!apply_policy_zone(policy, gfp_zone(gfp)))
                return NULL;

        if (!cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
                return NULL;

        return &policy->v.nodes;
}

/* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
        /* An explicit (non-local) preferred node overrides the caller's id. */
        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
                return policy->v.preferred_node;

        /*
         * __GFP_THISNODE shouldn't even be used with the bind policy
         * because we might easily break the expectation to stay on the
         * requested node and not break the policy.
         */
        WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));

        return nd;
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        struct task_struct *tsk = current;
        unsigned nid = next_node_in(tsk->il_prev, policy->v.nodes);

        /* Remember the node only when the lookup produced a valid one. */
        if (nid < MAX_NUMNODES)
                tsk->il_prev = nid;

        return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
        struct mempolicy *pol;
        int local_nid = numa_mem_id();

        /* Interrupt context never consults the task policy. */
        if (in_interrupt())
                return local_nid;

        pol = current->mempolicy;
        if (!pol)
                return local_nid;
        if (pol->flags & MPOL_F_LOCAL)
                return local_nid;

        switch (pol->mode) {
        case MPOL_PREFERRED:
                /*
                 * handled MPOL_F_LOCAL above
                 */
                return pol->v.preferred_node;

        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);

        case MPOL_BIND: {
                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node.
                 */
                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
                struct zonelist *zonelist =
                        &NODE_DATA(local_nid)->node_zonelists[ZONELIST_FALLBACK];
                struct zoneref *z = first_zones_zonelist(zonelist,
                                                         highest_zoneidx,
                                                         &pol->v.nodes);

                /* Use the local node if no zone in the mask was found. */
                if (z->zone)
                        return zone_to_nid(z->zone);
                return local_nid;
        }

        default:
                BUG();
        }
}

/*
 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
 * number of present nodes.
 */
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned hops;
        int nid;

        /* Empty mask: nothing to interleave over, use the local node. */
        if (!nnodes)
                return numa_node_id();

        /* Step from the first node in the mask to the (n % nnodes)'th one. */
        nid = first_node(pol->v.nodes);
        for (hops = (unsigned int)n % nnodes; hops; hops--)
                nid = next_node(nid, pol->v.nodes);

        return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long addr, int shift)
{
        unsigned long off;

        /* Without a VMA there is no offset to key on. */
        if (!vma)
                return interleave_nodes(pol);

        /*
         * for small pages, there is no difference between
         * shift and PAGE_SHIFT, so the bit-shift is safe.
         * for huge pages, since vm_pgoff is in units of small
         * pages, we need to shift off the always 0 bits to get
         * a useful offset.
         */
        BUG_ON(shift < PAGE_SHIFT);
        off = (vma->vm_pgoff >> (shift - PAGE_SHIFT)) +
              ((addr - vma->vm_start) >> shift);

        return offset_il_node(pol, off);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);
        int nid;

        *mpol = pol;
        *nodemask = NULL;        /* assume !MPOL_BIND */

        /* Interleave picks the node from the huge-page-sized offset. */
        if (unlikely(pol->mode == MPOL_INTERLEAVE))
                return interleave_nid(pol, vma, addr,
                                      huge_page_shift(hstate_vma(vma)));

        nid = policy_node(gfp_flags, pol, numa_node_id());
        if (pol->mode == MPOL_BIND)
                *nodemask = &pol->v.nodes;

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *pol;
        int nid;

        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        pol = current->mempolicy;
        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* 'local' resolves to the node we are running on. */
                nid = (pol->flags & MPOL_F_LOCAL) ? numa_node_id()
                                                  : pol->v.preferred_node;
                init_nodemask_of_node(mask, nid);
                break;

        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                *mask = pol->v.nodes;
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *pol;
        bool intersects = true;

        if (!mask)
                return intersects;

        task_lock(tsk);
        pol = tsk->mempolicy;
        if (pol) {
                switch (pol->mode) {
                case MPOL_PREFERRED:
                        /*
                         * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred
                         * nodes to allocate from, they may fallback to other
                         * nodes when oom.  Thus, it's possible for tsk to
                         * have allocated memory from nodes in mask.
                         */
                        break;
                case MPOL_BIND:
                case MPOL_INTERLEAVE:
                        intersects = nodes_intersects(pol->v.nodes, *mask);
                        break;
                default:
                        BUG();
                }
        }
        task_unlock(tsk);

        return intersects;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                        unsigned nid)
{
        struct page *page = __alloc_pages(gfp, order, nid);

        /*
         * Count an interleave hit only when NUMA stats are enabled and the
         * page really came from the requested node.
         */
        if (static_branch_likely(&vm_numa_stat_key) &&
            page && page_to_nid(page) == nid) {
                preempt_disable();
                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
                preempt_enable();
        }

        return page;
}

/**
 * alloc_pages_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * @order:    Order of the GFP allocation.
 * @vma:      Pointer to VMA or NULL if not available.
 * @addr:     Virtual Address of the allocation. Must be inside the VMA.
 * @node:     Which node to prefer for allocation (modulo policy).
 * @hugepage: for hugepages try only the preferred node if possible
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must read-lock the mmap_lock of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into user space. Returns
 * NULL when no page can be allocated.
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, int node, bool hugepage)
{
        struct mempolicy *pol;
        struct page *page;
        int preferred_nid;
        nodemask_t *nmask;

        pol = get_vma_policy(vma, addr);

        /* Interleave: pick the node from the policy and account the hit */
        if (pol->mode == MPOL_INTERLEAVE) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                /* pol is not used past this point on this path */
                mpol_cond_put(pol);
                page = alloc_page_interleave(gfp, order, nid);
                goto out;
        }

        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
                int hpage_node = node;

                /*
                 * For hugepage allocation and non-interleave policy which
                 * allows the current node (or other explicitly preferred
                 * node) we only try to allocate from the current/preferred
                 * node and don't fall back to other nodes, as the cost of
                 * remote accesses would likely offset THP benefits.
                 *
                 * If the policy is interleave, or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
                        hpage_node = pol->v.preferred_node;

                nmask = policy_nodemask(gfp, pol);
                if (!nmask || node_isset(hpage_node, *nmask)) {
                        /* pol itself is not dereferenced again below */
                        mpol_cond_put(pol);
                        /*
                         * First, try to allocate THP only on local node, but
                         * don't reclaim unnecessarily, just compact.
                         */
                        page = __alloc_pages_node(hpage_node,
                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);

                        /*
                         * If hugepage allocations are configured to always
                         * synchronous compact or the vma has been madvised
                         * to prefer hugepage backing, retry allowing remote
                         * memory with both reclaim and compact as well.
                         */
                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
                                page = __alloc_pages_nodemask(gfp, order,
                                                        hpage_node, nmask);

                        goto out;
                }
        }

        /* Standard path: allocate from the policy's preferred node and mask */
        nmask = policy_nodemask(gfp, pol);
        preferred_nid = policy_node(gfp, pol, node);
        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
        mpol_cond_put(pol);
out:
        return page;
}
EXPORT_SYMBOL(alloc_pages_vma);

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER    user allocation,
 *      %GFP_KERNEL  kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS      don't call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  The NUMA policy of the
 * current process is applied unless we are in interrupt context or
 * __GFP_THISNODE was requested; in those cases the system default policy
 * is used instead.
 * Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol;

        /*
         * No reference counting needed for current->mempolicy
         * nor system default_policy
         */
        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        if (pol->mode == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));

        return __alloc_pages_nodemask(gfp, order,
                                      policy_node(gfp, pol, numa_node_id()),
                                      policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/*
 * Give @dst its own copy of @src's VMA policy.  Returns 0 on success or
 * the error encoded in the pointer returned by mpol_dup().
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy;

        copy = mpol_dup(vma_policy(src));
        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/*
 * Slow path of a mempolicy duplicate: allocate a new mempolicy and copy
 * @old into it.  Returns the copy with its refcnt set to 1, or
 * ERR_PTR(-ENOMEM) when the allocation fails.
 */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new;

        new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        if (old != current->mempolicy) {
                *new = *old;
        } else {
                /* task's mempolicy is protected by alloc_lock */
                task_lock(current);
                *new = *old;
                task_unlock(current);
        }

        /* Rebind the copy while current's cpuset is being rebound */
        if (current_cpuset_is_being_rebound()) {
                nodemask_t allowed = cpuset_mems_allowed(current);

                mpol_rebind_policy(new, &allowed);
        }

        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.  Two policies are equal when mode,
 * flags, the user nodemask (if one is stored) and the mode-specific node
 * data all match.  A NULL policy never compares equal here.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b || a->mode != b->mode || a->flags != b->flags)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                return !!nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                /* a's ->flags is the same as b's, so one check covers both */
                return (a->flags & MPOL_F_LOCAL) ||
                       a->v.preferred_node == b->v.preferred_node;
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Find the first (lowest) element that intersects the range start-end.
 * Caller holds sp->lock for reading or for writing.  Returns NULL when
 * nothing in the tree overlaps the range.
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *cur = sp->root.rb_node;
        struct rb_node *prev;

        /* Descend until some node overlapping the range is found */
        while (cur) {
                struct sp_node *ent = rb_entry(cur, struct sp_node, nd);

                if (start >= ent->end)
                        cur = cur->rb_right;
                else if (end <= ent->start)
                        cur = cur->rb_left;
                else
                        break;
        }
        if (!cur)
                return NULL;

        /* Step back over predecessors that still end after start */
        for (prev = rb_prev(cur); prev; prev = rb_prev(cur)) {
                if (rb_entry(prev, struct sp_node, nd)->end <= start)
                        break;
                cur = prev;
        }

        return rb_entry(cur, struct sp_node, nd);
}

/*
 * Link @new into the shared policy tree of @sp.  Caller holds sp->lock
 * for writing.  A range that is neither entirely below nor extends
 * above the node it is compared against is a BUG().
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);
                if (new->start < cur->start)
                        link = &parent->rb_left;
                else if (new->end > cur->end)
                        link = &parent->rb_right;
                else
                        BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->mode : 0);
}

/*
 * Find the shared policy covering page offset @idx.  Returns the policy
 * after mpol_get() has been called on it, or NULL when the tree is empty
 * or no entry covers @idx.
 */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (sp->root.rb_node) {
                read_lock(&sp->lock);
                sn = sp_lookup(sp, idx, idx + 1);
                if (sn) {
                        pol = sn->policy;
                        mpol_get(pol);
                }
                read_unlock(&sp->lock);
        }

        return pol;
}

/* Drop the node's policy reference and return the node to sn_cache. */
static void sp_free(struct sp_node *n)
{
        struct mempolicy *pol = n->policy;

        mpol_put(pol);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Look up the policy node id for vma,addr and compare it to the page's
 * node id.
 *
 * Returns:
 *        -1        - not misplaced, page is in the right node
 *        node        - node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol;
        struct zoneref *z;
        int curnid = page_to_nid(page);
        unsigned long pgoff;
        int thiscpu = raw_smp_processor_id();
        int thisnid = cpu_to_node(thiscpu);
        int polnid = NUMA_NO_NODE;
        int ret = -1;

        pol = get_vma_policy(vma, addr);
        /* Policies without MPOL_F_MOF never report a page as misplaced */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                /* Page offset of addr: the VMA's offset plus pages into it */
                pgoff = vma->vm_pgoff;
                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
                polnid = offset_il_node(pol, pgoff);
                break;

        case MPOL_PREFERRED:
                if (pol->flags & MPOL_F_LOCAL)
                        polnid = numa_node_id();
                else
                        polnid = pol->v.preferred_node;
                break;

        case MPOL_BIND:

                /*
                 * allows binding to multiple nodes.
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->v.nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->v.nodes);
                polnid = zone_to_nid(z->zone);
                break;

        default:
                BUG();
        }

        /* Migrate the page towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* Overrides the node chosen by the switch above */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
                        goto out;
        }

        /* Report the target node only if the page is not already there */
        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}

/*
 * Release the reference held through task->mempolicy, which may be the
 * last one.  The pointer is cleared under task_lock() first and the put
 * happens afterwards, so that any allocation done as part of the policy's
 * kmem_cache_free(), such as by KASAN, doesn't reference a freed policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *old;

        task_lock(task);
        old = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(old);
}

/*
 * Unlink @n from the shared policy tree of @sp and free it with sp_free().
 * The callers visible in this file hold sp->lock for writing.
 */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        /* Print the range as "start-end", like the message in sp_insert() */
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        sp_free(n);
}

/*
 * Fill in the range and policy of @node.  The rb-tree linkage is left
 * untouched.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->policy = pol;
        node->start = start;
        node->end = end;
}

/*
 * Allocate an sp_node covering start-end that carries its own copy of
 * @pol, with MPOL_F_SHARED set on the copy.  Returns NULL when either the
 * node or the policy copy cannot be allocated.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        struct mempolicy *copy;

        if (!n)
                return NULL;

        copy = mpol_dup(pol);
        if (!IS_ERR(copy)) {
                copy->flags |= MPOL_F_SHARED;
                sp_node_init(n, start, end, copy);
                return n;
        }

        /* Duplicating the policy failed: give the node back */
        kmem_cache_free(sn_cache, n);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Removes or trims every existing node that overlaps start-end and then
 * inserts @new (if non-NULL).  When an old node spans the whole new range
 * it has to be split in two, which needs an extra sp_node and mempolicy;
 * those are allocated with GFP_KERNEL after dropping sp->lock, and the
 * walk is restarted from scratch.
 *
 * Returns 0 on success or -ENOMEM if the extra allocations fail.  On
 * failure @new has not been inserted.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;
        struct mempolicy *mpol_new = NULL;
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node starts inside the new range */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* Need a preallocated node for the tail */
                                if (!n_new)
                                        goto alloc_new;

                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* Ownership moved to the tree */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* Free preallocations that ended up unused */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* Drop the lock before the GFP_KERNEL allocations, then start over */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        /* Give it a valid refcnt so that err_out can mpol_put() it */
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Initialize @sp to an empty tree and, if @mpol is non-NULL, install a
 * contextualized copy of it in the inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * Failures along the way are not reported to the caller; the tree is
 * then simply left empty.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new))
                        goto free_scratch; /* no valid nodemask intersection */

                /* mpol_set_nodemask() is called with current's task lock held */
                task_lock(current);
                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_new;

                /* Create pseudo-vma that contains just the policy */
                vma_init(&pvma, NULL);
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
                mpol_put(new);                        /* drop initial ref */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

/*
 * Record @npol for the page-offset range covered by @vma in @info,
 * replacing whatever was stored for that range.  With a NULL @npol the
 * existing entries in the range are removed or trimmed and nothing is
 * inserted.  Returns 0 on success or -ENOMEM.
 */
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);
        unsigned long first, last;
        int err;

        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->mode : -1,
                 npol ? npol->flags : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

        first = vma->vm_pgoff;
        last = first + sz;

        if (npol) {
                new = sp_alloc(first, last, npol);
                if (!new)
                        return -ENOMEM;
        }

        err = shared_policy_replace(info, first, last, new);
        /* On failure the new node was not inserted, so free it here */
        if (err && new)
                sp_free(new);

        return err;
}

/*
 * Free a backing policy store on inode delete: every node of the tree is
 * removed and freed under the write lock.  An empty tree is left alone.
 */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct rb_node *cur, *following;

        if (!p->root.rb_node)
                return;

        write_lock(&p->lock);
        for (cur = rb_first(&p->root); cur; cur = following) {
                /* Look up the successor before the node is erased */
                following = rb_next(cur);
                sp_delete(p, rb_entry(cur, struct sp_node, nd));
        }
        write_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
/* Set by setup_numabalancing(): 1 enables, -1 disables, 0 means not given */
static int __initdata numabalancing_override;

/*
 * Decide the initial NUMA balancing state.  An explicit override from the
 * numa_balancing= parameter wins; otherwise, on systems with more than one
 * online node, the build-time default is applied and announced.
 */
static void __init check_numabalancing_enable(void)
{
        bool numabalancing_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() > 1) {
                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                        numabalancing_default ? "Enabling" : "Disabling");
                set_numabalancing_state(numabalancing_default);
        }
}

/*
 * Handler for the "numa_balancing=" parameter.  Accepts "enable" or
 * "disable" and records the choice in numabalancing_override.  Returns 1
 * when the value was recognised; otherwise warns and returns 0.
 */
static int __init setup_numabalancing(char *str)
{
        int choice = 0;

        if (str) {
                if (!strcmp(str, "enable"))
                        choice = 1;
                else if (!strcmp(str, "disable"))
                        choice = -1;
        }

        if (!choice) {
                pr_warn("Unable to parse numa_balancing=\n");
                return 0;
        }

        numabalancing_override = choice;
        return 1;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Without CONFIG_NUMA_BALANCING there is nothing to set up. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time setup: create the slab caches used for mempolicies and shared
 * policy nodes, fill in the per-node preferred policies, and switch the
 * current task to an interleave policy.
 *
 * assumes fs == KERNEL_DS
 */
void __init numa_policy_init(void)
{
        nodemask_t il_nodes;
        unsigned long most_pages = 0;
        int biggest = 0;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        /* One MPOL_PREFERRED policy per node, flagged MOF | MORON */
        for_each_node(nid) {
                preferred_node_policy[nid] = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .v = { .preferred_node = nid, },
                };
        }

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(il_nodes);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long pages = node_present_pages(nid);

                /* Remember the largest node seen so far */
                if (pages > most_pages) {
                        most_pages = pages;
                        biggest = nid;
                }

                /* Big enough to take part in interleaving? */
                if ((pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, il_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(il_nodes)))
                node_set(biggest, il_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &il_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/*
 * Reset policy of current process to default by calling
 * do_set_mempolicy() with MPOL_DEFAULT, no flags and no nodemask.
 * The return value of do_set_mempolicy() is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * Names of the policy modes, indexed by MPOL_* mode value.
 * mpol_parse_str() matches the mode string against this table and
 * mpol_to_str() prints from it.
 *
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local",
};


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * @str is temporarily split in place at the '=' and ':' separators while
 * parsing and restored before returning.
 *
 * On success, returns 0, else 1.  *@mpol is written only on success; for
 * the "default" mode it is set to NULL.
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* Every requested node must be one that has memory */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        /* str now holds only the mode name */
        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        /* Anything but plain digits means more than one node */
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                /* "local" is MPOL_PREFERRED without a nodelist (see below) */
                mode = MPOL_PREFERRED;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on a empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                /* new is still NULL here, so success stores NULL in *mpol */
                goto out;
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED)
                new->v.nodes = nodes;
        else if (nodelist)
                new->v.preferred_node = first_node(nodes);
        else
                new->flags |= MPOL_F_LOCAL;

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string of the form <mode>[=<flags>][:<nodelist>].
 * A NULL @pol, the system default policy and policies with MPOL_F_MORON
 * set are all printed as "default".  If @buffer is too short, truncate
 * the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
                break;
        case MPOL_PREFERRED:
                /* MPOL_PREFERRED with MPOL_F_LOCAL is shown as "local" */
                if (flags & MPOL_F_LOCAL)
                        mode = MPOL_LOCAL;
                else
                        node_set(pol->v.preferred_node, nodes);
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                snprintf(p, maxlen, "unknown");
                return;
        }

        /*
         * Use scnprintf() wherever the return value advances @p: it returns
         * the number of characters actually stored, so @p never moves past
         * the end of @buffer and the remaining size "buffer + maxlen - p"
         * cannot go negative when the output is truncated.
         */
        p += scnprintf(p, maxlen, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, buffer + maxlen - p, "=");

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "relative");
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}




























































































































































































































































































































































































    2 
    2 



    2 





    2 
    2 





















































































































    2 
    2 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/mempool.c
 *
 *  memory buffer pool support. Such pools are mostly used
 *  for guaranteed, deadlock-free memory allocations during
 *  extreme VM load.
 *
 *  started by Ingo Molnar, Copyright (C) 2001
 *  debugging by David Rientjes, Copyright (C) 2015
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"

#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
/*
 * Report a poison mismatch detected in a reserved element of @pool.
 *
 * Prints the pool pointer, the element size, the current number of
 * reserved elements and a hex dump of the bytes of @element in a window
 * of BITS_PER_LONG / 8 bytes on either side of offset @byte (clamped to
 * the range [0, @size]), followed by a stack dump.
 */
static void poison_error(mempool_t *pool, void *element, size_t size,
                         size_t byte)
{
        const int in_pool = pool->curr_nr;
        const int first = max_t(int, byte - (BITS_PER_LONG / 8), 0);
        const int last = min_t(int, byte + (BITS_PER_LONG / 8), size);
        const u8 *bytes = element;
        int pos = first;

        pr_err("BUG: mempool element poison mismatch\n");
        pr_err("Mempool %p size %zu\n", pool, size);
        /* leading/trailing "..." mark a dump that does not reach the element's edges */
        pr_err(" nr=%d @ %p: %s0x", in_pool, element, first > 0 ? "... " : "");
        while (pos < last) {
                pr_cont("%x ", bytes[pos]);
                pos++;
        }
        pr_cont("%s\n", last < size ? "..." : "");
        dump_stack();
}

/*
 * Verify that the @size bytes at @element still hold the free-poison
 * pattern: POISON_FREE in every byte except the last one, which must be
 * POISON_END.
 *
 * The first mismatching byte is reported through poison_error() and the
 * element is then left as it is.  If every byte matches, the element is
 * refilled with POISON_INUSE.
 */
static void __check_element(mempool_t *pool, void *element, size_t size)
{
        u8 *bytes = element;
        size_t off;

        for (off = 0; off < size; off++) {
                u8 want = POISON_FREE;

                /* only the final byte carries the end marker */
                if (off >= size - 1)
                        want = POISON_END;
                if (bytes[off] == want)
                        continue;

                poison_error(pool, element, size, off);
                return;
        }
        memset(bytes, POISON_INUSE, size);
}

/*
 * Check the poison pattern of @element as it leaves the reserve of @pool.
 *
 * The backing allocator is recognised by the pool's free callback:
 * slab- and kmalloc-backed elements are checked over ksize() bytes,
 * page-backed elements are mapped with kmap_atomic() and checked over
 * 1UL << (PAGE_SHIFT + order) bytes.  Elements freed by any other
 * callback are not checked.
 */
static void check_element(mempool_t *pool, void *element)
{
        int order;
        void *addr;

        /* Mempools backed by slab allocator */
        if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
                __check_element(pool, element, ksize(element));
                return;
        }
        if (pool->free != mempool_free_pages)
                return;

        /* Mempools backed by page allocator: pool_data holds the order */
        order = (int)(long)pool->pool_data;
        addr = kmap_atomic((struct page *)element);
        __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
        kunmap_atomic(addr);
}

/*
 * Fill the @size bytes at @element with the free-poison pattern:
 * POISON_FREE everywhere except the last byte, which is set to
 * POISON_END.  @size is expected to be at least 1.
 */
static void __poison_element(void *element, size_t size)
{
        u8 *bytes = element;
        const size_t last = size - 1;

        memset(bytes, POISON_FREE, last);
        bytes[last] = POISON_END;
}

/*
 * Write the free-poison pattern into @element before it is stored in the
 * reserve of @pool.
 *
 * The backing allocator is recognised by the pool's alloc callback:
 * slab- and kmalloc-backed elements are poisoned over ksize() bytes,
 * page-backed elements are mapped with kmap_atomic() and poisoned over
 * 1UL << (PAGE_SHIFT + order) bytes.  Elements from any other allocator
 * are left alone.
 */
static void poison_element(mempool_t *pool, void *element)
{
        int order;
        void *addr;

        /* Mempools backed by slab allocator */
        if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
                __poison_element(element, ksize(element));
                return;
        }
        if (pool->alloc != mempool_alloc_pages)
                return;

        /* Mempools backed by page allocator: pool_data holds the order */
        order = (int)(long)pool->pool_data;
        addr = kmap_atomic((struct page *)element);
        __poison_element(addr, 1UL << (PAGE_SHIFT + order));
        kunmap_atomic(addr);
}
#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
/*
 * No-op variant used when neither CONFIG_DEBUG_SLAB nor
 * CONFIG_SLUB_DEBUG_ON is defined: reserved elements are not checked
 * for poison corruption.
 */
static inline void check_element(mempool_t *pool, void *element)
{
}
/*
 * No-op variant used when neither CONFIG_DEBUG_SLAB nor
 * CONFIG_SLUB_DEBUG_ON is defined: reserved elements are not poisoned.
 */
static inline void poison_element(mempool_t *pool, void *element)
{
}
#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */

/*
 * Tell KASAN that @element is no longer in use while it sits in the
 * reserve of @pool.  Page-backed elements go through kasan_free_pages()
 * with the order stored in pool_data; slab- and kmalloc-backed elements
 * go through kasan_poison_kfree().  Other allocators are ignored.
 *
 * _RET_IP_ is evaluated here, inside the always-inlined body.
 */
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
        if (pool->alloc == mempool_alloc_pages) {
                kasan_free_pages(element, (unsigned long)pool->pool_data);
                return;
        }
        if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
                kasan_poison_kfree(element, _RET_IP_);
}

/*
 * Tell KASAN that @element is in use again as it leaves the reserve of
 * @pool.  Page-backed elements go through kasan_alloc_pages() with the
 * order stored in pool_data; slab- and kmalloc-backed elements go
 * through kasan_unpoison_slab().  Other allocators are ignored.
 */
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
        if (pool->alloc == mempool_alloc_pages) {
                kasan_alloc_pages(element, (unsigned long)pool->pool_data);
                return;
        }
        if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
                kasan_unpoison_slab(element);
}

/*
 * Push @element onto the reserve of @pool.
 *
 * The element is poisoned (debug builds) and handed to KASAN as unused
 * before it is stored in the next free slot.  Adding to a pool that
 * already holds min_nr elements is a bug.
 */
static __always_inline void add_element(mempool_t *pool, void *element)
{
        BUG_ON(pool->curr_nr >= pool->min_nr);

        poison_element(pool, element);
        kasan_poison_element(pool, element);

        pool->elements[pool->curr_nr] = element;
        pool->curr_nr++;
}

/*
 * Pop the most recently added element off the reserve of @pool.
 *
 * The element is handed back to KASAN as in use and, on debug builds,
 * its poison pattern is verified before it is returned.
 *
 * Removing from an empty pool is a bug.  The check is made before the
 * array is indexed so that elements[-1] is never read and curr_nr is
 * never driven negative.
 *
 * Return: the removed element.
 */
static void *remove_element(mempool_t *pool)
{
        void *element;

        BUG_ON(pool->curr_nr <= 0);
        element = pool->elements[--pool->curr_nr];

        kasan_unpoison_element(pool, element);
        check_element(pool, element);
        return element;
}

/**
 * mempool_exit - exit a mempool initialized with mempool_init()
 * @pool:      pointer to the memory pool which was initialized with
 *             mempool_init().
 *
 * Hand every reserved element in @pool back through the pool's free_fn()
 * and free the array that tracked them.  @pool itself is not freed.
 * This function only sleeps if the free_fn() function sleeps.
 *
 * May be called on a zeroed but uninitialized mempool (i.e. allocated with
 * kzalloc()).
 */
void mempool_exit(mempool_t *pool)
{
        void *element;

        while (pool->curr_nr != 0) {
                element = remove_element(pool);
                pool->free(element, pool->pool_data);
        }

        kfree(pool->elements);
        /* leave no stale array pointer behind in @pool */
        pool->elements = NULL;
}
EXPORT_SYMBOL(mempool_exit);

/**
 * mempool_destroy - deallocate a memory pool
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().  May be %NULL, in which case nothing
 *             is done.
 *
 * Free all reserved elements in @pool (via mempool_exit()) and then
 * @pool itself.  This function only sleeps if the free_fn() function
 * sleeps.
 */
void mempool_destroy(mempool_t *pool)
{
        if (likely(pool)) {
                mempool_exit(pool);
                kfree(pool);
        }
}
EXPORT_SYMBOL(mempool_destroy);

/**
 * mempool_init_node - initialize a memory pool
 * @pool:      pointer to the memory pool that should be initialized
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 * @gfp_mask:  allocation flags used for the element array and passed to
 *             @alloc_fn for every preallocated element.
 * @node_id:   node passed to kmalloc_array_node() for the element array.
 *
 * Sets up the lock, wait queue and callbacks of @pool, allocates the
 * array that tracks reserved elements and preallocates @min_nr elements
 * through @alloc_fn.
 *
 * The reserve count is reset here, so the result does not depend on
 * @pool having been zeroed by the caller.
 *
 * If preallocating an element fails, everything allocated so far is
 * released again through mempool_exit().
 *
 * Return: %0 on success, -ENOMEM if the element array or one of the
 * preallocated elements could not be allocated.
 */
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                      mempool_free_t *free_fn, void *pool_data,
                      gfp_t gfp_mask, int node_id)
{
        spin_lock_init(&pool->lock);
        pool->min_nr        = min_nr;
        /* the preallocation loop below counts up from an empty reserve */
        pool->curr_nr        = 0;
        pool->pool_data = pool_data;
        pool->alloc        = alloc_fn;
        pool->free        = free_fn;
        init_waitqueue_head(&pool->wait);

        pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
                                            gfp_mask, node_id);
        if (!pool->elements)
                return -ENOMEM;

        /*
         * First pre-allocate the guaranteed number of buffers.
         */
        while (pool->curr_nr < pool->min_nr) {
                void *element;

                element = pool->alloc(gfp_mask, pool->pool_data);
                if (unlikely(!element)) {
                        /* undo the partial fill and free the array */
                        mempool_exit(pool);
                        return -ENOMEM;
                }
                add_element(pool, element);
        }

        return 0;
}
EXPORT_SYMBOL(mempool_init_node);

/**
 * mempool_init - initialize a memory pool
 * @pool:      pointer to the memory pool that should be initialized
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 *
 * Like mempool_create(), but initializes the pool in place (i.e. embedded
 * in another structure).  Allocations are made with %GFP_KERNEL and
 * without a node preference (%NUMA_NO_NODE).
 *
 * Return: %0 on success, negative error code otherwise.
 */
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                 mempool_free_t *free_fn, void *pool_data)
{
        int err;

        err = mempool_init_node(pool, min_nr, alloc_fn, free_fn,
                                pool_data, GFP_KERNEL, NUMA_NO_NODE);
        return err;
}
EXPORT_SYMBOL(mempool_init);

/**
 * mempool_create - create a memory pool
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 *
 * This function creates and allocates a guaranteed size, preallocated
 * memory pool, using %GFP_KERNEL and no node preference (%NUMA_NO_NODE).
 * The pool can be used from the mempool_alloc() and mempool_free()
 * functions. This function might sleep. Both the alloc_fn() and the free_fn()
 * functions might sleep - as long as the mempool_alloc() function is not called
 * from IRQ contexts.
 *
 * Return: pointer to the created memory pool object or %NULL on error.
 */
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                mempool_free_t *free_fn, void *pool_data)
{
        mempool_t *pool;

        pool = mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
                                   GFP_KERNEL, NUMA_NO_NODE);
        return pool;
}
EXPORT_SYMBOL(mempool_create);

/**
 * mempool_create_node - create a memory pool
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 * @gfp_mask:  allocation flags for the pool object and, through
 *             mempool_init_node(), for its contents.
 * @node_id:   node passed to kzalloc_node() and mempool_init_node().
 *
 * Allocates a zeroed pool object and initializes it with
 * mempool_init_node().  If initialization fails the pool object is
 * freed again.
 *
 * Return: pointer to the created memory pool object or %NULL on error.
 */
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
                               mempool_free_t *free_fn, void *pool_data,
                               gfp_t gfp_mask, int node_id)
{
        mempool_t *pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
        int err;

        if (!pool)
                return NULL;

        err = mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
                                gfp_mask, node_id);
        if (err) {
                kfree(pool);
                return NULL;
        }

        return pool;
}
EXPORT_SYMBOL(mempool_create_node);

/**
 * mempool_resize - resize an existing memory pool
 * @pool:       pointer to the memory pool which was allocated via
 *              mempool_create().
 * @new_min_nr: the new minimum number of elements guaranteed to be
 *              allocated for this pool.  Must be greater than zero.
 *
 * This function shrinks/grows the pool. In the case of growing,
 * it cannot be guaranteed that the pool will be grown to the new
 * size immediately, but new mempool_free() calls will refill it.
 * This function may sleep.
 *
 * Note, the caller must guarantee that no mempool_destroy is called
 * while this function is running. mempool_alloc() & mempool_free()
 * might be called (eg. from IRQ contexts) while this function executes.
 *
 * Return: %0 on success, -ENOMEM if the larger element array could not
 * be allocated.  A failure to allocate individual elements while
 * refilling a grown pool is not reported; %0 is still returned.
 */
int mempool_resize(mempool_t *pool, int new_min_nr)
{
        void *element;
        void **new_elements;
        unsigned long flags;

        BUG_ON(new_min_nr <= 0);
        might_sleep();

        spin_lock_irqsave(&pool->lock, flags);
        if (new_min_nr <= pool->min_nr) {
                /*
                 * Shrink (or keep) the pool: release surplus elements one
                 * at a time, dropping the lock around each free_fn() call.
                 */
                while (new_min_nr < pool->curr_nr) {
                        element = remove_element(pool);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        pool->free(element, pool->pool_data);
                        spin_lock_irqsave(&pool->lock, flags);
                }
                pool->min_nr = new_min_nr;
                goto out_unlock;
        }
        spin_unlock_irqrestore(&pool->lock, flags);

        /* Grow the pool: allocate the larger array without holding the lock */
        new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
                                     GFP_KERNEL);
        if (!new_elements)
                return -ENOMEM;

        spin_lock_irqsave(&pool->lock, flags);
        /* min_nr may have changed while the lock was dropped; re-check */
        if (unlikely(new_min_nr <= pool->min_nr)) {
                /* Raced, other resize will do our work */
                spin_unlock_irqrestore(&pool->lock, flags);
                kfree(new_elements);
                goto out;
        }
        /* Carry the currently reserved elements over into the new array */
        memcpy(new_elements, pool->elements,
                        pool->curr_nr * sizeof(*new_elements));
        kfree(pool->elements);
        pool->elements = new_elements;
        pool->min_nr = new_min_nr;

        /*
         * Refill up to the new minimum.  The lock is held at the top of
         * every iteration and dropped around each alloc_fn() call.
         */
        while (pool->curr_nr < pool->min_nr) {
                spin_unlock_irqrestore(&pool->lock, flags);
                element = pool->alloc(GFP_KERNEL, pool->pool_data);
                if (!element)
                        goto out;        /* lock not held; pool stays partly filled */
                spin_lock_irqsave(&pool->lock, flags);
                /* the pool may have been filled while the lock was dropped */
                if (pool->curr_nr < pool->min_nr) {
                        add_element(pool, element);
                } else {
                        spin_unlock_irqrestore(&pool->lock, flags);
                        pool->free(element, pool->pool_data);        /* Raced */
                        goto out;
                }
        }
out_unlock:
        spin_unlock_irqrestore(&pool->lock, flags);
out:
        return 0;
}
EXPORT_SYMBOL(mempool_resize);

/**
 * mempool_alloc - allocate an element from a specific memory pool
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 * @gfp_mask:  the usual allocation bitmask.
 *
 * this function only sleeps if the alloc_fn() function sleeps or
 * returns NULL. Note that due to preallocation, this function
 * *never* fails when called from process contexts. (it might
 * fail if called from an IRQ context.)
 * Note: using __GFP_ZERO is not supported.
 *
 * The underlying alloc_fn() is always tried first; the reserve is only
 * used when it fails.  %NULL is returned only when @gfp_mask does not
 * contain __GFP_DIRECT_RECLAIM and both alloc_fn() and the reserve came
 * up empty; otherwise the function waits and retries.
 *
 * Return: pointer to the allocated element or %NULL on error.
 */
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
        void *element;
        unsigned long flags;
        wait_queue_entry_t wait;
        gfp_t gfp_temp;

        VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
        might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

        gfp_mask |= __GFP_NOMEMALLOC;        /* don't allocate emergency reserves */
        gfp_mask |= __GFP_NORETRY;        /* don't loop in __alloc_pages */
        gfp_mask |= __GFP_NOWARN;        /* failures are OK */

        /* first attempt: same mask minus direct reclaim and IO */
        gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);

repeat_alloc:

        /* always try the underlying allocator before touching the reserve */
        element = pool->alloc(gfp_temp, pool->pool_data);
        if (likely(element != NULL))
                return element;

        /* allocator failed: fall back to a reserved element, if any */
        spin_lock_irqsave(&pool->lock, flags);
        if (likely(pool->curr_nr)) {
                element = remove_element(pool);
                spin_unlock_irqrestore(&pool->lock, flags);
                /* paired with rmb in mempool_free(), read comment there */
                smp_wmb();
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 */
                kmemleak_update_trace(element);
                return element;
        }

        /*
         * We use gfp mask w/o direct reclaim or IO for the first round.  If
         * alloc failed with that and @pool was empty, retry immediately.
         */
        if (gfp_temp != gfp_mask) {
                spin_unlock_irqrestore(&pool->lock, flags);
                gfp_temp = gfp_mask;
                goto repeat_alloc;
        }

        /* We must not sleep if !__GFP_DIRECT_RECLAIM */
        if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
                spin_unlock_irqrestore(&pool->lock, flags);
                return NULL;
        }

        /*
         * Let's wait for someone else to return an element to @pool.
         * The wait entry is queued while pool->lock is still held.
         */
        init_wait(&wait);
        prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);

        spin_unlock_irqrestore(&pool->lock, flags);

        /*
         * FIXME: this should be io_schedule().  The timeout is there as a
         * workaround for some DM problems in 2.6.18.
         */
        io_schedule_timeout(5*HZ);

        /* woken by mempool_free() or timed out: start over either way */
        finish_wait(&pool->wait, &wait);
        goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);

/**
 * mempool_free - return an element to the pool.
 * @element:   pool element pointer.  %NULL is accepted and ignored.
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 *
 * If the reserve of @pool holds fewer than min_nr elements, @element is
 * put back into the reserve and waiters are woken; otherwise it is
 * released through the pool's free_fn().
 *
 * this function only sleeps if the free_fn() function sleeps.
 */
void mempool_free(void *element, mempool_t *pool)
{
        unsigned long flags;

        if (unlikely(element == NULL))
                return;

        /*
         * Paired with the wmb in mempool_alloc().  The preceding read is
         * for @element and the following @pool->curr_nr.  This ensures
         * that the visible value of @pool->curr_nr is from after the
         * allocation of @element.  This is necessary for fringe cases
         * where @element was passed to this task without going through
         * barriers.
         *
         * For example, assume @p is %NULL at the beginning and one task
         * performs "p = mempool_alloc(...);" while another task is doing
         * "while (!p) cpu_relax(); mempool_free(p, ...);".  This function
         * may end up using curr_nr value which is from before allocation
         * of @p without the following rmb.
         */
        smp_rmb();

        /*
         * For correctness, we need a test which is guaranteed to trigger
         * if curr_nr + #allocated == min_nr.  Testing curr_nr < min_nr
         * without locking achieves that and refilling as soon as possible
         * is desirable.
         *
         * Because curr_nr visible here is always a value after the
         * allocation of @element, any task which decremented curr_nr below
         * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
         * incremented to min_nr afterwards.  If curr_nr gets incremented
         * to min_nr after the allocation of @element, the elements
         * allocated after that are subject to the same guarantee.
         *
         * Waiters happen iff curr_nr is 0 and the above guarantee also
         * ensures that there will be frees which return elements to the
         * pool waking up the waiters.
         */
        if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
                spin_lock_irqsave(&pool->lock, flags);
                /* re-check under the lock; the lockless test was only a hint */
                if (likely(pool->curr_nr < pool->min_nr)) {
                        add_element(pool, element);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        wake_up(&pool->wait);
                        return;
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }
        /* reserve is full: give the element back to the underlying allocator */
        pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);

/*
 * A commonly used alloc fn: allocate one object from the kmem_cache
 * passed as @pool_data.  The matching free fn is mempool_free_slab().
 *
 * A cache that has a constructor trips VM_BUG_ON().
 */
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
        struct kmem_cache *cache = pool_data;
        void *obj;

        VM_BUG_ON(cache->ctor);
        obj = kmem_cache_alloc(cache, gfp_mask);
        return obj;
}
EXPORT_SYMBOL(mempool_alloc_slab);

/*
 * A commonly used free fn: return @element to the kmem_cache passed as
 * @pool_data.
 */
void mempool_free_slab(void *element, void *pool_data)
{
        kmem_cache_free(pool_data, element);
}
EXPORT_SYMBOL(mempool_free_slab);

/*
 * A commonly used alloc fn that kmallocs the amount of memory specified
 * by @pool_data (the size is stored in the pointer value itself).  The
 * matching free fn is mempool_kfree().
 */
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
        return kmalloc((size_t)pool_data, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);

/*
 * A commonly used free fn that kfrees @element.  @pool_data is not used.
 */
void mempool_kfree(void *element, void *pool_data)
{
        kfree(element);
}
EXPORT_SYMBOL(mempool_kfree);

/*
 * A simple mempool-backed page allocator: allocates pages of the order
 * specified by @pool_data (the order is stored in the pointer value
 * itself).  The matching free fn is mempool_free_pages().
 */
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
{
        return alloc_pages(gfp_mask, (int)(long)pool_data);
}
EXPORT_SYMBOL(mempool_alloc_pages);

/*
 * Free fn for page-backed pools: releases the pages at @element, using
 * the order stored in the pointer value of @pool_data.
 */
void mempool_free_pages(void *element, void *pool_data)
{
        __free_pages(element, (int)(long)pool_data);
}
EXPORT_SYMBOL(mempool_free_pages);


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 














    2 
    2 




    2 





































































































    2 





















































































































    1 




    1 




    1 

















































    1 













    2 
































    2 






























































































































































































































































































































































































































    1 
    1 

    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the diskquota system for the LINUX operating system. QUOTA
 * is implemented using the BSD system call interface as the means of
 * communication with the user level. This file contains the generic routines
 * called by the different filesystems on allocation of an inode or block.
 * These routines take care of the administration needed to have a consistent
 * diskquota tracking system. The ideas of both user and group quotas are based
 * on the Melbourne quota system as used on BSD derived systems. The internal
 * implementation is based on one of the several variants of the LINUX
 * inode-subsystem with added complexity of the diskquota system.
 *
 * Author:        Marco van Wieringen <mvw@planets.elm.net>
 *
 * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
 *
 *                Revised list management to avoid races
 *                -- Bill Hawes, <whawes@star.net>, 9/98
 *
 *                Fixed races in dquot_transfer(), dqget() and dquot_alloc_...().
 *                As the consequence the locking was moved from dquot_decr_...(),
 *                dquot_incr_...() to calling functions.
 *                invalidate_dquots() now writes modified dquots.
 *                Serialized quota_off() and quota_on() for mount point.
 *                Fixed a few bugs in grow_dquots().
 *                Fixed deadlock in write_dquot() - we no longer account quotas on
 *                quota files
 *                remove_dquot_ref() moved to inode.c - it now traverses through inodes
 *                add_dquot_ref() restarts after blocking
 *                Added check for bogus uid and fixed check for group in quotactl.
 *                Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99
 *
 *                Used struct list_head instead of own list struct
 *                Invalidation of referenced dquots is no longer possible
 *                Improved free_dquots list management
 *                Quota and i_blocks are now updated in one place to avoid races
 *                Warnings are now delayed so we won't block in critical section
 *                Write updated not to require dquot lock
 *                Jan Kara, <jack@suse.cz>, 9/2000
 *
 *                Added dynamic quota structure allocation
 *                Jan Kara <jack@suse.cz> 12/2000
 *
 *                Rewritten quota interface. Implemented new quota format and
 *                formats registering.
 *                Jan Kara, <jack@suse.cz>, 2001,2002
 *
 *                New SMP locking.
 *                Jan Kara, <jack@suse.cz>, 10/2002
 *
 *                Added journalled quota support, fix lock inversion problems
 *                Jan Kara, <jack@suse.cz>, 2003,2004
 *
 * (C) Copyright 1994 - 1997 Marco van Wieringen
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/tty.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/kmod.h>
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */

#include <linux/uaccess.h>

/*
 * There are five quota SMP locks:
 * * dq_list_lock protects all lists with quotas and quota formats.
 * * dquot->dq_dqb_lock protects data from dq_dqb
 * * inode->i_lock protects inode->i_blocks, i_bytes and also guards
 *   consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that
 *   dquot_transfer() can stabilize amount it transfers
 * * dq_data_lock protects mem_dqinfo structures and modifications of dquot
 *   pointers in the inode
 * * dq_state_lock protects modifications of quota state (on quotaon and
 *   quotaoff) and readers who care about latest values take it as well.
 *
 * The spinlock ordering is hence:
 *   dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
 *   dq_list_lock > dq_state_lock
 *
 * Note that some things (e.g. sb pointer, type, id) don't change during
 * the life of the dquot structure and so needn't be protected by a lock.
 *
 * Operation accessing dquots via inode pointers are protected by dquot_srcu.
 * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
 * inode and before dropping dquot references to avoid use of dquots after
 * they are freed. dq_data_lock is used to serialize the pointer setting and
 * clearing operations.
 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
 * inode is a quota file). Functions adding pointers from inode to dquots have
 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
 * have to do all pointer modifications before dropping dq_data_lock. This makes
 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
 * then drops all pointers to dquots from an inode.
 *
 * Each dquot has its dq_lock mutex.  Dquot is locked when it is being read to
 * memory (or space for it is being allocated) on the first dqget(), when it is
 * being written out, and when it is being released on the last dqput(). The
 * allocation and release operations are serialized by the dq_lock and by
 * checking the use count in dquot_release().
 *
 * Lock ordering (including related VFS locks) is the following:
 *   s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
 */

/* Protects all lists with quotas and quota formats. */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
/* Protects modifications of quota state (quotaon / quotaoff). */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
/*
 * Protects mem_dqinfo structures and modifications of dquot pointers in the
 * inode. Not static: it is exported below.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
/* SRCU domain guarding accesses to dquots made through inode pointers. */
DEFINE_STATIC_SRCU(dquot_srcu);

/*
 * Wait queue used by invalidate_dquots() to sleep until it holds the only
 * remaining reference to a dquot.
 */
static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);

/*
 * Print a rate-limited KERN_ERR message about a quota problem on @sb.
 *
 * @sb:   superblock whose s_id is printed as the device name
 * @func: name of the reporting function, printed verbatim
 * @fmt:  printf-style format; it and the variadic arguments are passed to
 *        printk through a struct va_format and the %pV specifier
 *
 * Nothing is printed when printk_ratelimit() refuses.
 */
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...)
{
        struct va_format vaf;
        va_list ap;

        if (!printk_ratelimit())
                return;

        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;
        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
               sb->s_id, func, &vaf);
        va_end(ap);
}
EXPORT_SYMBOL(__quota_error);

/* Quota type names; only built for quota debugging or printed warnings. */
#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
static char *quotatypes[] = INITQFNAMES;
#endif
static struct quota_format_type *quota_formats;        /* List of registered formats */
/*
 * Format id -> module name table; find_quota_format() scans it until an
 * entry with a zero qm_fmt_id is reached.
 */
static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;

/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;

/*
 * Register a quota format by pushing @fmt onto the head of the
 * quota_formats list while holding dq_list_lock.
 *
 * Always returns 0.
 */
int register_quota_format(struct quota_format_type *fmt)
{
        spin_lock(&dq_list_lock);
        fmt->qf_next = quota_formats;
        quota_formats = fmt;
        spin_unlock(&dq_list_lock);

        return 0;
}
EXPORT_SYMBOL(register_quota_format);

/*
 * Unlink @fmt from the quota_formats list while holding dq_list_lock.
 * A format that is not on the list is left alone.
 */
void unregister_quota_format(struct quota_format_type *fmt)
{
        struct quota_format_type **link = &quota_formats;

        spin_lock(&dq_list_lock);
        /* Walk the chain of "next" pointers until one refers to @fmt */
        while (*link && *link != fmt)
                link = &(*link)->qf_next;
        if (*link)
                *link = (*link)->qf_next;
        spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(unregister_quota_format);

/* Return the registered format with id @id, or NULL. Needs dq_list_lock. */
static struct quota_format_type *lookup_quota_format(int id)
{
        struct quota_format_type *fmt = quota_formats;

        while (fmt && fmt->qf_fmt_id != id)
                fmt = fmt->qf_next;
        return fmt;
}

/*
 * Find the quota format with id @id and take a reference on its owning
 * module.
 *
 * If the format is not registered, or the module reference cannot be taken,
 * the matching module from module_names[] is requested and the lookup is
 * repeated once. Returns the format with a module reference held, or NULL.
 * The reference is dropped with put_quota_format().
 */
static struct quota_format_type *find_quota_format(int id)
{
        struct quota_format_type *actqf;
        int qm;

        spin_lock(&dq_list_lock);
        actqf = lookup_quota_format(id);
        if (actqf && try_module_get(actqf->qf_owner))
                goto out_unlock;

        /* request_module() must not be called under the spinlock */
        spin_unlock(&dq_list_lock);

        qm = 0;
        while (module_names[qm].qm_fmt_id &&
               module_names[qm].qm_fmt_id != id)
                qm++;
        if (!module_names[qm].qm_fmt_id ||
            request_module(module_names[qm].qm_mod_name))
                return NULL;

        spin_lock(&dq_list_lock);
        actqf = lookup_quota_format(id);
        if (actqf && !try_module_get(actqf->qf_owner))
                actqf = NULL;
out_unlock:
        spin_unlock(&dq_list_lock);
        return actqf;
}

/* Drop the module reference on the owner of quota format @fmt. */
static void put_quota_format(struct quota_format_type *fmt)
{
        struct module *owner = fmt->qf_owner;

        module_put(owner);
}

/*
 * Dquot List Management:
 * The quota code uses five lists for dquot management: the inuse_list,
 * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
 * A single dquot structure may be on some of those lists, depending on
 * its current state.
 *
 * All dquots are placed at the end of inuse_list when first created, and this
 * list is used for the invalidate operation, which must look at every dquot.
 *
 * When the last reference of a dquot is dropped, the dquot is added to
 * releasing_dquots. We'll then queue work item which will call
 * synchronize_srcu() and after that perform the final cleanup of all the
 * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
 * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
 * struct.
 *
 * Unused and cleaned up dquots are in the free_dquots list and this list is
 * searched whenever we need an available dquot. Dquots are removed from the
 * list as soon as they are used again and dqstats.free_dquots gives the number
 * of dquots on the list. When dquot is invalidated it's completely released
 * from memory.
 *
 * Dirty dquots are added to the dqi_dirty_list of quota_info when mark
 * dirtied, and this list is searched when writing dirty dquots back to
 * quota file. Note that some filesystems do dirty dquot tracking on their
 * own (e.g. in a journal) and thus don't use dqi_dirty_list.
 *
 * Dquots with a specific identity (device, type and id) are placed on
 * one of the dquot_hash[] hash chains. This provides an efficient search
 * mechanism to locate a specific dquot.
 */

/* Every dquot, appended when first created; linked through dq_inuse. */
static LIST_HEAD(inuse_list);
/* Unused, fully cleaned-up dquots; linked through dq_free. */
static LIST_HEAD(free_dquots);
/* Dquots whose last reference was dropped, awaiting cleanup; uses dq_free. */
static LIST_HEAD(releasing_dquots);
/* Shift and mask applied by hashfn() to pick a dquot_hash[] bucket. */
static unsigned int dq_hash_bits, dq_hash_mask;
/* Hash table of dquots keyed by (superblock, qid). */
static struct hlist_head *dquot_hash;

/* Quota statistics counters; exported. */
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);

/* Forward declarations of helpers defined later in this file. */
static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);

/* Delayed work that performs the final cleanup of releasing_dquots. */
static void quota_release_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);

/*
 * Compute the dquot_hash[] bucket index for the dquot identified by
 * (@sb, @qid).
 *
 * The superblock address (shifted right by L1_CACHE_SHIFT) is XORed with the
 * numeric id as seen from init_user_ns, scaled by (MAXQUOTAS - type), folded
 * with dq_hash_bits and masked with dq_hash_mask.
 */
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
        unsigned int id = from_kqid(&init_user_ns, qid);
        int type = qid.type;
        unsigned long mix;

        mix = ((unsigned long)sb >> L1_CACHE_SHIFT) ^ id;
        mix *= (MAXQUOTAS - type);
        mix += mix >> dq_hash_bits;
        return mix & dq_hash_mask;
}

/*
 * Following list functions expect dq_list_lock to be held
 */
/* Add @dquot to the head of its hash chain, chosen by hashfn(). */
static inline void insert_dquot_hash(struct dquot *dquot)
{
        unsigned int bucket = hashfn(dquot->dq_sb, dquot->dq_id);

        hlist_add_head(&dquot->dq_hash, dquot_hash + bucket);
}

/* Unhash @dquot and reinitialize its dq_hash node. */
static inline void remove_dquot_hash(struct dquot *dquot)
{
        struct hlist_node *node = &dquot->dq_hash;

        hlist_del_init(node);
}

static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
                                struct kqid qid)
{
        struct hlist_node *node;
        struct dquot *dquot;

        hlist_for_each (node, dquot_hash+hashent) {
                dquot = hlist_entry(node, struct dquot, dq_hash);
                if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
                        return dquot;
        }
        return NULL;
}

/*
 * Append @dquot to the tail of free_dquots (via dq_free) and account for it
 * in the DQST_FREE_DQUOTS statistic.
 */
static inline void put_dquot_last(struct dquot *dquot)
{
        struct list_head *entry = &dquot->dq_free;

        list_add_tail(entry, &free_dquots);
        dqstats_inc(DQST_FREE_DQUOTS);
}

/*
 * Queue @dquot at the tail of releasing_dquots (via dq_free) and flag it
 * with DQ_RELEASING_B.
 */
static inline void put_releasing_dquots(struct dquot *dquot)
{
        struct list_head *entry = &dquot->dq_free;

        list_add_tail(entry, &releasing_dquots);
        set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

/*
 * Take @dquot off whichever list it is on through dq_free; do nothing if it
 * is on neither.
 *
 * A dquot flagged DQ_RELEASING_B was on releasing_dquots, so only the flag
 * is cleared. Otherwise it was on free_dquots and DQST_FREE_DQUOTS is
 * decremented.
 */
static inline void remove_free_dquot(struct dquot *dquot)
{
        struct list_head *entry = &dquot->dq_free;

        if (list_empty(entry))
                return;

        list_del_init(entry);
        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags))
                clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
        else
                dqstats_dec(DQST_FREE_DQUOTS);
}

/*
 * Append @dquot to inuse_list and bump DQST_ALLOC_DQUOTS.
 *
 * New entries go to the back so that a traversal of this list which blocks
 * does not have to restart.
 */
static inline void put_inuse(struct dquot *dquot)
{
        struct list_head *entry = &dquot->dq_inuse;

        list_add_tail(entry, &inuse_list);
        dqstats_inc(DQST_ALLOC_DQUOTS);
}

/* Decrement DQST_ALLOC_DQUOTS and unlink @dquot from inuse_list. */
static inline void remove_inuse(struct dquot *dquot)
{
        struct list_head *entry = &dquot->dq_inuse;

        dqstats_dec(DQST_ALLOC_DQUOTS);
        list_del(entry);
}
/*
 * End of list functions needing dq_list_lock
 */

/*
 * Wait for whoever currently holds dquot->dq_lock to finish: the mutex is
 * taken and released again straight away.
 */
static void wait_on_dquot(struct dquot *dquot)
{
        struct mutex *lock = &dquot->dq_lock;

        mutex_lock(lock);
        mutex_unlock(lock);
}

/* Test whether DQ_ACTIVE_B is set in @dquot's flags. */
static inline int dquot_active(struct dquot *dquot)
{
        int active = test_bit(DQ_ACTIVE_B, &dquot->dq_flags);

        return active;
}

/* Test whether DQ_MOD_B (the dirty flag) is set in @dquot's flags. */
static inline int dquot_dirty(struct dquot *dquot)
{
        int dirty = test_bit(DQ_MOD_B, &dquot->dq_flags);

        return dirty;
}

static inline int mark_dquot_dirty(struct dquot *dquot)
{
        return dquot->dq_sb->dq_op->mark_dirty(dquot);
}

/*
 * Atomically mark @dquot dirty and return its old dirty flag state.
 *
 * An inactive dquot is never marked and 0 is returned. With
 * DQUOT_NOLIST_DIRTY set only the DQ_MOD_B bit is touched. Otherwise a dquot
 * that becomes dirty is also added to the dqi_dirty_list of its quota type,
 * under dq_list_lock.
 */
int dquot_mark_dquot_dirty(struct dquot *dquot)
{
        int was_dirty;

        if (!dquot_active(dquot))
                return 0;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);

        /* Fast path: an already dirty dquot needs no dq_list_lock */
        if (dquot_dirty(dquot))
                return 1;

        spin_lock(&dq_list_lock);
        was_dirty = test_and_set_bit(DQ_MOD_B, &dquot->dq_flags) ? 1 : 0;
        if (!was_dirty)
                list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
                                info[dquot->dq_id.type].dqi_dirty_list);
        spin_unlock(&dq_list_lock);

        return was_dirty;
}
EXPORT_SYMBOL(dquot_mark_dquot_dirty);

/*
 * Mark every non-NULL dquot in the MAXQUOTAS-sized array @dquots dirty.
 * This can block when journalling.
 *
 * Each pointer is read with srcu_dereference() against dquot_srcu. All
 * entries are processed even after a failure; the return value is the first
 * non-zero result of mark_dquot_dirty(), or 0 if there was none.
 */
static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
        int first = 0;
        int cnt;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct dquot *dquot;
                int ret;

                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                /* Keep going on error, but remember the first result */
                ret = mark_dquot_dirty(dquot);
                if (!first)
                        first = ret;
        }
        return first;
}

/* Call dqput() on each of the MAXQUOTAS entries of the array @dquot. */
static inline void dqput_all(struct dquot **dquot)
{
        unsigned int cnt = 0;

        while (cnt < MAXQUOTAS) {
                dqput(dquot[cnt]);
                cnt++;
        }
}

/*
 * Clear the dirty flag of @dquot and return whether it was set.
 *
 * With DQUOT_NOLIST_DIRTY only the DQ_MOD_B bit is cleared. Otherwise the
 * bit is cleared under dq_list_lock and a previously dirty dquot is also
 * unlinked from its dirty list.
 */
static inline int clear_dquot_dirty(struct dquot *dquot)
{
        int was_dirty;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags);

        spin_lock(&dq_list_lock);
        was_dirty = test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags) ? 1 : 0;
        if (was_dirty)
                list_del_init(&dquot->dq_dirty);
        spin_unlock(&dq_list_lock);

        return was_dirty;
}

void mark_info_dirty(struct super_block *sb, int type)
{
        spin_lock(&dq_data_lock);
        sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY;
        spin_unlock(&dq_data_lock);
}
EXPORT_SYMBOL(mark_info_dirty);

/*
 * Read dquot from disk and allocate space for it.
 *
 * Runs under dquot->dq_lock, between memalloc_nofs_save() and
 * memalloc_nofs_restore(). Steps:
 *  - unless DQ_READ_B is already set, read the dquot with the format's
 *    ->read_dqblk() and then set DQ_READ_B;
 *  - if the dquot is not active and dq_off is zero, instantiate it with
 *    ->commit_dqblk() and, when the format info is dirty, write that too;
 *  - on success set DQ_ACTIVE_B.
 *
 * Returns the negative result of the failing step (a ->commit_dqblk()
 * failure takes precedence over a ->write_file_info() failure); otherwise
 * the value left in ret by the last ->read_dqblk()/->commit_dqblk() call,
 * or 0 if neither ran.
 */

int dquot_acquire(struct dquot *dquot)
{
        int ret = 0, ret2 = 0;
        unsigned int memalloc;
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
                ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
                if (ret < 0)
                        goto out_iolock;
        }
        /* Make sure flags update is visible after dquot has been filled */
        smp_mb__before_atomic();
        set_bit(DQ_READ_B, &dquot->dq_flags);
        /* Instantiate dquot if needed */
        if (!dquot_active(dquot) && !dquot->dq_off) {
                ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
                /* Write the info if needed, even when the commit failed */
                if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
                        ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                        dquot->dq_sb, dquot->dq_id.type);
                }
                if (ret < 0)
                        goto out_iolock;
                if (ret2 < 0) {
                        ret = ret2;
                        goto out_iolock;
                }
        }
        /*
         * Make sure flags update is visible after on-disk struct has been
         * allocated. Paired with smp_rmb() in dqget().
         */
        smp_mb__before_atomic();
        set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
        /* Common exit: leave the NOFS section and drop the dquot mutex */
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_acquire);

/*
 * Write dquot to disk.
 *
 * Runs under dquot->dq_lock, between memalloc_nofs_save() and
 * memalloc_nofs_restore(). A dquot that was not dirty is left alone and 0 is
 * returned. A dirty but inactive dquot yields -EIO. Otherwise the result of
 * the format's ->commit_dqblk() is returned.
 */
int dquot_commit(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int memalloc;
        int ret = 0;

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        if (clear_dquot_dirty(dquot)) {
                /*
                 * A dquot can be inactive here only if there was an error
                 * during read/init, so it is better not to write it.
                 */
                if (!dquot_active(dquot))
                        ret = -EIO;
                else
                        ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
        }
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_commit);

/*
 * Release dquot.
 *
 * Runs under dquot->dq_lock, between memalloc_nofs_save() and
 * memalloc_nofs_restore(). A dquot that dquot_is_busy() reports as busy is
 * left untouched and 0 is returned. Otherwise the format's optional
 * ->release_dqblk() is called, dirty format info is written, and
 * DQ_ACTIVE_B is cleared.
 *
 * Returns the ->release_dqblk() result if negative, else the result of the
 * info write (0 when no write was needed or no release op exists).
 */
int dquot_release(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int memalloc;
        int ret = 0, ret2 = 0;

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        /* A busy dquot means we are racing with some other dqget() */
        if (!dquot_is_busy(dquot)) {
                if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
                        ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
                        /* Write the info */
                        if (info_dirty(&dqopt->info[dquot->dq_id.type]))
                                ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                                        dquot->dq_sb, dquot->dq_id.type);
                        if (ret >= 0)
                                ret = ret2;
                }
                clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
        }
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_release);

/* Free @dquot back to the dquot_cachep slab cache. */
void dquot_destroy(struct dquot *dquot)
{
        struct kmem_cache *cache = dquot_cachep;

        kmem_cache_free(cache, dquot);
}
EXPORT_SYMBOL(dquot_destroy);

static inline void do_destroy_dquot(struct dquot *dquot)
{
        dquot->dq_sb->dq_op->destroy_dquot(dquot);
}

/*
 * Invalidate all dquots of quota type @type on superblock @sb.
 *
 * Note that this function is called after quota is disabled and pointers
 * from inodes removed so there cannot be new quota users. There can still be
 * some users of quotas due to inodes being just deleted or pruned by
 * prune_icache() (those are not attached to any list) or parallel quotactl
 * call. We have to wait for such users.
 *
 * Each matching dquot on inuse_list is unhashed, taken off the free list and
 * the inuse list, and destroyed. The scan restarts from the beginning (after
 * flushing quota_release_work) whenever dq_list_lock had to be dropped.
 */
static void invalidate_dquots(struct super_block *sb, int type)
{
        struct dquot *dquot, *tmp;

restart:
        /* Let pending release work finish before (re)scanning the list */
        flush_delayed_work(&quota_release_work);

        spin_lock(&dq_list_lock);
        list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
                if (dquot->dq_sb != sb)
                        continue;
                if (dquot->dq_id.type != type)
                        continue;
                /* Wait for dquot users */
                if (atomic_read(&dquot->dq_count)) {
                        /* Take our own reference so we can sleep on it */
                        atomic_inc(&dquot->dq_count);
                        spin_unlock(&dq_list_lock);
                        /*
                         * Once dqput() wakes us up, we know it's time to free
                         * the dquot.
                         * IMPORTANT: we rely on the fact that there is always
                         * at most one process waiting for dquot to free.
                         * Otherwise dq_count would be > 1 and we would never
                         * wake up.
                         */
                        wait_event(dquot_ref_wq,
                                   atomic_read(&dquot->dq_count) == 1);
                        dqput(dquot);
                        /*
                         * At this moment the dquot need not exist (it could
                         * be reclaimed by prune_dqcache()). Hence we must
                         * restart.
                         */
                        goto restart;
                }
                /*
                 * The last user already dropped its reference but dquot didn't
                 * get fully cleaned up yet. Restart the scan which flushes the
                 * work cleaning up released dquots.
                 */
                if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                        spin_unlock(&dq_list_lock);
                        goto restart;
                }
                /*
                 * Quota now has no users and it has been written on last
                 * dqput()
                 */
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
                do_destroy_dquot(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Call callback for every active dquot on given filesystem.
 *
 * @sb:   superblock whose dquots are visited; a warning is issued once if
 *        its s_umount is not locked
 * @fn:   callback, invoked without dq_list_lock held; a negative return
 *        stops the scan
 * @priv: opaque value passed through to @fn
 *
 * Returns the negative value from @fn that stopped the scan, otherwise the
 * result of the last @fn call (0 if @fn was never called).
 */
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv)
{
        struct dquot *dquot, *old_dquot = NULL;
        int ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        spin_lock(&dq_list_lock);
        list_for_each_entry(dquot, &inuse_list, dq_inuse) {
                if (!dquot_active(dquot))
                        continue;
                if (dquot->dq_sb != sb)
                        continue;
                /* Now we have active dquot so we can just increase use count */
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                /*
                 * Drop the previous dquot only now; the reference on the
                 * current one is kept until the next iteration (or exit).
                 */
                dqput(old_dquot);
                old_dquot = dquot;
                /*
                 * ->release_dquot() can be racing with us. Our reference
                 * protects us from new calls to it so just wait for any
                 * outstanding call and recheck the DQ_ACTIVE_B after that.
                 */
                wait_on_dquot(dquot);
                if (dquot_active(dquot)) {
                        ret = fn(dquot, priv);
                        /* dq_list_lock is not held on this exit path */
                        if (ret < 0)
                                goto out;
                }
                spin_lock(&dq_list_lock);
                /* We are safe to continue now because our dquot could not
                 * be moved out of the inuse list while we hold the reference */
        }
        spin_unlock(&dq_list_lock);
out:
        /* Release the reference on the last dquot visited, if any */
        dqput(old_dquot);
        return ret;
}
EXPORT_SYMBOL(dquot_scan_active);

/*
 * Write @dquot through the ->write_dquot() operation of its superblock.
 *
 * On failure the error is reported with quota_error() and the dirty bit is
 * cleared anyway, to avoid an infinite loop. Returns the ->write_dquot()
 * result.
 */
static inline int dquot_write_dquot(struct dquot *dquot)
{
        int ret;

        ret = dquot->dq_sb->dq_op->write_dquot(dquot);
        if (ret >= 0)
                return ret;

        quota_error(dquot->dq_sb, "Can't write quota structure "
                    "(error %d). Quota may get out of sync!", ret);
        clear_dquot_dirty(dquot);
        return ret;
}

/*
 * Write all dquot structures to quota files.
 *
 * @sb:   superblock to write back; a warning is issued once if its s_umount
 *        is not locked
 * @type: quota type to write, or -1 for every type
 *
 * For each selected type with active quota, the dirty list is detached and
 * its dquots are written one at a time with dq_list_lock dropped around the
 * write. Afterwards dirty quota info is written for the selected types.
 *
 * Returns the first non-zero result of writing a dquot, or 0. The result of
 * ->write_info() is not reflected in the return value.
 */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
        struct list_head dirty;
        struct dquot *dquot;
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int err, ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        flush_delayed_work(&quota_release_work);

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                /* Move list away to avoid livelock. */
                list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
                while (!list_empty(&dirty)) {
                        dquot = list_first_entry(&dirty, struct dquot,
                                                 dq_dirty);

                        WARN_ON(!dquot_active(dquot));
                        /*
                         * If the dquot is releasing we should not touch it:
                         * flush the release work and look at the list again.
                         */
                        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                                spin_unlock(&dq_list_lock);
                                flush_delayed_work(&quota_release_work);
                                spin_lock(&dq_list_lock);
                                continue;
                        }

                        /*
                         * Now we have an active dquot to which someone is
                         * holding a reference, so we can safely just increase
                         * the use count.
                         */
                        dqgrab(dquot);
                        spin_unlock(&dq_list_lock);
                        err = dquot_write_dquot(dquot);
                        /* Remember only the first failure, keep writing */
                        if (err && !ret)
                                ret = err;
                        dqput(dquot);
                        spin_lock(&dq_list_lock);
                }
                spin_unlock(&dq_list_lock);
        }

        /* Write dirty quota info for every selected, active quota type */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
                    && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        dqstats_inc(DQST_SYNCS);

        return ret;
}
EXPORT_SYMBOL(dquot_writeback_dquots);

/*
 * Write all dquot structures to disk and make them visible from userspace.
 *
 * @sb:   superblock to sync
 * @type: quota type to sync, or -1 for every type
 *
 * After dquot_writeback_dquots() succeeds, and unless the quota files are
 * flagged DQUOT_QUOTA_SYS_FILE, the filesystem and its block device are
 * synced and the page cache of each selected, active quota file is
 * truncated under the inode lock.
 *
 * Returns 0 on success or the first non-zero result from the writeback,
 * ->sync_fs() or sync_blockdev().
 */
int dquot_quota_sync(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int ret;

        ret = dquot_writeback_dquots(sb, type);
        if (ret || (dqopt->flags & DQUOT_QUOTA_SYS_FILE))
                return ret;

        /*
         * This is not very clever (or fast), but there is currently no other
         * simple way of getting quota data to disk, and it must get there to
         * be visible to userspace.
         */
        if (sb->s_op->sync_fs)
                ret = sb->s_op->sync_fs(sb, 1);
        if (!ret)
                ret = sync_blockdev(sb->s_bdev);
        if (ret)
                return ret;

        /*
         * Now that everything is written, discard the pagecache so that
         * userspace sees the changes.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if ((type == -1 || cnt == type) &&
                    sb_has_quota_active(sb, cnt)) {
                        inode_lock(dqopt->files[cnt]);
                        truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                        inode_unlock(dqopt->files[cnt]);
                }
        }

        return 0;
}
EXPORT_SYMBOL(dquot_quota_sync);

/*
 * Shrinker scan callback: destroy dquots taken from the head of
 * free_dquots, one per remaining unit of sc->nr_to_scan, all under
 * dq_list_lock.
 *
 * Returns the number of dquots destroyed. sc->nr_to_scan is decremented
 * once for each of them.
 */
static unsigned long
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_freed;

        spin_lock(&dq_list_lock);
        for (nr_freed = 0;
             !list_empty(&free_dquots) && sc->nr_to_scan;
             nr_freed++, sc->nr_to_scan--) {
                struct dquot *victim;

                victim = list_first_entry(&free_dquots, struct dquot, dq_free);
                /* Unlink from every list before the dquot is destroyed */
                remove_dquot_hash(victim);
                remove_free_dquot(victim);
                remove_inuse(victim);
                do_destroy_dquot(victim);
        }
        spin_unlock(&dq_list_lock);

        return nr_freed;
}

/*
 * Shrinker count callback: report the DQST_FREE_DQUOTS statistic, passed
 * through vfs_pressure_ratio().
 */
static unsigned long
dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_free;

        nr_free = percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]);

        return vfs_pressure_ratio(nr_free);
}

/*
 * Shrinker descriptor for the dquot cache: dqcache_shrink_count() supplies
 * the object count and dqcache_shrink_scan() does the actual freeing.
 */
static struct shrinker dqcache_shrinker = {
        .count_objects = dqcache_shrink_count,
        .scan_objects = dqcache_shrink_scan,
        .seeks = DEFAULT_SEEKS,
};

/*
 * Safely release dquot and put reference to dquot.
 *
 * Work function: takes over everything currently queued on
 * releasing_dquots, waits in synchronize_srcu(&dquot_srcu), and then
 * handles the entries one at a time. Dirty dquots are written out, active
 * ones are handed to ->release_dquot(), and clean inactive ones are moved
 * to the free list. @work is not used.
 */
static void quota_release_workfn(struct work_struct *work)
{
        struct dquot *dquot;
        struct list_head rls_head;

        spin_lock(&dq_list_lock);
        /* Exchange the list head to avoid livelock. */
        list_replace_init(&releasing_dquots, &rls_head);
        spin_unlock(&dq_list_lock);
        synchronize_srcu(&dquot_srcu);

restart:
        spin_lock(&dq_list_lock);
        while (!list_empty(&rls_head)) {
                dquot = list_first_entry(&rls_head, struct dquot, dq_free);
                /* A dquot on this list is expected to have a zero dq_count */
                WARN_ON_ONCE(atomic_read(&dquot->dq_count));
                /*
                 * Note that DQ_RELEASING_B protects us from racing with
                 * invalidate_dquots() calls so we are safe to work with the
                 * dquot even after we drop dq_list_lock.
                 */
                if (dquot_dirty(dquot)) {
                        spin_unlock(&dq_list_lock);
                        /* Commit dquot before releasing */
                        dquot_write_dquot(dquot);
                        /* The lock was dropped: retake it and re-check the head */
                        goto restart;
                }
                if (dquot_active(dquot)) {
                        spin_unlock(&dq_list_lock);
                        dquot->dq_sb->dq_op->release_dquot(dquot);
                        /* Same entry is looked at again after the release */
                        goto restart;
                }
                /* Dquot is inactive and clean, now move it to free list */
                remove_free_dquot(dquot);
                put_dquot_last(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Put reference to dquot
 *
 * A NULL @dquot is ignored. If other references remain, the count is
 * simply decremented (waking dquot_ref_wq when quota of this type is no
 * longer active and exactly one reference is left). Dropping the last
 * reference puts the dquot on the releasing list and schedules
 * quota_release_work to process it.
 */
void dqput(struct dquot *dquot)
{
        if (!dquot)
                return;
#ifdef CONFIG_QUOTA_DEBUG
        /* Debug builds: dropping a reference that is not held is fatal */
        if (!atomic_read(&dquot->dq_count)) {
                quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
                            quotatypes[dquot->dq_id.type],
                            from_kqid(&init_user_ns, dquot->dq_id));
                BUG();
        }
#endif
        dqstats_inc(DQST_DROPS);

        spin_lock(&dq_list_lock);
        if (atomic_read(&dquot->dq_count) > 1) {
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /* Releasing dquot during quotaoff phase? */
                if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot_ref_wq);
                spin_unlock(&dq_list_lock);
                return;
        }

        /* Need to release dquot? */
#ifdef CONFIG_QUOTA_DEBUG
        /* sanity check */
        BUG_ON(!list_empty(&dquot->dq_free));
#endif
        /* Queue on the releasing list first, then drop the last reference */
        put_releasing_dquots(dquot);
        atomic_dec(&dquot->dq_count);
        spin_unlock(&dq_list_lock);
        /* The rest of the release is done by quota_release_work */
        queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);

/*
 * Allocate a zeroed dquot from dquot_cachep using GFP_NOFS.
 *
 * @sb and @type are not used by this implementation.
 * Returns the result of kmem_cache_zalloc() unchanged.
 */
struct dquot *dquot_alloc(struct super_block *sb, int type)
{
        struct dquot *dquot;

        dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);

        return dquot;
}
EXPORT_SYMBOL(dquot_alloc);

/*
 * Obtain a dquot from the filesystem's ->alloc_dquot() hook and initialize
 * its locks, list/hash linkage, owning superblock and an invalid id of
 * @type.
 *
 * Returns the dquot with dq_count set to 1, or NULL if the hook returned
 * NULL.
 */
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
        struct dquot *fresh = sb->dq_op->alloc_dquot(sb, type);

        if (fresh == NULL)
                return NULL;

        /* Identity and initial reference */
        fresh->dq_sb = sb;
        fresh->dq_id = make_kqid_invalid(type);
        atomic_set(&fresh->dq_count, 1);

        /* Locks */
        mutex_init(&fresh->dq_lock);
        spin_lock_init(&fresh->dq_dqb_lock);

        /* List and hash linkage */
        INIT_LIST_HEAD(&fresh->dq_free);
        INIT_LIST_HEAD(&fresh->dq_inuse);
        INIT_HLIST_NODE(&fresh->dq_hash);
        INIT_LIST_HEAD(&fresh->dq_dirty);

        return fresh;
}

/*
 * Get reference to dquot
 *
 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
 * destroying our dquot by:
 *   a) checking for quota flags under dq_list_lock and
 *   b) getting a reference to dquot before we release dq_list_lock
 *
 * Returns the dquot for @qid on @sb with a reference held, or:
 *   ERR_PTR(-EINVAL) if @qid has no mapping in sb->s_user_ns,
 *   ERR_PTR(-ESRCH)  if quota of type qid.type is not active on @sb,
 *   ERR_PTR(err)     if ->acquire_dquot() fails with a negative error.
 */
struct dquot *dqget(struct super_block *sb, struct kqid qid)
{
        unsigned int hashent = hashfn(sb, qid);
        struct dquot *dquot, *empty = NULL;

        if (!qid_has_mapping(sb->s_user_ns, qid))
                return ERR_PTR(-EINVAL);

        /* Cheap check without locks; repeated under the locks below */
        if (!sb_has_quota_active(sb, qid.type))
                return ERR_PTR(-ESRCH);
we_slept:
        spin_lock(&dq_list_lock);
        spin_lock(&dq_state_lock);
        if (!sb_has_quota_active(sb, qid.type)) {
                spin_unlock(&dq_state_lock);
                spin_unlock(&dq_list_lock);
                dquot = ERR_PTR(-ESRCH);
                goto out;
        }
        spin_unlock(&dq_state_lock);

        dquot = find_dquot(hashent, sb, qid);
        if (!dquot) {
                if (!empty) {
                        /*
                         * Not in the hash and nothing preallocated yet:
                         * drop the lock, allocate, then redo the lookup.
                         */
                        spin_unlock(&dq_list_lock);
                        empty = get_empty_dquot(sb, qid.type);
                        if (!empty)
                                schedule();        /* Try to wait for a moment... */
                        goto we_slept;
                }
                /* Use the preallocated dquot; ownership moves to 'dquot' */
                dquot = empty;
                empty = NULL;
                dquot->dq_id = qid;
                /* all dquots go on the inuse_list */
                put_inuse(dquot);
                /* hash it first so it can be found */
                insert_dquot_hash(dquot);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_LOOKUPS);
        } else {
                /* Found in the hash: an unreferenced dquot leaves the free list */
                if (!atomic_read(&dquot->dq_count))
                        remove_free_dquot(dquot);
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_CACHE_HITS);
                dqstats_inc(DQST_LOOKUPS);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is
         * already finished or it will be canceled due to dq_count > 0 test */
        wait_on_dquot(dquot);
        /* Read the dquot / allocate space in quota file */
        if (!dquot_active(dquot)) {
                int err;

                err = sb->dq_op->acquire_dquot(dquot);
                if (err < 0) {
                        /* Give back the reference taken above */
                        dqput(dquot);
                        dquot = ERR_PTR(err);
                        goto out;
                }
        }
        /*
         * Make sure following reads see filled structure - paired with
         * smp_mb__before_atomic() in dquot_acquire().
         */
        smp_rmb();
        /* Has somebody invalidated entry under us? */
        WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
out:
        /* A preallocated dquot that was not consumed is destroyed here */
        if (empty)
                do_destroy_dquot(empty);

        return dquot;
}
EXPORT_SYMBOL(dqget);

/*
 * Return the inode's array of dquot pointers as supplied by the
 * filesystem's ->get_dquots() operation.
 */
static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        /* Force __rcu for now until filesystems are fixed */
        return (struct dquot __rcu **)sb->s_op->get_dquots(inode);
}

/*
 * Tell whether @inode still lacks a dquot pointer.
 *
 * With @type != -1 only that slot is examined; with -1 any empty slot
 * counts. Inodes marked IS_NOQUOTA never need initialization.
 *
 * Returns nonzero if initialization is needed, 0 otherwise.
 */
static int dqinit_needed(struct inode *inode, int type)
{
        struct dquot __rcu * const *dquots;
        int qtype;

        if (IS_NOQUOTA(inode))
                return 0;

        dquots = i_dquot(inode);
        if (type != -1)
                return !dquots[type];

        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if (dquots[qtype])
                        continue;
                return 1;
        }

        return 0;
}

/*
 * This routine is guarded by s_umount semaphore
 *
 * Walk all inodes of @sb and call __dquot_initialize() for @type on every
 * inode that is not being freed or created, has a nonzero i_writecount and
 * still needs its dquot pointers set up.
 *
 * Returns 0, or the first error returned by __dquot_initialize().
 */
static int add_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode, *old_inode = NULL;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif
        int err = 0;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
                    !atomic_read(&inode->i_writecount) ||
                    !dqinit_needed(inode, type)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Pin the inode so both locks can be dropped */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

#ifdef CONFIG_QUOTA_DEBUG
                if (unlikely(inode_get_rsv_space(inode) > 0))
                        reserved = 1;
#endif
                /* Drop the inode kept from the previous iteration, lock-free */
                iput(old_inode);
                err = __dquot_initialize(inode, type);
                if (err) {
                        iput(inode);
                        goto out;
                }

                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock. We cannot iput the inode now as we can be
                 * holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                old_inode = inode;
                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
        iput(old_inode);
out:
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                quota_error(sb, "Writes happened before quota was turned on "
                        "thus quota information is probably inconsistent. "
                        "Please run quotacheck(8)");
        }
#endif
        return err;
}

/*
 * For every inode on @sb's inode list that is not IS_NOQUOTA, clear the
 * dquot pointer of @type under dq_data_lock and drop the reference that
 * pointer held.
 */
static void remove_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 *  We have to scan also I_NEW inodes because they can already
                 *  have quota pointer initialized. Luckily, we need to touch
                 *  only quota pointers and these have separate locking
                 *  (dq_data_lock).
                 */
                spin_lock(&dq_data_lock);
                if (!IS_NOQUOTA(inode)) {
                        struct dquot __rcu **dquots = i_dquot(inode);
                        struct dquot *dquot = srcu_dereference_check(
                                dquots[type], &dquot_srcu,
                                lockdep_is_held(&dq_data_lock));

#ifdef CONFIG_QUOTA_DEBUG
                        if (unlikely(inode_get_rsv_space(inode) > 0))
                                reserved = 1;
#endif
                        /* Clear the slot first, then drop the old reference */
                        rcu_assign_pointer(dquots[type], NULL);
                        if (dquot)
                                dqput(dquot);
                }
                spin_unlock(&dq_data_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
                        " was disabled thus quota information is probably "
                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
        }
#endif
}

/*
 * Gather all dquot references of @type held by inodes of @sb and drop
 * them. Does nothing when the superblock has no dq_op.
 */
static void drop_dquot_ref(struct super_block *sb, int type)
{
        if (!sb->dq_op)
                return;

        remove_dquot_ref(sb, type);
}

/*
 * Subtract @number from the dquot's reserved space.
 *
 * Releasing more than is reserved triggers a one-time warning and clamps
 * the reservation to zero. The block grace time is reset once current
 * plus reserved space is at or below the soft limit, and DQ_BLKS_B is
 * always cleared.
 */
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
        if (dquot->dq_dqb.dqb_rsvspace < number) {
                WARN_ON_ONCE(1);
                dquot->dq_dqb.dqb_rsvspace = 0;
        } else {
                dquot->dq_dqb.dqb_rsvspace -= number;
        }

        if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <=
            dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time64_t) 0;

        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * Decrease the dquot's inode usage by @number.
 *
 * Usage is clamped at zero unless DQUOT_NEGATIVE_USAGE is set for the
 * superblock. The inode grace time is reset once usage is at or below the
 * soft limit, and DQ_INODES_B is always cleared.
 */
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            dquot->dq_dqb.dqb_curinodes < number)
                dquot->dq_dqb.dqb_curinodes = 0;
        else
                dquot->dq_dqb.dqb_curinodes -= number;

        if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
                dquot->dq_dqb.dqb_itime = (time64_t) 0;

        clear_bit(DQ_INODES_B, &dquot->dq_flags);
}

/*
 * Decrease the dquot's current space usage by @number.
 *
 * Usage is clamped at zero unless DQUOT_NEGATIVE_USAGE is set for the
 * superblock. The block grace time is reset once current plus reserved
 * space is at or below the soft limit, and DQ_BLKS_B is always cleared.
 */
static void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            dquot->dq_dqb.dqb_curspace < number)
                dquot->dq_dqb.dqb_curspace = 0;
        else
                dquot->dq_dqb.dqb_curspace -= number;

        if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <=
            dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time64_t) 0;

        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * One pending quota warning. Filled in by prepare_warning() and consumed
 * by flush_warnings(), which skips entries whose w_type is QUOTA_NL_NOWARN.
 */
struct dquot_warn {
        struct super_block *w_sb;       /* dq_sb of the dquot that caused the warning */
        struct kqid w_dq_id;            /* dq_id of that dquot */
        short w_type;                   /* QUOTA_NL_* warning type */
};

/*
 * Check whether a warning of @warntype has already been issued for @dquot,
 * marking it as issued in the same step.
 *
 * Hard-limit and "soft limit exceeded too long" warnings are tracked per
 * resource (DQ_BLKS_B for blocks, DQ_INODES_B for inodes). Other types
 * are not tracked and always yield 0.
 *
 * Returns the previous value of the tracking bit.
 */
static int warning_issued(struct dquot *dquot, const int warntype)
{
        int flag;

        switch (warntype) {
        case QUOTA_NL_BHARDWARN:
        case QUOTA_NL_BSOFTLONGWARN:
                flag = DQ_BLKS_B;
                break;
        case QUOTA_NL_IHARDWARN:
        case QUOTA_NL_ISOFTLONGWARN:
                flag = DQ_INODES_B;
                break;
        default:
                flag = 0;
                break;
        }

        if (!flag)
                return 0;

        return test_and_set_bit(flag, &dquot->dq_flags);
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
static int flag_print_warnings = 1;

/*
 * Decide whether the warning should be printed for the current task.
 *
 * Nothing is printed when flag_print_warnings is clear. Otherwise a user
 * quota warning is printed if the current fsuid matches the quota id, a
 * group quota warning if the current task is in that group, and a project
 * quota warning always.
 */
static int need_print_warning(struct dquot_warn *warn)
{
        if (!flag_print_warnings)
                return 0;

        if (warn->w_dq_id.type == USRQUOTA)
                return uid_eq(current_fsuid(), warn->w_dq_id.uid);
        if (warn->w_dq_id.type == GRPQUOTA)
                return in_group_p(warn->w_dq_id.gid);
        if (warn->w_dq_id.type == PRJQUOTA)
                return 1;

        return 0;
}

/*
 * Print a quota warning on the current task's tty.
 *
 * The "back below limit" notification types are never printed, and the
 * rest only when need_print_warning() agrees and a tty is available. The
 * output is the filesystem id, a severity prefix, the quota type name and
 * a message specific to the warning type.
 */
static void print_warning(struct dquot_warn *warn)
{
        int warntype = warn->w_type;
        struct tty_struct *tty;
        char *msg = NULL;

        switch (warntype) {
        case QUOTA_NL_IHARDBELOW:
        case QUOTA_NL_ISOFTBELOW:
        case QUOTA_NL_BHARDBELOW:
        case QUOTA_NL_BSOFTBELOW:
                return;
        }
        if (!need_print_warning(warn))
                return;

        tty = get_current_tty();
        if (!tty)
                return;

        tty_write_message(tty, warn->w_sb->s_id);
        /* Plain soft-limit warnings are advisory; everything else failed */
        if (warntype != QUOTA_NL_ISOFTWARN && warntype != QUOTA_NL_BSOFTWARN)
                tty_write_message(tty, ": write failed, ");
        else
                tty_write_message(tty, ": warning, ");
        tty_write_message(tty, quotatypes[warn->w_dq_id.type]);

        if (warntype == QUOTA_NL_IHARDWARN)
                msg = " file limit reached.\r\n";
        else if (warntype == QUOTA_NL_ISOFTLONGWARN)
                msg = " file quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_ISOFTWARN)
                msg = " file quota exceeded.\r\n";
        else if (warntype == QUOTA_NL_BHARDWARN)
                msg = " block limit reached.\r\n";
        else if (warntype == QUOTA_NL_BSOFTLONGWARN)
                msg = " block quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_BSOFTWARN)
                msg = " block quota exceeded.\r\n";

        tty_write_message(tty, msg);
        tty_kref_put(tty);
}
#endif

/*
 * Record a warning of @warntype for @dquot in @warn, unless
 * warning_issued() reports that this warning was already issued.
 */
static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
                            int warntype)
{
        if (!warning_issued(dquot, warntype)) {
                warn->w_type = warntype;
                warn->w_sb = dquot->dq_sb;
                warn->w_dq_id = dquot->dq_id;
        }
}

/*
 * Write warnings to the console and send warning messages over netlink.
 *
 * Note that this function can call into tty and networking code.
 */
static void flush_warnings(struct dquot_warn *warn)
{
        int i;

        for (i = 0; i < MAXQUOTAS; i++) {
                if (warn[i].w_type == QUOTA_NL_NOWARN)
                        continue;
#ifdef CONFIG_PRINT_QUOTA_WARNING
                print_warning(&warn[i]);
#endif
                quota_send_warning(warn[i].w_dq_id,
                                   warn[i].w_sb->s_dev, warn[i].w_type);
        }
}

/*
 * Tell whether the current task may exceed the limits of @dquot.
 *
 * That requires CAP_SYS_RESOURCE and, for the QFMT_VFS_OLD format only,
 * additionally that DQF_ROOT_SQUASH is not set.
 *
 * Returns 1 if limits are to be ignored, 0 otherwise.
 */
static int ignore_hardlimit(struct dquot *dquot)
{
        struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (!capable(CAP_SYS_RESOURCE))
                return 0;
        if (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return 1;

        return !(info->dqi_flags & DQF_ROOT_SQUASH);
}

/*
 * Charge @inodes to @dquot, enforcing the inode limits.
 *
 * Limits are checked only when they are enabled for the dquot's type and
 * the dquot is not marked DQ_FAKE_B. A warning, if any, is recorded in
 * @warn. Everything runs under dq_dqb_lock.
 *
 * Returns 0 when usage was updated, -EDQUOT when the hard limit or an
 * expired soft limit refuses the charge (usage is then left unchanged).
 */
static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes,
                            struct dquot_warn *warn)
{
        qsize_t newinodes;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        newinodes = dquot->dq_dqb.dqb_curinodes + inodes;

        if (sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) &&
            !test_bit(DQ_FAKE_B, &dquot->dq_flags)) {
                /* Hard limit */
                if (dquot->dq_dqb.dqb_ihardlimit &&
                    newinodes > dquot->dq_dqb.dqb_ihardlimit &&
                    !ignore_hardlimit(dquot)) {
                        prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
                        ret = -EDQUOT;
                        goto out;
                }

                /* Soft limit */
                if (dquot->dq_dqb.dqb_isoftlimit &&
                    newinodes > dquot->dq_dqb.dqb_isoftlimit) {
                        if (dquot->dq_dqb.dqb_itime == 0) {
                                /* Newly over the soft limit: start grace time */
                                prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
                                dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() +
                                    sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
                        } else if (ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime &&
                                   !ignore_hardlimit(dquot)) {
                                /* Grace time has run out */
                                prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
                                ret = -EDQUOT;
                                goto out;
                        }
                }
        }

        dquot->dq_dqb.dqb_curinodes = newinodes;
out:
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Charge @space of used space and @rsv_space of reserved space to @dquot,
 * enforcing the block limits.
 *
 * Limits are checked only when they are enabled for the dquot's type and
 * the dquot is not marked DQ_FAKE_B. Warnings are recorded in @warn only
 * when DQUOT_SPACE_WARN is set in @flags. With DQUOT_SPACE_NOFAIL the
 * charge is applied even if a limit check failed.
 *
 * Returns 0 when the usage was updated, -EDQUOT otherwise.
 */
static int dquot_add_space(struct dquot *dquot, qsize_t space,
                           qsize_t rsv_space, unsigned int flags,
                           struct dquot_warn *warn)
{
        qsize_t tspace;
        struct super_block *sb = dquot->dq_sb;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                goto finish;

        /* Total the dquot would account for after this charge */
        tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
                + space + rsv_space;

        /* Hard limit exceeded */
        if (dquot->dq_dqb.dqb_bhardlimit &&
            tspace > dquot->dq_dqb.dqb_bhardlimit &&
            !ignore_hardlimit(dquot)) {
                if (flags & DQUOT_SPACE_WARN)
                        prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
                ret = -EDQUOT;
                goto finish;
        }

        /* Soft limit exceeded and the grace time has already expired */
        if (dquot->dq_dqb.dqb_bsoftlimit &&
            tspace > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime &&
            ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
            !ignore_hardlimit(dquot)) {
                if (flags & DQUOT_SPACE_WARN)
                        prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
                ret = -EDQUOT;
                goto finish;
        }

        /* Soft limit exceeded for the first time (no grace time set yet) */
        if (dquot->dq_dqb.dqb_bsoftlimit &&
            tspace > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime == 0) {
                if (flags & DQUOT_SPACE_WARN) {
                        prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
                        dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
                            sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
                } else {
                        /*
                         * We don't allow preallocation to exceed softlimit so exceeding will
                         * be always printed
                         */
                        ret = -EDQUOT;
                        goto finish;
                }
        }
finish:
        /*
         * We have to be careful and go through warning generation & grace time
         * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it
         * only here...
         */
        if (flags & DQUOT_SPACE_NOFAIL)
                ret = 0;
        if (!ret) {
                dquot->dq_dqb.dqb_rsvspace += rsv_space;
                dquot->dq_dqb.dqb_curspace += space;
        }
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Work out which "back below limit" notification, if any, freeing @inodes
 * from @dquot would produce. Usage itself is not modified.
 *
 * Returns QUOTA_NL_ISOFTBELOW when usage drops to or under the soft limit,
 * QUOTA_NL_IHARDBELOW when it drops from at/above to under the hard limit,
 * and QUOTA_NL_NOWARN otherwise (including fake dquots, usage already
 * within the soft limit, or limits not enabled).
 */
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
        qsize_t newinodes;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_NOWARN;
        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
                return QUOTA_NL_NOWARN;

        newinodes = dquot->dq_dqb.dqb_curinodes - inodes;

        if (newinodes <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_ISOFTBELOW;

        if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
            newinodes < dquot->dq_dqb.dqb_ihardlimit)
                return QUOTA_NL_IHARDBELOW;

        return QUOTA_NL_NOWARN;
}

/*
 * Work out which "back below limit" notification, if any, freeing @space
 * from @dquot would produce. Current plus reserved space is what gets
 * compared against the limits; usage itself is not modified.
 *
 * Returns QUOTA_NL_BSOFTBELOW when usage drops to or under the soft limit,
 * QUOTA_NL_BHARDBELOW when it drops from at/above to under the hard limit,
 * and QUOTA_NL_NOWARN otherwise (including fake dquots and usage already
 * within the soft limit).
 */
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
        qsize_t before, after;

        before = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        if (before <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_NOWARN;

        after = before - space;

        if (after <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_BSOFTBELOW;

        if (before >= dquot->dq_dqb.dqb_bhardlimit &&
            after < dquot->dq_dqb.dqb_bhardlimit)
                return QUOTA_NL_BHARDBELOW;

        return QUOTA_NL_NOWARN;
}

/*
 * Tell whether quota accounting applies to @inode.
 *
 * Returns 0 for IS_NOQUOTA inodes; otherwise the quota types that are
 * loaded on the inode's superblock, with the suspended ones masked out.
 */
static int inode_quota_active(const struct inode *inode)
{
        struct super_block *sb;

        if (IS_NOQUOTA(inode))
                return 0;

        sb = inode->i_sb;

        return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
}

/*
 * Initialize quota pointers in inode
 *
 * It is better to call this function outside of any transaction as it
 * might need a lot of space in journal for dquot structure allocation.
 *
 * @type selects a single quota type, or -1 for all types.
 *
 * Works in two phases: first, without dq_data_lock, a reference is taken
 * with dqget() for every selected type whose pointer is still unset; then,
 * under dq_data_lock, those references are installed into the slots that
 * are still empty. References that end up unused are dropped.
 *
 * Returns 0 on success (including when nothing had to be done), or the
 * error from dqget() other than -ESRCH.
 */
static int __dquot_initialize(struct inode *inode, int type)
{
        int cnt, init_needed = 0;
        struct dquot __rcu **dquots;
        struct dquot *got[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        qsize_t rsv;
        int ret = 0;

        if (!inode_quota_active(inode))
                return 0;

        dquots = i_dquot(inode);

        /* First get references to structures we might need. */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct kqid qid;
                kprojid_t projid;
                int rc;
                struct dquot *dquot;

                if (type != -1 && cnt != type)
                        continue;
                /*
                 * The i_dquot should have been initialized in most cases,
                 * we check it without locking here to avoid unnecessary
                 * dqget()/dqput() calls.
                 */
                if (dquots[cnt])
                        continue;

                if (!sb_has_quota_active(sb, cnt))
                        continue;

                init_needed = 1;

                /* Build the quota id for this type from the inode */
                switch (cnt) {
                case USRQUOTA:
                        qid = make_kqid_uid(inode->i_uid);
                        break;
                case GRPQUOTA:
                        qid = make_kqid_gid(inode->i_gid);
                        break;
                case PRJQUOTA:
                        rc = inode->i_sb->dq_op->get_projid(inode, &projid);
                        /* No project id available: skip this type */
                        if (rc)
                                continue;
                        qid = make_kqid_projid(projid);
                        break;
                }
                dquot = dqget(sb, qid);
                if (IS_ERR(dquot)) {
                        /* We raced with somebody turning quotas off... */
                        if (PTR_ERR(dquot) != -ESRCH) {
                                ret = PTR_ERR(dquot);
                                goto out_put;
                        }
                        dquot = NULL;
                }
                got[cnt] = dquot;
        }

        /* All required i_dquot has been initialized */
        if (!init_needed)
                return 0;

        spin_lock(&dq_data_lock);
        if (IS_NOQUOTA(inode))
                goto out_lock;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                /* We could race with quotaon or dqget() could have failed */
                if (!got[cnt])
                        continue;
                if (!dquots[cnt]) {
                        /* Install the reference; got[] no longer owns it */
                        rcu_assign_pointer(dquots[cnt], got[cnt]);
                        got[cnt] = NULL;
                        /*
                         * Make quota reservation system happy if someone
                         * did a write before quota was turned on
                         */
                        rsv = inode_get_rsv_space(inode);
                        if (unlikely(rsv)) {
                                struct dquot *dquot = srcu_dereference_check(
                                        dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));

                                spin_lock(&inode->i_lock);
                                /* Get reservation again under proper lock */
                                rsv = __inode_get_rsv_space(inode);
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot->dq_dqb.dqb_rsvspace += rsv;
                                spin_unlock(&dquot->dq_dqb_lock);
                                spin_unlock(&inode->i_lock);
                        }
                }
        }
out_lock:
        spin_unlock(&dq_data_lock);
out_put:
        /* Drop unused references */
        dqput_all(got);

        return ret;
}

/*
 * Initialize the dquot pointers of @inode for all quota types.
 *
 * Returns the result of __dquot_initialize().
 */
int dquot_initialize(struct inode *inode)
{
        const int all_types = -1;

        return __dquot_initialize(inode, all_types);
}
EXPORT_SYMBOL(dquot_initialize);

/*
 * Tell whether @inode has an unset dquot pointer for some quota type that
 * is active on its superblock.
 *
 * Returns false when quota is not active for the inode at all.
 */
bool dquot_initialize_needed(struct inode *inode)
{
        struct dquot __rcu **dquots;
        int qtype = 0;

        if (!inode_quota_active(inode))
                return false;

        dquots = i_dquot(inode);
        while (qtype < MAXQUOTAS) {
                if (!dquots[qtype] && sb_has_quota_active(inode->i_sb, qtype))
                        return true;
                qtype++;
        }

        return false;
}
EXPORT_SYMBOL(dquot_initialize_needed);

/*
 * Release all quotas referenced by inode.
 *
 * This function is only called on inode free or when converting a file
 * to a quota file. There are no other users of i_dquot in either case, so
 * we needn't call synchronize_srcu() after clearing i_dquot.
 */
static void __dquot_drop(struct inode *inode)
{
        struct dquot __rcu **dquots = i_dquot(inode);
        struct dquot *detached[MAXQUOTAS];
        int qtype = 0;

        /* Detach every pointer under dq_data_lock; drop the references after */
        spin_lock(&dq_data_lock);
        while (qtype < MAXQUOTAS) {
                detached[qtype] = srcu_dereference_check(dquots[qtype], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));
                rcu_assign_pointer(dquots[qtype], NULL);
                qtype++;
        }
        spin_unlock(&dq_data_lock);

        dqput_all(detached);
}

/*
 * Drop the dquot references held by @inode, if it holds any.
 *
 * IS_NOQUOTA inodes and inodes without any dquot pointer set are left
 * alone.
 */
void dquot_drop(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        int cnt;

        if (IS_NOQUOTA(inode))
                return;

        /*
         * Test before calling to rule out calls from proc and such
         * where we are not allowed to block. Note that this is
         * actually a reliable test even without the lock - the caller
         * must assure that nobody can come after the DQUOT_DROP and
         * add quota pointers back anyway.
         */
        dquots = i_dquot(inode);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (dquots[cnt]) {
                        __dquot_drop(inode);
                        return;
                }
        }
}
EXPORT_SYMBOL(dquot_drop);

/*
 * inode_reserved_space is managed internally by quota, and protected by
 * i_lock similar to i_blocks+i_bytes.
 */
static qsize_t *inode_reserved_space(struct inode * inode)
{
        /* Filesystem must explicitly define it's own method in order to use
         * quota reservation interface */
        BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
        return inode->i_sb->dq_op->get_reserved_space(inode);
}

/*
 * Read the inode's reserved space without taking any lock here.
 *
 * Returns 0 when the filesystem provides no ->get_reserved_space().
 */
static qsize_t __inode_get_rsv_space(struct inode *inode)
{
        if (inode->i_sb->dq_op->get_reserved_space)
                return *inode_reserved_space(inode);

        return 0;
}

/*
 * Read the inode's reserved space under i_lock.
 *
 * Returns 0 when the filesystem provides no ->get_reserved_space().
 */
static qsize_t inode_get_rsv_space(struct inode *inode)
{
        qsize_t rsv = 0;

        if (inode->i_sb->dq_op->get_reserved_space) {
                spin_lock(&inode->i_lock);
                rsv = __inode_get_rsv_space(inode);
                spin_unlock(&inode->i_lock);
        }

        return rsv;
}

/*
 * This function updates i_blocks+i_bytes fields and quota information
 * (together with appropriate checks).
 *
 * NOTE: We absolutely rely on the fact that the caller dirties the inode
 * (usually helpers in quotaops.h take care of this) and holds a handle for
 * the current transaction so that dquot write and inode write go into the
 * same transaction.
 */

/*
 * Charge @number bytes of space for @inode against every dquot attached to it.
 *
 * @inode:  inode the space is charged for
 * @number: number of bytes to charge
 * @flags:  DQUOT_SPACE_* flags; with DQUOT_SPACE_RESERVE the bytes are
 *          charged as reserved space and added to the inode's reserved
 *          counter instead of its byte count.  The flags are also handed to
 *          dquot_add_space() unchanged.
 *
 * Returns 0 on success.  If dquot_add_space() returns non-zero for some
 * quota type, the charges already made to lower-numbered types are undone,
 * the inode counters are left untouched and that value is returned.
 *
 * This operation can block, but only after everything is updated
 */
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        int reserve = flags & DQUOT_SPACE_RESERVE;
        struct dquot __rcu **dquots;
        struct dquot *dquot;

        /* No quota accounting for this inode: only the inode counters change */
        if (!inode_quota_active(inode)) {
                if (reserve) {
                        spin_lock(&inode->i_lock);
                        *inode_reserved_space(inode) += number;
                        spin_unlock(&inode->i_lock);
                } else {
                        inode_add_bytes(inode, number);
                }
                goto out;
        }

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        /* dquot pointers are read under SRCU; i_lock covers the inode counters */
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                if (reserve) {
                        ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
                } else {
                        ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
                }
                if (ret) {
                        /* Back out changes we already did */
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                spin_lock(&dquot->dq_dqb_lock);
                                if (reserve)
                                        dquot_free_reserved_space(dquot, number);
                                else
                                        dquot_decr_space(dquot, number);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        spin_unlock(&inode->i_lock);
                        goto out_flush_warn;
                }
        }
        /* Every dquot accepted the charge: now account it on the inode too */
        if (reserve)
                *inode_reserved_space(inode) += number;
        else
                __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* A pure reservation does not mark the dquots dirty */
        if (reserve)
                goto out_flush_warn;
        mark_all_dquot_dirty(dquots);
out_flush_warn:
        srcu_read_unlock(&dquot_srcu, index);
        /* Warnings are flushed only after all locks have been dropped */
        flush_warnings(warn);
out:
        return ret;
}
EXPORT_SYMBOL(__dquot_alloc_space);

/*
 * Charge one inode to every dquot attached to @inode.
 *
 * Returns 0 on success (or when quota is not active for @inode).  If
 * dquot_add_inodes() returns non-zero for some quota type, the charges
 * already made to lower-numbered types are undone and that value is
 * returned.
 *
 * This operation can block, but only after everything is updated
 */
int dquot_alloc_inode(struct inode *inode)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;
        int ret = 0;
        int cnt, index;

        if (!inode_quota_active(inode))
                return 0;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;

                ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
                if (!ret)
                        continue;

                /* Back out changes we already did */
                while (--cnt >= 0) {
                        dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                        if (dquot) {
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot_decr_inodes(dquot, 1);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                }
                break;
        }
        spin_unlock(&inode->i_lock);

        /* Only a fully successful charge dirties the dquots */
        if (!ret)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
        return ret;
}
EXPORT_SYMBOL(dquot_alloc_inode);

/*
 * Convert in-memory reserved quotas to real consumed quotas
 *
 * Moves @number bytes from the reserved counter to the used counter, both
 * in every dquot attached to @inode and on the inode itself.  Always
 * returns 0.
 */
int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int cnt, index;

        /* Without active quota only the inode counters need converting */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                __inode_add_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return 0;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Claim reserved quotas to allocated quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /*
                 * Never claim more than this dquot has reserved.  Note that
                 * a clamped @number stays in effect for the remaining quota
                 * types and for the inode update below.
                 */
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
                        number = dquot->dq_dqb.dqb_rsvspace;
                dquot->dq_dqb.dqb_rsvspace -= number;
                dquot->dq_dqb.dqb_curspace += number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) -= number;
        __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        return 0;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);

/*
 * Convert allocated space back to in-memory reserved quotas
 *
 * Moves @number bytes from the used counter back to the reserved counter,
 * both in every dquot attached to @inode and on the inode itself.
 */
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int cnt, index;

        /* Without active quota only the inode counters need converting */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) += number;
                __inode_sub_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Reclaim allocated quotas back to reserved quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /*
                 * Never give back more than this dquot has in use.  Note
                 * that a clamped @number stays in effect for the remaining
                 * quota types and for the inode update below.
                 */
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
                        number = dquot->dq_dqb.dqb_curspace;
                dquot->dq_dqb.dqb_curspace -= number;
                dquot->dq_dqb.dqb_rsvspace += number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) += number;
        __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);

/*
 * Release @number bytes of space charged for @inode from every dquot
 * attached to it.  With DQUOT_SPACE_RESERVE set in @flags the bytes are
 * taken from the reserved counters, otherwise from the used counters.
 *
 * This operation can block, but only after everything is updated
 */
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int reserve = flags & DQUOT_SPACE_RESERVE;
        unsigned int cnt;
        int index;

        /* Without active quota only the inode counters are adjusted */
        if (!inode_quota_active(inode)) {
                if (!reserve) {
                        inode_sub_bytes(inode, number);
                        return;
                }
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                int wtype;

                warn[cnt].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Pick the warning type before the usage is lowered */
                wtype = info_bdq_free(dquot, number);
                if (wtype != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[cnt], dquot, wtype);
                if (reserve)
                        dquot_free_reserved_space(dquot, number);
                else
                        dquot_decr_space(dquot, number);
                spin_unlock(&dquot->dq_dqb_lock);
        }
        if (reserve)
                *inode_reserved_space(inode) -= number;
        else
                __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* Dropping a pure reservation does not mark the dquots dirty */
        if (!reserve)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);

/*
 * Release the charge of one inode from every dquot attached to @inode.
 * Does nothing when quota is not active for @inode.
 *
 * This operation can block, but only after everything is updated
 */
void dquot_free_inode(struct inode *inode)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;
        unsigned int cnt;
        int index;

        if (!inode_quota_active(inode))
                return;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                int wtype;

                warn[cnt].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (dquot) {
                        spin_lock(&dquot->dq_dqb_lock);
                        /* Pick the warning type before the usage is lowered */
                        wtype = info_idq_free(dquot, 1);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn[cnt], dquot, wtype);
                        dquot_decr_inodes(dquot, 1);
                        spin_unlock(&dquot->dq_dqb_lock);
                }
        }
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);

/*
 * Transfer the number of inodes and blocks from one diskquota to another.
 * On success, dquot references in transfer_to are consumed and references
 * to original dquots that need to be released are placed there. On failure,
 * references are kept untouched.
 *
 * This operation can block, but only after everything is updated
 * A transaction must be started when entering this function.
 *
 * We are holding reference on transfer_from & transfer_to, no need to
 * protect them by srcu_read_lock().
 */
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
        qsize_t cur_space;
        qsize_t rsv_space = 0;
        qsize_t inode_usage = 1;
        struct dquot __rcu **dquots;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, index, ret = 0;
        /* is_valid[type] is set for every type that actually takes part */
        char is_valid[MAXQUOTAS] = {};
        struct dquot_warn warn_to[MAXQUOTAS];
        struct dquot_warn warn_from_inodes[MAXQUOTAS];
        struct dquot_warn warn_from_space[MAXQUOTAS];

        if (IS_NOQUOTA(inode))
                return 0;

        /*
         * The filesystem may report how many inodes this inode is charged
         * as; without the hook the default of 1 set above is used.
         */
        if (inode->i_sb->dq_op->get_inode_usage) {
                ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
                if (ret)
                        return ret;
        }

        /* Initialize the arrays */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                warn_to[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
        }

        /* Lock order: dq_data_lock outside, i_lock inside */
        spin_lock(&dq_data_lock);
        spin_lock(&inode->i_lock);
        /* Re-check now that the locks are held */
        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
                spin_unlock(&inode->i_lock);
                spin_unlock(&dq_data_lock);
                return 0;
        }
        /* Snapshot the usage that moves from the old owners to the new ones */
        cur_space = __inode_get_bytes(inode);
        rsv_space = __inode_get_rsv_space(inode);
        dquots = i_dquot(inode);
        /*
         * Build the transfer_from list, check limits, and update usage in
         * the target structures.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                /*
                 * Skip changes for same uid or gid or for turned off quota-type.
                 */
                if (!transfer_to[cnt])
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(inode->i_sb, cnt))
                        continue;
                is_valid[cnt] = 1;
                transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
                                &dquot_srcu, lockdep_is_held(&dq_data_lock));
                ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
                                       &warn_to[cnt]);
                if (ret)
                        goto over_quota;
                ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
                                      DQUOT_SPACE_WARN, &warn_to[cnt]);
                if (ret) {
                        /*
                         * Undo the inode charge just made to this type; the
                         * over_quota path only handles lower-numbered types.
                         */
                        spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                        dquot_decr_inodes(transfer_to[cnt], inode_usage);
                        spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
                        goto over_quota;
                }
        }

        /* Decrease usage for source structures and update quota pointers */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!is_valid[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        int wtype;

                        spin_lock(&transfer_from[cnt]->dq_dqb_lock);
                        /* Warning types are computed before usage is lowered */
                        wtype = info_idq_free(transfer_from[cnt], inode_usage);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_inodes[cnt],
                                                transfer_from[cnt], wtype);
                        wtype = info_bdq_free(transfer_from[cnt],
                                              cur_space + rsv_space);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_space[cnt],
                                                transfer_from[cnt], wtype);
                        dquot_decr_inodes(transfer_from[cnt], inode_usage);
                        dquot_decr_space(transfer_from[cnt], cur_space);
                        dquot_free_reserved_space(transfer_from[cnt],
                                                  rsv_space);
                        spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
                }
                /* Point the inode at its new dquot for this type */
                rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);

        /*
         * These arrays are local and we hold dquot references so we don't need
         * the srcu protection but still take dquot_srcu to avoid warning in
         * mark_all_dquot_dirty().
         */
        index = srcu_read_lock(&dquot_srcu);
        mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
        mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
        srcu_read_unlock(&dquot_srcu, index);

        flush_warnings(warn_to);
        flush_warnings(warn_from_inodes);
        flush_warnings(warn_from_space);
        /* Pass back references to put */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (is_valid[cnt])
                        transfer_to[cnt] = transfer_from[cnt];
        return 0;
over_quota:
        /* Back out changes we already did */
        /* cnt is the type that failed, so start with the one below it */
        for (cnt--; cnt >= 0; cnt--) {
                if (!is_valid[cnt])
                        continue;
                spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                dquot_decr_inodes(transfer_to[cnt], inode_usage);
                dquot_decr_space(transfer_to[cnt], cur_space);
                dquot_free_reserved_space(transfer_to[cnt], rsv_space);
                spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);
        /* On failure transfer_to[] is left as the caller passed it in */
        flush_warnings(warn_to);
        return ret;
}
EXPORT_SYMBOL(__dquot_transfer);

/*
 * Wrapper for transferring ownership of an inode for uid/gid only
 * Called from FSXXX_setattr()
 *
 * Looks up the dquot of the new owner for each id that @iattr really
 * changes and hands them to __dquot_transfer().  A dqget() failure of
 * -ESRCH just leaves that quota type out of the transfer; any other
 * failure is returned.  Whatever is left in the local array afterwards is
 * released with dqput_all().
 */
int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
        struct dquot *transfer_to[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        struct dquot *dquot;
        int ret;

        if (!inode_quota_active(inode))
                return 0;

        if ((iattr->ia_valid & ATTR_UID) &&
            !uid_eq(iattr->ia_uid, inode->i_uid)) {
                dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
                if (!IS_ERR(dquot)) {
                        transfer_to[USRQUOTA] = dquot;
                } else if (PTR_ERR(dquot) != -ESRCH) {
                        ret = PTR_ERR(dquot);
                        goto out_put;
                }
        }

        if ((iattr->ia_valid & ATTR_GID) &&
            !gid_eq(iattr->ia_gid, inode->i_gid)) {
                dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
                if (!IS_ERR(dquot)) {
                        transfer_to[GRPQUOTA] = dquot;
                } else if (PTR_ERR(dquot) != -ESRCH) {
                        ret = PTR_ERR(dquot);
                        goto out_put;
                }
        }

        ret = __dquot_transfer(inode, transfer_to);
out_put:
        dqput_all(transfer_to);
        return ret;
}
EXPORT_SYMBOL(dquot_transfer);

/*
 * Write info of quota file to disk
 */
int dquot_commit_info(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);

        return dqopt->ops[type]->write_file_info(sb, type);
}
EXPORT_SYMBOL(dquot_commit_info);

/*
 * Ask the quota format for the next id via its ->get_next_id() method.
 *
 * Returns -ESRCH if quota of type @qid->type is not active on @sb,
 * -ENOSYS if the format has no ->get_next_id() method, and otherwise
 * whatever that method returns.
 */
int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
{
        struct quota_info *dqopt;
        int type = qid->type;

        if (!sb_has_quota_active(sb, type))
                return -ESRCH;

        dqopt = sb_dqopt(sb);
        if (!dqopt->ops[type]->get_next_id)
                return -ENOSYS;
        return dqopt->ops[type]->get_next_id(sb, qid);
}
EXPORT_SYMBOL(dquot_get_next_id);

/*
 * Definitions of diskquota operations.
 *
 * Method table that wires each hook to the generic dquot_*() helper.
 */
const struct dquot_operations dquot_operations = {
        .alloc_dquot = dquot_alloc,
        .destroy_dquot = dquot_destroy,
        .acquire_dquot = dquot_acquire,
        .release_dquot = dquot_release,
        .write_dquot = dquot_commit,
        .mark_dirty = dquot_mark_dquot_dirty,
        .write_info = dquot_commit_info,
        .get_next_id = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);

/*
 * Generic helper for ->open on filesystems supporting disk quotas.
 */
int dquot_file_open(struct inode *inode, struct file *file)
{
        int error;

        error = generic_file_open(inode, file);
        if (!error && (file->f_mode & FMODE_WRITE))
                error = dquot_initialize(inode);
        return error;
}
EXPORT_SYMBOL(dquot_file_open);

/*
 * Detach the quota file inode recorded for @type, if any, and drop the
 * reference held on it.
 */
static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *qf_inode = dqopt->files[type];

        if (qf_inode == NULL)
                return;

        /* Clear S_NOQUOTA again unless quota lives in system files */
        if ((dqopt->flags & DQUOT_QUOTA_SYS_FILE) == 0) {
                inode_lock(qf_inode);
                qf_inode->i_flags &= ~S_NOQUOTA;
                inode_unlock(qf_inode);
        }

        dqopt->files[type] = NULL;
        iput(qf_inode);
}

/*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 *
 * @sb:    superblock to operate on
 * @type:  quota type to act on, or -1 for every type
 * @flags: DQUOT_USAGE_ENABLED and/or DQUOT_LIMITS_ENABLED to switch those
 *         features off, or DQUOT_SUSPENDED on its own to suspend quotas
 *
 * Returns -EINVAL for a flag combination rejected by the check below and 0
 * otherwise, including when no quota is loaded at all.
 */
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
        int cnt;
        struct quota_info *dqopt = sb_dqopt(sb);

        /* s_umount should be held in exclusive mode */
        /* If the read trylock succeeds the caller did not hold it: warn, undo */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        /* Cannot turn off usage accounting without turning off limits, or
         * suspend quotas and simultaneously turn quotas off. */
        if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
            || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
            DQUOT_USAGE_ENABLED)))
                return -EINVAL;

        /*
         * Skip everything if there's nothing to do. We have to do this because
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
        if (!sb_any_quota_loaded(sb))
                return 0;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_loaded(sb, cnt))
                        continue;

                /* Update the per-type state bits under dq_state_lock */
                if (flags & DQUOT_SUSPENDED) {
                        spin_lock(&dq_state_lock);
                        dqopt->flags |=
                                dquot_state_flag(DQUOT_SUSPENDED, cnt);
                        spin_unlock(&dq_state_lock);
                } else {
                        spin_lock(&dq_state_lock);
                        dqopt->flags &= ~dquot_state_flag(flags, cnt);
                        /* Turning off suspended quotas? */
                        if (!sb_has_quota_loaded(sb, cnt) &&
                            sb_has_quota_suspended(sb, cnt)) {
                                dqopt->flags &=        ~dquot_state_flag(
                                                        DQUOT_SUSPENDED, cnt);
                                spin_unlock(&dq_state_lock);
                                vfs_cleanup_quota_inode(sb, cnt);
                                continue;
                        }
                        spin_unlock(&dq_state_lock);
                }

                /* We still have to keep quota loaded? */
                if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;

                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
                invalidate_dquots(sb, cnt);
                /*
                 * Now all dquots should be invalidated, all writes done so we
                 * should be only users of the info. No locks needed.
                 */
                if (info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
                if (dqopt->ops[cnt]->free_file_info)
                        dqopt->ops[cnt]->free_file_info(sb, cnt);
                /* Drop the format reference and reset the per-type info */
                put_quota_format(dqopt->info[cnt].dqi_format);
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
                dqopt->info[cnt].dqi_bgrace = 0;
                dqopt->ops[cnt] = NULL;
        }

        /* Skip syncing and setting flags if quota files are hidden */
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                goto put_inodes;

        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, 1);
        sync_blockdev(sb->s_bdev);
        /* Now the quota files are just ordinary files and we can set the
         * inode flags back. Moreover we discard the pagecache so that
         * userspace sees the writes we did bypassing the pagecache. We
         * must also discard the blockdev buffers so that we see the
         * changes done by userspace on the next quotaon() */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
                        inode_lock(dqopt->files[cnt]);
                        truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                        inode_unlock(dqopt->files[cnt]);
                }
        if (sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
put_inodes:
        /* We are done when suspending quotas */
        if (flags & DQUOT_SUSPENDED)
                return 0;

        /* Release the quota file inode of every type that is no longer loaded */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt))
                        vfs_cleanup_quota_inode(sb, cnt);
        return 0;
}
EXPORT_SYMBOL(dquot_disable);

/*
 * Turn off both usage accounting and limit enforcement for @type on @sb.
 * Returns the result of dquot_disable().
 */
int dquot_quota_off(struct super_block *sb, int type)
{
        const unsigned int off_flags = DQUOT_USAGE_ENABLED |
                                       DQUOT_LIMITS_ENABLED;

        return dquot_disable(sb, type, off_flags);
}
EXPORT_SYMBOL(dquot_quota_off);

/*
 *        Turn quotas on on a device
 */

static int vfs_setup_quota_inode(struct inode *inode, int type)
{
        struct super_block *sb = inode->i_sb;
        struct quota_info *dqopt = sb_dqopt(sb);

        if (is_bad_inode(inode))
                return -EUCLEAN;
        if (!S_ISREG(inode->i_mode))
                return -EACCES;
        if (IS_RDONLY(inode))
                return -EROFS;
        if (sb_has_quota_loaded(sb, type))
                return -EBUSY;

        /*
         * Quota files should never be encrypted.  They should be thought of as
         * filesystem metadata, not user data.  New-style internal quota files
         * cannot be encrypted by users anyway, but old-style external quota
         * files could potentially be incorrectly created in an encrypted
         * directory, hence this explicit check.  Some reasons why encrypted
         * quota files don't work include: (1) some filesystems that support
         * encryption don't handle it in their quota_read and quota_write, and
         * (2) cleaning up encrypted quota files at unmount would need special
         * consideration, as quota files are cleaned up later than user files.
         */
        if (IS_ENCRYPTED(inode))
                return -EINVAL;

        dqopt->files[type] = igrab(inode);
        if (!dqopt->files[type])
                return -EIO;
        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* We don't want quota and atime on quota files (deadlocks
                 * possible) Also nobody should write to the file - we use
                 * special IO operations which ignore the immutable bit. */
                inode_lock(inode);
                inode->i_flags |= S_NOQUOTA;
                inode_unlock(inode);
                /*
                 * When S_NOQUOTA is set, remove dquot references as no more
                 * references can be added
                 */
                __dquot_drop(inode);
        }
        return 0;
}

/*
 * Load quota of the given type on @sb using quota format @format_id and
 * enable the features selected by @flags.
 *
 * @sb:        superblock to enable quota on
 * @type:      quota type to load
 * @format_id: id passed to find_quota_format()
 * @flags:     DQUOT_USAGE_ENABLED is mandatory; DQUOT_SUSPENDED must not be
 *             set (BUG otherwise)
 *
 * Returns 0 on success.  Errors: -ESRCH if the format is not found; -EINVAL
 * if the filesystem lacks ->quota_read/->quota_write, lacks ->get_projid for
 * PRJQUOTA, is not in init_user_ns, if usage is not requested, or if
 * ->check_quota_file() rejects the file; -EBUSY if quota of this type is
 * already loaded; a negative value from ->read_file_info(); or the error
 * from add_dquot_ref(), in which case quota is disabled again before
 * returning.
 */
int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags)
{
        struct quota_format_type *fmt = find_quota_format(format_id);
        struct quota_info *dqopt = sb_dqopt(sb);
        int error;

        /* Just unsuspend quotas? */
        BUG_ON(flags & DQUOT_SUSPENDED);
        /* s_umount should be held in exclusive mode */
        /* If the read trylock succeeds the caller did not hold it: warn, undo */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        if (!fmt)
                return -ESRCH;
        /* From here on every early exit must drop the format via out_fmt */
        if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
            (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Filesystems outside of init_user_ns not yet supported */
        if (sb->s_user_ns != &init_user_ns) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Usage always has to be set... */
        if (!(flags & DQUOT_USAGE_ENABLED)) {
                error = -EINVAL;
                goto out_fmt;
        }
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_fmt;
        }

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* As we bypass the pagecache we must now flush all the
                 * dirty data and invalidate caches so that kernel sees
                 * changes from userspace. It is not enough to just flush
                 * the quota file since if blocksize < pagesize, invalidation
                 * of the cache could fail because of other unrelated dirty
                 * data */
                sync_filesystem(sb);
                /*
                 * NOTE(review): dquot_disable() checks sb->s_bdev for NULL
                 * before calling invalidate_bdev(); this call is
                 * unconditional - confirm s_bdev cannot be NULL here.
                 */
                invalidate_bdev(sb->s_bdev);
        }

        error = -EINVAL;
        if (!fmt->qf_ops->check_quota_file(sb, type))
                goto out_fmt;

        /* Publish the format for this type before reading its info */
        dqopt->ops[type] = fmt->qf_ops;
        dqopt->info[type].dqi_format = fmt;
        dqopt->info[type].dqi_fmt_id = format_id;
        INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
        error = dqopt->ops[type]->read_file_info(sb, type);
        if (error < 0)
                goto out_fmt;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
                spin_lock(&dq_data_lock);
                dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
                spin_unlock(&dq_data_lock);
        }
        /* Setting the state bits makes the quota type count as loaded */
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);

        error = add_dquot_ref(sb, type);
        /* On failure tear everything down again through dquot_disable() */
        if (error)
                dquot_disable(sb, type,
                              DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

        return error;
out_fmt:
        put_quota_format(fmt);

        return error;
}
EXPORT_SYMBOL(dquot_load_quota_sb);

/*
 * More powerful function for turning on quotas on given quota inode allowing
 * setting of individual quota flags
 *
 * Registers @inode as the quota file for @type and then loads quota on its
 * superblock.  If loading fails, the quota inode registration is undone.
 * Returns 0 on success or the error from either step.
 */
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags)
{
        struct super_block *sb = inode->i_sb;
        int err;

        err = vfs_setup_quota_inode(inode, type);
        if (err < 0)
                return err;

        err = dquot_load_quota_sb(sb, type, format_id, flags);
        if (err < 0)
                vfs_cleanup_quota_inode(sb, type);

        return err;
}
EXPORT_SYMBOL(dquot_load_quota_inode);

/*
 * Reenable quotas on remount RW
 *
 * @sb:   superblock whose suspended quotas should be resumed
 * @type: quota type to resume, or -1 for every type
 *
 * Every selected type that is currently suspended is reloaded with the
 * usage/limits state flags still recorded for it.  A failure for one type
 * does not stop the remaining types from being resumed.
 *
 * Returns 0 if every selected type was resumed (or none was suspended),
 * otherwise the first non-zero value returned by dquot_load_quota_sb().
 */
int dquot_resume(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int ret = 0, err, cnt;
        unsigned int flags;

        /* s_umount should be held in exclusive mode */
        /* If the read trylock succeeds the caller did not hold it: warn, undo */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_suspended(sb, cnt))
                        continue;

                /* Fetch the remembered feature flags, then clear all state */
                spin_lock(&dq_state_lock);
                flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
                                                        DQUOT_LIMITS_ENABLED,
                                                        cnt);
                dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
                spin_unlock(&dq_state_lock);

                flags = dquot_generic_flag(flags, cnt);
                err = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
                                          flags);
                if (err < 0)
                        vfs_cleanup_quota_inode(sb, cnt);
                /*
                 * Keep the first failure: a later type that resumes fine
                 * must not hide it from the caller.
                 */
                if (err && !ret)
                        ret = err;
        }

        return ret;
}
EXPORT_SYMBOL(dquot_resume);

int dquot_quota_on(struct super_block *sb, int type, int format_id,
                   const struct path *path)
{
        int error = security_quota_on(path->dentry);
        if (error)
                return error;
        /* Quota file not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                error = -EXDEV;
        else
                error = dquot_load_quota_inode(d_inode(path->dentry), type,
                                             format_id, DQUOT_USAGE_ENABLED |
                                             DQUOT_LIMITS_ENABLED);
        return error;
}
EXPORT_SYMBOL(dquot_quota_on);

/*
 * This function is used when filesystem needs to initialize quotas
 * during mount time.
 *
 * @qf_name is looked up relative to the root of @sb. The dentry reference
 * obtained by the lookup is always dropped before returning. Returns the
 * lookup error, the security_quota_on() error, or the result of
 * dquot_load_quota_inode() with usage and limits both enabled.
 */
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                int format_id, int type)
{
        struct dentry *qf;
        int rc;

        qf = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
        if (IS_ERR(qf))
                return PTR_ERR(qf);

        rc = security_quota_on(qf);
        if (rc)
                goto out_put;

        rc = dquot_load_quota_inode(d_inode(qf), type, format_id,
                                    DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out_put:
        dput(qf);
        return rc;
}
EXPORT_SYMBOL(dquot_quota_on_mount);

/*
 * Enable limit enforcement for every quota type whose enforce bit is set in
 * @flags. Accounting bits in @flags are ignored.
 *
 * Returns 0 on success, -ENOSYS when the superblock does not use
 * DQUOT_QUOTA_SYS_FILE, -EINVAL when no enforce bit is left or a requested
 * type has no usage tracking, and -EEXIST when a requested type already has
 * limits enabled. On failure, enforcement switched on for lower-numbered
 * types during this call is switched off again.
 */
static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int qt;
        int ret;

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;

        /* Accounting cannot be turned on while fs is mounted */
        flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
        if (flags == 0)
                return -EINVAL;

        for (qt = 0; qt < MAXQUOTAS; qt++) {
                if (!(flags & qtype_enforce_flag(qt)))
                        continue;

                /* Can't enforce without accounting */
                if (!sb_has_quota_usage_enabled(sb, qt)) {
                        ret = -EINVAL;
                        goto undo;
                }

                /*
                 * Already enforcing: this is an -EBUSY condition, reported
                 * as -EEXIST for better compatibility with XFS.
                 */
                if (sb_has_quota_limits_enabled(sb, qt)) {
                        ret = -EEXIST;
                        goto undo;
                }

                spin_lock(&dq_state_lock);
                dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, qt);
                spin_unlock(&dq_state_lock);
        }
        return 0;

undo:
        /* Backout enforcement enablement we already did */
        while (--qt >= 0) {
                if (flags & qtype_enforce_flag(qt))
                        dquot_disable(sb, qt, DQUOT_LIMITS_ENABLED);
        }
        return ret;
}

/*
 * Disable limit enforcement for every quota type whose enforce bit is set in
 * @flags and which currently has limits enabled.
 *
 * Returns 0 on success, -ENOSYS when the superblock does not use
 * DQUOT_QUOTA_SYS_FILE, -EOPNOTSUPP when @flags asks to turn accounting off,
 * -EEXIST when none of the requested types has limits enabled, or the error
 * from dquot_disable(). On failure, the limits-enabled state bit of the
 * types already disabled by this call is set again.
 */
static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int qt;
        int ret;

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;

        /*
         * We don't support turning off accounting via quotactl. In principle
         * quota infrastructure can do this but filesystems don't expect
         * userspace to be able to do it.
         */
        if (flags &
            (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
                return -EOPNOTSUPP;

        /* Filter out limits not enabled */
        for (qt = 0; qt < MAXQUOTAS; qt++) {
                if (sb_has_quota_limits_enabled(sb, qt))
                        continue;
                flags &= ~qtype_enforce_flag(qt);
        }

        /* Nothing left? */
        if (flags == 0)
                return -EEXIST;

        for (qt = 0; qt < MAXQUOTAS; qt++) {
                if (!(flags & qtype_enforce_flag(qt)))
                        continue;
                ret = dquot_disable(sb, qt, DQUOT_LIMITS_ENABLED);
                if (ret < 0)
                        goto undo;
        }
        return 0;

undo:
        /* Backout enforcement disabling we already did */
        while (--qt >= 0) {
                if (!(flags & qtype_enforce_flag(qt)))
                        continue;
                spin_lock(&dq_state_lock);
                dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, qt);
                spin_unlock(&dq_state_lock);
        }
        return ret;
}

/*
 * Generic routine for getting common part of quota structure
 *
 * Zeroes @di and then copies limits, usage and grace timers out of the
 * in-memory quota block of @dquot while holding dq_dqb_lock. The reported
 * space is current usage plus reserved space.
 */
static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *src = &dquot->dq_dqb;

        memset(di, 0, sizeof(*di));

        spin_lock(&dquot->dq_dqb_lock);
        /* Space: limits, usage (including reservations), grace timer */
        di->d_spc_hardlimit = src->dqb_bhardlimit;
        di->d_spc_softlimit = src->dqb_bsoftlimit;
        di->d_space = src->dqb_curspace + src->dqb_rsvspace;
        di->d_spc_timer = src->dqb_btime;
        /* Inodes: limits, usage, grace timer */
        di->d_ino_hardlimit = src->dqb_ihardlimit;
        di->d_ino_softlimit = src->dqb_isoftlimit;
        di->d_ino_count = src->dqb_curinodes;
        di->d_ino_timer = src->dqb_itime;
        spin_unlock(&dquot->dq_dqb_lock);
}

/*
 * Look up the dquot for @qid on @sb and report its limits and usage in @di.
 * Returns 0 on success or the error encoded in the dqget() result. The
 * dquot reference is dropped again before returning.
 */
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
                    struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_dqblk);

/*
 * Ask the filesystem's ->get_next_id() to update *@qid, then report the
 * limits and usage of that id in @di.
 *
 * Returns -ENOSYS when the filesystem has no ->get_next_id(), a negative
 * error from ->get_next_id() or dqget(), and 0 on success.
 */
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
                         struct qc_dqblk *di)
{
        struct dquot *dq;
        int rc;

        if (!sb->dq_op->get_next_id)
                return -ENOSYS;

        rc = sb->dq_op->get_next_id(sb, qid);
        if (rc < 0)
                return rc;

        dq = dqget(sb, *qid);
        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_next_dqblk);

/*
 * Every qc_dqblk field-mask bit that do_set_dqblk() accepts; a request
 * carrying any bit outside this set is rejected there with -EINVAL.
 */
#define VFS_QC_MASK \
        (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
         QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
         QC_SPC_TIMER | QC_INO_TIMER)

/*
 * Generic routine for setting common part of quota structure
 *
 * Applies the fields selected by di->d_fieldmask to the in-memory quota
 * block of @dquot, re-evaluates the grace timers, updates DQ_FAKE_B and
 * marks the dquot dirty.
 *
 * Returns -EINVAL if the mask has bits outside VFS_QC_MASK, -ERANGE if a
 * requested soft/hard limit exceeds the per-type maximum, 0 otherwise.
 * Nothing is modified when an error is returned.
 */
static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *dm = &dquot->dq_dqb;
        /* Set when space / inode state changed and its grace timer needs review */
        int check_blim = 0, check_ilim = 0;
        struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (di->d_fieldmask & ~VFS_QC_MASK)
                return -EINVAL;

        /* Only limits that are actually being set are range-checked */
        if (((di->d_fieldmask & QC_SPC_SOFT) &&
             di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_SPC_HARD) &&
             di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_INO_SOFT) &&
             (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
            ((di->d_fieldmask & QC_INO_HARD) &&
             (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
                return -ERANGE;

        spin_lock(&dquot->dq_dqb_lock);
        if (di->d_fieldmask & QC_SPACE) {
                /* d_space is treated as including reserved space; store usage without it */
                dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_SOFT)
                dm->dqb_bsoftlimit = di->d_spc_softlimit;
        if (di->d_fieldmask & QC_SPC_HARD)
                dm->dqb_bhardlimit = di->d_spc_hardlimit;
        if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_COUNT) {
                dm->dqb_curinodes = di->d_ino_count;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_SOFT)
                dm->dqb_isoftlimit = di->d_ino_softlimit;
        if (di->d_fieldmask & QC_INO_HARD)
                dm->dqb_ihardlimit = di->d_ino_hardlimit;
        if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_TIMER) {
                dm->dqb_btime = di->d_spc_timer;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_TIMER) {
                dm->dqb_itime = di->d_ino_timer;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }

        /*
         * Space grace time: with no soft limit, or usage (including
         * reservations) within it, the timer and DQ_BLKS_B are cleared, even
         * if the caller supplied a timer. Otherwise a grace period starts
         * now unless the caller supplied one.
         */
        if (check_blim) {
                if (!dm->dqb_bsoftlimit ||
                    dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_SPC_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
        }
        /* Same treatment for the inode grace time */
        if (check_ilim) {
                if (!dm->dqb_isoftlimit ||
                    dm->dqb_curinodes <= dm->dqb_isoftlimit) {
                        dm->dqb_itime = 0;
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_INO_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
        }
        /* DQ_FAKE_B is set exactly when all four limits are zero */
        if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
            dm->dqb_isoftlimit)
                clear_bit(DQ_FAKE_B, &dquot->dq_flags);
        else
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
        spin_unlock(&dquot->dq_dqb_lock);
        mark_dquot_dirty(dquot);

        return 0;
}

/*
 * Look up the dquot for @qid on @sb and apply the settings in @di to it.
 * Returns the error encoded in the dqget() result, otherwise whatever
 * do_set_dqblk() returns. The dquot reference is dropped before returning.
 */
int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
                  struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);
        int rc;

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        rc = do_set_dqblk(dq, di);
        dqput(dq);
        return rc;
}
EXPORT_SYMBOL(dquot_set_dqblk);

/*
 * Generic routine for getting common part of quota file information
 *
 * Zeroes @state and fills in one s_state[] entry per quota type that is
 * active on @sb; entries of inactive types stay zeroed. Always returns 0.
 */
int dquot_get_state(struct super_block *sb, struct qc_state *state)
{
        struct mem_dqinfo *mi;
        struct qc_type_state *tstate;
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;

        memset(state, 0, sizeof(*state));
        for (type = 0; type < MAXQUOTAS; type++) {
                if (!sb_has_quota_active(sb, type))
                        continue;
                tstate = state->s_state + type;
                mi = sb_dqopt(sb)->info + type;
                /* Every active type is reported with accounting enabled */
                tstate->flags = QCI_ACCT_ENABLED;
                /* Flags and grace times are read under dq_data_lock */
                spin_lock(&dq_data_lock);
                if (mi->dqi_flags & DQF_SYS_FILE)
                        tstate->flags |= QCI_SYSFILE;
                if (mi->dqi_flags & DQF_ROOT_SQUASH)
                        tstate->flags |= QCI_ROOT_SQUASH;
                if (sb_has_quota_limits_enabled(sb, type))
                        tstate->flags |= QCI_LIMITS_ENFORCED;
                tstate->spc_timelimit = mi->dqi_bgrace;
                tstate->ino_timelimit = mi->dqi_igrace;
                /* Quota file details are reported only if a file is attached */
                if (dqopt->files[type]) {
                        tstate->ino = dqopt->files[type]->i_ino;
                        tstate->blocks = dqopt->files[type]->i_blocks;
                }
                tstate->nextents = 1;        /* We don't know... */
                spin_unlock(&dq_data_lock);
        }
        return 0;
}
EXPORT_SYMBOL(dquot_get_state);

/*
 * Generic routine for setting common part of quota file information
 *
 * Updates the grace times and/or the root-squash flag of quota @type on @sb
 * as selected by ii->i_fieldmask, marks the info dirty and forces it to be
 * written out.
 *
 * Returns -EINVAL if the mask requests warning limits or the realtime space
 * timer, or asks for root squash on a format other than QFMT_VFS_OLD;
 * -ESRCH if quota @type is not active; otherwise the result of the
 * filesystem's ->write_info(), so that a failed write is reported to the
 * caller instead of being silently dropped.
 */
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
{
        struct mem_dqinfo *mi;

        if ((ii->i_fieldmask & QC_WARNS_MASK) ||
            (ii->i_fieldmask & QC_RT_SPC_TIMER))
                return -EINVAL;
        if (!sb_has_quota_active(sb, type))
                return -ESRCH;
        mi = sb_dqopt(sb)->info + type;
        if (ii->i_fieldmask & QC_FLAGS) {
                /* Root squash can only be requested for the old VFS format */
                if ((ii->i_flags & QCI_ROOT_SQUASH &&
                     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD))
                        return -EINVAL;
        }
        spin_lock(&dq_data_lock);
        if (ii->i_fieldmask & QC_SPC_TIMER)
                mi->dqi_bgrace = ii->i_spc_timelimit;
        if (ii->i_fieldmask & QC_INO_TIMER)
                mi->dqi_igrace = ii->i_ino_timelimit;
        if (ii->i_fieldmask & QC_FLAGS) {
                if (ii->i_flags & QCI_ROOT_SQUASH)
                        mi->dqi_flags |= DQF_ROOT_SQUASH;
                else
                        mi->dqi_flags &= ~DQF_ROOT_SQUASH;
        }
        spin_unlock(&dq_data_lock);
        mark_info_dirty(sb, type);
        /* Force write to disk and hand any write error back to the caller */
        return sb->dq_op->write_info(sb, type);
}
EXPORT_SYMBOL(dquot_set_dqinfo);

/*
 * quotactl operations built from the generic dquot_* routines, using
 * quota_enable/quota_disable for switching limit enforcement. Those two
 * handlers return -ENOSYS unless the superblock has DQUOT_QUOTA_SYS_FILE
 * set.
 */
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
        .quota_enable        = dquot_quota_enable,
        .quota_disable        = dquot_quota_disable,
        .quota_sync        = dquot_quota_sync,
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        .get_dqblk        = dquot_get_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
        .set_dqblk        = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);

/*
 * sysctl handler for the dquot statistics entries.
 *
 * table->data points at one slot of dqstats.stat[]; its index selects the
 * matching per-CPU counter. The counter is summed, the sum is stored into
 * that slot (negative sums of the allocated/free dquot counters are clamped
 * to 0 first), and proc_doulongvec_minmax() does the actual transfer.
 */
static int do_proc_dqstats(struct ctl_table *table, int write,
                     void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned int idx = (unsigned long *)table->data - dqstats.stat;
        s64 total = percpu_counter_sum(&dqstats.counter[idx]);

        /* Filter negative values for non-monotonic counters */
        if (idx == DQST_ALLOC_DQUOTS || idx == DQST_FREE_DQUOTS) {
                if (total < 0)
                        total = 0;
        }

        /* Update global table */
        dqstats.stat[idx] = total;

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * sysctl entries for the dquot statistics (the "quota" directory, child of
 * "fs" in the tables below).
 *
 * Each statistics entry is read-only (0444) and points .data at its slot in
 * dqstats.stat[]; do_proc_dqstats() refreshes that slot from the per-CPU
 * counter before the value is reported. The "warnings" entry exists only
 * with CONFIG_PRINT_QUOTA_WARNING and is a writable (0644) int backed by
 * flag_print_warnings.
 */
static struct ctl_table fs_dqstats_table[] = {
        {
                .procname        = "lookups",
                .data                = &dqstats.stat[DQST_LOOKUPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "drops",
                .data                = &dqstats.stat[DQST_DROPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "reads",
                .data                = &dqstats.stat[DQST_READS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "writes",
                .data                = &dqstats.stat[DQST_WRITES],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "cache_hits",
                .data                = &dqstats.stat[DQST_CACHE_HITS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "allocated_dquots",
                .data                = &dqstats.stat[DQST_ALLOC_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "free_dquots",
                .data                = &dqstats.stat[DQST_FREE_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "syncs",
                .data                = &dqstats.stat[DQST_SYNCS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
#ifdef CONFIG_PRINT_QUOTA_WARNING
        {
                .procname        = "warnings",
                .data                = &flag_print_warnings,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
        { },
};

/* "quota" sysctl directory (mode 0555) holding the fs_dqstats_table entries */
static struct ctl_table fs_table[] = {
        {
                .procname        = "quota",
                .mode                = 0555,
                .child                = fs_dqstats_table,
        },
        { },
};

/*
 * Top of the sysctl tree registered by dquot_init(): an "fs" directory
 * (mode 0555) whose child is fs_table.
 */
static struct ctl_table sys_table[] = {
        {
                .procname        = "fs",
                .mode                = 0555,
                .child                = fs_table,
        },
        { },
};

/*
 * Boot-time initialisation of the quota subsystem (run as an fs_initcall).
 *
 * Registers the sysctl table, creates the "dquot" slab cache, allocates and
 * initialises the dquot hash table, sets up the per-CPU statistics counters
 * and registers the dquot cache shrinker. Allocation or registration
 * failures checked here end in panic(); the function itself always
 * returns 0.
 */
static int __init dquot_init(void)
{
        int i, ret;
        unsigned long nr_hash, order;

        printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);

        /* NOTE(review): the return value of register_sysctl_table() is not checked */
        register_sysctl_table(sys_table);

        /* No NULL check here; SLAB_PANIC is passed in the flags */
        dquot_cachep = kmem_cache_create("dquot",
                        sizeof(struct dquot), sizeof(unsigned long) * 4,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_MEM_SPREAD|SLAB_PANIC),
                        NULL);

        /* Order 0: the hash table occupies a single page */
        order = 0;
        dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
        if (!dquot_hash)
                panic("Cannot create dquot hash table");

        for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
                ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
                if (ret)
                        panic("Cannot create dquot stat counters");
        }

        /* Find power-of-two hlist_heads which can fit into allocation */
        nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
        dq_hash_bits = ilog2(nr_hash);

        /* Round the bucket count down to a power of two; mask = count - 1 */
        nr_hash = 1UL << dq_hash_bits;
        dq_hash_mask = nr_hash - 1;
        for (i = 0; i < nr_hash; i++)
                INIT_HLIST_HEAD(dquot_hash + i);

        pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
                " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));

        if (register_shrinker(&dqcache_shrinker))
                panic("Cannot register dquot shrinker");

        return 0;
}
fs_initcall(dquot_init);
























































































    3 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * CRC32C chksum
 *
 *@Article{castagnoli-crc,
 * author =       { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
 * title =        {{Optimization of Cyclic Redundancy-Check Codes with 24
 *                 and 32 Parity Bits}},
 * journal =      IEEE Transactions on Communication,
 * year =         {1993},
 * volume =       {41},
 * number =       {6},
 * pages =        {},
 * month =        {June},
 *}
 * Used by the iSCSI driver, possibly others, and derived from
 * the iscsi-crc.c module of the linux-iscsi driver at
 * http://linux-iscsi.sourceforge.net.
 *
 * Following the example of lib/crc32, this function is intended to be
 * flexible and useful for all users.  Modules that currently have their
 * own crc32c, but hopefully may be able to use this one are:
 *  net/sctp (please add all your doco to here if you change to
 *            use this one!)
 *  <endoflist>
 *
 * Copyright (c) 2004 Cisco Systems, Inc.
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/crc32.h>

/* Values used for .cra_blocksize and .digestsize of the shash_alg below */
#define CHKSUM_BLOCK_SIZE        1
#define CHKSUM_DIGEST_SIZE        4

/*
 * Per-transform context: the seed each digest starts from. It is set to ~0
 * by crc32c_cra_init() and can be replaced through chksum_setkey().
 */
struct chksum_ctx {
        u32 key;
};

/*
 * Per-request state: the running CRC. It is seeded from the transform key
 * in chksum_init() and advanced by chksum_update().
 */
struct chksum_desc_ctx {
        u32 crc;
};

/*
 * Steps through buffer one byte at a time, calculates reflected
 * crc using table.
 */

/* Begin a new digest: seed the running CRC from the transform's key. */
static int chksum_init(struct shash_desc *desc)
{
        struct chksum_desc_ctx *state = shash_desc_ctx(desc);
        struct chksum_ctx *tfm_ctx = crypto_shash_ctx(desc->tfm);

        state->crc = tfm_ctx->key;
        return 0;
}

/*
 * Setting the seed allows arbitrary accumulators and flexible XOR policy
 * If your algorithm starts with ~0, then XOR with ~0 before you set
 * the seed.
 *
 * @key is read as a little-endian 32-bit value and may be unaligned.
 * Returns -EINVAL unless @keylen is exactly the size of the stored key.
 */
static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
                         unsigned int keylen)
{
        struct chksum_ctx *tfm_ctx = crypto_shash_ctx(tfm);

        if (keylen == sizeof(tfm_ctx->key)) {
                tfm_ctx->key = get_unaligned_le32(key);
                return 0;
        }

        return -EINVAL;
}

static int chksum_update(struct shash_desc *desc, const u8 *data,
                         unsigned int length)
{
        struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

        ctx->crc = __crc32c_le(ctx->crc, data, length);
        return 0;
}

/*
 * Emit the digest: the bitwise complement of the running CRC, stored to
 * @out as a little-endian 32-bit value (no alignment required).
 */
static int chksum_final(struct shash_desc *desc, u8 *out)
{
        struct chksum_desc_ctx *state = shash_desc_ctx(desc);
        u32 digest = ~state->crc;

        put_unaligned_le32(digest, out);
        return 0;
}

/*
 * Shared tail of finup/digest: continue the CRC in *@crcp over @len bytes
 * of @data and store the complemented result to @out in little-endian
 * order. *@crcp itself is left unchanged.
 */
static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out)
{
        u32 crc = __crc32c_le(*crcp, data, len);

        put_unaligned_le32(~crc, out);
        return 0;
}

/* Process a last chunk of data and emit the digest in one step. */
static int chksum_finup(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out)
{
        struct chksum_desc_ctx *state = shash_desc_ctx(desc);
        u32 *running = &state->crc;

        return __chksum_finup(running, data, len, out);
}

/*
 * One-shot digest: compute the checksum of @data starting directly from
 * the transform's key, without touching the per-request state.
 */
static int chksum_digest(struct shash_desc *desc, const u8 *data,
                         unsigned int length, u8 *out)
{
        struct chksum_ctx *tfm_ctx = crypto_shash_ctx(desc->tfm);
        u32 *seed = &tfm_ctx->key;

        return __chksum_finup(seed, data, length, out);
}

/* Transform constructor: start with the default seed of all ones. */
static int crc32c_cra_init(struct crypto_tfm *tfm)
{
        struct chksum_ctx *tfm_ctx = crypto_tfm_ctx(tfm);

        tfm_ctx->key = ~0;

        return 0;
}

/*
 * shash algorithm descriptor for "crc32c" (driver name "crc32c-generic",
 * priority 100). The key is flagged optional; crc32c_cra_init() supplies
 * the default seed when none is set.
 */
static struct shash_alg alg = {
        .digestsize                =        CHKSUM_DIGEST_SIZE,
        .setkey                        =        chksum_setkey,
        .init                =        chksum_init,
        .update                =        chksum_update,
        .final                =        chksum_final,
        .finup                =        chksum_finup,
        .digest                =        chksum_digest,
        .descsize                =        sizeof(struct chksum_desc_ctx),
        .base                        =        {
                .cra_name                =        "crc32c",
                .cra_driver_name        =        "crc32c-generic",
                .cra_priority                =        100,
                .cra_flags                =        CRYPTO_ALG_OPTIONAL_KEY,
                .cra_blocksize                =        CHKSUM_BLOCK_SIZE,
                .cra_ctxsize                =        sizeof(struct chksum_ctx),
                .cra_module                =        THIS_MODULE,
                .cra_init                =        crc32c_cra_init,
        }
};

/* Module init: register the crc32c shash algorithm and return the result. */
static int __init crc32c_mod_init(void)
{
        return crypto_register_shash(&alg);
}

/* Module exit: unregister the crc32c shash algorithm. */
static void __exit crc32c_mod_fini(void)
{
        crypto_unregister_shash(&alg);
}

subsys_initcall(crc32c_mod_init);
module_exit(crc32c_mod_fini);

MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("crc32c");
MODULE_ALIAS_CRYPTO("crc32c-generic");













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Dynamic queue limits (dql) - Definitions
 *
 * Copyright (c) 2011, Tom Herbert <therbert@google.com>
 *
 * This header file contains the definitions for dynamic queue limits (dql).
 * dql would be used in conjunction with a producer/consumer type queue
 * (possibly a HW queue).  Such a queue would have these general properties:
 *
 *   1) Objects are queued up to some limit specified as number of objects.
 *   2) Periodically a completion process executes which retires consumed
 *      objects.
 *   3) Starvation occurs when limit has been reached, all queued data has
 *      actually been consumed, but completion processing has not yet run
 *      so queuing new data is blocked.
 *   4) Minimizing the amount of queued data is desirable.
 *
 * The goal of dql is to calculate the limit as the minimum number of objects
 * needed to prevent starvation.
 *
 * The primary functions of dql are:
 *    dql_queued - called when objects are enqueued to record number of objects
 *    dql_avail - returns how many objects are available to be queued based
 *      on the object limit and how many objects are already enqueued
 *    dql_completed - called at completion time to indicate how many objects
 *      were retired from the queue
 *
 * The dql implementation does not implement any locking for the dql data
 * structures, the higher layer should provide this.  dql_queued should
 * be serialized to prevent concurrent execution of the function; this
 * is also true for dql_completed.  However, dql_queued and dql_completed can
 * be executed concurrently (i.e. they can be protected by different locks).
 */

#ifndef _LINUX_DQL_H
#define _LINUX_DQL_H

#ifdef __KERNEL__

#include <asm/bug.h>

/*
 * State of one dynamic queue limit.
 *
 * The fields are grouped by the path that touches them: the enqueue path
 * (dql_queued) first, then the completion path (dql_completed), then the
 * configuration. "limit" carries ____cacheline_aligned_in_smp, so the
 * completion-path group starts on its own cache line on SMP builds.
 */
struct dql {
        /* Fields accessed in enqueue path (dql_queued) */
        unsigned int        num_queued;                /* Total ever queued */
        unsigned int        adj_limit;                /* limit + num_completed */
        unsigned int        last_obj_cnt;                /* Count at last queuing */

        /* Fields accessed only by completion path (dql_completed) */

        unsigned int        limit ____cacheline_aligned_in_smp; /* Current limit */
        unsigned int        num_completed;                /* Total ever completed */

        unsigned int        prev_ovlimit;                /* Previous over limit */
        unsigned int        prev_num_queued;        /* Previous queue total */
        unsigned int        prev_last_obj_cnt;        /* Previous queuing cnt */

        unsigned int        lowest_slack;                /* Lowest slack found */
        unsigned long        slack_start_time;        /* Time slacks seen */

        /* Configuration */
        unsigned int        max_limit;                /* Max limit */
        unsigned int        min_limit;                /* Minimum limit */
        unsigned int        slack_hold_time;        /* Time to measure slack */
};

/*
 * Set some static maximums
 *
 * DQL_MAX_OBJECT bounds the count accepted by a single dql_queued() call
 * (enforced there with BUG_ON). DQL_MAX_LIMIT is defined so that
 * DQL_MAX_LIMIT + DQL_MAX_OBJECT equals UINT_MAX / 2.
 */
#define DQL_MAX_OBJECT (UINT_MAX / 16)
#define DQL_MAX_LIMIT ((UINT_MAX / 2) - DQL_MAX_OBJECT)

/*
 * Record number of objects queued. Assumes that caller has already checked
 * availability in the queue with dql_avail.
 *
 * @count must not exceed DQL_MAX_OBJECT; a larger value triggers BUG_ON().
 * last_obj_cnt is set to @count and num_queued is increased by it.
 */
static inline void dql_queued(struct dql *dql, unsigned int count)
{
        BUG_ON(count > DQL_MAX_OBJECT);

        dql->last_obj_cnt = count;

        /* We want to force a write first, so that cpu do not attempt
         * to get cache line containing last_obj_cnt, num_queued, adj_limit
         * in Shared state, but directly does a Request For Ownership
         * It is only a hint, we use barrier() only.
         */
        barrier();

        dql->num_queued += count;
}

/*
 * Returns how many objects can be queued, < 0 indicates over limit.
 *
 * Both fields are sampled with READ_ONCE(); the difference is computed in
 * unsigned arithmetic and then returned as an int.
 */
static inline int dql_avail(const struct dql *dql)
{
        unsigned int limit = READ_ONCE(dql->adj_limit);
        unsigned int queued = READ_ONCE(dql->num_queued);

        return limit - queued;
}

/* Record number of completed objects and recalculate the limit. */
void dql_completed(struct dql *dql, unsigned int count);

/* Reset dql state */
void dql_reset(struct dql *dql);

/* Initialize dql state */
void dql_init(struct dql *dql, unsigned int hold_time);

#endif /* __KERNEL__ */

#endif /* _LINUX_DQL_H */

















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * net busy poll support
 * Copyright(c) 2013 Intel Corporation.
 *
 * Author: Eliezer Tamir
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 */

#ifndef _LINUX_NET_BUSY_POLL_H
#define _LINUX_NET_BUSY_POLL_H

#include <linux/netdevice.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <net/ip.h>

/*                0 - Reserved to indicate value not set
 *     1..NR_CPUS - Reserved for sender_cpu
 *  NR_CPUS+1..~0 - Region available for NAPI IDs
 *
 * MIN_NAPI_ID is the first value of the NAPI ID region. sk_busy_loop() and
 * skb_mark_napi_id() compare against it to tell a valid NAPI ID from the
 * reserved values.
 */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))

#ifdef CONFIG_NET_RX_BUSY_POLL

struct napi_struct;
extern unsigned int sysctl_net_busy_read __read_mostly;
extern unsigned int sysctl_net_busy_poll __read_mostly;

/* True when the global sysctl_net_busy_poll value is non-zero. */
static inline bool net_busy_loop_on(void)
{
        return READ_ONCE(sysctl_net_busy_poll) != 0;
}

/*
 * True when the socket has a non-zero sk_ll_usec and the current task has
 * no signal pending.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        if (!READ_ONCE(sk->sk_ll_usec))
                return false;

        return !signal_pending(current);
}

bool sk_busy_loop_end(void *p, unsigned long start_time);

void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg);

#else /* CONFIG_NET_RX_BUSY_POLL */
/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is disabled: busy polling is never
 * on. Returns bool so the signature matches the real implementation.
 */
static inline bool net_busy_loop_on(void)
{
        return false;
}

/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is disabled: no socket can busy
 * loop. Takes a const pointer so the signature matches the real
 * implementation; @sk is not dereferenced.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        return false;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Current time for busy-poll deadline arithmetic: ktime_get_ns() shifted
 * right by 10 bits. Always 0 when CONFIG_NET_RX_BUSY_POLL is disabled.
 */
static inline unsigned long busy_loop_current_time(void)
{
#ifndef CONFIG_NET_RX_BUSY_POLL
        return 0;
#else
        return (unsigned long)(ktime_get_ns() >> 10);
#endif
}

/*
 * in poll/select we use the global sysctl_net_busy_poll value
 *
 * Returns true once the current busy-loop time is after @start_time plus
 * that budget. Also returns true when the budget is 0 or when
 * CONFIG_NET_RX_BUSY_POLL is disabled.
 */
static inline bool busy_loop_timeout(unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned long budget = READ_ONCE(sysctl_net_busy_poll);

        if (budget != 0) {
                unsigned long deadline = start_time + budget;
                unsigned long cur = busy_loop_current_time();

                return time_after(cur, deadline);
        }
#endif
        return true;
}

/* Per-socket deadline check that uses @sk->sk_ll_usec as the budget.
 *
 * Returns true once the current busy-loop time is after @start_time plus
 * the budget.  Also returns true when the budget is zero or when
 * CONFIG_NET_RX_BUSY_POLL is disabled (in which case @sk is not touched).
 */
static inline bool sk_busy_loop_timeout(struct sock *sk,
                                        unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        const unsigned long budget = READ_ONCE(sk->sk_ll_usec);

        if (budget != 0) {
                unsigned long deadline = start_time + budget;
                unsigned long cur = busy_loop_current_time();

                return time_after(cur, deadline);
        }
#endif
        return true;
}

/* Busy poll on behalf of @sk if it carries a valid NAPI ID (one that is at
 * least MIN_NAPI_ID).  Non-blocking callers pass a NULL termination
 * callback to napi_busy_loop(); blocking callers pass sk_busy_loop_end.
 * Does nothing when CONFIG_NET_RX_BUSY_POLL is disabled.
 */
static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int id = READ_ONCE(sk->sk_napi_id);

        if (id < MIN_NAPI_ID)
                return;

        if (nonblock)
                napi_busy_loop(id, NULL, sk);
        else
                napi_busy_loop(id, sk_busy_loop_end, sk);
#endif
}

/* Used in the NIC receive handler to stamp @skb with @napi's ID.  An skb
 * that already carries a valid NAPI ID (at least MIN_NAPI_ID) is left
 * alone.  Does nothing when CONFIG_NET_RX_BUSY_POLL is disabled.
 */
static inline void skb_mark_napi_id(struct sk_buff *skb,
                                    struct napi_struct *napi)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* an ID that is already valid must not be overwritten */
        if (skb->napi_id >= MIN_NAPI_ID)
                return;

        skb->napi_id = napi->napi_id;
#endif
}

/* Used in the protocol handler to propagate the napi_id to the socket.
 *
 * With CONFIG_NET_RX_BUSY_POLL, @skb's napi_id is stored into
 * @sk->sk_napi_id unconditionally.  sk_rx_queue_set() is called in every
 * configuration.
 */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
        sk_rx_queue_set(sk, skb);
}

/* Variant used for unconnected sockets: record @skb's napi_id in @sk only
 * if the socket does not have one yet (sk_napi_id reads as zero).  Does
 * nothing when CONFIG_NET_RX_BUSY_POLL is disabled.
 */
static inline void sk_mark_napi_id_once(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        if (READ_ONCE(sk->sk_napi_id))
                return;

        WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
}

#endif /* _LINUX_NET_BUSY_POLL_H */





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/tick.h>

/* Housekeeping flags.  Every enumerator occupies its own single bit. */
enum hk_flags {
        HK_FLAG_TIMER           = 1 << 0,
        HK_FLAG_RCU             = 1 << 1,
        HK_FLAG_MISC            = 1 << 2,
        HK_FLAG_SCHED           = 1 << 3,
        HK_FLAG_TICK            = 1 << 4,
        HK_FLAG_DOMAIN          = 1 << 5,
        HK_FLAG_WQ              = 1 << 6,
        HK_FLAG_MANAGED_IRQ     = 1 << 7,
        HK_FLAG_KTHREAD         = 1 << 8,
};

#ifdef CONFIG_CPU_ISOLATION
/*
 * Real implementations, available with CONFIG_CPU_ISOLATION and defined
 * outside this header.  housekeeping_overridden is the static key that
 * housekeeping_cpu() tests before calling housekeeping_test_cpu().
 */
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern bool housekeeping_enabled(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
extern void __init housekeeping_init(void);

#else

/* !CONFIG_CPU_ISOLATION stub: @flags is ignored and the id of the calling
 * CPU is returned.
 */
static inline int housekeeping_any_cpu(enum hk_flags flags)
{
        return smp_processor_id();
}

/* !CONFIG_CPU_ISOLATION stub: @flags is ignored and cpu_possible_mask is
 * returned.
 */
static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
        return cpu_possible_mask;
}

/* !CONFIG_CPU_ISOLATION stub: always reports false, whatever @flags is. */
static inline bool housekeeping_enabled(enum hk_flags flags)
{
        return false;
}

/* !CONFIG_CPU_ISOLATION stubs: both are no-ops. */
static inline void housekeeping_affine(struct task_struct *t,
                                       enum hk_flags flags) { }
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */

/* Tell whether @cpu is a housekeeping CPU for @flags.
 *
 * The answer is true unless CONFIG_CPU_ISOLATION is enabled and the
 * housekeeping_overridden static key is on, in which case the decision is
 * delegated to housekeeping_test_cpu().
 */
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
        bool is_housekeeping = true;

#ifdef CONFIG_CPU_ISOLATION
        if (static_branch_unlikely(&housekeeping_overridden))
                is_housekeeping = housekeeping_test_cpu(cpu, flags);
#endif
        return is_housekeeping;
}

#endif /* _LINUX_SCHED_ISOLATION_H */



























    8 











































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

/*
 * cgroup_rstat_lock is held (with interrupts disabled) across every flush.
 * The per-cpu cgroup_rstat_cpu_lock protects that CPU's updated lists; the
 * flush path takes it while already holding cgroup_rstat_lock, and
 * cgroup_rstat_updated() takes it on its own.
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* defined further down; needed by cgroup_rstat_flush_locked() */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's per-cpu rstat state for @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 *
 * Returns without taking any lock when @cgrp is the root or already appears
 * to be linked.  Otherwise @cpu's cgroup_rstat_cpu_lock is taken with
 * interrupts disabled while @cgrp and its not-yet-linked ancestors are
 * linked.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup *parent;
        unsigned long flags;

        /* nothing to do for root */
        if (!cgroup_parent(cgrp))
                return;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /*
         * put @cgrp and all ancestors on the corresponding updated lists;
         * the walk ends at the root, which has no parent
         */
        for (parent = cgroup_parent(cgrp); parent;
             cgrp = parent, parent = cgroup_parent(cgrp)) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* push @cgrp at the head of the parent's list */
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 *
 * Return: the cgroup that was just unlinked, or %NULL when the traversal is
 * over.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;

        /* @root was returned by the previous call: nothing is left */
        if (pos == root)
                return NULL;

        /*
         * We're gonna walk down to the first leaf and visit/remove it.  We
         * can pick whatever unvisited node as the starting point.
         */
        if (!pos)
                pos = root;
        else
                pos = cgroup_parent(pos);

        /* walk down to the first leaf */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                /* a self-pointing updated_children means an empty list */
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree.  As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        if (rstatc->updated_next) {
                struct cgroup *parent = cgroup_parent(pos);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
                struct cgroup_rstat_cpu *nrstatc;
                struct cgroup **nextp;

                /* find the link that currently points at @pos */
                nextp = &prstatc->updated_children;
                while (true) {
                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        if (*nextp == pos)
                                break;

                        /* reaching the terminator means @pos was not listed */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }

                *nextp = rstatc->updated_next;
                rstatc->updated_next = NULL;

                return pos;
        }

        /* only happens for @root */
        return NULL;
}

/*
 * see cgroup_rstat_flush()
 *
 * The caller must hold cgroup_rstat_lock.  For every possible CPU, each
 * updated cgroup under @cgrp is popped while that CPU's
 * cgroup_rstat_cpu_lock is held, its base stats are flushed, and then
 * ->css_rstat_flush() is called for each css on its ->rstat_css_list.
 *
 * When @may_sleep is true, cgroup_rstat_lock may be dropped and re-taken
 * between CPUs; it is held again on return.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;

                raw_spin_lock(cpu_lock);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(pos, cpu);

                        /* ->rstat_css_list is walked under RCU */
                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock(cpu_lock);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.  cgroup_rstat_lock is taken with interrupts
 * disabled for the flush, and may be dropped temporarily between CPUs.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.  The interrupt state is
 * saved and restored around the flush, and the flush is run with
 * may_sleep == false so cgroup_rstat_lock is never dropped midway.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.  It returns with cgroup_rstat_lock held and
 * interrupts disabled.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 *
 * Drops the cgroup_rstat_lock that cgroup_rstat_flush_hold() left held and
 * re-enables interrupts.
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_init - set up the per-cpu rstat state of a cgroup
 * @cgrp: cgroup being initialized
 *
 * Allocates @cgrp->rstat_cpu unless it is already set, then, for every
 * possible CPU, makes ->updated_children point back at @cgrp (the empty,
 * self-terminated list) and initializes ->bsync.
 *
 * Return: 0 on success, -ENOMEM if the per-cpu allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;
        int cpu;

        /* the root cgrp comes with rstat_cpu preallocated */
        if (!cgrp->rstat_cpu)
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
        if (!cgrp->rstat_cpu)
                return -ENOMEM;

        /* an empty ->updated_children list points at its owner */
        for_each_possible_cpu(cpu) {
                rstatc = cgroup_rstat_cpu(cgrp, cpu);
                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}

/**
 * cgroup_rstat_exit - tear down the per-cpu rstat state of a cgroup
 * @cgrp: cgroup being destroyed
 *
 * Flushes @cgrp, then verifies on every possible CPU that @cgrp has no
 * updated children and is not linked on a list.  If either check fails a
 * warning is emitted and the per-cpu area is left allocated; otherwise it
 * is freed and @cgrp->rstat_cpu is cleared.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check: nothing may still reference the per-cpu state */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
                        return;
                if (WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialize every possible CPU's cgroup_rstat_cpu_lock,
 * then the rstat state of cgrp_dfl_root.cgrp.  A failure of the latter is
 * fatal (BUG_ON).
 */
void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

        BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
/* Add every cputime field of @src_bstat to the same field of @dst_bstat. */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime =
                dst_bstat->cputime.utime + src_bstat->cputime.utime;
        dst_bstat->cputime.stime =
                dst_bstat->cputime.stime + src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime =
                dst_bstat->cputime.sum_exec_runtime +
                src_bstat->cputime.sum_exec_runtime;
}

/* Subtract every cputime field of @src_bstat from the same field of
 * @dst_bstat.
 */
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime =
                dst_bstat->cputime.utime - src_bstat->cputime.utime;
        dst_bstat->cputime.stime =
                dst_bstat->cputime.stime - src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime =
                dst_bstat->cputime.sum_exec_runtime -
                src_bstat->cputime.sum_exec_runtime;
}

/*
 * Fold @cgrp's per-cpu base stats for @cpu into @cgrp->bstat and, when
 * @cgrp has a parent, push whatever @cgrp->bstat gained since the previous
 * push into the parent's bstat.  The last_bstat copies record how much has
 * already been propagated at each level, so only deltas move upwards.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup_base_stat cur, delta;
        unsigned seq;

        /* fetch the current per-cpu values; retry until ->bsync is stable */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                cur.cputime = rstatc->bstat.cputime;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate percpu delta to global */
        delta = cur;
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        /* last_bstat now equals the snapshot that was just accounted */
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /* propagate global delta to parent */
        if (parent) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}

/*
 * Open a cputime update on the current CPU: fetch @cgrp's rstat_cpu with
 * get_cpu_ptr() and begin a ->bsync update section.  The returned pointer
 * must be handed to cgroup_base_stat_cputime_account_end().
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        u64_stats_update_begin(&rstatc->bsync);
        return rstatc;
}

/*
 * Close the update opened by cgroup_base_stat_cputime_account_begin(): end
 * the ->bsync update section, mark @cgrp as updated on the current CPU and
 * release @rstatc with put_cpu_ptr().
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc)
{
        u64_stats_update_end(&rstatc->bsync);
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

/*
 * Add @delta_exec to the sum_exec_runtime of @cgrp's base stats on the
 * current CPU.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

/*
 * Account @delta_exec to @cgrp's per-cpu base stats on the current CPU,
 * bucketed by @index: CPUTIME_USER and CPUTIME_NICE are added to utime;
 * CPUTIME_SYSTEM, CPUTIME_IRQ and CPUTIME_SOFTIRQ are added to stime.  Any
 * other @index changes no counter, but the update section is still opened
 * and closed, so @cgrp is still marked as updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc =
                cgroup_base_stat_cputime_account_begin(cgrp);

        if (index == CPUTIME_USER || index == CPUTIME_NICE)
                rstatc->bstat.cputime.utime += delta_exec;
        else if (index == CPUTIME_SYSTEM || index == CPUTIME_IRQ ||
                 index == CPUTIME_SOFTIRQ)
                rstatc->bstat.cputime.stime += delta_exec;

        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

/*
 * Compute the cputime of the root cgroup from the per-cpu kernel cpustat
 * data.  The fields are grouped the same way
 * __cgroup_account_cputime_field() groups them for a cgroup: USER and NICE
 * count as utime; SYSTEM, IRQ and SOFTIRQ count as stime.  sum_exec_runtime
 * is utime plus stime plus STEAL.
 */
static void root_cgroup_cputime(struct task_cputime *cputime)
{
        int cpu;

        cputime->stime = 0;
        cputime->utime = 0;
        cputime->sum_exec_runtime = 0;

        for_each_possible_cpu(cpu) {
                struct kernel_cpustat kcpustat;
                u64 user, sys;

                kcpustat_cpu_fetch(&kcpustat, cpu);

                user = kcpustat.cpustat[CPUTIME_USER] +
                       kcpustat.cpustat[CPUTIME_NICE];
                sys = kcpustat.cpustat[CPUTIME_SYSTEM] +
                      kcpustat.cpustat[CPUTIME_IRQ] +
                      kcpustat.cpustat[CPUTIME_SOFTIRQ];

                cputime->utime += user;
                cputime->stime += sys;
                cputime->sum_exec_runtime +=
                        user + sys + kcpustat.cpustat[CPUTIME_STEAL];
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct task_cputime cputime;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&cputime);
                usage = cputime.sum_exec_runtime;
                utime = cputime.utime;
                stime = cputime.stime;
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);
}





























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sysfs.h - definitions for the device driver filesystem
 *
 * Copyright (c) 2001,2002 Patrick Mochel
 * Copyright (c) 2004 Silicon Graphics, Inc.
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#ifndef _SYSFS_H_
#define _SYSFS_H_

#include <linux/kernfs.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/kobject_ns.h>
#include <linux/stat.h>
#include <linux/atomic.h>

struct kobject;
struct module;
struct bin_attribute;
enum kobj_ns_type;

/*
 * struct attribute - name and mode common to every sysfs attribute
 *
 * The lockdep-related members only exist with CONFIG_DEBUG_LOCK_ALLOC.
 */
struct attribute {
        /* attribute name; the __ATTR*() macros store the stringified _name */
        const char                *name;
        /* permission bits, optionally OR-ed with SYSFS_PREALLOC */
        umode_t                        mode;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* set to true by __ATTR_IGNORE_LOCKDEP() */
        bool                        ignore_lockdep:1;
        /* pointed at a static lock_class_key by sysfs_attr_init() */
        struct lock_class_key        *key;
        /* NOTE(review): presumably the key used for static attributes -- confirm */
        struct lock_class_key        skey;
#endif
};

/**
 *        sysfs_attr_init - initialize a dynamically allocated sysfs attribute
 *        @attr: struct attribute to initialize
 *
 *        Initialize a dynamically allocated struct attribute so we can
 *        make lockdep happy.  This is a new requirement for attributes
 *        and initially this is only needed when lockdep is enabled.
 *        Lockdep gives a nice error when your attribute is added to
 *        sysfs if you don't have this.
 *
 *        With CONFIG_DEBUG_LOCK_ALLOC this points @attr->key at a static
 *        lock_class_key declared at the expansion site; otherwise it
 *        expands to an empty statement.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define sysfs_attr_init(attr)                                \
do {                                                        \
        static struct lock_class_key __key;                \
                                                        \
        (attr)->key = &__key;                                \
} while (0)
#else
#define sysfs_attr_init(attr) do {} while (0)
#endif

/**
 * struct attribute_group - data structure used to declare an attribute group.
 * @name:        Optional: Attribute group name
 *                If specified, the attribute group will be created in
 *                a new subdirectory with this name.
 * @is_visible:        Optional: Function to return permissions associated with an
 *                attribute of the group. Will be called repeatedly for each
 *                non-binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC are accepted. Must
 *                return 0 if an attribute is not visible. The returned value
 *                will replace static permissions defined in struct attribute.
 * @is_bin_visible:
 *                Optional: Function to return permissions associated with a
 *                binary attribute of the group. Will be called repeatedly
 *                for each binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC are accepted. Must
 *                return 0 if a binary attribute is not visible. The returned
 *                value will replace static permissions defined in
 *                struct bin_attribute.
 * @attrs:        Pointer to NULL terminated list of attributes.
 * @bin_attrs:        Pointer to NULL terminated list of binary attributes.
 *                Either attrs or bin_attrs or both must be provided.
 *
 * NOTE(review): the int argument of both callbacks is presumably the index
 * of the attribute within its list -- confirm against the group code.
 */
struct attribute_group {
        const char                *name;
        umode_t                        (*is_visible)(struct kobject *,
                                              struct attribute *, int);
        umode_t                        (*is_bin_visible)(struct kobject *,
                                                  struct bin_attribute *, int);
        struct attribute        **attrs;
        struct bin_attribute        **bin_attrs;
};

/*
 * Use these macros to make defining attributes easier.
 * See include/linux/device.h for examples.
 */

/* extra mode bit that __ATTR_PREALLOC() ORs into the permission bits */
#define SYSFS_PREALLOC 010000

/*
 * Initializer for a structure that embeds a struct attribute as .attr and
 * has .show/.store members: the name is the stringified _name and the mode
 * is _mode passed through VERIFY_OCTAL_PERMISSIONS().
 */
#define __ATTR(_name, _mode, _show, _store) {                                \
        .attr = {.name = __stringify(_name),                                \
                 .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _show,                                                \
        .store        = _store,                                                \
}

/* same as __ATTR(), with SYSFS_PREALLOC OR-ed into the mode */
#define __ATTR_PREALLOC(_name, _mode, _show, _store) {                        \
        .attr = {.name = __stringify(_name),                                \
                 .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\
        .show        = _show,                                                \
        .store        = _store,                                                \
}

/*
 * Convenience variants that derive the callback names from _name:
 * <_name>_show and/or <_name>_store must exist at the expansion site.
 */

/* read-only, mode 0444, .show = <_name>_show */
#define __ATTR_RO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0444 },                \
        .show        = _name##_show,                                                \
}

/* read-only with a caller-chosen mode */
#define __ATTR_RO_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
}

/* read-write with a caller-chosen mode */
#define __ATTR_RW_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
        .store        = _name##_store,                                        \
}

/* write-only, mode 0200, .store = <_name>_store */
#define __ATTR_WO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0200 },                \
        .store        = _name##_store,                                        \
}

/* read-write, mode 0644 */
#define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)

/* initializer with a NULL name */
#define __ATTR_NULL { .attr = { .name = NULL } }

/*
 * Like __ATTR(), but with CONFIG_DEBUG_LOCK_ALLOC it also sets
 * .attr.ignore_lockdep.  Note that this variant stores _mode as given,
 * without VERIFY_OCTAL_PERMISSIONS().  Without CONFIG_DEBUG_LOCK_ALLOC it
 * is simply an alias for __ATTR.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) {        \
        .attr = {.name = __stringify(_name), .mode = _mode,        \
                        .ignore_lockdep = true },                \
        .show                = _show,                                \
        .store                = _store,                                \
}
#else
#define __ATTR_IGNORE_LOCKDEP        __ATTR
#endif

/*
 * Define the NULL-terminated array <_name>_groups whose only entry is
 * &<_name>_group; <_name>_group must already be defined.
 */
#define __ATTRIBUTE_GROUPS(_name)                                \
static const struct attribute_group *_name##_groups[] = {        \
        &_name##_group,                                                \
        NULL,                                                        \
}

/*
 * Define <_name>_group with .attrs = <_name>_attrs, followed by the
 * matching <_name>_groups array; <_name>_attrs must already be defined.
 */
#define ATTRIBUTE_GROUPS(_name)                                        \
static const struct attribute_group _name##_group = {                \
        .attrs = _name##_attrs,                                        \
};                                                                \
__ATTRIBUTE_GROUPS(_name)

struct file;
struct vm_area_struct;

/*
 * struct bin_attribute - a binary sysfs attribute
 *
 * NOTE(review): the trailing (char *, loff_t, size_t) arguments of read and
 * write are presumably buffer, offset and count -- confirm against the
 * sysfs file code.
 */
struct bin_attribute {
        /* embedded generic attribute (name and mode) */
        struct attribute        attr;
        /* set from _size by the __BIN_ATTR*() initializers */
        size_t                        size;
        /* not interpreted anywhere in this header */
        void                        *private;
        ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *,
                        char *, loff_t, size_t);
        ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
                         char *, loff_t, size_t);
        int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr,
                    struct vm_area_struct *vma);
};

/**
 *        sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute
 *        @bin_attr: struct bin_attribute to initialize
 *
 *        Initialize a dynamically allocated struct bin_attribute so we
 *        can make lockdep happy.  This is a new requirement for
 *        attributes and initially this is only needed when lockdep is
 *        enabled.  Lockdep gives a nice error when your attribute is
 *        added to sysfs if you don't have this.
 *
 *        Implemented as sysfs_attr_init() on the embedded @bin_attr->attr.
 */
#define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)

/*
 * Macros that make it easier to create static binary attributes.
 *
 * Each __BIN_ATTR* macro expands to a brace-enclosed initializer that sets
 * the .attr, .read/.write and .size members of a struct bin_attribute.
 */

/* Generic form: caller supplies mode, both callbacks and the size. */
#define __BIN_ATTR(_name, _mode, _read, _write, _size) {                \
        .attr = { .name = __stringify(_name), .mode = _mode },                \
        .read        = _read,                                                \
        .write        = _write,                                                \
        .size        = _size,                                                \
}

/* Mode 0444; .read is <_name>_read, .write is left unset. */
#define __BIN_ATTR_RO(_name, _size) {                                        \
        .attr        = { .name = __stringify(_name), .mode = 0444 },                \
        .read        = _name##_read,                                                \
        .size        = _size,                                                \
}

/* Mode 0200; .write is <_name>_write, .read is left unset. */
#define __BIN_ATTR_WO(_name, _size) {                                        \
        .attr        = { .name = __stringify(_name), .mode = 0200 },                \
        .write        = _name##_write,                                        \
        .size        = _size,                                                \
}

/* Mode 0644 with both <_name>_read and <_name>_write. */
#define __BIN_ATTR_RW(_name, _size)                                        \
        __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size)

/* Alias for __ATTR_NULL. */
#define __BIN_ATTR_NULL __ATTR_NULL

/*
 * Variable-defining counterparts of the __BIN_ATTR* initializers: each one
 * defines "struct bin_attribute bin_attr_<_name>" (no storage-class
 * specifier, no trailing semicolon) initialized with the matching macro.
 */
#define BIN_ATTR(_name, _mode, _read, _write, _size)                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read,        \
                                        _write, _size)

#define BIN_ATTR_RO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size)

#define BIN_ATTR_WO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)

#define BIN_ATTR_RW(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)

/*
 * Pair of callbacks taking a kobject and an attribute: show() receives a
 * writable char buffer, store() receives a const buffer plus a size_t
 * (presumably the number of bytes supplied - confirm with callers).
 */
struct sysfs_ops {
        ssize_t        (*show)(struct kobject *, struct attribute *, char *);
        ssize_t        (*store)(struct kobject *, struct attribute *, const char *, size_t);
};

#ifdef CONFIG_SYSFS

/*
 * Directory and mount-point helpers.  Only declared here; the int-returning
 * ones are __must_check, so callers have to consume the result.
 */
int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns);
void sysfs_remove_dir(struct kobject *kobj);
int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
                                     const void *new_ns);
int __must_check sysfs_move_dir_ns(struct kobject *kobj,
                                   struct kobject *new_parent_kobj,
                                   const void *new_ns);
int __must_check sysfs_create_mount_point(struct kobject *parent_kobj,
                                          const char *name);
void sysfs_remove_mount_point(struct kobject *parent_kobj,
                              const char *name);

/*
 * Regular attribute file helpers.  The *_files() variants take an array of
 * attribute pointers instead of a single attribute.
 */
int __must_check sysfs_create_file_ns(struct kobject *kobj,
                                      const struct attribute *attr,
                                      const void *ns);
int __must_check sysfs_create_files(struct kobject *kobj,
                                   const struct attribute * const *attr);
int __must_check sysfs_chmod_file(struct kobject *kobj,
                                  const struct attribute *attr, umode_t mode);
/* The node returned by "break" is the argument expected by "unbreak". */
struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
                                                  const struct attribute *attr);
void sysfs_unbreak_active_protection(struct kernfs_node *kn);
void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
                          const void *ns);
bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr);
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr);

/* Binary attribute file helpers (struct bin_attribute). */
int __must_check sysfs_create_bin_file(struct kobject *kobj,
                                       const struct bin_attribute *attr);
void sysfs_remove_bin_file(struct kobject *kobj,
                           const struct bin_attribute *attr);

/* Link helpers: each takes the owning kobject, a target kobject and a name. */
int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target,
                                   const char *name);
int __must_check sysfs_create_link_nowarn(struct kobject *kobj,
                                          struct kobject *target,
                                          const char *name);
void sysfs_remove_link(struct kobject *kobj, const char *name);

/* Namespace-aware rename; sysfs_rename_link() below passes a NULL namespace. */
int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target,
                         const char *old_name, const char *new_name,
                         const void *new_ns);

void sysfs_delete_link(struct kobject *dir, struct kobject *targ,
                        const char *name);

/*
 * Attribute-group helpers.  The *_groups() variants take an array of group
 * pointers; the singular forms take one group.
 */
int __must_check sysfs_create_group(struct kobject *kobj,
                                    const struct attribute_group *grp);
int __must_check sysfs_create_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int __must_check sysfs_update_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int sysfs_update_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_remove_group(struct kobject *kobj,
                        const struct attribute_group *grp);
void sysfs_remove_groups(struct kobject *kobj,
                         const struct attribute_group **groups);
/* Helpers that address an existing group by its name string. */
int sysfs_add_file_to_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
void sysfs_remove_file_from_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
                            struct kobject *target, const char *link_name);
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
                                  const char *link_name);
int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                         struct kobject *target_kobj,
                                         const char *target_name,
                                         const char *symlink_name);

/* Notification for the attribute named @attr, optionally under @dir, of @kobj. */
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);

/* Initialization entry point; the result must be checked. */
int __must_check sysfs_init(void);

/*
 * sysfs_enable_ns - forward a namespace-enable request to kernfs
 * @kn: kernfs node handed unchanged to kernfs_enable_ns()
 *
 * Thin wrapper with no return value.  The call is written as a plain
 * statement rather than "return <expression>;", which ISO C does not allow
 * in a function whose return type is void.
 */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
        kernfs_enable_ns(kn);
}

/*
 * Ownership helpers: each takes the new owner as a (kuid, kgid) pair and
 * targets a named file, the kobject itself, a named link, or attribute
 * group(s), as the function name indicates.
 */
int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
                            kgid_t kgid);
int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid);
int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
                            const char *name, kuid_t kuid, kgid_t kgid);
int sysfs_groups_change_owner(struct kobject *kobj,
                              const struct attribute_group **groups,
                              kuid_t kuid, kgid_t kgid);
int sysfs_group_change_owner(struct kobject *kobj,
                             const struct attribute_group *groups, kuid_t kuid,
                             kgid_t kgid);
/* printf-style formatters; __printf() lets the compiler check the arguments. */
__printf(2, 3)
int sysfs_emit(char *buf, const char *fmt, ...);
__printf(3, 4)
int sysfs_emit_at(char *buf, int at, const char *fmt, ...);

#else /* CONFIG_SYSFS */

/* Stub used when CONFIG_SYSFS is not set: does nothing and returns 0. */
static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_dir(struct kobject *kobj)
{
}

/* Stub used when CONFIG_SYSFS is not set: ignores its arguments, returns 0. */
static inline int sysfs_rename_dir_ns(struct kobject *kobj,
                                      const char *new_name, const void *new_ns)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: ignores its arguments, returns 0. */
static inline int sysfs_move_dir_ns(struct kobject *kobj,
                                    struct kobject *new_parent_kobj,
                                    const void *new_ns)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_mount_point(struct kobject *parent_kobj,
                                           const char *name)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_mount_point(struct kobject *parent_kobj,
                                            const char *name)
{
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_file_ns(struct kobject *kobj,
                                       const struct attribute *attr,
                                       const void *ns)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_files(struct kobject *kobj,
                                    const struct attribute * const *attr)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_chmod_file(struct kobject *kobj,
                                   const struct attribute *attr, umode_t mode)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: there is no node, so returns NULL. */
static inline struct kernfs_node *
sysfs_break_active_protection(struct kobject *kobj,
                              const struct attribute *attr)
{
        return NULL;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_file_ns(struct kobject *kobj,
                                        const struct attribute *attr,
                                        const void *ns)
{
}

/* Stub used when CONFIG_SYSFS is not set: removes nothing, returns false. */
static inline bool sysfs_remove_file_self(struct kobject *kobj,
                                          const struct attribute *attr)
{
        return false;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_files(struct kobject *kobj,
                                     const struct attribute * const *attr)
{
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_bin_file(struct kobject *kobj,
                                        const struct bin_attribute *attr)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_bin_file(struct kobject *kobj,
                                         const struct bin_attribute *attr)
{
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_link(struct kobject *kobj,
                                    struct kobject *target, const char *name)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_link_nowarn(struct kobject *kobj,
                                           struct kobject *target,
                                           const char *name)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_link(struct kobject *kobj, const char *name)
{
}

/* Stub used when CONFIG_SYSFS is not set: renames nothing and returns 0. */
static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t,
                                       const char *old_name,
                                       const char *new_name, const void *ns)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_delete_link(struct kobject *k, struct kobject *t,
                                     const char *name)
{
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_group(struct kobject *kobj,
                                     const struct attribute_group *grp)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: creates nothing and returns 0. */
static inline int sysfs_create_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: updates nothing and returns 0. */
static inline int sysfs_update_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: updates nothing and returns 0. */
static inline int sysfs_update_group(struct kobject *kobj,
                                const struct attribute_group *grp)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_group(struct kobject *kobj,
                                      const struct attribute_group *grp)
{
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_groups(struct kobject *kobj,
                                       const struct attribute_group **groups)
{
}

/* Stub used when CONFIG_SYSFS is not set: adds nothing and returns 0. */
static inline int sysfs_add_file_to_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_file_from_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
}

/* Stub used when CONFIG_SYSFS is not set: merges nothing and returns 0. */
static inline int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
}

/* Stub used when CONFIG_SYSFS is not set: adds nothing and returns 0. */
static inline int sysfs_add_link_to_group(struct kobject *kobj,
                const char *group_name, struct kobject *target,
                const char *link_name)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_remove_link_from_group(struct kobject *kobj,
                const char *group_name, const char *link_name)
{
}

/* Stub used when CONFIG_SYSFS is not set: links nothing and returns 0. */
static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                                       struct kobject *target_kobj,
                                                       const char *target_name,
                                                       const char *symlink_name)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_notify(struct kobject *kobj, const char *dir,
                                const char *attr)
{
}

/*
 * Stub used when CONFIG_SYSFS is not set: nothing to initialize, returns 0.
 * Still __must_check, so callers have to consume the result.
 */
static inline int __must_check sysfs_init(void)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: intentionally empty. */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_file_change_owner(struct kobject *kobj,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_link_change_owner(struct kobject *kobj,
                                          struct kobject *targ,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_groups_change_owner(struct kobject *kobj,
                          const struct attribute_group **groups,
                          kuid_t kuid, kgid_t kgid)
{
        return 0;
}

/* Stub used when CONFIG_SYSFS is not set: changes nothing and returns 0. */
static inline int sysfs_group_change_owner(struct kobject *kobj,
                                           const struct attribute_group *groups,
                                           kuid_t kuid, kgid_t kgid)
{
        return 0;
}

/*
 * Stub used when CONFIG_SYSFS is not set: writes nothing to @buf and
 * returns 0.  The __printf() annotation still applies to the arguments.
 */
__printf(2, 3)
static inline int sysfs_emit(char *buf, const char *fmt, ...)
{
        return 0;
}

/*
 * Stub used when CONFIG_SYSFS is not set: writes nothing to @buf and
 * returns 0.  The __printf() annotation still applies to the arguments.
 */
__printf(3, 4)
static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
{
        return 0;
}
#endif /* CONFIG_SYSFS */

/*
 * Convenience wrapper: sysfs_create_file_ns() with a NULL namespace.
 * Returns whatever that call returns; the result must be checked.
 */
static inline int __must_check sysfs_create_file(struct kobject *kobj,
                                                 const struct attribute *attr)
{
        return sysfs_create_file_ns(kobj, attr, NULL);
}

/* Convenience wrapper: sysfs_remove_file_ns() with a NULL namespace. */
static inline void sysfs_remove_file(struct kobject *kobj,
                                     const struct attribute *attr)
{
        sysfs_remove_file_ns(kobj, attr, NULL);
}

/*
 * Convenience wrapper: sysfs_rename_link_ns() with a NULL namespace.
 * Returns whatever that call returns.
 */
static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target,
                                    const char *old_name, const char *new_name)
{
        return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
}

/* Thin wrapper: passes @kn straight to kernfs_notify(). */
static inline void sysfs_notify_dirent(struct kernfs_node *kn)
{
        kernfs_notify(kn);
}

/*
 * Thin wrapper: returns the result of kernfs_find_and_get(@parent, @name).
 * NOTE(review): the "_and_get" name suggests the caller receives a
 * reference to drop later (e.g. with sysfs_put()) - confirm in kernfs.
 */
static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent,
                                                   const char *name)
{
        return kernfs_find_and_get(parent, name);
}

/* Calls kernfs_get() on @kn and hands the same pointer back to the caller. */
static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn)
{
        kernfs_get(kn);
        return kn;
}

/* Thin wrapper: passes @kn straight to kernfs_put(). */
static inline void sysfs_put(struct kernfs_node *kn)
{
        kernfs_put(kn);
}

#endif /* _SYSFS_H_ */























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This header file contains public constants and structures used by
 * the SCSI initiator code.
 */
#ifndef _SCSI_SCSI_H
#define _SCSI_SCSI_H

#include <linux/types.h>
#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <scsi/scsi_common.h>
#include <scsi/scsi_proto.h>

struct scsi_cmnd;

/*
 * Timeout constants expressed as multiples of HZ (defined elsewhere;
 * presumably the tick rate, making this ten seconds - confirm).
 */
enum scsi_timeouts {
        SCSI_DEFAULT_EH_TIMEOUT                = 10 * HZ,
};

/*
 * DIX-capable adapters effectively support infinite chaining for the
 * protection information scatterlist, so the segment limit is simply set
 * to 0xFFFF (the largest value that fits in 16 bits).
 */
#define SCSI_MAX_PROT_SG_SEGMENTS        0xFFFF

/*
 * Special value for scanning to specify scanning or rescanning of all
 * possible channels, (target) ids, or luns on a given shost.
 *
 * The value is an int with every bit set.  The expansion is parenthesized
 * so that it remains a single operand wherever the macro is used.
 */
#define SCAN_WILD_CARD        (~0)

/**
 * scsi_status_is_good - decide whether a status counts as success
 *
 * @status: the status passed up from the driver (including host and
 *          driver components)
 *
 * Only bits 1..7 of @status take part in the comparison.  Returns 1 when
 * that value is one of the known good conditions that may be treated as
 * "command completed normally", and 0 otherwise.
 */
static inline int scsi_status_is_good(int status)
{
        /*
         * FIXME: bit0 is listed as reserved in SCSI-2, but is
         * significant in SCSI-3.  For now, we follow the SCSI-2
         * behaviour and ignore reserved bits.
         */
        const int sam_stat = status & 0xfe;

        if (sam_stat == SAM_STAT_GOOD || sam_stat == SAM_STAT_CONDITION_MET)
                return 1;

        /* The two "intermediate" statuses are obsolete in SAM-4 */
        if (sam_stat == SAM_STAT_INTERMEDIATE ||
            sam_stat == SAM_STAT_INTERMEDIATE_CONDITION_MET)
                return 1;

        /* FIXME: this is obsolete in SAM-3 */
        return sam_stat == SAM_STAT_COMMAND_TERMINATED;
}


/*
 * Standard mode-select header prepended to all mode-select commands.
 * Twelve single-byte fields; the multi-byte quantities are split into
 * hi/med/lo bytes (presumably most-significant first - confirm against
 * the SCSI specification).
 */

struct ccs_modesel_head {
        __u8 _r1;                        /* reserved */
        __u8 medium;                /* device-specific medium type */
        __u8 _r2;                        /* reserved */
        __u8 block_desc_length;        /* block descriptor length */
        __u8 density;                /* device-specific density code */
        __u8 number_blocks_hi;        /* number of blocks in this block desc */
        __u8 number_blocks_med;
        __u8 number_blocks_lo;
        __u8 _r3;                /* presumably reserved, like _r1 and _r2 */
        __u8 block_length_hi;        /* block length for blocks in this desc */
        __u8 block_length_med;
        __u8 block_length_lo;
};

/*
 * The Well Known LUNS (SAM-3) in our int representation of a LUN.
 * Each well-known LUN is SCSI_W_LUN_BASE plus a small offset, so all of
 * them share the same value in bits 8..15 (0xc1).
 */
#define SCSI_W_LUN_BASE 0xc100
#define SCSI_W_LUN_REPORT_LUNS (SCSI_W_LUN_BASE + 1)
#define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2)
#define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)

/*
 * scsi_is_wlun - test whether @lun lies in the well-known LUN range
 *
 * Compares bits 8..15 of @lun with SCSI_W_LUN_BASE; the low byte and all
 * higher bits are ignored.  Returns 1 on a match and 0 otherwise.
 */
static inline int scsi_is_wlun(u64 lun)
{
        const u64 wlun_mask = 0xff00;

        if ((lun & wlun_mask) != SCSI_W_LUN_BASE)
                return 0;

        return 1;
}


/*
 *  MESSAGE CODES
 *
 *  The indented EXTENDED_* values are listed under EXTENDED_MESSAGE and
 *  have their own numbering starting at 0x00 (presumably sub-codes carried
 *  inside an extended message - confirm against the SCSI specification).
 */

#define COMMAND_COMPLETE    0x00
#define EXTENDED_MESSAGE    0x01
#define     EXTENDED_MODIFY_DATA_POINTER    0x00
#define     EXTENDED_SDTR                   0x01
#define     EXTENDED_EXTENDED_IDENTIFY      0x02    /* SCSI-I only */
#define     EXTENDED_WDTR                   0x03
#define     EXTENDED_PPR                    0x04
#define     EXTENDED_MODIFY_BIDI_DATA_PTR   0x05
#define SAVE_POINTERS       0x02
#define RESTORE_POINTERS    0x03
#define DISCONNECT          0x04
#define INITIATOR_ERROR     0x05
#define ABORT_TASK_SET      0x06
#define MESSAGE_REJECT      0x07
#define NOP                 0x08
#define MSG_PARITY_ERROR    0x09
#define LINKED_CMD_COMPLETE 0x0a
#define LINKED_FLG_CMD_COMPLETE 0x0b
#define TARGET_RESET        0x0c
#define ABORT_TASK          0x0d
#define CLEAR_TASK_SET      0x0e
#define INITIATE_RECOVERY   0x0f            /* SCSI-II only */
#define RELEASE_RECOVERY    0x10            /* SCSI-II only */
#define CLEAR_ACA           0x16
#define LOGICAL_UNIT_RESET  0x17
#define SIMPLE_QUEUE_TAG    0x20
#define HEAD_OF_QUEUE_TAG   0x21
#define ORDERED_QUEUE_TAG   0x22
#define IGNORE_WIDE_RESIDUE 0x23
#define ACA                 0x24
#define QAS_REQUEST         0x55

/* Old SCSI2 names (aliases of the codes above), don't use in new code */
#define BUS_DEVICE_RESET    TARGET_RESET
#define ABORT               ABORT_TASK_SET

/*
 * Host byte codes (extracted from a result word with host_byte())
 */

#define DID_OK          0x00        /* NO error                                */
#define DID_NO_CONNECT  0x01        /* Couldn't connect before timeout period  */
#define DID_BUS_BUSY    0x02        /* BUS stayed busy through time out period */
#define DID_TIME_OUT    0x03        /* TIMED OUT for other reason              */
#define DID_BAD_TARGET  0x04        /* BAD target.                             */
#define DID_ABORT       0x05        /* Told to abort for some other reason     */
#define DID_PARITY      0x06        /* Parity error                            */
#define DID_ERROR       0x07        /* Internal error                          */
#define DID_RESET       0x08        /* Reset by somebody.                      */
#define DID_BAD_INTR    0x09        /* Got an interrupt we weren't expecting.  */
#define DID_PASSTHROUGH 0x0a        /* Force command past mid-layer            */
#define DID_SOFT_ERROR  0x0b        /* The low level driver just wishes a retry */
#define DID_IMM_RETRY   0x0c        /* Retry without decrementing retry count  */
#define DID_REQUEUE        0x0d        /* Requeue command (no immediate retry) also
                                 * without decrementing the retry count           */
#define DID_TRANSPORT_DISRUPTED 0x0e /* Transport error disrupted execution
                                      * and the driver blocked the port to
                                      * recover the link. Transport class will
                                      * retry or fail IO */
#define DID_TRANSPORT_FAILFAST        0x0f /* Transport class fastfailed the io */
#define DID_TARGET_FAILURE 0x10 /* Permanent target failure, do not retry on
                                 * other paths */
#define DID_NEXUS_FAILURE 0x11  /* Permanent nexus failure, retry on other
                                 * paths might yield different results */
#define DID_ALLOC_FAILURE 0x12  /* Space allocation on the device failed */
#define DID_MEDIUM_ERROR  0x13  /* Medium error */
#define DRIVER_OK       0x00        /* Driver status                           */

/*
 *  These indicate the error that occurred, and what is available.
 *  They continue the DRIVER_* numbering that starts with DRIVER_OK (0x00).
 */

#define DRIVER_BUSY         0x01
#define DRIVER_SOFT         0x02
#define DRIVER_MEDIA        0x03
#define DRIVER_ERROR        0x04

#define DRIVER_INVALID      0x05
#define DRIVER_TIMEOUT      0x06
#define DRIVER_HARD         0x07
#define DRIVER_SENSE            0x08

/*
 * Internal return values.
 * Numbered consecutively from 0x2001, a range distinct from the
 * SCSI_MLQUEUE_* values (0x1055 and up) defined in this header.
 */
enum scsi_disposition {
        NEEDS_RETRY                = 0x2001,
        SUCCESS                        = 0x2002,
        FAILED                        = 0x2003,
        QUEUED                        = 0x2004,
        SOFT_ERROR                = 0x2005,
        ADD_TO_MLQUEUE                = 0x2006,
        TIMEOUT_ERROR                = 0x2007,
        SCSI_RETURN_NOT_HANDLED        = 0x2008,
        FAST_IO_FAIL                = 0x2009,
};

/*
 * Midlevel queue return values.
 * Each name says which level is busy (host, device or target) or that an
 * error-handler retry is wanted.
 */
#define SCSI_MLQUEUE_HOST_BUSY   0x1055
#define SCSI_MLQUEUE_DEVICE_BUSY 0x1056
#define SCSI_MLQUEUE_EH_RETRY    0x1057
#define SCSI_MLQUEUE_TARGET_BUSY 0x1058

/*
 *  Use these to separate status msg and our bytes
 *
 *  These are set by:
 *
 *      status byte = set from target device
 *      msg_byte    = return status from host adapter itself.
 *      host_byte   = set by low-level driver to indicate status.
 *      driver_byte = set by mid-level.
 *
 *  Layout of the result word as decoded below: bits 0-7 status,
 *  bits 8-15 msg, bits 16-23 host, bits 24-31 driver.  Note that
 *  status_byte() drops bit 0 and shifts the remaining seven bits down by
 *  one, so it does not yield the raw low byte.
 */
#define status_byte(result) (((result) >> 1) & 0x7f)
#define msg_byte(result)    (((result) >> 8) & 0xff)
#define host_byte(result)   (((result) >> 16) & 0xff)
#define driver_byte(result) (((result) >> 24) & 0xff)

/* Sense byte decoding: bits 4-6 class, bits 0-3 error, bit 7 valid flag. */
#define sense_class(sense)  (((sense) >> 4) & 0x7)
#define sense_error(sense)  ((sense) & 0xf)
#define sense_valid(sense)  ((sense) & 0x80)

/*
 * Default timeouts, written as multiples of HZ: two hours for FORMAT UNIT,
 * five minutes for medium moves and element status reads, one minute for
 * the rest (assuming HZ is ticks per second - defined elsewhere).
 */
#define FORMAT_UNIT_TIMEOUT                (2 * 60 * 60 * HZ)
#define START_STOP_TIMEOUT                (60 * HZ)
#define MOVE_MEDIUM_TIMEOUT                (5 * 60 * HZ)
#define READ_ELEMENT_STATUS_TIMEOUT        (5 * 60 * HZ)
#define READ_DEFECT_DATA_TIMEOUT        (60 * HZ )


/*
 * IDENTIFY(can_disconnect, lun) builds a byte value with bit 7 always set
 * (IDENTIFY_BASE), bit 6 set when can_disconnect is non-zero, and the low
 * three bits of lun in bits 0-2.
 */
#define IDENTIFY_BASE       0x80
#define IDENTIFY(can_disconnect, lun)   (IDENTIFY_BASE |\
                     ((can_disconnect) ?  0x40 : 0) |\
                     ((lun) & 0x07))

/*
 *  struct scsi_device::scsi_level values. For SCSI devices other than those
 *  prior to SCSI-2 (i.e. over 12 years old) this value is (resp[2] + 1)
 *  where "resp" is a byte array of the response to an INQUIRY. The scsi_level
 *  variable is visible to the user via sysfs.
 *
 *  Because of that "+ 1", each constant is one higher than the raw
 *  resp[2] value it corresponds to.
 */

#define SCSI_UNKNOWN    0
#define SCSI_1          1
#define SCSI_1_CCS      2
#define SCSI_2          3
#define SCSI_3          4        /* SPC */
#define SCSI_SPC_2      5
#define SCSI_SPC_3      6

/*
 * INQ PERIPHERAL QUALIFIERS
 * Note the gap: 0x02 is not defined here.  The suffixes presumably stand
 * for "connected", "not connected" and "not capable" - confirm against
 * the SCSI specification.
 */
#define SCSI_INQ_PQ_CON         0x00
#define SCSI_INQ_PQ_NOT_CON     0x01
#define SCSI_INQ_PQ_NOT_CAP     0x03


/*
 * Here are some scsi specific ioctl commands which are sometimes useful.
 *
 * Note that include/linux/cdrom.h also defines IOCTL 0x5300 - 0x5395,
 * so every command number below falls inside that range.
 */

/* Used to obtain PUN and LUN info.  Conflicts with CDROMAUDIOBUFSIZ */
#define SCSI_IOCTL_GET_IDLUN                0x5382

/* 0x5383 and 0x5384 were used for SCSI_IOCTL_TAGGED_{ENABLE,DISABLE} */

/* Used to obtain the host number of a device. */
#define SCSI_IOCTL_PROBE_HOST                0x5385

/* Used to obtain the bus number for a device */
#define SCSI_IOCTL_GET_BUS_NUMBER        0x5386

/* Used to obtain the PCI location of a device */
#define SCSI_IOCTL_GET_PCI                0x5387

#endif /* _SCSI_SCSI_H */



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * PTP 1588 clock support - private declarations for the core module.
 *
 * Copyright (C) 2010 OMICRON electronics GmbH
 */
#ifndef _PTP_PRIVATE_H_
#define _PTP_PRIVATE_H_

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/posix-clock.h>
#include <linux/ptp_clock.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/time.h>

/*
 * PTP_MAX_TIMESTAMPS is the number of slots in the timestamp FIFO and the
 * modulus queue_cnt() uses when the tail index has wrapped.
 * PTP_BUF_TIMESTAMPS is not used in this header (presumably a per-read
 * batch limit - confirm in ptp_chardev.c).
 */
#define PTP_MAX_TIMESTAMPS 128
#define PTP_BUF_TIMESTAMPS 30

/*
 * Fixed-size FIFO of external timestamp events.  head and tail are indices
 * into buf; queue_cnt() derives the fill level from their difference.
 */
struct timestamp_event_queue {
        struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
        int head;
        int tail;
        spinlock_t lock;
};

/*
 * Per-clock state private to the PTP core: the embedded posix clock and
 * device, the driver-supplied info, the timestamp FIFO with its locks and
 * wait queue, the dynamically built pin attributes and the auxiliary
 * kthread worker.
 */
struct ptp_clock {
        struct posix_clock clock;
        struct device dev;
        struct ptp_clock_info *info;
        dev_t devid;
        int index; /* index into clocks.map */
        struct pps_device *pps_source;
        long dialed_frequency; /* remembers the frequency adjustment */
        struct timestamp_event_queue tsevq; /* simple fifo for time stamps */
        struct mutex tsevq_mux; /* one process at a time reading the fifo */
        struct mutex pincfg_mux; /* protect concurrent info->pin_config access */
        wait_queue_head_t tsev_wq;
        int defunct; /* tells readers to go away when clock is being removed */
        struct device_attribute *pin_dev_attr;
        struct attribute **pin_attr;
        struct attribute_group pin_attr_group;
        /* 1st entry is a pointer to the real group, 2nd is NULL terminator */
        const struct attribute_group *pin_attr_groups[2];
        struct kthread_worker *kworker;
        struct kthread_delayed_work aux_work;
};

/*
 * The function queue_cnt() is safe for readers to call without
 * holding q->lock. Readers use this function to verify that the queue
 * is nonempty before proceeding with a dequeue operation. The fact
 * that a writer might concurrently increment the tail does not
 * matter, since the queue remains nonempty nonetheless.
 */
static inline int queue_cnt(const struct timestamp_event_queue *q)
{
        int pending;

        /*
         * Both index loads are paired with the WRITE_ONCE() calls in
         * enqueue_external_timestamp(), ptp_read() and extts_fifo_show().
         */
        pending = READ_ONCE(q->tail);
        pending -= READ_ONCE(q->head);

        /* A negative difference means the tail index has wrapped around. */
        if (pending < 0)
                pending += PTP_MAX_TIMESTAMPS;

        return pending;
}

/*
 * see ptp_chardev.c
 *
 * Apart from ptp_set_pinfunc(), these are the character-device entry
 * points; each receives the posix clock context it operates on.
 */

/* caller must hold pincfg_mux */
int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
                    enum ptp_pin_function func, unsigned int chan);

long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
               unsigned long arg);

int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode);

int ptp_release(struct posix_clock_context *pccontext);

ssize_t ptp_read(struct posix_clock_context *pccontext, uint flags, char __user *buf,
                 size_t cnt);

__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
                  poll_table *wait);

/*
 * see ptp_sysfs.c
 *
 * ptp_groups is the attribute group array defined there; the populate and
 * cleanup helpers presumably fill and release the pin_attr* members of
 * struct ptp_clock - confirm in ptp_sysfs.c.
 */

extern const struct attribute_group *ptp_groups[];

int ptp_populate_pin_groups(struct ptp_clock *ptp);
void ptp_cleanup_pin_groups(struct ptp_clock *ptp);

#endif


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Global definitions for the ARP (RFC 826) protocol.
 *
 * Version:        @(#)if_arp.h        1.0.1        04/16/93
 *
 * Authors:        Original taken from Berkeley UNIX 4.3, (c) UCB 1986-1988
 *                Portions taken from the KA9Q/NOS (v2.00m PA0GRI) source.
 *                Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche,
 *                Jonathan Layes <layes@loran.com>
 *                Arnaldo Carvalho de Melo <acme@conectiva.com.br> ARPHRD_HWX25
 */
#ifndef _LINUX_IF_ARP_H
#define _LINUX_IF_ARP_H

#include <linux/skbuff.h>
#include <uapi/linux/if_arp.h>

/* Return the network header of @skb, viewed as an ARP header. */
static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
{
        struct arphdr *arp = (struct arphdr *)skb_network_header(skb);

        return arp;
}

/*
 * arp_hdr_len - size in bytes of a complete ARP packet for @dev
 *
 * The result is the fixed struct arphdr plus the variable part, which
 * depends on the device's hardware address length (dev->addr_len).
 */
static inline unsigned int arp_hdr_len(const struct net_device *dev)
{
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
        /* ARP header, device address and 2 IP addresses */
        if (dev->type == ARPHRD_IEEE1394)
                return sizeof(struct arphdr) + dev->addr_len + sizeof(u32) * 2;
#endif

        /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
        return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2;
}

/*
 * Classify @dev by its ARPHRD type: the types listed below yield false,
 * every other type yields true.
 */
static inline bool dev_is_mac_header_xmit(const struct net_device *dev)
{
        bool mac_header = true;

        switch (dev->type) {
        case ARPHRD_TUNNEL:
        case ARPHRD_TUNNEL6:
        case ARPHRD_SIT:
        case ARPHRD_IPGRE:
        case ARPHRD_VOID:
        case ARPHRD_NONE:
        case ARPHRD_RAWIP:
        case ARPHRD_PIMREG:
        case ARPHRD_PPP:
                /*
                 * PPP adds its l2 header automatically in ppp_start_xmit().
                 * This makes it look like an l3 device to __bpf_redirect()
                 * and tcf_mirred_init().
                 */
                mac_header = false;
                break;
        default:
                break;
        }

        return mac_header;
}

#endif        /* _LINUX_IF_ARP_H */






























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/*
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _ASM_X86_TOPOLOGY_H
#define _ASM_X86_TOPOLOGY_H

/*
 * NUMA_NO_NODE used to be defined here and now lives in <linux/numa.h>;
 * include that header so the definition stays visible to users of this
 * file.  It may be used independently of CONFIG_NUMA.
 */
#include <linux/numa.h>

#ifdef CONFIG_NUMA
#include <linux/cpumask.h>

#include <asm/mpspec.h>
#include <asm/percpu.h>

/* Mappings between logical cpu number and node number */
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
 * override generic percpu implementation of cpu_to_node
 */
extern int __cpu_to_node(int cpu);
#define cpu_to_node __cpu_to_node

extern int early_cpu_to_node(int cpu);

#else        /* !CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * Look up the node of @cpu through the early per-cpu accessor, so it can be
 * used before the per_cpu areas are set up.
 */
static inline int early_cpu_to_node(int cpu)
{
        int node = early_per_cpu(x86_cpu_to_node_map, cpu);

        return node;
}

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

/* Mappings between node number and cpus on that node. */
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern const struct cpumask *cpumask_of_node(int node);
#else
/* Return the node_to_cpumask_map entry (mask of CPUs) for node @node. */
static inline const struct cpumask *cpumask_of_node(int node)
{
        const struct cpumask *mask = node_to_cpumask_map[node];

        return mask;
}
#endif

extern void setup_node_to_cpumask_map(void);

#define pcibus_to_node(bus) __pcibus_to_node(bus)

extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)

#else /* !CONFIG_NUMA */

/* !CONFIG_NUMA build: the current node id is always reported as 0. */
static inline int numa_node_id(void)
{
        const int node = 0;

        return node;
}
/*
 * indicate override:
 */
#define numa_node_id numa_node_id

/* !CONFIG_NUMA build: every CPU maps to node 0. */
static inline int early_cpu_to_node(int cpu)
{
        const int node = 0;

        return node;
}

/* !CONFIG_NUMA build: no node-to-cpumask map to set up. */
static inline void setup_node_to_cpumask_map(void)
{
}

#endif

#include <asm-generic/topology.h>

extern const struct cpumask *cpu_coregroup_mask(int cpu);

#define topology_logical_package_id(cpu)        (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu)        (cpu_data(cpu).phys_proc_id)
#define topology_logical_die_id(cpu)                (cpu_data(cpu).logical_die_id)
#define topology_die_id(cpu)                        (cpu_data(cpu).cpu_die_id)
#define topology_core_id(cpu)                        (cpu_data(cpu).cpu_core_id)

extern unsigned int __max_die_per_package;

#ifdef CONFIG_SMP
#define topology_die_cpumask(cpu)                (per_cpu(cpu_die_map, cpu))
#define topology_core_cpumask(cpu)                (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu)                (per_cpu(cpu_sibling_map, cpu))

extern unsigned int __max_logical_packages;
#define topology_max_packages()                        (__max_logical_packages)

/* Return the value of __max_die_per_package as an int. */
static inline int topology_max_die_per_package(void)
{
        return (int)__max_die_per_package;
}

extern int __max_smt_threads;

/* Return the current value of __max_smt_threads. */
static inline int topology_max_smt_threads(void)
{
        int threads = __max_smt_threads;

        return threads;
}

/*
 * CONFIG_SMP topology helpers, declared here and defined elsewhere.  The
 * !CONFIG_SMP branch below provides inline stubs with the same names.
 */
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
int topology_update_die_map(unsigned int dieid, unsigned int cpu);
int topology_phys_to_logical_pkg(unsigned int pkg);
int topology_phys_to_logical_die(unsigned int die, unsigned int cpu);
bool topology_is_primary_thread(unsigned int cpu);
bool topology_smt_supported(void);
#else
#define topology_max_packages()                        (1)
/*
 * !CONFIG_SMP stand-ins for the topology helpers: map updates and
 * physical-to-logical lookups return 0, the per-package maxima are 1,
 * every CPU is a primary thread and SMT is reported as unsupported.
 */
static inline int topology_update_package_map(unsigned int apicid,
                                              unsigned int cpu)
{
        return 0;
}

static inline int topology_update_die_map(unsigned int dieid,
                                          unsigned int cpu)
{
        return 0;
}

static inline int topology_phys_to_logical_pkg(unsigned int pkg)
{
        return 0;
}

static inline int topology_phys_to_logical_die(unsigned int die,
                                               unsigned int cpu)
{
        return 0;
}

static inline int topology_max_die_per_package(void)
{
        return 1;
}

static inline int topology_max_smt_threads(void)
{
        return 1;
}

static inline bool topology_is_primary_thread(unsigned int cpu)
{
        return true;
}

static inline bool topology_smt_supported(void)
{
        return false;
}
#endif

/* Intentionally empty: @num and @slot are accepted and ignored. */
static inline void arch_fix_phys_package_id(int num, u32 slot)
{
        /* nothing to do */
}

struct pci_bus;
int x86_pci_root_bus_node(int bus);
void x86_pci_root_bus_resources(int bus, struct list_head *resources);

extern bool x86_topology_update;

#ifdef CONFIG_SCHED_MC_PRIO
#include <asm/percpu.h>

DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
extern unsigned int __read_mostly sysctl_sched_itmt_enabled;

/* Interface to set priority of a cpu */
void sched_set_itmt_core_prio(int prio, int core_cpu);

/* Interface to notify scheduler that system supports ITMT */
int sched_set_itmt_support(void);

/* Interface to notify scheduler that system revokes ITMT support */
void sched_clear_itmt_support(void);

#else /* CONFIG_SCHED_MC_PRIO */

#define sysctl_sched_itmt_enabled        0
/*
 * !CONFIG_SCHED_MC_PRIO stand-ins for the ITMT interface: setting a
 * priority and clearing support do nothing, enabling support returns 0.
 */
static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
{
        /* nothing to do */
}

static inline int sched_set_itmt_support(void)
{
        const int rc = 0;

        return rc;
}

static inline void sched_clear_itmt_support(void)
{
        /* nothing to do */
}
#endif /* CONFIG_SCHED_MC_PRIO */

#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
#include <asm/cpufeature.h>

DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);

#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key)

DECLARE_PER_CPU(unsigned long, arch_freq_scale);

/* Return the per-CPU arch_freq_scale value of @cpu, converted to long. */
static inline long arch_scale_freq_capacity(int cpu)
{
        return (long)per_cpu(arch_freq_scale, cpu);
}
#define arch_scale_freq_capacity arch_scale_freq_capacity

extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick

extern void arch_set_max_freq_ratio(bool turbo_disabled);
#else
/*
 * Stub used when CONFIG_SMP && CONFIG_X86_64 does not hold: @turbo_disabled
 * is accepted and ignored.
 */
static inline void arch_set_max_freq_ratio(bool turbo_disabled)
{
        /* nothing to do */
}
#endif

#endif /* _ASM_X86_TOPOLOGY_H */





















































































































    1 







    1 








    1 





    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Private definitions for the generic associative array implementation.
 *
 * See Documentation/core-api/assoc_array.rst for information.
 *
 * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_ASSOC_ARRAY_PRIV_H
#define _LINUX_ASSOC_ARRAY_PRIV_H

#ifdef CONFIG_ASSOCIATIVE_ARRAY

#include <linux/assoc_array.h>

#define ASSOC_ARRAY_FAN_OUT                16        /* Number of slots per node */
#define ASSOC_ARRAY_FAN_MASK                (ASSOC_ARRAY_FAN_OUT - 1)
#define ASSOC_ARRAY_LEVEL_STEP                (ilog2(ASSOC_ARRAY_FAN_OUT))
#define ASSOC_ARRAY_LEVEL_STEP_MASK        (ASSOC_ARRAY_LEVEL_STEP - 1)
#define ASSOC_ARRAY_KEY_CHUNK_MASK        (ASSOC_ARRAY_KEY_CHUNK_SIZE - 1)
#define ASSOC_ARRAY_KEY_CHUNK_SHIFT        (ilog2(BITS_PER_LONG))

/*
 * Undefined type representing a pointer with type information in the bottom
 * two bits.
 */
struct assoc_array_ptr;

/*
 * An N-way node in the tree.
 *
 * Each slot contains one of four things:
 *
 *        (1) Nothing (NULL).
 *
 *        (2) A leaf object (pointer type 0).
 *
 *        (3) A next-level node (pointer type 1, subtype 0).
 *
 *        (4) A shortcut (pointer type 1, subtype 1).
 *
 * The tree is optimised for search-by-ID, but permits reasonable iteration
 * also.
 *
 * The tree is navigated by constructing an index key consisting of an array of
 * segments, where each segment is ilog2(ASSOC_ARRAY_FAN_OUT) bits in size.
 *
 * The segments correspond to levels of the tree (the first segment is used at
 * level 0, the second at level 1, etc.).
 */
struct assoc_array_node {
        /* NOTE(review): presumably the tagged pointer to the parent - confirm. */
        struct assoc_array_ptr        *back_pointer;
        /* NOTE(review): presumably this node's slot index in its parent - confirm. */
        u8                        parent_slot;
        /* ASSOC_ARRAY_FAN_OUT slots; each holds one of the four things listed above. */
        struct assoc_array_ptr        *slots[ASSOC_ARRAY_FAN_OUT];
        /* NOTE(review): name suggests a count of leaves below this node - confirm. */
        unsigned long                nr_leaves_on_branch;
};

/*
 * A shortcut through the index space out to where a collection of nodes/leaves
 * with the same IDs live.
 */
struct assoc_array_shortcut {
        /* NOTE(review): presumably the tagged pointer to the parent - confirm. */
        struct assoc_array_ptr        *back_pointer;
        /* NOTE(review): presumably this shortcut's slot index in its parent - confirm. */
        int                        parent_slot;
        /* NOTE(review): presumably the tree level reached via next_node - confirm. */
        int                        skip_to_level;
        struct assoc_array_ptr        *next_node;
        /* Flexible array member: variable-length index key data. */
        unsigned long                index_key[];
};

/*
 * Preallocation cache.
 */
struct assoc_array_edit {
        /* RCU callback head. */
        struct rcu_head                        rcu;
        /* The array this edit belongs to, and the ops tables used with it. */
        struct assoc_array                *array;
        const struct assoc_array_ops        *ops;
        const struct assoc_array_ops        *ops_for_excised_subtree;
        struct assoc_array_ptr                *leaf;
        struct assoc_array_ptr                **leaf_p;
        struct assoc_array_ptr                *dead_leaf;
        /*
         * NOTE(review): new_meta[] / excised_meta[] / excised_subtree look like
         * metadata objects added to and removed from the tree by this edit -
         * confirm against the implementation.
         */
        struct assoc_array_ptr                *new_meta[3];
        struct assoc_array_ptr                *excised_meta[1];
        struct assoc_array_ptr                *excised_subtree;
        /* Up to ASSOC_ARRAY_FAN_OUT back-pointer locations and the value for them. */
        struct assoc_array_ptr                **set_backpointers[ASSOC_ARRAY_FAN_OUT];
        struct assoc_array_ptr                *set_backpointers_to;
        /* Node whose count is to be adjusted, and the signed adjustment. */
        struct assoc_array_node                *adjust_count_on;
        long                                adjust_count_by;
        /* (location, value) pairs for pointer stores. */
        struct {
                struct assoc_array_ptr        **ptr;
                struct assoc_array_ptr        *to;
        } set[2];
        /* (location, value) pair for a u8 parent-slot store. */
        struct {
                u8                        *p;
                u8                        to;
        } set_parent_slot[1];
        /* One byte per slot plus one extra entry. */
        u8                                segment_cache[ASSOC_ARRAY_FAN_OUT + 1];
};

/*
 * Internal tree member pointers are marked in the bottom one or two bits to
 * indicate what type they are so that we don't have to look behind every
 * pointer to see what it points to.
 *
 * We provide functions to test type annotations and to create and translate
 * the annotated pointers.
 */
#define ASSOC_ARRAY_PTR_TYPE_MASK 0x1UL
#define ASSOC_ARRAY_PTR_LEAF_TYPE 0x0UL        /* Points to leaf (or nowhere) */
#define ASSOC_ARRAY_PTR_META_TYPE 0x1UL        /* Points to node or shortcut */
#define ASSOC_ARRAY_PTR_SUBTYPE_MASK        0x2UL
#define ASSOC_ARRAY_PTR_NODE_SUBTYPE        0x0UL
#define ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE 0x2UL

/* True when the type bit of @x is set, i.e. @x is a metadata pointer. */
static inline bool assoc_array_ptr_is_meta(const struct assoc_array_ptr *x)
{
        unsigned long type_bit = (unsigned long)x & ASSOC_ARRAY_PTR_TYPE_MASK;

        return type_bit != 0;
}
/* True when @x is not a metadata pointer. */
static inline bool assoc_array_ptr_is_leaf(const struct assoc_array_ptr *x)
{
        if (assoc_array_ptr_is_meta(x))
                return false;

        return true;
}
/* True when the subtype bit of @x is set. */
static inline bool assoc_array_ptr_is_shortcut(const struct assoc_array_ptr *x)
{
        unsigned long subtype_bit =
                (unsigned long)x & ASSOC_ARRAY_PTR_SUBTYPE_MASK;

        return subtype_bit != 0;
}
/* True when @x does not carry the shortcut subtype bit. */
static inline bool assoc_array_ptr_is_node(const struct assoc_array_ptr *x)
{
        if (assoc_array_ptr_is_shortcut(x))
                return false;

        return true;
}

/* Strip the type bit from @x and return the result as a plain pointer. */
static inline void *assoc_array_ptr_to_leaf(const struct assoc_array_ptr *x)
{
        unsigned long addr = (unsigned long)x;

        addr &= ~ASSOC_ARRAY_PTR_TYPE_MASK;
        return (void *)addr;
}

/* Return the address held in @x with both the type and subtype bits cleared. */
static inline
unsigned long __assoc_array_ptr_to_meta(const struct assoc_array_ptr *x)
{
        const unsigned long tag_bits =
                ASSOC_ARRAY_PTR_SUBTYPE_MASK | ASSOC_ARRAY_PTR_TYPE_MASK;

        return (unsigned long)x & ~tag_bits;
}
/* Convert the annotated pointer @x to a node pointer (tag bits removed). */
static inline
struct assoc_array_node *assoc_array_ptr_to_node(const struct assoc_array_ptr *x)
{
        unsigned long addr = __assoc_array_ptr_to_meta(x);

        return (struct assoc_array_node *)addr;
}
/* Convert the annotated pointer @x to a shortcut pointer (tag bits removed). */
static inline
struct assoc_array_shortcut *assoc_array_ptr_to_shortcut(const struct assoc_array_ptr *x)
{
        unsigned long addr = __assoc_array_ptr_to_meta(x);

        return (struct assoc_array_shortcut *)addr;
}

/* Build an annotated pointer by OR-ing the tag bits @t into the address @p. */
static inline
struct assoc_array_ptr *__assoc_array_x_to_ptr(const void *p, unsigned long t)
{
        unsigned long tagged = (unsigned long)p;

        tagged |= t;
        return (struct assoc_array_ptr *)tagged;
}
/* Annotate the leaf address @p with the leaf type tag. */
static inline
struct assoc_array_ptr *assoc_array_leaf_to_ptr(const void *p)
{
        const unsigned long tag = ASSOC_ARRAY_PTR_LEAF_TYPE;

        return __assoc_array_x_to_ptr(p, tag);
}
/* Annotate the node @p with the metadata type and node subtype tags. */
static inline
struct assoc_array_ptr *assoc_array_node_to_ptr(const struct assoc_array_node *p)
{
        const unsigned long tag =
                ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_NODE_SUBTYPE;

        return __assoc_array_x_to_ptr(p, tag);
}
/* Annotate the shortcut @p with the metadata type and shortcut subtype tags. */
static inline
struct assoc_array_ptr *assoc_array_shortcut_to_ptr(const struct assoc_array_shortcut *p)
{
        const unsigned long tag =
                ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE;

        return __assoc_array_x_to_ptr(p, tag);
}

#endif /* CONFIG_ASSOCIATIVE_ARRAY */
#endif /* _LINUX_ASSOC_ARRAY_PRIV_H */


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SOCK_REUSEPORT_H
#define _SOCK_REUSEPORT_H

#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <net/sock.h>

extern spinlock_t reuseport_lock;

struct sock_reuseport {
        /* RCU callback head. */
        struct rcu_head                rcu;

        u16                        max_socks;        /* length of socks */
        u16                        num_socks;        /* elements in socks */
        /* The last synq overflow event timestamp of this
         * reuse->socks[] group.
         */
        unsigned int                synq_overflow_ts;
        /* ID stays the same even after the size of socks[] grows. */
        unsigned int                reuseport_id;
        /* NOTE(review): presumably mirrors the bind_inany argument of
         * reuseport_alloc()/reuseport_add_sock() - confirm.
         */
        unsigned int                bind_inany:1;
        /* Read by reuseport_has_conns(). */
        unsigned int                has_conns:1;
        struct bpf_prog __rcu        *prog;                /* optional BPF sock selector */
        struct sock                *socks[];        /* array of sock pointers */
};

/*
 * Reuseport group management, socket selection and BPF program attachment;
 * declared here and defined elsewhere.
 */
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
                              bool bind_inany);
extern void reuseport_detach_sock(struct sock *sk);
extern struct sock *reuseport_select_sock(struct sock *sk,
                                          u32 hash,
                                          struct sk_buff *skb,
                                          int hdr_len);
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
extern int reuseport_detach_prog(struct sock *sk);

/*
 * Report whether @sk has a reuseport control block with has_conns set.
 * The control block is looked up and read inside an RCU read-side section.
 */
static inline bool reuseport_has_conns(struct sock *sk)
{
        struct sock_reuseport *grp;
        bool has_conns;

        rcu_read_lock();
        grp = rcu_dereference(sk->sk_reuseport_cb);
        has_conns = grp && grp->has_conns;
        rcu_read_unlock();

        return has_conns;
}

void reuseport_has_conns_set(struct sock *sk);

#endif  /* _SOCK_REUSEPORT_H */






































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for inet_sock
 *
 * Authors:        Many, reorganised here by
 *                 Arnaldo Carvalho de Melo <acme@mandriva.com>
 */
#ifndef _INET_SOCK_H
#define _INET_SOCK_H

#include <linux/bitops.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jhash.h>
#include <linux/netdevice.h>

#include <net/flow.h>
#include <net/sock.h>
#include <net/request_sock.h>
#include <net/netns/hash.h>
#include <net/tcp_states.h>
#include <net/l3mdev.h>

/**
 * struct ip_options - IP Options
 *
 * @faddr: Saved first hop address
 * @nexthop: Saved nexthop address in LSRR and SSRR
 * @is_strictroute: Strict source route
 * @srr_is_hit: Packet destination address was one of ours
 * @is_changed: IP checksum is no longer valid
 * @rr_needaddr: Need to record addr of outgoing dev
 * @ts_needtime: Need to record timestamp
 * @ts_needaddr: Need to record addr of outgoing dev
 * @__data: Flexible array member trailing the fixed fields
 */
struct ip_options {
        __be32                faddr;
        __be32                nexthop;
        unsigned char        optlen;
        unsigned char        srr;
        unsigned char        rr;
        unsigned char        ts;
        unsigned char        is_strictroute:1,
                        srr_is_hit:1,
                        is_changed:1,
                        rr_needaddr:1,
                        ts_needtime:1,
                        ts_needaddr:1;
        unsigned char        router_alert;
        unsigned char        cipso;
        unsigned char        __pad2;
        unsigned char        __data[];
};

/* An ip_options block preceded by an RCU callback head. */
struct ip_options_rcu {
        struct rcu_head rcu;
        struct ip_options opt;
};

/*
 * An ip_options_rcu immediately followed by 40 bytes of storage.
 * NOTE(review): presumably data[] backs the flexible opt.opt.__data array -
 * confirm against users.
 */
struct ip_options_data {
        struct ip_options_rcu        opt;
        char                        data[40];
};

struct inet_request_sock {
        /* First member: inet_rsk() casts a request_sock pointer to this type. */
        struct request_sock        req;
/* Shorthand names for fields of req.__req_common. */
#define ir_loc_addr                req.__req_common.skc_rcv_saddr
#define ir_rmt_addr                req.__req_common.skc_daddr
#define ir_num                        req.__req_common.skc_num
#define ir_rmt_port                req.__req_common.skc_dport
#define ir_v6_rmt_addr                req.__req_common.skc_v6_daddr
#define ir_v6_loc_addr                req.__req_common.skc_v6_rcv_saddr
#define ir_iif                        req.__req_common.skc_bound_dev_if
#define ir_cookie                req.__req_common.skc_cookie
#define ireq_net                req.__req_common.skc_net
#define ireq_state                req.__req_common.skc_state
#define ireq_family                req.__req_common.skc_family

        /* Two 4-bit window-scale values followed by seven 1-bit flags. */
        u16                        snd_wscale : 4,
                                rcv_wscale : 4,
                                tstamp_ok  : 1,
                                sack_ok           : 1,
                                wscale_ok  : 1,
                                ecn_ok           : 1,
                                acked           : 1,
                                no_srccheck: 1,
                                smc_ok           : 1;
        u32                     ir_mark;
        /* IPv4 options pointer shares storage with the IPv6 option/pktopts pair. */
        union {
                struct ip_options_rcu __rcu        *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)
                struct {
                        struct ipv6_txoptions        *ipv6_opt;
                        struct sk_buff                *pktopts;
                };
#endif
        };
};

/* Reinterpret the request_sock pointer @sk as its inet_request_sock. */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
        const void *raw = sk;

        return (struct inet_request_sock *)raw;
}

/*
 * Pick the mark for a request: the socket's own sk_mark when it is non-zero,
 * otherwise @skb's mark if the tcp_fwmark_accept sysctl is enabled, otherwise
 * sk_mark.
 */
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
{
        if (sk->sk_mark)
                return sk->sk_mark;

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
                return skb->mark;

        return sk->sk_mark;
}

/*
 * Device index to bind a request to: the socket's sk_bound_dev_if when set.
 * With CONFIG_NET_L3_MASTER_DEV, an unbound socket with tcp_l3mdev_accept
 * enabled uses the l3mdev master of @skb's incoming interface instead.
 */
static inline int inet_request_bound_dev_if(const struct sock *sk,
                                            struct sk_buff *skb)
{
        int ifindex = READ_ONCE(sk->sk_bound_dev_if);
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *ns = sock_net(sk);

        if (ifindex)
                return ifindex;
        if (READ_ONCE(ns->ipv4.sysctl_tcp_l3mdev_accept))
                return l3mdev_master_ifindex_by_index(ns, skb->skb_iif);
#endif

        return ifindex;
}

/*
 * With CONFIG_NET_L3_MASTER_DEV and tcp_l3mdev_accept disabled, return the
 * l3mdev master ifindex of the socket's bound device; otherwise return 0.
 */
static inline int inet_sk_bound_l3mdev(const struct sock *sk)
{
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *ns = sock_net(sk);

        if (READ_ONCE(ns->ipv4.sysctl_tcp_l3mdev_accept))
                return 0;

        return l3mdev_master_ifindex_by_index(ns, sk->sk_bound_dev_if);
#else
        return 0;
#endif
}

/*
 * Decide whether a socket's device binding matches a packet's interfaces.
 * A bound socket (@bound_dev_if != 0) matches when the binding equals @dif
 * or @sdif.  An unbound socket matches when @sdif is 0 or @l3mdev_accept
 * is true.
 */
static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
                                     int dif, int sdif)
{
        if (bound_dev_if != 0)
                return dif == bound_dev_if || sdif == bound_dev_if;

        return l3mdev_accept || sdif == 0;
}

/*
 * Wrapper around inet_bound_dev_eq(): the l3mdev_accept argument is the
 * tcp_l3mdev_accept sysctl of @net when CONFIG_NET_L3_MASTER_DEV is enabled,
 * and true otherwise.
 */
static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
                                        int dif, int sdif)
{
        bool l3mdev_accept = true;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        l3mdev_accept = !!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept);
#endif
        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

/*
 * Corking state; embedded as the base of struct inet_cork_full.
 * NOTE(review): presumably @flags holds the IPCORK_* bits - confirm.
 */
struct inet_cork {
        unsigned int                flags;
        __be32                        addr;
        struct ip_options        *opt;
        unsigned int                fragsize;
        int                        length; /* Total length of all frames */
        struct dst_entry        *dst;
        u8                        tx_flags;
        __u8                        ttl;
        __s16                        tos;
        char                        priority;
        __u16                        gso_size;
        u64                        transmit_time;
        u32                        mark;
};

/* An inet_cork together with a struct flowi; used as inet_sock::cork. */
struct inet_cork_full {
        struct inet_cork        base;
        struct flowi                fl;
};

struct ip_mc_socklist;
struct ipv6_pinfo;
struct rtable;

/**
 * struct inet_sock - representation of INET sockets
 *
 * @sk: ancestor class
 * @pinet6: pointer to IPv6 control block
 * @inet_daddr: Foreign IPv4 addr
 * @inet_rcv_saddr: Bound local IPv4 addr
 * @inet_dport: Destination port
 * @inet_num: Local port
 * @inet_saddr: Sending source
 * @uc_ttl: Unicast TTL
 * @inet_sport: Source port
 * @inet_id: ID counter for DF pkts
 * @tos: TOS
 * @mc_ttl: Multicasting TTL
 * @is_icsk: is this an inet_connection_sock?
 * @freebind: consulted by inet_can_nonlocal_bind()
 * @transparent: consulted by inet_can_nonlocal_bind() and inet_sk_flowi_flags()
 * @hdrincl: consulted by inet_sk_flowi_flags()
 * @convert_csum: counter adjusted by inet_inc_convert_csum() and
 *                inet_dec_convert_csum()
 * @uc_index: Unicast outgoing device index
 * @mc_index: Multicast device index
 * @mc_list: Group array
 * @cork: info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
        /* sk and pinet6 have to be the first two members of inet_sock */
        struct sock                sk;
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6_pinfo        *pinet6;
#endif
        /* Socket demultiplex comparisons on incoming packets. */
#define inet_daddr                sk.__sk_common.skc_daddr
#define inet_rcv_saddr                sk.__sk_common.skc_rcv_saddr
#define inet_dport                sk.__sk_common.skc_dport
#define inet_num                sk.__sk_common.skc_num

        __be32                        inet_saddr;
        __s16                        uc_ttl;
        /* NOTE(review): presumably a mask of the IP_CMSG_* bits - confirm. */
        __u16                        cmsg_flags;
        __be16                        inet_sport;
        __u16                        inet_id;

        struct ip_options_rcu __rcu        *inet_opt;
        int                        rx_dst_ifindex;
        __u8                        tos;
        __u8                        min_ttl;
        __u8                        mc_ttl;
        __u8                        pmtudisc;
        __u8                        recverr:1,
                                is_icsk:1,
                                freebind:1,
                                hdrincl:1,
                                mc_loop:1,
                                transparent:1,
                                mc_all:1,
                                nodefrag:1;
        __u8                        bind_address_no_port:1,
                                recverr_rfc4884:1,
                                defer_connect:1; /* Indicates that fastopen_connect is set
                                                  * and cookie exists so we defer connect
                                                  * until first data frame is written
                                                  */
        __u8                        rcv_tos;
        __u8                        convert_csum;
        int                        uc_index;
        int                        mc_index;
        __be32                        mc_addr;
        struct ip_mc_socklist __rcu        *mc_list;
        struct inet_cork_full        cork;
};

#define IPCORK_OPT        1        /* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG        2        /* always fragment (for ipv6 for now) */

/* cmsg flags for inet */
#define IP_CMSG_PKTINFO                BIT(0)
#define IP_CMSG_TTL                BIT(1)
#define IP_CMSG_TOS                BIT(2)
#define IP_CMSG_RECVOPTS        BIT(3)
#define IP_CMSG_RETOPTS                BIT(4)
#define IP_CMSG_PASSSEC                BIT(5)
#define IP_CMSG_ORIGDSTADDR        BIT(6)
#define IP_CMSG_CHECKSUM        BIT(7)
#define IP_CMSG_RECVFRAGSIZE        BIT(8)

/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket (may be NULL)
 *
 * SYNACK messages might be attached to request sockets.
 * Some places want to reach the listener in this case: with CONFIG_INET,
 * a socket in TCP_NEW_SYN_RECV state is replaced by its rsk_listener.
 * Any other socket (or NULL) is returned unchanged.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
        if (!sk || sk->sk_state != TCP_NEW_SYN_RECV)
                return sk;

        return inet_reqsk(sk)->rsk_listener;
#else
        return sk;
#endif
}

/*
 * Const-argument variant of sk_to_full_sk(): with CONFIG_INET, a socket in
 * TCP_NEW_SYN_RECV state yields its listener; anything else is returned as is.
 */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
        if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
                const struct request_sock *req =
                        (const struct request_sock *)sk;

                return req->rsk_listener;
        }
#endif
        return sk;
}

/* Apply sk_to_full_sk() to the socket attached to @skb. */
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        return sk_to_full_sk(owner);
}

/* Reinterpret the sock pointer @sk as its enclosing inet_sock. */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
        const void *raw = sk;

        return (struct inet_sock *)raw;
}

static inline void __inet_sk_copy_descendant(struct sock *sk_to,
                                             const struct sock *sk_from,
                                             const int ancestor_size)
{
        memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
               sk_from->sk_prot->obj_size - ancestor_size);
}

int inet_sk_rebuild_header(struct sock *sk);

/**
 * inet_sk_state_load - read sk->sk_state for lockless contexts
 * @sk: socket pointer
 *
 * Counterpart of inet_sk_state_store(). Meant for callers that do not hold
 * the socket lock: tcp_diag_get_info(), tcp_get_info(), tcp_poll(),
 * get_tcp4_sock() ...
 */
static inline int inet_sk_state_load(const struct sock *sk)
{
        /* Acquire load: a state change might impact lockless readers. */
        int state = smp_load_acquire(&sk->sk_state);

        return state;
}

/**
 * inet_sk_state_store - update sk->sk_state
 * @sk: socket pointer
 * @newstate: new state
 *
 * Paired with inet_sk_state_load(). Should be used in contexts where
 * state change might impact lockless readers.
 */
void inet_sk_state_store(struct sock *sk, int newstate);

void inet_sk_set_state(struct sock *sk, int state);

/*
 * Hash a connection 4-tuple with jhash_3words(): local address, foreign
 * address, and one word holding @lport in the upper 16 bits OR-ed with
 * @fport, seeded with @initval.
 */
static inline unsigned int __inet_ehashfn(const __be32 laddr,
                                          const __u16 lport,
                                          const __be32 faddr,
                                          const __be16 fport,
                                          u32 initval)
{
        const __u32 ports = (((__u32) lport) << 16) | (__force __u32)fport;

        return jhash_3words((__force __u32) laddr,
                            (__force __u32) faddr,
                            ports,
                            initval);
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                                      struct sock *sk_listener,
                                      bool attach_listener);

static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
        __u8 flags = 0;

        if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
                flags |= FLOWI_FLAG_ANYSRC;
        return flags;
}

static inline void inet_inc_convert_csum(struct sock *sk)
{
        inet_sk(sk)->convert_csum++;
}

static inline void inet_dec_convert_csum(struct sock *sk)
{
        if (inet_sk(sk)->convert_csum > 0)
                inet_sk(sk)->convert_csum--;
}

static inline bool inet_get_convert_csum(struct sock *sk)
{
        return !!inet_sk(sk)->convert_csum;
}


static inline bool inet_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
                inet->freebind || inet->transparent;
}

#endif        /* _INET_SOCK_H */































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>
#include <linux/bug.h>
#include <linux/mm_types.h>

#ifdef CONFIG_MMU

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the seven
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 */
/* Bit position at which the `type' field starts inside swp_entry_t.val. */
#define SWP_TYPE_SHIFT        (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
/* Mask selecting the `offset' field, i.e. every bit below SWP_TYPE_SHIFT. */
#define SWP_OFFSET_MASK        ((1UL << SWP_TYPE_SHIFT) - 1)

/*
 * Strip the swap-pte flag bits (soft-dirty and uffd-wp) from @pte so that
 * only the swp_entry_t related information is left.
 */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
        pte_t cleaned = pte;

        if (pte_swp_soft_dirty(cleaned)) {
                cleaned = pte_swp_clear_soft_dirty(cleaned);
        }
        if (pte_swp_uffd_wp(cleaned)) {
                cleaned = pte_swp_clear_uffd_wp(cleaned);
        }

        return cleaned;
}

/*
 * Build an arch-independent swp_entry_t from a type and an offset: @type is
 * shifted up by SWP_TYPE_SHIFT and @offset is masked with SWP_OFFSET_MASK.
 */
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
        swp_entry_t entry = {
                .val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK),
        };

        return entry;
}

/*
 * Return the `type' field of an arch-independent swp_entry_t, i.e. the bits
 * of entry.val at and above SWP_TYPE_SHIFT.
 */
static inline unsigned swp_type(swp_entry_t entry)
{
        return entry.val >> SWP_TYPE_SHIFT;
}

/*
 * Return the `offset' field of an arch-independent swp_entry_t, i.e. the
 * bits of entry.val selected by SWP_OFFSET_MASK.
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
        const pgoff_t offset = entry.val & SWP_OFFSET_MASK;

        return offset;
}

/*
 * Check whether a pte points to a swap entry: true when the pte is neither
 * none nor present.
 */
static inline int is_swap_pte(pte_t pte)
{
        return !(pte_none(pte) || pte_present(pte));
}

/*
 * Turn the arch-dependent pte encoding of a swap entry into an
 * arch-independent swp_entry_t.  The swap-pte flag bits are cleared first.
 */
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
        const pte_t cleaned = pte_swp_clear_flags(pte);
        swp_entry_t arch_entry = __pte_to_swp_entry(cleaned);

        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Turn an arch-independent swp_entry_t into its arch-dependent pte
 * encoding.
 */
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);
        const pgoff_t offset = swp_offset(entry);
        swp_entry_t arch_entry = __swp_entry(type, offset);

        return __swp_entry_to_pte(arch_entry);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
        swp_entry_t entry;

        entry.val = xa_to_value(arg);
        return entry;
}

/* Encode @entry as an XArray value entry via xa_mk_value(). */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
        void *radix = xa_mk_value(entry.val);

        return radix;
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/*
 * Build a device-private swap entry for @page.  The type is
 * SWP_DEVICE_WRITE when @write is set, SWP_DEVICE_READ otherwise; the
 * offset is the page's pfn.
 */
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
{
        unsigned long type = SWP_DEVICE_READ;

        if (write)
                type = SWP_DEVICE_WRITE;

        return swp_entry(type, page_to_pfn(page));
}

/* True when @entry has type SWP_DEVICE_READ or SWP_DEVICE_WRITE. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        const int type = swp_type(entry);

        if (type == SWP_DEVICE_READ)
                return true;

        return type == SWP_DEVICE_WRITE;
}

/* Rewrite *@entry in place as a SWP_DEVICE_READ entry with the same offset. */
static inline void make_device_private_entry_read(swp_entry_t *entry)
{
        const pgoff_t offset = swp_offset(*entry);

        *entry = swp_entry(SWP_DEVICE_READ, offset);
}

/* True when @entry has type SWP_DEVICE_WRITE (hinted as the unlikely case). */
static inline bool is_write_device_private_entry(swp_entry_t entry)
{
        const bool is_write = swp_type(entry) == SWP_DEVICE_WRITE;

        return unlikely(is_write);
}

/* Return the pfn stored in the offset field of a device-private @entry. */
static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
{
        const unsigned long pfn = swp_offset(entry);

        return pfn;
}

/* Return the struct page for the pfn stored in the offset field of @entry. */
static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
        const pgoff_t pfn = swp_offset(entry);

        return pfn_to_page(pfn);
}
#else /* CONFIG_DEVICE_PRIVATE */
/* !CONFIG_DEVICE_PRIVATE stub: ignores its arguments, returns swp_entry(0, 0). */
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
{
        return swp_entry(0, 0);
}

/* !CONFIG_DEVICE_PRIVATE stub: leaves *entry untouched. */
static inline void make_device_private_entry_read(swp_entry_t *entry)
{
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever a device-private entry. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: always false. */
static inline bool is_write_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: always returns pfn 0. */
static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_DEVICE_PRIVATE stub: always returns NULL. */
static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
        return NULL;
}
#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
/*
 * Build a migration swap entry for @page.  The type is SWP_MIGRATION_WRITE
 * when @write is non-zero, SWP_MIGRATION_READ otherwise; the offset is the
 * page's pfn.  BUGs if the compound head of @page is not locked.
 */
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
        unsigned long type;

        BUG_ON(!PageLocked(compound_head(page)));

        if (write)
                type = SWP_MIGRATION_WRITE;
        else
                type = SWP_MIGRATION_READ;

        return swp_entry(type, page_to_pfn(page));
}

/*
 * Non-zero when @entry has type SWP_MIGRATION_READ or SWP_MIGRATION_WRITE
 * (hinted as the unlikely case).
 */
static inline int is_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ ||
                        type == SWP_MIGRATION_WRITE);
}

/* Non-zero when @entry has type SWP_MIGRATION_WRITE (hinted as unlikely). */
static inline int is_write_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_WRITE);
}

/* Return the pfn stored in the offset field of a migration @entry. */
static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
{
        const unsigned long pfn = swp_offset(entry);

        return pfn;
}

/*
 * Return the struct page for the pfn stored in the offset field of a
 * migration @entry.  BUGs if the compound head of that page is not locked.
 */
static inline struct page *migration_entry_to_page(swp_entry_t entry)
{
        struct page *p = pfn_to_page(swp_offset(entry));
        /*
         * Ensure we do not race with split, which might alter tail pages
         * into new folios and thus result in observing an unlocked page.
         * This matches the write barrier in __split_huge_page_tail().
         */
        smp_rmb();
        /*
         * Any use of migration entries may only occur while the
         * corresponding page is locked
         */
        BUG_ON(!PageLocked(compound_head(p)));
        return p;
}

/* Rewrite *@entry in place as a SWP_MIGRATION_READ entry with the same offset. */
static inline void make_migration_entry_read(swp_entry_t *entry)
{
        const pgoff_t offset = swp_offset(*entry);

        *entry = swp_entry(SWP_MIGRATION_READ, offset);
}

extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                        spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte);
#else

#define make_migration_entry(page, write) swp_entry(0, 0)
/* !CONFIG_MIGRATION stub: no entry is ever a migration entry. */
static inline int is_migration_entry(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: always returns pfn 0. */
static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: always returns NULL. */
static inline struct page *migration_entry_to_page(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_MIGRATION stub: leaves *entryp untouched. */
static inline void make_migration_entry_read(swp_entry_t *entryp) { }
/* !CONFIG_MIGRATION stub: does nothing and returns immediately. */
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                        spinlock_t *ptl) { }
/* !CONFIG_MIGRATION stub: does nothing and returns immediately. */
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                         unsigned long address) { }
/* !CONFIG_MIGRATION stub: does nothing and returns immediately. */
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte) { }
/* !CONFIG_MIGRATION stub: always 0. */
static inline int is_write_migration_entry(swp_entry_t entry)
{
        return 0;
}

#endif

struct page_vma_mapped_walk;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page);

extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new);

extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

/*
 * Turn the arch-dependent pmd encoding of a swap entry into an
 * arch-independent swp_entry_t.  The soft-dirty and uffd-wp swap flag bits
 * are cleared from the pmd before it is decoded.
 */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        pmd_t cleaned = pmd;
        swp_entry_t arch_entry;

        if (pmd_swp_soft_dirty(cleaned)) {
                cleaned = pmd_swp_clear_soft_dirty(cleaned);
        }
        if (pmd_swp_uffd_wp(cleaned)) {
                cleaned = pmd_swp_clear_uffd_wp(cleaned);
        }

        arch_entry = __pmd_to_swp_entry(cleaned);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Turn an arch-independent swp_entry_t into its arch-dependent pmd
 * encoding.
 */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);
        const pgoff_t offset = swp_offset(entry);
        swp_entry_t arch_entry = __swp_entry(type, offset);

        return __swp_entry_to_pmd(arch_entry);
}

/*
 * Return 1 when @pmd is not present and decodes to a migration entry,
 * 0 otherwise.  A present pmd is never decoded.
 */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        if (pmd_present(pmd))
                return 0;

        return is_migration_entry(pmd_to_swp_entry(pmd)) != 0;
}
#else
/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub.  The body is only BUILD_BUG();
 * NOTE(review): presumably any call that is not optimised away breaks the
 * build - confirm against the BUILD_BUG() definition.
 */
static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        BUILD_BUG();
}

/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub.  The body is only BUILD_BUG();
 * NOTE(review): presumably any call that is not optimised away breaks the
 * build - confirm against the BUILD_BUG() definition.
 */
static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new)
{
        BUILD_BUG();
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: does nothing. */
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @pmd, returns swp_entry(0, 0). */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        return swp_entry(0, 0);
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @entry, returns __pmd(0). */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        return __pmd(0);
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: no pmd is ever a migration entry. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE

extern atomic_long_t num_poisoned_pages __read_mostly;

/*
 * Support for hardware poisoned pages
 */
/*
 * Build a SWP_HWPOISON swap entry whose offset is the pfn of @page.
 * BUGs if @page is not locked.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        swp_entry_t entry;

        BUG_ON(!PageLocked(page));

        entry = swp_entry(SWP_HWPOISON, page_to_pfn(page));
        return entry;
}

/* Return 1 when @entry has type SWP_HWPOISON, 0 otherwise. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type == SWP_HWPOISON;
}

/* Atomically increment the global num_poisoned_pages counter. */
static inline void num_poisoned_pages_inc(void)
{
        atomic_long_inc(&num_poisoned_pages);
}

/* Atomically decrement the global num_poisoned_pages counter. */
static inline void num_poisoned_pages_dec(void)
{
        atomic_long_dec(&num_poisoned_pages);
}

#else

/* !CONFIG_MEMORY_FAILURE stub: ignores @page, returns swp_entry(0, 0). */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        return swp_entry(0, 0);
}

/* !CONFIG_MEMORY_FAILURE stub: no entry is ever a hwpoison entry. */
static inline int is_hwpoison_entry(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_MEMORY_FAILURE stub: there is no counter, so this does nothing. */
static inline void num_poisoned_pages_inc(void)
{
}

/*
 * !CONFIG_MEMORY_FAILURE stub: there is no counter, so this does nothing.
 * Provided so that both configurations expose the same inc/dec pair.
 */
static inline void num_poisoned_pages_dec(void)
{
}
#endif

#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
    defined(CONFIG_DEVICE_PRIVATE)
/*
 * Return 1 when the type of @entry is at or beyond MAX_SWAPFILES,
 * 0 otherwise.
 */
static inline int non_swap_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type >= MAX_SWAPFILES;
}
#else
/*
 * Stub used when none of CONFIG_MEMORY_FAILURE, CONFIG_MIGRATION and
 * CONFIG_DEVICE_PRIVATE is enabled: always returns 0.
 */
static inline int non_swap_entry(swp_entry_t entry)
{
        return 0;
}
#endif

#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */
































































































































































































































    1 
    1 





































































































    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 









    1 





































    1 


























    1 


    1 












    1 






































    1 

    1 







































    1 


    1 







    1 











    1 
    1 

    1 
















    1 

    1 

    1 





















































    1 


    1 

























    1 
    1 

























    1 





















    1 
    1 


    1 




    1 




    1 
    1 





















    1 






    1 

    1 






    1 














    1 


    1 



    1 















































































































































































































    1 



    1 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
 * System-call specific features have moved to auditsc.c
 *
 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 *
 * Goals: 1) Integrate fully with Security Modules.
 *          2) Minimal run-time overhead:
 *             a) Minimal when syscall auditing is disabled (audit_enabled=0).
 *             b) Small when syscall auditing is enabled and no audit record
 *                is generated (defer as much work as possible to record
 *                generation time):
 *                i) context is allocated,
 *                ii) names from getname are stored without a copy, and
 *                iii) inode information stored from path_lookup.
 *          3) Ability to disable syscall auditing at boot time (audit=0).
 *          4) Usable by other parts of the kernel (if audit_log* is called,
 *             then a syscall record will be generated automatically for the
 *             current syscall).
 *          5) Netlink interface to user-space.
 *          6) Support low-overhead kernel-based filtering to minimize the
 *             information that must be passed to user-space.
 *
 * Audit userspace, documentation, tests, and bug/issue trackers:
 *         https://github.com/linux-audit
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/file.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>

#include <linux/audit.h>

#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>

#include "audit.h"

/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
 * (Initialization happens after skb_init is called.) */
#define AUDIT_DISABLED                -1
#define AUDIT_UNINITIALIZED        0
#define AUDIT_INITIALIZED        1
static int        audit_initialized;

u32                audit_enabled = AUDIT_OFF;
bool                audit_ever_enabled = !!AUDIT_OFF;

EXPORT_SYMBOL_GPL(audit_enabled);

/* Default state when kernel boots without any parameters. */
static u32        audit_default = AUDIT_OFF;

/* If auditing cannot proceed, audit_failure selects what happens. */
static u32        audit_failure = AUDIT_FAIL_PRINTK;

/* private audit network namespace index */
static unsigned int audit_net_id;

/**
 * struct audit_net - audit private network namespace data
 * @sk: communication socket
 */
struct audit_net {
        struct sock *sk;        /* communication socket */
};

/**
 * struct auditd_connection - kernel/auditd connection state
 * @pid: auditd PID
 * @portid: netlink portid
 * @net: the associated network namespace
 * @rcu: RCU head
 *
 * Description:
 * This struct is RCU protected; you must either hold the RCU lock for reading
 * or the associated spinlock for writing.
 */
struct auditd_connection {
        struct pid *pid;        /* auditd PID */
        u32 portid;             /* netlink portid */
        struct net *net;        /* the associated network namespace */
        struct rcu_head rcu;    /* RCU head */
};
/* The currently registered auditd connection, or NULL when there is none.
 * Readers use rcu_dereference() inside an RCU read-side section; updaters
 * (auditd_set(), auditd_reset()) swap the pointer under auditd_conn_lock. */
static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);

/* If audit_rate_limit is non-zero, limit the rate of sending audit records
 * to that number per second.  This prevents DoS attacks, but results in
 * audit records being dropped.  Enforced by audit_rate_check(). */
static u32        audit_rate_limit;

/* Number of outstanding audit_buffers allowed.
 * When set to zero, this means unlimited.  The same limit also caps the
 * retry and hold queues (see kauditd_hold_skb() and kauditd_retry_skb()). */
static u32        audit_backlog_limit = 64;
/* default backlog wait time, in jiffies */
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32        audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;

/* The identity of the user shutting down the audit system. */
static kuid_t                audit_sig_uid = INVALID_UID;
static pid_t                audit_sig_pid = -1;
static u32                audit_sig_sid;

/* Records can be lost in several ways:
 *   0) [suppressed in audit_alloc]
 *   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
 *   2) out of memory in audit_log_move [alloc_skb]
 *   3) suppressed due to audit_rate_limit
 *   4) suppressed due to audit_backlog_limit
 * The counter below is bumped by audit_log_lost() on every call.
 */
static atomic_t        audit_lost = ATOMIC_INIT(0);

/* Monotonically increasing sum of time the kernel has spent
 * waiting while the backlog limit is exceeded.
 */
static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);

/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];

/* Slab cache; presumably backs struct audit_buffer allocations - confirm at
 * the cache creation site, which is outside this part of the file. */
static struct kmem_cache *audit_buffer_cache;

/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
static struct sk_buff_head audit_hold_queue;

/* queue servicing thread; kauditd_thread() sleeps on kauditd_wait until
 * audit_queue is non-empty */
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);

/* waitqueue for callers who are blocked on the audit backlog; woken by
 * kauditd_thread() after each pass over the queues */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);

/* Current audit feature state: sent back as-is by audit_get_feature() and
 * tested bit-by-bit by is_audit_feature_set(). */
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
                                   .mask = -1,
                                   .features = 0,
                                   .lock = 0,};

/* Feature names indexed by feature number; used when logging a feature
 * change (see audit_log_feature_change()). */
static char *audit_feature_names[2] = {
        "only_unset_loginuid",
        "loginuid_immutable",
};

/**
 * struct audit_ctl_mutex - serialize requests from userspace
 * @lock: the mutex used for locking
 * @owner: the task which owns the lock
 *
 * Description:
 * This is the lock struct used to ensure we only process userspace requests
 * in an orderly fashion.  We can't simply use a mutex/lock here because we
 * need to track lock ownership so we don't end up blocking the lock owner in
 * audit_log_start() or similar.
 */
static struct audit_ctl_mutex {
        struct mutex lock;      /* taken in audit_ctl_lock(), released in audit_ctl_unlock() */
        void *owner;            /* task holding @lock (set to current), NULL once released */
} audit_cmd_mutex;

/* AUDIT_BUFSIZ is the size, in bytes, of the temporary buffer used for
 * formatting audit records.  Since printk uses a 1024 byte buffer, this
 * buffer should be at least that large. */
#define AUDIT_BUFSIZ 1024

/* The audit_buffer is used when formatting an audit record.  The caller
 * locks briefly to get the record off the freelist or to allocate the
 * buffer, and locks briefly to send the buffer to the netlink layer or
 * to place it on a transmit queue.  Multiple audit_buffers can be in
 * use simultaneously. */
struct audit_buffer {
        struct sk_buff       *skb;        /* formatted skb ready to send */
        struct audit_context *ctx;        /* NULL or associated context */
        gfp_t                     gfp_mask; /* allocation flags; presumably those given to audit_log_start() - confirm */
};

/* State handed from audit_send_reply() to audit_send_reply_thread(); released
 * with audit_free_reply(). */
struct audit_reply {
        __u32 portid;           /* netlink portid of the requester */
        struct net *net;        /* requester's netns; reference dropped in audit_free_reply() */
        struct sk_buff *skb;    /* the reply message built by audit_make_reply() */
};

/**
 * auditd_test_task - Check to see if a given task is an audit daemon
 * @task: the task to check
 *
 * Description:
 * Return 1 if the task is a registered audit daemon, 0 otherwise.
 */
int auditd_test_task(struct task_struct *task)
{
        struct auditd_connection *conn;
        int is_auditd = 0;

        /* the connection struct is only stable inside the RCU section */
        rcu_read_lock();
        conn = rcu_dereference(auditd_conn);
        if (conn && conn->pid == task_tgid(task))
                is_auditd = 1;
        rcu_read_unlock();

        return is_auditd;
}

/**
 * audit_ctl_lock - Take the audit control lock
 */
void audit_ctl_lock(void)
{
        mutex_lock(&audit_cmd_mutex.lock);
        /* record the owner only once the mutex is held; this is the value
         * audit_ctl_owner_current() compares against */
        audit_cmd_mutex.owner = current;
}

/**
 * audit_ctl_unlock - Drop the audit control lock
 */
void audit_ctl_unlock(void)
{
        /* clear the owner while the mutex is still held, then release it */
        audit_cmd_mutex.owner = NULL;
        mutex_unlock(&audit_cmd_mutex.lock);
}

/**
 * audit_ctl_owner_current - Test to see if the current task owns the lock
 *
 * Description:
 * Return true if the current task owns the audit control lock, false if it
 * doesn't own the lock.
 */
static bool audit_ctl_owner_current(void)
{
        /* the owner field is set to the locking task in audit_ctl_lock() */
        return audit_cmd_mutex.owner == current;
}

/**
 * auditd_pid_vnr - Return the auditd PID relative to the namespace
 *
 * Description:
 * Returns the PID in relation to the namespace, 0 on failure.
 */
static pid_t auditd_pid_vnr(void)
{
        const struct auditd_connection *conn;
        pid_t vnr = 0;

        rcu_read_lock();
        conn = rcu_dereference(auditd_conn);
        /* no connection, or a connection without a pid, reports 0 */
        if (conn && conn->pid)
                vnr = pid_vnr(conn->pid);
        rcu_read_unlock();

        return vnr;
}

/**
 * audit_get_sk - Return the audit socket for the given network namespace
 * @net: the destination network namespace
 *
 * Description:
 * Returns the sock pointer if valid, NULL otherwise.  The caller must ensure
 * that a reference is held for the network namespace while the sock is in use.
 */
static struct sock *audit_get_sk(const struct net *net)
{
        struct audit_net *pernet;

        if (net) {
                /* look up the audit per-namespace data and hand back its sock */
                pernet = net_generic(net, audit_net_id);
                return pernet->sk;
        }

        return NULL;
}

void audit_panic(const char *message)
{
        switch (audit_failure) {
        case AUDIT_FAIL_SILENT:
                break;
        case AUDIT_FAIL_PRINTK:
                if (printk_ratelimit())
                        pr_err("%s\n", message);
                break;
        case AUDIT_FAIL_PANIC:
                panic("audit: %s\n", message);
                break;
        }
}

/*
 * Decide whether another record may be sent under audit_rate_limit.
 * Returns 1 when the record is allowed, 0 when it should be suppressed.
 * A limit of zero disables the check entirely.
 */
static inline int audit_rate_check(void)
{
        static unsigned long        window_start;
        static int                sent;
        static DEFINE_SPINLOCK(lock);
        unsigned long                irqflags;
        unsigned long                now;
        int                        allow = 0;

        if (!audit_rate_limit)
                return 1;

        spin_lock_irqsave(&lock, irqflags);
        if (++sent < audit_rate_limit) {
                allow = 1;
        } else {
                /* over the limit: only allow once more than HZ jiffies
                 * have passed since the counter was last reset */
                now = jiffies;
                if (now - window_start > HZ) {
                        window_start = now;
                        sent = 0;
                        allow = 1;
                }
        }
        spin_unlock_irqrestore(&lock, irqflags);

        return allow;
}

/**
 * audit_log_lost - conditionally log lost audit message event
 * @message: the message stating reason for lost audit message
 *
 * Emit at least 1 message per second, even if audit_rate_check is
 * throttling.
 * Always increment the lost messages counter.
*/
void audit_log_lost(const char *message)
{
        static unsigned long        last_report;
        static DEFINE_SPINLOCK(lock);
        unsigned long                irqflags;
        unsigned long                now;
        int                        report = 0;

        /* the lost counter is bumped unconditionally */
        atomic_inc(&audit_lost);

        if (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit) {
                report = 1;
        } else {
                /* rate limiting is on: report only if more than HZ jiffies
                 * have passed since the previous report */
                spin_lock_irqsave(&lock, irqflags);
                now = jiffies;
                if (now - last_report > HZ) {
                        report = 1;
                        last_report = now;
                }
                spin_unlock_irqrestore(&lock, irqflags);
        }

        if (!report)
                return;

        if (printk_ratelimit())
                pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
                        atomic_read(&audit_lost),
                        audit_rate_limit,
                        audit_backlog_limit);
        audit_panic(message);
}

/*
 * Emit an AUDIT_CONFIG_CHANGE record describing an attempted change of
 * @function_name from @old to @new.  Returns 0 on success (or when no
 * buffer could be obtained), otherwise the error from
 * audit_log_task_context(); in that case the record is logged with res=0.
 */
static int audit_log_config_change(char *function_name, u32 new, u32 old,
                                   int allow_changes)
{
        struct audit_buffer *buf;
        int err;

        buf = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!buf))
                return 0;

        audit_log_format(buf, "op=set %s=%u old=%u ", function_name, new, old);
        audit_log_session_info(buf);
        err = audit_log_task_context(buf);
        /* something weird happened fetching the task context: deny request */
        audit_log_format(buf, " res=%d", err ? 0 : allow_changes);
        audit_log_end(buf);

        return err;
}

/*
 * Common helper for the audit_set_*() functions: log the requested change
 * (unless auditing is off) and apply it to *@to_change when permitted.
 * Returns 0 on success, -EPERM when the configuration is locked, or the
 * error reported while logging the change.
 */
static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
        u32 old = *to_change;
        /* a locked configuration refuses every change */
        int permitted = (audit_enabled != AUDIT_LOCKED);
        int rc = 0;

        if (audit_enabled != AUDIT_OFF) {
                rc = audit_log_config_change(function_name, new, old, permitted);
                if (rc)
                        permitted = 0;
        }

        /* not allowed: report the logging error if any, else -EPERM */
        if (!permitted)
                return rc ? rc : -EPERM;

        *to_change = new;
        return rc;
}

/* Set audit_rate_limit through the common logged/checked config path. */
static int audit_set_rate_limit(u32 limit)
{
        u32 *target = &audit_rate_limit;

        return audit_do_config_change("audit_rate_limit", target, limit);
}

/* Set audit_backlog_limit through the common logged/checked config path. */
static int audit_set_backlog_limit(u32 limit)
{
        u32 *target = &audit_backlog_limit;

        return audit_do_config_change("audit_backlog_limit", target, limit);
}

/* Set audit_backlog_wait_time through the common logged/checked config path. */
static int audit_set_backlog_wait_time(u32 timeout)
{
        u32 *target = &audit_backlog_wait_time;

        return audit_do_config_change("audit_backlog_wait_time", target, timeout);
}

/*
 * Change the audit_enabled state.  Values above AUDIT_LOCKED are rejected
 * with -EINVAL; on success audit_ever_enabled latches any non-zero state.
 */
static int audit_set_enabled(u32 state)
{
        int rc;

        if (state > AUDIT_LOCKED)
                return -EINVAL;

        rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
        if (rc)
                return rc;

        audit_ever_enabled |= !!state;
        return 0;
}

/*
 * Change the audit_failure mode.  Only AUDIT_FAIL_SILENT, AUDIT_FAIL_PRINTK
 * and AUDIT_FAIL_PANIC are accepted; anything else yields -EINVAL.
 */
static int audit_set_failure(u32 state)
{
        switch (state) {
        case AUDIT_FAIL_SILENT:
        case AUDIT_FAIL_PRINTK:
        case AUDIT_FAIL_PANIC:
                break;
        default:
                return -EINVAL;
        }

        return audit_do_config_change("audit_failure", &audit_failure, state);
}

/**
 * auditd_conn_free - RCU helper to release an auditd connection struct
 * @rcu: RCU head
 *
 * Description:
 * Drop any references inside the auditd connection tracking struct and free
 * the memory.
 */
static void auditd_conn_free(struct rcu_head *rcu)
{
        struct auditd_connection *conn =
                container_of(rcu, struct auditd_connection, rcu);

        /* drop the pid and netns references taken in auditd_set() */
        put_pid(conn->pid);
        put_net(conn->net);
        kfree(conn);
}

/**
 * auditd_set - Set/Reset the auditd connection state
 * @pid: auditd PID
 * @portid: auditd netlink portid
 * @net: auditd network namespace pointer
 * @skb: the netlink command from the audit daemon
 * @ack: netlink ack flag, cleared if ack'd here
 *
 * Description:
 * This function will obtain and drop network namespace references as
 * necessary.  Returns zero on success, negative values on failure.
 */
static int auditd_set(struct pid *pid, u32 portid, struct net *net,
                      struct sk_buff *skb, bool *ack)
{
        unsigned long irqflags;
        struct auditd_connection *prev, *next;

        if (!pid || !net)
                return -EINVAL;

        /* build the replacement connection before touching shared state */
        next = kzalloc(sizeof(*next), GFP_KERNEL);
        if (!next)
                return -ENOMEM;
        next->pid = get_pid(pid);
        next->portid = portid;
        next->net = get_net(net);

        /* send the ack now to avoid a race with the queue backlog */
        if (*ack) {
                netlink_ack(skb, nlmsg_hdr(skb), 0, NULL);
                *ack = false;
        }

        /* publish the new connection, remembering the one it replaces */
        spin_lock_irqsave(&auditd_conn_lock, irqflags);
        prev = rcu_dereference_protected(auditd_conn,
                                         lockdep_is_held(&auditd_conn_lock));
        rcu_assign_pointer(auditd_conn, next);
        spin_unlock_irqrestore(&auditd_conn_lock, irqflags);

        /* the old connection is released via call_rcu() */
        if (prev)
                call_rcu(&prev->rcu, auditd_conn_free);

        return 0;
}

/**
 * kauditd_printk_skb - Print the audit record to the ring buffer
 * @skb: audit record
 *
 * Whatever the reason, this packet may not make it to the auditd connection
 * so write it via printk so the information isn't completely lost.
 */
static void kauditd_printk_skb(struct sk_buff *skb)
{
        struct nlmsghdr *hdr = nlmsg_hdr(skb);
        char *payload = nlmsg_data(hdr);

        /* AUDIT_EOE records are never printed */
        if (hdr->nlmsg_type == AUDIT_EOE)
                return;

        if (!printk_ratelimit())
                return;

        pr_notice("type=%d %s\n", hdr->nlmsg_type, payload);
}

/**
 * kauditd_rehold_skb - Handle an audit record send failure in the hold queue
 * @skb: audit record
 * @error: error code (unused)
 *
 * Description:
 * This should only be used by the kauditd_thread when it fails to flush the
 * hold queue.
 */
static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
        struct sk_buff_head *hold = &audit_hold_queue;

        /* the record goes back onto the tail of the hold queue */
        skb_queue_tail(hold, skb);
}

/**
 * kauditd_hold_skb - Queue an audit record, waiting for auditd
 * @skb: audit record
 * @error: error code
 *
 * Description:
 * Queue the audit record, waiting for an instance of auditd.  When this
 * function is called we haven't given up yet on sending the record, but things
 * are not looking good.  The first thing we want to do is try to write the
 * record via printk and then see if we want to try and hold on to the record
 * and queue it, if we have room.  If we want to hold on to the record, but we
 * don't have room, record a record lost message.
 */
static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
        struct sk_buff_head *dest;
        const char *overflow_msg;

        /* it is uncertain whether this record will ever reach auditd, so
         * push it out via printk before doing anything else */
        kauditd_printk_skb(skb);

        /* with audit_default unset the record can be dropped silently */
        if (!audit_default) {
                kfree_skb(skb);
                return;
        }

        /* the hold queue is for a daemon that has gone away completely;
         * -EAGAIN failures are requeued on the retry queue instead */
        if (error == -EAGAIN) {
                dest = &audit_retry_queue;
                overflow_msg = "kauditd retry queue overflow";
        } else {
                dest = &audit_hold_queue;
                overflow_msg = "kauditd hold queue overflow";
        }

        /* queue the record if the chosen queue has room (0 == unlimited) */
        if (!audit_backlog_limit ||
            skb_queue_len(dest) < audit_backlog_limit) {
                skb_queue_tail(dest, skb);
                return;
        }

        /* no room left - account for the loss and drop the record */
        audit_log_lost(overflow_msg);
        kfree_skb(skb);
}

/**
 * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
 * @skb: audit record
 * @error: error code (unused)
 *
 * Description:
 * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
 * but for some reason we are having problems sending it audit records so
 * queue the given record and attempt to resend.
 */
static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
        /* a backlog limit of zero means the retry queue is unbounded */
        bool has_room = !audit_backlog_limit ||
                        skb_queue_len(&audit_retry_queue) < audit_backlog_limit;

        if (!has_room) {
                /* the record must be dropped; printk it as a last effort */
                kauditd_printk_skb(skb);
                audit_log_lost("kauditd retry queue overflow");
                kfree_skb(skb);
                return;
        }

        skb_queue_tail(&audit_retry_queue, skb);
}

/**
 * auditd_reset - Disconnect the auditd connection
 * @ac: auditd connection state
 *
 * Description:
 * Break the auditd/kauditd connection and move all the queued records into the
 * hold queue in case auditd reconnects.  It is important to note that the @ac
 * pointer should never be dereferenced inside this function as it may be NULL
 * or invalid, you can only compare the memory address!  If @ac is NULL then
 * the connection will always be reset.
 */
static void auditd_reset(const struct auditd_connection *ac)
{
        unsigned long irqflags;
        struct sk_buff *skb;
        struct auditd_connection *cur;
        bool replaced;

        /* @ac is only ever compared by address, never dereferenced */
        spin_lock_irqsave(&auditd_conn_lock, irqflags);
        cur = rcu_dereference_protected(auditd_conn,
                                        lockdep_is_held(&auditd_conn_lock));
        /* a different, newer connection has already been registered */
        replaced = ac && ac != cur;
        if (!replaced)
                rcu_assign_pointer(auditd_conn, NULL);
        spin_unlock_irqrestore(&auditd_conn_lock, irqflags);

        if (replaced)
                return;

        if (cur)
                call_rcu(&cur->rcu, auditd_conn_free);

        /* move the retry queue over to the hold queue; the main queue is
         * left alone since it still needs normal multicast processing */
        for (;;) {
                skb = skb_dequeue(&audit_retry_queue);
                if (!skb)
                        break;
                kauditd_hold_skb(skb, -ECONNREFUSED);
        }
}

/**
 * auditd_send_unicast_skb - Send a record via unicast to auditd
 * @skb: audit record
 *
 * Description:
 * Send a skb to the audit daemon, returns positive/zero values on success and
 * negative values on failure; in all cases the skb will be consumed by this
 * function.  If the send results in -ECONNREFUSED the connection with auditd
 * will be reset.  This function may sleep so callers should not hold any locks
 * where this would cause a problem.
 */
static int auditd_send_unicast_skb(struct sk_buff *skb)
{
        int rc;
        u32 portid;
        struct net *net;
        struct sock *sk;
        struct auditd_connection *ac;

        /* NOTE: we can't call netlink_unicast while in the RCU section so
         *       take a reference to the network namespace and grab local
         *       copies of the namespace, the sock, and the portid; the
         *       namespace and sock aren't going to go away while we hold a
         *       reference and if the portid does become invalid after the RCU
         *       section netlink_unicast() should safely return an error */

        rcu_read_lock();
        ac = rcu_dereference(auditd_conn);
        if (!ac) {
                /* no auditd registered: free the skb ourselves and report
                 * -ECONNREFUSED; ac is NULL so the err path does no reset */
                rcu_read_unlock();
                kfree_skb(skb);
                rc = -ECONNREFUSED;
                goto err;
        }
        net = get_net(ac->net);
        sk = audit_get_sk(net);
        portid = ac->portid;
        rcu_read_unlock();

        /* send outside the RCU section using the local copies taken above */
        rc = netlink_unicast(sk, skb, portid, 0);
        put_net(net);
        if (rc < 0)
                goto err;

        return rc;

err:
        /* ac is used past the RCU section here, but only as an address:
         * auditd_reset() compares it and never dereferences it */
        if (ac && rc == -ECONNREFUSED)
                auditd_reset(ac);
        return rc;
}

/**
 * kauditd_send_queue - Helper for kauditd_thread to flush skb queues
 * @sk: the sending sock
 * @portid: the netlink destination
 * @queue: the skb queue to process
 * @retry_limit: limit on number of netlink unicast failures
 * @skb_hook: per-skb hook for additional processing
 * @err_hook: hook called if the skb fails the netlink unicast send
 *
 * Description:
 * Run through the given queue and attempt to send the audit records to auditd,
 * returns zero on success, negative values on failure.  It is up to the caller
 * to ensure that the @sk is valid for the duration of this function.
 *
 */
static int kauditd_send_queue(struct sock *sk, u32 portid,
                              struct sk_buff_head *queue,
                              unsigned int retry_limit,
                              void (*skb_hook)(struct sk_buff *skb),
                              void (*err_hook)(struct sk_buff *skb, int error))
{
        int rc = 0;
        struct sk_buff *skb = NULL;
        struct sk_buff *skb_tail;
        unsigned int failed = 0;

        /* NOTE: kauditd_thread takes care of all our locking, we just use
         *       the netlink info passed to us (e.g. sk and portid) */

        /* remember the tail as it is now and stop after processing it, so
         * records queued behind it during this pass (e.g. by an err_hook
         * that requeues onto the same queue) are left for a later pass;
         * an empty queue gives skb == skb_tail == NULL and skips the loop */
        skb_tail = skb_peek_tail(queue);
        while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
                /* call the skb_hook for each skb we touch */
                if (skb_hook)
                        (*skb_hook)(skb);

                /* can we send to anyone via unicast? */
                if (!sk) {
                        /* no usable sock (none passed in, or cleared after
                         * an earlier failure below): hand the record to the
                         * err_hook without attempting a send */
                        if (err_hook)
                                (*err_hook)(skb, -ECONNREFUSED);
                        continue;
                }

retry:
                /* grab an extra skb reference in case of error */
                /* NOTE(review): this presumes netlink_unicast() drops one
                 * reference on failure as well, leaving ours for err_hook
                 * or the next retry - confirm against netlink_unicast() */
                skb_get(skb);
                rc = netlink_unicast(sk, skb, portid, 0);
                if (rc < 0) {
                        /* send failed - try a few times unless fatal error */
                        /* "failed" is only reset on a successful send, so it
                         * counts consecutive failures across records */
                        if (++failed >= retry_limit ||
                            rc == -ECONNREFUSED || rc == -EPERM) {
                                /* give up on unicast for the rest of this
                                 * pass; remaining records take the !sk path */
                                sk = NULL;
                                if (err_hook)
                                        (*err_hook)(skb, rc);
                                /* -EAGAIN is not reported to the caller as
                                 * a failure */
                                if (rc == -EAGAIN)
                                        rc = 0;
                                /* continue to drain the queue */
                                continue;
                        } else
                                goto retry;
                } else {
                        /* skb sent - drop the extra reference and continue */
                        consume_skb(skb);
                        failed = 0;
                }
        }

        /* non-negative send results collapse to 0; a negative rc left by a
         * give-up above is returned as-is */
        return (rc >= 0 ? 0 : rc);
}

/*
 * kauditd_send_multicast_skb - Send a record to any multicast listeners
 * @skb: audit record
 *
 * Description:
 * Write a multicast message to anyone listening in the initial network
 * namespace.  This function doesn't consume an skb as might be expected since
 * it has to copy it anyways.
 */
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
        struct sk_buff *copy;
        struct sock *sock = audit_get_sk(&init_net);
        struct nlmsghdr *nlh;

        /* NOTE: we are not taking an additional reference for init_net since
         *       we don't have to worry about it going away */

        if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
                return;

        /*
         * The seemingly wasteful skb_copy() rather than bumping the refcount
         * using skb_get() is necessary because non-standard mods are made to
         * the skb by the original kaudit unicast socket send routine.  The
         * existing auditd daemon assumes this breakage.  Fixing this would
         * require co-ordinating a change in the established protocol between
         * the kaudit kernel subsystem and the auditd userspace code.  There is
         * no reason for new multicast clients to continue with this
         * non-compliance.
         */
        copy = skb_copy(skb, GFP_KERNEL);
        if (!copy)
                return;
        nlh = nlmsg_hdr(copy);
        nlh->nlmsg_len = skb->len;

        nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}

/**
 * kauditd_thread - Worker thread to send audit records to userspace
 * @dummy: unused
 */
static int kauditd_thread(void *dummy)
{
        int rc;
        u32 portid = 0;
        struct net *net = NULL;
        struct sock *sk = NULL;
        struct auditd_connection *ac;

/* retry limit handed to kauditd_send_queue() for the hold and retry queues */
#define UNICAST_RETRIES 5

        set_freezable();
        while (!kthread_should_stop()) {
                /* NOTE: see the lock comments in auditd_send_unicast_skb() */
                rcu_read_lock();
                ac = rcu_dereference(auditd_conn);
                if (!ac) {
                        /* no auditd: skip the hold/retry flush; sk is NULL
                         * here (initial value, or cleared at the end of the
                         * previous iteration) */
                        rcu_read_unlock();
                        goto main_queue;
                }
                /* take local copies plus a netns reference so the sends
                 * below can run outside the RCU section */
                net = get_net(ac->net);
                sk = audit_get_sk(net);
                portid = ac->portid;
                rcu_read_unlock();

                /* attempt to flush the hold queue */
                rc = kauditd_send_queue(sk, portid,
                                        &audit_hold_queue, UNICAST_RETRIES,
                                        NULL, kauditd_rehold_skb);
                if (rc < 0) {
                        /* ac is passed by address only; auditd_reset()
                         * compares it and never dereferences it */
                        sk = NULL;
                        auditd_reset(ac);
                        goto main_queue;
                }

                /* attempt to flush the retry queue */
                rc = kauditd_send_queue(sk, portid,
                                        &audit_retry_queue, UNICAST_RETRIES,
                                        NULL, kauditd_hold_skb);
                if (rc < 0) {
                        sk = NULL;
                        auditd_reset(ac);
                        goto main_queue;
                }

main_queue:
                /* process the main queue - do the multicast send and attempt
                 * unicast, dump failed record sends to the retry queue; if
                 * sk == NULL due to previous failures we will just do the
                 * multicast send and move the record to the hold queue */
                rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
                                        kauditd_send_multicast_skb,
                                        (sk ?
                                         kauditd_retry_skb : kauditd_hold_skb));
                if (ac && rc < 0)
                        auditd_reset(ac);
                /* forget the sock so the next iteration starts clean */
                sk = NULL;

                /* drop our netns reference, no auditd sends past this line */
                if (net) {
                        put_net(net);
                        net = NULL;
                }

                /* we have processed all the queues so wake everyone */
                wake_up(&audit_backlog_wait);

                /* NOTE: we want to wake up if there is anything on the queue,
                 *       regardless of if an auditd is connected, as we need to
                 *       do the multicast send and rotate records from the
                 *       main queue to the retry/hold queues */
                wait_event_freezable(kauditd_wait,
                                     (skb_queue_len(&audit_queue) ? 1 : 0));
        }

        return 0;
}

/*
 * Thread body that sends every skb queued on an audit_netlink_list to the
 * list's portid, then drops the list's netns reference and frees the list.
 * Always returns 0.
 */
int audit_send_list_thread(void *_dest)
{
        struct audit_netlink_list *list = _dest;
        struct sock *sk = audit_get_sk(list->net);
        struct sk_buff *skb;

        /* wait for parent to finish and send an ACK */
        audit_ctl_lock();
        audit_ctl_unlock();

        for (;;) {
                skb = __skb_dequeue(&list->q);
                if (skb == NULL)
                        break;
                netlink_unicast(sk, skb, list->portid, 0);
        }

        put_net(list->net);
        kfree(list);

        return 0;
}

/*
 * Build a netlink reply skb carrying @size bytes copied from @payload.
 * The message type is NLMSG_DONE when @done is set, otherwise @type;
 * NLM_F_MULTI is set when @multi is non-zero.  Returns the new skb, or
 * NULL if allocation or header construction fails.
 */
struct sk_buff *audit_make_reply(int seq, int type, int done,
                                 int multi, const void *payload, int size)
{
        struct sk_buff        *skb;
        struct nlmsghdr        *nlh;

        skb = nlmsg_new(size, GFP_KERNEL);
        if (!skb)
                return NULL;

        nlh = nlmsg_put(skb, 0, seq, done ? NLMSG_DONE : type, size,
                        multi ? NLM_F_MULTI : 0);
        if (!nlh) {
                kfree_skb(skb);
                return NULL;
        }

        memcpy(nlmsg_data(nlh), payload, size);
        return skb;
}

/* Release an audit_reply: its skb, its netns reference (if any), and itself. */
static void audit_free_reply(struct audit_reply *reply)
{
        if (reply) {
                kfree_skb(reply->skb);
                if (reply->net)
                        put_net(reply->net);
                kfree(reply);
        }
}

/*
 * Thread body started by audit_send_reply(): waits for the audit control
 * lock to be released, sends the prepared reply, then frees the reply state.
 */
static int audit_send_reply_thread(void *arg)
{
        struct audit_reply *reply = arg;
        struct sock *sk;

        /* taking and dropping the control lock waits out its current holder */
        audit_ctl_lock();
        audit_ctl_unlock();

        /* Ignore failure. It'll only happen if the sender goes away,
           because our timeout is set to infinite. */
        sk = audit_get_sk(reply->net);
        netlink_unicast(sk, reply->skb, reply->portid, 0);

        /* clear the pointer so audit_free_reply() does not free the skb a
         * second time (presumably netlink_unicast() consumed it - confirm) */
        reply->skb = NULL;
        audit_free_reply(reply);
        return 0;
}

/**
 * audit_send_reply - send an audit reply message via netlink
 * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: sequence number
 * @type: audit message type
 * @done: done (last) flag
 * @multi: multi-part message flag
 * @payload: payload data
 * @size: payload size
 *
 * Allocates a skb, builds the netlink message, and sends it to the port id.
 */
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
                             int multi, const void *payload, int size)
{
        struct audit_reply *reply;
        struct task_struct *worker;

        reply = kzalloc(sizeof(*reply), GFP_KERNEL);
        if (!reply)
                return;

        reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
        if (reply->skb) {
                /* target the reply at the requester's netns and portid */
                reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
                reply->portid = NETLINK_CB(request_skb).portid;

                /* on success the thread owns and eventually frees the reply */
                worker = kthread_run(audit_send_reply_thread, reply,
                                     "audit_send_reply");
                if (!IS_ERR(worker))
                        return;
        }

        /* message build or thread creation failed: clean up here */
        audit_free_reply(reply);
}

/*
 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
 * control messages.
 */
static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
        /* 0 means the message is permitted; otherwise a negative errno */
        int err = 0;

        /* Only support initial user namespace for now. */
        /*
         * We return ECONNREFUSED because it tricks userspace into thinking
         * that audit was not configured into the kernel.  Lots of users
         * configure their PAM stack (because that's what the distro does)
         * to reject login if unable to send messages to audit.  If we return
         * ECONNREFUSED the PAM stack thinks the kernel does not have audit
         * configured in and will let login proceed.  If we return EPERM
         * userspace will reject all logins.  This should be removed when we
         * support non init namespaces!!
         */
        if (current_user_ns() != &init_user_ns)
                return -ECONNREFUSED;

        switch (msg_type) {
        /* these message types are rejected outright */
        case AUDIT_LIST:
        case AUDIT_ADD:
        case AUDIT_DEL:
                return -EOPNOTSUPP;
        /* control messages: initial pid namespace plus CAP_AUDIT_CONTROL */
        case AUDIT_GET:
        case AUDIT_SET:
        case AUDIT_GET_FEATURE:
        case AUDIT_SET_FEATURE:
        case AUDIT_LIST_RULES:
        case AUDIT_ADD_RULE:
        case AUDIT_DEL_RULE:
        case AUDIT_SIGNAL_INFO:
        case AUDIT_TTY_GET:
        case AUDIT_TTY_SET:
        case AUDIT_TRIM:
        case AUDIT_MAKE_EQUIV:
                /* Only support auditd and auditctl in initial pid namespace
                 * for now. */
                if (task_active_pid_ns(current) != &init_pid_ns)
                        return -EPERM;

                if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
                        err = -EPERM;
                break;
        /* user-originated records: CAP_AUDIT_WRITE is sufficient */
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
                if (!netlink_capable(skb, CAP_AUDIT_WRITE))
                        err = -EPERM;
                break;
        default:  /* bad msg */
                err = -EINVAL;
        }

        return err;
}

/*
 * Start an audit record of @msg_type for a received netlink message and
 * fill in the common prefix (pid, uid, session info, task context).  The
 * buffer is returned through @ab; it is NULL when auditing is disabled
 * (AUDIT_USER_AVC is exempt from that check) or when no buffer could be
 * started.
 */
static void audit_log_common_recv_msg(struct audit_context *context,
                                        struct audit_buffer **ab, u16 msg_type)
{
        struct audit_buffer *buf = NULL;
        uid_t uid;
        pid_t pid;

        if (audit_enabled || msg_type == AUDIT_USER_AVC)
                buf = audit_log_start(context, GFP_KERNEL, msg_type);

        *ab = buf;
        if (unlikely(!buf))
                return;

        uid = from_kuid(&init_user_ns, current_uid());
        pid = task_tgid_nr(current);
        audit_log_format(buf, "pid=%d uid=%u ", pid, uid);
        audit_log_session_info(buf);
        audit_log_task_context(buf);
}

/* Same as audit_log_common_recv_msg(), but with no audit context attached. */
static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
                                           u16 msg_type)
{
        struct audit_context *no_context = NULL;

        audit_log_common_recv_msg(no_context, ab, msg_type);
}

int is_audit_feature_set(int i)
{
        return af.features & AUDIT_FEATURE_TO_MASK(i);
}


/*
 * Answer an AUDIT_GET_FEATURE request by replying with the current feature
 * state, reusing the request's sequence number.  Always returns 0.
 */
static int audit_get_feature(struct sk_buff *skb)
{
        u32 seq = nlmsg_hdr(skb)->nlmsg_seq;

        audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));

        return 0;
}

/*
 * Emit an AUDIT_FEATURE_CHANGE record for feature index @which.
 *
 * The old/new feature and lock values are logged as booleans (0 or 1) and
 * @res is logged verbatim.  Nothing is logged while auditing is off or when
 * no audit buffer could be obtained.
 */
static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
                                     u32 old_lock, u32 new_lock, int res)
{
        struct audit_buffer *buf;

        if (audit_enabled == AUDIT_OFF)
                return;

        buf = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
        if (buf) {
                audit_log_task_info(buf);
                audit_log_format(buf, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
                                 audit_feature_names[which],
                                 !!old_feature, !!new_feature,
                                 !!old_lock, !!new_lock,
                                 res);
                audit_log_end(buf);
        }
}

/*
 * Apply a feature change request.
 *
 * Only the features selected by @uaf->mask are considered.  The request is
 * processed in two passes so that it is all-or-nothing: the first pass
 * rejects the whole request with -EPERM (after logging the attempt) if any
 * selected feature is locked and would change value; the second pass logs
 * and applies the changes and accumulates any newly requested locks.
 *
 * Returns 0 on success or -EPERM when a locked feature would be changed.
 */
static int audit_set_feature(struct audit_features *uaf)
{
        int idx;

        BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));

        /* if there is ever a version 2 we should handle that here */

        /* pass 1: validation only, nothing is modified yet */
        for (idx = 0; idx <= AUDIT_LAST_FEATURE; idx++) {
                u32 bit = AUDIT_FEATURE_TO_MASK(idx);
                u32 was_on, want_on, was_locked, will_lock;

                /* feature not selected by the request */
                if ((bit & uaf->mask) == 0)
                        continue;

                was_on = af.features & bit;
                want_on = uaf->features & bit;
                was_locked = af.lock & bit;
                will_lock = (uaf->lock | af.lock) & bit;

                /* unlocked, or locked but left at its current value: fine */
                if (!was_locked || want_on == was_on)
                        continue;

                audit_log_feature_change(idx, was_on, want_on,
                                         was_locked, will_lock, 0);
                return -EPERM;
        }

        /* pass 2: the request is valid, log and commit every change */
        for (idx = 0; idx <= AUDIT_LAST_FEATURE; idx++) {
                u32 bit = AUDIT_FEATURE_TO_MASK(idx);
                u32 was_on, want_on, was_locked, will_lock;

                /* feature not selected by the request */
                if ((bit & uaf->mask) == 0)
                        continue;

                was_on = af.features & bit;
                want_on = uaf->features & bit;
                was_locked = af.lock & bit;
                will_lock = (uaf->lock | af.lock) & bit;

                if (want_on != was_on)
                        audit_log_feature_change(idx, was_on, want_on,
                                                 was_locked, will_lock, 1);

                if (want_on)
                        af.features |= bit;
                else
                        af.features &= ~bit;
                af.lock |= will_lock;
        }

        return 0;
}

/*
 * Build an AUDIT_REPLACE message carrying pid_vnr(@pid) and unicast it via
 * auditd_send_unicast_skb().
 *
 * Returns -ENOMEM if the message could not be built, otherwise whatever
 * auditd_send_unicast_skb() returns.
 */
static int audit_replace(struct pid *pid)
{
        pid_t pvnr = pid_vnr(pid);
        struct sk_buff *reply;

        reply = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
        return reply ? auditd_send_unicast_skb(reply) : -ENOMEM;
}

/**
 * audit_receive_msg - handle a single audit netlink message
 * @skb: the socket buffer the message arrived in
 * @nlh: header of the message to process
 * @ack: handed to auditd_set() when a new auditd registers
 *       (NOTE(review): presumably lets auditd_set() deal with the ACK
 *       itself - confirm against auditd_set())
 *
 * The sender is first vetted by audit_netlink_ok(); the message is then
 * dispatched on nlh->nlmsg_type.
 *
 * Returns a negative errno on failure and 0 otherwise, with two exceptions
 * in the AUDIT_SET handler: a request whose mask is exactly
 * AUDIT_STATUS_LOST or AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL returns the
 * counter value that was just reset to zero.
 */
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             bool *ack)
{
        u32                        seq;
        void                        *data;
        int                        data_len;
        int                        err;
        struct audit_buffer        *ab;
        u16                        msg_type = nlh->nlmsg_type;
        struct audit_sig_info   *sig_data;
        char                        *ctx = NULL;
        u32                        len;

        /* permission check; on success err is 0 when the switch is entered */
        err = audit_netlink_ok(skb, msg_type);
        if (err)
                return err;

        seq  = nlh->nlmsg_seq;
        data = nlmsg_data(nlh);
        data_len = nlmsg_len(nlh);

        switch (msg_type) {
        case AUDIT_GET: {
                /* snapshot the current audit status and send it back */
                struct audit_status        s;
                memset(&s, 0, sizeof(s));
                s.enabled                   = audit_enabled;
                s.failure                   = audit_failure;
                /* NOTE: use pid_vnr() so the PID is relative to the current
                 *       namespace */
                s.pid                           = auditd_pid_vnr();
                s.rate_limit                   = audit_rate_limit;
                s.backlog_limit                   = audit_backlog_limit;
                s.lost                           = atomic_read(&audit_lost);
                s.backlog                   = skb_queue_len(&audit_queue);
                s.feature_bitmap           = AUDIT_FEATURE_BITMAP_ALL;
                s.backlog_wait_time           = audit_backlog_wait_time;
                s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
                audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
                break;
        }
        case AUDIT_SET: {
                /* each bit in s.mask selects one setting to update; the
                 * first failing update aborts the rest of the request */
                struct audit_status        s;
                memset(&s, 0, sizeof(s));
                /* guard against past and future API changes */
                /* copies at most sizeof(s) bytes; fields the sender did not
                 * supply keep the zero from the memset() above */
                memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
                if (s.mask & AUDIT_STATUS_ENABLED) {
                        err = audit_set_enabled(s.enabled);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_FAILURE) {
                        err = audit_set_failure(s.failure);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_PID) {
                        /* NOTE: we are using the vnr PID functions below
                         *       because the s.pid value is relative to the
                         *       namespace of the caller; at present this
                         *       doesn't matter much since you can really only
                         *       run auditd from the initial pid namespace, but
                         *       something to keep in mind if this changes */
                        pid_t new_pid = s.pid;
                        pid_t auditd_pid;
                        struct pid *req_pid = task_tgid(current);

                        /* Sanity check - PID values must match. Setting
                         * pid to 0 is how auditd ends auditing. */
                        if (new_pid && (new_pid != pid_vnr(req_pid)))
                                return -EINVAL;

                        /* test the auditd connection */
                        /* the return value is deliberately not checked here;
                         * the registered PID is re-read just below */
                        audit_replace(req_pid);

                        auditd_pid = auditd_pid_vnr();
                        if (auditd_pid) {
                                /* replacing a healthy auditd is not allowed */
                                if (new_pid) {
                                        audit_log_config_change("audit_pid",
                                                        new_pid, auditd_pid, 0);
                                        return -EEXIST;
                                }
                                /* only current auditd can unregister itself */
                                if (pid_vnr(req_pid) != auditd_pid) {
                                        audit_log_config_change("audit_pid",
                                                        new_pid, auditd_pid, 0);
                                        return -EACCES;
                                }
                        }

                        if (new_pid) {
                                /* register a new auditd connection */
                                err = auditd_set(req_pid,
                                                 NETLINK_CB(skb).portid,
                                                 sock_net(NETLINK_CB(skb).sk),
                                                 skb, ack);
                                /* log the attempt whether or not it worked */
                                if (audit_enabled != AUDIT_OFF)
                                        audit_log_config_change("audit_pid",
                                                                new_pid,
                                                                auditd_pid,
                                                                err ? 0 : 1);
                                if (err)
                                        return err;

                                /* try to process any backlog */
                                wake_up_interruptible(&kauditd_wait);
                        } else {
                                if (audit_enabled != AUDIT_OFF)
                                        audit_log_config_change("audit_pid",
                                                                new_pid,
                                                                auditd_pid, 1);

                                /* unregister the auditd connection */
                                auditd_reset(NULL);
                        }
                }
                if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(s.rate_limit);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
                        err = audit_set_backlog_limit(s.backlog_limit);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
                        /* note: compared against nlmsg_len (the length field
                         * of the header), not against data_len */
                        if (sizeof(s) > (size_t)nlh->nlmsg_len)
                                return -EINVAL;
                        /* upper bound: ten times AUDIT_BACKLOG_WAIT_TIME */
                        if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
                                return -EINVAL;
                        err = audit_set_backlog_wait_time(s.backlog_wait_time);
                        if (err < 0)
                                return err;
                }
                /* the two counter resets below require the mask to be
                 * exactly that one flag (==, not &); the old counter value
                 * is logged and returned to the caller */
                if (s.mask == AUDIT_STATUS_LOST) {
                        u32 lost = atomic_xchg(&audit_lost, 0);

                        audit_log_config_change("lost", 0, lost, 1);
                        return lost;
                }
                if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
                        u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);

                        audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
                        return actual;
                }
                break;
        }
        case AUDIT_GET_FEATURE:
                err = audit_get_feature(skb);
                if (err)
                        return err;
                break;
        case AUDIT_SET_FEATURE:
                /* the payload must hold a complete struct audit_features */
                if (data_len < sizeof(struct audit_features))
                        return -EINVAL;
                err = audit_set_feature(data);
                if (err)
                        return err;
                break;
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
                /* user messages are dropped silently while auditing is off,
                 * except for AUDIT_USER_AVC */
                if (!audit_enabled && msg_type != AUDIT_USER_AVC)
                        return 0;
                /* exit early if there isn't at least one character to print */
                if (data_len < 2)
                        return -EINVAL;

                err = audit_filter(msg_type, AUDIT_FILTER_USER);
                if (err == 1) { /* match or error */
                        char *str = data;

                        err = 0;
                        if (msg_type == AUDIT_USER_TTY) {
                                err = tty_audit_push();
                                if (err)
                                        break;
                        }
                        /* ab may come back NULL; the audit_log_*() calls
                         * below are still made unconditionally */
                        audit_log_user_recv_msg(&ab, msg_type);
                        if (msg_type != AUDIT_USER_TTY) {
                                /* ensure NULL termination */
                                /* this overwrites the last payload byte in place */
                                str[data_len - 1] = '\0';
                                audit_log_format(ab, " msg='%.*s'",
                                                 AUDIT_MESSAGE_TEXT_MAX,
                                                 str);
                        } else {
                                audit_log_format(ab, " data=");
                                /* do not log a trailing NUL as part of the data */
                                if (data_len > 0 && str[data_len - 1] == '\0')
                                        data_len--;
                                audit_log_n_untrustedstring(ab, str, data_len);
                        }
                        audit_log_end(ab);
                }
                break;
        case AUDIT_ADD_RULE:
        case AUDIT_DEL_RULE:
                if (data_len < sizeof(struct audit_rule_data))
                        return -EINVAL;
                /* once the configuration is locked, rule changes are refused
                 * and the attempt is recorded with res=0 */
                if (audit_enabled == AUDIT_LOCKED) {
                        audit_log_common_recv_msg(audit_context(), &ab,
                                                  AUDIT_CONFIG_CHANGE);
                        audit_log_format(ab, " op=%s audit_enabled=%d res=0",
                                         msg_type == AUDIT_ADD_RULE ?
                                                "add_rule" : "remove_rule",
                                         audit_enabled);
                        audit_log_end(ab);
                        return -EPERM;
                }
                err = audit_rule_change(msg_type, seq, data, data_len);
                break;
        case AUDIT_LIST_RULES:
                err = audit_list_rules_send(skb, seq);
                break;
        case AUDIT_TRIM:
                audit_trim_trees();
                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=trim res=1");
                audit_log_end(ab);
                break;
        case AUDIT_MAKE_EQUIV: {
                /* payload layout as parsed here: two u32 sizes followed by
                 * the "old" and "new" strings of those sizes */
                void *bufp = data;
                u32 sizes[2];
                size_t msglen = data_len;
                char *old, *new;

                err = -EINVAL;
                if (msglen < 2 * sizeof(u32))
                        break;
                memcpy(sizes, bufp, 2 * sizeof(u32));
                bufp += 2 * sizeof(u32);
                msglen -= 2 * sizeof(u32);
                old = audit_unpack_string(&bufp, &msglen, sizes[0]);
                if (IS_ERR(old)) {
                        err = PTR_ERR(old);
                        break;
                }
                new = audit_unpack_string(&bufp, &msglen, sizes[1]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        kfree(old);
                        break;
                }
                /* OK, here comes... */
                err = audit_tag_tree(old, new);

                /* the outcome is logged (res=1 when err is 0) before the
                 * unpacked strings are freed */
                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=make_equiv old=");
                audit_log_untrustedstring(ab, old);
                audit_log_format(ab, " new=");
                audit_log_untrustedstring(ab, new);
                audit_log_format(ab, " res=%d", !err);
                audit_log_end(ab);
                kfree(old);
                kfree(new);
                break;
        }
        case AUDIT_SIGNAL_INFO:
                /* reply is a struct audit_sig_info followed by len bytes of
                 * security context; len stays 0 when audit_sig_sid is unset */
                len = 0;
                if (audit_sig_sid) {
                        err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
                        if (err)
                                return err;
                }
                sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
                if (!sig_data) {
                        /* do not leak the security context on this error path */
                        if (audit_sig_sid)
                                security_release_secctx(ctx, len);
                        return -ENOMEM;
                }
                sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
                sig_data->pid = audit_sig_pid;
                if (audit_sig_sid) {
                        memcpy(sig_data->ctx, ctx, len);
                        security_release_secctx(ctx, len);
                }
                audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
                                 sig_data, sizeof(*sig_data) + len);
                kfree(sig_data);
                break;
        case AUDIT_TTY_GET: {
                /* report the TTY auditing flags of the calling task's
                 * signal struct */
                struct audit_tty_status s;
                unsigned int t;

                t = READ_ONCE(current->signal->audit_tty);
                s.enabled = t & AUDIT_TTY_ENABLE;
                s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);

                audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
                break;
        }
        case AUDIT_TTY_SET: {
                struct audit_tty_status s, old;
                /* NOTE: this declaration shadows the function-level ab */
                struct audit_buffer        *ab;
                unsigned int t;

                memset(&s, 0, sizeof(s));
                /* guard against past and future API changes */
                memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
                /* check if new data is valid */
                if ((s.enabled != 0 && s.enabled != 1) ||
                    (s.log_passwd != 0 && s.log_passwd != 1))
                        err = -EINVAL;

                /* err is still 0 from audit_netlink_ok() unless the
                 * validation above failed; on failure only read the old
                 * value so it can be logged, otherwise swap in the new one */
                if (err)
                        t = READ_ONCE(current->signal->audit_tty);
                else {
                        /* log_passwd is 0 or 1 here, so -log_passwd is
                         * either 0 or all ones and selects the flag bit */
                        t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
                        t = xchg(&current->signal->audit_tty, t);
                }
                old.enabled = t & AUDIT_TTY_ENABLE;
                old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);

                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
                                 " old-log_passwd=%d new-log_passwd=%d res=%d",
                                 old.enabled, s.enabled, old.log_passwd,
                                 s.log_passwd, !err);
                audit_log_end(ab);
                break;
        }
        default:
                err = -EINVAL;
                break;
        }

        /* any non-negative value left in err is reported as success (0) */
        return err < 0 ? err : 0;
}

/**
 * audit_receive - receive messages from a netlink control socket
 * @skb: the message buffer
 *
 * Parse the provided skb and deal with any messages that may be present,
 * malformed skbs are discarded.
 */
static void audit_receive(struct sk_buff *skb)
{
        struct nlmsghdr *nlh;
        bool ack;
        /*
         * len MUST be signed for nlmsg_next to be able to dec it below 0
         * if the nlmsg_len was not aligned
         */
        int len;
        int err;

        nlh = nlmsg_hdr(skb);
        len = skb->len;

        audit_ctl_lock();
        while (nlmsg_ok(nlh, len)) {
                ack = nlh->nlmsg_flags & NLM_F_ACK;
                err = audit_receive_msg(skb, nlh, &ack);

                /* send an ack if the user asked for one and audit_receive_msg
                 * didn't already do it, or if there was an error. */
                if (ack || err)
                        netlink_ack(skb, nlh, err, NULL);

                nlh = nlmsg_next(nlh, &len);
        }
        audit_ctl_unlock();

        /* can't block with the ctrl lock, so penalize the sender now */
        if (audit_backlog_limit &&
            (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
                DECLARE_WAITQUEUE(wait, current);

                /* wake kauditd to try and flush the queue */
                wake_up_interruptible(&kauditd_wait);

                add_wait_queue_exclusive(&audit_backlog_wait, &wait);
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(audit_backlog_wait_time);
                remove_wait_queue(&audit_backlog_wait, &wait);
        }
}

/*
 * Emit an AUDIT_EVENT_LISTENER record describing the current task's
 * operation @op ("connect"/"disconnect" in this file) on multicast group
 * @group.  The res= field is 1 when @err is 0 and 0 otherwise.  Nothing is
 * logged while auditing is disabled.
 */
static void audit_log_multicast(int group, const char *op, int err)
{
        char task_comm[sizeof(current->comm)];
        const struct cred *cred;
        struct tty_struct *tty;
        struct audit_buffer *buf;

        if (!audit_enabled)
                return;

        buf = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
        if (!buf)
                return;

        cred = current_cred();
        tty = audit_get_tty();
        audit_log_format(buf, "pid=%u uid=%u auid=%u tty=%s ses=%u",
                         task_pid_nr(current),
                         from_kuid(&init_user_ns, cred->uid),
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
                         tty ? tty_name(tty) : "(none)",
                         audit_get_sessionid(current));
        audit_put_tty(tty);

        audit_log_task_context(buf); /* subj= */
        audit_log_format(buf, " comm=");
        audit_log_untrustedstring(buf, get_task_comm(task_comm, current));
        audit_log_d_path_exe(buf, current->mm); /* exe= */
        audit_log_format(buf, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
        audit_log_end(buf);
}

/*
 * Bind hook for the audit netlink socket, run on multicast group connect
 * or bind requests.  The caller needs CAP_AUDIT_READ; the attempt is logged
 * either way.
 *
 * Returns 0 when permitted, -EPERM otherwise.
 */
static int audit_multicast_bind(struct net *net, int group)
{
        int err = capable(CAP_AUDIT_READ) ? 0 : -EPERM;

        audit_log_multicast(group, "connect", err);
        return err;
}

/* Unbind hook for the audit netlink socket: log the disconnect as a success. */
static void audit_multicast_unbind(struct net *net, int group)
{
        const int no_error = 0;

        audit_log_multicast(group, "disconnect", no_error);
}

/*
 * Per-network-namespace init: create the NETLINK_AUDIT kernel socket for
 * @net and store it in the namespace's struct audit_net.
 *
 * Returns 0 on success, or -ENOMEM (after calling audit_panic()) when the
 * socket cannot be created.
 */
static int __net_init audit_net_init(struct net *net)
{
        struct audit_net *aunet = net_generic(net, audit_net_id);
        struct netlink_kernel_cfg cfg = {
                .groups        = AUDIT_NLGRP_MAX,
                .flags        = NL_CFG_F_NONROOT_RECV,
                .input        = audit_receive,
                .bind        = audit_multicast_bind,
                .unbind        = audit_multicast_unbind,
        };

        aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
        if (!aunet->sk) {
                audit_panic("cannot initialize netlink socket in namespace");
                return -ENOMEM;
        }

        /* keep sends short in case auditd is blocked/stopped */
        aunet->sk->sk_sndtimeo = HZ / 10;
        return 0;
}

/* Per-network-namespace teardown: release the audit netlink socket of @net. */
static void __net_exit audit_net_exit(struct net *net)
{
        struct audit_net *aunet;

        /* NOTE: it may look as if the auditd connection should be checked
         * (and possibly reset) here when it lives in this namespace.  That
         * is not needed: the auditd connection tracking struct holds a
         * reference to the namespace (see auditd_set()), so this function
         * only runs once that connection has already been released */

        aunet = net_generic(net, audit_net_id);
        netlink_kernel_release(aunet->sk);
}

/*
 * Per-network-namespace operations for audit: each namespace gets a
 * struct audit_net, set up by audit_net_init() and torn down by
 * audit_net_exit().
 */
static struct pernet_operations audit_net_ops __net_initdata = {
        .id = &audit_net_id,
        .size = sizeof(struct audit_net),
        .init = audit_net_init,
        .exit = audit_net_exit,
};

/*
 * Boot-time initialization of the audit subsystem (postcore initcall).
 *
 * Sets up the audit_buffer slab cache, the three skb queues, the inode hash
 * buckets and the command mutex, registers the per-namespace netlink
 * operations, starts the kauditd thread and finally logs an "initialized"
 * record.  Does nothing if auditing was disabled beforehand.  Panics if
 * the kauditd thread cannot be started.  Always returns 0.
 */
static int __init audit_init(void)
{
        int bucket;

        if (audit_initialized == AUDIT_DISABLED)
                return 0;

        audit_buffer_cache = kmem_cache_create("audit_buffer",
                                               sizeof(struct audit_buffer),
                                               0, SLAB_PANIC, NULL);

        skb_queue_head_init(&audit_queue);
        skb_queue_head_init(&audit_retry_queue);
        skb_queue_head_init(&audit_hold_queue);

        bucket = 0;
        while (bucket < AUDIT_INODE_BUCKETS) {
                INIT_LIST_HEAD(&audit_inode_hash[bucket]);
                bucket++;
        }

        mutex_init(&audit_cmd_mutex.lock);
        audit_cmd_mutex.owner = NULL;

        pr_info("initializing netlink subsys (%s)\n",
                audit_default ? "enabled" : "disabled");
        register_pernet_subsys(&audit_net_ops);

        audit_initialized = AUDIT_INITIALIZED;

        kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
        if (IS_ERR(kauditd_task))
                panic("audit: failed to start the kauditd thread (%d)\n",
                      (int)PTR_ERR(kauditd_task));

        audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
                "state=initialized audit_enabled=%u res=1",
                 audit_enabled);

        return 0;
}
postcore_initcall(audit_init);

/*
 * Handle the "audit=" kernel command-line parameter at boot time.
 *
 * Accepted values are "0"/"off" (case-insensitive) and "1"/"on"
 * (case-insensitive); anything else is reported as an error and treated
 * as "on".  Always returns 1.
 */
static int __init audit_enable(char *str)
{
        int want_off = !strcasecmp(str, "off") || !strcmp(str, "0");
        int want_on = !strcasecmp(str, "on") || !strcmp(str, "1");

        /* an unrecognized value is reported and then falls back to "on" */
        if (!want_off && !want_on)
                pr_err("audit: invalid 'audit' parameter value (%s)\n", str);

        if (want_off)
                audit_default = AUDIT_OFF;
        else
                audit_default = AUDIT_ON;

        if (audit_default == AUDIT_OFF)
                audit_initialized = AUDIT_DISABLED;
        if (audit_set_enabled(audit_default))
                pr_err("audit: error setting audit state (%d)\n",
                       audit_default);

        pr_info("%s\n", audit_default ?
                "enabled (after initialization)" : "disabled (until reboot)");

        return 1;
}
__setup("audit=", audit_enable);

/*
 * Process kernel command-line parameter at boot time.
 * audit_backlog_limit=<n>
 *
 * @str is parsed with kstrtouint() using base 0.  When parsing fails the
 * current value of audit_backlog_limit is left untouched and reported as
 * the default in use; otherwise the parsed value becomes the new limit.
 * The outcome is printed as a continuation of the "audit_backlog_limit: "
 * line.  Always returns 1.
 */
static int __init audit_backlog_limit_set(char *str)
{
        u32 audit_backlog_limit_arg;

        pr_info("audit_backlog_limit: ");
        if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
                pr_cont("using default of %u, unable to parse %s\n",
                        audit_backlog_limit, str);
                return 1;
        }

        audit_backlog_limit = audit_backlog_limit_arg;
        /* the limit is unsigned: print it with %u (as the failure path
         * above already does) so that values above INT_MAX are not shown
         * as negative numbers */
        pr_cont("%u\n", audit_backlog_limit);

        return 1;
}
__setup("audit_backlog_limit=", audit_backlog_limit_set);

/*
 * Release an audit buffer: free its skb and return the buffer itself to
 * audit_buffer_cache.  A NULL @ab is accepted and ignored.
 */
static void audit_buffer_free(struct audit_buffer *ab)
{
        if (ab) {
                kfree_skb(ab->skb);
                kmem_cache_free(audit_buffer_cache, ab);
        }
}

/*
 * Allocate an audit buffer from audit_buffer_cache together with an skb of
 * AUDIT_BUFSIZ payload that already carries a netlink header of type @type.
 * @ctx and @gfp_mask are recorded in the buffer for later use.
 *
 * Returns the new buffer, or NULL if any allocation step fails (nothing is
 * leaked in that case).
 */
static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
                                               gfp_t gfp_mask, int type)
{
        struct audit_buffer *buf = kmem_cache_alloc(audit_buffer_cache, gfp_mask);

        if (!buf)
                return NULL;

        buf->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
        /* nlmsg_put() is only attempted once the skb exists */
        if (!buf->skb || !nlmsg_put(buf->skb, 0, 0, type, 0, 0)) {
                audit_buffer_free(buf);
                return NULL;
        }

        buf->ctx = ctx;
        buf->gfp_mask = gfp_mask;
        return buf;
}

/**
 * audit_serial - compute a serial number for the audit record
 *
 * Hands out the next value of a function-local atomic counter.
 *
 * Audit records are written to user-space as soon as they are generated,
 * which means one complete audit record may go out in several pieces.
 * User-space tools use the record's timestamp together with this serial
 * number to work out which pieces belong to the same record.  The
 * (timestamp,serial) tuple is unique for each syscall and is live from
 * syscall entry to syscall exit.
 *
 * NOTE: An alternative would be to keep the formatted records on the audit
 * context (for records that have one) and emit them all at syscall exit.
 * That, however, could delay the reporting of significant errors until
 * syscall exit (or forever, if the system halts).
 */
unsigned int audit_serial(void)
{
        static atomic_t counter = ATOMIC_INIT(0);

        return atomic_add_return(1, &counter);
}

/*
 * Obtain the timestamp @t and serial number @serial for a new record.
 *
 * When @ctx is set and auditsc_get_stamp() returns non-zero for it, the
 * values it filled in are used as-is.  Otherwise the stamp is taken from
 * the coarse real-time clock and a fresh audit_serial() value.
 */
static inline void audit_get_stamp(struct audit_context *ctx,
                                   struct timespec64 *t, unsigned int *serial)
{
        if (ctx && auditsc_get_stamp(ctx, t, serial))
                return;

        ktime_get_coarse_real_ts64(t);
        *serial = audit_serial();
}

/**
 * audit_log_start - obtain an audit buffer
 * @ctx: audit_context (may be NULL)
 * @gfp_mask: type of allocation
 * @type: audit message type
 *
 * Returns audit_buffer pointer on success or NULL on error.  NULL is
 * returned when audit is not initialized, when the exclude filter rejects
 * @type, when the backlog limit is exceeded and the caller may not (or may
 * no longer) wait, and when the buffer cannot be allocated.
 *
 * Obtain an audit buffer.  This routine does locking to obtain the
 * audit buffer, but then no locking is required for calls to
 * audit_log_*format.  If the task (ctx) is a task that is currently in a
 * syscall, then the syscall is marked as auditable and an audit record
 * will be written at syscall exit.  If there is no associated task, then
 * task context (ctx) should be NULL.
 *
 * On success the buffer already contains the "audit(<sec>.<msec>:<serial>): "
 * prefix.
 */
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                     int type)
{
        struct audit_buffer *ab;
        struct timespec64 t;
        unsigned int serial;

        if (audit_initialized != AUDIT_INITIALIZED)
                return NULL;

        if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
                return NULL;

        /* NOTE: don't ever fail/sleep on these two conditions:
         * 1. auditd generated record - since we need auditd to drain the
         *    queue; also, when we are checking for auditd, compare PIDs using
         *    task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
         *    using a PID anchored in the caller's namespace
         * 2. generator holding the audit_cmd_mutex - we don't want to block
         *    while holding the mutex, although we do penalize the sender
         *    later in audit_receive() when it is safe to block
         */
        if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
                /* remaining wait budget; shrinks with every sleep below */
                long stime = audit_backlog_wait_time;

                while (audit_backlog_limit &&
                       (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
                        /* wake kauditd to try and flush the queue */
                        wake_up_interruptible(&kauditd_wait);

                        /* sleep if we are allowed and we haven't exhausted our
                         * backlog wait limit */
                        if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
                                long rtime = stime;

                                DECLARE_WAITQUEUE(wait, current);

                                add_wait_queue_exclusive(&audit_backlog_wait,
                                                         &wait);
                                set_current_state(TASK_UNINTERRUPTIBLE);
                                stime = schedule_timeout(rtime);
                                /* account the time actually spent waiting */
                                atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
                                remove_wait_queue(&audit_backlog_wait, &wait);
                        } else {
                                /* both values are unsigned, so print them
                                 * with %u rather than %d */
                                if (audit_rate_check() && printk_ratelimit())
                                        pr_warn("audit_backlog=%u > audit_backlog_limit=%u\n",
                                                skb_queue_len(&audit_queue),
                                                audit_backlog_limit);
                                audit_log_lost("backlog limit exceeded");
                                return NULL;
                        }
                }
        }

        ab = audit_buffer_alloc(ctx, gfp_mask, type);
        if (!ab) {
                audit_log_lost("out of memory in audit_log_start");
                return NULL;
        }

        audit_get_stamp(ab->ctx, &t, &serial);
        /* tv_nsec is reduced to milliseconds for the record prefix */
        audit_log_format(ab, "audit(%llu.%03lu:%u): ",
                         (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);

        return ab;
}

/**
 * audit_expand - expand skb in the audit buffer
 * @ab: audit_buffer
 * @extra: space to add at tail of the skb
 *
 * Grows the tail of the buffer's skb by @extra bytes and adjusts the skb's
 * truesize by the tailroom actually gained.
 *
 * Returns 0 (no space) on failed expansion, or available space if
 * successful.
 */
static inline int audit_expand(struct audit_buffer *ab, int extra)
{
        struct sk_buff *skb = ab->skb;
        int room_before = skb_tailroom(skb);
        int room_after;

        if (pskb_expand_head(skb, 0, extra, ab->gfp_mask) < 0) {
                audit_log_lost("out of memory in audit_expand");
                return 0;
        }

        room_after = skb_tailroom(skb);
        skb->truesize += room_after - room_before;
        return room_after;
}

/*
 * Format a message at the tail of the audit buffer's skb.
 *
 * The message is first formatted into whatever tailroom is available.  If
 * vsnprintf() reports that it did not fit, the skb is expanded by at least
 * AUDIT_BUFSIZ bytes (or by the shortfall plus one, if larger) and the
 * message is formatted a second time from a saved copy of the argument
 * list.  A NULL @ab is ignored; when the skb cannot be expanded nothing is
 * appended.
 */
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
                              va_list args)
{
        struct sk_buff *skb;
        va_list retry_args;
        int room, written;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;

        room = skb_tailroom(skb);
        if (room == 0) {
                room = audit_expand(ab, AUDIT_BUFSIZ);
                if (!room)
                        return;
        }

        /* the first vsnprintf() consumes @args, so keep a copy for a retry */
        va_copy(retry_args, args);
        written = vsnprintf(skb_tail_pointer(skb), room, fmt, args);
        if (written >= room) {
                /* The printk buffer is 1024 bytes long, so if we get
                 * here and AUDIT_BUFSIZ is at least 1024, then we can
                 * log everything that printk could have logged. */
                room = audit_expand(ab,
                        max_t(unsigned, AUDIT_BUFSIZ, 1+written-room));
                if (!room) {
                        va_end(retry_args);
                        return;
                }
                written = vsnprintf(skb_tail_pointer(skb), room, fmt,
                                    retry_args);
        }
        if (written > 0)
                skb_put(skb, written);
        va_end(retry_args);
}

/**
 * audit_log_format - append a printf-style message to the audit buffer
 * @ab: audit_buffer (may be NULL, in which case nothing happens)
 * @fmt: format string
 * @...: optional parameters matching @fmt string
 *
 * Collects the variable arguments and hands them to audit_log_vformat(),
 * which does all the work.
 */
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
        va_list ap;

        if (ab) {
                va_start(ap, fmt);
                audit_log_vformat(ab, fmt, ap);
                va_end(ap);
        }
}

/**
 * audit_log_n_hex - append a buffer to the audit skb as upper-case hex
 * @ab: the audit_buffer (may be NULL, in which case nothing happens)
 * @buf: bytes to convert
 * @len: number of bytes of @buf to convert
 *
 * Each input byte becomes two ASCII hex digits at the tail of the skb.
 * No return value; if the skb cannot be expanded to hold the digits,
 * nothing is appended.
 */
void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
                size_t len)
{
        struct sk_buff *skb;
        unsigned char *out;
        int room, need;
        int idx;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;

        room = skb_tailroom(skb);
        need = len<<1;
        if (need >= room) {
                /* request whole AUDIT_BUFSIZ chunks covering the shortfall */
                int chunks = ((need-room)/AUDIT_BUFSIZ) + 1;

                room = audit_expand(ab, AUDIT_BUFSIZ*chunks);
                if (!room)
                        return;
        }

        out = skb_tail_pointer(skb);
        idx = 0;
        while (idx < len) {
                out = hex_byte_pack_upper(out, buf[idx]);
                idx++;
        }
        *out = 0;       /* terminator is written but not counted below */
        skb_put(skb, len << 1); /* two hex digits per input byte */
}

/*
 * Append exactly @slen bytes of @string to the audit buffer, wrapped in
 * double quotes.  A NUL is stored after the closing quote but is not
 * counted in the skb length.  A NULL @ab is ignored; if the skb cannot be
 * expanded nothing is appended.
 */
void audit_log_n_string(struct audit_buffer *ab, const char *string,
                        size_t slen)
{
        struct sk_buff *skb;
        unsigned char *dst;
        int room, need;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;

        room = skb_tailroom(skb);
        need = slen + 3;        /* two quotes plus the NUL terminator */
        if (need > room) {
                room = audit_expand(ab, need);
                if (!room)
                        return;
        }

        dst = skb_tail_pointer(skb);
        dst[0] = '"';
        memcpy(dst + 1, string, slen);
        dst[slen + 1] = '"';
        dst[slen + 2] = 0;
        skb_put(skb, slen + 2);        /* terminator is not part of the record */
}

/**
 * audit_string_contains_control - does a string need to be logged in hex
 * @string: string to be checked
 * @len: number of bytes of @string to examine
 *
 * Returns true if any of the first @len bytes is a double quote, is below
 * 0x21 (control characters and space) or is above 0x7e; false otherwise,
 * including when @len is 0.
 */
bool audit_string_contains_control(const char *string, size_t len)
{
        const unsigned char *bytes = (const unsigned char *)string;
        size_t pos;

        for (pos = 0; pos < len; pos++) {
                unsigned char ch = bytes[pos];

                if (ch < 0x21 || ch > 0x7e || ch == '"')
                        return true;
        }
        return false;
}

/**
 * audit_log_n_untrustedstring - log a string that may contain random characters
 * @ab: audit_buffer
 * @string: string to be logged
 * @len: number of bytes of @string to log (not including trailing null)
 *
 * If audit_string_contains_control() finds a control character,
 * unprintable character, double quote mark or space in the first @len
 * bytes, the string is logged in hex (2 digits per char).  Otherwise it is
 * logged verbatim between double quote marks.
 *
 * The caller specifies the number of characters in the string to log, which may
 * or may not be the entire string.
 */
void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
                                 size_t len)
{
        if (!audit_string_contains_control(string, len)) {
                audit_log_n_string(ab, string, len);
                return;
        }

        audit_log_n_hex(ab, string, len);
}

/**
 * audit_log_untrustedstring - log a string that may contain random characters
 * @ab: audit_buffer
 * @string: NUL-terminated string to be logged
 *
 * Same as audit_log_n_untrustedstring(), except that strlen() is used to
 * determine the string length, so the whole string is logged.
 */
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
        audit_log_n_untrustedstring(ab, string, strlen(string));
}

/*
 * Helper that logs @prefix (when non-NULL) followed by the escaped d_path()
 * of @path.
 *
 * A temporary buffer of PATH_MAX+11 bytes is allocated with the buffer's
 * gfp mask.  If that allocation fails "<no_memory>" is logged in place of
 * the path; if d_path() returns an error "<too_long>" is logged.
 *
 * A NULL @ab is ignored, matching the other audit_log_*() helpers.
 */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
                      const struct path *path)
{
        char *p, *pathname;

        /* ab->gfp_mask is dereferenced below, so a NULL buffer must be
         * rejected here rather than relying on audit_log_format(). */
        if (!ab)
                return;

        if (prefix)
                audit_log_format(ab, "%s", prefix);

        /* We will allow 11 spaces for ' (deleted)' to be appended */
        pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
        if (!pathname) {
                audit_log_format(ab, "\"<no_memory>\"");
                return;
        }
        p = d_path(path, pathname, PATH_MAX+11);
        if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
                /* FIXME: can we save some information here? */
                audit_log_format(ab, "\"<too_long>\"");
        } else
                audit_log_untrustedstring(ab, p);
        kfree(pathname);
}

void audit_log_session_info(struct audit_buffer *ab)
{
        unsigned int sessionid = audit_get_sessionid(current);
        uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));

        audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}

/*
 * Append " key=" followed by @key logged as an untrusted string, or by the
 * literal "(null)" when @key is NULL.
 */
void audit_log_key(struct audit_buffer *ab, char *key)
{
        audit_log_format(ab, " key=");

        if (!key) {
                audit_log_format(ab, "(null)");
                return;
        }

        audit_log_untrustedstring(ab, key);
}

/*
 * Append " subj=<security context>" for the current task.
 *
 * Returns 0 when the context was logged, when the task has no secid, or
 * when security_secid_to_secctx() reports -EINVAL.  Any other error from
 * security_secid_to_secctx() triggers audit_panic() and is returned.
 */
int audit_log_task_context(struct audit_buffer *ab)
{
        char *secctx = NULL;
        unsigned secctx_len;
        u32 sid;
        int rc;

        security_task_getsecid(current, &sid);
        if (!sid)
                return 0;

        rc = security_secid_to_secctx(sid, &secctx, &secctx_len);
        if (rc == -EINVAL)
                return 0;
        if (rc) {
                audit_panic("error in audit_log_task_context");
                return rc;
        }

        audit_log_format(ab, " subj=%s", secctx);
        security_release_secctx(secctx, secctx_len);
        return 0;
}
EXPORT_SYMBOL(audit_log_task_context);

void audit_log_d_path_exe(struct audit_buffer *ab,
                          struct mm_struct *mm)
{
        struct file *exe_file;

        if (!mm)
                goto out_null;

        exe_file = get_mm_exe_file(mm);
        if (!exe_file)
                goto out_null;

        audit_log_d_path(ab, " exe=", &exe_file->f_path);
        fput(exe_file);
        return;
out_null:
        audit_log_format(ab, " exe=(null)");
}

/*
 * Take a reference on the current task's tty, read under the sighand
 * siglock with interrupts saved/disabled.  Returns NULL when
 * current->signal is NULL; otherwise returns whatever tty_kref_get()
 * yields for current->signal->tty.  Pair with audit_put_tty().
 */
struct tty_struct *audit_get_tty(void)
{
        struct tty_struct *ref = NULL;
        unsigned long irqflags;

        spin_lock_irqsave(&current->sighand->siglock, irqflags);
        if (current->signal)
                ref = tty_kref_get(current->signal->tty);
        spin_unlock_irqrestore(&current->sighand->siglock, irqflags);

        return ref;
}

/*
 * Release a tty reference by passing @tty to tty_kref_put(); the
 * counterpart of audit_get_tty().
 */
void audit_put_tty(struct tty_struct *tty)
{
        tty_kref_put(tty);
}

/*
 * Append the standard description of the current task to @ab: parent pid
 * and pid, login uid, the uid/gid families (translated into init_user_ns),
 * tty name, session id, command name, executable path and security
 * context.  A NULL @ab is ignored.
 */
void audit_log_task_info(struct audit_buffer *ab)
{
        char task_name[sizeof(current->comm)];
        const struct cred *cred;
        struct tty_struct *tty;
        const char *tty_label;

        if (!ab)
                return;

        cred = current_cred();
        tty = audit_get_tty();
        tty_label = tty ? tty_name(tty) : "(none)";

        audit_log_format(ab,
                         " ppid=%d pid=%d auid=%u uid=%u gid=%u"
                         " euid=%u suid=%u fsuid=%u"
                         " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
                         task_ppid_nr(current),
                         task_tgid_nr(current),
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
                         from_kuid(&init_user_ns, cred->uid),
                         from_kgid(&init_user_ns, cred->gid),
                         from_kuid(&init_user_ns, cred->euid),
                         from_kuid(&init_user_ns, cred->suid),
                         from_kuid(&init_user_ns, cred->fsuid),
                         from_kgid(&init_user_ns, cred->egid),
                         from_kgid(&init_user_ns, cred->sgid),
                         from_kgid(&init_user_ns, cred->fsgid),
                         tty_label,
                         audit_get_sessionid(current));
        /* the tty name has been formatted; the reference can go */
        audit_put_tty(tty);

        audit_log_format(ab, " comm=");
        audit_log_untrustedstring(ab, get_task_comm(task_name, current));
        audit_log_d_path_exe(ab, current->mm);
        audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);

/**
 * audit_log_path_denied - report a path restriction denial
 * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
 * @operation: specific operation name
 *
 * Emits one record of the form "op=<operation> <task info> res=0".
 * Nothing is logged when auditing is disabled, when the current audit
 * context is a dummy, or when no audit buffer could be started.
 */
void audit_log_path_denied(int type, const char *operation)
{
        struct audit_buffer *ab;

        if (!audit_enabled)
                return;
        if (audit_dummy_context())
                return;

        /* Generate log with subject, operation, outcome. */
        ab = audit_log_start(audit_context(), GFP_KERNEL, type);
        if (ab) {
                audit_log_format(ab, "op=%s", operation);
                audit_log_task_info(ab);
                audit_log_format(ab, " res=0");
                audit_log_end(ab);
        }
}

/*
 * Global counter which is incremented every time something logs in;
 * audit_set_loginuid() draws new session ids from it via
 * atomic_inc_return().
 */
static atomic_t session_id = ATOMIC_INIT(0);

/*
 * Decide whether the current task may change its loginuid to @loginuid.
 *
 * Returns 0 if the change is allowed and -EPERM otherwise.  The checks run
 * in this order: an unset loginuid may always be set; an immutable
 * loginuid may never change; otherwise CAP_AUDIT_CONTROL is required; and
 * with AUDIT_FEATURE_ONLY_UNSET_LOGINUID only an unset (invalid uid) is
 * accepted.
 */
static int audit_set_loginuid_perm(kuid_t loginuid)
{
        /* nothing set yet: no privilege needed */
        if (!audit_loginuid_set(current))
                return 0;

        /*
         * AUDIT_FEATURE_LOGINUID_IMMUTABLE means a change is never allowed;
         * otherwise an already-set loginuid needs CAP_AUDIT_CONTROL.  The
         * capability is only tested when the feature is not set.
         */
        if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE) ||
            !capable(CAP_AUDIT_CONTROL))
                return -EPERM;

        /* only "unset" requests pass when that restriction is enabled */
        if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) &&
            uid_valid(loginuid))
                return -EPERM;

        return 0;
}

/*
 * Emit an AUDIT_LOGIN record describing an attempted loginuid change.
 *
 * @koldloginuid/@kloginuid: login uid before and after (requested) change
 * @oldsessionid/@sessionid: session id before and after (requested) change
 * @rc: result of the attempt; logged as res=1 when @rc is 0, res=0 otherwise
 *
 * Nothing is logged when auditing is disabled or no buffer can be started.
 */
static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
                                   unsigned int oldsessionid,
                                   unsigned int sessionid, int rc)
{
        struct audit_buffer *ab;
        uid_t uid, oldloginuid, loginuid;
        struct tty_struct *tty;

        if (!audit_enabled)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
        if (!ab)
                return;

        /* translate everything into init_user_ns before printing */
        uid = from_kuid(&init_user_ns, task_uid(current));
        oldloginuid = from_kuid(&init_user_ns, koldloginuid);
        loginuid = from_kuid(&init_user_ns, kloginuid);
        tty = audit_get_tty();

        audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
        audit_log_task_context(ab);
        audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
                         oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
                         oldsessionid, sessionid, !rc);
        audit_put_tty(tty);
        audit_log_end(ab);
}

/**
 * audit_set_loginuid - set current task's loginuid
 * @loginuid: loginuid value
 *
 * Returns 0 on success, or the error from audit_set_loginuid_perm()
 * (-EPERM) when the change is not permitted.  The attempt is logged via
 * audit_log_set_loginuid() in both cases.
 *
 * On success a valid @loginuid is given a fresh session id taken from the
 * global counter (skipping AUDIT_SID_UNSET); an invalid @loginuid clears
 * the session id to AUDIT_SID_UNSET.
 *
 * Called (set) from fs/proc/base.c::proc_loginuid_write().
 */
int audit_set_loginuid(kuid_t loginuid)
{
        unsigned int prev_ses, new_ses = AUDIT_SID_UNSET;
        kuid_t prev_uid;
        int rc;

        prev_uid = audit_get_loginuid(current);
        prev_ses = audit_get_sessionid(current);

        rc = audit_set_loginuid_perm(loginuid);
        if (!rc) {
                /* are we setting or clearing? */
                if (uid_valid(loginuid)) {
                        new_ses = (unsigned int)atomic_inc_return(&session_id);
                        /* never hand out the "unset" value as a session id */
                        if (unlikely(new_ses == AUDIT_SID_UNSET))
                                new_ses = (unsigned int)atomic_inc_return(&session_id);
                }

                current->sessionid = new_ses;
                current->loginuid = loginuid;
        }

        audit_log_set_loginuid(prev_uid, loginuid, prev_ses, new_ses, rc);
        return rc;
}

/**
 * audit_signal_info - record signal info for shutting down audit subsystem
 * @sig: signal value
 * @t: task being signaled
 *
 * If @t is the audit daemon and @sig is SIGTERM, SIGHUP, SIGUSR1 or
 * SIGUSR2, remember the pid, uid (the login uid when valid, otherwise the
 * current uid) and secid of the sender.
 *
 * Returns the result of audit_signal_info_syscall() for @t.
 */
int audit_signal_info(int sig, struct task_struct *t)
{
        kuid_t uid = current_uid();
        kuid_t auid;

        if (auditd_test_task(t)) {
                switch (sig) {
                case SIGTERM:
                case SIGHUP:
                case SIGUSR1:
                case SIGUSR2:
                        audit_sig_pid = task_tgid_nr(current);
                        auid = audit_get_loginuid(current);
                        audit_sig_uid = uid_valid(auid) ? auid : uid;
                        security_task_getsecid(current, &audit_sig_sid);
                        break;
                default:
                        break;
                }
        }

        return audit_signal_info_syscall(t);
}

/**
 * audit_log_end - end one audit record
 * @ab: the audit_buffer (may be NULL, in which case nothing happens)
 *
 * We can not do a netlink send inside an irq context because it blocks (last
 * arg, flags, is not set to MSG_DONTWAIT), so the record's skb is placed on
 * audit_queue and the kauditd waiters are woken to deal with it outside the
 * caller's context.  If the rate check fails the record is counted as lost
 * instead.  The audit_buffer itself is freed in either case.
 */
void audit_log_end(struct audit_buffer *ab)
{
        struct sk_buff *skb;
        struct nlmsghdr *nlh;

        if (!ab)
                return;

        if (!audit_rate_check()) {
                audit_log_lost("rate limit exceeded");
        } else {
                /* detach the skb so freeing the buffer below leaves it alone */
                skb = ab->skb;
                ab->skb = NULL;

                /* setup the netlink header, see the comments in
                 * kauditd_send_multicast_skb() for length quirks */
                nlh = nlmsg_hdr(skb);
                nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;

                /* queue the netlink packet and poke the kauditd thread */
                skb_queue_tail(&audit_queue, skb);
                wake_up_interruptible(&kauditd_wait);
        }

        audit_buffer_free(ab);
}

/**
 * audit_log - Log an audit record
 * @ctx: audit context
 * @gfp_mask: type of allocation
 * @type: audit message type
 * @fmt: format string to use
 * @...: variable parameters matching the format string
 *
 * Convenience wrapper: starts a record with audit_log_start(), formats the
 * message with audit_log_vformat() and finishes it with audit_log_end().
 * Nothing is logged when audit_log_start() returns NULL.
 */
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{
        struct audit_buffer *ab = audit_log_start(ctx, gfp_mask, type);
        va_list ap;

        if (!ab)
                return;

        va_start(ap, fmt);
        audit_log_vformat(ab, fmt, ap);
        va_end(ap);

        audit_log_end(ab);
}

/* Export the record start/format/end entry points and audit_log(). */
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
EXPORT_SYMBOL(audit_log);


















































































































































































































































































































































































    4 




    5 




















    3 




















































    2 




















    1 


    1 















    1 














    1 

    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

/*
 * Leveraged for setting/resetting capabilities
 *
 * __cap_empty_set is a constant capability set initialised from
 * CAP_EMPTY_SET; it is exported so code outside this file can refer to it.
 */

const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
EXPORT_SYMBOL(__cap_empty_set);

/* Defaults to 1; cleared by the "no_file_caps" boot option handler below. */
int file_caps_enabled = 1;

/*
 * Handler registered for the "no_file_caps" option: clears
 * file_caps_enabled.  @str is not used.  Always returns 1
 * (NOTE(review): presumably "option handled" per the __setup()
 * convention -- confirm against the __setup() documentation).
 */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

/*
 * Emit a one-time informational message naming the current task as a user
 * of the legacy 32-bit capability interface.  Called by
 * cap_validate_magic() when a version 1 header is seen.
 */
static void warn_legacy_capability_use(void)
{
        /* scratch buffer sized to hold the task's command name */
        char name[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     get_task_comm(name, current));
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 *
 * warn_deprecated_v2() emits that warning once, naming the current task;
 * cap_validate_magic() calls it when a version 2 header is seen.
 */

static void warn_deprecated_v2(void)
{
        /* scratch buffer sized to hold the task's command name */
        char name[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     get_task_comm(name, current));
}

/*
 * Version check.  Reads the version field from the user-space @header and
 * stores the number of u32s in each capability flag array for that version
 * in *@tocopy.
 *
 * Returns 0 for a recognised version, -EFAULT if the header cannot be
 * accessed, or -EINVAL for an unknown version (after writing
 * _KERNEL_CAPABILITY_VERSION back into the header).
 */
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 version;

        if (get_user(version, &header->version))
                return -EFAULT;

        if (version == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        if (version == _LINUX_CAPABILITY_VERSION_2 ||
            version == _LINUX_CAPABILITY_VERSION_3) {
                /* v3 is otherwise equivalent to v2; only v2 is warned about */
                if (version == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /* unknown version: tell user space which one the kernel prefers */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;
        return -EINVAL;
}

/*
 * Fetch the effective, inheritable and permitted sets of the task named by
 * @pid through security_capget().
 *
 * The only thing that can change the capabilities of the current
 * process is the current process. As such, we can't be in this code
 * at the same time as we are in the process of setting capabilities
 * in this process. The net result is that we can limit our use of
 * locks to when we are reading the caps of another process: a @pid of 0
 * or of the current task is served directly, any other pid is looked up
 * under rcu_read_lock().
 *
 * Returns the security_capget() result, or -ESRCH when no task has @pid.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        struct task_struct *target;
        int ret;

        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        rcu_read_lock();
        target = find_task_by_vpid(pid);
        ret = target ? security_capget(target, pEp, pIp, pPp) : -ESRCH;
        rcu_read_unlock();

        return ret;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities that are returned
 *
 * A NULL @dataptr turns the call into a version probe: an unknown version
 * (-EINVAL from the version check) is then reported as success, any other
 * result of the version check is returned unchanged.
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy, idx;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        for (idx = 0; idx < tocopy; idx++) {
                kdata[idx].effective = pE.cap[idx];
                kdata[idx].permitted = pP.cap[idx];
                kdata[idx].inheritable = pI.cap[idx];
        }

        /*
         * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
         * we silently drop the upper capabilities here. This
         * has the effect of making older libcap
         * implementations implicitly drop upper capability
         * bits when they perform a: capget/modify/capset
         * sequence.
         *
         * This behavior is considered fail-safe
         * behavior. Upgrading the application to a newer
         * version of libcap will enable access to the newer
         * capabilities.
         *
         * An alternative would be to return an error here
         * (-ERANGE), but that causes legacy applications to
         * unexpectedly fail; the capget/modify/capset aborts
         * before modification is attempted and the application
         * fails.
         */
        if (copy_to_user(dataptr, kdata,
                         tocopy * sizeof(struct __user_cap_data_struct)))
                return -EFAULT;

        return 0;
}

/**
 * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to affect any
 * other process(es) has been deprecated and removed: a pid other than 0 or
 * the caller's own yields -EPERM.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        kernel_cap_t inheritable, permitted, effective;
        unsigned idx, tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* may only affect current now */
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;

        /* never read more from user space than kdata can hold */
        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* words supplied by the caller are copied, the remainder is zeroed */
        for (idx = 0; idx < _KERNEL_CAPABILITY_U32S; idx++) {
                if (idx < tocopy) {
                        effective.cap[idx] = kdata[idx].effective;
                        permitted.cap[idx] = kdata[idx].permitted;
                        inheritable.cap[idx] = kdata[idx].inheritable;
                } else {
                        effective.cap[idx] = 0;
                        permitted.cap[idx] = 0;
                        inheritable.cap[idx] = 0;
                }
        }

        /* strip bits beyond the last valid capability */
        effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true if security_capable() reports that the specified task has
 * the given capability in effect in the specified user namespace, false
 * if not.  The task's credentials are read under rcu_read_lock().
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        int err;

        rcu_read_lock();
        err = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
        rcu_read_unlock();

        return !err;
}

/**
 * has_capability - Does a task have a capability in init_user_ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the initial user namespace, false if not.
 * Shorthand for has_ns_capability() with &init_user_ns.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability(struct task_struct *t, int cap)
{
        return has_ns_capability(t, &init_user_ns, cap);
}
EXPORT_SYMBOL(has_capability);

/**
 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
 * in a specific user ns.
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true if security_capable() reports that the specified task has
 * the given capability in effect in the specified user namespace, false
 * if not.  The check is made with CAP_OPT_NOAUDIT so that no audit
 * message is written for it.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        int err;

        rcu_read_lock();
        err = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
        rcu_read_unlock();

        return !err;
}

/**
 * has_capability_noaudit - Does a task have a capability (unaudited) in the
 * initial user ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to init_user_ns, false if not.  Don't write an
 * audit message for the check.  Shorthand for
 * has_ns_capability_noaudit() with &init_user_ns.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return has_ns_capability_noaudit(t, &init_user_ns, cap);
}

/*
 * Shared implementation of the ns_capable*() family: ask
 * security_capable() whether the current task holds @cap in @ns, passing
 * @opts through.  On success PF_SUPERPRIV is set on the current task and
 * true is returned; otherwise false.  An invalid @cap is reported with
 * pr_crit() and hits BUG().
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        if (unlikely(!cap_valid(cap))) {
                pr_crit("capable() called with invalid cap=%u\n", cap);
                BUG();
        }

        if (security_capable(current_cred(), ns, cap, opts) != 0)
                return false;

        current->flags |= PF_SUPERPRIV;
        return true;
}

/**
 * ns_capable - Determine if the current task has a superior capability in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.  The check is made through
 * ns_capable_common() with CAP_OPT_NONE.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Check whether the current task has a capability in a
 * user ns (unaudited)
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Runs the common capability check with CAP_OPT_NOAUDIT.  Returns true if
 * the current task has @cap available for use in @ns, false if not.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NOAUDIT;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Check whether the current task has a capability in a
 * user ns, while signalling that the check is being made from within a
 * setid or setgroups syscall.
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Runs the common capability check with CAP_OPT_INSETID.  Returns true if
 * the current task has @cap available for use in @ns, false if not.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_INSETID;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Check whether the current task has a capability in the initial
 * user ns
 * @cap: The capability to be tested for
 *
 * Equivalent to ns_capable() with init_user_ns as the namespace.  Returns
 * true if the current task has @cap available for use, false if not.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool capable(int cap)
{
        struct user_namespace *init_ns = &init_user_ns;

        return ns_capable(init_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Determine if the file's opener had a capability in effect
 * @file:  The file we want to check
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * The check is made against the credentials stored in @file->f_cred rather
 * than those of the current task.  Returns true if those credentials have
 * @cap in @ns, false if not.
 *
 * This does not set PF_SUPERPRIV because the caller may not
 * actually be privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        /* A @cap rejected by cap_valid() warns once and is denied. */
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @inode: The inode in question
 *
 * Return true only if both @inode->i_uid and @inode->i_gid have a mapping
 * in @ns.  The gid is not examined when the uid has no mapping.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
{
        if (!kuid_has_mapping(ns, inode->i_uid))
                return false;

        return kgid_has_mapping(ns, inode->i_gid);
}

/**
 * capable_wrt_inode_uidgid - Check ns_capable() and that uid and gid are mapped
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has @cap in its own user namespace (as
 * reported by ns_capable()) and @inode's uid and gid are both mapped into
 * that namespace.
 */
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
        struct user_namespace *user_ns = current_user_ns();

        /* Capability check comes first; on denial the mapping test is skipped. */
        if (!ns_capable(user_ns, cap))
                return false;

        return privileged_wrt_inode_uidgid(user_ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Checks the credentials recorded in @tsk->ptracer_cred, without auditing.
 * Return true if those credentials have CAP_SYS_PTRACE in @ns, or if no
 * tracer credentials are recorded at all; false otherwise.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        const struct cred *tracer_cred;
        bool allowed = true;    /* An absent tracer adds no restrictions */

        rcu_read_lock();
        tracer_cred = rcu_dereference(tsk->ptracer_cred);
        if (tracer_cred)
                allowed = security_capable(tracer_cred, ns, CAP_SYS_PTRACE,
                                           CAP_OPT_NOAUDIT) == 0;
        rcu_read_unlock();

        return allowed;
}



















































































































































    1 








    1 

    1 


    1 














































































































































































































































    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *                Changes to use preallocated sigqueue structures
 *                to allow signals to be sent reliably.
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/tracehook.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>

#include <asm/param.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>

/*
 * SLAB caches for signal bits.
 */

static struct kmem_cache *sigqueue_cachep;

int print_fatal_signals __read_mostly;

/* Look up the handler that @t currently has installed for signal @sig. */
static void __user *sig_handler(struct task_struct *t, int sig)
{
        /* Signal numbers are 1-based, the action table is 0-based. */
        struct k_sigaction *ka = &t->sighand->action[sig - 1];

        return ka->sa.sa_handler;
}

/*
 * Report whether @handler causes @sig to be discarded, either because the
 * handler is SIG_IGN or because it is SIG_DFL and sig_kernel_ignore(@sig)
 * holds.
 */
static inline bool sig_handler_ignored(void __user *handler, int sig)
{
        /* Explicitly ignored. */
        if (handler == SIG_IGN)
                return true;

        /* Implicitly ignored through the default action. */
        return handler == SIG_DFL && sig_kernel_ignore(sig);
}

/*
 * Decide whether @sig is dropped for @t based on the handler it currently
 * has installed and on task-specific protections.  @force is passed by the
 * caller and relaxes the SIGNAL_UNKILLABLE and kthread checks below.
 */
static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
        void __user *handler = sig_handler(t, sig);

        /* SIGKILL and SIGSTOP may not be sent to the global init */
        if (unlikely(is_global_init(t) && sig_kernel_only(sig)))
                return true;

        /*
         * A SIGNAL_UNKILLABLE task with the default handler drops the
         * signal unless it is a forced kernel-only signal.
         */
        if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
            handler == SIG_DFL) {
                if (!force || !sig_kernel_only(sig))
                        return true;
        }

        /* Only allow kernel generated signals to this kthread */
        if (unlikely((t->flags & PF_KTHREAD) &&
                     (handler == SIG_KTHREAD_KERNEL) && !force))
                return true;

        return sig_handler_ignored(handler, sig);
}

/*
 * Report whether @sig sent to @t can be discarded right away.  @force is
 * handed through to sig_task_ignored().
 */
static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
        /*
         * A blocked signal is never treated as ignored: the handler may
         * have changed by the time the signal gets unblocked.
         */
        if (sigismember(&t->blocked, sig))
                return false;
        if (sigismember(&t->real_blocked, sig))
                return false;

        /*
         * Tracers may want to know about even ignored signal unless it
         * is SIGKILL which can't be reported anyway but can be ignored
         * by SIGNAL_UNKILLABLE task.
         */
        if (sig != SIGKILL && t->ptrace)
                return false;

        return sig_task_ignored(t, sig, force);
}

/*
 * Re-calculate pending state from the set of locally pending
 * signals, globally pending signals, and blocked signals.
 *
 * Returns true when at least one bit is set in @signal that is not also
 * set in @blocked.  The switch on _NSIG_WORDS selects an unrolled variant
 * for the common word counts and falls back to a loop otherwise.
 */
static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
        unsigned long deliverable;
        long word;

        switch (_NSIG_WORDS) {
        default:
                deliverable = 0;
                for (word = _NSIG_WORDS - 1; word >= 0; word--)
                        deliverable |= signal->sig[word] & ~blocked->sig[word];
                break;

        case 4:
                deliverable  = signal->sig[3] & ~blocked->sig[3];
                deliverable |= signal->sig[2] & ~blocked->sig[2];
                deliverable |= signal->sig[1] & ~blocked->sig[1];
                deliverable |= signal->sig[0] & ~blocked->sig[0];
                break;

        case 2:
                deliverable  = signal->sig[1] & ~blocked->sig[1];
                deliverable |= signal->sig[0] & ~blocked->sig[0];
                break;

        case 1:
                deliverable  = signal->sig[0] & ~blocked->sig[0];
                break;
        }

        return deliverable != 0;
}

/* True if sigpending @p holds a signal that is not blocked by set @b. */
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))

/*
 * Set TIF_SIGPENDING on @t if it has any reason to enter signal handling:
 * pending jobctl work, an unblocked private or shared signal, or a frozen
 * cgroup.  Returns true if the flag was set, false if nothing was found.
 */
static bool recalc_sigpending_tsk(struct task_struct *t)
{
        bool work_pending;

        work_pending = (t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
                       PENDING(&t->pending, &t->blocked) ||
                       PENDING(&t->signal->shared_pending, &t->blocked) ||
                       cgroup_task_frozen(t);

        if (!work_pending) {
                /*
                 * We must never clear the flag in another thread, or in
                 * current when it's possible the current syscall is
                 * returning -ERESTART*.  So the flag is left alone here;
                 * only callers who know it is safe clear it.
                 */
                return false;
        }

        set_tsk_thread_flag(t, TIF_SIGPENDING);
        return true;
}

/*
 * Recalculate TIF_SIGPENDING for @t and, if it ended up set, make sure the
 * task wakes up.  When called on current the wakeup is superfluous but a
 * harmless no-op.
 */
void recalc_sigpending_and_wake(struct task_struct *t)
{
        if (!recalc_sigpending_tsk(t))
                return;

        signal_wake_up(t, 0);
}

/*
 * Recalculate TIF_SIGPENDING for current.  Unlike recalc_sigpending_tsk()
 * this also clears the flag when nothing is pending, provided neither the
 * freezer nor a pending livepatch transition still needs it.
 */
void recalc_sigpending(void)
{
        if (recalc_sigpending_tsk(current))
                return;

        /* Other users of TIF_SIGPENDING still need the flag. */
        if (freezing(current) || klp_patch_pending(current))
                return;

        clear_thread_flag(TIF_SIGPENDING);
}
EXPORT_SYMBOL(recalc_sigpending);

/*
 * Re-evaluate TIF_SIGPENDING for current while holding its siglock.
 * The flag is first set unconditionally and then recalculated, so
 * recalc_sigpending() is what decides whether it stays set.
 */
void calculate_sigpending(void)
{
        /* Have any signals or users of TIF_SIGPENDING been delayed
         * until after fork?
         */
        spin_lock_irq(&current->sighand->siglock);
        /* Assume something is pending; recalc_sigpending() may clear it. */
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
}

/* Signals that are dequeued ahead of everything else in the first word. */
#define SYNCHRONOUS_MASK \
        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
         sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))

/*
 * Given the mask, find the first available signal that should be serviced.
 *
 * Returns the number of the lowest signal that is set in @pending and not
 * set in @mask, preferring the SYNCHRONOUS_MASK signals when any of them
 * is deliverable.  Returns 0 when no signal is deliverable.
 */
int next_signal(struct sigpending *pending, sigset_t *mask)
{
        unsigned long *pend = pending->signal.sig;
        unsigned long *blk = mask->sig;
        unsigned long word, ready;

        /*
         * Handle the first word specially: it contains the
         * synchronous signals that need to be dequeued first.
         */
        ready = pend[0] & ~blk[0];
        if (ready) {
                if (ready & SYNCHRONOUS_MASK)
                        ready &= SYNCHRONOUS_MASK;
                return ffz(~ready) + 1;
        }

        switch (_NSIG_WORDS) {
        default:
                for (word = 1; word < _NSIG_WORDS; ++word) {
                        ready = pend[word] & ~blk[word];
                        if (ready)
                                return ffz(~ready) + word * _NSIG_BPW + 1;
                }
                break;

        case 2:
                ready = pend[1] & ~blk[1];
                if (ready)
                        return ffz(~ready) + _NSIG_BPW + 1;
                break;

        case 1:
                /* Nothing to do */
                break;
        }

        return 0;
}

/*
 * Log that signal @sig was dropped because RLIMIT_SIGPENDING was reached.
 * Nothing is printed unless print_fatal_signals is set, and the message
 * is rate limited (interval 5 * HZ, burst 10).
 */
static inline void print_dropped_signal(int sig)
{
        static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

        if (print_fatal_signals && __ratelimit(&ratelimit_state))
                pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
                        current->comm, current->pid, sig);
}

/**
 * task_set_jobctl_pending - set jobctl pending bits
 * @task: target task
 * @mask: pending bits to set
 *
 * Set @mask in @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
 * %JOBCTL_TRAPPING.  If stop signo is being set, the existing signo is
 * cleared.  If @task is already being killed or exiting, this function
 * becomes noop.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if @mask is set, %false if made noop because @task was dying.
 */
bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        /* Reject bits outside the set this function is allowed to touch. */
        BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
                        JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
        /* JOBCTL_TRAPPING may only be set together with a pending bit. */
        BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));

        if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
                return false;

        /* A new stop signo replaces the old one instead of being OR-ed in. */
        if (mask & JOBCTL_STOP_SIGMASK)
                task->jobctl &= ~JOBCTL_STOP_SIGMASK;

        task->jobctl |= mask;
        return true;
}

/**
 * task_clear_jobctl_trapping - clear jobctl trapping bit
 * @task: target task
 *
 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
 * Clear it and wake up the ptracer.  Note that we don't need any further
 * locking.  @task->siglock guarantees that @task->parent points to the
 * ptracer.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_trapping(struct task_struct *task)
{
        /* Common case: nobody is waiting on the trapping bit. */
        if (likely(!(task->jobctl & JOBCTL_TRAPPING)))
                return;

        task->jobctl &= ~JOBCTL_TRAPPING;
        /* Barrier advised by wake_up_bit() between the clear and the wake. */
        smp_mb();
        wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}

/**
 * task_clear_jobctl_pending - clear jobctl pending bits
 * @task: target task
 * @mask: pending bits to clear
 *
 * Clear @mask from @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK.  If %JOBCTL_STOP_PENDING is being cleared, other
 * STOP bits are cleared together.
 *
 * If clearing of @mask leaves no stop or trap pending, this function calls
 * task_clear_jobctl_trapping().
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        unsigned long clear = mask;

        BUG_ON(mask & ~JOBCTL_PENDING_MASK);

        /* Dropping a pending stop also drops its companion STOP bits. */
        if (clear & JOBCTL_STOP_PENDING)
                clear |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;

        task->jobctl &= ~clear;

        /* Something is still pending: leave the trapping bit alone. */
        if (task->jobctl & JOBCTL_PENDING_MASK)
                return;

        task_clear_jobctl_trapping(task);
}

/**
 * task_participate_group_stop - participate in a group stop
 * @task: task participating in a group stop
 *
 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
 * Group stop states are cleared and the group stop count is consumed if
 * %JOBCTL_STOP_CONSUME was set.  If the consumption completes the group
 * stop, %SIGNAL_STOP_STOPPED is set on the signal struct.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if group stop completion should be notified to the parent, %false
 * otherwise.
 */
static bool task_participate_group_stop(struct task_struct *task)
{
        struct signal_struct *signal = task->signal;
        /* Sample this first: clearing STOP_PENDING wipes STOP_CONSUME too. */
        bool was_counted = task->jobctl & JOBCTL_STOP_CONSUME;

        WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));

        task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);

        if (!was_counted)
                return false;

        if (!WARN_ON_ONCE(signal->group_stop_count == 0))
                signal->group_stop_count--;

        /*
         * Tell the caller to notify completion iff we are entering into a
         * fresh group stop.  Read comment in do_signal_stop() for details.
         */
        if (signal->group_stop_count || (signal->flags & SIGNAL_STOP_STOPPED))
                return false;

        signal_set_stop_flags(signal, SIGNAL_STOP_STOPPED);
        return true;
}

/*
 * Make @task take part in the group stop state of current's thread group,
 * if there is one.  The stop signo is inherited from current->jobctl.
 * Nothing is done when no stop is in progress and the group is not
 * already stopped.
 */
void task_join_group_stop(struct task_struct *task)
{
        struct signal_struct *sig = current->signal;
        unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;

        if (!sig->group_stop_count) {
                /* No stop in progress and the group is not stopped either. */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        return;
        } else {
                /* Stop in progress: @task is counted as one more participant. */
                sig->group_stop_count++;
                mask |= JOBCTL_STOP_CONSUME;
        }

        /* Have the new thread join an on-going signal group stop */
        task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
}

/*
 * allocate a new signal queue record
 * - this may be called without locks if and only if t == current, otherwise an
 *   appropriate lock must be held to stop the target task from exiting
 *
 * @sig is only used for the "dropped signal" diagnostic.  @flags are the
 * allocation flags handed to kmem_cache_alloc().  A non-zero
 * @override_rlimit skips the RLIMIT_SIGPENDING check.
 *
 * Returns the new record with its list head, flags and user initialised,
 * or NULL if the limit was exceeded or the allocation failed.
 */
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
{
        struct sigqueue *q = NULL;
        struct user_struct *user;
        int sigpending;

        /*
         * Protect access to @t credentials. This can go away when all
         * callers hold rcu read lock.
         *
         * NOTE! A pending signal will hold on to the user refcount,
         * and we get/put the refcount only when the sigpending count
         * changes from/to zero.
         */
        rcu_read_lock();
        user = __task_cred(t)->user;
        sigpending = atomic_inc_return(&user->sigpending);
        if (sigpending == 1)
                get_uid(user);
        rcu_read_unlock();

        /* The count was already charged above; it is undone below on failure. */
        if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
                q = kmem_cache_alloc(sigqueue_cachep, flags);
        } else {
                print_dropped_signal(sig);
        }

        if (unlikely(q == NULL)) {
                /* Over the limit or out of memory: give back the charge. */
                if (atomic_dec_and_test(&user->sigpending))
                        free_uid(user);
        } else {
                INIT_LIST_HEAD(&q->list);
                q->flags = 0;
                q->user = user;
        }

        return q;
}

static void __sigqueue_free(struct sigqueue *q)
{
        if (q->flags & SIGQUEUE_PREALLOC)
                return;
        if (atomic_dec_and_test(&q->user->sigpending))
                free_uid(q->user);
        kmem_cache_free(sigqueue_cachep, q);
}

void flush_sigqueue(struct sigpending *queue)
{
        struct sigqueue *q;

        sigemptyset(&queue->signal);
        while (!list_empty(&queue->list)) {
                q = list_entry(queue->list.next, struct sigqueue , list);
                list_del_init(&q->list);
                __sigqueue_free(q);
        }
}

/*
 * Flush all pending signals for this kthread.
 *
 * Under @t's siglock, clears TIF_SIGPENDING on @t and empties both its
 * private pending queue and the shared pending queue of its signal struct.
 */
void flush_signals(struct task_struct *t)
{
        unsigned long flags;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        clear_tsk_thread_flag(t, TIF_SIGPENDING);
        flush_sigqueue(&t->pending);
        flush_sigqueue(&t->signal->shared_pending);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);

#ifdef CONFIG_POSIX_TIMERS
/*
 * Drop every queued record in @pending whose si_code is SI_TIMER and
 * rebuild the pending set accordingly.  A signal's pending bit is cleared
 * when a SI_TIMER record for it is dropped, but stays (or is restored)
 * when another queued record for the same signal is retained.
 */
static void __flush_itimer_signals(struct sigpending *pending)
{
        sigset_t remaining = pending->signal;
        sigset_t retain;
        struct sigqueue *q, *n;

        sigemptyset(&retain);

        list_for_each_entry_safe(q, n, &pending->list, list) {
                int sig = q->info.si_signo;

                if (unlikely(q->info.si_code == SI_TIMER)) {
                        /* Timer generated: drop the record and its bit. */
                        sigdelset(&remaining, sig);
                        list_del_init(&q->list);
                        __sigqueue_free(q);
                        continue;
                }

                /* Anything else keeps its signal pending. */
                sigaddset(&retain, sig);
        }

        sigorsets(&pending->signal, &remaining, &retain);
}

/*
 * Remove SI_TIMER generated records from both the private and the shared
 * pending queues of current, holding current's siglock for the duration.
 */
void flush_itimer_signals(void)
{
        struct task_struct *tsk = current;
        unsigned long flags;

        spin_lock_irqsave(&tsk->sighand->siglock, flags);
        __flush_itimer_signals(&tsk->pending);
        __flush_itimer_signals(&tsk->signal->shared_pending);
        spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}
#endif

void ignore_signals(struct task_struct *t)
{
        int i;

        for (i = 0; i < _NSIG; ++i)
                t->sighand->action[i].sa.sa_handler = SIG_IGN;

        flush_signals(t);
}

/*
 * Flush all handlers for a task.
 *
 * Every entry of @t's action table gets its flags, mask and (where the
 * architecture has one) restorer cleared.  The handler is reset to SIG_DFL
 * unless it is SIG_IGN and @force_default is zero, in which case SIG_IGN
 * is kept.
 */
void
flush_signal_handlers(struct task_struct *t, int force_default)
{
        struct k_sigaction *actions = t->sighand->action;
        int nr;

        for (nr = 0; nr < _NSIG; nr++) {
                struct k_sigaction *ka = &actions[nr];

                if (force_default || ka->sa.sa_handler != SIG_IGN)
                        ka->sa.sa_handler = SIG_DFL;
                ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
                ka->sa.sa_restorer = NULL;
#endif
                sigemptyset(&ka->sa.sa_mask);
        }
}

/*
 * Report whether @sig is unhandled by @tsk: always true for the global
 * init; otherwise true only when the handler is SIG_IGN or SIG_DFL and the
 * task is not being ptraced.
 */
bool unhandled_signal(struct task_struct *tsk, int sig)
{
        void __user *handler;

        if (is_global_init(tsk))
                return true;

        handler = tsk->sighand->action[sig-1].sa.sa_handler;
        if (handler == SIG_IGN || handler == SIG_DFL) {
                /* if ptraced, let the tracer determine */
                return !tsk->ptrace;
        }

        /* A real handler is installed. */
        return false;
}

/*
 * Take signal @sig off pending set @list and fill in @info for it.
 *
 * If a queued record for @sig exists, the first one is unlinked, copied
 * into @info and freed, and *@resched_timer is set according to the
 * record's flags and siginfo.  If no record exists, @info is synthesized
 * as an SI_USER siginfo with zero pid/uid and *@resched_timer is left
 * untouched.
 */
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
                           bool *resched_timer)
{
        struct sigqueue *q, *first = NULL;

        /*
         * Collect the siginfo appropriate to this signal.  Check if
         * there is another siginfo for the same signal.
         */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo == sig) {
                        /* A second record: skip clearing the pending bit. */
                        if (first)
                                goto still_pending;
                        first = q;
                }
        }

        /* At most one record was queued, so the signal is no longer pending. */
        sigdelset(&list->signal, sig);

        if (first) {
still_pending:
                list_del_init(&first->list);
                copy_siginfo(info, &first->info);

                *resched_timer =
                        (first->flags & SIGQUEUE_PREALLOC) &&
                        (info->si_code == SI_TIMER) &&
                        (info->si_sys_private);

                __sigqueue_free(first);
        } else {
                /*
                 * Ok, it wasn't in the queue.  This must be
                 * a fast-pathed signal or we must have been
                 * out of queue space.  So zero out the info.
                 */
                clear_siginfo(info);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = SI_USER;
                info->si_pid = 0;
                info->si_uid = 0;
        }
}

/*
 * Pick the next signal in @pending that is not in @mask and, if there is
 * one, collect its siginfo into @info.  Returns the signal number, or 0
 * when nothing is deliverable (in which case @info and @resched_timer are
 * not touched).
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        kernel_siginfo_t *info, bool *resched_timer)
{
        int signr = next_signal(pending, mask);

        if (!signr)
                return 0;

        collect_signal(signr, pending, info, resched_timer);
        return signr;
}

/*
 * Dequeue the next signal of @tsk that is not in @mask and copy its
 * siginfo into @info.  The private pending queue is tried first, then the
 * shared one.
 *
 * Returns the signal number, or 0 if no signal was dequeued.
 *
 * NOTE(review): the body calls recalc_sigpending() and updates
 * current->jobctl, so it looks like callers are expected to pass
 * tsk == current - confirm against the call sites.
 *
 * All callers have to hold the siglock.
 */
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info)
{
        bool resched_timer = false;
        int signr;

        /* We only dequeue private signals from ourselves, we don't let
         * signalfd steal them
         */
        signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
        if (!signr) {
                signr = __dequeue_signal(&tsk->signal->shared_pending,
                                         mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
                /*
                 * itimer signal ?
                 *
                 * itimers are process shared and we restart periodic
                 * itimers in the signal delivery path to prevent DoS
                 * attacks in the high resolution timer case. This is
                 * compliant with the old way of self-restarting
                 * itimers, as the SIGALRM is a legacy signal and only
                 * queued once. Changing the restart behaviour to
                 * restart the timer in the signal dequeue path is
                 * reducing the timer noise on heavy loaded !highres
                 * systems too.
                 */
                if (unlikely(signr == SIGALRM)) {
                        struct hrtimer *tmr = &tsk->signal->real_timer;

                        /* Re-arm only an idle timer that has an interval. */
                        if (!hrtimer_is_queued(tmr) &&
                            tsk->signal->it_real_incr != 0) {
                                hrtimer_forward(tmr, tmr->base->get_time(),
                                                tsk->signal->it_real_incr);
                                hrtimer_restart(tmr);
                        }
                }
#endif
        }

        /* The pending sets may have changed; refresh TIF_SIGPENDING. */
        recalc_sigpending();
        if (!signr)
                return 0;

        if (unlikely(sig_kernel_stop(signr))) {
                /*
                 * Set a marker that we have dequeued a stop signal.  Our
                 * caller might release the siglock and then the pending
                 * stop signal it is about to process is no longer in the
                 * pending bitmasks, but must still be cleared by a SIGCONT
                 * (and overruled by a SIGKILL).  So those cases clear this
                 * shared flag after we've set it.  Note that this flag may
                 * remain set after the signal we return is ignored or
                 * handled.  That doesn't matter because its only purpose
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
                current->jobctl |= JOBCTL_STOP_DEQUEUED;
        }
#ifdef CONFIG_POSIX_TIMERS
        if (resched_timer) {
                /*
                 * Release the siglock to ensure proper locking order
                 * of timer locks outside of siglocks.  Note, we leave
                 * irqs disabled here, since the posix-timers code is
                 * about to disable them again anyway.
                 */
                spin_unlock(&tsk->sighand->siglock);
                posixtimer_rearm(info);
                spin_lock(&tsk->sighand->siglock);

                /* Don't expose the si_sys_private value to userspace */
                info->si_sys_private = 0;
        }
#endif
        return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);

/*
 * Dequeue the first synchronous signal from current's private pending
 * queue, if any, and copy its siginfo into @info.
 *
 * Only records whose signal is in SYNCHRONOUS_MASK and whose si_code is
 * greater than SI_USER are considered.  Returns the signal number, or 0
 * if no such record is queued.
 */
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
        struct task_struct *tsk = current;
        struct sigpending *pending = &tsk->pending;
        struct sigqueue *q, *sync = NULL;

        /*
         * Might a synchronous signal be in the queue?
         */
        if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
                return 0;

        /*
         * Return the first synchronous signal in the queue.
         */
        list_for_each_entry(q, &pending->list, list) {
                /* Synchronous signals have a positive si_code */
                if ((q->info.si_code > SI_USER) &&
                    (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
                        sync = q;
                        goto next;
                }
        }
        return 0;
next:
        /*
         * Check if there is another siginfo for the same signal.
         */
        list_for_each_entry_continue(q, &pending->list, list) {
                if (q->info.si_signo == sync->info.si_signo)
                        goto still_pending;
        }

        /* It was the only record: the signal is no longer pending. */
        sigdelset(&pending->signal, sync->info.si_signo);
        recalc_sigpending();
still_pending:
        list_del_init(&sync->list);
        copy_siginfo(info, &sync->info);
        __sigqueue_free(sync);
        return info->si_signo;
}

/*
 * Tell a process that it has a new active signal.
 *
 * NOTE! We rely on the previous spin_lock to disable interrupts for us!
 * We can only be called with "siglock" held, and local interrupts must
 * have been disabled when it was acquired.
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
        set_tsk_thread_flag(t, TIF_SIGPENDING);

        /*
         * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
         * case. We don't check t->state here because there is a race with it
         * executing on another processor and just now entering stopped state.
         * By using wake_up_state, we ensure the process will wake up and
         * handle its death signal.
         */
        if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
                return;

        /* Nothing was woken: kick the task instead. */
        kick_process(t);
}

/*
 * Remove signals in mask from the pending set and queue.
 *
 * All callers must be holding the siglock.
 */
static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
        struct sigqueue *q, *n;
        sigset_t hit;

        /* Nothing to do unless a masked signal is actually pending. */
        sigandsets(&hit, mask, &s->signal);
        if (sigisemptyset(&hit))
                return;

        sigandnsets(&s->signal, &s->signal, mask);

        list_for_each_entry_safe(q, n, &s->list, list) {
                if (!sigismember(mask, q->info.si_signo))
                        continue;

                list_del_init(&q->list);
                __sigqueue_free(q);
        }
}

static inline int is_si_special(const struct kernel_siginfo *info)
{
        return info <= SEND_SIG_PRIV;
}

static inline bool si_fromuser(const struct kernel_siginfo *info)
{
        return info == SEND_SIG_NOINFO ||
                (!is_si_special(info) && SI_FROMUSER(info));
}

/*
 * called with RCU read lock from check_kill_permission()
 */
static bool kill_ok_by_cred(struct task_struct *t)
{
        const struct cred *cred = current_cred();
        const struct cred *tcred = __task_cred(t);

        return uid_eq(cred->euid, tcred->suid) ||
               uid_eq(cred->euid, tcred->uid) ||
               uid_eq(cred->uid, tcred->suid) ||
               uid_eq(cred->uid, tcred->uid) ||
               ns_capable(tcred->user_ns, CAP_KILL);
}

/*
 * Check whether current may send @sig, described by @info, to @t.
 * - the caller must hold the RCU read lock
 *
 * Returns 0 if the signal may be sent, -EINVAL for an invalid signal
 * number, -EPERM if the credential check fails, or whatever error the
 * audit or security hooks report.
 */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
                                 struct task_struct *t)
{
        int error;

        if (!valid_signal(sig))
                return -EINVAL;

        /* Signals that are not user generated need no permission check. */
        if (!si_fromuser(info))
                return 0;

        error = audit_signal_info(sig, t); /* Let audit system see the signal */
        if (error)
                return error;

        if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) {
                struct pid *sid;

                /* Without matching credentials only SIGCONT can get through. */
                if (sig != SIGCONT)
                        return -EPERM;

                /*
                 * SIGCONT is allowed within the sender's session.
                 *
                 * We don't return the error if sid == NULL. The
                 * task was unhashed, the caller must notice this.
                 */
                sid = task_session(t);
                if (sid && sid != task_session(current))
                        return -EPERM;
        }

        return security_task_kill(t, info, sig, NULL);
}

/**
 * ptrace_trap_notify - schedule trap to notify ptracer
 * @t: tracee wanting to notify tracer
 *
 * This function schedules sticky ptrace trap which is cleared on the next
 * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
 * ptracer.
 *
 * If @t is running, STOP trap will be taken.  If trapped for STOP and
 * ptracer is listening for events, tracee is woken up so that it can
 * re-trap for the new event.  If trapped otherwise, STOP trap will be
 * eventually taken without returning to userland after the existing traps
 * are finished by PTRACE_CONT.
 *
 * CONTEXT:
 * Must be called with @t->sighand->siglock held.
 */
static void ptrace_trap_notify(struct task_struct *t)
{
        WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
        assert_spin_locked(&t->sighand->siglock);

        task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
        /* The second argument is non-zero only while JOBCTL_LISTENING is set. */
        ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}

/*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
 * time regardless of blocking, ignoring, or handling.  This does the
 * actual continuing for SIGCONT, but not the actual stopping for stop
 * signals. The process stop is done as a signal action for SIG_DFL.
 *
 * @sig is the signal being generated for @p's thread group; @force is
 * handed through to sig_ignored().
 *
 * Returns true if the signal should be actually delivered, otherwise
 * it should be dropped.
 */
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;
        sigset_t flush;

        if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
                /* Coredump without group exit: only SIGKILL gets through. */
                if (!(signal->flags & SIGNAL_GROUP_EXIT))
                        return sig == SIGKILL;
                /*
                 * The process is in the middle of dying, nothing to do.
                 */
        } else if (sig_kernel_stop(sig)) {
                /*
                 * This is a stop signal.  Remove SIGCONT from all queues.
                 */
                siginitset(&flush, sigmask(SIGCONT));
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t)
                        flush_sigqueue_mask(&flush, &t->pending);
        } else if (sig == SIGCONT) {
                unsigned int why;
                /*
                 * Remove all stop signals from all queues, wake all threads.
                 */
                siginitset(&flush, SIG_KERNEL_STOP_MASK);
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t) {
                        flush_sigqueue_mask(&flush, &t->pending);
                        task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
                        /* Seized tracees get a trap notification instead of a wakeup. */
                        if (likely(!(t->ptrace & PT_SEIZED)))
                                wake_up_state(t, __TASK_STOPPED);
                        else
                                ptrace_trap_notify(t);
                }

                /*
                 * Notify the parent with CLD_CONTINUED if we were stopped.
                 *
                 * If we were in the middle of a group stop, we pretend it
                 * was already finished, and then continued. Since SIGCHLD
                 * doesn't queue we report only CLD_STOPPED, as if the next
                 * CLD_CONTINUED was dropped.
                 */
                why = 0;
                if (signal->flags & SIGNAL_STOP_STOPPED)
                        why |= SIGNAL_CLD_CONTINUED;
                else if (signal->group_stop_count)
                        why |= SIGNAL_CLD_STOPPED;

                if (why) {
                        /*
                         * The first thread which returns from do_signal_stop()
                         * will take ->siglock, notice SIGNAL_CLD_MASK, and
                         * notify its parent. See get_signal().
                         */
                        signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
                        signal->group_stop_count = 0;
                        signal->group_exit_code = 0;
                }
        }

        return !sig_ignored(p, sig, force);
}

/*
 * Test if P wants to take SIG.  After we've checked all threads with this,
 * it's equivalent to finding no threads not blocking SIG.  Any threads not
 * blocking SIG were ruled out because they are not running and already
 * have pending signals.  Such threads will dequeue from the shared queue
 * as soon as they're available, so putting the signal on the shared queue
 * will be equivalent to sending it to one such thread.
 */
static inline bool wants_signal(int sig, struct task_struct *p)
{
        /* Blocking or exiting threads never take the signal. */
        if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
                return false;

        /* SIGKILL is taken regardless of the remaining checks. */
        if (sig == SIGKILL)
                return true;

        if (task_is_stopped_or_traced(p))
                return false;

        /* Either running on a CPU right now, or nothing pending yet. */
        if (task_curr(p))
                return true;

        return !task_sigpending(p);
}

/*
 * Pick a thread to wake for a signal that the caller has already made
 * pending, and start a group exit right away when that signal is going
 * to be fatal for the whole thread group.
 *
 * @sig:  signal number
 * @p:    task the signal was sent to
 * @type: with PIDTYPE_PID only @p itself is considered; with any other
 *        value another thread of the group may be chosen
 */
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;

        /*
         * Now find a thread we can wake up to take the signal off the queue.
         *
         * If the main thread wants the signal, it gets first crack.
         * Probably the least surprising to the average bear.
         */
        if (wants_signal(sig, p))
                t = p;
        else if ((type == PIDTYPE_PID) || thread_group_empty(p))
                /*
                 * There is just one thread and it does not need to be woken.
                 * It will dequeue unblocked signals before it runs again.
                 */
                return;
        else {
                /*
                 * Otherwise try to find a suitable thread.
                 * The walk starts at the cached ->curr_target and gives up
                 * once it has come full circle.
                 */
                t = signal->curr_target;
                while (!wants_signal(sig, t)) {
                        t = next_thread(t);
                        if (t == signal->curr_target)
                                /*
                                 * No thread needs to be woken.
                                 * Any eligible threads will see
                                 * the signal in the queue soon.
                                 */
                                return;
                }
                /* Remember the pick so the next search starts from it. */
                signal->curr_target = t;
        }

        /*
         * Found a killable thread.  If the signal will be fatal,
         * then start taking the whole group down immediately.
         *
         * The shortcut is skipped when a group exit is already under way,
         * when the chosen thread has the signal in ->real_blocked, and for
         * a ptraced @p unless the signal is SIGKILL.
         */
        if (sig_fatal(p, sig) &&
            !(signal->flags & SIGNAL_GROUP_EXIT) &&
            !sigismember(&t->real_blocked, sig) &&
            (sig == SIGKILL || !p->ptrace)) {
                /*
                 * This signal will be fatal to the whole group.
                 * Coredump signals do not take the shortcut; they fall
                 * through to the ordinary wake-up below.
                 */
                if (!sig_kernel_coredump(sig)) {
                        /*
                         * Start a group exit and wake everybody up.
                         * This way we don't have other threads
                         * running and doing things after a slower
                         * thread has the fatal signal pending.
                         */
                        signal->flags = SIGNAL_GROUP_EXIT;
                        signal->group_exit_code = sig;
                        signal->group_stop_count = 0;
                        t = p;
                        /*
                         * Every thread gets SIGKILL in its private pending
                         * set, has its pending job control bits cleared and
                         * is woken.
                         */
                        do {
                                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                                sigaddset(&t->pending.signal, SIGKILL);
                                signal_wake_up(t, 1);
                        } while_each_thread(p, t);
                        return;
                }
        }

        /*
         * The signal is already in the shared-pending queue.
         * Tell the chosen thread to wake up and dequeue it.
         */
        signal_wake_up(t, sig == SIGKILL);
        return;
}

static inline bool legacy_queue(struct sigpending *signals, int sig)
{
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}

/*
 * Make @sig pending for @t and let complete_signal() wake a thread.
 *
 * @sig:   signal number
 * @info:  siginfo to attach, or one of the special values
 *         SEND_SIG_NOINFO / SEND_SIG_PRIV
 * @t:     target task; the caller must hold @t->sighand->siglock
 * @type:  PIDTYPE_PID uses @t's private pending set, any other value the
 *         shared pending set of @t->signal
 * @force: handed to prepare_signal()
 *
 * Returns 0 (also when the signal was ignored or was already pending), or
 * -EAGAIN when no sigqueue entry could be allocated for a realtime signal
 * whose @info is not special and has si_code != SI_USER.
 *
 * Every exit path goes through trace_signal_generate() with a result code
 * describing what happened.
 */
static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
                        enum pid_type type, bool force)
{
        struct sigpending *pending;
        struct sigqueue *q;
        int override_rlimit;
        int ret = 0, result;

        assert_spin_locked(&t->sighand->siglock);

        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, force))
                goto ret;

        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        /*
         * Short-circuit ignored signals and support queuing
         * exactly one non-rt signal, so that we can get more
         * detailed information about the cause of the signal.
         */
        result = TRACE_SIGNAL_ALREADY_PENDING;
        if (legacy_queue(pending, sig))
                goto ret;

        result = TRACE_SIGNAL_DELIVERED;
        /*
         * Skip useless siginfo allocation for SIGKILL and kernel threads.
         */
        if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
                goto out_set;

        /*
         * Real-time signals must be queued if sent by sigqueue, or
         * some other real-time mechanism.  It is implementation
         * defined whether kill() does so.  We attempt to do so, on
         * the principle of least surprise, but since kill is not
         * allowed to fail with EAGAIN when low on memory we just
         * make sure at least one signal gets delivered and don't
         * pass on the info struct.
         */
        if (sig < SIGRTMIN)
                override_rlimit = (is_si_special(info) || info->si_code >= 0);
        else
                override_rlimit = 0;

        q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
        if (q) {
                list_add_tail(&q->list, &pending->list);
                /*
                 * The special @info values are told apart by their pointer
                 * value; anything else is a real siginfo that gets copied.
                 */
                switch ((unsigned long) info) {
                case (unsigned long) SEND_SIG_NOINFO:
                        /* Synthesize an SI_USER siginfo naming current as sender. */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_USER;
                        q->info.si_pid = task_tgid_nr_ns(current,
                                                        task_active_pid_ns(t));
                        rcu_read_lock();
                        q->info.si_uid =
                                from_kuid_munged(task_cred_xxx(t, user_ns),
                                                 current_uid());
                        rcu_read_unlock();
                        break;
                case (unsigned long) SEND_SIG_PRIV:
                        /* Synthesize an SI_KERNEL siginfo with pid and uid 0. */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_KERNEL;
                        q->info.si_pid = 0;
                        q->info.si_uid = 0;
                        break;
                default:
                        copy_siginfo(&q->info, info);
                        break;
                }
        } else if (!is_si_special(info) &&
                   sig >= SIGRTMIN && info->si_code != SI_USER) {
                /*
                 * Queue overflow, abort.  We may abort if the
                 * signal was rt and sent by user using something
                 * other than kill().
                 */
                result = TRACE_SIGNAL_OVERFLOW_FAIL;
                ret = -EAGAIN;
                goto ret;
        } else {
                /*
                 * This is a silent loss of information.  We still
                 * send the signal, but the *info bits are lost.
                 */
                result = TRACE_SIGNAL_LOSE_INFO;
        }

out_set:
        /* Mark the signal pending even when no sigqueue entry carries info. */
        signalfd_notify(t, sig);
        sigaddset(&pending->signal, sig);

        /* Let multiprocess signals appear after on-going forks */
        if (type > PIDTYPE_TGID) {
                struct multiprocess_signals *delayed;
                /* Record @sig in every set linked on ->multiprocess. */
                hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
                        sigset_t *signal = &delayed->signal;
                        /* Can't queue both a stop and a continue signal */
                        if (sig == SIGCONT)
                                sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
                        else if (sig_kernel_stop(sig))
                                sigdelset(signal, SIGCONT);
                        sigaddset(signal, sig);
                }
        }

        complete_signal(sig, t, type);
ret:
        /* The fourth argument tells the tracepoint whether the shared set was used. */
        trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
        return ret;
}

/*
 * Report whether the siginfo layout selected by @info's si_signo and
 * si_code is one of the layouts listed as carrying si_pid and si_uid
 * (SIL_KILL, SIL_CHLD, SIL_RT).
 */
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
        switch (siginfo_layout(info->si_signo, info->si_code)) {
        case SIL_KILL:
        case SIL_CHLD:
        case SIL_RT:
                return true;
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_FAULT:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_SYS:
                break;
        }

        return false;
}

static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
                        enum pid_type type)
{
        /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
        bool force = false;

        if (info == SEND_SIG_NOINFO) {
                /* Force if sent from an ancestor pid namespace */
                force = !task_pid_nr_ns(current, task_active_pid_ns(t));
        } else if (info == SEND_SIG_PRIV) {
                /* Don't ignore kernel generated signals */
                force = true;
        } else if (has_si_pid_and_uid(info)) {
                /* SIGKILL and SIGSTOP is special or has ids */
                struct user_namespace *t_user_ns;

                rcu_read_lock();
                t_user_ns = task_cred_xxx(t, user_ns);
                if (current_user_ns() != t_user_ns) {
                        kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
                        info->si_uid = from_kuid_munged(t_user_ns, uid);
                }
                rcu_read_unlock();

                /* A kernel generated signal? */
                force = (info->si_code == SI_KERNEL);

                /* From an ancestor pid namespace? */
                if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
                        info->si_pid = 0;
                        force = true;
                }
        }
        return __send_signal(sig, info, t, type, force);
}

/*
 * Log a fatal signal number together with the register state returned by
 * signal_pt_regs().  On i386 (non-UML) builds up to 16 bytes read from
 * user memory at regs->ip are printed as well.
 */
static void print_fatal_signal(int signr)
{
        struct pt_regs *regs = signal_pt_regs();

        pr_info("potentially unexpected fatal signal %d.\n", signr);

#if defined(__i386__) && !defined(__arch_um__)
        pr_info("code at %08lx: ", regs->ip);
        {
                int off = 0;

                while (off < 16) {
                        unsigned char byte;

                        /* Stop at the first byte that cannot be read. */
                        if (get_user(byte, (unsigned char *)(regs->ip + off)))
                                break;
                        pr_cont("%02x ", byte);
                        off++;
                }
        }
        pr_cont("\n");
#endif
        preempt_disable();
        show_regs(regs);
        preempt_enable();
}

/*
 * Boot parameter handler: parse the option value from @str into
 * print_fatal_signals.  Always returns 1.
 */
static int __init setup_print_fatal_signals(char *str)
{
        get_option(&str, &print_fatal_signals);
        return 1;
}

__setup("print-fatal-signals=", setup_print_fatal_signals);

/*
 * Send @sig with @info to the thread group of @p (PIDTYPE_TGID) through
 * send_signal().  No lock is taken here; returns send_signal()'s result.
 */
int
__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
        return send_signal(sig, info, p, PIDTYPE_TGID);
}

/*
 * Lock @p's sighand and send @sig with @info via send_signal().
 *
 * Returns send_signal()'s result, or -ESRCH when lock_task_sighand()
 * fails.
 */
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
                        enum pid_type type)
{
        unsigned long flags;
        int ret;

        if (!lock_task_sighand(p, &flags))
                return -ESRCH;

        ret = send_signal(sig, info, p, type);
        unlock_task_sighand(p, &flags);

        return ret;
}

/*
 * Force a signal that the process can't ignore: if necessary
 * we unblock the signal and change any SIG_IGN to SIG_DFL.
 *
 * Note: If we unblock the signal, we always reset it to SIG_DFL,
 * since we do not want to have a signal handler that was blocked
 * be invoked when user space had explicitly blocked it.
 *
 * We don't want to have recursive SIGSEGV's etc, for example,
 * that is why we also clear SIGNAL_UNKILLABLE.
 *
 * @info: siginfo to deliver; the signal number is taken from si_signo
 * @t:    target task
 *
 * Takes and releases @t->sighand->siglock itself.  Returns the result of
 * send_signal().
 */
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
{
        unsigned long int flags;
        int ret, blocked, ignored;
        struct k_sigaction *action;
        int sig = info->si_signo;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        /* The action table is indexed from 0, signal numbers start at 1. */
        action = &t->sighand->action[sig-1];
        ignored = action->sa.sa_handler == SIG_IGN;
        blocked = sigismember(&t->blocked, sig);
        if (blocked || ignored) {
                /* Either condition resets the handler to the default action. */
                action->sa.sa_handler = SIG_DFL;
                if (blocked) {
                        sigdelset(&t->blocked, sig);
                        recalc_sigpending_and_wake(t);
                }
        }
        /*
         * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
         * debugging to leave init killable.
         */
        if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
                t->signal->flags &= ~SIGNAL_UNKILLABLE;
        /* PIDTYPE_PID: the signal is aimed at @t itself, not its group. */
        ret = send_signal(sig, info, t, PIDTYPE_PID);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);

        return ret;
}

/*
 * Force the signal described by @info onto the current task.
 * Returns the result of force_sig_info_to_task().
 */
int force_sig_info(struct kernel_siginfo *info)
{
        return force_sig_info_to_task(info, current);
}

/*
 * Nuke all other threads in the group.
 */
int zap_other_threads(struct task_struct *p)
{
        struct task_struct *t = p;
        int nr_others = 0;

        p->signal->group_stop_count = 0;

        /* Visit every thread of the group except @p itself. */
        while_each_thread(p, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                nr_others++;

                /* Already dead threads are counted but not signalled. */
                if (!t->exit_state) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                }
        }

        return nr_others;
}

/*
 * Look up @tsk->sighand and take its ->siglock with interrupts disabled.
 *
 * @tsk:   task whose sighand is wanted
 * @flags: receives the interrupt state saved by spin_lock_irqsave()
 *
 * Returns the sighand_struct with ->siglock held, or NULL, with nothing
 * locked, when @tsk->sighand was NULL at the time it was read.  The lookup
 * is repeated until the pointer read again under the lock still matches.
 */
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
                                           unsigned long *flags)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL))
                        break;

                /*
                 * This sighand can be already freed and even reused, but
                 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
                 * initializes ->siglock: this slab can't go away, it has
                 * the same object type, ->siglock can't be reinitialized.
                 *
                 * We need to ensure that tsk->sighand is still the same
                 * after we take the lock, we can race with de_thread() or
                 * __exit_signal(). In the latter case the next iteration
                 * must see ->sighand == NULL.
                 */
                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == rcu_access_pointer(tsk->sighand)))
                        break;
                /* Lost the race: drop the stale lock and look up again. */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
        }
        rcu_read_unlock();

        return sighand;
}

/*
 * send signal info to all the members of a group
 */
int group_send_sig_info(int sig, struct kernel_siginfo *info,
                        struct task_struct *p, enum pid_type type)
{
        int ret;

        /* The permission check runs inside an RCU read-side section. */
        rcu_read_lock();
        ret = check_kill_permission(sig, info, p);
        rcu_read_unlock();

        /* A failed check, or signal 0, ends here with the check's result. */
        if (ret || !sig)
                return ret;

        return do_send_sig_info(sig, info, p, type);
}

/*
 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
 * control characters do (^C, ^Z etc)
 * - the caller must hold at least a readlock on tasklist_lock
 */
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        struct task_struct *p = NULL;
        int ret = -ESRCH;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);

                /*
                 * Once one delivery has succeeded ret is 0 and stays 0.
                 * Until then it tracks the most recent error, so a group
                 * with no success yields the last error, and an empty
                 * group yields -ESRCH.
                 */
                if (ret)
                        ret = err;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return ret;
}

/*
 * Send @sig with @info to the thread group of the task attached to @pid.
 *
 * Returns group_send_sig_info()'s result, or -ESRCH when no task is
 * attached to @pid.
 */
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
        struct task_struct *p;
        int error = -ESRCH;

        for (;;) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
                rcu_read_unlock();

                /*
                 * Go round again only when a task was found but sending
                 * failed with -ESRCH: the task was unhashed in between.
                 * If it is dead, pid_task() will return NULL next time; if
                 * we raced with de_thread() it will find the new leader.
                 */
                if (likely(!p || error != -ESRCH))
                        break;
        }

        return error;
}

/*
 * Resolve the numeric @pid with find_vpid() and signal it through
 * kill_pid_info(), all inside one RCU read-side section.
 * Returns kill_pid_info()'s result.
 */
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int error;
        rcu_read_lock();
        error = kill_pid_info(sig, info, find_vpid(pid));
        rcu_read_unlock();
        return error;
}

static inline bool kill_as_cred_perm(const struct cred *cred,
                                     struct task_struct *target)
{
        const struct cred *pcred = __task_cred(target);

        return uid_eq(cred->euid, pcred->suid) ||
               uid_eq(cred->euid, pcred->uid) ||
               uid_eq(cred->uid, pcred->suid) ||
               uid_eq(cred->uid, pcred->uid);
}

/*
 * The usb asyncio usage of siginfo is wrong.  The glibc support
 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
 * AKA after the generic fields:
 *        kernel_pid_t        si_pid;
 *        kernel_uid32_t        si_uid;
 *        sigval_t        si_value;
 *
 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
 * after the generic fields is:
 *        void __user         *si_addr;
 *
 * This is a practical problem when there is a 64bit big endian kernel
 * and a 32bit userspace.  As the 32bit address will be encoded in the low
 * 32bits of the pointer.  Those low 32bits will be stored at a higher
 * address than they appear in a 32 bit pointer.  So userspace will not
 * see the address it was expecting for its completions.
 *
 * There is nothing in the encoding that can allow
 * copy_siginfo_to_user32 to detect this confusion of formats, so
 * handle this by requiring the caller of kill_pid_usb_asyncio to
 * notice when this situation takes place and to store the 32bit
 * pointer in sival_int, instead of sival_addr of the sigval_t addr
 * parameter.
 */
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
                         struct pid *pid, const struct cred *cred)
{
        struct kernel_siginfo info;
        struct task_struct *p;
        unsigned long flags;
        int ret;

        if (!valid_signal(sig))
                return -EINVAL;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = errno;
        info.si_code = SI_ASYNCIO;
        /* @addr is stored at the position of si_pid. */
        *((sigval_t *)&info.si_pid) = addr;

        rcu_read_lock();
        p = pid_task(pid, PIDTYPE_PID);
        if (!p) {
                ret = -ESRCH;
                goto out_unlock;
        }
        /* Permission is judged against @cred, not against current. */
        if (!kill_as_cred_perm(cred, p)) {
                ret = -EPERM;
                goto out_unlock;
        }
        ret = security_task_kill(p, &info, sig, cred);
        if (ret)
                goto out_unlock;

        /* Signal 0 stops after the checks; ret is 0 at this point. */
        if (!sig)
                goto out_unlock;

        if (!lock_task_sighand(p, &flags)) {
                ret = -ESRCH;
                goto out_unlock;
        }
        ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
        unlock_task_sighand(p, &flags);
out_unlock:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);

/*
 * kill_something_info() interprets pid in interesting ways just like kill(2).
 *
 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
 * is probably wrong.  Should make it like BSD or SYSV.
 */

static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        /* A positive pid names exactly one process. */
        if (pid > 0)
                return kill_proc_info(sig, info, pid);

        /* -INT_MIN is undefined.  Exclude this case to avoid a UBSAN warning */
        if (pid == INT_MIN)
                return -ESRCH;

        read_lock(&tasklist_lock);
        if (pid == -1) {
                /*
                 * Every process except those with pid <= 1 (as seen via
                 * task_pid_vnr()) and the caller's own thread group.
                 */
                int retval = 0, count = 0;
                struct task_struct *p;

                for_each_process(p) {
                        int err;

                        if (task_pid_vnr(p) <= 1 ||
                                        same_thread_group(p, current))
                                continue;

                        err = group_send_sig_info(sig, info, p, PIDTYPE_MAX);
                        ++count;
                        /* -EPERM never replaces an earlier result. */
                        if (err != -EPERM)
                                retval = err;
                }
                ret = count ? retval : -ESRCH;
        } else {
                /* 0 means the caller's process group, -N means group N. */
                ret = __kill_pgrp_info(sig, info,
                                pid ? find_vpid(-pid) : task_pgrp(current));
        }
        read_unlock(&tasklist_lock);

        return ret;
}

/*
 * These are for backward compatibility with the rest of the kernel source.
 */

/*
 * Send @sig with @info to the single task @p (PIDTYPE_PID).
 *
 * Returns -EINVAL when @sig fails valid_signal(), otherwise the result of
 * do_send_sig_info().
 */
int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
        /*
         * Make sure legacy kernel users don't send in bad values
         * (normal paths check this in check_kill_permission).
         */
        if (!valid_signal(sig))
                return -EINVAL;

        return do_send_sig_info(sig, info, p, PIDTYPE_PID);
}
EXPORT_SYMBOL(send_sig_info);

/*
 * Map a boolean-like @priv argument to one of the special siginfo values:
 * SEND_SIG_PRIV when @priv is non-zero, SEND_SIG_NOINFO otherwise.
 */
#define __si_special(priv) \
        ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)

/*
 * Send @sig to task @p without a caller-supplied siginfo.  @priv selects
 * the special siginfo value through __si_special().  Returns the result
 * of send_sig_info().
 */
int
send_sig(int sig, struct task_struct *p, int priv)
{
        return send_sig_info(sig, __si_special(priv), p);
}
EXPORT_SYMBOL(send_sig);

void force_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = 0;
        info.si_code = SI_KERNEL;
        info.si_pid = 0;
        info.si_uid = 0;
        force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);

/*
 * When things go south during signal handling, we
 * will force a SIGSEGV. And if the signal that caused
 * the problem was already a SIGSEGV, we'll want to
 * make sure we don't even try to deliver the signal..
 */
void force_sigsegv(int sig)
{
        struct task_struct *tsk = current;
        unsigned long flags;

        /*
         * The failing signal was itself SIGSEGV: put its handler back to
         * the default before forcing SIGSEGV again.
         */
        if (sig == SIGSEGV) {
                spin_lock_irqsave(&tsk->sighand->siglock, flags);
                tsk->sighand->action[SIGSEGV - 1].sa.sa_handler = SIG_DFL;
                spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
        }

        force_sig(SIGSEGV);
}

/*
 * Build a fault-style siginfo (@sig, @code, faulting @addr, plus the
 * architecture-specific extras where configured) and force it onto @t.
 * Returns the result of force_sig_info_to_task().
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t)
{
        struct kernel_siginfo fault;

        clear_siginfo(&fault);
        fault.si_addr  = addr;
        fault.si_code  = code;
        fault.si_errno = 0;
        fault.si_signo = sig;
#ifdef __ARCH_SI_TRAPNO
        fault.si_trapno = trapno;
#endif
#ifdef __ia64__
        fault.si_imm = imm;
        fault.si_flags = flags;
        fault.si_isr = isr;
#endif

        return force_sig_info_to_task(&fault, t);
}

/*
 * Same as force_sig_fault_to_task(), with the current task as target.
 */
int force_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
{
        return force_sig_fault_to_task(sig, code, addr
                                       ___ARCH_SI_TRAPNO(trapno)
                                       ___ARCH_SI_IA64(imm, flags, isr), current);
}

/*
 * Build a fault-style siginfo (@sig, @code, faulting @addr, plus the
 * architecture-specific extras where configured) and send it to @t with
 * send_sig_info().  Returns the result of send_sig_info().
 */
int send_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t)
{
        struct kernel_siginfo fault;

        clear_siginfo(&fault);
        fault.si_addr  = addr;
        fault.si_code  = code;
        fault.si_errno = 0;
        fault.si_signo = sig;
#ifdef __ARCH_SI_TRAPNO
        fault.si_trapno = trapno;
#endif
#ifdef __ia64__
        fault.si_imm = imm;
        fault.si_flags = flags;
        fault.si_isr = isr;
#endif

        return send_sig_info(fault.si_signo, &fault, t);
}

/*
 * Force a SIGBUS with si_code @code, address @addr and si_addr_lsb @lsb
 * onto the current task.  Warns when @code is neither BUS_MCEERR_AO nor
 * BUS_MCEERR_AR.  Returns the result of force_sig_info().
 */
int force_sig_mceerr(int code, void __user *addr, short lsb)
{
        struct kernel_siginfo mce;

        WARN_ON(!((code == BUS_MCEERR_AO) || (code == BUS_MCEERR_AR)));

        clear_siginfo(&mce);
        mce.si_addr_lsb = lsb;
        mce.si_addr = addr;
        mce.si_code = code;
        mce.si_errno = 0;
        mce.si_signo = SIGBUS;

        return force_sig_info(&mce);
}

/*
 * Send a SIGBUS with si_code @code, address @addr and si_addr_lsb @lsb to
 * task @t.  Warns when @code is neither BUS_MCEERR_AO nor BUS_MCEERR_AR.
 * Returns the result of send_sig_info().
 */
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
        struct kernel_siginfo mce;

        WARN_ON(!((code == BUS_MCEERR_AO) || (code == BUS_MCEERR_AR)));

        clear_siginfo(&mce);
        mce.si_addr_lsb = lsb;
        mce.si_addr = addr;
        mce.si_code = code;
        mce.si_errno = 0;
        mce.si_signo = SIGBUS;

        return send_sig_info(mce.si_signo, &mce, t);
}
EXPORT_SYMBOL(send_sig_mceerr);

/*
 * Force a SIGSEGV/SEGV_BNDERR onto the current task, reporting the
 * address @addr together with the @lower and @upper values.
 * Returns the result of force_sig_info().
 */
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
        struct kernel_siginfo bnd;

        clear_siginfo(&bnd);
        bnd.si_upper = upper;
        bnd.si_lower = lower;
        bnd.si_addr  = addr;
        bnd.si_code  = SEGV_BNDERR;
        bnd.si_errno = 0;
        bnd.si_signo = SIGSEGV;

        return force_sig_info(&bnd);
}

#ifdef SEGV_PKUERR
/*
 * Force a SIGSEGV/SEGV_PKUERR onto the current task, reporting the
 * address @addr and the key @pkey.  Returns the result of
 * force_sig_info().
 */
int force_sig_pkuerr(void __user *addr, u32 pkey)
{
        struct kernel_siginfo pku;

        clear_siginfo(&pku);
        pku.si_pkey  = pkey;
        pku.si_addr  = addr;
        pku.si_code  = SEGV_PKUERR;
        pku.si_errno = 0;
        pku.si_signo = SIGSEGV;

        return force_sig_info(&pku);
}
#endif

/* For the crazy architectures that include trap information in
 * the errno field, instead of an actual errno value.
 */
int force_sig_ptrace_errno_trap(int errno, void __user *addr)
{
        struct kernel_siginfo trap;

        clear_siginfo(&trap);
        trap.si_addr  = addr;
        trap.si_code  = TRAP_HWBKPT;
        /* The caller's @errno value goes into si_errno as given. */
        trap.si_errno = errno;
        trap.si_signo = SIGTRAP;

        return force_sig_info(&trap);
}

/*
 * Send @sig to every task of the process group @pid.  @priv selects the
 * special siginfo value through __si_special().  tasklist_lock is read
 * locked around the call; returns __kill_pgrp_info()'s result.
 */
int kill_pgrp(struct pid *pid, int sig, int priv)
{
        int ret;

        read_lock(&tasklist_lock);
        ret = __kill_pgrp_info(sig, __si_special(priv), pid);
        read_unlock(&tasklist_lock);

        return ret;
}
EXPORT_SYMBOL(kill_pgrp);

/*
 * Send @sig to the process identified by @pid.  @priv selects the special
 * siginfo value through __si_special().  Returns kill_pid_info()'s result.
 */
int kill_pid(struct pid *pid, int sig, int priv)
{
        return kill_pid_info(sig, __si_special(priv), pid);
}
EXPORT_SYMBOL(kill_pid);

/*
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
 * afford to lose notifications of asynchronous events, like timer
 * expirations or I/O completions".  In the case of POSIX Timers
 * we allocate the sigqueue structure from the timer_create.  If this
 * allocation fails we are able to report the failure to the application
 * with an EAGAIN error.
 */
struct sigqueue *sigqueue_alloc(void)
{
        struct sigqueue *q;

        q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
        if (!q)
                return NULL;

        /* Mark the entry as preallocated. */
        q->flags |= SIGQUEUE_PREALLOC;
        return q;
}

/*
 * Release a preallocated sigqueue entry.  An entry that is still on a
 * pending list only loses its SIGQUEUE_PREALLOC flag here.
 */
void sigqueue_free(struct sigqueue *q)
{
        spinlock_t *lock = &current->sighand->siglock;
        unsigned long flags;
        int queued;

        if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
                return;

        /*
         * We must hold ->siglock while testing q->list
         * to serialize with collect_signal() or with
         * __exit_signal()->flush_sigqueue().
         */
        spin_lock_irqsave(lock, flags);
        q->flags &= ~SIGQUEUE_PREALLOC;
        queued = !list_empty(&q->list);
        spin_unlock_irqrestore(lock, flags);

        /*
         * If it is queued it will be freed when dequeued,
         * like the "regular" sigqueue.
         */
        if (!queued)
                __sigqueue_free(q);
}

/*
 * Queue the preallocated sigqueue entry @q for the task that @pid / @type
 * resolves to.  The signal number is taken from @q->info.si_signo.
 *
 * Returns:
 *    0 - @q was queued, or was already queued and only had its overrun
 *        count incremented; also returned after a warning when @q is not
 *        SIGQUEUE_PREALLOC or its si_code is not SI_TIMER
 *    1 - prepare_signal() reported the signal as ignored
 *   -1 - no task was found or its sighand could not be locked
 */
int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
        int sig = q->info.si_signo;
        struct sigpending *pending;
        struct task_struct *t;
        unsigned long flags;
        int ret, result;

        if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
                return 0;
        if (WARN_ON_ONCE(q->info.si_code != SI_TIMER))
                return 0;

        ret = -1;
        rcu_read_lock();
        t = pid_task(pid, type);
        if (!t || !likely(lock_task_sighand(t, &flags)))
                goto ret;

        ret = 1; /* the signal is ignored */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, false))
                goto out;

        ret = 0;
        if (unlikely(!list_empty(&q->list))) {
                /*
                 * If an SI_TIMER entry is already queued just increment
                 * the overrun count.
                 */
                q->info.si_overrun++;
                result = TRACE_SIGNAL_ALREADY_PENDING;
                goto out;
        }
        q->info.si_overrun = 0;

        signalfd_notify(t, sig);
        /* PIDTYPE_PID uses the private pending set, anything else the shared one. */
        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        list_add_tail(&q->list, &pending->list);
        sigaddset(&pending->signal, sig);
        complete_signal(sig, t, type);
        result = TRACE_SIGNAL_DELIVERED;
out:
        /* Reached with the sighand lock held: trace, then unlock. */
        trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
        unlock_task_sighand(t, &flags);
ret:
        rcu_read_unlock();
        return ret;
}

static void do_notify_pidfd(struct task_struct *task)
{
        struct pid *pid;

        WARN_ON(task->exit_state == 0);
        pid = task_pid(task);
        wake_up_all(&pid->wait_pidfd);
}

/*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
 *
 * @tsk: the task whose exit is being reported to @tsk->parent
 * @sig: signal to send to the parent; may be replaced by SIGCHLD or
 *       suppressed (see below)
 *
 * Returns true if our parent ignored us and so we've switched to
 * self-reaping.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
        struct kernel_siginfo info;
        unsigned long flags;
        struct sighand_struct *psig;
        bool autoreap = false;
        u64 utime, stime;

        WARN_ON_ONCE(sig == -1);

        /* do_notify_parent_cldstop should have been called instead.  */
        WARN_ON_ONCE(task_is_stopped_or_traced(tsk));

        /* Unless ptraced, only a group leader with no other threads reports. */
        WARN_ON_ONCE(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));

        /* Wake up all pidfd waiters */
        do_notify_pidfd(tsk);

        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
                 * Check if it has changed security domain.
                 */
                if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
                        sig = SIGCHLD;
        }

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = 0;
        /*
         * We are under tasklist_lock here so our parent is tied to
         * us and cannot change.
         *
         * task_active_pid_ns will always return the same pid namespace
         * until a task passes through release_task.
         *
         * write_lock() currently calls preempt_disable() which is the
         * same as rcu_read_lock(), but according to Oleg, it is not
         * correct to rely on this, hence the explicit rcu_read_lock().
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
                                       task_uid(tsk));
        rcu_read_unlock();

        /* Reported times add the task's own CPU time to the ->signal totals. */
        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
        info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);

        /*
         * Decode ->exit_code: bit 0x80 set means a core was dumped, a
         * non-zero low 7 bits is the killing signal, otherwise the exit
         * status is found in the bits above the low byte.
         */
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
                info.si_code = CLD_DUMPED;
        else if (tsk->exit_code & 0x7f)
                info.si_code = CLD_KILLED;
        else {
                info.si_code = CLD_EXITED;
                info.si_status = tsk->exit_code >> 8;
        }

        psig = tsk->parent->sighand;
        spin_lock_irqsave(&psig->siglock, flags);
        if (!tsk->ptrace && sig == SIGCHLD &&
            (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
             (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
                /*
                 * We are exiting and our parent doesn't care.  POSIX.1
                 * defines special semantics for setting SIGCHLD to SIG_IGN
                 * or setting the SA_NOCLDWAIT flag: we should be reaped
                 * automatically and not left for our parent's wait4 call.
                 * Rather than having the parent do it as a magic kind of
                 * signal handler, we just set this to tell do_exit that we
                 * can be cleaned up without becoming a zombie.  Note that
                 * we still call __wake_up_parent in this case, because a
                 * blocked sys_wait4 might now return -ECHILD.
                 *
                 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
                 * is implementation-defined: we do (if you don't want
                 * it, just use SIG_IGN instead).
                 */
                autoreap = true;
                /* With SIG_IGN no signal is sent at all. */
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
                        sig = 0;
        }
        /*
         * Send with __send_signal as si_pid and si_uid are in the
         * parent's namespaces.
         */
        if (valid_signal(sig) && sig)
                __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);

        return autoreap;
}

/**
 * do_notify_parent_cldstop - notify parent of stopped/continued state change
 * @tsk: task reporting the state change
 * @for_ptracer: the notification is for ptracer
 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
 *
 * Build a SIGCHLD siginfo describing a stop/continue/trap transition and
 * hand it to the interested parent.  When @for_ptracer is %true the report
 * is about @tsk itself and goes to @tsk->parent.  When %false the report is
 * about @tsk's group leader and goes to the leader's ->real_parent.
 *
 * SIGCHLD is queued only if the recipient neither ignores it nor has
 * SA_NOCLDSTOP set; waiters are woken regardless.
 *
 * CONTEXT:
 * Must be called with tasklist_lock at least read locked.
 */
static void do_notify_parent_cldstop(struct task_struct *tsk,
                                     bool for_ptracer, int why)
{
        struct task_struct *subject = for_ptracer ? tsk : tsk->group_leader;
        struct task_struct *recipient = for_ptracer ? subject->parent :
                                                      subject->real_parent;
        struct k_sigaction *chld_action;
        struct sighand_struct *psighand;
        struct kernel_siginfo info;
        unsigned long irqflags;
        u64 utime, stime;

        clear_siginfo(&info);
        info.si_signo = SIGCHLD;
        info.si_errno = 0;
        info.si_code = why;

        /*
         * see comment in do_notify_parent() about the pid/uid lookup
         * done under rcu_read_lock()
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(subject, task_active_pid_ns(recipient));
        info.si_uid = from_kuid_munged(task_cred_xxx(recipient, user_ns),
                                       task_uid(subject));
        rcu_read_unlock();

        task_cputime(subject, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime);
        info.si_stime = nsec_to_clock_t(stime);

        /* si_status depends on the kind of event; anything else is a bug. */
        if (why == CLD_CONTINUED)
                info.si_status = SIGCONT;
        else if (why == CLD_STOPPED)
                info.si_status = subject->signal->group_exit_code & 0x7f;
        else if (why == CLD_TRAPPED)
                info.si_status = subject->exit_code & 0x7f;
        else
                BUG();

        psighand = recipient->sighand;
        spin_lock_irqsave(&psighand->siglock, irqflags);
        chld_action = &psighand->action[SIGCHLD-1];
        if (!(chld_action->sa.sa_handler == SIG_IGN ||
              (chld_action->sa.sa_flags & SA_NOCLDSTOP)))
                __group_send_sig_info(SIGCHLD, &info, recipient);
        /*
         * wait4 callers must be woken even when no SIGCHLD was generated.
         */
        __wake_up_parent(subject, recipient);
        spin_unlock_irqrestore(&psighand->siglock, irqflags);
}

static inline bool may_ptrace_stop(void)
{
        if (!likely(current->ptrace))
                return false;
        /*
         * Are we in the middle of do_coredump?
         * If so and our tracer is also part of the coredump stopping
         * is a deadlock situation, and pointless because our tracer
         * is dead so don't allow us to stop.
         * If SIGKILL was already sent before the caller unlocked
         * ->siglock we must see ->core_state != NULL. Otherwise it
         * is safe to enter schedule().
         *
         * This is almost outdated, a task with the pending SIGKILL can't
         * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
         * after SIGKILL was already dequeued.
         */
        if (unlikely(current->mm->core_state) &&
            unlikely(current->mm == current->parent->mm))
                return false;

        return true;
}


/*
 * This must be called with current->sighand->siglock held.
 *
 * This should be the path for all ptrace stops.
 * We always set current->last_siginfo while stopped here.
 * That makes it a way to test a stopped process for
 * being ptrace-stopped vs being job-control-stopped.
 *
 * If we actually decide not to stop at all because the tracer
 * is gone, we keep current->exit_code unless clear_code.
 *
 * Parameters:
 *   exit_code  - stored in current->exit_code for the duration of the stop.
 *   why        - CLD_* code forwarded to do_notify_parent_cldstop();
 *                CLD_STOPPED additionally triggers group stop bookkeeping.
 *   clear_code - if nonzero and we end up not stopping, current->exit_code
 *                is reset to 0.
 *   info       - stored in current->last_siginfo while stopped; may be NULL.
 *
 * The siglock is dropped and re-taken inside; it is held again on return.
 */
static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
        __releases(&current->sighand->siglock)
        __acquires(&current->sighand->siglock)
{
        /* true once this task completed a group stop while trapping */
        bool gstop_done = false;

        if (arch_ptrace_stop_needed(exit_code, info)) {
                /*
                 * The arch code has something special to do before a
                 * ptrace stop.  This is allowed to block, e.g. for faults
                 * on user stack pages.  We can't keep the siglock while
                 * calling arch_ptrace_stop, so we must release it now.
                 * To preserve proper semantics, we must do this before
                 * any signal bookkeeping like checking group_stop_count.
                 */
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop(exit_code, info);
                spin_lock_irq(&current->sighand->siglock);
        }

        /*
         * schedule() will not sleep if there is a pending signal that
         * can awaken the task.
         */
        set_special_state(TASK_TRACED);

        /*
         * We're committing to trapping.  TRACED should be visible before
         * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
         * Also, transition to TRACED and updates to ->jobctl should be
         * atomic with respect to siglock and should be done after the arch
         * hook as siglock is released and regrabbed across it.
         *
         *     TRACER                                    TRACEE
         *
         *     ptrace_attach()
         * [L]   wait_on_bit(JOBCTL_TRAPPING)        [S] set_special_state(TRACED)
         *     do_wait()
         *       set_current_state()                smp_wmb();
         *       ptrace_do_wait()
         *         wait_task_stopped()
         *           task_stopped_code()
         * [L]         task_is_traced()                [S] task_clear_jobctl_trapping();
         */
        smp_wmb();

        current->last_siginfo = info;
        current->exit_code = exit_code;

        /*
         * If @why is CLD_STOPPED, we're trapping to participate in a group
         * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
         * across siglock relocks since INTERRUPT was scheduled, PENDING
         * could be clear now.  We act as if SIGCONT is received after
         * TASK_TRACED is entered - ignore it.
         */
        if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
                gstop_done = task_participate_group_stop(current);

        /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
        task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
        /* @info may be NULL, so test it before looking at si_code */
        if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
                task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);

        /* entering a trap, clear TRAPPING */
        task_clear_jobctl_trapping(current);

        /* siglock is released before tasklist_lock is taken for reading */
        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
        if (may_ptrace_stop()) {
                /*
                 * Notify parents of the stop.
                 *
                 * While ptraced, there are two parents - the ptracer and
                 * the real_parent of the group_leader.  The ptracer should
                 * know about every stop while the real parent is only
                 * interested in the completion of group stop.  The states
                 * for the two don't interact with each other.  Notify
                 * separately unless they're gonna be duplicates.
                 */
                do_notify_parent_cldstop(current, true, why);
                if (gstop_done && ptrace_reparented(current))
                        do_notify_parent_cldstop(current, false, why);

                /*
                 * Don't want to allow preemption here, because
                 * sys_ptrace() needs this task to be inactive.
                 *
                 * XXX: implement read_unlock_no_resched().
                 */
                preempt_disable();
                read_unlock(&tasklist_lock);
                cgroup_enter_frozen();
                preempt_enable_no_resched();
                freezable_schedule();
                cgroup_leave_frozen(true);
        } else {
                /*
                 * By the time we got the lock, our tracer went away.
                 * Don't drop the lock yet, another tracer may come.
                 *
                 * If @gstop_done, the ptracer went away between group stop
                 * completion and here.  During detach, it would have set
                 * JOBCTL_STOP_PENDING on us and we'll re-enter
                 * TASK_STOPPED in do_signal_stop() on return, so notifying
                 * the real parent of the group stop completion is enough.
                 */
                if (gstop_done)
                        do_notify_parent_cldstop(current, false, why);

                /* tasklist protects us from ptrace_freeze_traced() */
                __set_current_state(TASK_RUNNING);
                if (clear_code)
                        current->exit_code = 0;
                read_unlock(&tasklist_lock);
        }

        /*
         * We are back.  Now reacquire the siglock before touching
         * last_siginfo, so that we are sure to have synchronized with
         * any signal-sending on another CPU that wants to examine it.
         */
        spin_lock_irq(&current->sighand->siglock);
        current->last_siginfo = NULL;

        /* LISTENING can be set only during STOP traps, clear it */
        current->jobctl &= ~JOBCTL_LISTENING;

        /*
         * Queued signals ignored us while we were stopped for tracing.
         * So check for any that we should take before resuming user mode.
         * This sets TIF_SIGPENDING, but never clears it.
         */
        recalc_sigpending_tsk(current);
}

/*
 * Fill in a siginfo describing current (pid and uid as seen from current's
 * own namespaces) and enter a ptrace stop with it.  @exit_code doubles as
 * the si_code and as the stop's exit code; clear_code is always requested.
 */
static void ptrace_do_notify(int signr, int exit_code, int why)
{
        kernel_siginfo_t trap_info;

        clear_siginfo(&trap_info);
        trap_info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
        trap_info.si_pid = task_pid_vnr(current);
        trap_info.si_code = exit_code;
        trap_info.si_signo = signr;

        /* Hand over to the debugger.  */
        ptrace_stop(exit_code, why, 1, &trap_info);
}

/*
 * Report a SIGTRAP-based ptrace event for current.  Takes and releases
 * current's siglock around the notification.
 */
void ptrace_notify(int exit_code)
{
        /*
         * The low seven bits must be exactly SIGTRAP and nothing above
         * bit 15 may be set; bits 7-15 are left unconstrained.
         */
        const int checked_bits = 0x7f | ~0xffff;

        BUG_ON((exit_code & checked_bits) != SIGTRAP);

        /* Flush any queued task work before stopping. */
        if (unlikely(current->task_works))
                task_work_run();

        spin_lock_irq(&current->sighand->siglock);
        ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
        spin_unlock_irq(&current->sighand->siglock);
}

/**
 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
 * @signr: signr causing group stop if initiating
 *
 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
 * and participate in it.  If already set, participate in the existing
 * group stop.  If participated in a group stop (and thus slept), %true is
 * returned with siglock released.
 *
 * If ptraced, this function doesn't handle stop itself.  Instead,
 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
 * untouched.  The caller must ensure that INTERRUPT trap handling takes
 * place afterwards.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which is released
 * on %true return.
 *
 * RETURNS:
 * %false if group stop is already cancelled or ptrace trap is scheduled.
 * %true if participated in group stop.
 */
static bool do_signal_stop(int signr)
        __releases(&current->sighand->siglock)
{
        struct signal_struct *sig = current->signal;

        /* No stop pending on current yet: try to initiate a new group stop. */
        if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
                unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
                struct task_struct *t;

                /* signr will be recorded in task->jobctl for retries */
                WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);

                /*
                 * Bail out if the DEQUEUED marker is no longer set or the
                 * whole group is already exiting.
                 */
                if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
                    unlikely(signal_group_exit(sig)))
                        return false;
                /*
                 * There is no group stop already in progress.  We must
                 * initiate one now.
                 *
                 * While ptraced, a task may be resumed while group stop is
                 * still in effect and then receive a stop signal and
                 * initiate another group stop.  This deviates from the
                 * usual behavior as two consecutive stop signals can't
                 * cause two group stops when !ptraced.  That is why we
                 * also check !task_is_stopped(t) below.
                 *
                 * The condition can be distinguished by testing whether
                 * SIGNAL_STOP_STOPPED is already set.  Don't generate
                 * group_exit_code in such case.
                 *
                 * This is not necessary for SIGNAL_STOP_CONTINUED because
                 * an intervening stop signal is required to cause two
                 * continued events regardless of ptrace.
                 */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        sig->group_exit_code = signr;

                /*
                 * Rebuild the count from scratch: one for every thread
                 * (current included) that accepts the pending stop below.
                 */
                sig->group_stop_count = 0;

                if (task_set_jobctl_pending(current, signr | gstop))
                        sig->group_stop_count++;

                t = current;
                while_each_thread(current, t) {
                        /*
                         * Setting state to TASK_STOPPED for a group
                         * stop is always done with the siglock held,
                         * so this check has no races.
                         */
                        if (!task_is_stopped(t) &&
                            task_set_jobctl_pending(t, signr | gstop)) {
                                sig->group_stop_count++;
                                /*
                                 * PT_SEIZED threads are told through
                                 * ptrace_trap_notify(); everyone else gets
                                 * a plain signal wakeup.
                                 */
                                if (likely(!(t->ptrace & PT_SEIZED)))
                                        signal_wake_up(t, 0);
                                else
                                        ptrace_trap_notify(t);
                        }
                }
        }

        if (likely(!current->ptrace)) {
                /* 0 means "no notification"; otherwise the CLD_* code to report */
                int notify = 0;

                /*
                 * If there are no other threads in the group, or if there
                 * is a group stop in progress and we are the last to stop,
                 * report to the parent.
                 */
                if (task_participate_group_stop(current))
                        notify = CLD_STOPPED;

                /* state is switched to TASK_STOPPED while siglock is still held */
                set_special_state(TASK_STOPPED);
                spin_unlock_irq(&current->sighand->siglock);

                /*
                 * Notify the parent of the group stop completion.  Because
                 * we're not holding either the siglock or tasklist_lock
                 * here, ptracer may attach inbetween; however, this is for
                 * group stop and should always be delivered to the real
                 * parent of the group leader.  The new ptracer will get
                 * its notification when this task transitions into
                 * TASK_TRACED.
                 */
                if (notify) {
                        read_lock(&tasklist_lock);
                        do_notify_parent_cldstop(current, false, notify);
                        read_unlock(&tasklist_lock);
                }

                /* Now we don't run again until woken by SIGCONT or SIGKILL */
                cgroup_enter_frozen();
                freezable_schedule();
                /* siglock was released above and is not re-taken here */
                return true;
        } else {
                /*
                 * While ptraced, group stop is handled by STOP trap.
                 * Schedule it and let the caller deal with it.
                 */
                task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
                return false;
        }
}

/**
 * do_jobctl_trap - take care of ptrace jobctl traps
 *
 * When PT_SEIZED, it's used for both group stop and explicit
 * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
 * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
 * the stop signal; otherwise, %SIGTRAP.
 *
 * When !PT_SEIZED, it's used only for group stop trap with stop signal
 * number as exit_code and no siginfo.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which may be
 * released and re-acquired before returning with intervening sleep.
 */
static void do_jobctl_trap(void)
{
        int stop_sig = current->jobctl & JOBCTL_STOP_SIGMASK;
        struct signal_struct *sig;

        /* Old-style attach: plain group stop trap, no siginfo. */
        if (!(current->ptrace & PT_SEIZED)) {
                WARN_ON_ONCE(!stop_sig);
                ptrace_stop(stop_sig, CLD_STOPPED, 0, NULL);
                current->exit_code = 0;
                return;
        }

        /*
         * Seized: report PTRACE_EVENT_STOP.  Without a group stop in
         * progress or completed, the trap is reported as SIGTRAP instead
         * of the recorded stop signal.
         */
        sig = current->signal;
        if (!(sig->group_stop_count || (sig->flags & SIGNAL_STOP_STOPPED)))
                stop_sig = SIGTRAP;

        WARN_ON_ONCE(!stop_sig);
        ptrace_do_notify(stop_sig, stop_sig | (PTRACE_EVENT_STOP << 8),
                         CLD_STOPPED);
}

/**
 * do_freezer_trap - handle the freezer jobctl trap
 *
 * Puts the task into frozen state, if only the task is not about to quit.
 * In this case it drops JOBCTL_TRAP_FREEZE.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held,
 * which is always released before returning.
 */
static void do_freezer_trap(void)
        __releases(&current->sighand->siglock)
{
        /*
         * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
         * let's make another loop to give it a chance to be handled.
         * In any case, we'll return back.
         *
         * The test below is also true when JOBCTL_TRAP_FREEZE itself is
         * not set; either way the siglock is dropped and we do not sleep.
         *
         * NOTE(review): JOBCTL_TRAP_FREEZE is not cleared anywhere in this
         * function; presumably that happens elsewhere - confirm.
         */
        if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
             JOBCTL_TRAP_FREEZE) {
                spin_unlock_irq(&current->sighand->siglock);
                return;
        }

        /*
         * Now we're sure that there is no pending fatal signal and no
         * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
         * immediately (if there is a non-fatal signal pending), and
         * put the task into sleep.
         *
         * The task state is changed and the flag cleared while the siglock
         * is still held; the lock is dropped only afterwards.
         */
        __set_current_state(TASK_INTERRUPTIBLE);
        clear_thread_flag(TIF_SIGPENDING);
        spin_unlock_irq(&current->sighand->siglock);
        cgroup_enter_frozen();
        freezable_schedule();

        /*
         * We could've been woken by task_work, run it to clear
         * TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
         */
        clear_notify_signal();
        if (unlikely(READ_ONCE(current->task_works)))
                task_work_run();
}

/*
 * Let the tracer inspect a dequeued signal.  Returns the signal number to
 * act on afterwards, or 0 when the tracer cancelled it or when the
 * resulting signal is blocked and was requeued.
 */
static int ptrace_signal(int signr, kernel_siginfo_t *info)
{
        int resumed_sig;

        /*
         * The marker is set unconditionally, without checking
         * sig_kernel_stop(signr), because the debugger may change signr.
         * It only matters if we go on to stop after ptrace_stop()
         * returns: do_signal_stop() checks it, and we should only stop
         * if SIGCONT did not clear it while we slept. See also the
         * comment in dequeue_signal().
         */
        current->jobctl |= JOBCTL_STOP_DEQUEUED;
        ptrace_stop(signr, CLD_TRAPPED, 0, info);

        /* Back from the stop: exit_code carries the tracer's verdict. */
        resumed_sig = current->exit_code;
        if (!resumed_sig)
                return 0;

        current->exit_code = 0;

        /*
         * A different signal number means the old siginfo no longer
         * applies, so rebuild it as if the tracer had sent the signal.
         * A debugger that wanted specific siginfo contents should have
         * updated *info via PTRACE_SETSIGINFO.
         */
        if (info->si_signo != resumed_sig) {
                clear_siginfo(info);
                info->si_signo = resumed_sig;
                info->si_errno = 0;
                info->si_code = SI_USER;
                rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
                info->si_uid = from_kuid_munged(current_user_ns(),
                                                task_uid(current->parent));
                rcu_read_unlock();
        }

        /* Deliverable right away unless the (new) signal is blocked. */
        if (!sigismember(&current->blocked, resumed_sig))
                return resumed_sig;

        /* Blocked: put it back on the queue and report nothing to do. */
        send_signal(resumed_sig, info, current, PIDTYPE_PID);
        return 0;
}

/*
 * get_signal - find the next signal that current has to act on
 * @ksig: filled in on the way out: ->info holds the dequeued siginfo, ->ka a
 *        copy of the action when a handler has to run, ->sig the signal
 *        number (0 when nothing is left to deliver)
 *
 * Loops with the siglock held (dropping it whenever it has to sleep or
 * notify) and deals in-kernel with parent CLD notifications, group stops,
 * jobctl and freezer traps, the tracer, ignored signals, default stop
 * actions and fatal signals.
 *
 * Returns true when @ksig->sig is a positive signal number: either a signal
 * with a handler installed, or a fatal signal seen by a PF_IO_WORKER thread,
 * which is left to exit on its own.  The early "return false" paths before
 * the relock label leave @ksig->sig untouched.
 */
bool get_signal(struct ksignal *ksig)
{
        struct sighand_struct *sighand = current->sighand;
        struct signal_struct *signal = current->signal;
        int signr;

        if (unlikely(current->task_works))
                task_work_run();

        /*
         * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
         * that the arch handlers don't all have to do it. If we get here
         * without TIF_SIGPENDING, just exit after running signal work.
         */
        if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
                if (test_thread_flag(TIF_NOTIFY_SIGNAL))
                        tracehook_notify_signal();
                if (!task_sigpending(current))
                        return false;
        }

        if (unlikely(uprobe_deny_signal()))
                return false;

        /*
         * Do this once, we can't return to user-mode if freezing() == T.
         * do_signal_stop() and ptrace_stop() do freezable_schedule() and
         * thus do not need another check after return.
         */
        try_to_freeze();

        /* Every path that dropped the siglock comes back here to retake it. */
relock:
        spin_lock_irq(&sighand->siglock);

        /*
         * Every stopped thread goes here after wakeup. Check to see if
         * we should notify the parent, prepare_signal(SIGCONT) encodes
         * the CLD_ si_code into SIGNAL_CLD_MASK bits.
         */
        if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
                int why;

                if (signal->flags & SIGNAL_CLD_CONTINUED)
                        why = CLD_CONTINUED;
                else
                        why = CLD_STOPPED;

                /* consume the request before dropping the lock */
                signal->flags &= ~SIGNAL_CLD_MASK;

                spin_unlock_irq(&sighand->siglock);

                /*
                 * Notify the parent that we're continuing.  This event is
                 * always per-process and doesn't make whole lot of sense
                 * for ptracers, who shouldn't consume the state via
                 * wait(2) either, but, for backward compatibility, notify
                 * the ptracer of the group leader too unless it's gonna be
                 * a duplicate.
                 */
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, false, why);

                if (ptrace_reparented(current->group_leader))
                        do_notify_parent_cldstop(current->group_leader,
                                                true, why);
                read_unlock(&tasklist_lock);

                goto relock;
        }

        /* Has this task already been marked for death? */
        if (signal_group_exit(signal)) {
                ksig->info.si_signo = signr = SIGKILL;
                sigdelset(&current->pending.signal, SIGKILL);
                trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
                                &sighand->action[SIGKILL - 1]);
                recalc_sigpending();
                /* jumps into the loop body below, with the siglock still held */
                goto fatal;
        }

        for (;;) {
                struct k_sigaction *ka;

                /* do_signal_stop() returns true only after releasing the siglock */
                if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
                    do_signal_stop(0))
                        goto relock;

                if (unlikely(current->jobctl &
                             (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
                        /*
                         * do_jobctl_trap() comes back with the siglock held,
                         * so it is dropped here; do_freezer_trap() always
                         * releases it itself.
                         */
                        if (current->jobctl & JOBCTL_TRAP_MASK) {
                                do_jobctl_trap();
                                spin_unlock_irq(&sighand->siglock);
                        } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
                                do_freezer_trap();

                        goto relock;
                }

                /*
                 * If the task is leaving the frozen state, let's update
                 * cgroup counters and reset the frozen bit.
                 */
                if (unlikely(cgroup_task_frozen(current))) {
                        spin_unlock_irq(&sighand->siglock);
                        cgroup_leave_frozen(false);
                        goto relock;
                }

                /*
                 * Signals generated by the execution of an instruction
                 * need to be delivered before any other pending signals
                 * so that the instruction pointer in the signal stack
                 * frame points to the faulting instruction.
                 */
                signr = dequeue_synchronous_signal(&ksig->info);
                if (!signr)
                        signr = dequeue_signal(current, &current->blocked, &ksig->info);

                if (!signr)
                        break; /* will return 0 */

                /* SIGKILL is never shown to the tracer */
                if (unlikely(current->ptrace) && signr != SIGKILL) {
                        signr = ptrace_signal(signr, &ksig->info);
                        if (!signr)
                                continue;
                }

                ka = &sighand->action[signr-1];

                /* Trace actually delivered signals. */
                trace_signal_deliver(signr, &ksig->info, ka);

                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
                        /* Run the handler.  */
                        ksig->ka = *ka;

                        /* one-shot handlers revert to the default after this copy */
                        if (ka->sa.sa_flags & SA_ONESHOT)
                                ka->sa.sa_handler = SIG_DFL;

                        break; /* will return non-zero "signr" value */
                }

                /*
                 * Now we are doing the default action for this signal.
                 */
                if (sig_kernel_ignore(signr)) /* Default is nothing. */
                        continue;

                /*
                 * Global init gets no signals it doesn't want.
                 * Container-init gets no signals it doesn't want from same
                 * container.
                 *
                 * Note that if global/container-init sees a sig_kernel_only()
                 * signal here, the signal must have been generated internally
                 * or must have come from an ancestor namespace. In either
                 * case, the signal cannot be dropped.
                 */
                if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
                                !sig_kernel_only(signr))
                        continue;

                if (sig_kernel_stop(signr)) {
                        /*
                         * The default action is to stop all threads in
                         * the thread group.  The job control signals
                         * do nothing in an orphaned pgrp, but SIGSTOP
                         * always works.  Note that siglock needs to be
                         * dropped during the call to is_orphaned_pgrp()
                         * because of lock ordering with tasklist_lock.
                         * This allows an intervening SIGCONT to be posted.
                         * We need to check for that and bail out if necessary.
                         */
                        if (signr != SIGSTOP) {
                                spin_unlock_irq(&sighand->siglock);

                                /* signals can be posted during this window */

                                if (is_current_pgrp_orphaned())
                                        goto relock;

                                spin_lock_irq(&sighand->siglock);
                        }

                        if (likely(do_signal_stop(ksig->info.si_signo))) {
                                /* It released the siglock.  */
                                goto relock;
                        }

                        /*
                         * We didn't actually stop, due to a race
                         * with SIGCONT or something like that.
                         */
                        continue;
                }

                /*
                 * Reached either by falling through (default action is
                 * fatal) or via the goto from the group-exit check above.
                 */
        fatal:
                spin_unlock_irq(&sighand->siglock);
                if (unlikely(cgroup_task_frozen(current)))
                        cgroup_leave_frozen(true);

                /*
                 * Anything else is fatal, maybe with a core dump.
                 */
                current->flags |= PF_SIGNALED;

                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
                                print_fatal_signal(ksig->info.si_signo);
                        proc_coredump_connector(current);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
                         * their demise.  If we lost the race with another
                         * thread getting here, it set group_exit_code
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
                        do_coredump(&ksig->info);
                }

                /*
                 * PF_IO_WORKER threads will catch and exit on fatal signals
                 * themselves. They have cleanup that must be performed, so
                 * we cannot call do_exit() on their behalf.
                 */
                if (current->flags & PF_IO_WORKER)
                        goto out;

                /*
                 * Death signals, no core dump.
                 */
                do_group_exit(ksig->info.si_signo);
                /* NOTREACHED */
        }
        /* only the two "break" exits above get here, with the siglock held */
        spin_unlock_irq(&sighand->siglock);
out:
        ksig->sig = signr;
        return ksig->sig > 0;
}

/**
 * signal_delivered - 
 * @ksig:                kernel signal struct
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * This function should be called when a signal has successfully been
 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
 * is set in @ksig->ka.sa.sa_flags.  Tracing is notified.
 */
static void signal_delivered(struct ksignal *ksig, int stepping)
{
        sigset_t blocked;

        /* A signal was successfully delivered, and the
           saved sigmask was stored on the signal frame,
           and will be restored by sigreturn.  So we can
           simply clear the restore sigmask flag.  */
        clear_restore_sigmask();

        sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
        if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
                sigaddset(&blocked, ksig->sig);
        set_current_blocked(&blocked);
        tracehook_signal_handler(stepping);
}

/*
 * Conclude an attempt to set up a signal frame: on success complete the
 * delivery bookkeeping, on failure force a SIGSEGV for @ksig->sig.
 */
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
        if (!failed) {
                signal_delivered(ksig, stepping);
                return;
        }

        force_sigsegv(ksig->sig);
}

/*
 * It could be that complete_signal() picked us to notify about the
 * group-wide signal. Other threads should be notified now to take
 * the shared signals in @which since we will not.
 */
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
        struct task_struct *thread = tsk;
        sigset_t orphaned;

        /* Shared pending signals from @which that still need a taker. */
        sigandsets(&orphaned, &tsk->signal->shared_pending.signal, which);
        if (sigisemptyset(&orphaned))
                return;

        while_each_thread(tsk, thread) {
                /* Skip exiting threads and threads that block all of them. */
                if ((thread->flags & PF_EXITING) ||
                    !has_pending_signals(&orphaned, &thread->blocked))
                        continue;

                /* Keep only what this thread blocks; it can take the rest. */
                sigandsets(&orphaned, &orphaned, &thread->blocked);

                if (!task_sigpending(thread))
                        signal_wake_up(thread, 0);

                if (sigisemptyset(&orphaned))
                        break;
        }
}

/*
 * exit_signals - mark @tsk as exiting and hand off its signal duties
 * @tsk: the task that is exiting
 *
 * Sets PF_EXITING on @tsk.  When the thread group is empty or a group
 * exit is already in progress this is all that happens.  Otherwise,
 * under siglock, shared pending signals that @tsk had unblocked are
 * retargeted to other threads, and if @tsk completes a pending group
 * stop the parent is notified with CLD_STOPPED.
 */
void exit_signals(struct task_struct *tsk)
{
        int group_stop = 0;
        sigset_t unblocked;

        /*
         * @tsk is about to have PF_EXITING set - lock out users which
         * expect stable threadgroup.
         */
        cgroup_threadgroup_change_begin(tsk);

        /* Fast path: no retargeting is attempted, siglock is not taken. */
        if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
                tsk->flags |= PF_EXITING;
                cgroup_threadgroup_change_end(tsk);
                return;
        }

        spin_lock_irq(&tsk->sighand->siglock);
        /*
         * From now this task is not visible for group-wide signals,
         * see wants_signal(), do_signal_stop().
         */
        tsk->flags |= PF_EXITING;

        cgroup_threadgroup_change_end(tsk);

        /* Nothing pending for this task: nothing to hand over. */
        if (!task_sigpending(tsk))
                goto out;

        /* Build the complement of ->blocked: the signals @tsk could take. */
        unblocked = tsk->blocked;
        signotset(&unblocked);
        retarget_shared_pending(tsk, &unblocked);

        /* Remember to notify if this task's participation completes a stop. */
        if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
            task_participate_group_stop(tsk))
                group_stop = CLD_STOPPED;
out:
        spin_unlock_irq(&tsk->sighand->siglock);

        /*
         * If group stop has completed, deliver the notification.  This
         * should always go to the real parent of the group leader.
         */
        if (unlikely(group_stop)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(tsk, false, group_stop);
                read_unlock(&tasklist_lock);
        }
}

/*
 * System call entry points.
 */

/**
 *  sys_restart_syscall - restart a system call
 *
 *  Invokes the restart callback stored in the current task's
 *  restart_block, passing it that same restart_block, and returns
 *  whatever the callback returns.
 */
SYSCALL_DEFINE0(restart_syscall)
{
        return current->restart_block.fn(&current->restart_block);
}

/*
 * Restart callback that never restarts: @param is ignored and the
 * result is always -EINTR.
 */
long do_no_restart_syscall(struct restart_block *param)
{
        const long not_restartable = -EINTR;

        return not_restartable;
}

/*
 * Install @newset as @tsk's blocked mask and recompute the pending state.
 *
 * If @tsk has signals pending and is not alone in its thread group, the
 * signals that become blocked by this change are first offered to the
 * other threads.  Note that the "previously unblocked" side is read from
 * current->blocked, not tsk->blocked.
 */
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
        int need_retarget = task_sigpending(tsk) && !thread_group_empty(tsk);

        if (need_retarget) {
                sigset_t newly_blocked;

                /* In @newset but not in the current mask: freshly blocked. */
                sigandnsets(&newly_blocked, newset, &current->blocked);
                retarget_shared_pending(tsk, &newly_blocked);
        }

        tsk->blocked = *newset;
        recalc_sigpending();
}

/**
 * set_current_blocked - change current->blocked mask
 * @newset: new mask; SIGKILL and SIGSTOP are removed from it in place
 *
 * ->blocked must not be changed directly.  Going through this helper
 * ensures the process can't miss a shared signal we are about to block.
 */
void set_current_blocked(sigset_t *newset)
{
        const unsigned long never_blocked = sigmask(SIGKILL) | sigmask(SIGSTOP);

        sigdelsetmask(newset, never_blocked);
        __set_current_blocked(newset);
}

/*
 * Install @newset as the blocked mask of the current task, taking
 * siglock around the update.  Unlike set_current_blocked() the mask is
 * used as given.
 */
void __set_current_blocked(const sigset_t *newset)
{
        struct task_struct *me = current;

        /*
         * An unchanged mask needs no work at all.  No other task is
         * expected to modify current->blocked, so the unlocked compare
         * is fine.
         */
        if (!sigequalsets(&me->blocked, newset)) {
                spin_lock_irq(&me->sighand->siglock);
                __set_task_blocked(me, newset);
                spin_unlock_irq(&me->sighand->siglock);
        }
}

/*
 * Kernel-internal sigprocmask.  Also handy for kernel threads that want
 * to block certain signals temporarily or for good.
 *
 * NOTE! In contrast to the user-mode sys_sigprocmask(), this interface
 * will happily block "unblockable" signals such as SIGKILL and friends.
 *
 * @how selects SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; anything else
 * yields -EINVAL.  If @oldset is non-NULL it receives the previous mask,
 * and that happens even when @how turns out to be invalid.
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
        struct task_struct *me = current;
        sigset_t updated;

        /* Lockless, only current can change ->blocked, never from irq */
        if (oldset)
                *oldset = me->blocked;

        if (how == SIG_BLOCK)
                sigorsets(&updated, &me->blocked, set);
        else if (how == SIG_UNBLOCK)
                sigandnsets(&updated, &me->blocked, set);
        else if (how == SIG_SETMASK)
                updated = *set;
        else
                return -EINVAL;

        __set_current_blocked(&updated);
        return 0;
}
EXPORT_SYMBOL(sigprocmask);

/*
 * Install a signal mask supplied by the application.
 *
 * Used by syscalls such as ppoll, pselect, io_pgetevents and epoll_pwait,
 * where userland passes a new sigmask that applies for the duration of
 * the call.
 *
 * set_restore_sigmask() is done here in advance, so every call must be
 * paired with restore_saved_sigmask_unless() before returning from the
 * syscall.
 *
 * Returns 0 (also when @umask is NULL, in which case nothing changes),
 * -EINVAL for a wrong @sigsetsize, or -EFAULT if the mask can't be read.
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
        sigset_t requested;

        if (!umask)
                return 0;

        if (sigsetsize != sizeof(requested))
                return -EINVAL;

        if (copy_from_user(&requested, umask, sizeof(requested)))
                return -EFAULT;

        /* Remember the old mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of set_user_sigmask(): the mask is read from a
 * compat_sigset_t and @sigsetsize must match that type's size.
 */
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
                            size_t sigsetsize)
{
        sigset_t requested;

        if (!umask)
                return 0;

        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&requested, umask))
                return -EFAULT;

        /* Remember the old mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add, remove or set according to @how; may be null,
 *         in which case the mask is left unchanged
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
                sigset_t __user *, oset, size_t, sigsetsize)
{
        sigset_t previous, requested;
        int error;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        /* Snapshot first so @oset reports the mask from before the change. */
        previous = current->blocked;

        if (nset) {
                if (copy_from_user(&requested, nset, sizeof(requested)))
                        return -EFAULT;

                /* Userspace may never block SIGKILL or SIGSTOP. */
                sigdelsetmask(&requested, sigmask(SIGKILL)|sigmask(SIGSTOP));

                error = sigprocmask(how, &requested, NULL);
                if (error)
                        return error;
        }

        if (oset && copy_to_user(oset, &previous, sizeof(previous)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigprocmask: same flow as the native call,
 * with the masks converted from/to compat_sigset_t.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
                compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
        /* Captured up front so @oset reports the pre-call mask. */
        sigset_t previous = current->blocked;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (nset) {
                sigset_t requested;
                int error;

                if (get_compat_sigset(&requested, nset))
                        return -EFAULT;

                /* Userspace may never block SIGKILL or SIGSTOP. */
                sigdelsetmask(&requested, sigmask(SIGKILL)|sigmask(SIGSTOP));

                error = sigprocmask(how, &requested, NULL);
                if (error)
                        return error;
        }

        if (!oset)
                return 0;

        return put_compat_sigset(oset, &previous, sizeof(*oset));
}
#endif

/*
 * Fill @set with the signals that are pending for the current task
 * (private or shared) and are currently blocked by it.
 */
static void do_sigpending(sigset_t *set)
{
        struct task_struct *me = current;

        /* The two pending sets are read together under siglock. */
        spin_lock_irq(&me->sighand->siglock);
        sigorsets(set, &me->pending.signal,
                  &me->signal->shared_pending.signal);
        spin_unlock_irq(&me->sighand->siglock);

        /* Outside the lock because only this thread touches it.  */
        sigandsets(set, &me->blocked, set);
}

/**
 *  sys_rt_sigpending - examine a pending signal that has been raised
 *                        while blocked
 *  @uset: stores pending signals
 *  @sigsetsize: number of bytes to copy to @uset; must not exceed the
 *               size of sigset_t
 */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
        sigset_t pending;

        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return copy_to_user(uset, &pending, sigsetsize) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigpending: the pending set is written out
 * through put_compat_sigset().
 */
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
                compat_size_t, sigsetsize)
{
        sigset_t pending;

        /* Reject sizes larger than the compat sigset. */
        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);
        return put_compat_sigset(uset, &pending, sigsetsize);
}
#endif

/*
 * Per-signal si_code table, indexed by signal number.
 *
 *   limit  - value si_code is compared against (si_code <= limit) by
 *            known_siginfo_layout() and siginfo_layout()
 *   layout - siginfo layout that siginfo_layout() starts from for codes
 *            within the limit
 *
 * Signals below the highest listed index that have no initializer here
 * get zero-filled entries.
 */
static const struct {
        unsigned char limit, layout;
} sig_sicodes[] = {
        [SIGILL]  = { NSIGILL,  SIL_FAULT },
        [SIGFPE]  = { NSIGFPE,  SIL_FAULT },
        [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
        [SIGBUS]  = { NSIGBUS,  SIL_FAULT },
        [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
        [SIGEMT]  = { NSIGEMT,  SIL_FAULT },
#endif
        [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
        [SIGPOLL] = { NSIGPOLL, SIL_POLL },
        [SIGSYS]  = { NSIGSYS,  SIL_SYS },
};

/*
 * Report whether the (@sig, @si_code) pair is one whose siginfo layout
 * is known.
 */
static bool known_siginfo_layout(unsigned sig, int si_code)
{
        if (si_code == SI_KERNEL)
                return true;

        if (si_code > SI_USER) {
                /*
                 * Signals with their own si_code range are bounded by the
                 * table entry, everything else by NSIGPOLL.
                 */
                int limit = sig_specific_sicodes(sig) ?
                                sig_sicodes[sig].limit : NSIGPOLL;

                return si_code <= limit;
        }

        return (si_code >= SI_DETHREAD) || (si_code == SI_ASYNCNL);
}

/*
 * Map a (@sig, @si_code) pair to the siginfo layout that describes which
 * union members are in use.  SIL_KILL is the fallback.
 */
enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
{
        /* Codes outside the open range (SI_USER, SI_KERNEL). */
        if ((si_code <= SI_USER) || (si_code >= SI_KERNEL)) {
                if (si_code == SI_TIMER)
                        return SIL_TIMER;
                if (si_code == SI_SIGIO)
                        return SIL_POLL;
                if (si_code < 0)
                        return SIL_RT;
                return SIL_KILL;
        }

        /* No table entry covers this code: fall back to the poll range. */
        if ((sig >= ARRAY_SIZE(sig_sicodes)) ||
            (si_code > sig_sicodes[sig].limit))
                return (si_code <= NSIGPOLL) ? SIL_POLL : SIL_KILL;

        /* Handle the exceptions to the per-signal table layout. */
        if ((sig == SIGBUS) &&
            (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
                return SIL_FAULT_MCEERR;
        if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
                return SIL_FAULT_BNDERR;
#ifdef SEGV_PKUERR
        if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
                return SIL_FAULT_PKUERR;
#endif

        return sig_sicodes[sig].layout;
}

/*
 * Address of the bytes in a user siginfo_t that lie just past the first
 * sizeof(struct kernel_siginfo) bytes.
 */
static inline char __user *si_expansion(const siginfo_t __user *info)
{
        char __user *base = (char __user *)info;

        return base + sizeof(struct kernel_siginfo);
}

/*
 * Copy a kernel siginfo out to userspace and zero the expansion area
 * that follows it in the user structure.  Returns 0 or -EFAULT.
 */
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
{
        if (copy_to_user(to, from, sizeof(struct kernel_siginfo)))
                return -EFAULT;

        return clear_user(si_expansion(to), SI_EXPANSION_SIZE) ? -EFAULT : 0;
}

/*
 * Second stage of copying a siginfo in from userspace.  When the layout
 * of @info is not a known one, the expansion area of @from must be all
 * zeroes.  Returns 0, -EFAULT if that area can't be read, or -E2BIG if
 * it holds a non-zero byte.
 */
static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
                                       const siginfo_t __user *from)
{
        int unknown = !known_siginfo_layout(info->si_signo, info->si_code);

        if (unlikely(unknown)) {
                char tail[SI_EXPANSION_SIZE];
                int idx;

                /*
                 * An unknown si_code might need more than
                 * sizeof(struct kernel_siginfo) bytes.  Insisting that the
                 * extra bytes are all 0 guarantees copy_siginfo_to_user
                 * will hand exactly this data back to userspace.
                 */
                if (copy_from_user(tail, si_expansion(from), SI_EXPANSION_SIZE))
                        return -EFAULT;

                for (idx = 0; idx < SI_EXPANSION_SIZE; idx++) {
                        if (tail[idx])
                                return -E2BIG;
                }
        }

        return 0;
}

/*
 * Copy a siginfo in from userspace, overriding its si_signo with @signo
 * before the layout check is run.
 */
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
                                    const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(*to)))
                return -EFAULT;

        /* The caller-provided signal number wins over the user's value. */
        to->si_signo = signo;

        return post_copy_siginfo_from_user(to, from);
}

/*
 * Copy a siginfo in from userspace as is, then run the layout check.
 * Returns 0 or a negative errno.
 */
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(*to)))
                return -EFAULT;

        return post_copy_siginfo_from_user(to, from);
}

#ifdef CONFIG_COMPAT
/**
 * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
 * @to: compat siginfo destination
 * @from: kernel siginfo source
 *
 * @to is zeroed first; then the three common header fields and only the
 * members that belong to the layout reported by siginfo_layout() are
 * copied, with pointers converted through ptr_to_compat().
 *
 * Note: This function does not work properly for the SIGCHLD on x32, but
 * fortunately it doesn't have to.  The only valid callers for this function are
 * copy_siginfo_to_user32, which is overridden for x32 and the coredump code.
 * The latter does not care because SIGCHLD will never cause a coredump.
 */
void copy_siginfo_to_external32(struct compat_siginfo *to,
                const struct kernel_siginfo *from)
{
        /* Start from all-zero so fields not copied below stay zero. */
        memset(to, 0, sizeof(*to));

        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_lower = ptr_to_compat(from->si_lower);
                to->si_upper = ptr_to_compat(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_pkey = from->si_pkey;
                break;
        case SIL_CHLD:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_status = from->si_status;
                to->si_utime = from->si_utime;
                to->si_stime = from->si_stime;
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = ptr_to_compat(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
}

/*
 * Convert @from to the compat layout in a kernel-side temporary and copy
 * the result out to @to.  Returns 0 or -EFAULT.
 */
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        struct compat_siginfo ext;

        copy_siginfo_to_external32(&ext, from);

        return copy_to_user(to, &ext, sizeof(ext)) ? -EFAULT : 0;
}

/*
 * Convert a compat siginfo that is already in kernel memory (@from) into
 * a kernel siginfo (@to).
 *
 * @to is cleared first; then the three common header fields and only the
 * members that belong to the layout reported by siginfo_layout() are
 * copied, with compat pointers widened through compat_ptr().
 *
 * Always returns 0.
 */
static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
                                         const struct compat_siginfo *from)
{
        clear_siginfo(to);
        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = compat_ptr(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = compat_ptr(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = compat_ptr(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_lower = compat_ptr(from->si_lower);
                to->si_upper = compat_ptr(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = compat_ptr(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
                to->si_trapno = from->si_trapno;
#endif
                to->si_pkey = from->si_pkey;
                break;
        case SIL_CHLD:
                to->si_pid    = from->si_pid;
                to->si_uid    = from->si_uid;
                to->si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
                /* In an x32 syscall the times are read from the x32 variant. */
                if (in_x32_syscall()) {
                        to->si_utime = from->_sifields._sigchld_x32._utime;
                        to->si_stime = from->_sifields._sigchld_x32._stime;
                } else
#endif
                {
                        to->si_utime = from->si_utime;
                        to->si_stime = from->si_stime;
                }
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = compat_ptr(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
        return 0;
}

/*
 * Read a compat siginfo from userspace, override its si_signo with
 * @signo, and convert it into the kernel siginfo @to.
 */
static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
                                      const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo kfrom;

        if (copy_from_user(&kfrom, ufrom, sizeof(kfrom)))
                return -EFAULT;

        /* The caller-provided signal number wins over the user's value. */
        kfrom.si_signo = signo;

        return post_copy_siginfo_from_user32(to, &kfrom);
}

/*
 * Read a compat siginfo from userspace and convert it into the kernel
 * siginfo @to.  Returns 0 or -EFAULT.
 */
int copy_siginfo_from_user32(struct kernel_siginfo *to,
                             const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo kfrom;

        if (copy_from_user(&kfrom, ufrom, sizeof(kfrom)))
                return -EFAULT;

        return post_copy_siginfo_from_user32(to, &kfrom);
}
#endif /* CONFIG_COMPAT */

/**
 *  do_sigtimedwait - wait for queued signals specified in @which
 *  @which: queued signals to wait for
 *  @info: if non-null, the signal's siginfo is returned here
 *  @ts: upper bound on process time suspension
 *
 *  Return: the number of the dequeued signal on success, -EINVAL if @ts
 *  is not a valid timespec, -EINTR if no signal was dequeued and the
 *  sleep returned non-zero, -EAGAIN otherwise.
 */
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
                    const struct timespec64 *ts)
{
        ktime_t *to = NULL, timeout = KTIME_MAX;
        struct task_struct *tsk = current;
        sigset_t mask = *which;
        int sig, ret = 0;

        /* Without @ts the timeout stays KTIME_MAX and @to stays NULL. */
        if (ts) {
                if (!timespec64_valid(ts))
                        return -EINVAL;
                timeout = timespec64_to_ktime(*ts);
                to = &timeout;
        }

        /*
         * Invert the set of allowed signals to get those we want to block.
         */
        sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
        signotset(&mask);

        spin_lock_irq(&tsk->sighand->siglock);
        sig = dequeue_signal(tsk, &mask, info);
        /* A zero timeout means "poll only": never sleep. */
        if (!sig && timeout) {
                /*
                 * None ready, temporarily unblock those we're interested
                 * while we are sleeping in so that we'll be awakened when
                 * they arrive. Unblocking is always fine, we can avoid
                 * set_current_blocked().
                 */
                tsk->real_blocked = tsk->blocked;
                sigandsets(&tsk->blocked, &tsk->blocked, &mask);
                recalc_sigpending();
                spin_unlock_irq(&tsk->sighand->siglock);

                __set_current_state(TASK_INTERRUPTIBLE);
                ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
                                                         HRTIMER_MODE_REL);
                /* Restore the original mask before looking for a signal again. */
                spin_lock_irq(&tsk->sighand->siglock);
                __set_task_blocked(tsk, &tsk->real_blocked);
                sigemptyset(&tsk->real_blocked);
                sig = dequeue_signal(tsk, &mask, info);
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        if (sig)
                return sig;
        return ret ? -EINTR : -EAGAIN;
}

/**
 *  sys_rt_sigtimedwait - synchronously wait for queued signals specified
 *                        in @uthese
 *  @uthese: queued signals to wait for
 *  @uinfo: if non-null, the signal's siginfo is returned here
 *  @uts: upper bound on process time suspension; null means no bound
 *        is passed on to do_sigtimedwait()
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct __kernel_timespec __user *, uts,
                size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t wanted;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&wanted, uthese, sizeof(wanted)))
                return -EFAULT;

        if (uts) {
                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                limit = &ts;
        }

        ret = do_sigtimedwait(&wanted, &info, limit);

        /* Errors, or a caller not interested in the siginfo: done. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user(uinfo, &info) ? -EFAULT : ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Variant of rt_sigtimedwait whose timeout is passed as an
 * old_timespec32; otherwise the flow is the same.
 */
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct old_timespec32 __user *, uts,
                size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t wanted;
        int ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&wanted, uthese, sizeof(wanted)))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&ts, uts))
                        return -EFAULT;
                limit = &ts;
        }

        ret = do_sigtimedwait(&wanted, &info, limit);

        /* Errors, or a caller not interested in the siginfo: done. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user(uinfo, &info) ? -EFAULT : ret;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigtimedwait with a __kernel_timespec timeout: the signal
 * set and the returned siginfo use the compat layouts.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t wanted;
        long ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&wanted, uthese))
                return -EFAULT;

        if (uts) {
                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                limit = &ts;
        }

        ret = do_sigtimedwait(&wanted, &info, limit);

        /* Errors, or a caller not interested in the siginfo: done. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user32(uinfo, &info) ? -EFAULT : ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Compat rt_sigtimedwait with an old_timespec32 timeout: the signal set
 * and the returned siginfo use the compat layouts.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t wanted;
        long ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&wanted, uthese))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&ts, uts))
                        return -EFAULT;
                limit = &ts;
        }

        ret = do_sigtimedwait(&wanted, &info, limit);

        /* Errors, or a caller not interested in the siginfo: done. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user32(uinfo, &info) ? -EFAULT : ret;
}
#endif
#endif

/*
 * Fill @info as an SI_USER siginfo for @sig whose sender fields describe
 * the current task.
 */
static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
{
        clear_siginfo(info);

        /* Common header. */
        info->si_signo = sig;
        info->si_code = SI_USER;
        info->si_errno = 0;

        /* Sender identity. */
        info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
        info->si_pid = task_tgid_vnr(current);
}

/**
 *  sys_kill - send a signal to a process
 *  @pid: the PID of the process
 *  @sig: signal to be sent
 *
 *  The siginfo is built by prepare_kill_siginfo() and the delivery is
 *  delegated to kill_something_info().
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        struct kernel_siginfo info;
        int ret;

        prepare_kill_siginfo(sig, &info);
        ret = kill_something_info(sig, &info, pid);

        return ret;
}

/*
 * Check that the signaler's pid namespace is either the signalee's pid
 * namespace or one of its ancestors.
 */
static bool access_pidfd_pidns(struct pid *pid)
{
        struct pid_namespace *active = task_active_pid_ns(current);
        struct pid_namespace *ns;

        /* Walk from the target's namespace up through its parents. */
        for (ns = ns_of_pid(pid); ns; ns = ns->parent) {
                if (ns == active)
                        return true;
        }

        return false;
}

/*
 * Copy a siginfo from userspace into @kinfo, choosing between the native
 * and the compat reader.
 *
 * @kinfo: kernel siginfo to fill
 * @info:  userspace pointer to the siginfo; it is read as a
 *         struct compat_siginfo when called from a compat syscall and as
 *         a siginfo_t otherwise
 *
 * Returns whatever the selected copy helper returns.
 */
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
                                      siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
        /*
         * Avoid hooking up compat syscalls and instead handle necessary
         * conversions here. Note, this is a stop-gap measure and should not be
         * considered a generic solution.
         */
        if (in_compat_syscall())
                return copy_siginfo_from_user32(
                        kinfo, (struct compat_siginfo __user *)info);
#endif
        return copy_siginfo_from_user(kinfo, info);
}

/*
 * Resolve @file to a struct pid: try pidfd_pid() first and fall back to
 * tgid_pidfd_to_pid() when that yields an error pointer.
 */
static struct pid *pidfd_to_pid(const struct file *file)
{
        struct pid *pid = pidfd_pid(file);

        return IS_ERR(pid) ? tgid_pidfd_to_pid(file) : pid;
}

/**
 * sys_pidfd_send_signal - Signal a process through a pidfd
 * @pidfd:  file descriptor of the process
 * @sig:    signal to send
 * @info:   signal info
 * @flags:  future flags
 *
 * The syscall currently only signals via PIDTYPE_PID which covers
 * kill(<positive-pid>, <signal>). It does not signal threads or process
 * groups.
 * In order to extend the syscall to threads and process groups the @flags
 * argument should be used. In essence, the @flags argument will determine
 * what is signaled and not the file descriptor itself. Put in other words,
 * grouping is a property of the flags argument not a property of the file
 * descriptor.
 *
 * Return: 0 on success, negative errno on failure
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
                siginfo_t __user *, info, unsigned int, flags)
{
        int ret;
        struct fd f;
        struct pid *pid;
        kernel_siginfo_t kinfo;

        /* Enforce flags be set to 0 until we add an extension. */
        if (flags)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        /* Is this a pidfd? */
        pid = pidfd_to_pid(f.file);
        if (IS_ERR(pid)) {
                ret = PTR_ERR(pid);
                goto err;
        }

        /* The target must be in our pid namespace or a descendant of it. */
        ret = -EINVAL;
        if (!access_pidfd_pidns(pid))
                goto err;

        if (info) {
                ret = copy_siginfo_from_user_any(&kinfo, info);
                if (unlikely(ret))
                        goto err;

                /* The siginfo must describe the same signal as @sig. */
                ret = -EINVAL;
                if (unlikely(sig != kinfo.si_signo))
                        goto err;

                /* Only allow sending arbitrary signals to yourself. */
                ret = -EPERM;
                if ((task_pid(current) != pid) &&
                    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
                        goto err;
        } else {
                /* No siginfo supplied: build the same one sys_kill uses. */
                prepare_kill_siginfo(sig, &kinfo);
        }

        ret = kill_pid_info(sig, &kinfo, pid);

err:
        /* Reached on success as well: the fd reference is always dropped. */
        fdput(f);
        return ret;
}

/*
 * Send @sig with @info to the single task found by find_task_by_vpid(@pid).
 *
 * When @tgid is positive the task must also report that thread group id
 * through task_tgid_vnr(); a non-positive @tgid disables that check.
 *
 * Returns -ESRCH when no matching task is found, the error from
 * check_kill_permission() if the permission check fails, and otherwise
 * the result of do_send_sig_info() with -ESRCH mapped to 0.  For @sig == 0
 * only the lookup and the permission check are performed.
 */
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
        struct task_struct *p;
        int error = -ESRCH;

        /* The lookup and every use of @p stay inside the RCU read section. */
        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
                error = check_kill_permission(sig, info, p);
                /*
                 * The null signal is a permissions and process existence
                 * probe.  No signal is actually delivered.
                 */
                if (!error && sig) {
                        error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
                        /*
                         * If lock_task_sighand() failed we pretend the task
                         * dies after receiving the signal. The window is tiny,
                         * and the signal is private anyway.
                         */
                        if (unlikely(error == -ESRCH))
                                error = 0;
                }
        }
        rcu_read_unlock();

        return error;
}

/*
 * Build an SI_TKILL siginfo whose sender fields describe the current
 * task and send @sig to the task identified by @pid (and @tgid) through
 * do_send_specific().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Common header. */
        info.si_signo = sig;
        info.si_code = SI_TKILL;
        info.si_errno = 0;

        /* Sender identity. */
        info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
        info.si_pid = task_tgid_vnr(current);

        return do_send_specific(tgid, pid, sig, &info);
}

/**
 *  sys_tgkill - send signal to one specific thread
 *  @tgid: the thread group ID of the thread
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *
 *  @tgid is checked as well: -ESRCH is returned even when the PID exists
 *  but no longer belongs to the target process.  This closes the race
 *  with threads exiting and their PIDs being reused.
 *
 *  Both ids must be positive, otherwise -EINVAL is returned.
 */
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
        /* This is only valid for single tasks */
        if (tgid > 0 && pid > 0)
                return do_tkill(tgid, pid, sig);

        return -EINVAL;
}

/**
 *  sys_tkill - send signal to one specific task
 *  @pid: the PID of the task
 *  @sig: signal to be sent
 *
 *  The signal goes to exactly one task, even if it's a CLONE_THREAD
 *  task.  A non-positive @pid yields -EINVAL.
 */
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
        /* This is only valid for single tasks */
        if (pid > 0)
                return do_tkill(0, pid, sig);

        return -EINVAL;
}

/*
 * Shared tail of the native and compat rt_sigqueueinfo() entry points.
 *
 * Returns -EPERM when the supplied si_code is not acceptable for the
 * target, otherwise whatever kill_proc_info() returns.
 */
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
        /*
         * Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source
         * info.  Such codes are only let through when the caller is
         * signalling itself.
         */
        if (info->si_code >= 0 || info->si_code == SI_TKILL) {
                if (task_pid_vnr(current) != pid)
                        return -EPERM;
        }

        /* POSIX.1b doesn't mention process groups.  */
        return kill_proc_info(sig, info, pid);
}

/**
 *  sys_rt_sigqueueinfo - send a signal with caller-supplied siginfo
 *  @pid: the PID of the target
 *  @sig: signal to be sent
 *  @uinfo: signal info to be sent
 *
 *  Returns the error from copying @uinfo in, if any, otherwise the result
 *  of do_rt_sigqueueinfo().
 */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user(sig, &kinfo, uinfo);
        if (unlikely(err))
                return err;

        return do_rt_sigqueueinfo(pid, sig, &kinfo);
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of rt_sigqueueinfo(): the siginfo is read through
 * __copy_siginfo_from_user32(), after which the common helper is used.
 */
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user32(sig, &kinfo, uinfo);
        if (unlikely(err))
                return err;

        return do_rt_sigqueueinfo(pid, sig, &kinfo);
}
#endif

/*
 * Shared tail of the native and compat rt_tgsigqueueinfo() entry points.
 *
 * Returns -EINVAL for non-positive ids, -EPERM when the supplied si_code
 * is not acceptable for the target, otherwise the result of
 * do_send_specific().
 */
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
        /* This is only valid for single tasks */
        if (tgid <= 0 || pid <= 0)
                return -EINVAL;

        /*
         * Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source
         * info.  Such codes are only let through when the caller is
         * signalling itself.
         */
        if (info->si_code >= 0 || info->si_code == SI_TKILL) {
                if (task_pid_vnr(current) != pid)
                        return -EPERM;
        }

        return do_send_specific(tgid, pid, sig, info);
}

/*
 * rt_tgsigqueueinfo(): copy the caller's siginfo in and hand it, together
 * with the thread group and thread ids, to do_rt_tgsigqueueinfo().
 */
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user(sig, &kinfo, uinfo);
        if (unlikely(err))
                return err;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &kinfo);
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of rt_tgsigqueueinfo(): the siginfo is read through
 * __copy_siginfo_from_user32(), after which the common helper is used.
 */
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
                        compat_pid_t, tgid,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user32(sig, &kinfo, uinfo);
        if (unlikely(err))
                return err;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &kinfo);
}
#endif

/*
 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
 *
 * Installs @action as the handler of @sig for the current task.  When the
 * new handler is SIG_IGN, already queued instances of @sig are flushed from
 * both the shared and the per-task pending sets.  @sig is used as an array
 * index without being range checked here.
 */
void kernel_sigaction(int sig, __sighandler_t action)
{
        sigset_t only_sig;

        spin_lock_irq(&current->sighand->siglock);
        current->sighand->action[sig - 1].sa.sa_handler = action;
        if (action != SIG_IGN)
                goto unlock;

        /* Build a one-signal mask and drop everything it matches. */
        sigemptyset(&only_sig);
        sigaddset(&only_sig, sig);

        flush_sigqueue_mask(&only_sig, &current->signal->shared_pending);
        flush_sigqueue_mask(&only_sig, &current->pending);
        recalc_sigpending();
unlock:
        spin_unlock_irq(&current->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);

/*
 * Default sigaction_compat_abi(): does nothing.  It is declared __weak so
 * that another definition can take its place at link time.  do_sigaction()
 * calls it with the new and old actions while holding the siglock.
 */
void __weak sigaction_compat_abi(struct k_sigaction *act,
                struct k_sigaction *oact)
{
}

/*
 * do_sigaction - read and/or replace the current task's action for @sig
 * @sig:  signal number; must pass valid_signal() and be at least 1
 * @act:  new action to install, or NULL to leave the action unchanged
 * @oact: if non-NULL, receives a copy of the action in effect on entry
 *
 * Returns 0 on success, or -EINVAL when @sig is out of range or when @act
 * is supplied for a signal that sig_kernel_only() reports.
 *
 * Note that @act itself is modified: SIGKILL and SIGSTOP are removed from
 * its sa_mask before it is stored.
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
        struct task_struct *p = current, *t;
        struct k_sigaction *k;
        sigset_t mask;

        if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
                return -EINVAL;

        /* Slot of @sig in the action table; signals are numbered from 1. */
        k = &p->sighand->action[sig-1];

        spin_lock_irq(&p->sighand->siglock);
        /* Snapshot the old action before it can be overwritten below. */
        if (oact)
                *oact = *k;

        sigaction_compat_abi(act, oact);

        if (act) {
                /* SIGKILL and SIGSTOP are never kept in a handler's mask. */
                sigdelsetmask(&act->sa.sa_mask,
                              sigmask(SIGKILL) | sigmask(SIGSTOP));
                *k = *act;
                /*
                 * POSIX 3.3.1.3:
                 *  "Setting a signal action to SIG_IGN for a signal that is
                 *   pending shall cause the pending signal to be discarded,
                 *   whether or not it is blocked."
                 *
                 *  "Setting a signal action to SIG_DFL for a signal that is
                 *   pending and whose default action is to ignore the signal
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
                if (sig_handler_ignored(sig_handler(p, sig), sig)) {
                        /* Flush @sig from the shared set and every thread. */
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        flush_sigqueue_mask(&mask, &p->signal->shared_pending);
                        for_each_thread(p, t)
                                flush_sigqueue_mask(&mask, &t->pending);
                }
        }

        spin_unlock_irq(&p->sighand->siglock);
        return 0;
}

/*
 * do_sigaltstack - query and/or change the current task's alternate stack
 * @ss:          new settings (kernel copy), or NULL to change nothing
 * @oss:         if non-NULL, filled with the settings in effect on entry
 * @sp:          user stack pointer used for the "on the signal stack" tests
 * @min_ss_size: smallest acceptable size for an enabled stack
 *
 * Returns 0 on success, -EPERM when a change is requested while running on
 * the alternate stack, -EINVAL for an unknown mode and -ENOMEM when the
 * requested stack is smaller than @min_ss_size.
 */
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
                size_t min_ss_size)
{
        struct task_struct *t = current;
        void __user *new_sp;
        size_t new_size;
        unsigned new_flags;
        int mode;

        /* Report the old settings first; they are read before any change. */
        if (oss) {
                memset(oss, 0, sizeof(stack_t));
                oss->ss_sp = (void __user *) t->sas_ss_sp;
                oss->ss_size = t->sas_ss_size;
                oss->ss_flags = sas_ss_flags(sp) |
                        (t->sas_ss_flags & SS_FLAG_BITS);
        }

        if (!ss)
                return 0;

        new_sp = ss->ss_sp;
        new_size = ss->ss_size;
        new_flags = ss->ss_flags;

        if (unlikely(on_sig_stack(sp)))
                return -EPERM;

        /* The mode is what remains once the SS_FLAG_BITS are masked off. */
        mode = new_flags & ~SS_FLAG_BITS;
        if (mode == SS_DISABLE) {
                new_size = 0;
                new_sp = NULL;
        } else if (mode == SS_ONSTACK || mode == 0) {
                if (unlikely(new_size < min_ss_size))
                        return -ENOMEM;
        } else {
                return -EINVAL;
        }

        t->sas_ss_sp = (unsigned long) new_sp;
        t->sas_ss_size = new_size;
        t->sas_ss_flags = new_flags;
        return 0;
}

/*
 * sigaltstack(): copy the optional new settings in, let do_sigaltstack()
 * do the work with MINSIGSTKSZ as the size floor, and copy the optional old
 * settings back out.  Returns -EFAULT when either user copy fails.
 */
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
        stack_t kss, koss;
        int ret;

        if (uss && copy_from_user(&kss, uss, sizeof(stack_t)))
                return -EFAULT;

        ret = do_sigaltstack(uss ? &kss : NULL, uoss ? &koss : NULL,
                             current_user_stack_pointer(),
                             MINSIGSTKSZ);
        if (ret || !uoss)
                return ret;

        return copy_to_user(uoss, &koss, sizeof(stack_t)) ? -EFAULT : 0;
}

/*
 * restore_altstack - apply alternate-stack settings read from user memory
 * @uss: user pointer to the settings to apply
 *
 * Returns -EFAULT when @uss cannot be read and 0 otherwise; the result of
 * do_sigaltstack() is deliberately discarded.
 */
int restore_altstack(const stack_t __user *uss)
{
        stack_t kss;

        if (copy_from_user(&kss, uss, sizeof(stack_t)))
                return -EFAULT;

        /* squash all but EFAULT for now */
        (void)do_sigaltstack(&kss, NULL, current_user_stack_pointer(),
                             MINSIGSTKSZ);
        return 0;
}

/*
 * __save_altstack - write the current alternate-stack settings to user memory
 * @uss: user destination
 * @sp:  not used by this function
 *
 * All three members are always written; the results of the __put_user()
 * calls are OR-ed together and returned when non-zero.  After a successful
 * write the settings are reset via sas_ss_reset() if SS_AUTODISARM is set.
 */
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        if (!err && (t->sas_ss_flags & SS_AUTODISARM))
                sas_ss_reset(t);
        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat backend for sigaltstack: converts between compat_stack_t and
 * stack_t around a call to do_sigaltstack(), using the compat user stack
 * pointer and COMPAT_MINSIGSTKSZ.
 *
 * Either pointer may be NULL.  Returns -EFAULT when a user copy fails,
 * otherwise the result of do_sigaltstack().
 */
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
                                 compat_stack_t __user *uoss_ptr)
{
        stack_t new_ss, old_ss;
        compat_stack_t ss32;
        int ret;

        if (uss_ptr) {
                if (copy_from_user(&ss32, uss_ptr, sizeof(compat_stack_t)))
                        return -EFAULT;
                new_ss.ss_sp = compat_ptr(ss32.ss_sp);
                new_ss.ss_flags = ss32.ss_flags;
                new_ss.ss_size = ss32.ss_size;
        }

        /* The old settings are always collected, even if not asked for. */
        ret = do_sigaltstack(uss_ptr ? &new_ss : NULL, &old_ss,
                             compat_user_stack_pointer(),
                             COMPAT_MINSIGSTKSZ);
        if (ret < 0 || !uoss_ptr)
                return ret;

        /* Zero the whole compat structure before filling and copying it. */
        memset(&ss32, 0, sizeof(ss32));
        ss32.ss_sp = ptr_to_compat(old_ss.ss_sp);
        ss32.ss_flags = old_ss.ss_flags;
        ss32.ss_size = old_ss.ss_size;
        if (copy_to_user(uoss_ptr, &ss32, sizeof(compat_stack_t)))
                ret = -EFAULT;

        return ret;
}

/* Compat sigaltstack(): a direct wrapper around do_compat_sigaltstack(). */
COMPAT_SYSCALL_DEFINE2(sigaltstack,
                        const compat_stack_t __user *, uss_ptr,
                        compat_stack_t __user *, uoss_ptr)
{
        int ret = do_compat_sigaltstack(uss_ptr, uoss_ptr);

        return ret;
}

/*
 * compat_restore_altstack - apply compat alternate-stack settings
 * @uss: user pointer to the compat settings
 *
 * Returns -EFAULT when do_compat_sigaltstack() reports it; every other
 * result, including other errors, is turned into 0.
 */
int compat_restore_altstack(const compat_stack_t __user *uss)
{
        /* squash all but -EFAULT for now */
        if (do_compat_sigaltstack(uss, NULL) == -EFAULT)
                return -EFAULT;

        return 0;
}

/*
 * __compat_save_altstack - write the alternate-stack settings in compat form
 * @uss: user destination
 * @sp:  not used by this function
 *
 * All three members are always written; the results of the __put_user()
 * calls are OR-ed together and returned when non-zero.  After a successful
 * write the settings are reset via sas_ss_reset() if SS_AUTODISARM is set.
 */
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
                         &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        if (!err && (t->sas_ss_flags & SS_AUTODISARM))
                sas_ss_reset(t);
        return err;
}
#endif

#ifdef __ARCH_WANT_SYS_SIGPENDING

/**
 *  sys_sigpending - examine pending signals
 *  @uset: where mask of pending signal is returned
 *
 *  Only the first sizeof(old_sigset_t) bytes of the pending set are copied
 *  to @uset.  Returns 0 on success and -EFAULT when @uset cannot be
 *  written.
 */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
        sigset_t set;

        /*
         * The copy below reads sizeof(old_sigset_t) bytes out of @set, so
         * @set is the object the size has to be checked against.  (The
         * former comparison with sizeof(*uset) compared old_sigset_t with
         * itself and could never be true.)
         */
        if (sizeof(old_sigset_t) > sizeof(set))
                return -EINVAL;

        do_sigpending(&set);

        if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sigpending(): store the first word of the pending set at @set32
 * and return the result of put_user().
 */
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
        sigset_t pending;

        do_sigpending(&pending);
        return put_user(pending.sig[0], set32);
}
#endif

#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
 *  sys_sigprocmask - examine and change blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *
 * Only the first word of the blocked set is read or replaced here.
 * Returns 0 on success, -EFAULT when a user copy fails and -EINVAL for an
 * unknown @how (checked only when @nset is non-null).
 *
 * Some platforms have their own version with special arguments;
 * others support only sys_rt_sigprocmask.
 */

SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
                old_sigset_t __user *, oset)
{
        old_sigset_t prev_mask, req_mask;
        sigset_t blocked;

        /* Remember the mask as it was before any change is applied. */
        prev_mask = current->blocked.sig[0];

        if (nset) {
                if (copy_from_user(&req_mask, nset, sizeof(*nset)))
                        return -EFAULT;

                blocked = current->blocked;

                if (how == SIG_BLOCK)
                        sigaddsetmask(&blocked, req_mask);
                else if (how == SIG_UNBLOCK)
                        sigdelsetmask(&blocked, req_mask);
                else if (how == SIG_SETMASK)
                        blocked.sig[0] = req_mask;
                else
                        return -EINVAL;

                set_current_blocked(&blocked);
        }

        if (oset && copy_to_user(oset, &prev_mask, sizeof(*oset)))
                return -EFAULT;

        return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */

#ifndef CONFIG_ODD_RT_SIGACTION
/**
 *  sys_rt_sigaction - alter an action taken by a process
 *  @sig: signal whose action is examined and/or changed
 *  @act: new sigaction
 *  @oact: used to save the previous sigaction
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns 0 on success, -EINVAL when @sigsetsize is not sizeof(sigset_t),
 *  -EFAULT when a user copy fails, or the error from do_sigaction().
 */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct sigaction __user *, act,
                struct sigaction __user *, oact,
                size_t, sigsetsize)
{
        struct k_sigaction new_ka, old_ka;
        int err;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (act && copy_from_user(&new_ka.sa, act, sizeof(new_ka.sa)))
                return -EFAULT;

        err = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (err || !oact)
                return err;

        return copy_to_user(oact, &old_ka.sa, sizeof(old_ka.sa)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigaction(): the compat sigaction is converted member by
 * member on the way in and on the way out.
 *
 * Returns -EINVAL when @sigsetsize is not sizeof(compat_sigset_t), -EFAULT
 * when reading @act fails, the error from do_sigaction(), or the OR-ed
 * results of the accessors used to fill @oact.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct compat_sigaction __user *, act,
                struct compat_sigaction __user *, oact,
                compat_size_t, sigsetsize)
{
        struct k_sigaction new_ka, old_ka;
#ifdef __ARCH_HAS_SA_RESTORER
        compat_uptr_t restorer;
#endif
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (act) {
                compat_uptr_t handler;

                /* Read every member, then test the accumulated result. */
                ret = get_user(handler, &act->sa_handler);
                new_ka.sa.sa_handler = compat_ptr(handler);
#ifdef __ARCH_HAS_SA_RESTORER
                ret |= get_user(restorer, &act->sa_restorer);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
                ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
                ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
                if (ret)
                        return -EFAULT;
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler);
        ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
                                 sizeof(oact->sa_mask));
        ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
        ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                        &oact->sa_restorer);
#endif
        return ret;
}
#endif
#endif /* !CONFIG_ODD_RT_SIGACTION */

#ifdef CONFIG_OLD_SIGACTION
/*
 * Old-style sigaction(): struct old_sigaction carries a single-word mask,
 * which is widened with siginitset() on the way in and narrowed to the
 * first word of sa_mask on the way out.
 *
 * Returns -EFAULT when user memory cannot be accessed, otherwise the
 * result of do_sigaction().
 */
SYSCALL_DEFINE3(sigaction, int, sig,
                const struct old_sigaction __user *, act,
                struct old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        int ret;

        if (act) {
                old_sigset_t mask;

                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
                    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        if (__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
            __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return 0;
}
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
/*
 * Compat old-style sigaction(): like the native old-style call, with the
 * handler and restorer converted through compat_ptr()/ptr_to_compat().
 *
 * Returns -EFAULT when user memory cannot be accessed, otherwise the
 * result of do_sigaction().
 */
COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
                const struct compat_old_sigaction __user *, act,
                struct compat_old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        compat_uptr_t handler, restorer;
        compat_old_sigset_t mask;
        int ret;

        if (act) {
                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(handler, &act->sa_handler) ||
                    __get_user(restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;

#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                new_ka.sa.sa_handler = compat_ptr(handler);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        if (__put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler) ||
            __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                       &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return 0;
}
#endif

#ifdef CONFIG_SGETMASK_SYSCALL

/*
 * For backwards compatibility.  Functionality superseded by sigprocmask.
 *
 * sgetmask() returns the first word of the current task's blocked set.
 */
SYSCALL_DEFINE0(sgetmask)
{
        /* SMP safe */
        unsigned long first_word = current->blocked.sig[0];

        return first_word;
}

/*
 * ssetmask(): replace the blocked set with one built from @newmask by
 * siginitset() and return the previous first word, narrowed to int.
 */
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
        sigset_t wanted;
        int prev;

        prev = current->blocked.sig[0];
        siginitset(&wanted, newmask);
        set_current_blocked(&wanted);

        return prev;
}
#endif /* CONFIG_SGETMASK_SYSCALL */

#ifdef __ARCH_WANT_SYS_SIGNAL
/*
 * For backwards compatibility.  Functionality superseded by sigaction.
 *
 * signal() installs @handler for @sig with the SA_ONESHOT | SA_NOMASK
 * flags and an empty mask.  It returns the previous handler on success or
 * the error from do_sigaction().
 */
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
        struct k_sigaction new_sa, old_sa;
        int ret;

        /*
         * Only the handler, flags and mask are meaningful here, but
         * do_sigaction() stores the whole structure.  Clear it first so
         * that the remaining members (sa_restorer where the architecture
         * has one) do not carry leftover stack contents into the action
         * table, from where a later sigaction query would hand them back
         * to user space.
         */
        memset(&new_sa, 0, sizeof(new_sa));
        new_sa.sa.sa_handler = handler;
        new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
        sigemptyset(&new_sa.sa.sa_mask);

        ret = do_sigaction(sig, &new_sa, &old_sa);

        return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */

#ifdef __ARCH_WANT_SYS_PAUSE

/*
 * pause(): sleep interruptibly until a signal is pending for the current
 * task, then return -ERESTARTNOHAND.
 */
SYSCALL_DEFINE0(pause)
{
        for (;;) {
                if (signal_pending(current))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }

        return -ERESTARTNOHAND;
}

#endif

/*
 * Common backend of the sigsuspend family: save the current blocked set in
 * saved_sigmask, block @set instead, and sleep interruptibly until a signal
 * is pending.  set_restore_sigmask() is called before returning
 * -ERESTARTNOHAND.
 */
static int sigsuspend(sigset_t *set)
{
        current->saved_sigmask = current->blocked;
        set_current_blocked(set);

        for (;;) {
                if (signal_pending(current))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }

        set_restore_sigmask();
        return -ERESTARTNOHAND;
}

/**
 *  sys_rt_sigsuspend - replace the signal mask with the @unewset value
 *        until a signal is received
 *  @unewset: new signal mask value
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns -EINVAL when @sigsetsize is not sizeof(sigset_t), -EFAULT when
 *  @unewset cannot be read, otherwise the result of sigsuspend().
 */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
        sigset_t wanted;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;
        if (copy_from_user(&wanted, unewset, sizeof(wanted)))
                return -EFAULT;

        return sigsuspend(&wanted);
}
 
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigsuspend(): read a compat sigset from @unewset and suspend
 * with it.
 *
 * Returns -EINVAL when @sigsetsize is not sizeof(compat_sigset_t), -EFAULT
 * when @unewset cannot be read, otherwise the result of sigsuspend().
 */
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
        sigset_t newset;

        /*
         * @sigsetsize describes the compat caller's set, so it is checked
         * against compat_sigset_t, as the compat rt_sigaction entry point
         * does.
         * NOTE(review): assumes sizeof(compat_sigset_t) == sizeof(sigset_t)
         * on every configuration that builds this, which makes the change
         * behaviour-neutral - confirm.
         */
        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&newset, unewset))
                return -EFAULT;
        return sigsuspend(&newset);
}
#endif

#ifdef CONFIG_OLD_SIGSUSPEND
/*
 * One-argument sigsuspend(): widen @mask into a full set with siginitset()
 * and suspend with it.
 */
SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
{
        sigset_t wanted;

        siginitset(&wanted, mask);
        return sigsuspend(&wanted);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND3
/*
 * Three-argument sigsuspend(): the first two arguments are ignored; @mask
 * is widened into a full set with siginitset() and used to suspend.
 */
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
        sigset_t wanted;

        siginitset(&wanted, mask);
        return sigsuspend(&wanted);
}
#endif

/*
 * Default arch_vma_name(): reports no name (NULL) for any @vma.  It is
 * declared __weak so that another definition can take its place at link
 * time.
 */
__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/*
 * Layout checks for the siginfo structures, expressed entirely as
 * BUILD_BUG_ON() conditions:
 *  - struct siginfo has exactly SI_MAX_SIZE bytes;
 *  - every member listed below sits at the same offset in siginfo_t and in
 *    kernel_siginfo_t (members shared by several groups are deliberately
 *    listed once per group);
 *  - si_pid and si_addr overlap as the "usb asyncio" checks require.
 */
static inline void siginfo_buildtime_checks(void)
{
        BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);

        /* Verify the offsets in the two siginfos match */
#define CHECK_OFFSET(field) \
        BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field))

        /* kill */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);

        /* timer */
        CHECK_OFFSET(si_tid);
        CHECK_OFFSET(si_overrun);
        CHECK_OFFSET(si_value);

        /* rt */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_value);

        /* sigchld */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_status);
        CHECK_OFFSET(si_utime);
        CHECK_OFFSET(si_stime);

        /* sigfault */
        CHECK_OFFSET(si_addr);
        CHECK_OFFSET(si_addr_lsb);
        CHECK_OFFSET(si_lower);
        CHECK_OFFSET(si_upper);
        CHECK_OFFSET(si_pkey);

        /* sigpoll */
        CHECK_OFFSET(si_band);
        CHECK_OFFSET(si_fd);

        /* sigsys */
        CHECK_OFFSET(si_call_addr);
        CHECK_OFFSET(si_syscall);
        CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET

        /* usb asyncio */
        /* si_pid and si_addr must start at the same offset ... */
        BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
                     offsetof(struct siginfo, si_addr));
        /*
         * ... and a user pointer must be covered exactly by si_pid alone
         * when pointers are int-sized, or by si_pid immediately followed by
         * si_uid otherwise.
         */
        if (sizeof(int) == sizeof(void __user *)) {
                BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
                             sizeof(void __user *));
        } else {
                BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
                              sizeof_field(struct siginfo, si_uid)) !=
                             sizeof(void __user *));
                BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
                             offsetof(struct siginfo, si_uid));
        }
#ifdef CONFIG_COMPAT
        /* The same overlap, checked for the compat layout. */
        BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
                     offsetof(struct compat_siginfo, si_addr));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof(compat_uptr_t));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof_field(struct siginfo, si_pid));
#endif
}

/*
 * Init-time setup for the signal code: pulls in the siginfo layout checks
 * and creates the cache stored in sigqueue_cachep.
 */
void __init signals_init(void)
{
        siginfo_buildtime_checks();

        /* NOTE(review): SLAB_PANIC presumably makes a failure here fatal - confirm. */
        sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}

#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
 * kdb_send_sig - Allows kdb to send signals without exposing
 * signal internals.  This function checks if the required locks are
 * available before calling the main signal code, to avoid kdb
 * deadlocks.
 *
 * @t:   task to signal
 * @sig: signal number to send
 *
 * Nothing is returned; the outcome is reported through kdb_printf().
 * The signal is not sent when the siglock cannot be taken, or on the
 * first request for a task that is not in TASK_RUNNING state.  Repeating
 * the request for the same task sends it regardless of state.
 */
void kdb_send_sig(struct task_struct *t, int sig)
{
        /* Task targeted by the previous call, kept across invocations. */
        static struct task_struct *kdb_prev_t;
        int new_t, ret;
        /* Only try the lock: blocking on it is not an option here. */
        if (!spin_trylock(&t->sighand->siglock)) {
                kdb_printf("Can't do kill command now.\n"
                           "The sigmask lock is held somewhere else in "
                           "kernel, try again later\n");
                return;
        }
        /* new_t is non-zero when this call targets a different task. */
        new_t = kdb_prev_t != t;
        kdb_prev_t = t;
        if (t->state != TASK_RUNNING && new_t) {
                spin_unlock(&t->sighand->siglock);
                kdb_printf("Process is not RUNNING, sending a signal from "
                           "kdb risks deadlock\n"
                           "on the run queue locks. "
                           "The signal has _not_ been sent.\n"
                           "Reissue the kill command if you want to risk "
                           "the deadlock.\n");
                return;
        }
        /* Sent with SEND_SIG_PRIV as the siginfo, addressed to this task only. */
        ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
        spin_unlock(&t->sighand->siglock);
        if (ret)
                kdb_printf("Fail to deliver Signal %d to process %d.\n",
                           sig, t->pid);
        else
                kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif        /* CONFIG_KGDB_KDB */





















































    1 




























    1 


    1 










































































    1 
    1 


    1 



    1 

    1 









    1 



    1 
    1 







    1 
    1 

    1 
















    1 
























































































































































































































































































































    1 



    1 



































































































































































































































































































































































































































    1 



































    1 


    1 



































































    1 

    1 































    1 



    1 











    1 







    1 











    1 








    1 

    1 
    1 





















    1 

































































































































































































































































































































    1 







    1 
    1 
    1 



    1 




    1 




























































    1 

    1 
    1 




    1 



    1 


    1 













    1 


    1 


    1 
    1 
    1 




















    1 




































    1 
































    1 







    1 
















































































    1 












    1 



    1 






    1 
    1 
    1 

    1 









    1 



    1 



    1 





































    1 


























    1 

    1 















































































    1 




    1 






































































































































    1 


    1 



    1 




    1 


    1 



    1 
    1 



































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include "internal.h"

/*
 * Pairs a block_device with the VFS inode that represents it.  Both live in
 * a single allocation, which lets BDEV_I()/I_BDEV() convert from the inode
 * to the block_device with container_of().
 */
struct bdev_inode {
        struct block_device bdev;
        struct inode vfs_inode;
};

/*
 * Declared ahead of use.  NOTE(review): the initialised definition is
 * presumably further down in this file - confirm.
 */
static const struct address_space_operations def_blk_aops;

/* Map a bdev pseudo-fs inode back to the bdev_inode that embeds it. */
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
        struct bdev_inode *wrapper;

        wrapper = container_of(inode, struct bdev_inode, vfs_inode);
        return wrapper;
}

struct block_device *I_BDEV(struct inode *inode)
{
        return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

static void bdev_write_inode(struct block_device *bdev)
{
        struct inode *inode = bdev->bd_inode;
        int ret;

        spin_lock(&inode->i_lock);
        while (inode->i_state & I_DIRTY) {
                spin_unlock(&inode->i_lock);
                ret = write_inode_now(inode, true);
                if (ret) {
                        char name[BDEVNAME_SIZE];
                        pr_warn_ratelimited("VFS: Dirty inode writeback failed "
                                            "for block device %s (err=%d).\n",
                                            bdevname(bdev, name), ret);
                }
                spin_lock(&inode->i_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Drop every buffer and every page-cache page of @bdev, dirty or not.
 * Nothing is done when the mapping holds neither pages nor exceptional
 * entries.
 */
static void kill_bdev(struct block_device *bdev)
{
        struct address_space *cache = bdev->bd_inode->i_mapping;

        if (cache->nrpages != 0 || cache->nrexceptional != 0) {
                invalidate_bh_lrus();
                truncate_inode_pages(cache, 0);
        }
}

/*
 * Invalidate the clean, unused buffers and page-cache pages of @bdev.
 * The cleancache for the mapping is invalidated unconditionally.
 */
void invalidate_bdev(struct block_device *bdev)
{
        struct address_space *cache = bdev->bd_inode->i_mapping;

        if (!cache->nrpages)
                goto cleancache;

        invalidate_bh_lrus();
        /* make sure all lru add caches are flushed */
        lru_add_drain_all();
        invalidate_mapping_pages(cache, 0, -1);

cleancache:
        /*
         * Flushing the cleancache on the bdev is unnecessary 99% of the
         * time, but for the strange corners, let's be cautious.
         */
        cleancache_invalidate_inode(cache);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers and page cache of @bdev in the byte range
 * [@lstart, @lend].
 *
 * A caller without FMODE_EXCL in @mode is temporarily upgraded to an
 * exclusive claim so that buffers are not discarded under a live filesystem.
 * When that claim cannot be taken (someone else holds the device
 * exclusively) the range is invalidated instead of truncated, and the result
 * of that invalidation is returned.  Otherwise returns 0.
 */
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
                        loff_t lstart, loff_t lend)
{
        struct block_device *whole = NULL;

        if (!(mode & FMODE_EXCL)) {
                int err;

                whole = bdev->bd_contains;
                err = bd_prepare_to_claim(bdev, whole, truncate_bdev_range);
                if (err) {
                        /*
                         * Exclusively opened elsewhere: try invalidating.
                         * The 'end' argument is inclusive, so rounding the
                         * byte offsets down to page indices is safe.
                         */
                        return invalidate_inode_pages2_range(
                                        bdev->bd_inode->i_mapping,
                                        lstart >> PAGE_SHIFT,
                                        lend >> PAGE_SHIFT);
                }
        }

        truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);

        /* release the temporary claim taken above, if any */
        if (whole)
                bd_abort_claiming(bdev, whole, truncate_bdev_range);
        return 0;
}
EXPORT_SYMBOL(truncate_bdev_range);

/*
 * Choose the initial soft block size of @bdev: start at the logical block
 * size and keep doubling while the result stays below PAGE_SIZE and the
 * corresponding bit of the device size is clear.
 */
static void set_init_blocksize(struct block_device *bdev)
{
        loff_t dev_bytes = i_size_read(bdev->bd_inode);
        unsigned int bsize;

        for (bsize = bdev_logical_block_size(bdev);
             bsize < PAGE_SIZE && !(dev_bytes & bsize);
             bsize <<= 1)
                ;

        bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

/*
 * Set the soft block size of @bdev to @size bytes.
 *
 * @size must be a power of two in [512, PAGE_SIZE] and not smaller than the
 * device's logical block size, otherwise -EINVAL is returned.  When the size
 * actually changes, the device is synced first and its cached buffers and
 * pages are dropped afterwards.  Returns 0 on success.
 */
int set_blocksize(struct block_device *bdev, int size)
{
        if (size < 512 || size > PAGE_SIZE || !is_power_of_2(size))
                return -EINVAL;

        /* the device cannot do I/O in units smaller than this */
        if (size < bdev_logical_block_size(bdev))
                return -EINVAL;

        /* nothing to do if the size is already in effect */
        if (bdev->bd_inode->i_blkbits == blksize_bits(size))
                return 0;

        sync_blockdev(bdev);
        bdev->bd_inode->i_blkbits = blksize_bits(size);
        kill_bdev(bdev);
        return 0;
}

EXPORT_SYMBOL(set_blocksize);

/*
 * Set the block size of @sb and of its backing device to @size.
 * Returns the new block size, or 0 if set_blocksize() rejected @size.
 */
int sb_set_blocksize(struct super_block *sb, int size)
{
        int err = set_blocksize(sb->s_bdev, size);

        if (err)
                return 0;

        /*
         * set_blocksize() succeeded, so size is known to be a power of two
         * and its value is between 512 and PAGE_SIZE.
         */
        sb->s_blocksize_bits = blksize_bits(size);
        sb->s_blocksize = size;
        return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

/*
 * Like sb_set_blocksize(), but @size is first raised to the logical block
 * size of the backing device when it is smaller than that.
 */
int sb_min_blocksize(struct super_block *sb, int size)
{
        int minsize = bdev_logical_block_size(sb->s_bdev);

        return sb_set_blocksize(sb, size < minsize ? minsize : size);
}

EXPORT_SYMBOL(sb_min_blocksize);

/*
 * get_block callback for block device inodes: block @iblock of the inode is
 * block @iblock of the device, so the buffer is mapped one-to-one.  @create
 * is not consulted.  Always returns 0.
 */
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
                struct buffer_head *bh, int create)
{
        struct block_device *bdev = I_BDEV(inode);

        bh->b_bdev = bdev;
        bh->b_blocknr = iblock;
        set_buffer_mapped(bh);

        return 0;
}

static struct inode *bdev_file_inode(struct file *file)
{
        return file->f_mapping->host;
}

/*
 * Build the bio operation and flags for a direct write issued for @iocb.
 * REQ_FUA is added for IOCB_DSYNC requests, which avoids the need for an
 * I/O completion work item.
 */
static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
        const unsigned int base = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

        return (iocb->ki_flags & IOCB_DSYNC) ? (base | REQ_FUA) : base;
}

/*
 * Number of bio_vecs __blkdev_direct_IO_simple() keeps on its stack; requests
 * needing more than this get a kmalloc'ed vector array instead.
 */
#define DIO_INLINE_BIO_VECS 4

/*
 * Completion handler for the on-stack bio of __blkdev_direct_IO_simple().
 * bi_private holds the submitting task; clearing it is what the submitter
 * polls for, so the task pointer is saved before it is cleared and the task
 * is woken afterwards.
 */
static void blkdev_bio_end_io_simple(struct bio *bio)
{
        struct task_struct *waiter = bio->bi_private;

        WRITE_ONCE(bio->bi_private, NULL);
        blk_wake_io_task(waiter);
}

/*
 * Synchronous direct I/O that fits into a single bio.
 *
 * The bio itself lives on the stack; its bio_vec array is on the stack too
 * when @nr_pages is at most DIO_INLINE_BIO_VECS and is kmalloc'ed otherwise.
 * The caller sleeps (or polls, for IOCB_HIPRI) until the bio has completed.
 *
 * Returns the number of bytes that were put into the bio, -EINVAL if the
 * file position or the iterator is not aligned to the logical block size,
 * -ENOMEM if the vector array cannot be allocated, or another negative errno
 * if pinning the pages or the I/O itself failed.
 */
static ssize_t
__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
                int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;
        blk_qc_t qc;

        /* position and every iterator segment must be block aligned */
        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        if (nr_pages <= DIO_INLINE_BIO_VECS)
                vecs = inline_vecs;
        else {
                vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
                                     GFP_KERNEL);
                if (!vecs)
                        return -ENOMEM;
        }

        bio_init(&bio, vecs, nr_pages);
        bio_set_dev(&bio, bdev);
        /* byte offset to 512-byte sector number */
        bio.bi_iter.bi_sector = pos >> 9;
        bio.bi_write_hint = iocb->ki_hint;
        /* the end_io handler clears bi_private to signal completion */
        bio.bi_private = current;
        bio.bi_end_io = blkdev_bio_end_io_simple;
        bio.bi_ioprio = iocb->ki_ioprio;

        ret = bio_iov_iter_get_pages(&bio, iter);
        if (unlikely(ret))
                goto out;
        /* remember the size now; it is the return value on success */
        ret = bio.bi_iter.bi_size;

        if (iov_iter_rw(iter) == READ) {
                bio.bi_opf = REQ_OP_READ;
                /* pages of a user iovec get dirtied when they are released */
                if (iter_is_iovec(iter))
                        should_dirty = true;
        } else {
                bio.bi_opf = dio_bio_write_op(iocb);
                task_io_account_write(ret);
        }
        if (iocb->ki_flags & IOCB_NOWAIT)
                bio.bi_opf |= REQ_NOWAIT;
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);

        qc = submit_bio(&bio);
        /* wait until blkdev_bio_end_io_simple() has cleared bi_private */
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio.bi_private))
                        break;
                /* sleep unless this is polled I/O and polling found work */
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        bio_release_pages(&bio, should_dirty);
        if (unlikely(bio.bi_status))
                ret = blk_status_to_errno(bio.bi_status);

out:
        if (vecs != inline_vecs)
                kfree(vecs);

        bio_uninit(&bio);

        return ret;
}

/*
 * Per-request state of __blkdev_direct_IO().  It is allocated together with
 * the first bio of the request: blkdev_dio_pool reserves
 * offsetof(struct blkdev_dio, bio) bytes in front of each bio, and the dio is
 * recovered from that bio with container_of().
 */
struct blkdev_dio {
        union {
                /* async request: the kiocb to complete */
                struct kiocb                *iocb;
                /* sync request: the task to wake; cleared on completion */
                struct task_struct        *waiter;
        };
        /* bytes covered by the bios built so far */
        size_t                        size;
        /* completion count, used only once multi_bio is set */
        atomic_t                ref;
        /* the request was split into more than one bio */
        bool                        multi_bio : 1;
        /* read into a user iovec: pages must be marked dirty */
        bool                        should_dirty : 1;
        /* selects which member of the union above is valid */
        bool                        is_sync : 1;
        /* first bio of the request; also collects the first error status */
        struct bio                bio;
};

/* bio_set for the first bio of each blkdev_dio; set up in blkdev_init(). */
static struct bio_set blkdev_dio_pool;

/*
 * Poll the request queue of the block device behind @kiocb for completion of
 * the request identified by the cookie stored in the kiocb.  Returns whatever
 * blk_poll() returns.
 */
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
        struct inode *bd_inode = kiocb->ki_filp->f_mapping->host;
        struct request_queue *queue = bdev_get_queue(I_BDEV(bd_inode));
        blk_qc_t cookie = READ_ONCE(kiocb->ki_cookie);

        return blk_poll(queue, cookie, wait);
}

/*
 * Completion handler for every bio submitted by __blkdev_direct_IO().
 *
 * The first error seen is recorded in the dio's embedded bio.  When the
 * request is finished (single bio, or the reference count of a multi-bio
 * request reaches zero) an async request is completed through ki_complete()
 * and a sync request wakes its waiter.  Finally the pages of this bio are
 * released.
 */
static void blkdev_bio_end_io(struct bio *bio)
{
        struct blkdev_dio *dio = bio->bi_private;
        /*
         * Read up front and used after completion handling below.
         * NOTE(review): presumably because dio may no longer be valid once
         * completion has been signalled - confirm.
         */
        bool should_dirty = dio->should_dirty;

        /* keep only the first error of the request */
        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;

        if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
                if (!dio->is_sync) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;

                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
                        } else {
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }

                        dio->iocb->ki_complete(iocb, ret, 0);
                        /* drop the extra reference taken for async multi-bio */
                        if (dio->multi_bio)
                                bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;

                        /* the submitter waits for waiter to become NULL */
                        WRITE_ONCE(dio->waiter, NULL);
                        blk_wake_io_task(waiter);
                }
        }

        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}

/*
 * General direct I/O path: handles async requests and requests that need
 * more than one bio.
 *
 * The first bio comes from blkdev_dio_pool and carries the blkdev_dio state;
 * any further bios come from bio_alloc().  Bios are built and submitted until
 * the iterator is exhausted.  Async requests then return -EIOCBQUEUED and are
 * completed from blkdev_bio_end_io(); sync requests wait here for completion.
 *
 * Returns -EINVAL for a misaligned position or iterator, -EIOCBQUEUED for an
 * async request, and for a sync request the number of bytes covered by the
 * submitted bios or a negative errno.
 */
static ssize_t
__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
        bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        loff_t pos = iocb->ki_pos;
        blk_qc_t qc = BLK_QC_T_NONE;
        int ret = 0;

        /* position and every iterator segment must be block aligned */
        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);

        /* the dio state sits in the front padding of this first bio */
        dio = container_of(bio, struct blkdev_dio, bio);
        dio->is_sync = is_sync = is_sync_kiocb(iocb);
        if (dio->is_sync) {
                dio->waiter = current;
                /* extra reference, dropped by the bio_put() before returning */
                bio_get(bio);
        } else {
                dio->iocb = iocb;
        }

        dio->size = 0;
        dio->multi_bio = false;
        dio->should_dirty = is_read && iter_is_iovec(iter);

        /*
         * Don't plug for HIPRI/polled IO, as those should go straight
         * to issue
         */
        if (!is_poll)
                blk_start_plug(&plug);

        for (;;) {
                bio_set_dev(bio, bdev);
                /* byte offset to 512-byte sector number */
                bio->bi_iter.bi_sector = pos >> 9;
                bio->bi_write_hint = iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;

                ret = bio_iov_iter_get_pages(bio, iter);
                if (unlikely(ret)) {
                        /* complete the unsubmitted bio with an error */
                        bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        break;
                }

                if (is_read) {
                        bio->bi_opf = REQ_OP_READ;
                        if (dio->should_dirty)
                                bio_set_pages_dirty(bio);
                } else {
                        bio->bi_opf = dio_bio_write_op(iocb);
                        task_io_account_write(bio->bi_iter.bi_size);
                }
                if (iocb->ki_flags & IOCB_NOWAIT)
                        bio->bi_opf |= REQ_NOWAIT;

                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;

                /* pages still left in the iterator decide whether to go on */
                nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
                if (!nr_pages) {
                        bool polled = false;

                        /* only the last bio of a request is marked polled */
                        if (iocb->ki_flags & IOCB_HIPRI) {
                                bio_set_polled(bio, iocb);
                                polled = true;
                        }

                        qc = submit_bio(bio);

                        /* publish the cookie for blkdev_iopoll() */
                        if (polled)
                                WRITE_ONCE(iocb->ki_cookie, qc);
                        break;
                }

                if (!dio->multi_bio) {
                        /*
                         * AIO needs an extra reference to ensure the dio
                         * structure which is embedded into the first bio
                         * stays around.
                         */
                        if (!is_sync)
                                bio_get(bio);
                        dio->multi_bio = true;
                        /* one count for this bio, one for the bio that follows */
                        atomic_set(&dio->ref, 2);
                } else {
                        atomic_inc(&dio->ref);
                }

                submit_bio(bio);
                bio = bio_alloc(GFP_KERNEL, nr_pages);
        }

        if (!is_poll)
                blk_finish_plug(&plug);

        if (!is_sync)
                return -EIOCBQUEUED;

        /* wait until blkdev_bio_end_io() has cleared dio->waiter */
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;

                /* sleep unless this is polled I/O and polling found work */
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* a page-pinning error takes precedence over the I/O status */
        if (!ret)
                ret = blk_status_to_errno(dio->bio.bi_status);
        if (likely(!ret))
                ret = dio->size;

        bio_put(&dio->bio);
        return ret;
}

/*
 * Direct I/O entry point for block devices.
 *
 * An empty iterator returns 0.  A synchronous request that fits into one bio
 * (at most BIO_MAX_PAGES pages) takes the simple on-stack path; everything
 * else goes through __blkdev_direct_IO() with the page count capped at
 * BIO_MAX_PAGES.
 */
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        /* ask for one page more than fits, to detect "needs several bios" */
        int nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);

        if (nr_pages == 0)
                return 0;

        if (nr_pages > BIO_MAX_PAGES)
                return __blkdev_direct_IO(iocb, iter, BIO_MAX_PAGES);

        if (is_sync_kiocb(iocb))
                return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

        return __blkdev_direct_IO(iocb, iter, nr_pages);
}

static __init int blkdev_init(void)
{
        return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
}
module_init(blkdev_init);

/*
 * Write out the dirty page cache of @bdev.  With @wait set the call also
 * waits for the writeback to finish; otherwise it only starts it.
 * A NULL @bdev is accepted and yields 0.
 */
int __sync_blockdev(struct block_device *bdev, int wait)
{
        struct address_space *mapping;

        if (!bdev)
                return 0;

        mapping = bdev->bd_inode->i_mapping;
        return wait ? filemap_write_and_wait(mapping) : filemap_flush(mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 *
 * Equivalent to __sync_blockdev(bdev, 1); a NULL @bdev returns 0.
 */
int sync_blockdev(struct block_device *bdev)
{
        return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this device:
 * filesystem data as well as the underlying block device.  Takes the
 * superblock lock.
 *
 * When a superblock is found on @bdev the whole filesystem is synced and
 * that result is returned; otherwise only the block device is synced.
 */
int fsync_bdev(struct block_device *bdev)
{
        struct super_block *sb = get_super(bdev);
        int res;

        if (!sb)
                return sync_blockdev(bdev);

        res = sync_filesystem(sb);
        drop_super(sb);
        return res;
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:        blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * counts down in thaw_bdev(). When it becomes 0, thaw_bdev() will actually
 * unfreeze.
 *
 * Return: the superblock found on @bdev (NULL if there is none), or an
 * ERR_PTR() if freezing the filesystem failed, in which case the freeze
 * count has been restored.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
        struct super_block *sb;
        int error = 0;

        mutex_lock(&bdev->bd_fsfreeze_mutex);
        if (++bdev->bd_fsfreeze_count > 1) {
                /*
                 * We don't even need to grab a reference - the first call
                 * to freeze_bdev grabs an active reference and only the last
                 * thaw_bdev drops it.
                 */
                sb = get_super(bdev);
                if (sb)
                        drop_super(sb);
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return sb;
        }

        sb = get_active_super(bdev);
        if (!sb)
                goto out;
        /* prefer the filesystem's own freeze method when it has one */
        if (sb->s_op->freeze_super)
                error = sb->s_op->freeze_super(sb);
        else
                error = freeze_super(sb);
        if (error) {
                /* undo the count bump taken at the top */
                deactivate_super(sb);
                bdev->bd_fsfreeze_count--;
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return ERR_PTR(error);
        }
        deactivate_super(sb);
 out:
        sync_blockdev(bdev);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return sb;        /* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:        blockdevice to unlock
 * @sb:                associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 *
 * Return: -EINVAL if @bdev is not frozen; 0 if other freezers remain, if
 * @sb is NULL, or if the thaw succeeded; otherwise the error from thawing,
 * in which case the freeze count is restored.
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
        int error = -EINVAL;

        mutex_lock(&bdev->bd_fsfreeze_mutex);
        if (!bdev->bd_fsfreeze_count)
                goto out;

        error = 0;
        /* only the last thaw actually unfreezes */
        if (--bdev->bd_fsfreeze_count > 0)
                goto out;

        if (!sb)
                goto out;

        /* prefer the filesystem's own thaw method when it has one */
        if (sb->s_op->thaw_super)
                error = sb->s_op->thaw_super(sb);
        else
                error = thaw_super(sb);
        /* thaw failed: the device counts as frozen again */
        if (error)
                bdev->bd_fsfreeze_count++;
out:
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return error;
}
EXPORT_SYMBOL(thaw_bdev);

/* Write a full page through the buffer layer, mapped by blkdev_get_block(). */
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, blkdev_get_block, wbc);
}

/*
 * Read a full page through the buffer layer, mapped by blkdev_get_block().
 * @file is not used.
 */
static int blkdev_readpage(struct file * file, struct page * page)
{
        return block_read_full_page(page, blkdev_get_block);
}

/* Readahead via mpage_readahead(), with blocks mapped by blkdev_get_block(). */
static void blkdev_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, blkdev_get_block);
}

/*
 * Forward to block_write_begin() with blkdev_get_block() as the block mapper.
 * @file and @fsdata are not used.
 */
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        return block_write_begin(mapping, pos, len, flags, pagep,
                                 blkdev_get_block);
}

/*
 * Finish a buffered write: run block_write_end(), then unlock @page and drop
 * a reference to it.  Returns the result of block_write_end().
 */
static int blkdev_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        int ret;
        ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        unlock_page(page);
        put_page(page);

        return ret;
}

/*
 * private llseek:
 * for a block special file file_inode(file)->i_size is zero, so the size
 * used for seeking is read from the inode behind file->f_mapping instead.
 * The inode lock is held across the size read and the seek.
 */
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t retval;

        inode_lock(bd_inode);
        retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
        inode_unlock(bd_inode);
        return retval;
}
        
/*
 * fsync for a block device file: write back and wait on the byte range
 * [@start, @end] of @filp, then issue a cache flush to the device.
 * A device that does not support flushing (-EOPNOTSUPP) counts as success.
 * @datasync is not used.
 */
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
        int err;

        err = file_write_and_wait_range(filp, start, end);
        if (err)
                return err;

        /*
         * There is no need to serialise calls to blkdev_issue_flush with
         * i_mutex and doing so causes performance issues with concurrent
         * O_SYNC writers to a block device.
         */
        err = blkdev_issue_flush(bdev, GFP_KERNEL);
        return err == -EOPNOTSUPP ? 0 : err;
}
EXPORT_SYMBOL(blkdev_fsync);

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page from (need not be aligned)
 * @page: The page to read
 *
 * The page should be locked on entry and is unlocked once it has been read.
 * A block driver that implements rw_page synchronously will have done so by
 * the time this function returns, but that is not required.
 *
 * Errors returned here are usually "soft" (out of memory, queue full, ...):
 * callers should try a different route to read the page rather than
 * propagate the error back up the stack.  -EOPNOTSUPP is returned when the
 * driver has no rw_page method or the device uses integrity data.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
                        struct page *page)
{
        const struct block_device_operations *ops = bdev->bd_disk->fops;
        int result;

        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;

        result = blk_queue_enter(bdev->bd_disk->queue, 0);
        if (result != 0)
                return result;

        /* @sector is relative to @bdev; add the start sector for the driver */
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
                              REQ_OP_READ);
        blk_queue_exit(bdev->bd_disk->queue);

        return result;
}

/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (eg the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
                        struct page *page, struct writeback_control *wbc)
{
        int result;
        const struct block_device_operations *ops = bdev->bd_disk->fops;

        /* no rw_page method, or integrity data in use: not supported here */
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
        result = blk_queue_enter(bdev->bd_disk->queue, 0);
        if (result)
                return result;

        /* writeback state is set before the page is handed to the driver */
        set_page_writeback(page);
        /* @sector is relative to @bdev; add the start sector for the driver */
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
                              REQ_OP_WRITE);
        if (result) {
                /* driver refused: undo writeback, page stays locked */
                end_page_writeback(page);
        } else {
                clean_page_buffers(page);
                unlock_page(page);
        }
        blk_queue_exit(bdev->bd_disk->queue);
        return result;
}

/*
 * pseudo-fs
 */

/* NOTE(review): no user of bdev_lock appears before this point in the file. */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
/* Slab cache of struct bdev_inode objects; created in bdev_cache_init(). */
static struct kmem_cache * bdev_cachep __read_mostly;

/*
 * ->alloc_inode for the bdev pseudo-fs: take a bdev_inode from the slab cache
 * and hand out its embedded VFS inode.  Returns NULL on allocation failure.
 * @sb is not used.
 */
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
        struct bdev_inode *wrapper;

        wrapper = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
        return wrapper ? &wrapper->vfs_inode : NULL;
}

/* ->free_inode: return the bdev_inode that embeds @inode to the slab cache. */
static void bdev_free_inode(struct inode *inode)
{
        kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

/*
 * Slab constructor for bdev_cachep objects (passed to kmem_cache_create() in
 * bdev_cache_init()).  Zeroes the block_device part, initialises its mutexes
 * and lists, points it at the no-op backing_dev_info and runs the generic
 * one-time inode initialisation on the embedded inode.
 */
static void init_once(void *foo)
{
        struct bdev_inode *wrapper = foo;
        struct block_device *bdev = &wrapper->bdev;

        memset(bdev, 0, sizeof(*bdev));
        mutex_init(&bdev->bd_mutex);
#ifdef CONFIG_SYSFS
        INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
        bdev->bd_bdi = &noop_backing_dev_info;
        inode_init_once(&wrapper->vfs_inode);
        /* mutex serialising freeze_bdev() and thaw_bdev() */
        mutex_init(&bdev->bd_fsfreeze_mutex);
}

/*
 * ->evict_inode for the bdev pseudo-filesystem: drop the page cache and
 * buffers of @inode, clear it, and release the bdi reference held by the
 * embedded block_device, if any.
 */
static void bdev_evict_inode(struct inode *inode)
{
        struct block_device *bd = &BDEV_I(inode)->bdev;

        truncate_inode_pages_final(&inode->i_data);
        invalidate_inode_buffers(inode); /* is it needed here? */
        clear_inode(inode);

        /* Detach inode from wb early as bdi_put() may free bdi->wb */
        inode_detach_wb(inode);

        /* The no-op bdi is never referenced, so there is nothing to put. */
        if (bd->bd_bdi == &noop_backing_dev_info)
                return;

        bdi_put(bd->bd_bdi);
        bd->bd_bdi = &noop_backing_dev_info;
}

/* Superblock operations installed on the bdev pseudo-filesystem. */
static const struct super_operations bdev_sops = {
        .alloc_inode = bdev_alloc_inode,
        .free_inode = bdev_free_inode,
        .evict_inode = bdev_evict_inode,
        .drop_inode = generic_delete_inode,
        .statfs = simple_statfs,
};

/*
 * ->init_fs_context for the bdev pseudo-filesystem.  Sets up a pseudo fs
 * context using BDEVFS_MAGIC, installs bdev_sops and flags the superblock
 * with SB_I_CGROUPWB.  Returns 0 or -ENOMEM.
 */
static int bd_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pfc;

        pfc = init_pseudo(fc, BDEVFS_MAGIC);
        if (!pfc)
                return -ENOMEM;

        pfc->ops = &bdev_sops;
        fc->s_iflags |= SB_I_CGROUPWB;
        return 0;
}

/* Filesystem type of the internal "bdev" pseudo-filesystem. */
static struct file_system_type bd_type = {
        .init_fs_context = bd_init_fs_context,
        .kill_sb = kill_anon_super,
        .name = "bdev",
};

/*
 * Superblock of the mounted bdev pseudo-filesystem.  Assigned once in
 * bdev_cache_init(); bdget() looks up block device inodes on it.
 */
struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);

/*
 * Boot-time setup of the block device layer's inode infrastructure: create
 * the bdev_inode slab cache, register and mount the bdev pseudo-filesystem
 * and remember its superblock.  Any failure here panics.
 */
void __init bdev_cache_init(void)
{
        static struct vfsmount *bd_mnt;

        bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
                        0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
                        init_once);

        if (register_filesystem(&bd_type))
                panic("Cannot register bdev pseudo-fs");

        bd_mnt = kern_mount(&bd_type);
        if (IS_ERR(bd_mnt))
                panic("Cannot create bdev pseudo-fs");

        /* For writeback */
        blockdev_superblock = bd_mnt->mnt_sb;
}

/*
 * Fold a device number into the hash value that bdget() passes to
 * iget5_locked().
 *
 * This is most likely a _very_ poor hash, but that is hardly critical for a
 * small /dev and can be fixed once somebody really needs a large one.
 * Keep in mind that the result is fed through the icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
        return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
        return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
        BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
        return 0;
}

/*
 * Look up the block_device for device number @dev on the bdev superblock,
 * initialising it if iget5_locked() handed back a new inode.
 *
 * Returns the block_device, or NULL if no inode could be obtained.
 */
static struct block_device *bdget(dev_t dev)
{
        struct inode *bd_inode;
        struct block_device *bd;

        bd_inode = iget5_locked(blockdev_superblock, hash(dev),
                                bdev_test, bdev_set, &dev);
        if (!bd_inode)
                return NULL;

        bd = &BDEV_I(bd_inode)->bdev;

        /* Only an inode still marked I_NEW needs to be set up here. */
        if (!(bd_inode->i_state & I_NEW))
                return bd;

        spin_lock_init(&bd->bd_size_lock);
        bd->bd_contains = NULL;
        bd->bd_super = NULL;
        bd->bd_inode = bd_inode;
        bd->bd_part_count = 0;

        bd_inode->i_mode = S_IFBLK;
        bd_inode->i_rdev = dev;
        bd_inode->i_bdev = bd;
        bd_inode->i_data.a_ops = &def_blk_aops;
        mapping_set_gfp_mask(&bd_inode->i_data, GFP_USER);

        unlock_new_inode(bd_inode);
        return bd;
}

/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:        Block device to grab a reference to.
 *
 * The reference is taken with ihold() on @bdev->bd_inode.
 *
 * Return: @bdev itself, so the call can be used in an assignment.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
        ihold(bdev->bd_inode);
        return bdev;
}
EXPORT_SYMBOL(bdgrab);

/*
 * Get the block_device for the device number of partition @part.
 * Returns whatever bdget() returns, i.e. NULL if no inode could be obtained.
 */
struct block_device *bdget_part(struct hd_struct *part)
{
        return bdget(part_devt(part));
}

long nr_blockdev_pages(void)
{
        struct inode *inode;
        long ret = 0;

        spin_lock(&blockdev_superblock->s_inode_list_lock);
        list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
                ret += inode->i_mapping->nrpages;
        spin_unlock(&blockdev_superblock->s_inode_list_lock);

        return ret;
}

/*
 * Drop a reference on @bdev.  The reference lives on the backing inode, so
 * this is an iput() of @bdev->bd_inode (the inverse of bdgrab()).
 */
void bdput(struct block_device *bdev)
{
        iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
 
/*
 * Get a referenced block_device for the device node @inode.
 *
 * If @inode already caches a block_device whose inode is still hashed, a
 * reference is taken on it and it is returned.  Otherwise the block_device
 * is looked up through bdget() by @inode->i_rdev and cached in @inode.
 *
 * Returns the block_device, or NULL if bdget() fails.
 */
static struct block_device *bd_acquire(struct inode *inode)
{
        struct block_device *bd;

        /* Fast path: the cached block_device is still usable. */
        spin_lock(&bdev_lock);
        bd = inode->i_bdev;
        if (bd && !inode_unhashed(bd->bd_inode)) {
                bdgrab(bd);
                spin_unlock(&bdev_lock);
                return bd;
        }
        spin_unlock(&bdev_lock);

        /*
         * i_bdev references block device inode that was already shut down
         * (corresponding device got removed).  Remove the reference and look
         * up block device inode again just in case new device got
         * reestablished under the same device number.
         */
        if (bd)
                bd_forget(inode);

        bd = bdget(inode->i_rdev);
        if (!bd)
                return bd;

        spin_lock(&bdev_lock);
        if (!inode->i_bdev) {
                /*
                 * We take an additional reference to bd_inode,
                 * and it's released in clear_inode() of inode.
                 * So, we can access it via ->i_mapping always
                 * without igrab().
                 */
                bdgrab(bd);
                inode->i_bdev = bd;
                inode->i_mapping = bd->bd_inode->i_mapping;
        }
        spin_unlock(&bdev_lock);

        return bd;
}

/*
 * Call when you free inode.
 *
 * Detaches @inode from its cached block_device: i_bdev is cleared and
 * i_mapping is pointed back at the inode's own i_data.  The block_device
 * reference is dropped only when @inode does not live on the bdev
 * superblock itself.
 */
void bd_forget(struct inode *inode)
{
        struct block_device *stale = NULL;

        spin_lock(&bdev_lock);
        if (!sb_is_blkdev_sb(inode->i_sb))
                stale = inode->i_bdev;
        inode->i_bdev = NULL;
        inode->i_mapping = &inode->i_data;
        spin_unlock(&bdev_lock);

        /* Put the reference outside of bdev_lock. */
        if (stale)
                bdput(stale);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Decide whether @holder is allowed to claim @bdev, looking at the current
 * holder of @bdev and, for a partition, at the holder of @whole.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
                         void *holder)
{
        /* already a holder */
        if (bdev->bd_holder == holder)
                return true;

        /* held by someone else */
        if (bdev->bd_holder != NULL)
                return false;

        /* is a whole device which isn't held */
        if (whole == bdev)
                return true;

        /*
         * @bdev is an unheld partition.  It may be claimed when the whole
         * device is either not held at all or only carries the
         * bd_may_claim marker that partition claims put on it.
         */
        return whole->bd_holder == bd_may_claim || whole->bd_holder == NULL;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Start claiming @bdev.  Fails if @bdev is already claimed by another
 * holder and sleeps while another claim on @whole is in progress.  On
 * success @whole->bd_claiming is set to @holder, i.e. the caller owns the
 * claim until it is finished or aborted.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
                void *holder)
{
        for (;;) {
                spin_lock(&bdev_lock);

                /* if someone else claimed, fail */
                if (!bd_may_claim(bdev, whole, holder)) {
                        spin_unlock(&bdev_lock);
                        return -EBUSY;
                }

                /* if claiming is already in progress, wait for it to finish */
                if (whole->bd_claiming) {
                        wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
                        DEFINE_WAIT(wait);

                        prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                        spin_unlock(&bdev_lock);
                        schedule();
                        finish_wait(wq, &wait);
                        /* re-evaluate everything from scratch */
                        continue;
                }

                break;
        }

        /* yay, all mine */
        whole->bd_claiming = holder;
        spin_unlock(&bdev_lock);
        return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

/*
 * Get the gendisk for @bdev's device number and store the partition number
 * in @partno.  Returns NULL if there is no such disk or if @bdev turned out
 * to be stale; in the latter case the disk reference is dropped again.
 */
static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
{
        struct gendisk *gd;

        gd = get_gendisk(bdev->bd_dev, partno);

        /*
         * Now that we hold gendisk reference we make sure bdev we looked up is
         * not stale. If it is, it means device got removed and created before
         * we looked up gendisk and we fail open in such case. Associating
         * unhashed bdev with newly created gendisk could lead to two bdevs
         * (and thus two independent caches) being associated with one device
         * which is bad.
         */
        if (gd && inode_unhashed(bdev->bd_inode)) {
                put_disk_and_module(gd);
                gd = NULL;
        }

        return gd;
}

/*
 * End the in-progress claim on @whole: reset bd_claiming and wake up tasks
 * waiting on its bit waitqueue (the sleepers in bd_prepare_to_claim()).
 *
 * Must be called with bdev_lock held and by the @holder that currently owns
 * bd_claiming; a mismatch trips the BUG_ON().
 */
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
        lockdep_assert_held(&bdev_lock);
        /* tell others that we're done */
        BUG_ON(whole->bd_claiming != holder);
        whole->bd_claiming = NULL;
        wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @whole: whole block device
 * @holder: holder that has claimed @bdev
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 *
 * Takes bdev_lock internally.  It is a BUG if @bdev can no longer be claimed
 * by @holder at this point.
 */
static void bd_finish_claiming(struct block_device *bdev,
                struct block_device *whole, void *holder)
{
        spin_lock(&bdev_lock);
        BUG_ON(!bd_may_claim(bdev, whole, holder));
        /*
         * Note that for a whole device bd_holders will be incremented twice,
         * and bd_holder will be set to bd_may_claim before being set to holder
         */
        whole->bd_holders++;
        whole->bd_holder = bd_may_claim;
        bdev->bd_holders++;
        bdev->bd_holder = holder;
        /* The claim is complete; release bd_claiming and wake any waiters. */
        bd_clear_claiming(whole, holder);
        spin_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @whole: whole block device
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can be
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 *
 * Only @whole->bd_claiming is reset (under bdev_lock); @bdev itself is not
 * touched by this function.
 */
void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
                       void *holder)
{
        spin_lock(&bdev_lock);
        bd_clear_claiming(whole, holder);
        spin_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

#ifdef CONFIG_SYSFS
/*
 * One entry on bdev->bd_holder_disks, created by bd_link_disk_holder() for
 * each distinct holding disk and removed by bd_unlink_disk_holder().
 */
struct bd_holder_disk {
        struct list_head        list;	/* link in bdev->bd_holder_disks */
        struct gendisk                *disk;	/* the holding disk */
        int                        refcnt;	/* outstanding link requests for @disk */
};

static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
                                                  struct gendisk *disk)
{
        struct bd_holder_disk *holder;

        list_for_each_entry(holder, &bdev->bd_holder_disks, list)
                if (holder->disk == disk)
                        return holder;
        return NULL;
}

/*
 * Create a sysfs symlink inside @from that points at @to and is named after
 * @to.  Returns the result of sysfs_create_link().
 */
static int add_symlink(struct kobject *from, struct kobject *to)
{
        return sysfs_create_link(from, to, kobject_name(to));
}

/* Remove the sysfs symlink in @from that is named after @to (see add_symlink()). */
static void del_symlink(struct kobject *from, struct kobject *to)
{
        sysfs_remove_link(from, kobject_name(to));
}

/**
 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * This function creates the following sysfs symlinks.
 *
 * - from "slaves" directory of the holder @disk to the claimed @bdev
 * - from "holders" directory of the @bdev to the holder @disk
 *
 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
 * passed to bd_link_disk_holder(), then:
 *
 *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 *
 * The caller must have claimed @bdev before calling this function and
 * ensure that both @bdev and @disk are valid during the creation and
 * lifetime of these symlinks.
 *
 * Linking the same @disk more than once only bumps a reference count; the
 * symlinks are created on the first call.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.  Note that 0 is also returned, without
 * creating any link, when the sysfs directories are missing (see the FIXME
 * below).
 */
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
        struct bd_holder_disk *holder;
        int ret = 0;

        mutex_lock(&bdev->bd_mutex);

        /* The caller is expected to have claimed @bdev already. */
        WARN_ON_ONCE(!bdev->bd_holder);

        /* FIXME: remove the following once add_disk() handles errors */
        if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
                goto out_unlock;

        /* Already linked to this disk: just take another reference. */
        holder = bd_find_holder_disk(bdev, disk);
        if (holder) {
                holder->refcnt++;
                goto out_unlock;
        }

        holder = kzalloc(sizeof(*holder), GFP_KERNEL);
        if (!holder) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        INIT_LIST_HEAD(&holder->list);
        holder->disk = disk;
        holder->refcnt = 1;

        /* <disk>/slaves/<bdev> */
        ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
        if (ret)
                goto out_free;

        /* <bdev>/holders/<disk> */
        ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
        if (ret)
                goto out_del;
        /*
         * bdev could be deleted beneath us which would implicitly destroy
         * the holder directory.  Hold on to it.
         */
        kobject_get(bdev->bd_part->holder_dir);

        list_add(&holder->list, &bdev->bd_holder_disks);
        goto out_unlock;

        /* Error unwinding: undo in reverse order of setup. */
out_del:
        del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
out_free:
        kfree(holder);
out_unlock:
        mutex_unlock(&bdev->bd_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);

/**
 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * Drops one link reference of @disk on @bdev.  The symlinks and the
 * bookkeeping entry are removed when the last reference goes away.  Warns
 * (once) if @disk was never linked to @bdev.
 *
 * CONTEXT:
 * Might sleep.
 */
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
        struct bd_holder_disk *entry;

        mutex_lock(&bdev->bd_mutex);

        entry = bd_find_holder_disk(bdev, disk);
        if (WARN_ON_ONCE(entry == NULL))
                goto out_unlock;

        /* Other link references remain; leave the symlinks in place. */
        if (--entry->refcnt)
                goto out_unlock;

        del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
        del_symlink(bdev->bd_part->holder_dir,
                    &disk_to_dev(disk)->kobj);
        /* Pairs with the kobject_get() in bd_link_disk_holder(). */
        kobject_put(bdev->bd_part->holder_dir);
        list_del_init(&entry->list);
        kfree(entry);

out_unlock:
        mutex_unlock(&bdev->bd_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif

/**
 * check_disk_size_change - checks for disk size change and adjusts bdev size.
 * @disk: struct gendisk to check
 * @bdev: struct bdev to adjust.
 * @verbose: if %true log a message about a size change if there is any
 *
 * Compares the size recorded in the bdev inode with the capacity of @disk
 * and updates the inode size if they differ.  If the device shrank, the
 * device is invalidated as well.
 */
static void check_disk_size_change(struct gendisk *disk,
                struct block_device *bdev, bool verbose)
{
        loff_t new_size, old_size;

        spin_lock(&bdev->bd_size_lock);
        new_size = (loff_t)get_capacity(disk) << 9;
        old_size = i_size_read(bdev->bd_inode);
        if (new_size != old_size) {
                if (verbose)
                        printk(KERN_INFO
                               "%s: detected capacity change from %lld to %lld\n",
                               disk->disk_name, old_size, new_size);
                i_size_write(bdev->bd_inode, new_size);
        }
        spin_unlock(&bdev->bd_size_lock);

        /* Nothing to invalidate unless the device got smaller. */
        if (old_size <= new_size)
                return;

        if (__invalidate_device(bdev, false))
                pr_warn("VFS: busy inodes on resized disk %s\n",
                        disk->disk_name);
}

/**
 * revalidate_disk_size - checks for disk size change and adjusts bdev size.
 * @disk: struct gendisk to check
 * @verbose: if %true log a message about a size change if there is any
 *
 * Looks up the whole-disk bdev of @disk and runs check_disk_size_change()
 * on it, so that the bdev size follows the disk capacity.  Does nothing for
 * hidden disks or when the bdev cannot be obtained.
 */
void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
        struct block_device *whole;

        /*
         * Hidden disks don't have associated bdev so there's no point in
         * revalidating them.
         */
        if (disk->flags & GENHD_FL_HIDDEN)
                return;

        whole = bdget_disk(disk, 0);
        if (!whole)
                return;

        check_disk_size_change(disk, whole, verbose);
        bdput(whole);
}
EXPORT_SYMBOL(revalidate_disk_size);

/*
 * Set the size of @bdev's inode to @sectors sectors, converted to bytes.
 * The write is done under bd_size_lock.
 */
void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
        loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, bytes);
        spin_unlock(&bdev->bd_size_lock);
}
EXPORT_SYMBOL(bd_set_nr_sectors);

/* Forward declaration: __blkdev_get()'s error path calls it before its definition. */
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

/*
 * bdev_disk_changed - drop and re-read the partitions of a disk
 * @bdev: whole-disk block device; bd_mutex must be held by the caller
 * @invalidate: %true if the capacity should be zeroed (where applicable)
 *              instead of asking the driver to revalidate the disk
 *
 * Returns -ENXIO if the disk is not up, the error from
 * blk_drop_partitions() if dropping fails, otherwise the result of
 * blk_add_partitions() (or 0 when the capacity is zero).  A result of
 * -EAGAIN from blk_add_partitions() restarts the whole sequence.
 */
int bdev_disk_changed(struct block_device *bdev, bool invalidate)
{
        struct gendisk *disk = bdev->bd_disk;
        int ret;

        lockdep_assert_held(&bdev->bd_mutex);

        if (!(disk->flags & GENHD_FL_UP))
                return -ENXIO;

rescan:
        ret = blk_drop_partitions(bdev);
        if (ret)
                return ret;

        /* A scan is happening now, so the pending-scan flag can go. */
        clear_bit(GD_NEED_PART_SCAN, &disk->state);

        /*
         * Historically we only set the capacity to zero for devices that
         * support partitions (independent of actually having partitions
         * created).  Doing that is rather inconsistent, but changing it broke
         * legacy udisks polling for legacy ide-cdrom devices.  Use the crude
         * check below to get the sane behavior for most devices while not
         * breaking userspace for this particular setup.
         */
        if (invalidate) {
                if (disk_part_scan_enabled(disk) ||
                    !(disk->flags & GENHD_FL_REMOVABLE))
                        set_capacity(disk, 0);
        } else {
                if (disk->fops->revalidate_disk)
                        disk->fops->revalidate_disk(disk);
        }

        /* Only log the size change when this is not an invalidation. */
        check_disk_size_change(disk, bdev, !invalidate);

        if (get_capacity(disk)) {
                ret = blk_add_partitions(disk, bdev);
                if (ret == -EAGAIN)
                        goto rescan;
        } else if (invalidate) {
                /*
                 * Tell userspace that the media / partition table may have
                 * changed.
                 */
                kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
        }

        return ret;
}
/*
 * Only exported for loop and dasd for historic reasons.  Don't use in new
 * code!
 */
EXPORT_SYMBOL_GPL(bdev_disk_changed);

/*
 * bd_mutex locking:
 *
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */

/*
 * __blkdev_get - do the actual work of opening a block device
 * @bdev: block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier, used when @mode has FMODE_EXCL
 * @for_part: non-zero when the whole device is opened on behalf of one of
 *            its partitions (see the recursive call below)
 *
 * Returns 0 on success, -errno on failure.  If the driver's ->open() returns
 * -ERESTARTSYS on the first open of a whole device, everything acquired so
 * far is released and the sequence is retried from the "restart" label.
 */
static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
                int for_part)
{
        struct block_device *whole = NULL, *claiming = NULL;
        struct gendisk *disk;
        int ret;
        int partno;
        bool first_open = false, unblock_events = true, need_restart;

 restart:
        need_restart = false;
        ret = -ENXIO;
        disk = bdev_get_gendisk(bdev, &partno);
        if (!disk)
                goto out;

        /* A non-zero partno means @bdev is a partition: get the whole disk too. */
        if (partno) {
                whole = bdget_disk(disk, 0);
                if (!whole) {
                        ret = -ENOMEM;
                        goto out_put_disk;
                }
        }

        /*
         * Exclusive open: start the claim.  The claim is tracked on the whole
         * device when @bdev is a partition, otherwise on @bdev itself.
         */
        if (!for_part && (mode & FMODE_EXCL)) {
                WARN_ON_ONCE(!holder);
                if (whole)
                        claiming = whole;
                else
                        claiming = bdev;
                ret = bd_prepare_to_claim(bdev, claiming, holder);
                if (ret)
                        goto out_put_whole;
        }

        disk_block_events(disk);
        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (!bdev->bd_openers) {
                /* First opener: attach @bdev to its disk and partition. */
                first_open = true;
                bdev->bd_disk = disk;
                bdev->bd_contains = bdev;
                bdev->bd_partno = partno;

                if (!partno) {
                        /* Whole device. */
                        ret = -ENXIO;
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!bdev->bd_part)
                                goto out_clear;

                        ret = 0;
                        if (disk->fops->open) {
                                ret = disk->fops->open(bdev, mode);
                                /*
                                 * If we lost a race with 'disk' being deleted,
                                 * try again.  See md.c
                                 */
                                if (ret == -ERESTARTSYS)
                                        need_restart = true;
                        }

                        if (!ret) {
                                bd_set_nr_sectors(bdev, get_capacity(disk));
                                set_init_blocksize(bdev);
                        }

                        /*
                         * If the device is invalidated, rescan partition
                         * if open succeeded or failed with -ENOMEDIUM.
                         * The latter is necessary to prevent ghost
                         * partitions on a removed medium.
                         */
                        if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
                            (!ret || ret == -ENOMEDIUM))
                                bdev_disk_changed(bdev, ret == -ENOMEDIUM);

                        if (ret)
                                goto out_clear;
                } else {
                        /* Partition: open the whole device on its behalf first. */
                        BUG_ON(for_part);
                        ret = __blkdev_get(whole, mode, NULL, 1);
                        if (ret)
                                goto out_clear;
                        bdev->bd_contains = bdgrab(whole);
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
                                ret = -ENXIO;
                                goto out_clear;
                        }
                        bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
                        set_init_blocksize(bdev);
                }

                if (bdev->bd_bdi == &noop_backing_dev_info)
                        bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
        } else {
                /* Already open: only whole devices call the driver again. */
                if (bdev->bd_contains == bdev) {
                        ret = 0;
                        if (bdev->bd_disk->fops->open)
                                ret = bdev->bd_disk->fops->open(bdev, mode);
                        /* the same as first opener case, read comment there */
                        if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
                            (!ret || ret == -ENOMEDIUM))
                                bdev_disk_changed(bdev, ret == -ENOMEDIUM);
                        if (ret)
                                goto out_unlock_bdev;
                }
        }
        bdev->bd_openers++;
        if (for_part)
                bdev->bd_part_count++;
        if (claiming)
                bd_finish_claiming(bdev, claiming, holder);

        /*
         * Block event polling for write claims if requested.  Any write holder
         * makes the write_holder state stick until all are released.  This is
         * good enough and tracking individual writeable reference is too
         * fragile given the way @mode is used in blkdev_get/put().
         */
        if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
            (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
                bdev->bd_write_holder = true;
                unblock_events = false;
        }
        mutex_unlock(&bdev->bd_mutex);

        if (unblock_events)
                disk_unblock_events(disk);

        /* only one opener holds refs to the module and disk */
        if (!first_open)
                put_disk_and_module(disk);
        if (whole)
                bdput(whole);
        return 0;

        /* Error unwinding; each label undoes one more step of the setup above. */
 out_clear:
        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
 out_unlock_bdev:
        if (claiming)
                bd_abort_claiming(bdev, claiming, holder);
        mutex_unlock(&bdev->bd_mutex);
        disk_unblock_events(disk);
 out_put_whole:
         if (whole)
                bdput(whole);
 out_put_disk:
        put_disk_and_module(disk);
        if (need_restart)
                goto restart;
 out:
        return ret;
}

/**
 * blkdev_get - open a block device
 * @bdev: block_device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
 * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
 * @holder is invalid.  Exclusive opens may nest for the same @holder.
 *
 * The device cgroup is asked for permission first, using MAY_READ and
 * MAY_WRITE derived from @mode.
 *
 * On success, the reference count of @bdev is unchanged.  On failure,
 * @bdev is put.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
        int perm = 0;
        int ret;

        if (mode & FMODE_READ)
                perm |= MAY_READ;
        if (mode & FMODE_WRITE)
                perm |= MAY_WRITE;

        ret = devcgroup_inode_permission(bdev->bd_inode, perm);
        if (!ret)
                ret = __blkdev_get(bdev, mode, holder, 0);

        /* Either step failing costs the caller its reference on @bdev. */
        if (ret)
                bdput(bdev);
        return ret;
}

/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by the device file at @path.  @mode
 * and @holder are identical to blkdev_get().  A write open of a read-only
 * device is refused with -EACCES.
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                                        void *holder)
{
        struct block_device *bdev;
        int err;

        bdev = lookup_bdev(path);
        if (IS_ERR(bdev))
                return bdev;

        err = blkdev_get(bdev, mode, holder);
        if (err)
                return ERR_PTR(err);

        /* Read-only opens, and writable devices, are fine as they are. */
        if (!(mode & FMODE_WRITE) || !bdev_read_only(bdev))
                return bdev;

        blkdev_put(bdev, mode);
        return ERR_PTR(-EACCES);
}
EXPORT_SYMBOL(blkdev_get_by_path);

/**
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by device number @dev.  @mode and
 * @holder are identical to blkdev_get().
 *
 * Use it ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a
 * device number.  _Never_ to be used for internal purposes.  If you
 * ever need it - reconsider your API.
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
        struct block_device *bdev = bdget(dev);
        int err;

        if (!bdev)
                return ERR_PTR(-ENOMEM);

        /* blkdev_get() has already put @bdev if it fails. */
        err = blkdev_get(bdev, mode, holder);
        if (err)
                bdev = ERR_PTR(err);

        return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_dev);

/*
 * Open handler for block device files: translate the open flags into
 * FMODE_* bits, get the block_device behind @inode, point the file at its
 * page cache mapping and open the device through blkdev_get().
 *
 * Returns -ENOMEM if no block_device could be acquired, otherwise the
 * result of blkdev_get().
 */
static int blkdev_open(struct inode * inode, struct file * filp)
{
        struct block_device *bdev;

        /*
         * Preserve backwards compatibility and allow large file access
         * even if userspace doesn't ask for it explicitly. Some mkfs
         * binary needs it. We might want to drop this workaround
         * during an unstable branch.
         */
        filp->f_flags |= O_LARGEFILE;

        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

        if (filp->f_flags & O_NDELAY) {
                filp->f_mode |= FMODE_NDELAY;
        }
        if (filp->f_flags & O_EXCL) {
                filp->f_mode |= FMODE_EXCL;
        }
        /* Access mode value 3 opens the file for ioctl-only write access. */
        if ((filp->f_flags & O_ACCMODE) == 3) {
                filp->f_mode |= FMODE_WRITE_IOCTL;
        }

        bdev = bd_acquire(inode);
        if (!bdev)
                return -ENOMEM;

        filp->f_mapping = bdev->bd_inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);

        return blkdev_get(bdev, filp->f_mode, filp);
}

/*
 * __blkdev_put - drop one open of @bdev
 * @bdev: block device being closed
 * @mode: FMODE_* mask, passed on to the driver's ->release()
 * @for_part: non-zero when the whole device is released on behalf of one of
 *            its partitions
 *
 * On the last close the device is synced, its page cache is killed and it
 * is detached from its disk and partition.  A reference on @bdev is put in
 * every case.  If @bdev was a partition that just saw its last close, the
 * containing whole device is released as well.
 */
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
        struct gendisk *disk = bdev->bd_disk;
        struct block_device *victim = NULL;

        /*
         * Sync early if it looks like we're the last one.  If someone else
         * opens the block device between now and the decrement of bd_openers
         * then we did a sync that we didn't need to, but that's not the end
         * of the world and we want to avoid long (could be several minute)
         * syncs while holding the mutex.
         */
        if (bdev->bd_openers == 1)
                sync_blockdev(bdev);

        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (for_part)
                bdev->bd_part_count--;

        if (!--bdev->bd_openers) {
                /* Last opener is gone; nobody should still hold a claim. */
                WARN_ON_ONCE(bdev->bd_holders);
                sync_blockdev(bdev);
                kill_bdev(bdev);

                bdev_write_inode(bdev);
        }
        /* Only whole devices call into the driver's ->release(). */
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
                        disk->fops->release(disk, mode);
        }
        if (!bdev->bd_openers) {
                disk_put_part(bdev->bd_part);
                bdev->bd_part = NULL;
                bdev->bd_disk = NULL;
                /* Remember the whole device so it can be put after unlocking. */
                if (bdev != bdev->bd_contains)
                        victim = bdev->bd_contains;
                bdev->bd_contains = NULL;

                put_disk_and_module(disk);
        }
        mutex_unlock(&bdev->bd_mutex);
        bdput(bdev);
        if (victim)
                __blkdev_put(victim, mode, 1);
}

/**
 * blkdev_put - release a reference to a block device
 * @bdev:        block device to release
 * @mode:        mode flags; when FMODE_EXCL is set the holder claim on @bdev
 *                and on @bdev->bd_contains is dropped as well
 *
 * Under bd_mutex, drops the exclusive-holder accounting (if any) and asks
 * the disk to flush DISK_EVENT_MEDIA_CHANGE events.  The opener reference
 * itself is then dropped by __blkdev_put() after bd_mutex has been released.
 */
void blkdev_put(struct block_device *bdev, fmode_t mode)
{
        mutex_lock(&bdev->bd_mutex);

        if (mode & FMODE_EXCL) {
                bool bdev_free;

                /*
                 * Release a claim on the device.  The holder fields
                 * are protected with bdev_lock.  bd_mutex is to
                 * synchronize disk_holder unlinking.
                 */
                spin_lock(&bdev_lock);

                /* Both counters are decremented; a negative result only warns. */
                WARN_ON_ONCE(--bdev->bd_holders < 0);
                WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);

                /* bd_contains might point to self, check in a separate step */
                if ((bdev_free = !bdev->bd_holders))
                        bdev->bd_holder = NULL;
                if (!bdev->bd_contains->bd_holders)
                        bdev->bd_contains->bd_holder = NULL;

                spin_unlock(&bdev_lock);

                /*
                 * If this was the last claim, remove holder link and
                 * unblock evpoll if it was a write holder.
                 */
                if (bdev_free && bdev->bd_write_holder) {
                        disk_unblock_events(bdev->bd_disk);
                        bdev->bd_write_holder = false;
                }
        }

        /*
         * Trigger event checking and tell drivers to flush MEDIA_CHANGE
         * event.  This is to ensure detection of media removal commanded
         * from userland - e.g. eject(1).
         */
        disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);

        mutex_unlock(&bdev->bd_mutex);

        /* __blkdev_put() takes bd_mutex itself, so it must run unlocked. */
        __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);

/*
 * ->release() handler of def_blk_fops: hand the block device behind @filp
 * back via blkdev_put(), using the file's f_mode.  @inode is unused.
 * Always returns 0.
 */
static int blkdev_close(struct inode *inode, struct file *filp)
{
        blkdev_put(I_BDEV(bdev_file_inode(filp)), filp->f_mode);

        return 0;
}

/*
 * ->unlocked_ioctl() handler of def_blk_fops.  Forwards @cmd and @arg to
 * blkdev_ioctl() for the block device behind @file and returns its result.
 */
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so
         * FMODE_NDELAY has to be recomputed from f_flags before every
         * ioctl: start with the bit cleared, then set it if requested.
         */
        fmode_t mode = file->f_mode & ~FMODE_NDELAY;

        if (file->f_flags & O_NDELAY)
                mode |= FMODE_NDELAY;

        return blkdev_ioctl(I_BDEV(bdev_file_inode(file)), mode, cmd, arg);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 *
 * Fails with -EPERM on a read-only device, -ETXTBSY on a swap device that is
 * not the hibernation resume device, -ENOSPC when the position is at or past
 * the device size, and -EOPNOTSUPP for IOCB_NOWAIT without IOCB_DIRECT.  A
 * zero-length write returns 0.  Writes reaching past the end of the device
 * are clamped to the device size.
 */
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = bdev_file_inode(iocb->ki_filp);
        loff_t room = i_size_read(inode);
        size_t trimmed = 0;
        struct blk_plug plug;
        ssize_t written;

        if (bdev_read_only(I_BDEV(inode)))
                return -EPERM;
        if (IS_SWAPFILE(inode) && !is_hibernate_resume_dev(inode->i_rdev))
                return -ETXTBSY;
        if (!iov_iter_count(from))
                return 0;
        if (iocb->ki_pos >= room)
                return -ENOSPC;
        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;

        /* Clamp the iterator to the bytes left between ki_pos and the end. */
        room -= iocb->ki_pos;
        if (iov_iter_count(from) > room) {
                trimmed = iov_iter_count(from) - room;
                iov_iter_truncate(from, room);
        }

        blk_start_plug(&plug);
        written = __generic_file_write_iter(iocb, from);
        if (written > 0)
                written = generic_write_sync(iocb, written);
        /* Give the caller back the part of the iterator that was cut off. */
        iov_iter_reexpand(from, iov_iter_count(from) + trimmed);
        blk_finish_plug(&plug);

        return written;
}
EXPORT_SYMBOL_GPL(blkdev_write_iter);

/*
 * Read from the block device behind @iocb into @to.
 *
 * Returns 0 when the position is at or beyond the device size.  Otherwise
 * the request is clamped to the bytes remaining on the device, passed to
 * generic_file_read_iter(), and that function's result is returned.
 */
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = bdev_file_inode(iocb->ki_filp);
        loff_t avail = i_size_read(inode);
        loff_t pos = iocb->ki_pos;
        size_t trimmed = 0;
        ssize_t nread;

        if (pos >= avail)
                return 0;

        /* Clamp the iterator to what is left between pos and the end. */
        avail -= pos;
        if (iov_iter_count(to) > avail) {
                trimmed = iov_iter_count(to) - avail;
                iov_iter_truncate(to, avail);
        }

        nread = generic_file_read_iter(iocb, to);
        /* Give the caller back the part of the iterator that was cut off. */
        iov_iter_reexpand(to, iov_iter_count(to) + trimmed);

        return nread;
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);

/*
 * Try to release a page associated with a block device when the system
 * is under memory pressure.
 *
 * If a super block is attached to the device and it provides
 * ->bdev_try_to_free_page(), that hook decides; otherwise fall back to
 * try_to_free_buffers().
 */
static int blkdev_releasepage(struct page *page, gfp_t wait)
{
        struct super_block *sb = BDEV_I(page->mapping->host)->bdev.bd_super;

        if (!sb || !sb->s_op->bdev_try_to_free_page)
                return try_to_free_buffers(page);

        return sb->s_op->bdev_try_to_free_page(sb, page, wait);
}

/*
 * ->writepages() handler of def_blk_aops: a plain pass-through to
 * generic_writepages(), whose return value is returned unchanged.
 */
static int blkdev_writepages(struct address_space *mapping,
                             struct writeback_control *wbc)
{
        return generic_writepages(mapping, wbc);
}

/*
 * Address space operations table for block devices.  The blkdev_* handlers
 * are defined in this file; page migration and dirty/writeback checking use
 * the buffer_* helpers.
 */
static const struct address_space_operations def_blk_aops = {
        .readpage        = blkdev_readpage,
        .readahead        = blkdev_readahead,
        .writepage        = blkdev_writepage,
        .write_begin        = blkdev_write_begin,
        .write_end        = blkdev_write_end,
        .writepages        = blkdev_writepages,
        .releasepage        = blkdev_releasepage,
        .direct_IO        = blkdev_direct_IO,
        .migratepage        = buffer_migrate_page_norefs,
        .is_dirty_writeback = buffer_check_dirty_writeback,
};

/*
 * Mask of fallocate() mode bits that blkdev_fallocate() accepts; any bit
 * outside this mask makes it fail with -EOPNOTSUPP.
 */
#define        BLKDEV_FALLOC_FL_SUPPORTED                                        \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |                \
                 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

/*
 * blkdev_fallocate - ->fallocate() handler for block devices
 * @file:        open block device file
 * @mode:        FALLOC_FL_* flags; must be within BLKDEV_FALLOC_FL_SUPPORTED
 * @start:        byte offset of the range
 * @len:        length of the range in bytes
 *
 * Supported flag combinations and what they issue:
 *   ZERO_RANGE [| KEEP_SIZE]                  zeroout with BLKDEV_ZERO_NOUNMAP
 *   PUNCH_HOLE | KEEP_SIZE                    zeroout with BLKDEV_ZERO_NOFALLBACK
 *   PUNCH_HOLE | KEEP_SIZE | NO_HIDE_STALE    discard
 *
 * Returns -EOPNOTSUPP for unknown flags or any other combination, -EINVAL
 * if the range starts at or past the end of the device, extends past the
 * end without KEEP_SIZE, or is not aligned to the logical block size.
 * Otherwise returns the first error from truncate_bdev_range() or the issue
 * helper, or else the result of the final page cache invalidation.
 */
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                             loff_t len)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        /* Inclusive last byte of the range. */
        loff_t end = start + len - 1;
        loff_t isize;
        int error;

        /* Fail if we don't recognize the flags. */
        if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        /* Don't go off the end of the device. */
        isize = i_size_read(bdev->bd_inode);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
                /* With KEEP_SIZE the range is clamped to the device size. */
                if (mode & FALLOC_FL_KEEP_SIZE) {
                        len = isize - start;
                        end = start + len - 1;
                } else
                        return -EINVAL;
        }

        /*
         * Don't allow IO that isn't aligned to logical block size.
         */
        if ((start | len) & (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        /*
         * Invalidate the page cache, including dirty pages, for valid
         * de-allocate mode calls to fallocate().
         *
         * The ">> 9" shifts below convert byte values to 512-byte
         * sector units.
         */
        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
                error = truncate_bdev_range(bdev, file->f_mode, start, end);
                if (error)
                        break;

                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                            GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
                error = truncate_bdev_range(bdev, file->f_mode, start, end);
                if (error)
                        break;

                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
                error = truncate_bdev_range(bdev, file->f_mode, start, end);
                if (error)
                        break;

                error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, 0);
                break;
        default:
                /* Unsupported combination: bail out before touching anything. */
                return -EOPNOTSUPP;
        }
        if (error)
                return error;

        /*
         * Invalidate again; if someone wandered in and dirtied a page,
         * the caller will be given -EBUSY.  The third argument is
         * inclusive, so the rounding here is safe.
         */
        return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
                                             start >> PAGE_SHIFT,
                                             end >> PAGE_SHIFT);
}

/*
 * File operations table for block device files.  The compat ioctl entry is
 * only present when CONFIG_COMPAT is enabled.
 */
const struct file_operations def_blk_fops = {
        .open                = blkdev_open,
        .release        = blkdev_close,
        .llseek                = block_llseek,
        .read_iter        = blkdev_read_iter,
        .write_iter        = blkdev_write_iter,
        .iopoll                = blkdev_iopoll,
        .mmap                = generic_file_mmap,
        .fsync                = blkdev_fsync,
        .unlocked_ioctl        = block_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = compat_blkdev_ioctl,
#endif
        .splice_read        = generic_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = blkdev_fallocate,
};

/**
 * lookup_bdev  - lookup a struct block_device by name
 * @pathname:        special file representing the block device
 *
 * Resolve @pathname (following symlinks) in the current namespace and
 * return a reference to the block device it names.
 *
 * Return: the block device on success, otherwise ERR_PTR() of
 * -EINVAL (NULL or empty @pathname), the kern_path() error,
 * -ENOTBLK (not a block special file), -EACCES (may_open_dev() refused)
 * or -ENOMEM (bd_acquire() failed).
 */
struct block_device *lookup_bdev(const char *pathname)
{
        struct block_device *bdev;
        struct inode *inode;
        struct path path;
        int error;

        if (!pathname || !*pathname)
                return ERR_PTR(-EINVAL);

        error = kern_path(pathname, LOOKUP_FOLLOW, &path);
        if (error)
                return ERR_PTR(error);

        inode = d_backing_inode(path.dentry);
        if (!S_ISBLK(inode->i_mode)) {
                bdev = ERR_PTR(-ENOTBLK);
        } else if (!may_open_dev(&path)) {
                bdev = ERR_PTR(-EACCES);
        } else {
                bdev = bd_acquire(inode);
                if (!bdev)
                        bdev = ERR_PTR(-ENOMEM);
        }

        /* The path reference is dropped on success and failure alike. */
        path_put(&path);
        return bdev;
}
EXPORT_SYMBOL(lookup_bdev);

/*
 * Invalidate cached state for @bdev.
 *
 * If a super block is found for the device, its dcache is shrunk and its
 * inodes are invalidated (@kill_dirty is passed on to invalidate_inodes()).
 * The block device itself is always invalidated via invalidate_bdev().
 *
 * Returns the result of invalidate_inodes(), or 0 when no super block
 * was found.
 */
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
        int ret = 0;
        struct super_block *sb;

        sb = get_super(bdev);
        if (sb != NULL) {
                /*
                 * No need to lock the super: get_super holds the read
                 * mutex, so the filesystem cannot go away under us
                 * (->put_super runs with the write lock held).
                 */
                shrink_dcache_sb(sb);
                ret = invalidate_inodes(sb, kill_dirty);
                drop_super(sb);
        }

        invalidate_bdev(bdev);
        return ret;
}
EXPORT_SYMBOL(__invalidate_device);

/*
 * iterate_bdevs - call @func for block devices that are currently open
 * @func:        callback, invoked as func(bdev, arg) with bdev->bd_mutex held
 * @arg:        opaque pointer handed through to @func
 *
 * Walks the inode list of blockdev_superblock.  Inodes that are being
 * freed or created (I_FREEING, I_WILL_FREE, I_NEW) or that have no cached
 * pages are skipped.  For the rest, @func is called only when bd_openers is
 * non-zero.  The list lock is dropped around the callback.
 */
void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
        struct inode *inode, *old_inode = NULL;

        spin_lock(&blockdev_superblock->s_inode_list_lock);
        list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
                struct address_space *mapping = inode->i_mapping;
                struct block_device *bdev;

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
                    mapping->nrpages == 0) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Pin the inode so the list lock can be dropped. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&blockdev_superblock->s_inode_list_lock);
                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock.  We cannot iput the inode now as we can
                 * be holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                iput(old_inode);
                old_inode = inode;
                bdev = I_BDEV(inode);

                mutex_lock(&bdev->bd_mutex);
                if (bdev->bd_openers)
                        func(bdev, arg);
                mutex_unlock(&bdev->bd_mutex);

                /* Retake the list lock before advancing to the next entry. */
                spin_lock(&blockdev_superblock->s_inode_list_lock);
        }
        spin_unlock(&blockdev_superblock->s_inode_list_lock);
        /* Drop the reference kept on the last inode visited (if any). */
        iput(old_inode);
}



















































































































































































































































































































    1 





    1 













    1 

























































































































    1 



























    1 



    1 























    1 



















































































































    1 













    1 












    1 










    1 





    1 
    1 





    1 








    1 






    1 




    1 





    1 
    1 

    1 














    1 


    1 

    1 
    1 











































    1 











    1 


    1 



    1 




    1 







































    1 








    1 









    1 










    1 


    1 



    1 





    1 








    1 







    1 





    1 



    1 
































    1 

    1 






















    1 












    1 


    1 



    1 


































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
// SPDX-License-Identifier: GPL-2.0-only
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 *
 * On success the page is left locked and PIPE_BUF_FLAG_LRU is set on @buf;
 * on failure the page is unlocked again and false is returned.
 */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;
        struct address_space *mapping;

        lock_page(page);

        mapping = page_mapping(page);
        if (!mapping)
                goto out_unlock;        /* raced with truncate */

        WARN_ON(!PageUptodate(page));

        /*
         * At least for ext2 with nobh option, we need to wait on
         * writeback completing on this page, since we'll remove it
         * from the pagecache.  Otherwise truncate won't wait on the
         * page, allowing the disk blocks to be reused by someone else
         * before we actually wrote our data to them. fs corruption
         * ensues.
         */
        wait_on_page_writeback(page);

        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                goto out_unlock;

        /* Could not remove the page from its current address space. */
        if (!remove_mapping(mapping, page))
                goto out_unlock;

        /* Mapping removed: set the LRU flag and report success. */
        buf->flags |= PIPE_BUF_FLAG_LRU;
        return true;

out_unlock:
        unlock_page(page);
        return false;
}

/*
 * ->release() hook: drop the buffer's page reference with put_page() and
 * clear PIPE_BUF_FLAG_LRU on @buf.  @pipe is unused.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
                                        struct pipe_buffer *buf)
{
        put_page(buf->page);
        buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 *
 * Returns 0 if the page is (or becomes) uptodate, -ENODATA if it lost its
 * mapping (truncated/unhashed), or -EIO if it is still not uptodate once
 * the page lock has been obtained.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
                                       struct pipe_buffer *buf)
{
        struct page *page = buf->page;
        int err = 0;

        /* Fast path: nothing to wait for. */
        if (PageUptodate(page))
                return 0;

        lock_page(page);

        if (!page->mapping) {
                /*
                 * Page got truncated/unhashed. This will cause a 0-byte
                 * splice, if this is the first page.
                 */
                err = -ENODATA;
        } else if (!PageUptodate(page)) {
                /* Uh oh, read-error from disk. */
                err = -EIO;
        }

        /* Either the page is ok after all (err == 0) or we report err. */
        unlock_page(page);
        return err;
}

/*
 * Pipe buffer operations for buffers backed by page cache pages: confirm,
 * release and steal use the page_cache_pipe_buf_*() helpers above, while
 * taking a reference uses the generic helper.
 */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .confirm        = page_cache_pipe_buf_confirm,
        .release        = page_cache_pipe_buf_release,
        .try_steal        = page_cache_pipe_buf_try_steal,
        .get                = generic_pipe_buf_get,
};

/*
 * ->try_steal() hook: only buffers flagged PIPE_BUF_FLAG_GIFT may be
 * stolen.  For those, PIPE_BUF_FLAG_LRU is set and the decision is left to
 * generic_pipe_buf_try_steal(); everything else is refused.
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (buf->flags & PIPE_BUF_FLAG_GIFT) {
                buf->flags |= PIPE_BUF_FLAG_LRU;
                return generic_pipe_buf_try_steal(pipe, buf);
        }

        return false;
}

/*
 * Pipe buffer operations whose ->try_steal() only succeeds for buffers
 * flagged PIPE_BUF_FLAG_GIFT.  No ->confirm() hook is provided.
 * NOTE(review): the users of this table are outside this view; presumably
 * it is for pages supplied by userspace - confirm against the callers.
 */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
        .release        = page_cache_pipe_buf_release,
        .try_steal        = user_page_pipe_buf_try_steal,
        .get                = generic_pipe_buf_get,
};

/*
 * Wake up tasks sleeping on the pipe's read wait queue and send SIGIO
 * (POLL_IN) to the pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() check.
         * NOTE(review): presumably this orders earlier pipe updates against
         * the waiter's check - confirm with the waitqueue_active() rules.
         */
        smp_mb();
        if (waitqueue_active(&pipe->rd_wait))
                wake_up_interruptible(&pipe->rd_wait);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:        pipe to fill
 * @spd:        data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 *    Pages that could not be linked (pipe full or no readers) are handed
 *    to @spd->spd_release().  @spd->nr_pages is decremented for every page
 *    that was linked.
 *
 * Return: the number of bytes linked into the pipe, 0 if @spd holds no
 * pages, -EPIPE (after sending SIGPIPE to the current task) if the pipe
 * has no readers, or -EAGAIN if nothing could be added.
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
                       struct splice_pipe_desc *spd)
{
        /* Snapshot of the page count: spd->nr_pages is consumed by the loop. */
        unsigned int spd_pages = spd->nr_pages;
        unsigned int tail = pipe->tail;
        unsigned int head = pipe->head;
        unsigned int mask = pipe->ring_size - 1;
        int ret = 0, page_nr = 0;

        if (!spd_pages)
                return 0;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        while (!pipe_full(head, tail, pipe->max_usage)) {
                struct pipe_buffer *buf = &pipe->bufs[head & mask];

                buf->page = spd->pages[page_nr];
                buf->offset = spd->partial[page_nr].offset;
                buf->len = spd->partial[page_nr].len;
                buf->private = spd->partial[page_nr].private;
                buf->ops = spd->ops;
                buf->flags = 0;

                /* Publish the new head only after the slot is fully set up. */
                head++;
                pipe->head = head;
                page_nr++;
                ret += buf->len;

                if (!--spd->nr_pages)
                        break;
        }

        if (!ret)
                ret = -EAGAIN;

out:
        /* Release every page that did not make it into the pipe. */
        while (page_nr < spd_pages)
                spd->spd_release(spd, page_nr++);

        return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);

/*
 * Append a single buffer to @pipe.
 *
 * On success *@buf is copied into the next free ring slot, the pipe head is
 * advanced and @buf->len is returned.  If the pipe has no readers, SIGPIPE
 * is sent to the current task and -EPIPE is returned; if the pipe is full,
 * -EAGAIN is returned.  In both failure cases @buf is released with
 * pipe_buf_release().
 */
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        int ret;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out_release;
        }

        if (pipe_full(head, tail, pipe->max_usage)) {
                ret = -EAGAIN;
                goto out_release;
        }

        pipe->bufs[head & mask] = *buf;
        pipe->head = head + 1;
        return buf->len;

out_release:
        pipe_buf_release(pipe, buf);
        return ret;
}
EXPORT_SYMBOL(add_to_pipe);

/*
 * Check if we need to grow the arrays holding pages and partial page
 * descriptions.
 *
 * Records the pipe's max_usage in @spd->nr_pages_max.  When that exceeds
 * PIPE_DEF_BUFFERS, @spd->pages and @spd->partial are replaced by freshly
 * allocated arrays of that many entries.
 *
 * Returns 0 on success (including when no allocation was needed) and
 * -ENOMEM if either allocation failed, in which case both are freed.
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
        unsigned int nr_bufs = READ_ONCE(pipe->max_usage);

        spd->nr_pages_max = nr_bufs;
        if (nr_bufs <= PIPE_DEF_BUFFERS)
                return 0;

        spd->pages = kmalloc_array(nr_bufs, sizeof(struct page *), GFP_KERNEL);
        spd->partial = kmalloc_array(nr_bufs, sizeof(struct partial_page),
                                     GFP_KERNEL);

        if (!spd->pages || !spd->partial) {
                kfree(spd->pages);
                kfree(spd->partial);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Free the @spd->pages and @spd->partial arrays, but only when
 * @spd->nr_pages_max exceeds PIPE_DEF_BUFFERS - the same threshold above
 * which splice_grow_spd() allocates them.
 */
void splice_shrink_spd(struct splice_pipe_desc *spd)
{
        if (spd->nr_pages_max > PIPE_DEF_BUFFERS) {
                kfree(spd->pages);
                kfree(spd->partial);
        }
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:                file to splice from
 * @ppos:        position in @in
 * @pipe:        pipe to splice to
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as it has more or less sane ->read_iter().
 *
 * Return: the value returned by the file's ->read_iter(), except that
 * -EFAULT is reported as -EAGAIN.  On a positive result *@ppos is updated
 * and file_accessed() is called for @in.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
                                 struct pipe_inode_info *pipe, size_t len,
                                 unsigned int flags)
{
        struct iov_iter to;
        struct kiocb kiocb;
        unsigned int i_head;
        int ret;

        iov_iter_pipe(&to, READ, pipe, len);
        /* Remember the iterator's starting head so a failed read can rewind. */
        i_head = to.head;
        init_sync_kiocb(&kiocb, in);
        kiocb.ki_pos = *ppos;
        ret = call_read_iter(in, &kiocb, &to);
        if (ret > 0) {
                *ppos = kiocb.ki_pos;
                file_accessed(in);
        } else if (ret < 0) {
                /* Rewind the iterator to where it started before the read. */
                to.head = i_head;
                to.iov_offset = 0;
                iov_iter_advance(&to, 0); /* to free what was emitted */
                /*
                 * callers of ->splice_read() expect -EAGAIN on
                 * "can't put anything in there", rather than -EFAULT.
                 */
                if (ret == -EFAULT)
                        ret = -EAGAIN;
        }

        return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Default pipe buffer operations: the generic release and get handlers,
 * plus generic_pipe_buf_try_steal as the ->try_steal handler.
 */
const struct pipe_buf_operations default_pipe_buf_ops = {
        .release        = generic_pipe_buf_release,
        .try_steal        = generic_pipe_buf_try_steal,
        .get                = generic_pipe_buf_get,
};

/*
 * Pipe buffer operations for a socket and similar.  Only the generic
 * release and get handlers are set; no ->try_steal handler is provided.
 */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
        .release        = generic_pipe_buf_release,
        .get                = generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Splice actor: push 'sd->len' bytes of @buf to the file in 'sd->u.file'
 * at position 'sd->pos' through its ->sendpage() method.  Returns whatever
 * ->sendpage() returns, or -EINVAL when the file has no ->sendpage().
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
                            struct pipe_buffer *buf, struct splice_desc *sd)
{
        struct file *sock_file = sd->u.file;
        loff_t offset = sd->pos;
        int msg_flags = 0;

        if (unlikely(!sock_file->f_op->sendpage))
                return -EINVAL;

        if (sd->flags & SPLICE_F_MORE)
                msg_flags = MSG_MORE;

        /*
         * This buffer does not finish the request and at least one more
         * buffer is queued behind it in the pipe.
         */
        if (sd->len < sd->total_len &&
            pipe_occupancy(pipe->head, pipe->tail) > 1)
                msg_flags |= MSG_SENDPAGE_NOTLAST;

        return sock_file->f_op->sendpage(sock_file, buf->page, buf->offset,
                                         sd->len, &offset, msg_flags);
}

/*
 * Wake tasks sleeping on the pipe's writer wait queue and send SIGIO
 * (POLL_OUT) to the asynchronous writers registered on the pipe.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier before the lockless waitqueue_active() check.
         * NOTE(review): presumably orders the caller's pipe updates ahead
         * of the check so a sleeping writer is not missed - confirm.
         */
        smp_mb();
        if (waitqueue_active(&pipe->wr_wait))
                wake_up_interruptible(&pipe->wr_wait);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                          splice_actor *actor)
{
        const unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        const unsigned int mask = pipe->ring_size - 1;

        for (;;) {
                struct pipe_buffer *buf;
                int ret;

                /* Pipe drained before the request was satisfied. */
                if (pipe_empty(head, tail))
                        return 1;

                buf = &pipe->bufs[tail & mask];

                /* Offer the actor no more than is still wanted. */
                sd->len = buf->len;
                if (sd->len > sd->total_len)
                        sd->len = sd->total_len;

                ret = pipe_buf_confirm(pipe, buf);
                if (unlikely(ret))
                        return ret == -ENODATA ? 0 : ret;

                ret = actor(pipe, buf, sd);
                if (ret <= 0)
                        return ret;

                /* Account for the bytes the actor consumed. */
                buf->offset += ret;
                buf->len -= ret;

                sd->num_spliced += ret;
                sd->len -= ret;
                sd->pos += ret;
                sd->total_len -= ret;

                /* Buffer fully consumed: release it and advance the tail. */
                if (buf->len == 0) {
                        pipe_buf_release(pipe, buf);
                        pipe->tail = ++tail;
                        if (pipe->files)
                                sd->need_wakeup = true;
                }

                if (sd->total_len == 0)
                        return 0;
        }
}

/*
 * The caller has already seen that the pipe is not empty.  If the buffer
 * at the tail has zero length, release it, advance the tail past it and
 * return true; otherwise leave the pipe untouched and return false.
 */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
        const unsigned int slot = pipe->tail;
        struct pipe_buffer *first = &pipe->bufs[slot & (pipe->ring_size - 1)];

        if (likely(first->len))
                return false;

        pipe_buf_release(pipe, first);
        pipe->tail = slot + 1;
        return true;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
 */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        /*
         * Check for signal early to make process killable when there are
         * always buffers available
         */
        if (signal_pending(current))
                return -ERESTARTSYS;

        do {
                while (pipe_empty(pipe->head, pipe->tail)) {
                        /* No writers left, or we already moved some data. */
                        if (!pipe->writers || sd->num_spliced)
                                return 0;

                        if (sd->flags & SPLICE_F_NONBLOCK)
                                return -EAGAIN;

                        if (signal_pending(current))
                                return -ERESTARTSYS;

                        /* Flush a pending writer wakeup before sleeping. */
                        if (sd->need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                sd->need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }
                /* A zero-length buffer at the tail is dropped; look again. */
        } while (eat_empty_buffer(pipe));

        return 1;
}

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:                information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
        /* Nothing moved yet and no writer wakeup owed. */
        sd->need_wakeup = false;
        sd->num_spliced = 0;
}

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary.  It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        if (!sd->need_wakeup)
                return;

        wakeup_pipe_writers(pipe);
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
                           splice_actor *actor)
{
        int ret;

        splice_from_pipe_begin(sd);
        for (;;) {
                cond_resched();

                /* Wait for data; stop on "no more data" or error. */
                ret = splice_from_pipe_next(pipe, sd);
                if (ret <= 0)
                        break;

                /* Feed what is there; stop once done or on error. */
                ret = splice_from_pipe_feed(pipe, sd, actor);
                if (ret <= 0)
                        break;
        }
        splice_from_pipe_end(pipe, sd);

        /* Partial progress wins over a trailing status/error code. */
        return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:        pipe to splice from
 * @out:        file to splice to
 * @ppos:        position in @out
 * @len:        how many bytes to splice
 * @flags:        splice modifier flags
 * @actor:        handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags,
                         splice_actor *actor)
{
        struct splice_desc desc = {
                .u.file = out,
                .pos = *ppos,
                .flags = flags,
                .total_len = len,
        };
        ssize_t spliced;

        /* Hold the pipe lock for the whole transfer. */
        pipe_lock(pipe);
        spliced = __splice_from_pipe(pipe, &desc, actor);
        pipe_unlock(pipe);

        return spliced;
}

/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:        pipe info
 * @out:        file to write to
 * @ppos:        position in @out
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                          loff_t *ppos, size_t len, unsigned int flags)
{
        struct splice_desc sd = {
                .total_len = len,
                .flags = flags,
                .pos = *ppos,
                .u.file = out,
        };
        int nbufs = pipe->max_usage;
        /* One bio_vec per pipe slot we may gather in a single write. */
        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
        /*
         * Start at 0: when @len is 0 the loop below never runs and
         * sd.num_spliced stays 0, so without an initial value the function
         * would return an indeterminate value.
         */
        ssize_t ret = 0;

        if (unlikely(!array))
                return -ENOMEM;

        pipe_lock(pipe);

        splice_from_pipe_begin(&sd);
        while (sd.total_len) {
                struct iov_iter from;
                unsigned int head, tail, mask;
                size_t left;
                int n;

                /* Wait for data; <= 0 means nothing more to do or error. */
                ret = splice_from_pipe_next(pipe, &sd);
                if (ret <= 0)
                        break;

                /* max_usage grew since the array was sized: reallocate. */
                if (unlikely(nbufs < pipe->max_usage)) {
                        kfree(array);
                        nbufs = pipe->max_usage;
                        array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
                        if (!array) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* build the vector */
                left = sd.total_len;
                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t this_len = buf->len;

                        /* zero-length bvecs are not supported, skip them */
                        if (!this_len)
                                continue;
                        this_len = min(this_len, left);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                if (ret == -ENODATA)
                                        ret = 0;
                                goto done;
                        }

                        array[n].bv_page = buf->page;
                        array[n].bv_len = this_len;
                        array[n].bv_offset = buf->offset;
                        left -= this_len;
                        n++;
                }

                /* Write the gathered segments in one ->write_iter() call. */
                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
                ret = vfs_iter_write(out, &from, &sd.pos, 0);
                if (ret <= 0)
                        break;

                sd.num_spliced += ret;
                sd.total_len -= ret;
                *ppos = sd.pos;

                /* dismiss the fully eaten buffers, adjust the partial one */
                tail = pipe->tail;
                while (ret) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        if (ret >= buf->len) {
                                ret -= buf->len;
                                buf->len = 0;
                                pipe_buf_release(pipe, buf);
                                tail++;
                                pipe->tail = tail;
                                if (pipe->files)
                                        sd.need_wakeup = true;
                        } else {
                                buf->offset += ret;
                                buf->len -= ret;
                                ret = 0;
                        }
                }
        }
done:
        /* kfree(NULL) is fine if the reallocation above failed. */
        kfree(array);
        splice_from_pipe_end(pipe, &sd);

        pipe_unlock(pipe);

        /* Report progress in preference to a trailing status/error. */
        if (sd.num_spliced)
                ret = sd.num_spliced;

        return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:        pipe to splice from
 * @out:        socket to write to
 * @ppos:        position in @out
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
                                loff_t *ppos, size_t len, unsigned int flags)
{
        ssize_t sent;

        /* pipe_to_sendpage() pushes each pipe buffer via ->sendpage(). */
        sent = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);

        return sent;
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Emit a rate-limited debug message saying that splice @op ("read" or
 * "write" at the call sites in this file) is not supported for @file,
 * and hand back -EINVAL for the caller to return.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        const int err = -EINVAL;

        pr_debug_ratelimited("splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                             op, file, current->pid, current->comm);

        return err;
}

/*
 * Attempt to initiate a splice from pipe to file by dispatching to the
 * file's ->splice_write() method; files without one get -EINVAL.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
                           loff_t *ppos, size_t len, unsigned int flags)
{
        if (likely(out->f_op->splice_write))
                return out->f_op->splice_write(pipe, out, ppos, len, flags);

        return warn_unsupported(out, "write");
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 *
 * Fails with -EBADF if @in is not open for reading, with the error from
 * rw_verify_area() if the range is rejected, and with -EINVAL if the file
 * has no ->splice_read().  @len is capped at MAX_RW_COUNT.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe, size_t len,
                         unsigned int flags)
{
        int err;

        if (unlikely(!(in->f_mode & FMODE_READ)))
                return -EBADF;

        err = rw_verify_area(READ, in, ppos, len);
        if (unlikely(err < 0))
                return err;

        if (unlikely(!in->f_op->splice_read))
                return warn_unsupported(in, "read");

        if (unlikely(len > MAX_RW_COUNT))
                len = MAX_RW_COUNT;

        return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:                file to splice from
 * @sd:                actor information on where to splice to
 * @actor:        handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                               splice_direct_actor *actor)
{
        struct pipe_inode_info *pipe;
        long ret, bytes;
        umode_t i_mode;
        size_t len;
        int i, flags, more;

        /*
         * We require the input to be a regular file or a block device, as
         * we don't want to randomly drop data for eg socket -> socket
         * splicing. Use the piped splicing for that!
         */
        i_mode = file_inode(in)->i_mode;
        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
                return -EINVAL;

        /*
         * neither in nor out is a pipe, setup an internal pipe attached to
         * 'out' and transfer the wanted data from 'in' to 'out' through that
         */
        pipe = current->splice_pipe;
        if (unlikely(!pipe)) {
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;

                /*
                 * We don't have an immediate reader, but we'll read the stuff
                 * out of the pipe right after the splice_to_pipe(). So set
                 * pipe->readers appropriately.
                 */
                pipe->readers = 1;

                /* Cache the pipe on the task for later calls. */
                current->splice_pipe = pipe;
        }

        /*
         * Do the splice.
         */
        ret = 0;
        bytes = 0;
        len = sd->total_len;
        /* Keep the caller's flags (incl. SPLICE_F_NONBLOCK) for the input. */
        flags = sd->flags;

        /*
         * Don't block on output, we have to drain the direct pipe.
         */
        sd->flags &= ~SPLICE_F_NONBLOCK;
        /* Remember whether the caller itself asked for SPLICE_F_MORE. */
        more = sd->flags & SPLICE_F_MORE;

        WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

        while (len) {
                unsigned int p_space;
                size_t read_len;
                loff_t pos = sd->pos, prev_pos = pos;

                /* Don't try to read more than the pipe has space for. */
                p_space = pipe->max_usage -
                        pipe_occupancy(pipe->head, pipe->tail);
                read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
                ret = do_splice_to(in, &pos, pipe, read_len, flags);
                if (unlikely(ret <= 0))
                        goto out_release;

                /* Only ask the actor to move what was actually read. */
                read_len = ret;
                sd->total_len = read_len;

                /*
                 * If more data is pending, set SPLICE_F_MORE
                 * If this is the last data and SPLICE_F_MORE was not set
                 * initially, clears it.
                 */
                if (read_len < len)
                        sd->flags |= SPLICE_F_MORE;
                else if (!more)
                        sd->flags &= ~SPLICE_F_MORE;
                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
                 * could get stuck data in the internal pipe:
                 */
                ret = actor(pipe, sd);
                if (unlikely(ret <= 0)) {
                        /* Nothing was written: rewind the input position. */
                        sd->pos = prev_pos;
                        goto out_release;
                }

                bytes += ret;
                len -= ret;
                sd->pos = pos;

                /* Short write: position reflects only what was written. */
                if (ret < read_len) {
                        sd->pos = prev_pos + ret;
                        goto out_release;
                }
        }

done:
        /* Reset the internal pipe's ring indices before returning. */
        pipe->tail = pipe->head = 0;
        file_accessed(in);
        return bytes;

out_release:
        /*
         * If we did an incomplete transfer we must release
         * the pipe buffers in question:
         */
        for (i = 0; i < pipe->ring_size; i++) {
                struct pipe_buffer *buf = &pipe->bufs[i];

                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }

        /* Report the status/error only if no bytes were transferred. */
        if (!bytes)
                bytes = ret;

        goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor passed to splice_direct_to_actor() by do_splice_direct(): drain
 * @pipe into the output file in sd->u.file at the position that sd->opos
 * points to.
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
                               struct splice_desc *sd)
{
        return do_splice_from(pipe, sd->u.file, sd->opos, sd->total_len,
                              sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:                file to splice from
 * @ppos:        input file offset
 * @out:        file to splice to
 * @opos:        output file offset
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                      loff_t *opos, size_t len, unsigned int flags)
{
        struct splice_desc desc = {
                .len                = len,
                .total_len        = len,
                .flags                = flags,
                .pos                = *ppos,
                .u.file                = out,
                .opos                = opos,
        };
        long spliced;

        /* The output must be writable ... */
        if (unlikely(!(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        /* ... and must not be opened in append mode. */
        if (unlikely(out->f_flags & O_APPEND))
                return -EINVAL;

        spliced = rw_verify_area(WRITE, out, opos, len);
        if (unlikely(spliced < 0))
                return spliced;

        spliced = splice_direct_to_actor(in, &desc, direct_splice_actor);
        if (spliced <= 0)
                return spliced;

        /* Only publish the new input position when data moved. */
        *ppos = desc.pos;
        return spliced;
}
EXPORT_SYMBOL(do_splice_direct);

static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
        for (;;) {
                if (unlikely(!pipe->readers)) {
                        send_sig(SIGPIPE, current, 0);
                        return -EPIPE;
                }
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        return 0;
                if (flags & SPLICE_F_NONBLOCK)
                        return -EAGAIN;
                if (signal_pending(current))
                        return -ERESTARTSYS;
                pipe_wait_writable(pipe);
        }
}

/*
 * Forward declaration: do_splice() below calls this for the pipe-to-pipe
 * case before the definition appears.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags);

/*
 * Determine where to splice to/from.
 *
 * Dispatches on which of @in / @out is a pipe:
 *   pipe -> pipe: splice_pipe_to_pipe(), no offsets allowed;
 *   pipe -> file: do_splice_from(), optional @off_out;
 *   file -> pipe: do_splice_to(), optional @off_in.
 * If neither end is a pipe the request is rejected with -EINVAL.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
               loff_t *off_out, size_t len, unsigned int flags)
{
        struct pipe_inode_info *ipipe;
        struct pipe_inode_info *opipe;
        loff_t offset;
        long ret;

        if (unlikely(!(in->f_mode & FMODE_READ) ||
                     !(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        ipipe = get_pipe_info(in, true);
        opipe = get_pipe_info(out, true);

        /* At least one end has to be a pipe. */
        if (!ipipe && !opipe)
                return -EINVAL;

        if (ipipe && opipe) {
                /* pipe -> pipe: pipes have no file position. */
                if (off_in || off_out)
                        return -ESPIPE;

                /* Splicing to self would be fun, but... */
                if (ipipe == opipe)
                        return -EINVAL;

                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
        }

        if (ipipe) {
                /* pipe -> file */
                if (off_in)
                        return -ESPIPE;

                if (!off_out)
                        offset = out->f_pos;
                else if (out->f_mode & FMODE_PWRITE)
                        offset = *off_out;
                else
                        return -EINVAL;

                if (unlikely(out->f_flags & O_APPEND))
                        return -EINVAL;

                ret = rw_verify_area(WRITE, out, &offset, len);
                if (unlikely(ret < 0))
                        return ret;

                if (in->f_flags & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                file_start_write(out);
                ret = do_splice_from(ipipe, out, &offset, len, flags);
                file_end_write(out);

                /* Store the updated position where it was taken from. */
                if (off_out)
                        *off_out = offset;
                else
                        out->f_pos = offset;

                return ret;
        }

        /* file -> pipe: only @out is a pipe at this point. */
        if (off_out)
                return -ESPIPE;

        if (!off_in)
                offset = in->f_pos;
        else if (in->f_mode & FMODE_PREAD)
                offset = *off_in;
        else
                return -EINVAL;

        if (out->f_flags & O_NONBLOCK)
                flags |= SPLICE_F_NONBLOCK;

        pipe_lock(opipe);
        ret = wait_for_space(opipe, flags);
        if (!ret) {
                unsigned int p_space;

                /* Don't try to read more than the pipe has space for. */
                p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
                len = min_t(size_t, len, p_space << PAGE_SHIFT);

                ret = do_splice_to(in, &offset, opipe, len, flags);
        }
        pipe_unlock(opipe);

        if (ret > 0)
                wakeup_pipe_readers(opipe);

        /* Store the updated position where it was taken from. */
        if (off_in)
                *off_in = offset;
        else
                in->f_pos = offset;

        return ret;
}

/*
 * User-pointer front end for do_splice(): copy the optional offsets in
 * from user space, run the splice, and copy the updated offsets back out
 * on success.
 */
static long __do_splice(struct file *in, loff_t __user *off_in,
                        struct file *out, loff_t __user *off_out,
                        size_t len, unsigned int flags)
{
        struct pipe_inode_info *ipipe = get_pipe_info(in, true);
        struct pipe_inode_info *opipe = get_pipe_info(out, true);
        loff_t offset;
        loff_t *kern_in = NULL;
        loff_t *kern_out = NULL;
        long ret;

        /* A pipe end cannot take an offset. */
        if ((ipipe && off_in) || (opipe && off_out))
                return -ESPIPE;

        /*
         * Both kernel pointers refer to the same local.  If both user
         * offsets are given, neither end is a pipe (checked above) and
         * do_splice() rejects that combination with -EINVAL.
         */
        if (off_out) {
                if (copy_from_user(&offset, off_out, sizeof(loff_t)))
                        return -EFAULT;
                kern_out = &offset;
        }
        if (off_in) {
                if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                        return -EFAULT;
                kern_in = &offset;
        }

        ret = do_splice(in, kern_in, out, kern_out, len, flags);
        if (ret < 0)
                return ret;

        if (kern_out && copy_to_user(off_out, kern_out, sizeof(loff_t)))
                return -EFAULT;
        if (kern_in && copy_to_user(off_in, kern_in, sizeof(loff_t)))
                return -EFAULT;

        return ret;
}

/*
 * Add the pages backing the iterator @from to @pipe, one pipe buffer per
 * page, each tagged with @flags and user_page_pipe_buf_ops.
 *
 * Returns the number of bytes queued if any were, otherwise the last
 * status from iov_iter_get_pages() / add_to_pipe() (0 or negative).
 */
static int iter_to_pipe(struct iov_iter *from,
                        struct pipe_inode_info *pipe,
                        unsigned flags)
{
        /* Template buffer; page/offset/len are filled in per page below. */
        struct pipe_buffer buf = {
                .ops = &user_page_pipe_buf_ops,
                .flags = flags
        };
        size_t total = 0;
        int ret = 0;
        bool failed = false;

        while (iov_iter_count(from) && !failed) {
                struct page *pages[16];
                ssize_t copied;
                size_t start;
                int n;

                /*
                 * Grab the next batch of at most 16 pages.
                 * NOTE(review): presumably returns the byte count covered
                 * and sets start to the offset in the first page - confirm.
                 */
                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
                if (copied <= 0) {
                        ret = copied;
                        break;
                }

                /* Only the first page of a batch has a non-zero offset. */
                for (n = 0; copied; n++, start = 0) {
                        int size = min_t(int, copied, PAGE_SIZE - start);
                        if (!failed) {
                                buf.page = pages[n];
                                buf.offset = start;
                                buf.len = size;
                                ret = add_to_pipe(pipe, &buf);
                                if (unlikely(ret < 0)) {
                                        /*
                                         * NOTE(review): the page is not put
                                         * here; presumably add_to_pipe()
                                         * drops it on failure - confirm.
                                         */
                                        failed = true;
                                } else {
                                        iov_iter_advance(from, ret);
                                        total += ret;
                                }
                        } else {
                                /* After a failure, drop the batch's remaining pages. */
                                put_page(pages[n]);
                        }
                        copied -= size;
                }
        }
        return total ? total : ret;
}

/*
 * Splice actor: copy sd->len bytes of @buf's page into the iov_iter in
 * sd->u.data.  A short copy is reported as -EFAULT.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                        struct splice_desc *sd)
{
        int copied = copy_page_to_iter(buf->page, buf->offset, sd->len,
                                       sd->u.data);

        if (copied != sd->len)
                return -EFAULT;

        return copied;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
                             unsigned int flags)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        struct splice_desc sd = {
                .total_len = iov_iter_count(iter),
                .flags = flags,
                .u.data = iter
        };
        long ret = 0;

        if (!pipe)
                return -EBADF;

        if (sd.total_len) {
                pipe_lock(pipe);
                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
                pipe_unlock(pipe);
        }

        return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Returns -EBADF if @file is not a pipe, the error from wait_for_space()
 * if no room can be had, otherwise the result of iter_to_pipe().
 */
static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
                             unsigned int flags)
{
        /* SPLICE_F_GIFT marks the queued buffers with PIPE_BUF_FLAG_GIFT. */
        unsigned buf_flag = (flags & SPLICE_F_GIFT) ? PIPE_BUF_FLAG_GIFT : 0;
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        long queued;

        if (!pipe)
                return -EBADF;

        pipe_lock(pipe);
        queued = wait_for_space(pipe, flags);
        if (queued == 0)
                queued = iter_to_pipe(iter, pipe, buf_flag);
        pipe_unlock(pipe);

        /* Readers are woken after the pipe lock has been dropped. */
        if (queued > 0)
                wakeup_pipe_readers(pipe);

        return queued;
}

static int vmsplice_type(struct fd f, int *type)
{
        if (!f.file)
                return -EBADF;
        if (f.file->f_mode & FMODE_WRITE) {
                *type = WRITE;
        } else if (f.file->f_mode & FMODE_READ) {
                *type = READ;
        } else {
                fdput(f);
                return -EBADF;
        }
        return 0;
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill them into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *        - memcpy() the data internally, at which point we might as well just
 *          do a regular read() on the buffer anyway.
 *        - Lots of nasty vm tricks, that are neither fast nor flexible (they
 *          impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
                unsigned long, nr_segs, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;
        struct fd f;
        int type;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        f = fdget(fd);
        /* On failure vmsplice_type() has already dealt with @f. */
        ret = vmsplice_type(f, &type);
        if (ret)
                return ret;

        ret = import_iovec(type, uiov, nr_segs,
                           ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret >= 0) {
                /* The iterator direction selects the copy direction. */
                if (!iov_iter_count(&iter))
                        ret = 0;
                else if (iov_iter_rw(&iter) == WRITE)
                        ret = vmsplice_to_pipe(f.file, &iter, flags);
                else
                        ret = vmsplice_to_user(f.file, &iter, flags);

                kfree(iov);
        }

        fdput(f);
        return ret;
}

/*
 * splice() system call: resolve both descriptors and hand off to
 * __do_splice().  A zero @len returns 0 before anything else is checked;
 * unknown flag bits give -EINVAL; a bad descriptor gives -EBADF.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        struct fd in, out;
        long ret;

        if (unlikely(!len))
                return 0;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        in = fdget(fd_in);
        if (!in.file)
                return -EBADF;

        ret = -EBADF;
        out = fdget(fd_out);
        if (out.file) {
                ret = __do_splice(in.file, off_in, out.file, off_out,
                                  len, flags);
                fdput(out);
        }
        fdput(in);

        return ret;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error: -ERESTARTSYS on a pending signal, -EAGAIN
 * with SPLICE_F_NONBLOCK.  Returns 0 when data is present or when the
 * pipe is empty and has no writers.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int err = 0;

        /*
         * Check the pipe occupancy without the inode lock first. This function
         * is speculative anyways, so missing one is ok.
         */
        if (!pipe_empty(pipe->head, pipe->tail))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_empty(pipe->head, pipe->tail))
                        break;

                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }

                /* Empty with no writers: nothing will arrive, not an error. */
                if (!pipe->writers)
                        break;

                if (flags & SPLICE_F_NONBLOCK) {
                        err = -EAGAIN;
                        break;
                }

                pipe_wait_readable(pipe);
        }

        pipe_unlock(pipe);
        return err;
}

/*
 * Prepare the output side of a pipe-to-pipe transfer: make sure @pipe has
 * a free slot, sleeping for room when that is permitted.
 *
 * Returns 0 when there is room, -EPIPE (after sending SIGPIPE to the
 * caller) when the pipe is full and has no readers, -EAGAIN when it is full
 * and SPLICE_F_NONBLOCK is set in @flags, and -ERESTARTSYS when a signal is
 * pending.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int err = 0;

        /*
         * Unlocked peek first.  The answer is only a hint, so a stale
         * read here is harmless.
         */
        if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                /* Nobody will ever drain the pipe. */
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        err = -EPIPE;
                        break;
                }

                if (flags & SPLICE_F_NONBLOCK) {
                        err = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }

                pipe_wait_writable(pipe);
        }

        pipe_unlock(pipe);
        return err;
}

/*
 * Splice contents of ipipe to opipe.
 *
 * Transfers up to @len bytes of pipe buffers from @ipipe to @opipe.  An
 * input buffer no larger than the remaining @len is moved as a whole and
 * its input slot is consumed.  A larger input buffer is shared instead: an
 * extra reference is taken, the output buffer is limited to the remaining
 * @len, and the input buffer is advanced past those bytes.
 *
 * Returns the number of bytes placed into @opipe, 0 when @ipipe is empty
 * and has no writers, or a negative errno when nothing was transferred
 * (-EPIPE, -EAGAIN, -EFAULT, or the error from ipipe_prep()/opipe_prep()).
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        /* Bytes transferred so far, or a negative errno. */
        int ret = 0;
        /* Set once an input slot has been released, so writers get woken. */
        bool input_wakeup = false;


retry:
        /*
         * Wait (subject to @flags) for input data and for output room
         * before taking both pipe locks.
         */
        ret = ipipe_prep(ipipe, flags);
        if (ret)
                return ret;

        ret = opipe_prep(opipe, flags);
        if (ret)
                return ret;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        /*
         * Snapshot the positions this function advances: the input tail
         * and the output head.  The masks turn a position into an index
         * into ->bufs (assumes ring_size is a power of two -- TODO confirm).
         */
        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                size_t o_len;

                /*
                 * Output pipe has no readers: raise SIGPIPE, and report
                 * -EPIPE only if nothing has been transferred yet.
                 */
                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                /* Refresh the opposite ends of both rings each iteration. */
                i_head = ipipe->head;
                o_tail = opipe->tail;

                /* Input drained and no writers left: stop with what we have. */
                if (pipe_empty(i_head, i_tail) && !ipipe->writers)
                        break;

                /*
                 * Cannot make any progress, because either the input
                 * pipe is empty or the output pipe is full.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage)) {
                        /* Already processed some buffers, break */
                        if (ret)
                                break;

                        if (flags & SPLICE_F_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }

                        /*
                         * We raced with another reader/writer and haven't
                         * managed to process any buffers.  A zero return
                         * value means EOF, so retry instead.
                         */
                        pipe_unlock(ipipe);
                        pipe_unlock(opipe);
                        goto retry;
                }

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                if (len >= ibuf->len) {
                        /*
                         * Simply move the whole buffer from ipipe to opipe.
                         * Clearing ibuf->ops marks the input slot as no
                         * longer owning the buffer; both ring positions are
                         * published immediately.
                         */
                        *obuf = *ibuf;
                        ibuf->ops = NULL;
                        i_tail++;
                        ipipe->tail = i_tail;
                        input_wakeup = true;
                        o_len = obuf->len;
                        o_head++;
                        opipe->head = o_head;
                } else {
                        /*
                         * Get a reference to this pipe buffer,
                         * so we can copy the contents over.
                         * Failure is reported as -EFAULT only when nothing
                         * has been transferred yet.
                         */
                        if (!pipe_buf_get(ipipe, ibuf)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        *obuf = *ibuf;

                        /*
                         * Don't inherit the gift and merge flags, we need to
                         * prevent multiple steals of this page.
                         */
                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                        obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                        /*
                         * The output buffer covers the first @len bytes; the
                         * input buffer keeps the remainder and stays in its
                         * slot (the input tail is not advanced).
                         */
                        obuf->len = len;
                        ibuf->offset += len;
                        ibuf->len -= len;
                        o_len = len;
                        o_head++;
                        opipe->head = o_head;
                }
                ret += o_len;
                len -= o_len;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        /* An input slot was freed, so writers of the input pipe may proceed. */
        if (input_wakeup)
                wakeup_pipe_writers(ipipe);

        return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Duplicates up to @len bytes of @ipipe's buffers into @opipe by taking an
 * extra reference on each input buffer and copying its descriptor.  The
 * input pipe is not consumed: only a local copy of its tail is advanced and
 * ipipe->tail is never written.  @flags is not referenced in the body.
 *
 * Returns the number of bytes linked into @opipe (0 when the input is empty
 * or the output is full), or -EPIPE / -EFAULT when nothing was linked.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
                     struct pipe_inode_info *opipe,
                     size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        /* Bytes linked so far, or a negative errno. */
        int ret = 0;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        /*
         * i_tail is a private read cursor over the input ring; o_head is
         * the output write position.  The masks turn a position into an
         * index into ->bufs (assumes ring_size is a power of two -- TODO
         * confirm).
         */
        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                /*
                 * Output pipe has no readers: raise SIGPIPE, and report
                 * -EPIPE only if nothing has been linked yet.
                 */
                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                i_head = ipipe->head;
                o_tail = opipe->tail;

                /*
                 * If we have iterated all input buffers or run out of
                 * output room, break.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage))
                        break;

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                /*
                 * Get a reference to this pipe buffer,
                 * so we can copy the contents over.
                 * Failure is reported as -EFAULT only when nothing has
                 * been linked yet.
                 */
                if (!pipe_buf_get(ipipe, ibuf)) {
                        if (ret == 0)
                                ret = -EFAULT;
                        break;
                }

                *obuf = *ibuf;

                /*
                 * Don't inherit the gift and merge flag, we need to prevent
                 * multiple steals of this page.
                 */
                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                /* Trim the last linked buffer to the bytes still requested. */
                if (obuf->len > len)
                        obuf->len = len;
                ret += obuf->len;
                len -= obuf->len;

                /* Publish the new output buffer; step the private input cursor. */
                o_head++;
                opipe->head = o_head;
                i_tail++;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        return ret;
}

/*
 * tee(1)-like duplication between two pipes.  No data is copied: the pages
 * queued on 'in' are referenced from 'out' as well.  'flags' takes the
 * SPLICE_F_* values, of which only SPLICE_F_NONBLOCK currently matters.
 *
 * Returns the result of link_pipe() on success, -EBADF when 'in' is not
 * open for reading or 'out' is not open for writing, -EINVAL unless both
 * files are distinct pipes, or the error from ipipe_prep()/opipe_prep().
 */
long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
{
        struct pipe_inode_info *src_pipe = get_pipe_info(in, true);
        struct pipe_inode_info *dst_pipe = get_pipe_info(out, true);
        int rc;

        if (unlikely(!((in->f_mode & FMODE_READ) &&
                       (out->f_mode & FMODE_WRITE))))
                return -EBADF;

        /* Both ends must be pipes, and not the same pipe. */
        if (!src_pipe || !dst_pipe || src_pipe == dst_pipe)
                return -EINVAL;

        /* O_NONBLOCK on either file implies a non-blocking tee. */
        if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                flags |= SPLICE_F_NONBLOCK;

        /*
         * Stop at the first error.  Which side is prepared first is not
         * important.
         */
        rc = ipipe_prep(src_pipe, flags);
        if (rc)
                return rc;

        rc = opipe_prep(dst_pipe, flags);
        if (rc)
                return rc;

        /*
         * Share the buffers of src_pipe with dst_pipe instead of copying
         * the data.
         */
        rc = link_pipe(src_pipe, dst_pipe, len, flags);
        return rc;
}

SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
        struct fd src, dst;
        int rc = -EBADF;

        /* Unknown flag bits are rejected even for a zero-length request. */
        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        if (unlikely(!len))
                return 0;

        /* Either descriptor failing to resolve yields -EBADF. */
        src = fdget(fdin);
        if (!src.file)
                return rc;

        dst = fdget(fdout);
        if (dst.file) {
                rc = do_tee(src.file, dst.file, len, flags);
                fdput(dst);
        }
        fdput(src);

        return rc;
}





















































































































































































































































































































































































































































































































    1 
    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>

#include "kernfs-internal.h"

/*
 * File-wide locks.  kernfs_mutex is the lock named in the "Locking:" notes
 * of the sibling-rbtree helpers in this file and asserted by
 * kernfs_active().
 */
DEFINE_MUTEX(kernfs_mutex);
static DEFINE_SPINLOCK(kernfs_rename_lock);        /* kn->parent and ->name */
/*
 * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
 * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
 * will perform wakeups when releasing console_sem. Holding rename_lock
 * will introduce deadlock if the scheduler reads the kernfs_name in the
 * wakeup path.
 */
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX];        /* protected by pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock);        /* root->ino_idr */

/* Map a pointer to the ->rb member back to its containing kernfs_node. */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)

/*
 * Report whether @kn is active, i.e. its ->active counter is not negative.
 * The caller must hold kernfs_mutex.
 */
static bool kernfs_active(struct kernfs_node *kn)
{
        int count;

        lockdep_assert_held(&kernfs_mutex);

        count = atomic_read(&kn->active);
        return !(count < 0);
}

/*
 * Report whether lockdep annotations apply to @kn.  Always false unless
 * CONFIG_DEBUG_LOCK_ALLOC is enabled; otherwise it reflects the
 * KERNFS_LOCKDEP bit in kn->flags.
 */
static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifndef CONFIG_DEBUG_LOCK_ALLOC
        return false;
#else
        return (kn->flags & KERNFS_LOCKDEP) != 0;
#endif
}

/*
 * Copy the name of @kn into @buf (at most @buflen bytes, strlcpy()
 * semantics) and return strlcpy()'s result.  A NULL @kn is rendered as
 * "(null)" and a node without a parent as "/".
 */
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
        const char *text;

        if (!kn)
                text = "(null)";
        else if (kn->parent)
                text = kn->name;
        else
                text = "/";

        return strlcpy(buf, text, buflen);
}

/*
 * kernfs_depth - count the parent links followed from @to until @from is
 * reached, or until a node without a parent is reached.
 */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
        size_t hops;

        for (hops = 0; to->parent && to != from; to = to->parent)
                hops++;

        return hops;
}

/*
 * Find the deepest node that is an ancestor of (or equal to) both @a and
 * @b.  Returns NULL when the two nodes belong to different kernfs roots.
 */
static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
                                                  struct kernfs_node *b)
{
        struct kernfs_root *root_a = kernfs_root(a);
        struct kernfs_root *root_b = kernfs_root(b);
        size_t depth_a, depth_b;

        if (root_a != root_b)
                return NULL;

        depth_a = kernfs_depth(root_a->kn, a);
        depth_b = kernfs_depth(root_b->kn, b);

        /* Lift whichever node is deeper until both sit at the same depth. */
        for (; depth_a > depth_b; depth_a--)
                a = a->parent;
        for (; depth_b > depth_a; depth_b--)
                b = b->parent;

        /* Climb in lock step; at the latest the two meet at the root. */
        while (a != b) {
                a = a->parent;
                b = b->parent;
        }

        return a;
}

/**
 * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
 * where kn_from is treated as root of the path.
 * @kn_to: kernfs node to which path is needed
 * @kn_from: kernfs node which should be treated as root for the path;
 *           NULL selects the root node of @kn_to's hierarchy
 * @buf: buffer to copy the path into
 * @buflen: size of @buf
 *
 * The following cases are handled:
 * [1] @kn_from is an ancestor of @kn_to at some level
 * kn_from: /n1/n2/n3
 * kn_to:   /n1/n2/n3/n4/n5
 * result:  /n4/n5
 *
 * [2] @kn_from is not an ancestor of @kn_to, so the path first climbs from
 * @kn_from to the common ancestor of both nodes
 * kn_from: /n1/n2/n3/n4
 * kn_to:   /n1/n2/n5
 * result:  /../../n5
 * OR
 * kn_from: /n1/n2/n3/n4/n5   [depth=5]
 * kn_to:   /n1/n2/n3         [depth=3]
 * result:  /../..
 *
 * [3] @kn_to is NULL: the result is "(null)"
 *
 * [4] @kn_from and @kn_to are the same node: the result is "/"
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 */
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
                                        struct kernfs_node *kn_from,
                                        char *buf, size_t buflen)
{
        const char parent_str[] = "/..";
        struct kernfs_node *cur, *common;
        size_t depth_from, depth_to;
        size_t len = 0;
        size_t room;
        int i, j;

        if (!kn_to)
                return strlcpy(buf, "(null)", buflen);

        if (!kn_from)
                kn_from = kernfs_root(kn_to)->kn;

        if (kn_from == kn_to)
                return strlcpy(buf, "/", buflen);

        if (!buf)
                return -EINVAL;

        common = kernfs_common_ancestor(kn_from, kn_to);
        if (WARN_ON(!common))
                return -EINVAL;

        depth_to = kernfs_depth(common, kn_to);
        depth_from = kernfs_depth(common, kn_from);

        buf[0] = '\0';

        /*
         * One "/.." per level between @kn_from and the common ancestor.
         * @len keeps counting the full length even after @buf is full; the
         * copy size simply drops to zero from then on.
         */
        for (i = 0; i < depth_from; i++) {
                room = 0;
                if (len < buflen)
                        room = buflen - len;
                len += strlcpy(buf + len, parent_str, room);
        }

        /*
         * Then "/<name>" for every node from just below the common
         * ancestor down to @kn_to.  Each component is located by walking
         * up i parents from @kn_to.
         */
        for (i = depth_to - 1; i >= 0; i--) {
                cur = kn_to;
                j = i;
                while (j-- > 0)
                        cur = cur->parent;

                room = 0;
                if (len < buflen)
                        room = buflen - len;
                len += strlcpy(buf + len, "/", room);

                room = 0;
                if (len < buflen)
                        room = buflen - len;
                len += strlcpy(buf + len, cur->name, room);
        }

        return len;
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Copies @kn's name into @buf, which holds @buflen bytes, with strlcpy()
 * like behavior: the return value is the length of the name, and a name
 * that does not fit is cut to @buflen-1 characters and nul terminated.
 *
 * A NULL @kn yields "(null)".
 *
 * The copy is made with kernfs_rename_lock held and interrupts disabled.
 * This function can be called from any context.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
        unsigned long irq_state;
        int name_len;

        spin_lock_irqsave(&kernfs_rename_lock, irq_state);
        name_len = kernfs_name_locked(kn, buf, buflen);
        spin_unlock_irqrestore(&kernfs_rename_lock, irq_state);

        return name_len;
}

/**
 * kernfs_path_from_node - build path of node @to relative to @from.
 * @to: kernfs_node of interest
 * @from: parent kernfs_node relative to which we need to build the path
 * @buf: buffer to copy @to's path into
 * @buflen: size of @buf
 *
 * Writes into @buf the path that leads from @from to @to.  Both nodes must
 * live on the same kernfs-root.  When @from is not a parent of @to, the
 * result is a relative path containing as many '..' components as are
 * needed to get from @from to @to.
 *
 * The path is built with kernfs_rename_lock held and interrupts disabled.
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 */
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
                          char *buf, size_t buflen)
{
        unsigned long irq_state;
        int path_len;

        spin_lock_irqsave(&kernfs_rename_lock, irq_state);
        path_len = kernfs_path_from_node_locked(to, from, buf, buflen);
        spin_unlock_irqrestore(&kernfs_rename_lock, irq_state);

        return path_len;
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);

/**
 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * The name is staged in the shared kernfs_pr_cont_buf, which is why
 * kernfs_pr_cont_lock is held across both the copy and the pr_cont().
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
        unsigned long irq_state;

        spin_lock_irqsave(&kernfs_pr_cont_lock, irq_state);

        kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
        pr_cont("%s", kernfs_pr_cont_buf);

        spin_unlock_irqrestore(&kernfs_pr_cont_lock, irq_state);
}

/**
 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * The path, taken relative to the root of @kn's hierarchy, is staged in the
 * shared kernfs_pr_cont_buf under kernfs_pr_cont_lock.  "(error)" is printed
 * when the path cannot be built and "(name too long)" when it does not fit
 * in the buffer.
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
        unsigned long irq_state;
        int path_len;

        spin_lock_irqsave(&kernfs_pr_cont_lock, irq_state);

        path_len = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
                                         sizeof(kernfs_pr_cont_buf));

        if (path_len < 0)
                pr_cont("(error)");
        else if (path_len >= sizeof(kernfs_pr_cont_buf))
                pr_cont("(name too long)");
        else
                pr_cont("%s", kernfs_pr_cont_buf);

        spin_unlock_irqrestore(&kernfs_pr_cont_lock, irq_state);
}

/**
 * kernfs_get_parent - determine the parent node and pin it
 * @kn: kernfs_node of interest
 *
 * Reads @kn->parent and takes a reference on it via kernfs_get(), both
 * under kernfs_rename_lock, then returns it.  This function can be called
 * from any context.
 */
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
        struct kernfs_node *pinned;
        unsigned long irq_state;

        spin_lock_irqsave(&kernfs_rename_lock, irq_state);
        pinned = kn->parent;
        kernfs_get(pinned);
        spin_unlock_irqrestore(&kernfs_rename_lock, irq_state);

        return pinned;
}

/**
 *        kernfs_name_hash
 *        @name: Null terminated string to hash
 *        @ns:   Namespace tag to hash
 *
 *        Returns 31 bit hash of ns + name (so it fits in an off_t ).
 *        The result always lies in the range 2 .. INT_MAX - 1.
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
        unsigned long acc = init_name_hash(ns);
        unsigned int n = strlen(name);
        unsigned int i;

        for (i = 0; i < n; i++)
                acc = partial_name_hash(name[i], acc);

        /* Finalize and keep only the low 31 bits. */
        acc = end_name_hash(acc) & 0x7fffffffU;

        /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
        if (acc < 2)
                acc += 2;
        else if (acc >= INT_MAX)
                acc = INT_MAX - 1;

        return acc;
}

/*
 * Three-way comparison of the key (@hash, @ns, @name) against @kn.  Keys
 * are ordered by hash first, then by namespace pointer value, then by
 * strcmp() on the names.  Returns <0, 0 or >0.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
                               const void *ns, const struct kernfs_node *kn)
{
        if (hash != kn->hash)
                return hash < kn->hash ? -1 : 1;

        if (ns != kn->ns)
                return ns < kn->ns ? -1 : 1;

        return strcmp(name, kn->name);
}

/*
 * Compare two nodes by their (hash, ns, name) key; @left supplies the key
 * that is compared against @right.
 */
static int kernfs_sd_compare(const struct kernfs_node *left,
                             const struct kernfs_node *right)
{
        int order;

        order = kernfs_name_compare(left->hash, left->name, left->ns, right);
        return order;
}

/**
 *        kernfs_link_sibling - link kernfs_node into sibling rbtree
 *        @kn: kernfs_node of interest
 *
 *        Insert @kn into the sibling rbtree rooted at
 *        @kn->parent->dir.children.  When @kn is a directory, the
 *        parent's subdirectory count is incremented as well.
 *
 *        Locking:
 *        mutex_lock(kernfs_mutex)
 *
 *        RETURNS:
 *        0 on success, -EEXIST if a sibling with an equal key is already
 *        linked.
 */
static int kernfs_link_sibling(struct kernfs_node *kn)
{
        struct rb_node **link = &kn->parent->dir.children.rb_node;
        struct rb_node *above = NULL;

        /* Descend to the empty slot where @kn belongs. */
        while (*link) {
                struct kernfs_node *cur = rb_to_kn(*link);
                int order = kernfs_sd_compare(kn, cur);

                if (!order)
                        return -EEXIST;

                above = *link;
                if (order < 0)
                        link = &cur->rb.rb_left;
                else
                        link = &cur->rb.rb_right;
        }

        /* Hook the node in and let the rbtree rebalance. */
        rb_link_node(&kn->rb, above, link);
        rb_insert_color(&kn->rb, &kn->parent->dir.children);

        /* The insertion succeeded, so account for a new subdirectory. */
        if (kernfs_type(kn) == KERNFS_DIR)
                kn->parent->dir.subdirs++;

        return 0;
}

/**
 *        kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
 *        @kn: kernfs_node of interest
 *
 *        Remove @kn from the sibling rbtree rooted at
 *        kn->parent->dir.children, if it is linked there.  When @kn is a
 *        directory, the parent's subdirectory count is decremented.
 *        Returns %true if @kn was actually removed, %false if @kn wasn't
 *        on the rbtree.
 *
 *        Locking:
 *        mutex_lock(kernfs_mutex)
 */
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
        if (!RB_EMPTY_NODE(&kn->rb)) {
                if (kernfs_type(kn) == KERNFS_DIR)
                        kn->parent->dir.subdirs--;

                rb_erase(&kn->rb, &kn->parent->dir.children);
                /* Mark the node as unlinked so a repeated call is a no-op. */
                RB_CLEAR_NODE(&kn->rb);
                return true;
        }

        return false;
}

/**
 *        kernfs_get_active - try to take an active reference on a kernfs_node
 *        @kn: kernfs_node to pin, may be NULL
 *
 *        Increment @kn->active unless it is negative.  On success the
 *        acquisition is also recorded with lockdep when kernfs_lockdep()
 *        says @kn is tracked.
 *
 *        RETURNS:
 *        @kn on success, NULL if @kn is NULL or the active count is negative.
 */
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
        if (unlikely(!kn) || !atomic_inc_unless_negative(&kn->active))
                return NULL;

        if (kernfs_lockdep(kn))
                rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);

        return kn;
}

/**
 *        kernfs_put_active - drop an active reference on a kernfs_node
 *        @kn: kernfs_node to release, may be NULL
 *
 *        Decrement @kn->active.  If the counter lands exactly on
 *        KN_DEACTIVATED_BIAS, everyone sleeping on the root's
 *        deactivate_waitq is woken.  Does nothing when @kn is NULL.
 */
void kernfs_put_active(struct kernfs_node *kn)
{
        if (unlikely(!kn))
                return;

        if (kernfs_lockdep(kn))
                rwsem_release(&kn->dep_map, _RET_IP_);

        /* only the put that reaches the bias value needs to wake waiters */
        if (unlikely(atomic_dec_return(&kn->active) == KN_DEACTIVATED_BIAS))
                wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}

/**
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
 *
 * Drain existing usages and nuke all existing mmaps of @kn.  Multiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
 *
 * Must be entered with kernfs_mutex held and with @kn no longer active.
 * The mutex is dropped for the duration of the wait and re-acquired
 * before returning.
 */
static void kernfs_drain(struct kernfs_node *kn)
        __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
{
        struct kernfs_root *root = kernfs_root(kn);

        lockdep_assert_held(&kernfs_mutex);
        WARN_ON_ONCE(kernfs_active(kn));

        /* waiting below may sleep; do not hold the mutex across it */
        mutex_unlock(&kernfs_mutex);

        /*
         * Lockdep annotation: model the drain as acquiring @kn's dep_map,
         * and report contention if active references are still outstanding.
         */
        if (kernfs_lockdep(kn)) {
                rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
                if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
                        lock_contended(&kn->dep_map, _RET_IP_);
        }

        /*
         * Every caller waits here until the active count has dropped back
         * to exactly KN_DEACTIVATED_BIAS; kernfs_put_active() issues the
         * wake-up on deactivate_waitq when that value is reached.
         */
        wait_event(root->deactivate_waitq,
                   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

        /* close the lockdep "acquisition" opened above */
        if (kernfs_lockdep(kn)) {
                lock_acquired(&kn->dep_map, _RET_IP_);
                rwsem_release(&kn->dep_map, _RET_IP_);
        }

        kernfs_drain_open_files(kn);

        /* restore the locking state the caller expects */
        mutex_lock(&kernfs_mutex);
}

/**
 * kernfs_get - take a reference on a kernfs_node
 * @kn: the target kernfs_node, may be NULL
 *
 * Increment @kn->count.  Warns if the count is observed at zero before the
 * increment.  A NULL @kn is ignored.
 */
void kernfs_get(struct kernfs_node *kn)
{
        if (!kn)
                return;

        WARN_ON(!atomic_read(&kn->count));
        atomic_inc(&kn->count);
}
EXPORT_SYMBOL_GPL(kernfs_get);

/**
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * Put a reference count of @kn and destroy it if it reached zero.
 *
 * Destroying a node also drops the reference it holds on its parent, so
 * the teardown walks up the ancestor chain for as long as each parent's
 * count reaches zero in turn.  When the node without a parent (the root
 * node) is released, the owning kernfs_root is freed as well.  A NULL
 * @kn is ignored.
 */
void kernfs_put(struct kernfs_node *kn)
{
        struct kernfs_node *parent;
        struct kernfs_root *root;

        if (!kn || !atomic_dec_and_test(&kn->count))
                return;
        /* look up the root once, before any node on the chain is freed */
        root = kernfs_root(kn);
 repeat:
        /*
         * Moving/renaming is always done while holding reference.
         * kn->parent won't change beneath us.
         */
        parent = kn->parent;

        /* a node being released is expected to carry no active references */
        WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
                  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
                  parent ? parent->name : "", kn->name, atomic_read(&kn->active));

        /* a symlink drops the reference it holds on its target */
        if (kernfs_type(kn) == KERNFS_LINK)
                kernfs_put(kn->symlink.target_kn);

        kfree_const(kn->name);

        /* release the attribute block, including any xattrs, if present */
        if (kn->iattr) {
                simple_xattrs_free(&kn->iattr->xattrs);
                kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
        }
        /* unpublish the node from the root's ino idr before freeing it */
        spin_lock(&kernfs_idr_lock);
        idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
        spin_unlock(&kernfs_idr_lock);
        kmem_cache_free(kernfs_node_cache, kn);

        /* the freed node's reference on its parent goes away with it */
        kn = parent;
        if (kn) {
                if (atomic_dec_and_test(&kn->count))
                        goto repeat;
        } else {
                /* just released the root kn, free @root too */
                idr_destroy(&root->ino_idr);
                kfree(root);
        }
}
EXPORT_SYMBOL_GPL(kernfs_put);

/*
 * ->d_revalidate for kernfs dentries.
 *
 * Returns -ECHILD when called with LOOKUP_RCU, 1 when @dentry still matches
 * its kernfs_node and 0 when it must be looked up afresh.  Negative dentries
 * are always reported as stale.  For positive ones, under kernfs_mutex, the
 * node must still be active and its parent, name and (for namespace-enabled
 * parents) namespace tag must match what the dentry and superblock record.
 */
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct kernfs_node *kn;
        int valid = 0;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        /* negative dentries always get a fresh lookup */
        if (d_really_is_negative(dentry))
                return 0;

        kn = kernfs_dentry_node(dentry);
        mutex_lock(&kernfs_mutex);

        /* deactivated node */
        if (!kernfs_active(kn))
                goto out_unlock;

        /* reparented node */
        if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
                goto out_unlock;

        /* renamed node */
        if (strcmp(dentry->d_name.name, kn->name) != 0)
                goto out_unlock;

        /* node now lives in a namespace other than this superblock's */
        if (kn->parent && kernfs_ns_enabled(kn->parent) &&
            kernfs_info(dentry->d_sb)->ns != kn->ns)
                goto out_unlock;

        valid = 1;
out_unlock:
        mutex_unlock(&kernfs_mutex);
        return valid;
}

/*
 * Dentry operations for kernfs: only revalidation is provided, via
 * kernfs_dop_revalidate().
 */
const struct dentry_operations kernfs_dops = {
        .d_revalidate        = kernfs_dop_revalidate,
};

/**
 * kernfs_node_from_dentry - map a dentry to its kernfs_node
 * @dentry: the dentry in question
 *
 * Return the kernfs_node behind @dentry, or %NULL when @dentry's superblock
 * does not use kernfs_sops or @dentry is negative.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
 */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
        /* not a kernfs superblock */
        if (dentry->d_sb->s_op != &kernfs_sops)
                return NULL;

        /* no inode behind the dentry */
        if (d_really_is_negative(dentry))
                return NULL;

        return kernfs_dentry_node(dentry);
}

/*
 * __kernfs_new_node - allocate and initialise a kernfs_node under @root
 * @root: hierarchy whose ino idr supplies the node id
 * @parent: parent used for security initialisation, or NULL
 * @name: node name; duplicated with kstrdup_const()
 * @mode: mode bits stored in the node
 * @uid: owner uid; recorded via __kernfs_setattr() unless both ids are root
 * @gid: owner gid; recorded via __kernfs_setattr() unless both ids are root
 * @flags: initial KERNFS_* flags stored in the node
 *
 * The new node starts with a reference count of 1, an active count of
 * KN_DEACTIVATED_BIAS and a cleared rb node.  @kn->parent is NOT set here;
 * that is left to the caller.
 *
 * Returns the new node, or NULL if any allocation, the id allocation, the
 * attribute setup or the security hook fails.  On failure everything
 * allocated here is released again.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                                             struct kernfs_node *parent,
                                             const char *name, umode_t mode,
                                             kuid_t uid, kgid_t gid,
                                             unsigned flags)
{
        struct kernfs_node *kn;
        u32 id_highbits;
        int ret;

        name = kstrdup_const(name, GFP_KERNEL);
        if (!name)
                return NULL;

        kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
        if (!kn)
                goto err_out1;

        /*
         * Allocate the low 32 bits of the id cyclically.  A successful
         * allocation that comes back lower than the previous one means
         * the cyclic allocator wrapped, so move on to the next high-bits
         * value.
         */
        idr_preload(GFP_KERNEL);
        spin_lock(&kernfs_idr_lock);
        ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
        if (ret >= 0 && ret < root->last_id_lowbits)
                root->id_highbits++;
        id_highbits = root->id_highbits;
        root->last_id_lowbits = ret;
        spin_unlock(&kernfs_idr_lock);
        idr_preload_end();
        if (ret < 0)
                goto err_out2;

        kn->id = (u64)id_highbits << 32 | ret;

        atomic_set(&kn->count, 1);
        atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
        RB_CLEAR_NODE(&kn->rb);

        kn->name = name;
        kn->mode = mode;
        kn->flags = flags;

        /* only record ownership explicitly when it differs from root:root */
        if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
                struct iattr iattr = {
                        .ia_valid = ATTR_UID | ATTR_GID,
                        .ia_uid = uid,
                        .ia_gid = gid,
                };

                ret = __kernfs_setattr(kn, &iattr);
                if (ret < 0)
                        goto err_out3;
        }

        if (parent) {
                ret = security_kernfs_init_security(parent, kn);
                if (ret)
                        goto err_out3;
        }

        return kn;

 err_out3:
        /*
         * An attribute block may already hang off @kn at this point (for
         * instance from the __kernfs_setattr() call above when the
         * security hook fails afterwards).  @kn is freed directly below
         * rather than through kernfs_put(), so release the block here the
         * same way kernfs_put() does; otherwise it would be leaked.
         */
        if (kn->iattr) {
                simple_xattrs_free(&kn->iattr->xattrs);
                kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
        }
        spin_lock(&kernfs_idr_lock);
        idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
        spin_unlock(&kernfs_idr_lock);
 err_out2:
        kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
        kfree_const(name);
        return NULL;
}

/*
 * kernfs_new_node - allocate a node that will live under @parent
 *
 * Wraps __kernfs_new_node() using @parent's root.  When @parent has S_ISGID
 * set, the new node takes its gid from @parent's iattr (if @parent has one)
 * and, if KERNFS_DIR is set in @flags, has S_ISGID added to its mode.  On
 * success a reference is taken on @parent and stored in the new node's
 * ->parent.
 *
 * Returns the new node, or NULL on failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                                    const char *name, umode_t mode,
                                    kuid_t uid, kgid_t gid,
                                    unsigned flags)
{
        struct kernfs_node *kn;
        bool setgid_parent = parent->mode & S_ISGID;

        /* mirror what inode_init_owner() does for a setgid directory */
        if (setgid_parent) {
                if (parent->iattr)
                        gid = parent->iattr->ia_gid;

                if (flags & KERNFS_DIR)
                        mode |= S_ISGID;
        }

        kn = __kernfs_new_node(kernfs_root(parent), parent,
                               name, mode, uid, gid, flags);
        if (!kn)
                return NULL;

        /* the child pins its parent for as long as it exists */
        kernfs_get(parent);
        kn->parent = parent;

        return kn;
}

/**
 * kernfs_find_and_get_node_by_id - look up a kernfs_node by its id
 * @root: the kernfs root
 * @id: the target node id
 *
 * @id's lower 32bits encode ino and upper gen.  If the gen portion is
 * zero, all generations are matched.
 *
 * RETURNS:
 * The matching node with its reference count incremented, or NULL if no
 * node matches, the node has not been activated, or its reference count
 * has already dropped to zero.
 */
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
                                                   u64 id)
{
        struct kernfs_node *kn, *found = NULL;
        ino_t ino = kernfs_id_ino(id);
        u32 gen = kernfs_id_gen(id);

        spin_lock(&kernfs_idr_lock);

        kn = idr_find(&root->ino_idr, (u32)ino);
        if (!kn)
                goto out_unlock;

        if (sizeof(ino_t) >= sizeof(u64)) {
                /* the idr was keyed by the low 32 bits; verify the full ino */
                if (kernfs_ino(kn) != ino)
                        goto out_unlock;
        } else {
                /* a zero generation in @id acts as a wildcard */
                if (unlikely(gen && kernfs_gen(kn) != gen))
                        goto out_unlock;
        }

        /*
         * ACTIVATED is protected with kernfs_mutex but it was clear when
         * @kn was added to idr and we just wanna see it set.  No need to
         * grab kernfs_mutex.
         */
        if (likely((kn->flags & KERNFS_ACTIVATED) &&
                   atomic_inc_not_zero(&kn->count)))
                found = kn;

out_unlock:
        spin_unlock(&kernfs_idr_lock);
        return found;
}

/**
 *        kernfs_add_one - add kernfs_node to parent without warning
 *        @kn: kernfs_node to be added
 *
 *        The caller must already have initialized @kn->parent.  Under
 *        kernfs_mutex this validates the parent, computes @kn's name hash,
 *        links @kn into the parent's children rbtree and refreshes the
 *        parent's ctime/mtime when it has an iattr.  Unless the root has
 *        KERNFS_ROOT_CREATE_DEACTIVATED set, @kn is then activated.
 *
 *        RETURNS:
 *        0 on success, -EINVAL if the namespace tag of @kn does not match
 *        what the parent requires or the parent is not a directory,
 *        -ENOENT if the parent is an always-empty directory or has been
 *        activated but is no longer active, -EEXIST if an entry with the
 *        given name already exists.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
        struct kernfs_node *parent = kn->parent;
        struct kernfs_iattrs *parent_attrs;
        bool has_ns;
        int ret;

        mutex_lock(&kernfs_mutex);

        has_ns = kernfs_ns_enabled(parent);
        if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
                 has_ns ? "required" : "invalid", parent->name, kn->name)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (kernfs_type(parent) != KERNFS_DIR) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (parent->flags & KERNFS_EMPTY_DIR) {
                ret = -ENOENT;
                goto out_unlock;
        }

        if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) {
                ret = -ENOENT;
                goto out_unlock;
        }

        kn->hash = kernfs_name_hash(kn->name, kn->ns);

        ret = kernfs_link_sibling(kn);
        if (ret)
                goto out_unlock;

        /* the parent's contents changed: bump its ctime and mtime */
        parent_attrs = parent->iattr;
        if (parent_attrs) {
                ktime_get_real_ts64(&parent_attrs->ia_ctime);
                parent_attrs->ia_mtime = parent_attrs->ia_ctime;
        }

        mutex_unlock(&kernfs_mutex);

        /*
         * Activate the new node unless CREATE_DEACTIVATED is requested.
         * If not activated here, the kernfs user is responsible for
         * activating the node with kernfs_activate().  A node which hasn't
         * been activated is not visible to userland and its removal won't
         * trigger deactivation.
         */
        if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
                kernfs_activate(kn);
        return 0;

out_unlock:
        mutex_unlock(&kernfs_mutex);
        return ret;
}

/**
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Search @parent's children rbtree for the entry matching @name and @ns.
 * The caller must hold kernfs_mutex.  Warns and fails when @ns is supplied
 * for a parent without namespace support, or omitted for one with it.
 *
 * Returns pointer to the found kernfs_node on success, %NULL on failure.
 */
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
                                          const unsigned char *name,
                                          const void *ns)
{
        struct rb_node *cursor = parent->dir.children.rb_node;
        bool has_ns = kernfs_ns_enabled(parent);
        unsigned int hash;

        lockdep_assert_held(&kernfs_mutex);

        if (has_ns != (bool)ns) {
                WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
                     has_ns ? "required" : "invalid", parent->name, name);
                return NULL;
        }

        hash = kernfs_name_hash(name, ns);
        while (cursor) {
                struct kernfs_node *kn = rb_to_kn(cursor);
                int cmp = kernfs_name_compare(hash, name, ns, kn);

                if (!cmp)
                        return kn;

                cursor = cmp < 0 ? cursor->rb_left : cursor->rb_right;
        }

        return NULL;
}

/*
 * kernfs_walk_ns - resolve a '/'-separated path below @parent
 * @parent: kernfs_node to start from
 * @path: path to walk, components separated by '/'
 * @ns: namespace tag passed to kernfs_find_ns() for every component
 *
 * The caller must hold kernfs_mutex.  @path is copied into the shared
 * kernfs_pr_cont_buf (guarded by kernfs_pr_cont_lock) so that it can be
 * split in place.
 *
 * Returns the node the path resolves to, or NULL if @path does not fit in
 * the buffer or any component cannot be found.
 */
static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
                                          const unsigned char *path,
                                          const void *ns)
{
        size_t len;
        char *p, *name;

        lockdep_assert_held(&kernfs_mutex);

        spin_lock_irq(&kernfs_pr_cont_lock);

        len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));

        /* refuse paths that would have been truncated by the copy */
        if (len >= sizeof(kernfs_pr_cont_buf)) {
                spin_unlock_irq(&kernfs_pr_cont_lock);
                return NULL;
        }

        p = kernfs_pr_cont_buf;

        /*
         * Consume one component per iteration; the loop ends when the
         * path is exhausted or a lookup has failed (@parent became NULL).
         */
        while ((name = strsep(&p, "/")) && parent) {
                /* empty components (leading, trailing or doubled '/') are skipped */
                if (*name == '\0')
                        continue;
                parent = kernfs_find_ns(parent, name, ns);
        }

        spin_unlock_irq(&kernfs_pr_cont_lock);

        return parent;
}

/**
 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Locked wrapper around kernfs_find_ns(): takes kernfs_mutex, looks up
 * @name under @parent and grabs a reference on the result before the
 * mutex is released.  This function may sleep.
 *
 * Returns pointer to the found kernfs_node on success, %NULL on failure.
 */
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                           const char *name, const void *ns)
{
        struct kernfs_node *found;

        mutex_lock(&kernfs_mutex);

        found = kernfs_find_ns(parent, name, ns);
        /* kernfs_get() tolerates NULL, so a failed lookup needs no check */
        kernfs_get(found);

        mutex_unlock(&kernfs_mutex);

        return found;
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);

/**
 * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
 * @parent: kernfs_node to search under
 * @path: path to look for
 * @ns: the namespace tag to use
 *
 * Locked wrapper around kernfs_walk_ns(): takes kernfs_mutex, resolves
 * @path below @parent and grabs a reference on the result before the
 * mutex is released.  This function may sleep.
 *
 * Returns pointer to the found kernfs_node on success, %NULL on failure.
 */
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                           const char *path, const void *ns)
{
        struct kernfs_node *found;

        mutex_lock(&kernfs_mutex);

        found = kernfs_walk_ns(parent, path, ns);
        /* kernfs_get() tolerates NULL, so a failed walk needs no check */
        kernfs_get(found);

        mutex_unlock(&kernfs_mutex);

        return found;
}

/**
 * kernfs_create_root - create a new kernfs hierarchy
 * @scops: optional syscall operations for the hierarchy
 * @flags: KERNFS_ROOT_* flags
 * @priv: opaque data associated with the new directory
 *
 * Allocates the kernfs_root together with its root directory node (named
 * "").  The root node is activated immediately unless @flags contains
 * KERNFS_ROOT_CREATE_DEACTIVATED.
 *
 * Returns the root of the new hierarchy on success, ERR_PTR() value on
 * failure.
 */
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                       unsigned int flags, void *priv)
{
        struct kernfs_root *root;
        struct kernfs_node *root_kn;

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        idr_init(&root->ino_idr);
        INIT_LIST_HEAD(&root->supers);

        /*
         * On 64bit ino setups, id is ino.  On 32bit, low 32bits are ino.
         * High bits generation.  The starting value for both ino and
         * generation is 1.  Initialize upper 32bit allocation
         * accordingly.
         */
        root->id_highbits = (sizeof(ino_t) >= sizeof(u64)) ? 0 : 1;

        root_kn = __kernfs_new_node(root, NULL, "",
                                    S_IFDIR | S_IRUGO | S_IXUGO,
                                    GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
                                    KERNFS_DIR);
        if (!root_kn) {
                idr_destroy(&root->ino_idr);
                kfree(root);
                return ERR_PTR(-ENOMEM);
        }

        root_kn->priv = priv;
        root_kn->dir.root = root;

        root->kn = root_kn;
        root->flags = flags;
        root->syscall_ops = scops;
        init_waitqueue_head(&root->deactivate_waitq);

        if (!(flags & KERNFS_ROOT_CREATE_DEACTIVATED))
                kernfs_activate(root_kn);

        return root;
}

/**
 * kernfs_destroy_root - destroy a kernfs hierarchy
 * @root: root of the hierarchy to destroy
 *
 * Destroy the hierarchy anchored at @root by removing all existing
 * directories and destroying @root.
 *
 * This is a thin wrapper that hands @root's root node to kernfs_remove().
 * @root itself is freed by kernfs_put() once that node's last reference is
 * dropped, so callers must not touch @root afterwards.
 */
void kernfs_destroy_root(struct kernfs_root *root)
{
        kernfs_remove(root->kn);        /* will also free @root */
}

/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 * @mode: mode of the new directory
 * @uid: uid of the new directory
 * @gid: gid of the new directory
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Allocates a KERNFS_DIR node under @parent and links it in with
 * kernfs_add_one().  If linking fails the node is released again.
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         void *priv, const void *ns)
{
        struct kernfs_node *dir_kn;
        int err;

        dir_kn = kernfs_new_node(parent, name, mode | S_IFDIR,
                                 uid, gid, KERNFS_DIR);
        if (!dir_kn)
                return ERR_PTR(-ENOMEM);

        dir_kn->dir.root = parent->dir.root;
        dir_kn->ns = ns;
        dir_kn->priv = priv;

        err = kernfs_add_one(dir_kn);
        if (err) {
                /* linking failed: drop the reference from allocation */
                kernfs_put(dir_kn);
                return ERR_PTR(err);
        }

        return dir_kn;
}

/**
 * kernfs_create_empty_dir - create an always empty directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 *
 * Allocates a root-owned, read/search-only KERNFS_DIR node flagged
 * KERNFS_EMPTY_DIR, with no namespace tag and no private data, and links
 * it under @parent.  If linking fails the node is released again.
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
                                            const char *name)
{
        struct kernfs_node *dir_kn;
        int err;

        dir_kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
                                 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
        if (!dir_kn)
                return ERR_PTR(-ENOMEM);

        dir_kn->flags |= KERNFS_EMPTY_DIR;
        dir_kn->dir.root = parent->dir.root;
        dir_kn->ns = NULL;
        dir_kn->priv = NULL;

        err = kernfs_add_one(dir_kn);
        if (err) {
                /* linking failed: drop the reference from allocation */
                kernfs_put(dir_kn);
                return ERR_PTR(err);
        }

        return dir_kn;
}

/*
 * ->lookup for kernfs directories.
 *
 * Under kernfs_mutex, find the child of @dir's kernfs_node named after
 * @dentry (using the superblock's namespace tag when the parent is
 * namespace-enabled).  A missing or inactive child yields NULL.  Otherwise
 * an inode is obtained for the child and spliced onto @dentry; failure to
 * get the inode yields ERR_PTR(-ENOMEM).
 */
static struct dentry *kernfs_iop_lookup(struct inode *dir,
                                        struct dentry *dentry,
                                        unsigned int flags)
{
        struct kernfs_node *parent = dir->i_private;
        struct kernfs_node *child;
        struct dentry *ret = NULL;
        struct inode *inode;
        const void *ns = NULL;

        mutex_lock(&kernfs_mutex);

        if (kernfs_ns_enabled(parent))
                ns = kernfs_info(dir->i_sb)->ns;

        child = kernfs_find_ns(parent, dentry->d_name.name, ns);

        /* nothing there, or it is on its way out */
        if (!child || !kernfs_active(child))
                goto out_unlock;

        inode = kernfs_get_inode(dir->i_sb, child);
        if (inode)
                /* instantiate and hash the dentry */
                ret = d_splice_alias(inode, dentry);
        else
                ret = ERR_PTR(-ENOMEM);

 out_unlock:
        mutex_unlock(&kernfs_mutex);
        return ret;
}

/*
 * ->mkdir for kernfs directories: forward to the hierarchy's syscall_ops.
 *
 * Returns -EPERM when the root supplies no mkdir operation and -ENODEV when
 * an active reference on the parent cannot be taken; otherwise whatever the
 * mkdir operation returns.  The parent is held active across the call.
 */
static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
                            umode_t mode)
{
        struct kernfs_node *parent = dir->i_private;
        struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
        int err;

        if (!(scops && scops->mkdir))
                return -EPERM;

        if (!kernfs_get_active(parent))
                return -ENODEV;

        err = scops->mkdir(parent, dentry->d_name.name, mode);
        kernfs_put_active(parent);

        return err;
}

/*
 * ->rmdir for kernfs directories: forward to the hierarchy's syscall_ops.
 *
 * Returns -EPERM when the root supplies no rmdir operation and -ENODEV when
 * an active reference on the victim cannot be taken; otherwise whatever the
 * rmdir operation returns.  The victim is held active across the call.
 */
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct kernfs_node *kn = kernfs_dentry_node(dentry);
        struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
        int err;

        if (!(scops && scops->rmdir))
                return -EPERM;

        if (!kernfs_get_active(kn))
                return -ENODEV;

        err = scops->rmdir(kn);
        kernfs_put_active(kn);

        return err;
}

/*
 * ->rename for kernfs directories: forward to the hierarchy's syscall_ops.
 *
 * Any rename flag is rejected with -EINVAL.  Returns -EPERM when the root
 * supplies no rename operation and -ENODEV when an active reference on
 * either the node being renamed or the destination parent cannot be taken;
 * otherwise whatever the rename operation returns.  Both nodes are held
 * active across the call.
 */
static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags)
{
        struct kernfs_node *kn = kernfs_dentry_node(old_dentry);
        struct kernfs_node *new_parent = new_dir->i_private;
        struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
        int err = -ENODEV;

        if (flags)
                return -EINVAL;

        if (!(scops && scops->rename))
                return -EPERM;

        if (!kernfs_get_active(kn))
                return -ENODEV;

        /* only call out when the destination could be pinned as well */
        if (kernfs_get_active(new_parent)) {
                err = scops->rename(kn, new_parent, new_dentry->d_name.name);
                kernfs_put_active(new_parent);
        }

        kernfs_put_active(kn);
        return err;
}

/*
 * Inode operations installed on kernfs directories.  The first group is
 * lookup plus attribute handling; the second group (mkdir/rmdir/rename)
 * forwards to the hierarchy's kernfs_syscall_ops through the
 * kernfs_iop_*() wrappers defined above.
 */
const struct inode_operations kernfs_dir_iops = {
        .lookup                = kernfs_iop_lookup,
        .permission        = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
        .getattr        = kernfs_iop_getattr,
        .listxattr        = kernfs_iop_listxattr,

        .mkdir                = kernfs_iop_mkdir,
        .rmdir                = kernfs_iop_rmdir,
        .rename                = kernfs_iop_rename,
};

/*
 * Starting at @pos, keep stepping to the first child (in rbtree order) for
 * as long as the current node is a directory with at least one child.
 * Returns the node where the descent stops, which is @pos itself when @pos
 * is not a directory or has no children.
 */
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
{
        for (;;) {
                struct rb_node *first;

                if (kernfs_type(pos) != KERNFS_DIR)
                        return pos;

                first = rb_first(&pos->dir.children);
                if (!first)
                        return pos;

                pos = rb_to_kn(first);
        }
}

/**
 * kernfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: kernfs_node whose descendants to walk
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants.  @root is included in the iteration and the last node to be
 * visited.  The caller must hold kernfs_mutex.
 *
 * Returns the next node to visit, or %NULL once @root has been visited.
 */
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
                                                       struct kernfs_node *root)
{
        struct rb_node *next;

        lockdep_assert_held(&kernfs_mutex);

        /* first call: begin at the leftmost descendant, possibly @root */
        if (!pos)
                return kernfs_leftmost_descendant(root);

        /* @root is always last, so the walk is over */
        if (pos == root)
                return NULL;

        /*
         * Move on to the next sibling's leftmost descendant when there is
         * one; otherwise all children are done and the parent is next.
         */
        next = rb_next(&pos->rb);
        return next ? kernfs_leftmost_descendant(rb_to_kn(next)) : pos->parent;
}

/**
 * kernfs_activate - activate a node which started deactivated
 * @kn: kernfs_node whose subtree is to be activated
 *
 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
 * needs to be explicitly activated.  A node which hasn't been activated
 * isn't visible to userland and deactivation is skipped during its
 * removal.  This is useful to construct atomic init sequences where
 * creation of multiple nodes should either succeed or fail atomically.
 *
 * Every node in @kn's subtree that does not yet carry KERNFS_ACTIVATED has
 * KN_DEACTIVATED_BIAS subtracted from its active count and the flag set;
 * already-activated nodes are left alone.
 *
 * The caller is responsible for ensuring that this function is not called
 * after kernfs_remove*() is invoked on @kn.
 */
void kernfs_activate(struct kernfs_node *kn)
{
        struct kernfs_node *pos;

        mutex_lock(&kernfs_mutex);

        for (pos = kernfs_next_descendant_post(NULL, kn); pos;
             pos = kernfs_next_descendant_post(pos, kn)) {
                if (pos->flags & KERNFS_ACTIVATED)
                        continue;

                /* a non-root node must be linked and still fully deactivated */
                WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
                WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);

                atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
                pos->flags |= KERNFS_ACTIVATED;
        }

        mutex_unlock(&kernfs_mutex);
}

/*
 * __kernfs_remove - remove @kn and everything below it
 * @kn: root of the subtree to remove, may be NULL
 *
 * The caller must hold kernfs_mutex; it may be dropped and re-taken
 * internally through kernfs_drain().  Removal happens in two passes: first
 * every active node in the subtree is deactivated so no new active
 * references can be taken, then the subtree is dismantled leaf-first, with
 * each node drained (if @kn was ever activated), unlinked from its parent
 * and released.
 */
static void __kernfs_remove(struct kernfs_node *kn)
{
        struct kernfs_node *pos;

        lockdep_assert_held(&kernfs_mutex);

        /*
         * Short-circuit if non-root @kn has already finished removal.
         * This is for kernfs_remove_self() which plays with active ref
         * after removal.
         */
        if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
                return;

        pr_debug("kernfs %s: removing\n", kn->name);

        /* prevent any new usage under @kn by deactivating all nodes */
        pos = NULL;
        while ((pos = kernfs_next_descendant_post(pos, kn)))
                if (kernfs_active(pos))
                        atomic_add(KN_DEACTIVATED_BIAS, &pos->active);

        /* deactivate and unlink the subtree node-by-node */
        do {
                /* always work on a current leaf; @kn itself comes last */
                pos = kernfs_leftmost_descendant(kn);

                /*
                 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
                 * base ref could have been put by someone else by the time
                 * the function returns.  Make sure it doesn't go away
                 * underneath us.
                 */
                kernfs_get(pos);

                /*
                 * Drain iff @kn was activated.  This avoids draining and
                 * its lockdep annotations for nodes which have never been
                 * activated and allows embedding kernfs_remove() in create
                 * error paths without worrying about draining.
                 */
                if (kn->flags & KERNFS_ACTIVATED)
                        kernfs_drain(pos);
                else
                        WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);

                /*
                 * kernfs_unlink_sibling() succeeds once per node.  Use it
                 * to decide who's responsible for cleanups.
                 */
                if (!pos->parent || kernfs_unlink_sibling(pos)) {
                        struct kernfs_iattrs *ps_iattr =
                                pos->parent ? pos->parent->iattr : NULL;

                        /* update timestamps on the parent */
                        if (ps_iattr) {
                                ktime_get_real_ts64(&ps_iattr->ia_ctime);
                                ps_iattr->ia_mtime = ps_iattr->ia_ctime;
                        }

                        /* drop the node's base reference */
                        kernfs_put(pos);
                }

                /* drop the temporary reference taken at the top of the loop */
                kernfs_put(pos);
        } while (pos != kn);
}

/**
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
 *
 * Remove @kn along with all its subdirectories and files.
 *
 * Takes kernfs_mutex around __kernfs_remove(), which does the actual work
 * and returns immediately for a NULL @kn.
 */
void kernfs_remove(struct kernfs_node *kn)
{
        mutex_lock(&kernfs_mutex);
        __kernfs_remove(kn);
        mutex_unlock(&kernfs_mutex);
}

/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
        /*
         * Take ourselves out of the active ref dependency chain.  If
         * we're called without an active ref, lockdep will complain.
         */
        kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * If kernfs_break_active_protection() was called, this function must be
 * invoked before finishing the kernfs operation.  Note that while this
 * function restores the active reference, it doesn't and can't actually
 * restore the active protection - @kn may already have been removed or
 * be in the process of being removed.  Once
 * kernfs_break_active_protection() is invoked, that protection is
 * irreversibly gone for the kernfs operation instance.
 *
 * While this function may be called at any point after
 * kernfs_break_active_protection() is invoked, its most useful location
 * would be right before the enclosing kernfs operation returns.
 */
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
        /*
         * @kn->active could be in any state; however, the increment we do
         * here will be undone as soon as the enclosing kernfs operation
         * finishes and this temporary bump can't break anything.  If @kn
         * is alive, nothing changes.  If @kn is being deactivated, the
         * soon-to-follow put will either finish deactivation or restore
         * deactivated state.  If @kn is already removed, the temporary
         * bump is guaranteed to be gone before @kn is released.
         */
        atomic_inc(&kn->active);
        /* re-establish the lockdep annotation that the matching put releases */
        if (kernfs_lockdep(kn))
                rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}

/**
 * kernfs_remove_self - remove a kernfs_node from its own method
 * @kn: the self kernfs_node to remove
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  This can be used to
 * implement a file operation which deletes itself.
 *
 * For example, the "delete" file for a sysfs device directory can be
 * implemented by invoking kernfs_remove_self() on the "delete" file
 * itself.  This function breaks the circular dependency of trying to
 * deactivate self while holding an active ref itself.  It isn't necessary
 * to modify the usual removal path to use kernfs_remove_self().  The
 * "delete" implementation can simply invoke kernfs_remove_self() on self
 * before proceeding with the usual removal path.  kernfs will ignore later
 * kernfs_remove() on self.
 *
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
        bool ret;

        /*
         * The active ref is dropped under kernfs_mutex and restored under
         * it again at the bottom of this function.
         */
        mutex_lock(&kernfs_mutex);
        kernfs_break_active_protection(kn);

        /*
         * SUICIDAL is used to arbitrate among competing invocations.  Only
         * the first one will actually perform removal.  When the removal
         * is complete, SUICIDED is set and the active ref is restored
         * while holding kernfs_mutex.  The ones which lost arbitration
         * wait for SUICIDED && drained which can happen only after the
         * enclosing kernfs operation which executed the winning instance
         * of kernfs_remove_self() finished.
         */
        if (!(kn->flags & KERNFS_SUICIDAL)) {
                /* Winner: mark, remove, then publish completion. */
                kn->flags |= KERNFS_SUICIDAL;
                __kernfs_remove(kn);
                kn->flags |= KERNFS_SUICIDED;
                ret = true;
        } else {
                wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
                DEFINE_WAIT(wait);

                while (true) {
                        /*
                         * Queue on @waitq before testing the condition so a
                         * wakeup between the test and schedule() is not lost.
                         */
                        prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

                        if ((kn->flags & KERNFS_SUICIDED) &&
                            atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
                                break;

                        /* Sleep without kernfs_mutex; retake it before re-checking. */
                        mutex_unlock(&kernfs_mutex);
                        schedule();
                        mutex_lock(&kernfs_mutex);
                }
                finish_wait(waitq, &wait);
                /* By now @kn is expected to be unlinked from its parent's rbtree. */
                WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
                ret = false;
        }

        /*
         * This must be done while holding kernfs_mutex; otherwise, waiting
         * for SUICIDED && deactivated could finish prematurely.
         */
        kernfs_unbreak_active_protection(kn);

        mutex_unlock(&kernfs_mutex);
        return ret;
}

/**
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
 *
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 * Returns 0 on success, -ENOENT if such entry doesn't exist.
 */
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
                             const void *ns)
{
        struct kernfs_node *victim;
        int ret = -ENOENT;

        /* Without a parent directory there is nothing to search. */
        if (!parent) {
                WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
                        name);
                return ret;
        }

        mutex_lock(&kernfs_mutex);

        victim = kernfs_find_ns(parent, name, ns);
        if (victim) {
                /* Hold a reference of our own across the removal. */
                kernfs_get(victim);
                __kernfs_remove(victim);
                kernfs_put(victim);
                ret = 0;
        }

        mutex_unlock(&kernfs_mutex);

        return ret;
}

/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @kn: target node
 * @new_parent: new parent to put @kn under
 * @new_name: new name
 * @new_ns: new namespace tag
 *
 * Return: 0 on success or when nothing needs to change; -EINVAL if @kn has
 * no parent (i.e. is a root); -ENOENT if @kn or @new_parent is not active
 * or @new_parent is flagged KERNFS_EMPTY_DIR; -EEXIST if an entry with
 * @new_name and @new_ns already exists under @new_parent; -ENOMEM if
 * duplicating @new_name fails.
 */
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
                     const char *new_name, const void *new_ns)
{
        struct kernfs_node *old_parent;
        const char *old_name = NULL;
        int error;

        /* can't move or rename root */
        if (!kn->parent)
                return -EINVAL;

        mutex_lock(&kernfs_mutex);

        error = -ENOENT;
        if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
            (new_parent->flags & KERNFS_EMPTY_DIR))
                goto out;

        error = 0;
        if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
            (strcmp(kn->name, new_name) == 0))
                goto out;        /* nothing to rename */

        error = -EEXIST;
        if (kernfs_find_ns(new_parent, new_name, new_ns))
                goto out;

        /* rename kernfs_node */
        if (strcmp(kn->name, new_name) != 0) {
                error = -ENOMEM;
                /* From here on new_name refers to our own duplicate. */
                new_name = kstrdup_const(new_name, GFP_KERNEL);
                if (!new_name)
                        goto out;
        } else {
                /* Name unchanged: NULL tells the code below to keep kn->name. */
                new_name = NULL;
        }

        /*
         * Move to the appropriate place in the appropriate directories rbtree.
         */
        kernfs_unlink_sibling(kn);
        kernfs_get(new_parent);

        /* rename_lock protects ->parent and ->name accessors */
        spin_lock_irq(&kernfs_rename_lock);

        old_parent = kn->parent;
        kn->parent = new_parent;

        kn->ns = new_ns;
        if (new_name) {
                old_name = kn->name;
                kn->name = new_name;
        }

        spin_unlock_irq(&kernfs_rename_lock);

        /* The hash depends on both name and ns, so recompute before relinking. */
        kn->hash = kernfs_name_hash(kn->name, kn->ns);
        kernfs_link_sibling(kn);

        /* Drop the old parent reference and the replaced name (NULL if kept). */
        kernfs_put(old_parent);
        kfree_const(old_name);

        error = 0;
 out:
        mutex_unlock(&kernfs_mutex);
        return error;
}

/*
 * Derive the DT_xxx directory-entry type from a node's mode: it is the
 * 4-bit field that starts at bit 12 of ->mode.
 */
static inline unsigned char dt_type(struct kernfs_node *kn)
{
        unsigned int type_bits = kn->mode >> 12;

        return type_bits & 0xf;
}

/*
 * ->release for kernfs directories: drop the reference on the readdir
 * cursor node stored in filp->private_data.  Always returns 0.
 */
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
        kernfs_put(filp->private_data);
        return 0;
}

/*
 * Resolve the readdir position @hash under @parent to a node.
 *
 * A cached cursor @pos is reused only if it is still active, still a child
 * of @parent and still carries @hash; its reference is dropped either way.
 * Otherwise the children rbtree is searched by hash, and a miss leaves the
 * last node visited as the candidate.  Entries that are not active or that
 * belong to another namespace are then skipped.  Returns the resulting
 * node, or NULL if there is none.
 */
static struct kernfs_node *kernfs_dir_pos(const void *ns,
        struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
        struct rb_node *cur;

        if (pos) {
                bool still_valid = kernfs_active(pos) &&
                        pos->parent == parent && hash == pos->hash;

                kernfs_put(pos);
                if (!still_valid)
                        pos = NULL;
        }

        if (!pos && hash > 1 && hash < INT_MAX) {
                cur = parent->dir.children.rb_node;
                while (cur) {
                        pos = rb_to_kn(cur);

                        if (hash == pos->hash)
                                break;
                        cur = (hash < pos->hash) ? cur->rb_left : cur->rb_right;
                }
        }

        /* Skip over entries which are dying/dead or in the wrong namespace */
        while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
                cur = rb_next(&pos->rb);
                pos = cur ? rb_to_kn(cur) : NULL;
        }

        return pos;
}

/*
 * Step to the entry after the one kernfs_dir_pos() resolves for @ino,
 * skipping nodes that are not active or belong to another namespace.
 * Returns NULL when there is no such entry.
 */
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
        struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
        struct rb_node *next;

        pos = kernfs_dir_pos(ns, parent, ino, pos);
        if (!pos)
                return NULL;

        do {
                next = rb_next(&pos->rb);
                pos = next ? rb_to_kn(next) : NULL;
        } while (pos && (!kernfs_active(pos) || pos->ns != ns));

        return pos;
}

/*
 * ->iterate_shared for kernfs directories.
 *
 * Emits "." and "..", then walks the children of the directory in hash
 * order starting from ctx->pos.  The node being emitted is remembered in
 * file->private_data with a reference held, so a later call can resume
 * from it.  Always returns 0.
 */
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct kernfs_node *parent = kernfs_dentry_node(dentry);
        struct kernfs_node *pos = file->private_data;
        const void *ns = NULL;

        if (!dir_emit_dots(file, ctx))
                return 0;
        mutex_lock(&kernfs_mutex);

        /* Only filter by namespace tag when the directory has namespaces enabled. */
        if (kernfs_ns_enabled(parent))
                ns = kernfs_info(dentry->d_sb)->ns;

        for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
             pos;
             pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
                const char *name = pos->name;
                unsigned int type = dt_type(pos);
                int len = strlen(name);
                ino_t ino = kernfs_ino(pos);

                /*
                 * Record the cursor and take a reference for it.  That
                 * reference is dropped by kernfs_dir_pos() the next time
                 * the cursor is passed in, or by the ->release handler.
                 */
                ctx->pos = pos->hash;
                file->private_data = pos;
                kernfs_get(pos);

                /* dir_emit() is called with kernfs_mutex dropped. */
                mutex_unlock(&kernfs_mutex);
                if (!dir_emit(ctx, name, len, ino, type))
                        return 0;
                mutex_lock(&kernfs_mutex);
        }
        mutex_unlock(&kernfs_mutex);
        /* End of directory: no cursor to keep, park the position at INT_MAX. */
        file->private_data = NULL;
        ctx->pos = INT_MAX;
        return 0;
}

/*
 * File operations for kernfs directories.  Iteration and release use the
 * kernfs handlers in this file; read and llseek use generic helpers.
 */
const struct file_operations kernfs_dir_fops = {
        .read                = generic_read_dir,
        .iterate_shared        = kernfs_fop_readdir,
        .release        = kernfs_dir_fop_release,
        .llseek                = generic_file_llseek,
};





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_H
#define _ASM_X86_PAGE_H

#include <linux/types.h>

#ifdef __KERNEL__

#include <asm/page_types.h>

#ifdef CONFIG_X86_64
#include <asm/page_64.h>
#else
#include <asm/page_32.h>
#endif        /* CONFIG_X86_64 */

#ifndef __ASSEMBLY__

struct page;

#include <linux/range.h>
extern struct range pfn_mapped[];
extern int nr_pfn_mapped;

/*
 * Clear a page destined for user space.  Here this is simply clear_page()
 * on @page; @vaddr and @pg are accepted but unused.
 */
static inline void clear_user_page(void *page, unsigned long vaddr,
                                   struct page *pg)
{
        clear_page(page);
}

/*
 * Copy a page destined for user space.  Here this is simply copy_page()
 * from @from to @to; @vaddr and @topage are accepted but unused.
 */
static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
                                  struct page *topage)
{
        copy_page(to, from);
}

/*
 * Allocate a zeroed highmem-capable user page for @vma at @vaddr by adding
 * __GFP_ZERO (plus the caller's @movableflags) to GFP_HIGHUSER.  The
 * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE define advertises that this
 * architecture supplies its own version.
 */
#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
        alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/* Virtual -> physical address; may be overridden by an earlier definition. */
#ifndef __pa
#define __pa(x)                __phys_addr((unsigned long)(x))
#endif

/* Same conversion via the _nodebug variant of the helper. */
#define __pa_nodebug(x)        __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
   This seems to be the official gcc blessed way to do such arithmetic. */
/*
 * We need __phys_reloc_hide() here because gcc may assume that there is no
 * overflow during __pa() calculation and can optimize it unexpectedly.
 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
 * case properly. Once all supported versions of gcc understand it, we can
 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
 */
#define __pa_symbol(x) \
        __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))

/* Physical -> virtual address: add PAGE_OFFSET; may be overridden as well. */
#ifndef __va
#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
#endif

/* Boot-time aliases; identical to __va()/__pa() here. */
#define __boot_va(x)                __va(x)
#define __boot_pa(x)                __pa(x)

/*
 * virt_to_page(kaddr) returns a valid pointer if and only if
 * virt_addr_valid(kaddr) returns true.
 */
#define virt_to_page(kaddr)        pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
/*
 * NOTE(review): the shift below is performed in the type of @pfn, so a
 * caller passing a narrow type could lose high bits - confirm callers.
 */
#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr)        __virt_addr_valid((unsigned long) (kaddr))

/*
 * Sign-extend @vaddr from bit (@vaddr_bits - 1): the upper
 * (64 - @vaddr_bits) bits of the result are copies of that bit and the
 * lower @vaddr_bits bits are unchanged.
 *
 * The left shift is done on the unsigned value.  Left-shifting a signed
 * operand that is negative, or whose set bits reach the sign bit, is
 * undefined behaviour in ISO C, whereas the unsigned shift is fully
 * defined.  Only the right shift needs to be signed so that it
 * replicates the top bit.
 */
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
        int shift = 64 - vaddr_bits;

        return (s64)(vaddr << shift) >> shift;
}

/*
 * Returns non-zero (1) when @vaddr is unchanged by __canonical_address()
 * for @vaddr_bits, 0 otherwise.  The result is a truth value even though
 * the return type is u64.
 */
static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
{
        return __canonical_address(vaddr, vaddr_bits) == vaddr;
}

#endif        /* __ASSEMBLY__ */

#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

#endif        /* __KERNEL__ */
#endif /* _ASM_X86_PAGE_H */






































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>

struct resource;
struct device;

/**
 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
 * @base_pfn: base of the entire dev_pagemap mapping
 * @end_pfn: presumably the end of the entire dev_pagemap mapping, as the
 *        counterpart of @base_pfn (TODO confirm; its users are not visible
 *        in this header)
 * @reserve: pages mapped, but reserved for driver use (relative to @base_pfn)
 * @free: free pages set aside in the mapping for memmap storage
 * @align: pages reserved to meet allocation alignments
 * @alloc: track pages consumed, private to vmemmap_populate()
 */
struct vmem_altmap {
        const unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
        unsigned long align;
        unsigned long alloc;
};

/*
 * Specialize ZONE_DEVICE memory into multiple types, each with a different
 * usage.
 *
 * MEMORY_DEVICE_PRIVATE:
 * Device memory that is not directly addressable by the CPU: CPU can neither
 * read nor write private memory. In this case, we do still have struct pages
 * backing the device memory. Doing so simplifies the implementation, but it is
 * important to remember that there are certain points at which the struct page
 * must be treated as an opaque object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
 * include/linux/hmm.h and Documentation/vm/hmm.rst.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. In support of coordinating page
 * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
 * wakeup event whenever a page is unpinned and becomes idle. This
 * wakeup is used to coordinate physical address space management (ex:
 * fs truncate/hole punch) vs pinned pages (ex: device dma).
 *
 * MEMORY_DEVICE_GENERIC:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. This is for example used by DAX devices
 * that expose memory using a character device.
 *
 * MEMORY_DEVICE_PCI_P2PDMA:
 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
/* Values for dev_pagemap.type; numbering starts at 1, see below. */
enum memory_type {
        /* 0 is reserved to catch uninitialized type fields */
        MEMORY_DEVICE_PRIVATE = 1,
        MEMORY_DEVICE_FS_DAX,
        MEMORY_DEVICE_GENERIC,
        MEMORY_DEVICE_PCI_P2PDMA,
};

/*
 * Method table for a struct dev_pagemap (pointed to by its ->ops member).
 */
struct dev_pagemap_ops {
        /*
         * Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
         * reach 0 refcount unless there is a refcount bug. This allows the
         * device driver to implement its own memory management.)
         */
        void (*page_free)(struct page *page);

        /*
         * Transition the refcount in struct dev_pagemap to the dead state.
         */
        void (*kill)(struct dev_pagemap *pgmap);

        /*
         * Wait for refcount in struct dev_pagemap to be idle and reap it.
         */
        void (*cleanup)(struct dev_pagemap *pgmap);

        /*
         * Used for private (un-addressable) device memory only.  Must migrate
         * the page back to a CPU accessible page.
         */
        vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
};

/* dev_pagemap.flags bit: ->altmap is valid (tested by pgmap_altmap()). */
#define PGMAP_ALTMAP_VALID        (1 << 0)

/**
 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
 * @altmap: pre-allocated/reserved memory for vmemmap allocations
 * @ref: reference count that pins the devm_memremap_pages() mapping
 * @internal_ref: internal reference if @ref is not provided by the caller
 * @done: completion for @internal_ref
 * @type: memory type: see enum memory_type (MEMORY_DEVICE_*) in this header
 * @flags: PGMAP_* flags to specify detailed behavior
 * @ops: method table
 * @owner: an opaque pointer identifying the entity that manages this
 *        instance.  Used by various helpers to make sure that no
 *        foreign ZONE_DEVICE memory is accessed.
 * @nr_range: number of ranges to be mapped
 * @range: range to be mapped when nr_range == 1
 * @ranges: array of ranges to be mapped when nr_range > 1
 */
struct dev_pagemap {
        struct vmem_altmap altmap;
        struct percpu_ref *ref;
        struct percpu_ref internal_ref;
        struct completion done;
        enum memory_type type;
        unsigned int flags;
        const struct dev_pagemap_ops *ops;
        void *owner;
        int nr_range;
        /* @range and @ranges share storage; @nr_range selects which one applies. */
        union {
                struct range range;
                struct range ranges[0];
        };
};

/*
 * Return the altmap embedded in @pgmap when PGMAP_ALTMAP_VALID is set in
 * its flags, otherwise NULL.
 */
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
        return (pgmap->flags & PGMAP_ALTMAP_VALID) ? &pgmap->altmap : NULL;
}

/*
 * With CONFIG_ZONE_DEVICE the functions below are only declared here.
 * Without it, inline stubs are provided instead.  Note that
 * memremap_pages() and memunmap_pages() have no stub in the #else branch.
 */
#ifdef CONFIG_ZONE_DEVICE
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);

unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
/* Stub: warns once and returns ERR_PTR(-ENXIO). */
static inline void *devm_memremap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
        /*
         * Fail attempts to call devm_memremap_pages() without
         * ZONE_DEVICE support enabled, this requires callers to fall
         * back to plain devm_memremap() based on config
         */
        WARN_ON_ONCE(1);
        return ERR_PTR(-ENXIO);
}

/* Stub: nothing to unmap. */
static inline void devm_memunmap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
}

/* Stub: always returns NULL. */
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap)
{
        return NULL;
}

/* Stub: always returns false. */
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
        return false;
}

/* Stub: always returns 0. */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return 0;
}

/* Stub: nothing to free. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                unsigned long nr_pfns)
{
}

/* when memremap_pages() is disabled all archs can remap a single page */
static inline unsigned long memremap_compat_align(void)
{
        return PAGE_SIZE;
}
#endif /* CONFIG_ZONE_DEVICE */

/*
 * Drop one reference on @pgmap->ref.  A NULL @pgmap is accepted and
 * ignored.
 */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
        if (!pgmap)
                return;

        percpu_ref_put(pgmap->ref);
}

#endif /* _LINUX_MEMREMAP_H_ */















































    1 


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
/* Generic fallback: log2 of a 32-bit value computed as fls(n) - 1. */
static inline __attribute__((const))
int __ilog2_u32(u32 n)
{
        return fls(n) - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
/* Generic fallback: log2 of a 64-bit value computed as fls64(n) - 1. */
static inline __attribute__((const))
int __ilog2_u64(u64 n)
{
        return fls64(n) - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * A power of two has exactly one bit set, so clearing the lowest set bit
 * with n & (n - 1) must leave zero.  Zero itself is *not* considered a
 * power of two.
 * Return: true if @n is a power of 2, otherwise false.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
        if (n == 0)
                return false;

        return (n & (n - 1)) == 0;
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 *
 * Computed as 1UL << fls_long(@n - 1).  As documented for
 * roundup_pow_of_two(), the result is undefined when @n == 0.
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
        return 1UL << fls_long(n - 1);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 *
 * Computed as 1UL << (fls_long(@n) - 1).  As documented for
 * rounddown_pow_of_two(), the result is undefined when @n == 0.
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
        return 1UL << (fls_long(n) - 1);
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 *
 * The expansion tests bits 63 down to 2 in turn and yields the index of
 * the highest set bit.  Values below 2 (including 0) yield 0, and a value
 * that reaches the final arm (2 or 3) yields 1.  If @n is not a
 * compile-time constant the whole expression evaluates to -1.
 */
#define const_ilog2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                (n) < 2 ? 0 :                        \
                (n) & (1ULL << 63) ? 63 :        \
                (n) & (1ULL << 62) ? 62 :        \
                (n) & (1ULL << 61) ? 61 :        \
                (n) & (1ULL << 60) ? 60 :        \
                (n) & (1ULL << 59) ? 59 :        \
                (n) & (1ULL << 58) ? 58 :        \
                (n) & (1ULL << 57) ? 57 :        \
                (n) & (1ULL << 56) ? 56 :        \
                (n) & (1ULL << 55) ? 55 :        \
                (n) & (1ULL << 54) ? 54 :        \
                (n) & (1ULL << 53) ? 53 :        \
                (n) & (1ULL << 52) ? 52 :        \
                (n) & (1ULL << 51) ? 51 :        \
                (n) & (1ULL << 50) ? 50 :        \
                (n) & (1ULL << 49) ? 49 :        \
                (n) & (1ULL << 48) ? 48 :        \
                (n) & (1ULL << 47) ? 47 :        \
                (n) & (1ULL << 46) ? 46 :        \
                (n) & (1ULL << 45) ? 45 :        \
                (n) & (1ULL << 44) ? 44 :        \
                (n) & (1ULL << 43) ? 43 :        \
                (n) & (1ULL << 42) ? 42 :        \
                (n) & (1ULL << 41) ? 41 :        \
                (n) & (1ULL << 40) ? 40 :        \
                (n) & (1ULL << 39) ? 39 :        \
                (n) & (1ULL << 38) ? 38 :        \
                (n) & (1ULL << 37) ? 37 :        \
                (n) & (1ULL << 36) ? 36 :        \
                (n) & (1ULL << 35) ? 35 :        \
                (n) & (1ULL << 34) ? 34 :        \
                (n) & (1ULL << 33) ? 33 :        \
                (n) & (1ULL << 32) ? 32 :        \
                (n) & (1ULL << 31) ? 31 :        \
                (n) & (1ULL << 30) ? 30 :        \
                (n) & (1ULL << 29) ? 29 :        \
                (n) & (1ULL << 28) ? 28 :        \
                (n) & (1ULL << 27) ? 27 :        \
                (n) & (1ULL << 26) ? 26 :        \
                (n) & (1ULL << 25) ? 25 :        \
                (n) & (1ULL << 24) ? 24 :        \
                (n) & (1ULL << 23) ? 23 :        \
                (n) & (1ULL << 22) ? 22 :        \
                (n) & (1ULL << 21) ? 21 :        \
                (n) & (1ULL << 20) ? 20 :        \
                (n) & (1ULL << 19) ? 19 :        \
                (n) & (1ULL << 18) ? 18 :        \
                (n) & (1ULL << 17) ? 17 :        \
                (n) & (1ULL << 16) ? 16 :        \
                (n) & (1ULL << 15) ? 15 :        \
                (n) & (1ULL << 14) ? 14 :        \
                (n) & (1ULL << 13) ? 13 :        \
                (n) & (1ULL << 12) ? 12 :        \
                (n) & (1ULL << 11) ? 11 :        \
                (n) & (1ULL << 10) ? 10 :        \
                (n) & (1ULL <<  9) ?  9 :        \
                (n) & (1ULL <<  8) ?  8 :        \
                (n) & (1ULL <<  7) ?  7 :        \
                (n) & (1ULL <<  6) ?  6 :        \
                (n) & (1ULL <<  5) ?  5 :        \
                (n) & (1ULL <<  4) ?  4 :        \
                (n) & (1ULL <<  3) ?  3 :        \
                (n) & (1ULL <<  2) ?  2 :        \
                1) :                                \
        -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 * the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n):
 * compile-time constants go through const_ilog2(), other values of at most
 * 4 bytes through __ilog2_u32(), and wider values through __ilog2_u64().
 */
#define ilog2(n) \
( \
        __builtin_constant_p(n) ?        \
        const_ilog2(n) :                \
        (sizeof(n) <= 4) ?                \
        __ilog2_u32(n) :                \
        __ilog2_u64(n)                        \
 )

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * A constant @n is folded here (with 1 handled explicitly); any other
 * value is passed to __roundup_pow_of_two().
 */
#define roundup_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 1) ? 1 :                \
                (1UL << (ilog2((n) - 1) + 1))        \
                                   ) :                \
        __roundup_pow_of_two(n)                        \
 )

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: parameter
 *
 * round the given value down to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * A constant @n is folded here as 1UL << ilog2(n); any other value is
 * passed to __rounddown_pow_of_two().
 */
#define rounddown_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                (1UL << ilog2(n))) :                \
        __rounddown_pow_of_two(n)                \
 )

/*
 * Runtime helper for order_base_2(): the base-2 order of @n rounded up,
 * with 0 and 1 both mapping to 0.
 */
static inline __attribute_const__
int __order_base_2(unsigned long n)
{
        if (n <= 1)
                return 0;

        return ilog2(n - 1) + 1;
}

/**
 * order_base_2 - calculate the (rounded up) base 2 order of the argument
 * @n: parameter
 *
 * The first few values calculated by this routine:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 *
 * A constant @n is folded here; any other value is passed to
 * __order_base_2().
 */
#define order_base_2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1) ? 0 :        \
                ilog2((n) - 1) + 1) :                \
        __order_base_2(n)                        \
)

/*
 * Runtime helper for bits_per(): the number of bits needed to hold @n.
 * Values below 2 need one bit.  An exact power of two needs one bit more
 * than its rounded-up order; every other value needs exactly that order.
 */
static inline __attribute__((const))
int __bits_per(unsigned long n)
{
        int order;

        if (n < 2)
                return 1;

        order = order_base_2(n);
        return is_power_of_2(n) ? order + 1 : order;
}

/**
 * bits_per - calculate the number of bits required for the argument
 * @n: parameter
 *
 * This is constant-capable and can be used for compile time
 * initializations, e.g bitfields.
 *
 * The first few values calculated by this routine:
 * bf(0) = 1
 * bf(1) = 1
 * bf(2) = 2
 * bf(3) = 2
 * bf(4) = 3
 * ... and so on.
 *
 * A constant @n is folded here as ilog2(n) + 1 (with 0 and 1 giving 1);
 * any other value is passed to __bits_per().
 */
#define bits_per(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1)                \
                        ? 1 : ilog2(n) + 1        \
        ) :                                        \
        __bits_per(n)                                \
)
#endif /* _LINUX_LOG2_H */










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        inet6 interface/address list definitions
 *        Linux INET6 implementation 
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>        
 */

#ifndef _NET_IF_INET6_H
#define _NET_IF_INET6_H

#include <net/snmp.h>
#include <linux/ipv6.h>
#include <linux/refcount.h>

/* inet6_dev.if_flags */

/*
 * Each value below is a distinct single bit, so any combination can be
 * OR-ed together in the 32-bit inet6_dev.if_flags field.
 */
#define IF_RA_OTHERCONF        0x80
#define IF_RA_MANAGED        0x40
#define IF_RA_RCVD        0x20
#define IF_RS_SENT        0x10
#define IF_READY        0x80000000

/*
 * Address state identifiers, numbered 0..4 in declaration order.
 * NOTE(review): presumably the values held in inet6_ifaddr.state, with
 * DAD standing for Duplicate Address Detection - confirm against users.
 */
enum {
        INET6_IFADDR_STATE_PREDAD,
        INET6_IFADDR_STATE_DAD,
        INET6_IFADDR_STATE_POSTDAD,
        INET6_IFADDR_STATE_ERRDAD,
        INET6_IFADDR_STATE_DEAD,
};

/*
 * Bookkeeping for a single IPv6 address.  Besides the address and its
 * prefix length it holds lifetimes, timestamps, a reference count with a
 * spinlock, a delayed work item (dad_work), a pointer to an inet6_dev
 * (idev) and several list linkage members.
 */
struct inet6_ifaddr {
        struct in6_addr                addr;
        __u32                        prefix_len;
        __u32                        rt_priority;

        /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
        __u32                        valid_lft;
        __u32                        prefered_lft;
        refcount_t                refcnt;
        spinlock_t                lock;

        /* NOTE(review): presumably one of INET6_IFADDR_STATE_* - confirm */
        int                        state;

        __u32                        flags;
        __u8                        dad_probes;
        __u8                        stable_privacy_retry;

        __u16                        scope;
        __u64                        dad_nonce;

        unsigned long                cstamp;        /* created timestamp */
        unsigned long                tstamp; /* updated timestamp */

        struct delayed_work        dad_work;

        struct inet6_dev        *idev;
        struct fib6_info        *rt;

        struct hlist_node        addr_lst;
        /* NOTE(review): presumably the link into idev->addr_list - confirm */
        struct list_head        if_list;
        /*
         * Used to safely traverse idev->addr_list in process context
         * if the idev->lock needed to protect idev->addr_list cannot be held.
         * In that case, add the items to this list temporarily and iterate
         * without holding idev->lock.
         * See addrconf_ifdown and dev_forward_change.
         */
        struct list_head        if_list_aux;

        /* NOTE(review): presumably the link into idev->tempaddr_list - confirm */
        struct list_head        tmp_list;
        struct inet6_ifaddr        *ifpub;
        int                        regen_count;

        bool                        tokenized;

        struct rcu_head                rcu;
        struct in6_addr                peer_addr;
};

/*
 * Variable-length array of IPv6 addresses: a two-counter header followed
 * by the flexible array member sl_addr[].  IP6_SFLSIZE() computes the
 * byte size for a given element count.
 * NOTE(review): sl_max is presumably the allocated capacity and sl_count
 * the number of entries in use - confirm against the users.
 */
struct ip6_sf_socklist {
        unsigned int                sl_max;
        unsigned int                sl_count;
        struct in6_addr                sl_addr[];
};

/*
 * Size in bytes of a struct ip6_sf_socklist whose sl_addr[] array holds
 * 'count' addresses: the fixed header plus count in6_addr elements.
 */
#define IP6_SFLSIZE(count)                                        \
        (sizeof(struct ip6_sf_socklist) +                        \
         (count) * sizeof(struct in6_addr))

/* allocation granularity: this many entries at once */
#define IP6_SFBLOCK        10

/*
 * Node of an RCU-annotated singly linked list (->next is __rcu, and the
 * node embeds an rcu_head).  Each node records an address, an interface
 * index, a filter mode and a pointer to an ip6_sf_socklist, alongside
 * the sflock rwlock.
 * NOTE(review): sflock presumably protects sflist - confirm.
 */
struct ipv6_mc_socklist {
        struct in6_addr                addr;
        int                        ifindex;
        unsigned int                sfmode;                /* MCAST_{INCLUDE,EXCLUDE} */
        struct ipv6_mc_socklist __rcu *next;
        rwlock_t                sflock;
        struct ip6_sf_socklist        *sflist;
        struct rcu_head                rcu;
};

/*
 * Singly linked (sf_next) record for one source address, with a pair of
 * counters and three byte-sized state fields.
 * NOTE(review): sf_count[] is presumably indexed by the MCAST_INCLUDE /
 * MCAST_EXCLUDE filter mode - confirm against the users.
 */
struct ip6_sf_list {
        struct ip6_sf_list        *sf_next;
        struct in6_addr                sf_addr;
        unsigned long                sf_count[2];        /* include/exclude counts */
        unsigned char                sf_gsresp;        /* include in g & s response? */
        unsigned char                sf_oldin;        /* change state */
        unsigned char                sf_crcount;        /* retrans. left to send */
};

/*
 * Single-bit flag values (0x01 .. 0x10) that can be OR-ed together.
 * NOTE(review): presumably stored in ifmcaddr6.mca_flags - confirm.
 */
#define MAF_TIMER_RUNNING        0x01
#define MAF_LAST_REPORTER        0x02
#define MAF_LOADED                0x04
#define MAF_NOREPORT                0x08
#define MAF_GSQUERY                0x10

/*
 * Singly linked (->next) record for one multicast address.  It points
 * to an inet6_dev (idev) and to two ip6_sf_list chains (mca_sources,
 * mca_tomb), and embeds a timer, a reference count and a spinlock.
 * NOTE(review): mca_sfcount[] presumably mirrors ip6_sf_list.sf_count[]
 * (include/exclude counts) - confirm against the users.
 */
struct ifmcaddr6 {
        struct in6_addr                mca_addr;
        struct inet6_dev        *idev;
        struct ifmcaddr6        *next;
        struct ip6_sf_list        *mca_sources;
        struct ip6_sf_list        *mca_tomb;
        unsigned int                mca_sfmode;
        unsigned char                mca_crcount;
        unsigned long                mca_sfcount[2];
        struct timer_list        mca_timer;
        unsigned int                mca_flags;
        int                        mca_users;
        refcount_t                mca_refcnt;
        spinlock_t                mca_lock;
        unsigned long                mca_cstamp;
        unsigned long                mca_tstamp;
};

/* Anycast stuff */

/*
 * Node of a singly linked list (acl_next) pairing an anycast address
 * with an interface index.
 */
struct ipv6_ac_socklist {
        struct in6_addr                acl_addr;
        int                        acl_ifindex;
        struct ipv6_ac_socklist *acl_next;
};

/*
 * Record for one anycast address.  It is chained through aca_next and
 * through the hlist node aca_addr_lst, is reference counted
 * (aca_refcnt), and embeds an rcu_head.
 * NOTE(review): aca_cstamp / aca_tstamp are presumably created / updated
 * timestamps as in struct inet6_ifaddr - confirm.
 */
struct ifacaddr6 {
        struct in6_addr                aca_addr;
        struct fib6_info        *aca_rt;
        struct ifacaddr6        *aca_next;
        struct hlist_node        aca_addr_lst;
        int                        aca_users;
        refcount_t                aca_refcnt;
        unsigned long                aca_cstamp;
        unsigned long                aca_tstamp;
        struct rcu_head                rcu;
};

/*
 * Short aliases for three IPV6_ADDR_* constants (defined elsewhere).
 * NOTE(review): presumably the values used for inet6_ifaddr.scope -
 * confirm against the users.
 */
#define        IFA_HOST        IPV6_ADDR_LOOPBACK
#define        IFA_LINK        IPV6_ADDR_LINKLOCAL
#define        IFA_SITE        IPV6_ADDR_SITELOCAL

/*
 * Statistics container embedded in struct inet6_dev as ->stats: a
 * proc_dir_entry pointer plus three MIB members declared through the
 * DEFINE_SNMP_STAT* macros.
 * NOTE(review): those macros presumably come from <net/snmp.h>, included
 * above; their expansion decides the storage of each member.
 */
struct ipv6_devstat {
        struct proc_dir_entry        *proc_dir_entry;
        DEFINE_SNMP_STAT(struct ipstats_mib, ipv6);
        DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev);
        DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev);
};

/*
 * IPv6 state attached to a net_device (->dev).  It groups the address
 * list head, the multicast lists with their lock, counters and timers,
 * the anycast list, an rwlock with a reference count, the interface
 * flags, configuration (cnf) and statistics (stats), and the rs_* fields.
 */
struct inet6_dev {
        struct net_device        *dev;

        /* NOTE(review): presumably chains inet6_ifaddr.if_list - confirm */
        struct list_head        addr_list;

        struct ifmcaddr6        *mc_list;
        struct ifmcaddr6        *mc_tomb;
        spinlock_t                mc_lock;

        unsigned char                mc_qrv;                /* Query Robustness Variable */
        unsigned char                mc_gq_running;
        unsigned char                mc_ifc_count;
        unsigned char                mc_dad_count;

        unsigned long                mc_v1_seen;        /* Max time we stay in MLDv1 mode */
        unsigned long                mc_qi;                /* Query Interval */
        unsigned long                mc_qri;                /* Query Response Interval */
        unsigned long                mc_maxdelay;

        struct timer_list        mc_gq_timer;        /* general query timer */
        struct timer_list        mc_ifc_timer;        /* interface change timer */
        struct timer_list        mc_dad_timer;        /* dad complete mc timer */

        struct ifacaddr6        *ac_list;
        rwlock_t                lock;
        refcount_t                refcnt;
        __u32                        if_flags;        /* IF_* bits defined at the top of this header */
        int                        dead;

        u32                        desync_factor;
        struct list_head        tempaddr_list;

        struct in6_addr                token;

        struct neigh_parms        *nd_parms;
        struct ipv6_devconf        cnf;
        struct ipv6_devstat        stats;

        struct timer_list        rs_timer;
        __s32                        rs_interval;        /* in jiffies */
        __u8                        rs_probes;

        unsigned long                tstamp; /* ipv6InterfaceTable update timestamp */
        struct rcu_head                rcu;
};

/*
 * Write a 6-byte link-layer address for @addr into @buf:
 *
 *      +-------+-------+-------+-------+-------+-------+
 *      |   33  |   33  | DST13 | DST14 | DST15 | DST16 |
 *      +-------+-------+-------+-------+-------+-------+
 *
 * i.e. two fixed 0x33 bytes followed by the last 32 bits of @addr.
 * @buf must have room for 6 bytes.
 */
static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf)
{
        buf[0] = buf[1] = 0x33;
        memcpy(&buf[2], &addr->s6_addr32[3], sizeof(__u32));
}

/*
 * Store the single byte 0x00 in @buf.  @addr is not consulted: every
 * address maps to the same one-byte value.
 */
static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf)
{
        *buf = 0x00;
}

/*
 * Build a 20-byte link-layer address in @buf from @addr and the
 * device's @broadcast address:
 *
 *   buf[0]      0 (reserved)
 *   buf[1..4]   0xff (multicast QPN)
 *   buf[5]      0x10 | low nibble of broadcast[5] (scope)
 *   buf[6..7]   0x60 0x1b (IPv6 signature)
 *   buf[8..9]   broadcast[8..9] (P_Key)
 *   buf[10..19] the last 10 bytes of @addr
 */
static inline void ipv6_ib_mc_map(const struct in6_addr *addr,
                                  const unsigned char *broadcast, char *buf)
{
        const unsigned char scope_bits = broadcast[5] & 0xF;
        int idx;

        buf[0] = 0;
        for (idx = 1; idx <= 4; idx++)
                buf[idx] = 0xff;
        buf[5] = 0x10 | scope_bits;
        buf[6] = 0x60;
        buf[7] = 0x1b;
        buf[8] = broadcast[8];
        buf[9] = broadcast[9];
        memcpy(&buf[10], &addr->s6_addr[6], 10);
}

/*
 * Fill the 4-byte @buf for @addr.
 *
 * If any of the first four bytes of @broadcast is non-zero, those four
 * bytes are copied.  Otherwise @addr must have its first 64 bits zero
 * and its third 32-bit word equal to htonl(0x0000ffff) (the v4-mapped
 * form); its last 32 bits are then copied.
 *
 * Returns 0 on success, -EINVAL when @broadcast is all-zero and @addr
 * is not v4-mapped (@buf is left untouched in that case).
 */
static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr,
                                    const unsigned char *broadcast, char *buf)
{
        const void *src;

        if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) {
                src = broadcast;
        } else if ((addr->s6_addr32[0] | addr->s6_addr32[1] |
                    (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) {
                /* not a v4-mapped address */
                return -EINVAL;
        } else {
                src = &addr->s6_addr32[3];
        }

        memcpy(buf, src, 4);
        return 0;
}

#endif





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NET                Generic infrastructure for Network protocols.
 *
 * Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 */
#ifndef _TIMEWAIT_SOCK_H
#define _TIMEWAIT_SOCK_H

#include <linux/slab.h>
#include <linux/bug.h>
#include <net/sock.h>

/*
 * Per-protocol time-wait socket operations, reached in this header as
 * sk->sk_prot->twsk_prot.  Both function pointers may be NULL: the
 * twsk_unique() and twsk_destructor() wrappers below test them before
 * calling.
 * NOTE(review): twsk_slab / twsk_slab_name / twsk_obj_size presumably
 * describe the kmem cache used to allocate these sockets - confirm.
 */
struct timewait_sock_ops {
        struct kmem_cache        *twsk_slab;
        char                *twsk_slab_name;
        unsigned int        twsk_obj_size;
        int                (*twsk_unique)(struct sock *sk,
                                       struct sock *sktw, void *twp);
        void                (*twsk_destructor)(struct sock *sk);
};

/*
 * Invoke the optional twsk_unique hook of @sk's protocol.
 *
 * Returns the hook's result, or 0 when the protocol installs no hook.
 */
static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        if (sk->sk_prot->twsk_prot->twsk_unique == NULL)
                return 0;

        return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
}

/*
 * Invoke the optional twsk_destructor hook of @sk's protocol; does
 * nothing when the protocol installs no hook.
 */
static inline void twsk_destructor(struct sock *sk)
{
        if (sk->sk_prot->twsk_prot->twsk_destructor == NULL)
                return;

        sk->sk_prot->twsk_prot->twsk_destructor(sk);
}

#endif /* _TIMEWAIT_SOCK_H */














































   12 
    5 













    1 




    2 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>

/* Opaque here: this header only handles struct file through pointers. */
struct file;

/*
 * Defined elsewhere.
 * NOTE(review): presumably drop one reference / a given number of
 * references on the file - confirm against the definitions.
 */
extern void fput(struct file *);
extern void fput_many(struct file *, unsigned int);

/* Forward declarations for the pointer parameters used below. */
struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
/* Constructors returning a struct file pointer; defined elsewhere. */
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/*
 * Conditional fput(): calls fput(@file) only when @fput_needed is
 * non-zero, otherwise does nothing.
 */
static inline void fput_light(struct file *file, int fput_needed)
{
        if (!fput_needed)
                return;

        fput(file);
}

/*
 * A struct file pointer paired with FDPUT_* flag bits.  __to_fd() builds
 * one by splitting an unsigned long into its two low bits (flags) and
 * the remaining bits (pointer); fdput() and fdput_pos() act on the flags.
 */
struct fd {
        struct file *file;
        unsigned int flags;
};
#define FDPUT_FPUT       1        /* fdput() calls fput() on ->file when set */
#define FDPUT_POS_UNLOCK 2        /* fdput_pos() calls __f_unlock_pos() on ->file when set */

/*
 * Release a struct fd: fput() is called on fd.file only when the
 * FDPUT_FPUT bit is set in fd.flags.
 */
static inline void fdput(struct fd fd)
{
        if (!(fd.flags & FDPUT_FPUT))
                return;

        fput(fd.file);
}

/*
 * Lookup entry points, defined elsewhere.  The fget*() forms return a
 * struct file pointer; the __fdget*() forms return an unsigned long that
 * the inline fdget*() wrappers below decode with __to_fd().
 * __f_unlock_pos() is called by fdput_pos() when FDPUT_POS_UNLOCK is set.
 */
extern struct file *fget(unsigned int fd);
extern struct file *fget_many(unsigned int fd, unsigned int refs);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);
extern void __f_unlock_pos(struct file *);

static inline struct fd __to_fd(unsigned long v)
{
        return (struct fd){(struct file *)(v & ~3),v & 3};
}

/* Look up @fd through __fdget() and decode the result into a struct fd. */
static inline struct fd fdget(unsigned int fd)
{
        unsigned long packed = __fdget(fd);

        return __to_fd(packed);
}

/* Look up @fd through __fdget_raw() and decode the result into a struct fd. */
static inline struct fd fdget_raw(unsigned int fd)
{
        unsigned long packed = __fdget_raw(fd);

        return __to_fd(packed);
}

/*
 * Look up @fd through __fdget_pos() and decode the result into a
 * struct fd.  Release the result with fdput_pos(), which honours the
 * FDPUT_POS_UNLOCK flag.
 */
static inline struct fd fdget_pos(int fd)
{
        unsigned long packed = __fdget_pos(fd);

        return __to_fd(packed);
}

/*
 * Release a struct fd: call __f_unlock_pos() on f.file first when
 * FDPUT_POS_UNLOCK is set, then always hand @f to fdput().
 */
static inline void fdput_pos(struct fd f)
{
        if ((f.flags & FDPUT_POS_UNLOCK) != 0)
                __f_unlock_pos(f.file);

        fdput(f);
}

/*
 * Declares the "fd" class: value type struct fd, cleanup expression
 * fdput(_T), constructor expression fdget(fd) taking "int fd".
 * NOTE(review): DEFINE_CLASS presumably comes from <linux/cleanup.h>,
 * included above; see that macro for the exact expansion.
 */
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)

/*
 * Descriptor management helpers, declared here and defined elsewhere.
 * get_unused_fd_flags() and put_unused_fd() are paired by the
 * get_unused_fd class declared below.
 */
extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

/*
 * Declares the "get_unused_fd" class: value type int, constructor
 * expression get_unused_fd_flags(flags); the cleanup expression calls
 * put_unused_fd(_T) only when _T >= 0.
 * NOTE(review): DEFINE_CLASS presumably comes from <linux/cleanup.h>,
 * included above; see that macro for the exact expansion.
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
             get_unused_fd_flags(flags), unsigned flags)

/* Defined elsewhere. */
extern void fd_install(unsigned int fd, struct file *file);

/*
 * Common backend of the receive_fd*() inline wrappers below, which pass
 * fd == -1 except receive_fd_replace(), and ufd == NULL except
 * receive_fd_user().  Defined elsewhere.
 */
extern int __receive_fd(int fd, struct file *file, int __user *ufd,
                        unsigned int o_flags);
/*
 * __receive_fd() wrapper that passes fd == -1 and the user pointer @ufd.
 *
 * Returns -EFAULT without calling __receive_fd() when @ufd is NULL,
 * otherwise whatever __receive_fd() returns.
 */
static inline int receive_fd_user(struct file *file, int __user *ufd,
                                  unsigned int o_flags)
{
        return ufd == NULL ? -EFAULT
                           : __receive_fd(-1, file, ufd, o_flags);
}
/*
 * __receive_fd() wrapper that passes fd == -1 and a NULL user pointer.
 * Returns whatever __receive_fd() returns.
 */
static inline int receive_fd(struct file *file, unsigned int o_flags)
{
        int ret = __receive_fd(-1, file, NULL, o_flags);

        return ret;
}
/*
 * __receive_fd() wrapper that passes the caller-chosen @fd and a NULL
 * user pointer.  Returns whatever __receive_fd() returns.
 */
static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags)
{
        int ret = __receive_fd(fd, file, NULL, o_flags);

        return ret;
}

/* Defined elsewhere. */
extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

/*
 * Defined elsewhere.
 * NOTE(review): presumably the lower / upper bounds accepted for the
 * nr_open sysctl - confirm against the definition.
 */
extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 

    4 















































    3 

    3 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    7 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * arch_xchg ordering variants.
 *
 * If the architecture does not define arch_xchg_relaxed, the _relaxed,
 * _acquire and _release names all alias arch_xchg.  Otherwise each of
 * arch_xchg_acquire, arch_xchg_release and arch_xchg that is still
 * undefined is built by passing the base name arch_xchg to
 * __atomic_op_acquire(), __atomic_op_release() or __atomic_op_fence()
 * (all defined elsewhere).
 */
#ifndef arch_xchg_relaxed
#define arch_xchg_relaxed                arch_xchg
#define arch_xchg_acquire                arch_xchg
#define arch_xchg_release                arch_xchg
#else /* arch_xchg_relaxed */

#ifndef arch_xchg_acquire
#define arch_xchg_acquire(...) \
        __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#endif

#ifndef arch_xchg_release
#define arch_xchg_release(...) \
        __atomic_op_release(arch_xchg, __VA_ARGS__)
#endif

#ifndef arch_xchg
#define arch_xchg(...) \
        __atomic_op_fence(arch_xchg, __VA_ARGS__)
#endif

#endif /* arch_xchg_relaxed */

/*
 * arch_cmpxchg ordering variants.
 *
 * If the architecture does not define arch_cmpxchg_relaxed, the
 * _relaxed, _acquire and _release names all alias arch_cmpxchg.
 * Otherwise each of arch_cmpxchg_acquire, arch_cmpxchg_release and
 * arch_cmpxchg that is still undefined is built by passing the base
 * name arch_cmpxchg to __atomic_op_acquire(), __atomic_op_release() or
 * __atomic_op_fence() (all defined elsewhere).
 */
#ifndef arch_cmpxchg_relaxed
#define arch_cmpxchg_relaxed                arch_cmpxchg
#define arch_cmpxchg_acquire                arch_cmpxchg
#define arch_cmpxchg_release                arch_cmpxchg
#else /* arch_cmpxchg_relaxed */

#ifndef arch_cmpxchg_acquire
#define arch_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg_release
#define arch_cmpxchg_release(...) \
        __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg
#define arch_cmpxchg(...) \
        __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#endif

#endif /* arch_cmpxchg_relaxed */

/*
 * arch_cmpxchg64 ordering variants.
 *
 * If the architecture does not define arch_cmpxchg64_relaxed, the
 * _relaxed, _acquire and _release names all alias arch_cmpxchg64.
 * Otherwise each of arch_cmpxchg64_acquire, arch_cmpxchg64_release and
 * arch_cmpxchg64 that is still undefined is built by passing the base
 * name arch_cmpxchg64 to __atomic_op_acquire(), __atomic_op_release()
 * or __atomic_op_fence() (all defined elsewhere).
 */
#ifndef arch_cmpxchg64_relaxed
#define arch_cmpxchg64_relaxed                arch_cmpxchg64
#define arch_cmpxchg64_acquire                arch_cmpxchg64
#define arch_cmpxchg64_release                arch_cmpxchg64
#else /* arch_cmpxchg64_relaxed */

#ifndef arch_cmpxchg64_acquire
#define arch_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg64_release
#define arch_cmpxchg64_release(...) \
        __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg64
#define arch_cmpxchg64(...) \
        __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#endif

#endif /* arch_cmpxchg64_relaxed */

/*
 * Fallback used only when the architecture does not define
 * arch_atomic_read_acquire: read v->counter with smp_load_acquire().
 * The trailing self-#define marks the operation as provided.
 */
#ifndef arch_atomic_read_acquire
static __always_inline int
arch_atomic_read_acquire(const atomic_t *v)
{
        return smp_load_acquire(&(v)->counter);
}
#define arch_atomic_read_acquire arch_atomic_read_acquire
#endif

/*
 * Fallback used only when the architecture does not define
 * arch_atomic_set_release: store i to v->counter with
 * smp_store_release().  The trailing self-#define marks the operation
 * as provided.
 */
#ifndef arch_atomic_set_release
static __always_inline void
arch_atomic_set_release(atomic_t *v, int i)
{
        smp_store_release(&(v)->counter, i);
}
#define arch_atomic_set_release arch_atomic_set_release
#endif

/*
 * arch_atomic_add_return ordering variants.
 *
 * Without an architecture-provided arch_atomic_add_return_relaxed, the
 * _acquire, _release and _relaxed names all alias
 * arch_atomic_add_return.  Otherwise every variant the architecture
 * left undefined is generated below from the relaxed operation plus
 * explicit fences.
 */
#ifndef arch_atomic_add_return_relaxed
#define arch_atomic_add_return_acquire arch_atomic_add_return
#define arch_atomic_add_return_release arch_atomic_add_return
#define arch_atomic_add_return_relaxed arch_atomic_add_return
#else /* arch_atomic_add_return_relaxed */

/* Acquire: relaxed operation first, then __atomic_acquire_fence(). */
#ifndef arch_atomic_add_return_acquire
static __always_inline int
arch_atomic_add_return_acquire(int i, atomic_t *v)
{
        int ret = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
}
#define arch_atomic_add_return_acquire arch_atomic_add_return_acquire
#endif

/* Release: __atomic_release_fence() first, then the relaxed operation. */
#ifndef arch_atomic_add_return_release
static __always_inline int
arch_atomic_add_return_release(int i, atomic_t *v)
{
        __atomic_release_fence();
        return arch_atomic_add_return_relaxed(i, v);
}
#define arch_atomic_add_return_release arch_atomic_add_return_release
#endif

/* Unsuffixed: relaxed operation bracketed by pre/post full fences. */
#ifndef arch_atomic_add_return
static __always_inline int
arch_atomic_add_return(int i, atomic_t *v)
{
        int ret;
        __atomic_pre_full_fence();
        ret = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
}
#define arch_atomic_add_return arch_atomic_add_return
#endif

#endif /* arch_atomic_add_return_relaxed */

/*
 * arch_atomic_fetch_add ordering variants.
 *
 * Without an architecture-provided arch_atomic_fetch_add_relaxed, the
 * _acquire, _release and _relaxed names all alias
 * arch_atomic_fetch_add.  Otherwise every variant the architecture left
 * undefined is generated below from the relaxed operation plus explicit
 * fences.
 */
#ifndef arch_atomic_fetch_add_relaxed
#define arch_atomic_fetch_add_acquire arch_atomic_fetch_add
#define arch_atomic_fetch_add_release arch_atomic_fetch_add
#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add
#else /* arch_atomic_fetch_add_relaxed */

/* Acquire: relaxed operation first, then __atomic_acquire_fence(). */
#ifndef arch_atomic_fetch_add_acquire
static __always_inline int
arch_atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int ret = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
}
#define arch_atomic_fetch_add_acquire arch_atomic_fetch_add_acquire
#endif

/* Release: __atomic_release_fence() first, then the relaxed operation. */
#ifndef arch_atomic_fetch_add_release
static __always_inline int
arch_atomic_fetch_add_release(int i, atomic_t *v)
{
        __atomic_release_fence();
        return arch_atomic_fetch_add_relaxed(i, v);
}
#define arch_atomic_fetch_add_release arch_atomic_fetch_add_release
#endif

/* Unsuffixed: relaxed operation bracketed by pre/post full fences. */
#ifndef arch_atomic_fetch_add
static __always_inline int
arch_atomic_fetch_add(int i, atomic_t *v)
{
        int ret;
        __atomic_pre_full_fence();
        ret = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add
#endif

#endif /* arch_atomic_fetch_add_relaxed */

#ifndef arch_atomic_sub_return_relaxed
#define arch_atomic_sub_return_acquire arch_atomic_sub_return
#define arch_atomic_sub_return_release arch_atomic_sub_return
#define arch_atomic_sub_return_relaxed arch_atomic_sub_return
#else /* arch_atomic_sub_return_relaxed */

#ifndef arch_atomic_sub_return_acquire
/**
 * arch_atomic_sub_return_acquire - arch_atomic_sub_return_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_sub_return_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_sub_return_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_sub_return_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire
#endif

#ifndef arch_atomic_sub_return_release
/**
 * arch_atomic_sub_return_release - arch_atomic_sub_return_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_sub_return_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_sub_return_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_sub_return_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_sub_return_relaxed(i, v);

        return val;
}
#define arch_atomic_sub_return_release arch_atomic_sub_return_release
#endif

#ifndef arch_atomic_sub_return
/**
 * arch_atomic_sub_return - arch_atomic_sub_return_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_sub_return_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_sub_return_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_sub_return(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_sub_return arch_atomic_sub_return
#endif

#endif /* arch_atomic_sub_return_relaxed */

#ifndef arch_atomic_fetch_sub_relaxed
#define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub
#define arch_atomic_fetch_sub_release arch_atomic_fetch_sub
#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub
#else /* arch_atomic_fetch_sub_relaxed */

#ifndef arch_atomic_fetch_sub_acquire
/**
 * arch_atomic_fetch_sub_acquire - arch_atomic_fetch_sub_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_fetch_sub_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_sub_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub_acquire
#endif

#ifndef arch_atomic_fetch_sub_release
/**
 * arch_atomic_fetch_sub_release - arch_atomic_fetch_sub_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_fetch_sub_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_sub_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_fetch_sub_release(int i, atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);

        return prev;
}
#define arch_atomic_fetch_sub_release arch_atomic_fetch_sub_release
#endif

#ifndef arch_atomic_fetch_sub
/**
 * arch_atomic_fetch_sub - arch_atomic_fetch_sub_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_fetch_sub_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_sub_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_sub(int i, atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_sub arch_atomic_fetch_sub
#endif

#endif /* arch_atomic_fetch_sub_relaxed */

#ifndef arch_atomic_inc
/**
 * arch_atomic_inc - increment expressed as an add of one
 * @v: pointer of type atomic_t
 *
 * Forwards to arch_atomic_add(1, @v). No value is returned.
 */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        arch_atomic_add(1, v);
}
#define arch_atomic_inc arch_atomic_inc
#endif

#ifndef arch_atomic_inc_return_relaxed
#ifdef arch_atomic_inc_return
#define arch_atomic_inc_return_acquire arch_atomic_inc_return
#define arch_atomic_inc_return_release arch_atomic_inc_return
#define arch_atomic_inc_return_relaxed arch_atomic_inc_return
#endif /* arch_atomic_inc_return */

#ifndef arch_atomic_inc_return
/**
 * arch_atomic_inc_return - increment-and-return expressed through add_return
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_add_return(1, @v).
 */
static __always_inline int
arch_atomic_inc_return(atomic_t *v)
{
        int val = arch_atomic_add_return(1, v);

        return val;
}
#define arch_atomic_inc_return arch_atomic_inc_return
#endif

#ifndef arch_atomic_inc_return_acquire
/**
 * arch_atomic_inc_return_acquire - increment-and-return expressed through add_return_acquire
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_add_return_acquire(1, @v).
 */
static __always_inline int
arch_atomic_inc_return_acquire(atomic_t *v)
{
        int val = arch_atomic_add_return_acquire(1, v);

        return val;
}
#define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire
#endif

#ifndef arch_atomic_inc_return_release
/**
 * arch_atomic_inc_return_release - increment-and-return expressed through add_return_release
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_add_return_release(1, @v).
 */
static __always_inline int
arch_atomic_inc_return_release(atomic_t *v)
{
        int val = arch_atomic_add_return_release(1, v);

        return val;
}
#define arch_atomic_inc_return_release arch_atomic_inc_return_release
#endif

#ifndef arch_atomic_inc_return_relaxed
/**
 * arch_atomic_inc_return_relaxed - increment-and-return expressed through add_return_relaxed
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_add_return_relaxed(1, @v).
 */
static __always_inline int
arch_atomic_inc_return_relaxed(atomic_t *v)
{
        int val = arch_atomic_add_return_relaxed(1, v);

        return val;
}
#define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed
#endif

#else /* arch_atomic_inc_return_relaxed */

#ifndef arch_atomic_inc_return_acquire
/**
 * arch_atomic_inc_return_acquire - arch_atomic_inc_return_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_inc_return_relaxed(@v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_inc_return_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire
#endif

#ifndef arch_atomic_inc_return_release
/**
 * arch_atomic_inc_return_release - arch_atomic_inc_return_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_inc_return_relaxed(@v).
 */
static __always_inline int
arch_atomic_inc_return_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_inc_return_relaxed(v);

        return val;
}
#define arch_atomic_inc_return_release arch_atomic_inc_return_release
#endif

#ifndef arch_atomic_inc_return
/**
 * arch_atomic_inc_return - arch_atomic_inc_return_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_inc_return_relaxed(@v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_inc_return(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_inc_return arch_atomic_inc_return
#endif

#endif /* arch_atomic_inc_return_relaxed */

#ifndef arch_atomic_fetch_inc_relaxed
#ifdef arch_atomic_fetch_inc
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc
#define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc
#endif /* arch_atomic_fetch_inc */

#ifndef arch_atomic_fetch_inc
/**
 * arch_atomic_fetch_inc - fetch-and-increment expressed through fetch_add
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_add(1, @v).
 */
static __always_inline int
arch_atomic_fetch_inc(atomic_t *v)
{
        int prev = arch_atomic_fetch_add(1, v);

        return prev;
}
#define arch_atomic_fetch_inc arch_atomic_fetch_inc
#endif

#ifndef arch_atomic_fetch_inc_acquire
/**
 * arch_atomic_fetch_inc_acquire - fetch-and-increment expressed through fetch_add_acquire
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_add_acquire(1, @v).
 */
static __always_inline int
arch_atomic_fetch_inc_acquire(atomic_t *v)
{
        int prev = arch_atomic_fetch_add_acquire(1, v);

        return prev;
}
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire
#endif

#ifndef arch_atomic_fetch_inc_release
/**
 * arch_atomic_fetch_inc_release - fetch-and-increment expressed through fetch_add_release
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_add_release(1, @v).
 */
static __always_inline int
arch_atomic_fetch_inc_release(atomic_t *v)
{
        int prev = arch_atomic_fetch_add_release(1, v);

        return prev;
}
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release
#endif

#ifndef arch_atomic_fetch_inc_relaxed
/**
 * arch_atomic_fetch_inc_relaxed - fetch-and-increment expressed through fetch_add_relaxed
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_add_relaxed(1, @v).
 */
static __always_inline int
arch_atomic_fetch_inc_relaxed(atomic_t *v)
{
        int prev = arch_atomic_fetch_add_relaxed(1, v);

        return prev;
}
#define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc_relaxed
#endif

#else /* arch_atomic_fetch_inc_relaxed */

#ifndef arch_atomic_fetch_inc_acquire
/**
 * arch_atomic_fetch_inc_acquire - arch_atomic_fetch_inc_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_inc_relaxed(@v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_inc_acquire(atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire
#endif

#ifndef arch_atomic_fetch_inc_release
/**
 * arch_atomic_fetch_inc_release - arch_atomic_fetch_inc_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_inc_relaxed(@v).
 */
static __always_inline int
arch_atomic_fetch_inc_release(atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);

        return prev;
}
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release
#endif

#ifndef arch_atomic_fetch_inc
/**
 * arch_atomic_fetch_inc - arch_atomic_fetch_inc_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_inc_relaxed(@v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_inc(atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_inc arch_atomic_fetch_inc
#endif

#endif /* arch_atomic_fetch_inc_relaxed */

#ifndef arch_atomic_dec
/**
 * arch_atomic_dec - decrement expressed as a subtract of one
 * @v: pointer of type atomic_t
 *
 * Forwards to arch_atomic_sub(1, @v). No value is returned.
 */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        arch_atomic_sub(1, v);
}
#define arch_atomic_dec arch_atomic_dec
#endif

#ifndef arch_atomic_dec_return_relaxed
#ifdef arch_atomic_dec_return
#define arch_atomic_dec_return_acquire arch_atomic_dec_return
#define arch_atomic_dec_return_release arch_atomic_dec_return
#define arch_atomic_dec_return_relaxed arch_atomic_dec_return
#endif /* arch_atomic_dec_return */

#ifndef arch_atomic_dec_return
/**
 * arch_atomic_dec_return - decrement-and-return expressed through sub_return
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_sub_return(1, @v).
 */
static __always_inline int
arch_atomic_dec_return(atomic_t *v)
{
        int val = arch_atomic_sub_return(1, v);

        return val;
}
#define arch_atomic_dec_return arch_atomic_dec_return
#endif

#ifndef arch_atomic_dec_return_acquire
/**
 * arch_atomic_dec_return_acquire - decrement-and-return expressed through sub_return_acquire
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_sub_return_acquire(1, @v).
 */
static __always_inline int
arch_atomic_dec_return_acquire(atomic_t *v)
{
        int val = arch_atomic_sub_return_acquire(1, v);

        return val;
}
#define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire
#endif

#ifndef arch_atomic_dec_return_release
/**
 * arch_atomic_dec_return_release - decrement-and-return expressed through sub_return_release
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_sub_return_release(1, @v).
 */
static __always_inline int
arch_atomic_dec_return_release(atomic_t *v)
{
        int val = arch_atomic_sub_return_release(1, v);

        return val;
}
#define arch_atomic_dec_return_release arch_atomic_dec_return_release
#endif

#ifndef arch_atomic_dec_return_relaxed
/**
 * arch_atomic_dec_return_relaxed - decrement-and-return expressed through sub_return_relaxed
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_sub_return_relaxed(1, @v).
 */
static __always_inline int
arch_atomic_dec_return_relaxed(atomic_t *v)
{
        int val = arch_atomic_sub_return_relaxed(1, v);

        return val;
}
#define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed
#endif

#else /* arch_atomic_dec_return_relaxed */

#ifndef arch_atomic_dec_return_acquire
/**
 * arch_atomic_dec_return_acquire - arch_atomic_dec_return_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_dec_return_relaxed(@v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_dec_return_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire
#endif

#ifndef arch_atomic_dec_return_release
/**
 * arch_atomic_dec_return_release - arch_atomic_dec_return_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_dec_return_relaxed(@v).
 */
static __always_inline int
arch_atomic_dec_return_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_dec_return_relaxed(v);

        return val;
}
#define arch_atomic_dec_return_release arch_atomic_dec_return_release
#endif

#ifndef arch_atomic_dec_return
/**
 * arch_atomic_dec_return - arch_atomic_dec_return_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_dec_return_relaxed(@v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_dec_return(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_dec_return arch_atomic_dec_return
#endif

#endif /* arch_atomic_dec_return_relaxed */

#ifndef arch_atomic_fetch_dec_relaxed
#ifdef arch_atomic_fetch_dec
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec
#define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec
#endif /* arch_atomic_fetch_dec */

#ifndef arch_atomic_fetch_dec
/**
 * arch_atomic_fetch_dec - fetch-and-decrement expressed through fetch_sub
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_sub(1, @v).
 */
static __always_inline int
arch_atomic_fetch_dec(atomic_t *v)
{
        int prev = arch_atomic_fetch_sub(1, v);

        return prev;
}
#define arch_atomic_fetch_dec arch_atomic_fetch_dec
#endif

#ifndef arch_atomic_fetch_dec_acquire
/**
 * arch_atomic_fetch_dec_acquire - fetch-and-decrement expressed through fetch_sub_acquire
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_sub_acquire(1, @v).
 */
static __always_inline int
arch_atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev = arch_atomic_fetch_sub_acquire(1, v);

        return prev;
}
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire
#endif

#ifndef arch_atomic_fetch_dec_release
/**
 * arch_atomic_fetch_dec_release - fetch-and-decrement expressed through fetch_sub_release
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_sub_release(1, @v).
 */
static __always_inline int
arch_atomic_fetch_dec_release(atomic_t *v)
{
        int prev = arch_atomic_fetch_sub_release(1, v);

        return prev;
}
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release
#endif

#ifndef arch_atomic_fetch_dec_relaxed
/**
 * arch_atomic_fetch_dec_relaxed - fetch-and-decrement expressed through fetch_sub_relaxed
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_sub_relaxed(1, @v).
 */
static __always_inline int
arch_atomic_fetch_dec_relaxed(atomic_t *v)
{
        int prev = arch_atomic_fetch_sub_relaxed(1, v);

        return prev;
}
#define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec_relaxed
#endif

#else /* arch_atomic_fetch_dec_relaxed */

#ifndef arch_atomic_fetch_dec_acquire
/**
 * arch_atomic_fetch_dec_acquire - arch_atomic_fetch_dec_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_dec_relaxed(@v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire
#endif

#ifndef arch_atomic_fetch_dec_release
/**
 * arch_atomic_fetch_dec_release - arch_atomic_fetch_dec_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_dec_relaxed(@v).
 */
static __always_inline int
arch_atomic_fetch_dec_release(atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);

        return prev;
}
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release
#endif

#ifndef arch_atomic_fetch_dec
/**
 * arch_atomic_fetch_dec - arch_atomic_fetch_dec_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_dec_relaxed(@v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_dec(atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_dec arch_atomic_fetch_dec
#endif

#endif /* arch_atomic_fetch_dec_relaxed */

#ifndef arch_atomic_fetch_and_relaxed
#define arch_atomic_fetch_and_acquire arch_atomic_fetch_and
#define arch_atomic_fetch_and_release arch_atomic_fetch_and
#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and
#else /* arch_atomic_fetch_and_relaxed */

#ifndef arch_atomic_fetch_and_acquire
/**
 * arch_atomic_fetch_and_acquire - arch_atomic_fetch_and_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_fetch_and_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_and_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_and_acquire arch_atomic_fetch_and_acquire
#endif

#ifndef arch_atomic_fetch_and_release
/**
 * arch_atomic_fetch_and_release - arch_atomic_fetch_and_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_fetch_and_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_and_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_fetch_and_release(int i, atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);

        return prev;
}
#define arch_atomic_fetch_and_release arch_atomic_fetch_and_release
#endif

#ifndef arch_atomic_fetch_and
/**
 * arch_atomic_fetch_and - arch_atomic_fetch_and_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_fetch_and_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_and_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_and(int i, atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and
#endif

#endif /* arch_atomic_fetch_and_relaxed */

#ifndef arch_atomic_andnot
/**
 * arch_atomic_andnot - and-not expressed as an AND with the complemented operand
 * @i: value whose bitwise complement is passed to arch_atomic_and()
 * @v: pointer of type atomic_t
 *
 * Forwards to arch_atomic_and(~@i, @v). No value is returned.
 */
static __always_inline void arch_atomic_andnot(int i, atomic_t *v)
{
        int mask = ~i;

        arch_atomic_and(mask, v);
}
#define arch_atomic_andnot arch_atomic_andnot
#endif

#ifndef arch_atomic_fetch_andnot_relaxed
#ifdef arch_atomic_fetch_andnot
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot
#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot
#endif /* arch_atomic_fetch_andnot */

#ifndef arch_atomic_fetch_andnot
/**
 * arch_atomic_fetch_andnot - fetch-and-not expressed through fetch_and
 * @i: value whose bitwise complement is passed to arch_atomic_fetch_and()
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_and(~@i, @v).
 */
static __always_inline int
arch_atomic_fetch_andnot(int i, atomic_t *v)
{
        int mask = ~i;

        return arch_atomic_fetch_and(mask, v);
}
#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
#endif

#ifndef arch_atomic_fetch_andnot_acquire
/**
 * arch_atomic_fetch_andnot_acquire - fetch-and-not expressed through fetch_and_acquire
 * @i: value whose bitwise complement is passed to arch_atomic_fetch_and_acquire()
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_and_acquire(~@i, @v).
 */
static __always_inline int
arch_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int mask = ~i;

        return arch_atomic_fetch_and_acquire(mask, v);
}
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire
#endif

#ifndef arch_atomic_fetch_andnot_release
/**
 * arch_atomic_fetch_andnot_release - fetch-and-not expressed through fetch_and_release
 * @i: value whose bitwise complement is passed to arch_atomic_fetch_and_release()
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_and_release(~@i, @v).
 */
static __always_inline int
arch_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int mask = ~i;

        return arch_atomic_fetch_and_release(mask, v);
}
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release
#endif

#ifndef arch_atomic_fetch_andnot_relaxed
/**
 * arch_atomic_fetch_andnot_relaxed - fetch-and-not expressed through fetch_and_relaxed
 * @i: value whose bitwise complement is passed to arch_atomic_fetch_and_relaxed()
 * @v: pointer of type atomic_t
 *
 * Returns the result of arch_atomic_fetch_and_relaxed(~@i, @v).
 */
static __always_inline int
arch_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int mask = ~i;

        return arch_atomic_fetch_and_relaxed(mask, v);
}
#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed
#endif

#else /* arch_atomic_fetch_andnot_relaxed */

#ifndef arch_atomic_fetch_andnot_acquire
/**
 * arch_atomic_fetch_andnot_acquire - arch_atomic_fetch_andnot_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_fetch_andnot_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_andnot_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire
#endif

#ifndef arch_atomic_fetch_andnot_release
/**
 * arch_atomic_fetch_andnot_release - arch_atomic_fetch_andnot_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_fetch_andnot_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_andnot_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);

        return prev;
}
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release
#endif

#ifndef arch_atomic_fetch_andnot
/**
 * arch_atomic_fetch_andnot - arch_atomic_fetch_andnot_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_fetch_andnot_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_andnot_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_andnot(int i, atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
#endif

#endif /* arch_atomic_fetch_andnot_relaxed */

#ifndef arch_atomic_fetch_or_relaxed
#define arch_atomic_fetch_or_acquire arch_atomic_fetch_or
#define arch_atomic_fetch_or_release arch_atomic_fetch_or
#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or
#else /* arch_atomic_fetch_or_relaxed */

#ifndef arch_atomic_fetch_or_acquire
/**
 * arch_atomic_fetch_or_acquire - arch_atomic_fetch_or_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_fetch_or_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_or_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_or_acquire arch_atomic_fetch_or_acquire
#endif

#ifndef arch_atomic_fetch_or_release
/**
 * arch_atomic_fetch_or_release - arch_atomic_fetch_or_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_fetch_or_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_or_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_fetch_or_release(int i, atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);

        return prev;
}
#define arch_atomic_fetch_or_release arch_atomic_fetch_or_release
#endif

#ifndef arch_atomic_fetch_or
/**
 * arch_atomic_fetch_or - arch_atomic_fetch_or_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_fetch_or_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_or_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_or(int i, atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or
#endif

#endif /* arch_atomic_fetch_or_relaxed */

#ifndef arch_atomic_fetch_xor_relaxed
#define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor
#define arch_atomic_fetch_xor_release arch_atomic_fetch_xor
#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor
#else /* arch_atomic_fetch_xor_relaxed */

#ifndef arch_atomic_fetch_xor_acquire
/**
 * arch_atomic_fetch_xor_acquire - arch_atomic_fetch_xor_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic_fetch_xor_relaxed()
 * @v: pointer of type atomic_t
 *
 * Runs arch_atomic_fetch_xor_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int prev;

        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor_acquire
#endif

#ifndef arch_atomic_fetch_xor_release
/**
 * arch_atomic_fetch_xor_release - arch_atomic_fetch_xor_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic_fetch_xor_relaxed()
 * @v: pointer of type atomic_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_fetch_xor_relaxed(@i, @v).
 */
static __always_inline int
arch_atomic_fetch_xor_release(int i, atomic_t *v)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);

        return prev;
}
#define arch_atomic_fetch_xor_release arch_atomic_fetch_xor_release
#endif

#ifndef arch_atomic_fetch_xor
/**
 * arch_atomic_fetch_xor - arch_atomic_fetch_xor_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic_fetch_xor_relaxed()
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_fetch_xor_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor
#endif

#endif /* arch_atomic_fetch_xor_relaxed */

#ifndef arch_atomic_xchg_relaxed
#define arch_atomic_xchg_acquire arch_atomic_xchg
#define arch_atomic_xchg_release arch_atomic_xchg
#define arch_atomic_xchg_relaxed arch_atomic_xchg
#else /* arch_atomic_xchg_relaxed */

#ifndef arch_atomic_xchg_acquire
/**
 * arch_atomic_xchg_acquire - arch_atomic_xchg_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 * @i: value passed through to arch_atomic_xchg_relaxed()
 *
 * Runs arch_atomic_xchg_relaxed(@v, @i), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_xchg_acquire(atomic_t *v, int i)
{
        int prev;

        prev = arch_atomic_xchg_relaxed(v, i);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic_xchg_acquire arch_atomic_xchg_acquire
#endif

#ifndef arch_atomic_xchg_release
/**
 * arch_atomic_xchg_release - arch_atomic_xchg_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 * @i: value passed through to arch_atomic_xchg_relaxed()
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_xchg_relaxed(@v, @i).
 */
static __always_inline int
arch_atomic_xchg_release(atomic_t *v, int i)
{
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_xchg_relaxed(v, i);

        return prev;
}
#define arch_atomic_xchg_release arch_atomic_xchg_release
#endif

#ifndef arch_atomic_xchg
/**
 * arch_atomic_xchg - arch_atomic_xchg_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 * @i: value passed through to arch_atomic_xchg_relaxed()
 *
 * Calls arch_atomic_xchg_relaxed(@v, @i) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_xchg(atomic_t *v, int i)
{
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_xchg_relaxed(v, i);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic_xchg arch_atomic_xchg
#endif

#endif /* arch_atomic_xchg_relaxed */

#ifndef arch_atomic_cmpxchg_relaxed
#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg
#define arch_atomic_cmpxchg_release arch_atomic_cmpxchg
#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg
#else /* arch_atomic_cmpxchg_relaxed */

#ifndef arch_atomic_cmpxchg_acquire
/**
 * arch_atomic_cmpxchg_acquire - arch_atomic_cmpxchg_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 * @old: comparison value passed through to arch_atomic_cmpxchg_relaxed()
 * @new: replacement value passed through to arch_atomic_cmpxchg_relaxed()
 *
 * Runs arch_atomic_cmpxchg_relaxed(@v, @old, @new), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline int
arch_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int seen;

        seen = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return seen;
}
#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg_acquire
#endif

#ifndef arch_atomic_cmpxchg_release
/**
 * arch_atomic_cmpxchg_release - arch_atomic_cmpxchg_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 * @old: comparison value passed through to arch_atomic_cmpxchg_relaxed()
 * @new: replacement value passed through to arch_atomic_cmpxchg_relaxed()
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_cmpxchg_relaxed(@v, @old, @new).
 */
static __always_inline int
arch_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int seen;

        __atomic_release_fence();
        seen = arch_atomic_cmpxchg_relaxed(v, old, new);

        return seen;
}
#define arch_atomic_cmpxchg_release arch_atomic_cmpxchg_release
#endif

#ifndef arch_atomic_cmpxchg
/**
 * arch_atomic_cmpxchg - arch_atomic_cmpxchg_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 * @old: comparison value passed through to arch_atomic_cmpxchg_relaxed()
 * @new: replacement value passed through to arch_atomic_cmpxchg_relaxed()
 *
 * Calls arch_atomic_cmpxchg_relaxed(@v, @old, @new) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline int
arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int seen;

        __atomic_pre_full_fence();
        seen = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return seen;
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg
#endif

#endif /* arch_atomic_cmpxchg_relaxed */

#ifndef arch_atomic_try_cmpxchg_relaxed
#ifdef arch_atomic_try_cmpxchg
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg
#define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg
#endif /* arch_atomic_try_cmpxchg */

#ifndef arch_atomic_try_cmpxchg
/**
 * arch_atomic_try_cmpxchg - boolean compare-and-exchange built on arch_atomic_cmpxchg()
 * @v: pointer of type atomic_t
 * @old: pointer to the expected value; overwritten on mismatch
 * @new: value handed to arch_atomic_cmpxchg() as the replacement
 *
 * Calls arch_atomic_cmpxchg(@v, *@old, @new). If the value it returns
 * differs from the expected one, that value is stored into *@old.
 * Returns true when the returned value equals the expected one.
 */
static __always_inline bool
arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg(v, expected, new);

        /* Report the mismatching value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
#endif

#ifndef arch_atomic_try_cmpxchg_acquire
/**
 * arch_atomic_try_cmpxchg_acquire - boolean compare-and-exchange built on arch_atomic_cmpxchg_acquire()
 * @v: pointer of type atomic_t
 * @old: pointer to the expected value; overwritten on mismatch
 * @new: value handed to arch_atomic_cmpxchg_acquire() as the replacement
 *
 * Calls arch_atomic_cmpxchg_acquire(@v, *@old, @new). If the value it
 * returns differs from the expected one, that value is stored into
 * *@old. Returns true when the returned value equals the expected one.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_acquire(v, expected, new);

        /* Report the mismatching value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
}
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire
#endif

#ifndef arch_atomic_try_cmpxchg_release
/**
 * arch_atomic_try_cmpxchg_release - boolean compare-and-exchange built on arch_atomic_cmpxchg_release()
 * @v: pointer of type atomic_t
 * @old: pointer to the expected value; overwritten on mismatch
 * @new: value handed to arch_atomic_cmpxchg_release() as the replacement
 *
 * Calls arch_atomic_cmpxchg_release(@v, *@old, @new). If the value it
 * returns differs from the expected one, that value is stored into
 * *@old. Returns true when the returned value equals the expected one.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_release(v, expected, new);

        /* Report the mismatching value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
}
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release
#endif

#ifndef arch_atomic_try_cmpxchg_relaxed
/**
 * arch_atomic_try_cmpxchg_relaxed - boolean compare-and-exchange built on arch_atomic_cmpxchg_relaxed()
 * @v: pointer of type atomic_t
 * @old: pointer to the expected value; overwritten on mismatch
 * @new: value handed to arch_atomic_cmpxchg_relaxed() as the replacement
 *
 * Calls arch_atomic_cmpxchg_relaxed(@v, *@old, @new). If the value it
 * returns differs from the expected one, that value is stored into
 * *@old. Returns true when the returned value equals the expected one.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_relaxed(v, expected, new);

        /* Report the mismatching value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
}
#define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg_relaxed
#endif

#else /* arch_atomic_try_cmpxchg_relaxed */

#ifndef arch_atomic_try_cmpxchg_acquire
/**
 * arch_atomic_try_cmpxchg_acquire - arch_atomic_try_cmpxchg_relaxed() with a trailing acquire fence
 * @v: pointer of type atomic_t
 * @old: passed through to arch_atomic_try_cmpxchg_relaxed()
 * @new: passed through to arch_atomic_try_cmpxchg_relaxed()
 *
 * Runs arch_atomic_try_cmpxchg_relaxed(@v, @old, @new), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool ok;

        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return ok;
}
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire
#endif

#ifndef arch_atomic_try_cmpxchg_release
/**
 * arch_atomic_try_cmpxchg_release - arch_atomic_try_cmpxchg_relaxed() with a leading release fence
 * @v: pointer of type atomic_t
 * @old: passed through to arch_atomic_try_cmpxchg_relaxed()
 * @new: passed through to arch_atomic_try_cmpxchg_relaxed()
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic_try_cmpxchg_relaxed(@v, @old, @new).
 */
static __always_inline bool
arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool ok;

        __atomic_release_fence();
        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);

        return ok;
}
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release
#endif

#ifndef arch_atomic_try_cmpxchg
/**
 * arch_atomic_try_cmpxchg - arch_atomic_try_cmpxchg_relaxed() bracketed by full fences
 * @v: pointer of type atomic_t
 * @old: passed through to arch_atomic_try_cmpxchg_relaxed()
 * @new: passed through to arch_atomic_try_cmpxchg_relaxed()
 *
 * Calls arch_atomic_try_cmpxchg_relaxed(@v, @old, @new) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline bool
arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool ok;

        __atomic_pre_full_fence();
        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return ok;
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
#endif

#endif /* arch_atomic_try_cmpxchg_relaxed */

#ifndef arch_atomic_sub_and_test
/**
 * arch_atomic_sub_and_test - subtract from an atomic_t and check for zero
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Subtracts @i from @v through arch_atomic_sub_return() and reports
 * whether the value that call returned is zero: true when it is zero,
 * false in every other case.
 */
static __always_inline bool
arch_atomic_sub_and_test(int i, atomic_t *v)
{
        int val = arch_atomic_sub_return(i, v);

        return !val;
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test
#endif

#ifndef arch_atomic_dec_and_test
/**
 * arch_atomic_dec_and_test - decrement an atomic_t and check for zero
 * @v: pointer of type atomic_t
 *
 * Decrements @v by 1 through arch_atomic_dec_return() and reports
 * whether the value that call returned is 0: true when it is 0, false
 * in every other case.
 */
static __always_inline bool
arch_atomic_dec_and_test(atomic_t *v)
{
        int val = arch_atomic_dec_return(v);

        return !val;
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test
#endif

#ifndef arch_atomic_inc_and_test
/**
 * arch_atomic_inc_and_test - increment an atomic_t and check for zero
 * @v: pointer of type atomic_t
 *
 * Increments @v by 1 through arch_atomic_inc_return() and reports
 * whether the value that call returned is zero: true when it is zero,
 * false in every other case.
 */
static __always_inline bool
arch_atomic_inc_and_test(atomic_t *v)
{
        int val = arch_atomic_inc_return(v);

        return !val;
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test
#endif

#ifndef arch_atomic_add_negative
/**
 * arch_atomic_add_negative - add to an atomic_t and check the sign
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Adds @i to @v through arch_atomic_add_return() and returns true if
 * the value that call returned is negative, or false when it is
 * greater than or equal to zero.
 */
static __always_inline bool
arch_atomic_add_negative(int i, atomic_t *v)
{
        int sum = arch_atomic_add_return(i, v);

        return sum < 0;
}
#define arch_atomic_add_negative arch_atomic_add_negative
#endif

#ifndef arch_atomic_fetch_add_unless
/**
 * arch_atomic_fetch_add_unless - add unless the number is already a given value
 * @v: pointer of type atomic_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Reads @v with arch_atomic_read() and, as long as the observed value
 * is not @u, tries to install observed + @a with
 * arch_atomic_try_cmpxchg(), retrying with the refreshed value on
 * failure.
 * Returns the last observed value of @v (equal to @u when nothing was
 * added).
 */
static __always_inline int
arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int cur = arch_atomic_read(v);

        while (likely(cur != u)) {
                /* A failed attempt refreshes cur for the next round. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
}
#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
#endif

#ifndef arch_atomic_add_unless
/**
 * arch_atomic_add_unless - add unless the number is already a given value
 * @v: pointer of type atomic_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Delegates to arch_atomic_fetch_add_unless(@v, @a, @u) and compares
 * the value it returns against @u.
 * Returns true if that value differs from @u, i.e. the addition was
 * done.
 */
static __always_inline bool
arch_atomic_add_unless(atomic_t *v, int a, int u)
{
        int prev = arch_atomic_fetch_add_unless(v, a, u);

        return prev != u;
}
#define arch_atomic_add_unless arch_atomic_add_unless
#endif

#ifndef arch_atomic_inc_not_zero
/**
 * arch_atomic_inc_not_zero - increment unless the number is zero
 * @v: pointer of type atomic_t
 *
 * Implemented as arch_atomic_add_unless(@v, 1, 0): adds 1 to @v if @v
 * is non-zero.
 * Returns true if the increment was done.
 */
static __always_inline bool
arch_atomic_inc_not_zero(atomic_t *v)
{
        bool done = arch_atomic_add_unless(v, 1, 0);

        return done;
}
#define arch_atomic_inc_not_zero arch_atomic_inc_not_zero
#endif

#ifndef arch_atomic_inc_unless_negative
/**
 * arch_atomic_inc_unless_negative - increment unless the observed value is negative
 * @v: pointer of type atomic_t
 *
 * Reads @v with arch_atomic_read() and tries to replace the observed
 * value with observed + 1 using arch_atomic_try_cmpxchg(), retrying
 * with the refreshed value on failure.
 * Returns false as soon as a negative value is observed, true once the
 * exchange succeeds.
 */
static __always_inline bool
arch_atomic_inc_unless_negative(atomic_t *v)
{
        int cur = arch_atomic_read(v);

        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                /* A failed attempt refreshes cur for the next round. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
}
#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative
#endif

#ifndef arch_atomic_dec_unless_positive
/**
 * arch_atomic_dec_unless_positive - decrement unless the observed value is positive
 * @v: pointer of type atomic_t
 *
 * Reads @v with arch_atomic_read() and tries to replace the observed
 * value with observed - 1 using arch_atomic_try_cmpxchg(), retrying
 * with the refreshed value on failure.
 * Returns false as soon as a value greater than zero is observed, true
 * once the exchange succeeds.
 */
static __always_inline bool
arch_atomic_dec_unless_positive(atomic_t *v)
{
        int cur = arch_atomic_read(v);

        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                /* A failed attempt refreshes cur for the next round. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
}
#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive
#endif

#ifndef arch_atomic_dec_if_positive
/**
 * arch_atomic_dec_if_positive - decrement when the result would not be negative
 * @v: pointer of type atomic_t
 *
 * Reads @v with arch_atomic_read(), computes observed - 1 and, unless
 * that is negative, tries to install it with arch_atomic_try_cmpxchg(),
 * retrying with the refreshed value on failure.
 * Returns observed - 1 for the last observed value; the store is
 * skipped when that result is negative.
 */
static __always_inline int
arch_atomic_dec_if_positive(atomic_t *v)
{
        int cur = arch_atomic_read(v);
        int next;

        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        break;
                /* A failed attempt refreshes cur for the next round. */
                if (arch_atomic_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
}
#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive
#endif

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

#ifndef arch_atomic64_read_acquire
/**
 * arch_atomic64_read_acquire - read the counter with smp_load_acquire()
 * @v: pointer of type atomic64_t
 *
 * Returns the result of smp_load_acquire() applied to @v->counter.
 */
static __always_inline s64
arch_atomic64_read_acquire(const atomic64_t *v)
{
        s64 val = smp_load_acquire(&v->counter);

        return val;
}
#define arch_atomic64_read_acquire arch_atomic64_read_acquire
#endif

#ifndef arch_atomic64_set_release
/**
 * arch_atomic64_set_release - write the counter with smp_store_release()
 * @v: pointer of type atomic64_t
 * @i: value to store
 *
 * Stores @i into @v->counter using smp_store_release(). No value is
 * returned.
 */
static __always_inline void arch_atomic64_set_release(atomic64_t *v, s64 i)
{
        smp_store_release(&v->counter, i);
}
#define arch_atomic64_set_release arch_atomic64_set_release
#endif

#ifndef arch_atomic64_add_return_relaxed
#define arch_atomic64_add_return_acquire arch_atomic64_add_return
#define arch_atomic64_add_return_release arch_atomic64_add_return
#define arch_atomic64_add_return_relaxed arch_atomic64_add_return
#else /* arch_atomic64_add_return_relaxed */

#ifndef arch_atomic64_add_return_acquire
/**
 * arch_atomic64_add_return_acquire - arch_atomic64_add_return_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic64_add_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Runs arch_atomic64_add_return_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline s64
arch_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_add_return_acquire arch_atomic64_add_return_acquire
#endif

#ifndef arch_atomic64_add_return_release
/**
 * arch_atomic64_add_return_release - arch_atomic64_add_return_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic64_add_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic64_add_return_relaxed(@i, @v).
 */
static __always_inline s64
arch_atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_add_return_relaxed(i, v);

        return val;
}
#define arch_atomic64_add_return_release arch_atomic64_add_return_release
#endif

#ifndef arch_atomic64_add_return
/**
 * arch_atomic64_add_return - arch_atomic64_add_return_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic64_add_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Calls arch_atomic64_add_return_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline s64
arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_add_return arch_atomic64_add_return
#endif

#endif /* arch_atomic64_add_return_relaxed */

#ifndef arch_atomic64_fetch_add_relaxed
#define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add
#define arch_atomic64_fetch_add_release arch_atomic64_fetch_add
#define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add
#else /* arch_atomic64_fetch_add_relaxed */

#ifndef arch_atomic64_fetch_add_acquire
/**
 * arch_atomic64_fetch_add_acquire - arch_atomic64_fetch_add_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic64_fetch_add_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Runs arch_atomic64_fetch_add_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline s64
arch_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
}
#define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add_acquire
#endif

#ifndef arch_atomic64_fetch_add_release
/**
 * arch_atomic64_fetch_add_release - arch_atomic64_fetch_add_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic64_fetch_add_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic64_fetch_add_relaxed(@i, @v).
 */
static __always_inline s64
arch_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 prev;

        __atomic_release_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);

        return prev;
}
#define arch_atomic64_fetch_add_release arch_atomic64_fetch_add_release
#endif

#ifndef arch_atomic64_fetch_add
/**
 * arch_atomic64_fetch_add - arch_atomic64_fetch_add_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic64_fetch_add_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Calls arch_atomic64_fetch_add_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline s64
arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 prev;

        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
#endif

#endif /* arch_atomic64_fetch_add_relaxed */

#ifndef arch_atomic64_sub_return_relaxed
#define arch_atomic64_sub_return_acquire arch_atomic64_sub_return
#define arch_atomic64_sub_return_release arch_atomic64_sub_return
#define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return
#else /* arch_atomic64_sub_return_relaxed */

#ifndef arch_atomic64_sub_return_acquire
/**
 * arch_atomic64_sub_return_acquire - arch_atomic64_sub_return_relaxed() with a trailing acquire fence
 * @i: operand passed through to arch_atomic64_sub_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Runs arch_atomic64_sub_return_relaxed(@i, @v), issues
 * __atomic_acquire_fence() afterwards, and returns what the relaxed
 * call produced.
 */
static __always_inline s64
arch_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_sub_return_acquire arch_atomic64_sub_return_acquire
#endif

#ifndef arch_atomic64_sub_return_release
/**
 * arch_atomic64_sub_return_release - arch_atomic64_sub_return_relaxed() with a leading release fence
 * @i: operand passed through to arch_atomic64_sub_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Issues __atomic_release_fence() and then returns the result of
 * arch_atomic64_sub_return_relaxed(@i, @v).
 */
static __always_inline s64
arch_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_sub_return_relaxed(i, v);

        return val;
}
#define arch_atomic64_sub_return_release arch_atomic64_sub_return_release
#endif

#ifndef arch_atomic64_sub_return
/**
 * arch_atomic64_sub_return - arch_atomic64_sub_return_relaxed() bracketed by full fences
 * @i: operand passed through to arch_atomic64_sub_return_relaxed()
 * @v: pointer of type atomic64_t
 *
 * Calls arch_atomic64_sub_return_relaxed(@i, @v) between
 * __atomic_pre_full_fence() and __atomic_post_full_fence() and returns
 * the relaxed call's result.
 */
static __always_inline s64
arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
#endif

#endif /* arch_atomic64_sub_return_relaxed */

#ifndef arch_atomic64_fetch_sub_relaxed
#define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub
#define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub
#define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub
#else /* arch_atomic64_fetch_sub_relaxed */

#ifndef arch_atomic64_fetch_sub_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_sub first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub_acquire
#endif

#ifndef arch_atomic64_fetch_sub_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_sub.
 */
static __always_inline s64
arch_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_sub_relaxed(i, v);

        return res;
}
#define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub_release
#endif

#ifndef arch_atomic64_fetch_sub
/*
 * Generic arch_atomic64_fetch_sub(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
#endif

#endif /* arch_atomic64_fetch_sub_relaxed */

#ifndef arch_atomic64_inc
/* Fallback increment: forwards to arch_atomic64_add() with an addend of 1. */
static __always_inline void
arch_atomic64_inc(atomic64_t *v)
{
        arch_atomic64_add(1, v);

        return;
}
#define arch_atomic64_inc arch_atomic64_inc
#endif

#ifndef arch_atomic64_inc_return_relaxed
#ifdef arch_atomic64_inc_return
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return
#define arch_atomic64_inc_return_release arch_atomic64_inc_return
#define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return
#endif /* arch_atomic64_inc_return */

#ifndef arch_atomic64_inc_return
/* Fallback: forwards to arch_atomic64_add_return() with an addend of 1. */
static __always_inline s64
arch_atomic64_inc_return(atomic64_t *v)
{
        s64 res = arch_atomic64_add_return(1, v);

        return res;
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
#endif

#ifndef arch_atomic64_inc_return_acquire
/* Fallback: forwards to arch_atomic64_add_return_acquire() with an addend of 1. */
static __always_inline s64
arch_atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 res = arch_atomic64_add_return_acquire(1, v);

        return res;
}
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire
#endif

#ifndef arch_atomic64_inc_return_release
/* Fallback: forwards to arch_atomic64_add_return_release() with an addend of 1. */
static __always_inline s64
arch_atomic64_inc_return_release(atomic64_t *v)
{
        s64 res = arch_atomic64_add_return_release(1, v);

        return res;
}
#define arch_atomic64_inc_return_release arch_atomic64_inc_return_release
#endif

#ifndef arch_atomic64_inc_return_relaxed
/* Fallback: forwards to arch_atomic64_add_return_relaxed() with an addend of 1. */
static __always_inline s64
arch_atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 res = arch_atomic64_add_return_relaxed(1, v);

        return res;
}
#define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return_relaxed
#endif

#else /* arch_atomic64_inc_return_relaxed */

#ifndef arch_atomic64_inc_return_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * inc_return first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire
#endif

#ifndef arch_atomic64_inc_return_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed inc_return.
 */
static __always_inline s64
arch_atomic64_inc_return_release(atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_inc_return_relaxed(v);

        return res;
}
#define arch_atomic64_inc_return_release arch_atomic64_inc_return_release
#endif

#ifndef arch_atomic64_inc_return
/*
 * Generic arch_atomic64_inc_return(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_inc_return(atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
#endif

#endif /* arch_atomic64_inc_return_relaxed */

#ifndef arch_atomic64_fetch_inc_relaxed
#ifdef arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc
#endif /* arch_atomic64_fetch_inc */

#ifndef arch_atomic64_fetch_inc
/* Fallback: forwards to arch_atomic64_fetch_add() with an addend of 1. */
static __always_inline s64
arch_atomic64_fetch_inc(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_add(1, v);

        return res;
}
#define arch_atomic64_fetch_inc arch_atomic64_fetch_inc
#endif

#ifndef arch_atomic64_fetch_inc_acquire
/* Fallback: forwards to arch_atomic64_fetch_add_acquire() with an addend of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_add_acquire(1, v);

        return res;
}
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire
#endif

#ifndef arch_atomic64_fetch_inc_release
/* Fallback: forwards to arch_atomic64_fetch_add_release() with an addend of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_add_release(1, v);

        return res;
}
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release
#endif

#ifndef arch_atomic64_fetch_inc_relaxed
/* Fallback: forwards to arch_atomic64_fetch_add_relaxed() with an addend of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_add_relaxed(1, v);

        return res;
}
#define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc_relaxed
#endif

#else /* arch_atomic64_fetch_inc_relaxed */

#ifndef arch_atomic64_fetch_inc_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_inc first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire
#endif

#ifndef arch_atomic64_fetch_inc_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_inc.
 */
static __always_inline s64
arch_atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_inc_relaxed(v);

        return res;
}
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release
#endif

#ifndef arch_atomic64_fetch_inc
/*
 * Generic arch_atomic64_fetch_inc(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_inc(atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_inc arch_atomic64_fetch_inc
#endif

#endif /* arch_atomic64_fetch_inc_relaxed */

#ifndef arch_atomic64_dec
/* Fallback decrement: forwards to arch_atomic64_sub() with a subtrahend of 1. */
static __always_inline void
arch_atomic64_dec(atomic64_t *v)
{
        arch_atomic64_sub(1, v);

        return;
}
#define arch_atomic64_dec arch_atomic64_dec
#endif

#ifndef arch_atomic64_dec_return_relaxed
#ifdef arch_atomic64_dec_return
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return
#define arch_atomic64_dec_return_release arch_atomic64_dec_return
#define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return
#endif /* arch_atomic64_dec_return */

#ifndef arch_atomic64_dec_return
/* Fallback: forwards to arch_atomic64_sub_return() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_dec_return(atomic64_t *v)
{
        s64 res = arch_atomic64_sub_return(1, v);

        return res;
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
#endif

#ifndef arch_atomic64_dec_return_acquire
/* Fallback: forwards to arch_atomic64_sub_return_acquire() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 res = arch_atomic64_sub_return_acquire(1, v);

        return res;
}
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire
#endif

#ifndef arch_atomic64_dec_return_release
/* Fallback: forwards to arch_atomic64_sub_return_release() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_dec_return_release(atomic64_t *v)
{
        s64 res = arch_atomic64_sub_return_release(1, v);

        return res;
}
#define arch_atomic64_dec_return_release arch_atomic64_dec_return_release
#endif

#ifndef arch_atomic64_dec_return_relaxed
/* Fallback: forwards to arch_atomic64_sub_return_relaxed() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 res = arch_atomic64_sub_return_relaxed(1, v);

        return res;
}
#define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return_relaxed
#endif

#else /* arch_atomic64_dec_return_relaxed */

#ifndef arch_atomic64_dec_return_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * dec_return first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire
#endif

#ifndef arch_atomic64_dec_return_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed dec_return.
 */
static __always_inline s64
arch_atomic64_dec_return_release(atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_dec_return_relaxed(v);

        return res;
}
#define arch_atomic64_dec_return_release arch_atomic64_dec_return_release
#endif

#ifndef arch_atomic64_dec_return
/*
 * Generic arch_atomic64_dec_return(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_dec_return(atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
#endif

#endif /* arch_atomic64_dec_return_relaxed */

#ifndef arch_atomic64_fetch_dec_relaxed
#ifdef arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec
#endif /* arch_atomic64_fetch_dec */

#ifndef arch_atomic64_fetch_dec
/* Fallback: forwards to arch_atomic64_fetch_sub() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_fetch_dec(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_sub(1, v);

        return res;
}
#define arch_atomic64_fetch_dec arch_atomic64_fetch_dec
#endif

#ifndef arch_atomic64_fetch_dec_acquire
/* Fallback: forwards to arch_atomic64_fetch_sub_acquire() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_sub_acquire(1, v);

        return res;
}
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire
#endif

#ifndef arch_atomic64_fetch_dec_release
/* Fallback: forwards to arch_atomic64_fetch_sub_release() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_sub_release(1, v);

        return res;
}
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release
#endif

#ifndef arch_atomic64_fetch_dec_relaxed
/* Fallback: forwards to arch_atomic64_fetch_sub_relaxed() with a subtrahend of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 res = arch_atomic64_fetch_sub_relaxed(1, v);

        return res;
}
#define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec_relaxed
#endif

#else /* arch_atomic64_fetch_dec_relaxed */

#ifndef arch_atomic64_fetch_dec_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_dec first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire
#endif

#ifndef arch_atomic64_fetch_dec_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_dec.
 */
static __always_inline s64
arch_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_dec_relaxed(v);

        return res;
}
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release
#endif

#ifndef arch_atomic64_fetch_dec
/*
 * Generic arch_atomic64_fetch_dec(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_dec(atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_dec arch_atomic64_fetch_dec
#endif

#endif /* arch_atomic64_fetch_dec_relaxed */

#ifndef arch_atomic64_fetch_and_relaxed
#define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and
#define arch_atomic64_fetch_and_release arch_atomic64_fetch_and
#define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and
#else /* arch_atomic64_fetch_and_relaxed */

#ifndef arch_atomic64_fetch_and_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_and first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and_acquire
#endif

#ifndef arch_atomic64_fetch_and_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_and.
 */
static __always_inline s64
arch_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_and_relaxed(i, v);

        return res;
}
#define arch_atomic64_fetch_and_release arch_atomic64_fetch_and_release
#endif

#ifndef arch_atomic64_fetch_and
/*
 * Generic arch_atomic64_fetch_and(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
#endif

#endif /* arch_atomic64_fetch_and_relaxed */

#ifndef arch_atomic64_andnot
/* Fallback and-not: forwards to arch_atomic64_and() with the complement of @i. */
static __always_inline void
arch_atomic64_andnot(s64 i, atomic64_t *v)
{
        const s64 mask = ~i;

        arch_atomic64_and(mask, v);
}
#define arch_atomic64_andnot arch_atomic64_andnot
#endif

#ifndef arch_atomic64_fetch_andnot_relaxed
#ifdef arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot
#endif /* arch_atomic64_fetch_andnot */

#ifndef arch_atomic64_fetch_andnot
/* Fallback: forwards to arch_atomic64_fetch_and() with the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        const s64 mask = ~i;

        return arch_atomic64_fetch_and(mask, v);
}
#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot
#endif

#ifndef arch_atomic64_fetch_andnot_acquire
/* Fallback: forwards to arch_atomic64_fetch_and_acquire() with the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        const s64 mask = ~i;

        return arch_atomic64_fetch_and_acquire(mask, v);
}
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire
#endif

#ifndef arch_atomic64_fetch_andnot_release
/* Fallback: forwards to arch_atomic64_fetch_and_release() with the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        const s64 mask = ~i;

        return arch_atomic64_fetch_and_release(mask, v);
}
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release
#endif

#ifndef arch_atomic64_fetch_andnot_relaxed
/* Fallback: forwards to arch_atomic64_fetch_and_relaxed() with the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        const s64 mask = ~i;

        return arch_atomic64_fetch_and_relaxed(mask, v);
}
#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed
#endif

#else /* arch_atomic64_fetch_andnot_relaxed */

#ifndef arch_atomic64_fetch_andnot_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_andnot first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire
#endif

#ifndef arch_atomic64_fetch_andnot_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_andnot.
 */
static __always_inline s64
arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_andnot_relaxed(i, v);

        return res;
}
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release
#endif

#ifndef arch_atomic64_fetch_andnot
/*
 * Generic arch_atomic64_fetch_andnot(): the relaxed primitive bracketed
 * by __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot
#endif

#endif /* arch_atomic64_fetch_andnot_relaxed */

#ifndef arch_atomic64_fetch_or_relaxed
#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or
#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or
#define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or
#else /* arch_atomic64_fetch_or_relaxed */

#ifndef arch_atomic64_fetch_or_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_or first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire
#endif

#ifndef arch_atomic64_fetch_or_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_or.
 */
static __always_inline s64
arch_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_or_relaxed(i, v);

        return res;
}
#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release
#endif

#ifndef arch_atomic64_fetch_or
/*
 * Generic arch_atomic64_fetch_or(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
#endif

#endif /* arch_atomic64_fetch_or_relaxed */

#ifndef arch_atomic64_fetch_xor_relaxed
#define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor
#define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor
#define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor
#else /* arch_atomic64_fetch_xor_relaxed */

#ifndef arch_atomic64_fetch_xor_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * fetch_xor first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 res;

        res = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor_acquire
#endif

#ifndef arch_atomic64_fetch_xor_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed fetch_xor.
 */
static __always_inline s64
arch_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_fetch_xor_relaxed(i, v);

        return res;
}
#define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor_release
#endif

#ifndef arch_atomic64_fetch_xor
/*
 * Generic arch_atomic64_fetch_xor(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
#endif

#endif /* arch_atomic64_fetch_xor_relaxed */

#ifndef arch_atomic64_xchg_relaxed
#define arch_atomic64_xchg_acquire arch_atomic64_xchg
#define arch_atomic64_xchg_release arch_atomic64_xchg
#define arch_atomic64_xchg_relaxed arch_atomic64_xchg
#else /* arch_atomic64_xchg_relaxed */

#ifndef arch_atomic64_xchg_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * xchg first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_xchg_acquire(atomic64_t *v, s64 i)
{
        s64 res;

        res = arch_atomic64_xchg_relaxed(v, i);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_xchg_acquire arch_atomic64_xchg_acquire
#endif

#ifndef arch_atomic64_xchg_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed xchg.
 */
static __always_inline s64
arch_atomic64_xchg_release(atomic64_t *v, s64 i)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_xchg_relaxed(v, i);

        return res;
}
#define arch_atomic64_xchg_release arch_atomic64_xchg_release
#endif

#ifndef arch_atomic64_xchg
/*
 * Generic arch_atomic64_xchg(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_xchg(atomic64_t *v, s64 i)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_xchg_relaxed(v, i);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_xchg arch_atomic64_xchg
#endif

#endif /* arch_atomic64_xchg_relaxed */

#ifndef arch_atomic64_cmpxchg_relaxed
#define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg
#define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg
#define arch_atomic64_cmpxchg_relaxed arch_atomic64_cmpxchg
#else /* arch_atomic64_cmpxchg_relaxed */

#ifndef arch_atomic64_cmpxchg_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * cmpxchg first, then issue __atomic_acquire_fence().
 */
static __always_inline s64
arch_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 res;

        res = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return res;
}
#define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg_acquire
#endif

#ifndef arch_atomic64_cmpxchg_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed cmpxchg.
 */
static __always_inline s64
arch_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 res;

        __atomic_release_fence();
        res = arch_atomic64_cmpxchg_relaxed(v, old, new);

        return res;
}
#define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg_release
#endif

#ifndef arch_atomic64_cmpxchg
/*
 * Generic arch_atomic64_cmpxchg(): the relaxed primitive bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 res;

        __atomic_pre_full_fence();
        res = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return res;
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
#endif

#endif /* arch_atomic64_cmpxchg_relaxed */

#ifndef arch_atomic64_try_cmpxchg_relaxed
#ifdef arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg
#endif /* arch_atomic64_try_cmpxchg */

#ifndef arch_atomic64_try_cmpxchg
/*
 * Fallback try_cmpxchg built on arch_atomic64_cmpxchg().
 *
 * Passes *@old as the expected value. If the value handed back differs
 * from it, that value is written to *@old. Returns whether the two
 * were equal.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        const s64 found = arch_atomic64_cmpxchg(v, expected, new);

        if (unlikely(found != expected))
                *old = found;

        return likely(found == expected);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
#endif

#ifndef arch_atomic64_try_cmpxchg_acquire
/*
 * Fallback try_cmpxchg_acquire built on arch_atomic64_cmpxchg_acquire().
 *
 * Passes *@old as the expected value. If the value handed back differs
 * from it, that value is written to *@old. Returns whether the two
 * were equal.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        const s64 found = arch_atomic64_cmpxchg_acquire(v, expected, new);

        if (unlikely(found != expected))
                *old = found;

        return likely(found == expected);
}
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire
#endif

#ifndef arch_atomic64_try_cmpxchg_release
/*
 * Fallback try_cmpxchg_release built on arch_atomic64_cmpxchg_release().
 *
 * Passes *@old as the expected value. If the value handed back differs
 * from it, that value is written to *@old. Returns whether the two
 * were equal.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        const s64 found = arch_atomic64_cmpxchg_release(v, expected, new);

        if (unlikely(found != expected))
                *old = found;

        return likely(found == expected);
}
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release
#endif

#ifndef arch_atomic64_try_cmpxchg_relaxed
/*
 * Fallback try_cmpxchg_relaxed built on arch_atomic64_cmpxchg_relaxed().
 *
 * Passes *@old as the expected value. If the value handed back differs
 * from it, that value is written to *@old. Returns whether the two
 * were equal.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        const s64 found = arch_atomic64_cmpxchg_relaxed(v, expected, new);

        if (unlikely(found != expected))
                *old = found;

        return likely(found == expected);
}
#define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg_relaxed
#endif

#else /* arch_atomic64_try_cmpxchg_relaxed */

#ifndef arch_atomic64_try_cmpxchg_acquire
/*
 * Acquire variant derived from the relaxed op: perform the relaxed
 * try_cmpxchg first, then issue __atomic_acquire_fence().
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return swapped;
}
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire
#endif

#ifndef arch_atomic64_try_cmpxchg_release
/*
 * Release variant derived from the relaxed op: issue
 * __atomic_release_fence() first, then perform the relaxed try_cmpxchg.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        __atomic_release_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release
#endif

#ifndef arch_atomic64_try_cmpxchg
/*
 * Generic arch_atomic64_try_cmpxchg(): the relaxed primitive bracketed
 * by __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline bool
arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        __atomic_pre_full_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return swapped;
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
#endif

#endif /* arch_atomic64_try_cmpxchg_relaxed */

#ifndef arch_atomic64_sub_and_test
/**
 * arch_atomic64_sub_and_test - subtract a value and test for zero
 * @i: integer value to subtract
 * @v: pointer of type atomic64_t
 *
 * Atomically subtracts @i from @v. Returns true when the resulting
 * value is zero and false otherwise.
 */
static __always_inline bool
arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        const s64 remaining = arch_atomic64_sub_return(i, v);

        return remaining == 0;
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
#endif

#ifndef arch_atomic64_dec_and_test
/**
 * arch_atomic64_dec_and_test - decrement and test for zero
 * @v: pointer of type atomic64_t
 *
 * Atomically decrements @v by 1. Returns true when the resulting
 * value is zero and false otherwise.
 */
static __always_inline bool
arch_atomic64_dec_and_test(atomic64_t *v)
{
        const s64 remaining = arch_atomic64_dec_return(v);

        return remaining == 0;
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
#endif

#ifndef arch_atomic64_inc_and_test
/**
 * arch_atomic64_inc_and_test - increment and test for zero
 * @v: pointer of type atomic64_t
 *
 * Atomically increments @v by 1. Returns true when the resulting
 * value is zero and false otherwise.
 */
static __always_inline bool
arch_atomic64_inc_and_test(atomic64_t *v)
{
        const s64 updated = arch_atomic64_inc_return(v);

        return updated == 0;
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
#endif

#ifndef arch_atomic64_add_negative
/**
 * arch_atomic64_add_negative - add a value and test for a negative result
 * @i: integer value to add
 * @v: pointer of type atomic64_t
 *
 * Atomically adds @i to @v. Returns true when the resulting value is
 * negative, and false when it is greater than or equal to zero.
 */
static __always_inline bool
arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        const s64 updated = arch_atomic64_add_return(i, v);

        return updated < 0;
}
#define arch_atomic64_add_negative arch_atomic64_add_negative
#endif

#ifndef arch_atomic64_fetch_add_unless
/**
 * arch_atomic64_fetch_add_unless - add unless the value equals a given one
 * @v: pointer of type atomic64_t
 * @a: the amount to add to @v...
 * @u: ...unless @v is equal to @u.
 *
 * Atomically adds @a to @v as long as @v was not already @u.
 * Returns the original value of @v.
 */
static __always_inline s64
arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 cur = arch_atomic64_read(v);

        for (;;) {
                /* Forbidden value seen: give up without storing. */
                if (unlikely(cur == u))
                        break;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
}
#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
#endif

#ifndef arch_atomic64_add_unless
/**
 * arch_atomic64_add_unless - add unless the value equals a given one
 * @v: pointer of type atomic64_t
 * @a: the amount to add to @v...
 * @u: ...unless @v is equal to @u.
 *
 * Atomically adds @a to @v if @v was not already @u.
 * Returns true if the addition was performed.
 */
static __always_inline bool
arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        const s64 before = arch_atomic64_fetch_add_unless(v, a, u);

        return before != u;
}
#define arch_atomic64_add_unless arch_atomic64_add_unless
#endif

#ifndef arch_atomic64_inc_not_zero
/**
 * arch_atomic64_inc_not_zero - increment unless the value is zero
 * @v: pointer of type atomic64_t
 *
 * Atomically increments @v by 1 if @v is non-zero.
 * Returns true if the increment was performed.
 */
static __always_inline bool
arch_atomic64_inc_not_zero(atomic64_t *v)
{
        bool done = arch_atomic64_add_unless(v, 1, 0);

        return done;
}
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
#endif

#ifndef arch_atomic64_inc_unless_negative
/*
 * Increment @v by 1 unless its value is negative.
 *
 * Returns true if the increment was stored, false if a negative value
 * was observed (in which case @v is left unmodified by this call).
 */
static __always_inline bool
arch_atomic64_inc_unless_negative(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);

        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
}
#define arch_atomic64_inc_unless_negative arch_atomic64_inc_unless_negative
#endif

#ifndef arch_atomic64_dec_unless_positive
/*
 * Decrement @v by 1 unless its value is positive.
 *
 * Returns true if the decrement was stored, false if a value greater
 * than zero was observed (in which case @v is left unmodified by this
 * call).
 */
static __always_inline bool
arch_atomic64_dec_unless_positive(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);

        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
}
#define arch_atomic64_dec_unless_positive arch_atomic64_dec_unless_positive
#endif

#ifndef arch_atomic64_dec_if_positive
/*
 * Decrement @v by 1 only when the result would not be negative.
 *
 * Returns the observed value minus one. A negative return therefore
 * means no store was performed.
 */
static __always_inline s64
arch_atomic64_dec_if_positive(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);
        s64 next;

        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        break;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic64_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
}
#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
#endif

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// 90cd26cfd69d2250303d654955a0cc12620fb91b



















































































































































































































































































































































































































    1 


    1 





















    1 











































































































    7 
    7 
    7 
    7 

























    7 











































































































































































































































































































































































































































































































    1 




    1 



    1 




































    1 












































    7 




    7 



    7 

    7 






























    6 


















    1 











    4 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the kernel access vector cache (AVC).
 *
 * Authors:  Stephen Smalley, <sds@tycho.nsa.gov>
 *             James Morris <jmorris@redhat.com>
 *
 * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
 *        Replaced the avc_lock spinlock by RCU.
 *
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
#include <linux/ip.h>
#include <linux/audit.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>
#include "avc.h"
#include "avc_ss.h"
#include "classmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/avc.h>

/*
 * AVC_CACHE_SLOTS: number of hash buckets in struct avc_cache.  avc_hash()
 * and avc_reclaim_node() reduce values with "& (AVC_CACHE_SLOTS - 1)", so
 * this has to stay a power of two.
 * AVC_DEF_CACHE_THRESHOLD: initial avc_cache_threshold set by
 * selinux_avc_init(); avc_alloc_node() starts reclaiming once the number of
 * active nodes exceeds the threshold.
 * AVC_CACHE_RECLAIM: avc_reclaim_node() stops after deleting this many nodes.
 */
#define AVC_CACHE_SLOTS                        512
#define AVC_DEF_CACHE_THRESHOLD                512
#define AVC_CACHE_RECLAIM                16

/*
 * Bump one per-CPU statistics counter; compiles to an empty statement when
 * CONFIG_SECURITY_SELINUX_AVC_STATS is not set.
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
#define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
#else
#define avc_cache_stats_incr(field)        do {} while (0)
#endif

/*
 * Payload of one cache entry.  (ssid, tsid, tclass) is the lookup key used by
 * avc_search_node(); avd is the cached access decision; xp_node optionally
 * points at the cached extended-permission data (NULL when there is none).
 */
struct avc_entry {
        u32                        ssid;
        u32                        tsid;
        u16                        tclass;
        struct av_decision        avd;
        struct avc_xperms_node        *xp_node;
};

/*
 * One cache node: the entry itself, its linkage into a hash slot, and the
 * rcu_head handed to call_rcu() so the node is released by avc_node_free().
 */
struct avc_node {
        struct avc_entry        ae;
        struct hlist_node        list; /* anchored in avc_cache->slots[i] */
        struct rcu_head                rhead;
};

/*
 * List element wrapping one extended_perms_decision; linked through xpd_list
 * into avc_xperms_node.xpd_head.
 */
struct avc_xperms_decision_node {
        struct extended_perms_decision xpd;
        struct list_head xpd_list; /* list of extended_perms_decision */
};

/*
 * Extended-permission data attached to a cache entry: the extended_perms
 * summary plus the list of per-driver decisions hanging off xpd_head.
 */
struct avc_xperms_node {
        struct extended_perms xp;
        struct list_head xpd_head; /* list head of extended_perms_decision */
};

/*
 * The hash table itself.  Each slot has its own spinlock taken by writers
 * (insert, update, reclaim, flush); lookups walk the slot lists with the
 * _rcu list iterators instead.  active_nodes is incremented in
 * avc_alloc_node() and decremented whenever a node is deleted, killed or
 * replaced.
 */
struct avc_cache {
        struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
        spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
        atomic_t                lru_hint;        /* LRU hint for reclaim scan */
        atomic_t                active_nodes;
        u32                        latest_notif;        /* latest revocation notification */
};

/*
 * Entry in the singly linked avc_callbacks list built by avc_add_callback().
 * events is a mask of AVC_CALLBACK_* values the callback wants to receive.
 */
struct avc_callback_node {
        int (*callback) (u32 event);
        u32 events;
        struct avc_callback_node *next;
};

/* Per-CPU counters updated through avc_cache_stats_incr(); only built with stats enabled. */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
#endif

/*
 * Top-level AVC state: the reclaim threshold (see avc_alloc_node()) and the
 * cache proper.
 */
struct selinux_avc {
        unsigned int avc_cache_threshold;
        struct avc_cache avc_cache;
};

/* The single instance; selinux_avc_init() initializes it and hands out its address. */
static struct selinux_avc selinux_avc;

/*
 * Initialize the file-static AVC instance and return its address in @avc.
 *
 * The threshold is set to its default, both cache counters are zeroed and
 * every hash slot gets an empty list head and an initialized spinlock.
 */
void selinux_avc_init(struct selinux_avc **avc)
{
        int i = 0;

        selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
        atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
        atomic_set(&selinux_avc.avc_cache.active_nodes, 0);

        while (i < AVC_CACHE_SLOTS) {
                INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
                spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
                i++;
        }

        /* Publish the instance only after it is fully set up. */
        *avc = &selinux_avc;
}

/* Return the node-count threshold above which avc_alloc_node() reclaims. */
unsigned int avc_get_cache_threshold(struct selinux_avc *avc)
{
        unsigned int threshold = avc->avc_cache_threshold;

        return threshold;
}

/*
 * Replace the reclaim threshold of @avc with @cache_threshold.  The value is
 * stored as given; no range check is applied here.
 */
void avc_set_cache_threshold(struct selinux_avc *avc,
                             unsigned int cache_threshold)
{
        struct selinux_avc *target = avc;

        target->avc_cache_threshold = cache_threshold;
}

/* Head of the callback list; avc_add_callback() pushes new entries at the front. */
static struct avc_callback_node *avc_callbacks;
/* Slab caches created in avc_init(), one per object type allocated in this file. */
static struct kmem_cache *avc_node_cachep;
static struct kmem_cache *avc_xperms_data_cachep;
static struct kmem_cache *avc_xperms_decision_cachep;
static struct kmem_cache *avc_xperms_cachep;

/*
 * Map a (ssid, tsid, tclass) triple to a slot index in
 * [0, AVC_CACHE_SLOTS - 1] by XOR-mixing the shifted inputs and masking.
 */
static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
{
        u32 mixed = ssid;

        mixed ^= tsid << 2;
        mixed ^= tclass << 4;

        return mixed & (AVC_CACHE_SLOTS - 1);
}

/**
 * avc_init - Initialize the AVC.
 *
 * Initialize the access vector cache by creating the four slab caches used
 * in this file: cache nodes, extended-permission nodes, extended-permission
 * decision nodes and the extended-permission data blocks they point to.
 *
 * The return values are not checked here; every cache is created with
 * SLAB_PANIC (presumably making a creation failure fatal - confirm against
 * the slab API).
 */
void __init avc_init(void)
{
        avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
                                        sizeof(struct avc_xperms_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_decision_cachep = kmem_cache_create(
                                        "avc_xperms_decision_node",
                                        sizeof(struct avc_xperms_decision_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
                                        sizeof(struct extended_perms_data),
                                        0, SLAB_PANIC, NULL);
}

int avc_get_hash_stats(struct selinux_avc *avc, char *page)
{
        int i, chain_len, max_chain_len, slots_used;
        struct avc_node *node;
        struct hlist_head *head;

        rcu_read_lock();

        slots_used = 0;
        max_chain_len = 0;
        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &avc->avc_cache.slots[i];
                if (!hlist_empty(head)) {
                        slots_used++;
                        chain_len = 0;
                        hlist_for_each_entry_rcu(node, head, list)
                                chain_len++;
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                }
        }

        rcu_read_unlock();

        return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n",
                         atomic_read(&avc->avc_cache.active_nodes),
                         slots_used, AVC_CACHE_SLOTS, max_chain_len);
}

/*
 * Find the extended_perms_decision recorded for @driver on @xp_node.
 *
 * A plain linked-list walk is used because the list is always small,
 * i.e. fewer than 5 entries and typically just 1.
 *
 * Returns the matching decision, or NULL if @driver has none.
 */
static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
                                        struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *cur;

        list_for_each_entry(cur, &xp_node->xpd_head, xpd_list) {
                if (cur->xpd.driver != driver)
                        continue;
                return &cur->xpd;
        }

        return NULL;
}

/*
 * Test bit @perm in one of the permission sets of @xpd.
 *
 * @which selects the set (XPERMS_ALLOWED, XPERMS_AUDITALLOW or
 * XPERMS_DONTAUDIT).  The set is only consulted when the matching bit is
 * also present in xpd->used; otherwise, or for any other @which, the result
 * is 0.
 */
static inline unsigned int
avc_xperms_has_perm(struct extended_perms_decision *xpd,
                                        u8 perm, u8 which)
{
        if (which == XPERMS_ALLOWED && (xpd->used & XPERMS_ALLOWED))
                return security_xperm_test(xpd->allowed->p, perm);

        if (which == XPERMS_AUDITALLOW && (xpd->used & XPERMS_AUDITALLOW))
                return security_xperm_test(xpd->auditallow->p, perm);

        if (which == XPERMS_DONTAUDIT && (xpd->used & XPERMS_DONTAUDIT))
                return security_xperm_test(xpd->dontaudit->p, perm);

        return 0;
}

/*
 * Mark @driver as present in @xp_node and, when a decision with an
 * "allowed" set already exists for that driver, set bit @perm in it.
 * Nothing is allocated here; a missing decision is left missing.
 */
static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                u8 driver, u8 perm)
{
        struct extended_perms_decision *decision;

        security_xperm_set(xp_node->xp.drivers.p, driver);

        decision = avc_xperms_decision_lookup(driver, xp_node);
        if (!decision || !decision->allowed)
                return;

        security_xperm_set(decision->allowed->p, perm);
}

/*
 * Release a decision node: first whichever of its three permission data
 * blocks were allocated, then the node itself.
 */
static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
{
        if (xpd_node->xpd.allowed)
                kmem_cache_free(avc_xperms_data_cachep, xpd_node->xpd.allowed);

        if (xpd_node->xpd.auditallow)
                kmem_cache_free(avc_xperms_data_cachep,
                                xpd_node->xpd.auditallow);

        if (xpd_node->xpd.dontaudit)
                kmem_cache_free(avc_xperms_data_cachep,
                                xpd_node->xpd.dontaudit);

        kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
}

/*
 * Free @xp_node together with every decision node on its list.
 * A NULL @xp_node is accepted and ignored.
 */
static void avc_xperms_free(struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *cur, *next;

        if (xp_node) {
                /* _safe iteration: each element is unlinked and freed in turn */
                list_for_each_entry_safe(cur, next, &xp_node->xpd_head,
                                         xpd_list) {
                        list_del(&cur->xpd_list);
                        avc_xperms_decision_free(cur);
                }
                kmem_cache_free(avc_xperms_cachep, xp_node);
        }
}

/*
 * Copy the driver, the "used" mask and every permission set flagged in that
 * mask from @src to @dest.  @dest must already have a data block for each
 * set that @src uses (see avc_xperms_decision_alloc()).
 */
static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                        struct extended_perms_decision *src)
{
        dest->used = src->used;
        dest->driver = src->driver;

        if (dest->used & XPERMS_ALLOWED) {
                memcpy(dest->allowed->p, src->allowed->p,
                       sizeof(dest->allowed->p));
        }
        if (dest->used & XPERMS_AUDITALLOW) {
                memcpy(dest->auditallow->p, src->auditallow->p,
                       sizeof(dest->auditallow->p));
        }
        if (dest->used & XPERMS_DONTAUDIT) {
                memcpy(dest->dontaudit->p, src->dontaudit->p,
                       sizeof(dest->dontaudit->p));
        }
}

/*
 * Cheaper variant of avc_copy_xperms_decision(): besides the "used" mask,
 * only the single 32-bit word that holds bit @perm is copied for each set
 * in use.  The driver field is not copied.
 */
static inline void avc_quick_copy_xperms_decision(u8 perm,
                        struct extended_perms_decision *dest,
                        struct extended_perms_decision *src)
{
        /* 256 permission bits are stored as 8 u32 words; pick the word. */
        u8 word = perm / 32;

        dest->used = src->used;

        if (dest->used & XPERMS_ALLOWED) {
                dest->allowed->p[word] = src->allowed->p[word];
        }
        if (dest->used & XPERMS_AUDITALLOW) {
                dest->auditallow->p[word] = src->auditallow->p[word];
        }
        if (dest->used & XPERMS_DONTAUDIT) {
                dest->dontaudit->p[word] = src->dontaudit->p[word];
        }
}

/*
 * Allocate a zeroed decision node plus one zeroed data block for every set
 * named in @which (XPERMS_ALLOWED, XPERMS_AUDITALLOW, XPERMS_DONTAUDIT).
 * All allocations use GFP_NOWAIT | __GFP_NOWARN.
 *
 * Returns the node, or NULL if any allocation failed; on failure whatever
 * had been allocated is released again.
 */
static struct avc_xperms_decision_node
                *avc_xperms_decision_alloc(u8 which)
{
        struct avc_xperms_decision_node *fresh;

        fresh = kmem_cache_zalloc(avc_xperms_decision_cachep,
                                  GFP_NOWAIT | __GFP_NOWARN);
        if (!fresh)
                return NULL;

        if (which & XPERMS_ALLOWED) {
                fresh->xpd.allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                GFP_NOWAIT | __GFP_NOWARN);
                if (!fresh->xpd.allowed)
                        goto fail;
        }

        if (which & XPERMS_AUDITALLOW) {
                fresh->xpd.auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                GFP_NOWAIT | __GFP_NOWARN);
                if (!fresh->xpd.auditallow)
                        goto fail;
        }

        if (which & XPERMS_DONTAUDIT) {
                fresh->xpd.dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                GFP_NOWAIT | __GFP_NOWARN);
                if (!fresh->xpd.dontaudit)
                        goto fail;
        }

        return fresh;

fail:
        /* the node was zeroed, so only the blocks actually allocated are freed */
        avc_xperms_decision_free(fresh);
        return NULL;
}

static int avc_add_xperms_decision(struct avc_node *node,
                        struct extended_perms_decision *src)
{
        struct avc_xperms_decision_node *dest_xpd;

        dest_xpd = avc_xperms_decision_alloc(src->used);
        if (!dest_xpd)
                return -ENOMEM;
        avc_copy_xperms_decision(&dest_xpd->xpd, src);
        list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
        node->ae.xp_node->xp.len++;
        return 0;
}

/*
 * Allocate a zeroed avc_xperms_node with an empty decision list.
 * Returns NULL when the GFP_NOWAIT allocation fails.
 */
static struct avc_xperms_node *avc_xperms_alloc(void)
{
        struct avc_xperms_node *fresh;

        fresh = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN);
        if (fresh)
                INIT_LIST_HEAD(&fresh->xpd_head);

        return fresh;
}

/*
 * Give @node its own deep copy of the extended-permission data in @src.
 *
 * When @src records no decisions (xp.len == 0) nothing is copied and
 * @node->ae.xp_node is left as it was.
 *
 * Returns 0 on success, -ENOMEM if any allocation fails; on failure the
 * partial copy is freed and @node is not modified.
 */
static int avc_xperms_populate(struct avc_node *node,
                                struct avc_xperms_node *src)
{
        struct avc_xperms_decision_node *from, *to;
        struct avc_xperms_node *copy;

        if (!src->xp.len)
                return 0;

        copy = avc_xperms_alloc();
        if (!copy)
                return -ENOMEM;

        copy->xp.len = src->xp.len;
        memcpy(copy->xp.drivers.p, src->xp.drivers.p, sizeof(copy->xp.drivers.p));

        /* duplicate every decision hanging off the source */
        list_for_each_entry(from, &src->xpd_head, xpd_list) {
                to = avc_xperms_decision_alloc(from->xpd.used);
                if (!to) {
                        avc_xperms_free(copy);
                        return -ENOMEM;
                }
                avc_copy_xperms_decision(&to->xpd, &from->xpd);
                list_add(&to->xpd_list, &copy->xpd_head);
        }

        node->ae.xp_node = copy;
        return 0;
}

/*
 * Work out which of the @requested permissions must be audited.
 *
 * Three cases, in order:
 *  - some permission is denied by @avd: audit the denied bits that are in
 *    avd->auditdeny, unless @xpd marks @perm as dontaudit;
 *  - nothing is denied but @result is non-zero: treat everything requested
 *    as both denied and audited;
 *  - otherwise: audit the requested bits that are in avd->auditallow, but
 *    only if @xpd (when given) marks @perm as auditallow.
 *
 * The denied mask is stored through @deniedp; the audited mask is returned.
 */
static inline u32 avc_xperms_audit_required(u32 requested,
                                        struct av_decision *avd,
                                        struct extended_perms_decision *xpd,
                                        u8 perm,
                                        int result,
                                        u32 *deniedp)
{
        u32 audited;
        u32 denied = requested & ~avd->allowed;

        if (unlikely(denied)) {
                audited = denied & avd->auditdeny;
                if (audited && xpd &&
                    avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
                        audited &= ~requested;
        } else if (result) {
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
                if (audited && xpd &&
                    !avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                        audited &= ~requested;
        }

        *deniedp = denied;
        return audited;
}

/*
 * Audit an extended-permission check if required.
 *
 * Returns 0 when nothing needs auditing, otherwise whatever
 * slow_avc_audit() returns.
 */
static inline int avc_xperms_audit(struct selinux_state *state,
                                   u32 ssid, u32 tsid, u16 tclass,
                                   u32 requested, struct av_decision *avd,
                                   struct extended_perms_decision *xpd,
                                   u8 perm, int result,
                                   struct common_audit_data *ad)
{
        u32 denied;
        u32 audited = avc_xperms_audit_required(requested, avd, xpd, perm,
                                                result, &denied);

        if (unlikely(audited))
                return slow_avc_audit(state, ssid, tsid, tclass, requested,
                                      audited, denied, result, ad);

        return 0;
}

/*
 * Release a node and its extended-permission data.  This is the function
 * handed to call_rcu() by avc_node_delete() and avc_node_replace(); @rhead
 * is the rcu_head embedded in the node being freed.
 */
static void avc_node_free(struct rcu_head *rhead)
{
        struct avc_node *node = container_of(rhead, struct avc_node, rhead);
        avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
}

/*
 * Unlink @node from its hash slot and schedule it for freeing via
 * call_rcu().  Every caller in this file holds the slot's spinlock.
 */
static void avc_node_delete(struct selinux_avc *avc, struct avc_node *node)
{
        /* unlink first; the actual free is deferred to avc_node_free() */
        hlist_del_rcu(&node->list);
        call_rcu(&node->rhead, avc_node_free);
        atomic_dec(&avc->avc_cache.active_nodes);
}

/*
 * Free @node immediately, without going through call_rcu().  The callers in
 * this file use it only for a freshly allocated node that has not been
 * linked into any slot list.  The active_nodes decrement undoes the
 * increment done in avc_alloc_node().
 */
static void avc_node_kill(struct selinux_avc *avc, struct avc_node *node)
{
        avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
        atomic_dec(&avc->avc_cache.active_nodes);
}

/*
 * Put @new in @old's place in the hash slot and schedule @old for freeing
 * via call_rcu().  active_nodes drops by one because @new was already
 * counted when it was allocated.  Every caller in this file holds the
 * slot's spinlock.
 */
static void avc_node_replace(struct selinux_avc *avc,
                             struct avc_node *new, struct avc_node *old)
{
        hlist_replace_rcu(&old->list, &new->list);
        call_rcu(&old->rhead, avc_node_free);
        atomic_dec(&avc->avc_cache.active_nodes);
}

static inline int avc_reclaim_node(struct selinux_avc *avc)
{
        struct avc_node *node;
        int hvalue, try, ecx;
        unsigned long flags;
        struct hlist_head *head;
        spinlock_t *lock;

        for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
                hvalue = atomic_inc_return(&avc->avc_cache.lru_hint) &
                        (AVC_CACHE_SLOTS - 1);
                head = &avc->avc_cache.slots[hvalue];
                lock = &avc->avc_cache.slots_lock[hvalue];

                if (!spin_trylock_irqsave(lock, flags))
                        continue;

                rcu_read_lock();
                hlist_for_each_entry(node, head, list) {
                        avc_node_delete(avc, node);
                        avc_cache_stats_incr(reclaims);
                        ecx++;
                        if (ecx >= AVC_CACHE_RECLAIM) {
                                rcu_read_unlock();
                                spin_unlock_irqrestore(lock, flags);
                                goto out;
                        }
                }
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flags);
        }
out:
        return ecx;
}

/*
 * Allocate a zeroed, unlinked cache node and count it in active_nodes.
 * If the new count exceeds the configured threshold, a reclaim pass is run
 * before returning.
 *
 * Returns the node, or NULL when the GFP_NOWAIT allocation fails.
 */
static struct avc_node *avc_alloc_node(struct selinux_avc *avc)
{
        struct avc_node *fresh;

        fresh = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN);
        if (fresh) {
                INIT_HLIST_NODE(&fresh->list);
                avc_cache_stats_incr(allocations);

                if (atomic_inc_return(&avc->avc_cache.active_nodes) >
                    avc->avc_cache_threshold)
                        avc_reclaim_node(avc);
        }

        return fresh;
}

/*
 * Fill in the lookup key (@ssid, @tsid, @tclass) of @node and copy the
 * access decision @avd into it.  The xp_node pointer is not touched.
 */
static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid,
                              u16 tclass, struct av_decision *avd)
{
        memcpy(&node->ae.avd, avd, sizeof(node->ae.avd));
        node->ae.tclass = tclass;
        node->ae.tsid = tsid;
        node->ae.ssid = ssid;
}

/*
 * Walk the hash slot for (@ssid, @tsid, @tclass) with the RCU list iterator
 * and return the first node whose key matches, or NULL if there is none.
 */
static inline struct avc_node *avc_search_node(struct selinux_avc *avc,
                                               u32 ssid, u32 tsid, u16 tclass)
{
        struct hlist_head *bucket;
        struct avc_node *cur;

        bucket = &avc->avc_cache.slots[avc_hash(ssid, tsid, tclass)];

        hlist_for_each_entry_rcu(cur, bucket, list) {
                if (ssid != cur->ae.ssid)
                        continue;
                if (tclass != cur->ae.tclass)
                        continue;
                if (tsid != cur->ae.tsid)
                        continue;
                return cur;
        }

        return NULL;
}

/**
 * avc_lookup - Look up an AVC entry.
 * @avc: the access vector cache to search
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 *
 * Search the cache for an entry matching the SID pair (@ssid, @tsid) and
 * class @tclass.  Every call counts as a lookup in the cache statistics,
 * and a failed search additionally counts as a miss.
 *
 * Returns the matching avc_node, or NULL if no entry exists.
 */
static struct avc_node *avc_lookup(struct selinux_avc *avc,
                                   u32 ssid, u32 tsid, u16 tclass)
{
        struct avc_node *hit;

        avc_cache_stats_incr(lookups);

        hit = avc_search_node(avc, ssid, tsid, tclass);
        if (!hit)
                avc_cache_stats_incr(misses);

        return hit;
}

static int avc_latest_notif_update(struct selinux_avc *avc,
                                   int seqno, int is_insert)
{
        int ret = 0;
        static DEFINE_SPINLOCK(notif_lock);
        unsigned long flag;

        spin_lock_irqsave(&notif_lock, flag);
        if (is_insert) {
                if (seqno < avc->avc_cache.latest_notif) {
                        pr_warn("SELinux: avc:  seqno %d < latest_notif %d\n",
                               seqno, avc->avc_cache.latest_notif);
                        ret = -EAGAIN;
                }
        } else {
                if (seqno > avc->avc_cache.latest_notif)
                        avc->avc_cache.latest_notif = seqno;
        }
        spin_unlock_irqrestore(&notif_lock, flag);

        return ret;
}

/**
 * avc_insert - Insert an AVC entry.
 * @avc: the access vector cache to insert into
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @avd: resulting av decision
 * @xp_node: resulting extended permissions
 *
 * Create a cache entry for the SID pair (@ssid, @tsid) and class @tclass
 * holding a copy of @avd and of the extended permissions in @xp_node.
 * The access vectors and the sequence number are normally provided by the
 * security server in response to a security_compute_av() call.
 *
 * The entry is only created when @avd->seqno is not less than the latest
 * revocation notification.  If the slot already holds an entry with the
 * same key, that entry is replaced; otherwise the new node is added at the
 * head of the slot.
 *
 * Returns the inserted avc_node, or NULL if the sequence number is stale or
 * an allocation failed.
 */
static struct avc_node *avc_insert(struct selinux_avc *avc,
                                   u32 ssid, u32 tsid, u16 tclass,
                                   struct av_decision *avd,
                                   struct avc_xperms_node *xp_node)
{
        struct avc_node *pos, *node, *old = NULL;
        struct hlist_head *head;
        unsigned long flag;
        spinlock_t *lock;
        int hvalue;

        if (avc_latest_notif_update(avc, avd->seqno, 1))
                return NULL;

        node = avc_alloc_node(avc);
        if (!node)
                return NULL;

        avc_node_populate(node, ssid, tsid, tclass, avd);
        if (avc_xperms_populate(node, xp_node)) {
                /* never linked into a slot, so it can be freed right away */
                avc_node_kill(avc, node);
                return NULL;
        }

        hvalue = avc_hash(ssid, tsid, tclass);
        head = &avc->avc_cache.slots[hvalue];
        lock = &avc->avc_cache.slots_lock[hvalue];

        spin_lock_irqsave(lock, flag);
        hlist_for_each_entry(pos, head, list) {
                if (pos->ae.ssid == ssid &&
                    pos->ae.tsid == tsid &&
                    pos->ae.tclass == tclass) {
                        old = pos;
                        break;
                }
        }
        if (old)
                avc_node_replace(avc, node, old);
        else
                hlist_add_head_rcu(&node->list, head);
        spin_unlock_irqrestore(lock, flag);

        return node;
}

/**
 * avc_audit_pre_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Emits the leading part of an AVC audit record: "denied" or "granted"
 * followed by the brace-enclosed names of the audited permissions.  Bits of
 * the audited mask that have no name in the class's permission table are
 * printed together as one hexadecimal value.
 */
static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        u32 av = sad->audited;
        const char **perms;
        /*
         * Both are unsigned: perm walks a single bit through all 32
         * positions of a u32 mask, which would shift into and past the sign
         * bit of an int, and i is compared against a size_t expression.
         */
        u32 i, perm;

        audit_log_format(ab, "avc:  %s ", sad->denied ? "denied" : "granted");

        if (av == 0) {
                audit_log_format(ab, " null");
                return;
        }

        /* class values are 1-based; the table is 0-based */
        perms = secclass_map[sad->tclass-1].perms;

        audit_log_format(ab, " {");
        i = 0;
        perm = 1;
        while (i < (sizeof(av) * 8)) {
                if ((perm & av) && perms[i]) {
                        audit_log_format(ab, " %s", perms[i]);
                        /* clear named bits so only unnamed ones remain */
                        av &= ~perm;
                }
                i++;
                perm <<= 1;
        }

        if (av)
                audit_log_format(ab, " 0x%x", av);

        audit_log_format(ab, " } for ");
}

/**
 * avc_audit_post_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Emits the trailing part of an AVC audit record: source and target
 * contexts (or the raw SID numbers when a SID cannot be converted), the
 * class name, the permissive flag for denials, and finally the raw context
 * strings reported by security_sid_to_context_inval().
 */
static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        char *scontext = NULL;
        char *tcontext = NULL;
        const char *tclass = NULL;
        u32 scontext_len;
        u32 tcontext_len;
        int rc;

        rc = security_sid_to_context(sad->state, sad->ssid, &scontext,
                                     &scontext_len);
        if (rc)
                audit_log_format(ab, " ssid=%d", sad->ssid);
        else
                audit_log_format(ab, " scontext=%s", scontext);

        rc = security_sid_to_context(sad->state, sad->tsid, &tcontext,
                                     &tcontext_len);
        if (rc)
                audit_log_format(ab, " tsid=%d", sad->tsid);
        else
                audit_log_format(ab, " tcontext=%s", tcontext);

        /* class values are 1-based; the table is 0-based */
        tclass = secclass_map[sad->tclass-1].name;
        audit_log_format(ab, " tclass=%s", tclass);

        /* permissive=1 means the denial was logged but not enforced (result == 0) */
        if (sad->denied)
                audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);

        /* the tracepoint needs the strings, so they are freed only afterwards */
        trace_selinux_audited(sad, scontext, tcontext, tclass);
        kfree(tcontext);
        kfree(scontext);

        /* in case of invalid context report also the actual context string */
        rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                /* do not log the terminating NUL as part of the string */
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " srawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }

        /* scontext/scontext_len are reused here for the target SID */
        rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " trawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }
}

/*
 * Slow path of AVC auditing.  Kept out of line because the two on-stack
 * audit structures give it a big stack footprint.
 *
 * Collects the check's parameters into a selinux_audit_data, attaches it to
 * @a (or to a local LSM_AUDIT_DATA_NONE record when @a is NULL) and passes
 * it to common_lsm_audit() with the AVC pre/post callbacks.
 *
 * Returns 0, or -EINVAL (with a warning) when @tclass is zero or not below
 * the size of secclass_map.
 */
noinline int slow_avc_audit(struct selinux_state *state,
                            u32 ssid, u32 tsid, u16 tclass,
                            u32 requested, u32 audited, u32 denied, int result,
                            struct common_audit_data *a)
{
        struct selinux_audit_data sad;
        struct common_audit_data stack_data;

        if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
                return -EINVAL;

        if (a == NULL) {
                stack_data.type = LSM_AUDIT_DATA_NONE;
                a = &stack_data;
        }

        sad.state = state;
        sad.ssid = ssid;
        sad.tsid = tsid;
        sad.tclass = tclass;
        sad.requested = requested;
        sad.audited = audited;
        sad.denied = denied;
        sad.result = result;

        a->selinux_audit_data = &sad;

        common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);
        return 0;
}

/**
 * avc_add_callback - Register a callback for security events.
 * @callback: callback function
 * @events: security events
 *
 * Register @callback for the events in the set @events by pushing a new
 * entry onto the front of the avc_callbacks list.
 *
 * Returns %0 on success or -%ENOMEM if insufficient memory
 * exists to add the callback.
 */
int __init avc_add_callback(int (*callback)(u32 event), u32 events)
{
        struct avc_callback_node *entry;

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->callback = callback;
        entry->events = events;
        entry->next = avc_callbacks;
        avc_callbacks = entry;

        return 0;
}

/**
 * avc_update_node - Update an AVC entry
 * @avc: the access vector cache holding the entry
 * @event : Updating event
 * @perms : Permission mask bits
 * @driver: extended-permission driver, used for AVC_CALLBACK_GRANT together
 *          with AVC_EXTENDED_PERMS
 * @xperm: extended permission to allow, used together with @driver
 * @ssid,@tsid,@tclass : identifier of an AVC entry
 * @seqno : sequence number when decision was made
 * @xpd: extended_perms_decision to be added to the node
 * @flags: the AVC_* flags, e.g. AVC_NONBLOCKING, AVC_EXTENDED_PERMS, or 0.
 *
 * if a valid AVC entry doesn't exist, this function returns -ENOENT.
 * if an internal allocation fails, this function returns -ENOMEM and the
 * original entry is left in place unchanged.
 * otherwise, this function updates the AVC entry by replacing it with a
 * modified copy. The original AVC-entry object will be released later by RCU.
 * With AVC_NONBLOCKING set nothing is done and 0 is returned.
 */
static int avc_update_node(struct selinux_avc *avc,
                           u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
                           u32 tsid, u16 tclass, u32 seqno,
                           struct extended_perms_decision *xpd,
                           u32 flags)
{
        int hvalue, rc = 0;
        unsigned long flag;
        struct avc_node *pos, *node, *orig = NULL;
        struct hlist_head *head;
        spinlock_t *lock;

        /*
         * If we are in a non-blocking code path, e.g. VFS RCU walk,
         * then we must not add permissions to a cache entry
         * because we will not audit the denial.  Otherwise,
         * during the subsequent blocking retry (e.g. VFS ref walk), we
         * will find the permissions already granted in the cache entry
         * and won't audit anything at all, leading to silent denials in
         * permissive mode that only appear when in enforcing mode.
         *
         * See the corresponding handling of MAY_NOT_BLOCK in avc_audit()
         * and selinux_inode_permission().
         */
        if (flags & AVC_NONBLOCKING)
                return 0;

        /* Allocate the replacement before taking the slot lock. */
        node = avc_alloc_node(avc);
        if (!node) {
                rc = -ENOMEM;
                goto out;
        }

        /* Lock the target slot */
        hvalue = avc_hash(ssid, tsid, tclass);

        head = &avc->avc_cache.slots[hvalue];
        lock = &avc->avc_cache.slots_lock[hvalue];

        spin_lock_irqsave(lock, flag);

        /* Only an entry from the same decision (matching seqno) qualifies. */
        hlist_for_each_entry(pos, head, list) {
                if (ssid == pos->ae.ssid &&
                    tsid == pos->ae.tsid &&
                    tclass == pos->ae.tclass &&
                    seqno == pos->ae.avd.seqno){
                        orig = pos;
                        break;
                }
        }

        if (!orig) {
                rc = -ENOENT;
                avc_node_kill(avc, node);
                goto out_unlock;
        }

        /*
         * Copy and replace original node.
         */

        avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);

        if (orig->ae.xp_node) {
                rc = avc_xperms_populate(node, orig->ae.xp_node);
                if (rc) {
                        avc_node_kill(avc, node);
                        goto out_unlock;
                }
        }

        switch (event) {
        case AVC_CALLBACK_GRANT:
                node->ae.avd.allowed |= perms;
                if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                        avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
                break;
        case AVC_CALLBACK_TRY_REVOKE:
        case AVC_CALLBACK_REVOKE:
                node->ae.avd.allowed &= ~perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_ENABLE:
                node->ae.avd.auditallow |= perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_DISABLE:
                node->ae.avd.auditallow &= ~perms;
                break;
        case AVC_CALLBACK_AUDITDENY_ENABLE:
                node->ae.avd.auditdeny |= perms;
                break;
        case AVC_CALLBACK_AUDITDENY_DISABLE:
                node->ae.avd.auditdeny &= ~perms;
                break;
        case AVC_CALLBACK_ADD_XPERMS:
                /*
                 * The decision copy can fail to allocate.  In that case
                 * drop the unlinked replacement and keep the original
                 * entry, instead of installing a node that lacks the
                 * decision while reporting success.
                 */
                rc = avc_add_xperms_decision(node, xpd);
                if (rc) {
                        avc_node_kill(avc, node);
                        goto out_unlock;
                }
                break;
        }
        avc_node_replace(avc, node, orig);
out_unlock:
        spin_unlock_irqrestore(lock, flag);
out:
        return rc;
}

/**
 * avc_flush - Flush the cache
 */
static void avc_flush(struct selinux_avc *avc)
{
        struct hlist_head *head;
        struct avc_node *node;
        spinlock_t *lock;
        unsigned long flag;
        int i;

        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &avc->avc_cache.slots[i];
                lock = &avc->avc_cache.slots_lock[i];

                spin_lock_irqsave(lock, flag);
                /*
                 * With preemptable RCU, the outer spinlock does not
                 * prevent RCU grace periods from ending.
                 */
                rcu_read_lock();
                hlist_for_each_entry(node, head, list)
                        avc_node_delete(avc, node);
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flag);
        }
}

/**
 * avc_ss_reset - Flush the cache and revalidate migrated permissions.
 * @avc: the access vector cache to reset
 * @seqno: policy sequence number
 *
 * Flushes the cache, invokes every registered callback that subscribed to
 * AVC_CALLBACK_RESET, then records @seqno as the latest notification.
 *
 * All callbacks are run even if one fails; the return value is the first
 * non-zero callback result, or 0 if none failed.
 */
int avc_ss_reset(struct selinux_avc *avc, u32 seqno)
{
        struct avc_callback_node *cb;
        int first_err = 0;
        int cb_rc;

        avc_flush(avc);

        for (cb = avc_callbacks; cb; cb = cb->next) {
                if (!(cb->events & AVC_CALLBACK_RESET))
                        continue;

                cb_rc = cb->callback(AVC_CALLBACK_RESET);
                /* remember only the first error, but keep going */
                if (!first_err)
                        first_err = cb_rc;
        }

        avc_latest_notif_update(avc, seqno, 0);
        return first_err;
}

/*
 * Slow-path helper function for avc_has_perm_noaudit,
 * when the avc_node lookup fails. We get called with
 * the RCU read lock held, and need to return with it
 * still held, but drop it for the security compute.
 *
 * The computed decision is written to @avd and @xp_node and then
 * handed to avc_insert(), whose result is returned.
 *
 * Don't inline this, since it's the slow-path and just
 * results in a bigger stack frame.
 */
static noinline
struct avc_node *avc_compute_av(struct selinux_state *state,
                                u32 ssid, u32 tsid,
                                u16 tclass, struct av_decision *avd,
                                struct avc_xperms_node *xp_node)
{
        /* Leave the RCU read-side section for the security server call. */
        rcu_read_unlock();
        /* Start with an empty list of extended permission decisions. */
        INIT_LIST_HEAD(&xp_node->xpd_head);
        security_compute_av(state, ssid, tsid, tclass, avd, &xp_node->xp);
        /* Re-enter the read-side section: the caller expects it held. */
        rcu_read_lock();
        return avc_insert(state->avc, ssid, tsid, tclass, avd, xp_node);
}

/*
 * Handle a request for which some permission is missing from @avd.
 *
 * Returns -EACCES when the caller passed AVC_STRICT, or when enforcing
 * is enabled and the decision is not flagged AVD_FLAGS_PERMISSIVE.
 * Otherwise the requested permissions are recorded through
 * avc_update_node(AVC_CALLBACK_GRANT) and 0 is returned.
 */
static noinline int avc_denied(struct selinux_state *state,
                               u32 ssid, u32 tsid,
                               u16 tclass, u32 requested,
                               u8 driver, u8 xperm, unsigned int flags,
                               struct av_decision *avd)
{
        if ((flags & AVC_STRICT) ||
            (enforcing_enabled(state) &&
             !(avd->flags & AVD_FLAGS_PERMISSIVE)))
                return -EACCES;

        /* Not enforced: grant the request in the cache. */
        avc_update_node(state->avc, AVC_CALLBACK_GRANT, requested, driver,
                        xperm, ssid, tsid, tclass, avd->seqno, NULL, flags);
        return 0;
}

/*
 * The avc extended permissions logic adds an additional 256 bits of
 * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions is not adequate,
 * as-is the case with ioctls, then multiple may be chained together and the
 * driver field is used to specify which set contains the permission.
 *
 * Checks @requested for (@ssid, @tsid, @tclass) and, when extended
 * permissions are defined for the node, additionally checks @xperm within
 * @driver.  The outcome is audited through avc_xperms_audit() using @ad.
 *
 * Returns the result of avc_xperms_audit() when that is non-zero,
 * otherwise 0 on success or the error from avc_denied().  A zero
 * @requested triggers a WARN_ON and returns -EACCES.
 */
int avc_has_extended_perms(struct selinux_state *state,
                           u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 xperm, struct common_audit_data *ad)
{
        struct avc_node *node;
        struct av_decision avd;
        u32 denied;
        struct extended_perms_decision local_xpd;
        struct extended_perms_decision *xpd = NULL;
        struct extended_perms_data allowed;
        struct extended_perms_data auditallow;
        struct extended_perms_data dontaudit;
        struct avc_xperms_node local_xp_node;
        struct avc_xperms_node *xp_node;
        int rc = 0, rc2;

        /* On a cache miss the decision is computed into stack storage. */
        xp_node = &local_xp_node;
        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        node = avc_lookup(state->avc, ssid, tsid, tclass);
        if (unlikely(!node)) {
                node = avc_compute_av(state, ssid, tsid, tclass, &avd, xp_node);
        } else {
                /* Cache hit: work on a private copy of the decision. */
                memcpy(&avd, &node->ae.avd, sizeof(avd));
                xp_node = node->ae.xp_node;
        }
        /* if extended permissions are not defined, only consider av_decision */
        if (!xp_node || !xp_node->xp.len)
                goto decision;

        /* Back the local decision with stack-allocated permission sets. */
        local_xpd.allowed = &allowed;
        local_xpd.auditallow = &auditallow;
        local_xpd.dontaudit = &dontaudit;

        xpd = avc_xperms_decision_lookup(driver, xp_node);
        if (unlikely(!xpd)) {
                /*
                 * Compute the extended_perms_decision only if the driver
                 * is flagged
                 */
                if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
                        avd.allowed &= ~requested;
                        goto decision;
                }
                /* The RCU read lock is dropped around the compute call. */
                rcu_read_unlock();
                security_compute_xperms_decision(state, ssid, tsid, tclass,
                                                 driver, &local_xpd);
                rcu_read_lock();
                /* Add the newly computed decision to the cached node. */
                avc_update_node(state->avc, AVC_CALLBACK_ADD_XPERMS, requested,
                                driver, xperm, ssid, tsid, tclass, avd.seqno,
                                &local_xpd, 0);
        } else {
                avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
        }
        /* From here on only the local copy is consulted. */
        xpd = &local_xpd;

        if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                avd.allowed &= ~requested;

decision:
        denied = requested & ~(avd.allowed);
        if (unlikely(denied))
                rc = avc_denied(state, ssid, tsid, tclass, requested,
                                driver, xperm, AVC_EXTENDED_PERMS, &avd);

        rcu_read_unlock();

        /* xpd is still NULL here when a "goto decision" was taken before the lookup. */
        rc2 = avc_xperms_audit(state, ssid, tsid, tclass, requested,
                        &avd, xpd, xperm, rc, ad);
        if (rc2)
                return rc2;
        return rc;
}

/**
 * avc_has_perm_noaudit - Check permissions but perform no auditing.
 * @state: SELinux state providing the AVC to consult
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @flags:  AVC_STRICT, AVC_NONBLOCKING, or 0
 * @avd: access vector decisions
 *
 * Look up the (@ssid, @tsid, @tclass) triple in the AVC and test whether
 * the @requested permissions are granted.  On a cache miss the security
 * server is asked for a new decision, which is added to the cache.  A
 * copy of the decisions is returned in @avd.
 *
 * Return %0 if all @requested permissions are granted, -%EACCES if any
 * permissions are denied, or another -errno upon other errors.
 *
 * Normally reached through avc_has_perm(), but may be called directly to
 * separate permission checking from auditing, e.g. in cases where a lock
 * must be held for the check but should be released for the auditing.
 */
inline int avc_has_perm_noaudit(struct selinux_state *state,
                                u32 ssid, u32 tsid,
                                u16 tclass, u32 requested,
                                unsigned int flags,
                                struct av_decision *avd)
{
        struct avc_xperms_node xp_node;
        struct avc_node *node;
        int rc = 0;

        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        node = avc_lookup(state->avc, ssid, tsid, tclass);
        if (unlikely(!node)) {
                /* Miss: compute the decision straight into @avd. */
                node = avc_compute_av(state, ssid, tsid, tclass, avd, &xp_node);
        } else {
                /* Hit: hand the caller a copy of the cached decision. */
                memcpy(avd, &node->ae.avd, sizeof(*avd));
        }

        if (unlikely(requested & ~(avd->allowed)))
                rc = avc_denied(state, ssid, tsid, tclass, requested, 0, 0,
                                flags, avd);

        rcu_read_unlock();
        return rc;
}

/**
 * avc_has_perm - Check permissions and perform any appropriate auditing.
 * @state: SELinux state providing the AVC to consult
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @auditdata: auxiliary audit data
 *
 * Run the permission check through avc_has_perm_noaudit() with no flags
 * and then audit the granting or denial via avc_audit().
 *
 * Return the avc_audit() result when it is non-zero.  Otherwise return
 * %0 if all @requested permissions are granted, -%EACCES if any
 * permissions are denied, or another -errno upon other errors.
 */
int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass,
                 u32 requested, struct common_audit_data *auditdata)
{
        struct av_decision avd;
        int check_rc, audit_rc;

        check_rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested,
                                        0, &avd);
        audit_rc = avc_audit(state, ssid, tsid, tclass, requested, &avd,
                             check_rc, auditdata, 0);

        /* An audit failure takes precedence over the check result. */
        return audit_rc ? audit_rc : check_rc;
}

/*
 * Variant of avc_has_perm() that takes caller @flags.  When
 * MAY_NOT_BLOCK is set in @flags the check is made with AVC_NONBLOCKING;
 * @flags is also forwarded unchanged to avc_audit().
 *
 * Returns the avc_audit() result when it is non-zero, otherwise the
 * result of the permission check.
 */
int avc_has_perm_flags(struct selinux_state *state,
                       u32 ssid, u32 tsid, u16 tclass, u32 requested,
                       struct common_audit_data *auditdata,
                       int flags)
{
        struct av_decision avd;
        unsigned int avc_flags = 0;
        int check_rc, audit_rc;

        if (flags & MAY_NOT_BLOCK)
                avc_flags = AVC_NONBLOCKING;

        check_rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested,
                                        avc_flags, &avd);
        audit_rc = avc_audit(state, ssid, tsid, tclass, requested, &avd,
                             check_rc, auditdata, flags);

        return audit_rc ? audit_rc : check_rc;
}

u32 avc_policy_seqno(struct selinux_state *state)
{
        return state->avc->avc_cache.latest_notif;
}

/*
 * Flush the global AVC when the node slab cache exists.
 *
 * The slab cache itself (avc_node_cachep) is deliberately NOT destroyed.
 * It might be easy to fix, but the memory barrier semantics are not well
 * enough understood to be sure: some other task may have dereferenced
 * security_ops while it still pointed to the selinux operations, in which
 * case it may be about to use the avc and need avc_node_cachep.  Wrapping
 * the security.c security_ops call in an rcu_lock would address that, but
 * is not considered worth it.  Flushing the cache gets the memory back.
 */
void avc_disable(void)
{
        if (!avc_node_cachep)
                return;

        avc_flush(selinux_state.avc);
        /* Intentionally no kmem_cache_destroy(avc_node_cachep), see above. */
}




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement CPU time clocks for the POSIX clock interface.
 */

#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>

#include "posix-timers.h"

static void posix_cpu_timer_rearm(struct k_itimer *timer);

/*
 * Initialize @pct.  Unless @cpu_limit is RLIM_INFINITY, the limit is
 * scaled by NSEC_PER_SEC, stored as the next CPUCLOCK_PROF expiry and the
 * timers are marked active.
 */
void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
{
        posix_cputimers_init(pct);

        if (cpu_limit == RLIM_INFINITY)
                return;

        pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
        pct->timers_active = true;
}

/*
 * Called after updating RLIMIT_CPU to run cpu timer and update
 * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
 * necessary. Needs siglock protection since other code may update the
 * expiration cache as well.
 */
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
        u64 nsecs = rlim_new * NSEC_PER_SEC;

        spin_lock_irq(&task->sighand->siglock);
        set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
        spin_unlock_irq(&task->sighand->siglock);
}

/*
 * Functions for validating access to tasks.
 */

/*
 * Resolve the pid encoded in @clock.  Returns NULL when the clock type
 * is out of range, the pid cannot be found, or it does not designate a
 * suitable thread/process.  @gettime relaxes the process lookup as
 * described below.
 */
static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
{
        const bool per_thread = !!CPUCLOCK_PERTHREAD(clock);
        const pid_t nr = CPUCLOCK_PID(clock);
        struct pid *pid;

        if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
                return NULL;

        /*
         * An encoded PID of 0 targets current, or the process to which
         * current belongs.
         */
        if (!nr) {
                if (per_thread)
                        return task_pid(current);
                return task_tgid(current);
        }

        pid = find_vpid(nr);
        if (!pid)
                return NULL;

        if (per_thread) {
                struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);

                /* Per-thread clocks must stay within current's thread group. */
                if (!tsk || !same_thread_group(tsk, current))
                        return NULL;
                return pid;
        }

        /*
         * For clock_gettime(PROCESS) allow finding the process via the
         * pid of the current task.  The code needs the tgid of the
         * process so that pid_task(pid, PIDTYPE_TGID) can be used to
         * find the process.
         */
        if (gettime && pid == task_pid(current))
                return task_tgid(current);

        /*
         * For processes require that pid identifies a process.
         */
        if (!pid_has_task(pid, PIDTYPE_TGID))
                return NULL;
        return pid;
}

/*
 * Return 0 when pid_for_clock() resolves @clock to a pid, -EINVAL
 * otherwise.  The lookup is done under the RCU read lock.
 */
static inline int validate_clock_permissions(const clockid_t clock)
{
        int ret = 0;

        rcu_read_lock();
        if (!pid_for_clock(clock, false))
                ret = -EINVAL;
        rcu_read_unlock();

        return ret;
}

/* Per-thread clocks map to PIDTYPE_PID, all others to PIDTYPE_TGID. */
static inline enum pid_type clock_pid_type(const clockid_t clock)
{
        if (CPUCLOCK_PERTHREAD(clock))
                return PIDTYPE_PID;

        return PIDTYPE_TGID;
}

static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
{
        return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
}

/*
 * Update expiry time from increment, and increase overrun count,
 * given the current clock sample.
 *
 * Nothing is changed when the timer has no interval or when @now is
 * still before the expiry time; the current expiry is returned as is.
 * Otherwise the expiry is advanced in steps of interval * 2^i and
 * it_overrun is increased by 2^i for every step taken.
 *
 * Returns the (possibly updated) expiry time.
 */
static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
{
        u64 delta, incr, expires = timer->it.cpu.node.expires;
        int i;

        /* One-shot timer: nothing to bump. */
        if (!timer->it_interval)
                return expires;

        /* Not expired yet. */
        if (now < expires)
                return expires;

        incr = timer->it_interval;
        delta = now + incr - expires;

        /* Don't use (incr*2 < delta), incr*2 might overflow. */
        for (i = 0; incr < delta - incr; i++)
                incr = incr << 1;

        /* Walk back down from the largest step, halving it each round. */
        for (; i >= 0; incr >>= 1, i--) {
                if (delta < incr)
                        continue;

                timer->it.cpu.node.expires += incr;
                /* A step of interval << i accounts for 2^i overruns. */
                timer->it_overrun += 1LL << i;
                delta -= incr;
        }
        return timer->it.cpu.node.expires;
}

/*
 * True when every cached next-expiry value has all bits set, i.e. all
 * cache entries contain U64_MAX (eternal expiry time).
 */
static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
{
        /* AND-ing keeps all bits set only if every entry has them set. */
        return !~(pct->bases[CPUCLOCK_PROF].nextevt &
                  pct->bases[CPUCLOCK_VIRT].nextevt &
                  pct->bases[CPUCLOCK_SCHED].nextevt);
}

/*
 * Report the resolution of @which_clock in @tp.  Returns the error from
 * validate_clock_permissions() and leaves @tp untouched when validation
 * fails, 0 otherwise.
 */
static int
posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
        int error = validate_clock_permissions(which_clock);

        if (error)
                return error;

        tp->tv_sec = 0;
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
                /*
                 * If sched_clock is using a cycle counter, its true
                 * resolution is not exported anywhere, but it is much
                 * finer than 1s/HZ.
                 */
                tp->tv_nsec = 1;
        } else {
                /* One tick, rounded up to whole nanoseconds. */
                tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
        }
        return 0;
}

/*
 * A CPU clock can never be set.  Other errors in the call are still
 * reported first; a valid clock yields -EPERM.
 */
static int
posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
{
        int error = validate_clock_permissions(clock);

        if (error)
                return error;

        return -EPERM;
}

/*
 * Sample a per-thread clock for the given task. clkid is validated.
 *
 * CPUCLOCK_SCHED yields task_sched_runtime(), CPUCLOCK_PROF the sum of
 * user and system time, CPUCLOCK_VIRT the user time.  Any other value
 * warns once and yields 0.
 */
static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
{
        u64 utime, stime;

        if (clkid == CPUCLOCK_SCHED)
                return task_sched_runtime(p);

        task_cputime(p, &utime, &stime);

        if (clkid == CPUCLOCK_PROF)
                return utime + stime;
        if (clkid == CPUCLOCK_VIRT)
                return utime;

        WARN_ON_ONCE(1);
        return 0;
}

/*
 * Fill @samples, indexed by clock type: PROF gets stime + utime, VIRT
 * gets utime and SCHED gets rtime.
 */
static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
{
        const u64 total = stime + utime;

        samples[CPUCLOCK_PROF] = total;
        samples[CPUCLOCK_VIRT] = utime;
        samples[CPUCLOCK_SCHED] = rtime;
}

/*
 * Sample all clocks of task @p into @samples, using task_cputime() for
 * user/system time and se.sum_exec_runtime for the runtime.
 */
static void task_sample_cputime(struct task_struct *p, u64 *samples)
{
        u64 user, sys;

        task_cputime(p, &user, &sys);
        store_samples(samples, sys, user, p->se.sum_exec_runtime);
}

/* Read the three atomic counters of @at and store them in @samples. */
static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
                                       u64 *samples)
{
        u64 user = atomic64_read(&at->utime);
        u64 sys = atomic64_read(&at->stime);
        u64 runtime = atomic64_read(&at->sum_exec_runtime);

        store_samples(samples, sys, user, runtime);
}

/*
 * Raise *cputime to sum_cputime if sum_cputime is greater.  The update
 * is done with cmpxchg and retried whenever a concurrent update changed
 * the value in between, to avoid races with other writers.
 */
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
        u64 seen;

        do {
                seen = atomic64_read(cputime);
                /* Already at least as large: nothing to do. */
                if (sum_cputime <= seen)
                        return;
        } while (atomic64_cmpxchg(cputime, seen, sum_cputime) != seen);
}

static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
                              struct task_cputime *sum)
{
        __update_gt_cputime(&cputime_atomic->utime, sum->utime);
        __update_gt_cputime(&cputime_atomic->stime, sum->stime);
        __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}

/**
 * thread_group_sample_cputime - Sample cputime for a given task
 * @tsk:        Task whose thread group cputime is sampled
 * @samples:        Storage for time samples
 *
 * Called from sys_getitimer() to calculate the expiry time of an active
 * timer. That means group cputime accounting is already active. Called
 * with task sighand lock held.
 *
 * Warns once if the accounting is not marked active.
 *
 * Updates @samples with an uptodate sample of the thread group cputimes.
 */
void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
{
        WARN_ON_ONCE(!tsk->signal->posix_cputimers.timers_active);

        proc_sample_cputime_atomic(&tsk->signal->cputimer.cputime_atomic,
                                   samples);
}

/**
 * thread_group_start_cputime - Start cputime and return a sample
 * @tsk:        Task for which cputime needs to be started
 * @samples:        Storage for time samples
 *
 * The thread group cputime accounting is avoided when there are no posix
 * CPU timers armed. Before starting a timer it's required to check whether
 * the time accounting is active. If not, a full update of the atomic
 * accounting store needs to be done and the accounting enabled.
 *
 * Updates @samples with an uptodate sample of the thread group cputimes.
 */
static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
{
        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
        struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

        /* Check if cputimer isn't running. This is accessed without locking. */
        if (!READ_ONCE(pct->timers_active)) {
                struct task_cputime sum;

                /*
                 * The POSIX timer interface allows for absolute time expiry
                 * values through the TIMER_ABSTIME flag, therefore we have
                 * to synchronize the timer to the clock every time we start it.
                 */
                thread_group_cputime(tsk, &sum);
                update_gt_cputime(&cputimer->cputime_atomic, &sum);

                /*
                 * We're setting timers_active without a lock. Ensure this
                 * only gets written to in one operation. We set it after
                 * update_gt_cputime() as a small optimization, but
                 * barriers are not required because update_gt_cputime()
                 * can handle concurrent updates.
                 */
                WRITE_ONCE(pct->timers_active, true);
        }
        /* Sample from the atomic store, which is up to date at this point. */
        proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}

/*
 * Do a full thread_group_cputime() update for @tsk and store the result
 * in @samples.
 */
static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
{
        struct task_cputime totals;

        thread_group_cputime(tsk, &totals);
        store_samples(samples, totals.stime, totals.utime,
                      totals.sum_exec_runtime);
}

/*
 * Sample a process (thread group) clock for the given task clkid. If the
 * group's cputime accounting is already enabled, read the atomic
 * store. Otherwise either start the accounting (@start set) or do a full
 * update without enabling it.  clkid is already validated.
 */
static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
                                  bool start)
{
        struct thread_group_cputimer *group_timer = &p->signal->cputimer;
        struct posix_cputimers *timers = &p->signal->posix_cputimers;
        u64 samples[CPUCLOCK_MAX];

        if (READ_ONCE(timers->timers_active))
                proc_sample_cputime_atomic(&group_timer->cputime_atomic,
                                           samples);
        else if (start)
                thread_group_start_cputime(p, samples);
        else
                __thread_group_cputime(p, samples);

        return samples[clkid];
}

/*
 * Read the CPU clock @clock into @tp.  Returns -EINVAL when no task can
 * be found for the clock, 0 otherwise.  The task lookup and the sampling
 * both happen under the RCU read lock.
 */
static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
{
        const clockid_t clkid = CPUCLOCK_WHICH(clock);
        struct task_struct *tsk;
        int ret = -EINVAL;
        u64 t = 0;

        rcu_read_lock();
        tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
        if (tsk) {
                if (CPUCLOCK_PERTHREAD(clock))
                        t = cpu_clock_sample(clkid, tsk);
                else
                        t = cpu_clock_sample_group(clkid, tsk, false);
                ret = 0;
        }
        rcu_read_unlock();

        /* @tp is only written on success. */
        if (!ret)
                *tp = ns_to_timespec64(t);
        return ret;
}

/*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
 * new timer already all-zeros initialized.
 *
 * Returns -EINVAL when the clock does not resolve to a pid, 0 otherwise.
 * On success the timer holds a reference on that pid.
 */
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
        static struct lock_class_key posix_cpu_timers_key;
        struct pid *pid;

        rcu_read_lock();
        pid = pid_for_clock(new_timer->it_clock, false);
        if (!pid) {
                rcu_read_unlock();
                return -EINVAL;
        }

        /*
         * If posix timer expiry is handled in task work context then
         * timer::it_lock can be taken without disabling interrupts as all
         * other locking happens in task context. This requires a separate
         * lock class key otherwise regular posix timer expiry would record
         * the lock class being taken in interrupt context and generate a
         * false positive warning.
         */
        if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
                lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);

        new_timer->kclock = &clock_posix_cpu;
        timerqueue_init(&new_timer->it.cpu.node);
        /* Take the pid reference while still under the RCU read lock. */
        new_timer->it.cpu.pid = get_pid(pid);
        rcu_read_unlock();
        return 0;
}

/*
 * Clean up a CPU-clock timer that is about to be destroyed.
 * This is called from timer deletion with the timer already locked.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again.  (This happens when the timer is in the middle of firing.)
 *
 * Returns 0 when the timer was cleaned up, in which case the timer's pid
 * reference is dropped, or TIMER_RETRY as described above.
 */
static int posix_cpu_timer_del(struct k_itimer *timer)
{
        struct cpu_timer *ctmr = &timer->it.cpu;
        struct sighand_struct *sighand;
        struct task_struct *p;
        unsigned long flags;
        int ret = 0;

        rcu_read_lock();
        p = cpu_timer_task_rcu(timer);
        /* No task behind the pid any more: only the pid reference is left to drop. */
        if (!p)
                goto out;

        /*
         * Protect against sighand release/switch in exit/exec and process/
         * thread timer list entry concurrent read/writes.
         */
        sighand = lock_task_sighand(p, &flags);
        if (unlikely(sighand == NULL)) {
                /*
                 * This raced with the reaping of the task. The exit cleanup
                 * should have removed this timer from the timer queue.
                 */
                WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
        } else {
                /* A firing timer cannot be dequeued here; ask the caller to retry. */
                if (timer->it.cpu.firing)
                        ret = TIMER_RETRY;
                else
                        cpu_timer_dequeue(ctmr);

                unlock_task_sighand(p, &flags);
        }

out:
        rcu_read_unlock();
        /* Keep the pid reference when the caller has to retry. */
        if (!ret)
                put_pid(ctmr->pid);

        return ret;
}

static void cleanup_timerqueue(struct timerqueue_head *head)
{
        struct timerqueue_node *node;

        /* Drain the queue front to back, detaching each timer from it. */
        for (node = timerqueue_getnext(head); node;
             node = timerqueue_getnext(head)) {
                struct cpu_timer *ctmr;

                timerqueue_del(head, node);
                ctmr = container_of(node, struct cpu_timer, node);
                ctmr->head = NULL;
        }
}

/*
 * Remove all CPU timers which are still queued on @pct. The timers are
 * only taken off their queues; nothing else about them is updated.
 *
 * This must be called with the siglock held.
 */
static void cleanup_timers(struct posix_cputimers *pct)
{
        struct posix_cputimer_base *bases = pct->bases;

        cleanup_timerqueue(&bases[CPUCLOCK_PROF].tqhead);
        cleanup_timerqueue(&bases[CPUCLOCK_VIRT].tqhead);
        cleanup_timerqueue(&bases[CPUCLOCK_SCHED].tqhead);
}

/*
 * These are both called with the siglock held, when the current thread
 * is being reaped.  When the final (leader) thread in the group is reaped,
 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
 */
void posix_cpu_timers_exit(struct task_struct *tsk)
{
        /* Per-thread timer container of the exiting task */
        struct posix_cputimers *pct = &tsk->posix_cputimers;

        cleanup_timers(pct);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
        /* Timer container shared through the task's signal struct */
        struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

        cleanup_timers(pct);
}

/*
 * Insert the timer on the appropriate list before any timers that
 * expire later.  This must be called with the sighand lock held.
 */
static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
        int clkidx = CPUCLOCK_WHICH(timer->it_clock);
        struct cpu_timer *ctmr = &timer->it.cpu;
        u64 newexp = cpu_timer_getexpires(ctmr);
        struct posix_cputimer_base *base;

        if (CPUCLOCK_PERTHREAD(timer->it_clock))
                base = p->posix_cputimers.bases + clkidx;
        else
                base = p->signal->posix_cputimers.bases + clkidx;

        if (!cpu_timer_enqueue(&base->tqhead, ctmr))
                return;

        /*
         * We are the new earliest-expiring POSIX 1.b timer, hence
         * need to update expiration cache. Take into account that
         * for process timers we share expiration cache with itimers
         * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
         */
        if (newexp < base->nextevt)
                base->nextevt = newexp;

        if (CPUCLOCK_PERTHREAD(timer->it_clock))
                tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
        else
                tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
}

/*
 * Fire a locked timer and arrange for its reload where applicable.
 */
static void cpu_timer_fire(struct k_itimer *timer)
{
        struct cpu_timer *ctmr = &timer->it.cpu;

        /* The user does not want any signal: just disarm. */
        if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
                cpu_timer_setexpires(ctmr, 0);
                return;
        }

        /*
         * No sigqueue: this is the special case for clock_nanosleep,
         * not a normal timer from sys_timer_create. Wake the sleeper.
         */
        if (unlikely(timer->sigq == NULL)) {
                wake_up_process(timer->it_process);
                cpu_timer_setexpires(ctmr, 0);
                return;
        }

        /* One-shot timer: clear it as soon as it has fired. */
        if (!timer->it_interval) {
                posix_timer_event(timer, 0);
                cpu_timer_setexpires(ctmr, 0);
                return;
        }

        /*
         * Interval timer. A non-zero return means the signal did not get
         * queued because it was ignored, so no callback will reload the
         * timer. Keep it ticking anyway in case the signal is deliverable
         * next time.
         */
        if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
                posix_cpu_timer_rearm(timer);
                ++timer->it_requeue_pending;
        }
}

/*
 * Guts of sys_timer_settime for CPU timers.
 *
 * @timer:        the timer to (re)program
 * @timer_flags:  if TIMER_ABSTIME is set, @new->it_value is taken as an
 *                absolute expiry time, otherwise as relative to now
 * @new:          the new expiry value and interval
 * @old:          optional; when non-NULL it receives the previous setting
 *
 * This is called with the timer locked and interrupts disabled.
 *
 * Returns 0 on success, -ESRCH when the target task cannot be found or its
 * sighand cannot be locked, or TIMER_RETRY when the timer is in the middle
 * of firing. On TIMER_RETRY it's necessary to release the timer's lock
 * and try again.
 */
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
                               struct itimerspec64 *new, struct itimerspec64 *old)
{
        clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
        u64 old_expires, new_expires, old_incr, val;
        struct cpu_timer *ctmr = &timer->it.cpu;
        struct sighand_struct *sighand;
        struct task_struct *p;
        unsigned long flags;
        int ret = 0;

        rcu_read_lock();
        p = cpu_timer_task_rcu(timer);
        if (!p) {
                /*
                 * If p has just been reaped, we can no
                 * longer get any information about it at all.
                 */
                rcu_read_unlock();
                return -ESRCH;
        }

        /*
         * Use the to_ktime conversion because that clamps the maximum
         * value to KTIME_MAX and avoids multiplication overflows.
         */
        new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));

        /*
         * Protect against sighand release/switch in exit/exec and against
         * concurrent read/write of the thread and process wide timer
         * queues in arm_timer().
         */
        sighand = lock_task_sighand(p, &flags);
        /*
         * If p has just been reaped, we can no
         * longer get any information about it at all.
         */
        if (unlikely(sighand == NULL)) {
                rcu_read_unlock();
                return -ESRCH;
        }

        /*
         * Disarm any old timer after extracting its expiry time.
         */
        old_incr = timer->it_interval;
        old_expires = cpu_timer_getexpires(ctmr);

        if (unlikely(timer->it.cpu.firing)) {
                /*
                 * The timer is on an expiry list right now. Mark it with
                 * -1 so handle_posix_cpu_timers() does not generate an
                 * event for it, and make the caller retry.
                 */
                timer->it.cpu.firing = -1;
                ret = TIMER_RETRY;
        } else {
                cpu_timer_dequeue(ctmr);
        }

        /*
         * We need to sample the current value to convert the new
         * value from relative to absolute, and to convert the
         * old value from absolute to relative.  To set a process
         * timer, we need a sample to balance the thread expiry
         * times (in arm_timer).  With an absolute time, we must
         * check if it's already passed.  In short, we need a sample.
         */
        if (CPUCLOCK_PERTHREAD(timer->it_clock))
                val = cpu_clock_sample(clkid, p);
        else
                val = cpu_clock_sample_group(clkid, p, true);

        if (old) {
                if (old_expires == 0) {
                        /* The timer was not armed: report a zero value. */
                        old->it_value.tv_sec = 0;
                        old->it_value.tv_nsec = 0;
                } else {
                        /*
                         * Update the timer in case it has overrun already.
                         * If it has, we'll report it as having overrun and
                         * with the next reloaded timer already ticking,
                         * though we are swallowing that pending
                         * notification here to install the new setting.
                         */
                        u64 exp = bump_cpu_timer(timer, val);

                        if (val < exp) {
                                old_expires = exp - val;
                                old->it_value = ns_to_timespec64(old_expires);
                        } else {
                                /* Already due: report 1ns remaining. */
                                old->it_value.tv_nsec = 1;
                                old->it_value.tv_sec = 0;
                        }
                }
        }

        if (unlikely(ret)) {
                /*
                 * We are colliding with the timer actually firing.
                 * Punt after filling in the timer's old value, and
                 * disable this firing since we are already reporting
                 * it as an overrun (thanks to bump_cpu_timer above).
                 */
                unlock_task_sighand(p, &flags);
                goto out;
        }

        /* A relative, non-zero request becomes absolute via the sample. */
        if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
                new_expires += val;
        }

        /*
         * Install the new expiry time (or zero).
         * The timer is only queued when the new expiry lies in the future;
         * an already passed expiry is handled by cpu_timer_fire() below.
         */
        cpu_timer_setexpires(ctmr, new_expires);
        if (new_expires != 0 && val < new_expires) {
                arm_timer(timer, p);
        }

        unlock_task_sighand(p, &flags);
        /*
         * Install the new reload setting, and
         * set up the signal and overrun bookkeeping.
         */
        timer->it_interval = timespec64_to_ktime(new->it_interval);

        /*
         * This acts as a modification timestamp for the timer,
         * so any automatic reload attempt will punt on seeing
         * that we have reset the timer manually.
         */
        timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
                ~REQUEUE_PENDING;
        timer->it_overrun_last = 0;
        timer->it_overrun = -1;

        if (new_expires != 0 && !(val < new_expires)) {
                /*
                 * The designated time already passed, so we notify
                 * immediately, even if the thread never runs to
                 * accumulate more time on this clock.
                 */
                cpu_timer_fire(timer);
        }

        ret = 0;
 out:
        rcu_read_unlock();
        /* The old interval is reported on the TIMER_RETRY path as well. */
        if (old)
                old->it_interval = ns_to_timespec64(old_incr);

        return ret;
}

static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
        struct cpu_timer *ctmr = &timer->it.cpu;
        clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
        u64 expires = cpu_timer_getexpires(ctmr);
        struct task_struct *p;
        u64 now;

        rcu_read_lock();
        p = cpu_timer_task_rcu(timer);
        /* Without a task @itp is left exactly as the caller passed it. */
        if (!p)
                goto unlock;

        /* The reload interval needs nothing but a conversion. */
        itp->it_interval = ktime_to_timespec64(timer->it_interval);

        /* A disarmed timer has no remaining time to report. */
        if (expires == 0)
                goto unlock;

        /* The remaining time is the expiry minus the current clock sample. */
        if (!CPUCLOCK_PERTHREAD(timer->it_clock))
                now = cpu_clock_sample_group(clkid, p, false);
        else
                now = cpu_clock_sample(clkid, p);

        if (now >= expires) {
                /*
                 * The expiry time has been reached, but the timer did not
                 * fire yet. Report it as just about to expire.
                 */
                itp->it_value.tv_sec = 0;
                itp->it_value.tv_nsec = 1;
        } else {
                itp->it_value = ns_to_timespec64(expires - now);
        }
unlock:
        rcu_read_unlock();
}

#define MAX_COLLECTED        20

static u64 collect_timerqueue(struct timerqueue_head *head,
                              struct list_head *firing, u64 now)
{
        struct timerqueue_node *node;
        int seen;

        /*
         * Walk the queue from its earliest entry. Every expired timer is
         * moved to the @firing list. The walk stops at the first timer
         * which has not expired yet, or at the MAX_COLLECTED-th entry
         * looked at, and returns that timer's expiry time.
         */
        for (seen = 1; (node = timerqueue_getnext(head)) != NULL; seen++) {
                struct cpu_timer *ctmr = container_of(node, struct cpu_timer, node);
                u64 expires = cpu_timer_getexpires(ctmr);

                /* Limit the number of timers to expire at once */
                if (seen == MAX_COLLECTED || now < expires)
                        return expires;

                ctmr->firing = 1;
                /* See posix_cpu_timer_wait_running() */
                rcu_assign_pointer(ctmr->handling, current);
                cpu_timer_dequeue(ctmr);
                list_add_tail(&ctmr->elist, firing);
        }

        /* The queue is empty */
        return U64_MAX;
}

static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
                                    struct list_head *firing)
{
        int clk;

        /* Collect per clock and cache the next expiry of each base */
        for (clk = 0; clk < CPUCLOCK_MAX; clk++) {
                struct posix_cputimer_base *base = &pct->bases[clk];

                base->nextevt = collect_timerqueue(&base->tqhead, firing,
                                                   samples[clk]);
        }
}

static inline void check_dl_overrun(struct task_struct *tsk)
{
        if (!tsk->dl.dl_overrun)
                return;

        /* Consume the overrun flag and notify with SIGXCPU */
        tsk->dl.dl_overrun = 0;
        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}

static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
{
        const bool exceeded = time >= limit;

        /*
         * Once @time has reached @limit, send @signo to current and,
         * if print_fatal_signals is set, log the event. The return
         * value tells the caller whether the limit was hit.
         */
        if (exceeded) {
                if (print_fatal_signals) {
                        pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
                                rt ? "RT" : "CPU", hard ? "hard" : "soft",
                                current->comm, task_pid_nr(current));
                }
                __group_send_sig_info(signo, SEND_SIG_PRIV, current);
        }

        return exceeded;
}

/*
 * Move the expired per-thread CPU timers of @tsk from the queues in
 * tsk->posix_cputimers onto the firing list and refresh the per-thread
 * expiry cache. Deadline overrun and RLIMIT_RTTIME are handled here, too.
 */
static void check_thread_timers(struct task_struct *tsk,
                                struct list_head *firing)
{
        struct posix_cputimers *pct = &tsk->posix_cputimers;
        u64 samples[CPUCLOCK_MAX];
        unsigned long soft_limit;

        if (dl_task(tsk))
                check_dl_overrun(tsk);

        if (expiry_cache_is_inactive(pct))
                return;

        task_sample_cputime(tsk, samples);
        collect_posix_cputimers(pct, samples, firing);

        /* RLIMIT_RTTIME is the special case among the thread timers. */
        soft_limit = task_rlimit(tsk, RLIMIT_RTTIME);
        if (soft_limit != RLIM_INFINITY) {
                unsigned long hard_limit = task_rlimit_max(tsk, RLIMIT_RTTIME);
                /* Task RT timeout is accounted in jiffies. RTTIME is usec */
                unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);

                /* At the hard limit, send SIGKILL. No further action. */
                if (hard_limit != RLIM_INFINITY &&
                    check_rlimit(rttime, hard_limit, SIGKILL, true, true))
                        return;

                /*
                 * At the soft limit, send a SIGXCPU every second: push the
                 * soft limit out by one second each time it is hit.
                 */
                if (check_rlimit(rttime, soft_limit, SIGXCPU, true, false))
                        tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
                                soft_limit + USEC_PER_SEC;
        }

        /* Drop the tick dependency once nothing is cached any more */
        if (expiry_cache_is_inactive(pct))
                tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}

static inline void stop_process_timers(struct signal_struct *sig)
{
        /* The active flag is cleared without taking any lock. */
        WRITE_ONCE(sig->posix_cputimers.timers_active, false);
        tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}

static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
                             u64 *expires, u64 cur_time, int signo)
{
        /* An itimer with zero expiry is not armed */
        if (it->expires == 0)
                return;

        if (it->expires <= cur_time) {
                int which = (signo == SIGPROF) ? ITIMER_PROF : ITIMER_VIRTUAL;

                /* Reload by one interval, or disarm a one-shot itimer */
                if (it->incr)
                        it->expires += it->incr;
                else
                        it->expires = 0;

                trace_itimer_expire(which, task_tgid(tsk), cur_time);
                __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
        }

        /* Pull the caller's next expiry forward if this itimer is earlier */
        if (it->expires != 0 && *expires > it->expires)
                *expires = it->expires;
}

/*
 * Check for any process wide CPU timers that have fired and move them
 * off the queues in tsk->signal->posix_cputimers onto the firing list.
 * The process wide itimers and RLIMIT_CPU are checked here as well.
 *
 * handle_posix_cpu_timers() calls this right after check_thread_timers(),
 * with the sighand lock held.
 */
static void check_process_timers(struct task_struct *tsk,
                                 struct list_head *firing)
{
        struct signal_struct *const sig = tsk->signal;
        struct posix_cputimers *pct = &sig->posix_cputimers;
        u64 samples[CPUCLOCK_MAX];
        unsigned long soft;

        /*
         * If there are no active process wide timers (POSIX 1.b, itimers,
         * RLIMIT_CPU) nothing to check. Also skip the process wide timer
         * processing when there is already another task handling them.
         */
        if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
                return;

        /*
         * Signify that a thread is checking for process timers.
         * Write access to this field is protected by the sighand lock.
         */
        pct->expiry_active = true;

        /*
         * Collect the current process totals. Group accounting is active
         * so the sample can be taken directly.
         */
        proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
        collect_posix_cputimers(pct, samples, firing);

        /*
         * Check for the special case process timers.
         */
        check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
                         &pct->bases[CPUCLOCK_PROF].nextevt,
                         samples[CPUCLOCK_PROF], SIGPROF);
        check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
                         &pct->bases[CPUCLOCK_VIRT].nextevt,
                         samples[CPUCLOCK_VIRT], SIGVTALRM);

        soft = task_rlimit(tsk, RLIMIT_CPU);
        if (soft != RLIM_INFINITY) {
                /* RLIMIT_CPU is in seconds. Samples are nanoseconds */
                unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
                u64 ptime = samples[CPUCLOCK_PROF];
                u64 softns = (u64)soft * NSEC_PER_SEC;
                u64 hardns = (u64)hard * NSEC_PER_SEC;

                /*
                 * At the hard limit, send SIGKILL. No further action.
                 *
                 * NOTE(review): this return leaves pct->expiry_active set
                 * to true. Presumably that is harmless because the whole
                 * group has just been sent SIGKILL - confirm.
                 */
                if (hard != RLIM_INFINITY &&
                    check_rlimit(ptime, hardns, SIGKILL, false, true))
                        return;

                /*
                 * At the soft limit, send a SIGXCPU every second: the
                 * soft limit is pushed out by one second each time.
                 */
                if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
                        sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
                        softns += NSEC_PER_SEC;
                }

                /* Update the expiry cache */
                if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
                        pct->bases[CPUCLOCK_PROF].nextevt = softns;
        }

        /* Nothing left to expire: turn the process wide timers off */
        if (expiry_cache_is_inactive(pct))
                stop_process_timers(sig);

        pct->expiry_active = false;
}

/*
 * Reload the timer. This is called from the signal code (via
 * posixtimer_rearm) once the last timer signal has been delivered, and
 * from cpu_timer_fire() when the signal could not be queued.
 */
static void posix_cpu_timer_rearm(struct k_itimer *timer)
{
        clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
        struct task_struct *p;
        unsigned long flags;
        u64 now;

        rcu_read_lock();
        p = cpu_timer_task_rcu(timer);
        if (!p)
                goto unlock_rcu;

        /* The sighand lock protects the timer list r/w in arm_timer() */
        if (unlikely(!lock_task_sighand(p, &flags)))
                goto unlock_rcu;

        /* Take a fresh sample and move the expiry time past it. */
        if (!CPUCLOCK_PERTHREAD(timer->it_clock))
                now = cpu_clock_sample_group(clkid, p, true);
        else
                now = cpu_clock_sample(clkid, p);

        bump_cpu_timer(timer, now);

        /* Queue the timer again with its updated expiry time. */
        arm_timer(timer, p);
        unlock_task_sighand(p, &flags);
unlock_rcu:
        rcu_read_unlock();
}

/**
 * task_cputimers_expired - Check whether posix CPU timers are expired
 *
 * @samples:        Array of current samples for the CPUCLOCK clocks
 * @pct:        Pointer to a posix_cputimers container
 *
 * Returns true if any member of @samples is greater than the corresponding
 * member of @pct->bases[CLK].nextevt. False otherwise
 */
static inline bool
task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
{
        int i;

        for (i = 0; i < CPUCLOCK_MAX; i++) {
                if (samples[i] >= pct->bases[i].nextevt)
                        return true;
        }
        return false;
}

/**
 * fastpath_timer_check - POSIX CPU timers fast path.
 *
 * @tsk:        The task (thread) being checked.
 *
 * Check the task and thread group timers.  When the respective expiry
 * cache is active, snapshot the task and thread group times and compare
 * them with the corresponding expiration times.  Return true if a timer
 * has expired or a deadline overrun is pending for @tsk, else return
 * false.
 */
static inline bool fastpath_timer_check(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct posix_cputimers *thread_pct = &tsk->posix_cputimers;
        struct posix_cputimers *group_pct = &sig->posix_cputimers;
        u64 samples[CPUCLOCK_MAX];

        /* Per-thread timers first */
        if (!expiry_cache_is_inactive(thread_pct)) {
                task_sample_cputime(tsk, samples);
                if (task_cputimers_expired(samples, thread_pct))
                        return true;
        }

        /*
         * Thread group timers are only checked when they are active and
         * no other thread in the group is already handling their expiry.
         * Both fields are read without the sighand lock, which is fine:
         * this is just a fastpath heuristic to decide whether it is worth
         * trying to acquire the sighand lock to handle timer expiry.
         *
         * In the worst case, when timers_active is set or expiry_active
         * is cleared concurrently and the current thread does not see the
         * change yet, the timer checks are delayed until the next thread
         * in the group gets a scheduler interrupt to handle the timer.
         * This isn't an issue in practice because these types of delays
         * with signals actually getting sent are expected.
         */
        if (READ_ONCE(group_pct->timers_active) &&
            !READ_ONCE(group_pct->expiry_active)) {
                proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
                                           samples);

                if (task_cputimers_expired(samples, group_pct))
                        return true;
        }

        /* A pending deadline overrun has to be handled as well */
        return dl_task(tsk) && tsk->dl.dl_overrun;
}

static void handle_posix_cpu_timers(struct task_struct *tsk);

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
        struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);

        mutex_lock(&cw->mutex);
        handle_posix_cpu_timers(current);
        mutex_unlock(&cw->mutex);
}

/*
 * Invoked from the posix-timer core when a cancel operation failed because
 * the timer is marked firing. The caller holds rcu_read_lock(), which
 * protects the timer and the task which is expiring it from being freed.
 *
 * Waits for the task which is expiring @timr by acquiring and immediately
 * releasing that task's expiry mutex. RCU protection is dropped while
 * waiting and taken again before returning.
 */
static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
        /* Set by collect_timerqueue(), cleared by handle_posix_cpu_timers() */
        struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);

        /* Has the handling task completed expiry already? */
        if (!tsk)
                return;

        /* Ensure that the task cannot go away */
        get_task_struct(tsk);
        /* Now drop the RCU protection so the mutex can be locked */
        rcu_read_unlock();
        /* Wait on the expiry mutex */
        mutex_lock(&tsk->posix_cputimers_work.mutex);
        /* Release it immediately again. */
        mutex_unlock(&tsk->posix_cputimers_work.mutex);
        /* Drop the task reference. */
        put_task_struct(tsk);
        /* Relock RCU so the callsite is balanced */
        rcu_read_lock();
}

/*
 * Variant of posix_cpu_timer_wait_running() which is entered with
 * timr->it_lock held: the lock is dropped while waiting for the expiring
 * task and taken again before returning.
 */
static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
        /* Ensure that timr->it.cpu.handling task cannot go away */
        rcu_read_lock();
        spin_unlock_irq(&timr->it_lock);
        /* Expects rcu_read_lock() held; returns with it held again */
        posix_cpu_timer_wait_running(timr);
        rcu_read_unlock();
        /* @timr is on stack and is valid */
        spin_lock_irq(&timr->it_lock);
}

/*
 * Clear existing posix CPU timers task work.
 */
void clear_posix_cputimers_work(struct task_struct *p)
{
        /*
         * A copied work entry from the old task is not meaningful, clear it.
         * N.B. init_task_work will not do this.
         */
        memset(&p->posix_cputimers_work.work, 0,
               sizeof(p->posix_cputimers_work.work));
        init_task_work(&p->posix_cputimers_work.work,
                       posix_cpu_timers_work);
        mutex_init(&p->posix_cputimers_work.mutex);
        p->posix_cputimers_work.scheduled = false;
}

/*
 * Set up the posix CPU timers task work of the init task. This lives out
 * of line so the callback can stay static and to avoid header recursion
 * hell.
 */
void __init posix_cputimers_init_work(void)
{
        struct task_struct *tsk = current;

        clear_posix_cputimers_work(tsk);
}

/*
 * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
 * in hard interrupt context or in task context with interrupts
 * disabled. Aside of that the writer/reader interaction is always in the
 * context of the current task, which means they are strict per CPU.
 */
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
        return tsk->posix_cputimers_work.scheduled;
}

static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
        /* The work must not be pending already; warn and bail if it is. */
        if (!WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) {
                /* Schedule task work to actually expire the timers */
                tsk->posix_cputimers_work.scheduled = true;
                task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
        }
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
                                                unsigned long start)
{
        bool rerun;

        /*
         * On !RT kernels interrupts are disabled while the expired timers
         * are collected. No tick can happen, so the fast path check can
         * be reenabled without further checks.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                tsk->posix_cputimers_work.scheduled = false;
                return true;
        }

        /*
         * On RT enabled kernels ticks can happen while the expired timers
         * are collected under sighand lock. But any tick which observes
         * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
         * checks. So reenabling the tick work has to be done carefully:
         *
         * Disable interrupts and run the fast path check if jiffies have
         * advanced since the collecting of expired timers started. If
         * jiffies have not advanced or the fast path check did not find
         * newly expired timers, reenable the fast path check in the timer
         * interrupt. If there are newly expired timers, return false and
         * let the collection loop repeat.
         */
        local_irq_disable();
        rerun = start != jiffies && fastpath_timer_check(tsk);
        if (!rerun)
                tsk->posix_cputimers_work.scheduled = false;
        local_irq_enable();

        return !rerun;
}
#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
/*
 * Variant for builds without CONFIG_POSIX_CPU_TIMERS_TASK_WORK: expire the
 * timers directly in the calling context instead of deferring the expiry
 * to task work.
 */
static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
        /* The expiry is bracketed by the posixtimer lockdep annotations */
        lockdep_posixtimer_enter();
        handle_posix_cpu_timers(tsk);
        lockdep_posixtimer_exit();
}

/*
 * Variant for builds without CONFIG_POSIX_CPU_TIMERS_TASK_WORK: there is
 * no expiry mutex to wait on here, so this only does a cpu_relax().
 */
static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
        cpu_relax();
}

/*
 * Variant for builds without CONFIG_POSIX_CPU_TIMERS_TASK_WORK: drop
 * timr->it_lock, do a cpu_relax() and take the lock again.
 */
static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
        spin_unlock_irq(&timr->it_lock);
        cpu_relax();
        spin_lock_irq(&timr->it_lock);
}

/*
 * Stub for builds without CONFIG_POSIX_CPU_TIMERS_TASK_WORK: no task work
 * is ever scheduled, so this is constant false.
 */
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
        return false;
}

/*
 * Stub for builds without CONFIG_POSIX_CPU_TIMERS_TASK_WORK: always
 * returns true, which ends the collection loop in
 * handle_posix_cpu_timers() after a single pass.
 */
static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
                                                unsigned long start)
{
        return true;
}
#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */

/*
 * Expire the CPU timers of @tsk: collect all expired per-thread and
 * process wide timers onto a private list while holding the sighand lock,
 * then fire them one by one under their own timer lock with the sighand
 * lock dropped.
 *
 * Returns without doing anything when the sighand of @tsk cannot be
 * locked.
 */
static void handle_posix_cpu_timers(struct task_struct *tsk)
{
        struct k_itimer *timer, *next;
        unsigned long flags, start;
        LIST_HEAD(firing);

        if (!lock_task_sighand(tsk, &flags))
                return;

        do {
                /*
                 * On RT locking sighand lock does not disable interrupts,
                 * so this needs to be careful vs. ticks. Store the current
                 * jiffies value.
                 */
                start = READ_ONCE(jiffies);
                barrier();

                /*
                 * Here we take all the timers that are firing off the
                 * per-thread (tsk->posix_cputimers) and process wide
                 * (tsk->signal->posix_cputimers) queues, and
                 * put them on the firing list.
                 */
                check_thread_timers(tsk, &firing);

                check_process_timers(tsk, &firing);

                /*
                 * The above timer checks have updated the expiry cache and
                 * because nothing can have queued or modified timers after
                 * sighand lock was taken above it is guaranteed to be
                 * consistent. So the next timer interrupt fastpath check
                 * will find valid data.
                 *
                 * If timer expiry runs in the timer interrupt context then
                 * the loop is not relevant as timers will be directly
                 * expired in interrupt context. The stub function below
                 * returns always true which allows the compiler to
                 * optimize the loop out.
                 *
                 * If timer expiry is deferred to task work context then
                 * the following rules apply:
                 *
                 * - On !RT kernels no tick can have happened on this CPU
                 *   after sighand lock was acquired because interrupts are
                 *   disabled. So reenabling task work before dropping
                 *   sighand lock and reenabling interrupts is race free.
                 *
                 * - On RT kernels ticks might have happened but the tick
                 *   work ignored posix CPU timer handling because the
                 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
                 *   must be done very carefully including a check whether
                 *   ticks have happened since the start of the timer
                 *   expiry checks. posix_cpu_timers_enable_work() takes
                 *   care of that and eventually lets the expiry checks
                 *   run again.
                 */
        } while (!posix_cpu_timers_enable_work(tsk, start));

        /*
         * We must release sighand lock before taking any timer's lock.
         * There is a potential race with timer deletion here, as the
         * siglock now protects our private firing list.  We have set
         * the firing flag in each timer, so that a deletion attempt
         * that gets the timer lock before we do will give it up and
         * spin until we've taken care of that timer below.
         */
        unlock_task_sighand(tsk, &flags);

        /*
         * Now that all the timers on our list have the firing flag,
         * no one will touch their list entries but us.  We'll take
         * each timer's lock before clearing its firing flag, so no
         * timer call will interfere.
         */
        list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
                int cpu_firing;

                /*
                 * spin_lock() is sufficient here even independent of the
                 * expiry context. If expiry happens in hard interrupt
                 * context it's obvious. For task work context it's safe
                 * because all other operations on timer::it_lock happen in
                 * task context (syscall or exit).
                 */
                spin_lock(&timer->it_lock);
                list_del_init(&timer->it.cpu.elist);
                cpu_firing = timer->it.cpu.firing;
                timer->it.cpu.firing = 0;
                /*
                 * The firing flag is -1 if we collided with a reset
                 * of the timer, which already reported this
                 * almost-firing as an overrun.  So don't generate an event.
                 */
                if (likely(cpu_firing >= 0))
                        cpu_timer_fire(timer);
                /* See posix_cpu_timer_wait_running() */
                rcu_assign_pointer(timer->it.cpu.handling, NULL);
                spin_unlock(&timer->it_lock);
        }
}

/*
 * Entry point from the timer interrupt handler, which has already updated
 * the counts of the current task. Checks whether any CPU timer of current
 * has to fire now. Interrupts are disabled.
 */
void run_posix_cpu_timers(void)
{
        struct task_struct *tsk = current;

        lockdep_assert_irqs_disabled();

        /*
         * Make sure release_task(tsk) can't happen while
         * handle_posix_cpu_timers() is running. Otherwise, a concurrent
         * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
         * miss timer->it.cpu.firing != 0.
         */
        if (tsk->exit_state)
                return;

        /*
         * When the actual expiry is deferred to task work context and the
         * work is scheduled already, there is nothing to do here.
         */
        if (posix_cpu_timers_work_scheduled(tsk))
                return;

        /*
         * Only go on when the fast path check found an expired thread or
         * thread group timer.
         */
        if (fastpath_timer_check(tsk))
                __run_posix_cpu_timers(tsk);
}

/*
 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
 *
 * @tsk:    task whose signal struct holds the expiry cache to update
 * @clkid:  clock index; values >= CPUCLOCK_SCHED trigger a one-time
 *          warning and are ignored
 * @newval: new expiry value; when @oldval is given it is relative on entry
 *          and rewritten to an absolute value
 * @oldval: optional previous expiry value; absolute on entry and rewritten
 *          to a relative value
 *
 * The caller must hold tsk->sighand->siglock.
 */
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
                           u64 *newval, u64 *oldval)
{
        u64 *cached_expiry;
        u64 sample;

        if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
                return;

        cached_expiry = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
        sample = cpu_clock_sample_group(clkid, tsk, true);

        /* A non-NULL @oldval means that an itimer is being set. */
        if (oldval) {
                /*
                 * Turn a pending old expiry from absolute into relative.
                 * An expiry which is already due (just about to fire) is
                 * reported as TICK_NSEC.
                 */
                if (*oldval) {
                        if (*oldval > sample)
                                *oldval -= sample;
                        else
                                *oldval = TICK_NSEC;
                }

                /* A zero new value leaves nothing to arm. */
                if (!*newval)
                        return;

                /* Turn the relative new value into an absolute one. */
                *newval += sample;
        }

        /*
         * Pull the cached next expiry forward when the new value is the
         * earliest one.  The CPUCLOCK_PROF expiry cache is also used by
         * RLIMIT_CPU.
         */
        if (*cached_expiry > *newval)
                *cached_expiry = *newval;

        tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
}

/*
 * Sleep on a CPU clock by arming a temporary, on-stack timer and waiting
 * for it to expire.
 *
 * @which_clock: clock id stored in the temporary timer
 * @flags:       passed through to posix_cpu_timer_set()
 * @rqtp:        requested expiry, used as the timer's it_value
 *
 * Returns 0 when the timer expired, the error from
 * posix_cpu_timer_create() or posix_cpu_timer_set() when setup failed,
 * and otherwise (signal pending with time remaining) either
 * -ERESTART_RESTARTBLOCK or, when the restart block's nanosleep type is
 * not TT_NONE, the result of nanosleep_copyout().
 *
 * On the interrupted path the expiry sampled before disarming is saved in
 * current->restart_block.nanosleep.expires.
 */
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
                            const struct timespec64 *rqtp)
{
        struct itimerspec64 it;
        struct k_itimer timer;
        u64 expires;
        int error;

        /*
         * Set up a temporary timer and then wait for it to go off.
         */
        memset(&timer, 0, sizeof timer);
        spin_lock_init(&timer.it_lock);
        timer.it_clock = which_clock;
        timer.it_overrun = -1;
        error = posix_cpu_timer_create(&timer);
        /* Assigned regardless of 'error'; the result is checked just below. */
        timer.it_process = current;

        if (!error) {
                /* Static storage, hence all-zero: used to disarm the timer. */
                static struct itimerspec64 zero_it;
                struct restart_block *restart;

                memset(&it, 0, sizeof(it));
                it.it_value = *rqtp;

                spin_lock_irq(&timer.it_lock);
                error = posix_cpu_timer_set(&timer, flags, &it, NULL);
                if (error) {
                        spin_unlock_irq(&timer.it_lock);
                        return error;
                }

                /* Entered and re-checked with timer.it_lock held. */
                while (!signal_pending(current)) {
                        if (!cpu_timer_getexpires(&timer.it.cpu)) {
                                /*
                                 * Our timer fired and was reset, below
                                 * deletion can not fail.
                                 */
                                posix_cpu_timer_del(&timer);
                                spin_unlock_irq(&timer.it_lock);
                                return 0;
                        }

                        /*
                         * Block until cpu_timer_fire (or a signal) wakes us.
                         * The task state is set before the lock is dropped
                         * and schedule() is called.
                         */
                        __set_current_state(TASK_INTERRUPTIBLE);
                        spin_unlock_irq(&timer.it_lock);
                        schedule();
                        spin_lock_irq(&timer.it_lock);
                }

                /*
                 * We were interrupted by a signal.  Sample the expiry
                 * first, then disarm with the all-zero setting.
                 * NOTE(review): the last argument presumably receives the
                 * previous (remaining) timer value in 'it'; the zero check
                 * and the copyout below rely on that - confirm against
                 * posix_cpu_timer_set().
                 */
                expires = cpu_timer_getexpires(&timer.it.cpu);
                error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
                if (!error) {
                        /* Timer is now unarmed, deletion can not fail. */
                        posix_cpu_timer_del(&timer);
                } else {
                        /* Retry the deletion for as long as TIMER_RETRY is reported. */
                        while (error == TIMER_RETRY) {
                                posix_cpu_timer_wait_running_nsleep(&timer);
                                error = posix_cpu_timer_del(&timer);
                        }
                }

                spin_unlock_irq(&timer.it_lock);

                if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
                        /*
                         * It actually did fire already.
                         */
                        return 0;
                }

                error = -ERESTART_RESTARTBLOCK;
                /*
                 * Report back to the user the time still remaining.
                 */
                restart = &current->restart_block;
                restart->nanosleep.expires = expires;
                if (restart->nanosleep.type != TT_NONE)
                        error = nanosleep_copyout(restart, &it.it_value);
        }

        return error;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block);

/*
 * nsleep operation for CPU clocks.
 *
 * Rejects, with -EINVAL, a per-thread clock whose pid is 0 or the pid of
 * the calling task.  Otherwise the sleep is performed by
 * do_cpu_nanosleep().  When that reports -ERESTART_RESTARTBLOCK, an
 * absolute sleep (TIMER_ABSTIME) is converted to -ERESTARTNOHAND, while
 * for a relative sleep the restart block is prepared with the clock id
 * and posix_cpu_nsleep_restart() as the restart function.
 */
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
                            const struct timespec64 *rqtp)
{
        struct restart_block *rb;
        int ret;

        /*
         * Diagnose required errors first.
         */
        if (CPUCLOCK_PERTHREAD(which_clock)) {
                if (CPUCLOCK_PID(which_clock) == 0)
                        return -EINVAL;
                if (CPUCLOCK_PID(which_clock) == task_pid_vnr(current))
                        return -EINVAL;
        }

        ret = do_cpu_nanosleep(which_clock, flags, rqtp);
        if (ret != -ERESTART_RESTARTBLOCK)
                return ret;

        if (flags & TIMER_ABSTIME)
                return -ERESTARTNOHAND;

        rb = &current->restart_block;
        rb->nanosleep.clockid = which_clock;
        set_restart_fn(rb, posix_cpu_nsleep_restart);

        return ret;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
        clockid_t which_clock = restart_block->nanosleep.clockid;
        struct timespec64 t;

        t = ns_to_timespec64(restart_block->nanosleep.expires);

        return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}

/*
 * Fixed clock ids built on CPUCLOCK_SCHED with a first argument of 0.
 * NOTE(review): 0 presumably selects the calling process/thread - confirm
 * against make_process_cpuclock()/make_thread_cpuclock().
 */
#define PROCESS_CLOCK        make_process_cpuclock(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK        make_thread_cpuclock(0, CPUCLOCK_SCHED)

/*
 * clock_getres wrapper pinned to PROCESS_CLOCK.  @which_clock is ignored;
 * the result of posix_cpu_clock_getres() is returned with @tp filled in
 * by it.
 */
static int process_cpu_clock_getres(const clockid_t which_clock,
                                    struct timespec64 *tp)
{
        return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
/*
 * clock_get wrapper pinned to PROCESS_CLOCK.  @which_clock is ignored;
 * returns whatever posix_cpu_clock_get() returns for @tp.
 */
static int process_cpu_clock_get(const clockid_t which_clock,
                                 struct timespec64 *tp)
{
        return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
/*
 * timer_create wrapper: overwrites @timer->it_clock with PROCESS_CLOCK
 * and then defers to posix_cpu_timer_create().
 */
static int process_cpu_timer_create(struct k_itimer *timer)
{
        timer->it_clock = PROCESS_CLOCK;
        return posix_cpu_timer_create(timer);
}
/*
 * nsleep wrapper pinned to PROCESS_CLOCK.  @which_clock is ignored;
 * @flags and @rqtp are passed through to posix_cpu_nsleep().
 */
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
                              const struct timespec64 *rqtp)
{
        return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
}
/*
 * clock_getres wrapper pinned to THREAD_CLOCK.  @which_clock is ignored;
 * the result of posix_cpu_clock_getres() is returned with @tp filled in
 * by it.
 */
static int thread_cpu_clock_getres(const clockid_t which_clock,
                                   struct timespec64 *tp)
{
        return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
/*
 * clock_get wrapper pinned to THREAD_CLOCK.  @which_clock is ignored;
 * returns whatever posix_cpu_clock_get() returns for @tp.
 */
static int thread_cpu_clock_get(const clockid_t which_clock,
                                struct timespec64 *tp)
{
        return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
/*
 * timer_create wrapper: overwrites @timer->it_clock with THREAD_CLOCK
 * and then defers to posix_cpu_timer_create().
 */
static int thread_cpu_timer_create(struct k_itimer *timer)
{
        timer->it_clock = THREAD_CLOCK;
        return posix_cpu_timer_create(timer);
}

/*
 * Operations table wiring every listed k_clock operation to the generic
 * posix_cpu_*() implementations, which take the clock id from their
 * caller.
 */
const struct k_clock clock_posix_cpu = {
        .clock_getres                = posix_cpu_clock_getres,
        .clock_set                = posix_cpu_clock_set,
        .clock_get_timespec        = posix_cpu_clock_get,
        .timer_create                = posix_cpu_timer_create,
        .nsleep                        = posix_cpu_nsleep,
        .timer_set                = posix_cpu_timer_set,
        .timer_del                = posix_cpu_timer_del,
        .timer_get                = posix_cpu_timer_get,
        .timer_rearm                = posix_cpu_timer_rearm,
        .timer_wait_running        = posix_cpu_timer_wait_running,
};

/*
 * Operations table using the process_cpu_*() wrappers, which always act
 * on PROCESS_CLOCK.  Members not named here are left zero-initialized.
 */
const struct k_clock clock_process = {
        .clock_getres                = process_cpu_clock_getres,
        .clock_get_timespec        = process_cpu_clock_get,
        .timer_create                = process_cpu_timer_create,
        .nsleep                        = process_cpu_nsleep,
};

/*
 * Operations table using the thread_cpu_*() wrappers, which always act on
 * THREAD_CLOCK.  No .nsleep is provided; members not named here are left
 * zero-initialized.
 */
const struct k_clock clock_thread = {
        .clock_getres                = thread_cpu_clock_getres,
        .clock_get_timespec        = thread_cpu_clock_get,
        .timer_create                = thread_cpu_timer_create,
};






























    2 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * memory buffer pool support
 */
#ifndef _LINUX_MEMPOOL_H
#define _LINUX_MEMPOOL_H

#include <linux/wait.h>
#include <linux/compiler.h>

struct kmem_cache;

typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);

/*
 * Memory pool descriptor.
 *
 * A pool counts as initialized once 'elements' is non-NULL (see
 * mempool_initialized()).  'alloc' and 'free' are the element callbacks
 * and 'pool_data' is the opaque value matching their pool_data parameter.
 */
typedef struct mempool_s {
        /* NOTE(review): presumably protects the fields below - confirm in the implementation */
        spinlock_t lock;
        int min_nr;                /* nr of elements at *elements */
        int curr_nr;                /* Current nr of elements at *elements */
        void **elements;        /* array of element pointers; NULL until initialized */

        void *pool_data;        /* opaque value for the alloc/free callbacks */
        mempool_alloc_t *alloc;        /* element allocation callback */
        mempool_free_t *free;        /* element release callback */
        /* NOTE(review): presumably where allocators wait for a free element - confirm */
        wait_queue_head_t wait;
} mempool_t;

/* Report whether @pool has been set up, i.e. its element array is non-NULL. */
static inline bool mempool_initialized(mempool_t *pool)
{
        void **slots = pool->elements;

        return !!slots;
}

void mempool_exit(mempool_t *pool);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                      mempool_free_t *free_fn, void *pool_data,
                      gfp_t gfp_mask, int node_id);
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                 mempool_free_t *free_fn, void *pool_data);

extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                        mempool_free_t *free_fn, void *pool_data);
extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
                        mempool_free_t *free_fn, void *pool_data,
                        gfp_t gfp_mask, int nid);

extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
extern void mempool_free(void *element, mempool_t *pool);

/*
 * A mempool_alloc_t and mempool_free_t that get the memory from
 * a slab cache that is passed in through pool_data.
 * Note: the slab cache may not have a ctor function.
 */
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
void mempool_free_slab(void *element, void *pool_data);

/*
 * Call mempool_init() on @pool with the slab allocation/free helpers as
 * callbacks and the slab cache @kc as their pool_data.  Returns the
 * result of mempool_init().
 */
static inline int
mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
{
        void *pool_data = (void *) kc;

        return mempool_init(pool, min_nr, mempool_alloc_slab, mempool_free_slab,
                            pool_data);
}

/*
 * Call mempool_create() with the slab allocation/free helpers as
 * callbacks and the slab cache @kc as their pool_data.  Returns the
 * result of mempool_create().
 */
static inline mempool_t *
mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
{
        void *pool_data = (void *) kc;

        return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
                              pool_data);
}

/*
 * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the
 * amount of memory specified by pool_data
 */
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kfree(void *element, void *pool_data);

/*
 * Call mempool_init() on @pool with mempool_kmalloc()/mempool_kfree() as
 * callbacks; @size is carried to them encoded in the pool_data pointer.
 * Returns the result of mempool_init().
 */
static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
        void *pool_data = (void *) size;

        return mempool_init(pool, min_nr, mempool_kmalloc, mempool_kfree,
                            pool_data);
}

/*
 * Call mempool_create() with mempool_kmalloc()/mempool_kfree() as
 * callbacks; @size is carried to them encoded in the pool_data pointer.
 * Returns the result of mempool_create().
 */
static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
{
        void *pool_data = (void *) size;

        return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
                              pool_data);
}

/*
 * A mempool_alloc_t and mempool_free_t for a simple page allocator that
 * allocates pages of the order specified by pool_data
 */
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
void mempool_free_pages(void *element, void *pool_data);

/*
 * Call mempool_init() on @pool with the page allocation/free helpers as
 * callbacks; @order is carried to them encoded in the pool_data pointer.
 * Returns the result of mempool_init().
 */
static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
{
        void *pool_data = (void *)(long)order;

        return mempool_init(pool, min_nr, mempool_alloc_pages,
                            mempool_free_pages, pool_data);
}

/*
 * Call mempool_create() with the page allocation/free helpers as
 * callbacks; @order is carried to them encoded in the pool_data pointer.
 * Returns the result of mempool_create().
 */
static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
{
        void *pool_data = (void *)(long)order;

        return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
                              pool_data);
}

#endif /* _LINUX_MEMPOOL_H */



























    1 


















    1 


    1 



    1 





    1 






    1 


    1 














    1 








    1 
    1 



    1 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include "trace.h"

/*
 * Run @actor on one segment of the file range starting at @pos: a
 * contiguous run of pages that all share the same block mapping state.
 *
 * Working on a whole mapped range at a time avoids mapping pages
 * individually, allocating per page and, most importantly, taking
 * filesystem specific locks per page; all of that cost is amortised over
 * the entire range.  The filesystem is assumed to lock whatever resources
 * it requires in ->iomap_begin and to release them in ->iomap_end.
 *
 * Returns the error from ->iomap_begin if the mapping lookup failed.
 * Otherwise returns the actor's result when that is non-zero (-EIO stands
 * in for it when the returned mapping is invalid), and else the result of
 * ->iomap_end (0 when the filesystem provides none).
 */
loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
                const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
        struct iomap iomap = { .type = IOMAP_HOLE };
        struct iomap srcmap = { .type = IOMAP_HOLE };
        loff_t done = 0;
        loff_t status;
        u64 limit;

        trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);

        /*
         * Map the range of @length bytes starting at @pos.  The range can
         * span multiple pages, but the returned mapping is only guaranteed
         * to describe a single type of pages (e.g. all into a hole, all
         * mapped or all unwritten).  A failure here has nothing to undo.
         *
         * If allocation is required for this range, the space is reserved
         * now so that the allocation is guaranteed to succeed later on.
         * Once data has been copied into the page cache we must not fail,
         * as that would expose transient stale data.  If the reservation
         * fails we can still back out safely at this point.
         */
        status = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
        if (status)
                return status;

        if (WARN_ON(iomap.offset > pos) || WARN_ON(iomap.length == 0)) {
                /* Invalid mapping: skip the actor, but still call ->iomap_end. */
                done = -EIO;
        } else {
                struct iomap *src;

                trace_iomap_apply_dstmap(inode, &iomap);
                if (srcmap.type != IOMAP_HOLE)
                        trace_iomap_apply_srcmap(inode, &srcmap);

                /*
                 * Trim the length to what the filesystem actually provided,
                 * as it might not be able to map the whole requested size.
                 * With a source mapping present, the earlier end of the two
                 * mappings is the limit.
                 */
                limit = iomap.offset + iomap.length;
                if (srcmap.type != IOMAP_HOLE)
                        limit = min(limit, srcmap.offset + srcmap.length);
                if (pos + length > limit)
                        length = limit - pos;

                /*
                 * The space allocation is guaranteed to succeed now, so the
                 * copy-in can proceed page by page without worrying about
                 * failures exposing transient data.
                 *
                 * To support COW operations, data for partial blocks is read
                 * from the srcmap if the file system filled it in.  If the
                 * file system did not provide a srcmap, the normal iomap is
                 * passed in its place so that actors need no special
                 * handling for the two cases.
                 */
                if (srcmap.type != IOMAP_HOLE)
                        src = &srcmap;
                else
                        src = &iomap;

                done = actor(inode, pos, length, data, &iomap, src);
        }

        /*
         * Commit the range that has been processed.  This should not fail
         * unless the filesystem has had a fatal error.  Negative results
         * are reported to ->iomap_end as zero bytes written.
         */
        if (ops->iomap_end) {
                status = ops->iomap_end(inode, pos, length,
                                        done > 0 ? done : 0,
                                        flags, &iomap);
        }

        if (done)
                return done;
        return status;
}

/*
 * Account for the previous iteration (if there was one) and reset @iter
 * for the next mapping lookup.
 *
 * Returns 1 when another mapping should be looked up, iter->processed
 * when the previous iteration processed nothing or failed (<= 0), -EIO
 * when it claims to have processed more than the mapping covers, and 0
 * once the remaining length has dropped to zero.
 */
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
        /* A zero mapping length means there is no previous iteration. */
        if (!iter->iomap.length)
                goto reset;

        if (iter->processed <= 0)
                return iter->processed;
        if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
                return -EIO;

        iter->pos += iter->processed;
        iter->len -= iter->processed;
        if (iter->len == 0)
                return 0;

reset:
        /* Clear the per-iteration state before the next lookup. */
        memset(&iter->srcmap, 0, sizeof(iter->srcmap));
        memset(&iter->iomap, 0, sizeof(iter->iomap));
        iter->processed = 0;
        return 1;
}

static inline void iomap_iter_done(struct iomap_iter *iter)
{
        WARN_ON_ONCE(iter->iomap.offset > iter->pos);
        WARN_ON_ONCE(iter->iomap.length == 0);
        WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);

        trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
        if (iter->srcmap.type != IOMAP_HOLE)
                trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
}

/**
 * iomap_iter - iterate over ranges in a file
 * @iter: iteration structure
 * @ops: iomap ops provided by the file system
 *
 * Iterate over filesystem-provided space mappings for the provided file range.
 *
 * This function handles cleanup of resources acquired for iteration when the
 * filesystem indicates there are no more space mappings, which means that this
 * function must be called in a loop that continues as long as it returns a
 * positive value.  If 0 or a negative value is returned, the caller must not
 * return to the loop body.  Within a loop body, there are two ways to break
 * out of the loop body:  leave @iter.processed unchanged, or set it to a
 * negative errno.
 */
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
        int ret;

        /*
         * Finish the previous mapping, if there was one.  An ->iomap_end
         * failure is only returned when the iteration processed nothing.
         */
        if (iter->iomap.length && ops->iomap_end) {
                ret = ops->iomap_end(iter->inode, iter->pos,
                                     iomap_length(iter),
                                     iter->processed > 0 ? iter->processed : 0,
                                     iter->flags, &iter->iomap);
                if (!iter->processed && ret < 0)
                        return ret;
        }

        trace_iomap_iter(iter, ops, _RET_IP_);

        /* Only look up the next mapping when advancing asks for it. */
        ret = iomap_iter_advance(iter);
        if (ret > 0) {
                ret = ops->iomap_begin(iter->inode, iter->pos, iter->len,
                                       iter->flags, &iter->iomap,
                                       &iter->srcmap);
                if (ret >= 0) {
                        iomap_iter_done(iter);
                        ret = 1;
                }
        }

        return ret;
}























































































































































































































































































































































































































































































































































































    9 




















    5 




















    1 
    1 














































































































































































    5 



































    1 
    1 





    1 












































    1 

    1 


    1 







    1 









    1 








    1 












    1 















    1 

















    1 



































    5 




    5 















    1 
    1 









    1 
    1 





























    2 


    2 




    2 










    2 

    2 
    2 
    2 
















































































































    1 
    1 
    1 











































    1 
    1 
    1 































    1 
    1 
    1 




    4 
    4 
    4 







    1 
    1 



















































































    2 






























































































    3 







    9 




    9 









    1 

    1 

    1 






    1 
















































































































    3 







    1 




    1 







    1 

    1 


























































































































































    1 
    1 







































































    1 
















































































































































    9 

    5 































    1 

















    1 
    1 
    1 
















    1 







































































    1 





    1 































    1 



































    1 






























    1 

    1 
    1 








    1 












































































































































































































































































    1 


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/integrity.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/msg.h>
#include <net/flow.h>

/*
 * NOTE(review): MAX_LSM_EVM_XATTR is not referenced in the part of the
 * file visible here; presumably it bounds an xattr array used further
 * down -- confirm against its user before changing it.
 */
#define MAX_LSM_EVM_XATTR        2

/*
 * How many LSMs were built into the kernel?
 *
 * Computed as the number of struct lsm_info entries that lie between the
 * __start_lsm_info and __end_lsm_info markers.
 */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * The table is indexed directly by the LOCKDOWN_* reason value and is
 * sized so that LOCKDOWN_CONFIDENTIALITY_MAX is its last valid index.
 * Any index without an initializer below holds a NULL pointer.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_XEN_USER_ACTIONS] = "Xen guest user action",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/*
 * One hlist head per LSM hook.  The heads are initialized in
 * early_security_init() and populated by security_add_hooks().
 */
struct security_hook_heads security_hook_heads __lsm_ro_after_init;
/* Notifier chain driven by call_blocking_lsm_notifier(). */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches for the composite file and inode blobs.  They are created in
 * ordered_lsm_init() only when the corresponding blob size is non-zero, so
 * either pointer may remain NULL.
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/* Comma-separated list of LSM names, grown by lsm_append(). */
char *lsm_names;
/* Accumulated size of each composite blob, summed over the enabled LSMs. */
static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;

/* Boot-time LSM user choice */
/* Set by the "lsm=" and "security=" boot parameters respectively. */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

/* Build-time default order, used when "lsm=" is not given. */
static __initconst const char * const builtin_lsm_order = CONFIG_LSM;

/* Ordered list of LSMs to initialize. */
static __initdata struct lsm_info **ordered_lsms;
/* First enabled LSM carrying LSM_FLAG_EXCLUSIVE, recorded by prepare_lsm(). */
static __initdata struct lsm_info *exclusive;

/* Set to true by the "lsm.debug" boot parameter (see enable_debug()). */
static __initdata bool debug;
/*
 * pr_info() wrapper that only prints when LSM debugging was requested.
 * Takes the same arguments as pr_info().
 */
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether an LSM is currently marked as enabled.
 *
 * An LSM that has no enabled flag attached counts as disabled; otherwise
 * the value stored in the flag decides.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Shared storage for the enabled state of LSMs that did not supply a
 * flag of their own.  These two values are only ever pointed at, never
 * written through.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/*
 * Record the enabled state of an LSM.
 *
 * When the LSM has no flag, or its flag is one of the two shared
 * constants above, the pointer is switched to the constant matching
 * @enabled.  When the LSM owns its flag, the new state is stored there.
 */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        bool uses_shared_flag = !lsm->enabled ||
                                lsm->enabled == &lsm_enabled_true ||
                                lsm->enabled == &lsm_enabled_false;

        if (!uses_shared_flag) {
                *lsm->enabled = enabled;
                return;
        }

        lsm->enabled = enabled ? &lsm_enabled_true : &lsm_enabled_false;
}

/*
 * Check whether @lsm already appears in ordered_lsms.
 *
 * The list is scanned from the start up to its NULL terminator.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        struct lsm_info **slot = ordered_lsms;

        while (*slot) {
                if (*slot == lsm)
                        return true;
                slot++;
        }

        return false;
}

/* Index of the next free slot in ordered_lsms. */
static int last_lsm __initdata;

/*
 * Add @lsm to the end of ordered_lsms.
 *
 * @from names the source of the request and is used only in messages.
 * An LSM that is already listed is left alone, and a warning is raised
 * (and nothing added) when all LSM_COUNT slots are in use.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        bool already_listed = exists_ordered_lsm(lsm);

        /* A repeated selection is dropped without comment. */
        if (already_listed)
                return;

        if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
                return;

        /* An LSM without a flag of its own defaults to enabled. */
        if (lsm->enabled == NULL)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordering: %s (%sabled)\n", from, lsm->name,
                   is_enabled(lsm) ? "en" : "dis");
}

/*
 * Decide whether @lsm may be initialized.
 *
 * A disabled LSM is never allowed.  An LSM flagged LSM_FLAG_EXCLUSIVE is
 * refused once another exclusive LSM has been recorded in 'exclusive'.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        bool wants_exclusive;

        if (!is_enabled(lsm))
                return false;

        wants_exclusive = lsm->flags & LSM_FLAG_EXCLUSIVE;
        if (!wants_exclusive || !exclusive)
                return true;

        /* The exclusive slot is already taken. */
        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one LSM inside a composite blob.
 *
 * On entry *@need is the number of bytes the LSM asks for and *@lbs is the
 * blob size accumulated so far.  When the request is positive, *@lbs grows
 * by that amount and *@need is overwritten with the previous total, i.e.
 * the offset at which this LSM's part starts.  A request of zero or less
 * leaves both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int wanted = *need;
        int start;

        if (wanted <= 0)
                return;

        start = *lbs;
        *lbs = start + wanted;
        *need = start;
}

/*
 * Fold one LSM's blob requirements into the global blob_sizes.
 *
 * Each field of @needed is turned from a size into an offset by
 * lsm_set_blob_size().  A NULL @needed means the LSM uses no blobs.
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        if (needed == NULL)
                return;

        /*
         * The first LSM asking for inode space also causes room for an
         * rcu_head to be reserved at the start of the inode blob.
         */
        if (blob_sizes.lbs_inode == 0 && needed->lbs_inode)
                blob_sizes.lbs_inode = sizeof(struct rcu_head);

        lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
        lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
        lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
}

/*
 * Get an LSM ready for initialization.
 *
 * The outcome of lsm_allowed() is written back as the LSM's enabled state
 * so that later exclusive LSMs see it.  For an allowed LSM, the exclusive
 * slot is claimed if applicable and its blob requirements are accounted.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        bool allowed = lsm_allowed(lsm);

        set_enabled(lsm, allowed);
        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen: %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Run the init callback of @lsm when the LSM is enabled.
 *
 * A non-zero result from the callback triggers a warning; it is not
 * propagated to the caller.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int err;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        err = lsm->init();
        WARN(err, "%s failed to initialize: %d\n", lsm->name, err);
}

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 *
 * @order:  comma-separated LSM names, in the requested order
 * @origin: label naming where @order came from; used only in messages
 *
 * Steps, in order:
 *  1. every LSM_ORDER_FIRST LSM is appended;
 *  2. if "security=" was given, every LSM_FLAG_LEGACY_MAJOR LSM with a
 *     different name is disabled;
 *  3. each name in @order that matches an LSM_ORDER_MUTABLE LSM appends
 *     that LSM;
 *  4. if "security=" was given, an LSM of that name not yet listed is
 *     appended;
 *  5. every LSM that did not make it onto the list is disabled.
 *
 * Panics if the private copy of @order cannot be allocated: carrying on
 * would skip step 3 and then disable every mutable LSM in step 5 without
 * any indication of why.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        /* strsep() modifies its input, so work on a private copy. */
        sep = kstrdup(order, GFP_KERNEL);
        if (!sep)
                panic("%s: Cannot copy the %s LSM order.\n", __func__, origin);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (lsm->order == LSM_ORDER_MUTABLE &&
                            strcmp(lsm->name, name) == 0) {
                                append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s\n", origin, name);
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s disabled: %s\n", origin, lsm->name);
        }

        kfree(sep);
}

/*
 * Forward declarations: ordered_lsm_init() and security_init() call these
 * helpers before their definitions appear further down in this file.
 */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * Build the ordered LSM list and initialize every enabled LSM on it.
 *
 * The list comes from "lsm=" when that was given on the command line (in
 * which case "security=" is dropped with a notice), otherwise from the
 * built-in CONFIG_LSM order.  Each listed LSM is prepared, which settles
 * the composite blob sizes; the file and inode slab caches are then
 * created if needed, the current task's cred and task blobs are
 * allocated, and finally each LSM's init callback runs.
 *
 * Panics if the list itself cannot be allocated, since every step below
 * walks it up to its NULL terminator.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        /* One extra, zeroed slot serves as the list terminator. */
        ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
                                GFP_KERNEL);
        if (!ordered_lsms)
                panic("%s: Cannot allocate the ordered LSM list.\n", __func__);

        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_info("security= is ignored because it is superseded by lsm=\n");
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
        init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
        init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("task blob size     = %d\n", blob_sizes.lbs_task);

        /*
         * Create any kmem_caches needed for blobs
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);

        kfree(ordered_lsms);
}

/*
 * Initialize the hook list heads and bring up the early LSMs.
 *
 * security_hook_heads is treated as a flat array of hlist heads and each
 * one is initialized.  Every LSM between __start_early_lsm_info and
 * __end_early_lsm_info is then defaulted to enabled when it has no flag
 * of its own, prepared, and initialized.
 *
 * Always returns 0.
 */
int __init early_security_init(void)
{
        struct hlist_head *heads = (struct hlist_head *) &security_hook_heads;
        const size_t nr_heads = sizeof(security_hook_heads) /
                                sizeof(struct hlist_head);
        struct lsm_info *lsm;
        size_t idx;

        for (idx = 0; idx < nr_heads; idx++)
                INIT_HLIST_HEAD(&heads[idx]);

        lsm = __start_early_lsm_info;
        while (lsm < __end_early_lsm_info) {
                if (lsm->enabled == NULL)
                        lsm->enabled = &lsm_enabled_true;
                prepare_lsm(lsm);
                initialize_lsm(lsm);
                lsm++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * This should be called early in the kernel initialization sequence.
 *
 * The names of the early LSMs that are actually enabled are added to
 * lsm_names first, then the remaining LSMs are set up in their configured
 * order by ordered_lsm_init().
 *
 * Return: always 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        pr_info("Security Framework initializing\n");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                /*
                 * early_security_init() leaves every early LSM with a
                 * non-NULL enabled pointer, so the flag's value has to be
                 * tested, not the pointer.
                 */
                if (!is_enabled(lsm))
                        continue;
                /* Same failure policy as security_add_hooks(). */
                if (lsm_append(lsm->name, &lsm_names) < 0)
                        panic("%s - Cannot get early memory.\n", __func__);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * Save user chosen LSM
 *
 * Handler for the "security=" boot parameter: remembers the given string
 * in chosen_major_lsm.  The pointer is stored as is, not copied.
 * Always returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;
        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * Explicitly choose LSM initialization order.
 *
 * Handler for the "lsm=" boot parameter: remembers the given string in
 * chosen_lsm_order.  The pointer is stored as is, not copied.
 * Always returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;
        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * Enable LSM order debugging.
 *
 * Handler for the "lsm.debug" boot parameter: turns on the output of
 * init_debug().  @str is not examined.  Always returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;
        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Check whether @lsm equals the last entry of the comma-separated @list.
 *
 * The last entry is whatever follows the final comma, or the whole string
 * when there is no comma.  Warns and returns false if either argument is
 * NULL.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *comma;
        const char *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        comma = strrchr(list, ',');
        /* Step over the comma; strcmp() copes with an empty tail. */
        tail = comma ? comma + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append @new to the comma-separated, heap-allocated string *@result.
 *
 * An empty (NULL) *@result becomes a copy of @new.  Otherwise, nothing is
 * done when @new already is the last entry; if it is not, a new
 * "<old>,<new>" string replaces *@result and the old string is freed.
 *
 * Returns 0 on success or -ENOMEM when allocation fails, in which case
 * *@result keeps its previous value.
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        if (*result != NULL) {
                /* Check if it is the last registered name */
                if (match_last_lsm(*result, new))
                        return 0;

                joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
                if (!joined)
                        return -ENOMEM;
                kfree(*result);
        } else {
                joined = kstrdup(new, GFP_KERNEL);
                if (!joined)
                        return -ENOMEM;
        }

        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a module's hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsm: the name of the security module
 *
 * Each LSM has to register its hooks with the infrastructure.
 *
 * Every entry of @hooks is tagged with @lsm and added to the tail of the
 * list its head pointer refers to.  Once the slab allocator is available,
 * @lsm is also recorded in lsm_names; failing to do so is fatal.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                                char *lsm)
{
        int i;

        for (i = 0; i < count; i++) {
                struct security_hook_list *hook = &hooks[i];

                hook->lsm = lsm;
                hlist_add_tail_rcu(&hook->list, hook->head);
        }

        /*
         * Don't try to append during early_security_init(), we'll come back
         * and fix this up afterwards.
         */
        if (!slab_is_available())
                return;

        if (lsm_append(lsm, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/*
 * Run the blocking LSM notifier chain with @event and @data.
 *
 * Returns whatever blocking_notifier_call_chain() returns.
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        return blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                            event, data);
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/*
 * Add @nb to the blocking LSM notifier chain.
 *
 * Returns whatever blocking_notifier_chain_register() returns.
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                                nb);
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/*
 * Remove @nb from the blocking LSM notifier chain.
 *
 * Returns whatever blocking_notifier_chain_unregister() returns.
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                  nb);
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate a zeroed cred blob big enough for all the modules.  When no
 * module asked for cred space, @cred->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        if (blob_sizes.lbs_cred != 0) {
                cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
                return cred->security ? 0 : -ENOMEM;
        }

        cred->security = NULL;
        return 0;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules with GFP_KERNEL.  A failed
 * allocation is fatal.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL) != 0)
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate a zeroed file blob for all the modules from lsm_file_cache.
 * When that cache was never created, @file->f_security is set to NULL
 * instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        if (lsm_file_cache) {
                file->f_security = kmem_cache_zalloc(lsm_file_cache,
                                                     GFP_KERNEL);
                return file->f_security ? 0 : -ENOMEM;
        }

        file->f_security = NULL;
        return 0;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 *
 * Allocate a zeroed inode blob for all the modules from lsm_inode_cache,
 * using GFP_NOFS.  When that cache was never created, @inode->i_security
 * is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
int lsm_inode_alloc(struct inode *inode)
{
        if (lsm_inode_cache) {
                inode->i_security = kmem_cache_zalloc(lsm_inode_cache,
                                                      GFP_NOFS);
                return inode->i_security ? 0 : -ENOMEM;
        }

        inode->i_security = NULL;
        return 0;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate a zeroed task blob big enough for all the modules.  When no
 * module asked for task space, @task->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        if (blob_sizes.lbs_task != 0) {
                task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
                return task->security ? 0 : -ENOMEM;
        }

        task->security = NULL;
        return 0;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate a zeroed ipc blob big enough for all the modules.  When no
 * module asked for ipc space, @kip->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        if (blob_sizes.lbs_ipc != 0) {
                kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
                return kip->security ? 0 : -ENOMEM;
        }

        kip->security = NULL;
        return 0;
}

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate a zeroed msg_msg blob big enough for all the modules.  When no
 * module asked for msg_msg space, @mp->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        if (blob_sizes.lbs_msg_msg != 0) {
                mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
                return mp->security ? 0 : -ENOMEM;
        }

        mp->security = NULL;
        return 0;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules.  A failed allocation is
 * fatal.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task) != 0)
                panic("%s: Early task alloc failed.\n", __func__);
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 *
 * LSM_HOOK() is defined here only for the duration of the include below:
 * each entry with an int return type expands to
 * "static const int <hook_name>_default = (DEFAULT);", each entry with a
 * void return type expands to nothing.  LSM_HOOK is undefined again right
 * after the include.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.
 *        Every entry on the security_hook_heads.FUNC list is called in
 *        list order with the given arguments.
 *
 * call_int_hook:
 *        This is a hook that returns a value.
 *        Entries on the security_hook_heads.FUNC list are called in list
 *        order until one returns non-zero; that value is the result and
 *        the remaining entries are skipped.  If every entry returns 0 the
 *        result is 0; if the list is empty the result is IRC.
 *
 * Both macros declare a local named P (and call_int_hook one named RC),
 * and the arguments are expanded once per list entry, so arguments should
 * neither use those names nor have side effects.
 */

#define call_void_hook(FUNC, ...)                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \
                        P->hook.FUNC(__VA_ARGS__);                \
        } while (0)

#define call_int_hook(FUNC, IRC, ...) ({                        \
        int RC = IRC;                                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
                        RC = P->hook.FUNC(__VA_ARGS__);                \
                        if (RC != 0)                                \
                                break;                                \
                }                                                \
        } while (0);                                                \
        RC;                                                        \
})

/* Security operations */

/**
 * security_binder_set_context_mgr - run the binder_set_context_mgr hooks
 * @mgr: credentials handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        return call_int_hook(binder_set_context_mgr, 0, mgr);
}

/**
 * security_binder_transaction - run the binder_transaction hooks
 * @from: first credentials argument, handed unchanged to every hook
 * @to: second credentials argument, handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        return call_int_hook(binder_transaction, 0, from, to);
}

/**
 * security_binder_transfer_binder - run the binder_transfer_binder hooks
 * @from: first credentials argument, handed unchanged to every hook
 * @to: second credentials argument, handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        return call_int_hook(binder_transfer_binder, 0, from, to);
}

/**
 * security_binder_transfer_file - run the binder_transfer_file hooks
 * @from: first credentials argument, handed unchanged to every hook
 * @to: second credentials argument, handed unchanged to every hook
 * @file: file handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, struct file *file)
{
        return call_int_hook(binder_transfer_file, 0, from, to, file);
}

/**
 * security_ptrace_access_check - run the ptrace_access_check hooks
 * @child: task handed unchanged to every hook
 * @mode: mode value handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        return call_int_hook(ptrace_access_check, 0, child, mode);
}

/**
 * security_ptrace_traceme - run the ptrace_traceme hooks
 * @parent: task handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        return call_int_hook(ptrace_traceme, 0, parent);
}

/**
 * security_capget - run the capget hooks
 * @target: task handed unchanged to every hook
 * @effective: capability set pointer handed unchanged to every hook
 * @inheritable: capability set pointer handed unchanged to every hook
 * @permitted: capability set pointer handed unchanged to every hook
 *
 * The three capability pointers are non-const, so hooks may write through
 * them.
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_capget(struct task_struct *target,
                     kernel_cap_t *effective,
                     kernel_cap_t *inheritable,
                     kernel_cap_t *permitted)
{
        return call_int_hook(capget, 0, target,
                                effective, inheritable, permitted);
}

/**
 * security_capset - run the capset hooks
 * @new: credentials handed unchanged to every hook (non-const)
 * @old: credentials handed unchanged to every hook
 * @effective: capability set handed unchanged to every hook
 * @inheritable: capability set handed unchanged to every hook
 * @permitted: capability set handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        return call_int_hook(capset, 0, new, old,
                                effective, inheritable, permitted);
}

/**
 * security_capable - run the capable hooks
 * @cred: credentials handed unchanged to every hook
 * @ns: user namespace handed unchanged to every hook
 * @cap: capability number handed unchanged to every hook
 * @opts: option bits handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        return call_int_hook(capable, 0, cred, ns, cap, opts);
}

/**
 * security_quotactl - run the quotactl hooks
 * @cmds: value handed unchanged to every hook
 * @type: value handed unchanged to every hook
 * @id: value handed unchanged to every hook
 * @sb: superblock handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_quotactl(int cmds, int type, int id, struct super_block *sb)
{
        return call_int_hook(quotactl, 0, cmds, type, id, sb);
}

/**
 * security_quota_on - run the quota_on hooks
 * @dentry: dentry handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_quota_on(struct dentry *dentry)
{
        return call_int_hook(quota_on, 0, dentry);
}

/**
 * security_syslog - run the syslog hooks
 * @type: value handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_syslog(int type)
{
        return call_int_hook(syslog, 0, type);
}

/**
 * security_settime64 - run the settime hooks
 * @ts: time value handed unchanged to every hook
 * @tz: timezone handed unchanged to every hook
 *
 * Note that the hook list used is named "settime", not "settime64".
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        return call_int_hook(settime, 0, ts, tz);
}

/**
 * security_vm_enough_memory_mm - consult the LSMs, then __vm_enough_memory()
 * @mm: mm handed to every hook and to __vm_enough_memory()
 * @pages: page count handed to every hook and to __vm_enough_memory()
 *
 * Each vm_enough_memory hook answers with a positive value when it thinks
 * __vm_enough_memory() should be called with cap_sys_admin set.  The flag
 * stays set only if every hook agrees (or no hook is registered); the
 * first hook that returns zero or a negative value clears it and ends the
 * walk.
 *
 * Return: the result of __vm_enough_memory().
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct security_hook_list *hp;
        int cap_sys_admin = 1;

        hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) {
                if (hp->hook.vm_enough_memory(mm, pages) > 0)
                        continue;

                /* One dissenting module is enough to drop the flag. */
                cap_sys_admin = 0;
                break;
        }

        return __vm_enough_memory(mm, pages, cap_sys_admin);
}

/**
 * security_bprm_creds_for_exec - run the bprm_creds_for_exec hooks
 * @bprm: binprm handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        return call_int_hook(bprm_creds_for_exec, 0, bprm);
}

/**
 * security_bprm_creds_from_file - run the bprm_creds_from_file hooks
 * @bprm: binprm handed unchanged to every hook
 * @file: file handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        return call_int_hook(bprm_creds_from_file, 0, bprm, file);
}

/**
 * security_bprm_check - run the bprm_check_security hooks, then IMA
 * @bprm: binprm handed unchanged to every hook and to ima_bprm_check()
 *
 * Return: the first non-zero hook result if any hook objects; otherwise
 * the result of ima_bprm_check().
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int ret = call_int_hook(bprm_check_security, 0, bprm);

        /* IMA is only consulted once every LSM hook has accepted. */
        return ret ? ret : ima_bprm_check(bprm);
}

/**
 * security_bprm_committing_creds - run the bprm_committing_creds hooks
 * @bprm: binprm handed unchanged to every hook
 *
 * Every registered hook is called in list order; the hooks return nothing.
 */
void security_bprm_committing_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds - run the bprm_committed_creds hooks
 * @bprm: binprm handed unchanged to every hook
 *
 * Every registered hook is called in list order; the hooks return nothing.
 */
void security_bprm_committed_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_dup - run the fs_context_dup hooks
 * @fc: first fs_context argument, handed unchanged to every hook
 * @src_fc: second fs_context argument, handed unchanged to every hook
 *
 * Return: 0 if no hook is registered or every hook returns 0; otherwise
 * the first non-zero hook result (later hooks are not called).
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        return call_int_hook(fs_context_dup, 0, fc, src_fc);
}

/*
 * security_fs_context_parse_param - offer @param to every
 * fs_context_parse_param hook.
 *
 * Each hook result is classified as follows:
 *   -ENOPARAM   - ignored, the next hook is tried;
 *   0           - remembered as the overall result, iteration continues;
 *   other value - returned immediately.
 *
 * Returns 0 if at least one hook returned 0 and none returned another
 * error, -ENOPARAM if no hook returned 0, or the first other value seen.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct security_hook_list *entry;
        int result = -ENOPARAM;

        hlist_for_each_entry(entry, &security_hook_heads.fs_context_parse_param,
                             list) {
                int hook_rc = entry->hook.fs_context_parse_param(fc, param);

                if (hook_rc == -ENOPARAM)
                        continue;
                if (hook_rc != 0)
                        return hook_rc;
                result = 0;
        }
        return result;
}

/*
 * security_sb_alloc - run the sb_alloc_security hook on @sb.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_alloc(struct super_block *sb)
{
        return call_int_hook(sb_alloc_security, 0, sb);
}

/*
 * security_sb_free - run the sb_free_security hook on @sb through
 * call_void_hook(); nothing is returned.
 */
void security_sb_free(struct super_block *sb)
{
        call_void_hook(sb_free_security, sb);
}

/*
 * security_free_mnt_opts - release the mount options referenced by
 * @mnt_opts.
 *
 * When *@mnt_opts is non-NULL it is handed to the sb_free_mnt_opts hook
 * and the caller's pointer is then reset to NULL.  A NULL *@mnt_opts is a
 * no-op, so calling this twice on the same pointer is harmless.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/*
 * security_sb_eat_lsm_opts - run the sb_eat_lsm_opts hook with @options
 * and @mnt_opts.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts);
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/*
 * security_sb_remount - run the sb_remount hook with @sb and @mnt_opts.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        return call_int_hook(sb_remount, 0, sb, mnt_opts);
}
EXPORT_SYMBOL(security_sb_remount);

/*
 * security_sb_kern_mount - run the sb_kern_mount hook on @sb.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_kern_mount(struct super_block *sb)
{
        return call_int_hook(sb_kern_mount, 0, sb);
}

/*
 * security_sb_show_options - run the sb_show_options hook with the seq
 * file @m and superblock @sb.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        return call_int_hook(sb_show_options, 0, m, sb);
}

/*
 * security_sb_statfs - run the sb_statfs hook on @dentry.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_statfs(struct dentry *dentry)
{
        return call_int_hook(sb_statfs, 0, dentry);
}

/*
 * security_sb_mount - run the sb_mount hook, forwarding @dev_name, @path,
 * @type, @flags and @data unchanged.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                       const char *type, unsigned long flags, void *data)
{
        return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data);
}

/*
 * security_sb_umount - run the sb_umount hook with @mnt and @flags.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        return call_int_hook(sb_umount, 0, mnt, flags);
}

/*
 * security_sb_pivotroot - run the sb_pivotroot hook with @old_path and
 * @new_path.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        return call_int_hook(sb_pivotroot, 0, old_path, new_path);
}

/*
 * security_sb_set_mnt_opts - run the sb_set_mnt_opts hook on @sb.
 *
 * The value handed to call_int_hook() as its second argument depends on
 * @mnt_opts: -EOPNOTSUPP when options were supplied, 0 when @mnt_opts is
 * NULL (presumably so that options are rejected when no module can
 * consume them - confirm against the macro definition).
 *
 * Returns the call_int_hook() result.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        int initial_rc = 0;

        if (mnt_opts)
                initial_rc = -EOPNOTSUPP;

        return call_int_hook(sb_set_mnt_opts, initial_rc, sb, mnt_opts,
                             kern_flags, set_kern_flags);
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/*
 * security_sb_clone_mnt_opts - run the sb_clone_mnt_opts hook, forwarding
 * @oldsb, @newsb, @kern_flags and @set_kern_flags unchanged.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                                struct super_block *newsb,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        return call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb,
                                kern_flags, set_kern_flags);
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/*
 * security_add_mnt_opt - run the sb_add_mnt_opt hook with @option, @val,
 * @len and @mnt_opts.
 *
 * Returns the call_int_hook() result; -EINVAL (not 0) is passed as the
 * macro's second argument here.
 */
int security_add_mnt_opt(const char *option, const char *val, int len,
                         void **mnt_opts)
{
        return call_int_hook(sb_add_mnt_opt, -EINVAL,
                                        option, val, len, mnt_opts);
}
EXPORT_SYMBOL(security_add_mnt_opt);

/*
 * security_move_mount - run the move_mount hook with @from_path and
 * @to_path.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_move_mount(const struct path *from_path, const struct path *to_path)
{
        return call_int_hook(move_mount, 0, from_path, to_path);
}

/*
 * security_path_notify - run the path_notify hook with @path, @mask and
 * @obj_type.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_path_notify(const struct path *path, u64 mask,
                                unsigned int obj_type)
{
        return call_int_hook(path_notify, 0, path, mask, obj_type);
}

/*
 * security_inode_alloc - set up the security state of a new @inode.
 *
 * lsm_inode_alloc() runs first; if it fails its error is returned and no
 * hook is called.  Otherwise the inode_alloc_security hook runs, and a
 * non-zero result from it triggers security_inode_free() before that
 * result is returned.
 */
int security_inode_alloc(struct inode *inode)
{
        int rc;

        rc = lsm_inode_alloc(inode);
        if (!unlikely(rc)) {
                rc = call_int_hook(inode_alloc_security, 0, inode);
                if (unlikely(rc))
                        security_inode_free(inode);
        }
        return rc;
}

/*
 * inode_free_by_rcu - RCU callback registered by security_inode_free()
 * that returns an inode security blob to lsm_inode_cache.
 */
static void inode_free_by_rcu(struct rcu_head *head)
{
        /*
         * The rcu head is at the start of the inode blob, so @head is
         * also the address of the blob being freed.
         */
        kmem_cache_free(lsm_inode_cache, head);
}

/*
 * security_inode_free - tear down the security state of @inode.
 *
 * Calls integrity_inode_free(), then the inode_free_security hook, and
 * finally queues the i_security blob (if any) for release through RCU.
 */
void security_inode_free(struct inode *inode)
{
        integrity_inode_free(inode);
        call_void_hook(inode_free_security, inode);

        if (!inode->i_security)
                return;

        /*
         * A path walk may still hold a reference to the inode, so
         * security_inode_permission() can run after the
         * inode_free_security hook has been called.  Ideally the VFS
         * would not do that, but fixing it is a much harder job.  For
         * now the blob is released via RCU and inode->i_security is left
         * pointing at it; the inode itself is also freed only after the
         * RCU grace period.
         */
        call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu);
}

/*
 * security_dentry_init_security - run the dentry_init_security hook,
 * forwarding @dentry, @mode, @name, @ctx and @ctxlen unchanged.
 *
 * Returns the call_int_hook() result; -EOPNOTSUPP is passed as the
 * macro's second argument.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name, void **ctx,
                                        u32 *ctxlen)
{
        return call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode,
                                name, ctx, ctxlen);
}
EXPORT_SYMBOL(security_dentry_init_security);

/*
 * security_dentry_create_files_as - run the dentry_create_files_as hook,
 * forwarding @dentry, @mode, @name, @old and @new unchanged.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        return call_int_hook(dentry_create_files_as, 0, dentry, mode,
                                name, old, new);
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/*
 * security_inode_init_security - obtain the initial security xattrs for a
 * new @inode and hand them to the filesystem.
 *
 * Private inodes are skipped and 0 is returned.  Without an @initxattrs
 * callback the inode_init_security hook is called with NULL output
 * pointers and its result is returned unmodified.
 *
 * Otherwise a zeroed on-stack xattr array is filled in three steps, each
 * running only if the previous one returned 0:
 *   1. the inode_init_security hook fills slot 0;
 *   2. evm_inode_init_security() is given slot 0 and slot 1;
 *   3. @initxattrs is called with the array and @fs_data.
 *
 * Every xattr value in the array up to the first NULL value is kfree()d
 * before returning.  -EOPNOTSUPP is reported to the caller as 0; any
 * other result is returned as is.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1];
        struct xattr *lsm_slot, *evm_slot, *cur;
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        if (!initxattrs)
                return call_int_hook(inode_init_security, -EOPNOTSUPP, inode,
                                     dir, qstr, NULL, NULL, NULL);

        /* The trailing all-zero entry terminates the cleanup loop below. */
        memset(new_xattrs, 0, sizeof(new_xattrs));
        lsm_slot = new_xattrs;
        evm_slot = lsm_slot + 1;

        rc = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr,
                           &lsm_slot->name, &lsm_slot->value,
                           &lsm_slot->value_len);
        if (!rc)
                rc = evm_inode_init_security(inode, lsm_slot, evm_slot);
        if (!rc)
                rc = initxattrs(inode, new_xattrs, fs_data);

        for (cur = new_xattrs; cur->value != NULL; cur++)
                kfree(cur->value);

        if (rc == -EOPNOTSUPP)
                return 0;
        return rc;
}
EXPORT_SYMBOL(security_inode_init_security);

/*
 * security_old_inode_init_security - run the inode_init_security hook
 * with caller-supplied @name, @value and @len output pointers.
 *
 * Returns -EOPNOTSUPP for private inodes without calling any hook;
 * otherwise the call_int_hook() result, to which -EOPNOTSUPP is also
 * passed as the macro's second argument.
 */
int security_old_inode_init_security(struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr, const char **name,
                                     void **value, size_t *len)
{
        int rc = -EOPNOTSUPP;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_init_security, -EOPNOTSUPP, inode,
                                   dir, qstr, name, value, len);
        return rc;
}
EXPORT_SYMBOL(security_old_inode_init_security);

#ifdef CONFIG_SECURITY_PATH
/*
 * security_path_mknod - run the path_mknod hook for a node about to be
 * created under @dir.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @dir->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
                        unsigned int dev)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_mknod, 0, dir, dentry, mode, dev);
        return rc;
}
EXPORT_SYMBOL(security_path_mknod);

/*
 * security_path_mkdir - run the path_mkdir hook for a directory about to
 * be created under @dir.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @dir->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL(security_path_mkdir);

/*
 * security_path_rmdir - run the path_rmdir hook with @dir and @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @dir->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_rmdir, 0, dir, dentry);
        return rc;
}

/*
 * security_path_unlink - run the path_unlink hook with @dir and @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @dir->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_unlink, 0, dir, dentry);
        return rc;
}
EXPORT_SYMBOL(security_path_unlink);

/*
 * security_path_symlink - run the path_symlink hook with @dir, @dentry
 * and the link target string @old_name.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @dir->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_symlink, 0, dir, dentry, old_name);
        return rc;
}

/*
 * security_path_link - run the path_link hook with @old_dentry, @new_dir
 * and @new_dentry.
 *
 * Unlike its siblings, the private-inode test here looks at the backing
 * inode of @old_dentry rather than of the directory.  Returns 0 without
 * calling any hook when that inode is private; otherwise the
 * call_int_hook() result.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = call_int_hook(path_link, 0, old_dentry, new_dir,
                                   new_dentry);
        return rc;
}

/*
 * security_path_rename - run the path_rename hook for a rename.
 *
 * Returns 0 without calling any hook if the backing inode of @old_dentry
 * is private, or if @new_dentry is positive and its backing inode is
 * private.
 *
 * With RENAME_EXCHANGE set in @flags the hook is first run with the two
 * directory/dentry pairs swapped; a non-zero result from that pass is
 * returned immediately.  The hook is then run with the arguments in
 * their original order and that result is returned.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* An exchange also moves the target, so check the reverse. */
                rc = call_int_hook(path_rename, 0, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir,
                           new_dentry);
        return rc;
}
EXPORT_SYMBOL(security_path_rename);

/*
 * security_path_truncate - run the path_truncate hook on @path.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @path->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_truncate(const struct path *path)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_truncate, 0, path);
        return rc;
}

/*
 * security_path_chmod - run the path_chmod hook with @path and @mode.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @path->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_chmod, 0, path, mode);
        return rc;
}

/*
 * security_path_chown - run the path_chown hook with @path, @uid and @gid.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @path->dentry is private; otherwise the call_int_hook() result.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_chown, 0, path, uid, gid);
        return rc;
}

/*
 * security_path_chroot - run the path_chroot hook on @path.
 *
 * Unlike the other security_path_*() wrappers in this section there is
 * no private-inode shortcut here.  Returns the call_int_hook() result;
 * 0 is passed as the macro's second argument.
 */
int security_path_chroot(const struct path *path)
{
        return call_int_hook(path_chroot, 0, path);
}
#endif

/*
 * security_inode_create - run the inode_create hook with @dir, @dentry
 * and @mode.
 *
 * Returns 0 without calling any hook when @dir is private; otherwise the
 * call_int_hook() result.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_create, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/*
 * security_inode_link - run the inode_link hook with @old_dentry, @dir
 * and @new_dentry.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @old_dentry is private; otherwise the call_int_hook() result.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = call_int_hook(inode_link, 0, old_dentry, dir, new_dentry);
        return rc;
}

/*
 * security_inode_unlink - run the inode_unlink hook with @dir and @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of @dentry
 * is private; otherwise the call_int_hook() result.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_unlink, 0, dir, dentry);
        return rc;
}

/*
 * security_inode_symlink - run the inode_symlink hook with @dir, @dentry
 * and the link target string @old_name.
 *
 * Returns 0 without calling any hook when @dir is private; otherwise the
 * call_int_hook() result.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                            const char *old_name)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_symlink, 0, dir, dentry, old_name);
        return rc;
}

/*
 * security_inode_mkdir - run the inode_mkdir hook with @dir, @dentry and
 * @mode.
 *
 * Returns 0 without calling any hook when @dir is private; otherwise the
 * call_int_hook() result.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/*
 * security_inode_rmdir - run the inode_rmdir hook with @dir and @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of @dentry
 * is private; otherwise the call_int_hook() result.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_rmdir, 0, dir, dentry);
        return rc;
}

/*
 * security_inode_mknod - run the inode_mknod hook with @dir, @dentry,
 * @mode and @dev.
 *
 * Returns 0 without calling any hook when @dir is private; otherwise the
 * call_int_hook() result.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mknod, 0, dir, dentry, mode, dev);
        return rc;
}

/*
 * security_inode_rename - run the inode_rename hook for a rename.
 *
 * Returns 0 without calling any hook if the backing inode of @old_dentry
 * is private, or if @new_dentry is positive and its backing inode is
 * private.
 *
 * With RENAME_EXCHANGE set in @flags the hook is first run with the two
 * directory/dentry pairs swapped; a non-zero result from that pass is
 * returned immediately.  The hook is then run with the arguments in
 * their original order and that result is returned.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* An exchange also moves the target, so check the reverse. */
                rc = call_int_hook(inode_rename, 0, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(inode_rename, 0, old_dir, old_dentry,
                           new_dir, new_dentry);
        return rc;
}

/*
 * security_inode_readlink - run the inode_readlink hook on @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of @dentry
 * is private; otherwise the call_int_hook() result.
 */
int security_inode_readlink(struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_readlink, 0, dentry);
        return rc;
}

/*
 * security_inode_follow_link - run the inode_follow_link hook with
 * @dentry, @inode and the @rcu flag.
 *
 * Returns 0 without calling any hook when @inode is private; otherwise
 * the call_int_hook() result.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_follow_link, 0, dentry, inode, rcu);
        return rc;
}

/*
 * security_inode_permission - run the inode_permission hook with @inode
 * and the access @mask.
 *
 * Returns 0 without calling any hook when @inode is private; otherwise
 * the call_int_hook() result.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_permission, 0, inode, mask);
        return rc;
}

/*
 * security_inode_setattr - run the inode_setattr hook, then EVM.
 *
 * Returns 0 without calling anything when the backing inode of @dentry
 * is private.  A non-zero hook result is returned as is; otherwise the
 * result of evm_inode_setattr(@dentry, @attr) is returned.
 */
int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        rc = call_int_hook(inode_setattr, 0, dentry, attr);
        if (!rc)
                rc = evm_inode_setattr(dentry, attr);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/*
 * security_inode_getattr - run the inode_getattr hook on @path.
 *
 * Returns 0 without calling any hook when the backing inode of
 * @path->dentry is private; otherwise the call_int_hook() result.
 */
int security_inode_getattr(const struct path *path)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(inode_getattr, 0, path);
        return rc;
}

/*
 * security_inode_setxattr - check a pending xattr write on @dentry.
 *
 * Returns 0 without calling anything when the backing inode of @dentry
 * is private.  Otherwise the checks run in this order and the first
 * non-zero result is returned:
 *   1. the inode_setxattr hook (or cap_inode_setxattr(), see below);
 *   2. ima_inode_setxattr();
 *   3. evm_inode_setxattr().
 */
int security_inode_setxattr(struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack integrate the cap call, so assume that all
         * LSMs supplying this call do so.  A result of 1 (the value
         * passed to call_int_hook() as its second argument) selects the
         * plain capability check instead.
         */
        rc = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
                           flags);
        if (rc == 1)
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
        if (!rc)
                rc = ima_inode_setxattr(dentry, name, value, size);
        if (!rc)
                rc = evm_inode_setxattr(dentry, name, value, size);
        return rc;
}

/*
 * security_inode_post_setxattr - notify modules after an xattr write.
 *
 * Does nothing when the backing inode of @dentry is private.  Otherwise
 * the inode_post_setxattr hook runs first, followed by
 * evm_inode_post_setxattr() (which is not given @flags).
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry)))) {
                call_void_hook(inode_post_setxattr, dentry, name, value, size,
                               flags);
                evm_inode_post_setxattr(dentry, name, value, size);
        }
}

/*
 * security_inode_getxattr - run the inode_getxattr hook with @dentry and
 * the attribute @name.
 *
 * Returns 0 without calling any hook when the backing inode of @dentry
 * is private; otherwise the call_int_hook() result.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_getxattr, 0, dentry, name);
        return rc;
}

/*
 * security_inode_listxattr - run the inode_listxattr hook on @dentry.
 *
 * Returns 0 without calling any hook when the backing inode of @dentry
 * is private; otherwise the call_int_hook() result.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_listxattr, 0, dentry);
        return rc;
}

/*
 * security_inode_removexattr - check a pending xattr removal on @dentry.
 *
 * Returns 0 without calling anything when the backing inode of @dentry
 * is private.  Otherwise the checks run in this order and the first
 * non-zero result is returned:
 *   1. the inode_removexattr hook (or cap_inode_removexattr(), see below);
 *   2. ima_inode_removexattr();
 *   3. evm_inode_removexattr().
 */
int security_inode_removexattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack integrate the cap call, so assume that all
         * LSMs supplying this call do so.  A result of 1 (the value
         * passed to call_int_hook() as its second argument) selects the
         * plain capability check instead.
         */
        rc = call_int_hook(inode_removexattr, 1, dentry, name);
        if (rc == 1)
                rc = cap_inode_removexattr(dentry, name);
        if (!rc)
                rc = ima_inode_removexattr(dentry, name);
        if (!rc)
                rc = evm_inode_removexattr(dentry, name);
        return rc;
}

/*
 * security_inode_need_killpriv - run the inode_need_killpriv hook on
 * @dentry.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        return call_int_hook(inode_need_killpriv, 0, dentry);
}

/*
 * security_inode_killpriv - run the inode_killpriv hook on @dentry.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_inode_killpriv(struct dentry *dentry)
{
        return call_int_hook(inode_killpriv, 0, dentry);
}

/*
 * security_inode_getsecurity - ask the modules for the security attribute
 * @name of @inode.
 *
 * Private inodes get LSM_RET_DEFAULT(inode_getsecurity) straight away.
 * Otherwise the hooks are tried in list order and the first result that
 * differs from that default is returned; if every hook returns the
 * default (or none is registered) the default is returned.
 */
int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
{
        struct security_hook_list *entry;

        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_getsecurity);

        /* Only one module will provide an attribute with a given name. */
        hlist_for_each_entry(entry, &security_hook_heads.inode_getsecurity, list) {
                int hook_rc = entry->hook.inode_getsecurity(inode, name,
                                                            buffer, alloc);

                if (hook_rc == LSM_RET_DEFAULT(inode_getsecurity))
                        continue;
                return hook_rc;
        }
        return LSM_RET_DEFAULT(inode_getsecurity);
}

/*
 * security_inode_setsecurity - ask the modules to set the security
 * attribute @name of @inode from @value/@size.
 *
 * Private inodes get LSM_RET_DEFAULT(inode_setsecurity) straight away.
 * Otherwise the hooks are tried in list order and the first result that
 * differs from that default is returned; if every hook returns the
 * default (or none is registered) the default is returned.
 */
int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
{
        struct security_hook_list *entry;

        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_setsecurity);

        /* Only one module will provide an attribute with a given name. */
        hlist_for_each_entry(entry, &security_hook_heads.inode_setsecurity, list) {
                int hook_rc = entry->hook.inode_setsecurity(inode, name, value,
                                                            size, flags);

                if (hook_rc == LSM_RET_DEFAULT(inode_setsecurity))
                        continue;
                return hook_rc;
        }
        return LSM_RET_DEFAULT(inode_setsecurity);
}

/*
 * security_inode_listsecurity - run the inode_listsecurity hook with
 * @inode, @buffer and @buffer_size.
 *
 * Returns 0 without calling any hook when @inode is private; otherwise
 * the call_int_hook() result.
 */
int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_listsecurity, 0, inode, buffer,
                                   buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/*
 * security_inode_getsecid - fetch the secid of @inode into *@secid.
 *
 * *@secid is zeroed before the inode_getsecid hook runs, so the caller
 * reads 0 rather than an unset value when no module fills it in.  This
 * matches security_cred_getsecid(), security_task_getsecid() and
 * security_ipc_getsecid() in this file.
 */
void security_inode_getsecid(struct inode *inode, u32 *secid)
{
        *secid = 0;
        call_void_hook(inode_getsecid, inode, secid);
}

/*
 * security_inode_copy_up - run the inode_copy_up hook with @src and @new.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        return call_int_hook(inode_copy_up, 0, src, new);
}
EXPORT_SYMBOL(security_inode_copy_up);

/*
 * security_inode_copy_up_xattr - ask the modules what to do with the
 * xattr @name.
 *
 * Hooks are tried in list order and the first result that differs from
 * LSM_RET_DEFAULT(inode_copy_up_xattr) is returned; if every hook returns
 * the default (or none is registered) the default is returned.
 */
int security_inode_copy_up_xattr(const char *name)
{
        struct security_hook_list *entry;

        /*
         * The implementation can return 0 (accept the xattr), 1 (discard
         * the xattr), -EOPNOTSUPP if it does not know anything about the
         * xattr, or any other error code in case of an error.
         */
        hlist_for_each_entry(entry,
                &security_hook_heads.inode_copy_up_xattr, list) {
                int hook_rc = entry->hook.inode_copy_up_xattr(name);

                if (hook_rc == LSM_RET_DEFAULT(inode_copy_up_xattr))
                        continue;
                return hook_rc;
        }

        return LSM_RET_DEFAULT(inode_copy_up_xattr);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/*
 * security_kernfs_init_security - run the kernfs_init_security hook with
 * @kn_dir and @kn.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        return call_int_hook(kernfs_init_security, 0, kn_dir, kn);
}

/*
 * security_file_permission - run the file_permission hook, then fsnotify.
 *
 * A non-zero hook result is returned as is and fsnotify_perm() is not
 * called; otherwise the result of fsnotify_perm(@file, @mask) is
 * returned.
 */
int security_file_permission(struct file *file, int mask)
{
        int rc = call_int_hook(file_permission, 0, file, mask);

        if (!rc)
                rc = fsnotify_perm(file, mask);
        return rc;
}

/*
 * security_file_alloc - set up the security state of a new @file.
 *
 * lsm_file_alloc() runs first; if it fails its error is returned and no
 * hook is called.  Otherwise the file_alloc_security hook runs, and a
 * non-zero result from it triggers security_file_free() before that
 * result is returned.
 */
int security_file_alloc(struct file *file)
{
        int rc;

        rc = lsm_file_alloc(file);
        if (!rc) {
                rc = call_int_hook(file_alloc_security, 0, file);
                if (unlikely(rc))
                        security_file_free(file);
        }
        return rc;
}

/*
 * security_file_free - tear down the security state of @file.
 *
 * The file_free_security hook runs first.  If @file->f_security is then
 * non-NULL, the pointer is cleared and the blob is returned to
 * lsm_file_cache.
 */
void security_file_free(struct file *file)
{
        void *blob;

        call_void_hook(file_free_security, file);

        /* Read the pointer only after the hooks have run. */
        blob = file->f_security;
        if (!blob)
                return;

        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/*
 * security_file_ioctl - run the file_ioctl hook with @file, @cmd and @arg.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        return call_int_hook(file_ioctl, 0, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat version of security_file_ioctl() that correctly handles 32-bit
 * processes running on 64-bit kernels.  The arguments are forwarded
 * unchanged to the file_ioctl_compat hook rather than to file_ioctl.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        return call_int_hook(file_ioctl_compat, 0, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * mmap_prot - compute the protection bits a mapping will effectively get.
 *
 * Returns @prot with PROT_EXEC added when all of the following hold,
 * and @prot unchanged otherwise:
 *   - @prot has PROT_READ but not PROT_EXEC;
 *   - the current task's personality has READ_IMPLIES_EXEC;
 *   - the mapping is anonymous (@file is NULL), or @file is not on a
 *     noexec mount (and, without CONFIG_MMU, its mmap_capabilities - if
 *     provided - include NOMMU_MAP_EXEC).
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        /*
         * Do we have PROT_READ, and does the application expect it to
         * imply PROT_EXEC?  If not, there is nothing to adjust.
         */
        if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ ||
            !(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* File-backed mappings have extra conditions; anonymous ones do not. */
        if (file) {
                /* Anything on a noexec mount won't get PROT_EXEC. */
                if (path_noexec(&file->f_path))
                        return prot;
#ifndef CONFIG_MMU
                /*
                 * On !MMU we additionally need NOMMU_MAP_EXEC
                 * (== VM_MAYEXEC) when the file reports capabilities.
                 */
                if (file->f_op->mmap_capabilities &&
                    !(file->f_op->mmap_capabilities(file) & NOMMU_MAP_EXEC))
                        return prot;
#endif
        }
        return prot | PROT_EXEC;
}

/*
 * security_mmap_file - run the mmap_file hook, then IMA.
 *
 * Both the requested @prot and the value adjusted by mmap_prot() are
 * passed on.  A non-zero hook result is returned as is; otherwise the
 * result of ima_file_mmap() with the same arguments is returned.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                        unsigned long flags)
{
        unsigned long effective_prot = mmap_prot(file, prot);
        int rc;

        rc = call_int_hook(mmap_file, 0, file, prot, effective_prot, flags);
        if (!rc)
                rc = ima_file_mmap(file, prot, effective_prot, flags);
        return rc;
}

/*
 * security_mmap_addr - run the mmap_addr hook on the address @addr.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_mmap_addr(unsigned long addr)
{
        return call_int_hook(mmap_addr, 0, addr);
}

/*
 * security_file_mprotect - run the file_mprotect hook, then IMA.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * ima_file_mprotect(@vma, @prot) is returned (IMA is not given @reqprot).
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                            unsigned long prot)
{
        int rc = call_int_hook(file_mprotect, 0, vma, reqprot, prot);

        if (!rc)
                rc = ima_file_mprotect(vma, prot);
        return rc;
}

/*
 * security_file_lock - run the file_lock hook with @file and @cmd.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        return call_int_hook(file_lock, 0, file, cmd);
}

/*
 * security_file_fcntl - run the file_fcntl hook with @file, @cmd and @arg.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        return call_int_hook(file_fcntl, 0, file, cmd, arg);
}

/*
 * security_file_set_fowner - run the file_set_fowner hook on @file
 * through call_void_hook(); nothing is returned.
 */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/*
 * security_file_send_sigiotask - run the file_send_sigiotask hook with
 * @tsk, @fown and @sig.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                  struct fown_struct *fown, int sig)
{
        return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig);
}

/*
 * security_file_receive - run the file_receive hook on @file.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_file_receive(struct file *file)
{
        return call_int_hook(file_receive, 0, file);
}

/*
 * security_file_open - run the file_open hook, then fsnotify.
 *
 * A non-zero hook result is returned as is and fsnotify_perm() is not
 * called; otherwise the result of fsnotify_perm(@file, MAY_OPEN) is
 * returned.
 */
int security_file_open(struct file *file)
{
        int rc = call_int_hook(file_open, 0, file);

        if (!rc)
                rc = fsnotify_perm(file, MAY_OPEN);
        return rc;
}

/*
 * security_task_alloc - set up the security state of a new @task.
 *
 * lsm_task_alloc() runs first; if it fails its error is returned and no
 * hook is called.  Otherwise the task_alloc hook runs with @clone_flags,
 * and a non-zero result from it triggers security_task_free() before
 * that result is returned.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int rc;

        rc = lsm_task_alloc(task);
        if (!rc) {
                rc = call_int_hook(task_alloc, 0, task, clone_flags);
                if (unlikely(rc))
                        security_task_free(task);
        }
        return rc;
}

/*
 * security_task_free - tear down the security state of @task.
 *
 * The task_free hook runs first, while @task->security is still valid;
 * the blob is then kfree()d and the pointer cleared.
 */
void security_task_free(struct task_struct *task)
{
        call_void_hook(task_free, task);

        kfree(task->security);
        task->security = NULL;
}

/*
 * security_cred_alloc_blank - set up the security state of a blank @cred.
 *
 * lsm_cred_alloc() runs first with @gfp; if it fails its error is
 * returned and no hook is called.  Otherwise the cred_alloc_blank hook
 * runs, and a non-zero result from it triggers security_cred_free()
 * before that result is returned.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int rc;

        rc = lsm_cred_alloc(cred, gfp);
        if (!rc) {
                rc = call_int_hook(cred_alloc_blank, 0, cred, gfp);
                if (unlikely(rc))
                        security_cred_free(cred);
        }
        return rc;
}

/*
 * security_cred_free - tear down the security state of @cred.
 *
 * When @cred->security is set, the cred_free hook runs, the blob is
 * kfree()d and the pointer is cleared.  A NULL blob is a no-op.
 */
void security_cred_free(struct cred *cred)
{
        /*
         * There is a failure case in prepare_creds() that may result in
         * a call here with ->security being NULL; skip everything then.
         */
        if (!unlikely(cred->security == NULL)) {
                call_void_hook(cred_free, cred);

                kfree(cred->security);
                cred->security = NULL;
        }
}

/*
 * security_prepare_creds - set up the security state of @new from @old.
 *
 * lsm_cred_alloc() runs first on @new with @gfp; if it fails its error
 * is returned and no hook is called.  Otherwise the cred_prepare hook
 * runs, and a non-zero result from it triggers security_cred_free(@new)
 * before that result is returned.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int rc;

        rc = lsm_cred_alloc(new, gfp);
        if (!rc) {
                rc = call_int_hook(cred_prepare, 0, new, old, gfp);
                if (unlikely(rc))
                        security_cred_free(new);
        }
        return rc;
}

/*
 * security_transfer_creds - run the cred_transfer hook with @new and @old
 * through call_void_hook(); nothing is returned.
 */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        call_void_hook(cred_transfer, new, old);
}

/*
 * security_cred_getsecid - fetch the secid of the credentials @c.
 *
 * *@secid is zeroed before the cred_getsecid hook runs, so it reads 0
 * if no hook writes to it.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/*
 * security_kernel_act_as - run the kernel_act_as hook with @new and
 * @secid.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        return call_int_hook(kernel_act_as, 0, new, secid);
}

/*
 * security_kernel_create_files_as - run the kernel_create_files_as hook
 * with @new and @inode.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        return call_int_hook(kernel_create_files_as, 0, new, inode);
}

/*
 * security_kernel_module_request - run the kernel_module_request hook,
 * then the integrity subsystem.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * integrity_kernel_module_request(@kmod_name) is returned.
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc = call_int_hook(kernel_module_request, 0, kmod_name);

        if (!rc)
                rc = integrity_kernel_module_request(kmod_name);
        return rc;
}

/*
 * security_kernel_read_file - run the kernel_read_file hook, then IMA.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * ima_read_file(@file, @id, @contents) is returned.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc = call_int_hook(kernel_read_file, 0, file, id, contents);

        if (!rc)
                rc = ima_read_file(file, id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/*
 * security_kernel_post_read_file - run the kernel_post_read_file hook,
 * then IMA.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * ima_post_read_file(@file, @buf, @size, @id) is returned.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc = call_int_hook(kernel_post_read_file, 0, file, buf, size, id);

        if (!rc)
                rc = ima_post_read_file(file, buf, size, id);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/*
 * security_kernel_load_data - run the kernel_load_data hook, then IMA.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * ima_load_data(@id, @contents) is returned.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = call_int_hook(kernel_load_data, 0, id, contents);

        if (!rc)
                rc = ima_load_data(id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/*
 * security_kernel_post_load_data - run the kernel_post_load_data hook,
 * then IMA.
 *
 * A non-zero hook result is returned as is; otherwise the result of
 * ima_post_load_data(@buf, @size, @id, @description) is returned.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc = call_int_hook(kernel_post_load_data, 0, buf, size, id,
                               description);

        if (!rc)
                rc = ima_post_load_data(buf, size, id, description);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/*
 * security_task_fix_setuid - run the task_fix_setuid hook with @new, @old
 * and @flags.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        return call_int_hook(task_fix_setuid, 0, new, old, flags);
}

/*
 * security_task_fix_setgid - run the task_fix_setgid hook with @new, @old
 * and @flags.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                                 int flags)
{
        return call_int_hook(task_fix_setgid, 0, new, old, flags);
}

/*
 * security_task_setpgid - run the task_setpgid hook with @p and @pgid.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        return call_int_hook(task_setpgid, 0, p, pgid);
}

/*
 * security_task_getpgid - run the task_getpgid hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_getpgid(struct task_struct *p)
{
        return call_int_hook(task_getpgid, 0, p);
}

/*
 * security_task_getsid - run the task_getsid hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_getsid(struct task_struct *p)
{
        return call_int_hook(task_getsid, 0, p);
}

/*
 * security_task_getsecid - fetch the secid of the task @p.
 *
 * *@secid is zeroed before the task_getsecid hook runs, so it reads 0
 * if no hook writes to it.
 */
void security_task_getsecid(struct task_struct *p, u32 *secid)
{
        *secid = 0;
        call_void_hook(task_getsecid, p, secid);
}
EXPORT_SYMBOL(security_task_getsecid);

/*
 * security_task_setnice - run the task_setnice hook with @p and @nice.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        return call_int_hook(task_setnice, 0, p, nice);
}

/*
 * security_task_setioprio - run the task_setioprio hook with @p and
 * @ioprio.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        return call_int_hook(task_setioprio, 0, p, ioprio);
}

/*
 * security_task_getioprio - run the task_getioprio hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_getioprio(struct task_struct *p)
{
        return call_int_hook(task_getioprio, 0, p);
}

/*
 * security_task_prlimit - run the task_prlimit hook with @cred, @tcred
 * and @flags.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        return call_int_hook(task_prlimit, 0, cred, tcred, flags);
}

/*
 * security_task_setrlimit - run the task_setrlimit hook with @p,
 * @resource and @new_rlim.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        return call_int_hook(task_setrlimit, 0, p, resource, new_rlim);
}

/*
 * security_task_setscheduler - run the task_setscheduler hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_setscheduler(struct task_struct *p)
{
        return call_int_hook(task_setscheduler, 0, p);
}

/*
 * security_task_getscheduler - run the task_getscheduler hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_getscheduler(struct task_struct *p)
{
        return call_int_hook(task_getscheduler, 0, p);
}

/*
 * security_task_movememory - run the task_movememory hook on @p.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_movememory(struct task_struct *p)
{
        return call_int_hook(task_movememory, 0, p);
}

/*
 * security_task_kill - run the task_kill hook with @p, @info, @sig and
 * @cred.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                        int sig, const struct cred *cred)
{
        return call_int_hook(task_kill, 0, p, info, sig, cred);
}

/*
 * security_task_prctl - offer a prctl() request to the task_prctl hooks.
 *
 * Hooks returning LSM_RET_DEFAULT(task_prctl) are ignored.  Any other
 * result becomes the overall result; iteration stops at the first such
 * result that is also non-zero, whereas a 0 lets later hooks run (and
 * possibly override it).
 *
 * Returns the last recorded result, or LSM_RET_DEFAULT(task_prctl) if no
 * hook returned anything else.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                         unsigned long arg4, unsigned long arg5)
{
        struct security_hook_list *entry;
        int result = LSM_RET_DEFAULT(task_prctl);

        hlist_for_each_entry(entry, &security_hook_heads.task_prctl, list) {
                int hook_rc = entry->hook.task_prctl(option, arg2, arg3,
                                                     arg4, arg5);

                if (hook_rc == LSM_RET_DEFAULT(task_prctl))
                        continue;
                result = hook_rc;
                if (hook_rc != 0)
                        break;
        }
        return result;
}

/*
 * security_task_to_inode - run the task_to_inode hook with @p and @inode
 * through call_void_hook(); nothing is returned.
 */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        call_void_hook(task_to_inode, p, inode);
}

/*
 * security_ipc_permission - run the ipc_permission hook with @ipcp and
 * @flag.
 *
 * Returns the call_int_hook() result; 0 is passed as the macro's second
 * argument.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        return call_int_hook(ipc_permission, 0, ipcp, flag);
}

/*
 * security_ipc_getsecid - fetch the secid of the IPC object @ipcp.
 *
 * *@secid is zeroed before the ipc_getsecid hook runs, so it reads 0 if
 * no hook writes to it.
 */
void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        *secid = 0;
        call_void_hook(ipc_getsecid, ipcp, secid);
}

/*
 * security_msg_msg_alloc - set up the security state of a new @msg.
 *
 * lsm_msg_msg_alloc() runs first; if it fails its error is returned and
 * no hook is called.  Otherwise the msg_msg_alloc_security hook runs,
 * and a non-zero result from it triggers security_msg_msg_free() before
 * that result is returned.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int rc;

        rc = lsm_msg_msg_alloc(msg);
        if (!unlikely(rc)) {
                rc = call_int_hook(msg_msg_alloc_security, 0, msg);
                if (unlikely(rc))
                        security_msg_msg_free(msg);
        }
        return rc;
}

/*
 * security_msg_msg_free - tear down the security state of @msg.
 *
 * The msg_msg_free_security hook runs first, while @msg->security is
 * still valid; the blob is then kfree()d and the pointer cleared.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        call_void_hook(msg_msg_free_security, msg);
        kfree(msg->security);
        msg->security = NULL;
}

/*
 * security_msg_queue_alloc - set up the security state of the message
 * queue @msq.
 *
 * lsm_ipc_alloc() runs first; if it fails its error is returned and no
 * hook is called.  Otherwise the msg_queue_alloc_security hook runs, and
 * a non-zero result from it triggers security_msg_queue_free() before
 * that result is returned.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int rc;

        rc = lsm_ipc_alloc(msq);
        if (!unlikely(rc)) {
                rc = call_int_hook(msg_queue_alloc_security, 0, msq);
                if (unlikely(rc))
                        security_msg_queue_free(msq);
        }
        return rc;
}

/*
 * Tear down the security state attached to @msq: run the
 * msg_queue_free_security hook first, then kfree() @msq->security.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        call_void_hook(msg_queue_free_security, msq);
        kfree(msq->security);
        /* Clear the pointer so the freed memory is no longer referenced. */
        msq->security = NULL;
}

/*
 * Pass @msq and @msqflg to the msg_queue_associate hook through
 * call_int_hook() and return the hook result.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        int rc = call_int_hook(msg_queue_associate, 0, msq, msqflg);

        return rc;
}

/*
 * Pass @msq and @cmd to the msg_queue_msgctl hook through call_int_hook()
 * and return the hook result.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int rc = call_int_hook(msg_queue_msgctl, 0, msq, cmd);

        return rc;
}

/*
 * Pass @msq, @msg and @msqflg to the msg_queue_msgsnd hook through
 * call_int_hook() and return the hook result.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                               struct msg_msg *msg, int msqflg)
{
        int rc = call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg);

        return rc;
}

/*
 * Pass @msq, @msg, @target, @type and @mode to the msg_queue_msgrcv hook
 * through call_int_hook() and return the hook result.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                               struct task_struct *target, long type, int mode)
{
        int rc = call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type,
                               mode);

        return rc;
}

/*
 * Two-step setup for @shp: lsm_ipc_alloc() first, then the
 * shm_alloc_security hook.  A failure of the first step is returned as-is;
 * a failure of the hook is returned after security_shm_free() has been
 * called on @shp.  Returns 0 when both steps succeed.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int err;

        err = lsm_ipc_alloc(shp);
        if (unlikely(err))
                return err;

        err = call_int_hook(shm_alloc_security, 0, shp);
        if (unlikely(err)) {
                security_shm_free(shp);
                return err;
        }

        return 0;
}

/*
 * Tear down the security state attached to @shp: run the shm_free_security
 * hook first, then kfree() @shp->security.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        call_void_hook(shm_free_security, shp);
        kfree(shp->security);
        /* Clear the pointer so the freed memory is no longer referenced. */
        shp->security = NULL;
}

/*
 * Pass @shp and @shmflg to the shm_associate hook through call_int_hook()
 * and return the hook result.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        int rc = call_int_hook(shm_associate, 0, shp, shmflg);

        return rc;
}

/*
 * Pass @shp and @cmd to the shm_shmctl hook through call_int_hook() and
 * return the hook result.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int rc = call_int_hook(shm_shmctl, 0, shp, cmd);

        return rc;
}

/*
 * Pass @shp, the user pointer @shmaddr and @shmflg to the shm_shmat hook
 * through call_int_hook() and return the hook result.
 */
int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg)
{
        int rc = call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg);

        return rc;
}

/*
 * Two-step setup for @sma: lsm_ipc_alloc() first, then the
 * sem_alloc_security hook.  A failure of the first step is returned as-is;
 * a failure of the hook is returned after security_sem_free() has been
 * called on @sma.  Returns 0 when both steps succeed.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int err;

        err = lsm_ipc_alloc(sma);
        if (unlikely(err))
                return err;

        err = call_int_hook(sem_alloc_security, 0, sma);
        if (unlikely(err)) {
                security_sem_free(sma);
                return err;
        }

        return 0;
}

/*
 * Tear down the security state attached to @sma: run the sem_free_security
 * hook first, then kfree() @sma->security.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        call_void_hook(sem_free_security, sma);
        kfree(sma->security);
        /* Clear the pointer so the freed memory is no longer referenced. */
        sma->security = NULL;
}

/*
 * Pass @sma and @semflg to the sem_associate hook through call_int_hook()
 * and return the hook result.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        int rc = call_int_hook(sem_associate, 0, sma, semflg);

        return rc;
}

/*
 * Pass @sma and @cmd to the sem_semctl hook through call_int_hook() and
 * return the hook result.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        int rc = call_int_hook(sem_semctl, 0, sma, cmd);

        return rc;
}

/*
 * Pass @sma, @sops, @nsops and @alter to the sem_semop hook through
 * call_int_hook() and return the hook result.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                        unsigned nsops, int alter)
{
        int rc = call_int_hook(sem_semop, 0, sma, sops, nsops, alter);

        return rc;
}

/*
 * Run the d_instantiate hook for @dentry and @inode.  The hook is skipped
 * when @inode is non-NULL and IS_PRIVATE(@inode) is true; a NULL @inode is
 * still passed on to the hook.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        /* IS_PRIVATE() is only evaluated for a non-NULL inode. */
        const int private_inode = inode && IS_PRIVATE(inode);

        if (unlikely(private_inode))
                return;

        call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Call the getprocattr hook of the first entry on the hook list whose ->lsm
 * name equals @lsm, or simply of the first entry when @lsm is NULL, and
 * return that hook's result.  LSM_RET_DEFAULT(getprocattr) is returned when
 * no entry qualifies.
 */
int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
                                char **value)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.getprocattr, list) {
                if (!lsm || !strcmp(lsm, entry->lsm))
                        return entry->hook.getprocattr(p, name, value);
        }

        return LSM_RET_DEFAULT(getprocattr);
}

/*
 * Call the setprocattr hook of the first entry on the hook list whose ->lsm
 * name equals @lsm, or simply of the first entry when @lsm is NULL, and
 * return that hook's result.  LSM_RET_DEFAULT(setprocattr) is returned when
 * no entry qualifies.
 */
int security_setprocattr(const char *lsm, const char *name, void *value,
                         size_t size)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.setprocattr, list) {
                if (!lsm || !strcmp(lsm, entry->lsm))
                        return entry->hook.setprocattr(name, value, size);
        }

        return LSM_RET_DEFAULT(setprocattr);
}

/*
 * Pass @sk and @skb to the netlink_send hook through call_int_hook() and
 * return the hook result.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        int rc = call_int_hook(netlink_send, 0, sk, skb);

        return rc;
}

/*
 * Pass @name to the ismaclabel hook through call_int_hook() and return the
 * hook result.
 */
int security_ismaclabel(const char *name)
{
        int rc = call_int_hook(ismaclabel, 0, name);

        return rc;
}
EXPORT_SYMBOL(security_ismaclabel);

/*
 * Ask the secid_to_secctx hooks to translate @secid, with @secdata and
 * @seclen as outputs.  The first hook result that differs from
 * LSM_RET_DEFAULT(secid_to_secctx) is returned; otherwise that default is
 * returned.
 */
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(secid_to_secctx);

        /*
         * This hook is not "stackable": at present only a single LSM can
         * implement secid_to_secctx, so the first real answer wins.
         */
        hlist_for_each_entry(entry, &security_hook_heads.secid_to_secctx,
                             list) {
                rc = entry->hook.secid_to_secctx(secid, secdata, seclen);
                if (rc != LSM_RET_DEFAULT(secid_to_secctx))
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(security_secid_to_secctx);

/*
 * Pass @secdata, @seclen and the output location @secid to the
 * secctx_to_secid hook through call_int_hook() and return the hook result.
 * *@secid is cleared beforehand, so it reads 0 unless the hook stores
 * something else.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        *secid = 0;
        rc = call_int_hook(secctx_to_secid, 0, secdata, seclen, secid);

        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/*
 * Hand @secdata and @seclen to the release_secctx hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_release_secctx(char *secdata, u32 seclen)
{
        call_void_hook(release_secctx, secdata, seclen);
}
EXPORT_SYMBOL(security_release_secctx);

/*
 * Hand @inode to the inode_invalidate_secctx hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/*
 * Pass @inode, @ctx and @ctxlen to the inode_notifysecctx hook through
 * call_int_hook() and return the hook result.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc = call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/*
 * Pass @dentry, @ctx and @ctxlen to the inode_setsecctx hook through
 * call_int_hook() and return the hook result.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc = call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_setsecctx);

/*
 * Ask the inode_getsecctx hooks for the context of @inode, with @ctx and
 * @ctxlen as outputs.  The first hook result that differs from
 * LSM_RET_DEFAULT(inode_getsecctx) is returned; otherwise that default is
 * returned.
 */
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(inode_getsecctx);

        /* Only one module will provide a security context. */
        hlist_for_each_entry(entry, &security_hook_heads.inode_getsecctx,
                             list) {
                rc = entry->hook.inode_getsecctx(inode, ctx, ctxlen);
                if (rc != LSM_RET_DEFAULT(inode_getsecctx))
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/*
 * Pass @w_cred, @cred and @n to the post_notification hook through
 * call_int_hook() and return the hook result.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        int rc = call_int_hook(post_notification, 0, w_cred, cred, n);

        return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/*
 * Pass @key to the watch_key hook through call_int_hook() and return the
 * hook result.
 */
int security_watch_key(struct key *key)
{
        int rc = call_int_hook(watch_key, 0, key);

        return rc;
}
#endif

#ifdef CONFIG_SECURITY_NETWORK

/*
 * Pass @sock, @other and @newsk to the unix_stream_connect hook through
 * call_int_hook() and return the hook result.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
{
        int rc = call_int_hook(unix_stream_connect, 0, sock, other, newsk);

        return rc;
}
EXPORT_SYMBOL(security_unix_stream_connect);

/*
 * Pass @sock and @other to the unix_may_send hook through call_int_hook()
 * and return the hook result.
 */
int security_unix_may_send(struct socket *sock,  struct socket *other)
{
        int rc = call_int_hook(unix_may_send, 0, sock, other);

        return rc;
}
EXPORT_SYMBOL(security_unix_may_send);

/*
 * Pass @family, @type, @protocol and @kern to the socket_create hook
 * through call_int_hook() and return the hook result.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        int rc = call_int_hook(socket_create, 0, family, type, protocol, kern);

        return rc;
}

/*
 * Pass @sock, @family, @type, @protocol and @kern to the socket_post_create
 * hook through call_int_hook() and return the hook result.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        int rc = call_int_hook(socket_post_create, 0, sock, family, type,
                               protocol, kern);

        return rc;
}

/*
 * Pass @socka and @sockb to the socket_socketpair hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        int rc = call_int_hook(socket_socketpair, 0, socka, sockb);

        return rc;
}
EXPORT_SYMBOL(security_socket_socketpair);

/*
 * Pass @sock, @address and @addrlen to the socket_bind hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_bind, 0, sock, address, addrlen);

        return rc;
}

/*
 * Pass @sock, @address and @addrlen to the socket_connect hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_connect, 0, sock, address, addrlen);

        return rc;
}

/*
 * Pass @sock and @backlog to the socket_listen hook through call_int_hook()
 * and return the hook result.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        int rc = call_int_hook(socket_listen, 0, sock, backlog);

        return rc;
}

/*
 * Pass @sock and @newsock to the socket_accept hook through call_int_hook()
 * and return the hook result.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        int rc = call_int_hook(socket_accept, 0, sock, newsock);

        return rc;
}

/*
 * Pass @sock, @msg and @size to the socket_sendmsg hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        int rc = call_int_hook(socket_sendmsg, 0, sock, msg, size);

        return rc;
}

/*
 * Pass @sock, @msg, @size and @flags to the socket_recvmsg hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        int rc = call_int_hook(socket_recvmsg, 0, sock, msg, size, flags);

        return rc;
}

/*
 * Pass @sock to the socket_getsockname hook through call_int_hook() and
 * return the hook result.
 */
int security_socket_getsockname(struct socket *sock)
{
        int rc = call_int_hook(socket_getsockname, 0, sock);

        return rc;
}

/*
 * Pass @sock to the socket_getpeername hook through call_int_hook() and
 * return the hook result.
 */
int security_socket_getpeername(struct socket *sock)
{
        int rc = call_int_hook(socket_getpeername, 0, sock);

        return rc;
}

/*
 * Pass @sock, @level and @optname to the socket_getsockopt hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_getsockopt, 0, sock, level, optname);

        return rc;
}

/*
 * Pass @sock, @level and @optname to the socket_setsockopt hook through
 * call_int_hook() and return the hook result.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_setsockopt, 0, sock, level, optname);

        return rc;
}

/*
 * Pass @sock and @how to the socket_shutdown hook through call_int_hook()
 * and return the hook result.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        int rc = call_int_hook(socket_shutdown, 0, sock, how);

        return rc;
}

/*
 * Pass @sk and @skb to the socket_sock_rcv_skb hook through call_int_hook()
 * and return the hook result.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = call_int_hook(socket_sock_rcv_skb, 0, sk, skb);

        return rc;
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/*
 * Ask the socket_getpeersec_stream hooks about @sock, passing @optval,
 * @optlen and @len through.  The first hook result that differs from
 * LSM_RET_DEFAULT(socket_getpeersec_stream) is returned; otherwise that
 * default is returned.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(socket_getpeersec_stream);

        /* Only one module will provide a security context. */
        hlist_for_each_entry(entry,
                             &security_hook_heads.socket_getpeersec_stream,
                             list) {
                rc = entry->hook.socket_getpeersec_stream(sock, optval,
                                                          optlen, len);
                if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream))
                        break;
        }

        return rc;
}

/*
 * Ask the socket_getpeersec_dgram hooks about @sock and @skb, with @secid as
 * output.  The first hook result that differs from
 * LSM_RET_DEFAULT(socket_getpeersec_dgram) is returned; otherwise that
 * default is returned.
 */
int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(socket_getpeersec_dgram);

        /* Only one module will provide a security context. */
        hlist_for_each_entry(entry,
                             &security_hook_heads.socket_getpeersec_dgram,
                             list) {
                rc = entry->hook.socket_getpeersec_dgram(sock, skb, secid);
                if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram))
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/*
 * Pass @sk, @family and @priority to the sk_alloc_security hook through
 * call_int_hook() and return the hook result.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        int rc = call_int_hook(sk_alloc_security, 0, sk, family, priority);

        return rc;
}

/*
 * Hand @sk to the sk_free_security hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_sk_free(struct sock *sk)
{
        call_void_hook(sk_free_security, sk);
}

/*
 * Hand @sk and @newsk to the sk_clone_security hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/*
 * Run the sk_getsecid hook for @sk, giving it the address of
 * @flic->flowic_secid as the place to store its output.
 */
void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/*
 * Hand @req and @flic to the req_classify_flow hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/*
 * Hand @sk and @parent to the sock_graft hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/*
 * Pass @sk, @skb and @req to the inet_conn_request hook through
 * call_int_hook() and return the hook result.
 */
int security_inet_conn_request(struct sock *sk,
                        struct sk_buff *skb, struct request_sock *req)
{
        int rc = call_int_hook(inet_conn_request, 0, sk, skb, req);

        return rc;
}
EXPORT_SYMBOL(security_inet_conn_request);

/*
 * Hand @newsk and @req to the inet_csk_clone hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_inet_csk_clone(struct sock *newsk,
                        const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/*
 * Hand @sk and @skb to the inet_conn_established hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_inet_conn_established(struct sock *sk,
                        struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/*
 * Pass @secid to the secmark_relabel_packet hook through call_int_hook()
 * and return the hook result.
 */
int security_secmark_relabel_packet(u32 secid)
{
        int rc = call_int_hook(secmark_relabel_packet, 0, secid);

        return rc;
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/*
 * Run the secmark_refcount_inc hook through call_void_hook().  The hook
 * takes no arguments and nothing is reported back to the caller.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/*
 * Run the secmark_refcount_dec hook through call_void_hook().  The hook
 * takes no arguments and nothing is reported back to the caller.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/*
 * Pass the output location @security to the tun_dev_alloc_security hook
 * through call_int_hook() and return the hook result.
 */
int security_tun_dev_alloc_security(void **security)
{
        int rc = call_int_hook(tun_dev_alloc_security, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/*
 * Hand @security to the tun_dev_free_security hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_tun_dev_free_security(void *security)
{
        call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/*
 * Run the argument-less tun_dev_create hook through call_int_hook() and
 * return the hook result.
 */
int security_tun_dev_create(void)
{
        int rc = call_int_hook(tun_dev_create, 0);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_create);

/*
 * Pass @security to the tun_dev_attach_queue hook through call_int_hook()
 * and return the hook result.
 */
int security_tun_dev_attach_queue(void *security)
{
        int rc = call_int_hook(tun_dev_attach_queue, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/*
 * Pass @sk and @security to the tun_dev_attach hook through call_int_hook()
 * and return the hook result.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        int rc = call_int_hook(tun_dev_attach, 0, sk, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach);

/*
 * Pass @security to the tun_dev_open hook through call_int_hook() and
 * return the hook result.
 */
int security_tun_dev_open(void *security)
{
        int rc = call_int_hook(tun_dev_open, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_open);

/*
 * Pass @ep and @skb to the sctp_assoc_request hook through call_int_hook()
 * and return the hook result.
 */
int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb)
{
        int rc = call_int_hook(sctp_assoc_request, 0, ep, skb);

        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/*
 * Pass @sk, @optname, @address and @addrlen to the sctp_bind_connect hook
 * through call_int_hook() and return the hook result.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(sctp_bind_connect, 0, sk, optname, address,
                               addrlen);

        return rc;
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/*
 * Hand @ep, @sk and @newsk to the sctp_sk_clone hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, ep, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND

/*
 * Pass @sec, @subnet_prefix and @pkey to the ib_pkey_access hook through
 * call_int_hook() and return the hook result.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc = call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey);

        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/*
 * Pass @sec, @dev_name and @port_num to the ib_endport_manage_subnet hook
 * through call_int_hook() and return the hook result.
 */
int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num)
{
        int rc = call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name,
                               port_num);

        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/*
 * Pass the output location @sec to the ib_alloc_security hook through
 * call_int_hook() and return the hook result.
 */
int security_ib_alloc_security(void **sec)
{
        int rc = call_int_hook(ib_alloc_security, 0, sec);

        return rc;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/*
 * Hand @sec to the ib_free_security hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_ib_free_security(void *sec)
{
        call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM

/*
 * Pass @ctxp, @sec_ctx and @gfp to the xfrm_policy_alloc_security hook
 * through call_int_hook() and return the hook result.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc = call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx,
                               gfp);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/*
 * Pass @old_ctx and the output location @new_ctxp to the
 * xfrm_policy_clone_security hook through call_int_hook() and return the
 * hook result.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                              struct xfrm_sec_ctx **new_ctxp)
{
        int rc = call_int_hook(xfrm_policy_clone_security, 0, old_ctx,
                               new_ctxp);

        return rc;
}

/*
 * Hand @ctx to the xfrm_policy_free_security hook through
 * call_void_hook().  Nothing is reported back to the caller.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/*
 * Pass @ctx to the xfrm_policy_delete_security hook through call_int_hook()
 * and return the hook result.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc = call_int_hook(xfrm_policy_delete_security, 0, ctx);

        return rc;
}

/*
 * Pass @x and @sec_ctx to the xfrm_state_alloc hook through call_int_hook()
 * and return the hook result.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc = call_int_hook(xfrm_state_alloc, 0, x, sec_ctx);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/*
 * Pass @x, @polsec and @secid to the xfrm_state_alloc_acquire hook through
 * call_int_hook() and return the hook result.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc = call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid);

        return rc;
}

/*
 * Pass @x to the xfrm_state_delete_security hook through call_int_hook()
 * and return the hook result.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc = call_int_hook(xfrm_state_delete_security, 0, x);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/*
 * Hand @x to the xfrm_state_free_security hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/*
 * Pass @ctx, @fl_secid and @dir to the xfrm_policy_lookup hook through
 * call_int_hook() and return the hook result.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
{
        int rc = call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir);

        return rc;
}

/*
 * Return the answer of the first xfrm_state_pol_flow_match hook on the list
 * for @x, @xp and @flic, or LSM_RET_DEFAULT(xfrm_state_pol_flow_match) when
 * the list is empty.  Later hooks are never consulted.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct security_hook_list *entry;

        /*
         * The expected result is 0 or 1, which would be hard to combine if
         * several LSMs supplied this call.  Only SELinux does at present, so
         * the first LSM's judgment is used.
         *
         * As a speed optimization the list walk is left explicitly after
         * the first entry instead of going through the hook macro.
         */
        hlist_for_each_entry(entry,
                             &security_hook_heads.xfrm_state_pol_flow_match,
                             list) {
                return entry->hook.xfrm_state_pol_flow_match(x, xp, flic);
        }

        return LSM_RET_DEFAULT(xfrm_state_pol_flow_match);
}

/*
 * Run the xfrm_decode_session hook for @skb with @secid as output and 1 as
 * the hook's final argument, and return the hook result.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        int rc = call_int_hook(xfrm_decode_session, 0, skb, secid, 1);

        return rc;
}

void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid,
                                0);

        BUG_ON(rc);
}
EXPORT_SYMBOL(security_skb_classify_flow);

#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS

/*
 * Pass @key, @cred and @flags to the key_alloc hook through call_int_hook()
 * and return the hook result.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int rc = call_int_hook(key_alloc, 0, key, cred, flags);

        return rc;
}

/*
 * Hand @key to the key_free hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_key_free(struct key *key)
{
        call_void_hook(key_free, key);
}

/*
 * Pass @key_ref, @cred and @need_perm to the key_permission hook through
 * call_int_hook() and return the hook result.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc = call_int_hook(key_permission, 0, key_ref, cred, need_perm);

        return rc;
}

/*
 * Pass @key and the output location @_buffer to the key_getsecurity hook
 * through call_int_hook() and return the hook result.  *@_buffer is set to
 * NULL beforehand, so it stays NULL unless the hook stores something else.
 */
int security_key_getsecurity(struct key *key, char **_buffer)
{
        int rc;

        *_buffer = NULL;
        rc = call_int_hook(key_getsecurity, 0, key, _buffer);

        return rc;
}

#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT

/*
 * Pass @field, @op, @rulestr, the output location @lsmrule and @gfp to the
 * audit_rule_init hook through call_int_hook() and return the hook result.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc = call_int_hook(audit_rule_init, 0, field, op, rulestr,
                               lsmrule, gfp);

        return rc;
}

/*
 * Pass @krule to the audit_rule_known hook through call_int_hook() and
 * return the hook result.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int rc = call_int_hook(audit_rule_known, 0, krule);

        return rc;
}

/*
 * Hand @lsmrule to the audit_rule_free hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/*
 * Pass @secid, @field, @op and @lsmrule to the audit_rule_match hook
 * through call_int_hook() and return the hook result.
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
        int rc = call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);

        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/*
 * Pass @cmd, @attr and @size to the bpf hook through call_int_hook() and
 * return the hook result.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        int rc = call_int_hook(bpf, 0, cmd, attr, size);

        return rc;
}
/*
 * Pass @map and @fmode to the bpf_map hook through call_int_hook() and
 * return the hook result.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        int rc = call_int_hook(bpf_map, 0, map, fmode);

        return rc;
}
/*
 * Pass @prog to the bpf_prog hook through call_int_hook() and return the
 * hook result.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        int rc = call_int_hook(bpf_prog, 0, prog);

        return rc;
}
/*
 * Pass @map to the bpf_map_alloc_security hook through call_int_hook() and
 * return the hook result.
 */
int security_bpf_map_alloc(struct bpf_map *map)
{
        int rc = call_int_hook(bpf_map_alloc_security, 0, map);

        return rc;
}
/*
 * Pass @aux to the bpf_prog_alloc_security hook through call_int_hook() and
 * return the hook result.
 */
int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
        int rc = call_int_hook(bpf_prog_alloc_security, 0, aux);

        return rc;
}
/*
 * Hand @map to the bpf_map_free_security hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free_security, map);
}
/*
 * Hand @aux to the bpf_prog_free_security hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_bpf_prog_free(struct bpf_prog_aux *aux)
{
        call_void_hook(bpf_prog_free_security, aux);
}
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Pass @what to the locked_down hook through call_int_hook() and return
 * the hook result.
 */
int security_locked_down(enum lockdown_reason what)
{
        int rc = call_int_hook(locked_down, 0, what);

        return rc;
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/*
 * Pass @attr and @type to the perf_event_open hook through call_int_hook()
 * and return the hook result.
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
        int rc = call_int_hook(perf_event_open, 0, attr, type);

        return rc;
}

/*
 * Pass @event to the perf_event_alloc hook through call_int_hook() and
 * return the hook result.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_alloc, 0, event);

        return rc;
}

/*
 * Hand @event to the perf_event_free hook through call_void_hook().
 * Nothing is reported back to the caller.
 */
void security_perf_event_free(struct perf_event *event)
{
        call_void_hook(perf_event_free, event);
}

/*
 * Pass @event to the perf_event_read hook through call_int_hook() and
 * return the hook result.
 */
int security_perf_event_read(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_read, 0, event);

        return rc;
}

/*
 * Pass @event to the perf_event_write hook through call_int_hook() and
 * return the hook result.
 */
int security_perf_event_write(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_write, 0, event);

        return rc;
}
#endif /* CONFIG_PERF_EVENTS */
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>

/*
 * Build-time sanity check: each defined() below evaluates to 1 when that
 * level is folded, so 5 minus the number of folded levels has to equal
 * CONFIG_PGTABLE_LEVELS.  Any mismatch stops the build with #error.
 */
#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 *
 * The value below is only a fallback: it applies when USER_PGTABLES_CEILING
 * has not already been defined before this point.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address.
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/*
 * Index of the PTE-table entry selected by @address: the address is shifted
 * right by PAGE_SHIFT and masked with PTRS_PER_PTE - 1.
 */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long slot = address >> PAGE_SHIFT;

        return slot & (PTRS_PER_PTE - 1);
}
#define pte_index pte_index

#ifndef pmd_index
/*
 * Index of the PMD-table entry selected by @address: the address is shifted
 * right by PMD_SHIFT and masked with PTRS_PER_PMD - 1.  Only provided when
 * pmd_index has not been defined already.
 */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long slot = address >> PMD_SHIFT;

        return slot & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/*
 * Index of the PUD-table entry selected by @address: the address is shifted
 * right by PUD_SHIFT and masked with PTRS_PER_PUD - 1.  Only provided when
 * pud_index has not been defined already.
 */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long slot = address >> PUD_SHIFT;

        return slot & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/*
 * Index of the PGD entry selected by address @a: @a shifted right by
 * PGDIR_SHIFT and masked with PTRS_PER_PGD - 1.
 *
 * Must be a compile-time constant, so implement it as a macro.
 */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef pte_offset_kernel
/*
 * Return a pointer to the PTE selected by @address inside the table that
 * *@pmd refers to: pmd_page_vaddr(*@pmd) is taken as the start of the table
 * and pte_index(@address) picks the entry.
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *table = (pte_t *)pmd_page_vaddr(*pmd);

        return table + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif

/*
 * pte_offset_map() yields a pointer to the PTE for @address under the PMD
 * entry @dir; pte_unmap() is its counterpart.
 *
 * With CONFIG_HIGHPTE the page behind *@dir is mapped with kmap_atomic()
 * and pte_unmap() undoes that with kunmap_atomic().  Without it,
 * pte_offset_map() is just pte_offset_kernel() and pte_unmap() only
 * evaluates its argument.
 */
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address)                                \
        ((pte_t *)kmap_atomic(pmd_page(*(dir))) +                \
         pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte))
#else
#define pte_offset_map(dir, address)        pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))        /* NOP */
#endif

/*
 * Find an entry in the second-level page table: take the PMD table that
 * *@pud refers to (pud_pgtable()) and return a pointer to the entry chosen
 * by pmd_index(@address).
 */
#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        pmd_t *table = pud_pgtable(*pud);

        return &table[pmd_index(address)];
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * pud_offset - PUD slot for @address in the table referenced by *@p4d.
 *
 * Generic version, used unless the architecture already defined pud_offset.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        pud_t *table = p4d_pgtable(*p4d);

        return &table[pud_index(address)];
}
#define pud_offset pud_offset
#endif

/*
 * pgd_offset_pgd - PGD entry for @address in the table starting at @pgd.
 *
 * Returns @pgd advanced by pgd_index(address) entries.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return (pgd + pgd_index(address));
}

/*
 * A shortcut to get the pgd_t entry for @address in a given mm.
 * Expands to pgd_offset_pgd() on the mm's ->pgd table.
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * A shortcut which implies the use of the kernel's pgd, instead
 * of a process's: pgd_offset() applied to init_mm.
 */
#ifndef pgd_offset_k
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))
#endif

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * virt_to_kpte - pointer to the kernel PTE for @vaddr.
 *
 * Returns NULL when the PMD entry found by pmd_off_k() is none, otherwise
 * the result of pte_offset_kernel() on that entry.
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *pmd = pmd_off_k(vaddr);

        if (pmd_none(*pmd))
                return NULL;

        return pte_offset_kernel(pmd, vaddr);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only declared here; used when the architecture does not define
 * __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS.  The definition lives elsewhere.
 */
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD- and PUD-level variants taking the same kind of arguments as
 * ptep_set_access_flags().  With CONFIG_TRANSPARENT_HUGEPAGE they are only
 * declared here.
 */
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE these are placeholders whose body is
 * BUILD_BUG() (presumably so that a call surviving into the build is
 * reported at build time); the return value only satisfies the prototype.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * ptep_test_and_clear_young - report and clear the young state of *@ptep.
 *
 * Returns 0 and leaves the entry alone when it is not young.  Otherwise
 * writes back pte_mkold() of the entry with set_pte_at() and returns 1.
 * The entry is read and rewritten in separate steps.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t entry = *ptep;

        if (!pte_young(entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(entry));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmdp_test_and_clear_young - report and clear the young state of *@pmdp.
 *
 * Returns 0 when the entry is not young; otherwise stores pmd_mkold() of
 * the entry with set_pmd_at() and returns 1.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t entry = *pmdp;

        if (!pmd_young(entry))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(entry));
        return 1;
}
#else
/* Placeholder for !CONFIG_TRANSPARENT_HUGEPAGE: the body is BUILD_BUG(). */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Only declared here; used when the architecture does not define
 * __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* PMD-level variant; with THP enabled it is only declared here. */
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Despite being relevant to THP only, this API is called from generic rmap
 * code under PageTransHuge(), hence it needs a dummy implementation for
 * !THP.  The dummy body is BUILD_BUG().
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * ptep_get_and_clear - clear *@ptep and hand back what it held.
 *
 * The entry is read first and then cleared with pte_clear(); the two
 * steps are separate operations in this generic version.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t prev = *ptep;

        pte_clear(mm, address, ptep);
        return prev;
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET
/*
 * ptep_get - read the entry at @ptep.
 *
 * Generic version: a single READ_ONCE() of *@ptep.
 */
static inline pte_t ptep_get(pte_t *ptep)
{
        return READ_ONCE(*ptep);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * pmdp_huge_get_and_clear - clear *@pmdp and return its previous value.
 *
 * @mm and @address are not used by this generic version.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t prev = *pmdp;

        pmd_clear(pmdp);
        return prev;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * pudp_huge_get_and_clear - clear *@pudp and return its previous value.
 *
 * @mm and @address are not used by this generic version.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        pud_t prev = *pudp;

        pud_clear(pudp);
        return prev;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the call is forwarded to
 * pmdp_huge_get_and_clear() on the vma's mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the call is forwarded to
 * pudp_huge_get_and_clear().  Note that this one takes the mm directly,
 * unlike the PMD variant above which takes a vma.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        return pudp_huge_get_and_clear(mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * ptep_get_and_clear_full - generic version of the "full" variant.
 *
 * @full is ignored here; the work is done by ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        return ptep_get_and_clear(mm, address, ptep);
}
#endif


/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/Cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function updates the TLB only and does nothing
 * with the cache or anything else; that is how it differs from
 * update_mmu_cache().
 */
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
/*
 * Generic fallback: an empty function.  __HAVE_ARCH_UPDATE_MMU_TLB is
 * defined afterwards so later code sees the hook as provided.
 */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep)
{
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
#endif

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 *
 * The generic version below makes no such distinction: @full is ignored
 * and the entry is cleared with pte_clear().
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                                              unsigned long address,
                                              pte_t *ptep,
                                              int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Only declared here; used when the architecture does not define
 * __HAVE_ARCH_PTEP_CLEAR_FLUSH.  Returns a pte_t.
 */
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * PMD- and PUD-level clear-and-flush helpers, only declared here.  Both
 * declarations are controlled by the single PMDP guard above.
 */
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * ptep_set_wrprotect - write-protect the entry at @ptep.
 *
 * Reads the current entry once and stores pte_wrprotect() of it back with
 * set_pte_at().
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t current_pte = *ptep;

        set_pte_at(mm, address, ptep, pte_wrprotect(current_pte));
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when
 * a memory page is accessed; it is the responsibility of software to set
 * this bit. Tracking the access bit this way costs an extra page fault. As
 * an optimization, the access bit can be set during the page fault flow on
 * these architectures.
 * To differentiate it from pte_mkyoung(), this helper is used on platforms
 * where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Generic fallback: returns @pte unchanged. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        return pte;
}
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

/*
 * "savedwrite" helpers.  Unless the architecture defines its own, each one
 * is an alias for the corresponding plain write helper:
 * test -> p?d_write, set -> p?d_mkwrite, clear -> p?d_wrprotect.
 */
#ifndef pte_savedwrite
#define pte_savedwrite pte_write
#endif

#ifndef pte_mk_savedwrite
#define pte_mk_savedwrite pte_mkwrite
#endif

#ifndef pte_clear_savedwrite
#define pte_clear_savedwrite pte_wrprotect
#endif

#ifndef pmd_savedwrite
#define pmd_savedwrite pmd_write
#endif

#ifndef pmd_mk_savedwrite
#define pmd_mk_savedwrite pmd_mkwrite
#endif

#ifndef pmd_clear_savedwrite
#define pmd_clear_savedwrite pmd_wrprotect
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmdp_set_wrprotect - write-protect the entry at @pmdp.
 *
 * Reads the current entry once and stores pmd_wrprotect() of it back with
 * set_pmd_at().
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pmd_t current_pmd = *pmdp;

        set_pmd_at(mm, address, pmdp, pmd_wrprotect(current_pmd));
}
#else
/* Placeholder for !CONFIG_TRANSPARENT_HUGEPAGE: the body is BUILD_BUG(). */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * pudp_set_wrprotect - write-protect the entry at @pudp.
 *
 * Reads the current entry once and stores pud_wrprotect() of it back with
 * set_pud_at().
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        pud_t current_pud = *pudp;

        set_pud_at(mm, address, pudp, pud_wrprotect(current_pud));
}
#else
/*
 * Placeholder when CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is not set:
 * the body is BUILD_BUG().
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* With THP enabled the generic pmdp_collapse_flush() is only declared here. */
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * Placeholder for !CONFIG_TRANSPARENT_HUGEPAGE: the body is BUILD_BUG() and
 * the return of *pmdp only satisfies the prototype.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

/*
 * Deposit/withdraw pair for a pgtable_t associated with @pmdp in @mm.
 * Both are only declared here, each under its own __HAVE_ARCH_* guard.
 */
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * generic_pmdp_establish - install @pmd at @pmdp and return the old entry.
 *
 * This implementation of pmdp_establish() is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits.  In that
 * case there is no race with a CPU setting those bits, so the non-atomic
 * read-then-write below is fine.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t prev = *pmdp;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
        return prev;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Only declared here; used when the architecture does not define
 * __HAVE_ARCH_PMDP_INVALIDATE.  Returns a pmd_t.
 */
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/*
 * pte_same - non-zero when @pte_a and @pte_b have equal pte_val().
 */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        return pte_val(pte_a) == pte_val(pte_b);
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 *
 * The generic version always returns 0.
 */
static inline int pte_unused(pte_t pte)
{
        return 0;
}
#endif

/*
 * p?d_access_permitted(entry, write) - generic permission checks.
 *
 * Each one is true when the entry is present and, if @write is non-zero,
 * the entry is also writable according to p?d_write().  The entry argument
 * is expanded twice.
 */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

/*
 * p?d_same() - non-zero when the two entries have equal p?d_val().
 *
 * Note that pud_same() is provided under the __HAVE_ARCH_PMD_SAME guard
 * together with pmd_same(); there is no separate PUD guard here.
 */
#ifndef __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        return pmd_val(pmd_a) == pmd_val(pmd_b);
}

static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        return pud_val(pud_a) == pud_val(pud_b);
}
#endif

#ifndef __HAVE_ARCH_P4D_SAME
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        return p4d_val(p4d_a) == p4d_val(p4d_b);
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        return pgd_val(pgd_a) == pgd_val(pgd_b);
}
#endif

/*
 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
 * TLB flush will be required as a result of the "set". For example, use
 * in scenarios where it is known ahead of time that the routine is
 * setting non-present entries, or re-setting an existing entry to the
 * same value. Otherwise, use the typical "set" helpers and flush the
 * TLB.
 *
 * Each helper warns once (WARN_ON_ONCE) when the entry being replaced is
 * present and differs from the new value, then performs the plain set_p*().
 * The arguments are parenthesized at every use, but they are still
 * expanded more than once, so pass expressions without side effects.
 */
#define set_pte_safe(ptep, pte) \
({ \
        WARN_ON_ONCE(pte_present(*(ptep)) && !pte_same(*(ptep), (pte))); \
        set_pte((ptep), (pte)); \
})

#define set_pmd_safe(pmdp, pmd) \
({ \
        WARN_ON_ONCE(pmd_present(*(pmdp)) && !pmd_same(*(pmdp), (pmd))); \
        set_pmd((pmdp), (pmd)); \
})

#define set_pud_safe(pudp, pud) \
({ \
        WARN_ON_ONCE(pud_present(*(pudp)) && !pud_same(*(pudp), (pud))); \
        set_pud((pudp), (pud)); \
})

#define set_p4d_safe(p4dp, p4d) \
({ \
        WARN_ON_ONCE(p4d_present(*(p4dp)) && !p4d_same(*(p4dp), (p4d))); \
        set_p4d((p4dp), (p4d)); \
})

#define set_pgd_safe(pgdp, pgd) \
({ \
        WARN_ON_ONCE(pgd_present(*(pgdp)) && !pgd_same(*(pgdp), (pgd))); \
        set_pgd((pgdp), (pgd)); \
})

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_do_swap_page() can restore this
 * metadata when a page is swapped back in.
 *
 * The generic version is an empty function.
 */
static inline void arch_do_swap_page(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long addr,
                                     pte_t pte, pte_t oldpte)
{

}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 *
 * The generic version does nothing and returns 0.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long addr,
                                  pte_t orig_pte)
{
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 *
 * The generic versions below do nothing; arch_prepare_to_swap() returns 0.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
static inline int arch_prepare_to_swap(struct page *page)
{
        return 0;
}
#endif

/* Both invalidate hooks share the single __HAVE_ARCH_SWAP_INVALIDATE guard. */
#ifndef __HAVE_ARCH_SWAP_INVALIDATE
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
{
}
#endif

/* Generic fallback: pgd_offset_gate() is plain pgd_offset(). */
#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
#define pgd_offset_gate(mm, addr)        pgd_offset(mm, addr)
#endif

/* Generic fallback: move_pte() yields @pte and ignores the other arguments. */
#ifndef __HAVE_ARCH_MOVE_PTE
#define move_pte(pte, prot, old_addr, new_addr)        (pte)
#endif

/* Generic fallback: evaluates @pte for its side effects only and yields 1. */
#ifndef pte_accessible
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

/* Generic fallback: forwards to flush_tlb_page(). */
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 */

/*
 * p?d_addr_end(addr, end) - next boundary of the given level above @addr,
 * or @end if that comes first.
 *
 * __boundary is @addr rounded up to the next multiple of the level size.
 * Subtracting 1 from both sides of the comparison makes a __boundary that
 * wrapped to 0 compare as the largest unsigned long, so @end is chosen in
 * that case.  @end is expanded twice.
 */
#define pgd_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})

#ifndef p4d_addr_end
#define p4d_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

#ifndef pud_addr_end
#define pud_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

#ifndef pmd_addr_end
#define pmd_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
/* Out-of-line handler for a bad PGD entry. */
void pgd_clear_bad(pgd_t *);

/*
 * Out-of-line handler for a bad P4D entry; an empty statement when the
 * P4D level is folded (__PAGETABLE_P4D_FOLDED).
 */
#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

/*
 * Out-of-line handler for a bad PUD entry; an empty statement when the
 * PUD level is folded (__PAGETABLE_PUD_FOLDED).
 */
#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
#define pud_clear_bad(pud)        do { } while (0)
#endif

/* Out-of-line handler for a bad PMD entry. */
void pmd_clear_bad(pmd_t *);

/*
 * pgd_none_or_clear_bad - should a page-table walk skip this PGD entry?
 *
 * Returns 1 when the entry is none, or when it is bad (after handing it to
 * pgd_clear_bad()).  Returns 0 for an entry that is neither.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        if (!pgd_none(*pgd)) {
                if (unlikely(pgd_bad(*pgd)))
                        pgd_clear_bad(pgd);
                else
                        return 0;
        }
        return 1;
}

/*
 * p4d_none_or_clear_bad - should a page-table walk skip this P4D entry?
 *
 * Returns 1 when the entry is none, or when it is bad (after handing it to
 * p4d_clear_bad()).  Returns 0 for an entry that is neither.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        if (!p4d_none(*p4d)) {
                if (unlikely(p4d_bad(*p4d)))
                        p4d_clear_bad(p4d);
                else
                        return 0;
        }
        return 1;
}

/*
 * pud_none_or_clear_bad - should a page-table walk skip this PUD entry?
 *
 * Returns 1 when the entry is none, or when it is bad (after handing it to
 * pud_clear_bad()).  Returns 0 for an entry that is neither.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
        if (!pud_none(*pud)) {
                if (unlikely(pud_bad(*pud)))
                        pud_clear_bad(pud);
                else
                        return 0;
        }
        return 1;
}

/*
 * pmd_none_or_clear_bad - should a page-table walk skip this PMD entry?
 *
 * Returns 1 when the entry is none, or when it is bad (after handing it to
 * pmd_clear_bad()).  Returns 0 for an entry that is neither.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        if (!pmd_none(*pmd)) {
                if (unlikely(pmd_bad(*pmd)))
                        pmd_clear_bad(pmd);
                else
                        return 0;
        }
        return 1;
}

/*
 * __ptep_modify_prot_start - first half of a pte protection update.
 *
 * Returns the previous entry as obtained from ptep_get_and_clear().
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep)
{
        /*
         * Get the current pte state, but zero it out to make it
         * non-present, preventing the hardware from asynchronously
         * updating it.
         */
        return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

/*
 * __ptep_modify_prot_commit - second half of a pte protection update.
 *
 * Installs @pte at @ptep with set_pte_at().
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep, pte_t pte)
{
        /*
         * The pte is non-present, so there's no hardware state to
         * preserve.
         */
        set_pte_at(vma->vm_mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 *
 * The generic version forwards to __ptep_modify_prot_start().
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep)
{
        return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 *
 * The generic version forwards to __ptep_modify_prot_commit() and does
 * not use @old_pte.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 *
 * pgprot_writecombine, pgprot_writethrough and pgprot_device default to
 * aliases of pgprot_noncached, so they pick up whichever definition of
 * pgprot_noncached is in effect.
 */

#ifndef pgprot_nx
#define pgprot_nx(prot)        (prot)
#endif

#ifndef pgprot_noncached
#define pgprot_noncached(prot)        (prot)
#endif

#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

#ifndef pgprot_mhp
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * pgprot_modify - carry memory-type attributes of @oldprot over to @newprot.
 *
 * For each of noncached, writecombine and device: if applying that
 * conversion to @oldprot leaves it unchanged, the same conversion is
 * applied to @newprot.  The three checks run in sequence on the running
 * @newprot value, so more than one conversion may be applied.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
                newprot = pgprot_noncached(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
                newprot = pgprot_writecombine(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
                newprot = pgprot_device(newprot);
        return newprot;
}
#endif
#endif /* CONFIG_MMU */

/* Generic fallbacks: both yield @prot unchanged. */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)        (prot)
#endif

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.  This mode can only be entered and left under the protection of
 * the page table locks for all page tables which may be modified.  In the UP
 * case, this is required so that preemption is disabled, and in the SMP case,
 * it must synchronize the delayed page table writes properly on other CPUs.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
/*
 * Generic fallbacks: all three lazy-MMU hooks are empty statements.  They
 * share the single __HAVE_ARCH_ENTER_LAZY_MMU_MODE guard.
 */
#define arch_enter_lazy_mmu_mode()        do {} while (0)
#define arch_leave_lazy_mmu_mode()        do {} while (0)
#define arch_flush_lazy_mmu_mode()        do {} while (0)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Generic fallback: an empty statement; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif

/*
 * Soft-dirty fallbacks.
 *
 * With CONFIG_HAVE_ARCH_SOFT_DIRTY only the pmd_swp_*() helpers get a
 * fallback here, and only when CONFIG_ARCH_ENABLE_THP_MIGRATION is not set.
 * Without CONFIG_HAVE_ARCH_SOFT_DIRTY every helper is a stub: the
 * "mk"/"clear" variants return their argument unchanged and the test
 * variants return 0.
 */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline int pte_soft_dirty(pte_t pte)
{
        return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
        return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 *
 * Without __HAVE_PFNMAP_TRACKING all of them are stubs: the int-returning
 * ones return 0 and the void ones do nothing.
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                                  unsigned long pfn, unsigned long addr,
                                  unsigned long size)
{
        return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                                    pfn_t pfn)
{
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 */
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
        return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
                               unsigned long pfn, unsigned long size)
{
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
 */
static inline void untrack_pfn_moved(struct vm_area_struct *vma)
{
}
#else
/* With __HAVE_PFNMAP_TRACKING the same interfaces are only declared here. */
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                             pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size);
extern void untrack_pfn_moved(struct vm_area_struct *vma);
#endif

/*
 * is_zero_pfn - does @pfn refer to a zero page?
 * my_zero_pfn - pfn of the zero page to use for @addr.
 *
 * With __HAVE_COLOR_ZERO_PAGE a range of pfns starting at zero_pfn counts
 * as zero pages and my_zero_pfn() depends on the address via ZERO_PAGE().
 * Otherwise there is a single zero pfn and @addr is ignored.
 */
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        /*
         * Unsigned subtraction: a pfn below zero_pfn wraps to a huge value,
         * so the single comparison below checks both ends of the range.
         */
        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}

#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        return pfn == zero_pfn;
}

static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;
        return zero_pfn;
}
#endif

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Without CONFIG_TRANSPARENT_HUGEPAGE no pmd is reported as trans-huge. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        return 0;
}
#ifndef pmd_write
/*
 * Fallback for !THP when the architecture has no pmd_write(): calls BUG();
 * the return statement only satisfies the prototype.
 */
static inline int pmd_write(pmd_t pmd)
{
        BUG();
        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/*
 * Fallback when the architecture has no pud_write(): calls BUG(); the
 * return statement only satisfies the prototype.  Unlike the pmd_write()
 * fallback, this one does not depend on CONFIG_TRANSPARENT_HUGEPAGE.
 */
static inline int pud_write(pud_t pud)
{
        BUG();
        return 0;
}
#endif /* pud_write */

/*
 * Stubs used unless both CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are enabled: no pmd/pud/pgd entry is
 * reported as devmap.
 */
#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
        return 0;
}
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

/*
 * Stub used unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are enabled: no pud is
 * reported as trans-huge.
 */
#if !(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
      defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
static inline int pud_trans_huge(pud_t pud)
{
        return 0;
}
#endif

/*
 * See pmd_none_or_trans_huge_or_clear_bad for discussion.
 *
 * Takes one READ_ONCE() snapshot of *@pud and runs every test on that
 * snapshot.  Returns 1 when the entry is none, trans-huge, devmap, or bad
 * (after handing it to pud_clear_bad()); returns 0 otherwise.
 */
static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
{
        pud_t entry = READ_ONCE(*pud);

        if (pud_none(entry))
                return 1;
        if (pud_trans_huge(entry))
                return 1;
        if (pud_devmap(entry))
                return 1;

        if (unlikely(pud_bad(entry))) {
                pud_clear_bad(pud);
                return 1;
        }

        return 0;
}

/*
 * See pmd_trans_unstable for discussion.
 *
 * Only performs a check when both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are enabled, in which case it
 * returns pud_none_or_trans_huge_or_dev_or_clear_bad(); otherwise returns 0.
 */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        return pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
#else
        return 0;
#endif
}

#ifndef pmd_read_atomic
/*
 * pmd_read_atomic - generic read of the pmd entry at @pmdp.
 *
 * A plain dereference; see the caveat in the body about when that is
 * sufficient.
 */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
        /*
         * Depend on compiler for an atomic pmd read. NOTE: this is
         * only going to work, if the pmdval_t isn't larger than
         * an unsigned long.
         */
        return *pmdp;
}
#endif

/* Generic default, used unless the architecture defines its own version. */
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif
/*
 * This function is meant to be used by sites walking pagetables with
 * the mmap_lock held in read mode to protect against MADV_DONTNEED and
 * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
 * into a null pmd and the transhuge page fault can convert a null pmd
 * into a hugepmd or into a regular pmd (if the hugepage allocation
 * fails). While holding the mmap_lock in read mode the pmd becomes
 * stable and stops changing under us only if it's not null and not a
 * transhuge pmd. When those races occur and this function makes a
 * difference vs the standard pmd_none_or_clear_bad, the result is
 * undefined so behaving as if the pmd was none is safe (because it
 * can return none anyway). The compiler level barrier() is critically
 * important to compute the two checks atomically on the same pmdval.
 *
 * For 32bit kernels with a 64bit large pmd_t this automatically takes
 * care of reading the pmd atomically to avoid SMP race conditions
 * against pmd_populate() when the mmap_lock is held for reading by the
 * caller (a special atomic read not done by "gcc" as in the generic
 * version above, is also needed when THP is disabled because the page
 * fault can populate the pmd from under us).
 *
 * Returns 1 if the pmd value read is none, is a transparent huge pmd,
 * is not present while CONFIG_ARCH_ENABLE_THP_MIGRATION is enabled, or
 * is bad (in which case pmd_clear_bad() is called on @pmd first).
 * Returns 0 otherwise.
 */
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
        pmd_t pmdval = pmd_read_atomic(pmd);
        /*
         * The barrier will stabilize the pmdval in a register or on
         * the stack so that it will stop changing under the code.
         *
         * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
         * pmd_read_atomic is allowed to return a non-atomic pmdval
         * (for example pointing to a hugepage that has never been
         * mapped in the pmd). The below checks will only care about
         * the low part of the pmd with 32bit PAE x86 anyway, with the
         * exception of pmd_none(). So the important thing is that if
         * the low part of the pmd is found null, the high part will
         * be also null or the pmd_none() check below would be
         * confused.
         */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        barrier();
#endif
        /*
         * !pmd_present() checks for pmd migration entries
         *
         * The complete check uses is_pmd_migration_entry() in linux/swapops.h
         * But using that requires moving the current function and
         * pmd_trans_unstable() to linux/swapops.h to resolve the dependency,
         * which is too much code movement.
         *
         * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
         * because !pmd_present() pages can only be under migration not swapped
         * out.
         *
         * pmd_none() is preserved for future condition checks on pmd migration
         * entries and to avoid confusion with this function's name, although
         * it is redundant with !pmd_present().
         */
        if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
                (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
                return 1;
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                return 1;
        }
        return 0;
}

/*
 * When Transparent Hugepage Support is not built into the kernel this
 * is a noop that returns 0. Otherwise it is equivalent to
 * pmd_none_or_trans_huge_or_clear_bad().
 *
 * It shall only be called in places that have already verified the pmd
 * is not none and that want to walk ptes while holding the mmap_lock in
 * read mode (write mode doesn't need this). If THP is not enabled, the
 * pmd can't go away under the code even if MADV_DONTNEED runs. If THP is
 * enabled, a pmd_trans_unstable check is needed before walking the ptes
 * after split_huge_pmd returns, because it may have run when the pmd
 * became null, and a page fault can then map in a THP rather than a
 * regular page.
 */
static inline int pmd_trans_unstable(pmd_t *pmd)
{
        int unstable = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unstable = pmd_none_or_trans_huge_or_clear_bad(pmd);
#endif
        return unstable;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
 * the only case the kernel cares about is NUMA balancing, and it is only
 * ever set when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not
 * marked _PAGE_PROTNONE so by default, implement the helper as "always no".
 * It is the responsibility of the caller to distinguish between PROT_NONE
 * protections and NUMA hinting fault protections.
 */
static inline int pte_protnone(pte_t pte)
{
        return 0;
}

/* PMD-level counterpart of pte_protnone(): also "always no" here. */
static inline int pmd_protnone(pmd_t pmd)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
int p4d_clear_huge(p4d_t *p4d);
#else
/*
 * The p4d level is folded (__PAGETABLE_P4D_FOLDED is defined): these
 * stubs do nothing and return 0.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
/*
 * CONFIG_HAVE_ARCH_HUGE_VMAP is not set: every helper below is a stub
 * that ignores its arguments and returns 0.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. It can also help optimize the normal TLB flush in the
 * THP regime. Stock flush_tlb_range() typically has an optimization to nuke
 * the entire TLB if the flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single THP flush would
 * invalidate the entire TLB, which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 *
 * The generic versions below simply forward to flush_tlb_range().
 */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE any use expands to BUILD_BUG(). */
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif
#endif

struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

/* Without CONFIG_X86_ESPFIX64 this is an empty stub. */
#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/*
 * Generic default used when the architecture does not define
 * __HAVE_ARCH_PFN_MODIFY_ALLOWED: every @pfn / @prot combination is
 * allowed.
 */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/* Generic default: reports that there is no arch-specific pfn check. */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

/* No PAGE_KERNEL_RO from the architecture: fall back to PAGE_KERNEL. */
#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

/* No PAGE_KERNEL_EXEC from the architecture: fall back to PAGE_KERNEL. */
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
 * vmalloc/ioremap code to track at which page-table levels entries have been
 * modified. Based on that the code can better decide when vmalloc and ioremap
 * mapping changes need to be synchronized to other page-tables in the system.
 */
/* Bit positions, one per page-table level. */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

/* Mask values built from the bit positions above. */
#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 *
 * A 32-bit kernel with CONFIG_PHYS_ADDR_T_64BIT must therefore provide
 * its own MAX_POSSIBLE_PHYSMEM_BITS; the build fails here if it doesn't.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
/* 32-bit kernel without CONFIG_PHYS_ADDR_T_64BIT: default to 32 bits. */
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

/*
 * Generic default, used unless the architecture defines its own
 * has_transparent_hugepage: evaluates to 1 when
 * CONFIG_TRANSPARENT_HUGEPAGE is set and to 0 otherwise.
 */
#ifndef has_transparent_hugepage
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#else
#define has_transparent_hugepage() 0
#endif
#endif

/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 *
 * The generic definitions below ignore @mm and only report whether the
 * corresponding __PAGETABLE_*_FOLDED macro is defined.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

/*
 * Generic p?d_offset_lockless() fallbacks. Each one ignores the pointer
 * argument and calls the regular p?d_offset() helper on the address of
 * the entry value that was passed in alongside it.
 */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * p?d_leaf() - true if this entry is a final mapping to a physical address.
 * This differs from p?d_huge() in that they are always available (if
 * the architecture supports large pages at the appropriate level) even
 * if CONFIG_HUGETLB_PAGE is not defined.
 * Only meaningful when called on a valid entry.
 *
 * The generic fallbacks below evaluate to 0 for every level the
 * architecture does not define itself.
 */
#ifndef pgd_leaf
#define pgd_leaf(x)        0
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        0
#endif
#ifndef pud_leaf
#define pud_leaf(x)        0
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        0
#endif

#endif /* _LINUX_PGTABLE_H */



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Task I/O accounting operations
 */
#ifndef __TASK_IO_ACCOUNTING_OPS_INCLUDED
#define __TASK_IO_ACCOUNTING_OPS_INCLUDED

#include <linux/sched.h>

#ifdef CONFIG_TASK_IO_ACCOUNTING
/* Add @bytes to the read_bytes counter of the current task. */
static inline void task_io_account_read(size_t bytes)
{
        current->ioac.read_bytes += bytes;
}

/*
 * We approximate the number of blocks, because we account bytes only.
 * A 'block' is 512 bytes, hence the shift by 9.
 *
 * Returns the read_bytes counter of @p converted to blocks.
 */
static inline unsigned long task_io_get_inblock(const struct task_struct *p)
{
        return p->ioac.read_bytes >> 9;
}

/* Add @bytes to the write_bytes counter of the current task. */
static inline void task_io_account_write(size_t bytes)
{
        current->ioac.write_bytes += bytes;
}

/*
 * We approximate the number of blocks, because we account bytes only.
 * A 'block' is 512 bytes, hence the shift by 9.
 *
 * Returns the write_bytes counter of @p converted to blocks.
 */
static inline unsigned long task_io_get_oublock(const struct task_struct *p)
{
        return p->ioac.write_bytes >> 9;
}

/* Add @bytes to the cancelled_write_bytes counter of the current task. */
static inline void task_io_account_cancelled_write(size_t bytes)
{
        current->ioac.cancelled_write_bytes += bytes;
}

/* Reset every field of @ioac by zero-filling the whole structure. */
static inline void task_io_accounting_init(struct task_io_accounting *ioac)
{
        memset(ioac, 0, sizeof(*ioac));
}

/*
 * Add the byte counters of @src (read_bytes, write_bytes and
 * cancelled_write_bytes) onto the matching counters of @dst.
 * @src is only read.
 */
static inline void task_blk_io_accounting_add(struct task_io_accounting *dst,
                                                struct task_io_accounting *src)
{
        dst->read_bytes += src->read_bytes;
        dst->write_bytes += src->write_bytes;
        dst->cancelled_write_bytes += src->cancelled_write_bytes;
}

#else

/*
 * CONFIG_TASK_IO_ACCOUNTING is not set: the accounting helpers below are
 * empty and the block getters always return 0.
 */
static inline void task_io_account_read(size_t bytes)
{
}

static inline unsigned long task_io_get_inblock(const struct task_struct *p)
{
        return 0;
}

static inline void task_io_account_write(size_t bytes)
{
}

static inline unsigned long task_io_get_oublock(const struct task_struct *p)
{
        return 0;
}

static inline void task_io_account_cancelled_write(size_t bytes)
{
}

static inline void task_io_accounting_init(struct task_io_accounting *ioac)
{
}

static inline void task_blk_io_accounting_add(struct task_io_accounting *dst,
                                                struct task_io_accounting *src)
{
}

#endif /* CONFIG_TASK_IO_ACCOUNTING */

#ifdef CONFIG_TASK_XACCT
/*
 * Add the rchar, wchar, syscr and syscw counters of @src onto the
 * matching counters of @dst. @src is only read.
 */
static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
                                                struct task_io_accounting *src)
{
        dst->rchar += src->rchar;
        dst->wchar += src->wchar;
        dst->syscr += src->syscr;
        dst->syscw += src->syscw;
}
#else
/* CONFIG_TASK_XACCT is not set: nothing to add, so this is empty. */
static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
                                                struct task_io_accounting *src)
{
}
#endif /* CONFIG_TASK_XACCT */

/*
 * Add the accounting data of @src onto @dst by running both the
 * task_chr_io_accounting_add() and task_blk_io_accounting_add() helpers.
 * Either helper may be an empty stub depending on the kernel config.
 */
static inline void task_io_accounting_add(struct task_io_accounting *dst,
                                                struct task_io_accounting *src)
{
        task_chr_io_accounting_add(dst, src);
        task_blk_io_accounting_add(dst, src);
}
#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Interfaces handler.
 *
 * Version:        @(#)dev.h        1.0.10        08/12/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Bjorn Ekwall. <bj0rn@blox.se>
 *              Pekka Riikonen <priikone@poseidon.pspt.fi>
 *
 *                Moved to /usr/include/linux for NET3
 */
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>

#include <linux/ethtool.h>
#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <net/xdp.h>

#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <linux/hashtable.h>

struct netpoll_info;
struct device;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm;
struct macsec_context;
struct macsec_ops;

struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;

/*
 * Forward declarations; the definitions are not part of this header chunk.
 * NOTE(review): synchronize_net() presumably blocks until in-flight
 * network-path readers have finished - confirm against its definition.
 */
void synchronize_net(void);
/*
 * NOTE(review): by its name, installs @ops as the fallback ethtool
 * operations for @dev - confirm exact semantics against the definition.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);

/* Backlog congestion levels */
#define NET_RX_SUCCESS                0        /* keep 'em coming, baby */
#define NET_RX_DROP                1        /* packet dropped */

/*
 * NOTE(review): presumably the maximum depth of stacked (nested) network
 * devices - confirm against the users of this constant.
 */
#define MAX_NEST_DEV 8

/*
 * Transmit return codes: transmit return codes originate from three different
 * namespaces:
 *
 * - qdisc return codes
 * - driver transmit return codes
 * - errno values
 *
 * Drivers are allowed to return any one of those in their hard_start_xmit()
 * function. Real network devices commonly used with qdiscs should only return
 * the driver transmit return codes though - when qdiscs are used, the actual
 * transmission happens asynchronously, so the value is not propagated to
 * higher layers. Virtual network devices transmit synchronously; in this case
 * the driver transmit return codes are consumed by dev_queue_xmit(), and all
 * others are propagated to higher layers.
 */

/* qdisc ->enqueue() return codes. */
#define NET_XMIT_SUCCESS        0x00
#define NET_XMIT_DROP                0x01        /* skb dropped                        */
#define NET_XMIT_CN                0x02        /* congestion notification        */
#define NET_XMIT_MASK                0x0f        /* qdisc flags in net/sch_generic.h */

/* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
 * indicates that the device will soon be dropping packets, or already drops
 * some packets of the same priority; prompting us to send less aggressively.
 *
 * net_xmit_eval(e):  evaluates to 0 when e is NET_XMIT_CN, otherwise to e
 *                    unchanged.
 * net_xmit_errno(e): evaluates to 0 when e is NET_XMIT_CN, otherwise to
 *                    -ENOBUFS.
 *
 * Both macros evaluate their argument more than once (net_xmit_eval) or
 * once (net_xmit_errno); avoid arguments with side effects.
 */
#define net_xmit_eval(e)        ((e) == NET_XMIT_CN ? 0 : (e))
#define net_xmit_errno(e)        ((e) != NET_XMIT_CN ? -ENOBUFS : 0)

/* Driver transmit return codes */
#define NETDEV_TX_MASK                0xf0

/*
 * Driver transmit return values. The non-zero code (NETDEV_TX_BUSY, 0x10)
 * lies inside NETDEV_TX_MASK (0xf0) and therefore outside the low nibble
 * covered by NET_XMIT_MASK (0x0f).
 */
enum netdev_tx {
        __NETDEV_TX_MIN         = INT_MIN,        /* make sure enum is signed */
        NETDEV_TX_OK         = 0x00,        /* driver took care of packet */
        NETDEV_TX_BUSY         = 0x10,        /* driver tx path was busy*/
};
/* Convenience alias for enum netdev_tx. */
typedef enum netdev_tx netdev_tx_t;

/*
 * The ordering NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is relied upon here:
 * any hard_start_xmit() return value below NET_XMIT_MASK means the skb
 * was consumed.
 */
static inline bool dev_xmit_complete(int rc)
{
        /*
         * A driver has consumed the skb in each of these cases:
         * - the packet was sent successfully (rc == NETDEV_TX_OK)
         * - transmission failed with an errno (rc < 0)
         * - queueing to another device failed (rc & NET_XMIT_MASK)
         *
         * All of them satisfy rc < NET_XMIT_MASK, which is the expected
         * (likely) outcome.
         */
        return likely(rc < NET_XMIT_MASK);
}

/*
 *        Compute the worst-case header length according to the protocols
 *        used.
 *
 *        LL_MAX_HEADER is selected from the kernel configuration:
 *          - 128 when CONFIG_HYPERV_NET is defined;
 *          - otherwise, when CONFIG_WLAN is defined or CONFIG_AX25 is
 *            enabled: 128 with CONFIG_MAC80211_MESH, else 96;
 *          - otherwise 32.
 *
 *        MAX_HEADER equals LL_MAX_HEADER, plus 48 when any of
 *        CONFIG_NET_IPIP, CONFIG_NET_IPGRE, CONFIG_IPV6_SIT or
 *        CONFIG_IPV6_TUNNEL is enabled.
 */

#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# if defined(CONFIG_MAC80211_MESH)
#  define LL_MAX_HEADER 128
# else
#  define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif

#if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \
    !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER LL_MAX_HEADER
#else
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif

/*
 *        Old network device statistics. Fields are native words
 *        (unsigned long) so they can be read and written atomically.
 */

/*
 * NET_DEV_STAT(FIELD) declares an anonymous union that overlays a plain
 * "unsigned long FIELD" with an "atomic_long_t __FIELD", so the same
 * counter storage can be reached either by its plain name or, prefixed
 * with two underscores, as an atomic. The helper macro is local to the
 * structure below and is #undef'd right after it.
 */
#define NET_DEV_STAT(FIELD)                        \
        union {                                        \
                unsigned long FIELD;                \
                atomic_long_t __##FIELD;        \
        }

/* One word-sized counter per statistic; see NET_DEV_STAT() above. */
struct net_device_stats {
        NET_DEV_STAT(rx_packets);
        NET_DEV_STAT(tx_packets);
        NET_DEV_STAT(rx_bytes);
        NET_DEV_STAT(tx_bytes);
        NET_DEV_STAT(rx_errors);
        NET_DEV_STAT(tx_errors);
        NET_DEV_STAT(rx_dropped);
        NET_DEV_STAT(tx_dropped);
        NET_DEV_STAT(multicast);
        NET_DEV_STAT(collisions);
        NET_DEV_STAT(rx_length_errors);
        NET_DEV_STAT(rx_over_errors);
        NET_DEV_STAT(rx_crc_errors);
        NET_DEV_STAT(rx_frame_errors);
        NET_DEV_STAT(rx_fifo_errors);
        NET_DEV_STAT(rx_missed_errors);
        NET_DEV_STAT(tx_aborted_errors);
        NET_DEV_STAT(tx_carrier_errors);
        NET_DEV_STAT(tx_fifo_errors);
        NET_DEV_STAT(tx_heartbeat_errors);
        NET_DEV_STAT(tx_window_errors);
        NET_DEV_STAT(rx_compressed);
        NET_DEV_STAT(tx_compressed);
};
#undef NET_DEV_STAT


#include <linux/cache.h>
#include <linux/skbuff.h>

#ifdef CONFIG_RPS
#include <linux/static_key.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
#endif

struct neighbour;
struct neigh_parms;
struct sk_buff;

/*
 * One hardware address entry, linked into a struct netdev_hw_addr_list
 * through @list. @addr holds up to MAX_ADDR_LEN bytes and @type is one of
 * the NETDEV_HW_ADDR_T_* values defined inside the structure.
 * NOTE(review): @refcount, @sync_cnt, @synced and @global_use look like
 * usage/synchronisation bookkeeping and @rcu_head like deferred-free
 * linkage - confirm against the address-list implementation.
 */
struct netdev_hw_addr {
        struct list_head        list;
        unsigned char                addr[MAX_ADDR_LEN];
        unsigned char                type;
#define NETDEV_HW_ADDR_T_LAN                1
#define NETDEV_HW_ADDR_T_SAN                2
#define NETDEV_HW_ADDR_T_UNICAST        3
#define NETDEV_HW_ADDR_T_MULTICAST        4
        bool                        global_use;
        int                        sync_cnt;
        int                        refcount;
        int                        synced;
        struct rcu_head                rcu_head;
};

/*
 * Head of a list of struct netdev_hw_addr entries (linked through their
 * ->list member, see netdev_hw_addr_list_for_each()) together with an
 * entry counter read by netdev_hw_addr_list_count().
 */
struct netdev_hw_addr_list {
        struct list_head        list;
        int                        count;
};

/*
 * Generic helpers for a struct netdev_hw_addr_list pointer @l:
 *   netdev_hw_addr_list_count(l)       - value of l->count
 *   netdev_hw_addr_list_empty(l)       - true when that count is 0
 *   netdev_hw_addr_list_for_each(ha,l) - list_for_each_entry() over
 *                                        l->list, with @ha as the cursor
 */
#define netdev_hw_addr_list_count(l) ((l)->count)
#define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0)
#define netdev_hw_addr_list_for_each(ha, l) \
        list_for_each_entry(ha, &(l)->list, list)

/* The same helpers applied to a device's ->uc address list. */
#define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->uc)

/* The same helpers applied to a device's ->mc address list. */
#define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->mc)

/*
 * Cached hardware (link-layer) header.
 *
 * @hh_data is sized to hold LL_MAX_HEADER bytes rounded up to a multiple
 * of HH_DATA_MOD, expressed in unsigned longs.
 * NOTE(review): @hh_len is presumably the length in bytes of the header
 * currently cached and @hh_lock the seqlock guarding @hh_data - confirm
 * against the users of this structure.
 *
 * Helper macros (HH_DATA_MOD is the alignment unit, 16 bytes):
 *   HH_DATA_OFF(len)   - number of bytes needed to pad @len up to the next
 *                        multiple of HH_DATA_MOD (0 if already a multiple)
 *   HH_DATA_ALIGN(len) - @len rounded up to a multiple of HH_DATA_MOD
 *
 * Both macros parenthesize their argument so that expressions containing
 * low-precedence operators expand correctly.
 */
struct hh_cache {
        unsigned int        hh_len;
        seqlock_t        hh_lock;

        /* cached hardware header; allow for machine alignment needs.        */
#define HH_DATA_MOD        16
#define HH_DATA_OFF(__len) \
        (HH_DATA_MOD - ((((__len) - 1) & (HH_DATA_MOD - 1)) + 1))
#define HH_DATA_ALIGN(__len) \
        (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
        unsigned long        hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};

/* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much.
 * Alternative is:
 *   dev->hard_header_len ? (dev->hard_header_len +
 *                           (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
 *
 * We could use other alignment values, but we must maintain the
 * relationship HH alignment <= LL alignment.
 *
 * As written, both macros take hard_header_len plus needed_headroom (read
 * with READ_ONCE), plus @extra for the _EXTRA variant, round that sum DOWN
 * to a multiple of HH_DATA_MOD and then add one full HH_DATA_MOD. The
 * result is therefore always a multiple of HH_DATA_MOD that is strictly
 * greater than the sum.
 */
#define LL_RESERVED_SPACE(dev) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)

/*
 * Table of link-layer header callbacks.
 * NOTE(review): the per-callback descriptions below are inferred from the
 * names and signatures only - confirm against an implementation (for
 * example the Ethernet header ops) before relying on them.
 *   create         - presumably builds the hardware header for @skb
 *   parse          - presumably extracts a hardware address into @haddr
 *   cache          - presumably fills @hh from @neigh for protocol @type
 *   cache_update   - presumably rewrites a cached header with @haddr
 *   validate       - presumably sanity-checks a caller-supplied header
 *   parse_protocol - presumably returns the protocol found in the header
 */
struct header_ops {
        int        (*create) (struct sk_buff *skb, struct net_device *dev,
                           unsigned short type, const void *daddr,
                           const void *saddr, unsigned int len);
        int        (*parse)(const struct sk_buff *skb, unsigned char *haddr);
        int        (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
        void        (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
        bool        (*validate)(const char *ll_header, unsigned int len);
        __be16        (*parse_protocol)(const struct sk_buff *skb);
};

/* These flag bits are private to the generic network queueing
 * layer; they may not be explicitly referenced by any other
 * code.
 *
 * The enumerators take consecutive values starting at 0.
 * NOTE(review): presumably used as bit numbers in a device state word -
 * confirm against the code that tests them.
 */

enum netdev_state_t {
        __LINK_STATE_START,
        __LINK_STATE_PRESENT,
        __LINK_STATE_NOCARRIER,
        __LINK_STATE_LINKWATCH_PENDING,
        __LINK_STATE_DORMANT,
        __LINK_STATE_TESTING,
};


/*
 * This structure holds boot-time configured netdevice settings. They
 * are then used in the device probing.
 *
 * @name: interface name, at most IFNAMSIZ bytes
 * @map:  struct ifmap settings recorded for that interface
 */
struct netdev_boot_setup {
        char name[IFNAMSIZ];
        struct ifmap map;
};
/* NOTE(review): presumably the capacity of the boot-setup table - confirm. */
#define NETDEV_BOOT_SETUP_MAX 8

/*
 * Init-time (__init) parser; NOTE(review): presumably handles the boot
 * command line string @str - confirm against the definition.
 */
int __init netdev_boot_setup(char *str);

/*
 * A list head paired with an element counter. struct napi_struct embeds an
 * array of GRO_HASH_BUCKETS of these as its gro_hash member.
 */
struct gro_list {
        struct list_head        list;
        int                        count;
};

/*
 * Number of GRO hash buckets; must be less than the number of bits in
 * napi_struct::gro_bitmask.
 */
#define GRO_HASH_BUCKETS        8

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 *
 * @state is a bit field indexed by the NAPI_STATE_* enumerators (the
 * helpers in this header access it with test_bit()/clear_bit()); the
 * matching NAPIF_STATE_* values are the corresponding masks.
 * @poll is the driver-supplied poll callback, taking the NAPI context and
 * an int.
 * @gro_hash holds GRO_HASH_BUCKETS buckets.
 * NOTE(review): @gro_bitmask presumably carries one bit per gro_hash
 * bucket - confirm against the GRO code.
 */
struct napi_struct {
        /* The poll_list must only be managed by the entity which
         * changes the state of the NAPI_STATE_SCHED bit.  This means
         * whoever atomically sets that bit can add this napi_struct
         * to the per-CPU poll_list, and whoever clears that bit
         * can remove from the list right before clearing the bit.
         */
        struct list_head        poll_list;

        unsigned long                state;
        int                        weight;
        int                        defer_hard_irqs_count;
        unsigned long                gro_bitmask;
        int                        (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
        /* Only present when netpoll support is configured. */
        int                        poll_owner;
#endif
        struct net_device        *dev;
        struct gro_list                gro_hash[GRO_HASH_BUCKETS];
        struct sk_buff                *skb;
        struct list_head        rx_list; /* Pending GRO_NORMAL skbs */
        int                        rx_count; /* length of rx_list */
        struct hrtimer                timer;
        struct list_head        dev_list;
        struct hlist_node        napi_hash_node;
        unsigned int                napi_id;
};

/*
 * Bit numbers within napi_struct::state, used with test_bit()/clear_bit()
 * by the helpers in this header. The NAPIF_STATE_* enumerators are the
 * matching bit masks.
 */
enum {
        NAPI_STATE_SCHED,        /* Poll is scheduled */
        NAPI_STATE_MISSED,        /* reschedule a napi */
        NAPI_STATE_DISABLE,        /* Disable pending */
        NAPI_STATE_NPSVC,        /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_LISTED,        /* NAPI added to system lists */
        NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
};

/*
 * Bit masks for napi_struct::state: each NAPIF_STATE_x is BIT(NAPI_STATE_x).
 * Use these when operating on a whole-word snapshot of the state (as
 * napi_if_scheduled_mark_missed() does) rather than on a single bit number.
 */
enum {
        NAPIF_STATE_SCHED         = BIT(NAPI_STATE_SCHED),
        NAPIF_STATE_MISSED         = BIT(NAPI_STATE_MISSED),
        NAPIF_STATE_DISABLE         = BIT(NAPI_STATE_DISABLE),
        NAPIF_STATE_NPSVC         = BIT(NAPI_STATE_NPSVC),
        NAPIF_STATE_LISTED         = BIT(NAPI_STATE_LISTED),
        NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
};

/*
 * Result codes of GRO processing; gro_result_t is the typedef'd alias.
 * NOTE(review): the meaning of each value is defined by the GRO receive
 * code, which is not visible here - consult it before relying on any
 * particular interpretation of the names.
 */
enum gro_result {
        GRO_MERGED,
        GRO_MERGED_FREE,
        GRO_HELD,
        GRO_NORMAL,
        GRO_DROP,
        GRO_CONSUMED,
};
typedef enum gro_result gro_result_t;

/*
 * enum rx_handler_result - Possible return values for rx_handlers.
 * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
 * further.
 * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
 * case skb->dev was changed by rx_handler.
 * @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
 * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
 *
 * rx_handlers are functions called from inside __netif_receive_skb(), to do
 * special processing of the skb, prior to delivery to protocol handlers.
 *
 * Currently, a net_device can only have a single rx_handler registered. Trying
 * to register a second rx_handler will return -EBUSY.
 *
 * To register a rx_handler on a net_device, use netdev_rx_handler_register().
 * To unregister a rx_handler on a net_device, use
 * netdev_rx_handler_unregister().
 *
 * Upon return, rx_handler is expected to tell __netif_receive_skb() what to
 * do with the skb.
 *
 * If the rx_handler consumed the skb in some way, it should return
 * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
 * the skb to be delivered in some other way.
 *
 * If the rx_handler changed skb->dev, to divert the skb to another
 * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
 * new device will be called if it exists.
 *
 * If the rx_handler decides the skb should be ignored, it should return
 * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
 * are registered on exact device (ptype->dev == skb->dev).
 *
 * If the rx_handler didn't change skb->dev, but wants the skb to be normally
 * delivered, it should return RX_HANDLER_PASS.
 *
 * A device without a registered rx_handler will behave as if rx_handler
 * returned RX_HANDLER_PASS.
 */

/* Verdicts an rx_handler can return for an skb. */
enum rx_handler_result {
        RX_HANDLER_CONSUMED,
        RX_HANDLER_ANOTHER,
        RX_HANDLER_EXACT,
        RX_HANDLER_PASS,
};
typedef enum rx_handler_result rx_handler_result_t;
/*
 * Signature of an rx_handler: it receives a pointer to the skb pointer
 * (so the pointed-to skb pointer can be replaced) and returns one of the
 * verdicts above.
 */
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);

/*
 * Low-level scheduling entry points, called by napi_schedule() and
 * napi_schedule_irqoff() once napi_schedule_prep() has returned true.
 */
void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);

/**
 *        napi_disable_pending - test whether a disable is pending
 *        @n: NAPI context
 *
 * Return: true if the NAPI_STATE_DISABLE bit is set in @n->state.
 */
static inline bool napi_disable_pending(struct napi_struct *n)
{
        return test_bit(NAPI_STATE_DISABLE, &n->state);
}

/*
 * Returns true when the caller may go on to call __napi_schedule() (see
 * napi_schedule()); defined outside this header.
 */
bool napi_schedule_prep(struct napi_struct *n);

/**
 *        napi_schedule - schedule NAPI poll
 *        @n: NAPI context
 *
 * Ask for the NAPI poll routine of @n to be run. Nothing happens when
 * napi_schedule_prep() refuses, e.g. because the poll is already
 * scheduled.
 */
static inline void napi_schedule(struct napi_struct *n)
{
        /* Only the caller that wins the prep step queues the poll. */
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule(n);
}

/**
 *        napi_schedule_irqoff - schedule NAPI poll
 *        @n: NAPI context
 *
 * Same as napi_schedule(), for callers that already run with hard irqs
 * masked.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
        /* Only the caller that wins the prep step queues the poll. */
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule_irqoff(n);
}

/*
 * Try to reschedule poll. Called by dev->poll() after napi_complete().
 * Returns true if the poll was scheduled by this call, false otherwise.
 */
static inline bool napi_reschedule(struct napi_struct *napi)
{
        bool scheduled = napi_schedule_prep(napi);

        if (scheduled)
                __napi_schedule(napi);

        return scheduled;
}

/* Defined outside this header; napi_complete() below forwards to it. */
bool napi_complete_done(struct napi_struct *n, int work_done);
/**
 *        napi_complete - NAPI processing complete
 *        @n: NAPI context
 *
 * Mark NAPI processing as complete.
 * Consider using napi_complete_done() instead.
 * Return false if device should avoid rearming interrupts.
 *
 * This is exactly napi_complete_done(@n, 0): the work-done count passed
 * on is always zero and the result is returned unchanged.
 */
static inline bool napi_complete(struct napi_struct *n)
{
        return napi_complete_done(n, 0);
}

/**
 *        napi_disable - prevent NAPI from scheduling
 *        @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 */
void napi_disable(struct napi_struct *n);

/**
 *        napi_enable - enable NAPI scheduling
 *        @n: NAPI context
 *
 * Resume NAPI from being scheduled on this context.
 * Must be paired with napi_disable.
 *
 * The context must have NAPI_STATE_SCHED set on entry; anything else
 * triggers BUG_ON(). NOTE(review): presumably napi_disable() is what
 * leaves that bit set - confirm against its definition.
 */
static inline void napi_enable(struct napi_struct *n)
{
        /* Enabling a context whose SCHED bit is clear is a fatal bug. */
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
        /* Memory barrier issued ahead of the atomic bit clears below. */
        smp_mb__before_atomic();
        /* Drop SCHED first, then NPSVC; keep this order. */
        clear_bit(NAPI_STATE_SCHED, &n->state);
        clear_bit(NAPI_STATE_NPSVC, &n->state);
}

/**
 *        napi_synchronize - wait until NAPI is not running
 *        @n: NAPI context
 *
 * Block until @n no longer has NAPI_STATE_SCHED set, i.e. until any
 * outstanding processing has finished. Future activations are not
 * prevented.
 *
 * On SMP builds the bit is polled with msleep(1) between checks; on
 * non-SMP builds only a compiler barrier is issued.
 */
static inline void napi_synchronize(const struct napi_struct *n)
{
        if (!IS_ENABLED(CONFIG_SMP)) {
                barrier();
                return;
        }

        while (test_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
}

/**
 *        napi_if_scheduled_mark_missed - flag a scheduled NAPI as missed
 *        @n: NAPI context
 *
 * Atomically set NAPIF_STATE_MISSED in @n->state, but only while
 * NAPIF_STATE_SCHED is set and NAPIF_STATE_DISABLE is clear.
 *
 * Return: true if a disable is pending (state left untouched) or if the
 * context was scheduled and has now been marked missed; false if it was
 * not scheduled.
 */
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
        unsigned long seen, marked;

        for (;;) {
                seen = READ_ONCE(n->state);

                /* A pending disable wins: report true, change nothing. */
                if (seen & NAPIF_STATE_DISABLE)
                        return true;

                /* Not scheduled, so there is nothing to mark. */
                if (!(seen & NAPIF_STATE_SCHED))
                        return false;

                marked = seen | NAPIF_STATE_MISSED;

                /* Retry from a fresh read if the state word changed. */
                if (cmpxchg(&n->state, seen, marked) == seen)
                        return true;
        }
}

/*
 * Bit numbers for the transmit queue state; the QUEUE_STATE_* masks are
 * built by shifting 1 by these values.
 */
enum netdev_queue_state_t {
        /* Used by drivers to stop the transmit queue. */
        __QUEUE_STATE_DRV_XOFF,
        /* Used by the stack to stop the transmit queue independently. */
        __QUEUE_STATE_STACK_XOFF,
        /* NOTE(review): presumably set while the queue is frozen - confirm. */
        __QUEUE_STATE_FROZEN,
};

/* Single-bit masks, one per enum netdev_queue_state_t bit number. */
#define QUEUE_STATE_DRV_XOFF        (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF        (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN        (1 << __QUEUE_STATE_FROZEN)

/* Combined masks: either XOFF bit, and XOFF bit(s) together with FROZEN. */
#define QUEUE_STATE_ANY_XOFF        (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
                                        QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
                                        QUEUE_STATE_FROZEN)

/*
 * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue.  The
 * netif_tx_* functions below are used to manipulate this flag.  The
 * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
 * queue independently.  The netif_xmit_*stopped functions below are called
 * to check if the queue has been stopped by the driver or stack (either
 * of the XOFF bits is set in the state).  Drivers should not need to call
 * netif_xmit*stopped functions, they should only be using netif_tx_*.
 */

/*
 * One transmit queue of a network device.  Fields are grouped into a
 * read-mostly part and a write-mostly part; the write-mostly part begins
 * with _xmit_lock, which carries ____cacheline_aligned_in_smp.
 */
struct netdev_queue {
/*
 * read-mostly part
 */
        struct net_device        *dev;
        struct Qdisc __rcu        *qdisc;
        struct Qdisc                *qdisc_sleeping;
#ifdef CONFIG_SYSFS
        struct kobject                kobj;
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        /* Accessed through netdev_queue_numa_node_read()/_write(). */
        int                        numa_node;
#endif
        unsigned long                tx_maxrate;
        /*
         * Number of TX timeouts for this queue
         * (/sys/class/net/DEV/Q/trans_timeout)
         */
        unsigned long                trans_timeout;

        /* Subordinate device that the queue has been assigned to */
        struct net_device        *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool    *pool;
#endif
/*
 * write-mostly part
 */
        spinlock_t                _xmit_lock ____cacheline_aligned_in_smp;
        int                        xmit_lock_owner;
        /*
         * Time (in jiffies) of last Tx
         */
        unsigned long                trans_start;

        /* NOTE(review): presumably holds the QUEUE_STATE_* bits - confirm. */
        unsigned long                state;

#ifdef CONFIG_BQL
        struct dql                dql;
#endif
} ____cacheline_aligned_in_smp;

extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;

/*
 * Tell whether @net should get fallback tunnel devices, according to
 * sysctl_fb_tunnels_only_for_init_net:
 *   0           : every netns
 *   1           : init_net only
 *   other (2)   : none
 * When CONFIG_SYSCTL is disabled the answer is always true.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
#if IS_ENABLED(CONFIG_SYSCTL)
        /* Sample the sysctl once so every test below sees the same value. */
        int mode = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);

        if (!mode)
                return true;

        if (mode != 1)
                return false;

        return net_eq(net, &init_net);
#else
        return true;
#endif
}

/*
 * Current value of sysctl_devconf_inherit_init_net, or 0 when
 * CONFIG_SYSCTL is disabled.
 */
static inline int net_inherit_devconf(void)
{
        int inherit = 0;

#if IS_ENABLED(CONFIG_SYSCTL)
        inherit = READ_ONCE(sysctl_devconf_inherit_init_net);
#endif
        return inherit;
}

/*
 * NUMA node stored in @q, or NUMA_NO_NODE when the kernel is built without
 * both CONFIG_XPS and CONFIG_NUMA (the field does not exist then).
 */
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        node = q->numa_node;
#endif
        return node;
}

/*
 * Store @node as the NUMA node of @q.  This is a no-op unless both
 * CONFIG_XPS and CONFIG_NUMA are enabled.
 */
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if !(defined(CONFIG_XPS) && defined(CONFIG_NUMA))
        /* No per-queue NUMA node is kept in this configuration. */
        return;
#else
        q->numa_node = node;
#endif
}

#ifdef CONFIG_RPS
/*
 * This structure holds an RPS map which can be of variable length.  The
 * map is an array of CPUs.
 */
struct rps_map {
        /* NOTE(review): presumably the number of valid entries in cpus[]. */
        unsigned int len;
        struct rcu_head rcu;
        /* Flexible array of CPU numbers, sized at allocation time. */
        u16 cpus[];
};
/* Bytes needed for a struct rps_map followed by _num u16 CPU entries. */
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, and
 * a hardware filter index.
 */
struct rps_dev_flow {
        /* CPU the flow is mapped to. */
        u16 cpu;
        /* Hardware filter index. */
        u16 filter;
        /* Tail pointer of that CPU's input queue at the last enqueue. */
        unsigned int last_qtail;
};
/* NOTE(review): presumably the "no filter" value for the filter field. */
#define RPS_NO_FILTER 0xffff

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 */
struct rps_dev_flow_table {
        /* NOTE(review): presumably masks a hash into an index of flows[]. */
        unsigned int mask;
        struct rcu_head rcu;
        /* Flexible array of per-flow entries, sized at allocation time. */
        struct rps_dev_flow flows[];
};
/* Bytes needed for the table header followed by _num flow entries. */
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 */
struct rps_sock_flow_table {
        /* ANDed with the flow hash to pick an index into ents[]. */
        u32        mask;

        /* Entries start on their own cache line on SMP builds. */
        u32        ents[] ____cacheline_aligned_in_smp;
};
/* Bytes from the start of the table up to the end of _num entries. */
#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

/* NOTE(review): presumably a sentinel meaning "no CPU recorded" - confirm. */
#define RPS_NO_CPU 0xffff

/* Defined elsewhere; rps_cpu_mask is used by rps_record_sock_flow(). */
extern u32 rps_cpu_mask;
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;

/*
 * Record in @table the CPU we are currently running on for the flow
 * identified by @hash.  Nothing happens for a NULL @table or a zero @hash.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                        u32 hash)
{
        unsigned int slot;
        u32 entry;

        if (!table || !hash)
                return;

        slot = hash & table->mask;

        /* High bits come from the hash, the bits covered by rps_cpu_mask
         * are replaced by the CPU number.  We only give a hint, preemption
         * can change CPU under us.
         */
        entry = hash & ~rps_cpu_mask;
        entry |= raw_smp_processor_id();

        /* The WRITE_ONCE() below is paired with the READ_ONCE() here, and
         * with another one in get_rps_cpu().  The store is skipped when
         * the slot already holds the value.
         */
        if (READ_ONCE(table->ents[slot]) == entry)
                return;

        WRITE_ONCE(table->ents[slot], entry);
}

#ifdef CONFIG_RFS_ACCEL
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
                         u16 filter_id);
#endif
#endif /* CONFIG_RPS */

/*
 * This structure contains an instance of an RX queue.
 * The RPS pointers exist only with CONFIG_RPS and the buffer pool pointer
 * only with CONFIG_XDP_SOCKETS.
 */
struct netdev_rx_queue {
#ifdef CONFIG_RPS
        /* RCU-annotated pointers to the RPS CPU map and flow table. */
        struct rps_map __rcu                *rps_map;
        struct rps_dev_flow_table __rcu        *rps_flow_table;
#endif
        struct kobject                        kobj;
        struct net_device                *dev;
        struct xdp_rxq_info                xdp_rxq;
#ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool            *pool;
#endif
} ____cacheline_aligned_in_smp;

/*
 * RX queue sysfs structures and functions.
 *
 * An rx_queue_attribute pairs a generic struct attribute with show/store
 * callbacks that receive the RX queue and a text buffer.
 */
struct rx_queue_attribute {
        struct attribute attr;
        /* Write the attribute value for @queue into @buf. */
        ssize_t (*show)(struct netdev_rx_queue *queue, char *buf);
        /* Consume @len bytes from @buf as a new value for @queue. */
        ssize_t (*store)(struct netdev_rx_queue *queue,
                         const char *buf, size_t len);
};

#ifdef CONFIG_XPS
/*
 * This structure holds an XPS map which can be of variable length.  The
 * map is an array of queues.
 */
struct xps_map {
        /* NOTE(review): presumably entries in use vs. entries allocated. */
        unsigned int len;
        unsigned int alloc_len;
        struct rcu_head rcu;
        /* Flexible array of queue indices, sized at allocation time. */
        u16 queues[];
};
/* Bytes needed for a struct xps_map followed by _num u16 queue entries. */
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
/*
 * Number of u16 entries that fit after the header when the offset of
 * queues[1] is rounded up with L1_CACHE_ALIGN().
 */
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
       - sizeof(struct xps_map)) / sizeof(u16))

/*
 * This structure holds all XPS maps for device.  Maps are indexed by CPU
 * or, for the RXQs variant, by receive queue (see the attr_map comment).
 * Allocation sizes are given by XPS_CPU_DEV_MAPS_SIZE() and
 * XPS_RXQ_DEV_MAPS_SIZE().
 */
struct xps_dev_maps {
        struct rcu_head rcu;
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};

/*
 * Bytes needed for a struct xps_dev_maps whose attr_map[] holds
 * nr_cpu_ids * _tcs map pointers.
 */
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +        \
        (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))

/*
 * Bytes needed for a struct xps_dev_maps whose attr_map[] holds
 * _rxqs * _tcs map pointers.  Both arguments are parenthesized in the
 * expansion so that an expression such as "a + b" is multiplied as a whole.
 */
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
        ((_rxqs) * (_tcs) * sizeof(struct xps_map *)))

#endif /* CONFIG_XPS */

/* TC_BITMASK equals TC_MAX_QUEUE - 1. */
#define TC_MAX_QUEUE        16
#define TC_BITMASK        15
/* HW offloaded queuing disciplines txq count and offset maps */
struct netdev_tc_txq {
        /* NOTE(review): presumably the number of TX queues in the range... */
        u16 count;
        /* ...and the index of the first one - confirm against users. */
        u16 offset;
};

#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
 * This structure is to hold information about the device
 * configured to run FCoE protocol stack.
 *
 * Every member is a fixed-size char buffer.  The structure is the output
 * argument of the ndo_fcoe_get_hbainfo() device operation.
 */
struct netdev_fcoe_hbainfo {
        char        manufacturer[64];
        char        serial_number[64];
        char        hardware_version[64];
        char        driver_version[64];
        char        optionrom_version[64];
        char        firmware_version[64];
        char        model[256];
        char        model_description[256];
};
#endif

/* Capacity, in bytes, of the identifier buffer below. */
#define MAX_PHYS_ITEM_ID_LEN 32

/* This structure holds a unique identifier to identify some
 * physical item (port for example) used by a netdevice.
 * @id is the identifier buffer and @id_len the length that
 * netdev_phys_item_id_same() compares.
 */
struct netdev_phys_item_id {
        unsigned char id[MAX_PHYS_ITEM_ID_LEN];
        unsigned char id_len;
};

/* Two identifiers match when their lengths are equal and the first
 * id_len bytes of both buffers are identical.
 */
static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
                                            struct netdev_phys_item_id *b)
{
        if (a->id_len != b->id_len)
                return false;

        return !memcmp(a->id, b->id, a->id_len);
}

/*
 * Function-pointer type with the same parameter list and return type as
 * the ndo_select_queue() device operation.
 */
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);

/*
 * Kind of 'tc' object being set up; this is the type argument of the
 * ndo_setup_tc() device operation.
 */
enum tc_setup_type {
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
        TC_SETUP_BLOCK,
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
        TC_SETUP_QDISC_MQ,
        TC_SETUP_QDISC_ETF,
        TC_SETUP_ROOT_QDISC,
        TC_SETUP_QDISC_GRED,
        TC_SETUP_QDISC_TAPRIO,
        TC_SETUP_FT,
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
};

/* These structures hold the attributes of bpf state that are being passed
 * to the netdevice through the bpf op.
 *
 * enum bpf_netdev_command is the type of the command field of
 * struct netdev_bpf.
 */
enum bpf_netdev_command {
        /* Set or clear a bpf program used in the earliest stages of packet
         * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
         * is responsible for calling bpf_prog_put on any old progs that are
         * stored. In case of error, the callee need not release the new prog
         * reference, but on success it takes ownership and must bpf_prog_put
         * when it is no longer used.
         */
        XDP_SETUP_PROG,
        XDP_SETUP_PROG_HW,
        /* BPF program for offload callbacks, invoked at program load time. */
        /* The two MAP commands use the offmap member of struct netdev_bpf. */
        BPF_OFFLOAD_MAP_ALLOC,
        BPF_OFFLOAD_MAP_FREE,
        /* Uses the xsk member (pool, queue_id) of struct netdev_bpf. */
        XDP_SETUP_XSK_POOL,
};

struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/*
 * XDP attachment modes with explicit values 0..2.  __MAX_XDP_MODE follows
 * the last mode and therefore equals the number of modes (3).
 */
enum bpf_xdp_mode {
        XDP_MODE_SKB = 0,
        XDP_MODE_DRV = 1,
        XDP_MODE_HW = 2,
        __MAX_XDP_MODE
};

/*
 * Pairs a BPF program pointer with a bpf_xdp_link pointer.
 * NOTE(review): presumably one instance exists per enum bpf_xdp_mode -
 * confirm against the users of this structure.
 */
struct bpf_xdp_entity {
        struct bpf_prog *prog;
        struct bpf_xdp_link *link;
};

/*
 * Argument block of the ndo_bpf() device operation.  @command selects the
 * request; the comments inside the union name the command(s) each member
 * belongs to.
 */
struct netdev_bpf {
        enum bpf_netdev_command command;
        union {
                /* XDP_SETUP_PROG */
                struct {
                        u32 flags;
                        struct bpf_prog *prog;
                        struct netlink_ext_ack *extack;
                };
                /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
                struct {
                        struct bpf_offloaded_map *offmap;
                };
                /* XDP_SETUP_XSK_POOL */
                struct {
                        struct xsk_buff_pool *pool;
                        u16 queue_id;
                } xsk;
        };
};

/* Flags for ndo_xsk_wakeup: wake up RX (bit 0), TX (bit 1), or both. */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Callback table operating on struct xfrm_state; it is only declared when
 * CONFIG_XFRM_OFFLOAD is enabled.
 */
struct xfrmdev_ops {
        int        (*xdo_dev_state_add) (struct xfrm_state *x);
        void        (*xdo_dev_state_delete) (struct xfrm_state *x);
        void        (*xdo_dev_state_free) (struct xfrm_state *x);
        bool        (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void        (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
};
#endif

/*
 * Interface alias storage: an rcu_head followed by a flexible char array
 * holding the alias text.
 */
struct dev_ifalias {
        struct rcu_head rcuhead;
        char ifalias[];
};

struct devlink;
struct tlsdev_ops;

/*
 * Associates a name string with a net_device.  The node can be linked
 * both into an hlist and into a list.
 * NOTE(review): presumably the hlist is a name lookup table and the list
 * holds a device's alternative names - confirm in the implementation.
 */
struct netdev_name_node {
        struct hlist_node hlist;
        struct list_head list;
        struct net_device *dev;
        const char *name;
};

int netdev_name_node_alt_create(struct net_device *dev, const char *name);
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);

/* List entry that carries a pointer to a notifier_block. */
struct netdev_net_notifier {
        struct list_head list;
        struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *        Called when a packet needs to be transmitted.
 *        Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *        the queue before that can happen; it's for obsolete devices and weird
 *        corner cases, but the stack really does a non-trivial amount
 *        of useless work if you return NETDEV_TX_BUSY.
 *        Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *                                           struct net_device *dev,
 *                                           netdev_features_t features);
 *        Called by core transmit path to determine if device is capable of
 *        performing offload operations on a given packet. This is to give
 *        the device an opportunity to implement any restrictions that cannot
 *        be otherwise expressed by feature flags. The check is called with
 *        the set of features that the stack has calculated and it returns
 *        those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *        Called to decide which queue to use when device supports multiple
 *        transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *        This function is called to allow device receiver to make
 *        changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *        This function is called when the device changes address list filtering.
 *        If driver handles unicast address filtering, it should set
 *        IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *        This function is called when the Media Access Control address
 *        needs to be changed. If this interface is not defined, the
 *        MAC address can not be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *        Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Called when a user requests an ioctl which can't be handled by
 *        the generic interface code. If not defined ioctls return
 *        not supported error code.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *        Used to set network devices bus interface parameters. This interface
 *        is retained for legacy reasons; new devices should use the bus
 *        interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *        Called when a user wants to change the Maximum Transfer Unit
 *        of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *        Callback used when the transmitter has not made any progress
 *        for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *        Called when a user wants to get the network device usage
 *        statistics. Drivers must do one of the following:
 *        1. Define @ndo_get_stats64 to fill in a zero-initialised
 *           rtnl_link_stats64 structure passed by the caller.
 *        2. Define @ndo_get_stats to update a net_device_stats structure
 *           (which should normally be dev->stats) and return a pointer to
 *           it. The structure may be changed asynchronously only if each
 *           field is written atomically.
 *        3. Update dev->stats asynchronously and atomically, and define
 *           neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *        Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *        void *attr_data)
 *        Get statistics for offload operations by attr_id. Write it into the
 *        attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *        SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *                          u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *                          int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                            int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *                          struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices VF share this information
 *      with PF and querying it may introduce a theoretical security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *                       void *type_data);
 *        Called to setup any 'tc' scheduler, classifier or action on @dev.
 *        This is always called from the stack with the rtnl lock held and netif
 *        tx queues stopped. This allows the netdevice to perform queue
 *        management safely.
 *
 *        Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to start using LLD for FCoE
 *        so the underlying device can perform whatever needed configuration or
 *        initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *        so the underlying device can perform whatever needed clean-ups to
 *        stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *                             struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Initiator wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *        Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *        indicated by the FC exchange id 'xid', so the underlying device can
 *        clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *                              struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Target wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *                               struct netdev_fcoe_hbainfo *hbainfo);
 *        Called when the FCoE Protocol stack wants information on the underlying
 *        device. This information is utilized by the FCoE protocol stack to
 *        register attributes with Fiber Channel management service as per the
 *        FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *        Called when the underlying device wants to override default World Wide
 *        Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *        World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *        protocol stack to use.
 *
 *        RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *                            u16 rxq_index, u32 flow_id);
 *        Set hardware filter for RFS.  rxq_index is the target queue index;
 *        flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *        Return the filter ID on success, or a negative error code.
 *
 *        Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *                                            struct sk_buff *skb,
 *                                            bool all_slaves);
 *        Get the xmit slave of the master device. If all_slaves is true, the
 *        function assumes all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *                netdev_features_t features);
 *        Adjusts the requested feature flags according to device-specific
 *        constraints, and returns the resulting flags. Must not modify
 *        the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *        Called to update device configuration to new features. Passed
 *        feature set might be less than what was returned by ndo_fix_features().
 *        Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid, u16 flags,
 *                      struct netlink_ext_ack *extack);
 *        Adds an FDB entry to dev for addr.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid)
 *        Deletes the FDB entry from dev corresponding to addr.
 * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[],
 *                           struct net_device *dev,
 *                           u16 vid,
 *                           struct netlink_ext_ack *extack);
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *                       struct net_device *dev, struct net_device *filter_dev,
 *                       int *idx)
 *        Used to add FDB entries to dump requests. Implementers should add
 *        entries to skb and update idx with the number of entries.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *                             struct net_device *dev, u32 filter_mask,
 *                             int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *        Called to change device carrier. Soft-devices (like dummy, team, etc)
 *        which do not represent real hardware may define this to allow their
 *        userspace components to manage their virtual carrier state. Devices
 *        that determine carrier state from physical hardware properties (eg
 *        network cables) or protocol-dependent mechanisms (eg
 *        USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *                               struct netdev_phys_item_id *ppid);
 *        Called to get ID of physical port of this device. If driver does
 *        not implement this, it is assumed that the hw is not able to have
 *        multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *                                 struct netdev_phys_item_id *ppid)
 *        Called to get the parent ID of the physical port of this device.
 *
 * void (*ndo_udp_tunnel_add)(struct net_device *dev,
 *                              struct udp_tunnel_info *ti);
 *        Called by UDP tunnel to notify a driver about the UDP port and socket
 *        address family that a UDP tunnel is listening to. It is called only
 *        when a new port starts listening. The operation is protected by the
 *        RTNL.
 *
 * void (*ndo_udp_tunnel_del)(struct net_device *dev,
 *                              struct udp_tunnel_info *ti);
 *        Called by UDP tunnel to notify the driver about a UDP port and socket
 *        address family that the UDP tunnel is not listening to anymore. The
 *        operation is protected by the RTNL.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *                                 struct net_device *dev)
 *        Called by upper layer devices to accelerate switching or other
 *        station functionality into hardware. 'pdev' is the lowerdev
 *        to use for the offload and 'dev' is the net device that will
 *        back the offload. Returns a pointer to the private structure
 *        the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *        Called by upper layer device to delete the station created
 *        by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *        the station and priv is the structure returned by the add
 *        operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *                             int queue_index, u32 maxrate);
 *        Called when a user wants to set a max-rate limitation of specific
 *        TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *        Called to get the iflink value of this device.
 * void (*ndo_change_proto_down)(struct net_device *dev,
 *                                 bool proto_down);
 *        This function is used to pass protocol port error state information
 *        to the switch driver. The switch driver can react to the proto_down
 *      by doing a phys down on the associated switch port.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *        This function is used to get egress tunnel information for given skb.
 *        This is useful for retrieving outer tunnel header parameters while
 *        sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *        This function is used to specify the headroom that the skb must
 *        consider when allocating an skb during packet reception. Setting
 *        appropriate rx headroom value allows avoiding skb head copy on
 *        forward. Setting a negative value resets the rx headroom to the
 *        default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *        This function is used to set or query state related to XDP on the
 *        netdevice and manage BPF offload. See definition of
 *        enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *                        u32 flags);
 *        This function is used to submit @n XDP packets for transmit on a
 *        netdevice. Returns number of frames successfully transmitted, frames
 *        that got dropped are freed/returned via xdp_return_frame().
 *        A negative return value means a general error invoking the ndo, meaning
 *        no frames were xmit'ed and core-caller will free all frames.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *        responsible for sending and/or receiving packets on a specific
 *        queue id bound to an AF_XDP socket. The flags field specifies if
 *        only RX, only Tx, or both should be woken up using the flags
 *        XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
 *        Get devlink port instance associated with a given netdev.
 *        Called with a reference on the netdevice and devlink locks only,
 *        rtnl_lock is not held.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
 *                         int cmd);
 *        Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *        If a device is paired with a peer device, return the peer instance.
 *        The caller must be under RCU read context.
 */
struct net_device_ops {
        /*
         * Table of per-device callbacks. The semantics of the individual
         * ndo_*() entries are described in the comment preceding this
         * struct. Members are kept in their existing order because the
         * order determines the struct layout.
         */

        /* ndo_init / ndo_uninit / ndo_open / ndo_stop */
        int (*ndo_init)(struct net_device *dev);
        void (*ndo_uninit)(struct net_device *dev);
        int (*ndo_open)(struct net_device *dev);
        int (*ndo_stop)(struct net_device *dev);

        /* ndo_start_xmit and the per-skb feature/queue selection hooks */
        netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
                        struct net_device *dev);
        netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
                        struct net_device *dev,
                        netdev_features_t features);
        u16 (*ndo_select_queue)(struct net_device *dev,
                        struct sk_buff *skb,
                        struct net_device *sb_dev);

        /* Rx mode, address, ioctl, config, MTU, neigh setup, tx timeout */
        void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
        void (*ndo_set_rx_mode)(struct net_device *dev);
        int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
        int (*ndo_validate_addr)(struct net_device *dev);
        int (*ndo_do_ioctl)(struct net_device *dev,
                        struct ifreq *ifr,
                        int cmd);
        int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
        int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
        int (*ndo_neigh_setup)(struct net_device *dev,
                        struct neigh_parms *parms);
        void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);

        /* Statistics getters */
        void (*ndo_get_stats64)(struct net_device *dev,
                        struct rtnl_link_stats64 *storage);
        bool (*ndo_has_offload_stats)(const struct net_device *dev,
                        int attr_id);
        int (*ndo_get_offload_stats)(int attr_id,
                        const struct net_device *dev,
                        void *attr_data);
        struct net_device_stats *(*ndo_get_stats)(struct net_device *dev);

        /* ndo_vlan_rx_*: add/kill a (proto, vid) pair */
        int (*ndo_vlan_rx_add_vid)(struct net_device *dev,
                        __be16 proto,
                        u16 vid);
        int (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
                        __be16 proto,
                        u16 vid);

        /* Members below exist only when CONFIG_NET_POLL_CONTROLLER is set */
#ifdef CONFIG_NET_POLL_CONTROLLER
        void (*ndo_poll_controller)(struct net_device *dev);
        int (*ndo_netpoll_setup)(struct net_device *dev,
                        struct netpoll_info *info);
        void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif

        /* ndo_set_vf_* / ndo_get_vf_*: per-VF callbacks */
        int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac);
        int (*ndo_set_vf_vlan)(struct net_device *dev,
                        int queue,
                        u16 vlan,
                        u8 qos,
                        __be16 proto);
        int (*ndo_set_vf_rate)(struct net_device *dev,
                        int vf,
                        int min_tx_rate,
                        int max_tx_rate);
        int (*ndo_set_vf_spoofchk)(struct net_device *dev,
                        int vf,
                        bool setting);
        int (*ndo_set_vf_trust)(struct net_device *dev,
                        int vf,
                        bool setting);
        int (*ndo_get_vf_config)(struct net_device *dev,
                        int vf,
                        struct ifla_vf_info *ivf);
        int (*ndo_set_vf_link_state)(struct net_device *dev,
                        int vf,
                        int link_state);
        int (*ndo_get_vf_stats)(struct net_device *dev,
                        int vf,
                        struct ifla_vf_stats *vf_stats);
        int (*ndo_set_vf_port)(struct net_device *dev,
                        int vf,
                        struct nlattr *port[]);
        int (*ndo_get_vf_port)(struct net_device *dev,
                        int vf,
                        struct sk_buff *skb);
        int (*ndo_get_vf_guid)(struct net_device *dev,
                        int vf,
                        struct ifla_vf_guid *node_guid,
                        struct ifla_vf_guid *port_guid);
        int (*ndo_set_vf_guid)(struct net_device *dev,
                        int vf,
                        u64 guid,
                        int guid_type);
        int (*ndo_set_vf_rss_query_en)(struct net_device *dev,
                        int vf,
                        bool setting);

        int (*ndo_setup_tc)(struct net_device *dev,
                        enum tc_setup_type type,
                        void *type_data);

        /* ndo_fcoe_*: present only when CONFIG_FCOE is enabled */
#if IS_ENABLED(CONFIG_FCOE)
        int (*ndo_fcoe_enable)(struct net_device *dev);
        int (*ndo_fcoe_disable)(struct net_device *dev);
        int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
                        u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid);
        int (*ndo_fcoe_ddp_target)(struct net_device *dev,
                        u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
                        struct netdev_fcoe_hbainfo *hbainfo);
#endif

        /*
         * Present only when CONFIG_LIBFCOE is enabled. The two NETDEV_FCOE_*
         * macros are defined next to ndo_fcoe_get_wwn(), which takes an
         * integer 'type' argument.
         */
#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
        int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
#endif

        /* Present only when CONFIG_RFS_ACCEL is set */
#ifdef CONFIG_RFS_ACCEL
        int (*ndo_rx_flow_steer)(struct net_device *dev,
                        const struct sk_buff *skb,
                        u16 rxq_index,
                        u32 flow_id);
#endif

        /* Slave handling, feature negotiation, neighbour construct/destroy */
        int (*ndo_add_slave)(struct net_device *dev,
                        struct net_device *slave_dev,
                        struct netlink_ext_ack *extack);
        int (*ndo_del_slave)(struct net_device *dev,
                        struct net_device *slave_dev);
        struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
                        struct sk_buff *skb,
                        bool all_slaves);
        netdev_features_t (*ndo_fix_features)(struct net_device *dev,
                        netdev_features_t features);
        int (*ndo_set_features)(struct net_device *dev,
                        netdev_features_t features);
        int (*ndo_neigh_construct)(struct net_device *dev,
                        struct neighbour *n);
        void (*ndo_neigh_destroy)(struct net_device *dev,
                        struct neighbour *n);

        /* ndo_fdb_*: add / del / del_bulk / dump / get */
        int (*ndo_fdb_add)(struct ndmsg *ndm,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid,
                        u16 flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_del)(struct ndmsg *ndm,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid);
        int (*ndo_fdb_del_bulk)(struct ndmsg *ndm,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        u16 vid,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_dump)(struct sk_buff *skb,
                        struct netlink_callback *cb,
                        struct net_device *dev,
                        struct net_device *filter_dev,
                        int *idx);
        int (*ndo_fdb_get)(struct sk_buff *skb,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid,
                        u32 portid,
                        u32 seq,
                        struct netlink_ext_ack *extack);

        /* ndo_bridge_*: setlink / getlink / dellink */
        int (*ndo_bridge_setlink)(struct net_device *dev,
                        struct nlmsghdr *nlh,
                        u16 flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_bridge_getlink)(struct sk_buff *skb,
                        u32 pid,
                        u32 seq,
                        struct net_device *dev,
                        u32 filter_mask,
                        int nlflags);
        int (*ndo_bridge_dellink)(struct net_device *dev,
                        struct nlmsghdr *nlh,
                        u16 flags);

        /* Carrier, physical port identification, UDP tunnels, dfwd stations */
        int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
        int (*ndo_get_phys_port_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_port_parent_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_phys_port_name)(struct net_device *dev,
                        char *name,
                        size_t len);
        void (*ndo_udp_tunnel_add)(struct net_device *dev,
                        struct udp_tunnel_info *ti);
        void (*ndo_udp_tunnel_del)(struct net_device *dev,
                        struct udp_tunnel_info *ti);
        void *(*ndo_dfwd_add_station)(struct net_device *pdev,
                        struct net_device *dev);
        void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv);

        int (*ndo_set_tx_maxrate)(struct net_device *dev,
                        int queue_index,
                        u32 maxrate);
        int (*ndo_get_iflink)(const struct net_device *dev);
        int (*ndo_change_proto_down)(struct net_device *dev, bool proto_down);
        int (*ndo_fill_metadata_dst)(struct net_device *dev,
                        struct sk_buff *skb);
        /* A negative needed_headroom resets the rx headroom to the default */
        void (*ndo_set_rx_headroom)(struct net_device *dev,
                        int needed_headroom);

        /* Set or query XDP state on the netdevice and manage BPF offload */
        int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
        /* Submit n XDP frames for transmit; returns the number sent */
        int (*ndo_xdp_xmit)(struct net_device *dev,
                        int n,
                        struct xdp_frame **xdp,
                        u32 flags);
        /* Wake up Rx and/or Tx processing for an AF_XDP-bound queue id */
        int (*ndo_xsk_wakeup)(struct net_device *dev,
                        u32 queue_id,
                        u32 flags);
        /* Get the devlink port instance associated with the netdev */
        struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
        /* Add, change, delete or get information on an IPv4 tunnel */
        int (*ndo_tunnel_ctl)(struct net_device *dev,
                        struct ip_tunnel_parm *p,
                        int cmd);
        /* Return the paired peer device; caller must be under RCU read */
        struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
};

/**
 * enum net_device_priv_flags - &struct net_device priv_flags
 *
 * These are the &struct net_device, they are only set internally
 * by drivers and used in the kernel. These flags are invisible to
 * userspace; this means that the order of these flags can change
 * during any kernel release.
 *
 * You should have a pretty good reason to be extending these flags.
 *
 * @IFF_802_1Q_VLAN: 802.1Q VLAN device
 * @IFF_EBRIDGE: Ethernet bridging device
 * @IFF_BONDING: bonding master or slave
 * @IFF_ISATAP: ISATAP interface (RFC4214)
 * @IFF_WAN_HDLC: WAN HDLC device
 * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
 *        release skb->dst
 * @IFF_DONT_BRIDGE: disallow bridging this ether dev
 * @IFF_DISABLE_NETPOLL: disable netpoll at run-time
 * @IFF_MACVLAN_PORT: device used as macvlan port
 * @IFF_BRIDGE_PORT: device used as bridge port
 * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
 * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
 * @IFF_UNICAST_FLT: Supports unicast filtering
 * @IFF_TEAM_PORT: device used as team port
 * @IFF_SUPP_NOFCS: device supports sending custom FCS
 * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
 *        change when it's running
 * @IFF_MACVLAN: Macvlan device
 * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
 *        underlying stacked devices
 * @IFF_L3MDEV_MASTER: device is an L3 master device
 * @IFF_NO_QUEUE: device can run without qdisc attached
 * @IFF_OPENVSWITCH: device is a Open vSwitch master
 * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
 * @IFF_TEAM: device is a team device
 * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 *        entity (i.e. the master device for bridged veth)
 * @IFF_MACSEC: device is a MACsec device
 * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
 * @IFF_FAILOVER: device is a failover master device
 * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
 * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
 * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
 */
enum netdev_priv_flags {
        IFF_802_1Q_VLAN                        = 1<<0,
        IFF_EBRIDGE                        = 1<<1,
        IFF_BONDING                        = 1<<2,
        IFF_ISATAP                        = 1<<3,
        IFF_WAN_HDLC                        = 1<<4,
        IFF_XMIT_DST_RELEASE                = 1<<5,
        IFF_DONT_BRIDGE                        = 1<<6,
        IFF_DISABLE_NETPOLL                = 1<<7,
        IFF_MACVLAN_PORT                = 1<<8,
        IFF_BRIDGE_PORT                        = 1<<9,
        IFF_OVS_DATAPATH                = 1<<10,
        IFF_TX_SKB_SHARING                = 1<<11,
        IFF_UNICAST_FLT                        = 1<<12,
        IFF_TEAM_PORT                        = 1<<13,
        IFF_SUPP_NOFCS                        = 1<<14,
        IFF_LIVE_ADDR_CHANGE                = 1<<15,
        IFF_MACVLAN                        = 1<<16,
        IFF_XMIT_DST_RELEASE_PERM        = 1<<17,
        IFF_L3MDEV_MASTER                = 1<<18,
        IFF_NO_QUEUE                        = 1<<19,
        IFF_OPENVSWITCH                        = 1<<20,
        IFF_L3MDEV_SLAVE                = 1<<21,
        IFF_TEAM                        = 1<<22,
        IFF_RXFH_CONFIGURED                = 1<<23,
        IFF_PHONY_HEADROOM                = 1<<24,
        IFF_MACSEC                        = 1<<25,
        IFF_NO_RX_HANDLER                = 1<<26,
        IFF_FAILOVER                        = 1<<27,
        IFF_FAILOVER_SLAVE                = 1<<28,
        IFF_L3MDEV_RX_HANDLER                = 1<<29,
        IFF_LIVE_RENAME_OK                = 1<<30,
};

/*
 * Each flag is also defined as a macro that expands to itself, so that
 * preprocessor checks such as "#ifdef IFF_MACSEC" succeed even though the
 * flags are enum constants. Every enumerator of enum netdev_priv_flags
 * has an entry here.
 */
#define IFF_802_1Q_VLAN           IFF_802_1Q_VLAN
#define IFF_EBRIDGE               IFF_EBRIDGE
#define IFF_BONDING               IFF_BONDING
#define IFF_ISATAP                IFF_ISATAP
#define IFF_WAN_HDLC              IFF_WAN_HDLC
#define IFF_XMIT_DST_RELEASE      IFF_XMIT_DST_RELEASE
#define IFF_DONT_BRIDGE           IFF_DONT_BRIDGE
#define IFF_DISABLE_NETPOLL       IFF_DISABLE_NETPOLL
#define IFF_MACVLAN_PORT          IFF_MACVLAN_PORT
#define IFF_BRIDGE_PORT           IFF_BRIDGE_PORT
#define IFF_OVS_DATAPATH          IFF_OVS_DATAPATH
#define IFF_TX_SKB_SHARING        IFF_TX_SKB_SHARING
#define IFF_UNICAST_FLT           IFF_UNICAST_FLT
#define IFF_TEAM_PORT             IFF_TEAM_PORT
#define IFF_SUPP_NOFCS            IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE      IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN               IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM
#define IFF_L3MDEV_MASTER         IFF_L3MDEV_MASTER
#define IFF_NO_QUEUE              IFF_NO_QUEUE
#define IFF_OPENVSWITCH           IFF_OPENVSWITCH
#define IFF_L3MDEV_SLAVE          IFF_L3MDEV_SLAVE
#define IFF_TEAM                  IFF_TEAM
#define IFF_RXFH_CONFIGURED       IFF_RXFH_CONFIGURED
#define IFF_PHONY_HEADROOM        IFF_PHONY_HEADROOM
#define IFF_MACSEC                IFF_MACSEC
#define IFF_NO_RX_HANDLER         IFF_NO_RX_HANDLER
#define IFF_FAILOVER              IFF_FAILOVER
#define IFF_FAILOVER_SLAVE        IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER     IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK        IFF_LIVE_RENAME_OK

/*
 * Tag identifying what the struct net_device::ml_priv pointer refers to.
 * The enumerators keep their implicit values 0 and 1.
 */
enum netdev_ml_priv_type {
        ML_PRIV_NONE = 0,
        ML_PRIV_CAN  = 1,
};

/**
 *        struct net_device - The DEVICE structure.
 *
 *        Actually, this whole structure is a big mistake.  It mixes I/O
 *        data with strictly "high-level" data, and it has to know about
 *        almost every data structure used in the INET module.
 *
 *        @name:        This is the first field of the "visible" part of this structure
 *                (i.e. as seen by users in the "Space.c" file).  It is the name
 *                of the interface.
 *
 *        @name_node:        Name hashlist node
 *        @ifalias:        SNMP alias
 *        @mem_end:        Shared memory end
 *        @mem_start:        Shared memory start
 *        @base_addr:        Device I/O address
 *        @irq:                Device IRQ number
 *
 *        @state:                Generic network queuing layer state, see netdev_state_t
 *        @dev_list:        The global list of network devices
 *        @napi_list:        List entry used for polling NAPI devices
 *        @unreg_list:        List entry  when we are unregistering the
 *                        device; see the function unregister_netdev
 *        @close_list:        List entry used when we are closing the device
 *        @ptype_all:     Device-specific packet handlers for all protocols
 *        @ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *        @adj_list:        Directly linked devices, like slaves for bonding
 *        @features:        Currently active device features
 *        @hw_features:        User-changeable features
 *
 *        @wanted_features:        User-requested features
 *        @vlan_features:                Mask of features inheritable by VLAN devices
 *
 *        @hw_enc_features:        Mask of features inherited by encapsulating devices
 *                                This field indicates what encapsulation
 *                                offloads the hardware is capable of doing,
 *                                and drivers will need to set them appropriately.
 *
 *        @mpls_features:        Mask of features inheritable by MPLS
 *        @gso_partial_features: value(s) from NETIF_F_GSO\*
 *
 *        @ifindex:        interface index
 *        @group:                The group the device belongs to
 *
 *        @stats:                Statistics struct, which was left as a legacy, use
 *                        rtnl_link_stats64 instead
 *
 *        @rx_dropped:        Dropped packets by core network,
 *                        do not use this in drivers
 *        @tx_dropped:        Dropped packets by core network,
 *                        do not use this in drivers
 *        @rx_nohandler:        nohandler dropped packets by core network on
 *                        inactive devices, do not use this in drivers
 *        @carrier_up_count:        Number of times the carrier has been up
 *        @carrier_down_count:        Number of times the carrier has been down
 *
 *        @wireless_handlers:        List of functions to handle Wireless Extensions,
 *                                instead of ioctl,
 *                                see <net/iw_handler.h> for details.
 *        @wireless_data:        Instance data managed by the core of wireless extensions
 *
 *        @netdev_ops:        Includes several pointers to callbacks,
 *                        if one wants to override the ndo_*() functions
 *        @ethtool_ops:        Management operations
 *        @l3mdev_ops:        Layer 3 master device operations
 *        @ndisc_ops:        Includes callbacks for different IPv6 neighbour
 *                        discovery handling. Necessary for e.g. 6LoWPAN.
 *        @xfrmdev_ops:        Transformation offload operations
 *        @tlsdev_ops:        Transport Layer Security offload operations
 *        @header_ops:        Includes callbacks for creating, parsing, caching, etc.
 *                        of Layer 2 headers.
 *
 *        @flags:                Interface flags (a la BSD)
 *        @priv_flags:        Like 'flags' but invisible to userspace,
 *                        see enum netdev_priv_flags for the definitions
 *        @gflags:        Global flags ( kept as legacy )
 *        @padded:        How much padding added by alloc_netdev()
 *        @operstate:        RFC2863 operstate
 *        @link_mode:        Mapping policy to operstate
 *        @if_port:        Selectable AUI, TP, ...
 *        @dma:                DMA channel
 *        @mtu:                Interface MTU value
 *        @min_mtu:        Interface Minimum MTU value
 *        @max_mtu:        Interface Maximum MTU value
 *        @type:                Interface hardware type
 *        @hard_header_len: Maximum hardware header length.
 *        @min_header_len:  Minimum hardware header length
 *
 *        @needed_headroom: Extra headroom the hardware may need, but not in all
 *                          cases can this be guaranteed
 *        @needed_tailroom: Extra tailroom the hardware may need, but not in all
 *                          cases can this be guaranteed. Some cases also use
 *                          LL_MAX_HEADER instead to allocate the skb
 *
 *        interface address info:
 *
 *         @perm_addr:                Permanent hw address
 *         @addr_assign_type:        Hw address assignment type
 *         @addr_len:                Hardware address length
 *        @upper_level:                Maximum depth level of upper devices.
 *        @lower_level:                Maximum depth level of lower devices.
 *        @neigh_priv_len:        Used in neigh_alloc()
 *         @dev_id:                Used to differentiate devices that share
 *                                 the same link layer address
 *         @dev_port:                Used to differentiate devices that share
 *                                 the same function
 *        @addr_list_lock:        XXX: need comments on this one
 *        @name_assign_type:        network interface name assignment type
 *        @uc_promisc:                Counter that indicates promiscuous mode
 *                                has been enabled due to the need to listen to
 *                                additional unicast addresses in a device that
 *                                does not implement ndo_set_rx_mode()
 *        @uc:                        unicast mac addresses
 *        @mc:                        multicast mac addresses
 *        @dev_addrs:                list of device hw addresses
 *        @queues_kset:                Group of all Kobjects in the Tx and RX queues
 *        @promiscuity:                Number of times the NIC is told to work in
 *                                promiscuous mode; if it becomes 0 the NIC will
 *                                exit promiscuous mode
 *        @allmulti:                Counter, enables or disables allmulticast mode
 *
 *        @vlan_info:        VLAN info
 *        @dsa_ptr:        dsa specific data
 *        @tipc_ptr:        TIPC specific data
 *        @atalk_ptr:        AppleTalk link
 *        @ip_ptr:        IPv4 specific data
 *        @ip6_ptr:        IPv6 specific data
 *        @ax25_ptr:        AX.25 specific data
 *        @ieee80211_ptr:        IEEE 802.11 specific data, assign before registering
 *        @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
 *                         device struct
 *        @mpls_ptr:        mpls_dev struct pointer
 *
 *        @dev_addr:        Hw address (before bcast,
 *                        because most packets are unicast)
 *
 *        @_rx:                        Array of RX queues
 *        @num_rx_queues:                Number of RX queues
 *                                allocated at register_netdev() time
 *        @real_num_rx_queues:         Number of RX queues currently active in device
 *        @xdp_prog:                XDP sockets filter program pointer
 *        @gro_flush_timeout:        timeout for GRO layer in NAPI
 *        @napi_defer_hard_irqs:        If not zero, provides a counter that would
 *                                allow to avoid NIC hard IRQ, on busy queues.
 *
 *        @rx_handler:                handler for received packets
 *        @rx_handler_data:         XXX: need comments on this one
 *        @miniq_ingress:                ingress/clsact qdisc specific data for
 *                                ingress processing
 *        @ingress_queue:                XXX: need comments on this one
 *        @nf_hooks_ingress:        netfilter hooks executed for ingress packets
 *        @broadcast:                hw bcast address
 *
 *        @rx_cpu_rmap:        CPU reverse-mapping for RX completion interrupts,
 *                        indexed by RX queue number. Assigned by driver.
 *                        This must only be set if the ndo_rx_flow_steer
 *                        operation is defined
 *        @index_hlist:                Device index hash chain
 *
 *        @_tx:                        Array of TX queues
 *        @num_tx_queues:                Number of TX queues allocated at alloc_netdev_mq() time
 *        @real_num_tx_queues:         Number of TX queues currently active in device
 *        @qdisc:                        Root qdisc from userspace point of view
 *        @tx_queue_len:                Max frames per queue allowed
 *        @tx_global_lock:         XXX: need comments on this one
 *        @xdp_bulkq:                XDP device bulk queue
 *        @xps_cpus_map:                all CPUs map for XPS device
 *        @xps_rxqs_map:                all RXQs map for XPS device
 *
 *        @xps_maps:        XXX: need comments on this one
 *        @miniq_egress:                clsact qdisc specific data for
 *                                egress processing
 *        @qdisc_hash:                qdisc hash table
 *        @watchdog_timeo:        Represents the timeout that is used by
 *                                the watchdog (see dev_watchdog())
 *        @watchdog_timer:        List of timers
 *
 *        @proto_down_reason:        reason a netdev interface is held down
 *        @pcpu_refcnt:                Number of references to this device
 *        @todo_list:                Delayed register/unregister
 *        @link_watch_list:        XXX: need comments on this one
 *
 *        @reg_state:                Register/unregister state machine
 *        @dismantle:                Device is going to be freed
 *        @rtnl_link_state:        This enum represents the phases of creating
 *                                a new link
 *
 *        @needs_free_netdev:        Should unregister perform free_netdev?
 *        @priv_destructor:        Called from unregister
 *        @npinfo:                XXX: need comments on this one
 *         @nd_net:                Network namespace this network device is inside
 *
 *         @ml_priv:        Mid-layer private
 *        @ml_priv_type:  Mid-layer private type
 *         @lstats:        Loopback statistics
 *         @tstats:        Tunnel statistics
 *         @dstats:        Dummy statistics
 *         @vstats:        Virtual ethernet statistics
 *
 *        @garp_port:        GARP
 *        @mrp_port:        MRP
 *
 *        @dev:                Class/net/name entry
 *        @sysfs_groups:        Space for optional device, statistics and wireless
 *                        sysfs groups
 *
 *        @sysfs_rx_queue_group:        Space for optional per-rx queue attributes
 *        @rtnl_link_ops:        Rtnl_link_ops
 *
 *        @gso_max_size:        Maximum size of generic segmentation offload
 *        @gso_max_segs:        Maximum number of segments that can be passed to the
 *                        NIC for GSO
 *
 *        @dcbnl_ops:        Data Center Bridging netlink ops
 *        @num_tc:        Number of traffic classes in the net device
 *        @tc_to_txq:        XXX: need comments on this one
 *        @prio_tc_map:        XXX: need comments on this one
 *
 *        @fcoe_ddp_xid:        Max exchange id for FCoE LRO by ddp
 *
 *        @priomap:        XXX: need comments on this one
 *        @phydev:        Physical device may attach itself
 *                        for hardware timestamping
 *        @sfp_bus:        attached &struct sfp_bus structure.
 *
 *        @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *        @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
 *
 *        @proto_down:        protocol port state information can be sent to the
 *                        switch driver and used to set the phys state of the
 *                        switch port.
 *
 *        @wol_enabled:        Wake-on-LAN is enabled
 *
 *        @net_notifier_list:        List of per-net netdev notifier block
 *                                that follow this device when it is moved
 *                                to another network namespace.
 *
 *        @macsec_ops:    MACsec offloading ops
 *
 *        @udp_tunnel_nic_info:        static structure describing the UDP tunnel
 *                                offload capabilities of the device
 *        @udp_tunnel_nic:        UDP tunnel offload state
 *        @xdp_state:                stores info on attached XDP BPF programs
 *
 *        @nested_level:        Used as a parameter of spin_lock_nested() of
 *                        dev->addr_list_lock.
 *        @unlink_list:        As netif_addr_lock() can be called recursively,
 *                        keep a list of interfaces to be deleted.
 *
 *        FIXME: cleanup struct net_device such that network protocol info
 *        moves out.
 */

struct net_device {
        /* Name buffer of IFNAMSIZ bytes, its name node, and the RCU-annotated alias. */
        char                        name[IFNAMSIZ];
        struct netdev_name_node        *name_node;
        struct dev_ifalias        __rcu *ifalias;
        /*
         *        I/O specific fields
         *        FIXME: Merge these and struct ifmap into one
         */
        unsigned long                mem_end;
        unsigned long                mem_start;
        unsigned long                base_addr;
        int                        irq;

        /*
         *        Some hardware also needs these fields (state,dev_list,
         *        napi_list,unreg_list,close_list) but they are not
         *        part of the usual set specified in Space.c.
         */

        unsigned long                state;

        struct list_head        dev_list;
        struct list_head        napi_list;
        struct list_head        unreg_list;
        struct list_head        close_list;
        struct list_head        ptype_all;
        struct list_head        ptype_specific;

        /*
         * Pair of list heads named "upper" and "lower" -- presumably the
         * devices stacked directly above/below this one; TODO confirm.
         */
        struct {
                struct list_head upper;
                struct list_head lower;
        } adj_list;

        /* Feature bitmasks, all of type netdev_features_t. */
        netdev_features_t        features;
        netdev_features_t        hw_features;
        netdev_features_t        wanted_features;
        netdev_features_t        vlan_features;
        netdev_features_t        hw_enc_features;
        netdev_features_t        mpls_features;
        netdev_features_t        gso_partial_features;

        int                        ifindex;
        int                        group;

        struct net_device_stats        stats;

        atomic_long_t                rx_dropped;
        atomic_long_t                tx_dropped;
        atomic_long_t                rx_nohandler;

        /* Stats to monitor link on/off, flapping */
        atomic_t                carrier_up_count;
        atomic_t                carrier_down_count;

        /*
         * Handler/operation pointers.  netdev_ops, ethtool_ops and header_ops
         * are always present; the others are compiled in only when their
         * config option is enabled.
         */
#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *wireless_handlers;
        struct iw_public_data        *wireless_data;
#endif
        const struct net_device_ops *netdev_ops;
        const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops        *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
        const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        const struct tlsdev_ops *tlsdev_ops;
#endif

        const struct header_ops *header_ops;

        unsigned int                flags;
        unsigned int                priv_flags;

        unsigned short                gflags;
        unsigned short                padded;

        unsigned char                operstate;
        unsigned char                link_mode;

        unsigned char                if_port;
        unsigned char                dma;

        /* Note : dev->mtu is often read without holding a lock.
         * Writers usually hold RTNL.
         * It is recommended to use READ_ONCE() to annotate the reads,
         * and to use WRITE_ONCE() to annotate the writes.
         */
        unsigned int                mtu;
        unsigned int                min_mtu;
        unsigned int                max_mtu;
        unsigned short                type;
        unsigned short                hard_header_len;
        unsigned char                min_header_len;
        unsigned char                name_assign_type;

        unsigned short                needed_headroom;
        unsigned short                needed_tailroom;

        /* Interface address info. */
        unsigned char                perm_addr[MAX_ADDR_LEN];
        unsigned char                addr_assign_type;
        unsigned char                addr_len;
        unsigned char                upper_level;
        unsigned char                lower_level;

        unsigned short                neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
        spinlock_t                addr_list_lock;

        struct netdev_hw_addr_list        uc;
        struct netdev_hw_addr_list        mc;
        struct netdev_hw_addr_list        dev_addrs;

#ifdef CONFIG_SYSFS
        struct kset                *queues_kset;
#endif
        /* unlink_list and nested_level (below) exist only with CONFIG_LOCKDEP. */
#ifdef CONFIG_LOCKDEP
        struct list_head        unlink_list;
#endif
        unsigned int                promiscuity;
        unsigned int                allmulti;
        bool                        uc_promisc;
#ifdef CONFIG_LOCKDEP
        unsigned char                nested_level;
#endif


        /* Protocol-specific pointers */

#if IS_ENABLED(CONFIG_VLAN_8021Q)
        struct vlan_info __rcu        *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
        struct dsa_port                *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
        struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK)
        void                         *atalk_ptr;
#endif
        struct in_device __rcu        *ip_ptr;
        struct inet6_dev __rcu        *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25)
        void                        *ax25_ptr;
#endif
        struct wireless_dev        *ieee80211_ptr;
        struct wpan_dev                *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu        *mpls_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
        /* Interface address info used in eth_type_trans() */
        unsigned char                *dev_addr;

        struct netdev_rx_queue        *_rx;
        unsigned int                num_rx_queues;
        unsigned int                real_num_rx_queues;

        struct bpf_prog __rcu        *xdp_prog;
        unsigned long                gro_flush_timeout;
        int                        napi_defer_hard_irqs;
        rx_handler_func_t __rcu        *rx_handler;
        void __rcu                *rx_handler_data;

#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu        *miniq_ingress;
#endif
        struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
        struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

        unsigned char                broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap                *rx_cpu_rmap;
#endif
        struct hlist_node        index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
        /* _tx carries ____cacheline_aligned_in_smp, starting the transmit group on its own alignment. */
        struct netdev_queue        *_tx ____cacheline_aligned_in_smp;
        unsigned int                num_tx_queues;
        unsigned int                real_num_tx_queues;
        struct Qdisc __rcu        *qdisc;
        unsigned int                tx_queue_len;
        spinlock_t                tx_global_lock;

        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_XPS
        struct xps_dev_maps __rcu *xps_cpus_map;
        struct xps_dev_maps __rcu *xps_rxqs_map;
#endif
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu        *miniq_egress;
#endif

#ifdef CONFIG_NET_SCHED
        DECLARE_HASHTABLE        (qdisc_hash, 4);
#endif
        /* These may be needed for future network-power-down code. */
        struct timer_list        watchdog_timer;
        int                        watchdog_timeo;

        u32                     proto_down_reason;

        struct list_head        todo_list;
        int __percpu                *pcpu_refcnt;

        struct list_head        link_watch_list;

        /* Registration state, stored in an 8-bit bitfield. */
        enum { NETREG_UNINITIALIZED=0,
               NETREG_REGISTERED,        /* completed register_netdevice */
               NETREG_UNREGISTERING,        /* called unregister_netdevice */
               NETREG_UNREGISTERED,        /* completed unregister todo */
               NETREG_RELEASED,                /* called free_netdev */
               NETREG_DUMMY,                /* dummy device for NAPI poll */
        } reg_state:8;

        bool dismantle;

        /* rtnl link state, stored in a 16-bit bitfield. */
        enum {
                RTNL_LINK_INITIALIZED,
                RTNL_LINK_INITIALIZING,
        } rtnl_link_state:16;

        bool needs_free_netdev;
        void (*priv_destructor)(struct net_device *dev);

#ifdef CONFIG_NETPOLL
        struct netpoll_info __rcu        *npinfo;
#endif

        possible_net_t                        nd_net;

        /* mid-layer private */
        void                                *ml_priv;
        enum netdev_ml_priv_type        ml_priv_type;

        /*
         * Anonymous union: the three per-CPU stats pointers share storage,
         * so only one of them can be in use for a given device.
         */
        union {
                struct pcpu_lstats __percpu                *lstats;
                struct pcpu_sw_netstats __percpu        *tstats;
                struct pcpu_dstats __percpu                *dstats;
        };

#if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu        *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
        struct mrp_port __rcu        *mrp_port;
#endif

        struct device                dev;
        const struct attribute_group *sysfs_groups[4];
        const struct attribute_group *sysfs_rx_queue_group;

        const struct rtnl_link_ops *rtnl_link_ops;

        /* for setting kernel sock attribute on TCP connection setup */
        /*
         * NOTE: the two #defines below are ordinary preprocessor macros;
         * being written inside the struct body does not scope them to it.
         */
#define GSO_MAX_SIZE                65536
        unsigned int                gso_max_size;
#define GSO_MAX_SEGS                65535
        u16                        gso_max_segs;

#ifdef CONFIG_DCB
        const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
        s16                        num_tc;
        struct netdev_tc_txq        tc_to_txq[TC_MAX_QUEUE];
        /* Indexed with (prio & TC_BITMASK) by the netdev_{get,set}_prio_tc_map() helpers. */
        u8                        prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
        unsigned int                fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
        struct netprio_map __rcu *priomap;
#endif
        struct phy_device        *phydev;
        struct sfp_bus                *sfp_bus;
        struct lock_class_key        *qdisc_tx_busylock;
        struct lock_class_key        *qdisc_running_key;
        bool                        proto_down;
        /* Single-bit flag. */
        unsigned                wol_enabled:1;

        struct list_head        net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
        const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
        struct udp_tunnel_nic        *udp_tunnel_nic;

        /* protected by rtnl_lock */
        struct bpf_xdp_entity        xdp_state[__MAX_XDP_MODE];
};
#define to_net_dev(d) container_of(d, struct net_device, dev)

/*
 * Report whether GRO should be skipped for @dev: true when NETIF_F_GRO is
 * not set in dev->features, or when dev->xdp_prog is non-NULL.
 */
static inline bool netif_elide_gro(const struct net_device *dev)
{
        return !(dev->features & NETIF_F_GRO) || dev->xdp_prog;
}

/* Alignment that netdev_priv() applies to sizeof(struct net_device) when locating the private data. */
#define        NETDEV_ALIGN                32

/*
 * Look up the traffic class recorded for @prio.  @prio is masked with
 * TC_BITMASK first, so the index always falls inside prio_tc_map[].
 */
static inline int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
        const u32 slot = prio & TC_BITMASK;

        return dev->prio_tc_map[slot];
}

/*
 * Record traffic class @tc for priority @prio.
 *
 * Returns 0 on success, or -EINVAL (leaving the map untouched) when @tc is
 * not below dev->num_tc.  Both @prio and @tc are masked with TC_BITMASK
 * before being used as index and stored value.
 */
static inline int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
        if (tc < dev->num_tc) {
                dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
                return 0;
        }

        return -EINVAL;
}

int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);

/* Return the device's num_tc field (an s16) widened to int. */
static inline int netdev_get_num_tc(struct net_device *dev)
{
        const int num_tc = dev->num_tc;

        return num_tc;
}

static inline void net_prefetch(void *p)
{
        prefetch(p);
#if L1_CACHE_BYTES < 128
        prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
        prefetchw(p);
#if L1_CACHE_BYTES < 128
        prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}

void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
        return max_t(int, -dev->num_tc, 0);
}

/* Return a pointer to element @index of the device's _tx array; @index is not range-checked. */
static inline struct netdev_queue *
netdev_get_tx_queue(const struct net_device *dev, unsigned int index)
{
        return dev->_tx + index;
}

/* Return the tx queue of @dev selected by the queue mapping stored in @skb. */
static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        unsigned int mapping = skb_get_queue_mapping(skb);

        return netdev_get_tx_queue(dev, mapping);
}

/*
 * Invoke @f(dev, queue, arg) once for each of the device's num_tx_queues
 * entries of _tx, in ascending index order.  Both dev->num_tx_queues and
 * dev->_tx are re-read on every iteration.
 */
static inline void netdev_for_each_tx_queue(struct net_device *dev,
                                            void (*f)(struct net_device *,
                                                      struct netdev_queue *,
                                                      void *),
                                            void *arg)
{
        unsigned int q = 0;

        while (q < dev->num_tx_queues) {
                f(dev, dev->_tx + q, arg);
                q++;
        }
}

/*
 * Assign lock classes for @dev:
 *  - stores the addresses of two static keys in dev->qdisc_tx_busylock and
 *    dev->qdisc_running_key;
 *  - sets the class of dev->addr_list_lock;
 *  - sets the class of _xmit_lock in each of the num_tx_queues tx queues.
 *
 * The keys are static variables declared inside the macro's block, so every
 * expansion site gets its own set.  @dev is evaluated several times.  The
 * expansion is a bare { } block, not a do { } while (0) statement.
 */
#define netdev_lockdep_set_classes(dev)                                \
{                                                                \
        static struct lock_class_key qdisc_tx_busylock_key;        \
        static struct lock_class_key qdisc_running_key;                \
        static struct lock_class_key qdisc_xmit_lock_key;        \
        static struct lock_class_key dev_addr_list_lock_key;        \
        unsigned int i;                                                \
                                                                \
        (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;        \
        (dev)->qdisc_running_key = &qdisc_running_key;                \
        lockdep_set_class(&(dev)->addr_list_lock,                \
                          &dev_addr_list_lock_key);                \
        for (i = 0; i < (dev)->num_tx_queues; i++)                \
                lockdep_set_class(&(dev)->_tx[i]._xmit_lock,        \
                                  &qdisc_xmit_lock_key);        \
}

u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev);

/* Returns the headroom that the master device needs to take into account
 * when forwarding to this dev: 0 if IFF_PHONY_HEADROOM is set in
 * dev->priv_flags, otherwise dev->needed_headroom.
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
        if (dev->priv_flags & IFF_PHONY_HEADROOM)
                return 0;

        return dev->needed_headroom;
}

static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
        if (dev->netdev_ops->ndo_set_rx_headroom)
                dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}

/* Set the device rx headroom back to the dev's default by passing -1 to
 * netdev_set_rx_headroom().
 */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
        const int use_default = -1;

        netdev_set_rx_headroom(dev, use_default);
}

/*
 * Return dev->ml_priv if the stored ml_priv_type equals @type; return NULL
 * when the types differ.
 */
static inline void *netdev_get_ml_priv(struct net_device *dev,
                                       enum netdev_ml_priv_type type)
{
        return dev->ml_priv_type == type ? dev->ml_priv : NULL;
}

/*
 * Store the mid-layer private pointer @ml_priv together with its type tag
 * @type.
 *
 * Two situations trigger a WARN; the store is performed regardless:
 *  - a non-zero ml_priv_type is already set and differs from @type;
 *  - ml_priv_type is zero but ml_priv is already non-NULL.
 */
static inline void netdev_set_ml_priv(struct net_device *dev,
                                      void *ml_priv,
                                      enum netdev_ml_priv_type type)
{
        /* An existing, different type tag is about to be replaced. */
        WARN(dev->ml_priv_type && dev->ml_priv_type != type,
             "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
             dev->ml_priv_type, type);
        /* A pointer was stored earlier without any type tag. */
        WARN(!dev->ml_priv_type && dev->ml_priv,
             "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");

        dev->ml_priv = ml_priv;
        dev->ml_priv_type = type;
}

/*
 * Net namespace inlines
 */
static inline
struct net *dev_net(const struct net_device *dev)
{
        return read_pnet(&dev->nd_net);
}

static inline
struct net *dev_net_rcu(const struct net_device *dev)
{
        return read_pnet_rcu(&dev->nd_net);
}

static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
        write_pnet(&dev->nd_net, net);
}

/**
 *        netdev_priv - access network device private data
 *        @dev: network device
 *
 * Get network device private data
 */
static inline void *netdev_priv(const struct net_device *dev)
{
        return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}

/* Set the sysfs physical device reference for the network logical device
 * by assigning @pdev to (net)->dev.parent.  If set prior to registration,
 * this will cause a symlink to be created during initialization.
 */
#define SET_NETDEV_DEV(net, pdev)        ((net)->dev.parent = (pdev))

/* Set the sysfs device type for the network logical device to allow
 * fine-grained identification of different network device types. For
 * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc.
 */
#define SET_NETDEV_DEVTYPE(net, devtype)        ((net)->dev.type = (devtype))

/* Default NAPI poll() weight.
 * Device drivers are strongly advised not to use a bigger value.
 */
#define NAPI_POLL_WEIGHT 64

/**
 *        netif_napi_add - initialize a NAPI context
 *        @dev:  network device
 *        @napi: NAPI context
 *        @poll: polling function
 *        @weight: default weight
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight);

/**
 *        netif_tx_napi_add - initialize a NAPI context
 *        @dev:  network device
 *        @napi: NAPI context
 *        @poll: polling function
 *        @weight: default weight
 *
 * This variant of netif_napi_add() should be used from drivers using NAPI
 * to exclusively poll a TX queue.
 * It sets NAPI_STATE_NO_BUSY_POLL in @napi->state before calling
 * netif_napi_add(); this avoids adding the context to napi_hash[] and
 * thus polluting that hash table.
 */
static inline void netif_tx_napi_add(struct net_device *dev,
                                     struct napi_struct *napi,
                                     int (*poll)(struct napi_struct *, int),
                                     int weight)
{
        /* The flag is set before the context is handed to netif_napi_add(). */
        set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
        netif_napi_add(dev, napi, poll, weight);
}

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Warning: caller must observe RCU grace period before freeing memory
 * containing @napi. Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 */
void __netif_napi_del(struct napi_struct *napi);

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list.
 *  It calls __netif_napi_del() and then synchronize_net() before returning.
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
        __netif_napi_del(napi);
        /* Wait here so that callers of this variant need no separate wait. */
        synchronize_net();
}

/*
 * GRO state kept per packet.  NAPI_GRO_CB() obtains it by casting skb->cb,
 * so the structure lives inside the skb's control buffer.
 */
struct napi_gro_cb {
        /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
        void        *frag0;

        /* Length of frag0. */
        unsigned int frag0_len;

        /* This indicates where we are processing relative to skb->data. */
        int        data_offset;

        /* This is non-zero if the packet cannot be merged with the new skb. */
        u16        flush;

        /* Save the IP ID here and check when we get to the transport layer */
        u16        flush_id;

        /* Number of segments aggregated. */
        u16        count;

        /* Start offset for remote checksum offload */
        u16        gro_remcsum_start;

        /* jiffies when first packet was created/queued */
        unsigned long age;

        /* Used in ipv6_gro_receive() and foo-over-udp */
        u16        proto;

        /* The u8 bitfields below total 16 bits (two groups of 8). */

        /* This is non-zero if the packet may be of the same flow. */
        u8        same_flow:1;

        /* Used in tunnel GRO receive */
        u8        encap_mark:1;

        /* GRO checksum is valid */
        u8        csum_valid:1;

        /* Number of checksums via CHECKSUM_UNNECESSARY */
        u8        csum_cnt:3;

        /* Free the skb? */
        u8        free:2;
#define NAPI_GRO_FREE                  1
#define NAPI_GRO_FREE_STOLEN_HEAD 2

        /* Used in foo-over-udp, set in udp[46]_gro_receive */
        u8        is_ipv6:1;

        /* Used in GRE, set in fou/gue_gro_receive */
        u8        is_fou:1;

        /* Used to determine if flush_id can be ignored */
        u8        is_atomic:1;

        /* Number of gro_receive callbacks this packet already went through.
         * Four bits wide; gro_recursion_inc_test() increments it and compares
         * it with GRO_RECURSION_LIMIT.
         */
        u8 recursion_counter:4;

        /* GRO is done by frag_list pointer chaining. */
        u8        is_flist:1;

        /* used to support CHECKSUM_COMPLETE for tunneling protocols */
        __wsum        csum;

        /* used in skb_gro_receive() slow path */
        struct sk_buff *last;
};

/* Access the struct napi_gro_cb stored in an skb's cb[] area (plain pointer cast). */
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)

#define GRO_RECURSION_LIMIT 15
static inline int gro_recursion_inc_test(struct sk_buff *skb)
{
        return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
}

typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
/*
 * Invoke the gro_receive callback @cb on (@head, @skb) and return its
 * result, unless gro_recursion_inc_test() reports that the recursion limit
 * was reached; in that case set bit 0 of the skb's GRO flush field and
 * return NULL without calling @cb.
 */
static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
                                               struct list_head *head,
                                               struct sk_buff *skb)
{
        if (likely(!gro_recursion_inc_test(skb)))
                return cb(head, skb);

        NAPI_GRO_CB(skb)->flush |= 1;
        return NULL;
}

typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
                                            struct sk_buff *);
/*
 * Socket-aware counterpart of call_gro_receive(): invoke @cb on
 * (@sk, @head, @skb) and return its result, unless gro_recursion_inc_test()
 * reports that the recursion limit was reached; in that case set bit 0 of
 * the skb's GRO flush field and return NULL without calling @cb.
 */
static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
                                                  struct sock *sk,
                                                  struct list_head *head,
                                                  struct sk_buff *skb)
{
        if (likely(!gro_recursion_inc_test(skb)))
                return cb(sk, head, skb);

        NAPI_GRO_CB(skb)->flush |= 1;
        return NULL;
}

/*
 * Description of one packet-type handler: the type it matches, an optional
 * device restriction, the receive callbacks and list linkage.
 * NOTE(review): presumably registered/unregistered through the
 * dev_add_pack()/dev_remove_pack() functions that take this type -- confirm.
 */
struct packet_type {
        __be16                        type;        /* This is really htons(ether_type). */
        bool                        ignore_outgoing;
        struct net_device        *dev;        /* NULL is wildcarded here             */
        /* Per-skb receive callback. */
        int                        (*func) (struct sk_buff *,
                                         struct net_device *,
                                         struct packet_type *,
                                         struct net_device *);
        /* Callback taking a struct list_head instead of a single skb. */
        void                        (*list_func) (struct list_head *,
                                              struct packet_type *,
                                              struct net_device *);
        bool                        (*id_match)(struct packet_type *ptype,
                                            struct sock *sk);
        struct net                *af_packet_net;
        void                        *af_packet_priv;
        struct list_head        list;
};

/* Set of GSO/GRO callbacks; embedded as the "callbacks" member of struct packet_offload. */
struct offload_callbacks {
        struct sk_buff                *(*gso_segment)(struct sk_buff *skb,
                                                netdev_features_t features);
        struct sk_buff                *(*gro_receive)(struct list_head *head,
                                                struct sk_buff *skb);
        int                        (*gro_complete)(struct sk_buff *skb, int nhoff);
};

/*
 * One offload handler: the packet type it applies to, a priority value,
 * its callbacks and list linkage.
 * NOTE(review): how "priority" orders handlers is not visible here -- confirm
 * against dev_add_offload().
 */
struct packet_offload {
        __be16                         type;        /* This is really htons(ether_type). */
        u16                         priority;
        struct offload_callbacks callbacks;
        struct list_head         list;
};

/* Often-modified stats are per-CPU, others are shared (netdev->stats).
 * Reached through the tstats member of struct net_device; updates are
 * bracketed by u64_stats_update_begin()/end() on syncp.  The structure is
 * aligned to 4 * sizeof(u64).
 */
struct pcpu_sw_netstats {
        u64     rx_packets;
        u64     rx_bytes;
        u64     tx_packets;
        u64     tx_bytes;
        struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/*
 * Per-CPU packet/byte counters reached through the lstats member of
 * struct net_device.  The counters are u64_stats_t and are updated with
 * u64_stats_add()/u64_stats_inc() under syncp (see dev_lstats_add()).
 * Aligned to 2 * sizeof(u64).
 */
struct pcpu_lstats {
        u64_stats_t packets;
        u64_stats_t bytes;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/*
 * Account one received packet of @len bytes in the current CPU's
 * dev->tstats, inside a u64_stats update section.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *cpu_stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&cpu_stats->syncp);
        cpu_stats->rx_packets++;
        cpu_stats->rx_bytes += len;
        u64_stats_update_end(&cpu_stats->syncp);
}

/*
 * Account one packet of @len bytes in the current CPU's dev->lstats,
 * inside a u64_stats update section.
 */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_lstats *cpu_stats = this_cpu_ptr(dev->lstats);

        u64_stats_update_begin(&cpu_stats->syncp);
        u64_stats_inc(&cpu_stats->packets);
        u64_stats_add(&cpu_stats->bytes, len);
        u64_stats_update_end(&cpu_stats->syncp);
}

/*
 * Allocate per-CPU storage for @type with alloc_percpu_gfp(@type, @gfp) and,
 * if the allocation succeeded, run u64_stats_init() on the ->syncp member of
 * every possible CPU's copy.  The statement expression evaluates to the
 * __percpu pointer, so a failed allocation is passed through unchanged.
 * @type must have a member named syncp.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
        if (pcpu_stats)        {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* Same as __netdev_alloc_pcpu_stats() with the gfp argument fixed to GFP_KERNEL. */
#define netdev_alloc_pcpu_stats(type)                                        \
        __netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/* Values for the tx_type member of struct netdev_lag_upper_info. */
enum netdev_lag_tx_type {
        NETDEV_LAG_TX_TYPE_UNKNOWN,
        NETDEV_LAG_TX_TYPE_RANDOM,
        NETDEV_LAG_TX_TYPE_BROADCAST,
        NETDEV_LAG_TX_TYPE_ROUNDROBIN,
        NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
        NETDEV_LAG_TX_TYPE_HASH,
};

/*
 * Values for the hash_type member of struct netdev_lag_upper_info.
 * NOTE(review): the L2/L34/L23/E23/E34 suffixes presumably name the header
 * layers fed into the hash -- confirm with the LAG drivers.
 */
enum netdev_lag_hash {
        NETDEV_LAG_HASH_NONE,
        NETDEV_LAG_HASH_L2,
        NETDEV_LAG_HASH_L34,
        NETDEV_LAG_HASH_L23,
        NETDEV_LAG_HASH_E23,
        NETDEV_LAG_HASH_E34,
        NETDEV_LAG_HASH_UNKNOWN,
};

/* Pairs a LAG tx type with a LAG hash type. */
struct netdev_lag_upper_info {
        enum netdev_lag_tx_type tx_type;
        enum netdev_lag_hash hash_type;
};

/* Two single-bit flags (link_up, tx_enabled) packed into one u8. */
struct netdev_lag_lower_state_info {
        u8 link_up : 1,
           tx_enabled : 1;
};

#include <linux/notifier.h>

/* netdevice notifier chain. Please remember to update netdev_cmd_to_name()
 * and the rtnetlink notification exclusion list in rtnetlink_event() when
 * adding new types.
 *
 * Numbering starts at 1 (NETDEV_UP); every later enumerator takes the next
 * consecutive value, so inserting one in the middle renumbers those after it.
 */
enum netdev_cmd {
        NETDEV_UP        = 1,        /* For now you can't veto a device up/down */
        NETDEV_DOWN,
        NETDEV_REBOOT,                /* Tell a protocol stack a network interface
                                   detected a hardware crash and restarted
                                   - we can use this eg to kick tcp sessions
                                   once done */
        NETDEV_CHANGE,                /* Notify device state change */
        NETDEV_REGISTER,
        NETDEV_UNREGISTER,
        NETDEV_CHANGEMTU,        /* notify after mtu change happened */
        NETDEV_CHANGEADDR,        /* notify after the address change */
        NETDEV_PRE_CHANGEADDR,        /* notify before the address change */
        NETDEV_GOING_DOWN,
        NETDEV_CHANGENAME,
        NETDEV_FEAT_CHANGE,
        NETDEV_BONDING_FAILOVER,
        NETDEV_PRE_UP,
        NETDEV_PRE_TYPE_CHANGE,
        NETDEV_POST_TYPE_CHANGE,
        NETDEV_POST_INIT,
        NETDEV_RELEASE,
        NETDEV_NOTIFY_PEERS,
        NETDEV_JOIN,
        NETDEV_CHANGEUPPER,
        NETDEV_RESEND_IGMP,
        NETDEV_PRECHANGEMTU,        /* notify before mtu change happened */
        NETDEV_CHANGEINFODATA,
        NETDEV_BONDING_INFO,
        NETDEV_PRECHANGEUPPER,
        NETDEV_CHANGELOWERSTATE,
        NETDEV_UDP_TUNNEL_PUSH_INFO,
        NETDEV_UDP_TUNNEL_DROP_INFO,
        NETDEV_CHANGE_TX_QUEUE_LEN,
        NETDEV_CVLAN_FILTER_PUSH_INFO,
        NETDEV_CVLAN_FILTER_DROP_INFO,
        NETDEV_SVLAN_FILTER_PUSH_INFO,
        NETDEV_SVLAN_FILTER_DROP_INFO,
};
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn);

/*
 * Base notifier payload: the device concerned plus an extack pointer.
 * The netdev_notifier_*_info structures embed it as their first member.
 */
struct netdev_notifier_info {
        struct net_device        *dev;
        struct netlink_ext_ack        *extack;
};

/* Base notifier info followed by a union of extra data (currently only an mtu value). */
struct netdev_notifier_info_ext {
        struct netdev_notifier_info info; /* must be first */
        union {
                u32 mtu;
        } ext;
};

/* Base notifier info plus a flags_changed word. */
struct netdev_notifier_change_info {
        struct netdev_notifier_info info; /* must be first */
        unsigned int flags_changed;
};

/* Base notifier info plus the details of an upper-device link or unlink. */
struct netdev_notifier_changeupper_info {
        struct netdev_notifier_info info; /* must be first */
        struct net_device *upper_dev; /* new upper dev */
        bool master; /* is upper dev master */
        bool linking; /* is the notification for link or unlink */
        void *upper_info; /* upper dev info */
};

/* Base notifier info plus an untyped pointer to the lower device's state. */
struct netdev_notifier_changelowerstate_info {
        struct netdev_notifier_info info; /* must be first */
        void *lower_state_info; /* is lower dev state */
};

/*
 * Base notifier info plus a read-only device address pointer.
 * NOTE(review): presumably the address about to be set, given the
 * "pre_changeaddr" name -- confirm at the call site.
 */
struct netdev_notifier_pre_changeaddr_info {
        struct netdev_notifier_info info; /* must be first */
        const unsigned char *dev_addr;
};

/* Initialise @info: record @dev as the device and clear the extack pointer. */
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
                                             struct net_device *dev)
{
        info->extack = NULL;
        info->dev = dev;
}

/* Return the device stored in a notifier info structure. */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
        struct net_device *target = info->dev;

        return target;
}

/* Return the extack pointer stored in a notifier info structure (may be NULL). */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
        struct netlink_ext_ack *ack = info->extack;

        return ack;
}

int call_netdevice_notifiers(unsigned long val, struct net_device *dev);


extern rwlock_t                                dev_base_lock;                /* Device list lock */

/*
 * Iterators over the devices of a namespace: each walks the list headed at
 * (net)->dev_base_head, linked through the dev_list member of
 * struct net_device, using the matching list_for_each_entry*() helper.
 * @d is the struct net_device cursor.
 */
#define for_each_netdev(net, d)                \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)        \
                list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)                \
                list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
/* @n is the extra cursor required by list_for_each_entry_safe(). */
#define for_each_netdev_safe(net, d, n)        \
                list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)                \
                list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)                \
                list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
                                                     dev_list)
#define for_each_netdev_continue_rcu(net, d)                \
        list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
/*
 * Walks the devices of &init_net only and runs the loop body for those whose
 * netdev_master_upper_dev_get_rcu() result equals @bond.  The expansion ends
 * in a bare "if", so an "else" written after the loop body binds to it.
 */
#define for_each_netdev_in_bond_rcu(bond, slave)        \
                for_each_netdev_rcu(&init_net, slave)        \
                        if (netdev_master_upper_dev_get_rcu(slave) == (bond))
/* Convert a pointer to a dev_list list_head into its enclosing struct net_device. */
#define net_device_entry(lh)        list_entry(lh, struct net_device, dev_list)

/*
 * Return the device linked after @dev on its namespace's device list, or
 * NULL when @dev is the last entry (the next link is the list head).
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = dev->dev_list.next;

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/*
 * Same walk as next_net_device(), but the next link is fetched with
 * rcu_dereference(list_next_rcu()).  Returns NULL at the end of the list.
 */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = rcu_dereference(list_next_rcu(&dev->dev_list));

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/* Return the first device on @net's device list, or NULL if the list is empty. */
static inline struct net_device *first_net_device(struct net *net)
{
        if (list_empty(&net->dev_base_head))
                return NULL;

        return net_device_entry(net->dev_base_head.next);
}

/*
 * Return the first device on @net's device list, reading the head's next
 * link with rcu_dereference(); NULL when that link points back at the head.
 */
static inline struct net_device *first_net_device_rcu(struct net *net)
{
        struct list_head *head = &net->dev_base_head;
        struct list_head *first = rcu_dereference(list_next_rcu(head));

        if (first == head)
                return NULL;

        return net_device_entry(first);
}

/*
 * Declarations only: boot-time setup, device lookup (by hardware address,
 * type, flags or name), packet-type/offload registration, open/close and
 * queue-selection entry points.  The definitions are not in this header.
 * NOTE(review): locking and reference-counting rules of the lookup
 * variants are not visible here - consult the definitions.
 */
int netdev_boot_setup_check(struct net_device *dev);
unsigned long netdev_boot_base(const char *prefix, int unit);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
                       struct net_device *sb_dev);

/* Transmit entry points; dev_direct_xmit() below wraps __dev_direct_xmit(). */
int dev_queue_xmit(struct sk_buff *skb);
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/*
 * Hand @skb to __dev_direct_xmit() on queue @queue_id.  If the returned
 * code is not "complete" according to dev_xmit_complete(), the skb is freed
 * here.  The code from __dev_direct_xmit() is returned unchanged.
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        int rc = __dev_direct_xmit(skb, queue_id);

        if (dev_xmit_complete(rc))
                return rc;

        kfree_skb(skb);
        return rc;
}

/*
 * Device registration / unregistration entry points (declarations only;
 * the definitions are not in this header).
 */
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/* Convenience wrapper: unregister_netdevice_queue() with a NULL list head. */
static inline void unregister_netdevice(struct net_device *dev)
{
        unregister_netdevice_queue(dev, NULL);
}

/*
 * Declarations only: refcount/lifetime helpers, lookup by ifindex or NAPI
 * id, and the GRO receive entry points.  The definitions are not in this
 * header.
 * NOTE(review): locking/refcount requirements of the lookup variants are
 * not visible here - consult the definitions.
 */
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
void netdev_freemem(struct net_device *dev);
int init_dummy_netdev(struct net_device *dev);

struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
int netdev_get_name(struct net *net, char *name, int ifindex);
int dev_restart(struct net_device *dev);
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);

/* Return the current GRO data offset recorded in @skb's GRO control block. */
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
{
        return NAPI_GRO_CB(skb)->data_offset;
}

/* Return skb->len minus the current GRO data offset. */
static inline unsigned int skb_gro_len(const struct sk_buff *skb)
{
        unsigned int consumed = skb_gro_offset(skb);

        return skb->len - consumed;
}

/*
 * Advance the GRO data offset by @len.  Only the offset in the GRO control
 * block changes; skb->data and skb->len are not touched here.
 */
static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
{
        NAPI_GRO_CB(skb)->data_offset += len;
}

/*
 * Return the GRO control block's frag0 pointer advanced by @offset.
 * No NULL or length check is done here; callers in this file pair it with
 * skb_gro_header_hard().
 */
static inline void *skb_gro_header_fast(struct sk_buff *skb,
                                        unsigned int offset)
{
        return NAPI_GRO_CB(skb)->frag0 + offset;
}

/*
 * Return non-zero when frag0_len is smaller than @hlen.  Callers in this
 * file then fall back to skb_gro_header_slow().
 */
static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
{
        return NAPI_GRO_CB(skb)->frag0_len < hlen;
}

/* Drop the frag0 shortcut: zero its length and clear the pointer. */
static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
{
        NAPI_GRO_CB(skb)->frag0_len = 0;
        NAPI_GRO_CB(skb)->frag0 = NULL;
}

/*
 * Slow-path header access: requires pskb_may_pull(@skb, @hlen) to succeed,
 * then invalidates frag0 and returns skb->data + @offset.
 * Returns NULL when pskb_may_pull() fails; frag0 is left untouched then.
 */
static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
                                        unsigned int offset)
{
        if (pskb_may_pull(skb, hlen)) {
                skb_gro_frag0_invalidate(skb);
                return skb->data + offset;
        }

        return NULL;
}

/*
 * Return a pointer to the network header for GRO: the base is frag0 when
 * it is non-NULL, otherwise skb->data, and skb_network_offset(@skb) is
 * added to that base.
 */
static inline void *skb_gro_network_header(struct sk_buff *skb)
{
        return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) +
               skb_network_offset(skb);
}

/*
 * When the GRO checksum is marked valid, subtract the checksum of the
 * @len bytes at @start from it.  Does nothing otherwise.
 */
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
                                        const void *start, unsigned int len)
{
        if (!NAPI_GRO_CB(skb)->csum_valid)
                return;

        NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
                                          csum_partial(start, len, 0));
}

/* GRO checksum functions. These are logical equivalents of the normal
 * checksum functions (in skbuff.h) except that they operate on the GRO
 * offsets and fields in sk_buff.
 */

/*
 * Declaration only.  Called by __skb_gro_checksum_validate_complete()
 * below after that helper has stored the pseudo-header sum in
 * NAPI_GRO_CB(skb)->csum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);

/* True when the recorded gro_remcsum_start equals the current GRO offset. */
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
{
        return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
}

/*
 * Decide whether the checksum still has to be validated during GRO.
 * Validation is skipped (false) when any of the following holds:
 *  - ip_summed is CHECKSUM_PARTIAL and the checksum start offset is not
 *    before the current GRO offset;
 *  - the GRO offset sits at gro_remcsum_start;
 *  - csum_cnt is non-zero;
 *  - @zero_okay is set and @check is zero.
 */
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
                                                      bool zero_okay,
                                                      __sum16 check)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !(skb_checksum_start_offset(skb) < skb_gro_offset(skb)))
                return false;

        if (skb_at_gro_remcsum_start(skb))
                return false;

        if (NAPI_GRO_CB(skb)->csum_cnt != 0)
                return false;

        return (!zero_okay || check);
}

/*
 * Validate against pseudo-header sum @psum.  If the GRO checksum is valid
 * and folds to zero together with @psum, return 0 right away.  Otherwise
 * store @psum as the GRO csum and return the result of
 * __skb_gro_checksum_complete().
 */
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
                                                           __wsum psum)
{
        if (NAPI_GRO_CB(skb)->csum_valid) {
                if (!csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum)))
                        return 0;
        }

        NAPI_GRO_CB(skb)->csum = psum;
        return __skb_gro_checksum_complete(skb);
}

/*
 * Account for one verified checksum: consume a pending csum_cnt credit if
 * there is one, otherwise bump the skb's CHECKSUM_UNNECESSARY state.
 */
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
{
        if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
                /* Consume a checksum from CHECKSUM_UNNECESSARY */
                NAPI_GRO_CB(skb)->csum_cnt--;
                return;
        }

        /* No credit left: record on the skb itself that a new top level or
         * encapsulated checksum was verified during GRO, which saves work
         * if we fall back to the normal path.
         */
        __skb_incr_checksum_unnecessary(skb);
}

/*
 * Statement expression yielding a __sum16.  When
 * __skb_gro_checksum_validate_needed() says so, the result is
 * __skb_gro_checksum_validate_complete(skb, compute_pseudo(skb, proto));
 * otherwise it is 0.  A zero result also calls
 * skb_gro_incr_csum_unnecessary(skb).  Note that @skb is expanded more
 * than once.
 */
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check,        \
                                    compute_pseudo)                        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        if (__skb_gro_checksum_validate_needed(skb, zero_okay, check))        \
                __ret = __skb_gro_checksum_validate_complete(skb,        \
                                compute_pseudo(skb, proto));                \
        if (!__ret)                                                        \
                skb_gro_incr_csum_unnecessary(skb);                        \
        __ret;                                                                \
})

/* Validate with zero_okay == false and check == 0. */
#define skb_gro_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)

/* Validate with zero_okay == true: a zero @check skips validation. */
#define skb_gro_checksum_validate_zero_check(skb, proto, check,                \
                                             compute_pseudo)                \
        __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)

/* Validate with proto 0 and null_compute_pseudo as the pseudo-header helper. */
#define skb_gro_checksum_simple_validate(skb)                                \
        __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)

static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
{
        return (NAPI_GRO_CB(skb)->csum_cnt == 0 &&
                !NAPI_GRO_CB(skb)->csum_valid);
}

/* Mark the GRO checksum valid and set it to the complement of @pseudo. */
static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
                                              __wsum pseudo)
{
        NAPI_GRO_CB(skb)->csum_valid = 1;
        NAPI_GRO_CB(skb)->csum = ~pseudo;
}

/*
 * If __skb_gro_checksum_convert_check(skb) holds, convert the GRO checksum
 * using compute_pseudo(skb, proto); otherwise do nothing.  compute_pseudo()
 * is only evaluated when the check passes.
 */
#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_gro_checksum_convert_check(skb))                        \
                __skb_gro_checksum_convert(skb,                         \
                                           compute_pseudo(skb, proto));        \
} while (0)

/*
 * State recorded by skb_gro_remcsum_process() so that
 * skb_gro_remcsum_cleanup() can undo the adjustment later.
 */
struct gro_remcsum {
        int offset;     /* set to off + hdrlen + offset by the process helper */
        __wsum delta;   /* adjustment from remcsum_adjust(); 0 = nothing to undo */
};

/* Reset @grc to the "nothing recorded" state: zero delta and zero offset. */
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
{
        grc->delta = 0;
        grc->offset = 0;
}

/*
 * Apply remote checksum offload handling during GRO.
 *
 * BUGs if the GRO checksum is not marked valid.
 *
 * With @nopartial false, only gro_remcsum_start is recorded
 * (off + hdrlen + start) and @ptr is returned unchanged.
 *
 * With @nopartial true, the header is re-fetched at @off (fast path first,
 * slow path if off + plen is not covered), remcsum_adjust() is run on the
 * data after the @hdrlen bytes, the returned delta is folded into the GRO
 * csum, and @grc records the offset and delta for
 * skb_gro_remcsum_cleanup().
 *
 * Returns the (possibly re-fetched) header pointer, or NULL when the slow
 * path could not make the header available.
 */
static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
                                            unsigned int off, size_t hdrlen,
                                            int start, int offset,
                                            struct gro_remcsum *grc,
                                            bool nopartial)
{
        __wsum delta;
        /* header plus whichever reaches further: csum field end or start */
        size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);

        BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);

        if (!nopartial) {
                NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
                return ptr;
        }

        ptr = skb_gro_header_fast(skb, off);
        if (skb_gro_header_hard(skb, off + plen)) {
                ptr = skb_gro_header_slow(skb, off + plen, off);
                if (!ptr)
                        return NULL;
        }

        delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
                               start, offset);

        /* Adjust the GRO csum (NAPI_GRO_CB(skb)->csum) since we changed the packet */
        NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);

        grc->offset = off + hdrlen + offset;
        grc->delta = delta;

        return ptr;
}

/*
 * Undo the adjustment recorded in @grc: locate the 16-bit field at
 * grc->offset (fast path first, slow path when not covered) and call
 * remcsum_unadjust() on it with grc->delta.  Does nothing when no delta was
 * recorded or when the slow path fails.
 */
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
                                           struct gro_remcsum *grc)
{
        size_t end = grc->offset + sizeof(u16);
        void *field;

        if (!grc->delta)
                return;

        field = skb_gro_header_fast(skb, grc->offset);
        if (skb_gro_header_hard(skb, end)) {
                field = skb_gro_header_slow(skb, end, grc->offset);
                if (!field)
                        return;
        }

        remcsum_unadjust((__sum16 *)field, grc->delta);
}

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * CONFIG_XFRM_OFFLOAD variant: OR @flush into the GRO flush flag unless
 * @pp encodes -EINPROGRESS.
 */
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
        if (PTR_ERR(pp) == -EINPROGRESS)
                return;

        NAPI_GRO_CB(skb)->flush |= flush;
}
/*
 * CONFIG_XFRM_OFFLOAD variant: unless @pp encodes -EINPROGRESS, OR @flush
 * into the GRO flush flag, undo the remcsum adjustment recorded in @grc and
 * clear skb->remcsum_offload.
 */
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
                                               struct sk_buff *pp,
                                               int flush,
                                               struct gro_remcsum *grc)
{
        if (PTR_ERR(pp) == -EINPROGRESS)
                return;

        NAPI_GRO_CB(skb)->flush |= flush;
        skb_gro_remcsum_cleanup(skb, grc);
        skb->remcsum_offload = 0;
}
#else
/*
 * !CONFIG_XFRM_OFFLOAD variant: @pp is ignored and @flush is always ORed
 * into the GRO flush flag.
 */
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
        NAPI_GRO_CB(skb)->flush |= flush;
}
/*
 * !CONFIG_XFRM_OFFLOAD variant: @pp is ignored.  Always ORs @flush into the
 * GRO flush flag, undoes the remcsum adjustment recorded in @grc and clears
 * skb->remcsum_offload.
 */
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
                                               struct sk_buff *pp,
                                               int flush,
                                               struct gro_remcsum *grc)
{
        NAPI_GRO_CB(skb)->flush |= flush;
        skb_gro_remcsum_cleanup(skb, grc);
        skb->remcsum_offload = 0;
}
#endif

/*
 * Build the link-layer header through the device's header_ops->create
 * callback and return its result.  Returns 0 when the device has no
 * header_ops or no create callback.
 */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                                  unsigned short type,
                                  const void *daddr, const void *saddr,
                                  unsigned int len)
{
        if (dev->header_ops && dev->header_ops->create)
                return dev->header_ops->create(skb, dev, type, daddr,
                                               saddr, len);

        return 0;
}

/*
 * Run skb->dev's header_ops->parse callback on @skb with @haddr and return
 * its result.  Returns 0 when the device has no header_ops or no parse
 * callback.
 */
static inline int dev_parse_header(const struct sk_buff *skb,
                                   unsigned char *haddr)
{
        const struct net_device *dev = skb->dev;

        if (dev->header_ops && dev->header_ops->parse)
                return dev->header_ops->parse(skb, haddr);

        return 0;
}

/*
 * Run skb->dev's header_ops->parse_protocol callback on @skb and return
 * its result.  Returns 0 when the device has no header_ops or no
 * parse_protocol callback.
 */
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
        const struct net_device *dev = skb->dev;

        if (dev->header_ops && dev->header_ops->parse_protocol)
                return dev->header_ops->parse_protocol(skb);

        return 0;
}

/*
 * Check a caller-supplied link-layer header of @len bytes.
 *
 * ll_header must have at least hard_header_len allocated.
 *
 * Returns true when @len covers hard_header_len, false when it is below
 * min_header_len.  For lengths in between: a CAP_SYS_RAWIO-capable caller
 * gets the missing tail zero-filled and accepted; otherwise the decision is
 * delegated to header_ops->validate when present, else false.
 */
static inline bool dev_validate_header(const struct net_device *dev,
                                       char *ll_header, int len)
{
        if (likely(len >= dev->hard_header_len))
                return true;
        if (len < dev->min_header_len)
                return false;

        if (capable(CAP_SYS_RAWIO)) {
                /* zero the bytes between @len and hard_header_len */
                memset(ll_header + len, 0, dev->hard_header_len - len);
                return true;
        }

        if (dev->header_ops && dev->header_ops->validate)
                return dev->header_ops->validate(ll_header, len);

        return false;
}

static inline bool dev_has_header(const struct net_device *dev)
{
        return dev->header_ops && dev->header_ops->create;
}

#ifdef CONFIG_NET_FLOW_LIMIT
/* Number of history[] entries below: 1 << 7 = 128. */
#define FLOW_LIMIT_HISTORY        (1 << 7)  /* must be ^2 and !overflow buckets */
/*
 * Flow-limit state, only built with CONFIG_NET_FLOW_LIMIT.  struct
 * softnet_data holds an __rcu pointer to one of these.
 * NOTE(review): field roles below are inferred from their names - confirm
 * against the code that uses them.
 */
struct sd_flow_limit {
        u64                        count;
        unsigned int                num_buckets;        /* presumably the length of buckets[] */
        unsigned int                history_head;        /* presumably an index into history[] */
        u16                        history[FLOW_LIMIT_HISTORY];
        u8                        buckets[];        /* flexible array, sized at allocation */
};

extern int netdev_flow_limit_table_len;
#endif /* CONFIG_NET_FLOW_LIMIT */

/*
 * Incoming packets are placed on per-CPU queues
 *
 * One instance per CPU is declared with DECLARE_PER_CPU_ALIGNED() after
 * this struct.  Several members exist only under CONFIG_RPS,
 * CONFIG_NET_FLOW_LIMIT or CONFIG_XFRM_OFFLOAD.
 */
struct softnet_data {
        struct list_head        poll_list;
        struct sk_buff_head        process_queue;

        /* stats */
        unsigned int                processed;
        unsigned int                time_squeeze;
        unsigned int                received_rps;
#ifdef CONFIG_RPS
        struct softnet_data        *rps_ipi_list;
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit __rcu *flow_limit;
#endif
        struct Qdisc                *output_queue;
        struct Qdisc                **output_queue_tailp;
        struct sk_buff                *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
        struct sk_buff_head        xfrm_backlog;
#endif
        /* written and read only by owning cpu: */
        struct {
                u16 recursion;        /* see dev_xmit_recursion*() helpers */
                u8  more;
        } xmit;
#ifdef CONFIG_RPS
        /* input_queue_head should be written by cpu owning this struct,
         * and only read by other cpus. Worth using a cache line.
         */
        unsigned int                input_queue_head ____cacheline_aligned_in_smp;

        /* Elements below can be accessed between CPUs for RPS/RFS */
        call_single_data_t        csd ____cacheline_aligned_in_smp;
        struct softnet_data        *rps_ipi_next;
        unsigned int                cpu;
        unsigned int                input_queue_tail;
#endif
        unsigned int                dropped;
        struct sk_buff_head        input_pkt_queue;
        struct napi_struct        backlog;

};

/* Increment sd->input_queue_head; compiles to nothing without CONFIG_RPS. */
static inline void input_queue_head_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        sd->input_queue_head++;
#endif
}

/*
 * With CONFIG_RPS: pre-increment sd->input_queue_tail and store the new
 * value in *@qtail.  Without CONFIG_RPS this is a no-op and *@qtail is
 * left untouched.
 */
static inline void input_queue_tail_incr_save(struct softnet_data *sd,
                                              unsigned int *qtail)
{
#ifdef CONFIG_RPS
        *qtail = ++sd->input_queue_tail;
#endif
}

/* Per-CPU instance of struct softnet_data, named softnet_data. */
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/* Return this CPU's softnet_data.xmit.recursion counter. */
static inline int dev_recursion_level(void)
{
        return this_cpu_read(softnet_data.xmit.recursion);
}

/* Upper bound on this CPU's xmit recursion counter before it is flagged. */
#define XMIT_RECURSION_LIMIT        8
/*
 * True once this CPU's xmit recursion counter is strictly greater than
 * XMIT_RECURSION_LIMIT.
 */
static inline bool dev_xmit_recursion(void)
{
        return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
                        XMIT_RECURSION_LIMIT);
}

/* Increment this CPU's xmit recursion counter. */
static inline void dev_xmit_recursion_inc(void)
{
        __this_cpu_inc(softnet_data.xmit.recursion);
}

/* Decrement this CPU's xmit recursion counter. */
static inline void dev_xmit_recursion_dec(void)
{
        __this_cpu_dec(softnet_data.xmit.recursion);
}

/*
 * Declarations only; the definitions are not in this header.
 * netif_schedule_queue() is used below by netif_tx_schedule_all() and
 * netdev_tx_completed_queue().
 */
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/* Call netif_schedule_queue() on each of the device's num_tx_queues queues. */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
        unsigned int qi = 0;

        while (qi < dev->num_tx_queues) {
                netif_schedule_queue(netdev_get_tx_queue(dev, qi));
                qi++;
        }
}

/* Clear the __QUEUE_STATE_DRV_XOFF bit in @dev_queue's state. */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
        clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_start_queue - allow transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Operates on TX queue 0 of @dev only.
 */
static inline void netif_start_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_start_queue(txq);
}

/* Start every one of the device's num_tx_queues TX queues. */
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++)
                netif_tx_start_queue(netdev_get_tx_queue(dev, idx));
}

/* Declaration only; used by the netif_wake_*() inline helpers below. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue);

/**
 *        netif_wake_queue - restart transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are available.
 *        Operates on TX queue 0 of @dev only.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_wake_queue(txq);
}

/* Wake every one of the device's num_tx_queues TX queues. */
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++)
                netif_tx_wake_queue(netdev_get_tx_queue(dev, idx));
}

/* Set the __QUEUE_STATE_DRV_XOFF bit in @dev_queue's state. */
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
        set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_stop_queue - stop transmitted packets
 *        @dev: network device
 *
 *        Stop upper layers calling the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are unavailable.
 *        Operates on TX queue 0 of @dev only.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_stop_queue(txq);
}

void netif_tx_stop_all_queues(struct net_device *dev);

/* True when the __QUEUE_STATE_DRV_XOFF bit is set in @dev_queue's state. */
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
        return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_queue_stopped - test if transmit queue is flowblocked
 *        @dev: network device
 *
 *        Test if transmit queue on device is currently unable to send.
 *        Only TX queue 0 of @dev is examined.
 */
static inline bool netif_queue_stopped(const struct net_device *dev)
{
        const struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return netif_tx_queue_stopped(txq);
}

/* True when any QUEUE_STATE_ANY_XOFF bit is set in @dev_queue's state. */
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_ANY_XOFF;
}

/* True when any QUEUE_STATE_ANY_XOFF_OR_FROZEN bit is set in the queue state. */
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN;
}

/* True when any QUEUE_STATE_DRV_XOFF_OR_FROZEN bit is set in the queue state. */
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
}

/**
 *        netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their ndo_start_xmit(),
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.num_queued for write; compiles to nothing without
 * CONFIG_BQL.
 */
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.num_queued);
#endif
}

/**
 *        netdev_txq_bql_complete_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their TX completion path,
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.limit for write; compiles to nothing without CONFIG_BQL.
 */
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.limit);
#endif
}

/*
 * Account @bytes as queued on @dev_queue's dql and, if the dql has no room
 * left, set __QUEUE_STATE_STACK_XOFF.  After setting the bit the available
 * room is checked again and the bit is cleared if room has appeared in the
 * meantime.  Compiles to nothing without CONFIG_BQL.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                        unsigned int bytes)
{
#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);

        /* common case: still room available, nothing more to do */
        if (likely(dql_avail(&dev_queue->dql) >= 0))
                return;

        set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

        /*
         * The XOFF flag must be set before checking the dql_avail below,
         * because in netdev_tx_completed_queue we update the dql_completed
         * before checking the XOFF flag.
         */
        smp_mb();

        /* check again in case another CPU has just made room avail */
        if (unlikely(dql_avail(&dev_queue->dql) >= 0))
                clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/* Variant of netdev_tx_sent_queue() for drivers that are aware
 * that they should not test BQL status themselves.
 * __QUEUE_STATE_STACK_XOFF is only evaluated for the last skb of a batch
 * (@xmit_more false); earlier skbs are merely accounted in the dql.
 * Returns true if the doorbell must be used to kick the NIC: always for
 * the last skb, and for earlier ones when the driver has stopped the queue.
 */
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                          unsigned int bytes,
                                          bool xmit_more)
{
        if (!xmit_more) {
                netdev_tx_sent_queue(dev_queue, bytes);
                return true;
        }

#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);
#endif
        return netif_tx_queue_stopped(dev_queue);
}

/**
 *         netdev_sent_queue - report the number of bytes queued to hardware
 *         @dev: network device
 *         @bytes: number of bytes queued to the hardware device queue
 *
 *         Report the number of bytes queued for sending/completion to the network
 *         device hardware queue. @bytes should be a good approximation and should
 *         exactly match netdev_completed_queue() @bytes.
 *         Accounting is done on TX queue 0 of @dev.
 */
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_sent_queue(txq, bytes);
}

/*
 * Single-queue front end for __netdev_tx_sent_queue(): forwards @bytes and
 * @xmit_more for TX queue 0 of @dev and returns that helper's result.
 */
static inline bool __netdev_sent_queue(struct net_device *dev,
                                       unsigned int bytes,
                                       bool xmit_more)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return __netdev_tx_sent_queue(txq, bytes, xmit_more);
}

/*
 * Account @bytes as completed on @dev_queue's dql and, when room is
 * available again and __QUEUE_STATE_STACK_XOFF was set, clear that bit and
 * reschedule the queue.  @pkts is not used in this body.  Compiles to
 * nothing without CONFIG_BQL.
 */
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
                                             unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
        /* nothing completed, nothing to account */
        if (unlikely(!bytes))
                return;

        dql_completed(&dev_queue->dql, bytes);

        /*
         * Without the memory barrier there is a small possibility that
         * netdev_tx_sent_queue will miss the update and cause the queue to
         * be stopped forever
         */
        smp_mb();

        if (unlikely(dql_avail(&dev_queue->dql) < 0))
                return;

        if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
                netif_schedule_queue(dev_queue);
#endif
}

/**
 *         netdev_completed_queue - report bytes and packets completed by device
 *         @dev: network device
 *         @pkts: actual number of packets sent over the medium
 *         @bytes: actual number of bytes sent over the medium
 *
 *         Report the number of bytes and packets transmitted by the network device
 *         hardware queue over the physical medium, @bytes must exactly match the
 *         @bytes amount passed to netdev_sent_queue().
 *         Accounting is done on TX queue 0 of @dev.
 */
static inline void netdev_completed_queue(struct net_device *dev,
                                          unsigned int pkts, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_completed_queue(txq, pkts, bytes);
}

/*
 * Clear __QUEUE_STATE_STACK_XOFF on @q and reset its dql.  Compiles to
 * nothing without CONFIG_BQL.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
        clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
        dql_reset(&q->dql);
#endif
}

/**
 *         netdev_reset_queue - reset the packets and bytes count of a network device
 *         @dev_queue: network device (a struct net_device, despite the name)
 *
 *         Reset the bytes and packet count of a network device and clear the
 *         software flow control OFF bit for this network device.
 *         Only TX queue 0 of the device is reset.
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev_queue, 0);

        netdev_tx_reset_queue(txq);
}

/**
 *         netdev_cap_txqueue - check if selected tx queue exceeds device queues
 *         @dev: network device
 *         @queue_index: given tx queue index
 *
 *         Returns 0 if given tx queue index >= number of device tx queues
 *         (after emitting a rate-limited warning), otherwise returns the
 *         originally passed tx queue index.
 */
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
        if (likely(queue_index < dev->real_num_tx_queues))
                return queue_index;

        net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
                             dev->name, queue_index,
                             dev->real_num_tx_queues);
        return 0;
}

/**
 *        netif_running - test if up
 *        @dev: network device
 *
 *        Test if the device has been brought up.
 *
 *        Return: true when the __LINK_STATE_START bit is set in @dev's state.
 */
static inline bool netif_running(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_START, &dev->state);
}

/*
 * Routines to manage the subqueues on a device.  We only need start,
 * stop, and a check if it's stopped.  All other device management is
 * done at the overall netdevice level.
 * Also test the device if we're multiqueue.
 */

/**
 *        netif_start_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Start individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_start_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_stop_subqueue - stop sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Stop individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        __netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 *
 * Return: true when the driver has stopped the selected queue.
 */
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
                                            u16 queue_index)
{
        return netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue_index));
}

static inline bool netif_subqueue_stopped(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}

/**
 *        netif_wake_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_wake_queue(netdev_get_tx_queue(dev, queue_index));
}

#ifdef CONFIG_XPS
/*
 * CONFIG_XPS declarations; the definitions are not in this header.  The
 * !CONFIG_XPS branch further down provides inline stubs that return 0.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, bool is_rxqs_map);

/**
 *        netif_attr_test_mask - Test a CPU or Rx queue set in a mask
 *        @j: CPU/Rx queue index
 *        @mask: bitmask of all cpus/rx queues
 *        @nr_bits: number of bits in the bitmask
 *
 * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
 * @nr_bits is only passed to cpu_max_bits_warn(); it does not bound the
 * test_bit() lookup. @mask is dereferenced unconditionally.
 *
 * Return: true when bit @j is set in @mask.
 */
static inline bool netif_attr_test_mask(unsigned long j,
                                        const unsigned long *mask,
                                        unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);
        return test_bit(j, mask);
}

/**
 *        netif_attr_test_online - Test for online CPU/Rx queue
 *        @j: CPU/Rx queue index
 *        @online_mask: bitmask for CPUs/Rx queues that are online
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns true if a CPU/Rx queue is online. Without an @online_mask every
 * index below @nr_bits counts as online.
 */
static inline bool netif_attr_test_online(unsigned long j,
                                          const unsigned long *online_mask,
                                          unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);

        if (!online_mask)
                return (j < nr_bits);

        return test_bit(j, online_mask);
}

/**
 *        netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
 *        @n: CPU/Rx queue index
 *        @srcp: the cpumask/Rx queue mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns >= nr_bits if no further CPUs/Rx queues set. Without a mask
 * (@srcp NULL) the result is simply @n + 1.
 */
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
                                               unsigned int nr_bits)
{
        /* -1 is a legal arg here, so it is exempt from the range warning. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!srcp)
                return n + 1;

        return find_next_bit(srcp, nr_bits, n + 1);
}

/**
 *        netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p
 *        @n: CPU/Rx queue index
 *        @src1p: the first CPUs/Rx queues mask pointer
 *        @src2p: the second CPUs/Rx queues mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns >= nr_bits if no further CPUs/Rx queues set in both.
 * A NULL mask is skipped: with one mask only that mask is searched, and
 * with no mask at all the result is simply @n + 1.
 */
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
                                          const unsigned long *src2p,
                                          unsigned int nr_bits)
{
        /* -1 is a legal arg here, so it is exempt from the range warning. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!src1p && !src2p)
                return n + 1;

        if (!src2p)
                return find_next_bit(src1p, nr_bits, n + 1);

        if (!src1p)
                return find_next_bit(src2p, nr_bits, n + 1);

        return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
}
#else
/*
 * Fallback stubs for the #else branch of the enclosing preprocessor
 * conditional (its opening #if lies outside this excerpt): both ignore
 * their arguments and report success by returning 0.
 */
static inline int netif_set_xps_queue(struct net_device *dev,
                                      const struct cpumask *mask,
                                      u16 index)
{
        return 0;
}

/* Stub counterpart taking a raw bitmask; likewise a no-op returning 0. */
static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
                                        u16 index, bool is_rxqs_map)
{
        return 0;
}
#endif

/**
 *        netif_is_multiqueue - test if device has multiple transmit queues
 *        @dev: network device
 *
 * Returns true when @dev->num_tx_queues is greater than one.
 */
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
        if (dev->num_tx_queues > 1)
                return true;

        return false;
}

/* Defined outside this excerpt. */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);

#ifdef CONFIG_SYSFS
/* With CONFIG_SYSFS the out-of-line implementation is used. */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
#else
/*
 * Without CONFIG_SYSFS the requested count is stored directly in
 * dev->real_num_rx_queues and 0 is returned; no validation happens here.
 */
static inline int netif_set_real_num_rx_queues(struct net_device *dev,
                                                unsigned int rxqs)
{
        dev->real_num_rx_queues = rxqs;
        return 0;
}
#endif

/* Return a pointer to entry @rxq of @dev->_rx; @rxq is not range-checked. */
static inline struct netdev_rx_queue *
__netif_get_rx_queue(struct net_device *dev, unsigned int rxq)
{
        return &dev->_rx[rxq];
}

#ifdef CONFIG_SYSFS
/*
 * Return the position of @queue within its owning device's _rx array.
 * BUG()s if the computed position is not below dev->num_rx_queues.
 */
static inline unsigned int get_netdev_rx_queue_index(
                struct netdev_rx_queue *queue)
{
        struct net_device *owner = queue->dev;
        int pos;

        /* Element distance between @queue and the start of owner->_rx. */
        pos = queue - owner->_rx;
        BUG_ON(pos >= owner->num_rx_queues);

        return pos;
}
#endif

/* NOTE(review): presumably the default cap on RSS queues — confirm at users. */
#define DEFAULT_MAX_NUM_RSS_QUEUES        (8)
int netif_get_num_default_rss_queues(void);

/*
 * Reason code passed to __dev_kfree_skb_irq()/__dev_kfree_skb_any() to say
 * whether the skb being released was consumed or dropped.
 */
enum skb_free_reason {
        SKB_REASON_CONSUMED,        /* passed by the dev_consume_skb_*() helpers */
        SKB_REASON_DROPPED,        /* passed by the dev_kfree_skb_*() helpers */
};

/* Out-of-line workers behind the dev_*_skb_irq()/dev_*_skb_any() inlines. */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);

/*
 * It is not allowed to call kfree_skb() or consume_skb() from hardware
 * interrupt context or with hardware interrupts being disabled.
 * (in_irq() || irqs_disabled())
 *
 * We provide four helpers that can be used in following contexts :
 *
 * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
 *  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
        __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
}

static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
        __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
}

static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
        __dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
}

static inline void dev_consume_skb_any(struct sk_buff *skb)
{
        __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}

/*
 * Prototypes only; the definitions are not part of this excerpt.
 * Grouped by name: generic XDP, netif_rx*/netif_receive_skb* entry points,
 * and the napi_gro_* / gro_find_* helpers.
 */
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_rx_any_context(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);

/* Free the skb currently held in @napi->skb (if any) and clear the slot. */
static inline void napi_free_frags(struct napi_struct *napi)
{
        struct sk_buff *pending = napi->skb;

        kfree_skb(pending);
        napi->skb = NULL;
}

/* rx_handler management; definitions are not part of this excerpt. */
bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

/* NOTE(review): presumably validates an interface name — confirm at definition. */
bool dev_valid_name(const char *name);
/* True when the ioctl type field of @cmd equals SOCK_IOC_TYPE. */
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
        if (_IOC_TYPE(cmd) != SOCK_IOC_TYPE)
                return false;

        return true;
}
/*
 * Device control prototypes; all definitions are outside this excerpt.
 * The int-returning ones presumably follow the 0 / negative-errno
 * convention — TODO confirm per function.
 */

/* ioctl-style entry points */
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
                bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf *, int);
int dev_ethtool(struct net *net, struct ifreq *);

/* interface flags */
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack);
void __dev_notify_flags(struct net_device *, unsigned int old_flags,
                        unsigned int gchanges);

/* name, alias, namespace */
int dev_change_name(struct net_device *, const char *);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int dev_change_net_namespace(struct net_device *, struct net *, const char *);

/* MTU, queue length, group */
int __dev_set_mtu(struct net_device *, int);
int dev_validate_mtu(struct net_device *dev, int mtu,
                     struct netlink_ext_ack *extack);
int dev_set_mtu_ext(struct net_device *dev, int mtu,
                    struct netlink_ext_ack *extack);
int dev_set_mtu(struct net_device *, int);
int dev_change_tx_queue_len(struct net_device *, unsigned long);
void dev_set_group(struct net_device *, int);

/* MAC address and carrier */
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_change_carrier(struct net_device *, bool new_carrier);

/* physical port identification */
int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len);
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);

/* proto_down state */
int dev_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down_generic(struct net_device *dev, bool proto_down);
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
                                  u32 value);

/* transmit helpers */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret);

/* Callback type taking a device and a struct netdev_bpf request. */
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
/* XDP attach/query prototypes; definitions are outside this excerpt. */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags);
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);

int xdp_umem_query(struct net_device *dev, u16 queue_id);

/* skb forwarding; see also the ____dev_forward_skb() inline in this file. */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);

/*
 * Prepare @skb for forwarding to @dev.
 *
 * If skb_orphan_frags() fails or is_skb_forwardable() rejects the packet,
 * @dev->rx_dropped is incremented, the skb is freed and NET_RX_DROP is
 * returned.  Otherwise the skb is scrubbed (fully when the source and
 * destination devices are in different network namespaces), its priority
 * is reset to 0, and 0 is returned.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
                                               struct sk_buff *skb)
{
        bool crosses_netns;

        if (skb_orphan_frags(skb, GFP_ATOMIC))
                goto drop;
        if (unlikely(!is_skb_forwardable(dev, skb)))
                goto drop;

        crosses_netns = !net_eq(dev_net(dev), dev_net(skb->dev));
        skb_scrub_packet(skb, crosses_netns);
        skb->priority = 0;
        return 0;

drop:
        atomic_long_inc(&dev->rx_dropped);
        kfree_skb(skb);
        return NET_RX_DROP;
}

/* Definitions are outside this excerpt. */
bool dev_nit_active(struct net_device *dev);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

/* Globals defined elsewhere; NOTE(review): presumably sysctl tunables — confirm. */
extern int                netdev_budget;
extern unsigned int        netdev_budget_usecs;

/* Called by rtnetlink.c:rtnl_unlock() */
void netdev_run_todo(void);

/**
 *        dev_put - release reference to device
 *        @dev: network device, may be NULL
 *
 * Drops one reference by decrementing this CPU's slot of the per-CPU
 * reference counter.  A NULL @dev is silently ignored.
 */
static inline void dev_put(struct net_device *dev)
{
        if (!dev)
                return;

        this_cpu_dec(*dev->pcpu_refcnt);
}

/**
 *        dev_hold - get reference to device
 *        @dev: network device, may be NULL
 *
 * Takes one reference by incrementing this CPU's slot of the per-CPU
 * reference counter.  A NULL @dev is silently ignored.
 */
static inline void dev_hold(struct net_device *dev)
{
        if (!dev)
                return;

        this_cpu_inc(*dev->pcpu_refcnt);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on
 * and _off may be called from IRQ context, but it is the caller
 * who is responsible for serializing these calls.
 *
 * The name carrier is inappropriate, these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer not just hardware media.
 */

/* Link-watch hooks; definitions are outside this excerpt. */
void linkwatch_init_dev(struct net_device *dev);
void linkwatch_fire_event(struct net_device *dev);
void linkwatch_forget_dev(struct net_device *dev);

/**
 *        netif_carrier_ok - test if carrier present
 *        @dev: network device
 *
 * Returns true when the __LINK_STATE_NOCARRIER bit is clear in @dev->state.
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
        const bool no_carrier = test_bit(__LINK_STATE_NOCARRIER, &dev->state);

        return !no_carrier;
}

/* Definitions of the following are outside this excerpt. */
unsigned long dev_trans_start(struct net_device *dev);

void __netdev_watchdog_up(struct net_device *dev);

void netif_carrier_on(struct net_device *dev);

void netif_carrier_off(struct net_device *dev);

/**
 *        netif_dormant_on - mark device as dormant.
 *        @dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * The dormant state indicates that the relevant interface is not
 * actually in a condition to pass packets (i.e., it is not 'up') but is
 * in a "pending" state, waiting for some external event.  For "on-
 * demand" interfaces, this new state identifies the situation where the
 * interface is waiting for events to place it in the up state.
 *
 * A link-watch event is fired only when the bit was previously clear.
 */
static inline void netif_dormant_on(struct net_device *dev)
{
        /* Bit was already set: state did not change, nothing to report. */
        if (test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant_off - set device as not dormant.
 *        @dev: network device
 *
 * Clears the dormant bit.  A link-watch event is fired only when the bit
 * was previously set.
 */
static inline void netif_dormant_off(struct net_device *dev)
{
        /* Bit was already clear: state did not change, nothing to report. */
        if (!test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant - test if device is dormant
 *        @dev: network device
 *
 * Returns true when the __LINK_STATE_DORMANT bit is set in @dev->state.
 */
static inline bool netif_dormant(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_DORMANT, &dev->state) != 0;
}


/**
 *        netif_testing_on - mark device as under test.
 *        @dev: network device
 *
 * Mark device as under test (as per RFC2863).
 *
 * The testing state indicates that some test(s) must be performed on
 * the interface. After completion of the test, the interface state
 * will change to up, dormant, or down, as appropriate.
 *
 * A link-watch event is fired only when the bit was previously clear.
 */
static inline void netif_testing_on(struct net_device *dev)
{
        /* Bit was already set: state did not change, nothing to report. */
        if (test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing_off - set device as not under test.
 *        @dev: network device
 *
 * Clears the testing bit.  A link-watch event is fired only when the bit
 * was previously set.
 */
static inline void netif_testing_off(struct net_device *dev)
{
        /* Bit was already clear: state did not change, nothing to report. */
        if (!test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing - test if device is under test
 *        @dev: network device
 *
 * Returns true when the __LINK_STATE_TESTING bit is set in @dev->state.
 */
static inline bool netif_testing(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_TESTING, &dev->state) != 0;
}


/**
 *        netif_oper_up - test if device is operational
 *        @dev: network device
 *
 * Returns true when @dev->operstate is IF_OPER_UP, or IF_OPER_UNKNOWN
 * (accepted for backward compatibility).
 */
static inline bool netif_oper_up(const struct net_device *dev)
{
        switch (dev->operstate) {
        case IF_OPER_UP:
        case IF_OPER_UNKNOWN:        /* backward compat */
                return true;
        default:
                return false;
        }
}

/**
 *        netif_device_present - is device available or removed
 *        @dev: network device
 *
 * Check if device has not been removed from system.
 */
static inline bool netif_device_present(struct net_device *dev)
{
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
}

/* Definitions are outside this excerpt. */
void netif_device_detach(struct net_device *dev);

void netif_device_attach(struct net_device *dev);

/*
 * Network interface message level settings
 */

/*
 * Bit positions of the message classes, numbered from 0 in declaration
 * order.  The NETIF_MSG_* masks in this file are derived from these via
 * __NETIF_MSG().
 */
enum {
        NETIF_MSG_DRV_BIT,
        NETIF_MSG_PROBE_BIT,
        NETIF_MSG_LINK_BIT,
        NETIF_MSG_TIMER_BIT,
        NETIF_MSG_IFDOWN_BIT,
        NETIF_MSG_IFUP_BIT,
        NETIF_MSG_RX_ERR_BIT,
        NETIF_MSG_TX_ERR_BIT,
        NETIF_MSG_TX_QUEUED_BIT,
        NETIF_MSG_INTR_BIT,
        NETIF_MSG_TX_DONE_BIT,
        NETIF_MSG_RX_STATUS_BIT,
        NETIF_MSG_PKTDATA_BIT,
        NETIF_MSG_HW_BIT,
        NETIF_MSG_WOL_BIT,

        /* When you add a new bit above, update netif_msg_class_names array
         * in net/ethtool/common.c
         */
        NETIF_MSG_CLASS_COUNT,        /* number of classes; must stay last */
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);

/* Turn a bit position into a u32 mask. */
#define __NETIF_MSG_BIT(bit)        ((u32)1 << (bit))
/* Build the mask for class @name from its NETIF_MSG_<name>_BIT position. */
#define __NETIF_MSG(name)        __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)

/* One mask per message class. */
#define NETIF_MSG_DRV                __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE                __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK                __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER                __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN        __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP                __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR        __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR        __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED        __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR                __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE        __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS        __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA        __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW                __NETIF_MSG(HW)
#define NETIF_MSG_WOL                __NETIF_MSG(WOL)

/*
 * Class tests: @p is any pointer to a structure with a msg_enable member;
 * each macro evaluates to the masked value (non-zero when the class is on).
 */
#define netif_msg_drv(p)        ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)        ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)        ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)        ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)        ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)        ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)        ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)        ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)        ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)        ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)        ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)        ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)        ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)                ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)        ((p)->msg_enable & NETIF_MSG_WOL)

/*
 * Translate a numeric debug level into a message-enable bitmap.
 *
 * @debug_value == 0 yields 0 (no output); a value in 1..31 yields a mask
 * with the low @debug_value bits set; anything negative or >= 32 selects
 * @default_msg_enable_bits.
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
        /* no output */
        if (debug_value == 0)
                return 0;

        /* outside what a u32 bitmap can express: use default */
        if (debug_value < 0 || debug_value >= (sizeof(u32) * 8))
                return default_msg_enable_bits;

        /* set low N bits */
        return (1U << debug_value) - 1;
}

/*
 * Take @txq's _xmit_lock, then record @cpu as the lock owner.  The owner
 * is written only after the lock is held.
 */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
        spin_lock(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

/*
 * Emits only the __acquire() annotation for @txq's _xmit_lock; no
 * spin_lock call is made here.  Always returns true.
 */
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
        __acquire(&txq->_xmit_lock);
        return true;
}

/*
 * Counterpart of __netif_tx_acquire(): emits only the __release()
 * annotation for @txq's _xmit_lock.
 */
static inline void __netif_tx_release(struct netdev_queue *txq)
{
        __release(&txq->_xmit_lock);
}

/*
 * Like __netif_tx_lock() but uses spin_lock_bh() and records the current
 * processor (smp_processor_id()) as the owner.
 */
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
        spin_lock_bh(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}

/*
 * Try to take @txq's _xmit_lock without spinning.  On success the current
 * processor is recorded as owner and true is returned; on failure nothing
 * is modified and false is returned.
 */
static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
        if (unlikely(!spin_trylock(&txq->_xmit_lock)))
                return false;

        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
        return true;
}

/*
 * Reset the owner to -1 (no owner) while the lock is still held, then
 * release @txq's _xmit_lock.
 */
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock(&txq->_xmit_lock);
}

/*
 * Like __netif_tx_unlock() but releases with spin_unlock_bh(), matching
 * __netif_tx_lock_bh().
 */
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock_bh(&txq->_xmit_lock);
}

/*
 * Stamp @txq->trans_start with the current jiffies, but only while the
 * queue has a recorded lock owner (xmit_lock_owner != -1).
 */
static inline void txq_trans_update(struct netdev_queue *txq)
{
        if (txq->xmit_lock_owner == -1)
                return;

        txq->trans_start = jiffies;
}

/*
 * legacy drivers only, netdev_start_xmit() sets txq->trans_start
 *
 * Updates trans_start of tx queue 0; the store is skipped when the value
 * already equals jiffies.
 */
static inline void netif_trans_update(struct net_device *dev)
{
        struct netdev_queue *queue0 = netdev_get_tx_queue(dev, 0);

        if (queue0->trans_start == jiffies)
                return;

        queue0->trans_start = jiffies;
}

/**
 *        netif_tx_lock - grab network device transmit lock
 *        @dev: network device
 *
 * Get network device transmit lock
 *
 * Takes @dev->tx_global_lock, which stays held on return, and then for
 * each of the @dev->num_tx_queues queues briefly takes the queue's
 * _xmit_lock in order to set __QUEUE_STATE_FROZEN.  No per-queue lock is
 * held on return.
 */
static inline void netif_tx_lock(struct net_device *dev)
{
        unsigned int i;
        int cpu;

        spin_lock(&dev->tx_global_lock);
        cpu = smp_processor_id();
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

                /* We are the only thread of execution doing a
                 * freeze, but we have to grab the _xmit_lock in
                 * order to synchronize with threads which are in
                 * the ->hard_start_xmit() handler and already
                 * checked the frozen bit.
                 */
                __netif_tx_lock(txq, cpu);
                set_bit(__QUEUE_STATE_FROZEN, &txq->state);
                __netif_tx_unlock(txq);
        }
}

/* netif_tx_lock() preceded by local_bh_disable(); pair with netif_tx_unlock_bh(). */
static inline void netif_tx_lock_bh(struct net_device *dev)
{
        local_bh_disable();
        netif_tx_lock(dev);
}

/*
 * Undo netif_tx_lock(): clear __QUEUE_STATE_FROZEN on every tx queue,
 * call netif_schedule_queue() on each, and finally release
 * @dev->tx_global_lock.
 */
static inline void netif_tx_unlock(struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

                /* No need to grab the _xmit_lock here.  If the
                 * queue is not stopped for another reason, we
                 * force a schedule.
                 */
                clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
                netif_schedule_queue(txq);
        }
        spin_unlock(&dev->tx_global_lock);
}

/* netif_tx_unlock() followed by local_bh_enable(); pairs with netif_tx_lock_bh(). */
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
        netif_tx_unlock(dev);
        local_bh_enable();
}

/*
 * Take the per-queue transmit lock for @txq on behalf of @cpu, unless
 * @dev has NETIF_F_LLTX set in its features, in which case only
 * __netif_tx_acquire() is invoked.
 *
 * @dev is parenthesized so that any pointer-valued expression can be
 * passed safely.
 */
#define HARD_TX_LOCK(dev, txq, cpu) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_lock(txq, cpu);                \
        } else {                                        \
                __netif_tx_acquire(txq);                \
        }                                                \
}

/*
 * Non-blocking variant of HARD_TX_LOCK(): evaluates to the result of
 * __netif_tx_trylock() when @dev does not have NETIF_F_LLTX set, and to
 * the result of __netif_tx_acquire() (always true) otherwise.
 *
 * @dev is parenthesized so that any pointer-valued expression can be
 * passed safely.
 */
#define HARD_TX_TRYLOCK(dev, txq)                        \
        ((((dev)->features & NETIF_F_LLTX) == 0) ?        \
                __netif_tx_trylock(txq) :                \
                __netif_tx_acquire(txq))

/*
 * Counterpart of HARD_TX_LOCK(): releases @txq's transmit lock unless
 * @dev has NETIF_F_LLTX set, in which case only __netif_tx_release() is
 * invoked.
 *
 * @dev is parenthesized so that any pointer-valued expression can be
 * passed safely.
 */
#define HARD_TX_UNLOCK(dev, txq) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_unlock(txq);                        \
        } else {                                        \
                __netif_tx_release(txq);                \
        }                                                \
}

/*
 * Stop every tx queue of @dev.
 *
 * Runs with bottom halves disabled and @dev->tx_global_lock held; each
 * queue's _xmit_lock is taken around its netif_tx_stop_queue() call.
 * All locks are released and bottom halves re-enabled before returning.
 */
static inline void netif_tx_disable(struct net_device *dev)
{
        unsigned int i;
        int cpu;

        local_bh_disable();
        cpu = smp_processor_id();
        spin_lock(&dev->tx_global_lock);
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

                __netif_tx_lock(txq, cpu);
                netif_tx_stop_queue(txq);
                __netif_tx_unlock(txq);
        }
        spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
}

/*
 * Take @dev->addr_list_lock.  The subclass handed to spin_lock_nested()
 * is @dev->nested_level under CONFIG_LOCKDEP and 0 otherwise.
 */
static inline void netif_addr_lock(struct net_device *dev)
{
        unsigned char nest_level = 0;

#ifdef CONFIG_LOCKDEP
        nest_level = dev->nested_level;
#endif
        spin_lock_nested(&dev->addr_list_lock, nest_level);
}

/*
 * Same as netif_addr_lock() but disables bottom halves first; release
 * with netif_addr_unlock_bh().
 */
static inline void netif_addr_lock_bh(struct net_device *dev)
{
        unsigned char nest_level = 0;

#ifdef CONFIG_LOCKDEP
        nest_level = dev->nested_level;
#endif
        local_bh_disable();
        spin_lock_nested(&dev->addr_list_lock, nest_level);
}

/* Release @dev->addr_list_lock taken by netif_addr_lock(). */
static inline void netif_addr_unlock(struct net_device *dev)
{
        spin_unlock(&dev->addr_list_lock);
}

/* Release @dev->addr_list_lock with spin_unlock_bh(); pairs with netif_addr_lock_bh(). */
static inline void netif_addr_unlock_bh(struct net_device *dev)
{
        spin_unlock_bh(&dev->addr_list_lock);
}

/*
 * dev_addrs walker. Should be used only for read access. Call with
 * rcu_read_lock held.
 *
 * @dev: device whose dev_addrs.list is walked; parenthesized so any
 *       pointer-valued expression can be passed
 * @ha:  iteration cursor handed to list_for_each_entry_rcu()
 */
#define for_each_dev_addr(dev, ha) \
                list_for_each_entry_rcu(ha, &(dev)->dev_addrs.list, list)

/* These functions live elsewhere (drivers/net/net_init.c, but related) */

void ether_setup(struct net_device *dev);

/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    unsigned char name_assign_type,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs);
/* Single-queue shorthand: alloc_netdev_mqs() with txqs = rxqs = 1. */
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)

/* Symmetric shorthand: alloc_netdev_mqs() with txqs = rxqs = count. */
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
                         count)

/* Definitions are outside this excerpt. */
int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);

int devm_register_netdev(struct device *dev, struct net_device *ndev);

/*
 * General hardware address lists handling functions
 *
 * Definitions are outside this excerpt.  The *_ref_* variants differ in
 * that their sync/unsync callbacks take an extra int argument.
 */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
                   struct netdev_hw_addr_list *from_list, int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
                      struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
                       struct net_device *dev,
                       int (*sync)(struct net_device *, const unsigned char *),
                       int (*unsync)(struct net_device *,
                                     const unsigned char *));
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
                           struct net_device *dev,
                           int (*sync)(struct net_device *,
                                       const unsigned char *, int),
                           int (*unsync)(struct net_device *,
                                         const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
                              struct net_device *dev,
                              int (*unsync)(struct net_device *,
                                            const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
                          struct net_device *dev,
                          int (*unsync)(struct net_device *,
                                        const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);

/* Functions used for device addresses handling */

/* Copy @len bytes from @addr to the start of @dev->dev_addr. */
static inline void
__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len)
{
        memcpy(&dev->dev_addr[0], addr, len);
}

/* Set the device address from @addr, copying @dev->addr_len bytes. */
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
        const size_t len = dev->addr_len;

        __dev_addr_set(dev, addr, len);
}

/*
 * Overwrite @len bytes of @dev->dev_addr starting at byte @offset with the
 * data at @addr.  No bounds checking is performed here.
 */
static inline void
dev_addr_mod(struct net_device *dev, unsigned int offset,
             const u8 *addr, size_t len)
{
        memcpy(dev->dev_addr + offset, addr, len);
}

/* Device address list management; definitions are outside this excerpt. */
int dev_addr_add(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);

/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Forwards to __hw_addr_sync_dev() on @dev->uc and returns its result.
 */
static inline int __dev_uc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        struct netdev_hw_addr_list *uc_list = &dev->uc;

        return __hw_addr_sync_dev(uc_list, dev, sync, unsync);
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_uc_sync().
 *
 *  Forwards to __hw_addr_unsync_dev() on @dev->uc.
 */
static inline void __dev_uc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        struct netdev_hw_addr_list *uc_list = &dev->uc;

        __hw_addr_unsync_dev(uc_list, dev, unsync);
}

/*
 * Functions used for multicast addresses handling
 *
 * Definitions are outside this excerpt; the set mirrors the dev_uc_*()
 * prototypes, plus the *_global add/del variants.
 */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Forwards to __hw_addr_sync_dev() on @dev->mc and returns its result.
 */
static inline int __dev_mc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        struct netdev_hw_addr_list *mc_list = &dev->mc;

        return __hw_addr_sync_dev(mc_list, dev, sync, unsync);
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_mc_sync().
 *
 *  Forwards to __hw_addr_unsync_dev() on @dev->mc.
 */
static inline void __dev_mc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        struct netdev_hw_addr_list *mc_list = &dev->mc;

        __hw_addr_unsync_dev(mc_list, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
void __dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int dev_set_allmulti(struct net_device *dev, int inc);
/* State/feature change hooks; definitions are outside this excerpt. */
void netdev_state_change(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
/* Statistics helpers; definitions are outside this excerpt. */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats);

/* Globals defined elsewhere; NOTE(review): presumably sysctl tunables — confirm. */
extern int                netdev_max_backlog;
extern int                netdev_tstamp_prequeue;
extern int                weight_p;
extern int                dev_weight_rx_bias;
extern int                dev_weight_tx_bias;
extern int                dev_rx_weight;
extern int                dev_tx_weight;
extern int                gro_normal_batch;

/* Bit positions for the NESTED_SYNC_* masks defined just below. */
enum {
        NESTED_SYNC_IMM_BIT,
        NESTED_SYNC_TODO_BIT,
};

/* Turn a bit position / a NESTED_SYNC_<name>_BIT into a u32 mask. */
#define __NESTED_SYNC_BIT(bit)        ((u32)1 << (bit))
#define __NESTED_SYNC(name)        __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

#define NESTED_SYNC_IMM                __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO        __NESTED_SYNC(TODO)

/*
 * Context handed to the callbacks of the netdev_walk_*() helpers.
 * NOTE(review): @flags presumably carries NESTED_SYNC_* bits — confirm at users.
 */
struct netdev_nested_priv {
        unsigned char flags;
        void *data;        /* opaque pointer, meaning defined by the caller */
};

/*
 * Upper-device queries; definitions are outside this excerpt.  The
 * *_get_next_dev_rcu() iterators advance the cursor passed in @iter.
 */
bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);

#ifdef CONFIG_LOCKDEP
/*
 * List head onto which net_unlink_todo() queues devices.
 * NOTE(review): this is a `static` object defined in a header, so every
 * translation unit that includes the header gets its own private list —
 * confirm that only one .c file actually uses it.
 */
static LIST_HEAD(net_unlink_list);

/*
 * Append @dev to net_unlink_list through its unlink_list node, but only when
 * list_empty() reports that node as empty, so a device is not queued twice.
 */
static inline void net_unlink_todo(struct net_device *dev)
{
        if (list_empty(&dev->unlink_list))
                list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif

/*
 * Iterate through the upper list of @dev; must be called under RCU read lock.
 * @iter starts at the list head &(dev)->adj_list.upper and is passed by
 * address to netdev_upper_get_next_dev_rcu() on every step; the loop ends
 * when that call yields a NULL @updev.
 * @dev is expanded several times, so it should be free of side effects.
 */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
        for (iter = &(dev)->adj_list.upper, \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
             updev; \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

/*
 * Walk upper devices of @dev, invoking @fn(upper_dev, priv) for them.
 * NOTE(review): the meaning of @fn's int return (e.g. non-zero stops the
 * walk) is not visible here — confirm in the implementation.
 */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *upper_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

/*
 * Step functions for the netdev_for_each_lower_private*() iterators; each
 * takes the address of the list cursor and returns a void * item.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter);

/*
 * Loop over the lower list of @dev, assigning each item returned by
 * netdev_lower_get_next_private() to @priv until it returns NULL.
 * Unlike the _rcu variant, @iter starts at the first entry
 * ((dev)->adj_list.lower.next), not at the list head itself.
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             priv = netdev_lower_get_next_private(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private(dev, &(iter)))

/*
 * RCU flavour of netdev_for_each_lower_private(): steps with
 * netdev_lower_get_next_private_rcu() and starts @iter at the list head
 * &(dev)->adj_list.lower. The loop stops when @priv becomes NULL.
 * NOTE(review): presumably requires the RCU read lock, as the name suggests.
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
        for (iter = &(dev)->adj_list.lower, \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

/* Step function for netdev_for_each_lower_dev(); returns NULL to end the loop. */
void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);

/*
 * Loop over the lower list of @dev, assigning each item returned by
 * netdev_lower_get_next() to @ldev. @iter starts at the first entry
 * ((dev)->adj_list.lower.next).
 */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))

/*
 * Lower-device traversal (prototypes only). The walkers call
 * @fn(lower_dev, priv) for lower devices of @dev.
 * NOTE(review): the meaning of @fn's return value is not visible here.
 */
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *lower_dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *lower_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

/* Accessors for adjacency private data and master upper devices. */
void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
/*
 * Link/unlink @dev and @upper_dev. The link functions return int and accept
 * a netlink extended-ack pointer (@extack).
 */
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
                          struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev);
/*
 * Three-phase adjacency change: prepare (may fail, returns int), then either
 * commit or abort (both void).
 * NOTE(review): phase ordering is inferred from the names — confirm.
 */
int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info);

/*
 * RSS keys are 40 or 52 bytes long; the shared key buffer is sized for the
 * larger of the two.
 */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
/* Fill @buffer with @len bytes (prototype only; body is elsewhere). */
void netdev_rss_key_fill(void *buffer, size_t len);

/* Checksum helpers operating on an skb (prototypes only); each returns int. */
int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);

/*
 * GSO segmentation entry points. __skb_gso_segment() is wrapped by the
 * skb_gso_segment() inline in this header, which passes tx_path = true.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path);
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features);

/* Pairs one ifslave record with one ifbond record. */
struct netdev_bonding_info {
        ifslave        slave;
        ifbond        master;
};

/*
 * Notifier payload carrying bonding info. The generic netdev_notifier_info
 * must stay the first member.
 */
struct netdev_notifier_bonding_info {
        struct netdev_notifier_info info; /* must be first */
        struct netdev_bonding_info  bonding_info;
};

/* Prototype only; takes the device and the bonding info describing it. */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info);

/*
 * ethtool_notify() is a real function only when CONFIG_ETHTOOL_NETLINK is
 * enabled; otherwise it is an empty inline stub so callers need no #ifdef.
 */
#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
                                  const void *data)
{
}
#endif

/* Convenience wrapper: segment @skb via __skb_gso_segment() with tx_path set. */
static inline struct sk_buff *
skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
        const bool tx_path = true;

        return __skb_gso_segment(skb, features, tx_path);
}
__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * can_checksum_protocol - can a device with @features checksum @protocol?
 * @features: device feature mask
 * @protocol: ethertype in network byte order
 *
 * FCoE is decided solely by NETIF_F_FCOE_CRC. Everything else is assumed to
 * be an IP checksum (not SCTP CRC): NETIF_F_HW_CSUM covers any protocol,
 * otherwise IPv4/IPv6 need their dedicated bit and all other protocols fail.
 */
static inline bool can_checksum_protocol(netdev_features_t features,
                                         __be16 protocol)
{
        netdev_features_t needed;

        if (protocol == htons(ETH_P_FCOE)) {
                needed = NETIF_F_FCOE_CRC;
        } else if (features & NETIF_F_HW_CSUM) {
                /* Generic checksum offload handles everything. */
                return true;
        } else if (protocol == htons(ETH_P_IP)) {
                needed = NETIF_F_IP_CSUM;
        } else if (protocol == htons(ETH_P_IPV6)) {
                needed = NETIF_F_IPV6_CSUM;
        } else {
                return false;
        }

        return (features & needed) != 0;
}

/*
 * netdev_rx_csum_fault() is implemented out of line when CONFIG_BUG is set;
 * without it, calls compile down to an empty inline stub.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#else
static inline void netdev_rx_csum_fault(struct net_device *dev,
                                        struct sk_buff *skb)
{
}
#endif
/* rx skb timestamps */
void net_enable_timestamp(void);
void net_disable_timestamp(void);

/*
 * With CONFIG_PROC_FS, dev_proc_init() is a real __init function; without it
 * the call expands to the constant 0.
 */
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
#define dev_proc_init() 0
#endif

/*
 * __netdev_start_xmit - hand @skb to the driver's ndo_start_xmit hook
 * @ops:  device operations providing ndo_start_xmit
 * @skb:  buffer to transmit
 * @dev:  device the buffer is sent on
 * @more: value published in this CPU's softnet_data.xmit.more before the
 *        driver runs (netdev_xmit_more() reads the same field)
 *
 * Returns whatever the driver hook returns.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
                                              struct sk_buff *skb, struct net_device *dev,
                                              bool more)
{
        netdev_tx_t status;

        __this_cpu_write(softnet_data.xmit.more, more);
        status = ops->ndo_start_xmit(skb, dev);

        return status;
}

/* Read back this CPU's softnet_data.xmit.more flag. */
static inline bool netdev_xmit_more(void)
{
        const bool more = __this_cpu_read(softnet_data.xmit.more);

        return more;
}

/*
 * netdev_start_xmit - transmit @skb on @dev using the device's own ops
 * @skb:  buffer to transmit
 * @dev:  target device; its netdev_ops supply the transmit hook
 * @txq:  queue updated through txq_trans_update() on success
 * @more: forwarded to __netdev_start_xmit()
 *
 * txq_trans_update() runs only when the driver reports NETDEV_TX_OK.
 * Returns the driver's status unchanged.
 */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)
{
        netdev_tx_t status = __netdev_start_xmit(dev->netdev_ops, skb, dev, more);

        if (status != NETDEV_TX_OK)
                return status;

        txq_trans_update(txq);
        return status;
}

/*
 * Create/remove a class attribute file for the namespace tag @ns
 * (prototypes only). Creation returns int; removal returns nothing.
 */
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);

/* kobject namespace-type operations for network namespaces; defined elsewhere. */
extern const struct kobj_ns_type_operations net_ns_type_operations;

/* Returns a driver-name string for @dev (prototype only). */
const char *netdev_drivername(const struct net_device *dev);

void linkwatch_run_queue(void);

/*
 * netdev_intersect_features - intersect two feature masks
 * @f1: first feature mask
 * @f2: second feature mask
 *
 * When exactly one side has NETIF_F_HW_CSUM, that side is first widened with
 * NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM so the protocol-specific checksum bits
 * of the other side survive the AND.
 */
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
                                                          netdev_features_t f2)
{
        const netdev_features_t differing = f1 ^ f2;

        if (!(differing & NETIF_F_HW_CSUM))
                return f1 & f2;

        if (f1 & NETIF_F_HW_CSUM)
                f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        else
                f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);

        return f1 & f2;
}

/*
 * Combine the bits of dev->features that lie outside dev->hw_features with
 * dev->wanted_features.
 */
static inline netdev_features_t netdev_get_wanted_features(
        struct net_device *dev)
{
        const netdev_features_t outside_hw = dev->features & ~dev->hw_features;

        return outside_hw | dev->wanted_features;
}
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask);

/*
 * Allow TSO to be used on a stacked device: performing the GSO segmentation
 * before the last device is a performance improvement.
 *
 * Feeds NETIF_F_ALL_TSO | NETIF_F_ALL_FOR_ALL into
 * netdev_increment_features() together with @features and @mask.
 */
static inline netdev_features_t netdev_add_tso_features(netdev_features_t features,
                                                        netdev_features_t mask)
{
        const netdev_features_t tso_set = NETIF_F_ALL_TSO | NETIF_F_ALL_FOR_ALL;

        return netdev_increment_features(features, tso_set, mask);
}

/* Feature (re)computation entry points; prototypes only. */
int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);

/* @rootdev is const (only read); @dev is the device being updated. */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev);

/* Both return a feature mask computed for @skb. */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);

/*
 * net_gso_ok - does @features cover every GSO type bit in @gso_type?
 * @features: device feature mask
 * @gso_type: SKB_GSO_* bits
 *
 * The SKB_GSO_* bits are shifted left by NETIF_F_GSO_SHIFT to obtain the
 * matching NETIF_F_* bits. Returns true only if all of them are present in
 * @features.
 */
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;

        /*
         * check flags correspondence: each SKB_GSO_* value must equal its
         * NETIF_F_* counterpart shifted right by NETIF_F_GSO_SHIFT, otherwise
         * the build fails.
         */
        BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));

        /* every requested bit must be set, not just one of them */
        return (features & feature) == feature;
}

/*
 * skb_gso_ok - can @features handle the GSO type and layout of @skb?
 *
 * The GSO type of @skb must pass net_gso_ok(); in addition, an skb carrying
 * a frag list is accepted only when NETIF_F_FRAGLIST is set.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
                return false;

        if (!skb_has_frag_list(skb))
                return true;

        return (features & NETIF_F_FRAGLIST) != 0;
}

/*
 * netif_needs_gso - does @skb need software GSO for a device with @features?
 *
 * Only GSO skbs qualify. One does when skb_gso_ok() rejects it, or (the
 * unlikely case) when its ip_summed is neither CHECKSUM_PARTIAL nor
 * CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
                                   netdev_features_t features)
{
        if (!skb_is_gso(skb))
                return false;

        if (!skb_gso_ok(skb, features))
                return true;

        return unlikely(skb->ip_summed != CHECKSUM_PARTIAL &&
                        skb->ip_summed != CHECKSUM_UNNECESSARY);
}

/* Store @size in dev->gso_max_size; no validation is performed here. */
static inline void
netif_set_gso_max_size(struct net_device *dev, unsigned int size)
{
        dev->gso_max_size = size;
}

/*
 * skb_gso_error_unwind - put header state of @skb back after a GSO failure
 * @skb:         buffer to fix up
 * @protocol:    value stored in skb->protocol
 * @pulled_hlen: number of bytes pushed back onto the head of the buffer
 * @mac_offset:  value stored in skb->mac_header
 * @mac_len:     MAC header length; also used to derive the network header
 *
 * The statement order matters: the transport header is reset right after the
 * push, and network_header is computed from the freshly assigned mac_header.
 */
static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
                                        int pulled_hlen, u16 mac_offset,
                                        int mac_len)
{
        skb->protocol = protocol;
        skb->encapsulation = 1;
        skb_push(skb, pulled_hlen);
        skb_reset_transport_header(skb);
        skb->mac_header = mac_offset;
        skb->network_header = skb->mac_header + mac_len;
        skb->mac_len = mac_len;
}

/* True when IFF_MACSEC is set in dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACSEC) != 0;
}

/* True when IFF_MACVLAN is set in dev->priv_flags. */
static inline bool netif_is_macvlan(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACVLAN) != 0;
}

/* True when IFF_MACVLAN_PORT is set in dev->priv_flags. */
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACVLAN_PORT) != 0;
}

/*
 * True when the device has IFF_MASTER in dev->flags and IFF_BONDING in
 * dev->priv_flags; both are required.
 */
static inline bool netif_is_bond_master(const struct net_device *dev)
{
        if (!(dev->flags & IFF_MASTER))
                return false;

        return (dev->priv_flags & IFF_BONDING) != 0;
}

/*
 * True when the device has IFF_SLAVE in dev->flags and IFF_BONDING in
 * dev->priv_flags; both are required.
 */
static inline bool netif_is_bond_slave(const struct net_device *dev)
{
        if (!(dev->flags & IFF_SLAVE))
                return false;

        return (dev->priv_flags & IFF_BONDING) != 0;
}

static inline bool netif_supports_nofcs(struct net_device *dev)
{
        return dev->priv_flags & IFF_SUPP_NOFCS;
}

/* True when IFF_L3MDEV_RX_HANDLER is set in dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_RX_HANDLER) != 0;
}

/* True when IFF_L3MDEV_MASTER is set in dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_MASTER) != 0;
}

/* True when IFF_L3MDEV_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_SLAVE) != 0;
}

/* True when IFF_EBRIDGE is set in dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_EBRIDGE) != 0;
}

/* True when IFF_BRIDGE_PORT is set in dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_BRIDGE_PORT) != 0;
}

/* True when IFF_OPENVSWITCH is set in dev->priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_OPENVSWITCH) != 0;
}

/* True when IFF_OVS_DATAPATH is set in dev->priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_OVS_DATAPATH) != 0;
}

/* True for a bridge port or an OVS port, checked in that order. */
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
        if (netif_is_bridge_port(dev))
                return true;

        return netif_is_ovs_port(dev);
}

/* True when IFF_TEAM is set in dev->priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_TEAM) != 0;
}

/* True when IFF_TEAM_PORT is set in dev->priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_TEAM_PORT) != 0;
}

/* True for a bond master or a team master, checked in that order. */
static inline bool netif_is_lag_master(const struct net_device *dev)
{
        if (netif_is_bond_master(dev))
                return true;

        return netif_is_team_master(dev);
}

/* True for a bond slave or a team port, checked in that order. */
static inline bool netif_is_lag_port(const struct net_device *dev)
{
        if (netif_is_bond_slave(dev))
                return true;

        return netif_is_team_port(dev);
}

/* True when IFF_RXFH_CONFIGURED is set in dev->priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_RXFH_CONFIGURED) != 0;
}

/* True when IFF_FAILOVER is set in dev->priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_FAILOVER) != 0;
}

/* True when IFF_FAILOVER_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_FAILOVER_SLAVE) != 0;
}

/*
 * This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit():
 * clear both IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM in
 * dev->priv_flags.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
        dev->priv_flags = dev->priv_flags &
                          ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}

/* return true if dev can't cope with mtu frames that need vlan tag insertion */
static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
{
        /* TODO: reserve and use an additional IFF bit, if we get more users */
        return dev->priv_flags & IFF_MACSEC;
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably the per-namespace setup hooks of the loopback
 * device — confirm at the definition.
 */
extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/* netdev_printk helpers, similar to dev_printk */

/*
 * netdev_name - printable name of @dev
 *
 * Returns dev->name, or the placeholder "(unnamed net_device)" when the name
 * is empty or still contains a '%' character.
 */
static inline const char *netdev_name(const struct net_device *dev)
{
        const char *name = dev->name;

        if (name[0] == '\0')
                return "(unnamed net_device)";
        if (strchr(name, '%'))
                return "(unnamed net_device)";

        return name;
}

/* True exactly when dev->reg_state equals NETREG_UNREGISTERING. */
static inline bool netdev_unregistering(const struct net_device *dev)
{
        if (dev->reg_state != NETREG_UNREGISTERING)
                return false;

        return true;
}

/*
 * netdev_reg_state - registration state of @dev as a log-message suffix
 *
 * Returns "" for a registered device and a parenthesised tag with a leading
 * space for every other known state. An unrecognised state triggers a
 * one-time warning and yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
        const char *suffix = NULL;

        /* No default label: unknown values fall through to the warning. */
        switch (dev->reg_state) {
        case NETREG_UNINITIALIZED:
                suffix = " (uninitialized)";
                break;
        case NETREG_REGISTERED:
                suffix = "";
                break;
        case NETREG_UNREGISTERING:
                suffix = " (unregistering)";
                break;
        case NETREG_UNREGISTERED:
                suffix = " (unregistered)";
                break;
        case NETREG_RELEASED:
                suffix = " (released)";
                break;
        case NETREG_DUMMY:
                suffix = " (dummy)";
                break;
        }

        if (suffix)
                return suffix;

        WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state);
        return " (unknown)";
}

/*
 * printf-style loggers taking a net_device. __printf(m, n) marks argument m
 * as the format string and n as the first variadic argument, so the compiler
 * can check them; __cold marks the functions as rarely executed.
 * netdev_printk() takes the log level explicitly; each of the others has the
 * level in its name.
 */
__printf(3, 4) __cold
void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...);
__printf(2, 3) __cold
void netdev_emerg(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_alert(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_crit(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_err(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_warn(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_notice(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_info(const struct net_device *dev, const char *format, ...);

/*
 * Print a netdev message at @level at most once per expansion site: each use
 * of the macro gets its own static flag, which is set before the message is
 * emitted. The flag is a plain bool with no locking, so concurrent first
 * calls may both print.
 */
#define netdev_level_once(level, dev, fmt, ...)                        \
do {                                                                \
        static bool __print_once __read_mostly;                        \
                                                                \
        if (!__print_once) {                                        \
                __print_once = true;                                \
                netdev_printk(level, dev, fmt, ##__VA_ARGS__);        \
        }                                                        \
} while (0)

/* Per-level shorthands for netdev_level_once(), one for each KERN_* level. */
#define netdev_emerg_once(dev, fmt, ...) \
        netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
#define netdev_alert_once(dev, fmt, ...) \
        netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
#define netdev_crit_once(dev, fmt, ...) \
        netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
#define netdev_err_once(dev, fmt, ...) \
        netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
#define netdev_warn_once(dev, fmt, ...) \
        netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
#define netdev_notice_once(dev, fmt, ...) \
        netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
#define netdev_info_once(dev, fmt, ...) \
        netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)

/*
 * Register a module alias of the form "netdev-<device>". @device must be a
 * string literal because it is joined by literal concatenation.
 */
#define MODULE_ALIAS_NETDEV(device) \
        MODULE_ALIAS("netdev-" device)

/*
 * netdev_dbg() has three build-time flavours:
 *  - dynamic debug available: forwards to dynamic_netdev_dbg();
 *  - otherwise, DEBUG defined: prints unconditionally at KERN_DEBUG;
 *  - otherwise: the call sits under "if (0)", so it is still compiled (and
 *    its arguments checked) but never executed.
 * Note the first form is a do/while statement and the last a statement
 * expression, so netdev_dbg() should only be used as a statement.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netdev_dbg(__dev, format, args...)                        \
do {                                                                \
        dynamic_netdev_dbg(__dev, format, ##args);                \
} while (0)
#elif defined(DEBUG)
#define netdev_dbg(__dev, format, args...)                        \
        netdev_printk(KERN_DEBUG, __dev, format, ##args)
#else
#define netdev_dbg(__dev, format, args...)                        \
({                                                                \
        if (0)                                                        \
                netdev_printk(KERN_DEBUG, __dev, format, ##args); \
})
#endif

/*
 * Verbose debug: an alias of netdev_dbg() when VERBOSE_DEBUG is defined.
 * Otherwise the print is compiled under "if (0)" (never executed) and the
 * statement expression evaluates to 0.
 */
#if defined(VERBOSE_DEBUG)
#define netdev_vdbg        netdev_dbg
#else

#define netdev_vdbg(dev, format, args...)                        \
({                                                                \
        if (0)                                                        \
                netdev_printk(KERN_DEBUG, dev, format, ##args);        \
        0;                                                        \
})
#endif

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "netdevice: <name><reg-state>: ", built from
 * netdev_name() and netdev_reg_state(). netdev_WARN_ONCE() is the same but
 * goes through WARN_ONCE().
 */
#define netdev_WARN(dev, format, args...)                        \
        WARN(1, "netdevice: %s%s: " format, netdev_name(dev),        \
             netdev_reg_state(dev), ##args)

#define netdev_WARN_ONCE(dev, format, args...)                                \
        WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev),        \
                  netdev_reg_state(dev), ##args)

/*
 * netif printk helpers, similar to netdev_printk
 *
 * Both macros print only when netif_msg_<type>(priv) is true; @type is
 * pasted onto "netif_msg_" to select the predicate. netif_printk() takes a
 * KERN_* level string, netif_level() takes the bare level name and pastes it
 * onto "netdev_" (e.g. err -> netdev_err).
 */

#define netif_printk(priv, type, level, dev, fmt, args...)        \
do {                                                                  \
        if (netif_msg_##type(priv))                                \
                netdev_printk(level, (dev), fmt, ##args);        \
} while (0)

#define netif_level(level, priv, type, dev, fmt, args...)        \
do {                                                                \
        if (netif_msg_##type(priv))                                \
                netdev_##level(dev, fmt, ##args);                \
} while (0)

/* Per-level shorthands for netif_level(); each fixes only the level name. */
#define netif_emerg(priv, type, dev, fmt, args...)                \
        netif_level(emerg, priv, type, dev, fmt, ##args)
#define netif_alert(priv, type, dev, fmt, args...)                \
        netif_level(alert, priv, type, dev, fmt, ##args)
#define netif_crit(priv, type, dev, fmt, args...)                \
        netif_level(crit, priv, type, dev, fmt, ##args)
#define netif_err(priv, type, dev, fmt, args...)                \
        netif_level(err, priv, type, dev, fmt, ##args)
#define netif_warn(priv, type, dev, fmt, args...)                \
        netif_level(warn, priv, type, dev, fmt, ##args)
#define netif_notice(priv, type, dev, fmt, args...)                \
        netif_level(notice, priv, type, dev, fmt, ##args)
#define netif_info(priv, type, dev, fmt, args...)                \
        netif_level(info, priv, type, dev, fmt, ##args)

/*
 * netif_dbg() mirrors netdev_dbg() but is additionally gated on
 * netif_msg_<type>(priv):
 *  - dynamic debug available: guarded dynamic_netdev_dbg();
 *  - otherwise, DEBUG defined: netif_printk() at KERN_DEBUG;
 *  - otherwise: compiled under "if (0)" (never executed), evaluating to 0.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netif_dbg(priv, type, netdev, format, args...)                \
do {                                                                \
        if (netif_msg_##type(priv))                                \
                dynamic_netdev_dbg(netdev, format, ##args);        \
} while (0)
#elif defined(DEBUG)
#define netif_dbg(priv, type, dev, format, args...)                \
        netif_printk(priv, type, KERN_DEBUG, dev, format, ##args)
#else
#define netif_dbg(priv, type, dev, format, args...)                        \
({                                                                        \
        if (0)                                                                \
                netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
        0;                                                                \
})
#endif

/*
 * if @cond then downgrade to debug, else print at @level
 *
 * @level is the bare level name (err, warn, info, ...); it is pasted onto
 * "netif_" to pick the netif_<level>() macro for the non-debug branch.
 */
#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...)     \
        do {                                                              \
                if (cond)                                                 \
                        netif_dbg(priv, type, netdev, fmt, ##args);       \
                else                                                      \
                        netif_ ## level(priv, type, netdev, fmt, ##args); \
        } while (0)

/*
 * Verbose variant of netif_dbg(): an alias of it when VERBOSE_DEBUG is
 * defined, otherwise compiled under "if (0)" (never executed) and
 * evaluating to 0.
 */
#if defined(VERBOSE_DEBUG)
#define netif_vdbg        netif_dbg
#else
#define netif_vdbg(priv, type, dev, format, args...)                \
({                                                                \
        if (0)                                                        \
                netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
        0;                                                        \
})
#endif

/*
 *        The list of packet types we will receive (as opposed to discard)
 *        and the routines to invoke.
 *
 *        Why 16? Because with 16 buckets, hashing on the low nibble of the
 *        protocol value, the only collision among the protocols listed
 *        below is RARP/SNAP/X.25 (all end in 5).
 *
 *                0800        IP
 *                0001        802.3
 *                0002        AX.25
 *                0004        802.2
 *                8035        RARP
 *                0005        SNAP
 *                0805        X.25
 *                0806        ARP
 *                8137        IPX
 *                0009        Localtalk
 *                86DD        IPv6
 *
 *        PTYPE_HASH_MASK is SIZE - 1, i.e. 0xf, which selects that nibble.
 */
#define PTYPE_HASH_SIZE        (16)
#define PTYPE_HASH_MASK        (PTYPE_HASH_SIZE - 1)

/*
 * Pointer to a net_device, declared here and defined elsewhere.
 * NOTE(review): presumably a device that discards traffic — confirm at the
 * definition.
 */
extern struct net_device *blackhole_netdev;

/*
 * Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters.
 *
 * Each macro applies an atomic_long_*() operation to (DEV)->stats.__<FIELD>;
 * @FIELD is given without the leading double underscore, which the macro
 * pastes on. DEV_STATS_READ() yields the current value.
 */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL)         \
                atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

#endif        /* _LINUX_NETDEVICE_H */













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_IO_URING_H
#define _LINUX_IO_URING_H

#include <linux/sched.h>
#include <linux/xarray.h>

#if defined(CONFIG_IO_URING)
/*
 * Out-of-line io_uring entry points (prototypes only). The inline wrappers
 * below call __io_uring_cancel() with cancel_all = false (files cancel) or
 * true (task cancel), and __io_uring_free() with the task being freed.
 */
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
bool io_is_uring_fops(struct file *file);

/*
 * Cancel with cancel_all = false, but only if the current task has an
 * io_uring context attached (current->io_uring is non-NULL).
 */
static inline void io_uring_files_cancel(void)
{
        if (!current->io_uring)
                return;

        __io_uring_cancel(false);
}
/*
 * Cancel with cancel_all = true, but only if the current task has an
 * io_uring context attached (current->io_uring is non-NULL).
 */
static inline void io_uring_task_cancel(void)
{
        if (!current->io_uring)
                return;

        __io_uring_cancel(true);
}
/* Call __io_uring_free() for @tsk only when tsk->io_uring is non-NULL. */
static inline void io_uring_free(struct task_struct *tsk)
{
        if (!tsk->io_uring)
                return;

        __io_uring_free(tsk);
}
#else
/*
 * Stubs used when CONFIG_IO_URING is disabled: the cancel/free hooks do
 * nothing and no file is ever reported as an io_uring file, so callers need
 * no #ifdef of their own.
 */
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
{
}
static inline bool io_is_uring_fops(struct file *file)
{
        return false;
}
#endif

#endif
















































































































































































































































    2 


















    5 


    5 










    4 












    4 










    5 

























    1 




































































    1 
    1 
























































    1 
    1 
    1 
    1 

    1 
    1 




















    1 











    1 































    1 







    1 

    1 











    1 






    1 
    1 
    1 





    1 










































    1 






















    1 

















    1 










    1 






































    1 







    1 


    1 
    1 
    1 
    1 
    1 


    1 
    1 
    1 


    1 




    1 












    1 






















    1 












    1 




    1 



















    1 









    1 



















    1 

    1 





















































































































































































































































































































    1 



















    1 



















































































































































    1 



    1 



    1 
    1 



    1 







    1 





































































    5 











    2 
    2 

    2 






    1 




    2 





















































    1 












    2 


    1 



    1 
























    1 





    1 








    1 







    1 



    2 





    5 


























































    5 










    5 

    5 





















    3 





    3 
    3 












    1 


    1 









    1 































































    1 
    1 


    1 











    2 


    2 

    2 
    2 


    2 





    2 







    2 


















    2 


    2 




    2 




    2 





























































































































    1 









    2 

    2 
    1 

    2 

    2 
    2 


    4 

    4 
    2 
    4 
    1 












    3 
    2 


    2 

    3 



















    3 
    3 





































































































































    5 















































































    1 



























































































































































































































































































































































    1 








    1 













    1 












































    1 





    1 


    1 



    1 





    1 


























































    1 























    1 



















































































    1 























    1 















































    2 









    2 






    2 

    2 





    2 


    2 
    2 

    2 
    2 



    2 
    2 







    1 




























    1 
    1 


















    1 













    1 
    1 
























    4 


















    4 


    4 






    4 
    4 





    4 














    1 












    1 




    1 
    1 
















































































































































































































































































































































































































































    1 






    1 
    1 











    3 


    3 
    2 

















































    1 


    1 
    1 









    1 






































    1 

    1 

































































































































































































































































    3 



























































































































































































    1 



































































































    1 


    1 



















































































































































































































































































































    1 





    1 










    1 







    1 




    1 







    1 






    1 



    1 




    1 



    1 
    1 



    1 






    1 












































































































































































































































































































    1 


























































































































































    1 








    1 
    1 






























































































    1 







    1 
    1 






    1 



    1 

    1 











    1 





    1 






































































































































































































































































































































































































































































































































































































































































































































    1 















    1 


    1 

    1 
    1 





















    1 































































































































































































































































































































































































    5 
    5 










































































































































































































    1 











    1 















































































    1 

































    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  NSA Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux hook function implementations.
 *
 *  Authors:  Stephen Smalley, <sds@tycho.nsa.gov>
 *              Chris Vance, <cvance@nai.com>
 *              Wayne Salamon, <wsalamon@nai.com>
 *              James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *                                           Eric Paris <eparis@redhat.com>
 *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *                            <dgoeddel@trustedcs.com>
 *  Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P.
 *        Paul Moore <paul@paul-moore.com>
 *  Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
 *                       Yuichi Nakamura <ynakam@hitachisoft.jp>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#include <linux/init.h>
#include <linux/kd.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/tracehook.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/lsm_hooks.h>
#include <linux/xattr.h>
#include <linux/capability.h>
#include <linux/unistd.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/dcache.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/tty.h>
#include <net/icmp.h>
#include <net/ip.h>                /* for local_port_range[] */
#include <net/tcp.h>                /* struct or_callable used in sock_rcv_skb */
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>        /* for network interface checks */
#include <net/netlink.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/sctp.h>
#include <net/sctp/structs.h>
#include <linux/quota.h>
#include <linux/un.h>                /* for Unix socket types */
#include <net/af_unix.h>        /* for Unix socket types */
#include <linux/parser.h>
#include <linux/nfs_mount.h>
#include <net/ipv6.h>
#include <linux/hugetlb.h>
#include <linux/personality.h>
#include <linux/audit.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/posix-timers.h>
#include <linux/syslog.h>
#include <linux/user_namespace.h>
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h>        /* for hashlen_string() */
#include <uapi/linux/mount.h>
#include <linux/fsnotify.h>
#include <linux/fanotify.h>

#include "avc.h"
#include "objsec.h"
#include "netif.h"
#include "netnode.h"
#include "netport.h"
#include "ibpkey.h"
#include "xfrm.h"
#include "netlabel.h"
#include "audit.h"
#include "avc_ss.h"

/*
 * The SELinux state object.  Its address is handed to avc_has_perm(),
 * selinux_initialized() and security_context_str_to_sid() by the code below.
 */
struct selinux_state selinux_state;

/*
 * SECMARK reference count.  selinux_secmark_enabled() treats any non-zero
 * value as "SECMARK enabled".
 */
static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
/* Boot-time enforcing value; written by the "enforcing=" handler below. */
static int selinux_enforcing_boot __initdata;

/*
 * Handler for the "enforcing=" kernel parameter.
 *
 * When @str parses as a number, zero stores 0 and anything else stores 1 in
 * selinux_enforcing_boot.  When kstrtoul() rejects @str the stored value is
 * left as it was.  Returns 1 in every case.
 */
static int __init enforcing_setup(char *str)
{
        unsigned long val;

        if (kstrtoul(str, 0, &val) == 0)
                selinux_enforcing_boot = (val != 0);

        return 1;
}
__setup("enforcing=", enforcing_setup);
#else
/* No boot parameter in this configuration: the value is the constant 1. */
#define selinux_enforcing_boot 1
#endif

/* Boot-time enable value; defaults to 1. */
int selinux_enabled_boot __initdata = 1;
#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
/*
 * Handler for the "selinux=" kernel parameter.
 *
 * A parsable zero stores 0 in selinux_enabled_boot, any other parsable
 * number stores 1; an unparsable @str changes nothing.  Returns 1 in every
 * case.
 */
static int __init selinux_enabled_setup(char *str)
{
        unsigned long val;

        if (kstrtoul(str, 0, &val) == 0)
                selinux_enabled_boot = (val != 0);

        return 1;
}
__setup("selinux=", selinux_enabled_setup);
#endif

/* Boot-time checkreqprot value; starts at the Kconfig-provided default. */
static unsigned int selinux_checkreqprot_boot =
        CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;

/*
 * Handler for the "checkreqprot=" kernel parameter.
 *
 * A parsable @str is normalised to 0 or 1 and stored in
 * selinux_checkreqprot_boot; a non-zero value additionally logs a
 * deprecation warning.  An unparsable @str changes nothing.  Returns 1 in
 * every case.
 */
static int __init checkreqprot_setup(char *str)
{
        unsigned long val;

        if (kstrtoul(str, 0, &val))
                return 1;

        selinux_checkreqprot_boot = (val != 0);
        if (val)
                pr_warn("SELinux: checkreqprot set to 1 via kernel parameter.  This is deprecated and will be rejected in a future kernel release.\n");

        return 1;
}
__setup("checkreqprot=", checkreqprot_setup);

/**
 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
 *
 * Description:
 * This function checks the SECMARK reference counter to see if any SECMARK
 * targets are currently configured, if the reference counter is greater than
 * zero SECMARK is considered to be enabled.  Returns true (1) if SECMARK is
 * enabled, false (0) if SECMARK is disabled.  If the always_check_network
 * policy capability is enabled, SECMARK is always considered enabled.
 *
 */
static int selinux_secmark_enabled(void)
{
        /* The policy capability alone is enough; the counter is not read. */
        if (selinux_policycap_alwaysnetwork())
                return 1;

        /* Otherwise any outstanding SECMARK reference counts as enabled. */
        return atomic_read(&selinux_secmark_refcount) != 0;
}

/**
 * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled
 *
 * Description:
 * This function checks if NetLabel or labeled IPSEC is enabled.  Returns true
 * (1) if any are enabled or false (0) if neither are enabled.  If the
 * always_check_network policy capability is enabled, peer labeling
 * is always considered enabled.
 *
 */
static int selinux_peerlbl_enabled(void)
{
        /* Checked in this order; the first positive answer wins. */
        if (selinux_policycap_alwaysnetwork())
                return 1;
        if (netlbl_enabled())
                return 1;

        return selinux_xfrm_enabled() != 0;
}

/*
 * React to an AVC event.  For AVC_CALLBACK_RESET the netif, netnode and
 * netport flush routines are called, followed by synchronize_net(); every
 * other event is ignored.  Always returns 0.
 */
static int selinux_netcache_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_netif_flush();
        sel_netnode_flush();
        sel_netport_flush();
        synchronize_net();

        return 0;
}

/*
 * React to an AVC event.  For AVC_CALLBACK_RESET the InfiniBand pkey flush
 * routine is called and LSM_POLICY_CHANGE is sent through the blocking LSM
 * notifier with a NULL payload; every other event is ignored.  Always
 * returns 0.
 */
static int selinux_lsm_notifier_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_ib_pkey_flush();
        call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);

        return 0;
}

/*
 * Set up the security state of the init task: both the SID and the old SID
 * stored in current->real_cred become SECINITSID_KERNEL.
 */
static void cred_init_security(void)
{
        struct task_security_struct *tsec;

        /* real_cred is const-qualified; the cast allows the write below. */
        tsec = selinux_cred((struct cred *) current->real_cred);

        tsec->sid = SECINITSID_KERNEL;
        tsec->osid = tsec->sid;
}

/*
 * Return the SID held in the SELinux blob of @cred.
 */
static inline u32 cred_sid(const struct cred *cred)
{
        const struct task_security_struct *blob = selinux_cred(cred);

        return blob->sid;
}

/*
 * Return the objective SID of @task, i.e. the SID of the credentials
 * obtained through __task_cred().  The lookup is done inside an RCU
 * read-side section.
 */
static inline u32 task_sid(const struct task_struct *task)
{
        const struct cred *tcred;
        u32 objective;

        rcu_read_lock();
        tcred = __task_cred(task);
        objective = cred_sid(tcred);
        rcu_read_unlock();

        return objective;
}

static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);

/*
 * Reload the security label of @inode if it is not marked
 * LABEL_INITIALIZED.
 *
 * @inode:     inode whose label is checked
 * @dentry:    dentry passed on to inode_doinit_with_dentry(); may be NULL
 * @may_sleep: true if the caller is allowed to sleep
 *
 * Nothing is done while SELinux is not yet initialized or when the label is
 * already valid.  If the label needs reloading but @may_sleep is false,
 * -ECHILD is returned.  Otherwise a reload is attempted and 0 is returned
 * whether or not the reload succeeded.
 */
static int __inode_security_revalidate(struct inode *inode,
                                       struct dentry *dentry,
                                       bool may_sleep)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        might_sleep_if(may_sleep);

        if (!selinux_initialized(&selinux_state))
                return 0;
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;

        if (!may_sleep)
                return -ECHILD;

        /*
         * Attempt the reload.  The result is deliberately not checked: on
         * failure the inode simply keeps the label it already had.
         */
        inode_doinit_with_dentry(inode, dentry);
        return 0;
}

/*
 * Return the SELinux blob of @inode as is, without attempting to
 * revalidate its label.
 */
static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        return isec;
}

/*
 * Return the SELinux blob of @inode after revalidating its label.  When
 * @rcu is true the revalidation is not allowed to sleep, and a failure from
 * it is handed back as an ERR_PTR() instead of the blob.
 */
static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu)
{
        int rc = __inode_security_revalidate(inode, NULL, !rcu);

        if (!rc)
                return selinux_inode(inode);

        return ERR_PTR(rc);
}

/*
 * Return the SELinux blob of @inode, first giving the label a chance to be
 * reloaded.  Sleeping is permitted and no dentry is supplied.
 */
static struct inode_security_struct *inode_security(struct inode *inode)
{
        struct inode_security_struct *isec;

        /* The result of the revalidation is not used here. */
        (void)__inode_security_revalidate(inode, NULL, true);

        isec = selinux_inode(inode);
        return isec;
}

/*
 * Return the SELinux blob of the inode backing @dentry, without attempting
 * to revalidate its label.
 */
static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
{
        return selinux_inode(d_backing_inode(dentry));
}

/*
 * Return the SELinux blob of the inode backing @dentry, first giving the
 * label a chance to be reloaded.  Sleeping is permitted and @dentry itself
 * is supplied to the revalidation.
 */
static struct inode_security_struct *backing_inode_security(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        /* The result of the revalidation is not used here. */
        (void)__inode_security_revalidate(backing, dentry, true);

        return selinux_inode(backing);
}

/*
 * Detach the SELinux blob of @inode from its superblock's list of inode
 * security structures, if it is on that list.  Does nothing when the inode
 * has no blob.
 */
static void inode_free_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct superblock_security_struct *sbsec;

        if (!isec)
                return;

        sbsec = inode->i_sb->s_security;

        /*
         * Only some inode security structures are ever queued, so peek at
         * the list head without the lock first and skip the locking
         * entirely in the common unqueued case.
         *
         * list_del_init() tolerates being called repeatedly.  A concurrent
         * list_add() is not expected here, but list_empty_careful() is used
         * as a hedge against future changes in the code.
         */
        if (list_empty_careful(&isec->list))
                return;

        spin_lock(&sbsec->isec_lock);
        list_del_init(&isec->list);
        spin_unlock(&sbsec->isec_lock);
}

/*
 * Release the security structure attached to @sb.  The pointer in the
 * superblock is cleared before the memory is freed.
 */
static void superblock_free_security(struct super_block *sb)
{
        struct superblock_security_struct *stale;

        stale = sb->s_security;
        sb->s_security = NULL;
        kfree(stale);
}

/*
 * SELinux mount options, one string per context option.  Each member is
 * passed to kfree() by selinux_free_mnt_opts().
 */
struct selinux_mnt_opts {
        const char *fscontext;
        const char *context;
        const char *rootcontext;
        const char *defcontext;
};

/*
 * Free a struct selinux_mnt_opts handed over as an opaque pointer: the four
 * context strings are released first, then the structure itself.
 * @mnt_opts is dereferenced unconditionally.
 */
static void selinux_free_mnt_opts(void *mnt_opts)
{
        struct selinux_mnt_opts *mo = mnt_opts;

        kfree(mo->fscontext);
        kfree(mo->context);
        kfree(mo->rootcontext);
        kfree(mo->defcontext);

        kfree(mo);
}

enum {
        Opt_error = -1,
        Opt_context = 0,
        Opt_defcontext = 1,
        Opt_fscontext = 2,
        Opt_rootcontext = 3,
        Opt_seclabel = 4,
};

/*
 * Table of the mount option keywords recognised by match_opt_prefix().
 *
 * A(s, has_arg) builds one entry from the bare keyword @s: its spelling,
 * its length without the terminating NUL, the matching Opt_<s> constant and
 * whether the keyword has to be followed by "=<value>".
 *
 * The table is only ever read, so it is declared const.
 */
#define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg}
static const struct {
        const char *name;       /* keyword text */
        int len;                /* length of @name in bytes */
        int opt;                /* corresponding Opt_* constant */
        bool has_arg;           /* true if "=<value>" must follow the keyword */
} tokens[] = {
        A(context, true),
        A(fscontext, true),
        A(defcontext, true),
        A(rootcontext, true),
        A(seclabel, false),
};
#undef A

/*
 * match_opt_prefix - identify the SELinux mount option at the start of @s
 * @s:   option text; only the first @l bytes are examined
 * @l:   length of the option text in bytes
 * @arg: for a keyword that takes a value, receives a pointer to the byte
 *       following the '='; left untouched otherwise
 *
 * A keyword that takes a value matches when @s starts with the keyword
 * immediately followed by '='.  A keyword without a value matches only when
 * it spans exactly @l bytes.
 *
 * Returns the Opt_* constant of the matching keyword, or Opt_error when no
 * keyword matches.
 */
static int match_opt_prefix(char *s, int l, char **arg)
{
        size_t i;

        for (i = 0; i < ARRAY_SIZE(tokens); i++) {
                /*
                 * Keep the keyword length signed, like @l.  With an
                 * unsigned length a negative @l would be converted to a
                 * huge value in the comparison below and slip past the
                 * bounds check in front of memcmp().
                 */
                int len = tokens[i].len;

                if (len > l || memcmp(s, tokens[i].name, len))
                        continue;
                if (tokens[i].has_arg) {
                        /* s[len] is within the first @l bytes: len < l here */
                        if (len == l || s[len] != '=')
                                continue;
                        *arg = s + len + 1;
                } else if (len != l) {
                        continue;
                }
                return tokens[i].opt;
        }
        return Opt_error;
}

/* Message text for duplicate or incompatible mount options; ends in '\n'. */
#define SEL_MOUNT_FAIL_MSG "SELinux:  duplicate or incompatible mount options\n"

/*
 * Check whether @cred may relabel a superblock from sbsec->sid to @sid.
 *
 * Two filesystem-class permissions are checked with the SID of @cred as the
 * subject: RELABELFROM against the current superblock SID, then RELABELTO
 * against @sid.  The first non-zero avc_has_perm() result is returned;
 * otherwise the result of the second check is returned.
 */
static int may_context_mount_sb_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *subj = selinux_cred(cred);
        int err;

        err = avc_has_perm(&selinux_state,
                           subj->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                           FILESYSTEM__RELABELFROM, NULL);
        if (err)
                return err;

        return avc_has_perm(&selinux_state,
                            subj->sid, sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__RELABELTO, NULL);
}

/*
 * Check whether @cred may have inodes on a superblock labelled with @sid.
 *
 * First RELABELFROM is checked with the SID of @cred as the subject and the
 * superblock SID as the target.  Then ASSOCIATE is checked with @sid itself
 * as the subject and the superblock SID as the target.  The first non-zero
 * avc_has_perm() result is returned; otherwise the result of the second
 * check is returned.
 */
static int may_context_mount_inode_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *subj = selinux_cred(cred);
        int err;

        err = avc_has_perm(&selinux_state,
                           subj->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                           FILESYSTEM__RELABELFROM, NULL);
        if (err)
                return err;

        return avc_has_perm(&selinux_state,
                            sid, sbsec->sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE, NULL);
}

/*
 * Report whether the filesystem type of @sb is one of the genfs-labelled
 * types that get special handling (genfs, but with an in-core setxattr
 * handler as well).  Returns 1 if so, 0 otherwise.
 */
static int selinux_is_genfs_special_handling(struct super_block *sb)
{
        const char *fstype = sb->s_type->name;

        /* Types that always qualify. */
        if (!strcmp(fstype, "sysfs") ||
            !strcmp(fstype, "pstore") ||
            !strcmp(fstype, "debugfs") ||
            !strcmp(fstype, "tracefs") ||
            !strcmp(fstype, "rootfs"))
                return 1;

        /* The cgroup types qualify only with the cgroupseclabel capability. */
        if (!selinux_policycap_cgroupseclabel())
                return 0;

        return !strcmp(fstype, "cgroup") || !strcmp(fstype, "cgroup2");
}

/*
 * Decide from the labeling behavior of @sb whether the superblock should
 * carry SBLABEL_MNT.  Returns 1 for yes, 0 for no.
 */
static int selinux_is_sblabel_mnt(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        int labeled;

        /*
         * IMPORTANT: Double-check logic in this function when adding a new
         * SECURITY_FS_USE_* definition!
         */
        BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_GENFS:
                /* Only selected genfs filesystem types qualify. */
                labeled = selinux_is_genfs_special_handling(sb);
                break;

        case SECURITY_FS_USE_XATTR:
        case SECURITY_FS_USE_TRANS:
        case SECURITY_FS_USE_TASK:
        case SECURITY_FS_USE_NATIVE:
                labeled = 1;
                break;

        /* Never allow relabeling on context mounts */
        case SECURITY_FS_USE_MNTPOINT:
        case SECURITY_FS_USE_NONE:
        default:
                labeled = 0;
                break;
        }

        return labeled;
}

/*
 * Complete the SELinux setup of superblock @sb.
 *
 * Steps, in order:
 *  - for SECURITY_FS_USE_XATTR behavior, verify that the root inode supports
 *    xattrs and that reading the SELinux xattr from the root does not fail
 *    with anything other than -ENODATA;
 *  - mark the superblock security structure SE_SBINITIALIZED;
 *  - set or clear SBLABEL_MNT according to selinux_is_sblabel_mnt();
 *  - initialize the label of the root inode;
 *  - initialize the label of every inode queued on sbsec->isec_head.
 *
 * Returns -EOPNOTSUPP when the root inode has no xattr support, the
 * __vfs_getxattr() error when the xattr probe fails, and otherwise the
 * result of inode_doinit_with_dentry() for the root inode.  Results for the
 * queued inodes are not reported.
 */
static int sb_finish_set_opts(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        int rc = 0;

        if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                /* Make sure that the xattr handler exists and that no
                   error other than -ENODATA is returned by getxattr on
                   the root directory.  -ENODATA is ok, as this may be
                   the first boot of the SELinux kernel before we have
                   assigned xattr values to the filesystem. */
                if (!(root_inode->i_opflags & IOP_XATTR)) {
                        pr_warn("SELinux: (dev %s, type %s) has no "
                               "xattr support\n", sb->s_id, sb->s_type->name);
                        rc = -EOPNOTSUPP;
                        goto out;
                }

                /* NULL buffer with size 0: only the return code is of interest. */
                rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0 && rc != -ENODATA) {
                        if (rc == -EOPNOTSUPP)
                                pr_warn("SELinux: (dev %s, type "
                                       "%s) has no security xattr handler\n",
                                       sb->s_id, sb->s_type->name);
                        else
                                pr_warn("SELinux: (dev %s, type "
                                       "%s) getxattr errno %d\n", sb->s_id,
                                       sb->s_type->name, -rc);
                        goto out;
                }
        }

        sbsec->flags |= SE_SBINITIALIZED;

        /*
         * Explicitly set or clear SBLABEL_MNT.  It's not sufficient to simply
         * leave the flag untouched because sb_clone_mnt_opts might be handing
         * us a superblock that needs the flag to be cleared.
         */
        if (selinux_is_sblabel_mnt(sb))
                sbsec->flags |= SBLABEL_MNT;
        else
                sbsec->flags &= ~SBLABEL_MNT;

        /* Initialize the root inode.  This result is what the function returns. */
        rc = inode_doinit_with_dentry(root_inode, root);

        /* Initialize any other inodes associated with the superblock, e.g.
           inodes created prior to initial policy load or inodes created
           during get_sb by a pseudo filesystem that directly
           populates itself. */
        spin_lock(&sbsec->isec_lock);
        while (!list_empty(&sbsec->isec_head)) {
                struct inode_security_struct *isec =
                                list_first_entry(&sbsec->isec_head,
                                           struct inode_security_struct, list);
                struct inode *inode = isec->inode;
                /*
                 * Unlink the entry while still holding the lock, so the
                 * loop makes progress, then drop the lock around the
                 * per-inode work below and retake it before the next
                 * list_empty() test.
                 */
                list_del_init(&isec->list);
                spin_unlock(&sbsec->isec_lock);
                /* igrab() may return NULL; such inodes are skipped. */
                inode = igrab(inode);
                if (inode) {
                        if (!IS_PRIVATE(inode))
                                inode_doinit_with_dentry(inode, NULL);
                        iput(inode);
                }
                spin_lock(&sbsec->isec_lock);
        }
        spin_unlock(&sbsec->isec_lock);
out:
        return rc;
}

/*
 * Decide whether a mount option conflicts with what is already recorded in
 * @sbsec.
 *
 * @sbsec:   superblock security structure holding the current flags
 * @flag:    mount flag bit of the option being processed
 * @old_sid: SID currently recorded for this option
 * @new_sid: SID requested now
 *
 * Returns 1 if the option must be rejected, 0 if it is acceptable.
 */
static int bad_option(struct superblock_security_struct *sbsec, char flag,
                      u32 old_sid, u32 new_sid)
{
        char mnt_flags = sbsec->flags & SE_MNTMASK;

        if (sbsec->flags & SE_SBINITIALIZED) {
                /*
                 * Already initialized by an earlier mount: that mount must
                 * have used this very option with the very same SID.
                 */
                if (!(sbsec->flags & flag))
                        return 1;
                if (old_sid != new_sid)
                        return 1;
                return 0;
        }

        /*
         * Not initialized yet: reject an option that was passed twice,
         * aka someone passed context=a,context=b
         */
        if (mnt_flags & flag)
                return 1;

        return 0;
}

/*
 * Convert the context string @s to a SID stored in *@sid.  @sb is used only
 * to identify the device and filesystem type in the warning logged on
 * failure.  Returns the result of security_context_str_to_sid().
 */
static int parse_sid(struct super_block *sb, const char *s, u32 *sid)
{
        int err;

        err = security_context_str_to_sid(&selinux_state, s, sid, GFP_KERNEL);
        if (!err)
                return 0;

        pr_warn("SELinux: security_context_str_to_sid"
               "(%s) failed for (dev %s, type %s) errno=%d\n",
               s, sb->s_id, sb->s_type->name, err);
        return err;
}

/*
 * Allow filesystems with binary mount data to explicitly set mount point
 * labeling information.
 *
 * @sb:             superblock being set up; its security blob is sb->s_security
 * @mnt_opts:       NULL, or a struct selinux_mnt_opts holding the context
 *                  strings (fscontext/context/rootcontext/defcontext)
 * @kern_flags:     internal flags from the caller; only
 *                  SECURITY_LSM_NATIVE_LABELS is examined here
 * @set_kern_flags: where accepted internal flags are reported back; must be
 *                  non-NULL whenever @kern_flags is non-zero
 *
 * The whole function runs under sbsec->lock.  Returns 0 on success or a
 * negative errno: -EINVAL for options given before the policy is loaded, for
 * kern_flags without set_kern_flags, for a conflicting re-mount and for an
 * unsupported defcontext; -EACCES for contexts on a non-whitelisted user
 * namespace mount; otherwise whatever the called helpers return.
 */
static int selinux_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        const struct cred *cred = current_cred();
        struct superblock_security_struct *sbsec = sb->s_security;
        struct dentry *root = sbsec->sb->s_root;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
        /* A SID of 0 below means "this option was not supplied". */
        u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
        u32 defcontext_sid = 0;
        int rc = 0;

        mutex_lock(&sbsec->lock);

        if (!selinux_initialized(&selinux_state)) {
                if (!opts) {
                        /* Defer initialization until selinux_complete_init,
                           after the initial policy is loaded and the security
                           server is ready to handle calls. */
                        goto out;
                }
                /* Explicit options cannot be honoured without a policy. */
                rc = -EINVAL;
                pr_warn("SELinux: Unable to set superblock options "
                        "before the security server is initialized\n");
                goto out;
        }
        if (kern_flags && !set_kern_flags) {
                /* Specifying internal flags without providing a place to
                 * place the results is not allowed */
                rc = -EINVAL;
                goto out;
        }

        /*
         * Binary mount data FS will come through this function twice.  Once
         * from an explicit call and once from the generic calls from the vfs.
         * Since the generic VFS calls will not contain any security mount data
         * we need to skip the double mount verification.
         *
         * This does open a hole in which we will not notice if the first
         * mount using this sb set explicit options and a second mount using
         * this sb does not set any security options.  (The first options
         * will be used for both mounts)
         */
        if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)
            && !opts)
                goto out;

        root_isec = backing_inode_security_novalidate(root);

        /*
         * parse the mount options, check if they are valid sids.
         * also check if someone is trying to mount the same sb more
         * than once with different security options.
         *
         * Each option's *_MNT bit is set in sbsec->flags as soon as that
         * option passes bad_option().
         * NOTE(review): these bits are not cleared again if a later step in
         * this function fails - confirm callers tolerate that.
         */
        if (opts) {
                if (opts->fscontext) {
                        rc = parse_sid(sb, opts->fscontext, &fscontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                                        fscontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= FSCONTEXT_MNT;
                }
                if (opts->context) {
                        rc = parse_sid(sb, opts->context, &context_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                                        context_sid))
                                goto out_double_mount;
                        sbsec->flags |= CONTEXT_MNT;
                }
                if (opts->rootcontext) {
                        rc = parse_sid(sb, opts->rootcontext, &rootcontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                                        rootcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= ROOTCONTEXT_MNT;
                }
                if (opts->defcontext) {
                        rc = parse_sid(sb, opts->defcontext, &defcontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                                        defcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= DEFCONTEXT_MNT;
                }
        }

        /*
         * Already initialized by an earlier mount: the options were only
         * validated above, nothing further is applied.
         */
        if (sbsec->flags & SE_SBINITIALIZED) {
                /* previously mounted with options, but not on this attempt? */
                if ((sbsec->flags & SE_MNTMASK) && !opts)
                        goto out_double_mount;
                rc = 0;
                goto out;
        }

        /* Filesystem types selected by name get the SE_SBGENFS* flags. */
        if (strcmp(sb->s_type->name, "proc") == 0)
                sbsec->flags |= SE_SBPROC | SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "debugfs") ||
            !strcmp(sb->s_type->name, "tracefs") ||
            !strcmp(sb->s_type->name, "binder") ||
            !strcmp(sb->s_type->name, "bpf") ||
            !strcmp(sb->s_type->name, "pstore"))
                sbsec->flags |= SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "sysfs") ||
            !strcmp(sb->s_type->name, "cgroup") ||
            !strcmp(sb->s_type->name, "cgroup2"))
                sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR;

        if (!sbsec->behavior) {
                /*
                 * Determine the labeling behavior to use for this
                 * filesystem type.
                 */
                rc = security_fs_use(&selinux_state, sb);
                if (rc) {
                        pr_warn("%s: security_fs_use(%s) returned %d\n",
                                        __func__, sb->s_type->name, rc);
                        goto out;
                }
        }

        /*
         * If this is a user namespace mount and the filesystem type is not
         * explicitly whitelisted, then no contexts are allowed on the command
         * line and security labels must be ignored.
         */
        if (sb->s_user_ns != &init_user_ns &&
            strcmp(sb->s_type->name, "tmpfs") &&
            strcmp(sb->s_type->name, "ramfs") &&
            strcmp(sb->s_type->name, "devpts")) {
                if (context_sid || fscontext_sid || rootcontext_sid ||
                    defcontext_sid) {
                        rc = -EACCES;
                        goto out;
                }
                if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                        /*
                         * xattr labeling is replaced by mountpoint labeling,
                         * with the SID computed from the current task's SID.
                         */
                        sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
                        rc = security_transition_sid(&selinux_state,
                                                     current_sid(),
                                                     current_sid(),
                                                     SECCLASS_FILE, NULL,
                                                     &sbsec->mntpoint_sid);
                        if (rc)
                                goto out;
                }
                goto out_set_opts;
        }

        /* sets the context of the superblock for the fs being mounted. */
        if (fscontext_sid) {
                rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
                if (rc)
                        goto out;

                sbsec->sid = fscontext_sid;
        }

        /*
         * Switch to using mount point labeling behavior.
         * sets the label used on all file below the mountpoint, and will set
         * the superblock context if not already set.
         */
        /* Native labels are only used when no context= option was given. */
        if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) {
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (context_sid) {
                if (!fscontext_sid) {
                        /* No fscontext=: context= also labels the superblock. */
                        rc = may_context_mount_sb_relabel(context_sid, sbsec,
                                                          cred);
                        if (rc)
                                goto out;
                        sbsec->sid = context_sid;
                } else {
                        rc = may_context_mount_inode_relabel(context_sid, sbsec,
                                                             cred);
                        if (rc)
                                goto out;
                }
                /* context= also supplies the root label unless rootcontext= did. */
                if (!rootcontext_sid)
                        rootcontext_sid = context_sid;

                sbsec->mntpoint_sid = context_sid;
                sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
        }

        if (rootcontext_sid) {
                rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec,
                                                     cred);
                if (rc)
                        goto out;

                root_isec->sid = rootcontext_sid;
                root_isec->initialized = LABEL_INITIALIZED;
        }

        if (defcontext_sid) {
                /* defcontext= is only accepted with xattr or native labeling. */
                if (sbsec->behavior != SECURITY_FS_USE_XATTR &&
                        sbsec->behavior != SECURITY_FS_USE_NATIVE) {
                        rc = -EINVAL;
                        pr_warn("SELinux: defcontext option is "
                               "invalid for this filesystem type\n");
                        goto out;
                }

                /* The permission check is skipped when the SID is unchanged. */
                if (defcontext_sid != sbsec->def_sid) {
                        rc = may_context_mount_inode_relabel(defcontext_sid,
                                                             sbsec, cred);
                        if (rc)
                                goto out;
                }

                sbsec->def_sid = defcontext_sid;
        }

out_set_opts:
        rc = sb_finish_set_opts(sb);
out:
        /* Common exit: every path above leaves through here with rc set. */
        mutex_unlock(&sbsec->lock);
        return rc;
out_double_mount:
        /* Same superblock mounted again with different security options. */
        rc = -EINVAL;
        pr_warn("SELinux: mount invalid.  Same superblock, different "
               "security settings for (dev %s, type %s)\n", sb->s_id,
               sb->s_type->name);
        goto out;
}

/*
 * Compare the security mount options recorded on two superblocks.
 *
 * Only the SE_MNTMASK bits of the flags are compared, and a SID is compared
 * only when its option bit is set on @oldsb.  The root inode labels are
 * looked up only if everything else matched and ROOTCONTEXT_MNT is set.
 *
 * Returns 0 when the settings agree, or -EBUSY (after logging a warning
 * about @newsb) when they differ.
 */
static int selinux_cmp_sb_context(const struct super_block *oldsb,
                                    const struct super_block *newsb)
{
        struct superblock_security_struct *prev = oldsb->s_security;
        struct superblock_security_struct *cur = newsb->s_security;
        char prev_opts = prev->flags & SE_MNTMASK;
        char cur_opts = cur->flags & SE_MNTMASK;
        int differ;

        differ = prev_opts != cur_opts ||
                 ((prev_opts & FSCONTEXT_MNT) && prev->sid != cur->sid) ||
                 ((prev_opts & CONTEXT_MNT) &&
                  prev->mntpoint_sid != cur->mntpoint_sid) ||
                 ((prev_opts & DEFCONTEXT_MNT) && prev->def_sid != cur->def_sid);

        if (!differ && (prev_opts & ROOTCONTEXT_MNT)) {
                struct inode_security_struct *prev_root = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *cur_root = backing_inode_security(newsb->s_root);

                differ = prev_root->sid != cur_root->sid;
        }

        if (!differ)
                return 0;

        pr_warn("SELinux: mount invalid.  Same superblock, different security settings for (dev %s, type %s)\n",
                newsb->s_id, newsb->s_type->name);
        return -EBUSY;
}

/*
 * Copy the SELinux mount state of @oldsb onto @newsb.
 *
 * @oldsb:          source superblock; must already be SE_SBINITIALIZED
 * @newsb:          destination superblock
 * @kern_flags:     internal flags; only SECURITY_LSM_NATIVE_LABELS is examined
 * @set_kern_flags: where accepted internal flags are reported back; must be
 *                  non-NULL whenever @kern_flags is non-zero
 *
 * If @newsb is already initialized nothing is copied and the two superblocks
 * are compared with selinux_cmp_sb_context() instead.  Returns 0 on success
 * or a negative errno.  The result of sb_finish_set_opts() is not propagated.
 */
static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
                                        struct super_block *newsb,
                                        unsigned long kern_flags,
                                        unsigned long *set_kern_flags)
{
        const struct superblock_security_struct *src = oldsb->s_security;
        struct superblock_security_struct *dst = newsb->s_security;
        int has_fscontext = (src->flags & FSCONTEXT_MNT);
        int has_context = (src->flags & CONTEXT_MNT);
        int has_rootcontext = (src->flags & ROOTCONTEXT_MNT);
        int want_native;
        int ret = 0;

        /*
         * if the parent was able to be mounted it clearly had no special lsm
         * mount options.  thus we can safely deal with this superblock later
         */
        if (!selinux_initialized(&selinux_state))
                return 0;

        /* Internal flags need somewhere to report the result. */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        /* Cloning from a superblock that was never set up makes no sense. */
        BUG_ON(!(src->flags & SE_SBINITIALIZED));

        /* Native labels are honoured only when no context= mount was used. */
        want_native = (kern_flags & SECURITY_LSM_NATIVE_LABELS) && !has_context;

        /* A reused superblock is only checked for matching contexts. */
        if (dst->flags & SE_SBINITIALIZED) {
                if (want_native)
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                return selinux_cmp_sb_context(oldsb, newsb);
        }

        mutex_lock(&dst->lock);

        dst->flags = src->flags;

        dst->sid = src->sid;
        dst->def_sid = src->def_sid;
        dst->behavior = src->behavior;

        /*
         * The source used native labels but this mount did not ask for
         * them: look the labeling behavior up again for the new superblock.
         */
        if (dst->behavior == SECURITY_FS_USE_NATIVE &&
            !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !has_context) {
                ret = security_fs_use(&selinux_state, newsb);
                if (ret)
                        goto unlock;
        }

        if (want_native) {
                dst->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (has_context) {
                u32 ctx = src->mntpoint_sid;

                if (!has_fscontext)
                        dst->sid = ctx;
                if (!has_rootcontext) {
                        struct inode_security_struct *dst_root = backing_inode_security(newsb->s_root);

                        dst_root->sid = ctx;
                }
                dst->mntpoint_sid = ctx;
        }
        if (has_rootcontext) {
                const struct inode_security_struct *src_root = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *dst_root = backing_inode_security(newsb->s_root);

                dst_root->sid = src_root->sid;
        }

        sb_finish_set_opts(newsb);
unlock:
        mutex_unlock(&dst->lock);
        return ret;
}

/*
 * Record one parsed mount option in the selinux_mnt_opts behind @mnt_opts.
 *
 * @token:    Opt_* value identifying the option
 * @s:        option value; the pointer itself is stored, not a copy
 * @mnt_opts: in/out pointer to the option structure, allocated here if NULL
 *
 * Returns 0 on success, -ENOMEM if @s is NULL or the allocation fails, and
 * -EINVAL if the option repeats or conflicts with one already stored
 * (context and defcontext exclude each other).  On -EINVAL a structure
 * allocated by this very call is freed again and *@mnt_opts reset to NULL.
 */
static int selinux_add_opt(int token, const char *s, void **mnt_opts)
{
        struct selinux_mnt_opts *opts = *mnt_opts;
        bool fresh = false;

        /* seclabel is eaten and completely ignored */
        if (token == Opt_seclabel)
                return 0;

        if (!s)
                return -ENOMEM;

        if (!opts) {
                opts = kzalloc(sizeof(*opts), GFP_KERNEL);
                if (!opts)
                        return -ENOMEM;
                *mnt_opts = opts;
                fresh = true;
        }

        /* Each case returns on success; leaving the switch means a conflict. */
        switch (token) {
        case Opt_context:
                if (!opts->context && !opts->defcontext) {
                        opts->context = s;
                        return 0;
                }
                break;
        case Opt_fscontext:
                if (!opts->fscontext) {
                        opts->fscontext = s;
                        return 0;
                }
                break;
        case Opt_rootcontext:
                if (!opts->rootcontext) {
                        opts->rootcontext = s;
                        return 0;
                }
                break;
        case Opt_defcontext:
                if (!opts->context && !opts->defcontext) {
                        opts->defcontext = s;
                        return 0;
                }
                break;
        default:
                /* Any other token stores nothing and is not an error. */
                return 0;
        }

        if (fresh) {
                kfree(opts);
                *mnt_opts = NULL;
        }
        pr_warn(SEL_MOUNT_FAIL_MSG);
        return -EINVAL;
}

/*
 * Look @option up in the tokens[] table and add its value to @mnt_opts.
 *
 * @option:   option name
 * @val:      option value, @len bytes long; duplicated with kmemdup_nul()
 *            for every token except Opt_seclabel
 * @len:      length of @val
 * @mnt_opts: in/out pointer to the option structure
 *
 * Returns 0 on success, -EINVAL for an unknown option name, -ENOMEM if the
 * value cannot be duplicated, or the error from selinux_add_opt().  On the
 * last two failures any existing *@mnt_opts is freed and reset to NULL.
 */
static int selinux_add_mnt_opt(const char *option, const char *val, int len,
                               void **mnt_opts)
{
        const char *arg = val;
        int token = Opt_error;
        int rc, i;

        for (i = 0; i < ARRAY_SIZE(tokens); i++) {
                if (strcmp(option, tokens[i].name))
                        continue;
                token = tokens[i].opt;
                break;
        }

        if (token == Opt_error)
                return -EINVAL;

        if (token != Opt_seclabel) {
                arg = kmemdup_nul(val, len, GFP_KERNEL);
                if (!arg) {
                        rc = -ENOMEM;
                        goto free_opt;
                }
        }

        rc = selinux_add_opt(token, arg, mnt_opts);
        if (unlikely(rc))
                goto free_val;
        return rc;

free_val:
        kfree(arg);
free_opt:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Write "=<context>" for @sid to the seq_file @m.
 *
 * The context is wrapped in double quotes when it contains a comma, and
 * double quote, newline and backslash characters are escaped.  Returns the
 * result of security_sid_to_context(); nothing is written when it fails.
 */
static int show_sid(struct seq_file *m, u32 sid)
{
        char *ctx = NULL;
        u32 ctx_len;
        bool quote;
        int rc;

        rc = security_sid_to_context(&selinux_state, sid, &ctx, &ctx_len);
        if (rc)
                goto done;

        quote = ctx && strchr(ctx, ',');

        seq_putc(m, '=');
        if (quote)
                seq_putc(m, '"');
        seq_escape(m, ctx, "\"\n\\");
        if (quote)
                seq_putc(m, '"');
done:
        kfree(ctx);
        return rc;
}

static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED))
                return 0;

        if (!selinux_initialized(&selinux_state))
                return 0;

        if (sbsec->flags & FSCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, FSCONTEXT_STR);
                rc = show_sid(m, sbsec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & CONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, CONTEXT_STR);
                rc = show_sid(m, sbsec->mntpoint_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & DEFCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, DEFCONTEXT_STR);
                rc = show_sid(m, sbsec->def_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & ROOTCONTEXT_MNT) {
                struct dentry *root = sbsec->sb->s_root;
                struct inode_security_struct *isec = backing_inode_security(root);
                seq_putc(m, ',');
                seq_puts(m, ROOTCONTEXT_STR);
                rc = show_sid(m, isec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & SBLABEL_MNT) {
                seq_putc(m, ',');
                seq_puts(m, SECLABEL_STR);
        }
        return 0;
}

/*
 * Map the file-type bits of @mode to an SELinux security class.
 * Anything that is not one of the listed file types maps to SECCLASS_FILE.
 */
static inline u16 inode_mode_to_security_class(umode_t mode)
{
        u16 sclass;

        switch (mode & S_IFMT) {
        case S_IFSOCK:
                sclass = SECCLASS_SOCK_FILE;
                break;
        case S_IFLNK:
                sclass = SECCLASS_LNK_FILE;
                break;
        case S_IFBLK:
                sclass = SECCLASS_BLK_FILE;
                break;
        case S_IFDIR:
                sclass = SECCLASS_DIR;
                break;
        case S_IFCHR:
                sclass = SECCLASS_CHR_FILE;
                break;
        case S_IFIFO:
                sclass = SECCLASS_FIFO_FILE;
                break;
        case S_IFREG:
        default:
                sclass = SECCLASS_FILE;
                break;
        }

        return sclass;
}

/* Return 1 if @protocol is IPPROTO_IP or IPPROTO_TCP, 0 otherwise. */
static inline int default_protocol_stream(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_TCP:
                return 1;
        default:
                return 0;
        }
}

/* Return 1 if @protocol is IPPROTO_IP or IPPROTO_UDP, 0 otherwise. */
static inline int default_protocol_dgram(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_UDP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Map a socket's (family, type, protocol) triple to an SELinux security
 * class.
 *
 * The first switch handles the families that are always distinguished.  The
 * second switch is consulted only when selinux_policycap_extsockclass()
 * returns non-zero; the same value also gates the SCTP and ICMP classes
 * inside the PF_INET/PF_INET6 case.  Anything that matches neither switch
 * is reported as SECCLASS_SOCKET.
 */
static inline u16 socket_type_to_security_class(int family, int type, int protocol)
{
        int extsockclass = selinux_policycap_extsockclass();

        switch (family) {
        case PF_UNIX:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        return SECCLASS_UNIX_STREAM_SOCKET;
                case SOCK_DGRAM:
                case SOCK_RAW:
                        return SECCLASS_UNIX_DGRAM_SOCKET;
                }
                /* Other PF_UNIX types fall through to the generic handling. */
                break;
        case PF_INET:
        case PF_INET6:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        if (default_protocol_stream(protocol))
                                return SECCLASS_TCP_SOCKET;
                        else if (extsockclass && protocol == IPPROTO_SCTP)
                                return SECCLASS_SCTP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DGRAM:
                        if (default_protocol_dgram(protocol))
                                return SECCLASS_UDP_SOCKET;
                        else if (extsockclass && (protocol == IPPROTO_ICMP ||
                                                  protocol == IPPROTO_ICMPV6))
                                return SECCLASS_ICMP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DCCP:
                        return SECCLASS_DCCP_SOCKET;
                default:
                        return SECCLASS_RAWIP_SOCKET;
                }
                /* Not reached: every branch of the inner switch returns. */
                break;
        case PF_NETLINK:
                /* Netlink sockets are classified by protocol, not by type. */
                switch (protocol) {
                case NETLINK_ROUTE:
                        return SECCLASS_NETLINK_ROUTE_SOCKET;
                case NETLINK_SOCK_DIAG:
                        return SECCLASS_NETLINK_TCPDIAG_SOCKET;
                case NETLINK_NFLOG:
                        return SECCLASS_NETLINK_NFLOG_SOCKET;
                case NETLINK_XFRM:
                        return SECCLASS_NETLINK_XFRM_SOCKET;
                case NETLINK_SELINUX:
                        return SECCLASS_NETLINK_SELINUX_SOCKET;
                case NETLINK_ISCSI:
                        return SECCLASS_NETLINK_ISCSI_SOCKET;
                case NETLINK_AUDIT:
                        return SECCLASS_NETLINK_AUDIT_SOCKET;
                case NETLINK_FIB_LOOKUP:
                        return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET;
                case NETLINK_CONNECTOR:
                        return SECCLASS_NETLINK_CONNECTOR_SOCKET;
                case NETLINK_NETFILTER:
                        return SECCLASS_NETLINK_NETFILTER_SOCKET;
                case NETLINK_DNRTMSG:
                        return SECCLASS_NETLINK_DNRT_SOCKET;
                case NETLINK_KOBJECT_UEVENT:
                        return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET;
                case NETLINK_GENERIC:
                        return SECCLASS_NETLINK_GENERIC_SOCKET;
                case NETLINK_SCSITRANSPORT:
                        return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET;
                case NETLINK_RDMA:
                        return SECCLASS_NETLINK_RDMA_SOCKET;
                case NETLINK_CRYPTO:
                        return SECCLASS_NETLINK_CRYPTO_SOCKET;
                default:
                        return SECCLASS_NETLINK_SOCKET;
                }
                /* No break needed: the inner switch always returns. */
        case PF_PACKET:
                return SECCLASS_PACKET_SOCKET;
        case PF_KEY:
                return SECCLASS_KEY_SOCKET;
        case PF_APPLETALK:
                return SECCLASS_APPLETALK_SOCKET;
        }

        /* Per-family classes that are only used with extsockclass enabled. */
        if (extsockclass) {
                switch (family) {
                case PF_AX25:
                        return SECCLASS_AX25_SOCKET;
                case PF_IPX:
                        return SECCLASS_IPX_SOCKET;
                case PF_NETROM:
                        return SECCLASS_NETROM_SOCKET;
                case PF_ATMPVC:
                        return SECCLASS_ATMPVC_SOCKET;
                case PF_X25:
                        return SECCLASS_X25_SOCKET;
                case PF_ROSE:
                        return SECCLASS_ROSE_SOCKET;
                case PF_DECnet:
                        return SECCLASS_DECNET_SOCKET;
                case PF_ATMSVC:
                        return SECCLASS_ATMSVC_SOCKET;
                case PF_RDS:
                        return SECCLASS_RDS_SOCKET;
                case PF_IRDA:
                        return SECCLASS_IRDA_SOCKET;
                case PF_PPPOX:
                        return SECCLASS_PPPOX_SOCKET;
                case PF_LLC:
                        return SECCLASS_LLC_SOCKET;
                case PF_CAN:
                        return SECCLASS_CAN_SOCKET;
                case PF_TIPC:
                        return SECCLASS_TIPC_SOCKET;
                case PF_BLUETOOTH:
                        return SECCLASS_BLUETOOTH_SOCKET;
                case PF_IUCV:
                        return SECCLASS_IUCV_SOCKET;
                case PF_RXRPC:
                        return SECCLASS_RXRPC_SOCKET;
                case PF_ISDN:
                        return SECCLASS_ISDN_SOCKET;
                case PF_PHONET:
                        return SECCLASS_PHONET_SOCKET;
                case PF_IEEE802154:
                        return SECCLASS_IEEE802154_SOCKET;
                case PF_CAIF:
                        return SECCLASS_CAIF_SOCKET;
                case PF_ALG:
                        return SECCLASS_ALG_SOCKET;
                case PF_NFC:
                        return SECCLASS_NFC_SOCKET;
                case PF_VSOCK:
                        return SECCLASS_VSOCK_SOCKET;
                case PF_KCM:
                        return SECCLASS_KCM_SOCKET;
                case PF_QIPCRTR:
                        return SECCLASS_QIPCRTR_SOCKET;
                case PF_SMC:
                        return SECCLASS_SMC_SOCKET;
                case PF_XDP:
                        return SECCLASS_XDP_SOCKET;
/*
 * Build-time guard: compilation fails once PF_MAX grows past 45, so that a
 * newly added address family cannot be missed in the list above.
 */
#if PF_MAX > 45
#error New address family defined, please update this function.
#endif
                }
        }

        /* Fallback class for everything not matched above. */
        return SECCLASS_SOCKET;
}

/*
 * Look up the SID for @dentry from its path within the filesystem.
 *
 * @dentry: dentry whose path is passed to security_genfs_sid()
 * @tclass: security class handed to the lookup
 * @flags:  superblock flags; SE_SBPROC enables stripping of a leading
 *          numeric path component
 * @sid:    receives the resulting SID
 *
 * Returns 0 on success (a lookup that fails with -ENOENT yields
 * SECINITSID_UNLABELED and still counts as success), -ENOMEM if the scratch
 * page cannot be allocated, or another negative errno from
 * dentry_path_raw() or security_genfs_sid().
 */
static int selinux_genfs_get_sid(struct dentry *dentry,
                                 u16 tclass,
                                 u16 flags,
                                 u32 *sid)
{
        char *page, *name;
        int ret;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        name = dentry_path_raw(dentry, page, PAGE_SIZE);
        if (IS_ERR(name)) {
                ret = PTR_ERR(name);
                goto release;
        }

        if (flags & SE_SBPROC) {
                /*
                 * each process gets a /proc/PID/ entry. Strip off the
                 * PID part to get a valid selinux labeling.
                 * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs
                 */
                for (; name[1] >= '0' && name[1] <= '9'; name++)
                        name[1] = '/';
        }

        ret = security_genfs_sid(&selinux_state, dentry->d_sb->s_type->name,
                                 name, tclass, sid);
        if (ret == -ENOENT) {
                /* No match in policy, mark as unlabeled. */
                *sid = SECINITSID_UNLABELED;
                ret = 0;
        }

release:
        free_page((unsigned long)page);
        return ret;
}

/*
 * Read the XATTR_NAME_SELINUX extended attribute of @inode and convert it
 * to a SID.
 *
 * @inode:   inode being labeled
 * @dentry:  dentry passed to __vfs_getxattr() together with @inode
 * @def_sid: SID stored in *@sid when the attribute is absent (-ENODATA); it
 *           is also passed to security_context_to_sid_default()
 * @sid:     receives the resulting SID
 *
 * Returns 0 when the attribute was read or is absent, -ENOMEM on allocation
 * failure, or the negative errno from __vfs_getxattr() for any failure other
 * than -ENODATA.  A context that cannot be converted is only logged: the
 * function still returns 0 in that case.
 */
static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
                                  u32 def_sid, u32 *sid)
{
#define INITCONTEXTLEN 255
        char *context;
        unsigned int len;
        int rc;

        /* First attempt with a fixed-size buffer. */
        len = INITCONTEXTLEN;
        context = kmalloc(len + 1, GFP_NOFS);
        if (!context)
                return -ENOMEM;

        /* Keep the buffer NUL-terminated so it can be logged with %s. */
        context[len] = '\0';
        rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len);
        if (rc == -ERANGE) {
                kfree(context);

                /* Need a larger buffer.  Query for the right size. */
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0)
                        return rc;

                len = rc;
                context = kmalloc(len + 1, GFP_NOFS);
                if (!context)
                        return -ENOMEM;

                context[len] = '\0';
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX,
                                    context, len);
        }
        if (rc < 0) {
                kfree(context);
                if (rc != -ENODATA) {
                        /* i_ino is unsigned long, so it is printed with %lu. */
                        pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%lu\n",
                                __func__, -rc, inode->i_sb->s_id, inode->i_ino);
                        return rc;
                }
                /* No attribute on this inode: fall back to the default SID. */
                *sid = def_sid;
                return 0;
        }

        /* rc now holds the number of bytes returned by __vfs_getxattr(). */
        rc = security_context_to_sid_default(&selinux_state, context, rc, sid,
                                             def_sid, GFP_NOFS);
        if (rc) {
                char *dev = inode->i_sb->s_id;
                unsigned long ino = inode->i_ino;

                if (rc == -EINVAL) {
                        pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
                                              ino, dev, context);
                } else {
                        pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%lu\n",
                                __func__, context, -rc, dev, ino);
                }
        }
        kfree(context);
        /* A conversion failure is reported above but not returned. */
        return 0;
}

/*
 * The inode's security attributes must be initialized before first use.
 *
 * Determine the SID for @inode according to the labeling behavior of its
 * superblock (sbsec->behavior) and record it in the inode security blob.
 *
 * @inode:      inode whose label is being set up.
 * @opt_dentry: optional dentry for @inode; when NULL an alias is looked
 *              up for the code paths that need one.
 *
 * Returns 0 when the label is already initialized, when initialization
 * was deferred because the superblock is not yet set up, when no dentry
 * could be found (label left LABEL_INVALID), or on success.  Otherwise
 * returns the error reported by the helper that failed.
 */
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
{
        struct superblock_security_struct *sbsec = NULL;
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 task_sid, sid = 0;
        u16 sclass;
        struct dentry *dentry;
        int rc = 0;

        /* Unlocked fast path; the state is checked again under the lock. */
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;

        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_INITIALIZED)
                goto out_unlock;

        if (isec->sclass == SECCLASS_FILE)
                isec->sclass = inode_mode_to_security_class(inode->i_mode);

        sbsec = inode->i_sb->s_security;
        if (!(sbsec->flags & SE_SBINITIALIZED)) {
                /* Defer initialization until selinux_complete_init,
                   after the initial policy is loaded and the security
                   server is ready to handle calls. */
                spin_lock(&sbsec->isec_lock);
                if (list_empty(&isec->list))
                        list_add(&isec->list, &sbsec->isec_head);
                spin_unlock(&sbsec->isec_lock);
                goto out_unlock;
        }

        /*
         * Snapshot the fields needed below and mark the label as being
         * computed, then drop isec->lock for the duration of the lookup.
         */
        sclass = isec->sclass;
        task_sid = isec->task_sid;
        sid = isec->sid;
        isec->initialized = LABEL_PENDING;
        spin_unlock(&isec->lock);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_NATIVE:
                /* Keep the SID that was already stored in the blob. */
                break;
        case SECURITY_FS_USE_XATTR:
                if (!(inode->i_opflags & IOP_XATTR)) {
                        sid = sbsec->def_sid;
                        break;
                }
                /* Need a dentry, since the xattr API requires one.
                   Life would be simpler if we could just pass the inode. */
                if (opt_dentry) {
                        /* Called from d_instantiate or d_splice_alias. */
                        dentry = dget(opt_dentry);
                } else {
                        /*
                         * Called from selinux_complete_init, try to find a dentry.
                         * Some filesystems really want a connected one, so try
                         * that first.  We could split SECURITY_FS_USE_XATTR in
                         * two, depending upon that...
                         */
                        dentry = d_find_alias(inode);
                        if (!dentry)
                                dentry = d_find_any_alias(inode);
                }
                if (!dentry) {
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as these
                         * will get fixed up the next time we go through
                         * inode_doinit with a dentry, before these inodes could
                         * be used again by userspace.
                         */
                        goto out_invalid;
                }

                rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid,
                                            &sid);
                /* The dentry reference is only needed for the xattr read. */
                dput(dentry);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_TASK:
                sid = task_sid;
                break;
        case SECURITY_FS_USE_TRANS:
                /* Default to the fs SID. */
                sid = sbsec->sid;

                /* Try to obtain a transition SID. */
                rc = security_transition_sid(&selinux_state, task_sid, sid,
                                             sclass, NULL, &sid);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_MNTPOINT:
                sid = sbsec->mntpoint_sid;
                break;
        default:
                /* Default to the fs superblock SID. */
                sid = sbsec->sid;

                if ((sbsec->flags & SE_SBGENFS) &&
                     (!S_ISLNK(inode->i_mode) ||
                      selinux_policycap_genfs_seclabel_symlinks())) {
                        /* We must have a dentry to determine the label on
                         * procfs inodes */
                        if (opt_dentry) {
                                /* Called from d_instantiate or
                                 * d_splice_alias. */
                                dentry = dget(opt_dentry);
                        } else {
                                /* Called from selinux_complete_init, try to
                                 * find a dentry.  Some filesystems really want
                                 * a connected one, so try that first.
                                 */
                                dentry = d_find_alias(inode);
                                if (!dentry)
                                        dentry = d_find_any_alias(inode);
                        }
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as
                         * these will get fixed up the next time we go through
                         * inode_doinit() with a dentry, before these inodes
                         * could be used again by userspace.
                         */
                        if (!dentry)
                                goto out_invalid;
                        rc = selinux_genfs_get_sid(dentry, sclass,
                                                   sbsec->flags, &sid);
                        if (rc) {
                                dput(dentry);
                                goto out;
                        }

                        /*
                         * With SE_SBGENFS_XATTR the genfs SID just obtained
                         * is passed as the default for the xattr lookup.
                         */
                        if ((sbsec->flags & SE_SBGENFS_XATTR) &&
                            (inode->i_opflags & IOP_XATTR)) {
                                rc = inode_doinit_use_xattr(inode, dentry,
                                                            sid, &sid);
                                if (rc) {
                                        dput(dentry);
                                        goto out;
                                }
                        }
                        dput(dentry);
                }
                break;
        }

out:
        /*
         * Publish the result only if the label is still LABEL_PENDING, i.e.
         * nothing changed isec->initialized while the lock was dropped.
         */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                if (rc) {
                        isec->initialized = LABEL_INVALID;
                        goto out_unlock;
                }
                isec->initialized = LABEL_INITIALIZED;
                isec->sid = sid;
        }

out_unlock:
        spin_unlock(&isec->lock);
        return rc;

out_invalid:
        /*
         * No dentry was available: leave the label invalid, store the SID
         * computed so far, and report success to the caller.
         */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                isec->initialized = LABEL_INVALID;
                isec->sid = sid;
        }
        spin_unlock(&isec->lock);
        return 0;
}

/*
 * Translate a Linux signal number into the process-class permission
 * that is checked for that signal.
 */
static inline u32 signal_to_av(int sig)
{
        switch (sig) {
        case SIGCHLD:
                /* Commonly granted from child to parent. */
                return PROCESS__SIGCHLD;
        case SIGKILL:
                /* Cannot be caught or ignored */
                return PROCESS__SIGKILL;
        case SIGSTOP:
                /* Cannot be caught or ignored */
                return PROCESS__SIGSTOP;
        default:
                /* Every remaining signal shares the generic permission. */
                return PROCESS__SIGNAL;
        }
}

#if CAP_LAST_CAP > 63
#error Fix SELinux to handle capabilities > 63.
#endif

/*
 * Check whether the credentials @cred may use capability @cap.
 *
 * @opts:   CAP_OPT_* flags; CAP_OPT_NOAUDIT suppresses the audit step.
 * @initns: selects the init-namespace capability classes when true and
 *          the *_USERNS classes otherwise.
 *
 * Returns the result of the permission lookup, unless the audit step
 * itself reports a non-zero value, in which case that value is returned.
 */
static int cred_has_capability(const struct cred *cred,
                               int cap, unsigned int opts, bool initns)
{
        struct common_audit_data ad;
        struct av_decision avd;
        u16 sclass;
        u32 sid = cred_sid(cred);
        u32 av = CAP_TO_MASK(cap);
        int rc, audit_rc;

        /* The capability index picks which pair of classes applies. */
        switch (CAP_TO_INDEX(cap)) {
        case 0:
                if (initns)
                        sclass = SECCLASS_CAPABILITY;
                else
                        sclass = SECCLASS_CAP_USERNS;
                break;
        case 1:
                if (initns)
                        sclass = SECCLASS_CAPABILITY2;
                else
                        sclass = SECCLASS_CAP2_USERNS;
                break;
        default:
                pr_err("SELinux:  out of range capability %d\n", cap);
                BUG();
                return -EINVAL;
        }

        ad.type = LSM_AUDIT_DATA_CAP;
        ad.u.cap = cap;

        /* The subject SID is used as both source and target. */
        rc = avc_has_perm_noaudit(&selinux_state,
                                  sid, sid, sclass, av, 0, &avd);
        if (opts & CAP_OPT_NOAUDIT)
                return rc;

        audit_rc = avc_audit(&selinux_state,
                             sid, sid, sclass, av, &avd, rc, &ad, 0);
        return audit_rc ? audit_rc : rc;
}

/*
 * Check whether @cred holds @perms on @inode.
 *
 * @adp is optional and lets the caller hand in extra audit data (e.g. the
 * dentry).  Inodes for which IS_PRIVATE() is true are allowed without any
 * policy lookup.
 */
static int inode_has_perm(const struct cred *cred,
                          struct inode *inode,
                          u32 perms,
                          struct common_audit_data *adp)
{
        struct inode_security_struct *obj_sec;
        u32 subj_sid;

        validate_creds(cred);

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        subj_sid = cred_sid(cred);
        obj_sec = selinux_inode(inode);

        return avc_has_perm(&selinux_state, subj_sid,
                            obj_sec->sid, obj_sec->sclass,
                            perms, adp);
}

/*
 * Variant of inode_has_perm() for callers that hold a dentry: the dentry
 * is placed in the audit data so the auditing code can generate the
 * pathname more easily, and the inode label is revalidated first.
 */
static inline int dentry_has_perm(const struct cred *cred,
                                  struct dentry *dentry,
                                  u32 av)
{
        struct common_audit_data audit;
        struct inode *target;

        audit.type = LSM_AUDIT_DATA_DENTRY;
        audit.u.dentry = dentry;

        target = d_backing_inode(dentry);
        __inode_security_revalidate(target, dentry, true);

        return inode_has_perm(cred, target, av, &audit);
}

/*
 * Variant of inode_has_perm() for callers that hold a struct path: the
 * path is copied into the audit data so the auditing code can generate
 * the pathname more easily, and the inode label is revalidated first.
 */
static inline int path_has_perm(const struct cred *cred,
                                const struct path *path,
                                u32 av)
{
        struct common_audit_data audit;
        struct inode *target;

        audit.type = LSM_AUDIT_DATA_PATH;
        audit.u.path = *path;

        target = d_backing_inode(path->dentry);
        __inode_security_revalidate(target, path->dentry, true);

        return inode_has_perm(cred, target, av, &audit);
}

/*
 * Like path_has_perm(), but the inode is taken from @file and the file
 * itself is recorded as the audit data.
 */
static inline int file_path_has_perm(const struct cred *cred,
                                     struct file *file,
                                     u32 av)
{
        struct common_audit_data audit;
        struct inode *target = file_inode(file);

        audit.type = LSM_AUDIT_DATA_FILE;
        audit.u.file = file;

        return inode_has_perm(cred, target, av, &audit);
}

#ifdef CONFIG_BPF_SYSCALL
static int bpf_fd_pass(struct file *file, u32 sid);
#endif

/*
 * Check whether @cred may use the open file @file to access its inode
 * with permissions @av.
 *
 * Access to the descriptor itself is checked first (FD__USE), and is
 * skipped when the descriptor carries the same SID as @cred.  With
 * CONFIG_BPF_SYSCALL the file is additionally run through bpf_fd_pass().
 * Finally inode_has_perm() checks @av against the inode; if @av is zero
 * that last step is skipped, e.g. for operations such as seek that only
 * affect the descriptor.
 *
 * Returns 0 if every check passes, otherwise the first non-zero result.
 */
static int file_has_perm(const struct cred *cred,
                         struct file *file,
                         u32 av)
{
        struct file_security_struct *fd_sec = selinux_file(file);
        struct common_audit_data ad;
        u32 subj_sid = cred_sid(cred);
        int rc;

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        if (subj_sid != fd_sec->sid) {
                rc = avc_has_perm(&selinux_state,
                                  subj_sid, fd_sec->sid,
                                  SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, cred_sid(cred));
        if (rc)
                return rc;
#endif

        /* av is zero if only checking access to the descriptor. */
        if (!av)
                return 0;

        return inode_has_perm(cred, file_inode(file), av, &ad);
}

/*
 * Determine the label for an inode that might be unioned.
 *
 * The SID for a new object of class @tclass named @name under @dir is
 * written to *@_new_isid, chosen in this order:
 *   1. the superblock's mountpoint SID, when the superblock is
 *      initialized and uses SECURITY_FS_USE_MNTPOINT;
 *   2. the task's create_sid, when the superblock has SBLABEL_MNT set
 *      and create_sid is non-zero;
 *   3. the result of security_transition_sid() against the directory.
 *
 * Returns 0 for the first two cases, otherwise whatever
 * security_transition_sid() returns.
 */
static int
selinux_determine_inode_label(const struct task_security_struct *tsec,
                                 struct inode *dir,
                                 const struct qstr *name, u16 tclass,
                                 u32 *_new_isid)
{
        const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        const struct inode_security_struct *dir_sec;

        if ((sbsec->flags & SE_SBINITIALIZED) &&
            sbsec->behavior == SECURITY_FS_USE_MNTPOINT) {
                *_new_isid = sbsec->mntpoint_sid;
                return 0;
        }

        if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) {
                *_new_isid = tsec->create_sid;
                return 0;
        }

        /* The directory label is only looked up when actually needed. */
        dir_sec = inode_security(dir);
        return security_transition_sid(&selinux_state, tsec->sid,
                                       dir_sec->sid, tclass,
                                       name, _new_isid);
}

/*
 * Check whether the current task may create an object of class @tclass
 * at @dentry inside @dir.
 *
 * Three checks are made in sequence, stopping at the first failure:
 * add_name/search on the directory, create on the new object's SID, and
 * associate between that SID and the filesystem.
 */
static int may_create(struct inode *dir,
                      struct dentry *dentry,
                      u16 tclass)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *dir_sec = inode_security(dir);
        struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        u32 subj_sid = tsec->sid;
        u32 obj_sid;
        struct common_audit_data ad;
        int rc;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        /* The task must be able to search and add a name to the parent. */
        rc = avc_has_perm(&selinux_state,
                          subj_sid, dir_sec->sid, SECCLASS_DIR,
                          DIR__ADD_NAME | DIR__SEARCH, &ad);
        if (rc)
                return rc;

        /* Work out which SID the new object would receive. */
        rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass,
                                           &obj_sid);
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state,
                          subj_sid, obj_sid, tclass, FILE__CREATE, &ad);
        if (rc)
                return rc;

        /* Finally check the new SID against the filesystem's SID. */
        return avc_has_perm(&selinux_state,
                            obj_sid, sbsec->sid,
                            SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE, &ad);
}

#define MAY_LINK        0
#define MAY_UNLINK        1
#define MAY_RMDIR        2

/*
 * Check whether the current task may link, unlink, or rmdir @dentry
 * within @dir; @kind is one of MAY_LINK, MAY_UNLINK or MAY_RMDIR.
 *
 * The directory is checked first (search plus add_name for a zero @kind,
 * remove_name otherwise), then the object itself.  An unrecognized @kind
 * is logged and allowed once the directory check has passed.
 */
static int may_link(struct inode *dir,
                    struct dentry *dentry,
                    int kind)
{
        struct inode_security_struct *dir_sec, *obj_sec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dir_av, obj_av;
        int rc;

        dir_sec = inode_security(dir);
        obj_sec = backing_inode_security(dentry);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        dir_av = DIR__SEARCH;
        if (kind)
                dir_av |= DIR__REMOVE_NAME;
        else
                dir_av |= DIR__ADD_NAME;

        rc = avc_has_perm(&selinux_state,
                          sid, dir_sec->sid, SECCLASS_DIR, dir_av, &ad);
        if (rc)
                return rc;

        if (kind == MAY_LINK) {
                obj_av = FILE__LINK;
        } else if (kind == MAY_UNLINK) {
                obj_av = FILE__UNLINK;
        } else if (kind == MAY_RMDIR) {
                obj_av = DIR__RMDIR;
        } else {
                pr_warn("SELinux: %s:  unrecognized kind %d\n",
                        __func__, kind);
                return 0;
        }

        return avc_has_perm(&selinux_state,
                            sid, obj_sec->sid, obj_sec->sclass, obj_av, &ad);
}

/*
 * Check whether the current task may rename @old_dentry in @old_dir to
 * @new_dentry in @new_dir.
 *
 * Checks, in order and stopping at the first failure:
 *  - remove_name/search on the source directory;
 *  - rename on the object being moved;
 *  - reparent on it, when it is a directory changing parents;
 *  - add_name/search on the destination directory, plus remove_name when
 *    the destination dentry is positive;
 *  - rmdir or unlink on an existing destination object.
 */
static inline int may_rename(struct inode *old_dir,
                             struct dentry *old_dentry,
                             struct inode *new_dir,
                             struct dentry *new_dentry)
{
        struct inode_security_struct *src_dir_sec, *dst_dir_sec;
        struct inode_security_struct *src_sec, *dst_sec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dst_dir_av;
        int moving_dir;
        int rc;

        src_dir_sec = inode_security(old_dir);
        src_sec = backing_inode_security(old_dentry);
        moving_dir = d_is_dir(old_dentry);
        dst_dir_sec = inode_security(new_dir);

        ad.type = LSM_AUDIT_DATA_DENTRY;

        /* Source-side checks are audited against the old dentry. */
        ad.u.dentry = old_dentry;
        rc = avc_has_perm(&selinux_state,
                          sid, src_dir_sec->sid, SECCLASS_DIR,
                          DIR__REMOVE_NAME | DIR__SEARCH, &ad);
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state,
                          sid, src_sec->sid,
                          src_sec->sclass, FILE__RENAME, &ad);
        if (rc)
                return rc;

        if (moving_dir && new_dir != old_dir) {
                rc = avc_has_perm(&selinux_state,
                                  sid, src_sec->sid,
                                  src_sec->sclass, DIR__REPARENT, &ad);
                if (rc)
                        return rc;
        }

        /* Destination-side checks are audited against the new dentry. */
        ad.u.dentry = new_dentry;
        dst_dir_av = DIR__ADD_NAME | DIR__SEARCH;
        if (d_is_positive(new_dentry))
                dst_dir_av |= DIR__REMOVE_NAME;
        rc = avc_has_perm(&selinux_state,
                          sid, dst_dir_sec->sid, SECCLASS_DIR,
                          dst_dir_av, &ad);
        if (rc)
                return rc;

        /* Nothing is being replaced: no further check is needed. */
        if (!d_is_positive(new_dentry))
                return 0;

        dst_sec = backing_inode_security(new_dentry);
        return avc_has_perm(&selinux_state,
                            sid, dst_sec->sid, dst_sec->sclass,
                            d_is_dir(new_dentry) ? DIR__RMDIR : FILE__UNLINK,
                            &ad);
}

/*
 * Check whether @cred holds filesystem-class permissions @perms on the
 * superblock @sb; @ad is passed through as audit data.
 */
static int superblock_has_perm(const struct cred *cred,
                               struct super_block *sb,
                               u32 perms,
                               struct common_audit_data *ad)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        u32 subj_sid = cred_sid(cred);

        return avc_has_perm(&selinux_state, subj_sid, sbsec->sid,
                            SECCLASS_FILESYSTEM, perms, ad);
}

/*
 * Convert a Linux mode and MAY_* permission mask to an access vector.
 *
 * Directories map exec/write/read to search/write/read.  For everything
 * else exec and read map directly, and MAY_APPEND takes precedence over
 * MAY_WRITE so that only one of append/write is ever requested.
 */
static inline u32 file_mask_to_av(int mode, int mask)
{
        u32 av = 0;

        if (S_ISDIR(mode)) {
                if (mask & MAY_EXEC)
                        av |= DIR__SEARCH;
                if (mask & MAY_WRITE)
                        av |= DIR__WRITE;
                if (mask & MAY_READ)
                        av |= DIR__READ;
                return av;
        }

        if (mask & MAY_EXEC)
                av |= FILE__EXECUTE;
        if (mask & MAY_READ)
                av |= FILE__READ;

        if (mask & MAY_APPEND)
                av |= FILE__APPEND;
        else if (mask & MAY_WRITE)
                av |= FILE__WRITE;

        return av;
}

/*
 * Derive an access vector from the mode and flags of an open file.
 *
 * Read mode yields FILE__READ; write mode yields FILE__APPEND when the
 * file has O_APPEND set and FILE__WRITE otherwise.
 */
static inline u32 file_to_av(struct file *file)
{
        u32 av = 0;

        if (file->f_mode & FMODE_READ)
                av |= FILE__READ;

        if (file->f_mode & FMODE_WRITE)
                av |= (file->f_flags & O_APPEND) ? FILE__APPEND : FILE__WRITE;

        /*
         * Special file opened with flags 3 for ioctl-only use: neither
         * read nor write was requested, so ask for ioctl instead.
         */
        return av ? av : FILE__IOCTL;
}

/*
 * Convert a file to an access vector and include the correct
 * open permission.
 */
static inline u32 open_file_to_av(struct file *file)
{
        u32 av = file_to_av(file);
        struct inode *inode = file_inode(file);

        if (selinux_policycap_openperm() &&
            inode->i_sb->s_magic != SOCKFS_MAGIC)
                av |= FILE__OPEN;

        return av;
}

/* Hook functions begin here. */

static int selinux_binder_set_context_mgr(const struct cred *mgr)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), cred_sid(mgr), SECCLASS_BINDER,
                            BINDER__SET_CONTEXT_MGR, NULL);
}

/*
 * Check whether a binder transaction from @from to @to is allowed.
 *
 * When the current task's SID differs from @from's, the impersonate
 * permission is checked first; the call permission between @from and
 * @to is then checked in every case.
 */
static int selinux_binder_transaction(const struct cred *from,
                                      const struct cred *to)
{
        u32 caller_sid = current_sid();
        u32 src_sid = cred_sid(from);
        u32 dst_sid = cred_sid(to);
        int rc = 0;

        if (caller_sid != src_sid)
                rc = avc_has_perm(&selinux_state,
                                  caller_sid, src_sid, SECCLASS_BINDER,
                                  BINDER__IMPERSONATE, NULL);
        if (rc)
                return rc;

        return avc_has_perm(&selinux_state, src_sid, dst_sid,
                            SECCLASS_BINDER, BINDER__CALL, NULL);
}

static int selinux_binder_transfer_binder(const struct cred *from,
                                          const struct cred *to)
{
        return avc_has_perm(&selinux_state,
                            cred_sid(from), cred_sid(to),
                            SECCLASS_BINDER, BINDER__TRANSFER,
                            NULL);
}

/*
 * Check whether @file may be handed to the credentials @to over binder.
 *
 * The receiver needs FD__USE on the descriptor unless it already carries
 * the receiver's SID, must pass bpf_fd_pass() when CONFIG_BPF_SYSCALL is
 * set, and needs the permissions implied by the file's open mode on the
 * backing inode.  Private inodes skip that last check.  @from is not
 * consulted.
 */
static int selinux_binder_transfer_file(const struct cred *from,
                                        const struct cred *to,
                                        struct file *file)
{
        struct file_security_struct *fd_sec = selinux_file(file);
        struct dentry *dentry = file->f_path.dentry;
        struct inode_security_struct *obj_sec;
        struct common_audit_data ad;
        u32 recv_sid = cred_sid(to);
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = file->f_path;

        if (recv_sid != fd_sec->sid) {
                rc = avc_has_perm(&selinux_state,
                                  recv_sid, fd_sec->sid,
                                  SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, recv_sid);
        if (rc)
                return rc;
#endif

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        obj_sec = backing_inode_security(dentry);
        return avc_has_perm(&selinux_state,
                            recv_sid, obj_sec->sid, obj_sec->sclass,
                            file_to_av(file), &ad);
}

/*
 * Check whether the current task may access @child via ptrace.
 *
 * A request with PTRACE_MODE_READ set is checked as a file read against
 * the child's SID; any other request needs the process ptrace permission.
 */
static int selinux_ptrace_access_check(struct task_struct *child,
                                     unsigned int mode)
{
        u32 tracer_sid = current_sid();
        u32 child_sid = task_sid(child);

        if (!(mode & PTRACE_MODE_READ))
                return avc_has_perm(&selinux_state,
                                    tracer_sid, child_sid,
                                    SECCLASS_PROCESS, PROCESS__PTRACE, NULL);

        return avc_has_perm(&selinux_state,
                            tracer_sid, child_sid,
                            SECCLASS_FILE, FILE__READ, NULL);
}

static int selinux_ptrace_traceme(struct task_struct *parent)
{
        return avc_has_perm(&selinux_state,
                            task_sid(parent), current_sid(), SECCLASS_PROCESS,
                            PROCESS__PTRACE, NULL);
}

/*
 * Check whether the current task may read the capability sets of
 * @target.  The capability-set pointers themselves are not inspected.
 */
static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
                          kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        u32 caller_sid = current_sid();
        u32 target_sid = task_sid(target);

        return avc_has_perm(&selinux_state, caller_sid, target_sid,
                            SECCLASS_PROCESS, PROCESS__GETCAP, NULL);
}

static int selinux_capset(struct cred *new, const struct cred *old,
                          const kernel_cap_t *effective,
                          const kernel_cap_t *inheritable,
                          const kernel_cap_t *permitted)
{
        return avc_has_perm(&selinux_state,
                            cred_sid(old), cred_sid(new), SECCLASS_PROCESS,
                            PROCESS__SETCAP, NULL);
}

/*
 * (This comment used to live with the selinux_task_setuid hook,
 * which was removed).
 *
 * Since setuid only affects the current process, and since the SELinux
 * controls are not based on the Linux identity attributes, SELinux does not
 * need to control this operation.  However, SELinux does control the use of
 * the CAP_SETUID and CAP_SETGID capabilities using the capable hook.
 */

/*
 * Capability hook: delegate to cred_has_capability(), telling it whether
 * @ns is the initial user namespace.
 */
static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
                           int cap, unsigned int opts)
{
        const int in_init_ns = (ns == &init_user_ns);

        return cred_has_capability(cred, cap, opts, in_init_ns);
}

/*
 * Check a quotactl command against the superblock @sb.
 *
 * Commands that change quota state need FILESYSTEM__QUOTAMOD, commands
 * that only query it need FILESYSTEM__QUOTAGET.  A NULL @sb and unknown
 * commands are allowed here.  @type and @id are not inspected.
 */
static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
{
        const struct cred *cred = current_cred();

        if (!sb)
                return 0;

        switch (cmds) {
        case Q_SYNC:
        case Q_QUOTAON:
        case Q_QUOTAOFF:
        case Q_SETINFO:
        case Q_SETQUOTA:
        case Q_XQUOTAOFF:
        case Q_XQUOTAON:
        case Q_XSETQLIM:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD,
                                           NULL);
        case Q_GETFMT:
        case Q_GETINFO:
        case Q_GETQUOTA:
        case Q_XGETQUOTA:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XGETNEXTQUOTA:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET,
                                           NULL);
        default:
                /* let the kernel handle invalid cmds */
                return 0;
        }
}

/* Check that the current task holds FILE__QUOTAON on @dentry. */
static int selinux_quota_on(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__QUOTAON);
}

/*
 * Check a syslog action of the given @type for the current task against
 * the kernel initial SID.  Reading actions need syslog_read, console
 * control actions need syslog_console, everything else needs syslog_mod.
 */
static int selinux_syslog(int type)
{
        switch (type) {
        case SYSLOG_ACTION_READ_ALL:        /* Read last kernel messages */
        case SYSLOG_ACTION_SIZE_BUFFER:        /* Return size of the log buffer */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ,
                                    NULL);
        case SYSLOG_ACTION_CONSOLE_OFF:        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:        /* Enable logging to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:        /* Set console message level */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE,
                                    NULL);
        default:
                /* All other syslog types */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD,
                                    NULL);
        }
}

/*
 * Report whether the current task holds CAP_SYS_ADMIN (checked against
 * the init-namespace capability class) for the purpose of allocating a
 * new virtual mapping.  Returns 1 when the capability is granted and 0
 * otherwise; @mm and @pages are not inspected here.
 *
 * NOTE(review): presumably the caller feeds this flag into the actual
 * memory accounting decision -- confirm against the hook's caller.
 *
 * Do not audit the selinux permission check, as this is applied to all
 * processes that allocate mappings.
 */
static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
{
        int denied;

        denied = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
                                     CAP_OPT_NOAUDIT, true);

        return denied == 0;
}

/* binprm security operations */

/*
 * Return the SID of the task currently ptracing us, or 0 when
 * ptrace_parent() reports no tracer.  The lookup runs inside an RCU
 * read-side critical section.
 */
static u32 ptrace_parent_sid(void)
{
        struct task_struct *tracer;
        u32 sid;

        rcu_read_lock();
        tracer = ptrace_parent(current);
        sid = tracer ? task_sid(tracer) : 0;
        rcu_read_unlock();

        return sid;
}

/*
 * Decide whether a SID transition from @old_tsec to @new_tsec is
 * acceptable for an exec under no_new_privs and/or from a nosuid mount.
 *
 * Returns 0 when neither restriction applies, when the SID does not
 * change, or when the transition is permitted by one of the two
 * mechanisms below.  Otherwise returns -EPERM if NNP is in effect, or
 * -EACCES for nosuid alone.
 */
static int check_nnp_nosuid(const struct linux_binprm *bprm,
                            const struct task_security_struct *old_tsec,
                            const struct task_security_struct *new_tsec)
{
        const int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
        const int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);

        if (!(nnp || nosuid))
                return 0; /* neither NNP nor nosuid */

        if (old_tsec->sid == new_tsec->sid)
                return 0; /* No change in credentials */

        /*
         * If the policy enables the nnp_nosuid_transition policy capability,
         * then we permit transitions under NNP or nosuid if the
         * policy allows the corresponding permission between
         * the old and new contexts.
         */
        if (selinux_policycap_nnp_nosuid_transition()) {
                u32 av = 0;

                if (nnp)
                        av |= PROCESS2__NNP_TRANSITION;
                if (nosuid)
                        av |= PROCESS2__NOSUID_TRANSITION;

                if (avc_has_perm(&selinux_state,
                                 old_tsec->sid, new_tsec->sid,
                                 SECCLASS_PROCESS2, av, NULL) == 0)
                        return 0;
        }

        /*
         * We also permit NNP or nosuid transitions to bounded SIDs,
         * i.e. SIDs that are guaranteed to only be allowed a subset
         * of the permissions of the current SID.
         */
        if (security_bounded_transition(&selinux_state, old_tsec->sid,
                                        new_tsec->sid) == 0)
                return 0;

        /*
         * On failure, preserve the errno values for NNP vs nosuid.
         * NNP:  Operation not permitted for caller.
         * nosuid:  Permission denied to file.
         */
        return nnp ? -EPERM : -EACCES;
}

/*
 * Compute the SELinux state of the credentials being prepared in
 * bprm->cred and check that the exec is permitted.
 *
 * The new SID defaults to the current task SID, is replaced by the
 * task's exec_sid when one is set, and otherwise by the result of
 * security_transition_sid() on the program's inode SID.  A transition
 * rejected by check_nnp_nosuid() fails the exec in the explicit
 * exec_sid case and silently falls back to the old SID in the default
 * transition case.
 *
 * Returns 0 on success or a negative errno from the first failing check.
 */
static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        const struct task_security_struct *old_tsec;
        struct task_security_struct *new_tsec;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct inode *inode = file_inode(bprm->file);
        int rc;

        /* SELinux context only depends on initial program or script and not
         * the script interpreter */

        old_tsec = selinux_cred(current_cred());
        new_tsec = selinux_cred(bprm->cred);
        isec = inode_security(inode);

        /* Default to the current task SID. */
        new_tsec->sid = old_tsec->sid;
        new_tsec->osid = old_tsec->sid;

        /* Reset fs, key, and sock SIDs on execve. */
        new_tsec->create_sid = 0;
        new_tsec->keycreate_sid = 0;
        new_tsec->sockcreate_sid = 0;

        if (old_tsec->exec_sid) {
                /* An explicit exec SID was requested; use it once. */
                new_tsec->sid = old_tsec->exec_sid;
                /* Reset exec SID on execve. */
                new_tsec->exec_sid = 0;

                /* Fail on NNP or nosuid if not an allowed transition. */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        return rc;
        } else {
                /* Check for a default transition on this program. */
                rc = security_transition_sid(&selinux_state, old_tsec->sid,
                                             isec->sid, SECCLASS_PROCESS, NULL,
                                             &new_tsec->sid);
                if (rc)
                        return rc;

                /*
                 * Fallback to old SID on NNP or nosuid if not an allowed
                 * transition.
                 */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        new_tsec->sid = old_tsec->sid;
        }

        /* Audit records for the checks below refer to the program file. */
        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = bprm->file;

        if (new_tsec->sid == old_tsec->sid) {
                /* No SID change: the task must be able to execute in place. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
                if (rc)
                        return rc;
        } else {
                /* Check permissions for the transition. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
                if (rc)
                        return rc;

                /* The file must be a valid entrypoint for the new SID. */
                rc = avc_has_perm(&selinux_state,
                                  new_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
                if (rc)
                        return rc;

                /* Check for shared state */
                if (bprm->unsafe & LSM_UNSAFE_SHARE) {
                        rc = avc_has_perm(&selinux_state,
                                          old_tsec->sid, new_tsec->sid,
                                          SECCLASS_PROCESS, PROCESS__SHARE,
                                          NULL);
                        /* Any denial here is reported to the caller as -EPERM. */
                        if (rc)
                                return -EPERM;
                }

                /* Make sure that anyone attempting to ptrace over a task that
                 * changes its SID has the appropriate permission */
                if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
                        u32 ptsid = ptrace_parent_sid();
                        /* A zero SID skips the check (presumably: no tracer). */
                        if (ptsid != 0) {
                                rc = avc_has_perm(&selinux_state,
                                                  ptsid, new_tsec->sid,
                                                  SECCLASS_PROCESS,
                                                  PROCESS__PTRACE, NULL);
                                if (rc)
                                        return -EPERM;
                        }
                }

                /* Clear any possibly unsafe personality bits on exec: */
                bprm->per_clear |= PER_CLEAR_ON_SETID;

                /* Enable secure mode for SIDs transitions unless
                   the noatsecure permission is granted between
                   the two SIDs, i.e. ahp returns 0. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__NOATSECURE,
                                  NULL);
                /* A denial (non-zero rc) sets secureexec; it is never cleared here. */
                bprm->secureexec |= !!rc;
        }

        return 0;
}

/*
 * iterate_fd() callback: @p is handed to file_has_perm() as the credential
 * argument.  Returns fd + 1 when file_has_perm() reports a non-zero result
 * for @file (so that descriptor 0 still yields a non-zero value), and 0
 * otherwise.
 */
static int match_file(const void *p, struct file *file, unsigned fd)
{
        if (file_has_perm(p, file, file_to_av(file)))
                return fd + 1;

        return 0;
}

/*
 * Derived from fs/exec.c:flush_old_files.
 *
 * Revalidate the controlling tty and every open descriptor in @files
 * against @cred.  The controlling tty is dropped via no_tty() if access
 * is denied, and each denied descriptor is replaced with an open of
 * selinux_null (or with NULL if that open fails).
 */
static inline void flush_unauthorized_files(const struct cred *cred,
                                            struct files_struct *files)
{
        struct tty_struct *tty = get_current_tty();
        struct file *devnull = NULL;
        int revoke_tty = 0;
        unsigned next;

        if (tty) {
                spin_lock(&tty->files_lock);
                if (!list_empty(&tty->tty_files)) {
                        struct tty_file_private *first_priv;
                        struct file *tty_file;

                        /*
                         * Revalidate access to the controlling tty.  The
                         * check goes through file_path_has_perm() on the
                         * tty path rather than file_has_perm(): this open
                         * file may belong to another process, and only the
                         * inode-based check is of interest here.
                         */
                        first_priv = list_first_entry(&tty->tty_files,
                                                struct tty_file_private, list);
                        tty_file = first_priv->file;
                        if (file_path_has_perm(cred, tty_file,
                                               FILE__READ | FILE__WRITE))
                                revoke_tty = 1;
                }
                spin_unlock(&tty->files_lock);
                tty_kref_put(tty);
        }

        /* Drop the controlling tty only after the tty lock is released. */
        if (revoke_tty)
                no_tty();

        /* Find the first inherited descriptor that fails revalidation. */
        next = iterate_fd(files, 0, match_file, cred);
        if (!next)
                return; /* every descriptor is still authorized */

        devnull = dentry_open(&selinux_null, O_RDWR, cred);
        if (IS_ERR(devnull))
                devnull = NULL;

        /*
         * match_file() returns fd + 1, so "next - 1" is the offending
         * descriptor and "next" is where the following scan resumes.
         */
        for (; next != 0; next = iterate_fd(files, next, match_file, cred))
                replace_fd(next - 1, devnull, 0);

        if (devnull)
                fput(devnull);
}

/*
 * Prepare a process for imminent new credential changes due to exec.
 *
 * Nothing is done unless the SID in bprm->cred differs from its osid.
 */
static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
{
        struct task_security_struct *new_tsec = selinux_cred(bprm->cred);
        int idx;

        if (new_tsec->osid == new_tsec->sid)
                return;

        /* Close files for which the new task SID is not authorized. */
        flush_unauthorized_files(bprm->cred, current->files);

        /* Always clear parent death signal on SID transitions. */
        current->pdeath_signal = 0;

        /*
         * Check whether the new SID may inherit resource limits from the
         * old SID.  If it may, there is nothing more to do.  If not, every
         * soft limit is reset to the lower of the current task's hard limit
         * and the init task's soft limit.
         *
         * Note that the setting of hard limits (even to lower them) can be
         * controlled by the setrlimit check.  The init task's soft limit is
         * part of the computation to avoid resetting soft limits higher
         * than the default soft limit where the default is lower than the
         * hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK.
         */
        if (!avc_has_perm(&selinux_state,
                          new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS,
                          PROCESS__RLIMITINH, NULL))
                return;

        /* protect against do_prlimit() */
        task_lock(current);
        for (idx = 0; idx < RLIM_NLIMITS; idx++) {
                struct rlimit *mine = &current->signal->rlim[idx];
                struct rlimit *dflt = &init_task.signal->rlim[idx];

                mine->rlim_cur = min(mine->rlim_max, dflt->rlim_cur);
        }
        task_unlock(current);

        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                update_rlimit_cpu(current, rlimit(RLIMIT_CPU));
}

/*
 * Clean up the process immediately after the installation of new
 * credentials due to exec.
 *
 * Nothing is done unless the current credentials record a SID change
 * (sid != osid).
 */
static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        const u32 prev_sid = tsec->osid;
        const u32 cur_sid = tsec->sid;

        if (prev_sid == cur_sid)
                return;

        /*
         * Check whether the new SID may inherit signal state from the old
         * SID.  If not, clear itimers to avoid subsequent signal generation,
         * then flush and unblock signals.
         *
         * This must occur _after_ the task SID has been updated so that any
         * kill done after the flush will be checked against the new SID.
         */
        if (avc_has_perm(&selinux_state,
                         prev_sid, cur_sid, SECCLASS_PROCESS,
                         PROCESS__SIGINH, NULL)) {
                clear_itimer();

                spin_lock_irq(&current->sighand->siglock);
                /* Pending fatal signals are left untouched. */
                if (!fatal_signal_pending(current)) {
                        flush_sigqueue(&current->pending);
                        flush_sigqueue(&current->signal->shared_pending);
                        flush_signal_handlers(current, 1);
                        sigemptyset(&current->blocked);
                        recalc_sigpending();
                }
                spin_unlock_irq(&current->sighand->siglock);
        }

        /*
         * Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID.
         */
        read_lock(&tasklist_lock);
        __wake_up_parent(current, current->real_parent);
        read_unlock(&tasklist_lock);
}

/* superblock security operations */

/*
 * Allocate a zeroed superblock security blob, give it its initial SIDs
 * and attach it to @sb->s_security.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_sb_alloc_security(struct super_block *sb)
{
        struct superblock_security_struct *blob;

        blob = kzalloc(sizeof(*blob), GFP_KERNEL);
        if (!blob)
                return -ENOMEM;

        /* Back-pointer and initial labels. */
        blob->sb = sb;
        blob->sid = SECINITSID_UNLABELED;
        blob->def_sid = SECINITSID_FILE;
        blob->mntpoint_sid = SECINITSID_UNLABELED;

        /* Locks and the per-superblock inode security list. */
        mutex_init(&blob->lock);
        spin_lock_init(&blob->isec_lock);
        INIT_LIST_HEAD(&blob->isec_head);

        sb->s_security = blob;
        return 0;
}

/* Release @sb's security state by delegating to superblock_free_security(). */
static void selinux_sb_free_security(struct super_block *sb)
{
        superblock_free_security(sb);
}

/*
 * Return the length of the first option in @s: the number of characters
 * before the first comma that is not inside a double-quoted section, or
 * the full string length if there is no such comma.  Each '"' toggles the
 * quoted state and is itself counted.
 */
static inline int opt_len(const char *s)
{
        bool quoted = false;
        int n = 0;

        while (s[n] != '\0') {
                if (s[n] == '"')
                        quoted = !quoted;
                else if (s[n] == ',' && !quoted)
                        break;
                n++;
        }
        return n;
}

/*
 * Split the comma-separated @options string into options recognized by
 * match_opt_prefix() and everything else.
 *
 * Recognized options are passed to selinux_add_opt() (with any '"'
 * characters removed from their argument) to be collected in *@mnt_opts.
 * All other options are compacted, in place, to the front of @options,
 * which is NUL-terminated again on success.
 *
 * Returns 0 on success.  On failure returns -ENOMEM or the error from
 * selinux_add_opt(), after freeing and clearing *@mnt_opts.
 */
static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        char *from = options;   /* start of the option being examined */
        char *to = options;     /* where the next kept option is written */
        bool first = true;      /* no option has been kept yet */
        int rc;

        while (1) {
                int len = opt_len(from);
                int token;
                char *arg = NULL;

                token = match_opt_prefix(from, len, &arg);

                if (token != Opt_error) {
                        char *p, *q;

                        /* strip quotes */
                        if (arg) {
                                /* Copy the argument over itself, skipping '"'. */
                                for (p = q = arg; p < from + len; p++) {
                                        char c = *p;
                                        if (c != '"')
                                                *q++ = c;
                                }
                                /* Private NUL-terminated copy of the stripped text. */
                                arg = kmemdup_nul(arg, q - arg, GFP_KERNEL);
                                if (!arg) {
                                        rc = -ENOMEM;
                                        goto free_opt;
                                }
                        }
                        /*
                         * On success arg is not freed here -- presumably
                         * selinux_add_opt() keeps it; confirm against its
                         * definition.
                         */
                        rc = selinux_add_opt(token, arg, mnt_opts);
                        if (unlikely(rc)) {
                                kfree(arg);
                                goto free_opt;
                        }
                } else {
                        /*
                         * Not one of ours: keep it.  Every kept option after
                         * the first is moved together with the comma that
                         * precedes it in the input.
                         */
                        if (!first) {        // copy with preceding comma
                                from--;
                                len++;
                        }
                        if (to != from)
                                memmove(to, from, len);
                        to += len;
                        first = false;
                }
                /* from[len] is either the separating comma or the final NUL. */
                if (!from[len])
                        break;
                from += len + 1;
        }
        *to = '\0';
        return 0;

free_opt:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Check the SELinux mount options supplied for a remount of @sb.
 *
 * Returns 0 when the superblock has not been initialized
 * (SE_SBINITIALIZED clear), when no options were given, or when none of
 * the supplied contexts is flagged by bad_option().  Returns the
 * parse_sid() error if a context cannot be parsed, or -EINVAL (after a
 * warning) when bad_option() rejects one.
 */
static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        struct selinux_mnt_opts *opts = mnt_opts;
        u32 requested;
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED) || !opts)
                return 0;

        /* fscontext= is compared against the superblock SID. */
        if (opts->fscontext) {
                rc = parse_sid(sb, opts->fscontext, &requested);
                if (rc)
                        return rc;
                if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, requested))
                        goto reject;
        }

        /* context= is compared against the mountpoint SID. */
        if (opts->context) {
                rc = parse_sid(sb, opts->context, &requested);
                if (rc)
                        return rc;
                if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                               requested))
                        goto reject;
        }

        /* rootcontext= is compared against the root inode's SID. */
        if (opts->rootcontext) {
                struct inode_security_struct *root_isec =
                        backing_inode_security(sb->s_root);

                rc = parse_sid(sb, opts->rootcontext, &requested);
                if (rc)
                        return rc;
                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               requested))
                        goto reject;
        }

        /* defcontext= is compared against the default file SID. */
        if (opts->defcontext) {
                rc = parse_sid(sb, opts->defcontext, &requested);
                if (rc)
                        return rc;
                if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                               requested))
                        goto reject;
        }

        return 0;

reject:
        pr_warn("SELinux: unable to change security options "
               "during remount (dev %s, type=%s)\n", sb->s_id,
               sb->s_type->name);
        return -EINVAL;
}

/*
 * Check FILESYSTEM__MOUNT on @sb for the current credentials; the audit
 * data refers to the superblock's root dentry.
 */
static int selinux_sb_kern_mount(struct super_block *sb)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = sb->s_root;

        return superblock_has_perm(current_cred(), sb, FILESYSTEM__MOUNT, &ad);
}

/*
 * Check FILESYSTEM__GETATTR on the superblock that @dentry belongs to,
 * for the current credentials; the audit data refers to that superblock's
 * root dentry.
 */
static int selinux_sb_statfs(struct dentry *dentry)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry->d_sb->s_root;

        return superblock_has_perm(current_cred(), dentry->d_sb,
                                   FILESYSTEM__GETATTR, &ad);
}

/*
 * Permission check for a mount request on @path.
 *
 * A remount (MS_REMOUNT set in @flags) is checked as FILESYSTEM__REMOUNT
 * on the existing superblock; anything else is checked as FILE__MOUNTON
 * on the mount point.  @dev_name, @type and @data are not consulted.
 */
static int selinux_mount(const char *dev_name,
                         const struct path *path,
                         const char *type,
                         unsigned long flags,
                         void *data)
{
        const struct cred *cred = current_cred();

        if (!(flags & MS_REMOUNT))
                return path_has_perm(cred, path, FILE__MOUNTON);

        return superblock_has_perm(cred, path->dentry->d_sb,
                                   FILESYSTEM__REMOUNT, NULL);
}

/*
 * Check FILE__MOUNTON on the destination @to_path for the current
 * credentials.  @from_path is not consulted.
 */
static int selinux_move_mount(const struct path *from_path,
                              const struct path *to_path)
{
        return path_has_perm(current_cred(), to_path, FILE__MOUNTON);
}

/*
 * Check FILESYSTEM__UNMOUNT on the superblock behind @mnt for the current
 * credentials.  @flags is not consulted.
 */
static int selinux_umount(struct vfsmount *mnt, int flags)
{
        return superblock_has_perm(current_cred(), mnt->mnt_sb,
                                   FILESYSTEM__UNMOUNT, NULL);
}

/*
 * Give @fc its own copy of the SELinux mount options held by @src_fc.
 *
 * Returns 0 on success (including when @src_fc has no options) or -ENOMEM.
 * On a mid-way failure the partially filled structure stays attached to
 * @fc->security; it is not freed here.
 */
static int selinux_fs_context_dup(struct fs_context *fc,
                                  struct fs_context *src_fc)
{
        const struct selinux_mnt_opts *src = src_fc->security;
        struct selinux_mnt_opts *dst;

        if (!src)
                return 0;

        dst = kzalloc(sizeof(*dst), GFP_KERNEL);
        fc->security = dst;
        if (!dst)
                return -ENOMEM;

        /* Duplicate each context string that is present in the source. */
        if (src->fscontext) {
                dst->fscontext = kstrdup(src->fscontext, GFP_KERNEL);
                if (!dst->fscontext)
                        return -ENOMEM;
        }

        if (src->context) {
                dst->context = kstrdup(src->context, GFP_KERNEL);
                if (!dst->context)
                        return -ENOMEM;
        }

        if (src->rootcontext) {
                dst->rootcontext = kstrdup(src->rootcontext, GFP_KERNEL);
                if (!dst->rootcontext)
                        return -ENOMEM;
        }

        if (src->defcontext) {
                dst->defcontext = kstrdup(src->defcontext, GFP_KERNEL);
                if (!dst->defcontext)
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Mount parameters handled by SELinux, as passed to fs_parse() in
 * selinux_fs_context_parse_param(): four string-valued context options
 * and the seclabel flag.  The empty entry terminates the table.
 */
static const struct fs_parameter_spec selinux_fs_parameters[] = {
        fsparam_string(CONTEXT_STR,        Opt_context),
        fsparam_string(DEFCONTEXT_STR,        Opt_defcontext),
        fsparam_string(FSCONTEXT_STR,        Opt_fscontext),
        fsparam_string(ROOTCONTEXT_STR,        Opt_rootcontext),
        fsparam_flag  (SECLABEL_STR,        Opt_seclabel),
        {}
};

/*
 * Parse one mount parameter against selinux_fs_parameters and, if it is
 * recognized, record it in @fc->security via selinux_add_opt().
 *
 * Returns the negative value from fs_parse() when the parameter is not
 * accepted, the error from selinux_add_opt(), or 0 on success.  On success
 * @param->string is cleared -- presumably because selinux_add_opt() now
 * holds the string; confirm against its definition.
 */
static int selinux_fs_context_parse_param(struct fs_context *fc,
                                          struct fs_parameter *param)
{
        struct fs_parse_result result;
        int opt;
        int rc;

        opt = fs_parse(fc, selinux_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        rc = selinux_add_opt(opt, param->string, &fc->security);
        if (rc)
                return rc;

        param->string = NULL;
        return 0;
}

/* inode security operations */

/*
 * Initialize the SELinux security blob of a newly allocated @inode:
 * unlabeled SID, SECCLASS_FILE class, LABEL_INVALID state, and the
 * current task's SID recorded as task_sid.  Always returns 0.
 */
static int selinux_inode_alloc_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        spin_lock_init(&isec->lock);
        INIT_LIST_HEAD(&isec->list);

        isec->inode = inode;
        isec->task_sid = current_sid();
        isec->sid = SECINITSID_UNLABELED;
        isec->sclass = SECCLASS_FILE;
        isec->initialized = LABEL_INVALID;

        return 0;
}

/* Release @inode's security state by delegating to inode_free_security(). */
static void selinux_inode_free_security(struct inode *inode)
{
        inode_free_security(inode);
}

/*
 * Determine the label a new object named @name with file mode @mode would
 * get under @dentry's parent directory, using the current credentials,
 * and return it as a context string in *@ctx / *@ctxlen.
 *
 * Returns 0 on success, or the error from selinux_determine_inode_label()
 * or security_sid_to_context().
 */
static int selinux_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name, void **ctx,
                                        u32 *ctxlen)
{
        u32 label_sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(current_cred()),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &label_sid);
        if (err)
                return err;

        return security_sid_to_context(&selinux_state, label_sid,
                                       (char **)ctx, ctxlen);
}

/*
 * Determine, from the @old credentials, the label a new object named
 * @name with file mode @mode would get under @dentry's parent directory,
 * and store it as the create_sid of the @new credentials.
 *
 * Returns 0 on success or the error from selinux_determine_inode_label().
 */
static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
                                          struct qstr *name,
                                          const struct cred *old,
                                          struct cred *new)
{
        struct task_security_struct *new_tsec;
        u32 label_sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(old),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &label_sid);
        if (err)
                return err;

        new_tsec = selinux_cred(new);
        new_tsec->create_sid = label_sid;

        return 0;
}

/*
 * Compute the label for a new @inode being created in @dir and, where
 * possible, hand back the security xattr name and value for it.
 *
 * If the superblock is already initialized (SE_SBINITIALIZED), the inode's
 * security blob is labelled immediately.  When SELinux is not initialized
 * or the superblock lacks SBLABEL_MNT, -EOPNOTSUPP is returned after that
 * step.  Otherwise *@name (if @name is non-NULL) is set to the xattr
 * suffix, and *@value / *@len (if both pointers are non-NULL) receive the
 * context string and its length.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const struct qstr *qstr,
                                       const char **name,
                                       void **value, size_t *len)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        u32 newsid = tsec->create_sid;
        u32 ctx_len;
        char *ctx;
        int rc;

        rc = selinux_determine_inode_label(tsec, dir, qstr,
                                           inode_mode_to_security_class(inode->i_mode),
                                           &newsid);
        if (rc)
                return rc;

        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
                struct inode_security_struct *isec = selinux_inode(inode);

                isec->sclass = inode_mode_to_security_class(inode->i_mode);
                isec->sid = newsid;
                isec->initialized = LABEL_INITIALIZED;
        }

        /* An xattr is only produced with a loaded policy and SBLABEL_MNT set. */
        if (!(selinux_initialized(&selinux_state) &&
              (sbsec->flags & SBLABEL_MNT)))
                return -EOPNOTSUPP;

        if (name)
                *name = XATTR_SELINUX_SUFFIX;

        /* The caller may not want the value at all. */
        if (!value || !len)
                return 0;

        rc = security_sid_to_context_force(&selinux_state, newsid,
                                           &ctx, &ctx_len);
        if (rc)
                return rc;

        *value = ctx;
        *len = ctx_len;
        return 0;
}

/*
 * Creation check for @dentry in @dir: delegates to may_create() with
 * SECCLASS_FILE.  @mode is not consulted.
 */
static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        return may_create(dir, dentry, SECCLASS_FILE);
}

/*
 * Hard-link check: delegates to may_link() with MAY_LINK on @old_dentry
 * within @dir.  @new_dentry is not consulted.
 */
static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
        return may_link(dir, old_dentry, MAY_LINK);
}

/* Unlink check: delegates to may_link() with MAY_UNLINK on @dentry in @dir. */
static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_UNLINK);
}

/*
 * Symlink creation check: delegates to may_create() with
 * SECCLASS_LNK_FILE.  @name is not consulted.
 */
static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name)
{
        return may_create(dir, dentry, SECCLASS_LNK_FILE);
}

/*
 * Directory creation check: delegates to may_create() with SECCLASS_DIR.
 * @mask is not consulted.
 */
static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
{
        return may_create(dir, dentry, SECCLASS_DIR);
}

/* Directory removal check: delegates to may_link() with MAY_RMDIR. */
static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_RMDIR);
}

/*
 * Node creation check: delegates to may_create() with the security class
 * derived from @mode by inode_mode_to_security_class().  @dev is not
 * consulted.
 */
static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        return may_create(dir, dentry, inode_mode_to_security_class(mode));
}

/*
 * Rename check: forwards all four arguments unchanged to may_rename().
 * Note that @old_inode and @new_inode are passed in the positions of the
 * old and new parent directories -- presumably they are the directories,
 * not the objects being renamed; confirm against may_rename().
 */
static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
                                struct inode *new_inode, struct dentry *new_dentry)
{
        return may_rename(old_inode, old_dentry, new_inode, new_dentry);
}

/* Check FILE__READ on @dentry for the current credentials. */
static int selinux_inode_readlink(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__READ);
}

/*
 * Check FILE__READ on the link @inode for the current credentials.
 *
 * @rcu is forwarded to inode_security_rcu() and selects MAY_NOT_BLOCK for
 * the AVC check.  Returns the error encoded in inode_security_rcu()'s
 * result if it fails, otherwise the result of avc_has_perm_flags().
 */
static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
                                     bool rcu)
{
        const struct cred *cred = current_cred();
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        u32 subj_sid;
        int avc_flags;

        validate_creds(cred);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        subj_sid = cred_sid(cred);

        isec = inode_security_rcu(inode, rcu);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        if (rcu)
                avc_flags = MAY_NOT_BLOCK;
        else
                avc_flags = 0;

        return avc_has_perm_flags(&selinux_state,
                                  subj_sid, isec->sid, isec->sclass,
                                  FILE__READ, &ad, avc_flags);
}

/*
 * Emit the audit record for an inode permission decision by calling
 * slow_avc_audit() with inode-type audit data, the current SID and the
 * inode's SID and class.  Returns slow_avc_audit()'s result.
 */
static noinline int audit_inode_permission(struct inode *inode,
                                           u32 perms, u32 audited, u32 denied,
                                           int result)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_INODE;
        ad.u.inode = inode;

        return slow_avc_audit(&selinux_state,
                              current_sid(), isec->sid, isec->sclass, perms,
                              audited, denied, result, &ad);
}

/*
 * Check the access described by @mask on @inode for the current
 * credentials.
 *
 * Returns 0 when @mask carries none of MAY_READ/WRITE/EXEC/APPEND or when
 * the inode is private.  Returns -ECHILD when MAY_NOT_BLOCK was set and an
 * audit record would have to be generated.  Otherwise returns the AVC
 * decision, or the auditing error if auditing itself fails.
 */
static int selinux_inode_permission(struct inode *inode, int mask)
{
        const struct cred *cred = current_cred();
        /* Capture both flags before the mask is narrowed below. */
        const bool from_access = mask & MAY_ACCESS;
        const bool no_block = mask & MAY_NOT_BLOCK;
        struct inode_security_struct *isec;
        struct av_decision avd;
        u32 perms, sid;
        u32 audited, denied;
        int rc, audit_rc;

        mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);

        /* No permission to check.  Existence test. */
        if (!mask)
                return 0;

        validate_creds(cred);

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        perms = file_mask_to_av(inode->i_mode, mask);
        sid = cred_sid(cred);

        isec = inode_security_rcu(inode, no_block);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        rc = avc_has_perm_noaudit(&selinux_state,
                                  sid, isec->sid, isec->sclass, perms,
                                  no_block ? AVC_NONBLOCKING : 0,
                                  &avd);

        audited = avc_audit_required(perms, &avd, rc,
                                     from_access ? FILE__AUDIT_ACCESS : 0,
                                     &denied);
        if (likely(!audited))
                return rc;

        /* fall back to ref-walk if we have to generate audit */
        if (no_block)
                return -ECHILD;

        /* An auditing failure takes precedence over the AVC decision. */
        audit_rc = audit_inode_permission(inode, perms, audited, denied, rc);
        return audit_rc ? audit_rc : rc;
}

/*
 * Permission check for changing attributes of @dentry as described by
 * @iattr->ia_valid.
 *
 * Mode, ownership and explicit timestamp changes require FILE__SETATTR.
 * Anything else requires FILE__WRITE, plus FILE__OPEN for a size change
 * without ATTR_FILE on a non-sockfs inode when the openperm policy
 * capability is enabled.
 */
static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
{
        const struct cred *cred = current_cred();
        struct inode *inode = d_backing_inode(dentry);
        unsigned int valid = iattr->ia_valid;
        __u32 requested;

        /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */
        if (valid & ATTR_FORCE) {
                valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE |
                           ATTR_FORCE);
                if (!valid)
                        return 0;
        }

        if (valid & (ATTR_MODE | ATTR_UID | ATTR_GID |
                     ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
                return dentry_has_perm(cred, dentry, FILE__SETATTR);

        requested = FILE__WRITE;
        if (selinux_policycap_openperm() &&
            inode->i_sb->s_magic != SOCKFS_MAGIC &&
            (valid & ATTR_SIZE) &&
            !(valid & ATTR_FILE))
                requested |= FILE__OPEN;

        return dentry_has_perm(cred, dentry, requested);
}

/* Check FILE__GETATTR on @path for the current credentials. */
static int selinux_inode_getattr(const struct path *path)
{
        return path_has_perm(current_cred(), path, FILE__GETATTR);
}

/*
 * Report whether the current credentials hold CAP_MAC_ADMIN according to
 * both cap_capable() (in init_user_ns) and cred_has_capability().
 *
 * @audit selects CAP_OPT_NONE (true) or CAP_OPT_NOAUDIT (false) for both
 * checks.  The second check is only made when the first one succeeds.
 */
static bool has_cap_mac_admin(bool audit)
{
        const struct cred *cred = current_cred();
        unsigned int opts;

        if (audit)
                opts = CAP_OPT_NONE;
        else
                opts = CAP_OPT_NOAUDIT;

        return !cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts) &&
               !cred_has_capability(cred, CAP_MAC_ADMIN, opts, true);
}

/*
 * Permission check for setting extended attribute @name on @dentry.
 *
 * For any attribute other than XATTR_NAME_SELINUX this is
 * cap_inode_setxattr() followed by a FILE__SETATTR check.
 *
 * For the SELinux label itself the caller must own the inode (or be
 * capable), the superblock must have SBLABEL_MNT, and the policy must
 * allow relabelfrom on the old label, relabelto on the new one, the
 * transition between them, and association of the new label with the
 * filesystem.  A context that the policy does not recognize is accepted
 * only if has_cap_mac_admin() succeeds; otherwise it is audited and
 * rejected.
 *
 * Returns 0 if permitted, or a negative errno.
 */
static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        struct superblock_security_struct *sbsec;
        struct common_audit_data ad;
        u32 newsid, sid = current_sid();
        int rc = 0;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
                if (rc)
                        return rc;

                /* Not an attribute we recognize, so just check the
                   ordinary setattr permission. */
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
        }

        /* Without a loaded policy only the ownership check applies. */
        if (!selinux_initialized(&selinux_state))
                return (inode_owner_or_capable(inode) ? 0 : -EPERM);

        sbsec = inode->i_sb->s_security;
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (!inode_owner_or_capable(inode))
                return -EPERM;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        isec = backing_inode_security(dentry);
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, isec->sclass,
                          FILE__RELABELFROM, &ad);
        if (rc)
                return rc;

        rc = security_context_to_sid(&selinux_state, value, size, &newsid,
                                     GFP_KERNEL);
        if (rc == -EINVAL) {
                if (!has_cap_mac_admin(true)) {
                        struct audit_buffer *ab;
                        size_t audit_size;

                        /*
                         * We strip a nul only if it is at the end, otherwise
                         * the context contains a nul and we should audit
                         * that.  A zero-length value has no last byte to
                         * inspect, so it must not index str[size - 1].
                         */
                        if (value && size) {
                                const char *str = value;

                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                        } else {
                                audit_size = 0;
                        }
                        ab = audit_log_start(audit_context(),
                                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
                        audit_log_format(ab, "op=setxattr invalid_context=");
                        audit_log_n_untrustedstring(ab, value, audit_size);
                        audit_log_end(ab);

                        return rc;
                }
                /* Privileged caller: map the unknown context anyway. */
                rc = security_context_to_sid_force(&selinux_state, value,
                                                   size, &newsid);
        }
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state,
                          sid, newsid, isec->sclass,
                          FILE__RELABELTO, &ad);
        if (rc)
                return rc;

        rc = security_validate_transition(&selinux_state, isec->sid, newsid,
                                          sid, isec->sclass);
        if (rc)
                return rc;

        /* Finally, the new label must be allowed on this filesystem. */
        return avc_has_perm(&selinux_state,
                            newsid,
                            sbsec->sid,
                            SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE,
                            &ad);
}

/*
 * Refresh the in-core inode security label from a new SELinux xattr value.
 *
 * Only XATTR_NAME_SELINUX is handled; any other attribute name is ignored.
 * If SELinux is not initialized, or the context cannot be mapped to a SID,
 * the in-core label is left untouched (the mapping failure is logged).
 * @flags is not consulted.
 */
static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
                                        const void *value, size_t size,
                                        int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        u32 newsid;
        int rc;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                /* Not an attribute we recognize, so nothing to do. */
                return;
        }

        if (!selinux_initialized(&selinux_state)) {
                /* If we haven't even been initialized, then we can't validate
                 * against a policy, so leave the label as invalid. It may
                 * resolve to a valid label on the next revalidation try if
                 * we've since initialized.
                 */
                return;
        }

        rc = security_context_to_sid_force(&selinux_state, value, size,
                                           &newsid);
        if (rc) {
                /*
                 * Keep the message in one literal: the previous split
                 * form concatenated to "SIDfor" with no separating space.
                 */
                pr_err("SELinux:  unable to map context to SID for (%s, %lu), rc=%d\n",
                       inode->i_sb->s_id, inode->i_ino, -rc);
                return;
        }

        /* Publish class, SID and initialized state together under the lock. */
        isec = backing_inode_security(dentry);
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Check FILE__GETATTR on the dentry for the current credentials.
 * The attribute name is not consulted.
 */
static int selinux_inode_getxattr(struct dentry *dentry, const char *name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/* Check FILE__GETATTR on the dentry for the current credentials. */
static int selinux_inode_listxattr(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Decide whether an xattr may be removed.
 *
 * For the SELinux label itself: allowed (0) while SELinux is not
 * initialized, otherwise always -EACCES.  For any other attribute the
 * capability check runs first, then the ordinary setattr permission.
 */
static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (strcmp(name, XATTR_NAME_SELINUX) == 0) {
                if (!selinux_initialized(&selinux_state))
                        return 0;

                /*
                 * No one is allowed to remove a SELinux security label.
                 * You can change the label, but all data must be labeled.
                 */
                return -EACCES;
        }

        rc = cap_inode_removexattr(dentry, name);
        if (rc)
                return rc;

        /* Not our attribute: fall back to the ordinary setattr check. */
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Check permission to place a notification mark on @path.
 *
 * The base permission depends on @obj_type (mount, superblock or inode);
 * superblock marks additionally need FILESYSTEM__WATCH on the superblock.
 * Extra file permissions are added according to the event @mask.
 * Returns -EINVAL for an unknown @obj_type.
 */
static int selinux_path_notify(const struct path *path, u64 mask,
                                                unsigned int obj_type)
{
        struct common_audit_data ad;
        u32 av;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        /* Pick the permission that matches the kind of mark being set. */
        if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
                av = FILE__WATCH_MOUNT;
        } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
                /* sb watches get an additional superblock-level check */
                rc = superblock_has_perm(current_cred(), path->dentry->d_sb,
                                         FILESYSTEM__WATCH, &ad);
                if (rc)
                        return rc;
                av = FILE__WATCH_SB;
        } else if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
                av = FILE__WATCH;
        } else {
                return -EINVAL;
        }

        /* blocking watches require the file:watch_with_perm permission */
        if (mask & (ALL_FSNOTIFY_PERM_EVENTS))
                av |= FILE__WATCH_WITH_PERM;

        /* watches on read-like events need the file:watch_reads permission */
        if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
                av |= FILE__WATCH_READS;

        return path_has_perm(current_cred(), path, av);
}

/*
 * Produce the inode's security context string.
 *
 * Returns the context length on success, or a negative errno.  When
 * @alloc is true the string is handed to the caller through @buffer;
 * otherwise it is freed here and only the length is reported.
 *
 * Permission check is handled by selinux_inode_getxattr hook.
 */
static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
{
        struct inode_security_struct *isec;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        /*
         * If we're not initialized yet, then we can't validate contexts, so
         * just let vfs_getxattr fall back to using the on-disk xattr.
         */
        if (!selinux_initialized(&selinux_state) ||
            strcmp(name, XATTR_SELINUX_SUFFIX))
                return -EOPNOTSUPP;

        /*
         * With CAP_MAC_ADMIN the raw context is returned even if the
         * current policy does not define it; without it, the in-core
         * value under the current policy is used.  The capability is
         * probed with the non-auditing form because unprivileged
         * getxattr callers are common and lacking it only selects the
         * fallback, it is not a denial.
         */
        isec = inode_security(inode);
        if (has_cap_mac_admin(false))
                rc = security_sid_to_context_force(&selinux_state,
                                                   isec->sid, &ctx,
                                                   &ctx_len);
        else
                rc = security_sid_to_context(&selinux_state, isec->sid,
                                             &ctx, &ctx_len);
        if (rc)
                return rc;

        if (alloc)
                *buffer = ctx;
        else
                kfree(ctx);

        return ctx_len;
}

/*
 * Set the in-core security label of @inode from a context string.
 *
 * Returns -EOPNOTSUPP for a name other than the SELinux suffix or when
 * the superblock lacks SBLABEL_MNT, -EACCES for an empty value, the
 * error from the context lookup, or 0 on success.  @flags is unused.
 */
static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);
        struct superblock_security_struct *sbsec = inode->i_sb->s_security;
        u32 sid;
        int err;

        if (strcmp(name, XATTR_SELINUX_SUFFIX) != 0 ||
            !(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (value == NULL || size == 0)
                return -EACCES;

        err = security_context_to_sid(&selinux_state, value, size, &sid,
                                      GFP_KERNEL);
        if (err)
                return err;

        /* Update class, SID and state as one unit under the inode lock. */
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);

        return 0;
}

/*
 * Report the SELinux xattr name.
 *
 * Returns 0 while SELinux is uninitialized; otherwise the size of the
 * name including its terminating nul.  The name is copied only when a
 * buffer is supplied and is large enough.
 */
static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        const int len = sizeof(XATTR_NAME_SELINUX);

        if (!selinux_initialized(&selinux_state))
                return 0;

        if (!buffer || len > buffer_size)
                return len;

        memcpy(buffer, XATTR_NAME_SELINUX, len);
        return len;
}

/* Store the inode's current SID in @secid, without revalidating the label. */
static void selinux_inode_getsecid(struct inode *inode, u32 *secid)
{
        *secid = inode_security_novalidate(inode)->sid;
}

/*
 * Set create_sid in the credentials at *@new to the SID of @src's inode.
 *
 * If *@new is NULL a fresh set of credentials is prepared first and
 * returned through @new.  Returns 0, or -ENOMEM if that fails.
 */
static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
        struct cred *creds = *new;
        struct task_security_struct *tsec;
        u32 src_sid;

        if (!creds) {
                creds = prepare_creds();
                if (!creds)
                        return -ENOMEM;
        }

        /* Get label from overlay inode and set it in create_sid */
        selinux_inode_getsecid(d_inode(src), &src_sid);
        tsec = selinux_cred(creds);
        tsec->create_sid = src_sid;

        *new = creds;
        return 0;
}

/*
 * Filter xattrs during copy-up.
 *
 * Returns 1 (discard) for the SELinux xattr, so that the context set
 * by the copy_up hook is not overwritten by blindly copying the lower
 * xattrs up.  Every other attribute is not claimed or supported by
 * SELinux and yields -EOPNOTSUPP.
 */
static int selinux_inode_copy_up_xattr(const char *name)
{
        if (strcmp(name, XATTR_NAME_SELINUX))
                return -EOPNOTSUPP;

        return 1; /* Discard */
}

/* kernfs node operations */

/*
 * Give a new kernfs node a SELinux xattr derived from its parent.
 *
 * Reads the parent's SELinux xattr (returning 0 when it has none),
 * picks the new SID from the task's create_sid or a computed
 * transition, and stores the resulting context on @kn with
 * XATTR_CREATE.  Returns 0 or a negative errno.
 */
static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
                                        struct kernfs_node *kn)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 parent_sid, newsid, clen;
        char *context;
        int rc;

        /* First call only asks for the length of the parent's label. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0);
        if (rc == -ENODATA)
                return 0;
        if (rc < 0)
                return rc;

        clen = (u32)rc;
        context = kmalloc(clen, GFP_KERNEL);
        if (!context)
                return -ENOMEM;

        /* Fetch the label, map it to a SID, and drop the buffer either way. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen);
        if (rc >= 0)
                rc = security_context_to_sid(&selinux_state, context, clen,
                                             &parent_sid, GFP_KERNEL);
        kfree(context);
        if (rc)
                return rc;

        if (!tsec->create_sid) {
                struct qstr q;
                u16 secclass = inode_mode_to_security_class(kn->mode);

                q.name = kn->name;
                q.hash_len = hashlen_string(kn_dir, kn->name);

                rc = security_transition_sid(&selinux_state, tsec->sid,
                                             parent_sid, secclass, &q,
                                             &newsid);
                if (rc)
                        return rc;
        } else {
                /* An explicit creation SID overrides the transition. */
                newsid = tsec->create_sid;
        }

        rc = security_sid_to_context_force(&selinux_state, newsid,
                                           &context, &clen);
        if (rc)
                return rc;

        rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen,
                              XATTR_CREATE);
        kfree(context);
        return rc;
}


/* file security operations */

/*
 * Re-run the file permission check for @mask against the current
 * credentials, treating a write on an O_APPEND file as an append.
 */
static int selinux_revalidate_file_permission(struct file *file, int mask)
{
        struct inode *inode = file_inode(file);

        /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */
        if ((mask & MAY_WRITE) && (file->f_flags & O_APPEND))
                mask |= MAY_APPEND;

        return file_has_perm(current_cred(), file,
                             file_mask_to_av(inode->i_mode, mask));
}

static int selinux_file_permission(struct file *file, int mask)
{
        struct inode *inode = file_inode(file);
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        u32 sid = current_sid();

        if (!mask)
                /* No permission to check.  Existence test. */
                return 0;

        isec = inode_security(inode);
        if (sid == fsec->sid && fsec->isid == isec->sid &&
            fsec->pseqno == avc_policy_seqno(&selinux_state))
                /* No change since file_open check. */
                return 0;

        return selinux_revalidate_file_permission(file, mask);
}

/*
 * Initialize the file security struct: both the file SID and the
 * owner SID start out as the current task's SID.  Always returns 0.
 */
static int selinux_file_alloc_security(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        const u32 cur_sid = current_sid();

        fsec->sid = cur_sid;
        fsec->fown_sid = cur_sid;
        return 0;
}

/*
 * Check whether a task has the ioctl permission and cmd
 * operation to an inode.
 *
 * FD__USE is checked first when the caller's SID differs from the
 * file's SID.  Private inodes are then allowed without further
 * checks; for all others the extended permission check is made, using
 * the high byte of @cmd as the driver and the low byte as the xperm.
 */
static int ioctl_has_perm(const struct cred *cred, struct file *file,
                u32 requested, u16 cmd)
{
        struct common_audit_data ad;
        struct lsm_ioctlop_audit ioctl;
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct inode_security_struct *isec;
        const u32 ssid = cred_sid(cred);
        const u8 driver = cmd >> 8;
        const u8 xperm = cmd & 0xff;
        int rc;

        ioctl.cmd = cmd;
        ioctl.path = file->f_path;
        ad.type = LSM_AUDIT_DATA_IOCTL_OP;
        ad.u.op = &ioctl;

        if (ssid != fsec->sid) {
                rc = avc_has_perm(&selinux_state,
                                  ssid, fsec->sid,
                                  SECCLASS_FD,
                                  FD__USE,
                                  &ad);
                if (rc)
                        return rc;
        }

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        isec = inode_security(inode);
        return avc_has_extended_perms(&selinux_state,
                                      ssid, isec->sid, isec->sclass,
                                      requested, driver, xperm, &ad);
}

/*
 * Map an ioctl command to the permission it needs and check it.
 *
 * A handful of generic commands are mapped to getattr/setattr, a bare
 * fd-use check, or a capability check; everything else goes through
 * the per-command ioctl permission check.  @arg is not consulted.
 */
static int selinux_file_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case FIONREAD:
        case FIBMAP:
        case FIGETBSZ:
        case FS_IOC_GETFLAGS:
        case FS_IOC_GETVERSION:
                return file_has_perm(cred, file, FILE__GETATTR);

        case FS_IOC_SETFLAGS:
        case FS_IOC_SETVERSION:
                return file_has_perm(cred, file, FILE__SETATTR);

        /* sys_ioctl() checks */
        case FIONBIO:
        case FIOASYNC:
                return file_has_perm(cred, file, 0);

        case KDSKBENT:
        case KDSKBSENT:
                return cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
                                           CAP_OPT_NONE, true);

        case FIOCLEX:
        case FIONCLEX:
                /* Skipped entirely when the policy capability says so. */
                if (selinux_policycap_ioctl_skip_cloexec())
                        return 0;
                break;

        default:
                break;
        }

        /*
         * Remaining commands are assumed to go to the file's ioctl()
         * function and need the ioctl permission.
         */
        return ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
}

/*
 * Compat ioctl entry point: translate the 32-bit flag/version command
 * numbers to their native equivalents, then defer to selinux_file_ioctl().
 */
static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        unsigned int native_cmd = cmd;

        /*
         * If we are in a 64-bit kernel running 32-bit userspace, we need to
         * make sure we don't compare 32-bit flags to 64-bit flags.
         */
        if (cmd == FS_IOC32_GETFLAGS)
                native_cmd = FS_IOC_GETFLAGS;
        else if (cmd == FS_IOC32_SETFLAGS)
                native_cmd = FS_IOC_SETFLAGS;
        else if (cmd == FS_IOC32_GETVERSION)
                native_cmd = FS_IOC_GETVERSION;
        else if (cmd == FS_IOC32_SETVERSION)
                native_cmd = FS_IOC_SETVERSION;

        return selinux_file_ioctl(file, native_cmd, arg);
}

/*
 * When nonzero, file_map_prot_check() and selinux_file_mprotect() apply
 * their additional execmem/execheap/execstack/execmod checks.
 * Assigned outside this part of the file.
 */
static int default_noexec __ro_after_init;

static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);
        int rc = 0;

        if (default_noexec &&
            (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) ||
                                   (!shared && (prot & PROT_WRITE)))) {
                /*
                 * We are making executable an anonymous mapping or a
                 * private file mapping that will also be writable.
                 * This has an additional check.
                 */
                rc = avc_has_perm(&selinux_state,
                                  sid, sid, SECCLASS_PROCESS,
                                  PROCESS__EXECMEM, NULL);
                if (rc)
                        goto error;
        }

        if (file) {
                /* read access is always possible with a mapping */
                u32 av = FILE__READ;

                /* write access only matters if the mapping is shared */
                if (shared && (prot & PROT_WRITE))
                        av |= FILE__WRITE;

                if (prot & PROT_EXEC)
                        av |= FILE__EXECUTE;

                return file_has_perm(cred, file, av);
        }

error:
        return rc;
}

/*
 * Require MEMPROTECT__MMAP_ZERO for addresses below
 * CONFIG_LSM_MMAP_MIN_ADDR; other addresses are always allowed.
 */
static int selinux_mmap_addr(unsigned long addr)
{
        u32 sid;

        if (addr >= CONFIG_LSM_MMAP_MIN_ADDR)
                return 0;

        sid = current_sid();
        return avc_has_perm(&selinux_state,
                            sid, sid, SECCLASS_MEMPROTECT,
                            MEMPROTECT__MMAP_ZERO, NULL);
}

/*
 * Check an mmap request.
 *
 * A file-backed mapping first needs FILE__MAP on the inode.  The
 * protection checks then use @reqprot instead of @prot when
 * checkreqprot is enabled.
 */
static int selinux_mmap_file(struct file *file, unsigned long reqprot,
                             unsigned long prot, unsigned long flags)
{
        int shared = (flags & MAP_TYPE) == MAP_SHARED;

        if (file) {
                struct common_audit_data ad;
                int rc;

                ad.type = LSM_AUDIT_DATA_FILE;
                ad.u.file = file;
                rc = inode_has_perm(current_cred(), file_inode(file),
                                    FILE__MAP, &ad);
                if (rc)
                        return rc;
        }

        if (checkreqprot_get(&selinux_state))
                prot = reqprot;

        return file_map_prot_check(file, prot, shared);
}

static int selinux_file_mprotect(struct vm_area_struct *vma,
                                 unsigned long reqprot,
                                 unsigned long prot)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);

        if (checkreqprot_get(&selinux_state))
                prot = reqprot;

        if (default_noexec &&
            (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
                int rc = 0;
                if (vma->vm_start >= vma->vm_mm->start_brk &&
                    vma->vm_end <= vma->vm_mm->brk) {
                        rc = avc_has_perm(&selinux_state,
                                          sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECHEAP, NULL);
                } else if (!vma->vm_file &&
                           ((vma->vm_start <= vma->vm_mm->start_stack &&
                             vma->vm_end >= vma->vm_mm->start_stack) ||
                            vma_is_stack_for_current(vma))) {
                        rc = avc_has_perm(&selinux_state,
                                          sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECSTACK, NULL);
                } else if (vma->vm_file && vma->anon_vma) {
                        /*
                         * We are making executable a file mapping that has
                         * had some COW done. Since pages might have been
                         * written, check ability to execute the possibly
                         * modified content.  This typically should only
                         * occur for text relocations.
                         */
                        rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
                }
                if (rc)
                        return rc;
        }

        return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
}

/* Check FILE__LOCK on the file for the current credentials; @cmd is unused. */
static int selinux_file_lock(struct file *file, unsigned int cmd)
{
        return file_has_perm(current_cred(), file, FILE__LOCK);
}

/*
 * Check an fcntl command.
 *
 * Clearing O_APPEND needs FILE__WRITE, lock commands need FILE__LOCK,
 * and the listed owner/flag/signal commands only need fd use.
 * Commands not listed here are allowed (0).
 */
static int selinux_file_fcntl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case F_SETFL:
                /* Dropping O_APPEND turns an append-only fd into a writer. */
                if ((file->f_flags & O_APPEND) && !(arg & O_APPEND))
                        return file_has_perm(cred, file, FILE__WRITE);
                fallthrough;
        case F_SETOWN:
        case F_SETSIG:
        case F_GETFL:
        case F_GETOWN:
        case F_GETSIG:
        case F_GETOWNER_UIDS:
                /* Just check FD__USE permission */
                return file_has_perm(cred, file, 0);
        case F_GETLK:
        case F_SETLK:
        case F_SETLKW:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
#if BITS_PER_LONG == 32
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
#endif
                return file_has_perm(cred, file, FILE__LOCK);
        default:
                break;
        }

        return 0;
}

/* Record the current task's SID as the file's owner SID. */
static void selinux_file_set_fowner(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);

        fsec->fown_sid = current_sid();
}

/*
 * Check whether the file's recorded owner SID may deliver a signal to
 * @tsk.  A @signum of zero is checked as SIGIO.
 */
static int selinux_file_send_sigiotask(struct task_struct *tsk,
                                       struct fown_struct *fown, int signum)
{
        struct file_security_struct *fsec;
        struct file *file;
        u32 target_sid = task_sid(tsk);
        u32 perm;

        /* struct fown_struct is never outside the context of a struct file */
        file = container_of(fown, struct file, f_owner);
        fsec = selinux_file(file);

        /* no explicit signal means SIGIO, as per send_sigio_to_task */
        perm = signal_to_av(signum ? signum : SIGIO);

        return avc_has_perm(&selinux_state,
                            fsec->fown_sid, target_sid,
                            SECCLASS_PROCESS, perm, NULL);
}

/*
 * Check the permissions derived from the file by file_to_av() against
 * the current credentials.
 */
static int selinux_file_receive(struct file *file)
{
        return file_has_perm(current_cred(), file, file_to_av(file));
}

/*
 * Record open-time state in the file security struct, then check the
 * open permissions using the credentials stored in the file.
 */
static int selinux_file_open(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec = inode_security(file_inode(file));

        /*
         * Snapshot the inode label and the policy sequence number now,
         * so that selinux_file_permission can tell later whether a
         * revalidation is needed.  The task label is already stored in
         * the file security struct as its SID.
         */
        fsec->isid = isec->sid;
        fsec->pseqno = avc_policy_seqno(&selinux_state);

        /*
         * The inode label or policy seqno may have changed between the
         * selinux_inode_permission check and the snapshot above, so
         * recheck that access is still permitted.  Otherwise, access
         * might never be revalidated against the new inode label or
         * new policy.
         * This check is not redundant - do not remove.
         */
        return file_path_has_perm(file->f_cred, file, open_file_to_av(file));
}

/* task security operations */

/*
 * Check PROCESS__FORK for the current task against its own SID.
 * Neither @task nor @clone_flags is consulted.
 */
static int selinux_task_alloc(struct task_struct *task,
                              unsigned long clone_flags)
{
        const u32 self = current_sid();

        return avc_has_perm(&selinux_state,
                            self, self, SECCLASS_PROCESS, PROCESS__FORK, NULL);
}

/*
 * prepare a new set of credentials for modification
 *
 * Copies the whole SELinux task security struct from @old to @new.
 * Always returns 0; @gfp is unused.
 */
static int selinux_cred_prepare(struct cred *new, const struct cred *old,
                                gfp_t gfp)
{
        *selinux_cred(new) = *selinux_cred(old);

        return 0;
}

/*
 * transfer the SELinux data to a blank set of creds
 */
static void selinux_cred_transfer(struct cred *new, const struct cred *old)
{
        const struct task_security_struct *old_tsec = selinux_cred(old);
        struct task_security_struct *tsec = selinux_cred(new);

        *tsec = *old_tsec;
}

/* Store the SID of credentials @c in @secid. */
static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
{
        const u32 sid = cred_sid(c);

        *secid = sid;
}

/*
 * set the security data for a kernel service
 * - all the creation contexts are set to unlabelled
 *
 * @new is modified only if the current task holds
 * KERNEL_SERVICE__USE_AS_OVERRIDE towards @secid; otherwise the error
 * is returned and @new is left as it was.
 */
static int selinux_kernel_act_as(struct cred *new, u32 secid)
{
        struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int err;

        err = avc_has_perm(&selinux_state,
                           sid, secid,
                           SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__USE_AS_OVERRIDE,
                           NULL);
        if (err)
                return err;

        tsec->sid = secid;
        tsec->create_sid = 0;
        tsec->keycreate_sid = 0;
        tsec->sockcreate_sid = 0;
        return 0;
}

/*
 * set the file creation context in a security record to the same as the
 * objective context of the specified inode
 *
 * create_sid is updated only if the current task holds
 * KERNEL_SERVICE__CREATE_FILES_AS towards the inode's SID.
 */
static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        struct inode_security_struct *isec = inode_security(inode);
        struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int err;

        err = avc_has_perm(&selinux_state,
                           sid, isec->sid,
                           SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__CREATE_FILES_AS,
                           NULL);
        if (err)
                return err;

        tsec->create_sid = isec->sid;
        return 0;
}

static int selinux_kernel_module_request(char *kmod_name)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_KMOD;
        ad.u.kmod_name = kmod_name;

        return avc_has_perm(&selinux_state,
                            current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_REQUEST, &ad);
}

/*
 * Check SYSTEM__MODULE_LOAD for the current task.
 *
 * With no file (init_module) the check is made against the task's own
 * SID.  With a file (finit_module) FD__USE is checked first when the
 * file's SID differs from the task's, and the load permission is then
 * checked against the inode's SID.
 */
static int selinux_kernel_module_from_file(struct file *file)
{
        struct common_audit_data ad;
        struct inode_security_struct *isec;
        struct file_security_struct *fsec;
        u32 sid = current_sid();
        int rc;

        /* init_module */
        if (!file)
                return avc_has_perm(&selinux_state,
                                    sid, sid, SECCLASS_SYSTEM,
                                    SYSTEM__MODULE_LOAD, NULL);

        /* finit_module */
        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        fsec = selinux_file(file);
        if (fsec->sid != sid) {
                rc = avc_has_perm(&selinux_state,
                                  sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

        isec = inode_security(file_inode(file));
        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_LOAD, &ad);
}

/*
 * Only READING_MODULE is checked: the module-load check is made
 * against @file when @contents is true, or with no file otherwise.
 * Every other id is allowed (0).
 */
static int selinux_kernel_read_file(struct file *file,
                                    enum kernel_read_file_id id,
                                    bool contents)
{
        if (id == READING_MODULE)
                return selinux_kernel_module_from_file(contents ? file : NULL);

        return 0;
}

/*
 * Only LOADING_MODULE is checked, using the file-less module-load
 * check; every other id is allowed (0).  @contents is not consulted.
 */
static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = 0;

        switch (id) {
        case LOADING_MODULE:
                rc = selinux_kernel_module_from_file(NULL);
                /* explicit break: do not fall through into default */
                break;
        default:
                break;
        }

        return rc;
}

/* Check PROCESS__SETPGID from the current task towards @p; @pgid is unused. */
static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
{
        const u32 ssid = current_sid();
        const u32 tsid = task_sid(p);

        return avc_has_perm(&selinux_state, ssid, tsid,
                            SECCLASS_PROCESS, PROCESS__SETPGID, NULL);
}

static int selinux_task_getpgid(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETPGID, NULL);
}

static int selinux_task_getsid(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSESSION, NULL);
}

/* Store the SID of task @p in @secid. */
static void selinux_task_getsecid(struct task_struct *p, u32 *secid)
{
        const u32 sid = task_sid(p);

        *secid = sid;
}

static int selinux_task_setnice(struct task_struct *p, int nice)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_setioprio(struct task_struct *p, int ioprio)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getioprio(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

/*
 * Check rlimit access from @cred towards @tcred.
 *
 * No flags means nothing to check.  LSM_PRLIMIT_WRITE maps to
 * PROCESS__SETRLIMIT and LSM_PRLIMIT_READ to PROCESS__GETRLIMIT; both
 * may be requested together.
 */
static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred,
                                unsigned int flags)
{
        u32 av;

        if (flags == 0)
                return 0;

        av = (flags & LSM_PRLIMIT_WRITE) ? PROCESS__SETRLIMIT : 0;
        if (flags & LSM_PRLIMIT_READ)
                av |= PROCESS__GETRLIMIT;

        return avc_has_perm(&selinux_state,
                            cred_sid(cred), cred_sid(tcred),
                            SECCLASS_PROCESS, av, NULL);
}

/*
 * Check a resource limit change on @p.  Only a change of the hard
 * limit is checked (PROCESS__SETRLIMIT); soft-limit-only changes are
 * always allowed.
 */
static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        struct rlimit *cur_rlim = &p->signal->rlim[resource];

        if (cur_rlim->rlim_max == new_rlim->rlim_max)
                return 0;

        /*
         * Control the ability to change the hard limit (whether
         * lowering or raising it), so that the hard limit can
         * later be used as a safe reset point for the soft limit
         * upon context transitions.  See selinux_bprm_committing_creds.
         */
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p),
                            SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL);
}

static int selinux_task_setscheduler(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getscheduler(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

static int selinux_task_movememory(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check permission to send signal @sig to @p.
 *
 * The sender is @cred when given, otherwise the current task.  A zero
 * @sig is the null signal and maps to PROCESS__SIGNULL.  @info is not
 * consulted.
 */
static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                                int sig, const struct cred *cred)
{
        /* null signal; existence test */
        u32 perm = sig ? signal_to_av(sig) : PROCESS__SIGNULL;
        u32 secid = cred ? cred_sid(cred) : current_sid();

        return avc_has_perm(&selinux_state,
                            secid, task_sid(p), SECCLASS_PROCESS, perm, NULL);
}

/*
 * Label @inode with the SID of task @p: the class is derived from the
 * inode mode and the label is marked initialized, all under the inode
 * security lock.
 */
static void selinux_task_to_inode(struct task_struct *p,
                                  struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        const u32 owner_sid = task_sid(p);

        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = owner_sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Record the IPv4 source/destination addresses of @skb in @ad and, for
 * TCP, UDP, DCCP and (when configured) SCTP packets whose frag_off has no
 * IP_OFFSET bits set, the transport ports as well.  The IP protocol number
 * is stored in @proto when that pointer is non-NULL.
 *
 * Returns error only if unable to parse addresses; a missing or truncated
 * transport header simply leaves the ports untouched.
 */
static int selinux_parse_skb_ipv4(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct iphdr iph_buf, *iph;
        int nh_off, hdr_len, l4_off;

        nh_off = skb_network_offset(skb);
        iph = skb_header_pointer(skb, nh_off, sizeof(iph_buf), &iph_buf);
        if (!iph)
                return -EINVAL;

        /* ihl counts 32-bit words; reject headers shorter than struct iphdr */
        hdr_len = iph->ihl * 4;
        if (hdr_len < sizeof(iph_buf))
                return -EINVAL;

        ad->u.net->v4info.saddr = iph->saddr;
        ad->u.net->v4info.daddr = iph->daddr;

        if (proto)
                *proto = iph->protocol;

        /* Addresses are recorded; nothing below can turn this into an error. */
        l4_off = nh_off + hdr_len;

        switch (iph->protocol) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcph;

                if (ntohs(iph->frag_off) & IP_OFFSET)
                        break;

                tcph = skb_header_pointer(skb, l4_off, sizeof(tcp_buf),
                                          &tcp_buf);
                if (tcph) {
                        ad->u.net->sport = tcph->source;
                        ad->u.net->dport = tcph->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udph;

                if (ntohs(iph->frag_off) & IP_OFFSET)
                        break;

                udph = skb_header_pointer(skb, l4_off, sizeof(udp_buf),
                                          &udp_buf);
                if (udph) {
                        ad->u.net->sport = udph->source;
                        ad->u.net->dport = udph->dest;
                }
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr dccp_buf, *dccph;

                if (ntohs(iph->frag_off) & IP_OFFSET)
                        break;

                dccph = skb_header_pointer(skb, l4_off, sizeof(dccp_buf),
                                           &dccp_buf);
                if (dccph) {
                        ad->u.net->sport = dccph->dccph_sport;
                        ad->u.net->dport = dccph->dccph_dport;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctph;

                if (ntohs(iph->frag_off) & IP_OFFSET)
                        break;

                sctph = skb_header_pointer(skb, l4_off, sizeof(sctp_buf),
                                           &sctp_buf);
                if (sctph) {
                        ad->u.net->sport = sctph->source;
                        ad->u.net->dport = sctph->dest;
                }
                break;
        }
#endif
        default:
                break;
        }

        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)

/*
 * Record the IPv6 source/destination addresses of @skb in @ad, skip the
 * extension headers with ipv6_skip_exthdr(), and for TCP, UDP, DCCP and
 * (when configured) SCTP record the transport ports.  The resulting
 * next-header value is stored in @proto when that pointer is non-NULL.
 *
 * Returns error only if unable to parse addresses; a failure while skipping
 * extension headers or reading the transport header still returns 0.
 */
static int selinux_parse_skb_ipv6(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct ipv6hdr ip6_buf, *ip6h;
        __be16 frag_off;
        u8 nexthdr;
        int off;

        off = skb_network_offset(skb);
        ip6h = skb_header_pointer(skb, off, sizeof(ip6_buf), &ip6_buf);
        if (!ip6h)
                return -EINVAL;

        ad->u.net->v6info.saddr = ip6h->saddr;
        ad->u.net->v6info.daddr = ip6h->daddr;

        nexthdr = ip6h->nexthdr;
        off += sizeof(ip6_buf);
        off = ipv6_skip_exthdr(skb, off, &nexthdr, &frag_off);
        if (off < 0)
                return 0;

        if (proto)
                *proto = nexthdr;

        switch (nexthdr) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcph;

                tcph = skb_header_pointer(skb, off, sizeof(tcp_buf), &tcp_buf);
                if (tcph) {
                        ad->u.net->sport = tcph->source;
                        ad->u.net->dport = tcph->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udph;

                udph = skb_header_pointer(skb, off, sizeof(udp_buf), &udp_buf);
                if (udph) {
                        ad->u.net->sport = udph->source;
                        ad->u.net->dport = udph->dest;
                }
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr dccp_buf, *dccph;

                dccph = skb_header_pointer(skb, off, sizeof(dccp_buf),
                                           &dccp_buf);
                if (dccph) {
                        ad->u.net->sport = dccph->dccph_sport;
                        ad->u.net->dport = dccph->dccph_dport;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctph;

                sctph = skb_header_pointer(skb, off, sizeof(sctp_buf),
                                           &sctp_buf);
                if (sctph) {
                        ad->u.net->sport = sctph->source;
                        ad->u.net->dport = sctph->dest;
                }
                break;
        }
#endif
        /* includes fragments */
        default:
                break;
        }

        return 0;
}

#endif /* IPV6 */

/*
 * Parse @skb according to ad->u.net->family and fill in @ad.
 *
 * On success, *@_addrp (when @_addrp is non-NULL) points at the source
 * address inside @ad if @src is non-zero, otherwise at the destination
 * address; for families other than PF_INET/PF_INET6 it is set to NULL.
 * On a parse failure a warning is logged, the parser's error is returned
 * and *@_addrp is left untouched.
 */
static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad,
                             char **_addrp, int src, u8 *proto)
{
        char *addrp = NULL;
        int ret = 0;

        switch (ad->u.net->family) {
        case PF_INET:
                ret = selinux_parse_skb_ipv4(skb, ad, proto);
                if (!ret)
                        addrp = (char *)(src ? &ad->u.net->v4info.saddr :
                                               &ad->u.net->v4info.daddr);
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
                ret = selinux_parse_skb_ipv6(skb, ad, proto);
                if (!ret)
                        addrp = (char *)(src ? &ad->u.net->v6info.saddr :
                                               &ad->u.net->v6info.daddr);
                break;
#endif        /* IPV6 */
        default:
                break;
        }

        if (ret) {
                pr_warn(
                       "SELinux: failure in selinux_parse_skb(),"
                       " unable to parse packet\n");
                return ret;
        }

        if (_addrp)
                *_addrp = addrp;
        return 0;
}

/**
 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
 * @skb: the packet
 * @family: protocol family
 * @sid: the packet's peer label SID
 *
 * Description:
 * Query selinux_xfrm_skb_sid() and selinux_netlbl_skbuff_getsid() for the
 * labels carried by @skb and pass both to security_net_peersid_resolve(),
 * which produces the single peer SID stored in @sid.  The function returns
 * zero if the value in @sid is valid (although it may be SECSID_NULL) or
 * -EACCES if either lookup or the resolution step fails; a resolution
 * failure is additionally logged.
 *
 */
static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
        u32 xfrm_sid;
        u32 nlbl_sid;
        u32 nlbl_type;

        if (unlikely(selinux_xfrm_skb_sid(skb, &xfrm_sid)))
                return -EACCES;

        if (unlikely(selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type,
                                                  &nlbl_sid)))
                return -EACCES;

        if (likely(!security_net_peersid_resolve(&selinux_state, nlbl_sid,
                                                 nlbl_type, xfrm_sid, sid)))
                return 0;

        pr_warn(
               "SELinux: failure in selinux_skb_peerlbl_sid(),"
               " unable to determine packet's peer label\n");
        return -EACCES;
}

/**
 * selinux_conn_sid - Determine the child socket label for a connection
 * @sk_sid: the parent socket's SID
 * @skb_sid: the packet's SID
 * @conn_sid: the resulting connection SID
 *
 * When @skb_sid is SECSID_NULL, @conn_sid is simply a copy of @sk_sid.
 * Otherwise @conn_sid is produced by security_sid_mls_copy(), which combines
 * the user:role:type information from @sk_sid with the MLS information from
 * @skb_sid.  Returns zero on success, negative values on failure.
 *
 */
static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid)
{
        if (skb_sid == SECSID_NULL) {
                *conn_sid = sk_sid;
                return 0;
        }

        return security_sid_mls_copy(&selinux_state, sk_sid, skb_sid,
                                     conn_sid);
}

/* socket security operations */

/*
 * Choose the SID for a socket created by the task described by @tsec.
 *
 * An explicitly configured tsec->sockcreate_sid wins; otherwise the SID is
 * computed by security_transition_sid() using tsec->sid as both source and
 * target for class @secclass.  The result is stored in @socksid.
 * Returns 0 on success or the error from security_transition_sid().
 */
static int socket_sockcreate_sid(const struct task_security_struct *tsec,
                                 u16 secclass, u32 *socksid)
{
        if (tsec->sockcreate_sid <= SECSID_NULL)
                return security_transition_sid(&selinux_state, tsec->sid,
                                               tsec->sid, secclass, NULL,
                                               socksid);

        *socksid = tsec->sockcreate_sid;
        return 0;
}

/*
 * Check whether the current task holds @perms on socket @sk.
 *
 * Sockets labeled SECINITSID_KERNEL are always allowed.  For all others
 * the check runs against the socket's SID and class, with @sk attached to
 * the audit record.
 */
static int sock_has_perm(struct sock *sk, u32 perms)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;

        if (sksec->sid == SECINITSID_KERNEL)
                return 0;

        net.sk = sk;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        return avc_has_perm(&selinux_state, current_sid(), sksec->sid,
                            sksec->sclass, perms, &ad);
}

/*
 * Check whether the current task may create a socket of the given
 * @family/@type/@protocol.  Sockets created with @kern set are not checked.
 * Returns 0 when allowed, otherwise the error from the SID computation or
 * from the SOCKET__CREATE permission check.
 */
static int selinux_socket_create(int family, int type,
                                 int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u16 sclass;
        u32 sock_sid;
        int err;

        if (kern)
                return 0;

        sclass = socket_type_to_security_class(family, type, protocol);

        err = socket_sockcreate_sid(tsec, sclass, &sock_sid);
        if (err)
                return err;

        return avc_has_perm(&selinux_state, tsec->sid, sock_sid, sclass,
                            SOCKET__CREATE, NULL);
}

/*
 * Label a freshly created socket.
 *
 * The socket inode, and the struct sock if one is attached, receive the
 * security class derived from @family/@type/@protocol and a SID that is
 * SECINITSID_KERNEL when @kern is set, or the value computed by
 * socket_sockcreate_sid() otherwise.  Returns 0 on success, the error from
 * the SID computation, or the result of
 * selinux_netlbl_socket_post_create().
 */
static int selinux_socket_post_create(struct socket *sock, int family,
                                      int type, int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
        u16 sclass = socket_type_to_security_class(family, type, protocol);
        u32 sid = SECINITSID_KERNEL;
        struct sk_security_struct *sksec;

        if (!kern) {
                int rc = socket_sockcreate_sid(tsec, sclass, &sid);

                if (rc)
                        return rc;
        }

        isec->sclass = sclass;
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;

        if (!sock->sk)
                return 0;

        sksec = sock->sk->sk_security;
        sksec->sclass = sclass;
        sksec->sid = sid;
        /* Allows detection of the first association on this socket */
        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                sksec->sctp_assoc_state = SCTP_ASSOC_UNSET;

        return selinux_netlbl_socket_post_create(sock->sk, family);
}

/* Cross-link a socket pair: each end records the other end's SID as peer. */
static int selinux_socket_socketpair(struct socket *socka,
                                     struct socket *sockb)
{
        struct sk_security_struct *sec_a = socka->sk->sk_security;
        struct sk_security_struct *sec_b = sockb->sk->sk_security;

        sec_a->peer_sid = sec_b->sid;
        sec_b->peer_sid = sec_a->sid;

        return 0;
}

/*
 * Check permission to bind @sock to @address.
 *
 * SOCKET__BIND is checked for every socket.  For PF_INET/PF_INET6 sockets
 * the address is then validated and up to two further checks are made:
 *  - SOCKET__NAME_BIND against the port's SID, but only for a non-zero port
 *    for which inet_port_requires_bind_service() is true or which lies
 *    outside the range reported by inet_get_local_port_range();
 *  - a class-specific NODE_BIND permission against the SID of the node
 *    (address) being bound to.
 *
 * Returns 0 when allowed, -EINVAL for a too-short @addrlen, -EINVAL (SCTP
 * sockets) or -EAFNOSUPPORT (all others) for an unacceptable address
 * family, or the error from a SID lookup or permission check.
 */

static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family;
        int err;

        err = sock_has_perm(sk, SOCKET__BIND);
        if (err)
                goto out;

        /* If PF_INET or PF_INET6, check name_bind permission for the port. */
        family = sk->sk_family;
        if (family == PF_INET || family == PF_INET6) {
                char *addrp;
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                u16 family_sa;
                unsigned short snum;
                u32 sid, node_perm;

                /*
                 * sctp_bindx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple binding addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                /* sa_family itself must be readable before it is examined */
                if (addrlen < offsetofend(struct sockaddr, sa_family))
                        return -EINVAL;
                family_sa = address->sa_family;
                switch (family_sa) {
                case AF_UNSPEC:
                case AF_INET:
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        addr4 = (struct sockaddr_in *)address;
                        if (family_sa == AF_UNSPEC) {
                                if (family == PF_INET6) {
                                        /* Length check from inet6_bind_sk() */
                                        if (addrlen < SIN6_LEN_RFC2133)
                                                return -EINVAL;
                                        /* Family check from __inet6_bind() */
                                        goto err_af;
                                }
                                /* see __inet_bind(), we only want to allow
                                 * AF_UNSPEC if the address is INADDR_ANY
                                 */
                                if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
                                        goto err_af;
                                /* from here on treat the address as AF_INET */
                                family_sa = AF_INET;
                        }
                        snum = ntohs(addr4->sin_port);
                        addrp = (char *)&addr4->sin_addr.s_addr;
                        break;
                case AF_INET6:
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        addr6 = (struct sockaddr_in6 *)address;
                        snum = ntohs(addr6->sin6_port);
                        addrp = (char *)&addr6->sin6_addr.s6_addr;
                        break;
                default:
                        goto err_af;
                }

                /* audit data shared by the port and node checks below */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sport = htons(snum);
                ad.u.net->family = family_sa;

                /* port 0 is never subject to the name_bind check */
                if (snum) {
                        int low, high;

                        inet_get_local_port_range(sock_net(sk), &low, &high);

                        if (inet_port_requires_bind_service(sock_net(sk), snum) ||
                            snum < low || snum > high) {
                                err = sel_netport_sid(sk->sk_protocol,
                                                      snum, &sid);
                                if (err)
                                        goto out;
                                err = avc_has_perm(&selinux_state,
                                                   sksec->sid, sid,
                                                   sksec->sclass,
                                                   SOCKET__NAME_BIND, &ad);
                                if (err)
                                        goto out;
                        }
                }

                /* pick the node_bind permission matching the socket class */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        node_perm = TCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_UDP_SOCKET:
                        node_perm = UDP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_DCCP_SOCKET:
                        node_perm = DCCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_SCTP_SOCKET:
                        node_perm = SCTP_SOCKET__NODE_BIND;
                        break;

                default:
                        node_perm = RAWIP_SOCKET__NODE_BIND;
                        break;
                }

                /* 'sid' is reused: it now holds the node's SID */
                err = sel_netnode_sid(addrp, family_sa, &sid);
                if (err)
                        goto out;

                /* record the bound address for auditing */
                if (family_sa == AF_INET)
                        ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
                else
                        ad.u.net->v6info.saddr = addr6->sin6_addr;

                err = avc_has_perm(&selinux_state,
                                   sksec->sid, sid,
                                   sksec->sclass, node_perm, &ad);
                if (err)
                        goto out;
        }
out:
        return err;
err_af:
        /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                return -EINVAL;
        return -EAFNOSUPPORT;
}

/* This supports connect(2) and SCTP connect services such as sctp_connectx(3)
 * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst
 *
 * SOCKET__CONNECT is checked for every socket.  An AF_UNSPEC address is
 * then accepted without further checks.  For TCP, DCCP and SCTP sockets the
 * destination port's SID is looked up and the class-specific NAME_CONNECT
 * permission is checked against it.
 *
 * Returns 0 when allowed, -EINVAL for a too-short @addrlen, -EINVAL (SCTP)
 * or -EAFNOSUPPORT (TCP/DCCP) for an address family other than
 * AF_INET/AF_INET6, or the error from the SID lookup or permission checks.
 */
static int selinux_socket_connect_helper(struct socket *sock,
                                         struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = sk->sk_security;
        int err;

        err = sock_has_perm(sk, SOCKET__CONNECT);
        if (err)
                return err;
        /* sa_family itself must be readable before it is examined */
        if (addrlen < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        /* connect(AF_UNSPEC) has special handling, as it is a documented
         * way to disconnect the socket
         */
        if (address->sa_family == AF_UNSPEC)
                return 0;

        /*
         * If a TCP, DCCP or SCTP socket, check name_connect permission
         * for the port.
         */
        if (sksec->sclass == SECCLASS_TCP_SOCKET ||
            sksec->sclass == SECCLASS_DCCP_SOCKET ||
            sksec->sclass == SECCLASS_SCTP_SOCKET) {
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                unsigned short snum;
                u32 sid, perm;

                /* sctp_connectx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple connect addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                switch (address->sa_family) {
                case AF_INET:
                        addr4 = (struct sockaddr_in *)address;
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        snum = ntohs(addr4->sin_port);
                        break;
                case AF_INET6:
                        addr6 = (struct sockaddr_in6 *)address;
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        snum = ntohs(addr6->sin6_port);
                        break;
                default:
                        /* Note that SCTP services expect -EINVAL, whereas
                         * others expect -EAFNOSUPPORT.
                         */
                        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                                return -EINVAL;
                        else
                                return -EAFNOSUPPORT;
                }

                err = sel_netport_sid(sk->sk_protocol, snum, &sid);
                if (err)
                        return err;

                /*
                 * The enclosing condition admits only these three classes,
                 * so 'perm' is always assigned here.
                 */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        perm = TCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_DCCP_SOCKET:
                        perm = DCCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_SCTP_SOCKET:
                        perm = SCTP_SOCKET__NAME_CONNECT;
                        break;
                }

                /* audit record carries the destination port and family */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->dport = htons(snum);
                ad.u.net->family = address->sa_family;
                err = avc_has_perm(&selinux_state,
                                   sksec->sid, sid, sksec->sclass, perm, &ad);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Supports connect(2), see comments in selinux_socket_connect_helper().
 * Once the helper's checks pass, the result of
 * selinux_netlbl_socket_connect() is returned.
 */
static int selinux_socket_connect(struct socket *sock,
                                  struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        int rc;

        rc = selinux_socket_connect_helper(sock, address, addrlen);

        return rc ? rc : selinux_netlbl_socket_connect(sk, address);
}

/* Require SOCKET__LISTEN on the socket; @backlog is not examined. */
static int selinux_socket_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__LISTEN);
}

/*
 * Check SOCKET__ACCEPT on the listening socket and, if allowed, copy the
 * listening socket inode's class and SID onto @newsock's inode, marking it
 * initialized.  Returns 0 on success or the permission-check error.
 */
static int selinux_socket_accept(struct socket *sock, struct socket *newsock)
{
        struct inode_security_struct *parent_isec;
        struct inode_security_struct *child_isec;
        u16 parent_class;
        u32 parent_sid;
        int rc;

        rc = sock_has_perm(sock->sk, SOCKET__ACCEPT);
        if (rc)
                return rc;

        /* Take a consistent snapshot of the parent's label under its lock. */
        parent_isec = inode_security_novalidate(SOCK_INODE(sock));
        spin_lock(&parent_isec->lock);
        parent_class = parent_isec->sclass;
        parent_sid = parent_isec->sid;
        spin_unlock(&parent_isec->lock);

        child_isec = inode_security_novalidate(SOCK_INODE(newsock));
        child_isec->sclass = parent_class;
        child_isec->sid = parent_sid;
        child_isec->initialized = LABEL_INITIALIZED;

        return 0;
}

/* Require SOCKET__WRITE on the socket; @msg and @size are not examined. */
static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                  int size)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__WRITE);
}

/* Require SOCKET__READ on the socket; the message arguments are not examined. */
static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                                  int size, int flags)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__READ);
}

/* Require SOCKET__GETATTR on the socket. */
static int selinux_socket_getsockname(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/* Require SOCKET__GETATTR on the socket (same permission as getsockname). */
static int selinux_socket_getpeername(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/*
 * Require SOCKET__SETOPT on the socket, then return the result of
 * selinux_netlbl_socket_setsockopt() for this @level/@optname.
 */
static int selinux_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = sock_has_perm(sock->sk, SOCKET__SETOPT);

        return rc ? rc : selinux_netlbl_socket_setsockopt(sock, level, optname);
}

/* Require SOCKET__GETOPT on the socket; @level and @optname are not examined. */
static int selinux_socket_getsockopt(struct socket *sock, int level,
                                     int optname)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETOPT);
}

/* Require SOCKET__SHUTDOWN on the socket; @how is not examined. */
static int selinux_socket_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__SHUTDOWN);
}

/*
 * Check whether @sock may connect to the UNIX stream socket @other and, if
 * so, label the new server-side socket @newsk.
 *
 * @newsk's peer SID becomes @sock's SID and its own SID is computed by
 * security_sid_mls_copy() from @other's and @sock's SIDs; @sock's peer SID
 * is then set to @newsk's SID.  Returns 0 on success or the error from the
 * CONNECTTO check or the SID computation.
 */
static int selinux_socket_unix_stream_connect(struct sock *sock,
                                              struct sock *other,
                                              struct sock *newsk)
{
        struct sk_security_struct *client_sec = sock->sk_security;
        struct sk_security_struct *listener_sec = other->sk_security;
        struct sk_security_struct *child_sec = newsk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        int rc;

        net.sk = other;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        rc = avc_has_perm(&selinux_state, client_sec->sid, listener_sec->sid,
                          listener_sec->sclass,
                          UNIX_STREAM_SOCKET__CONNECTTO, &ad);
        if (rc)
                return rc;

        /* server child socket */
        child_sec->peer_sid = client_sec->sid;
        rc = security_sid_mls_copy(&selinux_state, listener_sec->sid,
                                   client_sec->sid, &child_sec->sid);
        if (rc)
                return rc;

        /* connecting socket */
        client_sec->peer_sid = child_sec->sid;

        return 0;
}

/*
 * Check SOCKET__SENDTO from @sock's SID towards @other's SID and class,
 * with @other's struct sock attached to the audit record.
 */
static int selinux_socket_unix_may_send(struct socket *sock,
                                        struct socket *other)
{
        struct sk_security_struct *sender_sec = sock->sk->sk_security;
        struct sk_security_struct *target_sec = other->sk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;

        net.sk = other->sk;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        return avc_has_perm(&selinux_state, sender_sec->sid, target_sec->sid,
                            target_sec->sclass, SOCKET__SENDTO, &ad);
}

/*
 * Check that a packet labeled @peer_sid may enter through interface
 * @ifindex (NETIF__INGRESS) and be received from the node at @addrp
 * (NODE__RECVFROM).  The interface and node SIDs are looked up first.
 * Returns 0 when both checks pass, otherwise the first error encountered.
 */
static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex,
                                    char *addrp, u16 family, u32 peer_sid,
                                    struct common_audit_data *ad)
{
        u32 netif_sid;
        u32 netnode_sid;
        int rc;

        rc = sel_netif_sid(ns, ifindex, &netif_sid);
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state, peer_sid, netif_sid,
                          SECCLASS_NETIF, NETIF__INGRESS, ad);
        if (rc)
                return rc;

        rc = sel_netnode_sid(addrp, family, &netnode_sid);
        if (rc)
                return rc;

        return avc_has_perm(&selinux_state, peer_sid, netnode_sid,
                            SECCLASS_NODE, NODE__RECVFROM, ad);
}

/*
 * Receive-path check used by selinux_socket_sock_rcv_skb() when the netpeer
 * policy capability is not enabled.
 *
 * After parsing @skb, this runs the PACKET__RECV check (only when secmark
 * is enabled), then selinux_netlbl_sock_rcv_skb(), then
 * selinux_xfrm_sock_rcv_skb().  The first failure is returned.
 */
static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
                                       u16 family)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        u32 sk_sid = sksec->sid;
        char *addrp;
        int rc;

        net.netif = skb->skb_iif;
        net.family = family;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (selinux_secmark_enabled()) {
                rc = avc_has_perm(&selinux_state, sk_sid, skb->secmark,
                                  SECCLASS_PACKET, PACKET__RECV, &ad);
                if (rc)
                        return rc;
        }

        rc = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
        if (rc)
                return rc;

        return selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
}

static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err;
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family = sk->sk_family;
        u32 sk_sid = sksec->sid;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        char *addrp;
        u8 secmark_active;
        u8 peerlbl_active;

        if (family != PF_INET && family != PF_INET6)
                return 0;

        /* Handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_sock_rcv_skb_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_sock_rcv_skb_compat(sk, skb, family);

        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return 0;

        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->netif = skb->skb_iif;
        ad.u.net->family = family;
        err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (err)
                return err;

        if (peerlbl_active) {
                u32 peer_sid;

                err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
                if (err)
                        return err;
                err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif,
                                               addrp, family, peer_sid, &ad);
                if (err) {
                        selinux_netlbl_err(skb, family, err, 0);
                        return err;
                }
                err = avc_has_perm(&selinux_state,
                                   sk_sid, peer_sid, SECCLASS_PEER,
                                   PEER__RECV, &ad);
                if (err) {
                        selinux_netlbl_err(skb, family, err, 0);
                        return err;
                }
        }

        if (secmark_active) {
                err = avc_has_perm(&selinux_state,
                                   sk_sid, skb->secmark, SECCLASS_PACKET,
                                   PACKET__RECV, &ad);
                if (err)
                        return err;
        }

        return err;
}

/*
 * Copy the security context string of @sock's peer to @optval and its
 * length to @optlen.
 *
 * Only UNIX stream, TCP and SCTP sockets carry a peer SID here; for any
 * other class, or when the peer SID is SECSID_NULL, -ENOPROTOOPT is
 * returned.  If the context does not fit in @len bytes, -ERANGE is returned
 * but the required length is still written to @optlen.  A failed copy
 * yields -EFAULT.
 */
static int selinux_socket_getpeersec_stream(struct socket *sock,
                                            sockptr_t optval, sockptr_t optlen,
                                            unsigned int len)
{
        struct sk_security_struct *sksec = sock->sk->sk_security;
        u32 peer_sid = SECSID_NULL;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        if (sksec->sclass == SECCLASS_UNIX_STREAM_SOCKET ||
            sksec->sclass == SECCLASS_TCP_SOCKET ||
            sksec->sclass == SECCLASS_SCTP_SOCKET)
                peer_sid = sksec->peer_sid;
        if (peer_sid == SECSID_NULL)
                return -ENOPROTOOPT;

        rc = security_sid_to_context(&selinux_state, peer_sid, &ctx,
                                     &ctx_len);
        if (rc)
                return rc;

        if (ctx_len > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, ctx, ctx_len))
                rc = -EFAULT;

        /* The length is reported even when the buffer was too small. */
        if (copy_to_sockptr(optlen, &ctx_len, sizeof(ctx_len)))
                rc = -EFAULT;

        kfree(ctx);
        return rc;
}

/*
 * Report the peer SID for a datagram in @secid.
 *
 * The family is taken from @skb's protocol (IPv4/IPv6) when possible,
 * otherwise from @sock.  For a PF_UNIX @sock the socket inode's SID is
 * used; otherwise, when @skb is given, the packet's peer label is resolved
 * with selinux_skb_peerlbl_sid().  @secid is always written.  Returns 0
 * when the resulting SID is not SECSID_NULL and -EINVAL when it is.
 */
static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
{
        u32 peer_secid = SECSID_NULL;
        u16 family;

        if (skb && skb->protocol == htons(ETH_P_IP)) {
                family = PF_INET;
        } else if (skb && skb->protocol == htons(ETH_P_IPV6)) {
                family = PF_INET6;
        } else if (sock) {
                family = sock->sk->sk_family;
        } else {
                /* No way to determine a family: report "no peer label". */
                *secid = SECSID_NULL;
                return -EINVAL;
        }

        if (sock && family == PF_UNIX) {
                struct inode_security_struct *isec;

                isec = inode_security_novalidate(SOCK_INODE(sock));
                peer_secid = isec->sid;
        } else if (skb) {
                /* the return value is ignored here, as in the original */
                selinux_skb_peerlbl_sid(skb, family, &peer_secid);
        }

        *secid = peer_secid;
        return peer_secid == SECSID_NULL ? -EINVAL : 0;
}

/*
 * Allocate a zeroed sk_security_struct with the @priority GFP flags and
 * attach it to @sk.  The new blob starts with both SIDs set to
 * SECINITSID_UNLABELED and the generic SECCLASS_SOCKET class.  @family is
 * not used here.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
{
        struct sk_security_struct *blob = kzalloc(sizeof(*blob), priority);

        if (blob == NULL)
                return -ENOMEM;

        blob->sclass = SECCLASS_SOCKET;
        blob->sid = SECINITSID_UNLABELED;
        blob->peer_sid = SECINITSID_UNLABELED;
        selinux_netlbl_sk_security_reset(blob);

        sk->sk_security = blob;
        return 0;
}

/*
 * Release the security blob attached to @sk.  The pointer on the sock is
 * cleared first, then the NetLabel state and the blob itself are freed.
 */
static void selinux_sk_free_security(struct sock *sk)
{
        struct sk_security_struct *blob;

        blob = sk->sk_security;
        sk->sk_security = NULL;

        selinux_netlbl_sk_security_free(blob);
        kfree(blob);
}

static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct sk_security_struct *newsksec = newsk->sk_security;

        newsksec->sid = sksec->sid;
        newsksec->peer_sid = sksec->peer_sid;
        newsksec->sclass = sksec->sclass;

        selinux_netlbl_sk_security_reset(newsksec);
}

/*
 * Report the SID of @sk through @secid.  A NULL @sk yields
 * SECINITSID_ANY_SOCKET.
 */
static void selinux_sk_getsecid(struct sock *sk, u32 *secid)
{
        struct sk_security_struct *sksec;

        if (!sk) {
                *secid = SECINITSID_ANY_SOCKET;
                return;
        }

        sksec = sk->sk_security;
        *secid = sksec->sid;
}

/*
 * Synchronise labels between @sk and the inode of its @parent socket.
 * For the INET, INET6 and UNIX families the inode takes the sock's SID;
 * for every family the sock takes the inode's security class.
 */
static void selinux_sock_graft(struct sock *sk, struct socket *parent)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct inode_security_struct *isec;

        isec = inode_security_novalidate(SOCK_INODE(parent));

        switch (sk->sk_family) {
        case PF_INET:
        case PF_INET6:
        case PF_UNIX:
                isec->sid = sksec->sid;
                break;
        default:
                break;
        }

        sksec->sclass = isec->sclass;
}

/* Called whenever SCTP receives an INIT chunk. This happens when an incoming
 * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association
 * already present).
 */
static int selinux_sctp_assoc_request(struct sctp_endpoint *ep,
                                      struct sk_buff *skb)
{
        struct sk_security_struct *sksec = ep->base.sk->sk_security;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        u8 peerlbl_active;
        u32 peer_sid = SECINITSID_UNLABELED;
        u32 conn_sid;
        int err = 0;

        if (!selinux_policycap_extsockclass())
                return 0;

        peerlbl_active = selinux_peerlbl_enabled();

        if (peerlbl_active) {
                /* This will return peer_sid = SECSID_NULL if there are
                 * no peer labels, see security_net_peersid_resolve().
                 */
                err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family,
                                              &peer_sid);
                if (err)
                        return err;

                if (peer_sid == SECSID_NULL)
                        peer_sid = SECINITSID_UNLABELED;
        }

        if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) {
                sksec->sctp_assoc_state = SCTP_ASSOC_SET;

                /* Here as first association on socket. As the peer SID
                 * was allowed by peer recv (and the netif/node checks),
                 * then it is approved by policy and used as the primary
                 * peer SID for getpeercon(3).
                 */
                sksec->peer_sid = peer_sid;
        } else if  (sksec->peer_sid != peer_sid) {
                /* Other association peer SIDs are checked to enforce
                 * consistency among the peer SIDs.
                 */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sk = ep->base.sk;
                err = avc_has_perm(&selinux_state,
                                   sksec->peer_sid, peer_sid, sksec->sclass,
                                   SCTP_SOCKET__ASSOCIATION, &ad);
                if (err)
                        return err;
        }

        /* Compute the MLS component for the connection and store
         * the information in ep. This will be used by SCTP TCP type
         * sockets and peeled off connections as they cause a new
         * socket to be generated. selinux_sctp_sk_clone() will then
         * plug this into the new socket.
         */
        err = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid);
        if (err)
                return err;

        ep->secid = conn_sid;
        ep->peer_secid = peer_sid;

        /* Set any NetLabel labels including CIPSO/CALIPSO options. */
        return selinux_netlbl_sctp_assoc_request(ep, skb);
}

/* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting
 * based on their @optname.
 *
 * @address is walked as a packed sequence of sockaddr_in / sockaddr_in6
 * entries covering @addrlen bytes, and each entry is checked on its own.
 *
 * Returns 0 when every entry passes, or immediately when the extended
 * socket class policy capability is off.  Returns -EINVAL for an
 * unsupported address family, an entry that does not fit in @addrlen, or
 * an @optname that matches none of the handled cases.  Otherwise returns
 * the first error from the bind/connect checks.
 */
static int selinux_sctp_bind_connect(struct sock *sk, int optname,
                                     struct sockaddr *address,
                                     int addrlen)
{
        int len, err = 0, walk_size = 0;
        void *addr_buf;
        struct sockaddr *addr;
        struct socket *sock;

        if (!selinux_policycap_extsockclass())
                return 0;

        /* Process one or more addresses that may be IPv4 or IPv6 */
        sock = sk->sk_socket;
        addr_buf = address;

        while (walk_size < addrlen) {
                /* sa_family must be readable before it is inspected */
                if (walk_size + sizeof(sa_family_t) > addrlen)
                        return -EINVAL;

                addr = addr_buf;
                switch (addr->sa_family) {
                case AF_UNSPEC:
                case AF_INET:
                        /* AF_UNSPEC entries are sized like IPv4 ones */
                        len = sizeof(struct sockaddr_in);
                        break;
                case AF_INET6:
                        len = sizeof(struct sockaddr_in6);
                        break;
                default:
                        return -EINVAL;
                }

                /* the whole entry must lie inside the caller's buffer */
                if (walk_size + len > addrlen)
                        return -EINVAL;

                /* stays -EINVAL if optname matches no case below */
                err = -EINVAL;
                switch (optname) {
                /* Bind checks */
                case SCTP_PRIMARY_ADDR:
                case SCTP_SET_PEER_PRIMARY_ADDR:
                case SCTP_SOCKOPT_BINDX_ADD:
                        err = selinux_socket_bind(sock, addr, len);
                        break;
                /* Connect checks */
                case SCTP_SOCKOPT_CONNECTX:
                case SCTP_PARAM_SET_PRIMARY:
                case SCTP_PARAM_ADD_IP:
                case SCTP_SENDMSG_CONNECT:
                        err = selinux_socket_connect_helper(sock, addr, len);
                        if (err)
                                return err;

                        /* As selinux_sctp_bind_connect() is called by the
                         * SCTP protocol layer, the socket is already locked,
                         * therefore selinux_netlbl_socket_connect_locked()
                         * is called here. The situations handled are:
                         * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2),
                         * whenever a new IP address is added or when a new
                         * primary address is selected.
                         * Note that an SCTP connect(2) call happens before
                         * the SCTP protocol layer and is handled via
                         * selinux_socket_connect().
                         */
                        err = selinux_netlbl_socket_connect_locked(sk, addr);
                        break;
                }

                if (err)
                        return err;

                /* advance to the next packed address */
                addr_buf += len;
                walk_size += len;
        }

        return 0;
}

/* Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
 *
 * With the extended socket class policy capability enabled, @newsk takes
 * the SID and peer SID stored in @ep and the class of @sk.  Without it,
 * the generic selinux_sk_clone_security() is used instead.
 */
static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
                                  struct sock *newsk)
{
        struct sk_security_struct *parent_sec = sk->sk_security;
        struct sk_security_struct *child_sec = newsk->sk_security;

        if (!selinux_policycap_extsockclass()) {
                /* policy has no SECCLASS_SCTP_SOCKET support */
                selinux_sk_clone_security(sk, newsk);
                return;
        }

        child_sec->sclass = parent_sec->sclass;
        child_sec->peer_sid = ep->peer_secid;
        child_sec->sid = ep->secid;
        selinux_netlbl_sctp_sk_clone(sk, newsk);
}

/*
 * Label an incoming connection request.  The peer SID is resolved from
 * @skb, combined with the listening socket's SID into a connection SID,
 * and both are stored in @req.
 *
 * Returns the first error from the SID lookups, otherwise the result of
 * the NetLabel connection request handling.
 */
static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
                                     struct request_sock *req)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family = req->rsk_ops->family;
        u32 peer_sid;
        u32 conn_sid;
        int rc;

        rc = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
        if (rc)
                return rc;

        rc = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid);
        if (rc)
                return rc;

        req->peer_secid = peer_sid;
        req->secid = conn_sid;

        return selinux_netlbl_inet_conn_request(req, family);
}

/*
 * Transfer the SID and peer SID recorded on @req to the child socket
 * @newsk, then let NetLabel set up its state for the request's family.
 */
static void selinux_inet_csk_clone(struct sock *newsk,
                                   const struct request_sock *req)
{
        struct sk_security_struct *child_sec = newsk->sk_security;

        child_sec->peer_sid = req->peer_secid;
        child_sec->sid = req->secid;
        /* NOTE: ideally isec->sid of the new socket would be brought in
         * sync here too, but the isec is not available yet.  That is
         * deferred to sock_graft, by which time it has been created.
         */

        /* No locking is needed: we are the only thread with access to
         * the new socket's security blob.
         */
        selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family);
}

/*
 * Record the peer SID of @skb on @sk.  The result of the peer label
 * lookup is not checked.
 */
static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family;

        /* mapped IPv4 packets arriving via IPv6 sockets count as IPv4 */
        if (sk->sk_family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;
        else
                family = sk->sk_family;

        selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
}

/*
 * Check whether the current task may relabel a packet to @sid
 * (PACKET__RELABELTO).  Returns the avc_has_perm() result.
 */
static int selinux_secmark_relabel_packet(u32 sid)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());

        return avc_has_perm(&selinux_state,
                            tsec->sid, sid, SECCLASS_PACKET, PACKET__RELABELTO,
                            NULL);
}

/* Atomically increment selinux_secmark_refcount. */
static void selinux_secmark_refcount_inc(void)
{
        atomic_inc(&selinux_secmark_refcount);
}

/* Atomically decrement selinux_secmark_refcount. */
static void selinux_secmark_refcount_dec(void)
{
        atomic_dec(&selinux_secmark_refcount);
}

/* Copy the SID stored on @req into the flow's flowic_secid field. */
static void selinux_req_classify_flow(const struct request_sock *req,
                                      struct flowi_common *flic)
{
        flic->flowic_secid = req->secid;
}

static int selinux_tun_dev_alloc_security(void **security)
{
        struct tun_security_struct *tunsec;

        tunsec = kzalloc(sizeof(*tunsec), GFP_KERNEL);
        if (!tunsec)
                return -ENOMEM;
        tunsec->sid = current_sid();

        *security = tunsec;
        return 0;
}

/* Free the TUN security blob passed in @security. */
static void selinux_tun_dev_free_security(void *security)
{
        kfree(security);
}

/*
 * Check whether the current task may create a TUN socket
 * (TUN_SOCKET__CREATE against its own SID).
 */
static int selinux_tun_dev_create(void)
{
        u32 cur_sid = current_sid();

        /* The "sockcreate" SID is deliberately not taken into account: the
         * socket created here is not a socket in the traditional sense but
         * a private sock, accessible only to the kernel, that represents a
         * wide range of network traffic spanning multiple connections -
         * check the TUN driver to understand why this socket is special.
         */
        return avc_has_perm(&selinux_state,
                            cur_sid, cur_sid, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__CREATE, NULL);
}

static int selinux_tun_dev_attach_queue(void *security)
{
        struct tun_security_struct *tunsec = security;

        return avc_has_perm(&selinux_state,
                            current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__ATTACH_QUEUE, NULL);
}

static int selinux_tun_dev_attach(struct sock *sk, void *security)
{
        struct tun_security_struct *tunsec = security;
        struct sk_security_struct *sksec = sk->sk_security;

        /* we don't currently perform any NetLabel based labeling here and it
         * isn't clear that we would want to do so anyway; while we could apply
         * labeling without the support of the TUN user the resulting labeled
         * traffic from the other end of the connection would almost certainly
         * cause confusion to the TUN user that had no idea network labeling
         * protocols were being used */

        sksec->sid = tunsec->sid;
        sksec->sclass = SECCLASS_TUN_SOCKET;

        return 0;
}

/*
 * Relabel the TUN blob in @security to the current task's SID.  This
 * requires TUN_SOCKET__RELABELFROM on the blob's present SID and
 * TUN_SOCKET__RELABELTO on the task's own SID.
 *
 * Returns 0 on success, otherwise the first permission check error; the
 * blob is left unchanged on error.
 */
static int selinux_tun_dev_open(void *security)
{
        struct tun_security_struct *tun_blob = security;
        u32 cur_sid = current_sid();
        int rc;

        rc = avc_has_perm(&selinux_state,
                          cur_sid, tun_blob->sid, SECCLASS_TUN_SOCKET,
                          TUN_SOCKET__RELABELFROM, NULL);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  cur_sid, cur_sid, SECCLASS_TUN_SOCKET,
                                  TUN_SOCKET__RELABELTO, NULL);
        if (!rc)
                tun_blob->sid = cur_sid;

        return rc;
}

#ifdef CONFIG_NETFILTER

/*
 * Access control for a forwarded packet arriving on @indev.
 *
 * Returns NF_ACCEPT when the netpeer policy capability is off, when
 * neither secmark nor peer labeling is active, or when every enabled
 * check passes.  Returns NF_DROP when a lookup, a permission check or the
 * NetLabel labeling fails.
 */
static unsigned int selinux_ip_forward(struct sk_buff *skb,
                                       const struct net_device *indev,
                                       u16 family)
{
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        char *addrp;
        u32 peer_sid;
        u8 secmark_on;
        u8 netlbl_on;
        u8 peerlbl_on;
        int rc;

        if (!selinux_policycap_netpeer())
                return NF_ACCEPT;

        secmark_on = selinux_secmark_enabled();
        netlbl_on = netlbl_enabled();
        peerlbl_on = selinux_peerlbl_enabled();
        if (!(secmark_on || peerlbl_on))
                return NF_ACCEPT;

        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                return NF_DROP;

        net.netif = indev->ifindex;
        net.family = family;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL))
                return NF_DROP;

        if (peerlbl_on) {
                rc = selinux_inet_sys_rcv_skb(dev_net(indev), indev->ifindex,
                                              addrp, family, peer_sid, &ad);
                if (rc) {
                        selinux_netlbl_err(skb, family, rc, 1);
                        return NF_DROP;
                }
        }

        if (secmark_on &&
            avc_has_perm(&selinux_state,
                         peer_sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
                return NF_DROP;

        /* Labeling is done in the FORWARD path rather than POST_ROUTING so
         * that it is applied before IPsec and can leverage AH protection.
         */
        if (netlbl_on &&
            selinux_netlbl_skbuff_setsid(skb, family, peer_sid))
                return NF_DROP;

        return NF_ACCEPT;
}

/* IPv4 entry point: runs selinux_ip_forward() on @skb with the input
 * device from @state and PF_INET.  @priv is unused.
 */
static unsigned int selinux_ipv4_forward(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        return selinux_ip_forward(skb, state->in, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 entry point: runs selinux_ip_forward() on @skb with the input
 * device from @state and PF_INET6.  @priv is unused.
 */
static unsigned int selinux_ipv6_forward(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        return selinux_ip_forward(skb, state->in, PF_INET6);
}
#endif        /* IPV6 */

/*
 * Apply NetLabel labeling to a locally generated packet.
 *
 * Returns NF_ACCEPT when NetLabel is disabled, when the owning socket is
 * a listener, or when labeling succeeds; returns NF_DROP when labeling
 * fails.  Packets without a socket are labeled with SECINITSID_KERNEL.
 */
static unsigned int selinux_ip_output(struct sk_buff *skb,
                                      u16 family)
{
        struct sk_security_struct *sksec;
        struct sock *sk;
        u32 sid = SECINITSID_KERNEL;

        if (!netlbl_enabled())
                return NF_ACCEPT;

        /* This is done in the LOCAL_OUT path rather than POST_ROUTING so
         * that labeling is applied before IPsec and can leverage AH
         * protection.
         */
        sk = skb->sk;
        if (sk) {
                /* A socket in the listening state means this is a SYN-ACK
                 * packet, which needs to be labeled based on the
                 * connection/request_sock and not the parent socket.
                 * Unfortunately the request_sock cannot be looked up yet
                 * as it is not queued on the parent socket until after the
                 * SYN-ACK is sent.  The "solution" is to pass the packet
                 * as-is, since any IP option based labeling should be
                 * copied from the initial connection request (in the IP
                 * layer).  Far from ideal, but until there is a security
                 * label in the packet itself this is the best we can do.
                 */
                if (sk_listener(sk))
                        return NF_ACCEPT;

                /* standard practice, label using the parent socket */
                sksec = sk->sk_security;
                sid = sksec->sid;
        }

        if (selinux_netlbl_skbuff_setsid(skb, family, sid))
                return NF_DROP;

        return NF_ACCEPT;
}

/* IPv4 entry point: runs selinux_ip_output() on @skb with PF_INET.
 * @priv and @state are unused.
 */
static unsigned int selinux_ipv4_output(void *priv,
                                        struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        return selinux_ip_output(skb, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 entry point: runs selinux_ip_output() on @skb with PF_INET6.
 * @priv and @state are unused.
 */
static unsigned int selinux_ipv6_output(void *priv,
                                        struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        return selinux_ip_output(skb, PF_INET6);
}
#endif        /* IPV6 */

/*
 * Post-routing checks used by selinux_ip_postroute() when the netpeer
 * policy capability is off.
 *
 * Packets without an associated full socket are accepted.  Otherwise the
 * socket's SID must pass the secmark PACKET__SEND check (when secmark is
 * enabled) and the XFRM post-route check.  Returns NF_DROP if the packet
 * cannot be parsed, NF_DROP_ERR(-ECONNREFUSED) if a check fails, and
 * NF_ACCEPT otherwise.
 */
static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
                                                int ifindex,
                                                u16 family)
{
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        struct sk_security_struct *sksec;
        struct sock *sk = skb_to_full_sk(skb);
        char *addrp;
        u8 proto = 0;

        if (!sk)
                return NF_ACCEPT;
        sksec = sk->sk_security;

        net.netif = ifindex;
        net.family = family;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
                return NF_DROP;

        if (selinux_secmark_enabled() &&
            avc_has_perm(&selinux_state,
                         sksec->sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__SEND, &ad))
                return NF_DROP_ERR(-ECONNREFUSED);

        if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto))
                return NF_DROP_ERR(-ECONNREFUSED);

        return NF_ACCEPT;
}

/*
 * Post-routing access control for a packet leaving via @outdev.
 *
 * When the netpeer policy capability is off the work is handed to
 * selinux_ip_postroute_compat().  Otherwise the function picks the SID to
 * check with (from the packet for forwarded traffic, SECINITSID_KERNEL for
 * socket-less local traffic, a regenerated connection SID for listening
 * sockets, or the socket's own SID) and runs the secmark and, when peer
 * labeling is active, the netif egress and node sendto checks.
 *
 * Returns NF_ACCEPT when the packet may pass, NF_DROP when a lookup or
 * parse step fails, and NF_DROP_ERR(-ECONNREFUSED) when a permission check
 * denies the packet.
 */
static unsigned int selinux_ip_postroute(struct sk_buff *skb,
                                         const struct net_device *outdev,
                                         u16 family)
{
        u32 secmark_perm;
        u32 peer_sid;
        int ifindex = outdev->ifindex;
        struct sock *sk;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        char *addrp;
        u8 secmark_active;
        u8 peerlbl_active;

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_ip_postroute_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_ip_postroute_compat(skb, ifindex, family);

        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;

        sk = skb_to_full_sk(skb);

#ifdef CONFIG_XFRM
        /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
         * packet transformation so allow the packet to pass without any checks
         * since we'll have another chance to perform access control checks
         * when the packet is on its final way out.
         * NOTE: there appear to be some IPv6 multicast cases where skb->dst
         *       is NULL, in this case go ahead and apply access control.
         * NOTE: if this is a local socket (skb->sk != NULL) that is in the
         *       TCP listening state we cannot wait until the XFRM processing
         *       is done as we will miss out on the SA label if we do;
         *       unfortunately, this means more work, but it is only once per
         *       connection. */
        if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL &&
            !(sk && sk_listener(sk)))
                return NF_ACCEPT;
#endif

        if (sk == NULL) {
                /* Without an associated socket the packet is either coming
                 * from the kernel or it is being forwarded; check the packet
                 * to determine which and if the packet is being forwarded
                 * query the packet directly to determine the security label. */
                if (skb->skb_iif) {
                        secmark_perm = PACKET__FORWARD_OUT;
                        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                                return NF_DROP;
                } else {
                        secmark_perm = PACKET__SEND;
                        peer_sid = SECINITSID_KERNEL;
                }
        } else if (sk_listener(sk)) {
                /* Locally generated packet but the associated socket is in the
                 * listening state which means this is a SYN-ACK packet.  In
                 * this particular case the correct security label is assigned
                 * to the connection/request_sock but unfortunately we can't
                 * query the request_sock as it isn't queued on the parent
                 * socket until after the SYN-ACK packet is sent; the only
                 * viable choice is to regenerate the label like we do in
                 * selinux_inet_conn_request().  See also selinux_ip_output()
                 * for similar problems. */
                u32 skb_sid;
                struct sk_security_struct *sksec;

                sksec = sk->sk_security;
                if (selinux_skb_peerlbl_sid(skb, family, &skb_sid))
                        return NF_DROP;
                /* At this point, if the returned skb peerlbl is SECSID_NULL
                 * and the packet has been through at least one XFRM
                 * transformation then we must be dealing with the "final"
                 * form of labeled IPsec packet; since we've already applied
                 * all of our access controls on this packet we can safely
                 * pass the packet. */
                if (skb_sid == SECSID_NULL) {
                        switch (family) {
                        case PF_INET:
                                if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        case PF_INET6:
                                if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        default:
                                return NF_DROP_ERR(-ECONNREFUSED);
                        }
                }
                if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid))
                        return NF_DROP;
                secmark_perm = PACKET__SEND;
        } else {
                /* Locally generated packet, fetch the security label from the
                 * associated socket. */
                struct sk_security_struct *sksec = sk->sk_security;
                peer_sid = sksec->sid;
                secmark_perm = PACKET__SEND;
        }

        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->netif = ifindex;
        ad.u.net->family = family;
        if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
                return NF_DROP;

        /* secmark check, using the permission selected above */
        if (secmark_active)
                if (avc_has_perm(&selinux_state,
                                 peer_sid, skb->secmark,
                                 SECCLASS_PACKET, secmark_perm, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

        /* peer label checks against the outgoing interface and the node
         * address extracted by selinux_parse_skb() */
        if (peerlbl_active) {
                u32 if_sid;
                u32 node_sid;

                if (sel_netif_sid(dev_net(outdev), ifindex, &if_sid))
                        return NF_DROP;
                if (avc_has_perm(&selinux_state,
                                 peer_sid, if_sid,
                                 SECCLASS_NETIF, NETIF__EGRESS, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

                if (sel_netnode_sid(addrp, family, &node_sid))
                        return NF_DROP;
                if (avc_has_perm(&selinux_state,
                                 peer_sid, node_sid,
                                 SECCLASS_NODE, NODE__SENDTO, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);
        }

        return NF_ACCEPT;
}

/* IPv4 entry point: runs selinux_ip_postroute() on @skb with the output
 * device from @state and PF_INET.  @priv is unused.
 */
static unsigned int selinux_ipv4_postroute(void *priv,
                                           struct sk_buff *skb,
                                           const struct nf_hook_state *state)
{
        return selinux_ip_postroute(skb, state->out, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 entry point: runs selinux_ip_postroute() on @skb with the output
 * device from @state and PF_INET6.  @priv is unused.
 */
static unsigned int selinux_ipv6_postroute(void *priv,
                                           struct sk_buff *skb,
                                           const struct nf_hook_state *state)
{
        return selinux_ip_postroute(skb, state->out, PF_INET6);
}
#endif        /* IPV6 */

#endif        /* CONFIG_NETFILTER */

/*
 * Check every netlink message carried in @skb against the permissions of
 * the sending socket @sk.
 *
 * Each message type is mapped to a permission for the socket's class and
 * checked with sock_has_perm().  Processing stops, returning 0, at the
 * first message whose length field is clearly invalid or at the last
 * message in the buffer.
 *
 * Returns 0 when nothing was denied, the sock_has_perm() error for a
 * denied message, -EINVAL for an unrecognised message type when enforcing
 * and unknown permissions are not allowed, or any other lookup error
 * except -ENOENT, which is ignored.
 */
static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        int rc = 0;
        unsigned int msg_len;
        unsigned int data_len = skb->len;
        unsigned char *data = skb->data;
        struct nlmsghdr *nlh;
        struct sk_security_struct *sksec = sk->sk_security;
        u16 sclass = sksec->sclass;
        u32 perm;

        /* keep going while at least one full message header remains */
        while (data_len >= nlmsg_total_size(0)) {
                nlh = (struct nlmsghdr *)data;

                /* NOTE: the nlmsg_len field isn't reliably set by some netlink
                 *       users which means we can't reject skb's with bogus
                 *       length fields; our solution is to follow what
                 *       netlink_rcv_skb() does and simply stop processing at
                 *       the first message whose length field is clearly junk
                 */
                if (nlh->nlmsg_len < NLMSG_HDRLEN || nlh->nlmsg_len > data_len)
                        return 0;

                rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm);
                if (rc == 0) {
                        rc = sock_has_perm(sk, perm);
                        if (rc)
                                return rc;
                } else if (rc == -EINVAL) {
                        /* -EINVAL is a missing msg/perm mapping */
                        pr_warn_ratelimited("SELinux: unrecognized netlink"
                                " message: protocol=%hu nlmsg_type=%hu sclass=%s"
                                " pid=%d comm=%s\n",
                                sk->sk_protocol, nlh->nlmsg_type,
                                secclass_map[sclass - 1].name,
                                task_pid_nr(current), current->comm);
                        /* only fatal when enforcing and the policy does not
                         * allow unknown permissions */
                        if (enforcing_enabled(&selinux_state) &&
                            !security_get_allow_unknown(&selinux_state))
                                return rc;
                        rc = 0;
                } else if (rc == -ENOENT) {
                        /* -ENOENT is a missing socket/class mapping, ignore */
                        rc = 0;
                } else {
                        return rc;
                }

                /* move to the next message after applying netlink padding */
                msg_len = NLMSG_ALIGN(nlh->nlmsg_len);
                /* the padded length reaches the end of the data: done */
                if (msg_len >= data_len)
                        return 0;
                data_len -= msg_len;
                data += msg_len;
        }

        return rc;
}

/* Initialise @isec with the current task's SID and the class @sclass. */
static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
{
        isec->sid = current_sid();
        isec->sclass = sclass;
}

/*
 * Check whether the current task has @perms on the IPC object
 * @ipc_perms, using the SID and class stored in the object's security
 * blob.  Returns the avc_has_perm() result.
 */
static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
                        u32 perms)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 cur_sid = current_sid();

        isec = selinux_ipc(ipc_perms);

        ad.u.ipc_id = ipc_perms->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(&selinux_state,
                            cur_sid, isec->sid, isec->sclass, perms, &ad);
}

/*
 * Initialise the security blob of @msg with SECINITSID_UNLABELED.
 * Always returns 0.
 */
static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
{
        struct msg_security_struct *blob = selinux_msg_msg(msg);

        blob->sid = SECINITSID_UNLABELED;
        return 0;
}

/* message queue security operations */
/*
 * Label a new message queue with the current task's SID and check that
 * the task may create it (MSGQ__CREATE).  Returns the avc_has_perm()
 * result.
 */
static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
{
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct common_audit_data ad;
        u32 cur_sid = current_sid();

        ipc_init_security(isec, SECCLASS_MSGQ);

        ad.u.ipc_id = msq->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(&selinux_state,
                            cur_sid, isec->sid, SECCLASS_MSGQ,
                            MSGQ__CREATE, &ad);
}

/*
 * Check whether the current task may associate with the message queue
 * @msq (MSGQ__ASSOCIATE).  @msqflg is not used.
 */
static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct common_audit_data ad;
        u32 cur_sid = current_sid();

        ad.u.ipc_id = msq->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(&selinux_state,
                            cur_sid, isec->sid, SECCLASS_MSGQ,
                            MSGQ__ASSOCIATE, &ad);
}

/*
 * Permission check for a msgctl() command on @msq.
 *
 * The *_INFO commands are checked as SYSTEM__IPC_INFO against the kernel
 * SID.  The stat, set and remove commands map to queue permissions checked
 * through ipc_has_perm().  Any other command returns 0 without a check.
 */
static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case MSG_INFO:
                /* No specific object, just general system-wide information. */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case MSG_STAT:
        case MSG_STAT_ANY:
                return ipc_has_perm(msq, MSGQ__GETATTR | MSGQ__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(msq, MSGQ__SETATTR);
        case IPC_RMID:
                return ipc_has_perm(msq, MSGQ__DESTROY);
        default:
                return 0;
        }
}

static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg)
{
        struct ipc_security_struct *isec;
        struct msg_security_struct *msec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(msq);
        msec = selinux_msg_msg(msg);

        /*
         * First time through, need to assign label to the message
         */
        if (msec->sid == SECINITSID_UNLABELED) {
                /*
                 * Compute new sid based on current process and
                 * message queue this message will be stored in
                 */
                rc = security_transition_sid(&selinux_state, sid, isec->sid,
                                             SECCLASS_MSG, NULL, &msec->sid);
                if (rc)
                        return rc;
        }

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        /* Can this process write to the queue? */
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_MSGQ,
                          MSGQ__WRITE, &ad);
        if (!rc)
                /* Can this process send the message */
                rc = avc_has_perm(&selinux_state,
                                  sid, msec->sid, SECCLASS_MSG,
                                  MSG__SEND, &ad);
        if (!rc)
                /* Can the message be put in the queue? */
                rc = avc_has_perm(&selinux_state,
                                  msec->sid, isec->sid, SECCLASS_MSGQ,
                                  MSGQ__ENQUEUE, &ad);

        return rc;
}

static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                                    struct task_struct *target,
                                    long type, int mode)
{
        struct ipc_security_struct *isec;
        struct msg_security_struct *msec;
        struct common_audit_data ad;
        u32 sid = task_sid(target);
        int rc;

        isec = selinux_ipc(msq);
        msec = selinux_msg_msg(msg);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid,
                          SECCLASS_MSGQ, MSGQ__READ, &ad);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  sid, msec->sid,
                                  SECCLASS_MSG, MSG__RECEIVE, &ad);
        return rc;
}

/* Shared Memory security operations */
static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(shp);
        ipc_init_security(isec, SECCLASS_SHM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SHM,
                          SHM__CREATE, &ad);
        return rc;
}

static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();

        isec = selinux_ipc(shp);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SHM,
                            SHM__ASSOCIATE, &ad);
}

/*
 * Map a shmctl() command onto the permission(s) to check on @shp.
 *
 * IPC_INFO/SHM_INFO are checked against the system class instead of the
 * segment; commands not listed here are allowed (return 0).
 *
 * Note, at this point, shp is locked down.
 */
static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SHM_INFO:
                /* No specific object, just general system-wide information. */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case SHM_STAT:
        case SHM_STAT_ANY:
                return ipc_has_perm(shp, SHM__GETATTR | SHM__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(shp, SHM__SETATTR);
        case SHM_LOCK:
        case SHM_UNLOCK:
                return ipc_has_perm(shp, SHM__LOCK);
        case IPC_RMID:
                return ipc_has_perm(shp, SHM__DESTROY);
        default:
                return 0;
        }
}

/*
 * Check the permissions needed to attach segment @shp: SHM__READ only when
 * SHM_RDONLY is set in @shmflg, otherwise SHM__READ | SHM__WRITE.
 * @shmaddr is not consulted.
 */
static int selinux_shm_shmat(struct kern_ipc_perm *shp,
                             char __user *shmaddr, int shmflg)
{
        u32 av = SHM__READ;

        if (!(shmflg & SHM_RDONLY))
                av |= SHM__WRITE;

        return ipc_has_perm(shp, av);
}

/* Semaphore security operations */
static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(sma);
        ipc_init_security(isec, SECCLASS_SEM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SEM,
                          SEM__CREATE, &ad);
        return rc;
}

static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();

        isec = selinux_ipc(sma);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SEM,
                            SEM__ASSOCIATE, &ad);
}

/*
 * Map a semctl() command onto the permission(s) to check on @sma.
 *
 * IPC_INFO/SEM_INFO are checked against the system class instead of the
 * semaphore set; commands not listed here are allowed (return 0).
 *
 * Note, at this point, sma is locked down.
 */
static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SEM_INFO:
                /* No specific object, just general system-wide information. */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case GETPID:
        case GETNCNT:
        case GETZCNT:
                return ipc_has_perm(sma, SEM__GETATTR);
        case GETVAL:
        case GETALL:
                return ipc_has_perm(sma, SEM__READ);
        case SETVAL:
        case SETALL:
                return ipc_has_perm(sma, SEM__WRITE);
        case IPC_RMID:
                return ipc_has_perm(sma, SEM__DESTROY);
        case IPC_SET:
                return ipc_has_perm(sma, SEM__SETATTR);
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                return ipc_has_perm(sma, SEM__GETATTR | SEM__ASSOCIATE);
        default:
                return 0;
        }
}

/*
 * Check the permissions needed for a semop() on @sma: SEM__READ always,
 * plus SEM__WRITE when @alter is non-zero.  @sops and @nsops are not
 * consulted.
 */
static int selinux_sem_semop(struct kern_ipc_perm *sma,
                             struct sembuf *sops, unsigned nsops, int alter)
{
        u32 av = SEM__READ;

        if (alter)
                av |= SEM__WRITE;

        return ipc_has_perm(sma, av);
}

/*
 * Translate UNIX-style mode bits in @flag into IPC__UNIX_READ /
 * IPC__UNIX_WRITE and check them on @ipcp.
 *
 * Returns 0 without any check when neither a read nor a write bit is set.
 */
static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        u32 requested = 0;

        if (flag & S_IRUGO)
                requested |= IPC__UNIX_READ;
        if (flag & S_IWUGO)
                requested |= IPC__UNIX_WRITE;

        if (!requested)
                return 0;

        return ipc_has_perm(ipcp, requested);
}

/* Store the SID held in @ipcp's security blob into *@secid. */
static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        struct ipc_security_struct *isec;

        isec = selinux_ipc(ipcp);
        *secid = isec->sid;
}

/*
 * Run inode_doinit_with_dentry() for @inode using @dentry.
 * Does nothing when @inode is NULL.
 */
static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (!inode)
                return;

        inode_doinit_with_dentry(inode, dentry);
}

/*
 * Fetch one of the SELinux attributes of task @p as a context string.
 *
 * @name selects the SID to report: "current", "prev", "exec", "fscreate",
 * "keycreate" or "sockcreate"; any other name yields -EINVAL.  When @p is
 * not the current task, PROCESS__GETATTR is checked first.
 *
 * Returns the length reported by security_sid_to_context() on success,
 * 0 when the selected SID is 0 (*@value is left untouched), or a non-zero
 * error from the permission check or the conversion.
 */
static int selinux_getprocattr(struct task_struct *p,
                               char *name, char **value)
{
        const struct task_security_struct *tsec;
        unsigned ctx_len;
        u32 sid;
        int rc;

        /* The credentials of @p are only dereferenced under RCU. */
        rcu_read_lock();
        tsec = selinux_cred(__task_cred(p));

        if (current != p) {
                rc = avc_has_perm(&selinux_state,
                                  current_sid(), tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__GETATTR, NULL);
                if (rc)
                        goto out_unlock;
        }

        if (strcmp(name, "current") == 0) {
                sid = tsec->sid;
        } else if (strcmp(name, "prev") == 0) {
                sid = tsec->osid;
        } else if (strcmp(name, "exec") == 0) {
                sid = tsec->exec_sid;
        } else if (strcmp(name, "fscreate") == 0) {
                sid = tsec->create_sid;
        } else if (strcmp(name, "keycreate") == 0) {
                sid = tsec->keycreate_sid;
        } else if (strcmp(name, "sockcreate") == 0) {
                sid = tsec->sockcreate_sid;
        } else {
                rc = -EINVAL;
                goto out_unlock;
        }
        rcu_read_unlock();

        /* SID 0 means the attribute is unset: report an empty value. */
        if (sid == 0)
                return 0;

        rc = security_sid_to_context(&selinux_state, sid, value, &ctx_len);
        if (rc)
                return rc;
        return ctx_len;

out_unlock:
        rcu_read_unlock();
        return rc;
}

/*
 * Set one of the SELinux attributes ("exec", "fscreate", "keycreate",
 * "sockcreate" or "current") on a fresh set of credentials obtained from
 * prepare_creds() and commit them.
 *
 * @name:  attribute to set; any other name yields -EINVAL.
 * @value: context string; a single trailing '\n' is replaced by NUL in
 *         place.  An empty value (size 0, or starting with NUL or '\n')
 *         leaves the SID at 0.
 * @size:  number of bytes in @value.
 *
 * Returns @size on success.  On failure returns -EINVAL, -ENOMEM, or the
 * non-zero result of one of the permission checks / context conversions;
 * in that case the prepared credentials (if any) are aborted.
 */
static int selinux_setprocattr(const char *name, void *value, size_t size)
{
        struct task_security_struct *tsec;
        struct cred *new;
        u32 mysid = current_sid(), sid = 0, ptsid;
        int error;
        char *str = value;

        /*
         * Basic control over ability to set these attributes at all.
         */
        if (!strcmp(name, "exec"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETEXEC, NULL);
        else if (!strcmp(name, "fscreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETFSCREATE, NULL);
        else if (!strcmp(name, "keycreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETKEYCREATE, NULL);
        else if (!strcmp(name, "sockcreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETSOCKCREATE, NULL);
        else if (!strcmp(name, "current"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETCURRENT, NULL);
        else
                error = -EINVAL;
        if (error)
                return error;

        /* Obtain a SID for the context, if one was specified. */
        if (size && str[0] && str[0] != '\n') {
                /* Strip a single trailing newline, modifying the caller's buffer. */
                if (str[size-1] == '\n') {
                        str[size-1] = 0;
                        size--;
                }
                error = security_context_to_sid(&selinux_state, value, size,
                                                &sid, GFP_KERNEL);
                /*
                 * Only for "fscreate": a context rejected with -EINVAL is
                 * retried through security_context_to_sid_force() when
                 * has_cap_mac_admin() allows it; otherwise the rejected
                 * string is audited and the error returned.
                 */
                if (error == -EINVAL && !strcmp(name, "fscreate")) {
                        if (!has_cap_mac_admin(true)) {
                                struct audit_buffer *ab;
                                size_t audit_size;

                                /* We strip a nul only if it is at the end, otherwise the
                                 * context contains a nul and we should audit that */
                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                audit_log_format(ab, "op=fscreate invalid_context=");
                                audit_log_n_untrustedstring(ab, value, audit_size);
                                audit_log_end(ab);

                                return error;
                        }
                        error = security_context_to_sid_force(
                                                      &selinux_state,
                                                      value, size, &sid);
                }
                if (error)
                        return error;
        }

        /* From here on every failure must go through abort_change. */
        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        /* Permission checking based on the specified context is
           performed during the actual operation (execve,
           open/mkdir/...), when we know the full context of the
           operation.  See selinux_bprm_creds_for_exec for the execve
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
        tsec = selinux_cred(new);
        if (!strcmp(name, "exec")) {
                tsec->exec_sid = sid;
        } else if (!strcmp(name, "fscreate")) {
                tsec->create_sid = sid;
        } else if (!strcmp(name, "keycreate")) {
                /* A non-zero key creation SID additionally needs KEY__CREATE. */
                if (sid) {
                        error = avc_has_perm(&selinux_state, mysid, sid,
                                             SECCLASS_KEY, KEY__CREATE, NULL);
                        if (error)
                                goto abort_change;
                }
                tsec->keycreate_sid = sid;
        } else if (!strcmp(name, "sockcreate")) {
                tsec->sockcreate_sid = sid;
        } else if (!strcmp(name, "current")) {
                /* "current" cannot be reset to the empty/zero SID. */
                error = -EINVAL;
                if (sid == 0)
                        goto abort_change;

                /* Only allow single threaded processes to change context */
                /*
                 * NOTE(review): the -EPERM stored here is overwritten on
                 * every path below before it can be returned.  A
                 * multi-threaded task is let through only if
                 * security_bounded_transition() returns 0.
                 */
                error = -EPERM;
                if (!current_is_single_threaded()) {
                        error = security_bounded_transition(&selinux_state,
                                                            tsec->sid, sid);
                        if (error)
                                goto abort_change;
                }

                /* Check permissions for the transition. */
                error = avc_has_perm(&selinux_state,
                                     tsec->sid, sid, SECCLASS_PROCESS,
                                     PROCESS__DYNTRANSITION, NULL);
                if (error)
                        goto abort_change;

                /* Check for ptracing, and update the task SID if ok.
                   Otherwise, leave SID unchanged and fail. */
                ptsid = ptrace_parent_sid();
                if (ptsid != 0) {
                        error = avc_has_perm(&selinux_state,
                                             ptsid, sid, SECCLASS_PROCESS,
                                             PROCESS__PTRACE, NULL);
                        if (error)
                                goto abort_change;
                }

                tsec->sid = sid;
        } else {
                error = -EINVAL;
                goto abort_change;
        }

        commit_creds(new);
        /* size has already been reduced by one if a trailing newline was stripped. */
        return size;

abort_change:
        abort_creds(new);
        return error;
}

/* Return 1 when @name equals XATTR_SELINUX_SUFFIX, 0 otherwise. */
static int selinux_ismaclabel(const char *name)
{
        return !strcmp(name, XATTR_SELINUX_SUFFIX);
}

/*
 * Convert @secid to a context string through security_sid_to_context(),
 * which fills in *@secdata and *@seclen.  Returns that call's result.
 */
static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        int rc;

        rc = security_sid_to_context(&selinux_state, secid, secdata, seclen);
        return rc;
}

/*
 * Convert the context string @secdata (@seclen bytes) to a SID through
 * security_context_to_sid() with GFP_KERNEL, storing it in *@secid.
 * Returns that call's result.
 */
static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        rc = security_context_to_sid(&selinux_state, secdata, seclen,
                                     secid, GFP_KERNEL);
        return rc;
}

/* Free the context buffer @secdata with kfree(); @seclen is unused. */
static void selinux_release_secctx(char *secdata, u32 seclen)
{
        kfree(secdata);
}

/*
 * Mark @inode's security blob as LABEL_INVALID.  The update is made while
 * holding the blob's spinlock.
 */
static void selinux_inode_invalidate_secctx(struct inode *inode)
{
        struct inode_security_struct *isec;

        isec = selinux_inode(inode);

        spin_lock(&isec->lock);
        isec->initialized = LABEL_INVALID;
        spin_unlock(&isec->lock);
}

/*
 * Apply the context @ctx (@ctxlen bytes) to @inode through
 * selinux_inode_setsecurity().  An -EOPNOTSUPP result is reported as
 * success; any other result is passed through.
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
                                       ctx, ctxlen, 0);

        /* Do not return error when suppressing label (SBLABEL_MNT not set). */
        if (rc == -EOPNOTSUPP)
                return 0;

        return rc;
}

/*
 * Store the context @ctx (@ctxlen bytes) as the XATTR_NAME_SELINUX
 * extended attribute of @dentry via __vfs_setxattr_locked().
 * Returns that call's result.
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = __vfs_setxattr_locked(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0,
                                   NULL);
        return rc;
}

/*
 * Fetch @inode's context through selinux_inode_getsecurity().
 *
 * On success returns 0 with *@ctx filled in by that call and *@ctxlen set
 * to the length it returned.  A negative return from the call is passed
 * through and *@ctxlen is left untouched.
 */
static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        int rc;

        rc = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
                                       ctx, true);
        if (rc < 0)
                return rc;

        *ctxlen = rc;
        return 0;
}
#ifdef CONFIG_KEYS

/*
 * Allocate the security structure for key @k and attach it to
 * @k->security.
 *
 * The key's SID is the keycreate_sid of @cred when that is non-zero,
 * otherwise the sid of @cred.  @flags is not consulted.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_key_alloc(struct key *k, const struct cred *cred,
                             unsigned long flags)
{
        const struct task_security_struct *tsec;
        struct key_security_struct *ksec;

        ksec = kzalloc(sizeof(*ksec), GFP_KERNEL);
        if (!ksec)
                return -ENOMEM;

        tsec = selinux_cred(cred);
        ksec->sid = tsec->keycreate_sid ? tsec->keycreate_sid : tsec->sid;

        k->security = ksec;
        return 0;
}

/* Detach the security structure from key @k and free it. */
static void selinux_key_free(struct key *k)
{
        struct key_security_struct *ksec;

        ksec = k->security;
        k->security = NULL;
        kfree(ksec);
}

/*
 * Check whether @cred may perform @need_perm on the key behind @key_ref.
 *
 * VIEW/READ/WRITE/SEARCH/LINK/SETATTR map onto the matching KEY__*
 * permission.  KEY_NEED_UNLINK and the override/defer values return 0
 * without a check.  Any other value triggers WARN_ON() and -EPERM.
 */
static int selinux_key_permission(key_ref_t key_ref,
                                  const struct cred *cred,
                                  enum key_need_perm need_perm)
{
        struct key_security_struct *ksec;
        u32 av;

        switch (need_perm) {
        case KEY_NEED_VIEW:
                av = KEY__VIEW;
                break;
        case KEY_NEED_READ:
                av = KEY__READ;
                break;
        case KEY_NEED_WRITE:
                av = KEY__WRITE;
                break;
        case KEY_NEED_SEARCH:
                av = KEY__SEARCH;
                break;
        case KEY_NEED_LINK:
                av = KEY__LINK;
                break;
        case KEY_NEED_SETATTR:
                av = KEY__SETATTR;
                break;
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                /* No SELinux check is made for these requests. */
                return 0;
        default:
                WARN_ON(1);
                return -EPERM;
        }

        ksec = key_ref_to_ptr(key_ref)->security;

        return avc_has_perm(&selinux_state,
                            cred_sid(cred), ksec->sid, SECCLASS_KEY, av, NULL);
}

/*
 * Produce the context string for @key's SID.
 *
 * *@_buffer always receives the context pointer, which stays NULL unless
 * security_sid_to_context() sets it.  Returns the length reported by that
 * call on success, or its non-zero result otherwise.
 */
static int selinux_key_getsecurity(struct key *key, char **_buffer)
{
        struct key_security_struct *ksec = key->security;
        char *ctx = NULL;
        unsigned ctx_len;
        int rc;

        rc = security_sid_to_context(&selinux_state, ksec->sid,
                                     &ctx, &ctx_len);
        *_buffer = ctx;
        if (rc)
                return rc;
        return ctx_len;
}

#ifdef CONFIG_KEY_NOTIFICATIONS
/*
 * Check KEY__VIEW between the current SID and @key's SID.
 * Returns the result of avc_has_perm().
 */
static int selinux_watch_key(struct key *key)
{
        struct key_security_struct *ksec;
        u32 cur_sid;

        ksec = key->security;
        cur_sid = current_sid();

        return avc_has_perm(&selinux_state,
                            cur_sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL);
}
#endif
#endif

#ifdef CONFIG_SECURITY_INFINIBAND
/*
 * Check INFINIBAND_PKEY__ACCESS between the SID held in @ib_sec and the
 * SID that sel_ib_pkey_sid() returns for (@subnet_prefix, @pkey_val).
 *
 * Returns the lookup error if the SID lookup fails, otherwise the result
 * of avc_has_perm().
 */
static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val)
{
        struct ib_security_struct *sec = ib_sec;
        struct lsm_ibpkey_audit pkey_audit;
        struct common_audit_data audit;
        u32 pkey_sid = 0;
        int rc;

        rc = sel_ib_pkey_sid(subnet_prefix, pkey_val, &pkey_sid);
        if (rc)
                return rc;

        pkey_audit.subnet_prefix = subnet_prefix;
        pkey_audit.pkey = pkey_val;

        audit.type = LSM_AUDIT_DATA_IBPKEY;
        audit.u.ibpkey = &pkey_audit;

        return avc_has_perm(&selinux_state,
                            sec->sid, pkey_sid,
                            SECCLASS_INFINIBAND_PKEY,
                            INFINIBAND_PKEY__ACCESS, &audit);
}

/*
 * Check INFINIBAND_ENDPORT__MANAGE_SUBNET between the SID held in @ib_sec
 * and the SID that security_ib_endport_sid() returns for
 * (@dev_name, @port_num).
 *
 * Returns the lookup error if the SID lookup fails, otherwise the result
 * of avc_has_perm().
 */
static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name,
                                            u8 port_num)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        struct ib_security_struct *sec = ib_sec;
        struct lsm_ibendport_audit ibendport;

        err = security_ib_endport_sid(&selinux_state, dev_name, port_num,
                                      &sid);

        if (err)
                return err;

        ad.type = LSM_AUDIT_DATA_IBENDPORT;
        /*
         * strncpy() leaves the destination unterminated when the source is
         * at least as long as the buffer.  Copy one byte less and terminate
         * explicitly so the audit data always holds a valid C string.
         */
        strncpy(ibendport.dev_name, dev_name, sizeof(ibendport.dev_name) - 1);
        ibendport.dev_name[sizeof(ibendport.dev_name) - 1] = '\0';
        ibendport.port = port_num;
        ad.u.ibendport = &ibendport;
        return avc_has_perm(&selinux_state,
                            sec->sid, sid,
                            SECCLASS_INFINIBAND_ENDPORT,
                            INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad);
}

/*
 * Allocate an ib_security_struct, set its SID to the current SID and hand
 * it back through *@ib_sec.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails (*@ib_sec is
 * then left untouched).
 */
static int selinux_ib_alloc_security(void **ib_sec)
{
        struct ib_security_struct *ibsec = kzalloc(sizeof(*ibsec), GFP_KERNEL);

        if (!ibsec)
                return -ENOMEM;

        ibsec->sid = current_sid();
        *ib_sec = ibsec;

        return 0;
}

/* Free the InfiniBand security structure @ib_sec with kfree(). */
static void selinux_ib_free_security(void *ib_sec)
{
        kfree(ib_sec);
}
#endif

#ifdef CONFIG_BPF_SYSCALL
/*
 * Permission check for a bpf() command.
 *
 * BPF_MAP_CREATE and BPF_PROG_LOAD are checked (current SID against
 * itself) for BPF__MAP_CREATE and BPF__PROG_LOAD respectively; every other
 * command returns 0.  @attr and @size are not consulted.
 */
static int selinux_bpf(int cmd, union bpf_attr *attr,
                                     unsigned int size)
{
        u32 cur_sid = current_sid();

        switch (cmd) {
        case BPF_MAP_CREATE:
                return avc_has_perm(&selinux_state,
                                    cur_sid, cur_sid, SECCLASS_BPF,
                                    BPF__MAP_CREATE, NULL);
        case BPF_PROG_LOAD:
                return avc_has_perm(&selinux_state,
                                    cur_sid, cur_sid, SECCLASS_BPF,
                                    BPF__PROG_LOAD, NULL);
        default:
                return 0;
        }
}

/*
 * Translate file mode bits into BPF map permissions: FMODE_READ maps to
 * BPF__MAP_READ and FMODE_WRITE to BPF__MAP_WRITE.  Returns 0 when neither
 * bit is set.
 */
static u32 bpf_map_fmode_to_av(fmode_t fmode)
{
        u32 perms = (fmode & FMODE_READ) ? BPF__MAP_READ : 0;

        if (fmode & FMODE_WRITE)
                perms |= BPF__MAP_WRITE;

        return perms;
}

/* This function checks a file passed through a unix socket or binder to see
 * whether it is a bpf related object, and applies the corresponding checks
 * to the bpf object based on its type. The bpf maps and programs, unlike
 * other files and sockets, use a shared anonymous inode inside the kernel
 * as their inode. So checking that inode cannot identify if the process has
 * the privilege to access the bpf object, and that's why we have to add this
 * additional check in selinux_file_receive and
 * selinux_binder_transfer_files.
 *
 * A map file is checked for the permissions derived from its f_mode, a
 * program file for BPF__PROG_RUN.  Any other file returns 0.
 */
static int bpf_fd_pass(struct file *file, u32 sid)
{
        struct bpf_security_struct *bpfsec;

        if (file->f_op == &bpf_map_fops) {
                struct bpf_map *map = file->private_data;

                bpfsec = map->security;
                return avc_has_perm(&selinux_state,
                                    sid, bpfsec->sid, SECCLASS_BPF,
                                    bpf_map_fmode_to_av(file->f_mode), NULL);
        }

        if (file->f_op == &bpf_prog_fops) {
                struct bpf_prog *prog = file->private_data;

                bpfsec = prog->aux->security;
                return avc_has_perm(&selinux_state,
                                    sid, bpfsec->sid, SECCLASS_BPF,
                                    BPF__PROG_RUN, NULL);
        }

        /* Not a bpf object: nothing to check here. */
        return 0;
}

/*
 * Check the permissions derived from @fmode (see bpf_map_fmode_to_av())
 * between the current SID and @map's SID.
 * Returns the result of avc_has_perm().
 */
static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        struct bpf_security_struct *bpfsec = map->security;
        u32 cur_sid = current_sid();
        u32 perms = bpf_map_fmode_to_av(fmode);

        return avc_has_perm(&selinux_state,
                            cur_sid, bpfsec->sid, SECCLASS_BPF,
                            perms, NULL);
}

/*
 * Check BPF__PROG_RUN between the current SID and the SID stored in
 * @prog->aux->security.  Returns the result of avc_has_perm().
 */
static int selinux_bpf_prog(struct bpf_prog *prog)
{
        struct bpf_security_struct *bpfsec = prog->aux->security;
        u32 cur_sid = current_sid();

        return avc_has_perm(&selinux_state,
                            cur_sid, bpfsec->sid, SECCLASS_BPF,
                            BPF__PROG_RUN, NULL);
}

/*
 * Allocate a bpf_security_struct for @map, set its SID to the current SID
 * and attach it to @map->security.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_bpf_map_alloc(struct bpf_map *map)
{
        struct bpf_security_struct *sec = kzalloc(sizeof(*sec), GFP_KERNEL);

        if (!sec)
                return -ENOMEM;

        sec->sid = current_sid();
        map->security = sec;
        return 0;
}

/* Detach the security structure from @map and free it. */
static void selinux_bpf_map_free(struct bpf_map *map)
{
        struct bpf_security_struct *sec;

        sec = map->security;
        map->security = NULL;
        kfree(sec);
}

/*
 * Allocate a bpf_security_struct for a program, set its SID to the current
 * SID and attach it to @aux->security.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
        struct bpf_security_struct *sec = kzalloc(sizeof(*sec), GFP_KERNEL);

        if (!sec)
                return -ENOMEM;

        sec->sid = current_sid();
        aux->security = sec;
        return 0;
}

/* Detach the security structure from @aux and free it. */
static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
{
        struct bpf_security_struct *sec;

        sec = aux->security;
        aux->security = NULL;
        kfree(sec);
}
#endif

static int selinux_lockdown(enum lockdown_reason what)
{
        struct common_audit_data ad;
        u32 sid = current_sid();
        int invalid_reason = (what <= LOCKDOWN_NONE) ||
                             (what == LOCKDOWN_INTEGRITY_MAX) ||
                             (what >= LOCKDOWN_CONFIDENTIALITY_MAX);

        if (WARN(invalid_reason, "Invalid lockdown reason")) {
                audit_log(audit_context(),
                          GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "lockdown_reason=invalid");
                return -EINVAL;
        }

        ad.type = LSM_AUDIT_DATA_LOCKDOWN;
        ad.u.reason = what;

        if (what <= LOCKDOWN_INTEGRITY_MAX)
                return avc_has_perm(&selinux_state,
                                    sid, sid, SECCLASS_LOCKDOWN,
                                    LOCKDOWN__INTEGRITY, &ad);
        else
                return avc_has_perm(&selinux_state,
                                    sid, sid, SECCLASS_LOCKDOWN,
                                    LOCKDOWN__CONFIDENTIALITY, &ad);
}

/*
 * Sizes of the SELinux per-object security structures, one entry per
 * object kind (cred, file, inode, ipc, msg_msg).
 * NOTE(review): presumably consumed by the LSM framework when sizing the
 * security blobs it allocates; the consumer is not visible here.
 */
struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
        .lbs_cred = sizeof(struct task_security_struct),
        .lbs_file = sizeof(struct file_security_struct),
        .lbs_inode = sizeof(struct inode_security_struct),
        .lbs_ipc = sizeof(struct ipc_security_struct),
        .lbs_msg_msg = sizeof(struct msg_security_struct),
};

#ifdef CONFIG_PERF_EVENTS
/*
 * Permission check for opening a perf event.
 *
 * @type selects the PERF_EVENT__* permission (OPEN, CPU, KERNEL or
 * TRACEPOINT), which is checked with the current SID as both subject and
 * object.  Any other @type yields -EINVAL.  @attr is not consulted.
 */
static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
{
        u32 cur_sid = current_sid();
        u32 av;

        switch (type) {
        case PERF_SECURITY_OPEN:
                av = PERF_EVENT__OPEN;
                break;
        case PERF_SECURITY_CPU:
                av = PERF_EVENT__CPU;
                break;
        case PERF_SECURITY_KERNEL:
                av = PERF_EVENT__KERNEL;
                break;
        case PERF_SECURITY_TRACEPOINT:
                av = PERF_EVENT__TRACEPOINT;
                break;
        default:
                return -EINVAL;
        }

        return avc_has_perm(&selinux_state, cur_sid, cur_sid,
                            SECCLASS_PERF_EVENT, av, NULL);
}

/*
 * Allocate a perf_event_security_struct for @event, set its SID to the
 * current SID and attach it to @event->security.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_perf_event_alloc(struct perf_event *event)
{
        struct perf_event_security_struct *sec;

        sec = kzalloc(sizeof(*sec), GFP_KERNEL);
        if (!sec)
                return -ENOMEM;

        sec->sid = current_sid();
        event->security = sec;
        return 0;
}

/* Detach the security structure from @event and free it. */
static void selinux_perf_event_free(struct perf_event *event)
{
        struct perf_event_security_struct *sec;

        sec = event->security;
        event->security = NULL;
        kfree(sec);
}

/*
 * Check PERF_EVENT__READ between the current SID and @event's SID.
 * Returns the result of avc_has_perm().
 */
static int selinux_perf_event_read(struct perf_event *event)
{
        struct perf_event_security_struct *sec;
        u32 cur_sid;

        sec = event->security;
        cur_sid = current_sid();

        return avc_has_perm(&selinux_state, cur_sid, sec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
}

/*
 * Check PERF_EVENT__WRITE between the current SID and @event's SID.
 * Returns the result of avc_has_perm().
 */
static int selinux_perf_event_write(struct perf_event *event)
{
        struct perf_event_security_struct *sec;
        u32 cur_sid;

        sec = event->security;
        cur_sid = current_sid();

        return avc_has_perm(&selinux_state, cur_sid, sec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
}
#endif

/*
 * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
 * 1. any hooks that don't belong to (2.) or (3.) below,
 * 2. hooks that both access structures allocated by other hooks, and allocate
 *    structures that can be later accessed by other hooks (mostly "cloning"
 *    hooks),
 * 3. hooks that only allocate structures that can be later accessed by other
 *    hooks ("allocating" hooks).
 *
 * Please follow block comment delimiters in the list to keep this order.
 *
 * This ordering is needed for SELinux runtime disable to work at least somewhat
 * safely. Breaking the ordering rules above might lead to NULL pointer derefs
 * when disabling SELinux at runtime.
 */
static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        /*
         * Group 1: hooks that are neither "cloning" nor purely "allocating".
         * New entries of this kind go anywhere before the "CLONING" marker.
         */
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
        LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
        LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file),

        LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme),
        LSM_HOOK_INIT(capget, selinux_capget),
        LSM_HOOK_INIT(capset, selinux_capset),
        LSM_HOOK_INIT(capable, selinux_capable),
        LSM_HOOK_INIT(quotactl, selinux_quotactl),
        LSM_HOOK_INIT(quota_on, selinux_quota_on),
        LSM_HOOK_INIT(syslog, selinux_syslog),
        LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory),

        LSM_HOOK_INIT(netlink_send, selinux_netlink_send),

        LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec),
        LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),

        LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
        LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
        LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
        LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
        LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
        LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs),
        LSM_HOOK_INIT(sb_mount, selinux_mount),
        LSM_HOOK_INIT(sb_umount, selinux_umount),
        LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts),
        LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts),

        LSM_HOOK_INIT(move_mount, selinux_move_mount),

        LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security),
        LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as),

        LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
        LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
        LSM_HOOK_INIT(inode_create, selinux_inode_create),
        LSM_HOOK_INIT(inode_link, selinux_inode_link),
        LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
        LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink),
        LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir),
        LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir),
        LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod),
        LSM_HOOK_INIT(inode_rename, selinux_inode_rename),
        LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink),
        LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link),
        LSM_HOOK_INIT(inode_permission, selinux_inode_permission),
        LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr),
        LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr),
        LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr),
        LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr),
        LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr),
        LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr),
        LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
        LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
        LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
        LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
        LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
        LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
        LSM_HOOK_INIT(path_notify, selinux_path_notify),

        LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security),

        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
        LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
        LSM_HOOK_INIT(file_lock, selinux_file_lock),
        LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl),
        LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner),
        LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask),
        LSM_HOOK_INIT(file_receive, selinux_file_receive),

        LSM_HOOK_INIT(file_open, selinux_file_open),

        LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
        LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
        LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
        LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
        LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
        LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data),
        LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file),
        LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
        LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
        LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
        LSM_HOOK_INIT(task_getsecid, selinux_task_getsecid),
        LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
        LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
        LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
        LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit),
        LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit),
        LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler),
        LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
        LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
        LSM_HOOK_INIT(task_kill, selinux_task_kill),
        LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),

        LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
        LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),

        LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),

        LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),

        LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
        LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
        LSM_HOOK_INIT(sem_semop, selinux_sem_semop),

        LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),

        LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
        LSM_HOOK_INIT(setprocattr, selinux_setprocattr),

        LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel),
        LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid),
        LSM_HOOK_INIT(release_secctx, selinux_release_secctx),
        LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx),
        LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx),
        LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx),

        LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
        LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),

        LSM_HOOK_INIT(socket_create, selinux_socket_create),
        LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
        LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair),
        LSM_HOOK_INIT(socket_bind, selinux_socket_bind),
        LSM_HOOK_INIT(socket_connect, selinux_socket_connect),
        LSM_HOOK_INIT(socket_listen, selinux_socket_listen),
        LSM_HOOK_INIT(socket_accept, selinux_socket_accept),
        LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg),
        LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg),
        LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname),
        LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername),
        LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt),
        LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt),
        LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown),
        LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb),
        LSM_HOOK_INIT(socket_getpeersec_stream,
                        selinux_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security),
        LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid),
        LSM_HOOK_INIT(sock_graft, selinux_sock_graft),
        LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request),
        LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone),
        LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect),
        LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
        LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
        LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),
        LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet),
        LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc),
        LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec),
        LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow),
        LSM_HOOK_INIT(tun_dev_free_security, selinux_tun_dev_free_security),
        LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create),
        LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue),
        LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach),
        LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access),
        LSM_HOOK_INIT(ib_endport_manage_subnet,
                      selinux_ib_endport_manage_subnet),
        LSM_HOOK_INIT(ib_free_security, selinux_ib_free_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free),
        LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete),
        LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free),
        LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete),
        LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup),
        LSM_HOOK_INIT(xfrm_state_pol_flow_match,
                        selinux_xfrm_state_pol_flow_match),
        LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session),
#endif

#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_free, selinux_key_free),
        LSM_HOOK_INIT(key_permission, selinux_key_permission),
        LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
#ifdef CONFIG_KEY_NOTIFICATIONS
        LSM_HOOK_INIT(watch_key, selinux_watch_key),
#endif /* CONFIG_KEY_NOTIFICATIONS */
#endif /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
#endif

#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf, selinux_bpf),
        LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
        LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
        LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
        LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
#endif

#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
        LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
        LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
        LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
#endif

        LSM_HOOK_INIT(locked_down, selinux_lockdown),

        /*
         * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE
         */
        LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
        LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
        LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts),
        LSM_HOOK_INIT(sb_add_mnt_opt, selinux_add_mnt_opt),
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone),
#endif

        /*
         * PUT "ALLOCATING" HOOKS HERE
         */
        LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
        LSM_HOOK_INIT(msg_queue_alloc_security,
                      selinux_msg_queue_alloc_security),
        LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
        LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
        LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
        LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
        LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
        LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
        LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
        LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc_acquire,
                      selinux_xfrm_state_alloc_acquire),
#endif
#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_alloc, selinux_key_alloc),
#endif
#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
#endif
#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
        LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
#endif
#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
#endif
};

/*
 * selinux_init - boot-time initialisation of the SELinux LSM.
 *
 * Resets the global selinux_state, applies the boot-time enforcing and
 * checkreqprot settings, initialises the AVC and the avtab/ebitmap/hashtab
 * caches, registers selinux_hooks under the name "selinux" and installs the
 * AVC reset callbacks.  Panics if either callback cannot be registered.
 *
 * Always returns 0.
 */
static __init int selinux_init(void)
{
        pr_info("SELinux:  Initializing.\n");

        /* Start from an all-zero state before applying boot parameters. */
        memset(&selinux_state, 0, sizeof(selinux_state));
        enforcing_set(&selinux_state, selinux_enforcing_boot);
        checkreqprot_set(&selinux_state, selinux_checkreqprot_boot);
        selinux_avc_init(&selinux_state.avc);
        mutex_init(&selinux_state.status_lock);
        mutex_init(&selinux_state.policy_mutex);

        /* Set the security state for the initial task. */
        cred_init_security();

        /* True when the default data mapping flags do not include VM_EXEC. */
        default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);

        avc_init();

        avtab_cache_init();

        ebitmap_cache_init();

        hashtab_cache_init();

        security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");

        /* Both callbacks are registered for AVC_CALLBACK_RESET events. */
        if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC netcache callback\n");

        if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC LSM notifier callback\n");

        if (selinux_enforcing_boot)
                pr_debug("SELinux:  Starting in enforcing mode\n");
        else
                pr_debug("SELinux:  Starting in permissive mode\n");

        fs_validate_description("selinux", selinux_fs_parameters);

        return 0;
}

/*
 * iterate_supers() callback used by selinux_complete_init(): applies
 * selinux_set_mnt_opts() to @sb with no mount options.  @unused is ignored
 * and the return value of selinux_set_mnt_opts() is discarded.
 */
static void delayed_superblock_init(struct super_block *sb, void *unused)
{
        selinux_set_mnt_opts(sb, NULL, 0, NULL);
}

/*
 * selinux_complete_init - finish initialisation after the policy is loaded.
 *
 * Walks every existing superblock with iterate_supers() and runs
 * delayed_superblock_init() on each one.
 */
void selinux_complete_init(void)
{
        pr_debug("SELinux:  Completing initialization.\n");

        /* Set up any superblocks initialized prior to the policy load. */
        pr_debug("SELinux:  Setting up existing superblocks.\n");
        iterate_supers(delayed_superblock_init, NULL);
}

/*
 * SELinux requires early initialization in order to label
 * all processes and objects when they are created.
 *
 * LSM descriptor: names the module, flags it as a legacy-major exclusive
 * LSM, ties its enablement to selinux_enabled_boot, publishes the blob
 * sizes it needs and points the framework at selinux_init().
 */
DEFINE_LSM(selinux) = {
        .name = "selinux",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &selinux_enabled_boot,
        .blobs = &selinux_blob_sizes,
        .init = selinux_init,
};

#if defined(CONFIG_NETFILTER)

/*
 * Netfilter hooks installed by SELinux in every network namespace.
 *
 * For each protocol family there is one hook at POST_ROUTING (registered at
 * the *_SELINUX_LAST priority) and one each at FORWARD and LOCAL_OUT
 * (registered at the *_SELINUX_FIRST priority).  The IPv6 entries are only
 * built when CONFIG_IPV6 is enabled.
 */
static const struct nf_hook_ops selinux_nf_ops[] = {
        {
                .hook =                selinux_ipv4_postroute,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ipv4_forward,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ipv4_output,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
#if IS_ENABLED(CONFIG_IPV6)
        {
                .hook =                selinux_ipv6_postroute,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP6_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ipv6_forward,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ipv6_output,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
#endif        /* IPV6 */
};

/*
 * Per-namespace init: register every entry of selinux_nf_ops in @net.
 * Returns whatever nf_register_net_hooks() returns.
 */
static int __net_init selinux_nf_register(struct net *net)
{
        const unsigned int nr_ops = ARRAY_SIZE(selinux_nf_ops);

        return nf_register_net_hooks(net, selinux_nf_ops, nr_ops);
}

/*
 * Per-namespace exit: remove every entry of selinux_nf_ops from @net.
 */
static void __net_exit selinux_nf_unregister(struct net *net)
{
        const unsigned int nr_ops = ARRAY_SIZE(selinux_nf_ops);

        nf_unregister_net_hooks(net, selinux_nf_ops, nr_ops);
}

/*
 * Per-network-namespace operations: the netfilter hooks are registered when
 * a namespace is initialised and unregistered when it exits.
 */
static struct pernet_operations selinux_net_ops = {
        .init = selinux_nf_register,
        .exit = selinux_nf_unregister,
};

/*
 * Initcall that hooks SELinux into netfilter.
 *
 * Does nothing when SELinux was not enabled at boot.  Otherwise registers
 * selinux_net_ops as a pernet subsystem and panics if that fails.
 *
 * Always returns 0.
 */
static int __init selinux_nf_ip_init(void)
{
        int rc;

        if (selinux_enabled_boot) {
                pr_debug("SELinux:  Registering netfilter hooks\n");

                rc = register_pernet_subsys(&selinux_net_ops);
                if (rc != 0)
                        panic("SELinux: register_pernet_subsys: error %d\n", rc);
        }

        return 0;
}
__initcall(selinux_nf_ip_init);

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/*
 * Undo selinux_nf_ip_init(): unregister selinux_net_ops as a pernet
 * subsystem.  Only built when runtime disable is configured.
 */
static void selinux_nf_ip_exit(void)
{
        pr_debug("SELinux:  Unregistering netfilter hooks\n");

        unregister_pernet_subsys(&selinux_net_ops);
}
#endif

#else /* CONFIG_NETFILTER */

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/* Without CONFIG_NETFILTER there are no hooks to remove: expand to nothing. */
#define selinux_nf_ip_exit()
#endif

#endif /* CONFIG_NETFILTER */

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/*
 * selinux_disable - disable SELinux at runtime.
 * @state: SELinux state to disable.
 *
 * Returns -EINVAL if the state has already been initialised (i.e. after the
 * initial policy load) or has already been disabled; otherwise marks the
 * state disabled, tears down the registered hooks and returns 0.
 */
int selinux_disable(struct selinux_state *state)
{
        /*
         * Not permitted after initial policy load, and the teardown below
         * must only ever run once.
         */
        if (selinux_initialized(state) || selinux_disabled(state))
                return -EINVAL;

        selinux_mark_disabled(state);
        pr_info("SELinux:  Disabled at runtime.\n");

        /*
         * The netfilter hooks must be unregistered before
         * security_delete_hooks() to avoid breaking runtime disable.
         */
        selinux_nf_ip_exit();
        security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));

        /* Try to destroy the avc node cache, then unregister selinuxfs. */
        avc_disable();
        exit_sel_fs();

        return 0;
}
#endif
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl_xattr.h

  Extended attribute system call representation of Access Control Lists.

  Copyright (C) 2000 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2002 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
 */
#ifndef _POSIX_ACL_XATTR_H
#define _POSIX_ACL_XATTR_H

#include <uapi/linux/xattr.h>
#include <uapi/linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>

/*
 * Size in bytes of an ACL xattr holding @count entries: one header followed
 * by @count fixed-size entries.
 */
static inline size_t
posix_acl_xattr_size(int count)
{
        size_t entries = count * sizeof(struct posix_acl_xattr_entry);

        return sizeof(struct posix_acl_xattr_header) + entries;
}

/*
 * Number of ACL entries contained in an xattr value of @size bytes.
 *
 * Returns -1 when @size is smaller than the header or when the bytes after
 * the header are not a whole number of entries.
 */
static inline int
posix_acl_xattr_count(size_t size)
{
        const size_t hdr_len = sizeof(struct posix_acl_xattr_header);
        const size_t ent_len = sizeof(struct posix_acl_xattr_entry);
        size_t payload;

        if (size < hdr_len)
                return -1;

        payload = size - hdr_len;
        return (payload % ent_len) ? -1 : (int)(payload / ent_len);
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * In-place fix-ups of a @size-byte ACL xattr @value on its way from and to
 * user space.  The implementations live outside this header.
 */
void posix_acl_fix_xattr_from_user(void *value, size_t size);
void posix_acl_fix_xattr_to_user(void *value, size_t size);
#else
/* Without POSIX ACL support both fix-ups are no-ops. */
static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
{
}
static inline void posix_acl_fix_xattr_to_user(void *value, size_t size)
{
}
#endif

/*
 * Conversions between the xattr representation and struct posix_acl.
 * NOTE(review): presumably @user_ns is the namespace used to map the ids
 * stored in the xattr, and @size is the length of @value / @buffer in
 * bytes -- confirm against the implementation.
 */
struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, 
                                       const void *value, size_t size);
int posix_acl_to_xattr(struct user_namespace *user_ns,
                       const struct posix_acl *acl, void *buffer, size_t size);

/* xattr handlers for the access and default ACLs, defined elsewhere. */
extern const struct xattr_handler posix_acl_access_xattr_handler;
extern const struct xattr_handler posix_acl_default_xattr_handler;

#endif        /* _POSIX_ACL_XATTR_H */



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NETFILTER_INGRESS_H_
#define _NETFILTER_INGRESS_H_

#include <linux/netfilter.h>
#include <linux/netdevice.h>

#ifdef CONFIG_NETFILTER_INGRESS
/*
 * Cheap test for whether any ingress hook is attached to @skb's device.
 *
 * With jump labels available, the NFPROTO_NETDEV/NF_NETDEV_INGRESS static
 * key is consulted first so the common "no hooks" case returns early.
 * Otherwise the result reflects whether the device's nf_hooks_ingress
 * pointer is non-NULL; the pointer is only tested, never dereferenced.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
#ifdef CONFIG_JUMP_LABEL
        if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
                return false;
#endif
        return rcu_access_pointer(skb->dev->nf_hooks_ingress);
}

/*
 * Run the ingress hooks attached to @skb's device.
 *
 * Caller must hold rcu_read_lock.
 *
 * Returns 0 when no hook list is attached.  Otherwise returns the result of
 * nf_hook_slow(), except that a result of 0 is reported to the caller as -1.
 */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        struct nf_hook_entries *entries;
        struct nf_hook_state state;
        int verdict;

        entries = rcu_dereference(skb->dev->nf_hooks_ingress);

        /*
         * The hook head may have become NULL after nf_hook_ingress_active()
         * evaluated to true, so it has to be rechecked here.
         */
        if (unlikely(entries == NULL))
                return 0;

        nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV,
                           skb->dev, NULL, NULL, dev_net(skb->dev), NULL);

        verdict = nf_hook_slow(skb, &state, entries, 0);

        return verdict == 0 ? -1 : verdict;
}

/* Start @dev with no ingress hook list attached. */
static inline void nf_hook_ingress_init(struct net_device *dev)
{
        RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
}
#else /* CONFIG_NETFILTER_INGRESS */
/*
 * Stub for kernels built without CONFIG_NETFILTER_INGRESS: ingress hook
 * support is compiled out, so this always reports that no hook is active.
 *
 * The signature matches the CONFIG_NETFILTER_INGRESS variant (bool result,
 * const @skb) so that a caller holding a const skb builds the same way
 * regardless of the configuration.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
        return false;
}

/*
 * Stub for kernels built without CONFIG_NETFILTER_INGRESS: no hooks are
 * run and 0 is always returned.
 */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        return 0;
}

/* Stub for kernels built without CONFIG_NETFILTER_INGRESS: nothing to set up. */
static inline void nf_hook_ingress_init(struct net_device *dev) {}
#endif /* CONFIG_NETFILTER_INGRESS */
#endif /* _NETFILTER_INGRESS_H_ */


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 











    2 


    2 

    2 


















































































































    2 













    2 


    2 




























































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex        (while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     page->flags PG_locked (lock_page)   * (see hugetlbfs below)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     i_pages lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   i_pages lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * * hugetlbfs PageHuge() pages take locks in this order:
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *             page->flags PG_locked (lock_page)
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>

#include <asm/tlb.h>

#include <trace/events/tlb.h>

#include "internal.h"

/*
 * Slab caches backing struct anon_vma and struct anon_vma_chain objects;
 * both are created in anon_vma_init().
 */
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
        struct anon_vma *anon_vma;

        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
                anon_vma->num_children = 0;
                anon_vma->num_active_vmas = 0;
                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
                 */
                anon_vma->root = anon_vma;
        }

        return anon_vma;
}

/*
 * Return an anon_vma to anon_vma_cachep.  The refcount must already be zero.
 * If the root rwsem is found locked, the anon_vma is write-locked and
 * unlocked once before the object is handed back to the slab cache.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
        VM_BUG_ON(atomic_read(&anon_vma->refcount));

        /*
         * Synchronize against page_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
         * down_read_trylock() from page_lock_anon_vma_read(). This orders:
         *
         * page_lock_anon_vma_read()        VS        put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                  MB
         *   atomic_read()                          rwsem_is_locked()
         *
         * LOCK should suffice since the actual taking of the lock must
         * happen _before_ what follows.
         */
        might_sleep();
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                anon_vma_lock_write(anon_vma);
                anon_vma_unlock_write(anon_vma);
        }

        kmem_cache_free(anon_vma_cachep, anon_vma);
}

/* Allocate an anon_vma_chain from its slab cache using the caller's gfp mask. */
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
        struct anon_vma_chain *avc = kmem_cache_alloc(anon_vma_chain_cachep, gfp);

        return avc;
}

/* Hand an anon_vma_chain back to the slab cache it was allocated from. */
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
        struct kmem_cache *cache = anon_vma_chain_cachep;

        kmem_cache_free(cache, anon_vma_chain);
}

/*
 * Tie @avc to both @vma and @anon_vma: record the two back-pointers, link
 * the chain into vma->anon_vma_chain and insert it into the anon_vma's
 * interval tree.
 */
static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
{
        /* Back-pointers first; the tree insert below takes @avc as its key. */
        avc->anon_vma = anon_vma;
        avc->vma = vma;

        list_add(&avc->same_vma, &vma->anon_vma_chain);
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_lock held for reading.
 *
 * Return: 0 on success, -ENOMEM if the anon_vma_chain or the anon_vma
 * could not be allocated.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma, *allocated;
        struct anon_vma_chain *avc;

        might_sleep();

        /* Allocate the chain entry up front, before any lock is taken. */
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_enomem;

        /* Prefer reusing a mergeable anon_vma; allocate one only if none. */
        anon_vma = find_mergeable_anon_vma(vma);
        allocated = NULL;
        if (!anon_vma) {
                anon_vma = anon_vma_alloc();
                if (unlikely(!anon_vma))
                        goto out_enomem_free_avc;
                anon_vma->num_children++; /* self-parent link for new root */
                allocated = anon_vma;
        }

        anon_vma_lock_write(anon_vma);
        /* page_table_lock to protect against threads */
        spin_lock(&mm->page_table_lock);
        /* Re-check under the locks: install only if nothing is attached yet. */
        if (likely(!vma->anon_vma)) {
                vma->anon_vma = anon_vma;
                anon_vma_chain_link(vma, avc, anon_vma);
                anon_vma->num_active_vmas++;
                /* Both objects are now in use; nothing left to release. */
                allocated = NULL;
                avc = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_write(anon_vma);

        /* The vma already had an anon_vma: release what was not installed. */
        if (unlikely(allocated))
                put_anon_vma(allocated);
        if (unlikely(avc))
                anon_vma_chain_free(avc);

        return 0;

 out_enomem_free_avc:
        anon_vma_chain_free(avc);
 out_enomem:
        return -ENOMEM;
}

/*
 * Helper for walking vma->anon_vma_chain: make sure the root of @anon_vma is
 * write-locked, given that @root (possibly NULL) is the root currently held.
 * Returns the root that is locked on return.
 *
 * All anon_vmas chained to one vma are expected to share a root, so a whole
 * traversal should take the rwsem only once; having to switch away from a
 * non-NULL root triggers a one-time warning.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
        struct anon_vma *wanted = anon_vma->root;

        if (wanted == root)
                return root;

        if (WARN_ON_ONCE(root))
                up_write(&root->rwsem);
        down_write(&wanted->rwsem);

        return wanted;
}

/* Drop the write lock on @root, if a root is held at all (NULL is a no-op). */
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
        if (!root)
                return;

        up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
 * anon_vma_fork(). The first three want an exact copy of src, while the last
 * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
 * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
 * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of anon_vma hierarchy to endless linear chain in
 * case of constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
        struct anon_vma_chain *avc, *pavc;
        struct anon_vma *root = NULL;

        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;

                /*
                 * First try without sleeping; if that fails, drop the root
                 * lock before retrying with GFP_KERNEL.
                 */
                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!avc)) {
                        unlock_anon_vma_root(root);
                        root = NULL;
                        avc = anon_vma_chain_alloc(GFP_KERNEL);
                        if (!avc)
                                goto enomem_failure;
                }
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);

                /*
                 * Reuse existing anon_vma if it has no vma and only one
                 * anon_vma child.
                 *
                 * Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
                if (!dst->anon_vma && src->anon_vma &&
                    anon_vma->num_children < 2 &&
                    anon_vma->num_active_vmas == 0)
                        dst->anon_vma = anon_vma;
        }
        /* Account dst as an active vma of whichever anon_vma it ended up with. */
        if (dst->anon_vma)
                dst->anon_vma->num_active_vmas++;
        unlock_anon_vma_root(root);
        return 0;

 enomem_failure:
        /*
         * dst->anon_vma is dropped here otherwise its num_active_vmas can be
         * incorrectly decremented in unlink_anon_vmas().
         * We can safely do this because callers of anon_vma_clone() don't care
         * about dst->anon_vma if anon_vma_clone() failed.
         */
        dst->anon_vma = NULL;
        unlink_anon_vmas(dst);
        return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
        struct anon_vma_chain *avc;
        struct anon_vma *anon_vma;
        int error;

        /* Don't bother if the parent process has no anon_vma here. */
        if (!pvma->anon_vma)
                return 0;

        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
        vma->anon_vma = NULL;

        /*
         * First, attach the new VMA to the parent VMA's anon_vmas,
         * so rmap can find non-COWed pages in child processes.
         */
        error = anon_vma_clone(vma, pvma);
        if (error)
                return error;

        /* An existing anon_vma has been reused, all done then. */
        if (vma->anon_vma)
                return 0;

        /* Then add our own anon_vma. */
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
        /* The child's vma is the first active vma of the new anon_vma. */
        anon_vma->num_active_vmas++;
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;

        /*
         * The root anon_vma's rwsem is the lock actually used when we
         * lock any of the anon_vmas in this anon_vma tree.
         */
        anon_vma->root = pvma->anon_vma->root;
        anon_vma->parent = pvma->anon_vma;
        /*
         * With refcounts, an anon_vma can stay around longer than the
         * process it belongs to. The root anon_vma needs to be pinned until
         * this anon_vma is freed, because the lock lives in the root.
         */
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
        /* Record the new anon_vma as a child of the parent's anon_vma. */
        anon_vma->parent->num_children++;
        anon_vma_unlock_write(anon_vma);

        return 0;

 out_error_free_anon_vma:
        put_anon_vma(anon_vma);
 out_error:
        /* Undo the chains that anon_vma_clone() attached above. */
        unlink_anon_vmas(vma);
        return -ENOMEM;
}

/*
 * Detach @vma from every anon_vma on its anon_vma_chain.
 *
 * Done in two passes: the first, under the root lock, removes each chain
 * from its anon_vma's interval tree and frees the chains whose anon_vma is
 * still in use; the second, after the lock is dropped, puts the anon_vmas
 * that became empty and frees their chains.
 */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc, *next;
        struct anon_vma *root = NULL;

        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

                /*
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
                if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
                        anon_vma->parent->num_children--;
                        continue;
                }

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
        /* This vma no longer counts as an active user of its anon_vma. */
        if (vma->anon_vma)
                vma->anon_vma->num_active_vmas--;
        unlock_anon_vma_root(root);

        /*
         * Iterate the list once more, it now only contains empty and unlinked
         * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
         * needing to write-acquire the anon_vma->root->rwsem.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                VM_WARN_ON(anon_vma->num_children);
                VM_WARN_ON(anon_vma->num_active_vmas);
                put_anon_vma(anon_vma);

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
}

/*
 * Slab constructor for anon_vma objects: start with an empty interval tree,
 * a zero refcount and an initialised rwsem.
 */
static void anon_vma_ctor(void *data)
{
        struct anon_vma *av = data;

        av->rb_root = RB_ROOT_CACHED;
        atomic_set(&av->refcount, 0);
        init_rwsem(&av->rwsem);
}

/*
 * Create the two slab caches used by this file.  Both are SLAB_PANIC and
 * SLAB_ACCOUNT; the anon_vma cache is additionally SLAB_TYPESAFE_BY_RCU and
 * uses anon_vma_ctor() as its constructor.
 */
void __init anon_vma_init(void)
{
        anon_vma_cachep =
                kmem_cache_create("anon_vma",
                                  sizeof(struct anon_vma),
                                  0,
                                  SLAB_TYPESAFE_BY_RCU | SLAB_PANIC | SLAB_ACCOUNT,
                                  anon_vma_ctor);

        anon_vma_chain_cachep =
                KMEM_CACHE(anon_vma_chain, SLAB_PANIC | SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization what so ever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 *
 * Returns the anon_vma with its refcount raised, or NULL if the page is not
 * a mapped anonymous page or the refcount had already dropped to zero.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;

        rcu_read_lock();
        /* Snapshot page->mapping once; only plain anon mappings qualify. */
        anon_mapping = (unsigned long)READ_ONCE(page->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!page_mapped(page))
                goto out;

        /* Strip the flag bits to recover the anon_vma pointer. */
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        /*
         * If this page is still mapped, then its anon_vma cannot have been
         * freed.  But if it has been unmapped, we have no security against the
         * anon_vma structure being freed and reused (for another anon_vma:
         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
         * above cannot corrupt).
         */
        if (!page_mapped(page)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }
out:
        rcu_read_unlock();

        return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the lock.
 *
 * Returns the anon_vma with its root rwsem held for read, or NULL.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;

        rcu_read_lock();
        anon_mapping = (unsigned long)READ_ONCE(page->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!page_mapped(page))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        /* Fast path: try to take the root rwsem without sleeping. */
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * If the page is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the lock ensures that it will
                 * not go away, see anon_vma_free().
                 */
                if (!page_mapped(page)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }

        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        if (!page_mapped(page)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }

        /* we pinned the anon_vma, it's safe to sleep */
        rcu_read_unlock();
        anon_vma_lock_read(anon_vma);

        /* Drop the temporary reference now that the lock is held. */
        if (atomic_dec_and_test(&anon_vma->refcount)) {
                /*
                 * Oops, we held the last refcount, release the lock
                 * and bail -- can't simply use put_anon_vma() because
                 * we'll deadlock on the anon_vma_lock_write() recursion.
                 */
                anon_vma_unlock_read(anon_vma);
                __put_anon_vma(anon_vma);
                anon_vma = NULL;
        }

        return anon_vma;

out:
        rcu_read_unlock();
        return anon_vma;
}

/* Release the read lock on @anon_vma; counterpart of page_lock_anon_vma_read(). */
void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
        anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush the TLB entries batched up on the current task for recently unmapped
 * pages from remote CPUs.  A PTE that was dirty when it was unmapped must be
 * flushed before any IO is initiated on the page, otherwise writes can be
 * lost; likewise the flush must happen before the page is freed, otherwise
 * data can leak.
 */
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *batch = &current->tlb_ubc;

        if (batch->flush_required) {
                arch_tlbbatch_flush(&batch->arch);
                /* The batch is empty again: nothing pending, nothing writable. */
                batch->flush_required = false;
                batch->writable = false;
        }
}

/*
 * Flush only if the batch may contain writable TLB entries, since those are
 * the ones that can race with IO.
 */
void try_to_unmap_flush_dirty(void)
{
        struct tlbflush_unmap_batch *batch = &current->tlb_ubc;

        if (!batch->writable)
                return;

        try_to_unmap_flush();
}

/*
 * Record on the current task's unmap batch that a TLB flush for @mm is
 * pending, and flag the mm itself as having a batched flush outstanding.
 * @writable tells whether the batch must also be marked writable.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

        arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
        tlb_ubc->flush_required = true;

        /*
         * Ensure compiler does not re-order the setting of tlb_flush_batched
         * before the PTE is cleared.
         */
        barrier();
        mm->tlb_flush_batched = true;

        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
         * before the page is queued for IO.
         */
        if (writable)
                tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.  That is only the case when the caller
 * asked for batching (TTU_BATCH_FLUSH) and the flush would have to reach a
 * CPU other than the current one.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        bool remote;

        if (!(flags & TTU_BATCH_FLUSH))
                return false;

        /* Is any CPU other than this one set in the mm's cpumask? */
        remote = cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids;
        put_cpu();

        return remote;
}

/*
 * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
        if (data_race(mm->tlb_flush_batched)) {
                flush_tlb_mm(mm);

                /*
                 * Do not allow the compiler to re-order the clearing of
                 * tlb_flush_batched before the tlb is flushed.
                 */
                barrier();
                mm->tlb_flush_batched = false;
        }
}
#else
/* Stub for builds without CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH: no-op. */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

/*
 * Stub for builds without CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH: flushes
 * are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 *
 * Returns -EFAULT (as an unsigned long) when the page cannot belong to the
 * vma: an anon page whose anon_vma root differs from the vma's, or a file
 * page whose mapping is not that of the vma's file.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        if (PageAnon(page)) {
                struct anon_vma *page_av = page_anon_vma(page);

                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                 */
                if (!vma->anon_vma || !page_av)
                        return -EFAULT;
                if (vma->anon_vma->root != page_av->root)
                        return -EFAULT;
        } else {
                if (!vma->vm_file)
                        return -EFAULT;
                if (vma->vm_file->f_mapping != compound_head(page)->mapping)
                        return -EFAULT;
        }

        return vma_address(page, vma);
}

/*
 * Walk the page tables of @mm down to the pmd covering @address.
 * Returns the pmd pointer, or NULL if any upper level is not present or the
 * pmd itself is not present or is a transparent huge pmd.
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pmd_t pmde;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        /*
         * Some THP functions use the sequence pmdp_huge_clear_flush(),
         * set_pmd_at() without holding anon_vma lock for write.  So take one
         * snapshot of the entry and test present and !THP on that snapshot.
         */
        pmde = *pmd;
        barrier();
        if (!pmd_present(pmde) || pmd_trans_huge(pmde))
                return NULL;

        return pmd;
}

/* State shared between page_referenced() and its rmap walk callbacks. */
struct page_referenced_arg {
        /* mappings left to visit; starts at total_mapcount(page) */
        int mapcount;
        /* number of vmas in which a reference was found */
        int referenced;
        /* accumulated vm_flags of the vmas that referenced the page */
        unsigned long vm_flags;
        /* cgroup used to filter vmas, see invalid_page_referenced_vma() */
        struct mem_cgroup *memcg;
};
/*
 * rmap walk callback used by page_referenced(): test and clear the young
 * state of every mapping of @page found in @vma, starting at @address.
 *
 * @arg points to the struct page_referenced_arg being accumulated.
 * Returns false to stop the walk (a VM_LOCKED vma was seen, or all mappings
 * have been accounted for), true to continue.
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
{
        struct page_referenced_arg *pra = arg;
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
        };
        int referenced = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (vma->vm_flags & VM_LOCKED) {
                        page_vma_mapped_walk_done(&pvmw);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                }

                if (pvmw.pte) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {
                                /*
                                 * Don't treat a reference through
                                 * a sequentially read mapping as such.
                                 * If the page has been used in another mapping,
                                 * we will catch it; if this other mapping is
                                 * already gone, the unmap path will have set
                                 * PG_referenced or activated the page.
                                 */
                                if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        referenced++;
                        }
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
                                referenced++;
                } else {
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
                }

                /* One more mapping of the page has been visited. */
                pra->mapcount--;
        }

        if (referenced)
                clear_page_idle(page);
        if (test_and_clear_page_young(page))
                referenced++;

        /* Count this vma at most once, however many mappings were young. */
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
        }

        if (!pra->mapcount)
                return false; /* To break the loop */

        return true;
}

/*
 * invalid_vma filter installed by page_referenced() when it is given a
 * target memcg: report the vma as invalid (true) when its mm does not
 * match the memcg carried in the page_referenced_arg passed as @arg.
 */
static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
        struct page_referenced_arg *pra = arg;

        return !mm_match_cgroup(vma->vm_mm, pra->memcg);
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
 *
 * Walks the reverse mappings of @page with page_referenced_one() and
 * clears the referenced state it finds along the way.  *@vm_flags is
 * always written: it is zeroed up front and receives the flags gathered
 * by the walk.
 *
 * Return: 0 if the page has no mappings or no rmapping, 1 if the page
 * lock was needed but could not be taken, otherwise the reference count
 * accumulated by the walk.
 */
int page_referenced(struct page *page,
                    int is_locked,
                    struct mem_cgroup *memcg,
                    unsigned long *vm_flags)
{
        struct page_referenced_arg pra = {
                .mapcount = total_mapcount(page),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = page_referenced_one,
                .arg = (void *)&pra,
                .anon_lock = page_lock_anon_vma_read,
        };
        int locked_here = 0;

        *vm_flags = 0;

        /* Nothing to walk: the page is unmapped or has no rmapping. */
        if (!pra.mapcount || !page_rmapping(page))
                return 0;

        /*
         * Non-anon and KSM pages are walked with the page lock held.
         * When the caller does not hold it, try to take it here; if
         * that fails, give up and report the page as referenced.
         */
        if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
                if (!trylock_page(page))
                        return 1;
                locked_here = 1;
        }

        /*
         * When reclaiming on behalf of a cgroup, references coming
         * from other cgroups are filtered out of the walk.
         */
        if (memcg)
                rwc.invalid_vma = invalid_page_referenced_vma;

        rmap_walk(page, &rwc);
        *vm_flags = pra.vm_flags;

        /* Only drop the page lock if it was taken above. */
        if (locked_here)
                unlock_page(page);

        return pra.referenced;
}

/*
 * rmap_walk() callback used by page_mkclean(): write-protect and clean
 * every pte (or pmd) through which @vma maps @page.
 *
 * @arg points to an int that is incremented once for each mapping that
 * was actually modified.  Always returns true.
 */
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                            unsigned long address, void *arg)
{
        int *cleaned = arg;
        struct mmu_notifier_range range;
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
                .flags = PVMW_SYNC,
        };

        /*
         * The invalidation range has to assume the worst case, i.e. a
         * pmd mapping.  Note that the page can not be freed from this
         * function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                0, vma, vma->vm_mm, address,
                                vma_address_end(page, vma));
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (pvmw.pte) {
                        pte_t *ptep = pvmw.pte;
                        pte_t pteval;

                        /* Neither dirty nor writable: leave it alone. */
                        if (!(pte_dirty(*ptep) || pte_write(*ptep)))
                                continue;

                        flush_cache_page(vma, address, pte_pfn(*ptep));
                        pteval = ptep_clear_flush(vma, address, ptep);
                        pteval = pte_mkclean(pte_wrprotect(pteval));
                        set_pte_at(vma->vm_mm, address, ptep, pteval);
                } else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmdp = pvmw.pmd;
                        pmd_t pmdval;

                        /* Neither dirty nor writable: leave it alone. */
                        if (!(pmd_dirty(*pmdp) || pmd_write(*pmdp)))
                                continue;

                        flush_cache_page(vma, address, page_to_pfn(page));
                        pmdval = pmdp_invalidate(vma, address, pmdp);
                        pmdval = pmd_mkclean(pmd_wrprotect(pmdval));
                        set_pmd_at(vma->vm_mm, address, pmdp, pmdval);
#else
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
                        continue;
#endif
                }

                /*
                 * A mapping was downgraded.  No need to call
                 * mmu_notifier_invalidate_range() as we are downgrading
                 * page table protection, not changing it to point to a
                 * new page.
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
                (*cleaned)++;
        }

        mmu_notifier_invalidate_range_end(&range);

        return true;
}

/*
 * invalid_vma filter for page_mkclean(): every vma that is not VM_SHARED
 * is reported as invalid (true); @arg is unused.
 */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
        return !(vma->vm_flags & VM_SHARED);
}

/*
 * page_mkclean - write-protect and clean the shared mappings of a page
 * @page: the page to clean; must be locked (BUG otherwise)
 *
 * Runs page_mkclean_one() over the rmap of @page, restricted by
 * invalid_mkclean_vma() to VM_SHARED vmas.  The walk is skipped when the
 * page is not mapped or has no mapping.
 *
 * Returns the number of mappings that were modified (0 if none).
 */
int page_mkclean(struct page *page)
{
        int cleaned = 0;
        struct rmap_walk_control rwc = {
                .arg = (void *)&cleaned,
                .rmap_one = page_mkclean_one,
                .invalid_vma = invalid_mkclean_vma,
        };

        BUG_ON(!PageLocked(page));

        if (page_mapped(page) && page_mapping(page))
                rmap_walk(page, &rwc);

        return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:        the page to move to our anon_vma
 * @vma:        the vma the page belongs to
 *
 * Once a COW event has left a page belonging exclusively to one process,
 * the page can be re-pointed at the anon_vma of just that process, which
 * spares the rmap code from searching the parent or sibling processes.
 *
 * Operates on the compound head of @page, which must be locked.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        struct page *head = compound_head(page);
        unsigned long tagged;

        VM_BUG_ON_PAGE(!PageLocked(head), head);
        VM_BUG_ON_VMA(!anon_vma, vma);

        tagged = (unsigned long)anon_vma + PAGE_MAPPING_ANON;
        /*
         * The anon_vma pointer and the PAGE_MAPPING_ANON bit must reach
         * page->mapping in one store, so that a concurrent reader (eg
         * page_referenced()'s PageAnon()) never observes one without
         * the other.
         */
        WRITE_ONCE(head->mapping, (struct address_space *)tagged);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:        Page or Hugepage to add to rmap
 * @vma:        VM area to add page to.
 * @address:        User virtual address of the mapping
 * @exclusive:        the page is exclusively owned by the current process
 *
 * Does nothing if @page is already PageAnon.  Otherwise points
 * page->mapping at @vma's anon_vma (or at its root when the page is not
 * exclusive), tagged with PAGE_MAPPING_ANON, and sets page->index to the
 * linear index of @address within @vma.  @vma->anon_vma must be set.
 */
static void __page_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int exclusive)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!anon_vma);

        if (PageAnon(page))
                return;

        /*
         * If the page isn't exclusively mapped into this vma,
         * we must use the _oldest_ possible anon_vma for the
         * page mapping!
         */
        if (!exclusive)
                anon_vma = anon_vma->root;

        /*
         * page->mapping can be inspected by readers that do not hold the
         * page lock (eg page_referenced()'s PageAnon() on an unlocked
         * page).  Store the anon_vma pointer and the PAGE_MAPPING_ANON
         * bit with a single WRITE_ONCE(), as page_move_anon_rmap() does,
         * so that such a reader can never see one without the other.
         */
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
        page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Pure assertion helper: it modifies nothing and only checks, through
 * VM_BUG_ON_PAGE(), that the already established anon rmap of @page is
 * consistent with @vma and @address.
 */
static void __page_check_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against page_add_anon_rmap because the caller
         * always holds the page locked, except if called from page_dup_rmap,
         * in which case the page is already known to be setup.
         *
         * We have exclusion against page_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to page_add_new_anon_rmap.
         */
        /* The page's anon_vma and the vma's anon_vma must share a root. */
        VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
        /* The page offset must equal the linear index of @address in @vma. */
        VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
                       page);
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 * @compound:        charge the page as compound or small page
 *
 * Thin wrapper around do_page_add_anon_rmap() that translates @compound
 * into the RMAP_COMPOUND flag and passes no other flag.
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, bool compound)
{
        int flags = 0;

        if (compound)
                flags = RMAP_COMPOUND;

        do_page_add_anon_rmap(page, vma, address, flags);
}

/*
 * Flag-taking variant of page_add_anon_rmap() for do_swap_page, which
 * often runs into pages that are exclusively owned by the current
 * process (RMAP_EXCLUSIVE).  Everybody else should continue to use
 * page_add_anon_rmap.
 *
 * Bumps the relevant mapcount, accounts the page in the anon statistics
 * when this is its first mapping, and then either sets up (first
 * mapping) or sanity-checks (later mappings) the page's anon rmap.  KSM
 * pages only get the mapcount and statistics update.
 */
void do_page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int flags)
{
        bool compound = flags & RMAP_COMPOUND;
        bool first;

        if (likely(!PageKsm(page))) {
                VM_BUG_ON_PAGE(!PageLocked(page), page);
        } else {
                lock_page_memcg(page);
        }

        if (compound) {
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                first = atomic_inc_and_test(compound_mapcount_ptr(page));
        } else {
                first = atomic_inc_and_test(&page->_mapcount);
        }

        if (first) {
                int nr = 1;

                /*
                 * The irq-unsafe __{inc|mod}_lruvec_page_state variants
                 * are fine here: these counters are not modified in
                 * interrupt context, and the pte lock (a spinlock) is
                 * held, which implies preemption disabled.
                 */
                if (compound) {
                        nr = thp_nr_pages(page);
                        __inc_lruvec_page_state(page, NR_ANON_THPS);
                }
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
        }

        /* KSM pages stop here: release the memcg lock taken above. */
        if (unlikely(PageKsm(page))) {
                unlock_page_memcg(page);
                return;
        }

        /* address might be in next vma when migration races vma_adjust */
        if (!first) {
                __page_check_anon_rmap(page, vma, address);
                return;
        }

        __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 * @compound:        charge the page as compound or small page
 *
 * Counterpart of page_add_anon_rmap for pages that are known to be *new*:
 * the mapcount is simply set rather than incremented and tested, and the
 * page does not have to be locked.  The page is marked swap-backed and
 * its anon rmap is set up as exclusive.
 */
void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, bool compound)
{
        int nr = compound ? thp_nr_pages(page) : 1;

        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        __SetPageSwapBacked(page);

        if (!compound) {
                /* Anon THP always mapped first with PMD */
                VM_BUG_ON_PAGE(PageTransCompound(page), page);
                /* mapcount starts at -1, so 0 means "mapped once" */
                atomic_set(&page->_mapcount, 0);
        } else {
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                /* mapcount starts at -1, so 0 means "mapped once" */
                atomic_set(compound_mapcount_ptr(page), 0);
                if (hpage_pincount_available(page))
                        atomic_set(compound_pincount_ptr(page), 0);

                __inc_lruvec_page_state(page, NR_ANON_THPS);
        }

        __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
        __page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 * @compound: charge the page as compound or small page
 *
 * Increments the mapcount(s) of @page under lock_page_memcg() and, when
 * this is the first mapping, accounts it in NR_FILE_MAPPED (plus the
 * matching *_PMDMAPPED counter for a compound mapping).
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
        int i, nr = 1;

        VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
        lock_page_memcg(page);

        if (!compound || !PageTransHuge(page)) {
                /* A pte mapping inside a compound page with a mapping. */
                if (PageTransCompound(page) && page_mapping(page)) {
                        VM_WARN_ON_ONCE(!PageLocked(page));

                        SetPageDoubleMap(compound_head(page));
                        if (PageMlocked(page))
                                clear_page_mlock(compound_head(page));
                }
                /* Already mapped elsewhere: nothing to account. */
                if (!atomic_inc_and_test(&page->_mapcount))
                        goto out;
        } else {
                /* Count the subpages that become mapped for the first time. */
                nr = 0;
                for (i = 0; i < thp_nr_pages(page); i++) {
                        if (atomic_inc_and_test(&page[i]._mapcount))
                                nr++;
                }
                /* Already pmd-mapped elsewhere: nothing to account. */
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
                __inc_node_page_state(page, PageSwapBacked(page) ?
                                NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED);
        }

        __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
        unlock_page_memcg(page);
}

/*
 * Take down one mapping of a file page: decrement the mapcount(s) and,
 * when the last mapping went away, update NR_FILE_MAPPED (plus the
 * matching *_PMDMAPPED counter for a compound mapping) and clear a
 * leftover mlock.  Called from page_remove_rmap() for !PageAnon pages.
 */
static void page_remove_file_rmap(struct page *page, bool compound)
{
        int i, nr = 1;

        VM_BUG_ON_PAGE(compound && !PageHead(page), page);

        /* Hugepages are not counted in NR_FILE_MAPPED for now. */
        if (unlikely(PageHuge(page))) {
                /* hugetlb pages are always mapped with pmds */
                atomic_dec(compound_mapcount_ptr(page));
                return;
        }

        if (!compound || !PageTransHuge(page)) {
                /* page still mapped by someone else? */
                if (!atomic_add_negative(-1, &page->_mapcount))
                        return;
        } else {
                /* Count the subpages that lose their last mapping. */
                nr = 0;
                for (i = 0; i < thp_nr_pages(page); i++) {
                        if (atomic_add_negative(-1, &page[i]._mapcount))
                                nr++;
                }
                /* compound page still pmd-mapped by someone else? */
                if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                        return;
                __dec_node_page_state(page, PageSwapBacked(page) ?
                                NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED);
        }

        /*
         * The irq-unsafe __{inc|mod}_lruvec_page_state is sufficient:
         * these counters are not modified in interrupt context, and the
         * pte lock (a spinlock) is held, which implies preemption
         * disabled.
         */
        __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
}

/*
 * Take down one compound (pmd) mapping of an anonymous page.  When it
 * was the last compound mapping of a THP, update NR_ANON_THPS and
 * NR_ANON_MAPPED, clear a leftover mlock and, if the page was also
 * pte-mapped and is now only partially mapped, queue it for deferred
 * split.  Called from page_remove_rmap() with compound == true.
 */
static void page_remove_anon_compound_rmap(struct page *page)
{
        int i, nr;

        /* Other compound mappings remain: nothing more to do. */
        if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                return;

        /*
         * Hugepages are not counted in NR_ANON_PAGES for now, and
         * without THP support there is nothing left to account.
         */
        if (unlikely(PageHuge(page)) ||
            !IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;

        __dec_lruvec_page_state(page, NR_ANON_THPS);

        if (!TestClearPageDoubleMap(page)) {
                /* Only pmd-mapped: every subpage just became unmapped. */
                nr = thp_nr_pages(page);
        } else {
                /*
                 * Subpages can be mapped with PTEs too. Count the ones
                 * whose mapcount drops below zero here.
                 */
                nr = 0;
                for (i = 0; i < thp_nr_pages(page); i++) {
                        if (atomic_add_negative(-1, &page[i]._mapcount))
                                nr++;
                }

                /*
                 * Queue the page for deferred split if at least one
                 * small page of the compound page is unmapped, but at
                 * least one small page is still mapped.
                 */
                if (nr && nr < thp_nr_pages(page))
                        deferred_split_huge_page(page);
        }

        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);

        if (nr)
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page:        page to remove mapping from
 * @compound:        uncharge the page as compound or small page
 *
 * Dispatches, under lock_page_memcg(), to the file or anon-compound
 * helper, or handles the plain anonymous pte case inline.
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
        lock_page_memcg(page);

        if (!PageAnon(page)) {
                page_remove_file_rmap(page, compound);
        } else if (compound) {
                page_remove_anon_compound_rmap(page);
        } else if (atomic_add_negative(-1, &page->_mapcount)) {
                /*
                 * That was the last mapping of an anonymous small page.
                 *
                 * The irq-unsafe __dec_lruvec_page_state is sufficient:
                 * these counters are not modified in interrupt context,
                 * and the pte lock (a spinlock) is held, which implies
                 * preemption disabled.
                 */
                __dec_lruvec_page_state(page, NR_ANON_MAPPED);

                if (unlikely(PageMlocked(page)))
                        clear_page_mlock(page);

                if (PageTransCompound(page))
                        deferred_split_huge_page(compound_head(page));

                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
                 * which increments mapcount after us but sets mapping
                 * before us: so leave the reset to free_unref_page,
                 * and remember that it's only reliable while mapped.
                 * Leaving it set also helps swapoff to reinstate ptes
                 * faster for those pages still in swapcache.
                 */
        }

        unlock_page_memcg(page);
}

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
        };
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and page_remove_rmap(),
         * try_to_unmap() may return false when it is about to become true,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
                return true;

        if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
            is_zone_device_page(page) && !is_device_private_page(page))
                return true;

        if (flags & TTU_SPLIT_HUGE_PMD) {
                split_huge_pmd_address(vma, address,
                                flags & TTU_SPLIT_FREEZE, page);
        }

        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the page can not be free in this function as call of
         * try_to_unmap() must hold a reference on the page.
         */
        range.end = PageKsm(page) ?
                        address + PAGE_SIZE : vma_address_end(page, vma);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address, range.end);
        if (PageHuge(page)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte && (flags & TTU_MIGRATION)) {
                        VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

                        set_pmd_migration_entry(&pvmw, page);
                        continue;
                }
#endif

                /*
                 * If the page is mlock()d, we cannot swap it out.
                 * If it's recently referenced (perhaps page_referenced
                 * skipped over this mm) then we should reactivate it.
                 */
                if (!(flags & TTU_IGNORE_MLOCK)) {
                        if (vma->vm_flags & VM_LOCKED) {
                                /* PTE-mapped THP are never mlocked */
                                if (!PageTransCompound(page)) {
                                        /*
                                         * Holding pte lock, we do *not* need
                                         * mmap_lock here
                                         */
                                        mlock_vma_page(page);
                                }
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (flags & TTU_MUNLOCK)
                                continue;
                }

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_PAGE(!pvmw.pte, page);

                subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
                address = pvmw.address;

                if (PageHuge(page) && !PageAnon(page)) {
                        struct mmu_gather tlb;

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         */
                        VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                        tlb_gather_mmu_vma(&tlb, vma, range.start, range.end);
                        if (huge_pmd_unshare(&tlb, vma, &address, pvmw.pte)) {
                                /*
                                 * huge_pmd_unshare unmapped an entire PMD
                                 * page.  There is no way of knowing exactly
                                 * which PMDs may be cached for this mm, so
                                 * we must flush them all.  start/end were
                                 * already adjusted above to cover this range.
                                 */
                                flush_cache_range(vma, range.start, range.end);
                                huge_pmd_unshare_flush(&tlb, vma);
                                mmu_notifier_invalidate_range(mm, range.start,
                                                              range.end);
                                tlb_finish_mmu(&tlb, range.start, range.end);

                                /*
                                 * The PMD table was unmapped,
                                 * consequently unmapping the folio.
                                 */
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        tlb_finish_mmu(&tlb, range.start, range.end);
                }

                if (IS_ENABLED(CONFIG_MIGRATION) &&
                    (flags & TTU_MIGRATION) &&
                    is_zone_device_page(page)) {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = make_migration_entry(page, 0);
                        swp_pte = swp_entry_to_pte(entry);

                        /*
                         * pteval maps a zone device page and is therefore
                         * a swap pte.
                         */
                        if (pte_swp_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_swp_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         *
                         * The assignment to subpage above was computed from a
                         * swap PTE which results in an invalid pointer.
                         * Since only PAGE_SIZE pages can currently be
                         * migrated, just set it to page. This will need to be
                         * changed when hugepage migrations to device private
                         * memory are supported.
                         */
                        subpage = page;
                        goto discard;
                }

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
                if (should_defer_flush(mm, flags)) {
                        /*
                         * We clear the PTE but do not flush so potentially
                         * a remote CPU could still be writing to the page.
                         * If the entry was previously clean then the
                         * architecture must guarantee that a clear->dirty
                         * transition on a cached TLB entry is written through
                         * and traps if the PTE is unmapped.
                         */
                        pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                        set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
                } else {
                        pteval = ptep_clear_flush(vma, address, pvmw.pte);
                }

                /* Move the dirty bit to the page. Now the pte is gone. */
                if (pte_dirty(pteval))
                        set_page_dirty(page);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (PageHuge(page)) {
                                hugetlb_count_sub(compound_nr(page), mm);
                                set_huge_swap_pte_at(mm, address,
                                                     pvmw.pte, pteval,
                                                     vma_mmu_pagesize(vma));
                        } else {
                                dec_mm_counter(mm, mm_counter(page));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(page));
                        /* We have to invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                } else if (IS_ENABLED(CONFIG_MIGRATION) &&
                                (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = make_migration_entry(subpage,
                                        pte_write(pteval));
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         */
                } else if (PageAnon(page)) {
                        swp_entry_t entry = { .val = page_private(subpage) };
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
                                WARN_ON_ONCE(1);
                                ret = false;
                                /* We have to invalidate as we cleared the pte */
                                mmu_notifier_invalidate_range(mm, address,
                                                        address + PAGE_SIZE);
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* MADV_FREE page check */
                        if (!PageSwapBacked(page)) {
                                int ref_count, map_count;

                                /*
                                 * Synchronize with gup_pte_range():
                                 * - clear PTE; barrier; read refcount
                                 * - inc refcount; barrier; read PTE
                                 */
                                smp_mb();

                                ref_count = page_ref_count(page);
                                map_count = page_mapcount(page);

                                /*
                                 * Order reads for page refcount and dirty flag
                                 * (see comments in __remove_mapping()).
                                 */
                                smp_rmb();

                                /*
                                 * The only page refs must be one from isolation
                                 * plus the rmap(s) (dropped by discard:).
                                 */
                                if (ref_count == 1 + map_count &&
                                    !PageDirty(page)) {
                                        /* Invalidate as we cleared the pte */
                                        mmu_notifier_invalidate_range(mm,
                                                address, address + PAGE_SIZE);
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }

                                /*
                                 * If the page was redirtied, it cannot be
                                 * discarded. Remap the page to page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                SetPageSwapBacked(page);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /* Invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                } else {
                        /*
                         * This is a locked file-backed page, thus it cannot
                         * be removed from the page cache and replaced by a new
                         * page before mmu_notifier_invalidate_range_end, so no
                         * concurrent thread might update its page table to
                         * point at new page while a device still is using this
                         * page.
                         *
                         * See Documentation/vm/mmu_notifier.rst
                         */
                        dec_mm_counter(mm, mm_counter_file(page));
                }
discard:
                /*
                 * No need to call mmu_notifier_invalidate_range(): it has been
                 * done above for all cases requiring it to happen under page
                 * table lock before mmu_notifier_invalidate_range_end().
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
                page_remove_rmap(subpage, PageHuge(page));
                put_page(page);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/*
 * rmap_walk_control invalid_vma callback installed by try_to_unmap():
 * reports true, i.e. "skip this VMA", when vma_is_temporary_stack() holds
 * for @vma.  @arg is not used.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
        bool skip_vma = vma_is_temporary_stack(vma);

        return skip_vma;
}

/*
 * rmap_walk_control done callback: returns 1 when page_mapped() reports
 * that @page is no longer mapped and 0 while it still is, which lets a walk
 * stop early.
 */
static int page_not_mapped(struct page *page)
{
        if (page_mapped(page))
                return 0;

        return 1;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
                .arg = (void *)flags,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
        };

        /*
         * During exec, a temporary VMA is setup and later moved.
         * The VMA is moved under the anon_vma lock but not the
         * page tables leading to a race where migration cannot
         * find the migration ptes. Rather than increasing the
         * locking requirements of exec(), migration skips
         * temporary VMAs until after exec() completes.
         */
        if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
            && !PageKsm(page) && PageAnon(page))
                rwc.invalid_vma = invalid_migration_vma;

        if (flags & TTU_RMAP_LOCKED)
                rmap_walk_locked(page, &rwc);
        else
                rmap_walk(page, &rwc);

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and page_remove_rmap(),
         * try_to_unmap() may return false when it is about to become true,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        return !page_mapcount(page);
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * The check is an rmap walk that runs try_to_unmap_one() with TTU_MUNLOCK
 * as its argument on each VMA visited; page_not_mapped() is installed as
 * the walk's early-termination test.
 */

void try_to_munlock(struct page *page)
{
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
                .arg = (void *)TTU_MUNLOCK,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,

        };

        /* The page must be locked and must not be PageLRU. */
        VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
        /* A compound page that is also PageDoubleMap is not expected here. */
        VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

        rmap_walk(page, &rwc);
}

/*
 * Free @anon_vma.  If it has a root distinct from itself, also drop one
 * reference on that root and free the root when that was the last one.
 */
void __put_anon_vma(struct anon_vma *anon_vma)
{
        /* Read the root before @anon_vma is freed. */
        struct anon_vma *root = anon_vma->root;

        anon_vma_free(anon_vma);

        /* An anon_vma that is its own root has nothing more to release. */
        if (root == anon_vma)
                return;

        if (atomic_dec_and_test(&root->refcount))
                anon_vma_free(root);
}

/*
 * Obtain the read-locked anon_vma for an anonymous rmap walk of @page.
 *
 * A walk that supplies rwc->anon_lock delegates to it entirely.  Otherwise
 * the anon_vma is looked up with page_anon_vma() and read-locked here.
 * Returns NULL when the page has no anon_vma.
 */
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
                                        struct rmap_walk_control *rwc)
{
        struct anon_vma *found;

        if (!rwc->anon_lock) {
                /*
                 * Note: remove_migration_ptes() cannot use
                 * page_lock_anon_vma_read() because that depends on
                 * page_mapped(); but not all its usages are holding
                 * mmap_lock. Users without mmap_lock are required to
                 * take a reference count to prevent the anon_vma
                 * disappearing
                 */
                found = page_anon_vma(page);
                if (found)
                        anon_vma_lock_read(found);

                return found;
        }

        return rwc->anon_lock(page);
}

/*
 * rmap_walk_anon - walk every mapping of an anonymous page
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 * @locked: true when the caller already holds the anon_vma lock, in which
 *          case it is neither taken nor released here
 *
 * Uses the object-based rmap method: the anon_vma the page points to is
 * looked up and each vma on its interval tree that covers the page's offset
 * range is handed to rwc->rmap_one().  The walk stops when rmap_one()
 * returns false or when rwc->done() (if set) returns true; vmas rejected by
 * rwc->invalid_vma() (if set) are skipped.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
{
        struct anon_vma *anon_vma;
        struct anon_vma_chain *avc;
        pgoff_t first, last;

        if (!locked) {
                anon_vma = rmap_walk_anon_lock(page, rwc);
        } else {
                anon_vma = page_anon_vma(page);
                /* anon_vma disappear under us? */
                VM_BUG_ON_PAGE(!anon_vma, page);
        }
        if (!anon_vma)
                return;

        /* Inclusive page-offset range covered by the (possibly huge) page. */
        first = page_to_pgoff(page);
        last = first + thp_nr_pages(page) - 1;

        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, first, last) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on rmap_one() failure or once done() is satisfied. */
                if (!rwc->rmap_one(page, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(page)))
                        break;
        }

        if (!locked)
                anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - walk every mapping of a file page
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 * @locked: true when the caller already holds mapping->i_mmap_rwsem, in
 *          which case it is neither taken nor released here
 *
 * Uses the object-based rmap method: each vma on the i_mmap interval tree of
 * the page's address_space that covers the page's offset range is handed to
 * rwc->rmap_one().  The walk stops when rmap_one() returns false or when
 * rwc->done() (if set) returns true; vmas rejected by rwc->invalid_vma()
 * (if set) are skipped.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
{
        struct address_space *mapping = page_mapping(page);
        struct vm_area_struct *vma;
        pgoff_t first, last;

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_rwsem.
         */
        VM_BUG_ON_PAGE(!PageLocked(page), page);

        if (!mapping)
                return;

        /* Inclusive page-offset range covered by the (possibly huge) page. */
        first = page_to_pgoff(page);
        last = first + thp_nr_pages(page) - 1;

        if (!locked)
                i_mmap_lock_read(mapping);

        vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
                unsigned long address = vma_address(page, vma);

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on rmap_one() failure or once done() is satisfied. */
                if (!rwc->rmap_one(page, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(page)))
                        break;
        }

        if (!locked)
                i_mmap_unlock_read(mapping);
}

/*
 * Walk the reverse mappings of @page as described by @rwc, picking the
 * walker by page type: KSM, anonymous, or (anything else) file-backed.
 * The anon and file walkers are told to take the rmap lock themselves.
 */
void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
        if (unlikely(PageKsm(page))) {
                rmap_walk_ksm(page, rwc);
                return;
        }

        if (PageAnon(page)) {
                rmap_walk_anon(page, rwc, false);
                return;
        }

        rmap_walk_file(page, rwc, false);
}

/*
 * Like rmap_walk(), but for callers that already hold the relevant rmap
 * lock: the anon and file walkers are told not to take or release it.
 * KSM pages are not supported on this path.
 */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
        /* no ksm support for now */
        VM_BUG_ON_PAGE(PageKsm(page), page);

        if (!PageAnon(page)) {
                rmap_walk_file(page, rwc, true);
                return;
        }

        rmap_walk_anon(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
/*
 * Record one more anonymous mapping of hugepage @page in @vma at @address.
 *
 * The page must be locked and @vma must have an anon_vma.  The compound
 * mapcount is incremented; when atomic_inc_and_test() reports that the
 * result is zero (the first mapping), __page_set_anon_rmap() is called with
 * its last argument set to 0.
 */
void hugepage_add_anon_rmap(struct page *page,
                            struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);

        /* address might be in next vma when migration races vma_adjust */
        if (atomic_inc_and_test(compound_mapcount_ptr(page)))
                __page_set_anon_rmap(page, vma, address, 0);
}

/*
 * Set up the anon rmap state of a new anonymous hugepage @page mapped in
 * @vma at @address.  @address must lie inside [vm_start, vm_end) of @vma.
 *
 * The compound mapcount is set to 0 and, when hpage_pincount_available()
 * is true for the page, so is the compound pincount.  Then
 * __page_set_anon_rmap() is called with its last argument set to 1.
 */
void hugepage_add_new_anon_rmap(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        /* Plain atomic_set(), not an increment: the count is overwritten. */
        atomic_set(compound_mapcount_ptr(page), 0);
        if (hpage_pincount_available(page))
                atomic_set(compound_pincount_ptr(page), 0);

        __page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values for SHA algorithms
 */

#ifndef _CRYPTO_SHA_H
#define _CRYPTO_SHA_H

#include <linux/types.h>

/*
 * Digest and input-block sizes, in bytes: they dimension the u8 arrays
 * declared further down in this header.
 */
#define SHA1_DIGEST_SIZE        20
#define SHA1_BLOCK_SIZE         64

#define SHA224_DIGEST_SIZE        28
#define SHA224_BLOCK_SIZE        64

#define SHA256_DIGEST_SIZE      32
#define SHA256_BLOCK_SIZE       64

#define SHA384_DIGEST_SIZE      48
#define SHA384_BLOCK_SIZE       128

#define SHA512_DIGEST_SIZE      64
#define SHA512_BLOCK_SIZE       128

/*
 * Initial hash state words.  The SHA-1, SHA-224 and SHA-256 sets are 32-bit
 * constants (UL), the SHA-384 and SHA-512 sets are 64-bit constants (ULL).
 *
 * sha224_init() and sha256_init() in this header load the SHA224_Hn and
 * SHA256_Hn sets into state[0..7].
 * NOTE(review): the SHA1, SHA384 and SHA512 sets are presumably loaded the
 * same way by init code outside this header -- confirm.
 */
#define SHA1_H0                0x67452301UL
#define SHA1_H1                0xefcdab89UL
#define SHA1_H2                0x98badcfeUL
#define SHA1_H3                0x10325476UL
#define SHA1_H4                0xc3d2e1f0UL

#define SHA224_H0        0xc1059ed8UL
#define SHA224_H1        0x367cd507UL
#define SHA224_H2        0x3070dd17UL
#define SHA224_H3        0xf70e5939UL
#define SHA224_H4        0xffc00b31UL
#define SHA224_H5        0x68581511UL
#define SHA224_H6        0x64f98fa7UL
#define SHA224_H7        0xbefa4fa4UL

#define SHA256_H0        0x6a09e667UL
#define SHA256_H1        0xbb67ae85UL
#define SHA256_H2        0x3c6ef372UL
#define SHA256_H3        0xa54ff53aUL
#define SHA256_H4        0x510e527fUL
#define SHA256_H5        0x9b05688cUL
#define SHA256_H6        0x1f83d9abUL
#define SHA256_H7        0x5be0cd19UL

#define SHA384_H0        0xcbbb9d5dc1059ed8ULL
#define SHA384_H1        0x629a292a367cd507ULL
#define SHA384_H2        0x9159015a3070dd17ULL
#define SHA384_H3        0x152fecd8f70e5939ULL
#define SHA384_H4        0x67332667ffc00b31ULL
#define SHA384_H5        0x8eb44a8768581511ULL
#define SHA384_H6        0xdb0c2e0d64f98fa7ULL
#define SHA384_H7        0x47b5481dbefa4fa4ULL

#define SHA512_H0        0x6a09e667f3bcc908ULL
#define SHA512_H1        0xbb67ae8584caa73bULL
#define SHA512_H2        0x3c6ef372fe94f82bULL
#define SHA512_H3        0xa54ff53a5f1d36f1ULL
#define SHA512_H4        0x510e527fade682d1ULL
#define SHA512_H5        0x9b05688c2b3e6c1fULL
#define SHA512_H6        0x1f83d9abfb41bd6bULL
#define SHA512_H7        0x5be0cd19137e2179ULL

/*
 * Constant byte arrays, one per algorithm, each exactly one digest long.
 * They are only declared here; the definitions live elsewhere.
 * NOTE(review): going by the names these presumably hold the digest of the
 * zero-length message -- confirm against the definitions.
 */
extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE];

extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE];

extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE];

extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE];

extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE];

/* Running state of a SHA-1 computation. */
struct sha1_state {
        /* Hash state: SHA1_DIGEST_SIZE / 4 = 5 words of 32 bits. */
        u32 state[SHA1_DIGEST_SIZE / 4];
        /* NOTE(review): presumably the amount of input consumed so far -- confirm. */
        u64 count;
        /* One input block (SHA1_BLOCK_SIZE bytes) of storage. */
        u8 buffer[SHA1_BLOCK_SIZE];
};

/*
 * Running state of a SHA-256 computation.  sha224_init() in this header
 * takes the same structure, so it also serves SHA-224.
 */
struct sha256_state {
        /* Hash state: SHA256_DIGEST_SIZE / 4 = 8 words of 32 bits. */
        u32 state[SHA256_DIGEST_SIZE / 4];
        /* Zeroed by the init helpers; presumably the input consumed so far -- confirm. */
        u64 count;
        /* One input block (SHA256_BLOCK_SIZE bytes) of storage. */
        u8 buf[SHA256_BLOCK_SIZE];
};

/* Running state of a SHA-512 computation. */
struct sha512_state {
        /* Hash state: SHA512_DIGEST_SIZE / 8 = 8 words of 64 bits. */
        u64 state[SHA512_DIGEST_SIZE / 8];
        /* NOTE(review): presumably a 128-bit input counter split in two words -- confirm. */
        u64 count[2];
        /* One input block (SHA512_BLOCK_SIZE bytes) of storage. */
        u8 buf[SHA512_BLOCK_SIZE];
};

/* Only pointers to struct shash_desc are used below, so a forward declaration is enough. */
struct shash_desc;

/*
 * update/finup entry points for SHA-1, SHA-256 and SHA-512 operating on a
 * struct shash_desc.  They are declared here only; each returns an int.
 * NOTE(review): presumably @data/@len describe the input bytes and @hash
 * receives the digest -- confirm against the definitions.
 */
extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len);

extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
                             unsigned int len, u8 *hash);

extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len);

extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *hash);

extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len);

extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *hash);

/*
 * An implementation of SHA-1's compression function.  Don't use in new code!
 * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't
 * the correct way to hash something with SHA-1 (use crypto_shash instead).
 */
/* Number of 32-bit words in a SHA-1 digest (SHA1_DIGEST_SIZE / 4 = 5). */
#define SHA1_DIGEST_WORDS        (SHA1_DIGEST_SIZE / 4)
/* NOTE(review): presumably the number of __u32 words required in @W below -- confirm. */
#define SHA1_WORKSPACE_WORDS        16
void sha1_init(__u32 *buf);
void sha1_transform(__u32 *digest, const char *data, __u32 *W);

/*
 * Stand-alone implementation of the SHA256 algorithm. It is designed to
 * have as few dependencies as possible so it can be used in the
 * kexec_file purgatory. In other cases you should generally use the
 * hash APIs from include/crypto/hash.h. Especially when hashing large
 * amounts of data as those APIs may be hw-accelerated.
 *
 * For details see lib/crypto/sha256.c
 */

/*
 * Reset @sctx for a new SHA-256 computation: zero the input counter and
 * load the eight initial hash words SHA256_H0..SHA256_H7 into the state.
 * The block buffer is left untouched.
 */
static inline void sha256_init(struct sha256_state *sctx)
{
        sctx->count = 0;

        sctx->state[0] = SHA256_H0;
        sctx->state[1] = SHA256_H1;
        sctx->state[2] = SHA256_H2;
        sctx->state[3] = SHA256_H3;
        sctx->state[4] = SHA256_H4;
        sctx->state[5] = SHA256_H5;
        sctx->state[6] = SHA256_H6;
        sctx->state[7] = SHA256_H7;
}
/*
 * Declared here only; the comment above sha256_init() points to
 * lib/crypto/sha256.c for details.
 * NOTE(review): presumably update() consumes @len bytes of @data, final()
 * writes the digest to @out, and sha256() does both in one call -- confirm.
 */
void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len);
void sha256_final(struct sha256_state *sctx, u8 *out);
void sha256(const u8 *data, unsigned int len, u8 *out);

/*
 * Reset @sctx for a new SHA-224 computation: zero the input counter and
 * load the eight initial hash words SHA224_H0..SHA224_H7 into the state.
 * SHA-224 uses struct sha256_state; the block buffer is left untouched.
 */
static inline void sha224_init(struct sha256_state *sctx)
{
        sctx->count = 0;

        sctx->state[0] = SHA224_H0;
        sctx->state[1] = SHA224_H1;
        sctx->state[2] = SHA224_H2;
        sctx->state[3] = SHA224_H3;
        sctx->state[4] = SHA224_H4;
        sctx->state[5] = SHA224_H5;
        sctx->state[6] = SHA224_H6;
        sctx->state[7] = SHA224_H7;
}
/*
 * SHA-224 counterparts of sha256_update()/sha256_final(), operating on the
 * same struct sha256_state.  Declared here only.
 */
void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len);
void sha224_final(struct sha256_state *sctx, u8 *out);

#endif































































































































































































































    3 



















































































    2 

















































































































































































































































































    3 

























    1 
    3 




























































































































































































































    3 








































    3 








































    3 













































































































































































































































































































































































































































































































    4 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-
 *
 * When boot_cpu_data.x86 > 3 the result is @prot with the protection bits
 * for _PAGE_CACHE_MODE_UC_MINUS ORed in; otherwise @prot is returned
 * unchanged.  Note that @prot appears in both arms of the conditional.
 */
#define pgprot_noncached(prot)                                                \
        ((boot_cpu_data.x86 > 3)                                        \
         ? (__pgprot(pgprot_val(prot) |                                        \
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))        \
         : (prot))

/*
 * Macros to add or remove encryption attribute
 *
 * Each takes a pgprot value, passes its raw bits through __sme_set() or
 * __sme_clr() respectively, and wraps the result back into a pgprot.
 */
#define pgprot_encrypted(prot)        __pgprot(__sme_set(pgprot_val(prot)))
#define pgprot_decrypted(prot)        __pgprot(__sme_clr(pgprot_val(prot)))

#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>

/* Early top-level page table (PTRS_PER_PGD entries); defined elsewhere. */
extern pgd_t early_top_pgt[PTRS_PER_PGD];
/* __init-only helper; NOTE(review): semantics of the bool result not visible here. */
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);

/*
 * Page-table dump walkers, declared only.  The *_checkwx() variants are what
 * the debug_checkwx*() macros below expand to under CONFIG_DEBUG_WX.
 */
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
                                   bool user);
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);

/*
 * debug_checkwx() / debug_checkwx_user(): with CONFIG_DEBUG_WX they call the
 * corresponding ptdump_walk_*_checkwx() walker; without it they expand to an
 * empty statement.
 */
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx()                ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user()        ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx()                do { } while (0)
#define debug_checkwx_user()        do { } while (0)
#endif

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc.
 *
 * ZERO_PAGE(vaddr) evaluates @vaddr only to discard it and always yields
 * virt_to_page(empty_zero_page), whatever the address.
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
        __visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))

/* NOTE(review): presumably pgd_lock guards pgd_list -- confirm at the definitions. */
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

/* Declared only; maps a pgd page to an mm_struct, going by the signature. */
extern struct mm_struct *pgd_page_get_mm(struct page *page);

extern pmdval_t early_pmd_flags;

/*
 * With CONFIG_PARAVIRT_XXL the page-table operations come from
 * <asm/paravirt.h>.  Without it, every operation below maps directly onto
 * its native_*() implementation.
 */
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else  /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte)                native_set_pte(ptep, pte)

#define set_pte_atomic(ptep, pte)                                        \
        native_set_pte_atomic(ptep, pte)

#define set_pmd(pmdp, pmd)                native_set_pmd(pmdp, pmd)

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd)                native_set_pgd(pgdp, pgd)
/* Only clears when pgtable_l5_enabled(); otherwise the expression is just 0. */
#define pgd_clear(pgd)                        (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif

/* set_p4d/set_pud may already have been defined before this point. */
#ifndef set_p4d
# define set_p4d(p4dp, p4d)                native_set_p4d(p4dp, p4d)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d)                        native_p4d_clear(p4d)
#endif

#ifndef set_pud
# define set_pud(pudp, pud)                native_set_pud(pudp, pud)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud)                        native_pud_clear(pud)
#endif

#define pte_clear(mm, addr, ptep)        native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd)                        native_pmd_clear(pmd)

/* Value extractors (*_val) and constructors (__*) for each table level. */
#define pgd_val(x)        native_pgd_val(x)
#define __pgd(x)        native_make_pgd(x)

#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x)        native_p4d_val(x)
#define __p4d(x)        native_make_p4d(x)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x)        native_pud_val(x)
#define __pud(x)        native_make_pud(x)
#endif

#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x)        native_pmd_val(x)
#define __pmd(x)        native_make_pmd(x)
#endif

#define pte_val(x)        native_pte_val(x)
#define __pte(x)        native_make_pte(x)

/* No-op in the native case. */
#define arch_end_context_switch(prev)        do {} while(0)
#endif        /* CONFIG_PARAVIRT_XXL */

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not..
 */
/* Returns pte_flags(@pte) masked with _PAGE_DIRTY. */
static inline int pte_dirty(pte_t pte)
{
        return _PAGE_DIRTY & pte_flags(pte);
}


/*
 * Read the PKRU value via rdpkru().  Returns 0 when the CPU feature bit
 * X86_FEATURE_OSPKE is not set.
 */
static inline u32 read_pkru(void)
{
        if (!boot_cpu_has(X86_FEATURE_OSPKE))
                return 0;

        return rdpkru();
}

/*
 * Write @pkru to the CPU and mirror it into the current task's xsave
 * buffer.  Does nothing when X86_FEATURE_OSPKE is not set.
 */
static inline void write_pkru(u32 pkru)
{
        struct pkru_state *pk;

        if (!boot_cpu_has(X86_FEATURE_OSPKE))
                return;

        /* May be NULL, in which case the xstate copy below is skipped. */
        pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU);

        /*
         * The PKRU value in xstate needs to be in sync with the value that is
         * written to the CPU. The FPU restore on return to userland would
         * otherwise load the previous value again.
         */
        fpregs_lock();
        if (pk)
                pk->pkru = pkru;
        __write_pkru(pkru);
        fpregs_unlock();
}

/* Returns pte_flags(@pte) masked with _PAGE_ACCESSED. */
static inline int pte_young(pte_t pte)
{
        return _PAGE_ACCESSED & pte_flags(pte);
}

/* Returns pmd_flags(@pmd) masked with _PAGE_DIRTY. */
static inline int pmd_dirty(pmd_t pmd)
{
        return _PAGE_DIRTY & pmd_flags(pmd);
}

/* Returns pmd_flags(@pmd) masked with _PAGE_ACCESSED. */
static inline int pmd_young(pmd_t pmd)
{
        return _PAGE_ACCESSED & pmd_flags(pmd);
}

/* Returns pud_flags(@pud) masked with _PAGE_DIRTY. */
static inline int pud_dirty(pud_t pud)
{
        return _PAGE_DIRTY & pud_flags(pud);
}

/* Returns pud_flags(@pud) masked with _PAGE_ACCESSED. */
static inline int pud_young(pud_t pud)
{
        return _PAGE_ACCESSED & pud_flags(pud);
}

/* Returns pte_flags(@pte) masked with _PAGE_RW. */
static inline int pte_write(pte_t pte)
{
        return _PAGE_RW & pte_flags(pte);
}

/* Returns pte_flags(@pte) masked with _PAGE_PSE. */
static inline int pte_huge(pte_t pte)
{
        return _PAGE_PSE & pte_flags(pte);
}

/* Returns pte_flags(@pte) masked with _PAGE_GLOBAL. */
static inline int pte_global(pte_t pte)
{
        return _PAGE_GLOBAL & pte_flags(pte);
}

/* Returns 1 when _PAGE_NX is clear in pte_flags(@pte), 0 when it is set. */
static inline int pte_exec(pte_t pte)
{
        if (pte_flags(pte) & _PAGE_NX)
                return 0;

        return 1;
}

/* Returns pte_flags(@pte) masked with _PAGE_SPECIAL. */
static inline int pte_special(pte_t pte)
{
        return _PAGE_SPECIAL & pte_flags(pte);
}

/*
 * Entries that were set to PROT_NONE are inverted: pte_pfn(), pmd_pfn() and
 * pud_pfn() XOR the raw entry with protnone_mask() before extracting the PFN.
 * Forward declaration only; the definition is not in this part of the file.
 */

static inline u64 protnone_mask(u64 val);

/*
 * Extract the page frame number from @pte: XOR the raw value with
 * protnone_mask(), keep the PTE_PFN_MASK bits and shift by PAGE_SHIFT.
 */
static inline unsigned long pte_pfn(pte_t pte)
{
        phys_addr_t entry = pte_val(pte);

        entry ^= protnone_mask(entry);
        return (entry & PTE_PFN_MASK) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pmd: XOR the raw value with
 * protnone_mask(), keep the pmd_pfn_mask() bits and shift by PAGE_SHIFT.
 */
static inline unsigned long pmd_pfn(pmd_t pmd)
{
        phys_addr_t entry = pmd_val(pmd);

        entry ^= protnone_mask(entry);
        return (entry & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pud: XOR the raw value with
 * protnone_mask(), keep the pud_pfn_mask() bits and shift by PAGE_SHIFT.
 */
static inline unsigned long pud_pfn(pud_t pud)
{
        phys_addr_t entry = pud_val(pud);

        entry ^= protnone_mask(entry);
        return (entry & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}

/* Page frame number of @p4d: value masked by p4d_pfn_mask(), shifted by PAGE_SHIFT. */
static inline unsigned long p4d_pfn(p4d_t p4d)
{
        return (p4d_pfn_mask(p4d) & p4d_val(p4d)) >> PAGE_SHIFT;
}

/* Page frame number of @pgd: value masked by PTE_PFN_MASK, shifted by PAGE_SHIFT. */
static inline unsigned long pgd_pfn(pgd_t pgd)
{
        return (PTE_PFN_MASK & pgd_val(pgd)) >> PAGE_SHIFT;
}

/* p4d_leaf is an alias for p4d_large(), which always returns 0. */
#define p4d_leaf        p4d_large
static inline int p4d_large(p4d_t p4d)
{
        /* No 512 GiB pages yet */
        return 0;
}

/* pte_page(pte): pfn_to_page() applied to pte_pfn(pte). */
#define pte_page(pte)        pfn_to_page(pte_pfn(pte))

/* pmd_leaf is an alias for pmd_large(): pmd_flags() masked with _PAGE_PSE. */
#define pmd_leaf        pmd_large
static inline int pmd_large(pmd_t pte)
{
        return _PAGE_PSE & pmd_flags(pte);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* NOTE: when testing for a huge page, also consider pmd_devmap(), or use pmd_large() */
static inline int pmd_trans_huge(pmd_t pmd)
{
        /* True only when _PAGE_PSE is set and _PAGE_DEVMAP is clear. */
        return _PAGE_PSE == (pmd_val(pmd) & (_PAGE_DEVMAP | _PAGE_PSE));
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* True only when _PAGE_PSE is set and _PAGE_DEVMAP is clear in @pud. */
static inline int pud_trans_huge(pud_t pud)
{
        return _PAGE_PSE == (pud_val(pud) & (_PAGE_DEVMAP | _PAGE_PSE));
}
#endif

/* Reports the result of boot_cpu_has(X86_FEATURE_PSE). */
#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
        int pse = boot_cpu_has(X86_FEATURE_PSE);

        return pse;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* Returns 1 when _PAGE_DEVMAP is set in pmd_val(@pmd), otherwise 0. */
static inline int pmd_devmap(pmd_t pmd)
{
        return (pmd_val(pmd) & _PAGE_DEVMAP) != 0;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Returns 1 when _PAGE_DEVMAP is set in pud_val(@pud), otherwise 0. */
static inline int pud_devmap(pud_t pud)
{
        return (pud_val(pud) & _PAGE_DEVMAP) != 0;
}
#else
/* Without CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD this is constant 0. */
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
#endif

/* Always returns 0; @pgd is not inspected. */
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* Return a copy of @pte with every bit of @set OR-ed into its raw value. */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        return native_make_pte(native_pte_val(pte) | set);
}

/* Return a copy of @pte with every bit of @clear masked out of its raw value. */
static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        return native_make_pte(native_pte_val(pte) & ~clear);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Returns pte_flags(@pte) masked with _PAGE_UFFD_WP. */
static inline int pte_uffd_wp(pte_t pte)
{
        return _PAGE_UFFD_WP & pte_flags(pte);
}

/* Return @pte with _PAGE_UFFD_WP set. */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_UFFD_WP);
        return pte;
}

/* Return @pte with _PAGE_UFFD_WP cleared. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_UFFD_WP);
        return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pte with _PAGE_DIRTY cleared. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_DIRTY);
        return pte;
}

/* Return @pte with _PAGE_ACCESSED cleared. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_ACCESSED);
        return pte;
}

/* Return @pte with _PAGE_RW cleared. */
static inline pte_t pte_wrprotect(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_RW);
        return pte;
}

/* Return @pte with _PAGE_NX cleared. */
static inline pte_t pte_mkexec(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_NX);
        return pte;
}

/* Return @pte with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mkdirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SOFT_DIRTY | _PAGE_DIRTY);
        return pte;
}

/* Return @pte with _PAGE_ACCESSED set. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_ACCESSED);
        return pte;
}

/* Return @pte with _PAGE_RW set. */
static inline pte_t pte_mkwrite(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_RW);
        return pte;
}

/* Return @pte with _PAGE_PSE set. */
static inline pte_t pte_mkhuge(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_PSE);
        return pte;
}

/* Return @pte with _PAGE_PSE cleared. */
static inline pte_t pte_clrhuge(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_PSE);
        return pte;
}

/* Return @pte with _PAGE_GLOBAL set. */
static inline pte_t pte_mkglobal(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_GLOBAL);
        return pte;
}

/* Return @pte with _PAGE_GLOBAL cleared. */
static inline pte_t pte_clrglobal(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_GLOBAL);
        return pte;
}

/* Return @pte with _PAGE_SPECIAL set. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SPECIAL);
        return pte;
}

/* Return @pte with both _PAGE_SPECIAL and _PAGE_DEVMAP set. */
static inline pte_t pte_mkdevmap(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_DEVMAP | _PAGE_SPECIAL);
        return pte;
}

/* Return a copy of @pmd with every bit of @set OR-ed into its raw value. */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
        return native_make_pmd(native_pmd_val(pmd) | set);
}

/* Return a copy of @pmd with every bit of @clear masked out of its raw value. */
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
        return native_make_pmd(native_pmd_val(pmd) & ~clear);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Returns pmd_flags(@pmd) masked with _PAGE_UFFD_WP. */
static inline int pmd_uffd_wp(pmd_t pmd)
{
        return _PAGE_UFFD_WP & pmd_flags(pmd);
}

/* Return @pmd with _PAGE_UFFD_WP set. */
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_UFFD_WP);
        return pmd;
}

/* Return @pmd with _PAGE_UFFD_WP cleared. */
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_UFFD_WP);
        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pmd with _PAGE_ACCESSED cleared. */
static inline pmd_t pmd_mkold(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_ACCESSED);
        return pmd;
}

/* Return @pmd with _PAGE_DIRTY cleared. */
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_DIRTY);
        return pmd;
}

/* Return @pmd with _PAGE_RW cleared. */
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_RW);
        return pmd;
}

/* Return @pmd with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY | _PAGE_DIRTY);
        return pmd;
}

/* Return @pmd with _PAGE_DEVMAP set. */
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_DEVMAP);
        return pmd;
}

/* Return @pmd with _PAGE_PSE set. */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_PSE);
        return pmd;
}

/* Return @pmd with _PAGE_ACCESSED set. */
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_ACCESSED);
        return pmd;
}

/* Return @pmd with _PAGE_RW set. */
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_RW);
        return pmd;
}

/* Return a copy of @pud with every bit of @set OR-ed into its raw value. */
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
        return native_make_pud(native_pud_val(pud) | set);
}

/* Return a copy of @pud with every bit of @clear masked out of its raw value. */
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
        return native_make_pud(native_pud_val(pud) & ~clear);
}

/* Return @pud with _PAGE_ACCESSED cleared. */
static inline pud_t pud_mkold(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_ACCESSED);
        return pud;
}

/* Return @pud with _PAGE_DIRTY cleared. */
static inline pud_t pud_mkclean(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_DIRTY);
        return pud;
}

/* Return @pud with _PAGE_RW cleared. */
static inline pud_t pud_wrprotect(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_RW);
        return pud;
}

/* Return @pud with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mkdirty(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_SOFT_DIRTY | _PAGE_DIRTY);
        return pud;
}

/* Return @pud with _PAGE_DEVMAP set. */
static inline pud_t pud_mkdevmap(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_DEVMAP);
        return pud;
}

/* Return @pud with _PAGE_PSE set. */
static inline pud_t pud_mkhuge(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_PSE);
        return pud;
}

/* Return @pud with _PAGE_ACCESSED set. */
static inline pud_t pud_mkyoung(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_ACCESSED);
        return pud;
}

/* Return @pud with _PAGE_RW set. */
static inline pud_t pud_mkwrite(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_RW);
        return pud;
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Returns pte_flags(@pte) masked with _PAGE_SOFT_DIRTY. */
static inline int pte_soft_dirty(pte_t pte)
{
        return _PAGE_SOFT_DIRTY & pte_flags(pte);
}

/* Returns pmd_flags(@pmd) masked with _PAGE_SOFT_DIRTY. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        return _PAGE_SOFT_DIRTY & pmd_flags(pmd);
}

/* Returns pud_flags(@pud) masked with _PAGE_SOFT_DIRTY. */
static inline int pud_soft_dirty(pud_t pud)
{
        return _PAGE_SOFT_DIRTY & pud_flags(pud);
}

/* Return @pte with _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SOFT_DIRTY);
        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_SOFT_DIRTY);
        return pud;
}

/* Return @pte with _PAGE_SOFT_DIRTY cleared. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY cleared. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY cleared. */
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
        return pud;
}

#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

/*
 * Mask out unsupported bits in a present pgprot.  Non-present pgprots
 * can use those bits for other purposes, so leave them be.
 */
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
        pgprotval_t bits = pgprot_val(pgprot);

        /* Values without _PAGE_PRESENT are returned untouched. */
        if (!(bits & _PAGE_PRESENT))
                return bits;

        /* Present values are restricted to __supported_pte_mask. */
        return bits & __supported_pte_mask;
}

/*
 * Return massage_pgprot(@pgprot).  Under CONFIG_DEBUG_VM, warn once when
 * the massaged value differs from the value that was passed in.
 */
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
        pgprotval_t filtered = massage_pgprot(pgprot);

        /* mmdebug.h can not be included here because of dependencies */
#ifdef CONFIG_DEBUG_VM
        WARN_ONCE(pgprot_val(pgprot) != filtered,
                  "attempted to set unsupported pgprot: %016llx "
                  "bits: %016llx supported: %016llx\n",
                  (u64)pgprot_val(pgprot),
                  (u64)pgprot_val(pgprot) ^ filtered,
                  (u64)__supported_pte_mask);
#endif

        return filtered;
}

/*
 * Build a pte from page frame number @page_nr and protection @pgprot.
 * The address is XOR-ed with protnone_mask() of the protection bits and
 * limited to PTE_PFN_MASK before being combined with check_pgprot().
 */
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) & PTE_PFN_MASK;
        return __pte(paddr | check_pgprot(pgprot));
}

/*
 * Build a pmd from page frame number @page_nr and protection @pgprot.
 * The address is XOR-ed with protnone_mask() of the protection bits and
 * limited to PHYSICAL_PMD_PAGE_MASK before being combined with
 * check_pgprot().
 */
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) &
                PHYSICAL_PMD_PAGE_MASK;
        return __pmd(paddr | check_pgprot(pgprot));
}

/*
 * Build a pud from page frame number @page_nr and protection @pgprot.
 * The address is XOR-ed with protnone_mask() of the protection bits and
 * limited to PHYSICAL_PUD_PAGE_MASK before being combined with
 * check_pgprot().
 */
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) &
                PHYSICAL_PUD_PAGE_MASK;
        return __pud(paddr | check_pgprot(pgprot));
}

/*
 * Rebuild @pmd from its own PFN and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE removed from the flags.
 */
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
        return pfn_pmd(pmd_pfn(pmd),
                       __pgprot(pmd_flags(pmd) & ~_PAGE_PRESENT & ~_PAGE_PROTNONE));
}

/*
 * Forward declaration; called by pte_modify() and pmd_modify() with the old
 * value, the new value and a PFN mask.  Defined outside this part of the file.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);

/*
 * Apply protection @newprot to @pte.  Bits covered by _PAGE_CHG_MASK are
 * kept from the old entry; every other bit (this is where NX is chopped
 * off and re-added) comes from check_pgprot(@newprot).  The result is
 * passed through flip_protnone_guard() with PTE_PFN_MASK.
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        pteval_t before = pte_val(pte);
        pteval_t kept = before & _PAGE_CHG_MASK;
        pteval_t added = check_pgprot(newprot) & ~_PAGE_CHG_MASK;

        return __pte(flip_protnone_guard(before, kept | added, PTE_PFN_MASK));
}

/*
 * Apply protection @newprot to @pmd.  Bits covered by _HPAGE_CHG_MASK are
 * kept from the old entry; every other bit comes from
 * check_pgprot(@newprot).  The result is passed through
 * flip_protnone_guard() with PHYSICAL_PMD_PAGE_MASK.
 */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        pmdval_t before = pmd_val(pmd);
        pmdval_t kept = before & _HPAGE_CHG_MASK;
        pmdval_t added = check_pgprot(newprot) & ~_HPAGE_CHG_MASK;

        return __pmd(flip_protnone_guard(before, kept | added,
                                         PHYSICAL_PMD_PAGE_MASK));
}

/*
 * mprotect needs to preserve PAT and encryption bits when updating
 * vm_page_prot
 */
#define pgprot_modify pgprot_modify
/*
 * Combine the _PAGE_CHG_MASK bits of @oldprot with all remaining bits of
 * @newprot.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        return __pgprot((pgprot_val(oldprot) & _PAGE_CHG_MASK) |
                        (pgprot_val(newprot) & ~_PAGE_CHG_MASK));
}

/* Wrap the flag bits of an entry at each level into a pgprot_t. */
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))

/* canon_pgprot(p): @p after massage_pgprot(), re-wrapped as a pgprot_t. */
#define canon_pgprot(p) __pgprot(massage_pgprot(p))

/* Return @prot after massage_pgprot(), i.e. the same result as canon_pgprot(). */
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
        return __pgprot(massage_pgprot(prot));
}

/*
 * Decide whether cache mode @new_pcm may be returned for a request of
 * cache mode @pcm over [@paddr, @paddr + @size).  Returns 1 when allowed
 * and 0 when not.
 */
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
                                         enum page_cache_mode pcm,
                                         enum page_cache_mode new_pcm)
{
        /*
         * PAT type is always WB for untracked ranges, so no need to check.
         */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
                return 1;

        /*
         * Write-back may not be returned for a UC-minus, write-combine
         * or write-through request.
         */
        if (new_pcm == _PAGE_CACHE_MODE_WB &&
            (pcm == _PAGE_CACHE_MODE_UC_MINUS ||
             pcm == _PAGE_CACHE_MODE_WC ||
             pcm == _PAGE_CACHE_MODE_WT))
                return 0;

        /* Write-combine may not be returned for a write-through request. */
        if (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WC)
                return 0;

        return 1;
}

/* Declarations only; both take a virtual address and are implemented elsewhere. */
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);

#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);

/*
 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
 * Populates the user and returns the resulting PGD that must be set in
 * the kernel copy of the page tables.
 *
 * When the CPU does not have X86_FEATURE_PTI, @pgd is returned unchanged
 * and __pti_set_user_pgtbl() is not called.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        if (static_cpu_has(X86_FEATURE_PTI))
                return __pti_set_user_pgtbl(pgdp, pgd);

        return pgd;
}
#else   /* CONFIG_PAGE_TABLE_ISOLATION */
/* Without CONFIG_PAGE_TABLE_ISOLATION, @pgd is passed through untouched. */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        return pgd;
}
#endif  /* CONFIG_PAGE_TABLE_ISOLATION */

#endif        /* __ASSEMBLY__ */


#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif

#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>

/* True when @pte has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pte_none(pte_t pte)
{
        return (pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

#define __HAVE_ARCH_PTE_SAME
/* True when the raw values of @a and @b are equal. */
static inline int pte_same(pte_t a, pte_t b)
{
        return b.pte == a.pte;
}

/* Returns pte_flags(@a) masked with _PAGE_PRESENT | _PAGE_PROTNONE. */
static inline int pte_present(pte_t a)
{
        return (_PAGE_PROTNONE | _PAGE_PRESENT) & pte_flags(a);
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* True when _PAGE_DEVMAP is set in pte_flags(@a). */
static inline int pte_devmap(pte_t a)
{
        return _PAGE_DEVMAP == (pte_flags(a) & _PAGE_DEVMAP);
}
#endif

#define pte_accessible pte_accessible
/*
 * True when @a has _PAGE_PRESENT set, or when it has _PAGE_PROTNONE set
 * and mm_tlb_flush_pending(@mm) reports a pending flush.
 */
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
        if (pte_flags(a) & _PAGE_PRESENT)
                return true;

        return (pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm);
}

/*
 * Returns pmd_flags(@pmd) masked with
 * _PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE.
 */
static inline int pmd_present(pmd_t pmd)
{
        /*
         * _PAGE_PSE is part of the mask because split_huge_page
         * temporarily clears the present bit while leaving _PAGE_PSE
         * set for as long as _PAGE_PRESENT is clear.
         */
        return (_PAGE_PSE | _PAGE_PROTNONE | _PAGE_PRESENT) & pmd_flags(pmd);
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * These work without NUMA balancing but the kernel does not care. See the
 * comment in include/linux/pgtable.h
 */
/* True when @pte has _PAGE_PROTNONE set and _PAGE_PRESENT clear. */
static inline int pte_protnone(pte_t pte)
{
        return _PAGE_PROTNONE ==
                (pte_flags(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE));
}

/* True when @pmd has _PAGE_PROTNONE set and _PAGE_PRESENT clear. */
static inline int pmd_protnone(pmd_t pmd)
{
        return _PAGE_PROTNONE ==
                (pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE));
}
#endif /* CONFIG_NUMA_BALANCING */

/* True when @pmd has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pmd_none(pmd_t pmd)
{
        /*
         * Only the low word is checked on 32-bit platforms, since it
         * might be out of sync with the upper half.
         */
        unsigned long low = native_pmd_val(pmd);

        return !(low & ~_PAGE_KNL_ERRATUM_MASK);
}

/* __va() of the address bits of @pmd selected by pmd_pfn_mask(). */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        return (unsigned long)__va(pmd_pfn_mask(pmd) & pmd_val(pmd));
}

/*
 * pmd_page(pmd): pfn_to_page() applied to pmd_pfn(pmd).
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pmd_page(pmd)        pfn_to_page(pmd_pfn(pmd))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 *
 * mk_pte(page, pgprot) expands to pfn_pte(page_to_pfn(page), pgprot).
 *
 * (Currently stuck as a macro because of indirect forward reference
 * to linux/mm.h:page_to_nid())
 */
#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))

/* True when the flags of @pmd, ignoring _PAGE_USER, differ from _KERNPG_TABLE. */
static inline int pmd_bad(pmd_t pmd)
{
        return _KERNPG_TABLE != (pmd_flags(pmd) & ~_PAGE_USER);
}

/* Convert a page count to MiB by shifting right by (20 - PAGE_SHIFT). */
static inline unsigned long pages_to_mb(unsigned long npg)
{
        const int shift = 20 - PAGE_SHIFT;

        return npg >> shift;
}

#if CONFIG_PGTABLE_LEVELS > 2
/* True when @pud has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pud_none(pud_t pud)
{
        return !(native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Returns pud_flags(@pud) masked with _PAGE_PRESENT. */
static inline int pud_present(pud_t pud)
{
        return _PAGE_PRESENT & pud_flags(pud);
}

/* __va() of the address bits of @pud selected by pud_pfn_mask(), as a pmd_t pointer. */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        return (pmd_t *)__va(pud_pfn_mask(pud) & pud_val(pud));
}

/*
 * pud_page(pud): pfn_to_page() applied to pud_pfn(pud).
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pud_page(pud)        pfn_to_page(pud_pfn(pud))

/* pud_leaf is an alias for pud_large(). */
#define pud_leaf        pud_large
/* True when both _PAGE_PSE and _PAGE_PRESENT are set in pud_val(@pud). */
static inline int pud_large(pud_t pud)
{
        return (_PAGE_PSE | _PAGE_PRESENT) ==
                (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT));
}

/* True when @pud has any flag set other than _KERNPG_TABLE or _PAGE_USER. */
static inline int pud_bad(pud_t pud)
{
        return !!(pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER));
}
#else
/* Variant for CONFIG_PGTABLE_LEVELS <= 2: pud_large() is constant 0. */
#define pud_leaf        pud_large
static inline int pud_large(pud_t pud)
{
        return 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3
/* True when @p4d has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int p4d_none(p4d_t p4d)
{
        return !(native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Returns p4d_flags(@p4d) masked with _PAGE_PRESENT. */
static inline int p4d_present(p4d_t p4d)
{
        return _PAGE_PRESENT & p4d_flags(p4d);
}

/* __va() of the address bits of @p4d selected by p4d_pfn_mask(), as a pud_t pointer. */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        return (pud_t *)__va(p4d_pfn_mask(p4d) & p4d_val(p4d));
}

/*
 * p4d_page(p4d): pfn_to_page() applied to p4d_pfn(p4d).
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define p4d_page(p4d)        pfn_to_page(p4d_pfn(p4d))

/*
 * True when @p4d has any flag set other than _KERNPG_TABLE and _PAGE_USER
 * (and _PAGE_NX when CONFIG_PAGE_TABLE_ISOLATION is enabled).
 */
static inline int p4d_bad(p4d_t p4d)
{
        unsigned long tolerated = _KERNPG_TABLE | _PAGE_USER |
                (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? _PAGE_NX : 0);

        return !!(p4d_flags(p4d) & ~tolerated);
}
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

/* Index of @address within a P4D table: (address >> P4D_SHIFT) modulo PTRS_PER_P4D. */
static inline unsigned long p4d_index(unsigned long address)
{
        unsigned long slot = address >> P4D_SHIFT;

        return slot & (PTRS_PER_P4D - 1);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * With 5-level paging enabled, returns pgd_flags(@pgd) masked with
 * _PAGE_PRESENT; otherwise always returns 1.
 */
static inline int pgd_present(pgd_t pgd)
{
        if (pgtable_l5_enabled())
                return _PAGE_PRESENT & pgd_flags(pgd);

        return 1;
}

/* __va() of the PTE_PFN_MASK bits of @pgd, as an unsigned long. */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
        return (unsigned long)__va(PTE_PFN_MASK & (unsigned long)pgd_val(pgd));
}

/*
 * pgd_page(pgd): pfn_to_page() applied to pgd_pfn(pgd).
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pgd_page(pgd)        pfn_to_page(pgd_pfn(pgd))

/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
        /* With 5-level paging, index into the table *pgd points at. */
        if (pgtable_l5_enabled())
                return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);

        /* Otherwise the pgd entry itself is returned as the p4d. */
        return (p4d_t *)pgd;
}

/*
 * Always 0 without 5-level paging.  Otherwise true when the flags of
 * @pgd, ignoring _PAGE_USER (and _PAGE_NX when CONFIG_PAGE_TABLE_ISOLATION
 * is enabled), differ from _KERNPG_TABLE.
 */
static inline int pgd_bad(pgd_t pgd)
{
        unsigned long ignore_flags;

        if (!pgtable_l5_enabled())
                return 0;

        ignore_flags = _PAGE_USER;
        if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
                ignore_flags |= _PAGE_NX;

        return _KERNPG_TABLE != (pgd_flags(pgd) & ~ignore_flags);
}

/* Always 0 without 5-level paging; otherwise true when the raw value of @pgd is zero. */
static inline int pgd_none(pgd_t pgd)
{
        /*
         * There is no need to do a workaround for the KNL stray
         * A/D bit erratum here.  PGDs only point to page tables
         * except on 32-bit non-PAE which is not supported on
         * KNL.
         */
        if (pgtable_l5_enabled())
                return native_pgd_val(pgd) == 0;

        return 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

#endif        /* __ASSEMBLY__ */

/*
 * KERNEL_PGD_BOUNDARY is the PGD index of PAGE_OFFSET; KERNEL_PGD_PTRS is
 * the number of PGD entries from that index to the end of the table.
 */
#define KERNEL_PGD_BOUNDARY        pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

#ifndef __ASSEMBLY__

/* Declarations only; all of the following are defined outside this header. */
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
extern void memblock_find_dma_reserve(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
                                  unsigned long end, pgprot_t prot);

/* Only declared for CONFIG_X86_64 builds. */
#ifdef CONFIG_X86_64
extern pgd_t trampoline_pgd_entry;
#endif

/* local pte updates need not use xchg for locking */
/* Read *@ptep, clear it with native_pte_clear() and return the value read. */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
        pte_t old = *ptep;

        /* Pure native function needs no input for mm, addr */
        native_pte_clear(NULL, 0, ptep);

        return old;
}

/* Read *@pmdp, clear it with native_pmd_clear() and return the value read. */
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        native_pmd_clear(pmdp);

        return old;
}

/* Read *@pudp, clear it with native_pud_clear() and return the value read. */
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
        pud_t old = *pudp;

        native_pud_clear(pudp);

        return old;
}

/* Store @pte at @ptep via set_pte(); @mm and @addr are unused here. */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pte)
{
        set_pte(ptep, pte);
}

/* Store @pmd at @pmdp via set_pmd(); @mm and @addr are unused here. */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        set_pmd(pmdp, pmd);
}

/*
 * Store @pud at @pudp; @mm and @addr are unused here.
 *
 * NOTE(review): unlike set_pte_at()/set_pmd_at(), this calls
 * native_set_pud() directly instead of set_pud() - confirm that this is
 * intended for CONFIG_PARAVIRT_XXL builds.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        native_set_pud(pudp, pud);
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPU's that might be updating the dirty
 * bit at the same time.
 */
struct vm_area_struct;

/*
 * Each __HAVE_ARCH_* define below is paired with an extern declaration of
 * the matching PTE helper; the implementations live outside this header.
 */
#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Return the result of native_ptep_get_and_clear(@ptep); @mm and @addr
 * are unused here.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pte_t *ptep)
{
        return native_ptep_get_and_clear(ptep);
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Fetch and clear the entry at @ptep.  A non-zero @full selects
 * native_local_ptep_get_and_clear(); otherwise ptep_get_and_clear() is
 * used.
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr, pte_t *ptep,
                                            int full)
{
        if (!full)
                return ptep_get_and_clear(mm, addr, ptep);

        /*
         * Full address destruction in progress; paravirt does not
         * care about updates and native needs no locking
         */
        return native_local_ptep_get_and_clear(ptep);
}

/*
 * Clear bit _PAGE_BIT_RW of the entry at @ptep in place with clear_bit().
 * @mm and @addr are unused here.
 */
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
{
        clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}

/* Expands to an empty statement. */
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)

/* mk_pmd(page, pgprot) expands to pfn_pmd(page_to_pfn(page), pgprot). */
#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))

/*
 * PMD and PUD counterparts of the PTE access-flag and young-bit helpers.
 * Declarations only; the implementations live outside this header.
 */
#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp);

#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);


#define pmd_write pmd_write
/* Returns pmd_flags(@pmd) masked with _PAGE_RW. */
static inline int pmd_write(pmd_t pmd)
{
        return _PAGE_RW & pmd_flags(pmd);
}

#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Return the result of native_pmdp_get_and_clear(@pmdp); @mm and @addr
 * are unused here.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
{
        pmd_t old = native_pmdp_get_and_clear(pmdp);

        return old;
}

#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Return the result of native_pudp_get_and_clear(@pudp); @mm and @addr
 * are unused here.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                        unsigned long addr, pud_t *pudp)
{
        pud_t old = native_pudp_get_and_clear(pudp);

        return old;
}

/*
 * Clear bit _PAGE_BIT_RW of the entry at @pmdp in place with clear_bit().
 * @mm and @addr are unused here.
 */
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
{
        clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
}

#define pud_write pud_write
/* Returns pud_flags(@pud) masked with _PAGE_RW. */
static inline int pud_write(pud_t pud)
{
        return _PAGE_RW & pud_flags(pud);
}

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
/*
 * Store @pmd at @pmdp and return the previous entry.  With CONFIG_SMP the
 * swap is done with xchg(); otherwise a plain read followed by
 * WRITE_ONCE() is used.  @vma and @address are unused here.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t old;

        if (IS_ENABLED(CONFIG_SMP))
                return xchg(pmdp, pmd);

        old = *pmdp;
        WRITE_ONCE(*pmdp, pmd);
        return old;
}
#endif
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 *
 * Returns true for parts of the PGD that map userspace and
 * false for the parts that map the kernel.
 */
static inline bool pgdp_maps_userspace(void *__ptr)
{
        /* Byte offset of the entry within its page-aligned table. */
        unsigned long offset = (unsigned long)__ptr & ~PAGE_MASK;

        /* Entry indices below PGD_KERNEL_START count as userspace. */
        return (offset / sizeof(pgd_t)) < PGD_KERNEL_START;
}

/* pgd_leaf is an alias for pgd_large(), which always returns 0. */
#define pgd_leaf        pgd_large
static inline int pgd_large(pgd_t pgd) { return 0; }

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
 * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
 * the user one is in the last 4k.  To switch between them, you
 * just need to flip the 12th bit in their addresses.
 */
#define PTI_PGTABLE_SWITCH_BIT        PAGE_SHIFT

/*
 * This generates better code than the inline assembly in
 * __set_bit().
 */
/* Return @ptr with bit number @bit set in its address. */
static inline void *ptr_set_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr | BIT(bit));
}
/* Return @ptr with bit number @bit cleared in its address. */
static inline void *ptr_clear_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr & ~BIT(bit));
}

/* Return @pgdp with address bit PTI_PGTABLE_SWITCH_BIT set. */
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
        pgd_t *user_pgdp = ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return user_pgdp;
}

/* Return @pgdp with address bit PTI_PGTABLE_SWITCH_BIT cleared. */
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
        pgd_t *kernel_pgdp = ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_pgdp;
}

/* Return @p4dp with address bit PTI_PGTABLE_SWITCH_BIT set. */
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
        p4d_t *user_p4dp = ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return user_p4dp;
}

/* Return @p4dp with address bit PTI_PGTABLE_SWITCH_BIT cleared. */
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
        p4d_t *kernel_p4dp = ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_p4dp;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        /* Copy @count entries of the kernel-side table. */
        memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        /* With PTI active, copy the same range of the user-side tables. */
        if (static_cpu_has(X86_FEATURE_PTI))
                memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
                       count * sizeof(pgd_t));
#endif
}

/* log2 of the number of entries in one page-table page. */
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
/* Address shift for page-table level @level. */
static inline int page_level_shift(enum pg_level level)
{
        return level * PTE_SHIFT + (PAGE_SHIFT - PTE_SHIFT);
}
/* Size in bytes for @level: 1UL shifted left by page_level_shift(). */
static inline unsigned long page_level_size(enum pg_level level)
{
        int shift = page_level_shift(level);

        return 1UL << shift;
}
/* Mask for @level: the complement of page_level_size() - 1. */
static inline unsigned long page_level_mask(enum pg_level level)
{
        unsigned long size = page_level_size(level);

        return ~(size - 1);
}

/*
 * The x86 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
/* Intentionally empty: nothing is done for a PTE update. */
static inline void update_mmu_cache(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
}
/* Intentionally empty: nothing is done for a PMD update. */
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd)
{
}
/* Intentionally empty: nothing is done for a PUD update. */
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud)
{
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return @pte with _PAGE_SWP_SOFT_DIRTY set. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
        return pte;
}

/* Returns pte_flags(@pte) masked with _PAGE_SWP_SOFT_DIRTY. */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        return _PAGE_SWP_SOFT_DIRTY & pte_flags(pte);
}

/* Return @pte with _PAGE_SWP_SOFT_DIRTY cleared. */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
        return pte;
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Return @pmd with _PAGE_SWP_SOFT_DIRTY set. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
        return pmd;
}

/* Returns pmd_flags(@pmd) masked with _PAGE_SWP_SOFT_DIRTY. */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return _PAGE_SWP_SOFT_DIRTY & pmd_flags(pmd);
}

/* Return @pmd with _PAGE_SWP_SOFT_DIRTY cleared. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
        return pmd;
}
#endif
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return @pte with _PAGE_SWP_UFFD_WP set. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
        return pte;
}

/* Returns pte_flags(@pte) masked with _PAGE_SWP_UFFD_WP. */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        return _PAGE_SWP_UFFD_WP & pte_flags(pte);
}

/* Return @pte with _PAGE_SWP_UFFD_WP cleared. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
        return pte;
}

/* Return @pmd with _PAGE_SWP_UFFD_WP set. */
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);
        return pmd;
}

/* Returns pmd_flags(@pmd) masked with _PAGE_SWP_UFFD_WP. */
static inline int pmd_swp_uffd_wp(pmd_t pmd)
{
        return _PAGE_SWP_UFFD_WP & pmd_flags(pmd);
}

/* Return @pmd with _PAGE_SWP_UFFD_WP removed by pmd_clear_flags(). */
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Per-key bits inside a PKRU value.  Each protection key owns
 * PKRU_BITS_PER_PKEY consecutive bits; the helpers below shift these
 * masks left by pkey * PKRU_BITS_PER_PKEY to reach a given key.
 * PKRU_AD_BIT is the access-disable bit; PKRU_WD_BIT is presumably the
 * write-disable bit (it is only checked on the write path) - confirm.
 */
#define PKRU_AD_BIT 0x1u
#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2

/*
 * init_pkru_value is a real variable only when protection keys are
 * configured in; otherwise it is the constant 0.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
extern u32 init_pkru_value;
#else
#define init_pkru_value        0
#endif

/*
 * True when the access-disable bit belonging to @pkey is clear in @pkru.
 * The key's bits start at bit pkey * PKRU_BITS_PER_PKEY.
 */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
        const int shift = pkey * PKRU_BITS_PER_PKEY;
        const u32 access_disable = PKRU_AD_BIT << shift;

        return (pkru & access_disable) == 0;
}

/*
 * True when neither the access-disable nor the write-disable bit of
 * @pkey is set in @pkru.  Both are tested because access-disable
 * blocks writes as well.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
        const int shift = pkey * PKRU_BITS_PER_PKEY;
        const u32 write_blockers = (PKRU_AD_BIT|PKRU_WD_BIT) << shift;

        return (pkru & write_blockers) == 0;
}

/*
 * Extract the protection key stored in @pte_flags.  Without
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS the result is always 0.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
#ifndef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        return 0;
#else
        /* guarded by the ifdef to avoid doing a 59-bit shift on 32-bit values */
        return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif
}

/*
 * Check @pkey against the PKRU value returned by read_pkru().
 * Read permission is always required; write permission is required
 * in addition only when @write is set.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
        const u32 pkru = read_pkru();

        if (!__pkru_allows_read(pkru, pkey))
                return false;

        return !write || __pkru_allows_write(pkru, pkey);
}

/*
 * 'pteval' may originate from a PTE, a PMD or a PUD.  Only
 * _PAGE_PRESENT, _PAGE_USER and _PAGE_RW are inspected here, and
 * those have the same value on all 3 types.
 *
 * _PAGE_PRESENT and _PAGE_USER must always be set; _PAGE_RW must be
 * set too when @write is requested.  If the page bits pass, the final
 * answer comes from the protection-key check.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
        unsigned long required = _PAGE_PRESENT|_PAGE_USER;

        if (write)
                required |= _PAGE_RW;

        /* any required bit that is missing from pteval denies access */
        if (~pteval & required)
                return false;

        return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}

#define pte_access_permitted pte_access_permitted
/* PTE front end: hands pte_val(@pte) to __pte_access_permitted(). */
static inline bool pte_access_permitted(pte_t pte, bool write)
{
        unsigned long raw = pte_val(pte);

        return __pte_access_permitted(raw, write);
}

#define pmd_access_permitted pmd_access_permitted
/* PMD front end: hands pmd_val(@pmd) to __pte_access_permitted(). */
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
        unsigned long raw = pmd_val(pmd);

        return __pte_access_permitted(raw, write);
}

#define pud_access_permitted pud_access_permitted
/* PUD front end: hands pud_val(@pud) to __pte_access_permitted(). */
static inline bool pud_access_permitted(pud_t pud, bool write)
{
        unsigned long raw = pud_val(pud);

        return __pte_access_permitted(raw, write);
}

/* pfn_modify_allowed() is provided by the architecture; it is defined elsewhere. */
#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);

/* True when the boot CPU reports the X86_BUG_L1TF bug. */
static inline bool arch_has_pfn_modify_check(void)
{
        bool affected = boot_cpu_has_bug(X86_BUG_L1TF);

        return affected;
}

/*
 * Always reports false on this architecture.  The self-referential
 * #define makes the definition detectable with #ifdef (presumably so a
 * generic fallback can be skipped - confirm against the generic header).
 */
#define arch_faults_on_old_pte arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
        return false;
}

#endif        /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */


























































































































































































































































   10 


   10 
































    2 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 *
 * @usage:   reference count, taken by get_group_info() and dropped by
 *           put_group_info()
 * @ngroups: presumably the number of entries stored in @gid - confirm
 *           against groups_alloc()
 * @gid:     trailing, variable-length array of group IDs
 */
struct group_info {
        atomic_t        usage;
        int                ngroups;
        /*
         * C99 flexible array member (replaces the deprecated zero-length
         * array form); it must remain the last member of the structure.
         */
        kgid_t                gid[];
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * Takes one additional reference on a set of supplementary groups by
 * incrementing @gi->usage, and hands the same pointer back.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        struct group_info *ref = gi;

        atomic_inc(&ref->usage);
        return ref;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @gi: The group info to release
 *
 * Drops one reference from @gi->usage and calls groups_free() when that was
 * the last one.  The argument is evaluated exactly once, so passing an
 * expression with side effects is safe.
 *
 * This remains a macro because groups_free() is only declared further down
 * in this header.
 */
#define put_group_info(gi)                                \
do {                                                        \
        struct group_info *__put_gi = (gi);                \
                                                        \
        if (atomic_dec_and_test(&__put_gi->usage))        \
                groups_free(__put_gi);                        \
} while (0)

extern struct group_info init_groups;
/*
 * With CONFIG_MULTIUSER the supplementary-group helpers are real functions
 * defined elsewhere.  Without it they collapse to the inline stubs in the
 * #else branch: groups_free() does nothing, and in_group_p(), in_egroup_p()
 * and groups_search() all return 1 regardless of their arguments.
 */
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/* Stub: nothing to free. */
static inline void groups_free(struct group_info *group_info)
{
}

/* Stub: always answers 1. */
static inline int in_group_p(kgid_t grp)
{
        return 1;
}
/* Stub: always answers 1. */
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
/* Stub: always answers 1. */
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {
        /* reference count: raised by get_new_cred(), dropped by put_cred() */
        atomic_long_t        usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
        atomic_t        subscribers;        /* number of processes subscribed */
        /* NOTE(review): presumably records where the cred was put - confirm */
        void                *put_addr;
        /* NOTE(review): presumably holds CRED_MAGIC or CRED_MAGIC_DEAD - confirm */
        unsigned        magic;
#define CRED_MAGIC        0x43736564
#define CRED_MAGIC_DEAD        0x44656144
#endif
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* subjective LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        struct group_info *group_info;        /* supplementary groups for egid/fsgid */
        /* RCU deletion */
        union {
                /* get_cred() and get_cred_rcu() reset this to 0 */
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

/*
 * Credential management entry points.  Only the prototypes live here;
 * the implementations are defined outside this header.  __put_cred() is
 * what put_cred() calls once the last reference has been dropped.
 */
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int change_create_files_as(struct cred *, struct inode *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);

/*
 * check for validity of credentials
 *
 * The checking helpers below only exist when CONFIG_DEBUG_CREDENTIALS is
 * set; they are declared here and defined elsewhere.
 */
#ifdef CONFIG_DEBUG_CREDENTIALS
extern void __invalid_creds(const struct cred *, const char *, unsigned);
extern void __validate_process_creds(struct task_struct *,
                                     const char *, unsigned);

extern bool creds_are_invalid(const struct cred *cred);

/*
 * Report @cred through __invalid_creds() when creds_are_invalid() flags it.
 * @file and @line are passed through unchanged so the report can name the
 * call site.
 */
static inline void __validate_creds(const struct cred *cred,
                                    const char *file, unsigned line)
{
        if (!unlikely(creds_are_invalid(cred)))
                return;

        __invalid_creds(cred, file, line);
}

/* Validate @cred, tagging any report with the caller's file and line. */
#define validate_creds(cred) \
        do { __validate_creds((cred), __FILE__, __LINE__); } while (0)

/* Validate the credentials of current, tagged with the caller's file and line. */
#define validate_process_creds() \
        do { __validate_process_creds(current, __FILE__, __LINE__); } while (0)

extern void validate_creds_for_do_exit(struct task_struct *);
#else
/*
 * Without CONFIG_DEBUG_CREDENTIALS all validation hooks are empty inline
 * functions that ignore their arguments.
 */
static inline void validate_creds(const struct cred *cred)
{
}
static inline void validate_creds_for_do_exit(struct task_struct *tsk)
{
}
static inline void validate_process_creds(void)
{
}
#endif

/*
 * True when @cred's ambient capability set is a subset of the intersection
 * of its permitted and inheritable sets.
 */
static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        bool ok = cap_issubset(cred->cap_ambient,
                               cap_intersect(cred->cap_permitted,
                                             cred->cap_inheritable));

        return ok;
}

/**
 * get_new_cred - Get a reference on a new set of credentials
 * @cred: The new credentials to reference
 *
 * Increment @cred->usage and return the same pointer.  The caller owns the
 * extra reference and must release it.
 */
static inline struct cred *get_new_cred(struct cred *cred)
{
        struct cred *ref = cred;

        atomic_long_inc(&ref->usage);
        return ref;
}

/**
 * get_cred - Get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Take a reference on the specified set of credentials; the caller must
 * release it again.  A %NULL argument is handed straight back untouched.
 *
 * This is meant for a committed set of credentials.  The pointer is const so
 * that accidental alteration of credentials that should be considered
 * immutable is caught at compile time; the const is discarded here only to
 * clear @non_rcu and bump the usage count.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        struct cred *writable;

        if (!cred)
                return cred;

        validate_creds(cred);

        /* drop the const only for the bookkeeping writes below */
        writable = (struct cred *) cred;
        writable->non_rcu = 0;
        return get_new_cred(writable);
}

/*
 * Like get_cred(), but the reference is only taken when the usage count is
 * still non-zero (atomic_long_inc_not_zero()).  Returns NULL for a NULL
 * argument or when the count had already reached zero; otherwise clears
 * @non_rcu and returns @cred.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *) cred;

        if (!cred || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        validate_creds(cred);
        writable->non_rcu = 0;
        return cred;
}

/**
 * put_cred - Release a reference to a set of credentials
 * @cred: The credentials to release
 *
 * Drop one reference; when it was the last one the credentials are handed to
 * __put_cred().  Passing %NULL does nothing.
 *
 * The parameter is a const pointer because the credentials on task_struct are
 * attached by const pointers to prevent accidental alteration of otherwise
 * immutable credential sets.
 */
static inline void put_cred(const struct cred *_cred)
{
        struct cred *cred = (struct cred *) _cred;

        if (!cred)
                return;

        validate_creds(cred);
        if (atomic_long_dec_and_test(&cred->usage))
                __put_cred(cred);
}

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Access the subjective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to an rcu_dereference_protected() of current->cred with a constant
 * condition of 1.
 */
#define current_cred() \
        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Access the objective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Same as current_cred() but reads current->real_cred.
 */
#define current_real_cred() \
        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Access the objective credentials of a task.  The caller must hold the RCU
 * readlock.
 *
 * The result of this function should not be passed directly to get_cred();
 * rather get_task_cred() should be used instead.
 */
#define __task_cred(task)        \
        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Get the subjective credentials of the current task, pinning them so that
 * they can't go away.  Accessing the current task's credentials directly is
 * not permitted.
 *
 * The reference is taken with get_cred(), so the caller must release it.
 */
#define get_current_cred()                                \
        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Look up the user record in the current task's subjective credentials and
 * pin it with get_uid() so that it can't go away.  Evaluates to the pinned
 * user_struct pointer.
 */
#define get_current_user()                                \
({                                                        \
        const struct cred *__cred = current_cred();        \
        struct user_struct *__u = get_uid(__cred->user);\
        __u;                                                \
})

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Look up the supplementary group list in the current task's subjective
 * credentials and pin it with get_group_info() so that it can't go away.
 * Evaluates to the pinned group_info pointer.
 */
#define get_current_groups()                                \
({                                                        \
        const struct cred *__cred = current_cred();        \
        struct group_info *__groups =                        \
                get_group_info(__cred->group_info);        \
        __groups;                                        \
})

/*
 * task_cred_xxx - read one member of a task's objective credentials
 *
 * Copies member @xxx of __task_cred(@task) into a local of the member's own
 * type while inside an rcu_read_lock()/rcu_read_unlock() pair, then
 * evaluates to that copy.  Only the value is returned, never a pointer
 * into the credentials.
 */
#define task_cred_xxx(task, xxx)                        \
({                                                        \
        __typeof__(((struct cred *)NULL)->xxx) ___val;        \
        rcu_read_lock();                                \
        ___val = __task_cred((task))->xxx;                \
        rcu_read_unlock();                                \
        ___val;                                                \
})

/* Convenience wrappers: the uid and euid members of @task's credentials. */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))

/*
 * current_cred_xxx - read member @xxx of the current task's subjective
 * credentials, as returned by current_cred().
 */
#define current_cred_xxx(xxx)                        \
({                                                \
        current_cred()->xxx;                        \
})

/* One accessor per member; each expands to current_cred_xxx(<member>). */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))

extern struct user_namespace init_user_ns;
/*
 * current_user_ns - user namespace of the current task
 *
 * With CONFIG_USER_NS this reads the user_ns member of the current
 * credentials; without it the answer is always &init_user_ns.
 */
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/* Store the current task's uid and gid through the pointers @_uid and @_gid. */
#define current_uid_gid(_uid, _gid)                \
do {                                                \
        const struct cred *__cred = current_cred();\
        *(_uid) = __cred->uid;                        \
        *(_gid) = __cred->gid;                        \
} while (0)

/* Store the current task's euid and egid through the pointers @_euid and @_egid. */
#define current_euid_egid(_euid, _egid)                \
do {                                                \
        const struct cred *__cred = current_cred();\
        *(_euid) = __cred->euid;                \
        *(_egid) = __cred->egid;                \
} while (0)

/* Store the current task's fsuid and fsgid through the pointers @_fsuid and @_fsgid. */
#define current_fsuid_fsgid(_fsuid, _fsgid)        \
do {                                                \
        const struct cred *__cred = current_cred();\
        *(_fsuid) = __cred->fsuid;                \
        *(_fsgid) = __cred->fsgid;                \
} while (0)

#endif /* _LINUX_CRED_H */















































    1 

    1 


    1 










    1 



    1 
    1 
    1 


































































    1 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H

#include <linux/refcount.h>

struct io_wq;

/*
 * Values stored in io_wq_work.flags.  The IO_WQ_WORK_* constants are
 * distinct single bits (io_wq_is_hashed() tests IO_WQ_WORK_HASHED with a
 * bitwise AND); IO_WQ_HASH_SHIFT is a bit position rather than a flag.
 */
enum {
        IO_WQ_WORK_CANCEL        = 1,
        IO_WQ_WORK_HASHED        = 2,
        IO_WQ_WORK_UNBOUND        = 4,
        IO_WQ_WORK_CONCURRENT        = 16,

        IO_WQ_HASH_SHIFT        = 24,        /* upper 8 bits are used for hash key */
};

/* Result codes returned by io_wq_cancel_cb(). */
enum io_wq_cancel {
        IO_WQ_CANCEL_OK,        /* cancelled before started */
        IO_WQ_CANCEL_RUNNING,        /* found, running, and attempted cancelled */
        IO_WQ_CANCEL_NOTFOUND,        /* work not found */
};

/* Link of a singly linked list; a NULL next pointer marks the end. */
struct io_wq_work_node {
        struct io_wq_work_node *next;
};

/* List head that tracks both the first and the last link of a chain. */
struct io_wq_work_list {
        struct io_wq_work_node *first;
        struct io_wq_work_node *last;
};

/*
 * Link @node into @list directly behind @pos.  When @pos had no successor,
 * @node is the new tail and list->last is moved to it.
 */
static inline void wq_list_add_after(struct io_wq_work_node *node,
                                     struct io_wq_work_node *pos,
                                     struct io_wq_work_list *list)
{
        struct io_wq_work_node *successor = pos->next;

        pos->next = node;
        node->next = successor;
        if (successor == NULL)
                list->last = node;
}

/*
 * Append @node to the end of @list.  On an empty list @node becomes both
 * ends, and list->first is published with WRITE_ONCE().
 */
static inline void wq_list_add_tail(struct io_wq_work_node *node,
                                    struct io_wq_work_list *list)
{
        node->next = NULL;

        if (list->first) {
                list->last->next = node;
                list->last = node;
                return;
        }

        /* empty list: set the tail first, then publish the head */
        list->last = node;
        WRITE_ONCE(list->first, node);
}

/*
 * Unlink everything from the node after @prev up to and including @last.
 * A NULL @prev means the cut starts at the head of @list, in which case
 * list->first is updated with WRITE_ONCE().  @last is left with a NULL
 * next pointer.
 */
static inline void wq_list_cut(struct io_wq_work_list *list,
                               struct io_wq_work_node *last,
                               struct io_wq_work_node *prev)
{
        struct io_wq_work_node *remainder = last->next;

        if (prev)
                prev->next = remainder;
        else
                WRITE_ONCE(list->first, remainder);

        /* the tail itself was removed, so the tail falls back to @prev */
        if (list->last == last)
                list->last = prev;
        last->next = NULL;
}

/*
 * Remove the single entry @node, whose predecessor is @prev (NULL when
 * @node is the head), by cutting a range that starts and ends at @node.
 */
static inline void wq_list_del(struct io_wq_work_list *list,
                               struct io_wq_work_node *node,
                               struct io_wq_work_node *prev)
{
        wq_list_cut(list, node, prev);
}

/*
 * Walk @head from its first entry.  @pos is the current entry and @prv the
 * one before it (NULL on the first iteration); both must be assignable
 * lvalues supplied by the caller.
 */
#define wq_list_for_each(pos, prv, head)                        \
        for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)

/* True when @list has no first entry; the head is read with READ_ONCE(). */
#define wq_list_empty(list)        (READ_ONCE((list)->first) == NULL)
/* Reset @list to the empty state (both ends NULL). */
#define INIT_WQ_LIST(list)        do {                                \
        (list)->first = NULL;                                        \
        (list)->last = NULL;                                        \
} while (0)

/*
 * One unit of work.  @list links it into an io_wq_work_list, and
 * wq_next_work() maps that link back to the containing work item.
 * @flags holds IO_WQ_WORK_* bits (see io_wq_is_hashed()).
 */
struct io_wq_work {
        struct io_wq_work_node list;
        unsigned flags;
};

/*
 * Return the work item linked after @work, or NULL when @work is the last
 * entry of its chain.
 */
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
        struct io_wq_work_node *next = work->list.next;

        if (next)
                return container_of(next, struct io_wq_work, list);

        return NULL;
}

/*
 * Callback types stored in struct io_wq_data: free_work_fn takes a work
 * item and returns a work item pointer, io_wq_work_fn takes a work item
 * and returns nothing.
 */
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);

/*
 * Reference-counted hash state.  @refs is dropped through io_wq_put_hash(),
 * which frees the structure on the last put.
 * NOTE(review): how @map and @wait are used is not visible here - confirm.
 */
struct io_wq_hash {
        refcount_t refs;
        unsigned long map;
        struct wait_queue_head wait;
};

/* Drop one reference on @hash and kfree() it when that was the last one. */
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
        if (!refcount_dec_and_test(&hash->refs))
                return;

        kfree(hash);
}

/*
 * Parameter bundle handed to io_wq_create(): a hash state, a task, and the
 * two callbacks (do_work, free_work) of the types declared above.
 */
struct io_wq_data {
        struct io_wq_hash *hash;
        struct task_struct *task;
        io_wq_work_fn *do_work;
        free_work_fn *free_work;
};

/* io_wq API; only the prototypes live here, the definitions are elsewhere. */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);

void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);

int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);

/* True when IO_WQ_WORK_HASHED is set in @work->flags. */
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
        return (work->flags & IO_WQ_WORK_HASHED) != 0;
}

/* Callback type passed to io_wq_cancel_cb(): takes a work item and a void pointer, returns bool. */
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                        void *data, bool cancel_all);

/*
 * Worker sleeping/running hooks.  They are real functions only when
 * CONFIG_IO_WQ is defined; otherwise they are empty inline stubs.
 */
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif

/*
 * True only when all three hold: in_task() is true, current has
 * PF_IO_WORKER set in its flags, and current->pf_io_worker is non-zero.
 */
static inline bool io_wq_current_is_worker(void)
{
        if (!in_task())
                return false;
        if (!(current->flags & PF_IO_WORKER))
                return false;
        if (!current->pf_io_worker)
                return false;

        return true;
}
#endif


























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2007, 2008, 2009 Siemens AG
 *
 * Written by:
 * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 */

#ifndef __NET_CFG802154_H
#define __NET_CFG802154_H

#include <linux/ieee802154.h>
#include <linux/netdevice.h>
#include <linux/mutex.h>
#include <linux/bug.h>

#include <net/nl802154.h>

struct wpan_phy;
struct wpan_phy_cca;

#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
struct ieee802154_llsec_device_key;
struct ieee802154_llsec_seclevel;
struct ieee802154_llsec_params;
struct ieee802154_llsec_device;
struct ieee802154_llsec_table;
struct ieee802154_llsec_key_id;
struct ieee802154_llsec_key;
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */

struct cfg802154_ops {
        struct net_device * (*add_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
                                                           const char *name,
                                                           unsigned char name_assign_type,
                                                           int type);
        void        (*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
                                               struct net_device *dev);
        int        (*suspend)(struct wpan_phy *wpan_phy);
        int        (*resume)(struct wpan_phy *wpan_phy);
        int        (*add_virtual_intf)(struct wpan_phy *wpan_phy,
                                    const char *name,
                                    unsigned char name_assign_type,
                                    enum nl802154_iftype type,
                                    __le64 extended_addr);
        int        (*del_virtual_intf)(struct wpan_phy *wpan_phy,
                                    struct wpan_dev *wpan_dev);
        int        (*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel);
        int        (*set_cca_mode)(struct wpan_phy *wpan_phy,
                                const struct wpan_phy_cca *cca);
        int     (*set_cca_ed_level)(struct wpan_phy *wpan_phy, s32 ed_level);
        int     (*set_tx_power)(struct wpan_phy *wpan_phy, s32 power);
        int        (*set_pan_id)(struct wpan_phy *wpan_phy,
                              struct wpan_dev *wpan_dev, __le16 pan_id);
        int        (*set_short_addr)(struct wpan_phy *wpan_phy,
                                  struct wpan_dev *wpan_dev, __le16 short_addr);
        int        (*set_backoff_exponent)(struct wpan_phy *wpan_phy,
                                        struct wpan_dev *wpan_dev, u8 min_be,
                                        u8 max_be);
        int        (*set_max_csma_backoffs)(struct wpan_phy *wpan_phy,
                                         struct wpan_dev *wpan_dev,
                                         u8 max_csma_backoffs);
        int        (*set_max_frame_retries)(struct wpan_phy *wpan_phy,
                                         struct wpan_dev *wpan_dev,
                                         s8 max_frame_retries);
        int        (*set_lbt_mode)(struct wpan_phy *wpan_phy,
                                struct wpan_dev *wpan_dev, bool mode);
        int        (*set_ackreq_default)(struct wpan_phy *wpan_phy,
                                      struct wpan_dev *wpan_dev, bool ackreq);
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
        void        (*get_llsec_table)(struct wpan_phy *wpan_phy,
                                   struct wpan_dev *wpan_dev,
                                   struct ieee802154_llsec_table **table);
        void        (*lock_llsec_table)(struct wpan_phy *wpan_phy,
                                    struct wpan_dev *wpan_dev);
        void        (*unlock_llsec_table)(struct wpan_phy *wpan_phy,
                                      struct wpan_dev *wpan_dev);
        /* TODO remove locking/get table callbacks, this is part of the
         * nl802154 interface and should be accessible from ieee802154 layer.
         */
        int        (*get_llsec_params)(struct wpan_phy *wpan_phy,
                                    struct wpan_dev *wpan_dev,
                                    struct ieee802154_llsec_params *params);
        int        (*set_llsec_params)(struct wpan_phy *wpan_phy,
                                    struct wpan_dev *wpan_dev,
                                    const struct ieee802154_llsec_params *params,
                                    int changed);
        int        (*add_llsec_key)(struct wpan_phy *wpan_phy,
                                 struct wpan_dev *wpan_dev,
                                 const struct ieee802154_llsec_key_id *id,
                                 const struct ieee802154_llsec_key *key);
        int        (*del_llsec_key)(struct wpan_phy *wpan_phy,
                                 struct wpan_dev *wpan_dev,
                                 const struct ieee802154_llsec_key_id *id);
        int        (*add_seclevel)(struct wpan_phy *wpan_phy,
                                 struct wpan_dev *wpan_dev,
                                 const struct ieee802154_llsec_seclevel *sl);
        int        (*del_seclevel)(struct wpan_phy *wpan_phy,
                                 struct wpan_dev *wpan_dev,
                                 const struct ieee802154_llsec_seclevel *sl);
        int        (*add_device)(struct wpan_phy *wpan_phy,
                              struct wpan_dev *wpan_dev,
                              const struct ieee802154_llsec_device *dev);
        int        (*del_device)(struct wpan_phy *wpan_phy,
                              struct wpan_dev *wpan_dev, __le64 extended_addr);
        int        (*add_devkey)(struct wpan_phy *wpan_phy,
                              struct wpan_dev *wpan_dev,
                              __le64 extended_addr,
                              const struct ieee802154_llsec_device_key *key);
        int        (*del_devkey)(struct wpan_phy *wpan_phy,
                              struct wpan_dev *wpan_dev,
                              __le64 extended_addr,
                              const struct ieee802154_llsec_device_key *key);
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};

/*
 * Check whether the boolean value @b is permitted by the tri-state support
 * descriptor @st.  An unrecognised state triggers WARN_ON() and is reported
 * as "not supported".
 */
static inline bool
wpan_phy_supported_bool(bool b, enum nl802154_supported_bool_states st)
{
        if (st == NL802154_SUPPORTED_BOOL_BOTH)
                return true;
        if (st == NL802154_SUPPORTED_BOOL_TRUE)
                return b;
        if (st == NL802154_SUPPORTED_BOOL_FALSE)
                return !b;

        /* Unknown state: complain loudly and refuse. */
        WARN_ON(1);
        return false;
}

/*
 * Capability description of a PHY.  Field meanings below are inferred from
 * the member names -- NOTE(review): confirm against the nl802154 users.
 */
struct wpan_phy_supported {
        /* presumably one channel bitmask per channel page -- confirm */
        u32 channels[IEEE802154_MAX_PAGE + 1];
        u32 cca_modes;
        u32 cca_opts;
        u32 iftypes;

        enum nl802154_supported_bool_states lbt;

        /* lower/upper bounds for the backoff exponents */
        u8 min_minbe;
        u8 max_minbe;
        u8 min_maxbe;
        u8 max_maxbe;

        /* lower/upper bounds for the CSMA backoff count */
        u8 min_csma_backoffs;
        u8 max_csma_backoffs;

        /* lower/upper bounds for the frame retry count */
        s8 min_frame_retries;
        s8 max_frame_retries;

        /* element counts of the two tables below */
        size_t tx_powers_size;
        size_t cca_ed_levels_size;

        const s32 *tx_powers;
        const s32 *cca_ed_levels;
};

/* A CCA configuration: a mode plus a mode-specific option. */
struct wpan_phy_cca {
        enum nl802154_cca_modes mode;
        /* wpan_phy_cca_cmp() only looks at this for NL802154_CCA_ENERGY_CARRIER */
        enum nl802154_cca_opts opt;
};

/*
 * Compare two CCA configurations.
 *
 * Returns true when @a and @b describe the same configuration: the modes
 * must match, and for NL802154_CCA_ENERGY_CARRIER the options must match as
 * well.  For every other mode the option is ignored.
 */
static inline bool
wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b)
{
        /* Differing modes can never be equal. */
        if (a->mode != b->mode)
                return false;

        /* The option is only significant in energy+carrier mode. */
        return a->mode != NL802154_CCA_ENERGY_CARRIER || a->opt == b->opt;
}

/**
 * enum wpan_phy_flags - transceiver capability flags
 *
 * @WPAN_PHY_FLAG_TXPOWER: Indicates that transceiver will support
 *        transmit power setting.
 * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed
 *        level setting.
 * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode
 *        setting.
 */
enum wpan_phy_flags {
        WPAN_PHY_FLAG_TXPOWER                = BIT(1),
        WPAN_PHY_FLAG_CCA_ED_LEVEL        = BIT(2),
        WPAN_PHY_FLAG_CCA_MODE                = BIT(3),
};

/*
 * Representation of one IEEE 802.15.4 PHY.  The structure ends in a flexible
 * array (priv) that wpan_phy_priv() hands out as the driver-private area.
 */
struct wpan_phy {
        /* If multiple wpan_phys are registered and you're handed e.g.
         * a regular netdev with assigned ieee802154_ptr, you won't
         * know whether it points to a wpan_phy your driver has registered
         * or not. Assign this to something global to your driver to
         * help determine whether you own this wpan_phy or not.
         */
        const void *privid;

        /* presumably a mask of enum wpan_phy_flags bits -- confirm */
        u32 flags;

        /*
         * This is a PIB according to 802.15.4-2011.
         * We do not provide timing-related variables, as they
         * aren't used outside of the driver.
         */
        u8 current_channel;
        u8 current_page;
        struct wpan_phy_supported supported;
        /* current transmit_power in mBm */
        s32 transmit_power;
        struct wpan_phy_cca cca;

        __le64 perm_extended_addr;

        /* current cca ed threshold in mBm */
        s32 cca_ed_level;

        /* PHY dependent MAC PIB values */

        /* 802.15.4 acronym: Tdsym in usec */
        u8 symbol_duration;
        /* lifs and sifs periods timing */
        u16 lifs_period;
        u16 sifs_period;

        /* embedded device; to_phy() maps it back to the wpan_phy */
        struct device dev;

        /* the network namespace this phy lives in currently */
        possible_net_t _net;

        /* driver-private trailing storage, aligned to NETDEV_ALIGN */
        char priv[] __aligned(NETDEV_ALIGN);
};

/* Return the network namespace stored in @wpan_phy->_net. */
static inline struct net *wpan_phy_net(struct wpan_phy *wpan_phy)
{
        return read_pnet(&wpan_phy->_net);
}

/* Record @net as the network namespace of @wpan_phy (stored in ->_net). */
static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net)
{
        write_pnet(&wpan_phy->_net, net);
}

/*
 * An 802.15.4 address: PAN id plus either a 16-bit short or a 64-bit
 * extended address (the two share storage).
 */
struct ieee802154_addr {
        /* presumably selects which union member is valid -- confirm */
        u8 mode;
        __le16 pan_id;
        union {
                __le16 short_addr;
                __le64 extended_addr;
        };
};

/*
 * Identifier of a link-layer security key.  The union holds one of three
 * alternative key-source representations sharing the same storage.
 */
struct ieee802154_llsec_key_id {
        /* presumably selects which union member is meaningful -- confirm */
        u8 mode;
        u8 id;
        union {
                struct ieee802154_addr device_addr;
                __le32 short_source;
                __le64 extended_source;
        };
};

/* Size in bytes of the key[] array in struct ieee802154_llsec_key. */
#define IEEE802154_LLSEC_KEY_SIZE 16

/* Key material together with the frame types / command ids it applies to. */
struct ieee802154_llsec_key {
        /* presumably bitmasks of frame types and command frame ids -- confirm */
        u8 frame_types;
        u32 cmd_frame_ids;
        /* TODO replace with NL802154_KEY_SIZE */
        u8 key[IEEE802154_LLSEC_KEY_SIZE];
};

/* List entry associating a key id with a pointer to the key itself. */
struct ieee802154_llsec_key_entry {
        struct list_head list;
        /* NOTE(review): presumably used for RCU-deferred freeing -- confirm */
        struct rcu_head rcu;

        struct ieee802154_llsec_key_id id;
        struct ieee802154_llsec_key *key;
};

/* Link-layer security parameters of an interface. */
struct ieee802154_llsec_params {
        bool enabled;

        /* note: big-endian, unlike the u32 counters in the device structs */
        __be32 frame_counter;
        u8 out_level;
        struct ieee802154_llsec_key_id out_key;

        __le64 default_key_source;

        __le16 pan_id;
        __le64 hwaddr;
        __le64 coord_hwaddr;
        __le16 coord_shortaddr;
};

/*
 * Heads of the three link-layer security lists.
 * NOTE(review): element types are presumably ieee802154_llsec_key_entry,
 * ieee802154_llsec_device and ieee802154_llsec_seclevel -- confirm.
 */
struct ieee802154_llsec_table {
        struct list_head keys;
        struct list_head devices;
        struct list_head security_levels;
};

/* One security-level descriptor, linkable into a list via @list. */
struct ieee802154_llsec_seclevel {
        struct list_head list;

        u8 frame_type;
        u8 cmd_frame_id;
        bool device_override;
        /* presumably a bitmask of permitted security levels -- confirm */
        u32 sec_levels;
};

/* One peer-device descriptor, linkable via @list; owns a list of @keys. */
struct ieee802154_llsec_device {
        struct list_head list;

        __le16 pan_id;
        __le16 short_addr;
        __le64 hwaddr;
        u32 frame_counter;
        bool seclevel_exempt;

        u8 key_mode;
        /* presumably a list of struct ieee802154_llsec_device_key -- confirm */
        struct list_head keys;
};

/* A key id plus a frame counter, linkable into a list via @list. */
struct ieee802154_llsec_device_key {
        struct list_head list;

        struct ieee802154_llsec_key_id key_id;
        u32 frame_counter;
};

/* Header operations of a wpan_dev; invoked through wpan_dev_hard_header(). */
struct wpan_dev_header_ops {
        /* TODO: the create callback currently assumes an ieee802154_mac_cb
         * inside skb->cb. This should be changed to pass that information
         * as a parameter.
         */
        int        (*create)(struct sk_buff *skb, struct net_device *dev,
                          const struct ieee802154_addr *daddr,
                          const struct ieee802154_addr *saddr,
                          unsigned int len);
};

/* Per-interface 802.15.4 state, attached to a wpan_phy. */
struct wpan_dev {
        struct wpan_phy *wpan_phy;
        /* presumably an enum nl802154_iftype value -- confirm */
        int iftype;

        /* the remainder of this struct should be private to cfg802154 */
        struct list_head list;
        struct net_device *netdev;

        /* dereferenced unconditionally by wpan_dev_hard_header() */
        const struct wpan_dev_header_ops *header_ops;

        /* lowpan interface, set when the wpan_dev belongs to one lowpan_dev */
        struct net_device *lowpan_dev;

        u32 identifier;

        /* MAC PIB */
        __le16 pan_id;
        __le16 short_addr;
        __le64 extended_addr;

        /* MAC BSN field */
        atomic_t bsn;
        /* MAC DSN field */
        atomic_t dsn;

        u8 min_be;
        u8 max_be;
        u8 csma_retries;
        s8 frame_retries;

        bool lbt;

        bool promiscuous_mode;

        /* fallback for acknowledgment bit setting */
        bool ackreq;
};

/* Map a pointer to the embedded 'dev' member back to its struct wpan_phy. */
#define to_phy(_dev)        container_of(_dev, struct wpan_phy, dev)

static inline int
wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                     const struct ieee802154_addr *daddr,
                     const struct ieee802154_addr *saddr,
                     unsigned int len)
{
        struct wpan_dev *wpan_dev = dev->ieee802154_ptr;

        return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len);
}

struct wpan_phy *
wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size);
/* Make @dev the parent of the device embedded in @phy. */
static inline void wpan_phy_set_dev(struct wpan_phy *phy, struct device *dev)
{
        phy->dev.parent = dev;
}

int wpan_phy_register(struct wpan_phy *phy);
void wpan_phy_unregister(struct wpan_phy *phy);
void wpan_phy_free(struct wpan_phy *phy);
/* Same semantics as for class_for_each_device */
int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), void *data);

/*
 * Return the driver-private area that trails @phy (the priv[] flexible
 * array).  Passing a NULL @phy is a bug and trips BUG_ON().
 */
static inline void *wpan_phy_priv(struct wpan_phy *phy)
{
        BUG_ON(!phy);

        /* the array decays to the address of its first byte */
        return phy->priv;
}

struct wpan_phy *wpan_phy_find(const char *str);

/* Drop a reference on the device embedded in @phy via put_device(). */
static inline void wpan_phy_put(struct wpan_phy *phy)
{
        put_device(&phy->dev);
}

/* Return the name of the device embedded in @phy, as given by dev_name(). */
static inline const char *wpan_phy_name(struct wpan_phy *phy)
{
        return dev_name(&phy->dev);
}

#endif /* __NET_CFG802154_H */






























































































































































    1 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>

/*
 * There are two chunk types: root and memcg-aware.
 * Chunks of each type have a separate list of slots.
 *
 * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
 * used to store memcg membership data of a percpu object.  Obj_cgroups are
 * ref-counted pointers to a memory cgroup with an ability to switch dynamically
 * to the parent memory cgroup.  This allows reclaiming a deleted memory cgroup
 * without reclaiming all outstanding objects which hold a reference to it.
 */
enum pcpu_chunk_type {
        PCPU_CHUNK_ROOT,
#ifdef CONFIG_MEMCG_KMEM
        PCPU_CHUNK_MEMCG,
#endif
        /* number of real chunk types; depends on CONFIG_MEMCG_KMEM */
        PCPU_NR_CHUNK_TYPES,
        /* alias of PCPU_NR_CHUNK_TYPES, i.e. not a valid chunk type */
        PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
};

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of first free bit */
        int                        nr_bits;        /* total bits this block is
                                                   responsible for */
};

/*
 * Bookkeeping for one percpu chunk.  The structure ends in the flexible
 * array populated[], so it has to be allocated with trailing storage for
 * that bitmap.
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;        /* NOTE(review): presumably
                                                   chunk-wide hints -- confirm */
        void                        *base_addr;        /* base address of this chunk */

        unsigned long                *alloc_map;        /* allocation map */
        unsigned long                *bound_map;        /* boundary map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup        **obj_cgroups;        /* vector of object cgroups */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

extern spinlock_t pcpu_lock;

extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages[];

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
 * @chunk: chunk of interest
 *
 * This conversion is from the number of physical pages that the chunk
 * serves to the number of bitmap blocks used.
 *
 * Return: chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE, using
 * truncating integer division.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
 * @pages: number of physical pages
 *
 * This conversion is from physical pages to the number of bits
 * required in the bitmap.
 *
 * Return: @pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE, i.e. one bit per
 * PCPU_MIN_ALLOC_SIZE bytes, using truncating integer division.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
 * @chunk: chunk of interest
 *
 * This conversion is from the number of physical pages that the chunk
 * serves to the number of bits in the bitmap.
 *
 * Return: pcpu_nr_pages_to_map_bits() applied to chunk->nr_pages.
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Classify @chunk: a chunk with an obj_cgroups vector attached is
 * memcg-aware, every other chunk is a root chunk.
 */
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
{
        return chunk->obj_cgroups ? PCPU_CHUNK_MEMCG : PCPU_CHUNK_ROOT;
}

/* Return true iff @chunk_type is PCPU_CHUNK_MEMCG. */
static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
{
        return chunk_type == PCPU_CHUNK_MEMCG;
}

#else
/* Without CONFIG_MEMCG_KMEM every chunk is a root chunk. */
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
{
        return PCPU_CHUNK_ROOT;
}

/* Without CONFIG_MEMCG_KMEM no chunk type is memcg-aware. */
static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
{
        return false;
}
#endif

/*
 * Return the first slot list for chunks of @chunk_type.
 *
 * pcpu_chunk_lists is indexed so that memcg-aware chunks start at offset
 * pcpu_nr_slots, while all other chunks start at offset 0.
 */
static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
{
        if (pcpu_is_memcg_chunk(chunk_type))
                return &pcpu_chunk_lists[pcpu_nr_slots];

        return &pcpu_chunk_lists[0];
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/* Global percpu allocator statistics (see pcpu_stats). */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * For debug purposes. We don't care about the flexible array.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));

        /* initialize min_alloc_size to unit_size */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - account for a newly allocated area
 * @chunk: the chunk the area was allocated from
 * @size: size of the allocated area in bytes
 *
 * Bumps the global and per-chunk allocation counters and refreshes the
 * high-water mark of live allocations and the min/max size watermarks.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_alloc++;
        pcpu_stats.nr_cur_alloc++;

        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        chunk->nr_alloc++;
        if (size > chunk->max_alloc_size)
                chunk->max_alloc_size = size;
}

/*
 * pcpu_stats_area_dealloc - decrement allocation stats
 * @chunk: the location of the area being deallocated
 *
 * Increments the lifetime deallocation count and decrements both the global
 * and the per-chunk count of live allocations.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_dealloc++;
        pcpu_stats.nr_cur_alloc--;

        chunk->nr_alloc--;
}

/*
 * pcpu_stats_chunk_alloc - account for a newly created chunk
 *
 * Takes pcpu_lock itself (IRQ-safe), bumps the live chunk count and raises
 * the high-water mark if it has been exceeded.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        pcpu_stats.nr_chunks++;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

/*
 * pcpu_stats_chunk_dealloc - decrement chunk stats
 *
 * Takes pcpu_lock itself (IRQ-safe) around the update of the live chunk
 * count, so callers must not already hold it.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long flags;
        spin_lock_irqsave(&pcpu_lock, flags);

        pcpu_stats.nr_chunks--;

        spin_unlock_irqrestore(&pcpu_lock, flags);
}

#else

/*
 * Empty stand-ins for the statistics hooks, used when CONFIG_PERCPU_STATS
 * is not set so that callers need no #ifdefs.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */

#endif




















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KASAN_H
#define _LINUX_KASAN_H

#include <linux/types.h>

struct kmem_cache;
struct page;
struct vm_struct;
struct task_struct;

#ifdef CONFIG_KASAN

#include <linux/pgtable.h>
#include <asm/kasan.h>

/*
 * struct kunit_kasan_expectation is used in KUnit tests for KASAN expected
 * failures: one flag for whether a report is expected, one for whether a
 * report was found.
 */
struct kunit_kasan_expectation {
        bool report_expected;
        bool report_found;
};

extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE];
extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];

int kasan_populate_early_shadow(const void *shadow_start,
                                const void *shadow_end);

/*
 * Translate @addr into its shadow address:
 * (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET.
 *
 * Note that the (void *) cast binds to the shifted value only, so the offset
 * is added with void-pointer arithmetic (a GNU C extension, byte granular).
 */
static inline void *kasan_mem_to_shadow(const void *addr)
{
        return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
                + KASAN_SHADOW_OFFSET;
}

/* Enable reporting bugs after kasan_disable_current() */
extern void kasan_enable_current(void);

/* Disable reporting bugs for current task */
extern void kasan_disable_current(void);

void kasan_unpoison_shadow(const void *address, size_t size);

void kasan_unpoison_task_stack(struct task_struct *task);

void kasan_alloc_pages(struct page *page, unsigned int order);
void kasan_free_pages(struct page *page, unsigned int order);

void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
                        slab_flags_t *flags);

void kasan_poison_slab(struct page *page);
void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
void kasan_poison_object_data(struct kmem_cache *cache, void *object);
void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
                                        const void *object);

void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
                                                gfp_t flags);
void kasan_kfree_large(void *ptr, unsigned long ip);
void kasan_poison_kfree(void *ptr, unsigned long ip);
void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object,
                                        size_t size, gfp_t flags);
void * __must_check kasan_krealloc(const void *object, size_t new_size,
                                        gfp_t flags);

void * __must_check kasan_slab_alloc(struct kmem_cache *s, void *object,
                                        gfp_t flags);
bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);

/*
 * Per-cache KASAN bookkeeping.
 * NOTE(review): the offsets presumably locate the alloc/free metadata
 * relative to an object -- confirm against mm/kasan.
 */
struct kasan_cache {
        int alloc_meta_offset;
        int free_meta_offset;
};

/*
 * These functions provide a special case to support backing module
 * allocations with real shadow memory. With KASAN vmalloc, the special
 * case is unnecessary, as the work is handled in the generic case.
 */
#ifndef CONFIG_KASAN_VMALLOC
int kasan_module_alloc(void *addr, size_t size);
void kasan_free_shadow(const struct vm_struct *vm);
#else
/* With CONFIG_KASAN_VMALLOC the module special case is not needed: no-ops. */
static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
#endif

int kasan_add_zero_shadow(void *start, unsigned long size);
void kasan_remove_zero_shadow(void *start, unsigned long size);

size_t __ksize(const void *);
/* Unpoison the shadow for the __ksize(@ptr) bytes starting at @ptr. */
static inline void kasan_unpoison_slab(const void *ptr)
{
        kasan_unpoison_shadow(ptr, __ksize(ptr));
}
size_t kasan_metadata_size(struct kmem_cache *cache);

bool kasan_save_enable_multi_shot(void);
void kasan_restore_multi_shot(bool enabled);

#else /* CONFIG_KASAN */

/*
 * CONFIG_KASAN is disabled: the hooks below are empty so that callers need
 * no #ifdefs.
 */
static inline void kasan_unpoison_shadow(const void *address, size_t size) {}

static inline void kasan_unpoison_task_stack(struct task_struct *task) {}

static inline void kasan_enable_current(void) {}
static inline void kasan_disable_current(void) {}

static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
static inline void kasan_free_pages(struct page *page, unsigned int order) {}

/* Does not touch *size or *flags. */
static inline void kasan_cache_create(struct kmem_cache *cache,
                                      unsigned int *size,
                                      slab_flags_t *flags) {}

static inline void kasan_poison_slab(struct page *page) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
                                        void *object) {}
static inline void kasan_poison_object_data(struct kmem_cache *cache,
                                        void *object) {}
/*
 * !CONFIG_KASAN stub: hands @object back unchanged.  The cast only drops
 * the const qualifier so the return type matches the real implementation.
 */
static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
                                const void *object)
{
        return (void *)object;
}

/*
 * !CONFIG_KASAN stub: hands @ptr back unchanged.
 *
 * @ptr is const-qualified to match the CONFIG_KASAN prototype of this
 * function, so that passing a const pointer compiles the same way in both
 * configurations.  The cast only drops the qualifier for the return type.
 */
static inline void *kasan_kmalloc_large(const void *ptr, size_t size,
                                        gfp_t flags)
{
        return (void *)ptr;
}
/* !CONFIG_KASAN: freeing hooks do nothing. */
static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {}
/* !CONFIG_KASAN: returns @object unchanged (const dropped by the cast). */
static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object,
                                size_t size, gfp_t flags)
{
        return (void *)object;
}
/* !CONFIG_KASAN: returns @object unchanged (const dropped by the cast). */
static inline void *kasan_krealloc(const void *object, size_t new_size,
                                 gfp_t flags)
{
        return (void *)object;
}

/* !CONFIG_KASAN: returns @object unchanged. */
static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
                                   gfp_t flags)
{
        return object;
}
/* !CONFIG_KASAN: always returns false. */
static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
                                   unsigned long ip)
{
        return false;
}

/* !CONFIG_KASAN: nothing to allocate or free; always reports success (0). */
static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}

/* !CONFIG_KASAN: no shadow to map; always reports success (0). */
static inline int kasan_add_zero_shadow(void *start, unsigned long size)
{
        return 0;
}
static inline void kasan_remove_zero_shadow(void *start,
                                        unsigned long size)
{}

static inline void kasan_unpoison_slab(const void *ptr) { }
/* !CONFIG_KASAN: reports a metadata size of 0. */
static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }

#endif /* CONFIG_KASAN */

#ifdef CONFIG_KASAN_GENERIC

#define KASAN_SHADOW_INIT 0

void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_record_aux_stack(void *ptr);

#else /* CONFIG_KASAN_GENERIC */

/* CONFIG_KASAN_GENERIC is disabled: these hooks are empty. */
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_record_aux_stack(void *ptr) {}

#endif /* CONFIG_KASAN_GENERIC */

#ifdef CONFIG_KASAN_SW_TAGS

#define KASAN_SHADOW_INIT 0xFF

void kasan_init_tags(void);

void *kasan_reset_tag(const void *addr);

bool kasan_report(unsigned long addr, size_t size,
                bool is_write, unsigned long ip);

#else /* CONFIG_KASAN_SW_TAGS */

/* CONFIG_KASAN_SW_TAGS is disabled: nothing to initialise. */
static inline void kasan_init_tags(void) { }

/* CONFIG_KASAN_SW_TAGS is disabled: returns @addr unchanged (const dropped). */
static inline void *kasan_reset_tag(const void *addr)
{
        return (void *)addr;
}

#endif /* CONFIG_KASAN_SW_TAGS */

#ifdef CONFIG_KASAN_VMALLOC
int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
void kasan_poison_vmalloc(const void *start, unsigned long size);
void kasan_unpoison_vmalloc(const void *start, unsigned long size);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
                           unsigned long free_region_start,
                           unsigned long free_region_end);
#else
/* CONFIG_KASAN_VMALLOC is disabled: nothing to populate; always returns 0. */
static inline int kasan_populate_vmalloc(unsigned long start,
                                        unsigned long size)
{
        return 0;
}

/* CONFIG_KASAN_VMALLOC is disabled: the remaining vmalloc hooks are empty. */
static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
{ }
static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size)
{ }
static inline void kasan_release_vmalloc(unsigned long start,
                                         unsigned long end,
                                         unsigned long free_region_start,
                                         unsigned long free_region_end) {}
#endif

#ifdef CONFIG_KASAN
void kasan_non_canonical_hook(unsigned long addr);
#else /* CONFIG_KASAN */
/* CONFIG_KASAN is disabled: the hook is empty. */
static inline void kasan_non_canonical_hook(unsigned long addr) { }
#endif /* CONFIG_KASAN */

#endif /* _LINUX_KASAN_H */








































































































































































































































































































    1 


    1 



































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_H
#define _LINUX_HIGHMEM_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>

#include <asm/cacheflush.h>

#ifndef ARCH_HAS_FLUSH_ANON_PAGE
/*
 * Default no-op, used when the architecture does not define
 * ARCH_HAS_FLUSH_ANON_PAGE.
 */
static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr)
{
}
#endif

#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
/*
 * Default no-ops for the kernel dcache / vmap-range flush hooks, used when
 * the architecture does not define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE.
 */
static inline void flush_kernel_dcache_page(struct page *page)
{
}
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif

#include <asm/kmap_types.h>

#ifdef CONFIG_HIGHMEM
extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
extern void kunmap_atomic_high(void *kvaddr);
#include <asm/highmem.h>

#ifndef ARCH_HAS_KMAP_FLUSH_TLB
/*
 * Default no-op; kmap() calls this with the address it is about to return.
 * Architectures defining ARCH_HAS_KMAP_FLUSH_TLB supply their own version.
 */
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

/*
 * Page protection that kmap_atomic() passes to kmap_atomic_prot().  Falls
 * back to PAGE_KERNEL unless <asm/highmem.h> already defined kmap_prot.
 */
#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

void *kmap_high(struct page *page);

/*
 * Return a kernel virtual address for @page.  The function is annotated with
 * might_sleep().  Pages that are not highmem are resolved with
 * page_address(); highmem pages go through kmap_high().  Whatever address
 * results is handed to kmap_flush_tlb() before being returned.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = PageHighMem(page) ? kmap_high(page) : page_address(page);
        kmap_flush_tlb((unsigned long)vaddr);

        return vaddr;
}

void kunmap_high(struct page *page);

/*
 * Counterpart of kmap().  Annotated with might_sleep().  Only highmem pages
 * are passed on to kunmap_high(); for any other page there is nothing to do.
 */
static inline void kunmap(struct page *page)
{
        might_sleep();
        if (PageHighMem(page))
                kunmap_high(page);
}

/*
 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
 * no global lock is needed and because the kmap code must perform a global TLB
 * invalidation when the kmap pool wraps.
 *
 * However when holding an atomic kmap it is not legal to sleep, so atomic
 * kmaps are appropriate for short, tight code paths only.
 *
 * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
 * gives a more generic (and caching) interface. But kmap_atomic can
 * be used in IRQ contexts, so in some (very limited) cases we need
 * it.
 */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        /* Both are undone by kunmap_atomic(), in the opposite order. */
        preempt_disable();
        pagefault_disable();

        /* @prot only matters for highmem pages; others use page_address(). */
        return PageHighMem(page) ? kmap_atomic_high_prot(page, prot)
                                 : page_address(page);
}
#define kmap_atomic(page)        kmap_atomic_prot(page, kmap_prot)

/* declarations for linux/mm/highmem.c */
unsigned int nr_free_highpages(void);
extern atomic_long_t _totalhigh_pages;
/* Read the _totalhigh_pages counter and return it as an unsigned long. */
static inline unsigned long totalhigh_pages(void)
{
        long nr_pages = atomic_long_read(&_totalhigh_pages);

        return (unsigned long)nr_pages;
}

/* Atomically increment the _totalhigh_pages counter by one. */
static inline void totalhigh_pages_inc(void)
{
        atomic_long_inc(&_totalhigh_pages);
}

/* Atomically decrement the _totalhigh_pages counter by one. */
static inline void totalhigh_pages_dec(void)
{
        atomic_long_dec(&_totalhigh_pages);
}

/* Atomically add @count (which is signed) to the _totalhigh_pages counter. */
static inline void totalhigh_pages_add(long count)
{
        atomic_long_add(count, &_totalhigh_pages);
}

/* Atomically overwrite the _totalhigh_pages counter with @val. */
static inline void totalhigh_pages_set(long val)
{
        atomic_long_set(&_totalhigh_pages, val);
}

void kmap_flush_unused(void);

struct page *kmap_to_page(void *addr);

#else /* CONFIG_HIGHMEM */

/* !CONFIG_HIGHMEM stub: always reports zero free highmem pages. */
static inline unsigned int nr_free_highpages(void)
{
        return 0;
}

/*
 * !CONFIG_HIGHMEM variant: the address is translated directly with
 * virt_to_page().
 */
static inline struct page *kmap_to_page(void *addr)
{
        return virt_to_page(addr);
}

/* !CONFIG_HIGHMEM stub: the highmem page total is always zero. */
static inline unsigned long totalhigh_pages(void)
{
        return 0UL;
}

/*
 * !CONFIG_HIGHMEM variant: keeps the might_sleep() annotation of the highmem
 * version and simply returns page_address() of @page.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = page_address(page);

        return vaddr;
}

/* !CONFIG_HIGHMEM stub: nothing to unmap. */
static inline void kunmap_high(struct page *page)
{
}

/*
 * !CONFIG_HIGHMEM variant: there is no mapping to tear down.  Only when the
 * architecture defines ARCH_HAS_FLUSH_ON_KUNMAP is the page's address passed
 * to kunmap_flush_on_unmap(); otherwise the body is empty.
 */
static inline void kunmap(struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(page_address(page));
#endif
}

/*
 * !CONFIG_HIGHMEM variant: disables preemption and page faults exactly like
 * the highmem version, then returns page_address() of @page.
 */
static inline void *kmap_atomic(struct page *page)
{
        void *vaddr;

        preempt_disable();
        pagefault_disable();
        vaddr = page_address(page);

        return vaddr;
}
#define kmap_atomic_prot(page, prot)        kmap_atomic(page)

/*
 * !CONFIG_HIGHMEM variant of the helper invoked by the kunmap_atomic() macro.
 * @addr is only used when ARCH_HAS_FLUSH_ON_KUNMAP is defined, in which case
 * it is forwarded to kunmap_flush_on_unmap().
 */
static inline void kunmap_atomic_high(void *addr)
{
        /*
         * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic()
         * handles re-enabling faults + preemption
         */
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(addr);
#endif
}

/* !CONFIG_HIGHMEM: map by page frame number via pfn_to_page() + kmap_atomic(). */
#define kmap_atomic_pfn(pfn)        kmap_atomic(pfn_to_page(pfn))

/* !CONFIG_HIGHMEM: expands to an empty statement. */
#define kmap_flush_unused()        do {} while(0)

#endif /* CONFIG_HIGHMEM */

#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)

DECLARE_PER_CPU(int, __kmap_atomic_idx);

/*
 * Increment this CPU's __kmap_atomic_idx counter and return the index to use,
 * which is the incremented counter value minus one.
 */
static inline int kmap_atomic_idx_push(void)
{
        int depth = __this_cpu_inc_return(__kmap_atomic_idx);
        int idx = depth - 1;

#ifdef CONFIG_DEBUG_HIGHMEM
        /* Debug-only sanity checks on the calling context and the index. */
        WARN_ON_ONCE(in_irq() && !irqs_disabled());
        BUG_ON(idx >= KM_TYPE_NR);
#endif
        return idx;
}

/*
 * Return the current index without modifying the counter: this CPU's
 * __kmap_atomic_idx value minus one.
 */
static inline int kmap_atomic_idx(void)
{
        int depth = __this_cpu_read(__kmap_atomic_idx);

        return depth - 1;
}

/*
 * Decrement this CPU's __kmap_atomic_idx counter.  With CONFIG_DEBUG_HIGHMEM
 * the decremented value is additionally checked and a negative result
 * triggers BUG_ON().
 */
static inline void kmap_atomic_idx_pop(void)
{
#ifdef CONFIG_DEBUG_HIGHMEM
        int idx = __this_cpu_dec_return(__kmap_atomic_idx);

        BUG_ON(idx < 0);
#else
        __this_cpu_dec(__kmap_atomic_idx);
#endif
}

#endif

/*
 * Prevent people trying to call kunmap_atomic() as if it were kunmap():
 * kunmap_atomic() should get the return value of kmap_atomic(), not the page.
 * The BUILD_BUG_ON() rejects a struct page * argument at compile time.
 *
 * After kunmap_atomic_high() the macro re-enables page faults and then
 * preemption, i.e. it undoes the preempt_disable()/pagefault_disable() pair
 * done by kmap_atomic() in reverse order.
 */
#define kunmap_atomic(addr)                                     \
do {                                                            \
        BUILD_BUG_ON(__same_type((addr), struct page *));       \
        kunmap_atomic_high(addr);                                  \
        pagefault_enable();                                     \
        preempt_enable();                                       \
} while (0)


/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
#ifndef clear_user_highpage
/*
 * Clear @page on behalf of the user mapping at @vaddr: the page is mapped
 * atomically, cleared with clear_user_page() and unmapped again.
 */
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
        void *kaddr;

        kaddr = kmap_atomic(page);
        clear_user_page(kaddr, vaddr, page);
        kunmap_atomic(kaddr);
}
#endif

#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/**
 * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags
 * @movableflags: GFP flags describing the page's future ability to move, such as __GFP_MOVABLE
 * @vma: The VMA the page is to be allocated for
 * @vaddr: The virtual address the page will be inserted into
 *
 * The page is allocated with GFP_HIGHUSER or'ed with @movableflags, so the
 * caller is expected to state via @movableflags whether the page will be
 * movable in the future or not.  A successfully allocated page is cleared
 * with clear_user_highpage() before it is returned.
 *
 * An architecture may override this function by defining
 * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing its own
 * implementation.
 *
 * Return: the zeroed page, or NULL when alloc_page_vma() returned NULL.
 */
static inline struct page *
__alloc_zeroed_user_highpage(gfp_t movableflags,
                        struct vm_area_struct *vma,
                        unsigned long vaddr)
{
        struct page *page;

        page = alloc_page_vma(GFP_HIGHUSER | movableflags, vma, vaddr);
        if (!page)
                return NULL;

        clear_user_highpage(page, vaddr);
        return page;
}
#endif

/**
 * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
 * @vma: The VMA the page is to be allocated for
 * @vaddr: The virtual address the page will be inserted into
 *
 * This function will allocate a page for a VMA that the caller knows will
 * be able to be migrated in the future using move_pages(), or reclaimed.
 * It is __alloc_zeroed_user_highpage() called with __GFP_MOVABLE.
 */
static inline struct page *
alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
                                        unsigned long vaddr)
{
        return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
}

/* Map @page atomically, clear it with clear_page() and unmap it again. */
static inline void clear_highpage(struct page *page)
{
        void *vaddr;

        vaddr = kmap_atomic(page);
        clear_page(vaddr);
        kunmap_atomic(vaddr);
}

/*
 * Zero up to two byte ranges, [start1, end1) and [start2, end2), inside
 * @page.  A range whose end does not lie beyond its start is skipped.  Either
 * end exceeding PAGE_SIZE triggers BUG_ON().  The data cache for @page is
 * flushed after the atomic mapping has been dropped.
 */
static inline void zero_user_segments(struct page *page,
        unsigned start1, unsigned end1,
        unsigned start2, unsigned end2)
{
        void *base = kmap_atomic(page);

        BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE);

        if (start1 < end1)
                memset(base + start1, 0, end1 - start1);

        if (start2 < end2)
                memset(base + start2, 0, end2 - start2);

        kunmap_atomic(base);
        flush_dcache_page(page);
}

/*
 * Zero the single byte range [start, end) of @page.  Implemented as
 * zero_user_segments() with an empty second range.
 */
static inline void zero_user_segment(struct page *page,
        unsigned start, unsigned end)
{
        zero_user_segments(page, start, end, 0, 0);
}

/*
 * Zero @size bytes of @page beginning at byte offset @start.  Implemented as
 * zero_user_segments() over [start, start + size) with an empty second range.
 */
static inline void zero_user(struct page *page,
        unsigned start, unsigned size)
{
        zero_user_segments(page, start, start + size, 0, 0);
}

#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE

/*
 * Copy the contents of @from into @to with copy_user_page().  Both pages are
 * mapped atomically for the duration of the copy.  @vma is not used by this
 * generic version.
 */
static inline void copy_user_highpage(struct page *to, struct page *from,
        unsigned long vaddr, struct vm_area_struct *vma)
{
        char *src = kmap_atomic(from);
        char *dst = kmap_atomic(to);

        copy_user_page(dst, src, vaddr, to);

        /* Release the mappings in the reverse order they were taken. */
        kunmap_atomic(dst);
        kunmap_atomic(src);
}

#endif

#ifndef __HAVE_ARCH_COPY_HIGHPAGE

/*
 * Copy the contents of @from into @to with copy_page().  Both pages are
 * mapped atomically for the duration of the copy.
 */
static inline void copy_highpage(struct page *to, struct page *from)
{
        char *src = kmap_atomic(from);
        char *dst = kmap_atomic(to);

        copy_page(dst, src);

        /* Release the mappings in the reverse order they were taken. */
        kunmap_atomic(dst);
        kunmap_atomic(src);
}

#endif

/*
 * Copy @len bytes, starting @offset bytes into @page, to the buffer @to.
 * The page is mapped atomically around the memcpy().
 */
static inline void memcpy_from_page(char *to, struct page *page,
                                    size_t offset, size_t len)
{
        char *kaddr = kmap_atomic(page);
        const char *src = kaddr + offset;

        memcpy(to, src, len);
        kunmap_atomic(kaddr);
}

/*
 * Copy @len bytes from the buffer @from into @page, starting @offset bytes
 * into the page.  The page is mapped atomically around the memcpy().
 */
static inline void memcpy_to_page(struct page *page, size_t offset,
                                  const char *from, size_t len)
{
        char *kaddr = kmap_atomic(page);
        char *dst = kaddr + offset;

        memcpy(dst, from, len);
        kunmap_atomic(kaddr);
}

#endif /* _LINUX_HIGHMEM_H */





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SECCOMP_H
#define _LINUX_SECCOMP_H

#include <uapi/linux/seccomp.h>

/*
 * Bitwise OR of the SECCOMP_FILTER_FLAG_* values listed here.
 * NOTE(review): presumably the set of flags accepted when installing a
 * filter; the users of this mask are not visible in this header.
 */
#define SECCOMP_FILTER_FLAG_MASK        (SECCOMP_FILTER_FLAG_TSYNC | \
                                         SECCOMP_FILTER_FLAG_LOG | \
                                         SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
                                         SECCOMP_FILTER_FLAG_NEW_LISTENER | \
                                         SECCOMP_FILTER_FLAG_TSYNC_ESRCH)

/*
 * sizeof() the first published struct seccomp_notif_addfd.
 * SECCOMP_NOTIFY_ADDFD_SIZE_LATEST currently aliases that first version.
 */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0

#ifdef CONFIG_SECCOMP

#include <linux/thread_info.h>
#include <linux/atomic.h>
#include <asm/seccomp.h>

struct seccomp_filter;
/**
 * struct seccomp - the state of a seccomp'ed process
 *
 * @mode:  indicates one of the valid SECCOMP_MODE_* values (for example
 *         SECCOMP_MODE_DISABLED) for controlled system calls available to a
 *         process.
 * @filter_count: atomic counter.  NOTE(review): presumably the number of
 *         filters attached to the task; this header does not establish that,
 *         confirm against the seccomp implementation.
 * @filter: must always point to a valid seccomp-filter or NULL as it is
 *          accessed without locking during system call entry.
 *
 *          @filter must only be accessed from the context of current as there
 *          is no read locking.
 */
struct seccomp {
        int mode;
        atomic_t filter_count;
        struct seccomp_filter *filter;
};

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
extern int __secure_computing(const struct seccomp_data *sd);
/*
 * Fast-path wrapper: returns 0 straight away unless the TIF_SECCOMP thread
 * flag is set, in which case the result of __secure_computing(NULL) is
 * returned.
 */
static inline int secure_computing(void)
{
        if (likely(!test_thread_flag(TIF_SECCOMP)))
                return 0;

        return __secure_computing(NULL);
}
#else
extern void secure_computing_strict(int this_syscall);
#endif

extern long prctl_get_seccomp(void);
extern long prctl_set_seccomp(unsigned long, void __user *);

/* Return the mode stored in @s. */
static inline int seccomp_mode(struct seccomp *s)
{
        return s->mode;
}

#else /* CONFIG_SECCOMP */

#include <linux/errno.h>

/*
 * CONFIG_SECCOMP disabled: the seccomp types are empty placeholders and
 * struct seccomp_data is only forward-declared.
 */
struct seccomp { };
struct seccomp_filter { };
struct seccomp_data;

/*
 * CONFIG_SECCOMP disabled: the syscall-entry hooks are stubs.  The two that
 * return a value always return 0; secure_computing_strict() does nothing.
 */
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
static inline int secure_computing(void)
{
        return 0;
}
#else
static inline void secure_computing_strict(int this_syscall)
{
}
#endif

static inline int __secure_computing(const struct seccomp_data *sd)
{
        return 0;
}

/* CONFIG_SECCOMP disabled: the query always fails with -EINVAL. */
static inline long prctl_get_seccomp(void)
{
        return -EINVAL;
}

/*
 * CONFIG_SECCOMP disabled: the request is always rejected with -EINVAL and
 * neither argument is looked at.
 *
 * @arg3 is a void __user pointer so that the stub has the same signature as
 * the prctl_set_seccomp() prototype used when CONFIG_SECCOMP is enabled.
 */
static inline long prctl_set_seccomp(unsigned long arg2, void __user *arg3)
{
        return -EINVAL;
}

/* CONFIG_SECCOMP disabled: @s is ignored, the mode is always SECCOMP_MODE_DISABLED. */
static inline int seccomp_mode(struct seccomp *s)
{
        return SECCOMP_MODE_DISABLED;
}
#endif /* CONFIG_SECCOMP */

#ifdef CONFIG_SECCOMP_FILTER
extern void seccomp_filter_release(struct task_struct *tsk);
extern void get_seccomp_filter(struct task_struct *tsk);
#else  /* CONFIG_SECCOMP_FILTER */
/* CONFIG_SECCOMP_FILTER disabled: no-op stub, @tsk is ignored. */
static inline void seccomp_filter_release(struct task_struct *tsk)
{
}
/* CONFIG_SECCOMP_FILTER disabled: no-op stub, @tsk is ignored. */
static inline void get_seccomp_filter(struct task_struct *tsk)
{
}
#endif /* CONFIG_SECCOMP_FILTER */

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
extern long seccomp_get_filter(struct task_struct *task,
                               unsigned long filter_off, void __user *data);
extern long seccomp_get_metadata(struct task_struct *task,
                                 unsigned long filter_off, void __user *data);
#else
/*
 * Stub used unless both CONFIG_SECCOMP_FILTER and CONFIG_CHECKPOINT_RESTORE
 * are enabled: always fails with -EINVAL, no argument is looked at.  The
 * parameter names mirror the real prototype.
 */
static inline long seccomp_get_filter(struct task_struct *task,
                                      unsigned long filter_off,
                                      void __user *data)
{
        return -EINVAL;
}
/*
 * Stub used unless both CONFIG_SECCOMP_FILTER and CONFIG_CHECKPOINT_RESTORE
 * are enabled: always fails with -EINVAL, no argument is looked at.
 */
static inline long seccomp_get_metadata(struct task_struct *task,
                                        unsigned long filter_off,
                                        void __user *data)
{
        return -EINVAL;
}
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
#endif /* _LINUX_SECCOMP_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
// SPDX-License-Identifier: GPL-2.0

#ifndef _LINUX_KERNEL_TRACE_H
#define _LINUX_KERNEL_TRACE_H

#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/ring_buffer.h>
#include <linux/mmiotrace.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/trace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/glob.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/ctype.h>

#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h>                /* For NR_SYSCALLS             */
#include <asm/syscall.h>        /* some archs define it here */
#endif

/*
 * Identifiers for the trace entry types.  __TRACE_FIRST_TYPE is 0, the real
 * types follow with consecutive values, and __TRACE_LAST_TYPE is one past
 * the last of them.
 */
enum trace_type {
        __TRACE_FIRST_TYPE = 0,

        TRACE_FN,
        TRACE_CTX,
        TRACE_WAKE,
        TRACE_STACK,
        TRACE_PRINT,
        TRACE_BPRINT,
        TRACE_MMIO_RW,
        TRACE_MMIO_MAP,
        TRACE_BRANCH,
        TRACE_GRAPH_RET,
        TRACE_GRAPH_ENT,
        TRACE_USER_STACK,
        TRACE_BLK,
        TRACE_BPUTS,
        TRACE_HWLAT,
        TRACE_RAW_DATA,

        __TRACE_LAST_TYPE,
};


/*
 * Field helper macros, (re)defined here ahead of the inclusion of
 * trace_entries.h further down.  In this form they expand to plain C member
 * declarations (scalar, fixed-size array, flexible array), while the *_desc
 * variants expand to nothing.  F_STRUCT() passes its arguments through
 * unchanged.
 */
#undef __field
#define __field(type, item)                type        item;

#undef __field_fn
#define __field_fn(type, item)                type        item;

#undef __field_packed
#define __field_packed(type, item)        type        item;

#undef __field_struct
#define __field_struct(type, item)        __field(type, item)

#undef __field_desc
#define __field_desc(type, container, item)

#undef __field_desc_packed
#define __field_desc_packed(type, container, item)

#undef __array
#define __array(type, item, size)        type        item[size];

#undef __array_desc
#define __array_desc(type, container, item, size)

#undef __dynamic_array
#define __dynamic_array(type, item)        type        item[];

#undef F_STRUCT
#define F_STRUCT(args...)                args

/*
 * FTRACE_ENTRY() in this form declares "struct <struct_name>" consisting of a
 * leading struct trace_entry followed by the members given in @tstruct; the
 * @name, @id and @print arguments are not used by this expansion.
 *
 * FTRACE_ENTRY_DUP() expands to nothing, FTRACE_ENTRY_REG() ignores @regfn and
 * forwards to FTRACE_ENTRY(), and FTRACE_ENTRY_PACKED() additionally marks the
 * resulting struct __packed.
 */
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)                \
        struct struct_name {                                                \
                struct trace_entry        ent;                                \
                tstruct                                                        \
        }

#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)

#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print,        regfn)        \
        FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))

#undef FTRACE_ENTRY_PACKED
#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print)        \
        FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed

#include "trace_entries.h"

/*
 * Use this for memory failure errors.
 *
 * Evaluates @condition exactly once.  The first time it is true at a given
 * call site, "ERROR: " followed by the formatted message is printed with
 * pr_err(); later hits at that call site stay silent because of the static
 * __warned flag.  The macro's value is the truth value of @condition, so it
 * can be used directly inside an if ().
 */
#define MEM_FAIL(condition, fmt, ...) ({                        \
        static bool __section(".data.once") __warned;                \
        int __ret_warn_once = !!(condition);                        \
                                                                \
        if (unlikely(__ret_warn_once && !__warned)) {                \
                __warned = true;                                \
                pr_err("ERROR: " fmt, ##__VA_ARGS__);                \
        }                                                        \
        unlikely(__ret_warn_once);                                \
})

/*
 * HIST_STACKTRACE_SIZE is the size in bytes of HIST_STACKTRACE_DEPTH
 * unsigned long entries.
 * NOTE(review): HIST_STACKTRACE_SKIP is presumably the number of leading
 * frames to skip; its users are not visible here, confirm.
 */
#define HIST_STACKTRACE_DEPTH        16
#define HIST_STACKTRACE_SIZE        (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
#define HIST_STACKTRACE_SKIP        5

/*
 * syscalls are special, and need special handling, this is why
 * they are not included in trace_entries.h
 *
 * Entry layout: the common trace_entry header, an int @nr (presumably the
 * syscall number - confirm), and a flexible array of unsigned long @args.
 */
struct syscall_trace_enter {
        struct trace_entry        ent;
        int                        nr;
        unsigned long                args[];
};

/*
 * Exit layout: the common trace_entry header, an int @nr (presumably the
 * syscall number - confirm) and a long @ret (presumably the syscall's return
 * value - confirm).
 */
struct syscall_trace_exit {
        struct trace_entry        ent;
        int                        nr;
        long                        ret;
};

/*
 * Fixed head of a kprobe trace entry: the common trace_entry header followed
 * by @ip (presumably the probed instruction address - confirm).
 */
struct kprobe_trace_entry_head {
        struct trace_entry        ent;
        unsigned long                ip;
};

/*
 * Fixed head of a kretprobe trace entry: the common trace_entry header
 * followed by @func and @ret_ip (presumably the probed function's address
 * and the address it returns to - confirm).
 */
struct kretprobe_trace_entry_head {
        struct trace_entry        ent;
        unsigned long                func;
        unsigned long                ret_ip;
};

/*
 * trace_flag_type is an enumeration that holds different
 * states when a trace occurs. Each value is a distinct single bit, so
 * several can be combined. These are:
 *  IRQS_OFF                - interrupts were disabled
 *  IRQS_NOSUPPORT        - arch does not support irqs_disabled_flags
 *  NEED_RESCHED        - reschedule is requested
 *  HARDIRQ                - inside an interrupt handler
 *  SOFTIRQ                - inside a softirq handler
 *  PREEMPT_RESCHED        - NOTE(review): presumably a preemption-triggered
 *                          reschedule is pending; confirm
 *  NMI                        - NOTE(review): presumably inside an NMI handler;
 *                          confirm
 */
enum trace_flag_type {
        TRACE_FLAG_IRQS_OFF                = 0x01,
        TRACE_FLAG_IRQS_NOSUPPORT        = 0x02,
        TRACE_FLAG_NEED_RESCHED                = 0x04,
        TRACE_FLAG_HARDIRQ                = 0x08,
        TRACE_FLAG_SOFTIRQ                = 0x10,
        TRACE_FLAG_PREEMPT_RESCHED        = 0x20,
        TRACE_FLAG_NMI                        = 0x40,
};

#define TRACE_BUF_SIZE                1024

struct trace_array;

/*
 * The CPU trace array - it consists of thousands of trace entries
 * plus some other descriptor data: (for example which task started
 * the trace, etc.)
 *
 * @ftrace_ignore_pid only exists when CONFIG_FUNCTION_TRACER is enabled;
 * @ignore_pid is always present.
 */
struct trace_array_cpu {
        atomic_t                disabled;
        void                        *buffer_page;        /* ring buffer spare */

        unsigned long                entries;
        unsigned long                saved_latency;
        unsigned long                critical_start;
        unsigned long                critical_end;
        unsigned long                critical_sequence;
        unsigned long                nice;
        unsigned long                policy;
        unsigned long                rt_priority;
        unsigned long                skipped_entries;
        u64                        preempt_timestamp;
        pid_t                        pid;
        kuid_t                        uid;
        char                        comm[TASK_COMM_LEN];

#ifdef CONFIG_FUNCTION_TRACER
        int                        ftrace_ignore_pid;
#endif
        bool                        ignore_pid;
};

struct tracer;
struct trace_option_dentry;

/*
 * A trace buffer together with its per-CPU descriptor data.
 * struct trace_array embeds one of these as array_buffer and, with
 * CONFIG_TRACER_MAX_TRACE, a second one as max_buffer.
 */
struct array_buffer {
        /* NOTE(review): presumably the trace_array that owns this buffer - confirm */
        struct trace_array                *tr;
        struct trace_buffer                *buffer;
        /* Per-CPU struct trace_array_cpu instances */
        struct trace_array_cpu __percpu        *data;
        u64                                time_start;
        int                                cpu;
};

#define TRACE_FLAGS_MAX_SIZE                32

/*
 * Associates a tracer with the trace_option_dentry data created for it.
 * NOTE(review): topts presumably points to one entry per tracer option -
 * confirm against the code that allocates it.
 */
struct trace_options {
        struct tracer                        *tracer;
        struct trace_option_dentry        *topts;
};

/*
 * A set of PIDs used for PID filtering.
 * NOTE(review): pids is presumably a bitmap indexed by pid with pid_max
 * as its bound - confirm against the allocation code.
 */
struct trace_pid_list {
        int                                pid_max;
        unsigned long                        *pids;
};

/*
 * Selector bits for the "type" argument of pid_type_enabled() and
 * still_need_pid_events(): TRACE_PIDS refers to the pid list and
 * TRACE_NO_PIDS to the no-pid list.  Both bits may be set at once.
 */
enum {
        TRACE_PIDS                = BIT(0),
        TRACE_NO_PIDS                = BIT(1),
};

static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list,
                                    struct trace_pid_list *no_pid_list)
{
        /* A list counts only when @type selects it and it is non-NULL */
        if ((type & TRACE_PIDS) && pid_list)
                return true;

        if ((type & TRACE_NO_PIDS) && no_pid_list)
                return true;

        return false;
}

static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list,
                                         struct trace_pid_list *no_pid_list)
{
        /*
         * The lists named in @type are being turned off.  Report whether
         * a list that is NOT named in @type is still present.
         */
        bool pids_remain = pid_list && !(type & TRACE_PIDS);
        bool no_pids_remain = no_pid_list && !(type & TRACE_NO_PIDS);

        return pids_remain || no_pids_remain;
}

typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);

/**
 * struct cond_snapshot - conditional snapshot data and callback
 *
 * The cond_snapshot structure encapsulates a callback function and
 * data associated with the snapshot for a given tracing instance.
 *
 * When a snapshot is taken conditionally, by invoking
 * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is
 * passed in turn to the cond_snapshot.update() function.  That data
 * can be compared by the update() implementation with the cond_data
 * contained within the struct cond_snapshot instance associated with
 * the trace_array.  Because the tr->max_lock is held throughout the
 * update() call, the update() function can directly retrieve the
 * cond_snapshot and cond_data associated with the per-instance
 * snapshot associated with the trace_array.
 *
 * The cond_snapshot.update() implementation can save data to be
 * associated with the snapshot if it decides to, and returns 'true'
 * in that case, or it returns 'false' if the conditional snapshot
 * shouldn't be taken.
 *
 * The cond_snapshot instance is created and associated with the
 * user-defined cond_data by tracing_cond_snapshot_enable().
 * Likewise, the cond_snapshot instance is destroyed and is no longer
 * associated with the trace instance by
 * tracing_cond_snapshot_disable().
 *
 * The method below is required.
 *
 * @cond_data: The user-defined data associated with this instance by
 *        tracing_cond_snapshot_enable().  The update() implementation
 *        can compare it with the cond_data handed to
 *        tracing_snapshot_cond().
 *
 * @update: When a conditional snapshot is invoked, the update()
 *        callback function is invoked with the tr->max_lock held.  The
 *        update() implementation signals whether or not to actually
 *        take the snapshot, by returning 'true' if so, 'false' if no
 *        snapshot should be taken.  Because the max_lock is held for
 *        the duration of update(), the implementation is safe to
 *        directly retrieve and save any implementation data it needs
 *        to in association with the snapshot.
 */
struct cond_snapshot {
        void                                *cond_data;
        cond_update_fn_t                update;
};

/*
 * The trace array - an array of per-CPU trace arrays. This is the
 * highest level data structure that individual tracers deal with.
 * They have on/off state as well.
 *
 * Several members exist only under specific CONFIG_* options, so the
 * layout of this structure depends on the kernel configuration.
 */
struct trace_array {
        /* Links this instance into ftrace_trace_arrays */
        struct list_head        list;
        char                        *name;
        /* The live trace buffer (see max_buffer below for the snapshot one) */
        struct array_buffer        array_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
        /*
         * The max_buffer is used to snapshot the trace when a maximum
         * latency is reached, or when the user initiates a snapshot.
         * Some tracers will use this to store a maximum trace while
         * it continues examining live traces.
         *
         * The buffers for the max_buffer are set up the same as the
         * array_buffer. When a snapshot is taken, the buffer of the
         * max_buffer is swapped with the buffer of the array_buffer and
         * the buffers are reset for the array_buffer so the tracing can
         * continue.
         */
        struct array_buffer        max_buffer;
        bool                        allocated_snapshot;
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
        unsigned long                max_latency;
#ifdef CONFIG_FSNOTIFY
        /* NOTE(review): presumably used by latency_fsnotify() - confirm */
        struct dentry                *d_max_latency;
        struct work_struct        fsnotify_work;
        struct irq_work                fsnotify_irqwork;
#endif
#endif
        /* RCU-managed PID filter lists (see struct trace_pid_list) */
        struct trace_pid_list        __rcu *filtered_pids;
        struct trace_pid_list        __rcu *filtered_no_pids;
        /*
         * max_lock is used to protect the swapping of buffers
         * when taking a max snapshot. The buffers themselves are
         * protected by per_cpu spinlocks. But the action of the swap
         * needs its own lock.
         *
         * This is defined as an arch_spinlock_t in order to help
         * with performance when lockdep debugging is enabled.
         *
         * It is also used in other places outside the update_max_tr
         * so it needs to be defined outside of the
         * CONFIG_TRACER_MAX_TRACE.
         */
        arch_spinlock_t                max_lock;
        int                        buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
        /* Per-syscall event files, one slot per syscall number */
        int                        sys_refcount_enter;
        int                        sys_refcount_exit;
        struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
        struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
#endif
        int                        stop_count;
        int                        clock_id;
        int                        nr_topts;
        bool                        clear_trace;
        int                        buffer_percent;
        unsigned int                n_err_log_entries;
        /* The tracer currently selected for this instance */
        struct tracer                *current_trace;
        unsigned int                trace_flags;
        unsigned char                trace_flags_index[TRACE_FLAGS_MAX_SIZE];
        /* TRACE_ARRAY_FL_* bits, e.g. TRACE_ARRAY_FL_GLOBAL */
        unsigned int                flags;
        raw_spinlock_t                start_lock;
        struct list_head        err_log;
        struct dentry                *dir;
        struct dentry                *options;
        struct dentry                *percpu_dir;
        struct dentry                *event_dir;
        struct trace_options        *topts;
        struct list_head        systems;
        struct list_head        events;
        struct trace_event_file *trace_marker_file;
        cpumask_var_t                tracing_cpumask; /* only trace on set CPUs */
        /* one per_cpu trace_pipe can be opened by only one user */
        cpumask_var_t                pipe_cpumask;
        int                        ref;
        int                        trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops        *ops;
        struct trace_pid_list        __rcu *function_pids;
        struct trace_pid_list        __rcu *function_no_pids;
#ifdef CONFIG_DYNAMIC_FTRACE
        /* All of these are protected by the ftrace_lock */
        struct list_head        func_probes;
        struct list_head        mod_trace;
        struct list_head        mod_notrace;
#endif
        /* function tracing enabled */
        int                        function_enabled;
#endif
        int                        time_stamp_abs_ref;
        struct list_head        hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
        /* Conditional snapshot state, see struct cond_snapshot */
        struct cond_snapshot        *cond_snapshot;
#endif
};

/*
 * Bits for trace_array::flags.  TRACE_ARRAY_FL_GLOBAL marks the global
 * (top level) trace array; top_trace_array() warns when the array it
 * returns does not carry it.
 */
enum {
        TRACE_ARRAY_FL_GLOBAL        = (1 << 0)
};

extern struct list_head ftrace_trace_arrays;

extern struct mutex trace_types_lock;

extern int trace_array_get(struct trace_array *tr);
extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);

extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);

extern bool trace_clock_in_ns(struct trace_array *tr);

/*
 * The global tracer (top) should be the first trace array added,
 * but we check the flag anyway.
 */
static inline struct trace_array *top_trace_array(void)
{
        struct trace_array *tr;

        if (list_empty(&ftrace_trace_arrays))
                return NULL;

        tr = list_entry(ftrace_trace_arrays.prev,
                        typeof(*tr), list);
        WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
        return tr;
}

/*
 * Compile-time test: true when @var is declared as a pointer to @type.
 */
#define FTRACE_CMP_TYPE(var, type) \
        __builtin_types_compatible_p(typeof(var), type *)

/*
 * One step of trace_assign_type(): when @var is a pointer to @etype,
 * assign @entry to it (cast to @var's type), warn if @id is non-zero
 * and does not match (entry)->type, then "break".  The break leaves
 * the enclosing do { } while (0) of trace_assign_type(), so this macro
 * must only be used inside such a loop.
 */
#undef IF_ASSIGN
#define IF_ASSIGN(var, entry, etype, id)                        \
        if (FTRACE_CMP_TYPE(var, etype)) {                        \
                var = (typeof(var))(entry);                        \
                WARN_ON(id != 0 && (entry)->type != id);        \
                break;                                                \
        }

/*
 * Deliberately only declared here: trace_assign_type() reaches the
 * call when no IF_ASSIGN() matched.
 * Will cause compile errors if type is not found.
 */
extern void __ftrace_bad_type(void);

/*
 * The trace_assign_type is a verifier that the entry type is
 * the same as the type being assigned. To add new types simply
 * add a line with the following format:
 *
 * IF_ASSIGN(var, ent, type, id);
 *
 *  Where "type" is the trace type that includes the trace_entry
 *  as the "ent" item. And "id" is the trace identifier that is
 *  used in the trace_type enum.
 *
 *  If the type can have more than one id, then use zero.
 *
 *  The first IF_ASSIGN() whose type matches performs the assignment
 *  and breaks out of the loop; __ftrace_bad_type() is only reached
 *  when none of them matched.
 */
#define trace_assign_type(var, ent)                                        \
        do {                                                                \
                IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN);        \
                IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);        \
                IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);        \
                IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
                IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);        \
                IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);        \
                IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);        \
                IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);        \
                IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
                IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,                \
                          TRACE_MMIO_RW);                                \
                IF_ASSIGN(var, ent, struct trace_mmiotrace_map,                \
                          TRACE_MMIO_MAP);                                \
                IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
                IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,        \
                          TRACE_GRAPH_ENT);                \
                IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,        \
                          TRACE_GRAPH_RET);                \
                __ftrace_bad_type();                                        \
        } while (0)

/*
 * An option specific to a tracer. This is a boolean value.
 * The bit is the bit index that sets its value on the
 * flags value in struct tracer_flags.
 *
 * NOTE(review): the sentence above calls @bit a bit index while the
 * member comment below calls it a mask - confirm which one the users
 * of this structure expect.
 */
struct tracer_opt {
        const char        *name; /* Will appear on the trace_options file */
        u32                bit; /* Mask assigned in val field in tracer_flags */
};

/*
 * The set of specific options for a tracer. Your tracer
 * has to set the initial value of the flags val.
 */
struct tracer_flags {
        u32                        val;        /* current value of the option flags */
        struct tracer_opt        *opts;        /* the options described by val */
        struct tracer                *trace;
};

/* Makes it easier to define a tracer opt */
#define TRACER_OPT(s, b)        .name = #s, .bit = b


/*
 * Ties one tracer option to the flags it belongs to, a trace array and
 * a dentry.
 * NOTE(review): presumably one of these backs each per-option file of
 * a trace instance - confirm against the code that creates them.
 */
struct trace_option_dentry {
        struct tracer_opt                *opt;
        struct tracer_flags                *flags;
        struct trace_array                *tr;
        struct dentry                        *entry;
};

/**
 * struct tracer - a specific tracer and its callbacks to interact with tracefs
 * @name: the name chosen to select it on the available_tracers file
 * @init: called when one switches to this tracer (echo name > current_tracer)
 * @reset: called when one switches to another tracer
 * @start: called when tracing is unpaused (echo 1 > tracing_on)
 * @stop: called when tracing is paused (echo 0 > tracing_on)
 * @update_thresh: called when tracing_thresh is updated
 * @open: called when the trace file is opened
 * @pipe_open: called when the trace_pipe file is opened
 * @close: called when the trace file is released
 * @pipe_close: called when the trace_pipe file is released
 * @read: override the default read callback on trace_pipe
 * @splice_read: override the default splice_read callback on trace_pipe
 * @selftest: selftest to run on boot (see trace_selftest.c); only present
 *        with CONFIG_FTRACE_STARTUP_TEST
 * @print_header: override the first lines that describe your columns
 * @print_line: callback that prints a trace
 * @set_flag: signals one of your private flags changed (trace_options file);
 *        return 0 if you handled the flag setting
 * @flag_changed: return 0 if OK with the change, else return non-zero
 * @flags: your private flags
 * @noboot: true if the tracer cannot be enabled in a kernel parameter
 */
struct tracer {
        const char                *name;
        int                        (*init)(struct trace_array *tr);
        void                        (*reset)(struct trace_array *tr);
        void                        (*start)(struct trace_array *tr);
        void                        (*stop)(struct trace_array *tr);
        int                        (*update_thresh)(struct trace_array *tr);
        void                        (*open)(struct trace_iterator *iter);
        void                        (*pipe_open)(struct trace_iterator *iter);
        void                        (*close)(struct trace_iterator *iter);
        void                        (*pipe_close)(struct trace_iterator *iter);
        ssize_t                        (*read)(struct trace_iterator *iter,
                                        struct file *filp, char __user *ubuf,
                                        size_t cnt, loff_t *ppos);
        ssize_t                        (*splice_read)(struct trace_iterator *iter,
                                               struct file *filp,
                                               loff_t *ppos,
                                               struct pipe_inode_info *pipe,
                                               size_t len,
                                               unsigned int flags);
#ifdef CONFIG_FTRACE_STARTUP_TEST
        int                        (*selftest)(struct tracer *trace,
                                            struct trace_array *tr);
#endif
        void                        (*print_header)(struct seq_file *m);
        enum print_line_t        (*print_line)(struct trace_iterator *iter);
        /* If you handled the flag setting, return 0 */
        int                        (*set_flag)(struct trace_array *tr,
                                            u32 old_flags, u32 bit, int set);
        /* Return 0 if OK with change, else return non-zero */
        int                        (*flag_changed)(struct trace_array *tr,
                                                u32 mask, int set);
        /* NOTE(review): presumably links the registered tracers - confirm */
        struct tracer                *next;
        struct tracer_flags        *flags;
        int                        enabled;
        bool                        print_max;
        bool                        allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
        bool                        use_max_tr;
#endif
        /* True if tracer cannot be enabled in kernel param */
        bool                        noboot;
};


/* Only current can touch trace_recursion */

/*
 * Bit numbers within current->trace_recursion.
 *
 * For function tracing recursion:
 *  The order of these bits is important.
 *
 *  When function tracing occurs, the following steps are made:
 *   If arch does not support a ftrace feature:
 *    call internal function (uses INTERNAL bits) which calls...
 *   If callback is registered to the "global" list, the list
 *    function is called and recursion checks the GLOBAL bits.
 *    then this function calls...
 *   The function callback, which can use the FTRACE bits to
 *    check for recursion.
 *
 * Each of the FTRACE and INTERNAL groups holds five consecutive bits,
 * matching the five TRACE_CTX_* values: trace_test_and_set_recursion()
 * selects a bit as "start + TRACE_CTX_*".
 * NOTE(review): since TRACE_CTX_NMI is the first TRACE_CTX_* value, the
 * context suffixes in the names below do not line up one-to-one with
 * the context that ends up using each bit - confirm before relying on
 * an individual name.
 */
enum {
        /* Function recursion bits */
        TRACE_FTRACE_BIT,
        TRACE_FTRACE_NMI_BIT,
        TRACE_FTRACE_IRQ_BIT,
        TRACE_FTRACE_SIRQ_BIT,
        TRACE_FTRACE_TRANSITION_BIT,

        /* Internal use recursion bits */
        TRACE_INTERNAL_BIT,
        TRACE_INTERNAL_NMI_BIT,
        TRACE_INTERNAL_IRQ_BIT,
        TRACE_INTERNAL_SIRQ_BIT,
        TRACE_INTERNAL_TRANSITION_BIT,

        TRACE_BRANCH_BIT,
/*
 * Abuse of the trace_recursion.
 * As we need a way to maintain state if we are tracing the function
 * graph in irq because we want to trace a particular function that
 * was called in irq context but we have irq tracing off. Since this
 * can only be modified by current, we can reuse trace_recursion.
 */
        TRACE_IRQ_BIT,

        /* Set if the function is in the set_graph_function file */
        TRACE_GRAPH_BIT,

        /*
         * In the very unlikely case that an interrupt came in
         * at a start of graph tracing, and we want to trace
         * the function in that interrupt, the depth can be greater
         * than zero, because of the preempted start of a previous
         * trace. In an even more unlikely case, depth could be 2
         * if a softirq interrupted the start of graph tracing,
         * followed by an interrupt preempting a start of graph
         * tracing in the softirq, and depth can even be 3
         * if an NMI came in at the start of an interrupt function
         * that preempted a softirq start of a function that
         * preempted normal context!!!! Luckily, it can't be
         * greater than 3, so the next two bits are a mask
         * of what the depth is when we set TRACE_GRAPH_BIT
         * (read and written by trace_recursion_depth() and
         * trace_recursion_set_depth()).
         */

        TRACE_GRAPH_DEPTH_START_BIT,
        TRACE_GRAPH_DEPTH_END_BIT,

        /*
         * To implement set_graph_notrace, if this bit is set, we ignore
         * function graph tracing of called functions, until the return
         * function is called to clear it.
         */
        TRACE_GRAPH_NOTRACE_BIT,
};

/*
 * Set, clear and test a single bit (given as a bit number) in
 * current->trace_recursion.  These are plain read-modify-write
 * operations, not atomic ones.
 */
#define trace_recursion_set(bit)        do { (current)->trace_recursion |= (1<<(bit)); } while (0)
#define trace_recursion_clear(bit)        do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
#define trace_recursion_test(bit)        ((current)->trace_recursion & (1<<(bit)))

/*
 * Read or store the two-bit graph depth (0-3) kept in the bits starting
 * at TRACE_GRAPH_DEPTH_START_BIT.  The setter clears both bits before
 * storing the low two bits of @depth.
 */
#define trace_recursion_depth() \
        (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
#define trace_recursion_set_depth(depth) \
        do {                                                                \
                current->trace_recursion &=                                \
                        ~(3 << TRACE_GRAPH_DEPTH_START_BIT);                \
                current->trace_recursion |=                                \
                        ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT;        \
        } while (0)

#define TRACE_CONTEXT_BITS        4

#define TRACE_FTRACE_START        TRACE_FTRACE_BIT

#define TRACE_LIST_START        TRACE_INTERNAL_BIT

#define TRACE_CONTEXT_MASK        ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)

/*
 * Context offsets.  trace_get_context_bit() returns one of the first
 * four; trace_test_and_set_recursion() adds the offset to its start bit
 * to pick a bit in current->trace_recursion.  TRACE_CTX_TRANSITION is
 * never returned by trace_get_context_bit(): it is the fallback slot
 * used when the bit for the detected context is already set.
 */
enum {
        TRACE_CTX_NMI,
        TRACE_CTX_IRQ,
        TRACE_CTX_SOFTIRQ,
        TRACE_CTX_NORMAL,
        TRACE_CTX_TRANSITION,
};

/*
 * Map the current execution context to a TRACE_CTX_* offset:
 * NMI, hard IRQ, soft IRQ or normal (not in any interrupt).
 */
static __always_inline int trace_get_context_bit(void)
{
        /* Not in any kind of interrupt: normal context */
        if (!in_interrupt())
                return TRACE_CTX_NORMAL;

        /* NMI is checked before hard IRQ */
        if (in_nmi())
                return TRACE_CTX_NMI;

        return in_irq() ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
}

/*
 * Mark the current context as "inside the tracer" in
 * current->trace_recursion and detect recursion.
 *
 * @start: first bit of the five-bit group to use; the bit tested is
 *         @start plus the TRACE_CTX_* offset of the current context.
 *
 * Returns the bit number that was set, or -1 when recursion was
 * detected (both the context bit and the transition bit were already
 * set).
 * NOTE(review): the returned bit is presumably what callers hand to
 * trace_clear_recursion() - confirm at the call sites.
 */
static __always_inline int trace_test_and_set_recursion(int start)
{
        unsigned int val = current->trace_recursion;
        int bit;

        bit = trace_get_context_bit() + start;
        if (unlikely(val & (1 << bit))) {
                /*
                 * It could be that preempt_count has not been updated during
                 * a switch between contexts. Allow for a single recursion.
                 */
                bit = start + TRACE_CTX_TRANSITION;
                /* Transition slot already taken as well: real recursion */
                if (trace_recursion_test(bit))
                        return -1;
                trace_recursion_set(bit);
                /* Compiler barrier only: keep the store above before what follows */
                barrier();
                return bit;
        }

        /* Common case: claim the bit for this context */
        val |= 1 << bit;
        current->trace_recursion = val;
        barrier();

        return bit;
}

/*
 * Drop bit number @bit from current->trace_recursion.  The new value is
 * computed first; the compiler barrier sits between that and the store
 * back into the task.
 */
static __always_inline void trace_clear_recursion(int bit)
{
        unsigned int updated = current->trace_recursion;

        updated &= ~(1 << bit);

        barrier();
        current->trace_recursion = updated;
}

/*
 * Return the ring buffer iterator of @iter for @cpu, or NULL when the
 * iterator has no per-CPU buffer iterators at all.
 */
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
        if (!iter->buffer_iter)
                return NULL;

        return iter->buffer_iter[cpu];
}

int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
                                 umode_t mode,
                                 struct dentry *parent,
                                 void *data,
                                 const struct file_operations *fops);

int tracing_init_dentry(void);

struct ring_buffer_event;

struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
                          int type,
                          unsigned long len,
                          unsigned long flags,
                          int pc);

struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
                                                struct trace_array_cpu *data);

struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts);

void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
                                        struct ring_buffer_event *event);

const char *trace_event_format(struct trace_iterator *iter, const char *fmt);

int trace_empty(struct trace_iterator *iter);

void *trace_find_next_entry_inc(struct trace_iterator *iter);

void trace_init_global_iter(struct trace_iterator *iter);

void tracing_iter_reset(struct trace_iterator *iter, int cpu);

unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu);
unsigned long trace_total_entries(struct trace_array *tr);

void trace_function(struct trace_array *tr,
                    unsigned long ip,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
void trace_graph_function(struct trace_array *tr,
                    unsigned long ip,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);

void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
void set_graph_array(struct trace_array *tr);

void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
void tracing_start_tgid_record(void);
void tracing_stop_tgid_record(void);

int register_tracer(struct tracer *type);
int is_tracing_stopped(void);

loff_t tracing_lseek(struct file *file, loff_t offset, int whence);

extern cpumask_var_t __read_mostly tracing_buffer_mask;

#define for_each_tracing_cpu(cpu)        \
        for_each_cpu(cpu, tracing_buffer_mask)

extern unsigned long nsecs_to_usecs(unsigned long nsecs);

extern unsigned long tracing_thresh;

/* PID filtering */

extern int pid_max;

bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
                             pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
                            struct trace_pid_list *filtered_no_pids,
                            struct task_struct *task);
void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
                                  struct task_struct *self,
                                  struct task_struct *task);
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
int trace_pid_show(struct seq_file *m, void *v);
void trace_free_pid_list(struct trace_pid_list *pid_list);
int trace_pid_write(struct trace_pid_list *filtered_pids,
                    struct trace_pid_list **new_pid_list,
                    const char __user *ubuf, size_t cnt);

#ifdef CONFIG_TRACER_MAX_TRACE
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
                   void *cond_data);
void update_max_tr_single(struct trace_array *tr,
                          struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */

#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
        defined(CONFIG_FSNOTIFY)

void latency_fsnotify(struct trace_array *tr);

#else

/*
 * No-op stub: used unless CONFIG_FSNOTIFY is enabled together with
 * CONFIG_TRACER_MAX_TRACE or CONFIG_HWLAT_TRACER.
 */
static inline void latency_fsnotify(struct trace_array *tr) { }

#endif

#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
                   int pc);
#else
/* No-op stub used when CONFIG_STACKTRACE is not enabled. */
static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
                                 int skip, int pc)
{
}
#endif /* CONFIG_STACKTRACE */

extern u64 ftrace_now(int cpu);

extern void trace_find_cmdline(int pid, char comm[]);
extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);

#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
extern unsigned long ftrace_number_of_groups;
void ftrace_init_trace_array(struct trace_array *tr);
#else
/* No-op stub used when CONFIG_DYNAMIC_FTRACE is not enabled. */
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
#endif
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);

extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;

#ifdef CONFIG_FTRACE_STARTUP_TEST
extern void __init disable_tracing_selftest(const char *reason);

extern int trace_selftest_startup_function(struct tracer *trace,
                                           struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
                                                 struct trace_array *tr);
extern int trace_selftest_startup_irqsoff(struct tracer *trace,
                                          struct trace_array *tr);
extern int trace_selftest_startup_preemptoff(struct tracer *trace,
                                             struct trace_array *tr);
extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
                                                 struct trace_array *tr);
extern int trace_selftest_startup_wakeup(struct tracer *trace,
                                         struct trace_array *tr);
extern int trace_selftest_startup_nop(struct tracer *trace,
                                         struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
                                         struct trace_array *tr);
/*
 * Tracer data references selftest functions that only occur
 * on boot up. These can be __init functions. Thus, when selftests
 * are enabled, then the tracers need to reference __init functions.
 */
#define __tracer_data                __refdata
#else
/* No-op stub used when CONFIG_FTRACE_STARTUP_TEST is not enabled. */
static inline void __init disable_tracing_selftest(const char *reason)
{
}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data                __read_mostly
#endif /* CONFIG_FTRACE_STARTUP_TEST */

extern void *head_page(struct trace_array_cpu *data);
extern unsigned long long ns2usecs(u64 nsec);
extern int
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_array_vprintk(struct trace_array *tr,
                    unsigned long ip, const char *fmt, va_list args);
int trace_array_printk_buf(struct trace_buffer *buffer,
                           unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);

extern char trace_find_mark(unsigned long long duration);

struct ftrace_hash;

/*
 * A list entry pairing a function name with a module name.
 * NOTE(review): presumably queued on trace_array::mod_trace or
 * mod_notrace until the named module is loaded - confirm against the
 * code that creates these entries.
 */
struct ftrace_mod_load {
        struct list_head        list;
        char                        *func;
        char                        *module;
        int                         enable;
};

/*
 * Bits for ftrace_hash::flags.  ftrace_hash_empty() treats a hash with
 * FTRACE_HASH_FL_MOD set as non-empty even when its count is zero.
 */
enum {
        FTRACE_HASH_FL_MOD        = (1 << 0),
};

/*
 * A hash of function addresses, looked up with ftrace_lookup_ip().
 * NOTE(review): size_bits is presumably log2 of the number of buckets -
 * confirm against the allocation code.
 */
struct ftrace_hash {
        unsigned long                size_bits;
        struct hlist_head        *buckets;
        unsigned long                count;        /* zero means no entries, see ftrace_hash_empty() */
        unsigned long                flags;        /* FTRACE_HASH_FL_* */
        struct rcu_head                rcu;
};

struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);

/*
 * A hash is empty when it is NULL, or when it holds no entries and the
 * FTRACE_HASH_FL_MOD flag is not set.
 */
static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
{
        if (!hash)
                return true;

        if (hash->count)
                return false;

        return !(hash->flags & FTRACE_HASH_FL_MOD);
}

/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER

/* Flag options */
#define TRACE_GRAPH_PRINT_OVERRUN       0x1
#define TRACE_GRAPH_PRINT_CPU           0x2
#define TRACE_GRAPH_PRINT_OVERHEAD      0x4
#define TRACE_GRAPH_PRINT_PROC          0x8
#define TRACE_GRAPH_PRINT_DURATION      0x10
#define TRACE_GRAPH_PRINT_ABS_TIME      0x20
#define TRACE_GRAPH_PRINT_REL_TIME      0x40
#define TRACE_GRAPH_PRINT_IRQS          0x80
#define TRACE_GRAPH_PRINT_TAIL          0x100
#define TRACE_GRAPH_SLEEP_TIME          0x200
#define TRACE_GRAPH_GRAPH_TIME          0x400
#define TRACE_GRAPH_PRINT_FILL_SHIFT        28
#define TRACE_GRAPH_PRINT_FILL_MASK        (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)

extern void ftrace_graph_sleep_time_control(bool enable);

#ifdef CONFIG_FUNCTION_PROFILER
extern void ftrace_graph_graph_time_control(bool enable);
#else
/* No-op stub used when CONFIG_FUNCTION_PROFILER is not enabled. */
static inline void ftrace_graph_graph_time_control(bool enable) { }
#endif

extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
extern void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
                               struct ftrace_graph_ent *trace,
                               unsigned long flags, int pc);
extern void __trace_graph_return(struct trace_array *tr,
                                 struct ftrace_graph_ret *trace,
                                 unsigned long flags, int pc);

#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;

/*
 * Decide whether the function being entered (@trace->func) is selected by
 * the graph filter hash.
 *
 * Returns 1 when the filter hash is empty or contains the function's
 * address, 0 otherwise.  On a hash hit, TRACE_GRAPH_BIT and the current
 * depth are recorded in the recursion state, and TRACE_IRQ_BIT is set or
 * cleared according to in_irq().
 */
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
        const unsigned long ip = trace->func;
        struct ftrace_hash *graph_hash;
        int selected;

        preempt_disable_notrace();

        /*
         * rcu_dereference_sched() is open coded here because the
         * function graph tracer can be called when RCU is not
         * "watching".
         * Protected with schedule_on_each_cpu(ftrace_sync)
         */
        graph_hash = rcu_dereference_protected(ftrace_graph_hash,
                                               !preemptible());

        if (ftrace_hash_empty(graph_hash)) {
                /* Empty hash: nothing to look up, report the function as selected */
                selected = 1;
        } else if (!ftrace_lookup_ip(graph_hash, ip)) {
                selected = 0;
        } else {
                /*
                 * The graph bit must be cleared again by the return
                 * handler once the depth is back to zero.
                 */
                trace_recursion_set(TRACE_GRAPH_BIT);
                trace_recursion_set_depth(trace->depth);

                /*
                 * Even if irqs are not to be traced, a function listed in
                 * set_graph_function that is called from an interrupt
                 * handler should still be traced.
                 */
                if (in_irq())
                        trace_recursion_set(TRACE_IRQ_BIT);
                else
                        trace_recursion_clear(TRACE_IRQ_BIT);

                selected = 1;
        }

        preempt_enable_notrace();
        return selected;
}

/*
 * Clear TRACE_GRAPH_BIT on function return, but only when the bit is
 * currently set and the returning function's depth equals the depth that
 * was stored in the recursion state.
 */
static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
{
        if (!trace_recursion_test(TRACE_GRAPH_BIT))
                return;

        if (trace->depth != trace_recursion_depth())
                return;

        trace_recursion_clear(TRACE_GRAPH_BIT);
}

/*
 * Check @addr against the graph "notrace" hash.
 *
 * Returns 1 when ftrace_lookup_ip() finds @addr in the hash, 0 otherwise.
 * The lookup runs with preemption disabled.
 */
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
        struct ftrace_hash *hash;
        int found;

        preempt_disable_notrace();

        /*
         * rcu_dereference_sched() is open coded here because the
         * function graph tracer can be called when RCU is not
         * "watching".
         * Protected with schedule_on_each_cpu(ftrace_sync)
         */
        hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
                                         !preemptible());

        found = ftrace_lookup_ip(hash, addr) ? 1 : 0;

        preempt_enable_notrace();
        return found;
}
#else
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
        return 1;
}

static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
        return 0;
}
static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
{ }
#endif /* CONFIG_DYNAMIC_FTRACE */

extern unsigned int fgraph_max_depth;

/*
 * Returns true when the function entry described by @trace must not be
 * recorded by the graph tracer:
 *  - it is neither nested inside a selected function (TRACE_GRAPH_BIT
 *    clear) nor selected itself by ftrace_graph_addr(), or
 *  - its depth is negative, or
 *  - fgraph_max_depth is non-zero and the depth has reached it.
 */
static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
{
        /* trace it when it is-nested-in or is a function enabled. */
        if (!trace_recursion_test(TRACE_GRAPH_BIT) &&
            !ftrace_graph_addr(trace))
                return true;

        if (trace->depth < 0)
                return true;

        return fgraph_max_depth && trace->depth >= fgraph_max_depth;
}

#else /* CONFIG_FUNCTION_GRAPH_TRACER */
/*
 * Stub for builds without CONFIG_FUNCTION_GRAPH_TRACER: nothing is
 * printed and the line is reported as TRACE_TYPE_UNHANDLED.
 */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
        return TRACE_TYPE_UNHANDLED;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */

extern struct list_head ftrace_pids;

#ifdef CONFIG_FUNCTION_TRACER

#define FTRACE_PID_IGNORE        -1
#define FTRACE_PID_TRACE        -2

/*
 * A named ftrace command, as passed to register_ftrace_command() and
 * unregister_ftrace_command().
 */
struct ftrace_func_command {
        /* list linkage; presumably the registered-command list - confirm in ftrace.c */
        struct list_head        list;
        /* command name */
        char                        *name;
        /* handler callback for the command; returns an int status */
        int                        (*func)(struct trace_array *tr,
                                        struct ftrace_hash *hash,
                                        char *func, char *cmd,
                                        char *params, int enable);
};
extern bool ftrace_filter_param __initdata;
/*
 * Returns 0 when this CPU's ftrace_ignore_pid value for @tr equals
 * FTRACE_PID_IGNORE, and 1 for any other value.
 */
static inline int ftrace_trace_task(struct trace_array *tr)
{
        if (this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) ==
            FTRACE_PID_IGNORE)
                return 0;

        return 1;
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
                                 struct dentry *parent);
void ftrace_destroy_function_files(struct trace_array *tr);
int ftrace_allocate_ftrace_ops(struct trace_array *tr);
void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
void ftrace_init_tracefs_toplevel(struct trace_array *tr,
                                  struct dentry *d_tracer);
void ftrace_clear_pids(struct trace_array *tr);
int init_function_trace(void);
void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
#else
/*
 * Stubs for builds without CONFIG_FUNCTION_TRACER.  The int-returning
 * helpers report 0, except ftrace_trace_task() which always returns 1;
 * the void helpers do nothing.
 */
static inline int ftrace_trace_task(struct trace_array *tr)
{
        return 1;
}
static inline int ftrace_is_dead(void) { return 0; }
static inline int
ftrace_create_function_files(struct trace_array *tr,
                             struct dentry *parent)
{
        return 0;
}
static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr)
{
        return 0;
}
static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { }
static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
static inline void ftrace_clear_pids(struct trace_array *tr) { }
static inline int init_function_trace(void) { return 0; }
static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { }
/* ftrace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */

#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)

/*
 * Callback table for a function probe, as handed to
 * register_ftrace_function_probe() and
 * unregister_ftrace_function_probe_func().
 */
struct ftrace_probe_ops {
        /* probe handler; receives ip, parent_ip, the trace array, these ops and a data pointer */
        void                        (*func)(unsigned long ip,
                                        unsigned long parent_ip,
                                        struct trace_array *tr,
                                        struct ftrace_probe_ops *ops,
                                        void *data);
        /* per-ip setup; gets init_data and may store a pointer through @data */
        int                        (*init)(struct ftrace_probe_ops *ops,
                                        struct trace_array *tr,
                                        unsigned long ip, void *init_data,
                                        void **data);
        /* per-ip teardown counterpart; presumably pairs with init() - confirm in ftrace.c */
        void                        (*free)(struct ftrace_probe_ops *ops,
                                        struct trace_array *tr,
                                        unsigned long ip, void *data);
        /* writes a description of the probe at @ip into seq_file @m */
        int                        (*print)(struct seq_file *m,
                                         unsigned long ip,
                                         struct ftrace_probe_ops *ops,
                                         void *data);
};

struct ftrace_func_mapper;
typedef int (*ftrace_mapper_func)(void *data);

struct ftrace_func_mapper *allocate_ftrace_func_mapper(void);
void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
                                           unsigned long ip);
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
                               unsigned long ip, void *data);
void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
                                   unsigned long ip);
void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
                             ftrace_mapper_func free_func);

extern int
register_ftrace_function_probe(char *glob, struct trace_array *tr,
                               struct ftrace_probe_ops *ops, void *data);
extern int
unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
                                      struct ftrace_probe_ops *ops);
extern void clear_ftrace_function_probes(struct trace_array *tr);

int register_ftrace_command(struct ftrace_func_command *cmd);
int unregister_ftrace_command(struct ftrace_func_command *cmd);

void ftrace_create_filter_files(struct ftrace_ops *ops,
                                struct dentry *parent);
void ftrace_destroy_filter_files(struct ftrace_ops *ops);

extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
                             int len, int reset);
extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
                              int len, int reset);
#else
struct ftrace_func_command;

/*
 * Stub used unless both CONFIG_FUNCTION_TRACER and CONFIG_DYNAMIC_FTRACE
 * are set: registration always fails with -EINVAL.
 */
static inline __init int register_ftrace_command(struct ftrace_func_command *cmd)
{
        return -EINVAL;
}
/*
 * Stub used unless both CONFIG_FUNCTION_TRACER and CONFIG_DYNAMIC_FTRACE
 * are set: there is nothing to unregister, so always fail with -EINVAL.
 *
 * The parameter type matches the real unregister_ftrace_command()
 * prototype so that a caller builds the same way in both configurations.
 */
static inline __init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
        return -EINVAL;
}
/* Stub: does nothing in this configuration. */
static inline void clear_ftrace_function_probes(struct trace_array *tr)
{
}

/*
 * The ops parameter passed in is usually undefined.
 * This must be a macro.
 */
#define ftrace_create_filter_files(ops, parent) do { } while (0)
#define ftrace_destroy_filter_files(ops) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */

bool ftrace_event_is_function(struct trace_event_call *call);

/*
 * struct trace_parser - serves for reading the user input separated by spaces
 * @cont: set if the input is not complete - no final space char was found
 * @fail: set by trace_parser_fail(); while set, trace_parser_loaded()
 *        reports false
 * @buffer: holds the parsed user input
 * @idx: user input length
 * @size: buffer size
 */
struct trace_parser {
        bool                cont;
        bool                fail;
        char                *buffer;
        unsigned        idx;
        unsigned        size;
};

/*
 * Returns true when @parser holds input (idx is non-zero) and has not
 * been marked as failed.
 */
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
        if (parser->fail)
                return false;

        return parser->idx != 0;
}

/* Returns @parser's cont flag. */
static inline bool trace_parser_cont(struct trace_parser *parser)
{
        return parser->cont;
}

/*
 * Reset the cont flag and the input length of @parser.
 * The fail flag and the buffer itself are left untouched.
 */
static inline void trace_parser_clear(struct trace_parser *parser)
{
        parser->cont = false;
        parser->idx = 0;
}

/* Mark @parser as failed; trace_parser_loaded() then returns false. */
static inline void trace_parser_fail(struct trace_parser *parser)
{
        parser->fail = true;
}

extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
        size_t cnt, loff_t *ppos);

/*
 * Only create function graph options if function graph is configured.
 *
 * Each of the *_FLAGS helper macros below expands either to a list of
 * C(NAME, "option-string"), entries or to nothing, depending on the
 * kernel configuration.  They are spliced into TRACE_FLAGS.
 */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
# define FGRAPH_FLAGS                                                \
                C(DISPLAY_GRAPH,        "display-graph"),
#else
# define FGRAPH_FLAGS
#endif

/* Branch tracer option, present only with CONFIG_BRANCH_TRACER. */
#ifdef CONFIG_BRANCH_TRACER
# define BRANCH_FLAGS                                        \
                C(BRANCH,                "branch"),
#else
# define BRANCH_FLAGS
#endif

/*
 * Function tracer options.  Without CONFIG_FUNCTION_TRACER the list is
 * empty, so no FUNC_FORK enumerator is generated; TRACE_ITER_FUNC_FORK
 * and FUNCTION_DEFAULT_FLAGS are then defined as 0UL instead.
 */
#ifdef CONFIG_FUNCTION_TRACER
# define FUNCTION_FLAGS                                                \
                C(FUNCTION,                "function-trace"),        \
                C(FUNC_FORK,                "function-fork"),
# define FUNCTION_DEFAULT_FLAGS                TRACE_ITER_FUNCTION
#else
# define FUNCTION_FLAGS
# define FUNCTION_DEFAULT_FLAGS                0UL
# define TRACE_ITER_FUNC_FORK                0UL
#endif

/* Stack trace option, present only with CONFIG_STACKTRACE. */
#ifdef CONFIG_STACKTRACE
# define STACK_FLAGS                                \
                C(STACKTRACE,                "stacktrace"),
#else
# define STACK_FLAGS
#endif

/*
 * trace_iterator_flags is an enumeration that defines bit
 * positions into trace_flags that controls the output.
 *
 * Every entry has the form C(NAME, "option-string").  C() is redefined
 * before each expansion of TRACE_FLAGS to choose what is generated from
 * the list; the configuration-dependent *_FLAGS lists are appended at
 * the end.
 *
 * NOTE: These bits must match the trace_options array in
 *       trace.c (this macro guarantees it).
 */
#define TRACE_FLAGS                                                \
                C(PRINT_PARENT,                "print-parent"),        \
                C(SYM_OFFSET,                "sym-offset"),                \
                C(SYM_ADDR,                "sym-addr"),                \
                C(VERBOSE,                "verbose"),                \
                C(RAW,                        "raw"),                        \
                C(HEX,                        "hex"),                        \
                C(BIN,                        "bin"),                        \
                C(BLOCK,                "block"),                \
                C(PRINTK,                "trace_printk"),        \
                C(ANNOTATE,                "annotate"),                \
                C(USERSTACKTRACE,        "userstacktrace"),        \
                C(SYM_USEROBJ,                "sym-userobj"),                \
                C(PRINTK_MSGONLY,        "printk-msg-only"),        \
                C(CONTEXT_INFO,                "context-info"),   /* Print pid/cpu/time */ \
                C(LATENCY_FMT,                "latency-format"),        \
                C(RECORD_CMD,                "record-cmd"),                \
                C(RECORD_TGID,                "record-tgid"),                \
                C(OVERWRITE,                "overwrite"),                \
                C(STOP_ON_FREE,                "disable_on_free"),        \
                C(IRQ_INFO,                "irq-info"),                \
                C(MARKERS,                "markers"),                \
                C(EVENT_FORK,                "event-fork"),                \
                C(PAUSE_ON_TRACE,        "pause-on-trace"),        \
                FUNCTION_FLAGS                                        \
                FGRAPH_FLAGS                                        \
                STACK_FLAGS                                        \
                BRANCH_FLAGS

/*
 * By defining C, we can make TRACE_FLAGS a list of bit names
 * that will define the bits for the flag masks.
 *
 * Each entry becomes TRACE_ITER_<NAME>_BIT, numbered consecutively from
 * zero; the option string argument is ignored in this expansion.
 */
#undef C
#define C(a, b) TRACE_ITER_##a##_BIT

enum trace_iterator_bits {
        TRACE_FLAGS
        /* Make sure we don't go more than we have bits for */
        /* As the final enumerator, this equals the number of bits defined above */
        TRACE_ITER_LAST_BIT
};

/*
 * By redefining C, we can make TRACE_FLAGS a list of masks that
 * use the bits as defined above.
 *
 * Each entry becomes TRACE_ITER_<NAME> = 1 << TRACE_ITER_<NAME>_BIT.
 * NOTE(review): the shift is done on a plain int constant, so bit
 * positions are expected to stay below 31.
 */
#undef C
#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT)

enum trace_iterator_flags { TRACE_FLAGS };

/*
 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
 * control the output of kernel symbols.
 */
#define TRACE_ITER_SYM_MASK \
        (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)

extern struct tracer nop_trace;

#ifdef CONFIG_BRANCH_TRACER
extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
/*
 * Enable branch tracing for @tr if TRACE_ITER_BRANCH is set in its
 * trace_flags.  Returns the result of enable_branch_tracing(), or 0 when
 * the option is not set.
 */
static inline int trace_branch_enable(struct trace_array *tr)
{
        if (!(tr->trace_flags & TRACE_ITER_BRANCH))
                return 0;

        return enable_branch_tracing(tr);
}
/*
 * Unconditionally disable branch tracing; unlike trace_branch_enable()
 * no option flag is checked first.
 */
static inline void trace_branch_disable(void)
{
        /* due to races, always disable */
        disable_branch_tracing();
}
#else
/* Stubs for builds without CONFIG_BRANCH_TRACER. */

/* Nothing to enable; reports success (0). */
static inline int trace_branch_enable(struct trace_array *tr)
{
        return 0;
}
/* Nothing to disable. */
static inline void trace_branch_disable(void)
{
}
#endif /* CONFIG_BRANCH_TRACER */

/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);

/*
 * Describes one field of a trace event, as returned by
 * trace_find_event_field().
 */
struct ftrace_event_field {
        struct list_head        link;
        const char                *name;
        const char                *type;
        /* one of the FILTER_* values tested by is_string_field()/is_function_field() */
        int                        filter_type;
        /* presumably the field's position and width within the event record - confirm */
        int                        offset;
        int                        size;
        int                        is_signed;
};

struct prog_entry;

/*
 * An event filter, as produced by create_event_filter() and released
 * with free_event_filter().
 */
struct event_filter {
        /* RCU-annotated pointer to the opaque filter program */
        struct prog_entry __rcu        *prog;
        /* presumably the textual form of the filter - confirm in trace_events_filter.c */
        char                        *filter_string;
};

/* A named event subsystem with an optional filter and a reference count. */
struct event_subsystem {
        struct list_head        list;
        const char                *name;
        struct event_filter        *filter;
        int                        ref_count;
};

/*
 * Ties an event_subsystem to a trace_array and a dentry.
 * NOTE(review): presumably one instance per subsystem directory of a
 * trace instance - confirm in trace_events.c.
 */
struct trace_subsystem_dir {
        struct list_head                list;
        struct event_subsystem                *subsystem;
        struct trace_array                *tr;
        struct dentry                        *entry;
        int                                ref_count;
        int                                nr_events;
};

extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
                                     struct trace_buffer *buffer,
                                     struct ring_buffer_event *event);

void trace_buffer_unlock_commit_regs(struct trace_array *tr,
                                     struct trace_buffer *buffer,
                                     struct ring_buffer_event *event,
                                     unsigned long flags, int pc,
                                     struct pt_regs *regs);

/*
 * Convenience wrapper around trace_buffer_unlock_commit_regs() that
 * passes NULL for the pt_regs argument.
 */
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
                                              struct trace_buffer *buffer,
                                              struct ring_buffer_event *event,
                                              unsigned long flags, int pc)
{
        trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
}

DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);

/*
 * Drop @event without committing it.
 *
 * If @event is this CPU's trace_buffered_event, only the per-CPU
 * trace_buffered_event_cnt is decremented; otherwise the event is
 * discarded from @buffer with ring_buffer_discard_commit().
 */
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
                             struct ring_buffer_event *event)
{
        if (this_cpu_read(trace_buffered_event) != event) {
                ring_buffer_discard_commit(buffer, event);
                return;
        }

        /* Simply release the temp buffer */
        this_cpu_dec(trace_buffered_event_cnt);
}

/*
 * Helper function for event_trigger_unlock_commit{_regs}().
 *
 * When EVENT_FILE_FL_TRIGGER_COND is set for @file, the triggers that
 * need the event's field data are run first via event_triggers_call()
 * and their result is stored in *@tt; @entry already holds the field
 * information of the current event.
 *
 * The event is then discarded when any of these holds:
 *  - the event file is soft disabled (the event was only recorded to
 *    process triggers),
 *  - an event filter is active and @entry does not match it,
 *  - PID filtering is active and trace_event_ignore_this_pid() says to
 *    ignore it.
 *
 * Returns true if the event is discarded, false otherwise.
 */
static inline bool
__event_trigger_test_discard(struct trace_event_file *file,
                             struct trace_buffer *buffer,
                             struct ring_buffer_event *event,
                             void *entry,
                             enum event_trigger_type *tt)
{
        unsigned long trigger_flags = file->flags;
        bool drop;

        if (trigger_flags & EVENT_FILE_FL_TRIGGER_COND)
                *tt = event_triggers_call(file, entry, event);

        /* Fast path: none of the discard conditions can apply */
        if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
                                    EVENT_FILE_FL_FILTERED |
                                    EVENT_FILE_FL_PID_FILTER))))
                return false;

        if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
                drop = true;
        else if ((file->flags & EVENT_FILE_FL_FILTERED) &&
                 !filter_match_preds(file->filter, entry))
                drop = true;
        else
                drop = (file->flags & EVENT_FILE_FL_PID_FILTER) &&
                       trace_event_ignore_this_pid(file);

        if (drop)
                __trace_event_discard_commit(buffer, event);

        return drop;
}

/**
 * event_trigger_unlock_commit - handle triggers and finish event commit
 * @file: The file pointer assoctiated to the event
 * @buffer: The ring buffer that the event is being written to
 * @event: The event meta data in the ring buffer
 * @entry: The event itself
 * @irq_flags: The state of the interrupts at the start of the event
 * @pc: The state of the preempt count at the start of the event.
 *
 * This is a helper function to handle triggers that require data
 * from the event itself. It also tests the event against filters and
 * if the event is soft disabled and should be discarded.
 */
static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
                            struct trace_buffer *buffer,
                            struct ring_buffer_event *event,
                            void *entry, unsigned long irq_flags, int pc)
{
        enum event_trigger_type tt = ETT_NONE;

        if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
                trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);

        if (tt)
                event_triggers_post_call(file, tt);
}

/**
 * event_trigger_unlock_commit_regs - handle triggers and finish event commit
 * @file: The file pointer assoctiated to the event
 * @buffer: The ring buffer that the event is being written to
 * @event: The event meta data in the ring buffer
 * @entry: The event itself
 * @irq_flags: The state of the interrupts at the start of the event
 * @pc: The state of the preempt count at the start of the event.
 *
 * This is a helper function to handle triggers that require data
 * from the event itself. It also tests the event against filters and
 * if the event is soft disabled and should be discarded.
 *
 * Same as event_trigger_unlock_commit() but calls
 * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
 */
static inline void
event_trigger_unlock_commit_regs(struct trace_event_file *file,
                                 struct trace_buffer *buffer,
                                 struct ring_buffer_event *event,
                                 void *entry, unsigned long irq_flags, int pc,
                                 struct pt_regs *regs)
{
        enum event_trigger_type tt = ETT_NONE;

        if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
                trace_buffer_unlock_commit_regs(file->tr, buffer, event,
                                                irq_flags, pc, regs);

        if (tt)
                event_triggers_post_call(file, tt);
}

#define FILTER_PRED_INVALID        ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT        (1 << 15)
#define FILTER_PRED_FOLD        (1 << 15)

/*
 * The max preds is the size of unsigned short with
 * two flags at the MSBs. One bit is used for both the IS_RIGHT
 * and FOLD flags. The other is reserved.
 *
 * 2^14 preds is way more than enough.
 */
#define MAX_FILTER_PRED                16384

struct filter_pred;
struct regex;

typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);

typedef int (*regex_match_func)(char *str, struct regex *r, int len);

/*
 * Kind of pattern match, as returned by filter_parse_regex().
 * NOTE(review): the names suggest where in the string the pattern must
 * match (whole, front, middle, end), glob matching and index matching -
 * confirm against trace_events_filter.c.
 */
enum regex_type {
        MATCH_FULL = 0,
        MATCH_FRONT_ONLY,
        MATCH_MIDDLE_ONLY,
        MATCH_END_ONLY,
        MATCH_GLOB,
        MATCH_INDEX,
};

/*
 * A pattern together with the function used to match a string against it.
 */
struct regex {
        /* pattern storage, MAX_FILTER_STR_VAL bytes */
        char                        pattern[MAX_FILTER_STR_VAL];
        /* presumably the length of the pattern - confirm */
        int                        len;
        int                        field_len;
        /* matcher callback; receives the string, this regex and a length */
        regex_match_func        match;
};

/*
 * One filter predicate.  @fn is the evaluation callback and is handed
 * the predicate itself plus a pointer to the event.
 */
struct filter_pred {
        filter_pred_fn_t         fn;
        u64                         val;
        struct regex                regex;
        unsigned short                *ops;
        /* event field this predicate refers to */
        struct ftrace_event_field *field;
        int                         offset;
        int                        not;
        int                         op;
};

/*
 * Returns true when @field's filter_type is one of the string kinds:
 * FILTER_DYN_STRING, FILTER_STATIC_STRING, FILTER_PTR_STRING or
 * FILTER_COMM.
 */
static inline bool is_string_field(struct ftrace_event_field *field)
{
        switch (field->filter_type) {
        case FILTER_DYN_STRING:
        case FILTER_STATIC_STRING:
        case FILTER_PTR_STRING:
        case FILTER_COMM:
                return true;
        default:
                return false;
        }
}

/* Returns true when @field's filter_type is FILTER_TRACE_FN. */
static inline bool is_function_field(struct ftrace_event_field *field)
{
        return field->filter_type == FILTER_TRACE_FN;
}

extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct trace_event_file *file,
                               struct trace_seq *s);
extern int apply_event_filter(struct trace_event_file *file,
                              char *filter_string);
extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
                                        char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
                                         struct trace_seq *s);
extern int filter_assign_type(const char *type);
extern int create_event_filter(struct trace_array *tr,
                               struct trace_event_call *call,
                               char *filter_str, bool set_str,
                               struct event_filter **filterp);
extern void free_event_filter(struct event_filter *filter);

struct ftrace_event_field *
trace_find_event_field(struct trace_event_call *call, char *name);

extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);

extern int event_trace_init(void);
extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
extern void __trace_early_add_events(struct trace_array *tr);

extern struct trace_event_file *__find_event_file(struct trace_array *tr,
                                                  const char *system,
                                                  const char *event);
extern struct trace_event_file *find_event_file(struct trace_array *tr,
                                                const char *system,
                                                const char *event);

/*
 * Returns the i_private pointer of the inode behind @filp.
 * The pointer is fetched with READ_ONCE(), i.e. as a single load.
 */
static inline void *event_file_data(struct file *filp)
{
        return READ_ONCE(file_inode(filp)->i_private);
}

extern struct mutex event_mutex;
extern struct list_head ftrace_events;

extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
extern const struct file_operations event_inject_fops;

/*
 * Hist trigger command registration.  Without CONFIG_HIST_TRIGGERS both
 * helpers are stubs that return 0.
 */
#ifdef CONFIG_HIST_TRIGGERS
extern int register_trigger_hist_cmd(void);
extern int register_trigger_hist_enable_disable_cmds(void);
#else
static inline int register_trigger_hist_cmd(void) { return 0; }
static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
#endif

extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);

/*
 * Per-trigger state.  A pointer to this structure is what the
 * event_trigger_ops callbacks (func/init/free/print) receive as @data.
 */
struct event_trigger_data {
        unsigned long                        count;
        int                                ref;
        /* callbacks for this trigger */
        struct event_trigger_ops        *ops;
        /* NOTE(review): presumably the event_command that created the trigger - confirm */
        struct event_command                *cmd_ops;
        /* RCU-annotated pointer to an optional filter */
        struct event_filter __rcu        *filter;
        char                                *filter_str;
        void                                *private_data;
        bool                                paused;
        bool                                paused_tmp;
        struct list_head                list;
        /* NOTE(review): name/named_list/named_data look tied to the *_named_trigger() helpers - confirm */
        char                                *name;
        struct list_head                named_list;
        struct event_trigger_data        *named_data;
};

/* Avoid typos */
#define ENABLE_EVENT_STR        "enable_event"
#define DISABLE_EVENT_STR        "disable_event"
#define ENABLE_HIST_STR                "enable_hist"
#define DISABLE_HIST_STR        "disable_hist"

/*
 * Data for an enable/disable trigger.
 * NOTE(review): presumably @file is the target event file, @enable picks
 * enable vs. disable and @hist picks the *_hist vs. *_event variant (see
 * the *_STR names above) - confirm in the trigger implementation.
 */
struct enable_trigger_data {
        struct trace_event_file                *file;
        bool                                enable;
        bool                                hist;
};

extern int event_enable_trigger_print(struct seq_file *m,
                                      struct event_trigger_ops *ops,
                                      struct event_trigger_data *data);
extern void event_enable_trigger_free(struct event_trigger_ops *ops,
                                      struct event_trigger_data *data);
extern int event_enable_trigger_func(struct event_command *cmd_ops,
                                     struct trace_event_file *file,
                                     char *glob, char *cmd, char *param);
extern int event_enable_register_trigger(char *glob,
                                         struct event_trigger_ops *ops,
                                         struct event_trigger_data *data,
                                         struct trace_event_file *file);
extern void event_enable_unregister_trigger(char *glob,
                                            struct event_trigger_ops *ops,
                                            struct event_trigger_data *test,
                                            struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_ops *ops,
                              struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
                                              int trigger_enable);
extern void update_cond_flag(struct trace_event_file *file);
extern int set_trigger_filter(char *filter_str,
                              struct event_trigger_data *trigger_data,
                              struct trace_event_file *file);
extern struct event_trigger_data *find_named_trigger(const char *name);
extern bool is_named_trigger(struct event_trigger_data *test);
extern int save_named_trigger(const char *name,
                              struct event_trigger_data *data);
extern void del_named_trigger(struct event_trigger_data *data);
extern void pause_named_trigger(struct event_trigger_data *data);
extern void unpause_named_trigger(struct event_trigger_data *data);
extern void set_named_trigger_data(struct event_trigger_data *data,
                                   struct event_trigger_data *named_data);
extern struct event_trigger_data *
get_named_trigger_data(struct event_trigger_data *data);
extern int register_event_command(struct event_command *cmd);
extern int unregister_event_command(struct event_command *cmd);
extern int register_trigger_hist_enable_disable_cmds(void);

extern void event_file_get(struct trace_event_file *file);
extern void event_file_put(struct trace_event_file *file);

/**
 * struct event_trigger_ops - callbacks for trace event triggers
 *
 * The methods in this structure provide per-event trigger hooks for
 * various trigger operations.
 *
 * All the methods below, except for @init() and @free(), must be
 * implemented.
 *
 * @func: The trigger 'probe' function called when the triggering
 *        event occurs.  The data passed into this callback is the data
 *        that was supplied to the event_command @reg() function that
 *        registered the trigger (see struct event_command) along with
 *        the trace record, rec, and the ring buffer event, rbe.
 *
 * @init: An optional initialization function called for the trigger
 *        when the trigger is registered (via the event_command reg()
 *        function).  This can be used to perform per-trigger
 *        initialization such as incrementing a per-trigger reference
 *        count, for instance.  This is usually implemented by the
 *        generic utility function @event_trigger_init() (see
 *        trace_event_triggers.c).
 *
 * @free: An optional de-initialization function called for the
 *        trigger when the trigger is unregistered (via the
 *        event_command @unreg() function).  This can be used to perform
 *        per-trigger de-initialization such as decrementing a
 *        per-trigger reference count and freeing corresponding trigger
 *        data, for instance.  This is usually implemented by the
 *        generic utility function @event_trigger_free() (see
 *        trace_event_triggers.c).
 *
 * @print: The callback function invoked to have the trigger print
 *        itself.  This is usually implemented by a wrapper function
 *        that calls the generic utility function @event_trigger_print()
 *        (see trace_event_triggers.c).
 */
struct event_trigger_ops {
        void                        (*func)(struct event_trigger_data *data,
                                        void *rec,
                                        struct ring_buffer_event *rbe);
        int                        (*init)(struct event_trigger_ops *ops,
                                        struct event_trigger_data *data);
        void                        (*free)(struct event_trigger_ops *ops,
                                        struct event_trigger_data *data);
        int                        (*print)(struct seq_file *m,
                                         struct event_trigger_ops *ops,
                                         struct event_trigger_data *data);
};

/**
 * struct event_command - callbacks and data members for event commands
 *
 * Event commands are invoked by users by writing the command name
 * into the 'trigger' file associated with a trace event.  The
 * parameters associated with a specific invocation of an event
 * command are used to create an event trigger instance, which is
 * added to the list of trigger instances associated with that trace
 * event.  When the event is hit, the set of triggers associated with
 * that event is invoked.
 *
 * The data members in this structure provide per-event command data
 * for various event commands.
 *
 * All the data members below, except for @flags, must be set
 * for each event command.
 *
 * @name: The unique name that identifies the event command.  This is
 *        the name used when setting triggers via trigger files.
 *
 * @trigger_type: A unique id that identifies the event command
 *        'type'.  This value has two purposes, the first to ensure that
 *        only one trigger of the same type can be set at a given time
 *        for a particular event e.g. it doesn't make sense to have both
 *        a traceon and traceoff trigger attached to a single event at
 *        the same time, so traceon and traceoff have the same type
 *        though they have different names.  The @trigger_type value is
 *        also used as a bit value for deferring the actual trigger
 *        action until after the current event is finished.  Some
 *        commands need to do this if they themselves log to the trace
 *        buffer (see EVENT_CMD_FL_POST_TRIGGER in enum
 *        event_command_flags below).  @trigger_type
 *        values are defined by adding new values to the trigger_type
 *        enum in include/linux/trace_events.h.
 *
 * @flags: See the enum event_command_flags below.
 *
 * All the methods below, except for @set_filter() and @unreg_all(),
 * must be implemented.
 *
 * @func: The callback function responsible for parsing and
 *        registering the trigger written to the 'trigger' file by the
 *        user.  It allocates the trigger instance and registers it with
 *        the appropriate trace event.  It makes use of the other
 *        event_command callback functions to orchestrate this, and is
 *        usually implemented by the generic utility function
 *        @event_trigger_callback() (see trace_event_triggers.c).
 *
 * @reg: Adds the trigger to the list of triggers associated with the
 *        event, and enables the event trigger itself, after
 *        initializing it (via the event_trigger_ops @init() function).
 *        This is also where commands can use the @trigger_type value to
 *        make the decision as to whether or not multiple instances of
 *        the trigger should be allowed.  This is usually implemented by
 *        the generic utility function @register_trigger() (see
 *        trace_event_triggers.c).
 *
 * @unreg: Removes the trigger from the list of triggers associated
 *        with the event, and disables the event trigger itself, after
 *        de-initializing it (via the event_trigger_ops @free() function).
 *        This is usually implemented by the generic utility function
 *        @unregister_trigger() (see trace_event_triggers.c).
 *
 * @unreg_all: An optional function called to remove all the triggers
 *        from the list of triggers associated with the event.  Called
 *        when a trigger file is opened in truncate mode.
 *
 * @set_filter: An optional function called to parse and set a filter
 *        for the trigger.  If no @set_filter() method is set for the
 *        event command, filters set by the user for the command will be
 *        ignored.  This is usually implemented by the generic utility
 *        function @set_trigger_filter() (see trace_event_triggers.c).
 *
 * @get_trigger_ops: The callback function invoked to retrieve the
 *        event_trigger_ops implementation associated with the command.
 */
struct event_command {
        struct list_head        list;
        char                        *name;
        enum event_trigger_type        trigger_type;
        int                        flags;
        int                        (*func)(struct event_command *cmd_ops,
                                        struct trace_event_file *file,
                                        char *glob, char *cmd, char *params);
        int                        (*reg)(char *glob,
                                       struct event_trigger_ops *ops,
                                       struct event_trigger_data *data,
                                       struct trace_event_file *file);
        void                        (*unreg)(char *glob,
                                         struct event_trigger_ops *ops,
                                         struct event_trigger_data *data,
                                         struct trace_event_file *file);
        void                        (*unreg_all)(struct trace_event_file *file);
        int                        (*set_filter)(char *filter_str,
                                              struct event_trigger_data *data,
                                              struct trace_event_file *file);
        struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};

/**
 * enum event_command_flags - flags for struct event_command
 *
 * @EVENT_CMD_FL_POST_TRIGGER: A flag that says whether or not this
 *        command needs
 *        to have its action delayed until after the current event has
 *        been closed.  Some triggers need to avoid being invoked while
 *        an event is currently in the process of being logged, since
 *        the trigger may itself log data into the trace buffer.  Thus
 *        we make sure the current event is committed before invoking
 *        those triggers.  To do that, the trigger invocation is split
 *        in two - the first part checks the filter using the current
 *        trace record; if a command has this flag set, it
 *        sets a bit for itself in the return value, otherwise it
 *        directly invokes the trigger.  Once all commands have been
 *        either invoked or set their return flag, the current record is
 *        either committed or discarded.  At that point, if any commands
 *        have deferred their triggers, those commands are finally
 *        invoked following the close of the current event.  In other
 *        words, if the event_trigger_ops @func() probe implementation
 *        itself logs to the trace buffer, this flag should be set,
 *        otherwise it can be left unspecified.
 *
 * @EVENT_CMD_FL_NEEDS_REC: A flag that says whether or not this
 *        command needs
 *        access to the trace record in order to perform its function,
 *        regardless of whether or not it has a filter associated with
 *        it (filters make a trigger require access to the trace record
 *        but are not always present).
 */
enum event_command_flags {
        EVENT_CMD_FL_POST_TRIGGER        = 1,
        EVENT_CMD_FL_NEEDS_REC                = 2,
};

/* True when @cmd_ops has EVENT_CMD_FL_POST_TRIGGER set in its flags. */
static inline bool event_command_post_trigger(struct event_command *cmd_ops)
{
        const int post_bit = cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER;

        return post_bit;
}

/* True when @cmd_ops has EVENT_CMD_FL_NEEDS_REC set in its flags. */
static inline bool event_command_needs_rec(struct event_command *cmd_ops)
{
        const int rec_bit = cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC;

        return rec_bit;
}

/* Enable/disable a trace event file, optionally as a soft disable. */
extern int trace_event_enable_disable(struct trace_event_file *file,
                                      int enable, int soft_disable);
/* Snapshot helpers; the *_cond_* variants carry caller-supplied cond_data. */
extern int tracing_alloc_snapshot(void);
extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);

extern int tracing_snapshot_cond_disable(struct trace_array *tr);
extern void *tracing_cond_snapshot_data(struct trace_array *tr);

/*
 * Start/stop bounds of the __trace_bprintk_fmt string array.
 * NOTE(review): presumably linker-provided section markers - confirm.
 */
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];

/*
 * Start/stop bounds of the __tracepoint_str string array.
 * NOTE(review): presumably linker-provided section markers - confirm.
 */
extern const char *__start___tracepoint_str[];
extern const char *__stop___tracepoint_str[];

void trace_printk_control(bool enabled);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);

/* Used from boot time tracer */
extern int trace_set_options(struct trace_array *tr, char *option);
extern int tracing_set_tracer(struct trace_array *tr, const char *buf);
extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
                                          unsigned long size, int cpu_id);
extern int tracing_set_cpumask(struct trace_array *tr,
                                cpumask_var_t tracing_cpumask_new);


/* NOTE(review): presumably the maximum event name length in bytes - confirm at the users. */
#define MAX_EVENT_NAME_LEN        64

/*
 * Command runners: both take a createfn callback with an
 * (int, char **) argument list; the second variant reads the command
 * text from a user-space buffer.
 */
extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
extern ssize_t trace_parse_run_command(struct file *file,
                const char __user *buffer, size_t count, loff_t *ppos,
                int (*createfn)(int, char**));

/* Error reporting helpers for a trace_array. */
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
                            const char *loc, const char *cmd,
                            const char **errs, u8 type, u8 pos);

/*
 * Normal trace_printk() and friends allocate special buffers
 * to do the manipulation, and also save the print formats
 * into sections to display. But the trace infrastructure wants
 * to use these without the added overhead at the price of being
 * a bit slower (used mainly for warnings, where we don't care
 * about performance). The internal_trace_puts() is for such
 * a purpose.
 *
 * Note that @str is expanded twice (once as the string and once inside
 * strlen()), so the argument must not have side effects.
 */
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))

/*
 * Redefine the FTRACE_ENTRY*() macros so that including trace_entries.h
 * below expands every entry into an extern declaration of its
 * struct trace_event_call, named event_<call>.  The _DUP and _PACKED
 * variants forward to FTRACE_ENTRY() and so produce the same declaration.
 */
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print)        \
        extern struct trace_event_call                                        \
        __aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)        \
        FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \
        FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))

#include "trace_entries.h"

/*
 * perf_ftrace_event_register() is only declared when both perf events
 * and the function tracer are configured; otherwise the name expands
 * to NULL.
 * NOTE(review): presumably so it can still be used as a callback
 * pointer initializer - confirm at the use sites.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
int perf_ftrace_event_register(struct trace_event_call *call,
                               enum trace_reg type, void *data);
#else
#define perf_ftrace_event_register NULL
#endif

/*
 * Syscall tracing hooks.  Without CONFIG_FTRACE_SYSCALLS the init hook
 * is an empty stub and get_syscall_name() always returns NULL.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
void init_ftrace_syscalls(void);
const char *get_syscall_name(int syscall);
#else
static inline void init_ftrace_syscalls(void) { }
static inline const char *get_syscall_name(int syscall)
{
        return NULL;
}
#endif

/*
 * Event tracing hooks.  Without CONFIG_EVENT_TRACING only
 * trace_event_init() and trace_event_eval_update() get empty stubs;
 * ftrace_set_clr_event() and trigger_process_regex() are not declared
 * at all in that configuration.
 */
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
void trace_event_eval_update(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif

/*
 * Per-instance snapshot hooks.  Without CONFIG_TRACER_SNAPSHOT taking a
 * snapshot is a no-op and the allocation stub reports success (0).
 */
#ifdef CONFIG_TRACER_SNAPSHOT
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
#else
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
        return 0;
}
#endif

/*
 * Preemption and hard-irq tracer hooks.  Each pair collapses to empty
 * inline stubs when its tracer (CONFIG_PREEMPT_TRACER or
 * CONFIG_IRQSOFF_TRACER) is not configured.
 */
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1);
void tracer_preempt_off(unsigned long a0, unsigned long a1);
#else
static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
#endif
#ifdef CONFIG_IRQSOFF_TRACER
void tracer_hardirqs_on(unsigned long a0, unsigned long a1);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1);
#else
static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif

/* Iterator defined elsewhere; NOTE(review): usage not visible here. */
extern struct trace_iterator *tracepoint_print_iter;

/*
 * Put a trace_iterator back into a state where it can read consumed data.
 * An iterator normally reads data that is not consumed and therefore has
 * to keep its state; this clears everything from the 'seq' member to the
 * end of the structure and rewinds the position to -1.
 */
static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
{
        const size_t start = offsetof(struct trace_iterator, seq);
        char *tail = (char *)iter + start;

        /*
         * Clearing through a byte pointer keeps gcc from complaining
         * about writing over more than a single structure member.
         */
        memset(tail, 0, sizeof(struct trace_iterator) - start);

        iter->pos = -1;
}

/*
 * Validate a name used for an event, group or field: the first character
 * must be a letter or '_', every following character a letter, digit or
 * '_'.  An empty string is rejected.
 */
static inline bool is_good_name(const char *name)
{
        const char *p = name;

        /* Leading character may not be a digit. */
        if (*p != '_' && !isalpha(*p))
                return false;

        for (p++; *p != '\0'; p++) {
                if (*p == '_' || isalpha(*p) || isdigit(*p))
                        continue;
                return false;
        }
        return true;
}

#endif /* _LINUX_KERNEL_TRACE_H */





































































    1 


    1 







    1 































































































































    2 




    1 
























































































    1 



    1 



    1 

























































































































































































    1 






























    1 

    1 






    1 

    1 




    1 




















































































































    1 






    1 







    1 
    1 



    1 

    1 
    1 

    1 
    1 



    1 













    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/bug.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * ->evict_inode for procfs: truncate the page cache, clear the inode and
 * then let go of whichever pid, proc_dir_entry and sysctl associations
 * the proc_inode still carries.
 */
static void proc_evict_inode(struct inode *inode)
{
        struct proc_inode *ei = PROC_I(inode);
        struct proc_dir_entry *pde;
        struct ctl_table_header *sysctl_head;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);

        /* Stop tracking the associated process, if any. */
        if (ei->pid) {
                proc_pid_evict_inode(ei);
                ei->pid = NULL;
        }

        /* Let go of the associated proc directory entry, if any. */
        pde = ei->pde;
        if (pde) {
                pde_put(pde);
                ei->pde = NULL;
        }

        /* Clear the sysctl pointer before handing the header over. */
        sysctl_head = ei->sysctl;
        if (!sysctl_head)
                return;

        WRITE_ONCE(ei->sysctl, NULL);
        proc_sys_evict_inode(inode, sysctl_head);
}

/*
 * Slab caches created once in proc_init_kmemcache(): one sized for
 * struct proc_inode, one sized for struct pde_opener.
 */
static struct kmem_cache *proc_inode_cachep __ro_after_init;
static struct kmem_cache *pde_opener_cache __ro_after_init;

/*
 * ->alloc_inode for procfs: take a proc_inode from the slab cache, reset
 * its proc-specific fields and hand back the embedded VFS inode.
 * Returns NULL when the allocation fails.
 */
static struct inode *proc_alloc_inode(struct super_block *sb)
{
        struct proc_inode *pi = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);

        if (!pi)
                return NULL;

        /* Nothing is attached to a fresh inode yet. */
        pi->pid = NULL;
        pi->fd = 0;
        pi->op.proc_get_link = NULL;
        pi->pde = NULL;
        pi->sysctl = NULL;
        pi->sysctl_entry = NULL;
        INIT_HLIST_NODE(&pi->sibling_inodes);
        pi->ns_ops = NULL;

        return &pi->vfs_inode;
}

/* ->free_inode for procfs: return the containing proc_inode to its cache. */
static void proc_free_inode(struct inode *inode)
{
        struct proc_inode *pi = PROC_I(inode);

        kmem_cache_free(proc_inode_cachep, pi);
}

/* Slab constructor: one-time initialisation of the embedded VFS inode. */
static void init_once(void *foo)
{
        struct proc_inode *pi = foo;

        inode_init_once(&pi->vfs_inode);
}

/*
 * Create the procfs slab caches: "proc_inode_cache" for struct proc_inode,
 * "pde_opener" for struct pde_opener and "proc_dir_entry" (with a usercopy
 * window over the inline name) for struct proc_dir_entry.
 */
void __init proc_init_kmemcache(void)
{
        /* Compile-time check: the structure must be smaller than SIZEOF_PDE. */
        BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);

        proc_inode_cachep = kmem_cache_create(
                "proc_inode_cache", sizeof(struct proc_inode), 0,
                SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT | SLAB_PANIC,
                init_once);

        pde_opener_cache = kmem_cache_create(
                "pde_opener", sizeof(struct pde_opener), 0,
                SLAB_ACCOUNT | SLAB_PANIC, NULL);

        proc_dir_entry_cache = kmem_cache_create_usercopy(
                "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC,
                offsetof(struct proc_dir_entry, inline_name),
                SIZEOF_PDE_INLINE_NAME, NULL);
}

/*
 * Empty the @inodes list and invalidate the dentries of every proc inode
 * that was on it.
 *
 * @inodes: hlist of proc_inode entries linked through ->sibling_inodes.
 * @lock:   spinlock taken around each removal from @inodes.
 *
 * The list is walked under rcu_read_lock(); the RCU read side is dropped
 * once an inode has been pinned with igrab() and re-taken before the next
 * iteration.  An active superblock reference (s_active) is held in
 * @old_sb and reused for consecutive inodes on the same superblock.
 */
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
        struct inode *inode;
        struct proc_inode *ei;
        struct hlist_node *node;
        struct super_block *old_sb = NULL;

        rcu_read_lock();
        for (;;) {
                struct super_block *sb;
                node = hlist_first_rcu(inodes);
                if (!node)
                        break;
                ei = hlist_entry(node, struct proc_inode, sibling_inodes);
                /* Unlink the entry so the next pass sees a new list head. */
                spin_lock(lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
                spin_unlock(lock);

                inode = &ei->vfs_inode;
                sb = inode->i_sb;
                /*
                 * A new superblock needs its own active reference; if
                 * s_active is already zero, skip this inode (still under
                 * rcu_read_lock()).
                 */
                if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
                        continue;
                inode = igrab(inode);
                rcu_read_unlock();
                /* Switch the held superblock reference over to sb. */
                if (sb != old_sb) {
                        if (old_sb)
                                deactivate_super(old_sb);
                        old_sb = sb;
                }
                /* igrab() failed: nothing to invalidate for this inode. */
                if (unlikely(!inode)) {
                        rcu_read_lock();
                        continue;
                }

                if (S_ISDIR(inode->i_mode)) {
                        /* Directories: invalidate a single alias. */
                        struct dentry *dir = d_find_any_alias(inode);
                        if (dir) {
                                d_invalidate(dir);
                                dput(dir);
                        }
                } else {
                        /* Non-directories: keep going until no alias is found. */
                        struct dentry *dentry;
                        while ((dentry = d_find_alias(inode))) {
                                d_invalidate(dentry);
                                dput(dentry);
                        }
                }
                iput(inode);

                rcu_read_lock();
        }
        rcu_read_unlock();
        /* Drop the last superblock reference still held, if any. */
        if (old_sb)
                deactivate_super(old_sb);
}

/*
 * Map a hidepid setting to the string shown in the mount options.
 * A value outside the enum triggers a one-time warning and yields
 * "unknown".
 */
static inline const char *hidepid2str(enum proc_hidepid v)
{
        switch (v) {
        case HIDEPID_OFF:
                return "off";
        case HIDEPID_NO_ACCESS:
                return "noaccess";
        case HIDEPID_INVISIBLE:
                return "invisible";
        case HIDEPID_NOT_PTRACEABLE:
                return "ptraceable";
        }

        WARN_ONCE(1, "bad hide_pid value: %d\n", v);
        return "unknown";
}

/*
 * ->show_options for procfs: append every mount option that differs from
 * its default (gid, hidepid, subset) to @seq.  Always returns 0.
 */
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
        struct proc_fs_info *info = proc_sb_info(root->d_sb);

        if (!gid_eq(info->pid_gid, GLOBAL_ROOT_GID)) {
                seq_printf(seq, ",gid=%u",
                           from_kgid_munged(&init_user_ns, info->pid_gid));
        }

        if (info->hide_pid != HIDEPID_OFF) {
                seq_printf(seq, ",hidepid=%s",
                           hidepid2str(info->hide_pid));
        }

        if (info->pidonly != PROC_PIDONLY_OFF) {
                seq_printf(seq, ",subset=pid");
        }

        return 0;
}

/*
 * Superblock operations for procfs.  Inode allocation, freeing, eviction
 * and option display use the proc-specific helpers defined above;
 * drop_inode and statfs use the generic/simple library helpers.
 */
const struct super_operations proc_sops = {
        .alloc_inode        = proc_alloc_inode,
        .free_inode        = proc_free_inode,
        .drop_inode        = generic_delete_inode,
        .evict_inode        = proc_evict_inode,
        .statfs                = simple_statfs,
        .show_options        = proc_show_options,
};

/*
 * Value added to ->in_use by proc_entry_rundown().  It sets the top bit,
 * so atomic_inc_unless_negative() in use_pde() fails from then on, and
 * unuse_pde() completes the rundown when the count drops back to BIAS.
 */
enum {BIAS = -1U<<31};

/*
 * Try to take a usage count on @pde.  Returns 1 on success and 0 when
 * ->in_use is already negative.
 */
static inline int use_pde(struct proc_dir_entry *pde)
{
        if (likely(atomic_inc_unless_negative(&pde->in_use)))
                return 1;

        return 0;
}

/*
 * Drop a usage count taken with use_pde().  If the counter lands exactly
 * on BIAS, signal ->pde_unload_completion.
 */
static void unuse_pde(struct proc_dir_entry *pde)
{
        int remaining = atomic_dec_return(&pde->in_use);

        if (unlikely(remaining == BIAS))
                complete(pde->pde_unload_completion);
}

/*
 * Run (or wait for) the ->proc_release hook of one opener of @pde.
 *
 * @pde:  entry whose ->pde_unload_lock is held by the caller.
 * @pdeo: opener record to close.
 *
 * pde is locked on entry, unlocked on exit
 */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
        __releases(&pde->pde_unload_lock)
{
        /*
         * close() (proc_reg_release()) can't delete an entry and proceed:
         * ->release hook needs to be available at the right moment.
         *
         * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
         * "struct file" needs to be available at the right moment.
         *
         * Therefore, first process to enter this function does ->release() and
         * signals its completion to the other process which does nothing.
         */
        if (pdeo->closing) {
                /* somebody else is doing that, just wait */
                DECLARE_COMPLETION_ONSTACK(c);
                /* Publish the on-stack completion before dropping the lock. */
                pdeo->c = &c;
                spin_unlock(&pde->pde_unload_lock);
                wait_for_completion(&c);
        } else {
                struct file *file;
                struct completion *c;

                /* Claim the close so a concurrent caller takes the wait path. */
                pdeo->closing = true;
                spin_unlock(&pde->pde_unload_lock);
                file = pdeo->file;
                pde->proc_ops->proc_release(file_inode(file), file);
                spin_lock(&pde->pde_unload_lock);
                /* After ->release. */
                list_del(&pdeo->lh);
                /* Pick up a waiter's completion, if one was registered. */
                c = pdeo->c;
                spin_unlock(&pde->pde_unload_lock);
                if (unlikely(c))
                        complete(c);
                kmem_cache_free(pde_opener_cache, pdeo);
        }
}

/*
 * Shut down all use of @de: add BIAS to ->in_use, wait for callers that
 * are still inside the entry's hooks, then close every opener left on
 * ->pde_openers.
 */
void proc_entry_rundown(struct proc_dir_entry *de)
{
        DECLARE_COMPLETION_ONSTACK(c);
        /* Wait until all existing callers into module are done. */
        de->pde_unload_completion = &c;
        /* A result other than BIAS means some users are still counted. */
        if (atomic_add_return(BIAS, &de->in_use) != BIAS)
                wait_for_completion(&c);

        /* ->pde_openers list can't grow from now on. */

        spin_lock(&de->pde_unload_lock);
        while (!list_empty(&de->pde_openers)) {
                struct pde_opener *pdeo;
                pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
                /* close_pdeo() returns with the lock dropped; retake it. */
                close_pdeo(de, pdeo);
                spin_lock(&de->pde_unload_lock);
        }
        spin_unlock(&de->pde_unload_lock);
}

/* Call the entry's ->proc_lseek hook, or default_llseek() when it is unset. */
static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
{
        typeof_member(struct proc_ops, proc_lseek) seek_fn = pde->proc_ops->proc_lseek;

        if (seek_fn)
                return seek_fn(file, offset, whence);

        return default_llseek(file, offset, whence);
}

/*
 * llseek on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -EINVAL when the entry can no longer be used.
 */
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t ret;

        if (pde_is_permanent(pde))
                return pde_lseek(pde, file, offset, whence);

        if (!use_pde(pde))
                return -EINVAL;

        ret = pde_lseek(pde, file, offset, whence);
        unuse_pde(pde);
        return ret;
}

/*
 * read_iter on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -EIO when the entry can no longer be used.
 */
static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
        ssize_t rv = -EIO;

        if (pde_is_permanent(pde)) {
                return pde->proc_ops->proc_read_iter(iocb, iter);
        } else if (use_pde(pde)) {
                rv = pde->proc_ops->proc_read_iter(iocb, iter);
                unuse_pde(pde);
        }
        return rv;
}

/* Call the entry's ->proc_read hook; -EIO when the hook is unset. */
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_read) read_fn = pde->proc_ops->proc_read;

        if (!read_fn)
                return -EIO;

        return read_fn(file, buf, count, ppos);
}

/*
 * read on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -EIO when the entry can no longer be used.
 */
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t ret;

        if (pde_is_permanent(pde))
                return pde_read(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        ret = pde_read(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return ret;
}

/* Call the entry's ->proc_write hook; -EIO when the hook is unset. */
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_write) write_fn = pde->proc_ops->proc_write;

        if (!write_fn)
                return -EIO;

        return write_fn(file, buf, count, ppos);
}

/*
 * write on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -EIO when the entry can no longer be used.
 */
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t ret;

        if (pde_is_permanent(pde))
                return pde_write(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        ret = pde_write(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return ret;
}

static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
        typeof_member(struct proc_ops, proc_poll) poll;

        poll = pde->proc_ops->proc_poll;
        if (poll)
                return poll(file, pts);
        return DEFAULT_POLLMASK;
}

/*
 * poll on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * report DEFAULT_POLLMASK when the entry can no longer be used.
 */
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        __poll_t mask;

        if (pde_is_permanent(pde))
                return pde_poll(pde, file, pts);

        if (!use_pde(pde))
                return DEFAULT_POLLMASK;

        mask = pde_poll(pde, file, pts);
        unuse_pde(pde);
        return mask;
}

static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_ioctl) ioctl;

        ioctl = pde->proc_ops->proc_ioctl;
        if (ioctl)
                return ioctl(file, cmd, arg);
        return -ENOTTY;
}

/*
 * ioctl on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -ENOTTY when the entry can no longer be used.
 */
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long ret;

        if (pde_is_permanent(pde))
                return pde_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        ret = pde_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return ret;
}

#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;

        compat_ioctl = pde->proc_ops->proc_compat_ioctl;
        if (compat_ioctl)
                return compat_ioctl(file, cmd, arg);
        return -ENOTTY;
}

/*
 * compat ioctl on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -ENOTTY when the entry can no longer be used.
 */
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long ret;

        if (pde_is_permanent(pde))
                return pde_compat_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        ret = pde_compat_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return ret;
}
#endif

static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
        typeof_member(struct proc_ops, proc_mmap) mmap;

        mmap = pde->proc_ops->proc_mmap;
        if (mmap)
                return mmap(file, vma);
        return -EIO;
}

/*
 * mmap on a registered proc file.  Permanent entries are called
 * directly; others are called under a use_pde()/unuse_pde() pair and
 * yield -EIO when the entry can no longer be used.
 */
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        int ret;

        if (pde_is_permanent(pde))
                return pde_mmap(pde, file, vma);

        if (!use_pde(pde))
                return -EIO;

        ret = pde_mmap(pde, file, vma);
        unuse_pde(pde);
        return ret;
}

static unsigned long
pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;

        get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
        if (!get_area)
                get_area = current->mm->get_unmapped_area;
#endif
        if (get_area)
                return get_area(file, orig_addr, len, pgoff, flags);
        return orig_addr;
}

/*
 * ->get_unmapped_area for regular /proc files: forward the request through
 * pde_get_unmapped_area().
 *
 * A permanent entry is called directly.  Any other entry is only called
 * between a successful use_pde() and the matching unuse_pde(); if use_pde()
 * fails nothing is called and -EIO (converted to unsigned long) is returned.
 */
static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        struct proc_dir_entry *entry = PDE(file_inode(file));
        unsigned long addr;

        if (pde_is_permanent(entry))
                return pde_get_unmapped_area(entry, file, orig_addr, len, pgoff, flags);

        if (!use_pde(entry))
                return -EIO;

        addr = pde_get_unmapped_area(entry, file, orig_addr, len, pgoff, flags);
        unuse_pde(entry);
        return addr;
}

/*
 * ->open for regular /proc files.
 *
 * Calls the entry's ->proc_open hook, if any, and returns its result (0 when
 * the entry has no hook).
 *
 * Permanent entries are opened directly, with none of the checks or
 * bookkeeping below.  For every other entry:
 *  - -ENOENT is returned if the mount has pidonly == PROC_PIDONLY_ON or if
 *    use_pde() fails;
 *  - if the entry has a ->proc_release hook, a pde_opener is allocated
 *    (-ENOMEM on failure) and, when the open succeeds, linked onto
 *    pde->pde_openers so the file can be found again at release time.
 */
static int proc_reg_open(struct inode *inode, struct file *file)
{
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct proc_dir_entry *pde = PDE(inode);
        int rv = 0;
        typeof_member(struct proc_ops, proc_open) open;
        typeof_member(struct proc_ops, proc_release) release;
        struct pde_opener *pdeo;

        /* Permanent entry: no use_pde()/unuse_pde() and no opener tracking. */
        if (pde_is_permanent(pde)) {
                open = pde->proc_ops->proc_open;
                if (open)
                        rv = open(inode, file);
                return rv;
        }

        if (fs_info->pidonly == PROC_PIDONLY_ON)
                return -ENOENT;

        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
         *    either normally by close()/->release, or forcefully by
         *    rmmod/remove_proc_entry.
         *
         * 2) rmmod isn't blocked by opening file in /proc and sitting on
         *    the descriptor (including "rmmod foo </proc/foo" scenario).
         *
         * Save every "struct file" with custom ->release hook.
         */
        if (!use_pde(pde))
                return -ENOENT;

        /*
         * Allocate the opener record up front: if this fails we bail out
         * before ->proc_open has been called.  pdeo is only assigned, and
         * only used below, when release is non-NULL.
         */
        release = pde->proc_ops->proc_release;
        if (release) {
                pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
                if (!pdeo) {
                        rv = -ENOMEM;
                        goto out_unuse;
                }
        }

        open = pde->proc_ops->proc_open;
        if (open)
                rv = open(inode, file);

        if (release) {
                if (rv == 0) {
                        /* To know what to release. */
                        pdeo->file = file;
                        pdeo->closing = false;
                        pdeo->c = NULL;
                        spin_lock(&pde->pde_unload_lock);
                        list_add(&pdeo->lh, &pde->pde_openers);
                        spin_unlock(&pde->pde_unload_lock);
                } else
                        /* Open failed: the record is not needed. */
                        kmem_cache_free(pde_opener_cache, pdeo);
        }

out_unuse:
        /* Balance the use_pde() above on every non-permanent path. */
        unuse_pde(pde);
        return rv;
}

/*
 * ->release for regular /proc files.
 *
 * Permanent entries call their ->proc_release hook directly and return its
 * result (0 when there is no hook).
 *
 * For every other entry, search pde->pde_openers for the pde_opener whose
 * ->file is @file and hand it to close_pdeo().  This path always returns 0,
 * whether or not a matching opener was found.
 */
static int proc_reg_release(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        struct pde_opener *pdeo;

        if (pde_is_permanent(pde)) {
                typeof_member(struct proc_ops, proc_release) release;

                release = pde->proc_ops->proc_release;
                if (release) {
                        return release(inode, file);
                }
                return 0;
        }

        spin_lock(&pde->pde_unload_lock);
        list_for_each_entry(pdeo, &pde->pde_openers, lh) {
                if (pdeo->file == file) {
                        /*
                         * NOTE(review): this path returns without a
                         * spin_unlock() here - presumably close_pdeo()
                         * drops pde->pde_unload_lock itself; confirm
                         * against its definition.
                         */
                        close_pdeo(pde, pdeo);
                        return 0;
                }
        }
        /* No opener recorded for this file: nothing to close. */
        spin_unlock(&pde->pde_unload_lock);
        return 0;
}

/*
 * file_operations that proc_get_inode() installs on a regular /proc file
 * whose entry has no read_iter hook (and, under CONFIG_COMPAT, no compat
 * ioctl hook).  Every method is a proc_reg_*() wrapper.
 */
static const struct file_operations proc_reg_file_ops = {
        /* lifetime */
        .open              = proc_reg_open,
        .release           = proc_reg_release,
        /* data path */
        .llseek            = proc_reg_llseek,
        .read              = proc_reg_read,
        .write             = proc_reg_write,
        .poll              = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl    = proc_reg_unlocked_ioctl,
        .mmap              = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

/*
 * file_operations that proc_get_inode() installs on a regular /proc file
 * whose entry has a read_iter hook (and, under CONFIG_COMPAT, no compat
 * ioctl hook).  Reads go through ->read_iter, and ->splice_read is
 * generic_file_splice_read; the remaining methods are proc_reg_*() wrappers.
 */
static const struct file_operations proc_iter_file_ops = {
        /* lifetime */
        .open              = proc_reg_open,
        .release           = proc_reg_release,
        /* data path */
        .llseek            = proc_reg_llseek,
        .read_iter         = proc_reg_read_iter,
        .splice_read       = generic_file_splice_read,
        .write             = proc_reg_write,
        .poll              = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl    = proc_reg_unlocked_ioctl,
        .mmap              = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

#ifdef CONFIG_COMPAT
/*
 * CONFIG_COMPAT variant of the ->read based table: the same methods plus
 * ->compat_ioctl.  proc_get_inode() installs it on a regular /proc file
 * whose entry has a compat ioctl hook but no read_iter hook.
 */
static const struct file_operations proc_reg_file_ops_compat = {
        /* lifetime */
        .open              = proc_reg_open,
        .release           = proc_reg_release,
        /* data path */
        .llseek            = proc_reg_llseek,
        .read              = proc_reg_read,
        .write             = proc_reg_write,
        .poll              = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl    = proc_reg_unlocked_ioctl,
        .compat_ioctl      = proc_reg_compat_ioctl,
        .mmap              = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

/*
 * CONFIG_COMPAT variant of the ->read_iter based table: the same methods
 * plus ->compat_ioctl.  proc_get_inode() installs it on a regular /proc
 * file whose entry has both a compat ioctl hook and a read_iter hook.
 */
static const struct file_operations proc_iter_file_ops_compat = {
        /* lifetime */
        .open              = proc_reg_open,
        .release           = proc_reg_release,
        /* data path */
        .llseek            = proc_reg_llseek,
        .read_iter         = proc_reg_read_iter,
        .splice_read       = generic_file_splice_read,
        .write             = proc_reg_write,
        .poll              = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl    = proc_reg_unlocked_ioctl,
        .compat_ioctl      = proc_reg_compat_ioctl,
        .mmap              = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};
#endif

/*
 * Delayed-call callback registered by proc_get_link(): @p is the
 * proc_dir_entry on which use_pde() succeeded; release it with unuse_pde().
 */
static void proc_put_link(void *p)
{
        struct proc_dir_entry *entry = p;

        unuse_pde(entry);
}

/*
 * ->get_link for /proc symlinks.
 *
 * Returns the entry's ->data as the link target.  The entry is held with
 * use_pde() and proc_put_link() is registered on @done to release it.
 * Returns ERR_PTR(-EINVAL) if use_pde() fails; @dentry is not used.
 */
static const char *proc_get_link(struct dentry *dentry,
                                 struct inode *inode,
                                 struct delayed_call *done)
{
        struct proc_dir_entry *entry = PDE(inode);

        if (use_pde(entry)) {
                set_delayed_call(done, proc_put_link, entry);
                return entry->data;
        }

        return ERR_PTR(-EINVAL);
}

/* inode_operations whose only method is ->get_link, served by proc_get_link(). */
const struct inode_operations proc_link_inode_operations = {
        .get_link        = proc_get_link,
};

struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
        struct inode *inode = new_inode(sb);

        if (!inode) {
                pde_put(de);
                return NULL;
        }

        inode->i_ino = de->low_ino;
        inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
        PROC_I(inode)->pde = de;
        if (is_empty_pde(de)) {
                make_empty_dir_inode(inode);
                return inode;
        }

        if (de->mode) {
                inode->i_mode = de->mode;
                inode->i_uid = de->uid;
                inode->i_gid = de->gid;
        }
        if (de->size)
                inode->i_size = de->size;
        if (de->nlink)
                set_nlink(inode, de->nlink);

        if (S_ISREG(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                if (pde_has_proc_read_iter(de))
                        inode->i_fop = &proc_iter_file_ops;
                else
                        inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
                if (pde_has_proc_compat_ioctl(de)) {
                        if (pde_has_proc_read_iter(de))
                                inode->i_fop = &proc_iter_file_ops_compat;
                        else
                                inode->i_fop = &proc_reg_file_ops_compat;
                }
#endif
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = de->proc_dir_ops;
        } else if (S_ISLNK(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = NULL;
        } else {
                BUG();
        }
        return inode;
}


























































































    2 













































    2 






    2 













































































































































































































































































































































































































































    2 



















    2 
    2 
    2 
    2 







































































































































    2 




    2 






    2 





































































































































    2 






    2 






















    2 
































    2 













    2 


    2 







    2 


    2 















    2 





    2 

    2 





    2 



    2 

    2 



    2 


    2 
    2 








    2 















    2 


    2 






    2 


    2 
    2 




    2 




    2 





    2 
    2 



    2 



    2 







    2 
    2 










    2 




    2 
    2 




    1 

    1 





    2 






















    2 







    2 











































    2 
    2 




    2 





    2 










    2 


    2 
    2 




    2 











    2 


    2 








    2 



    2 

    2 



    2 












    2 






    2 

    2 











    2 




    2 
















    2 














    2 
    2 



    2 






    2 






    2 



    2 





























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cred.h>

#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inode allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.
 */

/*
 * Mark the bits from @start_bit up to @end_bit in @bitmap as used.
 *
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 *
 * Bits from @start_bit up to the next multiple of 8 are set one at a time;
 * after that, (@end_bit - bit) >> 3 whole bytes are filled with 0xff, so a
 * trailing partial byte is left untouched.  Nothing is done when
 * @start_bit >= @end_bit.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
        unsigned long byte_boundary;
        int bit;

        if (start_bit >= end_bit)
                return;

        ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);

        /* Finish off the byte that start_bit falls in, bit by bit. */
        byte_boundary = (start_bit + 7) & ~7UL;
        bit = start_bit;
        while (bit < byte_boundary) {
                ext4_set_bit(bit, bitmap);
                bit++;
        }

        /* Everything after that is whole bytes. */
        if (bit < end_bit)
                memset(bitmap + (bit >> 3), 0xff, (end_bit - bit) >> 3);
}

/*
 * Finish a bitmap block read on @bh; ext4_read_inode_bitmap() passes this
 * function to ext4_read_bh().
 *
 * When @uptodate is non-zero, mark both the buffer and the bitmap as
 * uptodate.  In either case unlock the buffer and drop one buffer reference
 * with put_bh().
 */
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
        if (uptodate) {
                set_buffer_uptodate(bh);
                set_bitmap_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}

/*
 * Verify the checksum of the inode bitmap in @bh for @block_group.
 *
 * Returns 0 when the buffer is (or already was) verified, or when the
 * EXT4_FC_REPLAY bit is set in the mount state, in which case nothing is
 * checked.  Returns -EFSCORRUPTED if the group info is missing or already
 * has its inode bitmap flagged corrupt, and -EFSBADCRC if the checksum does
 * not verify (or a failure is simulated); in the latter case the error is
 * reported and the group's inode bitmap is marked corrupted.
 */
static int ext4_validate_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *desc,
                                      ext4_group_t block_group,
                                      struct buffer_head *bh)
{
        struct ext4_group_info *grp;
        ext4_fsblk_t bitmap_blk;
        int csum_ok;

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        grp = ext4_get_group_info(sb, block_group);

        /* Unlocked fast path: somebody already verified this buffer. */
        if (buffer_verified(bh))
                return 0;
        if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                return -EFSCORRUPTED;

        ext4_lock_group(sb, block_group);
        /* Re-check under the group lock before doing the checksum work. */
        if (!buffer_verified(bh)) {
                bitmap_blk = ext4_inode_bitmap(sb, desc);
                csum_ok = ext4_inode_bitmap_csum_verify(sb, block_group, desc,
                                                bh,
                                                EXT4_INODES_PER_GROUP(sb) / 8);
                if (!csum_ok || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
                        /* Drop the group lock before reporting the error. */
                        ext4_unlock_group(sb, block_group);
                        ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
                                   "inode_bitmap = %llu", block_group,
                                   bitmap_blk);
                        ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                        return -EFSBADCRC;
                }
                set_buffer_verified(bh);
        }
        ext4_unlock_group(sb, block_group);

        return 0;
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * NOTE(review): no bitmap-cache slot is visible in this function; the
 * buffer is obtained with sb_getblk().  Confirm whether the "slot" wording
 * above is stale.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on error:
 *   -EFSCORRUPTED  no group descriptor, bitmap block number out of range,
 *                  or block group 0 flagged INODE_UNINIT;
 *   -ENOMEM        sb_getblk() returned NULL;
 *   -EIO           the buffer is not uptodate after the read;
 *   otherwise      whatever ext4_validate_inode_bitmap() returned.
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh = NULL;
        ext4_fsblk_t bitmap_blk;
        int err;

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);

        /*
         * The bitmap block must lie after the first data block and before
         * the end of the file system.
         */
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                ext4_error(sb, "Invalid inode bitmap blk %llu in "
                           "block_group %u", bitmap_blk, block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return ERR_PTR(-EFSCORRUPTED);
        }
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_warning(sb, "Cannot read inode bitmap - "
                             "block_group = %u, inode_bitmap = %llu",
                             block_group, bitmap_blk);
                return ERR_PTR(-ENOMEM);
        }
        /* Bitmap already marked uptodate: only validation is left. */
        if (bitmap_uptodate(bh))
                goto verify;

        lock_buffer(bh);
        /* Re-check now that the buffer lock is held. */
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }

        ext4_lock_group(sb, block_group);
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
                if (block_group == 0) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Inode bitmap for bg 0 marked "
                                   "uninitialized");
                        err = -EFSCORRUPTED;
                        goto out;
                }
                /*
                 * Group flagged INODE_UNINIT: build the bitmap in memory
                 * instead of reading it.  Zero the bytes covering the
                 * group's inodes, then mark the bits from
                 * EXT4_INODES_PER_GROUP to the end of the block as used.
                 * The result is returned as uptodate and verified without
                 * going through the checksum validation below.
                 */
                memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
                ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
                                     sb->s_blocksize * 8, bh->b_data);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
        ext4_unlock_group(sb, block_group);

        if (buffer_uptodate(bh)) {
                /*
                 * The group is not INODE_UNINIT, so an uptodate buffer
                 * means the bitmap is uptodate as well.
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * submit the buffer_head for reading
         */
        trace_ext4_load_inode_bitmap(sb, block_group);
        ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
        if (!buffer_uptodate(bh)) {
                /* Read failed (or a failure was simulated). */
                put_bh(bh);
                ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
                               "block_group = %u, inode_bitmap = %llu",
                               block_group, bitmap_blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return ERR_PTR(-EIO);
        }

verify:
        err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
        if (err)
                goto out;
        return bh;
out:
        /* Error: drop our buffer reference and hand back the error code. */
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 *
 * Summary of what the function does: release the quota charge for @inode,
 * clear the inode, clear its bit in the group's inode bitmap and update
 * the free-inode / used-directory counts in the group descriptor, the
 * per-cpu counters and (when flex groups are enabled) the flex group.
 * Nothing is returned; a non-zero error is passed to ext4_std_error().
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        int is_directory;
        unsigned long ino;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
        ext4_group_t block_group;
        unsigned long bit;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err, count, cleared;
        struct ext4_group_info *grp;

        /*
         * Sanity checks: refuse (log and return) if the inode has no
         * superblock, is still referenced by someone else, or still has
         * links.
         */
        if (!sb) {
                printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
                       "nonexistent device\n", __func__, __LINE__);
                return;
        }
        if (atomic_read(&inode->i_count) > 1) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
                         __func__, __LINE__, inode->i_ino,
                         atomic_read(&inode->i_count));
                return;
        }
        if (inode->i_nlink) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
                         __func__, __LINE__, inode->i_ino, inode->i_nlink);
                return;
        }
        sbi = EXT4_SB(sb);

        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
        trace_ext4_free_inode(inode);

        dquot_initialize(inode);
        dquot_free_inode(inode);

        /* Remember the type now; it is needed after the inode is cleared. */
        is_directory = S_ISDIR(inode->i_mode);

        /* Do this BEFORE marking the inode not in use or returning an error */
        ext4_clear_inode(inode);

        es = sbi->s_es;
        if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
                ext4_error(sb, "reserved or nonexistent inode %lu", ino);
                goto error_return;
        }
        /* Inode numbers start at 1: derive the group and the bit within it. */
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        /* Don't bother if the inode bitmap is corrupt. */
        if (IS_ERR(bitmap_bh)) {
                fatal = PTR_ERR(bitmap_bh);
                bitmap_bh = NULL;
                goto error_return;
        }
        if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                grp = ext4_get_group_info(sb, block_group);
                if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
                        fatal = -EFSCORRUPTED;
                        goto error_return;
                }
        }

        BUFFER_TRACE(bitmap_bh, "get_write_access");
        fatal = ext4_journal_get_write_access(handle, bitmap_bh);
        if (fatal)
                goto error_return;

        /* -ESRCH is what gets reported if the group descriptor is missing. */
        fatal = -ESRCH;
        gdp = ext4_get_group_desc(sb, block_group, &bh2);
        if (gdp) {
                BUFFER_TRACE(bh2, "get_write_access");
                fatal = ext4_journal_get_write_access(handle, bh2);
        }
        /*
         * The bit is cleared even when fatal is already set; in that case
         * only the bitmap buffer is dirtied (at "out") and the counters
         * below are left alone.
         */
        ext4_lock_group(sb, block_group);
        cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
        if (fatal || !cleared) {
                ext4_unlock_group(sb, block_group);
                goto out;
        }

        /* Still under the group lock: update descriptor counts and checksums. */
        count = ext4_free_inodes_count(sb, gdp) + 1;
        ext4_free_inodes_set(sb, gdp, count);
        if (is_directory) {
                count = ext4_used_dirs_count(sb, gdp) - 1;
                ext4_used_dirs_set(sb, gdp, count);
                if (percpu_counter_initialized(&sbi->s_dirs_counter))
                        percpu_counter_dec(&sbi->s_dirs_counter);
        }
        ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
                                   EXT4_INODES_PER_GROUP(sb) / 8);
        ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);

        if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
                percpu_counter_inc(&sbi->s_freeinodes_counter);
        /* Mirror the change in the flex group counters when flex_bg is in use. */
        if (sbi->s_log_groups_per_flex) {
                struct flex_groups *fg;

                fg = sbi_array_rcu_deref(sbi, s_flex_groups,
                                         ext4_flex_group(sbi, block_group));
                atomic_inc(&fg->free_inodes);
                if (is_directory)
                        atomic_dec(&fg->used_dirs);
        }
        BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
        fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
        if (cleared) {
                BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                /* Keep the first error seen. */
                if (!fatal)
                        fatal = err;
        } else {
                /* The bit was not set: the bitmap disagrees with the inode. */
                ext4_error(sb, "bit already cleared for inode %lu", ino);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
        }

error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, fatal);
}

/*
 * Counts for one block group or one flex_bg, as filled in by
 * get_orlov_stats().
 */
struct orlov_stats {
        __u64 free_clusters;    /* free clusters in the group / flex_bg */
        __u32 free_inodes;      /* free inodes in the group / flex_bg */
        __u32 used_dirs;        /* directories in the group / flex_bg */
};

/*
 * Helper function for Orlov's allocator; fills @stats with the critical
 * information (free inodes, free clusters, used directories) for a
 * particular block group or flex_bg.
 *
 * If flex_size is greater than 1, g is a flex_bg number and the counts come
 * from the flex group counters; otherwise g is a block group number and the
 * counts come from its group descriptor.  If that descriptor cannot be
 * obtained, all three counts are reported as 0.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
                            int flex_size, struct orlov_stats *stats)
{
        struct ext4_group_desc *gdp;
        struct flex_groups *flex;

        if (flex_size <= 1) {
                /* Plain block group: read the counts from its descriptor. */
                gdp = ext4_get_group_desc(sb, g, NULL);
                if (!gdp) {
                        stats->free_inodes = 0;
                        stats->free_clusters = 0;
                        stats->used_dirs = 0;
                        return;
                }
                stats->free_inodes = ext4_free_inodes_count(sb, gdp);
                stats->free_clusters = ext4_free_group_clusters(sb, gdp);
                stats->used_dirs = ext4_used_dirs_count(sb, gdp);
                return;
        }

        /* Flex group: read the aggregated atomic counters. */
        flex = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g);
        stats->free_inodes = atomic_read(&flex->free_inodes);
        stats->free_clusters = atomic64_read(&flex->free_clusters);
        stats->used_dirs = atomic_read(&flex->used_dirs);
}

/*
 * Orlov's allocator for directories.
 *
 * Top-level directories (a directory being created in the root directory
 * or in a directory flagged EXT4_INODE_TOPDIR) are always spread out:
 * starting from a group derived from a hash of the new name (or from a
 * random number when no name is given), we scan every group and pick the
 * one with the smallest directory count among those whose free inode and
 * free cluster counts are not worse than average.  If no group qualifies
 * we use the fallback search described below.
 *
 * For everything else the rules are:
 *
 * It's OK to put a directory into a group unless
 * it has too many directories already (max_dirs), or
 * it has too few free inodes left (min_inodes), or
 * it has too few free clusters left (min_clusters).
 * The group in which the parent last allocated an inode is preferred (the
 * parent's own group if no such group is recorded); if it doesn't satisfy
 * these conditions we search cyclically through the rest.
 *
 * Fallback: if none of the groups look good we just look for a block group
 * with at least the average number of free inodes (starting at parent's
 * group), and as a last resort for any block group with a free inode.
 *
 * When flex_size > 1 the first two searches operate on flex groups rather
 * than on individual block groups, and the first block group with a free
 * inode inside the selected flex group is returned.
 *
 * find_group_other() also calls this function with a NULL qstr and the
 * mode of the inode being created.
 *
 * Returns 0 with *group set on success, -1 if no suitable group was found.
 */

static int find_group_orlov(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode,
                            const struct qstr *qstr)
{
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei, grp_free;
        ext4_fsblk_t freec, avefreec;
        unsigned int ndirs;
        int max_dirs, min_inodes;
        ext4_grpblk_t min_clusters;
        ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
        struct dx_hash_info hinfo;

        /*
         * With flex_bg, count and index flex groups instead of block groups:
         * ngroups becomes the number of flex groups (rounded up) and
         * parent_group the parent's flex group number.
         */
        ngroups = real_ngroups;
        if (flex_size > 1) {
                ngroups = (real_ngroups + flex_size - 1) >>
                        sbi->s_log_groups_per_flex;
                parent_group >>= sbi->s_log_groups_per_flex;
        }

        /* Filesystem-wide totals and the per-(flex-)group averages. */
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
        freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        avefreec = freec;
        do_div(avefreec, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

        if (S_ISDIR(mode) &&
            ((parent == d_inode(sb->s_root)) ||
             (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
                int best_ndir = inodes_per_group;
                int ret = -1;

                /*
                 * Choose where the scan starts: a hash of the new name when
                 * one was supplied, otherwise a random number.
                 */
                if (qstr) {
                        hinfo.hash_version = DX_HASH_HALF_MD4;
                        hinfo.seed = sbi->s_hash_seed;
                        ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
                        grp = hinfo.hash;
                } else
                        grp = prandom_u32();
                parent_group = (unsigned)grp % ngroups;
                /*
                 * Visit every group once and remember the one with the fewest
                 * directories among those that have at least average free
                 * inodes and free clusters.
                 */
                for (i = 0; i < ngroups; i++) {
                        g = (parent_group + i) % ngroups;
                        get_orlov_stats(sb, g, flex_size, &stats);
                        if (!stats.free_inodes)
                                continue;
                        if (stats.used_dirs >= best_ndir)
                                continue;
                        if (stats.free_inodes < avefreei)
                                continue;
                        if (stats.free_clusters < avefreec)
                                continue;
                        grp = g;
                        ret = 0;
                        best_ndir = stats.used_dirs;
                }
                if (ret)
                        goto fallback;
        /*
         * Also jumped to from the second search loop further down once it
         * has selected a (flex) group in grp.
         */
        found_flex_bg:
                if (flex_size == 1) {
                        *group = grp;
                        return 0;
                }

                /*
                 * We pack inodes at the beginning of the flexgroup's
                 * inode tables.  Block allocation decisions will do
                 * something similar, although regular files will
                 * start at 2nd block group of the flexgroup.  See
                 * ext4_ext_find_goal() and ext4_find_near().
                 */
                /* Convert the flex group number to its first block group. */
                grp *= flex_size;
                for (i = 0; i < flex_size; i++) {
                        if (grp+i >= real_ngroups)
                                break;
                        desc = ext4_get_group_desc(sb, grp+i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
                                *group = grp+i;
                                return 0;
                        }
                }
                goto fallback;
        }

        /*
         * Thresholds for the second search; min_inodes is clamped to at
         * least 1 and min_clusters to at least 0.
         */
        max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
        min_inodes = avefreei - inodes_per_group*flex_size / 4;
        if (min_inodes < 1)
                min_inodes = 1;
        min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
        if (min_clusters < 0)
                min_clusters = 0;

        /*
         * Start looking in the flex group where we last allocated an
         * inode for this parent directory
         */
        if (EXT4_I(parent)->i_last_alloc_group != ~0) {
                parent_group = EXT4_I(parent)->i_last_alloc_group;
                if (flex_size > 1)
                        parent_group >>= sbi->s_log_groups_per_flex;
        }

        /* Take the first group, cyclically from parent_group, that passes. */
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                get_orlov_stats(sb, grp, flex_size, &stats);
                if (stats.used_dirs >= max_dirs)
                        continue;
                if (stats.free_inodes < min_inodes)
                        continue;
                if (stats.free_clusters < min_clusters)
                        continue;
                goto found_flex_bg;
        }

fallback:
        /* From here on we work with real block groups, not flex groups. */
        ngroups = real_ngroups;
        avefreei = freei / ngroups;
fallback_retry:
        parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc) {
                        grp_free = ext4_free_inodes_count(sb, desc);
                        if (grp_free && grp_free >= avefreei) {
                                *group = grp;
                                return 0;
                        }
                }
        }

        if (avefreei) {
                /*
                 * The free-inodes counter is approximate, and for really small
                 * filesystems the above test can fail to find any blockgroups
                 */
                /* Retry once, accepting any group with a free inode. */
                avefreei = 0;
                goto fallback_retry;
        }

        return -1;
}

/*
 * Pick a block group for a new inode next to its parent directory.
 *
 * @sb:     superblock of the filesystem
 * @parent: directory the new inode is being created in
 * @group:  out parameter, receives the chosen block group number
 * @mode:   mode of the new inode; only forwarded to find_group_orlov()
 *
 * With flex_bg (flex_size > 1) the parent's flex group is tried first, then
 * the flex group recorded in the parent's i_last_alloc_group, and finally
 * find_group_orlov() is asked for another flex group.
 *
 * Without flex_bg the parent's own group is used if it has both a free
 * inode and free clusters; otherwise a quadratic probe seeded with the
 * parent's inode number looks for such a group, and as a last resort a
 * linear scan accepts any group that merely has a free inode.
 *
 * Returns 0 with *group set on success, -1 if no group could be found.
 */
static int find_group_other(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode)
{
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

        /*
         * Try to place the inode in the same flex group as its
         * parent.  If we can't find space, use the Orlov algorithm to
         * find another flex group, and store that information in the
         * parent directory's inode information so that we use that flex
         * group for future allocations.
         */
        if (flex_size > 1) {
                int retry = 0;

        try_again:
                /*
                 * Round down to the first block group of the flex group
                 * (the mask presumes flex_size is a power of two) and scan
                 * the flex group, clamped to the end of the filesystem.
                 */
                parent_group &= ~(flex_size-1);
                last = parent_group + flex_size;
                if (last > ngroups)
                        last = ngroups;
                for  (i = parent_group; i < last; i++) {
                        desc = ext4_get_group_desc(sb, i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
                                *group = i;
                                return 0;
                        }
                }
                /* Second attempt: the group we last allocated from. */
                if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
                        retry = 1;
                        parent_group = EXT4_I(parent)->i_last_alloc_group;
                        goto try_again;
                }
                /*
                 * If this didn't work, use the Orlov search algorithm
                 * to find a new flex group; we pass in the mode to
                 * avoid the topdir algorithms.
                 *
                 * Valid group numbers are 0 .. ngroups - 1, so wrap to 0
                 * as soon as the next flex group would start at ngroups.
                 * find_group_orlov() leaves *group untouched when it
                 * fails, so an out-of-range value must never be left here.
                 */
                *group = parent_group + flex_size;
                if (*group >= ngroups)
                        *group = 0;
                return find_group_orlov(sb, parent, group, mode, NULL);
        }

        /*
         * Try to place the inode in its parent directory
         */
        *group = parent_group;
        desc = ext4_get_group_desc(sb, *group, NULL);
        if (desc && ext4_free_inodes_count(sb, desc) &&
            ext4_free_group_clusters(sb, desc))
                return 0;

        /*
         * We're going to place this inode in a different blockgroup from its
         * parent.  We want to cause files in a common directory to all land in
         * the same blockgroup.  But we want files which are in a different
         * directory which shares a blockgroup with our parent to land in a
         * different blockgroup.
         *
         * So add our directory's i_ino into the starting point for the hash.
         */
        *group = (*group + parent->i_ino) % ngroups;

        /*
         * Use a quadratic hash to find a group with a free inode and some free
         * blocks.
         */
        for (i = 1; i < ngroups; i <<= 1) {
                *group += i;
                if (*group >= ngroups)
                        *group -= ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
                    ext4_free_group_clusters(sb, desc))
                        return 0;
        }

        /*
         * That failed: try linear search for a free inode, even if that group
         * has no free blocks.
         */
        *group = parent_group;
        for (i = 0; i < ngroups; i++) {
                if (++*group >= ngroups)
                        *group = 0;
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && ext4_free_inodes_count(sb, desc))
                        return 0;
        }

        return -1;
}

/*
 * In no journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 *
 * Both values are in seconds; recently_deleted() compares them against
 * the inode's deletion time.  RECENTCY_MIN is the base window and
 * RECENTCY_DIRTY is added to it while the inode table buffer is still
 * dirty.
 */
#define RECENTCY_MIN        60
#define RECENTCY_DIRTY        300

/*
 * Report whether inode slot @ino (zero-based within block group @group) holds
 * an inode that was deleted so recently that it should not be reused yet.
 *
 * Returns 1 if the cached inode table block shows a deletion time that lies
 * in the past but within the recency window, 0 in every other case
 * (including when the group descriptor or the cached block is unavailable).
 */
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
        int per_block = EXT4_SB(sb)->s_inodes_per_block;
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
        struct ext4_inode *disk_inode;
        struct buffer_head *itable_bh;
        u32 deleted_at, cur;
        int offset, window, verdict;

        if (unlikely(!gdp))
                return 0;

        /*
         * With group descriptor checksums we can tell that the slot was
         * never used in this filesystem: either the whole inode table is
         * uninitialized or the slot lies in its unused tail.
         */
        if (ext4_has_group_desc_csum(sb)) {
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))
                        return 0;
                if (ino >= EXT4_INODES_PER_GROUP(sb) -
                           ext4_itable_unused_count(sb, gdp))
                        return 0;
        }

        itable_bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
                                      (ino / per_block));
        if (!itable_bh || !buffer_uptodate(itable_bh)) {
                /*
                 * If the block is not in the buffer cache, then it
                 * must have been written out.
                 */
                brelse(itable_bh);
                return 0;
        }

        offset = (ino % per_block) * EXT4_INODE_SIZE(sb);
        disk_inode = (struct ext4_inode *) (itable_bh->b_data + offset);

        /*
         * i_dtime is only 32 bits on disk, but only relative times in the
         * range of a few minutes matter here (long enough to sync a
         * recently-deleted inode to disk), so comparing the low 32 bits of
         * the clock (a 68 year range) is enough, see time_before32().
         */
        deleted_at = le32_to_cpu(disk_inode->i_dtime);
        cur = ktime_get_real_seconds();

        /* Wait longer while the inode table block is still dirty. */
        window = RECENTCY_MIN;
        if (buffer_dirty(itable_bh))
                window += RECENTCY_DIRTY;

        verdict = deleted_at && time_before32(deleted_at, cur) &&
                  time_before32(cur, deleted_at + window);

        brelse(itable_bh);
        return verdict;
}

/*
 * Search the inode bitmap of @group for a clear bit, starting at *ino.
 *
 * When the filesystem has no journal, bits belonging to recently deleted
 * inodes are skipped as long as another clear bit can be found; the last
 * such bit seen is kept as a fallback.
 *
 * Returns 1 with *ino set to the chosen zero-based bit, or 0 if the bitmap
 * has no clear bit at or after the starting position.
 */
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
                          struct buffer_head *bitmap, unsigned long *ino)
{
        bool no_journal = EXT4_SB(sb)->s_journal == NULL;
        /* EXT4_INODES_PER_GROUP(sb) doubles as the "no fallback" marker. */
        unsigned long fallback_ino = EXT4_INODES_PER_GROUP(sb);

        for (;;) {
                *ino = ext4_find_next_zero_bit((unsigned long *)
                                               bitmap->b_data,
                                               EXT4_INODES_PER_GROUP(sb), *ino);
                if (*ino >= EXT4_INODES_PER_GROUP(sb))
                        break;

                if (!no_journal || !recently_deleted(sb, group, *ino))
                        return 1;

                /* Remember this bit and keep looking for a better one. */
                fallback_ino = *ino;
                *ino = *ino + 1;
                if (*ino >= EXT4_INODES_PER_GROUP(sb))
                        break;
        }

        if (fallback_ino >= EXT4_INODES_PER_GROUP(sb))
                return 0;

        /*
         * Not reusing recently deleted inodes is mostly a preference. We don't
         * want to report ENOSPC or skew allocation patterns because of that.
         * So return even a recently deleted inode if we couldn't find a
         * better one in the given range.
         */
        *ino = fallback_ino;
        return 1;
}

/*
 * Mark inode number @ino as in use in the on-disk allocation metadata.
 *
 * Sets the inode's bit in its group's inode bitmap, initializes the block
 * bitmap if the group is still flagged EXT4_BG_BLOCK_UNINIT, and updates the
 * group descriptor (unused inode table count, free inode count, checksums).
 * Every ext4_handle_dirty_metadata() call is made with a NULL handle and each
 * modified buffer is written out with sync_dirty_buffer().
 *
 * Returns 0 on success or if the bit was already set, -EFSCORRUPTED if @ino
 * is outside the range of allocatable inodes, -EINVAL if the group descriptor
 * cannot be obtained, or the error from reading, dirtying or synchronously
 * writing one of the metadata buffers.
 */
int ext4_mark_inode_used(struct super_block *sb, int ino)
{
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
        struct ext4_group_desc *gdp;
        ext4_group_t group;
        int bit;
        int err, sync_err;

        if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
                return -EFSCORRUPTED;

        /* Inode numbers are 1-based; bitmap bits are 0-based. */
        group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
        if (IS_ERR(inode_bitmap_bh))
                return PTR_ERR(inode_bitmap_bh);

        /* Already marked in use: nothing to do. */
        if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
                err = 0;
                goto out;
        }

        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
        if (!gdp || !group_desc_bh) {
                err = -EINVAL;
                goto out;
        }

        ext4_set_bit(bit, inode_bitmap_bh->b_data);

        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }
        err = sync_dirty_buffer(inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;

                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(block_bitmap_bh)) {
                        err = PTR_ERR(block_bitmap_bh);
                        goto out;
                }

                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
                /*
                 * Do not lose a failed write of the block bitmap; the first
                 * error encountered is the one reported below.
                 */
                sync_err = sync_dirty_buffer(block_bitmap_bh);
                if (!err)
                        err = sync_err;

                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
                if (ext4_has_group_desc_csum(sb) &&
                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
                        ext4_block_bitmap_csum_set(sb, group, gdp,
                                                   block_bitmap_bh);
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
                brelse(block_bitmap_bh);

                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
        }

        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
                int free;

                ext4_lock_group(sb, group); /* while we modify the bg desc */
                free = EXT4_INODES_PER_GROUP(sb) -
                        ext4_itable_unused_count(sb, gdp);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
                        free = 0;
                }

                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
                 * we need to update the bg_itable_unused count
                 */
                if (bit >= free)
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - bit - 1));
        } else {
                ext4_lock_group(sb, group);
        }

        /* The group lock is held on both paths from here to the unlock. */
        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (ext4_has_group_desc_csum(sb)) {
                ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                ext4_group_desc_csum_set(sb, group, gdp);
        }

        ext4_unlock_group(sb, group);
        err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
        /* Report a failed group descriptor write instead of dropping it. */
        sync_err = sync_dirty_buffer(group_desc_bh);
        if (!err)
                err = sync_err;
out:
        brelse(inode_bitmap_bh);
        return err;
}

/*
 * Estimate the extra journal credits needed for the extended attributes
 * that may be set on an inode newly created in @dir.
 *
 * @dir:     directory the inode is created in
 * @mode:    mode of the new inode
 * @encrypt: whether an encryption context xattr will be set as well
 *
 * Returns the number of credits, or a negative errno if the default ACL of
 * @dir cannot be read.
 */
static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
                                            bool encrypt)
{
        struct super_block *sb = dir->i_sb;
        int credits = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        struct posix_acl *default_acl = get_acl(dir, ACL_TYPE_DEFAULT);

        if (IS_ERR(default_acl))
                return PTR_ERR(default_acl);
        if (default_acl) {
                int acl_bytes = default_acl->a_count * sizeof(ext4_acl_entry);
                int per_acl = __ext4_xattr_set_credits(sb, NULL /* inode */,
                                                       NULL /* block_bh */,
                                                       acl_bytes,
                                                       true /* is_create */);

                /* Directories are charged for two ACL xattrs, others one. */
                if (S_ISDIR(mode))
                        credits += 2 * per_acl;
                else
                        credits += per_acl;
                posix_acl_release(default_acl);
        }
#endif

#ifdef CONFIG_SECURITY
        {
                /* One security xattr, plus one more with CONFIG_INTEGRITY. */
#ifdef CONFIG_INTEGRITY
                int nr_security_xattrs = 2;
#else
                int nr_security_xattrs = 1;
#endif
                /*
                 * We assume that security xattrs are never more than 1k.
                 * In practice they are under 128 bytes.
                 */
                credits += nr_security_xattrs *
                        __ext4_xattr_set_credits(sb, NULL /* inode */,
                                                 NULL /* block_bh */, 1024,
                                                 true /* is_create */);
        }
#endif
        if (!encrypt)
                return credits;

        /* Room for the encryption context xattr. */
        return credits + __ext4_xattr_set_credits(sb,
                                                  NULL /* inode */,
                                                  NULL /* block_bh */,
                                                  FSCRYPT_SET_CONTEXT_MAX_SIZE,
                                                  true /* is_create */);
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
                               umode_t mode, const struct qstr *qstr,
                               __u32 goal, uid_t *owner, __u32 i_flags,
                               int handle_type, unsigned int line_no,
                               int nblocks)
{
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
        struct buffer_head *group_desc_bh;
        ext4_group_t ngroups, group = 0;
        unsigned long ino = 0;
        struct inode *inode;
        struct ext4_group_desc *gdp = NULL;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
        int ret2, err;
        struct inode *ret;
        ext4_group_t i;
        ext4_group_t flex_group;
        struct ext4_group_info *grp = NULL;
        bool encrypt = false;

        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
                return ERR_PTR(-EPERM);

        sb = dir->i_sb;
        sbi = EXT4_SB(sb);

        if (unlikely(ext4_forced_shutdown(sbi)))
                return ERR_PTR(-EIO);

        ngroups = ext4_get_groups_count(sb);
        trace_ext4_request_inode(dir, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);

        /*
         * Initialize owners and quota early so that we don't have to account
         * for quota initialization worst case in standard inode creating
         * transaction
         */
        if (owner) {
                inode->i_mode = mode;
                i_uid_write(inode, owner[0]);
                i_gid_write(inode, owner[1]);
        } else if (test_opt(sb, GRPID)) {
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = dir->i_gid;
        } else
                inode_init_owner(inode, dir, mode);

        if (ext4_has_feature_project(sb) &&
            ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
                ei->i_projid = EXT4_I(dir)->i_projid;
        else
                ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);

        if (!(i_flags & EXT4_EA_INODE_FL)) {
                err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
                if (err)
                        goto out;
        }

        err = dquot_initialize(inode);
        if (err)
                goto out;

        if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
                ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
                if (ret2 < 0) {
                        err = ret2;
                        goto out;
                }
                nblocks += ret2;
        }

        if (!goal)
                goal = sbi->s_inode_goal;

        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
                ret2 = 0;
                goto got_group;
        }

        if (S_ISDIR(mode))
                ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
        else
                ret2 = find_group_other(sb, dir, &group, mode);

got_group:
        EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;

        /*
         * Normally we will only go through one pass of this loop,
         * unless we get unlucky and it turns out the group we selected
         * had its last inode grabbed by someone else.
         */
        for (i = 0; i < ngroups; i++, ino = 0) {
                err = -EIO;

                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
                        goto out;

                /*
                 * Check free inodes count before loading bitmap.
                 */
                if (ext4_free_inodes_count(sb, gdp) == 0)
                        goto next_group;

                if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                        grp = ext4_get_group_info(sb, group);
                        /*
                         * Skip groups with already-known suspicious inode
                         * tables
                         */
                        if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                                goto next_group;
                }

                brelse(inode_bitmap_bh);
                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                /* Skip groups with suspicious inode tables */
                if (IS_ERR(inode_bitmap_bh)) {
                        inode_bitmap_bh = NULL;
                        goto next_group;
                }
                if (!(sbi->s_mount_state & EXT4_FC_REPLAY) &&
                    EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                        goto next_group;

repeat_in_this_group:
                ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
                if (!ret2)
                        goto next_group;

                if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
                        ext4_error(sb, "reserved inode found cleared - "
                                   "inode=%lu", ino + 1);
                        ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                        goto next_group;
                }

                if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
                        BUG_ON(nblocks <= 0);
                        handle = __ext4_journal_start_sb(dir->i_sb, line_no,
                                 handle_type, nblocks, 0,
                                 ext4_trans_default_revoke_credits(sb));
                        if (IS_ERR(handle)) {
                                err = PTR_ERR(handle);
                                ext4_std_error(sb, err);
                                goto out;
                        }
                }
                BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
                ext4_lock_group(sb, group);
                ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
                if (ret2) {
                        /* Someone already took the bit. Repeat the search
                         * with lock held.
                         */
                        ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
                        if (ret2) {
                                ext4_set_bit(ino, inode_bitmap_bh->b_data);
                                ret2 = 0;
                        } else {
                                ret2 = 1; /* we didn't grab the inode */
                        }
                }
                ext4_unlock_group(sb, group);
                ino++;                /* the inode bitmap is zero-based */
                if (!ret2)
                        goto got; /* we grabbed the inode! */

                if (ino < EXT4_INODES_PER_GROUP(sb))
                        goto repeat_in_this_group;
next_group:
                if (++group == ngroups)
                        group = 0;
        }
        err = -ENOSPC;
        goto out;

got:
        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        BUFFER_TRACE(group_desc_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, group_desc_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;

                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(block_bitmap_bh)) {
                        err = PTR_ERR(block_bitmap_bh);
                        goto out;
                }
                BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
                err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                if (err) {
                        brelse(block_bitmap_bh);
                        ext4_std_error(sb, err);
                        goto out;
                }

                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);

                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
                if (ext4_has_group_desc_csum(sb) &&
                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
                        ext4_block_bitmap_csum_set(sb, group, gdp,
                                                   block_bitmap_bh);
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
                brelse(block_bitmap_bh);

                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
        }

        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
                int free;
                struct ext4_group_info *grp = NULL;

                if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                        grp = ext4_get_group_info(sb, group);
                        if (!grp) {
                                err = -EFSCORRUPTED;
                                goto out;
                        }
                        down_read(&grp->alloc_sem); /*
                                                     * protect vs itable
                                                     * lazyinit
                                                     */
                }
                ext4_lock_group(sb, group); /* while we modify the bg desc */
                free = EXT4_INODES_PER_GROUP(sb) -
                        ext4_itable_unused_count(sb, gdp);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
                        free = 0;
                }
                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
                 * we need to update the bg_itable_unused count
                 */
                if (ino > free)
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - ino));
                if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
                        up_read(&grp->alloc_sem);
        } else {
                ext4_lock_group(sb, group);
        }

        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (S_ISDIR(mode)) {
                ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
                if (sbi->s_log_groups_per_flex) {
                        ext4_group_t f = ext4_flex_group(sbi, group);

                        atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
                                                        f)->used_dirs);
                }
        }
        if (ext4_has_group_desc_csum(sb)) {
                ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                ext4_group_desc_csum_set(sb, group, gdp);
        }
        ext4_unlock_group(sb, group);

        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
                percpu_counter_inc(&sbi->s_dirs_counter);

        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
                atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
                                                flex_group)->free_inodes);
        }

        inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
        /* This is the optimal IO size (for stat), not the fs block size */
        inode->i_blocks = 0;
        inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
        ei->i_crtime = inode->i_mtime;

        memset(ei->i_data, 0, sizeof(ei->i_data));
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;

        /* Don't inherit extent flag from directory, amongst others. */
        ei->i_flags =
                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
        ei->i_flags |= i_flags;
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
        ei->i_last_alloc_group = ~0;

        ext4_set_inode_flags(inode, true);
        if (IS_DIRSYNC(inode))
                ext4_handle_sync(handle);
        if (insert_inode_locked(inode) < 0) {
                /*
                 * Likely a bitmap corruption causing inode to be allocated
                 * twice.
                 */
                err = -EIO;
                ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
                           inode->i_ino);
                ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                goto out;
        }
        inode->i_generation = prandom_u32();

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_metadata_csum(sb)) {
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = cpu_to_le32(inode->i_generation);
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);

        ei->i_extra_isize = sbi->s_want_extra_isize;
        ei->i_inline_off = 0;
        if (ext4_has_feature_inline_data(sb) &&
            (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        ret = inode;
        err = dquot_alloc_inode(inode);
        if (err)
                goto fail_drop;

        /*
         * Since the encryption xattr will always be unique, create it first so
         * that it's less likely to end up in an external xattr block and
         * prevent its deduplication.
         */
        if (encrypt) {
                err = fscrypt_set_context(inode, handle);
                if (err)
                        goto fail_free_drop;
        }

        if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
                err = ext4_init_acl(handle, inode, dir);
                if (err)
                        goto fail_free_drop;

                err = ext4_init_security(handle, inode, dir, qstr);
                if (err)
                        goto fail_free_drop;
        }

        if (ext4_has_feature_extents(sb)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                        ext4_ext_tree_init(handle, inode);
                }
        }

        if (ext4_handle_valid(handle)) {
                ei->i_sync_tid = handle->h_transaction->t_tid;
                ei->i_datasync_tid = handle->h_transaction->t_tid;
        }

        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_std_error(sb, err);
                goto fail_free_drop;
        }

        ext4_debug("allocating inode %lu\n", inode->i_ino);
        trace_ext4_allocate_inode(inode, dir, mode);
        brelse(inode_bitmap_bh);
        return ret;

fail_free_drop:
        dquot_free_inode(inode);
fail_drop:
        clear_nlink(inode);
        unlock_new_inode(inode);
out:
        dquot_drop(inode);
        inode->i_flags |= S_NOQUOTA;
        iput(inode);
        brelse(inode_bitmap_bh);
        return ERR_PTR(err);
}

/*
 * ext4_orphan_get - fetch an inode named on the on-disk orphan list after
 * checking that it can really be a valid orphan.
 *
 * @sb:  filesystem being processed
 * @ino: inode number taken from the orphan list
 *
 * Returns the inode on success.  On failure an ERR_PTR() is returned: the
 * error from ext4_read_inode_bitmap() or ext4_iget() when one of those
 * fails, or -EFSCORRUPTED when a sanity check rejects the orphan (in which
 * case the details are logged as well).
 */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
        const unsigned long ino_limit =
                le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        struct buffer_head *ibitmap = NULL;
        struct inode *orphan = NULL;
        ext4_group_t grp_nr;
        int rc = -EFSCORRUPTED;
        int bitpos;

        /* The number must lie within [first usable inode, inode count]. */
        if (!(ino >= EXT4_FIRST_INO(sb) && ino <= ino_limit))
                goto bad_orphan;

        grp_nr = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bitpos = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        ibitmap = ext4_read_inode_bitmap(sb, grp_nr);
        if (IS_ERR(ibitmap))
                return ERR_CAST(ibitmap);

        /*
         * A set bit in the inode bitmap is taken as the indicator that
         * this is a valid orphan (assuming no e2fsck has run on the fs).
         * i_nlink == 0 cannot be required because inodes that were in the
         * middle of a truncate are on the orphan list too.
         */
        if (!ext4_test_bit(bitpos, ibitmap->b_data))
                goto bad_orphan;

        orphan = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(orphan)) {
                rc = PTR_ERR(orphan);
                ext4_error_err(sb, -rc,
                               "couldn't read orphan inode %lu (err %d)",
                               ino, rc);
                brelse(ibitmap);
                return orphan;
        }

        /*
         * An orphan that still has links must be truncatable, otherwise
         * orphan processing would never take it off the list and would
         * loop forever.  It must not be a bad inode either, and its link
         * to the next orphan has to stay within the inode count.
         */
        if ((orphan->i_nlink && !ext4_can_truncate(orphan)) ||
            is_bad_inode(orphan) ||
            NEXT_ORPHAN(orphan) > ino_limit)
                goto bad_orphan;

        brelse(ibitmap);
        return orphan;

bad_orphan:
        ext4_error(sb, "bad orphan inode %lu", ino);
        /* bitpos is only meaningful once the bitmap has been read. */
        if (ibitmap)
                printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
                       bitpos, (unsigned long long)ibitmap->b_blocknr,
                       ext4_test_bit(bitpos, ibitmap->b_data));
        if (orphan) {
                printk(KERN_ERR "is_bad_inode(inode)=%d\n",
                       is_bad_inode(orphan));
                printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(orphan));
                printk(KERN_ERR "max_ino=%lu\n", ino_limit);
                printk(KERN_ERR "i_nlink=%u\n", orphan->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (orphan->i_nlink == 0)
                        orphan->i_blocks = 0;
                iput(orphan);
        }
        brelse(ibitmap);
        return ERR_PTR(rc);
}

/*
 * ext4_count_free_inodes - add up the per-group free inode counts.
 *
 * Walks every block group, skipping groups whose descriptor cannot be
 * obtained, and returns the sum of the free-inode counts stored in the
 * group descriptors.
 *
 * When EXT4FS_DEBUG is defined the inode bitmaps are read as well and the
 * stored versus bitmap-derived counts are logged per group and in total;
 * the return value is still the descriptor-based sum.
 */
unsigned long ext4_count_free_inodes(struct super_block *sb)
{
        const ext4_group_t nr_groups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        unsigned long total_free = 0;
        ext4_group_t grp;
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *ibitmap = NULL;
        unsigned long bitmap_total = 0;
        unsigned long counted;

        for (grp = 0; grp < nr_groups; grp++) {
                desc = ext4_get_group_desc(sb, grp, NULL);
                if (!desc)
                        continue;
                total_free += ext4_free_inodes_count(sb, desc);

                /* Release the previous group's bitmap before the next read. */
                brelse(ibitmap);
                ibitmap = ext4_read_inode_bitmap(sb, grp);
                if (IS_ERR(ibitmap)) {
                        /* Nothing valid to release on the next pass. */
                        ibitmap = NULL;
                        continue;
                }

                counted = ext4_count_free(ibitmap->b_data,
                                          EXT4_INODES_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
                       (unsigned long) grp,
                       ext4_free_inodes_count(sb, desc), counted);
                bitmap_total += counted;
        }
        brelse(ibitmap);
        printk(KERN_DEBUG "ext4_count_free_inodes: "
               "stored = %u, computed = %lu, %lu\n",
               le32_to_cpu(es->s_free_inodes_count), total_free, bitmap_total);
#else
        for (grp = 0; grp < nr_groups; grp++) {
                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc) {
                        total_free += ext4_free_inodes_count(sb, desc);
                        cond_resched();
                }
        }
#endif
        return total_free;
}

/*
 * ext4_count_dirs - total the used-directory counts of all block groups.
 *
 * Groups whose descriptor cannot be obtained contribute nothing.
 * Called at mount-time, super-block is locked.
 */
unsigned long ext4_count_dirs(struct super_block * sb)
{
        const ext4_group_t nr_groups = ext4_get_groups_count(sb);
        unsigned long ndirs = 0;
        ext4_group_t grp;

        for (grp = 0; grp < nr_groups; grp++) {
                struct ext4_group_desc *desc;

                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc)
                        ndirs += ext4_used_dirs_count(sb, desc);
        }

        return ndirs;
}

/*
 * ext4_init_inode_table - zero the not-yet-zeroed part of a group's inode
 * table and flag the group EXT4_BG_INODE_ZEROED.
 *
 * @sb:      filesystem
 * @group:   block group whose inode table is to be zeroed
 * @barrier: when non-zero, flush the block device after the zeroout
 *
 * Just writes zeroes through the unused part of the inode table.  Must be
 * called without any spinlock held.  The only place where it is called
 * from on an active part of the filesystem is the ext4lazyinit thread, so
 * no special locks are needed; however inode allocation from this group
 * has to be prevented, so alloc_sem is taken for writing to block
 * ext4_new_inode() until we are finished.
 *
 * Returns 0 on success or when there is nothing to do (descriptor or
 * group info unavailable, table already zeroed), 1 when the filesystem is
 * read-only or the descriptor's unused-inode count looks bogus, and a
 * negative errno when starting the journal handle, getting write access,
 * issuing the zeroout or dirtying the descriptor fails.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
                                 int barrier)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = NULL;
        struct buffer_head *group_desc_bh;
        handle_t *handle;
        ext4_fsblk_t blk;
        int num, ret = 0, used_blks = 0;
        unsigned long used_inos = 0;

        /* This should not happen, but just to be sure check this */
        if (sb_rdonly(sb)) {
                ret = 1;
                goto out;
        }

        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
        if (!gdp || !grp)
                goto out;

        /*
         * We do not need to lock this, because we are the only one
         * handling this flag.
         */
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        down_write(&grp->alloc_sem);
        /*
         * If inode bitmap was already initialized there may be some
         * used inodes so we need to skip blocks with used inodes in
         * inode table.
         */
        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
                used_inos = EXT4_INODES_PER_GROUP(sb) -
                            ext4_itable_unused_count(sb, gdp);
                used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);

                /* Bogus inode unused count? */
                if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
                        ext4_error(sb, "Something is wrong with group %u: "
                                   "used itable blocks: %d; "
                                   "itable unused count: %u",
                                   group, used_blks,
                                   ext4_itable_unused_count(sb, gdp));
                        ret = 1;
                        goto err_out;
                }

                /* Turn the group-relative count into an absolute one. */
                used_inos += group * EXT4_INODES_PER_GROUP(sb);
                /*
                 * Are there some uninitialized inodes in the inode table
                 * before the first normal inode?
                 */
                if ((used_blks != sbi->s_itb_per_group) &&
                     (used_inos < EXT4_FIRST_INO(sb))) {
                        /* used_inos is unsigned long, hence %lu. */
                        ext4_error(sb, "Something is wrong with group %u: "
                                   "itable unused count: %u; "
                                   "itables initialized count: %lu",
                                   group, ext4_itable_unused_count(sb, gdp),
                                   used_inos);
                        ret = 1;
                        goto err_out;
                }
        }

        /* First block past the used part of the table, and how many remain. */
        blk = ext4_inode_table(sb, gdp) + used_blks;
        num = sbi->s_itb_per_group - used_blks;

        BUFFER_TRACE(group_desc_bh, "get_write_access");
        ret = ext4_journal_get_write_access(handle,
                                            group_desc_bh);
        if (ret)
                goto err_out;

        /*
         * Skip zeroout if the inode table is full. But we set the ZEROED
         * flag anyway, because obviously, when it is full it does not need
         * further zeroing.
         */
        if (unlikely(num == 0))
                goto skip_zeroout;

        ext4_debug("going to zero out inode table in group %u\n",
                   group);
        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
        if (ret < 0)
                goto err_out;
        if (barrier)
                blkdev_issue_flush(sb->s_bdev, GFP_NOFS);

skip_zeroout:
        /* Flag and checksum are updated together under the group lock. */
        ext4_lock_group(sb, group);
        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
        ext4_group_desc_csum_set(sb, group, gdp);
        ext4_unlock_group(sb, group);

        BUFFER_TRACE(group_desc_bh,
                     "call ext4_handle_dirty_metadata");
        ret = ext4_handle_dirty_metadata(handle, NULL,
                                         group_desc_bh);

err_out:
        up_write(&grp->alloc_sem);
        ext4_journal_stop(handle);
out:
        return ret;
}



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  pm_wakeup.h - Power management wakeup interface
 *
 *  Copyright (C) 2008 Alan Stern
 *  Copyright (C) 2010 Rafael J. Wysocki, Novell Inc.
 */

#ifndef _LINUX_PM_WAKEUP_H
#define _LINUX_PM_WAKEUP_H

#ifndef _DEVICE_H_
# error "please don't include this file directly"
#endif

#include <linux/types.h>

struct wake_irq;

/**
 * struct wakeup_source - Representation of wakeup sources
 *
 * @name: Name of the wakeup source
 * @id: Wakeup source id
 * @entry: Wakeup source list entry
 * @lock: Wakeup source lock
 * @wakeirq: Optional device specific wakeirq
 * @timer: Wakeup timer list
 * @timer_expires: Wakeup timer expiration
 * @total_time: Total time this wakeup source has been active.
 * @max_time: Maximum time this wakeup source has been continuously active.
 * @last_time: Monotonic clock when the wakeup source was last touched.
 * @start_prevent_time: NOTE(review): previously undocumented; presumably the
 *        time at which this source started preventing autosleep - confirm
 *        against the users of this field.
 * @prevent_sleep_time: Total time this source has been preventing autosleep.
 * @event_count: Number of signaled wakeup events.
 * @active_count: Number of times the wakeup source was activated.
 * @relax_count: Number of times the wakeup source was deactivated.
 * @expire_count: Number of times the wakeup source's timeout has expired.
 * @wakeup_count: Number of times the wakeup source might abort suspend.
 * @dev: Struct device for sysfs statistics about the wakeup source.
 * @active: Status of the wakeup source.
 * @autosleep_enabled: Autosleep is active, so update @prevent_sleep_time.
 */
struct wakeup_source {
        const char                 *name;
        int                        id;
        struct list_head        entry;
        spinlock_t                lock;
        struct wake_irq                *wakeirq;
        struct timer_list        timer;
        unsigned long                timer_expires;
        ktime_t total_time;
        ktime_t max_time;
        ktime_t last_time;
        ktime_t start_prevent_time;
        ktime_t prevent_sleep_time;
        unsigned long                event_count;
        unsigned long                active_count;
        unsigned long                relax_count;
        unsigned long                expire_count;
        unsigned long                wakeup_count;
        struct device                *dev;
        bool                        active:1;
        bool                        autosleep_enabled:1;
};

/*
 * for_each_wakeup_source - iterate over wakeup sources.
 * @ws: struct wakeup_source pointer used as the loop cursor.
 *
 * Starts at wakeup_sources_walk_start() and advances with
 * wakeup_sources_walk_next() until that returns NULL.  Both helpers are
 * only declared in this header when CONFIG_PM_SLEEP is set.
 * NOTE(review): presumably the walk must run between
 * wakeup_sources_read_lock() and wakeup_sources_read_unlock() - confirm
 * against the implementation of the walk helpers.
 */
#define for_each_wakeup_source(ws) \
        for ((ws) = wakeup_sources_walk_start();        \
             (ws);                                        \
             (ws) = wakeup_sources_walk_next((ws)))

#ifdef CONFIG_PM_SLEEP

/*
 * Changes to device_may_wakeup take effect on the next pm state change.
 */

/* Report whether @dev is flagged as wakeup-capable (power.can_wakeup). */
static inline bool device_can_wakeup(struct device *dev)
{
        return dev->power.can_wakeup;
}

/*
 * Report whether @dev is both wakeup-capable (power.can_wakeup) and has
 * its power.wakeup member set.
 */
static inline bool device_may_wakeup(struct device *dev)
{
        if (!dev->power.can_wakeup)
                return false;

        return !!dev->power.wakeup;
}

/* Set the power.wakeup_path flag of @dev. */
static inline void device_set_wakeup_path(struct device *dev)
{
        dev->power.wakeup_path = true;
}

/*
 * drivers/base/power/wakeup.c
 *
 * Out-of-line interface, declared only when CONFIG_PM_SLEEP is set.  The
 * #else branch of this header provides inline fallbacks for most of these
 * names; the wakeup_sources_read_lock()/unlock() and
 * wakeup_sources_walk_*() helpers have no fallback here.
 */
extern struct wakeup_source *wakeup_source_create(const char *name);
extern void wakeup_source_destroy(struct wakeup_source *ws);
extern void wakeup_source_add(struct wakeup_source *ws);
extern void wakeup_source_remove(struct wakeup_source *ws);
extern struct wakeup_source *wakeup_source_register(struct device *dev,
                                                    const char *name);
extern void wakeup_source_unregister(struct wakeup_source *ws);
/* List walking; used by for_each_wakeup_source(). */
extern int wakeup_sources_read_lock(void);
extern void wakeup_sources_read_unlock(int idx);
extern struct wakeup_source *wakeup_sources_walk_start(void);
extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws);
/* Per-device wakeup configuration. */
extern int device_wakeup_enable(struct device *dev);
extern int device_wakeup_disable(struct device *dev);
extern void device_set_wakeup_capable(struct device *dev, bool capable);
extern int device_init_wakeup(struct device *dev, bool val);
extern int device_set_wakeup_enable(struct device *dev, bool enable);
/* Wakeup event reporting, by wakeup source (ws) or by device (dev). */
extern void __pm_stay_awake(struct wakeup_source *ws);
extern void pm_stay_awake(struct device *dev);
extern void __pm_relax(struct wakeup_source *ws);
extern void pm_relax(struct device *dev);
extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard);
extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard);

#else /* !CONFIG_PM_SLEEP */

/*
 * !CONFIG_PM_SLEEP fallback: just record @capable in the device's
 * power.can_wakeup flag.
 */
static inline void device_set_wakeup_capable(struct device *dev, bool capable)
{
        dev->power.can_wakeup = capable;
}

/* !CONFIG_PM_SLEEP fallback: report the device's power.can_wakeup flag. */
static inline bool device_can_wakeup(struct device *dev)
{
        return dev->power.can_wakeup;
}

/*
 * !CONFIG_PM_SLEEP fallback: no wakeup source is created; always returns
 * NULL and ignores @name.
 */
static inline struct wakeup_source *wakeup_source_create(const char *name)
{
        return NULL;
}

/*
 * !CONFIG_PM_SLEEP fallbacks: destroying, adding and removing a wakeup
 * source all compile to empty functions.
 */
static inline void wakeup_source_destroy(struct wakeup_source *ws) {}

static inline void wakeup_source_add(struct wakeup_source *ws) {}

static inline void wakeup_source_remove(struct wakeup_source *ws) {}

/*
 * !CONFIG_PM_SLEEP fallback: nothing is registered; always returns NULL
 * and ignores both arguments.
 */
static inline struct wakeup_source *wakeup_source_register(struct device *dev,
                                                           const char *name)
{
        return NULL;
}

/* !CONFIG_PM_SLEEP fallback: unregistering is a no-op. */
static inline void wakeup_source_unregister(struct wakeup_source *ws) {}

/*
 * !CONFIG_PM_SLEEP fallback: set the device's power.should_wakeup flag.
 * Always returns 0.
 */
static inline int device_wakeup_enable(struct device *dev)
{
        dev->power.should_wakeup = true;
        return 0;
}

/*
 * !CONFIG_PM_SLEEP fallback: clear the device's power.should_wakeup flag.
 * Always returns 0.
 */
static inline int device_wakeup_disable(struct device *dev)
{
        dev->power.should_wakeup = false;
        return 0;
}

/*
 * !CONFIG_PM_SLEEP fallback: store @enable in the device's
 * power.should_wakeup flag.  Always returns 0.
 */
static inline int device_set_wakeup_enable(struct device *dev, bool enable)
{
        dev->power.should_wakeup = enable;
        return 0;
}

/*
 * !CONFIG_PM_SLEEP fallback: apply @val as both the wakeup-capable and the
 * wakeup-enabled setting of @dev.  Always returns 0; the return value of
 * device_set_wakeup_enable() is not propagated.
 */
static inline int device_init_wakeup(struct device *dev, bool val)
{
        device_set_wakeup_capable(dev, val);
        device_set_wakeup_enable(dev, val);
        return 0;
}

/*
 * !CONFIG_PM_SLEEP fallback: report whether @dev is both wakeup-capable
 * (power.can_wakeup) and has power.should_wakeup set.
 */
static inline bool device_may_wakeup(struct device *dev)
{
        if (!dev->power.can_wakeup)
                return false;

        return dev->power.should_wakeup;
}

/*
 * !CONFIG_PM_SLEEP fallbacks: marking the wakeup path and reporting
 * stay-awake, relax and wakeup events all compile to empty functions, so
 * callers need no #ifdef of their own.
 */
static inline void device_set_wakeup_path(struct device *dev) {}

static inline void __pm_stay_awake(struct wakeup_source *ws) {}

static inline void pm_stay_awake(struct device *dev) {}

static inline void __pm_relax(struct wakeup_source *ws) {}

static inline void pm_relax(struct device *dev) {}

static inline void pm_wakeup_ws_event(struct wakeup_source *ws,
                                      unsigned int msec, bool hard) {}

static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec,
                                       bool hard) {}

#endif /* !CONFIG_PM_SLEEP */

/*
 * Report a wakeup event on wakeup source @ws: forwards @ws and @msec to
 * pm_wakeup_ws_event() with the "hard" argument set to false.
 */
static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec)
{
        /* Plain call: a void function must not return an expression. */
        pm_wakeup_ws_event(ws, msec, false);
}

/*
 * Report a wakeup event on device @dev: forwards @dev and @msec to
 * pm_wakeup_dev_event() with the "hard" argument set to false.
 */
static inline void pm_wakeup_event(struct device *dev, unsigned int msec)
{
        /* Plain call: a void function must not return an expression. */
        pm_wakeup_dev_event(dev, msec, false);
}

/*
 * Report a "hard" wakeup event on device @dev: calls
 * pm_wakeup_dev_event() with msec 0 and the "hard" argument set to true.
 */
static inline void pm_wakeup_hard_event(struct device *dev)
{
        /* Plain call: a void function must not return an expression. */
        pm_wakeup_dev_event(dev, 0, true);
}

#endif /* _LINUX_PM_WAKEUP_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
/* auditsc.c -- System-call auditing support
 * Handles all system-call specific auditing features.
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * Copyright 2005 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2005, 2006 IBM Corporation
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 *
 * Many of the ideas implemented here are from Stephen C. Tweedie,
 * especially the idea of avoiding a copy by using getname.
 *
 * The method for actual interception of syscall entry and exit (not in
 * this file -- see entry.S) is based on a GPL'd patch written by
 * okir@suse.de and Copyright 2003 SuSE Linux AG.
 *
 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
 * 2006.
 *
 * The support of additional filter rules compares (>, <, >=, <=) was
 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
 *
 * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
 * filesystem information.
 *
 * Subject and object context labeling support added by <danjones@us.ibm.com>
 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/socket.h>
#include <linux/mqueue.h>
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
#include <linux/netlink.h>
#include <linux/compiler.h>
#include <asm/unistd.h>
#include <linux/security.h>
#include <linux/list.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <asm/syscall.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>

#include "audit.h"

/* flags stating the success for a syscall */
#define AUDITSC_INVALID 0
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2

/* no execve audit message should be longer than this (userspace limits),
 * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500

/* max length to print of cmdline/proctitle value during audit */
#define MAX_PROCTITLE_AUDIT_LEN 128

/* number of audit rules */
int audit_n_rules;

/* determines whether we collect data for signals sent */
int audit_signals;

/*
 * Common header for auxiliary audit records.  The more specific aux
 * structures in this file (audit_aux_data_pids, audit_aux_data_bprm_fcaps)
 * embed it as their first member, named "d".
 */
struct audit_aux_data {
        struct audit_aux_data        *next;	/* next aux record in a singly linked chain */
        int                        type;	/* kind of record -- presumably an AUDIT_* message type; confirm at users */
};

#define AUDIT_AUX_IPCPERM        0

/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS        16

/*
 * Aux record describing up to AUDIT_AUX_PIDS target processes.  The
 * target_* members are parallel arrays: index i in each array refers to
 * the same target.
 */
struct audit_aux_data_pids {
        struct audit_aux_data        d;		/* common aux header */
        pid_t                        target_pid[AUDIT_AUX_PIDS];
        kuid_t                        target_auid[AUDIT_AUX_PIDS];
        kuid_t                        target_uid[AUDIT_AUX_PIDS];
        unsigned int                target_sessionid[AUDIT_AUX_PIDS];
        u32                        target_sid[AUDIT_AUX_PIDS];
        char                         target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];	/* one TASK_COMM_LEN buffer per target */
        int                        pid_count;	/* presumably the number of slots in use -- confirm at users */
};

/*
 * Aux record carrying three sets of capability data plus a version number.
 * NOTE(review): judging by the names this records file capabilities and the
 * process capabilities before/after an exec (bprm) -- confirm at the code
 * that fills it in.
 */
struct audit_aux_data_bprm_fcaps {
        struct audit_aux_data        d;		/* common aux header */
        struct audit_cap_data        fcap;
        unsigned int                fcap_ver;
        struct audit_cap_data        old_pcap;
        struct audit_cap_data        new_pcap;
};

/*
 * One node in the per-context list of audit_chunk references: a fixed
 * array of 31 chunk pointers plus a link to the next node.  put_tree_ref()
 * fills the array from index 30 downwards.
 */
struct audit_tree_refs {
        struct audit_tree_refs *next;
        struct audit_chunk *c[31];
};

/* Pairs an audit_nfcfgop operation code with its textual name. */
struct audit_nfcfgop_tab {
        enum audit_nfcfgop        op;
        const char                *s;	/* name string for @op */
};

/*
 * Name table for the audit_nfcfgop operation codes: one entry per xtables
 * (AUDIT_XT_OP_*) and nftables (AUDIT_NFT_OP_*) operation, ending with
 * AUDIT_NFT_OP_INVALID.
 * NOTE(review): the lookup code is not in view; if it indexes this array
 * directly by op value, the entry order must match the enum -- confirm
 * before reordering.
 */
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
        { AUDIT_XT_OP_REGISTER,                        "xt_register"                   },
        { AUDIT_XT_OP_REPLACE,                        "xt_replace"                   },
        { AUDIT_XT_OP_UNREGISTER,                "xt_unregister"                   },
        { AUDIT_NFT_OP_TABLE_REGISTER,                "nft_register_table"           },
        { AUDIT_NFT_OP_TABLE_UNREGISTER,        "nft_unregister_table"           },
        { AUDIT_NFT_OP_CHAIN_REGISTER,                "nft_register_chain"           },
        { AUDIT_NFT_OP_CHAIN_UNREGISTER,        "nft_unregister_chain"           },
        { AUDIT_NFT_OP_RULE_REGISTER,                "nft_register_rule"           },
        { AUDIT_NFT_OP_RULE_UNREGISTER,                "nft_unregister_rule"           },
        { AUDIT_NFT_OP_SET_REGISTER,                "nft_register_set"           },
        { AUDIT_NFT_OP_SET_UNREGISTER,                "nft_unregister_set"           },
        { AUDIT_NFT_OP_SETELEM_REGISTER,        "nft_register_setelem"           },
        { AUDIT_NFT_OP_SETELEM_UNREGISTER,        "nft_unregister_setelem"   },
        { AUDIT_NFT_OP_GEN_REGISTER,                "nft_register_gen"           },
        { AUDIT_NFT_OP_OBJ_REGISTER,                "nft_register_obj"           },
        { AUDIT_NFT_OP_OBJ_UNREGISTER,                "nft_unregister_obj"           },
        { AUDIT_NFT_OP_OBJ_RESET,                "nft_reset_obj"                   },
        { AUDIT_NFT_OP_FLOWTABLE_REGISTER,        "nft_register_flowtable"   },
        { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,        "nft_unregister_flowtable" },
        { AUDIT_NFT_OP_INVALID,                        "nft_invalid"                   },
};

/*
 * Decide whether the syscall recorded in @ctx falls into one of the
 * permission classes requested by @mask (AUDIT_PERM_* bits).
 *
 * Returns non-zero on a match.  Returns 0 when @ctx is NULL, when nothing
 * matches, or when audit_classify_syscall() yields a code not handled here.
 */
static int audit_match_perm(struct audit_context *ctx, int mask)
{
        unsigned int nr;

        if (unlikely(!ctx))
                return 0;

        nr = ctx->major;

        switch (audit_classify_syscall(ctx->arch, nr)) {
        case 0: /* native */
                return ((mask & AUDIT_PERM_WRITE) &&
                        audit_match_class(AUDIT_CLASS_WRITE, nr)) ||
                       ((mask & AUDIT_PERM_READ) &&
                        audit_match_class(AUDIT_CLASS_READ, nr)) ||
                       ((mask & AUDIT_PERM_ATTR) &&
                        audit_match_class(AUDIT_CLASS_CHATTR, nr));
        case 1: /* 32bit on biarch */
                return ((mask & AUDIT_PERM_WRITE) &&
                        audit_match_class(AUDIT_CLASS_WRITE_32, nr)) ||
                       ((mask & AUDIT_PERM_READ) &&
                        audit_match_class(AUDIT_CLASS_READ_32, nr)) ||
                       ((mask & AUDIT_PERM_ATTR) &&
                        audit_match_class(AUDIT_CLASS_CHATTR_32, nr));
        case 2: /* open: access mode derived from the second argument */
                return mask & ACC_MODE(ctx->argv[1]);
        case 3: /* openat: access mode derived from the third argument */
                return mask & ACC_MODE(ctx->argv[2]);
        case 4: /* socketcall: only SYS_BIND counts, and only as a write */
                return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
        case 5: /* execve */
                return mask & AUDIT_PERM_EXEC;
        default:
                return 0;
        }
}

static int audit_match_filetype(struct audit_context *ctx, int val)
{
        struct audit_names *n;
        umode_t mode = (umode_t)val;

        if (unlikely(!ctx))
                return 0;

        list_for_each_entry(n, &ctx->names_list, list) {
                if ((n->ino != AUDIT_INO_UNSET) &&
                    ((n->mode & S_IFMT) == mode))
                        return 1;
        }

        return 0;
}

/*
 * We keep a linked list of fixed-size (31-pointer) arrays of audit_chunk *.
 * ->first_trees points to the beginning of the list, ->trees to the array
 * currently being filled (the current end of data), and ->tree_count is the
 * number of free entries in the array pointed to by ->trees.
 * The initial state is (NULL, NULL, 0).  Once the list has grown it never
 * reverts to NULL: from then on "empty" is represented as (p, p, 31).
 * We don't shrink the list (and seriously, it's going to remain 1-element
 * for almost any setup) until we free the context itself.
 * The references held in it _are_ dropped, though - at the same time as we
 * free/drop the aux stuff.
 */

/*
 * Mark @ctx as one that should be recorded: if no priority has been set
 * yet, set it to 1 and switch the state to AUDIT_RECORD_CONTEXT.  A context
 * that already has a non-zero priority is left untouched.
 */
static void audit_set_auditable(struct audit_context *ctx)
{
        if (ctx->prio)
                return;

        ctx->prio = 1;
        ctx->current_state = AUDIT_RECORD_CONTEXT;
}

/*
 * Store @chunk in the next free slot of the context's tree-ref list.
 *
 * Slots in the current array are consumed from the highest index down.
 * When the current array is full, the next already-allocated array (if
 * any) becomes current and its top slot is used.
 *
 * Returns 1 if the chunk was stored, 0 if there is no room (no array at
 * all, or no further array after a full one); this function never
 * allocates.
 */
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
        struct audit_tree_refs *cur = ctx->trees;
        struct audit_tree_refs *following;
        int free_slots = ctx->tree_count;

        if (likely(free_slots)) {
                free_slots--;
                cur->c[free_slots] = chunk;
                ctx->tree_count = free_slots;
                return 1;
        }

        /* Current array is full (or absent): try to move on to the next. */
        if (!cur || !cur->next)
                return 0;

        following = cur->next;
        following->c[30] = chunk;
        ctx->trees = following;
        ctx->tree_count = 30;
        return 1;
}

/*
 * Append a freshly zeroed audit_tree_refs array to the context's list and
 * make it the current one, with all 31 slots free.
 *
 * Returns 1 on success, 0 if the allocation failed (in which case the
 * context is left as it was).
 */
static int grow_tree_refs(struct audit_context *ctx)
{
        struct audit_tree_refs *tail = ctx->trees;
        struct audit_tree_refs *fresh;

        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return 0;

        /* Link after the old current array, or start the list. */
        if (tail)
                tail->next = fresh;
        else
                ctx->first_trees = fresh;

        ctx->trees = fresh;
        ctx->tree_count = 31;
        return 1;
}

/*
 * Roll the context's tree-ref list back to an earlier position.
 *
 * @p and @count are the ->trees / ->tree_count values to return to
 * (presumably saved by the caller before it started adding references --
 * confirm at call sites).  Every slot filled after that position is handed
 * to audit_put_chunk() and cleared to NULL; the arrays themselves stay
 * allocated.  On return ctx->trees == p and ctx->tree_count == count.
 */
static void unroll_tree_refs(struct audit_context *ctx,
                      struct audit_tree_refs *p, int count)
{
        struct audit_tree_refs *q;
        int n;
        if (!p) {
                /* we started with empty chain */
                p = ctx->first_trees;
                count = 31;
                /* if the very first allocation has failed, nothing to do */
                if (!p)
                        return;
        }
        n = count;
        /*
         * Arrays before the current one: in the first array (p) release
         * slots [0, count), i.e. those below the saved free count; in each
         * later array release all 31 slots.
         */
        for (q = p; q != ctx->trees; q = q->next, n = 31) {
                while (n--) {
                        audit_put_chunk(q->c[n]);
                        q->c[n] = NULL;
                }
        }
        /*
         * Current array (q == ctx->trees): release slots n-1 down to
         * ctx->tree_count, leaving the still-free low slots alone.
         */
        while (n-- > ctx->tree_count) {
                audit_put_chunk(q->c[n]);
                q->c[n] = NULL;
        }
        ctx->trees = p;
        ctx->tree_count = count;
}

/*
 * Free every audit_tree_refs array on the context's list, starting at
 * ->first_trees.  Only the arrays are freed; the chunk pointers stored in
 * them are not touched, and the context's list pointers are not reset.
 */
static void free_tree_refs(struct audit_context *ctx)
{
        struct audit_tree_refs *node = ctx->first_trees;

        while (node) {
                /* Fetch the link before the node goes away. */
                struct audit_tree_refs *following = node->next;

                kfree(node);
                node = following;
        }
}

/*
 * Check whether any chunk referenced from @ctx matches @tree according to
 * audit_tree_match().
 *
 * Arrays before the current one are scanned in full; in the current array
 * only the slots from ->tree_count upwards are examined.
 * Returns 1 on the first match, 0 if @tree is NULL or nothing matches.
 */
static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
        struct audit_tree_refs *refs;
        int idx;

        if (!tree)
                return 0;

        /* Completely filled arrays. */
        refs = ctx->first_trees;
        while (refs != ctx->trees) {
                for (idx = 0; idx < 31; idx++) {
                        if (audit_tree_match(refs->c[idx], tree))
                                return 1;
                }
                refs = refs->next;
        }

        /* No current array at all: nothing left to look at. */
        if (!refs)
                return 0;

        /* Partially filled current array. */
        idx = ctx->tree_count;
        while (idx < 31) {
                if (audit_tree_match(refs->c[idx], tree))
                        return 1;
                idx++;
        }

        return 0;
}

/*
 * Compare @uid, using the operator in @f->op, against the owner uid of
 * @name (if given) and then of every entry in @ctx->names_list (if @ctx is
 * given).
 *
 * Returns the first non-zero comparator result, or 0 if no comparison
 * succeeded.
 */
static int audit_compare_uid(kuid_t uid,
                             struct audit_names *name,
                             struct audit_field *f,
                             struct audit_context *ctx)
{
        struct audit_names *entry;
        int matched;

        if (name) {
                matched = audit_uid_comparator(uid, f->op, name->uid);
                if (matched)
                        return matched;
        }

        if (!ctx)
                return 0;

        list_for_each_entry(entry, &ctx->names_list, list) {
                matched = audit_uid_comparator(uid, f->op, entry->uid);
                if (matched)
                        return matched;
        }

        return 0;
}

/*
 * Compare @gid, using the operator in @f->op, against the owner gid of
 * @name (if given) and then of every entry in @ctx->names_list (if @ctx is
 * given).
 *
 * Returns the first non-zero comparator result, or 0 if no comparison
 * succeeded.
 */
static int audit_compare_gid(kgid_t gid,
                             struct audit_names *name,
                             struct audit_field *f,
                             struct audit_context *ctx)
{
        struct audit_names *entry;
        int matched;

        if (name) {
                matched = audit_gid_comparator(gid, f->op, name->gid);
                if (matched)
                        return matched;
        }

        if (!ctx)
                return 0;

        list_for_each_entry(entry, &ctx->names_list, list) {
                matched = audit_gid_comparator(gid, f->op, entry->gid);
                if (matched)
                        return matched;
        }

        return 0;
}

/*
 * Evaluate a field-to-field comparison for a rule field.
 *
 * @f->val (an AUDIT_COMPARE_* constant) selects which two identities are
 * compared and @f->op is the operator handed to the comparator.  The
 * process-side identities come from @cred, or from audit_get_loginuid(@tsk)
 * for the auid variants.
 *
 * The *_TO_OBJ_* variants compare a process identity against file owners
 * via audit_compare_uid()/audit_compare_gid(), which look at @name and at
 * the names collected in @ctx.  All other variants compare two process
 * identities directly.
 *
 * Returns the comparator's result.  An unrecognised @f->val triggers a
 * WARN and returns 0.
 */
static int audit_field_compare(struct task_struct *tsk,
                               const struct cred *cred,
                               struct audit_field *f,
                               struct audit_context *ctx,
                               struct audit_names *name)
{
        switch (f->val) {
        /* process to file object comparisons */
        case AUDIT_COMPARE_UID_TO_OBJ_UID:
                return audit_compare_uid(cred->uid, name, f, ctx);
        case AUDIT_COMPARE_GID_TO_OBJ_GID:
                return audit_compare_gid(cred->gid, name, f, ctx);
        case AUDIT_COMPARE_EUID_TO_OBJ_UID:
                return audit_compare_uid(cred->euid, name, f, ctx);
        case AUDIT_COMPARE_EGID_TO_OBJ_GID:
                return audit_compare_gid(cred->egid, name, f, ctx);
        case AUDIT_COMPARE_AUID_TO_OBJ_UID:
                return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
        case AUDIT_COMPARE_SUID_TO_OBJ_UID:
                return audit_compare_uid(cred->suid, name, f, ctx);
        case AUDIT_COMPARE_SGID_TO_OBJ_GID:
                return audit_compare_gid(cred->sgid, name, f, ctx);
        case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
                return audit_compare_uid(cred->fsuid, name, f, ctx);
        case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
                return audit_compare_gid(cred->fsgid, name, f, ctx);
        /* uid comparisons */
        case AUDIT_COMPARE_UID_TO_AUID:
                return audit_uid_comparator(cred->uid, f->op,
                                            audit_get_loginuid(tsk));
        case AUDIT_COMPARE_UID_TO_EUID:
                return audit_uid_comparator(cred->uid, f->op, cred->euid);
        case AUDIT_COMPARE_UID_TO_SUID:
                return audit_uid_comparator(cred->uid, f->op, cred->suid);
        case AUDIT_COMPARE_UID_TO_FSUID:
                return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
        /* auid comparisons */
        case AUDIT_COMPARE_AUID_TO_EUID:
                return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
                                            cred->euid);
        case AUDIT_COMPARE_AUID_TO_SUID:
                return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
                                            cred->suid);
        case AUDIT_COMPARE_AUID_TO_FSUID:
                return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
                                            cred->fsuid);
        /* euid comparisons */
        case AUDIT_COMPARE_EUID_TO_SUID:
                return audit_uid_comparator(cred->euid, f->op, cred->suid);
        case AUDIT_COMPARE_EUID_TO_FSUID:
                return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
        /* suid comparisons */
        case AUDIT_COMPARE_SUID_TO_FSUID:
                return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
        /* gid comparisons */
        case AUDIT_COMPARE_GID_TO_EGID:
                return audit_gid_comparator(cred->gid, f->op, cred->egid);
        case AUDIT_COMPARE_GID_TO_SGID:
                return audit_gid_comparator(cred->gid, f->op, cred->sgid);
        case AUDIT_COMPARE_GID_TO_FSGID:
                return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
        /* egid comparisons */
        case AUDIT_COMPARE_EGID_TO_SGID:
                return audit_gid_comparator(cred->egid, f->op, cred->sgid);
        case AUDIT_COMPARE_EGID_TO_FSGID:
                return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
        /* sgid comparison */
        case AUDIT_COMPARE_SGID_TO_FSGID:
                return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
        default:
                WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
                return 0;
        }
        /* not reached: every arm of the switch above returns */
        return 0;
}

/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule.  Return 1 on match, 0
 * otherwise.
 *
 * Every field of the rule is evaluated in order and all of them have to
 * match: the first field whose comparison yields 0 ends the function with
 * a return value of 0.  A field type that has no case in the switch below
 * leaves its result at 0 and therefore never matches.
 *
 * @ctx and @name may be NULL.  Fields that need a context or a name are
 * simply left unmatched when the required pointer is missing; when @name
 * is NULL but @ctx is not, name based fields are checked against every
 * entry on ctx->names_list instead.
 *
 * On a match with a non-NULL @ctx the rule only wins if its priority is
 * strictly higher than ctx->prio; in that case ctx->prio and
 * ctx->filterkey are updated from the rule.  *state is written only for
 * the AUDIT_NEVER and AUDIT_ALWAYS actions.
 *
 * If task_creation is true, this is an explicit indication that we are
 * filtering a task rule at task creation time.  This and tsk == current are
 * the only situations where tsk->cred may be accessed without an rcu read lock.
 */
static int audit_filter_rules(struct task_struct *tsk,
                              struct audit_krule *rule,
                              struct audit_context *ctx,
                              struct audit_names *name,
                              enum audit_state *state,
                              bool task_creation)
{
        const struct cred *cred;
        int i, need_sid = 1;
        u32 sid;
        unsigned int sessionid;

        cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);

        for (i = 0; i < rule->field_count; i++) {
                struct audit_field *f = &rule->fields[i];
                struct audit_names *n;
                int result = 0;
                pid_t pid;

                switch (f->type) {
                case AUDIT_PID:
                        pid = task_tgid_nr(tsk);
                        result = audit_comparator(pid, f->op, f->val);
                        break;
                case AUDIT_PPID:
                        if (ctx) {
                                /* look the ppid up once and cache it in the context */
                                if (!ctx->ppid)
                                        ctx->ppid = task_ppid_nr(tsk);
                                result = audit_comparator(ctx->ppid, f->op, f->val);
                        }
                        break;
                case AUDIT_EXE:
                        result = audit_exe_compare(tsk, rule->exe);
                        if (f->op == Audit_not_equal)
                                result = !result;
                        break;
                case AUDIT_UID:
                        result = audit_uid_comparator(cred->uid, f->op, f->uid);
                        break;
                case AUDIT_EUID:
                        result = audit_uid_comparator(cred->euid, f->op, f->uid);
                        break;
                case AUDIT_SUID:
                        result = audit_uid_comparator(cred->suid, f->op, f->uid);
                        break;
                case AUDIT_FSUID:
                        result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
                        break;
                case AUDIT_GID:
                        result = audit_gid_comparator(cred->gid, f->op, f->gid);
                        /* for "equal" and "not equal" also consult the
                         * groups in cred->group_info */
                        if (f->op == Audit_equal) {
                                if (!result)
                                        result = groups_search(cred->group_info, f->gid);
                        } else if (f->op == Audit_not_equal) {
                                if (result)
                                        result = !groups_search(cred->group_info, f->gid);
                        }
                        break;
                case AUDIT_EGID:
                        result = audit_gid_comparator(cred->egid, f->op, f->gid);
                        /* same cred->group_info handling as AUDIT_GID */
                        if (f->op == Audit_equal) {
                                if (!result)
                                        result = groups_search(cred->group_info, f->gid);
                        } else if (f->op == Audit_not_equal) {
                                if (result)
                                        result = !groups_search(cred->group_info, f->gid);
                        }
                        break;
                case AUDIT_SGID:
                        result = audit_gid_comparator(cred->sgid, f->op, f->gid);
                        break;
                case AUDIT_FSGID:
                        result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
                        break;
                case AUDIT_SESSIONID:
                        sessionid = audit_get_sessionid(tsk);
                        result = audit_comparator(sessionid, f->op, f->val);
                        break;
                case AUDIT_PERS:
                        result = audit_comparator(tsk->personality, f->op, f->val);
                        break;
                case AUDIT_ARCH:
                        if (ctx)
                                result = audit_comparator(ctx->arch, f->op, f->val);
                        break;

                case AUDIT_EXIT:
                        /* only meaningful once the context holds a return code */
                        if (ctx && ctx->return_valid)
                                result = audit_comparator(ctx->return_code, f->op, f->val);
                        break;
                case AUDIT_SUCCESS:
                        if (ctx && ctx->return_valid) {
                                /* a non-zero rule value compares against success,
                                 * a zero rule value against failure */
                                if (f->val)
                                        result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
                                else
                                        result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
                        }
                        break;
                case AUDIT_DEVMAJOR:
                        /* either the dev or the rdev major number may match */
                        if (name) {
                                if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
                                    audit_comparator(MAJOR(name->rdev), f->op, f->val))
                                        ++result;
                        } else if (ctx) {
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
                                            audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
                                }
                        }
                        break;
                case AUDIT_DEVMINOR:
                        /* either the dev or the rdev minor number may match */
                        if (name) {
                                if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
                                    audit_comparator(MINOR(name->rdev), f->op, f->val))
                                        ++result;
                        } else if (ctx) {
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
                                            audit_comparator(MINOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
                                }
                        }
                        break;
                case AUDIT_INODE:
                        if (name)
                                result = audit_comparator(name->ino, f->op, f->val);
                        else if (ctx) {
                                /* no specific name: any collected name may match */
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_comparator(n->ino, f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
                                }
                        }
                        break;
                case AUDIT_OBJ_UID:
                        if (name) {
                                result = audit_uid_comparator(name->uid, f->op, f->uid);
                        } else if (ctx) {
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_uid_comparator(n->uid, f->op, f->uid)) {
                                                ++result;
                                                break;
                                        }
                                }
                        }
                        break;
                case AUDIT_OBJ_GID:
                        if (name) {
                                result = audit_gid_comparator(name->gid, f->op, f->gid);
                        } else if (ctx) {
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_gid_comparator(n->gid, f->op, f->gid)) {
                                                ++result;
                                                break;
                                        }
                                }
                        }
                        break;
                case AUDIT_WATCH:
                        /* watches are only evaluated against an explicit name */
                        if (name) {
                                result = audit_watch_compare(rule->watch,
                                                             name->ino,
                                                             name->dev);
                                if (f->op == Audit_not_equal)
                                        result = !result;
                        }
                        break;
                case AUDIT_DIR:
                        if (ctx) {
                                result = match_tree_refs(ctx, rule->tree);
                                if (f->op == Audit_not_equal)
                                        result = !result;
                        }
                        break;
                case AUDIT_LOGINUID:
                        result = audit_uid_comparator(audit_get_loginuid(tsk),
                                                      f->op, f->uid);
                        break;
                case AUDIT_LOGINUID_SET:
                        result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
                        break;
                case AUDIT_SADDR_FAM:
                        if (ctx && ctx->sockaddr)
                                result = audit_comparator(ctx->sockaddr->ss_family,
                                                          f->op, f->val);
                        break;
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                        /* NOTE: this may return negative values indicating
                           a temporary error.  We simply treat this as a
                           match for now to avoid losing information that
                           may be wanted.   An error message will also be
                           logged upon error */
                        if (f->lsm_rule) {
                                /* fetch the task's secid at most once per call */
                                if (need_sid) {
                                        security_task_getsecid(tsk, &sid);
                                        need_sid = 0;
                                }
                                result = security_audit_rule_match(sid, f->type,
                                                                   f->op,
                                                                   f->lsm_rule);
                        }
                        break;
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
                           also applies here */
                        if (f->lsm_rule) {
                                /* Find files that match */
                                if (name) {
                                        result = security_audit_rule_match(
                                                                name->osid,
                                                                f->type,
                                                                f->op,
                                                                f->lsm_rule);
                                } else if (ctx) {
                                        list_for_each_entry(n, &ctx->names_list, list) {
                                                if (security_audit_rule_match(
                                                                n->osid,
                                                                f->type,
                                                                f->op,
                                                                f->lsm_rule)) {
                                                        ++result;
                                                        break;
                                                }
                                        }
                                }
                                /* Find ipc objects that match */
                                /* this break leaves the switch, keeping the
                                 * result of the file matching above */
                                if (!ctx || ctx->type != AUDIT_IPC)
                                        break;
                                if (security_audit_rule_match(ctx->ipc.osid,
                                                              f->type, f->op,
                                                              f->lsm_rule))
                                        ++result;
                        }
                        break;
                case AUDIT_ARG0:
                case AUDIT_ARG1:
                case AUDIT_ARG2:
                case AUDIT_ARG3:
                        /* the field type doubles as the index into ctx->argv */
                        if (ctx)
                                result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
                        break;
                case AUDIT_FILTERKEY:
                        /* ignore this field for filtering */
                        result = 1;
                        break;
                case AUDIT_PERM:
                        result = audit_match_perm(ctx, f->val);
                        if (f->op == Audit_not_equal)
                                result = !result;
                        break;
                case AUDIT_FILETYPE:
                        result = audit_match_filetype(ctx, f->val);
                        if (f->op == Audit_not_equal)
                                result = !result;
                        break;
                case AUDIT_FIELD_COMPARE:
                        result = audit_field_compare(tsk, cred, f, ctx, name);
                        break;
                }
                /* all fields must match; stop at the first one that does not */
                if (!result)
                        return 0;
        }

        if (ctx) {
                /* a rule only takes effect if it outranks what the context
                 * has already recorded */
                if (rule->prio <= ctx->prio)
                        return 0;
                if (rule->filterkey) {
                        /* replace the context's key with a copy of the rule's key */
                        kfree(ctx->filterkey);
                        ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
                }
                ctx->prio = rule->prio;
        }
        /* any other action leaves *state untouched */
        switch (rule->action) {
        case AUDIT_NEVER:
                *state = AUDIT_DISABLED;
                break;
        case AUDIT_ALWAYS:
                *state = AUDIT_RECORD_CONTEXT;
                break;
        }
        return 1;
}

/* At process creation time, we can determine if system-call auditing is
 * completely disabled for this task.  Since we only have the task
 * structure at this point, we can only check uid and gid.
 */
static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
{
        struct audit_entry *e;
        enum audit_state   state;

        rcu_read_lock();
        list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
                if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
                                       &state, true)) {
                        if (state == AUDIT_RECORD_CONTEXT)
                                *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
                        rcu_read_unlock();
                        return state;
                }
        }
        rcu_read_unlock();
        return AUDIT_BUILD_CONTEXT;
}

/* Check whether @val is selected by the bitmask of @rule (the callers in
 * this file pass ctx->major).  Returns non-zero when the corresponding bit
 * is set in rule->mask, and 0 when it is clear or when @val cannot be
 * represented in the mask at all.
 */
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
{
        int idx, mask_bit;

        /* anything that does not fit in 32 bits is never in the mask */
        if (val > 0xffffffff)
                return false;

        idx = AUDIT_WORD(val);
        /* reject values beyond the last word of the mask */
        if (idx >= AUDIT_BITMASK_SIZE)
                return false;

        mask_bit = AUDIT_BIT(val);
        return rule->mask[idx] & mask_bit;
}

/* At syscall entry and exit time, this filter is called if the
 * audit_state is not low enough that auditing cannot take place, but is
 * also not high enough that we already know we have to write an audit
 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
 */
static enum audit_state audit_filter_syscall(struct task_struct *tsk,
                                             struct audit_context *ctx,
                                             struct list_head *list)
{
        struct audit_entry *e;
        enum audit_state state;

        if (auditd_test_task(tsk))
                return AUDIT_DISABLED;

        rcu_read_lock();
        list_for_each_entry_rcu(e, list, list) {
                if (audit_in_mask(&e->rule, ctx->major) &&
                    audit_filter_rules(tsk, &e->rule, ctx, NULL,
                                       &state, false)) {
                        rcu_read_unlock();
                        ctx->current_state = state;
                        return state;
                }
        }
        rcu_read_unlock();
        return AUDIT_BUILD_CONTEXT;
}

/*
 * Given an audit_name check the inode hash table to see if they match.
 * Called holding the rcu read lock to protect the use of audit_inode_hash
 */
static int audit_filter_inode_name(struct task_struct *tsk,
                                   struct audit_names *n,
                                   struct audit_context *ctx) {
        int h = audit_hash_ino((u32)n->ino);
        struct list_head *list = &audit_inode_hash[h];
        struct audit_entry *e;
        enum audit_state state;

        list_for_each_entry_rcu(e, list, list) {
                if (audit_in_mask(&e->rule, ctx->major) &&
                    audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
                        ctx->current_state = state;
                        return 1;
                }
        }
        return 0;
}

/* Syscall exit filter for the audit_names collected during syscall
 * processing.  Only the rule sublists in the hash buckets that belong to
 * the inode numbers of those names are checked, and the walk stops at the
 * first name for which a rule matches.  Nothing is done for tasks for which
 * auditd_test_task() is true.
 * Regarding audit_state, same rules apply as for audit_filter_syscall().
 */
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
        struct audit_names *entry;

        if (auditd_test_task(tsk))
                return;

        /* audit_filter_inode_name() has to run under the rcu read lock */
        rcu_read_lock();
        list_for_each_entry(entry, &ctx->names_list, list) {
                if (!audit_filter_inode_name(tsk, entry, ctx))
                        continue;
                break;
        }
        rcu_read_unlock();
}

/* Free the proctitle buffer held by @context and reset both proctitle
 * fields so that the context no longer refers to the freed memory.
 */
static inline void audit_proctitle_free(struct audit_context *context)
{
        context->proctitle.len = 0;

        kfree(context->proctitle.value);
        context->proctitle.value = NULL;
}

/* Free the module name stored in @context.  The name is only touched when
 * the context type is AUDIT_KERN_MODULE.
 */
static inline void audit_free_module(struct audit_context *context)
{
        if (context->type != AUDIT_KERN_MODULE)
                return;

        kfree(context->module.name);
        context->module.name = NULL;
}
/* Empty the names_list of @context and drop its pwd reference.  Every entry
 * is unlinked and its name, if any, is released with putname(); the entry
 * itself is only freed when it is flagged should_free.
 */
static inline void audit_free_names(struct audit_context *context)
{
        struct audit_names *entry, *tmp;

        /* _safe variant: entries are unlinked and possibly freed while walking */
        list_for_each_entry_safe(entry, tmp, &context->names_list, list) {
                list_del(&entry->list);
                if (entry->name)
                        putname(entry->name);
                if (entry->should_free)
                        kfree(entry);
        }
        context->name_count = 0;

        path_put(&context->pwd);
        context->pwd.mnt = NULL;
        context->pwd.dentry = NULL;
}

/* Free every entry of the two singly linked auxiliary lists of @context
 * (context->aux and context->aux_pids), leaving both list heads NULL.
 */
static inline void audit_free_aux(struct audit_context *context)
{
        struct audit_aux_data *node;

        /* pop the head until the list is empty */
        for (node = context->aux; node; node = context->aux) {
                context->aux = node->next;
                kfree(node);
        }

        for (node = context->aux_pids; node; node = context->aux_pids) {
                context->aux_pids = node->next;
                kfree(node);
        }
}

/* Allocate a zeroed audit context in the given @state.  A context that
 * starts out as AUDIT_RECORD_CONTEXT gets the highest possible priority,
 * any other state starts at priority 0.  Returns NULL if the allocation
 * fails.
 */
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
        struct audit_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return NULL;

        ctx->state = state;
        if (state == AUDIT_RECORD_CONTEXT)
                ctx->prio = ~0ULL;
        else
                ctx->prio = 0;

        INIT_LIST_HEAD(&ctx->names_list);
        INIT_LIST_HEAD(&ctx->killed_trees);

        return ctx;
}

/**
 * audit_alloc - allocate an audit context block for a task
 * @tsk: task
 *
 * Filter on the task information and allocate a per-task audit context
 * if necessary.  Doing so turns on system call auditing for the
 * specified task.  This is called from copy_process, so no lock is
 * needed.
 */
int audit_alloc(struct task_struct *tsk)
{
        struct audit_context *context;
        enum audit_state     state;
        char *key = NULL;

        if (likely(!audit_ever_enabled))
                return 0; /* Return if not auditing. */

        state = audit_filter_task(tsk, &key);
        if (state == AUDIT_DISABLED) {
                clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
                return 0;
        }

        if (!(context = audit_alloc_context(state))) {
                kfree(key);
                audit_log_lost("out of memory in audit_alloc");
                return -ENOMEM;
        }
        context->filterkey = key;

        audit_set_context(tsk, context);
        set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
        return 0;
}

/* Tear down an audit context: release the module name, the collected
 * names, the tree references, the auxiliary data, the filter key, the
 * socket address and the proctitle, then free the context itself.
 * @context must not be used after this call.
 */
static inline void audit_free_context(struct audit_context *context)
{
        audit_free_module(context);
        audit_free_names(context);
        /* NOTE(review): presumably drops all tree references still held
         * before the storage backing them is freed - confirm against
         * unroll_tree_refs()/free_tree_refs() */
        unroll_tree_refs(context, NULL, 0);
        free_tree_refs(context);
        audit_free_aux(context);
        kfree(context->filterkey);
        kfree(context->sockaddr);
        audit_proctitle_free(context);
        /* everything hanging off the context is gone, free the context last */
        kfree(context);
}

/* Emit an AUDIT_OBJ_PID record describing a target task: its pid, login
 * uid, uid, session id, security context (only if @sid is non-zero) and
 * command name.
 *
 * Returns 1 if @sid could not be converted to a security context (the
 * record then carries "obj=(none)"), 0 otherwise.  0 is also returned when
 * no audit buffer could be started, in which case nothing is logged.
 */
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
                                 kuid_t auid, kuid_t uid, unsigned int sessionid,
                                 u32 sid, char *comm)
{
        struct audit_buffer *ab;
        char *secctx = NULL;
        u32 secctx_len;
        int failed = 0;

        ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
        if (!ab)
                return failed;

        audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
                         from_kuid(&init_user_ns, auid),
                         from_kuid(&init_user_ns, uid), sessionid);

        if (sid) {
                if (security_secid_to_secctx(sid, &secctx, &secctx_len) == 0) {
                        audit_log_format(ab, " obj=%s", secctx);
                        security_release_secctx(secctx, secctx_len);
                } else {
                        /* keep the record, but report the failure */
                        audit_log_format(ab, " obj=(none)");
                        failed = 1;
                }
        }

        audit_log_format(ab, " ocomm=");
        audit_log_untrustedstring(ab, comm);
        audit_log_end(ab);

        return failed;
}

/* Log the argument strings of an execve() into the audit record *ab.
 *
 * context->execve.argc strings are read from the current task's user
 * memory, starting at current->mm->arg_start.  After an initial "argc="
 * each argument is emitted as " a<N>=" when it can be logged in one piece,
 * or as " a<N>_len=" followed by one or more " a<N>[<i>]=" pieces when it
 * has to be split.  Arguments are hex encoded when
 * audit_string_contains_control() flags them or when they do not fit into
 * the scratch buffer in one go; otherwise they are logged as strings.
 *
 * When the current record runs short of room it is ended and a new
 * AUDIT_EXECVE record is started, so *ab may be replaced by this function
 * (and is left NULL if the new record could not be started).  The caller
 * handles the final audit_log_end() call.
 *
 * A fault while copying from userspace sends SIGKILL to the current task
 * and stops the logging.
 */
static void audit_log_execve_info(struct audit_context *context,
                                  struct audit_buffer **ab)
{
        long len_max;           /* capacity of one record / the scratch buffer */
        long len_rem;           /* room left in the current audit record */
        long len_full;          /* argument length reported as a<N>_len */
        long len_buf;           /* bytes in the scratch buffer not yet logged */
        long len_abuf = 0;      /* room those bytes will need in the record */
        long len_tmp;
        bool require_data;      /* more of the current argument must be fetched */
        bool encode;            /* current argument is logged hex encoded */
        unsigned int iter;      /* index of the next piece of the current argument */
        unsigned int arg;       /* index of the current argument */
        char *buf_head;         /* start of the scratch buffer */
        char *buf;              /* first not yet logged byte in the scratch buffer */
        const char __user *p = (const char __user *)current->mm->arg_start;

        /* NOTE: this buffer needs to be large enough to hold all the non-arg
         *       data we put in the audit record for this argument (see the
         *       code below) ... at this point in time 96 is plenty */
        char abuf[96];

        /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
         *       current value of 7500 is not as important as the fact that it
         *       is less than 8k, a setting of 7500 gives us plenty of wiggle
         *       room if we go over a little bit in the logging below */
        WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
        len_max = MAX_EXECVE_AUDIT_LEN;

        /* scratch buffer to hold the userspace args */
        buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
        if (!buf_head) {
                audit_panic("out of memory for argv string");
                return;
        }
        buf = buf_head;

        audit_log_format(*ab, "argc=%d", context->execve.argc);

        len_rem = len_max;
        len_buf = 0;
        len_full = 0;
        require_data = true;
        encode = false;
        iter = 0;
        arg = 0;
        do {
                /* NOTE: we don't ever want to trust this value for anything
                 *       serious, but the audit record format insists we
                 *       provide an argument length for really long arguments,
                 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
                 *       to use strnlen_user() to obtain this value for
                 *       recording in the log, although we don't use it
                 *       anywhere here to avoid a double-fetch problem */
                if (len_full == 0)
                        len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;

                /* read more data from userspace */
                if (require_data) {
                        /* can we make more room in the buffer? */
                        if (buf != buf_head) {
                                memmove(buf_head, buf, len_buf);
                                buf = buf_head;
                        }

                        /* fetch as much as we can of the argument */
                        len_tmp = strncpy_from_user(&buf_head[len_buf], p,
                                                    len_max - len_buf);
                        if (len_tmp == -EFAULT) {
                                /* unable to copy from userspace */
                                send_sig(SIGKILL, current, 0);
                                goto out;
                        } else if (len_tmp == (len_max - len_buf)) {
                                /* buffer is not large enough */
                                require_data = true;
                                /* NOTE: if we are going to span multiple
                                 *       buffers force the encoding so we stand
                                 *       a chance at a sane len_full value and
                                 *       consistent record encoding */
                                encode = true;
                                len_full = len_full * 2;
                                p += len_tmp;
                        } else {
                                /* the rest of the argument fit in the buffer */
                                require_data = false;
                                if (!encode)
                                        encode = audit_string_contains_control(
                                                                buf, len_tmp);
                                /* try to use a trusted value for len_full */
                                if (len_full < len_max)
                                        len_full = (encode ?
                                                    len_tmp * 2 : len_tmp);
                                /* step over the terminating NUL as well */
                                p += len_tmp + 1;
                        }
                        len_buf += len_tmp;
                        buf_head[len_buf] = '\0';

                        /* length of the buffer in the audit record? */
                        len_abuf = (encode ? len_buf * 2 : len_buf + 2);
                }

                /* write as much as we can to the audit log */
                if (len_buf >= 0) {
                        /* NOTE: some magic numbers here - basically if we
                         *       can't fit a reasonable amount of data into the
                         *       existing audit buffer, flush it and start with
                         *       a new buffer */
                        if ((sizeof(abuf) + 8) > len_rem) {
                                len_rem = len_max;
                                audit_log_end(*ab);
                                *ab = audit_log_start(context,
                                                      GFP_KERNEL, AUDIT_EXECVE);
                                if (!*ab)
                                        goto out;
                        }

                        /* create the non-arg portion of the arg record */
                        len_tmp = 0;
                        if (require_data || (iter > 0) ||
                            ((len_abuf + sizeof(abuf)) > len_rem)) {
                                /* split argument: the length goes in front
                                 * of the first piece only */
                                if (iter == 0) {
                                        len_tmp += snprintf(&abuf[len_tmp],
                                                        sizeof(abuf) - len_tmp,
                                                        " a%d_len=%lu",
                                                        arg, len_full);
                                }
                                len_tmp += snprintf(&abuf[len_tmp],
                                                    sizeof(abuf) - len_tmp,
                                                    " a%d[%d]=", arg, iter++);
                        } else
                                len_tmp += snprintf(&abuf[len_tmp],
                                                    sizeof(abuf) - len_tmp,
                                                    " a%d=", arg);
                        WARN_ON(len_tmp >= sizeof(abuf));
                        abuf[sizeof(abuf) - 1] = '\0';

                        /* log the arg in the audit record */
                        audit_log_format(*ab, "%s", abuf);
                        len_rem -= len_tmp;
                        len_tmp = len_buf;
                        if (encode) {
                                if (len_abuf > len_rem)
                                        len_tmp = len_rem / 2; /* encoding */
                                audit_log_n_hex(*ab, buf, len_tmp);
                                len_rem -= len_tmp * 2;
                                len_abuf -= len_tmp * 2;
                        } else {
                                if (len_abuf > len_rem)
                                        len_tmp = len_rem - 2; /* quotes */
                                audit_log_n_string(*ab, buf, len_tmp);
                                len_rem -= len_tmp + 2;
                                /* don't subtract the "2" because we still need
                                 * to add quotes to the remaining string */
                                len_abuf -= len_tmp;
                        }
                        /* consume what was just logged */
                        len_buf -= len_tmp;
                        buf += len_tmp;
                }

                /* ready to move to the next argument? */
                if ((len_buf == 0) && !require_data) {
                        arg++;
                        iter = 0;
                        len_full = 0;
                        require_data = true;
                        encode = false;
                }
        } while (arg < context->execve.argc);

        /* NOTE: the caller handles the final audit_log_end() call */

out:
        kfree(buf_head);
}

/* Append " <prefix>=<value>" for the capability set @cap to @ab.  An empty
 * set is logged as a single "0"; otherwise every 32-bit word of the set is
 * printed as eight hex digits, starting with the word at index CAP_LAST_U32.
 */
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
                          kernel_cap_t *cap)
{
        int word;

        if (!cap_isclear(*cap)) {
                audit_log_format(ab, " %s=", prefix);
                CAP_FOR_EACH_U32(word)
                        audit_log_format(ab, "%08x",
                                         cap->cap[CAP_LAST_U32 - word]);
                return;
        }

        audit_log_format(ab, " %s=0", prefix);
}

/* Append the file capability fields of @name to @ab.  A fcap_ver of -1 is
 * logged with "?" placeholders; otherwise the permitted and inheritable
 * sets, the effective flag, the version and the root id are logged.
 */
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
        if (name->fcap_ver != -1) {
                audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
                audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
                audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
                                 name->fcap.fE, name->fcap_ver,
                                 from_kuid(&init_user_ns, name->fcap.rootid));
                return;
        }

        audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
}

/*
 * Emit the time-related records for @context.
 *
 * For AUDIT_TIME_ADJNTPVAL one record is written per NTP value whose old
 * and new values differ; afterwards an AUDIT_TIME_INJOFFSET-style record
 * is written if the injected offset is non-zero.  The buffer passed in via
 * @ab is used for the first record; each record is ended here and *@ab is
 * reset to NULL, with a new buffer started on demand for later records.
 * Returns early, leaving *@ab NULL, if a buffer cannot be started.
 */
static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
{
        static const char * const ntp_name[] = {
                "offset",
                "freq",
                "status",
                "tai",
                "tick",
                "adjust",
        };
        const struct audit_ntp_data *ntp = &context->time.ntp_data;
        const struct timespec64 *tk = &context->time.tk_injoffset;
        int type;

        if (context->type == AUDIT_TIME_ADJNTPVAL) {
                for (type = 0; type < AUDIT_NTP_NVALS; type++) {
                        /* unchanged values produce no record */
                        if (ntp->vals[type].newval == ntp->vals[type].oldval)
                                continue;

                        if (!*ab) {
                                *ab = audit_log_start(context,
                                                      GFP_KERNEL,
                                                      AUDIT_TIME_ADJNTPVAL);
                                if (!*ab)
                                        return;
                        }
                        audit_log_format(*ab, "op=%s old=%lli new=%lli",
                                         ntp_name[type],
                                         ntp->vals[type].oldval,
                                         ntp->vals[type].newval);
                        audit_log_end(*ab);
                        *ab = NULL;
                }
        }

        /* nothing injected, nothing more to report */
        if (tk->tv_sec == 0 && tk->tv_nsec == 0)
                return;

        if (!*ab) {
                *ab = audit_log_start(context, GFP_KERNEL,
                                      AUDIT_TIME_INJOFFSET);
                if (!*ab)
                        return;
        }
        audit_log_format(*ab, "sec=%lli nsec=%li",
                         (long long)tk->tv_sec, tk->tv_nsec);
        audit_log_end(*ab);
        *ab = NULL;
}

/*
 * show_special - emit the type-specific record selected by context->type
 * @context: audit context holding the per-type payload
 * @call_panic: set to 1 when the IPC object's secid cannot be converted
 *
 * A record of type context->type is started up front; each case appends
 * the fields for that type.  Some cases end the buffer and start another
 * one (AUDIT_IPC with has_perm) or pass the buffer by reference to a
 * helper that may replace or consume it (AUDIT_EXECVE, AUDIT_TIME_*).
 * Whatever buffer is left in 'ab' is ended at the bottom.
 */
static void show_special(struct audit_context *context, int *call_panic)
{
        struct audit_buffer *ab;
        int i;

        ab = audit_log_start(context, GFP_KERNEL, context->type);
        if (!ab)
                return;

        switch (context->type) {
        case AUDIT_SOCKETCALL: {
                int nargs = context->socketcall.nargs;
                audit_log_format(ab, "nargs=%d", nargs);
                for (i = 0; i < nargs; i++)
                        audit_log_format(ab, " a%d=%lx", i,
                                context->socketcall.args[i]);
                break; }
        case AUDIT_IPC: {
                u32 osid = context->ipc.osid;

                audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
                                 from_kuid(&init_user_ns, context->ipc.uid),
                                 from_kgid(&init_user_ns, context->ipc.gid),
                                 context->ipc.mode);
                if (osid) {
                        char *ctx = NULL;
                        u32 len;
                        /* on conversion failure log the raw secid instead */
                        if (security_secid_to_secctx(osid, &ctx, &len)) {
                                audit_log_format(ab, " osid=%u", osid);
                                *call_panic = 1;
                        } else {
                                audit_log_format(ab, " obj=%s", ctx);
                                security_release_secctx(ctx, len);
                        }
                }
                if (context->ipc.has_perm) {
                        /* finish the AUDIT_IPC record and start a second,
                         * AUDIT_IPC_SET_PERM record for the new permissions
                         */
                        audit_log_end(ab);
                        ab = audit_log_start(context, GFP_KERNEL,
                                             AUDIT_IPC_SET_PERM);
                        if (unlikely(!ab))
                                return;
                        audit_log_format(ab,
                                "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
                                context->ipc.qbytes,
                                context->ipc.perm_uid,
                                context->ipc.perm_gid,
                                context->ipc.perm_mode);
                }
                break; }
        case AUDIT_MQ_OPEN:
                audit_log_format(ab,
                        "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
                        "mq_msgsize=%ld mq_curmsgs=%ld",
                        context->mq_open.oflag, context->mq_open.mode,
                        context->mq_open.attr.mq_flags,
                        context->mq_open.attr.mq_maxmsg,
                        context->mq_open.attr.mq_msgsize,
                        context->mq_open.attr.mq_curmsgs);
                break;
        case AUDIT_MQ_SENDRECV:
                audit_log_format(ab,
                        "mqdes=%d msg_len=%zd msg_prio=%u "
                        "abs_timeout_sec=%lld abs_timeout_nsec=%ld",
                        context->mq_sendrecv.mqdes,
                        context->mq_sendrecv.msg_len,
                        context->mq_sendrecv.msg_prio,
                        (long long) context->mq_sendrecv.abs_timeout.tv_sec,
                        context->mq_sendrecv.abs_timeout.tv_nsec);
                break;
        case AUDIT_MQ_NOTIFY:
                audit_log_format(ab, "mqdes=%d sigev_signo=%d",
                                context->mq_notify.mqdes,
                                context->mq_notify.sigev_signo);
                break;
        case AUDIT_MQ_GETSETATTR: {
                struct mq_attr *attr = &context->mq_getsetattr.mqstat;
                audit_log_format(ab,
                        "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
                        "mq_curmsgs=%ld ",
                        context->mq_getsetattr.mqdes,
                        attr->mq_flags, attr->mq_maxmsg,
                        attr->mq_msgsize, attr->mq_curmsgs);
                break; }
        case AUDIT_CAPSET:
                audit_log_format(ab, "pid=%d", context->capset.pid);
                audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
                audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
                audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
                audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
                break;
        case AUDIT_MMAP:
                audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
                                 context->mmap.flags);
                break;
        case AUDIT_EXECVE:
                /* helper receives &ab and may swap the buffer; the final
                 * audit_log_end() below closes whatever it leaves behind
                 */
                audit_log_execve_info(context, &ab);
                break;
        case AUDIT_KERN_MODULE:
                audit_log_format(ab, "name=");
                if (context->module.name) {
                        audit_log_untrustedstring(ab, context->module.name);
                } else
                        audit_log_format(ab, "(null)");

                break;
        case AUDIT_TIME_ADJNTPVAL:
        case AUDIT_TIME_INJOFFSET:
                /* this call deviates from the rest, eating the buffer */
                audit_log_time(context, &ab);
                break;
        }
        /* NOTE(review): ab can be NULL here after audit_log_time(); this
         * relies on audit_log_end() tolerating NULL - confirm.
         */
        audit_log_end(ab);
}

/*
 * audit_proctitle_rtrim - length of @proctitle without trailing non-printables
 * @proctitle: buffer holding the process title (need not be NUL terminated)
 * @len: number of valid bytes in @proctitle
 *
 * Returns the number of leading bytes to keep once trailing non-printable
 * characters are dropped.  Returns 0 when @len is not positive or when the
 * only remaining byte is itself non-printable.
 */
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
        char *end;

        /* nothing to trim; also keeps proctitle[len-1] below in bounds */
        if (len <= 0)
                return 0;

        end = proctitle + len - 1;
        while (end > proctitle && !isprint((unsigned char)*end))
                end--;

        /* catch the case where proctitle is only 1 non-print character */
        len = end - proctitle + 1;
        len -= isprint((unsigned char)proctitle[len-1]) == 0;
        return len;
}

/*
 * audit_log_name - produce AUDIT_PATH record from struct audit_names
 * @context: audit_context for the task
 * @n: audit_names structure with reportable details
 * @path: optional path to report instead of audit_names->name
 * @record_num: record number to report when handling a list of names
 * @call_panic: optional pointer to int that will be updated if secid fails
 */
static void audit_log_name(struct audit_context *context, struct audit_names *n,
                    const struct path *path, int record_num, int *call_panic)
{
        struct audit_buffer *ab;

        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
        if (!ab)
                return;

        audit_log_format(ab, "item=%d", record_num);

        if (path)
                audit_log_d_path(ab, " name=", path);
        else if (n->name) {
                switch (n->name_len) {
                case AUDIT_NAME_FULL:
                        /* log the full path */
                        audit_log_format(ab, " name=");
                        audit_log_untrustedstring(ab, n->name->name);
                        break;
                case 0:
                        /* name was specified as a relative path and the
                         * directory component is the cwd
                         */
                        audit_log_d_path(ab, " name=", &context->pwd);
                        break;
                default:
                        /* log the name's directory component */
                        audit_log_format(ab, " name=");
                        audit_log_n_untrustedstring(ab, n->name->name,
                                                    n->name_len);
                }
        } else
                audit_log_format(ab, " name=(null)");

        if (n->ino != AUDIT_INO_UNSET)
                audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
                                 n->ino,
                                 MAJOR(n->dev),
                                 MINOR(n->dev),
                                 n->mode,
                                 from_kuid(&init_user_ns, n->uid),
                                 from_kgid(&init_user_ns, n->gid),
                                 MAJOR(n->rdev),
                                 MINOR(n->rdev));
        if (n->osid != 0) {
                char *ctx = NULL;
                u32 len;

                if (security_secid_to_secctx(
                        n->osid, &ctx, &len)) {
                        audit_log_format(ab, " osid=%u", n->osid);
                        if (call_panic)
                                *call_panic = 2;
                } else {
                        audit_log_format(ab, " obj=%s", ctx);
                        security_release_secctx(ctx, len);
                }
        }

        /* log the audit_names record type */
        switch (n->type) {
        case AUDIT_TYPE_NORMAL:
                audit_log_format(ab, " nametype=NORMAL");
                break;
        case AUDIT_TYPE_PARENT:
                audit_log_format(ab, " nametype=PARENT");
                break;
        case AUDIT_TYPE_CHILD_DELETE:
                audit_log_format(ab, " nametype=DELETE");
                break;
        case AUDIT_TYPE_CHILD_CREATE:
                audit_log_format(ab, " nametype=CREATE");
                break;
        default:
                audit_log_format(ab, " nametype=UNKNOWN");
                break;
        }

        audit_log_fcaps(ab, n);
        audit_log_end(ab);
}

/*
 * Emit an AUDIT_PROCTITLE record for the current task.
 *
 * The title is read with get_cmdline() the first time and cached in the
 * audit context; later calls reuse the cached value.  If no title can be
 * obtained (allocation failure, empty command line, or nothing left after
 * trimming) the literal "(null)" is logged instead.
 */
static void audit_log_proctitle(void)
{
        struct audit_context *context = audit_context();
        struct audit_buffer *ab;
        char *msg = "(null)";
        int len = strlen(msg);
        char *buf;
        int res;

        if (!context || context->dummy)
                return;

        ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
        if (!ab)
                return;        /* audit_panic or being filtered */

        audit_log_format(ab, "proctitle=");

        /* populate the cache on first use */
        if (!context->proctitle.value) {
                buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
                if (!buf)
                        goto emit;

                /* Historically called this from procfs naming */
                res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
                if (res != 0)
                        res = audit_proctitle_rtrim(buf, res);

                /* empty either before or after trimming: keep "(null)" */
                if (res == 0) {
                        kfree(buf);
                        goto emit;
                }

                context->proctitle.value = buf;
                context->proctitle.len = res;
        }

        msg = context->proctitle.value;
        len = context->proctitle.len;
emit:
        audit_log_n_untrustedstring(ab, msg, len);
        audit_log_end(ab);
}

/*
 * audit_log_exit - write out all records for the current task's audit context
 *
 * Records are emitted in this order: the AUDIT_SYSCALL record, one record
 * per entry on context->aux, the type-specific record (show_special()),
 * AUDIT_FD_PAIR, AUDIT_SOCKADDR, target pid records, AUDIT_CWD, one
 * AUDIT_PATH record per non-hidden name, AUDIT_PROCTITLE, and finally
 * AUDIT_EOE.  If any secid conversion failed along the way, audit_panic()
 * is called after the end-of-event record has been sent.
 */
static void audit_log_exit(void)
{
        int i, call_panic = 0;
        struct audit_context *context = audit_context();
        struct audit_buffer *ab;
        struct audit_aux_data *aux;
        struct audit_names *n;

        context->personality = current->personality;

        /* main syscall record; if it cannot be started nothing is logged */
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
        if (!ab)
                return;                /* audit_panic has been called */
        audit_log_format(ab, "arch=%x syscall=%d",
                         context->arch, context->major);
        if (context->personality != PER_LINUX)
                audit_log_format(ab, " per=%lx", context->personality);
        /* return_valid == 0 means no return value was recorded */
        if (context->return_valid)
                audit_log_format(ab, " success=%s exit=%ld",
                                 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
                                 context->return_code);

        audit_log_format(ab,
                         " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
                         context->argv[0],
                         context->argv[1],
                         context->argv[2],
                         context->argv[3],
                         context->name_count);

        audit_log_task_info(ab);
        audit_log_key(ab, context->filterkey);
        audit_log_end(ab);

        /* one record per auxiliary data item, typed by aux->type */
        for (aux = context->aux; aux; aux = aux->next) {

                ab = audit_log_start(context, GFP_KERNEL, aux->type);
                if (!ab)
                        continue; /* audit_panic has been called */

                switch (aux->type) {

                case AUDIT_BPRM_FCAPS: {
                        struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
                        audit_log_format(ab, "fver=%x", axs->fcap_ver);
                        audit_log_cap(ab, "fp", &axs->fcap.permitted);
                        audit_log_cap(ab, "fi", &axs->fcap.inheritable);
                        audit_log_format(ab, " fe=%d", axs->fcap.fE);
                        audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
                        audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
                        audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
                        audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
                        audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
                        audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
                        audit_log_cap(ab, "pe", &axs->new_pcap.effective);
                        audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
                        audit_log_format(ab, " frootid=%d",
                                         from_kuid(&init_user_ns,
                                                   axs->fcap.rootid));
                        break; }

                }
                audit_log_end(ab);
        }

        /* type-specific record, only when a type was set for this syscall */
        if (context->type)
                show_special(context, &call_panic);

        /* a negative fds[0] means no fd pair was recorded */
        if (context->fds[0] >= 0) {
                ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
                if (ab) {
                        audit_log_format(ab, "fd0=%d fd1=%d",
                                        context->fds[0], context->fds[1]);
                        audit_log_end(ab);
                }
        }

        if (context->sockaddr_len) {
                ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
                if (ab) {
                        audit_log_format(ab, "saddr=");
                        audit_log_n_hex(ab, (void *)context->sockaddr,
                                        context->sockaddr_len);
                        audit_log_end(ab);
                }
        }

        /* batched target pids first, then the single pending target */
        for (aux = context->aux_pids; aux; aux = aux->next) {
                struct audit_aux_data_pids *axs = (void *)aux;

                for (i = 0; i < axs->pid_count; i++)
                        if (audit_log_pid_context(context, axs->target_pid[i],
                                                  axs->target_auid[i],
                                                  axs->target_uid[i],
                                                  axs->target_sessionid[i],
                                                  axs->target_sid[i],
                                                  axs->target_comm[i]))
                                call_panic = 1;
        }

        if (context->target_pid &&
            audit_log_pid_context(context, context->target_pid,
                                  context->target_auid, context->target_uid,
                                  context->target_sessionid,
                                  context->target_sid, context->target_comm))
                        call_panic = 1;

        if (context->pwd.dentry && context->pwd.mnt) {
                ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
                if (ab) {
                        audit_log_d_path(ab, "cwd=", &context->pwd);
                        audit_log_end(ab);
                }
        }

        /* item numbers count only the names that are actually logged */
        i = 0;
        list_for_each_entry(n, &context->names_list, list) {
                if (n->hidden)
                        continue;
                audit_log_name(context, n, NULL, i++, &call_panic);
        }

        audit_log_proctitle();

        /* Send end of event record to help user space know we are finished */
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
        if (ab)
                audit_log_end(ab);
        if (call_panic)
                audit_panic("error converting sid to string");
}

/**
 * __audit_free - free a per-task audit context
 * @tsk: task whose audit context block to free
 *
 * Called from copy_process and do_exit
 */
void __audit_free(struct task_struct *tsk)
{
        struct audit_context *context = tsk->audit_context;

        if (!context)
                return;

        if (!list_empty(&context->killed_trees))
                audit_kill_trees(context);

        /* We are called either by do_exit() or the fork() error handling code;
         * in the former case tsk == current and in the latter tsk is a
         * random task_struct that doesn't doesn't have any meaningful data we
         * need to log via audit_log_exit().
         */
        if (tsk == current && !context->dummy && context->in_syscall) {
                context->return_valid = 0;
                context->return_code = 0;

                audit_filter_syscall(tsk, context,
                                     &audit_filter_list[AUDIT_FILTER_EXIT]);
                audit_filter_inodes(tsk, context);
                if (context->current_state == AUDIT_RECORD_CONTEXT)
                        audit_log_exit();
        }

        audit_set_context(tsk, NULL);
        audit_free_context(context);
}

/**
 * __audit_syscall_entry - fill in an audit record at syscall entry
 * @major: major syscall type (function)
 * @a1: additional syscall register 1
 * @a2: additional syscall register 2
 * @a3: additional syscall register 3
 * @a4: additional syscall register 4
 *
 * Fill in audit context at syscall entry.  This only happens if the
 * audit context was created when the task was created and the state or
 * filters demand the audit context be built.  If the state from the
 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
 * then the record will be written at syscall exit time (otherwise, it
 * will only be written if another part of the kernel requests that it
 * be written).
 */
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
                           unsigned long a3, unsigned long a4)
{
        struct audit_context *context = audit_context();
        enum audit_state state;

        if (!audit_enabled || !context)
                return;

        /* a previous syscall must have been fully torn down */
        BUG_ON(context->in_syscall || context->name_count);

        state = context->state;
        if (state == AUDIT_DISABLED)
                return;

        /* with no rules loaded the context is only a placeholder */
        context->dummy = !audit_n_rules;
        if (state == AUDIT_BUILD_CONTEXT && !context->dummy) {
                context->prio = 0;
                if (auditd_test_task(current))
                        return;
        }

        /* record the syscall number and its first four arguments */
        context->major = major;
        context->argv[0] = a1;
        context->argv[1] = a2;
        context->argv[2] = a3;
        context->argv[3] = a4;
        context->arch = syscall_get_arch(current);

        /* reset per-syscall bookkeeping and mark the syscall as open */
        context->serial = 0;
        context->ppid = 0;
        context->current_state = state;
        context->in_syscall = 1;
        ktime_get_coarse_real_ts64(&context->ctime);
}

/**
 * __audit_syscall_exit - deallocate audit context after a system call
 * @success: success value of the syscall
 * @return_code: return value of the syscall
 *
 * Tear down after system call.  If the audit context has been marked as
 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
 * filtering, or because some other part of the kernel wrote an audit
 * message), then write out the syscall information.  In all cases,
 * free the names stored from getname().
 */
void __audit_syscall_exit(int success, long return_code)
{
        struct audit_context *context = audit_context();

        if (!context)
                return;

        if (!list_empty(&context->killed_trees))
                audit_kill_trees(context);

        if (!context->dummy && context->in_syscall) {
                context->return_valid = success ? AUDITSC_SUCCESS
                                                : AUDITSC_FAILURE;

                /*
                 * we need to fix up the return code in the audit logs if the
                 * actual return codes are later going to be fixed up by the
                 * arch specific signal handlers
                 *
                 * The range test below stands in for:
                 * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
                 * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
                 *
                 * but is faster than a bunch of ||
                 */
                if (likely(return_code > -ERESTARTSYS) ||
                    (return_code < -ERESTART_RESTARTBLOCK) ||
                    (return_code == -ENOIOCTLCMD))
                        context->return_code = return_code;
                else
                        context->return_code = -EINTR;

                audit_filter_syscall(current, context,
                                     &audit_filter_list[AUDIT_FILTER_EXIT]);
                audit_filter_inodes(current, context);
                if (context->current_state == AUDIT_RECORD_CONTEXT)
                        audit_log_exit();
        }

        context->in_syscall = 0;
        if (context->state == AUDIT_RECORD_CONTEXT)
                context->prio = ~0ULL;
        else
                context->prio = 0;

        /* release everything collected during this syscall */
        audit_free_module(context);
        audit_free_names(context);
        unroll_tree_refs(context, NULL, 0);
        audit_free_aux(context);

        /* return the per-syscall fields to their idle values */
        context->aux = NULL;
        context->aux_pids = NULL;
        context->target_pid = 0;
        context->target_sid = 0;
        context->sockaddr_len = 0;
        context->type = 0;
        context->fds[0] = -1;

        /* the filter key is kept only for always-recording contexts */
        if (context->state != AUDIT_RECORD_CONTEXT) {
                kfree(context->filterkey);
                context->filterkey = NULL;
        }
}

/*
 * Record a tree reference for @inode in the current audit context.
 *
 * Does nothing when the inode has no fsnotify marks or no audit chunk.
 * If the reference does not fit, the reference storage is grown once and
 * the store retried; if growing fails the context is marked auditable,
 * the chunk is dropped and the references are rolled back to their state
 * on entry.
 */
static inline void handle_one(const struct inode *inode)
{
        struct audit_context *context;
        struct audit_tree_refs *saved_trees;
        struct audit_chunk *chunk;
        int saved_count;

        if (likely(!inode->i_fsnotify_marks))
                return;

        /* remember where we started so a failure can be unrolled */
        context = audit_context();
        saved_trees = context->trees;
        saved_count = context->tree_count;

        rcu_read_lock();
        chunk = audit_tree_lookup(inode);
        rcu_read_unlock();
        if (!chunk)
                return;

        if (likely(put_tree_ref(context, chunk)))
                return;

        /* no room: try once more after growing the storage */
        if (likely(grow_tree_refs(context))) {
                put_tree_ref(context, chunk);
                return;
        }

        pr_warn("out of memory, audit has lost a tree reference\n");
        audit_set_auditable(context);
        audit_put_chunk(chunk);
        unroll_tree_refs(context, saved_trees, saved_count);
}

/*
 * Record tree references for @dentry and each of its ancestors in the
 * current audit context.
 *
 * The walk follows d_parent until a dentry is its own parent.  It runs
 * under rcu_read_lock() and is validated against rename_lock: if the
 * sequence changed, the references added so far are unrolled and the
 * walk restarts.  If a reference could not be stored, the chunk is
 * dropped, the storage is grown and the walk restarts; if growing fails
 * the references are unrolled and the context is marked auditable.
 */
static void handle_path(const struct dentry *dentry)
{
        struct audit_context *context;
        struct audit_tree_refs *p;
        const struct dentry *d, *parent;
        struct audit_chunk *drop;
        unsigned long seq;
        int count;

        /* snapshot the reference state so every retry starts from it */
        context = audit_context();
        p = context->trees;
        count = context->tree_count;
retry:
        drop = NULL;
        d = dentry;
        rcu_read_lock();
        seq = read_seqbegin(&rename_lock);
        for(;;) {
                struct inode *inode = d_backing_inode(d);
                if (inode && unlikely(inode->i_fsnotify_marks)) {
                        struct audit_chunk *chunk;
                        chunk = audit_tree_lookup(inode);
                        if (chunk) {
                                /* out of slots: remember the chunk and
                                 * stop walking so it can be handled below
                                 */
                                if (unlikely(!put_tree_ref(context, chunk))) {
                                        drop = chunk;
                                        break;
                                }
                        }
                }
                parent = d->d_parent;
                /* a dentry that is its own parent ends the walk */
                if (parent == d)
                        break;
                d = parent;
        }
        if (unlikely(read_seqretry(&rename_lock, seq) || drop)) {  /* in this order */
                rcu_read_unlock();
                if (!drop) {
                        /* just a race with rename */
                        unroll_tree_refs(context, p, count);
                        goto retry;
                }
                audit_put_chunk(drop);
                if (grow_tree_refs(context)) {
                        /* OK, got more space */
                        unroll_tree_refs(context, p, count);
                        goto retry;
                }
                /* too bad */
                pr_warn("out of memory, audit has lost a tree reference\n");
                unroll_tree_refs(context, p, count);
                audit_set_auditable(context);
                return;
        }
        rcu_read_unlock();
}

/*
 * Obtain a zeroed audit_names entry of the given @type and append it to
 * the context's names list.  The first AUDIT_NAMES entries come from the
 * context's preallocated array; later ones are allocated and flagged with
 * should_free.  Returns NULL if that allocation fails.
 */
static struct audit_names *audit_alloc_name(struct audit_context *context,
                                                unsigned char type)
{
        struct audit_names *entry;

        if (context->name_count >= AUDIT_NAMES) {
                entry = kzalloc(sizeof(*entry), GFP_NOFS);
                if (!entry)
                        return NULL;
                entry->should_free = true;
        } else {
                entry = &context->preallocated_names[context->name_count];
                memset(entry, 0, sizeof(*entry));
        }

        entry->type = type;
        entry->ino = AUDIT_INO_UNSET;
        list_add_tail(&entry->list, &context->names_list);
        context->name_count++;

        return entry;
}

/**
 * __audit_reusename - fill out filename with info from existing entry
 * @uptr: userland ptr to pathname
 *
 * Walk the audit_names list of the current audit context looking for an
 * entry whose filename was created from the same @uptr.  On a match the
 * filename's reference count is incremented and the filename is returned;
 * otherwise NULL is returned.
 */
struct filename *
__audit_reusename(const __user char *uptr)
{
        struct audit_context *context = audit_context();
        struct audit_names *entry;

        list_for_each_entry(entry, &context->names_list, list) {
                if (entry->name && entry->name->uptr == uptr) {
                        entry->name->refcnt++;
                        return entry->name;
                }
        }

        return NULL;
}

/* Capture the current task's working directory in @context, once. */
inline void _audit_getcwd(struct audit_context *context)
{
        /* already captured */
        if (context->pwd.dentry)
                return;

        get_fs_pwd(current->fs, &context->pwd);
}

/* Capture the working directory, but only while a syscall is being audited. */
void __audit_getcwd(void)
{
        struct audit_context *context = audit_context();

        if (!context->in_syscall)
                return;

        _audit_getcwd(context);
}

/**
 * __audit_getname - add a name to the list
 * @name: name to add
 *
 * Add a name to the list of audit names for this context and link the
 * filename and its audit_names entry to each other.  Nothing is recorded
 * outside of a syscall or when no entry can be allocated.
 * Called from fs/namei.c:getname().
 */
void __audit_getname(struct filename *name)
{
        struct audit_context *context = audit_context();
        struct audit_names *entry;

        if (!context->in_syscall)
                return;

        entry = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
        if (!entry)
                return;

        /* the names list now holds its own reference on the filename */
        name->refcnt++;
        name->aname = entry;
        entry->name_len = AUDIT_NAME_FULL;
        entry->name = name;

        _audit_getcwd(context);
}

static inline int audit_copy_fcaps(struct audit_names *name,
                                   const struct dentry *dentry)
{
        struct cpu_vfs_cap_data caps;
        int rc;

        if (!dentry)
                return 0;

        rc = get_vfs_caps_from_disk(dentry, &caps);
        if (rc)
                return rc;

        name->fcap.permitted = caps.permitted;
        name->fcap.inheritable = caps.inheritable;
        name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
        name->fcap.rootid = caps.rootid;
        name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
                                VFS_CAP_REVISION_SHIFT;

        return 0;
}

/*
 * Copy the identifying attributes and security id of @inode into @name.
 * File capabilities are read from @dentry unless AUDIT_INODE_NOEVAL is set
 * in @flags, in which case fcap_ver is set to -1 instead.
 */
static void audit_copy_inode(struct audit_names *name,
                             const struct dentry *dentry,
                             struct inode *inode, unsigned int flags)
{
        name->ino = inode->i_ino;
        name->dev = inode->i_sb->s_dev;
        name->mode = inode->i_mode;
        name->uid = inode->i_uid;
        name->gid = inode->i_gid;
        name->rdev = inode->i_rdev;
        security_inode_getsecid(inode, &name->osid);

        if (!(flags & AUDIT_INODE_NOEVAL)) {
                audit_copy_fcaps(name, dentry);
                return;
        }

        name->fcap_ver = -1;
}

/**
 * __audit_inode - store the inode and device from a lookup
 * @name: name being audited
 * @dentry: dentry being audited
 * @flags: attributes for this particular entry
 */
void __audit_inode(struct filename *name, const struct dentry *dentry,
                   unsigned int flags)
{
        struct audit_context *context = audit_context();
        struct inode *inode = d_backing_inode(dentry);
        struct audit_names *n;
        bool parent = flags & AUDIT_INODE_PARENT;
        struct audit_entry *e;
        struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
        int i;

        if (!context->in_syscall)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(e, list, list) {
                for (i = 0; i < e->rule.field_count; i++) {
                        struct audit_field *f = &e->rule.fields[i];

                        if (f->type == AUDIT_FSTYPE
                            && audit_comparator(inode->i_sb->s_magic,
                                                f->op, f->val)
                            && e->rule.action == AUDIT_NEVER) {
                                rcu_read_unlock();
                                return;
                        }
                }
        }
        rcu_read_unlock();

        if (!name)
                goto out_alloc;

        /*
         * If we have a pointer to an audit_names entry already, then we can
         * just use it directly if the type is correct.
         */
        n = name->aname;
        if (n) {
                if (parent) {
                        if (n->type == AUDIT_TYPE_PARENT ||
                            n->type == AUDIT_TYPE_UNKNOWN)
                                goto out;
                } else {
                        if (n->type != AUDIT_TYPE_PARENT)
                                goto out;
                }
        }

        list_for_each_entry_reverse(n, &context->names_list, list) {
                if (n->ino) {
                        /* valid inode number, use that for the comparison */
                        if (n->ino != inode->i_ino ||
                            n->dev != inode->i_sb->s_dev)
                                continue;
                } else if (n->name) {
                        /* inode number has not been set, check the name */
                        if (strcmp(n->name->name, name->name))
                                continue;
                } else
                        /* no inode and no name (?!) ... this is odd ... */
                        continue;

                /* match the correct record type */
                if (parent) {
                        if (n->type == AUDIT_TYPE_PARENT ||
                            n->type == AUDIT_TYPE_UNKNOWN)
                                goto out;
                } else {
                        if (n->type != AUDIT_TYPE_PARENT)
                                goto out;
                }
        }

out_alloc:
        /* unable to find an entry with both a matching name and type */
        n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
        if (!n)
                return;
        if (name) {
                n->name = name;
                name->refcnt++;
        }

out:
        if (parent) {
                n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
                n->type = AUDIT_TYPE_PARENT;
                if (flags & AUDIT_INODE_HIDDEN)
                        n->hidden = true;
        } else {
                n->name_len = AUDIT_NAME_FULL;
                n->type = AUDIT_TYPE_NORMAL;
        }
        handle_path(dentry);
        audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}

/* Audit the inode behind an open @file; no filename is attached to it. */
void __audit_file(const struct file *file)
{
        const struct dentry *dentry = file->f_path.dentry;

        __audit_inode(NULL, dentry, 0);
}

/**
 * __audit_inode_child - collect inode info for created/removed objects
 * @parent: inode of dentry parent
 * @dentry: dentry being audited
 * @type:   AUDIT_TYPE_* value that we're looking for
 *
 * For syscalls that create or remove filesystem objects, audit_inode
 * can only collect information for the filesystem object's parent.
 * This call updates the audit context with the child's information.
 * Syscalls that create a new filesystem object must be hooked after
 * the object is created.  Syscalls that remove a filesystem object
 * must be hooked prior, in order to capture the target inode during
 * unsuccessful attempts.
 *
 * The function returns early without recording anything when the context
 * is not in a syscall, when an AUDIT_NEVER filesystem-type rule matches
 * the parent's superblock, or when a needed record cannot be allocated.
 */
void __audit_inode_child(struct inode *parent,
                         const struct dentry *dentry,
                         const unsigned char type)
{
        struct audit_context *context = audit_context();
        struct inode *inode = d_backing_inode(dentry);
        const struct qstr *dname = &dentry->d_name;
        struct audit_names *n, *found_parent = NULL, *found_child = NULL;
        struct audit_entry *e;
        struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
        int i;

        if (!context->in_syscall)
                return;

        /*
         * Give up if any filesystem rule has an fstype field matching the
         * parent's superblock magic and an AUDIT_NEVER action.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(e, list, list) {
                for (i = 0; i < e->rule.field_count; i++) {
                        struct audit_field *f = &e->rule.fields[i];

                        if (f->type == AUDIT_FSTYPE
                            && audit_comparator(parent->i_sb->s_magic,
                                                f->op, f->val)
                            && e->rule.action == AUDIT_NEVER) {
                                rcu_read_unlock();
                                return;
                        }
                }
        }
        rcu_read_unlock();

        /* Only called when the dentry actually has a backing inode. */
        if (inode)
                handle_one(inode);

        /* look for a parent entry first */
        list_for_each_entry(n, &context->names_list, list) {
                if (!n->name ||
                    (n->type != AUDIT_TYPE_PARENT &&
                     n->type != AUDIT_TYPE_UNKNOWN))
                        continue;

                if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
                    !audit_compare_dname_path(dname,
                                              n->name->name, n->name_len)) {
                        /* an untyped record that matches becomes the parent */
                        if (n->type == AUDIT_TYPE_UNKNOWN)
                                n->type = AUDIT_TYPE_PARENT;
                        found_parent = n;
                        break;
                }
        }

        /* scheduling point between the two walks of the names list */
        cond_resched();

        /* is there a matching child entry? */
        list_for_each_entry(n, &context->names_list, list) {
                /* can only match entries that have a name */
                if (!n->name ||
                    (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
                        continue;

                /*
                 * Either the whole recorded name equals the dentry name, or
                 * the path comparison succeeds using the parent's name_len
                 * (AUDIT_NAME_FULL when no parent record was found).
                 */
                if (!strcmp(dname->name, n->name->name) ||
                    !audit_compare_dname_path(dname, n->name->name,
                                                found_parent ?
                                                found_parent->name_len :
                                                AUDIT_NAME_FULL)) {
                        if (n->type == AUDIT_TYPE_UNKNOWN)
                                n->type = type;
                        found_child = n;
                        break;
                }
        }

        if (!found_parent) {
                /* create a new, "anonymous" parent record */
                n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
                if (!n)
                        return;
                audit_copy_inode(n, NULL, parent, 0);
        }

        if (!found_child) {
                found_child = audit_alloc_name(context, type);
                if (!found_child)
                        return;

                /* Re-use the name belonging to the slot for a matching parent
                 * directory. All names for this context are relinquished in
                 * audit_free_names() */
                if (found_parent) {
                        found_child->name = found_parent->name;
                        found_child->name_len = AUDIT_NAME_FULL;
                        found_child->name->refcnt++;
                }
        }

        /* Without a backing inode only mark the inode number as unset. */
        if (inode)
                audit_copy_inode(found_child, dentry, inode, 0);
        else
                found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);

/**
 * auditsc_get_stamp - get local copies of audit_context values
 * @ctx: audit_context for the task
 * @t: timespec64 to store time recorded in the audit_context
 * @serial: serial value that is recorded in the audit_context
 *
 * Assigns a serial number to the context if it has none yet and, when no
 * priority has been set, marks the context as AUDIT_RECORD_CONTEXT.
 *
 * Returns 1 when @t and @serial were filled in, 0 when the context is not
 * inside a syscall (nothing is written in that case).
 */
int auditsc_get_stamp(struct audit_context *ctx,
                       struct timespec64 *t, unsigned int *serial)
{
        if (!ctx->in_syscall)
                return 0;

        if (ctx->serial == 0)
                ctx->serial = audit_serial();
        *serial = ctx->serial;

        t->tv_nsec = ctx->ctime.tv_nsec;
        t->tv_sec  = ctx->ctime.tv_sec;

        if (ctx->prio == 0) {
                ctx->current_state = AUDIT_RECORD_CONTEXT;
                ctx->prio = 1;
        }

        return 1;
}

/**
 * __audit_mq_open - record audit data for a POSIX MQ open
 * @oflag: open flag
 * @mode: mode bits
 * @attr: queue attributes, may be NULL
 *
 * Stores the arguments in the current audit context and tags the context
 * as AUDIT_MQ_OPEN.  A NULL @attr is recorded as an all-zero attribute block.
 */
void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        struct audit_context *ctx = audit_context();

        ctx->type = AUDIT_MQ_OPEN;
        ctx->mq_open.mode = mode;
        ctx->mq_open.oflag = oflag;

        if (!attr) {
                memset(&ctx->mq_open.attr, 0, sizeof(struct mq_attr));
                return;
        }

        memcpy(&ctx->mq_open.attr, attr, sizeof(struct mq_attr));
}

/**
 * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
 * @mqdes: MQ descriptor
 * @msg_len: Message length
 * @msg_prio: Message priority
 * @abs_timeout: Message timeout in absolute time, may be NULL
 *
 * Stores the arguments in the current audit context and tags the context
 * as AUDIT_MQ_SENDRECV.  A NULL @abs_timeout is recorded as a zeroed timespec.
 */
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
                        const struct timespec64 *abs_timeout)
{
        struct audit_context *ctx = audit_context();
        struct timespec64 *saved = &ctx->mq_sendrecv.abs_timeout;

        ctx->type = AUDIT_MQ_SENDRECV;
        ctx->mq_sendrecv.msg_prio = msg_prio;
        ctx->mq_sendrecv.msg_len = msg_len;
        ctx->mq_sendrecv.mqdes = mqdes;

        if (abs_timeout == NULL)
                memset(saved, 0, sizeof(*saved));
        else
                memcpy(saved, abs_timeout, sizeof(*saved));
}

/**
 * __audit_mq_notify - record audit data for a POSIX MQ notify
 * @mqdes: MQ descriptor
 * @notification: Notification event, may be NULL
 *
 * Stores the descriptor and the notification's signal number (0 when
 * @notification is NULL) in the current audit context and tags the context
 * as AUDIT_MQ_NOTIFY.
 */
void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        struct audit_context *ctx = audit_context();

        ctx->mq_notify.sigev_signo = notification ? notification->sigev_signo : 0;
        ctx->mq_notify.mqdes = mqdes;
        ctx->type = AUDIT_MQ_NOTIFY;
}

/**
 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
 * @mqdes: MQ descriptor
 * @mqstat: MQ flags; dereferenced unconditionally, so it must not be NULL
 *
 * Copies the descriptor and the attribute structure into the current audit
 * context and tags the context as AUDIT_MQ_GETSETATTR.
 */
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        struct audit_context *ctx = audit_context();

        ctx->type = AUDIT_MQ_GETSETATTR;
        ctx->mq_getsetattr.mqstat = *mqstat;
        ctx->mq_getsetattr.mqdes = mqdes;
}

/**
 * __audit_ipc_obj - record audit data for ipc object
 * @ipcp: ipc permissions
 *
 * Copies owner, group, mode and security id of @ipcp into the current audit
 * context, clears the has_perm flag and tags the context as AUDIT_IPC.
 */
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        struct audit_context *ctx = audit_context();

        ctx->type = AUDIT_IPC;
        ctx->ipc.has_perm = 0;
        ctx->ipc.mode = ipcp->mode;
        ctx->ipc.gid = ipcp->gid;
        ctx->ipc.uid = ipcp->uid;
        security_ipc_getsecid(ipcp, &ctx->ipc.osid);
}

/**
 * __audit_ipc_set_perm - record audit data for new ipc permissions
 * @qbytes: msgq bytes
 * @uid: msgq user id
 * @gid: msgq group id
 * @mode: msgq mode (permissions)
 *
 * Called only after audit_ipc_obj().  Stores the new values in the current
 * audit context and sets its has_perm flag.
 */
void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        struct audit_context *ctx = audit_context();

        ctx->ipc.has_perm = 1;
        ctx->ipc.perm_mode = mode;
        ctx->ipc.perm_gid = gid;
        ctx->ipc.perm_uid = uid;
        ctx->ipc.qbytes = qbytes;
}

/*
 * Record the argument count of the program being executed and tag the
 * current audit context as AUDIT_EXECVE.
 */
void __audit_bprm(struct linux_binprm *bprm)
{
        struct audit_context *ctx = audit_context();

        ctx->execve.argc = bprm->argc;
        ctx->type = AUDIT_EXECVE;
}


/**
 * __audit_socketcall - record audit data for sys_socketcall
 * @nargs: number of args, which should not be more than AUDITSC_ARGS.
 * @args: args array
 *
 * Returns 0 after copying @nargs arguments into the current audit context,
 * or -EINVAL when @nargs is outside 1..AUDITSC_ARGS or @args is NULL.
 */
int __audit_socketcall(int nargs, unsigned long *args)
{
        struct audit_context *ctx = audit_context();

        if (!(nargs > 0 && nargs <= AUDITSC_ARGS && args))
                return -EINVAL;

        memcpy(ctx->socketcall.args, args, nargs * sizeof(unsigned long));
        ctx->socketcall.nargs = nargs;
        ctx->type = AUDIT_SOCKETCALL;

        return 0;
}

/**
 * __audit_fd_pair - record audit data for pipe and socketpair
 * @fd1: the first file descriptor
 * @fd2: the second file descriptor
 *
 * Stores both descriptors in the fds[] slots of the current audit context.
 */
void __audit_fd_pair(int fd1, int fd2)
{
        struct audit_context *ctx = audit_context();

        ctx->fds[1] = fd2;
        ctx->fds[0] = fd1;
}

/**
 * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
 * @len: data length in user space
 * @a: data address in kernel space
 *
 * Returns 0 for success or NULL context or < 0 on error.
 */
int __audit_sockaddr(int len, void *a)
{
        struct audit_context *context = audit_context();

        if (!context->sockaddr) {
                void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
                context->sockaddr = p;
        }

        context->sockaddr_len = len;
        memcpy(context->sockaddr, a, len);
        return 0;
}

/*
 * Record the identity (tgid, login uid, uid, session id, security id and
 * command name) of the task @t as the target in the current audit context.
 */
void __audit_ptrace(struct task_struct *t)
{
        struct audit_context *ctx = audit_context();

        ctx->target_pid       = task_tgid_nr(t);
        ctx->target_auid      = audit_get_loginuid(t);
        ctx->target_uid       = task_uid(t);
        ctx->target_sessionid = audit_get_sessionid(t);

        security_task_getsecid(t, &ctx->target_sid);
        memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
}

/**
 * audit_signal_info_syscall - record signal info for syscalls
 * @t: task being signaled
 *
 * If the audit subsystem is being terminated, record the task (pid)
 * and uid that is doing that.
 */
int audit_signal_info_syscall(struct task_struct *t)
{
        struct audit_aux_data_pids *axp;
        struct audit_context *ctx = audit_context();
        kuid_t t_uid = task_uid(t);

        if (!audit_signals || audit_dummy_context())
                return 0;

        /* optimize the common case by putting first signal recipient directly
         * in audit_context */
        if (!ctx->target_pid) {
                ctx->target_pid = task_tgid_nr(t);
                ctx->target_auid = audit_get_loginuid(t);
                ctx->target_uid = t_uid;
                ctx->target_sessionid = audit_get_sessionid(t);
                security_task_getsecid(t, &ctx->target_sid);
                memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
                return 0;
        }

        axp = (void *)ctx->aux_pids;
        if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
                axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
                if (!axp)
                        return -ENOMEM;

                axp->d.type = AUDIT_OBJ_PID;
                axp->d.next = ctx->aux_pids;
                ctx->aux_pids = (void *)axp;
        }
        BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);

        axp->target_pid[axp->pid_count] = task_tgid_nr(t);
        axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
        axp->target_uid[axp->pid_count] = t_uid;
        axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
        security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
        memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
        axp->pid_count++;

        return 0;
}

/**
 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
 * @bprm: pointer to the bprm being processed
 * @new: the proposed new credentials
 * @old: the old credentials
 *
 * Simply check if the proc already has the caps given by the file and if not
 * store the priv escalation info for later auditing at the end of the syscall
 *
 * -Eric
 */
int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                           const struct cred *new, const struct cred *old)
{
        struct audit_aux_data_bprm_fcaps *ax;
        struct audit_context *context = audit_context();
        struct cpu_vfs_cap_data vcaps;

        ax = kmalloc(sizeof(*ax), GFP_KERNEL);
        if (!ax)
                return -ENOMEM;

        ax->d.type = AUDIT_BPRM_FCAPS;
        ax->d.next = context->aux;
        context->aux = (void *)ax;

        get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);

        ax->fcap.permitted = vcaps.permitted;
        ax->fcap.inheritable = vcaps.inheritable;
        ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
        ax->fcap.rootid = vcaps.rootid;
        ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;

        ax->old_pcap.permitted   = old->cap_permitted;
        ax->old_pcap.inheritable = old->cap_inheritable;
        ax->old_pcap.effective   = old->cap_effective;
        ax->old_pcap.ambient     = old->cap_ambient;

        ax->new_pcap.permitted   = new->cap_permitted;
        ax->new_pcap.inheritable = new->cap_inheritable;
        ax->new_pcap.effective   = new->cap_effective;
        ax->new_pcap.ambient     = new->cap_ambient;
        return 0;
}

/**
 * __audit_log_capset - store information about the arguments to the capset syscall
 * @new: the new credentials
 * @old: the old (current) credentials
 *
 * Record the arguments userspace sent to sys_capset for later printing by the
 * audit system if applicable.  The pid recorded is the tgid of the current
 * task; each capability set is copied from the matching field of @new.
 * @old is not consulted.
 */
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
        struct audit_context *context = audit_context();

        context->capset.pid = task_tgid_nr(current);
        context->capset.cap.effective   = new->cap_effective;
        /* the inheritable set comes from cap_inheritable, not cap_effective */
        context->capset.cap.inheritable = new->cap_inheritable;
        context->capset.cap.permitted   = new->cap_permitted;
        context->capset.cap.ambient     = new->cap_ambient;
        context->type = AUDIT_CAPSET;
}

/*
 * Record the descriptor and flags of an mmap request in the current audit
 * context and tag the context as AUDIT_MMAP.
 */
void __audit_mmap_fd(int fd, int flags)
{
        struct audit_context *ctx = audit_context();

        ctx->type = AUDIT_MMAP;
        ctx->mmap.flags = flags;
        ctx->mmap.fd = fd;
}

void __audit_log_kern_module(char *name)
{
        struct audit_context *context = audit_context();

        context->module.name = kstrdup(name, GFP_KERNEL);
        if (!context->module.name)
                audit_log_lost("out of memory in __audit_log_kern_module");
        context->type = AUDIT_KERN_MODULE;
}

/* Emit an AUDIT_FANOTIFY record carrying the fanotify @response value. */
void __audit_fanotify(unsigned int response)
{
        struct audit_context *ctx = audit_context();

        audit_log(ctx, GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u", response);
}

/*
 * Save the injected timekeeping @offset in the current audit context.  The
 * context type becomes AUDIT_TIME_INJOFFSET unless a type is already set.
 */
void __audit_tk_injoffset(struct timespec64 offset)
{
        struct audit_context *ctx = audit_context();

        memcpy(&ctx->time.tk_injoffset, &offset, sizeof(offset));

        /* only set type if not already set by NTP */
        if (ctx->type == 0)
                ctx->type = AUDIT_TIME_INJOFFSET;
}

/*
 * If any NTP value in @ad changed (newval differs from oldval), copy the
 * whole structure into the current audit context and tag the context as
 * AUDIT_TIME_ADJNTPVAL.  Nothing is recorded when no value changed.
 */
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
        struct audit_context *ctx = audit_context();
        int idx;

        for (idx = 0; idx < AUDIT_NTP_NVALS; idx++) {
                if (ad->vals[idx].newval == ad->vals[idx].oldval)
                        continue;

                /* unconditionally set type, overwriting TK */
                ctx->type = AUDIT_TIME_ADJNTPVAL;
                memcpy(&ctx->time.ntp_data, ad, sizeof(*ad));
                break;
        }
}

/*
 * Emit an AUDIT_NETFILTER_CFG record describing a netfilter table change:
 * table name, address family, entry count and operation, followed by the
 * pid, security context and command name of the current task.
 */
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                       enum audit_nfcfgop op, gfp_t gfp)
{
        char task_comm[sizeof(current->comm)];
        struct audit_buffer *buf;

        buf = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
        if (buf == NULL)
                return;

        audit_log_format(buf, "table=%s family=%u entries=%u op=%s",
                         name, af, nentries, audit_nfcfgs[op].s);
        audit_log_format(buf, " pid=%u", task_pid_nr(current));
        audit_log_task_context(buf); /* subj= */
        audit_log_format(buf, " comm=");
        audit_log_untrustedstring(buf, get_task_comm(task_comm, current));

        audit_log_end(buf);
}
EXPORT_SYMBOL_GPL(__audit_log_nfcfg);

/*
 * Append the identity of the current task to @ab: login uid, uid, gid and
 * session id, the task security context, tgid, command name and executable
 * path.
 */
static void audit_log_task(struct audit_buffer *ab)
{
        char task_comm[sizeof(current->comm)];
        unsigned int ses;
        kuid_t login_uid, cur_uid;
        kgid_t cur_gid;

        login_uid = audit_get_loginuid(current);
        ses = audit_get_sessionid(current);
        current_uid_gid(&cur_uid, &cur_gid);

        /* ids are reported as seen from the initial user namespace */
        audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
                         from_kuid(&init_user_ns, login_uid),
                         from_kuid(&init_user_ns, cur_uid),
                         from_kgid(&init_user_ns, cur_gid),
                         ses);
        audit_log_task_context(ab);
        audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
        audit_log_untrustedstring(ab, get_task_comm(task_comm, current));
        audit_log_d_path_exe(ab, current->mm);
}

/**
 * audit_core_dumps - record information about processes that end abnormally
 * @signr: signal value
 *
 * If a process ends with a core dump, something fishy is going on and we
 * should record the event for investigation.
 */
void audit_core_dumps(long signr)
{
        struct audit_buffer *ab;

        if (!audit_enabled)
                return;

        if (signr == SIGQUIT)        /* don't care for those */
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND);
        if (unlikely(!ab))
                return;
        audit_log_task(ab);
        audit_log_format(ab, " sig=%ld res=1", signr);
        audit_log_end(ab);
}

/**
 * audit_seccomp - record information about a seccomp action
 * @syscall: syscall number
 * @signr: signal value
 * @code: the seccomp action
 *
 * Record the information associated with a seccomp action. Event filtering for
 * seccomp actions that are not to be logged is done in seccomp_log().
 * Therefore, this function forces auditing independent of the audit_enabled
 * and dummy context state because seccomp actions should be logged even when
 * audit is not in use.
 */
void audit_seccomp(unsigned long syscall, long signr, int code)
{
        struct audit_buffer *ab;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP);
        if (unlikely(!ab))
                return;
        audit_log_task(ab);
        audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
                         signr, syscall_get_arch(current), syscall,
                         in_compat_syscall(), KSTK_EIP(current), code);
        audit_log_end(ab);
}

void audit_seccomp_actions_logged(const char *names, const char *old_names,
                                  int res)
{
        struct audit_buffer *ab;

        if (!audit_enabled)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL,
                             AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;

        audit_log_format(ab,
                         "op=seccomp-logging actions=%s old-actions=%s res=%d",
                         names, old_names, res);
        audit_log_end(ab);
}

/*
 * Return the killed_trees list of the current audit context, or NULL when
 * there is no context or it is not inside a syscall (the expected case).
 */
struct list_head *audit_killed_trees(void)
{
        struct audit_context *ctx = audit_context();

        if (unlikely(ctx && ctx->in_syscall))
                return &ctx->killed_trees;

        return NULL;
}










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MBCACHE_H
#define _LINUX_MBCACHE_H

#include <linux/hash.h>
#include <linux/list_bl.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/fs.h>

struct mb_cache;

/*
 * Cache entry flags.
 * NOTE(review): the _B suffix suggests these are bit numbers within
 * mb_cache_entry.e_flags rather than bit masks - confirm against the users.
 */
enum {
        MBE_REFERENCED_B = 0,
        MBE_REUSABLE_B
};

/*
 * One cached (key, value) pair.  An entry is linked both on a cache-wide
 * list and on a hash chain, and is reference counted through e_refcnt.
 */
struct mb_cache_entry {
        /* List of entries in cache - protected by cache->c_list_lock */
        struct list_head        e_list;
        /*
         * Hash table list - protected by hash chain bitlock. The entry is
         * guaranteed to be hashed while e_refcnt > 0.
         */
        struct hlist_bl_node        e_hash_list;
        /*
         * Entry refcount. Once it reaches zero, entry is unhashed and freed.
         * While refcount > 0, the entry is guaranteed to stay in the hash and
         * e.g. mb_cache_entry_try_delete() will fail.
         */
        atomic_t                e_refcnt;
        /* Key in hash - stable during lifetime of the entry */
        u32                        e_key;
        /* Entry state; presumably indexed by the MBE_*_B flags - TODO confirm */
        unsigned long                e_flags;
        /* User provided value - stable during lifetime of the entry */
        u64                        e_value;
};

struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache);

int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
                          u64 value, bool reusable);
void __mb_cache_entry_free(struct mb_cache *cache,
                           struct mb_cache_entry *entry);
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
/*
 * Drop one reference to @entry.  Dropping the last reference frees the
 * entry through __mb_cache_entry_free(); otherwise, when at most two
 * references remain, waiters on e_refcnt are woken.
 */
static inline void mb_cache_entry_put(struct mb_cache *cache,
                                      struct mb_cache_entry *entry)
{
        unsigned int remaining = atomic_dec_return(&entry->e_refcnt);

        if (remaining == 0) {
                __mb_cache_entry_free(cache, entry);
                return;
        }

        if (remaining <= 2)
                wake_up_var(&entry->e_refcnt);
}

struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
                                                    u32 key, u64 value);
void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
                                          u64 value);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
                                                 u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
                                                struct mb_cache_entry *entry);
void mb_cache_entry_touch(struct mb_cache *cache,
                          struct mb_cache_entry *entry);

#endif        /* _LINUX_MBCACHE_H */





















































































    2 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Compile-time check used by READ_ONCE() and WRITE_ONCE(): the access is
 * accepted if @t is a native word or has the size of a long long.
 *
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)                                        \
        compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),        \
                "Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 *
 * The read goes through a const volatile-qualified lvalue of @x; no size
 * check is performed.  The definition is skipped if __READ_ONCE is already
 * defined when this header is included.
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)        (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

/*
 * READ_ONCE(x) - checked volatile read of @x.
 *
 * Asserts at compile time that the access size is supported, then evaluates
 * to the value obtained by __READ_ONCE(x).
 */
#define READ_ONCE(x)                                                        \
({                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __READ_ONCE(x);                                                        \
})

/*
 * __WRITE_ONCE(x, val) - store @val to @x through a volatile-qualified
 * lvalue, without the access-size check done by WRITE_ONCE().
 */
#define __WRITE_ONCE(x, val)                                                \
do {                                                                        \
        *(volatile typeof(x) *)&(x) = (val);                                \
} while (0)

/*
 * WRITE_ONCE(x, val) - checked volatile store of @val to @x.
 *
 * Asserts at compile time that the access size is supported, then performs
 * the store with __WRITE_ONCE().
 */
#define WRITE_ONCE(x, val)                                                \
do {                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __WRITE_ONCE(x, val);                                                \
} while (0)

/*
 * Load one unsigned long from @addr with __READ_ONCE().  Backs
 * READ_ONCE_NOCHECK(); built with __no_sanitize_or_inline.
 */
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        return __READ_ONCE(*word);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 *
 * Only objects whose size equals sizeof(unsigned long) are accepted; this is
 * asserted at compile time.  The loaded word is cast back to the type of @x.
 */
#define READ_ONCE_NOCHECK(x)                                                \
({                                                                        \
        compiletime_assert(sizeof(x) == sizeof(unsigned long),                \
                "Unsupported access size for READ_ONCE_NOCHECK().");        \
        (typeof(x))__read_once_word_nocheck(&(x));                        \
})

/*
 * Load a whole unsigned long from @addr with a plain (non-volatile) access.
 * Only the first byte is reported to KASAN via kasan_check_read().
 */
static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        kasan_check_read(addr, 1);
        return *word;
}

#endif /* __ASSEMBLY__ */
#endif        /* __ASM_GENERIC_RWONCE_H */









































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/log2.h>

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *        0 -> 2^0 * PAGE_SIZE and below
 *        1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *        2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *        3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *        4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *        ...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 */
static inline __attribute_const__ int get_order(unsigned long size)
{
        unsigned long pages;

        /* Compile-time constant sizes are computed with ilog2(). */
        if (__builtin_constant_p(size)) {
                if (size == 0)
                        return BITS_PER_LONG - PAGE_SHIFT;

                if (size < (1UL << PAGE_SHIFT))
                        return 0;

                return ilog2(size - 1) - PAGE_SHIFT + 1;
        }

        /* Runtime path: the order is the highest set bit of (size - 1) in pages. */
        pages = (size - 1) >> PAGE_SHIFT;
#if BITS_PER_LONG == 32
        return fls(pages);
#else
        return fls64(pages);
#endif
}

#endif        /* __ASSEMBLY__ */

#endif        /* __ASM_GENERIC_GETORDER_H */






















































































































































































































































































    5 



    5 
    5 
    5 
    5 
    5 







    5 








    5 
    5 



    5 



















    5 
    5 
    5 

    5 
























    5 








    5 
    5 



    5 


















    5 


















    5 
























    1 



    1 



















    5 


















    4 























    1 


    1 
    1 



















    2 


















    3 








































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

/* Slab cache from which struct user_namespace objects are allocated. */
static struct kmem_cache *user_ns_cachep __read_mostly;
/*
 * Held by create_user_ns() while it copies the parent's ns->flags into a
 * new namespace.
 */
static DEFINE_MUTEX(userns_state_mutex);

/* Forward declarations. */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/*
 * Charge one user namespace (UCOUNT_USER_NAMESPACES) to @uid in @ns.
 * The result of inc_ucount() is handed back unchanged; create_user_ns()
 * treats a NULL result as failure.
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
        struct ucounts *charged;

        charged = inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
        return charged;
}

/*
 * Release the UCOUNT_USER_NAMESPACES charge represented by @ucounts
 * (the counterpart of inc_user_namespaces()).
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
        /*
         * Plain call: a void function must not use "return <expr>;",
         * which ISO C forbids even when <expr> itself has type void.
         */
        dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * Point @cred at @user_ns and reset its capability state for the new
 * namespace.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
        /*
         * The credentials start out with the same capabilities as init,
         * but these are useless for doing anything outside the new user
         * namespace because the capabilities are bound to it.
         */
        cred->cap_bset = CAP_FULL_SET;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_ambient = CAP_EMPTY_SET;
        cred->securebits = SECUREBITS_DEFAULT;
#ifdef CONFIG_KEYS
        /* Drop any request-key authorisation carried by the credentials. */
        key_put(cred->request_key_auth);
        cred->request_key_auth = NULL;
#endif
        /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
        cred->user_ns = user_ns;
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.  unshare_userns() in this file calls it as well.
 *
 * Returns 0 on success, in which case @new points at the new namespace.
 * On failure @new is left untouched and a negative errno is returned:
 *   -ENOSPC  the parent is already nested too deeply, or the namespace
 *            could not be charged via inc_user_namespaces();
 *   -EPERM   the caller is chrooted, or the creator's euid/egid has no
 *            mapping in the parent namespace;
 *   -ENOMEM  allocating the namespace or setting up its sysctls failed;
 *   or whatever error ns_alloc_inum() reports.
 */
int create_user_ns(struct cred *new)
{
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
        struct ucounts *ucounts;
        int ret, i;

        /* Refuse to create a child of a namespace whose level exceeds 32. */
        ret = -ENOSPC;
        if (parent_ns->level > 32)
                goto fail;

        /* Charge the new namespace to its creator; ret is still -ENOSPC. */
        ucounts = inc_user_namespaces(parent_ns, owner);
        if (!ucounts)
                goto fail;

        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
        if (current_chrooted())
                goto fail_dec;

        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
        ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
                goto fail_dec;

        /* Zeroed allocation: fields not set below start out as 0/NULL. */
        ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                goto fail_dec;

        /* Record whether the creator had CAP_SETFCAP in its effective set. */
        ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
        ret = ns_alloc_inum(&ns->ns);
        if (ret)
                goto fail_free;
        ns->ns.ops = &userns_operations;

        atomic_set(&ns->count, 1);
        /* Leave the new->user_ns reference with the new user namespace. */
        ns->parent = parent_ns;
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
        /* free_user_ns() runs from this work item, see __put_user_ns(). */
        INIT_WORK(&ns->work, free_user_ns);
        /* Every per-namespace ucount limit starts out at INT_MAX. */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
        ns->ucounts = ucounts;

        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
        INIT_LIST_HEAD(&ns->keyring_name_list);
        init_rwsem(&ns->keyring_sem);
#endif
        ret = -ENOMEM;
        if (!setup_userns_sysctls(ns))
                goto fail_keyring;

        /* Everything succeeded: switch the credentials over. */
        set_cred_user_ns(new, ns);
        return 0;

        /* Error unwinding, in reverse order of the setup above. */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
        ns_free_inum(&ns->ns);
fail_free:
        kmem_cache_free(user_ns_cachep, ns);
fail_dec:
        dec_user_namespaces(ucounts);
fail:
        return ret;
}

/*
 * unshare_userns - create a user namespace on behalf of unshare.
 * @unshare_flags: flags of the request; only CLONE_NEWUSER is examined here
 * @new_cred: receives the freshly prepared credentials on success
 *
 * Returns 0 without touching @new_cred when CLONE_NEWUSER is not requested.
 * Otherwise returns 0 with *@new_cred set, -ENOMEM if prepare_creds() fails,
 * or the error reported by create_user_ns() (the prepared credentials are
 * released again in that case).
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
        struct cred *cred;
        int err;

        if (!(unshare_flags & CLONE_NEWUSER))
                return 0;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        err = create_user_ns(cred);
        if (err) {
                put_cred(cred);
                return err;
        }

        *new_cred = cred;
        return 0;
}

/*
 * Work handler that tears down a user namespace.  @work is the work_struct
 * embedded in the namespace (set up by INIT_WORK() in create_user_ns()).
 *
 * After a namespace has been freed, the reference it held on its parent is
 * dropped; if that was the parent's last reference the loop continues and
 * frees the parent too, walking up the hierarchy iteratively.
 */
static void free_user_ns(struct work_struct *work)
{
        struct user_namespace *parent, *ns =
                container_of(work, struct user_namespace, work);

        do {
                /*
                 * Fetch what is still needed from @ns now: both values are
                 * used after @ns has been returned to the slab cache.
                 */
                struct ucounts *ucounts = ns->ucounts;
                parent = ns->parent;
                /*
                 * forward/reverse are only freed for maps that grew beyond
                 * UID_GID_MAP_MAX_BASE_EXTENTS entries (see insert_extent()
                 * for where the forward array gets allocated).
                 */
                if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->gid_map.forward);
                        kfree(ns->gid_map.reverse);
                }
                if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->uid_map.forward);
                        kfree(ns->uid_map.reverse);
                }
                if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
                /* Undo the charge taken by inc_user_namespaces(). */
                dec_user_namespaces(ucounts);
                ns = parent;
        } while (atomic_dec_and_test(&parent->count));
}

/*
 * Queue the namespace's embedded work item.  create_user_ns() initialises
 * that item with free_user_ns(), so the teardown itself runs from the
 * workqueue rather than in the caller's context.
 */
void __put_user_ns(struct user_namespace *ns)
{
        struct work_struct *teardown = &ns->work;

        schedule_work(teardown);
}
EXPORT_SYMBOL(__put_user_ns);

/**
 * struct idmap_key - search key used to find an idmapping in a sorted idmap
 * array. It is passed to cmp_map_id() as first argument.
 */
struct idmap_key {
        bool map_up; /* true  -> id from kid; false -> kid from id */
        u32 id; /* id to find (first id of the range) */
        u32 count; /* length of the range starting at id; 1 for single-id lookups */
};

/**
 * cmp_map_id - bsearch() comparison callback for idmap lookups.
 * @k: the search key, a struct idmap_key
 * @e: the array element being inspected, a struct uid_gid_extent
 *
 * The key describes the id range [id, id + count - 1].  Depending on
 * key->map_up the extent is taken from the lower (map_up) or the upper
 * side of the mapping.
 *
 * Returns 0 when the whole key range lies inside the extent, -1 when either
 * end of the key range lies below the extent, and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
        const struct idmap_key *key = k;
        const struct uid_gid_extent *el = e;
        u32 lo, hi, key_last;

        /* handle map_id_{down,up}() */
        lo = key->map_up ? el->lower_first : el->first;
        hi = lo + el->count - 1;
        key_last = key->id + key->count - 1;

        if (key->id < lo || key_last < lo)
                return -1;

        if (key->id > hi || key_last > hi)
                return 1;

        return 0;
}

/**
 * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Searches map->forward for an extent that contains the whole range
 * [@id, @id + @count - 1] on its upper side; returns NULL if there is none.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = false,
                .id     = id,
                .count  = count,
        };

        return bsearch(&key, map->forward, extents,
                       sizeof(map->forward[0]), cmp_map_id);
}

/**
 * map_id_range_down_base - Find idmap via linear scan of the static extent
 * array.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent whose upper range contains all of
 * [@id, @id + @count - 1], or NULL if no extent does.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 id_last = id + count - 1;
        unsigned idx;

        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *cur = &map->extent[idx];
                u32 lo = cur->first;
                u32 hi = lo + cur->count - 1;

                /* Both ends of the requested range must fall inside. */
                if (id < lo || id > hi)
                        continue;
                if (id_last < lo || id_last > hi)
                        continue;

                return cur;
        }
        return NULL;
}

/*
 * Translate the range [@id, @id + @count - 1] from the upper to the lower
 * side of @map.  Returns the lower id corresponding to @id, or (u32) -1 if
 * no single extent covers the whole range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
        struct uid_gid_extent *hit;
        unsigned nr = map->nr_extents;

        /* Read barrier between loading nr_extents and reading the extents. */
        smp_rmb();

        hit = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_range_down_base(nr, map, id, count) :
                map_id_range_down_max(nr, map, id, count);

        /* Note failure ... */
        if (!hit)
                return (u32) -1;

        /* ... or map the id. */
        return (id - hit->first) + hit->lower_first;
}

/* Single-id variant of map_id_range_down(): maps a range of length one. */
static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        const u32 single = 1;

        return map_id_range_down(map, id, single);
}

/**
 * map_id_up_base - Find idmap via linear scan of the static extent array.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent whose lower range contains @id, or NULL.
 */
static struct uid_gid_extent *
map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
{
        unsigned idx;

        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *cur = &map->extent[idx];
                u32 lo = cur->lower_first;
                u32 hi = lo + cur->count - 1;

                if (id < lo || id > hi)
                        continue;

                return cur;
        }
        return NULL;
}

/**
 * map_id_up_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Searches map->reverse for an extent whose lower range contains @id;
 * returns NULL if there is none.
 */
static struct uid_gid_extent *
map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
{
        struct idmap_key key = {
                .map_up = true,
                .id     = id,
                .count  = 1,
        };

        return bsearch(&key, map->reverse, extents,
                       sizeof(map->reverse[0]), cmp_map_id);
}

/*
 * Translate @id from the lower to the upper side of @map.  Returns the
 * mapped id, or (u32) -1 if no extent covers @id.
 */
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        struct uid_gid_extent *hit;
        unsigned nr = map->nr_extents;

        /* Read barrier between loading nr_extents and reading the extents. */
        smp_rmb();

        hit = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_up_base(nr, map, id) :
                map_id_up_max(nr, map, id);

        /* Note failure ... */
        if (!hit)
                return (u32) -1;

        /* ... or map the id. */
        return (id - hit->lower_first) + hit->first;
}

/**
 *        make_kuid - Map a user-namespace uid pair into a kuid.
 *        @ns:  User namespace that the uid is in
 *        @uid: User identifier
 *
 *        Translates the (@ns, @uid) pair into the kernel internal kuid
 *        and returns it.
 *
 *        If @ns defines no mapping for @uid, INVALID_UID is returned.
 *        Callers are expected to check for that, which can be done
 *        with uid_valid().
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
        /* Global kernel value of the uid, or (u32) -1 when unmapped. */
        u32 kernel_id = map_id_down(&ns->uid_map, uid);

        return KUIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kuid);

/**
 *        from_kuid - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Translates @kuid into the uid it has inside @targ and returns
 *        that uid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kuid has no mapping in @targ (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
        /* Start from the global kernel value and map it upwards. */
        u32 kernel_id = __kuid_val(kuid);

        return map_id_up(&targ->uid_map, kernel_id);
}
EXPORT_SYMBOL(from_kuid);

/**
 *        from_kuid_munged - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Translates @kuid into the uid it has inside @targ and returns
 *        that uid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kuid, from_kuid_munged never fails and always
 *        returns a valid uid.  This makes from_kuid_munged appropriate
 *        for use in syscalls like stat and getuid where failing the
 *        system call and failing to provide a valid uid are not
 *        options.
 *
 *        If @kuid has no mapping in @targ overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
        uid_t mapped = from_kuid(targ, kuid);

        return (mapped == (uid_t) -1) ? overflowuid : mapped;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 *        make_kgid - Map a user-namespace gid pair into a kgid.
 *        @ns:  User namespace that the gid is in
 *        @gid: group identifier
 *
 *        Translates the (@ns, @gid) pair into the kernel internal kgid
 *        and returns it.
 *
 *        If @ns defines no mapping for @gid, INVALID_GID is returned.
 *        Callers are expected to check for that, which can be done
 *        with gid_valid().
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
        /* Global kernel value of the gid, or (u32) -1 when unmapped. */
        u32 kernel_id = map_id_down(&ns->gid_map, gid);

        return KGIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kgid);

/**
 *        from_kgid - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Translates @kgid into the gid it has inside @targ and returns
 *        that gid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kgid has no mapping in @targ (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
        /* Start from the global kernel value and map it upwards. */
        u32 kernel_id = __kgid_val(kgid);

        return map_id_up(&targ->gid_map, kernel_id);
}
EXPORT_SYMBOL(from_kgid);

/**
 *        from_kgid_munged - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Translates @kgid into the gid it has inside @targ and returns
 *        that gid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kgid, from_kgid_munged never fails and always
 *        returns a valid gid.  This makes from_kgid_munged appropriate
 *        for use in syscalls like stat and getgid where failing the
 *        system call and failing to provide a valid gid are not options.
 *
 *        If @kgid has no mapping in @targ overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
        gid_t mapped = from_kgid(targ, kgid);

        return (mapped == (gid_t) -1) ? overflowgid : mapped;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 *        make_kprojid - Map a user-namespace projid pair into a kprojid.
 *        @ns:  User namespace that the projid is in
 *        @projid: Project identifier
 *
 *        Translates the (@ns, @projid) pair into the kernel internal
 *        kprojid and returns it.
 *
 *        If @ns defines no mapping for @projid, INVALID_PROJID is
 *        returned.  Callers are expected to check for that, which can
 *        be done with projid_valid().
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
        /* Global kernel value of the projid, or (u32) -1 when unmapped. */
        u32 kernel_id = map_id_down(&ns->projid_map, projid);

        return KPROJIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kprojid);

/**
 *        from_kprojid - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal project identifier to start with.
 *
 *        Translates @kprojid into the projid it has inside @targ and
 *        returns that projid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kprojid has no mapping in @targ (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
        /* Start from the global kernel value and map it upwards. */
        u32 kernel_id = __kprojid_val(kprojid);

        return map_id_up(&targ->projid_map, kernel_id);
}
EXPORT_SYMBOL(from_kprojid);

/**
 *        from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal projid to start with.
 *
 *        Translates @kprojid into the projid it has inside @targ and
 *        returns that projid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kprojid, from_kprojid_munged never fails and always
 *        returns a valid projid.  This makes from_kprojid_munged
 *        appropriate for use in syscalls like stat, where failing the
 *        system call and failing to provide a valid projid are not
 *        options.
 *
 *        If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t mapped = from_kprojid(targ, kprojid);

        return (mapped == (projid_t) -1) ? OVERFLOW_PROJID : mapped;
}
EXPORT_SYMBOL(from_kprojid_munged);


/*
 * seq_file ->show() for the uid map: prints one extent @v as
 * "first lower count".  The lower id is expressed in the reader's user
 * namespace, or in the parent of the inspected namespace when the reader
 * lives in the inspected namespace itself and a parent exists.
 */
static int uid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        uid_t lower;

        if (lower_ns == ns && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);
        return 0;
}

/*
 * seq_file ->show() for the gid map: prints one extent @v as
 * "first lower count".  The lower id is expressed in the reader's user
 * namespace, or in the parent of the inspected namespace when the reader
 * lives in the inspected namespace itself and a parent exists.
 */
static int gid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        gid_t lower;

        if (lower_ns == ns && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);
        return 0;
}

/*
 * seq_file ->show() for the projid map: prints one extent @v as
 * "first lower count".  The lower id is expressed in the reader's user
 * namespace, or in the parent of the inspected namespace when the reader
 * lives in the inspected namespace itself and a parent exists.
 */
static int projid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        projid_t lower;

        if (lower_ns == ns && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);
        return 0;
}

/*
 * Shared ->start() helper: returns the extent of @map at index *@ppos, or
 * NULL once the index is past the last extent.  Maps with at most
 * UID_GID_MAP_MAX_BASE_EXTENTS entries are read from map->extent, larger
 * ones from map->forward.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
                     struct uid_gid_map *map)
{
        loff_t idx = *ppos;
        unsigned nr = map->nr_extents;

        /* Read barrier between loading nr_extents and using the extents. */
        smp_rmb();

        if (idx >= nr)
                return NULL;

        return (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                &map->extent[idx] : &map->forward[idx];
}

/* ->start() for the uid map of the namespace stored in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *target = seq->private;
        struct uid_gid_map *uid_map = &target->uid_map;

        return m_start(seq, ppos, uid_map);
}

/* ->start() for the gid map of the namespace stored in seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *target = seq->private;
        struct uid_gid_map *gid_map = &target->gid_map;

        return m_start(seq, ppos, gid_map);
}

/* ->start() for the projid map of the namespace stored in seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *target = seq->private;
        struct uid_gid_map *projid_map = &target->projid_map;

        return m_start(seq, ppos, projid_map);
}

/*
 * Shared ->next(): advance the position by one and let the map-specific
 * ->start() callback look up the extent at the new position.
 */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return seq->op->start(seq, pos);
}

/* Shared ->stop(): nothing to undo, m_start() acquires no resources. */
static void m_stop(struct seq_file *seq, void *v)
{
}

/* seq_file iterator over the uid map of a user namespace. */
const struct seq_operations proc_uid_seq_operations = {
        .start  = uid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = uid_m_show,
};

/* seq_file iterator over the gid map of a user namespace. */
const struct seq_operations proc_gid_seq_operations = {
        .start  = gid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = gid_m_show,
};

/* seq_file iterator over the projid map of a user namespace. */
const struct seq_operations proc_projid_seq_operations = {
        .start  = projid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = projid_m_show,
};

/*
 * Check @extent against every extent already stored in @new_map.
 * Returns true if either its upper range [first, first + count - 1] or
 * its lower range [lower_first, lower_first + count - 1] intersects the
 * corresponding range of an existing extent, false otherwise.
 */
static bool mappings_overlap(struct uid_gid_map *new_map,
                             struct uid_gid_extent *extent)
{
        const u32 upper_first = extent->first;
        const u32 lower_first = extent->lower_first;
        const u32 upper_last = upper_first + extent->count - 1;
        const u32 lower_last = lower_first + extent->count - 1;
        unsigned idx;

        for (idx = 0; idx < new_map->nr_extents; idx++) {
                struct uid_gid_extent *prev;
                u32 prev_upper_last, prev_lower_last;

                /* Small maps live in ->extent, large ones in ->forward. */
                prev = (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                        &new_map->extent[idx] : &new_map->forward[idx];

                prev_upper_last = prev->first + prev->count - 1;
                prev_lower_last = prev->lower_first + prev->count - 1;

                /* Does the upper range intersect a previous extent? */
                if (prev->first <= upper_last &&
                    prev_upper_last >= upper_first)
                        return true;

                /* Does the lower range intersect a previous extent? */
                if (prev->lower_first <= lower_last &&
                    prev_lower_last >= lower_first)
                        return true;
        }
        return false;
}

/**
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * @map: map to append to
 * @extent: extent to copy into the map
 *
 * The first UID_GID_MAP_MAX_BASE_EXTENTS extents are kept in map->extent.
 * When that array is full, an array with room for UID_GID_MAP_MAX_EXTENTS
 * entries is allocated, the existing extents are copied into it and it
 * becomes map->forward; map->reverse is reset to NULL and set up later.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
        struct uid_gid_extent *slot;

        if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
                struct uid_gid_extent *grown;

                grown = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
                                      sizeof(*grown), GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;

                /*
                 * Copy the existing entries out of map->extent first, and
                 * only then repoint the map at the new array.
                 */
                memcpy(grown, map->extent,
                       map->nr_extents * sizeof(map->extent[0]));

                map->forward = grown;
                map->reverse = NULL;
        }

        slot = (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) ?
                &map->extent[map->nr_extents] :
                &map->forward[map->nr_extents];

        *slot = *extent;
        map->nr_extents++;
        return 0;
}

/* sort() comparator: orders extents by ascending ->first. */
static int cmp_extents_forward(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->first == rhs->first)
                return 0;

        return (lhs->first < rhs->first) ? -1 : 1;
}

/* sort() comparator: orders extents by ascending ->lower_first. */
static int cmp_extents_reverse(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->lower_first == rhs->lower_first)
                return 0;

        return (lhs->lower_first < rhs->lower_first) ? -1 : 1;
}

/**
 * sort_idmaps - sort the forward array and build a sorted reverse copy.
 * @map: map whose extents should be sorted
 *
 * Does nothing (and returns 0) when the map holds at most
 * UID_GID_MAP_MAX_BASE_EXTENTS entries.  Otherwise @map->forward is sorted
 * by ->first, duplicated into @map->reverse, and that copy is sorted by
 * ->lower_first.
 *
 * Return: 0 on success, -ENOMEM if the reverse copy cannot be allocated
 * (in which case @map->reverse is NULL).
 */
static int sort_idmaps(struct uid_gid_map *map)
{
        struct uid_gid_extent *rev;

        if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                return 0;

        sort(map->forward, map->nr_extents, sizeof(*map->forward),
             cmp_extents_forward, NULL);

        /* Duplicate only the entries in use, not the whole allocation. */
        rev = kmemdup(map->forward,
                      map->nr_extents * sizeof(*map->forward),
                      GFP_KERNEL);
        map->reverse = rev;
        if (!rev)
                return -ENOMEM;

        sort(rev, map->nr_extents, sizeof(*rev),
             cmp_extents_reverse, NULL);

        return 0;
}

/**
 * verify_root_map() - gate maps that contain an extent starting at lower id 0
 * @file: idmapping file being written
 * @map_ns: user namespace whose map is being set
 * @new_map: parsed map that is about to be installed
 *
 * If no extent in @new_map has lower_first == 0 the map is accepted.
 * Otherwise CAP_SETFCAP is required: when the writer's namespace is @map_ns
 * itself, the namespace's recorded parent_could_setfcap flag decides; in
 * every other case the opener of @file must have CAP_SETFCAP in the parent
 * of @map_ns.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
                            struct user_namespace *map_ns,
                            struct uid_gid_map *new_map)
{
        const struct user_namespace *file_ns = file->f_cred->user_ns;
        bool maps_root = false;
        int i;

        for (i = 0; i < new_map->nr_extents; i++) {
                struct uid_gid_extent *cur;

                /* Small maps are stored inline, larger ones in ->forward. */
                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        cur = &new_map->extent[i];
                else
                        cur = &new_map->forward[i];

                if (cur->lower_first == 0) {
                        maps_root = true;
                        break;
                }
        }

        if (!maps_root)
                return true;

        /*
         * The writer is in the namespace being mapped (it unshared and is
         * writing its own map): rely on whether the parent had CAP_SETFCAP
         * when the namespace was created.
         */
        if (map_ns == file_ns)
                return file_ns->parent_could_setfcap;

        /*
         * Another process is writing the map: the opener of the file must
         * hold CAP_SETFCAP against the parent of the namespace being mapped.
         */
        return file_ns_capable(file, map_ns->parent, CAP_SETFCAP);
}

/*
 * map_write - parse a user-supplied id mapping and install it into @map.
 * @file: map file being written; file->private_data is the seq_file whose
 *        ->private is the target user namespace
 * @buf: user buffer holding the mapping text
 * @count: number of bytes in @buf; must be smaller than PAGE_SIZE
 * @ppos: file position; must be 0 on entry, set to @count on success
 * @cap_setid: capability associated with this map (the callers in this file
 *             pass CAP_SETUID, CAP_SETGID, or -1 for "no capability needed")
 * @map: map to populate; must still be empty (nr_extents == 0)
 * @parent_map: map used to translate each extent's lower ids with
 *              map_id_range_down()
 *
 * Each input line has the form "<first> <lower_first> <count>" in decimal.
 * Lines are validated, checked for overlap against the lines parsed so far,
 * collected into a temporary map, permission-checked, translated through
 * @parent_map, sorted if needed, and finally copied into @map.
 *
 * Return: @count on success; -EINVAL for a bad position/size or malformed,
 * wrapping, overlapping, empty or too-long input; -EPERM if @map was already
 * written, a permission check fails, or an extent cannot be translated
 * through @parent_map; -ENOMEM on allocation failure; or the error returned
 * by memdup_user_nul().
 */
static ssize_t map_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos,
                         int cap_setid,
                         struct uid_gid_map *map,
                         struct uid_gid_map *parent_map)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct uid_gid_map new_map;
        unsigned idx;
        struct uid_gid_extent extent;
        char *kbuf = NULL, *pos, *next_line;
        ssize_t ret;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /*
         * The userns_state_mutex serializes all writes to any given map.
         *
         * Any map is only ever written once.
         *
         * An id map fits within 1 cache line on most architectures.
         *
         * On read nothing needs to be done unless you are on an
         * architecture with a crazy cache coherency model like alpha.
         *
         * There is a one time data dependency between reading the
         * count of the extents and the values of the extents.  The
         * desired behavior is to see the values of the extents that
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used to guarantee the write
         * order and smp_rmb() guarantees that we don't have crazy
         * architectures returning stale data.
         */
        mutex_lock(&userns_state_mutex);

        /* Start from an empty temporary map; @map is only touched at the end. */
        memset(&new_map, 0, sizeof(struct uid_gid_map));

        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

        /*
         * Adjusting namespace settings requires capabilities on the target.
         */
        if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
                goto out;

        /* Parse the user data */
        ret = -EINVAL;
        pos = kbuf;
        for (; pos; pos = next_line) {

                /* Find the end of line and ensure I don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline does not start another line. */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* First field: start of the range inside the namespace. */
                pos = skip_spaces(pos);
                extent.first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Second field: start of the range in the parent namespace. */
                pos = skip_spaces(pos);
                extent.lower_first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Third field: length of the range; may end the line. */
                pos = skip_spaces(pos);
                extent.count = simple_strtoul(pos, &pos, 10);
                if (*pos && !isspace(*pos))
                        goto out;

                /* Verify there is not trailing junk on the line */
                pos = skip_spaces(pos);
                if (*pos != '\0')
                        goto out;

                /* Verify we have been given valid starting values */
                if ((extent.first == (u32) -1) ||
                    (extent.lower_first == (u32) -1))
                        goto out;

                /* Verify count is not zero and does not cause the
                 * extent to wrap
                 */
                if ((extent.first + extent.count) <= extent.first)
                        goto out;
                if ((extent.lower_first + extent.count) <=
                     extent.lower_first)
                        goto out;

                /* Do the ranges in extent overlap any previous extents? */
                if (mappings_overlap(&new_map, &extent))
                        goto out;

                /* Reject input that still has lines left once the map is
                 * one entry short of UID_GID_MAP_MAX_EXTENTS.
                 */
                if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
                    (next_line != NULL))
                        goto out;

                ret = insert_extent(&new_map, &extent);
                if (ret < 0)
                        goto out;
                /* Restore the default error for the next line's checks. */
                ret = -EINVAL;
        }
        /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;

        ret = -EPERM;
        /* Validate the user is allowed to use user id's mapped to. */
        if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
                goto out;

        ret = -EPERM;
        /* Map the lower ids from the parent user namespace to the
         * kernel global id space.
         */
        for (idx = 0; idx < new_map.nr_extents; idx++) {
                struct uid_gid_extent *e;
                u32 lower_first;

                /* Small maps are stored inline, larger ones in ->forward. */
                if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        e = &new_map.extent[idx];
                else
                        e = &new_map.forward[idx];

                lower_first = map_id_range_down(parent_map,
                                                e->lower_first,
                                                e->count);

                /* Fail if we can not map the specified extent to
                 * the kernel global id space.
                 */
                if (lower_first == (u32) -1)
                        goto out;

                e->lower_first = lower_first;
        }

        /*
         * If we want to use binary search for lookup, this clones the extent
         * array and sorts both copies.
         */
        ret = sort_idmaps(&new_map);
        if (ret < 0)
                goto out;

        /* Install the map */
        if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                memcpy(map->extent, new_map.extent,
                       new_map.nr_extents * sizeof(new_map.extent[0]));
        } else {
                map->forward = new_map.forward;
                map->reverse = new_map.reverse;
        }
        /* Publish the extents before the count that makes them visible. */
        smp_wmb();
        map->nr_extents = new_map.nr_extents;

        *ppos = count;
        ret = count;
out:
        /* On failure, release the heap arrays of a large temporary map and
         * leave @map empty.
         */
        if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(new_map.forward);
                kfree(new_map.reverse);
                map->forward = NULL;
                map->reverse = NULL;
                map->nr_extents = 0;
        }

        mutex_unlock(&userns_state_mutex);
        kfree(kbuf);
        return ret;
}

/*
 * Write handler for a user namespace's uid map.
 *
 * Refuses with -EPERM when the namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with CAP_SETUID
 * and the namespace's and its parent's uid maps.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);

        if (!ns->parent ||
            ((seq_ns != ns) && (seq_ns != ns->parent)))
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETUID,
                         &ns->uid_map, &ns->parent->uid_map);
}

/*
 * Write handler for a user namespace's gid map.
 *
 * Refuses with -EPERM when the namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with CAP_SETGID
 * and the namespace's and its parent's gid maps.
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);

        if (!ns->parent ||
            ((seq_ns != ns) && (seq_ns != ns->parent)))
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETGID,
                         &ns->gid_map, &ns->parent->gid_map);
}

/*
 * Write handler for a user namespace's projid map.
 *
 * Refuses with -EPERM when the namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with -1 as the
 * capability: any valid project id may be set without a capability.
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);

        if (!ns->parent ||
            ((seq_ns != ns) && (seq_ns != ns->parent)))
                return -EPERM;

        return map_write(file, buf, size, ppos, -1,
                         &ns->projid_map, &ns->parent->projid_map);
}

/*
 * Decide whether the opener of @file may install @new_map in @ns.
 *
 * @cap_setid is CAP_SETUID, CAP_SETGID, or a value for which cap_valid()
 * is false (meaning the map needs no privilege).
 *
 * Checks, in order:
 *  1. For uid maps, verify_root_map() must accept the map.
 *  2. A single-extent, single-id map written by the namespace owner is
 *     allowed when it maps the opener's own euid (uid map), or the opener's
 *     own egid while setgroups is not allowed in @ns (gid map).
 *  3. Maps that need no capability are allowed.
 *  4. Otherwise both the current task and the opener of @file must hold
 *     @cap_setid over the parent namespace.
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *new_map)
{
        const struct cred *opener = file->f_cred;
        bool single_id;

        if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
                return false;

        /*
         * A mapping of exactly one id grants nothing that would not be
         * available without unprivileged mappings.
         */
        single_id = (new_map->nr_extents == 1) &&
                    (new_map->extent[0].count == 1);

        if (single_id && uid_eq(ns->owner, opener->euid)) {
                u32 lower = new_map->extent[0].lower_first;

                switch (cap_setid) {
                case CAP_SETUID: {
                        kuid_t uid = make_kuid(ns->parent, lower);

                        if (uid_eq(uid, opener->euid))
                                return true;
                        break;
                }
                case CAP_SETGID: {
                        kgid_t gid = make_kgid(ns->parent, lower);

                        if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
                            gid_eq(gid, opener->egid))
                                return true;
                        break;
                }
                default:
                        break;
                }
        }

        /* Anyone may set a mapping that does not require privilege. */
        if (!cap_valid(cap_setid))
                return true;

        /*
         * Otherwise the appropriate capability over the parent namespace is
         * needed, both now and by whoever opened the id file.
         */
        return ns_capable(ns->parent, cap_setid) &&
               file_ns_capable(file, ns->parent, cap_setid);
}

/*
 * seq_file show callback: prints "allow" or "deny" (followed by a newline)
 * depending on whether USERNS_SETGROUPS_ALLOWED is set in the namespace
 * flags.  Always returns 0.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        unsigned long userns_flags = READ_ONCE(ns->flags);
        const char *state = "deny";

        if (userns_flags & USERNS_SETGROUPS_ALLOWED)
                state = "allow";

        seq_printf(seq, "%s\n", state);
        return 0;
}

/*
 * Write handler for the setgroups control file.
 *
 * Accepts "allow" or "deny", optionally followed by whitespace, written at
 * offset 0 in fewer than 8 bytes.
 *
 *  - "allow" succeeds only while USERNS_SETGROUPS_ALLOWED is still set.
 *  - "deny" clears USERNS_SETGROUPS_ALLOWED, and is refused once the gid map
 *    has been written.
 *
 * Return: @count on success; -EINVAL for a bad offset, size or string;
 * -EFAULT if the user buffer cannot be copied; -EPERM if the request is not
 * permitted.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        char kbuf[8], *tail;
        bool want_allow;
        bool refused;

        /* Only a very narrow range of strings may be written. */
        if ((*ppos != 0) || (count >= sizeof(kbuf)))
                return -EINVAL;

        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;
        kbuf[count] = '\0';

        /* Which of the two keywords was requested? */
        if (strncmp(kbuf, "allow", 5) == 0) {
                tail = kbuf + 5;
                want_allow = true;
        } else if (strncmp(kbuf, "deny", 4) == 0) {
                tail = kbuf + 4;
                want_allow = false;
        } else {
                return -EINVAL;
        }

        /* Nothing but whitespace may follow the keyword. */
        tail = skip_spaces(tail);
        if (*tail != '\0')
                return -EINVAL;

        mutex_lock(&userns_state_mutex);
        if (want_allow) {
                /* Re-enabling setgroups after it was disabled is refused. */
                refused = !(ns->flags & USERNS_SETGROUPS_ALLOWED);
        } else {
                /*
                 * Disabling setgroups after the gid_map has been written
                 * is refused.
                 */
                refused = (ns->gid_map.nr_extents != 0);
                if (!refused)
                        ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
        }
        mutex_unlock(&userns_state_mutex);

        if (refused)
                return -EPERM;

        /* Report a successful write. */
        *ppos = count;
        return count;
}

/*
 * Report whether setgroups may be used in @ns: a gid mapping must already
 * be established and USERNS_SETGROUPS_ALLOWED must still be set.  Both are
 * sampled under userns_state_mutex.
 */
bool userns_may_setgroups(const struct user_namespace *ns)
{
        bool allowed;

        mutex_lock(&userns_state_mutex);
        allowed = (ns->gid_map.nr_extents != 0) &&
                  (ns->flags & USERNS_SETGROUPS_ALLOWED);
        mutex_unlock(&userns_state_mutex);

        return allowed;
}

/*
 * Returns true if @child is @ancestor itself or one of its descendants.
 *
 * Walks up from @child through ->parent until reaching a namespace whose
 * level is not deeper than @ancestor's, then compares that namespace with
 * @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
               const struct user_namespace *child)
{
        const struct user_namespace *ns = child;

        while (ns->level > ancestor->level)
                ns = ns->parent;

        return ns == ancestor;
}

/* True if the current task's user namespace is @target_ns or a descendant. */
bool current_in_userns(const struct user_namespace *target_ns)
{
        const struct user_namespace *mine = current_user_ns();

        return in_userns(target_ns, mine);
}
EXPORT_SYMBOL(current_in_userns);

/* Convert an embedded ns_common back to its containing user_namespace. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        struct user_namespace *user_ns;

        user_ns = container_of(ns, struct user_namespace, ns);
        return user_ns;
}

/*
 * Look up @task's user namespace through its credentials under the RCU read
 * lock and pass it through get_user_ns().  Returns the embedded ns_common,
 * or NULL if get_user_ns() returned NULL.
 */
static struct ns_common *userns_get(struct task_struct *task)
{
        struct user_namespace *held;

        rcu_read_lock();
        held = get_user_ns(__task_cred(task)->user_ns);
        rcu_read_unlock();

        if (!held)
                return NULL;

        return &held->ns;
}

/* Hand the user namespace that embeds @ns to put_user_ns(). */
static void userns_put(struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);

        put_user_ns(user_ns);
}

/*
 * Switch the credentials prepared in @nsset over to the user namespace that
 * embeds @ns.
 *
 * Return: 0 on success; -EINVAL if the target is already the current user
 * namespace, the caller is not alone in its thread group, current->fs is
 * shared (users != 1), or @nsset carries no credentials; -EPERM if the
 * caller lacks CAP_SYS_ADMIN in the target namespace.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct user_namespace *target = to_user_ns(ns);
        struct cred *cred;

        /*
         * Re-entering the same namespace must not be a way to gain
         * capabilities, tasks sharing a thread group must share a user
         * namespace, and current->fs must not be shared.
         */
        if ((target == current_user_ns()) ||
            !thread_group_empty(current) ||
            (current->fs->users != 1))
                return -EINVAL;

        if (!ns_capable(target, CAP_SYS_ADMIN))
                return -EPERM;

        cred = nsset_cred(nsset);
        if (!cred)
                return -EINVAL;

        /* Drop the old namespace reference and install the new one. */
        put_user_ns(cred->user_ns);
        set_cred_user_ns(cred, get_user_ns(target));

        return 0;
}

/*
 * Return the owning user namespace of @ns (as reported by ns->ops->owner)
 * after passing it through get_user_ns(), but only if that owner is the
 * caller's user namespace or a descendant of it.  Otherwise returns
 * ERR_PTR(-EPERM).
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct user_namespace *my_user_ns = current_user_ns();
        struct user_namespace *owner = ns->ops->owner(ns);
        struct user_namespace *p;

        /* Climb from the owner looking for the caller's namespace. */
        for (p = owner; p; p = p->parent) {
                if (p == my_user_ns)
                        return &get_user_ns(owner)->ns;
        }

        return ERR_PTR(-EPERM);
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
        return to_user_ns(ns)->parent;
}

/* Namespace operations table for the "user" namespace type. */
const struct proc_ns_operations userns_operations = {
        .name           = "user",
        .type           = CLONE_NEWUSER,
        .owner          = userns_owner,
        .get_parent     = ns_get_owner,
        .get            = userns_get,
        .put            = userns_put,
        .install        = userns_install,
};

/*
 * Create the slab cache used for struct user_namespace objects and store it
 * in user_ns_cachep.  Registered as a subsys_initcall; always returns 0.
 * NOTE(review): SLAB_PANIC presumably makes a failed cache creation fatal,
 * which would explain the missing NULL check - confirm.
 */
static __init int user_namespaces_init(void)
{
        user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
        return 0;
}
subsys_initcall(user_namespaces_init);



















































































































    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security identifier table (sidtab) is a lookup table
 * of security context structures indexed by SID value.
 *
 * Original author: Stephen Smalley, <sds@tycho.nsa.gov>
 * Author: Ondrej Mosnacek, <omosnacek@gmail.com>
 *
 * Copyright (C) 2018 Red Hat, Inc.
 */
#ifndef _SS_SIDTAB_H_
#define _SS_SIDTAB_H_

#include <linux/spinlock_types.h>
#include <linux/log2.h>
#include <linux/hashtable.h>

#include "context.h"

/* One SID table entry: a SID value together with its security context. */
struct sidtab_entry {
        u32 sid;
        /* NOTE(review): presumably the hash of @context used for the
         * context-to-SID hash table - confirm in sidtab.c.
         */
        u32 hash;
        struct context context;
#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        /* Only present when the SID-to-string cache is configured. */
        struct sidtab_str_cache __rcu *cache;
#endif
        /* NOTE(review): presumably links the entry into
         * sidtab::context_to_sid - confirm.
         */
        struct hlist_node list;
};

/*
 * A child pointer in the SID tree: refers either to another inner node or to
 * a leaf node.  The union itself does not record which member is valid.
 */
union sidtab_entry_inner {
        struct sidtab_node_inner *ptr_inner;
        struct sidtab_node_leaf  *ptr_leaf;
};

/* align node size to page boundary */
#define SIDTAB_NODE_ALLOC_SHIFT PAGE_SHIFT
#define SIDTAB_NODE_ALLOC_SIZE  PAGE_SIZE

/*
 * For size > 1 this is const_ilog2(size - 1) + 1, i.e. log2 of size rounded
 * up to a whole number of bits; a size of exactly 1 yields 1.
 */
#define size_to_shift(size) ((size) == 1 ? 1 : (const_ilog2((size) - 1) + 1))

/* log2 of the number of child pointers that fit in one node allocation */
#define SIDTAB_INNER_SHIFT \
        (SIDTAB_NODE_ALLOC_SHIFT - size_to_shift(sizeof(union sidtab_entry_inner)))
#define SIDTAB_INNER_ENTRIES ((size_t)1 << SIDTAB_INNER_SHIFT)
/* number of whole struct sidtab_entry objects that fit in one allocation */
#define SIDTAB_LEAF_ENTRIES \
        (SIDTAB_NODE_ALLOC_SIZE / sizeof(struct sidtab_entry))

#define SIDTAB_MAX_BITS 32
#define SIDTAB_MAX U32_MAX
/* ensure enough tree levels for SIDTAB_MAX entries */
#define SIDTAB_MAX_LEVEL \
        DIV_ROUND_UP(SIDTAB_MAX_BITS - size_to_shift(SIDTAB_LEAF_ENTRIES), \
                     SIDTAB_INNER_SHIFT)

/* Leaf node: an array of SIDTAB_LEAF_ENTRIES SID table entries. */
struct sidtab_node_leaf {
        struct sidtab_entry entries[SIDTAB_LEAF_ENTRIES];
};

/* Inner node: an array of SIDTAB_INNER_ENTRIES child pointers. */
struct sidtab_node_inner {
        union sidtab_entry_inner entries[SIDTAB_INNER_ENTRIES];
};

/*
 * Slot for one initial SID (see sidtab::isids).
 * NOTE(review): @set presumably flags whether @entry has been populated -
 * confirm against sidtab_set_initial().
 */
struct sidtab_isid_entry {
        int set;
        struct sidtab_entry entry;
};

/*
 * Parameters for a table conversion (see sidtab_convert()).
 * @func:   callback taking an old context, a new context, @args and
 *          allocation flags; returns int
 * @args:   opaque pointer handed to @func
 * @target: the other sidtab involved in the conversion
 *          (NOTE(review): presumably the destination table - confirm)
 */
struct sidtab_convert_params {
        int (*func)(struct context *oldc, struct context *newc, void *args, gfp_t gfp_flags);
        void *args;
        struct sidtab *target;
};

/* Size of the context-to-SID hash table, taken from the kernel config. */
#define SIDTAB_HASH_BITS CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS
#define SIDTAB_HASH_BUCKETS (1 << SIDTAB_HASH_BITS)

/*
 * The SID table: one tree root per possible tree level, an entry count,
 * conversion state, an optional SID-to-string cache, the initial-SID slots,
 * and a hash table for context-to-SID lookups.
 */
struct sidtab {
        /*
         * lock-free read access only for as many items as a prior read of
         * 'count'
         */
        union sidtab_entry_inner roots[SIDTAB_MAX_LEVEL + 1];
        /*
         * access atomically via {READ|WRITE}_ONCE(); only increment under
         * spinlock
         */
        u32 count;
        /* access only under spinlock */
        struct sidtab_convert_params *convert;
        bool frozen;
        spinlock_t lock;

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        /* SID -> context string cache */
        u32 cache_free_slots;
        struct list_head cache_lru_list;
        spinlock_t cache_lock;
#endif

        /* index == SID - 1 (no entry for SECSID_NULL) */
        struct sidtab_isid_entry isids[SECINITSID_NUM];

        /* Hash table for fast reverse context-to-sid lookups. */
        DECLARE_HASHTABLE(context_to_sid, SIDTAB_HASH_BITS);
};

/*
 * Table setup and entry lookup; implemented outside this header.
 * The *_search_entry*() functions return a pointer that the inline wrappers
 * in this header NULL-check before use.
 * NOTE(review): the difference between the plain and _force lookup is not
 * visible here - confirm in sidtab.c.
 */
int sidtab_init(struct sidtab *s);
int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context);
struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid);
struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid);

/*
 * Look up @sid with sidtab_search_entry() and return a pointer to the
 * context stored in the entry, or NULL if no entry was found.
 */
static inline struct context *sidtab_search(struct sidtab *s, u32 sid)
{
        struct sidtab_entry *found = sidtab_search_entry(s, sid);

        if (!found)
                return NULL;

        return &found->context;
}

/*
 * Look up @sid with sidtab_search_entry_force() and return a pointer to the
 * context stored in the entry, or NULL if no entry was found.
 */
static inline struct context *sidtab_search_force(struct sidtab *s, u32 sid)
{
        struct sidtab_entry *found = sidtab_search_entry_force(s, sid);

        if (!found)
                return NULL;

        return &found->context;
}

/* Conversion of a table using the given parameters; see
 * struct sidtab_convert_params.
 */
int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params);

void sidtab_cancel_convert(struct sidtab *s);

/* Annotated as acquiring / releasing s->lock respectively; @flags is the
 * caller-provided storage passed to both calls.
 */
void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags) __acquires(&s->lock);
void sidtab_freeze_end(struct sidtab *s, unsigned long *flags) __releases(&s->lock);

/* Reverse lookup: stores the SID for @context through @sid; returns int.
 * NOTE(review): presumably 0 on success / negative errno - confirm.
 */
int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid);

void sidtab_destroy(struct sidtab *s);

/* NOTE(review): presumably formats hash table statistics into @page and
 * returns the length written - confirm.
 */
int sidtab_hash_stats(struct sidtab *sidtab, char *page);

/*
 * SID-to-string cache API.  When the cache is configured
 * (CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0) the functions are
 * implemented elsewhere; otherwise the inline stubs below are used:
 * the put stub does nothing and the get stub always returns -ENOENT.
 */
#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry,
                        const char *str, u32 str_len);
int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry,
                       char **out, u32 *out_len);
#else
/* Cache disabled: nothing to store. */
static inline void sidtab_sid2str_put(struct sidtab *s,
                                      struct sidtab_entry *entry,
                                      const char *str, u32 str_len)
{
}
/* Cache disabled: every lookup misses; @out and @out_len are not written. */
static inline int sidtab_sid2str_get(struct sidtab *s,
                                     struct sidtab_entry *entry,
                                     char **out, u32 *out_len)
{
        return -ENOENT;
}
#endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */

#endif        /* _SS_SIDTAB_H_ */





















    2 







    2 





    2 




    2 









































































    2 






    2 



    2 


    2 

















    2 
    2 














    2 







    1 



    2 
    2 


    2 









    2 










































    1 
    1 


    1 

































































































































































































    2 





    2 



    1 
    2 

    1 


    1 


    2 


































































































































































































































    1 
    1 


    1 






    1 

    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#include <linux/sched.h>
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>

/*
 * Fold the deferred-clear mask of word @index back into its allocation
 * word.
 *
 * Under the word's swap_lock: if ->cleared is non-zero it is atomically
 * swapped to 0 and the bits it held are cleared from ->word with a cmpxchg
 * retry loop.  Returns true if any deferred bits were moved, false if
 * ->cleared was already zero.
 */
static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
{
        unsigned long irq_flags;
        bool moved = false;

        spin_lock_irqsave(&sb->map[index].swap_lock, irq_flags);

        if (sb->map[index].cleared) {
                unsigned long pending, old;

                /* Take a stable snapshot of the mask, resetting it to 0. */
                pending = xchg(&sb->map[index].cleared, 0);

                /* Drop the snapshotted bits from the free word. */
                do {
                        old = sb->map[index].word;
                } while (cmpxchg(&sb->map[index].word, old,
                                 old & ~pending) != old);

                moved = true;
        }

        spin_unlock_irqrestore(&sb->map[index].swap_lock, irq_flags);
        return moved;
}

/**
 * sbitmap_init_node() - initialize a &struct sbitmap
 * @sb: bitmap to initialize
 * @depth: total number of bits
 * @shift: log2 of the number of bits per word, or a negative value to let
 *         this function choose one based on @depth
 * @flags: allocation flags for the word array
 * @node: memory node to allocate the word array on
 *
 * With a negative @shift the word size starts at BITS_PER_LONG and, for
 * @depth >= 4, is halved until at least four words are used.  A @depth of 0
 * leaves @sb->map NULL.
 *
 * Return: 0 on success, -EINVAL if @shift would give more than
 * BITS_PER_LONG bits per word, -ENOMEM if the word array cannot be
 * allocated.
 */
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node)
{
        unsigned int bits_per_word;
        unsigned int i;

        if (shift < 0) {
                shift = ilog2(BITS_PER_LONG);
                /*
                 * If the bitmap is small, shrink the number of bits per word so
                 * we spread over a few cachelines, at least. If less than 4
                 * bits, just forget about it, it's not going to work optimally
                 * anyway.
                 */
                if (depth >= 4) {
                        while ((4U << shift) > depth)
                                shift--;
                }
        }

        /*
         * Validate the shift before using it: 1U << shift is undefined once
         * shift reaches the width of unsigned int, so checking the shifted
         * result afterwards could let an oversized shift through.
         */
        if (shift > ilog2(BITS_PER_LONG))
                return -EINVAL;
        bits_per_word = 1U << shift;

        sb->shift = shift;
        sb->depth = depth;
        sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);

        if (depth == 0) {
                sb->map = NULL;
                return 0;
        }

        sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node);
        if (!sb->map)
                return -ENOMEM;

        /* Every word is full except possibly the last one. */
        for (i = 0; i < sb->map_nr; i++) {
                sb->map[i].depth = min(depth, bits_per_word);
                depth -= sb->map[i].depth;
                spin_lock_init(&sb->map[i].swap_lock);
        }
        return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);

void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
{
        unsigned int bits_per_word = 1U << sb->shift;
        unsigned int i;

        for (i = 0; i < sb->map_nr; i++)
                sbitmap_deferred_clear(sb, i);

        sb->depth = depth;
        sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);

        for (i = 0; i < sb->map_nr; i++) {
                sb->map[i].depth = min(depth, bits_per_word);
                depth -= sb->map[i].depth;
        }
}
EXPORT_SYMBOL_GPL(sbitmap_resize);

/*
 * Try to claim one zero bit among the first @depth bits of @word.
 *
 * The search starts at bit @hint.  If it runs off the end and @wrap is set,
 * it is restarted once from bit 0, provided the search began at a non-zero
 * offset.
 *
 * Returns the number of the bit that was claimed, or -1 if none was found.
 */
static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
                              unsigned int hint, bool wrap)
{
        unsigned int orig_hint = hint;
        int nr;

        while (1) {
                nr = find_next_zero_bit(word, depth, hint);
                if (unlikely(nr >= depth)) {
                        /*
                         * We started with an offset, and we didn't reset the
                         * offset to 0 in a failure case, so start from 0 to
                         * exhaust the map.
                         */
                        if (orig_hint && hint && wrap) {
                                /* Zeroing orig_hint too limits this to one wrap. */
                                hint = orig_hint = 0;
                                continue;
                        }
                        return -1;
                }

                /* A zero return means the bit was clear and is now ours. */
                if (!test_and_set_bit_lock(nr, word))
                        break;

                /*
                 * The bit was already set by the time we tried to take it:
                 * search again from the next bit, or from bit 0 once the
                 * next position reaches depth - 1.
                 */
                hint = nr + 1;
                if (hint >= depth - 1)
                        hint = 0;
        }

        return nr;
}

/*
 * Claim a free bit in word @index of @sb, starting the search at bit
 * @alloc_hint.  Wrapping inside the word is only allowed when @round_robin
 * is false.  Whenever the word looks full, the deferred clear is run on it
 * and the search is repeated for as long as that reports progress.
 *
 * Returns the bit number within the word, or -1 if nothing could be claimed.
 */
static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
                                     unsigned int alloc_hint, bool round_robin)
{
        struct sbitmap_word *map = &sb->map[index];
        int bit;

        for (;;) {
                bit = __sbitmap_get_word(&map->word, map->depth, alloc_hint,
                                         !round_robin);
                if (bit != -1)
                        return bit;

                /* Nothing left to reclaim in this word: give up on it. */
                if (!sbitmap_deferred_clear(sb, index))
                        return -1;
        }
}

/*
 * Allocate one free bit from @sb.
 *
 * @alloc_hint picks the word where the search starts.  Its in-word bit
 * offset is honoured only for @round_robin allocation; otherwise each word
 * is searched from bit 0.  Every word is tried at most once, wrapping from
 * the last word to the first.
 *
 * Returns the allocated bit number, or -1 if no bit was free.
 */
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
{
        unsigned int word_idx = SB_NR_TO_INDEX(sb, alloc_hint);
        unsigned int bit_hint;
        unsigned int tries;

        /*
         * Without round robin the hint is only needed to select the word;
         * starting inside the word would just make find_next_zero_bit()
         * loop twice.
         */
        bit_hint = round_robin ? SB_NR_TO_BIT(sb, alloc_hint) : 0;

        for (tries = 0; tries < sb->map_nr; tries++) {
                int bit = sbitmap_find_bit_in_index(sb, word_idx, bit_hint,
                                                    round_robin);

                if (bit != -1)
                        return bit + (word_idx << sb->shift);

                /* Move on to the next word and search it from its start. */
                bit_hint = 0;
                word_idx++;
                if (word_idx >= sb->map_nr)
                        word_idx = 0;
        }

        return -1;
}
EXPORT_SYMBOL_GPL(sbitmap_get);

/*
 * Allocate one free bit from @sb, using at most the first @shallow_depth
 * bits of each word.
 *
 * The search starts in the word selected by @alloc_hint and visits every
 * word at most once, wrapping around at the end of the map.  A word that
 * looks full is retried for as long as the deferred clear reports progress.
 *
 * Returns the allocated bit number, or -1 if no bit was free.
 */
int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
                        unsigned long shallow_depth)
{
        unsigned int index = SB_NR_TO_INDEX(sb, alloc_hint);
        unsigned int tries;
        int bit;

        for (tries = 0; tries < sb->map_nr; tries++) {
                do {
                        bit = __sbitmap_get_word(&sb->map[index].word,
                                                 min(sb->map[index].depth,
                                                     shallow_depth),
                                                 SB_NR_TO_BIT(sb, alloc_hint),
                                                 true);
                        if (bit != -1)
                                return bit + (index << sb->shift);
                } while (sbitmap_deferred_clear(sb, index));

                /* Advance to the next word; the hint becomes its first bit. */
                index++;
                alloc_hint = index << sb->shift;

                if (index >= sb->map_nr) {
                        index = 0;
                        alloc_hint = 0;
                }
        }

        return -1;
}
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);

bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
        unsigned int i;

        for (i = 0; i < sb->map_nr; i++) {
                if (sb->map[i].word & ~sb->map[i].cleared)
                        return true;
        }
        return false;
}
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);

/*
 * Sum the population count over all words of @sb, limited to each word's
 * depth.  With @set true the ->word masks are counted, otherwise the
 * ->cleared masks.
 */
static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
{
        unsigned int total = 0;
        unsigned int idx;

        for (idx = 0; idx < sb->map_nr; idx++) {
                const struct sbitmap_word *w = &sb->map[idx];
                const unsigned long *bits = set ? &w->word : &w->cleared;

                total += bitmap_weight(bits, w->depth);
        }

        return total;
}

/*
 * Number of bits set in the ->word masks of @sb.  Bits also recorded in
 * ->cleared are not subtracted here.
 */
static unsigned int sbitmap_weight(const struct sbitmap *sb)
{
        return __sbitmap_weight(sb, true);
}

/* Number of bits set in the ->cleared masks of @sb. */
static unsigned int sbitmap_cleared(const struct sbitmap *sb)
{
        return __sbitmap_weight(sb, false);
}

/*
 * Write a summary of @sb to @m as "name=value" lines: depth, busy, cleared,
 * bits_per_word and map_nr.
 */
void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
{
        seq_printf(m, "depth=%u\n", sb->depth);
        /* busy = bits set in ->word minus those also recorded in ->cleared */
        seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
        /* Counted again here, so it can differ from the value used above. */
        seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
        seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
        seq_printf(m, "map_nr=%u\n", sb->map_nr);
}
EXPORT_SYMBOL_GPL(sbitmap_show);

/*
 * Append one byte of a hex dump to @m.  @offset is the byte's position in
 * the dump: every 16th byte starts a new row prefixed with the offset, and
 * a space precedes every even-numbered byte, so bytes appear in pairs.
 */
static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
{
        bool row_start = (offset & 0xf) == 0;
        bool pair_start = (offset & 0x1) == 0;

        /* Terminate the previous row, except before the very first one. */
        if (row_start && offset != 0)
                seq_putc(m, '\n');
        if (row_start)
                seq_printf(m, "%08x:", offset);
        if (pair_start)
                seq_putc(m, ' ');
        seq_printf(m, "%02x", byte);
}

/*
 * Dump the bits of @sb to @m as a hex dump, treating bits that are marked
 * in ->cleared as free.  The words are concatenated into one bit stream
 * using only each word's depth, so a byte may straddle two words; a final
 * partial byte is emitted as is.
 */
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
{
        u8 acc = 0;
        unsigned int acc_bits = 0;
        unsigned int out_off = 0;
        int idx;

        for (idx = 0; idx < sb->map_nr; idx++) {
                unsigned long word = READ_ONCE(sb->map[idx].word);
                unsigned long cleared = READ_ONCE(sb->map[idx].cleared);
                unsigned int left = READ_ONCE(sb->map[idx].depth);

                word &= ~cleared;

                while (left > 0) {
                        /* Take as many bits as still fit in the current byte. */
                        unsigned int take = min(8 - acc_bits, left);

                        acc |= (word & (BIT(take) - 1)) << acc_bits;
                        acc_bits += take;
                        word >>= take;
                        left -= take;

                        if (acc_bits != 8)
                                continue;

                        emit_byte(m, out_off++, acc);
                        acc = 0;
                        acc_bits = 0;
                }
        }

        /* Flush a trailing partial byte. */
        if (acc_bits)
                emit_byte(m, out_off++, acc);

        if (out_off)
                seq_putc(m, '\n');
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);

/*
 * Compute the wake batch size for @sbq at bitmap depth @depth.
 *
 * One wait queue is woken per batch, so the batch must be small enough that
 * the usable depth can wake all SBQ_WAIT_QUEUES queues.  The usable depth
 * allows for sbq->min_shallow_depth: each word contributes at most that
 * many bits, whether it is a full word or the trailing partial word.
 *
 * The result is clamped to the range [1, SBQ_WAKE_BATCH].
 */
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
                                        unsigned int depth)
{
        unsigned int shift = sbq->sb.shift;
        unsigned int bits_per_word = 1U << shift;
        unsigned int per_word = min(bits_per_word, sbq->min_shallow_depth);
        unsigned int full_words = depth >> shift;               /* depth / bits_per_word */
        unsigned int leftover = depth & (bits_per_word - 1);    /* depth % bits_per_word */
        unsigned int usable;

        usable = full_words * per_word + min(leftover, per_word);

        return clamp_t(unsigned int, usable / SBQ_WAIT_QUEUES, 1,
                       SBQ_WAKE_BATCH);
}

/*
 * Initialize @sbq: the underlying bitmap (@depth bits, 2^@shift bits per
 * word), the per-cpu allocation hints, the wake batch state and the
 * SBQ_WAIT_QUEUES wait queues.  @flags and @node are passed on to the
 * allocations.
 *
 * Returns 0 on success, the error from sbitmap_init_node() if the bitmap
 * cannot be set up, or -ENOMEM if a later allocation fails.  On failure
 * everything allocated here is released again.
 */
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
                            int shift, bool round_robin, gfp_t flags, int node)
{
        int err;
        int n;

        err = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
        if (err)
                return err;

        sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
        if (!sbq->alloc_hint)
                goto err_free_sb;

        /* Give every CPU a random starting hint unless order matters. */
        if (depth && !round_robin) {
                for_each_possible_cpu(n)
                        *per_cpu_ptr(sbq->alloc_hint, n) = prandom_u32() % depth;
        }

        sbq->min_shallow_depth = UINT_MAX;
        sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
        atomic_set(&sbq->wake_index, 0);
        atomic_set(&sbq->ws_active, 0);

        sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
        if (!sbq->ws)
                goto err_free_hint;

        for (n = 0; n < SBQ_WAIT_QUEUES; n++) {
                init_waitqueue_head(&sbq->ws[n].wait);
                atomic_set(&sbq->ws[n].wait_cnt, sbq->wake_batch);
        }

        sbq->round_robin = round_robin;
        return 0;

err_free_hint:
        free_percpu(sbq->alloc_hint);
err_free_sb:
        sbitmap_free(&sbq->sb);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);

/*
 * Recompute the wake batch of @sbq for bitmap depth @depth.  If the value
 * changed, publish it and then set the wait count of every wait queue to 1.
 */
static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
                                            unsigned int depth)
{
        unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
        int i;

        if (sbq->wake_batch != wake_batch) {
                WRITE_ONCE(sbq->wake_batch, wake_batch);
                /*
                 * Pairs with the memory barrier in sbitmap_queue_wake_up()
                 * to ensure that the batch size is updated before the wait
                 * counts.
                 */
                smp_mb();
                for (i = 0; i < SBQ_WAIT_QUEUES; i++)
                        atomic_set(&sbq->ws[i].wait_cnt, 1);
        }
}

/*
 * Resize @sbq to @depth bits: the wake batch is recomputed for the new
 * depth first, then the underlying bitmap is resized.
 */
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
        sbitmap_queue_update_wake_batch(sbq, depth);
        sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);

/*
 * Allocate a bit from @sbq, starting from this CPU's allocation hint.
 *
 * A hint that lies beyond the current depth is replaced by a random one
 * first.  After the allocation the hint is updated: reset to 0 if the map
 * was full, advanced past the allocated bit if the hint was hit exactly or
 * the queue is round robin, and left alone otherwise.
 *
 * Returns the allocated bit number, or -1 if no bit was free.
 */
int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
        unsigned int cur_hint = this_cpu_read(*sbq->alloc_hint);
        unsigned int depth = READ_ONCE(sbq->sb.depth);
        unsigned int next_hint;
        int bit;

        if (unlikely(cur_hint >= depth)) {
                if (depth)
                        cur_hint = prandom_u32() % depth;
                else
                        cur_hint = 0;
                this_cpu_write(*sbq->alloc_hint, cur_hint);
        }

        bit = sbitmap_get(&sbq->sb, cur_hint, sbq->round_robin);
        if (bit == -1) {
                /* If the map is full, a hint won't do us much good. */
                this_cpu_write(*sbq->alloc_hint, 0);
                return bit;
        }

        /* Only update the hint if we used it. */
        if (bit != cur_hint && likely(!sbq->round_robin))
                return bit;

        next_hint = bit + 1;
        if (next_hint >= depth - 1)
                next_hint = 0;
        this_cpu_write(*sbq->alloc_hint, next_hint);

        return bit;
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);

/*
 * Allocate a bit from @sbq, using at most @shallow_depth bits of each word
 * and starting from this CPU's allocation hint.
 *
 * Warns once if @shallow_depth is below sbq->min_shallow_depth.  The hint
 * is handled as in __sbitmap_queue_get(): re-randomized when it lies beyond
 * the depth, reset to 0 when the map was full, and advanced past the
 * allocated bit when it was hit exactly or the queue is round robin.
 *
 * Returns the allocated bit number, or -1 if no bit was free.
 */
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                                unsigned int shallow_depth)
{
        unsigned int cur_hint, next_hint, depth;
        int bit;

        WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);

        cur_hint = this_cpu_read(*sbq->alloc_hint);
        depth = READ_ONCE(sbq->sb.depth);
        if (unlikely(cur_hint >= depth)) {
                if (depth)
                        cur_hint = prandom_u32() % depth;
                else
                        cur_hint = 0;
                this_cpu_write(*sbq->alloc_hint, cur_hint);
        }

        bit = sbitmap_get_shallow(&sbq->sb, cur_hint, shallow_depth);
        if (bit == -1) {
                /* If the map is full, a hint won't do us much good. */
                this_cpu_write(*sbq->alloc_hint, 0);
                return bit;
        }

        /* Only update the hint if we used it. */
        if (bit != cur_hint && likely(!sbq->round_robin))
                return bit;

        next_hint = bit + 1;
        if (next_hint >= depth - 1)
                next_hint = 0;
        this_cpu_write(*sbq->alloc_hint, next_hint);

        return bit;
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);

/*
 * Record @min_shallow_depth in @sbq and recompute the wake batch for the
 * current bitmap depth, since the batch calculation depends on it.
 */
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
                                     unsigned int min_shallow_depth)
{
        sbq->min_shallow_depth = min_shallow_depth;
        sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);

/*
 * Find the wait queue of @sbq that should be woken next.
 *
 * Starting at sbq->wake_index, the SBQ_WAIT_QUEUES queues are scanned for
 * the first one with waiters, and wake_index is moved there if it differs.
 * Returns NULL when ws_active is zero or no queue has waiters.
 */
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
        struct sbq_wait_state *candidate;
        int scanned, idx;

        if (atomic_read(&sbq->ws_active) == 0)
                return NULL;

        idx = atomic_read(&sbq->wake_index);
        for (scanned = 0; scanned < SBQ_WAIT_QUEUES;
             scanned++, idx = sbq_index_inc(idx)) {
                candidate = &sbq->ws[idx];
                if (!waitqueue_active(&candidate->wait))
                        continue;

                if (atomic_read(&sbq->wake_index) != idx)
                        atomic_set(&sbq->wake_index, idx);
                return candidate;
        }

        return NULL;
}

/*
 * Account one freed bit against the current wait queue of @sbq and wake a
 * batch of waiters once that queue's wait count is used up.
 *
 * Returns true if the caller should call this function again (it lost the
 * race to reset the wait count), false otherwise.
 */
static bool __sbq_wake_up(struct sbitmap_queue *sbq)
{
        struct sbq_wait_state *ws;
        unsigned int wake_batch;
        int wait_cnt;

        ws = sbq_wake_ptr(sbq);
        if (!ws)
                return false;

        wait_cnt = atomic_dec_return(&ws->wait_cnt);
        if (wait_cnt <= 0) {
                int ret;

                wake_batch = READ_ONCE(sbq->wake_batch);

                /*
                 * Pairs with the memory barrier in
                 * sbitmap_queue_update_wake_batch() (reached via
                 * sbitmap_queue_resize()) to ensure that we see the batch
                 * size update before the wait count is reset.
                 */
                smp_mb__before_atomic();

                /*
                 * For concurrent callers of this, the one that failed the
                 * atomic_cmpxchg() race should call this function again
                 * to wakeup a new batch on a different 'ws'.
                 */
                ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
                if (ret == wait_cnt) {
                        /* We reset the count: advance the index and wake a batch. */
                        sbq_index_atomic_inc(&sbq->wake_index);
                        wake_up_nr(&ws->wait, wake_batch);
                        return false;
                }

                return true;
        }

        return false;
}

/*
 * Run the wake-up accounting for one freed bit of @sbq, repeating for as
 * long as __sbq_wake_up() asks for another attempt.
 */
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
{
        while (__sbq_wake_up(sbq))
                ;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);

/*
 * Release bit @nr of @sbq: mark it through sbitmap_deferred_clear_bit(),
 * run the wake-up accounting, and, for a non-round-robin queue with @nr
 * inside the current depth, store @nr as the allocation hint of @cpu.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu)
{
        /*
         * Once the clear bit is set, the bit may be allocated out.
         *
         * Orders READ/WRITE on the associated instance (such as request
         * of blk_mq) by this bit for avoiding race with re-allocation,
         * and its pair is the memory barrier implied in __sbitmap_get_word.
         *
         * One invariant is that the clear bit has to be zero when the bit
         * is in use.
         */
        smp_mb__before_atomic();
        sbitmap_deferred_clear_bit(&sbq->sb, nr);

        /*
         * Pairs with the memory barrier in set_current_state() to ensure the
         * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
         * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
         * waiter. See the comment on waitqueue_active().
         */
        smp_mb__after_atomic();
        sbitmap_queue_wake_up(sbq);

        /* Remember the freed bit as the next place to look on @cpu. */
        if (likely(!sbq->round_robin && nr < sbq->sb.depth))
                *per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);

/*
 * Wake up the waiters on every wait queue of @sbq that has any, visiting
 * the queues in order starting from sbq->wake_index.
 */
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
{
        int remaining, idx;

        /*
         * Pairs with the memory barrier in set_current_state() like in
         * sbitmap_queue_wake_up().
         */
        smp_mb();

        idx = atomic_read(&sbq->wake_index);
        for (remaining = SBQ_WAIT_QUEUES; remaining > 0; remaining--) {
                struct sbq_wait_state *ws = &sbq->ws[idx];

                if (waitqueue_active(&ws->wait))
                        wake_up(&ws->wait);

                idx = sbq_index_inc(idx);
        }
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);

void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
{
        bool first;
        int i;

        sbitmap_show(&sbq->sb, m);

        seq_puts(m, "alloc_hint={");
        first = true;
        for_each_possible_cpu(i) {
                if (!first)
                        seq_puts(m, ", ");
                first = false;
                seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
        }
        seq_puts(m, "}\n");

        seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
        seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
        seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));

        seq_puts(m, "ws={\n");
        for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
                struct sbq_wait_state *ws = &sbq->ws[i];

                seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
                           atomic_read(&ws->wait_cnt),
                           waitqueue_active(&ws->wait) ? "active" : "inactive");
        }
        seq_puts(m, "}\n");

        seq_printf(m, "round_robin=%d\n", sbq->round_robin);
        seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);

/*
 * Attach @sbq_wait to @sbq and queue it on @ws, counting it in
 * sbq->ws_active.  Does nothing if @sbq_wait is already attached to a queue.
 */
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
                            struct sbq_wait_state *ws,
                            struct sbq_wait *sbq_wait)
{
        if (sbq_wait->sbq)
                return;

        sbq_wait->sbq = sbq;
        atomic_inc(&sbq->ws_active);
        add_wait_queue(&ws->wait, &sbq_wait->wait);
}
EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue);

/*
 * Unlink @sbq_wait from its wait list and, if it is attached to a queue,
 * drop it from that queue's ws_active count and detach it.
 */
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
{
        list_del_init(&sbq_wait->wait.entry);

        if (!sbq_wait->sbq)
                return;

        atomic_dec(&sbq_wait->sbq->ws_active);
        sbq_wait->sbq = NULL;
}
EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue);

/*
 * Prepare @sbq_wait to wait exclusively on @ws in task state @state.  The
 * first call for a given @sbq_wait also attaches it to @sbq and counts it
 * in sbq->ws_active; later calls leave that accounting unchanged.
 */
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
                             struct sbq_wait_state *ws,
                             struct sbq_wait *sbq_wait, int state)
{
        /* A NULL ->sbq means this waiter has not been counted yet. */
        if (!sbq_wait->sbq) {
                atomic_inc(&sbq->ws_active);
                sbq_wait->sbq = sbq;
        }
        prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
}
EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);

/*
 * Finish waiting on @ws with @sbq_wait and, if the waiter is still attached
 * to a queue, drop it from sbq->ws_active and detach it.
 */
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
                         struct sbq_wait *sbq_wait)
{
        finish_wait(&ws->wait, &sbq_wait->wait);

        if (!sbq_wait->sbq)
                return;

        atomic_dec(&sbq->ws_active);
        sbq_wait->sbq = NULL;
}
EXPORT_SYMBOL_GPL(sbitmap_finish_wait);


















































































    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

/*
 * Gives us 8 prio classes with 13-bits of data for each class
 *
 * An I/O priority value keeps the class in the bits at and above
 * IOPRIO_CLASS_SHIFT and the per-class data in the bits below it.
 */
#define IOPRIO_CLASS_SHIFT        (13)
#define IOPRIO_PRIO_MASK        ((1UL << IOPRIO_CLASS_SHIFT) - 1)

/* Extract the class, or the per-class data, from a priority value. */
#define IOPRIO_PRIO_CLASS(mask)        ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask)        ((mask) & IOPRIO_PRIO_MASK)
/*
 * Build a priority value from a class and its data.  Both arguments are
 * parenthesized so that an expression with low-precedence operators (for
 * example a conditional expression) is combined as a single operand.
 */
#define IOPRIO_PRIO_VALUE(class, data)        (((class) << IOPRIO_CLASS_SHIFT) | (data))

/* True when the value carries a class other than IOPRIO_CLASS_NONE. */
#define ioprio_valid(mask)        (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)

/*
 * These are the io priority groups as implemented by CFQ. RT is the realtime
 * class, it always gets premium service. BE is the best-effort scheduling
 * class, the default for any process. IDLE is the idle scheduling class, it
 * is only served when no one else is using the disk.
 */
/* I/O priority classes, numbered from 0 in declaration order. */
enum {
        IOPRIO_CLASS_NONE,        /* 0: ioprio_valid() is false for this class */
        IOPRIO_CLASS_RT,        /* 1: realtime */
        IOPRIO_CLASS_BE,        /* 2: best-effort */
        IOPRIO_CLASS_IDLE,        /* 3: idle */
};

/*
 * 8 best effort priority levels are supported
 */
#define IOPRIO_BE_NR        (8)

/*
 * NOTE(review): presumably the selector for what kind of target an I/O
 * priority request names (a process, a process group or a user); confirm
 * against the users of these constants.  Numbering starts at 1.
 */
enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
};

/*
 * Fallback BE priority.  This is also the level task_nice_ioprio() computes
 * for a nice value of 0.
 */
#define IOPRIO_NORM        (4)

/*
 * Convert the cpu scheduler nice value of @task into an io priority level,
 * for use when the process has not set an io priority explicitly.  The nice
 * value is shifted up by 20 and divided by 5 (integer division), so a nice
 * value of 0 gives level 4.
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
        int shifted_nice = task_nice(task) + 20;

        return shifted_nice / 5;
}

/*
 * Pick an IO class for a task that hasn't asked for a specific one:
 * SCHED_IDLE tasks get the idle class, realtime tasks the realtime class,
 * and everything else is best-effort.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
        if (task->policy == SCHED_IDLE)
                return IOPRIO_CLASS_IDLE;

        if (task_is_realtime(task))
                return IOPRIO_CLASS_RT;

        return IOPRIO_CLASS_BE;
}

/*
 * Return the I/O priority stored in the calling process's io_context.  If
 * the process has no io_context, return the default: class
 * IOPRIO_CLASS_NONE with data 0.
 */
static inline int get_current_ioprio(void)
{
        struct io_context *ioc = current->io_context;

        if (!ioc)
                return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

        return ioc->ioprio;
}

/*
 * For inheritance, return the highest of the two given priorities
 */
extern int ioprio_best(unsigned short aprio, unsigned short bprio);

/*
 * NOTE(review): presumably sets the I/O priority of @task to @ioprio and
 * returns 0 or a negative errno; confirm against the definition.
 */
extern int set_task_ioprio(struct task_struct *task, int ioprio);

/*
 * With CONFIG_BLOCK, ioprio_check_cap() is defined out of line.  Without
 * it, the inline stub below returns -ENOTBLK for every priority value.
 */
#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
static inline int ioprio_check_cap(int ioprio)
{
        return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif

































































































































































































































































































    1 


























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kernfs.h - pseudo filesystem decoupled from vfs locking
 */

#ifndef __LINUX_KERNFS_H
#define __LINUX_KERNFS_H

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/wait.h>

struct file;
struct dentry;
struct iattr;
struct seq_file;
struct vm_area_struct;
struct super_block;
struct file_system_type;
struct poll_table_struct;
struct fs_context;

struct kernfs_fs_context;
struct kernfs_open_node;
struct kernfs_iattrs;

/*
 * Node type bits.  All three values lie within KERNFS_TYPE_MASK (0x000f),
 * separate from the KERNFS_* node flag bits, which start at 0x0010.
 */
enum kernfs_node_type {
        KERNFS_DIR                = 0x0001,
        KERNFS_FILE                = 0x0002,
        KERNFS_LINK                = 0x0004,
};

/* Bits holding the node type (enum kernfs_node_type). */
#define KERNFS_TYPE_MASK                0x000f
/*
 * Every bit outside the type bits.  The expansion is parenthesized so the
 * macro always acts as a single operand wherever it is used.
 */
#define KERNFS_FLAG_MASK                (~KERNFS_TYPE_MASK)
#define KERNFS_MAX_USER_XATTRS                128
/* 128 << 10 = 131072 */
#define KERNFS_USER_XATTR_SIZE_LIMIT        (128 << 10)

/*
 * Node flag bits.  Every value lies outside KERNFS_TYPE_MASK, so a flag can
 * be combined with a kernfs_node_type value in one word.  Bit 0x0200 is not
 * assigned.
 */
enum kernfs_node_flag {
        KERNFS_ACTIVATED        = 0x0010,
        KERNFS_NS                = 0x0020,
        KERNFS_HAS_SEQ_SHOW        = 0x0040,
        KERNFS_HAS_MMAP                = 0x0080,
        KERNFS_LOCKDEP                = 0x0100,
        KERNFS_SUICIDAL                = 0x0400,
        KERNFS_SUICIDED                = 0x0800,
        KERNFS_EMPTY_DIR        = 0x1000,
        KERNFS_HAS_RELEASE        = 0x2000,
};

/*
 * @flags for kernfs_create_root().  Each value is a distinct bit, so several
 * can be OR-ed together; struct kernfs_root keeps them in its flags field.
 */
enum kernfs_root_flag {
        /*
         * kernfs_nodes are created in the deactivated state and invisible.
         * They require explicit kernfs_activate() to become visible.  This
         * can be used to make related nodes become visible atomically
         * after all nodes are created successfully.
         */
        KERNFS_ROOT_CREATE_DEACTIVATED                = 0x0001,

        /*
         * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2)
         * succeeds regardless of the RW permissions.  sysfs had an extra
         * layer of enforcement where open(2) fails with -EACCES regardless
         * of CAP_DAC_OVERRIDE if the permission doesn't have the
         * respective read or write access at all (none of S_IRUGO or
         * S_IWUGO) or the respective operation isn't implemented.  The
         * following flag enables that behavior.
         */
        KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK        = 0x0002,

        /*
         * The filesystem supports exportfs operation, so userspace can use
         * fhandle to access nodes of the fs.
         */
        KERNFS_ROOT_SUPPORT_EXPORTOP                = 0x0004,

        /*
         * Support user xattrs to be written to nodes rooted at this root.
         */
        KERNFS_ROOT_SUPPORT_USER_XATTR                = 0x0008,
};

/* type-specific structures for kernfs_node union members */

/* Directory-specific part of a kernfs_node (the 'dir' union member). */
struct kernfs_elem_dir {
        /* NOTE(review): presumably the number of child directories; confirm. */
        unsigned long                subdirs;
        /* children rbtree starts here and goes through kn->rb */
        struct rb_root                children;

        /*
         * The kernfs hierarchy this directory belongs to.  This fits
         * better directly in kernfs_node but is here to save space.
         */
        struct kernfs_root        *root;
};

/* Symlink-specific part of a kernfs_node (the 'symlink' union member). */
struct kernfs_elem_symlink {
        /* the node this link refers to */
        struct kernfs_node        *target_kn;
};

/* File-specific part of a kernfs_node (the 'attr' union member). */
struct kernfs_elem_attr {
        /* operations implementing this file */
        const struct kernfs_ops        *ops;
        struct kernfs_open_node        *open;
        loff_t                        size;
        struct kernfs_node        *notify_next;        /* for kernfs_notify() */
};

/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as a reference on @count is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * an active reference (@active).
 */
struct kernfs_node {
        atomic_t                count;
        atomic_t                active;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
        /*
         * Use kernfs_get_parent() and kernfs_name/path() instead of
         * accessing the following two fields directly.  If the node is
         * never moved to a different parent, it is safe to access the
         * parent directly.
         */
        struct kernfs_node        *parent;
        const char                *name;

        /* links this node into its parent's dir.children rbtree */
        struct rb_node                rb;

        const void                *ns;        /* namespace tag */
        unsigned int                hash;        /* ns + name hash */
        /* type-specific data; only one of the three members is in use */
        union {
                struct kernfs_elem_dir                dir;
                struct kernfs_elem_symlink        symlink;
                struct kernfs_elem_attr                attr;
        };

        void                        *priv;

        /*
         * 64bit unique ID.  On 64bit ino setups, id is the ino.  On 32bit,
         * the low 32bits are ino and upper generation.
         */
        u64                        id;

        /* KERNFS_* node type and node flag bits */
        unsigned short                flags;
        umode_t                        mode;
        struct kernfs_iattrs        *iattr;
};

/*
 * kernfs_syscall_ops may be specified on kernfs_create_root() to support
 * syscalls.  These optional callbacks are invoked on the matching syscalls
 * and can perform any kernfs operations which don't necessarily have to be
 * the exact operation requested.  An active reference is held for each
 * kernfs_node parameter.
 *
 * Every callback returns an int.  NOTE(review): presumably 0 on success and
 * a negative errno on failure; confirm against the callers.
 */
struct kernfs_syscall_ops {
        int (*show_options)(struct seq_file *sf, struct kernfs_root *root);

        int (*mkdir)(struct kernfs_node *parent, const char *name,
                     umode_t mode);
        int (*rmdir)(struct kernfs_node *kn);
        int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
                      const char *new_name);
        int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
                         struct kernfs_root *root);
};

/*
 * Root object of one kernfs hierarchy.  kernfs_create_root() returns a
 * pointer to one and kernfs_destroy_root() takes it back.
 */
struct kernfs_root {
        /* published fields */
        /* presumably the root directory node of the hierarchy -- confirm */
        struct kernfs_node        *kn;
        unsigned int                flags;        /* KERNFS_ROOT_* flags */

        /* private fields, do not use outside kernfs proper */
        struct idr                ino_idr;
        /*
         * NOTE(review): these look like the two 32bit halves used to build
         * kernfs_node::id (low 32 bits ino, high 32 bits generation on
         * 32bit ino setups) -- confirm against the allocator.
         */
        u32                        last_id_lowbits;
        u32                        id_highbits;
        /* callback table, see struct kernfs_syscall_ops */
        struct kernfs_syscall_ops *syscall_ops;

        /* list of kernfs_super_info of this root, protected by kernfs_mutex */
        struct list_head        supers;

        wait_queue_head_t        deactivate_waitq;
};

/*
 * Per-open state of a kernfs file.  Every file method in struct kernfs_ops
 * (open, release, read, write, poll, mmap) receives a pointer to one.
 */
struct kernfs_open_file {
        /* published fields */
        struct kernfs_node        *kn;
        struct file                *file;
        /* per the kernfs_ops comments, ->private of this points back here */
        struct seq_file                *seq_file;
        void                        *priv;

        /* private fields, do not use outside kernfs proper */
        struct mutex                mutex;
        struct mutex                prealloc_mutex;
        int                        event;
        struct list_head        list;
        /* presumably the buffer set up when kernfs_ops::prealloc is set -- confirm */
        char                        *prealloc_buf;

        /* presumably mirrors kernfs_ops::atomic_write_len -- confirm */
        size_t                        atomic_write_len;
        bool                        mmapped:1;
        bool                        released:1;
        const struct vm_operations_struct *vm_ops;
};

/*
 * File operations of a kernfs file.  A pointer to one is handed to
 * __kernfs_create_file() / kernfs_create_file_ns() when the file is created.
 */
struct kernfs_ops {
        /*
         * Optional open/release methods.  Both are called with
         * @of->seq_file populated.
         */
        int (*open)(struct kernfs_open_file *of);
        void (*release)(struct kernfs_open_file *of);

        /*
         * Read is handled by either seq_file or raw_read().
         *
         * If seq_show() is present, seq_file path is active.  Other seq
         * operations are optional and if not implemented, the behavior is
         * equivalent to single_open().  @sf->private points to the
         * associated kernfs_open_file.
         *
         * read() is bounced through kernel buffer and a read larger than
         * PAGE_SIZE results in partial operation of PAGE_SIZE.
         */
        int (*seq_show)(struct seq_file *sf, void *v);

        void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
        void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
        void (*seq_stop)(struct seq_file *sf, void *v);

        ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
                        loff_t off);

        /*
         * write() is bounced through kernel buffer.  If atomic_write_len
         * is not set, a write larger than PAGE_SIZE results in partial
         * operations of PAGE_SIZE chunks.  If atomic_write_len is set,
         * writes up to the specified size are executed atomically but
         * larger ones are rejected with -E2BIG.
         */
        size_t atomic_write_len;
        /*
         * "prealloc" causes a buffer to be allocated at open for
         * all read/write requests.  As ->seq_show uses seq_read()
         * which does its own allocation, it is incompatible with
         * ->prealloc.  Provide ->read and ->write with ->prealloc.
         */
        bool prealloc;
        ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
                         loff_t off);

        /* poll method; kernfs_generic_poll() has a compatible signature */
        __poll_t (*poll)(struct kernfs_open_file *of,
                         struct poll_table_struct *pt);

        int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* kernfs_create_file_ns() passes this as the lock class key */
        struct lock_class_key        lockdep_key;
#endif
};

/*
 * The kernfs superblock creation/mount parameter context.
 *
 * NOTE(review): the field comment below names kernfs_mount(), but this
 * header declares no such function; kernfs_get_tree() looks like the
 * current entry point -- confirm and update the reference.
 */
struct kernfs_fs_context {
        struct kernfs_root        *root;                /* Root of the hierarchy being mounted */
        void                        *ns_tag;        /* Namespace tag of the mount (or NULL) */
        unsigned long                magic;                /* File system specific magic number */

        /* The following are set/used by kernfs_mount() */
        bool                        new_sb_created;        /* Set to T if we allocated a new sb */
};

#ifdef CONFIG_KERNFS

/* Extract the node-type bits (KERNFS_TYPE_MASK) from the flags of @kn. */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
        return (enum kernfs_node_type)(kn->flags & KERNFS_TYPE_MASK);
}

/*
 * Derive the inode number from a 64bit kernfs id: the low 32 bits when
 * ino_t is narrower than u64, otherwise the id itself.
 */
static inline ino_t kernfs_id_ino(u64 id)
{
        if (sizeof(ino_t) < sizeof(u64))
                return (u32)id;

        return id;
}

/*
 * Derive the generation from a 64bit kernfs id: the high 32 bits when
 * ino_t is narrower than u64, otherwise the fixed value 1.
 */
static inline u32 kernfs_id_gen(u64 id)
{
        if (sizeof(ino_t) < sizeof(u64))
                return id >> 32;

        return 1;
}

static inline ino_t kernfs_ino(struct kernfs_node *kn)
{
        return kernfs_id_ino(kn->id);
}

static inline ino_t kernfs_gen(struct kernfs_node *kn)
{
        return kernfs_id_gen(kn->id);
}

/**
 * kernfs_enable_ns - enable namespace under a directory
 * @kn: directory of interest, should be empty
 *
 * Call this right after @kn has been created.  Once enabled, all children
 * of @kn must carry non-NULL namespace tags and only those matching the
 * super_block's tag will be visible.
 *
 * Warns once if @kn is not a directory or already has children; the flag
 * is set regardless.
 */
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
        WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));

        kn->flags = kn->flags | KERNFS_NS;
}

/**
 * kernfs_ns_enabled - test whether namespace is enabled
 * @kn: the node to test
 *
 * Returns true if KERNFS_NS is set in @kn's flags, i.e. namespace
 * filtering is enabled for the children of @kn.
 */
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
        return (kn->flags & KERNFS_NS) != 0;
}

/*
 * Out-of-line kernfs API, declared only when CONFIG_KERNFS is set.  The
 * #else branch of this conditional provides inline stubs for most of these
 * names; kernfs_node_dentry(), kernfs_create_empty_dir(),
 * kernfs_break_active_protection(), kernfs_unbreak_active_protection(),
 * kernfs_generic_poll() and kernfs_find_and_get_node_by_id() have no stub
 * there.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn,
                          char *buf, size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
void pr_cont_kernfs_path(struct kernfs_node *kn);
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                           const char *name, const void *ns);
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                           const char *path, const void *ns);
void kernfs_get(struct kernfs_node *kn);
void kernfs_put(struct kernfs_node *kn);

struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);

struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
                                  struct super_block *sb);
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                       unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);

struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         void *priv, const void *ns);
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
                                            const char *name);
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         loff_t size,
                                         const struct kernfs_ops *ops,
                                         void *priv, const void *ns,
                                         struct lock_class_key *key);
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                                       const char *name,
                                       struct kernfs_node *target);
void kernfs_activate(struct kernfs_node *kn);
void kernfs_remove(struct kernfs_node *kn);
void kernfs_break_active_protection(struct kernfs_node *kn);
void kernfs_unbreak_active_protection(struct kernfs_node *kn);
bool kernfs_remove_self(struct kernfs_node *kn);
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
                             const void *ns);
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
                     const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
                             struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);

int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                     void *value, size_t size);
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                     const void *value, size_t size, int flags);

const void *kernfs_super_ns(struct super_block *sb);
int kernfs_get_tree(struct fs_context *fc);
void kernfs_free_fs_context(struct fs_context *fc);
void kernfs_kill_sb(struct super_block *sb);

void kernfs_init(void);

struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
                                                   u64 id);
#else        /* CONFIG_KERNFS */

/* Stub for builds without CONFIG_KERNFS: the result is a placeholder. */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
        return 0;        /* whatever */
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: namespaces are never enabled. */
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
        return false;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_path_from_node(struct kernfs_node *root_kn,
                                        struct kernfs_node *kn,
                                        char *buf, size_t buflen)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: prints nothing. */
static inline void pr_cont_kernfs_name(struct kernfs_node *kn)
{
}
/* Stub for builds without CONFIG_KERNFS: prints nothing. */
static inline void pr_cont_kernfs_path(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct kernfs_node *
kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
                       const void *ns)
{
        return NULL;
}
/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct kernfs_node *
kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path,
                       const void *ns)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_get(struct kernfs_node *kn)
{
}
/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_put(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline struct inode *
kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always returns ERR_PTR(-ENOSYS). */
static inline struct kernfs_root *
kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
                   void *priv)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_destroy_root(struct kernfs_root *root)
{
}

/* Stub for builds without CONFIG_KERNFS: always returns ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *
kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
                     umode_t mode, kuid_t uid, kgid_t gid,
                     void *priv, const void *ns)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: always returns ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *
__kernfs_create_file(struct kernfs_node *parent, const char *name,
                     umode_t mode, kuid_t uid, kgid_t gid,
                     loff_t size, const struct kernfs_ops *ops,
                     void *priv, const void *ns, struct lock_class_key *key)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: always returns ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *
kernfs_create_link(struct kernfs_node *parent, const char *name,
                   struct kernfs_node *target)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_activate(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_remove(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: always returns false. */
static inline bool kernfs_remove_self(struct kernfs_node *kn)
{
        return false;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
                                           const char *name, const void *ns)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_rename_ns(struct kernfs_node *kn,
                                   struct kernfs_node *new_parent,
                                   const char *new_name, const void *new_ns)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_setattr(struct kernfs_node *kn,
                                 const struct iattr *iattr)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_notify(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                                   void *value, size_t size)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                                   const void *value, size_t size, int flags)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always returns NULL. */
static inline const void *kernfs_super_ns(struct super_block *sb)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always returns -ENOSYS. */
static inline int kernfs_get_tree(struct fs_context *fc)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_free_fs_context(struct fs_context *fc)
{
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_kill_sb(struct super_block *sb)
{
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_init(void)
{
}

#endif        /* CONFIG_KERNFS */

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's path into
 * @buflen: size of @buf
 *
 * If @kn is NULL result will be "(null)".
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 *
 * NOTE(review): @kn goes into the first argument of kernfs_path_from_node()
 * and NULL into the second, whereas the prototype in this header names those
 * parameters root_kn and kn -- confirm the prototype's parameter names
 * against the implementation.
 */
static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
        int len = kernfs_path_from_node(kn, NULL, buf, buflen);

        return len;
}

/* kernfs_find_and_get_ns() with a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_find_and_get(struct kernfs_node *kn, const char *name)
{
        struct kernfs_node *found = kernfs_find_and_get_ns(kn, name, NULL);

        return found;
}

/* kernfs_walk_and_get_ns() with a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_walk_and_get(struct kernfs_node *kn, const char *path)
{
        struct kernfs_node *found = kernfs_walk_and_get_ns(kn, path, NULL);

        return found;
}

/*
 * kernfs_create_dir_ns() with GLOBAL_ROOT_UID/GLOBAL_ROOT_GID as owner and a
 * NULL namespace tag.
 */
static inline struct kernfs_node *
kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
                  void *priv)
{
        struct kernfs_node *dir;

        dir = kernfs_create_dir_ns(parent, name, mode,
                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
                                   priv, NULL);
        return dir;
}

/*
 * __kernfs_create_file() with the lock class key chosen automatically:
 * &ops->lockdep_key when CONFIG_DEBUG_LOCK_ALLOC is set, NULL otherwise.
 * The cast drops the const that @ops carries.
 */
static inline struct kernfs_node *
kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
                      umode_t mode, kuid_t uid, kgid_t gid,
                      loff_t size, const struct kernfs_ops *ops,
                      void *priv, const void *ns)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lock_class_key *key = (struct lock_class_key *)&ops->lockdep_key;
#else
        struct lock_class_key *key = NULL;
#endif

        return __kernfs_create_file(parent, name, mode, uid, gid,
                                    size, ops, priv, ns, key);
}

/*
 * kernfs_create_file_ns() with GLOBAL_ROOT_UID/GLOBAL_ROOT_GID as owner and
 * a NULL namespace tag.
 */
static inline struct kernfs_node *
kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
                   loff_t size, const struct kernfs_ops *ops, void *priv)
{
        struct kernfs_node *file;

        file = kernfs_create_file_ns(parent, name, mode,
                                     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
                                     size, ops, priv, NULL);
        return file;
}

/* kernfs_remove_by_name_ns() with a NULL namespace tag. */
static inline int kernfs_remove_by_name(struct kernfs_node *parent,
                                        const char *name)
{
        int ret = kernfs_remove_by_name_ns(parent, name, NULL);

        return ret;
}

/* kernfs_rename_ns() with a NULL new namespace tag. */
static inline int kernfs_rename(struct kernfs_node *kn,
                                struct kernfs_node *new_parent,
                                const char *new_name)
{
        int ret = kernfs_rename_ns(kn, new_parent, new_name, NULL);

        return ret;
}

#endif        /* __LINUX_KERNFS_H */























































    2 























































































































































    5 



























   11 















    1 






















































































    1 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H

/*
 * Copyright 1992, Linus Torvalds.
 *
 * Note: inlines with more than a single statement should be marked
 * __always_inline to avoid problems with older gcc's inlining heuristics.
 */

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * log2(BITS_PER_LONG): shifting a bit number right by this amount gives the
 * index of the unsigned long that holds the bit (see constant_test_bit()).
 */
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif

/* U64_C(1) shifted left by @n, i.e. a 64-bit value with only bit @n set. */
#define BIT_64(n)                        (U64_C(1) << (n))

/*
 * These have to be done with inline assembly: that way the bit-setting
 * is guaranteed to be atomic. All bit operations return 0 if the bit
 * was cleared before the operation and != 0 if it was not.
 *
 * bit 0 is the LSB of addr[0]; bit BITS_PER_LONG (32 or 64) is the LSB of addr[1].
 */

/*
 * Inline-asm memory operands for the object at @x: RLONG_ADDR is an input
 * ("m") viewed as a volatile long, WBYTE_ADDR a read-write ("+m") operand
 * viewed as a volatile char.  ADDR is shorthand for RLONG_ADDR(addr) and is
 * #undef'd further down in this file.
 */
#define RLONG_ADDR(x)                         "m" (*(volatile long *) (x))
#define WBYTE_ADDR(x)                        "+m" (*(volatile char *) (x))

#define ADDR                                RLONG_ADDR(addr)

/*
 * We do the locked ops that don't return the old value as
 * a mask operation on a byte.
 */
/*
 * CONST_MASK_ADDR: read-write byte operand for the byte that contains bit
 * @nr (byte offset nr >> 3 from @addr).
 * CONST_MASK: mask of bit @nr within that byte (bit nr & 7).
 */
#define CONST_MASK_ADDR(nr, addr)        WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr)                        (1 << ((nr) & 7))

/*
 * arch_set_bit - set bit @nr in the bitmap at @addr with a LOCK_PREFIX'ed
 * instruction.
 *
 * When @nr is a compile-time constant, a byte-wide OR of CONST_MASK(nr) is
 * applied to the byte containing the bit; otherwise BTS is used with @nr as
 * an immediate or register operand.  Both forms list a "memory" clobber.
 */
static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "orb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr))
                        : "memory");
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch___set_bit - set bit @nr in the bitmap at @addr using BTS without
 * LOCK_PREFIX (contrast with arch_set_bit()).
 */
static __always_inline void
arch___set_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_clear_bit - clear bit @nr in the bitmap at @addr with a
 * LOCK_PREFIX'ed instruction.
 *
 * When @nr is a compile-time constant, a byte-wide AND with ~CONST_MASK(nr)
 * is applied to the byte containing the bit; otherwise BTR is used.
 *
 * NOTE(review): unlike arch_set_bit(), the constant-@nr path lists no
 * "memory" clobber; only the addressed byte is declared as modified --
 * confirm this asymmetry is intended.
 */
static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "andb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (~CONST_MASK(nr)));
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch_clear_bit_unlock - issue barrier() and then clear bit @nr at @addr
 * via arch_clear_bit().
 */
static __always_inline void arch_clear_bit_unlock(long nr,
                                                  volatile unsigned long *addr)
{
        barrier();

        arch_clear_bit(nr, addr);
}

/*
 * arch___clear_bit - clear bit @nr in the bitmap at @addr using BTR without
 * LOCK_PREFIX (contrast with arch_clear_bit()).
 */
static __always_inline void
arch___clear_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_clear_bit_unlock_is_negative_byte - clear bit @nr in the first byte
 * at @addr with a LOCK_PREFIX'ed byte AND and return the sign condition
 * ("s") of the result.
 *
 * The mask is (char) ~(1 << nr) and the operand is the single byte at @addr,
 * so only bits 0-7 can be cleared this way; presumably callers always pass
 * nr < 8 -- confirm.
 *
 * The self-referential #define after the function makes the name visible to
 * the preprocessor, presumably so generic code can test for an arch override
 * with #ifdef -- confirm.
 */
static __always_inline bool
arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
        bool negative;
        asm volatile(LOCK_PREFIX "andb %2,%1"
                CC_SET(s)
                : CC_OUT(s) (negative), WBYTE_ADDR(addr)
                : "ir" ((char) ~(1 << nr)) : "memory");
        return negative;
}
#define arch_clear_bit_unlock_is_negative_byte                                 \
        arch_clear_bit_unlock_is_negative_byte

/*
 * arch___clear_bit_unlock - clear bit @nr at @addr by delegating to
 * arch___clear_bit(); nothing else is done here.
 */
static __always_inline void arch___clear_bit_unlock(long nr,
                                                    volatile unsigned long *addr)
{
        arch___clear_bit(nr, addr);
}

/*
 * arch___change_bit - toggle bit @nr in the bitmap at @addr using BTC
 * without LOCK_PREFIX (contrast with arch_change_bit()).
 */
static __always_inline void
arch___change_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_change_bit - toggle bit @nr in the bitmap at @addr with a
 * LOCK_PREFIX'ed instruction.
 *
 * When @nr is a compile-time constant, a byte-wide XOR with CONST_MASK(nr)
 * is applied to the byte containing the bit; otherwise BTC is used.  As in
 * arch_clear_bit(), the constant-@nr path lists no "memory" clobber.
 */
static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "xorb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr)));
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch_test_and_set_bit - LOCK_PREFIX'ed BTS on bit @nr of *@addr, returning
 * the carry condition ("c") through GEN_BINARY_RMWcc().  Per the note near
 * the top of this file, the result is 0 if the bit was clear before the
 * operation and non-zero otherwise.
 */
static __always_inline bool
arch_test_and_set_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
}

/*
 * arch_test_and_set_bit_lock - same operation as arch_test_and_set_bit();
 * this variant simply forwards to it and returns its result.
 */
static __always_inline bool arch_test_and_set_bit_lock(long nr,
                                                       volatile unsigned long *addr)
{
        bool was_set = arch_test_and_set_bit(nr, addr);

        return was_set;
}

/*
 * arch___test_and_set_bit - BTS without LOCK_PREFIX on bit @nr of the bitmap
 * at @addr; returns the carry condition ("c"), i.e. the previous value of
 * the bit.
 *
 * NOTE(review): this is a plain asm() where the sibling __test_and_*
 * helpers use asm volatile(); the "memory" clobber is still listed --
 * confirm the difference is intended.
 */
static __always_inline bool
arch___test_and_set_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm(__ASM_SIZE(bts) " %2,%1"
            CC_SET(c)
            : CC_OUT(c) (oldbit)
            : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/*
 * arch_test_and_clear_bit - LOCK_PREFIX'ed BTR on bit @nr of *@addr,
 * returning the carry condition ("c") through GEN_BINARY_RMWcc().  Per the
 * note near the top of this file, the result is 0 if the bit was clear
 * before the operation and non-zero otherwise.
 */
static __always_inline bool
arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}

/*
 * arch___test_and_clear_bit - BTR without LOCK_PREFIX on bit @nr of the
 * bitmap at @addr; returns the carry condition ("c"), i.e. the previous
 * value of the bit.
 *
 * Note: the operation is performed atomically with respect to
 * the local CPU, but not other CPUs. Portable code should not
 * rely on this behaviour.
 * KVM relies on this behaviour on x86 for modifying memory that is also
 * accessed from a hypervisor on the same CPU if running in a VM: don't change
 * this without also updating arch/x86/kernel/kvm.c
 */
static __always_inline bool
arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(btr) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/*
 * arch___test_and_change_bit - BTC without LOCK_PREFIX on bit @nr of the
 * bitmap at @addr; returns the carry condition ("c"), i.e. the previous
 * value of the bit.
 */
static __always_inline bool
arch___test_and_change_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(btc) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : ADDR, "Ir" (nr) : "memory");

        return oldbit;
}

/*
 * arch_test_and_change_bit - LOCK_PREFIX'ed BTC on bit @nr of *@addr,
 * returning the carry condition ("c") through GEN_BINARY_RMWcc().  Per the
 * note near the top of this file, the result is 0 if the bit was clear
 * before the operation and non-zero otherwise.
 */
static __always_inline bool
arch_test_and_change_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}

/*
 * constant_test_bit - test bit @nr of the bitmap at @addr in plain C.
 *
 * The word holding the bit is addr[nr >> _BITOPS_LONG_SHIFT]; the bit inside
 * that word is nr & (BITS_PER_LONG-1).  arch_test_bit() selects this helper
 * when @nr is a compile-time constant.
 */
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
        const unsigned long mask = 1UL << (nr & (BITS_PER_LONG-1));

        return (mask & addr[nr >> _BITOPS_LONG_SHIFT]) != 0;
}

/*
 * variable_test_bit - test bit @nr of the bitmap at @addr with the BT
 * instruction and return the carry condition ("c").  arch_test_bit() selects
 * this helper when @nr is not a compile-time constant.  The cast in the "m"
 * operand drops the const/volatile qualifiers of @addr; the operand is an
 * input only.
 */
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(bt) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");

        return oldbit;
}

/*
 * arch_test_bit - test bit @nr of the bitmap at @addr.  Dispatches to
 * constant_test_bit() when @nr is a compile-time constant and to
 * variable_test_bit() otherwise.  Being a macro, @nr appears more than once
 * in the expansion.
 */
#define arch_test_bit(nr, addr)                        \
        (__builtin_constant_p((nr))                \
         ? constant_test_bit((nr), (addr))        \
         : variable_test_bit((nr), (addr)))

/**
 * __ffs - find first set bit in word
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 *
 * Implemented with "rep; bsf"; the result is written over @word and
 * returned.
 */
static __always_inline unsigned long __ffs(unsigned long word)
{
        asm("rep; bsf %1,%0"
                : "=r" (word)
                : "rm" (word));
        return word;
}

/**
 * ffz - find first zero bit in word
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 *
 * Implemented as a first-set-bit search ("rep; bsf") on the complement of
 * @word.
 */
static __always_inline unsigned long ffz(unsigned long word)
{
        asm("rep; bsf %1,%0"
                : "=r" (word)
                : "r" (~word));
        return word;
}

/**
 * __fls - find last set bit in word
 * @word: The word to search
 *
 * Undefined if no set bit exists, so code should check against 0 first.
 *
 * Implemented with BSR; the result is written over @word and returned.
 */
static __always_inline unsigned long __fls(unsigned long word)
{
        asm("bsr %1,%0"
            : "=r" (word)
            : "rm" (word));
        return word;
}

#undef ADDR

#ifdef __KERNEL__
/**
 * ffs - find first set bit in word
 * @x: the word to search
 *
 * This is defined the same way as the libc and compiler builtin ffs
 * routines, therefore differs in spirit from the other bitops.
 *
 * ffs(value) returns 0 if value is 0 or the position of the first
 * set bit if value is nonzero. The first (least significant) bit
 * is at position 1.
 *
 * All three variants below arrange for r to be -1 when @x is 0, so the
 * final r + 1 yields 0 in that case.
 */
static __always_inline int ffs(int x)
{
        int r;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* "0" (-1) preloads the output register with -1 before BSFL runs */
        asm("bsfl %1,%0"
            : "=r" (r)
            : "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* CMOVZ loads -1 from operand 2 when the zero condition is set */
        asm("bsfl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "r" (-1));
#else
        /* JNZ skips the explicit load of -1 */
        asm("bsfl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        return r + 1;
}

/**
 * fls - find last set bit in word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffs, but returns the position of the most significant set bit.
 *
 * fls(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 32.
 *
 * All three variants below arrange for r to be -1 when @x is 0, so the
 * final r + 1 yields 0 in that case.
 */
static __always_inline int fls(unsigned int x)
{
        int r;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* "0" (-1) preloads the output register with -1 before BSRL runs */
        asm("bsrl %1,%0"
            : "=r" (r)
            : "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* CMOVZ loads -1 from operand 2 when the zero condition is set */
        asm("bsrl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "rm" (-1));
#else
        /* JNZ skips the explicit load of -1 */
        asm("bsrl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        return r + 1;
}

/**
 * fls64 - find last set bit in a 64-bit word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffsll, but returns the position of the most significant set bit.
 *
 * fls64(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 64.
 *
 * Only CONFIG_X86_64 gets the inline-asm version; other configurations
 * include the asm-generic implementation instead.
 */
#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
        /* in/out ("+r") operand: stays -1, giving a return of 0, when x == 0 */
        int bitpos = -1;
        /*
         * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before.
         */
        asm("bsrq %1,%q0"
            : "+r" (bitpos)
            : "rm" (x));
        return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif

#include <asm-generic/bitops/find.h>

#include <asm-generic/bitops/sched.h>

#include <asm/arch_hweight.h>

#include <asm-generic/bitops/const_hweight.h>

#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-non-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>

#include <asm-generic/bitops/le.h>

#include <asm-generic/bitops/ext2-atomic-setbit.h>

#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/truncate.h
 *
 * Common inline functions needed for truncate support
 */

/*
 * Truncate blocks that were not used by write. We have to truncate the
 * pagecache as well so that corresponding buffers get properly unmapped.
 *
 * The page cache is truncated from inode->i_size onwards and ext4_truncate()
 * is then called, both with the inode's i_mmap_sem held for writing.
 */
static inline void ext4_truncate_failed_write(struct inode *inode)
{
        /*
         * We don't need to call ext4_break_layouts() because the blocks we
         * are truncating were never visible to userspace.
         */
        down_write(&EXT4_I(inode)->i_mmap_sem);
        truncate_inode_pages(inode->i_mapping, inode->i_size);
        ext4_truncate(inode);
        up_write(&EXT4_I(inode)->i_mmap_sem);
}

/*
 * Estimate how many blocks the next chunk of a truncate transaction needs.
 *
 * i_blocks is scaled by (s_blocksize_bits - 9), clamped to the range
 * [2, EXT4_MAX_TRANS_DATA], and added to EXT4_DATA_TRANS_BLOCKS() for the
 * inode's super block.
 *
 * NOTE(review): the scaled value is stored in an ext4_lblk_t before it is
 * clamped; if that type is narrower than i_blocks the value is truncated
 * first -- confirm this is acceptable.
 */
static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /*
         * Lower bound: leave just enough room to cope with an inode whose
         * i_blocks is corrupt.  Disk corruption has produced random data
         * that looked enough like a regular file for ext4 to try to delete
         * it; things go a bit crazy then, but we should at least try not to
         * panic the whole kernel.
         */
        needed = needed < 2 ? 2 : needed;

        /* Upper bound: keep the transaction from overflowing the journal. */
        needed = needed > EXT4_MAX_TRANS_DATA ? EXT4_MAX_TRANS_DATA : needed;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}




























    2 



    2 












    2 
    2 





























































    2 



    2 




























































































































































































































































































































































































































































































    2 


    2 
    2 


    2 
    2 
    2 






























    2 









































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
 *
 * Fixes from William Schumacher incorporated on 15 March 2001.
 *    (Reported by Charles Bertsch, <CBertsch@microtest.com>).
 */

/*
 *  This file contains generic functions for manipulating
 *  POSIX 1003.1e draft standard 17 ACLs.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/export.h>
#include <linux/user_namespace.h>

/*
 * Map an ACL type to the address of the matching cache slot in @inode:
 * i_acl for ACL_TYPE_ACCESS, i_default_acl for ACL_TYPE_DEFAULT.  Any
 * other type hits BUG().
 */
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
        if (type == ACL_TYPE_ACCESS)
                return &inode->i_acl;
        if (type == ACL_TYPE_DEFAULT)
                return &inode->i_default_acl;
        BUG();
}

/*
 * Return the value held in @inode's cache slot for the given ACL type.
 *
 * When the slot holds a real ACL, a reference is taken on it with
 * refcount_inc_not_zero() before it is returned.  The result may also be
 * NULL or a value for which is_uncached_acl() is true; no reference is
 * taken in those two cases.
 */
struct posix_acl *get_cached_acl(struct inode *inode, int type)
{
        struct posix_acl **p = acl_by_type(inode, type);
        struct posix_acl *acl;

        for (;;) {
                rcu_read_lock();
                acl = rcu_dereference(*p);
                /*
                 * Stop if there is nothing to pin (NULL or an "uncached"
                 * marker) or if the reference was taken successfully.
                 */
                if (!acl || is_uncached_acl(acl) ||
                    refcount_inc_not_zero(&acl->a_refcount))
                        break;
                /*
                 * The refcount was already zero: leave the RCU read-side
                 * section and re-read the slot on the next iteration.
                 */
                rcu_read_unlock();
                cpu_relax();
        }
        /* Pairs with the rcu_read_lock() of the iteration that broke out. */
        rcu_read_unlock();
        return acl;
}
EXPORT_SYMBOL(get_cached_acl);

/*
 * Read @inode's cache slot for the given ACL type with rcu_dereference().
 * No reference is taken on the returned value.
 */
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
{
        struct posix_acl **slot = acl_by_type(inode, type);

        return rcu_dereference(*slot);
}
EXPORT_SYMBOL(get_cached_acl_rcu);

/*
 * Store @acl in @inode's cache slot for the given ACL type.
 *
 * The slot receives the result of posix_acl_dup(acl).  The value that was
 * in the slot before is released, unless it was an "uncached" marker.
 */
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        struct posix_acl **slot = acl_by_type(inode, type);
        struct posix_acl *prev = xchg(slot, posix_acl_dup(acl));

        if (is_uncached_acl(prev))
                return;
        posix_acl_release(prev);
}
EXPORT_SYMBOL(set_cached_acl);

/*
 * Reset the cache slot @p to ACL_NOT_CACHED and release the value it held
 * before, unless that value was itself an "uncached" marker.
 */
static void __forget_cached_acl(struct posix_acl **p)
{
        struct posix_acl *prev = xchg(p, ACL_NOT_CACHED);

        if (is_uncached_acl(prev))
                return;
        posix_acl_release(prev);
}

/*
 * Drop the cached ACL of the given type from @inode, leaving the slot in
 * the ACL_NOT_CACHED state.
 */
void forget_cached_acl(struct inode *inode, int type)
{
        struct posix_acl **slot = acl_by_type(inode, type);

        __forget_cached_acl(slot);
}
EXPORT_SYMBOL(forget_cached_acl);

void forget_all_cached_acls(struct inode *inode)
{
        __forget_cached_acl(&inode->i_acl);
        __forget_cached_acl(&inode->i_default_acl);
}
EXPORT_SYMBOL(forget_all_cached_acls);

/*
 * Return the ACL of the given type for @inode, consulting the per-inode
 * cache first and falling back to the filesystem's ->get_acl operation.
 *
 * Returns a cached value when one is present, NULL when the inode is not
 * IS_POSIXACL() or the filesystem has no ->get_acl, the error pointer from
 * ->get_acl on failure, or the ACL returned by ->get_acl otherwise.
 */
struct posix_acl *get_acl(struct inode *inode, int type)
{
        void *sentinel;
        struct posix_acl **p;
        struct posix_acl *acl;

        /*
         * The sentinel is used to detect when another operation like
         * set_cached_acl() or forget_cached_acl() races with get_acl().
         * It is guaranteed that is_uncached_acl(sentinel) is true.
         */

        /* Fast path: anything other than an "uncached" marker is the answer. */
        acl = get_cached_acl(inode, type);
        if (!is_uncached_acl(acl))
                return acl;

        if (!IS_POSIXACL(inode))
                return NULL;

        sentinel = uncached_acl_sentinel(current);
        p = acl_by_type(inode, type);

        /*
         * If the ACL isn't being read yet, set our sentinel.  Otherwise, the
         * current value of the ACL will not be ACL_NOT_CACHED and so our own
         * sentinel will not be set; another task will update the cache.  We
         * could wait for that other task to complete its job, but it's easier
         * to just call ->get_acl to fetch the ACL ourselves.  (This is going
         * to be an unlikely race.)
         */
        if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
                /* fall through */ ;

        /*
         * Normally, the ACL returned by ->get_acl will be cached.
         * A filesystem can prevent that by calling
         * forget_cached_acl(inode, type) in ->get_acl.
         *
         * If the filesystem doesn't have a get_acl() function at all, we'll
         * just create the negative cache entry.
         */
        if (!inode->i_op->get_acl) {
                set_cached_acl(inode, type, NULL);
                return NULL;
        }
        acl = inode->i_op->get_acl(inode, type);

        if (IS_ERR(acl)) {
                /*
                 * Remove our sentinel so that we don't block future attempts
                 * to cache the ACL.
                 */
                cmpxchg(p, sentinel, ACL_NOT_CACHED);
                return acl;
        }

        /*
         * Cache the result, but only if our sentinel is still in place.
         * The extra reference taken here belongs to the cache slot; if the
         * slot no longer holds our sentinel it is dropped again.
         */
        posix_acl_dup(acl);
        if (unlikely(cmpxchg(p, sentinel, acl) != sentinel))
                posix_acl_release(acl);
        return acl;
}
EXPORT_SYMBOL(get_acl);

/*
 * Initialise a freshly obtained posix_acl: record its entry count and
 * start it off with a reference count of one.
 */
void
posix_acl_init(struct posix_acl *acl, int count)
{
        acl->a_count = count;
        refcount_set(&acl->a_refcount, 1);
}
EXPORT_SYMBOL(posix_acl_init);

/*
 * Allocate an ACL with room for @count entries using kmalloc() and the
 * given @flags, then initialise it with posix_acl_init().
 *
 * Returns the new ACL, or NULL if the allocation failed.
 */
struct posix_acl *
posix_acl_alloc(int count, gfp_t flags)
{
        const size_t bytes = sizeof(struct posix_acl) +
                             count * sizeof(struct posix_acl_entry);
        struct posix_acl *acl;

        acl = kmalloc(bytes, flags);
        if (!acl)
                return NULL;

        posix_acl_init(acl, count);
        return acl;
}
EXPORT_SYMBOL(posix_acl_alloc);

/*
 * Clone an ACL.
 *
 * Duplicates the header and all a_count entries of @acl with kmemdup()
 * and resets the reference count of the copy to one.
 *
 * Returns the copy, or NULL if @acl is NULL or the allocation failed.
 */
static struct posix_acl *
posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
{
        struct posix_acl *clone;
        size_t size;

        if (!acl)
                return NULL;

        /*
         * Keep the byte count in size_t, as posix_acl_alloc() does, rather
         * than narrowing it to int before handing it to kmemdup().
         */
        size = sizeof(struct posix_acl) +
               acl->a_count * sizeof(struct posix_acl_entry);

        clone = kmemdup(acl, size, flags);
        if (clone)
                refcount_set(&clone->a_refcount, 1);
        return clone;
}

/*
 * Validate an ACL.  Returns 0 if it is well formed, -EINVAL otherwise.
 *
 * Every entry may only carry ACL_READ/ACL_WRITE/ACL_EXECUTE permission
 * bits.  Entries must appear in the order USER_OBJ, USER*, GROUP_OBJ,
 * GROUP*, [MASK], OTHER; the mask entry is mandatory as soon as any
 * ACL_USER or ACL_GROUP entry is present.  The ids of ACL_USER and
 * ACL_GROUP entries must have a mapping in @user_ns.
 */
int
posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
        const struct posix_acl_entry *entry, *end;
        int expect = ACL_USER_OBJ;      /* tag class allowed next */
        int mask_required = 0;

        FOREACH_ACL_ENTRY(entry, acl, end) {
                if (entry->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
                        return -EINVAL;

                switch (entry->e_tag) {
                case ACL_USER_OBJ:
                        if (expect != ACL_USER_OBJ)
                                return -EINVAL;
                        expect = ACL_USER;
                        break;

                case ACL_USER:
                        if (expect != ACL_USER)
                                return -EINVAL;
                        if (!kuid_has_mapping(user_ns, entry->e_uid))
                                return -EINVAL;
                        mask_required = 1;
                        break;

                case ACL_GROUP_OBJ:
                        if (expect != ACL_USER)
                                return -EINVAL;
                        expect = ACL_GROUP;
                        break;

                case ACL_GROUP:
                        if (expect != ACL_GROUP)
                                return -EINVAL;
                        if (!kgid_has_mapping(user_ns, entry->e_gid))
                                return -EINVAL;
                        mask_required = 1;
                        break;

                case ACL_MASK:
                        if (expect != ACL_GROUP)
                                return -EINVAL;
                        expect = ACL_OTHER;
                        break;

                case ACL_OTHER:
                        /*
                         * Allowed right after the mask, or right after the
                         * group entries when no mask is needed.
                         */
                        if (expect != ACL_OTHER &&
                            (expect != ACL_GROUP || mask_required))
                                return -EINVAL;
                        expect = 0;
                        break;

                default:
                        return -EINVAL;
                }
        }

        /* Only an ACL that ended with its ACL_OTHER entry is complete. */
        return expect == 0 ? 0 : -EINVAL;
}
EXPORT_SYMBOL(posix_acl_valid);

/*
 * Check whether @acl can be expressed exactly by the traditional file
 * mode permission bits.
 *
 * Returns 0 if it can, 1 if it cannot (the ACL has ACL_USER, ACL_GROUP or
 * ACL_MASK entries), or -EINVAL if an entry carries an unknown tag.  When
 * @mode_p is non-NULL its rwx bits for user, group and other are replaced
 * with the bits derived from the ACL; the remaining bits are preserved.
 */
int
posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
{
        const struct posix_acl_entry *entry, *end;
        umode_t bits = 0;
        int extended = 0;

        /* A NULL ACL is always representable as mode bits. */
        if (!acl)
                return 0;

        FOREACH_ACL_ENTRY(entry, acl, end) {
                const umode_t rwx = entry->e_perm & S_IRWXO;

                switch (entry->e_tag) {
                case ACL_USER_OBJ:
                        bits |= rwx << 6;
                        break;
                case ACL_GROUP_OBJ:
                        bits |= rwx << 3;
                        break;
                case ACL_OTHER:
                        bits |= rwx;
                        break;
                case ACL_MASK:
                        /* The mask overrides the group bits of the mode. */
                        bits = (bits & ~S_IRWXG) | (rwx << 3);
                        extended = 1;
                        break;
                case ACL_USER:
                case ACL_GROUP:
                        extended = 1;
                        break;
                default:
                        return -EINVAL;
                }
        }

        if (mode_p)
                *mode_p = (*mode_p & ~S_IRWXUGO) | bits;
        return extended;
}
EXPORT_SYMBOL(posix_acl_equiv_mode);

/*
 * Build a three-entry ACL (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER) whose
 * permissions mirror the user, group and other rwx bits of @mode.
 *
 * Returns the new ACL, or ERR_PTR(-ENOMEM) if the allocation failed.
 */
struct posix_acl *
posix_acl_from_mode(umode_t mode, gfp_t flags)
{
        struct posix_acl_entry *e;
        struct posix_acl *acl;

        acl = posix_acl_alloc(3, flags);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        e = acl->a_entries;

        e[0].e_tag  = ACL_USER_OBJ;
        e[0].e_perm = (mode & S_IRWXU) >> 6;

        e[1].e_tag  = ACL_GROUP_OBJ;
        e[1].e_perm = (mode & S_IRWXG) >> 3;

        e[2].e_tag  = ACL_OTHER;
        e[2].e_perm = (mode & S_IRWXO);

        return acl;
}
EXPORT_SYMBOL(posix_acl_from_mode);

/*
 * Return 0 if current is granted want access to the inode
 * by the acl. Returns -E... otherwise.
 *
 * Only the MAY_READ, MAY_WRITE and MAY_EXEC bits of @want are considered.
 * Entries are examined in order and the first matching class decides:
 * the owner entry is checked unmasked, named-user and group entries are
 * checked against the ACL_MASK entry (if one follows), and ACL_OTHER is
 * checked unmasked.  Returns -EACCES when access is denied and -EIO when
 * the ACL contains an unknown tag or ends without reaching a decision.
 */
int
posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
{
        const struct posix_acl_entry *pa, *pe, *mask_obj;
        int found = 0;  /* set once current matched any group entry */

        want &= MAY_READ | MAY_WRITE | MAY_EXEC;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                /* (May have been checked already) */
                                if (uid_eq(inode->i_uid, current_fsuid()))
                                        goto check_perm;
                                break;
                        case ACL_USER:
                                if (uid_eq(pa->e_uid, current_fsuid()))
                                        goto mask;
                                break;
                        case ACL_GROUP_OBJ:
                                /*
                                 * A group entry only decides the outcome
                                 * when it grants everything in want;
                                 * otherwise keep looking at later groups.
                                 */
                                if (in_group_p(inode->i_gid)) {
                                        found = 1;
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_GROUP:
                                if (in_group_p(pa->e_gid)) {
                                        found = 1;
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_MASK:
                                break;
                        case ACL_OTHER:
                                /*
                                 * A matching group that did not grant the
                                 * access must not fall back to "other".
                                 */
                                if (found)
                                        return -EACCES;
                                else
                                        goto check_perm;
                        default:
                                return -EIO;
                }
        }
        return -EIO;

mask:
        /*
         * pa is the entry that matched.  Look for an ACL_MASK entry after
         * it; if there is one, the access must be granted by both.
         */
        for (mask_obj = pa+1; mask_obj != pe; mask_obj++) {
                if (mask_obj->e_tag == ACL_MASK) {
                        if ((pa->e_perm & mask_obj->e_perm & want) == want)
                                return 0;
                        return -EACCES;
                }
        }

        /* No mask entry found: fall through to the unmasked check. */
check_perm:
        if ((pa->e_perm & want) == want)
                return 0;
        return -EACCES;
}

/*
 * Modify acl when creating a new inode. The caller must ensure the acl is
 * only referenced once.
 *
 * mode_p initially must contain the mode parameter to the open() / creat()
 * system calls. All permissions that are not granted by the acl are removed.
 * The permissions in the acl are changed to reflect the mode_p parameter.
 *
 * Returns 0 if the acl has no ACL_USER, ACL_GROUP or ACL_MASK entries,
 * 1 if it has any of them, or -EIO if it contains an unknown tag or has
 * neither an ACL_MASK nor an ACL_GROUP_OBJ entry.
 */
static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
{
        struct posix_acl_entry *pa, *pe;
        struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
        umode_t mode = *mode_p;
        int not_equiv = 0;

        /* The acl is modified in place; its a_refcount is not checked here. */

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                /*
                                 * Intersect in both directions: drop entry
                                 * bits the mode lacks, then drop owner mode
                                 * bits the entry lacks.
                                 */
                                pa->e_perm &= (mode >> 6) | ~S_IRWXO;
                                mode &= (pa->e_perm << 6) | ~S_IRWXU;
                                break;

                        case ACL_USER:
                        case ACL_GROUP:
                                not_equiv = 1;
                                break;

                        case ACL_GROUP_OBJ:
                                /* Handled after the loop, once we know
                                 * whether a mask entry exists. */
                                group_obj = pa;
                                break;

                        case ACL_OTHER:
                                pa->e_perm &= mode | ~S_IRWXO;
                                mode &= pa->e_perm | ~S_IRWXO;
                                break;

                        case ACL_MASK:
                                mask_obj = pa;
                                not_equiv = 1;
                                break;

                        default:
                                return -EIO;
                }
        }

        /*
         * The group bits of the mode are intersected with the mask entry
         * when there is one, and with the owning-group entry otherwise.
         */
        if (mask_obj) {
                mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
                mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
        } else {
                if (!group_obj)
                        return -EIO;
                group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
                mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
        }

        /* Only the rwx bits are replaced; all other mode bits are kept. */
        *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
        return not_equiv;
}

/*
 * Rewrite @acl in place so that it reflects the permission bits of @mode,
 * as needed for chmod.
 *
 * The owner and other entries take the user and other bits of @mode.  The
 * group bits go to the ACL_MASK entry when one exists and to the
 * ACL_GROUP_OBJ entry otherwise.  Returns 0 on success, or -EIO if the
 * ACL contains an unknown tag or has neither of those two entries.
 */
static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
{
        struct posix_acl_entry *entry, *end;
        struct posix_acl_entry *group_entry = NULL, *mask_entry = NULL;
        struct posix_acl_entry *group_class;

        FOREACH_ACL_ENTRY(entry, acl, end) {
                switch (entry->e_tag) {
                case ACL_USER_OBJ:
                        entry->e_perm = (mode & S_IRWXU) >> 6;
                        break;

                case ACL_OTHER:
                        entry->e_perm = (mode & S_IRWXO);
                        break;

                case ACL_GROUP_OBJ:
                        group_entry = entry;
                        break;

                case ACL_MASK:
                        mask_entry = entry;
                        break;

                case ACL_USER:
                case ACL_GROUP:
                        /* Named entries are left untouched. */
                        break;

                default:
                        return -EIO;
                }
        }

        /* Prefer the mask entry; fall back to the owning-group entry. */
        group_class = mask_entry ? mask_entry : group_entry;
        if (!group_class)
                return -EIO;

        group_class->e_perm = (mode & S_IRWXG) >> 3;
        return 0;
}

/*
 * Replace *@acl with a private clone adjusted by posix_acl_create_masq()
 * for the creation mode in @mode_p.
 *
 * The reference on the original *@acl is always dropped.  On failure
 * *@acl is set to NULL and a negative error is returned (-ENOMEM if the
 * clone could not be allocated); otherwise the non-negative result of
 * posix_acl_create_masq() is returned.
 */
int
__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
{
        struct posix_acl *copy = posix_acl_clone(*acl, gfp);
        int ret;

        if (!copy) {
                ret = -ENOMEM;
        } else {
                ret = posix_acl_create_masq(copy, mode_p);
                if (ret < 0) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return ret;
}
EXPORT_SYMBOL(__posix_acl_create);

/*
 * Replace *@acl with a private clone updated by __posix_acl_chmod_masq()
 * to match @mode.
 *
 * The reference on the original *@acl is always dropped.  On failure
 * *@acl is set to NULL and the error is returned (-ENOMEM if the clone
 * could not be allocated); on success 0 is returned.
 */
int
__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
{
        struct posix_acl *copy = posix_acl_clone(*acl, gfp);
        int ret;

        if (!copy) {
                ret = -ENOMEM;
        } else {
                ret = __posix_acl_chmod_masq(copy, mode);
                if (ret) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return ret;
}
EXPORT_SYMBOL(__posix_acl_chmod);

/*
 * Bring the access ACL of @inode in line with a new file @mode.
 *
 * Returns 0 when the inode does not use POSIX ACLs, has no access ACL, or
 * get_acl() reports -EOPNOTSUPP.  Returns -EOPNOTSUPP when the inode has
 * no ->set_acl operation.  Otherwise the access ACL is cloned, updated
 * for @mode and handed to ->set_acl, whose result is returned.
 */
int
posix_acl_chmod(struct inode *inode, umode_t mode)
{
        struct posix_acl *acl;
        int err;

        if (!IS_POSIXACL(inode))
                return 0;
        if (!inode->i_op->set_acl)
                return -EOPNOTSUPP;

        acl = get_acl(inode, ACL_TYPE_ACCESS);
        if (!acl || acl == ERR_PTR(-EOPNOTSUPP))
                return 0;
        if (IS_ERR(acl))
                return PTR_ERR(acl);

        err = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
        if (!err) {
                err = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
                posix_acl_release(acl);
        }
        return err;
}
EXPORT_SYMBOL(posix_acl_chmod);

/*
 * Work out the ACLs and mode for a new inode created in directory @dir.
 *
 * *@acl and *@default_acl are always initialised to NULL.  If @dir has no
 * default ACL (or ACLs are unsupported), *@mode is filtered through the
 * current umask.  Otherwise a clone of the default ACL is masked against
 * *@mode; it is returned in *@acl only when it cannot be represented by
 * the mode bits alone.  The default ACL itself is returned in
 * *@default_acl when the new inode is a directory.
 *
 * Returns 0 on success or a negative error.
 */
int
posix_acl_create(struct inode *dir, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        struct posix_acl *dflt;
        struct posix_acl *access;
        int not_equiv;

        *acl = NULL;
        *default_acl = NULL;

        if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
                return 0;

        dflt = get_acl(dir, ACL_TYPE_DEFAULT);
        if (!dflt || dflt == ERR_PTR(-EOPNOTSUPP)) {
                *mode &= ~current_umask();
                return 0;
        }
        if (IS_ERR(dflt))
                return PTR_ERR(dflt);

        access = posix_acl_clone(dflt, GFP_NOFS);
        if (!access) {
                posix_acl_release(dflt);
                return -ENOMEM;
        }

        not_equiv = posix_acl_create_masq(access, mode);
        if (not_equiv < 0) {
                posix_acl_release(access);
                posix_acl_release(dflt);
                return not_equiv;
        }

        /* Keep the access ACL only if the mode bits cannot express it. */
        if (not_equiv)
                *acl = access;
        else
                posix_acl_release(access);

        /* Only directories inherit the default ACL. */
        if (S_ISDIR(*mode))
                *default_acl = dflt;
        else
                posix_acl_release(dflt);

        return 0;
}
EXPORT_SYMBOL_GPL(posix_acl_create);

/**
 * posix_acl_update_mode  -  update mode in set_acl
 * @inode: target inode
 * @mode_p: where to store the resulting mode
 * @acl: pointer to the ACL being set
 *
 * Derive the new file permission bits from *@acl, starting from the
 * inode's current mode.  If the ACL is fully equivalent to the resulting
 * mode, *@acl is set to NULL to signal that no ACL needs to be stored.
 *
 * As with chmod, the setgid bit is cleared when the caller is neither in
 * the owning group nor capable of CAP_FSETID.
 *
 * Called from set_acl inode operations.  Returns 0 on success or the
 * negative error reported by posix_acl_equiv_mode().
 */
int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
                          struct posix_acl **acl)
{
        umode_t new_mode = inode->i_mode;
        int equiv = posix_acl_equiv_mode(*acl, &new_mode);

        if (equiv < 0)
                return equiv;
        if (!equiv)
                *acl = NULL;

        if (!(in_group_p(inode->i_gid) ||
              capable_wrt_inode_uidgid(inode, CAP_FSETID)))
                new_mode &= ~S_ISGID;

        *mode_p = new_mode;
        return 0;
}
EXPORT_SYMBOL(posix_acl_update_mode);

/*
 * Translate, in place, the ids stored in a POSIX ACL extended attribute
 * from user namespace @from to user namespace @to.
 *
 * Only ACL_USER and ACL_GROUP entries carry an id and are rewritten.  The
 * buffer is left untouched if it is NULL, shorter than the header, has an
 * unexpected version, or does not hold a positive number of entries.
 */
static void posix_acl_fix_xattr_userns(
        struct user_namespace *to, struct user_namespace *from,
        void *value, size_t size)
{
        struct posix_acl_xattr_header *hdr = value;
        struct posix_acl_xattr_entry *ent;
        int nr, i;

        if (!value || size < sizeof(struct posix_acl_xattr_header))
                return;
        if (hdr->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
                return;

        nr = posix_acl_xattr_count(size);
        if (nr <= 0)
                return;

        /* The entries follow the header directly. */
        ent = (void *)(hdr + 1);
        for (i = 0; i < nr; i++, ent++) {
                int tag = le16_to_cpu(ent->e_tag);

                if (tag == ACL_USER) {
                        kuid_t uid = make_kuid(from, le32_to_cpu(ent->e_id));

                        ent->e_id = cpu_to_le32(from_kuid(to, uid));
                } else if (tag == ACL_GROUP) {
                        kgid_t gid = make_kgid(from, le32_to_cpu(ent->e_id));

                        ent->e_id = cpu_to_le32(from_kgid(to, gid));
                }
        }
}

/*
 * Convert the ids in an ACL xattr from the caller's user namespace to
 * init_user_ns.  Nothing is done when the caller is already in
 * init_user_ns.
 */
void posix_acl_fix_xattr_from_user(void *value, size_t size)
{
        struct user_namespace *caller_ns = current_user_ns();

        if (caller_ns != &init_user_ns)
                posix_acl_fix_xattr_userns(&init_user_ns, caller_ns,
                                           value, size);
}

/*
 * Convert the ids in an ACL xattr from init_user_ns to the caller's user
 * namespace.  Nothing is done when the caller is already in
 * init_user_ns.
 */
void posix_acl_fix_xattr_to_user(void *value, size_t size)
{
        struct user_namespace *caller_ns = current_user_ns();

        if (caller_ns != &init_user_ns)
                posix_acl_fix_xattr_userns(caller_ns, &init_user_ns,
                                           value, size);
}

/*
 * Convert an ACL from its extended attribute representation to the
 * in-memory representation, mapping ids through @user_ns.
 *
 * Returns NULL for a NULL @value or an xattr with zero entries, a newly
 * allocated ACL on success, or an ERR_PTR: -EINVAL for a malformed
 * buffer, unknown tag or unmappable id, -EOPNOTSUPP for an unexpected
 * version, -ENOMEM if the allocation failed.
 */
struct posix_acl *
posix_acl_from_xattr(struct user_namespace *user_ns,
                     const void *value, size_t size)
{
        const struct posix_acl_xattr_header *hdr = value;
        const struct posix_acl_xattr_entry *src;
        struct posix_acl_entry *dst;
        struct posix_acl *acl;
        int count, i;

        if (!value)
                return NULL;
        if (size < sizeof(struct posix_acl_xattr_header))
                return ERR_PTR(-EINVAL);
        if (hdr->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
                return ERR_PTR(-EOPNOTSUPP);

        count = posix_acl_xattr_count(size);
        if (count < 0)
                return ERR_PTR(-EINVAL);
        if (count == 0)
                return NULL;

        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        /* The on-disk entries follow the header directly. */
        src = (const void *)(hdr + 1);
        dst = acl->a_entries;

        for (i = 0; i < count; i++, src++, dst++) {
                dst->e_tag  = le16_to_cpu(src->e_tag);
                dst->e_perm = le16_to_cpu(src->e_perm);

                switch (dst->e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        continue;

                case ACL_USER:
                        dst->e_uid = make_kuid(user_ns,
                                               le32_to_cpu(src->e_id));
                        if (uid_valid(dst->e_uid))
                                continue;
                        break;

                case ACL_GROUP:
                        dst->e_gid = make_kgid(user_ns,
                                               le32_to_cpu(src->e_id));
                        if (gid_valid(dst->e_gid))
                                continue;
                        break;

                default:
                        break;
                }

                /* Unknown tag or an id with no mapping in user_ns. */
                posix_acl_release(acl);
                return ERR_PTR(-EINVAL);
        }
        return acl;
}
EXPORT_SYMBOL(posix_acl_from_xattr);

/*
 * Convert an ACL from the in-memory representation to its extended
 * attribute representation, mapping ids through @user_ns.
 *
 * Returns the number of bytes the xattr occupies.  With a NULL @buffer
 * only that size is computed; with a @buffer smaller than required,
 * -ERANGE is returned and nothing is written.
 */
int
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
                   void *buffer, size_t size)
{
        struct posix_acl_xattr_header *hdr = buffer;
        struct posix_acl_xattr_entry *out;
        int real_size, i;

        real_size = posix_acl_xattr_size(acl->a_count);
        if (!buffer)
                return real_size;
        if (real_size > size)
                return -ERANGE;

        hdr->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
        out = (void *)(hdr + 1);

        for (i = 0; i < acl->a_count; i++, out++) {
                const struct posix_acl_entry *in = &acl->a_entries[i];

                out->e_tag  = cpu_to_le16(in->e_tag);
                out->e_perm = cpu_to_le16(in->e_perm);

                switch (in->e_tag) {
                case ACL_USER:
                        out->e_id = cpu_to_le32(from_kuid(user_ns,
                                                          in->e_uid));
                        break;
                case ACL_GROUP:
                        out->e_id = cpu_to_le32(from_kgid(user_ns,
                                                          in->e_gid));
                        break;
                default:
                        /* Entries without an id get the placeholder. */
                        out->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
                        break;
                }
        }
        return real_size;
}
EXPORT_SYMBOL(posix_acl_to_xattr);

/*
 * xattr ->get handler for the POSIX ACL attributes.  handler->flags
 * selects the ACL type.
 *
 * Returns -EOPNOTSUPP for inodes without POSIX ACL support and for
 * symlinks, -ENODATA when the inode has no ACL of that type, the error
 * from get_acl(), or the result of posix_acl_to_xattr().
 */
static int
posix_acl_xattr_get(const struct xattr_handler *handler,
                    struct dentry *unused, struct inode *inode,
                    const char *name, void *value, size_t size)
{
        struct posix_acl *acl;
        int ret;

        if (!IS_POSIXACL(inode) || S_ISLNK(inode->i_mode))
                return -EOPNOTSUPP;

        acl = get_acl(inode, handler->flags);
        if (!acl)
                return -ENODATA;
        if (IS_ERR(acl))
                return PTR_ERR(acl);

        ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
        posix_acl_release(acl);
        return ret;
}

/*
 * Validate and set an ACL of the given @type on @inode via ->set_acl.
 *
 * Returns -EOPNOTSUPP if the inode does not support POSIX ACLs or has no
 * ->set_acl operation, -EACCES when a default ACL is set on a
 * non-directory (0 when it is merely being removed there), -EPERM if the
 * caller is neither owner nor capable, the error from posix_acl_valid(),
 * or the result of ->set_acl.
 */
int
set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        int err;

        if (!IS_POSIXACL(inode) || !inode->i_op->set_acl)
                return -EOPNOTSUPP;

        /* Default ACLs only exist on directories. */
        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
                if (acl)
                        return -EACCES;
                return 0;
        }

        if (!inode_owner_or_capable(inode))
                return -EPERM;

        if (acl) {
                err = posix_acl_valid(inode->i_sb->s_user_ns, acl);
                if (err)
                        return err;
        }
        return inode->i_op->set_acl(inode, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);

/*
 * xattr ->set handler for the POSIX ACL attributes.
 *
 * A non-NULL @value is parsed with posix_acl_from_xattr(); a NULL @value
 * leaves the ACL pointer NULL.  The result is handed to set_posix_acl()
 * for the ACL kind stored in handler->flags, and the local reference is
 * dropped afterwards regardless of the outcome.
 *
 * Returns the parse error if @value could not be converted, otherwise
 * the result of set_posix_acl().
 */
static int
posix_acl_xattr_set(const struct xattr_handler *handler,
                    struct dentry *unused, struct inode *inode,
                    const char *name, const void *value,
                    size_t size, int flags)
{
        struct posix_acl *parsed = NULL;
        int err;

        if (value != NULL) {
                parsed = posix_acl_from_xattr(&init_user_ns, value, size);
                if (IS_ERR(parsed))
                        return PTR_ERR(parsed);
        }

        err = set_posix_acl(inode, handler->flags, parsed);
        posix_acl_release(parsed);

        return err;
}

/*
 * xattr ->list handler: report the ACL attribute for @dentry only when
 * IS_POSIXACL() holds for its backing inode.
 */
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        return IS_POSIXACL(backing);
}

/*
 * Handler for the XATTR_NAME_POSIX_ACL_ACCESS attribute; .flags carries
 * ACL_TYPE_ACCESS so the shared get/set callbacks know which ACL to use.
 */
const struct xattr_handler posix_acl_access_xattr_handler = {
        .name   = XATTR_NAME_POSIX_ACL_ACCESS,
        .flags  = ACL_TYPE_ACCESS,
        .get    = posix_acl_xattr_get,
        .set    = posix_acl_xattr_set,
        .list   = posix_acl_xattr_list,
};
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);

/*
 * Handler for the XATTR_NAME_POSIX_ACL_DEFAULT attribute; .flags carries
 * ACL_TYPE_DEFAULT so the shared get/set callbacks know which ACL to use.
 */
const struct xattr_handler posix_acl_default_xattr_handler = {
        .name   = XATTR_NAME_POSIX_ACL_DEFAULT,
        .flags  = ACL_TYPE_DEFAULT,
        .get    = posix_acl_xattr_get,
        .set    = posix_acl_xattr_set,
        .list   = posix_acl_xattr_list,
};
EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);

/*
 * ->set_acl implementation that keeps the ACL only in the inode's ACL
 * cache.
 *
 * For ACL_TYPE_ACCESS the inode mode is first updated through
 * posix_acl_update_mode(), which may also replace @acl; its error, if
 * any, is returned unchanged.  On success the change time is refreshed,
 * the (possibly replaced) ACL is cached, and 0 is returned.
 */
int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        if (type == ACL_TYPE_ACCESS) {
                int err = posix_acl_update_mode(inode, &inode->i_mode, &acl);

                if (err)
                        return err;
        }

        inode->i_ctime = current_time(inode);
        set_cached_acl(inode, type, acl);

        return 0;
}

/*
 * Set up the cached ACLs of a newly created @inode from its parent @dir.
 *
 * posix_acl_create() computes the default and access ACLs (either may
 * come back NULL) and may adjust inode->i_mode.  Both results are stored
 * in the inode's ACL cache and the local references are dropped.
 *
 * Returns 0 on success or the error from posix_acl_create().
 */
int simple_acl_create(struct inode *dir, struct inode *inode)
{
        struct posix_acl *default_acl, *acl;
        int error;

        error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (error)
                return error;

        set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
        set_cached_acl(inode, ACL_TYPE_ACCESS, acl);

        /*
         * posix_acl_release() tolerates NULL (posix_acl_xattr_set() already
         * relies on that), so no guards are needed here.
         */
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return 0;
}






















































































































































































































































































































    1 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H

#include <linux/cache.h>
#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <vdso/jiffies.h>
#include <asm/param.h>                        /* for HZ */
#include <generated/timeconst.h>

/*
 * The following defines establish the engineering parameters of the PLL
 * model. The HZ variable establishes the timer interrupt frequency, 100 Hz
 * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
 * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
 * nearest power of two in order to avoid hardware multiply operations.
 */
#if HZ >= 12 && HZ < 24
# define SHIFT_HZ        4
#elif HZ >= 24 && HZ < 48
# define SHIFT_HZ        5
#elif HZ >= 48 && HZ < 96
# define SHIFT_HZ        6
#elif HZ >= 96 && HZ < 192
# define SHIFT_HZ        7
#elif HZ >= 192 && HZ < 384
# define SHIFT_HZ        8
#elif HZ >= 384 && HZ < 768
# define SHIFT_HZ        9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ        10
#elif HZ >= 1536 && HZ < 3072
# define SHIFT_HZ        11
#elif HZ >= 3072 && HZ < 6144
# define SHIFT_HZ        12
#elif HZ >= 6144 && HZ < 12288
# define SHIFT_HZ        13
#else
# error Invalid value of HZ.
#endif

/* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can
 * improve accuracy by shifting LSH bits, hence calculating:
 *     (NOM << LSH) / DEN
 * This however means trouble for large NOM, because (NOM << LSH) may no
 * longer fit in 32 bits. The following way of calculating this gives us
 * some slack, under the following conditions:
 *   - (NOM / DEN) fits in (32 - LSH) bits.
 *   - (NOM % DEN) fits in (32 - LSH) bits.
 *
 * The quotient and the remainder are shifted separately; DEN / 2 is added
 * to the shifted remainder so that its division by DEN rounds to nearest.
 * NOM and DEN are expanded more than once, so only pass expressions
 * without side effects.
 */
#define SH_DIV(NOM,DEN,LSH) (   (((NOM) / (DEN)) << (LSH))              \
                             + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN))

/* LATCH is used in the interval timer and ftape setup. */
#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ)        /* For divider */

extern int register_refined_jiffies(long clock_tick_rate);

/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)

/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)

#ifndef __jiffy_arch_data
#define __jiffy_arch_data
#endif

/*
 * The 64-bit value is not atomic - you MUST NOT read it
 * without sampling the sequence number in jiffies_lock.
 * get_jiffies_64() will do this for you as appropriate.
 */
extern u64 __cacheline_aligned_in_smp jiffies_64;
extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies;

#if BITS_PER_LONG >= 64
/*
 * With a 64-bit long, a single read of jiffies widened to u64 is
 * returned directly.
 */
static inline u64 get_jiffies_64(void)
{
        const u64 now = jiffies;

        return now;
}
#else
/* Narrower longs use the out-of-line implementation. */
u64 get_jiffies_64(void);
#endif

/*
 *        These inlines deal with timer wrapping correctly. You are 
 *        strongly encouraged to use them
 *        1. Because people otherwise forget
 *        2. Because if the timer wrap changes in future you won't have to
 *           alter your driver code.
 *
 * time_after(a,b) returns true if the time a is after time b.
 *
 * Do this with "<0" and ">=0" to only test the sign of the result. A
 * good compiler would generate better code (and a really good compiler
 * wouldn't care). Gcc is currently neither.
 */
/*
 * time_after(a,b): true if a is after b.  Both operands are passed through
 * typecheck(unsigned long, ...); the comparison itself only looks at the
 * sign of the difference (b - a) cast to long, which is what makes it
 * wrap-safe.
 */
#define time_after(a,b)                \
        (typecheck(unsigned long, a) && \
         typecheck(unsigned long, b) && \
         ((long)((b) - (a)) < 0))
/* time_before(a,b): true if a is before b (time_after with swapped operands). */
#define time_before(a,b)        time_after(b,a)

/*
 * time_after_eq(a,b): true if a is after or equal to b; same scheme as
 * time_after(), testing that (a - b) cast to long is non-negative.
 */
#define time_after_eq(a,b)        \
        (typecheck(unsigned long, a) && \
         typecheck(unsigned long, b) && \
         ((long)((a) - (b)) >= 0))
/* time_before_eq(a,b): true if a is before or equal to b. */
#define time_before_eq(a,b)        time_after_eq(b,a)

/*
 * Calculate whether a is in the closed range [b, c], i.e. both end points
 * are included.  Built from time_after_eq()/time_before_eq(), so a is
 * expanded twice.
 */
#define time_in_range(a,b,c) \
        (time_after_eq(a,b) && \
         time_before_eq(a,c))

/*
 * Calculate whether a is in the half-open range [b, c), i.e. b is included
 * and c is excluded.  a is expanded twice here as well.
 */
#define time_in_range_open(a,b,c) \
        (time_after_eq(a,b) && \
         time_before(a,c))

/*
 * Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. the return value of
 * get_jiffies_64()).  Operands are passed through typecheck(__u64, ...)
 * and the difference is examined as an __s64.
 */
#define time_after64(a,b)        \
        (typecheck(__u64, a) &&        \
         typecheck(__u64, b) && \
         ((__s64)((b) - (a)) < 0))
/* time_before64(a,b): true if a is before b. */
#define time_before64(a,b)        time_after64(b,a)

/* time_after_eq64(a,b): true if a is after or equal to b. */
#define time_after_eq64(a,b)        \
        (typecheck(__u64, a) && \
         typecheck(__u64, b) && \
         ((__s64)((a) - (b)) >= 0))
/* time_before_eq64(a,b): true if a is before or equal to b. */
#define time_before_eq64(a,b)        time_after_eq64(b,a)

/* time_in_range64(a,b,c): true if a is in the closed range [b, c]. */
#define time_in_range64(a, b, c) \
        (time_after_eq64(a, b) && \
         time_before_eq64(a, c))

/*
 * These macros compare 'a' against the current jiffies value for
 * convenience.  Each comes in two flavours: the plain one reads jiffies,
 * the ...64 one calls get_jiffies_64().
 */

/* time_is_before_jiffies(a) returns true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)
#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a)

/* time_is_after_jiffies(a) returns true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)
#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a)

/* time_is_before_eq_jiffies(a) returns true if a is before or equal to jiffies */
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a)

/* time_is_after_eq_jiffies(a) returns true if a is after or equal to jiffies */
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a)

/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))

/*
 * Change timeval to jiffies, trying to avoid the
 * most obvious overflows..
 *
 * And some not so obvious.
 *
 * Note that we don't want to return LONG_MAX, because
 * for various timeout reasons we often end up having
 * to wait "jiffies+1" in order to guarantee that we wait
 * at _least_ "jiffies" - so "jiffies+1" had better still
 * be positive.
 */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)

extern unsigned long preset_lpj;

/*
 * We want to do realistic conversions of time so we need to use the same
 * values the update wall clock code uses as the jiffies size.  This value
 * is: TICK_NSEC (which is defined in timex.h).  This
 * is a constant and is in nanoseconds.  We will use scaled math
 * with a set of scales defined here as SEC_JIFFIE_SC,  USEC_JIFFIE_SC and
 * NSEC_JIFFIE_SC.  Note that these defines contain nothing but
 * constants and so are computed at compile time.  SHIFT_HZ (computed
 * from HZ earlier in this file) adjusts the scaling for different HZ values.

 * Scaled math???  What is that?
 *
 * Scaled math is a way to do integer math on values that would,
 * otherwise, either overflow, underflow, or cause undesired div
 * instructions to appear in the execution path.  In short, we "scale"
 * up the operands so they take more bits (more precision, less
 * underflow), do the desired operation and then "scale" the result back
 * by the same amount.  If we do the scaling by shifting we avoid the
 * costly mpy and the dastardly div instructions.

 * Suppose, for example, we want to convert from seconds to jiffies
 * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE.  The
 * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We
 * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we
 * might calculate at compile time, however, the result will only have
 * about 3-4 bits of precision (less for smaller values of HZ).
 *
 * So, we scale as follows:
 * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE);
 * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE;
 * Then we make SCALE a power of two so:
 * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE;
 * Now we define:
 * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE))
 * jiff = (sec * SEC_CONV) >> SCALE;
 *
 * Often the math we use will expand beyond 32-bits so we tell C how to
 * do this and pass the 64-bit result of the mpy through the ">> SCALE"
 * which should take the result back to 32-bits.  We want this expansion
 * to capture as much precision as possible.  At the same time we don't
 * want to overflow so we pick the SCALE to avoid this.  In this file,
 * that means using a different scale for each range of HZ values (as
 * defined in timex.h).
 *
 * For those who want to know, gcc will give a 64-bit result from a "*"
 * operator if the result is a long long AND at least one of the
 * operands is cast to long long (usually just prior to the "*" so as
 * not to confuse it into thinking it really has a 64-bit operand,
 * which, by the way, it can do, but it takes more code and at least 2
 * mpys).

 * We also need to be aware that one second in nanoseconds is only a
 * couple of bits away from overflowing a 32-bit word, so we MUST use
 * 64-bits to get the full range time in nanoseconds.

 */

/*
 * Here are the scales we will use.  One for seconds, nanoseconds and
 * microseconds.
 *
 * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and
 * check if the sign bit is set.  If not, we bump the shift count by 1.
 * (Gets an extra bit of precision where we can use it.)
 * We know it is set for HZ = 1024 and HZ = 100 not for 1000.
 * Haven't tested others.

 * Limits of cpp (for #if expressions) only long (no long long), but
 * then we only need the most significant bit.
 */

#define SEC_JIFFIE_SC (31 - SHIFT_HZ)
#if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000)
#undef SEC_JIFFIE_SC
#define SEC_JIFFIE_SC (32 - SHIFT_HZ)
#endif
#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
#define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
                                TICK_NSEC -1) / (u64)TICK_NSEC))

#define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
                                        TICK_NSEC -1) / (u64)TICK_NSEC))
/*
 * The maximum jiffie value is (MAX_INT >> 1).  Here we translate that
 * into seconds.  The 64-bit case will overflow if we are not careful,
 * so use the messy SH_DIV macro to do it.  Still all constants.
 */
#if BITS_PER_LONG < 64
# define MAX_SEC_IN_JIFFIES \
        (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC)
#else        /* take care of overflow on 64 bits machines */
# define MAX_SEC_IN_JIFFIES \
        (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1)

#endif

/*
 * Convert various time units to each other:
 */
extern unsigned int jiffies_to_msecs(const unsigned long j);
extern unsigned int jiffies_to_usecs(const unsigned long j);

static inline u64 jiffies_to_nsecs(const unsigned long j)
{
        return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
}

extern u64 jiffies64_to_nsecs(u64 j);
extern u64 jiffies64_to_msecs(u64 j);

extern unsigned long __msecs_to_jiffies(const unsigned int m);
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
/*
 * HZ is at most 1000 and divides it evenly, so every jiffy covers a whole
 * number of milliseconds: divide by that number, rounding upwards.
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
        return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
/*
 * HZ is a whole multiple of 1000 (and larger than it), so the conversion
 * is a plain multiplication - provided the input is small enough that
 * the product stays within MAX_JIFFY_OFFSET.
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
        if (m <= jiffies_to_msecs(MAX_JIFFY_OFFSET))
                return m * (HZ / MSEC_PER_SEC);

        return MAX_JIFFY_OFFSET;
}
#else
/*
 * Generic case: multiply, add the rounding adjustment and shift, using
 * the MSEC_TO_HZ_* constants.  When HZ exceeds 1000 the conversion is a
 * net multiplication, so inputs that would overflow are clamped first.
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
        if (HZ > MSEC_PER_SEC) {
                if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
                        return MAX_JIFFY_OFFSET;
        }

        return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
}
#endif
/**
 * msecs_to_jiffies - convert milliseconds to jiffies
 * @m:        time in milliseconds
 *
 * The conversion follows these rules:
 *
 * - negative values (when @m is viewed as an int) mean 'infinite
 *   timeout' and yield MAX_JIFFY_OFFSET
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - every other value is converted by multiplying with or dividing by a
 *   factor, with 32-bit overflows handled; see _msecs_to_jiffies() for
 *   the details.
 *
 * When @m is a compile-time constant (checked with
 * __builtin_constant_p()) the HZ-specific inline helper
 * _msecs_to_jiffies() is used here directly so gcc can fold most of the
 * code away.  Otherwise the conversion is deferred to the out-of-line
 * __msecs_to_jiffies(), which performs it at runtime.
 */
static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
{
        if (!__builtin_constant_p(m))
                return __msecs_to_jiffies(m);

        if ((int)m < 0)
                return MAX_JIFFY_OFFSET;

        return _msecs_to_jiffies(m);
}

extern unsigned long __usecs_to_jiffies(const unsigned int u);
#if USEC_PER_SEC % HZ
/*
 * HZ does not divide one million evenly: multiply, add the rounding
 * adjustment and shift, using the USEC_TO_HZ_* constants.
 */
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
        return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
                >> USEC_TO_HZ_SHR32;
}
#else
/*
 * Every jiffy covers a whole number of microseconds: divide by that
 * number, rounding upwards.
 */
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
        return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
}
#endif

/**
 * usecs_to_jiffies - convert microseconds to jiffies
 * @u:        time in microseconds
 *
 * The conversion follows these rules:
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' and yield
 *   MAX_JIFFY_OFFSET.
 *
 * - every other value is converted by multiplying with or dividing by a
 *   factor, with 32-bit overflows handled as for msecs_to_jiffies().
 *
 * When @u is a compile-time constant (checked with
 * __builtin_constant_p()) the range check and the HZ-specific inline
 * helper _usecs_to_jiffies() are used here directly so gcc can fold most
 * of the code away.  Otherwise the conversion is deferred to the
 * out-of-line __usecs_to_jiffies(), which performs it at runtime.
 */
static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
{
        if (!__builtin_constant_p(u))
                return __usecs_to_jiffies(u);

        if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
                return MAX_JIFFY_OFFSET;

        return _usecs_to_jiffies(u);
}

extern unsigned long timespec64_to_jiffies(const struct timespec64 *value);
extern void jiffies_to_timespec64(const unsigned long jiffies,
                                  struct timespec64 *value);
extern clock_t jiffies_to_clock_t(unsigned long x);
/*
 * Convert a signed jiffies difference to clock_t; negative differences
 * are treated as zero.
 */
static inline clock_t jiffies_delta_to_clock_t(long delta)
{
        const long clamped = max(0L, delta);

        return jiffies_to_clock_t(clamped);
}

/*
 * Convert a signed jiffies difference to milliseconds; negative
 * differences are treated as zero.
 */
static inline unsigned int jiffies_delta_to_msecs(long delta)
{
        const long clamped = max(0L, delta);

        return jiffies_to_msecs(clamped);
}

extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
extern u64 nsecs_to_jiffies64(u64 n);
extern unsigned long nsecs_to_jiffies(u64 n);

#define TIMESTAMP_SIZE        30

#endif





































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#ifndef _UAPI_LINUX_EVENTPOLL_H
#define _UAPI_LINUX_EVENTPOLL_H

/* For O_CLOEXEC */
#include <linux/fcntl.h>
#include <linux/types.h>

/* Flags for epoll_create1.  */
#define EPOLL_CLOEXEC O_CLOEXEC

/* Valid opcodes to issue to sys_epoll_ctl() */
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3

/*
 * Epoll event masks.
 *
 * Each expansion is fully parenthesized, like the EPOLL_URING_WAKE ...
 * EPOLLET definitions further down, so the cast cannot combine with an
 * adjacent operator at the point of use.
 */
#define EPOLLIN                ((__force __poll_t)0x00000001)
#define EPOLLPRI        ((__force __poll_t)0x00000002)
#define EPOLLOUT        ((__force __poll_t)0x00000004)
#define EPOLLERR        ((__force __poll_t)0x00000008)
#define EPOLLHUP        ((__force __poll_t)0x00000010)
#define EPOLLNVAL        ((__force __poll_t)0x00000020)
#define EPOLLRDNORM        ((__force __poll_t)0x00000040)
#define EPOLLRDBAND        ((__force __poll_t)0x00000080)
#define EPOLLWRNORM        ((__force __poll_t)0x00000100)
#define EPOLLWRBAND        ((__force __poll_t)0x00000200)
#define EPOLLMSG        ((__force __poll_t)0x00000400)
#define EPOLLRDHUP        ((__force __poll_t)0x00002000)

/*
 * Internal flag - wakeup generated by io_uring, used to detect recursion back
 * into the io_uring poll handler.
 */
#define EPOLL_URING_WAKE        ((__force __poll_t)(1U << 27))

/* Set exclusive wakeup mode for the target file descriptor */
#define EPOLLEXCLUSIVE        ((__force __poll_t)(1U << 28))

/*
 * Request the handling of system wakeup events so as to prevent system suspends
 * from happening while those events are being processed.
 *
 * Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will not be
 * re-allowed until epoll_wait is called again after consuming the wakeup
 * event(s).
 *
 * Requires CAP_BLOCK_SUSPEND
 */
#define EPOLLWAKEUP        ((__force __poll_t)(1U << 29))

/* Set the One Shot behaviour for the target file descriptor */
#define EPOLLONESHOT        ((__force __poll_t)(1U << 30))

/* Set the Edge Triggered behaviour for the target file descriptor */
#define EPOLLET                ((__force __poll_t)(1U << 31))

/* 
 * On x86-64 make the 64bit structure have the same alignment as the
 * 32bit structure. This makes 32bit emulation easier.
 *
 * UML/x86_64 needs the same packing as x86_64
 */
#ifdef __x86_64__
#define EPOLL_PACKED __attribute__((packed))
#else
#define EPOLL_PACKED
#endif

/*
 * Event descriptor: 'events' is a __poll_t bit mask (the EPOLL* constants
 * in this header are __poll_t values) and 'data' is a 64-bit value
 * (presumably an opaque caller-supplied cookie - confirm against users).
 * EPOLL_PACKED is __attribute__((packed)) on x86-64 and empty otherwise.
 */
struct epoll_event {
        __poll_t events;
        __u64 data;
} EPOLL_PACKED;

/*
 * Strip EPOLLWAKEUP from @epev unless it may be honoured.
 *
 * With CONFIG_PM_SLEEP the flag survives only if the caller has
 * CAP_BLOCK_SUSPEND; capable() is consulted only when the flag is
 * actually set.  Without CONFIG_PM_SLEEP the flag is always cleared.
 */
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
#ifdef CONFIG_PM_SLEEP
        if (!(epev->events & EPOLLWAKEUP) || capable(CAP_BLOCK_SUSPEND))
                return;
#endif
        epev->events &= ~EPOLLWAKEUP;
}
#endif /* _UAPI_LINUX_EVENTPOLL_H */















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rtc

#if !defined(_TRACE_RTC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RTC_H

#include <linux/rtc.h>
#include <linux/tracepoint.h>

/*
 * Event class for tracepoints that carry a time value and a status code.
 * Records 'secs' (time64_t) and 'err' (int) as passed by the caller and
 * prints them as "UTC (<secs>) (<err>)".
 */
DECLARE_EVENT_CLASS(rtc_time_alarm_class,

        TP_PROTO(time64_t secs, int err),

        TP_ARGS(secs, err),

        TP_STRUCT__entry(
                __field(time64_t, secs)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->secs = secs;
                __entry->err = err;
        ),

        TP_printk("UTC (%lld) (%d)",
                  __entry->secs, __entry->err
        )
);

/*
 * Tracepoints built on rtc_time_alarm_class: rtc_set_time, rtc_read_time,
 * rtc_set_alarm and rtc_read_alarm.  All four take (time64_t secs, int err)
 * and share the class's record layout and output format.
 */
DEFINE_EVENT(rtc_time_alarm_class, rtc_set_time,

        TP_PROTO(time64_t secs, int err),

        TP_ARGS(secs, err)
);

DEFINE_EVENT(rtc_time_alarm_class, rtc_read_time,

        TP_PROTO(time64_t secs, int err),

        TP_ARGS(secs, err)
);

DEFINE_EVENT(rtc_time_alarm_class, rtc_set_alarm,

        TP_PROTO(time64_t secs, int err),

        TP_ARGS(secs, err)
);

DEFINE_EVENT(rtc_time_alarm_class, rtc_read_alarm,

        TP_PROTO(time64_t secs, int err),

        TP_ARGS(secs, err)
);

/*
 * Tracepoint recording a request to set the periodic IRQ frequency.
 * Records 'freq' and 'err' (both int) as passed by the caller.  'freq' is
 * a signed int, so it is printed with %d; a negative value then shows up
 * as such instead of as a large unsigned number.
 */
TRACE_EVENT(rtc_irq_set_freq,

        TP_PROTO(int freq, int err),

        TP_ARGS(freq, err),

        TP_STRUCT__entry(
                __field(int, freq)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->freq = freq;
                __entry->err = err;
        ),

        TP_printk("set RTC periodic IRQ frequency:%d (%d)",
                  __entry->freq, __entry->err
        )
);

/*
 * Tracepoint recording a change of the periodic IRQ state.  Records
 * 'enabled' and 'err' (both int); the output shows "enable" for a non-zero
 * 'enabled' and "disable" otherwise, followed by the error code.
 */
TRACE_EVENT(rtc_irq_set_state,

        TP_PROTO(int enabled, int err),

        TP_ARGS(enabled, err),

        TP_STRUCT__entry(
                __field(int, enabled)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->enabled = enabled;
                __entry->err = err;
        ),

        TP_printk("%s RTC 2^N Hz periodic IRQs (%d)",
                  __entry->enabled ? "enable" : "disable",
                  __entry->err
        )
);

/*
 * Tracepoint recording a change of the alarm IRQ state.  Records
 * 'enabled' (unsigned int) and 'err' (int); the output shows "enable" for
 * a non-zero 'enabled' and "disable" otherwise, followed by the error code.
 */
TRACE_EVENT(rtc_alarm_irq_enable,

        TP_PROTO(unsigned int enabled, int err),

        TP_ARGS(enabled, err),

        TP_STRUCT__entry(
                __field(unsigned int, enabled)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->enabled = enabled;
                __entry->err = err;
        ),

        TP_printk("%s RTC alarm IRQ (%d)",
                  __entry->enabled ? "enable" : "disable",
                  __entry->err
        )
);

/*
 * Event class for tracepoints that carry an offset and a status code.
 * Records 'offset' (long) and 'err' (int) as passed by the caller and
 * prints them as "RTC offset: <offset> (<err>)".
 */
DECLARE_EVENT_CLASS(rtc_offset_class,

        TP_PROTO(long offset, int err),

        TP_ARGS(offset, err),

        TP_STRUCT__entry(
                __field(long, offset)
                __field(int, err)
        ),

        TP_fast_assign(
                __entry->offset = offset;
                __entry->err = err;
        ),

        TP_printk("RTC offset: %ld (%d)",
                  __entry->offset, __entry->err
        )
);

/*
 * Tracepoints built on rtc_offset_class: rtc_set_offset and
 * rtc_read_offset.  Both take (long offset, int err).
 */
DEFINE_EVENT(rtc_offset_class, rtc_set_offset,

        TP_PROTO(long offset, int err),

        TP_ARGS(offset, err)
);

DEFINE_EVENT(rtc_offset_class, rtc_read_offset,

        TP_PROTO(long offset, int err),

        TP_ARGS(offset, err)
);

/*
 * Event class for tracepoints that describe a struct rtc_timer.  Records
 * the timer pointer together with copies of timer->node.expires and
 * timer->period (both stored as ktime_t), so the values reflect the timer
 * at the moment the event fires.  The pointer is printed with %p and the
 * two times with %lld.
 */
DECLARE_EVENT_CLASS(rtc_timer_class,

        TP_PROTO(struct rtc_timer *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field(struct rtc_timer *, timer)
                __field(ktime_t, expires)
                __field(ktime_t, period)
        ),

        TP_fast_assign(
                __entry->timer = timer;
                __entry->expires = timer->node.expires;
                __entry->period = timer->period;
        ),

        TP_printk("RTC timer:(%p) expires:%lld period:%lld",
                  __entry->timer, __entry->expires, __entry->period
        )
);

/*
 * Tracepoints built on rtc_timer_class: rtc_timer_enqueue,
 * rtc_timer_dequeue and rtc_timer_fired.  Each takes the struct rtc_timer
 * concerned; the timer pointer is dereferenced when the event is recorded.
 */
DEFINE_EVENT(rtc_timer_class, rtc_timer_enqueue,

        TP_PROTO(struct rtc_timer *timer),

        TP_ARGS(timer)
);

DEFINE_EVENT(rtc_timer_class, rtc_timer_dequeue,

        TP_PROTO(struct rtc_timer *timer),

        TP_ARGS(timer)
);

DEFINE_EVENT(rtc_timer_class, rtc_timer_fired,

        TP_PROTO(struct rtc_timer *timer),

        TP_ARGS(timer)
);

#endif /* _TRACE_RTC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>












































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
 *  include/linux/ktime.h
 *
 *  ktime_t - nanosecond-resolution time format.
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes and macros.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *
 *          Roman Zippel provided the ideas and primary code snippets of
 *          the ktime_t union and further simplifications of the original
 *          code.
 *
 *  For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_KTIME_H
#define _LINUX_KTIME_H

#include <linux/time.h>
#include <linux/jiffies.h>
#include <asm/bug.h>

/* Nanosecond scalar representation for kernel time values */
typedef s64        ktime_t;

/**
 * ktime_set - Build a ktime_t from a seconds/nanoseconds pair
 * @secs:        seconds part
 * @nsecs:        nanoseconds part
 *
 * Return: KTIME_MAX when @secs is KTIME_SEC_MAX or more, otherwise
 * @secs scaled to nanoseconds plus @nsecs.
 */
static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
{
        s64 total;

        /* Saturate rather than compute a value past the representable range. */
        if (unlikely(secs >= KTIME_SEC_MAX))
                return KTIME_MAX;

        total = secs * NSEC_PER_SEC;
        total += (s64)nsecs;

        return total;
}

/* Subtract two ktime_t variables. res = lhs - rhs: */
#define ktime_sub(lhs, rhs)        ((lhs) - (rhs))

/* Add two ktime_t variables. res = lhs + rhs: */
#define ktime_add(lhs, rhs)        ((lhs) + (rhs))

/*
 * Same as ktime_add(), but avoids undefined behaviour on overflow; however,
 * this means that you must check the result for overflow yourself.
 * The left operand is cast to u64 before the addition.
 */
#define ktime_add_unsafe(lhs, rhs)        ((u64) (lhs) + (rhs))

/*
 * Add a ktime_t variable and a scalar nanosecond value.
 * res = kt + nsval:
 */
#define ktime_add_ns(kt, nsval)                ((kt) + (nsval))

/*
 * Subtract a scalar nanosecond value from a ktime_t variable.
 * res = kt - nsval:
 */
#define ktime_sub_ns(kt, nsval)                ((kt) - (nsval))

/* Build a ktime_t from the tv_sec/tv_nsec pair of a timespec64. */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
        const ktime_t kt = ktime_set(ts.tv_sec, ts.tv_nsec);

        return kt;
}

/* Map the ktime_t to timespec64 conversion onto ns_to_timespec64() */
#define ktime_to_timespec64(kt)                ns_to_timespec64((kt))

/* ktime_t already counts nanoseconds, so the conversion is the identity. */
static inline s64 ktime_to_ns(const ktime_t kt)
{
        const s64 ns = kt;

        return ns;
}

/**
 * ktime_compare - Three-way comparison of two ktime_t values
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: ...
 *   cmp1  < cmp2: return -1
 *   cmp1 == cmp2: return 0
 *   cmp1  > cmp2: return 1
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
        if (cmp1 == cmp2)
                return 0;

        return cmp1 < cmp2 ? -1 : 1;
}

/**
 * ktime_after - Test whether one ktime_t value is later than another
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened after cmp2.
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
        const int order = ktime_compare(cmp1, cmp2);

        return order > 0;
}

/**
 * ktime_before - Test whether one ktime_t value is earlier than another
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened before cmp2.
 */
static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2)
{
        const int order = ktime_compare(cmp1, cmp2);

        return order < 0;
}

/*
 * ktime_divns - divide a ktime_t nanosecond value by @div.
 *
 * Two builds are provided, selected on BITS_PER_LONG:
 *  - below 64 bits: a negative divisor hits BUG_ON(); a compile-time
 *    constant divisor with no bits set above bit 31 is handled inline by
 *    running do_div() on the magnitude of @kt and re-applying the sign;
 *    anything else is passed to the out-of-line __ktime_divns().
 *  - otherwise: a negative divisor only triggers WARN_ON() and the result
 *    is the plain kt / div.
 */
#if BITS_PER_LONG < 64
extern s64 __ktime_divns(const ktime_t kt, s64 div);
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        /*
         * Negative divisors could cause an inf loop,
         * so bug out here.
         */
        BUG_ON(div < 0);
        if (__builtin_constant_p(div) && !(div >> 32)) {
                s64 ns = kt;
                /* do_div() is fed the magnitude; the sign is restored below */
                u64 tmp = ns < 0 ? -ns : ns;

                do_div(tmp, div);
                return ns < 0 ? -tmp : tmp;
        } else {
                return __ktime_divns(kt, div);
        }
}
#else /* BITS_PER_LONG < 64 */
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        /*
         * 32-bit implementation cannot handle negative divisors,
         * so catch them on 64bit as well.
         */
        WARN_ON(div < 0);
        return kt / div;
}
#endif

/* Convert a ktime_t to microseconds by dividing by NSEC_PER_USEC. */
static inline s64 ktime_to_us(const ktime_t kt)
{
        const s64 us = ktime_divns(kt, NSEC_PER_USEC);

        return us;
}

/* Convert a ktime_t to milliseconds by dividing by NSEC_PER_MSEC. */
static inline s64 ktime_to_ms(const ktime_t kt)
{
        const s64 ms = ktime_divns(kt, NSEC_PER_MSEC);

        return ms;
}

/* Microseconds elapsed from @earlier to @later (later - earlier). */
static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
{
        const ktime_t delta = ktime_sub(later, earlier);

        return ktime_to_us(delta);
}

/* Milliseconds elapsed from @earlier to @later (later - earlier). */
static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier)
{
        const ktime_t delta = ktime_sub(later, earlier);

        return ktime_to_ms(delta);
}

/* Add @usec microseconds (scaled by NSEC_PER_USEC) to @kt. */
static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec)
{
        const u64 ns = usec * NSEC_PER_USEC;

        return ktime_add_ns(kt, ns);
}

/* Add @msec milliseconds (scaled by NSEC_PER_MSEC) to @kt. */
static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
        const u64 ns = msec * NSEC_PER_MSEC;

        return ktime_add_ns(kt, ns);
}

/* Subtract @usec microseconds (scaled by NSEC_PER_USEC) from @kt. */
static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec)
{
        const u64 ns = usec * NSEC_PER_USEC;

        return ktime_sub_ns(kt, ns);
}

/* Subtract @msec milliseconds (scaled by NSEC_PER_MSEC) from @kt. */
static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec)
{
        const u64 ns = msec * NSEC_PER_MSEC;

        return ktime_sub_ns(kt, ns);
}

/*
 * Out-of-line addition of two ktime_t values; defined elsewhere.
 * NOTE(review): presumably the overflow-checked counterpart of ktime_add() -
 * confirm against the definition.
 */
extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs);

/**
 * ktime_to_timespec64_cond - convert a non-zero ktime_t to timespec64
 * @kt:                the ktime_t variable to convert
 * @ts:                the timespec64 variable to store the result in
 *
 * @ts is left untouched when @kt is 0.
 *
 * Return: %true if there was a successful conversion, %false if kt was 0.
 */
static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt,
                                                       struct timespec64 *ts)
{
        if (!kt)
                return false;

        *ts = ktime_to_timespec64(kt);
        return true;
}

#include <vdso/ktime.h>

/* Reinterpret a nanosecond count as a ktime_t; no scaling is applied. */
static inline ktime_t ns_to_ktime(u64 ns)
{
        const ktime_t kt = ns;

        return kt;
}

/* Convert a millisecond count to ktime_t by scaling with NSEC_PER_MSEC. */
static inline ktime_t ms_to_ktime(u64 ms)
{
        const u64 ns = ms * NSEC_PER_MSEC;

        return ns;
}

# include <linux/timekeeping.h>
# include <linux/timekeeping32.h>

#endif






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */

/*
 * inotify's per-event record.  The generic fsnotify_event is embedded as
 * @fse so that INOTIFY_E() can get back to this structure with
 * container_of().
 */
struct inotify_event_info {
        struct fsnotify_event fse;
        u32 mask;
        int wd;                 /* presumably the watch descriptor - confirm */
        u32 sync_cookie;
        int name_len;           /* presumably the length of name[] - confirm */
        char name[];            /* flexible array member, sized at allocation */
};

/* inotify's wrapper around a generic fsnotify_mark, adding a wd value. */
struct inotify_inode_mark {
        struct fsnotify_mark fsn_mark;
        int wd;                 /* presumably the watch descriptor - confirm */
};

/* Map an embedded fsnotify_event back to its enclosing inotify_event_info. */
static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
{
        struct inotify_event_info *event;

        event = container_of(fse, struct inotify_event_info, fse);
        return event;
}

/*
 * INOTIFY_USER_MASK represents all of the mask bits that we expose to
 * userspace.  There is at least one bit (FS_EVENT_ON_CHILD) which is
 * used only internally to the kernel.
 */
#define INOTIFY_USER_MASK (IN_ALL_EVENTS)

/*
 * Compute the mask reported to userspace for a mark: the event bits of
 * mark->mask that fall inside INOTIFY_USER_MASK, plus IN_EXCL_UNLINK and
 * IN_ONESHOT when the corresponding FSNOTIFY_MARK_FLAG_* bit is set in
 * mark->flags.
 */
static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
{
        const __u32 events = fsn_mark->mask & INOTIFY_USER_MASK;
        const __u32 excl =
                (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) ?
                IN_EXCL_UNLINK : 0;
        const __u32 oneshot =
                (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) ?
                IN_ONESHOT : 0;

        return events | excl | oneshot;
}

/* Entry points and objects defined elsewhere in the inotify implementation. */
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
                                           struct fsnotify_group *group);
extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
                                      u32 mask, struct inode *inode,
                                      struct inode *dir,
                                      const struct qstr *name, u32 cookie);

/* fsnotify operations table for inotify */
extern const struct fsnotify_ops inotify_fsnotify_ops;
/* NOTE(review): by its name, the slab cache for struct inotify_inode_mark - confirm */
extern struct kmem_cache *inotify_inode_mark_cachep;

#ifdef CONFIG_INOTIFY_USER
/* Drop one UCOUNT_INOTIFY_INSTANCES charge from @ucounts. */
static inline void dec_inotify_instances(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
}

/*
 * Charge one UCOUNT_INOTIFY_WATCHES against the namespace/uid pair taken
 * from @ucounts and return whatever inc_ucount() returns.
 */
static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
{
        return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
}

/* Drop one UCOUNT_INOTIFY_WATCHES charge from @ucounts. */
static inline void dec_inotify_watches(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
}
#endif
























































    9 


























    9 

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/gfp.h>

#ifdef CONFIG_SMP

/*
 * SMP layout: a central s64 value plus an array of per-CPU s32 values.
 * NOTE(review): @lock presumably serialises updates of @count - confirm
 * against the out-of-line implementation.
 */
struct percpu_counter {
        raw_spinlock_t lock;
        s64 count;              /* value returned by percpu_counter_read() */
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        s32 __percpu *counters; /* NULL means "not initialized" */
};

/* Default batch value passed by the percpu_counter_add()/compare() wrappers. */
extern int percpu_counter_batch;

int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
                          struct lock_class_key *key);

/*
 * Initialise @fbc to @value.  Every expansion of this macro declares its
 * own static lock_class_key and hands it to __percpu_counter_init(); the
 * macro evaluates to that function's return value.
 */
#define percpu_counter_init(fbc, value, gfp)                                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init(fbc, value, gfp, &__key);                \
        })

/* Out-of-line SMP implementation, defined elsewhere. */
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

/* Compare @fbc with @rhs using the default batch, percpu_counter_batch. */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        const int order =
                __percpu_counter_compare(fbc, rhs, percpu_counter_batch);

        return order;
}

/* Add @amount to @fbc using the default batch, percpu_counter_batch. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

/* __percpu_counter_sum() of @fbc, with negative results clamped to 0. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        const s64 sum = __percpu_counter_sum(fbc);

        if (sum < 0)
                return 0;
        return sum;
}

/* Unclamped result of __percpu_counter_sum() for @fbc. */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        const s64 sum = __percpu_counter_sum(fbc);

        return sum;
}

/* Return the central count field only; the per-CPU array is not consulted. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        const s64 snapshot = fbc->count;

        return snapshot;
}

/*
 * percpu_counter_read() may yield a small negative number even for a
 * counter that should never go negative, so this variant clamps such a
 * reading to 0.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Single load, so the test and the returned value agree */
        const s64 snapshot = READ_ONCE(fbc->count);

        return snapshot >= 0 ? snapshot : 0;
}

/* A counter counts as initialized once its per-CPU array pointer is set. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return !!fbc->counters;
}

#else /* !CONFIG_SMP */

/* !CONFIG_SMP layout: the counter is just a single s64. */
struct percpu_counter {
        s64 count;
};

/*
 * !CONFIG_SMP: store @amount as the initial value.  @gfp is unused because
 * nothing is allocated.  Always returns 0.
 */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        fbc->count = amount;
        return 0;
}

/* !CONFIG_SMP: nothing to release, so this is a no-op. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

/* !CONFIG_SMP: overwrite the counter value with @amount. */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        fbc->count = amount;
}

/*
 * !CONFIG_SMP: exact three-way comparison of the counter with @rhs.
 * Returns 1, -1 or 0 for greater, smaller or equal.
 */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        if (fbc->count == rhs)
                return 0;

        return fbc->count > rhs ? 1 : -1;
}

/* !CONFIG_SMP: @batch is ignored; defers to percpu_counter_compare(). */
static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        const int order = percpu_counter_compare(fbc, rhs);

        return order;
}

/* !CONFIG_SMP: add @amount to the counter with preemption disabled. */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        preempt_disable();
        fbc->count = fbc->count + amount;
        preempt_enable();
}

/* !CONFIG_SMP: @batch is ignored; this is a plain percpu_counter_add(). */
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        percpu_counter_add(fbc, amount);
}

/* !CONFIG_SMP: return the counter value. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        const s64 value = fbc->count;

        return value;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative, so unlike the SMP variant no clamping
 * is done here and the stored value is returned as is.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        return fbc->count;
}

/* !CONFIG_SMP: same value as percpu_counter_read_positive(). */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        const s64 value = percpu_counter_read_positive(fbc);

        return value;
}

/* !CONFIG_SMP: same value as percpu_counter_read(). */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        const s64 value = percpu_counter_read(fbc);

        return value;
}

/* !CONFIG_SMP: there is no per-CPU array to check, so always true. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return true;
}

/* !CONFIG_SMP: no-op. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}
#endif        /* CONFIG_SMP */

/* Add 1 to @fbc via percpu_counter_add() (SMP and UP alike). */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

/* Subtract 1 from @fbc via percpu_counter_add() (SMP and UP alike). */
static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount from @fbc by adding its negation. */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        const s64 delta = -amount;

        percpu_counter_add(fbc, delta);
}

#endif /* _LINUX_PERCPU_COUNTER_H */
















































    1 

    1 


    1 


















    1 

    1 
    1 





    1 
    1 
    1 



    1 






































































































































































    1 



    1 



    1 
    1 





































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/proc/generic.c --- generic routines for the proc-fs
 *
 * This file contains generic proc-fs routines for handling
 * directories and files.
 * 
 * Copyright (C) 1991, 1992 Linus Torvalds.
 * Copyright (C) 1997 Theodore Ts'o
 */

#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>

#include "internal.h"

/*
 * Taken for reading around lookups and walks of the per-directory subdir
 * rbtrees in this file, and for writing when proc_register() inserts.
 */
static DEFINE_RWLOCK(proc_subdir_lock);

/* Slab cache that struct proc_dir_entry objects are allocated from. */
struct kmem_cache *proc_dir_entry_cache __ro_after_init;

/*
 * Release a proc_dir_entry and the memory hanging off it: the ->data
 * buffer of a symlink entry, the name when it is not stored in the
 * entry's inline_name buffer, and finally the entry itself.
 */
void pde_free(struct proc_dir_entry *pde)
{
        const bool is_symlink = S_ISLNK(pde->mode);
        const bool heap_name = pde->name != pde->inline_name;

        if (is_symlink)
                kfree(pde->data);
        if (heap_name)
                kfree(pde->name);
        kmem_cache_free(proc_dir_entry_cache, pde);
}

/*
 * Ordering used by the subdir rbtree: names are ordered by length first
 * and, for equal lengths, by memcmp() of their bytes.  Returns <0, 0 or >0.
 */
static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
{
        if (len != de->namelen)
                return len < de->namelen ? -1 : 1;

        return memcmp(name, de->name, len);
}

/* First entry of @dir's subdir tree in tree order, or NULL if it is empty. */
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
        struct rb_node *first = rb_first(&dir->subdir);

        return rb_entry_safe(first, struct proc_dir_entry, subdir_node);
}

/*
 * Entry following @dir in its parent's subdir tree, or NULL at the end.
 * Despite the parameter name, @dir is the current sibling, not the parent.
 */
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
{
        struct rb_node *next = rb_next(&dir->subdir_node);

        return rb_entry_safe(next, struct proc_dir_entry, subdir_node);
}

/*
 * Search @dir's subdir tree for the entry whose name is the first @len
 * bytes of @name, using the proc_match() ordering.  Returns the entry, or
 * NULL when none matches.  No locking is done here.
 */
static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
                                              const char *name,
                                              unsigned int len)
{
        struct rb_node *cur = dir->subdir.rb_node;

        while (cur) {
                struct proc_dir_entry *candidate;
                int cmp;

                candidate = rb_entry(cur, struct proc_dir_entry, subdir_node);
                cmp = proc_match(name, candidate, len);
                if (!cmp)
                        return candidate;

                cur = cmp < 0 ? cur->rb_left : cur->rb_right;
        }
        return NULL;
}

/*
 * Insert @de into @dir's subdir tree at the position given by the
 * proc_match() ordering.  Returns false, leaving the tree untouched, when
 * an entry with the same name is already present; true otherwise.
 */
static bool pde_subdir_insert(struct proc_dir_entry *dir,
                              struct proc_dir_entry *de)
{
        struct rb_root *root = &dir->subdir;
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        /* Walk down to the empty slot where the new node belongs */
        while (*link) {
                struct proc_dir_entry *cur;
                int cmp;

                parent = *link;
                cur = rb_entry(parent, struct proc_dir_entry, subdir_node);
                cmp = proc_match(de->name, cur, de->namelen);
                if (!cmp)
                        return false;

                link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
        }

        /* Hook the node in, then let the rbtree code rebalance */
        rb_link_node(&de->subdir_node, parent, link);
        rb_insert_color(&de->subdir_node, root);
        return true;
}

static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct proc_dir_entry *de = PDE(inode);
        int error;

        error = setattr_prepare(dentry, iattr);
        if (error)
                return error;

        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);

        proc_set_user(de, inode->i_uid, inode->i_gid);
        de->mode = inode->i_mode;
        return 0;
}

/*
 * ->getattr for proc directories.  When the inode has a proc_dir_entry
 * with a non-zero nlink, that value is copied into the inode first; the
 * stat data is then filled in by generic_fillattr().  Always returns 0.
 */
static int proc_getattr(const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct proc_dir_entry *de = PDE(inode);
        nlink_t nlink = de ? READ_ONCE(de->nlink) : 0;

        if (nlink > 0)
                set_nlink(inode, nlink);

        generic_fillattr(inode, stat);
        return 0;
}

/* Inode operations table with only ->setattr filled in. */
static const struct inode_operations proc_file_inode_operations = {
        .setattr        = proc_notify_change,
};

/*
 * Resolve the directory part of a '/'-separated name.  For example
 * "tty/driver/serial" yields the proc_dir_entry for "/proc/tty/driver"
 * in *ret and "serial" in *residual.
 *
 * The walk starts at *ret, or at proc_root when *ret is NULL.  Returns 0
 * on success; when a directory component is missing it WARNs and returns
 * -ENOENT without updating *ret or *residual.  No locking is done here.
 */
static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
                             const char **residual)
{
        struct proc_dir_entry *dir = *ret ? *ret : &proc_root;
        const char *component = name;
        const char *slash;

        for (slash = strchr(component, '/'); slash;
             slash = strchr(component, '/')) {
                dir = pde_subdir_find(dir, component, slash - component);
                if (!dir) {
                        WARN(1, "name '%s'\n", name);
                        return -ENOENT;
                }
                component = slash + 1;
        }

        *residual = component;
        *ret = dir;
        return 0;
}

/*
 * Locked wrapper: runs __xlate_proc_name() with proc_subdir_lock held for
 * reading and returns its result.
 */
static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
                           const char **residual)
{
        int rv;

        read_lock(&proc_subdir_lock);
        rv = __xlate_proc_name(name, ret, residual);
        read_unlock(&proc_subdir_lock);
        return rv;
}

/* ID allocator backing the dynamically assigned proc inode numbers. */
static DEFINE_IDA(proc_inum_ida);

/* Offset added to an allocated id to form the inode number. */
#define PROC_DYNAMIC_FIRST 0xF0000000U

/*
 * Allocate an inode number between PROC_DYNAMIC_FIRST and 0xffffffff and
 * store it in *inum.
 *
 * Returns 0 on success.  On failure the negative value from
 * ida_simple_get() is returned and *inum is left untouched.
 */
int proc_alloc_inum(unsigned int *inum)
{
        const int id = ida_simple_get(&proc_inum_ida, 0,
                                      UINT_MAX - PROC_DYNAMIC_FIRST + 1,
                                      GFP_KERNEL);

        if (id < 0)
                return id;

        *inum = PROC_DYNAMIC_FIRST + (unsigned int)id;
        return 0;
}

/*
 * Give back an inode number obtained from proc_alloc_inum(): the
 * PROC_DYNAMIC_FIRST offset is removed to recover the ida id.
 */
void proc_free_inum(unsigned int inum)
{
        const unsigned int id = inum - PROC_DYNAMIC_FIRST;

        ida_simple_remove(&proc_inum_ida, id);
}

/*
 * ->d_revalidate: refuses RCU-mode lookups with -ECHILD.  Otherwise it
 * returns 0 (forcing a fresh lookup) when the entry's in_use count is
 * negative, and 1 when it is not.
 */
static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (flags & LOOKUP_RCU)
                return -ECHILD;

        /* 0 == revalidate */
        return atomic_read(&PDE(d_inode(dentry))->in_use) < 0 ? 0 : 1;
}

/* ->d_delete: non-zero exactly when the entry's in_use count is negative. */
static int proc_misc_d_delete(const struct dentry *dentry)
{
        const int users = atomic_read(&PDE(d_inode(dentry))->in_use);

        return users < 0;
}

/* Default dentry operations that __proc_create() assigns to new entries. */
static const struct dentry_operations proc_misc_dentry_ops = {
        .d_revalidate        = proc_misc_d_revalidate,
        .d_delete        = proc_misc_d_delete,
};

/*
 * Look up dentry's name among the children of @de.
 *
 * Negative dentries are deliberately not created: a miss is reported as
 * ERR_PTR(-ENOENT) by hand.  On a hit a reference is taken on the child
 * under proc_subdir_lock, an inode is obtained for it, the child's
 * dentry operations are installed and the result of d_splice_alias() is
 * returned.  ERR_PTR(-ENOMEM) is returned when no inode can be obtained.
 */
struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
                              struct proc_dir_entry *de)
{
        struct proc_dir_entry *child;
        struct inode *inode;

        read_lock(&proc_subdir_lock);
        child = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
        if (!child) {
                read_unlock(&proc_subdir_lock);
                return ERR_PTR(-ENOENT);
        }
        pde_get(child);
        read_unlock(&proc_subdir_lock);

        inode = proc_get_inode(dir->i_sb, child);
        if (!inode)
                return ERR_PTR(-ENOMEM);

        d_set_d_op(dentry, child->proc_dops);
        return d_splice_alias(inode, dentry);
}

/*
 * ->lookup for generic proc directories.  On a mount whose pidonly setting
 * is PROC_PIDONLY_ON every name is reported as missing; otherwise the
 * lookup is done in the directory's own proc_dir_entry.
 */
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
                unsigned int flags)
{
        if (proc_sb_info(dir->i_sb)->pidonly == PROC_PIDONLY_ON)
                return ERR_PTR(-ENOENT);

        return proc_lookup_de(dir, dentry, PDE(dir));
}

/*
 * This returns non-zero if at EOF, so that the /proc
 * root directory can use this and check if it should
 * continue with the <pid> entries..
 *
 * Note that the VFS-layer doesn't care about the return
 * value of the readdir() call, as long as it's non-negative
 * for success..
 *
 * More precisely: 1 is returned only after the last child of @de has been
 * emitted by this call.  0 is returned when dir_emit_dots()/dir_emit()
 * refuse an entry, and also when ctx->pos already points at or past the
 * end of the children.
 */
int proc_readdir_de(struct file *file, struct dir_context *ctx,
                    struct proc_dir_entry *de)
{
        int i;

        if (!dir_emit_dots(file, ctx))
                return 0;

        /* Positions 0 and 1 are the dot entries; skip that many children */
        i = ctx->pos - 2;
        read_lock(&proc_subdir_lock);
        de = pde_subdir_first(de);
        for (;;) {
                if (!de) {
                        read_unlock(&proc_subdir_lock);
                        return 0;
                }
                if (!i)
                        break;
                de = pde_subdir_next(de);
                i--;
        }

        do {
                struct proc_dir_entry *next;
                /*
                 * Pin the entry so it can be used while the lock is
                 * dropped around dir_emit().
                 */
                pde_get(de);
                read_unlock(&proc_subdir_lock);
                /* mode >> 12 passes the file-type bits as the entry type */
                if (!dir_emit(ctx, de->name, de->namelen,
                            de->low_ino, de->mode >> 12)) {
                        pde_put(de);
                        return 0;
                }
                ctx->pos++;
                /* Retake the lock before stepping to the next sibling */
                read_lock(&proc_subdir_lock);
                next = pde_subdir_next(de);
                pde_put(de);
                de = next;
        } while (de);
        read_unlock(&proc_subdir_lock);
        return 1;
}

/*
 * ->iterate_shared for generic proc directories.  On a mount whose pidonly
 * setting is PROC_PIDONLY_ON nothing is emitted and 1 is returned;
 * otherwise the children of the directory's proc_dir_entry are listed.
 */
int proc_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);

        if (proc_sb_info(inode->i_sb)->pidonly == PROC_PIDONLY_ON)
                return 1;

        return proc_readdir_de(file, ctx, PDE(inode));
}

/*
 * These are the generic /proc directory operations. They
 * use the in-memory "struct proc_dir_entry" tree to parse
 * the /proc directory.  _proc_mkdir() installs this table on the
 * directories it creates.
 */
static const struct file_operations proc_dir_operations = {
        .llseek                        = generic_file_llseek,
        .read                        = generic_read_dir,
        .iterate_shared                = proc_readdir,
};

/* ->d_revalidate that always returns 0, so cached dentries are never reused. */
static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        return 0;
}

/*
 * Dentry operations that always force a lookup.  __proc_create() checks
 * whether the parent uses this table and, if so, calls pde_force_lookup()
 * on the new entry.
 */
const struct dentry_operations proc_net_dentry_ops = {
        .d_revalidate        = proc_net_d_revalidate,
        .d_delete        = always_delete_dentry,
};

/*
 * proc directories can do almost nothing..
 * Only lookup, getattr and setattr are provided; _proc_mkdir() installs
 * this table on the directories it creates.
 */
static const struct inode_operations proc_dir_inode_operations = {
        .lookup                = proc_lookup,
        .getattr        = proc_getattr,
        .setattr        = proc_notify_change,
};

/*
 * Give @dp an inode number and link it into @dir's subdir tree.
 *
 * Returns @dp on success (dir->nlink is bumped).  On failure - no inode
 * number available, or a sibling with the same name already registered -
 * @dp is freed and NULL is returned.
 */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp)
{
        bool inserted;

        if (proc_alloc_inum(&dp->low_ino)) {
                pde_free(dp);
                return NULL;
        }

        write_lock(&proc_subdir_lock);
        dp->parent = dir;
        inserted = pde_subdir_insert(dir, dp);
        if (inserted) {
                dir->nlink++;
        } else {
                WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                     dir->name, dp->name);
        }
        write_unlock(&proc_subdir_lock);

        if (inserted)
                return dp;

        /* Duplicate name: undo the inode number allocation as well */
        proc_free_inum(dp->low_ino);
        pde_free(dp);
        return NULL;
}

/*
 * Allocate and initialise, but do not register, a proc_dir_entry.
 *
 * @parent: in/out.  On entry the directory @name is relative to (NULL
 *          means proc_root); on return the directory that the last path
 *          component of @name belongs in, as resolved by xlate_proc_name().
 * @name:   possibly '/'-separated path; only the last component names the
 *          new entry.
 * @mode:   stored in the entry's mode field.
 * @nlink:  stored in the entry's nlink field.
 *
 * The last component is rejected (with a WARN) when it is empty, 256
 * bytes or longer, "." or "..", a name directly under proc_root that
 * name_to_int() parses as a number, or when the parent is a permanently
 * empty directory.
 *
 * Returns the new entry holding one reference, or NULL on any failure.
 * Linking it into the tree is the caller's job (see proc_register()).
 */
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
                                          const char *name,
                                          umode_t mode,
                                          nlink_t nlink)
{
        struct proc_dir_entry *ent = NULL;
        const char *fn;
        struct qstr qstr;

        if (xlate_proc_name(name, parent, &fn) != 0)
                goto out;
        qstr.name = fn;
        qstr.len = strlen(fn);
        if (qstr.len == 0 || qstr.len >= 256) {
                WARN(1, "name len %u\n", qstr.len);
                return NULL;
        }
        if (qstr.len == 1 && fn[0] == '.') {
                WARN(1, "name '.'\n");
                return NULL;
        }
        if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
                WARN(1, "name '..'\n");
                return NULL;
        }
        if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
                WARN(1, "create '/proc/%s' by hand\n", qstr.name);
                return NULL;
        }
        if (is_empty_pde(*parent)) {
                /* Newline-terminated like every other WARN in this function */
                WARN(1, "attempt to add to permanently empty directory\n");
                return NULL;
        }

        ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
        if (!ent)
                goto out;

        /* Short names live inside the entry; longer ones get their own buffer */
        if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) {
                ent->name = ent->inline_name;
        } else {
                ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
                if (!ent->name) {
                        pde_free(ent);
                        return NULL;
                }
        }

        /* qstr.len + 1 also copies the terminating NUL */
        memcpy(ent->name, fn, qstr.len + 1);
        ent->namelen = qstr.len;
        ent->mode = mode;
        ent->nlink = nlink;
        ent->subdir = RB_ROOT;
        refcount_set(&ent->refcnt, 1);
        spin_lock_init(&ent->pde_unload_lock);
        INIT_LIST_HEAD(&ent->pde_openers);
        /* New entries start out with the parent directory's uid/gid */
        proc_set_user(ent, (*parent)->uid, (*parent)->gid);

        ent->proc_dops = &proc_misc_dentry_ops;
        /* Revalidate everything under /proc/${pid}/net */
        if ((*parent)->proc_dops == &proc_net_dentry_ops)
                pde_force_lookup(ent);

out:
        return ent;
}

/*
 * Create and register a symlink entry called @name under @parent that
 * points at @dest.  A private copy of @dest is stored in ->data and its
 * length in ->size.  Returns the registered entry, or NULL on failure.
 */
struct proc_dir_entry *proc_symlink(const char *name,
                struct proc_dir_entry *parent, const char *dest)
{
        struct proc_dir_entry *ent;

        ent = __proc_create(&parent, name,
                          (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO), 1);
        if (!ent)
                return NULL;

        ent->size = strlen(dest);
        ent->data = kmalloc(ent->size + 1, GFP_KERNEL);
        if (!ent->data) {
                pde_free(ent);
                return NULL;
        }

        strcpy((char *)ent->data, dest);
        ent->proc_iops = &proc_link_inode_operations;
        return proc_register(parent, ent);
}
EXPORT_SYMBOL(proc_symlink);

/*
 * _proc_mkdir - create and register a directory entry.
 * @name:         name handed to __proc_create()
 * @mode:         permission bits; 0 selects read+execute for everyone
 * @parent:       directory entry handed to __proc_create()/proc_register()
 * @data:         opaque pointer stored in ->data
 * @force_lookup: when true, pde_force_lookup() is applied to the new entry
 *
 * Returns the result of proc_register(), or NULL if __proc_create() fails.
 */
struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
                struct proc_dir_entry *parent, void *data, bool force_lookup)
{
        umode_t perms = mode ? mode : (S_IRUGO | S_IXUGO);
        struct proc_dir_entry *dir;

        /* Directories are created with nlink 2. */
        dir = __proc_create(&parent, name, S_IFDIR | perms, 2);
        if (!dir)
                return NULL;

        dir->data = data;
        dir->proc_dir_ops = &proc_dir_operations;
        dir->proc_iops = &proc_dir_inode_operations;
        if (force_lookup)
                pde_force_lookup(dir);

        return proc_register(parent, dir);
}
EXPORT_SYMBOL_GPL(_proc_mkdir);

/*
 * proc_mkdir_data - create a directory entry carrying private data.
 *
 * Same as _proc_mkdir() with force_lookup disabled.
 */
struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent, void *data)
{
        const bool force_lookup = false;

        return _proc_mkdir(name, mode, parent, data, force_lookup);
}
EXPORT_SYMBOL_GPL(proc_mkdir_data);

/*
 * proc_mkdir_mode - create a directory entry with an explicit mode.
 *
 * No private data is attached and force_lookup is disabled.
 */
struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
                                       struct proc_dir_entry *parent)
{
        return _proc_mkdir(name, mode, parent, NULL, false);
}
EXPORT_SYMBOL(proc_mkdir_mode);

struct proc_dir_entry *proc_mkdir(const char *name,
                struct proc_dir_entry *parent)
{
        return proc_mkdir_data(name, 0, parent, NULL);
}
EXPORT_SYMBOL(proc_mkdir);

/*
 * proc_create_mount_point - create a directory entry without any operations.
 * @name: name handed to __proc_create(); the parent starts out as NULL and
 *        is filled in by __proc_create()
 *
 * The entry is a read+execute directory whose ->data, ->proc_dir_ops and
 * ->proc_iops are all NULL.
 *
 * Returns the result of proc_register(), or NULL if __proc_create() fails.
 */
struct proc_dir_entry *proc_create_mount_point(const char *name)
{
        const umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO;
        struct proc_dir_entry *parent = NULL;
        struct proc_dir_entry *mnt;

        mnt = __proc_create(&parent, name, mode, 2);
        if (!mnt)
                return NULL;

        mnt->data = NULL;
        mnt->proc_dir_ops = NULL;
        mnt->proc_iops = NULL;

        return proc_register(parent, mnt);
}
EXPORT_SYMBOL(proc_create_mount_point);

/*
 * proc_create_reg - allocate (but do not register) a regular-file entry.
 * @name:   name handed to __proc_create()
 * @mode:   file mode; a missing type defaults to S_IFREG and missing
 *          permission bits default to read-for-everyone
 * @parent: in/out parent pointer handed to __proc_create()
 * @data:   opaque pointer stored in ->data
 *
 * Warns once and returns NULL when @mode names a non-regular file type.
 * Returns NULL as well when __proc_create() fails.  The caller is expected
 * to finish initialisation and register the entry itself.
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data)
{
        struct proc_dir_entry *reg;

        if (!(mode & S_IFMT))
                mode |= S_IFREG;
        if (!(mode & S_IALLUGO))
                mode |= S_IRUGO;
        if (WARN_ON_ONCE(!S_ISREG(mode)))
                return NULL;

        reg = __proc_create(parent, name, mode, 1);
        if (!reg)
                return NULL;

        reg->proc_iops = &proc_file_inode_operations;
        reg->data = data;
        return reg;
}

static void pde_set_flags(struct proc_dir_entry *pde)
{
        if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
                pde->flags |= PROC_ENTRY_PERMANENT;
        if (pde->proc_ops->proc_read_iter)
                pde->flags |= PROC_ENTRY_proc_read_iter;
#ifdef CONFIG_COMPAT
        if (pde->proc_ops->proc_compat_ioctl)
                pde->flags |= PROC_ENTRY_proc_compat_ioctl;
#endif
}

/*
 * proc_create_data - create and register a regular file entry.
 * @name:     name of the entry
 * @mode:     file mode, normalised by proc_create_reg()
 * @parent:   parent directory entry
 * @proc_ops: operations for the entry; dereferenced by pde_set_flags()
 * @data:     opaque pointer stored in ->data
 *
 * Returns the result of proc_register(), or NULL if proc_create_reg() fails.
 */
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                const struct proc_ops *proc_ops, void *data)
{
        struct proc_dir_entry *entry = proc_create_reg(name, mode, &parent, data);

        if (entry == NULL)
                return NULL;

        entry->proc_ops = proc_ops;
        pde_set_flags(entry);

        return proc_register(parent, entry);
}
EXPORT_SYMBOL(proc_create_data);
 
/*
 * proc_create - create and register a regular file entry without private
 * data.  Equivalent to proc_create_data() with a NULL @data argument.
 */
struct proc_dir_entry *proc_create(const char *name, umode_t mode,
                                   struct proc_dir_entry *parent,
                                   const struct proc_ops *proc_ops)
{
        return proc_create_data(name, mode, parent, proc_ops, NULL);
}
EXPORT_SYMBOL(proc_create);

/*
 * ->proc_open handler for seq_file based entries.  When the entry declares
 * a non-zero state_size, the seq_file is opened with private state of that
 * size; otherwise a plain seq_open() is used.
 */
static int proc_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);

        return pde->state_size
                ? seq_open_private(file, pde->seq_ops, pde->state_size)
                : seq_open(file, pde->seq_ops);
}

/*
 * ->proc_release handler matching proc_seq_open(): entries with a non-zero
 * state_size are released with seq_release_private(), all others with
 * seq_release().
 */
static int proc_seq_release(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);

        return pde->state_size
                ? seq_release_private(inode, file)
                : seq_release(inode, file);
}

/*
 * Operations installed by proc_create_seq_private(): open/release go through
 * the proc_seq_*() helpers above, reading and seeking use the generic
 * seq_file implementations.
 */
static const struct proc_ops proc_seq_ops = {
        /* not permanent -- can call into arbitrary seq_operations */
        .proc_open        = proc_seq_open,
        .proc_read_iter        = seq_read_iter,
        .proc_lseek        = seq_lseek,
        .proc_release        = proc_seq_release,
};

/*
 * proc_create_seq_private - create and register a seq_file backed entry.
 * @name:       name of the entry
 * @mode:       file mode, normalised by proc_create_reg()
 * @parent:     parent directory entry
 * @ops:        seq_operations used when the file is opened
 * @state_size: when non-zero, proc_seq_open() opens the seq_file with
 *              private state of this size
 * @data:       opaque pointer stored in ->data
 *
 * Returns the result of proc_register(), or NULL if proc_create_reg() fails.
 */
struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data)
{
        struct proc_dir_entry *entry = proc_create_reg(name, mode, &parent, data);

        if (entry == NULL)
                return NULL;

        entry->proc_ops = &proc_seq_ops;
        entry->seq_ops = ops;
        entry->state_size = state_size;
        pde_set_flags(entry);

        return proc_register(parent, entry);
}
EXPORT_SYMBOL(proc_create_seq_private);

/*
 * ->proc_open handler for single-show entries: hands the entry's show
 * callback and private data to single_open().
 */
static int proc_single_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        int rc;

        rc = single_open(file, pde->single_show, pde->data);
        return rc;
}

/*
 * Operations installed by proc_create_single_data(): opening goes through
 * proc_single_open(), everything else uses the generic seq_file/single_*
 * helpers.
 */
static const struct proc_ops proc_single_ops = {
        /* not permanent -- can call into arbitrary ->single_show */
        .proc_open        = proc_single_open,
        .proc_read_iter = seq_read_iter,
        .proc_lseek        = seq_lseek,
        .proc_release        = single_release,
};

/*
 * proc_create_single_data - create and register a single-show entry.
 * @name:   name of the entry
 * @mode:   file mode, normalised by proc_create_reg()
 * @parent: parent directory entry
 * @show:   callback stored in ->single_show and passed to single_open()
 *          when the file is opened
 * @data:   opaque pointer stored in ->data and passed to single_open()
 *
 * Returns the result of proc_register(), or NULL if proc_create_reg() fails.
 */
struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data)
{
        struct proc_dir_entry *entry = proc_create_reg(name, mode, &parent, data);

        if (entry == NULL)
                return NULL;

        entry->proc_ops = &proc_single_ops;
        entry->single_show = show;
        pde_set_flags(entry);

        return proc_register(parent, entry);
}
EXPORT_SYMBOL(proc_create_single_data);

/*
 * proc_set_size - record @size in the entry's ->size field.
 * @de must be non-NULL; it is dereferenced unconditionally.
 */
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
        de->size = size;
}
EXPORT_SYMBOL(proc_set_size);

/*
 * proc_set_user - set the owning user and group of an entry.
 * @de must be non-NULL; it is dereferenced unconditionally.
 */
void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
{
        de->gid = gid;
        de->uid = uid;
}
EXPORT_SYMBOL(proc_set_user);

/*
 * pde_put - drop one reference on @pde.
 *
 * When the last reference goes away, the entry's inode number is released
 * and the entry itself is freed.
 */
void pde_put(struct proc_dir_entry *pde)
{
        if (!refcount_dec_and_test(&pde->refcnt))
                return;

        proc_free_inum(pde->low_ino);
        pde_free(pde);
}

/*
 * Unlink @pde from @parent's ->subdir rbtree and then clear its node with
 * RB_CLEAR_NODE().
 * NOTE(review): callers in this file invoke this with proc_subdir_lock held
 * for writing -- presumably a requirement; confirm.
 */
static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
{
        rb_erase(&pde->subdir_node, &parent->subdir);
        RB_CLEAR_NODE(&pde->subdir_node);
}

/*
 * remove_proc_entry - unlink a single /proc entry and drop its reference.
 * @name:   name of the entry, translated by __xlate_proc_name()
 * @parent: directory the translation starts from
 *
 * The entry is freed once its last reference is dropped.  Permanent entries
 * are never removed (a warning is emitted instead).  A warning is also
 * emitted when no entry is removed, and when the removed entry is a
 * directory that still has children, which are then leaked.
 */
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
        struct proc_dir_entry *victim;
        const char *leaf = name;
        unsigned int leaf_len;

        write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &leaf)) {
                write_unlock(&proc_subdir_lock);
                return;
        }
        leaf_len = strlen(leaf);

        victim = pde_subdir_find(parent, leaf, leaf_len);
        if (victim && unlikely(pde_is_permanent(victim))) {
                WARN(1, "removing permanent /proc entry '%s'", victim->name);
                /* Treat it like a failed lookup from here on. */
                victim = NULL;
        } else if (victim) {
                pde_erase(victim, parent);
                if (S_ISDIR(victim->mode))
                        parent->nlink--;
        }
        write_unlock(&proc_subdir_lock);

        if (victim == NULL) {
                WARN(1, "name '%s'\n", name);
                return;
        }

        /* The entry is unlinked; run it down outside the lock. */
        proc_entry_rundown(victim);

        WARN(pde_subdir_first(victim),
             "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
             __func__, victim->parent->name, victim->name,
             pde_subdir_first(victim)->name);
        pde_put(victim);
}
EXPORT_SYMBOL(remove_proc_entry);

/*
 * remove_proc_subtree - remove a /proc entry together with everything
 * below it.
 * @name:   name of the subtree root, translated by __xlate_proc_name()
 * @parent: directory the translation starts from
 *
 * Returns 0 on success, -ENOENT when the name cannot be translated or no
 * such entry exists, and -EINVAL when the root or one of its descendants is
 * a permanent entry.
 *
 * NOTE(review): when a permanent descendant is hit, the function returns
 * with the entries erased so far still detached; nothing here re-inserts
 * them.
 */
int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
{
        struct proc_dir_entry *root = NULL, *de, *next;
        const char *fn = name;
        unsigned int len;

        write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
                write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        len = strlen(fn);

        root = pde_subdir_find(parent, fn, len);
        if (!root) {
                write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        if (unlikely(pde_is_permanent(root))) {
                write_unlock(&proc_subdir_lock);
                WARN(1, "removing permanent /proc entry '%s/%s'",
                        root->parent->name, root->name);
                return -EINVAL;
        }
        /* Detach the subtree root from its parent first. */
        pde_erase(root, parent);

        /*
         * Iterative depth-first teardown.  The lock is held whenever the
         * tree is inspected or modified, and dropped around the rundown of
         * each individual entry.
         */
        de = root;
        while (1) {
                next = pde_subdir_first(de);
                if (next) {
                        /* Still has a child: detach it and descend. */
                        if (unlikely(pde_is_permanent(next))) {
                                write_unlock(&proc_subdir_lock);
                                WARN(1, "removing permanent /proc entry '%s/%s'",
                                        next->parent->name, next->name);
                                return -EINVAL;
                        }
                        pde_erase(next, de);
                        de = next;
                        continue;
                }
                /* No children left: account for it in the parent. */
                next = de->parent;
                if (S_ISDIR(de->mode))
                        next->nlink--;
                write_unlock(&proc_subdir_lock);

                proc_entry_rundown(de);
                /* The root's reference is dropped after the loop. */
                if (de == root)
                        break;
                pde_put(de);

                /* Climb back to the parent and look for further children. */
                write_lock(&proc_subdir_lock);
                de = next;
        }
        pde_put(root);
        return 0;
}
EXPORT_SYMBOL(remove_proc_subtree);

void *proc_get_parent_data(const struct inode *inode)
{
        struct proc_dir_entry *de = PDE(inode);
        return de->parent->data;
}
EXPORT_SYMBOL_GPL(proc_get_parent_data);

/*
 * proc_remove - remove @de and everything below it.
 *
 * A NULL @de is accepted and ignored.  The result of remove_proc_subtree()
 * is discarded.
 */
void proc_remove(struct proc_dir_entry *de)
{
        if (!de)
                return;

        remove_proc_subtree(de->name, de->parent);
}
EXPORT_SYMBOL(proc_remove);

/*
 * PDE_DATA - exported wrapper around __PDE_DATA(); returns whatever that
 * helper yields for @inode.
 */
void *PDE_DATA(const struct inode *inode)
{
        return __PDE_DATA(inode);
}
EXPORT_SYMBOL(PDE_DATA);

/*
 * proc_simple_write - copy a user buffer and feed it to the entry's
 * ->write() handler.
 * @f:    file being written to
 * @ubuf: user-space source buffer
 * @size: number of bytes to write; must be in the range 1..PAGE_SIZE-1
 * @_pos: file position (unused)
 *
 * The kernel-side copy is NUL-terminated and may be modified by ->write().
 *
 * Returns @size when the handler returns 0, the handler's return value
 * otherwise, -EACCES when the entry has no ->write() handler, -EINVAL for
 * an out-of-range @size, or the error from memdup_user_nul().
 */
ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
                          loff_t *_pos)
{
        struct proc_dir_entry *pde = PDE(file_inode(f));
        char *kbuf;
        int rc;

        if (!pde->write)
                return -EACCES;
        if (size == 0 || size >= PAGE_SIZE)
                return -EINVAL;

        kbuf = memdup_user_nul(ubuf, size);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        rc = pde->write(f, kbuf, size);
        kfree(kbuf);
        if (rc != 0)
                return rc;

        return size;
}
















































































    1 













































































































































































































































































































































































    4 

































































    1 


















    1 


























    2 










































    1 





































































































































    1 



    1 

    1 



    1 




    1 



























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct pagevec;

/*
 * Bits in mapping->flags.
 *
 * The values are bit numbers, not masks: the helpers below pass them to
 * set_bit(), clear_bit() and test_bit() together with &mapping->flags.
 */
enum mapping_flags {
        AS_EIO                = 0,        /* IO error on async write */
        AS_ENOSPC        = 1,        /* ENOSPC on async write */
        AS_MM_ALL_LOCKS        = 2,        /* under mm_take_all_locks() */
        AS_UNEVICTABLE        = 3,        /* e.g., ramdisk, SHM_LOCK */
        AS_EXITING        = 4,         /* final truncate in progress */
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_THP_SUPPORT = 6,        /* THPs supported */
};

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set the AS_UNEVICTABLE bit in @mapping->flags. */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        set_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Clear the AS_UNEVICTABLE bit in @mapping->flags. */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        clear_bit(AS_UNEVICTABLE, &mapping->flags);
}

static inline bool mapping_unevictable(struct address_space *mapping)
{
        return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set the AS_EXITING bit ("final truncate in progress") in @mapping->flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        set_bit(AS_EXITING, &mapping->flags);
}

/* Return non-zero when the AS_EXITING bit is set in @mapping->flags. */
static inline int mapping_exiting(struct address_space *mapping)
{
        return test_bit(AS_EXITING, &mapping->flags);
}

/* Set the AS_NO_WRITEBACK_TAGS bit in @mapping->flags. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/*
 * Return non-zero unless AS_NO_WRITEBACK_TAGS is set, i.e. writeback tags
 * are in use by default.
 */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
        return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/* Return the gfp mask stored in @mapping. */
static inline gfp_t mapping_gfp_mask(struct address_space *mapping)
{
        gfp_t mask = mapping->gfp_mask;

        return mask;
}

/*
 * Restrict @gfp_mask to the bits that @mapping's own gfp mask also has set
 * (bitwise AND of the two).
 */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                gfp_t gfp_mask)
{
        gfp_t allowed = mapping_gfp_mask(mapping);

        return allowed & gfp_mask;
}

/*
 * Store @mask as the gfp mask of mapping @m.
 *
 * This is non-atomic (a plain store).  Only to be used before the mapping
 * is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        m->gfp_mask = mask;
}

/* Report whether the AS_THP_SUPPORT bit is set in @mapping->flags. */
static inline bool mapping_thp_support(struct address_space *mapping)
{
        return test_bit(AS_THP_SUPPORT, &mapping->flags);
}

/*
 * Return the current value of @mapping->nr_thps.  Without
 * CONFIG_READ_ONLY_THP_FOR_FS the result is always 0.
 */
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        return atomic_read(&mapping->nr_thps);
#else
        return 0;
#endif
}

/*
 * Increment @mapping->nr_thps, but only for mappings that do NOT have
 * AS_THP_SUPPORT set.  Without CONFIG_READ_ONLY_THP_FOR_FS this function
 * is not expected to be called and warns once.
 */
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (!mapping_thp_support(mapping))
                atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(1);
#endif
}

/*
 * Decrement @mapping->nr_thps, but only for mappings that do NOT have
 * AS_THP_SUPPORT set.  Without CONFIG_READ_ONLY_THP_FOR_FS this function
 * is not expected to be called and warns once.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (!mapping_thp_support(mapping))
                atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(1);
#endif
}

void release_pages(struct page **pages, int nr);

/*
 * speculatively take a reference to a page.
 * If the page is free (_refcount == 0), then _refcount is untouched, and 0
 * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
 *
 * This function must be called inside the same rcu_read_lock() section as has
 * been used to lookup the page in the pagecache radix-tree (or page table):
 * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
 *
 * Unless an RCU grace period has passed, the count of all pages coming out
 * of the allocator must be considered unstable. page_count may return higher
 * than expected, and put_page must be able to do the right thing when the
 * page has been finished with, no matter what it is subsequently allocated
 * for (because put_page is what is used here to drop an invalid speculative
 * reference).
 *
 * This is the interesting part of the lockless pagecache (and lockless
 * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
 * has the following pattern:
 * 1. find page in radix tree
 * 2. conditionally increment refcount
 * 3. check the page is still in pagecache (if no, goto 1)
 *
 * Remove-side that cares about stability of _refcount (eg. reclaim) has the
 * following (with the i_pages lock held):
 * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
 * B. remove page from pagecache
 * C. free the page
 *
 * There are 2 critical interleavings that matter:
 * - 2 runs before A: in this case, A sees elevated refcount and bails out
 * - A runs before 2: in this case, 2 sees zero refcount and retries;
 *   subsequently, B will complete and 1 will find no page, causing the
 *   lookup to return NULL.
 *
 * It is possible that between 1 and 2, the page is removed then the exact same
 * page is inserted into the same position in pagecache. That's OK: the
 * old find_get_page using a lock could equally have run before or after
 * such a re-insertion, depending on order that locks are granted.
 *
 * Lookups racing against pagecache insertion isn't a big problem: either 1
 * will find the page or it will not. Likewise, the old find_get_page could run
 * either before the insertion or afterwards, depending on timing.
 */
/*
 * @page:  page to take references on
 * @count: number of references to add
 *
 * Returns 1 when the references were taken.  In the non-TINY_RCU build,
 * returns 0 when the page's refcount was found to be zero, in which case
 * the refcount is left untouched and the caller is expected to retry.
 */
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
        VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
        /*
         * Preempt must be disabled here - we rely on rcu_read_lock doing
         * this for us.
         *
         * Pagecache won't be truncated from interrupt context, so if we have
         * found a page in the radix tree here, we have pinned its refcount by
         * disabling preempt, and hence no need for the "speculative get" that
         * SMP requires.
         */
        VM_BUG_ON_PAGE(page_count(page) == 0, page);
        page_ref_add(page, count);

#else
        /* Add @count only if the refcount is currently non-zero. */
        if (unlikely(!page_ref_add_unless(page, count, 0))) {
                /*
                 * Either the page has been freed, or will be freed.
                 * In either case, retry here and the caller should
                 * do the right thing (see comments above).
                 */
                return 0;
        }
#endif
        /* A speculative reference must never land on a tail page. */
        VM_BUG_ON_PAGE(PageTail(page), page);

        return 1;
}

/*
 * Speculatively take a single reference on @page.  Returns the result of
 * __page_cache_add_speculative(): 1 on success, 0 if the caller must retry.
 */
static inline int page_cache_get_speculative(struct page *page)
{
        return __page_cache_add_speculative(page, 1);
}

/*
 * Speculatively take @count references on @page.  Returns the result of
 * __page_cache_add_speculative(): 1 on success, 0 if the caller must retry.
 */
static inline int page_cache_add_speculative(struct page *page, int count)
{
        return __page_cache_add_speculative(page, count);
}

/**
 * attach_page_private - Attach private data to a page.
 * @page: Page to attach data to.
 * @data: Data to attach to page.
 *
 * Attaching private data to a page increments the page's reference count.
 * The data must be detached before the page will be freed.
 *
 * The pointer is stored as the page's private value and the page's Private
 * flag is set; detach_page_private() undoes all three steps.
 */
static inline void attach_page_private(struct page *page, void *data)
{
        /* The attached data holds its own reference on the page. */
        get_page(page);
        set_page_private(page, (unsigned long)data);
        SetPagePrivate(page);
}

/**
 * detach_page_private - Detach private data from a page.
 * @page: Page to detach data from.
 *
 * Removes the data that was previously attached to the page and decrements
 * the refcount on the page.
 *
 * Return: Data that was attached to the page, or %NULL if the page's
 * Private flag is not set (in which case the page is left untouched).
 */
static inline void *detach_page_private(struct page *page)
{
        void *data = (void *)page_private(page);

        /* Nothing attached: do not touch the flag or the refcount. */
        if (!PagePrivate(page))
                return NULL;
        ClearPagePrivate(page);
        set_page_private(page, 0);
        /* Drop the reference taken by attach_page_private(). */
        put_page(page);

        return data;
}

/*
 * Allocate one page for the page cache using @gfp.  With CONFIG_NUMA the
 * implementation lives out of line; otherwise it is a plain order-0
 * alloc_pages() call.
 */
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        return alloc_pages(gfp, 0);
}
#endif

static inline struct page *page_cache_alloc(struct address_space *x)
{
        return __page_cache_alloc(mapping_gfp_mask(x));
}

/*
 * Return the gfp mask of mapping @x with __GFP_NORETRY and __GFP_NOWARN
 * added.
 */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t gfp = mapping_gfp_mask(x);

        gfp |= __GFP_NORETRY | __GFP_NOWARN;
        return gfp;
}

typedef int filler_t(void *, struct page *);

pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/*
 * Flag bits for the fgp_flags argument of pagecache_get_page().  Each is a
 * distinct single bit, so they can be OR-ed together, as the inline helpers
 * in this file do (e.g. FGP_LOCK | FGP_HEAD).
 * NOTE(review): the precise effect of each bit is defined by
 * pagecache_get_page(), whose implementation is not visible here.
 */
#define FGP_ACCESSED                0x00000001
#define FGP_LOCK                0x00000002
#define FGP_CREAT                0x00000004
#define FGP_WRITE                0x00000008
#define FGP_NOFS                0x00000010
#define FGP_NOWAIT                0x00000020
#define FGP_FOR_MMAP                0x00000040
#define FGP_HEAD                0x00000080

struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                int fgp_flags, gfp_t cache_gfp_mask);

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 *
 * Implemented as pagecache_get_page() with no FGP flags and a zero gfp mask.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        return pagecache_get_page(mapping, offset, 0, 0);
}

/*
 * Like find_get_page(), but passes the caller-supplied @fgp_flags on to
 * pagecache_get_page().  The gfp mask passed down is zero.
 */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, int fgp_flags)
{
        return pagecache_get_page(mapping, offset, fgp_flags, 0);
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * Implemented as pagecache_get_page() with %FGP_LOCK and a zero gfp mask.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        return pagecache_get_page(mapping, index, FGP_LOCK, 0);
}

/**
 * find_lock_head - Locate, pin and lock a pagecache page.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, its head page is returned locked and with an increased
 * refcount.
 *
 * Implemented as pagecache_get_page() with %FGP_LOCK | %FGP_HEAD and a zero
 * gfp mask.
 *
 * Context: May sleep.
 * Return: A struct page which is !PageTail, or %NULL if there is no page
 * in the cache for this index.
 */
static inline struct page *find_lock_head(struct address_space *mapping,
                                        pgoff_t index)
{
        return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 *
 * Implemented as pagecache_get_page() with
 * %FGP_LOCK | %FGP_ACCESSED | %FGP_CREAT.
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        return pagecache_get_page(mapping, index,
                                        FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
                                        gfp_mask);
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 *
 * Implemented as pagecache_get_page() with
 * %FGP_LOCK | %FGP_CREAT | %FGP_NOFS | %FGP_NOWAIT and the mapping's own gfp
 * mask.
 * NOTE(review): presumably %FGP_NOFS is what makes pagecache_get_page() drop
 * __GFP_FS and %FGP_NOWAIT what makes it not wait -- confirm there.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        return pagecache_get_page(mapping, index,
                        FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
                        mapping_gfp_mask(mapping));
}

/*
 * Does the (possibly compound) page starting at @head cover page cache
 * index @index?
 */
static inline bool thp_contains(struct page *head, pgoff_t index)
{
        pgoff_t first;

        /* HugeTLBfs indexes the page cache in units of hpage_size */
        if (PageHuge(head))
                return head->index == index;

        /* Round @index down to the first index covered by the page. */
        first = index & ~(thp_nr_pages(head) - 1UL);
        return page_index(head) == first;
}

/*
 * Translate a page found in the page cache into the page that corresponds
 * to file index @index: for a compound page this is the subpage at the
 * matching offset from @head.
 */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
        pgoff_t offset;

        /* HugeTLBfs wants the head page regardless */
        if (PageHuge(head))
                return head;

        offset = index & (thp_nr_pages(head) - 1);
        return head + offset;
}

unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
                          unsigned int nr_entries, struct page **entries,
                          pgoff_t *indices);
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, unsigned int nr_pages,
                        struct page **pages);
/*
 * find_get_pages_range() without an upper bound: the end index passed down
 * is (pgoff_t)-1, the largest possible index.
 */
static inline unsigned find_get_pages(struct address_space *mapping,
                        pgoff_t *start, unsigned int nr_pages,
                        struct page **pages)
{
        return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
                                    pages);
}
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
                               unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                        pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages);
/*
 * Open-ended form of find_get_pages_range_tag(): the end index passed down
 * is (pgoff_t)-1, the largest value a pgoff_t can hold.  Every other
 * argument and the return value are forwarded untouched.
 */
static inline unsigned find_get_pages_tag(struct address_space *mapping,
                        pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
{
        const pgoff_t last = (pgoff_t)-1;

        return find_get_pages_range_tag(mapping, index, last, tag, nr_pages,
                                        pages);
}

struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index, unsigned flags);

/*
 * Return the locked page at @index in @mapping, creating it if it is not
 * there yet.  Thin wrapper around find_or_create_page() that supplies the
 * mapping's own GFP mask.
 */
static inline struct page *grab_cache_page(struct address_space *mapping,
                                                                pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return find_or_create_page(mapping, index, gfp);
}

extern struct page * read_cache_page(struct address_space *mapping,
                                pgoff_t index, filler_t *filler, void *data);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
extern int read_cache_pages(struct address_space *mapping,
                struct list_head *pages, filler_t *filler, void *data);

/*
 * Call read_cache_page() for @index in @mapping with a NULL filler,
 * passing @data through.  The result is returned unchanged.
 */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, void *data)
{
        filler_t *no_filler = NULL;

        return read_cache_page(mapping, index, no_filler, data);
}

/*
 * Get index of the page within radix-tree (but not for hugetlb pages).
 * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
 *
 * Pages that are not transparent-huge tail pages report their own ->index.
 * Tail pages have no ->index of their own, so the value is the head page's
 * ->index plus the tail's distance from the head.
 */
static inline pgoff_t page_to_index(struct page *page)
{
        pgoff_t base;
        ptrdiff_t delta;

        if (likely(!PageTransTail(page)))
                return page->index;

        base = compound_head(page)->index;
        delta = page - compound_head(page);
        return base + delta;
}

extern pgoff_t hugetlb_basepage_index(struct page *page);

/*
 * Get the offset in PAGE_SIZE (even for hugetlb pages).
 * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
 *
 * PageHuge() pages are delegated to hugetlb_basepage_index(); everything
 * else goes through page_to_index().
 */
static inline pgoff_t page_to_pgoff(struct page *page)
{
        if (likely(!PageHuge(page)))
                return page_to_index(page);

        return hugetlb_basepage_index(page);
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        return ((loff_t)page->index) << PAGE_SHIFT;
}

/*
 * Same computation as page_offset(), but starting from the index reported
 * by page_index() rather than reading page->index directly.
 */
static inline loff_t page_file_offset(struct page *page)
{
        loff_t idx = page_index(page);

        return idx << PAGE_SHIFT;
}

extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address);

/*
 * Convert a virtual @address inside @vma into a page offset: the number of
 * whole pages between vma->vm_start and @address, plus vma->vm_pgoff.
 * hugetlb VMAs are handed to linear_hugepage_index() instead.
 */
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                        unsigned long address)
{
        if (unlikely(is_vm_hugetlb_page(vma)))
                return linear_hugepage_index(vma, address);

        return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/*
 * Key that wake_page_match() compares against each waiter.
 */
struct wait_page_key {
        /* Page the key refers to; compared with the waiter's page. */
        struct page *page;
        /* Bit number; compared with the waiter's bit_nr. */
        int bit_nr;
        /* Set to 1 by wake_page_match() once a waiter on @page is seen. */
        int page_match;
};

/*
 * One waiter: the page and bit number it is interested in (both checked by
 * wake_page_match()) together with its wait-queue entry.
 */
struct wait_page_queue {
        struct page *page;
        int bit_nr;
        wait_queue_entry_t wait;
};

/*
 * Decide whether waiter @wait_page should react to @key.
 *
 * If the pages differ the answer is false and @key is left alone.  If they
 * agree, key->page_match is set to 1 regardless of the bit number, and the
 * result is true only when the bit numbers agree as well.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->page == key->page) {
                key->page_match = 1;
                return wait_page->bit_nr == key->bit_nr;
        }

        return false;
}

extern void __lock_page(struct page *page);
extern int __lock_page_killable(struct page *page);
extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                unsigned int flags);
extern void unlock_page(struct page *page);

/*
 * Try to take the page lock without sleeping.  The PG_locked bit lives in
 * the flags of the compound head, so @page is resolved to its head first.
 *
 * Returns 1 if the lock was acquired, 0 if the bit was already set.
 */
static inline int trylock_page(struct page *page)
{
        struct page *head = compound_head(page);

        if (unlikely(test_and_set_bit_lock(PG_locked, &head->flags)))
                return 0;
        return 1;
}

/*
 * lock_page may only be called if we have the page's inode pinned.
 *
 * Tries trylock_page() first and falls back to __lock_page() only when
 * that fails.
 */
static inline void lock_page(struct page *page)
{
        might_sleep();

        if (trylock_page(page))
                return;

        __lock_page(page);
}

/*
 * Variant of lock_page() whose wait can be cut short by a fatal signal.
 * Returns 0 once the page is locked, or -EINTR if the task was killed
 * while waiting.
 */
static inline int lock_page_killable(struct page *page)
{
        might_sleep();

        if (trylock_page(page))
                return 0;

        return __lock_page_killable(page);
}

/*
 * lock_page_async - Lock the page, unless this would block.
 *
 * When the page is already locked, a callback described by @wait is queued
 * to run once the page is unlocked, so the caller can retry the operation
 * from there.
 *
 * Returns 0 if the page was locked, or -EIOCBQUEUED if the page was already
 * locked and the callback was queued.
 */
static inline int lock_page_async(struct page *page,
                                  struct wait_page_queue *wait)
{
        return trylock_page(page) ? 0 : __lock_page_async(page, wait);
}

/*
 * lock_page_or_retry - Lock the page, unless this would block and the
 * caller indicated that it can handle a retry.
 *
 * Yields 1 when trylock_page() succeeds or __lock_page_or_retry() returns
 * non-zero, and 0 otherwise.  What a 0 means for mmap_lock depends on
 * @flags; see __lock_page_or_retry().
 */
static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                     unsigned int flags)
{
        might_sleep();

        if (trylock_page(page))
                return 1;

        return __lock_page_or_retry(page, mm, flags) != 0;
}

/*
 * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
 * and should not be used directly.
 */
extern void wait_on_page_bit(struct page *page, int bit_nr);
extern int wait_on_page_bit_killable(struct page *page, int bit_nr);

/*
 * Wait for a page to be unlocked.
 *
 * The caller must be "holding" the page, i.e. have raised "page->count",
 * so that the page cannot go away during the wait.
 *
 * Nothing happens if the page is not locked on entry; otherwise the wait is
 * on the PG_locked bit of the compound head.
 */
static inline void wait_on_page_locked(struct page *page)
{
        if (!PageLocked(page))
                return;

        wait_on_page_bit(compound_head(page), PG_locked);
}

/*
 * Like wait_on_page_locked(), but waits through
 * wait_on_page_bit_killable() and passes its return value back.  Returns 0
 * immediately when the page is not locked on entry.
 */
static inline int wait_on_page_locked_killable(struct page *page)
{
        if (PageLocked(page))
                return wait_on_page_bit_killable(compound_head(page),
                                                 PG_locked);

        return 0;
}

extern void put_and_wait_on_page_locked(struct page *page);

void wait_on_page_writeback(struct page *page);
extern void end_page_writeback(struct page *page);
void wait_for_stable_page(struct page *page);

void page_endio(struct page *page, bool is_write, int err);

/*
 * Add an arbitrary waiter to a page's wait queue
 */
extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);

/*
 * Fault everything in given userspace address range in.
 *
 * Writes a zero byte at @uaddr and then at every PAGE_SIZE step up to the
 * last byte of the range (@uaddr + @size - 1).
 *
 * Returns 0 when @size is 0 or every write succeeded, and -EFAULT when the
 * range wraps (start above end) or a write inside the loop fails.  The
 * result of the final write to the last byte is returned as-is.
 */
static inline int fault_in_pages_writeable(char __user *uaddr, int size)
{
        /* Address of the last byte in the range. */
        char __user *end = uaddr + size - 1;

        if (unlikely(size == 0))
                return 0;

        /* Start above end: reject the range instead of walking it. */
        if (unlikely(uaddr > end))
                return -EFAULT;
        /*
         * Writing zeroes into userspace here is OK, because we know that if
         * the zero gets there, we'll be overwriting it.
         */
        do {
                if (unlikely(__put_user(0, uaddr) != 0))
                        return -EFAULT;
                uaddr += PAGE_SIZE;
        } while (uaddr <= end);

        /*
         * Check whether the range spilled into the next page: the loop
         * stepped past @end, yet @uaddr and @end still share a page, so that
         * page has not been touched.  Touch it through @end.
         */
        if (((unsigned long)uaddr & PAGE_MASK) ==
                        ((unsigned long)end & PAGE_MASK))
                return __put_user(0, end);

        return 0;
}

/*
 * Read-side counterpart of fault_in_pages_writeable(): reads one byte at
 * @uaddr and then at every PAGE_SIZE step up to the last byte of the range
 * (@uaddr + @size - 1).
 *
 * Returns 0 when @size is 0 or every read succeeded, and -EFAULT when the
 * range wraps (start above end) or a read inside the loop fails.  The
 * result of the final read of the last byte is returned as-is.
 */
static inline int fault_in_pages_readable(const char __user *uaddr, int size)
{
        /* Sink for the bytes read; declared volatile and never inspected. */
        volatile char c;
        /* Address of the last byte in the range. */
        const char __user *end = uaddr + size - 1;

        if (unlikely(size == 0))
                return 0;

        /* Start above end: reject the range instead of walking it. */
        if (unlikely(uaddr > end))
                return -EFAULT;

        do {
                if (unlikely(__get_user(c, uaddr) != 0))
                        return -EFAULT;
                uaddr += PAGE_SIZE;
        } while (uaddr <= end);

        /*
         * Check whether the range spilled into the next page: the loop
         * stepped past @end, yet @uaddr and @end still share a page, so that
         * page has not been touched.  Touch it through @end.
         */
        if (((unsigned long)uaddr & PAGE_MASK) ==
                        ((unsigned long)end & PAGE_MASK)) {
                return __get_user(c, end);
        }

        /* Reference c so that it counts as used. */
        (void)c;
        return 0;
}

int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
extern void __delete_from_page_cache(struct page *page, void *shadow);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct pagevec *pvec);

/*
 * Like add_to_page_cache_locked(), but for newly allocated pages.  Because
 * the page is new, it can be locked with the non-atomic __SetPageLocked()
 * before insertion.
 *
 * Returns the result of add_to_page_cache_locked().  On failure the lock
 * bit is cleared again before returning.
 */
static inline int add_to_page_cache(struct page *page,
                struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
        int ret;

        __SetPageLocked(page);
        ret = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
        if (likely(!ret))
                return ret;

        __ClearPageLocked(page);
        return ret;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_page() or
 * readahead_page_batch() in a loop and attempt to start I/O against
 * each page in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
/* private: use the readahead_* accessors instead */
        /* Index of the first remaining page; see readahead_index(). */
        pgoff_t _index;
        /* Pages remaining in the request; see readahead_count(). */
        unsigned int _nr_pages;
        /*
         * Pages handed out by the most recent readahead_page() or
         * __readahead_batch() call; consumed at the start of the next call.
         */
        unsigned int _batch_count;
};

/*
 * Declare a struct readahead_control named @rac with .file = @f,
 * .mapping = @m and ._index = @i.  Members not named in the initializer
 * (_nr_pages, _batch_count) start out as zero.
 */
#define DEFINE_READAHEAD(rac, f, m, i)                                        \
        struct readahead_control rac = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                ._index = i,                                                \
        }

/* Number of PAGE_SIZE pages that make up 128 KiB. */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *,
                unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
                struct page *, unsigned long req_count);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call this on a cache miss; it submits the read.  The readahead logic may
 * attach further pages to the request when the access pattern suggests
 * that doing so will help performance.
 *
 * Builds an on-stack readahead_control from @file, @mapping and @index and
 * passes it to page_cache_sync_ra().
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file, pgoff_t index,
                unsigned long req_count)
{
        DEFINE_READAHEAD(request, file, mapping, index);

        page_cache_sync_ra(&request, ra, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @page: The page at @index which triggered the readahead call.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call this when a page marked PageReadahead is used.  The mark signals
 * that the application has consumed enough of the readahead window that
 * more pages should start being pulled in.
 *
 * Builds an on-stack readahead_control from @file, @mapping and @index and
 * passes it, together with @page, to page_cache_async_ra().
 */
static inline
void page_cache_async_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file,
                struct page *page, pgoff_t index, unsigned long req_count)
{
        DEFINE_READAHEAD(request, file, mapping, index);

        page_cache_async_ra(&request, ra, page, req_count);
}

/**
 * readahead_page - Get the next page to read.
 * @rac: The current readahead request.
 *
 * Each call first retires the pages handed out by the previous call
 * (advancing the request's index and shrinking its remaining count), then
 * loads the page at the new index.
 *
 * Context: The page is locked and has an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: A pointer to the next page, or %NULL if we are done.
 */
static inline struct page *readahead_page(struct readahead_control *rac)
{
        struct page *next;

        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_index += rac->_batch_count;
        rac->_nr_pages -= rac->_batch_count;

        if (rac->_nr_pages == 0) {
                rac->_batch_count = 0;
                return NULL;
        }

        next = xa_load(&rac->mapping->i_pages, rac->_index);
        VM_BUG_ON_PAGE(!PageLocked(next), next);
        /* Remember how far to advance on the following call. */
        rac->_batch_count = thp_nr_pages(next);

        return next;
}

/*
 * Worker behind readahead_page_batch().
 *
 * Retires the pages handed out by the previous call, then walks the
 * mapping's i_pages from the request's current index to the end of the
 * request, storing each page found in @array until @array_sz entries have
 * been stored.  _batch_count is rebuilt as the sum of thp_nr_pages() over
 * the pages stored.
 *
 * Returns the number of entries written to @array.
 */
static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct page *page;

        /* Consume the previous batch before starting a new one. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        xas_set(&xas, rac->_index);
        rcu_read_lock();
        xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
                if (xas_retry(&xas, page))
                        continue;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageTail(page), page);
                array[i++] = page;
                rac->_batch_count += thp_nr_pages(page);

                /*
                 * The page cache isn't using multi-index entries yet,
                 * so the xas cursor needs to be manually moved to the
                 * next index.  This can be removed once the page cache
                 * is converted.
                 */
                if (PageHead(page))
                        xas_set(&xas, rac->_index + rac->_batch_count);

                /* Stop as soon as the caller's array is full. */
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_page_batch - Get a batch of pages to read.
 * @rac: The current readahead request.
 * @array: An array of pointers to struct page.  This must be a real array,
 *         not a pointer, because its capacity is taken with ARRAY_SIZE().
 *
 * Context: The pages are locked and have an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: The number of pages placed in the array.  0 indicates the request
 * is complete.
 */
#define readahead_page_batch(rac, array)                                \
        __readahead_batch(rac, array, ARRAY_SIZE(array))

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_length(struct readahead_control *rac)
{
        return (loff_t)rac->_nr_pages * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The private _index field of @rac.
 */
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
        pgoff_t first = rac->_index;

        return first;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The private _nr_pages field of @rac.
 */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
        unsigned int remaining = rac->_nr_pages;

        return remaining;
}

/*
 * Number of PAGE_SIZE pages needed to hold inode->i_size bytes, with a
 * partial trailing page counted as a whole one.
 *
 * The shift is applied to the full-width sum and only the resulting page
 * count is narrowed to unsigned long.  Casting before shifting would drop
 * the upper bits of i_size wherever unsigned long is narrower than the
 * type of i_size, producing a wrong count for sizes that do not fit in
 * unsigned long.  Where the two types have the same width the result is
 * unchanged.
 */
static inline unsigned long dir_pages(struct inode *inode)
{
        return (unsigned long)((inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT);
}

/**
 * page_mkwrite_check_truncate - check if page was truncated
 * @page: the page to check
 * @inode: the inode to check the page against
 *
 * The page counts as truncated when it no longer belongs to the inode's
 * mapping, or when it lies entirely beyond the current end of file.
 *
 * Returns the number of bytes in the page up to EOF,
 * or -EFAULT if the page was truncated.
 */
static inline int page_mkwrite_check_truncate(struct page *page,
                                              struct inode *inode)
{
        loff_t isize = i_size_read(inode);
        pgoff_t eof_index = isize >> PAGE_SHIFT;
        int tail = offset_in_page(isize);

        if (page->mapping != inode->i_mapping)
                return -EFAULT;

        /* Every byte of the page is below EOF. */
        if (page->index < eof_index)
                return PAGE_SIZE;

        /* EOF falls inside this page: only the leading part is valid. */
        if (page->index == eof_index && tail)
                return tail;

        /* The page starts at or after EOF. */
        return -EFAULT;
}

/**
 * i_blocks_per_page - How many blocks fit in this page.
 * @inode: The inode which contains the blocks.
 * @page: The page (head page if the page is a THP).
 *
 * Computed as thp_size(@page) shifted right by the inode's i_blkbits, so
 * the result is zero when the block size exceeds the size of the page.
 *
 * Context: The caller should hold a refcount on the page to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this page.
 */
static inline
unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
{
        unsigned int blkbits = inode->i_blkbits;

        return thp_size(page) >> blkbits;
}
#endif /* _LINUX_PAGEMAP_H */















































































    2 
    1 


    2 


    2 







    2 











    2 



    2 



    1 


    1 








    1 









    1 






    1 







    1 









    1 





    1 






    2 





















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * This has to happen before trying to allocate a driver tag, so that even
 * if the first allocation attempt fails, the other shared-tag users can
 * reserve budget for this queue.
 *
 * With a shared sbitmap the "active" state is the QUEUE_FLAG_HCTX_ACTIVE
 * bit of the request queue and the counter lives in the tag set; otherwise
 * it is the BLK_MQ_S_TAG_ACTIVE bit of the hardware queue and the counter
 * lives in its tags.  The plain test_bit() in front of test_and_set_bit()
 * skips the atomic read-modify-write when the bit is already set.
 *
 * Always returns true.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (!blk_mq_is_sbitmap_shared(hctx->flags)) {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
                    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        atomic_inc(&hctx->tags->active_queues);
        } else {
                struct request_queue *q = hctx->queue;
                struct blk_mq_tag_set *set = q->tag_set;

                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
                    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        atomic_inc(&set->active_queues_shared_sbitmap);
        }

        return true;
}

/*
 * Wake everything that may be sleeping on @tags: always the waiters on the
 * normal tag bitmap, and those on the reserved bitmap too when
 * @include_reserve is set.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(tags->bitmap_tags);

        if (!include_reserve)
                return;

        sbitmap_queue_wake_all(tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 *
 * The active bit is cleared atomically.  If it was not set there is
 * nothing to undo and the function returns without waking anyone.
 * Otherwise the matching active-queue counter is decremented and the
 * waiters on the normal (non-reserved) tag bitmap are woken.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_tags *tags = hctx->tags;

        if (blk_mq_is_sbitmap_shared(hctx->flags)) {
                if (test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                       &q->queue_flags))
                        atomic_dec(&set->active_queues_shared_sbitmap);
                else
                        return;
        } else {
                if (test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        atomic_dec(&tags->active_queues);
                else
                        return;
        }

        blk_mq_tag_wakeup_all(tags, false);
}

/*
 * Make one non-blocking attempt to take a bit from @bt.
 *
 * When the queue has no elevator and the request is not a reserved one,
 * hctx_may_queue() is consulted first and a refusal yields BLK_MQ_NO_TAG.
 * Otherwise the bit comes from the shallow allocator if
 * data->shallow_depth is non-zero, and from the regular one if not.
 *
 * Returns whatever the sbitmap allocator returned, or BLK_MQ_NO_TAG.
 */
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        bool limit_applies = !data->q->elevator &&
                             !(data->flags & BLK_MQ_REQ_RESERVED);

        if (limit_applies && !hctx_may_queue(data->hctx, bt))
                return BLK_MQ_NO_TAG;

        if (!data->shallow_depth)
                return __sbitmap_queue_get(bt);

        return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
}

/*
 * Allocate a tag for the request described by @data.
 *
 * Reserved requests (BLK_MQ_REQ_RESERVED) draw from tags->breserved_tags
 * with no offset applied; all others draw from tags->bitmap_tags and the
 * value returned is the bit number plus nr_reserved_tags.  A reserved
 * request against a tag map with no reserved tags triggers a one-time
 * warning and fails.
 *
 * If the first attempt fails and BLK_MQ_REQ_NOWAIT is set, the function
 * gives up at once.  Otherwise it loops: run the hardware queue, retry,
 * register as a waiter, retry again, and only then sleep.  After every
 * sleep data->ctx and data->hctx are looked up afresh.
 *
 * Returns the tag on success, or BLK_MQ_NO_TAG on failure.  Failure
 * includes the case where a tag was obtained but the hardware queue turned
 * out to be inactive; the tag is then released before returning.
 */
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_SBQ_WAIT(wait);
        unsigned int tag_offset;
        int tag;

        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_NO_TAG;
                }
                bt = tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }

        /* Fast path: a tag is available right away. */
        tag = __blk_mq_get_tag(data, bt);
        if (tag != BLK_MQ_NO_TAG)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_NO_TAG;

        ws = bt_wait_ptr(bt, data->hctx);
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

                /*
                 * Try once more now that we are registered as a waiter,
                 * before actually going to sleep.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                /* Remember which bitmap we waited on across the sleep. */
                bt_prev = bt;
                io_schedule();

                sbitmap_finish_wait(bt, ws, &wait);

                /*
                 * Look up the software context, hardware queue, tag map and
                 * bitmap again; any of them may differ after sleeping.
                 */
                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
                                                data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = tags->breserved_tags;
                else
                        bt = tags->bitmap_tags;

                /*
                 * If destination hw queue is changed, fake wake up on
                 * previous queue for compensating the wake up miss, so
                 * other allocations on previous queue won't be starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        /* Reached via break with a valid tag: stop waiting. */
        sbitmap_finish_wait(bt, ws, &wait);

found_tag:
        /*
         * Give up this allocation if the hctx is inactive.  The caller will
         * retry on an active hctx.
         */
        if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
                blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
                return BLK_MQ_NO_TAG;
        }
        return tag + tag_offset;
}

/*
 * Release @tag back to @tags.
 *
 * A reserved tag is cleared in the reserved bitmap under its own number.
 * Any other tag is cleared in the normal bitmap after subtracting
 * nr_reserved_tags.  In both cases the bit number is range-checked with
 * BUG_ON() and ctx->cpu is passed to sbitmap_queue_clear().
 */
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                    unsigned int tag)
{
        if (blk_mq_tag_is_reserved(tags, tag)) {
                BUG_ON(tag >= tags->nr_reserved_tags);
                sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
        } else {
                const int bit = tag - tags->nr_reserved_tags;

                BUG_ON(bit >= tags->nr_tags);
                sbitmap_queue_clear(tags->bitmap_tags, bit, ctx->cpu);
        }
}

/*
 * Context handed to bt_iter() through sbitmap_for_each_set(); filled in by
 * bt_for_each().
 */
struct bt_iter_data {
        /* Hardware queue whose requests are being visited. */
        struct blk_mq_hw_ctx *hctx;
        /* Callback invoked for each matching request. */
        busy_iter_fn *fn;
        /* Opaque pointer passed to @fn as its third argument. */
        void *data;
        /* True when iterating the reserved bitmap; also passed to @fn. */
        bool reserved;
};

/*
 * Look up the request stored at tags->rqs[@bitnr] and take a reference on
 * it, all under tags->lock with interrupts disabled.
 *
 * Returns the request only if the slot is populated, the request's tag
 * equals @bitnr and its reference count could be raised from a non-zero
 * value.  Returns NULL in every other case.
 */
static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
                unsigned int bitnr)
{
        struct request *found = NULL;
        struct request *rq;
        unsigned long flags;

        spin_lock_irqsave(&tags->lock, flags);
        rq = tags->rqs[bitnr];
        if (rq && rq->tag == bitnr && refcount_inc_not_zero(&rq->ref))
                found = rq;
        spin_unlock_irqrestore(&tags->lock, flags);

        return found;
}

/*
 * sbitmap_for_each_set() callback used by bt_for_each().
 *
 * Converts @bitnr into a tag (adding nr_reserved_tags unless the reserved
 * bitmap is being walked), takes a reference on the request behind it and,
 * if that request belongs to the hardware queue under inspection, calls
 * the user callback.  The reference is dropped before returning.
 *
 * Returns the callback's result, or true when the callback was not
 * invoked.
 */
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct blk_mq_tags *tags = hctx->tags;
        bool reserved = iter_data->reserved;
        unsigned int tag = reserved ? bitnr : bitnr + tags->nr_reserved_tags;
        bool keep_going = true;
        struct request *rq;

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        rq = blk_mq_find_and_get_req(tags, tag);
        if (!rq)
                return true;

        if (rq->q == hctx->queue && rq->mq_hctx == hctx)
                keep_going = iter_data->fn(hctx, rq, iter_data->data, reserved);

        blk_mq_put_rq_ref(rq);
        return keep_going;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:        Hardware queue to examine.
 * @bt:                sbitmap to examine. This is either the breserved_tags member
 *                or the bitmap_tags member of struct blk_mq_tags.
 * @fn:                Callback invoked for every request associated with @hctx
 *                that has been assigned a driver tag, as
 *                @fn(@hctx, rq, @data, @reserved) with rq pointing at the
 *                request. Returning true continues the iteration, false
 *                stops it.
 * @data:        Will be passed as third argument to @fn.
 * @reserved:        Indicates whether @bt is the breserved_tags member or the
 *                bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
{
        struct bt_iter_data ctx = {
                .reserved = reserved,
                .data = data,
                .fn = fn,
                .hctx = hctx,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &ctx);
}

/*
 * Context handed to bt_tags_iter() as its opaque data argument.
 */
struct bt_tags_iter_data {
        /* Tag map being walked. */
        struct blk_mq_tags *tags;
        /* Callback invoked for each selected request. */
        busy_tag_iter_fn *fn;
        /* Opaque pointer passed to @fn as its second argument. */
        void *data;
        /* Combination of BT_TAG_ITER_* bits. */
        unsigned int flags;
};

/*
 * Flag bits for bt_tags_iter_data.flags, as interpreted by bt_tags_iter():
 *
 * RESERVED   - the reserved bitmap is being walked: bit numbers are used as
 *              tags without adding nr_reserved_tags.
 * STARTED    - only invoke the callback for requests for which
 *              blk_mq_request_started() is true.
 * STATIC_RQS - take requests from tags->static_rqs[] directly, without
 *              acquiring or dropping a reference.
 */
#define BT_TAG_ITER_RESERVED                (1 << 0)
#define BT_TAG_ITER_STARTED                (1 << 1)
#define BT_TAG_ITER_STATIC_RQS                (1 << 2)

/*
 * Per-bit callback used by bt_tags_for_each().
 *
 * @bitmap: sbitmap being walked (not used by this function).
 * @bitnr:  index of the set bit inside that bitmap.
 * @data:   pointer to the struct bt_tags_iter_data describing the walk.
 *
 * Returns true to keep iterating and false to stop; false is only ever
 * produced by the user callback.
 */
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *ctx = data;
        struct blk_mq_tags *tags = ctx->tags;
        const bool use_static = (ctx->flags & BT_TAG_ITER_STATIC_RQS) != 0;
        const bool reserved = (ctx->flags & BT_TAG_ITER_RESERVED) != 0;
        struct request *rq;
        bool keep_going = true;
        bool skip;

        /* Bits of the regular bitmap are numbered after the reserved tags. */
        if (!reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * The lookup may yield NULL: the tagging functions test and set
         * the bit before ->rqs[] is assigned.
         */
        rq = use_static ? tags->static_rqs[bitnr]
                        : blk_mq_find_and_get_req(tags, bitnr);
        if (!rq)
                return true;

        /* With BT_TAG_ITER_STARTED set, requests not yet started are skipped. */
        skip = (ctx->flags & BT_TAG_ITER_STARTED) &&
               !blk_mq_request_started(rq);
        if (!skip)
                keep_going = ctx->fn(rq, ctx->data, reserved);

        /* Only the non-static lookup took a reference that must be dropped. */
        if (!use_static)
                blk_mq_put_rq_ref(rq);

        return keep_going;
}

/**
 * bt_tags_for_each - walk the requests tracked by one bitmap of a tag map
 * @tags:        Tag map to iterate over.
 * @bt:          sbitmap queue to walk: either the breserved_tags member or
 *               the bitmap_tags member of struct blk_mq_tags.
 * @fn:          Callback invoked by bt_tags_iter() as @fn(rq, @data, reserved),
 *               where rq is a pointer to a request. Returning true continues
 *               the walk, returning false stops it.
 * @data:        Opaque pointer passed to @fn as its second argument.
 * @flags:       BT_TAG_ITER_* bits selecting how requests are looked up and
 *               filtered.
 *
 * Nothing is visited when @tags->rqs is NULL.
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
        struct bt_tags_iter_data ctx = {
                .flags = flags,
                .data = data,
                .fn = fn,
                .tags = tags,
        };

        if (!tags->rqs)
                return;

        sbitmap_for_each_set(&bt->sb, bt_tags_iter, &ctx);
}

/*
 * Walk both bitmaps of @tags: the reserved bitmap first (only when the map
 * has reserved tags), then the regular bitmap.
 *
 * Callers must not pass BT_TAG_ITER_RESERVED in @flags; it is added here for
 * the reserved walk only, and a one-time warning is raised if it is already
 * set.
 */
static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
                busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
        WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

        if (tags->nr_reserved_tags) {
                unsigned int rsvd_flags = flags | BT_TAG_ITER_RESERVED;

                bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
                                 rsvd_flags);
        }

        bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:        Tag map to iterate over.
 * @fn:          Callback invoked for each request as @fn(rq, @priv, reserved),
 *               where rq is a pointer to a request and 'reserved' tells
 *               whether @rq is a reserved request. Returning true continues
 *               the walk, returning false stops it.
 * @priv:        Opaque pointer passed to @fn as its second argument.
 *
 * The walk uses BT_TAG_ITER_STATIC_RQS, i.e. requests are taken from the
 * static_rqs[] array of @tags. The caller has to pass the tag map from which
 * requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                void *priv)
{
        const unsigned int iter_flags = BT_TAG_ITER_STATIC_RQS;

        __blk_mq_all_tag_iter(tags, fn, priv, iter_flags);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:      Tag set to iterate over.
 * @fn:          Callback invoked for each started request as
 *               @fn(rq, @priv, reserved), where rq is a pointer to a request
 *               and 'reserved' tells whether @rq is a reserved request.
 *               Returning true continues the walk of the current bitmap,
 *               returning false stops it.
 * @priv:        Opaque pointer passed to @fn as its second argument.
 *
 * Every hardware queue slot of @tagset that has a tag map is walked with
 * BT_TAG_ITER_STARTED. One request reference is grabbed before @fn is called
 * and released after @fn returns.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
{
        int hwq;

        for (hwq = 0; hwq < tagset->nr_hw_queues; hwq++) {
                /* Slots without a tag map have nothing to report. */
                if (!tagset->tags || !tagset->tags[hwq])
                        continue;

                __blk_mq_all_tag_iter(tagset->tags[hwq], fn, priv,
                                      BT_TAG_ITER_STARTED);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

/*
 * busy_tag_iter_fn callback: bump the unsigned counter that @data points to
 * when blk_mq_request_completed() is true for @rq. Always returns true so
 * the walk continues.
 */
static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
                void *data, bool reserved)
{
        unsigned *completed = data;

        if (!blk_mq_request_completed(rq))
                return true;

        ++*completed;
        return true;
}

/**
 * blk_mq_tagset_wait_completed_request - wait until the completion function
 * of every completed request has run
 * @tagset:      Tag set whose completed requests are drained.
 *
 * Repeatedly counts the started requests of @tagset that are in the
 * completed state and sleeps 5 ms between passes until none is left.
 *
 * Note: This function has to be run after all IO queues are shutdown.
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
        unsigned pending;

        for (;;) {
                pending = 0;
                blk_mq_tagset_busy_iter(tagset,
                                blk_mq_tagset_count_completed_rqs, &pending);
                if (pending == 0)
                        return;

                msleep(5);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:           Request queue to examine.
 * @fn:          Callback invoked for each request as
 *               @fn(hctx, rq, @priv, reserved), where rq is a pointer to a
 *               request, hctx points to the hardware queue associated with
 *               the request and 'reserved' tells whether @rq is a reserved
 *               request.
 * @priv:        Opaque pointer passed to @fn as its third argument.
 *
 * Returns without visiting anything when a reference on @q->q_usage_counter
 * cannot be obtained.
 *
 * Note: the tag set behind @q may be shared with other request queues.
 * NOTE(review): the per-bit iterator in this file appears to call @fn only
 * when rq->q and rq->mq_hctx match the hardware queue being walked, so the
 * older statement that @fn is called for requests of every queue sharing
 * the tag set looks stale - confirm.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        int idx;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and
         * queue_hw_ctx while the queue is frozen, so holding
         * q_usage_counter keeps this walk from racing with it. The
         * reference is paired with blk_queue_exit() at the end.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        queue_for_each_hw_ctx(q, hctx, idx) {
                struct blk_mq_tags *tags = hctx->tags;

                /*
                 * A hardware queue with no software queues mapped to it
                 * has nothing to check.
                 */
                if (blk_mq_hw_queue_mapped(hctx)) {
                        if (tags->nr_reserved_tags)
                                bt_for_each(hctx, tags->breserved_tags, fn,
                                            priv, true);
                        bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
                }
        }

        blk_queue_exit(q);
}

/*
 * Initialise the sbitmap queue @bt with @depth bits on NUMA node @node using
 * GFP_KERNEL allocations. Returns whatever sbitmap_queue_init_node() returns;
 * callers in this file treat any non-zero value as failure.
 */
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        /* NOTE(review): -1 presumably lets sbitmap choose the shift - confirm. */
        const int shift = -1;

        return sbitmap_queue_init_node(bt, depth, shift, round_robin,
                                       GFP_KERNEL, node);
}

/*
 * Set up the two sbitmap queues embedded in @tags (regular and reserved) and
 * point tags->bitmap_tags / tags->breserved_tags at them.
 *
 * The regular bitmap gets nr_tags - nr_reserved_tags bits, the reserved one
 * nr_reserved_tags bits. Returns 0 on success or -ENOMEM; on failure nothing
 * stays allocated.
 */
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                                   int node, int alloc_policy)
{
        const bool round_robin = (alloc_policy == BLK_TAG_ALLOC_RR);
        const unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;

        if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
                return -ENOMEM;

        if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
                     round_robin, node)) {
                /* Undo the first allocation before reporting the failure. */
                sbitmap_queue_free(&tags->__bitmap_tags);
                return -ENOMEM;
        }

        tags->breserved_tags = &tags->__breserved_tags;
        tags->bitmap_tags = &tags->__bitmap_tags;
        return 0;
}

/*
 * Allocate the sbitmap queues embedded in the tag set @set and make every
 * per-hardware-queue tag map of the set point at them.
 *
 * The regular bitmap gets queue_depth - reserved_tags bits, the reserved one
 * reserved_tags bits; the allocation policy is derived from @set->flags.
 * The @flags argument is not used by this function.
 *
 * Returns 0 on success or -ENOMEM; on failure nothing stays allocated.
 */
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
{
        int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
        const bool round_robin = (alloc_policy == BLK_TAG_ALLOC_RR);
        const unsigned int depth = set->queue_depth - set->reserved_tags;
        const int node = set->numa_node;
        int hwq;

        if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
                return -ENOMEM;

        if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
                     round_robin, node)) {
                /* Undo the first allocation before reporting the failure. */
                sbitmap_queue_free(&set->__bitmap_tags);
                return -ENOMEM;
        }

        for (hwq = 0; hwq < set->nr_hw_queues; hwq++) {
                struct blk_mq_tags *hw_tags = set->tags[hwq];

                hw_tags->bitmap_tags = &set->__bitmap_tags;
                hw_tags->breserved_tags = &set->__breserved_tags;
        }

        return 0;
}

/*
 * Release the two sbitmap queues embedded in the tag set @set, i.e. the
 * ones set up by blk_mq_init_shared_sbitmap().
 */
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
        sbitmap_queue_free(&set->__bitmap_tags);
        sbitmap_queue_free(&set->__breserved_tags);
}

/*
 * Allocate and initialise a tag map.
 *
 * @total_tags:    total number of tags, rejected when above BLK_MQ_TAG_MAX.
 * @reserved_tags: number of those tags that are reserved.
 * @node:          NUMA node used for the allocations.
 * @flags:         BLK_MQ_F_* flags; with BLK_MQ_F_TAG_HCTX_SHARED the bitmaps
 *                 are not allocated here and the bitmap pointers stay NULL.
 *
 * Returns the new tag map, or NULL when @total_tags is too large or an
 * allocation fails.
 */
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, unsigned int flags)
{
        struct blk_mq_tags *tags;
        int alloc_policy;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        tags->nr_reserved_tags = reserved_tags;
        tags->nr_tags = total_tags;
        spin_lock_init(&tags->lock);

        /* Private bitmaps are only needed when the hctxs do not share one. */
        if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
                alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
                if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
                        kfree(tags);
                        tags = NULL;
                }
        }

        return tags;
}

void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
        if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
                sbitmap_queue_free(tags->bitmap_tags);
                sbitmap_queue_free(tags->breserved_tags);
        }
        kfree(tags);
}

/**
 * blk_mq_tag_update_depth - change the depth of a tag map
 * @hctx:        Hardware queue the tag map belongs to.
 * @tagsptr:     Points at the tag map pointer; it is replaced when a larger
 *               map has to be allocated.
 * @tdepth:      Requested total depth, reserved tags included.
 * @can_grow:    Whether growing beyond the current nr_tags is allowed.
 *
 * When @tdepth exceeds the current size a new request map and its requests
 * are allocated first, then the old ones are freed. Otherwise the regular
 * bitmap is resized in place; reserved tags are never resized.
 *
 * Return: 0 on success; -EINVAL when @tdepth does not exceed the number of
 * reserved tags, when growth is needed but not allowed, or when @tdepth is
 * above 16 * BLKDEV_MAX_RQ; -ENOMEM when allocating the new map fails.
 */
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
         */
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                /* Only sched tags can grow, so clear HCTX_SHARED flag  */
                unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
                struct blk_mq_tags *new;
                /* Status of blk_mq_alloc_rqs(); kept as int, not bool. */
                int ret;

                if (!can_grow)
                        return -EINVAL;

                /*
                 * We need some sort of upper limit, set it high enough that
                 * no valid use cases should require more.
                 */
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;

                new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
                                tags->nr_reserved_tags, flags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                if (ret) {
                        blk_mq_free_rq_map(new, flags);
                        return -ENOMEM;
                }

                /* The new map is complete; retire the old one. */
                blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
                blk_mq_free_rq_map(*tagsptr, flags);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(tags->bitmap_tags,
                                tdepth - tags->nr_reserved_tags);
        }

        return 0;
}

/*
 * Resize the regular bitmap embedded in the tag set @set so that it holds
 * @size minus the number of reserved tags. No range check is done here.
 */
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
        unsigned int depth = size - set->reserved_tags;

        sbitmap_queue_resize(&set->__bitmap_tags, depth);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. The value returned here therefore carries the
 * hardware queue number (rq->mq_hctx->queue_num) in the bits above
 * BLK_MQ_UNIQUE_TAG_BITS and the per hardware queue tag, masked with
 * BLK_MQ_UNIQUE_TAG_MASK, in the lower bits.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        u32 hwq = rq->mq_hctx->queue_num;
        u32 tag = rq->tag & BLK_MQ_UNIQUE_TAG_MASK;

        return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | tag;
}
EXPORT_SYMBOL(blk_mq_unique_tag);










































































































































































































































































































































    7 
























   10 




    1 




    1 














    2 














   10 




    1 




    2 




    3 






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SPINLOCK_H
#define __LINUX_SPINLOCK_H

/*
 * include/linux/spinlock.h - generic spinlock/rwlock declarations
 *
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_types_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the arch_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/kernel.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>


/*
 * Must define these before including other files, inline functions need them
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

#define LOCK_SECTION_START(extra)               \
        ".subsection 1\n\t"                     \
        extra                                   \
        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
        LOCK_SECTION_NAME ":\n\t"               \
        ".endif\n"

#define LOCK_SECTION_END                        \
        ".previous\n\t"

#define __lockfunc __section(".spinlock.text")

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

/*
 * raw_spin_lock_init() - initialise a raw_spinlock_t.
 *
 * With CONFIG_DEBUG_SPINLOCK the out-of-line __raw_spin_lock_init() is
 * called with the stringified lock expression as name, a static
 * lock_class_key that is private to each expansion site, and LD_WAIT_SPIN.
 * Without it the lock is simply assigned __RAW_SPIN_LOCK_UNLOCKED(lock).
 */
#ifdef CONFIG_DEBUG_SPINLOCK
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                   struct lock_class_key *key, short inner);

# define raw_spin_lock_init(lock)                                        \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN);        \
} while (0)

#else
# define raw_spin_lock_init(lock)                                \
        do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

/* Forward to the architecture's view of the embedded raw_lock. */
#define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)

/*
 * Contention query: forwarded to arch_spin_is_contended() when the
 * architecture defines it; otherwise the argument is evaluated and the
 * result is the constant 0.
 */
#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock)        (((void)(lock), 0))
#endif /*arch_spin_is_contended*/

/*
 * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
 * between program-order earlier lock acquisitions and program-order later
 * memory accesses.
 *
 * This guarantees that the following two properties hold:
 *
 *   1) Given the snippet:
 *
 *          { X = 0;  Y = 0; }
 *
 *          CPU0                                CPU1
 *
 *          WRITE_ONCE(X, 1);                WRITE_ONCE(Y, 1);
 *          spin_lock(S);                        smp_mb();
 *          smp_mb__after_spinlock();        r1 = READ_ONCE(X);
 *          r0 = READ_ONCE(Y);
 *          spin_unlock(S);
 *
 *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
 *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
 *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
 *      try_to_wake_up().
 *
 *   2) Given the snippet:
 *
 *  { X = 0;  Y = 0; }
 *
 *  CPU0                CPU1                                CPU2
 *
 *  spin_lock(S);        spin_lock(S);                        r1 = READ_ONCE(Y);
 *  WRITE_ONCE(X, 1);        smp_mb__after_spinlock();        smp_rmb();
 *  spin_unlock(S);        r0 = READ_ONCE(X);                r2 = READ_ONCE(X);
 *                        WRITE_ONCE(Y, 1);
 *                        spin_unlock(S);
 *
 *      it is forbidden that CPU0's critical section executes before CPU1's
 *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
 *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
 *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
 *      snippets but "projected" onto two CPUs.
 *
 * Property (2) upgrades the lock to an RCsc lock.
 *
 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
 * the LL/SC loop, they need no further barriers. Similarly all our TSO
 * architectures imply an smp_mb() for each atomic instruction and equally don't
 * need more.
 *
 * Architectures that can implement ACQUIRE better need to take care.
 */
/*
 * Generic fallback: expands to an empty statement unless
 * smp_mb__after_spinlock has already been defined before this point.
 */
#ifndef smp_mb__after_spinlock
#define smp_mb__after_spinlock()        do { } while (0)
#endif

/*
 * do_raw_spin_*() - lowest generic layer above the arch_spin_*() operations.
 *
 * With CONFIG_DEBUG_SPINLOCK the functions are only declared here (they are
 * defined out of line) and do_raw_spin_lock_flags() ignores its flags
 * argument. Otherwise they are the inline wrappers below.
 */
#ifdef CONFIG_DEBUG_SPINLOCK
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
/* Take the arch lock, then run the mmiowb lock-side hook. */
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock(&lock->raw_lock);
        mmiowb_spin_lock();
}

/* Architectures without a flags-aware lock fall back to arch_spin_lock(). */
#ifndef arch_spin_lock_flags
#define arch_spin_lock_flags(lock, flags)        arch_spin_lock(lock)
#endif

/* Same as do_raw_spin_lock() but passes *flags to arch_spin_lock_flags(). */
static inline void
do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock_flags(&lock->raw_lock, *flags);
        mmiowb_spin_lock();
}

/*
 * Try to take the arch lock; the mmiowb lock-side hook only runs when the
 * attempt succeeded. Returns the result of arch_spin_trylock().
 */
static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
        int ret = arch_spin_trylock(&(lock)->raw_lock);

        if (ret)
                mmiowb_spin_lock();

        return ret;
}

/* Run the mmiowb unlock-side hook first, then release the arch lock. */
static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
        mmiowb_spin_unlock();
        arch_spin_unlock(&lock->raw_lock);
        __release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
 * various methods are defined as nops in the case they are not
 * required.
 */
/* Conditional acquire: wraps _raw_spin_trylock() in __cond_lock(). */
#define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))

/* Unconditional acquire. */
#define raw_spin_lock(lock)        _raw_spin_lock(lock)

/*
 * Nested acquisition helpers. With CONFIG_DEBUG_LOCK_ALLOC the subclass or
 * the dep_map of the nesting lock is passed on to the _raw_spin_lock_nested()
 * and _raw_spin_lock_nest_lock() variants; otherwise both collapse to
 * _raw_spin_lock().
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define raw_spin_lock_nested(lock, subclass) \
        _raw_spin_lock_nested(lock, subclass)

# define raw_spin_lock_nest_lock(lock, nest_lock)                        \
         do {                                                                \
                 typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
         } while (0)
#else
/*
 * Always evaluate the 'subclass' argument so that the compiler does not
 * warn about set-but-not-used variables when building with
 * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
 */
# define raw_spin_lock_nested(lock, subclass)                \
        _raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
#endif

/*
 * raw_spin_lock_irqsave() and raw_spin_lock_irqsave_nested().
 *
 * In every variant typecheck() enforces that @flags is an unsigned long.
 * On SMP or CONFIG_DEBUG_SPINLOCK builds _raw_spin_lock_irqsave() returns
 * a value that is assigned to @flags; on the remaining builds it takes
 * @flags as its second argument. The subclass is only passed on when
 * CONFIG_DEBUG_LOCK_ALLOC is set.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)

#define raw_spin_lock_irqsave(lock, flags)                        \
        do {                                                \
                typecheck(unsigned long, flags);        \
                flags = _raw_spin_lock_irqsave(lock);        \
        } while (0)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
        } while (0)
#else
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave(lock);                        \
        } while (0)
#endif

#else

#define raw_spin_lock_irqsave(lock, flags)                \
        do {                                                \
                typecheck(unsigned long, flags);        \
                _raw_spin_lock_irqsave(lock, flags);        \
        } while (0)

#define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
        raw_spin_lock_irqsave(lock, flags)

#endif

/* Direct mappings of the raw_spin_*() API onto the _raw_spin_*() layer. */
#define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
#define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
#define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
#define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)

/* Release the lock, passing @flags (must be an unsigned long) along. */
#define raw_spin_unlock_irqrestore(lock, flags)                \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                _raw_spin_unlock_irqrestore(lock, flags);        \
        } while (0)
#define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)

/* Conditional acquire: wraps _raw_spin_trylock_bh() in __cond_lock(). */
#define raw_spin_trylock_bh(lock) \
        __cond_lock(lock, _raw_spin_trylock_bh(lock))

/*
 * Disable local interrupts, then try to take the lock. Evaluates to 1 on
 * success (interrupts stay disabled) and to 0 on failure, in which case
 * local interrupts are enabled again.
 */
#define raw_spin_trylock_irq(lock) \
({ \
        local_irq_disable(); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_enable(); 0;  }); \
})

/*
 * Save the local interrupt state into @flags with local_irq_save(), then
 * try to take the lock. Evaluates to 1 on success and to 0 on failure, in
 * which case the saved state is restored with local_irq_restore().
 */
#define raw_spin_trylock_irqsave(lock, flags) \
({ \
        local_irq_save(flags); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_restore(flags); 0; }); \
})

/* Include rwlock functions */
#include <linux/rwlock.h>

/*
 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# include <linux/spinlock_api_smp.h>
#else
# include <linux/spinlock_api_up.h>
#endif

/*
 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
 */

/*
 * Return the raw_spinlock_t embedded in @lock. Because the parameter is
 * typed spinlock_t *, routing a macro argument through this helper also
 * makes the compiler check the argument's type.
 */
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
        return &lock->rlock;
}

/*
 * spin_lock_init() - initialise a spinlock_t.
 *
 * With CONFIG_DEBUG_SPINLOCK the embedded raw lock is initialised through
 * __raw_spin_lock_init() with the stringified lock expression as name, a
 * static lock_class_key private to each expansion site, and LD_WAIT_CONFIG.
 * Without it the argument is passed through spinlock_check() and the lock
 * is assigned __SPIN_LOCK_UNLOCKED(_lock).
 */
#ifdef CONFIG_DEBUG_SPINLOCK

# define spin_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __raw_spin_lock_init(spinlock_check(lock),                \
                             #lock, &__key, LD_WAIT_CONFIG);        \
} while (0)

#else

# define spin_lock_init(_lock)                        \
do {                                                \
        spinlock_check(_lock);                        \
        *(_lock) = __SPIN_LOCK_UNLOCKED(_lock);        \
} while (0)

#endif

/* Acquire @lock by taking its embedded raw lock with raw_spin_lock(). */
static __always_inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

/* Acquire @lock through the _bh variant of the raw lock operation. */
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
        raw_spin_lock_bh(&lock->rlock);
}

/* Try to acquire @lock; returns the result of raw_spin_trylock(). */
static __always_inline int spin_trylock(spinlock_t *lock)
{
        return raw_spin_trylock(&lock->rlock);
}

/* Nested acquire of a spinlock_t with a lockdep @subclass. */
#define spin_lock_nested(lock, subclass)                        \
do {                                                                \
        raw_spin_lock_nested(spinlock_check(lock), subclass);        \
} while (0)

/* Nested acquire of a spinlock_t, naming the outer lock @nest_lock. */
#define spin_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
} while (0)

/* Acquire @lock through the _irq variant of the raw lock operation. */
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
        raw_spin_lock_irq(&lock->rlock);
}

/*
 * Acquire a spinlock_t via raw_spin_lock_irqsave(); @flags must be an
 * unsigned long lvalue (enforced by the raw macro's typecheck()).
 */
#define spin_lock_irqsave(lock, flags)                                \
do {                                                                \
        raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
} while (0)

/* As spin_lock_irqsave(), additionally passing a lockdep @subclass. */
#define spin_lock_irqsave_nested(lock, flags, subclass)                        \
do {                                                                        \
        raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

/* Release @lock through raw_spin_unlock() on its embedded raw lock. */
static __always_inline void spin_unlock(spinlock_t *lock)
{
        raw_spin_unlock(&lock->rlock);
}

/* Release @lock through the _bh variant of the raw unlock operation. */
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
        raw_spin_unlock_bh(&lock->rlock);
}

/* Release @lock through the _irq variant of the raw unlock operation. */
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
        raw_spin_unlock_irq(&lock->rlock);
}

/* Release @lock, passing @flags to raw_spin_unlock_irqrestore(). */
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

/* Try to acquire @lock; returns the result of raw_spin_trylock_bh(). */
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
        return raw_spin_trylock_bh(&lock->rlock);
}

/* Try to acquire @lock; returns the result of raw_spin_trylock_irq(). */
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
        return raw_spin_trylock_irq(&lock->rlock);
}

/* Try to acquire a spinlock_t via raw_spin_trylock_irqsave(). */
#define spin_trylock_irqsave(lock, flags)                        \
({                                                                \
        raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

/**
 * spin_is_locked() - Check whether a spinlock is locked.
 * @lock: Pointer to the spinlock.
 *
 * This function is NOT required to provide any memory ordering
 * guarantees; it could be used for debugging purposes or, when
 * additional synchronization is needed, accompanied with other
 * constructs (memory barriers) enforcing the synchronization.
 *
 * Returns: 1 if @lock is locked, 0 otherwise.
 *
 * Note that the function only tells you that the spinlock is
 * seen to be locked, not that it is locked on your CPU.
 *
 * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
 * the return value is always 0 (see include/linux/spinlock_up.h).
 * Therefore you should not rely heavily on the return value.
 */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
        /* Query the raw lock embedded in @lock. */
        return raw_spin_is_locked(&lock->rlock);
}

/*
 * Returns the result of raw_spin_is_contended() on the raw lock embedded in
 * @lock; that macro yields 0 when the architecture does not define
 * arch_spin_is_contended.
 */
static __always_inline int spin_is_contended(spinlock_t *lock)
{
        return raw_spin_is_contended(&lock->rlock);
}

/* Forward to assert_raw_spin_locked() on the embedded raw lock. */
#define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)

/*
 * Pull the atomic_t declaration:
 * (asm-mips/atomic.h needs above definitions)
 */
#include <linux/atomic.h>
/**
 * atomic_dec_and_lock - lock on reaching reference count zero
 * @atomic: the atomic counter
 * @lock: the spinlock in question
 *
 * Decrements @atomic by 1.  If the result is 0, returns true and locks
 * @lock.  Returns false for all other cases.
 */
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
/* Wrapped in __cond_lock() so the conditional acquisition is annotated. */
#define atomic_dec_and_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

/*
 * Variant that additionally receives a pointer to @flags.
 * NOTE(review): presumably behaves like atomic_dec_and_lock() while saving
 * the interrupt state in @flags when the lock is taken - confirm against
 * the out-of-line implementation.
 */
extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))

/*
 * Bucket spinlock helpers.
 *
 * __alloc_bucket_spinlocks() is the out-of-line worker; callers normally go
 * through the alloc_bucket_spinlocks() macro below.
 */
int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
                             size_t max_size, unsigned int cpu_mult,
                             gfp_t gfp, const char *name,
                             struct lock_class_key *key);

/*
 * Each expansion declares its own static struct lock_class_key and passes
 * its address, together with the stringified @locks expression (#locks) as
 * the name, to __alloc_bucket_spinlocks().  The statement expression
 * evaluates to that call's return value.
 */
#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp)    \
        ({                                                                     \
                static struct lock_class_key key;                             \
                int ret;                                                     \
                                                                             \
                ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size,   \
                                               cpu_mult, gfp, #locks, &key); \
                ret;                                                             \
        })

/* NOTE(review): presumably releases what alloc_bucket_spinlocks() set up - confirm against the definition. */
void free_bucket_spinlocks(spinlock_t *locks);

/*
 * Lock-guard definitions for the spinlock flavours.
 *
 * Each DEFINE_LOCK_GUARD_1() invocation names the guard, gives the lock
 * type, and pairs a lock expression with its matching unlock expression,
 * both operating on _T->lock.  The irqsave variants pass an additional
 * "unsigned long flags" declaration, which their lock and unlock
 * expressions access as _T->flags.
 */
DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
                    raw_spin_lock(_T->lock),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
                    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
                    raw_spin_lock_irq(_T->lock),
                    raw_spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
                    raw_spin_lock_irqsave(_T->lock, _T->flags),
                    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
                    spin_lock(_T->lock),
                    spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
                    spin_lock_irq(_T->lock),
                    spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
                    spin_lock_irqsave(_T->lock, _T->flags),
                    spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (C) 2000-2001 Qualcomm Incorporated

   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

#ifndef __BLUETOOTH_H
#define __BLUETOOTH_H

#include <linux/poll.h>
#include <net/sock.h>
#include <linux/seq_file.h>

/* Version and revision numbers of the Bluetooth subsystem */
#define BT_SUBSYS_VERSION        2
#define BT_SUBSYS_REVISION        22

/* Fallback definitions, used only when AF_BLUETOOTH is not already defined */
#ifndef AF_BLUETOOTH
#define AF_BLUETOOTH        31
#define PF_BLUETOOTH        AF_BLUETOOTH
#endif

/* Bluetooth versions.
 * The numbering is not contiguous: no constant with value 5 is defined here.
 */
#define BLUETOOTH_VER_1_1        1
#define BLUETOOTH_VER_1_2        2
#define BLUETOOTH_VER_2_0        3
#define BLUETOOTH_VER_2_1        4
#define BLUETOOTH_VER_4_0        6

/* Reserved for core and driver use: bt_skb_alloc() and bt_skb_send_alloc()
 * add this amount to the requested length and pass it to skb_reserve().
 */
#define BT_SKB_RESERVE        8

/* Bluetooth protocol numbers.
 * NOTE(review): presumably the "proto" values taken by bt_sock_register()
 * and bt_sock_unregister() - confirm against the callers.
 */
#define BTPROTO_L2CAP        0
#define BTPROTO_HCI        1
#define BTPROTO_SCO        2
#define BTPROTO_RFCOMM        3
#define BTPROTO_BNEP        4
#define BTPROTO_CMTP        5
#define BTPROTO_HIDP        6
#define BTPROTO_AVDTP        7

/* NOTE(review): the SOL_ prefix suggests per-protocol socket option levels - confirm. */
#define SOL_HCI                0
#define SOL_L2CAP        6
#define SOL_SCO                17
#define SOL_RFCOMM        18

/* BT_SECURITY and its companion structure.
 * NOTE(review): BT_SECURITY is presumably a socket option name whose value
 * is a struct bt_security, with @level holding one of the BT_SECURITY_*
 * levels below - confirm against the option handlers.
 */
#define BT_SECURITY        4
struct bt_security {
        __u8 level;
        __u8 key_size;
};
#define BT_SECURITY_SDP                0
#define BT_SECURITY_LOW                1
#define BT_SECURITY_MEDIUM        2
#define BT_SECURITY_HIGH        3
#define BT_SECURITY_FIPS        4

/* NOTE(review): BT_DEFER_SETUP, BT_FLUSHABLE and BT_POWER continue the
 * numbering started by BT_SECURITY; presumably socket option names - confirm.
 */
#define BT_DEFER_SETUP        7

#define BT_FLUSHABLE        8

/* Off/on values that accompany BT_FLUSHABLE */
#define BT_FLUSHABLE_OFF        0
#define BT_FLUSHABLE_ON                1

#define BT_POWER        9
struct bt_power {
        __u8 force_active;
};
/* Off/on values for bt_power.force_active */
#define BT_POWER_FORCE_ACTIVE_OFF 0
#define BT_POWER_FORCE_ACTIVE_ON  1

/* BT_CHANNEL_POLICY and the three policy values it can take
 * (BT_CHANNEL_POLICY_*), each described below.
 */
#define BT_CHANNEL_POLICY        10

/* BR/EDR only (default policy)
 *   AMP controllers cannot be used.
 *   Channel move requests from the remote device are denied.
 *   If the L2CAP channel is currently using AMP, move the channel to BR/EDR.
 */
#define BT_CHANNEL_POLICY_BREDR_ONLY                0

/* BR/EDR Preferred
 *   Allow use of AMP controllers.
 *   If the L2CAP channel is currently on AMP, move it to BR/EDR.
 *   Channel move requests from the remote device are allowed.
 */
#define BT_CHANNEL_POLICY_BREDR_PREFERRED        1

/* AMP Preferred
 *   Allow use of AMP controllers
 *   If the L2CAP channel is currently on BR/EDR and AMP controller
 *     resources are available, initiate a channel move to AMP.
 *   Channel move requests from the remote device are allowed.
 *   If the L2CAP socket has not been connected yet, try to create
 *     and configure the channel directly on an AMP controller rather
 *     than BR/EDR.
 */
#define BT_CHANNEL_POLICY_AMP_PREFERRED                2

/* BT_VOICE and its companion structure.
 * NOTE(review): @setting presumably holds one of the BT_VOICE_* values
 * below - confirm against the option handlers.
 */
#define BT_VOICE                11
struct bt_voice {
        __u16 setting;
};

#define BT_VOICE_TRANSPARENT                        0x0003
#define BT_VOICE_CVSD_16BIT                        0x0060

#define BT_SNDMTU                12
#define BT_RCVMTU                13
#define BT_PHY                        14

/* BT_PHY_* flags: every value is a distinct single bit (0x1 .. 0x4000),
 * so several of them can be OR-ed together into one mask.
 */
#define BT_PHY_BR_1M_1SLOT        0x00000001
#define BT_PHY_BR_1M_3SLOT        0x00000002
#define BT_PHY_BR_1M_5SLOT        0x00000004
#define BT_PHY_EDR_2M_1SLOT        0x00000008
#define BT_PHY_EDR_2M_3SLOT        0x00000010
#define BT_PHY_EDR_2M_5SLOT        0x00000020
#define BT_PHY_EDR_3M_1SLOT        0x00000040
#define BT_PHY_EDR_3M_3SLOT        0x00000080
#define BT_PHY_EDR_3M_5SLOT        0x00000100
#define BT_PHY_LE_1M_TX                0x00000200
#define BT_PHY_LE_1M_RX                0x00000400
#define BT_PHY_LE_2M_TX                0x00000800
#define BT_PHY_LE_2M_RX                0x00001000
#define BT_PHY_LE_CODED_TX        0x00002000
#define BT_PHY_LE_CODED_RX        0x00004000

#define BT_MODE                        15

/* BT_MODE_* values: consecutive small integers, not bit flags */
#define BT_MODE_BASIC                0x00
#define BT_MODE_ERTM                0x01
#define BT_MODE_STREAMING        0x02
#define BT_MODE_LE_FLOWCTL        0x03
#define BT_MODE_EXT_FLOWCTL        0x04

#define BT_PKT_STATUS          16

/* NOTE(review): the SCM_ infix suggests a control-message type used with BT_PKT_STATUS - confirm. */
#define BT_SCM_PKT_STATUS        0x03

/* Logging helpers taking a format string followed by variadic arguments.
 * Every declaration carries __printf(1, 2) (NOTE(review): presumably the
 * printf-format checking attribute with the format at argument 1 - confirm).
 * bt_dbg_set(), bt_dbg_get() and bt_dbg() are only declared when
 * CONFIG_BT_FEATURE_DEBUG is enabled.
 */
__printf(1, 2)
void bt_info(const char *fmt, ...);
__printf(1, 2)
void bt_warn(const char *fmt, ...);
__printf(1, 2)
void bt_err(const char *fmt, ...);
#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG)
void bt_dbg_set(bool enable);
bool bt_dbg_get(void);
__printf(1, 2)
void bt_dbg(const char *fmt, ...);
#endif
__printf(1, 2)
void bt_warn_ratelimited(const char *fmt, ...);
__printf(1, 2)
void bt_err_ratelimited(const char *fmt, ...);

/* Convenience wrappers that append "\n" to the format.  The newline is
 * added by string-literal concatenation (fmt "\n"), so @fmt must be a
 * string literal.
 */
#define BT_INFO(fmt, ...)        bt_info(fmt "\n", ##__VA_ARGS__)
#define BT_WARN(fmt, ...)        bt_warn(fmt "\n", ##__VA_ARGS__)
#define BT_ERR(fmt, ...)        bt_err(fmt "\n", ##__VA_ARGS__)

/* BT_DBG goes through bt_dbg() when CONFIG_BT_FEATURE_DEBUG is enabled and
 * falls back to pr_debug() otherwise.
 */
#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG)
#define BT_DBG(fmt, ...)        bt_dbg(fmt "\n", ##__VA_ARGS__)
#else
#define BT_DBG(fmt, ...)        pr_debug(fmt "\n", ##__VA_ARGS__)
#endif

/* Name of @hdev for log output, or the literal "null" when @hdev is NULL.
 * @hdev appears twice in the expansion, so it is evaluated twice.
 */
#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null")

/* Per-device logging: prefix the message with "<device name>: " and hand
 * it to the BT_INFO/BT_WARN/BT_ERR/BT_DBG wrappers.
 */
#define bt_dev_info(hdev, fmt, ...)                                \
        BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
#define bt_dev_warn(hdev, fmt, ...)                                \
        BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
#define bt_dev_err(hdev, fmt, ...)                                \
        BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
#define bt_dev_dbg(hdev, fmt, ...)                                \
        BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)

/* Rate-limited variants.  These call bt_*_ratelimited() directly, so unlike
 * the macros above no "\n" is appended to @fmt here.
 */
#define bt_dev_warn_ratelimited(hdev, fmt, ...)                        \
        bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
#define bt_dev_err_ratelimited(hdev, fmt, ...)                        \
        bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)

/* Connection and socket states */
enum {
        BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */
        BT_OPEN,
        BT_BOUND,
        BT_LISTEN,
        BT_CONNECT,
        BT_CONNECT2,
        BT_CONFIG,
        BT_DISCONN,
        BT_CLOSED
};

/* Map a connection/socket state to its enumerator name.
 * Values outside BT_CONNECTED..BT_CLOSED yield "invalid state".
 * If unused will be removed by compiler.
 */
static inline const char *state_to_string(int state)
{
        /* Indexed directly by state value; slot 0 is unused. */
        static const char * const names[] = {
                [BT_CONNECTED]  = "BT_CONNECTED",
                [BT_OPEN]       = "BT_OPEN",
                [BT_BOUND]      = "BT_BOUND",
                [BT_LISTEN]     = "BT_LISTEN",
                [BT_CONNECT]    = "BT_CONNECT",
                [BT_CONNECT2]   = "BT_CONNECT2",
                [BT_CONFIG]     = "BT_CONFIG",
                [BT_DISCONN]    = "BT_DISCONN",
                [BT_CLOSED]     = "BT_CLOSED",
        };

        if (state < BT_CONNECTED || state > BT_CLOSED)
                return "invalid state";

        return names[state];
}

/* BD Address: six raw bytes, declared __packed. */
typedef struct {
        __u8 b[6];
} __packed bdaddr_t;

/* BD Address type */
#define BDADDR_BREDR                0x00
#define BDADDR_LE_PUBLIC        0x01
#define BDADDR_LE_RANDOM        0x02

/* Return true when @type is one of the BDADDR_* address types, false otherwise. */
static inline bool bdaddr_type_is_valid(u8 type)
{
        return type == BDADDR_BREDR ||
               type == BDADDR_LE_PUBLIC ||
               type == BDADDR_LE_RANDOM;
}

/* Return true when @type is BDADDR_LE_PUBLIC or BDADDR_LE_RANDOM, false otherwise. */
static inline bool bdaddr_type_is_le(u8 type)
{
        return type == BDADDR_LE_PUBLIC || type == BDADDR_LE_RANDOM;
}

/* Pointers to constant addresses: all bytes zero (BDADDR_ANY) and all bytes
 * 0xff (BDADDR_NONE).  Each expansion is a compound literal, i.e. a fresh
 * object, so compare against these with bacmp() rather than by pointer.
 */
#define BDADDR_ANY  (&(bdaddr_t) {{0, 0, 0, 0, 0, 0}})
#define BDADDR_NONE (&(bdaddr_t) {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}})

/* Copy, swap, convert BD Address */
/* Byte-wise comparison of two BD addresses; returns the memcmp() result,
 * i.e. 0 when @ba1 and @ba2 hold the same address.
 */
static inline int bacmp(const bdaddr_t *ba1, const bdaddr_t *ba2)
{
        return memcmp(ba1, ba2, sizeof(*ba1));
}
/* Copy the BD address at @src into @dst. */
static inline void bacpy(bdaddr_t *dst, const bdaddr_t *src)
{
        memcpy(dst, src, sizeof(*dst));
}

/* Declared here, defined elsewhere.
 * NOTE(review): the name suggests it stores the byte-reversed @src in @dst - confirm.
 */
void baswap(bdaddr_t *dst, const bdaddr_t *src);

/* Common socket structures and functions */

/* Cast @__sk to struct bt_sock *.  The argument is parenthesized so that an
 * expression argument (e.g. "p + 1" or "a ? b : c") is converted as a whole
 * instead of having only its first operand cast.
 */
#define bt_sk(__sk) ((struct bt_sock *)(__sk))

/*
 * Bluetooth socket: embeds a struct sock as its first member, followed by
 * Bluetooth-specific fields.
 *
 * bt_sk() converts its argument to struct bt_sock * by a plain cast.
 * NOTE(review): that presumably receives a pointer to the embedded @sk and
 * therefore relies on @sk staying the first member - confirm.
 *
 * @accept_q / @parent: NOTE(review): presumably maintained by the
 * bt_accept_*() helpers declared further down - confirm.
 * @skb_msg_name / @skb_put_cmsg: optional per-socket callbacks taking an skb.
 */
struct bt_sock {
        struct sock sk;
        struct list_head accept_q;
        struct sock *parent;
        unsigned long flags;
        void (*skb_msg_name)(struct sk_buff *, void *, int *);
        void (*skb_put_cmsg)(struct sk_buff *, struct msghdr *, struct sock *);
};

/* NOTE(review): presumably bit numbers within bt_sock.flags - confirm. */
enum {
        BT_SK_DEFER_SETUP,
        BT_SK_SUSPEND,
};

/*
 * A list of sockets: an hlist head plus the rwlock stored alongside it.
 * @custom_seq_show only exists when CONFIG_PROC_FS is set; its signature
 * matches the seq_show callback taken by bt_procfs_init().
 */
struct bt_sock_list {
        struct hlist_head head;
        rwlock_t          lock;
#ifdef CONFIG_PROC_FS
        int (* custom_seq_show)(struct seq_file *, void *);
#endif
};

/* Common socket operations shared by the Bluetooth protocols.
 * Declarations only; the definitions are not part of this header.
 */
int  bt_sock_register(int proto, const struct net_proto_family *ops);
void bt_sock_unregister(int proto);
void bt_sock_link(struct bt_sock_list *l, struct sock *s);
void bt_sock_unlink(struct bt_sock_list *l, struct sock *s);
int  bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                     int flags);
int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
                            size_t len, int flags);
__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);

/* Accept-queue helpers operating on a parent socket and its children */
void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh);
void bt_accept_unlink(struct sock *sk);
struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock);

/* Skb helpers */

/* L2CAP per-skb control data; one of the union members of struct bt_skb_cb.
 * The six bit-fields add up to exactly 8 bits (1+1+1+1+2+2) and share a
 * single u8.
 */
struct l2cap_ctrl {
        u8        sframe:1,
                poll:1,
                final:1,
                fcs:1,
                sar:2,
                super:2;

        u16        reqseq;
        u16        txseq;
        u8        retries;
        __le16  psm;
        bdaddr_t bdaddr;
        struct l2cap_chan *chan;
};

/* SCO per-skb control data; one of the union members of struct bt_skb_cb. */
struct sco_ctrl {
        u8        pkt_status;
};

/* Forward declaration: only pointers to struct hci_dev are used below. */
struct hci_dev;

/* Request-completion callback types.  The _skb variant has the same
 * parameters plus a trailing struct sk_buff pointer.
 */
typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode);
typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status,
                                       u16 opcode, struct sk_buff *skb);

/* NOTE(review): presumably flag bits for hci_ctrl.req_flags - confirm. */
#define HCI_REQ_START        BIT(0)
#define HCI_REQ_SKB        BIT(1)

/* HCI per-skb control data; one of the union members of struct bt_skb_cb.
 * The two completion callbacks share storage in an anonymous union, so only
 * one of them is valid at a time.
 * NOTE(review): presumably HCI_REQ_SKB in @req_flags selects
 * @req_complete_skb - confirm against the users.
 */
struct hci_ctrl {
        u16 opcode;
        u8 req_flags;
        u8 req_event;
        union {
                hci_req_complete_t req_complete;
                hci_req_complete_skb_t req_complete_skb;
        };
};

/* Bluetooth view of an skb's cb area: a few common fields followed by an
 * anonymous union holding the L2CAP, SCO or HCI control data.
 */
struct bt_skb_cb {
        u8 pkt_type;
        u8 force_active;
        u16 expect;
        u8 incoming:1;
        union {
                struct l2cap_ctrl l2cap;
                struct sco_ctrl sco;
                struct hci_ctrl hci;
        };
};
/* Reinterpret (skb)->cb as a struct bt_skb_cb pointer. */
#define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))

/* Shorthand accessors for fields of bt_cb(skb).  Each expands to a plain
 * member expression, so it can be read as well as assigned to.
 */
#define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type
#define hci_skb_expect(skb) bt_cb((skb))->expect
#define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode

/**
 * bt_skb_alloc() - Allocate an skb with BT_SKB_RESERVE already reserved
 * @len: requested length; BT_SKB_RESERVE is added on top
 * @how: allocation flags, forwarded to alloc_skb()
 *
 * Return: the new skb after skb_reserve(skb, BT_SKB_RESERVE), or NULL when
 * alloc_skb() fails.
 */
static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how)
{
        struct sk_buff *skb = alloc_skb(len + BT_SKB_RESERVE, how);

        if (!skb)
                return NULL;

        skb_reserve(skb, BT_SKB_RESERVE);
        return skb;
}

/**
 * bt_skb_send_alloc() - Allocate a send skb with BT_SKB_RESERVE reserved
 * @sk:  socket the buffer is allocated for
 * @len: requested length; BT_SKB_RESERVE is added on top
 * @nb:  forwarded unchanged to sock_alloc_send_skb()
 * @err: out parameter for the error code
 *
 * Return: the skb on success.  NULL is returned when the allocation failed
 * with *@err set, when sock_error() reports an error (stored in *@err), or
 * when @sk->sk_shutdown is non-zero (*@err = -ECONNRESET); in the latter
 * two cases an skb that was already allocated is freed first.
 */
static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk,
                                        unsigned long len, int nb, int *err)
{
        struct sk_buff *skb;

        skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err);
        if (skb) {
                skb_reserve(skb, BT_SKB_RESERVE);
        } else if (*err) {
                /* Allocation failed and the reason is already in *err. */
                return NULL;
        }

        /* A pending socket error or a shut-down socket voids the buffer. */
        *err = sock_error(sk);
        if (!*err) {
                if (!sk->sk_shutdown)
                        return skb;

                *err = -ECONNRESET;
        }

        kfree_skb(skb);
        return NULL;
}

/* Build one skb from @msg, carrying at most @mtu bytes of payload.
 *
 * min(@len, @mtu) bytes are copied from @msg->msg_iter into a buffer
 * obtained from bt_skb_send_alloc() with room for @headroom and @tailroom;
 * the skb inherits @sk->sk_priority.  Returns the skb, ERR_PTR(err) with the
 * error reported by bt_skb_send_alloc(), or ERR_PTR(-EFAULT) when the copy
 * from the iterator fails.
 *
 * Shall not be called with lock_sock held.
 */
static inline struct sk_buff *bt_skb_sendmsg(struct sock *sk,
                                             struct msghdr *msg,
                                             size_t len, size_t mtu,
                                             size_t headroom, size_t tailroom)
{
        size_t size = min_t(size_t, len, mtu);
        struct sk_buff *skb;
        int err;

        skb = bt_skb_send_alloc(sk, size + headroom + tailroom,
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                return ERR_PTR(err);

        skb_reserve(skb, headroom);
        skb_tailroom_reserve(skb, mtu, tailroom);

        if (copy_from_iter_full(skb_put(skb, size), size, &msg->msg_iter)) {
                skb->priority = sk->sk_priority;
                return skb;
        }

        /* Copy failed: drop the half-built buffer. */
        kfree_skb(skb);
        return ERR_PTR(-EFAULT);
}

/* Similar to bt_skb_sendmsg but can split the msg into multiple fragments
 * according to the MTU.
 *
 * The first skb is built with bt_skb_sendmsg(); any data beyond it is put
 * into further skbs that are chained through the first skb's frag_list and
 * then through each fragment's ->next pointer.
 *
 * Returns the head skb, or whatever bt_skb_sendmsg() returned for the head
 * when that is an error pointer or NULL.  If building a continuation
 * fragment fails, the head skb with the fragments chained so far is
 * returned, i.e. the caller gets a partial message rather than an error.
 */
static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk,
                                              struct msghdr *msg,
                                              size_t len, size_t mtu,
                                              size_t headroom, size_t tailroom)
{
        struct sk_buff *skb, **frag;

        skb = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom);
        if (IS_ERR_OR_NULL(skb))
                return skb;

        len -= skb->len;
        if (!len)
                return skb;

        /* Add remaining data over MTU as continuation fragments */
        frag = &skb_shinfo(skb)->frag_list;
        while (len) {
                struct sk_buff *tmp;

                tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom);
                /* Apply the same failure test as for the head skb so that
                 * tmp is never dereferenced unless it is a real buffer.
                 */
                if (IS_ERR_OR_NULL(tmp))
                        return skb;

                len -= tmp->len;

                *frag = tmp;
                frag = &(*frag)->next;
        }

        return skb;
}

/**
 * bt_copy_from_sockptr() - Size-checked copy from a sockptr
 * @dst:      destination buffer
 * @dst_size: number of bytes to copy into @dst
 * @src:      source sockptr
 * @src_size: number of bytes available at @src
 *
 * Return: -EINVAL when @src provides fewer than @dst_size bytes, otherwise
 * the result of copy_from_sockptr() for @dst_size bytes.
 */
static inline int bt_copy_from_sockptr(void *dst, size_t dst_size,
                                       sockptr_t src, size_t src_size)
{
        /* Refuse short sources instead of reading past what was supplied. */
        if (src_size < dst_size)
                return -EINVAL;

        return copy_from_sockptr(dst, src, dst_size);
}

/* Declarations only; the definitions are not part of this header. */
int bt_to_errno(u16 code);

/* HCI socket flag, channel and cookie accessors */
void hci_sock_set_flag(struct sock *sk, int nr);
void hci_sock_clear_flag(struct sock *sk, int nr);
int hci_sock_test_flag(struct sock *sk, int nr);
unsigned short hci_sock_get_channel(struct sock *sk);
u32 hci_sock_get_cookie(struct sock *sk);

/* Init/cleanup pairs for the HCI socket, sysfs and procfs parts */
int hci_sock_init(void);
void hci_sock_cleanup(void);

int bt_sysfs_init(void);
void bt_sysfs_cleanup(void);

int bt_procfs_init(struct net *net, const char *name,
                   struct bt_sock_list *sk_list,
                   int (*seq_show)(struct seq_file *, void *));
void bt_procfs_cleanup(struct net *net, const char *name);

extern struct dentry *bt_debugfs;

int l2cap_init(void);
void l2cap_exit(void);

/* SCO init/exit.  With CONFIG_BT_BREDR enabled these are real functions
 * defined elsewhere; otherwise inline no-op stubs are provided so callers
 * need no #ifdef of their own, and sco_init() reports success (0).
 */
#if IS_ENABLED(CONFIG_BT_BREDR)
int sco_init(void);
void sco_exit(void);
#else
static inline int sco_init(void)
{
        return 0;
}

static inline void sco_exit(void)
{
}
#endif

/* Declarations only; the definitions are not part of this header. */
int mgmt_init(void);
void mgmt_exit(void);

void bt_sock_reclassify_lock(struct sock *sk, int proto);

#endif /* __BLUETOOTH_H */



























































































































































    1 





    1 










    1 














































    1 




    1 




















    1 












































































    1 










    1 

































    1 

    1 



















































































































































































































































































































    1 










    1 




    1 












    1 



    1 



    1 








    1 



    1 



















































































































































































    1 



    1 














    1 



    1 
    1 













    1 




    1 
    1 


    1 

    1 

    1 

    1 


















    1 









    1 




    1 






    1 

































































































    1 
    1 



    1 
    1 



    1 







































    1 
    1 







    1 

    1 

    1 









    1 











    1 
    1 
    1 
    1 

































































































































































































































































































































































































    1 





































































































































































































































    1 












    1 


    1 
    1 




    1 

    1 










    1 













    1 












































































    1 


    1 




    1 


    1 
    1 



































    1 























    1 


















    1 














































































































































































































































































    1 

    1 





















    1 


    1 


















    1 







    1 












































































































































































    1 












    1 







    1 
















    1 
    1 





    1 



    1 


































































































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































































































































































































































































































































































































































    1 


    1 











    1 
    1 

























































































































































































































































































































































    1 





    1 





    1 






    1 



























    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *                 2000 Transmeta Corp.
 *                 2000-2001 Christoph Rohland
 *                 2000-2001 SAP AG
 *                 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <linux/fs_parser.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

/*
 * NOTE(review): presumably the kernel-internal shmem mount (the
 * SHMEM_HUGE_DENY/FORCE comments below speak of "shm_mnt and all mounts");
 * it is not assigned in the part of the file visible here - confirm.
 */
static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>

#include "internal.h"

/* Number of 512-byte units in one page; scales page counts into i_blocks */
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
/* Convert a byte size to a page count, rounding up to a whole page */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * Per-call state that shmem_fallocate shares with shmem_fault or
 * shmem_writepage via inode->i_private (with i_mutex making sure that it
 * has only one user at a time): we would prefer not to enlarge the shmem
 * inode just for that.
 *
 * All pgoff_t members are expressed in page offsets / page counts.
 */
struct shmem_falloc {
        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;                /* start of range currently being fallocated */
        pgoff_t next;                /* the next page offset to be fallocated */
        pgoff_t nr_falloced;        /* how many new pages have been fallocated */
        pgoff_t nr_unswapped;        /* how often writepage refused to swap out */
};

/*
 * Bundle of option values for a shmem superblock.
 *
 * NOTE(review): presumably filled in by the mount-option parser, which is
 * outside the part of the file visible here - confirm the exact meaning and
 * units of each member against that code.
 */
struct shmem_options {
        /* presumably the block limit for the mount - TODO confirm units */
        unsigned long long blocks;
        /* presumably the inode limit for the mount - TODO confirm */
        unsigned long long inodes;
        /* NUMA memory policy pointer; ownership not visible here */
        struct mempolicy *mpol;
        kuid_t uid;
        kgid_t gid;
        umode_t mode;
        /* presumably mirrors sbinfo->full_inums (inode numbers above UINT_MAX) */
        bool full_inums;
        /* presumably one of the SHMEM_HUGE_* values - confirm */
        int huge;
        /* bit flags built from the SHMEM_SEEN_* values below (distinct bits) */
        int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
};

#ifdef CONFIG_TMPFS
/* Default block limit: half of the page count reported by totalram_pages(). */
static unsigned long shmem_default_max_blocks(void)
{
        unsigned long nr_pages = totalram_pages();

        return nr_pages / 2;
}

/*
 * Default inode limit: the smaller of (total pages - highmem pages) and
 * half of the total pages.
 */
static unsigned long shmem_default_max_inodes(void)
{
        unsigned long total = totalram_pages();
        unsigned long lowmem = total - totalhigh_pages();
        unsigned long half = total / 2;

        return min(lowmem, half);
}
#endif

/*
 * Forward declarations of static helpers whose definitions come later in
 * this file; shmem_getpage() below already needs shmem_getpage_gfp().
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index);
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
                             struct page **pagep, enum sgp_type sgp,
                             gfp_t gfp, struct vm_area_struct *vma,
                             vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
                struct page **pagep, enum sgp_type sgp,
                gfp_t gfp, struct vm_area_struct *vma,
                struct vm_fault *vmf, vm_fault_t *fault_type);

/*
 * Convenience wrapper around shmem_getpage_gfp(): uses the gfp mask of the
 * inode's mapping and passes NULL for the vma, vm_fault and fault_type
 * arguments.  Returns whatever shmem_getpage_gfp() returns.
 */
int shmem_getpage(struct inode *inode, pgoff_t index,
                struct page **pagep, enum sgp_type sgp)
{
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

        return shmem_getpage_gfp(inode, index, pagep, sgp, gfp,
                                 NULL, NULL, NULL);
}

/* Return the shmem-private superblock info stored in sb->s_fs_info. */
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = sb->s_fs_info;

        return sbinfo;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
        /* VM_NORESERVE objects are not pre-accounted: report success. */
        if (flags & VM_NORESERVE)
                return 0;

        /* Otherwise charge the whole size, rounded up to pages. */
        return security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

/* Undo shmem_acct_size(): nothing to give back for VM_NORESERVE objects. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
        if (flags & VM_NORESERVE)
                return;

        vm_unacct_memory(VM_ACCT(size));
}

/*
 * Adjust the pre-accounted size of an object from oldsize to newsize.
 *
 * Growing (in whole pages) charges the difference and returns the result of
 * security_vm_enough_memory_mm(); shrinking gives the difference back and
 * returns 0.  VM_NORESERVE objects, and changes that stay within the same
 * number of pages, are a no-op returning 0.
 */
static inline int shmem_reacct_size(unsigned long flags,
                loff_t oldsize, loff_t newsize)
{
        if (flags & VM_NORESERVE)
                return 0;

        if (VM_ACCT(newsize) > VM_ACCT(oldsize))
                return security_vm_enough_memory_mm(current->mm,
                                VM_ACCT(newsize) - VM_ACCT(oldsize));

        if (VM_ACCT(newsize) < VM_ACCT(oldsize))
                vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));

        return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
        /*
         * Only VM_NORESERVE objects are charged page by page here; the
         * others return 0 without charging anything.
         */
        if (flags & VM_NORESERVE)
                return security_vm_enough_memory_mm(current->mm,
                                pages * VM_ACCT(PAGE_SIZE));

        return 0;
}

/* Undo shmem_acct_block(): only VM_NORESERVE objects were charged there. */
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
        if (!(flags & VM_NORESERVE))
                return;

        vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

/*
 * Charge @pages blocks to @inode: first against the VM accounting, then,
 * when the superblock has a max_blocks limit, against its used_blocks
 * counter.  Returns true when both succeed; on failure nothing stays
 * charged and false is returned.
 */
static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);

        if (shmem_acct_block(info->flags, pages))
                return false;

        /* No per-superblock limit: the VM charge alone is enough. */
        if (!sbinfo->max_blocks)
                return true;

        if (percpu_counter_compare(&sbinfo->used_blocks,
                                   sbinfo->max_blocks - pages) > 0) {
                /* Would exceed the limit: roll back the VM charge. */
                shmem_unacct_blocks(info->flags, pages);
                return false;
        }

        percpu_counter_add(&sbinfo->used_blocks, pages);
        return true;
}

/*
 * Reverse of shmem_inode_acct_block(): drop @pages from the superblock's
 * used_blocks counter (when a max_blocks limit is set) and from the VM
 * accounting.
 */
static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);

        if (sbinfo->max_blocks) {
                percpu_counter_sub(&sbinfo->used_blocks, pages);
        }

        shmem_unacct_blocks(info->flags, pages);
}

/*
 * Forward declarations of the operation tables and the filesystem type.
 * They are declared here so that earlier code can take their addresses
 * (vma_is_shmem() below compares against &shmem_vm_ops).
 */
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

/* True when @vma's vm_ops is this file's shmem_vm_ops table. */
bool vma_is_shmem(struct vm_area_struct *vma)
{
        const struct vm_operations_struct *ops = vma->vm_ops;

        return ops == &shmem_vm_ops;
}

/*
 * NOTE(review): presumably the list of shmem inodes that have pages out on
 * swap, with shmem_swaplist_mutex serialising access to it - neither is
 * used in the part of the file visible here, so confirm against the users.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
/*
 * Size of the block of inode numbers a CPU takes from sbinfo->next_ino in
 * one go on SB_KERNMOUNT superblocks (see the per-cpu path below).
 */
#define SHMEM_INO_BATCH 1024
/*
 * Returns 0 on success, or -ENOSPC when the superblock has an inode limit
 * (max_inodes) and no free_inodes are left.  When @inop is non-NULL the new
 * inode number is stored through it.
 */
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        ino_t ino;

        if (!(sb->s_flags & SB_KERNMOUNT)) {
                /* Ordinary mount: limit check and ino allocation under stat_lock */
                spin_lock(&sbinfo->stat_lock);
                if (sbinfo->max_inodes) {
                        if (!sbinfo->free_inodes) {
                                spin_unlock(&sbinfo->stat_lock);
                                return -ENOSPC;
                        }
                        sbinfo->free_inodes--;
                }
                if (inop) {
                        ino = sbinfo->next_ino++;
                        /* Skip a number that is_zero_ino() rejects */
                        if (unlikely(is_zero_ino(ino)))
                                ino = sbinfo->next_ino++;
                        if (unlikely(!sbinfo->full_inums &&
                                     ino > UINT_MAX)) {
                                /*
                                 * Emulate get_next_ino uint wraparound for
                                 * compatibility
                                 */
                                if (IS_ENABLED(CONFIG_64BIT))
                                        pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
                                                __func__, MINOR(sb->s_dev));
                                /* Restart numbering from 1 */
                                sbinfo->next_ino = 1;
                                ino = sbinfo->next_ino++;
                        }
                        *inop = ino;
                }
                spin_unlock(&sbinfo->stat_lock);
        } else if (inop) {
                /*
                 * __shmem_file_setup, one of our callers, is lock-free: it
                 * doesn't hold stat_lock in shmem_reserve_inode since
                 * max_inodes is always 0, and is called from potentially
                 * unknown contexts. As such, use a per-cpu batched allocator
                 * which doesn't require the per-sb stat_lock unless we are at
                 * the batch boundary.
                 *
                 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
                 * shmem mounts are not exposed to userspace, so we don't need
                 * to worry about things like glibc compatibility.
                 */
                ino_t *next_ino;
                /* get_cpu()/put_cpu() bracket the use of this CPU's counter */
                next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
                ino = *next_ino;
                /* At a batch boundary: take a fresh batch from the superblock */
                if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
                        spin_lock(&sbinfo->stat_lock);
                        ino = sbinfo->next_ino;
                        sbinfo->next_ino += SHMEM_INO_BATCH;
                        spin_unlock(&sbinfo->stat_lock);
                        if (unlikely(is_zero_ino(ino)))
                                ino++;
                }
                *inop = ino;
                /* Remember the next number this CPU will hand out */
                *next_ino = ++ino;
                put_cpu();
        }

        return 0;
}

/*
 * Give one inode back to the superblock's free_inodes count.  Does nothing
 * when the superblock has no inode limit (max_inodes == 0).
 */
static void shmem_free_inode(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (!sbinfo->max_inodes)
                return;

        spin_lock(&sbinfo->stat_lock);
        sbinfo->free_inodes++;
        spin_unlock(&sbinfo->stat_lock);
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        long freed;

        /* Pages accounted as allocated that are neither cached nor swapped */
        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
        if (freed <= 0)
                return;

        /* Drop them from the inode's counters and from the block accounting */
        info->alloced -= freed;
        inode->i_blocks -= freed * BLOCKS_PER_PAGE;
        shmem_inode_unacct_blocks(inode, freed);
}

/*
 * Charge @pages extra pages to @inode.
 *
 * Returns false, with nothing changed, when shmem_inode_acct_block()
 * refuses the charge; otherwise bumps nrpages, alloced and i_blocks and
 * returns true.
 */
bool shmem_charge(struct inode *inode, long pages)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned long irqflags;

        if (!shmem_inode_acct_block(inode, pages))
                return false;

        /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
        inode->i_mapping->nrpages += pages;

        spin_lock_irqsave(&info->lock, irqflags);
        info->alloced += pages;
        inode->i_blocks += pages * BLOCKS_PER_PAGE;
        shmem_recalc_inode(inode);
        spin_unlock_irqrestore(&info->lock, irqflags);

        return true;
}

/*
 * Remove @pages pages from @inode's alloced and i_blocks counters, then
 * release the matching block accounting.
 */
void shmem_uncharge(struct inode *inode, long pages)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned long irqflags;

        /* nrpages adjustment done by __delete_from_page_cache() or caller */

        spin_lock_irqsave(&info->lock, irqflags);
        info->alloced -= pages;
        inode->i_blocks -= pages * BLOCKS_PER_PAGE;
        shmem_recalc_inode(inode);
        spin_unlock_irqrestore(&info->lock, irqflags);

        /* Block accounting is released outside info->lock */
        shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
                        pgoff_t index, void *expected, void *replacement)
{
        XA_STATE(xas, &mapping->i_pages, index);

        VM_BUG_ON(!expected);
        VM_BUG_ON(!replacement);

        /* Only swap the entry in if the slot still holds what we expect */
        if (xas_load(&xas) != expected)
                return -ENOENT;

        xas_store(&xas, replacement);
        return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
                               pgoff_t index, swp_entry_t swap)
{
        /* The slot must still hold exactly this swap entry */
        void *entry = xa_load(&mapping->i_pages, index);

        return entry == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *        disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *        enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *        only allocate huge pages if the page will be fully within i_size,
 *        also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *        only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER        0
#define SHMEM_HUGE_ALWAYS        1
#define SHMEM_HUGE_WITHIN_SIZE        2
#define SHMEM_HUGE_ADVISE        3

/*
 * Special values.
 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *        disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *        enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY                (-1)
#define SHMEM_HUGE_FORCE        (-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

/*
 * NOTE(review): presumably holds one of the SHMEM_HUGE_* values above as the
 * huge page policy selected through sysfs - it is not read or written in the
 * part of the file visible here, so confirm against its users.
 */
static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS)
/*
 * Translate a huge page policy keyword into its SHMEM_HUGE_* value.
 * Returns -EINVAL when @str matches none of the known keywords.
 */
static int shmem_parse_huge(const char *str)
{
        /* Keyword -> value pairs; the entry with a NULL name ends the table */
        static const struct {
                const char *name;
                int value;
        } keywords[] = {
                { "never",        SHMEM_HUGE_NEVER },
                { "always",        SHMEM_HUGE_ALWAYS },
                { "within_size", SHMEM_HUGE_WITHIN_SIZE },
                { "advise",        SHMEM_HUGE_ADVISE },
                { "deny",        SHMEM_HUGE_DENY },
                { "force",        SHMEM_HUGE_FORCE },
                { NULL,                0 },
        };
        int i;

        for (i = 0; keywords[i].name; i++) {
                if (!strcmp(str, keywords[i].name))
                        return keywords[i].value;
        }

        return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
/*
 * Translate a SHMEM_HUGE_* value into its keyword.  An unknown value
 * triggers VM_BUG_ON() and yields "bad_val".
 */
static const char *shmem_format_huge(int huge)
{
        const char *name;

        switch (huge) {
        case SHMEM_HUGE_NEVER:
                name = "never";
                break;
        case SHMEM_HUGE_ALWAYS:
                name = "always";
                break;
        case SHMEM_HUGE_WITHIN_SIZE:
                name = "within_size";
                break;
        case SHMEM_HUGE_ADVISE:
                name = "advise";
                break;
        case SHMEM_HUGE_DENY:
                name = "deny";
                break;
        case SHMEM_HUGE_FORCE:
                name = "force";
                break;
        default:
                VM_BUG_ON(1);
                name = "bad_val";
                break;
        }

        return name;
}
#endif

/*
 * Walk the superblock's shrinklist and try to split the huge page found at
 * the huge-page-aligned offset containing i_size of each listed inode.
 *
 * @sbinfo:      shmem superblock info that owns the shrinklist
 * @sc:          shrink control; sc->nr_to_scan is the batch of list entries
 *               to examine (128 is used when @sc is NULL)
 * @nr_to_split: once this many pages have been split, remaining inodes are
 *               moved back to the shrinklist; 0 means no limit
 *
 * Returns the number of huge pages split, or SHRINK_STOP if the shrinklist
 * was empty on entry.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
{
        LIST_HEAD(list), *pos, *next;
        LIST_HEAD(to_remove);
        struct inode *inode;
        struct shmem_inode_info *info;
        struct page *page;
        unsigned long batch = sc ? sc->nr_to_scan : 128;
        int split = 0;

        /* Unlocked peek: nothing queued, nothing to do */
        if (list_empty(&sbinfo->shrinklist))
                return SHRINK_STOP;

        /*
         * Phase 1: under shrinklist_lock, pull a batch of entries off the
         * global list and sort them onto the two local lists.
         */
        spin_lock(&sbinfo->shrinklist_lock);
        list_for_each_safe(pos, next, &sbinfo->shrinklist) {
                info = list_entry(pos, struct shmem_inode_info, shrinklist);

                /* pin the inode */
                inode = igrab(&info->vfs_inode);

                /* inode is about to be evicted */
                if (!inode) {
                        list_del_init(&info->shrinklist);
                        goto next;
                }

                /* Check if there's anything to gain */
                if (round_up(inode->i_size, PAGE_SIZE) ==
                                round_up(inode->i_size, HPAGE_PMD_SIZE)) {
                        list_move(&info->shrinklist, &to_remove);
                        goto next;
                }

                list_move(&info->shrinklist, &list);
next:
                /* Every visited entry has left the global list */
                sbinfo->shrinklist_len--;
                if (!--batch)
                        break;
        }
        spin_unlock(&sbinfo->shrinklist_lock);

        /*
         * Phase 2: inodes with nothing to gain are unlinked from the local
         * list and their igrab() reference is dropped.
         */
        list_for_each_safe(pos, next, &to_remove) {
                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;
                list_del_init(&info->shrinklist);
                iput(inode);
        }

        /* Phase 3: attempt the split for each remaining candidate */
        list_for_each_safe(pos, next, &list) {
                int ret;

                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;

                /* Split quota reached: requeue the rest untouched */
                if (nr_to_split && split >= nr_to_split)
                        goto move_back;

                /* Look up the page at the huge-page-aligned offset of i_size */
                page = find_get_page(inode->i_mapping,
                                (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
                if (!page)
                        goto drop;

                /* No huge page at the end of the file: nothing to split */
                if (!PageTransHuge(page)) {
                        put_page(page);
                        goto drop;
                }

                /*
                 * Move the inode on the list back to shrinklist if we failed
                 * to lock the page at this time.
                 *
                 * Waiting for the lock may lead to deadlock in the
                 * reclaim path.
                 */
                if (!trylock_page(page)) {
                        put_page(page);
                        goto move_back;
                }

                ret = split_huge_page(page);
                unlock_page(page);
                put_page(page);

                /* If split failed move the inode on the list back to shrinklist */
                if (ret)
                        goto move_back;

                split++;
drop:
                /* Done with this inode: take it off the local list */
                list_del_init(&info->shrinklist);
                goto put;
move_back:
                /*
                 * Make sure the inode is either on the global list or deleted
                 * from any local list before iput() since it could be deleted
                 * in another thread once we put the inode (then the local list
                 * is corrupted).
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                list_move(&info->shrinklist, &sbinfo->shrinklist);
                sbinfo->shrinklist_len++;
                spin_unlock(&sbinfo->shrinklist_lock);
put:
                /* Drop the reference taken by igrab() in phase 1 */
                iput(inode);
        }

        return split;
}

/*
 * Run shmem_unused_huge_shrink() with no split limit for this superblock.
 * Returns SHRINK_STOP without scanning when shrinklist_len reads as zero.
 */
static long shmem_unused_huge_scan(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbi = SHMEM_SB(sb);

        if (READ_ONCE(sbi->shrinklist_len))
                return shmem_unused_huge_shrink(sbi, sc, 0);

        return SHRINK_STOP;
}

/*
 * Report the current shrinklist length of this superblock (lockless read).
 * @sc is not used.
 */
static long shmem_unused_huge_count(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbi = SHMEM_SB(sb);

        return READ_ONCE(sbi->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE there is no shmem_huge variable:
 * the name expands to the constant SHMEM_HUGE_DENY.
 */
#define shmem_huge SHMEM_HUGE_DENY

/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE: ignores its
 * arguments and always returns 0.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
{
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Tell whether huge pages are enabled for the mount described by @sbinfo.
 *
 * That needs CONFIG_TRANSPARENT_HUGEPAGE, a global shmem_huge setting other
 * than SHMEM_HUGE_DENY, and either the global SHMEM_HUGE_FORCE or a non-zero
 * per-mount sbinfo->huge.
 */
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        /* The global "deny" setting overrides everything else */
        if (shmem_huge == SHMEM_HUGE_DENY)
                return false;

        /* The global "force" setting needs no per-mount option */
        if (shmem_huge == SHMEM_HUGE_FORCE)
                return true;

        return sbinfo->huge != 0;
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 *
 * @page:      locked, swap-backed head (or order-0) page to insert
 * @mapping:   address space to insert into
 * @index:     page offset; must be aligned to the number of subpages
 * @expected:  entry that must currently occupy the slot (NULL for an empty
 *             slot); must be NULL when @page is a transparent huge page
 * @gfp:       allocation flags for the memcg charge and XArray nodes
 * @charge_mm: mm passed to mem_cgroup_charge()
 *
 * Returns 0 on success.  On failure returns the error from
 * mem_cgroup_charge() or from the XArray operation (-EEXIST when the slot
 * does not hold @expected), with page->mapping reset to NULL and the extra
 * page references dropped again.
 */
static int shmem_add_to_page_cache(struct page *page,
                                   struct address_space *mapping,
                                   pgoff_t index, void *expected, gfp_t gfp,
                                   struct mm_struct *charge_mm)
{
        XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
        unsigned long i = 0;
        unsigned long nr = compound_nr(page);
        int error;

        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(index != round_down(index, nr), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
        VM_BUG_ON(expected && PageTransHuge(page));

        /* One reference per subpage slot; undone at the error label */
        page_ref_add(page, nr);
        page->mapping = mapping;
        page->index = index;

        /* Pages already in the swap cache are not charged here */
        if (!PageSwapCache(page)) {
                error = mem_cgroup_charge(page, charge_mm, gfp);
                if (error) {
                        if (PageTransHuge(page)) {
                                count_vm_event(THP_FILE_FALLBACK);
                                count_vm_event(THP_FILE_FALLBACK_CHARGE);
                        }
                        goto error;
                }
        }
        cgroup_throttle_swaprate(page, gfp);

        /* Repeat the locked insertion for as long as xas_nomem() asks for it */
        do {
                void *entry;
                xas_lock_irq(&xas);
                /* Whatever is in the range now must be exactly @expected */
                entry = xas_find_conflict(&xas);
                if (entry != expected)
                        xas_set_err(&xas, -EEXIST);
                xas_create_range(&xas);
                if (xas_error(&xas))
                        goto unlock;
next:
                /* Store the head page into each of the nr consecutive slots */
                xas_store(&xas, page);
                if (++i < nr) {
                        xas_next(&xas);
                        goto next;
                }
                if (PageTransHuge(page)) {
                        count_vm_event(THP_FILE_ALLOC);
                        __inc_node_page_state(page, NR_SHMEM_THPS);
                }
                mapping->nrpages += nr;
                __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
                __mod_lruvec_page_state(page, NR_SHMEM, nr);
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (xas_error(&xas)) {
                error = xas_error(&xas);
                goto error;
        }

        return 0;
error:
        /* Undo the mapping assignment and the references taken above */
        page->mapping = NULL;
        page_ref_sub(page, nr);
        return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 *
 * @page:    non-compound page to remove from its mapping
 * @radswap: entry stored in the page's slot in its place
 *
 * The replacement and the statistics updates happen under the i_pages lock;
 * one page reference is dropped afterwards.  A failure of
 * shmem_replace_entry() is fatal (BUG_ON).
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
        struct address_space *mapping = page->mapping;
        int error;

        VM_BUG_ON_PAGE(PageCompound(page), page);

        xa_lock_irq(&mapping->i_pages);
        error = shmem_replace_entry(mapping, page->index, page, radswap);
        page->mapping = NULL;
        mapping->nrpages--;
        __dec_lruvec_page_state(page, NR_FILE_PAGES);
        __dec_lruvec_page_state(page, NR_SHMEM);
        xa_unlock_irq(&mapping->i_pages);
        put_page(page);
        /* Checked only after the lock and the page reference are released */
        BUG_ON(error);
}

/*
 * Clear the swap entry @radswap stored at @index and release its swap slot
 * (and swap cache page).
 *
 * The slot is only cleared if it still holds @radswap.  Returns 0 when the
 * entry was removed, or -ENOENT when something else was found there.
 */
static int shmem_free_swap(struct address_space *mapping,
                           pgoff_t index, void *radswap)
{
        void *prev = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);

        if (prev == radswap) {
                free_swap_and_cache(radix_to_swp_entry(radswap));
                return 0;
        }

        return -ENOENT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * @mapping: address space of the shmem object
 * @start:   first page index to examine
 * @end:     page index just past the range; the walk covers up to @end - 1
 *
 * Returns the number of value (swap) entries found, scaled by PAGE_SIZE.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct page *page;
        unsigned long swapped = 0;

        rcu_read_lock();
        xas_for_each(&xas, page, end - 1) {
                if (xas_retry(&xas, page))
                        continue;
                /* Value entries in a shmem mapping are swap entries */
                if (xa_is_value(page))
                        swapped++;

                /* Pause the walk before yielding under RCU on long ranges */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }

        rcu_read_unlock();

        return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct address_space *mapping = inode->i_mapping;
        unsigned long swapped;

        /* Be careful as we don't hold info->lock */
        swapped = READ_ONCE(info->swapped);

        /*
         * The easier cases are when the shmem object has nothing in swap, or
         * the vma maps it whole. Then we can simply use the stats that we
         * already track.
         */
        if (!swapped)
                return 0;

        if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
                return swapped << PAGE_SHIFT;

        /* Here comes the more involved part */
        return shmem_partial_swap_usage(mapping,
                        linear_page_index(vma, vma->vm_start),
                        linear_page_index(vma, vma->vm_end));
}

/*
 * SysV IPC SHM_UNLOCK: hand the mapping's pages, batch by batch, to
 * check_move_unevictable_pages() so they can return to evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        pgoff_t next_index = 0;

        pagevec_init(&pvec);

        for (;;) {
                /*
                 * Minor point, but we might as well stop if someone else
                 * SHM_LOCKs it.
                 */
                if (mapping_unevictable(mapping))
                        break;

                /*
                 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
                 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
                 */
                pvec.nr = find_get_entries(mapping, next_index,
                                           PAGEVEC_SIZE, pvec.pages, indices);
                if (pvec.nr == 0)
                        break;

                /* Resume after the last entry of this batch */
                next_index = indices[pvec.nr - 1] + 1;

                pagevec_remove_exceptionals(&pvec);
                check_move_unevictable_pages(&pvec);
                pagevec_release(&pvec);
                cond_resched();
        }
}

/*
 * Decide whether a hole-punch or truncation may go ahead on @page, splitting
 * a huge page first when that is required.  Returns true if no split was
 * needed or the split succeeded, false if the split failed.
 *
 * Eviction (or truncation to 0 size) should never need to split a huge page;
 * but in rare cases might do so, if shmem_undo_range() failed to trylock on
 * head, and then succeeded to trylock on tail.
 *
 * A split can only succeed when there are no additional references on the
 * huge page: so the split below relies upon find_get_entries() having stopped
 * when it found a subpage of the huge page, without getting further references.
 */
static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
{
        bool wholly_inside;

        /* Ordinary pages never need splitting */
        if (!PageTransCompound(page))
                return true;

        /* A huge page wholly within the range punched is deleted as a unit */
        wholly_inside = PageHead(page) &&
                        page->index >= start &&
                        page->index + HPAGE_PMD_NR <= end;
        if (wholly_inside)
                return true;

        /* Otherwise split it, so we can truly punch the hole or truncate */
        return split_huge_page(page) >= 0;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @inode:    shmem inode to operate on
 * @lstart:   first byte of the range
 * @lend:     last byte of the range (inclusive); -1 means "to the very end"
 * @unfalloc: when true, swap entries are left alone and only pages that are
 *            not Uptodate are removed
 *
 * Works in two passes over the whole pages of the range: a first pass that
 * only trylocks pages, then (after zeroing the partial pages at either edge)
 * a second pass that waits for page locks and retries on races.  Finally
 * info->swapped is reduced by the number of swap entries freed.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                                                 bool unfalloc)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        /* [start, end) are the page indices lying wholly inside the range */
        pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        pgoff_t end = (lend + 1) >> PAGE_SHIFT;
        /* Byte offsets within the edge pages; 0 when the edge is page aligned */
        unsigned int partial_start = lstart & (PAGE_SIZE - 1);
        unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
        struct pagevec pvec;
        pgoff_t indices[PAGEVEC_SIZE];
        long nr_swaps_freed = 0;
        pgoff_t index;
        int i;

        if (lend == -1)
                end = -1;        /* unsigned, so actually very big */

        /* Pass 1: opportunistic, never sleeps on a page lock */
        pagevec_init(&pvec);
        index = start;
        while (index < end) {
                pvec.nr = find_get_entries(mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE),
                        pvec.pages, indices);
                if (!pvec.nr)
                        break;
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        index = indices[i];
                        if (index >= end)
                                break;

                        /* Swap entry: free it unless undoing a fallocate */
                        if (xa_is_value(page)) {
                                if (unfalloc)
                                        continue;
                                nr_swaps_freed += !shmem_free_swap(mapping,
                                                                index, page);
                                continue;
                        }

                        VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

                        /* Busy pages are left for the second pass */
                        if (!trylock_page(page))
                                continue;

                        if ((!unfalloc || !PageUptodate(page)) &&
                            page_mapping(page) == mapping) {
                                VM_BUG_ON_PAGE(PageWriteback(page), page);
                                if (shmem_punch_compound(page, start, end))
                                        truncate_inode_page(mapping, page);
                        }
                        unlock_page(page);
                }
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                cond_resched();
                /* Continue after the last index looked at */
                index++;
        }

        /*
         * Zero the tail of the page containing lstart when lstart is not
         * page aligned (that page is start - 1).
         */
        if (partial_start) {
                struct page *page = NULL;
                shmem_getpage(inode, start - 1, &page, SGP_READ);
                if (page) {
                        unsigned int top = PAGE_SIZE;
                        /*
                         * start > end: the range ends inside this same page,
                         * so zero only up to partial_end and skip the
                         * separate partial_end step below.
                         */
                        if (start > end) {
                                top = partial_end;
                                partial_end = 0;
                        }
                        zero_user_segment(page, partial_start, top);
                        set_page_dirty(page);
                        unlock_page(page);
                        put_page(page);
                }
        }
        /* Zero the head of the page in which the range ends part-way */
        if (partial_end) {
                struct page *page = NULL;
                shmem_getpage(inode, end, &page, SGP_READ);
                if (page) {
                        zero_user_segment(page, 0, partial_end);
                        set_page_dirty(page);
                        unlock_page(page);
                        put_page(page);
                }
        }
        /* No whole pages in the range: nothing left for the second pass */
        if (start >= end)
                return;

        /* Pass 2: wait for page locks and retry until the range is clean */
        index = start;
        while (index < end) {
                cond_resched();

                pvec.nr = find_get_entries(mapping, index,
                                min(end - index, (pgoff_t)PAGEVEC_SIZE),
                                pvec.pages, indices);
                if (!pvec.nr) {
                        /* If all gone or hole-punch or unfalloc, we're done */
                        if (index == start || end != -1)
                                break;
                        /* But if truncating, restart to make sure all gone */
                        index = start;
                        continue;
                }
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        index = indices[i];
                        if (index >= end)
                                break;

                        if (xa_is_value(page)) {
                                if (unfalloc)
                                        continue;
                                if (shmem_free_swap(mapping, index, page)) {
                                        /* Swap was replaced by page: retry */
                                        index--;
                                        break;
                                }
                                nr_swaps_freed++;
                                continue;
                        }

                        lock_page(page);

                        if (!unfalloc || !PageUptodate(page)) {
                                if (page_mapping(page) != mapping) {
                                        /* Page was replaced by swap: retry */
                                        unlock_page(page);
                                        index--;
                                        break;
                                }
                                VM_BUG_ON_PAGE(PageWriteback(page), page);
                                if (shmem_punch_compound(page, start, end))
                                        truncate_inode_page(mapping, page);
                                else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                                        /* Wipe the page and don't get stuck */
                                        clear_highpage(page);
                                        flush_dcache_page(page);
                                        set_page_dirty(page);
                                        /*
                                         * Move start past this page, so that
                                         * a restart from start above does not
                                         * come back to it.
                                         */
                                        if (index <
                                            round_up(start, HPAGE_PMD_NR))
                                                start = index + 1;
                                }
                        }
                        unlock_page(page);
                }
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                index++;
        }

        /* Account for the swap entries freed by both passes */
        spin_lock_irq(&info->lock);
        info->swapped -= nr_swaps_freed;
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);
}

/*
 * Truncate or hole-punch the byte range [@lstart, @lend] of a shmem inode
 * by calling shmem_undo_range() with unfalloc == false, then set both ctime
 * and mtime of the inode to the current time.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        shmem_undo_range(inode, lstart, lend, false);
        inode->i_ctime = inode->i_mtime = current_time(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * Fill @stat for a shmem inode.
 *
 * The inode accounting is recalculated first if alloced - swapped no longer
 * matches the mapping's nrpages.  When huge pages are enabled for the mount,
 * the reported block size is HPAGE_PMD_SIZE.  @request_mask and @query_flags
 * are not used.  Always returns 0.
 */
static int shmem_getattr(const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;
        struct shmem_inode_info *info = SHMEM_I(inode);
        bool stale;

        /* Unlocked check; the recalculation itself runs under info->lock */
        stale = info->alloced - info->swapped != inode->i_mapping->nrpages;
        if (stale) {
                spin_lock_irq(&info->lock);
                shmem_recalc_inode(inode);
                spin_unlock_irq(&info->lock);
        }

        generic_fillattr(inode, stat);

        if (is_huge_enabled(SHMEM_SB(inode->i_sb)))
                stat->blksize = HPAGE_PMD_SIZE;

        return 0;
}

/*
 * Apply attribute changes to a shmem inode, including size changes of
 * regular files.
 *
 * Returns 0 on success; otherwise the error from setattr_prepare(),
 * shmem_reacct_size() or posix_acl_chmod(), or -EPERM when a seal
 * (F_SEAL_SHRINK / F_SEAL_GROW) forbids the requested size change.
 */
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        int error;

        error = setattr_prepare(dentry, attr);
        if (error)
                return error;

        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;

                /* protected by i_mutex */
                if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
                    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
                        return -EPERM;

                /* Re-account and publish the new size before truncating */
                if (newsize != oldsize) {
                        error = shmem_reacct_size(SHMEM_I(inode)->flags,
                                        oldsize, newsize);
                        if (error)
                                return error;
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = current_time(inode);
                }
                if (newsize <= oldsize) {
                        /* First byte past the last page still (partly) in use */
                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
                        if (info->alloced)
                                shmem_truncate_range(inode,
                                                        newsize, (loff_t)-1);
                        /* unmap again to remove racily COWed private pages */
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);

                        /*
                         * Part of the huge page can be beyond i_size: subject
                         * to shrink under memory pressure.
                         */
                        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                                spin_lock(&sbinfo->shrinklist_lock);
                                /*
                                 * _careful to defend against unlocked access to
                                 * ->shrinklist in shmem_unused_huge_shrink()
                                 */
                                if (list_empty_careful(&info->shrinklist)) {
                                        list_add_tail(&info->shrinklist,
                                                        &sbinfo->shrinklist);
                                        sbinfo->shrinklist_len++;
                                }
                                spin_unlock(&sbinfo->shrinklist_lock);
                        }
                }
        }

        setattr_copy(inode, attr);
        /* error is still 0 here unless posix_acl_chmod() changes it */
        if (attr->ia_valid & ATTR_MODE)
                error = posix_acl_chmod(inode, inode->i_mode);
        return error;
}

/*
 * Tear down a shmem inode that is being evicted.
 *
 * For inodes whose mapping uses shmem_aops: release the size accounting,
 * truncate everything, and take the inode off the superblock shrinklist and
 * the global swaplist.  For every inode: free xattrs, release the inode
 * count via shmem_free_inode() and clear the inode.
 */
static void shmem_evict_inode(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

        if (inode->i_mapping->a_ops == &shmem_aops) {
                shmem_unacct_size(info->flags, inode->i_size);
                inode->i_size = 0;
                shmem_truncate_range(inode, 0, (loff_t)-1);
                /* Unlocked peek first, then re-check under the lock */
                if (!list_empty(&info->shrinklist)) {
                        spin_lock(&sbinfo->shrinklist_lock);
                        if (!list_empty(&info->shrinklist)) {
                                list_del_init(&info->shrinklist);
                                sbinfo->shrinklist_len--;
                        }
                        spin_unlock(&sbinfo->shrinklist_lock);
                }
                while (!list_empty(&info->swaplist)) {
                        /* Wait while shmem_unuse() is scanning this inode... */
                        wait_var_event(&info->stop_eviction,
                                       !atomic_read(&info->stop_eviction));
                        mutex_lock(&shmem_swaplist_mutex);
                        /* ...but beware of the race if we peeked too early */
                        if (!atomic_read(&info->stop_eviction))
                                list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
        }

        simple_xattrs_free(&info->xattrs);
        /* Warn if blocks are still accounted to the inode at this point */
        WARN_ON(inode->i_blocks);
        shmem_free_inode(inode->i_sb);
        clear_inode(inode);
}

/* Defined outside this file; indexed by swap type below */
extern struct swap_info_struct *swap_info[];

/*
 * Collect swap entries of one swap type from a shmem mapping.
 *
 * @mapping:    address space to scan
 * @start:      page index at which the scan begins
 * @nr_entries: capacity of @entries and @indices; the scan stops once full
 * @entries:    receives the matching value (swap) entries
 * @indices:    receives the page index of each collected entry
 * @type:       only entries whose swp_type() equals this are collected
 * @frontswap:  when true, additionally require frontswap_test() to succeed
 *              for the entry's swap offset
 *
 * The walk runs under rcu_read_lock().  Returns the number of entries
 * stored (0 if @nr_entries is 0).
 */
static int shmem_find_swap_entries(struct address_space *mapping,
                                   pgoff_t start, unsigned int nr_entries,
                                   struct page **entries, pgoff_t *indices,
                                   unsigned int type, bool frontswap)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct page *page;
        swp_entry_t entry;
        unsigned int ret = 0;

        if (!nr_entries)
                return 0;

        rcu_read_lock();
        xas_for_each(&xas, page, ULONG_MAX) {
                if (xas_retry(&xas, page))
                        continue;

                /* Only value entries are swap entries; skip real pages */
                if (!xa_is_value(page))
                        continue;

                entry = radix_to_swp_entry(page);
                if (swp_type(entry) != type)
                        continue;
                if (frontswap &&
                    !frontswap_test(swap_info[type], swp_offset(entry)))
                        continue;

                indices[ret] = xas.xa_index;
                entries[ret] = page;

                /* Pause the walk before yielding under RCU */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
                if (++ret == nr_entries)
                        break;
        }
        rcu_read_unlock();

        return ret;
}

/*
 * Swap in every swap entry of @pvec for @inode, using the page indices in
 * @indices.
 *
 * Returns the number of entries successfully swapped in, or -ENOMEM as soon
 * as one swap-in fails with that error.  Any other swap-in failure is
 * ignored and the entry is simply not counted.
 */
static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
                                    pgoff_t *indices)
{
        struct address_space *mapping = inode->i_mapping;
        int nr_swapped_in = 0;
        int i;

        for (i = 0; i < pvec.nr; i++) {
                struct page *page = pvec.pages[i];
                int err;

                if (!xa_is_value(page))
                        continue;

                err = shmem_swapin_page(inode, indices[i],
                                        &page, SGP_CACHE,
                                        mapping_gfp_mask(mapping),
                                        NULL, NULL);

                /* Out of memory is the only failure that aborts the batch */
                if (err == -ENOMEM)
                        return err;
                if (err)
                        continue;

                unlock_page(page);
                put_page(page);
                nr_swapped_in++;
        }

        return nr_swapped_in;
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 *
 * @inode:             shmem inode to scan
 * @type:              swap type whose entries are to be brought back in
 * @frontswap:         passed on to shmem_find_swap_entries()
 * @fs_pages_to_unuse: when @frontswap is set and this is non-zero on entry,
 *                     a budget of pages that is decremented as entries are
 *                     swapped in
 *
 * Returns 0 once no more matching entries are found, a negative error from
 * shmem_unuse_swap_entries(), or FRONTSWAP_PAGES_UNUSED when the frontswap
 * budget has been used up.
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
                             bool frontswap, unsigned long *fs_pages_to_unuse)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start = 0;
        struct pagevec pvec;
        pgoff_t indices[PAGEVEC_SIZE];
        bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
        int ret = 0;

        pagevec_init(&pvec);
        do {
                unsigned int nr_entries = PAGEVEC_SIZE;

                /* Never look up more entries than the remaining budget */
                if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
                        nr_entries = *fs_pages_to_unuse;

                pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
                                                  pvec.pages, indices,
                                                  type, frontswap);
                if (pvec.nr == 0) {
                        ret = 0;
                        break;
                }

                ret = shmem_unuse_swap_entries(inode, pvec, indices);
                if (ret < 0)
                        break;

                if (frontswap_partial) {
                        *fs_pages_to_unuse -= ret;
                        if (*fs_pages_to_unuse == 0) {
                                ret = FRONTSWAP_PAGES_UNUSED;
                                break;
                        }
                }

                /* The next lookup starts again at the last index of this batch */
                start = indices[pvec.nr - 1];
        } while (true);

        return ret;
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 *
 * @type:              swap type to empty
 * @frontswap:         passed through to shmem_unuse_inode()
 * @fs_pages_to_unuse: passed through to shmem_unuse_inode()
 *
 * Walks shmem_swaplist, dropping inodes that have nothing swapped.  Returns
 * 0, or the first non-zero value returned by shmem_unuse_inode(), at which
 * point the walk stops.
 */
int shmem_unuse(unsigned int type, bool frontswap,
                unsigned long *fs_pages_to_unuse)
{
        struct shmem_inode_info *info, *next;
        int error = 0;

        /* Unlocked peek: no shmem inode has swap */
        if (list_empty(&shmem_swaplist))
                return 0;

        mutex_lock(&shmem_swaplist_mutex);
        list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
                if (!info->swapped) {
                        list_del_init(&info->swaplist);
                        continue;
                }
                /*
                 * Drop the swaplist mutex while searching the inode for swap;
                 * but before doing so, make sure shmem_evict_inode() will not
                 * remove placeholder inode from swaplist, nor let it be freed
                 * (igrab() would protect from unlink, but not from unmount).
                 */
                atomic_inc(&info->stop_eviction);
                mutex_unlock(&shmem_swaplist_mutex);

                error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
                                          fs_pages_to_unuse);
                cond_resched();

                mutex_lock(&shmem_swaplist_mutex);
                /* Re-read the successor: the list may have changed meanwhile */
                next = list_next_entry(info, swaplist);
                if (!info->swapped)
                        list_del_init(&info->swaplist);
                /* Let a waiting shmem_evict_inode() proceed */
                if (atomic_dec_and_test(&info->stop_eviction))
                        wake_up_var(&info->stop_eviction);
                if (error)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);

        return error;
}

/*
 * Move the page from the page cache to the swap cache.
 *
 * @page must be locked and must not be a compound page (both are asserted).
 *
 * On success the page has been inserted into the swap cache, removed from
 * the shmem page cache, passed to swap_writepage(), and 0 is returned.
 *
 * Whenever the page cannot or should not go to swap it is re-dirtied
 * instead: reclaim callers (wbc->for_reclaim) then get
 * AOP_WRITEPAGE_ACTIVATE with the page still locked, anyone else gets 0
 * with the page unlocked.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
        struct shmem_inode_info *info;
        struct address_space *mapping;
        struct inode *inode;
        swp_entry_t swap;
        pgoff_t index;

        VM_BUG_ON_PAGE(PageCompound(page), page);
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
        index = page->index;
        inode = mapping->host;
        info = SHMEM_I(inode);
        /* Inode flagged VM_LOCKED (see shmem_lock()): never push it to swap. */
        if (info->flags & VM_LOCKED)
                goto redirty;
        /* total_swap_pages is zero: there is nowhere to write the page. */
        if (!total_swap_pages)
                goto redirty;

        /*
         * Our capabilities prevent regular writeback or sync from ever calling
         * shmem_writepage; but a stacking filesystem might use ->writepage of
         * its underlying filesystem, in which case tmpfs should write out to
         * swap only in response to memory pressure, and not for the writeback
         * threads or sync.
         */
        if (!wbc->for_reclaim) {
                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
                goto redirty;
        }

        /*
         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
         * value into swapfile.c, the only way we can correctly account for a
         * fallocated page arriving here is now to initialize it and write it.
         *
         * That's okay for a page already fallocated earlier, but if we have
         * not yet completed the fallocation, then (a) we want to keep track
         * of this page in case we have to undo it, and (b) it may not be a
         * good idea to continue anyway, once we're pushing into swap.  So
         * reactivate the page, and let shmem_fallocate() quit when too many.
         */
        if (!PageUptodate(page)) {
                /* Unlocked peek; i_private is re-read under i_lock below. */
                if (inode->i_private) {
                        struct shmem_falloc *shmem_falloc;
                        spin_lock(&inode->i_lock);
                        shmem_falloc = inode->i_private;
                        /*
                         * Count the page only when there is no waitq and the
                         * index lies within [start, next) of the operation
                         * in progress; otherwise forget the pointer so the
                         * page is initialized and written below.
                         */
                        if (shmem_falloc &&
                            !shmem_falloc->waitq &&
                            index >= shmem_falloc->start &&
                            index < shmem_falloc->next)
                                shmem_falloc->nr_unswapped++;
                        else
                                shmem_falloc = NULL;
                        spin_unlock(&inode->i_lock);
                        if (shmem_falloc)
                                goto redirty;
                }
                /* Zero-fill the never-written page so it can be swapped out. */
                clear_highpage(page);
                flush_dcache_page(page);
                SetPageUptodate(page);
        }

        swap = get_swap_page(page);
        /* A zero swap entry means no slot could be allocated. */
        if (!swap.val)
                goto redirty;

        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
         * if it's not already there.  Do it now before the page is
         * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
         * we've incremented swapped, because shmem_unuse_inode() will
         * prune a !swapped inode from the swaplist under this mutex.
         */
        mutex_lock(&shmem_swaplist_mutex);
        if (list_empty(&info->swaplist))
                list_add(&info->swaplist, &shmem_swaplist);

        if (add_to_swap_cache(page, swap,
                        __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
                        NULL) == 0) {
                spin_lock_irq(&info->lock);
                shmem_recalc_inode(inode);
                info->swapped++;
                spin_unlock_irq(&info->lock);

                swap_shmem_alloc(swap);
                /* Leave the swap entry in the page's old page cache slot. */
                shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

                mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(page_mapped(page));
                swap_writepage(page, wbc);
                return 0;
        }

        /* Swap cache insertion failed: give the swap slot back and redirty. */
        mutex_unlock(&shmem_swaplist_mutex);
        put_swap_page(page, swap);
redirty:
        set_page_dirty(page);
        if (wbc->for_reclaim)
                return AOP_WRITEPAGE_ACTIVATE;        /* Return with page locked */
        unlock_page(page);
        return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
/*
 * Append ",mpol=<policy>" to @seq for a non-default mempolicy.  A NULL
 * policy, or one whose mode is MPOL_DEFAULT, prints nothing.
 */
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
        char text[64];

        if (mpol && mpol->mode != MPOL_DEFAULT) {
                mpol_to_str(text, sizeof(text), mpol);
                seq_printf(seq, ",mpol=%s", text);
        }
}

/*
 * Return the superblock's mempolicy with a reference taken via mpol_get(),
 * or NULL when the superblock has none.
 */
static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        struct mempolicy *policy;

        if (!sbinfo->mpol)
                return NULL;

        /* stat_lock guards against the policy being replaced while in use */
        spin_lock(&sbinfo->stat_lock);
        policy = sbinfo->mpol;
        mpol_get(policy);
        spin_unlock(&sbinfo->stat_lock);

        return policy;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub used when CONFIG_NUMA or CONFIG_TMPFS is off: prints nothing. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stub used when CONFIG_NUMA or CONFIG_TMPFS is off: always returns NULL. */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
/*
 * Without CONFIG_NUMA, alias the name vm_policy to vm_private_data so that
 * code in this file which writes vma->vm_policy on a pseudo vma still
 * compiles.
 * NOTE(review): presumably vm_area_struct only has a vm_policy member
 * under CONFIG_NUMA - confirm against the struct definition.
 */
#ifndef CONFIG_NUMA
#define vm_policy vm_private_data
#endif

/*
 * Set up @vma as a throwaway vma that carries nothing but the mempolicy
 * found for @index in @info's shared policy.  Pair every call with
 * shmem_pseudo_vma_destroy() to drop the policy reference again.
 */
static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
                struct shmem_inode_info *info, pgoff_t index)
{
        struct mempolicy *policy;

        vma_init(vma, NULL);

        policy = mpol_shared_policy_lookup(&info->policy, index);
        vma->vm_policy = policy;

        /* Offset by the inode number so interleaving spreads across nodes */
        vma->vm_pgoff = info->vfs_inode.i_ino + index;
}

/*
 * Tear down a vma prepared by shmem_pseudo_vma_init(): release the policy
 * reference obtained from mpol_shared_policy_lookup().
 */
static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
        struct mempolicy *policy = vma->vm_policy;

        mpol_cond_put(policy);
}

/*
 * Read the page for swap entry @swap back in through
 * swap_cluster_readahead(), using a pseudo vma that carries the mempolicy
 * found for @index in @info's shared policy.
 *
 * Returns whatever swap_cluster_readahead() returns; callers treat NULL
 * as failure.
 */
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
{
        struct vm_area_struct pvma;
        struct page *page;
        /*
         * Initialize the whole vm_fault: it is handed by pointer to the
         * readahead code, so no member may be left as stack garbage.  Only
         * the vma is meaningful here; address and everything else are zero.
         */
        struct vm_fault vmf = {
                .vma = &pvma,
                .address = 0,
        };

        shmem_pseudo_vma_init(&pvma, info, index);
        page = swap_cluster_readahead(swap, gfp, &vmf);
        shmem_pseudo_vma_destroy(&pvma);

        return page;
}

/*
 * Try to allocate a PMD-sized huge page covering @index.
 *
 * Returns NULL without allocating when anything is already present in the
 * huge-page-aligned range of the mapping around @index, and NULL (after
 * counting THP_FILE_FALLBACK) when the allocation itself fails.
 */
static struct page *shmem_alloc_hugepage(gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index)
{
        struct address_space *mapping = info->vfs_inode.i_mapping;
        const gfp_t huge_gfp = gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
        pgoff_t hindex = round_down(index, HPAGE_PMD_NR);
        struct vm_area_struct pvma;
        struct page *hpage;

        /* An existing entry anywhere in the aligned range rules out a huge page */
        if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
                    XA_PRESENT) != NULL)
                return NULL;

        shmem_pseudo_vma_init(&pvma, info, hindex);
        hpage = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, &pvma, 0,
                                numa_node_id(), true);
        shmem_pseudo_vma_destroy(&pvma);

        if (!hpage) {
                count_vm_event(THP_FILE_FALLBACK);
                return NULL;
        }

        prep_transhuge_page(hpage);
        return hpage;
}

/*
 * Allocate one page for @index with alloc_page_vma(), using a pseudo vma
 * that carries the mempolicy looked up in @info's shared policy.
 * Returns the page, or NULL if the allocation failed.
 */
static struct page *shmem_alloc_page(gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
{
        struct vm_area_struct policy_vma;
        struct page *new_page;

        shmem_pseudo_vma_init(&policy_vma, info, index);
        new_page = alloc_page_vma(gfp, &policy_vma, 0);
        shmem_pseudo_vma_destroy(&policy_vma);

        return new_page;
}

/*
 * Account for, then allocate, a page (or huge page when @huge) for @index.
 *
 * Returns the new page with PageLocked and PageSwapBacked set, or
 * ERR_PTR(-ENOSPC) when the block accounting refuses, or ERR_PTR(-ENOMEM)
 * when the allocation fails (the accounting is undone in that case).
 */
static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
                struct inode *inode,
                pgoff_t index, bool huge)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct page *page;
        int nr = 1;

        /* A huge request is only honoured when THP support is built in */
        huge = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && huge;
        if (huge)
                nr = HPAGE_PMD_NR;

        if (!shmem_inode_acct_block(inode, nr))
                return ERR_PTR(-ENOSPC);

        page = huge ? shmem_alloc_hugepage(gfp, info, index)
                    : shmem_alloc_page(gfp, info, index);
        if (!page) {
                shmem_inode_unacct_blocks(inode, nr);
                return ERR_PTR(-ENOMEM);
        }

        __SetPageLocked(page);
        __SetPageSwapBacked(page);
        return page;
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
{
        enum zone_type page_zone_idx = page_zonenum(page);
        enum zone_type highest_allowed = gfp_zone(gfp);

        /* Replace only when the page's zone is above what @gfp permits */
        return page_zone_idx > highest_allowed;
}

/*
 * Replace the swapcache page *pagep by a freshly allocated copy.
 *
 * A new page is allocated with @gfp (minus GFP_CONSTRAINT_MASK), given the
 * contents and swap entry of the old page, and substituted for it in the
 * swap address_space.  On success *pagep points to the new page, which has
 * been locked here, and the old page is unlocked and released.  If the
 * substitution fails, *pagep is left alone and the new page is discarded
 * instead.
 *
 * Returns 0 on success, -ENOMEM if no page could be allocated (nothing is
 * changed in that case), or the error from shmem_replace_entry().
 */
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index)
{
        struct page *oldpage, *newpage;
        struct address_space *swap_mapping;
        swp_entry_t entry;
        pgoff_t swap_index;
        int error;

        oldpage = *pagep;
        /* The swap entry of a swapcache page is kept in page_private. */
        entry.val = page_private(oldpage);
        swap_index = swp_offset(entry);
        swap_mapping = page_mapping(oldpage);

        /*
         * We have arrived here because our zones are constrained, so don't
         * limit chance of success by further cpuset and node constraints.
         */
        gfp &= ~GFP_CONSTRAINT_MASK;
        newpage = shmem_alloc_page(gfp, info, index);
        if (!newpage)
                return -ENOMEM;

        /*
         * Extra reference, on top of the one from allocation.
         * NOTE(review): presumably this stands for the swapcache's hold on
         * the page - confirm.
         */
        get_page(newpage);
        copy_highpage(newpage, oldpage);
        flush_dcache_page(newpage);

        /* Dress newpage up as a locked, up-to-date swapcache page for 'entry'. */
        __SetPageLocked(newpage);
        __SetPageSwapBacked(newpage);
        SetPageUptodate(newpage);
        set_page_private(newpage, entry.val);
        SetPageSwapCache(newpage);

        /*
         * Our caller will very soon move newpage out of swapcache, but it's
         * a nice clean interface for us to replace oldpage by newpage there.
         */
        xa_lock_irq(&swap_mapping->i_pages);
        error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
        if (!error) {
                /* Move the memcg charge and the NR_FILE_PAGES count across. */
                mem_cgroup_migrate(oldpage, newpage);
                __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
                __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
        }
        xa_unlock_irq(&swap_mapping->i_pages);

        if (unlikely(error)) {
                /*
                 * Is this possible?  I think not, now that our callers check
                 * both PageSwapCache and page_private after getting page lock;
                 * but be defensive.  Reverse old to newpage for clear and free.
                 */
                oldpage = newpage;
        } else {
                lru_cache_add(newpage);
                *pagep = newpage;
        }

        /*
         * From here on 'oldpage' is whichever page is being discarded:
         * the original on success, the unused copy on failure.
         */
        ClearPageSwapCache(oldpage);
        set_page_private(oldpage, 0);

        unlock_page(oldpage);
        /*
         * Two references are dropped.
         * NOTE(review): presumably one is the swapcache's and the other the
         * caller's (or, on failure, the two taken on newpage above) - confirm.
         */
        put_page(oldpage);
        put_page(oldpage);
        return error;
}

/*
 * Swap in the page pointed to by *pagep.
 * Caller has to make sure that *pagep contains a valid swapped page,
 * i.e. the value entry found in the mapping at @index (this is asserted).
 * Returns 0 and the page in pagep if success. On failure, returns the
 * error code and NULL in *pagep.
 *
 * Error codes produced here:
 *   -ENOMEM  shmem_swapin() returned no page;
 *   -EIO     the page from swap is not up to date;
 *   -EEXIST  the page or the mapping no longer matches the swap entry
 *            (shmem_getpage_gfp() retries on this);
 * plus whatever shmem_replace_page() or shmem_add_to_page_cache() report.
 * Any of the failures routed through 'failed:' is turned into -EEXIST if
 * the swap entry is by then no longer at @index.
 *
 * @fault_type may be NULL; when it is not, VM_FAULT_MAJOR is or'ed in if
 * the page was not found in the swap cache.
 */
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
                             struct page **pagep, enum sgp_type sgp,
                             gfp_t gfp, struct vm_area_struct *vma,
                             vm_fault_t *fault_type)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
        struct page *page;
        swp_entry_t swap;
        int error;

        VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
        swap = radix_to_swp_entry(*pagep);
        /* From now on *pagep stays NULL unless we succeed. */
        *pagep = NULL;

        /* Look it up and read it in.. */
        page = lookup_swap_cache(swap, NULL, 0);
        if (!page) {
                /* Or update major stats only when swapin succeeds?? */
                if (fault_type) {
                        *fault_type |= VM_FAULT_MAJOR;
                        count_vm_event(PGMAJFAULT);
                        count_memcg_event_mm(charge_mm, PGMAJFAULT);
                }
                /* Here we actually start the io */
                page = shmem_swapin(swap, gfp, info, index);
                if (!page) {
                        error = -ENOMEM;
                        goto failed;
                }
        }

        /* We have to do this with page locked to prevent races */
        lock_page(page);
        /*
         * Recheck under the page lock that this is still the swapcache page
         * for 'swap' and that the mapping still holds 'swap' at @index.
         */
        if (!PageSwapCache(page) || page_private(page) != swap.val ||
            !shmem_confirm_swap(mapping, index, swap)) {
                error = -EEXIST;
                goto unlock;
        }
        if (!PageUptodate(page)) {
                error = -EIO;
                goto failed;
        }
        wait_on_page_writeback(page);

        /*
         * Some architectures may have to restore extra metadata to the
         * physical page after reading from swap.
         */
        arch_swap_restore(swap, page);

        /* The page sits in a zone @gfp does not allow: copy it to a new one. */
        if (shmem_should_replace_page(page, gfp)) {
                error = shmem_replace_page(&page, gfp, info, index);
                if (error)
                        goto failed;
        }

        /* Insert the page at @index, expecting to find the swap entry there. */
        error = shmem_add_to_page_cache(page, mapping, index,
                                        swp_to_radix_entry(swap), gfp,
                                        charge_mm);
        if (error)
                goto failed;

        spin_lock_irq(&info->lock);
        info->swapped--;
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);

        if (sgp == SGP_WRITE)
                mark_page_accessed(page);

        /*
         * The page now lives in the page cache only: take it out of the
         * swap cache, mark it dirty and release the swap entry.
         */
        delete_from_swap_cache(page);
        set_page_dirty(page);
        swap_free(swap);

        *pagep = page;
        return 0;
failed:
        /* If the swap entry has gone from @index, report -EEXIST instead. */
        if (!shmem_confirm_swap(mapping, index, swap))
                error = -EEXIST;
unlock:
        /* page is NULL here only when shmem_swapin() failed. */
        if (page) {
                unlock_page(page);
                put_page(page);
        }

        return error;
}

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault:
 * otherwise they are NULL.
 *
 * Returns 0 with the result in *pagep, or a negative errno:
 *   -EFBIG   @index lies beyond MAX_LFS_FILESIZE;
 *   -EINVAL  @index is at or beyond i_size and sgp <= SGP_CACHE;
 *   others   as reported by swapin, allocation/accounting or page cache
 *            insertion.
 * On success *pagep can be NULL for SGP_READ when no up-to-date page is
 * cached.  On the userfaultfd-missing path 0 is returned with the
 * handle_userfault() result stored in *fault_type and *pagep untouched.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct page **pagep, enum sgp_type sgp, gfp_t gfp,
        struct vm_area_struct *vma, struct vm_fault *vmf,
                        vm_fault_t *fault_type)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo;
        struct mm_struct *charge_mm;
        struct page *page;
        enum sgp_type sgp_huge = sgp;        /* caller's original hint */
        pgoff_t hindex = index;
        int error;
        int once = 0;                        /* limits the -ENOSPC retry to one */
        int alloced = 0;                /* set once we added a new page */

        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
        /*
         * The huge hints only steer the allocation choice below (through
         * sgp_huge); for everything else they behave as SGP_CACHE.
         */
        if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
                sgp = SGP_CACHE;
repeat:
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                return -EINVAL;
        }

        sbinfo = SHMEM_SB(inode->i_sb);
        charge_mm = vma ? vma->vm_mm : current->mm;

        page = find_lock_entry(mapping, index);
        /* A value entry is handed to shmem_swapin_page() to bring in from swap. */
        if (xa_is_value(page)) {
                error = shmem_swapin_page(inode, index, &page,
                                          sgp, gfp, vma, fault_type);
                /* Lost a race over the swap entry: look the index up again. */
                if (error == -EEXIST)
                        goto repeat;

                *pagep = page;
                return error;
        }

        /*
         * Remember the index of the page actually returned, so that 'out:'
         * can step from it to the page for @index.
         * NOTE(review): presumably the lookup can return a compound head
         * whose index is below @index - confirm.
         */
        if (page)
                hindex = page->index;
        if (page && sgp == SGP_WRITE)
                mark_page_accessed(page);

        /* fallocated page? */
        if (page && !PageUptodate(page)) {
                /* Everyone but SGP_READ gets it initialized at 'clear:'. */
                if (sgp != SGP_READ)
                        goto clear;
                /* SGP_READ treats a not-yet-initialized page as absent. */
                unlock_page(page);
                put_page(page);
                page = NULL;
                hindex = index;
        }
        if (page || sgp == SGP_READ)
                goto out;

        /*
         * Fast cache lookup did not find it:
         * bring it back from swap or allocate.
         */

        if (vma && userfaultfd_missing(vma)) {
                *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
                return 0;
        }

        /* shmem_symlink() */
        if (mapping->a_ops != &shmem_aops)
                goto alloc_nohuge;
        /* Global shmem_huge overrides first, then the per-superblock setting. */
        if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
                goto alloc_nohuge;
        if (shmem_huge == SHMEM_HUGE_FORCE)
                goto alloc_huge;
        /* Any sbinfo->huge value not listed falls out of the switch to alloc_huge. */
        switch (sbinfo->huge) {
        case SHMEM_HUGE_NEVER:
                goto alloc_nohuge;
        case SHMEM_HUGE_WITHIN_SIZE: {
                loff_t i_size;
                pgoff_t off;

                /* Go huge only if i_size, rounded up to a page, reaches 'off'. */
                off = round_up(index, HPAGE_PMD_NR);
                i_size = round_up(i_size_read(inode), PAGE_SIZE);
                if (i_size >= HPAGE_PMD_SIZE &&
                    i_size >> PAGE_SHIFT >= off)
                        goto alloc_huge;

                fallthrough;
        }
        case SHMEM_HUGE_ADVISE:
                if (sgp_huge == SGP_HUGE)
                        goto alloc_huge;
                /* TODO: implement fadvise() hints */
                goto alloc_nohuge;
        }

alloc_huge:
        page = shmem_alloc_and_acct_page(gfp, inode, index, true);
        if (IS_ERR(page)) {
                /*
                 * The label sits inside the fallback branch so that a failed
                 * huge attempt and a direct jump share the same small-page
                 * allocation.
                 */
alloc_nohuge:
                page = shmem_alloc_and_acct_page(gfp, inode,
                                                 index, false);
        }
        if (IS_ERR(page)) {
                int retry = 5;

                error = PTR_ERR(page);
                page = NULL;
                if (error != -ENOSPC)
                        goto unlock;
                /*
                 * Try to reclaim some space by splitting a huge page
                 * beyond i_size on the filesystem.
                 */
                while (retry--) {
                        int ret;

                        ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
                        if (ret == SHRINK_STOP)
                                break;
                        /* Non-zero result: retry the small-page allocation. */
                        if (ret)
                                goto alloc_nohuge;
                }
                goto unlock;
        }

        if (PageTransHuge(page))
                hindex = round_down(index, HPAGE_PMD_NR);
        else
                hindex = index;

        if (sgp == SGP_WRITE)
                __SetPageReferenced(page);

        error = shmem_add_to_page_cache(page, mapping, hindex,
                                        NULL, gfp & GFP_RECLAIM_MASK,
                                        charge_mm);
        if (error)
                goto unacct;
        lru_cache_add(page);

        spin_lock_irq(&info->lock);
        info->alloced += compound_nr(page);
        inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);
        alloced = true;

        if (PageTransHuge(page) &&
            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                        hindex + HPAGE_PMD_NR - 1) {
                /*
                 * Part of the huge page is beyond i_size: subject
                 * to shrink under memory pressure.
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                /*
                 * _careful to defend against unlocked access to
                 * ->shrink_list in shmem_unused_huge_shrink()
                 */
                if (list_empty_careful(&info->shrinklist)) {
                        list_add_tail(&info->shrinklist,
                                      &sbinfo->shrinklist);
                        sbinfo->shrinklist_len++;
                }
                spin_unlock(&sbinfo->shrinklist_lock);
        }

        /*
         * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
         */
        if (sgp == SGP_FALLOC)
                sgp = SGP_WRITE;
clear:
        /*
         * Let SGP_WRITE caller clear ends if write does not fill page;
         * but SGP_FALLOC on a page fallocated earlier must initialize
         * it now, lest undo on failure cancel our earlier guarantee.
         */
        if (sgp != SGP_WRITE && !PageUptodate(page)) {
                int i;

                /* Zero every subpage; covers huge pages as well. */
                for (i = 0; i < compound_nr(page); i++) {
                        clear_highpage(page + i);
                        flush_dcache_page(page + i);
                }
                SetPageUptodate(page);
        }

        /* Perhaps the file has been truncated since we checked */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                /* Only a page we added ourselves is taken back out again. */
                if (alloced) {
                        ClearPageDirty(page);
                        delete_from_page_cache(page);
                        spin_lock_irq(&info->lock);
                        shmem_recalc_inode(inode);
                        spin_unlock_irq(&info->lock);
                }
                error = -EINVAL;
                goto unlock;
        }
out:
        /*
         * Hand back the page for @index, offset from the page at hindex.
         * When page is NULL (SGP_READ miss) hindex == index, so the
         * result is NULL.
         */
        *pagep = page + index - hindex;
        return 0;

        /*
         * Error recovery.
         */
unacct:
        shmem_inode_unacct_blocks(inode, compound_nr(page));

        /* A huge page could not be inserted: fall back to a small page. */
        if (PageTransHuge(page)) {
                unlock_page(page);
                put_page(page);
                goto alloc_nohuge;
        }
unlock:
        if (page) {
                unlock_page(page);
                put_page(page);
        }
        /* On -ENOSPC, recalculate the inode's accounting and retry just once. */
        if (error == -ENOSPC && !once++) {
                spin_lock_irq(&info->lock);
                shmem_recalc_inode(inode);
                spin_unlock_irq(&info->lock);
                goto repeat;
        }
        /* -EEXIST: something else is already at this index; look it up again. */
        if (error == -EEXIST)
                goto repeat;
        return error;
}

/*
 * This is like autoremove_wake_function, but it removes the wait queue
 * entry unconditionally - even if something else had already woken the
 * target.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        int woken;

        woken = default_wake_function(wait, mode, sync, key);

        /* Dequeue the entry whatever default_wake_function() reported */
        list_del_init(&wait->entry);

        return woken;
}

/*
 * Fault handler: resolve vmf->pgoff of the file backing vmf->vma into
 * vmf->page.
 *
 * If the offset lies inside a range that is currently being hole-punched
 * (inode->i_private set, with a waitq), the fault does not touch the page
 * cache: it sleeps on that waitq and then returns VM_FAULT_NOPAGE, or
 * VM_FAULT_RETRY when maybe_unlock_mmap_for_io() handed back a pinned
 * file.
 *
 * Otherwise the page is looked up or created by shmem_getpage_gfp(), with
 * an SGP_NOHUGE or SGP_HUGE hint derived from the vma and mm flags.
 * Returns vmf_error() of any failure, else the fault flags, which start as
 * VM_FAULT_LOCKED and may be updated by shmem_getpage_gfp().
 */
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct inode *inode = file_inode(vma->vm_file);
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
        enum sgp_type sgp;
        int err;
        vm_fault_t ret = VM_FAULT_LOCKED;

        /*
         * Trinity finds that probing a hole which tmpfs is punching can
         * prevent the hole-punch from ever completing: which in turn
         * locks writers out with its hold on i_mutex.  So refrain from
         * faulting pages into the hole while it's being punched.  Although
         * shmem_undo_range() does remove the additions, it may be unable to
         * keep up, as each new page needs its own unmap_mapping_range() call,
         * and the i_mmap tree grows ever slower to scan if new vmas are added.
         *
         * It does not matter if we sometimes reach this check just before the
         * hole-punch begins, so that one fault then races with the punch:
         * we just need to make racing faults a rare case.
         *
         * The implementation below would be much simpler if we just used a
         * standard mutex or completion: but we cannot take i_mutex in fault,
         * and bloating every shmem inode for this unlikely case would be sad.
         */
        /* Unlocked peek; i_private is read again under i_lock. */
        if (unlikely(inode->i_private)) {
                struct shmem_falloc *shmem_falloc;

                spin_lock(&inode->i_lock);
                shmem_falloc = inode->i_private;
                /* Wait only if there is a waitq and pgoff is in [start, next). */
                if (shmem_falloc &&
                    shmem_falloc->waitq &&
                    vmf->pgoff >= shmem_falloc->start &&
                    vmf->pgoff < shmem_falloc->next) {
                        struct file *fpin;
                        wait_queue_head_t *shmem_falloc_waitq;
                        DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

                        ret = VM_FAULT_NOPAGE;
                        /* A non-NULL fpin turns the result into VM_FAULT_RETRY. */
                        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
                        if (fpin)
                                ret = VM_FAULT_RETRY;

                        /* Queue ourselves while still under i_lock, then sleep. */
                        shmem_falloc_waitq = shmem_falloc->waitq;
                        prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
                                        TASK_UNINTERRUPTIBLE);
                        spin_unlock(&inode->i_lock);
                        schedule();

                        /*
                         * shmem_falloc_waitq points into the shmem_fallocate()
                         * stack of the hole-punching task: shmem_falloc_waitq
                         * is usually invalid by the time we reach here, but
                         * finish_wait() does not dereference it in that case;
                         * though i_lock needed lest racing with wake_up_all().
                         */
                        spin_lock(&inode->i_lock);
                        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
                        spin_unlock(&inode->i_lock);

                        /* Drop the file reference that came back as fpin. */
                        if (fpin)
                                fput(fpin);
                        return ret;
                }
                spin_unlock(&inode->i_lock);
        }

        sgp = SGP_CACHE;

        /* Turn the vma/mm huge page preferences into an allocation hint. */
        if ((vma->vm_flags & VM_NOHUGEPAGE) ||
            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
                sgp = SGP_NOHUGE;
        else if (vma->vm_flags & VM_HUGEPAGE)
                sgp = SGP_HUGE;

        err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
                                  gfp, vma, vmf, &ret);
        if (err)
                return vmf_error(err);
        return ret;
}

/*
 * Choose an address for mapping @len bytes of @file at page offset @pgoff,
 * preferring one whose position within a PMD-sized huge page equals that
 * of the file offset.
 *
 * The mm's ordinary get_unmapped_area() answer is computed first, and is
 * what gets returned whenever the huge-friendly placement does not apply
 * or cannot be arranged.  Returns -ENOMEM if @len exceeds TASK_SIZE.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long uaddr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        unsigned long (*get_area)(struct file *,
                unsigned long, unsigned long, unsigned long, unsigned long);
        unsigned long addr;
        unsigned long big_addr, big_len;
        unsigned long want_off, have_off;
        struct super_block *sb;

        if (len > TASK_SIZE)
                return -ENOMEM;

        get_area = current->mm->get_unmapped_area;
        addr = get_area(file, uaddr, len, pgoff, flags);

        /* Nothing more to do unless huge pages are compiled in */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return addr;

        /* Pass errors, unaligned and out-of-range results straight through */
        if (IS_ERR_VALUE(addr) || (addr & ~PAGE_MASK) ||
            addr > TASK_SIZE - len)
                return addr;

        /* Huge pages denied, mapping below huge page size, or address fixed */
        if (shmem_huge == SHMEM_HUGE_DENY || len < HPAGE_PMD_SIZE ||
            (flags & MAP_FIXED))
                return addr;

        /*
         * Mapping MAP_SHARED hugely is the priority, and MAP_PRIVATE too
         * until it is COWed; but a caller's address hint that was honoured
         * is respected as it always was.
         */
        if (addr == uaddr)
                return addr;

        if (shmem_huge != SHMEM_HUGE_FORCE) {
                if (!file) {
                        /*
                         * No file: called directly from mm/mmap.c, or from
                         * drivers/char/mem.c for "/dev/zero", to create a
                         * shared anonymous object.
                         */
                        if (IS_ERR(shm_mnt))
                                return addr;
                        sb = shm_mnt->mnt_sb;
                } else {
                        VM_BUG_ON(file->f_op != &shmem_file_operations);
                        sb = file_inode(file)->i_sb;
                }
                if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
                        return addr;
        }

        /* Offset of the mapping's first byte within a huge page of the file */
        want_off = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE - 1);
        if (want_off && want_off + len < 2 * HPAGE_PMD_SIZE)
                return addr;
        /* The ordinary answer already has the wanted alignment */
        if ((addr & (HPAGE_PMD_SIZE - 1)) == want_off)
                return addr;

        /* Ask for enough slack to slide to the wanted offset; reject wraparound */
        big_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
        if (big_len > TASK_SIZE || big_len < len)
                return addr;

        big_addr = get_area(NULL, uaddr, big_len, 0, flags);
        if (IS_ERR_VALUE(big_addr) || (big_addr & ~PAGE_MASK))
                return addr;

        /* Move to the wanted offset, stepping into the next huge page if behind */
        have_off = big_addr & (HPAGE_PMD_SIZE - 1);
        big_addr += want_off - have_off;
        if (have_off > want_off)
                big_addr += HPAGE_PMD_SIZE;

        return big_addr > TASK_SIZE - len ? addr : big_addr;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);
        return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr)
{
        struct inode *inode = file_inode(vma->vm_file);
        pgoff_t index;

        index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif

/*
 * Lock (@lock non-zero) or unlock (@lock zero) the file's pages against
 * being evicted, charging or uncharging @user for i_size bytes.
 *
 * Locking an inode already marked VM_LOCKED, or unlocking one that is not
 * (or unlocking without a @user), changes nothing.
 * Returns 0, or -ENOMEM when user_shm_lock() refuses the charge.
 */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
        struct inode *inode = file_inode(file);
        struct shmem_inode_info *info = SHMEM_I(inode);

        /*
         * Accesses to info->flags are serialized by ipc_lock_object() when
         * we come from shmctl_do_lock(); coming from shm_destroy() no
         * serialization is needed.
         */
        if (lock) {
                if (!(info->flags & VM_LOCKED)) {
                        if (!user_shm_lock(inode->i_size, user))
                                return -ENOMEM;
                        info->flags |= VM_LOCKED;
                        mapping_set_unevictable(file->f_mapping);
                }
        } else if ((info->flags & VM_LOCKED) && user) {
                user_shm_unlock(inode->i_size, user);
                info->flags &= ~VM_LOCKED;
                mapping_clear_unevictable(file->f_mapping);
        }

        return 0;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct shmem_inode_info *info = SHMEM_I(file_inode(file));
        int ret;

        ret = seal_check_write(info->seals, vma);
        if (ret)
                return ret;

        file_accessed(file);
        vma->vm_ops = &shmem_vm_ops;
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
                        (vma->vm_end & HPAGE_PMD_MASK)) {
                khugepaged_enter(vma, vma->vm_flags);
        }
        return 0;
}

/*
 * Reserve an inode number on @sb, allocate a new inode and initialise both
 * the generic inode fields and the shmem-private info for the file type
 * encoded in @mode.  Only the VM_NORESERVE bit of @flags is retained.
 *
 * Returns the new inode, or NULL if the reservation or allocation failed.
 */
static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
                                     umode_t mode, dev_t dev, unsigned long flags)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        struct shmem_inode_info *info;
        struct inode *inode;
        ino_t ino;

        if (shmem_reserve_inode(sb, &ino))
                return NULL;

        inode = new_inode(sb);
        if (!inode) {
                /* Allocation failed after shmem_reserve_inode() succeeded. */
                shmem_free_inode(sb);
                return NULL;
        }

        inode->i_ino = ino;
        inode_init_owner(inode, dir, mode);
        inode->i_blocks = 0;
        inode->i_ctime = current_time(inode);
        inode->i_mtime = inode->i_ctime;
        inode->i_atime = inode->i_mtime;
        inode->i_generation = prandom_u32();

        info = SHMEM_I(inode);
        /* Zero the shmem-private bytes from info up to the embedded inode. */
        memset(info, 0, (char *)inode - (char *)info);
        spin_lock_init(&info->lock);
        atomic_set(&info->stop_eviction, 0);
        info->seals = F_SEAL_SEAL;
        info->flags = flags & VM_NORESERVE;
        INIT_LIST_HEAD(&info->shrinklist);
        INIT_LIST_HEAD(&info->swaplist);
        simple_xattrs_init(&info->xattrs);
        cache_no_acl(inode);

        switch (mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_inode_operations;
                inode->i_fop = &shmem_file_operations;
                mpol_shared_policy_init(&info->policy,
                                         shmem_get_sbmpol(sbinfo));
                break;
        case S_IFDIR:
                inc_nlink(inode);
                /* Some things misbehave if size == 0 on a directory */
                inode->i_size = 2 * BOGO_DIRENT_SIZE;
                inode->i_op = &shmem_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;
                break;
        case S_IFLNK:
                /*
                 * Must not load anything in the rbtree,
                 * mpol_free_shared_policy will not be called.
                 */
                mpol_shared_policy_init(&info->policy, NULL);
                break;
        default:
                /* Every other file type is set up as a special inode. */
                inode->i_op = &shmem_special_inode_operations;
                init_special_inode(inode, mode, dev);
                break;
        }

        lockdep_annotate_inode_mutex_key(inode);
        return inode;
}

/* Report whether @mapping uses the shmem address-space operations. */
bool shmem_mapping(struct address_space *mapping)
{
        const struct address_space_operations *ops = mapping->a_ops;

        return ops == &shmem_aops;
}

/*
 * Populate one page of a shmem mapping and map it at @dst_addr in @dst_mm.
 *
 * @dst_pmd:   page-table level under which the pte for @dst_addr lives
 * @dst_vma:   destination VMA; its backing file supplies the shmem inode
 * @src_addr:  user address to copy PAGE_SIZE bytes from (ignored if @zeropage)
 * @zeropage:  true to install a zero-filled page instead of copying
 * @pagep:     in/out.  If *pagep is non-NULL on entry it is used as the
 *             already-filled page and consumed.  If the in-kernel copy from
 *             @src_addr faults, the freshly allocated page is handed back in
 *             *pagep and -ENOENT is returned, so the caller can redo the copy
 *             outside mmap_lock and call again.
 *
 * Returns 0 on success; -ENOMEM if the block could not be accounted or the
 * page allocated; -ENOENT as described above; -EFAULT if the target offset is
 * at or beyond i_size; -EEXIST if the destination pte is already populated;
 * or the error from shmem_add_to_page_cache().
 */
static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
                                  pmd_t *dst_pmd,
                                  struct vm_area_struct *dst_vma,
                                  unsigned long dst_addr,
                                  unsigned long src_addr,
                                  bool zeropage,
                                  struct page **pagep)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfp = mapping_gfp_mask(mapping);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        spinlock_t *ptl;
        void *page_kaddr;
        struct page *page;
        pte_t _dst_pte, *dst_pte;
        int ret;
        pgoff_t offset, max_off;

        ret = -ENOMEM;
        if (!shmem_inode_acct_block(inode, 1)) {
                /*
                 * We may have got a page, returned -ENOENT triggering a retry,
                 * and now we find ourselves with -ENOMEM. Release the page, to
                 * avoid a BUG_ON in our caller.
                 */
                if (unlikely(*pagep)) {
                        put_page(*pagep);
                        *pagep = NULL;
                }
                goto out;
        }

        if (!*pagep) {
                page = shmem_alloc_page(gfp, info, pgoff);
                if (!page)
                        goto out_unacct_blocks;

                if (!zeropage) {        /* mcopy_atomic */
                        page_kaddr = kmap_atomic(page);
                        ret = copy_from_user(page_kaddr,
                                             (const void __user *)src_addr,
                                             PAGE_SIZE);
                        kunmap_atomic(page_kaddr);

                        /* fallback to copy_from_user outside mmap_lock */
                        if (unlikely(ret)) {
                                *pagep = page;
                                shmem_inode_unacct_blocks(inode, 1);
                                /* don't free the page */
                                return -ENOENT;
                        }
                } else {                /* mfill_zeropage_atomic */
                        clear_highpage(page);
                }

                /*
                 * The page was just written through a kernel mapping and is
                 * about to be mapped at a user address: flush the data cache
                 * so that aliasing caches cannot expose stale contents.
                 */
                flush_dcache_page(page);
        } else {
                /* Retry path: the caller filled this page for us. */
                page = *pagep;
                *pagep = NULL;
        }

        VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
        __SetPageLocked(page);
        __SetPageSwapBacked(page);
        __SetPageUptodate(page);

        /* Refuse to instantiate a page at or beyond the current i_size. */
        ret = -EFAULT;
        offset = linear_page_index(dst_vma, dst_addr);
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(offset >= max_off))
                goto out_release;

        ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
                                      gfp & GFP_RECLAIM_MASK, dst_mm);
        if (ret)
                goto out_release;

        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        if (dst_vma->vm_flags & VM_WRITE)
                _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
        else {
                /*
                 * We don't set the pte dirty if the vma has no
                 * VM_WRITE permission, so mark the page dirty or it
                 * could be freed from under us. We could do it
                 * unconditionally before unlock_page(), but doing it
                 * only if VM_WRITE is not set is faster.
                 */
                set_page_dirty(page);
        }

        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

        /* Re-check i_size now that the page-table lock is held. */
        ret = -EFAULT;
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(offset >= max_off))
                goto out_release_unlock;

        ret = -EEXIST;
        if (!pte_none(*dst_pte))
                goto out_release_unlock;

        lru_cache_add(page);

        spin_lock_irq(&info->lock);
        info->alloced++;
        inode->i_blocks += BLOCKS_PER_PAGE;
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);

        inc_mm_counter(dst_mm, mm_counter_file(page));
        page_add_file_rmap(page, false);
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        pte_unmap_unlock(dst_pte, ptl);
        unlock_page(page);
        ret = 0;
out:
        return ret;
out_release_unlock:
        /* Undo the page-cache insertion performed above. */
        pte_unmap_unlock(dst_pte, ptl);
        ClearPageDirty(page);
        delete_from_page_cache(page);
out_release:
        unlock_page(page);
        put_page(page);
out_unacct_blocks:
        shmem_inode_unacct_blocks(inode, 1);
        goto out;
}

/*
 * Copy variant of shmem_mfill_atomic_pte(): fill the new page from
 * @src_addr rather than zeroing it.  Arguments and return value are passed
 * through unchanged.
 */
int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
                           pmd_t *dst_pmd,
                           struct vm_area_struct *dst_vma,
                           unsigned long dst_addr,
                           unsigned long src_addr,
                           struct page **pagep)
{
        const bool zeropage = false;

        return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                      src_addr, zeropage, pagep);
}

/*
 * Zero-fill variant of shmem_mfill_atomic_pte(): no source address is
 * involved, and a local page slot initialised to NULL stands in for the
 * caller-provided one.
 */
int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
                             pmd_t *dst_pmd,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr)
{
        struct page *page_slot = NULL;
        int ret;

        ret = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                     dst_addr, 0, true, &page_slot);
        return ret;
}

#ifdef CONFIG_TMPFS
/*
 * Forward declarations of the two symlink inode_operations tables, so that
 * shmem_symlink() below can take their addresses.
 */
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

/*
 * shmem_initxattrs is the callback handed to security_inode_init_security()
 * by the inode-creation paths below.  Without CONFIG_TMPFS_XATTR it is
 * defined as NULL.
 */
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
#else
#define shmem_initxattrs NULL
#endif

/*
 * Prepare the page covering @pos for a write of @len bytes.
 *
 * Fails with -EPERM when the file carries a write seal, or when it carries
 * F_SEAL_GROW and the write would extend past the current i_size.  Otherwise
 * returns the result of shmem_getpage(), which stores the page in *@pagep.
 */
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        const unsigned int write_seals = F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

        /* i_mutex is held by caller */
        if (unlikely(info->seals & write_seals))
                return -EPERM;
        if (unlikely(info->seals & F_SEAL_GROW) &&
            pos + len > inode->i_size)
                return -EPERM;

        return shmem_getpage(inode, pos >> PAGE_SHIFT, pagep, SGP_WRITE);
}

/*
 * Finish a write of @copied bytes at @pos into @page.
 *
 * Grows i_size if the write ended past it.  If the page was not yet
 * uptodate, the parts that were not written are zeroed (for a transparent
 * compound page, every other subpage as well) and the head page is marked
 * uptodate.  The page is then dirtied, unlocked and released.
 *
 * Returns @copied.
 */
static int
shmem_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        loff_t write_end = pos + copied;

        if (write_end > inode->i_size)
                i_size_write(inode, write_end);

        if (!PageUptodate(page)) {
                struct page *head = compound_head(page);

                if (PageTransCompound(page)) {
                        int i;

                        /* Zero every subpage except the one just written. */
                        for (i = 0; i < HPAGE_PMD_NR; i++) {
                                struct page *subpage = head + i;

                                if (subpage != page) {
                                        clear_highpage(subpage);
                                        flush_dcache_page(subpage);
                                }
                        }
                }
                if (copied < PAGE_SIZE) {
                        /* Zero before and after the copied byte range. */
                        unsigned from = pos & (PAGE_SIZE - 1);

                        zero_user_segments(page, 0, from,
                                           from + copied, PAGE_SIZE);
                }
                SetPageUptodate(head);
        }

        set_page_dirty(page);
        unlock_page(page);
        put_page(page);

        return copied;
}

/*
 * Read from a shmem file into the iterator @to, starting at iocb->ki_pos.
 *
 * Pages are fetched one at a time with shmem_getpage().  When the lookup
 * succeeds without returning a page, ZERO_PAGE(0) is copied out instead.
 * i_size is sampled both before and after each lookup so that the amount
 * copied is bounded by the current file size.
 *
 * iocb->ki_pos is advanced past the bytes copied.  Returns the number of
 * bytes copied, or, if nothing was copied, the last error (0 at end of file).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        pgoff_t index;
        unsigned long offset;
        enum sgp_type sgp = SGP_READ;
        int error = 0;
        ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;

        /*
         * Might this read be for a stacking filesystem?  Then when reading
         * holes of a sparse file, we actually need to allocate those pages,
         * and even mark them dirty, so it cannot exceed the max_blocks limit.
         */
        if (!iter_is_iovec(to))
                sgp = SGP_CACHE;

        /* Split the file position into a page index and an in-page offset. */
        index = *ppos >> PAGE_SHIFT;
        offset = *ppos & ~PAGE_MASK;

        for (;;) {
                struct page *page = NULL;
                pgoff_t end_index;
                unsigned long nr, ret;
                loff_t i_size = i_size_read(inode);

                /* Stop before the lookup if we are already at or past EOF. */
                end_index = i_size >> PAGE_SHIFT;
                if (index > end_index)
                        break;
                if (index == end_index) {
                        nr = i_size & ~PAGE_MASK;
                        if (nr <= offset)
                                break;
                }

                error = shmem_getpage(inode, index, &page, sgp);
                if (error) {
                        /* -EINVAL is not reported to the caller. */
                        if (error == -EINVAL)
                                error = 0;
                        break;
                }
                if (page) {
                        if (sgp == SGP_CACHE)
                                set_page_dirty(page);
                        unlock_page(page);
                }

                /*
                 * We must evaluate after, since reads (unlike writes)
                 * are called without i_mutex protection against truncate
                 */
                nr = PAGE_SIZE;
                i_size = i_size_read(inode);
                end_index = i_size >> PAGE_SHIFT;
                if (index == end_index) {
                        nr = i_size & ~PAGE_MASK;
                        if (nr <= offset) {
                                if (page)
                                        put_page(page);
                                break;
                        }
                }
                /* nr becomes the number of bytes to copy from this page. */
                nr -= offset;

                if (page) {
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
                        if (mapping_writably_mapped(mapping))
                                flush_dcache_page(page);
                        /*
                         * Mark the page accessed if we read the beginning.
                         */
                        if (!offset)
                                mark_page_accessed(page);
                } else {
                        /* No page was returned: copy zeroes instead. */
                        page = ZERO_PAGE(0);
                        get_page(page);
                }

                /*
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 */
                ret = copy_page_to_iter(page, offset, nr, to);
                retval += ret;
                /* Advance; offset reaching PAGE_SIZE carries into index. */
                offset += ret;
                index += offset >> PAGE_SHIFT;
                offset &= ~PAGE_MASK;

                put_page(page);
                if (!iov_iter_count(to))
                        break;
                /* A short copy with room left in the iterator is a fault. */
                if (ret < nr) {
                        error = -EFAULT;
                        break;
                }
                cond_resched();
        }

        *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
        file_accessed(file);
        return retval ? retval : error;
}

/*
 * llseek SEEK_DATA or SEEK_HOLE through the page cache.
 *
 * Scans the entries of @mapping from page @index onwards and returns the
 * page index at which the search stopped: the first data page for SEEK_DATA
 * or the first hole for SEEK_HOLE, bounded by @end.  For SEEK_DATA, @end is
 * returned when no further entries exist.  A page that is present but not
 * uptodate is treated as a hole; value entries are treated as data.
 */
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
                                    pgoff_t index, pgoff_t end, int whence)
{
        struct page *page;
        struct pagevec pvec;
        pgoff_t indices[PAGEVEC_SIZE];
        bool done = false;
        int i;

        pagevec_init(&pvec);
        pvec.nr = 1;                /* start small: we may be there already */
        while (!done) {
                /* pvec.nr is the batch size going in, the count coming out. */
                pvec.nr = find_get_entries(mapping, index,
                                        pvec.nr, pvec.pages, indices);
                if (!pvec.nr) {
                        /* Nothing more cached: no data follows. */
                        if (whence == SEEK_DATA)
                                index = end;
                        break;
                }
                for (i = 0; i < pvec.nr; i++, index++) {
                        /* A gap before the next entry is a hole. */
                        if (index < indices[i]) {
                                if (whence == SEEK_HOLE) {
                                        done = true;
                                        break;
                                }
                                index = indices[i];
                        }
                        page = pvec.pages[i];
                        if (page && !xa_is_value(page)) {
                                if (!PageUptodate(page))
                                        page = NULL;
                        }
                        if (index >= end ||
                            (page && whence == SEEK_DATA) ||
                            (!page && whence == SEEK_HOLE)) {
                                done = true;
                                break;
                        }
                }
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                /* Later rounds look up a full batch. */
                pvec.nr = PAGEVEC_SIZE;
                cond_resched();
        }
        return index;
}

/*
 * llseek for shmem files.
 *
 * Whence values other than SEEK_DATA and SEEK_HOLE are delegated to
 * generic_file_llseek_size().  For those two, the page cache is searched
 * under the inode lock, starting at @offset.
 *
 * Returns the new file position, or -ENXIO when @offset is negative or not
 * below i_size, or when SEEK_DATA finds no data before end of file.
 */
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        loff_t size, found, result;

        if (whence != SEEK_DATA && whence != SEEK_HOLE)
                return generic_file_llseek_size(file, offset, whence,
                                        MAX_LFS_FILESIZE, i_size_read(inode));

        inode_lock(inode);
        /* We're holding i_mutex so we can access i_size directly */
        size = inode->i_size;

        if (offset < 0 || offset >= size) {
                result = -ENXIO;
        } else {
                pgoff_t first = offset >> PAGE_SHIFT;
                pgoff_t limit = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

                found = shmem_seek_hole_data(mapping, first, limit, whence);
                found <<= PAGE_SHIFT;

                /* A hit at or before @offset leaves the position alone. */
                result = offset;
                if (found > offset) {
                        if (found < size)
                                result = found;
                        else if (whence == SEEK_DATA)
                                result = -ENXIO;
                        else
                                result = size;
                }
        }

        if (result >= 0)
                result = vfs_setpos(file, result, MAX_LFS_FILESIZE);
        inode_unlock(inode);
        return result;
}

/*
 * fallocate for shmem files.  Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are supported; other mode bits yield -EOPNOTSUPP.
 *
 * Hole punching unmaps and truncates the range, publishing a shmem_falloc
 * with a wait queue through inode->i_private while it runs.  Preallocation
 * instantiates each page in the range with shmem_getpage(SGP_FALLOC) and
 * removes the pages added so far if any step fails.
 *
 * The whole operation runs under the inode lock.  Returns 0 on success, or
 * -EPERM (seals), -ENOSPC, -EINTR, -ENOMEM, or an error from
 * inode_newsize_ok() or shmem_getpage().
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                                                         loff_t len)
{
        struct inode *inode = file_inode(file);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_falloc shmem_falloc;
        pgoff_t start, index, end;
        int error;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        inode_lock(inode);

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                struct address_space *mapping = file->f_mapping;
                /* Only whole pages inside the range are unmapped. */
                loff_t unmap_start = round_up(offset, PAGE_SIZE);
                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

                /* protected by i_mutex */
                if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
                        error = -EPERM;
                        goto out;
                }

                /* Publish the range being punched via i_private. */
                shmem_falloc.waitq = &shmem_falloc_waitq;
                shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
                inode->i_private = &shmem_falloc;
                spin_unlock(&inode->i_lock);

                if ((u64)unmap_end > (u64)unmap_start)
                        unmap_mapping_range(mapping, unmap_start,
                                            1 + unmap_end - unmap_start, 0);
                shmem_truncate_range(inode, offset, offset + len - 1);
                /* No need to unmap again: hole-punching leaves COWed pages */

                /* Withdraw the on-stack descriptor and wake any waiters. */
                spin_lock(&inode->i_lock);
                inode->i_private = NULL;
                wake_up_all(&shmem_falloc_waitq);
                WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
                spin_unlock(&inode->i_lock);
                error = 0;
                goto out;
        }

        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
        error = inode_newsize_ok(inode, offset + len);
        if (error)
                goto out;

        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
                error = -EPERM;
                goto out;
        }

        /* Page range [start, end) covering the requested bytes. */
        start = offset >> PAGE_SHIFT;
        end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        /* Try to avoid a swapstorm if len is impossible to satisfy */
        if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
                error = -ENOSPC;
                goto out;
        }

        /* waitq == NULL distinguishes preallocation from hole punching. */
        shmem_falloc.waitq = NULL;
        shmem_falloc.start = start;
        shmem_falloc.next  = start;
        shmem_falloc.nr_falloced = 0;
        shmem_falloc.nr_unswapped = 0;
        spin_lock(&inode->i_lock);
        inode->i_private = &shmem_falloc;
        spin_unlock(&inode->i_lock);

        for (index = start; index < end; index++) {
                struct page *page;

                /*
                 * Good, the fallocate(2) manpage permits EINTR: we may have
                 * been interrupted because we are using up too much memory.
                 */
                if (signal_pending(current))
                        error = -EINTR;
                else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
                        error = -ENOMEM;
                else
                        error = shmem_getpage(inode, index, &page, SGP_FALLOC);
                if (error) {
                        /* Remove the !PageUptodate pages we added */
                        if (index > start) {
                                shmem_undo_range(inode,
                                    (loff_t)start << PAGE_SHIFT,
                                    ((loff_t)index << PAGE_SHIFT) - 1, true);
                        }
                        goto undone;
                }

                /*
                 * Inform shmem_writepage() how far we have reached.
                 * No need for lock or barrier: we have the page lock.
                 */
                shmem_falloc.next++;
                if (!PageUptodate(page))
                        shmem_falloc.nr_falloced++;

                /*
                 * If !PageUptodate, leave it that way so that freeable pages
                 * can be recognized if we need to rollback on error later.
                 * But set_page_dirty so that memory pressure will swap rather
                 * than free the pages we are allocating (and SGP_CACHE pages
                 * might still be clean: we now need to mark those dirty too).
                 */
                set_page_dirty(page);
                unlock_page(page);
                put_page(page);
                cond_resched();
        }

        /* All pages are in place: grow the file unless told to keep size. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
                i_size_write(inode, offset + len);
        inode->i_ctime = current_time(inode);
undone:
        spin_lock(&inode->i_lock);
        inode->i_private = NULL;
        spin_unlock(&inode->i_lock);
out:
        inode_unlock(inode);
        return error;
}

/*
 * Fill @buf with filesystem statistics for the superblock of @dentry.
 * Block and inode counts are reported only when the corresponding limit
 * (max_blocks / max_inodes) is non-zero.  Always returns 0.
 */
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

        buf->f_type = TMPFS_MAGIC;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;

        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
                buf->f_bfree = sbinfo->max_blocks -
                               percpu_counter_sum(&sbinfo->used_blocks);
                /* Available and free are reported as the same figure. */
                buf->f_bavail = buf->f_bfree;
        }

        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
                buf->f_ffree = sbinfo->free_inodes;
        }

        /* else leave those fields 0 like simple_statfs */
        return 0;
}

/*
 * Create a new inode of type @mode in @dir and bind it to @dentry.
 *
 * Returns 0 on success, -ENOSPC if no inode could be obtained, or the error
 * from ACL or security initialisation (-EOPNOTSUPP from the latter is
 * ignored).
 */
static int
shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode;
        int error;

        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (!inode)
                return -ENOSPC;

        error = simple_acl_create(dir, inode);
        if (error)
                goto out_iput;

        error = security_inode_init_security(inode, dir,
                                             &dentry->d_name,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP)
                goto out_iput;

        dir->i_size += BOGO_DIRENT_SIZE;
        dir->i_mtime = current_time(dir);
        dir->i_ctime = dir->i_mtime;
        d_instantiate(dentry, inode);
        dget(dentry); /* Extra count - pin the dentry in core */
        return 0;

out_iput:
        iput(inode);
        return error;
}

/*
 * Create an inode in @dir's filesystem and attach it to @dentry with
 * d_tmpfile().
 *
 * Returns 0 on success, -ENOSPC if no inode could be obtained, or the error
 * from security or ACL initialisation (-EOPNOTSUPP from the former is
 * ignored).
 */
static int
shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        struct inode *inode;
        int error;

        inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
        if (!inode)
                return -ENOSPC;

        error = security_inode_init_security(inode, dir, NULL,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP)
                goto out_iput;

        error = simple_acl_create(dir, inode);
        if (error)
                goto out_iput;

        d_tmpfile(dentry, inode);
        return 0;

out_iput:
        iput(inode);
        return error;
}

/*
 * Create a directory: shmem_mknod() with S_IFDIR added to @mode, then one
 * more link on the parent.  Returns 0 or the shmem_mknod() error.
 */
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0);

        if (!error)
                inc_nlink(dir);
        return error;
}

/*
 * Create a regular file: shmem_mknod() with S_IFREG added to @mode and a
 * device number of 0.  @excl is not consulted.
 */
static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                bool excl)
{
        const umode_t file_mode = mode | S_IFREG;

        return shmem_mknod(dir, dentry, file_mode, 0);
}

/*
 * Create a hard link @dentry in @dir to the inode behind @old_dentry.
 *
 * Returns 0, or the error from shmem_reserve_inode().
 */
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);

        /*
         * No ordinary (disk based) filesystem counts links as inodes;
         * but each new link needs a new dentry, pinning lowmem, and
         * tmpfs dentries cannot be pruned until they are unlinked.
         * But if an O_TMPFILE file is linked into the tmpfs, the
         * first link must skip that, to get the accounting right.
         */
        if (inode->i_nlink) {
                int err = shmem_reserve_inode(inode->i_sb, NULL);

                if (err)
                        return err;
        }

        dir->i_size += BOGO_DIRENT_SIZE;
        dir->i_mtime = current_time(inode);
        dir->i_ctime = dir->i_mtime;
        inode->i_ctime = dir->i_mtime;
        inc_nlink(inode);
        ihold(inode);        /* New dentry reference */
        dget(dentry);                /* Extra pinning count for the created dentry */
        d_instantiate(dentry, inode);
        return 0;
}

static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
                shmem_free_inode(inode->i_sb);

        dir->i_size -= BOGO_DIRENT_SIZE;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
        drop_nlink(inode);
        dput(dentry);        /* Undo the count from "create" - this does all the work */
        return 0;
}

/*
 * Remove the empty directory @dentry from @dir.  Returns -ENOTEMPTY if
 * simple_empty() says it still has entries, else shmem_unlink()'s result.
 */
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_empty(dentry)) {
                /* One link dropped on the directory, one on its parent. */
                drop_nlink(d_inode(dentry));
                drop_nlink(dir);
                return shmem_unlink(dir, dentry);
        }

        return -ENOTEMPTY;
}

/*
 * Bookkeeping for RENAME_EXCHANGE: move one link count between the parent
 * directories when exactly one of the swapped entries is a directory and
 * the parents differ, then refresh the timestamps.  Always returns 0.
 */
static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
        bool old_is_dir = d_is_dir(old_dentry);
        bool new_is_dir = d_is_dir(new_dentry);

        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                /* The parent giving up a directory loses a link. */
                struct inode *loser = old_is_dir ? old_dir : new_dir;
                struct inode *gainer = old_is_dir ? new_dir : old_dir;

                drop_nlink(loser);
                inc_nlink(gainer);
        }

        d_inode(new_dentry)->i_ctime = current_time(old_dir);
        d_inode(old_dentry)->i_ctime = d_inode(new_dentry)->i_ctime;
        new_dir->i_mtime = d_inode(old_dentry)->i_ctime;
        new_dir->i_ctime = new_dir->i_mtime;
        old_dir->i_mtime = new_dir->i_ctime;
        old_dir->i_ctime = old_dir->i_mtime;

        return 0;
}

/*
 * Create a whiteout (a character device made with WHITEOUT_MODE and
 * WHITEOUT_DEV) carrying the same name and parent as @old_dentry.
 *
 * Returns 0, -ENOMEM if the dentry cannot be allocated, or the error from
 * shmem_mknod().
 */
static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
{
        struct dentry *whiteout = d_alloc(old_dentry->d_parent,
                                          &old_dentry->d_name);
        int error;

        if (!whiteout)
                return -ENOMEM;

        error = shmem_mknod(old_dir, whiteout,
                            S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
        dput(whiteout);

        /*
         * Cheat and hash the whiteout while the old dentry is still in
         * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
         *
         * d_lookup() will consistently find one of them at this point,
         * not sure which one, but that isn't even important.
         */
        if (!error)
                d_rehash(whiteout);

        return error;
}

/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly free's it when it
 * gets overwritten.
 */
static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
{
        struct inode *inode = d_inode(old_dentry);
        int they_are_dirs = S_ISDIR(inode->i_mode);

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        if (flags & RENAME_WHITEOUT) {
                int error;

                error = shmem_whiteout(old_dir, old_dentry);
                if (error)
                        return error;
        }

        if (d_really_is_positive(new_dentry)) {
                (void) shmem_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        old_dir->i_size -= BOGO_DIRENT_SIZE;
        new_dir->i_size += BOGO_DIRENT_SIZE;
        old_dir->i_ctime = old_dir->i_mtime =
        new_dir->i_ctime = new_dir->i_mtime =
        inode->i_ctime = current_time(old_dir);
        return 0;
}

static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
        int error;
        int len;
        struct inode *inode;
        struct page *page;

        len = strlen(symname) + 1;
        if (len > PAGE_SIZE)
                return -ENAMETOOLONG;

        inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
                                VM_NORESERVE);
        if (!inode)
                return -ENOSPC;

        error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP) {
                iput(inode);
                return error;
        }

        inode->i_size = len-1;
        if (len <= SHORT_SYMLINK_LEN) {
                inode->i_link = kmemdup(symname, len, GFP_KERNEL);
                if (!inode->i_link) {
                        iput(inode);
                        return -ENOMEM;
                }
                inode->i_op = &shmem_short_symlink_operations;
        } else {
                inode_nohighmem(inode);
                error = shmem_getpage(inode, 0, &page, SGP_WRITE);
                if (error) {
                        iput(inode);
                        return error;
                }
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_symlink_inode_operations;
                memcpy(page_address(page), symname, len);
                SetPageUptodate(page);
                set_page_dirty(page);
                unlock_page(page);
                put_page(page);
        }
        dir->i_size += BOGO_DIRENT_SIZE;
        dir->i_ctime = dir->i_mtime = current_time(dir);
        d_instantiate(dentry, inode);
        dget(dentry);
        return 0;
}

/*
 * Delayed-call callback registered by shmem_get_link(): @arg is the page
 * that holds the link body; mark it accessed and drop the reference.
 */
static void shmem_put_link(void *arg)
{
        struct page *page = arg;

        mark_page_accessed(page);
        put_page(page);
}

static const char *shmem_get_link(struct dentry *dentry,
                                  struct inode *inode,
                                  struct delayed_call *done)
{
        struct page *page = NULL;
        int error;
        if (!dentry) {
                page = find_get_page(inode->i_mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(page)) {
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
        } else {
                error = shmem_getpage(inode, 0, &page, SGP_READ);
                if (error)
                        return ERR_PTR(error);
                unlock_page(page);
        }
        set_delayed_call(done, shmem_put_link, page);
        return page_address(page);
}

#ifdef CONFIG_TMPFS_XATTR
/*
 * Superblocks without xattr inode operations may get some security.* xattr
 * support from the LSM "for free". As soon as we have any other xattrs
 * like ACLs, we also need to implement the security.* handlers at
 * filesystem level, though.
 */

/*
 * Callback handed to security_inode_init_security(): copy every entry of
 * the NULL-name-terminated @xattr_array into the inode's simple xattr list,
 * prepending XATTR_SECURITY_PREFIX to each name.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails; entries added
 * before the failure stay on the list.
 */
static int shmem_initxattrs(struct inode *inode,
                            const struct xattr *xattr_array,
                            void *fs_info)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        const struct xattr *src = xattr_array;

        while (src->name != NULL) {
                struct simple_xattr *entry;
                size_t name_len;

                entry = simple_xattr_alloc(src->value, src->value_len);
                if (!entry)
                        return -ENOMEM;

                /* name_len counts the terminating NUL as well */
                name_len = strlen(src->name) + 1;
                entry->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + name_len,
                                      GFP_KERNEL);
                if (!entry->name) {
                        kvfree(entry);
                        return -ENOMEM;
                }

                memcpy(entry->name, XATTR_SECURITY_PREFIX,
                       XATTR_SECURITY_PREFIX_LEN);
                memcpy(entry->name + XATTR_SECURITY_PREFIX_LEN,
                       src->name, name_len);

                simple_xattr_list_add(&info->xattrs, entry);
                src++;
        }

        return 0;
}

/*
 * ->get for the shmem xattr handlers: rebuild the full attribute name from
 * the handler and @name, then look it up in the inode's simple xattr list.
 */
static int shmem_xattr_handler_get(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, void *buffer, size_t size)
{
        struct shmem_inode_info *shmem = SHMEM_I(inode);
        const char *full_name = xattr_full_name(handler, name);

        return simple_xattr_get(&shmem->xattrs, full_name, buffer, size);
}

/*
 * ->set for the shmem xattr handlers: rebuild the full attribute name from
 * the handler and @name, then store it in the inode's simple xattr list.
 */
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        struct shmem_inode_info *shmem = SHMEM_I(inode);
        const char *full_name = xattr_full_name(handler, name);

        return simple_xattr_set(&shmem->xattrs, full_name, value, size, flags,
                                NULL);
}

/* Handler for names under XATTR_SECURITY_PREFIX, served by the shared shmem get/set. */
static const struct xattr_handler shmem_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get = shmem_xattr_handler_get,
        .set = shmem_xattr_handler_set,
};

/* Handler for names under XATTR_TRUSTED_PREFIX, served by the shared shmem get/set. */
static const struct xattr_handler shmem_trusted_xattr_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
        .get = shmem_xattr_handler_get,
        .set = shmem_xattr_handler_set,
};

/*
 * NULL-terminated handler list installed as sb->s_xattr by
 * shmem_fill_super().  The POSIX ACL handlers are only present when
 * CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
        &posix_acl_access_xattr_handler,
        &posix_acl_default_xattr_handler,
#endif
        &shmem_security_xattr_handler,
        &shmem_trusted_xattr_handler,
        NULL
};

/* ->listxattr: list the names held in the inode's simple xattr list. */
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);

        return simple_xattr_list(inode, &info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */

/*
 * Inode operations for short symlinks: shmem_symlink() installs these when
 * the target (including NUL) fits in SHORT_SYMLINK_LEN and is kept in
 * inode->i_link.
 */
static const struct inode_operations shmem_short_symlink_operations = {
        .get_link        = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
};

/*
 * Inode operations for long symlinks: shmem_symlink() installs these when
 * the target is stored in page 0 of the mapping; shmem_get_link() reads it.
 */
static const struct inode_operations shmem_symlink_inode_operations = {
        .get_link        = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
};

/* Export ->get_parent: no parent lookup is offered; always fails with -ESTALE. */
static struct dentry *shmem_get_parent(struct dentry *child)
{
        return ERR_PTR(-ESTALE);
}

static int shmem_match(struct inode *ino, void *vfh)
{
        __u32 *fh = vfh;
        __u64 inum = fh[2];
        inum = (inum << 32) | fh[1];
        return ino->i_ino == inum && fh[0] == ino->i_generation;
}

/*
 * Return an alias of @inode: whatever d_find_alias() yields is preferred
 * (a hashed alias), with d_find_any_alias() as the fallback.
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
        struct dentry *alias = d_find_alias(inode);

        if (alias)
                return alias;
        return d_find_any_alias(inode);
}


static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
                struct fid *fid, int fh_len, int fh_type)
{
        struct inode *inode;
        struct dentry *dentry = NULL;
        u64 inum;

        if (fh_len < 3)
                return NULL;

        inum = fid->raw[2];
        inum = (inum << 32) | fid->raw[1];

        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
                        shmem_match, fid->raw);
        if (inode) {
                dentry = shmem_find_alias(inode);
                iput(inode);
        }

        return dentry;
}

/*
 * Export ->encode_fh: write a three-word handle
 * { generation, ino low 32 bits, ino high 32 bits } into @fh.
 *
 * If the caller's buffer (*len) is smaller than three words, *len is set to
 * the required size and FILEID_INVALID is returned.  Otherwise *len is set
 * to 3 and 1 is returned.
 */
static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
                                struct inode *parent)
{
        if (*len < 3) {
                *len = 3;
                return FILEID_INVALID;
        }

        if (inode_unhashed(inode)) {
                /*
                 * Inodes are hashed here, on first export, rather than at
                 * creation time.  insert_inode_hash is not idempotent, so
                 * a lock plus a re-check ensures it is attempted only once.
                 */
                static DEFINE_SPINLOCK(hash_lock);

                spin_lock(&hash_lock);
                if (inode_unhashed(inode))
                        __insert_inode_hash(inode,
                                            inode->i_ino + inode->i_generation);
                spin_unlock(&hash_lock);
        }

        *len = 3;
        fh[2] = ((__u64)inode->i_ino) >> 32;
        fh[1] = inode->i_ino;
        fh[0] = inode->i_generation;
        return 1;
}

/* Export operations; installed as sb->s_export_op by shmem_fill_super() under CONFIG_TMPFS. */
static const struct export_operations shmem_export_ops = {
        .get_parent     = shmem_get_parent,
        .encode_fh      = shmem_encode_fh,
        .fh_to_dentry        = shmem_fh_to_dentry,
};

/*
 * Identifiers for the tmpfs mount parameters.  They are the option ids in
 * shmem_fs_parameters[] and the case labels in shmem_parse_one().
 */
enum shmem_param {
        Opt_gid,
        Opt_huge,
        Opt_mode,
        Opt_mpol,
        Opt_nr_blocks,
        Opt_nr_inodes,
        Opt_size,
        Opt_uid,
        Opt_inode32,
        Opt_inode64,
};

/* Accepted values of the "huge" mount option and their SHMEM_HUGE_* codes; ends with an empty entry. */
static const struct constant_table shmem_param_enums_huge[] = {
        {"never",        SHMEM_HUGE_NEVER },
        {"always",        SHMEM_HUGE_ALWAYS },
        {"within_size",        SHMEM_HUGE_WITHIN_SIZE },
        {"advise",        SHMEM_HUGE_ADVISE },
        {}
};

/*
 * Mount parameter specification consumed by fs_parse() in shmem_parse_one().
 * size, nr_blocks, nr_inodes and mpol are taken as strings because
 * shmem_parse_one() parses them itself (memparse() suffixes, '%', mempolicy
 * syntax).  The table ends with an empty entry.
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
        fsparam_u32   ("gid",                Opt_gid),
        fsparam_enum  ("huge",                Opt_huge,  shmem_param_enums_huge),
        fsparam_u32oct("mode",                Opt_mode),
        fsparam_string("mpol",                Opt_mpol),
        fsparam_string("nr_blocks",        Opt_nr_blocks),
        fsparam_string("nr_inodes",        Opt_nr_inodes),
        fsparam_string("size",                Opt_size),
        fsparam_u32   ("uid",                Opt_uid),
        fsparam_flag  ("inode32",        Opt_inode32),
        fsparam_flag  ("inode64",        Opt_inode64),
        {}
};

/*
 * Parse one tmpfs mount parameter into the shmem_options kept in
 * fc->fs_private.
 *
 * fs_parse() matches @param against shmem_fs_parameters[]; a negative
 * result is passed back to the caller unchanged.  Options that set limits,
 * the huge page policy or the inode number width also record a
 * SHMEM_SEEN_* bit in ctx->seen, so that later stages can tell an explicit
 * setting from a default.
 *
 * Returns 0 on success, or the invalfc() result for a malformed or
 * unsupported value.
 */
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
        struct shmem_options *ctx = fc->fs_private;
        struct fs_parse_result result;
        unsigned long long size;
        char *rest;
        int opt;
        kuid_t kuid;
        kgid_t kgid;

        opt = fs_parse(fc, shmem_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_size:
                size = memparse(param->string, &rest);
                if (*rest == '%') {
                        /* "N%" means N percent of total RAM, in bytes */
                        size <<= PAGE_SHIFT;
                        size *= totalram_pages();
                        do_div(size, 100);
                        rest++;
                }
                if (*rest)
                        goto bad_value;
                ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_blocks:
                ctx->blocks = memparse(param->string, &rest);
                /*
                 * The block limit is later compared against the used_blocks
                 * percpu counter, whose comparison value is a signed 64-bit
                 * quantity: reject limits that would read as negative there.
                 */
                if (*rest || ctx->blocks > LLONG_MAX)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_inodes:
                ctx->inodes = memparse(param->string, &rest);
                if (*rest)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_INODES;
                break;
        case Opt_mode:
                ctx->mode = result.uint_32 & 07777;
                break;
        case Opt_uid:
                kuid = make_kuid(current_user_ns(), result.uint_32);
                if (!uid_valid(kuid))
                        goto bad_value;

                /*
                 * The requested uid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kuid_has_mapping(fc->user_ns, kuid))
                        goto bad_value;

                ctx->uid = kuid;
                break;
        case Opt_gid:
                kgid = make_kgid(current_user_ns(), result.uint_32);
                if (!gid_valid(kgid))
                        goto bad_value;

                /*
                 * The requested gid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kgid_has_mapping(fc->user_ns, kgid))
                        goto bad_value;

                ctx->gid = kgid;
                break;
        case Opt_huge:
                ctx->huge = result.uint_32;
                /* Anything but "never" needs transparent hugepage support */
                if (ctx->huge != SHMEM_HUGE_NEVER &&
                    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                      has_transparent_hugepage()))
                        goto unsupported_parameter;
                ctx->seen |= SHMEM_SEEN_HUGE;
                break;
        case Opt_mpol:
                if (IS_ENABLED(CONFIG_NUMA)) {
                        /* A repeated mpol= replaces the earlier policy */
                        mpol_put(ctx->mpol);
                        ctx->mpol = NULL;
                        if (mpol_parse_str(param->string, &ctx->mpol))
                                goto bad_value;
                        break;
                }
                goto unsupported_parameter;
        case Opt_inode32:
                ctx->full_inums = false;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        case Opt_inode64:
                if (sizeof(ino_t) < 8) {
                        return invalfc(fc,
                                       "Cannot use inode64 with <64bit inums in kernel\n");
                }
                ctx->full_inums = true;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        }
        return 0;

unsupported_parameter:
        return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
        return invalfc(fc, "Bad value for '%s'", param->key);
}

/*
 * ->parse_monolithic: split the comma-separated option string @data and
 * feed each "key[=value]" to vfs_parse_fs_string().
 *
 * The LSM gets first pick of the options via security_sb_eat_lsm_opts().
 * The string is modified in place.  Returns 0 on success or the first
 * error encountered.
 */
static int shmem_parse_options(struct fs_context *fc, void *data)
{
        char *cursor = data;

        if (cursor) {
                int err = security_sb_eat_lsm_opts(cursor, &fc->security);

                if (err)
                        return err;
        }

        while (cursor != NULL) {
                char *key = cursor;
                char *value;
                size_t len;
                int err;

                /*
                 * NUL-terminate this option.  Mount options form a
                 * comma-separated list, but mpol's nodelist may contain
                 * commas too: a comma followed by a digit is therefore
                 * treated as part of the current option.
                 */
                while ((cursor = strchr(cursor, ',')) != NULL) {
                        cursor++;
                        if (!isdigit(*cursor)) {
                                cursor[-1] = '\0';
                                break;
                        }
                }

                /* Skip empty options such as those from ",," */
                if (!*key)
                        continue;

                len = 0;
                value = strchr(key, '=');
                if (value) {
                        *value++ = '\0';
                        len = strlen(value);
                }

                err = vfs_parse_fs_string(fc, key, value, len);
                if (err < 0)
                        return err;
        }
        return 0;
}

/*
 * Reconfigure a shmem filesystem.
 *
 * Note that we disallow change from limited->unlimited blocks/inodes while any
 * are in use; but we must separately disallow unlimited->limited, because in
 * that case we have no record of how much is already in use.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
        unsigned long inodes;
        const char *err;

        spin_lock(&sbinfo->stat_lock);
        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
        if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
                if (!sbinfo->max_blocks) {
                        err = "Cannot retroactively limit size";
                        goto out;
                }
                if (percpu_counter_compare(&sbinfo->used_blocks,
                                           ctx->blocks) > 0) {
                        err = "Too small a size for current use";
                        goto out;
                }
        }
        if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
                if (!sbinfo->max_inodes) {
                        err = "Cannot retroactively limit inodes";
                        goto out;
                }
                if (ctx->inodes < inodes) {
                        err = "Too few inodes for current use";
                        goto out;
                }
        }

        if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
            sbinfo->next_ino > UINT_MAX) {
                err = "Current inum too high to switch to 32-bit inums";
                goto out;
        }

        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
        if (ctx->seen & SHMEM_SEEN_INUMS)
                sbinfo->full_inums = ctx->full_inums;
        if (ctx->seen & SHMEM_SEEN_BLOCKS)
                sbinfo->max_blocks  = ctx->blocks;
        if (ctx->seen & SHMEM_SEEN_INODES) {
                sbinfo->max_inodes  = ctx->inodes;
                sbinfo->free_inodes = ctx->inodes - inodes;
        }

        /*
         * Preserve previous mempolicy unless mpol remount option was specified.
         */
        if (ctx->mpol) {
                mpol_put(sbinfo->mpol);
                sbinfo->mpol = ctx->mpol;        /* transfers initial ref */
                ctx->mpol = NULL;
        }
        spin_unlock(&sbinfo->stat_lock);
        return 0;
out:
        spin_unlock(&sbinfo->stat_lock);
        return invalfc(fc, "%s", err);
}

/*
 * ->show_options: emit the mount options that differ from the defaults
 * (size, nr_inodes, mode, uid, gid), followed by the inode number width,
 * the huge page policy and the memory policy.  Always returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);

        if (sbinfo->max_blocks != shmem_default_max_blocks())
                seq_printf(seq, ",size=%luk",
                        sbinfo->max_blocks << (PAGE_SHIFT - 10));

        if (sbinfo->max_inodes != shmem_default_max_inodes())
                seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);

        if (sbinfo->mode != (0777 | S_ISVTX))
                seq_printf(seq, ",mode=%03ho", sbinfo->mode);

        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
                seq_printf(seq, ",uid=%u",
                                from_kuid_munged(&init_user_ns, sbinfo->uid));

        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));

        /*
         * Showing inode{64,32} can be useful even when it is the system
         * default: people then need not check both here and
         * /proc/config.gz (which may not exist without IKCONFIG_PROC) to
         * confirm that 64-bit inums were applied.
         *
         * It is hidden when inode64 is not the default and 32-bit inodes
         * are in use, since the feature is then probably not even under
         * consideration.
         *
         *                     +-----------------+-----------------+
         *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
         *  +------------------+-----------------+-----------------+
         *  | full_inums=true  | show            | show            |
         *  | full_inums=false | show            | hide            |
         *  +------------------+-----------------+-----------------+
         */
        if (sbinfo->full_inums)
                seq_printf(seq, ",inode%d", 64);
        else if (IS_ENABLED(CONFIG_TMPFS_INODE64))
                seq_printf(seq, ",inode%d", 32);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
        if (sbinfo->huge)
                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
        shmem_show_mpol(seq, sbinfo->mpol);
        return 0;
}

#endif /* CONFIG_TMPFS */

/*
 * ->put_super, also used as the failure path of shmem_fill_super():
 * release everything hanging off the shmem_sb_info, free it and clear
 * sb->s_fs_info.
 */
static void shmem_put_super(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        mpol_put(sbinfo->mpol);
        percpu_counter_destroy(&sbinfo->used_blocks);
        free_percpu(sbinfo->ino_batch);

        sb->s_fs_info = NULL;
        kfree(sbinfo);
}

/*
 * Fill in a new shmem superblock from the options in fc->fs_private:
 * allocate the shmem_sb_info, apply defaults and limits, and create the
 * root directory.
 *
 * Returns 0 on success.  Every failure is reported as -ENOMEM; once the
 * sb info has been attached, it is torn down again via shmem_put_super().
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct shmem_sb_info *sbinfo;
        struct inode *root_inode;

        /* Round up to L1_CACHE_BYTES to resist false sharing */
        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;

        sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
        /*
         * By default a tmpfs instance gets the shmem_default_max_blocks()
         * and shmem_default_max_inodes() limits unless the options said
         * otherwise; the internal (SB_KERNMOUNT) instance is left
         * unlimited.
         */
        if (sb->s_flags & SB_KERNMOUNT) {
                sb->s_flags |= SB_NOUSER;
        } else {
                if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
                        ctx->blocks = shmem_default_max_blocks();
                if (!(ctx->seen & SHMEM_SEEN_INODES))
                        ctx->inodes = shmem_default_max_inodes();
                if (!(ctx->seen & SHMEM_SEEN_INUMS))
                        ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
        }
        sb->s_flags |= SB_NOSEC;
        sb->s_export_op = &shmem_export_ops;
#else
        sb->s_flags |= SB_NOUSER;
#endif
        sbinfo->max_blocks = ctx->blocks;
        sbinfo->max_inodes = ctx->inodes;
        sbinfo->free_inodes = sbinfo->max_inodes;
        if (sb->s_flags & SB_KERNMOUNT) {
                sbinfo->ino_batch = alloc_percpu(ino_t);
                if (!sbinfo->ino_batch)
                        goto failed;
        }

        sbinfo->mode = ctx->mode;
        sbinfo->uid = ctx->uid;
        sbinfo->gid = ctx->gid;
        sbinfo->huge = ctx->huge;
        sbinfo->full_inums = ctx->full_inums;
        /* The context's mempolicy reference moves to the superblock */
        sbinfo->mpol = ctx->mpol;
        ctx->mpol = NULL;

        spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
        spin_lock_init(&sbinfo->shrinklist_lock);
        INIT_LIST_HEAD(&sbinfo->shrinklist);

        sb->s_magic = TMPFS_MAGIC;
        sb->s_op = &shmem_ops;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_SIZE;
        sb->s_blocksize_bits = PAGE_SHIFT;
        sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
        sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        sb->s_flags |= SB_POSIXACL;
#endif
        uuid_gen(&sb->s_uuid);

        root_inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0,
                                     VM_NORESERVE);
        if (!root_inode)
                goto failed;
        root_inode->i_uid = sbinfo->uid;
        root_inode->i_gid = sbinfo->gid;

        sb->s_root = d_make_root(root_inode);
        if (!sb->s_root)
                goto failed;
        return 0;

failed:
        shmem_put_super(sb);
        return -ENOMEM;
}

/* fs_context ->get_tree: hands shmem_fill_super() to get_tree_nodev(). */
static int shmem_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, shmem_fill_super);
}

/*
 * fs_context ->free: drop the mempolicy reference still held by the
 * options (if any) and free the shmem_options themselves.
 */
static void shmem_free_fc(struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;

        if (!ctx)
                return;

        mpol_put(ctx->mpol);
        kfree(ctx);
}

/*
 * fs_context operations, installed by shmem_init_fs_context().  Option
 * parsing and reconfiguration are only available with CONFIG_TMPFS.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
        .free                        = shmem_free_fc,
        .get_tree                = shmem_get_tree,
#ifdef CONFIG_TMPFS
        .parse_monolithic        = shmem_parse_options,
        .parse_param                = shmem_parse_one,
        .reconfigure                = shmem_reconfigure,
#endif
};

/* Slab cache for struct shmem_inode_info; created by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep;

/*
 * ->alloc_inode: take a shmem_inode_info from the slab cache and return
 * its embedded VFS inode, or NULL if the allocation fails.
 */
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
        struct shmem_inode_info *info =
                kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);

        return info ? &info->vfs_inode : NULL;
}

/*
 * ->free_inode: free the i_link buffer of a symlink, then hand the
 * containing shmem_inode_info back to the slab cache.
 */
static void shmem_free_in_core_inode(struct inode *inode)
{
        if (S_ISLNK(inode->i_mode)) {
                /* Short symlinks keep their target in i_link, see shmem_symlink() */
                kfree(inode->i_link);
        }

        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}

/* ->destroy_inode: for regular files, release the shared memory policy. */
static void shmem_destroy_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode))
                return;

        mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}

/*
 * Slab constructor for shmem_inode_cachep: @foo is a struct
 * shmem_inode_info whose embedded VFS inode gets its one-time setup.
 */
static void shmem_init_inode(void *foo)
{
        struct shmem_inode_info *shmem = foo;
        struct inode *vfs = &shmem->vfs_inode;

        inode_init_once(vfs);
}

/*
 * Create the slab cache for shmem inodes.  SLAB_PANIC is passed, so no
 * failure is reported to the caller; shmem_init_inode() is the constructor.
 */
static void shmem_init_inodecache(void)
{
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
                                0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}

/* Destroy the shmem inode slab cache; used on the shmem_init() failure path. */
static void shmem_destroy_inodecache(void)
{
        kmem_cache_destroy(shmem_inode_cachep);
}

/*
 * Address space operations for shmem mappings.  write_begin/write_end are
 * only provided with CONFIG_TMPFS, migratepage only with CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
        .writepage        = shmem_writepage,
        .set_page_dirty        = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
        .write_begin        = shmem_write_begin,
        .write_end        = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
        .migratepage        = migrate_page,
#endif
        .error_remove_page = generic_error_remove_page,
};

/*
 * File operations for shmem files.  mmap support is always present; the
 * read/write/seek/splice/fallocate entry points need CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
        .mmap                = shmem_mmap,
        .get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
        .llseek                = shmem_file_llseek,
        .read_iter        = shmem_file_read_iter,
        .write_iter        = generic_file_write_iter,
        .fsync                = noop_fsync,
        .splice_read        = generic_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = shmem_fallocate,
#endif
};

/*
 * Inode operations table (presumably for regular files; confirm where it is
 * assigned).  listxattr and set_acl are only present with CONFIG_TMPFS_XATTR.
 */
static const struct inode_operations shmem_inode_operations = {
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Inode operations for directories.  The namespace operations need
 * CONFIG_TMPFS; listxattr needs CONFIG_TMPFS_XATTR; setattr and set_acl are
 * only wired up with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
        .create                = shmem_create,
        .lookup                = simple_lookup,
        .link                = shmem_link,
        .unlink                = shmem_unlink,
        .symlink        = shmem_symlink,
        .mkdir                = shmem_mkdir,
        .rmdir                = shmem_rmdir,
        .mknod                = shmem_mknod,
        .rename                = shmem_rename2,
        .tmpfile        = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Inode operations table (by its name, for special inodes; confirm where it
 * is assigned).  It is empty unless CONFIG_TMPFS_XATTR or
 * CONFIG_TMPFS_POSIX_ACL is enabled.
 */
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Superblock operations, installed as sb->s_op by shmem_fill_super().
 * statfs/show_options need CONFIG_TMPFS; the cached-object hooks need
 * CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
        .alloc_inode        = shmem_alloc_inode,
        .free_inode        = shmem_free_in_core_inode,
        .destroy_inode        = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
        .statfs                = shmem_statfs,
        .show_options        = shmem_show_options,
#endif
        .evict_inode        = shmem_evict_inode,
        .drop_inode        = generic_delete_inode,
        .put_super        = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        .nr_cached_objects        = shmem_unused_huge_count,
        .free_cached_objects        = shmem_unused_huge_scan,
#endif
};

/* VMA operations for shmem mappings; the policy hooks are only present with CONFIG_NUMA. */
static const struct vm_operations_struct shmem_vm_ops = {
        .fault                = shmem_fault,
        .map_pages        = filemap_map_pages,
#ifdef CONFIG_NUMA
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
#endif
};

/*
 * ->init_fs_context: allocate zeroed shmem_options with the default mode
 * (0777 plus sticky bit) and the caller's fsuid/fsgid as owner, then attach
 * them and the shmem context operations to @fc.
 *
 * Returns 0 on success or -ENOMEM.
 */
int shmem_init_fs_context(struct fs_context *fc)
{
        struct shmem_options *ctx = kzalloc(sizeof(struct shmem_options),
                                            GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        ctx->uid = current_fsuid();
        ctx->gid = current_fsgid();
        ctx->mode = 0777 | S_ISVTX;

        fc->ops = &shmem_fs_context_ops;
        fc->fs_private = ctx;
        return 0;
}

/*
 * The "tmpfs" filesystem type registered by shmem_init().  The parameter
 * table is only exposed with CONFIG_TMPFS.
 */
static struct file_system_type shmem_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "tmpfs",
        .init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
        .parameters        = shmem_fs_parameters,
#endif
        .kill_sb        = kill_litter_super,
        .fs_flags        = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};

/*
 * Boot-time initialisation: create the inode cache, register tmpfs and set
 * up the internal kernel mount shm_mnt.
 *
 * Returns 0 on success.  On failure the completed steps are undone,
 * shm_mnt is set to ERR_PTR(error) and the error is returned.
 */
int __init shmem_init(void)
{
        int error;

        shmem_init_inodecache();

        error = register_filesystem(&shmem_fs_type);
        if (error) {
                pr_err("Could not register tmpfs\n");
                goto out_destroy_cache;
        }

        shm_mnt = kern_mount(&shmem_fs_type);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                pr_err("Could not kern_mount tmpfs\n");
                goto out_unregister;
        }

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (!has_transparent_hugepage() || shmem_huge <= SHMEM_HUGE_DENY)
                shmem_huge = 0; /* just in case it was patched */
        else
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
#endif
        return 0;

out_unregister:
        unregister_filesystem(&shmem_fs_type);
out_destroy_cache:
        shmem_destroy_inodecache();
        shm_mnt = ERR_PTR(error);
        return error;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
/*
 * sysfs show for shmem_enabled: print every huge page policy name separated
 * by spaces, with the current shmem_huge value in square brackets, followed
 * by a newline.  Returns the number of bytes written to @buf.
 */
static ssize_t shmem_enabled_show(struct kobject *kobj,
                struct kobj_attribute *attr, char *buf)
{
        static const int values[] = {
                SHMEM_HUGE_ALWAYS,
                SHMEM_HUGE_WITHIN_SIZE,
                SHMEM_HUGE_ADVISE,
                SHMEM_HUGE_NEVER,
                SHMEM_HUGE_DENY,
                SHMEM_HUGE_FORCE,
        };
        int count = 0;
        int i;

        for (i = 0; i < ARRAY_SIZE(values); i++) {
                const char *fmt;

                if (shmem_huge == values[i])
                        fmt = "[%s] ";
                else
                        fmt = "%s ";

                count += sprintf(buf + count, fmt,
                                shmem_format_huge(values[i]));
        }

        /* Every entry ends in a space; turn the last one into the newline */
        buf[count - 1] = '\n';
        return count;
}

/*
 * sysfs store for shmem_enabled: parse a policy name from @buf (an optional
 * trailing newline is stripped) and make it the global shmem_huge setting.
 * Values above SHMEM_HUGE_DENY are also applied to the internal mount.
 *
 * Returns @count on success, or -EINVAL if the input does not fit the
 * local buffer, is not a known policy, or asks for anything other than
 * "never"/"deny" without transparent hugepage support.
 */
static ssize_t shmem_enabled_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        char word[16];
        int huge;

        if (count + 1 > sizeof(word))
                return -EINVAL;

        memcpy(word, buf, count);
        word[count] = '\0';
        if (count && word[count - 1] == '\n')
                word[count - 1] = '\0';

        huge = shmem_parse_huge(word);
        if (huge == -EINVAL)
                return -EINVAL;

        if (!has_transparent_hugepage()) {
                if (huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
                        return -EINVAL;
        }

        shmem_huge = huge;
        if (shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        return count;
}

/* sysfs attribute "shmem_enabled" (mode 0644) backed by the show/store pair above. */
struct kobj_attribute shmem_enabled_attr =
        __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Decide whether huge pages may be used for the shmem mapping @vma.
 *
 * Order of checks: transhuge_vma_enabled() must allow it, then the global
 * shmem_huge FORCE/DENY values short-circuit the decision, and finally the
 * per-superblock policy (sbinfo->huge) is consulted.
 */
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        loff_t i_size;
        pgoff_t off;

        if (!transhuge_vma_enabled(vma, vma->vm_flags))
                return false;

        /* The global setting wins at both extremes. */
        if (shmem_huge == SHMEM_HUGE_FORCE)
                return true;
        if (shmem_huge == SHMEM_HUGE_DENY)
                return false;

        switch (sbinfo->huge) {
        case SHMEM_HUGE_NEVER:
                return false;
        case SHMEM_HUGE_ALWAYS:
                return true;
        case SHMEM_HUGE_WITHIN_SIZE:
                /*
                 * Allowed when the page-rounded file size is at least one
                 * huge page and reaches the huge-page-rounded vma offset.
                 */
                off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
                i_size = round_up(i_size_read(inode), PAGE_SIZE);
                if (i_size >= HPAGE_PMD_SIZE &&
                    (i_size >> PAGE_SHIFT) >= off)
                        return true;
                /* Otherwise treated exactly like SHMEM_HUGE_ADVISE. */
                break;
        case SHMEM_HUGE_ADVISE:
                break;
        default:
                VM_BUG_ON(1);
                return false;
        }

        /* TODO: implement fadvise() hints */
        return (vma->vm_flags & VM_HUGEPAGE) != 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

/*
 * tiny-shmem filesystem type: registered under the name "tmpfs", with every
 * operation delegated to the corresponding ramfs implementation.
 */
static struct file_system_type shmem_fs_type = {
        .name            = "tmpfs",
        .fs_flags        = FS_USERNS_MOUNT,
        .init_fs_context = ramfs_init_fs_context,
        .parameters      = ramfs_fs_parameters,
        .kill_sb         = ramfs_kill_sb,
};

/*
 * tiny-shmem initialisation: register the filesystem type and create the
 * internal mount stored in shm_mnt.  Either step failing triggers BUG_ON().
 * Always returns 0.
 */
int __init shmem_init(void)
{
        int err;

        err = register_filesystem(&shmem_fs_type);
        BUG_ON(err != 0);

        shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));

        return 0;
}

/*
 * tiny-shmem stub: ignores all of its arguments, does no work and always
 * returns 0.
 */
int shmem_unuse(unsigned int type, bool frontswap,
                unsigned long *fs_pages_to_unuse)
{
        return 0;
}

/*
 * tiny-shmem stub: ignores all of its arguments, does no work and always
 * returns 0.
 */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
        return 0;
}

/* tiny-shmem stub: intentionally empty, @mapping is not touched. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long addr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

/*
 * tiny-shmem version: truncate the byte range [@lstart, @lend] of @inode by
 * handing its page cache mapping to truncate_inode_pages_range().
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = inode->i_mapping;

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * tiny-shmem aliases: map the shmem names used by the common code onto
 * their generic/ramfs counterparts.  shmem_get_inode() drops its @flags
 * argument, shmem_acct_size() always evaluates to 0 and shmem_unacct_size()
 * expands to an empty statement.
 */
#define shmem_vm_ops                                generic_file_vm_ops
#define shmem_file_operations                        ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags)        ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)                0
#define shmem_unacct_size(flags, size)                do {} while (0)

#endif /* CONFIG_SHMEM */

/* common code */

/*
 * Common worker behind the shmem_*file_setup() helpers: create an unlinked
 * regular file of @size bytes on @mnt.
 *
 * @mnt:     mount to create the file on; an ERR_PTR() value is passed back
 * @name:    name handed to alloc_file_pseudo()
 * @size:    file size, must lie within [0, MAX_LFS_FILESIZE]
 * @flags:   passed to shmem_acct_size()/shmem_get_inode()
 * @i_flags: extra bits OR-ed into inode->i_flags
 *
 * Returns the new file, or an ERR_PTR(): -EINVAL for a bad @size, -ENOMEM
 * when size accounting fails, -ENOSPC when no inode could be obtained, or
 * whatever the later setup steps report.
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
                                       unsigned long flags, unsigned int i_flags)
{
        struct inode *inode;
        struct file *file;

        if (IS_ERR(mnt))
                return ERR_CAST(mnt);

        if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);

        if (shmem_acct_size(flags, size))
                return ERR_PTR(-ENOMEM);

        inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
        if (unlikely(!inode)) {
                /* Undo the accounting done just above. */
                shmem_unacct_size(flags, size);
                return ERR_PTR(-ENOSPC);
        }

        inode->i_flags |= i_flags;
        inode->i_size = size;
        clear_nlink(inode);        /* It is unlinked */

        file = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
        if (IS_ERR(file))
                goto put_inode;

        file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                 &shmem_file_operations);
        if (IS_ERR(file))
                goto put_inode;

        return file;

put_inode:
        /* Either late step failed: drop the inode reference taken above. */
        iput(inode);
        return file;
}

/**
 * shmem_kernel_file_setup - get an unlinked, kernel internal file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * There will be NO LSM permission checks against the underlying inode, so
 * users of this interface must do LSM checks at a higher layer.  The users
 * are the big_key and shm implementations.  LSM checks are provided at the
 * key or shm level rather than the inode.
 *
 * The file is created on the internal shm_mnt mount with S_PRIVATE set in
 * its inode flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with no extra inode
 * flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), except that the file is created on the
 * caller-supplied @mnt instead of the internal shm_mnt mount.
 *
 * Return: the new file, or an ERR_PTR() on failure (including when @mnt
 * itself is an ERR_PTR() value).
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
                                       loff_t size, unsigned long flags)
{
        return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 *
 * Creates a kernel internal tmpfs file named "dev/zero" that is as large as
 * the vma and installs it as the backing file of @vma.
 *
 * Return: 0 on success, or the negative errno from the file creation.
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
        struct file *file;
        loff_t size = vma->vm_end - vma->vm_start;

        /*
         * Cloning a new file under mmap_lock leads to a lock ordering conflict
         * between XFS directory reading and selinux: since this file is only
         * accessible to the user through its mapping, use S_PRIVATE flag to
         * bypass file security, in the same way as shmem_kernel_file_setup().
         */
        file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
        if (IS_ERR(file))
                return PTR_ERR(file);

        /* Replace any file already attached to the vma, dropping its reference. */
        if (vma->vm_file)
                fput(vma->vm_file);
        vma->vm_file = file;
        vma->vm_ops = &shmem_vm_ops;

        /*
         * Register with khugepaged only when vm_start rounded up is still
         * below vm_end rounded down, both via HPAGE_PMD_MASK (presumably
         * meaning the vma covers at least one aligned huge page range).
         * NOTE(review): the HPAGE_PMD_MASK expression is kept behind the
         * IS_ENABLED() test on purpose - presumably it is only usable when
         * CONFIG_TRANSPARENT_HUGEPAGE is set; confirm before hoisting it.
         */
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
                        (vma->vm_end & HPAGE_PMD_MASK)) {
                khugepaged_enter(vma, vma->vm_flags);
        }

        return 0;
}

/**
 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->readpage() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
        struct page *page;
        int error;

        /* Only mappings that use shmem_aops may be passed in. */
        BUG_ON(mapping->a_ops != &shmem_aops);

        error = shmem_getpage_gfp(mapping->host, index, &page, SGP_CACHE,
                                  gfp, NULL, NULL, NULL);
        if (error)
                return ERR_PTR(error);

        /* Drop the page lock before handing the page to the caller. */
        unlock_page(page);
        return page;
#else
        /*
         * The tiny !SHMEM case uses ramfs without swap
         */
        return read_cache_page_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);

































































    4 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CTYPE_H
#define _LINUX_CTYPE_H

/*
 * NOTE! This ctype does not handle EOF like the standard C
 * library is required to.
 */

/*
 * Character class bits.  Each entry of _ctype[] is an OR of these bits and
 * the is*() macros test them through __ismask().
 */
#define _U        0x01        /* upper */
#define _L        0x02        /* lower */
#define _D        0x04        /* digit */
#define _C        0x08        /* cntrl */
#define _P        0x10        /* punct */
#define _S        0x20        /* white space (space/lf/tab) */
#define _X        0x40        /* hex digit */
#define _SP        0x80        /* hard space (0x20) */

/* Class bit table, indexed by character value; defined elsewhere. */
extern const unsigned char _ctype[];

/*
 * Look up the class bits of x.  The cast through unsigned char keeps the
 * index non-negative even when x is a negative (signed) char value.
 */
#define __ismask(x) (_ctype[(int)(unsigned char)(x)])

/* Each test evaluates to 1 when any of the listed class bits is set, else 0. */
#define isalnum(c)        ((__ismask(c)&(_U|_L|_D)) != 0)
#define isalpha(c)        ((__ismask(c)&(_U|_L)) != 0)
#define iscntrl(c)        ((__ismask(c)&(_C)) != 0)
/*
 * Range-based decimal digit test: returns 1 for '0'..'9' and 0 otherwise,
 * without consulting the _ctype[] table.
 */
static inline int isdigit(int c)
{
        if (c < '0')
                return 0;

        return c <= '9';
}
/* Table-driven class tests: 1 when any listed class bit is set, else 0. */
#define isgraph(c)        ((__ismask(c)&(_P|_U|_L|_D)) != 0)
#define islower(c)        ((__ismask(c)&(_L)) != 0)
/* isprint() is isgraph() plus the hard space class (_SP). */
#define isprint(c)        ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0)
#define ispunct(c)        ((__ismask(c)&(_P)) != 0)
/* Note: isspace() must return false for %NUL-terminator */
#define isspace(c)        ((__ismask(c)&(_S)) != 0)
#define isupper(c)        ((__ismask(c)&(_U)) != 0)
#define isxdigit(c)        ((__ismask(c)&(_D|_X)) != 0)

/* isascii(): value fits in 7 bits; toascii(): keep only the low 7 bits. */
#define isascii(c) (((unsigned char)(c))<=0x7f)
#define toascii(c) (((unsigned char)(c))&0x7f)

/*
 * Convert @c to lower case: characters classified as upper case by
 * isupper() are shifted by the 'A' -> 'a' distance, all others are
 * returned unchanged.
 */
static inline unsigned char __tolower(unsigned char c)
{
        return isupper(c) ? c + ('a' - 'A') : c;
}

/*
 * Convert @c to upper case: characters classified as lower case by
 * islower() are shifted by the 'a' -> 'A' distance, all others are
 * returned unchanged.
 */
static inline unsigned char __toupper(unsigned char c)
{
        return islower(c) ? c - ('a' - 'A') : c;
}

/*
 * Public case conversion names; the argument is converted to unsigned char
 * by the call to the inline helper.
 */
#define tolower(c) __tolower(c)
#define toupper(c) __toupper(c)

/*
 * Fast implementation of tolower() for internal usage. Do not use in your
 * code.
 */
static inline char _tolower(const char c)
{
        /* Unconditionally sets bit 0x20, whatever character @c is. */
        const char case_bit = 0x20;

        return c | case_bit;
}

/* Fast check for octal digit */
static inline int isodigit(const char c)
{
        /* Octal digits are exactly the range '0'..'7'. */
        if (c < '0')
                return 0;

        return c <= '7';
}

#endif




























































































































































































































































































































































































































































































































































































































































































































































































































































    3 

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Hash: Hash algorithms under the crypto API
 * 
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_HASH_H
#define _CRYPTO_HASH_H

#include <linux/crypto.h>
#include <linux/string.h>

struct crypto_ahash;

/**
 * DOC: Message Digest Algorithm Definitions
 *
 * These data structures define modular message digest algorithm
 * implementations, managed via crypto_register_ahash(),
 * crypto_register_shash(), crypto_unregister_ahash() and
 * crypto_unregister_shash().
 */

/**
 * struct hash_alg_common - define properties of message digest
 * @digestsize: Size of the result of the transformation. A buffer of this size
 *                must be available to the @final and @finup calls, so they can
 *                store the resulting hash into it. For various predefined sizes,
 *                search include/crypto/ using
 *                git grep _DIGEST_SIZE include/crypto.
 * @statesize: Size of the block for partial state of the transformation. A
 *               buffer of this size must be passed to the @export function as it
 *               will save the partial state of the transformation into it. On the
 *               other side, the @import function will load the state from a
 *               buffer of this size as well.
 * @base: Start of data structure of cipher algorithm. The common data
 *          structure of crypto_alg contains information common to all ciphers.
 *          The hash_alg_common data structure now adds the hash-specific
 *          information.
 */
struct hash_alg_common {
        /*
         * Layout note: struct shash_alg ends with the same three members
         * (digestsize, statesize, base) and documents that they must match
         * this structure, so keep the two definitions in sync.
         */
        unsigned int digestsize;
        unsigned int statesize;

        struct crypto_alg base;
};

/*
 * Request handle passed to every per-request callback of struct ahash_alg
 * and struct crypto_ahash.
 */
struct ahash_request {
        /* Generic async request; base.tfm is read by crypto_ahash_reqtfm(). */
        struct crypto_async_request base;

        /* presumably the number of bytes to process from @src - TODO confirm */
        unsigned int nbytes;
        /* presumably the input data as a scatterlist - TODO confirm */
        struct scatterlist *src;
        /* presumably the buffer receiving the digest - TODO confirm */
        u8 *result;

        /* This field may only be used by the ahash API code. */
        void *priv;

        /* Trailing context area, returned by ahash_request_ctx(). */
        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/**
 * struct ahash_alg - asynchronous message digest definition
 * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the
 *          state of the HASH transformation at the beginning. This shall fill in
 *          the internal structures used during the entire duration of the whole
 *          transformation. No data processing happens at this point. Driver code
 *          implementation must not use req->result.
 * @update: **[mandatory]** Push a chunk of data into the driver for transformation. This
 *           function actually pushes blocks of data from upper layers into the
 *           driver, which then passes those to the hardware as seen fit. This
 *           function must not finalize the HASH transformation by calculating the
 *           final message digest as this only adds more data into the
 *           transformation. This function shall not modify the transformation
 *           context, as this function may be called in parallel with the same
 *           transformation object. Data processing can happen synchronously
 *           [SHASH] or asynchronously [AHASH] at this point. Driver must not use
 *           req->result.
 * @final: **[mandatory]** Retrieve result from the driver. This function finalizes the
 *           transformation and retrieves the resulting hash from the driver and
 *           pushes it back to upper layers. No data processing happens at this
 *           point unless hardware requires it to finish the transformation
 *           (then the data buffered by the device driver is processed).
 * @finup: **[optional]** Combination of @update and @final. This function is effectively a
 *           combination of @update and @final calls issued in sequence. As some
 *           hardware cannot do @update and @final separately, this callback was
 *           added to allow such hardware to be used at least by IPsec. Data
 *           processing can happen synchronously [SHASH] or asynchronously [AHASH]
 *           at this point.
 * @digest: Combination of @init and @update and @final. This function
 *            effectively behaves as the entire chain of operations, @init,
 *            @update and @final issued in sequence. Just like @finup, this was
 *            added for hardware which cannot do even the @finup, but can only do
 *            the whole transformation in one run. Data processing can happen
 *            synchronously [SHASH] or asynchronously [AHASH] at this point.
 * @setkey: Set optional key used by the hashing algorithm. Intended to push
 *            optional key used by the hashing algorithm from upper layers into
 *            the driver. This function can store the key in the transformation
 *            context or can outright program it into the hardware. In the former
 *            case, one must be careful to program the key into the hardware at
 *            appropriate time and one must be careful that .setkey() can be
 *            called multiple times during the existence of the transformation
 *            object. Not all hashing algorithms do implement this function as it
 *            is only needed for keyed message digests. SHAx/MDx/CRCx do NOT
 *            implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement
 *            this function. This function must be called before any other of the
 *            @init, @update, @final, @finup, @digest is called. No data
 *            processing happens at this point.
 * @export: Export partial state of the transformation. This function dumps the
 *            entire state of the ongoing transformation into a provided block of
 *            data so it can be @import 'ed back later on. This is useful in case
 *            you want to save partial result of the transformation after
 *            processing certain amount of data and reload this partial result
 *            multiple times later on for multiple re-use. No data processing
 *            happens at this point. Driver must not use req->result.
 * @import: Import partial state of the transformation. This function loads the
 *            entire state of the ongoing transformation from a provided block of
 *            data so the transformation can continue from this point onward. No
 *            data processing happens at this point. Driver must not use
 *            req->result.
 * @init_tfm: Initialize the cryptographic transformation object.
 *              This function is called only once at the instantiation
 *              time, right after the transformation context was
 *              allocated. In case the cryptographic hardware has
 *              some special requirements which need to be handled
 *              by software, this function shall check for the precise
 *              requirement of the transformation and put any software
 *              fallbacks in place.
 * @exit_tfm: Deinitialize the cryptographic transformation object.
 *              This is a counterpart to @init_tfm, used to remove
 *              various changes set in @init_tfm.
 * @halg: see struct hash_alg_common
 */
struct ahash_alg {
        /* Callbacks operating on a single request. */
        int (*init)(struct ahash_request *req);
        int (*update)(struct ahash_request *req);
        int (*final)(struct ahash_request *req);
        int (*finup)(struct ahash_request *req);
        int (*digest)(struct ahash_request *req);
        int (*export)(struct ahash_request *req, void *out);
        int (*import)(struct ahash_request *req, const void *in);
        /* Callbacks operating on the transformation object itself. */
        int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
                      unsigned int keylen);
        int (*init_tfm)(struct crypto_ahash *tfm);
        void (*exit_tfm)(struct crypto_ahash *tfm);

        /* Digest/state sizes plus the generic crypto_alg. */
        struct hash_alg_common halg;
};

/*
 * Operational state handle passed to every per-operation callback of
 * struct shash_alg.
 */
struct shash_desc {
        /* Transformation object this descriptor belongs to. */
        struct crypto_shash *tfm;
        /*
         * Trailing per-operation state; struct shash_alg documents its
         * descsize member as the amount of memory needed for this area.
         */
        void *__ctx[] __aligned(ARCH_SLAB_MINALIGN);
};

/* presumably the largest digest size of any hash algorithm, in bytes - TODO confirm */
#define HASH_MAX_DIGESTSIZE         64

/*
 * Worst case is hmac(sha3-224-generic).  Its context is a nested 'shash_desc'
 * containing a 'struct sha3_state'.
 */
#define HASH_MAX_DESCSIZE        (sizeof(struct shash_desc) + 360)

/* presumably the largest exported state size, in bytes - TODO confirm */
#define HASH_MAX_STATESIZE        512

/*
 * Declare a suitably aligned char buffer on the stack and a pointer @shash
 * of type struct shash_desc * that refers to it.  The @ctx argument is not
 * used by the expansion.
 *
 * NOTE(review): the buffer is sizeof(struct shash_desc) + HASH_MAX_DESCSIZE
 * bytes, and HASH_MAX_DESCSIZE already contains sizeof(struct shash_desc),
 * so the header size is counted twice.  This only over-allocates; confirm
 * whether that is intended before changing it.
 */
#define SHASH_DESC_ON_STACK(shash, ctx)                                             \
        char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \
                __aligned(__alignof__(struct shash_desc));                     \
        struct shash_desc *shash = (struct shash_desc *)__##shash##_desc

/**
 * struct shash_alg - synchronous message digest definition
 * @init: see struct ahash_alg
 * @update: see struct ahash_alg
 * @final: see struct ahash_alg
 * @finup: see struct ahash_alg
 * @digest: see struct ahash_alg
 * @export: see struct ahash_alg
 * @import: see struct ahash_alg
 * @setkey: see struct ahash_alg
 * @init_tfm: Initialize the cryptographic transformation object.
 *              This function is called only once at the instantiation
 *              time, right after the transformation context was
 *              allocated. In case the cryptographic hardware has
 *              some special requirements which need to be handled
 *              by software, this function shall check for the precise
 *              requirement of the transformation and put any software
 *              fallbacks in place.
 * @exit_tfm: Deinitialize the cryptographic transformation object.
 *              This is a counterpart to @init_tfm, used to remove
 *              various changes set in @init_tfm.
 * @digestsize: see struct ahash_alg
 * @statesize: see struct ahash_alg
 * @descsize: Size of the operational state for the message digest. This state
 *               size is the memory size that needs to be allocated for
 *              shash_desc.__ctx
 * @base: internally used
 */
struct shash_alg {
        /*
         * Callbacks operating on a descriptor; unlike the ahash variants
         * they receive the data and output buffers as direct arguments.
         */
        int (*init)(struct shash_desc *desc);
        int (*update)(struct shash_desc *desc, const u8 *data,
                      unsigned int len);
        int (*final)(struct shash_desc *desc, u8 *out);
        int (*finup)(struct shash_desc *desc, const u8 *data,
                     unsigned int len, u8 *out);
        int (*digest)(struct shash_desc *desc, const u8 *data,
                      unsigned int len, u8 *out);
        int (*export)(struct shash_desc *desc, void *out);
        int (*import)(struct shash_desc *desc, const void *in);
        /* Callbacks operating on the transformation object itself. */
        int (*setkey)(struct crypto_shash *tfm, const u8 *key,
                      unsigned int keylen);
        int (*init_tfm)(struct crypto_shash *tfm);
        void (*exit_tfm)(struct crypto_shash *tfm);

        unsigned int descsize;

        /* These fields must match hash_alg_common. */
        /*
         * digestsize is given the alignment of struct hash_alg_common so
         * that the three trailing members are laid out like that structure.
         */
        unsigned int digestsize
                __attribute__ ((aligned(__alignof__(struct hash_alg_common))));
        unsigned int statesize;

        struct crypto_alg base;
};

/*
 * Asynchronous hash transformation handle.  The operation pointers have the
 * same signatures as the members of the same name in struct ahash_alg.
 */
struct crypto_ahash {
        int (*init)(struct ahash_request *req);
        int (*update)(struct ahash_request *req);
        int (*final)(struct ahash_request *req);
        int (*finup)(struct ahash_request *req);
        int (*digest)(struct ahash_request *req);
        int (*export)(struct ahash_request *req, void *out);
        int (*import)(struct ahash_request *req, const void *in);
        int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
                      unsigned int keylen);

        /* Size of the request data, as reported by crypto_ahash_reqsize(). */
        unsigned int reqsize;
        /* Embedded generic tfm; __crypto_ahash_cast() maps it back to this struct. */
        struct crypto_tfm base;
};

/* Synchronous hash transformation handle. */
struct crypto_shash {
        /* presumably the size of the shash_desc context for this tfm - TODO confirm */
        unsigned int descsize;
        /* Embedded generic tfm. */
        struct crypto_tfm base;
};

/**
 * DOC: Asynchronous Message Digest API
 *
 * The asynchronous message digest API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto)
 *
 * The asynchronous cipher operation discussion provided for the
 * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well.
 */

/*
 * Map a generic tfm back to the struct crypto_ahash that embeds it as its
 * ->base member.
 */
static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
{
        struct crypto_ahash *ahash;

        ahash = container_of(tfm, struct crypto_ahash, base);
        return ahash;
}

/**
 * crypto_alloc_ahash() - allocate ahash cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              ahash cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an ahash. The returned struct
 * crypto_ahash is the cipher handle that is required for any subsequent
 * API invocation for that ahash.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
                                        u32 mask);

/* Return the generic tfm embedded in the ahash handle @tfm. */
static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_ahash() - zeroize and free the ahash handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_ahash(struct crypto_ahash *tfm)
{
        /* Only the address of ->base is computed here; @tfm is not dereferenced. */
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/**
 * crypto_has_ahash() - Search for the availability of an ahash.
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              ahash
 * @type: specifies the type of the ahash
 * @mask: specifies the mask for the ahash
 *
 * Return: true when the ahash is known to the kernel crypto API; false
 *           otherwise
 */
int crypto_has_ahash(const char *alg_name, u32 type, u32 mask);

/* Return the algorithm name of the ahash handle @tfm. */
static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/* Return the driver name of the ahash handle @tfm. */
static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/* Return the alignment mask of the algorithm behind the ahash handle @tfm. */
static inline unsigned int crypto_ahash_alignmask(
        struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/**
 * crypto_ahash_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * The block size for the message digest cipher referenced with the cipher
 * handle is returned.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/*
 * Map a generic crypto_alg back to the struct hash_alg_common that embeds
 * it as its ->base member.
 */
static inline struct hash_alg_common *__crypto_hash_alg_common(
        struct crypto_alg *alg)
{
        struct hash_alg_common *halg;

        halg = container_of(alg, struct hash_alg_common, base);
        return halg;
}

/*
 * Return the hash_alg_common of the algorithm that the ahash handle @tfm
 * was instantiated from (looked up through the tfm's __crt_alg pointer).
 */
static inline struct hash_alg_common *crypto_hash_alg_common(
        struct crypto_ahash *tfm)
{
        struct crypto_alg *alg = crypto_ahash_tfm(tfm)->__crt_alg;

        return __crypto_hash_alg_common(alg);
}

/**
 * crypto_ahash_digestsize() - obtain message digest size
 * @tfm: cipher handle
 *
 * The size for the message digest created by the message digest cipher
 * referenced with the cipher handle is returned.
 *
 *
 * Return: message digest size of cipher
 */
static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm)
{
        return crypto_hash_alg_common(tfm)->digestsize;
}

/**
 * crypto_ahash_statesize() - obtain size of the ahash state
 * @tfm: cipher handle
 *
 * Return the size of the ahash state. With the crypto_ahash_export()
 * function, the caller can export the state into a buffer whose size is
 * defined with this function.
 *
 * Return: size of the ahash state
 */
static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm)
{
        return crypto_hash_alg_common(tfm)->statesize;
}

/* Return the flags of the generic tfm behind the ahash handle @tfm. */
static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Forward @flags to crypto_tfm_set_flags() on the generic tfm behind @tfm. */
static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/**
 * crypto_ahash_clear_flags() - pass flags to crypto_tfm_clear_flags()
 * @tfm: cipher handle
 * @flags: flag bits handed on unchanged
 *
 * Thin wrapper: the call is made on the transform obtained from @tfm.
 */
static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_ahash_reqtfm() - obtain cipher handle from request
 * @req: asynchronous request handle that contains the reference to the ahash
 *         cipher handle
 *
 * Converts the transform pointer stored in the request (req->base.tfm) back
 * into the ahash cipher handle registered with the ahash_request.
 *
 * Return: ahash cipher handle
 */
static inline struct crypto_ahash *crypto_ahash_reqtfm(
        struct ahash_request *req)
{
        struct crypto_tfm *base = req->base.tfm;

        return __crypto_ahash_cast(base);
}

/**
 * crypto_ahash_reqsize() - obtain size of the request data structure
 * @tfm: cipher handle
 *
 * The value is the reqsize member of @tfm. ahash_request_alloc() and
 * ahash_request_zero() add it to sizeof(struct ahash_request) to get the
 * full size of a request.
 *
 * Return: size of the request data
 */
static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
{
        const unsigned int size = tfm->reqsize;

        return size;
}

/**
 * ahash_request_ctx() - obtain the context area of a request
 * @req: ahash_request handle
 *
 * Return: the __ctx member of @req as an untyped pointer
 */
static inline void *ahash_request_ctx(struct ahash_request *req)
{
        void *ctx = req->__ctx;

        return ctx;
}

/**
 * crypto_ahash_setkey - set key for cipher handle
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the ahash cipher. The cipher
 * handle must point to a keyed hash in order for this function to succeed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
                        unsigned int keylen);

/**
 * crypto_ahash_finup() - update and finalize message digest
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * This function is a "short-hand" for the function calls of
 * crypto_ahash_update and crypto_ahash_final. The parameters have the same
 * meaning as discussed for those separate functions.
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_finup(struct ahash_request *req);

/**
 * crypto_ahash_final() - calculate message digest
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Finalize the message digest operation and create the message digest
 * based on all data added to the cipher handle. The message digest is placed
 * into the output buffer registered with the ahash_request handle.
 *
 * Return:
 * 0                if the message digest was successfully calculated;
 * -EINPROGRESS        if data is fed into hardware (DMA) or queued for later;
 * -EBUSY        if queue is full and request should be resubmitted later;
 * other < 0        if an error occurred
 */
int crypto_ahash_final(struct ahash_request *req);

/**
 * crypto_ahash_digest() - calculate message digest for a buffer
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * This function is a "short-hand" for the function calls of crypto_ahash_init,
 * crypto_ahash_update and crypto_ahash_final. The parameters have the same
 * meaning as discussed for those separate three functions.
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_digest(struct ahash_request *req);

/**
 * crypto_ahash_export() - extract current message digest state
 * @req: reference to the ahash_request handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * This function exports the hash state of the ahash_request handle into the
 * caller-allocated output buffer out which must have sufficient size (e.g. by
 * calling crypto_ahash_statesize()).
 *
 * Return: 0 if the export was successful; < 0 if an error occurred
 */
static inline int crypto_ahash_export(struct ahash_request *req, void *out)
{
        return crypto_ahash_reqtfm(req)->export(req, out);
}

/**
 * crypto_ahash_import() - import message digest state
 * @req: reference to ahash_request handle the state is imported into
 * @in: buffer holding the state
 *
 * This function imports the hash state into the ahash_request handle from the
 * input buffer. That buffer should have been generated with the
 * crypto_ahash_export function.
 *
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
static inline int crypto_ahash_import(struct ahash_request *req, const void *in)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return tfm->import(req, in);
}

/**
 * crypto_ahash_init() - (re)initialize message digest handle
 * @req: ahash_request handle that already is initialized with all necessary
 *         data using the ahash_request_* API functions
 *
 * The call (re-)initializes the message digest referenced by the ahash_request
 * handle. Any potentially existing state created by previous operations is
 * discarded.
 *
 * Return: see crypto_ahash_final()
 */
static inline int crypto_ahash_init(struct ahash_request *req)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return tfm->init(req);
}

/**
 * crypto_ahash_update() - add data to message digest for processing
 * @req: ahash_request handle that was previously initialized with the
 *         crypto_ahash_init call.
 *
 * Updates the message digest state of the &ahash_request handle through the
 * update callback of its cipher handle. The input data is pointed to by the
 * scatter/gather list registered in the &ahash_request handle.
 *
 * The call is bracketed by crypto_stats_get() and
 * crypto_stats_ahash_update() on the handle's algorithm.
 *
 * Return: see crypto_ahash_final()
 */
static inline int crypto_ahash_update(struct ahash_request *req)
{
        struct crypto_ahash *hash = crypto_ahash_reqtfm(req);
        struct crypto_alg *alg = hash->base.__crt_alg;
        /* Byte count is sampled before the update callback runs. */
        unsigned int len = req->nbytes;
        int err;

        crypto_stats_get(alg);
        err = hash->update(req);
        crypto_stats_ahash_update(len, err, alg);

        return err;
}

/**
 * DOC: Asynchronous Hash Request Handle
 *
 * The &ahash_request data structure contains all pointers to data
 * required for the asynchronous cipher operation. This includes the cipher
 * handle (which can be used by multiple &ahash_request instances), pointer
 * to plaintext and the message digest output buffer, asynchronous callback
 * function, etc. It acts as a handle to the ahash_request_* API calls in a
 * similar way as ahash handle to the crypto_ahash_* API calls.
 */

/**
 * ahash_request_set_tfm() - update cipher handle reference in request
 * @req: request handle to be modified
 * @tfm: cipher handle that shall be added to the request handle
 *
 * Replaces the ahash handle currently referenced by the request data
 * structure with @tfm. Only the req->base.tfm pointer is written.
 */
static inline void ahash_request_set_tfm(struct ahash_request *req,
                                         struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        req->base.tfm = base;
}

/**
 * ahash_request_alloc() - allocate request data structure
 * @tfm: cipher handle to be registered with the request
 * @gfp: memory allocation flag that is handed to kmalloc by the API call.
 *
 * Allocate the request data structure that must be used with the ahash
 * message digest API calls. The allocation covers struct ahash_request plus
 * crypto_ahash_reqsize() bytes for @tfm. On success the provided ahash
 * handle is registered in the request data structure; no other member of
 * the request is initialized here.
 *
 * Return: allocated request handle in case of success, or NULL if out of memory
 */
static inline struct ahash_request *ahash_request_alloc(
        struct crypto_ahash *tfm, gfp_t gfp)
{
        struct ahash_request *req;
        size_t size;

        size = sizeof(*req) + crypto_ahash_reqsize(tfm);
        req = kmalloc(size, gfp);

        if (likely(req))
                ahash_request_set_tfm(req, tfm);

        return req;
}

/**
 * ahash_request_free() - zeroize and free the request data structure
 * @req: request data structure to be freed
 *
 * The request is released with kfree_sensitive().
 */
static inline void ahash_request_free(struct ahash_request *req)
{
        kfree_sensitive(req);
}

static inline void ahash_request_zero(struct ahash_request *req)
{
        memzero_explicit(req, sizeof(*req) +
                              crypto_ahash_reqsize(crypto_ahash_reqtfm(req)));
}

/**
 * ahash_request_cast() - map a crypto_async_request to its ahash_request
 * @req: pointer to the &struct crypto_async_request embedded as member
 *       "base" of a &struct ahash_request
 *
 * Return: the &struct ahash_request that contains @req
 */
static inline struct ahash_request *ahash_request_cast(
        struct crypto_async_request *req)
{
        struct ahash_request *areq;

        areq = container_of(req, struct ahash_request, base);
        return areq;
}

/**
 * ahash_request_set_callback() - set asynchronous callback function
 * @req: request handle
 * @flags: specify zero or an ORing of the flags
 *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
 *           increase the wait queue beyond the initial maximum size;
 *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
 * @compl: callback function pointer to be registered with the request handle
 * @data: pointer to memory that is not used by the kernel crypto API itself
 *          but is handed to the callback function. Because the callback runs
 *          asynchronously to the code that issued the request, it may need
 *          access to that code's data structures; this pointer is the way to
 *          reference them. The callback finds it in the "data" field of the
 *          &crypto_async_request data structure it receives.
 *
 * Registers the callback function that is triggered once the cipher
 * operation completes. The function only stores @flags, @compl and @data in
 * the base member of @req.
 *
 * The callback function must comply with the following template::
 *
 *        void callback_function(struct crypto_async_request *req, int error)
 */
static inline void ahash_request_set_callback(struct ahash_request *req,
                                              u32 flags,
                                              crypto_completion_t compl,
                                              void *data)
{
        req->base.flags = flags;
        req->base.data = data;
        req->base.complete = compl;
}

/**
 * ahash_request_set_crypt() - set data buffers
 * @req: ahash_request handle to be updated
 * @src: source scatter/gather list
 * @result: buffer that is filled with the message digest -- the caller must
 *            ensure that the buffer has sufficient space by, for example, calling
 *            crypto_ahash_digestsize()
 * @nbytes: number of bytes to process from the source scatter/gather list
 *
 * Records in the request which source scatter/gather list holds the data the
 * message digest is to be calculated for, how many bytes of it to process,
 * and where the digest is to be placed. Only the three members are stored;
 * no data is touched.
 */
static inline void ahash_request_set_crypt(struct ahash_request *req,
                                           struct scatterlist *src, u8 *result,
                                           unsigned int nbytes)
{
        req->result = result;
        req->nbytes = nbytes;
        req->src = src;
}

/**
 * DOC: Synchronous Message Digest API
 *
 * The synchronous message digest API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto)
 *
 * The message digest API is able to maintain state information for the
 * caller.
 *
 * The synchronous message digest API can store user-related context in its
 * shash_desc request data structure.
 */

/**
 * crypto_alloc_shash() - allocate message digest handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              message digest cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for a message digest. The returned &struct
 * crypto_shash is the cipher handle that is required for any subsequent
 * API invocation for that message digest.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask);

/**
 * crypto_shash_tfm() - obtain the generic transform of a shash handle
 * @tfm: cipher handle
 *
 * Return: address of the base member embedded in @tfm
 */
static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_shash() - zeroize and free the message digest handle
 * @tfm: cipher handle to be freed
 *
 * Hands @tfm and its embedded generic transform to crypto_destroy_tfm().
 *
 * If @tfm is a NULL or error pointer, this function does nothing. That check
 * is not made in this wrapper; presumably crypto_destroy_tfm() performs it.
 */
static inline void crypto_free_shash(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/**
 * crypto_shash_alg_name() - obtain the algorithm name of a shash handle
 * @tfm: cipher handle
 *
 * Return: the string crypto_tfm_alg_name() reports for the generic transform
 *         embedded in @tfm
 */
static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/**
 * crypto_shash_driver_name() - obtain the driver name of a shash handle
 * @tfm: cipher handle
 *
 * Return: the string crypto_tfm_alg_driver_name() reports for the generic
 *         transform embedded in @tfm
 */
static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/**
 * crypto_shash_alignmask() - obtain alignment mask for a shash handle
 * @tfm: cipher handle
 *
 * Return: the value crypto_tfm_alg_alignmask() reports for the generic
 *         transform embedded in @tfm
 */
static inline unsigned int crypto_shash_alignmask(
        struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/**
 * crypto_shash_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * Looks up the block size of the message digest cipher that the cipher
 * handle refers to.
 *
 * Return: block size of cipher, as reported by crypto_tfm_alg_blocksize()
 */
static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/**
 * __crypto_shash_alg() - map a crypto_alg to its shash_alg
 * @alg: pointer to the &struct crypto_alg embedded as member "base" of a
 *       &struct shash_alg
 *
 * Return: the &struct shash_alg that contains @alg
 */
static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
{
        struct shash_alg *salg;

        salg = container_of(alg, struct shash_alg, base);
        return salg;
}

/**
 * crypto_shash_alg() - obtain the shash_alg behind a shash handle
 * @tfm: cipher handle
 *
 * Reads the algorithm pointer (__crt_alg) of the generic transform embedded
 * in @tfm and converts it with __crypto_shash_alg().
 *
 * Return: the &struct shash_alg of the algorithm used by @tfm
 */
static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
{
        struct crypto_alg *alg = crypto_shash_tfm(tfm)->__crt_alg;

        return __crypto_shash_alg(alg);
}

/**
 * crypto_shash_digestsize() - obtain message digest size
 * @tfm: cipher handle
 *
 * The size for the message digest created by the message digest cipher
 * referenced with the cipher handle is returned.
 *
 * Return: digest size of cipher
 */
static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->digestsize;
}

static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->statesize;
}

/**
 * crypto_shash_get_flags() - read the flags of a shash handle
 * @tfm: cipher handle
 *
 * Return: the value crypto_tfm_get_flags() reports for the generic transform
 *         embedded in @tfm
 */
static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/**
 * crypto_shash_set_flags() - pass flags to crypto_tfm_set_flags()
 * @tfm: cipher handle
 * @flags: flag bits handed on unchanged
 *
 * Thin wrapper: the flags are applied to the generic transform embedded in
 * @tfm.
 */
static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/**
 * crypto_shash_clear_flags() - pass flags to crypto_tfm_clear_flags()
 * @tfm: cipher handle
 * @flags: flag bits handed on unchanged
 *
 * Thin wrapper: the call is made on the generic transform embedded in @tfm.
 */
static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_shash_descsize() - obtain the operational state size
 * @tfm: cipher handle
 *
 * Reports how much memory the hash referenced by the cipher handle needs for
 * its operational state during operation. Callers use it to work out how
 * much memory to allocate for that state.
 *
 * The operational state is defined with struct shash_desc, and the size of
 * the complete data structure is to be calculated as
 * sizeof(struct shash_desc) + crypto_shash_descsize(alg)
 *
 * The value is the descsize member of @tfm.
 *
 * Return: size of the operational state
 */
static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
{
        const unsigned int size = tfm->descsize;

        return size;
}

/**
 * shash_desc_ctx() - obtain the context area of an operational state handle
 * @desc: operational state handle
 *
 * Return: the __ctx member of @desc as an untyped pointer
 */
static inline void *shash_desc_ctx(struct shash_desc *desc)
{
        void *ctx = desc->__ctx;

        return ctx;
}

/**
 * crypto_shash_setkey() - set key for message digest
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the keyed message digest cipher. The
 * cipher handle must point to a keyed message digest cipher in order for this
 * function to succeed.
 *
 * Context: Any context.
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen);

/**
 * crypto_shash_digest() - calculate message digest for buffer
 * @desc: see crypto_shash_final()
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This function is a "short-hand" for the function calls of crypto_shash_init,
 * crypto_shash_update and crypto_shash_final. The parameters have the same
 * meaning as discussed for those separate three functions.
 *
 * Context: Any context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out);

/**
 * crypto_shash_tfm_digest() - calculate message digest for buffer
 * @tfm: hash transformation object
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This is a simplified version of crypto_shash_digest() for users who don't
 * want to allocate their own hash descriptor (shash_desc).  Instead,
 * crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash)
 * directly, and it allocates a hash descriptor on the stack internally.
 * Note that this stack allocation may be fairly large.
 *
 * Context: Any context.
 * Return: 0 on success; < 0 if an error occurred.
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out);

/**
 * crypto_shash_export() - extract operational state for message digest
 * @desc: reference to the operational state handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * This function exports the hash state of the operational state handle into the
 * caller-allocated output buffer out which must have sufficient size (e.g. by
 * calling crypto_shash_descsize).
 *
 * Context: Any context.
 * Return: 0 if the export creation was successful; < 0 if an error occurred
 */
static inline int crypto_shash_export(struct shash_desc *desc, void *out)
{
        return crypto_shash_alg(desc->tfm)->export(desc, out);
}

/**
 * crypto_shash_import() - import operational state
 * @desc: reference to the operational state handle the state imported into
 * @in: buffer holding the state
 *
 * This function imports the hash state into the operational state handle from
 * the input buffer. That buffer should have been generated with the
 * crypto_ahash_export function.
 *
 * Context: Any context.
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
static inline int crypto_shash_import(struct shash_desc *desc, const void *in)
{
        struct crypto_shash *tfm = desc->tfm;

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return crypto_shash_alg(tfm)->import(desc, in);
}

/**
 * crypto_shash_init() - (re)initialize message digest
 * @desc: operational state handle that is already filled
 *
 * The call (re-)initializes the message digest referenced by the
 * operational state handle. Any potentially existing state created by
 * previous operations is discarded.
 *
 * Context: Any context.
 * Return: 0 if the message digest initialization was successful; < 0 if an
 *           error occurred
 */
static inline int crypto_shash_init(struct shash_desc *desc)
{
        struct crypto_shash *tfm = desc->tfm;

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return crypto_shash_alg(tfm)->init(desc);
}

/**
 * crypto_shash_update() - add data to message digest for processing
 * @desc: operational state handle that is already initialized
 * @data: input data to be added to the message digest
 * @len: length of the input data
 *
 * Updates the message digest state of the operational state handle.
 *
 * Context: Any context.
 * Return: 0 if the message digest update was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                        unsigned int len);

/**
 * crypto_shash_final() - calculate message digest
 * @desc: operational state handle that is already filled with data
 * @out: output buffer filled with the message digest
 *
 * Finalize the message digest operation and create the message digest
 * based on all data added to the cipher handle. The message digest is placed
 * into the output buffer. The caller must ensure that the output buffer is
 * large enough by using crypto_shash_digestsize.
 *
 * Context: Any context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_final(struct shash_desc *desc, u8 *out);

/**
 * crypto_shash_finup() - calculate message digest of buffer
 * @desc: see crypto_shash_final()
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This function is a "short-hand" for the function calls of
 * crypto_shash_update and crypto_shash_final. The parameters have the same
 * meaning as discussed for those separate functions.
 *
 * Context: Any context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out);

static inline void shash_desc_zero(struct shash_desc *desc)
{
        memzero_explicit(desc,
                         sizeof(*desc) + crypto_shash_descsize(desc->tfm));
}

#endif        /* _CRYPTO_HASH_H */
















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef _NET_ETHTOOL_NETLINK_H
#define _NET_ETHTOOL_NETLINK_H

#include <linux/ethtool_netlink.h>
#include <linux/netdevice.h>
#include <net/genetlink.h>
#include <net/sock.h>

struct ethnl_req_info;

int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
                               const struct nlattr *nest, struct net *net,
                               struct netlink_ext_ack *extack,
                               bool require_dev);
int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
                            u16 attrtype);
struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
                                 u16 hdr_attrtype, struct genl_info *info,
                                 void **ehdrp);
void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd);
void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd);
int ethnl_multicast(struct sk_buff *skb, struct net_device *dev);

/**
 * ethnl_strz_size() - calculate attribute length for fixed size string
 * @s: ETH_GSTRING_LEN sized string (may not be null terminated)
 *
 * The string length is capped at ETH_GSTRING_LEN and one byte is added for
 * the terminating null before the attribute size is computed.
 *
 * Return: total length of an attribute with null terminated string from @s
 */
static inline int ethnl_strz_size(const char *s)
{
        size_t len = strnlen(s, ETH_GSTRING_LEN);

        return nla_total_size(len + 1);
}

/**
 * ethnl_put_strz() - put string attribute with fixed size string
 * @skb:      skb with the message
 * @attrtype: attribute type
 * @s:        ETH_GSTRING_LEN sized string (may not be null terminated)
 *
 * Reserves an attribute in @skb, copies at most ETH_GSTRING_LEN characters
 * of @s into it and appends a terminating null, so the message always
 * carries a null terminated string.
 *
 * Return: 0 on success, -EMSGSIZE if the attribute could not be reserved
 */
static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype,
                                 const char *s)
{
        unsigned int len = strnlen(s, ETH_GSTRING_LEN);
        struct nlattr *attr = nla_reserve(skb, attrtype, len + 1);
        char *payload;

        if (!attr)
                return -EMSGSIZE;

        payload = nla_data(attr);
        memcpy(payload, s, len);
        payload[len] = '\0';

        return 0;
}

/**
 * ethnl_update_u32() - update u32 value from NLA_U32 attribute
 * @dst:  value to update
 * @attr: netlink attribute with new value or null
 * @mod:  pointer to bool for modification tracking
 *
 * If @attr is not null, store its u32 payload in the variable @dst points
 * to. The bool @mod points to is set to true only when the stored value
 * differs from the previous one; it is never cleared here.
 */
static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr,
                                    bool *mod)
{
        if (attr) {
                u32 val = nla_get_u32(attr);

                if (val != *dst) {
                        *dst = val;
                        *mod = true;
                }
        }
}

/**
 * ethnl_update_u8() - update u8 value from NLA_U8 attribute
 * @dst:  value to update
 * @attr: netlink attribute with new value or null
 * @mod:  pointer to bool for modification tracking
 *
 * If @attr is not null, store its u8 payload in the variable @dst points
 * to. The bool @mod points to is set to true only when the stored value
 * differs from the previous one; it is never cleared here.
 */
static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr,
                                   bool *mod)
{
        if (attr) {
                u8 val = nla_get_u8(attr);

                if (val != *dst) {
                        *dst = val;
                        *mod = true;
                }
        }
}

/**
 * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute
 * @dst:  value to update
 * @attr: netlink attribute with new value or null
 * @mod:  pointer to bool for modification tracking
 *
 * If @attr is not null, interpret its u8 payload as a truth value and set
 * the u32 variable @dst points to accordingly: 0 for a zero payload, 1
 * otherwise. Nothing is written when the logical value of *dst already
 * matches, so a nonzero *dst other than 1 is left untouched by a "true"
 * request. The bool @mod points to is set to true only when the logical
 * value changed; it is never cleared here.
 */
static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr,
                                       bool *mod)
{
        bool requested;
        bool current_val;

        if (!attr)
                return;

        requested = nla_get_u8(attr);
        current_val = *dst;
        if (requested == current_val)
                return;

        *dst = requested;
        *mod = true;
}

/**
 * ethnl_update_binary() - update binary data from NLA_BINARY attribute
 * @dst:  value to update
 * @len:  destination buffer length
 * @attr: netlink attribute with new value or null
 * @mod:  pointer to bool for modification tracking
 *
 * If @attr is not null, overwrite the data block at @dst with the attribute
 * payload. At most @len bytes are compared and copied; when the payload is
 * shorter than @len, only the payload length is used and the remainder of
 * the destination is left as is. The bool @mod points to is set to true only
 * when the compared bytes differed; it is never cleared here.
 */
static inline void ethnl_update_binary(void *dst, unsigned int len,
                                       const struct nlattr *attr, bool *mod)
{
        const void *src;

        if (!attr)
                return;

        /* Never read past the end of the attribute payload. */
        if (nla_len(attr) < len)
                len = nla_len(attr);

        src = nla_data(attr);
        if (memcmp(dst, src, len)) {
                memcpy(dst, src, len);
                *mod = true;
        }
}

/**
 * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute
 * @dst:  value to update
 * @attr: netlink attribute with new value or null
 * @mod:  pointer to bool for modification tracking
 *
 * For every bit set in the attribute's selector, take the bit from the
 * attribute's value; all other bits of *dst are kept. Do nothing if @attr is
 * null or the result equals the current value; otherwise store the result
 * and set the bool pointed to by @mod to true.
 */
static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr,
                                           bool *mod)
{
        struct nla_bitfield32 change;
        u32 kept, taken, merged;

        if (!attr)
                return;

        change = nla_get_bitfield32(attr);
        kept = *dst & ~change.selector;
        taken = change.value & change.selector;
        merged = kept | taken;

        if (merged != *dst) {
                *dst = merged;
                *mod = true;
        }
}

/**
 * ethnl_reply_header_size() - total size of reply header
 *
 * This is an upper estimate so that we do not need to hold RTNL lock longer
 * than necessary (to prevent rename between size estimate and composing the
 * message). It accounts for a nest holding one u32 attribute (device
 * ifindex) and one IFNAMSIZ sized attribute (device name), as those are the
 * only attributes ethnl_fill_reply_header() puts into the reply header.
 *
 * Return: estimated size of the reply header in bytes
 */
static inline unsigned int ethnl_reply_header_size(void)
{
        int ifindex_size = nla_total_size(sizeof(u32));
        int ifname_size = nla_total_size(IFNAMSIZ);

        return nla_total_size(ifindex_size + ifname_size);
}

/* GET request handling */

/* Unified processing of GET requests uses two data structures: request info
 * and reply data. Request info holds information parsed from client request
 * and it stays constant through all request processing. Reply data holds data
 * retrieved from ethtool_ops callbacks or other internal sources which is used
 * to compose the reply. When processing a dump request, request info is filled
 * only once (when the request message is parsed) but reply data is filled for
 * each reply message.
 *
 * Both structures consist of part common for all request types (struct
 * ethnl_req_info and struct ethnl_reply_data defined below) and optional
 * parts specific for each request type. Common part always starts at offset 0.
 */

/**
 * struct ethnl_req_info - base type of request information for GET requests
 * @dev:   network device the request is for (may be null)
 * @flags: request flags common for all request types
 *
 * This is a common base for request specific structures holding data from
 * parsed userspace request. These always embed struct ethnl_req_info at
 * zero offset, so a pointer to the request specific structure can also be
 * used as a pointer to this base.
 */
struct ethnl_req_info {
        struct net_device        *dev;
        u32                        flags;
};

/**
 * struct ethnl_reply_data - base type of reply data for GET requests
 * @dev:       device for current reply message; in single shot requests it is
 *             equal to &ethnl_req_info.dev; in dumps it's different for each
 *             reply message
 *
 * This is a common base for request specific structures holding data for
 * kernel reply message. These always embed struct ethnl_reply_data at zero
 * offset, so a pointer to the request specific structure can also be used as
 * a pointer to this base.
 */
struct ethnl_reply_data {
        struct net_device                *dev;
};

/*
 * Call the device's optional ethtool_ops->begin() hook.
 *
 * Returns 0 when @dev is NULL or the driver provides no ->begin() callback,
 * -ENODEV when the device is in the NETREG_UNREGISTERING state, and otherwise
 * whatever ->begin() returns.
 */
static inline int ethnl_ops_begin(struct net_device *dev)
{
        if (!dev)
                return 0;

        if (dev->reg_state == NETREG_UNREGISTERING)
                return -ENODEV;

        if (!dev->ethtool_ops->begin)
                return 0;

        return dev->ethtool_ops->begin(dev);
}

/*
 * Call the device's optional ethtool_ops->complete() hook; a NULL @dev or a
 * missing callback makes this a no-op.
 */
static inline void ethnl_ops_complete(struct net_device *dev)
{
        if (!dev || !dev->ethtool_ops->complete)
                return;

        dev->ethtool_ops->complete(dev);
}

/**
 * struct ethnl_request_ops - unified handling of GET requests
 * @request_cmd:      command id for request (GET)
 * @reply_cmd:        command id for reply (GET_REPLY)
 * @hdr_attr:         attribute type for request header
 * @req_info_size:    size of request info
 * @reply_data_size:  size of reply data
 * @allow_nodev_do:   allow non-dump request with no device identification
 * @parse_request:
 *        Parse request except common header (struct ethnl_req_info). Common
 *        header is already filled on entry, the type specific part following
 *        it is zero initialized. This callback should only modify type
 *        specific request info by parsed attributes from request message.
 * @prepare_data:
 *        Retrieve and prepare data needed to compose a reply message. Calls to
 *        ethtool_ops handlers are limited to this callback. Common reply data
 *        (struct ethnl_reply_data) is filled on entry, type specific part after
 *        it is zero initialized. This callback should only modify the type
 *        specific part of reply data. Device identification from struct
 *        ethnl_reply_data is to be used because, for dump requests, it
 *        iterates through network devices while the dev member of struct
 *        ethnl_req_info points to the device from the client request.
 * @reply_size:
 *        Estimate reply message size. Returned value must be sufficient for
 *        message payload without common reply header. The callback may return
 *        an estimate higher than the actual message size if exact calculation
 *        would not be worth the saved memory space.
 * @fill_reply:
 *        Fill reply message payload (except for common header) from reply data.
 *        The callback must not generate more payload than previously called
 *        ->reply_size() estimated.
 * @cleanup_data:
 *        Optional cleanup called when reply data is no longer needed. Can be
 *        used e.g. to free any additional data structures outside the main
 *        structure which were allocated by ->prepare_data(). When processing
 *        dump requests, ->cleanup_data() is called for each message.
 *
 * Description of variable parts of GET request handling when using the
 * unified infrastructure. When used, a pointer to an instance of this
 * structure is to be added to &ethnl_default_requests array and generic
 * handlers ethnl_default_doit(), ethnl_default_dumpit(),
 * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops;
 * ethnl_default_notify() can be used in @ethnl_notify_handlers to send
 * notifications of the corresponding type.
 */
struct ethnl_request_ops {
        u8                        request_cmd;
        u8                        reply_cmd;
        u16                        hdr_attr;
        unsigned int                req_info_size;
        unsigned int                reply_data_size;
        bool                        allow_nodev_do;

        int (*parse_request)(struct ethnl_req_info *req_info,
                             struct nlattr **tb,
                             struct netlink_ext_ack *extack);
        int (*prepare_data)(const struct ethnl_req_info *req_info,
                            struct ethnl_reply_data *reply_data,
                            struct genl_info *info);
        int (*reply_size)(const struct ethnl_req_info *req_info,
                          const struct ethnl_reply_data *reply_data);
        int (*fill_reply)(struct sk_buff *skb,
                          const struct ethnl_req_info *req_info,
                          const struct ethnl_reply_data *reply_data);
        void (*cleanup_data)(struct ethnl_reply_data *reply_data);
};

/* request handlers */

/* Per request type ethnl_request_ops instances, defined outside this header. */
extern const struct ethnl_request_ops ethnl_strset_request_ops;
extern const struct ethnl_request_ops ethnl_linkinfo_request_ops;
extern const struct ethnl_request_ops ethnl_linkmodes_request_ops;
extern const struct ethnl_request_ops ethnl_linkstate_request_ops;
extern const struct ethnl_request_ops ethnl_debug_request_ops;
extern const struct ethnl_request_ops ethnl_wol_request_ops;
extern const struct ethnl_request_ops ethnl_features_request_ops;
extern const struct ethnl_request_ops ethnl_privflags_request_ops;
extern const struct ethnl_request_ops ethnl_rings_request_ops;
extern const struct ethnl_request_ops ethnl_channels_request_ops;
extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
extern const struct ethnl_request_ops ethnl_pause_request_ops;
extern const struct ethnl_request_ops ethnl_eee_request_ops;
extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;

/*
 * Attribute policies. Each array is sized by the highest attribute id the
 * policy covers plus one.
 */
extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1];
extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1];
extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1];
extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1];
extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1];
extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1];
extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1];
extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1];
extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1];
extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1];
extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1];
extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1];
extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1];
extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1];
extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];

/* Handlers for SET and ACT requests, plus the tunnel info GET handlers. */
int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info);
int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info);
int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info);
int ethnl_tunnel_info_start(struct netlink_callback *cb);
int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);

#endif /* _NET_ETHTOOL_NETLINK_H */

















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H

#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
#include <asm/fixmap.h>
#include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>

#include <linux/smp.h>
#include <linux/percpu.h>

/*
 * Translate a struct user_desc request into a hardware segment descriptor.
 *
 * @desc: descriptor to fill in
 * @info: user supplied description of the segment
 *
 * The descriptor is always a code/data segment (s = 1) with DPL 3; the
 * long mode bit is forced to zero.
 */
static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
{
        /* Base address, scattered over the three base fields. */
        desc->base0                = (info->base_addr & 0x0000ffff);
        desc->base1                = (info->base_addr & 0x00ff0000) >> 16;
        desc->base2                = (info->base_addr & 0xff000000) >> 24;

        /* 20-bit limit, split into low 16 and high 4 bits. */
        desc->limit0                = info->limit & 0x0ffff;
        desc->limit1                = (info->limit & 0xf0000) >> 16;

        /*
         * Type: bit 1 is the inverse of read_exec_only, contents goes in
         * from bit 2 upwards. Bit 0 (the ACCESS bit) is always set so the
         * descriptor can be mapped RO.
         */
        desc->type                = 1 |
                                  ((info->read_exec_only ^ 1) << 1) |
                                  (info->contents << 2);

        desc->s                        = 1;
        desc->dpl                = 0x3;
        desc->p                        = info->seg_not_present ^ 1;

        desc->avl                = info->useable;
        desc->d                        = info->seg_32bit;
        desc->g                        = info->limit_in_pages;

        /*
         * Don't allow setting of the lm bit. It would confuse
         * user_64bit_mode and would get overridden by sysret anyway.
         */
        desc->l                        = 0;
}

/*
 * Container for one GDT: GDT_ENTRIES descriptors, with the whole structure
 * aligned to PAGE_SIZE.
 */
struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));

/* One page aligned gdt_page instance per CPU. */
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

/*
 * Provide the original GDT of @cpu, i.e. the descriptor array inside that
 * CPU's per-CPU gdt_page.
 */
static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
        return per_cpu(gdt_page, cpu).gdt;
}

/*
 * Provide the original GDT of the CPU this code is running on (the
 * descriptor array inside this CPU's gdt_page).
 */
static inline struct desc_struct *get_current_gdt_rw(void)
{
        return this_cpu_ptr(&gdt_page)->gdt;
}

/*
 * Provide the fixmap address of the remapped GDT of @cpu: the address of the
 * gdt member of that CPU's cpu_entry_area.
 */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
        return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}

/*
 * Provide the read-only GDT of the current CPU, as identified by
 * smp_processor_id().
 */
static inline struct desc_struct *get_current_gdt_ro(void)
{
        return get_cpu_gdt_ro(smp_processor_id());
}

/*
 * Provide the physical address of the GDT page of @cpu, obtained by
 * translating the address returned by get_cpu_gdt_rw().
 */
static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
{
        return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}

/*
 * Fill in a gate descriptor.
 *
 * @gate: descriptor to fill
 * @type: gate type
 * @func: handler address, split over the offset fields
 * @dpl:  descriptor privilege level
 * @ist:  IST index; only used with CONFIG_X86_64, otherwise stored as 0
 * @seg:  code segment; only used without CONFIG_X86_64, where the 64-bit
 *        build always uses __KERNEL_CS
 */
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
                             unsigned dpl, unsigned ist, unsigned seg)
{
        /* Handler address. */
        gate->offset_low        = (u16) func;
        gate->offset_middle        = (u16) (func >> 16);
#ifdef CONFIG_X86_64
        gate->offset_high        = (u32) (func >> 32);
        gate->reserved                = 0;
#endif

        /* Attribute bits common to both builds. */
        gate->bits.type                = type;
        gate->bits.zero                = 0;
        gate->bits.dpl                = dpl;
        gate->bits.p                = 1;

        /* Segment selector and IST index. */
#ifdef CONFIG_X86_64
        gate->segment                = __KERNEL_CS;
        gate->bits.ist                = ist;
#else
        gate->segment                = seg;
        gate->bits.ist                = 0;
#endif
}

/*
 * Return 1 when the two 32-bit words starting at @ptr are both zero,
 * 0 otherwise.
 */
static inline int desc_empty(const void *ptr)
{
        const u32 *words = ptr;

        return words[0] == 0 && words[1] == 0;
}

/*
 * With CONFIG_PARAVIRT_XXL the descriptor table operations come from
 * <asm/paravirt.h>. Without it, each operation maps directly to its
 * native_*() implementation or to the bare instruction.
 */
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
/* Loading descriptor table registers. */
#define load_TR_desc()                                native_load_tr_desc()
#define load_gdt(dtr)                                native_load_gdt(dtr)
#define load_idt(dtr)                                native_load_idt(dtr)
#define load_tr(tr)                                asm volatile("ltr %0"::"m" (tr))
#define load_ldt(ldt)                                asm volatile("lldt %0"::"m" (ldt))

/* Reading descriptor table registers back. */
#define store_gdt(dtr)                                native_store_gdt(dtr)
#define store_tr(tr)                                (tr = native_store_tr())

#define load_TLS(t, cpu)                        native_load_tls(t, cpu)
#define set_ldt                                        native_set_ldt

/* Writing individual table entries. */
#define write_ldt_entry(dt, entry, desc)        native_write_ldt_entry(dt, entry, desc)
#define write_gdt_entry(dt, entry, desc, type)        native_write_gdt_entry(dt, entry, desc, type)
#define write_idt_entry(dt, entry, g)                native_write_idt_entry(dt, entry, g)

/* No-op stubs for the LDT allocation hooks. */
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
}

static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
}
#endif        /* CONFIG_PARAVIRT_XXL */

/* Store the LDT register into the memory operand @ldt using SLDT. */
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))

/* Copy the gate descriptor *@gate into slot @entry of the table @idt. */
static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
        memcpy(&idt[entry], gate, sizeof(*gate));
}

/*
 * Copy 8 bytes from @desc into slot @entry of the table @ldt.
 * NOTE(review): the literal 8 is presumably sizeof(struct desc_struct);
 * confirm against the descriptor definition.
 */
static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
        memcpy(&ldt[entry], desc, 8);
}

/*
 * Copy a descriptor into slot @entry of @gdt.
 *
 * The number of bytes copied depends on @type: sizeof(tss_desc) for
 * DESC_TSS, sizeof(ldt_desc) for DESC_LDT, and the size of one GDT entry
 * for anything else.
 */
static inline void
native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
{
        unsigned int nbytes = sizeof(*gdt);

        if (type == DESC_TSS)
                nbytes = sizeof(tss_desc);
        else if (type == DESC_LDT)
                nbytes = sizeof(ldt_desc);

        memcpy(&gdt[entry], desc, nbytes);
}

/*
 * Build a TSS or LDT descriptor in the buffer @d.
 *
 * @d:    destination, treated as struct ldttss_desc
 * @addr: base address to encode
 * @type: descriptor type
 * @size: limit to encode (low 20 bits are used)
 *
 * The descriptor is cleared first, so every field not set here is zero.
 */
static inline void set_tssldt_descriptor(void *d, unsigned long addr,
                                         unsigned type, unsigned size)
{
        struct ldttss_desc *td = d;

        memset(td, 0, sizeof(*td));

        td->type                = type;
        td->p                        = 1;

        /* Limit: low 16 bits and high 4 bits. */
        td->limit0                = (u16) size;
        td->limit1                = (size >> 16) & 0xF;

        /* Base address, lowest part first. */
        td->base0                = (u16) addr;
        td->base1                = (addr >> 16) & 0xFF;
        td->base2                = (addr >> 24) & 0xFF;
#ifdef CONFIG_X86_64
        td->base3                = (u32) (addr >> 32);
#endif
}

/*
 * Build a DESC_TSS descriptor for the TSS at @addr, with limit
 * __KERNEL_TSS_LIMIT, and write it to slot @entry of the original GDT
 * of @cpu.
 */
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
        tss_desc desc;

        set_tssldt_descriptor(&desc, (unsigned long)addr, DESC_TSS,
                              __KERNEL_TSS_LIMIT);
        write_gdt_entry(get_cpu_gdt_rw(cpu), entry, &desc, DESC_TSS);
}

/* Install the TSS descriptor for @addr in the GDT_ENTRY_TSS slot of @cpu. */
#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)

/*
 * Point the LDT register at a table of @entries descriptors at @addr.
 *
 * With @entries == 0 a null selector is loaded. Otherwise a DESC_LDT
 * descriptor covering entries * LDT_ENTRY_SIZE bytes is written to the
 * GDT_ENTRY_LDT slot of the current CPU's original GDT and its selector
 * is loaded.
 */
static inline void native_set_ldt(const void *addr, unsigned int entries)
{
        unsigned cpu;
        ldt_desc desc;

        if (likely(entries == 0)) {
                asm volatile("lldt %w0"::"q" (0));
                return;
        }

        cpu = smp_processor_id();

        set_tssldt_descriptor(&desc, (unsigned long)addr, DESC_LDT,
                              entries * LDT_ENTRY_SIZE - 1);
        write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
                        &desc, DESC_LDT);
        asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}

/* Execute LGDT with *@dtr as its memory operand. */
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
        asm volatile("lgdt %0"::"m" (*dtr));
}

/* Execute LIDT with *@dtr as its memory operand. */
static __always_inline void native_load_idt(const struct desc_ptr *dtr)
{
        asm volatile("lidt %0"::"m" (*dtr));
}

/* Execute SGDT, storing the current GDT register contents into *@dtr. */
static inline void native_store_gdt(struct desc_ptr *dtr)
{
        asm volatile("sgdt %0":"=m" (*dtr));
}

/* Execute SIDT, storing the current IDT register contents into *@dtr. */
static inline void store_idt(struct desc_ptr *dtr)
{
        asm volatile("sidt %0":"=m" (*dtr));
}

/*
 * LTR marks the TSS entry in the GDT as busy, i.e. it writes to the GDT.
 * On 64-bit the GDT may be the read-only remapping, which would make that
 * write fault, so the original writeable GDT is loaded around the
 * instruction whenever the read-only one is currently in use.
 */
#ifdef CONFIG_X86_64
static inline void native_load_tr_desc(void)
{
        int cpu = raw_smp_processor_id();
        struct desc_ptr cur;
        bool on_fixmap;

        native_store_gdt(&cur);

        /* Is the currently loaded GDT the read-only fixmap alias? */
        on_fixmap = cur.address == (unsigned long)get_cpu_gdt_ro(cpu);

        if (on_fixmap)
                load_direct_gdt(cpu);

        asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));

        /* Go back to the read-only alias if that is what was loaded. */
        if (on_fixmap)
                load_fixmap_gdt(cpu);
}
#else
static inline void native_load_tr_desc(void)
{
        asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
#endif

/* Execute STR and return the value it stores (the task register selector). */
static inline unsigned long native_store_tr(void)
{
        unsigned long tr;

        asm volatile("str %0":"=r" (tr));

        return tr;
}

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
        struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
        unsigned int i;

        for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
                gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}

DECLARE_PER_CPU(bool, __tss_limit_invalid);

/*
 * Reload the task register from the current CPU's GDT and clear the
 * per-CPU __tss_limit_invalid flag.
 */
static inline void force_reload_TR(void)
{
        struct desc_struct *gdt = get_current_gdt_rw();
        tss_desc desc;

        /* Work on a private copy of the TSS descriptor. */
        memcpy(&desc, gdt + GDT_ENTRY_TSS, sizeof(desc));

        /*
         * The TSS is currently marked busy, but LTR only accepts an
         * available TSS. Rewrite the type so that LTR will work.
         */
        desc.type = DESC_TSS;
        write_gdt_entry(gdt, GDT_ENTRY_TSS, &desc, DESC_TSS);

        load_TR_desc();
        this_cpu_write(__tss_limit_invalid, false);
}

/*
 * Make sure the TSS limit is correct. This is needed if and only if
 * TIF_IO_BITMAP is set for the current task or for the task being switched
 * to. Warns (via DEBUG_LOCKS_WARN_ON) when called from preemptible context.
 */
static inline void refresh_tss_limit(void)
{
        DEBUG_LOCKS_WARN_ON(preemptible());

        /* Nothing to do in the common case of a still valid limit. */
        if (likely(!this_cpu_read(__tss_limit_invalid)))
                return;

        force_reload_TR();
}

/*
 * To be called by anything that corrupts the cached TSS limit (VMX exits
 * being the prime example).
 *
 * The TSS limit only matters for Linux while the IO bitmap is in use. With
 * the limit forced to its minimum value everything still works, except that
 * the IO bitmap is ignored and all CPL 3 IO instructions #GP, which is
 * exactly what normal tasks should see. So the reload is only done right
 * away when the current task has TIF_IO_BITMAP set; otherwise the limit is
 * merely flagged as invalid.
 */
static inline void invalidate_tss_limit(void)
{
        DEBUG_LOCKS_WARN_ON(preemptible());

        if (likely(!test_thread_flag(TIF_IO_BITMAP))) {
                this_cpu_write(__tss_limit_invalid, true);
                return;
        }

        force_reload_TR();
}

/*
 * True when @info describes an "empty" entry: read_exec_only and
 * seg_not_present are 1 and every other checked field is 0.
 * The lm field is deliberately not looked at, since 32-bit apps don't
 * have that field.
 */
#define LDT_empty(info)                                        \
        (!(info)->base_addr                        &&        \
         !(info)->limit                                &&        \
         !(info)->contents                        &&        \
         (info)->read_exec_only == 1                &&        \
         !(info)->seg_32bit                        &&        \
         !(info)->limit_in_pages                &&        \
         (info)->seg_not_present == 1                &&        \
         !(info)->useable)

/*
 * Lots of programs expect an all-zero user_desc to mean "no segment at all".
 * Returns true when every checked field of @info is zero.
 */
static inline bool LDT_zero(const struct user_desc *info)
{
        /* OR all fields together: the result is zero only if each one is. */
        return !(info->base_addr |
                 info->limit |
                 info->contents |
                 info->read_exec_only |
                 info->seg_32bit |
                 info->limit_in_pages |
                 info->seg_not_present |
                 info->useable);
}

/* Drop the current LDT by calling set_ldt() with no table and zero entries. */
static inline void clear_LDT(void)
{
        set_ldt(NULL, 0);
}

/*
 * Reassemble the 32-bit base address stored in @desc from its base0
 * (bits 0-15), base1 (bits 16-23) and base2 (bits 24-31) fields.
 *
 * Each field is converted to unsigned int before shifting. Without that the
 * fields are promoted to signed int, and shifting base2 left by 24 moves a
 * set top bit into the sign bit, which ISO C leaves undefined.
 */
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
        return (unsigned int)desc->base0 |
               ((unsigned int)desc->base1 << 16) |
               ((unsigned int)desc->base2 << 24);
}

/*
 * Store the low 32 bits of @base into the base2 (bits 24-31), base1
 * (bits 16-23) and base0 (bits 0-15) fields of @desc.
 */
static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
{
        desc->base2 = (base >> 24) & 0xff;
        desc->base1 = (base >> 16) & 0xff;
        desc->base0 = base & 0xffff;
}

/*
 * Reassemble the segment limit of @desc: limit0 supplies bits 0-15 and
 * limit1 the bits from 16 upwards.
 */
static inline unsigned long get_desc_limit(const struct desc_struct *desc)
{
        unsigned long low = desc->limit0;
        unsigned long high = desc->limit1;

        return low | (high << 16);
}

/*
 * Store the low 20 bits of @limit into @desc: bits 16-19 go to limit1 and
 * bits 0-15 to limit0.
 */
static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
{
        desc->limit1 = (limit >> 16) & 0xf;
        desc->limit0 = limit & 0xffff;
}

void alloc_intr_gate(unsigned int n, const void *addr);

/*
 * Initialize @data as a present interrupt gate for vector @n with handler
 * @addr in the __KERNEL_CS segment. All other fields are zeroed.
 * BUGs if @n is larger than 0xFF.
 */
static inline void init_idt_data(struct idt_data *data, unsigned int n,
                                 const void *addr)
{
        BUG_ON(n > 0xFF);

        /* Start from a clean slate, then fill in what is not zero. */
        memset(data, 0, sizeof(*data));

        data->bits.p        = 1;
        data->bits.type        = GATE_INTERRUPT;
        data->segment        = __KERNEL_CS;
        data->addr        = addr;
        data->vector        = n;
}

/*
 * Fill the gate descriptor @gate from the idt_data entry @d: the handler
 * address is split over the offset fields, segment and attribute bits are
 * copied over.
 */
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
        unsigned long handler = (unsigned long) d->addr;

        gate->segment                = (u16) d->segment;
        gate->bits                = d->bits;

        gate->offset_low        = (u16) handler;
        gate->offset_middle        = (u16) (handler >> 16);
#ifdef CONFIG_X86_64
        gate->offset_high        = (u32) (handler >> 32);
        gate->reserved                = 0;
#endif
}

extern unsigned long system_vectors[];

extern void load_current_idt(void);
extern void idt_setup_early_handler(void);
extern void idt_setup_early_traps(void);
extern void idt_setup_traps(void);
extern void idt_setup_apic_and_irq_gates(void);
extern bool idt_is_f00f_address(unsigned long address);

/*
 * These two setup steps are only implemented with CONFIG_X86_64; other
 * builds get empty inline stubs so callers need no #ifdef.
 */
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
extern void idt_setup_ist_traps(void);
#else
static inline void idt_setup_early_pf(void) { }
static inline void idt_setup_ist_traps(void) { }
#endif

extern void idt_invalidate(void *addr);

#endif /* _ASM_X86_DESC_H */

























































    4 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMIOTRACE_H
#define _LINUX_MMIOTRACE_H

#include <linux/types.h>
#include <linux/list.h>

struct kmmio_probe;
struct pt_regs;

/*
 * Callback types stored in struct kmmio_probe. The pre handler receives the
 * probe, the register state and an address; the post handler receives the
 * probe, a condition value and the register state.
 */
typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
                                struct pt_regs *, unsigned long addr);
typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
                                unsigned long condition, struct pt_regs *);

/* Description of one kmmio probe, passed to register_kmmio_probe(). */
struct kmmio_probe {
        /* kmmio internal list: */
        struct list_head        list;
        /* start location of the probe point: */
        unsigned long                addr;
        /* length of the probe region: */
        unsigned long                len;
        /* Called before addr is executed: */
        kmmio_pre_handler_t        pre_handler;
        /* Called after addr is executed: */
        kmmio_post_handler_t        post_handler;
        /* Opaque pointer; presumably for the probe owner's use - TODO confirm */
        void                        *private;
};

extern unsigned int kmmio_count;

extern int register_kmmio_probe(struct kmmio_probe *p);
extern void unregister_kmmio_probe(struct kmmio_probe *p);
extern int kmmio_init(void);
extern void kmmio_cleanup(void);

/*
 * With CONFIG_MMIOTRACE the hooks below are real functions; without it they
 * are empty inline stubs returning 0 (or nothing), so callers need no #ifdef.
 */
#ifdef CONFIG_MMIOTRACE
/* Is kmmio active? Returns kmmio_count, i.e. non-zero when it is. */
static inline int is_kmmio_active(void)
{
        return kmmio_count;
}

/* Called from page fault handler. */
extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);

/* Called from ioremap.c */
extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
                                                        void __iomem *addr);
extern void mmiotrace_iounmap(volatile void __iomem *addr);

/* For anyone to insert markers. Remember trailing newline. */
extern __printf(1, 2) int mmiotrace_printk(const char *fmt, ...);
#else /* !CONFIG_MMIOTRACE: */
/* kmmio is never active when mmiotrace is not built in. */
static inline int is_kmmio_active(void)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
        return 0;
}

/* Stub: does nothing. */
static inline void mmiotrace_ioremap(resource_size_t offset,
                                        unsigned long size, void __iomem *addr)
{
}

/* Stub: does nothing. */
static inline void mmiotrace_iounmap(volatile void __iomem *addr)
{
}

/* Stub: prints nothing and returns 0. */
static inline __printf(1, 2) int mmiotrace_printk(const char *fmt, ...)
{
        return 0;
}
#endif /* CONFIG_MMIOTRACE */

/*
 * Opcodes identifying an mmiotrace record. The trailing comment on each
 * value names the record structure that goes with it.
 */
enum mm_io_opcode {
        MMIO_READ        = 1,        /* struct mmiotrace_rw */
        MMIO_WRITE        = 2,        /* struct mmiotrace_rw */
        MMIO_PROBE        = 3,        /* struct mmiotrace_map */
        MMIO_UNPROBE        = 4,        /* struct mmiotrace_map */
        MMIO_UNKNOWN_OP        = 5,        /* struct mmiotrace_rw */
};

/* Record describing a single traced register access. */
struct mmiotrace_rw {
        resource_size_t        phys;        /* PCI address of register */
        unsigned long        value;        /* NOTE(review): presumably the value read or written - confirm */
        unsigned long        pc;        /* optional program counter */
        int                map_id;        /* NOTE(review): presumably ties the access to a mmiotrace_map - confirm */
        unsigned char        opcode;        /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
        unsigned char        width;        /* size of register access in bytes */
};

/* Record describing a mapping being probed or unprobed. */
struct mmiotrace_map {
        resource_size_t        phys;        /* base address in PCI space */
        unsigned long        virt;        /* base virtual address */
        unsigned long        len;        /* mapping size */
        int                map_id;        /* NOTE(review): presumably the id referenced by mmiotrace_rw.map_id - confirm */
        unsigned char        opcode;        /* MMIO_PROBE or MMIO_UNPROBE */
};

/* in kernel/trace/trace_mmiotrace.c */
extern void enable_mmiotrace(void);
extern void disable_mmiotrace(void);
extern void mmio_trace_rw(struct mmiotrace_rw *rw);
extern void mmio_trace_mapping(struct mmiotrace_map *map);
extern __printf(1, 0) int mmio_trace_printk(const char *fmt, va_list args);

#endif /* _LINUX_MMIOTRACE_H */























































    1 














    1 

    1 
    1 
    1 
    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        fs/libfs.c
 *        Library for filesystems writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * Generic ->getattr: fill @stat from the inode behind @path and report
 * the block count as the number of page cache pages expressed in
 * 512-byte units.  @request_mask and @query_flags are not consulted.
 * Always returns 0.
 */
int simple_getattr(const struct path *path, struct kstat *stat,
                   u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct address_space *mapping;

        generic_fillattr(inode, stat);

        mapping = inode->i_mapping;
        stat->blocks = mapping->nrpages << (PAGE_SHIFT - 9);
        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * ->d_delete helper that unconditionally answers "yes, delete".
 *
 * Keeping negative dentries around for an in-memory filesystem only
 * costs memory and lookup time, so ask for them to be dropped right
 * away.  @dentry is not examined.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

/* Dentry operations whose ->d_delete is always_delete_dentry() */
const struct dentry_operations simple_dentry_operations = {
        .d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative.  Set d_op to delete negative dentries.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        if (!dentry->d_sb->s_d_op)
                d_set_d_op(dentry, &simple_dentry_operations);
        d_add(dentry, NULL);
        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

/*
 * ->open for dcache-backed directories: allocate a cursor dentry for the
 * directory being opened and remember it in file->private_data.
 *
 * Returns 0 on success, -ENOMEM if the cursor could not be allocated.
 */
int dcache_dir_open(struct inode *inode, struct file *file)
{
        struct dentry *cursor = d_alloc_cursor(file->f_path.dentry);

        file->private_data = cursor;
        if (!cursor)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(dcache_dir_open);

/*
 * ->release counterpart of dcache_dir_open(): drop the reference on the
 * cursor dentry kept in file->private_data.  Always returns 0.
 */
int dcache_dir_close(struct inode *inode, struct file *file)
{
        struct dentry *cursor = file->private_data;

        dput(cursor);
        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/* parent is locked at least shared */
/*
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
 *
 * <cursor> is the per-open cursor dentry; the directory scanned is its
 * d_parent.  The reference on <last> is dropped before returning,
 * whatever the outcome.
 */
static struct dentry *scan_positives(struct dentry *cursor,
                                        struct list_head *p,
                                        loff_t count,
                                        struct dentry *last)
{
        struct dentry *dentry = cursor->d_parent, *found = NULL;

        spin_lock(&dentry->d_lock);
        while ((p = p->next) != &dentry->d_subdirs) {
                struct dentry *d = list_entry(p, struct dentry, d_child);
                // we must at least skip cursors, to avoid livelocks
                if (d->d_flags & DCACHE_DENTRY_CURSOR)
                        continue;
                if (simple_positive(d) && !--count) {
                        // recheck under the child's own lock before taking a reference
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                found = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(found))
                                break;
                        // the candidate is no longer positive: settle for the next positive one
                        count = 1;
                }
                if (need_resched()) {
                        // park the cursor right after the current entry and continue
                        // from the cursor once the parent's lock has been retaken
                        list_move(&cursor->d_child, p);
                        p = &cursor->d_child;
                        spin_unlock(&dentry->d_lock);
                        cond_resched();
                        spin_lock(&dentry->d_lock);
                }
        }
        spin_unlock(&dentry->d_lock);
        dput(last);
        return found;
}

/*
 * ->llseek for dcache-backed directories.
 *
 * Only whence 0 (absolute offset) and whence 1 (offset relative to
 * file->f_pos) are accepted and the resulting offset must not be
 * negative; everything else yields -EINVAL.
 *
 * When the position really changes, the cursor in file->private_data is
 * repositioned under the shared inode lock.  Offsets above 2 place it
 * right after the (offset - 2)th positive child; for smaller offsets, or
 * when there are not that many positive children, the cursor is taken off
 * the list.
 *
 * Returns the new offset on success.
 */
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
        struct dentry *dentry = file->f_path.dentry;
        switch (whence) {
                case 1:
                        /* relative seek: turn it into an absolute offset */
                        offset += file->f_pos;
                        fallthrough;
                case 0:
                        if (offset >= 0)
                                break;
                        /* negative result: reject it like an unknown whence */
                        fallthrough;
                default:
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
                struct dentry *cursor = file->private_data;
                struct dentry *to = NULL;

                inode_lock_shared(dentry->d_inode);

                if (offset > 2)
                        to = scan_positives(cursor, &dentry->d_subdirs,
                                            offset - 2, NULL);
                spin_lock(&dentry->d_lock);
                if (to)
                        list_move(&cursor->d_child, &to->d_child);
                else
                        list_del_init(&cursor->d_child);
                spin_unlock(&dentry->d_lock);
                /* drop the reference scan_positives() took on the target */
                dput(to);

                file->f_pos = offset;

                inode_unlock_shared(dentry->d_inode);
        }
        return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);

/*
 * Relationship between i_mode and the DT_xxx types: the directory entry
 * type is the 4-bit field that starts at bit 12 of i_mode.
 */
static inline unsigned char dt_type(struct inode *inode)
{
        return (inode->i_mode >> 12) & 0xf;
}

/*
 * Directory is locked and all positive dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 *
 * Emits "." and ".." via dir_emit_dots(), then one entry per positive
 * child, starting at the head of the list when ctx->pos is 2 and at the
 * file's cursor otherwise.  Always returns 0.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct dentry *cursor = file->private_data;
        struct list_head *anchor = &dentry->d_subdirs;
        struct dentry *next = NULL;
        struct list_head *p;

        if (!dir_emit_dots(file, ctx))
                return 0;

        if (ctx->pos == 2)
                p = anchor;
        else if (!list_empty(&cursor->d_child))
                p = &cursor->d_child;
        else
                /* cursor is off the list: nothing more to report */
                return 0;

        /*
         * Each call hands the previously returned dentry back as "last",
         * so scan_positives() drops its reference for us.
         */
        while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
                if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
                              d_inode(next)->i_ino, dt_type(d_inode(next))))
                        break;
                ctx->pos++;
                p = &next->d_child;
        }
        spin_lock(&dentry->d_lock);
        if (next)
                /* entry was not emitted: put the cursor in front of it */
                list_move_tail(&cursor->d_child, &next->d_child);
        else
                list_del_init(&cursor->d_child);
        spin_unlock(&dentry->d_lock);
        dput(next);

        return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/*
 * ->read for directories: reading a directory as if it were a regular
 * file is refused.  None of the arguments are used; the result is always
 * -EISDIR.
 */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
        return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/*
 * File operations for directories kept purely in the dcache: open and
 * release manage the per-open cursor dentry, llseek and iterate_shared
 * walk the children with it, and read fails with -EISDIR.
 */
const struct file_operations simple_dir_operations = {
        .open                = dcache_dir_open,
        .release        = dcache_dir_close,
        .llseek                = dcache_dir_lseek,
        .read                = generic_read_dir,
        .iterate_shared        = dcache_readdir,
        .fsync                = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/* Inode operations for a directory that provides nothing but ->lookup */
const struct inode_operations simple_dir_inode_operations = {
        .lookup                = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/*
 * Return the first positive child of @parent that follows @prev in the
 * list of children, or the first positive child at all when @prev is
 * NULL.  The child is returned with a reference held; NULL means there is
 * no such child.  The reference on @prev is dropped before returning.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
        struct dentry *child = NULL;
        struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;

        spin_lock(&parent->d_lock);
        while ((p = p->next) != &parent->d_subdirs) {
                struct dentry *d = container_of(p, struct dentry, d_child);
                if (simple_positive(d)) {
                        /* recheck under the child's own lock before grabbing it */
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                child = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(child))
                                break;
                }
        }
        spin_unlock(&parent->d_lock);
        dput(prev);
        return child;
}

/*
 * Remove @dentry together with everything underneath it.
 *
 * The tree is walked iteratively: descend into the first positive child
 * while there is one; when a node has no positive child left it is killed
 * and the walk moves back up to its parent.  For every killed dentry that
 * is still positive, @callback (if not NULL) is called after the fsnotify
 * event has been sent and before the dentry is unpinned.  The walk ends
 * once @dentry itself has been handled.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
        /* extra reference, dropped by the dput(dentry) right before returning */
        struct dentry *this = dget(dentry);
        while (true) {
                struct dentry *victim = NULL, *child;
                struct inode *inode = this->d_inode;

                inode_lock_nested(inode, I_MUTEX_CHILD);
                if (d_is_dir(this))
                        inode->i_flags |= S_DEAD;
                while ((child = find_next_child(this, victim)) == NULL) {
                        // kill and ascend
                        // update metadata while it's still locked
                        inode->i_ctime = current_time(inode);
                        clear_nlink(inode);
                        inode_unlock(inode);
                        victim = this;
                        this = this->d_parent;
                        inode = this->d_inode;
                        inode_lock_nested(inode, I_MUTEX_CHILD);
                        if (simple_positive(victim)) {
                                d_invalidate(victim);        // avoid lost mounts
                                if (d_is_dir(victim))
                                        fsnotify_rmdir(inode, victim);
                                else
                                        fsnotify_unlink(inode, victim);
                                if (callback)
                                        callback(victim);
                                dput(victim);                // unpin it
                        }
                        if (victim == dentry) {
                                // the starting point is gone: fix up its parent and stop
                                inode->i_ctime = inode->i_mtime =
                                        current_time(inode);
                                if (d_is_dir(dentry))
                                        drop_nlink(inode);
                                inode_unlock(inode);
                                dput(dentry);
                                return;
                        }
                }
                // a positive child exists: descend into it
                inode_unlock(inode);
                this = child;
        }
}
EXPORT_SYMBOL(simple_recursive_removal);

/* Default super operations: only ->statfs is provided */
static const struct super_operations simple_super_operations = {
        .statfs                = simple_statfs,
};

static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = fc->fs_private;
        struct inode *root;

        s->s_maxbytes = MAX_LFS_FILESIZE;
        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = ctx->magic;
        s->s_op = ctx->ops ?: &simple_super_operations;
        s->s_xattr = ctx->xattr;
        s->s_time_gran = 1;
        root = new_inode(s);
        if (!root)
                return -ENOMEM;

        /*
         * since this is the first inode, make it number 1. New inodes created
         * after this must take care not to collide with it (by passing
         * max_reserved of 1 to iunique).
         */
        root->i_ino = 1;
        root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
        s->s_root = d_make_root(root);
        if (!s->s_root)
                return -ENOMEM;
        s->s_d_op = ctx->dops;
        return 0;
}

/*
 * ->get_tree for pseudo filesystems: hand pseudo_fs_fill_super() to
 * get_tree_nodev() and pass its result through.
 */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/* ->free for pseudo filesystems: release the context kept in fc->fs_private */
static void pseudo_fs_free(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = fc->fs_private;

        kfree(ctx);
}

/* fs_context operations installed on a context by init_pseudo() */
static const struct fs_context_operations pseudo_fs_context_ops = {
        .free                = pseudo_fs_free,
        .get_tree        = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
                                        unsigned long magic)
{
        struct pseudo_fs_context *ctx;

        ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
        if (likely(ctx)) {
                ctx->magic = magic;
                fc->fs_private = ctx;
                fc->ops = &pseudo_fs_context_ops;
                fc->sb_flags |= SB_NOUSER;
                fc->global = true;
        }
        return ctx;
}
EXPORT_SYMBOL(init_pseudo);

int simple_open(struct inode *inode, struct file *file)
{
        if (inode->i_private)
                file->private_data = inode->i_private;
        return 0;
}
EXPORT_SYMBOL(simple_open);

/*
 * ->link for in-memory filesystems: make @dentry in @dir another name for
 * the inode behind @old_dentry.  Timestamps of the directory and the
 * inode are refreshed, the link count goes up, and both the inode and
 * @dentry gain a reference before the dentry is instantiated.
 * Always returns 0.
 */
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);

        dir->i_mtime = current_time(target);
        dir->i_ctime = dir->i_mtime;
        target->i_ctime = dir->i_mtime;

        inc_nlink(target);
        ihold(target);
        dget(dentry);
        d_instantiate(dentry, target);

        return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * Tell whether @dentry has no positive children.
 *
 * Walks the list of children under the parent's d_lock, testing each
 * child under its own d_lock.  Returns 1 when no child is positive and 0
 * as soon as a positive one is found.
 */
int simple_empty(struct dentry *dentry)
{
        struct dentry *child;
        int empty = 1;

        spin_lock(&dentry->d_lock);
        list_for_each_entry(child, &dentry->d_subdirs, d_child) {
                int positive;

                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                positive = simple_positive(child);
                spin_unlock(&child->d_lock);

                if (positive) {
                        empty = 0;
                        break;
                }
        }
        spin_unlock(&dentry->d_lock);

        return empty;
}
EXPORT_SYMBOL(simple_empty);

/*
 * ->unlink for in-memory filesystems: refresh the timestamps of @dir and
 * of the inode, drop one link from the inode and release a reference on
 * @dentry.  Always returns 0.
 */
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);

        dir->i_mtime = current_time(victim);
        dir->i_ctime = dir->i_mtime;
        victim->i_ctime = dir->i_mtime;

        drop_nlink(victim);
        dput(dentry);

        return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * ->rmdir for in-memory filesystems.
 *
 * Fails with -ENOTEMPTY while @dentry still has positive children.
 * Otherwise the directory inode loses one link here and another one in
 * simple_unlink(), and the parent @dir loses one link as well.
 * Returns 0 on success.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode;

        if (simple_empty(dentry) == 0)
                return -ENOTEMPTY;

        inode = d_inode(dentry);
        drop_nlink(inode);
        simple_unlink(dir, dentry);
        drop_nlink(dir);

        return 0;
}
EXPORT_SYMBOL(simple_rmdir);

/*
 * ->rename for in-memory filesystems.
 *
 * Any flag other than RENAME_NOREPLACE is rejected with -EINVAL, and a
 * target that still has positive children with -ENOTEMPTY.  An existing
 * target is unlinked first.  When a directory is being moved the link
 * counts of the directories involved are adjusted.  Finally the
 * timestamps of both parent directories and of the moved inode are set to
 * the current time.  Returns 0 on success.
 */
int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
                  struct inode *new_dir, struct dentry *new_dentry,
                  unsigned int flags)
{
        struct inode *inode = d_inode(old_dentry);
        int moving_dir = d_is_dir(old_dentry);
        int replacing;

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        replacing = d_really_is_positive(new_dentry);
        if (replacing)
                simple_unlink(new_dir, new_dentry);

        if (moving_dir) {
                if (replacing) {
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                } else {
                        drop_nlink(old_dir);
                        inc_nlink(new_dir);
                }
        }

        inode->i_ctime = current_time(old_dir);
        new_dir->i_mtime = inode->i_ctime;
        new_dir->i_ctime = inode->i_ctime;
        old_dir->i_mtime = inode->i_ctime;
        old_dir->i_ctime = inode->i_ctime;

        return 0;
}
EXPORT_SYMBOL(simple_rename);

/**
 * simple_setattr - setattr for simple filesystem
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems.  Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        int error;

        error = setattr_prepare(dentry, iattr);
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(simple_setattr);

/*
 * ->readpage helper: there is nothing to read from, so the page is zero
 * filled, marked up to date and unlocked.  @file is not used.
 * Always returns 0.
 */
int simple_readpage(struct file *file, struct page *page)
{
        clear_highpage(page);
        flush_dcache_page(page);
        SetPageUptodate(page);
        unlock_page(page);
        return 0;
}
EXPORT_SYMBOL(simple_readpage);

/*
 * ->write_begin helper: grab the page cache page covering @pos and hand
 * it back through @pagep.
 *
 * If the page is not up to date and the write does not span a whole page,
 * the parts of the page outside the range about to be written are zeroed.
 * @file and @fsdata are not used.
 *
 * Returns 0 on success, -ENOMEM if no page could be obtained.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        pgoff_t index = pos >> PAGE_SHIFT;
        struct page *page;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (page == NULL)
                return -ENOMEM;

        *pagep = page;

        if (len != PAGE_SIZE && !PageUptodate(page)) {
                unsigned start = pos & (PAGE_SIZE - 1);
                unsigned end = start + len;

                /* zero [0, start) and [end, PAGE_SIZE) */
                zero_user_segments(page, 0, start, end, PAGE_SIZE);
        }

        return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: see .write_end of address_space_operations (not used here)
 * @mapping: see .write_end of address_space_operations (not used here)
 * @pos: file position at which the write started
 * @len: number of bytes the caller set out to write
 * @copied: number of bytes that were actually copied into @page
 * @page: the page that was written; unlocked and released here
 * @fsdata: see .write_end of address_space_operations (not used here)
 *
 * simple_write_end does the minimum needed for updating a page after writing is
 * done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Returns @copied.
 *
 * Use *ONLY* with simple_readpage()
 */
int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *host = page->mapping->host;
        loff_t end = pos + copied;

        if (!PageUptodate(page)) {
                /* a short copy leaves stale bytes behind it: clear them */
                if (copied < len) {
                        unsigned stale = (pos & (PAGE_SIZE - 1)) + copied;

                        zero_user(page, stale, len - copied);
                }
                SetPageUptodate(page);
        }

        /*
         * i_size cannot change under us because we hold the i_mutex,
         * so there is no need for i_size_read() here.
         */
        if (end > host->i_size)
                i_size_write(host, end);

        set_page_dirty(page);
        unlock_page(page);
        put_page(page);

        return copied;
}
EXPORT_SYMBOL(simple_write_end);

/*
 * Populate superblock @s with a root directory (inode number 1) and one
 * regular file for each named entry of @files.
 *
 * @files is indexed by inode number: the entry at position i becomes
 * inode i.  Entries whose name is NULL are skipped, and an entry whose
 * name is the empty string ends the array.
 *
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 *
 * Returns 0 on success, -ENOMEM if an inode or dentry cannot be allocated.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
                      const struct tree_descr *files)
{
        struct inode *inode;
        struct dentry *root;
        struct dentry *dentry;
        int i;

        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = magic;
        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;

        inode = new_inode(s);
        if (!inode)
                return -ENOMEM;
        /*
         * because the root inode is 1, the files array must not contain an
         * entry at index 1
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        set_nlink(inode, 2);
        root = d_make_root(inode);
        if (!root)
                return -ENOMEM;
        for (i = 0; !files->name || files->name[0]; i++, files++) {
                /* unnamed slot: only consumes an inode number */
                if (!files->name)
                        continue;

                /*
                 * warn if it tries to conflict with the root inode; the
                 * message is kept in a single literal so that the words
                 * "array" and "with" do not run together
                 */
                if (unlikely(i == 1))
                        printk(KERN_WARNING
                               "%s: %s passed in a files array with an index of 1!\n",
                               __func__, s->s_type->name);

                dentry = d_alloc_name(root, files->name);
                if (!dentry)
                        goto out;
                inode = new_inode(s);
                if (!inode) {
                        dput(dentry);
                        goto out;
                }
                inode->i_mode = S_IFREG | files->mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
                inode->i_fop = files->ops;
                inode->i_ino = i;
                d_add(dentry, inode);
        }
        s->s_root = root;
        return 0;
out:
        /* tear down whatever part of the tree was already built */
        d_genocide(root);
        shrink_dcache_parent(root);
        dput(root);
        return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);

/* Held by simple_pin_fs() and simple_release_fs() around updates of the mount pointer and pin count */
static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * Pin the internal mount of filesystem @type.
 *
 * If *@mount is not set yet, the filesystem is mounted with
 * vfs_kern_mount() and the result stored in *@mount.  In every successful
 * case a reference is taken on *@mount and *@count is incremented.
 *
 * Returns 0 on success or the error from vfs_kern_mount().
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                /* the mount itself is done with the spinlock dropped */
                spin_unlock(&pin_fs_lock);
                mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
                /* recheck: *mount may have been set while the lock was dropped */
                if (!*mount)
                        *mount = mnt;
        }
        mntget(*mount);
        ++*count;
        spin_unlock(&pin_fs_lock);
        /*
         * Drop the reference obtained from vfs_kern_mount(), if any; mnt
         * is still NULL when *mount was already set on entry.
         */
        mntput(mnt);
        return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

/*
 * Undo one simple_pin_fs(): decrement *@count, clear *@mount when the
 * count reaches zero, and drop one reference on the mount that was stored
 * there.  The reference is dropped after pin_fs_lock has been released.
 */
void simple_release_fs(struct vfsmount **mount, int *count)
{
        struct vfsmount *pinned;

        spin_lock(&pin_fs_lock);
        pinned = *mount;
        if (--*count == 0)
                *mount = NULL;
        spin_unlock(&pin_fs_lock);

        mntput(pinned);
}
EXPORT_SYMBOL(simple_release_fs);

/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 * A negative offset gives -EINVAL, a copy that transfers nothing gives
 * -EFAULT, and 0 is returned when @count is zero or the offset is at or
 * beyond @available.
 **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t uncopied;
        size_t copied;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* never read past the end of the buffer */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_to_user(to, from + pos, count);
        copied = count - uncopied;
        if (!copied)
                return -EFAULT;

        *ppos = pos + copied;
        return copied;
}
EXPORT_SYMBOL(simple_read_from_buffer);

/**
 * simple_write_to_buffer - copy data from user space to the buffer
 * @to: the buffer to write to
 * @available: the size of the buffer
 * @ppos: the current position in the buffer
 * @from: the user space buffer to read from
 * @count: the maximum number of bytes to read
 *
 * The simple_write_to_buffer() function reads up to @count bytes from the user
 * space address starting at @from into the buffer @to at offset @ppos.
 *
 * On success, the number of bytes written is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 * A negative offset gives -EINVAL, a copy that transfers nothing gives
 * -EFAULT, and 0 is returned when @count is zero or the offset is at or
 * beyond @available.
 **/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count)
{
        loff_t pos = *ppos;
        size_t uncopied;
        size_t written;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* never write past the end of the buffer */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_from_user(to + pos, from, count);
        written = count - uncopied;
        if (!written)
                return -EFAULT;

        *ppos = pos + written;
        return written;
}
EXPORT_SYMBOL(simple_write_to_buffer);

/**
 * memory_read_from_buffer - copy data from the buffer
 * @to: the kernel space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The memory_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the kernel space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 * A negative offset gives -EINVAL; 0 is returned when the offset is at or
 * beyond @available.
 **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t remaining;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available)
                return 0;

        /* clamp the request to what is left in the buffer */
        remaining = available - pos;
        if (count > remaining)
                count = remaining;

        memcpy(to, from + pos, count);
        *ppos = pos + count;

        return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);

/*
 * Transaction based IO.
 * The file expects a single write which triggers the transaction, and then
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */

/*
 * Record @n as the size of the response held in the transaction buffer
 * attached to @file (file->private_data).  It is a bug for @n to exceed
 * SIMPLE_TRANSACTION_LIMIT.
 */
void simple_transaction_set(struct file *file, size_t n)
{
        struct simple_transaction_argresp *ar = file->private_data;

        BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

        /*
         * The barrier ensures that ar->size will really remain zero until
         * ar->data is ready for reading.
         */
        smp_mb();
        ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/**
 * simple_transaction_get - start a transaction by capturing the user's request
 * @file: file the request was written to
 * @buf: user-space request data
 * @size: length of the request; at most SIMPLE_TRANSACTION_LIMIT - 1 bytes
 *
 * Allocates a zeroed page, attaches it to @file->private_data and copies
 * the request into its data area.  Only one request is accepted per open.
 *
 * Return: pointer to the copied request data, or an ERR_PTR():
 * -EFBIG (request too large), -ENOMEM (no page available), -EBUSY (the
 * file already carries a transaction buffer) or -EFAULT (copying from
 * user space failed).
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
        static DEFINE_SPINLOCK(simple_transaction_lock);
        struct simple_transaction_argresp *argresp;
        bool already_used;

        if (size > SIMPLE_TRANSACTION_LIMIT - 1)
                return ERR_PTR(-EFBIG);

        argresp = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
        if (!argresp)
                return ERR_PTR(-ENOMEM);

        /* Claim the file under the lock: only one write allowed per open. */
        spin_lock(&simple_transaction_lock);
        already_used = file->private_data != NULL;
        if (!already_used)
                file->private_data = argresp;
        spin_unlock(&simple_transaction_lock);

        if (already_used) {
                /* Somebody else's buffer is installed; ours is surplus. */
                free_page((unsigned long)argresp);
                return ERR_PTR(-EBUSY);
        }

        /*
         * On failure the page stays attached to the file; it is not
         * freed here.
         */
        if (copy_from_user(argresp->data, buf, size))
                return ERR_PTR(-EFAULT);

        return argresp->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/**
 * simple_transaction_read - hand the transaction reply back to user space
 * @file: file carrying the transaction buffer in ->private_data
 * @buf: user-space destination
 * @size: maximum number of bytes to copy
 * @pos: in/out read offset
 *
 * Return: 0 when no transaction buffer is attached to @file, otherwise the
 * result of simple_read_from_buffer() over the stored reply.
 */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
        struct simple_transaction_argresp *argresp = file->private_data;

        return argresp ?
                simple_read_from_buffer(buf, size, pos, argresp->data, argresp->size) : 0;
}
EXPORT_SYMBOL(simple_transaction_read);

/**
 * simple_transaction_release - free the transaction buffer on release
 * @inode: unused
 * @file: file whose ->private_data is passed to free_page()
 *
 * Return: always 0.
 */
int simple_transaction_release(struct inode *inode, struct file *file)
{
        unsigned long page_addr = (unsigned long)file->private_data;

        free_page(page_addr);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

/*
 * Per-open state of a simple attribute file.  It is allocated by
 * simple_attr_open(), stored in file->private_data and freed by
 * simple_attr_release().
 */
struct simple_attr {
        /* fetches the value; reads fail with -EACCES when this is NULL */
        int (*get)(void *, u64 *);
        /* stores a new value; writes fail with -EACCES when this is NULL */
        int (*set)(void *, u64);
        /* formatted text handed out by simple_attr_read() */
        char get_buf[24];        /* enough to store a u64 and "\n\0" */
        /* NUL-terminated copy of the user's input, parsed on write */
        char set_buf[24];
        /* cookie passed to ->get/->set; taken from inode->i_private at open */
        void *data;
        const char *fmt;        /* format for read operation */
        struct mutex mutex;        /* protects access to these buffers */
};

/**
 * simple_attr_open - set up the per-open state of a simple attribute file
 * @inode: inode being opened; its ->i_private becomes the accessor cookie
 * @file: file being opened; receives the new state in ->private_data
 * @get: accessor used by reads, may be NULL
 * @set: accessor used by writes, may be NULL
 * @fmt: format string used to render the value on read
 *
 * Meant to be called from the open file operation of an actual attribute
 * to install the attribute specific accessors.
 *
 * Return: -ENOMEM if the state cannot be allocated, otherwise the result
 * of nonseekable_open().
 */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt)
{
        struct simple_attr *state = kzalloc(sizeof(*state), GFP_KERNEL);

        if (!state)
                return -ENOMEM;

        mutex_init(&state->mutex);
        state->data = inode->i_private;
        state->fmt = fmt;
        state->get = get;
        state->set = set;

        file->private_data = state;

        return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/**
 * simple_attr_release - free the per-open state of a simple attribute file
 * @inode: unused
 * @file: file whose ->private_data is freed with kfree()
 *
 * Return: always 0.
 */
int simple_attr_release(struct inode *inode, struct file *file)
{
        struct simple_attr *state = file->private_data;

        kfree(state);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);        /* GPL-only?  This?  Really? */

/**
 * simple_attr_read - read handler for simple attribute files
 * @file: file opened through simple_attr_open()
 * @buf: user-space destination
 * @len: maximum number of bytes to copy
 * @ppos: in/out read offset
 *
 * The value is fetched through ->get and formatted into the per-open
 * buffer when the read starts at offset 0 or the buffer is still empty;
 * otherwise the text left by the previous read is reused.
 *
 * Return: -EACCES if there is no ->get accessor, the error from
 * mutex_lock_interruptible() or ->get() if either fails, otherwise the
 * result of simple_read_from_buffer().
 */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos)
{
        struct simple_attr *attr = file->private_data;
        size_t nbytes;
        ssize_t rc;

        if (!attr->get)
                return -EACCES;

        rc = mutex_lock_interruptible(&attr->mutex);
        if (rc)
                return rc;

        if (!*ppos || !attr->get_buf[0]) {
                /* first read: fetch and format a fresh value */
                u64 value;

                rc = attr->get(attr->data, &value);
                if (rc)
                        goto unlock;

                nbytes = scnprintf(attr->get_buf, sizeof(attr->get_buf),
                                   attr->fmt, (unsigned long long)value);
        } else {
                /* continued read: reuse the text formatted earlier */
                nbytes = strlen(attr->get_buf);
        }

        rc = simple_read_from_buffer(buf, len, ppos, attr->get_buf, nbytes);
unlock:
        mutex_unlock(&attr->mutex);
        return rc;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/*
 * simple_attr_write_xsigned - interpret the user's input as a number and
 * pass it to the ->set accessor.
 *
 * @file:      file opened through simple_attr_open()
 * @buf:       user-space input
 * @len:       length of the input; only the first sizeof(set_buf) - 1 bytes
 *             are looked at
 * @ppos:      unused
 * @is_signed: parse with kstrtoll() when true, with kstrtoull() otherwise
 *
 * Returns @len on success (the whole input is claimed even if it was
 * truncated), -EACCES if there is no ->set accessor, -EFAULT if the input
 * cannot be copied, or the error reported by mutex_lock_interruptible(),
 * the parser or ->set().
 */
static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos, bool is_signed)
{
        struct simple_attr *attr;
        unsigned long long val = 0;
        size_t size;
        ssize_t ret;

        attr = file->private_data;
        if (!attr->set)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        ret = -EFAULT;
        size = min(sizeof(attr->set_buf) - 1, len);
        if (copy_from_user(attr->set_buf, buf, size))
                goto out;

        attr->set_buf[size] = '\0';
        if (is_signed) {
                /*
                 * kstrtoll() stores through a "long long *"; parse into a
                 * correctly typed temporary instead of aliasing the
                 * unsigned variable, then convert the value explicitly.
                 */
                long long sval;

                ret = kstrtoll(attr->set_buf, 0, &sval);
                if (!ret)
                        val = sval;
        } else {
                ret = kstrtoull(attr->set_buf, 0, &val);
        }
        if (ret)
                goto out;
        ret = attr->set(attr->data, val);
        if (ret == 0)
                ret = len; /* on success, claim we got the whole input */
out:
        mutex_unlock(&attr->mutex);
        return ret;
}

/*
 * Write handler for simple attribute files holding an unsigned value:
 * forwards to simple_attr_write_xsigned() with is_signed == false and
 * returns its result.
 */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, false);
}
EXPORT_SYMBOL_GPL(simple_attr_write);

/*
 * Write handler for simple attribute files holding a signed value:
 * forwards to simple_attr_write_xsigned() with is_signed == true and
 * returns its result.
 */
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, true);
}
EXPORT_SYMBOL_GPL(simple_attr_write_signed);

/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb:         filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:     length of the file handle; handles shorter than 2 are rejected
 * @fh_type:    type of file handle
 * @get_inode:  filesystem callback to retrieve inode
 *
 * Decodes @fid when it is of type FILEID_INO32_GEN or
 * FILEID_INO32_GEN_PARENT and asks @get_inode for the inode of the object
 * named by the handle.  For any other type no inode is looked up.
 *
 * Return: NULL when @fh_len is below 2, otherwise the result of
 * d_obtain_alias() on the inode found (or on NULL if none was looked up).
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *target = NULL;

        if (fh_len < 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN ||
            fh_type == FILEID_INO32_GEN_PARENT)
                target = get_inode(sb, fid->i32.ino, fid->i32.gen);

        return d_obtain_alias(target);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb:         filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:     length of the file handle; must be greater than 2
 * @fh_type:    type of file handle
 * @get_inode:  filesystem callback to retrieve inode
 *
 * Decodes @fid when it is of type FILEID_INO32_GEN_PARENT and asks
 * @get_inode for the inode of the _parent_ object named by the handle.
 * The parent generation is taken from the handle only when @fh_len is
 * greater than 3; otherwise 0 is used.
 *
 * Return: NULL when @fh_len is 2 or less, otherwise the result of
 * d_obtain_alias() on the inode found (or on NULL if none was looked up).
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *parent = NULL;
        u32 parent_gen = 0;

        if (fh_len <= 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN_PARENT) {
                if (fh_len > 3)
                        parent_gen = fid->i32.parent_gen;
                parent = get_inode(sb, fid->i32.parent_ino, parent_gen);
        }

        return d_obtain_alias(parent);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:       file to synchronize
 * @start:      start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:   only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 *
 * Return: the error from file_write_and_wait_range() if that fails;
 * otherwise the first non-zero result among sync_mapping_buffers(),
 * sync_inode_metadata() (when it is called) and
 * file_check_and_advance_wb_err(), or 0.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                                 int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int status;
        int rc;

        rc = file_write_and_wait_range(file, start, end);
        if (rc)
                return rc;

        inode_lock(inode);
        status = sync_mapping_buffers(inode->i_mapping);

        /*
         * The inode itself is written only when it is dirty at all and,
         * for a datasync, only when the dirt matters for datasync.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                rc = sync_inode_metadata(inode, 1);
                if (status == 0)
                        status = rc;
        }
        inode_unlock(inode);

        /* check and advance again to catch errors after syncing out buffers */
        rc = file_check_and_advance_wb_err(file);
        if (status == 0)
                status = rc;
        return status;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *                      with flush
 * @file:       file to synchronize
 * @start:      start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:   only synchronize essential metadata if true
 *
 * Runs __generic_file_fsync() and, if that succeeds, issues a flush to the
 * block device of the file's superblock.
 *
 * Return: the error from __generic_file_fsync() if it fails, otherwise the
 * result of blkdev_issue_flush().
 */
int generic_file_fsync(struct file *file, loff_t start, loff_t end,
                       int datasync)
{
        struct inode *host = file->f_mapping->host;
        int rc = __generic_file_fsync(file, start, end, datasync);

        if (rc == 0)
                rc = blkdev_issue_flush(host->i_sb->s_bdev, GFP_KERNEL);
        return rc;
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits:     log of file system block size
 * @num_blocks:         number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.
 *
 * Return: 0 if the file system is addressable or @num_blocks is 0,
 * -EINVAL if @blocksize_bits is below 9 or above PAGE_SHIFT, and -EFBIG
 * if the last block or the last page is out of range.
 */
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block;
        u64 last_fs_page;

        if (unlikely(num_blocks == 0))
                return 0;

        if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
                return -EINVAL;

        /*
         * Derive the last block/page only after @blocksize_bits has been
         * validated: with blocksize_bits > PAGE_SHIFT the shift count
         * below would wrap to a huge unsigned value, which is undefined.
         */
        last_fs_block = num_blocks - 1;
        last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits);

        if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
            (last_fs_page > (pgoff_t)(~0ULL))) {
                return -EFBIG;
        }
        return 0;
}
EXPORT_SYMBOL(generic_check_addressable);

/*
 * No-op implementation of ->fsync for in-memory filesystems.
 *
 * All arguments are ignored and 0 is returned unconditionally.
 */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        return 0;
}
EXPORT_SYMBOL(noop_fsync);

/*
 * No-op ->set_page_dirty implementation: @page is ignored and 0 is
 * returned unconditionally.
 */
int noop_set_page_dirty(struct page *page)
{
        /*
         * Unlike __set_page_dirty_no_writeback that handles dirty page
         * tracking in the page object, dax does all dirty tracking in
         * the inode address_space in response to mkwrite faults. In the
         * dax case we only need to worry about potentially dirty CPU
         * caches, not dirty page cache pages to write back.
         *
         * This callback is defined to prevent fallback to
         * __set_page_dirty_buffers() in set_page_dirty().
         */
        return 0;
}
EXPORT_SYMBOL_GPL(noop_set_page_dirty);

/*
 * No-op ->invalidatepage implementation: all arguments are ignored and
 * nothing is done.
 */
void noop_invalidatepage(struct page *page, unsigned int offset,
                unsigned int length)
{
        /*
         * There is no page cache to invalidate in the dax case, however
         * we need this callback defined to prevent falling back to
         * block_invalidatepage() in do_invalidatepage().
         */
}
EXPORT_SYMBOL_GPL(noop_invalidatepage);

/*
 * Placeholder ->direct_IO implementation: both arguments are ignored and
 * -EINVAL is returned unconditionally.
 */
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        /*
         * iomap based filesystems support direct I/O without need for
         * this callback. However, it still needs to be set in
         * inode->a_ops so that open/fcntl know that direct I/O is
         * generally supported.
         */
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);

/*
 * Wrapper that frees @p with kfree() behind a plain void (*)(void *)
 * signature.
 *
 * Because kfree isn't assignment-compatible with void(void*) ;-/
 */
void kfree_link(void *p)
{
        kfree(p);
}
EXPORT_SYMBOL(kfree_link);

/*
 * nop .set_page_dirty method so that people can use .page_mkwrite on
 * anon inodes.
 *
 * @page is ignored; the function always returns 0.
 */
static int anon_set_page_dirty(struct page *page)
{
        return 0;
}

/**
 * alloc_anon_inode - allocate and initialise an anonymous inode
 * @s: superblock the inode is allocated on
 *
 * A single inode exists for all anon_inode files.  Contrary to pipes,
 * anon_inode inodes have no associated per-instance data, so only one
 * of them needs to be allocated.
 *
 * The inode is marked S_PRIVATE, gets mode S_IRUSR | S_IWUSR, is owned by
 * the caller's fsuid/fsgid and uses address space operations whose
 * ->set_page_dirty does nothing.
 *
 * Return: the new inode, or ERR_PTR(-ENOMEM) if it cannot be allocated.
 */
struct inode *alloc_anon_inode(struct super_block *s)
{
        static const struct address_space_operations anon_aops = {
                .set_page_dirty = anon_set_page_dirty,
        };
        struct inode *anon;

        anon = new_inode_pseudo(s);
        if (!anon)
                return ERR_PTR(-ENOMEM);

        anon->i_ino = get_next_ino();
        anon->i_mapping->a_ops = &anon_aops;

        /*
         * Start out in the dirty state: mark_inode_dirty() will then
         * think the inode already _is_ on the dirty list and never
         * move it there.
         */
        anon->i_state = I_DIRTY;

        anon->i_flags |= S_PRIVATE;
        anon->i_mode = S_IRUSR | S_IWUSR;
        anon->i_uid = current_fsuid();
        anon->i_gid = current_fsgid();
        anon->i_atime = anon->i_mtime = anon->i_ctime = current_time(anon);

        return anon;
}
EXPORT_SYMBOL(alloc_anon_inode);

/**
 * simple_nosetlease - generic helper for prohibiting leases
 * @filp: file pointer (ignored)
 * @arg: type of lease to obtain (ignored)
 * @flp: new lease supplied for insertion (ignored)
 * @priv: private data for lm_setup operation (ignored)
 *
 * Generic helper for filesystems that do not wish to allow leases to be set.
 *
 * Return: always -EINVAL.
 */
int
simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
                  void **priv)
{
        return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * Generic helper for filesystems to use for symlink inodes where a pointer to
 * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
 * since as an optimization the path lookup code uses any non-NULL ->i_link
 * directly, without calling ->get_link().  But ->get_link() still must be set,
 * to mark the inode_operations as being for a symlink.
 *
 * Return: the symlink target, i.e. @inode->i_link as it is
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
                            struct delayed_call *done)
{
        return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);

/*
 * Inode operations for symlinks whose target is stored in ->i_link; only
 * ->get_link is provided, implemented by simple_get_link().
 */
const struct inode_operations simple_symlink_inode_operations = {
        .get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
 */
/*
 * ->lookup for an empty directory: no name can be found, so every lookup
 * yields ERR_PTR(-ENOENT).  All arguments are ignored.
 */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        return ERR_PTR(-ENOENT);
}

/*
 * ->getattr for an empty directory: fill @stat from the inode behind
 * @path using generic_fillattr().  @request_mask and @query_flags are not
 * consulted.  Always returns 0.
 */
static int empty_dir_getattr(const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        generic_fillattr(d_inode(path->dentry), stat);
        return 0;
}

/*
 * ->setattr for an empty directory: attribute changes are refused with
 * -EPERM.  Both arguments are ignored.
 */
static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
{
        return -EPERM;
}

/*
 * ->listxattr for an empty directory: always fails with -EOPNOTSUPP.
 * All arguments are ignored.
 */
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
        return -EOPNOTSUPP;
}

/*
 * Inode operations installed by make_empty_dir_inode().  Lookups fail,
 * attribute changes and xattr listing are refused; permission checks use
 * generic_permission().
 */
static const struct inode_operations empty_dir_inode_operations = {
        .lookup                = empty_dir_lookup,
        .permission        = generic_permission,
        .setattr        = empty_dir_setattr,
        .getattr        = empty_dir_getattr,
        .listxattr        = empty_dir_listxattr,
};

/*
 * ->llseek for an empty directory: delegates to generic_file_llseek_size()
 * with both of its trailing limit arguments set to 2.
 */
static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
        /* An empty directory has two entries . and .. at offsets 0 and 1 */
        return generic_file_llseek_size(file, offset, whence, 2, 2);
}

/*
 * ->iterate_shared for an empty directory: emits only the dot entries via
 * dir_emit_dots().  Its result is not checked; 0 is always returned.
 */
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
        dir_emit_dots(file, ctx);
        return 0;
}

/*
 * File operations installed by make_empty_dir_inode(); is_empty_dir_inode()
 * recognises such inodes by comparing ->i_fop against this table.
 */
static const struct file_operations empty_dir_operations = {
        .llseek                = empty_dir_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = empty_dir_readdir,
        .fsync                = noop_fsync,
};


/**
 * make_empty_dir_inode - turn an inode into a permanently empty directory
 * @inode: inode to reinitialise
 *
 * Resets link count, mode, ownership, device number, size and block
 * accounting, clears IOP_XATTR from ->i_opflags and installs the
 * empty-directory inode and file operations.
 */
void make_empty_dir_inode(struct inode *inode)
{
        set_nlink(inode, 2);

        /* Directory owned by global root, readable and searchable by all. */
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;

        /* No device number and no contents. */
        inode->i_rdev = 0;
        inode->i_size = 0;
        inode->i_blocks = 0;
        inode->i_blkbits = PAGE_SHIFT;

        /* Route all operations through the empty-directory tables. */
        inode->i_fop = &empty_dir_operations;
        inode->i_op = &empty_dir_inode_operations;
        inode->i_opflags &= ~IOP_XATTR;
}

/**
 * is_empty_dir_inode - test whether an inode was set up as an empty directory
 * @inode: inode to test
 *
 * Return: true only if both the file operations and the inode operations
 * of @inode are the empty-directory tables.
 */
bool is_empty_dir_inode(struct inode *inode)
{
        if (inode->i_fop != &empty_dir_operations)
                return false;

        return inode->i_op == &empty_dir_inode_operations;
}

#ifdef CONFIG_UNICODE
/*
 * Determine if the names in a directory should be casefolded: the
 * directory must be flagged as casefolded and its superblock must have an
 * encoding set.
 *
 * Return: if names will need casefolding
 */
static bool needs_casefold(const struct inode *dir)
{
        if (!IS_CASEFOLDED(dir))
                return false;

        return dir->i_sb->s_encoding;
}

/**
 * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
 * @dentry:     dentry whose name we are checking against
 * @len:        len of name of dentry
 * @str:        str pointer to name of dentry
 * @name:       Name to compare against
 *
 * When the parent directory does not need casefolding, or when the
 * case-insensitive comparison fails on a superblock without strict
 * encoding, the names are compared byte for byte instead.
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                          const char *str, const struct qstr *name)
{
        const struct dentry *parent = READ_ONCE(dentry->d_parent);
        const struct inode *dir = READ_ONCE(parent->d_inode);
        const struct super_block *sb = dentry->d_sb;
        const struct unicode_map *um = sb->s_encoding;
        struct qstr qstr = QSTR_INIT(str, len);
        char strbuf[DNAME_INLINE_LEN];
        int ret;

        /* No parent inode or no casefolding there: plain comparison. */
        if (!dir || !needs_casefold(dir))
                goto fallback;
        /*
         * If the dentry name is stored in-line, then it may be concurrently
         * modified by a rename.  If this happens, the VFS will eventually retry
         * the lookup, so it doesn't matter what ->d_compare() returns.
         * However, it's unsafe to call utf8_strncasecmp() with an unstable
         * string.  Therefore, we have to copy the name into a temporary buffer.
         */
        if (len <= DNAME_INLINE_LEN - 1) {
                memcpy(strbuf, str, len);
                strbuf[len] = 0;
                qstr.name = strbuf;
                /* prevent compiler from optimizing out the temporary buffer */
                barrier();
        }
        ret = utf8_strncasecmp(um, name, &qstr);
        /* A non-negative result is handed back to the caller as it is. */
        if (ret >= 0)
                return ret;

        /*
         * The comparison itself failed: strict-encoding superblocks report
         * -EINVAL, all others fall through to the byte-wise comparison.
         */
        if (sb_has_strict_encoding(sb))
                return -EINVAL;
fallback:
        /* Exact match required: same length and identical bytes. */
        if (len != name->len)
                return 1;
        return !!memcmp(str, name->name, len);
}
EXPORT_SYMBOL(generic_ci_d_compare);

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:     dentry of the parent directory
 * @str:        qstr of name whose hash we should fill in
 *
 * Nothing is done when @dentry has no inode or the directory does not
 * need casefolding.  A failing utf8_casefold_hash() is reported only on
 * superblocks with strict encoding.
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
        const struct inode *dir = READ_ONCE(dentry->d_inode);
        struct super_block *sb = dentry->d_sb;
        const struct unicode_map *um = sb->s_encoding;

        if (dir && needs_casefold(dir)) {
                if (utf8_casefold_hash(um, dentry, str) < 0 &&
                    sb_has_strict_encoding(sb))
                        return -EINVAL;
        }

        return 0;
}
EXPORT_SYMBOL(generic_ci_d_hash);
#endif
















































































    5 


    5 






    5 





    5 



    5 






























    6 




    5 




    5 

    6 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

/*
 * Kernel mount of anon_inodefs and the one inode shared by every file
 * created through anon_inode_getfile(); both are set up by
 * anon_inode_init().
 */
static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;

/*
 * anon_inodefs_dname() is called from d_path().
 *
 * Formats the dentry's name as "anon_inode:<name>" into @buffer using
 * dynamic_dname() and returns its result.
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
                                dentry->d_name.name);
}

/*
 * Dentry operations for anon_inodefs; only ->d_dname is provided.
 * Installed by anon_inodefs_init_fs_context().
 */
static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_dname        = anon_inodefs_dname,
};

/*
 * ->init_fs_context for anon_inodefs: set up a pseudo filesystem context
 * with the ANON_INODE_FS_MAGIC magic and the anon_inodefs dentry
 * operations.
 *
 * Returns 0 on success or -ENOMEM when init_pseudo() yields no context.
 */
static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pseudo;

        pseudo = init_pseudo(fc, ANON_INODE_FS_MAGIC);
        if (pseudo) {
                pseudo->dops = &anon_inodefs_dentry_operations;
                return 0;
        }

        return -ENOMEM;
}

/*
 * The "anon_inodefs" filesystem type; mounted once from anon_inode_init()
 * via kern_mount().
 */
static struct file_system_type anon_inode_fs_type = {
        .name                = "anon_inodefs",
        .init_fs_context = anon_inodefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/**
 * anon_inode_getfile - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describe the "class"
 *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags; only the O_ACCMODE and O_NONBLOCK bits are used
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup.
 *
 * A reference on @fops->owner (if set) and on the shared inode is taken
 * for the new file; both are dropped again if the file cannot be created.
 *
 * Return: the newly created file* or an error pointer: -ENODEV if the
 * shared inode is not available, -ENOENT if the module reference cannot be
 * taken, or the error from alloc_file_pseudo().
 */
struct file *anon_inode_getfile(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags)
{
        struct file *filp;

        if (IS_ERR(anon_inode_inode))
                return ERR_PTR(-ENODEV);

        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);

        /*
         * We know the anon_inode inode count is always greater than zero,
         * so ihold() is safe.
         */
        ihold(anon_inode_inode);
        filp = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
                                 flags & (O_ACCMODE | O_NONBLOCK), fops);
        if (IS_ERR(filp)) {
                /* Undo the inode and module references taken above. */
                iput(anon_inode_inode);
                module_put(fops->owner);
                return filp;
        }

        filp->f_mapping = anon_inode_inode->i_mapping;
        filp->private_data = priv;

        return filp;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

/**
 * anon_inode_getfd - creates a new file instance by hooking it up to an
 *                    anonymous inode, and a dentry that describe the "class"
 *                    of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfd() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup.
 *
 * Return: the new descriptor, or a negative error code from
 * get_unused_fd_flags() or anon_inode_getfile().  The reserved descriptor
 * is released again when the file cannot be created.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
                     void *priv, int flags)
{
        struct file *filp;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                return fd;

        filp = anon_inode_getfile(name, fops, priv, flags);
        if (IS_ERR(filp)) {
                int err = PTR_ERR(filp);

                put_unused_fd(fd);
                return err;
        }

        fd_install(fd, filp);
        return fd;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

/*
 * Mount anon_inodefs inside the kernel and allocate the single inode that
 * all anonymous-inode files share.  Either step failing ends in panic().
 *
 * Returns 0.
 */
static int __init anon_inode_init(void)
{
        struct vfsmount *mnt = kern_mount(&anon_inode_fs_type);
        struct inode *inode;

        anon_inode_mnt = mnt;
        if (IS_ERR(mnt))
                panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(mnt));

        inode = alloc_anon_inode(mnt->mnt_sb);
        anon_inode_inode = inode;
        if (IS_ERR(inode))
                panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(inode));

        return 0;
}

fs_initcall(anon_inode_init);




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 

    1 


















    1 
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *   Thus: Perfect SMP scaling between independent semaphore arrays.
 *         If multiple semaphores in one array are used, then cache line
 *         thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows to achieve FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sched/wake_q.h>
#include <linux/nospec.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>
#include "util.h"

/*
 * One semaphore structure for each semaphore in the system.
 *
 * Instances live in the sems[] flexible array at the end of struct sem_array.
 * The type is marked ____cacheline_aligned_in_smp.
 */
struct sem {
        int        semval;                /* current value; perform_atomic_semop_slow()
                                         * refuses results outside 0..SEMVMX */
        /*
         * PID of the process that last modified the semaphore. For
         * Linux, specifically these are:
         *  - semop
         *  - semctl, via SETVAL and SETALL.
         *  - at task exit when performing undo adjustments (see exit_sem).
         */
        struct pid *sempid;
        spinlock_t        lock;        /* per-semaphore lock, taken by sem_lock() for
                                 * single-sop requests while the array is not
                                 * in global-lock mode */
        struct list_head pending_alter; /* pending single-sop operations */
                                        /* that alter the semaphore */
        struct list_head pending_const; /* pending single-sop operations */
                                        /* that do not alter the semaphore */
        time64_t         sem_otime;        /* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/*
 * One sem_array data structure for each set of semaphores in the system.
 *
 * The header and the trailing sems[] array are allocated as one object by
 * sem_alloc().
 */
struct sem_array {
        struct kern_ipc_perm        sem_perm;        /* permissions .. see ipc.h; */
                                                /* sem_perm.lock is the array-wide */
                                                /* ("global") lock used by sem_lock() */
        time64_t                sem_ctime;        /* create/last semctl() time */
        struct list_head        pending_alter;        /* pending operations */
                                                /* that alter the array */
        struct list_head        pending_const;        /* pending complex operations */
                                                /* that do not alter semvals */
        struct list_head        list_id;        /* undo requests on this array */
        int                        sem_nsems;        /* no. of semaphores in array */
        int                        complex_count;        /* pending complex operations */
        unsigned int                use_global_lock;/* >0: global lock required; */
                                                /* set by complexmode_enter(), */
                                                /* counted down by complexmode_tryleave() */

        struct sem                sems[];                /* sem_nsems entries */
} __randomize_layout;

/*
 * One queue entry for each sleeping process in the system; it describes the
 * semaphore operation(s) that process is waiting to complete.
 */
struct sem_queue {
        struct list_head        list;         /* queue of pending operations */
        struct task_struct        *sleeper; /* this process */
        struct sem_undo                *undo;         /* undo structure; dereferenced only for
                                         * sops that carry SEM_UNDO */
        struct pid                *pid;         /* process id of requesting process */
        int                        status;         /* completion status of operation
                                         * (ordering rules: see SEM_BARRIER_2) */
        struct sembuf                *sops;         /* array of pending operations */
        struct sembuf                *blocking; /* the operation that blocked; set by
                                         * perform_atomic_semop_slow() */
        int                        nsops;         /* number of operations */
        bool                        alter;         /* does *sops alter the array? */
        bool                    dupsop;         /* sops on more than one sem_num
                                         * NOTE(review): the name suggests several
                                         * sops hitting the *same* sem_num; confirm
                                         * against the code that sets it */
};

/*
 * Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 *
 * One sem_undo exists per (process, semaphore array) pair and is linked on
 * two lists: the owning process's list (list_proc) and the array's list
 * (list_id).
 */
struct sem_undo {
        struct list_head        list_proc;        /* per-process list:
                                                 * all undos from one process,
                                                 * rcu protected */
        struct rcu_head                rcu;                /* rcu struct for sem_undo */
        struct sem_undo_list        *ulp;                /* back ptr to sem_undo_list */
        struct list_head        list_id;        /* per semaphore array list:
                                                 * all undos for one array */
        int                        semid;                /* semaphore set identifier */
        short                        *semadj;        /* array of adjustments, */
                                                /* one per semaphore; kept within */
                                                /* [-SEMAEM - 1, SEMAEM] by */
                                                /* perform_atomic_semop_slow() */
};

/*
 * sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks of a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
        refcount_t                refcnt;                /* reference count of this list */
        spinlock_t                lock;                /* taken for writes to list_proc */
        struct list_head        list_proc;        /* sem_undo.list_proc entries; */
                                                /* read under rcu */
};


/* Shorthand for the semaphore slot (IPC_SEM_IDS) of a namespace's ids[] table. */
#define sem_ids(ns)        ((ns)->ids[IPC_SEM_IDS])

/* Forward declarations for functions used before their definition. */
static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

/*
 * NOTE(review): presumably the sizes of on-stack fast-path buffers; the
 * users of these two constants are outside this part of the file.
 */
#define SEMMSL_FAST        256 /* 512 bytes on stack */
#define SEMOPM_FAST        64  /* ~ 372 bytes on stack */

/*
 * Switching from the mode suitable for simple ops
 * to the mode for complex ops is costly. Therefore:
 * use some hysteresis.
 *
 * complexmode_enter() sets sem_array.use_global_lock to this value and
 * complexmode_tryleave() counts it down by one per call while no complex
 * operation is pending; simple mode resumes when the count, at 1, is
 * set to 0.
 */
#define USE_GLOBAL_LOCK_HYSTERESIS        10

/*
 * Locking:
 * a) global sem_lock() for read/write
 *        sem_undo.id_next,
 *        sem_array.complex_count,
 *        sem_array.pending{_alter,_const},
 *        sem_array.sem_undo
 *
 * b) global or semaphore sem_lock() for read/write:
 *        sem_array.sems[i].pending_{const,alter}:
 *
 * c) special:
 *        sem_undo_list.list_proc:
 *        * undo_list->lock for write
 *        * rcu for read
 *        use_global_lock:
 *        * global sem_lock() for write
 *        * either local or global sem_lock() for read.
 *
 * Memory ordering:
 * Most ordering is enforced by using spin_lock() and spin_unlock().
 *
 * Exceptions:
 * 1) use_global_lock: (SEM_BARRIER_1)
 * Setting it from non-zero to 0 is a RELEASE, this is ensured by
 * using smp_store_release(): Immediately after setting it to 0,
 * a simple op can start.
 * Testing if it is non-zero is an ACQUIRE, this is ensured by using
 * smp_load_acquire().
 * Setting it from 0 to non-zero must be ordered with regards to
 * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
 * is inside a spin_lock() and after a write from 0 to non-zero a
 * spin_lock()+spin_unlock() is done.
 *
 * 2) queue.status: (SEM_BARRIER_2)
 * Initialization is done while holding sem_lock(), so no further barrier is
 * required.
 * Setting it to a result code is a RELEASE, this is ensured by both a
 * smp_store_release() (for case a) and while holding sem_lock()
 * (for case b).
 * The ACQUIRE when reading the result code without holding sem_lock() is
 * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
 * (case a above).
 * Reading the result code while holding sem_lock() needs no further barriers,
 * the locks inside sem_lock() enforce ordering (case b above)
 *
 * 3) current->state:
 * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
 * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
 * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
 * when holding sem_lock(), no further barriers are required.
 *
 * See also ipc/mqueue.c for more details on the covered races.
 */

/*
 * Named aliases for the four slots of the sem_ctls[] array; they are used
 * as member names, e.g. ns->sc_semmsl. sem_init_ns() initialises them from
 * SEMMSL, SEMMNS, SEMOPM and SEMMNI respectively.
 */
#define sc_semmsl        sem_ctls[0]
#define sc_semmns        sem_ctls[1]
#define sc_semopm        sem_ctls[2]
#define sc_semmni        sem_ctls[3]

/*
 * sem_init_ns - set up the semaphore state of an IPC namespace
 *
 * Loads the four tunables with their compile-time defaults, clears the
 * count of semaphores in use and initialises the namespace's semaphore
 * id table.
 */
void sem_init_ns(struct ipc_namespace *ns)
{
        /* No semaphores exist in a fresh namespace. */
        ns->used_sems = 0;

        /* Default limits. */
        ns->sc_semmni = SEMMNI;
        ns->sc_semopm = SEMOPM;
        ns->sc_semmns = SEMMNS;
        ns->sc_semmsl = SEMMSL;

        ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
/*
 * sem_exit_ns - tear down the semaphore state of an IPC namespace
 *
 * Passes every remaining semaphore array to freeary() through free_ipcs(),
 * then destroys the idr and the key hash table of the semaphore id table.
 */
void sem_exit_ns(struct ipc_namespace *ns)
{
        free_ipcs(ns, &sem_ids(ns), freeary);
        idr_destroy(&sem_ids(ns).ipcs_idr);
        rhashtable_destroy(&sem_ids(ns).key_ht);
}
#endif

/*
 * sem_init - one-time initialisation of the semaphore subsystem
 *
 * Initialises the semaphore state of the initial IPC namespace
 * (init_ipc_ns) and registers the "sysvipc/sem" proc interface, passing
 * the column header line and sysvipc_sem_proc_show() as the callback that
 * prints one entry.
 */
void __init sem_init(void)
{
        sem_init_ns(&init_ipc_ns);
        ipc_init_proc_interface("sysvipc/sem",
                                "       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
                                IPC_SEM_IDS, sysvipc_sem_proc_show);
}

/**
 * unmerge_queues - redistribute the global alter queue, if possible.
 * @sma: semaphore array
 *
 * When no complex operation is pending (complex_count is 0), every entry on
 * the array-wide pending_alter list is moved to the pending_alter list of
 * the semaphore named by its first sop, and the array-wide list is reset to
 * empty. Otherwise nothing is changed.
 *
 * It must be called prior to dropping the global semaphore array lock.
 */
static void unmerge_queues(struct sem_array *sma)
{
        struct sem_queue *entry, *next;

        /* Complex operations still pending: the queues must stay merged. */
        if (sma->complex_count != 0)
                return;

        /*
         * Back to simple mode: hand each queued operation to the
         * semaphore it operates on. Entries are re-linked without being
         * unlinked first; the stale array-wide head is reinitialised
         * once the walk is complete.
         */
        list_for_each_entry_safe(entry, next, &sma->pending_alter, list) {
                struct sem *target = &sma->sems[entry->sops[0].sem_num];

                list_add_tail(&entry->list, &target->pending_alter);
        }
        INIT_LIST_HEAD(&sma->pending_alter);
}

/**
 * merge_queues - merge single semop queues into global queue
 * @sma: semaphore array
 *
 * Splices the pending_alter list of every semaphore onto the array-wide
 * pending_alter list, leaving the per-semaphore lists empty.
 * This is needed to achieve FIFO ordering for the pending single-sop
 * operations when a multi-semop operation must sleep.
 * Only alter operations are moved; the const operations stay where they are.
 */
static void merge_queues(struct sem_array *sma)
{
        int idx = 0;

        while (idx < sma->sem_nsems) {
                list_splice_init(&sma->sems[idx].pending_alter,
                                 &sma->pending_alter);
                idx++;
        }
}

/*
 * sem_rcu_free - release callback for a semaphore array
 * @head: the rcu_head embedded in the array's kern_ipc_perm
 *
 * Recovers the enclosing sem_array from @head, releases its security
 * state and frees the array memory.
 */
static void sem_rcu_free(struct rcu_head *head)
{
        struct kern_ipc_perm *perm;
        struct sem_array *sma;

        perm = container_of(head, struct kern_ipc_perm, rcu);
        sma = container_of(perm, struct sem_array, sem_perm);

        security_sem_free(perm);
        kvfree(sma);
}

/*
 * Enter the mode suitable for non-simple operations:
 * Caller must own sem_perm.lock.
 *
 * Sets use_global_lock to USE_GLOBAL_LOCK_HYSTERESIS. If the array was in
 * simple mode (use_global_lock == 0) before the call, every per-semaphore
 * lock is additionally acquired and released once.
 */
static void complexmode_enter(struct sem_array *sma)
{
        int i;
        struct sem *sem;

        if (sma->use_global_lock > 0)  {
                /*
                 * We are already in global lock mode.
                 * Nothing to do, just reset the
                 * counter until we return to simple mode.
                 */
                sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
                return;
        }
        sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;

        /*
         * Transition from 0 to non-zero: cycle through every
         * per-semaphore lock. The fast path of sem_lock() re-checks
         * use_global_lock while holding sem->lock, so once a lock has
         * been taken and dropped here, any simple op that was inside
         * that lock has left, and later ones see the non-zero value
         * (see SEM_BARRIER_1).
         */
        for (i = 0; i < sma->sem_nsems; i++) {
                sem = &sma->sems[i];
                spin_lock(&sem->lock);
                spin_unlock(&sem->lock);
        }
}

/*
 * Try to leave the mode that disallows simple operations:
 * Caller must own sem_perm.lock.
 *
 * Does nothing while complex operations are pending. Otherwise the
 * hysteresis counter use_global_lock is decremented by one; the final
 * step from 1 to 0 is the one that re-enables simple operations.
 */
static void complexmode_tryleave(struct sem_array *sma)
{
        if (sma->complex_count)  {
                /* Complex ops are sleeping.
                 * We must stay in complex mode
                 */
                return;
        }
        if (sma->use_global_lock == 1) {

                /*
                 * Last step of the countdown: store 0 with release
                 * semantics, because a simple op may start right after.
                 * See SEM_BARRIER_1 for purpose/pairing
                 */
                smp_store_release(&sma->use_global_lock, 0);
        } else {
                /* Still counting down; the array stays in global-lock mode. */
                sma->use_global_lock--;
        }
}

/* Lock number returned by sem_lock() when the array-wide lock is held. */
#define SEM_GLOBAL_LOCK        (-1)
/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
 *
 * @sops is only dereferenced when @nsops == 1, so callers that want the
 * array-wide lock may pass any other @nsops together with a NULL @sops
 * (see sem_lock_and_putref()).
 *
 * Returns SEM_GLOBAL_LOCK if the array-wide lock (sem_perm.lock) is held
 * on return, otherwise sops->sem_num, meaning only that semaphore's lock
 * is held. The return value must be passed to sem_unlock().
 */
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
                              int nsops)
{
        struct sem *sem;
        int idx;

        if (nsops != 1) {
                /* Complex operation - acquire a full lock */
                ipc_lock_object(&sma->sem_perm);

                /* Prevent parallel simple ops */
                complexmode_enter(sma);
                return SEM_GLOBAL_LOCK;
        }

        /*
         * Only one semaphore affected - try to optimize locking.
         * Optimized locking is possible if no complex operation
         * is either enqueued or processed right now.
         *
         * Both facts are tracked by use_global_lock.
         */
        /* Clamp the index under speculation before using it. */
        idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
        sem = &sma->sems[idx];

        /*
         * Initial check for use_global_lock. Just an optimization,
         * no locking, no memory barrier.
         */
        if (!sma->use_global_lock) {
                /*
                 * It appears that no complex operation is around.
                 * Acquire the per-semaphore lock.
                 */
                spin_lock(&sem->lock);

                /*
                 * Re-check under sem->lock: the unlocked test above
                 * may have raced with complexmode_enter().
                 * see SEM_BARRIER_1 for purpose/pairing
                 */
                if (!smp_load_acquire(&sma->use_global_lock)) {
                        /* fast path successful! */
                        return sops->sem_num;
                }
                spin_unlock(&sem->lock);
        }

        /* slow path: acquire the full lock */
        ipc_lock_object(&sma->sem_perm);

        if (sma->use_global_lock == 0) {
                /*
                 * The use_global_lock mode ended while we waited for
                 * sma->sem_perm.lock. Thus we must switch to locking
                 * with sem->lock.
                 * Unlike in the fast path, there is no need to recheck
                 * sma->use_global_lock after we have acquired sem->lock:
                 * We own sma->sem_perm.lock, thus use_global_lock cannot
                 * change.
                 */
                spin_lock(&sem->lock);

                ipc_unlock_object(&sma->sem_perm);
                return sops->sem_num;
        } else {
                /*
                 * Not a false alarm, thus continue to use the global lock
                 * mode. No need for complexmode_enter(), this was done by
                 * the caller that has set use_global_lock to non-zero.
                 */
                return SEM_GLOBAL_LOCK;
        }
}

/*
 * sem_unlock - release whatever sem_lock() acquired
 * @sma: semaphore array
 * @locknum: value returned by sem_lock(): SEM_GLOBAL_LOCK, or the index of
 *           the semaphore whose lock is held
 *
 * For the array-wide lock, the queues are unmerged and an attempt is made
 * to leave complex mode before the lock is dropped.
 */
static inline void sem_unlock(struct sem_array *sma, int locknum)
{
        if (locknum != SEM_GLOBAL_LOCK) {
                /* Only a single semaphore was locked. */
                spin_unlock(&sma->sems[locknum].lock);
                return;
        }

        unmerge_queues(sma);
        complexmode_tryleave(sma);
        ipc_unlock_object(&sma->sem_perm);
}

/*
 * sem_obtain_object() and sem_obtain_object_check() are called in the
 * paths where the rwsem is not held.
 *
 * The caller holds the RCU read lock.
 *
 * Both return the sem_array that embeds the looked-up kern_ipc_perm, or
 * the lookup's ERR_PTR value cast to the array type.
 */
static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
        struct kern_ipc_perm *perm;

        perm = ipc_obtain_object_idr(&sem_ids(ns), id);
        if (!IS_ERR(perm))
                return container_of(perm, struct sem_array, sem_perm);

        return ERR_CAST(perm);
}

/*
 * Same as sem_obtain_object(), except that the lookup goes through
 * ipc_obtain_object_check() instead of ipc_obtain_object_idr().
 */
static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
                                                        int id)
{
        struct kern_ipc_perm *perm;

        perm = ipc_obtain_object_check(&sem_ids(ns), id);
        if (!IS_ERR(perm))
                return container_of(perm, struct sem_array, sem_perm);

        return ERR_CAST(perm);
}

/*
 * Take the array-wide lock on @sma, then call ipc_rcu_putref() on it with
 * sem_rcu_free() as the release callback (presumably dropping a reference
 * the caller took earlier - confirm at the call sites).
 *
 * nsops == -1 makes sem_lock() take its complex-operation path, so the
 * NULL sops pointer is never dereferenced. The array-wide lock is held on
 * return.
 */
static inline void sem_lock_and_putref(struct sem_array *sma)
{
        sem_lock(sma, NULL, -1);
        ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
}

/*
 * Thin wrapper: hands the kern_ipc_perm of array @s and the semaphore id
 * table of namespace @ns to ipc_rmid().
 */
static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{
        ipc_rmid(&sem_ids(ns), &s->sem_perm);
}

/*
 * sem_alloc - allocate a semaphore array with room for @nsems semaphores
 *
 * The header and the trailing sems[] entries are allocated as one object.
 * Requests whose total size would exceed INT_MAX bytes are rejected.
 *
 * Returns the new array, or NULL if @nsems is too large or the allocation
 * fails.
 */
static struct sem_array *sem_alloc(size_t nsems)
{
        struct sem_array *sma;
        const size_t max_nsems = (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]);

        if (nsems > max_nsems)
                return NULL;

        /* sma is only named for its type here; it is never read. */
        return kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
}

/**
 * newary - Create a new semaphore set
 * @ns: namespace in which the set is created
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 *
 * Return: the id of the new set on success; -EINVAL if nsems is 0, -ENOSPC
 * if the namespace-wide semaphore limit (sc_semmns) would be exceeded,
 * -ENOMEM if the array cannot be allocated, or the error returned by
 * security_sem_alloc() or ipc_addid().
 */
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
        const int nsems = params->u.nsems;
        struct sem_array *sma;
        int err;
        int i;

        if (nsems == 0)
                return -EINVAL;
        if (ns->used_sems + nsems > ns->sc_semmns)
                return -ENOSPC;

        sma = sem_alloc(nsems);
        if (sma == NULL)
                return -ENOMEM;

        sma->sem_perm.key = params->key;
        sma->sem_perm.mode = (params->flg & S_IRWXUGO);
        sma->sem_perm.security = NULL;

        err = security_sem_alloc(&sma->sem_perm);
        if (err != 0) {
                /* The array was never published: free it directly. */
                kvfree(sma);
                return err;
        }

        /* Per-semaphore state. */
        for (i = 0; i < nsems; i++) {
                spin_lock_init(&sma->sems[i].lock);
                INIT_LIST_HEAD(&sma->sems[i].pending_const);
                INIT_LIST_HEAD(&sma->sems[i].pending_alter);
        }

        /* Array-wide state; a new array starts out in global-lock mode. */
        INIT_LIST_HEAD(&sma->list_id);
        INIT_LIST_HEAD(&sma->pending_const);
        INIT_LIST_HEAD(&sma->pending_alter);
        sma->sem_nsems = nsems;
        sma->complex_count = 0;
        sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
        sma->sem_ctime = ktime_get_real_seconds();

        /* ipc_addid() locks sma upon success. */
        err = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
        if (err < 0) {
                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                return err;
        }
        ns->used_sems += nsems;

        /* -1 selects the array-wide lock (SEM_GLOBAL_LOCK). */
        sem_unlock(sma, -1);
        rcu_read_unlock();

        return sma->sem_perm.id;
}


/*
 * Called with sem_ids.rwsem and ipcp locked.
 *
 * Extra check applied when an existing set is looked up: asking for more
 * semaphores than the set contains fails with -EINVAL; otherwise 0 is
 * returned.
 */
static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{
        struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);

        return (params->u.nsems > sma->sem_nsems) ? -EINVAL : 0;
}

/*
 * ksys_semget - in-kernel entry point behind the semget() system call
 * @key: IPC key of the semaphore set
 * @nsems: requested number of semaphores
 * @semflg: creation flags and permission bits
 *
 * Rejects an @nsems that is negative or above the namespace's sc_semmsl
 * limit with -EINVAL. Otherwise the request is packed into an ipc_params
 * structure and passed to ipcget() together with the semaphore-specific
 * callbacks; the value ipcget() returns is passed through.
 */
long ksys_semget(key_t key, int nsems, int semflg)
{
        static const struct ipc_ops sem_ops = {
                .getnew = newary,
                .associate = security_sem_associate,
                .more_checks = sem_more_checks,
        };
        struct ipc_namespace *ns = current->nsproxy->ipc_ns;
        struct ipc_params sem_params;

        if (nsems > ns->sc_semmsl || nsems < 0)
                return -EINVAL;

        sem_params.u.nsems = nsems;
        sem_params.flg = semflg;
        sem_params.key = key;

        return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

/* semget() system call: forwards its three arguments to ksys_semget(). */
SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
        return ksys_semget(key, nsems, semflg);
}

/**
 * perform_atomic_semop[_slow] - Attempt to perform semaphore
 *                               operations on a given array.
 * @sma: semaphore array
 * @q: struct sem_queue that describes the operation
 *
 * Whether the caller has to block depends on the value of each
 * semaphore operation (sem_op):
 *
 *  (1) >0 never blocks.
 *  (2)  0 (wait-for-zero operation): blocks while semval is non-zero.
 *  (3) <0 blocks if semval would be decremented to a value below zero.
 *
 * The operations are applied in order, all or nothing: if any of them
 * cannot be carried out, the changes made by the earlier ones are rolled
 * back before returning. In the would-block case @q->blocking is set to
 * the offending operation.
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Returns <0 for error codes: -EAGAIN if the blocking operation has
 * IPC_NOWAIT set, -ERANGE if a semaphore value would exceed SEMVMX or an
 * undo adjustment would leave [-SEMAEM - 1, SEMAEM].
 */
static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
{
        int result, sem_op, nsops;
        struct pid *pid;
        struct sembuf *sop;
        struct sem *curr;
        struct sembuf *sops;
        struct sem_undo *un;

        sops = q->sops;
        nsops = q->nsops;
        /* Dereferenced only for sops with SEM_UNDO set. */
        un = q->undo;

        /*
         * Apply the operations one by one. Values are stored as we go, so
         * a later sop on the same semaphore sees the result of an earlier
         * one.
         */
        for (sop = sops; sop < sops + nsops; sop++) {
                int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
                curr = &sma->sems[idx];
                sem_op = sop->sem_op;
                result = curr->semval;

                /* wait-for-zero on a semaphore that is not zero */
                if (!sem_op && result)
                        goto would_block;

                result += sem_op;
                if (result < 0)
                        goto would_block;
                if (result > SEMVMX)
                        goto out_of_range;

                if (sop->sem_flg & SEM_UNDO) {
                        /* The adjustment moves opposite to the operation. */
                        int undo = un->semadj[sop->sem_num] - sem_op;
                        /* Exceeding the undo range is an error. */
                        if (undo < (-SEMAEM - 1) || undo > SEMAEM)
                                goto out_of_range;
                        un->semadj[sop->sem_num] = undo;
                }

                curr->semval = result;
        }

        /*
         * Every operation succeeded. sop is one past the last entry; walk
         * backwards and record the requester as the last modifier of each
         * semaphore involved.
         */
        sop--;
        pid = q->pid;
        while (sop >= sops) {
                ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid);
                sop--;
        }

        return 0;

out_of_range:
        result = -ERANGE;
        goto undo;

would_block:
        /* Remember which operation could not proceed. */
        q->blocking = sop;

        if (sop->sem_flg & IPC_NOWAIT)
                result = -EAGAIN;
        else
                result = 1;

undo:
        /*
         * Roll back in reverse order. The failing sop itself stored
         * nothing (both jumps happen before its stores), so start with
         * the one before it.
         */
        sop--;
        while (sop >= sops) {
                sem_op = sop->sem_op;
                sma->sems[sop->sem_num].semval -= sem_op;
                if (sop->sem_flg & SEM_UNDO)
                        un->semadj[sop->sem_num] += sem_op;
                sop--;
        }

        return result;
}

/*
 * Try to perform all operations of @q on @sma atomically.
 *
 * The array is scanned twice: a read-only pass that verifies every
 * operation can succeed, followed by a pass that commits the changes.
 * Requests with q->dupsop set are handed to perform_atomic_semop_slow()
 * instead.
 *
 * Returns 0 if the operations were performed, 1 if the caller must sleep
 * (q->blocking then points at the operation that cannot proceed), -EAGAIN
 * if that operation carries IPC_NOWAIT, and -ERANGE if a semaphore value
 * or an undo adjustment would leave its valid range.
 */
static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
{
        int result, sem_op, nsops;
        struct sembuf *sop;
        struct sem *curr;
        struct sembuf *sops;
        struct sem_undo *un;

        sops = q->sops;
        nsops = q->nsops;
        un = q->undo;

        if (unlikely(q->dupsop))
                return perform_atomic_semop_slow(sma, q);

        /*
         * We scan the semaphore set twice, first to ensure that the entire
         * operation can succeed, therefore avoiding any pointless writes
         * to shared memory and having to undo such changes in order to block
         * until the operations can go through.
         */
        for (sop = sops; sop < sops + nsops; sop++) {
                int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);

                curr = &sma->sems[idx];
                sem_op = sop->sem_op;
                result = curr->semval;

                if (!sem_op && result)
                        goto would_block; /* wait-for-zero */

                result += sem_op;
                if (result < 0)
                        goto would_block;

                if (result > SEMVMX)
                        return -ERANGE;

                if (sop->sem_flg & SEM_UNDO) {
                        int undo = un->semadj[sop->sem_num] - sem_op;

                        /* Exceeding the undo range is an error. */
                        if (undo < (-SEMAEM - 1) || undo > SEMAEM)
                                return -ERANGE;
                }
        }

        /* Second pass: everything was validated above, commit the changes. */
        for (sop = sops; sop < sops + nsops; sop++) {
                curr = &sma->sems[sop->sem_num];
                sem_op = sop->sem_op;

                if (sop->sem_flg & SEM_UNDO)
                        un->semadj[sop->sem_num] -= sem_op;

                curr->semval += sem_op;
                ipc_update_pid(&curr->sempid, q->pid);
        }

        return 0;

would_block:
        /* Nothing has been modified yet, so there is nothing to roll back. */
        q->blocking = sop;
        return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
}

/*
 * Publish the result of a queued operation and schedule its sleeper for
 * wake-up: @error is stored in q->status and the sleeping task is added
 * to @wake_q. The actual wake-up is not done here.
 */
static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
                                             struct wake_q_head *wake_q)
{
        struct task_struct *sleeper;

        /* Take the task reference before q->status is published. */
        sleeper = get_task_struct(q->sleeper);

        /* see SEM_BARRIER_2 for purpose/pairing */
        smp_store_release(&q->status, error);

        /* NOTE(review): presumably the _safe variant consumes the reference taken above - confirm */
        wake_q_add_safe(wake_q, sleeper);
}

static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{
        list_del(&q->list);
        if (q->nsops > 1)
                sma->complex_count--;
}

/**
 * check_restart - decide whether update_queue() must rescan its list
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
 *
 * Returns 1 if the scan has to be restarted, 0 if it can simply continue.
 */
static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
{
        /*
         * Two cases are too difficult to analyse and always restart:
         * - complex alter operations are pending on the array, or
         * - q itself was a sleeping complex operation.
         */
        if (!list_empty(&sma->pending_alter) || q->nsops > 1)
                return 1;

        /* It is impossible that someone waits for the new value:
         * - complex operations always restart.
         * - wait-for-zero are handled separately.
         * - q is a previously sleeping simple operation that
         *   altered the array. It must be a decrement, because
         *   simple increments never sleep.
         * - If there are older (higher priority) decrements
         *   in the queue, then they have observed the original
         *   semval value and couldn't proceed. The operation
         *   decremented the value - thus they won't proceed either.
         */
        return 0;
}

/**
 * wake_const_ops - wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified, or -1 for the array-wide queue.
 * @wake_q: lockless wake-queue head.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * Every queued operation that no longer has to sleep is unlinked and its
 * task is added to @wake_q; the result code is stored in q->status.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int wake_const_ops(struct sem_array *sma, int semnum,
                          struct wake_q_head *wake_q)
{
        struct list_head *waiters;
        struct sem_queue *cur, *next;
        int completed = 0;

        waiters = (semnum == -1) ? &sma->pending_const
                                 : &sma->sems[semnum].pending_const;

        list_for_each_entry_safe(cur, next, waiters, list) {
                int rc = perform_atomic_semop(sma, cur);

                /* still has to sleep: leave it queued */
                if (rc > 0)
                        continue;

                /* finished (successfully or with an error): dequeue & wake */
                unlink_queue(sma, cur);
                wake_up_sem_queue_prepare(cur, rc, wake_q);

                if (!rc)
                        completed = 1;
        }

        return completed;
}

/**
 * do_smart_wakeup_zero - wakeup all wait for zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed, or NULL if unknown
 * @nsops: number of operations
 * @wake_q: lockless wake-queue head
 *
 * Checks all required queues for wait-for-zero operations, based
 * on the actual changes that were performed on the semaphore array.
 * Without @sops the modified semaphores are not known and every semaphore
 * of the array is examined.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
                                int nsops, struct wake_q_head *wake_q)
{
        int completed = 0;
        int saw_zero = 0;
        int count, i;

        /* With sops walk the touched semaphores, otherwise the whole array. */
        count = sops ? nsops : sma->sem_nsems;

        /* first: the per-semaphore queues */
        for (i = 0; i < count; i++) {
                int num = sops ? sops[i].sem_num : i;

                if (sma->sems[num].semval != 0)
                        continue;

                saw_zero = 1;
                completed |= wake_const_ops(sma, num, wake_q);
        }

        /*
         * If one of the examined semaphores is 0,
         * then check the global queue, too.
         */
        if (saw_zero)
                completed |= wake_const_ops(sma, -1, wake_q);

        return completed;
}


/**
 * update_queue - look for tasks that can be completed.
 * @sma: semaphore array.
 * @semnum: semaphore that was modified, or -1 for the array-wide queue.
 * @wake_q: lockless wake-queue head.
 *
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @wake_q. The return code
 * is stored in q->status.
 * The function internally checks if const operations can now succeed.
 *
 * The function returns 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
{
        struct sem_queue *q, *tmp;
        struct list_head *pending_list;
        int semop_completed = 0;

        if (semnum == -1)
                pending_list = &sma->pending_alter;
        else
                pending_list = &sma->sems[semnum].pending_alter;

again:
        list_for_each_entry_safe(q, tmp, pending_list, list) {
                int error, restart;

                /* If we are scanning the single sop, per-semaphore list of
                 * one semaphore and that semaphore is 0, then it is not
                 * necessary to scan further: simple increments
                 * that affect only one entry succeed immediately and cannot
                 * be in the per-semaphore pending queue, and decrements
                 * cannot be successful if the value is already 0.
                 */
                if (semnum != -1 && sma->sems[semnum].semval == 0)
                        break;

                error = perform_atomic_semop(sma, q);

                /* Does q->sleeper still need to sleep? */
                if (error > 0)
                        continue;

                unlink_queue(sma, q);

                if (error) {
                        /* Failed operation: the array is unchanged, no rescan. */
                        restart = 0;
                } else {
                        /*
                         * q modified the array: wake wait-for-zero tasks
                         * first, then decide whether the list has to be
                         * scanned again from the start.
                         */
                        semop_completed = 1;
                        do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
                        restart = check_restart(sma, q);
                }

                wake_up_sem_queue_prepare(q, error, wake_q);
                if (restart)
                        goto again;
        }
        return semop_completed;
}

/**
 * set_semotime - set sem_otime
 * @sma: semaphore array
 * @sops: operations that modified the array, may be NULL
 *
 * sem_otime is replicated to avoid cache line trashing.
 * This function sets one instance to the current time.
 */
static void set_semotime(struct sem_array *sma, struct sembuf *sops)
{
        if (sops == NULL) {
                sma->sems[0].sem_otime = ktime_get_real_seconds();
        } else {
                sma->sems[sops[0].sem_num].sem_otime =
                                                ktime_get_real_seconds();
        }
}

/**
 * do_smart_update - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed, or NULL if unknown
 * @nsops: number of operations
 * @otime: force setting otime
 * @wake_q: lockless wake-queue head
 *
 * do_smart_update() does the required calls to update_queue and wakeup_zero,
 * based on the actual changes that were performed on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_q(), which may be done after all locks
 * have been dropped.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
                            int otime, struct wake_q_head *wake_q)
{
        int i;

        otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);

        if (!list_empty(&sma->pending_alter)) {
                /* semaphore array uses the global queue - just process it. */
                otime |= update_queue(sma, -1, wake_q);
        } else if (!sops) {
                /*
                 * No sops, thus the modified semaphores are not
                 * known. Check all.
                 */
                for (i = 0; i < sma->sem_nsems; i++)
                        otime |= update_queue(sma, i, wake_q);
        } else {
                /*
                 * Check the semaphores that were increased:
                 * - No complex ops, thus all sleeping ops are
                 *   decrements.
                 * - if we decreased the value, then any sleeping
                 *   semaphore ops won't be able to run: If the
                 *   previous value was too small, then the new
                 *   value will be too small, too.
                 */
                for (i = 0; i < nsops; i++) {
                        if (sops[i].sem_op <= 0)
                                continue;
                        otime |= update_queue(sma, sops[i].sem_num, wake_q);
                }
        }

        if (otime)
                set_semotime(sma, sops);
}

/*
 * check_qop: Test if a queued operation sleeps on the semaphore semnum.
 *
 * Only the operation recorded in q->blocking is considered. With
 * @count_zero set, wait-for-zero operations (sem_op == 0) are counted,
 * otherwise decrements (sem_op < 0). Returns 1 on a match, 0 otherwise.
 */
static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
                        bool count_zero)
{
        struct sembuf *blocker = q->blocking;

        /*
         * Linux always (since 0.99.10) reported a task as sleeping on all
         * semaphores. This violates SUS, therefore it was changed to the
         * standard compliant behavior.
         * Give the administrators a chance to notice that an application
         * might misbehave because it relies on the Linux behavior.
         */
        pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
                        "The task %s (%d) triggered the difference, watch for misbehavior.\n",
                        current->comm, task_pid_nr(current));

        /* The task sleeps on a different semaphore. */
        if (blocker->sem_num != semnum)
                return 0;

        return count_zero ? (blocker->sem_op == 0) : (blocker->sem_op < 0);
}

/* The following counts are associated to each semaphore:
 *   semncnt        number of tasks blocked on a decrement of semval
 *   semzcnt        number of tasks waiting on semval being zero
 *
 * Per definition, a task waits only on the semaphore of the first semop
 * that cannot proceed, even if additional operations would block, too.
 *
 * @count_zero selects semzcnt (true) or semncnt (false) for @semnum.
 */
static int count_semcnt(struct sem_array *sma, ushort semnum,
                        bool count_zero)
{
        struct sem *target = &sma->sems[semnum];
        struct list_head *simple;
        struct sem_queue *q;
        int waiters = 0;

        /*
         * First: the simple operations. All tasks on a per-semaphore list
         * sleep on exactly that semaphore, so every entry counts.
         */
        simple = count_zero ? &target->pending_const : &target->pending_alter;
        list_for_each_entry(q, simple, list)
                waiters++;

        /* Then: the complex operations, checked one by one. */
        list_for_each_entry(q, &sma->pending_alter, list)
                waiters += check_qop(sma, semnum, q, count_zero);

        if (!count_zero)
                return waiters;

        list_for_each_entry(q, &sma->pending_const, list)
                waiters += check_qop(sma, semnum, q, count_zero);

        return waiters;
}

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
 * remains locked on exit.
 *
 * The semaphore set lock and the RCU read section are both released in
 * here (sem_unlock() + rcu_read_unlock()), so callers must not release
 * them again.
 */
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
        struct sem_undo *un, *tu;
        struct sem_queue *q, *tq;
        struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
        int i;
        DEFINE_WAKE_Q(wake_q);

        /* Free the existing undo structures for this semaphore set.  */
        ipc_assert_locked_object(&sma->sem_perm);
        list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
                list_del(&un->list_id);
                spin_lock(&un->ulp->lock);
                /* mark the undo structure as no longer attached to a set */
                un->semid = -1;
                list_del_rcu(&un->list_proc);
                spin_unlock(&un->ulp->lock);
                kfree_rcu(un, rcu);
        }

        /* Wake up all pending processes and let them fail with EIDRM. */
        list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
                unlink_queue(sma, q);
                wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
        }

        list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
                unlink_queue(sma, q);
                wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
        }
        /* Same for the per-semaphore queues; also drop each sempid. */
        for (i = 0; i < sma->sem_nsems; i++) {
                struct sem *sem = &sma->sems[i];
                list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
                        unlink_queue(sma, q);
                        wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
                }
                list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
                        unlink_queue(sma, q);
                        wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
                }
                ipc_update_pid(&sem->sempid, NULL);
        }

        /* Remove the semaphore set from the IDR */
        sem_rmid(ns, sma);
        sem_unlock(sma, -1);
        rcu_read_unlock();

        /* The wake-ups are issued only after the locks were dropped. */
        wake_up_q(&wake_q);
        ns->used_sems -= sma->sem_nsems;
        ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
}

/*
 * Copy a semid64_ds to user space in the layout selected by @version:
 * IPC_64 copies @in as is, IPC_OLD converts it to a struct semid_ds first.
 *
 * Returns the result of copy_to_user(), or -EINVAL (converted to
 * unsigned long) for an unknown @version.
 */
static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
        struct semid_ds legacy;

        if (version == IPC_64)
                return copy_to_user(buf, in, sizeof(*in));

        if (version != IPC_OLD)
                return -EINVAL;

        /* Zero first so that no uninitialised stack bytes are copied out. */
        memset(&legacy, 0, sizeof(legacy));

        ipc64_perm_to_ipc_perm(&in->sem_perm, &legacy.sem_perm);

        legacy.sem_nsems        = in->sem_nsems;
        legacy.sem_ctime        = in->sem_ctime;
        legacy.sem_otime        = in->sem_otime;

        return copy_to_user(buf, &legacy, sizeof(legacy));
}

/*
 * sem_otime is kept per semaphore; the value reported for the whole set
 * is the most recent of all instances.
 */
static time64_t get_semotime(struct sem_array *sma)
{
        time64_t newest = sma->sems[0].sem_otime;
        int idx = 1;

        while (idx < sma->sem_nsems) {
                time64_t candidate = sma->sems[idx].sem_otime;

                newest = (candidate > newest) ? candidate : newest;
                idx++;
        }

        return newest;
}

/*
 * Handle IPC_STAT, SEM_STAT and SEM_STAT_ANY: fill @semid64 with the
 * permissions, times and size of the semaphore set.
 *
 * Returns 0 for IPC_STAT and the full id of the set for SEM_STAT and
 * SEM_STAT_ANY on success, a negative error code otherwise.
 */
static int semctl_stat(struct ipc_namespace *ns, int semid,
                         int cmd, struct semid64_ds *semid64)
{
        struct sem_array *sma;
        time64_t last_op;
        int err;

        memset(semid64, 0, sizeof(*semid64));

        rcu_read_lock();

        if (cmd == SEM_STAT || cmd == SEM_STAT_ANY)
                sma = sem_obtain_object(ns, semid);
        else    /* IPC_STAT */
                sma = sem_obtain_object_check(ns, semid);

        if (IS_ERR(sma)) {
                err = PTR_ERR(sma);
                goto out_unlock;
        }

        /* see comment for SHM_STAT_ANY */
        if (cmd == SEM_STAT_ANY) {
                audit_ipc_obj(&sma->sem_perm);
        } else if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) {
                err = -EACCES;
                goto out_unlock;
        }

        err = security_sem_semctl(&sma->sem_perm, cmd);
        if (err)
                goto out_unlock;

        ipc_lock_object(&sma->sem_perm);

        /* The set may have been removed before the lock was taken. */
        if (!ipc_valid_object(&sma->sem_perm)) {
                err = -EIDRM;
                goto out_unlock_object;
        }

        kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
        last_op = get_semotime(sma);
        semid64->sem_otime = last_op;
        semid64->sem_ctime = sma->sem_ctime;
#ifndef CONFIG_64BIT
        semid64->sem_otime_high = last_op >> 32;
        semid64->sem_ctime_high = sma->sem_ctime >> 32;
#endif
        semid64->sem_nsems = sma->sem_nsems;

        /*
         * IPC_STAT: as defined in SUS, return 0 on success.
         * SEM_STAT and SEM_STAT_ANY (both Linux specific): return the
         * full id, including the sequence number.
         */
        err = (cmd == IPC_STAT) ? 0 : sma->sem_perm.id;

out_unlock_object:
        ipc_unlock_object(&sma->sem_perm);
out_unlock:
        rcu_read_unlock();
        return err;
}

/*
 * Handle IPC_INFO and SEM_INFO: copy a struct seminfo describing the
 * semaphore limits of @ns to the user buffer @p.
 *
 * For SEM_INFO the semusz and semaem fields carry the number of sets in
 * use and the number of semaphores in use instead of the SEMUSZ/SEMAEM
 * constants.
 *
 * Returns the value of ipc_get_maxidx() (0 if that is negative), the
 * error from the security hook, or -EFAULT if the copy to user space
 * fails.
 */
static int semctl_info(struct ipc_namespace *ns, int semid,
                         int cmd, void __user *p)
{
        struct seminfo info;
        int highest;
        int rc;

        rc = security_sem_semctl(NULL, cmd);
        if (rc)
                return rc;

        memset(&info, 0, sizeof(info));

        /* per-namespace limits */
        info.semmni = ns->sc_semmni;
        info.semmns = ns->sc_semmns;
        info.semmsl = ns->sc_semmsl;
        info.semopm = ns->sc_semopm;

        /* compile-time constants */
        info.semvmx = SEMVMX;
        info.semmnu = SEMMNU;
        info.semmap = SEMMAP;
        info.semume = SEMUME;

        down_read(&sem_ids(ns).rwsem);
        if (cmd != SEM_INFO) {
                info.semusz = SEMUSZ;
                info.semaem = SEMAEM;
        } else {
                info.semusz = sem_ids(ns).in_use;
                info.semaem = ns->used_sems;
        }
        highest = ipc_get_maxidx(&sem_ids(ns));
        up_read(&sem_ids(ns).rwsem);

        if (copy_to_user(p, &info, sizeof(struct seminfo)))
                return -EFAULT;

        if (highest < 0)
                return 0;
        return highest;
}

static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
                int val)
{
        struct sem_undo *un;
        struct sem_array *sma;
        struct sem *curr;
        int err;
        DEFINE_WAKE_Q(wake_q);

        if (val > SEMVMX || val < 0)
                return -ERANGE;

        rcu_read_lock();
        sma = sem_obtain_object_check(ns, semid);
        if (IS_ERR(sma)) {
                rcu_read_unlock();
                return PTR_ERR(sma);
        }

        if (semnum < 0 || semnum >= sma->sem_nsems) {
                rcu_read_unlock();
                return -EINVAL;
        }


        if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
                rcu_read_unlock();
                return -EACCES;
        }

        err = security_sem_semctl(&sma->sem_perm, SETVAL);
        if (err) {
                rcu_read_unlock();
                return -EACCES;
        }

        sem_lock(sma, NULL, -1);

        if (!ipc_valid_object(&sma->sem_perm)) {
                sem_unlock(sma, -1);
                rcu_read_unlock();
                return -EIDRM;
        }

        semnum = array_index_nospec(semnum, sma->sem_nsems);
        curr = &sma->sems[semnum];

        ipc_assert_locked_object(&sma->sem_perm);
        list_for_each_entry(un, &sma->list_id, list_id)
                un->semadj[semnum] = 0;

        curr->semval = val;
        ipc_update_pid(&curr->sempid, task_tgid(current));
        sma->sem_ctime = ktime_get_real_seconds();
        /* maybe some queued-up processes were waiting for this */
        do_smart_update(sma, NULL, 0, 0, &wake_q);
        sem_unlock(sma, -1);
        rcu_read_unlock();
        wake_up_q(&wake_q);
        return 0;
}

/*
 * Handle the semctl commands GETALL, SETALL, GETVAL, GETPID, GETNCNT and
 * GETZCNT for the set @semid.
 *
 * GETALL and SETALL transfer all semaphore values through @p; sets with
 * at most SEMMSL_FAST semaphores use an on-stack buffer, larger ones a
 * kvmalloc'ed one. The remaining commands act on the single semaphore
 * @semnum and return the requested value.
 *
 * Returns a non-negative result on success or a negative error code.
 */
static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                int cmd, void __user *p)
{
        struct sem_array *sma;
        struct sem *curr;
        int err, nsems;
        ushort fast_sem_io[SEMMSL_FAST];
        ushort *sem_io = fast_sem_io;
        DEFINE_WAKE_Q(wake_q);

        rcu_read_lock();
        sma = sem_obtain_object_check(ns, semid);
        if (IS_ERR(sma)) {
                rcu_read_unlock();
                return PTR_ERR(sma);
        }

        nsems = sma->sem_nsems;

        /* SETALL needs write permission, every other command read permission. */
        err = -EACCES;
        if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
                goto out_rcu_wakeup;

        err = security_sem_semctl(&sma->sem_perm, cmd);
        if (err)
                goto out_rcu_wakeup;

        err = -EACCES;
        switch (cmd) {
        case GETALL:
        {
                ushort __user *array = p;
                int i;

                sem_lock(sma, NULL, -1);
                if (!ipc_valid_object(&sma->sem_perm)) {
                        err = -EIDRM;
                        goto out_unlock;
                }
                if (nsems > SEMMSL_FAST) {
                        /*
                         * The on-stack buffer is too small. Take a
                         * reference on the set, then drop the lock and
                         * leave the RCU section around the GFP_KERNEL
                         * allocation.
                         */
                        if (!ipc_rcu_getref(&sma->sem_perm)) {
                                err = -EIDRM;
                                goto out_unlock;
                        }
                        sem_unlock(sma, -1);
                        rcu_read_unlock();
                        sem_io = kvmalloc_array(nsems, sizeof(ushort),
                                                GFP_KERNEL);
                        if (sem_io == NULL) {
                                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                                return -ENOMEM;
                        }

                        /* Re-take the lock and re-check that the set still exists. */
                        rcu_read_lock();
                        sem_lock_and_putref(sma);
                        if (!ipc_valid_object(&sma->sem_perm)) {
                                err = -EIDRM;
                                goto out_unlock;
                        }
                }
                for (i = 0; i < sma->sem_nsems; i++)
                        sem_io[i] = sma->sems[i].semval;
                sem_unlock(sma, -1);
                rcu_read_unlock();
                err = 0;
                /* The copy to user space happens without any lock held. */
                if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
                        err = -EFAULT;
                goto out_free;
        }
        case SETALL:
        {
                int i;
                struct sem_undo *un;

                /*
                 * Hold a reference on the set while the new values are
                 * fetched from user space outside the RCU section.
                 */
                if (!ipc_rcu_getref(&sma->sem_perm)) {
                        err = -EIDRM;
                        goto out_rcu_wakeup;
                }
                rcu_read_unlock();

                if (nsems > SEMMSL_FAST) {
                        sem_io = kvmalloc_array(nsems, sizeof(ushort),
                                                GFP_KERNEL);
                        if (sem_io == NULL) {
                                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                                return -ENOMEM;
                        }
                }

                if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
                        ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                        err = -EFAULT;
                        goto out_free;
                }

                /* Validate every value before any semaphore is modified. */
                for (i = 0; i < nsems; i++) {
                        if (sem_io[i] > SEMVMX) {
                                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                                err = -ERANGE;
                                goto out_free;
                        }
                }
                rcu_read_lock();
                sem_lock_and_putref(sma);
                if (!ipc_valid_object(&sma->sem_perm)) {
                        err = -EIDRM;
                        goto out_unlock;
                }

                for (i = 0; i < nsems; i++) {
                        sma->sems[i].semval = sem_io[i];
                        ipc_update_pid(&sma->sems[i].sempid, task_tgid(current));
                }

                /* Setting all values discards every pending undo adjustment. */
                ipc_assert_locked_object(&sma->sem_perm);
                list_for_each_entry(un, &sma->list_id, list_id) {
                        for (i = 0; i < nsems; i++)
                                un->semadj[i] = 0;
                }
                sma->sem_ctime = ktime_get_real_seconds();
                /* maybe some queued-up processes were waiting for this */
                do_smart_update(sma, NULL, 0, 0, &wake_q);
                err = 0;
                goto out_unlock;
        }
        /* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
        }
        /* The remaining commands operate on the single semaphore @semnum. */
        err = -EINVAL;
        if (semnum < 0 || semnum >= nsems)
                goto out_rcu_wakeup;

        sem_lock(sma, NULL, -1);
        if (!ipc_valid_object(&sma->sem_perm)) {
                err = -EIDRM;
                goto out_unlock;
        }

        semnum = array_index_nospec(semnum, nsems);
        curr = &sma->sems[semnum];

        /* Any other command leaves err at -EINVAL. */
        switch (cmd) {
        case GETVAL:
                err = curr->semval;
                goto out_unlock;
        case GETPID:
                err = pid_vnr(curr->sempid);
                goto out_unlock;
        case GETNCNT:
                err = count_semcnt(sma, semnum, 0);
                goto out_unlock;
        case GETZCNT:
                err = count_semcnt(sma, semnum, 1);
                goto out_unlock;
        }

out_unlock:
        sem_unlock(sma, -1);
out_rcu_wakeup:
        rcu_read_unlock();
        wake_up_q(&wake_q);
out_free:
        /* Only a kvmalloc'ed buffer has to be released. */
        if (sem_io != fast_sem_io)
                kvfree(sem_io);
        return err;
}

/*
 * Read a semid_ds from user space in the layout selected by @version.
 *
 * IPC_64 copies a complete struct semid64_ds into @out. IPC_OLD reads a
 * struct semid_ds and transfers only sem_perm.uid, sem_perm.gid and
 * sem_perm.mode; all other members of @out are left untouched.
 *
 * Returns 0 on success, -EFAULT if the copy fails and -EINVAL for an
 * unknown @version (negative values are converted to unsigned long).
 */
static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
        struct semid_ds legacy;

        if (version == IPC_64)
                return copy_from_user(out, buf, sizeof(*out)) ? -EFAULT : 0;

        if (version != IPC_OLD)
                return -EINVAL;

        if (copy_from_user(&legacy, buf, sizeof(legacy)))
                return -EFAULT;

        out->sem_perm.mode        = legacy.sem_perm.mode;
        out->sem_perm.gid        = legacy.sem_perm.gid;
        out->sem_perm.uid        = legacy.sem_perm.uid;

        return 0;
}

/*
 * This function handles some semctl commands which require the rwsem
 * to be held in write mode: IPC_RMID removes the set, IPC_SET updates
 * its permissions from @semid64 and refreshes sem_ctime.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 *
 * Returns 0 on success or a negative error code (-EINVAL for any other
 * command).
 */
static int semctl_down(struct ipc_namespace *ns, int semid,
                       int cmd, struct semid64_ds *semid64)
{
        struct sem_array *sma;
        int err;
        struct kern_ipc_perm *ipcp;

        down_write(&sem_ids(ns).rwsem);
        rcu_read_lock();

        ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
                                      &semid64->sem_perm, 0);
        if (IS_ERR(ipcp)) {
                err = PTR_ERR(ipcp);
                goto out_unlock1;
        }

        sma = container_of(ipcp, struct sem_array, sem_perm);

        err = security_sem_semctl(&sma->sem_perm, cmd);
        if (err)
                goto out_unlock1;

        switch (cmd) {
        case IPC_RMID:
                sem_lock(sma, NULL, -1);
                /* freeary unlocks the ipc object and rcu */
                freeary(ns, ipcp);
                /* hence only the rwsem is left to release */
                goto out_up;
        case IPC_SET:
                sem_lock(sma, NULL, -1);
                err = ipc_update_perm(&semid64->sem_perm, ipcp);
                if (err)
                        goto out_unlock0;
                sma->sem_ctime = ktime_get_real_seconds();
                break;
        default:
                err = -EINVAL;
                goto out_unlock1;
        }

        /* The labels release the locks in reverse order of acquisition. */
out_unlock0:
        sem_unlock(sma, -1);
out_unlock1:
        rcu_read_unlock();
out_up:
        up_write(&sem_ids(ns).rwsem);
        return err;
}

static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
{
        struct ipc_namespace *ns;
        void __user *p = (void __user *)arg;
        struct semid64_ds semid64;
        int err;

        if (semid < 0)
                return -EINVAL;

        ns = current->nsproxy->ipc_ns;

        switch (cmd) {
        case IPC_INFO:
        case SEM_INFO:
                return semctl_info(ns, semid, cmd, p);
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                err = semctl_stat(ns, semid, cmd, &semid64);
                if (err < 0)
                        return err;
                if (copy_semid_to_user(p, &semid64, version))
                        err = -EFAULT;
                return err;
        case GETALL:
        case GETVAL:
        case GETPID:
        case GETNCNT:
        case GETZCNT:
        case SETALL:
                return semctl_main(ns, semid, semnum, cmd, p);
        case SETVAL: {
                int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
                /* big-endian 64bit */
                val = arg >> 32;
#else
                /* 32bit or little-endian 64bit */
                val = arg;
#endif
                return semctl_setval(ns, semid, semnum, val);
        }
        case IPC_SET:
                if (copy_semid_from_user(&semid64, p, version))
                        return -EFAULT;
                fallthrough;
        case IPC_RMID:
                return semctl_down(ns, semid, cmd, &semid64);
        default:
                return -EINVAL;
        }
}

/*
 * semctl system call entry point: forwards to ksys_semctl() and always
 * selects the IPC_64 user-space layout.
 */
SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
        return ksys_semctl(semid, semnum, cmd, arg, IPC_64);
}

#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
/*
 * Variant of semctl for architectures that encode the structure version in
 * @cmd.  ipc_parse_version() is handed a pointer to @cmd and yields the
 * version that is then passed on to ksys_semctl().
 * NOTE(review): ipc_parse_version() presumably strips the version flag from
 * @cmd in place, which is why it must run before @cmd is forwarded - confirm.
 */
long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
{
        int version = ipc_parse_version(&cmd);

        return ksys_semctl(semid, semnum, cmd, arg, version);
}

/* old_semctl system call entry point: thin wrapper around ksys_old_semctl(). */
SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
        return ksys_old_semctl(semid, semnum, cmd, arg);
}
#endif

#ifdef CONFIG_COMPAT

/*
 * Compat layout of the semaphore descriptor used when the requested version
 * is not IPC_64 (see copy_compat_semid_from_user() and
 * copy_compat_semid_to_user()).  Pointer-sized members are carried as
 * compat_uptr_t and times as old_time32_t.
 */
struct compat_semid_ds {
        struct compat_ipc_perm sem_perm;        /* permission block */
        old_time32_t sem_otime;
        old_time32_t sem_ctime;
        /* the four compat pointers below are only ever zero-filled on copy-out */
        compat_uptr_t sem_base;
        compat_uptr_t sem_pending;
        compat_uptr_t sem_pending_last;
        compat_uptr_t undo;
        unsigned short sem_nsems;               /* number of semaphores in the array */
};

/*
 * Fetch the permission block of a compat semid descriptor from user space.
 *
 * @out is zeroed first; afterwards only @out->sem_perm is filled, using the
 * 64-bit compat layout for IPC_64 and the legacy compat layout otherwise.
 * Returns whatever the selected get_compat_ipc*_perm() helper returns.
 */
static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf,
                                        int version)
{
        memset(out, 0, sizeof(*out));

        if (version != IPC_64) {
                struct compat_semid_ds __user *old_ds = buf;

                return get_compat_ipc_perm(&out->sem_perm, &old_ds->sem_perm);
        }

        {
                struct compat_semid64_ds __user *new_ds = buf;

                return get_compat_ipc64_perm(&out->sem_perm, &new_ds->sem_perm);
        }
}

/*
 * Convert a kernel semid64_ds into the compat layout selected by @version
 * and copy it to user space.
 *
 * The temporary is cleared with memset() before being filled so that no
 * uninitialised bytes reach user space.  For IPC_64 the two timestamps are
 * split into low and high 32-bit halves; for the legacy layout they are
 * assigned directly to the 32-bit time fields.
 *
 * Returns the result of copy_to_user() (0 on success).
 */
static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
                                        int version)
{
        if (version != IPC_64) {
                struct compat_semid_ds legacy;

                memset(&legacy, 0, sizeof(legacy));
                to_compat_ipc_perm(&legacy.sem_perm, &in->sem_perm);
                legacy.sem_otime = in->sem_otime;
                legacy.sem_ctime = in->sem_ctime;
                legacy.sem_nsems = in->sem_nsems;

                return copy_to_user(buf, &legacy, sizeof(legacy));
        }

        {
                struct compat_semid64_ds wide;

                memset(&wide, 0, sizeof(wide));
                to_compat_ipc64_perm(&wide.sem_perm, &in->sem_perm);
                wide.sem_otime = lower_32_bits(in->sem_otime);
                wide.sem_otime_high = upper_32_bits(in->sem_otime);
                wide.sem_ctime = lower_32_bits(in->sem_ctime);
                wide.sem_ctime_high = upper_32_bits(in->sem_ctime);
                wide.sem_nsems = in->sem_nsems;

                return copy_to_user(buf, &wide, sizeof(wide));
        }
}

static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
{
        void __user *p = compat_ptr(arg);
        struct ipc_namespace *ns;
        struct semid64_ds semid64;
        int err;

        ns = current->nsproxy->ipc_ns;

        if (semid < 0)
                return -EINVAL;

        switch (cmd & (~IPC_64)) {
        case IPC_INFO:
        case SEM_INFO:
                return semctl_info(ns, semid, cmd, p);
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                err = semctl_stat(ns, semid, cmd, &semid64);
                if (err < 0)
                        return err;
                if (copy_compat_semid_to_user(p, &semid64, version))
                        err = -EFAULT;
                return err;
        case GETVAL:
        case GETPID:
        case GETNCNT:
        case GETZCNT:
        case GETALL:
        case SETALL:
                return semctl_main(ns, semid, semnum, cmd, p);
        case SETVAL:
                return semctl_setval(ns, semid, semnum, arg);
        case IPC_SET:
                if (copy_compat_semid_from_user(&semid64, p, version))
                        return -EFAULT;
                fallthrough;
        case IPC_RMID:
                return semctl_down(ns, semid, cmd, &semid64);
        default:
                return -EINVAL;
        }
}

/*
 * Compat semctl system call entry point: forwards to compat_ksys_semctl()
 * and always selects the IPC_64 layout.
 */
COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
{
        return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64);
}

#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
/*
 * Compat variant of semctl for architectures that encode the structure
 * version in @cmd.  compat_ipc_parse_version() is handed a pointer to @cmd
 * and yields the version that is passed on to compat_ksys_semctl().
 * NOTE(review): the helper presumably clears the version flag in @cmd in
 * place, so it has to run before @cmd is forwarded - confirm.
 */
long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
{
        int version = compat_ipc_parse_version(&cmd);

        return compat_ksys_semctl(semid, semnum, cmd, arg, version);
}

/* Compat old_semctl entry point: thin wrapper around compat_ksys_old_semctl(). */
COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
{
        return compat_ksys_old_semctl(semid, semnum, cmd, arg);
}
#endif
#endif

/*
 * Return the current task's sem_undo_list through @undo_listp, creating it
 * on first use.
 *
 * A freshly created list starts with a reference count of one, an
 * initialised lock and an empty list_proc, and is attached to current.
 * Only current touches its own undo_list pointer here, so no locking is
 * needed for the assignment.
 *
 * If the allocation and assignment succeed but a later step in the caller
 * fails, the list does not have to be freed: it stays attached to the task
 * and is released at exit time.
 *
 * The allocation may block, so callers must hold no locks.
 *
 * Returns 0 on success or -ENOMEM if the list could not be allocated.
 */
static inline int get_undo_list(struct sem_undo_list **undo_listp)
{
        struct sem_undo_list *ulp = current->sysvsem.undo_list;

        if (ulp) {
                *undo_listp = ulp;
                return 0;
        }

        ulp = kzalloc(sizeof(*ulp), GFP_KERNEL_ACCOUNT);
        if (!ulp)
                return -ENOMEM;

        spin_lock_init(&ulp->lock);
        refcount_set(&ulp->refcnt, 1);
        INIT_LIST_HEAD(&ulp->list_proc);
        current->sysvsem.undo_list = ulp;

        *undo_listp = ulp;
        return 0;
}

static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
{
        struct sem_undo *un;

        list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
                                spin_is_locked(&ulp->lock)) {
                if (un->semid == semid)
                        return un;
        }
        return NULL;
}

static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{
        struct sem_undo *un;

        assert_spin_locked(&ulp->lock);

        un = __lookup_undo(ulp, semid);
        if (un) {
                list_del_rcu(&un->list_proc);
                list_add_rcu(&un->list_proc, &ulp->list_proc);
        }
        return un;
}

/**
 * find_alloc_undo - lookup (and if not present create) undo array
 * @ns: namespace
 * @semid: semaphore array id
 *
 * The function looks up (and if not present creates) the undo structure.
 * The size of the undo structure depends on the size of the semaphore
 * array, thus the alloc path is not that straightforward.
 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
 * performs a rcu_read_lock().
 *
 * Return: the undo structure with the RCU read lock held, or an ERR_PTR()
 * value (with the RCU read lock released) if the undo list or the undo
 * structure could not be allocated or the semaphore array is gone.
 */
static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
{
        struct sem_array *sma;
        struct sem_undo_list *ulp;
        struct sem_undo *un, *new;
        int nsems, error;

        error = get_undo_list(&ulp);
        if (error)
                return ERR_PTR(error);

        /* fast path: the undo structure already exists */
        rcu_read_lock();
        spin_lock(&ulp->lock);
        un = lookup_undo(ulp, semid);
        spin_unlock(&ulp->lock);
        if (likely(un != NULL))
                goto out;

        /* no undo structure around - allocate one. */
        /* step 1: figure out the size of the semaphore array */
        sma = sem_obtain_object_check(ns, semid);
        if (IS_ERR(sma)) {
                rcu_read_unlock();
                return ERR_CAST(sma);
        }

        nsems = sma->sem_nsems;
        /* take a reference on the array before leaving the RCU section */
        if (!ipc_rcu_getref(&sma->sem_perm)) {
                rcu_read_unlock();
                un = ERR_PTR(-EIDRM);
                goto out;
        }
        /* drop RCU before the GFP_KERNEL_ACCOUNT allocation below */
        rcu_read_unlock();

        /* step 2: allocate new undo structure */
        /* one allocation holds the sem_undo followed by nsems short adjustments */
        new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL_ACCOUNT);
        if (!new) {
                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                return ERR_PTR(-ENOMEM);
        }

        /* step 3: Acquire the lock on semaphore array */
        rcu_read_lock();
        sem_lock_and_putref(sma);
        /* re-validate the array after taking its lock */
        if (!ipc_valid_object(&sma->sem_perm)) {
                sem_unlock(sma, -1);
                rcu_read_unlock();
                kfree(new);
                un = ERR_PTR(-EIDRM);
                goto out;
        }
        spin_lock(&ulp->lock);

        /*
         * step 4: check for races: did someone else allocate the undo struct?
         */
        un = lookup_undo(ulp, semid);
        if (un) {
                kfree(new);
                goto success;
        }
        /* step 5: initialize & link new undo structure */
        /* the adjustment array lives directly behind the structure itself */
        new->semadj = (short *) &new[1];
        new->ulp = ulp;
        new->semid = semid;
        assert_spin_locked(&ulp->lock);
        list_add_rcu(&new->list_proc, &ulp->list_proc);
        ipc_assert_locked_object(&sma->sem_perm);
        list_add(&new->list_id, &sma->list_id);
        un = new;

success:
        /* locks are dropped, the RCU read lock stays held for the caller */
        spin_unlock(&ulp->lock);
        sem_unlock(sma, -1);
out:
        return un;
}

/*
 * do_semtimedop - perform a set of semaphore operations, optionally bounded
 * by a timeout.
 * @semid: semaphore array id
 * @tsops: user pointer to an array of @nsops operations
 * @nsops: number of operations in @tsops
 * @timeout: maximum time to sleep, or NULL to sleep without a time limit
 *
 * The operations are copied from user space (into an on-stack buffer for up
 * to SEMOPM_FAST entries, otherwise into a kvmalloc'ed one) and attempted
 * atomically with perform_atomic_semop().  If they cannot complete at once,
 * the task is queued on the array and sleeps interruptibly until another
 * task finishes the operation on its behalf, a signal arrives, the timeout
 * expires or the array is removed.
 *
 * Returns 0 on success or a negative errno, among them -EINVAL (bad
 * arguments or timeout), -E2BIG (too many operations), -ENOMEM, -EFAULT,
 * -EFBIG (semaphore number out of range), -EACCES, -EIDRM (array removed),
 * -EAGAIN (timeout expired) and -EINTR (interrupted by a signal).
 */
static long do_semtimedop(int semid, struct sembuf __user *tsops,
                unsigned nsops, const struct timespec64 *timeout)
{
        int error = -EINVAL;
        struct sem_array *sma;
        struct sembuf fast_sops[SEMOPM_FAST];
        struct sembuf *sops = fast_sops, *sop;
        struct sem_undo *un;
        int max, locknum;
        bool undos = false, alter = false, dupsop = false;
        struct sem_queue queue;
        unsigned long dup = 0, jiffies_left = 0;
        struct ipc_namespace *ns;

        ns = current->nsproxy->ipc_ns;

        if (nsops < 1 || semid < 0)
                return -EINVAL;
        if (nsops > ns->sc_semopm)
                return -E2BIG;
        if (nsops > SEMOPM_FAST) {
                sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
                if (sops == NULL)
                        return -ENOMEM;
        }

        if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
                error =  -EFAULT;
                goto out_free;
        }

        if (timeout) {
                if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 ||
                        timeout->tv_nsec >= 1000000000L) {
                        error = -EINVAL;
                        goto out_free;
                }
                jiffies_left = timespec64_to_jiffies(timeout);
        }

        /*
         * Scan the operations once: find the highest semaphore number and
         * note whether any operation uses SEM_UNDO, alters a semaphore, or
         * may touch a semaphore that an earlier altering operation touched.
         */
        max = 0;
        for (sop = sops; sop < sops + nsops; sop++) {
                unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);

                if (sop->sem_num >= max)
                        max = sop->sem_num;
                if (sop->sem_flg & SEM_UNDO)
                        undos = true;
                if (dup & mask) {
                        /*
                         * There was a previous alter access that appears
                         * to have accessed the same semaphore, thus use
                         * the dupsop logic. "appears", because the detection
                         * can only check % BITS_PER_LONG.
                         */
                        dupsop = true;
                }
                if (sop->sem_op != 0) {
                        alter = true;
                        dup |= mask;
                }
        }

        if (undos) {
                /* On success, find_alloc_undo takes the rcu_read_lock */
                un = find_alloc_undo(ns, semid);
                if (IS_ERR(un)) {
                        error = PTR_ERR(un);
                        goto out_free;
                }
        } else {
                un = NULL;
                rcu_read_lock();
        }

        /* from here on the RCU read lock is held on both branches above */
        sma = sem_obtain_object_check(ns, semid);
        if (IS_ERR(sma)) {
                rcu_read_unlock();
                error = PTR_ERR(sma);
                goto out_free;
        }

        error = -EFBIG;
        if (max >= sma->sem_nsems) {
                rcu_read_unlock();
                goto out_free;
        }

        error = -EACCES;
        if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
                rcu_read_unlock();
                goto out_free;
        }

        error = security_sem_semop(&sma->sem_perm, sops, nsops, alter);
        if (error) {
                rcu_read_unlock();
                goto out_free;
        }

        error = -EIDRM;
        locknum = sem_lock(sma, sops, nsops);
        /*
         * We eventually might perform the following check in a lockless
         * fashion, considering ipc_valid_object() locking constraints.
         * If nsops == 1 and there is no contention for sem_perm.lock, then
         * only a per-semaphore lock is held and it's OK to proceed with the
         * check below. More details on the fine grained locking scheme
         * entangled here and why it's RMID race safe on comments at sem_lock()
         */
        if (!ipc_valid_object(&sma->sem_perm))
                goto out_unlock_free;
        /*
         * semid identifiers are not unique - find_alloc_undo may have
         * allocated an undo structure, it was invalidated by an RMID
         * and now a new array received the same id. Check and fail.
         * This case can be detected checking un->semid. The existence of
         * "un" itself is guaranteed by rcu.
         */
        if (un && un->semid == -1)
                goto out_unlock_free;

        queue.sops = sops;
        queue.nsops = nsops;
        queue.undo = un;
        queue.pid = task_tgid(current);
        queue.alter = alter;
        queue.dupsop = dupsop;

        error = perform_atomic_semop(sma, &queue);
        if (error == 0) { /* non-blocking successful path */
                DEFINE_WAKE_Q(wake_q);

                /*
                 * If the operation was successful, then do
                 * the required updates.
                 */
                if (alter)
                        do_smart_update(sma, sops, nsops, 1, &wake_q);
                else
                        set_semotime(sma, sops);

                sem_unlock(sma, locknum);
                rcu_read_unlock();
                /* wake the collected tasks only after all locks are dropped */
                wake_up_q(&wake_q);

                goto out_free;
        }
        if (error < 0) /* non-blocking error path */
                goto out_unlock_free;

        /*
         * We need to sleep on this operation, so we put the current
         * task into the pending queue and go to sleep.
         */
        if (nsops == 1) {
                struct sem *curr;
                int idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
                curr = &sma->sems[idx];

                if (alter) {
                        if (sma->complex_count) {
                                list_add_tail(&queue.list,
                                                &sma->pending_alter);
                        } else {

                                list_add_tail(&queue.list,
                                                &curr->pending_alter);
                        }
                } else {
                        list_add_tail(&queue.list, &curr->pending_const);
                }
        } else {
                if (!sma->complex_count)
                        merge_queues(sma);

                if (alter)
                        list_add_tail(&queue.list, &sma->pending_alter);
                else
                        list_add_tail(&queue.list, &sma->pending_const);

                sma->complex_count++;
        }

        do {
                /* memory ordering ensured by the lock in sem_lock() */
                WRITE_ONCE(queue.status, -EINTR);
                queue.sleeper = current;

                /* memory ordering is ensured by the lock in sem_lock() */
                __set_current_state(TASK_INTERRUPTIBLE);
                sem_unlock(sma, locknum);
                rcu_read_unlock();

                if (timeout)
                        jiffies_left = schedule_timeout(jiffies_left);
                else
                        schedule();

                /*
                 * fastpath: the semop has completed, either successfully or
                 * not, from the syscall pov, is quite irrelevant to us at this
                 * point; we're done.
                 *
                 * We _do_ care, nonetheless, about being awoken by a signal or
                 * spuriously.  The queue.status is checked again in the
                 * slowpath (aka after taking sem_lock), such that we can detect
                 * scenarios where we were awakened externally, during the
                 * window between wake_q_add() and wake_up_q().
                 */
                rcu_read_lock();
                error = READ_ONCE(queue.status);
                if (error != -EINTR) {
                        /* see SEM_BARRIER_2 for purpose/pairing */
                        smp_acquire__after_ctrl_dep();
                        rcu_read_unlock();
                        goto out_free;
                }

                locknum = sem_lock(sma, sops, nsops);

                if (!ipc_valid_object(&sma->sem_perm))
                        goto out_unlock_free;

                /*
                 * No necessity for any barrier: We are protected by sem_lock()
                 */
                error = READ_ONCE(queue.status);

                /*
                 * If queue.status != -EINTR we are woken up by another process.
                 * Leave without unlink_queue(), but with sem_unlock().
                 */
                if (error != -EINTR)
                        goto out_unlock_free;

                /*
                 * If an interrupt occurred we have to clean up the queue.
                 */
                if (timeout && jiffies_left == 0)
                        error = -EAGAIN;
        } while (error == -EINTR && !signal_pending(current)); /* spurious */

        /* timed out or interrupted by a signal: take ourselves off the queue */
        unlink_queue(sma, &queue);

out_unlock_free:
        sem_unlock(sma, locknum);
        rcu_read_unlock();
out_free:
        /* only a heap-allocated operation buffer needs freeing */
        if (sops != fast_sops)
                kvfree(sops);
        return error;
}

/*
 * Kernel-internal entry for semtimedop with a __kernel_timespec timeout.
 *
 * A NULL @timeout means no time limit.  Otherwise the timespec is fetched
 * from user space first; a failed fetch yields -EFAULT without touching the
 * semaphore array.  Everything else is handled by do_semtimedop().
 */
long ksys_semtimedop(int semid, struct sembuf __user *tsops,
                     unsigned int nsops, const struct __kernel_timespec __user *timeout)
{
        struct timespec64 kts;

        if (!timeout)
                return do_semtimedop(semid, tsops, nsops, NULL);

        if (get_timespec64(&kts, timeout))
                return -EFAULT;

        return do_semtimedop(semid, tsops, nsops, &kts);
}

/* semtimedop system call entry point: thin wrapper around ksys_semtimedop(). */
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
                unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
{
        return ksys_semtimedop(semid, tsops, nsops, timeout);
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Kernel-internal entry for semtimedop with a 32-bit old_timespec32 timeout.
 *
 * A NULL @timeout means no time limit.  Otherwise the 32-bit timespec is
 * fetched from user space and widened to a timespec64; a failed fetch
 * yields -EFAULT.  Everything else is handled by do_semtimedop().
 */
long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
                            unsigned int nsops,
                            const struct old_timespec32 __user *timeout)
{
        struct timespec64 kts;

        if (!timeout)
                return do_semtimedop(semid, tsems, nsops, NULL);

        if (get_old_timespec32(&kts, timeout))
                return -EFAULT;

        return do_semtimedop(semid, tsems, nsops, &kts);
}

/*
 * semtimedop_time32 system call entry point (32-bit time_t timeout): thin
 * wrapper around compat_ksys_semtimedop().
 */
SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
                       unsigned int, nsops,
                       const struct old_timespec32 __user *, timeout)
{
        return compat_ksys_semtimedop(semid, tsems, nsops, timeout);
}
#endif

/*
 * semop system call entry point: same as semtimedop without a timeout, i.e.
 * do_semtimedop() is called with a NULL timeout.
 */
SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
                unsigned, nsops)
{
        return do_semtimedop(semid, tsops, nsops, NULL);
}

/*
 * Set up the SEM_UNDO state of a newly created task @tsk.
 *
 * With CLONE_SYSVSEM in @clone_flags the child shares the undo list of the
 * current task: the list is created on demand, its reference count is
 * raised and @tsk is pointed at it.  Without the flag the child starts with
 * no undo list.
 *
 * Returns 0 on success or the error from get_undo_list().
 */
int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sem_undo_list *shared;
        int rc;

        if (!(clone_flags & CLONE_SYSVSEM)) {
                tsk->sysvsem.undo_list = NULL;
                return 0;
        }

        rc = get_undo_list(&shared);
        if (rc)
                return rc;

        refcount_inc(&shared->refcnt);
        tsk->sysvsem.undo_list = shared;

        return 0;
}

/*
 * add semadj values to semaphores, free undo structures.
 * undo structures are not freed when semaphore arrays are destroyed
 * so some of them may be out of date.
 * IMPLEMENTATION NOTE: There is some confusion over whether the
 * set of adjustments that needs to be done should be done in an atomic
 * manner or not. That is, if we are attempting to decrement the semval
 * should we queue up and wait until we can do so legally?
 * The original implementation attempted to do this (queue and wait).
 * The current implementation does not do so. The POSIX standard
 * and SVID should be consulted to determine what behavior is mandated.
 *
 * The task's undo list is detached first; the adjustments are applied and
 * the list is freed only when the last reference to it is dropped.
 */
void exit_sem(struct task_struct *tsk)
{
        struct sem_undo_list *ulp;

        ulp = tsk->sysvsem.undo_list;
        if (!ulp)
                return;
        tsk->sysvsem.undo_list = NULL;

        /* other tasks still share this undo list: leave it to the last one */
        if (!refcount_dec_and_test(&ulp->refcnt))
                return;

        /* each iteration handles the first entry of the list until it is empty */
        for (;;) {
                struct sem_array *sma;
                struct sem_undo *un;
                int semid, i;
                DEFINE_WAKE_Q(wake_q);

                cond_resched();

                rcu_read_lock();
                un = list_entry_rcu(ulp->list_proc.next,
                                    struct sem_undo, list_proc);
                if (&un->list_proc == &ulp->list_proc) {
                        /*
                         * We must wait for freeary() before freeing this ulp,
                         * in case we raced with last sem_undo. There is a small
                         * possibility where we exit while freeary() didn't
                         * finish unlocking sem_undo_list.
                         */
                        spin_lock(&ulp->lock);
                        spin_unlock(&ulp->lock);
                        rcu_read_unlock();
                        break;
                }
                spin_lock(&ulp->lock);
                semid = un->semid;
                spin_unlock(&ulp->lock);

                /* exit_sem raced with IPC_RMID, nothing to do */
                if (semid == -1) {
                        rcu_read_unlock();
                        continue;
                }

                sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
                /* exit_sem raced with IPC_RMID, nothing to do */
                if (IS_ERR(sma)) {
                        rcu_read_unlock();
                        continue;
                }

                sem_lock(sma, NULL, -1);
                /* exit_sem raced with IPC_RMID, nothing to do */
                if (!ipc_valid_object(&sma->sem_perm)) {
                        sem_unlock(sma, -1);
                        rcu_read_unlock();
                        continue;
                }
                /* look the entry up again now that the array lock is held */
                un = __lookup_undo(ulp, semid);
                if (un == NULL) {
                        /* exit_sem raced with IPC_RMID+semget() that created
                         * exactly the same semid. Nothing to do.
                         */
                        sem_unlock(sma, -1);
                        rcu_read_unlock();
                        continue;
                }

                /* remove un from the linked lists */
                ipc_assert_locked_object(&sma->sem_perm);
                list_del(&un->list_id);

                spin_lock(&ulp->lock);
                list_del_rcu(&un->list_proc);
                spin_unlock(&ulp->lock);

                /* perform adjustments registered in un */
                for (i = 0; i < sma->sem_nsems; i++) {
                        struct sem *semaphore = &sma->sems[i];
                        if (un->semadj[i]) {
                                semaphore->semval += un->semadj[i];
                                /*
                                 * Range checks of the new semaphore value,
                                 * not defined by sus:
                                 * - Some unices ignore the undo entirely
                                 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
                                 * - some cap the value (e.g. FreeBSD caps
                                 *   at 0, but doesn't enforce SEMVMX)
                                 *
                                 * Linux caps the semaphore value, both at 0
                                 * and at SEMVMX.
                                 *
                                 *        Manfred <manfred@colorfullife.com>
                                 */
                                if (semaphore->semval < 0)
                                        semaphore->semval = 0;
                                if (semaphore->semval > SEMVMX)
                                        semaphore->semval = SEMVMX;
                                ipc_update_pid(&semaphore->sempid, task_tgid(current));
                        }
                }
                /* maybe some queued-up processes were waiting for this */
                do_smart_update(sma, NULL, 0, 1, &wake_q);
                sem_unlock(sma, -1);
                rcu_read_unlock();
                /* wake the collected tasks only after all locks are dropped */
                wake_up_q(&wake_q);

                /* the entry was unlinked with list_del_rcu(), so free it via RCU */
                kfree_rcu(un, rcu);
        }
        kfree(ulp);
}

#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
        struct user_namespace *user_ns = seq_user_ns(s);
        struct kern_ipc_perm *ipcp = it;
        struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
        time64_t sem_otime;

        /*
         * The proc interface isn't aware of sem_lock(), it calls
         * ipc_lock_object() directly (in sysvipc_find_ipc).
         * In order to stay compatible with sem_lock(), we must
         * enter / leave complex_mode.
         */
        complexmode_enter(sma);

        sem_otime = get_semotime(sma);

        seq_printf(s,
                   "%10d %10d  %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
                   sma->sem_perm.key,
                   sma->sem_perm.id,
                   sma->sem_perm.mode,
                   sma->sem_nsems,
                   from_kuid_munged(user_ns, sma->sem_perm.uid),
                   from_kgid_munged(user_ns, sma->sem_perm.gid),
                   from_kuid_munged(user_ns, sma->sem_perm.cuid),
                   from_kgid_munged(user_ns, sma->sem_perm.cgid),
                   sem_otime,
                   sma->sem_ctime);

        complexmode_tryleave(sma);

        return 0;
}
#endif























































































    4 






































































    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

/*
 * State kept for one mount namespace.  The structure is marked
 * __randomize_layout, so code must not depend on the field order.
 */
struct mnt_namespace {
        atomic_t                count;
        struct ns_common        ns;
        struct mount *        root;
        /*
         * Traversal and modification of .list is protected by either
         * - taking namespace_sem for write, OR
         * - taking namespace_sem for read AND taking .ns_lock.
         */
        struct list_head        list;
        spinlock_t                ns_lock;        /* see the locking rule for .list above */
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        u64                        seq;        /* Sequence number to prevent loops */
        /* NOTE(review): poll/event presumably back poll() on the mount list - confirm */
        wait_queue_head_t poll;
        u64 event;
        unsigned int                mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
} __randomize_layout;

/*
 * Per-CPU counters of a mount.  struct mount points at a __percpu instance
 * of this when CONFIG_SMP is set and embeds two equally named ints otherwise.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * Describes a dentry that is used as a mount point; struct mount refers to
 * it through its mnt_mp member ("where is it mounted").
 */
struct mountpoint {
        struct hlist_node m_hash;
        struct dentry *m_dentry;
        /*
         * NOTE(review): presumably the head of the mounts linked through
         * mount::mnt_mp_list ("list mounts with the same mountpoint") - confirm.
         */
        struct hlist_head m_list;
        int m_count;
};

/*
 * Internal representation of a mount.  The public struct vfsmount is
 * embedded as the mnt member, which lets real_mount() recover the enclosing
 * struct mount with container_of().  The structure is marked
 * __randomize_layout, so code must not depend on the field order.
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;        /* equals the mount itself when there is no parent, see mnt_has_parent() */
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;
        union {
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        /* without SMP the two counters of struct mnt_pcp live here directly */
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct list_head mnt_instance;        /* mount instance on sb->s_mounts */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        struct list_head mnt_list;
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct list_head mnt_slave_list;/* list of slave mounts */
        struct list_head mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
#endif
        int mnt_id;                        /* mount identifier */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
} __randomize_layout;

/*
 * Sentinel namespace pointer built from ERR_PTR(-EINVAL).  is_mounted()
 * treats any NULL or ERR_PTR value in mount::mnt_ns as "not mounted".
 * NOTE(review): presumably stored in mnt_ns for internal mounts -- confirm.
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/*
 * Map a public vfsmount pointer back to the struct mount that embeds it
 * as its .mnt member.
 */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        struct mount *outer = container_of(mnt, struct mount, mnt);

        return outer;
}

/*
 * Return non-zero when @mnt has a parent, i.e. when its mnt_parent does
 * not point back at @mnt itself.
 */
static inline int mnt_has_parent(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;

        return parent != mnt;
}

/*
 * Return 1 when @mnt belongs to a real namespace, 0 when its namespace
 * pointer is NULL or an ERR_PTR value (detached or internal mount).
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        struct mnt_namespace *ns = real_mount(mnt)->mnt_ns;

        /* detached or internal? */
        if (IS_ERR_OR_NULL(ns))
                return 0;

        return 1;
}

/*
 * Look up a mount from a (vfsmount, dentry) pair.  Defined elsewhere;
 * __path_is_mountpoint() below treats a NULL result as "nothing found".
 */
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

/*
 * Defined elsewhere.  NOTE(review): the unsigned argument is presumably a
 * mount_lock sequence value -- confirm against the definitions.
 */
extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);

static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *m = __lookup_mnt(path->mnt, path->dentry);
        return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/*
 * Call __detach_mounts() on @dentry, but only when d_mountpoint() reports
 * it as a mountpoint; otherwise do nothing.
 */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

/* Take an additional reference on @ns by atomically incrementing ns->count. */
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        atomic_inc(&ns->count);
}

/* Global seqlock; the two helpers below take and release its write side. */
extern seqlock_t mount_lock;

/* Acquire mount_lock for writing. */
static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

/* Release the write side of mount_lock taken by lock_mount_hash(). */
static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

/*
 * State for a mount listing.
 * NOTE(review): presumably the seq_file private data behind the /proc
 * mount listings driven by mounts_op -- confirm against the users.
 */
struct proc_mounts {
        struct mnt_namespace *ns;
        struct path root;
        /* callback that formats one vfsmount into the seq_file */
        int (*show)(struct seq_file *, struct vfsmount *);
        /* embedded mount used as a cursor, cf. mnt_cursor_del() */
        struct mount cursor;
};

/* seq_file operations table, defined elsewhere */
extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);
/*
 * Return false straight away when d_mountpoint() says @dentry is not a
 * mountpoint; otherwise return whatever __is_local_mountpoint() reports.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/* A namespace whose sequence number is zero is an anonymous one. */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return !ns->seq;
}

/*
 * Defined elsewhere.
 * NOTE(review): presumably unlinks @cursor (cf. proc_mounts::cursor) from
 * the mount list of @ns -- confirm against the definition.
 */
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);














































































































































































    3 










    3 
























































































































































    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once - percpu_ref_kill() does this for you, it returns true once and false if
 * someone else already called it. The aio code uses it this way, but it's not
 * necessary if the code has some other mechanism to synchronize teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>

struct percpu_ref;
/*
 * Callback signature shared by the release, confirm_switch and
 * confirm_kill hooks: takes the percpu_ref concerned and returns nothing.
 */
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/*
 * flags set in the lower bits of percpu_ref->percpu_count_ptr
 *
 * __PERCPU_REF_FLAG_BITS is the number of low bits occupied by the flags;
 * __PERCPU_REF_ATOMIC_DEAD is the mask __ref_is_percpu() uses to test both
 * flags at once.
 */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        __PERCPU_REF_FLAG_BITS        = 2,
};

/*
 * @flags for percpu_ref_init()
 *
 * Each value is a distinct bit (1 << 0 .. 1 << 2).
 */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * Out-of-line part of a percpu_ref, reached through percpu_ref::data.
 * It holds everything the inline get/put helpers only need once the ref
 * is not in percpu mode.
 */
struct percpu_ref_data {
        /* counter used by the inline get/put helpers when not in percpu mode */
        atomic_long_t                count;
        /* called by percpu_ref_put_many() when count drops to zero */
        percpu_ref_func_t        *release;
        percpu_ref_func_t        *confirm_switch;
        /* NOTE(review): presumably mirror the INIT_ATOMIC / ALLOW_REINIT init flags -- confirm */
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        struct rcu_head                rcu;
        /* back-pointer to the owning percpu_ref */
        struct percpu_ref        *ref;
};

struct percpu_ref {
        /*
         * Pointer to the percpu counters, with the __PERCPU_REF_* flags
         * kept in its low bits.  While no flag is set the ref is in percpu
         * mode; once __PERCPU_REF_ATOMIC or __PERCPU_REF_DEAD is set,
         * get/put manipulate the atomic counter in @data instead.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

/*
 * Out-of-line API, implemented outside this header.
 *
 * percpu_ref_init() takes the release callback, a mask of PERCPU_REF_*
 * init flags and the gfp mask for its allocations; its return value must
 * be checked (__must_check).
 */
int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
/* mode switching between percpu and atomic operation */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
/* percpu_ref_kill() below is this call with a NULL @confirm_kill */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        /* NULL: no confirm_kill callback is requested */
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper; not for use outside percpu-refcount proper.
 *
 * Reports whether @ref is currently operating in percpu mode and, if so,
 * stores the percpu counter pointer in *@percpu_countp.  A bool is
 * returned instead of the pointer because having the caller test the
 * pointer for NULL forces the compiler to generate two conditional
 * branches: it can't assume that @ref->percpu_count is not NULL.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        /*
         * Take exactly one snapshot with READ_ONCE().  The value is tested
         * for !__PERCPU_REF_ATOMIC, which may be set asynchronously, and
         * then used as a pointer; a separate fetch for the pointer use
         * could pick up __PERCPU_REF_ATOMIC in between and contaminate the
         * pointer value.
         *
         * The dependency ordering from the READ_ONCE() pairs
         * with smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        unsigned long snapshot = READ_ONCE(ref->percpu_count_ptr);

        /*
         * ATOMIC and DEAD are tested together.  Testing just ATOMIC would
         * do in theory, but DEAD may be visible without ATOMIC if we race
         * with percpu_ref_kill() and would then have to be masked off
         * separately.  DEAD implies ATOMIC anyway.
         */
        if (likely(!(snapshot & __PERCPU_REF_ATOMIC_DEAD))) {
                *percpu_countp = (unsigned long __percpu *)snapshot;
                return true;
        }

        return false;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_long_inc().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        /* single-reference case of percpu_ref_get_many() */
        percpu_ref_get_many(ref, 1);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        /* single-reference case of percpu_ref_tryget_many() */
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        /* single-reference case of percpu_ref_put_many() */
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if __PERCPU_REF_DEAD is set on @ref, i.e. it is dying or
 * dead.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        return (ref->percpu_count_ptr & __PERCPU_REF_DEAD) != 0;
}

#endif















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2015  Intel Mobile Communications GmbH
 * Copyright (C) 2018-2020 Intel Corporation
 */

#ifndef IEEE80211_I_H
#define IEEE80211_I_H

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/if_ether.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/etherdevice.h>
#include <linux/leds.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <net/ieee80211_radiotap.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
#include <net/fq.h>
#include "key.h"
#include "sta_info.h"
#include "debug.h"

/* cfg80211 operations table, defined elsewhere */
extern const struct cfg80211_ops mac80211_config_ops;

/* forward declaration; only pointers to it are needed at this point */
struct ieee80211_local;

/* Maximum number of broadcast/multicast frames to buffer when some of the
 * associated stations are using power saving. */
#define AP_MAX_BC_BUFFER 128

/* Maximum number of frames buffered to all STAs, including multicast frames.
 * Note: increasing this limit increases the potential memory requirement. Each
 * frame can be up to about 2 kB long. */
#define TOTAL_MAX_TX_BUFFER 512

/* Required encryption head and tailroom */
#define IEEE80211_ENCRYPT_HEADROOM 8
#define IEEE80211_ENCRYPT_TAILROOM 18

/* power level hasn't been configured (or set to automatic) */
#define IEEE80211_UNSET_POWER_LEVEL        INT_MIN

/*
 * Some APs experience problems when working with U-APSD. Decreasing the
 * probability of that happening by using legacy mode for all ACs but VO isn't
 * enough.
 *
 * Cisco 4410N originally forced us to enable VO by default only because it
 * treated non-VO ACs as legacy.
 *
 * However some APs (notably Netgear R7000) silently reclassify packets to
 * different ACs. Since u-APSD ACs require trigger frames for frame retrieval
 * clients would never see some frames (e.g. ARP responses) or would fetch them
 * accidentally after a long time.
 *
 * It makes little sense to enable u-APSD queues by default because it needs
 * userspace applications to be aware of it to actually take advantage of the
 * possible additional powersavings. Implicitly depending on driver autotrigger
 * frame support doesn't make much sense.
 */
#define IEEE80211_DEFAULT_UAPSD_QUEUES 0

/* default maximum service period length: the "all" QoS-info setting */
#define IEEE80211_DEFAULT_MAX_SP_LEN                \
        IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL

/*
 * One u8 per access category (IEEE80211_NUM_ACS entries), defined elsewhere.
 * NOTE(review): presumably maps an AC index to its QoS-info bit -- confirm.
 */
extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];

/* size of a deauthentication frame: 24-byte header plus 2-byte reason */
#define IEEE80211_DEAUTH_FRAME_LEN        (24 /* hdr */ + 2 /* reason */)

/* largest NAN instance ID value */
#define IEEE80211_MAX_NAN_INSTANCE_ID 255

/*
 * Per-BSS information kept by mac80211: device timestamps of the last
 * beacon/probe response, WMM/U-APSD capability, supported rates, VHT
 * capability info, a saved ERP value, and bitmasks describing which of
 * this data is corrupt or valid.
 */
struct ieee80211_bss {
        u32 device_ts_beacon, device_ts_presp;

        bool wmm_used;
        bool uapsd_supported;

/* Capacity of supp_rates[] below. */
#define IEEE80211_MAX_SUPP_RATES 32
        u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
        size_t supp_rates_len; /* number of valid entries in supp_rates[] */
        struct ieee80211_rate *beacon_rate;

        u32 vht_cap_info;

        /*
         * During association, we save an ERP value from a probe response so
         * that we can feed ERP info to the driver when the association
         * completes. These fields probably won't be up-to-date otherwise;
         * you probably don't want to use them.
         */
        bool has_erp_value;
        u8 erp_value;

        /*
         * Keep track of the corruption of the last beacon/probe response
         * (enum ieee80211_bss_corrupt_data_flags).
         */
        u8 corrupt_data;

        /*
         * Keep track of what bits of information we have valid info for
         * (enum ieee80211_bss_valid_data_flags).
         */
        u8 valid_data;
};

/**
 * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags
 * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted
 * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted
 *
 * These are BSS flags (bit masks) that are stored in the
 * @corrupt_data field of &struct ieee80211_bss.
 */
enum ieee80211_bss_corrupt_data_flags {
        IEEE80211_BSS_CORRUPT_BEACON                = BIT(0),
        IEEE80211_BSS_CORRUPT_PROBE_RESP        = BIT(1)
};

/**
 * enum ieee80211_bss_valid_data_flags - BSS valid data flags
 * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE
 * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE
 * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE
 *
 * These are BSS flags (bit masks) that are stored in the
 * @valid_data field of &struct ieee80211_bss.  They show which parts
 * of the data structure were received as a result of an un-corrupted
 * beacon/probe response.
 *
 * Note that the values start at BIT(1); BIT(0) is not assigned here.
 */
enum ieee80211_bss_valid_data_flags {
        IEEE80211_BSS_VALID_WMM                        = BIT(1),
        IEEE80211_BSS_VALID_RATES                = BIT(2),
        IEEE80211_BSS_VALID_ERP                        = BIT(3)
};

/*
 * Result code type for TX processing. It is declared __bitwise, so the
 * three valid values below are produced with __force casts.
 */
typedef unsigned __bitwise ieee80211_tx_result;
#define TX_CONTINUE        ((__force ieee80211_tx_result) 0u)
#define TX_DROP                ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED        ((__force ieee80211_tx_result) 2u)

/*
 * TX flag bits; presumably stored in ieee80211_tx_data.flags -- confirm
 * against the users. BIT(0) is not assigned here.
 */
#define IEEE80211_TX_UNICAST                BIT(1)
#define IEEE80211_TX_PS_BUFFERED        BIT(2)

/*
 * Bundle of per-transmission state: the frame (skb) and an skb list,
 * back-pointers to the local/sdata/station it belongs to, the key and
 * rate in use, and a flags word.
 */
struct ieee80211_tx_data {
        struct sk_buff *skb;
        struct sk_buff_head skbs;
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct sta_info *sta;
        struct ieee80211_key *key;
        struct ieee80211_tx_rate rate;

        /* presumably IEEE80211_TX_* bits -- confirm against the users */
        unsigned int flags;
};


/*
 * Result code type for RX processing. It is declared __bitwise, so the
 * four valid values below are produced with __force casts.
 */
typedef unsigned __bitwise ieee80211_rx_result;
#define RX_CONTINUE                ((__force ieee80211_rx_result) 0u)
#define RX_DROP_UNUSABLE        ((__force ieee80211_rx_result) 1u)
#define RX_DROP_MONITOR                ((__force ieee80211_rx_result) 2u)
#define RX_QUEUED                ((__force ieee80211_rx_result) 3u)

/**
 * enum ieee80211_packet_rx_flags - packet RX flags
 * @IEEE80211_RX_AMSDU: a-MSDU packet
 * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
 * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering
 *
 * These are per-frame flags that are attached to a frame in the
 * @rx_flags field of &struct ieee80211_rx_status.
 *
 * Note that the values start at BIT(3); bits 0-2 are not assigned here.
 */
enum ieee80211_packet_rx_flags {
        IEEE80211_RX_AMSDU                        = BIT(3),
        IEEE80211_RX_MALFORMED_ACTION_FRM        = BIT(4),
        IEEE80211_RX_DEFERRED_RELEASE                = BIT(5),
};

/**
 * enum ieee80211_rx_flags - RX data flags
 *
 * @IEEE80211_RX_CMNTR: received on cooked monitor already
 * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
 *        to cfg80211_report_obss_beacon().
 *
 * These flags are used across handling multiple interfaces
 * for a single frame.
 *
 * NOTE(review): presumably stored in the @flags field of
 * &struct ieee80211_rx_data -- confirm against the users.
 */
enum ieee80211_rx_flags {
        IEEE80211_RX_CMNTR                = BIT(0),
        IEEE80211_RX_BEACON_REPORTED        = BIT(1),
};

/*
 * Bundle of per-reception state: the frame (skb), back-pointers to the
 * local/sdata/station it belongs to, the key in use, a flags word, the
 * sequence-number and security indices, and cipher-specific IV/PN data.
 */
struct ieee80211_rx_data {
        struct list_head *list;
        struct sk_buff *skb;
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct sta_info *sta;
        struct ieee80211_key *key;

        /* presumably enum ieee80211_rx_flags bits -- confirm */
        unsigned int flags;

        /*
         * Index into sequence numbers array, 0..16
         * since the last (16) is used for non-QoS,
         * will be 16 on non-QoS frames.
         */
        int seqno_idx;

        /*
         * Index into the security IV/PN arrays, 0..16
         * since the last (16) is used for CCMP-encrypted
         * management frames, will be set to 16 on mgmt
         * frames and 0 on non-QoS frames.
         */
        int security_idx;

        /* Cipher-specific data; the two members share storage. */
        union {
                struct {
                        u32 iv32;
                        u16 iv16;
                } tkip;
                struct {
                        u8 pn[IEEE80211_CCMP_PN_LEN];
                } ccm_gcm;
        };
};

/*
 * Channel-switch-announcement settings: arrays of counter offsets for the
 * beacon and the probe response (with their element counts) and a count
 * value.
 */
struct ieee80211_csa_settings {
        const u16 *counter_offsets_beacon;
        const u16 *counter_offsets_presp;

        /* number of entries in the corresponding counter_offsets_* array */
        int n_counter_offsets_beacon;
        int n_counter_offsets_presp;

        u8 count;
};

/*
 * Beacon template: separate head and tail buffers with their lengths, an
 * optional mesh configuration IE pointer, countdown counter offsets and
 * the current countdown value. Carries an rcu_head, so instances are
 * presumably freed via RCU -- confirm in the users.
 */
struct beacon_data {
        u8 *head, *tail;
        int head_len, tail_len;
        struct ieee80211_meshconf_ie *meshconf;
        u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
        u8 cntdwn_current_counter;
        struct rcu_head rcu_head;
};

/*
 * Probe response template: @len bytes of frame data stored in the trailing
 * flexible array, plus countdown counter offsets. Carries an rcu_head, so
 * instances are presumably freed via RCU -- confirm in the users.
 */
struct probe_resp {
        struct rcu_head rcu_head;
        int len;
        u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
        u8 data[];
};

/*
 * FILS discovery template: a length followed by the data in a trailing
 * flexible array (presumably @len bytes -- confirm in the allocator).
 */
struct fils_discovery_data {
        struct rcu_head rcu_head;
        int len;
        u8 data[];
};

/*
 * Unsolicited broadcast probe response template: a length followed by the
 * data in a trailing flexible array (presumably @len bytes -- confirm in
 * the allocator).
 */
struct unsol_bcast_probe_resp_data {
        struct rcu_head rcu_head;
        int len;
        u8 data[];
};

/*
 * Power-save bookkeeping: the TIM bitmap, a queue of buffered frames
 * (bc_buf), the number of stations in PS mode and DTIM state. Embedded as
 * the `ps` member of both struct ieee80211_if_ap and struct
 * ieee80211_if_mesh.
 */
struct ps_data {
        /* yes, this looks ugly, but guarantees that we can later use
         * bitmap_empty :)
         * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */
        u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]
                        __aligned(__alignof__(unsigned long));
        struct sk_buff_head bc_buf;
        atomic_t num_sta_ps; /* number of stations in PS mode */
        int dtim_count;
        bool dtim_bc_mc;
};

/*
 * AP interface state (the `ap` member of the union in struct
 * ieee80211_sub_if_data): RCU-managed frame templates, the list of
 * attached VLAN interfaces, power-save data and multicast bookkeeping.
 */
struct ieee80211_if_ap {
        /* frame templates; __rcu pointers, access via the RCU accessors */
        struct beacon_data __rcu *beacon;
        struct probe_resp __rcu *probe_resp;
        struct fils_discovery_data __rcu *fils_discovery;
        struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp;

        /* to be used after channel switch. */
        struct cfg80211_beacon_data *next_beacon;
        struct list_head vlans; /* write-protected with RTNL and local->mtx */

        struct ps_data ps;
        atomic_t num_mcast_sta; /* number of stations receiving multicast */

        bool multicast_to_unicast;
};

/*
 * WDS interface state (the `wds` member of the union in struct
 * ieee80211_sub_if_data): a station entry and a remote MAC address.
 */
struct ieee80211_if_wds {
        struct sta_info *sta;
        u8 remote_addr[ETH_ALEN];
};

/*
 * AP VLAN interface state (the `vlan` member of the union in struct
 * ieee80211_sub_if_data).
 */
struct ieee80211_if_vlan {
        /* presumably links into ieee80211_if_ap.vlans -- confirm */
        struct list_head list; /* write-protected with RTNL and local->mtx */

        /* used for all tx if the VLAN is configured to 4-addr mode */
        struct sta_info __rcu *sta;
        atomic_t num_mcast_sta; /* number of stations receiving multicast */
};

/*
 * Mesh forwarding/drop counters. Embedded as `mshstats` in struct
 * ieee80211_if_mesh and incremented through IEEE80211_IFSTA_MESH_CTR_INC().
 */
struct mesh_stats {
        __u32 fwded_mcast;                /* Mesh forwarded multicast frames */
        __u32 fwded_unicast;                /* Mesh forwarded unicast frames */
        __u32 fwded_frames;                /* Mesh total forwarded frames */
        __u32 dropped_frames_ttl;        /* Not transmitted since mesh_ttl == 0*/
        __u32 dropped_frames_no_route;        /* Not transmitted, no route found */
        __u32 dropped_frames_congestion;/* Not forwarded due to congestion */
};

/* Flag bits for mesh_preq_queue.flags (presumably -- confirm in the users). */
#define PREQ_Q_F_START                0x1
#define PREQ_Q_F_REFRESH        0x2
/*
 * One queued PREQ entry: list linkage, a destination MAC address and
 * flags. struct ieee80211_if_mesh embeds one instance as `preq_queue`.
 */
struct mesh_preq_queue {
        struct list_head list;
        u8 dst[ETH_ALEN];
        u8 flags;
};

/*
 * One remain-on-channel (ROC) work item: the interface and channel it
 * applies to, progress flags, timing, an optional frame, and cookies.
 */
struct ieee80211_roc_work {
        struct list_head list;

        struct ieee80211_sub_if_data *sdata;

        struct ieee80211_channel *chan;

        /* progress/state flags of this item */
        bool started, abort, hw_begun, notified;
        bool on_channel;

        unsigned long start_time;

        u32 duration, req_duration;
        struct sk_buff *frame;
        u64 cookie, mgmt_tx_cookie;
        enum ieee80211_roc_type type;
};

/*
 * flags used in struct ieee80211_if_managed.flags
 *
 * The assignments are sparse: bits 0, 3 and 5 are not used here.
 */
enum ieee80211_sta_flags {
        IEEE80211_STA_CONNECTION_POLL        = BIT(1),
        IEEE80211_STA_CONTROL_PORT        = BIT(2),
        IEEE80211_STA_DISABLE_HT        = BIT(4),
        IEEE80211_STA_MFP_ENABLED        = BIT(6),
        IEEE80211_STA_UAPSD_ENABLED        = BIT(7),
        IEEE80211_STA_NULLFUNC_ACKED        = BIT(8),
        IEEE80211_STA_RESET_SIGNAL_AVE        = BIT(9),
        IEEE80211_STA_DISABLE_40MHZ        = BIT(10),
        IEEE80211_STA_DISABLE_VHT        = BIT(11),
        IEEE80211_STA_DISABLE_80P80MHZ        = BIT(12),
        IEEE80211_STA_DISABLE_160MHZ        = BIT(13),
        IEEE80211_STA_DISABLE_WMM        = BIT(14),
        IEEE80211_STA_ENABLE_RRM        = BIT(15),
        IEEE80211_STA_DISABLE_HE        = BIT(16),
};

/*
 * State of an authentication attempt in managed mode (pointed to by
 * ieee80211_if_managed.auth_data): target BSS, retry/timeout tracking,
 * algorithm and expected transaction, a WEP-sized key buffer, SAE fields
 * and trailing variable-length data.
 */
struct ieee80211_mgd_auth_data {
        struct cfg80211_bss *bss;
        unsigned long timeout;
        int tries;
        u16 algorithm, expected_transaction;

        u8 key[WLAN_KEY_LEN_WEP104];
        u8 key_len, key_idx;
        bool done, waiting;
        bool peer_confirmed;
        bool timeout_started;

        u16 sae_trans, sae_status;
        size_t data_len; /* presumably the length of data[] -- confirm */
        u8 data[];
};

/*
 * State of an association attempt in managed mode (pointed to by
 * ieee80211_if_managed.assoc_data): target BSS, supported rates,
 * retry/timeout tracking, SSID, capability bits, AP HT/VHT information,
 * FILS material and trailing variable-length IEs.
 */
struct ieee80211_mgd_assoc_data {
        struct cfg80211_bss *bss;
        const u8 *supp_rates;

        unsigned long timeout;
        int tries;

        u16 capability;
        u8 prev_bssid[ETH_ALEN];
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len;
        u8 supp_rates_len;
        bool wmm, uapsd;
        bool need_beacon;
        bool synced;
        bool timeout_started;

        u8 ap_ht_param;

        struct ieee80211_vht_cap ap_vht_cap;

        /* room for two nonces of FILS_NONCE_LEN bytes each */
        u8 fils_nonces[2 * FILS_NONCE_LEN];
        u8 fils_kek[FILS_MAX_KEK_LEN];
        size_t fils_kek_len;

        size_t ie_len; /* presumably the length of ie[] -- confirm */
        u8 ie[];
};

/*
 * Per-AC TX TSPEC accounting state; struct ieee80211_if_managed holds one
 * entry per access category in tx_tspec[].
 */
struct ieee80211_sta_tx_tspec {
        /* timestamp of the first packet in the time slice */
        unsigned long time_slice_start;

        u32 admitted_time; /* in usecs, unlike over the air */
        u8 tsid;
        s8 up; /* signed to be able to invalidate with -1 during teardown */

        /* consumed TX time in microseconds in the time slice */
        u32 consumed_tx_time;
        /* pending action for this entry; NONE is the zero value */
        enum {
                TX_TSPEC_ACTION_NONE = 0,
                TX_TSPEC_ACTION_DOWNGRADE,
                TX_TSPEC_ACTION_STOP_DOWNGRADE,
        } action;
        bool downgraded;
};

/*
 * Declares the EWMA type used for ieee80211_if_managed.ave_beacon_signal
 * (struct ewma_beacon_signal). The two numeric arguments are the macro's
 * precision and weight parameters -- see its definition for exact meaning.
 */
DECLARE_EWMA(beacon_signal, 4, 4)

/*
 * Managed-mode interface state (the `mgd` member of the union in struct
 * ieee80211_sub_if_data): timers and work items, connection monitoring,
 * authentication/association state, power save, beacon tracking, CQM/RSSI
 * state, capability overrides, TDLS and WMM-AC TSPEC support.
 */
struct ieee80211_if_managed {
        struct timer_list timer;
        struct timer_list conn_mon_timer;
        struct timer_list bcn_mon_timer;
        struct timer_list chswitch_timer;
        struct work_struct monitor_work;
        struct work_struct chswitch_work;
        struct work_struct beacon_connection_loss_work;
        struct work_struct csa_connection_drop_work;

        unsigned long beacon_timeout;
        unsigned long probe_timeout;
        int probe_send_count;
        bool nullfunc_failed;
        bool connection_loss;

        struct cfg80211_bss *associated;
        struct ieee80211_mgd_auth_data *auth_data;
        struct ieee80211_mgd_assoc_data *assoc_data;

        u8 bssid[ETH_ALEN] __aligned(2);

        bool powersave; /* powersave requested for this iface */
        bool broken_ap; /* AP is broken -- turn off powersave */
        bool have_beacon;
        u8 dtim_period;
        enum ieee80211_smps_mode req_smps, /* requested smps mode */
                                 driver_smps_mode; /* smps mode request */

        struct work_struct request_smps_work;

        /* enum ieee80211_sta_flags bits */
        unsigned int flags;

        bool csa_waiting_bcn;
        bool csa_ignored_same_chan;

        bool beacon_crc_valid;
        u32 beacon_crc;

        bool status_acked;
        bool status_received;
        __le16 status_fc;

        enum {
                IEEE80211_MFP_DISABLED,
                IEEE80211_MFP_OPTIONAL,
                IEEE80211_MFP_REQUIRED
        } mfp; /* management frame protection */

        /*
         * Bitmask of enabled u-apsd queues,
         * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association
         * to take effect.
         */
        unsigned int uapsd_queues;

        /*
         * Maximum number of buffered frames AP can deliver during a
         * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar.
         * Needs a new association to take effect.
         */
        unsigned int uapsd_max_sp_len;

        int wmm_last_param_set;
        int mu_edca_last_param_set;

        u8 use_4addr;

        s16 p2p_noa_index;

        struct ewma_beacon_signal ave_beacon_signal;

        /*
         * Number of Beacon frames used in ave_beacon_signal. This can be used
         * to avoid generating less reliable cqm events that would be based
         * only on couple of received frames.
         */
        unsigned int count_beacon_signal;

        /* Number of times beacon loss was invoked. */
        unsigned int beacon_loss_count;

        /*
         * Last Beacon frame signal strength average (ave_beacon_signal / 16)
         * that triggered a cqm event. 0 indicates that no event has been
         * generated for the current association.
         */
        int last_cqm_event_signal;

        /*
         * State variables for keeping track of RSSI of the AP currently
         * connected to and informing driver when RSSI has gone
         * below/above a certain threshold.
         */
        int rssi_min_thold, rssi_max_thold;
        int last_ave_beacon_signal;

        struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
        struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
        struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */
        struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */
        struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */
        struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */

        /* TDLS support */
        u8 tdls_peer[ETH_ALEN] __aligned(2);
        struct delayed_work tdls_peer_del_work;
        struct sk_buff *orig_teardown_skb; /* The original teardown skb */
        struct sk_buff *teardown_skb; /* A copy to send through the AP */
        spinlock_t teardown_lock; /* To lock changing teardown_skb */
        bool tdls_chan_switch_prohibited;
        bool tdls_wider_bw_prohibited;

        /* WMM-AC TSPEC support */
        struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS];
        /* Use a separate work struct so that we can do something here
         * while the sdata->work is flushing the queues, for example.
         * otherwise, in scenarios where we hardly get any traffic out
         * on the BE queue, but there's a lot of VO traffic, we might
         * get stuck in a downgraded situation and flush takes forever.
         */
        struct delayed_work tx_tspec_wk;

        /* Information elements from the last transmitted (Re)Association
         * Request frame.
         */
        u8 *assoc_req_ies;
        size_t assoc_req_ies_len;
};

/*
 * IBSS interface state (the `ibss` member of the union in struct
 * ieee80211_sub_if_data): timer/work, configuration (BSSID, SSID, IEs,
 * channel definition), the probe response/beacon template, HT capability
 * overrides, the list of incomplete stations and the MLME state.
 */
struct ieee80211_if_ibss {
        struct timer_list timer;
        struct work_struct csa_connection_drop_work;

        unsigned long last_scan_completed;

        u32 basic_rates;

        bool fixed_bssid;
        bool fixed_channel;
        bool privacy;

        bool control_port;
        bool userspace_handles_dfs;

        u8 bssid[ETH_ALEN] __aligned(2);
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len, ie_len;
        u8 *ie;
        struct cfg80211_chan_def chandef;

        unsigned long ibss_join_req;
        /* probe response/beacon for IBSS */
        struct beacon_data __rcu *presp;

        struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
        struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */

        /* presumably protects incomplete_stations -- confirm in the users */
        spinlock_t incomplete_lock;
        struct list_head incomplete_stations;

        enum {
                IEEE80211_IBSS_MLME_SEARCH,
                IEEE80211_IBSS_MLME_JOINED,
        } state;
};

/**
 * struct ieee80211_if_ocb - OCB mode state
 *
 * @housekeeping_timer: timer for periodic invocation of a housekeeping task
 * @wrkq_flags: OCB deferred task action
 * @incomplete_lock: delayed STA insertion lock
 * @incomplete_stations: list of STAs waiting for delayed insertion
 * @joined: indication if the interface is connected to an OCB network
 *
 * This is the @ocb member of the union in &struct ieee80211_sub_if_data.
 */
struct ieee80211_if_ocb {
        struct timer_list housekeeping_timer;
        unsigned long wrkq_flags;

        spinlock_t incomplete_lock;
        struct list_head incomplete_stations;

        bool joined;
};

/**
 * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface
 *
 * @rx_bcn_presp: callback taking the interface, a frame subtype, the
 *        management frame, its parsed elements and the RX status; judging by
 *        the name it is invoked for received beacons/probe responses
 *        (confirm against the callers)
 * @adjust_tsf: callback taking the interface and a beacon; should be called
 *        with beacon_data under RCU read lock
 *
 * these declarations define the interface, which enables
 * vendor-specific mesh synchronization
 *
 */
struct ieee802_11_elems;
struct ieee80211_mesh_sync_ops {
        void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata,
                             u16 stype,
                             struct ieee80211_mgmt *mgmt,
                             struct ieee802_11_elems *elems,
                             struct ieee80211_rx_status *rx_status);

        /* should be called with beacon_data under RCU read lock */
        void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
                           struct beacon_data *beacon);
        /* add other framework functions here */
};

/*
 * cfg80211 CSA settings wrapped together with an rcu_head; referenced
 * through the __rcu pointer `csa` in struct ieee80211_if_mesh.
 */
struct mesh_csa_settings {
        struct rcu_head rcu_head;
        struct cfg80211_csa_settings settings;
};

/**
 * struct mesh_table - table of mesh paths
 *
 * @known_gates: list of known mesh gates and their mpaths by the station. The
 * gate's mpath may or may not be resolved and active.
 * @gates_lock: protects updates to known_gates
 * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr
 * @walk_head: linked list containing all mesh_path objects
 * @walk_lock: lock protecting walk_head
 * @entries: number of entries in the table
 */
struct mesh_table {
        struct hlist_head known_gates;
        spinlock_t gates_lock;
        struct rhashtable rhead;
        struct hlist_head walk_head;
        spinlock_t walk_lock;
        atomic_t entries;                /* Up to MAX_MESH_NEIGHBOURS */
};

/*
 * Mesh interface state (the `mesh` member of the union in struct
 * ieee80211_sub_if_data): timers, mesh ID and protocol identifiers,
 * sequence numbers and PREQ/PERR timing, the PREQ queue, statistics and
 * configuration, beacon template, security mode, synchronization ops,
 * power save, channel switching and the mesh path tables.
 */
struct ieee80211_if_mesh {
        struct timer_list housekeeping_timer;
        struct timer_list mesh_path_timer;
        struct timer_list mesh_path_root_timer;

        unsigned long wrkq_flags;
        unsigned long mbss_changed;

        bool userspace_handles_dfs;

        u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
        size_t mesh_id_len;
        /* Active Path Selection Protocol Identifier */
        u8 mesh_pp_id;
        /* Active Path Selection Metric Identifier */
        u8 mesh_pm_id;
        /* Congestion Control Mode Identifier */
        u8 mesh_cc_id;
        /* Synchronization Protocol Identifier */
        u8 mesh_sp_id;
        /* Authentication Protocol Identifier */
        u8 mesh_auth_id;
        /* Local mesh Sequence Number */
        u32 sn;
        /* Last used PREQ ID */
        u32 preq_id;
        atomic_t mpaths;
        /* Timestamp of last SN update */
        unsigned long last_sn_update;
        /* Time when it's ok to send next PERR */
        unsigned long next_perr;
        /* Timestamp of last PREQ sent */
        unsigned long last_preq;
        struct mesh_rmc *rmc;
        /* presumably protects preq_queue -- confirm in the users */
        spinlock_t mesh_preq_queue_lock;
        struct mesh_preq_queue preq_queue;
        int preq_queue_len;
        /* counters bumped via IEEE80211_IFSTA_MESH_CTR_INC() */
        struct mesh_stats mshstats;
        struct mesh_config mshcfg;
        atomic_t estab_plinks;
        u32 mesh_seqnum;
        bool accepting_plinks;
        int num_gates;
        struct beacon_data __rcu *beacon;
        const u8 *ie;
        u8 ie_len;
        enum {
                IEEE80211_MESH_SEC_NONE = 0x0,
                IEEE80211_MESH_SEC_AUTHED = 0x1,
                IEEE80211_MESH_SEC_SECURED = 0x2,
        } security;
        bool user_mpm;
        /* Extensible Synchronization Framework */
        const struct ieee80211_mesh_sync_ops *sync_ops;
        s64 sync_offset_clockdrift_max;
        spinlock_t sync_offset_lock;
        /* mesh power save */
        enum nl80211_mesh_power_mode nonpeer_pm;
        int ps_peers_light_sleep;
        int ps_peers_deep_sleep;
        struct ps_data ps;
        /* Channel Switching Support */
        struct mesh_csa_settings __rcu *csa;
        enum {
                IEEE80211_MESH_CSA_ROLE_NONE,
                IEEE80211_MESH_CSA_ROLE_INIT,
                IEEE80211_MESH_CSA_ROLE_REPEATER,
        } csa_role;
        u8 chsw_ttl;
        u16 pre_value;

        /* offset from skb->data while building IE */
        int meshconf_offset;

        struct mesh_table mesh_paths;
        struct mesh_table mpp_paths; /* Store paths for MPP&MAP */
        int mesh_paths_generation;
        int mpp_paths_generation;
};

/*
 * Increment the mesh statistics counter @name inside (msh)->mshstats.
 * Expands to an empty statement when CONFIG_MAC80211_MESH is not set, so
 * callers need no #ifdef of their own. The increment is a plain `++`.
 */
#ifdef CONFIG_MAC80211_MESH
#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name)        \
        do { (msh)->mshstats.name++; } while (0)
#else
#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \
        do { } while (0)
#endif

/**
 * enum ieee80211_sub_if_data_flags - virtual interface flags
 *
 * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
 * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode
 * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
 *        associated stations and deliver multicast frames both
 *        back to wireless media and to the local net stack.
 *        NOTE(review): the flag name suggests that setting it *disables*
 *        the bridging described here -- confirm against the users.
 * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
 * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
 * @IEEE80211_SDATA_DISCONNECT_HW_RESTART: Disconnect after hardware restart
 *  recovery
 *
 * BIT(1) is not assigned in this enum.
 */
enum ieee80211_sub_if_data_flags {
        IEEE80211_SDATA_ALLMULTI                = BIT(0),
        IEEE80211_SDATA_OPERATING_GMODE                = BIT(2),
        IEEE80211_SDATA_DONT_BRIDGE_PACKETS        = BIT(3),
        IEEE80211_SDATA_DISCONNECT_RESUME        = BIT(4),
        IEEE80211_SDATA_IN_DRIVER                = BIT(5),
        IEEE80211_SDATA_DISCONNECT_HW_RESTART        = BIT(6),
};

/**
 * enum ieee80211_sdata_state_bits - virtual interface state bits
 * @SDATA_STATE_RUNNING: virtual interface is up & running; this
 *        mirrors netif_running() but is separate for interface type
 *        change handling while the interface is up
 * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
 *        mode, so queues are stopped
 * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due
 *        to offchannel, reset when offchannel returns
 *
 * The enumerators are consecutive bit numbers (0, 1, 2), not masks.
 * NOTE(review): presumably used with the bitops on the @state field of
 * &struct ieee80211_sub_if_data -- confirm against the users.
 */
enum ieee80211_sdata_state_bits {
        SDATA_STATE_RUNNING,
        SDATA_STATE_OFFCHANNEL,
        SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
};

/**
 * enum ieee80211_chanctx_mode - channel context configuration mode
 *
 * @IEEE80211_CHANCTX_SHARED: channel context may be used by
 *        multiple interfaces
 * @IEEE80211_CHANCTX_EXCLUSIVE: channel context can be used
 *        only by a single interface. This can be used for example for
 *        non-fixed channel IBSS.
 *
 * Stored in the @mode field of &struct ieee80211_chanctx.
 */
enum ieee80211_chanctx_mode {
        IEEE80211_CHANCTX_SHARED,
        IEEE80211_CHANCTX_EXCLUSIVE
};

/**
 * enum ieee80211_chanctx_replace_state - channel context replacement state
 *
 * This is used for channel context in-place reservations that require channel
 * context switch/swap. It is stored in the @replace_state field of
 * &struct ieee80211_chanctx, next to the @replace_ctx pointer referred to
 * below.
 *
 * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place
 * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced
 *        by a (not yet registered) channel context pointed by %replace_ctx.
 * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context
 *        replaces an existing channel context pointed to by %replace_ctx.
 */
enum ieee80211_chanctx_replace_state {
        IEEE80211_CHANCTX_REPLACE_NONE,
        IEEE80211_CHANCTX_WILL_BE_REPLACED,
        IEEE80211_CHANCTX_REPLACES_OTHER,
};

/*
 * mac80211's channel context: list linkage, the lists of assigned and
 * reserved virtual interfaces, replacement state, sharing mode, and the
 * embedded struct ieee80211_chanctx_conf.
 */
struct ieee80211_chanctx {
        struct list_head list;
        struct rcu_head rcu_head;

        struct list_head assigned_vifs;
        struct list_head reserved_vifs;

        /* see enum ieee80211_chanctx_replace_state for how these pair up */
        enum ieee80211_chanctx_replace_state replace_state;
        struct ieee80211_chanctx *replace_ctx;

        enum ieee80211_chanctx_mode mode;
        bool driver_present;

        struct ieee80211_chanctx_conf conf;
};

/*
 * cfg80211 QoS map wrapped together with an rcu_head; referenced through
 * the __rcu pointer `qos_map` in struct ieee80211_sub_if_data.
 */
struct mac80211_qos_map {
        struct cfg80211_qos_map qos_map;
        struct rcu_head rcu_head;
};

/*
 * TXQ flags. The enumerators are consecutive bit numbers (0..3), not
 * masks; presumably used with the bitops on txq_info.flags -- confirm
 * against the users.
 */
enum txq_info_flags {
        IEEE80211_TXQ_STOP,
        IEEE80211_TXQ_AMPDU,
        IEEE80211_TXQ_NO_AMSDU,
        IEEE80211_TXQ_STOP_NETIF_TX,
};

/**
 * struct txq_info - per tid queue
 *
 * @tin: contains packets split into multiple flows
 * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
 *        a fq_flow which is already owned by a different tin
 * @def_cvars: codel vars for @def_flow
 * @cstats: codel statistics (a &struct codel_stats)
 * @frags: used to keep fragments created after dequeue
 * @schedule_order: used with ieee80211_local->active_txqs
 * @schedule_round: counter to prevent infinite loops on TXQ scheduling
 * @flags: flag word; presumably holds &enum txq_info_flags bits (confirm)
 * @txq: the embedded &struct ieee80211_txq; must remain the last member
 */
struct txq_info {
        struct fq_tin tin;
        struct fq_flow def_flow;
        struct codel_vars def_cvars;
        struct codel_stats cstats;
        struct sk_buff_head frags;
        struct list_head schedule_order;
        u16 schedule_round;
        unsigned long flags;

        /* keep last! */
        struct ieee80211_txq txq;
};

/*
 * Monitor interface state (the `mntr` member of the union in struct
 * ieee80211_sub_if_data): a flags word, a 2-byte-aligned MAC address
 * (mu_follow_addr) and list linkage.
 */
struct ieee80211_if_mntr {
        u32 flags;
        u8 mu_follow_addr[ETH_ALEN] __aligned(2);

        struct list_head list;
};

/**
 * struct ieee80211_if_nan - NAN state
 *
 * @conf: current NAN configuration
 * @func_lock: lock protecting @function_inst_ids
 * @function_inst_ids: IDR tracking the function instance ids
 */
struct ieee80211_if_nan {
        struct cfg80211_nan_conf conf;

        /* protects function_inst_ids */
        spinlock_t func_lock;
        struct idr function_inst_ids;
};

/*
 * Per-virtual-interface data ("sdata"). Holds the embedded wireless_dev,
 * key state, control-port settings, TX queue parameters, channel-switch
 * and channel-context reservation state, rate-control masks, a union with
 * the interface-type specific state, and -- as the last member -- the
 * struct ieee80211_vif, from which vif_to_sdata() recovers this structure.
 */
struct ieee80211_sub_if_data {
        struct list_head list;

        struct wireless_dev wdev;

        /* keys */
        struct list_head key_list;

        /* count for keys needing tailroom space allocation */
        int crypto_tx_tailroom_needed_cnt;
        int crypto_tx_tailroom_pending_dec;
        struct delayed_work dec_tailroom_needed_wk;

        struct net_device *dev;
        struct ieee80211_local *local;

        /* presumably enum ieee80211_sub_if_data_flags bits -- confirm */
        unsigned int flags;

        /* presumably enum ieee80211_sdata_state_bits bit numbers -- confirm */
        unsigned long state;

        char name[IFNAMSIZ];

        struct ieee80211_fragment_cache frags;

        /* TID bitmap for NoAck policy */
        u16 noack_map;

        /* bit field of ACM bits (BIT(802.1D tag)) */
        u8 wmm_acm;

        /* one slot per default, default-management and default-beacon key */
        struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS +
                                         NUM_DEFAULT_MGMT_KEYS +
                                         NUM_DEFAULT_BEACON_KEYS];
        struct ieee80211_key __rcu *default_unicast_key;
        struct ieee80211_key __rcu *default_multicast_key;
        struct ieee80211_key __rcu *default_mgmt_key;
        struct ieee80211_key __rcu *default_beacon_key;

        u16 sequence_number;
        __be16 control_port_protocol;
        bool control_port_no_encrypt;
        bool control_port_no_preauth;
        bool control_port_over_nl80211;
        int encrypt_headroom;

        atomic_t num_tx_queued;
        struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
        struct mac80211_qos_map __rcu *qos_map;

        struct work_struct csa_finalize_work;
        bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */
        struct cfg80211_chan_def csa_chandef;

        struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */
        struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */

        /* context reservation -- protected with chanctx_mtx */
        struct ieee80211_chanctx *reserved_chanctx;
        struct cfg80211_chan_def reserved_chandef;
        bool reserved_radar_required;
        bool reserved_ready;

        /* used to reconfigure hardware SM PS */
        struct work_struct recalc_smps;

        struct work_struct work;
        struct sk_buff_head skb_queue;

        u8 needed_rx_chains;
        enum ieee80211_smps_mode smps_mode;

        int user_power_level; /* in dBm */
        int ap_power_level; /* in dBm */

        bool radar_required;
        struct delayed_work dfs_cac_timer_work;

        /*
         * AP this belongs to: self in AP mode and
         * corresponding AP in VLAN mode, NULL for
         * all others (might be needed later in IBSS)
         */
        struct ieee80211_if_ap *bss;

        /* bitmap of allowed (non-MCS) rate indexes for rate control */
        u32 rc_rateidx_mask[NUM_NL80211_BANDS];

        bool rc_has_mcs_mask[NUM_NL80211_BANDS];
        u8  rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN];

        bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS];
        u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX];

        /* Beacon frame (non-MCS) rate (as a bitmap) */
        u32 beacon_rateidx_mask[NUM_NL80211_BANDS];
        bool beacon_rate_set;

        /* interface-type specific state; the members share storage */
        union {
                struct ieee80211_if_ap ap;
                struct ieee80211_if_wds wds;
                struct ieee80211_if_vlan vlan;
                struct ieee80211_if_managed mgd;
                struct ieee80211_if_ibss ibss;
                struct ieee80211_if_mesh mesh;
                struct ieee80211_if_ocb ocb;
                struct ieee80211_if_mntr mntr;
                struct ieee80211_if_nan nan;
        } u;

#ifdef CONFIG_MAC80211_DEBUGFS
        /* debugfs entries; only present with CONFIG_MAC80211_DEBUGFS */
        struct {
                struct dentry *subdir_stations;
                struct dentry *default_unicast_key;
                struct dentry *default_multicast_key;
                struct dentry *default_mgmt_key;
                struct dentry *default_beacon_key;
        } debugfs;
#endif

        /* must be last, dynamically sized area in this! */
        struct ieee80211_vif vif;
};

/**
 * vif_to_sdata - get the sub_if_data that embeds a given vif
 * @p: pointer to the &struct ieee80211_vif that is the @vif member of a
 *        &struct ieee80211_sub_if_data
 *
 * Return: pointer to the enclosing &struct ieee80211_sub_if_data.
 */
static inline
struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = container_of(p, struct ieee80211_sub_if_data, vif);

        return sdata;
}

/*
 * Take the interface's wdev mutex (sdata->wdev.mtx).
 *
 * The __acquires()/__acquire() markers annotate the lock context for this
 * function; NOTE(review): presumably these are the static-checker (sparse)
 * annotations defined outside this chunk - confirm.
 */
static inline void sdata_lock(struct ieee80211_sub_if_data *sdata)
        __acquires(&sdata->wdev.mtx)
{
        mutex_lock(&sdata->wdev.mtx);
        __acquire(&sdata->wdev.mtx);
}

/*
 * Release the interface's wdev mutex (sdata->wdev.mtx); counterpart of
 * sdata_lock().
 *
 * The __releases()/__release() markers annotate the lock context for this
 * function; NOTE(review): presumably these are the static-checker (sparse)
 * annotations defined outside this chunk - confirm.
 */
static inline void sdata_unlock(struct ieee80211_sub_if_data *sdata)
        __releases(&sdata->wdev.mtx)
{
        mutex_unlock(&sdata->wdev.mtx);
        __release(&sdata->wdev.mtx);
}

/*
 * Fetch the RCU-protected pointer @p from code that holds @sdata's wdev
 * mutex; the "mutex is held" condition is handed to
 * rcu_dereference_protected() via lockdep_is_held().
 *
 * @sdata is parenthesized in the expansion so that any pointer-valued
 * expression (e.g. "arr + 1"), not just a plain identifier, can be passed.
 */
#define sdata_dereference(p, sdata) \
        rcu_dereference_protected(p, lockdep_is_held(&(sdata)->wdev.mtx))

/*
 * Assert, via lockdep_assert_held(), that the caller holds the interface's
 * wdev mutex (sdata->wdev.mtx).
 */
static inline void
sdata_assert_lock(struct ieee80211_sub_if_data *sdata)
{
        lockdep_assert_held(&sdata->wdev.mtx);
}

/*
 * Translate the width of a channel definition into a shift value:
 * 2 for NL80211_CHAN_WIDTH_5, 1 for NL80211_CHAN_WIDTH_10 and 0 for
 * every other width.
 */
static inline int
ieee80211_chandef_get_shift(struct cfg80211_chan_def *chandef)
{
        if (chandef->width == NL80211_CHAN_WIDTH_5)
                return 2;

        if (chandef->width == NL80211_CHAN_WIDTH_10)
                return 1;

        return 0;
}

/*
 * Return the shift value (see ieee80211_chandef_get_shift()) for the
 * channel context currently assigned to @vif, or 0 when the vif has no
 * channel context.  The chanctx pointer is only used inside the RCU
 * read-side critical section.
 */
static inline int
ieee80211_vif_get_shift(struct ieee80211_vif *vif)
{
        struct ieee80211_chanctx_conf *conf;
        int result;

        rcu_read_lock();
        conf = rcu_dereference(vif->chanctx_conf);
        result = conf ? ieee80211_chandef_get_shift(&conf->def) : 0;
        rcu_read_unlock();

        return result;
}

/*
 * Message type tags with fixed numeric values.
 * NOTE(review): presumably used to distinguish RX frames from TX status
 * reports on the IRQ-safe skb queues of struct ieee80211_local - confirm
 * against the users.
 */
enum {
        IEEE80211_RX_MSG        = 1,
        IEEE80211_TX_STATUS_MSG        = 2,
};

/*
 * Reasons for which a queue can be stopped.
 *
 * IEEE80211_QUEUE_STOP_REASONS is not a reason itself: it is the number of
 * reasons and sizes the second dimension of ieee80211_local.q_stop_reasons.
 * NOTE(review): presumably the values also serve as bit numbers in
 * ieee80211_local.queue_stop_reasons[] - confirm.
 */
enum queue_stop_reason {
        IEEE80211_QUEUE_STOP_REASON_DRIVER,
        IEEE80211_QUEUE_STOP_REASON_PS,
        IEEE80211_QUEUE_STOP_REASON_CSA,
        IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
        IEEE80211_QUEUE_STOP_REASON_SUSPEND,
        IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
        IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL,
        IEEE80211_QUEUE_STOP_REASON_FLUSH,
        IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN,
        IEEE80211_QUEUE_STOP_REASON_RESERVE_TID,
        IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE,

        /* must be last: number of stop reasons */
        IEEE80211_QUEUE_STOP_REASONS,
};

#ifdef CONFIG_MAC80211_LEDS
/*
 * State of the "tpt" LED trigger; struct ieee80211_local points at one of
 * these through its tpt_led_trigger member.
 * NOTE(review): "tpt" presumably stands for throughput - confirm.
 */
struct tpt_led_trigger {
        char name[32];
        /* blink table and its number of entries */
        const struct ieee80211_tpt_blink *blink_table;
        unsigned int blink_table_len;
        struct timer_list timer;
        /* back-pointer to the owning ieee80211_local */
        struct ieee80211_local *local;
        unsigned long prev_traffic;
        unsigned long tx_bytes, rx_bytes;
        unsigned int active, want;
        bool running;
};
#endif

/**
 * mac80211 scan flags - currently active scan mode
 *
 * NOTE(review): presumably these are bit numbers within
 * ieee80211_local.scanning - confirm against the users.
 *
 * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as
 *        well be on the operating channel
 * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to
 *        determine if we are on the operating channel or not
 * @SCAN_ONCHANNEL_SCANNING: Do a software scan on only the current operating
 *        channel. This should not interrupt normal traffic.
 * @SCAN_COMPLETED: Set for our scan work function when the driver reported
 *        that the scan completed.
 * @SCAN_ABORTED: Set for our scan work function when the driver reported
 *        a scan complete for an aborted scan.
 * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being
 *        cancelled.
 * @SCAN_BEACON_WAIT: Set whenever we're passive scanning because of radar/no-IR
 *        and could send a probe request after receiving a beacon.
 * @SCAN_BEACON_DONE: Beacon received, we can now send a probe request
 */
enum {
        SCAN_SW_SCANNING,
        SCAN_HW_SCANNING,
        SCAN_ONCHANNEL_SCANNING,
        SCAN_COMPLETED,
        SCAN_ABORTED,
        SCAN_HW_CANCELLED,
        SCAN_BEACON_WAIT,
        SCAN_BEACON_DONE,
};

/**
 * enum mac80211_scan_state - scan state machine states
 *
 * The upcoming state is stored in ieee80211_local.next_scan_state.
 *
 * @SCAN_DECISION: Main entry point to the scan state machine, this state
 *        determines if we should keep on scanning or switch back to the
 *        operating channel
 * @SCAN_SET_CHANNEL: Set the next channel to be scanned
 * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses
 * @SCAN_SUSPEND: Suspend the scan and go back to operating channel to
 *        send out data
 * @SCAN_RESUME: Resume the scan and scan the next channel
 * @SCAN_ABORT: Abort the scan and go back to operating channel
 */
enum mac80211_scan_state {
        SCAN_DECISION,
        SCAN_SET_CHANNEL,
        SCAN_SEND_PROBE,
        SCAN_SUSPEND,
        SCAN_RESUME,
        SCAN_ABORT,
};

/*
 * mac80211-private state wrapped around the driver-visible
 * struct ieee80211_hw, which is embedded as the first member.
 * Use hw_to_local() to get from the ieee80211_hw to this structure.
 */
struct ieee80211_local {
        /* embed the driver visible part.
         * don't cast (use the static inlines below), but we keep
         * it first anyway so they become a no-op */
        struct ieee80211_hw hw;

        struct fq fq;
        struct codel_vars *cvars;
        struct codel_params cparams;

        /* protects active_txqs and txqi->schedule_order */
        spinlock_t active_txq_lock[IEEE80211_NUM_ACS];
        struct list_head active_txqs[IEEE80211_NUM_ACS];
        u16 schedule_round[IEEE80211_NUM_ACS];

        u16 airtime_flags;
        u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
        u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
        u32 aql_threshold;
        atomic_t aql_total_pending_airtime;

        const struct ieee80211_ops *ops;

        /*
         * private workqueue to mac80211. mac80211 makes this accessible
         * via ieee80211_queue_work()
         */
        struct workqueue_struct *workqueue;

        unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
        /* per queue, one entry for each enum queue_stop_reason value */
        int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS];
        /* also used to protect ampdu_ac_queue and ampdu_ac_stop_refcnt */
        spinlock_t queue_stop_reason_lock;

        int open_count;
        int monitors, cooked_mntrs;
        /* number of interfaces with corresponding FIF_ flags */
        int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
            fif_probe_req;
        bool probe_req_reg;
        bool rx_mcast_action_reg;
        unsigned int filter_flags; /* FIF_* */

        bool wiphy_ciphers_allocated;

        bool use_chanctx;

        /* protects the aggregated multicast list and filter calls */
        spinlock_t filter_lock;

        /* used for uploading changed mc list */
        struct work_struct reconfig_filter;

        /* aggregated multicast list */
        struct netdev_hw_addr_list mc_list;

        bool tim_in_locked_section; /* see ieee80211_beacon_get() */

        /*
         * suspended is true if we finished all the suspend _and_ we have
         * not yet come up from resume. This is to be used by mac80211
         * to ensure driver sanity during suspend and mac80211's own
         * sanity. It can eventually be used for WoW as well.
         */
        bool suspended;

        /*
         * Resuming is true while suspended, but when we're reprogramming the
         * hardware -- at that time it's allowed to use ieee80211_queue_work()
         * again even though some other parts of the stack are still suspended
         * and we still drop received frames to avoid waking the stack.
         */
        bool resuming;

        /*
         * quiescing is true during the suspend process _only_ to
         * ease timer cancelling etc.
         */
        bool quiescing;

        /* device is started */
        bool started;

        /* device is in the middle of a HW reconfig */
        bool in_reconfig;

        /* wowlan is enabled -- don't reconfig on resume */
        bool wowlan;

        struct work_struct radar_detected_work;

        /* number of RX chains the hardware has */
        u8 rx_chains;

        /* bitmap of which sbands were copied */
        u8 sband_allocated;

        int tx_headroom; /* required headroom for hardware/radiotap */

        /* Tasklet and skb queue to process calls from IRQ mode. All frames
         * added to skb_queue will be processed, but frames in
         * skb_queue_unreliable may be dropped if the total length of these
         * queues increases over the limit. */
#define IEEE80211_IRQSAFE_QUEUE_LIMIT 128
        struct tasklet_struct tasklet;
        struct sk_buff_head skb_queue;
        struct sk_buff_head skb_queue_unreliable;

        spinlock_t rx_path_lock;

        /* Station data */
        /*
         * The mutex only protects the list, hash table and
         * counter, reads are done with RCU.
         */
        struct mutex sta_mtx;
        spinlock_t tim_lock;
        unsigned long num_sta;
        struct list_head sta_list;
        struct rhltable sta_hash;
        struct timer_list sta_cleanup;
        int sta_generation;

        struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
        struct tasklet_struct tx_pending_tasklet;
        struct tasklet_struct wake_txqs_tasklet;

        atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];

        /* number of interfaces with allmulti RX */
        atomic_t iff_allmultis;

        struct rate_control_ref *rate_ctrl;

        struct arc4_ctx wep_tx_ctx;
        struct arc4_ctx wep_rx_ctx;
        u32 wep_iv;

        /* see iface.c */
        struct list_head interfaces;
        struct list_head mon_list; /* only those that are IFF_UP && !cooked */
        struct mutex iflist_mtx;

        /*
         * Key mutex, protects sdata's key_list and sta_info's
         * key pointers and ptk_idx (write access, they're RCU.)
         */
        struct mutex key_mtx;

        /* mutex for scan and work locking */
        struct mutex mtx;

        /* Scanning and BSS list */
        unsigned long scanning;
        struct cfg80211_ssid scan_ssid;
        struct cfg80211_scan_request *int_scan_req;
        struct cfg80211_scan_request __rcu *scan_req;
        struct ieee80211_scan_request *hw_scan_req;
        struct cfg80211_chan_def scan_chandef;
        enum nl80211_band hw_scan_band;
        int scan_channel_idx;
        int scan_ies_len;
        int hw_scan_ies_bufsize;
        struct cfg80211_scan_info scan_info;

        struct work_struct sched_scan_stopped_work;
        struct ieee80211_sub_if_data __rcu *sched_scan_sdata;
        struct cfg80211_sched_scan_request __rcu *sched_scan_req;
        u8 scan_addr[ETH_ALEN];

        unsigned long leave_oper_channel_time;
        /* next state of the scan state machine, see enum mac80211_scan_state */
        enum mac80211_scan_state next_scan_state;
        struct delayed_work scan_work;
        struct ieee80211_sub_if_data __rcu *scan_sdata;
        /* For backward compatibility only -- do not use */
        struct cfg80211_chan_def _oper_chandef;

        /* Temporary remain-on-channel for off-channel operations */
        struct ieee80211_channel *tmp_channel;

        /* channel contexts */
        struct list_head chanctx_list;
        struct mutex chanctx_mtx;

#ifdef CONFIG_MAC80211_LEDS
        struct led_trigger tx_led, rx_led, assoc_led, radio_led;
        struct led_trigger tpt_led;
        atomic_t tx_led_active, rx_led_active, assoc_led_active;
        atomic_t radio_led_active, tpt_led_active;
        struct tpt_led_trigger *tpt_led_trigger;
#endif

#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
        /* SNMP counters */
        /* dot11CountersTable */
        u32 dot11TransmittedFragmentCount;
        u32 dot11MulticastTransmittedFrameCount;
        u32 dot11FailedCount;
        u32 dot11RetryCount;
        u32 dot11MultipleRetryCount;
        u32 dot11FrameDuplicateCount;
        u32 dot11ReceivedFragmentCount;
        u32 dot11MulticastReceivedFrameCount;
        u32 dot11TransmittedFrameCount;

        /* TX/RX handler statistics */
        unsigned int tx_handlers_drop;
        unsigned int tx_handlers_queued;
        unsigned int tx_handlers_drop_wep;
        unsigned int tx_handlers_drop_not_assoc;
        unsigned int tx_handlers_drop_unauth_port;
        unsigned int rx_handlers_drop;
        unsigned int rx_handlers_queued;
        unsigned int rx_handlers_drop_nullfunc;
        unsigned int rx_handlers_drop_defrag;
        unsigned int tx_expand_skb_head;
        unsigned int tx_expand_skb_head_cloned;
        unsigned int rx_expand_skb_head_defrag;
        unsigned int rx_handlers_fragments;
        unsigned int tx_status_drop;
/* increments counter c; expands to a no-op when the counters are compiled out */
#define I802_DEBUG_INC(c) (c)++
#else /* CONFIG_MAC80211_DEBUG_COUNTERS */
#define I802_DEBUG_INC(c) do { } while (0)
#endif /* CONFIG_MAC80211_DEBUG_COUNTERS */


        int total_ps_buffered; /* total number of all buffered unicast and
                                * multicast packets for power saving stations
                                */

        bool pspolling;
        /*
         * PS can only be enabled when we have exactly one managed
         * interface (and monitors) in PS, this then points there.
         */
        struct ieee80211_sub_if_data *ps_sdata;
        struct work_struct dynamic_ps_enable_work;
        struct work_struct dynamic_ps_disable_work;
        struct timer_list dynamic_ps_timer;
        struct notifier_block ifa_notifier;
        struct notifier_block ifa6_notifier;

        /*
         * The dynamic ps timeout configured from user space via WEXT -
         * this will override whatever chosen by mac80211 internally.
         */
        int dynamic_ps_forced_timeout;

        int user_power_level; /* in dBm, for all interfaces */

        enum ieee80211_smps_mode smps_mode;

        struct work_struct restart_work;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct local_debugfsdentries {
                struct dentry *rcdir;
                struct dentry *keys;
        } debugfs;
        bool force_tx_status;
#endif

        /*
         * Remain-on-channel support
         */
        struct delayed_work roc_work;
        struct list_head roc_list;
        struct work_struct hw_roc_start, hw_roc_done;
        unsigned long hw_roc_start_time;
        u64 roc_cookie_counter;

        struct idr ack_status_frames;
        spinlock_t ack_status_lock;

        struct ieee80211_sub_if_data __rcu *p2p_sdata;

        /* virtual monitor interface */
        struct ieee80211_sub_if_data __rcu *monitor_sdata;
        struct cfg80211_chan_def monitor_chandef;

        /* extended capabilities provided by mac80211 */
        u8 ext_capa[8];

        /* TDLS channel switch */
        struct work_struct tdls_chsw_work;
        struct sk_buff_head skb_queue_tdls_chsw;
};

/*
 * Return the ieee80211_sub_if_data of a net_device, i.e. the pointer
 * netdev_priv() yields for @dev.
 */
static inline struct ieee80211_sub_if_data *
IEEE80211_DEV_TO_SUB_IF(struct net_device *dev)
{
        struct ieee80211_sub_if_data *sdata = netdev_priv(dev);

        return sdata;
}

/*
 * Map a wireless_dev back to the ieee80211_sub_if_data that embeds it
 * as its 'wdev' member.
 */
static inline struct ieee80211_sub_if_data *
IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = container_of(wdev, struct ieee80211_sub_if_data, wdev);

        return sdata;
}

/*
 * Look up the supported-band entry of the wiphy for the band of the
 * channel context currently assigned to @sdata's vif.
 *
 * Returns NULL when the vif has no channel context.  The chanctx pointer
 * is only dereferenced inside the RCU read-side critical section.
 */
static inline struct ieee80211_supported_band *
ieee80211_get_sband(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_supported_band *sband = NULL;
        struct ieee80211_chanctx_conf *ctx;

        rcu_read_lock();
        ctx = rcu_dereference(sdata->vif.chanctx_conf);
        if (ctx)
                sband = sdata->local->hw.wiphy->bands[ctx->def.chan->band];
        rcu_read_unlock();

        return sband;
}

/*
 * This struct holds the values parsed from the channel switch IE(s).
 * NOTE(review): the meaning of the individual fields (mode, count, ttl,
 * pre_value, reason_code, max_switch_time) is defined by the parser, which
 * is outside this chunk - confirm there before relying on them.
 */
struct ieee80211_csa_ie {
        struct cfg80211_chan_def chandef;
        u8 mode;
        u8 count;
        u8 ttl;
        u16 pre_value;
        u16 reason_code;
        u32 max_switch_time;
};

/*
 * Parsed Information Elements
 *
 * Holds one pointer per recognised element, a length for those elements
 * that have a *_len companion below, and a few values stored directly
 * (max_bssid_indicator, dtim_count, dtim_period).
 * NOTE(review): the pointers presumably point into the buffer described by
 * ie_start/total_len rather than at copies - confirm against the parser.
 */
struct ieee802_11_elems {
        const u8 *ie_start;
        size_t total_len;

        /* pointers to IEs */
        const struct ieee80211_tdls_lnkie *lnk_id;
        const struct ieee80211_ch_switch_timing *ch_sw_timing;
        const u8 *ext_capab;
        const u8 *ssid;
        const u8 *supp_rates;
        const u8 *ds_params;
        const struct ieee80211_tim_ie *tim;
        const u8 *rsn;
        const u8 *rsnx;
        const u8 *erp_info;
        const u8 *ext_supp_rates;
        const u8 *wmm_info;
        const u8 *wmm_param;
        const struct ieee80211_ht_cap *ht_cap_elem;
        const struct ieee80211_ht_operation *ht_operation;
        const struct ieee80211_vht_cap *vht_cap_elem;
        const struct ieee80211_vht_operation *vht_operation;
        const struct ieee80211_meshconf_ie *mesh_config;
        const u8 *he_cap;
        const struct ieee80211_he_operation *he_operation;
        const struct ieee80211_he_spr *he_spr;
        const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
        const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
        const u8 *uora_element;
        const u8 *mesh_id;
        const u8 *peering;
        const __le16 *awake_window;
        const u8 *preq;
        const u8 *prep;
        const u8 *perr;
        const struct ieee80211_rann_ie *rann;
        const struct ieee80211_channel_sw_ie *ch_switch_ie;
        const struct ieee80211_ext_chansw_ie *ext_chansw_ie;
        const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
        const u8 *max_channel_switch_time;
        const u8 *country_elem;
        const u8 *pwr_constr_elem;
        const u8 *cisco_dtpc_elem;
        const struct ieee80211_timeout_interval_ie *timeout_int;
        const u8 *opmode_notif;
        const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
        /*
         * NOTE(review): the only non-const element pointer in this struct;
         * confirm whether a user really writes through it.
         */
        struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie;
        const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie;
        const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie;
        const struct ieee80211_bssid_index *bssid_index;
        u8 max_bssid_indicator;
        u8 dtim_count;
        u8 dtim_period;
        const struct ieee80211_addba_ext_ie *addba_ext_ie;
        const struct ieee80211_s1g_cap *s1g_capab;
        const struct ieee80211_s1g_oper_ie *s1g_oper;
        const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat;
        const struct ieee80211_aid_response_ie *aid_resp;

        /* lengths of the correspondingly named element pointers above */
        u8 ext_capab_len;
        u8 ssid_len;
        u8 supp_rates_len;
        u8 tim_len;
        u8 rsn_len;
        u8 rsnx_len;
        u8 ext_supp_rates_len;
        u8 wmm_info_len;
        u8 wmm_param_len;
        u8 he_cap_len;
        u8 mesh_id_len;
        u8 peering_len;
        u8 preq_len;
        u8 prep_len;
        u8 perr_len;
        u8 country_elem_len;
        u8 bssid_index_len;

        /* NOTE(review): ownership/lifetime of this buffer is not visible here */
        void *nontx_profile;

        /* whether a parse error occurred while retrieving these elements */
        bool parse_error;
};

/*
 * Map the driver-visible ieee80211_hw back to the ieee80211_local that
 * embeds it as its 'hw' member.
 */
static inline struct ieee80211_local *hw_to_local(
        struct ieee80211_hw *hw)
{
        struct ieee80211_local *local;

        local = container_of(hw, struct ieee80211_local, hw);

        return local;
}

/*
 * Map an ieee80211_txq back to the txq_info that embeds it as its
 * 'txq' member.
 */
static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
{
        struct txq_info *txqi;

        txqi = container_of(txq, struct txq_info, txq);

        return txqi;
}

static inline bool txq_has_queue(struct ieee80211_txq *txq)
{
        struct txq_info *txqi = to_txq_info(txq);

        return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
}

/*
 * Return 1 when @raddr equals @addr or @raddr is the broadcast address,
 * 0 otherwise.
 */
static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
{
        if (ether_addr_equal(raddr, addr))
                return 1;

        return is_broadcast_ether_addr(raddr) ? 1 : 0;
}

/*
 * Tell whether @status carries a usable RX timestamp.
 *
 * True when either RX_FLAG_MACTIME_START or RX_FLAG_MACTIME_END is set
 * (warning once if both are), or when RX_FLAG_MACTIME_PLCP_START is set
 * and the encoding is RX_ENC_LEGACY.
 */
static inline bool
ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
{
        bool at_start = !!(status->flag & RX_FLAG_MACTIME_START);
        bool at_end = !!(status->flag & RX_FLAG_MACTIME_END);

        WARN_ON_ONCE(at_start && at_end);

        if (at_start || at_end)
                return true;

        /* can't handle non-legacy preamble yet */
        return (status->flag & RX_FLAG_MACTIME_PLCP_START) &&
               status->encoding == RX_ENC_LEGACY;
}

void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata);

/* Return the number of multicast stations connected to this interface.
 *
 * The counter is read from u.ap for AP interfaces and from u.vlan for
 * AP_VLAN interfaces that have no u.vlan.sta set.  In every other case
 * (other interface types, or an AP_VLAN with u.vlan.sta set, i.e. when
 * using 4addr) the number is not tracked and -1 is returned.
 */
static inline int
ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata)
{
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
                return atomic_read(&sdata->u.ap.num_mcast_sta);
        case NL80211_IFTYPE_AP_VLAN:
                if (sdata->u.vlan.sta)
                        return -1;
                return atomic_read(&sdata->u.vlan.num_mcast_sta);
        default:
                return -1;
        }
}

u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
                                     struct ieee80211_rx_status *status,
                                     unsigned int mpdu_len,
                                     unsigned int mpdu_offset);
int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
                                      u32 changed);
void ieee80211_configure_filter(struct ieee80211_local *local);
u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);

u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
                             u64 *cookie, gfp_t gfp);

void ieee80211_check_fast_rx(struct sta_info *sta);
void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_clear_fast_rx(struct sta_info *sta);

/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
                       struct cfg80211_auth_request *req);
int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
                        struct cfg80211_assoc_request *req);
int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
                         struct cfg80211_deauth_request *req);
int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
                           struct cfg80211_disassoc_request *req);
void ieee80211_send_pspoll(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_ps(struct ieee80211_local *local);
void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata);
int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                  struct sk_buff *skb);
void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
                                 struct sk_buff *skb);
void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata);
void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata,
                                  __le16 fc, bool acked);
void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata);

/* IBSS code */
void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
                              const u8 *bssid, const u8 *addr, u32 supp_rates);
int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
                        struct cfg80211_ibss_params *params);
int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                   struct sk_buff *skb);
int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings);
int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata);

/* OCB code */
void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
                             const u8 *bssid, const u8 *addr, u32 supp_rates);
void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
                       struct ocb_setup *setup);
int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata);

/* mesh code */
void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                   struct sk_buff *skb);
int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings);
int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata);

/* scan/BSS handling */
void ieee80211_scan_work(struct work_struct *work);
int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
                                const u8 *ssid, u8 ssid_len,
                                struct ieee80211_channel **channels,
                                unsigned int n_channels,
                                enum nl80211_bss_scan_width scan_width);
int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
                           struct cfg80211_scan_request *req);
void ieee80211_scan_cancel(struct ieee80211_local *local);
void ieee80211_run_deferred_scan(struct ieee80211_local *local);
void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb);

void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
struct ieee80211_bss *
ieee80211_bss_info_update(struct ieee80211_local *local,
                          struct ieee80211_rx_status *rx_status,
                          struct ieee80211_mgmt *mgmt,
                          size_t len,
                          struct ieee80211_channel *channel);
void ieee80211_rx_bss_put(struct ieee80211_local *local,
                          struct ieee80211_bss *bss);

/* scheduled scan handling */
int
__ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
                                     struct cfg80211_sched_scan_request *req);
int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
                                       struct cfg80211_sched_scan_request *req);
int ieee80211_request_sched_scan_stop(struct ieee80211_local *local);
void ieee80211_sched_scan_end(struct ieee80211_local *local);
void ieee80211_sched_scan_stopped_work(struct work_struct *work);

/* off-channel/mgmt-tx */
void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local);
void ieee80211_offchannel_return(struct ieee80211_local *local);
void ieee80211_roc_setup(struct ieee80211_local *local);
void ieee80211_start_next_roc(struct ieee80211_local *local);
void ieee80211_roc_purge(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata);
int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
                                struct ieee80211_channel *chan,
                                unsigned int duration, u64 *cookie);
int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
                                       struct wireless_dev *wdev, u64 cookie);
int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
                      struct cfg80211_mgmt_tx_params *params, u64 *cookie);
int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
                                  struct wireless_dev *wdev, u64 cookie);

/* channel switch handling */
void ieee80211_csa_finalize_work(struct work_struct *work);
int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
                             struct cfg80211_csa_settings *params);

/* interface handling */
/*
 * NETIF_F_* feature masks: the _TX and _RX masks list the individual
 * feature bits, and MAC80211_SUPPORTED_FEATURES is the union of the two.
 */
#define MAC80211_SUPPORTED_FEATURES_TX        (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
                                         NETIF_F_HW_CSUM | NETIF_F_SG | \
                                         NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE)
#define MAC80211_SUPPORTED_FEATURES_RX        (NETIF_F_RXCSUM)
#define MAC80211_SUPPORTED_FEATURES        (MAC80211_SUPPORTED_FEATURES_TX | \
                                         MAC80211_SUPPORTED_FEATURES_RX)

int ieee80211_iface_init(void);
void ieee80211_iface_exit(void);
int ieee80211_if_add(struct ieee80211_local *local, const char *name,
                     unsigned char name_assign_type,
                     struct wireless_dev **new_wdev, enum nl80211_iftype type,
                     struct vif_params *params);
int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
                             enum nl80211_iftype type);
void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
void ieee80211_remove_interfaces(struct ieee80211_local *local);
u32 ieee80211_idle_off(struct ieee80211_local *local);
void ieee80211_recalc_idle(struct ieee80211_local *local);
void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
                                    const int offset);
int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up);
void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata);
int ieee80211_add_virtual_monitor(struct ieee80211_local *local);
void ieee80211_del_virtual_monitor(struct ieee80211_local *local);

bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
                              bool update_bss);
void ieee80211_recalc_offload(struct ieee80211_local *local);

/* Tell whether the SDATA_STATE_RUNNING bit is set in @sdata's state word. */
static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
{
        bool running = test_bit(SDATA_STATE_RUNNING, &sdata->state);

        return running;
}

/* tx handling */
void ieee80211_clear_tx_pending(struct ieee80211_local *local);
void ieee80211_tx_pending(unsigned long data);
netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
                                         struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
                                       struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
                                            struct net_device *dev);
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
                                  struct net_device *dev,
                                  u32 info_flags,
                                  u32 ctrl_flags,
                                  u64 *cookie);
void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
                              struct sk_buff_head *skbs);
struct sk_buff *
ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
                              struct sk_buff *skb, u32 info_flags);
void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
                          struct ieee80211_supported_band *sband,
                          int retry_count, int shift, bool send_to_cooked,
                          struct ieee80211_tx_status *status);

void ieee80211_check_fast_xmit(struct sta_info *sta);
void ieee80211_check_fast_xmit_all(struct ieee80211_local *local);
void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_clear_fast_xmit(struct sta_info *sta);
int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
                              const u8 *buf, size_t len,
                              const u8 *dest, __be16 proto, bool unencrypted,
                              u64 *cookie);
int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev,
                              const u8 *buf, size_t len);

/* HT */
void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta_ht_cap *ht_cap);
bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_supported_band *sband,
                                       const struct ieee80211_ht_cap *ht_cap_ie,
                                       struct sta_info *sta);
void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
                          const u8 *da, u16 tid,
                          u16 initiator, u16 reason_code);
int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
                               enum ieee80211_smps_mode smps, const u8 *da,
                               const u8 *bssid);
void ieee80211_request_smps_ap_work(struct work_struct *work);
void ieee80211_request_smps_mgd_work(struct work_struct *work);
bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
                                   enum ieee80211_smps_mode smps_mode_new);

void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
                                     u16 initiator, u16 reason, bool stop);
void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
                                    u16 initiator, u16 reason, bool stop);
void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
                                      u8 dialog_token, u16 timeout,
                                      u16 start_seq_num, u16 ba_policy, u16 tid,
                                      u16 buf_size, bool tx, bool auto_seq,
                                      const struct ieee80211_addba_ext_ie *addbaext);
void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
                                         enum ieee80211_agg_stop_reason reason);
void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
                             struct sta_info *sta,
                             struct ieee80211_mgmt *mgmt, size_t len);
void ieee80211_process_addba_resp(struct ieee80211_local *local,
                                  struct sta_info *sta,
                                  struct ieee80211_mgmt *mgmt,
                                  size_t len);
void ieee80211_process_addba_request(struct ieee80211_local *local,
                                     struct sta_info *sta,
                                     struct ieee80211_mgmt *mgmt,
                                     size_t len);

int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
                                   enum ieee80211_agg_stop_reason reason);
int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
                                    enum ieee80211_agg_stop_reason reason);
void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid,
                              struct tid_ampdu_tx *tid_tx);
void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
                             struct tid_ampdu_tx *tid_tx);
void ieee80211_ba_session_work(struct work_struct *work);
void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);

u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs);
enum nl80211_smps_mode
ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps);

/* VHT */
/*
 * Prototypes for the VHT helpers. Each function is declared exactly once;
 * ieee80211_sta_set_rx_nss() used to be listed twice in this group.
 */
void
ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_supported_band *sband,
                                    const struct ieee80211_vht_cap *vht_cap_ie,
                                    struct sta_info *sta);
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta);
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
void ieee80211_sta_set_rx_nss(struct sta_info *sta);
enum ieee80211_sta_rx_bandwidth
ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta);
void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
                                 struct ieee80211_mgmt *mgmt);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
                                  struct sta_info *sta, u8 opmode,
                                  enum nl80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
                                 struct sta_info *sta, u8 opmode,
                                 enum nl80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_sta_vht_cap *vht_cap);
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
                                     u16 vht_mask[NL80211_VHT_NSS_MAX]);
enum nl80211_chan_width
ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);

/* HE */
void
ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_supported_band *sband,
                                  const u8 *he_cap_ie, u8 he_cap_len,
                                  const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
                                  struct sta_info *sta);
void
ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
                                const struct ieee80211_he_spr *he_spr_ie_elem);

void
ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
                        const struct ieee80211_he_operation *he_op_ie_elem);

/* S1G */
void ieee80211_s1g_sta_rate_init(struct sta_info *sta);

/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_mgmt *mgmt,
                                       size_t len);
/**
 * ieee80211_parse_ch_switch_ie - parses channel switch IEs
 * @sdata: the sdata of the interface which has received the frame
 * @elems: parsed 802.11 elements received with the frame
 * @current_band: indicates the current band
 * @vht_cap_info: VHT capabilities of the transmitter
 * @sta_flags: contains information about own capabilities and restrictions
 *        to decide which channel switch announcements can be accepted. Only the
 *        following subset of &enum ieee80211_sta_flags are evaluated:
 *        %IEEE80211_STA_DISABLE_HT, %IEEE80211_STA_DISABLE_VHT,
 *        %IEEE80211_STA_DISABLE_40MHZ, %IEEE80211_STA_DISABLE_80P80MHZ,
 *        %IEEE80211_STA_DISABLE_160MHZ.
 * @bssid: the currently connected bssid (for reporting)
 * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl.
 *        All of them are filled in only on success.
 *
 * Return: 0 on success, <0 on error and >0 if there is nothing to parse.
 */
int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
                                 struct ieee802_11_elems *elems,
                                 enum nl80211_band current_band,
                                 u32 vht_cap_info,
                                 u32 sta_flags, u8 *bssid,
                                 struct ieee80211_csa_ie *csa_ie);

/* Suspend/resume and hw reconfiguration */
int ieee80211_reconfig(struct ieee80211_local *local);
void ieee80211_stop_device(struct ieee80211_local *local);

int __ieee80211_suspend(struct ieee80211_hw *hw,
                        struct cfg80211_wowlan *wowlan);

static inline int __ieee80211_resume(struct ieee80211_hw *hw)
{
        struct ieee80211_local *local = hw_to_local(hw);

        WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) &&
             !test_bit(SCAN_COMPLETED, &local->scanning),
                "%s: resume with hardware scan still in progress\n",
                wiphy_name(hw->wiphy));

        return ieee80211_reconfig(hw_to_local(hw));
}

/* utility functions/constants */
extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
int ieee80211_frame_duration(enum nl80211_band band, size_t len,
                             int rate, int erp, int short_preamble,
                             int shift);
void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_tx_queue_params *qparam,
                                           int ac);
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
                               bool bss_notify, bool enable_qos);
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
                    struct sta_info *sta, struct sk_buff *skb);

void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
                                 struct sk_buff *skb, int tid,
                                 enum nl80211_band band);

/* sta_out needs to be checked for ERR_PTR() before using */
int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
                            struct sk_buff *skb,
                            struct sta_info **sta_out);

/*
 * Wrapper around __ieee80211_tx_skb_tid_band() that runs the call inside
 * an RCU read-side critical section, for callers that are not already in
 * one.
 */
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
                          struct sk_buff *skb, int tid,
                          enum nl80211_band band)
{
        rcu_read_lock();

        __ieee80211_tx_skb_tid_band(sdata, skb, tid, band);

        rcu_read_unlock();
}

/*
 * Transmit @skb on @tid using the band of the interface's current channel
 * context. The context pointer is looked up under RCU; if there is none,
 * a warning is raised and the skb is freed instead of being sent.
 */
static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
                                        struct sk_buff *skb, int tid)
{
        struct ieee80211_chanctx_conf *conf;

        rcu_read_lock();

        conf = rcu_dereference(sdata->vif.chanctx_conf);
        if (!WARN_ON(!conf)) {
                __ieee80211_tx_skb_tid_band(sdata, skb, tid,
                                            conf->def.chan->band);
                rcu_read_unlock();
                return;
        }

        /* No channel context: leave the RCU section, then drop the frame. */
        rcu_read_unlock();
        kfree_skb(skb);
}

/*
 * Transmit an internally generated frame. All internal management frames
 * go out on VO, so the TID is fixed at 7.
 */
static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
                                    struct sk_buff *skb)
{
        const int mgmt_tid = 7;

        ieee80211_tx_skb_tid(sdata, skb, mgmt_tid);
}

u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
                               struct ieee802_11_elems *elems,
                               u64 filter, u32 crc, u8 *transmitter_bssid,
                               u8 *bss_bssid);
/*
 * Convenience form of ieee802_11_parse_elems_crc(): forwards all arguments,
 * passes 0 for both the filter and the crc argument, and discards the u32
 * that the CRC variant returns.
 */
static inline void ieee802_11_parse_elems(const u8 *start, size_t len,
                                          bool action,
                                          struct ieee802_11_elems *elems,
                                          u8 *transmitter_bssid,
                                          u8 *bss_bssid)
{
        (void)ieee802_11_parse_elems_crc(start, len, action, elems,
                                         0, 0,
                                         transmitter_bssid, bss_bssid);
}


extern const int ieee802_1d_to_ac[8];

/*
 * Look up the ieee802_1d_to_ac[] entry for @tid. Only the low three bits
 * of @tid are used, which keeps the index inside the 8-entry table.
 */
static inline int ieee80211_ac_from_tid(int tid)
{
        const int idx = tid & 7;

        return ieee802_1d_to_ac[idx];
}

void ieee80211_dynamic_ps_enable_work(struct work_struct *work);
void ieee80211_dynamic_ps_disable_work(struct work_struct *work);
void ieee80211_dynamic_ps_timer(struct timer_list *t);
void ieee80211_send_nullfunc(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             bool powersave);
void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_hdr *hdr, bool ack, u16 tx_time);

void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted);
void ieee80211_stop_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason);
void ieee80211_wake_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason);
void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted);
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted);
void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted);
void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue);
void ieee80211_add_pending_skb(struct ieee80211_local *local,
                               struct sk_buff *skb);
void ieee80211_add_pending_skbs(struct ieee80211_local *local,
                                struct sk_buff_head *skbs);
void ieee80211_flush_queues(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata, bool drop);
void __ieee80211_flush_queues(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              unsigned int queues, bool drop);

/*
 * Tell a worker whether it may run right now. It must not when any of the
 * following flags is set (they are checked in this order):
 *
 *  - in_reconfig: it is unsafe to do any work during the reconfigure flow;
 *    the work will be requeued when that flow ends.
 *
 *  - quiescing: we are racing with __ieee80211_suspend(), which flushes
 *    the workers after setting quiescing, while quiescing / suspended are
 *    checked before new workers are enqueued. The worker should abort to
 *    avoid the race described below.
 *
 *  - suspended: we might already be suspended if the following scenario
 *    occurs:
 *
 *      __ieee80211_suspend              Control path
 *
 *                                       if (local->quiescing)
 *                                               return;
 *      local->quiescing = true;
 *      flush_workqueue();
 *                                       queue_work(...);
 *      local->suspended = true;
 *      local->quiescing = false;
 *                                       worker starts running...
 */
static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
{
        return !(local->in_reconfig ||
                 local->quiescing ||
                 local->suspended);
}

int ieee80211_txq_setup_flows(struct ieee80211_local *local);
void ieee80211_txq_set_params(struct ieee80211_local *local);
void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
                        struct sta_info *sta,
                        struct txq_info *txq, int tid);
void ieee80211_txq_purge(struct ieee80211_local *local,
                         struct txq_info *txqi);
void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata);
void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
                              struct txq_info *txqi);
void ieee80211_wake_txqs(unsigned long data);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
                         u16 transaction, u16 auth_alg, u16 status,
                         const u8 *extra, size_t extra_len, const u8 *bssid,
                         const u8 *da, const u8 *key, u8 key_len, u8 key_idx,
                         u32 tx_flags);
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
                                    const u8 *da, const u8 *bssid,
                                    u16 stype, u16 reason,
                                    bool send_frame, u8 *frame_buf);

/*
 * Probe request flag bits, one bit per flag so they can be OR-ed together.
 * NOTE(review): presumably these are the values for the u32 @flags argument
 * of ieee80211_build_preq_ies() and ieee80211_build_probe_req() - confirm.
 */
enum {
        IEEE80211_PROBE_FLAG_DIRECTED    = BIT(0),
        IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1),
        IEEE80211_PROBE_FLAG_RANDOM_SN   = BIT(2),
};

int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer,
                             size_t buffer_len,
                             struct ieee80211_scan_ies *ie_desc,
                             const u8 *ie, size_t ie_len,
                             u8 bands_used, u32 *rate_masks,
                             struct cfg80211_chan_def *chandef,
                             u32 flags);
struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
                                          const u8 *src, const u8 *dst,
                                          u32 ratemask,
                                          struct ieee80211_channel *chan,
                                          const u8 *ssid, size_t ssid_len,
                                          const u8 *ie, size_t ie_len,
                                          u32 flags);
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
                            struct ieee802_11_elems *elems,
                            enum nl80211_band band, u32 *basic_rates);
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
                                 enum ieee80211_smps_mode smps_mode);
void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata);

size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                              u16 cap);
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                               const struct cfg80211_chan_def *chandef,
                               u16 prot_mode, bool rifs_mode);
void ieee80211_ie_build_wide_bw_cs(u8 *pos,
                                   const struct cfg80211_chan_def *chandef);
u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                               u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                                const struct cfg80211_chan_def *chandef);
u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype);
u8 *ieee80211_ie_build_he_cap(u8 *pos,
                              const struct ieee80211_sta_he_cap *he_cap,
                              u8 *end);
void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata,
                                    struct sk_buff *skb);
u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef);
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
                             const struct ieee80211_supported_band *sband,
                             const u8 *srates, int srates_len, u32 *rates);
int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
                            struct sk_buff *skb, bool need_basic,
                            enum nl80211_band band);
int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
                                struct sk_buff *skb, bool need_basic,
                                enum nl80211_band band);
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
                                struct ieee80211_sta_s1g_cap *caps,
                                struct sk_buff *skb);
void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
                                  struct sk_buff *skb);

/* channel management */
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
                               struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
                                const struct ieee80211_vht_operation *oper,
                                const struct ieee80211_ht_operation *htop,
                                struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
                                    const struct ieee80211_he_operation *he_oper,
                                    struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
                                struct cfg80211_chan_def *chandef);
u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);

int __must_check
ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
                          const struct cfg80211_chan_def *chandef,
                          enum ieee80211_chanctx_mode mode);
int __must_check
ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
                              const struct cfg80211_chan_def *chandef,
                              enum ieee80211_chanctx_mode mode,
                              bool radar_required);
int __must_check
ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata);
int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata);

int __must_check
ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
                               const struct cfg80211_chan_def *chandef,
                               u32 *changed);
void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata);
void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata);
void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
                                         bool clear);
int ieee80211_chanctx_refcount(struct ieee80211_local *local,
                               struct ieee80211_chanctx *ctx);

void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
                                   struct ieee80211_chanctx *chanctx);
void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx);
bool ieee80211_is_radar_required(struct ieee80211_local *local);

void ieee80211_dfs_cac_timer(unsigned long data);
void ieee80211_dfs_cac_timer_work(struct work_struct *work);
void ieee80211_dfs_cac_cancel(struct ieee80211_local *local);
void ieee80211_dfs_radar_detected_work(struct work_struct *work);
int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings);

bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs);
bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n);
const struct ieee80211_cipher_scheme *
ieee80211_cs_get(struct ieee80211_local *local, u32 cipher,
                 enum nl80211_iftype iftype);
int ieee80211_cs_headroom(struct ieee80211_local *local,
                          struct cfg80211_crypto_settings *crypto,
                          enum nl80211_iftype iftype);
void ieee80211_recalc_dtim(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata);
int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
                                 const struct cfg80211_chan_def *chandef,
                                 enum ieee80211_chanctx_mode chanmode,
                                 u8 radar_detect);
int ieee80211_max_num_channels(struct ieee80211_local *local);
enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta);
void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
                                       struct ieee80211_chanctx *ctx);

/* TDLS */
int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, u8 action_code, u8 dialog_token,
                        u16 status_code, u32 peer_capability,
                        bool initiator, const u8 *extra_ies,
                        size_t extra_ies_len);
int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, enum nl80211_tdls_operation oper);
void ieee80211_tdls_peer_del_work(struct work_struct *wk);
int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *addr, u8 oper_class,
                                  struct cfg80211_chan_def *chandef);
void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy,
                                          struct net_device *dev,
                                          const u8 *addr);
void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata);
void ieee80211_tdls_chsw_work(struct work_struct *wk);
void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata,
                                      const u8 *peer, u16 reason);
const char *ieee80211_get_reason_code_string(u16 reason_code);
u16 ieee80211_encode_usf(int val);
u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
                        enum nl80211_iftype type);

extern const struct ethtool_ops ieee80211_ethtool_ops;

u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       struct ieee80211_sta *pubsta,
                                       int len, bool ampdu);
/*
 * debug_noinline expands to noinline when CONFIG_MAC80211_NOINLINE is
 * defined and to nothing otherwise, so functions annotated with it are
 * only kept out of line in that configuration.
 */
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
#else
#define debug_noinline
#endif

void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache);
void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache);

#endif /* IEEE80211_I_H */


























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CHECKSUM_64_H
#define _ASM_X86_CHECKSUM_64_H

/*
 * Checksums for x86-64
 * Copyright 2002 by Andi Kleen, SuSE Labs
 * with some code from asm-x86/checksum.h
 */

#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <asm/byteorder.h>

/**
 * csum_fold - Fold and invert a 32bit checksum.
 * @sum: 32bit unfolded sum
 *
 * Fold a 32bit running checksum to 16bit and invert it. This is usually
 * the last step before putting a checksum into a packet.
 * Make sure not to mix with 64bit checksums.
 *
 * How the asm works: operand %0 starts as the high half of @sum
 * (sum & 0xffff0000) and %1 is the low half moved into the high position
 * (sum << 16). The addl leaves high + low in the upper 16 bits of %0 with
 * the low 16 bits zero and any overflow in the carry flag. The adcl of
 * 0xffff then overflows the zero low half exactly when that carry was set,
 * which adds the end-around carry into the upper 16 bits. The return
 * statement inverts the value and keeps only those upper 16 bits.
 */
static inline __sum16 csum_fold(__wsum sum)
{
        asm("  addl %1,%0\n"
            "  adcl $0xffff,%0"
            : "=r" (sum)
            : "r" ((__force u32)sum << 16),
              "0" ((__force u32)sum & 0xffff0000));
        return (__force __sum16)(~(__force u32)sum >> 16);
}

/*
 *        This is a version of ip_compute_csum() optimized for IP headers,
 *        which always checksum on 4 octet boundaries.
 *
 *        By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
 *        Arnt Gulbrandsen.
 */

/**
 * ip_fast_csum - Compute the IPv4 header checksum efficiently.
 * @iph: ipv4 header
 * @ihl: length of header / 4
 *
 * The first four 32bit words are summed straight-line; the loop at label 1
 * then adds one further word per iteration, running ihl - 4 times. The
 * carry chain survives the loop because lea and decl leave the carry flag
 * untouched. After the loop the 32bit sum is folded to 16 bits and
 * inverted with notl.
 *
 * NOTE(review): when @ihl <= 4 the "jbe 2f" branch skips straight to the
 * end, so the value returned is just the first header word, neither folded
 * nor inverted. Presumably callers always pass ihl >= 5 - confirm.
 */
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
        unsigned int sum;

        asm("  movl (%1), %0\n"
            "  subl $4, %2\n"
            "  jbe 2f\n"
            "  addl 4(%1), %0\n"
            "  adcl 8(%1), %0\n"
            "  adcl 12(%1), %0\n"
            "1: adcl 16(%1), %0\n"
            "  lea 4(%1), %1\n"
            "  decl %2\n"
            "  jne        1b\n"
            "  adcl $0, %0\n"
            "  movl %0, %2\n"
            "  shrl $16, %0\n"
            "  addw %w2, %w0\n"
            "  adcl $0, %0\n"
            "  notl %0\n"
            "2:"
        /* Since the input registers which are loaded with iph and ihl
           are modified, we must also specify them as outputs, or gcc
           will assume they contain their original values. */
            : "=r" (sum), "=r" (iph), "=r" (ihl)
            : "1" (iph), "2" (ihl)
            : "memory");
        return (__force __sum16)sum;
}

/**
 * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum.
 * @saddr: source address
 * @daddr: destination address
 * @len: length of packet
 * @proto: ip protocol of packet
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Returns the pseudo header checksum of the input data. Result is
 * 32bit unfolded.
 *
 * @daddr, @saddr and (len + proto) << 8 are added to @sum with carry
 * propagation; the trailing "adcl $0" folds the last carry back in.
 * NOTE(review): the << 8 presumably accounts for network byte order on
 * little-endian x86 - confirm.
 */
static inline __wsum
csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
                   __u8 proto, __wsum sum)
{
        asm("  addl %1, %0\n"
            "  adcl %2, %0\n"
            "  adcl %3, %0\n"
            "  adcl $0, %0\n"
            : "=r" (sum)
            : "g" (daddr), "g" (saddr),
              "g" ((len + proto)<<8), "0" (sum));
        return sum;
}


/**
 * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum.
 * @saddr: source address
 * @daddr: destination address
 * @len: length of packet
 * @proto: ip protocol of packet
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Builds the unfolded pseudo header sum with csum_tcpudp_nofold() and then
 * passes it through csum_fold().
 *
 * Returns the 16bit pseudo header checksum of the input data, already
 * complemented and ready to be filled in.
 */
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
                                        __u32 len, __u8 proto,
                                        __wsum sum)
{
        __wsum unfolded = csum_tcpudp_nofold(saddr, daddr, len, proto, sum);

        return csum_fold(unfolded);
}

/**
 * csum_partial - Compute an internet checksum.
 * @buff: buffer to be checksummed
 * @len: length of buffer.
 * @sum: initial sum to be added in (32bit unfolded)
 *
 * Returns the 32bit unfolded internet checksum of the buffer.
 * Before filling it in it needs to be csum_fold()'ed.
 * buff should be aligned to a 64bit boundary if possible.
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

/* Do not call this directly. Use the wrappers below */
extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);

extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);

/**
 * ip_compute_csum - Compute an 16bit IP checksum.
 * @buff: buffer address.
 * @len: length of buffer.
 *
 * Returns the 16bit folded/inverted checksum of the passed buffer.
 * Ready to fill in.
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

/**
 * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
 * @saddr: source address
 * @daddr: destination address
 * @len: length of packet
 * @proto: protocol of packet
 * @sum: initial sum (32bit unfolded) to be added in
 *
 * Computes an IPv6 pseudo header checksum. This sum is added to the
 * checksum in UDP/TCP packets and contains some link layer information.
 * Returns a 16bit checksum (__sum16).
 * NOTE(review): this comment used to say "unfolded 32bit checksum", which
 * contradicts the declared return type. Presumably the result is folded
 * and complemented like csum_tcpudp_magic() - confirm against the
 * implementation.
 */

struct in6_addr;

#define _HAVE_ARCH_IPV6_CSUM 1
extern __sum16
csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
                __u32 len, __u8 proto, __wsum sum);

/*
 * Add two 32bit values with end-around carry: "addl" adds @b into @a, and
 * "adcl $0" then adds the carry flag produced by that addition back into
 * the result.  Returns the updated @a.
 */
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
        asm("addl %2,%0\n\t"
            "adcl $0,%0"
            : "=r" (a)
            : "0" (a), "rm" (b));
        return a;
}

#define HAVE_ARCH_CSUM_ADD
static inline __wsum csum_add(__wsum csum, __wsum addend)
{
        unsigned lhs = (__force unsigned)csum;
        unsigned rhs = (__force unsigned)addend;

        /* Delegate to add32_with_carry() on the raw unsigned values. */
        return (__force __wsum)add32_with_carry(lhs, rhs);
}

#endif /* _ASM_X86_CHECKSUM_64_H */











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_COMMON_H
#define _NF_CONNTRACK_COMMON_H

#include <linux/refcount.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>

/*
 * Connection tracking statistics: a flat set of unsigned int counters,
 * one per event named by the member.
 * NOTE(review): who increments each counter is not visible in this header.
 */
struct ip_conntrack_stat {
        unsigned int found;
        unsigned int invalid;
        unsigned int insert;
        unsigned int insert_failed;
        unsigned int clash_resolve;
        unsigned int drop;
        unsigned int early_drop;
        unsigned int error;
        unsigned int expect_new;
        unsigned int expect_create;
        unsigned int expect_delete;
        unsigned int search_restart;
};

/*
 * NFCT_INFOMASK selects the low three bits of an unsigned long;
 * NFCT_PTRMASK is its bitwise complement and selects all remaining bits.
 */
#define NFCT_INFOMASK        7UL
#define NFCT_PTRMASK        ~(NFCT_INFOMASK)

/* Reference-counted base object; holds only the refcount_t usage counter. */
struct nf_conntrack {
        refcount_t use;
};

void nf_conntrack_destroy(struct nf_conntrack *nfct);
/*
 * Drop one reference on @nfct.  A NULL @nfct is ignored.  The object is
 * handed to nf_conntrack_destroy() once refcount_dec_and_test() reports true.
 */
static inline void nf_conntrack_put(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        if (refcount_dec_and_test(&nfct->use))
                nf_conntrack_destroy(nfct);
}
/* Take one reference on @nfct.  A NULL @nfct is ignored. */
static inline void nf_conntrack_get(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        refcount_inc(&nfct->use);
}

#endif /* _NF_CONNTRACK_COMMON_H */







































    2 
























    2 











    2 





















































    1 





    1 



    1 



    1 



    1 









    1 






    1 





























    1 

    1 





    1 











    1 
























    1 



    1 

    1 

    1 
    1 
    1 
    1 
    1 






























    1 







    1 








    1 










    1 






















    1 



    1 


    1 
    1 



    1 














    1 



    1 





    1 








    1 






    1 


    1 






    1 

    1 


    1 






    1 
    1 



    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/attr.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  changes by Thomas Schoebel-Theuer
 */

#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>
#include <linux/evm.h>
#include <linux/ima.h>

#include "internal.h"

/**
 * setattr_should_drop_sgid - decide whether the setgid bit has to be removed
 * @inode:        inode to check
 *
 * Nothing is dropped when S_ISGID is not set.  When it is set, the bit is
 * dropped unconditionally if S_IXGRP is set as well (backwards
 * compatibility); otherwise it is dropped only when in_group_or_capable()
 * fails for the inode's group.
 *
 * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
 */
int setattr_should_drop_sgid(const struct inode *inode)
{
        const umode_t mode = inode->i_mode;

        if ((mode & S_ISGID) &&
            ((mode & S_IXGRP) || !in_group_or_capable(inode, inode->i_gid)))
                return ATTR_KILL_SGID;

        return 0;
}

/**
 * setattr_should_drop_suidgid - decide which set{g,u}id bits must be dropped
 * @inode:        inode to check
 *
 * Builds a mask out of ATTR_KILL_SUID (whenever S_ISUID is set) and the
 * result of setattr_should_drop_sgid().  The mask is only reported for
 * regular files and only when the caller lacks CAP_FSETID.
 *
 * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
 * to remove, 0 otherwise.
 */
int setattr_should_drop_suidgid(struct inode *inode)
{
        const umode_t mode = inode->i_mode;
        int mask = 0;

        /* suid always must be killed */
        if (unlikely(mode & S_ISUID))
                mask = ATTR_KILL_SUID;
        mask |= setattr_should_drop_sgid(inode);

        if (!mask)
                return 0;
        if (capable(CAP_FSETID))
                return 0;

        return S_ISREG(mode) ? mask : 0;
}
EXPORT_SYMBOL(setattr_should_drop_suidgid);

/*
 * May the caller set the owner of @inode to @uid?  Allowed when the caller
 * owns the inode and keeps the uid unchanged, when it has CAP_CHOWN with
 * respect to the inode, or when the inode's uid is INVALID_UID and the
 * caller has CAP_CHOWN in the superblock's user namespace.
 */
static bool chown_ok(const struct inode *inode, kuid_t uid)
{
        const bool owner_keeps_uid = uid_eq(current_fsuid(), inode->i_uid) &&
                                     uid_eq(uid, inode->i_uid);

        if (owner_keeps_uid)
                return true;
        if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
                return true;

        return uid_eq(inode->i_uid, INVALID_UID) &&
               ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN);
}

/*
 * May the caller set the group of @inode to @gid?  Allowed when the caller
 * owns the inode and either is in @gid or leaves the gid unchanged, when it
 * has CAP_CHOWN with respect to the inode, or when the inode's gid is
 * INVALID_GID and the caller has CAP_CHOWN in the superblock's user
 * namespace.
 */
static bool chgrp_ok(const struct inode *inode, kgid_t gid)
{
        if (uid_eq(current_fsuid(), inode->i_uid)) {
                if (in_group_p(gid) || gid_eq(gid, inode->i_gid))
                        return true;
        }
        if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
                return true;

        return gid_eq(inode->i_gid, INVALID_GID) &&
               ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN);
}

/**
 * setattr_prepare - check if attribute changes to a dentry are allowed
 * @dentry:        dentry to check
 * @attr:        attributes to change
 *
 * Check if we are allowed to change the attributes contained in @attr
 * in the given dentry.  This includes the normal unix access permission
 * checks, as well as checks for rlimits and others. The function also clears
 * SGID bit from mode if user is not allowed to set it. Also file capabilities
 * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
 *
 * Should be called as the first thing in ->setattr implementations,
 * possibly after taking additional locks.
 */
int setattr_prepare(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        unsigned int ia_valid = attr->ia_valid;

        /*
         * First check size constraints.  These can't be overriden using
         * ATTR_FORCE.
         */
        if (ia_valid & ATTR_SIZE) {
                int error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
        }

        /* If force is set do it anyway. */
        if (ia_valid & ATTR_FORCE)
                goto kill_priv;

        /* Make sure a caller can chown. */
        if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
                return -EPERM;

        /* Make sure caller can chgrp. */
        if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
                return -EPERM;

        /* Make sure a caller can chmod. */
        if (ia_valid & ATTR_MODE) {
                if (!inode_owner_or_capable(inode))
                        return -EPERM;
                /* Also check the setgid bit! */
                if (!in_group_or_capable(inode, (ia_valid & ATTR_GID) ?
                                                attr->ia_gid : inode->i_gid))
                        attr->ia_mode &= ~S_ISGID;
        }

        /* Check for setting the inode time. */
        if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
                if (!inode_owner_or_capable(inode))
                        return -EPERM;
        }

kill_priv:
        /* User has permission for the change */
        if (ia_valid & ATTR_KILL_PRIV) {
                int error;

                error = security_inode_killpriv(dentry);
                if (error)
                        return error;
        }

        return 0;
}
EXPORT_SYMBOL(setattr_prepare);

/**
 * inode_newsize_ok - may this inode be truncated to a given size
 * @inode:        the inode to be truncated
 * @offset:        the new size to assign to the inode
 *
 * inode_newsize_ok must be called with i_mutex held.
 *
 * When the file would grow, @offset is checked against RLIMIT_FSIZE (sending
 * SIGXFSZ to current on violation) and against the superblock's s_maxbytes.
 * When the size would stay the same or shrink, the change is refused for
 * swapfiles.  Caller must not proceed with the inode size change if failure
 * is returned.  @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
 *
 * Return: 0 on success, -ve errno on failure
 */
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
        unsigned long limit;

        if (offset < 0)
                return -EINVAL;

        if (inode->i_size >= offset) {
                /*
                 * truncation of in-use swapfiles is disallowed - it would
                 * cause subsequent swapout to scribble on the now-freed
                 * blocks.
                 */
                return IS_SWAPFILE(inode) ? -ETXTBSY : 0;
        }

        /* The file would grow: enforce the rlimit, then the fs limit. */
        limit = rlimit(RLIMIT_FSIZE);
        if (limit != RLIM_INFINITY && offset > limit) {
                send_sig(SIGXFSZ, current, 0);
                return -EFBIG;
        }
        if (offset > inode->i_sb->s_maxbytes)
                return -EFBIG;

        return 0;
}
EXPORT_SYMBOL(inode_newsize_ok);

/**
 * setattr_copy - copy simple metadata updates into the generic inode
 * @inode:        the inode to be updated
 * @attr:        the new attributes
 *
 * setattr_copy must be called with i_mutex held.
 *
 * setattr_copy updates the inode's metadata with that specified
 * in attr. Noticeably missing is inode size update, which is more complex
 * as it requires pagecache updates.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
void setattr_copy(struct inode *inode, const struct iattr *attr)
{
        unsigned int ia_valid = attr->ia_valid;

        if (ia_valid & ATTR_UID)
                inode->i_uid = attr->ia_uid;
        if (ia_valid & ATTR_GID)
                inode->i_gid = attr->ia_gid;
        if (ia_valid & ATTR_ATIME)
                inode->i_atime = attr->ia_atime;
        if (ia_valid & ATTR_MTIME)
                inode->i_mtime = attr->ia_mtime;
        if (ia_valid & ATTR_CTIME)
                inode->i_ctime = attr->ia_ctime;
        if (ia_valid & ATTR_MODE) {
                umode_t mode = attr->ia_mode;
                if (!in_group_or_capable(inode, inode->i_gid))
                        mode &= ~S_ISGID;
                inode->i_mode = mode;
        }
}
EXPORT_SYMBOL(setattr_copy);

/**
 * notify_change - modify attributes of a filesystem object
 * @dentry:        object affected
 * @attr:        new attributes
 * @delegated_inode: returns inode, if the inode is delegated
 *
 * The caller must hold the i_mutex on the affected object.
 *
 * If notify_change discovers a delegation in need of breaking,
 * it will return -EWOULDBLOCK and return a reference to the inode in
 * delegated_inode.  The caller should then break the delegation and
 * retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.  Also, passing NULL is fine for callers holding
 * the file open for write, as there can be no conflicting delegation in
 * that case.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        umode_t mode = inode->i_mode;
        int error;
        struct timespec64 now;
        unsigned int ia_valid = attr->ia_valid;

        WARN_ON_ONCE(!inode_is_locked(inode));

        /*
         * Mode, ownership and explicit timestamp changes are refused on
         * immutable and append-only inodes.
         */
        if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
                if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                        return -EPERM;
        }

        /*
         * If utimes(2) and friends are called with times == NULL (or both
         * times are UTIME_NOW), then we need to check for write permission
         */
        if (ia_valid & ATTR_TOUCH) {
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                if (!inode_owner_or_capable(inode)) {
                        error = inode_permission(inode, MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        if ((ia_valid & ATTR_MODE)) {
                /*
                 * Don't allow changing the mode of symlinks:
                 *
                 * (1) The vfs doesn't take the mode of symlinks into account
                 *     during permission checking.
                 * (2) This has never worked correctly. Most major filesystems
                 *     did return EOPNOTSUPP due to interactions with POSIX ACLs
                 *     but did still update the mode of the symlink.
                 *     This inconsistency led system call wrapper providers such
                 *     as libc to block changing the mode of symlinks with
                 *     EOPNOTSUPP already.
                 * (3) To even do this in the first place one would have to use
                 *     specific file descriptors and quite some effort.
                 */
                if (S_ISLNK(inode->i_mode))
                        return -EOPNOTSUPP;

                /* Flag setting protected by i_mutex */
                if (is_sxid(attr->ia_mode))
                        inode->i_flags &= ~S_NOSEC;
        }

        now = current_time(inode);

        /*
         * ctime is always set to now.  atime and mtime are set to now unless
         * the caller supplied explicit values (ATTR_ATIME_SET/ATTR_MTIME_SET),
         * in which case those values go through timestamp_truncate().
         */
        attr->ia_ctime = now;
        if (!(ia_valid & ATTR_ATIME_SET))
                attr->ia_atime = now;
        else
                attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
        if (!(ia_valid & ATTR_MTIME_SET))
                attr->ia_mtime = now;
        else
                attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);

        /*
         * A zero result from security_inode_need_killpriv() clears
         * ATTR_KILL_PRIV from both attr->ia_valid and the local copy; a
         * negative result is returned as the error.
         */
        if (ia_valid & ATTR_KILL_PRIV) {
                error = security_inode_need_killpriv(dentry);
                if (error < 0)
                        return error;
                if (error == 0)
                        ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
        }

        /*
         * We now pass ATTR_KILL_S*ID to the lower level setattr function so
         * that the function has the ability to reinterpret a mode change
         * that's due to these bits. This adds an implicit restriction that
         * no function will ever call notify_change with both ATTR_MODE and
         * ATTR_KILL_S*ID set.
         */
        if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
            (ia_valid & ATTR_MODE))
                BUG();

        /*
         * Turn the kill requests into a mode change: ATTR_MODE is added and
         * ia_mode is derived from the current i_mode with the respective
         * setid bit cleared.  "mode" is the i_mode sampled at function entry.
         */
        if (ia_valid & ATTR_KILL_SUID) {
                if (mode & S_ISUID) {
                        ia_valid = attr->ia_valid |= ATTR_MODE;
                        attr->ia_mode = (inode->i_mode & ~S_ISUID);
                }
        }
        if (ia_valid & ATTR_KILL_SGID) {
                if (mode & S_ISGID) {
                        if (!(ia_valid & ATTR_MODE)) {
                                ia_valid = attr->ia_valid |= ATTR_MODE;
                                attr->ia_mode = inode->i_mode;
                        }
                        attr->ia_mode &= ~S_ISGID;
                }
        }
        /* Nothing to do if no flag other than the KILL_S*ID ones is set. */
        if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
                return 0;

        /*
         * Verify that uid/gid changes are valid in the target
         * namespace of the superblock.
         */
        if (ia_valid & ATTR_UID &&
            !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
                return -EOVERFLOW;
        if (ia_valid & ATTR_GID &&
            !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
                return -EOVERFLOW;

        /* Don't allow modifications of files with invalid uids or
         * gids unless those uids & gids are being made valid.
         */
        if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
                return -EOVERFLOW;
        if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
                return -EOVERFLOW;

        error = security_inode_setattr(dentry, attr);
        if (error)
                return error;
        error = try_break_deleg(inode, delegated_inode);
        if (error)
                return error;

        /* Use the inode's own ->setattr if it has one, else simple_setattr(). */
        if (inode->i_op->setattr)
                error = inode->i_op->setattr(dentry, attr);
        else
                error = simple_setattr(dentry, attr);

        /* The fsnotify, IMA and EVM hooks run only after a successful setattr. */
        if (!error) {
                fsnotify_change(dentry, ia_valid);
                ima_inode_post_setattr(dentry);
                evm_inode_post_setattr(dentry, ia_valid);
        }

        return error;
}
EXPORT_SYMBOL(notify_change);






















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Connection state tracking for netfilter.  This is separated from,
 * but required by, the (future) NAT layer; it can also be used by an iptables
 * extension.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack.h
 */

#ifndef _NF_CONNTRACK_H
#define _NF_CONNTRACK_H

#include <linux/bitops.h>
#include <linux/compiler.h>

#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/nf_conntrack_dccp.h>
#include <linux/netfilter/nf_conntrack_sctp.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>

#include <net/netfilter/nf_conntrack_tuple.h>

/*
 * UDP member of union nf_conntrack_proto.
 * NOTE(review): stream_ts is presumably a timestamp; its unit and its
 * writers are not visible in this header - confirm.
 */
struct nf_ct_udp {
        unsigned long        stream_ts;
};

/*
 * per conntrack: protocol private data
 *
 * One member per supported protocol; being a union, they share storage.
 * NOTE(review): the purpose of tmpl_padto is not evident from this header.
 */
union nf_conntrack_proto {
        /* insert conntrack proto private data here */
        struct nf_ct_dccp dccp;
        struct ip_ct_sctp sctp;
        struct ip_ct_tcp tcp;
        struct nf_ct_udp udp;
        struct nf_ct_gre gre;
        unsigned int tmpl_padto;
};

/* Per expectation protocol private data; currently has no members. */
union nf_conntrack_expect_proto {
        /* insert expect proto private data here */
};

/*
 * Three unsigned int counters.
 * NOTE(review): by their names these count IPv4, IPv6 and bridge users;
 * the code maintaining them is not visible here - confirm.
 */
struct nf_conntrack_net {
        unsigned int users4;
        unsigned int users6;
        unsigned int users_bridge;
};

#include <linux/types.h>
#include <linux/skbuff.h>

#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>

/*
 * A tracked connection.  Several members exist only under the
 * corresponding CONFIG_* option, and @proto must stay the last member.
 */
struct nf_conn {
        /* Usage count in here is 1 for hash table, 1 per skb,
         * plus 1 for any connection(s) we are `master' for
         *
         * Hint: an skb refers to this struct and its refcount via
         * skb->_nfct and the helpers nf_conntrack_get() and
         * nf_conntrack_put().  nf_ct_put() drops a reference just like
         * nf_conntrack_put(); beware that nf_ct_get() is different and
         * does not take a reference.
         */
        struct nf_conntrack ct_general;

        spinlock_t        lock;
        /* jiffies32 when this ct is considered dead */
        u32 timeout;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        struct nf_conntrack_zone zone;
#endif
        /* XXX should I move this to the tail ? - Y.K */
        /* These are my tuples; original and reply */
        struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

        /* Have we seen traffic both ways yet? (bitset) */
        unsigned long status;

        u16                cpu;
        possible_net_t ct_net;

#if IS_ENABLED(CONFIG_NF_NAT)
        struct hlist_node        nat_bysource;
#endif
        /* all members below initialized via memset */
        struct { } __nfct_init_offset;

        /* If we were expected by an expectation, this will be it */
        struct nf_conn *master;

#if defined(CONFIG_NF_CONNTRACK_MARK)
        u_int32_t mark;
#endif

#ifdef CONFIG_NF_CONNTRACK_SECMARK
        u_int32_t secmark;
#endif

        /* Extensions */
        struct nf_ct_ext *ext;

        /* Storage reserved for other modules, must be the last member */
        union nf_conntrack_proto proto;
};

/*
 * Map a tuple hash entry back to the nf_conn that embeds it.  The entry's
 * own tuple.dst.dir selects which tuplehash[] slot it occupies.
 */
static inline struct nf_conn *
nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
{
        struct nf_conn *ct = container_of(hash, struct nf_conn,
                                          tuplehash[hash->tuple.dst.dir]);

        return ct;
}

static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct)
{
        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
}

static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct)
{
        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
}

/* Address of the tuple stored in @ct's tuplehash[] slot for direction @dir. */
#define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)

/*
 * get master conntrack via master expectation
 *
 * @conntr is parenthesized so that expression arguments such as
 * master_ct(ct + 1) or master_ct(cond ? a : b) expand correctly.
 */
#define master_ct(conntr) ((conntr)->master)

extern struct net init_net;

static inline struct net *nf_ct_net(const struct nf_conn *ct)
{
        return read_pnet(&ct->ct_net);
}

/* Alter reply tuple (maybe alter helper). */
void nf_conntrack_alter_reply(struct nf_conn *ct,
                              const struct nf_conntrack_tuple *newreply);

/* Is this tuple taken? (ignoring any belonging to the given
   conntrack). */
int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
                             const struct nf_conn *ignored_conntrack);

/*
 * Split the value returned by skb_get_nfct() into its two parts: the bits
 * under NFCT_INFOMASK are stored in *@ctinfo, the bits under NFCT_PTRMASK
 * are returned as the conntrack pointer.  No reference is taken.
 */
static inline struct nf_conn *
nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        const unsigned long packed = skb_get_nfct(skb);
        struct nf_conn *ct = (struct nf_conn *)(packed & NFCT_PTRMASK);

        *ctinfo = packed & NFCT_INFOMASK;
        return ct;
}

/* Drop one reference on @ct; warns if @ct is NULL. */
static inline void nf_ct_put(struct nf_conn *ct)
{
        struct nf_conntrack *base;

        WARN_ON(!ct);
        base = &ct->ct_general;
        nf_conntrack_put(base);
}

/* Protocol module loading */
int nf_ct_l3proto_try_module_get(unsigned short l3proto);
void nf_ct_l3proto_module_put(unsigned short l3proto);

/* load module; enable/disable conntrack in this namespace */
int nf_ct_netns_get(struct net *net, u8 nfproto);
void nf_ct_netns_put(struct net *net, u8 nfproto);

/*
 * Allocate a hashtable of hlist_head (if nulls == 0),
 * or hlist_nulls_head (if nulls == 1)
 */
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls);

int nf_conntrack_hash_check_insert(struct nf_conn *ct);
bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
                       u_int16_t l3num, struct net *net,
                       struct nf_conntrack_tuple *tuple);

void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                          const struct sk_buff *skb,
                          u32 extra_jiffies, bool do_acct);

/* Refresh @ct for @extra_jiffies with accounting enabled. */
static inline void nf_ct_refresh_acct(struct nf_conn *ct,
                                      enum ip_conntrack_info ctinfo,
                                      const struct sk_buff *skb,
                                      u32 extra_jiffies)
{
        const bool do_acct = true;

        __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, do_acct);
}

/* Refresh @ct for @extra_jiffies without accounting; ctinfo is passed as 0. */
static inline void nf_ct_refresh(struct nf_conn *ct,
                                 const struct sk_buff *skb,
                                 u32 extra_jiffies)
{
        const bool do_acct = false;

        __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, do_acct);
}

/* kill conntrack and do accounting */
bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     const struct sk_buff *skb);

/* kill conntrack without accounting */
static inline bool nf_ct_kill(struct nf_conn *ct)
{
        return nf_ct_delete(ct, 0, 0);
}

/* Set all unconfirmed conntrack as dying */
void nf_ct_unconfirmed_destroy(struct net *);

/* Iterate over all conntracks: if iter returns true, it's deleted. */
void nf_ct_iterate_cleanup_net(struct net *net,
                               int (*iter)(struct nf_conn *i, void *data),
                               void *data, u32 portid, int report);

/* also set unconfirmed conntracks as dying. Only use in module exit path. */
void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data),
                           void *data);

struct nf_conntrack_zone;

void nf_conntrack_free(struct nf_conn *ct);
struct nf_conn *nf_conntrack_alloc(struct net *net,
                                   const struct nf_conntrack_zone *zone,
                                   const struct nf_conntrack_tuple *orig,
                                   const struct nf_conntrack_tuple *repl,
                                   gfp_t gfp);

/* Non-zero if IPS_TEMPLATE_BIT is set in @ct's status word. */
static inline int nf_ct_is_template(const struct nf_conn *ct)
{
        const unsigned long *status = &ct->status;

        return test_bit(IPS_TEMPLATE_BIT, status);
}

/*
 * Non-zero if IPS_CONFIRMED_BIT is set in @ct's status word, i.e. the
 * conntrack is, or has been, in the hash table.
 */
static inline int nf_ct_is_confirmed(const struct nf_conn *ct)
{
        const unsigned long *status = &ct->status;

        return test_bit(IPS_CONFIRMED_BIT, status);
}

/* Non-zero if IPS_DYING_BIT is set in @ct's status word. */
static inline int nf_ct_is_dying(const struct nf_conn *ct)
{
        const unsigned long *status = &ct->status;

        return test_bit(IPS_DYING_BIT, status);
}

/* Packet is received from loopback */
static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
{
        return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK;
}

/* Current jiffies value truncated to 32 bits. */
#define nfct_time_stamp ((u32)(jiffies))

/*
 * Jiffies left until @ct expires, or 0 if it already has.  The difference
 * is evaluated as a signed 32bit value.
 */
static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
{
        const s32 remaining = READ_ONCE(ct->timeout) - nfct_time_stamp;

        if (remaining <= 0)
                return 0;

        return remaining;
}

static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
        return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0;
}

/* use after obtaining a reference count */
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
        return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
               !nf_ct_is_dying(ct);
}

/* 86400 * HZ, i.e. one day's worth of seconds scaled by HZ. */
#define        NF_CT_DAY        (86400 * HZ)

/* Set an arbitrary timeout large enough not to ever expire; this saves
 * us a check for the IPS_OFFLOAD_BIT from the packet path via
 * nf_ct_is_expired().  The timeout is only pushed out to a full
 * NF_CT_DAY when less than half of that remains.
 */
static inline void nf_ct_offload_timeout(struct nf_conn *ct)
{
        if (nf_ct_expires(ct) >= NF_CT_DAY / 2)
                return;

        WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY);
}

struct kernel_param;

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
int nf_conntrack_hash_resize(unsigned int hashsize);

extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
extern unsigned int nf_conntrack_max;

/*
 * must be called with rcu read lock held
 *
 * Read nf_conntrack_hash and nf_conntrack_htable_size as a pair under the
 * nf_conntrack_generation seqcount, retrying until the read section is
 * not invalidated, then store them in *@hash and *@hsize.
 */
static inline void
nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
{
        struct hlist_nulls_head *table;
        unsigned int seq, size;

        for (;;) {
                seq = read_seqcount_begin(&nf_conntrack_generation);
                size = nf_conntrack_htable_size;
                table = nf_conntrack_hash;
                if (!read_seqcount_retry(&nf_conntrack_generation, seq))
                        break;
        }

        *hash = table;
        *hsize = size;
}

struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags);
void nf_ct_tmpl_free(struct nf_conn *tmpl);

u32 nf_ct_get_id(const struct nf_conn *ct);

/*
 * Attach @ct to @skb: the pointer value and @info are OR-ed into a single
 * unsigned long, which is passed to skb_set_nfct().
 */
static inline void
nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
{
        unsigned long packed = (unsigned long)ct;

        packed |= info;
        skb_set_nfct(skb, packed);
}

/*
 * Update the counter named @count reached through (net)->ct.stat.
 * NF_CT_STAT_INC uses __this_cpu_inc(); the _ATOMIC variants use
 * this_cpu_inc() and this_cpu_add() instead.
 */
#define NF_CT_STAT_INC(net, count)          __this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))

/*
 * Declare a module alias of the form "nfct-helper-<helper>".  @helper must
 * be a string literal, as it is joined by literal concatenation.
 */
#define MODULE_ALIAS_NFCT_HELPER(helper) \
        MODULE_ALIAS("nfct-helper-" helper)

#endif /* _NF_CONNTRACK_H */






































































































    1 
    1 




    1 

    1 

    1 

    1 




    1 



    1 








    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/drivers/base/map.c
 *
 * (C) Copyright Al Viro 2002,2003
 *
 * NOTE: data structure needs to be changed.  It works, but for large dev_t
 * it will be too slow.  It is isolated, though, so these changes will be
 * local to that file.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/kdev_t.h>
#include <linux/kobject.h>
#include <linux/kobj_map.h>

/*
 * Device-number map: 255 singly linked lists of probe entries.  The list
 * for a device number is selected with MAJOR(dev) % 255, and kobj_map()
 * inserts new entries so that each list stays ordered by ascending ->range.
 */
struct kobj_map {
        struct probe {
                struct probe *next;             /* next entry in the same list */
                dev_t dev;                      /* first device number covered */
                unsigned long range;            /* number of device numbers covered */
                struct module *owner;           /* pinned with try_module_get() around ->get() */
                kobj_probe_t *get;              /* callback that produces the kobject */
                int (*lock)(dev_t, void *);     /* optional; a negative return skips the entry */
                void *data;                     /* opaque argument for ->lock() and ->get() */
        } *probes[255];
        struct mutex *lock;                     /* caller-supplied; guards the probes[] lists */
};

/*
 * kobj_map - register the device number range [@dev, @dev + @range) in @domain
 *
 * One probe entry is created for every major number the range touches
 * (capped at 255, the number of lists in the map); all entries come from a
 * single kmalloc_array() allocation and carry identical contents.  Under
 * domain->lock each entry is linked into the list for its major, in front
 * of the first existing entry whose range is not smaller than @range.
 *
 * Returns 0 on success or -ENOMEM if the entries cannot be allocated.
 */
int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
             struct module *module, kobj_probe_t *probe,
             int (*lock)(dev_t, void *), void *data)
{
        unsigned first_major = MAJOR(dev);
        unsigned count = MAJOR(dev + range - 1) - first_major + 1;
        struct probe *entries;
        unsigned i;

        if (count > 255)
                count = 255;

        entries = kmalloc_array(count, sizeof(*entries), GFP_KERNEL);
        if (!entries)
                return -ENOMEM;

        for (i = 0; i < count; i++) {
                struct probe *e = &entries[i];

                e->owner = module;
                e->get = probe;
                e->lock = lock;
                e->dev = dev;
                e->range = range;
                e->data = data;
        }

        mutex_lock(domain->lock);
        for (i = 0; i < count; i++) {
                struct probe **link = &domain->probes[(first_major + i) % 255];

                /* keep the list sorted: skip entries with a smaller range */
                while (*link && (*link)->range < range)
                        link = &(*link)->next;
                entries[i].next = *link;
                *link = &entries[i];
        }
        mutex_unlock(domain->lock);

        return 0;
}

/*
 * kobj_unmap - remove the entries registered for (@dev, @range) from @domain
 *
 * Walks the same lists kobj_map() would have used for this range and
 * unlinks, from each, the first entry whose ->dev and ->range both match.
 * Only the first entry unlinked overall is passed to kfree(): kobj_map()
 * allocates all entries of one registration as a single array, and the
 * entry for the first major is the start of that array.
 */
void kobj_unmap(struct kobj_map *domain, dev_t dev, unsigned long range)
{
        unsigned first_major = MAJOR(dev);
        unsigned count = MAJOR(dev + range - 1) - first_major + 1;
        struct probe *first_unlinked = NULL;
        unsigned i;

        if (count > 255)
                count = 255;

        mutex_lock(domain->lock);
        for (i = 0; i < count; i++) {
                struct probe **link = &domain->probes[(first_major + i) % 255];

                while (*link) {
                        struct probe *cur = *link;

                        if (cur->dev != dev || cur->range != range) {
                                link = &cur->next;
                                continue;
                        }

                        *link = cur->next;
                        if (!first_unlinked)
                                first_unlinked = cur;
                        break;
                }
        }
        mutex_unlock(domain->lock);

        kfree(first_unlinked);
}

/*
 * kobj_lookup - find the kobject responsible for device number @dev
 *
 * Scans the list for MAJOR(dev) % 255 under domain->lock.  For the first
 * suitable entry that covers @dev, *@index is set to the offset of @dev
 * inside the entry's range, the lock is dropped and the entry's ->get()
 * callback is invoked.  A non-NULL result is returned to the caller; on
 * NULL the scan is restarted from the head of the list, this time only
 * considering entries with a range smaller than the one just tried
 * (tracked in @best).
 *
 * Returns the kobject produced by a probe, or NULL if no entry yields one.
 */
struct kobject *kobj_lookup(struct kobj_map *domain, dev_t dev, int *index)
{
        struct kobject *kobj;
        struct probe *p;
        /* range - 1 of the entry tried last; ~0UL means nothing tried yet */
        unsigned long best = ~0UL;

retry:
        mutex_lock(domain->lock);
        for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) {
                struct kobject *(*probe)(dev_t, int *, void *);
                struct module *owner;
                void *data;

                /* entry does not cover dev */
                if (p->dev > dev || p->dev + p->range - 1 < dev)
                        continue;
                /*
                 * Lists are ordered by ascending range, so once an entry is
                 * no narrower than the one already tried, none that follow
                 * can be either.
                 */
                if (p->range - 1 >= best)
                        break;
                if (!try_module_get(p->owner))
                        continue;
                /* copy what is needed from *p before the lock is dropped */
                owner = p->owner;
                data = p->data;
                probe = p->get;
                best = p->range - 1;
                *index = dev - p->dev;
                if (p->lock && p->lock(dev, data) < 0) {
                        module_put(owner);
                        continue;
                }
                mutex_unlock(domain->lock);
                kobj = probe(dev, index, data);
                /* Currently ->owner protects _only_ ->probe() itself. */
                module_put(owner);
                if (kobj)
                        return kobj;
                goto retry;
        }
        mutex_unlock(domain->lock);
        return NULL;
}

/*
 * kobj_map_init - allocate a map whose every list holds one catch-all entry
 *
 * The catch-all entry starts at device number 1, has the largest possible
 * range and uses @base_probe as its ->get() callback; all 255 lists point
 * at this single entry.  @lock is stored as the mutex that guards the map.
 *
 * Returns the new map, or NULL if either allocation fails.
 */
struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct mutex *lock)
{
        struct kobj_map *map = kmalloc(sizeof(*map), GFP_KERNEL);
        struct probe *fallback = kzalloc(sizeof(*fallback), GFP_KERNEL);
        int slot;

        if (!map || !fallback) {
                /* kfree(NULL) is a no-op, so both can be freed blindly */
                kfree(map);
                kfree(fallback);
                return NULL;
        }

        fallback->dev = 1;
        fallback->range = ~0;
        fallback->get = base_probe;

        for (slot = 0; slot < 255; slot++)
                map->probes[slot] = fallback;
        map->lock = lock;

        return map;
}











































































































































































































































































































































































































































































































































































































    1 
































































































    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal handler table with one k_sigaction slot per signal number
 * (_NSIG entries).  ->siglock is the spinlock the helpers further down in
 * this header take around signal dequeueing and job-control state changes;
 * ->count is a reference count.
 */
struct sighand_struct {
        spinlock_t                siglock;
        refcount_t                count;
        wait_queue_head_t        signalfd_wqh;
        struct k_sigaction        action[_NSIG];
};

/*
 * Per-process accounting stats:
 * a flag word, an exit code, a memory figure, user/system time and
 * minor/major fault counts.  Units are not visible from this header.
 */
struct pacct_struct {
        int                        ac_flag;
        long                        ac_exitcode;
        unsigned long                ac_mem;
        u64                        ac_utime, ac_stime;
        unsigned long                ac_minflt, ac_majflt;
};

/*
 * State of one CPU interval timer; signal_struct keeps two of these in
 * ->it[].  Presumably @expires is the next expiry and @incr the reload
 * interval -- confirm against the itimer code.
 */
struct cpu_itimer {
        u64 expires;
        u64 incr;
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 * Each of the three counters is an independent atomic64_t.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/* Compound literal yielding a task_cputime_atomic with all counters at 0. */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure wraps the atomic variant of task_cputime (struct
 * task_cputime_atomic) that is used for thread group CPU timer
 * calculations.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set plus an hlist linkage.  signal_struct has an hlist head
 * named ->multiprocess, commented "For collecting multiprocess signals
 * during fork"; presumably these nodes are chained there -- confirm against
 * the fork code.
 */
struct multiprocess_signals {
        sigset_t signal;
        struct hlist_node node;
};

/*
 * State shared by all threads of a thread group; reached through
 * task->signal by the accessors in this header.
 *
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        int                        nr_threads;
        struct list_head        thread_head;

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* overloaded:
         * - notify group_exit_task when ->count is equal to notify_count
         * - everyone except group_exit_task is stopped during signal delivery
         *   of fatal signals, group_exit_task processes the signal.
         */
        int                        notify_count;
        struct task_struct        *group_exit_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                next_posix_timer_id;
        struct list_head        posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /* PID/PID hash table linkage. */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time for dead threads in the
         * group, not including a zombie group leader.  (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably ptrace).
                                         * Deprecated, do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
#define SIGNAL_GROUP_COREDUMP        0x00000008 /* coredump in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/* The set of bits that signal_set_stop_flags() clears before OR-ing in new ones. */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the stop-related bits (SIGNAL_STOP_MASK) of sig->flags with
 * @flags, leaving every other bit alone.  Warns if a group exit or a
 * coredump is flagged as in progress.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int kept;

        WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));

        kept = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = kept | flags;
}

/*
 * If true, all threads except ->group_exit_task have pending SIGKILL.
 * Returns 1 when SIGNAL_GROUP_EXIT is set or ->group_exit_task is non-NULL,
 * 0 otherwise.
 */
static inline int signal_group_exit(const struct signal_struct *sig)
{
        if (sig->flags & SIGNAL_GROUP_EXIT)
                return 1;

        return sig->group_exit_task != NULL;
}

/*
 * Defined outside this header.  kernel_dequeue_signal() below calls
 * dequeue_signal() with sighand->siglock held and interrupts disabled.
 */
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *task,
                          sigset_t *mask, kernel_siginfo_t *info);

/*
 * Dequeue one signal for the current task using its own blocked mask.
 * The call is made with sighand->siglock held and interrupts disabled;
 * the siginfo filled in by dequeue_signal() is discarded and only its
 * return value is passed on.
 */
static inline int kernel_dequeue_signal(void)
{
        struct task_struct *tsk = current;
        kernel_siginfo_t discarded;
        int signr;

        spin_lock_irq(&tsk->sighand->siglock);
        signr = dequeue_signal(tsk, &tsk->blocked, &discarded);
        spin_unlock_irq(&tsk->sighand->siglock);

        return signr;
}

/*
 * Put the current task into TASK_STOPPED if JOBCTL_STOP_DEQUEUED is set in
 * its jobctl word, then call schedule().  The flag test and the state
 * change are done under sighand->siglock; schedule() is called after the
 * lock has been dropped, whether or not the state was changed.
 */
static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED)
                set_special_state(TASK_STOPPED);
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
}
/*
 * Helpers for splicing architecture-specific parameters into the fault
 * signal prototypes below.  Each expands to ", <args>" when the matching
 * symbol is defined and to nothing otherwise, which is why the prototypes
 * place them right after a parameter without a comma of their own.
 */
#ifdef __ARCH_SI_TRAPNO
# define ___ARCH_SI_TRAPNO(_a1) , _a1
#else
# define ___ARCH_SI_TRAPNO(_a1)
#endif
#ifdef __ia64__
# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
#else
# define ___ARCH_SI_IA64(_a1, _a2, _a3)
#endif

/*
 * Fault signal helpers.  The parameter lists depend on the architecture:
 * a trapno argument is present only when __ARCH_SI_TRAPNO is defined, and
 * the imm/flags/isr triple only on ia64 (see the ___ARCH_SI_* macros).
 * Only the prototypes are visible here.
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
int send_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);

/*
 * General signal sending, process-group/pid killing and sigqueue
 * management entry points; all are defined outside this header.
 */
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Clear TIF_NOTIFY_SIGNAL on the current thread and follow the clear with
 * smp_mb__after_atomic(), so the flag update is ordered before whatever
 * the caller does next.
 */
static inline void clear_notify_signal(void)
{
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
}

/*
 * Mark the current task as having a signal pending and hand back
 * -ERESTARTNOINTR for the caller to return.
 */
static inline int restart_syscall(void)
{
        struct task_struct *self = current;

        set_tsk_thread_flag(self, TIF_SIGPENDING);

        return -ERESTARTNOINTR;
}

/* Non-zero if TIF_SIGPENDING is set for @p; hinted as the unlikely case. */
static inline int task_sigpending(struct task_struct *p)
{
        int flagged = test_tsk_thread_flag(p, TIF_SIGPENDING);

        return unlikely(flagged);
}

/*
 * Non-zero if @p has either TIF_NOTIFY_SIGNAL or TIF_SIGPENDING set.
 *
 * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
 * behavior in terms of ensuring that we break out of wait loops
 * so that notify signal callbacks can be processed.
 */
static inline int signal_pending(struct task_struct *p)
{
        return unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)) ?
                1 : task_sigpending(p);
}

/*
 * Non-zero if SIGKILL is a member of @p's private pending set.  Unlike
 * fatal_signal_pending() this does not look at TIF_SIGPENDING first.
 */
static inline int __fatal_signal_pending(struct task_struct *p)
{
        int killed = sigismember(&p->pending.signal, SIGKILL);

        return unlikely(killed);
}

/*
 * Returns 1 if @p has TIF_SIGPENDING set and SIGKILL is in its private
 * pending set, 0 otherwise.  The SIGKILL test is skipped when
 * TIF_SIGPENDING is clear.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return __fatal_signal_pending(p) != 0;
}

/*
 * Decide whether a task sleeping in @state should be interrupted by what
 * is pending for @p.
 *
 * Returns 0 if @state contains neither TASK_INTERRUPTIBLE nor
 * TASK_WAKEKILL, or if signal_pending() is false.  Otherwise returns 1 for
 * an interruptible state, and for a TASK_WAKEKILL-only state returns 1
 * only when SIGKILL is pending.
 */
static inline int signal_pending_state(long state, struct task_struct *p)
{
        if ((state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)) &&
            signal_pending(p)) {
                if (state & TASK_INTERRUPTIBLE)
                        return 1;
                return __fatal_signal_pending(p) != 0;
        }

        return 0;
}

/*
 * This should only be used in fault handlers to decide whether we
 * should stop the current fault routine to handle the signals
 * instead, especially with the case where we've got interrupted with
 * a VM_FAULT_RETRY.
 *
 * True only when @fault_flags has VM_FAULT_RETRY set and either a fatal
 * signal is pending for current, or @regs describe user mode and any
 * signal is pending.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        if (likely(!(fault_flags & VM_FAULT_RETRY)))
                return false;

        if (fatal_signal_pending(current))
                return true;

        return user_mode(regs) && signal_pending(current);
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * Callers must hold sighand->siglock.
 */
extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

/*
 * Defined elsewhere; signal_wake_up() and ptrace_signal_wake_up() below
 * pass either one extra task-state bit or 0 as @state.
 */
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Call signal_wake_up_state() for @t, passing TASK_WAKEKILL as the state
 * when @resume is true and 0 otherwise.
 */
static inline void signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;

        if (resume)
                state = TASK_WAKEKILL;

        signal_wake_up_state(t, state);
}
/*
 * Call signal_wake_up_state() for @t, passing __TASK_TRACED as the state
 * when @resume is true and 0 otherwise.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;

        if (resume)
                state = __TASK_TRACED;

        signal_wake_up_state(t, state);
}

/*
 * Defined elsewhere.  Presumably makes @task take part in a group stop
 * that is already in progress -- confirm against the definition.
 */
void task_join_group_stop(struct task_struct *task);

/*
 * "Restore the saved signal mask" marker.  Two interchangeable
 * implementations of the same six accessors follow: one backed by the
 * TIF_RESTORE_SIGMASK thread flag, one backed by the task's
 * ->restore_sigmask field.
 */
#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors.  These are inefficient on
 * SMP architectures because they require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
 * will run before returning to user mode, to process the flag.  For
 * all callers, TIF_SIGPENDING is already set or it's no harm to set
 * it.  TIF_RESTORE_SIGMASK need not be in the set of bits that the
 * arch code will notice on return to user mode, in case those bits
 * are scarce.  We set TIF_SIGPENDING here to ensure that the arch
 * signal code always gets run when TIF_RESTORE_SIGMASK is set.
 *
 * NOTE(review): the body below only sets TIF_RESTORE_SIGMASK and does not
 * touch TIF_SIGPENDING, so the last sentence above looks stale -- confirm
 * and update.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear the marker on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear the marker on the current thread. */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}
/* True if the marker is set on @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
/* True if the marker is set on the current thread. */
static inline bool test_restore_sigmask(void)
{
        return test_thread_flag(TIF_RESTORE_SIGMASK);
}
/* Clear the marker on the current thread and report whether it was set. */
static inline bool test_and_clear_restore_sigmask(void)
{
        return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}

#else        /* TIF_RESTORE_SIGMASK */

/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}
/* Clear the marker on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}
/* Clear the marker on the current task. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}
/* True if the marker is set on the current task. */
static inline bool test_restore_sigmask(void)
{
        return current->restore_sigmask;
}
/* True if the marker is set on @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return task->restore_sigmask;
}
/* Clear the marker on the current task and report whether it was set. */
static inline bool test_and_clear_restore_sigmask(void)
{
        if (!current->restore_sigmask)
                return false;
        current->restore_sigmask = false;
        return true;
}
#endif

/*
 * If the restore-sigmask marker is set for the current task, clear it and
 * make current->saved_sigmask the blocked mask again.  Does nothing when
 * the marker is not set.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

/*
 * Defined elsewhere.  Takes a userspace signal set pointer and its size;
 * presumably installs it as the temporary blocked mask that
 * restore_saved_sigmask() later undoes -- TODO confirm.
 */
extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved signal mask unless the caller was @interrupted.  In
 * the interrupted case nothing is restored here; the function only warns
 * if no signal is actually pending for current.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick the signal mask of the current task that should be saved: the
 * saved mask while the restore-sigmask marker is set, the blocked mask
 * otherwise.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/* Shorthand for kill_pid() on the global cad_pid; returns kill_pid()'s result. */
static inline int kill_cad_pid(int sig, int priv)
{
        return kill_pid(cad_pid, sig, priv);
}

/*
 * These can be the second arg to send_sig_info/send_group_sig_info.
 * They are sentinel pointer values (0 and 1), not pointers to real
 * kernel_siginfo objects, and must never be dereferenced.
 */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
        return sp >= current->sas_ss_sp &&
                sp - current->sas_ss_sp < current->sas_ss_size;
#else
        return sp > current->sas_ss_sp &&
                sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.
 *
 * If the signal stack is SS_AUTODISARM then, by construction, we
 * can't be on the signal stack unless user code deliberately set
 * SS_AUTODISARM when we were already on it, so 0 is returned without
 * looking at @sp.
 *
 * This improves reliability: if user state gets corrupted such that
 * the stack pointer points very close to the end of the signal stack,
 * then this check will enable the signal to be handled anyway.
 */
static inline int on_sig_stack(unsigned long sp)
{
        return (current->sas_ss_flags & SS_AUTODISARM) ?
                0 : __on_sig_stack(sp);
}

/*
 * Alternate signal stack status for stack pointer @sp: SS_DISABLE when the
 * current task has no alternate stack (size 0), SS_ONSTACK when @sp is on
 * it, 0 otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/*
 * Drop @p's alternate signal stack settings: base and size are zeroed and
 * the flags are set to SS_DISABLE.
 */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_sp = 0;
        p->sas_ss_size = 0;
        p->sas_ss_flags = SS_DISABLE;
}

/*
 * Choose the stack pointer for delivering @ksig.
 *
 * @sp is returned unchanged unless the handler asked for SA_ONSTACK and
 * sas_ss_flags(@sp) is 0, i.e. an alternate stack is configured and @sp is
 * not already on it.  In that case the start of the alternate stack is
 * returned: its base with CONFIG_STACK_GROWSUP, base + size otherwise.
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        if (likely(!(ksig->ka.sa.sa_flags & SA_ONSTACK)))
                return sp;

        if (sas_ss_flags(sp))
                return sp;

#ifdef CONFIG_STACK_GROWSUP
        return current->sas_ss_sp;
#else
        return current->sas_ss_sp + current->sas_ss_size;
#endif
}

/* Defined outside this header. */
extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when no task other than init_task is linked on the ->tasks list. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task following @p on the ->tasks list, fetched with list_entry_rcu(). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over every task linked after init_task on the ->tasks list;
 * init_task itself is never visited.
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

/* Defined outside this header. */
extern bool current_is_single_threaded(void);

/*
 * Careful: do_each_thread/while_each_thread is a double loop so
 *          'break' will not work as expected - use goto instead.
 *
 * do_each_thread() opens a "do" statement that the matching
 * while_each_thread() closes, so the two must always be used as a pair
 * around one loop body.
 */
#define do_each_thread(g, t) \
        for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do

#define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)

/* Iterate @t over the tasks linked on @signal's ->thread_head list. */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)

/* Iterate @t over the threads sharing @p's signal_struct. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

/*
 * Callback type for walk_process_tree(): receives a task and the opaque
 * pointer given to the walk.  The meaning of the int return value is
 * decided by walk_process_tree(), which is defined elsewhere.
 */
typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Return the struct pid of kind @type for @task.  PIDTYPE_PID is answered
 * by task_pid(); every other type is read from the shared signal_struct's
 * pids[] array.
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return task_pid(task);

        return task->signal->pids[type];
}

static inline struct pid *task_tgid(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_PGID];
}

static inline struct pid *task_session(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_SID];
}

static inline int get_nr_threads(struct task_struct *task)
{
        return task->signal->nr_threads;
}

/* A task counts as thread group leader when its exit_signal is >= 0. */
static inline bool thread_group_leader(struct task_struct *p)
{
        return p->exit_signal >= 0;
}

/* Two tasks are in the same thread group when they share one signal_struct. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        return p1->signal == p2->signal;
}

/*
 * The task following @p on its ->thread_group list, fetched with
 * list_entry_rcu().  The list is walked circularly by while_each_thread().
 */
static inline struct task_struct *next_thread(const struct task_struct *p)
{
        return list_entry_rcu(p->thread_group.next,
                              struct task_struct, thread_group);
}

/* Non-zero when @p's ->thread_group list has no other entries. */
static inline int thread_group_empty(struct task_struct *p)
{
        return list_empty(&p->thread_group);
}

/* True for a thread group leader whose ->thread_group list is not empty. */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

/* Defined outside this header. */
extern bool thread_group_exited(struct pid *pid);

/*
 * Defined outside this header; lock_task_sighand() below is the wrapper
 * callers are expected to use.
 */
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Wrapper around __lock_task_sighand() that returns its result unchanged.
 * The __cond_lock() expression tells static lock checking that
 * task->sighand->siglock is held when the result is non-NULL; its value is
 * deliberately discarded.  Pair a successful call with
 * unlock_task_sighand(), passing the same @flags.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *ret;

        ret = __lock_task_sighand(task, flags);
        (void)__cond_lock(&task->sighand->siglock, ret);
        return ret;
}

/*
 * Release task->sighand->siglock with spin_unlock_irqrestore(), passing it
 * the flags value stored in *flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        unsigned long saved = *flags;

        spin_unlock_irqrestore(&task->sighand->siglock, saved);
}

/*
 * Read rlim_cur of resource limit number @limit from @task's signal
 * structure.  The load is done with READ_ONCE().
 */
static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long cur = READ_ONCE(task->signal->rlim[limit].rlim_cur);

        return cur;
}

/*
 * Read rlim_max of resource limit number @limit from @task's signal
 * structure.  The load is done with READ_ONCE().
 */
static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long max = READ_ONCE(task->signal->rlim[limit].rlim_max);

        return max;
}

/* task_rlimit() applied to the current task. */
static inline unsigned long rlimit(unsigned int limit)
{
        unsigned long cur = task_rlimit(current, limit);

        return cur;
}

/* task_rlimit_max() applied to the current task. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        unsigned long max = task_rlimit_max(current, limit);

        return max;
}

#endif /* _LINUX_SCHED_SIGNAL_H */






































































































































































































































































































































































































































































































































































































































































    1 


    1 





















































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
// SPDX-License-Identifier: GPL-2.0-or-later
/* Basic authentication token and access key management
 *
 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/init.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/workqueue.h>
#include <linux/random.h>
#include <linux/ima.h>
#include <linux/err.h>
#include "internal.h"

/* slab cache that key_alloc() draws struct key objects from */
struct kmem_cache *key_jar;
struct rb_root                key_serial_tree; /* tree of keys indexed by serial */
/* held while key_serial_tree is searched or modified */
DEFINE_SPINLOCK(key_serial_lock);

struct rb_root        key_user_tree; /* tree of quota records indexed by UID */
/* held while key_user_tree is searched or modified */
DEFINE_SPINLOCK(key_user_lock);

/* quota limits; the root values apply when the owning UID is GLOBAL_ROOT_UID */
unsigned int key_quota_root_maxkeys = 1000000;        /* root's key count quota */
unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
unsigned int key_quota_maxkeys = 200;                /* general key count quota */
unsigned int key_quota_maxbytes = 20000;        /* general key space quota */

/* list of key types searched by key_type_lookup(), and the rwsem guarding it */
static LIST_HEAD(key_types_list);
static DECLARE_RWSEM(key_types_sem);

/* We serialise key instantiation and link */
DEFINE_MUTEX(key_construction_mutex);

#ifdef KEY_DEBUGGING
/*
 * Debugging aid, only built when KEY_DEBUGGING is defined.  Unconditionally
 * logs the key pointer, the magic value found in the key and the expected
 * KEY_DEBUG_MAGIC, then calls BUG().  Deciding whether the key is actually
 * bad is left to the caller.
 */
void __key_check(const struct key *key)
{
        printk("__key_check: key %p {%08x} should be {%08x}\n",
               key, key->magic, KEY_DEBUG_MAGIC);
        BUG();
}
#endif

/*
 * Find the quota record for @uid in key_user_tree, creating and inserting a
 * fresh one if the tree does not hold one yet.
 *
 * The record is returned with a reference held: an existing record has its
 * usage count incremented, a new one starts out with a usage count of 1.
 * NULL is returned if a new record was needed but could not be allocated.
 */
struct key_user *key_user_lookup(kuid_t uid)
{
        struct key_user *candidate = NULL, *user;
        struct rb_node *parent, **link;

        for (;;) {
                parent = NULL;
                link = &key_user_tree.rb_node;
                spin_lock(&key_user_lock);

                /* walk down the tree looking for a record with this UID */
                while (*link) {
                        parent = *link;
                        user = rb_entry(parent, struct key_user, node);

                        if (uid_lt(uid, user->uid)) {
                                link = &(*link)->rb_left;
                        } else if (uid_gt(uid, user->uid)) {
                                link = &(*link)->rb_right;
                        } else {
                                /* a record already exists - take a ref on it
                                 * and throw away any spare candidate */
                                refcount_inc(&user->usage);
                                spin_unlock(&key_user_lock);
                                kfree(candidate);
                                return user;
                        }
                }

                /* no match; if a candidate is ready, go and insert it with
                 * the lock still held */
                if (candidate)
                        break;

                /* the GFP_KERNEL allocation is made with the spinlock
                 * dropped; it may have scheduled, so the search is repeated
                 * afterwards in case someone else added the record */
                spin_unlock(&key_user_lock);

                candidate = kmalloc(sizeof(*candidate), GFP_KERNEL);
                if (unlikely(!candidate))
                        return NULL;
        }

        /* the record still hadn't appeared on the second pass, so set up the
         * candidate and link it in where the search ended */
        refcount_set(&candidate->usage, 1);
        atomic_set(&candidate->nkeys, 0);
        atomic_set(&candidate->nikeys, 0);
        candidate->uid = uid;
        candidate->qnkeys = 0;
        candidate->qnbytes = 0;
        spin_lock_init(&candidate->lock);
        mutex_init(&candidate->cons_lock);

        rb_link_node(&candidate->node, parent, link);
        rb_insert_color(&candidate->node, &key_user_tree);
        spin_unlock(&key_user_lock);

        return candidate;
}

/*
 * Drop a reference on a quota record.  When the last reference goes away the
 * record is removed from key_user_tree under key_user_lock and freed.
 */
void key_user_put(struct key_user *user)
{
        /* nothing more to do unless this was the final reference, in which
         * case we come back holding key_user_lock */
        if (!refcount_dec_and_lock(&user->usage, &key_user_lock))
                return;

        rb_erase(&user->node, &key_user_tree);
        spin_unlock(&key_user_lock);

        kfree(user);
}

/*
 * Allocate a serial number for a key.  These are assigned randomly to avoid
 * security issues through covert channel problems.
 *
 * A random candidate is drawn first.  If it is already present in
 * key_serial_tree, the candidate is incremented while walking the tree in
 * order until a gap is found.  On return the key has been inserted into
 * key_serial_tree; key_serial_lock is taken and released internally.
 */
static inline void key_alloc_serial(struct key *key)
{
        struct rb_node *parent, **p;
        struct key *xkey;

        /* propose a random serial number and look for a hole for it in the
         * serial number tree */
        do {
                get_random_bytes(&key->serial, sizeof(key->serial));

                key->serial >>= 1; /* negative numbers are not permitted */
        } while (key->serial < 3); /* values below 3 are never handed out */

        spin_lock(&key_serial_lock);

        /* search from the root for the slot the current candidate belongs in */
attempt_insertion:
        parent = NULL;
        p = &key_serial_tree.rb_node;

        while (*p) {
                parent = *p;
                xkey = rb_entry(parent, struct key, serial_node);

                if (key->serial < xkey->serial)
                        p = &(*p)->rb_left;
                else if (key->serial > xkey->serial)
                        p = &(*p)->rb_right;
                else
                        goto serial_exists;
        }

        /* we've found a suitable hole - arrange for this key to occupy it */
        rb_link_node(&key->serial_node, parent, p);
        rb_insert_color(&key->serial_node, &key_serial_tree);

        spin_unlock(&key_serial_lock);
        return;

        /* we found a key with the proposed serial number - walk the tree from
         * that point looking for the next unused serial number */
serial_exists:
        for (;;) {
                key->serial++;
                /* the increment took the candidate below 3: restart at the
                 * lowest permitted value */
                if (key->serial < 3) {
                        key->serial = 3;
                        goto attempt_insertion;
                }

                /* no in-order successor: the candidate is now larger than
                 * every serial in the tree */
                parent = rb_next(parent);
                if (!parent)
                        goto attempt_insertion;

                /* the candidate falls in the gap before the successor */
                xkey = rb_entry(parent, struct key, serial_node);
                if (key->serial < xkey->serial)
                        goto attempt_insertion;
        }
}

/**
 * key_alloc - Allocate a key of the specified type.
 * @type: The type of key to allocate.
 * @desc: The key description to allow the key to be searched out.
 * @uid: The owner of the new key.
 * @gid: The group ID for the new key's group permissions.
 * @cred: The credentials specifying UID namespace.
 * @perm: The permissions mask of the new key.
 * @flags: Flags specifying quota properties.
 * @restrict_link: Optional link restriction for new keyrings.
 *
 * Allocate a key of the specified type with the attributes given.  The key is
 * returned in an uninstantiated state and the caller needs to instantiate the
 * key before returning.
 *
 * The restrict_link structure (if not NULL) will be freed when the
 * keyring is destroyed, so it must be dynamically allocated.
 *
 * The user's key count quota is updated to reflect the creation of the key and
 * the user's key data quota has the default for the key type reserved.  The
 * instantiation function should amend this as necessary.  If insufficient
 * quota is available, -EDQUOT will be returned.  No quota is charged if
 * KEY_ALLOC_NOT_IN_QUOTA is set in @flags, and the limits are not enforced
 * (though usage is still charged) if KEY_ALLOC_QUOTA_OVERRUN is set.
 *
 * The LSM security modules can prevent a key being created, in which case
 * the error from security_key_alloc() is returned (documented as -EACCES).
 *
 * Returns a pointer to the new key if successful and an ERR_PTR()-encoded
 * error otherwise: -EINVAL if @desc is NULL or empty, the error from the
 * type's ->vet_description() if it rejects @desc, -ENOMEM on allocation
 * failure, or one of the errors described above.
 *
 * Note that the caller needs to ensure the key type isn't unregistered while
 * key_alloc() is running.  Internally this can be done by locking
 * key_types_sem.  Externally, this can be done by either never unregistering
 * the key type, or making sure key_alloc() calls don't race with module
 * unloading.
 */
struct key *key_alloc(struct key_type *type, const char *desc,
                      kuid_t uid, kgid_t gid, const struct cred *cred,
                      key_perm_t perm, unsigned long flags,
                      struct key_restriction *restrict_link)
{
        struct key_user *user = NULL;
        struct key *key;
        size_t desclen, quotalen;
        int ret;

        /* a missing or empty description is rejected outright */
        key = ERR_PTR(-EINVAL);
        if (!desc || !*desc)
                goto error;

        /* let the key type veto the description if it provides a hook */
        if (type->vet_description) {
                ret = type->vet_description(desc);
                if (ret < 0) {
                        key = ERR_PTR(ret);
                        goto error;
                }
        }

        /* the quota charge covers the description with its NUL terminator
         * plus the type's default payload size */
        desclen = strlen(desc);
        quotalen = desclen + 1 + type->def_datalen;

        /* get hold of the key tracking for this user */
        user = key_user_lookup(uid);
        if (!user)
                goto no_memory_1;

        /* check that the user's quota permits allocation of another key and
         * its description */
        if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
                unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ?
                        key_quota_root_maxkeys : key_quota_maxkeys;
                unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ?
                        key_quota_root_maxbytes : key_quota_maxbytes;

                spin_lock(&user->lock);
                if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) {
                        /* the last test catches the byte count wrapping */
                        if (user->qnkeys + 1 > maxkeys ||
                            user->qnbytes + quotalen > maxbytes ||
                            user->qnbytes + quotalen < user->qnbytes)
                                goto no_quota;
                }

                user->qnkeys++;
                user->qnbytes += quotalen;
                spin_unlock(&user->lock);
        }

        /* allocate and initialise the key and its description */
        key = kmem_cache_zalloc(key_jar, GFP_KERNEL);
        if (!key)
                goto no_memory_2;

        key->index_key.desc_len = desclen;
        key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL);
        if (!key->index_key.description)
                goto no_memory_3;
        key->index_key.type = type;
        key_set_index_key(&key->index_key);

        refcount_set(&key->usage, 1);
        init_rwsem(&key->sem);
        lockdep_set_class(&key->sem, &type->lock_class);
        key->user = user;
        key->quotalen = quotalen;
        key->datalen = type->def_datalen;
        key->uid = uid;
        key->gid = gid;
        key->perm = perm;
        key->expiry = TIME64_MAX;
        key->restrict_link = restrict_link;
        key->last_used_at = ktime_get_real_seconds();

        /* translate the allocation flags into per-key flag bits */
        if (!(flags & KEY_ALLOC_NOT_IN_QUOTA))
                key->flags |= 1 << KEY_FLAG_IN_QUOTA;
        if (flags & KEY_ALLOC_BUILT_IN)
                key->flags |= 1 << KEY_FLAG_BUILTIN;
        if (flags & KEY_ALLOC_UID_KEYRING)
                key->flags |= 1 << KEY_FLAG_UID_KEYRING;
        if (flags & KEY_ALLOC_SET_KEEP)
                key->flags |= 1 << KEY_FLAG_KEEP;

#ifdef KEY_DEBUGGING
        key->magic = KEY_DEBUG_MAGIC;
#endif

        /* let the security module know about the key */
        ret = security_key_alloc(key, cred, flags);
        if (ret < 0)
                goto security_error;

        /* publish the key by giving it a serial number */
        refcount_inc(&key->domain_tag->usage);
        atomic_inc(&user->nkeys);
        key_alloc_serial(key);

        /* common exit: key is either the new key or an ERR_PTR() value */
error:
        return key;

        /* the LSM refused the key: free it and its description, give the
         * quota back and drop the user record reference */
security_error:
        kfree(key->description);
        kmem_cache_free(key_jar, key);
        if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
                spin_lock(&user->lock);
                user->qnkeys--;
                user->qnbytes -= quotalen;
                spin_unlock(&user->lock);
        }
        key_user_put(user);
        key = ERR_PTR(ret);
        goto error;

        /* allocation failures unwind in reverse order of acquisition */
no_memory_3:
        kmem_cache_free(key_jar, key);
no_memory_2:
        if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
                spin_lock(&user->lock);
                user->qnkeys--;
                user->qnbytes -= quotalen;
                spin_unlock(&user->lock);
        }
        key_user_put(user);
no_memory_1:
        key = ERR_PTR(-ENOMEM);
        goto error;

        /* over quota: nothing was charged yet, so only the lock and the user
         * record reference need releasing */
no_quota:
        spin_unlock(&user->lock);
        key_user_put(user);
        key = ERR_PTR(-EDQUOT);
        goto error;
}
EXPORT_SYMBOL(key_alloc);

/**
 * key_payload_reserve - Adjust data quota reservation for the key's payload
 * @key: The key to make the reservation for.
 * @datalen: The amount of data payload the caller now wants.
 *
 * Change how much of the owning user's key data quota @key has reserved so
 * that it matches @datalen.  Quota is only touched for keys that have
 * KEY_FLAG_IN_QUOTA set.  Growing the reservation fails with -EDQUOT when it
 * would take the user past their byte limit.
 *
 * Returns 0 on success, in which case the key's recorded data length has been
 * updated to @datalen.
 */
int key_payload_reserve(struct key *key, size_t datalen)
{
        int delta = (int)datalen - key->datalen;

        key_check(key);

        /* contemplate the quota adjustment */
        if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
                unsigned maxbytes;
                int over;

                if (uid_eq(key->user->uid, GLOBAL_ROOT_UID))
                        maxbytes = key_quota_root_maxbytes;
                else
                        maxbytes = key_quota_maxbytes;

                spin_lock(&key->user->lock);

                /* only an increase can exceed the limit; the second test
                 * catches the byte count wrapping */
                over = delta > 0 &&
                       (key->user->qnbytes + delta > maxbytes ||
                        key->user->qnbytes + delta < key->user->qnbytes);
                if (!over) {
                        key->user->qnbytes += delta;
                        key->quotalen += delta;
                }

                spin_unlock(&key->user->lock);

                if (over)
                        return -EDQUOT;
        }

        /* the adjustment went through (or none was needed), so record the
         * new data length */
        key->datalen = datalen;
        return 0;
}
EXPORT_SYMBOL(key_payload_reserve);

/*
 * Move a key into the instantiated state: the state becomes @reject_error if
 * that is negative, and KEY_IS_POSITIVE otherwise.
 */
static void mark_key_instantiated(struct key *key, int reject_error)
{
        int state = KEY_IS_POSITIVE;

        if (reject_error < 0)
                state = reject_error;

        /* Commit the payload before setting the state; barrier versus
         * key_read_state().
         */
        smp_store_release(&key->state, state);
}

/*
 * Instantiate a key and, in the same step, link it into the target keyring.
 * The caller must hold the target keyring's semaphore for writing.  The key's
 * own semaphore need not be held because key_construction_mutex serialises
 * instantiation.
 *
 * Returns -EBUSY if the key is no longer uninstantiated, otherwise the value
 * returned by the key type's ->instantiate() op (0 on success).
 */
static int __key_instantiate_and_link(struct key *key,
                                      struct key_preparsed_payload *prep,
                                      struct key *keyring,
                                      struct key *authkey,
                                      struct assoc_array_edit **_edit)
{
        int ret = -EBUSY;
        int awaken = 0;

        key_check(key);
        key_check(keyring);

        mutex_lock(&key_construction_mutex);

        /* a key can only be instantiated once */
        if (key->state != KEY_IS_UNINSTANTIATED)
                goto unlock;

        /* hand the prepared payload to the key type */
        ret = key->type->instantiate(key, prep);
        if (ret != 0)
                goto unlock;

        /* account for the key and publish its new state */
        atomic_inc(&key->user->nikeys);
        mark_key_instantiated(key, 0);
        notify_key(key, NOTIFY_KEY_INSTANTIATED, 0);

        /* remember whether the user-construct bit was set so that anyone
         * waiting on it can be woken after the mutex is dropped */
        if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
                awaken = 1;

        /* link into the destination keyring; the new key inherits the
         * keyring's KEEP flag */
        if (keyring) {
                if (test_bit(KEY_FLAG_KEEP, &keyring->flags))
                        set_bit(KEY_FLAG_KEEP, &key->flags);

                __key_link(keyring, key, _edit);
        }

        /* disable the authorisation key */
        if (authkey)
                key_invalidate(authkey);

        /* apply an expiry only if the preparsed payload set one */
        if (prep->expiry != TIME64_MAX)
                key_set_expiry(key, prep->expiry);

unlock:
        mutex_unlock(&key_construction_mutex);

        /* wake up anyone waiting for a key to be constructed */
        if (awaken)
                wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT);

        return ret;
}

/**
 * key_instantiate_and_link - Instantiate a key and link it into the keyring.
 * @key: The key to instantiate.
 * @data: The data to use to instantiate the key.
 * @datalen: The length of @data.
 * @keyring: Keyring to create a link in on success (or NULL).
 * @authkey: The authorisation token permitting instantiation.
 *
 * Instantiate a key that is in the uninstantiated state from the data given
 * and, on success, link it into the destination keyring if one is supplied.
 * The data is first run through the key type's ->preparse() op, if it has
 * one, and any restriction attached to @keyring is checked before the key is
 * instantiated.
 *
 * If successful, 0 is returned, the authorisation token is invalidated and
 * anyone waiting for the key is woken up.  If the key was already
 * instantiated, -EBUSY will be returned.  Errors from preparsing, from
 * preparing the keyring link or from the keyring's restriction check are
 * returned as they are.
 */
int key_instantiate_and_link(struct key *key,
                             const void *data,
                             size_t datalen,
                             struct key *keyring,
                             struct key *authkey)
{
        struct key_preparsed_payload prep;
        struct assoc_array_edit *link_edit = NULL;
        int ret;

        /* build the preparse record, starting from all-zeroes */
        memset(&prep, 0, sizeof(prep));
        prep.data = data;
        prep.datalen = datalen;
        prep.quotalen = key->type->def_datalen;
        prep.expiry = TIME64_MAX;

        if (key->type->preparse) {
                ret = key->type->preparse(&prep);
                if (ret < 0)
                        goto free_prep;
        }

        if (keyring) {
                struct key_restriction *keyres;

                ret = __key_link_lock(keyring, &key->index_key);
                if (ret < 0)
                        goto free_prep;

                ret = __key_link_begin(keyring, &key->index_key, &link_edit);
                if (ret < 0)
                        goto end_link;

                /* give the keyring's restriction, if it has a check
                 * function, the chance to refuse the key */
                keyres = keyring->restrict_link;
                if (keyres && keyres->check) {
                        ret = keyres->check(keyring, key->type, &prep.payload,
                                            keyres->key);
                        if (ret < 0)
                                goto end_link;
                }
        }

        ret = __key_instantiate_and_link(key, &prep, keyring, authkey,
                                         &link_edit);

end_link:
        if (keyring)
                __key_link_end(keyring, &key->index_key, link_edit);

free_prep:
        /* whatever ->preparse() set up is released through ->free_preparse() */
        if (key->type->preparse)
                key->type->free_preparse(&prep);
        return ret;
}

EXPORT_SYMBOL(key_instantiate_and_link);

/**
 * key_reject_and_link - Negatively instantiate a key and link it into the keyring.
 * @key: The key to instantiate.
 * @timeout: The timeout on the negative key.
 * @error: The error to return when the key is hit.
 * @keyring: Keyring to create a link in on success (or NULL).
 * @authkey: The authorisation token permitting instantiation.
 *
 * Negatively instantiate a key that is in the uninstantiated state and, if
 * that works, set its timeout and stored error and link it into the
 * destination keyring if one is supplied.  The key and any links to the key
 * will be automatically garbage collected after the timeout expires.
 *
 * Negative keys are used to rate limit repeated request_key() calls by causing
 * them to return the stored error code (typically ENOKEY) until the negative
 * key expires.
 *
 * If successful, 0 is returned, the authorisation token is invalidated and
 * anyone waiting for the key is woken up.  If the key was already
 * instantiated, -EBUSY will be returned.  -EPERM is returned, with nothing
 * done, if @keyring has a link restriction.  If preparing the keyring link
 * fails, the key is still negatively instantiated (just not linked) and the
 * link error is returned.
 */
int key_reject_and_link(struct key *key,
                        unsigned timeout,
                        unsigned error,
                        struct key *keyring,
                        struct key *authkey)
{
        struct assoc_array_edit *edit = NULL;
        int ret = -EBUSY, link_ret = 0;
        int awaken = 0, do_link;

        key_check(key);
        key_check(keyring);

        if (keyring) {
                if (keyring->restrict_link)
                        return -EPERM;

                link_ret = __key_link_lock(keyring, &key->index_key);
                if (link_ret == 0) {
                        link_ret = __key_link_begin(keyring, &key->index_key, &edit);
                        if (link_ret < 0)
                                __key_link_end(keyring, &key->index_key, edit);
                }
        }

        /* linking only goes ahead if a keyring was given and the link was
         * prepared without error */
        do_link = keyring && link_ret == 0;

        mutex_lock(&key_construction_mutex);

        /* can't instantiate twice */
        if (key->state == KEY_IS_UNINSTANTIATED) {
                /* mark the key as being negatively instantiated */
                atomic_inc(&key->user->nikeys);
                mark_key_instantiated(key, -error);
                notify_key(key, NOTIFY_KEY_INSTANTIATED, -error);
                key_set_expiry(key, ktime_get_real_seconds() + timeout);

                if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
                        awaken = 1;

                ret = 0;

                /* and link it into the destination keyring */
                if (do_link)
                        __key_link(keyring, key, &edit);

                /* disable the authorisation key */
                if (authkey)
                        key_invalidate(authkey);
        }

        mutex_unlock(&key_construction_mutex);

        if (do_link)
                __key_link_end(keyring, &key->index_key, edit);

        /* wake up anyone waiting for a key to be constructed */
        if (awaken)
                wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT);

        /* a successful rejection reports how the link preparation went */
        if (ret == 0)
                return link_ret;
        return ret;
}
EXPORT_SYMBOL(key_reject_and_link);

/**
 * key_put - Discard a reference to a key.
 * @key: The key to discard a reference from (may be NULL, in which case
 *       nothing is done).
 *
 * Drop one reference to a key.  Once all the references are gone, the cleanup
 * work item is scheduled to come and pull the key out of the tree in process
 * context at some later time.
 */
void key_put(struct key *key)
{
        if (!key)
                return;

        key_check(key);

        /* last reference gone - leave the actual disposal to key_gc_work */
        if (refcount_dec_and_test(&key->usage))
                schedule_work(&key_gc_work);
}
EXPORT_SYMBOL(key_put);

/*
 * Find a key by its serial number.
 *
 * On success the key is returned with its usage count incremented.
 * ERR_PTR(-ENOKEY) is returned if no key has that serial number or if the
 * key's usage count has already reached zero.
 */
struct key *key_lookup(key_serial_t id)
{
        struct key *found = ERR_PTR(-ENOKEY);
        struct rb_node *n;

        spin_lock(&key_serial_lock);

        /* search the tree for the specified key */
        n = key_serial_tree.rb_node;
        while (n) {
                struct key *cur = rb_entry(n, struct key, serial_node);

                if (id < cur->serial) {
                        n = n->rb_left;
                } else if (id > cur->serial) {
                        n = n->rb_right;
                } else {
                        /* A key is allowed to be looked up only if someone
                         * still owns a reference to it - otherwise it's
                         * awaiting the gc.
                         */
                        if (refcount_inc_not_zero(&cur->usage))
                                found = cur;
                        break;
                }
        }

        spin_unlock(&key_serial_lock);
        return found;
}

/*
 * Find the registered key type called @type and lock it against removal.
 *
 * On success the type is returned with key_types_sem still read-locked.  If
 * no type of that name is registered, the sem is released again and
 * ERR_PTR(-ENOKEY) is returned.
 */
struct key_type *key_type_lookup(const char *type)
{
        struct key_type *ktype;

        down_read(&key_types_sem);

        /* see whether the name matches one of the registered kernel types */
        list_for_each_entry(ktype, &key_types_list, link) {
                if (!strcmp(ktype->name, type))
                        return ktype;   /* the sem stays read-locked */
        }

        up_read(&key_types_sem);
        return ERR_PTR(-ENOKEY);
}

/*
 * Set the expiry of @key to @timeout seconds from now.  A @timeout of zero
 * sets the expiry to TIME64_MAX instead.  The key's semaphore is held for
 * writing while the change is made.
 */
void key_set_timeout(struct key *key, unsigned timeout)
{
        time64_t expiry;

        /* make the changes with the locks held to prevent races */
        down_write(&key->sem);

        expiry = timeout > 0 ? ktime_get_real_seconds() + timeout
                             : TIME64_MAX;
        key_set_expiry(key, expiry);

        up_write(&key->sem);
}
EXPORT_SYMBOL_GPL(key_set_timeout);

/*
 * Unlock a key type locked by key_type_lookup().
 *
 * This releases the read lock on key_types_sem that a successful
 * key_type_lookup() returns with.  @ktype itself is not used.
 */
void key_type_put(struct key_type *ktype)
{
        up_read(&key_types_sem);
}

/*
 * Attempt to update an existing key.
 *
 * The key comes to us with an incremented refcount.  On success the same
 * key_ref is handed back with that reference intact; on error the reference
 * is dropped and an ERR_PTR() value is returned instead: the error from
 * key_permission(), -EEXIST if the key type has no ->update() op, or the
 * error from ->update() itself.
 */
static inline key_ref_t __key_update(key_ref_t key_ref,
                                     struct key_preparsed_payload *prep)
{
        struct key *key = key_ref_to_ptr(key_ref);
        int ret;

        /* need write permission on the key to update it */
        ret = key_permission(key_ref, KEY_NEED_WRITE);
        if (ret < 0)
                goto fail;

        if (!key->type->update) {
                ret = -EEXIST;
                goto fail;
        }

        down_write(&key->sem);

        ret = key->type->update(key, prep);
        if (ret == 0) {
                /* Updating a negative key positively instantiates it */
                mark_key_instantiated(key, 0);
                notify_key(key, NOTIFY_KEY_UPDATED, 0);
        }

        up_write(&key->sem);

        if (ret >= 0)
                return key_ref;

fail:
        key_put(key);
        return ERR_PTR(ret);
}

/**
 * key_create_or_update - Update or create and instantiate a key.
 * @keyring_ref: A pointer to the destination keyring with possession flag.
 * @type: The type of key.
 * @description: The searchable description for the key.  May be NULL if the
 *	key type has a preparse op that supplies one.
 * @payload: The data to use to instantiate or update the key.
 * @plen: The length of @payload.
 * @perm: The permissions mask for a new key.
 * @flags: The quota flags for a new key.  KEY_ALLOC_BYPASS_RESTRICTION is also
 *	honoured here and skips the keyring's link restriction check.
 *
 * Search the destination keyring for a key of the same description and if one
 * is found, update it, otherwise create and instantiate a new one and create a
 * link to it from that keyring.
 *
 * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be
 * concocted.
 *
 * Returns a pointer to the new or updated key if successful, -ENODEV if the
 * key type wasn't available, -EINVAL if the key type has no instantiate op or
 * no description could be determined, -ENOTDIR if the keyring wasn't a
 * keyring, -EACCES if the caller isn't permitted to modify the keyring or the
 * LSM did not permit creation of the key.  Errors returned by the type's
 * preparse op, by the keyring locking/restriction steps, by key allocation
 * and by instantiation are passed through unchanged.
 *
 * On success, the possession flag from the keyring ref will be tacked on to
 * the key ref before it is returned.
 */
key_ref_t key_create_or_update(key_ref_t keyring_ref,
                               const char *type,
                               const char *description,
                               const void *payload,
                               size_t plen,
                               key_perm_t perm,
                               unsigned long flags)
{
        struct keyring_index_key index_key = {
                .description        = description,
        };
        struct key_preparsed_payload prep;
        struct assoc_array_edit *edit = NULL;
        const struct cred *cred = current_cred();
        struct key *keyring, *key = NULL;
        key_ref_t key_ref;
        int ret;
        struct key_restriction *restrict_link = NULL;

        /* look up the key type to see if it's one of the registered kernel
         * types */
        index_key.type = key_type_lookup(type);
        if (IS_ERR(index_key.type)) {
                /* nothing is held yet, so skip the key_type_put() below */
                key_ref = ERR_PTR(-ENODEV);
                goto error;
        }

        /* the type must be instantiable, and a description must come from
         * either the caller or the type's preparse op */
        key_ref = ERR_PTR(-EINVAL);
        if (!index_key.type->instantiate ||
            (!index_key.description && !index_key.type->preparse))
                goto error_put_type;

        keyring = key_ref_to_ptr(keyring_ref);

        key_check(keyring);

        /* restrict_link stays NULL (no check) if the caller asked to bypass */
        if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION))
                restrict_link = keyring->restrict_link;

        key_ref = ERR_PTR(-ENOTDIR);
        if (keyring->type != &key_type_keyring)
                goto error_put_type;

        /* set up the preparse record handed to the type's ops */
        memset(&prep, 0, sizeof(prep));
        prep.data = payload;
        prep.datalen = plen;
        prep.quotalen = index_key.type->def_datalen;
        prep.expiry = TIME64_MAX;
        if (index_key.type->preparse) {
                ret = index_key.type->preparse(&prep);
                if (ret < 0) {
                        key_ref = ERR_PTR(ret);
                        goto error_free_prep;
                }
                /* fall back to the description the preparse op produced */
                if (!index_key.description)
                        index_key.description = prep.description;
                key_ref = ERR_PTR(-EINVAL);
                if (!index_key.description)
                        goto error_free_prep;
        }
        index_key.desc_len = strlen(index_key.description);
        key_set_index_key(&index_key);

        ret = __key_link_lock(keyring, &index_key);
        if (ret < 0) {
                key_ref = ERR_PTR(ret);
                goto error_free_prep;
        }

        /* from here on every exit must go through __key_link_end() */
        ret = __key_link_begin(keyring, &index_key, &edit);
        if (ret < 0) {
                key_ref = ERR_PTR(ret);
                goto error_link_end;
        }

        /* let the keyring's restriction, if any, veto the preparsed payload */
        if (restrict_link && restrict_link->check) {
                ret = restrict_link->check(keyring, index_key.type,
                                           &prep.payload, restrict_link->key);
                if (ret < 0) {
                        key_ref = ERR_PTR(ret);
                        goto error_link_end;
                }
        }

        /* if we're going to allocate a new key, we're going to have
         * to modify the keyring */
        ret = key_permission(keyring_ref, KEY_NEED_WRITE);
        if (ret < 0) {
                key_ref = ERR_PTR(ret);
                goto error_link_end;
        }

        /* if it's possible to update this type of key, search for an existing
         * key of the same type and description in the destination keyring and
         * update that instead if possible
         */
        if (index_key.type->update) {
                key_ref = find_key_to_update(keyring_ref, &index_key);
                if (key_ref)
                        goto found_matching_key;
        }

        /* if the client doesn't provide, decide on the permissions we want */
        if (perm == KEY_PERM_UNDEF) {
                perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
                perm |= KEY_USR_VIEW;

                if (index_key.type->read)
                        perm |= KEY_POS_READ;

                if (index_key.type == &key_type_keyring ||
                    index_key.type->update)
                        perm |= KEY_POS_WRITE;
        }

        /* allocate a new key */
        key = key_alloc(index_key.type, index_key.description,
                        cred->fsuid, cred->fsgid, cred, perm, flags, NULL);
        if (IS_ERR(key)) {
                key_ref = ERR_CAST(key);
                goto error_link_end;
        }

        /* instantiate it and link it into the target keyring */
        ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit);
        if (ret < 0) {
                key_put(key);
                key_ref = ERR_PTR(ret);
                goto error_link_end;
        }

        /* last argument true: this call reports a newly created key */
        ima_post_key_create_or_update(keyring, key, payload, plen,
                                      flags, true);

        key_ref = make_key_ref(key, is_key_possessed(keyring_ref));

        /* success falls through the unwind labels too; key_ref already holds
         * the value to return */
error_link_end:
        __key_link_end(keyring, &index_key, edit);
error_free_prep:
        if (index_key.type->preparse)
                index_key.type->free_preparse(&prep);
error_put_type:
        key_type_put(index_key.type);
error:
        return key_ref;

 found_matching_key:
        /* we found a matching key, so we're going to try to update it
         * - we can drop the locks first as we have the key pinned
         */
        __key_link_end(keyring, &index_key, edit);

        key = key_ref_to_ptr(key_ref);
        if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) {
                ret = wait_for_key_construction(key, true);
                if (ret < 0) {
                        key_ref_put(key_ref);
                        key_ref = ERR_PTR(ret);
                        goto error_free_prep;
                }
        }

        /* __key_update() drops the key reference itself on failure */
        key_ref = __key_update(key_ref, &prep);

        /* last argument false: this call reports an update of an existing key */
        if (!IS_ERR(key_ref))
                ima_post_key_create_or_update(keyring, key,
                                              payload, plen,
                                              flags, false);

        /* __key_link_end() was already called above, so rejoin the unwind
         * path below that label */
        goto error_free_prep;
}
EXPORT_SYMBOL(key_create_or_update);

/**
 * key_update - Update a key's contents.
 * @key_ref: The pointer (plus possession flag) to the key.
 * @payload: The data to be used to update the key.
 * @plen: The length of @payload.
 *
 * Replace the contents of a key using the supplied payload.  Write permission
 * on the key is required.  A successful update also marks the key as
 * instantiated, so a negative key can be instantiated this way.
 *
 * Returns 0 on success, -EACCES if not permitted and -EOPNOTSUPP if the key
 * type has no update op.  Errors from the type's preparse and update ops are
 * returned as they are.
 */
int key_update(key_ref_t key_ref, const void *payload, size_t plen)
{
        struct key_preparsed_payload prep;
        struct key *key = key_ref_to_ptr(key_ref);
        int ret;

        key_check(key);

        /* nothing has been set up yet, so these two refusals just return */
        ret = key_permission(key_ref, KEY_NEED_WRITE);
        if (ret < 0)
                return ret;
        if (!key->type->update)
                return -EOPNOTSUPP;

        memset(&prep, 0, sizeof(prep));
        prep.data = payload;
        prep.datalen = plen;
        prep.quotalen = key->type->def_datalen;
        prep.expiry = TIME64_MAX;

        if (key->type->preparse)
                ret = key->type->preparse(&prep);

        /* ret can only be negative here if preparse has just failed */
        if (ret >= 0) {
                down_write(&key->sem);

                ret = key->type->update(key, &prep);
                if (!ret) {
                        /* Updating a negative key positively instantiates it */
                        mark_key_instantiated(key, 0);
                        notify_key(key, NOTIFY_KEY_UPDATED, 0);
                }

                up_write(&key->sem);
        }

        /* release whatever preparse left in prep, on success and failure */
        if (key->type->preparse)
                key->type->free_preparse(&prep);
        return ret;
}
EXPORT_SYMBOL(key_update);

/**
 * key_revoke - Revoke a key.
 * @key: The key to be revoked.
 *
 * Set KEY_FLAG_REVOKED on the key and, if it was not already set, post a
 * revocation notification, call the type's revoke op (if it has one) and
 * record the revocation time.  Garbage collection is then scheduled for
 * key_gc_delay after that time.  Revoking an already revoked key only takes
 * and releases the key semaphore.
 */
void key_revoke(struct key *key)
{
        time64_t now;

        key_check(key);

        /* make sure no one's trying to change or use the key when we mark it
         * - we tell lockdep that we might nest because we might be revoking an
         *   authorisation key whilst holding the sem on a key we've just
         *   instantiated
         */
        down_write_nested(&key->sem, 1);

        /* somebody else got here first: nothing more to do */
        if (test_and_set_bit(KEY_FLAG_REVOKED, &key->flags))
                goto unlock;

        notify_key(key, NOTIFY_KEY_REVOKED, 0);
        if (key->type->revoke)
                key->type->revoke(key);

        /* only record the time if none is set or the recorded one is later */
        now = ktime_get_real_seconds();
        if (key->revoked_at == 0 || key->revoked_at > now) {
                key->revoked_at = now;
                key_schedule_gc(key->revoked_at + key_gc_delay);
        }

unlock:
        up_write(&key->sem);
}
EXPORT_SYMBOL(key_revoke);

/**
 * key_invalidate - Invalidate a key.
 * @key: The key to be invalidated.
 *
 * Set KEY_FLAG_INVALIDATED on the key.  The caller that actually sets the
 * flag posts an invalidation notification and schedules link garbage
 * collection.  If the flag is already set on entry the key semaphore is not
 * taken at all.
 */
void key_invalidate(struct key *key)
{
        kenter("%d", key_serial(key));

        key_check(key);

        /* unlocked peek: skip the semaphore if already invalidated */
        if (test_bit(KEY_FLAG_INVALIDATED, &key->flags))
                return;

        down_write_nested(&key->sem, 1);

        /* re-test atomically now that the sem is held */
        if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) {
                notify_key(key, NOTIFY_KEY_INVALIDATED, 0);
                key_schedule_gc_links();
        }

        up_write(&key->sem);
}
EXPORT_SYMBOL(key_invalidate);

/**
 * generic_key_instantiate - Simple instantiation of a key from preparsed data
 * @key: The key to be instantiated
 * @prep: The preparsed data to load.
 *
 * Reserve @prep->quotalen of payload quota for the key and, if that works,
 * move the four payload pointers from @prep into the key, leaving NULL in
 * each of @prep's slots.
 *
 * This can be pointed to directly by the key type instantiate op pointer.
 *
 * Returns the result of key_payload_reserve(); @prep is left untouched if
 * that is non-zero.
 */
int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
        int ret;
        int slot;

        pr_devel("==>%s()\n", __func__);

        ret = key_payload_reserve(key, prep->quotalen);
        if (!ret) {
                /* slot 0 goes through the RCU-aware assignment helper */
                rcu_assign_keypointer(key, prep->payload.data[0]);
                for (slot = 1; slot <= 3; slot++)
                        key->payload.data[slot] = prep->payload.data[slot];

                /* the key holds the pointers now; clear them in prep */
                for (slot = 0; slot <= 3; slot++)
                        prep->payload.data[slot] = NULL;
        }

        pr_devel("<==%s() = %d\n", __func__, ret);
        return ret;
}
EXPORT_SYMBOL(generic_key_instantiate);

/**
 * register_key_type - Register a type of key.
 * @ktype: The new key type.
 *
 * Add @ktype to the list of registered key types, unless a type with the same
 * name is already on it.  The type's lock_class is zeroed first.
 *
 * Returns 0 on success or -EEXIST if a type of this name already exists.
 */
int register_key_type(struct key_type *ktype)
{
        struct key_type *existing;
        int ret;

        memset(&ktype->lock_class, 0, sizeof(ktype->lock_class));

        down_write(&key_types_sem);

        /* names must be unique across the list */
        list_for_each_entry(existing, &key_types_list, link) {
                if (!strcmp(existing->name, ktype->name)) {
                        ret = -EEXIST;
                        goto unlock;
                }
        }

        list_add(&ktype->link, &key_types_list);
        pr_notice("Key type %s registered\n", ktype->name);
        ret = 0;

unlock:
        up_write(&key_types_sem);
        return ret;
}
EXPORT_SYMBOL(register_key_type);

/**
 * unregister_key_type - Unregister a type of key.
 * @ktype: The key type.
 *
 * Unregister a key type and mark all the extant keys of this type as dead.
 * Those keys of this type are then destroyed to get rid of their payloads and
 * they and their links will be garbage collected as soon as possible.
 *
 * The type is unlinked with key_types_sem held for writing; the semaphore is
 * then downgraded to a read lock, which stays held across key_gc_keytype().
 */
void unregister_key_type(struct key_type *ktype)
{
        down_write(&key_types_sem);
        list_del_init(&ktype->link);
        /* keep the sem held, but only for reading, while the gc runs */
        downgrade_write(&key_types_sem);
        key_gc_keytype(ktype);
        pr_notice("Key type %s unregistered\n", ktype->name);
        up_read(&key_types_sem);
}
EXPORT_SYMBOL(unregister_key_type);

/*
 * Initialise the key management state.
 *
 * Creates the "key_jar" slab cache for struct key, puts the four special key
 * types on key_types_list and inserts root_key_user into key_user_tree.
 * key_types_sem is not taken here.
 */
void __init key_init(void)
{
        /* allocate a slab in which we can store keys; the result is not
         * checked here (SLAB_PANIC is passed) */
        key_jar = kmem_cache_create("key_jar", sizeof(struct key),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* add the special key types */
        list_add_tail(&key_type_keyring.link, &key_types_list);
        list_add_tail(&key_type_dead.link, &key_types_list);
        list_add_tail(&key_type_user.link, &key_types_list);
        list_add_tail(&key_type_logon.link, &key_types_list);

        /* record the root user tracking: link root_key_user at the tree's
         * root pointer with no parent, then let the rbtree code colour it */
        rb_link_node(&root_key_user.node,
                     NULL,
                     &key_user_tree.rb_node);

        rb_insert_color(&root_key_user.node,
                        &key_user_tree);
}




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_STRING_HELPERS_H_
#define _LINUX_STRING_HELPERS_H_

#include <linux/ctype.h>
#include <linux/types.h>

struct file;
struct task_struct;

/*
 * Unit systems that string_get_size() can be asked to print a size in.
 */
enum string_size_units {
        STRING_UNITS_10,        /* use powers of 10^3 (standard SI) */
        STRING_UNITS_2,                /* use binary powers of 2^10 */
};

void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
                     char *buf, int len);

#define UNESCAPE_SPACE                0x01
#define UNESCAPE_OCTAL                0x02
#define UNESCAPE_HEX                0x04
#define UNESCAPE_SPECIAL        0x08
#define UNESCAPE_ANY                \
        (UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL)

int string_unescape(char *src, char *dst, size_t size, unsigned int flags);

/*
 * Unescape @buf in place: string_unescape() is called with @buf as both
 * source and destination and a size of 0, and its result is returned.
 * NOTE(review): what a size of 0 means is defined by string_unescape(), which
 * is only declared here - confirm against its implementation.
 */
static inline int string_unescape_inplace(char *buf, unsigned int flags)
{
        return string_unescape(buf, buf, 0, flags);
}

/*
 * Unescape @src into @dst with every UNESCAPE_* class enabled (UNESCAPE_ANY).
 * @size and the return value are passed straight to/from string_unescape().
 */
static inline int string_unescape_any(char *src, char *dst, size_t size)
{
        return string_unescape(src, dst, size, UNESCAPE_ANY);
}

/*
 * In-place form of string_unescape_any(): @buf is both source and
 * destination and the size passed on is 0.
 */
static inline int string_unescape_any_inplace(char *buf)
{
        return string_unescape_any(buf, buf, 0);
}

#define ESCAPE_SPACE                0x01
#define ESCAPE_SPECIAL                0x02
#define ESCAPE_NULL                0x04
#define ESCAPE_OCTAL                0x08
#define ESCAPE_ANY                \
        (ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_SPECIAL | ESCAPE_NULL)
#define ESCAPE_NP                0x10
#define ESCAPE_ANY_NP                (ESCAPE_ANY | ESCAPE_NP)
#define ESCAPE_HEX                0x20

int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                unsigned int flags, const char *only);

int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
                                        size_t osz);

/*
 * Escape @isz bytes at @src into @dst (capacity @osz) using the
 * ESCAPE_ANY_NP flag set.  @only and the return value are passed straight
 * to/from string_escape_mem().
 */
static inline int string_escape_mem_any_np(const char *src, size_t isz,
                char *dst, size_t osz, const char *only)
{
        return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
}

/*
 * Escape the NUL-terminated string @src into @dst (capacity @sz).  The input
 * length given to string_escape_mem() is strlen(@src), so the terminator
 * itself is not part of the input.  @flags, @only and the return value are
 * those of string_escape_mem().
 */
static inline int string_escape_str(const char *src, char *dst, size_t sz,
                unsigned int flags, const char *only)
{
        size_t src_len = strlen(src);

        return string_escape_mem(src, src_len, dst, sz, flags, only);
}

/*
 * string_escape_str() with the flags fixed to ESCAPE_ANY_NP.
 */
static inline int string_escape_str_any_np(const char *src, char *dst,
                size_t sz, const char *only)
{
        return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
}

/*
 * Copy the NUL-terminated string @src to @dst, passing every character
 * through toupper().  The terminator is copied as well, so @dst needs room
 * for strlen(@src) + 1 bytes.
 */
static inline void string_upper(char *dst, const char *src)
{
        char c;

        for (;;) {
                c = *src++;
                *dst++ = toupper(c);
                /* stop once the terminator itself has been written */
                if (!c)
                        break;
        }
}

/*
 * Copy the NUL-terminated string @src to @dst, passing every character
 * through tolower().  The terminator is copied as well, so @dst needs room
 * for strlen(@src) + 1 bytes.
 */
static inline void string_lower(char *dst, const char *src)
{
        char c;

        for (;;) {
                c = *src++;
                *dst++ = tolower(c);
                /* stop once the terminator itself has been written */
                if (!c)
                        break;
        }
}

char *kstrdup_quotable(const char *src, gfp_t gfp);
char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp);
char *kstrdup_quotable_file(struct file *file, gfp_t gfp);

void kfree_strarray(char **array, size_t n);

#endif




















































































































    5 



































































    1 






    1 













    1 


































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Security server interface.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 *
 */

#ifndef _SELINUX_SECURITY_H_
#define _SELINUX_SECURITY_H_

#include <linux/compiler.h>
#include <linux/dcache.h>
#include <linux/magic.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "flask.h"
#include "policycap.h"

#define SECSID_NULL                        0x00000000 /* unspecified SID */
#define SECSID_WILD                        0xffffffff /* wildcard SID */
#define SECCLASS_NULL                        0x0000 /* no class */

/* Identify specific policy version changes */
#define POLICYDB_VERSION_BASE                15
#define POLICYDB_VERSION_BOOL                16
#define POLICYDB_VERSION_IPV6                17
#define POLICYDB_VERSION_NLCLASS        18
#define POLICYDB_VERSION_VALIDATETRANS        19
#define POLICYDB_VERSION_MLS                19
#define POLICYDB_VERSION_AVTAB                20
#define POLICYDB_VERSION_RANGETRANS        21
#define POLICYDB_VERSION_POLCAP                22
#define POLICYDB_VERSION_PERMISSIVE        23
#define POLICYDB_VERSION_BOUNDARY        24
#define POLICYDB_VERSION_FILENAME_TRANS        25
#define POLICYDB_VERSION_ROLETRANS        26
#define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS        27
#define POLICYDB_VERSION_DEFAULT_TYPE        28
#define POLICYDB_VERSION_CONSTRAINT_NAMES        29
#define POLICYDB_VERSION_XPERMS_IOCTL        30
#define POLICYDB_VERSION_INFINIBAND                31
#define POLICYDB_VERSION_GLBLUB                32
#define POLICYDB_VERSION_COMP_FTRANS        33 /* compressed filename transitions */

/* Range of policy versions we understand */
#define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_COMP_FTRANS

/* Mask for just the mount related flags */
#define SE_MNTMASK        0x0f
/* Super block security struct flags for mount options */
/* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */
#define CONTEXT_MNT        0x01
#define FSCONTEXT_MNT        0x02
#define ROOTCONTEXT_MNT        0x04
#define DEFCONTEXT_MNT        0x08
#define SBLABEL_MNT        0x10
/* Non-mount related flags */
#define SE_SBINITIALIZED        0x0100
#define SE_SBPROC                0x0200
#define SE_SBGENFS                0x0400
#define SE_SBGENFS_XATTR        0x0800

#define CONTEXT_STR        "context"
#define FSCONTEXT_STR        "fscontext"
#define ROOTCONTEXT_STR        "rootcontext"
#define DEFCONTEXT_STR        "defcontext"
#define SECLABEL_STR "seclabel"

struct netlbl_lsm_secattr;

extern int selinux_enabled_boot;

/*
 * type_datum properties
 * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY
 */
#define TYPEDATUM_PROPERTY_PRIMARY        0x0001
#define TYPEDATUM_PROPERTY_ATTRIBUTE        0x0002

/* limitation of boundary depth  */
#define POLICYDB_BOUNDS_MAXDEPTH        4

struct selinux_avc;
struct selinux_policy;

/*
 * SELinux state.  One instance, selinux_state, is declared extern in this
 * header.  The boolean flags are read and written through the inline
 * accessors in this header (READ_ONCE()/WRITE_ONCE(), or acquire/release for
 * @initialized) rather than directly.
 */
struct selinux_state {
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
        bool disabled;        /* see selinux_disabled(), selinux_mark_disabled() */
#endif
#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
        bool enforcing;        /* see enforcing_enabled(), enforcing_set() */
#endif
        bool checkreqprot;        /* see checkreqprot_get(), checkreqprot_set() */
        bool initialized;        /* see selinux_initialized() */
        /* one flag per POLICYDB_CAPABILITY_* index; see selinux_policycap_*() */
        bool policycap[__POLICYDB_CAPABILITY_MAX];

        struct page *status_page;
        struct mutex status_lock;        /* presumably guards status_page - confirm */

        struct selinux_avc *avc;
        struct selinux_policy __rcu *policy;        /* RCU-annotated pointer */
        struct mutex policy_mutex;        /* presumably serialises policy updates - confirm */
} __randomize_layout;

void selinux_avc_init(struct selinux_avc **avc);

extern struct selinux_state selinux_state;

/*
 * Return @state->initialized, read with acquire semantics so that it pairs
 * with the release store in selinux_mark_initialized().
 */
static inline bool selinux_initialized(const struct selinux_state *state)
{
        /* do a synchronized load to avoid race conditions */
        return smp_load_acquire(&state->initialized);
}

/*
 * Set @state->initialized to true with release semantics.  There is no
 * counterpart in this header that clears the flag again.
 */
static inline void selinux_mark_initialized(struct selinux_state *state)
{
        /* do a synchronized write to avoid race conditions */
        smp_store_release(&state->initialized, true);
}

/*
 * Accessors for the enforcing flag.  With CONFIG_SECURITY_SELINUX_DEVELOP the
 * flag lives in @state->enforcing and can be changed; without it the field
 * does not exist, enforcing_enabled() is constantly true and enforcing_set()
 * does nothing.
 */
#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
static inline bool enforcing_enabled(struct selinux_state *state)
{
        return READ_ONCE(state->enforcing);
}

static inline void enforcing_set(struct selinux_state *state, bool value)
{
        WRITE_ONCE(state->enforcing, value);
}
#else
static inline bool enforcing_enabled(struct selinux_state *state)
{
        return true;
}

static inline void enforcing_set(struct selinux_state *state, bool value)
{
        /* intentionally empty: there is no flag to store into */
}
#endif

/* Return the current value of @state->checkreqprot (READ_ONCE() load). */
static inline bool checkreqprot_get(const struct selinux_state *state)
{
        return READ_ONCE(state->checkreqprot);
}

/* Store @value into @state->checkreqprot (WRITE_ONCE() store). */
static inline void checkreqprot_set(struct selinux_state *state, bool value)
{
        WRITE_ONCE(state->checkreqprot, value);
}

/*
 * Accessors for the disabled flag.  With CONFIG_SECURITY_SELINUX_DISABLE the
 * flag lives in @state->disabled and can be set (never cleared) through
 * selinux_mark_disabled().  Without it selinux_disabled() is constantly false
 * and selinux_mark_disabled() is not defined at all.
 */
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
static inline bool selinux_disabled(struct selinux_state *state)
{
        return READ_ONCE(state->disabled);
}

static inline void selinux_mark_disabled(struct selinux_state *state)
{
        WRITE_ONCE(state->disabled, true);
}
#else
static inline bool selinux_disabled(struct selinux_state *state)
{
        return false;
}
#endif

/*
 * Read the POLICYDB_CAPABILITY_NETPEER entry of the policycap[] array in the
 * global selinux_state.
 */
static inline bool selinux_policycap_netpeer(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_NETPEER]);
}

/*
 * Read the POLICYDB_CAPABILITY_OPENPERM entry of the policycap[] array in the
 * global selinux_state.
 */
static inline bool selinux_policycap_openperm(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_OPENPERM]);
}

/*
 * Read the POLICYDB_CAPABILITY_EXTSOCKCLASS entry of the policycap[] array in
 * the global selinux_state.
 */
static inline bool selinux_policycap_extsockclass(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]);
}

/*
 * Read the POLICYDB_CAPABILITY_ALWAYSNETWORK entry of the policycap[] array
 * in the global selinux_state.
 */
static inline bool selinux_policycap_alwaysnetwork(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]);
}

/*
 * Read the POLICYDB_CAPABILITY_CGROUPSECLABEL entry of the policycap[] array
 * in the global selinux_state.
 */
static inline bool selinux_policycap_cgroupseclabel(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]);
}

/*
 * Read the POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION entry of the policycap[]
 * array in the global selinux_state.
 */
static inline bool selinux_policycap_nnp_nosuid_transition(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]);
}

/*
 * Read the POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS entry of the
 * policycap[] array in the global selinux_state.
 */
static inline bool selinux_policycap_genfs_seclabel_symlinks(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]);
}

/*
 * Read the POLICYDB_CAPABILITY_IOCTL_SKIP_CLOEXEC entry of the policycap[]
 * array in the global selinux_state.
 */
static inline bool selinux_policycap_ioctl_skip_cloexec(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_IOCTL_SKIP_CLOEXEC]);
}

struct selinux_policy_convert_data;

/*
 * State handed to security_load_policy() and then to selinux_policy_commit()
 * or selinux_policy_cancel(), as their prototypes below show.
 * NOTE(review): presumably filled in by the load and consumed by
 * commit/cancel - confirm against the implementation.
 */
struct selinux_load_state {
        struct selinux_policy *policy;
        struct selinux_policy_convert_data *convert_data;
};

int security_mls_enabled(struct selinux_state *state);
int security_load_policy(struct selinux_state *state,
                         void *data, size_t len,
                         struct selinux_load_state *load_state);
void selinux_policy_commit(struct selinux_state *state,
                           struct selinux_load_state *load_state);
void selinux_policy_cancel(struct selinux_state *state,
                           struct selinux_load_state *load_state);
int security_read_policy(struct selinux_state *state,
                         void **data, size_t *len);

int security_policycap_supported(struct selinux_state *state,
                                 unsigned int req_cap);

#define SEL_VEC_MAX 32
/*
 * Access vector decision, filled in by security_compute_av() and
 * security_compute_av_user() through their @avd argument.
 */
struct av_decision {
        u32 allowed;
        u32 auditallow;
        u32 auditdeny;
        u32 seqno;
        u32 flags;        /* AVD_FLAGS_* values */
};

#define XPERMS_ALLOWED 1
#define XPERMS_AUDITALLOW 2
#define XPERMS_DONTAUDIT 4

/*
 * Set / test bit @x in the bitmap word array @perms: the word index is
 * x >> 5 and the bit within the word is x & 0x1f.
 *
 * Both arguments are fully parenthesised so that expression arguments (for
 * example "a | b") select the intended word and bit, and the mask is built
 * from an unsigned 1 so that bit 31 is not produced by shifting into the
 * sign bit of an int.  @x is evaluated twice and must not have side effects.
 */
#define security_xperm_set(perms, x) ((perms)[(x) >> 5] |= 1U << ((x) & 0x1f))
#define security_xperm_test(perms, x) (1 & ((perms)[(x) >> 5] >> ((x) & 0x1f)))
/*
 * A bitmap of eight 32-bit words (256 bits in total).
 * NOTE(review): presumably the array operated on by security_xperm_set() and
 * security_xperm_test() - confirm at the call sites.
 */
struct extended_perms_data {
        u32 p[8];
};

/*
 * Extended permissions decision, filled in by
 * security_compute_xperms_decision() through its @xpermd argument.
 */
struct extended_perms_decision {
        u8 used;        /* presumably a mask of XPERMS_* values - confirm */
        u8 driver;
        struct extended_perms_data *allowed;
        struct extended_perms_data *auditallow;
        struct extended_perms_data *dontaudit;
};

/*
 * Extended permissions summary, passed to security_compute_av() as its
 * @xperms argument.
 */
struct extended_perms {
        u16 len;        /* length of the associated decision chain */
        struct extended_perms_data drivers; /* flag drivers that are used */
};

/* definitions of av_decision.flags */
#define AVD_FLAGS_PERMISSIVE        0x0001

void security_compute_av(struct selinux_state *state,
                         u32 ssid, u32 tsid,
                         u16 tclass, struct av_decision *avd,
                         struct extended_perms *xperms);

void security_compute_xperms_decision(struct selinux_state *state,
                                      u32 ssid, u32 tsid, u16 tclass,
                                      u8 driver,
                                      struct extended_perms_decision *xpermd);

void security_compute_av_user(struct selinux_state *state,
                              u32 ssid, u32 tsid,
                              u16 tclass, struct av_decision *avd);

int security_transition_sid(struct selinux_state *state,
                            u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid);

int security_transition_sid_user(struct selinux_state *state,
                                 u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid);

int security_member_sid(struct selinux_state *state, u32 ssid, u32 tsid,
                        u16 tclass, u32 *out_sid);

int security_change_sid(struct selinux_state *state, u32 ssid, u32 tsid,
                        u16 tclass, u32 *out_sid);

int security_sid_to_context(struct selinux_state *state, u32 sid,
                            char **scontext, u32 *scontext_len);

int security_sid_to_context_force(struct selinux_state *state,
                                  u32 sid, char **scontext, u32 *scontext_len);

int security_sid_to_context_inval(struct selinux_state *state,
                                  u32 sid, char **scontext, u32 *scontext_len);

int security_context_to_sid(struct selinux_state *state,
                            const char *scontext, u32 scontext_len,
                            u32 *out_sid, gfp_t gfp);

int security_context_str_to_sid(struct selinux_state *state,
                                const char *scontext, u32 *out_sid, gfp_t gfp);

int security_context_to_sid_default(struct selinux_state *state,
                                    const char *scontext, u32 scontext_len,
                                    u32 *out_sid, u32 def_sid, gfp_t gfp_flags);

int security_context_to_sid_force(struct selinux_state *state,
                                  const char *scontext, u32 scontext_len,
                                  u32 *sid);

int security_get_user_sids(struct selinux_state *state,
                           u32 callsid, char *username,
                           u32 **sids, u32 *nel);

int security_port_sid(struct selinux_state *state,
                      u8 protocol, u16 port, u32 *out_sid);

int security_ib_pkey_sid(struct selinux_state *state,
                         u64 subnet_prefix, u16 pkey_num, u32 *out_sid);

int security_ib_endport_sid(struct selinux_state *state,
                            const char *dev_name, u8 port_num, u32 *out_sid);

int security_netif_sid(struct selinux_state *state,
                       char *name, u32 *if_sid);

int security_node_sid(struct selinux_state *state,
                      u16 domain, void *addr, u32 addrlen,
                      u32 *out_sid);

int security_validate_transition(struct selinux_state *state,
                                 u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 tclass);

int security_validate_transition_user(struct selinux_state *state,
                                      u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass);

int security_bounded_transition(struct selinux_state *state,
                                u32 oldsid, u32 newsid);

int security_sid_mls_copy(struct selinux_state *state,
                          u32 sid, u32 mls_sid, u32 *new_sid);

int security_net_peersid_resolve(struct selinux_state *state,
                                 u32 nlbl_sid, u32 nlbl_type,
                                 u32 xfrm_sid,
                                 u32 *peer_sid);

int security_get_classes(struct selinux_policy *policy,
                         char ***classes, int *nclasses);
int security_get_permissions(struct selinux_policy *policy,
                             char *class, char ***perms, int *nperms);
int security_get_reject_unknown(struct selinux_state *state);
int security_get_allow_unknown(struct selinux_state *state);

#define SECURITY_FS_USE_XATTR                1 /* use xattr */
#define SECURITY_FS_USE_TRANS                2 /* use transition SIDs, e.g. devpts/tmpfs */
#define SECURITY_FS_USE_TASK                3 /* use task SIDs, e.g. pipefs/sockfs */
#define SECURITY_FS_USE_GENFS                4 /* use the genfs support */
#define SECURITY_FS_USE_NONE                5 /* no labeling support */
#define SECURITY_FS_USE_MNTPOINT        6 /* use mountpoint labeling */
#define SECURITY_FS_USE_NATIVE                7 /* use native label support */
#define SECURITY_FS_USE_MAX                7 /* Highest SECURITY_FS_USE_XXX */

int security_fs_use(struct selinux_state *state, struct super_block *sb);

int security_genfs_sid(struct selinux_state *state,
                       const char *fstype, char *name, u16 sclass,
                       u32 *sid);

int selinux_policy_genfs_sid(struct selinux_policy *policy,
                       const char *fstype, char *name, u16 sclass,
                       u32 *sid);

#ifdef CONFIG_NETLABEL
int security_netlbl_secattr_to_sid(struct selinux_state *state,
                                   struct netlbl_lsm_secattr *secattr,
                                   u32 *sid);

int security_netlbl_sid_to_secattr(struct selinux_state *state,
                                   u32 sid,
                                   struct netlbl_lsm_secattr *secattr);
#else
/*
 * Stub selected by the #else branch of "#ifdef CONFIG_NETLABEL": none of the
 * arguments are used, @sid is not written, and -EIDRM is always returned.
 */
static inline int security_netlbl_secattr_to_sid(struct selinux_state *state,
                                            struct netlbl_lsm_secattr *secattr,
                                            u32 *sid)
{
        return -EIDRM;
}

/*
 * Stub selected by the #else branch of "#ifdef CONFIG_NETLABEL": none of the
 * arguments are used, @secattr is not written, and -ENOENT is always returned.
 */
static inline int security_netlbl_sid_to_secattr(struct selinux_state *state,
                                         u32 sid,
                                         struct netlbl_lsm_secattr *secattr)
{
        return -ENOENT;
}
#endif /* CONFIG_NETLABEL */

const char *security_get_initial_sid_context(u32 sid);

/*
 * status notifier using mmap interface
 */
extern struct page *selinux_kernel_status_page(struct selinux_state *state);

#define SELINUX_KERNEL_STATUS_VERSION        1
struct selinux_kernel_status {
        u32        version;        /* version number of this structure */
        u32        sequence;        /* sequence number of seqlock logic */
        u32        enforcing;        /* current setting of enforcing mode */
        u32        policyload;        /* times of policy reloaded */
        u32        deny_unknown;        /* current setting of deny_unknown */
        /*
         * The version > 0 supports above members.
         */
} __packed;

extern void selinux_status_update_setenforce(struct selinux_state *state,
                                             int enforcing);
extern void selinux_status_update_policyload(struct selinux_state *state,
                                             int seqno);
extern void selinux_complete_init(void);
extern int selinux_disable(struct selinux_state *state);
extern void exit_sel_fs(void);
extern struct path selinux_null;
extern struct vfsmount *selinuxfs_mount;
extern void selnl_notify_setenforce(int val);
extern void selnl_notify_policyload(u32 seqno);
extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);

extern void avtab_cache_init(void);
extern void ebitmap_cache_init(void);
extern void hashtab_cache_init(void);
extern int security_sidtab_hash_stats(struct selinux_state *state, char *page);

#endif /* _SELINUX_SECURITY_H_ */



































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * VLAN                An implementation of 802.1Q VLAN tagging.
 *
 * Authors:        Ben Greear <greearb@candelatech.com>
 */
#ifndef _LINUX_IF_VLAN_H_
#define _LINUX_IF_VLAN_H_

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/bug.h>
#include <uapi/linux/if_vlan.h>

#define VLAN_HLEN        4                /* The additional bytes required by VLAN
                                         * (in addition to the Ethernet header)
                                         */
#define VLAN_ETH_HLEN        18                /* Total octets in header.         */
#define VLAN_ETH_ZLEN        64                /* Min. octets in frame sans FCS */

/*
 * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan
 */
#define VLAN_ETH_DATA_LEN        1500        /* Max. octets in payload         */
#define VLAN_ETH_FRAME_LEN        1518        /* Max. octets in frame sans FCS */

#define VLAN_MAX_DEPTH        8                /* Max. number of nested VLAN tags parsed */

/**
 *        struct vlan_hdr - vlan header
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 */
struct vlan_hdr {
        __be16        h_vlan_TCI;
        __be16        h_vlan_encapsulated_proto;
};

/**
 *        struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
 *        @h_dest: destination ethernet address
 *        @h_source: source ethernet address
 *        @h_vlan_proto: ethernet protocol (the field __vlan_get_tag() tests
 *                       with eth_type_vlan())
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 */
struct vlan_ethhdr {
        unsigned char        h_dest[ETH_ALEN];
        unsigned char        h_source[ETH_ALEN];
        __be16                h_vlan_proto;
        __be16                h_vlan_TCI;
        __be16                h_vlan_encapsulated_proto;
};

#include <linux/skbuff.h>

/* View the MAC header of @skb (skb_mac_header()) as a struct vlan_ethhdr. */
static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
{
        void *mac_hdr = skb_mac_header(skb);

        return (struct vlan_ethhdr *)mac_hdr;
}

/* View the bytes at skb->data as a struct vlan_ethhdr.
 *
 * Prefer this version in TX path, instead of
 * skb_reset_mac_header() + vlan_eth_hdr()
 */
static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
{
        void *start = skb->data;

        return (struct vlan_ethhdr *)start;
}

#define VLAN_PRIO_MASK                0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT                13
#define VLAN_CFI_MASK                0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
#define VLAN_VID_MASK                0x0fff /* VLAN Identifier */
#define VLAN_N_VID                4096

/* found in socket.c */
extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));

/* Returns true when @dev has IFF_802_1Q_VLAN set in its priv_flags. */
static inline bool is_vlan_dev(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_802_1Q_VLAN) != 0;
}

/* Accessors for the VLAN tag fields kept in the skb itself:
 *   _present  - the vlan_present flag
 *   _get      - the whole vlan_tci value
 *   _get_id   - vlan_tci masked with VLAN_VID_MASK
 *   _get_cfi  - 1 if the VLAN_CFI_MASK bit of vlan_tci is set, else 0
 *   _get_prio - the VLAN_PRIO_MASK bits of vlan_tci, shifted down by
 *               VLAN_PRIO_SHIFT
 * None of them checks vlan_present before reading vlan_tci.
 */
#define skb_vlan_tag_present(__skb)        ((__skb)->vlan_present)
#define skb_vlan_tag_get(__skb)                ((__skb)->vlan_tci)
#define skb_vlan_tag_get_id(__skb)        ((__skb)->vlan_tci & VLAN_VID_MASK)
#define skb_vlan_tag_get_cfi(__skb)        (!!((__skb)->vlan_tci & VLAN_CFI_MASK))
#define skb_vlan_tag_get_prio(__skb)        (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT)

/* Raise NETDEV_CVLAN_FILTER_PUSH_INFO for @dev and hand back the notifier
 * chain result converted with notifier_to_errno().  ASSERT_RTNL() is invoked
 * before the notifier call.
 */
static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev)
{
        int notifier_ret;

        ASSERT_RTNL();
        notifier_ret = call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO,
                                                dev);
        return notifier_to_errno(notifier_ret);
}

/* Raise NETDEV_CVLAN_FILTER_DROP_INFO for @dev.  ASSERT_RTNL() is invoked
 * first; the result of the notifier call is discarded.
 */
static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev);
}

/* Raise NETDEV_SVLAN_FILTER_PUSH_INFO for @dev and hand back the notifier
 * chain result converted with notifier_to_errno().  ASSERT_RTNL() is invoked
 * before the notifier call.
 */
static inline int vlan_get_rx_stag_filter_info(struct net_device *dev)
{
        int notifier_ret;

        ASSERT_RTNL();
        notifier_ret = call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO,
                                                dev);
        return notifier_to_errno(notifier_ret);
}

/* Raise NETDEV_SVLAN_FILTER_DROP_INFO for @dev.  ASSERT_RTNL() is invoked
 * first; the result of the notifier call is discarded.
 */
static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev);
}

/**
 *        struct vlan_pcpu_stats - VLAN percpu rx/tx stats
 *        @rx_packets: number of received packets
 *        @rx_bytes: number of received bytes
 *        @rx_multicast: number of received multicast packets
 *        @tx_packets: number of transmitted packets
 *        @tx_bytes: number of transmitted bytes
 *        @syncp: synchronization point for 64bit counters
 *        @rx_errors: number of rx errors (32-bit, unlike the counters above)
 *        @tx_dropped: number of tx drops (32-bit, unlike the counters above)
 */
struct vlan_pcpu_stats {
        u64                        rx_packets;
        u64                        rx_bytes;
        u64                        rx_multicast;
        u64                        tx_packets;
        u64                        tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errors;
        u32                        tx_dropped;
};

#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)

extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id);
extern int vlan_for_each(struct net_device *dev,
                         int (*action)(struct net_device *dev, int vid,
                                       void *arg), void *arg);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);

/**
 *        struct vlan_priority_tci_mapping - vlan egress priority mappings
 *        @priority: skb priority
 *        @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
 *        @next: pointer to next struct in the same chain; NULL ends the
 *               chain (see the walk in vlan_dev_get_egress_qos_mask())
 */
struct vlan_priority_tci_mapping {
        u32                                        priority;
        u16                                        vlan_qos;
        struct vlan_priority_tci_mapping        *next;
};

struct proc_dir_entry;
struct netpoll;

/**
 *        struct vlan_dev_priv - VLAN private device data
 *        @nr_ingress_mappings: number of ingress priority mappings
 *        @ingress_priority_map: ingress priority mappings
 *        @nr_egress_mappings: number of egress priority mappings
 *        @egress_priority_map: hash of egress priority mappings; the bucket
 *                              is selected with (skb priority & 0xF)
 *        @vlan_proto: VLAN encapsulation protocol
 *        @vlan_id: VLAN identifier
 *        @flags: device flags
 *        @real_dev: underlying netdevice
 *        @real_dev_addr: address of underlying netdevice
 *        @dent: proc dir entry
 *        @vlan_pcpu_stats: ptr to percpu rx/tx stats
 *        @netpoll: netpoll pointer, only present when
 *                  CONFIG_NET_POLL_CONTROLLER is defined
 */
struct vlan_dev_priv {
        unsigned int                                nr_ingress_mappings;
        u32                                        ingress_priority_map[8];
        unsigned int                                nr_egress_mappings;
        struct vlan_priority_tci_mapping        *egress_priority_map[16];

        __be16                                        vlan_proto;
        u16                                        vlan_id;
        u16                                        flags;

        struct net_device                        *real_dev;
        unsigned char                                real_dev_addr[ETH_ALEN];

        struct proc_dir_entry                        *dent;
        struct vlan_pcpu_stats __percpu                *vlan_pcpu_stats;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                                *netpoll;
#endif
};

/* Return the netdev_priv() area of @dev typed as struct vlan_dev_priv. */
static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
        struct vlan_dev_priv *vlan = netdev_priv(dev);

        return vlan;
}

/* Look up the egress QoS bits mapped to @skprio on VLAN device @dev.
 *
 * The bucket is chosen with the low four bits of @skprio and its chain is
 * searched for an entry whose priority equals @skprio exactly.  Returns that
 * entry's vlan_qos, or 0 when no entry matches.
 */
static inline u16
vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
{
        struct vlan_priority_tci_mapping *entry;

        smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */

        for (entry = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)];
             entry; entry = entry->next) {
                /* vlan_qos should already be shifted to mask correctly
                 * with the VLAN's TCI
                 */
                if (entry->priority == skprio)
                        return entry->vlan_qos;
        }

        return 0;
}

extern bool vlan_do_receive(struct sk_buff **skb);

extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);

extern int vlan_vids_add_by_dev(struct net_device *dev,
                                const struct net_device *by_dev);
extern void vlan_vids_del_by_dev(struct net_device *dev,
                                 const struct net_device *by_dev);

extern bool vlan_uses_dev(const struct net_device *dev);

#else
/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: no lookup is done and NULL is always returned.
 */
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                     __be16 vlan_proto, u16 vlan_id)
{
        return NULL;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: @action is never called and 0 is always returned.
 */
static inline int
vlan_for_each(struct net_device *dev,
              int (*action)(struct net_device *dev, int vid, void *arg),
              void *arg)
{
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: every call hits WARN_ON_ONCE(1) and returns NULL.
 */
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return NULL;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: every call hits WARN_ON_ONCE(1) and returns 0.
 */
static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: every call hits WARN_ON_ONCE(1) and returns 0.
 */
static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: no mapping is consulted and 0 is always returned.
 */
static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev,
                                               u32 skprio)
{
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: *@skb is left untouched and false is always returned.
 */
static inline bool vlan_do_receive(struct sk_buff **skb)
{
        return false;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: nothing is recorded and 0 is always returned.
 */
static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: does nothing.
 */
static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: nothing is added and 0 is always returned.
 */
static inline int vlan_vids_add_by_dev(struct net_device *dev,
                                       const struct net_device *by_dev)
{
        return 0;
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: does nothing.
 */
static inline void vlan_vids_del_by_dev(struct net_device *dev,
                                        const struct net_device *by_dev)
{
}

/* Stub for builds with neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * defined: false is always returned.
 */
static inline bool vlan_uses_dev(const struct net_device *dev)
{
        return false;
}
#endif

/**
 * eth_type_vlan - check for valid vlan ether type.
 * @ethertype: ether type to check, in network byte order
 *
 * Returns true if @ethertype equals htons(ETH_P_8021Q) or
 * htons(ETH_P_8021AD), false for any other value.
 */
static inline bool eth_type_vlan(__be16 ethertype)
{
        return ethertype == htons(ETH_P_8021Q) ||
               ethertype == htons(ETH_P_8021AD);
}

/* Returns true when @features contains the TX tag-insertion feature bit that
 * corresponds to @proto: NETIF_F_HW_VLAN_CTAG_TX for ETH_P_8021Q,
 * NETIF_F_HW_VLAN_STAG_TX for ETH_P_8021AD.  Any other @proto yields false.
 */
static inline bool vlan_hw_offload_capable(netdev_features_t features,
                                           __be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return (features & NETIF_F_HW_VLAN_CTAG_TX) != 0;

        if (proto == htons(ETH_P_8021AD))
                return (features & NETIF_F_HW_VLAN_STAG_TX) != 0;

        return false;
}

/**
 * __vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
 * Returns -ENOMEM if skb_cow_head fails, 0 otherwise.
 *
 * skb->mac_header is moved back by VLAN_HLEN only when it has been set; an
 * unset mac_header is left unset.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci,
                                          unsigned int mac_len)
{
        struct vlan_ethhdr *veth;

        if (skb_cow_head(skb, VLAN_HLEN) < 0)
                return -ENOMEM;

        skb_push(skb, VLAN_HLEN);

        /* Move the mac header sans proto to the beginning of the new header. */
        if (likely(mac_len > ETH_TLEN))
                memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);

        /* Only adjust a MAC header offset that is actually valid: subtracting
         * from the "not set" marker would produce a bogus offset that later
         * looks like a set one.
         */
        if (skb_mac_header_was_set(skb))
                skb->mac_header -= VLAN_HLEN;

        veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);

        /* first, the ethernet type */
        if (likely(mac_len >= ETH_TLEN)) {
                /* h_vlan_encapsulated_proto should already be populated, and
                 * skb->data has space for h_vlan_proto
                 */
                veth->h_vlan_proto = vlan_proto;
        } else {
                /* h_vlan_encapsulated_proto should not be populated, and
                 * skb->data has no space for h_vlan_proto
                 */
                veth->h_vlan_encapsulated_proto = skb->protocol;
        }

        /* now, the TCI */
        veth->h_vlan_TCI = htons(vlan_tci);

        return 0;
}

/**
 * __vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Same as __vlan_insert_inner_tag() with the MAC header length fixed to
 * ETH_HLEN; the return value of that call is passed through unchanged.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline int __vlan_insert_tag(struct sk_buff *skb,
                                    __be16 vlan_proto, u16 vlan_tci)
{
        const unsigned int plain_eth_len = ETH_HLEN;

        return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci,
                                       plain_eth_len);
}

/**
 * vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len.
 * Returns @skb on success.  When __vlan_insert_inner_tag() reports an error,
 * @skb is released with dev_kfree_skb_any() and NULL is returned, so the
 * caller does not have to free it (as with skb_unshare()).
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
                                                    __be16 vlan_proto,
                                                    u16 vlan_tci,
                                                    unsigned int mac_len)
{
        if (__vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len) == 0)
                return skb;

        /* Tagging failed: consume the skb on behalf of the caller. */
        dev_kfree_skb_any(skb);
        return NULL;
}

/**
 * vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Same as vlan_insert_inner_tag() with the MAC header length fixed to
 * ETH_HLEN: returns the tagged skb, or NULL on error, in which case the
 * skb has already been freed by vlan_insert_inner_tag().
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
                                              __be16 vlan_proto, u16 vlan_tci)
{
        const unsigned int plain_eth_len = ETH_HLEN;

        return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, plain_eth_len);
}

/**
 * vlan_insert_tag_set_proto - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Calls vlan_insert_tag() and, when that succeeds, also sets skb->protocol
 * to @vlan_proto.  Returns the tagged skb, or NULL on error.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 */
static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
                                                        __be16 vlan_proto,
                                                        u16 vlan_tci)
{
        struct sk_buff *tagged = vlan_insert_tag(skb, vlan_proto, vlan_tci);

        if (!tagged)
                return NULL;

        tagged->protocol = vlan_proto;
        return tagged;
}

/**
 * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info
 * @skb: skbuff to clear
 *
 * Clears the VLAN information from @skb by resetting the vlan_present flag;
 * vlan_proto and vlan_tci are not modified.
 */
static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
{
        skb->vlan_present = 0;
}

/**
 * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb
 * @dst: skbuff to copy to
 * @src: skbuff to copy from
 *
 * Copies VLAN information from @src to @dst (for branchless code).
 * vlan_present, vlan_proto and vlan_tci are all copied unconditionally,
 * whether or not @src has vlan_present set.
 */
static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src)
{
        dst->vlan_present = src->vlan_present;
        dst->vlan_proto = src->vlan_proto;
        dst->vlan_tci = src->vlan_tci;
}

/*
 * __vlan_hwaccel_push_inside - pushes vlan tag to the payload
 * @skb: skbuff to tag
 *
 * Pushes the VLAN tag from @skb->vlan_tci inside to the payload.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 */
static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
{
        skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
                                        skb_vlan_tag_get(skb));
        if (likely(skb))
                __vlan_hwaccel_clear_tag(skb);
        return skb;
}

/**
 * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Puts @vlan_proto in @skb->vlan_proto and the VLAN TCI in @skb->vlan_tci,
 * sets @skb->vlan_present, and lets the device do the rest.  The packet
 * data itself is not modified.
 */
static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci)
{
        skb->vlan_proto = vlan_proto;
        skb->vlan_tci = vlan_tci;
        skb->vlan_present = 1;
}

/**
 * __vlan_get_tag - get the VLAN ID that is part of the payload
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Reads the header at skb->data through skb_vlan_eth_hdr().  If its
 * h_vlan_proto is a VLAN ether type, stores the host-order TCI in @vlan_tci
 * and returns 0.  Otherwise returns -EINVAL and leaves @vlan_tci untouched.
 */
static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        const struct vlan_ethhdr *hdr = skb_vlan_eth_hdr(skb);

        if (eth_type_vlan(hdr->h_vlan_proto)) {
                *vlan_tci = ntohs(hdr->h_vlan_TCI);
                return 0;
        }

        return -EINVAL;
}

/**
 * __vlan_hwaccel_get_tag - get the VLAN TCI that is stored in @skb->vlan_tci
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Returns 0 and stores @skb->vlan_tci in @vlan_tci when @skb->vlan_present
 * is set.  Otherwise stores 0 in @vlan_tci and returns -EINVAL.
 */
static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
                                         u16 *vlan_tci)
{
        if (!skb_vlan_tag_present(skb)) {
                *vlan_tci = 0;
                return -EINVAL;
        }

        *vlan_tci = skb_vlan_tag_get(skb);
        return 0;
}

/**
 * vlan_get_tag - get the VLAN ID from the skb
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Uses __vlan_hwaccel_get_tag() when @skb->dev advertises
 * NETIF_F_HW_VLAN_CTAG_TX, and __vlan_get_tag() otherwise.
 *
 * Returns error if the skb is not VLAN tagged
 */
static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        if (!(skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX))
                return __vlan_get_tag(skb, vlan_tci);

        return __vlan_hwaccel_get_tag(skb, vlan_tci);
}

/**
 * __vlan_get_protocol_offset - get protocol EtherType.
 * @skb: skbuff to query
 * @type: first vlan protocol
 * @mac_offset: MAC offset
 * @depth: buffer to store length of eth and vlan tags in bytes (may be NULL)
 *
 * Returns the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 *
 * Returns 0, without writing @depth, when skb->mac_len is non-zero but
 * smaller than VLAN_HLEN, when a VLAN header cannot be obtained with
 * skb_header_pointer(), or when the VLAN_MAX_DEPTH parsing budget is used up.
 * If @type is not a VLAN ether type it is returned unchanged and @depth
 * receives skb->mac_len.
 */
static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb,
                                                __be16 type,
                                                int mac_offset,
                                                int *depth)
{
        unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;

        /* if type is 802.1Q/AD then the header should already be
         * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
         * ETH_HLEN otherwise
         */
        if (eth_type_vlan(type)) {
                if (vlan_depth) {
                        if (WARN_ON(vlan_depth < VLAN_HLEN))
                                return 0;
                        vlan_depth -= VLAN_HLEN;
                } else {
                        vlan_depth = ETH_HLEN;
                }
                do {
                        struct vlan_hdr vhdr, *vh;

                        vh = skb_header_pointer(skb, mac_offset + vlan_depth,
                                                sizeof(vhdr), &vhdr);
                        /* parse_depth is pre-decremented once per header, so
                         * the walk gives up on the iteration that brings it
                         * to zero.
                         */
                        if (unlikely(!vh || !--parse_depth))
                                return 0;

                        type = vh->h_vlan_encapsulated_proto;
                        vlan_depth += VLAN_HLEN;
                } while (eth_type_vlan(type));
        }

        if (depth)
                *depth = vlan_depth;

        return type;
}

/* __vlan_get_protocol_offset() with a MAC offset of 0; @type and @depth are
 * passed through and the result is returned unchanged.
 */
static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type,
                                         int *depth)
{
        return __vlan_get_protocol_offset(skb, type, 0, depth);
}

/**
 * vlan_get_protocol - get protocol EtherType.
 * @skb: skbuff to query
 *
 * Starts from @skb->protocol and resolves it with __vlan_get_protocol(),
 * without asking for the header depth.
 *
 * Returns the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 */
static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
{
        __be16 outer = skb->protocol;

        return __vlan_get_protocol(skb, outer, NULL);
}

/* This version of __vlan_get_protocol() also pulls mac header in skb->head.
 *
 * Returns 0 when __vlan_get_protocol() returns 0 or when pskb_may_pull()
 * fails for the computed header length; @depth (if non-NULL) is written only
 * on success.
 */
static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb,
                                                 __be16 type, int *depth)
{
        int hdr_len;
        __be16 proto;

        proto = __vlan_get_protocol(skb, type, &hdr_len);
        if (!proto)
                return 0;

        if (!pskb_may_pull(skb, hdr_len))
                return 0;

        if (depth)
                *depth = hdr_len;

        return proto;
}

/* A getter for the SKB protocol field which will handle VLAN tags consistently
 * whether VLAN acceleration is enabled or not.
 *
 * With @skip_vlan set, the result of vlan_get_protocol() is returned.
 * Otherwise the outermost protocol is returned: skb->vlan_proto when
 * vlan_present is set, skb->protocol when it is not.
 */
static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan)
{
        if (skip_vlan)
                return vlan_get_protocol(skb);

        /* VLAN acceleration strips the VLAN header from the skb and
         * moves it to skb->vlan_proto
         */
        if (skb_vlan_tag_present(skb))
                return skb->vlan_proto;

        return skb->protocol;
}

/* Set skb->protocol from the protocol encapsulated in VLAN header @vhdr.
 *
 * Was a VLAN packet, grab the encapsulated protocol, which the layer
 * three protocols care about.
 */
static inline void vlan_set_encap_proto(struct sk_buff *skb,
                                        struct vlan_hdr *vhdr)
{
        __be16 encap = vhdr->h_vlan_encapsulated_proto;
        unsigned short *after_hdr;

        if (eth_proto_is_802_3(encap)) {
                skb->protocol = encap;
                return;
        }

        /* Look at the 16-bit word that immediately follows the VLAN header. */
        after_hdr = (unsigned short *)(vhdr + 1);

        if (*after_hdr != 0xFFFF) {
                /*
                 * Real 802.2 LLC
                 */
                skb->protocol = htons(ETH_P_802_2);
                return;
        }

        /*
         * This is a magic hack to spot IPX packets. Older Novell
         * breaks the protocol design and runs IPX over 802.3 without
         * an 802.2 LLC layer. We look for FFFF which isn't a used
         * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
         * but does for the rest.
         */
        skb->protocol = htons(ETH_P_802_3);
}

/**
 * skb_vlan_tagged - check if skb is vlan tagged.
 * @skb: skbuff to query
 *
 * Returns true if the skb is tagged, regardless of whether it is hardware
 * accelerated or not: either a tag is present on the skb, or skb->protocol
 * itself is a VLAN EtherType.
 */
static inline bool skb_vlan_tagged(const struct sk_buff *skb)
{
        if (skb_vlan_tag_present(skb))
                return true;

        if (likely(!eth_type_vlan(skb->protocol)))
                return false;

        return true;
}

/**
 * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers.
 * @skb: skbuff to query
 *
 * Returns true if the skb is tagged with multiple vlan headers, regardless
 * of whether it is hardware accelerated or not.
 */
static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
{
        __be16 protocol = skb->protocol;

        if (!skb_vlan_tag_present(skb)) {
                struct vlan_ethhdr *veh;

                if (likely(!eth_type_vlan(protocol)))
                        return false;

                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        return false;

                veh = skb_vlan_eth_hdr(skb);
                protocol = veh->h_vlan_encapsulated_proto;
        }

        if (!eth_type_vlan(protocol))
                return false;

        return true;
}

/**
 * vlan_features_check - drop unsafe features for skb with multiple tags.
 * @skb: skbuff to query
 * @features: features to be checked
 *
 * Returns features without unsafe ones if the skb has multiple tags;
 * otherwise @features is returned unchanged.
 */
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
                                                    netdev_features_t features)
{
        if (!skb_vlan_tagged_multi(skb))
                return features;

        /* In the case of multi-tagged packets, use a direct mask
         * instead of using netdev_intersect_features(), to make
         * sure that only devices supporting NETIF_F_HW_CSUM will
         * have checksum offloading support.
         */
        return features & (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
                           NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX);
}

/**
 * compare_vlan_header - Compare two vlan headers
 * @h1: Pointer to vlan header
 * @h2: Pointer to vlan header
 *
 * Compare two vlan headers, returns 0 if equal and a non-zero value
 * otherwise (the value itself carries no ordering information).
 *
 * With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS both headers are read as a
 * single 32-bit word each and XORed; otherwise the h_vlan_TCI and
 * h_vlan_encapsulated_proto fields are XORed separately and the two
 * results ORed together.
 *
 * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
 */
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
                                                const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* Keep the const qualifier of the parameters in the casts. */
        return *(const u32 *)h1 ^ *(const u32 *)h2;
#else
        return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
               ((__force u32)h1->h_vlan_encapsulated_proto ^
                (__force u32)h2->h_vlan_encapsulated_proto);
#endif
}
#endif /* !(_LINUX_IF_VLAN_H_) */

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
#ifndef __LINUX_MROUTE_BASE_H
#define __LINUX_MROUTE_BASE_H

#include <linux/netdevice.h>
#include <linux/rhashtable-types.h>
#include <linux/spinlock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_notifier.h>
#include <net/ip_fib.h>

/**
 * struct vif_device - interface representor for multicast routing
 * @dev: network device being used
 * @bytes_in: statistic; bytes ingressing
 * @bytes_out: statistic; bytes egressing
 * @pkt_in: statistic; packets ingressing
 * @pkt_out: statistic; packets egressing
 * @rate_limit: Traffic shaping (NI)
 * @threshold: TTL threshold
 * @flags: Control flags
 * @link: Physical interface index
 * @dev_parent_id: device parent id
 * @local: Local address
 * @remote: Remote address for tunnels
 */
struct vif_device {
        struct net_device *dev;
        unsigned long bytes_in, bytes_out;
        unsigned long pkt_in, pkt_out;
        unsigned long rate_limit;
        unsigned char threshold;
        unsigned short flags;
        int link;

        /* Currently only used by ipmr */
        struct netdev_phys_item_id dev_parent_id;
        __be32 local, remote;
};

/**
 * struct vif_entry_notifier_info - payload of a vif FIB notification
 * @info: generic FIB notifier header; this is what gets handed to
 *        call_fib_notifier() / call_fib_notifiers()
 * @dev: copy of the vif's &vif_device.dev
 * @vif_index: index of the vif the event refers to
 * @vif_flags: copy of the vif's &vif_device.flags
 * @tb_id: table id supplied by the caller raising the event
 */
struct vif_entry_notifier_info {
        struct fib_notifier_info info;
        struct net_device *dev;
        unsigned short vif_index;
        unsigned short vif_flags;
        u32 tb_id;
};

/* Describe @vif in a vif_entry_notifier_info (family, extack, device,
 * index, flags and table id) and deliver it to the single notifier block
 * @nb via call_fib_notifier(). Returns that call's result.
 */
static inline int mr_call_vif_notifier(struct notifier_block *nb,
                                       unsigned short family,
                                       enum fib_event_type event_type,
                                       struct vif_device *vif,
                                       unsigned short vif_index, u32 tb_id,
                                       struct netlink_ext_ack *extack)
{
        struct vif_entry_notifier_info vif_info = {
                .info.family = family,
                .info.extack = extack,
                .dev = vif->dev,
                .vif_index = vif_index,
                .vif_flags = vif->flags,
                .tb_id = tb_id,
        };
        struct fib_notifier_info *hdr = &vif_info.info;

        return call_fib_notifier(nb, event_type, hdr);
}

/* Describe @vif in a vif_entry_notifier_info and hand it to
 * call_fib_notifiers() for @net. Asserts that RTNL is held and bumps
 * *@ipmr_seq by one before notifying. Returns the notifier call's result.
 */
static inline int mr_call_vif_notifiers(struct net *net,
                                        unsigned short family,
                                        enum fib_event_type event_type,
                                        struct vif_device *vif,
                                        unsigned short vif_index, u32 tb_id,
                                        unsigned int *ipmr_seq)
{
        struct vif_entry_notifier_info vif_info = {
                .info.family = family,
                .dev = vif->dev,
                .vif_index = vif_index,
                .vif_flags = vif->flags,
                .tb_id = tb_id,
        };

        ASSERT_RTNL();
        *ipmr_seq += 1;

        return call_fib_notifiers(net, event_type, &vif_info.info);
}

#ifndef MAXVIFS
/* This one is nasty; value is defined in uapi using different symbols for
 * mroute and mroute6 but both map into same 32.
 */
#define MAXVIFS        32
#endif

/* Non-zero (1) when slot @_idx of @_mrt's vif_table has a device attached,
 * i.e. its ->dev pointer is non-NULL; 0 otherwise.
 */
#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev))

/* mfc_flags (each value is a distinct bit, defined with BIT()):
 * MFC_STATIC - the entry was added statically (not by a routing daemon)
 * MFC_OFFLOAD - the entry was offloaded to the hardware
 */
enum {
        MFC_STATIC = BIT(0),
        MFC_OFFLOAD = BIT(1),
};

/**
 * struct mr_mfc - common multicast routing entries
 * @mnode: rhashtable list
 * @mfc_parent: source interface (iif)
 * @mfc_flags: entry flags
 * @mfc_un: union holding either the unresolved-entry state (@unres:
 *          @expires, @unresolved) or the resolved-entry state (@res:
 *          @last_assert through @refcount); the two share storage
 * @expires: unresolved entry expire time
 * @unresolved: unresolved cached skbs
 * @last_assert: time of last assert
 * @minvif: minimum VIF id
 * @maxvif: maximum VIF id
 * @bytes: bytes that have passed for this entry
 * @pkt: packets that have passed for this entry
 * @wrong_if: number of wrong source interface hits
 * @lastuse: time of last use of the group (traffic or update)
 * @ttls: OIF TTL threshold array
 * @refcount: reference count for this entry
 * @list: global entry list
 * @rcu: used for entry destruction
 * @free: Operation used for freeing an entry under RCU
 */
struct mr_mfc {
        struct rhlist_head mnode;
        unsigned short mfc_parent;
        int mfc_flags;

        union {
                struct {
                        unsigned long expires;
                        struct sk_buff_head unresolved;
                } unres;
                struct {
                        unsigned long last_assert;
                        int minvif;
                        int maxvif;
                        unsigned long bytes;
                        unsigned long pkt;
                        unsigned long wrong_if;
                        unsigned long lastuse;
                        unsigned char ttls[MAXVIFS];
                        refcount_t refcount;
                } res;
        } mfc_un;
        struct list_head list;
        struct rcu_head        rcu;
        void (*free)(struct rcu_head *head);
};

/* Drop one reference on @c (the mfc_un.res.refcount counter). When
 * refcount_dec_and_test() reports the last reference is gone, queue the
 * entry's ->free callback through call_rcu().
 */
static inline void mr_cache_put(struct mr_mfc *c)
{
        if (!refcount_dec_and_test(&c->mfc_un.res.refcount))
                return;

        call_rcu(&c->rcu, c->free);
}

/* Take one additional reference on @c by incrementing its
 * mfc_un.res.refcount counter; mr_cache_put() decrements the same counter.
 */
static inline void mr_cache_hold(struct mr_mfc *c)
{
        refcount_inc(&c->mfc_un.res.refcount);
}

/**
 * struct mfc_entry_notifier_info - payload of an MFC FIB notification
 * @info: generic FIB notifier header; this is what gets handed to
 *        call_fib_notifier() / call_fib_notifiers()
 * @mfc: the multicast routing entry the event refers to
 * @tb_id: table id supplied by the caller raising the event
 */
struct mfc_entry_notifier_info {
        struct fib_notifier_info info;
        struct mr_mfc *mfc;
        u32 tb_id;
};

/* Wrap @mfc and @tb_id in an mfc_entry_notifier_info (with @family and
 * @extack in its header) and deliver it to the single notifier block @nb
 * via call_fib_notifier(). Returns that call's result.
 */
static inline int mr_call_mfc_notifier(struct notifier_block *nb,
                                       unsigned short family,
                                       enum fib_event_type event_type,
                                       struct mr_mfc *mfc, u32 tb_id,
                                       struct netlink_ext_ack *extack)
{
        struct mfc_entry_notifier_info mfc_info = {
                .info.family = family,
                .info.extack = extack,
                .mfc = mfc,
                .tb_id = tb_id,
        };
        struct fib_notifier_info *hdr = &mfc_info.info;

        return call_fib_notifier(nb, event_type, hdr);
}

/* Wrap @mfc and @tb_id in an mfc_entry_notifier_info and hand it to
 * call_fib_notifiers() for @net. Asserts that RTNL is held and bumps
 * *@ipmr_seq by one before notifying. Returns the notifier call's result.
 */
static inline int mr_call_mfc_notifiers(struct net *net,
                                        unsigned short family,
                                        enum fib_event_type event_type,
                                        struct mr_mfc *mfc, u32 tb_id,
                                        unsigned int *ipmr_seq)
{
        struct mfc_entry_notifier_info mfc_info = {
                .info.family = family,
                .mfc = mfc,
                .tb_id = tb_id,
        };

        ASSERT_RTNL();
        *ipmr_seq += 1;

        return call_fib_notifiers(net, event_type, &mfc_info.info);
}

struct mr_table;

/**
 * struct mr_table_ops - callbacks and info for protocol-specific ops
 * @rht_params: parameters for accessing the MFC hash
 * @cmparg_any: a hash key to be used for matching on (*,*) routes
 *
 * A copy of this structure is embedded in each &struct mr_table as its
 * @ops member.
 */
struct mr_table_ops {
        const struct rhashtable_params *rht_params;
        void *cmparg_any;
};

/**
 * struct mr_table - a multicast routing table
 * @list: entry within a list of multicast routing tables
 * @net: net where this table belongs
 * @ops: protocol specific operations
 * @id: identifier of the table
 * @mroute_sk: socket associated with the table
 * @ipmr_expire_timer: timer for handling unresolved routes
 * @mfc_unres_queue: list of unresolved MFC entries
 * @vif_table: array containing all possible vifs
 * @mfc_hash: Hash table of all resolved routes for easy lookup
 * @mfc_cache_list: list of resolved routes for possible traversal
 * @maxvif: Identifier of highest value vif currently in use
 * @cache_resolve_queue_len: current size of unresolved queue
 * @mroute_do_assert: Whether to inform userspace on wrong ingress
 * @mroute_do_pim: Whether to receive IGMP PIMv1
 * @mroute_do_wrvifwhole: NOTE(review): not documented in this header;
 *                        presumably selects reporting of the whole packet
 *                        on wrong-vif events - confirm against the users
 * @mroute_reg_vif_num: PIM-device vif index
 */
struct mr_table {
        struct list_head        list;
        possible_net_t                net;
        struct mr_table_ops        ops;
        u32                        id;
        struct sock __rcu        *mroute_sk;
        struct timer_list        ipmr_expire_timer;
        struct list_head        mfc_unres_queue;
        struct vif_device        vif_table[MAXVIFS];
        struct rhltable                mfc_hash;
        struct list_head        mfc_cache_list;
        int                        maxvif;
        atomic_t                cache_resolve_queue_len;
        bool                        mroute_do_assert;
        bool                        mroute_do_pim;
        bool                        mroute_do_wrvifwhole;
        int                        mroute_reg_vif_num;
};

#ifdef CONFIG_IP_MROUTE_COMMON
/* Out-of-line helpers, only declared when CONFIG_IP_MROUTE_COMMON is set.
 * The #else branch provides inline stubs for most of them; note that
 * mr_table_alloc() and mr_table_dump() have no stub there.
 */
void vif_device_init(struct vif_device *v,
                     struct net_device *dev,
                     unsigned long rate_limit,
                     unsigned char threshold,
                     unsigned short flags,
                     unsigned short get_iflink_mask);

struct mr_table *
mr_table_alloc(struct net *net, u32 id,
               struct mr_table_ops *ops,
               void (*expire_func)(struct timer_list *t),
               void (*table_set)(struct mr_table *mrt,
                                 struct net *net));

/* These actually return 'struct mr_mfc *', but to avoid the need for
 * explicit casts they simply return 'void *'.
 */
void *mr_mfc_find_parent(struct mr_table *mrt,
                         void *hasharg, int parent);
void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi);
void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);

/* Netlink fill/dump helpers; @fill is the per-entry callback supplied by
 * the protocol and @lock is a spinlock supplied by the caller.
 */
int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                   struct mr_mfc *c, struct rtmsg *rtm);
int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
                  struct netlink_callback *cb,
                  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
                              u32 portid, u32 seq, struct mr_mfc *c,
                              int cmd, int flags),
                  spinlock_t *lock, struct fib_dump_filter *filter);
int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
                     struct mr_table *(*iter)(struct net *net,
                                              struct mr_table *mrt),
                     int (*fill)(struct mr_table *mrt,
                                 struct sk_buff *skb,
                                 u32 portid, u32 seq, struct mr_mfc *c,
                                 int cmd, int flags),
                     spinlock_t *lock, struct fib_dump_filter *filter);

/* Notifier dump helper; @rules_dump and @mr_iter are callbacks supplied by
 * the protocol and @mrt_lock is an rwlock supplied by the caller.
 */
int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
            int (*rules_dump)(struct net *net,
                              struct notifier_block *nb,
                              struct netlink_ext_ack *extack),
            struct mr_table *(*mr_iter)(struct net *net,
                                        struct mr_table *mrt),
            rwlock_t *mrt_lock, struct netlink_ext_ack *extack);
#else
/* Stub for builds without CONFIG_IP_MROUTE_COMMON: does nothing and leaves
 * @v untouched.
 */
static inline void vif_device_init(struct vif_device *v,
                                   struct net_device *dev,
                                   unsigned long rate_limit,
                                   unsigned char threshold,
                                   unsigned short flags,
                                   unsigned short get_iflink_mask)
{
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: never finds an entry,
 * always returns NULL.
 */
static inline void *mr_mfc_find_parent(struct mr_table *mrt,
                                       void *hasharg, int parent)
{
        return NULL;
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: never finds an entry,
 * always returns NULL.
 */
static inline void *mr_mfc_find_any_parent(struct mr_table *mrt,
                                           int vifi)
{
        return NULL;
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: never finds an entry,
 * always returns NULL.
 *
 * The return type is 'void *' so that the stub has the same signature as
 * the out-of-line mr_mfc_find_any() prototype and as the sibling
 * mr_mfc_find_parent() / mr_mfc_find_any_parent() stubs; callers can then
 * assign the result to their protocol-specific entry type identically in
 * both configurations.
 */
static inline void *mr_mfc_find_any(struct mr_table *mrt,
                                    int vifi, void *hasharg)
{
        return NULL;
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: fills nothing and
 * always fails with -EINVAL.
 */
static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                                 struct mr_mfc *c, struct rtmsg *rtm)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: dumps nothing, never
 * calls @iter or @fill, and always fails with -EINVAL.
 */
static inline int
mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
                 struct mr_table *(*iter)(struct net *net,
                                          struct mr_table *mrt),
                 int (*fill)(struct mr_table *mrt,
                             struct sk_buff *skb,
                             u32 portid, u32 seq, struct mr_mfc *c,
                             int cmd, int flags),
                 spinlock_t *lock, struct fib_dump_filter *filter)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_IP_MROUTE_COMMON: dumps nothing, never
 * calls @rules_dump or @mr_iter, and always fails with -EINVAL.
 */
static inline int mr_dump(struct net *net, struct notifier_block *nb,
                          unsigned short family,
                          int (*rules_dump)(struct net *net,
                                            struct notifier_block *nb,
                                            struct netlink_ext_ack *extack),
                          struct mr_table *(*mr_iter)(struct net *net,
                                                      struct mr_table *mrt),
                          rwlock_t *mrt_lock, struct netlink_ext_ack *extack)
{
        return -EINVAL;
}
#endif

/* Look up @hasharg in @mrt through mr_mfc_find_parent(), passing -1 as the
 * parent argument (NOTE(review): presumably "no parent restriction" -
 * confirm against the mr_mfc_find_parent() implementation). Returns
 * whatever mr_mfc_find_parent() returns.
 */
static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg)
{
        return mr_mfc_find_parent(mrt, hasharg, -1);
}

#ifdef CONFIG_PROC_FS
/* seq_file iterator state for walking vifs; handed to mr_vif_seq_idx()
 * through seq->private.
 * NOTE(review): @ct looks like the current position in the vif table -
 * confirm against mr_vif_seq_idx()/mr_vif_seq_next().
 */
struct mr_vif_iter {
        struct seq_net_private p;
        struct mr_table *mrt;
        int ct;
};

/* seq_file iterator state for walking MFC entries, kept in seq->private.
 * @cache records which list of @mrt the walk is currently on:
 * mr_mfc_seq_stop() compares it with &mrt->mfc_unres_queue and
 * &mrt->mfc_cache_list to decide what to unlock; mr_mfc_seq_start()
 * resets it to NULL.
 */
struct mr_mfc_iter {
        struct seq_net_private p;
        struct mr_table *mrt;
        struct list_head *cache;

        /* Lock protecting the mr_table's unresolved queue */
        spinlock_t *lock;
};

#ifdef CONFIG_IP_MROUTE_COMMON
/* Out-of-line vif seq_file helpers (CONFIG_PROC_FS with
 * CONFIG_IP_MROUTE_COMMON); the #else branch has stubs that return NULL.
 */
void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos);
void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos);

/* seq_file ->start helper for the vif listing: position 0 yields
 * SEQ_START_TOKEN; any other position is looked up with mr_vif_seq_idx()
 * at index *@pos - 1.
 */
static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (!*pos)
                return SEQ_START_TOKEN;

        return mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1);
}

/* These actually return 'struct mr_mfc *', but to avoid the need for
 * explicit casts they simply return 'void *'.
 */
void *mr_mfc_seq_idx(struct net *net,
                     struct mr_mfc_iter *it, loff_t pos);
void *mr_mfc_seq_next(struct seq_file *seq, void *v,
                      loff_t *pos);

/* seq_file ->start helper for the MFC listing. Resets the iterator stored
 * in seq->private (table, cleared cache pointer, lock), then returns
 * SEQ_START_TOKEN for position 0 or the result of mr_mfc_seq_idx() at
 * index *@pos - 1 otherwise.
 */
static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
                                     struct mr_table *mrt, spinlock_t *lock)
{
        struct mr_mfc_iter *iter = seq->private;

        iter->mrt = mrt;
        iter->cache = NULL;
        iter->lock = lock;

        if (!*pos)
                return SEQ_START_TOKEN;

        return mr_mfc_seq_idx(seq_file_net(seq), iter, *pos - 1);
}

/* seq_file ->stop helper for the MFC listing. Depending on which list the
 * iterator's cache pointer refers to, either calls spin_unlock_bh() on the
 * iterator's lock (unresolved queue) or rcu_read_unlock() (resolved cache
 * list). Does nothing when the cache pointer matches neither list.
 */
static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct mr_mfc_iter *iter = seq->private;
        struct mr_table *table = iter->mrt;
        struct list_head *active = iter->cache;

        if (active == &table->mfc_unres_queue) {
                spin_unlock_bh(iter->lock);
                return;
        }

        if (active == &table->mfc_cache_list)
                rcu_read_unlock();
}
#else
/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL. */
static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter,
                                   loff_t pos)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL and
 * *@pos is left unchanged.
 */
static inline void *mr_vif_seq_next(struct seq_file *seq,
                                    void *v, loff_t *pos)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL; unlike
 * the real helper it never returns SEQ_START_TOKEN.
 */
static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL. */
static inline void *mr_mfc_seq_idx(struct net *net,
                                   struct mr_mfc_iter *it, loff_t pos)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL and
 * *@pos is left unchanged.
 */
static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v,
                                    loff_t *pos)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): always NULL; the
 * iterator in seq->private is not initialised and @lock is not used.
 */
static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos,
                                     struct mr_table *mrt, spinlock_t *lock)
{
        return NULL;
}

/* Stub (CONFIG_PROC_FS without CONFIG_IP_MROUTE_COMMON): nothing to unlock,
 * so this does nothing.
 */
static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v)
{
}
#endif
#endif
#endif




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This is <linux/capability.h>
 *
 * Andrew G. Morgan <morgan@kernel.org>
 * Alexander Kjeldaas <astor@guardian.no>
 * with help from Aleph1, Roland Buresund and Andrew Main.
 *
 * See here for the libcap library ("POSIX draft" compliance):
 *
 * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
 */
#ifndef _LINUX_CAPABILITY_H
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
#include <linux/uidgid.h>

/* The in-kernel capability representation follows version 3 of the uapi
 * definitions: _KERNEL_CAPABILITY_U32S is the number of 32-bit words in a
 * kernel_cap_t (the code below requires it to be 2).
 */
#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
#define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3

/* Defined elsewhere; only declared here. */
extern int file_caps_enabled;

/* A capability set: an array of _KERNEL_CAPABILITY_U32S 32-bit words.
 * Individual capabilities are addressed with CAP_TO_INDEX() (word) and
 * CAP_TO_MASK() (bit within the word), see cap_raise()/cap_lower().
 */
typedef struct kernel_cap_struct {
        __u32 cap[_KERNEL_CAPABILITY_U32S];
} kernel_cap_t;

/* Same as vfs_ns_cap_data but in cpu endian and always filled completely.
 * This is the structure get_vfs_caps_from_disk() fills in for its caller.
 */
struct cpu_vfs_cap_data {
        __u32 magic_etc;
        kernel_cap_t permitted;
        kernel_cap_t inheritable;
        kuid_t rootid;
};

/* Byte sizes of the userspace capability header and of a kernel_cap_t. */
#define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))


/* Forward declarations: only pointers to these types are used below. */
struct file;
struct inode;
struct dentry;
struct task_struct;
struct user_namespace;

/* Constant capability sets defined elsewhere; cap_clear() assigns
 * __cap_empty_set.
 */
extern const kernel_cap_t __cap_empty_set;
extern const kernel_cap_t __cap_init_eff_set;

/*
 * Internal kernel functions only
 */

/* Loop header iterating @__capi over every word index of a kernel_cap_t,
 * from 0 to _KERNEL_CAPABILITY_U32S - 1. @__capi must be an integer
 * variable declared by the caller.
 */
#define CAP_FOR_EACH_U32(__capi)  \
        for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi)

/*
 * CAP_FS_MASK and CAP_NFSD_MASKS:
 *
 * The fs mask is all the privileges that fsuid==0 historically meant.
 * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE.
 *
 * It has never meant setting security.* and trusted.* xattrs.
 *
 * We could also define fsmask as follows:
 *   1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions
 *   2. The security.* and trusted.* xattrs are fs-related MAC permissions
 *
 * CAP_FS_MASK_B0 and CAP_FS_MASK_B1 are the contributions to the first and
 * second 32-bit word of a kernel_cap_t respectively; CAP_FS_SET and
 * CAP_NFSD_SET are built from them.
 */

# define CAP_FS_MASK_B0     (CAP_TO_MASK(CAP_CHOWN)                \
                            | CAP_TO_MASK(CAP_MKNOD)                \
                            | CAP_TO_MASK(CAP_DAC_OVERRIDE)        \
                            | CAP_TO_MASK(CAP_DAC_READ_SEARCH)        \
                            | CAP_TO_MASK(CAP_FOWNER)                \
                            | CAP_TO_MASK(CAP_FSETID))

# define CAP_FS_MASK_B1     (CAP_TO_MASK(CAP_MAC_OVERRIDE))

/* The initializers below spell out exactly two words per kernel_cap_t, so
 * the build is stopped if the word count ever changes.
 */
#if _KERNEL_CAPABILITY_U32S != 2
# error Fix up hand-coded capability macro initializers
#else /* HAND-CODED capability initializers */

/* Index of the last word, and the mask applied to that word in CAP_FULL_SET
 * (derived from CAP_LAST_CAP).
 */
#define CAP_LAST_U32                        ((_KERNEL_CAPABILITY_U32S) - 1)
#define CAP_LAST_U32_VALID_MASK                (CAP_TO_MASK(CAP_LAST_CAP + 1) -1)

/* Compound-literal capability sets:
 *   CAP_EMPTY_SET - both words zero
 *   CAP_FULL_SET  - first word all ones, second word CAP_LAST_U32_VALID_MASK
 *   CAP_FS_SET    - the fs masks plus CAP_LINUX_IMMUTABLE
 *   CAP_NFSD_SET  - the fs masks plus CAP_SYS_RESOURCE
 */
# define CAP_EMPTY_SET    ((kernel_cap_t){{ 0, 0 }})
# define CAP_FULL_SET     ((kernel_cap_t){{ ~0, CAP_LAST_U32_VALID_MASK }})
# define CAP_FS_SET       ((kernel_cap_t){{ CAP_FS_MASK_B0 \
                                    | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \
                                    CAP_FS_MASK_B1 } })
# define CAP_NFSD_SET     ((kernel_cap_t){{ CAP_FS_MASK_B0 \
                                    | CAP_TO_MASK(CAP_SYS_RESOURCE), \
                                    CAP_FS_MASK_B1 } })

#endif /* _KERNEL_CAPABILITY_U32S != 2 */

/* Reset set @c to the empty set (assigns __cap_empty_set). */
# define cap_clear(c)         do { (c) = __cap_empty_set; } while (0)

/* Single-capability operations on set @c for capability number @flag:
 *   cap_raise()  - set the bit
 *   cap_lower()  - clear the bit
 *   cap_raised() - non-zero if the bit is set (the masked word, not 0/1)
 * @flag is evaluated twice in each macro.
 */
#define cap_raise(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
#define cap_lower(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag))
#define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag))

/* Word-wise binary operation on capability sets: for every word index,
 * c.cap[i] = a.cap[i] OP b.cap[i]. @c must be an lvalue; @a and @b may be
 * any expressions of capability-set type (the arguments are parenthesized
 * so that e.g. a dereferenced pointer works). @OP is pasted verbatim and
 * may be a token sequence such as '&~'. Arguments are evaluated once per
 * word, so they must not have side effects.
 */
#define CAP_BOP_ALL(c, a, b, OP)                                        \
do {                                                                    \
        unsigned __capi;                                                \
        CAP_FOR_EACH_U32(__capi) {                                      \
                (c).cap[__capi] = (a).cap[__capi] OP (b).cap[__capi];   \
        }                                                               \
} while (0)

/* Word-wise unary operation on a capability set: for every word index,
 * c.cap[i] = OP a.cap[i]. @c must be an lvalue; @a may be any expression
 * of capability-set type (the arguments are parenthesized so that e.g. a
 * dereferenced pointer works). Arguments are evaluated once per word, so
 * they must not have side effects.
 */
#define CAP_UOP_ALL(c, a, OP)                                           \
do {                                                                    \
        unsigned __capi;                                                \
        CAP_FOR_EACH_U32(__capi) {                                      \
                (c).cap[__capi] = OP (a).cap[__capi];                   \
        }                                                               \
} while (0)

/* Union of two capability sets: every word of the result is the bitwise
 * OR of the corresponding words of @a and @b.
 */
static inline kernel_cap_t cap_combine(const kernel_cap_t a,
                                       const kernel_cap_t b)
{
        kernel_cap_t merged;

        CAP_BOP_ALL(merged, a, b, |);

        return merged;
}

/* Intersection of two capability sets: every word of the result is the
 * bitwise AND of the corresponding words of @a and @b.
 */
static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
                                         const kernel_cap_t b)
{
        kernel_cap_t common;

        CAP_BOP_ALL(common, a, b, &);

        return common;
}

/* Set difference: returns @a with every capability that is present in
 * @drop cleared (word-wise a & ~drop).
 */
static inline kernel_cap_t cap_drop(const kernel_cap_t a,
                                    const kernel_cap_t drop)
{
        kernel_cap_t remaining;

        CAP_BOP_ALL(remaining, a, drop, &~);

        return remaining;
}

/* Complement of a capability set: every word of the result is the bitwise
 * NOT of the corresponding word of @c. No masking of unused bits is done.
 */
static inline kernel_cap_t cap_invert(const kernel_cap_t c)
{
        kernel_cap_t flipped;

        CAP_UOP_ALL(flipped, c, ~);

        return flipped;
}

/* Returns true when no capability is set in @a, i.e. every word is zero;
 * returns false as soon as a non-zero word is found.
 */
static inline bool cap_isclear(const kernel_cap_t a)
{
        unsigned word;

        CAP_FOR_EACH_U32(word) {
                if (a.cap[word])
                        return false;
        }

        return true;
}

/*
 * Check if "a" is a subset of "set".
 * return true if ALL of the capabilities in "a" are also in "set"
 *        cap_issubset(0101, 1111) will return true
 * return false if ANY of the capabilities in "a" are not in "set"
 *        cap_issubset(1111, 0101) will return false
 *
 * Implemented as "a with everything in set removed is empty".
 */
static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
{
        return cap_isclear(cap_drop(a, set));
}

/* Used to decide between falling back on the old suser() or fsuser(). */

/* Returns @a with every capability of CAP_FS_SET removed. */
static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
{
        const kernel_cap_t fs_caps = CAP_FS_SET;

        return cap_drop(a, fs_caps);
}

/* Returns @a plus those capabilities of CAP_FS_SET that are also present
 * in @permitted.
 */
static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
                                            const kernel_cap_t permitted)
{
        const kernel_cap_t fs_caps = CAP_FS_SET;
        const kernel_cap_t allowed = cap_intersect(permitted, fs_caps);

        return cap_combine(a, allowed);
}

/* Returns @a with every capability of CAP_NFSD_SET removed. */
static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
{
        const kernel_cap_t nfsd_caps = CAP_NFSD_SET;

        return cap_drop(a, nfsd_caps);
}

/* Returns @a plus those capabilities of CAP_NFSD_SET that are also present
 * in @permitted.
 */
static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
                                              const kernel_cap_t permitted)
{
        const kernel_cap_t nfsd_caps = CAP_NFSD_SET;
        const kernel_cap_t allowed = cap_intersect(permitted, nfsd_caps);

        return cap_combine(a, allowed);
}

#ifdef CONFIG_MULTIUSER
/* Capability checks, implemented out of line when CONFIG_MULTIUSER is set.
 * Without CONFIG_MULTIUSER the inline versions in the #else branch are
 * used instead, and those always return true.
 */
extern bool has_capability(struct task_struct *t, int cap);
extern bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap);
extern bool has_capability_noaudit(struct task_struct *t, int cap);
extern bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
extern bool ns_capable_setid(struct user_namespace *ns, int cap);
#else
/* !CONFIG_MULTIUSER: every task has every capability - always true. */
static inline bool has_capability(struct task_struct *t, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @ns and @cap are ignored. */
static inline bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @t and @cap are ignored. */
static inline bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, all arguments are ignored. */
static inline bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @cap is ignored. */
static inline bool capable(int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @ns and @cap are ignored. */
static inline bool ns_capable(struct user_namespace *ns, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @ns and @cap are ignored. */
static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        return true;
}
/* !CONFIG_MULTIUSER: always true, @ns and @cap are ignored. */
static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        return true;
}
#endif /* CONFIG_MULTIUSER */
extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/*
 * True when capable() succeeds for CAP_PERFMON or, failing that, for
 * CAP_SYS_ADMIN.  CAP_SYS_ADMIN is only queried if CAP_PERFMON was refused.
 */
static inline bool perfmon_capable(void)
{
        if (capable(CAP_PERFMON))
                return true;

        return capable(CAP_SYS_ADMIN);
}

/*
 * True when capable() succeeds for CAP_BPF or, failing that, for
 * CAP_SYS_ADMIN.  CAP_SYS_ADMIN is only queried if CAP_BPF was refused.
 */
static inline bool bpf_capable(void)
{
        if (capable(CAP_BPF))
                return true;

        return capable(CAP_SYS_ADMIN);
}

/*
 * True when ns_capable() succeeds in @ns for CAP_CHECKPOINT_RESTORE or,
 * failing that, for CAP_SYS_ADMIN.  The second query is only made if the
 * first one was refused.
 */
static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
{
        if (ns_capable(ns, CAP_CHECKPOINT_RESTORE))
                return true;

        return ns_capable(ns, CAP_SYS_ADMIN);
}

/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);

extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size);

#endif /* !_LINUX_CAPABILITY_H */
























































    1 

    1 
    1 
    1 



    1 
































































































































































































































































    1 





    1 
    1 





    1 
    1 




































































































































































































































































    1 













































































    1 







    1 
    1 


    1 
    1 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/isofs/rock.c
 *
 *  (C) 1992, 1993  Eric Youngdale
 *
 *  Rock Ridge Extensions to iso9660
 */

#include <linux/slab.h>
#include <linux/pagemap.h>

#include "isofs.h"
#include "rock.h"

/*
 * These functions are designed to read the system areas of a directory record
 * and extract relevant information.  There are different functions provided
 * depending upon what information we need at the time.  One function fills
 * out an inode structure, a second one extracts a filename, a third one
 * returns a symbolic link name, and a fourth one returns the extent number
 * for the file.
 */

/*
 * Build the integer key for a two-character record signature: A occupies
 * the low byte and B the next byte up.  The parsers in this file compute
 * isonum_721() over the first two bytes of a record and switch on the
 * result, using SIG() for the case labels.
 */
#define SIG(A,B) ((A) | ((B) << 8))        /* isonum_721() */

/*
 * Cursor state for one scan over the Rock Ridge records of a directory
 * record, including any continuation area announced by a CE record.
 */
struct rock_state {
        void *buffer;           /* kmalloc()ed copy of the continuation area being parsed, or NULL */
        unsigned char *chr;     /* next record to parse */
        int len;                /* bytes remaining at chr */
        int cont_size;          /* CE record: size of the pending continuation area */
        int cont_extent;        /* CE record: block holding it; 0 means none pending */
        int cont_offset;        /* CE record: byte offset inside that block */
        int cont_loops;         /* continuation areas rock_continue() has tried to follow */
        struct inode *inode;    /* inode being processed; supplies superblock and block size */
};

/*
 * Validate an SP record: its two magic bytes must be 0xbe, 0xef.  On a
 * match the record's skip value is stored in the superblock's
 * s_rock_offset and 0 is returned; otherwise -1 is returned and nothing
 * is modified.
 */

static int check_sp(struct rock_ridge *rr, struct inode *inode)
{
        if (rr->u.SP.magic[0] != 0xbe || rr->u.SP.magic[1] != 0xef)
                return -1;

        ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip;
        return 0;
}

static void setup_rock_ridge(struct iso_directory_record *de,
                        struct inode *inode, struct rock_state *rs)
{
        rs->len = sizeof(struct iso_directory_record) + de->name_len[0];
        if (rs->len & 1)
                (rs->len)++;
        rs->chr = (unsigned char *)de + rs->len;
        rs->len = *((unsigned char *)de) - rs->len;
        if (rs->len < 0)
                rs->len = 0;

        if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) {
                rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset;
                rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset;
                if (rs->len < 0)
                        rs->len = 0;
        }
}

static void init_rock_state(struct rock_state *rs, struct inode *inode)
{
        memset(rs, 0, sizeof(*rs));
        rs->inode = inode;
}

/*
 * Maximum number of Rock Ridge continuation entries.  rock_continue()
 * counts each continuation area it tries to follow in rs->cont_loops and
 * fails with -EIO once that count reaches this limit, so a chain of CE
 * records cannot be followed indefinitely.
 */
#define RR_MAX_CE_ENTRIES 32

/*
 * Move the scan on to the continuation area recorded in @rs, if any.
 *
 * Any previously held rs->buffer is released first.  When a continuation
 * extent is pending, its bytes are copied out of the block into a freshly
 * allocated rs->buffer and rs->chr / rs->len are pointed at the copy.
 *
 * Returns 0 if the caller should continue scanning, 1 if the scan must end
 * (no continuation pending) and -ve on error: -EIO for a continuation that
 * does not fit in one block, too many continuations or a failed read,
 * -ENOMEM if the copy cannot be allocated.  rs->buffer is NULL on every
 * non-zero return.
 */
static int rock_continue(struct rock_state *rs)
{
        const int blocksize = 1 << rs->inode->i_blkbits;
        const int min_de_size = offsetof(struct rock_ridge, u);
        struct buffer_head *bh;

        kfree(rs->buffer);
        rs->buffer = NULL;

        /* The continuation area must lie entirely inside one block. */
        if ((unsigned)rs->cont_offset > blocksize - min_de_size ||
            (unsigned)rs->cont_size > blocksize ||
            (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) {
                printk(KERN_NOTICE "rock: corrupted directory entry. "
                        "extent=%d, offset=%d, size=%d\n",
                        rs->cont_extent, rs->cont_offset, rs->cont_size);
                return -EIO;
        }

        /* No CE record was seen since the last call: the scan is over. */
        if (!rs->cont_extent)
                return 1;

        rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL);
        if (!rs->buffer)
                return -ENOMEM;

        if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
                goto fail;

        bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
        if (!bh) {
                printk("Unable to read rock-ridge attributes\n");
                goto fail;
        }

        memcpy(rs->buffer, bh->b_data + rs->cont_offset, rs->cont_size);
        put_bh(bh);

        /* Scan the copy next, and forget the continuation just consumed. */
        rs->chr = rs->buffer;
        rs->len = rs->cont_size;
        rs->cont_extent = 0;
        rs->cont_size = 0;
        rs->cont_offset = 0;
        return 0;

fail:
        kfree(rs->buffer);
        rs->buffer = NULL;
        return -EIO;
}

/*
 * Size of the type-specific payload structure for a record with signature
 * `sig', or 0 for a signature not listed here.
 */
static int rock_payload_size(int sig)
{
        switch (sig) {
        case SIG('S', 'P'):
                return sizeof(struct SU_SP_s);
        case SIG('C', 'E'):
                return sizeof(struct SU_CE_s);
        case SIG('E', 'R'):
                return sizeof(struct SU_ER_s);
        case SIG('R', 'R'):
                return sizeof(struct RR_RR_s);
        case SIG('P', 'X'):
                return sizeof(struct RR_PX_s);
        case SIG('P', 'N'):
                return sizeof(struct RR_PN_s);
        case SIG('S', 'L'):
                return sizeof(struct RR_SL_s);
        case SIG('N', 'M'):
                return sizeof(struct RR_NM_s);
        case SIG('C', 'L'):
                return sizeof(struct RR_CL_s);
        case SIG('P', 'L'):
                return sizeof(struct RR_PL_s);
        case SIG('T', 'F'):
                return sizeof(struct RR_TF_s);
        case SIG('Z', 'F'):
                return sizeof(struct RR_ZF_s);
        default:
                return 0;
        }
}

/*
 * We think there's a record of type `sig' at rs->chr.  Check that the
 * common record header plus the payload structure for that type fits in
 * the rs->len bytes that remain.  Returns 0 if it fits, otherwise logs the
 * problem and returns -EIO.
 */
static int rock_check_overflow(struct rock_state *rs, int sig)
{
        const int needed = rock_payload_size(sig) +
                           offsetof(struct rock_ridge, u);

        if (needed <= rs->len)
                return 0;

        printk(KERN_NOTICE "rock: directory entry would overflow "
                        "storage\n");
        printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n",
                        sig, needed, rs->len);
        return -EIO;
}

/*
 * Collect the Rock Ridge name (NM records) of directory record @de into
 * @retname.
 *
 * @de:      directory record whose Rock Ridge records are scanned
 * @retname: output buffer; receives the NUL-terminated name.  Pieces are
 *           appended only while the accumulated length stays below 254,
 *           so the buffer presumably has to hold at least that much --
 *           confirm against the callers.
 * @inode:   supplies the superblock (s_rock, s_rock_offset) and block size
 *
 * return length of name field; 0: not found (also when Rock Ridge is off
 * for this superblock or the Rock Ridge data is ignored as bogus),
 * -1: to be ignored (an RE record was seen); other negative values are
 * errors from rock_check_overflow() / rock_continue().
 */
int get_rock_ridge_filename(struct iso_directory_record *de,
                            char *retname, struct inode *inode)
{
        struct rock_state rs;
        struct rock_ridge *rr;
        int sig;
        int retnamlen = 0;
        int truncate = 0;
        int ret = 0;
        char *p;
        int len;

        if (!ISOFS_SB(inode->i_sb)->s_rock)
                return 0;
        *retname = 0;

        init_rock_state(&rs, inode);
        setup_rock_ridge(de, inode, &rs);
repeat:

        while (rs.len > 2) { /* There may be one byte for padding somewhere */
                rr = (struct rock_ridge *)rs.chr;
                /*
                 * Ignore rock ridge info if rr->len is out of range, but
                 * don't return -EIO because that would make the file
                 * invisible.
                 */
                if (rr->len < 3)
                        goto out;        /* Something got screwed up here */
                sig = isonum_721(rs.chr);
                if (rock_check_overflow(&rs, sig))
                        goto eio;
                /* Step the cursor past this record; rr still points at it. */
                rs.chr += rr->len;
                rs.len -= rr->len;
                /*
                 * As above, just ignore the rock ridge info if rr->len
                 * is bogus.
                 */
                if (rs.len < 0)
                        goto out;        /* Something got screwed up here */

                switch (sig) {
                case SIG('R', 'R'):
                        /* RR record without the NM bit: stop looking. */
                        if ((rr->u.RR.flags[0] & RR_NM) == 0)
                                goto out;
                        break;
                case SIG('S', 'P'):
                        if (check_sp(rr, inode))
                                goto out;
                        break;
                case SIG('C', 'E'):
                        /* Remember the continuation area for rock_continue(). */
                        rs.cont_extent = isonum_733(rr->u.CE.extent);
                        rs.cont_offset = isonum_733(rr->u.CE.offset);
                        rs.cont_size = isonum_733(rr->u.CE.size);
                        break;
                case SIG('N', 'M'):
                        /* Once a piece did not fit, all later pieces are dropped. */
                        if (truncate)
                                break;
                        if (rr->len < 5)
                                break;
                        /*
                         * If the flags are 2 or 4, this indicates '.' or '..'.
                         * We don't want to do anything with this, because it
                         * screws up the code that calls us.  We don't really
                         * care anyways, since we can just use the non-RR
                         * name.
                         */
                        if (rr->u.NM.flags & 6)
                                break;

                        if (rr->u.NM.flags & ~1) {
                                printk("Unsupported NM flag settings (%d)\n",
                                        rr->u.NM.flags);
                                break;
                        }
                        /* Name bytes are what is left of the record after 5 header bytes. */
                        len = rr->len - 5;
                        if (retnamlen + len >= 254) {
                                truncate = 1;
                                break;
                        }
                        /* An embedded NUL ends this piece early. */
                        p = memchr(rr->u.NM.name, '\0', len);
                        if (unlikely(p))
                                len = p - rr->u.NM.name;
                        memcpy(retname + retnamlen, rr->u.NM.name, len);
                        retnamlen += len;
                        retname[retnamlen] = '\0';
                        break;
                case SIG('R', 'E'):
                        kfree(rs.buffer);
                        return -1;
                default:
                        break;
                }
        }
        /* Current area exhausted: follow a pending continuation, if any. */
        ret = rock_continue(&rs);
        if (ret == 0)
                goto repeat;
        if (ret == 1)
                return retnamlen; /* If 0, this file did not have a NM field */
out:
        kfree(rs.buffer);
        return ret;
eio:
        ret = -EIO;
        goto out;
}

/*
 * Flag bits for parse_rock_ridge_inode_internal().
 *
 * RR_REGARD_XA: skip 14 further bytes before the first record is parsed.
 * RR_RELOC_DE:  set by parse_rock_ridge_inode() when its `relocated'
 *               argument is non-zero; a CL record met while this bit is
 *               set is rejected with -EIO.
 */
#define RR_REGARD_XA 1
#define RR_RELOC_DE 2

/*
 * Walk the Rock Ridge records attached to directory record @de and apply
 * what they describe (mode, ownership, device number, time stamps,
 * symlink length, relocation, compression parameters) to @inode.
 *
 * @de:    directory record to scan
 * @inode: inode being filled in; also supplies the superblock
 * @flags: RR_REGARD_XA to skip 14 bytes before the first record,
 *         RR_RELOC_DE to reject CL records (recursive relocation)
 *
 * Returns 0 on success, and also when Rock Ridge is disabled or the data
 * is ignored as bogus; a negative errno (-EIO, or whatever
 * rock_continue() / isofs_iget_reloc() report) on failure.
 */
static int
parse_rock_ridge_inode_internal(struct iso_directory_record *de,
                                struct inode *inode, int flags)
{
        int symlink_len = 0;
        int cnt, sig;
        unsigned int reloc_block;
        struct inode *reloc;
        struct rock_ridge *rr;
        int rootflag;
        struct rock_state rs;
        int ret = 0;

        if (!ISOFS_SB(inode->i_sb)->s_rock)
                return 0;

        init_rock_state(&rs, inode);
        setup_rock_ridge(de, inode, &rs);
        if (flags & RR_REGARD_XA) {
                rs.chr += 14;
                rs.len -= 14;
                if (rs.len < 0)
                        rs.len = 0;
        }

repeat:
        while (rs.len > 2) { /* There may be one byte for padding somewhere */
                rr = (struct rock_ridge *)rs.chr;
                /*
                 * Ignore rock ridge info if rr->len is out of range, but
                 * don't return -EIO because that would make the file
                 * invisible.
                 */
                if (rr->len < 3)
                        goto out;        /* Something got screwed up here */
                sig = isonum_721(rs.chr);
                if (rock_check_overflow(&rs, sig))
                        goto eio;
                /* Step the cursor past this record; rr still points at it. */
                rs.chr += rr->len;
                rs.len -= rr->len;
                /*
                 * As above, just ignore the rock ridge info if rr->len
                 * is bogus.
                 */
                if (rs.len < 0)
                        goto out;        /* Something got screwed up here */

                switch (sig) {
#ifndef CONFIG_ZISOFS                /* No flag for SF or ZF */
                case SIG('R', 'R'):
                        if ((rr->u.RR.flags[0] &
                             (RR_PX | RR_TF | RR_SL | RR_CL)) == 0)
                                goto out;
                        break;
#endif
                case SIG('S', 'P'):
                        if (check_sp(rr, inode))
                                goto out;
                        break;
                case SIG('C', 'E'):
                        /* Remember the continuation area for rock_continue(). */
                        rs.cont_extent = isonum_733(rr->u.CE.extent);
                        rs.cont_offset = isonum_733(rr->u.CE.offset);
                        rs.cont_size = isonum_733(rr->u.CE.size);
                        break;
                case SIG('E', 'R'):
                        /* Invalid length of ER tag id? */
                        if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
                                goto out;
                        ISOFS_SB(inode->i_sb)->s_rock = 1;
                        printk(KERN_DEBUG "ISO 9660 Extensions: ");
                        {
                                int p;
                                for (p = 0; p < rr->u.ER.len_id; p++)
                                        printk(KERN_CONT "%c", rr->u.ER.data[p]);
                        }
                        printk(KERN_CONT "\n");
                        break;
                case SIG('P', 'X'):
                        inode->i_mode = isonum_733(rr->u.PX.mode);
                        set_nlink(inode, isonum_733(rr->u.PX.n_links));
                        i_uid_write(inode, isonum_733(rr->u.PX.uid));
                        i_gid_write(inode, isonum_733(rr->u.PX.gid));
                        break;
                case SIG('P', 'N'):
                        {
                                int high, low;
                                high = isonum_733(rr->u.PN.dev_high);
                                low = isonum_733(rr->u.PN.dev_low);
                                /*
                                 * The Rock Ridge standard specifies that if
                                 * sizeof(dev_t) <= 4, then the high field is
                                 * unused, and the device number is completely
                                 * stored in the low field.  Some writers may
                                 * ignore this subtlety,
                                 * and as a result we test to see if the entire
                                 * device number is
                                 * stored in the low field, and use that.
                                 */
                                if ((low & ~0xff) && high == 0) {
                                        inode->i_rdev =
                                            MKDEV(low >> 8, low & 0xff);
                                } else {
                                        inode->i_rdev =
                                            MKDEV(high, low);
                                }
                        }
                        break;
                case SIG('T', 'F'): {
                        /*
                         * The time stamps are a variable-length tail of the
                         * record, so rock_check_overflow() has only vouched
                         * for the flags byte.  Track how many bytes of this
                         * record are left for stamps and never read a stamp
                         * that would extend past rr->len.
                         */
                        const int stamp_size = sizeof(rr->u.TF.times[0]);
                        int stamps_len = rr->len -
                                (int)offsetof(struct rock_ridge, u.TF.times);

                        /*
                         * Some RRIP writers incorrectly place ctime in the
                         * TF_CREATE field. Try to handle this correctly for
                         * either case.
                         */
                        /* Rock ridge never appears on a High Sierra disk */
                        cnt = 0;
                        if ((rr->u.TF.flags & TF_CREATE) &&
                            stamps_len >= stamp_size) {
                                inode->i_ctime.tv_sec =
                                    iso_date(rr->u.TF.times[cnt++].time,
                                             0);
                                inode->i_ctime.tv_nsec = 0;
                                stamps_len -= stamp_size;
                        }
                        if ((rr->u.TF.flags & TF_MODIFY) &&
                            stamps_len >= stamp_size) {
                                inode->i_mtime.tv_sec =
                                    iso_date(rr->u.TF.times[cnt++].time,
                                             0);
                                inode->i_mtime.tv_nsec = 0;
                                stamps_len -= stamp_size;
                        }
                        if ((rr->u.TF.flags & TF_ACCESS) &&
                            stamps_len >= stamp_size) {
                                inode->i_atime.tv_sec =
                                    iso_date(rr->u.TF.times[cnt++].time,
                                             0);
                                inode->i_atime.tv_nsec = 0;
                                stamps_len -= stamp_size;
                        }
                        if ((rr->u.TF.flags & TF_ATTRIBUTES) &&
                            stamps_len >= stamp_size) {
                                inode->i_ctime.tv_sec =
                                    iso_date(rr->u.TF.times[cnt++].time,
                                             0);
                                inode->i_ctime.tv_nsec = 0;
                                stamps_len -= stamp_size;
                        }
                        break;
                }
                case SIG('S', 'L'):
                        {
                                /*
                                 * Only the length of the link target is
                                 * computed here (into inode->i_size); the
                                 * text itself is not copied.  symlink_len
                                 * carries the running total across several
                                 * SL records.
                                 */
                                int slen;
                                struct SL_component *slp;
                                struct SL_component *oldslp;
                                slen = rr->len - 5;
                                slp = &rr->u.SL.link;
                                inode->i_size = symlink_len;
                                while (slen > 1) {
                                        rootflag = 0;
                                        switch (slp->flags & ~1) {
                                        case 0:
                                                inode->i_size +=
                                                    slp->len;
                                                break;
                                        case 2:
                                                inode->i_size += 1;
                                                break;
                                        case 4:
                                                inode->i_size += 2;
                                                break;
                                        case 8:
                                                rootflag = 1;
                                                inode->i_size += 1;
                                                break;
                                        default:
                                                printk("Symlink component flag "
                                                        "not implemented\n");
                                        }
                                        slen -= slp->len + 2;
                                        oldslp = slp;
                                        slp = (struct SL_component *)
                                                (((char *)slp) + slp->len + 2);

                                        if (slen < 2) {
                                                if (((rr->u.SL.
                                                      flags & 1) != 0)
                                                    &&
                                                    ((oldslp->
                                                      flags & 1) == 0))
                                                        inode->i_size +=
                                                            1;
                                                break;
                                        }

                                        /*
                                         * If this component record isn't
                                         * continued, then append a '/'.
                                         */
                                        if (!rootflag
                                            && (oldslp->flags & 1) == 0)
                                                inode->i_size += 1;
                                }
                        }
                        symlink_len = inode->i_size;
                        break;
                case SIG('R', 'E'):
                        printk(KERN_WARNING "Attempt to read inode for "
                                        "relocated directory\n");
                        goto out;
                case SIG('C', 'L'):
                        if (flags & RR_RELOC_DE) {
                                printk(KERN_ERR
                                       "ISOFS: Recursive directory relocation "
                                       "is not supported\n");
                                goto eio;
                        }
                        reloc_block = isonum_733(rr->u.CL.location);
                        if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
                            ISOFS_I(inode)->i_iget5_offset == 0) {
                                printk(KERN_ERR
                                       "ISOFS: Directory relocation points to "
                                       "itself\n");
                                goto eio;
                        }
                        ISOFS_I(inode)->i_first_extent = reloc_block;
                        reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
                        if (IS_ERR(reloc)) {
                                ret = PTR_ERR(reloc);
                                goto out;
                        }
                        /* Take over the attributes of the relocated inode. */
                        inode->i_mode = reloc->i_mode;
                        set_nlink(inode, reloc->i_nlink);
                        inode->i_uid = reloc->i_uid;
                        inode->i_gid = reloc->i_gid;
                        inode->i_rdev = reloc->i_rdev;
                        inode->i_size = reloc->i_size;
                        inode->i_blocks = reloc->i_blocks;
                        inode->i_atime = reloc->i_atime;
                        inode->i_ctime = reloc->i_ctime;
                        inode->i_mtime = reloc->i_mtime;
                        iput(reloc);
                        break;
#ifdef CONFIG_ZISOFS
                case SIG('Z', 'F'): {
                        int algo;

                        if (ISOFS_SB(inode->i_sb)->s_nocompress)
                                break;
                        algo = isonum_721(rr->u.ZF.algorithm);
                        if (algo == SIG('p', 'z')) {
                                int block_shift =
                                        isonum_711(&rr->u.ZF.parms[1]);
                                if (block_shift > 17) {
                                        printk(KERN_WARNING "isofs: "
                                                "Can't handle ZF block "
                                                "size of 2^%d\n",
                                                block_shift);
                                } else {
                                        /*
                                         * Note: we don't change
                                         * i_blocks here
                                         */
                                        ISOFS_I(inode)->i_file_format =
                                                isofs_file_compressed;
                                        /*
                                         * Parameters to compression
                                         * algorithm (header size,
                                         * block size)
                                         */
                                        ISOFS_I(inode)->i_format_parm[0] =
                                                isonum_711(&rr->u.ZF.parms[0]);
                                        ISOFS_I(inode)->i_format_parm[1] =
                                                isonum_711(&rr->u.ZF.parms[1]);
                                        inode->i_size =
                                            isonum_733(rr->u.ZF.
                                                       real_size);
                                }
                        } else {
                                printk(KERN_WARNING
                                       "isofs: Unknown ZF compression "
                                                "algorithm: %c%c\n",
                                       rr->u.ZF.algorithm[0],
                                       rr->u.ZF.algorithm[1]);
                        }
                        break;
                }
#endif
                default:
                        break;
                }
        }
        /* Current area exhausted: follow a pending continuation, if any. */
        ret = rock_continue(&rs);
        if (ret == 0)
                goto repeat;
        if (ret == 1)
                ret = 0;
out:
        kfree(rs.buffer);
        return ret;
eio:
        ret = -EIO;
        goto out;
}

/*
 * Expand the components of one SL record @rr into text at @rpnt, never
 * writing at or beyond @plimit.
 *
 * A component whose flags (bit 0 masked off) are 0 contributes its text,
 * 2 contributes ".", 4 contributes ".." and 8 contributes "/".  A '/'
 * separator follows every component that is neither the root component
 * nor marked as continued (bit 0); after the record's last component the
 * separator is only added when the SL record itself has bit 0 set.
 *
 * Returns the new write position, or NULL if the output would not fit.
 */
static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
{
        struct SL_component *next = &rr->u.SL.link;
        int remaining = rr->len - 5;

        while (remaining > 1) {
                struct SL_component *cur = next;
                int is_root = 0;
                int is_last;

                switch (cur->flags & ~1) {
                case 0:
                        if (cur->len > plimit - rpnt)
                                return NULL;
                        memcpy(rpnt, cur->text, cur->len);
                        rpnt += cur->len;
                        break;
                case 2:
                        if (rpnt >= plimit)
                                return NULL;
                        *rpnt++ = '.';
                        break;
                case 4:
                        if (2 > plimit - rpnt)
                                return NULL;
                        *rpnt++ = '.';
                        *rpnt++ = '.';
                        break;
                case 8:
                        if (rpnt >= plimit)
                                return NULL;
                        is_root = 1;
                        *rpnt++ = '/';
                        break;
                default:
                        printk("Symlink component flag not implemented (%d)\n",
                               cur->flags);
                }

                /* Each component is two header bytes plus cur->len of text. */
                remaining -= cur->len + 2;
                next = (struct SL_component *)((char *)cur + cur->len + 2);
                is_last = remaining < 2;

                /* Root and continued components never get a separator. */
                if (is_root || (cur->flags & 1))
                        continue;

                /*
                 * After the last component a slash is only wanted when
                 * another SL record follows.
                 */
                if (is_last && !(rr->u.SL.flags & 1))
                        continue;

                if (rpnt >= plimit)
                        return NULL;
                *rpnt++ = '/';
        }
        return rpnt;
}

/*
 * Parse the Rock Ridge attributes of directory record @de into @inode.
 *
 * @relocated: non-zero selects RR_RELOC_DE for the internal parser.
 *
 * Returns whatever the last call to parse_rock_ridge_inode_internal()
 * returned.
 */
int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
                           int relocated)
{
        int flags = 0;
        int result;

        if (relocated)
                flags = RR_RELOC_DE;

        result = parse_rock_ridge_inode_internal(de, inode, flags);

        /*
         * A second pass that also looks behind eventual XA attributes is
         * only made when the first pass left s_rock_offset at -1 and
         * s_rock at 2 (i.e. the rockridge flag was reset and XA has not
         * been regarded yet).
         */
        if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1)
                return result;
        if (ISOFS_SB(inode->i_sb)->s_rock != 2)
                return result;

        return parse_rock_ridge_inode_internal(de, inode,
                                               flags | RR_REGARD_XA);
}

/*
 * readpage() for symlinks: reads symlink contents into the page and either
 * makes it uptodate and returns 0 or returns error (-EIO).
 *
 * @file: not used by this implementation.
 * @page: page that receives the NUL-terminated link text; it is unlocked
 *        on every exit path.
 *
 * The link text is assembled from the 'SL' Rock Ridge entries attached to
 * the inode's directory record, following 'CE' continuation entries via
 * rock_continue().  Any parse problem, an unreadable block or an empty
 * result marks the page with SetPageError() and yields -EIO.
 */
static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
{
        struct inode *inode = page->mapping->host;
        struct iso_inode_info *ei = ISOFS_I(inode);
        struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
        char *link = page_address(page);
        unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
        struct buffer_head *bh;
        char *rpnt = link;
        unsigned char *pnt;
        struct iso_directory_record *raw_de;
        unsigned long block, offset;
        int sig;
        struct rock_ridge *rr;
        struct rock_state rs;
        int ret;

        /* Without Rock Ridge there is no symlink information to read. */
        if (!sbi->s_rock)
                goto error;

        init_rock_state(&rs, inode);
        block = ei->i_iget5_block;
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                goto out_noread;

        offset = ei->i_iget5_offset;
        pnt = (unsigned char *)bh->b_data + offset;

        raw_de = (struct iso_directory_record *)pnt;

        /*
         * If we go past the end of the buffer, there is some sort of error.
         * (*pnt is the first byte of the directory record, presumably its
         * length -- confirm against struct iso_directory_record.)
         */
        if (offset + *pnt > bufsize)
                goto out_bad_span;

        /*
         * Now test for possible Rock Ridge extensions which will override
         * some of these numbers in the inode structure.
         */

        setup_rock_ridge(raw_de, inode, &rs);

repeat:
        while (rs.len > 2) { /* There may be one byte for padding somewhere */
                rr = (struct rock_ridge *)rs.chr;
                if (rr->len < 3)
                        goto out;        /* Something got screwed up here */
                sig = isonum_721(rs.chr);
                if (rock_check_overflow(&rs, sig))
                        goto out;
                /* Step past this entry before interpreting it. */
                rs.chr += rr->len;
                rs.len -= rr->len;
                if (rs.len < 0)
                        goto out;        /* corrupted isofs */

                switch (sig) {
                case SIG('R', 'R'):
                        /* An RR entry without the SL bit: give up. */
                        if ((rr->u.RR.flags[0] & RR_SL) == 0)
                                goto out;
                        break;
                case SIG('S', 'P'):
                        if (check_sp(rr, inode))
                                goto out;
                        break;
                case SIG('S', 'L'):
                        /*
                         * The limit is one byte short of the page end so the
                         * terminating NUL written below always fits.
                         */
                        rpnt = get_symlink_chunk(rpnt, rr,
                                                 link + (PAGE_SIZE - 1));
                        if (rpnt == NULL)
                                goto out;
                        break;
                case SIG('C', 'E'):
                        /* This tells us if there is a continuation record */
                        rs.cont_extent = isonum_733(rr->u.CE.extent);
                        rs.cont_offset = isonum_733(rr->u.CE.offset);
                        rs.cont_size = isonum_733(rr->u.CE.size);
                        break;
                default:
                        break;
                }
        }
        /*
         * rock_continue(): 0 means more entries were set up in rs, so parse
         * again; a negative value is an error; anything else ends parsing.
         */
        ret = rock_continue(&rs);
        if (ret == 0)
                goto repeat;
        if (ret < 0)
                goto fail;

        /* No SL entry contributed any text: treat as an error. */
        if (rpnt == link)
                goto fail;
        brelse(bh);
        *rpnt = '\0';
        SetPageUptodate(page);
        unlock_page(page);
        return 0;

        /* error exit while parsing entries: release rs.buffer first */
out:
        kfree(rs.buffer);
        goto fail;
out_noread:
        printk("unable to read i-node block\n");
        goto fail;
out_bad_span:
        printk("symlink spans iso9660 blocks\n");
fail:
        brelse(bh);
error:
        SetPageError(page);
        unlock_page(page);
        return -EIO;
}

/*
 * Address-space operations exported for symlinks: only ->readpage is
 * provided, implemented by rock_ridge_symlink_readpage().
 */
const struct address_space_operations isofs_symlink_aops = {
        .readpage = rock_ridge_symlink_readpage
};



































































































































































































































































































































    3 



















    1 


















    3 




















    1 







    3 
























    1 









































































































    4 


























    3 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/* True when dentry x is its own parent, i.e. x->d_parent points back at x. */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/*
 * The hash is always the low bits of hash_len.
 *
 * HASH_LEN_DECLARE orders the two u32 members so that, when overlaid on the
 * u64 hash_len of struct qstr, 'hash' lands in the low 32 bits on either
 * endianness.
 *
 * bytemask_from_count(cnt) builds an unsigned long mask covering cnt bytes:
 * the low-order bytes on little-endian, the high-order bytes otherwise.
 * NOTE(review): cnt must stay below sizeof(unsigned long), otherwise the
 * shift count equals the type width.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 *
 * The anonymous union lets the hash/len pair declared by HASH_LEN_DECLARE
 * also be accessed as the single 64-bit value hash_len.
 */
struct qstr {
        union {
                struct {
                        HASH_LEN_DECLARE;        /* u32 hash and u32 len */
                };
                u64 hash_len;        /* hash and len viewed as one value */
        };
        const unsigned char *name;        /* pointer to the string bytes */
};

/*
 * Static initializer for a struct qstr: sets .len to l and .name to n.
 * The hash member is not named and is therefore zero-initialized.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }

extern const struct qstr empty_name;
extern const struct qstr slash_name;

/*
 * Dentry cache counters and tunables; a single global instance is declared
 * below as dentry_stat.
 */
struct dentry_stat_t {
        long nr_dentry;                /* presumably total dentries -- confirm */
        long nr_unused;                /* presumably unused dentries -- confirm */
        long age_limit;                /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;        /* # of unused negative dentries */
        long dummy;                /* Reserved for future use */
};
extern struct dentry_stat_t dentry_stat;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_LEN is the size of dentry.d_iname (and of
 * name_snapshot.inline_name).  The byte counts in the trailing comments
 * are presumably the resulting sizeof(struct dentry) for each
 * configuration -- confirm.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 32 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_LEN 36 /* 128 bytes */
# else
#  define DNAME_INLINE_LEN 40 /* 128 bytes */
# endif
#endif

/*
 * Shorthand so that dentry->d_lock names the lock embedded in the
 * dentry's d_lockref member.
 */
#define d_lock        d_lockref.lock

/*
 * struct dentry - one directory cache entry.
 *
 * Members are grouped by which lookup path touches them (see the section
 * comments inside).  The lock is reached as ->d_lock (a macro for
 * d_lockref.lock) and the reference count as d_lockref.count.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;                /* name with its length and hash */
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */

        /* Ref lookup also touches following */
        struct lockref d_lockref;        /* per-dentry lock and refcount */
        const struct dentry_operations *d_op;        /* callback table */
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */

        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct list_head d_child;        /* child of parent list */
        struct list_head d_subdirs;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;
        } d_u;
} __randomize_layout;

/*
 * dentry->d_lock spinlock nesting subclasses:
 *
 * 0: normal
 * 1: nested
 *
 * The enumerators below take exactly these values (0 and 1) because they
 * are declared in order without explicit initializers.
 */
enum dentry_d_lock_class
{
        DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
        DENTRY_D_LOCK_NESTED
};

/*
 * Per-dentry callback table, reached through dentry->d_op.
 *
 * Within this header only ->d_real is invoked directly: d_real() calls it
 * when the dentry has DCACHE_OP_REAL set.  The structure is cacheline
 * aligned.
 */
struct dentry_operations {
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/*
 * d_flags entries
 *
 * Bit values stored in dentry->d_flags.  Bits 20-22 (DCACHE_ENTRY_TYPE)
 * form a multi-bit type field rather than independent flags.
 */
#define DCACHE_OP_HASH                        0x00000001
#define DCACHE_OP_COMPARE                0x00000002
#define DCACHE_OP_REVALIDATE                0x00000004
#define DCACHE_OP_DELETE                0x00000008
#define DCACHE_OP_PRUNE                        0x00000010

#define        DCACHE_DISCONNECTED                0x00000020
     /* This dentry is possibly not currently connected to the dcache tree, in
      * which case its parent will either be itself, or will have this flag as
      * well.  nfsd will not use a dentry with this bit set, but will first
      * endeavour to clear the bit either by discovering that it is connected,
      * or by performing lookup operations.   Any filesystem which supports
      * nfsd_operations MUST have a lookup function which, if it finds a
      * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
      * dentry into place and return that dentry rather than the passed one,
      * typically using d_splice_alias. */

#define DCACHE_REFERENCED                0x00000040 /* Recently used, don't discard. */

#define DCACHE_DONTCACHE                0x00000080 /* Purge from memory on final dput() */

/* Set by dont_mount(), tested by cant_mount(). */
#define DCACHE_CANT_MOUNT                0x00000100
#define DCACHE_GENOCIDE                        0x00000200
#define DCACHE_SHRINK_LIST                0x00000400

#define DCACHE_OP_WEAK_REVALIDATE        0x00000800

#define DCACHE_NFSFS_RENAMED                0x00001000
     /* this dentry has been "silly renamed" and has to be deleted on the last
      * dput() */
#define DCACHE_COOKIE                        0x00002000 /* For use by dcookie subsystem */
#define DCACHE_FSNOTIFY_PARENT_WATCHED        0x00004000
     /* Parent inode is watched by some fsnotify listener */

#define DCACHE_DENTRY_KILLED                0x00008000

/* DCACHE_MOUNTED is tested by d_mountpoint(); the union of the three
 * "managed" bits below is tested by d_managed(). */
#define DCACHE_MOUNTED                        0x00010000 /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT                0x00020000 /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT                0x00040000 /* manage transit from this dirent */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

#define DCACHE_LRU_LIST                        0x00080000

/* Mask of the entry-type field; __d_entry_type() extracts it and the
 * d_is_*() helpers compare it against the *_TYPE values below. */
#define DCACHE_ENTRY_TYPE                0x00700000
#define DCACHE_MISS_TYPE                0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
#define DCACHE_WHITEOUT_TYPE                0x00100000 /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE                0x00200000 /* Normal directory */
#define DCACHE_AUTODIR_TYPE                0x00300000 /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE                0x00400000 /* Regular file type (or fallthru to such) */
#define DCACHE_SPECIAL_TYPE                0x00500000 /* Other file type (or fallthru to such) */
#define DCACHE_SYMLINK_TYPE                0x00600000 /* Symlink (or fallthru to such) */

#define DCACHE_MAY_FREE                        0x00800000
#define DCACHE_FALLTHRU                        0x01000000 /* Fall through to lower layer */
#define DCACHE_NOKEY_NAME                0x02000000 /* Encrypted name encoded without key */
/* When set, d_real() dispatches to d_op->d_real(). */
#define DCACHE_OP_REAL                        0x04000000

/* DCACHE_PAR_LOOKUP is tested by d_in_lookup(). */
#define DCACHE_PAR_LOOKUP                0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR                0x20000000
#define DCACHE_NORCU                        0x40000000 /* No RCU delay for freeing */

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void shrink_dcache_for_umount(struct super_block *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

/* <clickety>-<click> the ramfs-type tree */
extern void d_genocide(struct dentry *);

extern void d_tmpfile(struct dentry *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);
 
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

/* appendix may either be NULL or be used for transname suffixes */
extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name, unsigned *seq);

static inline unsigned d_count(const struct dentry *dentry)
{
        return dentry->d_lockref.count;
}

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(4, 5)
char *dynamic_dname(struct dentry *, char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(struct dentry *, char *, int);
extern char *dentry_path(struct dentry *, char *, int);

/* Allocation counts.. */

/**
 *        dget, dget_dlock -        get a reference to a dentry
 *        @dentry: dentry to get a reference to (may be %NULL)
 *
 *        A %NULL pointer is passed straight through.  Otherwise the
 *        reference count is incremented and the same dentry is returned.
 *        A dentry will not be destroyed when it has references.
 *
 *        dget_dlock() bumps d_lockref.count with a plain increment.
 *        NOTE(review): this presumably requires the caller to hold
 *        dentry->d_lock -- confirm.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        if (!dentry)
                return dentry;

        dentry->d_lockref.count++;
        return dentry;
}

/*
 * Take a reference on @dentry through lockref_get() and return it;
 * a NULL pointer is returned unchanged.
 */
static inline struct dentry *dget(struct dentry *dentry)
{
        if (!dentry)
                return dentry;

        lockref_get(&dentry->d_lockref);
        return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 *        d_unhashed -        test whether a dentry is missing from the hash
 *        @dentry: entry to check
 *
 *        Returns the result of hlist_bl_unhashed() on the dentry's d_hash
 *        node, i.e. true if the dentry passed is not currently hashed.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        const struct hlist_bl_node *node = &dentry->d_hash;

        return hlist_bl_unhashed(node);
}

/* Non-zero when the dentry is unhashed and is not its own parent. */
static inline int d_unlinked(const struct dentry *dentry)
{
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/* Return the DCACHE_CANT_MOUNT bit of d_flags (non-zero when set). */
static inline int cant_mount(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_CANT_MOUNT;
}

static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_done(struct dentry *);

/* Return the DCACHE_PAR_LOOKUP bit of d_flags (non-zero when set). */
static inline int d_in_lookup(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_PAR_LOOKUP;
}

/*
 * If the dentry is still flagged as being in lookup, call
 * __d_lookup_done() on it under d_lock; otherwise do nothing.
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        if (!unlikely(d_in_lookup(dentry)))
                return;

        spin_lock(&dentry->d_lock);
        __d_lookup_done(dentry);
        spin_unlock(&dentry->d_lock);
}

extern void dput(struct dentry *);

/* True when any of the DCACHE_MANAGED_DENTRY bits is set in d_flags. */
static inline bool d_managed(const struct dentry *dentry)
{
        const unsigned int managed = dentry->d_flags & DCACHE_MANAGED_DENTRY;

        return managed != 0;
}

/* True when DCACHE_MOUNTED is set in d_flags. */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        const unsigned int mounted = dentry->d_flags & DCACHE_MOUNTED;

        return mounted != 0;
}

/*
 * Directory cache entry type accessor functions.
 *
 * __d_entry_type() returns d_flags masked with DCACHE_ENTRY_TYPE; the
 * result is compared against the DCACHE_*_TYPE constants.
 */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_ENTRY_TYPE;
}

/* True when the entry type field equals DCACHE_MISS_TYPE. */
static inline bool d_is_miss(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_MISS_TYPE;
}

/* True when the entry type field equals DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_WHITEOUT_TYPE;
}

/* True when the entry type field equals DCACHE_DIRECTORY_TYPE. */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_DIRECTORY_TYPE;
}

/* True when the entry type field equals DCACHE_AUTODIR_TYPE. */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_AUTODIR_TYPE;
}

/* True for either directory flavour: d_can_lookup() or d_is_autodir(). */
static inline bool d_is_dir(const struct dentry *dentry)
{
        bool is_dir = d_can_lookup(dentry);

        if (!is_dir)
                is_dir = d_is_autodir(dentry);
        return is_dir;
}

/* True when the entry type field equals DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SYMLINK_TYPE;
}

/* True when the entry type field equals DCACHE_REGULAR_TYPE. */
static inline bool d_is_reg(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_REGULAR_TYPE;
}

/* True when the entry type field equals DCACHE_SPECIAL_TYPE. */
static inline bool d_is_special(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SPECIAL_TYPE;
}

/* True for a regular or a special entry: d_is_reg() or d_is_special(). */
static inline bool d_is_file(const struct dentry *dentry)
{
        bool is_file = d_is_reg(dentry);

        if (!is_file)
                is_file = d_is_special(dentry);
        return is_file;
}

/* A dentry is reported negative exactly when d_is_miss() says so. */
static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        const bool miss = d_is_miss(dentry);

        return miss;
}

/*
 * Same test as d_is_miss(), applied to a raw d_flags value instead of a
 * dentry: true when the entry type bits equal DCACHE_MISS_TYPE.
 */
static inline bool d_flags_negative(unsigned flags)
{
        const unsigned type = flags & DCACHE_ENTRY_TYPE;

        return type == DCACHE_MISS_TYPE;
}

/* Logical inverse of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        const bool negative = d_is_negative(dentry);

        return !negative;
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true when ->d_inode is NULL, i.e. the dentry represents either an
 * absent name or a name that doesn't map to an inode.  Such a dentry could
 * be a true miss, a whiteout that isn't represented by a 0,0 chardev, or a
 * fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        const struct inode *inode = dentry->d_inode;

        return inode == NULL;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true when ->d_inode is not NULL, i.e. the dentry represents a name
 * that maps to an inode.  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        const struct inode *inode = dentry->d_inode;

        return inode != NULL;
}

/* Non-zero when the dentry has an inode attached and is still hashed. */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

extern void d_set_fallthru(struct dentry *dentry);

/* True when DCACHE_FALLTHRU is set in d_flags. */
static inline bool d_is_fallthru(const struct dentry *dentry)
{
        const unsigned int fallthru = dentry->d_flags & DCACHE_FALLTHRU;

        return fallthru != 0;
}


extern int sysctl_vfs_cache_pressure;

/*
 * Scale @val by sysctl_vfs_cache_pressure / 100, using mult_frac() for
 * the multiply-then-divide.
 */
static inline unsigned long vfs_pressure_ratio(unsigned long val)
{
        unsigned long scaled;

        scaled = mult_frac(val, sysctl_vfs_cache_pressure, 100);
        return scaled;
}

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * Returns ->d_inode with a plain load.  This is the helper normal
 * filesystems should use to get at their own inodes in their own dentries
 * and ignore the layering superimposed upon them.
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        return inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * Same as d_inode() except that ->d_inode is loaded through READ_ONCE().
 * This is the helper normal filesystems should use to get at their own
 * inodes in their own dentries and ignore the layering superimposed upon
 * them.
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        struct inode *inode = READ_ONCE(dentry->d_inode);

        return inode;
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get at the inode that will be used
 * if this dentry were to be opened as a file.  The inode may be on the upper
 * dentry or it may be on a lower dentry pinned by the upper.  As implemented
 * here it simply returns @upper's ->d_inode.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        return upper->d_inode;
}

/**
 * d_backing_dentry - Get upper or lower dentry we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get the dentry of the inode that
 * will be used if this dentry were opened as a file.  It may be the upper
 * dentry or it may be a lower dentry pinned by the upper.  As implemented
 * here it returns @upper unchanged.
 *
 * Normal filesystems should not use this to access their own dentries.
 */
static inline struct dentry *d_backing_dentry(struct dentry *upper)
{
        struct dentry *backing = upper;

        return backing;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @inode: inode to select the dentry from multiple layers (can be NULL)
 *
 * If dentry is on a union/overlay, then return the underlying, real dentry.
 * Otherwise return the dentry itself.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry,
                                    const struct inode *inode)
{
        if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry->d_op->d_real(dentry, inode);
        else
                return dentry;
}

/**
 * d_real_inode - Return the real inode
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().  Implemented as d_backing_inode() applied to
 * the result of d_real() with a NULL inode selector.
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* d_real() takes a non-const dentry, hence the cast. */
        struct dentry *real = d_real((struct dentry *) dentry, NULL);

        return d_backing_inode(real);
}

/*
 * A qstr together with an inline buffer of DNAME_INLINE_LEN bytes (the same
 * size as dentry.d_iname).  Presumably filled in by
 * take_dentry_name_snapshot() and torn down by
 * release_dentry_name_snapshot() -- confirm against their definitions.
 */
struct name_snapshot {
        struct qstr name;
        unsigned char inline_name[DNAME_INLINE_LEN];
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

#endif        /* __LINUX_DCACHE_H */






































































































































































































































































































































































































































































































































































    1 

    1 








    1 














































































































































































































































































































    1 

    1 

























    1 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * fsnotify inode mark locking/lifetime/and refcnting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * mark->connector->lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list.  It also protects the group's private
 * data (i.e. group limits).
 *
 * mark->lock protects the marks attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * mark->connector->lock protects the list of marks anchored inside an
 * inode / vfsmount and each mark is hooked via the i_list.
 *
 * A list of notification marks relating to inode / mnt is contained in
 * fsnotify_mark_connector. That structure is alive as long as there are any
 * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
 * detached from fsnotify_mark_connector when last reference to the mark is
 * dropped.  Thus having mark reference is enough to protect mark->connector
 * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
 * because we remove mark from g_list before dropping mark reference associated
 * with that, any mark found through g_list is guaranteed to have
 * mark->connector set until we drop group->mark_mutex.
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/ratelimit.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/* Delay handed to queue_delayed_work() when scheduling reaper_work */
#define FSNOTIFY_REAPER_DELAY        (1)        /* 1 jiffy */

/*
 * SRCU domain used by this file; both destroy work functions call
 * synchronize_srcu() on it before freeing anything.
 */
struct srcu_struct fsnotify_mark_srcu;
/* Slab cache that mark connectors are allocated from and freed to */
struct kmem_cache *fsnotify_mark_connector_cachep;

/* destroy_lock is held around every update of the two lists below */
static DEFINE_SPINLOCK(destroy_lock);
/* Marks queued for final destruction, linked through mark->g_list */
static LIST_HEAD(destroy_list);
/* Connectors queued for freeing, singly linked through ->destroy_next */
static struct fsnotify_mark_connector *connector_destroy_list;

/* Delayed work that reaps marks queued on destroy_list */
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);

/* Work that frees connectors queued on connector_destroy_list */
static void fsnotify_connector_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);

/*
 * Take an additional reference on @mark.  Warns once if the refcount reads
 * as zero at the time of the call.
 */
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
        unsigned int cur = refcount_read(&mark->refcnt);

        WARN_ON_ONCE(cur == 0);
        refcount_inc(&mark->refcnt);
}

/*
 * Return a pointer to the event mask stored in the object @conn is attached
 * to (inode, vfsmount or super block), or NULL for any other connector type.
 */
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_SB:
                return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
        default:
                return NULL;
        }
}

/*
 * Read the event mask of the object @conn is attached to.  Warns and returns
 * 0 when the connector type is not a valid object type.
 */
__u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
{
        __u32 *maskp;

        if (WARN_ON(!fsnotify_valid_obj_type(conn->type)))
                return 0;

        maskp = fsnotify_conn_mask_p(conn);
        return *maskp;
}

static void fsnotify_get_inode_ref(struct inode *inode)
{
        ihold(inode);
        atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
}

/*
 * Bring the connector's inode pin in line with @want_iref.
 *
 * Only inode connectors are affected; for any other type, or when the
 * HAS_IREF flag already matches @want_iref, nothing happens and NULL is
 * returned.
 *
 * When a reference is wanted, the inode is pinned here and HAS_IREF is set.
 * When the reference is no longer wanted, we only clear HAS_IREF and hand the
 * inode back to the caller: fsnotify_drop_object() will be responsible for
 * doing iput() outside of spinlocks. This happens when last mark that wanted
 * iref is detached.
 */
static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
                                          bool want_iref)
{
        bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
        struct inode *unpinned;

        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return NULL;
        if (has_iref == want_iref)
                return NULL;

        if (!want_iref) {
                /* Unpin inode after detach of last mark that wanted iref */
                unpinned = fsnotify_conn_inode(conn);
                conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
                return unpinned;
        }

        /* Pin inode if any mark wants inode refcount held */
        fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
        conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
        return NULL;
}

/*
 * Recompute the object's event mask as the union of the masks of all
 * attached marks on @conn's list and store it in the object.  Must be called
 * with conn->lock held (asserted).
 *
 * Returns whatever fsnotify_update_iref() returns: an inode whose pin the
 * caller has to drop, or NULL.  Also returns NULL without touching anything
 * if the connector type is not a valid object type.
 */
static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        struct fsnotify_mark *m;
        bool pin_inode = false;
        u32 mask = 0;

        assert_spin_locked(&conn->lock);
        /* We can get detached connector here when inode is getting unlinked. */
        if (!fsnotify_valid_obj_type(conn->type))
                return NULL;

        hlist_for_each_entry(m, &conn->list, obj_list) {
                /* Marks no longer attached do not contribute */
                if (m->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
                        mask |= fsnotify_calc_mask(m);
                        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
                            !(m->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
                                pin_inode = true;
                }
        }
        *fsnotify_conn_mask_p(conn) = mask;

        return fsnotify_update_iref(conn, pin_inode);
}

/*
 * True only for an inode connector whose inode reports, through
 * fsnotify_inode_watches_children(), that it watches its children.
 */
static bool fsnotify_conn_watches_children(
                                        struct fsnotify_mark_connector *conn)
{
        return conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
               fsnotify_inode_watches_children(fsnotify_conn_inode(conn));
}

/*
 * For an inode connector, call fsnotify_set_children_dentry_flags() on its
 * inode.  Other connector types are ignored.
 */
static void fsnotify_conn_set_children_dentry_flags(
                                        struct fsnotify_mark_connector *conn)
{
        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
                fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
}

/*
 * Recalculate the event mask for the list of marks hanging off @conn.  A NULL
 * @conn is accepted and ignored.
 *
 * The caller must make sure connector and connector->obj cannot disappear
 * under us.  Callers achieve this by holding a mark->lock or
 * mark->group->mark_mutex for a mark on this list.
 */
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        bool watched_before, watched_after;

        if (!conn)
                return;

        spin_lock(&conn->lock);
        watched_before = fsnotify_conn_watches_children(conn);
        __fsnotify_recalc_mask(conn);
        watched_after = fsnotify_conn_watches_children(conn);
        spin_unlock(&conn->lock);

        /*
         * Set children's PARENT_WATCHED flags only if parent started watching.
         * When parent stops watching, we clear false positive PARENT_WATCHED
         * flags lazily in __fsnotify_parent().
         */
        if (!watched_before && watched_after)
                fsnotify_conn_set_children_dentry_flags(conn);
}

/*
 * Work function: take over everything queued on connector_destroy_list, wait
 * for an SRCU grace period on fsnotify_mark_srcu, then free each connector
 * back to its slab cache.
 */
static void fsnotify_connector_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark_connector *pending, *next;

        /* Steal the whole chain so the lock is not held while waiting */
        spin_lock(&destroy_lock);
        pending = connector_destroy_list;
        connector_destroy_list = NULL;
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);
        for (; pending; pending = next) {
                /* Fetch the successor before the entry is freed */
                next = pending->destroy_next;
                kmem_cache_free(fsnotify_mark_connector_cachep, pending);
        }
}

/*
 * Undo fsnotify_get_inode_ref(): iput() the inode and drop the super block's
 * s_fsnotify_connectors count, waking waiters on that counter when it hits
 * zero.
 */
static void fsnotify_put_inode_ref(struct inode *inode)
{
        /* i_sb is sampled before iput(); the inode is not touched afterwards */
        struct super_block *sb = inode->i_sb;

        iput(inode);
        if (!atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
                return;
        wake_up_var(&sb->s_fsnotify_connectors);
}

/*
 * Bump s_fsnotify_connectors on the super block that fsnotify_connector_sb()
 * reports for @conn; does nothing when that lookup yields NULL.
 */
static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
{
        struct super_block *sb = fsnotify_connector_sb(conn);

        if (!sb)
                return;
        atomic_long_inc(&sb->s_fsnotify_connectors);
}

/*
 * Counterpart of fsnotify_get_sb_connectors(): drop the count on the super
 * block reported for @conn (if any) and wake waiters on the counter when it
 * reaches zero.
 */
static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
{
        struct super_block *sb = fsnotify_connector_sb(conn);

        if (!sb)
                return;
        if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
                wake_up_var(&sb->s_fsnotify_connectors);
}

/*
 * Sever @conn from the object it is attached to.
 *
 * @type receives the connector type as it was on entry.  An already detached
 * connector is left alone and NULL is returned.  Otherwise the object's
 * event mask is zeroed, the super block connector count is dropped, the
 * object's connector pointer is cleared and the connector is marked
 * FSNOTIFY_OBJ_TYPE_DETACHED.
 *
 * Returns the inode if the connector was holding an inode reference
 * (HAS_IREF set) so that the caller can drop it, NULL otherwise.
 */
static void *fsnotify_detach_connector_from_object(
                                        struct fsnotify_mark_connector *conn,
                                        unsigned int *type)
{
        struct inode *inode;
        struct inode *pinned = NULL;

        *type = conn->type;

        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_DETACHED:
                return NULL;
        case FSNOTIFY_OBJ_TYPE_INODE:
                inode = fsnotify_conn_inode(conn);
                inode->i_fsnotify_mask = 0;
                /* Only hand the inode back if the connector pinned it */
                if (conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)
                        pinned = inode;
                break;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
                break;
        default:
                break;
        }

        fsnotify_put_sb_connectors(conn);
        rcu_assign_pointer(*(conn->obj), NULL);
        conn->obj = NULL;
        conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;

        return pinned;
}

/*
 * Final teardown of @mark: let the owning group's ->free_mark() release it,
 * then drop the group reference.  Warns once and does nothing if the mark
 * has no group.
 */
static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
{
        /* Read the group first; @mark is not used after ->free_mark() */
        struct fsnotify_group *owner = mark->group;

        if (!WARN_ON_ONCE(!owner)) {
                owner->ops->free_mark(mark);
                fsnotify_put_group(owner);
        }
}

/*
 * Drop the object reference that a connector used to hold.  @objp may be
 * NULL, in which case there is nothing to drop.  Only inode references are
 * expected here; any other @type triggers a one-time warning and is ignored.
 */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
        if (!objp)
                return;

        /* Currently only inode references are passed to be dropped */
        if (!WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
                fsnotify_put_inode_ref(objp);
}

/*
 * Drop one reference to @mark.
 *
 * A mark without a connector is destroyed immediately through
 * fsnotify_final_mark_destroy() once its refcount reaches zero.  For a mark
 * with a connector, dropping the last reference unhooks the mark from the
 * connector's list under conn->lock; the connector is then either detached
 * from its object and queued for freeing (list became empty) or has its mask
 * recalculated.  Finally the mark is queued on destroy_list and reaper_work
 * is scheduled to destroy it.
 */
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
        void *objp = NULL;
        unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
        bool free_conn = false;

        /* Catch marks that were actually never attached to object */
        if (!conn) {
                if (refcount_dec_and_test(&mark->refcnt))
                        fsnotify_final_mark_destroy(mark);
                return;
        }

        /*
         * We have to be careful so that traversals of obj_list under lock can
         * safely grab mark reference.
         */
        if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
                return;

        /* Last reference is gone; conn->lock is held until the unlock below */
        hlist_del_init_rcu(&mark->obj_list);
        if (hlist_empty(&conn->list)) {
                /* That was the last mark: detach the connector and free it */
                objp = fsnotify_detach_connector_from_object(conn, &type);
                free_conn = true;
        } else {
                /* Other marks remain: refresh the object's mask */
                objp = __fsnotify_recalc_mask(conn);
                type = conn->type;
        }
        WRITE_ONCE(mark->connector, NULL);
        spin_unlock(&conn->lock);

        /* Release any returned object reference outside of conn->lock */
        fsnotify_drop_object(type, objp);

        if (free_conn) {
                /* Push the connector onto the list drained by the reaper work */
                spin_lock(&destroy_lock);
                conn->destroy_next = connector_destroy_list;
                connector_destroy_list = conn;
                spin_unlock(&destroy_lock);
                queue_work(system_unbound_wq, &connector_reaper_work);
        }
        /*
         * Note that we didn't update flags telling whether inode cares about
         * what's happening with children. We update these flags from
         * __fsnotify_parent() lazily when next event happens on one of our
         * children.
         */
        /* g_list is reused here to link the mark into destroy_list */
        spin_lock(&destroy_lock);
        list_add(&mark->g_list, &destroy_list);
        spin_unlock(&destroy_lock);
        queue_delayed_work(system_unbound_wq, &reaper_work,
                           FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);

/*
 * Try to take a reference on a mark that was found by a lockless walk of an
 * object list.  By now the mark may already be off the list and waiting to
 * be destroyed once the SRCU period ends, so this can fail.
 *
 * On success the group is pinned as well by bumping group->user_waits.
 *
 * Returns true for a NULL @mark or when the reference was obtained and the
 * mark is still attached; false otherwise (no reference is kept).
 */
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
        bool attached;

        if (!mark)
                return true;

        if (!refcount_inc_not_zero(&mark->refcnt))
                return false;

        spin_lock(&mark->lock);
        attached = mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED;
        /* mark is attached, group is still alive then */
        if (attached)
                atomic_inc(&mark->group->user_waits);
        spin_unlock(&mark->lock);

        /* Detached in the meantime: give back the reference just taken */
        if (!attached)
                fsnotify_put_mark(mark);

        return attached;
}

/*
 * Release a mark pinned by fsnotify_get_mark_safe() and, if this was the
 * last user wait on a group that is shutting down, wake up whoever sleeps on
 * the group's notification_waitq.  A NULL @mark is ignored.
 */
static void fsnotify_put_mark_wake(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group;

        if (!mark)
                return;

        /* Remember the group before the mark reference is dropped */
        group = mark->group;
        fsnotify_put_mark(mark);

        /*
         * We abuse notification_waitq on group shutdown for waiting for
         * all marks pinned when waiting for userspace.
         */
        if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
                wake_up(&group->notification_waitq);
}

/*
 * Pin every mark in @iter_info so that the SRCU read lock can be dropped.
 *
 * On success all marks in iter_info->marks[] are pinned, the SRCU read lock
 * identified by iter_info->srcu_idx is released and true is returned.
 *
 * If pinning any mark fails, the marks pinned so far are released again and
 * false is returned; srcu_read_unlock() is not called on that path.
 */
bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
        __releases(&fsnotify_mark_srcu)
{
        int type;

        fsnotify_foreach_iter_type(type) {
                /* This can fail if mark is being removed */
                if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
                        __release(&fsnotify_mark_srcu);
                        /* Unwind the marks pinned before the failing one */
                        while (--type >= 0)
                                fsnotify_put_mark_wake(iter_info->marks[type]);
                        return false;
                }
        }

        /*
         * Now that both marks are pinned by refcount in the inode / vfsmount
         * lists, we can drop SRCU lock, and safely resume the list iteration
         * once userspace returns.
         */
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);

        return true;
}

/*
 * Counterpart of fsnotify_prepare_user_wait(): take the SRCU read lock again,
 * storing the index in iter_info->srcu_idx, and release every mark in
 * iter_info->marks[].
 */
void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
        __acquires(&fsnotify_mark_srcu)
{
        int type;

        iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
        fsnotify_foreach_iter_type(type) {
                fsnotify_put_mark_wake(iter_info->marks[type]);
        }
}

/*
 * Mark mark as detached, remove it from group list. Mark still stays in object
 * list until its last reference is dropped. Note that we rely on mark being
 * removed from group list before corresponding reference to it is dropped. In
 * particular we rely on mark->connector being valid while we hold
 * group->mark_mutex if we found the mark through g_list.
 *
 * Must be called with group->mark_mutex held. The caller must either hold
 * reference to the mark or be protected by fsnotify_mark_srcu.
 */
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
        fsnotify_group_assert_locked(mark->group);
        WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
                     refcount_read(&mark->refcnt) < 1 +
                        !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* Drop mark reference acquired in fsnotify_add_mark_locked() */
        fsnotify_put_mark(mark);
}

/*
 * Free fsnotify mark. The mark is actually only marked as being freed.  The
 * freeing is actually happening only once last reference to the mark is
 * dropped from a workqueue which first waits for srcu period end.
 *
 * Caller must have a reference to the mark or be protected by
 * fsnotify_mark_srcu.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
        spin_unlock(&mark->lock);

        /*
         * Some groups like to know that marks are being freed.  This is a
         * callback to the group function to let it know that this mark
         * is being freed.
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(mark, group);
}

/*
 * Detach @mark under the group lock of @group, then mark it as freed with
 * fsnotify_free_mark() after the lock has been released.
 */
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                           struct fsnotify_group *group)
{
        fsnotify_group_lock(group);
        fsnotify_detach_mark(mark);
        fsnotify_group_unlock(group);
        /* Called without the group lock held */
        fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this marks in notification lists for
 * inodes and vfsmounts are sorted so that priorities of corresponding groups
 * are descending.
 *
 * Furthermore correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
        /* Same group (or both NULL): marks are handled together */
        if (a == b)
                return 0;
        /* A missing group always sorts after an existing one */
        if (!a)
                return 1;
        if (!b)
                return -1;

        /* Higher priority first */
        if (a->priority != b->priority)
                return a->priority < b->priority ? 1 : -1;

        /* Equal priority: the group address breaks the tie */
        return a < b ? 1 : -1;
}

/*
 * Allocate and initialize a connector of type @obj_type and try to install
 * it in *@connp.  If @fsid is given it is cached in the connector and
 * HAS_FSID is set; otherwise the cached fsid is zeroed.
 *
 * If another connector is already installed in *@connp, the new one is
 * discarded and the call still succeeds.
 *
 * Returns 0 on success, -ENOMEM if the connector cannot be allocated.
 */
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                                               unsigned int obj_type,
                                               __kernel_fsid_t *fsid)
{
        struct fsnotify_mark_connector *conn;

        conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
        if (!conn)
                return -ENOMEM;

        spin_lock_init(&conn->lock);
        INIT_HLIST_HEAD(&conn->list);
        conn->type = obj_type;
        conn->obj = connp;

        /* Cache fsid of filesystem containing the object */
        if (fsid) {
                conn->fsid = *fsid;
                conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
        } else {
                conn->fsid.val[0] = 0;
                conn->fsid.val[1] = 0;
                conn->flags = 0;
        }
        fsnotify_get_sb_connectors(conn);

        /*
         * cmpxchg() provides the barrier so that readers of *connp can see
         * only initialized structure
         */
        if (cmpxchg(connp, NULL, conn) != NULL) {
                /* Someone else created list structure for us */
                fsnotify_put_sb_connectors(conn);
                kmem_cache_free(fsnotify_mark_connector_cachep, conn);
        }

        return 0;
}

/*
 * Look up the connector installed in *@connp, check that it is still
 * attached, and return it with conn->lock held.  Returns NULL (with no lock
 * held) when there is no connector or it has already been detached.
 *
 * This is for users that get connector pointer from inode or mount. Users
 * that hold reference to a mark on the list may directly lock
 * connector->lock as they are sure list cannot go away under them.
 */
static struct fsnotify_mark_connector *fsnotify_grab_connector(
                                                fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        int idx;

        idx = srcu_read_lock(&fsnotify_mark_srcu);
        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (conn) {
                spin_lock(&conn->lock);
                if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) {
                        /* Lost the race with detach; report "no connector" */
                        spin_unlock(&conn->lock);
                        conn = NULL;
                }
        }
        srcu_read_unlock(&fsnotify_mark_srcu, idx);

        return conn;
}

/*
 * Add mark into proper place in given list of marks. These marks may be used
 * for the fsnotify backend to determine which event types should be delivered
 * to which group and for which inodes. These marks are ordered according to
 * priority, highest number first, and then by the group's location in memory.
 *
 * @mark:      mark to insert
 * @connp:     location of the object's connector pointer; a connector is
 *             created and attached if none is usable yet
 * @obj_type:  type of the object, must be a valid object type
 * @add_flags: not referenced in this function
 * @fsid:      optional fsid to cache in / check against the connector
 *
 * Returns 0 on success, -EINVAL for an invalid @obj_type, -ENODEV for an
 * all-zero @fsid, -EXDEV when @fsid differs from the fsid already cached in
 * the connector, -EEXIST when an attached mark of the same group is already
 * on the list and the group does not have FSNOTIFY_GROUP_DUPS set, or the
 * error from fsnotify_attach_connector_to_object().
 */
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
                                  fsnotify_connp_t *connp,
                                  unsigned int obj_type,
                                  int add_flags, __kernel_fsid_t *fsid)
{
        struct fsnotify_mark *lmark, *last = NULL;
        struct fsnotify_mark_connector *conn;
        int cmp;
        int err = 0;

        if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
                return -EINVAL;

        /* Backend is expected to check for zero fsid (e.g. tmpfs) */
        if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
                return -ENODEV;

restart:
        spin_lock(&mark->lock);
        /* On success conn->lock is held in addition to mark->lock */
        conn = fsnotify_grab_connector(connp);
        if (!conn) {
                /*
                 * No usable connector: drop mark->lock, attach a new
                 * connector and retry the lookup from the top.
                 */
                spin_unlock(&mark->lock);
                err = fsnotify_attach_connector_to_object(connp, obj_type,
                                                          fsid);
                if (err)
                        return err;
                goto restart;
        } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
                /* Connector has no fsid yet: record ours, then set the flag */
                conn->fsid = *fsid;
                /* Pairs with smp_rmb() in fanotify_get_fsid() */
                smp_wmb();
                conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
        } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
                   (fsid->val[0] != conn->fsid.val[0] ||
                    fsid->val[1] != conn->fsid.val[1])) {
                /*
                 * Backend is expected to check for non uniform fsid
                 * (e.g. btrfs), but maybe we missed something?
                 * Only allow setting conn->fsid once to non zero fsid.
                 * inotify and non-fid fanotify groups do not set nor test
                 * conn->fsid.
                 */
                pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
                                    "%x.%x != %x.%x\n", __func__, conn->type,
                                    fsid->val[0], fsid->val[1],
                                    conn->fsid.val[0], conn->fsid.val[1]);
                err = -EXDEV;
                goto out_err;
        }

        /* is mark the first mark? */
        if (hlist_empty(&conn->list)) {
                hlist_add_head_rcu(&mark->obj_list, &conn->list);
                goto added;
        }

        /* should mark be in the middle of the current list? */
        hlist_for_each_entry(lmark, &conn->list, obj_list) {
                last = lmark;

                /* Refuse a second attached mark of the same group unless allowed */
                if ((lmark->group == mark->group) &&
                    (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
                    !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
                        err = -EEXIST;
                        goto out_err;
                }

                /* Insert in front of the first entry that does not sort before us */
                cmp = fsnotify_compare_groups(lmark->group, mark->group);
                if (cmp >= 0) {
                        hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
                        goto added;
                }
        }

        /* The list was not empty, so the loop must have set last */
        BUG_ON(last == NULL);
        /* mark should be the last entry.  last is the current last entry */
        hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
        /*
         * Since connector is attached to object using cmpxchg() we are
         * guaranteed that connector initialization is fully visible by anyone
         * seeing mark->connector set.
         */
        WRITE_ONCE(mark->connector, conn);
out_err:
        /* Release in reverse order of acquisition */
        spin_unlock(&conn->lock);
        spin_unlock(&mark->lock);
        return err;
}

/*
 * Attach an initialized mark to its group and to the fs object behind
 * @connp.  These marks may be used for the fsnotify backend to determine
 * which event types should be delivered to which group.
 *
 * The group lock must be held by the caller (asserted).
 *
 * Returns 0 on success.  On failure the error from fsnotify_add_mark_list()
 * is returned after the group-list linkage and the reference taken for it
 * have been undone.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                             fsnotify_connp_t *connp, unsigned int obj_type,
                             int add_flags, __kernel_fsid_t *fsid)
{
        struct fsnotify_group *group = mark->group;
        int err;

        fsnotify_group_assert_locked(group);

        /*
         * Lock ordering: group->mark_mutex, then mark->lock, then
         * mark->connector->lock.
         */
        spin_lock(&mark->lock);
        mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
        list_add(&mark->g_list, &group->marks_list);
        fsnotify_get_mark(mark);        /* reference owned by g_list */
        spin_unlock(&mark->lock);

        err = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
        if (!err) {
                fsnotify_recalc_mask(mark->connector);
                return 0;
        }

        /* Roll back: clear state flags and unlink from the group list */
        spin_lock(&mark->lock);
        mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
                         FSNOTIFY_MARK_FLAG_ATTACHED);
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* ... and drop the reference that was taken for g_list */
        fsnotify_put_mark(mark);
        return err;
}

/*
 * Wrapper around fsnotify_add_mark_locked() that takes the lock of the
 * mark's group for the duration of the call.  All arguments are passed
 * through unchanged and its return value is returned.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
                      unsigned int obj_type, int add_flags,
                      __kernel_fsid_t *fsid)
{
        int ret;
        /* Read the group once so the same lock is released afterwards */
        struct fsnotify_group *group = mark->group;

        fsnotify_group_lock(group);
        ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
        fsnotify_group_unlock(group);
        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);

/*
 * Search the list of marks of the object behind @connp for an attached mark
 * that belongs to @group.  Returns that mark with an extra reference taken,
 * or NULL if the object has no live connector or no such mark exists.
 */
struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
                                         struct fsnotify_group *group)
{
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *found = NULL;

        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return NULL;

        /* conn->lock is held while the list is walked */
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                if (mark->group != group ||
                    !(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                        continue;

                fsnotify_get_mark(mark);
                found = mark;
                break;
        }
        spin_unlock(&conn->lock);

        return found;
}
EXPORT_SYMBOL_GPL(fsnotify_find_mark);

/*
 * Clear the marks of @group whose connector type equals @obj_type.  Passing
 * FSNOTIFY_OBJ_TYPE_ANY clears every mark on the group's list.
 *
 * Each selected mark is detached under the group lock and then freed with
 * the lock dropped.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                   unsigned int obj_type)
{
        struct fsnotify_mark *lmark, *mark;
        LIST_HEAD(to_free);
        /* List that the clearing loop below drains */
        struct list_head *head = &to_free;

        /* Skip selection step if we want to clear all marks. */
        if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
                head = &group->marks_list;
                goto clear;
        }
        /*
         * We have to be really careful here. Anytime we drop mark_mutex, e.g.
         * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
         * to_free list so we have to use mark_mutex even when accessing that
         * list. And freeing mark requires us to drop mark_mutex. So we can
         * reliably free only the first mark in the list. That's why we first
         * move marks to free to to_free list in one go and then free marks in
         * to_free list one by one.
         */
        fsnotify_group_lock(group);
        list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
                if (mark->connector->type == obj_type)
                        list_move(&mark->g_list, &to_free);
        }
        fsnotify_group_unlock(group);

clear:
        while (1) {
                /* The list is only ever inspected with the group lock held */
                fsnotify_group_lock(group);
                if (list_empty(head)) {
                        fsnotify_group_unlock(group);
                        break;
                }
                mark = list_first_entry(head, struct fsnotify_mark, g_list);
                /* Hold our own reference across detach and free */
                fsnotify_get_mark(mark);
                fsnotify_detach_mark(mark);
                fsnotify_group_unlock(group);
                fsnotify_free_mark(mark);
                fsnotify_put_mark(mark);
        }
}

/*
 * Destroy all marks attached to an object via connector.
 *
 * Does nothing if the object behind @connp has no live connector.  Otherwise
 * every mark on the connector's list is destroyed, the connector is detached
 * from the object, and any object reference the connector held is dropped.
 */
void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *old_mark = NULL;
        void *objp;
        /* Filled in by fsnotify_detach_connector_from_object() below */
        unsigned int type;

        /* Returns with conn->lock held when a connector is found */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return;
        /*
         * We have to be careful since we can race with e.g.
         * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
         * list can get modified. However we are holding mark reference and
         * thus our mark cannot be removed from obj_list so we can continue
         * iteration after regaining conn->lock.
         */
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                fsnotify_get_mark(mark);
                spin_unlock(&conn->lock);
                /* Previous mark is released only once the current one is pinned */
                if (old_mark)
                        fsnotify_put_mark(old_mark);
                old_mark = mark;
                fsnotify_destroy_mark(mark, mark->group);
                spin_lock(&conn->lock);
        }
        /*
         * Detach list from object now so that we don't pin inode until all
         * mark references get dropped. It would lead to strange results such
         * as delaying inode deletion or blocking unmount.
         */
        objp = fsnotify_detach_connector_from_object(conn, &type);
        spin_unlock(&conn->lock);
        /* Drop the reference still held on the last mark visited */
        if (old_mark)
                fsnotify_put_mark(old_mark);
        fsnotify_drop_object(type, objp);
}

/*
 * Initialize @mark for use with @group: zero the structure, set up its lock,
 * start with a refcount of 1, take a reference on @group and leave the mark
 * without a connector.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
                        struct fsnotify_group *group)
{
        memset(mark, 0, sizeof(*mark));

        /* The mark holds its own reference on the group */
        mark->group = group;
        fsnotify_get_group(group);

        refcount_set(&mark->refcnt, 1);
        spin_lock_init(&mark->lock);
        WRITE_ONCE(mark->connector, NULL);
}
EXPORT_SYMBOL_GPL(fsnotify_init_mark);

/*
 * Destroy all marks in destroy_list, waits for SRCU period to finish before
 * actually freeing marks.
 */
static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark *mark, *next;
        struct list_head private_destroy_list;

        spin_lock(&destroy_lock);
        /* exchange the list head */
        list_replace_init(&destroy_list, &private_destroy_list);
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);

        list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
                list_del_init(&mark->g_list);
                fsnotify_final_mark_destroy(mark);
        }
}

/*
 * Wait for all marks queued for destruction to be actually destroyed.
 *
 * Done by flushing reaper_work with flush_delayed_work().
 */
void fsnotify_wait_marks_destroyed(void)
{
        flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
































































































































































































































































































































    4 



    1 



























    1 

























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>

#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * Signal-related audit information.
 * NOTE(review): going by the field names this looks like the uid, pid and
 * security context of a signal sender - confirm against the users of this
 * struct.  @ctx is a flexible array member; its length is not stored here.
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        char                ctx[];
};

struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;

/*
 * Kernel-side representation of one audit rule.
 * NOTE(review): the roles of the fields that carry no comment below are
 * inferred from their names only - confirm against the rule-handling code
 * before relying on them.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/* Flag to indicate legacy AUDIT_LOGINUID unset usage */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * One field of an audit rule: a @type, an operator code @op, and a value
 * stored in one member of the anonymous union (plain u32, kuid_t, kgid_t, or
 * an LSM string/rule pair).
 * NOTE(review): which union member is valid for which @type is not visible
 * here - confirm against the rule parsing code.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Indices into audit_ntp_data.vals[], one per NTP value that can be recorded.
 * AUDIT_NTP_NVALS must stay last: it is used as the size of that array.
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

#ifdef CONFIG_AUDITSYSCALL
/* Old and new value of a single NTP parameter. */
struct audit_ntp_val {
        long long oldval, newval;
};

/* One old/new pair for every enum audit_ntp_type index. */
struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
/* Empty placeholder so that code naming the type still compiles. */
struct audit_ntp_data {};
#endif

/*
 * Netfilter configuration operations, passed as the @op argument of
 * audit_log_nfcfg().
 * NOTE(review): the AUDIT_XT_ and AUDIT_NFT_ prefixes presumably separate
 * x_tables from nf_tables operations - confirm against the callers.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_INVALID,
};

extern int is_audit_feature_set(int which);

extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

extern int audit_classify_compat_syscall(int abi, unsigned syscall);

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/* maximized args number that audit_socketcall can process */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

struct filename;

#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Return the login uid stored in @tsk->loginuid. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return tsk->loginuid;
}

/* Return the session id stored in @tsk->sessionid. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return tsk->sessionid;
}

extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

#else /* CONFIG_AUDIT */
/*
 * CONFIG_AUDIT is off: the logging API collapses to empty inline stubs so
 * that callers need no #ifdef.  audit_log_start() yields NULL and
 * audit_log_task_context() yields 0; everything else does nothing.
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{ }
static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
                                                   gfp_t gfp_mask, int type)
{
        return NULL;
}
static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{ }
static inline void audit_log_end(struct audit_buffer *ab)
{ }
static inline void audit_log_n_hex(struct audit_buffer *ab,
                                   const unsigned char *buf, size_t len)
{ }
static inline void audit_log_n_string(struct audit_buffer *ab,
                                      const char *buf, size_t n)
{ }
static inline void  audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                const char *string, size_t n)
{ }
static inline void audit_log_untrustedstring(struct audit_buffer *ab,
                                             const char *string)
{ }
static inline void audit_log_d_path(struct audit_buffer *ab,
                                    const char *prefix,
                                    const struct path *path)
{ }
static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}
static inline void audit_log_task_info(struct audit_buffer *ab)
{ }

/* Stub for !CONFIG_AUDIT: no login uid is tracked, always INVALID_UID. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return INVALID_UID;
}

/* Stub for !CONFIG_AUDIT: no session id is tracked, always AUDIT_SID_UNSET. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return AUDIT_SID_UNSET;
}

#define audit_enabled AUDIT_OFF

/* Stub for !CONFIG_AUDIT: ignores both arguments and returns 0. */
static inline int audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

#endif /* CONFIG_AUDIT */

#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_getcwd(void);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/* Store @ctx as the audit context of @task. */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
        task->audit_context = ctx;
}

/*
 * Return the audit context of the current task.  Callers in this header test
 * the result for NULL before using it.
 */
static inline struct audit_context *audit_context(void)
{
        return current->audit_context;
}

/*
 * Tell whether the current task's audit context counts as a dummy.
 *
 * Returns true when there is no context at all, or when the int stored at the
 * very start of the context is non-zero.
 * NOTE(review): the context is read through an int pointer, so this
 * presumably relies on the first member of struct audit_context being an int
 * flag - confirm against the struct definition.
 */
static inline bool audit_dummy_context(void)
{
        void *ctx = audit_context();

        if (!ctx)
                return true;
        return *(int *)ctx != 0;
}
/* Call __audit_free() for @task, but only when it has an audit context. */
static inline void audit_free(struct task_struct *task)
{
        if (unlikely(task->audit_context))
                __audit_free(task);
}
/*
 * Forward @major and @a0..@a3 to __audit_syscall_entry() when the current
 * task has an audit context; otherwise do nothing.
 */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        if (unlikely(audit_context()))
                __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * Report the outcome of the syscall described by @pt_regs.
 *
 * When the current task has an audit context, the success flag and return
 * value are read from @pt_regs with is_syscall_success() and
 * regs_return_value() and passed to __audit_syscall_exit().  Without a
 * context nothing is read and nothing is reported.
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        if (unlikely(audit_context())) {
                const int ok = is_syscall_success(pt_regs);
                const long rv = regs_return_value(pt_regs);

                __audit_syscall_exit(ok, rv);
        }
}
/*
 * Ask __audit_reusename() for a filename matching the user pointer @name.
 *
 * Returns NULL when the current context is a dummy; otherwise returns
 * whatever __audit_reusename() returns.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        struct filename *reused = NULL;

        if (unlikely(!audit_dummy_context()))
                reused = __audit_reusename(name);
        return reused;
}
/* Hand @name to __audit_getname() unless the current context is a dummy. */
static inline void audit_getname(struct filename *name)
{
        if (unlikely(!audit_dummy_context()))
                __audit_getname(name);
}
/*
 * Call __audit_getcwd() whenever the current task has an audit context.
 * Note the gate is audit_context(), not audit_dummy_context().
 */
static inline void audit_getcwd(void)
{
        if (unlikely(audit_context()))
                __audit_getcwd();
}
/*
 * Pass @name, @dentry and @aflags to __audit_inode() unless the current
 * context is a dummy.  @aflags is forwarded untouched; the AUDIT_INODE_*
 * values are the flags defined for it in this header.
 */
static inline void audit_inode(struct filename *name,
                               const struct dentry *dentry,
                               unsigned int aflags)
{
        if (unlikely(!audit_dummy_context())) {
                __audit_inode(name, dentry, aflags);
        }
}
/* Pass @file to __audit_file() unless the current context is a dummy. */
static inline void audit_file(struct file *file)
{
        if (unlikely(!audit_dummy_context()))
                __audit_file(file);
}
/*
 * Same as audit_inode(), with the flags fixed to
 * AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN: @dentry represents the parent and
 * the resulting audit record should be hidden.
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                             const struct dentry *dentry)
{
        const unsigned int hidden_parent = AUDIT_INODE_PARENT |
                                           AUDIT_INODE_HIDDEN;

        if (unlikely(!audit_dummy_context()))
                __audit_inode(name, dentry, hidden_parent);
}
/*
 * Pass @parent, @dentry and @type to __audit_inode_child() unless the current
 * context is a dummy.
 * NOTE(review): @type presumably takes one of the AUDIT_TYPE_* values defined
 * in this header - confirm against the callers.
 */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{
        if (unlikely(!audit_dummy_context())) {
                __audit_inode_child(parent, dentry, type);
        }
}
void audit_core_dumps(long signr);

/* Pass task @t to __audit_ptrace() unless the current context is a dummy. */
static inline void audit_ptrace(struct task_struct *t)
{
        if (unlikely(!audit_dummy_context()))
                __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_log_kern_module(char *name);
extern void __audit_fanotify(unsigned int response);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

/*
 * The four wrappers below share one shape: when the current context is not a
 * dummy, the arguments are forwarded unchanged to the matching __audit_*()
 * function; otherwise nothing happens.
 */

/* Forward @ipcp to __audit_ipc_obj(). */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_obj(ipcp);
}
/* Forward the descriptor pair @fd1/@fd2 to __audit_fd_pair(). */
static inline void audit_fd_pair(int fd1, int fd2)
{
        if (unlikely(!audit_dummy_context()))
                __audit_fd_pair(fd1, fd2);
}
/* Forward @qbytes, @uid, @gid and @mode to __audit_ipc_set_perm(). */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
/* Forward @bprm to __audit_bprm(). */
static inline void audit_bprm(struct linux_binprm *bprm)
{
        if (unlikely(!audit_dummy_context()))
                __audit_bprm(bprm);
}
/*
 * Forward @nargs and @args to __audit_socketcall().
 *
 * Returns 0 when the current context is a dummy; otherwise returns whatever
 * __audit_socketcall() returns.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_socketcall(nargs, args);
        return rc;
}

/*
 * audit_socketcall_compat - socketcall auditing for callers with u32 arguments
 * @nargs: number of entries in @args
 * @args:  32-bit argument words
 *
 * Widens each u32 argument to unsigned long in a local array and forwards the
 * result to __audit_socketcall().
 *
 * Returns 0 without touching @args when the current context is a dummy.
 * Returns -EINVAL when @nargs exceeds AUDITSC_ARGS, because the local array
 * has room for exactly that many entries and copying more would overrun the
 * stack.  Otherwise returns whatever __audit_socketcall() returns.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        /* Never write past the end of a[]; reject oversized requests. */
        if (nargs > AUDITSC_ARGS)
                return -EINVAL;

        for (i = 0; i < nargs; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Forward @len and @addr to __audit_sockaddr().
 *
 * Returns 0 when the current context is a dummy; otherwise returns whatever
 * __audit_sockaddr() returns.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_sockaddr(len, addr);
        return rc;
}
/*
 * Message-queue wrappers.  Each forwards its arguments unchanged to the
 * matching __audit_mq_*() function when the current context is not a dummy,
 * and does nothing otherwise.
 */

/* Forward @oflag, @mode and @attr to __audit_mq_open(). */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mq_open(oflag, mode, attr);
}
/* Forward @mqdes, @msg_len, @msg_prio and @abs_timeout to __audit_mq_sendrecv(). */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/* Forward @mqdes and @notification to __audit_mq_notify(). */
static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mq_notify(mqdes, notification);
}
/* Forward @mqdes and @mqstat to __audit_mq_getsetattr(). */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Forward @bprm and the @new/@old credentials to __audit_log_bprm_fcaps().
 *
 * Returns 0 when the current context is a dummy; otherwise returns whatever
 * __audit_log_bprm_fcaps() returns.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_log_bprm_fcaps(bprm, new, old);
        return rc;
}

/* Forward @new and @old to __audit_log_capset() unless the context is a dummy. */
static inline void audit_log_capset(const struct cred *new,
                                   const struct cred *old)
{
        if (unlikely(!audit_dummy_context()))
                __audit_log_capset(new, old);
}

/* Forward @fd and @flags to __audit_mmap_fd() unless the context is a dummy. */
static inline void audit_mmap_fd(int fd, int flags)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mmap_fd(fd, flags);
}

/* Forward @name to __audit_log_kern_module() unless the context is a dummy. */
static inline void audit_log_kern_module(char *name)
{
        if (!audit_dummy_context())
                __audit_log_kern_module(name);
}

/* Forward @response to __audit_fanotify() unless the context is a dummy. */
static inline void audit_fanotify(unsigned int response)
{
        if (!audit_dummy_context())
                __audit_fanotify(response);
}

/*
 * Report an injected time offset through __audit_tk_injoffset().
 *
 * Nothing is reported when @offset is all zero (both tv_sec and tv_nsec) or
 * when the current context is a dummy.  The zero test comes first, so a
 * zero offset returns without consulting the audit context at all.
 */
static inline void audit_tk_injoffset(struct timespec64 offset)
{
        const bool is_noop = !offset.tv_sec && !offset.tv_nsec;

        /* A zero offset changes nothing, so there is nothing to record. */
        if (is_noop || audit_dummy_context())
                return;

        __audit_tk_injoffset(offset);
}

/* Zero the whole structure @ad points to, clearing every old/new pair. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(*ad));
}

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Pass @ad to __audit_ntp_log() unless the current context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (!audit_dummy_context())
                __audit_ntp_log(ad);
}

/*
 * Forward a netfilter configuration event to __audit_log_nfcfg().
 *
 * Unlike most wrappers in this header, this one is gated on audit_enabled
 * rather than on audit_dummy_context().
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (audit_enabled)
                __audit_log_nfcfg(name, af, nentries, op, gfp);
}

extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
/*
 * CONFIG_AUDITSYSCALL is off: the syscall-auditing API collapses to inline
 * stubs so that callers need no #ifdef.  The stubs ignore their arguments;
 * those with a return value yield 0, NULL, or (for audit_dummy_context())
 * true.
 */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}
static inline void audit_free(struct task_struct *task)
{ }
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{ }
static inline void audit_syscall_exit(void *pt_regs)
{ }
/* With no syscall auditing every context counts as a dummy. */
static inline bool audit_dummy_context(void)
{
        return true;
}
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{ }
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}
static inline void audit_getname(struct filename *name)
{ }
static inline void audit_getcwd(void)
{ }
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{ }
static inline void audit_file(struct file *file)
{
}
static inline void audit_inode_parent_hidden(struct filename *name,
                                const struct dentry *dentry)
{ }
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{ }
static inline void audit_core_dumps(long signr)
{ }
static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{ }
static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
{ }
/*
 * !CONFIG_AUDITSYSCALL stubs for the IPC, exec, socket, message-queue,
 * capability and mmap hooks.  All ignore their arguments; the int-returning
 * ones always return 0.
 */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
{ }
static inline void audit_bprm(struct linux_binprm *bprm)
{ }
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

static inline void audit_fd_pair(int fd1, int fd2)
{ }
static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{ }
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{ }
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{ }
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{ }
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return 0;
}
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{ }
static inline void audit_mmap_fd(int fd, int flags)
{ }

/*
 * !CONFIG_AUDITSYSCALL stubs for the module, fanotify, timekeeping, NTP,
 * ptrace and netfilter hooks.  All of them ignore their arguments and do
 * nothing.
 */
static inline void audit_log_kern_module(char *name)
{
}

static inline void audit_fanotify(unsigned int response)
{ }

static inline void audit_tk_injoffset(struct timespec64 offset)
{ }

static inline void audit_ntp_init(struct audit_ntp_data *ad)
{ }

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{ }

static inline void audit_ptrace(struct task_struct *t)
{ }

static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{ }

#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif


































































    1 













    1 



    1 
























































    1 


    1 











    1 















    1 






    1 


    1 

    1 









    1 
    1 

























    1 






    1 




    1 






















    1 

































































































    1 




























    1 





    1 
    1 


    1 



    1 




    1 






































    1 



    1 



    1 




    1 



    1 



    1 





























    1 













    1 





    1 


    1 



    1 

















    1 







    1 





    1 


    1 






    1 






    1 

































































































































































    1 

    1 

    1 
    1 































    1 





















































    1 


    1 
    1 



    1 





    1 






























    1 













    1 




    1 




    1 






    1 

    1 






    1 



    1 

    1 



    1 
    1 





    1 
















    1 











    1 
    1 
    1 




































































































































































































































































































































































    1 









    1 
    1 


    1 

    1 




    1 







    1 










    1 



    1 







    1 



    1 
































    1 










    1 




















    1 








































    1 


    1 






    1 

    1 





















    1 






    1 























    1 



    1 


    1 






    1 

    1 




    1 













    1 






    1 






    1 
    1 


















































    1 
    1 

    1 
    1 


    1 



    1 



    1 


    1 













    1 
    1 





































    1 





    1 

    1 


    1 













































    1 



    1 


    1 







    1 























    1 
    1 




















    1 














    1 
    1 
    1 

















































    1 
    1 


    1 


























    1 






    1 































    1 























    1 








    1 





    1 
    1 





    1 








    1 




    1 










    1 





























    1 



    1 

























    1 

    1 



    1 
    1 



    1 

    1 



    1 






    1 

    1 









    1 











    1 






















    1 



    1 



    1 



    1 
    1 







































































































































































































    1 


    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




































    1 



    1 

























    1 













    1 






    1 

















    1 













    1 








































    1 


    1 









    1 














    1 
    1 


    1 







    1 












    1 


    1 




    1 
    1 



    1 


    1 
    1 


















    1 

    1 

    1 

    1 


    1 
    1 









    1 






    1 

























    1 
    1 





















    1 
















    1 






    1 

    1 

    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * Flags used by extent splitting.  They are OR-ed together and handed to
 * ext4_split_extent_at() as its split_flag argument.
 */
#define EXT4_EXT_MAY_ZEROOUT        0x1  /* safe to zeroout if split fails \
                                        due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1        0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2        0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1        0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2        0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
                                     struct ext4_extent_header *eh)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
                           EXT4_EXTENT_TAIL_OFFSET(eh));
        return cpu_to_le32(csum);
}

/*
 * Check the checksum stored in the tail of an extent tree block.
 *
 * Returns 1 if the stored checksum matches the computed one, or if the
 * filesystem has no metadata checksums; returns 0 on a mismatch.
 */
static int ext4_extent_block_csum_verify(struct inode *inode,
                                         struct ext4_extent_header *eh)
{
        if (ext4_has_metadata_csum(inode->i_sb)) {
                struct ext4_extent_tail *tail = find_ext4_extent_tail(eh);

                if (tail->et_checksum != ext4_extent_block_csum(inode, eh))
                        return 0;
        }
        return 1;
}

/*
 * Recompute the checksum of an extent tree block and store it in the
 * block's tail.  Does nothing when the filesystem has no metadata
 * checksums.
 */
static void ext4_extent_block_csum_set(struct inode *inode,
                                       struct ext4_extent_header *eh)
{
        if (ext4_has_metadata_csum(inode->i_sb)) {
                struct ext4_extent_tail *tail = find_ext4_extent_tail(eh);

                tail->et_checksum = ext4_extent_block_csum(inode, eh);
        }
}

/*
 * Forward declaration: ext4_force_split_extent_at() calls this function
 * ahead of its definition in this file.
 */
static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
                             struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags);

static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
        /*
         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
         * moment, get_block can be called only for blocks inside i_size since
         * page cache has been already dropped and writes are blocked by
         * i_mutex. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
}

/*
 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 * transaction with 'restart_cred' credits. The function drops i_data_sem
 * when restarting transaction and gets it after transaction is restarted.
 *
 * 'revoke_cred' is passed through unchanged to
 * ext4_journal_ensure_credits_fn().
 *
 * The function returns 0 on success, 1 if transaction had to be restarted,
 * and < 0 in case of fatal error.
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                int check_cred, int restart_cred,
                                int revoke_cred)
{
        int ret;
        int dropped = 0;        /* set to 1 by ext4_ext_trunc_restart_fn() */

        /*
         * NOTE(review): ext4_journal_ensure_credits_fn() is presumably a
         * macro that evaluates the restart expression only when a restart
         * is really needed -- confirm against its definition.
         */
        ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
                revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
        /* the restart helper released i_data_sem; take it again */
        if (dropped)
                down_write(&EXT4_I(inode)->i_data_sem);
        return ret;
}

/*
 * Get journal write access to the tree node that 'path' points to.
 *
 * A node living in the inode body (path->p_bh is NULL) uses in-core data
 * that needs no protection, so 0 is returned without doing anything.
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
{
        int err;

        /* path points to leaf/index in inode body */
        if (!path->p_bh)
                return 0;

        /* path points to block */
        BUFFER_TRACE(path->p_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, path->p_bh);
        /*
         * The extent buffer's verified bit will be set again in
         * __ext4_ext_dirty().  If the extent update procedure breaks off
         * due to some error, the buffer could be left inconsistent, so
         * force it to be checked again.
         */
        if (!err)
                clear_buffer_verified(path->p_bh);
        return err;
}

/*
 * Mark the tree node that 'path' points to as dirty.
 *
 * For a node stored in a block the checksum is refreshed, the buffer is
 * handed to the journal as dirty metadata and, on success, its verified
 * flag is set again.  For a node in the inode body the inode itself is
 * marked dirty.  'where' and 'line' identify the caller.
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int __ext4_ext_dirty(const char *where, unsigned int line,
                            handle_t *handle, struct inode *inode,
                            struct ext4_ext_path *path)
{
        int err;

        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

        /* path points to leaf/index in inode body */
        if (!path->p_bh)
                return ext4_mark_inode_dirty(handle, inode);

        /* path points to block */
        ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
        err = __ext4_handle_dirty_metadata(where, line, handle,
                                           inode, path->p_bh);
        /* Extents updating done, re-set verified flag */
        if (!err)
                set_buffer_verified(path->p_bh);
        return err;
}

/*
 * Wrapper around __ext4_ext_dirty() that supplies the calling function's
 * name and line number.
 */
#define ext4_ext_dirty(handle, inode, path) \
                __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

/*
 * Choose a goal physical block for allocating logical block 'block'.
 *
 * Preference order:
 *  1. extrapolate from the extent found at the bottom of 'path';
 *  2. the block number of the buffer holding the bottom node of 'path';
 *  3. the goal block derived from the inode (ext4_inode_to_goal_block()).
 */
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
                              ext4_lblk_t block)
{
        struct ext4_extent *ex;
        int depth;

        /* no path at all: use inode's group */
        if (!path)
                return ext4_inode_to_goal_block(inode);

        depth = path->p_depth;

        /*
         * Try to predict block placement assuming that we are filling in
         * a file which will eventually be non-sparse --- i.e., in the case
         * of libbfd writing an ELF object sections out-of-order but in a
         * way that eventually results in a contiguous object or executable
         * file, or some database extending a table space file.  However,
         * this is actually somewhat non-ideal if we are writing a sparse
         * file such as qemu or KVM writing a raw image file that is going
         * to stay fairly sparse, since it will end up fragmenting the file
         * system's free space.  Maybe we should have some heuristics or
         * some way to allow userspace to pass a hint to the file system,
         * especially if the latter case turns out to be common.
         */
        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ex_lblk = le32_to_cpu(ex->ee_block);
                ext4_fsblk_t ex_pblk = ext4_ext_pblock(ex);

                if (block > ex_lblk)
                        return ex_pblk + (block - ex_lblk);
                return ex_pblk - (ex_lblk - block);
        }

        /*
         * it looks like index is empty;
         * try to find starting block from index itself
         */
        if (path[depth].p_bh)
                return path[depth].p_bh->b_blocknr;

        /* OK. use inode's group */
        return ext4_inode_to_goal_block(inode);
}

/*
 * Allocate one block for extent tree metadata.
 *
 * The allocation goal is derived from 'path' and the logical start of 'ex'
 * via ext4_ext_find_goal().  The new block number is returned; the status
 * of the allocation is stored through 'err' by ext4_new_meta_blocks().
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path,
                        struct ext4_extent *ex, int *err, unsigned int flags)
{
        ext4_fsblk_t goal = ext4_ext_find_goal(inode, path,
                                               le32_to_cpu(ex->ee_block));

        return ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err);
}

/*
 * Number of extent entries that fit into one filesystem block after the
 * extent header.  With AGGRESSIVE_TEST defined and 'check' zero, the
 * result is capped at 6.
 */
static inline int ext4_ext_space_block(struct inode *inode, int check)
{
        int nr = (inode->i_sb->s_blocksize -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent);

#ifdef AGGRESSIVE_TEST
        if (!check && nr > 6)
                nr = 6;
#endif
        return nr;
}

/*
 * Number of index entries that fit into one filesystem block after the
 * extent header.  With AGGRESSIVE_TEST defined and 'check' zero, the
 * result is capped at 5.
 */
static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
        int nr = (inode->i_sb->s_blocksize -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent_idx);

#ifdef AGGRESSIVE_TEST
        if (!check && nr > 5)
                nr = 5;
#endif
        return nr;
}

/*
 * Number of extent entries that fit into the inode's i_data area after the
 * extent header.  With AGGRESSIVE_TEST defined and 'check' zero, the
 * result is capped at 3.
 */
static inline int ext4_ext_space_root(struct inode *inode, int check)
{
        int nr = sizeof(EXT4_I(inode)->i_data);

        nr -= sizeof(struct ext4_extent_header);
        nr /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
        if (!check && nr > 3)
                nr = 3;
#endif
        return nr;
}

/*
 * Number of index entries that fit into the inode's i_data area after the
 * extent header.  With AGGRESSIVE_TEST defined and 'check' zero, the
 * result is capped at 4.
 */
static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
        int nr = sizeof(EXT4_I(inode)->i_data);

        nr -= sizeof(struct ext4_extent_header);
        nr /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
        if (!check && nr > 4)
                nr = 4;
#endif
        return nr;
}

/*
 * Split the extent at the bottom of *ppath at logical block 'lblk'.
 *
 * If that extent is unwritten, both halves are marked unwritten.  The split
 * always runs with EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; a non-zero
 * 'nofail' additionally sets EXT4_GET_BLOCKS_METADATA_NOFAIL and
 * EXT4_EX_NOFAIL.  Returns the result of ext4_split_extent_at().
 */
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
                           struct ext4_ext_path **ppath, ext4_lblk_t lblk,
                           int nofail)
{
        struct ext4_ext_path *path = *ppath;
        int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
        int split_flag = 0;

        if (ext4_ext_is_unwritten(path[path->p_depth].p_ext))
                split_flag = EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2;

        if (nofail)
                flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;

        return ext4_split_extent_at(handle, inode, ppath, lblk, split_flag,
                                    flags);
}

/*
 * Maximum number of entries a tree node at 'depth' may hold.
 *
 * A node whose depth equals the tree depth is the root stored in the
 * inode; any other node occupies a block.  Depth 0 nodes hold extents,
 * deeper nodes hold index entries.
 */
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
        int in_inode = (depth == ext_depth(inode));

        if (depth == 0)
                return in_inode ? ext4_ext_space_root(inode, 1) :
                                  ext4_ext_space_block(inode, 1);

        return in_inode ? ext4_ext_space_root_idx(inode, 1) :
                          ext4_ext_space_block_idx(inode, 1);
}

/*
 * Sanity-check a single extent.  Returns 0 for an extent of zero length or
 * one whose logical range wraps around; otherwise returns the verdict of
 * ext4_inode_block_valid() on its physical range.
 */
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
        ext4_lblk_t first = le32_to_cpu(ext->ee_block);
        int count = ext4_ext_get_actual_len(ext);

        /*
         * We allow neither:
         *  - zero length
         *  - overflow/wrap-around
         */
        if (first + count <= first)
                return 0;

        return ext4_inode_block_valid(inode, ext4_ext_pblock(ext), count);
}

/*
 * Sanity-check a single index entry: the one block it points to must pass
 * ext4_inode_block_valid().
 */
static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
{
        return ext4_inode_block_valid(inode, ext4_idx_pblock(ext_idx), 1);
}

/*
 * Validate all entries of the tree node described by 'eh'.
 *
 * @lblk:  logical block recorded for this node in its parent; only compared
 *         when the node is not the root
 * @pblk:  on an overlap failure, receives the physical block of the
 *         offending entry
 * @depth: depth of the node (0 means leaf)
 *
 * Returns 1 if the node is empty or every entry is valid and the entries do
 * not overlap, 0 otherwise.
 */
static int ext4_valid_extent_entries(struct inode *inode,
                                     struct ext4_extent_header *eh,
                                     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
                                     int depth)
{
        unsigned short left;
        ext4_lblk_t start;
        ext4_lblk_t next = 0;

        if (eh->eh_entries == 0)
                return 1;

        left = le16_to_cpu(eh->eh_entries);

        if (depth == 0) {
                /* leaf entries */
                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);

                /*
                 * The logical block in the first entry should equal to
                 * the number in the index block.
                 */
                if (depth != ext_depth(inode) &&
                    lblk != le32_to_cpu(ext->ee_block))
                        return 0;

                for (; left; left--, ext++) {
                        if (!ext4_valid_extent(inode, ext))
                                return 0;

                        /* Check for overlapping extents */
                        start = le32_to_cpu(ext->ee_block);
                        if (start < next) {
                                *pblk = ext4_ext_pblock(ext);
                                return 0;
                        }
                        next = start + ext4_ext_get_actual_len(ext);
                }
                return 1;
        }

        {
                /* index entries */
                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);

                /*
                 * The logical block in the first entry should equal to
                 * the number in the parent index block.
                 */
                if (depth != ext_depth(inode) &&
                    lblk != le32_to_cpu(ext_idx->ei_block))
                        return 0;

                for (; left; left--, ext_idx++) {
                        if (!ext4_valid_extent_idx(inode, ext_idx))
                                return 0;

                        /* Check for overlapping index extents */
                        start = le32_to_cpu(ext_idx->ei_block);
                        if (start < next) {
                                *pblk = ext4_idx_pblock(ext_idx);
                                return 0;
                        }
                        next = start + 1;
                }
        }
        return 1;
}

/*
 * Validate the header and entries of one extent tree node.
 *
 * @function, @line: caller identification, forwarded to the error report
 * @inode:           inode owning the tree
 * @eh:              header of the node to check
 * @depth:           depth the node is expected to have
 * @pblk:            physical block of the node, used in the error report;
 *                   ext4_valid_extent_entries() may replace it with the
 *                   block of an overlapping entry
 * @lblk:            logical block recorded for this node in its parent
 *
 * Returns 0 if the node looks sane.  Otherwise the problem is reported via
 * ext4_error_inode_err() and -EFSCORRUPTED is returned, or -EFSBADCRC when
 * the checksum of a non-root node does not verify.
 */
static int __ext4_ext_check(const char *function, unsigned int line,
                            struct inode *inode, struct ext4_extent_header *eh,
                            int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
        const char *error_msg;
        int max = 0, err = -EFSCORRUPTED;

        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
                error_msg = "invalid magic";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
                error_msg = "unexpected eh_depth";
                goto corrupted;
        }
        if (unlikely(eh->eh_max == 0)) {
                error_msg = "invalid eh_max";
                goto corrupted;
        }
        /* eh_max may not exceed what physically fits in this kind of node */
        max = ext4_ext_max_entries(inode, depth);
        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
                error_msg = "too large eh_max";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
        /* only a leaf is allowed to be empty */
        if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
                error_msg = "eh_entries is 0 but eh_depth is > 0";
                goto corrupted;
        }
        if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
                error_msg = "invalid extent entries";
                goto corrupted;
        }
        if (unlikely(depth > 32)) {
                error_msg = "too large eh_depth";
                goto corrupted;
        }
        /* Verify checksum on non-root extent tree nodes */
        if (ext_depth(inode) != depth &&
            !ext4_extent_block_csum_verify(inode, eh)) {
                error_msg = "extent tree corrupted";
                err = -EFSBADCRC;
                goto corrupted;
        }
        return 0;

corrupted:
        /* err is negative here; the error report is given the positive code */
        ext4_error_inode_err(inode, function, line, 0, -err,
                             "pblk %llu bad header/extent: %s - magic %x, "
                             "entries %u, max %u(%u), depth %u(%u)",
                             (unsigned long long) pblk, error_msg,
                             le16_to_cpu(eh->eh_magic),
                             le16_to_cpu(eh->eh_entries),
                             le16_to_cpu(eh->eh_max),
                             max, le16_to_cpu(eh->eh_depth), depth);
        return err;
}

/*
 * Wrapper around __ext4_ext_check() that supplies the calling function's
 * name and line number and passes 0 as the parent's logical block.
 */
#define ext4_ext_check(inode, eh, depth, pblk)                        \
        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)

/*
 * Validate the root of the extent tree stored in the inode body.
 * Returns 0 if it is sane, a negative error code otherwise.
 */
int ext4_ext_check_inode(struct inode *inode)
{
        struct ext4_extent_header *root = ext_inode_hdr(inode);

        return ext4_ext_check(inode, root, ext_depth(inode), 0);
}

/*
 * Feed every extent of the leaf described by 'eh' into the extent status
 * tree.  A gap between two consecutive extents is cached as a hole; extents
 * are cached as written or unwritten according to their on-disk state.
 */
static void ext4_cache_extents(struct inode *inode,
                               struct ext4_extent_header *eh)
{
        struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
        ext4_lblk_t next = 0;   /* first block after the previous extent */
        int left;

        for (left = le16_to_cpu(eh->eh_entries); left > 0; left--, ex++) {
                ext4_lblk_t start = le32_to_cpu(ex->ee_block);
                int len = ext4_ext_get_actual_len(ex);
                unsigned int status;

                /* nothing is cached in front of the first extent */
                if (next && (next != start))
                        ext4_es_cache_extent(inode, next, start - next, ~0,
                                             EXTENT_STATUS_HOLE);

                if (ext4_ext_is_unwritten(ex))
                        status = EXTENT_STATUS_UNWRITTEN;
                else
                        status = EXTENT_STATUS_WRITTEN;

                ext4_es_cache_extent(inode, start, len,
                                     ext4_ext_pblock(ex), status);
                next = start + len;
        }
}

/*
 * Read and validate the extent tree block that index entry 'idx' points to.
 *
 * @function, @line: caller identification, forwarded to __ext4_ext_check()
 * @inode:           inode owning the tree
 * @idx:             index entry naming the block to read
 * @depth:           depth the block is expected to have (0 means leaf)
 * @flags:           EXT4_EX_NOFAIL makes the buffer allocation __GFP_NOFAIL,
 *                   EXT4_EX_FORCE_CACHE forces a re-check of an already
 *                   verified buffer, EXT4_EX_NOCACHE suppresses caching of
 *                   leaf extents
 *
 * Returns the buffer head with a reference held, or an ERR_PTR() on
 * failure (-ENOMEM, a read error, or the result of __ext4_ext_check()).
 */
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
                         struct inode *inode, struct ext4_extent_idx *idx,
                         int depth, int flags)
{
        struct buffer_head                *bh;
        int                                err;
        gfp_t                                gfp_flags = __GFP_MOVABLE | GFP_NOFS;
        ext4_fsblk_t                        pblk;

        if (flags & EXT4_EX_NOFAIL)
                gfp_flags |= __GFP_NOFAIL;

        pblk = ext4_idx_pblock(idx);
        bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        /* buffer is not up to date yet: read it from disk */
        if (!bh_uptodate_or_lock(bh)) {
                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
                err = ext4_read_bh(bh, 0, NULL);
                if (err < 0)
                        goto errout;
        }
        /* already checked earlier and no forced re-check requested */
        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
                return bh;
        err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
                               depth, pblk, le32_to_cpu(idx->ei_block));
        if (err)
                goto errout;
        set_buffer_verified(bh);
        /*
         * If this is a leaf block, cache all of its entries
         */
        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
                struct ext4_extent_header *eh = ext_block_hdr(bh);
                ext4_cache_extents(inode, eh);
        }
        return bh;
errout:
        /* drop the reference taken by sb_getblk_gfp() */
        put_bh(bh);
        return ERR_PTR(err);

}

/*
 * Wrapper around __read_extent_tree_block() that supplies the calling
 * function's name and line number.
 */
#define read_extent_tree_block(inode, idx, depth, flags)                \
        __read_extent_tree_block(__func__, __LINE__, (inode), (idx),        \
                                 (depth), (flags))

/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
int ext4_ext_precache(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_ext_path *path = NULL;
        struct buffer_head *bh;
        int i = 0, depth, ret = 0;

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return 0;        /* not an extent-mapped inode */

        down_read(&ei->i_data_sem);
        depth = ext_depth(inode);

        /* Don't cache anything if there are no external extent blocks */
        if (!depth) {
                up_read(&ei->i_data_sem);
                return ret;
        }

        path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
                       GFP_NOFS);
        if (path == NULL) {
                up_read(&ei->i_data_sem);
                return -ENOMEM;
        }

        path[0].p_hdr = ext_inode_hdr(inode);
        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
        if (ret)
                goto out;
        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
        while (i >= 0) {
                /*
                 * If this is a leaf block or we've reached the end of
                 * the index block, go up
                 */
                if ((i == depth) ||
                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }
                bh = read_extent_tree_block(inode, path[i].p_idx++,
                                            depth - i - 1,
                                            EXT4_EX_FORCE_CACHE);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
                        break;
                }
                i++;
                path[i].p_bh = bh;
                path[i].p_hdr = ext_block_hdr(bh);
                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
        }
        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
        up_read(&ei->i_data_sem);
        ext4_ext_drop_refs(path);
        kfree(path);
        return ret;
}

#ifdef EXT_DEBUG
/*
 * Debug helper (EXT_DEBUG only): print one item per level of 'path', from
 * path[0] down to path[p_depth].  A level with an index pointer is shown as
 * "logical->physical", a level with an extent pointer as
 * "logical:[unwritten]len:physical", and a level with neither as "[]".
 */
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
        int k, l = path->p_depth;

        ext_debug(inode, "path:");
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                        ext_debug(inode, "  %d->%llu",
                                  le32_to_cpu(path->p_idx->ei_block),
                                  ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug(inode, "  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_unwritten(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
                                  ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug(inode, "  []");
        }
        ext_debug(inode, "\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
        int depth = ext_depth(inode);
        struct ext4_extent_header *eh;
        struct ext4_extent *ex;
        int i;

        if (!path)
                return;

        eh = path[depth].p_hdr;
        ex = EXT_FIRST_EXTENT(eh);

        ext_debug(inode, "Displaying leaf extents\n");

        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_unwritten(ex),
                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug(inode, "\n");
}

/*
 * Debug helper (EXT_DEBUG only): print the entries of path[level], from the
 * current position up to the last slot of the node (EXT_MAX_INDEX /
 * EXT_MAX_EXTENT), together with 'newblock' as their new home.  Index
 * entries are printed when 'level' is not the tree depth, extents
 * otherwise.
 */
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
                        ext4_fsblk_t newblock, int level)
{
        int depth = ext_depth(inode);
        struct ext4_extent *ex;

        /* an index level */
        if (depth != level) {
                struct ext4_extent_idx *idx;
                idx = path[level].p_idx;
                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
                        ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
                                  level, le32_to_cpu(idx->ei_block),
                                  ext4_idx_pblock(idx), newblock);
                        idx++;
                }

                return;
        }

        /* the leaf level */
        ex = path[depth].p_ext;
        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(ex->ee_block),
                                ext4_ext_pblock(ex),
                                ext4_ext_is_unwritten(ex),
                                ext4_ext_get_actual_len(ex),
                                newblock);
                ex++;
        }
}

#else
/* EXT_DEBUG is not defined: the debug dump helpers expand to nothing */
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

/*
 * Release the buffer heads held by levels 0 .. path->p_depth of 'path' and
 * reset the p_bh pointers to NULL.  The path array itself is not freed.
 * A NULL 'path' is accepted and ignored.
 */
void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
        int depth, level;

        if (path == NULL)
                return;

        /* the depth is taken from the first element of the array */
        depth = path->p_depth;
        for (level = 0; level <= depth; level++) {
                brelse(path[level].p_bh);
                path[level].p_bh = NULL;
        }
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 *
 * On return path->p_idx points to the last index entry whose ei_block is
 * not greater than 'block', or to the first entry if there is none.
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
                        struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent_idx *r, *l, *m;


        ext_debug(inode, "binsearch for %u(idx):  ", block);

        /*
         * The search starts at the second entry so that "l - 1" below is
         * never before the first entry.
         */
        l = EXT_FIRST_INDEX(eh) + 1;
        r = EXT_LAST_INDEX(eh);
        while (l <= r) {
                m = l + (r - l) / 2;
                if (block < le32_to_cpu(m->ei_block))
                        r = m - 1;
                else
                        l = m + 1;
                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
                          le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
                          r, le32_to_cpu(r->ei_block));
        }

        /* l is now the first entry that starts beyond 'block' */
        path->p_idx = l - 1;
        ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
        /* cross-check the result against a linear scan of the node */
        {
                struct ext4_extent_idx *chix, *ix;
                int k;

                chix = ix = EXT_FIRST_INDEX(eh);
                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
                        if (k != 0 && le32_to_cpu(ix->ei_block) <=
                            le32_to_cpu(ix[-1].ei_block)) {
                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
                                       "first=0x%p\n", k,
                                       ix, EXT_FIRST_INDEX(eh));
                                printk(KERN_DEBUG "%u <= %u\n",
                                       le32_to_cpu(ix->ei_block),
                                       le32_to_cpu(ix[-1].ei_block));
                        }
                        BUG_ON(k && le32_to_cpu(ix->ei_block)
                                           <= le32_to_cpu(ix[-1].ei_block));
                        if (block < le32_to_cpu(ix->ei_block))
                                break;
                        chix = ix;
                }
                BUG_ON(chix != path->p_idx);
        }
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 *
 * On return path->p_ext points to the last extent whose ee_block is not
 * greater than 'block', or to the first extent if there is none.  For an
 * empty leaf path->p_ext is left untouched.
 */
static void
ext4_ext_binsearch(struct inode *inode,
                struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent *r, *l, *m;

        if (eh->eh_entries == 0) {
                /*
                 * this leaf is empty:
                 * we get such a leaf in split/add case
                 */
                return;
        }

        ext_debug(inode, "binsearch for %u:  ", block);

        /*
         * The search starts at the second entry so that "l - 1" below is
         * never before the first entry.
         */
        l = EXT_FIRST_EXTENT(eh) + 1;
        r = EXT_LAST_EXTENT(eh);

        while (l <= r) {
                m = l + (r - l) / 2;
                if (block < le32_to_cpu(m->ee_block))
                        r = m - 1;
                else
                        l = m + 1;
                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
                          le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
                          r, le32_to_cpu(r->ee_block));
        }

        /* l is now the first extent that starts beyond 'block' */
        path->p_ext = l - 1;
        ext_debug(inode, "  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
                        ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_unwritten(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
        /* cross-check the result against a linear scan of the leaf */
        {
                struct ext4_extent *chex, *ex;
                int k;

                chex = ex = EXT_FIRST_EXTENT(eh);
                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
                        BUG_ON(k && le32_to_cpu(ex->ee_block)
                                          <= le32_to_cpu(ex[-1].ee_block));
                        if (block < le32_to_cpu(ex->ee_block))
                                break;
                        chex = ex;
                }
                BUG_ON(chex != path->p_ext);
        }
#endif

}

/*
 * Initialise an empty extent tree in the inode body: a depth-0 root header
 * with no entries, sized by ext4_ext_space_root(), and mark the inode
 * dirty.
 */
void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
        struct ext4_extent_header *root = ext_inode_hdr(inode);

        root->eh_magic = EXT4_EXT_MAGIC;
        root->eh_depth = 0;
        root->eh_entries = 0;
        root->eh_generation = 0;
        root->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
        ext4_mark_inode_dirty(handle, inode);
}

/*
 * ext4_find_extent:
 * walk the extent tree of 'inode' from the root in the inode body down to
 * the leaf responsible for logical block 'block' and record every visited
 * node in a path array.
 *
 * @orig_path: optional.  If it points to an existing path, that path's
 *             buffer references are dropped and the array is reused, unless
 *             the tree is now deeper than its p_maxdepth, in which case it
 *             is freed and a new one is allocated.  On success *orig_path
 *             is set to the returned path; on a failure that goes through
 *             the 'err' label it is set to NULL.
 * @flags:     EXT4_EX_NOFAIL makes the allocation __GFP_NOFAIL;
 *             EXT4_EX_NOCACHE suppresses caching of the extents of a
 *             depth-0 tree; the flags are also passed to
 *             read_extent_tree_block().
 *
 * Returns the path (path[0] is the root, the last used element is the leaf
 * with p_ext pointing to the closest extent, or NULL for an empty leaf), or
 * an ERR_PTR(): -EFSCORRUPTED for an invalid tree depth, -ENOMEM, or the
 * error from reading a tree block.
 */
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
                 struct ext4_ext_path **orig_path, int flags)
{
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
        short int depth, i, ppos = 0;
        int ret;
        gfp_t gfp_flags = GFP_NOFS;

        if (flags & EXT4_EX_NOFAIL)
                gfp_flags |= __GFP_NOFAIL;

        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);
        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
                EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
                                 depth);
                ret = -EFSCORRUPTED;
                goto err;
        }

        if (path) {
                /* reuse the caller's path if it is large enough */
                ext4_ext_drop_refs(path);
                if (depth > path[0].p_maxdepth) {
                        kfree(path);
                        *orig_path = path = NULL;
                }
        }
        if (!path) {
                /* account possible depth increase */
                path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
                                gfp_flags);
                if (unlikely(!path))
                        return ERR_PTR(-ENOMEM);
                path[0].p_maxdepth = depth + 1;
        }
        /* the root node lives in the inode body, so it has no buffer */
        path[0].p_hdr = eh;
        path[0].p_bh = NULL;

        i = depth;
        if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
                ext4_cache_extents(inode, eh);
        /* walk through the tree */
        while (i) {
                ext_debug(inode, "depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

                ext4_ext_binsearch_idx(inode, path + ppos, block);
                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;

                /* the child is expected to be one level closer to the leaves */
                bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
                        goto err;
                }

                eh = ext_block_hdr(bh);
                ppos++;
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
        }

        /* ppos now names the leaf; i is 0 here */
        path[ppos].p_depth = i;
        path[ppos].p_ext = NULL;
        path[ppos].p_idx = NULL;

        /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

        ext4_ext_show_path(inode, path);

        if (orig_path)
                *orig_path = path;
        return path;

err:
        /* the path is freed here, so the caller's pointer must not survive */
        ext4_ext_drop_refs(path);
        kfree(path);
        if (orig_path)
                *orig_path = NULL;
        return ERR_PTR(ret);
}

/*
 * ext4_ext_insert_index:
 * Insert a new index entry that maps logical block @logical to tree block
 * @ptr into the index node referenced by @curp.  The entry is placed right
 * after @curp->p_idx when @logical is larger than that entry's logical
 * block; otherwise it takes the place of @curp->p_idx, which is shifted
 * one slot to the right together with everything behind it.
 *
 * Returns 0 on success, -EFSCORRUPTED when one of the consistency checks
 * on the node fails, or the error reported by ext4_ext_get_access() or
 * ext4_ext_dirty().
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                                 struct ext4_ext_path *curp,
                                 int logical, ext4_fsblk_t ptr)
{
        struct ext4_extent_header *hdr;
        struct ext4_extent_idx *slot;
        int nr_shift, ret;

        ret = ext4_ext_get_access(handle, inode, curp);
        if (ret)
                return ret;

        hdr = curp->p_hdr;

        /* an entry for exactly this logical block must not exist yet */
        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
                EXT4_ERROR_INODE(inode,
                                 "logical %d == ei_block %d!",
                                 logical, le32_to_cpu(curp->p_idx->ei_block));
                return -EFSCORRUPTED;
        }

        /* the node must still have at least one unused slot */
        if (unlikely(le16_to_cpu(hdr->eh_entries) >=
                     le16_to_cpu(hdr->eh_max))) {
                EXT4_ERROR_INODE(inode,
                                 "eh_entries %d >= eh_max %d!",
                                 le16_to_cpu(hdr->eh_entries),
                                 le16_to_cpu(hdr->eh_max));
                return -EFSCORRUPTED;
        }

        if (logical <= le32_to_cpu(curp->p_idx->ei_block)) {
                /* insert before */
                ext_debug(inode, "insert new index %d before: %llu\n",
                          logical, ptr);
                slot = curp->p_idx;
        } else {
                /* insert after */
                ext_debug(inode, "insert new index %d after: %llu\n",
                          logical, ptr);
                slot = curp->p_idx + 1;
        }

        if (unlikely(slot > EXT_MAX_INDEX(hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
                return -EFSCORRUPTED;
        }

        /* open a gap by moving the entries from @slot onwards one step */
        nr_shift = EXT_LAST_INDEX(hdr) - slot + 1;
        BUG_ON(nr_shift < 0);
        if (nr_shift > 0) {
                ext_debug(inode, "insert new index %d: "
                                "move %d indices from 0x%p to 0x%p\n",
                                logical, nr_shift, slot, slot + 1);
                memmove(slot + 1, slot, nr_shift * sizeof(*slot));
        }

        /* fill in the new entry and account for it in the header */
        slot->ei_block = cpu_to_le32(logical);
        ext4_idx_store_pblock(slot, ptr);
        le16_add_cpu(&hdr->eh_entries, 1);

        if (unlikely(slot > EXT_LAST_INDEX(hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
                return -EFSCORRUPTED;
        }

        ret = ext4_ext_dirty(handle, inode, curp);
        ext4_std_error(inode->i_sb, ret);

        return ret;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 *
 * @flags is passed on to ext4_ext_new_meta_block(); when it contains
 * EXT4_EX_NOFAIL the allocation of the block-tracking array additionally
 * uses __GFP_NOFAIL.  @newext is the extent about to be inserted; its
 * logical start becomes the border when the leaf is not split in the
 * middle.
 *
 * Returns 0 on success or a negative error code.  On error every block
 * recorded in the tracking array is handed back via ext4_free_blocks().
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
                          unsigned int flags,
                          struct ext4_ext_path *path,
                          struct ext4_extent *newext, int at)
{
        struct buffer_head *bh = NULL;
        int depth = ext_depth(inode);
        struct ext4_extent_header *neh;
        struct ext4_extent_idx *fidx;
        int i = at, k, m, a;
        ext4_fsblk_t newblock, oldblock;
        __le32 border;
        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
        gfp_t gfp_flags = GFP_NOFS;
        int err = 0;
        size_t ext_size = 0;

        if (flags & EXT4_EX_NOFAIL)
                gfp_flags |= __GFP_NOFAIL;

        /* make decision: where to split? */
        /* FIXME: now decision is simplest: at current extent */

        /* if current leaf will be split, then we should use
         * border from split point */
        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
                return -EFSCORRUPTED;
        }
        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
                border = path[depth].p_ext[1].ee_block;
                ext_debug(inode, "leaf will be split."
                                " next leaf starts at %d\n",
                                  le32_to_cpu(border));
        } else {
                border = newext->ee_block;
                ext_debug(inode, "leaf will be added."
                                " next leaf starts at %d\n",
                                le32_to_cpu(border));
        }

        /*
         * If error occurs, then we break processing
         * and mark filesystem read-only. index won't
         * be inserted and tree will be in consistent
         * state. Next mount will repair buffers too.
         */

        /*
         * Get array to track all allocated blocks.
         * We need this to handle errors and free blocks
         * upon them.
         */
        ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
        if (!ablocks)
                return -ENOMEM;

        /* allocate all needed blocks */
        ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
                newblock = ext4_ext_new_meta_block(handle, inode, path,
                                                   newext, &err, flags);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
        }

        /* initialize new leaf */
        newblock = ablocks[--a];
        if (unlikely(newblock == 0)) {
                EXT4_ERROR_INODE(inode, "newblock == 0!");
                err = -EFSCORRUPTED;
                goto cleanup;
        }
        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
        if (unlikely(!bh)) {
                err = -ENOMEM;
                goto cleanup;
        }
        lock_buffer(bh);

        err = ext4_journal_get_create_access(handle, bh);
        if (err)
                goto cleanup;

        neh = ext_block_hdr(bh);
        neh->eh_entries = 0;
        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        neh->eh_depth = 0;
        neh->eh_generation = 0;

        /* move remainder of path[depth] to the new leaf */
        if (unlikely(path[depth].p_hdr->eh_entries !=
                     path[depth].p_hdr->eh_max)) {
                /* header fields are little-endian on disk: convert to print */
                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
                                 le16_to_cpu(path[depth].p_hdr->eh_entries),
                                 le16_to_cpu(path[depth].p_hdr->eh_max));
                err = -EFSCORRUPTED;
                goto cleanup;
        }
        /* start copy from next extent */
        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
        ext4_ext_show_move(inode, path, newblock, depth);
        if (m) {
                struct ext4_extent *ex;
                ex = EXT_FIRST_EXTENT(neh);
                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
                le16_add_cpu(&neh->eh_entries, m);
        }

        /* zero out unused area in the extent block */
        ext_size = sizeof(struct ext4_extent_header) +
                sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto cleanup;
        brelse(bh);
        bh = NULL;

        /* correct old leaf */
        if (m) {
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto cleanup;
                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto cleanup;

        }

        /* create intermediate indexes */
        k = depth - at - 1;
        if (unlikely(k < 0)) {
                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
                err = -EFSCORRUPTED;
                goto cleanup;
        }
        if (k)
                ext_debug(inode, "create %d intermediate indices\n", k);
        /* insert new index into current index block */
        /* current depth stored in i var */
        i = depth - 1;
        while (k--) {
                oldblock = newblock;
                newblock = ablocks[--a];
                bh = sb_getblk(inode->i_sb, newblock);
                if (unlikely(!bh)) {
                        err = -ENOMEM;
                        goto cleanup;
                }
                lock_buffer(bh);

                err = ext4_journal_get_create_access(handle, bh);
                if (err)
                        goto cleanup;

                /* new index block starts with one entry: the block below */
                neh = ext_block_hdr(bh);
                neh->eh_entries = cpu_to_le16(1);
                neh->eh_magic = EXT4_EXT_MAGIC;
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
                neh->eh_depth = cpu_to_le16(depth - i);
                neh->eh_generation = 0;
                fidx = EXT_FIRST_INDEX(neh);
                fidx->ei_block = border;
                ext4_idx_store_pblock(fidx, oldblock);

                ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
                                i, newblock, le32_to_cpu(border), oldblock);

                /* move remainder of path[i] to the new index block */
                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
                                        EXT_LAST_INDEX(path[i].p_hdr))) {
                        /*
                         * path[i] is an index level: ext4_find_extent()
                         * leaves p_ext NULL there, so report the logical
                         * block of the index entry instead of
                         * dereferencing p_ext.
                         */
                        EXT4_ERROR_INODE(inode,
                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
                                         le32_to_cpu(path[i].p_idx->ei_block));
                        err = -EFSCORRUPTED;
                        goto cleanup;
                }
                /* start copy indexes */
                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
                ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
                                EXT_MAX_INDEX(path[i].p_hdr));
                ext4_ext_show_move(inode, path, newblock, i);
                if (m) {
                        memmove(++fidx, path[i].p_idx,
                                sizeof(struct ext4_extent_idx) * m);
                        le16_add_cpu(&neh->eh_entries, m);
                }
                /* zero out unused area in the extent block */
                ext_size = sizeof(struct ext4_extent_header) +
                   (sizeof(struct ext4_extent_idx) *
                    le16_to_cpu(neh->eh_entries));
                memset(bh->b_data + ext_size, 0,
                        inode->i_sb->s_blocksize - ext_size);
                ext4_extent_block_csum_set(inode, neh);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto cleanup;
                brelse(bh);
                bh = NULL;

                /* correct old index */
                if (m) {
                        err = ext4_ext_get_access(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
                        err = ext4_ext_dirty(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                }

                i--;
        }

        /* insert new index */
        err = ext4_ext_insert_index(handle, inode, path + at,
                                    le32_to_cpu(border), newblock);

cleanup:
        /* a buffer still held here was abandoned mid-initialization */
        if (bh) {
                if (buffer_locked(bh))
                        unlock_buffer(bh);
                brelse(bh);
        }

        if (err) {
                /* free all allocated blocks in error case */
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
                                         EXT4_FREE_BLOCKS_METADATA);
                }
        }
        kfree(ablocks);

        return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 *
 * @flags is passed to ext4_new_meta_blocks(); EXT4_MB_HINT_TRY_GOAL is
 * added when a goal right before the first index block can be used.
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                                 unsigned int flags)
{
        struct ext4_extent_header *neh;
        struct buffer_head *bh;
        ext4_fsblk_t newblock, goal = 0;
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        int err = 0;
        size_t ext_size = 0;

        /* Try to prepend new index to old one */
        if (ext_depth(inode))
                goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
        if (goal > le32_to_cpu(es->s_first_data_block)) {
                flags |= EXT4_MB_HINT_TRY_GOAL;
                goal--;
        } else
                goal = ext4_inode_to_goal_block(inode);
        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
                                        NULL, &err);
        if (newblock == 0)
                return err;

        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
        if (unlikely(!bh)) {
                /*
                 * Nothing references the freshly allocated block yet, so
                 * hand it back instead of leaking it.
                 */
                ext4_free_blocks(handle, inode, NULL, newblock, 1,
                                 EXT4_FREE_BLOCKS_METADATA);
                return -ENOMEM;
        }
        lock_buffer(bh);

        /*
         * NOTE(review): the error exits below (before the root is updated)
         * also leave newblock allocated - confirm whether they should free
         * it as well.
         */
        err = ext4_journal_get_create_access(handle, bh);
        if (err) {
                unlock_buffer(bh);
                goto out;
        }

        ext_size = sizeof(EXT4_I(inode)->i_data);
        /* move top-level index/leaf into new block */
        memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
        /* zero out unused area in the extent block */
        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);

        /* set size of new block */
        neh = ext_block_hdr(bh);
        /* old root could have indexes or leaves
         * so calculate e_max right way */
        if (ext_depth(inode))
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
        else
                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto out;

        /* Update top-level index: num,max,pointer */
        neh = ext_inode_hdr(inode);
        neh->eh_entries = cpu_to_le16(1);
        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
        if (neh->eh_depth == 0) {
                /* Root extent block becomes index block */
                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
                EXT_FIRST_INDEX(neh)->ei_block =
                        EXT_FIRST_EXTENT(neh)->ee_block;
        }
        ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

        /* the root now sits one level above the copied block */
        le16_add_cpu(&neh->eh_depth, 1);
        err = ext4_mark_inode_dirty(handle, inode);
out:
        brelse(bh);

        return err;
}

/*
 * ext4_ext_create_new_leaf:
 * Make room in the tree by adding a new leaf.  Starting at the leaf level
 * of *@ppath, walk towards the root until a node with a free index slot is
 * found and split the tree below that node.  When no level has a free
 * slot, grow the tree by one level instead, and start over if the leaf
 * reached afterwards is still full.
 *
 * After every change the path is looked up again with ext4_find_extent()
 * through @ppath.  Returns 0 on success or a negative error code.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
                                    unsigned int mb_flags,
                                    unsigned int gb_flags,
                                    struct ext4_ext_path **ppath,
                                    struct ext4_extent *newext)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_ext_path *node;
        int depth, level, can_split, ret;

        for (;;) {
                depth = ext_depth(inode);

                /* walk up the tree and look for a free index entry */
                for (level = depth, node = path + depth;
                     level > 0 && !EXT_HAS_FREE_INDEX(node);
                     level--, node--)
                        ;

                /*
                 * With a free entry at @level we create the needed subtree
                 * and add the new leaf there; otherwise the tree is full
                 * and has to grow in depth.
                 */
                can_split = EXT_HAS_FREE_INDEX(node);
                if (can_split)
                        ret = ext4_ext_split(handle, inode, mb_flags, path,
                                             newext, level);
                else
                        ret = ext4_ext_grow_indepth(handle, inode, mb_flags);
                if (ret)
                        return ret;

                /* refill path */
                path = ext4_find_extent(inode,
                                (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                ppath, gb_flags);
                if (IS_ERR(path)) {
                        ret = PTR_ERR(path);
                        return ret;
                }

                if (can_split)
                        return 0;

                /*
                 * only first (depth 0 -> 1) produces free space;
                 * in all other cases we have to split the grown tree
                 */
                depth = ext_depth(inode);
                if (path[depth].p_hdr->eh_entries !=
                    path[depth].p_hdr->eh_max)
                        return 0;
                /* leaf is still full: go round again and split */
        }
}

/*
 * Search for the closest allocated block to the left of *@logical.
 * On success the block found is returned in *@logical and its physical
 * address in *@phys.  If the extent in @path starts after *@logical
 * (i.e. there is nothing to the left), *@phys is set to 0 and *@logical
 * is left untouched.
 * The return value is 0 (success) or a negative error code.
 */
static int ext4_ext_search_left(struct inode *inode,
                                struct ext4_ext_path *path,
                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
        struct ext4_extent_idx *idx, *first_idx;
        struct ext4_extent *ex;
        int leaf, level, len;

        if (unlikely(path == NULL)) {
                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                return -EFSCORRUPTED;
        }
        leaf = path->p_depth;
        *phys = 0;

        /* empty in-inode tree: nothing is allocated at all */
        if (leaf == 0 && path->p_ext == NULL)
                return 0;

        ex = path[leaf].p_ext;
        len = ext4_ext_get_actual_len(ex);

        if (*logical >= le32_to_cpu(ex->ee_block)) {
                /*
                 * The usual case: the extent in the path lies to the left
                 * of *logical.  It must not cover *logical itself.
                 */
                if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + len))) {
                        EXT4_ERROR_INODE(inode,
                                         "logical %d < ee_block %d + ee_len %d!",
                                         *logical, le32_to_cpu(ex->ee_block),
                                         len);
                        return -EFSCORRUPTED;
                }

                /* report the last block of that extent */
                *logical = le32_to_cpu(ex->ee_block) + len - 1;
                *phys = ext4_ext_pblock(ex) + len - 1;
                return 0;
        }

        /*
         * The extent starts after *logical, which is only valid when it is
         * the first extent of the file: first in its leaf, reached through
         * the first index entry on every level above.
         */
        if (unlikely(EXT_FIRST_EXTENT(path[leaf].p_hdr) != ex)) {
                EXT4_ERROR_INODE(inode,
                                 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
                                 *logical, le32_to_cpu(ex->ee_block));
                return -EFSCORRUPTED;
        }
        for (level = leaf - 1; level >= 0; level--) {
                idx = path[level].p_idx;
                first_idx = EXT_FIRST_INDEX(path[level].p_hdr);
                if (unlikely(idx != first_idx)) {
                        EXT4_ERROR_INODE(inode,
                          "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
                          idx != NULL ? le32_to_cpu(idx->ei_block) : 0,
                          first_idx != NULL ?
                                le32_to_cpu(first_idx->ei_block) : 0,
                          level);
                        return -EFSCORRUPTED;
                }
        }
        return 0;
}

/*
 * Search for the closest allocated block to the right of *@logical.
 * When one is found, it is returned in *@logical, its physical address in
 * *@phys, the extent containing it is copied to *@ret_ex (if @ret_ex is
 * not NULL), and the function returns 1.
 * If no such block exists, the function returns 0 and *@phys is set to 0.
 * A negative return value is an error code.
 *
 * @flags is passed to read_extent_tree_block() when tree blocks that are
 * not part of @path have to be read.
 */
static int ext4_ext_search_right(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
                                 struct ext4_extent *ret_ex, int flags)
{
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
        int depth;        /* Note, NOT eh_depth; depth from top of tree */
        int ee_len;

        if (unlikely(path == NULL)) {
                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                return -EFSCORRUPTED;
        }
        depth = path->p_depth;
        *phys = 0;

        /* empty in-inode tree: nothing is allocated at all */
        if (depth == 0 && path->p_ext == NULL)
                return 0;

        /* usually extent in the path covers blocks smaller
         * then *logical, but it can be that extent is the
         * first one in the file */

        ex = path[depth].p_ext;
        ee_len = ext4_ext_get_actual_len(ex);
        if (*logical < le32_to_cpu(ex->ee_block)) {
                /*
                 * The extent in the path already lies to the right.  It is
                 * expected to be the first extent of its leaf and to be
                 * reached through the first index entry on every level.
                 */
                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
                        EXT4_ERROR_INODE(inode,
                                         "first_extent(path[%d].p_hdr) != ex",
                                         depth);
                        return -EFSCORRUPTED;
                }
                while (--depth >= 0) {
                        ix = path[depth].p_idx;
                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
                                EXT4_ERROR_INODE(inode,
                                                 "ix != EXT_FIRST_INDEX *logical %d!",
                                                 *logical);
                                return -EFSCORRUPTED;
                        }
                }
                goto found_extent;
        }

        /* *logical must not fall inside the extent found by the lookup */
        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
                EXT4_ERROR_INODE(inode,
                                 "logical %d < ee_block %d + ee_len %d!",
                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
                return -EFSCORRUPTED;
        }

        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
                /* next allocated block in this leaf */
                ex++;
                goto found_extent;
        }

        /* go up and search for index to the right */
        while (--depth >= 0) {
                ix = path[depth].p_idx;
                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
                        goto got_index;
        }

        /* we've gone up to the root and found no index to the right */
        return 0;

got_index:
        /* we've found index to the right, let's
         * follow it and find the closest allocated
         * block to the right */
        ix++;
        /* descend along the first index entry of each block down to the leaf */
        while (++depth < path->p_depth) {
                /* subtract from p_depth to get proper eh_depth */
                bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
                                            flags);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                eh = ext_block_hdr(bh);
                ix = EXT_FIRST_INDEX(eh);
                put_bh(bh);
        }

        /* read the leaf block; its first extent is the one we want */
        bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        eh = ext_block_hdr(bh);
        ex = EXT_FIRST_EXTENT(eh);
found_extent:
        *logical = le32_to_cpu(ex->ee_block);
        *phys = ext4_ext_pblock(ex);
        if (ret_ex)
                *ret_ex = *ex;
        /* bh is only set when the leaf was read above, outside of @path */
        if (bh)
                put_bh(bh);
        return 1;
}

/*
 * ext4_ext_next_allocated_block:
 * returns the logical start of the entry that follows the position
 * described by @path, or EXT_MAX_BLOCKS if there is none.  The leaf is
 * examined first, then each index level from the bottom up.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
        struct ext4_ext_path *node;
        int leaf, level;

        BUG_ON(path == NULL);
        leaf = path->p_depth;

        /* empty in-inode tree */
        if (leaf == 0 && path->p_ext == NULL)
                return EXT_MAX_BLOCKS;

        /* leaf: is there another extent behind the current one? */
        node = path + leaf;
        if (node->p_ext && node->p_ext != EXT_LAST_EXTENT(node->p_hdr))
                return le32_to_cpu(node->p_ext[1].ee_block);

        /* index levels: first one with an entry to the right wins */
        for (level = leaf - 1; level >= 0; level--) {
                node = path + level;
                if (node->p_idx != EXT_LAST_INDEX(node->p_hdr))
                        return le32_to_cpu(node->p_idx[1].ei_block);
        }

        return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
        int leaf, level;

        BUG_ON(path == NULL);
        leaf = path->p_depth;

        /*
         * Scan the index levels from the one right above the leaf up to
         * the root.  A zero-depth tree has no leaf blocks at all, and for
         * it the loop body is never entered.
         */
        for (level = leaf - 1; level >= 0; level--) {
                struct ext4_ext_path *node = &path[level];

                if (node->p_idx != EXT_LAST_INDEX(node->p_hdr))
                        return (ext4_lblk_t)
                                le32_to_cpu(node->p_idx[1].ei_block);
        }

        return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * The index entry right above the leaf is always updated; higher levels
 * are updated only while the entry below is the first one of its block.
 * Returns 0 on success or a negative error code.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
{
        int leaf = ext_depth(inode);
        struct ext4_extent_header *leaf_hdr = path[leaf].p_hdr;
        struct ext4_extent *leaf_ex = path[leaf].p_ext;
        __le32 border;
        int level, ret;

        if (unlikely(leaf_ex == NULL || leaf_hdr == NULL)) {
                EXT4_ERROR_INODE(inode,
                                 "ex %p == NULL or eh %p == NULL",
                                 leaf_ex, leaf_hdr);
                return -EFSCORRUPTED;
        }

        /*
         * Nothing to do when there is no tree at all, or when the extent
         * is not the first one in its leaf.
         */
        if (leaf == 0 || leaf_ex != EXT_FIRST_EXTENT(leaf_hdr))
                return 0;

        /*
         * TODO: we need correction if border is smaller than current one
         */
        border = leaf_ex->ee_block;

        for (level = leaf - 1; level >= 0; level--) {
                /* above the first level, change left-side indexes only */
                if (level != leaf - 1 &&
                    path[level + 1].p_idx !=
                                EXT_FIRST_INDEX(path[level + 1].p_hdr))
                        break;

                ret = ext4_ext_get_access(handle, inode, path + level);
                if (ret)
                        return ret;
                path[level].p_idx->ei_block = border;
                ret = ext4_ext_dirty(handle, inode, path + level);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Decide whether @ex2 can be appended to @ex1 to form a single extent.
 * Both must have the same unwritten state, be adjacent logically and
 * physically, and their combined length must stay within the limit for
 * that kind of extent.  Returns 1 if they can be merged, 0 otherwise.
 */
static int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2)
{
        unsigned short len1, len2;
        int unwritten = ext4_ext_is_unwritten(ex1);

        if (unwritten != ext4_ext_is_unwritten(ex2))
                return 0;

        len1 = ext4_ext_get_actual_len(ex1);
        len2 = ext4_ext_get_actual_len(ex2);

        /* ex2 has to start at the logical block right behind ex1 */
        if (le32_to_cpu(ex1->ee_block) + len1 != le32_to_cpu(ex2->ee_block))
                return 0;

        /* the merged extent must not exceed the length limits */
        if (len1 + len2 > EXT_INIT_MAX_LEN)
                return 0;
        if (unwritten && len1 + len2 > EXT_UNWRITTEN_MAX_LEN)
                return 0;
#ifdef AGGRESSIVE_TEST
        if (len1 >= 4)
                return 0;
#endif

        /* finally, the physical blocks have to be contiguous as well */
        return ext4_ext_pblock(ex1) + len1 == ext4_ext_pblock(ex2);
}

/*
 * Try to merge the extent @ex with the extent that follows it in the
 * leaf, repeating for as long as the next extent can be absorbed.
 * Merging always goes towards the right; to merge towards the left,
 * pass "ex - 1" instead of "ex".
 * Returns 1 if at least one merge took place and 0 if @ex and "ex + 1"
 * were not merged.
 */
static int ext4_ext_try_to_merge_right(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 struct ext4_extent *ex)
{
        struct ext4_extent_header *eh;
        struct ext4_extent *next;
        unsigned int depth, tail_bytes;
        int merged = 0, was_unwritten;

        depth = ext_depth(inode);
        BUG_ON(path[depth].p_hdr == NULL);
        eh = path[depth].p_hdr;

        while (ex < EXT_LAST_EXTENT(eh) &&
               ext4_can_extents_be_merged(inode, ex, ex + 1)) {
                next = ex + 1;

                /*
                 * merge with next extent!  Storing the summed length
                 * drops the unwritten marking, so put it back afterwards.
                 */
                was_unwritten = ext4_ext_is_unwritten(ex);
                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                + ext4_ext_get_actual_len(next));
                if (was_unwritten)
                        ext4_ext_mark_unwritten(ex);

                /* close the gap left behind by the absorbed extent */
                if (next < EXT_LAST_EXTENT(eh)) {
                        tail_bytes = (EXT_LAST_EXTENT(eh) - next)
                                * sizeof(struct ext4_extent);
                        memmove(next, next + 1, tail_bytes);
                }
                le16_add_cpu(&eh->eh_entries, -1);
                merged = 1;

                WARN_ON(eh->eh_entries == 0);
                if (!eh->eh_entries)
                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
        }

        return merged;
}

/*
 * Very simple check whether an extent tree that consists of the root and
 * a single leaf block can be collapsed into the inode; if so, the leaf's
 * contents are copied into the root and the leaf block is freed.
 */
static void ext4_ext_try_to_merge_up(handle_t *handle,
                                     struct inode *inode,
                                     struct ext4_ext_path *path)
{
        struct ext4_ext_path *root = &path[0];
        struct ext4_ext_path *leaf = &path[1];
        unsigned max_root = ext4_ext_space_root(inode, 0);
        ext4_fsblk_t leaf_blk;
        size_t nbytes;

        /* only a depth-1 tree whose root has a single entry qualifies ... */
        if (root->p_depth != 1)
                return;
        if (le16_to_cpu(root->p_hdr->eh_entries) != 1)
                return;
        /* ... and the leaf's entries have to fit into the root */
        if (le16_to_cpu(leaf->p_hdr->eh_entries) > max_root)
                return;

        /*
         * We need to modify the block allocation bitmap and the block
         * group descriptor to release the extent tree block.  If we
         * can't get the journal credits, give up.
         */
        if (ext4_journal_extend(handle, 2,
                        ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
                return;

        /*
         * Copy the extent data up to the inode.  The leaf's block number
         * has to be fetched first, because the copy overwrites the root's
         * index entry.
         */
        leaf_blk = ext4_idx_pblock(root->p_idx);
        nbytes = sizeof(struct ext4_extent_header) +
                 le16_to_cpu(leaf->p_hdr->eh_entries) *
                        sizeof(struct ext4_extent_idx);

        leaf->p_maxdepth = root->p_maxdepth;
        memcpy(root->p_hdr, leaf->p_hdr, nbytes);
        root->p_depth = 0;
        /* keep pointing at the same extent, now inside the root */
        root->p_ext = EXT_FIRST_EXTENT(root->p_hdr) +
                (leaf->p_ext - EXT_FIRST_EXTENT(leaf->p_hdr));
        root->p_hdr->eh_max = cpu_to_le16(max_root);

        /* drop our reference to the old leaf buffer and free its block */
        brelse(leaf->p_bh);
        leaf->p_bh = NULL;
        ext4_free_blocks(handle, inode, NULL, leaf_blk, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}

/*
 * Merge the @ex extent with its neighbours in the leaf where possible,
 * then try to collapse the extent tree into the inode.
 */
static void ext4_ext_try_to_merge(handle_t *handle,
                                  struct inode *inode,
                                  struct ext4_ext_path *path,
                                  struct ext4_extent *ex)
{
        unsigned int depth = ext_depth(inode);
        struct ext4_extent_header *eh;
        int merged_left = 0;

        BUG_ON(path[depth].p_hdr == NULL);
        eh = path[depth].p_hdr;

        /* a left neighbour exists only if @ex is not the first extent */
        if (ex > EXT_FIRST_EXTENT(eh))
                merged_left = ext4_ext_try_to_merge_right(inode, path, ex - 1);

        /*
         * Merging from "ex - 1" already keeps going to the right, so only
         * start from @ex itself when the left merge did nothing.
         */
        if (merged_left == 0)
                (void) ext4_ext_try_to_merge_right(inode, path, ex);

        ext4_ext_try_to_merge_up(handle, inode, path);
}

/*
 * Check whether part of the "newext" extent overlaps an existing extent.
 *
 * The logical range of "newext" is compared against the (cluster-aligned)
 * start of the extent recorded in the path or, when that extent starts
 * before "newext", against the next allocated block.
 *
 * If the length of "newext" had to be reduced - either because the range
 * wrapped past the end of the logical block space or because it overlapped
 * - ee_len is updated in place and 1 is returned.  Otherwise 0 is returned.
 */
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
                                           struct inode *inode,
                                           struct ext4_extent *newext,
                                           struct ext4_ext_path *path)
{
        ext4_lblk_t new_start = le32_to_cpu(newext->ee_block);
        unsigned int new_len = ext4_ext_get_actual_len(newext);
        unsigned int depth = ext_depth(inode);
        unsigned int trimmed = 0;
        ext4_lblk_t limit;

        /* nothing in the leaf to overlap with */
        if (!path[depth].p_ext)
                return 0;

        limit = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));

        /*
         * get the next allocated block if the extent in the path
         * is before the requested block(s)
         */
        if (limit < new_start) {
                limit = ext4_ext_next_allocated_block(path);
                if (limit == EXT_MAX_BLOCKS)
                        return 0;
                limit = EXT4_LBLK_CMASK(sbi, limit);
        }

        /* check for wrap through zero on extent logical start block*/
        if (new_start + new_len < new_start) {
                new_len = EXT_MAX_BLOCKS - new_start;
                newext->ee_len = cpu_to_le16(new_len);
                trimmed = 1;
        }

        /* check for overlap */
        if (new_start + new_len > limit) {
                newext->ee_len = cpu_to_le16(limit - new_start);
                trimmed = 1;
        }

        return trimmed;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 *
 * @handle:   journal handle used for all tree modifications
 * @inode:    inode owning the extent tree
 * @ppath:    pointer to the path for @newext; it is passed on to
 *            ext4_ext_create_new_leaf() and re-read afterwards, so the
 *            caller must use *@ppath after this function returns
 * @newext:   extent to insert; its actual length must not be zero
 * @gb_flags: EXT4_GET_BLOCKS_* flags.  With EXT4_GET_BLOCKS_PRE_IO set no
 *            merging with neighbouring extents is attempted.
 *
 * Returns 0 on success, -EFSCORRUPTED if @newext has zero length or the
 * leaf header is missing from the path, or the error returned by one of
 * the helpers called here.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path **ppath,
                                struct ext4_extent *newext, int gb_flags)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent_header *eh;
        struct ext4_extent *ex, *fex;
        struct ext4_extent *nearex; /* nearest extent */
        struct ext4_ext_path *npath = NULL;
        int depth, len, err;
        ext4_lblk_t next;
        int mb_flags = 0, unwritten;

        if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                mb_flags |= EXT4_MB_DELALLOC_RESERVED;
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
                return -EFSCORRUPTED;
        }
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        eh = path[depth].p_hdr;
        if (unlikely(path[depth].p_hdr == NULL)) {
                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                return -EFSCORRUPTED;
        }

        /* try to insert block into found extent and return */
        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {

                /*
                 * Try to see whether we should rather test the extent on
                 * right from ex, or from the left of ex. This is because
                 * ext4_find_extent() can return either extent on the
                 * left, or on the right from the searched position. This
                 * will make merging more effective.
                 */
                if (ex < EXT_LAST_EXTENT(eh) &&
                    (le32_to_cpu(ex->ee_block) +
                    ext4_ext_get_actual_len(ex) <
                    le32_to_cpu(newext->ee_block))) {
                        /* ex ends before newext: only prepending to ex + 1 can work */
                        ex += 1;
                        goto prepend;
                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
                           (le32_to_cpu(newext->ee_block) +
                           ext4_ext_get_actual_len(newext) <
                           le32_to_cpu(ex->ee_block)))
                        ex -= 1;

                /* Try to append newex to the ex */
                if (ext4_can_extents_be_merged(inode, ex, newext)) {
                        ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
                                  ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
                                  ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
                                                  path + depth);
                        if (err)
                                return err;
                        /* rewriting ee_len below: save and restore the unwritten state */
                        unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
                        if (unwritten)
                                ext4_ext_mark_unwritten(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
                }

prepend:
                /* Try to prepend newex to the ex */
                if (ext4_can_extents_be_merged(inode, newext, ex)) {
                        ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
                                  le32_to_cpu(newext->ee_block),
                                  ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
                                  ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
                                                  path + depth);
                        if (err)
                                return err;

                        /* ex now starts where newext starts, logically and physically */
                        unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_block = newext->ee_block;
                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
                        if (unwritten)
                                ext4_ext_mark_unwritten(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
                }
        }

        /* no merge was possible: newext needs a slot of its own */
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
                goto has_space;

        /* probably next leaf has space for us? */
        fex = EXT_LAST_EXTENT(eh);
        next = EXT_MAX_BLOCKS;
        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
                next = ext4_ext_next_leaf_block(path);
        if (next != EXT_MAX_BLOCKS) {
                ext_debug(inode, "next leaf block - %u\n", next);
                BUG_ON(npath != NULL);
                npath = ext4_find_extent(inode, next, NULL, gb_flags);
                if (IS_ERR(npath))
                        return PTR_ERR(npath);
                BUG_ON(npath->p_depth != path->p_depth);
                eh = npath[depth].p_hdr;
                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
                        ext_debug(inode, "next leaf isn't full(%d)\n",
                                  le16_to_cpu(eh->eh_entries));
                        /* insert into the next leaf; npath is released at cleanup */
                        path = npath;
                        goto has_space;
                }
                ext_debug(inode, "next leaf has no free space(%d,%d)\n",
                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
        }

        /*
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                mb_flags |= EXT4_MB_USE_RESERVED;
        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
                                       ppath, newext);
        if (err)
                goto cleanup;
        /* reload the path and leaf header through *ppath after the tree change */
        path = *ppath;
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;

has_space:
        nearex = path[depth].p_ext;

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto cleanup;

        if (!nearex) {
                /* there is no extent in this leaf, create first one */
                ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
                                ext4_ext_pblock(newext),
                                ext4_ext_is_unwritten(newext),
                                ext4_ext_get_actual_len(newext));
                nearex = EXT_FIRST_EXTENT(eh);
        } else {
                if (le32_to_cpu(newext->ee_block)
                           > le32_to_cpu(nearex->ee_block)) {
                        /* Insert after */
                        ext_debug(inode, "insert %u:%llu:[%d]%d after: "
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                        nearex++;
                } else {
                        /* Insert before */
                        BUG_ON(newext->ee_block == nearex->ee_block);
                        ext_debug(inode, "insert %u:%llu:[%d]%d before: "
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                }
                /* shift everything from the insertion slot onwards one slot right */
                len = EXT_LAST_EXTENT(eh) - nearex + 1;
                if (len > 0) {
                        ext_debug(inode, "insert %u:%llu:[%d]%d: "
                                        "move %d extents from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        len, nearex, nearex + 1);
                        memmove(nearex + 1, nearex,
                                len * sizeof(struct ext4_extent));
                }
        }

        /* fill the freed slot with a copy of newext */
        le16_add_cpu(&eh->eh_entries, 1);
        path[depth].p_ext = nearex;
        nearex->ee_block = newext->ee_block;
        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;

merge:
        /* try to merge extents */
        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
                ext4_ext_try_to_merge(handle, inode, path, nearex);


        /* time to correct all indexes above */
        err = ext4_ext_correct_indexes(handle, inode, path);
        if (err)
                goto cleanup;

        err = ext4_ext_dirty(handle, inode, path + path->p_depth);

cleanup:
        /* npath is NULL unless the next leaf was looked up above */
        ext4_ext_drop_refs(npath);
        kfree(npath);
        return err;
}

/*
 * Report the extents found in the extent status cache for the logical
 * block range [@block, @block + @num - 1] through @fieinfo.
 *
 * The walk stops at the first block that has no entry in the cache, when
 * ext4_es_lookup_extent() reports no following block (the entry is then
 * flagged FIEMAP_EXTENT_LAST), or when fiemap_fill_next_extent() returns 1.
 *
 * Returns 0 on success or a negative error from fiemap_fill_next_extent().
 */
static int ext4_fill_es_cache_info(struct inode *inode,
                                   ext4_lblk_t block, ext4_lblk_t num,
                                   struct fiemap_extent_info *fieinfo)
{
        ext4_lblk_t next, end = block + num - 1;
        struct extent_status es;
        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned int flags;
        int err;

        while (block <= end) {
                next = 0;
                flags = 0;
                if (!ext4_es_lookup_extent(inode, block, &next, &es))
                        break;
                if (ext4_es_is_unwritten(&es))
                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                if (ext4_es_is_delayed(&es))
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                if (ext4_es_is_hole(&es))
                        flags |= EXT4_FIEMAP_EXTENT_HOLE;
                if (next == 0)
                        flags |= FIEMAP_EXTENT_LAST;
                /* delayed and hole entries are reported with physical address 0 */
                if (flags & (FIEMAP_EXTENT_DELALLOC|
                             EXT4_FIEMAP_EXTENT_HOLE))
                        es.es_pblk = 0;
                else
                        es.es_pblk = ext4_es_pblock(&es);
                err = fiemap_fill_next_extent(fieinfo,
                                (__u64)es.es_lblk << blksize_bits,
                                (__u64)es.es_pblk << blksize_bits,
                                (__u64)es.es_len << blksize_bits,
                                flags);
                /*
                 * Look at the result before deciding whether this was the
                 * last extent; otherwise a failure while reporting the
                 * final extent would be lost and 0 returned instead.
                 */
                if (err < 0)
                        return err;
                if (err == 1)
                        return 0;
                if (next == 0)
                        break;
                block = next;
        }
        return 0;
}


/*
 * ext4_ext_find_hole - find hole around given block according to the given path
 * @inode:        inode we lookup in
 * @path:        path in extent tree to @lblk
 * @lblk:        pointer to logical block around which we want to determine hole
 *
 * Determine the length (and, where it is cheap to do so, the start) of the
 * hole around the given logical block.  The start of the hole is only
 * worked out when the tree is empty or when @path points to the extent
 * just before @lblk; in those cases @lblk is updated to the beginning of
 * the hole.
 *
 * The function returns the length of a hole starting at @lblk.  It is a
 * bug to call it with @lblk lying inside the extent @path points to.
 */
static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
                                      struct ext4_ext_path *path,
                                      ext4_lblk_t *lblk)
{
        int depth = ext_depth(inode);
        struct ext4_extent *ex = path[depth].p_ext;
        ext4_lblk_t ex_start, ex_end, next;

        if (ex == NULL) {
                /* there is no extent yet, so gap is [0;-] */
                *lblk = 0;
                return EXT_MAX_BLOCKS;
        }

        /* hole ends where the extent in the path begins */
        ex_start = le32_to_cpu(ex->ee_block);
        if (*lblk < ex_start)
                return ex_start - *lblk;

        /* @lblk must not be covered by the extent itself */
        ex_end = ex_start + ext4_ext_get_actual_len(ex);
        BUG_ON(*lblk < ex_end);

        /* hole runs from the end of this extent to the next allocated block */
        *lblk = ex_end;
        next = ext4_ext_next_allocated_block(path);
        BUG_ON(next == *lblk);
        return next - *lblk;
}

/*
 * ext4_ext_rm_idx:
 * removes the index entry at level @depth - 1 of @path from its index
 * block and frees the tree block that entry pointed to.  While the entry
 * touched at one level is the first index of its block, the starting
 * block (ei_block) of the parent's index entry is updated to match.
 *
 * Returns 0 on success, -EFSCORRUPTED if the index block is already empty,
 * or the error returned by ext4_ext_get_access() / ext4_ext_dirty().
 */
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path, int depth)
{
        ext4_fsblk_t leaf;
        int level;
        int err;

        /* free index block */
        level = depth - 1;
        path += level;
        leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EFSCORRUPTED;
        }

        err = ext4_ext_get_access(handle, inode, path);
        if (err)
                return err;

        /* close the gap unless the removed entry is the last one */
        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
                int tail = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;

                tail *= sizeof(struct ext4_extent_idx);
                memmove(path->p_idx, path->p_idx + 1, tail);
        }

        le16_add_cpu(&path->p_hdr->eh_entries, -1);
        err = ext4_ext_dirty(handle, inode, path);
        if (err)
                return err;

        ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
        trace_ext4_ext_rm_idx(inode, leaf);

        ext4_free_blocks(handle, inode, NULL, leaf, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);

        /* walk towards the root, one level per iteration */
        for (level--; level >= 0; level--) {
                /* a parent only needs fixing if its child's first index changed */
                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
                        break;

                path--;
                err = ext4_ext_get_access(handle, inode, path);
                if (err)
                        break;

                path->p_idx->ei_block = (path + 1)->p_idx->ei_block;
                err = ext4_ext_dirty(handle, inode, path);
                if (err)
                        break;
        }

        return err;
}

/*
 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns max. credits that needed to insert an extent
 * to the extent tree.
 * When pass the actual path, the caller should calculate credits
 * under i_data_sem.
 */
int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                                                struct ext4_ext_path *path)
{
        if (path) {
                int depth = ext_depth(inode);
                int ret = 0;

                /* probably there is space in leaf? */
                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {

                        /*
                         *  There are some space in the leaf tree, no
                         *  need to account for leaf block credit
                         *
                         *  bitmaps and block group descriptor blocks
                         *  and other metadata blocks still need to be
                         *  accounted.
                         */
                        /* 1 bitmap, 1 block group descriptor */
                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
                        return ret;
                }
        }

        return ext4_chunk_trans_blocks(inode, nrblocks);
}

/*
 * How many index/leaf blocks need to change/allocate to add @extents extents?
 *
 * If we add a single extent, then in the worst case each tree level's
 * index/leaf needs to be changed because of a tree split.
 *
 * If more extents are inserted, they could cause the whole tree to split
 * more than once, but this is really rare.
 */
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
        /* If we are converting the inline data, only one is needed here. */
        if (ext4_has_inline_data(inode))
                return 1;

        /*
         * Extent tree can change between the time we estimate credits and
         * the time we actually modify the tree. Assume the worst case.
         */
        return EXT4_MAX_EXTENT_DEPTH * (extents <= 1 ? 2 : 3);
}

/*
 * Pick the default EXT4_FREE_BLOCKS_* flags for freeing blocks of @inode:
 * METADATA | FORGET for directories, symlinks and inodes with the
 * EXT4_INODE_EA_INODE flag, FORGET alone when ext4_should_journal_data()
 * is true for the inode, and no flags otherwise.
 */
static inline int get_default_free_blocks_flags(struct inode *inode)
{
        int metadata_like = S_ISDIR(inode->i_mode) ||
                            S_ISLNK(inode->i_mode) ||
                            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE);

        if (metadata_like)
                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;

        if (ext4_should_journal_data(inode))
                return EXT4_FREE_BLOCKS_FORGET;

        return 0;
}

/*
 * ext4_rereserve_cluster - increment the reserved cluster count when
 *                          freeing a cluster with a pending reservation
 *
 * @inode - file containing the cluster
 * @lblk - logical block in cluster to be reserved
 *
 * Used on bigalloc file systems when a partial cluster that contains at
 * least one delayed and unwritten block is freed.  Such a cluster has a
 * pending reservation, and ext4_free_blocks() is then called with the
 * RERESERVE_CLUSTER flag so that the reserved and allocated space
 * accounting is deferred to this function.
 *
 * Here the quota is adjusted by one cluster's worth of blocks, the inode's
 * reserved block count and the dirty and free cluster counters are each
 * increased by one, and the pending reservation for @lblk is removed.
 */
static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_inode_info *info = EXT4_I(inode);
        struct ext4_sb_info *sb_info = EXT4_SB(inode->i_sb);

        dquot_reclaim_block(inode, EXT4_C2B(sb_info, 1));

        /* the reservation count is updated under the inode's reservation lock */
        spin_lock(&info->i_block_reservation_lock);
        info->i_reserved_data_blocks += 1;
        percpu_counter_add(&sb_info->s_dirtyclusters_counter, 1);
        spin_unlock(&info->i_block_reservation_lock);

        percpu_counter_add(&sb_info->s_freeclusters_counter, 1);
        ext4_remove_pending(inode, lblk);
}

/*
 * ext4_remove_blocks - free the blocks backing the tail of an extent
 *
 * @handle:  journal handle passed through to ext4_free_blocks()
 * @inode:   inode owning the extent
 * @ex:      extent whose tail is being removed
 * @partial: partial cluster tracking state; read and updated here
 * @from:    first logical block to remove
 * @to:      last logical block to remove; must be the last block of @ex
 *
 * Only removal of an extent's tail is supported: if @from lies before the
 * start of @ex, or @to is not the last block of @ex, the request is
 * reported through ext4_error() and nothing is freed.
 *
 * Always returns 0, including for a rejected request.
 */
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                              struct ext4_extent *ex,
                              struct partial_cluster *partial,
                              ext4_lblk_t from, ext4_lblk_t to)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned short ee_len = ext4_ext_get_actual_len(ex);
        ext4_fsblk_t last_pblk, pblk;
        ext4_lblk_t num;
        int flags;

        /* only extent tail removal is allowed */
        if (from < le32_to_cpu(ex->ee_block) ||
            to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
                ext4_error(sbi->s_sb,
                           "strange request: removal(2) %u-%u from %u:%u",
                           from, to, le32_to_cpu(ex->ee_block), ee_len);
                return 0;
        }

#ifdef EXTENTS_STATS
        /* extent statistics, only maintained when EXTENTS_STATS is defined */
        spin_lock(&sbi->s_ext_stats_lock);
        sbi->s_ext_blocks += ee_len;
        sbi->s_ext_extents++;
        if (ee_len < sbi->s_ext_min)
                sbi->s_ext_min = ee_len;
        if (ee_len > sbi->s_ext_max)
                sbi->s_ext_max = ee_len;
        if (ext_depth(inode) > sbi->s_depth_max)
                sbi->s_depth_max = ext_depth(inode);
        spin_unlock(&sbi->s_ext_stats_lock);
#endif

        trace_ext4_remove_blocks(inode, ex, from, to, partial);

        /*
         * if we have a partial cluster, and it's different from the
         * cluster of the last block in the extent, we free it
         */
        last_pblk = ext4_ext_pblock(ex) + ee_len - 1;

        if (partial->state != initial &&
            partial->pclu != EXT4_B2C(sbi, last_pblk)) {
                if (partial->state == tofree) {
                        flags = get_default_free_blocks_flags(inode);
                        if (ext4_is_pending(inode, partial->lblk))
                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                        ext4_free_blocks(handle, inode, NULL,
                                         EXT4_C2B(sbi, partial->pclu),
                                         sbi->s_cluster_ratio, flags);
                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                                ext4_rereserve_cluster(inode, partial->lblk);
                }
                partial->state = initial;
        }

        /*
         * num:  number of blocks from @from to the end of the extent
         * pblk: physical block that corresponds to @from
         */
        num = le32_to_cpu(ex->ee_block) + ee_len - from;
        pblk = ext4_ext_pblock(ex) + ee_len - num;

        /*
         * We free the partial cluster at the end of the extent (if any),
         * unless the cluster is used by another extent (partial_cluster
         * state is nofree).  If a partial cluster exists here, it must be
         * shared with the last block in the extent.
         */
        flags = get_default_free_blocks_flags(inode);

        /* partial, left end cluster aligned, right end unaligned */
        if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
            (EXT4_LBLK_CMASK(sbi, to) >= from) &&
            (partial->state != nofree)) {
                if (ext4_is_pending(inode, to))
                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_PBLK_CMASK(sbi, last_pblk),
                                 sbi->s_cluster_ratio, flags);
                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                        ext4_rereserve_cluster(inode, to);
                partial->state = initial;
                /* start again from the defaults, without RERESERVE_CLUSTER */
                flags = get_default_free_blocks_flags(inode);
        }

        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;

        /*
         * For bigalloc file systems, we never free a partial cluster
         * at the beginning of the extent.  Instead, we check to see if we
         * need to free it on a subsequent call to ext4_remove_blocks,
         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
         */
        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
        ext4_free_blocks(handle, inode, NULL, pblk, num, flags);

        /* reset the partial cluster if we've freed past it */
        if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
                partial->state = initial;

        /*
         * If we've freed the entire extent but the beginning is not left
         * cluster aligned and is not marked as ineligible for freeing we
         * record the partial cluster at the beginning of the extent.  It
         * wasn't freed by the preceding ext4_free_blocks() call, and we
         * need to look farther to the left to determine if it's to be freed
         * (not shared with another extent). Else, reset the partial
         * cluster - we're either done freeing or the beginning of the
         * extent is left cluster aligned.
         */
        if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
                if (partial->state == initial) {
                        partial->pclu = EXT4_B2C(sbi, pblk);
                        partial->lblk = from;
                        partial->state = tofree;
                }
        } else {
                partial->state = initial;
        }

        return 0;
}

/*
 * ext4_ext_rm_leaf() Removes the extents associated with the
 * blocks appearing between "start" and "end".  Both "start"
 * and "end" must appear in the same extent or EIO is returned.
 *
 * @handle: The journal handle
 * @inode:  The file's inode
 * @path:   The path to the leaf
 * @partial: The partial cluster state.  It describes the cluster which
 *           we'll have to free if all extents have been released from
 *           it.  However, if its state is nofree, it's a cluster just
 *           to the right of the punched region and it must not be freed.
 * @start:  The first block to remove
 * @end:   The last block to remove
 */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                 struct ext4_ext_path *path,
                 struct partial_cluster *partial,
                 ext4_lblk_t start, ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits, revoke_credits;
        struct ext4_extent_header *eh;
        ext4_lblk_t a, b;
        unsigned num;
        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
        unsigned unwritten = 0;
        struct ext4_extent *ex;
        ext4_fsblk_t pblk;

        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
        if (!path[depth].p_hdr)
                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
        eh = path[depth].p_hdr;
        if (unlikely(path[depth].p_hdr == NULL)) {
                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                return -EFSCORRUPTED;
        }
        /* find where to start removing */
        ex = path[depth].p_ext;
        if (!ex)
                ex = EXT_LAST_EXTENT(eh);

        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);

        trace_ext4_ext_rm_leaf(inode, start, ex, partial);

        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {

                if (ext4_ext_is_unwritten(ex))
                        unwritten = 1;
                else
                        unwritten = 0;

                ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
                          unwritten, ex_ee_len);
                path[depth].p_ext = ex;

                a = ex_ee_block > start ? ex_ee_block : start;
                b = ex_ee_block+ex_ee_len - 1 < end ?
                        ex_ee_block+ex_ee_len - 1 : end;

                ext_debug(inode, "  border %u:%u\n", a, b);

                /* If this extent is beyond the end of the hole, skip it */
                if (end < ex_ee_block) {
                        /*
                         * We're going to skip this extent and move to another,
                         * so note that its first cluster is in use to avoid
                         * freeing it when removing blocks.  Eventually, the
                         * right edge of the truncated/punched region will
                         * be just to the left.
                         */
                        if (sbi->s_cluster_ratio > 1) {
                                pblk = ext4_ext_pblock(ex);
                                partial->pclu = EXT4_B2C(sbi, pblk);
                                partial->state = nofree;
                        }
                        ex--;
                        ex_ee_block = le32_to_cpu(ex->ee_block);
                        ex_ee_len = ext4_ext_get_actual_len(ex);
                        continue;
                } else if (b != ex_ee_block + ex_ee_len - 1) {
                        EXT4_ERROR_INODE(inode,
                                         "can not handle truncate %u:%u "
                                         "on extent %u:%u",
                                         start, end, ex_ee_block,
                                         ex_ee_block + ex_ee_len - 1);
                        err = -EFSCORRUPTED;
                        goto out;
                } else if (a != ex_ee_block) {
                        /* remove tail of the extent */
                        num = a - ex_ee_block;
                } else {
                        /* remove whole extent: excellent! */
                        num = 0;
                }
                /*
                 * 3 for leaf, sb, and inode plus 2 (bmap and group
                 * descriptor) for each block group; assume two block
                 * groups plus ex_ee_len/blocks_per_block_group for
                 * the worst case
                 */
                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
                if (ex == EXT_FIRST_EXTENT(eh)) {
                        correct_index = 1;
                        credits += (ext_depth(inode)) + 1;
                }
                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
                /*
                 * We may end up freeing some index blocks and data from the
                 * punched range. Note that partial clusters are accounted for
                 * by ext4_free_data_revoke_credits().
                 */
                revoke_credits =
                        ext4_free_metadata_revoke_credits(inode->i_sb,
                                                          ext_depth(inode)) +
                        ext4_free_data_revoke_credits(inode, b - a + 1);

                err = ext4_datasem_ensure_credits(handle, inode, credits,
                                                  credits, revoke_credits);
                if (err) {
                        if (err > 0)
                                err = -EAGAIN;
                        goto out;
                }

                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto out;

                err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
                if (err)
                        goto out;

                if (num == 0)
                        /* this extent is removed; mark slot entirely unused */
                        ext4_ext_store_pblock(ex, 0);

                ex->ee_len = cpu_to_le16(num);
                /*
                 * Do not mark unwritten if all the blocks in the
                 * extent have been removed.
                 */
                if (unwritten && num)
                        ext4_ext_mark_unwritten(ex);
                /*
                 * If the extent was completely released,
                 * we need to remove it from the leaf
                 */
                if (num == 0) {
                        if (end != EXT_MAX_BLOCKS - 1) {
                                /*
                                 * For hole punching, we need to scoot all the
                                 * extents up when an extent is removed so that
                                 * we dont have blank extents in the middle
                                 */
                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
                                        sizeof(struct ext4_extent));

                                /* Now get rid of the one at the end */
                                memset(EXT_LAST_EXTENT(eh), 0,
                                        sizeof(struct ext4_extent));
                        }
                        le16_add_cpu(&eh->eh_entries, -1);
                }

                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;

                ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
        }

        if (correct_index && eh->eh_entries)
                err = ext4_ext_correct_indexes(handle, inode, path);

        /*
         * If there's a partial cluster and at least one extent remains in
         * the leaf, free the partial cluster if it isn't shared with the
         * current extent.  If it is shared with the current extent
         * we reset the partial cluster because we've reached the start of the
         * truncated/punched region and we're done removing blocks.
         */
        if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
                if (partial->pclu != EXT4_B2C(sbi, pblk)) {
                        int flags = get_default_free_blocks_flags(inode);

                        if (ext4_is_pending(inode, partial->lblk))
                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                        ext4_free_blocks(handle, inode, NULL,
                                         EXT4_C2B(sbi, partial->pclu),
                                         sbi->s_cluster_ratio, flags);
                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                                ext4_rereserve_cluster(inode, partial->lblk);
                }
                partial->state = initial;
        }

        /* if this leaf is free, then we should
         * remove it from index block above */
        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
                err = ext4_ext_rm_idx(handle, inode, path, depth);

out:
        return err;
}

/*
 * ext4_ext_more_to_rm:
 * tells the tree walker whether the index entry currently pointed to by
 * path->p_idx still has to be descended into (returns 1) or whether this
 * index node is finished (returns 0).
 */
static int
ext4_ext_more_to_rm(struct ext4_ext_path *path)
{
        BUG_ON(path->p_idx == NULL);

        /* the cursor has moved left of the first index entry */
        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
                return 0;

        /*
         * p_block holds the entry count that was saved before descending.
         * An unchanged count means the truncate on the deeper level was
         * only partial, so there is nothing more to remove here; a changed
         * count means the current index has to be considered as well.
         */
        return le16_to_cpu(path->p_hdr->eh_entries) != path->p_block;
}

/*
 * ext4_ext_remove_space() removes the logical blocks from "start" to "end"
 * from the extent tree of @inode.
 *
 * A journal handle is started here and stopped before returning.  When
 * "end" is below EXT_MAX_BLOCKS - 1 (hole punching) and lies inside an
 * extent, that extent is first split at end + 1 so that ext4_ext_rm_leaf()
 * only has to remove whole extents or extent tails.  The tree is then
 * walked from the rightmost entries towards the left: leaves are handled
 * by ext4_ext_rm_leaf(), index nodes that end up empty are removed with
 * ext4_ext_rm_idx(), and if the root ends up without entries its depth is
 * reset to 0.
 *
 * If the operation ends with -EAGAIN, the path is released and everything
 * is retried from the "again" label using the same handle; the partial
 * cluster state is kept across retries.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                          ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int depth = ext_depth(inode);
        struct ext4_ext_path *path = NULL;
        struct partial_cluster partial;
        handle_t *handle;
        int i = 0, err = 0;
        int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;

        partial.pclu = 0;
        partial.lblk = 0;
        partial.state = initial;

        ext_debug(inode, "truncate since %u to %u\n", start, end);

        /* probably first extent we're gonna free will be last in block */
        handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
                        depth + 1,
                        ext4_free_metadata_revoke_credits(inode->i_sb, depth));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

again:
        trace_ext4_ext_remove_space(inode, start, end, depth);

        /*
         * Check if we are removing extents inside the extent tree. If that
         * is the case, we are going to punch a hole inside the extent tree
         * so we have to check whether we need to split the extent covering
         * the last block to remove so we can easily remove the part of it
         * in ext4_ext_rm_leaf().
         */
        if (end < EXT_MAX_BLOCKS - 1) {
                struct ext4_extent *ex;
                ext4_lblk_t ee_block, ex_end, lblk;
                ext4_fsblk_t pblk;

                /* find extent for or closest extent to this block */
                path = ext4_find_extent(inode, end, NULL, flags);
                if (IS_ERR(path)) {
                        ext4_journal_stop(handle);
                        return PTR_ERR(path);
                }
                depth = ext_depth(inode);
                /* Leaf may not exist only if inode has no blocks at all */
                ex = path[depth].p_ext;
                if (!ex) {
                        if (depth) {
                                /*
                                 * NOTE(review): the message mentions p_hdr,
                                 * but the condition checked above is
                                 * path[depth].p_ext == NULL.
                                 */
                                EXT4_ERROR_INODE(inode,
                                                 "path[%d].p_hdr == NULL",
                                                 depth);
                                err = -EFSCORRUPTED;
                        }
                        goto out;
                }

                ee_block = le32_to_cpu(ex->ee_block);
                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;

                /*
                 * See if the last block is inside the extent, if so split
                 * the extent at 'end' block so we can easily remove the
                 * tail of the first part of the split extent in
                 * ext4_ext_rm_leaf().
                 */
                if (end >= ee_block && end < ex_end) {

                        /*
                         * If we're going to split the extent, note that
                         * the cluster containing the block after 'end' is
                         * in use to avoid freeing it when removing blocks.
                         */
                        if (sbi->s_cluster_ratio > 1) {
                                pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
                                partial.pclu = EXT4_B2C(sbi, pblk);
                                partial.state = nofree;
                        }

                        /*
                         * Split the extent in two so that 'end' is the last
                         * block in the first new extent. Also we should not
                         * fail removing space due to ENOSPC so try to use
                         * reserved block if that happens.
                         */
                        err = ext4_force_split_extent_at(handle, inode, &path,
                                                         end + 1, 1);
                        if (err < 0)
                                goto out;

                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
                           partial.state == initial) {
                        /*
                         * If we're punching, there's an extent to the right.
                         * If the partial cluster hasn't been set, set it to
                         * that extent's first cluster and its state to nofree
                         * so it won't be freed should it contain blocks to be
                         * removed. If it's already set (tofree/nofree), we're
                         * retrying and keep the original partial cluster info
                         * so a cluster marked tofree as a result of earlier
                         * extent removal is not lost.
                         */
                        lblk = ex_end + 1;
                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
                                                    NULL, flags);
                        if (err < 0)
                                goto out;
                        if (pblk) {
                                partial.pclu = EXT4_B2C(sbi, pblk);
                                partial.state = nofree;
                        }
                }
        }
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
         */
        depth = ext_depth(inode);
        if (path) {
                /*
                 * A path to the leaf was already looked up above, so the
                 * walk starts at the leaf (i = depth).  p_block of the
                 * intermediate levels is set to eh_entries + 1, the same
                 * value an untouched level gets in the loop below.
                 */
                int k = i = depth;
                while (--k > 0)
                        path[k].p_block =
                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
        } else {
                /* no lookup was done: start the walk at the root (i = 0) */
                path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
                               GFP_NOFS | __GFP_NOFAIL);
                /*
                 * NOTE(review): __GFP_NOFAIL is passed above, so this NULL
                 * check looks unreachable - confirm before relying on it.
                 */
                if (path == NULL) {
                        ext4_journal_stop(handle);
                        return -ENOMEM;
                }
                path[0].p_maxdepth = path[0].p_depth = depth;
                path[0].p_hdr = ext_inode_hdr(inode);
                i = 0;

                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
                        err = -EFSCORRUPTED;
                        goto out;
                }
        }
        err = 0;

        /*
         * i is the tree level currently being processed: 0 is the root,
         * depth is the leaf level.  The walk ends when we step above the
         * root or an error occurs.
         */
        while (i >= 0 && err == 0) {
                if (i == depth) {
                        /* this is leaf block */
                        err = ext4_ext_rm_leaf(handle, inode, path,
                                               &partial, start, end);
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }

                /* this is index block */
                if (!path[i].p_hdr) {
                        ext_debug(inode, "initialize header\n");
                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
                }

                if (!path[i].p_idx) {
                        /* this level hasn't been touched yet */
                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
                        ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
                                  path[i].p_hdr,
                                  le16_to_cpu(path[i].p_hdr->eh_entries));
                } else {
                        /* we were already here, see at next index */
                        path[i].p_idx--;
                }

                ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
                                i, EXT_FIRST_INDEX(path[i].p_hdr),
                                path[i].p_idx);
                if (ext4_ext_more_to_rm(path + i)) {
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug(inode, "move to level %d (block %llu)\n",
                                  i + 1, ext4_idx_pblock(path[i].p_idx));
                        /* forget whatever the next level's slot held before */
                        memset(path + i + 1, 0, sizeof(*path));
                        bh = read_extent_tree_block(inode, path[i].p_idx,
                                                    depth - i - 1, flags);
                        if (IS_ERR(bh)) {
                                /* should we reset i_size? */
                                err = PTR_ERR(bh);
                                break;
                        }
                        /* Yield here to deal with large extent trees.
                         * Should be a no-op if we did IO above. */
                        cond_resched();
                        if (WARN_ON(i + 1 > depth)) {
                                err = -EFSCORRUPTED;
                                break;
                        }
                        path[i + 1].p_bh = bh;

                        /* save actual number of indexes since this
                         * number is changed at the next iteration */
                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
                        i++;
                } else {
                        /* we finished processing this index, go up */
                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
                                /* index is empty, remove it;
                                 * the handle must already have been
                                 * prepared by the leaf removal code */
                                err = ext4_ext_rm_idx(handle, inode, path, i);
                        }
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        ext_debug(inode, "return to level %d\n", i);
                }
        }

        trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
                                         path->p_hdr->eh_entries);

        /*
         * if there's a partial cluster and we have removed the first extent
         * in the file, then we also free the partial cluster, if any
         */
        if (partial.state == tofree && err == 0) {
                int flags = get_default_free_blocks_flags(inode);

                if (ext4_is_pending(inode, partial.lblk))
                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_C2B(sbi, partial.pclu),
                                 sbi->s_cluster_ratio, flags);
                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                        ext4_rereserve_cluster(inode, partial.lblk);
                partial.state = initial;
        }

        /* TODO: flexible tree reduction should be here */
        if (path->p_hdr->eh_entries == 0) {
                /*
                 * truncate to zero freed all the tree,
                 * so we need to correct eh_depth
                 *
                 * NOTE(review): err is overwritten here even if the walk
                 * above ended with an error - confirm this is intended.
                 */
                err = ext4_ext_get_access(handle, inode, path);
                if (err == 0) {
                        ext_inode_hdr(inode)->eh_depth = 0;
                        ext_inode_hdr(inode)->eh_max =
                                cpu_to_le16(ext4_ext_space_root(inode, 0));
                        err = ext4_ext_dirty(handle, inode, path);
                }
        }
out:
        ext4_ext_drop_refs(path);
        kfree(path);
        /* a retry must look the path up again from scratch */
        path = NULL;
        if (err == -EAGAIN)
                goto again;
        ext4_journal_stop(handle);

        return err;
}

/*
 * Mount-time hook for the extents code.
 *
 * Does nothing unless the filesystem has the extents feature.  With any of
 * the debugging options (AGGRESSIVE_TEST, CHECK_BINSEARCH, EXTENTS_STATS)
 * compiled in, a one-line banner naming the enabled options is printed;
 * with EXTENTS_STATS the statistics lock and min/max counters are also
 * initialized.
 */
void ext4_ext_init(struct super_block *sb)
{
        if (!ext4_has_feature_extents(sb))
                return;

        /* further initialization could be added below */

#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
        printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
               ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
               ", check binsearch"
#endif
#ifdef EXTENTS_STATS
               ", stats"
#endif
               "\n");
#endif
#ifdef EXTENTS_STATS
        spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
        EXT4_SB(sb)->s_ext_min = 1 << 30;
        EXT4_SB(sb)->s_ext_max = 0;
#endif
}

/*
 * Umount-time hook for the extents code.
 *
 * Only has an effect when EXTENTS_STATS is compiled in and the filesystem
 * has the extents feature: if both the block and the extent counters are
 * non-zero, the collected statistics are printed.
 */
void ext4_ext_release(struct super_block *sb)
{
        if (ext4_has_feature_extents(sb)) {
#ifdef EXTENTS_STATS
                struct ext4_sb_info *sbi = EXT4_SB(sb);

                /* a zero extent count would also make the average undefined */
                if (sbi->s_ext_blocks && sbi->s_ext_extents) {
                        printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
                                sbi->s_ext_blocks, sbi->s_ext_extents,
                                sbi->s_ext_blocks / sbi->s_ext_extents);
                        printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
                                sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
                }
#endif
        }
}

/*
 * Record the range described by @ex in the extent status tree of @inode
 * as written.  An extent with an actual length of zero is ignored.
 * Always returns 0.
 */
static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
{
        unsigned int nr_blocks = ext4_ext_get_actual_len(ex);
        ext4_lblk_t  first_lblk = le32_to_cpu(ex->ee_block);
        ext4_fsblk_t first_pblk = ext4_ext_pblock(ex);

        if (nr_blocks != 0)
                ext4_es_insert_extent(inode, first_lblk, nr_blocks,
                                      first_pblk, EXTENT_STATUS_WRITTEN);

        return 0;
}

/* FIXME!! we need to try to merge to left or right after zero-out  */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
        ext4_fsblk_t ee_pblock;
        unsigned int ee_len;

        ee_len    = ext4_ext_get_actual_len(ex);
        ee_pblock = ext4_ext_pblock(ex);
        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
                                  ee_len);
}

/*
 * ext4_split_extent_at() splits an extent at a given block.
 *
 * @handle: the journal handle
 * @inode: the file inode
 * @ppath: pointer to the path to the extent; the path may be replaced,
 *         because it is handed to ext4_ext_insert_extent() and
 *         ext4_find_extent()
 * @split: the logical block at which the extent is split
 * @split_flag: indicates whether the extent may be zeroed out if the split
 *              fails, and the states (initialized or unwritten) of the new
 *              extents
 * @flags: flags used to insert the new extent into the extent tree;
 *         EXT4_EX_NOCACHE is always added
 *
 *
 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
 * states of which are determined by @split_flag.
 *
 * There are two cases:
 *  a> the extent is split into two extents.
 *  b> @split is the first block of the extent: no split is needed and the
 *     extent is only marked.
 *
 * If inserting the second half fails with -ENOSPC, -EDQUOT or -ENOMEM, the
 * path is looked up again and, when EXT4_EXT_MAY_ZEROOUT is set, blocks are
 * zeroed out instead and the extent gets its full length back.  If zeroing
 * is not allowed or fails, the original length is restored and the error
 * is returned.
 *
 * Returns 0 on success, otherwise the error code of the step that failed.
 */
static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
                             struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags)
{
        struct ext4_ext_path *path = *ppath;
        ext4_fsblk_t newblock;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex, newex, orig_ex, zero_ex;
        struct ext4_extent *ex2 = NULL;
        unsigned int ee_len, depth;
        int err = 0;

        /* at most one of the two DATA_VALID flags may be set */
        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));

        /* Do not cache extents that are in the process of being modified. */
        flags |= EXT4_EX_NOCACHE;

        ext_debug(inode, "logical block %llu\n", (unsigned long long)split);

        ext4_ext_show_leaf(inode, path);

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        /* physical block that corresponds to logical block @split */
        newblock = split - ee_block + ext4_ext_pblock(ex);

        /* @split must lie inside the extent */
        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
        /* the zeroout and mark-unwritten flags need an unwritten extent */
        BUG_ON(!ext4_ext_is_unwritten(ex) &&
               split_flag & (EXT4_EXT_MAY_ZEROOUT |
                             EXT4_EXT_MARK_UNWRIT1 |
                             EXT4_EXT_MARK_UNWRIT2));

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;

        if (split == ee_block) {
                /*
                 * case b: block @split is the block that the extent begins with
                 * then we just change the state of the extent, and splitting
                 * is not needed.
                 */
                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
                        ext4_ext_mark_unwritten(ex);
                else
                        ext4_ext_mark_initialized(ex);

                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
                        ext4_ext_try_to_merge(handle, inode, path, ex);

                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                goto out;
        }

        /* case a: keep a copy so the original length can be restored */
        memcpy(&orig_ex, ex, sizeof(orig_ex));
        /* shrink the existing extent to the first part, [ee_block, split) */
        ex->ee_len = cpu_to_le16(split - ee_block);
        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
                ext4_ext_mark_unwritten(ex);

        /*
         * path may lead to new leaf, not to original leaf any more
         * after ext4_ext_insert_extent() returns,
         */
        err = ext4_ext_dirty(handle, inode, path + depth);
        if (err)
                goto fix_extent_len;

        /* build the second part, starting at @split, in newex */
        ex2 = &newex;
        ex2->ee_block = cpu_to_le32(split);
        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
        ext4_ext_store_pblock(ex2, newblock);
        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
                ext4_ext_mark_unwritten(ex2);

        err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
        /* only -ENOSPC, -EDQUOT and -ENOMEM take the fallback path below */
        if (err && err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
                goto out_err;
        if (!err)
                goto out;

        /*
         * Update path is required because previous ext4_ext_insert_extent()
         * may have freed or reallocated the path. Using EXT4_EX_NOFAIL
         * guarantees that ext4_find_extent() will not return -ENOMEM,
         * otherwise -ENOMEM will cause a retry in do_writepages(), and a
         * WARN_ON may be triggered in ext4_da_update_reserve_space() due to
         * an incorrect ee_len causing the i_reserved_data_blocks exception.
         */
        path = ext4_find_extent(inode, ee_block, ppath,
                                flags | EXT4_EX_NOFAIL);
        if (IS_ERR(path)) {
                EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
                                 split, PTR_ERR(path));
                err = PTR_ERR(path);
                goto out_err;
        }
        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
                /*
                 * Zero out one half when a DATA_VALID flag is given
                 * (DATA_VALID1: the second half, ex2; DATA_VALID2: the
                 * first half, ex), otherwise the whole original extent.
                 * zero_ex records the zeroed range for the extent status
                 * tree update below.
                 */
                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
                        if (split_flag & EXT4_EXT_DATA_VALID1) {
                                err = ext4_ext_zeroout(inode, ex2);
                                zero_ex.ee_block = ex2->ee_block;
                                zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(ex2));
                                ext4_ext_store_pblock(&zero_ex,
                                                      ext4_ext_pblock(ex2));
                        } else {
                                err = ext4_ext_zeroout(inode, ex);
                                zero_ex.ee_block = ex->ee_block;
                                zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(ex));
                                ext4_ext_store_pblock(&zero_ex,
                                                      ext4_ext_pblock(ex));
                        }
                } else {
                        err = ext4_ext_zeroout(inode, &orig_ex);
                        zero_ex.ee_block = orig_ex.ee_block;
                        zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(&orig_ex));
                        ext4_ext_store_pblock(&zero_ex,
                                              ext4_ext_pblock(&orig_ex));
                }

                if (!err) {
                        /* update the extent length and mark as initialized */
                        ex->ee_len = cpu_to_le16(ee_len);
                        ext4_ext_try_to_merge(handle, inode, path, ex);
                        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                        if (!err)
                                /* update extent status tree */
                                err = ext4_zeroout_es(inode, &zero_ex);
                        /* If we failed at this point, we don't know in which
                         * state the extent tree exactly is so don't try to fix
                         * length of the original extent as it may do even more
                         * damage.
                         */
                        goto out;
                }
        }

        /* error exit: give the extent its original length back */
fix_extent_len:
        ex->ee_len = orig_ex.ee_len;
        /*
         * Ignore ext4_ext_dirty return value since we are already in error path
         * and err is a non-zero error code.
         */
        ext4_ext_dirty(handle, inode, path + path->p_depth);
        return err;
out_err:
        /* Remove all remaining potentially stale extents. */
        ext4_es_remove_extent(inode, ee_block, ee_len);
out:
        ext4_ext_show_leaf(inode, *ppath);
        return err;
}

/*
 * ext4_split_extent() splits an extent and mark extent which is covered
 * by @map as split_flags indicates
 *
 * It may result in splitting the extent into multiple extents (up to three)
 * There are three possibilities:
 *   a> There is no split required
 *   b> Splits in two extents: Split is happening at either end of the extent
 *   c> Splits in three extents: Someone is splitting in the middle of the extent
 *
 */
static int ext4_split_extent(handle_t *handle,
                              struct inode *inode,
                              struct ext4_ext_path **ppath,
                              struct ext4_map_blocks *map,
                              int split_flag,
                              int flags)
{
        struct ext4_ext_path *path = *ppath;
        unsigned int depth = ext_depth(inode);
        struct ext4_extent *ex = path[depth].p_ext;
        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
        unsigned int ee_len = ext4_ext_get_actual_len(ex);
        /* First logical block past the range described by @map. */
        ext4_lblk_t map_end = map->m_lblk + map->m_len;
        int unwritten = ext4_ext_is_unwritten(ex);
        int allocated = map->m_len;
        int err;

        /* Extents that are being modified must not end up in the cache. */
        flags |= EXT4_EX_NOCACHE;

        if (map_end < ee_block + ee_len) {
                /*
                 * The extent reaches beyond the mapped range: split at the
                 * end of the range first.
                 */
                int end_flag = split_flag & EXT4_EXT_MAY_ZEROOUT;

                if (unwritten)
                        end_flag |= EXT4_EXT_MARK_UNWRIT1 |
                                    EXT4_EXT_MARK_UNWRIT2;
                if (split_flag & EXT4_EXT_DATA_VALID2)
                        end_flag |= EXT4_EXT_DATA_VALID1;

                err = ext4_split_extent_at(handle, inode, ppath, map_end,
                                           end_flag,
                                           flags | EXT4_GET_BLOCKS_PRE_IO);
                if (err)
                        return err;
        } else {
                /* No tail to cut: everything from m_lblk to the extent end. */
                allocated = ee_len - (map->m_lblk - ee_block);
        }

        /*
         * Look the extent up again: the split above may have moved it to
         * another leaf or replaced the split with a zeroout.
         */
        path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
        if (IS_ERR(path))
                return PTR_ERR(path);

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        if (!ex) {
                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                 (unsigned long) map->m_lblk);
                return -EFSCORRUPTED;
        }
        unwritten = ext4_ext_is_unwritten(ex);

        if (map->m_lblk >= ee_block) {
                /* Second split, at the start of the mapped range. */
                int start_flag = split_flag & EXT4_EXT_DATA_VALID2;

                if (unwritten)
                        start_flag |= EXT4_EXT_MARK_UNWRIT1 |
                                      (split_flag & (EXT4_EXT_MAY_ZEROOUT |
                                                     EXT4_EXT_MARK_UNWRIT2));

                err = ext4_split_extent_at(handle, inode, ppath,
                                           map->m_lblk, start_flag, flags);
                if (err)
                        return err;
        }

        ext4_ext_show_leaf(inode, *ppath);
        return allocated;
}

/*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an unwritten extent. It may result in splitting the unwritten
 * extent into multiple extents (up to three - one initialized and two
 * unwritten).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
 *
 * Pre-conditions:
 *  - The extent pointed to by 'path' is unwritten.
 *  - The extent pointed to by 'path' contains a superset
 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
 *
 * Post-conditions on success:
 *  - the returned value is the number of blocks beyond map->m_lblk
 *    that are allocated and initialized.
 *    It is guaranteed to be >= map->m_len.
 */
static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path **ppath,
                                           int flags)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        /*
         * zero_ex1 / zero_ex2 record the tail / head ranges that get zeroed
         * out on disk below, so they can be handed to ext4_zeroout_es() at
         * the end of the function.
         */
        struct ext4_extent zero_ex1, zero_ex2;
        struct ext4_extent *ex, *abut_ex;
        ext4_lblk_t ee_block, eof_block;
        unsigned int ee_len, depth, map_len = map->m_len;
        int err = 0;
        int split_flag = EXT4_EXT_DATA_VALID2;
        int allocated = 0;
        unsigned int max_zeroout = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)map->m_lblk, map_len);

        sbi = EXT4_SB(inode->i_sb);
        /*
         * i_disksize rounded up to whole blocks, then extended if needed so
         * that it is never before the end of the requested range.
         */
        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
                        >> inode->i_sb->s_blocksize_bits;
        if (eof_block < map->m_lblk + map_len)
                eof_block = map->m_lblk + map_len;

        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        /* Length 0 until (and unless) a zeroout branch below fills them in. */
        zero_ex1.ee_len = 0;
        zero_ex2.ee_len = 0;

        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);

        /* Pre-conditions */
        BUG_ON(!ext4_ext_is_unwritten(ex));
        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));

        /*
         * Attempt to transfer newly initialized blocks from the currently
         * unwritten extent to its neighbor. This is much cheaper
         * than an insertion followed by a merge as those involve costly
         * memmove() calls. Transferring to the left is the common case in
         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
         * followed by append writes.
         *
         * Limitations of the current logic:
         *  - L1: we do not deal with writes covering the whole extent.
         *    This would require removing the extent if the transfer
         *    is possible.
         *  - L2: we only attempt to merge with an extent stored in the
         *    same extent tree node.
         */
        if ((map->m_lblk == ee_block) &&
                /* See if we can merge left */
                (map_len < ee_len) &&                /*L1*/
                (ex > EXT_FIRST_EXTENT(eh))) {        /*L2*/
                ext4_lblk_t prev_lblk;
                ext4_fsblk_t prev_pblk, ee_pblk;
                unsigned int prev_len;

                /* Left neighbor: the previous extent entry in this leaf. */
                abut_ex = ex - 1;
                prev_lblk = le32_to_cpu(abut_ex->ee_block);
                prev_len = ext4_ext_get_actual_len(abut_ex);
                prev_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);

                /*
                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
                 * - C1: abut_ex is initialized,
                 * - C2: abut_ex is logically abutting ex,
                 * - C3: abut_ex is physically abutting ex,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                        ((prev_lblk + prev_len) == ee_block) &&                /*C2*/
                        ((prev_pblk + prev_len) == ee_pblk) &&                /*C3*/
                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                map, ex, abut_ex);

                        /* Shift the start of ex by 'map_len' blocks */
                        ex->ee_block = cpu_to_le32(ee_block + map_len);
                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_unwritten(ex); /* Restore the flag */

                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);

                        /* Result: number of initialized blocks past m_lblk */
                        allocated = map_len;
                }
        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
                   (map_len < ee_len) &&        /*L1*/
                   ex < EXT_LAST_EXTENT(eh)) {        /*L2*/
                /* See if we can merge right */
                ext4_lblk_t next_lblk;
                ext4_fsblk_t next_pblk, ee_pblk;
                unsigned int next_len;

                /* Right neighbor: the next extent entry in this leaf. */
                abut_ex = ex + 1;
                next_lblk = le32_to_cpu(abut_ex->ee_block);
                next_len = ext4_ext_get_actual_len(abut_ex);
                next_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);

                /*
                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
                 * - C1: abut_ex is initialized,
                 * - C2: abut_ex is logically abutting ex,
                 * - C3: abut_ex is physically abutting ex,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                    ((map->m_lblk + map_len) == next_lblk) &&                /*C2*/
                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                map, ex, abut_ex);

                        /* Shift the start of abut_ex by 'map_len' blocks */
                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_unwritten(ex); /* Restore the flag */

                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);

                        /* Result: number of initialized blocks past m_lblk */
                        allocated = map_len;
                }
        }
        /* 'allocated' is non-zero only if one of the fast paths above ran. */
        if (allocated) {
                /* Mark the block containing both extents as dirty */
                err = ext4_ext_dirty(handle, inode, path + depth);

                /* Update path to point to the right extent */
                path[depth].p_ext = abut_ex;
                goto out;
        } else
                /* Slow path: blocks from m_lblk up to the end of the extent. */
                allocated = ee_len - (map->m_lblk - ee_block);

        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
         * zeroout only if extent is fully inside i_size or new_size.
         */
        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;

        /*
         * The shift by (blocksize_bits - 10) turns the KiB limit in
         * s_extent_max_zeroout_kb into a number of filesystem blocks.
         * max_zeroout stays 0 when zeroout is not permitted.
         */
        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
                max_zeroout = sbi->s_extent_max_zeroout_kb >>
                        (inode->i_sb->s_blocksize_bits - 10);

        /*
         * five cases:
         * 1. split the extent into three extents.
         * 2. split the extent into two extents, zeroout the head of the first
         *    extent.
         * 3. split the extent into two extents, zeroout the tail of the second
         *    extent.
         * 4. split the extent into two extents with out zeroout.
         * 5. no splitting needed, just possibly zeroout the head and / or the
         *    tail of the extent.
         */
        /* Work on a copy so the zeroout cases can widen the range to split. */
        split_map.m_lblk = map->m_lblk;
        split_map.m_len = map->m_len;

        if (max_zeroout && (allocated > split_map.m_len)) {
                if (allocated <= max_zeroout) {
                        /* case 3 or 5 */
                        /* Zero the tail: from the end of the request to the
                         * end of the extent.
                         */
                        zero_ex1.ee_block =
                                 cpu_to_le32(split_map.m_lblk +
                                             split_map.m_len);
                        zero_ex1.ee_len =
                                cpu_to_le16(allocated - split_map.m_len);
                        ext4_ext_store_pblock(&zero_ex1,
                                ext4_ext_pblock(ex) + split_map.m_lblk +
                                split_map.m_len - ee_block);
                        err = ext4_ext_zeroout(inode, &zero_ex1);
                        if (err)
                                goto out;
                        /* The range to split off now runs to the extent end. */
                        split_map.m_len = allocated;
                }
                if (split_map.m_lblk - ee_block + split_map.m_len <
                                                                max_zeroout) {
                        /* case 2 or 5 */
                        /* Zero the head: from the extent start up to the
                         * request, unless the request already starts there.
                         */
                        if (split_map.m_lblk != ee_block) {
                                zero_ex2.ee_block = ex->ee_block;
                                zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
                                                        ee_block);
                                ext4_ext_store_pblock(&zero_ex2,
                                                      ext4_ext_pblock(ex));
                                err = ext4_ext_zeroout(inode, &zero_ex2);
                                if (err)
                                        goto out;
                        }

                        /* Widen the split range back to the extent start. */
                        split_map.m_len += split_map.m_lblk - ee_block;
                        split_map.m_lblk = ee_block;
                        allocated = map->m_len;
                }
        }

        err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
                                flags);
        /*
         * ext4_split_extent() returns a positive block count on success;
         * only a failure code is kept in err.
         */
        if (err > 0)
                err = 0;
out:
        /* If we have gotten a failure, don't zero out status tree */
        if (!err) {
                err = ext4_zeroout_es(inode, &zero_ex1);
                if (!err)
                        err = ext4_zeroout_es(inode, &zero_ex2);
        }
        return err ? err : allocated;
}

/*
 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO writes
 * to an unwritten extent.
 *
 * Writing to an unwritten extent may result in splitting the unwritten
 * extent into multiple initialized/unwritten extents (up to three)
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be unwritten
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
 *
 * This works the same way in the case of initialized -> unwritten conversion.
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
 * completion, we need to split the unwritten extent before DIO submits
 * the IO. The unwritten extent is split at this time into (at most) three
 * unwritten extents. After the IO completes, the part that was filled
 * is converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 *
 * Returns the size of unwritten extent to be written on success.
 */
static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path **ppath,
                                        int flags)
{
        struct ext4_extent *ex;
        ext4_lblk_t map_end, eof_block, ee_block;
        unsigned int ee_len;
        int split_flag = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)map->m_lblk, map->m_len);

        /*
         * End of file in blocks (i_disksize rounded up), but never before
         * the end of the range being mapped.
         */
        map_end = map->m_lblk + map->m_len;
        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
                        >> inode->i_sb->s_blocksize_bits;
        if (eof_block < map_end)
                eof_block = map_end;

        /* The extent the caller's path currently points at. */
        ex = (*ppath)[ext_depth(inode)].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);

        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
                /* Convert to unwritten */
                split_flag = EXT4_EXT_DATA_VALID1;
        } else if (flags & (EXT4_GET_BLOCKS_UNWRIT_EXT |
                            EXT4_GET_BLOCKS_CONVERT)) {
                /* Split the existing unwritten extent */
                split_flag = EXT4_EXT_MARK_UNWRIT2;
                /*
                 * It is safe to convert extent to initialized via explicit
                 * zeroout only if extent is fully inside i_size or new_size.
                 */
                if (ee_block + ee_len <= eof_block)
                        split_flag |= EXT4_EXT_MAY_ZEROOUT;
                /* Convert to initialized */
                if (flags & EXT4_GET_BLOCKS_CONVERT)
                        split_flag |= EXT4_EXT_DATA_VALID2;
        }

        return ext4_split_extent(handle, inode, ppath, map, split_flag,
                                 flags | EXT4_GET_BLOCKS_PRE_IO);
}

/*
 * Mark the extent that *ppath points at as initialized once the I/O into it
 * has finished.
 *
 * If the extent does not start at map->m_lblk, or is longer than map->m_len,
 * it is first split via ext4_split_convert_extents() and looked up again so
 * that only the extent found at map->m_lblk is converted.  The extent is then
 * marked initialized, a merge with its neighbours is attempted and the leaf
 * is marked dirty.
 *
 * Returns 0 on success or the error code of the step that failed;
 * -EFSCORRUPTED if no extent is found at map->m_lblk after the split.
 */
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
                                                struct ext4_ext_path **ppath)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        unsigned int ee_len;
        int depth;
        int err = 0;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)ee_block, ee_len);

        /* If extent is larger than requested it is a clear sign that we still
         * have some extent state machine issues left. So extent_split is still
         * required.
         * TODO: Once all related issues will be fixed this situation should be
         * illegal.
         */
        if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef CONFIG_EXT4_DEBUG
                ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
                             " len %u; IO logical block %llu, len %u",
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
#endif
                err = ext4_split_convert_extents(handle, inode, map, ppath,
                                                 EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
                        return err;
                /* The split may have changed the tree: look the extent up again. */
                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = ext_depth(inode);
                ex = path[depth].p_ext;
                /*
                 * Same guard as the other callers of ext4_find_extent() in
                 * this file: the lookup can leave p_ext NULL, and ex is
                 * dereferenced right below.
                 */
                if (!ex) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) map->m_lblk);
                        return -EFSCORRUPTED;
                }
        }

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;
        /* first mark the extent as initialized */
        ext4_ext_mark_initialized(ex);

        /* note: ext4_ext_correct_indexes() isn't needed here because
         * borders are not changed
         */
        ext4_ext_try_to_merge(handle, inode, path, ex);

        /* Mark modified extent as dirty */
        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
        ext4_ext_show_leaf(inode, path);
        return err;
}

/*
 * Turn the initialized extent that *ppath points at into an unwritten one
 * for the range described by @map.
 *
 * The extent is split first when it does not match the requested range.
 * On success @map is flagged EXT4_MAP_UNWRITTEN, and *allocated and
 * map->m_len are both set to the smaller of their two values.
 *
 * Returns 0 on success or the error code of the step that failed.
 */
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map,
                           struct ext4_ext_path **ppath,
                           unsigned int *allocated)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
        ext4_lblk_t ex_start;
        unsigned int ex_len;
        int depth;
        int err;

        /*
         * Make sure that the extent is no bigger than we support with
         * unwritten extent
         */
        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ex_start = le32_to_cpu(ex->ee_block);
        ex_len = ext4_ext_get_actual_len(ex);

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)ex_start, ex_len);

        /* Split unless the extent starts at m_lblk and fits within m_len. */
        if (!(ex_start == map->m_lblk && ex_len <= map->m_len)) {
                err = ext4_split_convert_extents(handle, inode, map, ppath,
                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
                if (err < 0)
                        return err;

                /* Look the extent up again after the split. */
                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);

                depth = ext_depth(inode);
                ex = path[depth].p_ext;
                if (!ex) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) map->m_lblk);
                        return -EFSCORRUPTED;
                }
        }

        err = ext4_ext_get_access(handle, inode, &path[depth]);
        if (err)
                return err;

        /* Flip the extent to unwritten ... */
        ext4_ext_mark_unwritten(ex);

        /*
         * ... and try to merge it with its neighbours.
         * ext4_ext_correct_indexes() isn't needed here because
         * borders are not changed.
         */
        ext4_ext_try_to_merge(handle, inode, path, ex);

        /* Mark modified extent as dirty */
        err = ext4_ext_dirty(handle, inode, &path[path->p_depth]);
        if (err)
                return err;

        ext4_ext_show_leaf(inode, path);
        ext4_update_inode_fsync_trans(handle, inode, 1);

        map->m_flags |= EXT4_MAP_UNWRITTEN;
        /* Report no more than what the (possibly clamped) request asked for. */
        if (*allocated > map->m_len)
                *allocated = map->m_len;
        map->m_len = *allocated;
        return 0;
}

/*
 * Decide what to do with a mapping request that landed on an unwritten
 * extent, based on @flags:
 *
 *  - EXT4_GET_BLOCKS_PRE_IO:     split the extent before the IO is submitted;
 *  - EXT4_GET_BLOCKS_CONVERT:    IO finished, convert the extent to written;
 *  - EXT4_GET_BLOCKS_UNWRIT_EXT: repeated fallocate request, nothing to do;
 *  - no EXT4_GET_BLOCKS_CREATE:  plain lookup, report the range as unwritten;
 *  - otherwise:                  convert the extent to initialized.
 *
 * Except on the early error returns, map->m_pblk is set to @newblock and
 * map->m_len is set to the resulting block count, capped at the requested
 * length.  Returns that count, or an error code.
 */
static int
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
                        struct ext4_ext_path **ppath, int flags,
                        unsigned int allocated, ext4_fsblk_t newblock)
{
        int nblocks;
        int err = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
                  (unsigned long long)map->m_lblk, map->m_len, flags,
                  allocated);
        ext4_ext_show_leaf(inode, *ppath);

        /*
         * When writing into unwritten space, we should not fail to
         * allocate metadata blocks for the new extent block if needed.
         */
        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;

        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
                                                    allocated, newblock);

        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
                /* get_block() before submitting IO, split the extent */
                nblocks = ext4_split_convert_extents(handle, inode, map, ppath,
                                                     flags);
                if (nblocks < 0)
                        return nblocks;
                /*
                 * shouldn't get a 0 return when splitting an extent unless
                 * m_len is 0 (bug) or extent has been corrupted
                 */
                if (unlikely(nblocks == 0)) {
                        EXT4_ERROR_INODE(inode,
                                         "unexpected ret == 0, m_len = %u",
                                         map->m_len);
                        return -EFSCORRUPTED;
                }
                allocated = nblocks;
                map->m_flags |= EXT4_MAP_UNWRITTEN | EXT4_MAP_NEW |
                                EXT4_MAP_MAPPED;
        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
                /* IO end_io complete, convert the filled extent to written */
                err = ext4_convert_unwritten_extents_endio(handle, inode, map,
                                                           ppath);
                if (err < 0)
                        return err;
                ext4_update_inode_fsync_trans(handle, inode, 1);
                map->m_flags |= EXT4_MAP_MAPPED;
        } else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                /*
                 * buffered IO: repeat fallocate creation request,
                 * we already have an unwritten extent
                 */
                map->m_flags |= EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED;
        } else if (!(flags & EXT4_GET_BLOCKS_CREATE)) {
                /*
                 * buffered READ or buffered write_begin() lookup.
                 *
                 * We have blocks reserved already.  We
                 * return allocated blocks so that delalloc
                 * won't do block reservation for us.  But
                 * the buffer head will be unmapped so that
                 * a read from the block returns 0s.
                 */
                map->m_flags |= EXT4_MAP_UNWRITTEN;
        } else {
                /*
                 * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
                 * For buffered writes, at writepage time, etc.  Convert a
                 * discovered unwritten extent to written.
                 */
                nblocks = ext4_ext_convert_to_initialized(handle, inode, map,
                                                          ppath, flags);
                if (nblocks < 0)
                        return nblocks;
                ext4_update_inode_fsync_trans(handle, inode, 1);
                /*
                 * shouldn't get a 0 return when converting an unwritten
                 * extent unless m_len is 0 (bug) or extent has been corrupted
                 */
                if (unlikely(nblocks == 0)) {
                        EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
                                         map->m_len);
                        return -EFSCORRUPTED;
                }
                allocated = nblocks;
                map->m_flags |= EXT4_MAP_NEW | EXT4_MAP_MAPPED;
        }

        /* Common tail: publish the physical block and the capped length. */
        map->m_pblk = newblock;
        if (allocated > map->m_len)
                allocated = map->m_len;
        map->m_len = allocated;
        ext4_ext_show_leaf(inode, *ppath);

        return err ? err : allocated;
}

/*
 * get_implied_cluster_alloc - check to see if the requested
 * allocation (in the map structure) overlaps with a cluster already
 * allocated in an extent.
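 *        @sb        The filesystem superblock structure
 *        @map        The requested lblk->pblk mapping
 *        @ex        The extent structure which might contain an implied
 *                        cluster allocation
 *        @path        The extent tree path, used to look up the next allocated
 *                        block when the request starts beyond the start of @ex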
 *
 * This function is called by ext4_ext_map_blocks() after we failed to
 * find blocks that were already in the inode's extent tree.  Hence,
 * we know that the beginning of the requested region cannot overlap
 * the extent from the inode's extent tree.  There are three cases we
 * want to catch.  The first is this case:
 *
 *                 |--- cluster # N--|
 *    |--- extent ---|        |---- requested region ---|
 *                        |==========|
 *
 * The second case that we need to test for is this one:
 *
 *   |--------- cluster # N ----------------|
 *           |--- requested region --|   |------- extent ----|
 *           |=======================|
 *
 * The third case is when the requested region lies between two extents
 * within the same cluster:
 *          |------------- cluster # N-------------|
 * |----- ex -----|                  |---- ex_right ----|
 *                  |------ requested region ------|
 *                  |================|
 *
 * In each of the above cases, we need to set map->m_pblk and
 * map->m_len so that they correspond to the region labelled
 * "|====|" from cluster #N, since it is already in use for data in
 * cluster EXT4_B2C(sbi, map->m_lblk).        We will then return 1 to
 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
 * as a new "allocated" block region.  Otherwise, we will return 0 and
 * ext4_ext_map_blocks() will then allocate one or more new clusters
 * by calling ext4_mb_new_blocks().
 */
static int get_implied_cluster_alloc(struct super_block *sb,
                                     struct ext4_map_blocks *map,
                                     struct ext4_extent *ex,
                                     struct ext4_ext_path *path)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        /* Offset of the requested block inside its cluster. */
        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
        unsigned short ee_len = ext4_ext_get_actual_len(ex);
        /* Clusters holding the first and last block of the extent ... */
        ext4_lblk_t first_cluster = EXT4_B2C(sbi, ee_block);
        ext4_lblk_t last_cluster = EXT4_B2C(sbi, ee_block + ee_len - 1);
        /* ... and the cluster holding the start of the requested region. */
        ext4_lblk_t req_cluster = EXT4_B2C(sbi, map->m_lblk);

        /* The request shares no cluster with either end of the extent. */
        if (req_cluster != last_cluster && req_cluster != first_cluster) {
                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
                return 0;
        }

        /*
         * When the shared cluster is the extent's last one, take the physical
         * cluster from the extent's last block rather than its first.
         */
        if (req_cluster == last_cluster)
                ee_start += ee_len - 1;

        map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
        /* Never hand out more than what is left of the cluster. */
        map->m_len = min(map->m_len,
                         (unsigned) sbi->s_cluster_ratio - c_offset);

        if (map->m_lblk < ee_block) {
                /*
                 * The request starts before the extent: stop where the
                 * extent begins.
                 *
                 *   |--------- cluster # N-------------|
                 *                       |------- extent ----|
                 *           |--- requested region ---|
                 *           |===========|
                 */
                map->m_len = min(map->m_len, ee_block - map->m_lblk);
        } else if (map->m_lblk > ee_block) {
                /*
                 * There may already be another allocated block to the right
                 * of 'ex' but before the end of the cluster: stop there.
                 *
                 *          |------------- cluster # N-------------|
                 * |----- ex -----|                  |---- ex_right ----|
                 *                  |------ requested region ------|
                 *                  |================|
                 */
                ext4_lblk_t next = ext4_ext_next_allocated_block(path);

                map->m_len = min(map->m_len, next - map->m_lblk);
        }

        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
        return 1;
}

/*
 * Work out how many blocks of hole follow @lblk.
 *
 * The hole is first located (and expanded) from @path by
 * ext4_ext_find_hole(), then trimmed against any delayed extent that the
 * extent status tree reports inside it.  Unless @lblk itself turns out to
 * be covered by a delayed extent, the resulting hole is inserted into the
 * extent status tree.  The value returned is the length of the determined
 * range counted from @lblk.
 */
static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
                                                  struct ext4_ext_path *path,
                                                  ext4_lblk_t lblk)
{
        struct extent_status delayed;
        ext4_lblk_t start = lblk;
        ext4_lblk_t count;

        count = ext4_ext_find_hole(inode, path, &start);

        for (;;) {
                ext4_es_find_extent_range(inode, &ext4_es_is_delayed, start,
                                          start + count - 1, &delayed);

                /* No delayed extent inside the candidate hole. */
                if (!delayed.es_len)
                        break;

                if (lblk >= delayed.es_lblk + delayed.es_len) {
                        /*
                         * The delayed extent ends before the queried
                         * block: shrink the candidate so that it starts
                         * at lblk and search again.
                         */
                        count -= lblk - start;
                        start = lblk;
                        continue;
                }

                if (in_range(lblk, delayed.es_lblk, delayed.es_len)) {
                        /*
                         * The delayed extent covers lblk (presumably it
                         * was added after ext4_map_blocks() checked the
                         * extent status tree).  Nothing is cached in this
                         * case; just report what is left of the delayed
                         * extent from lblk onwards.
                         */
                        return delayed.es_lblk + delayed.es_len - lblk;
                }

                /*
                 * The delayed extent begins after lblk: the hole cannot
                 * reach past the start of that extent.
                 */
                count = min(delayed.es_lblk - start, count);
                break;
        }

        /* Cache the gap just found to speed up subsequent requests. */
        ext_debug(inode, " -> %u:%u\n", start, count);
        ext4_es_insert_extent(inode, start, count, ~0, EXTENT_STATUS_HOLE);

        /* Report only the part of the hole that lies at or after lblk. */
        if (start != lblk)
                count -= lblk - start;

        return count;
}

/*
 * Block allocation/map/preallocation routine for extent-based files.
 *
 * Looks up the extent covering map->m_lblk.  Depending on what is found and
 * on @flags, the function either reports the existing mapping, reports a
 * hole (when EXT4_GET_BLOCKS_CREATE is not set), or allocates new blocks and
 * inserts a new extent for them.
 *
 * Needs to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
 * (i.e. create is zero), otherwise with
 * down_write(&EXT4_I(inode)->i_data_sem).
 *
 * return > 0, number of blocks already mapped/allocated
 *          if create == 0 and these are pre-allocated blocks
 *                  buffer head is unmapped
 *          otherwise blocks are mapped
 *
 * return = 0, if plain look up failed (blocks have not been allocated)
 *          buffer head is unmapped
 *
 * return < 0, error case.
 */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map, int flags)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent newex, *ex, ex2;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0, pblk;
        int err = 0, depth, ret;
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
        ext4_lblk_t cluster_offset;

        ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

        /* find extent for this block */
        path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
        if (IS_ERR(path)) {
                err = PTR_ERR(path);
                /* Do not hand an error pointer to the cleanup code at 'out'. */
                path = NULL;
                goto out;
        }

        depth = ext_depth(inode);

        /*
         * consistent leaf must not be empty;
         * this situation is possible, though, _during_ tree modification;
         * this is why assert can't be put in ext4_find_extent()
         */
        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode, "bad extent address "
                                 "lblock: %lu, depth: %d pblock %lld",
                                 (unsigned long) map->m_lblk, depth,
                                 path[depth].p_block);
                err = -EFSCORRUPTED;
                goto out;
        }

        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;


                /*
                 * unwritten extents are treated as holes, except that
                 * we split out initialized portions during a write.
                 */
                ee_len = ext4_ext_get_actual_len(ex);

                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);

                /* if found extent covers block, simply return it */
                if (in_range(map->m_lblk, ee_block, ee_len)) {
                        newblock = map->m_lblk - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
                        allocated = ee_len - (map->m_lblk - ee_block);
                        ext_debug(inode, "%u fit into %u:%d -> %llu\n",
                                  map->m_lblk, ee_block, ee_len, newblock);

                        /*
                         * If the extent is initialized check whether the
                         * caller wants to convert it to unwritten.
                         */
                        if ((!ext4_ext_is_unwritten(ex)) &&
                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
                                err = convert_initialized_extent(handle,
                                        inode, map, &path, &allocated);
                                goto out;
                        } else if (!ext4_ext_is_unwritten(ex)) {
                                /*
                                 * Plain initialized extent: report the
                                 * mapping, clamped to the requested length.
                                 */
                                map->m_flags |= EXT4_MAP_MAPPED;
                                map->m_pblk = newblock;
                                if (allocated > map->m_len)
                                        allocated = map->m_len;
                                map->m_len = allocated;
                                ext4_ext_show_leaf(inode, path);
                                goto out;
                        }

                        /*
                         * The extent is unwritten: a negative result is an
                         * error, anything else becomes the mapped length.
                         */
                        ret = ext4_ext_handle_unwritten_extents(
                                handle, inode, map, &path, flags,
                                allocated, newblock);
                        if (ret < 0)
                                err = ret;
                        else
                                allocated = ret;
                        goto out;
                }
        }

        /*
         * The requested block isn't allocated yet;
         * we must not try to create blocks if the create flag is not set.
         * 'allocated' is still 0 here, so the function returns 0 (a hole)
         * with map->m_len trimmed to the length of the hole.
         */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                ext4_lblk_t len;

                len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);

                map->m_pblk = 0;
                map->m_len = min_t(unsigned int, map->m_len, len);
                goto out;
        }

        /*
         * Okay, we need to do block allocation.
         */
        newex.ee_block = cpu_to_le32(map->m_lblk);
        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);

        /*
         * If we are doing bigalloc, check to see if the extent returned
         * by ext4_find_extent() implies a cluster we can use.
         * Note that 'offset' is still 0 when jumping to
         * got_allocated_blocks from here, so pblk there equals map->m_pblk,
         * and allocated_clusters stays 0 (nothing new was allocated).
         */
        if (cluster_offset && ex &&
            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
                ar.len = allocated = map->m_len;
                newblock = map->m_pblk;
                goto got_allocated_blocks;
        }

        /* find neighbour allocated blocks */
        ar.lleft = map->m_lblk;
        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
        if (err)
                goto out;
        ar.lright = map->m_lblk;
        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
                                    &ex2, 0);
        if (err < 0)
                goto out;

        /*
         * Check if the extent after searching to the right implies a
         * cluster we can use.  A positive 'err' from the search above is
         * used as the condition that ex2 may be consulted.
         */
        if ((sbi->s_cluster_ratio > 1) && err &&
            get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
                ar.len = allocated = map->m_len;
                newblock = map->m_pblk;
                goto got_allocated_blocks;
        }

        /*
         * See if request is beyond maximum number of blocks we can have in
         * a single extent. For an initialized extent this limit is
         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
         * EXT_UNWRITTEN_MAX_LEN.
         */
        if (map->m_len > EXT_INIT_MAX_LEN &&
            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                map->m_len = EXT_INIT_MAX_LEN;
        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                map->m_len = EXT_UNWRITTEN_MAX_LEN;

        /*
         * Check if we can really insert (m_lblk)::(m_lblk + m_len) extent.
         * When an overlap is reported, the length left in newex is used as
         * the number of blocks to allocate instead of map->m_len.
         */
        newex.ee_len = cpu_to_le16(map->m_len);
        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
        if (err)
                allocated = ext4_ext_get_actual_len(&newex);
        else
                allocated = map->m_len;

        /* allocate new block */
        ar.inode = inode;
        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
        ar.logical = map->m_lblk;
        /*
         * We calculate the offset from the beginning of the cluster
         * for the logical block number, since when we allocate a
         * physical cluster, the physical block should start at the
         * same offset from the beginning of the cluster.  This is
         * needed so that future calls to get_implied_cluster_alloc()
         * work correctly.
         */
        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
        ar.goal -= offset;
        ar.logical -= offset;
        if (S_ISREG(inode->i_mode))
                ar.flags = EXT4_MB_HINT_DATA;
        else
                /* disable in-core preallocation for non-regular files */
                ar.flags = 0;
        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                ar.flags |= EXT4_MB_USE_RESERVED;
        /* On failure newblock is 0 and err has been set through &err. */
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out;
        /*
         * ar.len was passed in as a cluster count; remember the clusters
         * obtained, then convert ar.len to blocks usable from 'offset' on.
         */
        allocated_clusters = ar.len;
        ar.len = EXT4_C2B(sbi, ar.len) - offset;
        ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
                  ar.goal, newblock, ar.len, allocated);
        if (ar.len > allocated)
                ar.len = allocated;

got_allocated_blocks:
        /* try to insert new extent into found leaf and return */
        pblk = newblock + offset;
        ext4_ext_store_pblock(&newex, pblk);
        newex.ee_len = cpu_to_le16(ar.len);
        /* Mark unwritten */
        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                ext4_ext_mark_unwritten(&newex);
                map->m_flags |= EXT4_MAP_UNWRITTEN;
        }

        err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
        if (err) {
                /*
                 * Only undo an allocation made above; the implied-cluster
                 * paths leave allocated_clusters at 0 and free nothing.
                 */
                if (allocated_clusters) {
                        int fb_flags = 0;

                        /*
                         * free data blocks we just allocated.
                         * not a good idea to call discard here directly,
                         * but otherwise we'd need to call it every free().
                         */
                        ext4_discard_preallocations(inode, 0);
                        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                                fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
                        ext4_free_blocks(handle, inode, NULL, newblock,
                                         EXT4_C2B(sbi, allocated_clusters),
                                         fb_flags);
                }
                goto out;
        }

        /*
         * Reduce the reserved cluster count to reflect successful deferred
         * allocation of delayed allocated clusters or direct allocation of
         * clusters discovered to be delayed allocated.  Once allocated, a
         * cluster is not included in the reserved count.
         */
        if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
                if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                        /*
                         * When allocating delayed allocated clusters, simply
                         * reduce the reserved cluster count and claim quota
                         */
                        ext4_da_update_reserve_space(inode, allocated_clusters,
                                                        1);
                } else {
                        ext4_lblk_t lblk, len;
                        unsigned int n;

                        /*
                         * When allocating non-delayed allocated clusters
                         * (from fallocate, filemap, DIO, or clusters
                         * allocated when delalloc has been disabled by
                         * ext4_nonda_switch), reduce the reserved cluster
                         * count by the number of allocated clusters that
                         * have previously been delayed allocated.  Quota
                         * has been claimed by ext4_mb_new_blocks() above,
                         * so release the quota reservations made for any
                         * previously delayed allocated clusters.
                         */
                        lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
                        len = allocated_clusters << sbi->s_cluster_bits;
                        n = ext4_es_delayed_clu(inode, lblk, len);
                        if (n > 0)
                                ext4_da_update_reserve_space(inode, (int) n, 0);
                }
        }

        /*
         * Cache the extent and update transaction to commit on fdatasync only
         * when it is _not_ an unwritten extent.
         */
        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
        else
                ext4_update_inode_fsync_trans(handle, inode, 0);

        /* Publish the new mapping to the caller. */
        map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
        map->m_pblk = pblk;
        map->m_len = ar.len;
        allocated = map->m_len;
        ext4_ext_show_leaf(inode, path);
out:
        /* Common exit: release the path; an error takes precedence. */
        ext4_ext_drop_refs(path);
        kfree(path);

        trace_ext4_ext_map_blocks_exit(inode, flags, map,
                                       err ? err : allocated);
        return err ? err : allocated;
}

/*
 * Remove everything past i_size from an extent-mapped inode.
 *
 * i_disksize is set to i_size and the inode is marked dirty first; then the
 * range starting at the first block past i_size is removed from the extent
 * status tree and from the extent tree.  Both removals are retried for as
 * long as they fail with -ENOMEM.
 *
 * Returns 0 on success or the first error other than -ENOMEM.
 */
int ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_gone;
        int rc;

        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
         * because page truncation is enough.
         */

        /* we have to know where to truncate from in crash case */
        EXT4_I(inode)->i_disksize = inode->i_size;
        rc = ext4_mark_inode_dirty(handle, inode);
        if (rc)
                return rc;

        /* First logical block that lies entirely beyond i_size. */
        first_gone = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);

        /* Drop the cached extent status entries, retrying on -ENOMEM. */
        for (;;) {
                rc = ext4_es_remove_extent(inode, first_gone,
                                           EXT_MAX_BLOCKS - first_gone);
                if (rc != -ENOMEM)
                        break;
                cond_resched();
                congestion_wait(BLK_RW_ASYNC, HZ/50);
        }
        if (rc)
                return rc;

        /* Remove the range from the extent tree, retrying on -ENOMEM. */
        for (;;) {
                rc = ext4_ext_remove_space(inode, first_gone,
                                           EXT_MAX_BLOCKS - 1);
                if (rc != -ENOMEM)
                        break;
                cond_resched();
                congestion_wait(BLK_RW_ASYNC, HZ/50);
        }

        return rc;
}

/*
 * Map/allocate the logical block range [@offset, @offset + @len) of @file
 * by calling ext4_map_blocks() with @flags until the whole range has been
 * covered, using a separate journal handle for every call.
 *
 * @new_size: when non-zero, the inode size is advanced through
 *            ext4_update_inode_size() to the end of the blocks mapped so
 *            far, capped at @new_size.
 *
 * If the last ext4_map_blocks() call succeeded, the result of the final
 * inode-dirty/journal-stop step (ret2) is returned; otherwise the zero or
 * negative value returned by the failing step is returned.
 */
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
                                  ext4_lblk_t len, loff_t new_size,
                                  int flags)
{
        struct inode *inode = file_inode(file);
        handle_t *handle;
        int ret = 0;
        int ret2 = 0, ret3 = 0;
        int retries = 0;
        int depth = 0;
        struct ext4_map_blocks map;
        unsigned int credits;
        loff_t epos;

        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
        map.m_lblk = offset;
        map.m_len = len;
        /*
         * Don't normalize the request if it can fit in one extent so
         * that it doesn't get unnecessarily split into multiple
         * extents.
         */
        if (len <= EXT_UNWRITTEN_MAX_LEN)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;

        /*
         * credits to insert 1 extent into extent tree
         */
        credits = ext4_chunk_trans_blocks(inode, len);
        depth = ext_depth(inode);

retry:
        /* Each iteration maps one chunk; 'len' counts the blocks left. */
        while (ret >= 0 && len) {
                /*
                 * Recalculate credits when extent tree depth changes.
                 */
                if (depth != ext_depth(inode)) {
                        credits = ext4_chunk_trans_blocks(inode, len);
                        depth = ext_depth(inode);
                }

                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }
                ret = ext4_map_blocks(handle, inode, &map, flags);
                if (ret <= 0) {
                        /*
                         * Nothing was mapped: close the handle and leave the
                         * loop.  The value stored in ret2 here is not used by
                         * the return statement because ret is not positive.
                         */
                        ext4_debug("inode #%lu: block %u: len %u: "
                                   "ext4_ext_map_blocks returned %d",
                                   inode->i_ino, map.m_lblk,
                                   map.m_len, ret);
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
                        break;
                }
                /* Advance past the 'ret' blocks that were just mapped. */
                map.m_lblk += ret;
                map.m_len = len = len - ret;
                /* Byte offset of the end of the range mapped so far. */
                epos = (loff_t)map.m_lblk << inode->i_blkbits;
                inode->i_ctime = current_time(inode);
                if (new_size) {
                        if (epos > new_size)
                                epos = new_size;
                        if (ext4_update_inode_size(inode, epos) & 0x1)
                                inode->i_mtime = inode->i_ctime;
                }
                ret2 = ext4_mark_inode_dirty(handle, inode);
                ext4_update_inode_fsync_trans(handle, inode, 1);
                ret3 = ext4_journal_stop(handle);
                /* An error from stopping the handle overrides ret2. */
                ret2 = ret3 ? ret3 : ret2;
                if (unlikely(ret2))
                        break;
        }
        /*
         * On -ENOSPC, re-enter the loop if ext4_should_retry_alloc() says
         * so; 'map' and 'len' keep their current values, so the retry
         * resumes at the chunk that failed.
         */
        if (ret == -ENOSPC &&
                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
                ret = 0;
                goto retry;
        }

        return ret > 0 ? ret2 : ret;
}

/* Forward declarations; both functions are called from ext4_fallocate(). */
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);

static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);

/*
 * Zero the byte range [@offset, @offset + @len) of @file; called from
 * ext4_fallocate() when FALLOC_FL_ZERO_RANGE is set in @mode.
 *
 * Blocks covering unaligned edges of the range are preallocated, the
 * block-aligned interior is handed to ext4_alloc_file_blocks() with the
 * convert-to-unwritten flags after its page cache has been dropped, and the
 * partial blocks at the edges are finally zeroed by
 * ext4_zero_partial_blocks().  Unless FALLOC_FL_KEEP_SIZE is set, the inode
 * size is extended when the range ends beyond the current size.
 *
 * Returns 0 on success or a negative error code.
 */
static long ext4_zero_range(struct file *file, loff_t offset,
                            loff_t len, int mode)
{
        struct inode *inode = file_inode(file);
        handle_t *handle = NULL;
        unsigned int max_blocks;
        loff_t new_size = 0;
        int ret = 0;
        int flags;
        int credits;
        int partial_begin, partial_end;
        loff_t start, end;
        ext4_lblk_t lblk;
        unsigned int blkbits = inode->i_blkbits;

        trace_ext4_zero_range(inode, offset, len, mode);

        /* Call ext4_force_commit to flush all data in case of data=journal. */
        if (ext4_should_journal_data(inode)) {
                ret = ext4_force_commit(inode->i_sb);
                if (ret)
                        return ret;
        }

        /*
         * Round up offset. This is not fallocate, we need to zero out
         * blocks, so convert interior block aligned part of the range to
         * unwritten and possibly manually zero out unaligned parts of the
         * range.
         */
        start = round_up(offset, 1 << blkbits);
        end = round_down((offset + len), 1 << blkbits);

        /* Sanity check on the rounded values (presumably catches overflow). */
        if (start < offset || end > offset + len)
                return -EINVAL;
        /* Non-zero when the start/end of the range is not block aligned. */
        partial_begin = offset & ((1 << blkbits) - 1);
        partial_end = (offset + len) & ((1 << blkbits) - 1);

        /*
         * lblk is the first fully covered block and max_blocks the number
         * of fully covered blocks; it is 0 when the range contains no
         * complete block.
         */
        lblk = start >> blkbits;
        max_blocks = (end >> blkbits);
        if (max_blocks < lblk)
                max_blocks = 0;
        else
                max_blocks -= lblk;

        inode_lock(inode);

        /*
         * Indirect files do not support unwritten extents
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* Growing the file: remember and validate the new size. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
            (offset + len > inode->i_size ||
             offset + len > EXT4_I(inode)->i_disksize)) {
                new_size = offset + len;
                ret = inode_newsize_ok(inode, new_size);
                if (ret)
                        goto out_mutex;
        }

        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;

        /* Wait all existing dio workers, newcomers will block on i_mutex */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /* Preallocate the range including the unaligned edges */
        if (partial_begin || partial_end) {
                ret = ext4_alloc_file_blocks(file,
                                round_down(offset, 1 << blkbits) >> blkbits,
                                (round_up((offset + len), 1 << blkbits) -
                                 round_down(offset, 1 << blkbits)) >> blkbits,
                                new_size, flags);
                if (ret)
                        goto out_mutex;

        }

        /* Zero range excluding the unaligned edges */
        if (max_blocks > 0) {
                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                          EXT4_EX_NOCACHE);

                /*
                 * Prevent page faults from reinstantiating pages we have
                 * released from page cache.
                 */
                down_write(&EXT4_I(inode)->i_mmap_sem);

                ret = ext4_break_layouts(inode);
                if (ret) {
                        up_write(&EXT4_I(inode)->i_mmap_sem);
                        goto out_mutex;
                }

                ret = ext4_update_disksize_before_punch(inode, offset, len);
                if (ret) {
                        up_write(&EXT4_I(inode)->i_mmap_sem);
                        goto out_mutex;
                }
                /* Now release the pages and zero block aligned part of pages */
                truncate_pagecache_range(inode, start, end - 1);
                inode->i_mtime = inode->i_ctime = current_time(inode);

                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                             flags);
                up_write(&EXT4_I(inode)->i_mmap_sem);
                if (ret)
                        goto out_mutex;
        }
        /* A fully block-aligned range needs no partial-block zeroing. */
        if (!partial_begin && !partial_end)
                goto out_mutex;

        /*
         * In worst case we have to writeout two nonadjacent unwritten
         * blocks and update the inode
         */
        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
        if (ext4_should_journal_data(inode))
                credits += 2;
        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(inode->i_sb, ret);
                goto out_mutex;
        }

        inode->i_mtime = inode->i_ctime = current_time(inode);
        if (new_size)
                ext4_update_inode_size(inode, new_size);
        ret = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret))
                goto out_handle;
        /* Zero out partial block at the edges of the range */
        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);

        if (file->f_flags & O_SYNC)
                ext4_handle_sync(handle);

out_handle:
        /* The result of stopping the handle is not propagated to the caller. */
        ext4_journal_stop(handle);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * Preallocate space for a file.  This implements ext4's fallocate file
 * operation, which gets called from the sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which is
 * expected for file systems which do not support the fallocate() system
 * call).
 *
 * Punch-hole, collapse-range, insert-range and zero-range requests are
 * forwarded to their dedicated helpers; everything else is treated as a
 * plain preallocation of unwritten extents.
 *
 * Returns 0 on success or a negative error code.
 */
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        unsigned int blkbits = inode->i_blkbits;
        unsigned int max_blocks;
        loff_t new_size = 0;
        ext4_lblk_t lblk;
        int flags;
        int ret;

        /*
         * Encrypted inodes can't handle collapse range or insert
         * range since we would need to re-encrypt blocks with a
         * different IV or XTS tweak (which are based on the logical
         * block number).
         */
        if (IS_ENCRYPTED(inode) &&
            (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
                return -EOPNOTSUPP;

        /* Reject any mode bit this implementation does not know about. */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
                     FALLOC_FL_INSERT_RANGE))
                return -EOPNOTSUPP;

        inode_lock(inode);
        ret = ext4_convert_inline_data(inode);
        inode_unlock(inode);
        if (ret)
                return ret;

        /* Specialised operations are handled entirely by their helpers. */
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                ret = ext4_punch_hole(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                ret = ext4_collapse_range(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_INSERT_RANGE) {
                ret = ext4_insert_range(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_ZERO_RANGE) {
                ret = ext4_zero_range(file, offset, len, mode);
                return ret;
        }

        /* Plain preallocation from here on. */
        trace_ext4_fallocate_enter(inode, offset, len, mode);
        lblk = offset >> blkbits;
        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;

        inode_lock(inode);

        /* Preallocation is supported for extent-based files only. */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto unlock;
        }

        /* Growing the file: remember and validate the new size. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
            (offset + len > inode->i_size ||
             offset + len > EXT4_I(inode)->i_disksize)) {
                new_size = offset + len;
                ret = inode_newsize_ok(inode, new_size);
                if (ret)
                        goto unlock;
        }

        /* Wait all existing dio workers, newcomers will block on i_mutex */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto unlock;

        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
        if (ret)
                goto unlock;

        /* For O_SYNC files on a journalled filesystem, commit right away. */
        if ((file->f_flags & O_SYNC) && EXT4_SB(inode->i_sb)->s_journal)
                ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
                                     EXT4_I(inode)->i_sync_tid);

unlock:
        inode_unlock(inode);
        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
        return ret;
}

/*
 * Convert a range of blocks to written extents.
 *
 * The caller passes the start @offset and the size @len in bytes; all
 * unwritten extents within this range will be converted to written
 * extents.
 *
 * This function is called from the direct IO end io callback
 * function, to convert the fallocated extents after IO is completed.
 *
 * @handle may be NULL, in which case a separate journal handle is started
 * and stopped around every ext4_map_blocks() call; otherwise the supplied
 * handle is used for all calls and is left running.
 *
 * Returns 0 on success, or a negative error code.
 */
int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                   loff_t offset, ssize_t len)
{
        unsigned int max_blocks;
        int ret = 0, ret2 = 0, ret3 = 0;
        struct ext4_map_blocks map;
        unsigned int blkbits = inode->i_blkbits;
        unsigned int credits = 0;

        map.m_lblk = offset >> blkbits;
        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);

        /*
         * 'credits' stays 0 when the caller supplied a handle, so it also
         * serves below as the "we own the handle" indicator (assuming
         * ext4_chunk_trans_blocks() returns a non-zero count).
         */
        if (!handle) {
                /*
                 * credits to insert 1 extent into extent tree
                 */
                credits = ext4_chunk_trans_blocks(inode, max_blocks);
        }
        /*
         * 'ret' holds the number of blocks handled by the previous call
         * (0 on the first pass), so each iteration advances the start block
         * and shrinks the remaining count by that amount.
         */
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk += ret;
                map.m_len = (max_blocks -= ret);
                if (credits) {
                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                                    credits);
                        if (IS_ERR(handle)) {
                                ret = PTR_ERR(handle);
                                break;
                        }
                }
                ret = ext4_map_blocks(handle, inode, &map,
                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
                if (ret <= 0)
                        ext4_warning(inode->i_sb,
                                     "inode #%lu: block %u: len %u: "
                                     "ext4_ext_map_blocks returned %d",
                                     inode->i_ino, map.m_lblk,
                                     map.m_len, ret);
                /* The inode is marked dirty even when the mapping failed. */
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (credits) {
                        ret3 = ext4_journal_stop(handle);
                        /* An error from stopping the handle overrides ret2. */
                        if (unlikely(ret3))
                                ret2 = ret3;
                }

                if (ret <= 0 || ret2)
                        break;
        }
        /*
         * A positive 'ret' means the last mapping call succeeded, so report
         * the dirty/stop status instead; otherwise report 'ret' itself.
         */
        return ret > 0 ? ret2 : ret;
}

/*
 * Convert every range recorded on @io_end's list_vec to written extents.
 *
 * When @handle is non-NULL it is a reserved handle: it is started here, all
 * conversions run inside it, and it is stopped before returning.  With a
 * NULL @handle each range is converted by ext4_convert_unwritten_extents()
 * in transactions of its own.
 *
 * Returns a negative conversion error if there was one, otherwise the
 * result of stopping the handle (0 when no handle was used).
 */
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
        struct ext4_io_end_vec *vec;
        int convert_rc = 0;
        int stop_rc = 0;

        /*
         * This is somewhat ugly but the idea is clear: When transaction is
         * reserved, everything goes into it. Otherwise we rather start several
         * smaller transactions for conversion of each extent separately.
         */
        if (handle) {
                handle = ext4_journal_start_reserved(handle,
                                                     EXT4_HT_EXT_CONVERT);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }

        /* Stop at the first range whose conversion does not return 0. */
        list_for_each_entry(vec, &io_end->list_vec, list) {
                convert_rc = ext4_convert_unwritten_extents(handle,
                                                            io_end->inode,
                                                            vec->offset,
                                                            vec->size);
                if (convert_rc != 0)
                        break;
        }

        if (handle)
                stop_rc = ext4_journal_stop(handle);

        if (convert_rc < 0)
                return convert_rc;
        return stop_rc;
}

static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
{
        __u64 physical = 0;
        __u64 length = 0;
        int blockbits = inode->i_sb->s_blocksize_bits;
        int error = 0;
        u16 iomap_type;

        /* in-inode? */
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct ext4_iloc iloc;
                int offset;        /* offset of xattr in inode */

                error = ext4_get_inode_loc(inode, &iloc);
                if (error)
                        return error;
                physical = (__u64)iloc.bh->b_blocknr << blockbits;
                offset = EXT4_GOOD_OLD_INODE_SIZE +
                                EXT4_I(inode)->i_extra_isize;
                physical += offset;
                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
                brelse(iloc.bh);
                iomap_type = IOMAP_INLINE;
        } else if (EXT4_I(inode)->i_file_acl) { /* external block */
                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
                length = inode->i_sb->s_blocksize;
                iomap_type = IOMAP_MAPPED;
        } else {
                /* no in-inode or external block for xattr, so return -ENOENT */
                error = -ENOENT;
                goto out;
        }

        iomap->addr = physical;
        iomap->offset = 0;
        iomap->length = length;
        iomap->type = iomap_type;
        iomap->flags = 0;
out:
        return error;
}

/*
 * iomap_begin callback used for xattr fiemap requests.
 *
 * Delegates to ext4_iomap_xattr_fiemap() to describe the xattr area, then
 * rejects a request whose @offset lies at or beyond the end of that area.
 * @length, @flags and @srcmap are not used.
 *
 * Returns 0 on success, -ENOENT if there is nothing at @offset, or the
 * error from ext4_iomap_xattr_fiemap().
 */
static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
                                  loff_t length, unsigned flags,
                                  struct iomap *iomap, struct iomap *srcmap)
{
        int error = ext4_iomap_xattr_fiemap(inode, iomap);

        if (error)
                return error;

        if (offset >= iomap->length)
                return -ENOENT;

        return 0;
}

/*
 * iomap operations used by ext4_fiemap() for FIEMAP_FLAG_XATTR requests.
 * Only ->iomap_begin is provided.
 */
static const struct iomap_ops ext4_iomap_xattr_ops = {
        .iomap_begin                = ext4_iomap_xattr_begin,
};

/*
 * Validate a fiemap byte range against the per-inode size limit returned
 * by ext4_get_maxbytes().
 *
 * @start: first byte of the request
 * @len:   in/out length of the request; clamped so that @start + *@len
 *         does not exceed the limit
 *
 * Returns 0 on success, -EINVAL for a zero-length request, or -EFBIG when
 * @start lies beyond the limit.
 */
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
        const u64 limit = ext4_get_maxbytes(inode);

        if (!*len)
                return -EINVAL;

        if (start > limit)
                return -EFBIG;

        /*
         * Trim the request to what the fs can actually address.  The first
         * test keeps "limit - *len" from wrapping in the second one.
         */
        if (*len > limit || start > limit - *len)
                *len = limit - start;

        return 0;
}

/*
 * ext4_fiemap:
 * Report the mapping of @inode for the byte range [@start, @start + @len)
 * into @fieinfo by way of iomap_fiemap().
 *
 * FIEMAP_FLAG_CACHE and FIEMAP_FLAG_XATTR are handled here and cleared from
 * @fieinfo->fi_flags before iomap_fiemap() is called:
 *  - FIEMAP_FLAG_CACHE: the extent tree is precached first;
 *  - FIEMAP_FLAG_XATTR: the xattr location is reported through
 *    ext4_iomap_xattr_ops instead of the file data mapping.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 len)
{
        int error = 0;

        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                /*
                 * Precache under the shared inode lock, the same way
                 * ext4_get_es_cache() performs this call, so that both
                 * entry points walk the extent tree under the same
                 * locking.
                 */
                inode_lock_shared(inode);
                error = ext4_ext_precache(inode);
                inode_unlock_shared(inode);
                if (error)
                        return error;
                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
        }

        /*
         * For bitmap files the maximum size limit could be smaller than
         * s_maxbytes, so check len here manually instead of just relying on the
         * generic check.
         */
        error = ext4_fiemap_check_ranges(inode, start, &len);
        if (error)
                return error;

        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
                fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
                return iomap_fiemap(inode, fieinfo, start, len,
                                    &ext4_iomap_xattr_ops);
        }

        return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
}

/*
 * ext4_get_es_cache:
 * Report the extent information gathered by ext4_fill_es_cache_info() for
 * the byte range [@start, @start + @len) of @inode into @fieinfo.
 *
 * Inodes with inline data report nothing (0 is returned).  If
 * FIEMAP_FLAG_CACHE is set the extent tree is precached first and the flag
 * is cleared.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
                      __u64 start, __u64 len)
{
        ext4_lblk_t first_lblk, nr_lblks;
        __u64 end_lblk;
        int err = 0;

        if (ext4_has_inline_data(inode)) {
                int still_inline;

                /* Re-test under xattr_sem before trusting the answer. */
                down_read(&EXT4_I(inode)->xattr_sem);
                still_inline = ext4_has_inline_data(inode);
                up_read(&EXT4_I(inode)->xattr_sem);
                if (still_inline)
                        return 0;
        }

        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                inode_lock_shared(inode);
                err = ext4_ext_precache(inode);
                inode_unlock_shared(inode);
                if (err)
                        return err;
                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
        }

        err = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (err)
                return err;

        err = ext4_fiemap_check_ranges(inode, start, &len);
        if (err)
                return err;

        /* Convert the byte range into an inclusive logical block range. */
        first_lblk = start >> inode->i_sb->s_blocksize_bits;
        end_lblk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
        if (end_lblk > EXT_MAX_BLOCKS - 1)
                end_lblk = EXT_MAX_BLOCKS - 1;
        nr_lblks = ((ext4_lblk_t) end_lblk) - first_lblk + 1;

        /*
         * Gather the extent information for the block range and push
         * the extents back to the user.
         */
        return ext4_fill_es_cache_info(inode, first_lblk, nr_lblks, fieinfo);
}

/*
 * ext4_ext_shift_path_extents:
 * Shift the extents of a path structure lying between path[depth].p_ext
 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 * if it is right shift or left shift operation.
 *
 * If the first extent of the leaf is among the extents shifted, the index
 * entries above the leaf are adjusted too, walking up the path for as long
 * as the entry just updated is the first index of its node.
 *
 * Returns 0 on success or a negative error code.  -EAGAIN is returned when
 * ext4_datasem_ensure_credits() returns a positive value.
 */
static int
ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                            struct inode *inode, handle_t *handle,
                            enum SHIFT_DIRECTION SHIFT)
{
        int depth, err = 0;
        struct ext4_extent *ex_start, *ex_last;
        bool update = false;
        int credits, restart_credits;
        depth = path->p_depth;

        while (depth >= 0) {
                /* Leaf level: shift the extents themselves. */
                if (depth == path->p_depth) {
                        ex_start = path[depth].p_ext;
                        if (!ex_start)
                                return -EFSCORRUPTED;

                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
                        /* leaf + sb + inode */
                        credits = 3;
                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
                                /*
                                 * The first extent of the leaf moves, so the
                                 * index levels above have to be updated too.
                                 */
                                update = true;
                                /* extent tree + sb + inode */
                                credits = depth + 2;
                        }

                        restart_credits = ext4_writepage_trans_blocks(inode);
                        err = ext4_datasem_ensure_credits(handle, inode, credits,
                                        restart_credits, 0);
                        if (err) {
                                /* positive result is reported as -EAGAIN */
                                if (err > 0)
                                        err = -EAGAIN;
                                goto out;
                        }

                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        /*
                         * Left shift walks forward from ex_start; right shift
                         * walks backward from ex_last.
                         */
                        while (ex_start <= ex_last) {
                                if (SHIFT == SHIFT_LEFT) {
                                        le32_add_cpu(&ex_start->ee_block,
                                                -shift);
                                        /* Try to merge to the left. */
                                        if ((ex_start >
                                            EXT_FIRST_EXTENT(path[depth].p_hdr))
                                            &&
                                            ext4_ext_try_to_merge_right(inode,
                                            path, ex_start - 1))
                                                /*
                                                 * Merged into the previous
                                                 * extent: the leaf holds one
                                                 * extent fewer and ex_start
                                                 * already names the next one.
                                                 */
                                                ex_last--;
                                        else
                                                ex_start++;
                                } else {
                                        le32_add_cpu(&ex_last->ee_block, shift);
                                        ext4_ext_try_to_merge_right(inode, path,
                                                ex_last);
                                        ex_last--;
                                }
                        }
                        err = ext4_ext_dirty(handle, inode, path + depth);
                        if (err)
                                goto out;

                        /*
                         * Done if there is no level above the leaf, or if the
                         * leaf's first extent was not part of the shift.
                         */
                        if (--depth < 0 || !update)
                                break;
                }

                /* Update index too */
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto out;

                if (SHIFT == SHIFT_LEFT)
                        le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
                else
                        le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;

                /* we are done if current index is not a starting index */
                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
                        break;

                depth--;
        }

out:
        return err;
}

/*
 * ext4_ext_shift_extents:
 * All the extents which lie in the range from @start to the last allocated
 * block for the @inode are shifted either towards left or right (depending
 * upon @SHIFT) by @shift blocks.
 *
 * The shift is validated first: -EINVAL is returned if a left shift does
 * not fit in the hole to the left of @start, or if a right shift would move
 * the last extent beyond EXT_MAX_BLOCKS.
 *
 * On success, 0 is returned, error otherwise.
 */
static int
ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
                       ext4_lblk_t start, ext4_lblk_t shift,
                       enum SHIFT_DIRECTION SHIFT)
{
        struct ext4_ext_path *path;
        int ret = 0, depth;
        struct ext4_extent *extent;
        ext4_lblk_t stop, *iterator, ex_start, ex_end;
        /* iterator position to resume from after -EAGAIN; EXT_MAX_BLOCKS = none */
        ext4_lblk_t tmp = EXT_MAX_BLOCKS;

        /* Let path point to the last extent */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);

        depth = path->p_depth;
        extent = path[depth].p_ext;
        /* No extent found: nothing to shift, return 0. */
        if (!extent)
                goto out;

        /* Starting logical block of the last extent. */
        stop = le32_to_cpu(extent->ee_block);

       /*
        * For left shifts, make sure the hole on the left is big enough to
        * accommodate the shift.  For right shifts, make sure the last extent
        * won't be shifted beyond EXT_MAX_BLOCKS.
        */
        if (SHIFT == SHIFT_LEFT) {
                path = ext4_find_extent(inode, start - 1, &path,
                                        EXT4_EX_NOCACHE);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent =  path[depth].p_ext;
                if (extent) {
                        ex_start = le32_to_cpu(extent->ee_block);
                        ex_end = le32_to_cpu(extent->ee_block) +
                                ext4_ext_get_actual_len(extent);
                } else {
                        ex_start = 0;
                        ex_end = 0;
                }

                if ((start == ex_start && shift > ex_start) ||
                    (shift > start - ex_end)) {
                        ret = -EINVAL;
                        goto out;
                }
        } else {
                if (shift > EXT_MAX_BLOCKS -
                    (stop + ext4_ext_get_actual_len(extent))) {
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * In case of left shift, iterator points to start and it is increased
         * till we reach stop. In case of right shift, iterator points to stop
         * and it is decreased till we reach start.
         */
again:
        ret = 0;
        if (SHIFT == SHIFT_LEFT)
                iterator = &start;
        else
                iterator = &stop;

        /* Retrying after -EAGAIN: resume from the position saved in tmp. */
        if (tmp != EXT_MAX_BLOCKS)
                *iterator = tmp;

        /*
         * It is safe to start updating extents.  For a right shift, iterator
         * is set to NULL once the leaf containing @start has been handled,
         * which ends the loop.
         */
        while (iterator && start <= stop) {
                path = ext4_find_extent(inode, *iterator, &path,
                                        EXT4_EX_NOCACHE);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) *iterator);
                        ret = -EFSCORRUPTED;
                        goto out;
                }
                if (SHIFT == SHIFT_LEFT && *iterator >
                    le32_to_cpu(extent->ee_block)) {
                        /* Hole, move to the next extent */
                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                path[depth].p_ext++;
                        } else {
                                *iterator = ext4_ext_next_allocated_block(path);
                                continue;
                        }
                }

                /* Remember where this pass started in case of -EAGAIN. */
                tmp = *iterator;
                if (SHIFT == SHIFT_LEFT) {
                        /* Next pass starts right after this leaf's last extent. */
                        extent = EXT_LAST_EXTENT(path[depth].p_hdr);
                        *iterator = le32_to_cpu(extent->ee_block) +
                                        ext4_ext_get_actual_len(extent);
                } else {
                        extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
                        if (le32_to_cpu(extent->ee_block) > start)
                                /* whole leaf is shifted; continue to its left */
                                *iterator = le32_to_cpu(extent->ee_block) - 1;
                        else if (le32_to_cpu(extent->ee_block) == start)
                                /* leaf begins exactly at start: last pass */
                                iterator = NULL;
                        else {
                                /*
                                 * Leaf begins before start: walk back from
                                 * the last extent to the last one that
                                 * begins before start.
                                 */
                                extent = EXT_LAST_EXTENT(path[depth].p_hdr);
                                while (le32_to_cpu(extent->ee_block) >= start)
                                        extent--;

                                /* every extent of the leaf begins before start */
                                if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
                                        break;

                                /* first extent beginning at or after start */
                                extent++;
                                iterator = NULL;
                        }
                        path[depth].p_ext = extent;
                }
                ret = ext4_ext_shift_path_extents(path, shift, inode,
                                handle, SHIFT);
                /* iterator can be NULL which means we should break */
                if (ret == -EAGAIN)
                        goto again;
                if (ret)
                        break;
        }
out:
        ext4_ext_drop_refs(path);
        kfree(path);
        return ret;
}

/*
 * ext4_collapse_range:
 * This implements fallocate's collapse range functionality for ext4.
 * The blocks covering [@offset, @offset + @len) are removed, the extents
 * that follow are shifted left over the gap, and the inode size is reduced
 * by @len bytes.
 *
 * Returns 0 on success, non-zero on error:
 *   -EOPNOTSUPP  the inode is not extent based
 *   -EINVAL      @offset or @len is not cluster aligned, or the range
 *                reaches or passes i_size
 *   other        error propagated from a helper
 */
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t punch_start, punch_stop;
        handle_t *handle;
        unsigned int credits;
        loff_t new_size, ioffset;
        int ret;

        /*
         * We need to test this early because xfstests assumes that a
         * collapse range of (0, 1) will return EOPNOTSUPP if the file
         * system does not support collapse range.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return -EOPNOTSUPP;

        /* Collapse range works only on fs cluster size aligned regions. */
        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
                return -EINVAL;

        trace_ext4_collapse_range(inode, offset, len);

        /* Logical block range [punch_start, punch_stop) to be removed. */
        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);

        /* Call ext4_force_commit to flush all data in case of data=journal. */
        if (ext4_should_journal_data(inode)) {
                ret = ext4_force_commit(inode->i_sb);
                if (ret)
                        return ret;
        }

        inode_lock(inode);
        /*
         * There is no need to overlap collapse range with EOF, in which case
         * it is effectively a truncate operation
         */
        if (offset + len >= inode->i_size) {
                ret = -EINVAL;
                goto out_mutex;
        }

        /* Currently just for extent based files */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* Wait for existing dio to complete */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        down_write(&EXT4_I(inode)->i_mmap_sem);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_mmap;

        /*
         * Need to round down offset to be aligned with page size boundary
         * for page size > block size.
         */
        ioffset = round_down(offset, PAGE_SIZE);
        /*
         * Write tail of the last page before removed range since it will get
         * removed from the page cache below.
         */
        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
        if (ret)
                goto out_mmap;
        /*
         * Write data that will be shifted to preserve them when discarding
         * page cache below. We are also protected from pages becoming dirty
         * by i_mmap_sem.
         */
        ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
                                           LLONG_MAX);
        if (ret)
                goto out_mmap;
        truncate_pagecache(inode, ioffset);

        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_mmap;
        }
        ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);

        /*
         * i_data_sem is held for write across the removal and the shift.
         * Each error path below releases it before jumping to out_stop.
         */
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode, 0);

        /* Drop extent status entries from punch_start onwards. */
        ret = ext4_es_remove_extent(inode, punch_start,
                                    EXT_MAX_BLOCKS - punch_start);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }

        /* Remove the blocks of the collapsed range (end block inclusive). */
        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }
        ext4_discard_preallocations(inode, 0);

        /* Move everything from punch_stop onwards left over the gap. */
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
                                     punch_stop - punch_start, SHIFT_LEFT);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }

        /* The file is now @len bytes shorter, in memory and on disk. */
        new_size = inode->i_size - len;
        i_size_write(inode, new_size);
        EXT4_I(inode)->i_disksize = new_size;

        up_write(&EXT4_I(inode)->i_data_sem);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode->i_mtime = inode->i_ctime = current_time(inode);
        ret = ext4_mark_inode_dirty(handle, inode);
        ext4_update_inode_fsync_trans(handle, inode, 1);

        /* Unwind in the reverse order of acquisition. */
out_stop:
        ext4_journal_stop(handle);
        ext4_fc_stop_ineligible(sb);
out_mmap:
        up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * ext4_insert_range:
 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
 * The data blocks starting from @offset to the EOF are shifted by @len
 * towards right to create a hole in the @inode. Inode size is increased
 * by len bytes.
 *
 * Returns 0 on success, error otherwise:
 *   -EOPNOTSUPP  the inode is not extent based
 *   -EINVAL      @offset or @len is not cluster aligned, or @offset is at
 *                or beyond i_size
 *   -EFBIG       growing the file by @len would exceed s_maxbytes
 *   other        error propagated from a helper
 */
static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        handle_t *handle;
        struct ext4_ext_path *path;
        struct ext4_extent *extent;
        ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
        unsigned int credits, ee_len;
        int ret = 0, depth, split_flag = 0;
        loff_t ioffset;

        /*
         * We need to test this early because xfstests assumes that an
         * insert range of (0, 1) will return EOPNOTSUPP if the file
         * system does not support insert range.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return -EOPNOTSUPP;

        /* Insert range works only on fs cluster size aligned regions. */
        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
                return -EINVAL;

        trace_ext4_insert_range(inode, offset, len);

        /* Insertion point and hole length expressed in logical blocks. */
        offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
        len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);

        /* Call ext4_force_commit to flush all data in case of data=journal */
        if (ext4_should_journal_data(inode)) {
                ret = ext4_force_commit(inode->i_sb);
                if (ret)
                        return ret;
        }

        inode_lock(inode);
        /* Currently just for extent based files */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* Check whether the maximum file size would be exceeded */
        if (len > inode->i_sb->s_maxbytes - inode->i_size) {
                ret = -EFBIG;
                goto out_mutex;
        }

        /* Offset must be less than i_size */
        if (offset >= inode->i_size) {
                ret = -EINVAL;
                goto out_mutex;
        }

        /* Wait for existing dio to complete */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        down_write(&EXT4_I(inode)->i_mmap_sem);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_mmap;

        /*
         * Need to round down to align start offset to page size boundary
         * for page size > block size.
         */
        ioffset = round_down(offset, PAGE_SIZE);
        /* Write out all dirty pages */
        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
                        LLONG_MAX);
        if (ret)
                goto out_mmap;
        truncate_pagecache(inode, ioffset);

        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_mmap;
        }
        ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);

        /* Expand file to avoid data loss if there is error while shifting */
        inode->i_size += len;
        EXT4_I(inode)->i_disksize += len;
        inode->i_mtime = inode->i_ctime = current_time(inode);
        ret = ext4_mark_inode_dirty(handle, inode);
        if (ret)
                goto out_stop;

        /*
         * i_data_sem is held for write from here until after the shift.
         * Each error path below releases it before jumping to out_stop.
         */
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode, 0);

        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
        if (IS_ERR(path)) {
                up_write(&EXT4_I(inode)->i_data_sem);
                ret = PTR_ERR(path);
                goto out_stop;
        }

        depth = ext_depth(inode);
        extent = path[depth].p_ext;
        if (extent) {
                ee_start_lblk = le32_to_cpu(extent->ee_block);
                ee_len = ext4_ext_get_actual_len(extent);

                /*
                 * If offset_lblk is not the starting block of extent, split
                 * the extent @offset_lblk
                 */
                if ((offset_lblk > ee_start_lblk) &&
                                (offset_lblk < (ee_start_lblk + ee_len))) {
                        /* keep both halves unwritten if the extent was */
                        if (ext4_ext_is_unwritten(extent))
                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
                                        EXT4_EXT_MARK_UNWRIT2;
                        ret = ext4_split_extent_at(handle, inode, &path,
                                        offset_lblk, split_flag,
                                        EXT4_EX_NOCACHE |
                                        EXT4_GET_BLOCKS_PRE_IO |
                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
                }

                /* The path is released whether or not a split happened. */
                ext4_ext_drop_refs(path);
                kfree(path);
                if (ret < 0) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto out_stop;
                }
        } else {
                ext4_ext_drop_refs(path);
                kfree(path);
        }

        /* Drop extent status entries from offset_lblk onwards. */
        ret = ext4_es_remove_extent(inode, offset_lblk,
                        EXT_MAX_BLOCKS - offset_lblk);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }

        /*
         * if offset_lblk lies in a hole which is at start of file, use
         * ee_start_lblk to shift extents
         */
        ret = ext4_ext_shift_extents(inode, handle,
                ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
                len_lblk, SHIFT_RIGHT);

        up_write(&EXT4_I(inode)->i_data_sem);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);

        /* Unwind in the reverse order of acquisition. */
out_stop:
        ext4_journal_stop(handle);
        ext4_fc_stop_ineligible(sb);
out_mmap:
        up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/**
 * ext4_swap_extents() - Swap extents between two inodes
 * @handle: handle for this transaction
 * @inode1:        First inode
 * @inode2:        Second inode
 * @lblk1:        Start block for first inode
 * @lblk2:        Start block for second inode
 * @count:        Number of blocks to swap
 * @unwritten: Mark second inode's extents as unwritten after swap
 * @erp:        Pointer to save error value
 *
 * This helper routine does exactly what it promises: "swap extents". All
 * other stuff such as page-cache locking consistency, bh mapping consistency
 * or extent's data copying must be performed by caller.
 * Locking:
 *                 i_mutex is held for both inodes
 *                 i_data_sem is locked for write for both inodes
 * Assumptions:
 *                All pages from requested range are locked for both inodes
 *
 * Return: number of blocks actually swapped. If an error occurs it is
 * stored in *@erp and the number of blocks swapped up to that point is
 * returned.
 */
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
                  ext4_lblk_t count, int unwritten, int *erp)
{
        struct ext4_ext_path *path1 = NULL;
        struct ext4_ext_path *path2 = NULL;
        int replaced_count = 0;

        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
        BUG_ON(!inode_is_locked(inode1));
        BUG_ON(!inode_is_locked(inode2));

        /* Drop extent status entries for both ranges before touching them. */
        *erp = ext4_es_remove_extent(inode1, lblk1, count);
        if (unlikely(*erp))
                return 0;
        *erp = ext4_es_remove_extent(inode2, lblk2, count);
        if (unlikely(*erp))
                return 0;

        /*
         * Each iteration looks both paths up afresh and then either skips a
         * hole, splits an extent so that boundaries line up (and retries),
         * or swaps one pair of equally sized extents.
         */
        while (count) {
                struct ext4_extent *ex1, *ex2, tmp_ex;
                ext4_lblk_t e1_blk, e2_blk;
                int e1_len, e2_len, len;
                int split = 0;

                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path1)) {
                        *erp = PTR_ERR(path1);
                        path1 = NULL;
                /*
                 * Common exit: zeroing count ends the loop after the paths
                 * are released at "repeat".  Reached by goto from below.
                 */
                finish:
                        count = 0;
                        goto repeat;
                }
                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path2)) {
                        *erp = PTR_ERR(path2);
                        path2 = NULL;
                        goto finish;
                }
                ex1 = path1[path1->p_depth].p_ext;
                ex2 = path2[path2->p_depth].p_ext;
                /* Do we have something to swap ? */
                if (unlikely(!ex2 || !ex1))
                        goto finish;

                e1_blk = le32_to_cpu(ex1->ee_block);
                e2_blk = le32_to_cpu(ex2->ee_block);
                e1_len = ext4_ext_get_actual_len(ex1);
                e2_len = ext4_ext_get_actual_len(ex2);

                /* Hole handling */
                if (!in_range(lblk1, e1_blk, e1_len) ||
                    !in_range(lblk2, e2_blk, e2_len)) {
                        ext4_lblk_t next1, next2;

                        /* if hole after extent, then go to next extent */
                        next1 = ext4_ext_next_allocated_block(path1);
                        next2 = ext4_ext_next_allocated_block(path2);
                        /* If hole before extent, then shift to that extent */
                        if (e1_blk > lblk1)
                                next1 = e1_blk;
                        if (e2_blk > lblk2)
                                next2 = e2_blk;
                        /* Do we have something to swap */
                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
                                goto finish;
                        /* Move to the rightmost boundary */
                        len = next1 - lblk1;
                        if (len < next2 - lblk2)
                                len = next2 - lblk2;
                        if (len > count)
                                len = count;
                        /* skipped blocks are not added to replaced_count */
                        lblk1 += len;
                        lblk2 += len;
                        count -= len;
                        goto repeat;
                }

                /* Prepare left boundary */
                if (e1_blk < lblk1) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode1,
                                                &path1, lblk1, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                if (e2_blk < lblk2) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2,  lblk2, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;

                /* Prepare right boundary */
                len = count;
                if (len > e1_blk + e1_len - lblk1)
                        len = e1_blk + e1_len - lblk1;
                if (len > e2_blk + e2_len - lblk2)
                        len = e2_blk + e2_len - lblk2;

                if (len != e1_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode1,
                                                &path1, lblk1 + len, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                if (len != e2_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2, lblk2 + len, 0);
                        if (*erp)
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;

                /* No split was needed, so both extents are exactly len long. */
                BUG_ON(e2_len != e1_len);
                *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
                if (unlikely(*erp))
                        goto finish;

                /* Both extents are fully inside boundaries. Swap it now */
                tmp_ex = *ex1;
                ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
                ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
                ex1->ee_len = cpu_to_le16(e2_len);
                ex2->ee_len = cpu_to_le16(e1_len);
                /*
                 * ex2 is marked unwritten only on the caller's request; ex1
                 * is marked unwritten if it was unwritten before the swap
                 * (tmp_ex holds its original contents).
                 */
                if (unwritten)
                        ext4_ext_mark_unwritten(ex2);
                if (ext4_ext_is_unwritten(&tmp_ex))
                        ext4_ext_mark_unwritten(ex1);

                ext4_ext_try_to_merge(handle, inode2, path2, ex2);
                ext4_ext_try_to_merge(handle, inode1, path1, ex1);
                *erp = ext4_ext_dirty(handle, inode2, path2 +
                                      path2->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_dirty(handle, inode1, path1 +
                                      path1->p_depth);
                /*
                 * This looks scary: the second inode already points to the
                 * new blocks and was successfully dirtied. Luckily an error
                 * here can only be a journal error, so the whole transaction
                 * will be aborted anyway.
                 */
                if (unlikely(*erp))
                        goto finish;
                lblk1 += len;
                lblk2 += len;
                replaced_count += len;
                count -= len;

        /* Release both paths; they are looked up again on the next pass. */
        repeat:
                ext4_ext_drop_refs(path1);
                kfree(path1);
                ext4_ext_drop_refs(path2);
                kfree(path2);
                path1 = path2 = NULL;
        }
        return replaced_count;
}

/*
 * ext4_clu_mapped - report whether a logical cluster has any mapped block
 *
 * @inode - file that owns the logical cluster
 * @lclu - logical cluster to look up
 *
 * Looks up the extent nearest to the first block of @lclu.  Returns 1 when
 * some block of the cluster is covered by an extent (so a physical cluster
 * has been allocated for it), 0 when none is, and a negative error code if
 * the lookup fails or the leaf is found to be inconsistent.  Derived from
 * ext4_ext_map_blocks().
 */
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_ext_path *path;
        struct ext4_extent *ex;
        ext4_lblk_t ex_start, ex_last, next_start;
        int depth, ret = 0;

        /*
         * A file whose data is (or may be) stored inline has no extents
         * and no allocated physical clusters, so nothing can be mapped.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
            ext4_has_inline_data(inode))
                return 0;

        /* find the extent closest to the first block of the cluster */
        path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
        if (IS_ERR(path)) {
                ret = PTR_ERR(path);
                path = NULL;
                goto out;
        }

        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        /*
         * An empty leaf below the root is only legal _during_ a tree
         * modification, which is why ext4_find_extent() cannot assert on
         * it; seeing one here means the tree is corrupted.
         */
        if (unlikely(ex == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode,
                    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
                                 (unsigned long) EXT4_C2B(sbi, lclu),
                                 depth, path[depth].p_block);
                ret = -EFSCORRUPTED;
                goto out;
        }

        /* an empty extent tree cannot map anything */
        if (ex == NULL)
                goto out;

        /*
         * The extent found either spans the target cluster, lies to its
         * left, or lies to its right.  An extent to the right means the
         * cluster is not mapped; the other two cases are sorted out below.
         */
        ex_start = le32_to_cpu(ex->ee_block);
        if (lclu < EXT4_B2C(sbi, ex_start))
                goto out;

        ex_last = ex_start + ext4_ext_get_actual_len(ex) - 1;
        if (lclu <= EXT4_B2C(sbi, ex_last)) {
                /* the extent reaches into the target cluster */
                ret = 1;
        } else {
                /* extent ends before it: check where the next one begins */
                next_start = ext4_ext_next_allocated_block(path);
                if (lclu == EXT4_B2C(sbi, next_start))
                        ret = 1;
        }

out:
        ext4_ext_drop_refs(path);
        kfree(path);

        return ret;
}

/*
 * Set the physical block address and the unwritten state of the extent
 * covering exactly [start, start + len).  If the extent found at @start
 * does not have precisely those bounds, it is split (first at @start and,
 * if still needed, at @start + @len) until one does.  This function is
 * called in the fast commit replay path.  Returns 0 on success and an
 * error on failure.
 */
int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                              int len, int unwritten, ext4_fsblk_t pblk)
{
        struct ext4_ext_path *path;
        struct ext4_extent *ex;
        int ret;

        path = ext4_find_extent(inode, start, NULL, 0);
        if (IS_ERR(path))
                return PTR_ERR(path);

        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ret = -EFSCORRUPTED;
                goto out;
        }

        /* Nothing to split when the extent already has the wanted bounds */
        if (le32_to_cpu(ex->ee_block) == start &&
            ext4_ext_get_actual_len(ex) == len)
                goto update;

        /* Make an extent boundary at @start, then look the extent up again */
        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (ret)
                goto out;

        path = ext4_find_extent(inode, start, &path, 0);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        WARN_ON(le32_to_cpu(ex->ee_block) != start);

        if (ext4_ext_get_actual_len(ex) == len)
                goto update;

        /* Still too long: make a second boundary at @start + @len */
        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_force_split_extent_at(NULL, inode, &path,
                                         start + len, 1);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (ret)
                goto out;

        path = ext4_find_extent(inode, start, &path, 0);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;

update:
        if (unwritten)
                ext4_ext_mark_unwritten(ex);
        else
                ext4_ext_mark_initialized(ex);
        ext4_ext_store_pblock(ex, pblk);

        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
        up_write(&EXT4_I(inode)->i_data_sem);

out:
        ext4_ext_drop_refs(path);
        kfree(path);
        ext4_mark_inode_dirty(NULL, inode);
        return ret;
}

/*
 * Try to shrink the extent tree: visit the extents found for logical
 * blocks below @end, give each one a chance to merge with its neighbours
 * and write the affected leaf and the inode back.  Lookup errors end the
 * walk silently.
 */
void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t cur = 0, next;

        while (cur < end) {
                path = ext4_find_extent(inode, cur, NULL, 0);
                if (IS_ERR(path))
                        return;

                ex = path[path->p_depth].p_ext;
                if (!ex) {
                        /* no extent here: nothing further to merge */
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        ext4_mark_inode_dirty(NULL, inode);
                        return;
                }

                /*
                 * Continue right after this extent, but always move
                 * forward by at least one block so the loop terminates.
                 */
                next = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
                cur = (next > cur) ? next : cur + 1;

                ext4_ext_try_to_merge(NULL, inode, path, ex);

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
                up_write(&EXT4_I(inode)->i_data_sem);

                ext4_mark_inode_dirty(NULL, inode);
                ext4_ext_drop_refs(path);
                kfree(path);
        }
}

/*
 * If the block at *cur is not mapped, advance *cur past the hole.
 * Returns a negative error from ext4_map_blocks(), otherwise 0.
 */
static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
        struct ext4_map_blocks map;
        int mapped;

        /* query from *cur up to the block count derived from i_size */
        map.m_lblk = *cur;
        map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;

        mapped = ext4_map_blocks(NULL, inode, &map, 0);
        if (mapped < 0)
                return mapped;

        /*
         * Nothing mapped at *cur.  Presumably ext4_map_blocks() leaves the
         * length of the hole in m_len - confirm against its contract.
         */
        if (mapped == 0)
                *cur += map.m_len;

        return 0;
}

/*
 * Count number of blocks used by this inode and update i_blocks.
 *
 * The count is the sum of the mapped data blocks and the extent tree
 * blocks reachable from the inode.  It is accumulated in a 64-bit
 * unsigned counter: the conversion to 512-byte units below shifts the
 * count left, which would overflow a signed int for files of 1 TiB and
 * larger (2^28 blocks of 4 KiB).
 *
 * Returns a negative error if the initial extent lookup fails and 0
 * otherwise; errors met later only cut the counting short.
 */
int ext4_ext_replay_set_iblocks(struct inode *inode)
{
        struct ext4_ext_path *path = NULL, *path2 = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t cur = 0, end;
        unsigned long long numblks = 0;
        int i, ret = 0;
        ext4_fsblk_t cmp1, cmp2;
        struct ext4_map_blocks map;

        /* Determine the size of the file first */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                        EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                /* empty extent tree: i_blocks is set to 0 below */
                ext4_ext_drop_refs(path);
                kfree(path);
                goto out;
        }
        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
        ext4_ext_drop_refs(path);
        kfree(path);

        /* Count the number of data blocks */
        cur = 0;
        while (cur < end) {
                map.m_lblk = cur;
                map.m_len = end - cur;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        break;
                if (ret > 0)
                        numblks += ret;
                cur = cur + map.m_len;
        }

        /*
         * Count the number of extent tree blocks. We do it by looking up
         * two successive extents and determining the difference between
         * their paths. When path is different for 2 successive extents
         * we compare the blocks in the path at each level and increment
         * iblocks by total number of differences found.
         */
        cur = 0;
        ret = skip_hole(inode, &cur);
        if (ret < 0)
                goto out;
        path = ext4_find_extent(inode, cur, NULL, 0);
        if (IS_ERR(path))
                goto out;
        /* the path to the first extent contributes one block per level */
        numblks += path->p_depth;
        ext4_ext_drop_refs(path);
        kfree(path);
        while (cur < end) {
                path = ext4_find_extent(inode, cur, NULL, 0);
                if (IS_ERR(path))
                        break;
                ex = path[path->p_depth].p_ext;
                if (!ex) {
                        /* NOTE: leaves here without updating i_blocks */
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        return 0;
                }
                /* step past this extent, always advancing by >= 1 block */
                cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
                                        ext4_ext_get_actual_len(ex));
                ret = skip_hole(inode, &cur);
                if (ret < 0) {
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        break;
                }
                path2 = ext4_find_extent(inode, cur, NULL, 0);
                if (IS_ERR(path2)) {
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        break;
                }
                /* count the levels at which the next path uses a new block */
                for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
                        cmp1 = cmp2 = 0;
                        if (i <= path->p_depth)
                                cmp1 = path[i].p_bh ?
                                        path[i].p_bh->b_blocknr : 0;
                        if (i <= path2->p_depth)
                                cmp2 = path2[i].p_bh ?
                                        path2[i].p_bh->b_blocknr : 0;
                        if (cmp1 != cmp2 && cmp2 != 0)
                                numblks++;
                }
                ext4_ext_drop_refs(path);
                ext4_ext_drop_refs(path2);
                kfree(path);
                kfree(path2);
        }

out:
        /* convert filesystem blocks to 512-byte units, in 64-bit arithmetic */
        inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
        ext4_mark_inode_dirty(NULL, inode);
        return 0;
}

/*
 * Walk every mapped range of @inode and call ext4_mb_mark_bb() with state
 * 0 plus ext4_fc_record_regions() on the blocks recorded in the extent
 * path above the range and on the range's data blocks.  Presumably this
 * clears those blocks in the block bitmap for fast commit replay - confirm
 * against the callers.
 *
 * Returns a negative error if the initial extent lookup fails, else 0.
 */
int ext4_ext_clear_bb(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        struct ext4_map_blocks map;
        ext4_lblk_t cur, end;
        int lvl, ret = 0;

        /* Determine the size of the file first */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                        EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ext4_ext_drop_refs(path);
                kfree(path);
                return 0;
        }
        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
        ext4_ext_drop_refs(path);
        kfree(path);

        for (cur = 0; cur < end; cur += map.m_len) {
                map.m_lblk = cur;
                map.m_len = end - cur;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        break;
                if (ret == 0)
                        continue;

                /* first the blocks recorded on the path to this range ... */
                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
                if (!IS_ERR_OR_NULL(path)) {
                        for (lvl = 0; lvl < path->p_depth; lvl++) {
                                ext4_mb_mark_bb(sb, path[lvl].p_block, 1, 0);
                                ext4_fc_record_regions(sb, inode->i_ino,
                                                0, path[lvl].p_block, 1, 1);
                        }
                        ext4_ext_drop_refs(path);
                        kfree(path);
                }

                /* ... then the data blocks of the range itself */
                ext4_mb_mark_bb(sb, map.m_pblk, map.m_len, 0);
                ext4_fc_record_regions(sb, inode->i_ino,
                                map.m_lblk, map.m_pblk, map.m_len, 1);
        }

        return 0;
}

































































    1 




    1 
    1 




    1 




    1 





    1 
    1 

    1 
    1 


















    1 
    1 








    1 

    1 
    1 








    1 













































































    1 












    1 




    1 


    1 

    1 







    1 
    1 









    1 


    1 







    1 











    1 




    1 
    1 


















    1 

    1 



    1 


    1 




    1 














    1 
    1 

    1 





    1 

    1 












































    1 









    1 



































    1 



    1 







    1 
    1 





    1 













    1 






    1 



    1 







    1 











    1 










    1 





    1 










    1 









    1 

    1 









    1 








    1 


    1 



















    1 



    1 



    1 



    1 

    1 



















    1 


    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h.
 *
 * The constants are unsigned: they are stored in the unsigned
 * iomap_dio::flags field, and shifting a signed 1 into bit 31 is undefined
 * behaviour that yields a negative value which sign-extends when widened.
 */
#define IOMAP_DIO_WRITE_FUA        (1U << 28)
#define IOMAP_DIO_NEED_SYNC        (1U << 29)
#define IOMAP_DIO_WRITE                (1U << 30)
#define IOMAP_DIO_DIRTY                (1U << 31)

/*
 * State of one direct I/O request, shared between the submission path and
 * the bio completion handlers.
 */
struct iomap_dio {
        /* the kiocb this request was issued for */
        struct kiocb                *iocb;
        /* optional filesystem hooks (->submit_io, ->end_io); may be NULL */
        const struct iomap_dio_ops *dops;
        /* i_size sampled at setup; used to clamp reads that cross it */
        loff_t                        i_size;
        /* number of bytes handled so far, accumulated by the actors */
        loff_t                        size;
        /* starts at 1 and is raised once for every bio submitted */
        atomic_t                ref;
        /* IOMAP_DIO_* flags */
        unsigned                flags;
        /* first error recorded; set with cmpxchg by iomap_dio_set_error() */
        int                        error;
        /* when set, the last bio completion wakes submit.waiter */
        bool                        wait_for_completion;

        union {
                /* used during submission and for synchronous completion: */
                struct {
                        struct iov_iter                *iter;
                        struct task_struct        *waiter;
                        struct request_queue        *last_queue;
                        blk_qc_t                cookie;
                } submit;

                /* used for aio completion: */
                struct {
                        struct work_struct        work;
                } aio;
        };
};

/*
 * Poll for completion of a polled direct I/O request.  The request queue
 * is read from kiocb->private; when none has been recorded there is
 * nothing to poll and 0 is returned, otherwise the result of blk_poll().
 */
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
        struct request_queue *queue = READ_ONCE(kiocb->private);

        return queue ? blk_poll(queue, READ_ONCE(kiocb->ki_cookie), spin) : 0;
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);

/*
 * Submit one bio on behalf of @dio.  A reference on the dio is taken for
 * the bio, the target queue is remembered in submit.last_queue and the
 * cookie returned by the submission is stored in submit.cookie.  The
 * filesystem's ->submit_io hook is used when provided, submit_bio()
 * otherwise.
 */
static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
                struct bio *bio, loff_t pos)
{
        struct kiocb *iocb = dio->iocb;
        const struct iomap_dio_ops *dops = dio->dops;

        atomic_inc(&dio->ref);

        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(bio, iocb);

        dio->submit.last_queue = bdev_get_queue(iomap->bdev);

        if (!dops || !dops->submit_io) {
                dio->submit.cookie = submit_bio(bio);
                return;
        }

        dio->submit.cookie = dops->submit_io(file_inode(iocb->ki_filp),
                                             iomap, bio, pos);
}

ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
        const struct iomap_dio_ops *dops = dio->dops;
        struct kiocb *iocb = dio->iocb;
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t offset = iocb->ki_pos;
        ssize_t ret = dio->error;

        if (dops && dops->end_io)
                ret = dops->end_io(iocb, dio->size, ret, dio->flags);

        if (likely(!ret)) {
                ret = dio->size;
                /* check for short read */
                if (offset + ret > dio->i_size &&
                    !(dio->flags & IOMAP_DIO_WRITE))
                        ret = dio->i_size - offset;
        }

        /*
         * Try again to invalidate clean pages which might have been cached by
         * non-direct readahead, or faulted in by get_user_pages() if the source
         * of the write was an mmap'ed region of the file we're writing.  Either
         * one is a pretty crazy thing to do, so we don't support it 100%.  If
         * this invalidation fails, tough, the write still worked...
         *
         * And this page cache invalidation has to be after ->end_io(), as some
         * filesystems convert unwritten extents to real allocations in
         * ->end_io() when necessary, otherwise a racing buffer read would cache
         * zeros from unwritten extents.
         */
        if (!dio->error && dio->size &&
            (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
                int err;
                err = invalidate_inode_pages2_range(inode->i_mapping,
                                offset >> PAGE_SHIFT,
                                (offset + dio->size - 1) >> PAGE_SHIFT);
                if (err)
                        dio_warn_stale_pagecache(iocb->ki_filp);
        }

        inode_dio_end(file_inode(iocb->ki_filp));

        if (ret > 0) {
                iocb->ki_pos += ret;

                /*
                 * If this is a DSYNC write, make sure we push it to stable
                 * storage now that we've written data.
                 */
                if (dio->flags & IOMAP_DIO_NEED_SYNC)
                        ret = generic_write_sync(iocb, ret);
        }
        kfree(dio);
        return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);

static void iomap_dio_complete_work(struct work_struct *work)
{
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
        struct kiocb *iocb = dio->iocb;

        iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
        cmpxchg(&dio->error, 0, ret);
}

/*
 * bi_end_io handler for every bio submitted on behalf of a dio.  Records
 * the bio's error (if any) in the dio, drops the reference taken for the
 * bio and, when that was the last reference, either wakes the waiting
 * submitter or completes the request itself.  Finally releases the bio's
 * pages and the bio.
 */
static void iomap_dio_bio_end_io(struct bio *bio)
{
        struct iomap_dio *dio = bio->bi_private;
        /*
         * Sample the flag up front: once the last reference is dropped
         * below, the completion path may kfree() the dio, so it must not
         * be dereferenced afterwards.
         */
        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

        if (bio->bi_status)
                iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

        if (atomic_dec_and_test(&dio->ref)) {
                if (dio->wait_for_completion) {
                        /* hand over to the task recorded at submission */
                        struct task_struct *waiter = dio->submit.waiter;
                        WRITE_ONCE(dio->submit.waiter, NULL);
                        blk_wake_io_task(waiter);
                } else if (dio->flags & IOMAP_DIO_WRITE) {
                        struct inode *inode = file_inode(dio->iocb->ki_filp);

                        /* writes are completed from the s_dio_done_wq workqueue */
                        INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
                        queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
                } else {
                        /* reads are completed right here, in this context */
                        iomap_dio_complete_work(&dio->aio.work);
                }
        }

        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}

/*
 * Submit a synchronous write of @len bytes of the zero page at file
 * position @pos.  Used to zero the parts of a block that a sub-block
 * write does not cover.
 */
static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
                unsigned len)
{
        struct page *zero_page = ZERO_PAGE(0);
        struct bio *zero_bio = bio_alloc(GFP_KERNEL, 1);

        bio_set_dev(zero_bio, iomap->bdev);
        zero_bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
        zero_bio->bi_end_io = iomap_dio_bio_end_io;
        zero_bio->bi_private = dio;

        /* take a page reference for the bio's single segment */
        get_page(zero_page);
        __bio_add_page(zero_bio, zero_page, len, 0);
        bio_set_op_attrs(zero_bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);

        iomap_dio_submit_bio(dio, iomap, zero_bio, pos);
}

/*
 * Issue the bios for the part of a direct I/O request that falls into one
 * mapped (or, for writes, unwritten) extent.
 *
 * @pos and @length describe the range within @iomap; the user buffer is
 * taken from dio->submit.iter, which is temporarily truncated to @length.
 * Head and tail of a partially written block are zeroed with
 * iomap_dio_zero() when the extent type requires it.
 *
 * Returns the number of bytes for which bios were submitted if that is
 * non-zero; otherwise 0 or a negative error (-EINVAL for a request that is
 * not aligned to the device's logical block size).
 */
static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
                struct iomap_dio *dio, struct iomap *iomap)
{
        unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
        unsigned int fs_block_size = i_blocksize(inode), pad;
        unsigned int align = iov_iter_alignment(dio->submit.iter);
        struct bio *bio;
        bool need_zeroout = false;
        bool use_fua = false;
        int nr_pages, ret = 0;
        size_t copied = 0;
        size_t orig_count;

        /* position, length and buffer must all be logical-block aligned */
        if ((pos | length | align) & ((1 << blkbits) - 1))
                return -EINVAL;

        if (iomap->type == IOMAP_UNWRITTEN) {
                dio->flags |= IOMAP_DIO_UNWRITTEN;
                need_zeroout = true;
        }

        if (iomap->flags & IOMAP_F_SHARED)
                dio->flags |= IOMAP_DIO_COW;

        if (iomap->flags & IOMAP_F_NEW) {
                need_zeroout = true;
        } else if (iomap->type == IOMAP_MAPPED) {
                /*
                 * Use a FUA write if we need datasync semantics, this is a pure
                 * data IO that doesn't require any metadata updates (including
                 * after IO completion such as unwritten extent conversion) and
                 * the underlying device supports FUA. This allows us to avoid
                 * cache flushes on IO completion.
                 */
                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
                    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
                    blk_queue_fua(bdev_get_queue(iomap->bdev)))
                        use_fua = true;
        }

        /*
         * Save the original count and trim the iter to just the extent we
         * are operating on right now.  The iter will be re-expanded once
         * we are done.
         */
        orig_count = iov_iter_count(dio->submit.iter);
        iov_iter_truncate(dio->submit.iter, length);

        nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
        if (nr_pages <= 0) {
                /* nothing to submit: pass 0 or the negative value back */
                ret = nr_pages;
                goto out;
        }

        if (need_zeroout) {
                /* zero out from the start of the block to the write offset */
                pad = pos & (fs_block_size - 1);
                if (pad)
                        iomap_dio_zero(dio, iomap, pos - pad, pad);
        }

        do {
                size_t n;

                /*
                 * If completions already occurred and reported errors, give up now and
                 * don't bother submitting more bios.
                 */
                if (unlikely(data_race(dio->error))) {
                        ret = 0;
                        goto out;
                }

                bio = bio_alloc(GFP_KERNEL, nr_pages);
                bio_set_dev(bio, iomap->bdev);
                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
                bio->bi_write_hint = dio->iocb->ki_hint;
                bio->bi_ioprio = dio->iocb->ki_ioprio;
                bio->bi_private = dio;
                bio->bi_end_io = iomap_dio_bio_end_io;

                ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
                if (unlikely(ret)) {
                        /*
                         * We have to stop part way through an IO. We must fall
                         * through to the sub-block tail zeroing here, otherwise
                         * this short IO may expose stale data in the tail of
                         * the block we haven't written data to.
                         */
                        bio_put(bio);
                        goto zero_tail;
                }

                n = bio->bi_iter.bi_size;
                if (dio->flags & IOMAP_DIO_WRITE) {
                        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
                        if (use_fua)
                                bio->bi_opf |= REQ_FUA;
                        else
                                /* one non-FUA write rules FUA out for the whole dio */
                                dio->flags &= ~IOMAP_DIO_WRITE_FUA;
                        task_io_account_write(n);
                } else {
                        bio->bi_opf = REQ_OP_READ;
                        if (dio->flags & IOMAP_DIO_DIRTY)
                                bio_set_pages_dirty(bio);
                }

                dio->size += n;
                copied += n;

                /* compute the next bio's page count before this bio is submitted */
                nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
                iomap_dio_submit_bio(dio, iomap, bio, pos);
                pos += n;
        } while (nr_pages);

        /*
         * We need to zeroout the tail of a sub-block write if the extent type
         * requires zeroing or the write extends beyond EOF. If we don't zero
         * the block tail in the latter case, we can expose stale data via mmap
         * reads of the EOF block.
         */
zero_tail:
        if (need_zeroout ||
            ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
                /* zero out from the end of the write to the end of the block */
                pad = pos & (fs_block_size - 1);
                if (pad)
                        iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
        }
out:
        /* Undo iter limitation to current extent */
        iov_iter_reexpand(dio->submit.iter, orig_count - copied);
        /* progress takes precedence over a late error */
        if (copied)
                return copied;
        return ret;
}

/*
 * Satisfy a read from a range without data by zero-filling up to @length
 * bytes of the user buffer.  Returns the number of bytes actually zeroed,
 * which is also added to dio->size.
 */
static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
        const loff_t zeroed = iov_iter_zero(length, dio->submit.iter);

        dio->size += zeroed;
        return zeroed;
}

/*
 * Transfer data directly to or from an inline-data mapping.  Reads copy
 * out of iomap->inline_data.  Writes copy into it, first zero-filling any
 * gap between the current i_size and @pos, then growing i_size if needed
 * and marking the inode dirty.  Returns the number of bytes copied, which
 * is also added to dio->size.
 */
static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
                struct iomap_dio *dio, struct iomap *iomap)
{
        struct iov_iter *iter = dio->submit.iter;
        size_t copied;

        BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

        if (!(dio->flags & IOMAP_DIO_WRITE)) {
                copied = copy_to_iter(iomap->inline_data + pos, length, iter);
        } else {
                const loff_t old_size = inode->i_size;

                /* zero the gap when writing beyond the current end of file */
                if (pos > old_size)
                        memset(iomap->inline_data + old_size, 0,
                               pos - old_size);

                copied = copy_from_iter(iomap->inline_data + pos, length, iter);
                if (copied) {
                        if (pos + copied > old_size)
                                i_size_write(inode, pos + copied);
                        mark_inode_dirty(inode);
                }
        }

        dio->size += copied;
        return copied;
}

/*
 * Dispatch one mapping of a direct I/O request to the actor that matches
 * the mapping type.  Returns what that actor returns, or -EIO for
 * mappings direct I/O cannot handle (a write into a hole, a delalloc
 * extent, an unknown type).
 */
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
                void *data, struct iomap *iomap, struct iomap *srcmap)
{
        struct iomap_dio *dio = data;
        loff_t ret = -EIO;

        switch (iomap->type) {
        case IOMAP_MAPPED:
                ret = iomap_dio_bio_actor(inode, pos, length, dio, iomap);
                break;
        case IOMAP_UNWRITTEN:
                /* reads of unwritten extents are treated like holes */
                if (dio->flags & IOMAP_DIO_WRITE)
                        ret = iomap_dio_bio_actor(inode, pos, length, dio,
                                                  iomap);
                else
                        ret = iomap_dio_hole_actor(length, dio);
                break;
        case IOMAP_HOLE:
                /* a write must never be handed a hole */
                if (!WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
                        ret = iomap_dio_hole_actor(length, dio);
                break;
        case IOMAP_INLINE:
                ret = iomap_dio_inline_actor(inode, pos, length, dio, iomap);
                break;
        case IOMAP_DELALLOC:
                /*
                 * DIO is not serialised against mmap() access at all, so a
                 * page_mkwrite that happens between the writeback and the
                 * iomap_apply() call in the DIO path leaves behind a
                 * DELALLOC block that is then seen here.
                 */
                pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
                                    dio->iocb->ki_filp, current->comm);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        return ret;
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.  The
 * caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                bool wait_for_completion)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = file_inode(iocb->ki_filp);
        size_t count = iov_iter_count(iter);
        loff_t pos = iocb->ki_pos;
        loff_t end = iocb->ki_pos + count - 1, ret = 0;
        unsigned int flags = IOMAP_DIRECT;
        struct blk_plug plug;
        struct iomap_dio *dio;

        if (!count)
                return NULL;

        if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
                return ERR_PTR(-EIO);

        dio = kmalloc(sizeof(*dio), GFP_KERNEL);
        if (!dio)
                return ERR_PTR(-ENOMEM);

        dio->iocb = iocb;
        atomic_set(&dio->ref, 1);
        dio->size = 0;
        dio->i_size = i_size_read(inode);
        dio->dops = dops;
        dio->error = 0;
        dio->flags = 0;

        dio->submit.iter = iter;
        dio->submit.waiter = current;
        dio->submit.cookie = BLK_QC_T_NONE;
        dio->submit.last_queue = NULL;

        if (iov_iter_rw(iter) == READ) {
                if (pos >= dio->i_size)
                        goto out_free_dio;

                if (iter_is_iovec(iter))
                        dio->flags |= IOMAP_DIO_DIRTY;
        } else {
                flags |= IOMAP_WRITE;
                dio->flags |= IOMAP_DIO_WRITE;

                /* for data sync or sync, we need sync completion processing */
                if (iocb->ki_flags & IOCB_DSYNC)
                        dio->flags |= IOMAP_DIO_NEED_SYNC;

                /*
                 * For datasync only writes, we optimistically try using FUA for
                 * this IO.  Any non-FUA write that occurs will clear this flag,
                 * hence we know before completion whether a cache flush is
                 * necessary.
                 */
                if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
                        dio->flags |= IOMAP_DIO_WRITE_FUA;
        }

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (filemap_range_has_page(mapping, pos, end)) {
                        ret = -EAGAIN;
                        goto out_free_dio;
                }
                flags |= IOMAP_NOWAIT;
        }

        ret = filemap_write_and_wait_range(mapping, pos, end);
        if (ret)
                goto out_free_dio;

        if (iov_iter_rw(iter) == WRITE) {
                /*
                 * Try to invalidate cache pages for the range we are writing.
                 * If this invalidation fails, let the caller fall back to
                 * buffered I/O.
                 */
                if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
                                end >> PAGE_SHIFT)) {
                        trace_iomap_dio_invalidate_fail(inode, pos, count);
                        ret = -ENOTBLK;
                        goto out_free_dio;
                }

                if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
                        ret = sb_init_dio_done_wq(inode->i_sb);
                        if (ret < 0)
                                goto out_free_dio;
                }
        }

        inode_dio_begin(inode);

        blk_start_plug(&plug);
        do {
                ret = iomap_apply(inode, pos, count, flags, ops, dio,
                                iomap_dio_actor);
                if (ret <= 0) {
                        /* magic error code to fall back to buffered I/O */
                        if (ret == -ENOTBLK) {
                                wait_for_completion = true;
                                ret = 0;
                        }
                        break;
                }
                pos += ret;

                if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
                        /*
                         * We only report that we've read data up to i_size.
                         * Revert iter to a state corresponding to that as
                         * some callers (such as splice code) rely on it.
                         */
                        iov_iter_revert(iter, pos - dio->i_size);
                        break;
                }
        } while ((count = iov_iter_count(iter)) > 0);
        blk_finish_plug(&plug);

        if (ret < 0)
                iomap_dio_set_error(dio, ret);

        /*
         * If all the writes we issued were FUA, we don't need to flush the
         * cache on IO completion. Clear the sync flag for this case.
         */
        if (dio->flags & IOMAP_DIO_WRITE_FUA)
                dio->flags &= ~IOMAP_DIO_NEED_SYNC;

        WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
        WRITE_ONCE(iocb->private, dio->submit.last_queue);

        /*
         * We are about to drop our additional submission reference, which
         * might be the last reference to the dio.  There are three different
         * ways we can progress here:
         *
         *  (a) If this is the last reference we will always complete and free
         *        the dio ourselves.
         *  (b) If this is not the last reference, and we serve an asynchronous
         *        iocb, we must never touch the dio after the decrement, the
         *        I/O completion handler will complete and free it.
         *  (c) If this is not the last reference, but we serve a synchronous
         *        iocb, the I/O completion handler will wake us up on the drop
         *        of the final reference, and we will complete and free it here
         *        after we got woken by the I/O completion handler.
         */
        dio->wait_for_completion = wait_for_completion;
        if (!atomic_dec_and_test(&dio->ref)) {
                if (!wait_for_completion)
                        return ERR_PTR(-EIOCBQUEUED);

                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (!READ_ONCE(dio->submit.waiter))
                                break;

                        if (!(iocb->ki_flags & IOCB_HIPRI) ||
                            !dio->submit.last_queue ||
                            !blk_poll(dio->submit.last_queue,
                                         dio->submit.cookie, true))
                                blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
        }

        return dio;

out_free_dio:
        kfree(dio);
        if (ret)
                return ERR_PTR(ret);
        return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

/*
 * Issue the direct I/O through __iomap_dio_rw() and finish it.
 *
 * When __iomap_dio_rw() yields NULL or an ERR_PTR() there is nothing left to
 * complete: NULL is reported as 0 and an error pointer as its errno value.
 * Otherwise the dio is handed to iomap_dio_complete() and its result is
 * returned.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                bool wait_for_completion)
{
        struct iomap_dio *dio = __iomap_dio_rw(iocb, iter, ops, dops,
                                               wait_for_completion);

        if (!IS_ERR_OR_NULL(dio))
                return iomap_dio_complete(dio);

        return PTR_ERR_OR_ZERO(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);

























































































































































































    6 


    6 





    4 




    4 

































































































































    1 
    1 

    4 

    1 
    1 

    2 
    4 

    2 

    6 














    1 

    2 






    1 









    1 

    1 


    1 




















    1 








    4 

    1 


    1 




































































    2 































    2 









    2 







    2 











    1 























    1 






























































































































































































































































































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages not added to the page allocator when onlining a section because
 *   they were excluded via the online_page_callback() or because they are
 *   PG_hwpoison.
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - Pages part of an offline section (struct pages of offline sections should
 *   not be trusted as they will be initialized when first onlined).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_uptodate tells whether the page's contents are valid.  When a read
 * completes, the page becomes uptodate, unless a disk I/O error happened.
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_error is set to indicate that an I/O error occurred on this page.
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the *_dontuse flags.  Use the macros.  Otherwise you'll break
 * locked- and dirty-page accounting.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
enum pageflags {
        PG_locked,                /* Page is locked. Don't touch. */
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_active,
        PG_workingset,
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_error,
        PG_slab,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
        PG_arch_1,
        PG_reserved,
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_writeback,                /* Page is under writeback */
        PG_head,                /* A head page */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,                /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_64BIT
        PG_arch_2,
#endif
        __NR_PAGEFLAGS,

        /* Filesystems */
        PG_checked = PG_owner_priv_1,

        /* SwapBacked */
        PG_swapcache = PG_owner_priv_1,        /* Swap page: swp_entry_t in private */

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

        /* SLOB */
        PG_slob_free = PG_private,

        /* Compound pages. Stored in first tail page's flags */
        PG_double_map = PG_workingset,

        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,
};

#ifndef __GENERATING_BOUNDS_H

struct page;        /* forward declaration */

/*
 * Return the head page for a tail page, or @page itself otherwise.
 *
 * Bit 0 of page->compound_head marks a tail page; in that case the word holds
 * the head page's address plus one, which is undone here.
 */
static inline struct page *compound_head(struct page *page)
{
        unsigned long word = READ_ONCE(page->compound_head);

        return unlikely(word & 1) ? (struct page *) (word - 1) : page;
}

/* Non-zero when @page is a tail page, i.e. bit 0 of ->compound_head is set. */
static __always_inline int PageTail(struct page *page)
{
        unsigned long word = READ_ONCE(page->compound_head);

        return word & 1;
}

/* True when @page is either the head (PG_head set) or a tail of a compound page. */
static __always_inline int PageCompound(struct page *page)
{
        if (test_bit(PG_head, &page->flags))
                return 1;

        return PageTail(page);
}

/* A flags word with every bit set marks a poisoned struct page. */
#define        PAGE_POISON_PATTERN        -1l
static inline int PagePoisoned(const struct page *page)
{
        if (page->flags != PAGE_POISON_PATTERN)
                return 0;

        return 1;
}

/*
 * page_init_poison() is only implemented (out of line) when CONFIG_DEBUG_VM
 * is enabled; without it the call compiles to an empty inline stub.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif

/*
 * Page flags policies wrt compound pages
 *
 * Each policy takes the page handed to a flag accessor and evaluates to the
 * struct page whose ->flags word is operated on.  @enforce is passed as 1 by
 * the accessors that modify a flag and as 0 by the plain test, so the
 * enforce-guarded checks only fire for modifications.  The checks are
 * VM_BUG_ON_PGFLAGS() assertions.
 *
 * Note that @page is evaluated more than once; it must not have side effects.
 *
 * PF_POISONED_CHECK:
 *     check if this struct page is poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for a compound page, all operations related to the page flag are
 *     applied to the head page.
 *
 * PF_ONLY_HEAD:
 *     for compound page, callers only ever operate on the head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                page; })
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
#define PF_ONLY_HEAD(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(PageTail(page), page);                \
                PF_POISONED_CHECK(page); })
/* (enforce) is parenthesized so a compound argument cannot regroup with &&. */
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS((enforce) && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS((enforce) && PageCompound(page), page);\
                PF_POISONED_CHECK(page); })
/* (page) is parenthesized so that e.g. "p + 1" indexes the intended page. */
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&(page)[1]); })

/*
 * Macros to create function definitions for page flags
 *
 * Each generator takes the accessor name suffix (uname), the PG_ enum suffix
 * (lname) and a PF_* policy that selects the struct page whose flags word is
 * used.  The plain test passes 0 as the policy's enforce argument; every
 * accessor that modifies the flag passes 1.
 */

/* Page<uname>(): test the bit with test_bit(). */
#define TESTPAGEFLAG(uname, lname, policy)                                \
static __always_inline int Page##uname(struct page *page)                \
        { return test_bit(PG_##lname, &policy(page, 0)->flags); }

/* SetPage<uname>(): set the bit with set_bit(). */
#define SETPAGEFLAG(uname, lname, policy)                                \
static __always_inline void SetPage##uname(struct page *page)                \
        { set_bit(PG_##lname, &policy(page, 1)->flags); }

/* ClearPage<uname>(): clear the bit with clear_bit(). */
#define CLEARPAGEFLAG(uname, lname, policy)                                \
static __always_inline void ClearPage##uname(struct page *page)                \
        { clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* __SetPage<uname>(): set the bit with __set_bit(). */
#define __SETPAGEFLAG(uname, lname, policy)                                \
static __always_inline void __SetPage##uname(struct page *page)                \
        { __set_bit(PG_##lname, &policy(page, 1)->flags); }

/* __ClearPage<uname>(): clear the bit with __clear_bit(). */
#define __CLEARPAGEFLAG(uname, lname, policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
        { __clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestSetPage<uname>(): test_and_set_bit(); returns its result. */
#define TESTSETFLAG(uname, lname, policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
        { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestClearPage<uname>(): test_and_clear_bit(); returns its result. */
#define TESTCLEARFLAG(uname, lname, policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
        { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* PAGEFLAG: Page<uname>(), SetPage<uname>() and ClearPage<uname>(). */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

/* __PAGEFLAG: Page<uname>() plus the __SetPage/__ClearPage variants. */
#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

/* TESTSCFLAG: TestSetPage<uname>() and TestClearPage<uname>(). */
#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators for flags that are not available in the current
 * configuration: the tests always return 0 and the setters/clearers do
 * nothing.
 */

/* Page<uname>() that always returns 0. */
#define TESTPAGEFLAG_FALSE(uname)                                        \
static inline int Page##uname(const struct page *page) { return 0; }

/* SetPage<uname>() that does nothing. */
#define SETPAGEFLAG_NOOP(uname)                                                \
static inline void SetPage##uname(struct page *page) {  }

/* ClearPage<uname>() that does nothing. */
#define CLEARPAGEFLAG_NOOP(uname)                                        \
static inline void ClearPage##uname(struct page *page) {  }

/* __ClearPage<uname>() that does nothing. */
#define __CLEARPAGEFLAG_NOOP(uname)                                        \
static inline void __ClearPage##uname(struct page *page) {  }

/* TestSetPage<uname>() that always returns 0. */
#define TESTSETFLAG_FALSE(uname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

/* TestClearPage<uname>() that always returns 0. */
#define TESTCLEARFLAG_FALSE(uname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Stub counterpart of PAGEFLAG(). */
#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)                        \
        SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)

/* Stub counterpart of TESTSCFLAG(). */
#define TESTSCFLAG_FALSE(uname)                                                \
        TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)

/*
 * Accessors for the core page flags.  Each invocation expands to one or more
 * static inline functions named after its first argument; the third argument
 * is the compound-page policy applied to the page before touching ->flags.
 */

/* Locked only gets the test plus the __Set/__Clear variants here. */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
        TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
        __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
        TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
/* SlobFree operates on PG_slob_free, an alias of PG_private. */
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/*
 * Xen
 *
 * All four flags alias existing bits (see enum pageflags): Pinned, Foreign and
 * XenRemapped share PG_owner_priv_1, SavePinned shares PG_dirty.  They all use
 * the PF_NO_COMPOUND policy.
 *
 * The invocations expand to complete function definitions, so they take no
 * trailing semicolon (a stray ';' at file scope is an empty declaration).
 */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

/*
 * Reserved and SwapBacked get both the set_bit()/clear_bit() based accessors
 * and the __set_bit()/__clear_bit() based ones.
 */
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause releasepage() and co to be invoked
 *
 * These use PF_ANY, i.e. the flag is accessed on exactly the page passed in,
 * whether it is a small, head or tail page.
 */
PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
        __CLEARPAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
        TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)

/*
 * Only the test, test-and-set and test-and-clear operations exist for
 * PG_writeback.  The unconditional operators are risky: they bypass page
 * accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)

/*
 * PG_readahead is only used for reads; PG_reclaim is only for writes.
 * Both sets of accessors operate on the same bit, PG_reclaim, but with
 * different compound-page policies.
 */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
        TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)

/*
 * PageHighMem() is not backed by a page flag bit: with CONFIG_HIGHMEM it is
 * derived from the page's zone number, otherwise it is a constant-0 stub
 * (together with no-op SetPageHighMem()/ClearPageHighMem()).
 */
#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#else
PAGEFLAG_FALSE(HighMem)
#endif

#ifdef CONFIG_SWAP
/*
 * PG_swapcache shares its bit with PG_owner_priv_1, so the bit only means
 * "in swap cache" on a page that is also SwapBacked.  With CONFIG_THP_SWAP
 * the check is done on the compound head.
 */
static __always_inline int PageSwapCache(struct page *page)
{
#ifdef CONFIG_THP_SWAP
        page = compound_head(page);
#endif
        if (!PageSwapBacked(page))
                return 0;

        return test_bit(PG_swapcache, &page->flags) != 0;
}
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(SwapCache)
#endif

/* Unevictable is always accessed on the head page (PF_HEAD). */
PAGEFLAG(Unevictable, unevictable, PF_HEAD)
        __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
        TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)

/*
 * PG_mlocked only exists with CONFIG_MMU; without it every Mlocked accessor
 * is a stub (tests return 0, modifications do nothing).
 */
#ifdef CONFIG_MMU
PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
        TESTSCFLAG_FALSE(Mlocked)
#endif

/* PG_uncached only exists with CONFIG_ARCH_USES_PG_UNCACHED. */
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif

/*
 * PG_hwpoison only exists with CONFIG_MEMORY_FAILURE.  __PG_HWPOISON is the
 * flag as a bit mask, or 0 when the flag is not available, so it can be used
 * in mask expressions in either configuration.
 */
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
extern bool take_page_off_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif

/*
 * PG_young and PG_idle are only page flag bits with CONFIG_IDLE_PAGE_TRACKING
 * on 64-bit; no accessors are defined here for other configurations.  Young
 * gets test, set and test-and-clear, but no plain clear.
 */
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
TESTPAGEFLAG(Young, young, PF_ANY)
SETPAGEFLAG(Young, young, PF_ANY)
TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic version of the test and set
 * operations as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 *
 * PG_reported is an alias of PG_uptodate (see enum pageflags).
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
 * bit; and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_MOVABLE without PAGE_MAPPING_ANON is used for non-lru movable
 * pages, and then page->mapping points to a struct address_space.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 */
/* Low bits of page->mapping used as type tags. */
#define PAGE_MAPPING_ANON        0x1
#define PAGE_MAPPING_MOVABLE        0x2
/* KSM pages have both tag bits set. */
#define PAGE_MAPPING_KSM        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
/* Mask covering all tag bits. */
#define PAGE_MAPPING_FLAGS        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

/* True when any PAGE_MAPPING_* tag bit is set in @page's ->mapping. */
static __always_inline int PageMappingFlags(struct page *page)
{
        unsigned long tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits != 0;
}

/*
 * True when the compound head of @page has PAGE_MAPPING_ANON set in its
 * ->mapping.
 */
static __always_inline int PageAnon(struct page *page)
{
        struct page *head = compound_head(page);
        unsigned long anon_bit = (unsigned long)head->mapping & PAGE_MAPPING_ANON;

        return anon_bit != 0;
}

/*
 * True when @page's ->mapping carries PAGE_MAPPING_MOVABLE and not
 * PAGE_MAPPING_ANON.  The page is used as given; no compound head lookup.
 */
static __always_inline int __PageMovable(struct page *page)
{
        unsigned long tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits == PAGE_MAPPING_MOVABLE;
}

#ifdef CONFIG_KSM
/*
 * KSM pages are the write-protected "shared" or "merged" pages that KSM maps
 * into several mms where identical anonymous content was found in
 * VM_MERGEABLE vmas.  Such a page is PageAnon, but its ->mapping refers to the
 * page's stable tree node rather than to an anon_vma.  The check is made on
 * the compound head and requires both mapping tag bits to be set.
 */
static __always_inline int PageKsm(struct page *page)
{
        struct page *head = compound_head(page);
        unsigned long tag_bits = (unsigned long)head->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits == PAGE_MAPPING_KSM;
}
#else
TESTPAGEFLAG_FALSE(Ksm)
#endif

u64 stable_page_flags(struct page *page);

/*
 * PageUptodate - test PG_uptodate on the compound head of @page.
 *
 * Returns the test_bit() result.  When the bit is set, a read barrier is
 * issued before returning (see the comment in the body for why).
 */
static inline int PageUptodate(struct page *page)
{
        int ret;
        page = compound_head(page);
        ret = test_bit(PG_uptodate, &(page)->flags);
        /*
         * Must ensure that the data we read out of the page is loaded
         * _after_ we've loaded page->flags to check for PageUptodate.
         * We can skip the barrier if the page is not uptodate, because
         * we wouldn't be reading anything from it.
         *
         * See SetPageUptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();

        return ret;
}

/*
 * __SetPageUptodate - set PG_uptodate on @page using __set_bit().
 *
 * Same sequence as SetPageUptodate() (tail-page assertion, write barrier,
 * then the bit), but with __set_bit() instead of set_bit().
 */
static __always_inline void __SetPageUptodate(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* Barrier first, so earlier stores to the page precede the flag. */
        smp_wmb();
        __set_bit(PG_uptodate, &page->flags);
}

/*
 * SetPageUptodate - set PG_uptodate on @page using set_bit().
 *
 * Asserts (VM_BUG_ON_PAGE) that @page is not a tail page, then issues a
 * write barrier before setting the bit.
 */
static __always_inline void SetPageUptodate(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the page
         * uptodate are actually visible before PageUptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, &page->flags);
}

/*
 * NOTE(review): presumably generates ClearPageUptodate(); the CLEARPAGEFLAG
 * macro and the PF_NO_TAIL policy are defined outside this part of the file.
 */
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

/* Writeback helpers; only declared here, defined elsewhere. */
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);

/* Wrappers fixing the keep_write argument to false and true respectively. */
#define test_set_page_writeback(page)                        \
        __test_set_page_writeback(page, false)
#define test_set_page_writeback_keepwrite(page)        \
        __test_set_page_writeback(page, true)

/*
 * Mark @page as under writeback via test_set_page_writeback(); the
 * "test" result of that call is intentionally discarded.
 */
static inline void set_page_writeback(struct page *page)
{
        (void)test_set_page_writeback(page);
}

/*
 * Same as set_page_writeback(), but goes through
 * test_set_page_writeback_keepwrite() (keep_write == true); the returned
 * value is intentionally discarded.
 */
static inline void set_page_writeback_keepwrite(struct page *page)
{
        (void)test_set_page_writeback_keepwrite(page);
}

/*
 * Accessors for the PG_head flag.  PageHead() and ClearPageHead() are used
 * further down in this file; NOTE(review): the full set of functions these
 * two macros generate is defined outside this view.
 */
__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)

/*
 * Record @head as the compound head of @page: page->compound_head receives
 * the head's address plus one, written with WRITE_ONCE().
 */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        unsigned long tagged_head = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, tagged_head);
}

/*
 * Undo set_compound_head(): reset page->compound_head to zero with
 * WRITE_ONCE().
 */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the head flag of @page.  BUG()s if @page is not currently a head
 * page.  Only built when CONFIG_TRANSPARENT_HUGEPAGE is enabled.
 */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
#endif

/* Bit mask (as opposed to bit number) of the PG_head flag. */
#define PG_head_mask ((1UL << PG_head))

#ifdef CONFIG_HUGETLB_PAGE
/* hugetlb support built in: the predicates are only declared here. */
int PageHuge(struct page *page);
int PageHeadHuge(struct page *page);
bool page_huge_active(struct page *page);
#else
/* No hugetlb support: NOTE(review) presumably constant-false test stubs. */
TESTPAGEFLAG_FALSE(Huge)
TESTPAGEFLAG_FALSE(HeadHuge)

/* Without hugetlb support this always reports false. */
static inline bool page_huge_active(struct page *page)
{
        return false;
}
#endif


#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for
 * normal or transparent huge pages.
 *
 * PageTransHuge() returns true for both transparent huge and
 * hugetlbfs pages, but not normal pages. PageTransHuge() must only be
 * called in the core VM paths where hugetlbfs pages can't exist.
 *
 * It must not be passed a tail page (VM_BUG_ON_PAGE check); the result is
 * simply PageHead(@page).
 */
static inline int PageTransHuge(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        return PageHead(page);
}

/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * It is a plain alias for PageCompound(): no additional checks are made.
 */
static inline int PageTransCompound(struct page *page)
{
        return PageCompound(page);
}

/*
 * PageTransCompoundMap is the same as PageTransCompound, but it also
 * guarantees the primary MMU has the entire compound page mapped
 * through pmd_trans_huge, which in turn guarantees the secondary MMUs
 * can also map the entire compound page. This allows the secondary
 * MMUs to call get_user_pages() only once for each compound page and
 * to immediately map the entire compound page with a single secondary
 * MMU fault. If there will be a pmd split later, the secondary MMUs
 * will get an update through the MMU notifier invalidation through
 * split_huge_pmd().
 *
 * Unlike PageTransCompound, this is safe to be called only while
 * split_huge_pmd() cannot run from under us, like if protected by the
 * MMU notifier, otherwise it may result in page->_mapcount check false
 * positives.
 *
 * We have to treat page cache THP differently since every subpage of it
 * would get _mapcount inc'ed once it is PMD mapped.  But, it may be PTE
 * mapped in the current process so comparing subpage's _mapcount to
 * compound_mapcount to filter out PTE mapped case.
 */
static inline int PageTransCompoundMap(struct page *page)
{
        struct page *head;

        /* Not part of a compound page at all: nothing to check. */
        if (!PageTransCompound(page))
                return 0;

        /*
         * Anonymous case: decided by this page's own _mapcount being
         * negative.  NOTE(review): presumably a negative _mapcount means
         * no PTE maps this subpage - confirm against the mapcount
         * initialisation.
         */
        if (PageAnon(page))
                return atomic_read(&page->_mapcount) < 0;

        head = compound_head(page);
        /* File THP is PMD mapped and not PTE mapped */
        return atomic_read(&page->_mapcount) ==
               atomic_read(compound_mapcount_ptr(head));
}

/*
 * PageTransTail returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * It is a plain alias for PageTail(): no additional checks are made.
 */
static inline int PageTransTail(struct page *page)
{
        return PageTail(page);
}

/*
 * PageDoubleMap indicates that the compound page is mapped with PTEs as well
 * as PMDs.
 *
 * This is required for optimization of rmap operations for THP: we can postpone
 * per small page mapcount accounting (and its overhead from atomic operations)
 * until the first PMD split.
 *
 * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
 * by one. This reference will go away with last compound_mapcount.
 *
 * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
 */
/*
 * Flag accessors for PG_double_map.
 * NOTE(review): PF_SECOND presumably places the flag on the second page of
 * the compound page; the policy macro is defined outside this view.
 */
PAGEFLAG(DoubleMap, double_map, PF_SECOND)
        TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE: the same names as in the branch above are
 * provided through the *_FALSE stub generators.
 * NOTE(review): presumably these expand to constant-false/no-op accessors;
 * the generator macros are defined outside this view.
 */
TESTPAGEFLAG_FALSE(TransHuge)
TESTPAGEFLAG_FALSE(TransCompound)
TESTPAGEFLAG_FALSE(TransCompoundMap)
TESTPAGEFLAG_FALSE(TransTail)
PAGEFLAG_FALSE(DoubleMap)
        TESTSCFLAG_FALSE(DoubleMap)
#endif

/*
 * For pages that are never mapped to userspace (and aren't PageSlab),
 * page_type may be used.  Because it is initialised to -1, we invert the
 * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
 * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
 * low bits so that an underflow or overflow of page_mapcount() won't be
 * mistaken for a page type value.
 */

/* Bits that must all be set in page_type for it to encode a page type. */
#define PAGE_TYPE_BASE        0xf0000000
/* Reserve                0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE        (-128)
/* One bit per page type; a type is present when its bit is CLEAR. */
#define PG_buddy        0x00000080
#define PG_offline        0x00000100
#define PG_kmemcg        0x00000200
#define PG_table        0x00000400
#define PG_guard        0x00000800

/*
 * PageType - test whether @page carries page type @flag.
 * @page: pointer to the struct page to inspect (any pointer expression)
 * @flag: one of the PG_* type bits above, or 0 to test only the base bits
 *
 * True when all PAGE_TYPE_BASE bits are set in page->page_type and the
 * @flag bit is clear.  Both arguments are parenthesized so that compound
 * expressions such as "p + 1" or "a | b" expand correctly.
 */
#define PageType(page, flag)                                                \
        (((page)->page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)

/*
 * Return nonzero when page->page_type, reinterpreted as a signed int, lies
 * below PAGE_MAPCOUNT_RESERVE.
 */
static inline int page_has_type(struct page *page)
{
        int signed_type = (int)page->page_type;

        return signed_type < PAGE_MAPCOUNT_RESERVE;
}

/*
 * PAGE_TYPE_OPS - generate the accessors for one page type.
 * @uname: suffix used in the generated function names (Page<uname> etc.)
 * @lname: suffix selecting the PG_<lname> type bit
 *
 * Expands to three functions:
 *   Page<uname>()        - PageType(page, PG_<lname>)
 *   __SetPage<uname>()   - clears the PG_<lname> bit in page->page_type
 *   __ClearPage<uname>() - sets the PG_<lname> bit in page->page_type
 *
 * __SetPage<uname>() first asserts PageType(page, 0), i.e. that all
 * PAGE_TYPE_BASE bits are set; __ClearPage<uname>() first asserts that
 * Page<uname>() is currently true.
 */
#define PAGE_TYPE_OPS(uname, lname)                                        \
static __always_inline int Page##uname(struct page *page)                \
{                                                                        \
        return PageType(page, PG_##lname);                                \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        VM_BUG_ON_PAGE(!PageType(page, 0), page);                        \
        page->page_type &= ~PG_##lname;                                        \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type |= PG_##lname;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 *
 * Generates PageBuddy(), __SetPageBuddy() and __ClearPageBuddy().
 */
PAGE_TYPE_OPS(Buddy, buddy)

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * If a driver wants to allow to offline unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
 * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
 * pages (now with a reference count of zero) are treated like free pages,
 * allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
 * require to re-set the pages PageOffline() and not giving them to the
 * buddy via online_page_callback_t.
 *
 * Generates PageOffline(), __SetPageOffline() and __ClearPageOffline().
 */
PAGE_TYPE_OPS(Offline, offline)

/*
 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
 * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
 *
 * Generates PageKmemcg(), __SetPageKmemcg() and __ClearPageKmemcg().
 */
PAGE_TYPE_OPS(Kmemcg, kmemcg)

/*
 * Marks pages in use as page tables.
 *
 * Generates PageTable(), __SetPageTable() and __ClearPageTable().
 */
PAGE_TYPE_OPS(Table, table)

/*
 * Marks guardpages used with debug_pagealloc.
 *
 * Generates PageGuard(), __SetPageGuard() and __ClearPageGuard().
 */
PAGE_TYPE_OPS(Guard, guard)

/* Declared here, defined elsewhere. */
extern bool is_free_buddy_page(struct page *page);

/*
 * Accessors for the PG_isolated flag.
 * NOTE(review): the set of functions __PAGEFLAG generates is defined outside
 * this view.
 */
__PAGEFLAG(Isolated, isolated, PF_ANY);

/*
 * If network-based swap is enabled, sl*b must keep track of whether pages
 * were allocated from pfmemalloc reserves.
 *
 * For slab pages this state is stored in the Active page flag:
 * PageSlabPfmemalloc() returns PageActive(@page) after asserting
 * (VM_BUG_ON_PAGE) that @page is a slab page.
 */
static inline int PageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        return PageActive(page);
}

/*
 * Mark a slab page as pfmemalloc by setting its Active flag with
 * SetPageActive().  Asserts (VM_BUG_ON_PAGE) that @page is a slab page.
 */
static inline void SetPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        SetPageActive(page);
}

/*
 * Clear the pfmemalloc marking of a slab page via __ClearPageActive().
 * Asserts (VM_BUG_ON_PAGE) that @page is a slab page.
 */
static inline void __ClearPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        __ClearPageActive(page);
}

/*
 * Clear the pfmemalloc marking of a slab page via ClearPageActive().
 * Asserts (VM_BUG_ON_PAGE) that @page is a slab page.
 */
static inline void ClearPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        ClearPageActive(page);
}

/*
 * Mask for the PG_mlocked flag; it is defined as 0 without CONFIG_MMU so it
 * can be OR-ed into flag masks unconditionally.
 */
#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_slab                | 1UL << PG_active         |        \
         1UL << PG_unevictable        | __PG_MLOCKED)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
 * alloc-free cycle to prevent from reusing the page.
 *
 * The mask covers every flag bit below NR_PAGEFLAGS except __PG_HWPOISON.
 */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)

#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * page_has_private - Determine if page has private stuff
 * @page: The page to be checked
 *
 * Tests whether either PG_private or PG_private_2 is set in @page->flags,
 * which indicates that release routines should be invoked upon the page.
 *
 * Return: 1 if at least one of the two flags is set, 0 otherwise.
 */
static inline int page_has_private(struct page *page)
{
        return (page->flags & PAGE_FLAGS_PRIVATE) != 0;
}

#undef PF_ANY
#undef PF_HEAD
#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */
































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MINMAX_H
#define _LINUX_MINMAX_H

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/const.h>
#include <linux/types.h>

/*
 * min()/max()/clamp() macros must accomplish several things:
 *
 * - Avoid multiple evaluations of the arguments (so side-effects like
 *   "x++" happen only once) when non-constant.
 * - Perform signed v unsigned type-checking (to generate compile
 *   errors instead of nasty runtime surprises).
 * - Unsigned char/short are always promoted to signed int and can be
 *   compared against signed or unsigned arguments.
 * - Unsigned arguments can be compared against non-negative signed constants.
 * - Comparison of a signed argument against an unsigned constant fails
 *   even if the constant is below __INT_MAX__ and could be cast to int.
 */
/*
 * __typecheck(x, y) - compile-time type compatibility probe.
 *
 * The pointer comparison sits inside sizeof(), so it is never evaluated;
 * it exists only so the compiler type-checks "typeof(x) *" against
 * "typeof(y) *".  The expression itself always evaluates to 1.
 */
#define __typecheck(x, y) \
        (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))

/*
 * __sign_use for integer expressions:
 *   bit #0 set if ok for unsigned comparisons
 *   bit #1 set if ok for signed comparisons
 *
 * In particular, statically non-negative signed integer expressions
 * are ok for both.
 *
 * NOTE! Unsigned types smaller than 'int' are implicitly converted to 'int'
 * in expressions, and are accepted for signed conversions for now.
 * This is debatable.
 *
 * Note that 'ux' is the unique variable that holds the value of the
 * original expression; callers pass that variable, so nothing here
 * re-evaluates the original expression.
 *
 * Resulting values:
 *   signed type, statically non-negative (__is_nonneg):  3
 *   signed type, otherwise:                               2
 *   unsigned type with sizeof < 4:                        3
 *   unsigned type with sizeof >= 4:                       1
 *
 * Pointers end up being checked by the normal C type rules at the actual
 * comparison, and these expressions only need to be careful to not cause
 * warnings for pointer use.
 */
#define __sign_use(ux) (is_signed_type(typeof(ux)) ? \
        (2 + __is_nonneg(ux)) : (1 + 2 * (sizeof(ux) < 4)))

/*
 * Check whether a signed value is always non-negative.
 *
 * A cast is needed to avoid any warnings from values that aren't signed
 * integer types (in which case the result doesn't matter).
 *
 * On 64-bit any integer or pointer type can safely be cast to 'long long'.
 * But on 32-bit we need to avoid warnings about casting pointers to integers
 * of different sizes without truncating 64-bit values so 'long' or 'long long'
 * must be used depending on the size of the value.
 *
 * This does not work for 128-bit signed integers since the cast would truncate
 * them, but we do not use s128 types in the kernel (we do use 'u128',
 * but they are handled by the !is_signed_type() case).
 */
#if __SIZEOF_POINTER__ == __SIZEOF_LONG_LONG__
/* Pointers are as wide as 'long long': a single cast covers every operand. */
#define __is_nonneg(ux) statically_true((long long)(ux) >= 0)
#else
/*
 * Pointers are not 'long long' sized: cast to 'long long' only when the
 * operand is wider than 4 bytes, and to 'long' otherwise.
 */
#define __is_nonneg(ux) statically_true( \
        (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)))(ux) >= 0)
#endif

/*
 * Nonzero when the operands share at least one acceptable comparison mode,
 * i.e. the bitwise AND of their __sign_use() masks is not empty.
 */
#define __types_ok(ux, uy) \
        (__sign_use(ux) & __sign_use(uy))

/* Three-operand variant of __types_ok(). */
#define __types_ok3(ux, uy, uz) \
        (__sign_use(ux) & __sign_use(uy) & __sign_use(uz))

/* Comparison operator selected by the 'op' token given to __cmp(). */
#define __cmp_op_min <
#define __cmp_op_max >

/*
 * __cmp(op, x, y) - raw min/max selection; 'op' is the token min or max.
 * No type checking, and each argument may be evaluated twice.
 */
#define __cmp(op, x, y)        ((x) __cmp_op_##op (y) ? (x) : (y))

/*
 * Copy @x and @y into temporaries of the given @type (named by @ux/@uy) so
 * each argument is evaluated exactly once, then compare the temporaries.
 */
#define __cmp_once_unique(op, type, x, y, ux, uy) \
        ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })

/* As above, with the temporary names generated by __UNIQUE_ID(). */
#define __cmp_once(op, type, x, y) \
        __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))

/*
 * Evaluate @x and @y once into __auto_type temporaries (@ux/@uy), fail the
 * build with a "signedness error" message when __types_ok() rejects the
 * pair, and otherwise compare the temporaries with __cmp().
 */
#define __careful_cmp_once(op, x, y, ux, uy) ({                \
        __auto_type ux = (x); __auto_type uy = (y);        \
        BUILD_BUG_ON_MSG(!__types_ok(ux, uy),                \
                #op"("#x", "#y") signedness error");        \
        __cmp(op, ux, uy); })

/* As above, with the temporary names generated by __UNIQUE_ID(). */
#define __careful_cmp(op, x, y) \
        __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))

/**
 * min - return minimum of two values of the same or compatible types
 * @x: first value
 * @y: second value
 *
 * Each argument is evaluated exactly once.  The build fails with a
 * signedness error if the two arguments cannot be safely compared.
 */
#define min(x, y)        __careful_cmp(min, x, y)

/**
 * max - return maximum of two values of the same or compatible types
 * @x: first value
 * @y: second value
 *
 * Each argument is evaluated exactly once.  The build fails with a
 * signedness error if the two arguments cannot be safely compared.
 */
#define max(x, y)        __careful_cmp(max, x, y)

/**
 * umin - return minimum of two non-negative values
 * @x: first value
 * @y: second value
 *
 * Signed types are zero extended to match a larger unsigned type: each
 * argument has "0u + 0ul + 0ull" added to it before the comparison.
 */
#define umin(x, y)        \
        __careful_cmp(min, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)

/**
 * umax - return maximum of two non-negative values
 * @x: first value
 * @y: second value
 *
 * As with umin(), each argument has "0u + 0ul + 0ull" added to it before
 * the comparison, so both sides are compared as unsigned values.
 */
#define umax(x, y)        \
        __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)

/*
 * Three-operand min/max: evaluate @x, @y and @z once into __auto_type
 * temporaries (@ux/@uy/@uz), fail the build when __types_ok3() rejects the
 * combination, then fold the temporaries with two nested __cmp() calls.
 */
#define __careful_op3(op, x, y, z, ux, uy, uz) ({                        \
        __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\
        BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz),                        \
                #op"3("#x", "#y", "#z") signedness error");                \
        __cmp(op, ux, __cmp(op, uy, uz)); })

/**
 * min3 - return minimum of three values
 * @x: first value
 * @y: second value
 * @z: third value
 *
 * Each argument is evaluated exactly once; mixed signedness that cannot be
 * compared safely is rejected at build time.
 */
#define min3(x, y, z) \
        __careful_op3(min, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_))

/**
 * max3 - return maximum of three values
 * @x: first value
 * @y: second value
 * @z: third value
 *
 * Each argument is evaluated exactly once; mixed signedness that cannot be
 * compared safely is rejected at build time.
 */
#define max3(x, y, z) \
        __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_))

/**
 * min_t - return minimum of two values, using the specified type
 * @type: data type to use
 * @x: first value
 * @y: second value
 *
 * Both arguments are converted to @type (by initialising temporaries of
 * that type) and evaluated once; no signedness check is performed.
 */
#define min_t(type, x, y) __cmp_once(min, type, x, y)

/**
 * max_t - return maximum of two values, using the specified type
 * @type: data type to use
 * @x: first value
 * @y: second value
 *
 * Both arguments are converted to @type (by initialising temporaries of
 * that type) and evaluated once; no signedness check is performed.
 */
#define max_t(type, x, y) __cmp_once(max, type, x, y)

/**
 * min_not_zero - return the minimum that is _not_ zero, unless both are zero
 * @x: value1
 * @y: value2
 *
 * Each argument is evaluated once.  If @x is zero the result is @y; else if
 * @y is zero the result is @x; otherwise it is min(@x, @y).
 */
#define min_not_zero(x, y) ({                        \
        typeof(x) __x = (x);                        \
        typeof(y) __y = (y);                        \
        __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })

/*
 * Raw clamp: @hi wins when @val >= @hi, else @lo when @val <= @lo, else
 * @val.  The upper bound is tested first.  Arguments may be evaluated more
 * than once, so callers pass side-effect-free temporaries.
 */
#define __clamp(val, lo, hi)        \
        ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))

/*
 * Evaluate @val, @lo and @hi once into temporaries of @type (named
 * @uval/@ulo/@uhi), then:
 *  - fail the build if the low limit is statically known to exceed the
 *    high limit,
 *  - fail the build if __types_ok3() rejects the signedness combination,
 *  - otherwise clamp the temporaries with __clamp().
 */
#define __clamp_once(type, val, lo, hi, uval, ulo, uhi) ({                        \
        type uval = (val);                                                        \
        type ulo = (lo);                                                        \
        type uhi = (hi);                                                        \
        BUILD_BUG_ON_MSG(statically_true(ulo > uhi),                                \
                "clamp() low limit " #lo " greater than high limit " #hi);        \
        BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi),                                \
                "clamp("#val", "#lo", "#hi") signedness error");                \
        __clamp(uval, ulo, uhi); })

/* As above, with the temporary names generated by __UNIQUE_ID(). */
#define __careful_clamp(type, val, lo, hi) \
        __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_))

/**
 * clamp - return a value clamped to a given range with typechecking
 * @val: current value
 * @lo: lowest allowable value
 * @hi: highest allowable value
 *
 * This macro checks @val/@lo/@hi to make sure they have compatible
 * signedness.  Each argument is evaluated once, into an __auto_type
 * temporary.
 */
#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi)

/**
 * clamp_t - return a value clamped to a given range using a given type
 * @type: the type of variable to use
 * @val: current value
 * @lo: minimum allowable value
 * @hi: maximum allowable value
 *
 * This macro does no typechecking of the original arguments and uses
 * temporary variables of type @type to make all the comparisons.  Each
 * argument is evaluated once.
 */
#define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi)

/**
 * clamp_val - return a value clamped to a given range using val's type
 * @val: current value
 * @lo: minimum allowable value
 * @hi: maximum allowable value
 *
 * This macro does no typechecking of the original arguments and uses
 * temporary variables of whatever type the input argument @val is.  This is
 * useful when @val is an unsigned type and @lo and @hi are literals that
 * will otherwise be assigned a signed integer type.
 */
#define clamp_val(val, lo, hi) __careful_clamp(typeof(val), val, lo, hi)

/*
 * Do not check the array parameter using __must_be_array().
 * In the following legit use-case where the "array" passed is a simple pointer,
 * __must_be_array() will return a failure.
 * --- 8< ---
 * int *buff
 * ...
 * min = min_array(buff, nb_items);
 * --- 8< ---
 *
 * The first typeof(&(array)[0]) is needed in order to support arrays of both
 * 'int *buff' and 'int buff[N]' types.
 *
 * The array can be an array of const items.
 * typeof() keeps the const qualifier. Use __unqual_scalar_typeof() in order
 * to discard the const qualifier for the __element variable.
 *
 * @array and @len are each evaluated once.  The scan starts with the last
 * element (index len - 1) and walks down to index 0, folding each element
 * into the running result with @op.  @len must be at least 1, because the
 * initial element is read at index --len.
 */
#define __minmax_array(op, array, len) ({                                \
        typeof(&(array)[0]) __array = (array);                                \
        typeof(len) __len = (len);                                        \
        __unqual_scalar_typeof(__array[0]) __element = __array[--__len];\
        while (__len--)                                                        \
                __element = op(__element, __array[__len]);                \
        __element; })

/**
 * min_array - return minimum of values present in an array
 * @array: array (or pointer to the first element)
 * @len: array length, in elements
 *
 * Note that @len must not be zero (empty array).
 */
#define min_array(array, len) __minmax_array(min, array, len)

/**
 * max_array - return maximum of values present in an array
 * @array: array (or pointer to the first element)
 * @len: array length, in elements
 *
 * Note that @len must not be zero (empty array).
 */
#define max_array(array, len) __minmax_array(max, array, len)

/*
 * 64-bit range test: true when the unsigned distance from @start to @val
 * is below @len.  The subtraction is done in u64 arithmetic.
 */
static inline bool in_range64(u64 val, u64 start, u64 len)
{
        u64 offset = val - start;

        return offset < len;
}

/*
 * 32-bit range test: true when the unsigned distance from @start to @val
 * is below @len.  The subtraction is done in u32 arithmetic.
 */
static inline bool in_range32(u32 val, u32 start, u32 len)
{
        u32 offset = val - start;

        return offset < len;
}

/**
 * in_range - Determine if a value lies within a range.
 * @val: Value to test.
 * @start: First value in range.
 * @len: Number of values in range.
 *
 * This is more efficient than "if (start <= val && val < (start + len))".
 * It also gives a different answer if @start + @len overflows the size of
 * the type by a sufficient amount to encompass @val.  Decide for yourself
 * which behaviour you want, or prove that start + len never overflow.
 * Do not blindly replace one form with the other.
 *
 * in_range32() is used when the bitwise OR of the three argument sizes is
 * no larger than sizeof(u32); otherwise in_range64() is used.
 */
#define in_range(val, start, len)                                        \
        ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ?        \
                in_range32(val, start, len) : in_range64(val, start, len))

/**
 * swap - swap values of @a and @b
 * @a: first value
 * @b: second value
 *
 * Both arguments must be assignable lvalues.  Each one appears more than
 * once in the expansion, so they must not have side effects.  The temporary
 * has the type of @a.
 */
#define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/*
 * Use these carefully: no type checking, and uses the arguments
 * multiple times. Use for obvious constants only.
 *
 * The _T variants cast both arguments to @type before comparing.
 */
#define MIN(a, b) __cmp(min, a, b)
#define MAX(a, b) __cmp(max, a, b)
#define MIN_T(type, a, b) __cmp(min, (type)(a), (type)(b))
#define MAX_T(type, a, b) __cmp(max, (type)(a), (type)(b))

#endif        /* _LINUX_MINMAX_H */




















































































































    2 


    2 




























    2 





















    2 
    1 

    1 





    1 

























    1 
    1 




    2 
    1 







































    1 






















































    2 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H

#include "blk-stat.h"
#include "blk-mq-tag.h"

struct blk_mq_tag_set;

/*
 * Container pairing a kobject with the per-CPU array of software queue
 * contexts.  Each struct blk_mq_ctx points back to it through ->ctxs.
 */
struct blk_mq_ctxs {
        struct kobject kobj;
        struct blk_mq_ctx __percpu        *queue_ctx;        /* per-CPU contexts */
};

/**
 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
 *
 * The lock and the request lists share their own cache-line-aligned group,
 * and the completion counters start on a separate cache line from the
 * dispatch counters.
 */
struct blk_mq_ctx {
        struct {
                spinlock_t                lock;
                /* one request list per hardware queue type */
                struct list_head        rq_lists[HCTX_MAX_TYPES];
        } ____cacheline_aligned_in_smp;

        unsigned int                cpu;
        /* per hardware queue type; NOTE(review): presumably this ctx's index within that hctx */
        unsigned short                index_hw[HCTX_MAX_TYPES];
        /* hardware queue for each type; looked up by blk_mq_map_queue() */
        struct blk_mq_hw_ctx         *hctxs[HCTX_MAX_TYPES];

        /* incremented at dispatch time */
        unsigned long                rq_dispatched[2];
        unsigned long                rq_merged;

        /* incremented at completion time */
        unsigned long                ____cacheline_aligned_in_smp rq_completed[2];

        struct request_queue        *queue;
        struct blk_mq_ctxs      *ctxs;        /* owning container */
        struct kobject                kobj;
} ____cacheline_aligned_in_smp;

void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
                             unsigned int);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
                                bool kick_requeue_list);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start);
void blk_mq_put_rq_ref(struct request *rq);

/*
 * Internal helpers for allocating/freeing the request map
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        unsigned int hctx_idx,
                                        unsigned int nr_tags,
                                        unsigned int reserved_tags,
                                        unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx, unsigned int depth);

/*
 * Internal helpers for request insertion into sw queues
 */
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                                bool at_head);
void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
                                  bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                                struct list_head *list);

/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                                    struct list_head *list);

/*
 * CPU -> queue mappings
 */
extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);

/*
 * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
 * @q: request queue
 * @type: the hctx type index
 * @cpu: CPU
 *
 * The tag set's queue map for @type translates @cpu into an index into
 * @q->queue_hw_ctx[]; the hardware context stored there is returned.
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
                                                          enum hctx_type type,
                                                          unsigned int cpu)
{
        unsigned int hw_idx = q->tag_set->map[type].mq_map[cpu];

        return q->queue_hw_ctx[hw_idx];
}

/*
 * blk_mq_map_queue() - map (cmd_flags,ctx) to hardware queue
 * @q: request queue (not consulted by the lookup itself)
 * @flags: request command flags
 * @ctx: software queue context whose hctxs[] table is used
 *
 * REQ_HIPRI selects HCTX_TYPE_POLL; otherwise a REQ_OP_READ operation
 * selects HCTX_TYPE_READ; everything else uses HCTX_TYPE_DEFAULT.
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
                                                     unsigned int flags,
                                                     struct blk_mq_ctx *ctx)
{
        /*
         * The caller ensures that polling is enabled whenever REQ_HIPRI
         * is set.
         */
        if (flags & REQ_HIPRI)
                return ctx->hctxs[HCTX_TYPE_POLL];

        if ((flags & REQ_OP_MASK) == REQ_OP_READ)
                return ctx->hctxs[HCTX_TYPE_READ];

        return ctx->hctxs[HCTX_TYPE_DEFAULT];
}

/*
 * sysfs helpers
 */
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);

void blk_mq_release(struct request_queue *q);

/*
 * Return the software queue context of @q that belongs to @cpu, i.e. the
 * @cpu slot of the per-CPU @q->queue_ctx.
 */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                                           unsigned int cpu)
{
        return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * Return the software queue context for the CPU we are running on right now.
 *
 * Software queues are assumed to be per-CPU (they could equally be per-node,
 * but that is hardcoded for now).  Preemption is deliberately not disabled:
 * the contexts are persistent, so the pointer stays valid.  The price is
 * that the returned ctx is not guaranteed to match the CPU the caller ends
 * up running on.
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
        unsigned int cpu = raw_smp_processor_id();

        return __blk_mq_get_ctx(q, cpu);
}

/*
 * Parameter block handed to blk-mq allocation helpers (presumably request/tag
 * allocation, going by the name -- confirm against the users).
 *
 * The first group of members is supplied by the caller; @ctx and @hctx are
 * both consumed and updated. In this file, blk_mq_tags_from_data() reads
 * @q->elevator and @hctx to choose a tag set.
 */
struct blk_mq_alloc_data {
        /* input parameter */
        struct request_queue *q;
        blk_mq_req_flags_t flags;
        unsigned int shallow_depth;
        unsigned int cmd_flags;

        /* input & output parameter */
        struct blk_mq_ctx *ctx;
        struct blk_mq_hw_ctx *hctx;
};

/* True when @flags has BLK_MQ_F_TAG_HCTX_SHARED set. */
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
{
        return (flags & BLK_MQ_F_TAG_HCTX_SHARED) != 0;
}

static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
        if (data->q->elevator)
                return data->hctx->sched_tags;

        return data->hctx->tags;
}

/*
 * Report whether BLK_MQ_S_STOPPED is set in @hctx->state.
 *
 * A clear bit is reported immediately. A set bit is read a second time after
 * a full memory barrier, and the result of that second read is returned.
 */
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
        /* Fast path: hardware queue is not stopped most of the time. */
        if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return false;

        /*
         * This barrier orders the earlier addition to the dispatch list
         * before the test of BLK_MQ_S_STOPPED below. It pairs with the
         * memory barrier in blk_mq_start_stopped_hw_queue(), so that the
         * dispatch code either sees BLK_MQ_S_STOPPED cleared or sees a
         * non-empty dispatch list, and therefore never misses dispatching
         * requests.
         */
        smp_mb();

        return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}

/*
 * A hardware queue counts as mapped when it has a non-zero nr_ctx and a
 * non-NULL tags pointer.
 */
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
        if (!hctx->nr_ctx)
                return false;

        return hctx->tags != NULL;
}

/*
 * In-flight accounting for partition @part of queue @q (declarations only).
 * blk_mq_in_flight() returns a single count; blk_mq_in_flight_rw() reports
 * through the two-element @inflight array -- presumably one slot per data
 * direction, confirm against the definition.
 */
unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
                         unsigned int inflight[2]);

/* Invoke the driver's optional ->put_budget() hook for @q, if it has one. */
static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
{
        if (!q->mq_ops->put_budget)
                return;

        q->mq_ops->put_budget(q);
}

/*
 * Ask the driver's optional ->get_budget() hook for dispatch budget on @q.
 * When the driver provides no hook, this returns true.
 */
static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
{
        if (!q->mq_ops->get_budget)
                return true;

        return q->mq_ops->get_budget(q);
}

static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (blk_mq_is_sbitmap_shared(hctx->flags))
                atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
        else
                atomic_inc(&hctx->nr_active);
}

static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (blk_mq_is_sbitmap_shared(hctx->flags))
                atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
        else
                atomic_dec(&hctx->nr_active);
}

static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (blk_mq_is_sbitmap_shared(hctx->flags))
                return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
        return atomic_read(&hctx->nr_active);
}
/*
 * Give @rq's driver tag back to @hctx->tags and mark the request as holding
 * no tag. If the request was flagged RQF_MQ_INFLIGHT, clear the flag and
 * drop the matching active-request count.
 */
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
                                           struct request *rq)
{
        blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
        rq->tag = BLK_MQ_NO_TAG;

        if (!(rq->rq_flags & RQF_MQ_INFLIGHT))
                return;

        rq->rq_flags &= ~RQF_MQ_INFLIGHT;
        __blk_mq_dec_active_requests(hctx);
}

/*
 * Release @rq's driver tag, but only when the request holds both a driver
 * tag and an internal tag; in every other case this is a no-op.
 */
static inline void blk_mq_put_driver_tag(struct request *rq)
{
        if (rq->tag != BLK_MQ_NO_TAG && rq->internal_tag != BLK_MQ_NO_TAG)
                __blk_mq_put_driver_tag(rq->mq_hctx, rq);
}

static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
        int cpu;

        for_each_possible_cpu(cpu)
                qmap->mq_map[cpu] = 0;
}

/*
 * blk_mq_plug() - Get caller context plug
 * @q: request queue
 * @bio: the bio being submitted by the caller context
 *
 * By design, plugging may delay the insertion of BIOs into the elevator so
 * that more of them can be merged. When several contexts issue BIOs to one
 * device concurrently, this can make the insertion order differ from the
 * order in which submit_bio() was called, even if those contexts synchronize
 * to tightly control the issuing order. Regular block devices do not care,
 * but zoned block devices require sequential writes within a zone, so a
 * reordered write BIO can fail. To prevent that, the plug of the issuing
 * context is ignored when the target queue belongs to a zoned block device
 * and the BIO is a write operation.
 *
 * Return current->plug if the bio can be plugged and NULL otherwise
 */
static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
                                           struct bio *bio)
{
        /* Zoned block device write operation case: do not plug the BIO */
        if (blk_queue_is_zoned(q) && op_is_write(bio_op(bio)))
                return NULL;

        /*
         * Regular block device, or a read operation: hand back the context
         * plug, which may be NULL if blk_start_plug() was not executed.
         */
        return current->plug;
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 *
 * Returns true when @hctx may take another tag from @bt: either no fairness
 * limit applies, or the number of active requests is still below this
 * user's share (the depth divided by the user count, rounded up, and never
 * less than 4).
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
                                  struct sbitmap_queue *bt)
{
        unsigned int nr_users, fair_share;

        /* No hardware context, or tags not shared between queues: no limit. */
        if (!hctx)
                return true;
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return true;

        /*
         * Don't try dividing an ant
         */
        if (bt->sb.depth == 1)
                return true;

        if (!blk_mq_is_sbitmap_shared(hctx->flags)) {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return true;
                nr_users = atomic_read(&hctx->tags->active_queues);
        } else {
                struct request_queue *q = hctx->queue;

                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        return true;
                nr_users = atomic_read(&q->tag_set->active_queues_shared_sbitmap);
        }

        if (nr_users == 0)
                return true;

        /*
         * Allow at least some tags
         */
        fair_share = max((bt->sb.depth + nr_users - 1) / nr_users, 4U);
        return __blk_mq_active_requests(hctx) < fair_share;
}


#endif









































































































































































































    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H

/*
 * Architecture specific compatibility types
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>

#include <asm-generic/compat.h>

/*
 * Constants presented to compat tasks. NOTE(review): presumably the tick
 * rate and the uname machine string seen by 32-bit userspace -- confirm
 * against the users of these macros.
 */
#define COMPAT_USER_HZ                100
#define COMPAT_UTS_MACHINE        "i686\0\0"

/*
 * Fixed-width scalar types used by the compat structures in this header.
 * The widths are part of the structure layouts below; do not change them.
 */
typedef u16                __compat_uid_t;
typedef u16                __compat_gid_t;
typedef u32                __compat_uid32_t;
typedef u32                __compat_gid32_t;
typedef u16                compat_mode_t;
typedef u16                compat_dev_t;
typedef u16                compat_nlink_t;
typedef u16                compat_ipc_pid_t;
typedef u32                compat_caddr_t;
typedef __kernel_fsid_t        compat_fsid_t;

/*
 * stat record in the layout used for compat tasks. Members use fixed-width
 * or compat_* types, and each timestamp is a seconds/nanoseconds pair of
 * u32s. Member order and width are presumably dictated by the 32-bit
 * userspace ABI -- do not reorder without checking that ABI.
 */
struct compat_stat {
        u32                st_dev;
        compat_ino_t        st_ino;
        compat_mode_t        st_mode;
        compat_nlink_t        st_nlink;
        __compat_uid_t        st_uid;
        __compat_gid_t        st_gid;
        u32                st_rdev;
        u32                st_size;
        u32                st_blksize;
        u32                st_blocks;
        u32                st_atime;
        u32                st_atime_nsec;
        u32                st_mtime;
        u32                st_mtime_nsec;
        u32                st_ctime;
        u32                st_ctime_nsec;
        u32                __unused4;
        u32                __unused5;
};

/*
 * File-lock description in compat layout, with compat_off_t start/length.
 * See struct compat_flock64 for the variant using compat_loff_t.
 */
struct compat_flock {
        short                l_type;
        short                l_whence;
        compat_off_t        l_start;
        compat_off_t        l_len;
        compat_pid_t        l_pid;
};

/*
 * fcntl command numbers for the 64-bit lock operations (presumably the
 * values 32-bit userspace passes -- confirm against the i386 uapi headers).
 */
#define F_GETLK64        12        /*  using 'struct flock64' */
#define F_SETLK64        13
#define F_SETLKW64        14

/*
 * File-lock description with compat_loff_t start/length.
 *
 * IA32 uses 4 byte alignment for 64 bit quantities,
 * so we need to pack this structure.
 */
struct compat_flock64 {
        short                l_type;
        short                l_whence;
        compat_loff_t        l_start;
        compat_loff_t        l_len;
        compat_pid_t        l_pid;
} __attribute__((packed));

/*
 * Filesystem statistics record in compat layout. Every counter is a plain
 * int; f_spare[] is padding at the end of the record.
 */
struct compat_statfs {
        int                f_type;
        int                f_bsize;
        int                f_blocks;
        int                f_bfree;
        int                f_bavail;
        int                f_files;
        int                f_ffree;
        compat_fsid_t        f_fsid;
        int                f_namelen;        /* SunOS ignores this field. */
        int                f_frsize;
        int                f_flags;
        int                f_spare[4];
};

/* "No limit" resource-limit value as an all-ones 32-bit quantity. */
#define COMPAT_RLIM_INFINITY                0xffffffff

typedef u32                compat_old_sigset_t;        /* at least 32 bits */

/*
 * Compat signal-set geometry: _COMPAT_NSIG signals stored in words of
 * _COMPAT_NSIG_BPW bits each, with compat_sigset_word as the word type.
 */
#define _COMPAT_NSIG                64
#define _COMPAT_NSIG_BPW        32

typedef u32               compat_sigset_word;

/* Largest positive value of a signed 32-bit file offset. */
#define COMPAT_OFF_T_MAX        0x7fffffff

/*
 * IPC permission record in compat layout. It is embedded as the first
 * member of the semid64/msqid64/shmid64 structures below. The __pad members
 * follow each unsigned short field, and unused1/unused2 end the record.
 */
struct compat_ipc64_perm {
        compat_key_t key;
        __compat_uid32_t uid;
        __compat_gid32_t gid;
        __compat_uid32_t cuid;
        __compat_gid32_t cgid;
        unsigned short mode;
        unsigned short __pad1;
        unsigned short seq;
        unsigned short __pad2;
        compat_ulong_t unused1;
        compat_ulong_t unused2;
};

/*
 * Semaphore-set descriptor in compat layout. Each timestamp is split into a
 * base word and a *_high word (presumably the low and high 32 bits of a
 * 64-bit time value -- confirm against the conversion code).
 */
struct compat_semid64_ds {
        struct compat_ipc64_perm sem_perm;
        compat_ulong_t sem_otime;
        compat_ulong_t sem_otime_high;
        compat_ulong_t sem_ctime;
        compat_ulong_t sem_ctime_high;
        compat_ulong_t sem_nsems;
        compat_ulong_t __unused3;
        compat_ulong_t __unused4;
};

/*
 * Message-queue descriptor in compat layout. Each timestamp is split into a
 * base word and a *_high word (presumably the low and high 32 bits of a
 * 64-bit time value -- confirm against the conversion code).
 */
struct compat_msqid64_ds {
        struct compat_ipc64_perm msg_perm;
        compat_ulong_t msg_stime;
        compat_ulong_t msg_stime_high;
        compat_ulong_t msg_rtime;
        compat_ulong_t msg_rtime_high;
        compat_ulong_t msg_ctime;
        compat_ulong_t msg_ctime_high;
        compat_ulong_t msg_cbytes;
        compat_ulong_t msg_qnum;
        compat_ulong_t msg_qbytes;
        compat_pid_t   msg_lspid;
        compat_pid_t   msg_lrpid;
        compat_ulong_t __unused4;
        compat_ulong_t __unused5;
};

/*
 * Shared-memory segment descriptor in compat layout. Each timestamp is
 * split into a base word and a *_high word (presumably the low and high
 * 32 bits of a 64-bit time value -- confirm against the conversion code).
 */
struct compat_shmid64_ds {
        struct compat_ipc64_perm shm_perm;
        compat_size_t  shm_segsz;
        compat_ulong_t shm_atime;
        compat_ulong_t shm_atime_high;
        compat_ulong_t shm_dtime;
        compat_ulong_t shm_dtime_high;
        compat_ulong_t shm_ctime;
        compat_ulong_t shm_ctime_high;
        compat_pid_t   shm_cpid;
        compat_pid_t   shm_lpid;
        compat_ulong_t shm_nattch;
        compat_ulong_t __unused4;
        compat_ulong_t __unused5;
};

/*
 * The type of struct elf_prstatus.pr_reg in compatible core dumps.
 * It is an alias for the full struct user_regs_struct.
 */
typedef struct user_regs_struct compat_elf_gregset_t;

/*
 * Full regset -- prstatus on x32, otherwise on ia32
 *
 * PRSTATUS_SIZE(S, R): evaluates to 296 when R equals sizeof((S).pr_reg)
 * and to 144 for any other R.
 *
 * SET_PR_FPVALID(S, V, R): stores V into the int located R bytes past the
 * start of (S)->pr_reg. This relies on byte-granular arithmetic on void *,
 * which is a GNU C extension.
 *
 * Every macro argument is parenthesized in the expansion, so an argument
 * that contains a low-precedence operator (for example ?:) is evaluated as
 * one unit instead of being re-associated with the surrounding expression.
 */
#define PRSTATUS_SIZE(S, R) ((R) != sizeof((S).pr_reg) ? 144 : 296)
#define SET_PR_FPVALID(S, V, R) \
  do { *(int *) (((void *) &((S)->pr_reg)) + (R)) = (V); } \
  while (0)

/*
 * Only defined with CONFIG_X86_X32_ABI: evaluates to 1 when the current
 * task's saved orig_ax has __X32_SYSCALL_BIT set, and to 0 otherwise.
 */
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
        (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif

/*
 * Compute a user-space address @len bytes below the current task's saved
 * stack pointer, rounded down to a 16-byte boundary. For tasks without
 * TIF_IA32 an extra 128 bytes are skipped first.
 */
static inline void __user *arch_compat_alloc_user_space(long len)
{
        compat_uptr_t sp = task_pt_regs(current)->sp;

        /* -128 for the x32 ABI redzone */
        if (!test_thread_flag(TIF_IA32))
                sp -= 128;

        return (void __user *)round_down(sp - len, 16);
}

/*
 * True when the current task's saved orig_ax carries __X32_SYSCALL_BIT.
 * Always false when CONFIG_X86_X32_ABI is not set.
 */
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
        return (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) != 0;
#else
        return false;
#endif
}

/* True when the current syscall is either an ia32 or an x32 syscall. */
static inline bool in_32bit_syscall(void)
{
        if (in_ia32_syscall())
                return true;

        return in_x32_syscall();
}

#ifdef CONFIG_COMPAT
/*
 * x86 definition of in_compat_syscall(): a syscall counts as compat when it
 * is a 32-bit one, i.e. whatever in_32bit_syscall() reports.
 */
static inline bool in_compat_syscall(void)
{
        return in_32bit_syscall();
}
/*
 * Defining the name to itself lets code that tests it with #ifdef/#ifndef
 * see that this architecture supplies its own version.
 */
#define in_compat_syscall in_compat_syscall        /* override the generic impl */
/* The 64-bit alignment fixup is keyed on ia32 syscalls only. */
#define compat_need_64bit_alignment_fixup in_ia32_syscall
#endif

/* Forward declaration; only pointers to it are used in this header. */
struct compat_siginfo;

/*
 * With CONFIG_X86_X32_ABI an arch-specific copy_siginfo_to_user32() is
 * declared here, and the self-referential define announces the override to
 * code that checks for it with #ifdef/#ifndef.
 */
#ifdef CONFIG_X86_X32_ABI
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                const kernel_siginfo_t *from);
#define copy_siginfo_to_user32 copy_siginfo_to_user32
#endif /* CONFIG_X86_X32_ABI */

#endif /* _ASM_X86_COMPAT_H */




































   14 



   14 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS

#undef TRACE_SYSTEM
#define TRACE_SYSTEM preemptirq

#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PREEMPTIRQ_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>
#include <linux/string.h>
#include <asm/sections.h>

/*
 * Event class shared by the irq and preempt enable/disable events below.
 *
 * @ip and @parent_ip are not stored as full addresses: each is recorded as
 * a signed 32-bit offset from _stext. When the event is printed, _stext is
 * added back and the result is formatted with %pS.
 */
DECLARE_EVENT_CLASS(preemptirq_template,

        TP_PROTO(unsigned long ip, unsigned long parent_ip),

        TP_ARGS(ip, parent_ip),

        TP_STRUCT__entry(
                __field(s32, caller_offs)
                __field(s32, parent_offs)
        ),

        TP_fast_assign(
                __entry->caller_offs = (s32)(ip - (unsigned long)_stext);
                __entry->parent_offs = (s32)(parent_ip - (unsigned long)_stext);
        ),

        TP_printk("caller=%pS parent=%pS",
                  (void *)((unsigned long)(_stext) + __entry->caller_offs),
                  (void *)((unsigned long)(_stext) + __entry->parent_offs))
);

/*
 * irq_disable / irq_enable events, instantiated from preemptirq_template
 * when CONFIG_TRACE_IRQFLAGS is set. Without it, the trace_irq_*() calls
 * (including the _rcuidle variants) expand to nothing.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_EVENT(preemptirq_template, irq_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, irq_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#endif

/*
 * preempt_disable / preempt_enable events, instantiated from
 * preemptirq_template when CONFIG_TRACE_PREEMPT_TOGGLE is set. Without it,
 * the trace_preempt_*() calls (including the _rcuidle variants) expand to
 * nothing.
 */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
DEFINE_EVENT(preemptirq_template, preempt_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, preempt_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif

#endif /* _TRACE_PREEMPTIRQ_H */

#include <trace/define_trace.h>

#else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */
/*
 * Tracepoints compiled out: every irq/preempt trace call, including the
 * _rcuidle variants, expands to nothing so call sites need no #ifdefs.
 */
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif





































































































































































































































    1 





    1 








    1 
    1 


    1 



























    1 









    1 





    1 
































    1 
    1 



    1 







































































    1 





    1 

    1 








    1 






    1 
    1 


    1 







    1 






































































































    1 














































    1 





    1 
    1 

    1 

    1 
    1 



    1 










    1 

    1 
























    1 
    1 








































    1 

    1 













































    1 










    1 



    1 




    1 













































    1 





    1 


    1 

    1 
    1 




    1 

    1 


    1 

    1 
















































    1 



    1 











    1 






















































































































































































    1 




    1 


    1 







    1 



































    1 








    1 

    1 








    1 




























    1 



    1 






    1 

    1 





    1 


    1 










    1 




    1 



    1 









































































































































































































































































































































































































    1 























































































    1 
    1 
    1 












    1 

    1 



    1 













    1 

    1 











    1 















    1 
    1 
    1 







    1 









































    1 


















    1 


    1 



    1 


    1 
































































    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

/*
 * Bounds on the number of empty populated pages the balance work tries to
 * keep available for atomic allocations (see the comment on
 * pcpu_balance_work further down in this file).
 */
#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

/*
 * Translation between a regular kernel address inside the first chunk and
 * the corresponding percpu pointer.  On SMP the default is a fixed rebase
 * between pcpu_base_addr and __per_cpu_start; an architecture may supply its
 * own definitions, in which case the #ifndef guards below skip the defaults.
 */
#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
/* addr - pcpu_base_addr + __per_cpu_start, typed as a __percpu pointer */
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
/* exact inverse of the above: ptr + pcpu_base_addr - __per_cpu_start */
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

/*
 * Geometry of the percpu area.  All of these are __ro_after_init.
 * NOTE(review): presumably populated during first chunk setup, which is not
 * visible in this part of the file - confirm.
 */
/* pages per unit; multiplied by the unit number to form a flat page index */
static int pcpu_unit_pages __ro_after_init;
/* unit size in bytes; a free size equal to this maps to the last slot */
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
/* number of chunk list slots; slot pcpu_nr_slots - 1 is the last one */
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

/* both tables below are indexed by cpu number */
static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages by chunk type, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
/* gates pcpu_schedule_balance_work(); no work is queued while this is false */
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

/*
 * Queue pcpu_balance_work via schedule_work().  This is a no-op for as long
 * as pcpu_async_enabled is false.
 */
static void pcpu_schedule_balance_work(void)
{
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *start_addr, *end_addr;

        if (!chunk)
                return false;

        start_addr = chunk->base_addr + chunk->start_offset;
        end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                   chunk->end_offset;

        return addr >= start_addr && addr < end_addr;
}

/*
 * Map a size in bytes to a slot index based on the position of its highest
 * set bit.  The result is never smaller than 1.
 */
static int __pcpu_size_to_slot(int size)
{
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return max(slot, 1);
}

/*
 * Like __pcpu_size_to_slot(), except that a size equal to pcpu_unit_size is
 * always placed in the last slot (pcpu_nr_slots - 1).
 */
static int pcpu_size_to_slot(int size)
{
        if (size != pcpu_unit_size)
                return __pcpu_size_to_slot(size);

        return pcpu_nr_slots - 1;
}

/*
 * Compute the slot a chunk belongs on.  The slot is derived from the chunk
 * level contig_hint (kept in bits, hence the scaling by
 * PCPU_MIN_ALLOC_SIZE).  Chunks with less than one minimum allocation free,
 * or with no contiguous hint at all, go to slot 0.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        const struct pcpu_block_md *md = &chunk->chunk_md;

        if (chunk->free_bytes >= PCPU_MIN_ALLOC_SIZE && md->contig_hint != 0)
                return pcpu_size_to_slot(md->contig_hint *
                                         PCPU_MIN_ALLOC_SIZE);

        return 0;
}

/*
 * set the pointer to a chunk in a page struct
 *
 * The chunk pointer is stored, cast to unsigned long, in page->index; this
 * is the page -> chunk reverse mapping read back by pcpu_get_page_chunk().
 */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        page->index = (unsigned long)pcpu;
}

/*
 * obtain pointer to a chunk from a page struct
 *
 * Returns page->index reinterpreted as a chunk pointer.  The value is only
 * meaningful if it was stored there by pcpu_set_page_chunk().
 */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        return (struct pcpu_chunk *)page->index;
}

/*
 * Flat page index of page @page_idx within the unit that @cpu maps to:
 * units are laid out back to back, pcpu_unit_pages pages each.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/*
 * Byte offset of page @page_idx of @cpu's unit: the unit offset of @cpu
 * plus @page_idx pages.
 */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        unsigned long unit_off = pcpu_unit_offsets[cpu];

        return unit_off + (page_idx << PAGE_SHIFT);
}

/*
 * Address of page @page_idx of @cpu's unit inside @chunk, i.e. the chunk
 * base address plus pcpu_unit_page_offset().
 */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets and address offsets.
 */
/* Pointer to the first word of the allocation bitmap of block @index. */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
                                 BITS_PER_LONG];
}

/* Index of the metadata block that chunk bit offset @off falls into. */
static unsigned long pcpu_off_to_block_index(int off)
{
        unsigned long index = off / PCPU_BITMAP_BLOCK_BITS;

        return index;
}

/*
 * Offset of chunk bit offset @off within its block.  The mask form relies
 * on PCPU_BITMAP_BLOCK_BITS being a power of two.
 */
static unsigned long pcpu_off_to_block_off(int off)
{
        unsigned long block_off = off & (PCPU_BITMAP_BLOCK_BITS - 1);

        return block_off;
}

/* Chunk bit offset of in-block offset @off within block @index. */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

        return chunk_off;
}

/*
 * pcpu_next_hint - pick the offset a scan of @block should start from
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Scanning from first_free gives first fit behaviour and is the default.
 * The scan may instead start right after the scan_hint area when that area
 * is known to be too small for the request and the contig_hint, which
 * serves as the fallback, lies after it.
 *
 * RETURNS:
 * The block offset to begin scanning at.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan hint recorded */
        if (!block->scan_hint)
                return block->first_free;

        /*
         * The contig_hint is not after the scan_hint (possible when
         * contig_hint == scan_hint), so skipping ahead could miss it.
         */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request may still fit inside the scan_hint area */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 *
 * On entry *@bit_off is the chunk offset to resume from; *@bits is ignored
 * and reset to 0.  If the loop runs off the last block without returning,
 * *@bit_off and *@bits are left as set by the final iteration (the trailing
 * right_free area of the last block).
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        /*
                         * A non-zero *bits here is the right_free area
                         * carried over from the previous block; extend it
                         * with this block's leading free area and keep going
                         * only while whole blocks are free.
                         */
                        *bits += block->left_free;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                        return;
                }

                /*
                 * This checks three things.  First is there a contig_hint to
                 * check.  Second, have we checked this hint before by
                 * comparing the block_off.  Third, is this the same as the
                 * right contig hint.  In the last case, it spills over into
                 * the next block and should be handled by the contig area
                 * across blocks code.
                 */
                *bits = block->contig_hint;
                if (*bits && block->contig_hint_start >= block_off &&
                    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(i,
                                        block->contig_hint_start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Fall back to the free area at the end of this block; it is
                 * either reported after being merged with the next block's
                 * left_free, or left in place if this was the last block.
                 */
                *bits = block->right_free;
                *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * On entry *@bit_off is the chunk offset to resume from.  On success
 * *@bit_off and *@bits describe the candidate region.  On failure *@bit_off
 * is set to pcpu_chunk_map_bits(@chunk), which terminates
 * pcpu_for_each_fit_region.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        /*
                         * *bits is the aligned tail of the previous block;
                         * extend it with this block's leading free area.
                         */
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                }

                /* check block->contig_hint */
                /* *bits temporarily holds the padding needed to align it */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the span from the chosen scan start up to
                         * the end of an aligned allocation placed at the
                         * contig hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Try the free area at the end of the block: *bit_off is
                 * first the aligned in-block start of that area, *bits the
                 * number of bits from there to the end of the block, and
                 * *bit_off is then converted to a chunk offset.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * Both iterators stop once @bit_off reaches pcpu_chunk_map_bits(@chunk).
 * @bit_off and @bits must be lvalues: their addresses are passed to the
 * helper functions and they are modified on every step.
 */

/* advances by @bits + 1 before looking for the next free area */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

/* advances by @bits before looking for the next candidate region */
#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate zeroed memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  Requests of at most PAGE_SIZE are served by
 * kzalloc(); larger ones by __vmalloc() with __GFP_ZERO added to @gfp.
 * This is to facilitate passing through whitelisted flags.  The returned
 * memory is always zeroed.
 *
 * Warns once and fails if called before the slab allocator is available.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        if (size > PAGE_SIZE)
                return __vmalloc(size, gfp | __GFP_ZERO);

        return kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 * kvfree() is used because pcpu_mem_zalloc() may have obtained the memory
 * from either kzalloc() or __vmalloc().
 */
static void pcpu_mem_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * Move @chunk onto list @slot of the slot array matching its chunk type,
 * at the head if @move_front is true and at the tail otherwise.  The
 * reserved chunk is left alone.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        struct list_head *slots;

        if (chunk == pcpu_reserved_chunk)
                return;

        slots = pcpu_chunk_list(pcpu_chunk_type(chunk));

        if (!move_front) {
                list_move_tail(&chunk->list, &slots[slot]);
                return;
        }

        list_move(&chunk->list, &slots[slot]);
}

/*
 * Move @chunk to the front of list @slot.  Shorthand for
 * __pcpu_chunk_move() with move_front set; as there, the reserved chunk is
 * not moved.
 */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        __pcpu_chunk_move(chunk, slot, true);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * To be called after an allocation or free changed @chunk.  The slot that
 * matches the new state is computed and, if it differs from @oslot, @chunk
 * is moved there: to the front of the list when it moves to a higher slot,
 * to the tail otherwise.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot = pcpu_chunk_slot(chunk);

        if (nslot == oslot)
                return;

        __pcpu_chunk_move(chunk, nslot, nslot > oslot);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: change in the number of empty pages, may be negative
 *
 * Empty pages are tracked on the premise that a md_block covers a page.
 * The hint update functions notice when a block becomes full or is broken
 * and pass the resulting delta here.  The per chunk counter is always
 * adjusted; the global per chunk type counter is skipped for the reserved
 * chunk.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        chunk->nr_empty_pop_pages += nr;

        if (chunk == pcpu_reserved_chunk)
                return;

        pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Used to tell whether the hint region [a, b) and the allocated region
 * [x, y) share at least one bit.  Regions that merely touch do not overlap.
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /* the first region starts at or after the end of the second */
        if (a >= y)
                return false;

        /* otherwise they overlap iff the second starts before the first ends */
        return x < b;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 *
 * first_free, left_free and right_free are updated first.  The hint update
 * then depends on how the size of the area compares to the current
 * contig_hint: larger, equal or smaller.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        int contig = end - start;

        block->first_free = min(block->first_free, start);
        /* area touches the start of the block */
        if (start == 0)
                block->left_free = contig;

        /* area touches the end of the block */
        if (end == block->nr_bits)
                block->right_free = contig;

        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /*
                         * The new contig_hint starts at or before the old
                         * one; drop the scan_hint.
                         */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        } else if (contig == block->contig_hint) {
                /*
                 * Same size as the contig_hint.  __ffs() gives the index of
                 * the lowest set bit, so a larger value means a more aligned
                 * offset; offset 0 is treated as the best possible.
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * An allocation spot is found in two steps: pcpu_find_block_fit() picks a
 * block that can hold the allocation and pcpu_alloc_area() then scans for
 * the exact position.  With alignment requirements this can leave holes
 * behind that neither the alloc nor the free path would otherwise see.
 *
 * Given such a hole, feed it to pcpu_block_update() as it may change the
 * scan_hint.  The start of the hole is extended backwards to the bit after
 * the last allocated bit so free bits skipped for alignment are included.
 * Holes that extend past the end of their block are ignored.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        int hole_start = pcpu_off_to_block_off(bit_off);
        int hole_end = hole_start + bits;
        int blk_idx, last_set;

        if (hole_end > PCPU_BITMAP_BLOCK_BITS)
                return;

        blk_idx = pcpu_off_to_block_index(bit_off);

        /*
         * Look for the last allocated bit below the hole.  find_last_bit()
         * returns its size argument when no bit is set, in which case
         * everything from the start of the block is free.
         */
        last_set = find_last_bit(pcpu_index_alloc_map(chunk, blk_idx),
                                 hole_start);
        if (hole_start == last_set)
                hole_start = 0;
        else
                hole_start = last_set + 1;

        pcpu_block_update(chunk->md_blocks + blk_idx, hole_start, hole_end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Walks the metadata blocks to find the largest contig area.  On the
 * allocation path, where this is triggered by a broken contig_hint, a full
 * scan can be avoided: the scan_hint is before the contig_hint, or after it
 * if scan_hint == contig_hint, so it can be promoted and the walk resumed
 * right behind it.  On the free path a full scan cannot be avoided as the
 * largest area may span blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int bit_off, bits;

        if (full_scan || !md->scan_hint) {
                /* start over from the first free bit */
                bit_off = md->first_free;
                md->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint and resume after it */
                bit_off = md->scan_hint_start + md->scan_hint;
                md->contig_hint_start = md->scan_hint_start;
                md->contig_hint = md->scan_hint;
                md->scan_hint = 0;
        }

        bits = 0;
        pcpu_for_each_md_free_region(chunk, bit_off, bits)
                pcpu_block_update(md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint - rescan a block and rebuild its hints
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * If the block has a scan_hint it is promoted to contig_hint and the scan
 * resumes right after it; otherwise the scan begins at first_free with the
 * contig_hint cleared.  right_free is reset and every free region found in
 * the block's part of the allocation map is fed to pcpu_block_update().
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        struct pcpu_block_md *block = chunk->md_blocks + index;
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        unsigned int rs, re, start;        /* region start, region end */

        if (!block->scan_hint) {
                start = block->first_free;
                block->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint */
                start = block->scan_hint_start + block->scan_hint;
                block->contig_hint_start = block->scan_hint_start;
                block->contig_hint = block->scan_hint;
                block->scan_hint = 0;
        }

        block->right_free = 0;

        /* iterate over free areas and update the contig hints */
        bitmap_for_each_clear_region(alloc_map, rs, re, start,
                                     PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(block, rs, re);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 *
 * The allocation [@bit_off, @bit_off + @bits) may span several metadata
 * blocks.  The first block (s_block) and the last block (e_block) are
 * updated individually, every block strictly in between is marked as
 * having no free space, and finally the chunk level hints are fixed up.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the allocation */
        int s_off, e_off;        /* block offsets of the allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        /* s_block was entirely free before this allocation: one less empty page */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* the allocation landed on the scan_hint, so it is no longer valid */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                /* e_block was entirely free before this allocation as well */
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * reset the block: the allocation ends exactly on the
                         * block boundary, so advance e_block to have the loop
                         * below clear the last block too
                         */
                        e_block++;
                } else {
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /* update in-between md_blocks */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        /* account for the blocks that stopped being entirely free */
        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* drop the chunk scan_hint if the allocation overlaps it */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit() is handed @start as the bitmap size, so
                 * getting @start back means no bit is set below it and the
                 * block is free from its beginning up to the freed area.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /* update s_block */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* the free spans more than one block */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * In the else branch s_index == e_index, so both @start and @end are
         * offsets inside s_block.  The chunk level metadata works on chunk
         * offsets, hence both ends have to be translated; passing a block
         * relative @end would yield a bogus (negative) length for every
         * block but the first one.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * true if no unpopulated page is found in the pages covering the region.
 * Otherwise false, and @next_off is set to the allocation-unit offset that
 * corresponds to the end of the first unpopulated page range found, so that
 * pcpu_find_block_fit() can skip over it.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        unsigned int first_page = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        unsigned int last_page = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
        unsigned int hole_start = first_page;
        unsigned int hole_end;

        /* look for the first unpopulated range inside [first_page, last_page) */
        bitmap_next_clear_region(chunk->populated, &hole_start, &hole_end,
                                 last_page);
        if (hole_start < last_page) {
                *next_off = hole_end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
                return false;
        }

        return true;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Picks the bitmap offset at which the search for a free region should
 * begin.  Candidate regions are produced by iterating over the metadata
 * blocks; a region is only considered when the request fits the contig hint
 * of a block or of the chunk, so this is not strictly first fit.  The
 * shortcut trades completeness for less iteration: with poor alignment,
 * blocks and chunks holding valid free areas may be skipped.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int off, len, skip_to;

        /*
         * Bail out early when the request, including the padding needed to
         * align the start of the chunk's contig hint, does not fit in that
         * hint.  The assumption is that memory is tight in this case and a
         * new chunk is about to be created anyway.
         */
        off = ALIGN(md->contig_hint_start, align) - md->contig_hint_start;
        if (off + alloc_bits > md->contig_hint)
                return -1;

        off = pcpu_next_hint(md, alloc_bits);
        len = 0;
        pcpu_for_each_fit_region(chunk, alloc_bits, align, off, len) {
                if (!pop_only)
                        break;
                if (pcpu_is_populated(chunk, off, len, &skip_to))
                        break;

                /* not populated: resume the search behind the hole */
                off = skip_to;
                len = 0;
        }

        /* running off the end of the map means nothing fit */
        return (off == pcpu_chunk_map_bits(chunk)) ? -1 : off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * Works like bitmap_find_next_zero_area_off(), but additionally records the
 * largest free area that had to be passed over because it was too small.
 * The record is imperfect but generally good enough: it only covers failed
 * candidates and ignores bits skipped for the sake of alignment.
 * pcpu_block_update_scan() scans backwards to try and recover what was lost
 * to alignment.  Earlier free areas may be missed by a scan because of this,
 * but smaller allocations will eventually fill those holes.
 *
 * Returns the offset of the area found.  If the candidate would run past
 * @size, the candidate's end (a value greater than @size) is returned
 * instead.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long candidate, limit, blocker, skipped;

        for (;;) {
                /* next free bit, rounded up to the requested alignment */
                candidate = find_next_zero_bit(map, size, start);
                candidate = __ALIGN_MASK(candidate, align_mask);

                limit = candidate + nr;
                if (limit > size)
                        return limit;

                blocker = find_next_bit(map, limit, candidate);
                if (blocker >= limit)
                        return candidate;

                /*
                 * The candidate is too short.  Remember it if it is the
                 * largest one so far or, for equal sizes, the better
                 * aligned one.
                 */
                skipped = blocker - candidate;
                if (skipped > *largest_bits ||
                    (skipped == *largest_bits && *largest_off &&
                     (!candidate ||
                      __ffs(candidate) > __ffs(*largest_off)))) {
                        *largest_off = candidate;
                        *largest_bits = skipped;
                }

                /* retry behind the bit that got in the way */
                start = blocker + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Starting at @start, looks for room for @alloc_bits units aligned to
 * @align.  The allocation map has to be scanned because, when the request
 * fits within the block's contig hint, @start will be block->first_free;
 * this is an attempt to fill the allocation prior to breaking the contig
 * hint.  Once a valid free area is confirmed, the allocation map, the
 * boundary map and the hints are updated.
 *
 * Must be called with pcpu_lock held.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        unsigned long skipped_off = 0, skipped_bits = 0;
        size_t mask = 0;
        int found, limit, prev_slot;

        if (align)
                mask = align - 1;

        lockdep_assert_held(&pcpu_lock);

        prev_slot = pcpu_chunk_slot(chunk);

        /* the search window is bounded by the end of the chunk's map */
        limit = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                      pcpu_chunk_map_bits(chunk));
        found = pcpu_find_zero_area(chunk->alloc_map, limit, start,
                                    alloc_bits, mask, &skipped_off,
                                    &skipped_bits);
        if (found >= limit)
                return -1;

        /* feed the largest area skipped by the search back into the hints */
        if (skipped_bits)
                pcpu_block_update_scan(chunk, skipped_off, skipped_bits);

        /* mark the units as allocated */
        bitmap_set(chunk->alloc_map, found, alloc_bits);

        /* boundary bits delimit the area: set at both ends, clear inside */
        set_bit(found, chunk->bound_map);
        bitmap_clear(chunk->bound_map, found + 1, alloc_bits - 1);
        set_bit(found + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* the chunk's first free bit moves if it has just been consumed */
        if (md->first_free == found)
                md->first_free = find_next_zero_bit(chunk->alloc_map,
                                                    pcpu_chunk_map_bits(chunk),
                                                    found + alloc_bits);

        pcpu_block_update_hint_alloc(chunk, found, alloc_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return found * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The size of the allocation is derived from the boundary bitmap: the area
 * extends up to the next set boundary bit.  The matching range of the
 * allocation map is cleared and the chunk metadata is updated.
 *
 * Must be called with pcpu_lock held.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int first_bit = off / PCPU_MIN_ALLOC_SIZE;
        int next_bound, nr_bits, prev_slot, bytes;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        prev_slot = pcpu_chunk_slot(chunk);

        /* the area ends where the next boundary bit is set */
        next_bound = find_next_bit(chunk->bound_map,
                                   pcpu_chunk_map_bits(chunk), first_bit + 1);
        nr_bits = next_bound - first_bit;
        bytes = nr_bits * PCPU_MIN_ALLOC_SIZE;

        bitmap_clear(chunk->alloc_map, first_bit, nr_bits);

        /* update metadata */
        chunk->free_bytes += bytes;

        /* the freed area may now hold the first free bit of the chunk */
        if (first_bit < md->first_free)
                md->first_free = first_bit;

        pcpu_block_update_hint_free(chunk, first_bit, nr_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return bytes;
}

/*
 * Put @block into the "completely free" state for a block of @nr_bits bits:
 * the contig hint as well as the left and right free runs cover the whole
 * block, there is no scan hint and the first free bit is bit 0.
 * contig_hint_start and scan_hint_start are not written here.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        block->nr_bits = nr_bits;
        block->first_free = 0;

        block->contig_hint = nr_bits;
        block->scan_hint = 0;

        block->left_free = nr_bits;
        block->right_free = nr_bits;
}

/*
 * Initialize all block metadata of @chunk: the chunk level block, sized to
 * the chunk's whole bitmap, followed by every per-block entry of md_blocks,
 * each sized to PCPU_BITMAP_BLOCK_BITS.
 */
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        int i;

        /* init the chunk's block */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        for (i = 0; i != pcpu_chunk_nr_blocks(chunk); i++)
                pcpu_init_md_block(&chunk->md_blocks[i],
                                   PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * All memory comes from memblock_alloc(); a failed allocation results in a
 * panic().
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr, lcm_align;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        /* distance from the page aligned base to the served region */
        start_offset = tmp_addr - aligned_addr;

        /*
         * Align the end of the region with the LCM of PAGE_SIZE and
         * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
         * the other.
         */
        lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
        region_size = ALIGN(start_offset + map_size, lcm_align);

        /* allocate chunk */
        /* the trailing populated[] bitmap needs one bit per page of the region */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        INIT_LIST_HEAD(&chunk->list);

        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        /* padding between the end of the served region and the aligned end */
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->alloc_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* one extra bit: a boundary is also set at index region_bits below */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->bound_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->md_blocks)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

#ifdef CONFIG_MEMCG_KMEM
        /* first chunk isn't memcg-aware */
        chunk->obj_cgroups = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap */
        /* every page of this chunk is marked populated from the start */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free, not the alignment padding */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                chunk->chunk_md.first_free = offset_bits;

                /* treat the hidden head like an allocation to fix the hints */
                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                /* same for the hidden tail */
                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/*
 * Allocate a chunk descriptor of pcpu_unit_pages pages together with its
 * allocation map, boundary map and block metadata; with CONFIG_MEMCG_KMEM,
 * a memcg chunk @type additionally gets an obj_cgroups array with one slot
 * per map bit.  All allocations use @gfp.
 *
 * Returns the initialized chunk, or NULL if any allocation fails, in which
 * case everything allocated so far is released again.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
{
        struct pcpu_chunk *chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        size_t bytes;
        int map_bits;

        if (!chunk)
                return NULL;

        INIT_LIST_HEAD(&chunk->list);
        chunk->nr_pages = pcpu_unit_pages;
        map_bits = pcpu_chunk_map_bits(chunk);

        bytes = BITS_TO_LONGS(map_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->alloc_map)
                goto free_chunk;

        /* the boundary map carries one more bit than the allocation map */
        bytes = BITS_TO_LONGS(map_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->bound_map)
                goto free_alloc_map;

        bytes = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->md_blocks)
                goto free_bound_map;

#ifdef CONFIG_MEMCG_KMEM
        if (pcpu_is_memcg_chunk(type)) {
                bytes = map_bits * sizeof(struct obj_cgroup *);
                chunk->obj_cgroups = pcpu_mem_zalloc(bytes, gfp);
                if (!chunk->obj_cgroups)
                        goto free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(chunk);

        /* init metadata */
        chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

        return chunk;

        /* unwind in reverse order of allocation */
#ifdef CONFIG_MEMCG_KMEM
free_md_blocks:
        pcpu_mem_free(chunk->md_blocks);
#endif
free_bound_map:
        pcpu_mem_free(chunk->bound_map);
free_alloc_map:
        pcpu_mem_free(chunk->alloc_map);
free_chunk:
        pcpu_mem_free(chunk);

        return NULL;
}

/*
 * Release @chunk along with its metadata arrays (obj_cgroups when
 * CONFIG_MEMCG_KMEM is set, md_blocks, bound_map and alloc_map).
 * A NULL @chunk is silently ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef CONFIG_MEMCG_KMEM
                pcpu_mem_free(chunk->obj_cgroups);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population, with pcpu_lock held.
 *
 * The pages are marked in the populated bitmap, and the per-chunk and
 * global populated counters as well as the empty page accounting are
 * raised by the number of pages.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        int count = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        bitmap_set(chunk->populated, page_start, count);

        pcpu_nr_populated += count;
        chunk->nr_populated += count;

        pcpu_update_empty_pages(chunk, count);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation, with pcpu_lock held.
 *
 * The pages are cleared in the populated bitmap, and the per-chunk and
 * global populated counters as well as the empty page accounting are
 * lowered by the number of pages.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        int count = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        bitmap_clear(chunk->populated, page_start, count);

        pcpu_nr_populated -= count;
        chunk->nr_populated -= count;

        pcpu_update_empty_pages(chunk, -count);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk                - populate the specified range of a chunk
 * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
 * pcpu_create_chunk                - create a new chunk
 * pcpu_destroy_chunk                - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page                - translate address to its struct page
 * pcpu_verify_alloc_info        - check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
                                            gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * The first chunk and the reserved chunk are checked directly; any other
 * address is resolved through the page backing it.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        struct page *page;
        void *unit_addr;

        /* dynamic region of the first chunk */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;

        /* reserved region */
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which may be unused and therefore
         * unmapped.  Shift it into the unit of the current processor
         * before looking it up in the vmalloc space.  Any possible cpu id
         * will do, so neither preemption nor cpu hotplug is a concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];
        page = pcpu_addr_to_page(unit_addr);

        return pcpu_get_page_chunk(page);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Decide which chunk type serves an allocation of @size bytes and charge
 * the memory cgroup when accounting applies.
 *
 * Returns PCPU_CHUNK_ROOT when memcg kmem accounting is disabled, @gfp
 * lacks __GFP_ACCOUNT, or the current context has no obj_cgroup; @objcgp is
 * not written in these cases.  Returns PCPU_FAIL_ALLOC when charging
 * @size * num_possible_cpus() fails (the obj_cgroup reference is dropped).
 * Otherwise returns PCPU_CHUNK_MEMCG and hands the charged obj_cgroup,
 * including its reference, to the caller through @objcgp.
 */
static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                                     struct obj_cgroup **objcgp)
{
        struct obj_cgroup *cg;

        if (!(memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)))
                return PCPU_CHUNK_ROOT;

        cg = get_obj_cgroup_from_current();
        if (!cg)
                return PCPU_CHUNK_ROOT;

        if (obj_cgroup_charge(cg, gfp, size * num_possible_cpus()) != 0) {
                obj_cgroup_put(cg);
                return PCPU_FAIL_ALLOC;
        }

        *objcgp = cg;
        return PCPU_CHUNK_MEMCG;
}

/*
 * Finish the accounting started by pcpu_memcg_pre_alloc_hook().
 *
 * Nothing happens when @objcg is NULL.  With a non-NULL @chunk the
 * allocation succeeded: @objcg is stored in the chunk's obj_cgroups slot
 * for offset @off and MEMCG_PERCPU_B is increased by @size bytes per
 * possible cpu.  With a NULL @chunk the allocation failed: the charge is
 * returned and @objcg is put.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        size_t total;

        if (!objcg)
                return;

        total = size * num_possible_cpus();

        if (!chunk) {
                /* allocation failed, undo the pre-alloc charge */
                obj_cgroup_uncharge(objcg, total);
                obj_cgroup_put(objcg);
                return;
        }

        /* remember the owner so that the free hook can find it again */
        chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, total);
        rcu_read_unlock();
}

/*
 * Undo the accounting of an area of @size bytes at offset @off that is
 * being freed from @chunk.
 *
 * Chunks that are not of a memcg type are ignored.  Otherwise the
 * obj_cgroup recorded for @off is taken out of the chunk, the charge of
 * @size bytes per possible cpu is returned, MEMCG_PERCPU_B is decreased
 * by the same amount and the obj_cgroup is put.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        struct obj_cgroup **slot;
        struct obj_cgroup *owner;
        size_t total;

        if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
                return;

        /* detach the owner from the chunk before dropping the charge */
        slot = &chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
        owner = *slot;
        *slot = NULL;

        total = size * num_possible_cpus();
        obj_cgroup_uncharge(owner, total);

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(owner), MEMCG_PERCPU_B, -total);
        rcu_read_unlock();

        obj_cgroup_put(owner);
}

#else /* CONFIG_MEMCG_KMEM */
/*
 * Variant built when CONFIG_MEMCG_KMEM is not set: nothing is charged,
 * @objcgp is left untouched and PCPU_CHUNK_ROOT is always reported.
 */
static enum pcpu_chunk_type
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        return PCPU_CHUNK_ROOT;
}

/*
 * Variant built when CONFIG_MEMCG_KMEM is not set: intentionally empty,
 * there is no accounting to complete or roll back.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

/*
 * Variant built when CONFIG_MEMCG_KMEM is not set: intentionally empty,
 * freed areas carry no accounting.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * @size is rounded up to a multiple of PCPU_MIN_ALLOC_SIZE and @align is
 * raised to at least PCPU_MIN_ALLOC_SIZE.  Requests with a zero size, a
 * size above PCPU_MIN_UNIT_SIZE, an @align above PAGE_SIZE or an @align
 * that is not a power of two are rejected.
 *
 * Non-atomic allocations hold pcpu_alloc_mutex from before the search
 * until the area has been populated (or the allocation has failed);
 * the chunk search itself always runs under pcpu_lock.  On success the
 * area is zeroed for every possible cpu before it is returned.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        enum pcpu_chunk_type type;
        struct list_head *pcpu_slot;
        struct obj_cgroup *objcg = NULL;
        /* number of failure reports still allowed, shared by all callers */
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        /* anything short of the full GFP_KERNEL set is treated as atomic */
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* express size and alignment in PCPU_MIN_ALLOC_SIZE units */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /* charge the allocation (if accounted) and pick the chunk lists */
        type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
        if (unlikely(type == PCPU_FAIL_ALLOC))
                return NULL;
        pcpu_slot = pcpu_chunk_list(type);

        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        /* interrupted while waiting: undo the memcg charge */
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                /* no fit: low-slot chunks are moved to slot 0 */
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0)
                                goto area_found;

                }
        }

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /*
         * No space left.  Create a new chunk.  We don't want multiple
         * tasks to create chunks simultaneously.  Serialize and create iff
         * there's still no empty chunk after grabbing the mutex.
         */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        /* the last slot holds the empty chunks */
        if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
                chunk = pcpu_create_chunk(type, pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        /* pcpu_lock is held again here, as the search loop expects */
        goto restart;

area_found:
        /* reached with pcpu_lock held; @off is the offset inside @chunk */
        pcpu_stats_area_alloc(chunk, size);
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_start, page_end, rs, re;

                page_start = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                bitmap_for_each_clear_region(chunk->populated, rs, re,
                                             page_start, page_end) {
                        WARN_ON(chunk->immutable);

                        /* populate without pcpu_lock, then record it under the lock */
                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                /* give the area back; fail_unlock drops pcpu_lock */
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* running low on empty populated pages: let the balance work refill */
        if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
                        chunk->base_addr, off, ptr);

        /* success: attach the charged obj_cgroup (if any) to the area */
        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        return ptr;

fail_unlock:
        /* entered with pcpu_lock held */
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        /* entered without pcpu_lock; @err describes the failure */
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        if (!is_atomic && do_warn && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }
        if (is_atomic) {
                /* see the flag handling in __pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* failure: return the memcg charge taken by the pre-alloc hook */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail. If @gfp
 * has __GFP_NOWARN then no warning will be triggered on invalid or failed
 * allocation requests.
 *
 * This is a thin wrapper around pcpu_alloc() that never asks for the
 * reserved chunk.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
        /* reserved == false: do not request the reserved chunk */
        return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
        /* non-reserved allocation with the default GFP_KERNEL flags */
        return __alloc_percpu_gfp(size, align, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from reserved percpu area if arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
        /* reserved == true: prefer the reserved chunk when one exists */
        return pcpu_alloc(size, align, true, GFP_KERNEL);
}

/**
 * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @type: chunk type
 *
 * Reclaim all fully free chunks except for the first one.  This is also
 * responsible for maintaining the pool of empty populated pages.  However,
 * it is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * The whole function runs with pcpu_alloc_mutex held; pcpu_lock is only
 * taken around list and population-state updates.
 */
static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        /* chunks unlinked from the free slot, destroyed outside pcpu_lock */
        LIST_HEAD(to_free);
        struct list_head *pcpu_slot = pcpu_chunk_list(type);
        /* the last slot is the list of fully free chunks */
        struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
        struct pcpu_chunk *chunk, *next;
        int slot, nr_to_pop, ret;

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;

                list_move(&chunk->list, &to_free);
        }

        spin_unlock_irq(&pcpu_lock);

        /* depopulate and destroy the collected chunks without pcpu_lock */
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                bitmap_for_each_set_region(chunk->populated, rs, re, 0,
                                           chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* top the pool up to the high watermark, never below zero */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages[type],
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /* find the first chunk in this slot with unpopulated pages */
                spin_lock_irq(&pcpu_lock);
                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }
                spin_unlock_irq(&pcpu_lock);

                /* nr_unpop is still zero when no such chunk was found */
                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
                                             chunk->nr_pages) {
                        /* populate at most what is still needed from this region */
                        int nr = min_t(int, re - rs, nr_to_pop);

                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        if (!ret) {
                                nr_to_pop -= nr;
                                spin_lock_irq(&pcpu_lock);
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                                spin_unlock_irq(&pcpu_lock);
                        } else {
                                /* population failed: stop trying for this pass */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                chunk = pcpu_create_chunk(type, gfp);
                if (chunk) {
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_relocate(chunk, -1);
                        spin_unlock_irq(&pcpu_lock);
                        goto retry_pop;
                }
        }

        mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Call __pcpu_balance_workfn() for each chunk type.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        enum pcpu_chunk_type type;

        for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
                __pcpu_balance_workfn(type);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.  A NULL @ptr is silently ignored.  The area is
 * released under pcpu_lock; the balance work is scheduled afterwards when
 * the owning chunk became fully free and another chunk is already sitting
 * in the free slot.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        struct pcpu_chunk *chunk, *other;
        struct list_head *slots;
        unsigned long irq_flags;
        bool kick_balance = false;
        void *addr;
        int freed, off;

        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        addr = __pcpu_ptr_to_addr(ptr);

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->base_addr;

        freed = pcpu_free_area(chunk, off);

        slots = pcpu_chunk_list(pcpu_chunk_type(chunk));

        pcpu_memcg_free_hook(chunk, off, freed);

        /* if there are more than one fully free chunks, wake up grim reaper */
        if (chunk->free_bytes == pcpu_unit_size) {
                list_for_each_entry(other, &slots[pcpu_nr_slots - 1], list) {
                        if (other == chunk)
                                continue;

                        kick_balance = true;
                        break;
                }
        }

        trace_percpu_free_percpu(chunk->base_addr, off, ptr);

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);

        if (kick_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside the in-kernel static percpu area of any
 * possible cpu.  On a hit, and when @can_addr is not NULL, *@can_addr
 * receives the matching address inside the boot cpu's copy (same offset
 * from the start of the unit).  Without CONFIG_SMP the answer is always
 * false.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        void *va = (void *)addr;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *unit_start = per_cpu_ptr(base, cpu);

                /* outside this cpu's static area, try the next one */
                if (va < unit_start || va >= unit_start + static_size)
                        continue;

                if (can_addr)
                        *can_addr = (unsigned long) (va - unit_start) +
                                    (unsigned long)
                                        per_cpu_ptr(base, get_boot_cpu_id());

                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        /* NULL: the caller has no use for the canonical address */
        return __is_kernel_percpu_address(addr, NULL);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * @addr is a dereferenceable address obtained through one of the percpu
 * access macros; this function returns the physical address behind it.
 * Keeping @addr valid until the function returns is the caller's job.
 *
 * The first chunk is set up specially by the percpu allocator and is
 * currently either embedded in the linear address space or mapped through
 * vmalloc.  From the second chunk on, the backing allocator (currently
 * either vm or km) provides the translation.
 *
 * The translation could be done without checking whether @addr falls into
 * the first chunk.  The check is kept because it mirrors how the percpu
 * allocator actually works and because the verification can uncover bugs
 * both in the allocator itself and in per_cpu_ptr_to_phys() callers.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        const unsigned long va = (unsigned long)addr;
        unsigned long first_low, first_high;
        bool in_first_chunk = false;
        unsigned int cpu;

        /*
         * The range test against first_low/first_high is not strictly
         * required, but it makes lookups of addresses outside the first
         * chunk faster.
         *
         * The comparison uses full chunk sizes.  pcpu_base_addr points to
         * the beginning of the first chunk including the static region.
         * Good intent is assumed as the first chunk may not be full
         * (ie. < pcpu_unit_pages in size).
         */
        first_low = (unsigned long)pcpu_base_addr +
                    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        first_high = (unsigned long)pcpu_base_addr +
                     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

        if (va >= first_low && va < first_high) {
                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(base, cpu);

                        if (addr < start || addr >= start + pcpu_unit_size)
                                continue;

                        in_first_chunk = true;
                        break;
                }
        }

        /* not in the first chunk: the backing allocator knows the page */
        if (!in_first_chunk)
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk mapped through vmalloc */
        if (is_vmalloc_addr(addr))
                return page_to_phys(vmalloc_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk embedded in the linear mapping */
        return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate an ai with room for @nr_groups groups followed by a cpu_map
 * array of @nr_units entries.  groups[0].cpu_map of the returned ai
 * points at that array, and every entry is preset to NR_CPUS.  Setting up
 * the cpu_map pointer of the remaining groups is left to the caller.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t hdr_size, total_size;
        void *mem;
        int i = 0;

        /* header plus groups, padded so that the cpu_map array is aligned */
        hdr_size = ALIGN(struct_size(ai, groups, nr_groups),
                         __alignof__(ai->groups[0].cpu_map[0]));
        total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        mem = memblock_alloc(PFN_ALIGN(total_size), PAGE_SIZE);
        if (!mem)
                return NULL;

        ai = mem;
        ai->nr_groups = nr_groups;
        /* remembered so that pcpu_free_alloc_info() can free the same size */
        ai->__ai_size = PFN_ALIGN(total_size);

        /* the cpu_map array sits right behind the header */
        ai->groups[0].cpu_map = mem + hdr_size;
        while (i < nr_units)
                ai->groups[0].cpu_map[i++] = NR_CPUS;

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().  The size
 * handed back to memblock is the one recorded in @ai->__ai_size.
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        memblock_free_early(__pa(ai), ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print a summary line with the sizes stored in @ai, followed by the
 * unit-to-cpu mapping of every group, using loglevel @lvl.  Units whose
 * cpu_map entry is NR_CPUS are shown as dashes.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        char empty_str[] = "--------";
        int group_width = 1, cpu_width = 1;
        int alloc = 0, alloc_end = 0;
        int upa, apl;        /* units per alloc, allocs per line */
        int width, group, v;

        /* one extra column per further division by ten of the group count */
        for (v = ai->nr_groups / 10; v; v /= 10)
                group_width++;

        /* same for the number of possible cpus */
        v = num_possible_cpus();
        for (v /= 10; v; v /= 10)
                cpu_width++;

        /* cut the placeholder down to the cpu column width */
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        width = upa * (cpu_width + 1) + group_width + 3;
        apl = rounddown_pow_of_two(max(60 / width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0, unit_end = 0;

                BUG_ON(gi->nr_units % upa);

                /* @alloc keeps counting across groups */
                alloc_end += gi->nr_units / upa;
                while (alloc < alloc_end) {
                        /* start a fresh output line every @apl allocs */
                        if (alloc % apl == 0) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        unit_end += upa;
                        while (unit < unit_end) {
                                if (gi->cpu_map[unit] == NR_CPUS)
                                        pr_cont("%s ", empty_str);
                                else
                                        pr_cont("%0*d ",
                                                cpu_width, gi->cpu_map[unit]);
                                unit++;
                        }

                        alloc++;
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        struct pcpu_chunk *chunk;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        int map_size;
        unsigned long tmp_addr;
        size_t alloc_size;
        enum pcpu_chunk_type type;

#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!ai->dyn_size);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /* process group information and build config tables accordingly */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_offsets)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_sizes)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_off)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        if (cpu == NR_CPUS)
                                continue;

                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;

        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        pcpu_chunk_struct_size = struct_size(chunk, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The additional last slot is for
         * empty chunks.
         */
        pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
        pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]) *
                                          PCPU_NR_CHUNK_TYPES,
                                          SMP_CACHE_BYTES);
        if (!pcpu_chunk_lists)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
                      PCPU_NR_CHUNK_TYPES);

        for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
                for (i = 0; i < pcpu_nr_slots; i++)
                        INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk.
         * If the reserved_size is non-zero, this initializes the reserved
         * chunk.  If the reserved_size is zero, the reserved chunk is NULL
         * and the dynamic region is initialized here.  The first chunk,
         * pcpu_first_chunk, will always point to the chunk that serves
         * the dynamic region.
         */
        tmp_addr = (unsigned long)base_addr + static_size;
        map_size = ai->reserved_size ?: dyn_size;
        chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);

        /* init dynamic chunk if necessary */
        if (ai->reserved_size) {
                pcpu_reserved_chunk = chunk;

                tmp_addr = (unsigned long)base_addr + static_size +
                           ai->reserved_size;
                map_size = dyn_size;
                chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
        }

        /* link the first chunk in */
        pcpu_first_chunk = chunk;
        pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/*
 * Printable names of the first chunk allocator kinds, indexed by the
 * PCPU_FC_* enumerators.  "embed" and "page" are also the strings
 * accepted by the percpu_alloc= early parameter below.
 */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/*
 * First chunk allocator requested through the percpu_alloc= early
 * parameter; stays PCPU_FC_AUTO unless percpu_alloc_setup() recognizes
 * the requested name.
 */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Parse the "percpu_alloc=" early parameter.
 *
 * Recognized values are "embed" and "page", each only when the matching
 * first chunk allocator is configured in.  A recognized value is stored
 * in pcpu_chosen_fc; anything else only triggers a warning.
 *
 * Returns -EINVAL when no value was supplied, 0 otherwise (including
 * for unknown allocator names).
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        if (!strcmp(str, "embed")) {
                pcpu_chosen_fc = PCPU_FC_EMBED;
                return 0;
        }
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        if (!strcmp(str, "page")) {
                pcpu_chosen_fc = PCPU_FC_PAGE;
                return 0;
        }
#endif

        /* nothing matched (or no allocator is selectable in this config) */
        pr_warn("unknown allocator %s specified\n", str);
        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if it is needed by the arch config or if the generic setup
 * is going to be used (i.e. CONFIG_HAVE_SETUP_PER_CPU_AREA is not set).
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() if and only if the arch config needs it */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * When @cpu_distance_fn is NULL, no CPU is ever considered distant and
 * all possible CPUs end up in a single group.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
        int upa, max_upa, best_upa;        /* units_per_alloc */
        int last_allocs, group, unit;
        unsigned int cpu, tcpu;
        struct pcpu_alloc_info *ai;
        unsigned int *cpu_map;

        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));

        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                            max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        /* whatever the page alignment added is handed to the dynamic area */
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
         * which can accommodate page aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        /*
         * Group cpus according to their proximity.  Each cpu starts in
         * group 0 and is bumped to the next group whenever an earlier
         * cpu already placed in the candidate group is not local to it;
         * the scan over earlier cpus is then restarted for the new group.
         */
        for_each_possible_cpu(cpu) {
                group = 0;
        next_group:
                for_each_possible_cpu(tcpu) {
                        /* only cpus processed before @cpu have a group yet */
                        if (cpu == tcpu)
                                break;
                        if (group_map[tcpu] == group && cpu_distance_fn &&
                            (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
                             cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
                                group++;
                                nr_groups = max(nr_groups, group + 1);
                                goto next_group;
                        }
                }
                group_map[cpu] = group;
                group_cnt[group]++;
        }

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        /*
         * best_upa is only assigned inside the loop below.  Start from 0
         * so that a search which accepts nothing is caught by the BUG_ON()
         * after the loop instead of silently using an indeterminate value.
         */
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                /* units must evenly split the allocation on page boundaries */
                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        /* a zero upa would divide by zero below; fail loudly instead */
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        /* carve the shared cpu_map array into one slice per group */
        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                /* pad the group to a whole number of allocations */
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (i.e. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_alloc_fn_t alloc_fn,
                                  pcpu_fc_free_fn_t free_fn)
{
        /* start at the top so that min() below settles on the lowest area */
        void *base = (void *)ULONG_MAX;
        /* areas[group] holds the address allocated for each group */
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit that are actually used */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /* pick the first unit of the group that maps to a real cpu */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_free(ptr);
                areas[group] = ptr;

                /* track the lowest area and the group with the highest one */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* span from the lowest area to the end of the highest group */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                free_fn(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        free_fn(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /* success: ai and areas are temporary, so share the cleanup below */
        goto out_free;

out_free_areas:
        /*
         * Give back every group area obtained so far.  The NULL check
         * skips groups that were never allocated.
         * NOTE(review): this relies on memblock_alloc() returning zeroed
         * memory for @areas - confirm.
         */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        free_fn(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free_early(__pa(areas), areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * Failure to allocate the page pointer array or to map the pages
 * panics instead of returning.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size,
                                 pcpu_fc_alloc_fn_t alloc_fn,
                                 pcpu_fc_free_fn_t free_fn,
                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
        /*
         * NOTE(review): static, presumably because the area registered
         * with vm_area_register_early() must outlive this function -
         * confirm.
         */
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB, used only in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        /* no distance callback is passed, so a single group is expected */
        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        BUG_ON(ai->nr_groups != 1);
        /* group 0 should hold all possible cpus rounded up to whole allocs */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
        if (!pages)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pages_size);

        /* allocate pages; j counts the pages stored in @pages so far */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_free(ptr);
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                for (i = 0; i < unit_pages; i++)
                        populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                /*
                 * FIXME: Archs with virtual cache should flush local
                 * cache for the linear mapping here - something
                 * equivalent to flush_cache_vmap() on the local cpu.
                 * flush_cache_vmap() can't be used as most supporting
                 * data structures are not set up yet.
                 */

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /* success: skip the page unwind, only drop the temporary metadata */
        goto out_free_ar;

enomem:
        /* unwind: free the pages allocated so far, most recent first */
        while (--j >= 0)
                free_fn(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        memblock_free_early(__pa(pages), pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
/*
 * Per-cpu offset table, indexed by cpu number.  The generic
 * setup_per_cpu_areas() below fills in the entry of every possible cpu.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Default first chunk allocation callback for the generic setup.
 * @cpu is not used; the area is taken from memblock at or above
 * __pa(MAX_DMA_ADDRESS).  Returns whatever memblock_alloc_from()
 * returns.
 */
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
                                       size_t align)
{
        void *area;

        area = memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));

        return area;
}

/*
 * Default first chunk free callback for the generic setup: hands @size
 * bytes starting at virtual address @ptr back to memblock.
 */
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
        /* memblock_free_early() is given the physical address of @ptr */
        memblock_free_early(__pa(ptr), size);
}

/*
 * Generic SMP setup: embed the first chunk using the default memblock
 * callbacks, then record each possible cpu's offset in __per_cpu_offset[].
 * Panics if the first chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
        unsigned long base_delta;
        unsigned int cpu;
        int err;

        /*
         * The module percpu reserve is always requested, as the legacy
         * allocator did.
         */
        err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                     PERCPU_DYNAMIC_RESERVE,
                                     PAGE_SIZE,
                                     NULL,
                                     pcpu_dfl_fc_alloc,
                                     pcpu_dfl_fc_free);
        if (err < 0)
                panic("Failed to initialize percpu areas.");

        /* distance from the static percpu section to the first chunk base */
        base_delta = (unsigned long)pcpu_base_addr -
                     (unsigned long)__per_cpu_start;

        for_each_possible_cpu(cpu)
                __per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
/*
 * UP setup: build a one-group, one-unit allocation info whose whole unit
 * is dynamic area, back it with a single memblock allocation and hand it
 * to pcpu_setup_first_chunk().  Panics if either allocation fails.
 */
void __init setup_per_cpu_areas(void)
{
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *info;
        void *chunk_mem;

        info = pcpu_alloc_alloc_info(1, 1);
        chunk_mem = memblock_alloc_from(unit_size, PAGE_SIZE,
                                        __pa(MAX_DMA_ADDRESS));
        if (!(info && chunk_mem))
                panic("Failed to allocate memory for percpu areas.");

        /* kmemleak tracks the percpu allocations separately */
        kmemleak_free(chunk_mem);

        /* the single unit maps to cpu 0 */
        info->groups[0].nr_units = 1;
        info->groups[0].cpu_map[0] = 0;

        /* every size parameter equals the unit size */
        info->alloc_size = unit_size;
        info->atom_size = unit_size;
        info->unit_size = unit_size;
        info->dyn_size = unit_size;

        pcpu_setup_first_chunk(info, chunk_mem);
        pcpu_free_alloc_info(info);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * The result is the populated page count multiplied by the number of
 * units.  Metadata is deliberately left out of the number exposed in
 * meminfo: the number of backing pages scales with the number of cpus
 * and can quickly outweigh the memory used for metadata, and leaving it
 * out keeps this calculation simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        const unsigned long total = pcpu_nr_populated * pcpu_nr_units;

        return total;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * Registered as a subsys_initcall; sets pcpu_async_enabled and always
 * returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 
    4 


    3 

    3 


    3 

    3 



    3 

    2 


    3 
    3 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

/*
 * Fallback for architectures that do not supply their own
 * arch_mmap_check(): the macro ignores its arguments and evaluates to 0.
 */
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

/*
 * Lower bound, upper bound and current value of mmap_rnd_bits, each
 * initialised from the matching CONFIG_ARCH_MMAP_RND_BITS* build-time
 * constant.  Only built when CONFIG_HAVE_ARCH_MMAP_RND_BITS is set.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
/*
 * Same triple for the compat variant, initialised from the
 * CONFIG_ARCH_MMAP_RND_COMPAT_BITS* constants.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

/*
 * Boolean knob registered as the core parameter "ignore_rlimit_data"
 * with permissions 0644.  It has static storage, so it starts out false.
 * NOTE(review): presumably relaxes RLIMIT_DATA enforcement - confirm at
 * the sites that read this flag.
 */
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

/* Forward declaration; the definition is not in this part of the file. */
static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end);

/*
 * Description of the effects of mapping type and prot in the current
 * implementation.  This is due to the limited x86 page protection
 * hardware.  The expected behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * The table is indexed by
 * vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED), see vm_get_page_prot().
 * The eight __Pxxx entries fill slots 0-7 and the eight __Sxxx entries
 * fill slots 8-15.
 */
pgprot_t protection_map[16] __ro_after_init = {
        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
        __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

/*
 * Identity fallback used when CONFIG_ARCH_HAS_FILTER_PGPROT is not set:
 * the protection value is returned unchanged.
 */
#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
        return prot;
}
#endif

/*
 * Build the page protection that corresponds to @vm_flags.
 *
 * The read/write/exec/shared bits of @vm_flags select an entry of
 * protection_map[]; that value is OR-ed with whatever
 * arch_vm_get_page_prot() contributes for the same flags, and the
 * combination is finally passed through arch_filter_pgprot().
 *
 * Returns the filtered pgprot_t.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        unsigned long idx = vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED);
        pgprot_t base = protection_map[idx];
        pgprot_t arch = arch_vm_get_page_prot(vm_flags);

        return arch_filter_pgprot(__pgprot(pgprot_val(base) |
                                           pgprot_val(arch)));
}
EXPORT_SYMBOL(vm_get_page_prot);

/*
 * Derive the protection implied by @vm_flags and merge it into @oldprot
 * via pgprot_modify().  Returns the merged value.
 */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        pgprot_t newprot = vm_get_page_prot(vm_flags);

        return pgprot_modify(oldprot, newprot);
}

/*
 * Recompute vma->vm_page_prot from vma->vm_flags.
 *
 * If vma_wants_writenotify() says so for the freshly computed protection,
 * the protection is computed a second time with VM_SHARED masked out of
 * the flags.  vma->vm_flags itself is not modified.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot))
                prot = vm_pgprot_modify(prot, flags & ~VM_SHARED);

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * Detach @vma from @mapping's i_mmap interval tree.
 *
 * @vma:     the vma being removed
 * @file:    the file backing @vma
 * @mapping: the address_space whose i_mmap tree holds @vma
 *
 * Before the tree removal, write access to @file is re-allowed if the vma
 * had VM_DENYWRITE set, and mapping_unmap_writable() is called if
 * vma_is_shared_maywrite() holds for the vma.
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
{
        if (vma->vm_flags & VM_DENYWRITE)
                allow_write_access(file);
        if (vma_is_shared_maywrite(vma))
                mapping_unmap_writable(mapping);

        /* The tree update itself is bracketed by the dcache mmap lock. */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Take a file-backed vma out of its mapping's interval tree so that rmap
 * and vmtruncate no longer see it before its page tables are freed.
 *
 * A vma without a backing file is left untouched.  Otherwise the removal
 * is done with the mapping's i_mmap lock held for writing.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;
        i_mmap_lock_write(mapping);
        __remove_shared_vm_struct(vma, file, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Tear down a single vma: close it, drop the reference on its backing
 * file (if there is one), release its memory policy and free the
 * structure.  May sleep.
 *
 * Returns the vma that followed @vma in the vm_next list, sampled before
 * @vma is freed.
 */
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *successor = vma->vm_next;
        struct file *backing;

        might_sleep();
        vma_close(vma);

        backing = vma->vm_file;
        if (backing)
                fput(backing);

        mpol_put(vma_policy(vma));
        vm_area_free(vma);

        return successor;
}

static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
                struct list_head *uf);
/*
 * brk() system call: move the end of the data segment to @brk.
 *
 * Returns @brk when the request was honoured, the unchanged previous
 * break (origbrk) when it was refused or failed, or -EINTR if taking
 * mmap_lock for writing was interrupted by a fatal signal.
 *
 * The request is refused when @brk lies below the minimum break, when
 * check_data_rlimit() rejects it, when growing would run into the next
 * mapping (including its gap), or when do_brk_flags()/__do_munmap() fail.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long retval;
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *next;
        unsigned long min_brk;
        bool populate;
        bool downgraded = false;
        LIST_HEAD(uf);

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remember the current break so every failure path can report it. */
        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later after the test
         * of oldbrk with newbrk then it can escape the test and let the data
         * segment grow beyond its set limit in the case where the limit is
         * not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        /* Old and new break in the same page: nothing to map or unmap. */
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /*
         * Always allow shrinking brk.
         * __do_munmap() may downgrade mmap_lock to read.
         */
        if (brk <= mm->brk) {
                int ret;

                /*
                 * mm->brk must be protected by write mmap_lock so update it
                 * before downgrading mmap_lock. When __do_munmap() fails,
                 * mm->brk will be restored from origbrk.
                 */
                mm->brk = brk;
                ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
                if (ret < 0) {
                        mm->brk = origbrk;
                        goto out;
                } else if (ret == 1) {
                        /* Lock is now held for read; unlock accordingly below. */
                        downgraded = true;
                }
                goto success;
        }

        /* Check against existing mmap mappings. */
        next = find_vma(mm, oldbrk);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        /* Ok, looks good - let it rip. */
        if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
                goto out;
        mm->brk = brk;

success:
        /* Sample def_flags while mmap_lock is still held. */
        populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
        if (downgraded)
                mmap_read_unlock(mm);
        else
                mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        /* Only a grown break with VM_LOCKED in def_flags is populated. */
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Failure: report the break as it was on entry. */
        retval = origbrk;
        mmap_write_unlock(mm);
        return retval;
}

/*
 * Size of the free gap immediately below @vma.
 *
 * With no predecessor the result is vm_start_gap(vma).  Otherwise it is
 * the distance from vm_end_gap() of the predecessor up to
 * vm_start_gap(vma), or 0 when the two meet or overlap.
 */
static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
        unsigned long start = vm_start_gap(vma);
        unsigned long prev_end;

        /*
         * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
         * allow two stack_guard_gaps between them here, and when choosing
         * an unmapped area; whereas when expanding we only require one.
         * That's a little inconsistent, but keeps the code here simpler.
         */
        if (!vma->vm_prev)
                return start;

        prev_end = vm_end_gap(vma->vm_prev);
        return start > prev_end ? start - prev_end : 0;
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Recompute what @vma's rb_subtree_gap should be: the largest of the
 * vma's own gap and the cached rb_subtree_gap of its left and right
 * rbtree children, where present.
 */
static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
        unsigned long largest = vma_compute_gap(vma);
        unsigned long child_gap;
        struct rb_node *child;

        child = vma->vm_rb.rb_left;
        if (child) {
                child_gap = rb_entry(child, struct vm_area_struct,
                                     vm_rb)->rb_subtree_gap;
                if (child_gap > largest)
                        largest = child_gap;
        }

        child = vma->vm_rb.rb_right;
        if (child) {
                child_gap = rb_entry(child, struct vm_area_struct,
                                     vm_rb)->rb_subtree_gap;
                if (child_gap > largest)
                        largest = child_gap;
        }

        return largest;
}

/*
 * Debug walk over @mm's vma rbtree.
 *
 * Visits the nodes in rb_first()/rb_next() order and reports, via
 * pr_emerg(), any vma whose vm_start is below the previous vma's vm_start
 * or vm_end, whose vm_start exceeds its own vm_end, or whose cached
 * rb_subtree_gap differs from a fresh computation.  It then walks back
 * from the last node with rb_prev() and checks that both directions see
 * the same number of nodes.
 *
 * Returns the number of nodes visited, or -1 if any check failed.
 */
static int browse_rb(struct mm_struct *mm)
{
        struct rb_root *root = &mm->mm_rb;
        int i = 0, j, bug = 0;
        struct rb_node *nd, *pn = NULL;
        unsigned long prev = 0, pend = 0;

        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
                struct vm_area_struct *vma;
                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
                if (vma->vm_start < prev) {
                        pr_emerg("vm_start %lx < prev %lx\n",
                                  vma->vm_start, prev);
                        bug = 1;
                }
                if (vma->vm_start < pend) {
                        pr_emerg("vm_start %lx < pend %lx\n",
                                  vma->vm_start, pend);
                        bug = 1;
                }
                if (vma->vm_start > vma->vm_end) {
                        pr_emerg("vm_start %lx > vm_end %lx\n",
                                  vma->vm_start, vma->vm_end);
                        bug = 1;
                }
                /* The gap comparison is done under mm->page_table_lock. */
                spin_lock(&mm->page_table_lock);
                if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
                        pr_emerg("free gap %lx, correct %lx\n",
                               vma->rb_subtree_gap,
                               vma_compute_subtree_gap(vma));
                        bug = 1;
                }
                spin_unlock(&mm->page_table_lock);
                i++;
                /* Remember this node and its bounds for the next iteration. */
                pn = nd;
                prev = vma->vm_start;
                pend = vma->vm_end;
        }
        /* Count again walking backwards from the last node seen. */
        j = 0;
        for (nd = pn; nd; nd = rb_prev(nd))
                j++;
        if (i != j) {
                pr_emerg("backwards %d, forwards %d\n", j, i);
                bug = 1;
        }
        return bug ? -1 : i;
}

/*
 * Check every vma in @root, in rb_first()/rb_next() order, and trigger
 * VM_BUG_ON_VMA() for any vma other than @ignore whose cached
 * rb_subtree_gap does not match a fresh computation.  @ignore may be NULL,
 * in which case no vma is exempt.
 */
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
        struct rb_node *node = rb_first(root);

        while (node) {
                struct vm_area_struct *cur;

                cur = rb_entry(node, struct vm_area_struct, vm_rb);
                VM_BUG_ON_VMA(cur != ignore &&
                        cur->rb_subtree_gap != vma_compute_subtree_gap(cur),
                        cur);
                node = rb_next(node);
        }
}

/*
 * Debug consistency check of @mm's vma bookkeeping.
 *
 * Walks the mm->mmap list through vm_next and, for each vma that has an
 * anon_vma, verifies every entry of its anon_vma_chain with
 * anon_vma_interval_tree_verify() under the anon_vma read lock.  It then
 * checks that
 *   - the number of list entries equals mm->map_count,
 *   - vm_end_gap() of the last list entry equals mm->highest_vm_end,
 *   - browse_rb() reports the same count for the rbtree.
 * Any mismatch is logged with pr_emerg() and ends in VM_BUG_ON_MM().
 */
static void validate_mm(struct mm_struct *mm)
{
        int bug = 0;
        int i = 0;
        unsigned long highest_address = 0;
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;

                if (anon_vma) {
                        anon_vma_lock_read(anon_vma);
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                anon_vma_interval_tree_verify(avc);
                        anon_vma_unlock_read(anon_vma);
                }

                /* Overwritten each pass, so it ends up as the last vma's value. */
                highest_address = vm_end_gap(vma);
                vma = vma->vm_next;
                i++;
        }
        if (i != mm->map_count) {
                pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
                bug = 1;
        }
        if (highest_address != mm->highest_vm_end) {
                pr_emerg("mm->highest_vm_end %lx, found %lx\n",
                          mm->highest_vm_end, highest_address);
                bug = 1;
        }
        i = browse_rb(mm);
        if (i != mm->map_count) {
                /* -1 means browse_rb() already reported the problem itself. */
                if (i != -1)
                        pr_emerg("map_count %d rb %d\n", mm->map_count, i);
                bug = 1;
        }
        VM_BUG_ON_MM(bug, mm);
}
#else
/* Without CONFIG_DEBUG_VM_RB both validators compile to empty statements. */
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif

/*
 * Instantiate the static augmented-rbtree callbacks "vma_gap_callbacks"
 * for struct vm_area_struct nodes linked through vm_rb.  The augmented
 * field is the unsigned long rb_subtree_gap and the per-node value comes
 * from vma_compute_gap().
 */
RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
                         struct vm_area_struct, vm_rb,
                         unsigned long, rb_subtree_gap, vma_compute_gap)

/*
 * Refresh the augmented rb_subtree_gap values after vma->vm_start or
 * vma->vm_prev->vm_end changed.  The vma keeps its position in the
 * rbtree; only the cached gap data is propagated.
 */
static void vma_gap_update(struct vm_area_struct *vma)
{
        struct rb_node *node = &vma->vm_rb;

        /*
         * The propagate callback generated by RB_DECLARE_CALLBACKS_MAX()
         * already does exactly what is needed here; a NULL stop node is
         * passed to it.
         */
        vma_gap_callbacks_propagate(node, NULL);
}

/*
 * Insert @vma's vm_rb node into @root using the vma_gap_callbacks
 * augmentation.  The node's link must already have been set up by the
 * caller; this only performs the augmented insert.
 */
static inline void vma_rb_insert(struct vm_area_struct *vma,
                                 struct rb_root *root)
{
        struct rb_node *node = &vma->vm_rb;

        /* Every rb_subtree_gap in the tree has to be consistent beforehand. */
        validate_mm_rb(root, NULL);

        rb_insert_augmented(node, root, &vma_gap_callbacks);
}

/*
 * Erase @vma's vm_rb node from @root with the vma_gap_callbacks
 * augmentation.  No validation is done here.
 */
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
        /*
         * Note rb_erase_augmented is a fairly large inline function,
         * so make sure we instantiate it only once with our desired
         * augmented rbtree callbacks.
         */
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

/*
 * Validate the tree, exempting @ignore from the rb_subtree_gap check,
 * and then erase @vma from @root.
 *
 * @vma:    vma whose node is removed from the tree
 * @root:   rbtree the vma is currently linked into
 * @ignore: vma allowed to carry a stale rb_subtree_gap during validation
 */
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
                                                struct rb_root *root,
                                                struct vm_area_struct *ignore)
{
        /*
         * All rb_subtree_gap values must be consistent prior to erase,
         * with the possible exception of
         *
         * a. the "next" vma being erased if next->vm_start was reduced in
         *    __vma_adjust() -> __vma_unlink()
         * b. the vma being erased in detach_vmas_to_be_unmapped() ->
         *    vma_rb_erase()
         */
        validate_mm_rb(root, ignore);

        __vma_rb_erase(vma, root);
}

/*
 * Erase @vma from @root, passing @vma itself as the vma the pre-erase
 * validation is allowed to ignore.
 */
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
                                         struct rb_root *root)
{
        struct vm_area_struct *ignore = vma;

        vma_rb_erase_ignore(vma, root, ignore);
}

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

/*
 * Walk mm->mm_rb looking for the place where a vma covering [addr, end)
 * would be linked.
 *
 * On success returns 0 and stores:
 *   *pprev     - the last vma visited that ends at or below @addr, or NULL
 *   *rb_link   - the empty child slot where the new node belongs
 *   *rb_parent - the node owning that slot (NULL for an empty tree)
 *
 * Returns -ENOMEM, leaving the output pointers untouched, if a vma met on
 * the way down overlaps [addr, end).
 */
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
                unsigned long end, struct vm_area_struct **pprev,
                struct rb_node ***rb_link, struct rb_node **rb_parent)
{
        struct rb_node **slot = &mm->mm_rb.rb_node;
        struct rb_node *parent = NULL;
        struct rb_node *pred = NULL;

        while (*slot) {
                struct vm_area_struct *cur;

                parent = *slot;
                cur = rb_entry(parent, struct vm_area_struct, vm_rb);

                if (cur->vm_end <= addr) {
                        /* cur ends at or below addr: remember it, go right */
                        pred = parent;
                        slot = &parent->rb_right;
                        continue;
                }

                /* Fail if an existing vma overlaps the area */
                if (cur->vm_start < end)
                        return -ENOMEM;
                slot = &parent->rb_left;
        }

        *pprev = pred ? rb_entry(pred, struct vm_area_struct, vm_rb) : NULL;
        *rb_link = slot;
        *rb_parent = parent;
        return 0;
}

/*
 * vma_next() - Get the VMA following a given one.
 * @mm: The mm_struct.
 * @vma: The current vma, or NULL.
 *
 * A NULL @vma selects the first vma of @mm (mm->mmap).
 *
 * Returns: @vma->vm_next, or mm->mmap when @vma is NULL.
 */
static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
                                         struct vm_area_struct *vma)
{
        return vma ? vma->vm_next : mm->mmap;
}

/*
 * munmap_vma_range() - munmap VMAs that overlap a range.
 * @mm: The mm struct
 * @start: The start of the range.
 * @len: The length of the range.
 * @pprev: pointer to the pointer that will be set to previous vm_area_struct
 * @link: set to the rb_node link slot found by find_vma_links()
 * @parent: set to the parent rb_node found by find_vma_links()
 * @uf: list head handed through to do_munmap()
 *
 * Keep unmapping [@start, @start + @len) for as long as find_vma_links()
 * reports an overlapping vm_area_struct.  Once it succeeds, @pprev, @link
 * and @parent hold its results.
 *
 * Returns: -ENOMEM on munmap failure or 0 on success.
 */
static inline int
munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
                 struct vm_area_struct **pprev, struct rb_node ***link,
                 struct rb_node **parent, struct list_head *uf)
{
        const unsigned long end = start + len;

        while (find_vma_links(mm, start, end, pprev, link, parent)) {
                /* any do_munmap() failure is reported as -ENOMEM */
                if (do_munmap(mm, start, len, uf))
                        return -ENOMEM;
        }

        return 0;
}
/*
 * Count the pages of [addr, end) that are covered by existing vmas of @mm.
 * Returns 0 when no vma intersects the range.
 */
static unsigned long count_vma_pages_range(struct mm_struct *mm,
                unsigned long addr, unsigned long end)
{
        struct vm_area_struct *vma = find_vma_intersection(mm, addr, end);
        unsigned long nr_pages;

        /* No overlapping mapping at all */
        if (!vma)
                return 0;

        /* The first overlap may begin in the middle of its vma */
        nr_pages = (min(end, vma->vm_end) -
                max(addr, vma->vm_start)) >> PAGE_SHIFT;

        /* Every later vma is counted from its own start, clipped at end */
        for (vma = vma->vm_next; vma && vma->vm_start <= end;
             vma = vma->vm_next)
                nr_pages += (min(end, vma->vm_end) - vma->vm_start) >>
                        PAGE_SHIFT;

        return nr_pages;
}

/*
 * Link @vma into mm->mm_rb at the slot described by @rb_link / @rb_parent
 * and bring the gap tracking up to date.  vma->vm_next is read here, so the
 * vma is expected to have been put on the mm's vma list already.
 */
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
{
        struct vm_area_struct *succ = vma->vm_next;

        /* Update tracking information for the gap following the new vma. */
        if (!succ)
                mm->highest_vm_end = vm_end_gap(vma);
        else
                vma_gap_update(succ);

        /*
         * While descending the rbtree to find the insertion point,
         * vma->vm_prev was not known yet, so the rb_subtree_gap values of
         * the ancestors could not be refreshed on the way down.  Hence:
         * link the node with a zero rb_subtree_gap (consistent with the
         * descent), fix the gap up right away, and only rebalance once all
         * augmented values are in place.
         */
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
        vma->rb_subtree_gap = 0;
        vma_gap_update(vma);
        vma_rb_insert(vma, &mm->mm_rb);
}

/*
 * For a file-backed @vma, insert it into the file mapping's i_mmap interval
 * tree, adjusting write-access accounting first.  Does nothing for a vma
 * without vm_file.
 */
static void __vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;

        if (vma->vm_flags & VM_DENYWRITE)
                put_write_access(file_inode(file));
        if (vma_is_shared_maywrite(vma))
                mapping_allow_writable(mapping);

        /* the tree insertion is bracketed by the flush_dcache mmap lock */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_insert(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Put @vma on the mm's vma list (after @prev) and then into the mm's
 * rbtree.  The list comes first: __vma_link_rb() reads vma->vm_next.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, struct rb_node **rb_link,
        struct rb_node *rb_parent)
{
        __vma_link_list(mm, vma, prev);

        __vma_link_rb(mm, vma, rb_link, rb_parent);
}

static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct vm_area_struct *prev, struct rb_node **rb_link,
                        struct rb_node *rb_parent)
{
        struct address_space *mapping = NULL;

        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
                i_mmap_lock_write(mapping);
        }

        __vma_link(mm, vma, prev, rb_link, rb_parent);
        __vma_link_file(vma);

        if (mapping)
                i_mmap_unlock_write(mapping);

        mm->map_count++;
        validate_mm(mm);
}

/*
 * Helper for vma_adjust() in the split_vma insert case: link a vma into the
 * mm's list and rbtree.  The interval tree insertion has been done already.
 * An overlap with an existing vma is treated as a fatal bug.
 */
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct rb_node **rb_link, *rb_parent;
        struct vm_area_struct *prev;
        int overlap;

        overlap = find_vma_links(mm, vma->vm_start, vma->vm_end,
                                 &prev, &rb_link, &rb_parent);
        if (overlap)
                BUG();

        __vma_link(mm, vma, prev, rb_link, rb_parent);
        mm->map_count++;
}

/*
 * Take @vma out of the mm's rbtree and vma list.  @ignore is forwarded to
 * the rbtree validation done before the erase.
 */
static __always_inline void __vma_unlink(struct mm_struct *mm,
                                                struct vm_area_struct *vma,
                                                struct vm_area_struct *ignore)
{
        struct rb_root *root = &mm->mm_rb;

        vma_rb_erase_ignore(vma, root, ignore);
        __vma_unlink_list(mm, vma);

        /* Kill the cache */
        vmacache_invalidate(mm);
}

/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * @vma:    vma to adjust (in the remove_next == 3 case it is swapped with
 *          its successor, and the successor is the one that gets adjusted)
 * @start:  new vm_start for the adjusted vma
 * @end:    new vm_end for the adjusted vma; when two following vmas are
 *          swallowed it is first trimmed to next->vm_end and a second
 *          pass finishes the job
 * @pgoff:  new vm_pgoff for the adjusted vma
 * @insert: optional vma to link into the mm (and, if @vma is file-backed,
 *          into the file's interval tree) while the locks are held
 * @expand: the vma expected to grow; used to detect case 8 (next == expand)
 *          and otherwise only in VM_WARN_ON() sanity checks
 *
 * Returns 0 on success, or the error returned by anon_vma_clone().
 */
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
        struct vm_area_struct *expand)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
        struct address_space *mapping = NULL;
        struct rb_root_cached *root = NULL;
        struct anon_vma *anon_vma = NULL;
        struct file *file = vma->vm_file;
        bool start_changed = false, end_changed = false;
        /*
         * Signed byte delta applied to next->vm_start: positive when vma
         * grows into next (case 5), negative when vma shrinks and next
         * grows down (case 4), zero when next's start is left alone.
         */
        long adjust_next = 0;
        /*
         * 0: nothing is removed; 1: remove next; 2: remove next and the vma
         * after it (two passes via "again"); 3: vma and next were swapped
         * and the original vma is the one being removed.
         */
        int remove_next = 0;

        if (next && !insert) {
                struct vm_area_struct *exporter = NULL, *importer = NULL;

                if (end >= next->vm_end) {
                        /*
                         * vma expands, overlapping all the next, and
                         * perhaps the one after too (mprotect case 6).
                         * The only other cases that gets here are
                         * case 1, case 7 and case 8.
                         */
                        if (next == expand) {
                                /*
                                 * The only case where we don't expand "vma"
                                 * and we expand "next" instead is case 8.
                                 */
                                VM_WARN_ON(end != next->vm_end);
                                /*
                                 * remove_next == 3 means we're
                                 * removing "vma" and that to do so we
                                 * swapped "vma" and "next".
                                 */
                                remove_next = 3;
                                VM_WARN_ON(file != next->vm_file);
                                swap(vma, next);
                        } else {
                                VM_WARN_ON(expand != vma);
                                /*
                                 * case 1, 6, 7, remove_next == 2 is case 6,
                                 * remove_next == 1 is case 1 or 7.
                                 */
                                remove_next = 1 + (end > next->vm_end);
                                VM_WARN_ON(remove_next == 2 &&
                                           end != next->vm_next->vm_end);
                                /* trim end to next, for case 6 first pass */
                                end = next->vm_end;
                        }

                        exporter = next;
                        importer = vma;

                        /*
                         * If next doesn't have anon_vma, import from vma after
                         * next, if the vma overlaps with it.
                         */
                        if (remove_next == 2 && !next->anon_vma)
                                exporter = next->vm_next;

                } else if (end > next->vm_start) {
                        /*
                         * vma expands, overlapping part of the next:
                         * mprotect case 5 shifting the boundary up.
                         */
                        adjust_next = (end - next->vm_start);
                        exporter = next;
                        importer = vma;
                        VM_WARN_ON(expand != importer);
                } else if (end < vma->vm_end) {
                        /*
                         * vma shrinks, and !insert tells it's not
                         * split_vma inserting another: so it must be
                         * mprotect case 4 shifting the boundary down.
                         */
                        adjust_next = -(vma->vm_end - end);
                        exporter = vma;
                        importer = next;
                        VM_WARN_ON(expand != importer);
                }

                /*
                 * Easily overlooked: when mprotect shifts the boundary,
                 * make sure the expanding vma has anon_vma set if the
                 * shrinking vma had, to cover any anon pages imported.
                 */
                if (exporter && exporter->anon_vma && !importer->anon_vma) {
                        int error;

                        importer->anon_vma = exporter->anon_vma;
                        error = anon_vma_clone(importer, exporter);
                        /* Nothing has been locked or modified in the trees yet. */
                        if (error)
                                return error;
                }
        }
        /*
         * Re-entered once, with remove_next lowered to 1 and "end" advanced,
         * to swallow the second following vma in case 6.
         */
again:
        /*
         * Get rid of huge pages and shared page tables straddling the split
         * boundary.
         */
        vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
        if (is_vm_hugetlb_page(orig_vma)) {
                hugetlb_split(orig_vma, start);
                hugetlb_split(orig_vma, end);
        }

        /*
         * File-backed: drop uprobes over the ranges that are about to change
         * and take the i_mmap write lock before touching the interval tree.
         */
        if (file) {
                mapping = file->f_mapping;
                root = &mapping->i_mmap;
                uprobe_munmap(vma, vma->vm_start, vma->vm_end);

                if (adjust_next)
                        uprobe_munmap(next, next->vm_start, next->vm_end);

                i_mmap_lock_write(mapping);
                if (insert) {
                        /*
                         * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
                         */
                        __vma_link_file(insert);
                }
        }

        /*
         * Lock whichever anon_vma is involved (vma's, else next's when next
         * is being adjusted) and take the affected vmas off the anon_vma
         * interval trees for the duration of the field update.
         */
        anon_vma = vma->anon_vma;
        if (!anon_vma && adjust_next)
                anon_vma = next->anon_vma;
        if (anon_vma) {
                VM_WARN_ON(adjust_next && next->anon_vma &&
                           anon_vma != next->anon_vma);
                anon_vma_lock_write(anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
                if (adjust_next)
                        anon_vma_interval_tree_pre_update_vma(next);
        }

        /* Likewise remove the vmas from the file's interval tree. */
        if (file) {
                flush_dcache_mmap_lock(mapping);
                vma_interval_tree_remove(vma, root);
                if (adjust_next)
                        vma_interval_tree_remove(next, root);
        }

        /*
         * The actual update.  start_changed/end_changed record which
         * boundaries moved so that only the affected gaps are refreshed
         * further down.
         */
        if (start != vma->vm_start) {
                vma->vm_start = start;
                start_changed = true;
        }
        if (end != vma->vm_end) {
                vma->vm_end = end;
                end_changed = true;
        }
        vma->vm_pgoff = pgoff;
        if (adjust_next) {
                next->vm_start += adjust_next;
                next->vm_pgoff += adjust_next >> PAGE_SHIFT;
        }

        /* Re-insert into the file's interval tree with the new extents. */
        if (file) {
                if (adjust_next)
                        vma_interval_tree_insert(next, root);
                vma_interval_tree_insert(vma, root);
                flush_dcache_mmap_unlock(mapping);
        }

        if (remove_next) {
                /*
                 * vma_merge has merged next into vma, and needs
                 * us to remove next before dropping the locks.
                 */
                if (remove_next != 3)
                        __vma_unlink(mm, next, next);
                else
                        /*
                         * vma is not before next if they've been
                         * swapped.
                         *
                         * pre-swap() next->vm_start was reduced so
                         * tell validate_mm_rb to ignore pre-swap()
                         * "next" (which is stored in post-swap()
                         * "vma").
                         */
                        __vma_unlink(mm, next, vma);
                if (file)
                        __remove_shared_vm_struct(next, file, mapping);
        } else if (insert) {
                /*
                 * split_vma has split insert from vma, and needs
                 * us to insert it before dropping the locks
                 * (it may either follow vma or precede it).
                 */
                __insert_vm_struct(mm, insert);
        } else {
                /*
                 * Plain boundary move: refresh only the gaps (or
                 * highest_vm_end) affected by the boundaries that changed.
                 */
                if (start_changed)
                        vma_gap_update(vma);
                if (end_changed) {
                        if (!next)
                                mm->highest_vm_end = vm_end_gap(vma);
                        else if (!adjust_next)
                                vma_gap_update(next);
                }
        }

        /* Put the vmas back on the anon_vma interval trees and unlock. */
        if (anon_vma) {
                anon_vma_interval_tree_post_update_vma(vma);
                if (adjust_next)
                        anon_vma_interval_tree_post_update_vma(next);
                anon_vma_unlock_write(anon_vma);
        }

        /* Drop the i_mmap lock and re-register uprobes for the new extents. */
        if (file) {
                i_mmap_unlock_write(mapping);
                uprobe_mmap(vma);

                if (adjust_next)
                        uprobe_mmap(next);
        }

        /*
         * Tear down the vma that was unlinked above: release its file
         * reference, merge its anon_vma state into vma, drop its mempolicy
         * reference and free it.
         */
        if (remove_next) {
                if (file) {
                        uprobe_munmap(next, next->vm_start, next->vm_end);
                        fput(file);
                }
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
                mm->map_count--;
                mpol_put(vma_policy(next));
                vm_area_free(next);
                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
                if (remove_next != 3) {
                        /*
                         * If "next" was removed and vma->vm_end was
                         * expanded (up) over it, in turn
                         * "next->vm_prev->vm_end" changed and the
                         * "vma->vm_next" gap must be updated.
                         */
                        next = vma->vm_next;
                } else {
                        /*
                         * For the scope of the comment "next" and
                         * "vma" considered pre-swap(): if "vma" was
                         * removed, next->vm_start was expanded (down)
                         * over it and the "next" gap must be updated.
                         * Because of the swap() the post-swap() "vma"
                         * actually points to pre-swap() "next"
                         * (post-swap() "next" as opposed is now a
                         * dangling pointer).
                         */
                        next = vma;
                }
                if (remove_next == 2) {
                        /* Second pass: now swallow the vma after the freed one. */
                        remove_next = 1;
                        end = next->vm_end;
                        goto again;
                }
                else if (next)
                        vma_gap_update(next);
                else {
                        /*
                         * If remove_next == 2 we obviously can't
                         * reach this path.
                         *
                         * If remove_next == 3 we can't reach this
                         * path because pre-swap() next is always not
                         * NULL. pre-swap() "next" is not being
                         * removed and its next->vm_end is not altered
                         * (and furthermore "end" already matches
                         * next->vm_end in remove_next == 3).
                         *
                         * We reach this only in the remove_next == 1
                         * case if the "next" vma that was removed was
                         * the highest vma of the mm. However in such
                         * case next->vm_end == "end" and the extended
                         * "vma" has vma->vm_end == next->vm_end so
                         * mm->highest_vm_end doesn't need any update
                         * in remove_next == 1 case.
                         */
                        VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
                }
        }
        /* The inserted vma gets its uprobes only after all locks are dropped. */
        if (insert && file)
                uprobe_mmap(insert);

        validate_mm(mm);

        return 0;
}

/*
 * Basic mergeability test of @vma against a (file, vm_flags, userfaultfd
 * context) triple.  Returns 1 if nothing here forbids a merge, 0 otherwise.
 *
 * A vma with a ->close operation is never merged: the driver probably needs
 * to release per-vma resources.
 */
static inline int is_mergeable_vma(struct vm_area_struct *vma,
                                struct file *file, unsigned long vm_flags,
                                struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        /*
         * VM_SOFTDIRTY is masked out of the flag comparison: it should not
         * prevent merging, and the caller is expected to mark the merged
         * VMA as dirty.  Including the dirty bit would put extra pressure
         * on the memory system by forcing new VMAs to be created where an
         * existing one could have been extended.
         */
        const unsigned long flag_diff =
                (vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY;

        if (flag_diff || vma->vm_file != file)
                return 0;
        if (vma->vm_ops && vma->vm_ops->close)
                return 0;

        return is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx) ? 1 : 0;
}

/*
 * Decide whether two anon_vma pointers are compatible for merging.
 * Returns non-zero when they are.
 */
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
                                        struct anon_vma *anon_vma2,
                                        struct vm_area_struct *vma)
{
        /* Both assigned: they simply have to be the same anon_vma. */
        if (anon_vma1 && anon_vma2)
                return anon_vma1 == anon_vma2;

        /*
         * At least one side is unassigned.  The list_is_singular() test
         * avoids merging a VMA cloned from parents, which can improve
         * scalability caused by anon_vma lock.
         */
        if (!vma || list_is_singular(&vma->anon_vma_chain))
                return 1;

        return anon_vma1 == anon_vma2;
}

/*
 * Return true if this (vm_flags,anon_vma,file,vm_pgoff) can be merged
 * in front of the vma, i.e. at a lower virtual address and file offset.
 *
 * Two vmas with differently assigned (non-NULL) anon_vmas cannot be merged,
 * and neither can two sharing an anon_vma but with incompatible offsets.
 *
 * The merged mmap wrapping around the end of pagecache indices (16TB on
 * ia32) is not checked for here: do_mmap() permits neither mmaps which
 * wrap nor mmaps which cover the final page at index -1UL.
 */
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                     struct anon_vma *anon_vma, struct file *file,
                     pgoff_t vm_pgoff,
                     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx))
                return 0;
        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return 0;

        /* the request must end exactly where the vma's file offset begins */
        return vma->vm_pgoff == vm_pgoff;
}

/*
 * Return true if this (vm_flags,anon_vma,file,vm_pgoff) can be merged
 * beyond the vma, i.e. at a higher virtual address and file offset.
 *
 * Two vmas with differently assigned (non-NULL) anon_vmas cannot be merged,
 * and neither can two sharing an anon_vma but with incompatible offsets.
 */
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                    struct anon_vma *anon_vma, struct file *file,
                    pgoff_t vm_pgoff,
                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        pgoff_t end_pgoff;

        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx))
                return 0;
        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return 0;

        /* file offset one past the last page of the vma */
        end_pgoff = vma->vm_pgoff + vma_pages(vma);
        return end_pgoff == vm_pgoff;
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                   AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPNNNNNN
 *    cannot merge    might become       might become
 *                    PPNNNNNNNNNN       PPPPPPPPPPNN
 *    mmap, brk or    case 4 below       case 5 below
 *    mremap move:
 *                        AAAA               AAAA
 *                    PPPP    NNNN       PPPPNNNNXXXX
 *                    might become       might become
 *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
 *                    PPPPPPPPNNNN 2 or  PPPPPPPPXXXX 7 or
 *                    PPPPNNNNNNNN 3     PPPPXXXXXXXX 8
 *
 * It is important for case 8 that the vma NNNN overlapping the
 * region AAAA is never going to be extended over XXXX. Instead XXXX must
 * be extended in region AAAA and NNNN must be removed. This way in
 * all cases where vma_merge succeeds, the moment vma_adjust drops the
 * rmap_locks, the properties of the merged vma will be already
 * correct for the whole merged range. Some of those properties like
 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
 * be correct for the whole merged range immediately after the
 * rmap_locks are released. Otherwise if XXXX would be removed and
 * NNNN would be extended over the XXXX range, remove_migration_ptes
 * or other rmap walkers (if working on addresses beyond the "end"
 * parameter) may establish ptes with the wrong permissions of NNNN
 * instead of the right permissions of XXXX.
 *
 * @mm:    mm whose vma list is consulted
 * @prev:  vma preceding the request, or NULL (the first vma of @mm is then
 *         taken as the one following it)
 * @addr:  start of the request
 * @end:   end of the request (exclusive)
 * @vm_flags, @anon_vma, @file, @pgoff, @policy, @vm_userfaultfd_ctx:
 *         properties of the request that the neighbouring vmas are compared
 *         against
 *
 * Returns the vma that now covers the merged range, or NULL if no merge was
 * possible or __vma_adjust() failed.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        /* Length of the request in pages. */
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
        int err;

        /*
         * We later require that vma->vm_flags == vm_flags,
         * so this tests vma->vm_flags & VM_SPECIAL, too.
         */
        if (vm_flags & VM_SPECIAL)
                return NULL;

        /*
         * "area" is the vma right after prev.  If it ends exactly at "end",
         * the merge candidate on the high side is the vma after it.
         */
        next = vma_next(mm, prev);
        area = next;
        if (area && area->vm_end == end)                /* cases 6, 7, 8 */
                next = next->vm_next;

        /* verify some invariants that must be enforced by the caller */
        VM_WARN_ON(prev && addr <= prev->vm_start);
        VM_WARN_ON(area && end > area->vm_end);
        VM_WARN_ON(addr >= end);

        /*
         * Can it merge with the predecessor?
         */
        if (prev && prev->vm_end == addr &&
                        mpol_equal(vma_policy(prev), policy) &&
                        can_vma_merge_after(prev, vm_flags,
                                            anon_vma, file, pgoff,
                                            vm_userfaultfd_ctx)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
                if (next && end == next->vm_start &&
                                mpol_equal(policy, vma_policy(next)) &&
                                can_vma_merge_before(next, vm_flags,
                                                     anon_vma, file,
                                                     pgoff+pglen,
                                                     vm_userfaultfd_ctx) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
                        /* prev is extended all the way to next->vm_end */
                        err = __vma_adjust(prev, prev->vm_start,
                                         next->vm_end, prev->vm_pgoff, NULL,
                                         prev);
                } else                                        /* cases 2, 5, 7 */
                        /* prev is extended up to "end" only */
                        err = __vma_adjust(prev, prev->vm_start,
                                         end, prev->vm_pgoff, NULL, prev);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
                return prev;
        }

        /*
         * Can this new request be merged in front of next?
         */
        if (next && end == next->vm_start &&
                        mpol_equal(policy, vma_policy(next)) &&
                        can_vma_merge_before(next, vm_flags,
                                             anon_vma, file, pgoff+pglen,
                                             vm_userfaultfd_ctx)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        /* prev is shrunk to end at addr; next is the one expanding */
                        err = __vma_adjust(prev, prev->vm_start,
                                         addr, prev->vm_pgoff, NULL, next);
                else {                                        /* cases 3, 8 */
                        err = __vma_adjust(area, addr, next->vm_end,
                                         next->vm_pgoff - pglen, NULL, next);
                        /*
                         * In case 3 area is already equal to next and
                         * this is a noop, but in case 8 "area" has
                         * been removed and next was expanded over it.
                         */
                        area = next;
                }
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
                return area;
        }

        /* Neither neighbour is mergeable. */
        return NULL;
}

/*
 * Quick, rough check of whether it is even worth considering sharing an
 * anon_vma between @a and @b (given in address order).
 *
 * The vmas have to be adjacent, use equal memory policies and the same
 * vm_file, differ in their flags only in bits that mprotect may change,
 * and have linearly matching file offsets.
 *
 * NOTE! Sharing an anon_vma doesn't _have_ to mean that the two vmas can
 * be merged.  For example, a vma with a vm_ops->close() function is never
 * merged, because that indicates the driver is doing some kind of
 * reference counting.  That doesn't really matter for anon_vma sharing.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        if (a->vm_end != b->vm_start)
                return 0;
        if (!mpol_equal(vma_policy(a), vma_policy(b)))
                return 0;
        if (a->vm_file != b->vm_file)
                return 0;
        if ((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY))
                return 0;

        return b->vm_pgoff ==
                a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mm_sem held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mm_sem.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
        struct anon_vma *candidate;

        /* Nothing to share unless the two neighbours are compatible. */
        if (!anon_vma_compatible(a, b))
                return NULL;

        /* Load the pointer exactly once so it is never re-read below. */
        candidate = READ_ONCE(old->anon_vma);
        if (!candidate)
                return NULL;

        /* Only hand out the anon_vma when 'old' has a single chain entry. */
        return list_is_singular(&old->anon_vma_chain) ? candidate : NULL;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct anon_vma *found;

        /* The following neighbour gets the first try. */
        if (vma->vm_next) {
                found = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
                if (found)
                        return found;
        }

        /* Otherwise fall back to the preceding neighbour. */
        if (vma->vm_prev)
                return reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);

        /*
         * Neither neighbour offered a reusable anon_vma, so report NULL.
         * There's no absolute need to look only at touching neighbours:
         * we could search further afield for "compatible" anon_vmas.
         * But it would probably just be a waste of time searching,
         * or lead to too many vmas hanging off the same anon_vma.
         * We're trying to allow mprotect remerging later on,
         * not trying to minimize memory used for anon_vmas.
         */
        return NULL;
}

/*
 * Round the hint down to a page boundary. If the result is non-zero but
 * below mmap_min_addr, raise it to the lowest page-aligned address that
 * is not below mmap_min_addr.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        unsigned long aligned = hint & PAGE_MASK;

        /*
         * A hint that rounds down to zero, or one that already sits at or
         * above mmap_min_addr, is returned as rounded.
         */
        if (aligned == 0 || aligned >= mmap_min_addr)
                return aligned;

        return PAGE_ALIGN(mmap_min_addr);
}

/*
 * Check whether adding @len bytes of VM_LOCKED memory to @mm would exceed
 * RLIMIT_MEMLOCK. Returns 0 when allowed, -EAGAIN otherwise.
 */
static inline int mlock_future_check(struct mm_struct *mm,
                                     unsigned long flags,
                                     unsigned long len)
{
        unsigned long wanted, limit;

        /* Mappings that will not be locked never count against the limit. */
        if (!(flags & VM_LOCKED))
                return 0;

        /* Both sides of the comparison are in pages. */
        wanted = (len >> PAGE_SHIFT) + mm->locked_vm;
        limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (wanted <= limit)
                return 0;

        /* Over the limit: only CAP_IPC_LOCK lets the request through. */
        return capable(CAP_IPC_LOCK) ? 0 : -EAGAIN;
}

/*
 * Upper bound, in bytes, on the file range that may be mapped. A return
 * value of 0 is the special "no limit check" case used by file_mmap_ok().
 */
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        /* Regular files, block devices and sockets all get the LFS limit. */
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /*
         * Files flagged FMODE_UNSIGNED_OFFSET ("we do even unsigned file
         * positions") report 0. Everything else is capped at ULONG_MAX:
         * random drivers might want more, but buggy drivers are the
         * bigger concern.
         */
        return (file->f_mode & FMODE_UNSIGNED_OFFSET) ? 0 : ULONG_MAX;
}

/*
 * Return true when a mapping of @len bytes starting at page offset @pgoff
 * stays within the size limit reported by file_mmap_size_max().
 */
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        u64 limit = file_mmap_size_max(file, inode);

        /* A zero limit skips the length check entirely. */
        if (limit != 0 && len > limit)
                return false;

        /*
         * Remaining room after the mapping itself; with a zero limit this
         * deliberately wraps around in u64 arithmetic.
         */
        limit -= len;

        return pgoff <= (limit >> PAGE_SHIFT);
}

/*
 * do_mmap - validate an mmap request and hand it over to mmap_region()
 *
 * @file:     file to map; the code takes the "no file" branch when NULL
 * @addr:     address hint, or the exact address when MAP_FIXED is set
 * @len:      requested length in bytes; rounded up to a page multiple here
 * @prot:     PROT_* protection bits
 * @flags:    MAP_* flags
 * @pgoff:    offset in pages; overwritten for mappings without a file
 * @populate: output; set to the (page aligned) length when the new range is
 *            VM_LOCKED or MAP_POPULATE was given without MAP_NONBLOCK,
 *            otherwise left at 0
 * @uf:       list head passed through unchanged to mmap_region()
 *
 * Returns the address chosen by mmap_region(), or a negative errno value
 * (test with IS_ERR_VALUE()).
 *
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, unsigned long pgoff,
                        unsigned long *populate, struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        vm_flags_t vm_flags;
        int pkey = 0;

        *populate = 0;

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        /* Without MAP_FIXED the address is only a hint and may be adjusted. */
        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows: PAGE_ALIGN() of a huge len wraps to 0. */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = get_unmapped_area(file, addr, len, pgoff, flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /*
         * MAP_FIXED_NOREPLACE: refuse the request if the first vma found by
         * find_vma() for addr starts before the end of the requested range.
         */
        if (flags & MAP_FIXED_NOREPLACE) {
                struct vm_area_struct *vma = find_vma(mm, addr);

                if (vma && vma->vm_start < addr + len)
                        return -EEXIST;
        }

        /*
         * Exactly PROT_EXEC: ask for an execute-only pkey, falling back to
         * pkey 0 when execute_only_pkey() reports an error.
         */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (mlock_future_check(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                struct inode *inode = file_inode(file);
                unsigned int seals = memfd_file_seals(file);
                unsigned long flags_mask;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                /* Flags this file accepts with MAP_SHARED_VALIDATE. */
                flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

                /*
                 * The cases below fall through on purpose: MAP_SHARED runs
                 * the MAP_SHARED_VALIDATE checks, and both shared variants
                 * then run the checks common with MAP_PRIVATE.
                 */
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure there are no mandatory locks on the file.
                         */
                        if (locks_verify_locked(file))
                                return -EAGAIN;

                        /*
                         * A file not opened for writing loses VM_SHARED and
                         * VM_MAYWRITE but keeps VM_MAYSHARE. A mapping that
                         * is_readonly_sealed() accepts for the file's seals
                         * keeps VM_SHARED and only loses VM_MAYWRITE.
                         */
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        else if (is_readonly_sealed(seals, vm_flags))
                                vm_flags &= ~VM_MAYWRITE;
                        fallthrough;
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        /* noexec path: reject VM_EXEC and drop VM_MAYEXEC. */
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op->mmap)
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        /*
         * On success, report how much to populate when the range is locked
         * or MAP_POPULATE was requested without MAP_NONBLOCK.
         */
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

/*
 * Resolve @fd (or set up an anonymous hugetlb file) and forward the request
 * to vm_mmap_pgoff(). Any file reference obtained here is dropped again
 * before returning.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (flags & MAP_ANONYMOUS) {
                if (flags & MAP_HUGETLB) {
                        /* Value handed to both hstate_sizelog() and the setup call. */
                        unsigned long size_log =
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
                        struct user_struct *user = NULL;
                        struct hstate *hs = hstate_sizelog(size_log);

                        if (!hs)
                                return -EINVAL;

                        len = ALIGN(len, huge_page_size(hs));
                        /*
                         * VM_NORESERVE is used because the reservations
                         * will be taken when vm_ops->mmap() is called.
                         * A dummy user value is used because we are not
                         * locking memory so no accounting is necessary.
                         */
                        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                                  VM_NORESERVE, &user,
                                                  HUGETLB_ANONHUGE_INODE,
                                                  size_log);
                        if (IS_ERR(file))
                                return PTR_ERR(file);
                }
        } else {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;

                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        /* MAP_HUGETLB on a file that is not hugetlb backed. */
                        fput(file);
                        return -EINVAL;
                }
        }

        /* These two flags are stripped before the request goes further. */
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        if (file)
                fput(file);
        return retval;
}

/*
 * mmap_pgoff system call entry point: passes its six arguments unchanged
 * to ksys_mmap_pgoff() and returns that function's result.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block that old_mmap copies in from user space in one piece. */
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;   /* in bytes; converted to pages below */
};

/*
 * old_mmap system call: takes a single user pointer to the argument block
 * and forwards the unpacked values to ksys_mmap_pgoff().
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;

        if (copy_from_user(&args, arg, sizeof(args)) != 0)
                return -EFAULT;

        /* The byte offset has to sit on a page boundary. */
        if (offset_in_page(args.offset) != 0)
                return -EINVAL;

        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, args.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        const struct vm_operations_struct *ops = vma->vm_ops;
        vm_flags_t flags = vma->vm_flags;

        /* Private or non-writable: the write bit is already clear. */
        if (!(flags & VM_WRITE) || !(flags & VM_SHARED))
                return 0;

        /* The backer asked to hear about first writes to a page. */
        if (ops) {
                if (ops->page_mkwrite || ops->pfn_mkwrite)
                        return 1;
        }

        /*
         * The open routine changed the protections in a way that
         * pgprot_modify would not preserve.
         */
        if (pgprot_val(vm_pgprot_modify(vm_page_prot, flags)) !=
            pgprot_val(vm_page_prot))
                return 0;

        /*
         * Soft-dirty tracking is wanted when the feature is built in and
         * the flag is not set yet; hugetlb does not support it so far.
         */
        if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(flags & VM_SOFTDIRTY) &&
            !is_vm_hugetlb_page(vma))
                return 1;

        /* Specialty (PFN) mappings never want write notification. */
        if (flags & VM_PFNMAP)
                return 0;

        /* What remains: can the mapping track dirty pages at all? */
        if (!vma->vm_file || !vma->vm_file->f_mapping)
                return 0;

        return mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * We account for memory if it's a private writeable mapping,
 * not hugepages and VM_NORESERVE wasn't set.
 */
static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
        /*
         * hugetlb keeps its own accounting, apart from the core VM.
         * The file is tested because VM_HUGETLB may not be set yet.
         */
        if (file && is_file_hugepages(file))
                return 0;

        /* Not writable means there is nothing to account. */
        if (!(vm_flags & VM_WRITE))
                return 0;

        /* Writable: accountable unless shared or marked VM_NORESERVE. */
        return !(vm_flags & (VM_NORESERVE | VM_SHARED));
}

/*
 * __mmap_region - install a mapping of @len bytes at @addr in current->mm
 *
 * Existing mappings in the range are cleared first. The new range is then
 * either merged into a neighbouring vma by vma_merge(), or a fresh vma is
 * allocated, set up (file ->mmap via mmap_file(), shmem_zero_setup() for
 * shared mappings without a file, anonymous otherwise) and linked in.
 *
 * Returns the start address of the mapping, or a negative errno value.
 * On the error paths the memory charge taken here is returned and a vma
 * allocated here is freed again.
 */
static unsigned long __mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev, *merge;
        int error;
        struct rb_node **rb_link, *rb_parent;
        /* Pages charged via security_vm_enough_memory_mm(); 0 means none. */
        unsigned long charged = 0;

        /* Check against address space limit. */
        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                unsigned long nr_pages;

                /*
                 * MAP_FIXED may remove pages of mappings that intersects with
                 * requested mapping. Account for the pages it would unmap.
                 */
                nr_pages = count_vma_pages_range(mm, addr, addr + len);

                if (!may_expand_vm(mm, vm_flags,
                                        (len >> PAGE_SHIFT) - nr_pages))
                        return -ENOMEM;
        }

        /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
        if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
                return -ENOMEM;
        /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        return -ENOMEM;
                vm_flags |= VM_ACCOUNT;
        }

        /*
         * Can we just expand an old mapping?
         */
        vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
                        NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }

        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;

        if (file) {
                /*
                 * Temporarily deny write access; the matching
                 * allow_write_access() is at unmap_writable or on the
                 * error path below.
                 */
                if (vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                }

                /* ->mmap() can change vma->vm_file, but must guarantee that
                 * vma_link() below can deny write-access if VM_DENYWRITE is set
                 * and map writably if VM_SHARED is set. This usually means the
                 * new file must not have been exposed to user-space, yet.
                 */
                vma->vm_file = get_file(file);
                error = mmap_file(file, vma);
                if (error)
                        goto unmap_and_free_file_vma;

                /* Can addr have changed??
                 *
                 * Answer: Yes, several device drivers can do it in their
                 *         f_op->mmap method. -DaveM
                 * Bug: If addr is changed, prev, rb_link, rb_parent should
                 *      be updated for vma_link()
                 */
                WARN_ON_ONCE(addr != vma->vm_start);

                /*
                 * Drivers should not permit writability when previously it was
                 * disallowed.
                 */
                VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
                                !(vm_flags & VM_MAYWRITE) &&
                                (vma->vm_flags & VM_MAYWRITE));

                addr = vma->vm_start;

                /* If vm_flags changed after mmap_file(), we should try merge vma again
                 * as we may succeed this time.
                 */
                if (unlikely(vm_flags != vma->vm_flags && prev)) {
                        merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
                                NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
                        if (merge) {
                                /* ->mmap() can change vma->vm_file and fput the original file. So
                                 * fput the vma->vm_file here or we would add an extra fput for file
                                 * and cause general protection fault ultimately.
                                 */
                                fput(vma->vm_file);
                                vm_area_free(vma);
                                vma = merge;
                                /* Update vm_flags to pick up the change. */
                                vm_flags = vma->vm_flags;
                                /* Skips vma_link(): the merged vma is used instead. */
                                goto unmap_writable;
                        }
                }

                /* Pick up any flag changes made by mmap_file(). */
                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        } else {
                vma_set_anonymous(vma);
        }

#ifdef CONFIG_SPARC64
        /* TODO: Fix SPARC ADI! */
        WARN_ON_ONCE(!arch_validate_flags(vm_flags));
#endif

        vma_link(mm, vma, prev, rb_link, rb_parent);
        if (file) {
                /* Undo the temporary deny_write_access() taken above. */
unmap_writable:
                if (vm_flags & VM_DENYWRITE)
                        allow_write_access(file);
        }
        /* From here on 'file' is whatever the vma ended up referring to. */
        file = vma->vm_file;
out:
        perf_event_mmap(vma);

        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
        /*
         * VM_LOCKED: special, DAX, hugetlb and gate vmas get the locked
         * bits cleared; all other vmas are counted in mm->locked_vm.
         */
        if (vm_flags & VM_LOCKED) {
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm))
                        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
                else
                        mm->locked_vm += (len >> PAGE_SHIFT);
        }

        if (file)
                uprobe_mmap(vma);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vma->vm_flags |= VM_SOFTDIRTY;

        vma_set_page_prot(vma);

        return addr;

        /*
         * Error unwinding. Labels are ordered so that each entry point falls
         * through the cleanup steps of the ones below it.
         */
unmap_and_free_file_vma:
        /* mmap_file() failed: drop the reference taken by get_file(). */
        vma->vm_file = NULL;
        fput(file);

        /* Undo any partial mapping done by a device driver. */
        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
        if (vm_flags & VM_DENYWRITE)
                allow_write_access(file);
free_vma:
        vm_area_free(vma);
unacct_error:
        /* Give back the pages charged for an accountable mapping. */
        if (charged)
                vm_unacct_memory(charged);
        return error;
}

/*
 * unmapped_area - search current->mm for a free range described by @info,
 * preferring low addresses (left subtrees are visited before the current
 * node and right subtrees).
 *
 * Returns the start address of the range, adjusted to the alignment given
 * by info->align_mask / info->align_offset, or -ENOMEM when no gap fits.
 */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
        /*
         * We implement the search by looking for an rbtree node that
         * immediately follows a suitable gap. That is,
         * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
         * - gap_end   = vma->vm_start        >= info->low_limit  + length;
         * - gap_end - gap_start >= length
         */

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long length, low_limit, high_limit, gap_start, gap_end;

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask;
        if (length < info->length)
                return -ENOMEM;

        /*
         * Adjust search limits by the desired length: high_limit becomes the
         * highest acceptable gap start, low_limit the lowest acceptable gap
         * end.
         */
        if (info->high_limit < length)
                return -ENOMEM;
        high_limit = info->high_limit - length;

        if (info->low_limit > high_limit)
                return -ENOMEM;
        low_limit = info->low_limit + length;

        /*
         * Check if rbtree root looks promising. A subtree is skipped when
         * its rb_subtree_gap is smaller than the length being searched for.
         */
        if (RB_EMPTY_ROOT(&mm->mm_rb))
                goto check_highest;
        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
        if (vma->rb_subtree_gap < length)
                goto check_highest;

        while (true) {
                /* Visit left subtree if it looks promising */
                gap_end = vm_start_gap(vma);
                if (gap_end >= low_limit && vma->vm_rb.rb_left) {
                        struct vm_area_struct *left =
                                rb_entry(vma->vm_rb.rb_left,
                                         struct vm_area_struct, vm_rb);
                        if (left->rb_subtree_gap >= length) {
                                vma = left;
                                continue;
                        }
                }

                /* The gap before the first vma starts at address 0. */
                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
                /* Check if current node has a suitable gap */
                if (gap_start > high_limit)
                        return -ENOMEM;
                if (gap_end >= low_limit &&
                    gap_end > gap_start && gap_end - gap_start >= length)
                        goto found;

                /* Visit right subtree if it looks promising */
                if (vma->vm_rb.rb_right) {
                        struct vm_area_struct *right =
                                rb_entry(vma->vm_rb.rb_right,
                                         struct vm_area_struct, vm_rb);
                        if (right->rb_subtree_gap >= length) {
                                vma = right;
                                continue;
                        }
                }

                /*
                 * Go back up the rbtree to find next candidate node: climb
                 * until we arrive at a parent from its left child. That
                 * parent's own gap has not been checked yet.
                 */
                while (true) {
                        struct rb_node *prev = &vma->vm_rb;
                        if (!rb_parent(prev))
                                goto check_highest;
                        vma = rb_entry(rb_parent(prev),
                                       struct vm_area_struct, vm_rb);
                        if (prev == vma->vm_rb.rb_left) {
                                /*
                                 * vm_prev is used without a NULL check here;
                                 * presumably a node with a left child always
                                 * has a predecessor.
                                 */
                                gap_start = vm_end_gap(vma->vm_prev);
                                gap_end = vm_start_gap(vma);
                                goto check_current;
                        }
                }
        }

check_highest:
        /* Check highest gap, which does not precede any rbtree node */
        gap_start = mm->highest_vm_end;
        gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
        if (gap_start > high_limit)
                return -ENOMEM;

found:
        /* We found a suitable gap. Clip it with the original low_limit. */
        if (gap_start < info->low_limit)
                gap_start = info->low_limit;

        /* Adjust gap address to the desired alignment */
        gap_start += (info->align_offset - gap_start) & info->align_mask;

        VM_BUG_ON(gap_start + info->length > info->high_limit);
        VM_BUG_ON(gap_start + info->length > gap_end);
        return gap_start;
}

/*
 * unmapped_area_topdown - search current->mm for a free range described by
 * @info, preferring high addresses (the gap above the highest vma is tried
 * first, then right subtrees before the current node and left subtrees).
 *
 * Returns the start address of the range, placed as high in the chosen gap
 * as the alignment in info->align_mask / info->align_offset allows, or
 * -ENOMEM when no gap fits.
 */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long length, low_limit, high_limit, gap_start, gap_end;

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask;
        if (length < info->length)
                return -ENOMEM;

        /*
         * Adjust search limits by the desired length.
         * See implementation comment at top of unmapped_area().
         */
        gap_end = info->high_limit;
        if (gap_end < length)
                return -ENOMEM;
        high_limit = gap_end - length;

        if (info->low_limit > high_limit)
                return -ENOMEM;
        low_limit = info->low_limit + length;

        /*
         * Check highest gap, which does not precede any rbtree node.
         * gap_end still holds info->high_limit at this point.
         */
        gap_start = mm->highest_vm_end;
        if (gap_start <= high_limit)
                goto found_highest;

        /* Check if rbtree root looks promising */
        if (RB_EMPTY_ROOT(&mm->mm_rb))
                return -ENOMEM;
        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
        if (vma->rb_subtree_gap < length)
                return -ENOMEM;

        while (true) {
                /* Visit right subtree if it looks promising */
                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
                if (gap_start <= high_limit && vma->vm_rb.rb_right) {
                        struct vm_area_struct *right =
                                rb_entry(vma->vm_rb.rb_right,
                                         struct vm_area_struct, vm_rb);
                        if (right->rb_subtree_gap >= length) {
                                vma = right;
                                continue;
                        }
                }

check_current:
                /* Check if current node has a suitable gap */
                gap_end = vm_start_gap(vma);
                if (gap_end < low_limit)
                        return -ENOMEM;
                if (gap_start <= high_limit &&
                    gap_end > gap_start && gap_end - gap_start >= length)
                        goto found;

                /* Visit left subtree if it looks promising */
                if (vma->vm_rb.rb_left) {
                        struct vm_area_struct *left =
                                rb_entry(vma->vm_rb.rb_left,
                                         struct vm_area_struct, vm_rb);
                        if (left->rb_subtree_gap >= length) {
                                vma = left;
                                continue;
                        }
                }

                /*
                 * Go back up the rbtree to find next candidate node: climb
                 * until we arrive at a parent from its right child. That
                 * parent's own gap has not been checked yet.
                 */
                while (true) {
                        struct rb_node *prev = &vma->vm_rb;
                        if (!rb_parent(prev))
                                return -ENOMEM;
                        vma = rb_entry(rb_parent(prev),
                                       struct vm_area_struct, vm_rb);
                        if (prev == vma->vm_rb.rb_right) {
                                gap_start = vma->vm_prev ?
                                        vm_end_gap(vma->vm_prev) : 0;
                                goto check_current;
                        }
                }
        }

found:
        /* We found a suitable gap. Clip it with the original high_limit. */
        if (gap_end > info->high_limit)
                gap_end = info->high_limit;

found_highest:
        /*
         * Compute highest gap address at the desired alignment: step back
         * by the requested length, then down to the aligned address.
         */
        gap_end -= info->length;
        gap_end -= (gap_end - info->align_offset) & info->align_mask;

        VM_BUG_ON(gap_end < info->low_limit);
        VM_BUG_ON(gap_end < gap_start);
        return gap_end;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        /* VM_UNMAPPED_AREA_TOPDOWN selects the top-down search. */
        unsigned long addr = (info->flags & VM_UNMAPPED_AREA_TOPDOWN) ?
                                unmapped_area_topdown(info) :
                                unmapped_area(info);

        /* The result is traced whether or not the search succeeded. */
        trace_vm_unmapped_area(addr, info);
        return addr;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA
/*
 * Generic bottom-up placement of a @len byte mapping.
 *
 * - A request larger than the window between mmap_min_addr and the
 *   architecture's mmap end fails with -ENOMEM.
 * - MAP_FIXED requests get @addr back untouched.
 * - A non-zero @addr is treated as a hint: it is page aligned and used
 *   if the range fits in the window and clears the gaps of both
 *   neighbouring vmas.
 * - Otherwise vm_unmapped_area() searches [mm->mmap_base, mmap end).
 *
 * @filp and @pgoff are not used by this generic version.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr);
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info;
        struct vm_area_struct *next, *prev;

        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* Try the caller's hint first. */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);
                if (addr <= mmap_end - len && mmap_min_addr <= addr &&
                    (!next || addr + len <= vm_start_gap(next)) &&
                    (!prev || vm_end_gap(prev) <= addr))
                        return addr;
        }

        /* No usable hint: unaligned bottom-up search. */
        info.flags = 0;
        info.align_mask = 0;
        info.align_offset = 0;
        info.length = len;
        info.low_limit = mm->mmap_base;
        info.high_limit = mmap_end;
        return vm_unmapped_area(&info);
}
#endif

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
/*
 * Generic top-down placement of a @len byte mapping.
 *
 * Oversized requests, MAP_FIXED and hint handling match the bottom-up
 * allocator.  The search itself runs downwards from
 * arch_get_mmap_base(addr, mm->mmap_base) to max(PAGE_SIZE, mmap_min_addr).
 * If that fails, a bottom-up search of [TASK_UNMAPPED_BASE, mmap end) is
 * attempted before giving up.
 *
 * @filp and @pgoff are not used by this generic version.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr);
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info;
        struct vm_area_struct *next, *prev;

        /* The request cannot fit in the address space at all. */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* Try the caller's hint first. */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);
                if (addr <= mmap_end - len && mmap_min_addr <= addr &&
                                (!next || addr + len <= vm_start_gap(next)) &&
                                (!prev || vm_end_gap(prev) <= addr))
                        return addr;
        }

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.align_mask = 0;
        info.align_offset = 0;
        info.length = len;
        info.low_limit = max(PAGE_SIZE, mmap_min_addr);
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        addr = vm_unmapped_area(&info);

        /* A page-aligned result is a real address: done. */
        if (!offset_in_page(addr))
                return addr;

        /*
         * A failed mmap() very likely causes application failure,
         * so retry bottom-up.  This can happen with large stack
         * limits and large mmap() allocations.
         */
        VM_BUG_ON(addr != -ENOMEM);
        info.flags = 0;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = mmap_end;
        return vm_unmapped_area(&info);
}
#endif

/*
 * Pick the address at which a new mapping of @len bytes should live.
 *
 * The search routine is, in order of preference: the file's own
 * ->get_unmapped_area, shmem's routine for anonymous MAP_SHARED
 * mappings, or the mm's default.  The chosen address is then checked
 * against TASK_SIZE, page alignment and security_mmap_addr().
 *
 * Returns the address, or a negative errno cast to unsigned long.
 */
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        unsigned long (*finder)(struct file *, unsigned long,
                                unsigned long, unsigned long, unsigned long);
        unsigned long err;

        err = arch_mmap_check(addr, len, flags);
        if (err)
                return err;

        /* Careful about overflows.. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        finder = current->mm->get_unmapped_area;
        if (!file) {
                if (flags & MAP_SHARED) {
                        /*
                         * mmap_region() will call shmem_zero_setup() to
                         * create a file, so use shmem's get_unmapped_area
                         * in case it can be huge.  do_mmap() will clear
                         * pgoff, so match alignment.
                         */
                        pgoff = 0;
                        finder = shmem_get_unmapped_area;
                }
        } else if (file->f_op->get_unmapped_area) {
                finder = file->f_op->get_unmapped_area;
        }

        addr = finder(file, addr, len, pgoff, flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        err = security_mmap_addr(addr);
        if (err)
                return err;
        return addr;
}

EXPORT_SYMBOL(get_unmapped_area);

/*
 * Look up the first VMA which satisfies  addr < vm_end,  NULL if none.
 *
 * The vmacache is consulted first; on a miss the mm's rbtree is walked
 * and a successful result is stored back into the cache.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *best;
        struct rb_node *node;

        best = vmacache_find(mm, addr);
        if (likely(best))
                return best;

        /* Cache miss: best is NULL from here until a candidate shows up. */
        for (node = mm->mm_rb.rb_node; node; ) {
                struct vm_area_struct *cand;

                cand = rb_entry(node, struct vm_area_struct, vm_rb);

                /* Ends at or below addr: only the right subtree can match. */
                if (cand->vm_end <= addr) {
                        node = node->rb_right;
                        continue;
                }

                best = cand;
                /* addr lies inside cand: nothing lower can be better. */
                if (cand->vm_start <= addr)
                        break;
                node = node->rb_left;
        }

        if (best)
                vmacache_update(addr, best);
        return best;
}

EXPORT_SYMBOL(find_vma);

/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 *
 * When no vma satisfies addr < vm_end, *pprev is set to the last vma in
 * the mm's rbtree, or to NULL if the tree is empty.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma = find_vma(mm, addr);
        struct rb_node *last;

        if (vma) {
                *pprev = vma->vm_prev;
                return vma;
        }

        last = rb_last(&mm->mm_rb);
        if (last)
                *pprev = rb_entry(last, struct vm_area_struct, vm_rb);
        else
                *pprev = NULL;
        return vma;
}

/*
 * Verify that the stack growth is acceptable and update accounting.
 * Shared by the grow-up and grow-down cases.
 *
 * @size is the size in bytes the vma would have after growing and @grow
 * the number of pages being added.  Returns 0 when the growth is
 * allowed, -ENOMEM when a limit is hit and -EFAULT when the new range
 * would fall into a hugepage-only region.
 */
static int acct_stack_growth(struct vm_area_struct *vma,
                             unsigned long size, unsigned long grow)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long new_start;

        /* Address space limit. */
        if (!may_expand_vm(mm, vma->vm_flags, grow))
                return -ENOMEM;

        /* RLIMIT_STACK. */
        if (size > rlimit(RLIMIT_STACK))
                return -ENOMEM;

        /* RLIMIT_MEMLOCK, only relevant for mlocked stacks. */
        if (vma->vm_flags & VM_LOCKED) {
                unsigned long want_pages = mm->locked_vm + grow;
                unsigned long limit_pages = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

                if (want_pages > limit_pages && !capable(CAP_IPC_LOCK))
                        return -ENOMEM;
        }

        /* The stack must not grow into a hugetlb-only region. */
        if (vma->vm_flags & VM_GROWSUP)
                new_start = vma->vm_start;
        else
                new_start = vma->vm_end - size;
        if (is_hugepage_only_range(vma->vm_mm, new_start, size))
                return -EFAULT;

        /*
         * Overcommit..  This must be the final test, as it will
         * update security statistics.
         */
        if (security_vm_enough_memory_mm(mm, grow))
                return -ENOMEM;

        return 0;
}

#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 *
 * Grows @vma so that it covers the page containing @address.
 *
 * Returns 0 on success, and also when @vma already extends far enough
 * by the time the anon_vma lock is taken.  Returns -EFAULT if @vma is
 * not VM_GROWSUP, -ENOMEM when the address-space limit, the stack guard
 * gap, anon_vma allocation or a page-offset wrap forbids the growth,
 * or the error reported by acct_stack_growth().
 */
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next;
        unsigned long gap_addr;
        int error = 0;

        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        /* Guard against exceeding limits of the address space. */
        address &= PAGE_MASK;
        if (address >= (TASK_SIZE & PAGE_MASK))
                return -ENOMEM;
        /* From here on address is the new (exclusive) end of the vma. */
        address += PAGE_SIZE;

        /* Enforce stack_guard_gap */
        gap_addr = address + stack_guard_gap;

        /* Guard against overflow */
        if (gap_addr < address || gap_addr > TASK_SIZE)
                gap_addr = TASK_SIZE;

        /*
         * An accessible neighbour inside the guard gap blocks the growth
         * unless it is itself a grows-up vma.
         */
        next = vma->vm_next;
        if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
        }

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma)))
                return -ENOMEM;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
                unsigned long size, grow;

                /* size: resulting vma length in bytes; grow: pages added. */
                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* Refuse a growth that would wrap the page offset. */
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * vma_gap_update() doesn't support concurrent
                                 * updates, but we only hold a shared mmap_lock
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
                                 * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                anon_vma_interval_tree_post_update_vma(vma);
                                /*
                                 * The gap after this vma changed: refresh the
                                 * next vma's gap, or the mm's highest end
                                 * when this is the last vma.
                                 */
                                if (vma->vm_next)
                                        vma_gap_update(vma->vm_next);
                                else
                                        mm->highest_vm_end = vm_end_gap(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 *
 * Grows @vma downwards so that it starts at the page containing @address.
 *
 * Returns 0 on success, and also when @vma already starts low enough by
 * the time the anon_vma lock is taken.  Returns -EPERM when the page
 * lies below mmap_min_addr, -ENOMEM when the stack guard gap, anon_vma
 * allocation or a page-offset underflow forbids the growth, or the
 * error reported by acct_stack_growth().
 */
int expand_downwards(struct vm_area_struct *vma,
                                   unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *prev;
        int error = 0;

        address &= PAGE_MASK;
        if (address < mmap_min_addr)
                return -EPERM;

        /* Enforce stack_guard_gap */
        prev = vma->vm_prev;
        /* Check that both stack segments have the same anon_vma? */
        /*
         * Only an accessible previous vma that is not itself grows-down
         * has to stay at least stack_guard_gap below the new start.
         */
        if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
                        vma_is_accessible(prev)) {
                if (address - prev->vm_end < stack_guard_gap)
                        return -ENOMEM;
        }

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma)))
                return -ENOMEM;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
                unsigned long size, grow;

                /* size: resulting vma length in bytes; grow: pages added. */
                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* vm_pgoff is lowered by grow below; it must not underflow. */
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * vma_gap_update() doesn't support concurrent
                                 * updates, but we only hold a shared mmap_lock
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
                                 * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                anon_vma_interval_tree_post_update_vma(vma);
                                vma_gap_update(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
}

/*
 * Enforced gap between the expanding stack and other mappings, in bytes.
 * Defaults to 256 pages.
 */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

/*
 * Handle "stack_guard_gap=<pages>" on the kernel command line.
 *
 * The value is parsed as a decimal page count and applied only when the
 * whole string was consumed; otherwise the current gap is kept.
 * Always returns 1.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *end;
        unsigned long pages = simple_strtoul(p, &end, 10);

        if (*end == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
/*
 * CONFIG_STACK_GROWSUP flavour: growing the stack means extending @vma
 * upwards to cover @address.  Returns whatever expand_upwards() returns.
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
        return expand_upwards(vma, address);
}

/*
 * Find the vma containing @addr, growing the stack upwards if needed.
 *
 * If no vma contains the (page-aligned) address, the vma preceding it is
 * expanded with expand_stack(); when that vma is VM_LOCKED the newly
 * covered range is populated.  Returns the covering vma, or NULL when
 * there is no preceding vma or the expansion fails.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *hit;
        struct vm_area_struct *below;

        addr &= PAGE_MASK;
        hit = find_vma_prev(mm, addr, &below);
        if (hit && hit->vm_start <= addr)
                return hit;

        if (!below)
                return NULL;
        if (expand_stack(below, addr))
                return NULL;

        if (below->vm_flags & VM_LOCKED)
                populate_vma_page_range(below, addr, below->vm_end, NULL);
        return below;
}
#else
/*
 * Grows-down flavour: growing the stack means extending @vma downwards
 * to cover @address.  Returns whatever expand_downwards() returns.
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
        return expand_downwards(vma, address);
}

/*
 * Find the vma containing @addr, growing the stack downwards if needed.
 *
 * If the first vma ending above the (page-aligned) address does not
 * contain it, that vma must be VM_GROWSDOWN and is expanded with
 * expand_stack(); when it is VM_LOCKED the newly covered range is
 * populated.  Returns the covering vma, or NULL on any failure.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        unsigned long old_start;

        addr &= PAGE_MASK;
        vma = find_vma(mm, addr);

        /* Either nothing ends above addr (NULL) or vma already covers it. */
        if (!vma || vma->vm_start <= addr)
                return vma;

        if (!(vma->vm_flags & VM_GROWSDOWN))
                return NULL;

        /* Remember the old start: it bounds the range to populate. */
        old_start = vma->vm_start;
        if (expand_stack(vma, addr))
                return NULL;

        if (vma->vm_flags & VM_LOCKED)
                populate_vma_page_range(vma, addr, old_start, NULL);
        return vma;
}
#endif

EXPORT_SYMBOL_GPL(find_extend_vma);

/*
 * Release every vma on the detached list starting at @vma and fix up
 * the mm's accounting: per-type page statistics are lowered for each
 * vma and the pages of VM_ACCOUNT vmas are unaccounted in one go.
 *
 * Called with the mm semaphore held.
 */
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct vm_area_struct *cur = vma;
        unsigned long accounted_pages = 0;

        /* Update high watermark before we lower total_vm */
        update_hiwater_vm(mm);
        do {
                long pages = vma_pages(cur);

                vm_stat_account(mm, cur->vm_flags, -pages);
                if (cur->vm_flags & VM_ACCOUNT)
                        accounted_pages += pages;

                /* remove_vma() hands back the next vma on the list. */
                cur = remove_vma(cur);
        } while (cur);

        vm_unacct_memory(accounted_pages);
        validate_mm(mm);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Unmaps the pages of the vma list starting at @vma within [@start, @end)
 * and then frees the page tables between the end of @prev and the start
 * of the following vma.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end)
{
        /* NOTE(review): presumably the vma that follows prev - confirm vma_next(). */
        struct vm_area_struct *next = vma_next(mm, prev);
        struct mmu_gather tlb;
        struct vm_area_struct *cur_vma;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end);

        /*
         * Ensure we have no stale TLB entries by the time this mapping is
         * removed from the rmap.
         * Note that we don't have to worry about nested flushes here because
         * we're holding the mm semaphore for removing the mapping - so any
         * concurrent flush in this region has to be coming through the rmap,
         * and we synchronize against that using the rmap lock.
         */
        /* One flush is enough: stop at the first PFNMAP/MIXEDMAP vma found. */
        for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
                if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
                        tlb_flush_mmu(&tlb);
                        break;
                }
        }

        /*
         * Page tables are freed from the end of prev (or the first user
         * address) up to the start of next (or the page table ceiling).
         */
        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
                                 next ? next->vm_start : USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, start, end);
}

/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 *
 * Every vma from @vma onwards that starts below @end is erased from the
 * rbtree and unlinked from the mm's list; the detached vmas stay chained
 * through vm_next with @vma as the head.  @prev is the vma preceding
 * @vma, or NULL if @vma was the first one.
 *
 * Returns true if the caller may downgrade mmap_lock, false if a
 * neighbouring growable vma makes that unsafe.
 */
static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, unsigned long end)
{
        struct vm_area_struct **insertion_point;
        struct vm_area_struct *tail_vma = NULL;

        /* Link that will be redirected to the first surviving vma. */
        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        vma->vm_prev = NULL;
        do {
                vma_rb_erase(vma, &mm->mm_rb);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
        /* vma is now the first vma beyond the range, or NULL. */
        *insertion_point = vma;
        if (vma) {
                vma->vm_prev = prev;
                vma_gap_update(vma);
        } else
                mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
        /* Terminate the detached list. */
        tail_vma->vm_next = NULL;

        /* Kill the cache */
        vmacache_invalidate(mm);

        /*
         * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
         * VM_GROWSUP VMA. Such VMAs can change their size under
         * down_read(mmap_lock) and collide with the VMA we are about to unmap.
         */
        if (vma && (vma->vm_flags & VM_GROWSDOWN))
                return false;
        if (prev && (prev->vm_flags & VM_GROWSUP))
                return false;
        return true;
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
 * has already been checked or doesn't make sense to fail.
 *
 * Splits @vma at @addr.  A copy of @vma is made for one side: the part
 * below @addr when @new_below is non-zero, otherwise the part from
 * @addr upwards; @vma itself is then shrunk to the other side.
 *
 * Returns 0 on success, or the error from the vma's ->split hook,
 * vma_dup_policy(), anon_vma_clone() or vma_adjust(), or -ENOMEM when
 * the copy cannot be allocated.
 */
int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr, int new_below)
{
        struct vm_area_struct *new;
        int err;

        /* Give the mapping's owner a chance to refuse the split. */
        if (vma->vm_ops && vma->vm_ops->split) {
                err = vma->vm_ops->split(vma, addr);
                if (err)
                        return err;
        }

        new = vm_area_dup(vma);
        if (!new)
                return -ENOMEM;

        if (new_below)
                new->vm_end = addr;
        else {
                new->vm_start = addr;
                /* The upper half starts further into the backing object. */
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        err = vma_dup_policy(vma, new);
        if (err)
                goto out_free_vma;

        err = anon_vma_clone(new, vma);
        if (err)
                goto out_free_mpol;

        /* The new vma holds its own reference on the file. */
        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        /* Shrink the original vma to the remaining side and insert new. */
        if (new_below)
                err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
                        ((addr - new->vm_start) >> PAGE_SHIFT), new);
        else
                err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);

        /* Success. */
        if (!err)
                return 0;

        /* Clean everything up if vma_adjust failed. */
        vma_close(new);
        if (new->vm_file)
                fput(new->vm_file);
        unlink_anon_vmas(new);
 out_free_mpol:
        mpol_put(vma_policy(new));
 out_free_vma:
        vm_area_free(new);
        return err;
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 *
 * Fails with -ENOMEM when the mm already has sysctl_max_map_count
 * mappings; otherwise returns the result of __split_vma().
 */
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
              unsigned long addr, int new_below)
{
        if (mm->map_count < sysctl_max_map_count)
                return __split_vma(mm, vma, addr, new_below);

        return -ENOMEM;
}

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 *
 * Unmaps [@start, @start + PAGE_ALIGN(@len)) from @mm.  @uf, when not
 * NULL, is passed to userfaultfd_unmap_prep().  @downgrade asks for
 * mmap_lock to be downgraded to read mode before the pages are
 * unmapped, when that is safe.
 *
 * Returns -EINVAL for a misaligned/out-of-range @start or a zero or
 * oversized @len, 0 when nothing overlaps the range or the unmap
 * completed without a downgrade, 1 when the unmap completed and the
 * lock was downgraded, or an error from splitting / userfaultfd prep.
 */
int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
                struct list_head *uf, bool downgrade)
{
        unsigned long end;
        struct vm_area_struct *vma, *prev, *last;

        if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;
        if (len == 0)
                return -EINVAL;

        /*
         * arch_unmap() might do unmaps itself.  It must be called
         * and finish any rbtree manipulation before this code
         * runs and also starts to manipulate the rbtree.
         */
        arch_unmap(mm, start, end);

        /* Find the first overlapping VMA */
        vma = find_vma(mm, start);
        if (!vma)
                return 0;
        prev = vma->vm_prev;
        /* we have  start < vma->vm_end  */

        /* if it doesn't overlap, we have nothing.. */
        if (vma->vm_start >= end)
                return 0;

        /*
         * If we need to split any vma, do it now to save pain later.
         *
         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
         * unmapped vm_area_struct will remain in use: so lower split_vma
         * places tmp vma above, and higher split_vma places tmp vma below.
         */
        if (start > vma->vm_start) {
                int error;

                /*
                 * Make sure that map_count on return from munmap() will
                 * not exceed its limit; but let map_count go just above
                 * its limit temporarily, to help free resources as expected.
                 */
                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
                        return -ENOMEM;

                error = __split_vma(mm, vma, start, 0);
                if (error)
                        return error;
                /* The lower part survives and becomes the predecessor. */
                prev = vma;
        }

        /* Does it split the last one? */
        last = find_vma(mm, end);
        if (last && end > last->vm_start) {
                int error = __split_vma(mm, last, end, 1);
                if (error)
                        return error;
        }
        /* Re-read the first vma to unmap now that splitting is done. */
        vma = vma_next(mm, prev);

        if (unlikely(uf)) {
                /*
                 * If userfaultfd_unmap_prep returns an error the vmas
                 * will remain split, but userland will get a
                 * highly unexpected error anyway. This is no
                 * different than the case where the first of the two
                 * __split_vma fails, but we don't undo the first
                 * split, although we could. This failure is unlikely
                 * enough that it's not worth optimizing for.
                 */
                int error = userfaultfd_unmap_prep(vma, start, end, uf);
                if (error)
                        return error;
        }

        /*
         * unlock any mlock()ed ranges before detaching vmas
         */
        if (mm->locked_vm) {
                struct vm_area_struct *tmp = vma;
                while (tmp && tmp->vm_start < end) {
                        if (tmp->vm_flags & VM_LOCKED) {
                                mm->locked_vm -= vma_pages(tmp);
                                munlock_vma_pages_all(tmp);
                        }

                        tmp = tmp->vm_next;
                }
        }

        /* Detach vmas from rbtree */
        if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
                downgrade = false;

        if (downgrade)
                mmap_write_downgrade(mm);

        unmap_region(mm, vma, prev, start, end);

        /* Fix up all other VM information */
        remove_vma_list(mm, vma);

        /* 1 tells the caller that mmap_lock is now held for read only. */
        return downgrade ? 1 : 0;
}

/*
 * Unmap [@start, @start + @len) from @mm without ever downgrading
 * mmap_lock.  Returns whatever __do_munmap() returns, which is never 1
 * since no downgrade is requested.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        return __do_munmap(mm, start, len, uf, false);
}

/*
 * Wrapper around __mmap_region() that validates @vm_flags and, for a
 * shared writable file mapping, holds the file's mapping writable for
 * the duration of the call.
 *
 * Returns -EINVAL if the architecture rejects @vm_flags, the error from
 * mapping_map_writable() if that fails, otherwise the result of
 * __mmap_region().
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                          struct list_head *uf)
{
        bool holds_writable = false;
        unsigned long ret;
        int err;

        /* Allow architectures to sanity-check the vm_flags. */
        if (!arch_validate_flags(vm_flags))
                return -EINVAL;

        /* Map writable and ensure this isn't a sealed memfd. */
        if (file && is_shared_maywrite(vm_flags)) {
                err = mapping_map_writable(file->f_mapping);
                if (err)
                        return err;
                holds_writable = true;
        }

        ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);

        /* Clear our write mapping regardless of error. */
        if (holds_writable)
                mapping_unmap_writable(file->f_mapping);

        validate_mm(current->mm);
        return ret;
}

/*
 * Unmap [@start, @start + @len) from the current mm, taking and
 * releasing mmap_lock around the operation.
 *
 * Returns -EINTR if the lock wait was interrupted by a fatal signal,
 * otherwise the result of __do_munmap() with its internal "downgraded"
 * value of 1 folded back to 0.
 */
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
        struct mm_struct *mm = current->mm;
        LIST_HEAD(uf);
        int rc;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        rc = __do_munmap(mm, start, len, &uf, downgrade);
        if (rc != 1) {
                mmap_write_unlock(mm);
        } else {
                /*
                 * 1 means mmap_lock was downgraded to read mode.  It is
                 * not a legal return value of vm_munmap() or munmap(),
                 * so report success instead.
                 */
                mmap_read_unlock(mm);
                rc = 0;
        }

        userfaultfd_unmap_complete(mm, &uf);
        return rc;
}

/*
 * Unmap [@start, @start + @len) from the current mm.  mmap_lock is
 * taken and released internally and is never downgraded.
 */
int vm_munmap(unsigned long start, size_t len)
{
        return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);

/*
 * munmap(2) entry point: @addr is passed through untagged_addr(),
 * reported to profile_munmap(), and the range is then unmapped with
 * mmap_lock downgrade permitted.
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        addr = untagged_addr(addr);
        profile_munmap(addr);
        return __vm_munmap(addr, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * The request is turned into a MAP_SHARED | MAP_FIXED | MAP_POPULATE
 * do_mmap() of the same file over [@start, @start + @size), with the
 * protection bits taken from the vma found at @start.  @prot must be 0.
 * Of the caller's @flags only MAP_NONBLOCK is kept.
 *
 * The range must begin inside a VM_SHARED vma and, if it extends past
 * that vma, be covered without holes by vmas with the same file and
 * the same flags.
 *
 * Returns 0 on success, -EINVAL for a bad request, -EINTR if waiting
 * for mmap_lock was interrupted, or the error from do_mmap().
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Rejects both an empty range and one that wraps. */
        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vma = find_vma(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;

        if (start < vma->vm_start)
                goto out;

        /* The range spills past the first vma: check what follows. */
        if (start + size > vma->vm_end) {
                struct vm_area_struct *next;

                for (next = vma->vm_next; next; next = next->vm_next) {
                        /* hole between vmas ? */
                        if (next->vm_start != next->vm_prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        if (start + size <= next->vm_end)
                                break;
                }

                /* Ran off the end of the list before covering the range. */
                if (!next)
                        goto out;
        }

        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        /*
         * No explicit munlock of the over-mapped range is done here.
         * The loop that used to sit at this point tested
         * "tmp->vm_start >= start + size", which is always false for
         * the first vma (vma->vm_start <= start < start + size), so it
         * never ran; it has been dropped rather than left as dead code.
         */
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Hold a file reference across do_mmap(). */
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, pgoff, &populate, NULL);
        fput(file);
out:
        mmap_write_unlock(mm);
        if (populate)
                mm_populate(ret, populate);
        /* A successful do_mmap() returns an address; report plain 0. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * do_brk_flags() - a simplified "do_mmap" that only handles anonymous maps.
 * Eventually we may be able to do some brk-specific accounting here.
 *
 * @addr:  start of the range; also used to derive the new vma's vm_pgoff
 * @len:   length of the range in bytes (vm_brk_flags() passes a
 *         page-aligned value)
 * @flags: extra vm flags; anything other than VM_EXEC is rejected
 * @uf:    list passed on to munmap_vma_range() when old maps are cleared
 *
 * vm_brk_flags() calls this with the mmap lock held for writing.
 *
 * Returns 0 on success, -EINVAL for unsupported @flags, the error from
 * get_unmapped_area() or mlock_future_check(), or -ENOMEM.
 */
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        pgoff_t pgoff = addr >> PAGE_SHIFT;
        int error;
        unsigned long mapped_addr;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

        /* Only the error case of this MAP_FIXED lookup is consumed below. */
        mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
        if (IS_ERR_VALUE(mapped_addr))
                return mapped_addr;

        error = mlock_future_check(mm, mm->def_flags, len);
        if (error)
                return error;

        /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
        if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
                return -ENOMEM;

        /* Check against address space limits *after* clearing old maps... */
        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;

        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
                        NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;

        /*
         * create a vma struct for an anonymous mapping
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                /*
                 * NOTE(review): presumably undoes the charge taken by
                 * security_vm_enough_memory_mm() above - confirm.
                 */
                vm_unacct_memory(len >> PAGE_SHIFT);
                return -ENOMEM;
        }

        vma_set_anonymous(vma);
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_pgoff = pgoff;
        vma->vm_flags = flags;
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_link(mm, vma, prev, rb_link, rb_parent);
out:
        /* Reached both after a successful merge and after linking a new vma. */
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vma->vm_flags |= VM_SOFTDIRTY;
        return 0;
}

/*
 * vm_brk_flags() - map an anonymous, brk-style region into current->mm.
 * @addr:    start address of the region
 * @request: requested length in bytes; rounded up with PAGE_ALIGN()
 * @flags:   passed through to do_brk_flags()
 *
 * Takes the mmap lock for writing around do_brk_flags() and, on success,
 * populates the new range when mm->def_flags has VM_LOCKED set.
 *
 * Returns 0 on success or for a zero-length request, -ENOMEM if rounding
 * @request up wraps around, -EINTR if mmap_write_lock_killable() fails,
 * otherwise whatever do_brk_flags() returned.
 */
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        unsigned long aligned_len = PAGE_ALIGN(request);
        bool want_populate;
        int err;
        LIST_HEAD(uf);

        /* PAGE_ALIGN() wrapped around, so the request cannot be honoured. */
        if (aligned_len < request)
                return -ENOMEM;
        if (aligned_len == 0)
                return 0;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        err = do_brk_flags(addr, aligned_len, flags, &uf);
        /* def_flags is sampled before the lock is dropped. */
        want_populate = (mm->def_flags & VM_LOCKED) != 0;
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);

        if (!err && want_populate)
                mm_populate(addr, aligned_len);

        return err;
}
EXPORT_SYMBOL(vm_brk_flags);

/*
 * vm_brk() - vm_brk_flags() with no extra flags.
 * @addr: start address, passed through unchanged
 * @len:  requested length, passed through unchanged
 *
 * Returns whatever vm_brk_flags() returns.
 */
int vm_brk(unsigned long addr, unsigned long len)
{
        return vm_brk_flags(addr, len, 0);
}
EXPORT_SYMBOL(vm_brk);

/*
 * Release all mmaps.
 *
 * Tears down every vma of @mm: releases the mmu notifiers, reaps the mm up
 * front when it is an OOM victim, munlocks VM_LOCKED vmas when
 * mm->locked_vm is non-zero, unmaps all pages and frees the page tables,
 * then removes each vma and un-accounts the pages of VM_ACCOUNT vmas.
 */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        if (unlikely(mm_is_oom_victim(mm))) {
                /*
                 * Manually reap the mm to free as much memory as possible.
                 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
                 * this mm from further consideration.  Taking mm->mmap_lock for
                 * write after setting MMF_OOM_SKIP will guarantee that the oom
                 * reaper will not run on this mm again after mmap_lock is
                 * dropped.
                 *
                 * Nothing can be holding mm->mmap_lock here and the above call
                 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
                 * __oom_reap_task_mm() will not block.
                 *
                 * This needs to be done before calling munlock_vma_pages_all(),
                 * which clears VM_LOCKED, otherwise the oom reaper cannot
                 * reliably test it.
                 */
                (void)__oom_reap_task_mm(mm);

                set_bit(MMF_OOM_SKIP, &mm->flags);
                /* Lock/unlock pair only: see the comment above. */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }

        /* Skip the list walk entirely when nothing is accounted as locked. */
        if (mm->locked_vm) {
                vma = mm->mmap;
                while (vma) {
                        if (vma->vm_flags & VM_LOCKED)
                                munlock_vma_pages_all(vma);
                        vma = vma->vm_next;
                }
        }

        arch_exit_mmap(mm);

        vma = mm->mmap;
        if (!vma)        /* Can happen if dup_mmap() received an OOM */
                return;

        lru_add_drain();
        flush_cache_mm(mm);
        tlb_gather_mmu(&tlb, mm, 0, -1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, vma, 0, -1);
        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, 0, -1);

        /*
         * Walk the list again, actually closing and freeing it,
         * with preemption enabled, without holding any MM locks.
         */
        while (vma) {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                /* remove_vma() hands back the next vma to process. */
                vma = remove_vma(vma);
                cond_resched();
        }
        vm_unacct_memory(nr_accounted);
}

/*
 * insert_vm_struct() - link @vma into @mm at its [vm_start, vm_end) range.
 *
 * Insert vm structure into process list sorted by address and into the
 * inode's i_mmap tree.  If vm_file is non-NULL then i_mmap_rwsem is taken
 * here.
 *
 * Returns 0 on success, or -ENOMEM when find_vma_links() fails for the range
 * or when a VM_ACCOUNT vma fails security_vm_enough_memory_mm().
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct rb_node **link, *parent;
        struct vm_area_struct *pred;

        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
                           &pred, &link, &parent))
                return -ENOMEM;

        /* Only accountable vmas go through the memory check. */
        if (vma->vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, vma_pages(vma)))
                        return -ENOMEM;
        }

        /*
         * For a purely anonymous vma, vm_pgoff should not matter until the
         * first write fault sets the page's anon_vma and index.  Set it now
         * to the value it will almost certainly end up with (unless mremap
         * moves the vma before that fault), so /proc/pid/maps tells a
         * consistent story.
         *
         * Deriving it from the virtual start address lets merges and splits
         * reuse the existing file pgoff checks and manipulations, the same
         * way do_mmap and do_brk_flags do.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        vma_link(mm, vma, pred, link, parent);

        return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 *
 * @vmap:            in: the source vma; out: rewritten to the merge result
 *                   when the source's start lies inside it
 * @addr, @len:      destination range [addr, addr + len)
 * @pgoff:           page offset for the destination; replaced by
 *                   addr >> PAGE_SHIFT for an anonymous vma without anon_vma
 * @need_rmap_locks: out: on a merge, new_vma->vm_pgoff <= vma->vm_pgoff;
 *                   false for a freshly linked copy; untouched on failure
 *
 * Returns the vma now covering the destination (merged or newly linked),
 * or NULL on failure.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        bool faulted_in_anon_vma = true;

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;        /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
                            vma->vm_userfaultfd_ctx);
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                /* No merge possible: duplicate the source and link the copy. */
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                new_vma->vm_start = addr;
                new_vma->vm_end = addr + len;
                new_vma->vm_pgoff = pgoff;
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                vma_link(mm, new_vma, prev, rb_link, rb_parent);
                *need_rmap_locks = false;
        }
        return new_vma;

        /* Error unwinding, in reverse order of the setup steps above. */
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages.
 *
 * RLIMIT_AS is checked against mm->total_vm for every mapping; RLIMIT_DATA
 * is checked against mm->data_vm for data mappings only.  Going over
 * RLIMIT_DATA is reported through pr_warn_once() and only refuses the
 * expansion when ignore_rlimit_data is not set.
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        /* The address-space limit applies to every kind of mapping. */
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        /* Everything below concerns the data limit only. */
        if (!is_data_mapping(flags))
                return true;
        if (mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     (mm->data_vm + npages) << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the data limit: allowed only when the limit is being ignored. */
        return ignore_rlimit_data;
}

/*
 * Add @npages to mm->total_vm and to at most one of the per-kind counters.
 * The kinds are tested in the order exec, stack, data and the first match
 * wins.  @npages is signed, so a negative value subtracts.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        mm->total_vm += npages;

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

/* Forward declaration: the vm_operations tables below reference it. */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Having a close hook prevents vma merging regardless of flags.
 * The hook itself does nothing.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * ->mremap hook of special_mapping_vmops.
 *
 * Warns once and returns -EFAULT when @new_vma does not belong to
 * current->mm.  Otherwise defers to the mapping's own ->mremap hook when it
 * has one, and returns 0 when it has none.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *sm = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        return sm->mremap ? sm->mremap(sm, new_vma) : 0;
}

/*
 * vm_operations for mappings created by _install_special_mapping(), whose
 * vm_private_data is a struct vm_special_mapping.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies that VVAR can't be accessed remotely */
        .access = NULL,
};

/*
 * vm_operations for mappings created by install_special_mapping(), whose
 * vm_private_data is a bare struct page * array.  Only the close and fault
 * hooks are provided.
 */
static const struct vm_operations_struct legacy_special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
};

/*
 * Fault handler shared by both special-mapping vm_operations tables.
 *
 * Under the legacy table vm_private_data is the page array itself.
 * Otherwise it is a struct vm_special_mapping: its ->fault hook, when set,
 * handles the fault completely, and its ->pages array is used when not.
 *
 * The array is walked for up to vmf->pgoff entries, stopping early at a
 * NULL entry.  Returns 0 with vmf->page set (after get_page()), or
 * VM_FAULT_SIGBUS when the walk ended on a NULL entry.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page **slot;
        pgoff_t left;

        if (vma->vm_ops != &legacy_special_mapping_vmops) {
                struct vm_special_mapping *sm = vma->vm_private_data;

                if (sm->fault)
                        return sm->fault(sm, vmf->vma, vmf);

                slot = sm->pages;
        } else {
                slot = vma->vm_private_data;
        }

        /* Step to entry vmf->pgoff unless the array ends before it. */
        left = vmf->pgoff;
        while (left && *slot) {
                left--;
                slot++;
        }

        if (!*slot)
                return VM_FAULT_SIGBUS;

        get_page(*slot);
        vmf->page = *slot;
        return 0;
}

/*
 * Allocate a vma for [addr, addr + len) in @mm, attach @ops and @priv to it
 * and insert it into the mm.
 *
 * mm->def_flags, VM_DONTEXPAND and VM_SOFTDIRTY are always OR-ed into
 * @vm_flags.  Returns the new vma, or an ERR_PTR(): -ENOMEM when the
 * allocation fails, otherwise the error from insert_vm_struct().
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        vma->vm_start = addr;
        vma->vm_end = addr + len;

        vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_ops = ops;
        vma->vm_private_data = priv;

        err = insert_vm_struct(mm, vma);
        if (err) {
                /* Insertion failed, so release the vma allocated above. */
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                (vma->vm_ops == &special_mapping_vmops ||
                 vma->vm_ops == &legacy_special_mapping_vmops);
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * The vma gets special_mapping_vmops and @spec as its private data, so
 * faults are served by spec->fault when that hook is set and from the
 * spec->pages array otherwise (see special_mapping_fault()).
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * @spec and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 *
 * Returns the new vma, or the ERR_PTR() from __install_special_mapping().
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}

/*
 * Variant of _install_special_mapping() that takes the pages directly as a
 * struct page * array.  The array becomes the vma's private data and the
 * vma uses legacy_special_mapping_vmops.
 *
 * Returns 0 on success, or the negative errno carried by the ERR_PTR() that
 * __install_special_mapping() returned.
 */
int install_special_mapping(struct mm_struct *mm,
                            unsigned long addr, unsigned long len,
                            unsigned long vm_flags, struct page **pages)
{
        struct vm_area_struct *vma = __install_special_mapping(
                mm, addr, len, vm_flags, (void *)pages,
                &legacy_special_mapping_vmops);

        return PTR_ERR_OR_ZERO(vma);
}

/* Held from mm_take_all_locks() until mm_drop_all_locks() releases it. */
static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Write-lock the root of @anon_vma unless it is already marked as locked.
 * Bit 0 of the root's rb_root.rb_node pointer is the "already locked"
 * marker, so a root shared by several vmas is only locked once.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of rb_root.rb_node can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify rb_root.rb_node after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, rb_root.rb_node
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

/*
 * Write-lock @mapping's i_mmap_rwsem unless AS_MM_ALL_LOCKS is already set
 * in mapping->flags.  The flag is set before the lock is taken, so a
 * mapping shared by several vmas is only locked once.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to prevent new
 * anon_vmas from being associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags prevent taking the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *     hugetlb mapping);
 *   - all i_mmap_rwsem locks;
 *   - all anon_vma->rwsem locks.
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we are protected from parallel mm_take_all_locks()
 * by mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals: it then
 * calls mm_drop_all_locks() and returns -EINTR.  It returns 0 on success.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;

        /* BUGs if the read lock can be taken here. */
        BUG_ON(mmap_read_trylock(mm));

        mutex_lock(&mm_all_locks_mutex);

        /* Pass 1: i_mmap_rwsem of hugetlb mappings. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 2: i_mmap_rwsem of all other file mappings. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 3: every anon_vma on each vma's anon_vma_chain. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        /* Also releases mm_all_locks_mutex. */
        mm_drop_all_locks(mm);
        return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): when bit 0 of the root's rb_root.rb_node pointer
 * is set, clear it and release the write lock.  Does nothing when the bit
 * is clear.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of rb_root.rb_node can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, rb_root.rb_node
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

/*
 * Undo vm_lock_mapping(): when AS_MM_ALL_LOCKS is set in mapping->flags,
 * release the i_mmap write lock and then clear the flag.  Does nothing when
 * the flag is clear.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * Release everything mm_take_all_locks() acquired: for each vma the
 * anon_vma locks on its chain and then its file mapping lock, and finally
 * mm_all_locks_mutex.
 *
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct anon_vma_chain *avc;
        struct vm_area_struct *vma;

        BUG_ON(mmap_read_trylock(mm));
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        vma = mm->mmap;
        while (vma) {
                if (vma->anon_vma) {
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                }

                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);

                vma = vma->vm_next;
        }

        mutex_unlock(&mm_all_locks_mutex);
}

/*
 * Initialise the percpu counter for VM (vm_committed_as), starting at 0.
 * A failing initialisation trips VM_BUG_ON().
 */
void __init mmap_init(void)
{
        int err = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);

        VM_BUG_ON(err);
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * The reserve is meant to stop a user from starting a single memory hogging
 * process that they can no longer recover from (kill the hog) in
 * OVERCOMMIT_NEVER mode.
 *
 * The default value is min(3% of free memory, 128MB); 128MB is enough to
 * recover with sshd/login, bash, and top/kill.  Always returns 0.
 */
static int init_user_reserve(void)
{
        unsigned long cap_kbytes = 1UL << 17;        /* 128MB, in kB */
        unsigned long free_kbytes;

        free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
        sysctl_user_reserve_kbytes = min(free_kbytes / 32, cap_kbytes);

        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * sysctl_admin_reserve_kbytes exists so that the sys admin can still log in
 * and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.  Always returns 0.
 */
static int init_admin_reserve(void)
{
        unsigned long cap_kbytes = 1UL << 13;        /* 8MB, in kB */
        unsigned long free_kbytes;

        free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, cap_kbytes);

        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 *
 * Always returns NOTIFY_OK; actions other than MEM_ONLINE and MEM_OFFLINE
 * are ignored.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long cur_kbytes, free_kbytes;

        if (action == MEM_ONLINE) {
                /* Default max is 128MB. Leave alone if modified by operator. */
                cur_kbytes = sysctl_user_reserve_kbytes;
                if (cur_kbytes > 0 && cur_kbytes < (1UL << 17))
                        init_user_reserve();

                /* Default max is 8MB.  Leave alone if modified by operator. */
                cur_kbytes = sysctl_admin_reserve_kbytes;
                if (cur_kbytes > 0 && cur_kbytes < (1UL << 13))
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

                /* Reset any reserve that is now larger than free memory. */
                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        return NOTIFY_OK;
}

/* Notifier block registered by init_reserve_notifier(). */
static struct notifier_block reserve_mem_nb = {
        .notifier_call = reserve_mem_notifier,
};

/*
 * Register reserve_mem_nb as a hot-memory notifier.  A registration failure
 * is only logged; the initcall returns 0 either way.
 */
static int __meminit init_reserve_notifier(void)
{
        int err = register_hotmemory_notifier(&reserve_mem_nb);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);






































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LLIST_H
#define LLIST_H
/*
 * Lock-less NULL terminated single linked list
 *
 * Cases where locking is not needed:
 * If there are multiple producers and multiple consumers, llist_add can be
 * used in producers and llist_del_all can be used in consumers simultaneously
 * without locking. Also a single consumer can use llist_del_first while
 * multiple producers simultaneously use llist_add, without any locking.
 *
 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer, and
 * llist_del_first or llist_del_all used in other consumers, then a lock is
 * needed.  This is because llist_del_first depends on list->first->next not
 * changing, but without lock protection, there's no way to be sure about that
 * if a preemption happens in the middle of the delete operation and on being
 * preempted back, the list->first is the same as before causing the cmpxchg in
 * llist_del_first to succeed. For example, while a llist_del_first operation
 * is in progress in one consumer, then a llist_del_first, llist_add,
 * llist_add (or llist_del_all, llist_add, llist_add) sequence in another
 * consumer may cause violations.
 *
 * This can be summarized as follows:
 *
 *           |   add    | del_first |  del_all
 * add       |    -     |     -     |     -
 * del_first |          |     L     |     L
 * del_all   |          |           |     -
 *
 * Where, a particular row's operation can happen concurrently with a column's
 * operation, with "-" being no lock needed, while "L" being lock is needed.
 *
 * The list entries deleted via llist_del_all can be traversed with
 * traversing function such as llist_for_each etc.  But the list
 * entries can not be traversed safely before deleted from the list.
 * The order of deleted entries is from the newest to the oldest added
 * one.  If you want to traverse from the oldest to the newest, you
 * must reverse the order by yourself before traversing.
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */

#include <linux/atomic.h>
#include <linux/kernel.h>

/*
 * Head of a lock-less list.  @first points at the first entry and is set to
 * NULL by init_llist_head() and LLIST_HEAD_INIT().
 */
struct llist_head {
        struct llist_node *first;
};

/*
 * Link of a lock-less list entry.  @next points at the following entry; the
 * list is NULL terminated.
 */
struct llist_node {
        struct llist_node *next;
};

/* Initialiser for a struct llist_head with a NULL first pointer; @name is unused. */
#define LLIST_HEAD_INIT(name)        { NULL }
/* Define a struct llist_head variable called @name, initialised as above. */
#define LLIST_HEAD(name)        struct llist_head name = LLIST_HEAD_INIT(name)

/**
 * init_llist_head - initialize lock-less list head
 * @list:        the head for your lock-less list
 *
 * Sets the head's first pointer to NULL.
 */
static inline void init_llist_head(struct llist_head *list)
{
        list->first = NULL;
}

/**
 * llist_entry - get the struct of this entry
 * @ptr:        the &struct llist_node pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the llist_node within the struct.
 *
 * Expands directly to container_of() with the same three arguments.
 */
#define llist_entry(ptr, type, member)                \
        container_of(ptr, type, member)

/**
 * member_address_is_nonnull - check whether the member address is not NULL
 * @ptr:        the object pointer (struct type * that contains the llist_node)
 * @member:        the name of the llist_node within the struct.
 *
 * This macro is conceptually the same as
 *        &ptr->member != NULL
 * but it works around the fact that compilers can decide that taking a member
 * address is never a NULL pointer.
 *
 * Real objects that start at a high address and have a member at NULL are
 * unlikely to exist, but such pointers may be returned e.g. by the
 * container_of() macro.
 *
 * The member address is computed with integer arithmetic, as
 * (uintptr_t)ptr + offsetof(), rather than with the & operator.
 */
#define member_address_is_nonnull(ptr, member)        \
        ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0)

/**
 * llist_for_each - iterate over some deleted entries of a lock-less list
 * @pos:        the &struct llist_node to use as a loop cursor
 * @node:        the first entry of deleted list entries
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being deleted from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The loop ends when @pos becomes NULL, so @pos is NULL after a complete
 * traversal.  Every use of @pos in the expansion is parenthesized so that
 * an arbitrary lvalue expression can be passed as the cursor.
 */
#define llist_for_each(pos, node)                        \
        for ((pos) = (node); (pos); (pos) = (pos)->next)

/**
 * llist_for_each_safe - walk deleted entries of a lock-less list while
 *                         tolerating removal of the current entry
 * @pos:        &struct llist_node cursor holding the current entry
 * @n:                second &struct llist_node used to remember the next entry
 * @node:        first entry of the deleted entries to walk
 *
 * Entries of a lock-less list can, in general, only be traversed safely
 * once they have been deleted from the list, which is why the walk starts
 * from an entry rather than from the list head.
 *
 * Entries taken straight from a lock-less list are visited newest first.
 * To visit them oldest first, reverse the chain yourself before walking it.
 *
 * The successor is stored in @n before the loop body runs, so the body may
 * free or relink @pos.
 */
#define llist_for_each_safe(pos, n, node)                        \
        for ((pos) = (node);                                        \
             (pos) && ((n) = (pos)->next, true);                \
             (pos) = (n))

/**
 * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
 * @pos:        the type * to use as a loop cursor.
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The loop terminates once the address of @pos->member is NULL, which is
 * tested with member_address_is_nonnull() rather than by comparing @pos.
 */
#define llist_for_each_entry(pos, node, member)                                \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);        \
             member_address_is_nonnull(pos, member);                        \
             (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))

/**
 * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type
 *                               safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The entry following @pos is stored in @n before the loop body runs, so
 * the body may free or relink @pos.  @pos and @n are parenthesized at every
 * use so that lvalue expressions such as *pp can be passed as cursors.
 */
#define llist_for_each_entry_safe(pos, n, node, member)                               \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);               \
             member_address_is_nonnull(pos, member) &&                               \
                ((n) = llist_entry((pos)->member.next, typeof(*(n)), member), true); \
             (pos) = (n))

/**
 * llist_empty - tests whether a lock-less list is empty
 * @head:        the list to test
 *
 * Not guaranteed to be accurate or up to date.  Just a quick way to
 * test whether the list is empty without deleting something from the
 * list.
 *
 * Return: true if @head->first, read once with READ_ONCE(), is NULL.
 */
static inline bool llist_empty(const struct llist_head *head)
{
        return READ_ONCE(head->first) == NULL;
}

/**
 * llist_next - get the entry following @node
 * @node:        the entry whose successor is wanted; must not be NULL because
 *                it is dereferenced unconditionally
 *
 * Return: @node->next, which is NULL when @node is the last entry of the
 * chain being walked.
 */
static inline struct llist_node *llist_next(struct llist_node *node)
{
        return node->next;
}

/*
 * llist_add_batch() is only declared in this header; its definition lives
 * elsewhere.  llist_add() calls it with @new_first == @new_last to add a
 * single entry and returns its result unchanged.
 * NOTE(review): presumably adds the chain @new_first..@new_last to @head and
 * returns true if the list was empty beforehand - confirm against the
 * definition.
 */
extern bool llist_add_batch(struct llist_node *new_first,
                            struct llist_node *new_last,
                            struct llist_head *head);
/**
 * llist_add - add a new entry
 * @new:        new entry to be added
 * @head:        the head for your lock-less list
 *
 * Implemented as llist_add_batch() with @new passed as both the first and
 * the last entry of the batch.
 *
 * Returns true if the list was empty prior to adding this entry.
 */
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
        return llist_add_batch(new, new, head);
}

/**
 * llist_del_all - delete all entries from lock-less list
 * @head:        the head of lock-less list to delete all entries
 *
 * If list is empty, return NULL, otherwise, delete all entries and
 * return the pointer to the first entry.  The order of entries
 * deleted is from the newest to the oldest added one.
 *
 * Implemented as a single xchg() that stores NULL into @head->first and
 * returns the value that was there before.
 */
static inline struct llist_node *llist_del_all(struct llist_head *head)
{
        return xchg(&head->first, NULL);
}

/*
 * Out-of-line helpers; only their prototypes are given in this header.
 * NOTE(review): going by the names, llist_del_first() presumably removes and
 * returns the first entry of @head, and llist_reverse_order() presumably
 * reverses a chain of already-deleted entries and returns its new first
 * entry - confirm against the definitions.
 */
extern struct llist_node *llist_del_first(struct llist_head *head);

struct llist_node *llist_reverse_order(struct llist_node *head);

#endif /* LLIST_H */



























































































































































































































































































































































    1 


    1 











    1 








































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
// SPDX-License-Identifier: GPL-2.0-or-later
/* Asymmetric public-key cryptography key type
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#include <keys/asymmetric-subtype.h>
#include <keys/asymmetric-parser.h>
#include <crypto/public_key.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <keys/system_keyring.h>
#include <keys/user-type.h>
#include "asymmetric_keys.h"

MODULE_LICENSE("GPL");

/*
 * Short human-readable labels for the key usage values.  The table is
 * indexed by the VERIFYING_* constants and has NR__KEY_BEING_USED_FOR slots;
 * it is exported for use by GPL modules.
 */
const char *const key_being_used_for[NR__KEY_BEING_USED_FOR] = {
        [VERIFYING_MODULE_SIGNATURE]                = "mod sig",
        [VERIFYING_FIRMWARE_SIGNATURE]                = "firmware sig",
        [VERIFYING_KEXEC_PE_SIGNATURE]                = "kexec PE sig",
        [VERIFYING_KEY_SIGNATURE]                = "key sig",
        [VERIFYING_KEY_SELF_SIGNATURE]                = "key self sig",
        [VERIFYING_UNSPECIFIED_SIGNATURE]        = "unspec sig",
};
EXPORT_SYMBOL_GPL(key_being_used_for);

/*
 * Registered asymmetric key parsers.  The list is walked with
 * asymmetric_key_parsers_sem held for reading (asymmetric_key_preparse) and
 * modified with it held for writing (parser registration/unregistration).
 */
static LIST_HEAD(asymmetric_key_parsers);
static DECLARE_RWSEM(asymmetric_key_parsers_sem);

/**
 * find_asymmetric_key - Find a key by ID.
 * @keyring: The keys to search.
 * @id_0: The first ID to look for or NULL.
 * @id_1: The second ID to look for or NULL.
 * @partial: Use partial match if true, exact if false.
 *
 * Search @keyring for an asymmetric key by identifier.  @id_0 is the
 * preferred identifier and @id_1 the fallback: the search string is built
 * from @id_0 when it is given and from @id_1 otherwise.  When both are
 * given, the key found through @id_0 must also carry a second ID equal to
 * @id_1.  At least one of the two must be non-NULL.
 *
 * Return: the key on success, otherwise an ERR_PTR(): -ENOMEM if the search
 * string cannot be allocated, -ENOKEY when the search fails with -EACCES,
 * -ENOTDIR or -EAGAIN, any other search error unchanged, or -EKEYREJECTED
 * when the second ID is missing or different.
 */
struct key *find_asymmetric_key(struct key *keyring,
                                const struct asymmetric_key_id *id_0,
                                const struct asymmetric_key_id *id_1,
                                bool partial)
{
        const struct asymmetric_key_id *primary;
        const struct asymmetric_key_ids *kids;
        struct key *key;
        key_ref_t ref;
        const char *lookup;
        char *req, *p;
        int len;

        BUG_ON(!id_0 && !id_1);

        /* The first ID drives the search whenever it is present. */
        primary = id_0 ? id_0 : id_1;
        lookup = primary->data;
        len = primary->len;

        /* Build "id:<keyid>" (partial) or "ex:<keyid>" (exact), in hex. */
        req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL);
        if (!req)
                return ERR_PTR(-ENOMEM);

        p = req;
        *p++ = partial ? 'i' : 'e';
        *p++ = partial ? 'd' : 'x';
        *p++ = ':';
        p = bin2hex(p, lookup, len);
        *p = 0;

        pr_debug("Look up: \"%s\"\n", req);

        ref = keyring_search(make_key_ref(keyring, 1),
                             &key_type_asymmetric, req, true);
        if (IS_ERR(ref)) {
                long err = PTR_ERR(ref);

                pr_debug("Request for key '%s' err %ld\n", req, err);
                kfree(req);

                /* Hide some search errors */
                if (err == -EACCES || err == -ENOTDIR || err == -EAGAIN)
                        return ERR_PTR(-ENOKEY);
                return ERR_CAST(ref);
        }
        kfree(req);

        key = key_ref_to_ptr(ref);
        if (id_0 && id_1) {
                /* Found via the first ID; the second one has to agree too. */
                kids = asymmetric_key_ids(key);

                if (!kids->id[1]) {
                        pr_debug("First ID matches, but second is missing\n");
                        goto reject;
                }
                if (!asymmetric_key_id_same(id_1, kids->id[1])) {
                        pr_debug("First ID matches, but second does not\n");
                        goto reject;
                }
        }

        pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key));
        return key;

reject:
        /* Drop the reference obtained by the search. */
        key_put(key);
        return ERR_PTR(-EKEYREJECTED);
}
EXPORT_SYMBOL_GPL(find_asymmetric_key);

/**
 * asymmetric_key_generate_id - Construct an asymmetric key ID
 * @val_1: First binary blob
 * @len_1: Length of first binary blob
 * @val_2: Second binary blob
 * @len_2: Length of second binary blob
 *
 * Build an asymmetric key ID whose data is @val_1 immediately followed by
 * @val_2.
 *
 * Return: the newly allocated ID, ERR_PTR(-EOVERFLOW) if the combined size
 * does not fit in a size_t, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1,
                                                     size_t len_1,
                                                     const void *val_2,
                                                     size_t len_2)
{
        struct asymmetric_key_id *kid;
        size_t payload_len;
        size_t alloc_sz;

        /* Both the payload length and the total allocation must not wrap. */
        if (check_add_overflow(len_1, len_2, &payload_len) ||
            check_add_overflow(sizeof(struct asymmetric_key_id), payload_len,
                               &alloc_sz))
                return ERR_PTR(-EOVERFLOW);

        kid = kmalloc(alloc_sz, GFP_KERNEL);
        if (!kid)
                return ERR_PTR(-ENOMEM);

        memcpy(kid->data, val_1, len_1);
        memcpy(kid->data + len_1, val_2, len_2);
        kid->len = payload_len;
        return kid;
}
EXPORT_SYMBOL_GPL(asymmetric_key_generate_id);

/**
 * asymmetric_key_id_same - Return true if two asymmetric keys IDs are the same.
 * @kid1: The first key ID to compare (may be NULL)
 * @kid2: The second key ID to compare (may be NULL)
 *
 * Two IDs are the same when both are non-NULL, have equal length and hold
 * identical bytes.  A NULL ID never matches anything.
 */
bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1,
                            const struct asymmetric_key_id *kid2)
{
        if (!kid1 || !kid2 || kid1->len != kid2->len)
                return false;

        return !memcmp(kid1->data, kid2->data, kid1->len);
}
EXPORT_SYMBOL_GPL(asymmetric_key_id_same);

/**
 * asymmetric_key_id_partial - Return true if two asymmetric keys IDs
 * partially match
 * @kid1: The full key ID (may be NULL)
 * @kid2: The fragment to look for at the end of @kid1 (may be NULL)
 *
 * The IDs partially match when both are non-NULL, @kid2 is no longer than
 * @kid1, and the trailing @kid2->len bytes of @kid1 equal @kid2's data.
 */
bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1,
                               const struct asymmetric_key_id *kid2)
{
        if (!kid1 || !kid2 || kid1->len < kid2->len)
                return false;

        /* Compare @kid2 against the tail of @kid1. */
        return !memcmp(kid1->data + (kid1->len - kid2->len),
                       kid2->data, kid2->len);
}
EXPORT_SYMBOL_GPL(asymmetric_key_id_partial);

/**
 * asymmetric_match_key_ids - Search asymmetric key IDs
 * @kids: The list of key IDs to check
 * @match_id: The key ID we're looking for
 * @match: The match function to use
 *
 * Calls @match(id, @match_id) for each slot of @kids->id in order and stops
 * at the first hit.
 *
 * Return: true if some slot matched; false if none did or if @kids or
 * @match_id is NULL.
 */
static bool asymmetric_match_key_ids(
        const struct asymmetric_key_ids *kids,
        const struct asymmetric_key_id *match_id,
        bool (*match)(const struct asymmetric_key_id *kid1,
                      const struct asymmetric_key_id *kid2))
{
        int slot;

        if (kids && match_id) {
                for (slot = 0; slot < ARRAY_SIZE(kids->id); slot++) {
                        if (match(kids->id[slot], match_id))
                                return true;
                }
        }

        return false;
}

/*
 * helper function can be called directly with pre-allocated memory
 *
 * Stores @hexlen in @match_id->len and has hex2bin() decode @id into
 * @match_id->data, returning hex2bin()'s result.  The caller in this file
 * passes half the length of the hex string as @hexlen, sizes the allocation
 * to hold that many data bytes, and treats a negative return as failure.
 */
inline int __asymmetric_key_hex_to_key_id(const char *id,
                                   struct asymmetric_key_id *match_id,
                                   size_t hexlen)
{
        match_id->len = hexlen;
        return hex2bin(match_id->data, id, hexlen);
}

/**
 * asymmetric_key_hex_to_key_id - Convert a hex string into a key ID.
 * @id: The ID as a hex string.
 *
 * Return: a newly allocated key ID holding the decoded bytes,
 * ERR_PTR(-EINVAL) if @id is empty, has an odd number of characters or
 * fails to decode, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id)
{
        struct asymmetric_key_id *match_id;
        size_t asciihexlen;
        size_t binlen;

        if (!*id)
                return ERR_PTR(-EINVAL);

        /* Each byte takes two hex digits, so the length must be even. */
        asciihexlen = strlen(id);
        if (asciihexlen & 1)
                return ERR_PTR(-EINVAL);
        binlen = asciihexlen / 2;

        match_id = kmalloc(sizeof(struct asymmetric_key_id) + binlen,
                           GFP_KERNEL);
        if (!match_id)
                return ERR_PTR(-ENOMEM);

        if (__asymmetric_key_hex_to_key_id(id, match_id, binlen) < 0) {
                kfree(match_id);
                return ERR_PTR(-EINVAL);
        }

        return match_id;
}

/*
 * Match asymmetric keys by an exact match on an ID.
 */
static bool asymmetric_key_cmp(const struct key *key,
                               const struct key_match_data *match_data)
{
        const struct asymmetric_key_ids *kids = asymmetric_key_ids(key);
        const struct asymmetric_key_id *match_id = match_data->preparsed;

        return asymmetric_match_key_ids(kids, match_id,
                                        asymmetric_key_id_same);
}

/*
 * Partial-match comparator: true if any of the key's IDs ends with the
 * preparsed ID stored in the match data.
 */
static bool asymmetric_key_cmp_partial(const struct key *key,
                                       const struct key_match_data *match_data)
{
        return asymmetric_match_key_ids(asymmetric_key_ids(key),
                                        match_data->preparsed,
                                        asymmetric_key_id_partial);
}

/*
 * Preparse the match criterion.  Leaving lookup_type and cmp untouched
 * selects the default behaviour, an exact match on the key description.
 *
 * Two prefixes request matching on key IDs instead of the description:
 *
 *        "id:<id>" - find a key by partial match on any available ID
 *        "ex:<id>" - find a key by exact match on any available ID
 *
 * Such searches are iterative rather than direct lookups because the key
 * is hashed according to its description.
 *
 * Returns 0 on success, -EINVAL for a missing or empty criterion, or the
 * error from converting the hex ID.
 */
static int asymmetric_key_match_preparse(struct key_match_data *match_data)
{
        bool (*cmp)(const struct key *, const struct key_match_data *);
        const char *spec = match_data->raw_data;
        struct asymmetric_key_id *match_id;

        if (!spec || !*spec)
                return -EINVAL;

        if (strncmp(spec, "id:", 3) == 0)
                cmp = asymmetric_key_cmp_partial;
        else if (strncmp(spec, "ex:", 3) == 0)
                cmp = asymmetric_key_cmp;
        else
                return 0;        /* no prefix: keep the default description match */

        /* The hex ID starts right after the three-character prefix. */
        match_id = asymmetric_key_hex_to_key_id(spec + 3);
        if (IS_ERR(match_id))
                return PTR_ERR(match_id);

        match_data->preparsed = match_id;
        match_data->cmp = cmp;
        match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE;
        return 0;
}

/*
 * Free the preparsed match criterion, i.e. the key ID that
 * asymmetric_key_match_preparse() may have stored in ->preparsed.
 */
static void asymmetric_key_match_free(struct key_match_data *match_data)
{
        kfree(match_data->preparsed);
}

/*
 * Describe the asymmetric key.
 *
 * Always emits the key description.  When the key has a subtype this is
 * followed by ": ", the subtype's own description, the last (up to) four
 * bytes of the key's second ID in hex if there is one, and finally " []".
 */
static void asymmetric_key_describe(const struct key *key, struct seq_file *m)
{
        const struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key);
        const struct asymmetric_key_ids *kids = asymmetric_key_ids(key);
        const struct asymmetric_key_id *kid;
        const unsigned char *p;
        int n;

        seq_puts(m, key->description);

        if (!subtype)
                return;

        seq_puts(m, ": ");
        subtype->describe(key, m);

        kid = kids ? kids->id[1] : NULL;
        if (kid) {
                seq_putc(m, ' ');
                n = kid->len;
                p = kid->data;
                /* Only the trailing four bytes of a longer ID are shown. */
                if (n > 4) {
                        p += n - 4;
                        n = 4;
                }
                seq_printf(m, "%*phN", n, p);
        }

        seq_puts(m, " [");
        /* put something here to indicate the key's capabilities */
        seq_putc(m, ']');
}

/*
 * Preparse an asymmetric key payload by offering it to each registered
 * parser in turn, under the read side of the parser lock.  A parser that
 * returns -EBADMSG is taken not to recognise the format and the next one is
 * tried; any other result ends the search and is returned.
 *
 * Returns -EINVAL for an empty payload and -EBADMSG if no parser accepted
 * the data (or none is registered).
 */
static int asymmetric_key_preparse(struct key_preparsed_payload *prep)
{
        struct asymmetric_key_parser *parser;
        int ret = -EBADMSG;

        pr_devel("==>%s()\n", __func__);

        if (!prep->datalen)
                return -EINVAL;

        down_read(&asymmetric_key_parsers_sem);

        list_for_each_entry(parser, &asymmetric_key_parsers, link) {
                pr_debug("Trying parser '%s'\n", parser->name);

                ret = parser->parse(prep);
                if (ret == -EBADMSG)
                        continue;

                pr_debug("Parser recognised the format (ret %d)\n",
                         ret);
                break;
        }

        up_read(&asymmetric_key_parsers_sem);
        pr_devel("<==%s() = %d\n", __func__, ret);
        return ret;
}

/*
 * Release a key ID list: every ID it holds, then the list itself.
 * A NULL list is accepted and ignored.
 */
static void asymmetric_key_free_kids(struct asymmetric_key_ids *kids)
{
        int slot;

        if (!kids)
                return;

        for (slot = 0; slot < ARRAY_SIZE(kids->id); slot++)
                kfree(kids->id[slot]);
        kfree(kids);
}

/*
 * Clean up the preparse data
 *
 * If a subtype was recorded in the payload, its destroy() op is called on
 * the crypto and auth payload slots and the reference on the subtype's
 * owning module is dropped.  The key ID list and the proposed description
 * are then freed.
 */
static void asymmetric_key_free_preparse(struct key_preparsed_payload *prep)
{
        struct asymmetric_key_subtype *subtype = prep->payload.data[asym_subtype];
        struct asymmetric_key_ids *kids = prep->payload.data[asym_key_ids];

        pr_devel("==>%s()\n", __func__);

        if (subtype) {
                subtype->destroy(prep->payload.data[asym_crypto],
                                 prep->payload.data[asym_auth]);
                module_put(subtype->owner);
        }
        asymmetric_key_free_kids(kids);
        kfree(prep->description);
}

/*
 * dispose of the data dangling from the corpse of a asymmetric key
 *
 * The payload pointers are copied into locals and the key's payload slots
 * are cleared before anything is released.  The subtype's destroy() op (if
 * the key has a subtype) then gets the saved crypto and auth data, the
 * reference on the subtype's owning module is dropped, and the key ID list
 * is freed.
 */
static void asymmetric_key_destroy(struct key *key)
{
        struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key);
        struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids];
        void *data = key->payload.data[asym_crypto];
        void *auth = key->payload.data[asym_auth];

        /* Detach everything from the key first. */
        key->payload.data[asym_crypto] = NULL;
        key->payload.data[asym_subtype] = NULL;
        key->payload.data[asym_key_ids] = NULL;
        key->payload.data[asym_auth] = NULL;

        if (subtype) {
                subtype->destroy(data, auth);
                module_put(subtype->owner);
        }

        asymmetric_key_free_kids(kids);
}

/*
 * Allocate a zeroed key_restriction for asymmetric keys and fill in the
 * check function and the (possibly NULL) key it refers to.
 *
 * Returns the new restriction or ERR_PTR(-ENOMEM).
 */
static struct key_restriction *asymmetric_restriction_alloc(
        key_restrict_link_func_t check,
        struct key *key)
{
        struct key_restriction *keyres;

        keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL);
        if (!keyres)
                return ERR_PTR(-ENOMEM);

        keyres->keytype = &key_type_asymmetric;
        keyres->key = key;
        keyres->check = check;

        return keyres;
}

/*
 * look up keyring restrict functions for asymmetric keys
 *
 * Recognised restriction strings:
 *
 *        "builtin_trusted"
 *        "builtin_and_secondary_trusted"
 *        "key_or_keyring:<serial>"
 *        "key_or_keyring:<serial>:chain"
 *
 * <serial> is parsed with kstrtos32() (base auto-detected) and looked up as
 * a key; with the ":chain" suffix a serial of 0 is accepted and means "no
 * key".  At most PAGE_SIZE bytes of the restriction string are examined.
 *
 * Returns the new restriction, ERR_PTR(-EINVAL) for an unrecognised or
 * malformed string, ERR_PTR(-ENOMEM) on allocation failure, or the error
 * from the key lookup.
 */
static struct key_restriction *asymmetric_lookup_restriction(
        const char *restriction)
{
        struct key_restriction *ret = ERR_PTR(-EINVAL);
        key_restrict_link_func_t link_fn = restrict_link_by_key_or_keyring;
        bool allow_null_key = false;
        struct key *key = NULL;
        key_serial_t serial;
        char *restrict_method;
        char *parse_buf;
        char *key_text;
        char *next;

        if (strcmp("builtin_trusted", restriction) == 0)
                return asymmetric_restriction_alloc(
                        restrict_link_by_builtin_trusted, NULL);

        if (strcmp("builtin_and_secondary_trusted", restriction) == 0)
                return asymmetric_restriction_alloc(
                        restrict_link_by_builtin_and_secondary_trusted, NULL);

        /* Work on a private copy because strsep() modifies the string. */
        parse_buf = kstrndup(restriction, PAGE_SIZE, GFP_KERNEL);
        if (!parse_buf)
                return ERR_PTR(-ENOMEM);

        next = parse_buf;
        restrict_method = strsep(&next, ":");

        /* Only "key_or_keyring:" followed by something is handled below. */
        if (strcmp(restrict_method, "key_or_keyring") != 0 || !next)
                goto out;

        key_text = strsep(&next, ":");

        if (next) {
                /* The only permitted trailing option is "chain". */
                if (strcmp(next, "chain") != 0)
                        goto out;

                link_fn = restrict_link_by_key_or_keyring_chain;
                allow_null_key = true;
        }

        if (kstrtos32(key_text, 0, &serial) < 0)
                goto out;

        /* Serial 0 with "chain" means no key; anything else is looked up. */
        if (serial != 0 || !allow_null_key) {
                key = key_lookup(serial);
                if (IS_ERR(key)) {
                        ret = ERR_CAST(key);
                        goto out;
                }
        }

        ret = asymmetric_restriction_alloc(link_fn, key);
        if (IS_ERR(ret))
                key_put(key);

out:
        kfree(parse_buf);
        return ret;
}

/*
 * Hand an operation described by @params over to the eds_op of the
 * subtype of @params->key.
 *
 * Returns -EINVAL if the key is not an asymmetric key, has no subtype or
 * has nothing in payload slot 0, -ENOTSUPP if the subtype provides no
 * eds_op, and otherwise whatever the subtype's eds_op returns.
 */
int asymmetric_key_eds_op(struct kernel_pkey_params *params,
                          const void *in, void *out)
{
        const struct asymmetric_key_subtype *subtype;
        struct key *key = params->key;
        int ret;

        pr_devel("==>%s()\n", __func__);

        if (key->type != &key_type_asymmetric)
                return -EINVAL;

        subtype = asymmetric_key_subtype(key);
        if (!subtype)
                return -EINVAL;
        if (!key->payload.data[0])
                return -EINVAL;

        if (!subtype->eds_op)
                return -ENOTSUPP;

        ret = subtype->eds_op(params, in, out);

        pr_devel("<==%s() = %d\n", __func__, ret);
        return ret;
}

/*
 * Adapter for the key type's asym_verify_signature op: packs the
 * parameters into a struct public_key_signature and passes it to
 * verify_signature() together with @params->key.
 *
 * @in is used as the digest (length @params->in_len) and @in2 as the
 * signature (length @params->in2_len); both have their const qualifier
 * cast away to fit the structure's fields.
 */
static int asymmetric_key_verify_signature(struct kernel_pkey_params *params,
                                           const void *in, const void *in2)
{
        struct public_key_signature sig = {
                .s_size                = params->in2_len,
                .digest_size        = params->in_len,
                .encoding        = params->encoding,
                .hash_algo        = params->hash_algo,
                .digest                = (void *)in,
                .s                = (void *)in2,
        };

        return verify_signature(params->key, &sig);
}

/*
 * The "asymmetric" key type.  Most ops are the functions defined in this
 * file; instantiation uses generic_key_instantiate and queries go to
 * query_asymmetric_key.  The type is registered at module init and
 * exported for use by GPL modules.
 */
struct key_type key_type_asymmetric = {
        .name                        = "asymmetric",
        .preparse                = asymmetric_key_preparse,
        .free_preparse                = asymmetric_key_free_preparse,
        .instantiate                = generic_key_instantiate,
        .match_preparse                = asymmetric_key_match_preparse,
        .match_free                = asymmetric_key_match_free,
        .destroy                = asymmetric_key_destroy,
        .describe                = asymmetric_key_describe,
        .lookup_restriction        = asymmetric_lookup_restriction,
        .asym_query                = query_asymmetric_key,
        .asym_eds_op                = asymmetric_key_eds_op,
        .asym_verify_signature        = asymmetric_key_verify_signature,
};
EXPORT_SYMBOL_GPL(key_type_asymmetric);

/**
 * register_asymmetric_key_parser - Register a asymmetric key blob parser
 * @parser: The parser to register
 *
 * Appends @parser to the end of the parser list under the write side of
 * the parser lock, unless a parser with the same name is already present.
 *
 * Return: 0 on success or -EEXIST if the name is already registered.
 */
int register_asymmetric_key_parser(struct asymmetric_key_parser *parser)
{
        struct asymmetric_key_parser *cursor;
        int ret = 0;

        down_write(&asymmetric_key_parsers_sem);

        /* Parser names must be unique. */
        list_for_each_entry(cursor, &asymmetric_key_parsers, link) {
                if (strcmp(cursor->name, parser->name) != 0)
                        continue;

                pr_err("Asymmetric key parser '%s' already registered\n",
                       parser->name);
                ret = -EEXIST;
                goto out;
        }

        list_add_tail(&parser->link, &asymmetric_key_parsers);

        pr_notice("Asymmetric key parser '%s' registered\n", parser->name);

out:
        up_write(&asymmetric_key_parsers_sem);
        return ret;
}
EXPORT_SYMBOL_GPL(register_asymmetric_key_parser);

/**
 * unregister_asymmetric_key_parser - Unregister a asymmetric key blob parser
 * @parser: The parser to unregister
 *
 * Unlinks @parser from the parser list while holding the parser lock for
 * writing, then logs the removal.  No check is made that @parser is
 * actually on the list.
 */
void unregister_asymmetric_key_parser(struct asymmetric_key_parser *parser)
{
        down_write(&asymmetric_key_parsers_sem);
        list_del(&parser->link);
        up_write(&asymmetric_key_parsers_sem);

        pr_notice("Asymmetric key parser '%s' unregistered\n", parser->name);
}
EXPORT_SYMBOL_GPL(unregister_asymmetric_key_parser);

/*
 * Module stuff
 *
 * Module init: register the asymmetric key type and return the result of
 * register_key_type().
 */
static int __init asymmetric_key_init(void)
{
        return register_key_type(&key_type_asymmetric);
}

/* Module exit: unregister the asymmetric key type. */
static void __exit asymmetric_key_cleanup(void)
{
        unregister_key_type(&key_type_asymmetric);
}

/* Hook the init and cleanup functions into module load and unload. */
module_init(asymmetric_key_init);
module_exit(asymmetric_key_cleanup);


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Definitions for key type implementations
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_KEY_TYPE_H
#define _LINUX_KEY_TYPE_H

#include <linux/key.h>
#include <linux/errno.h>

#ifdef CONFIG_KEYS

struct kernel_pkey_query;
struct kernel_pkey_params;

/*
 * Pre-parsed payload, used by key add, update and instantiate.
 *
 * This struct will be cleared and data and datalen will be set with the data
 * and length parameters from the caller and quotalen will be set from
 * def_datalen from the key type.  Then if the preparse() op is provided by the
 * key type, that will be called.  Then the struct will be passed to the
 * instantiate() or the update() op.
 *
 * If the preparse() op is given, the free_preparse() op will be called to
 * clear the contents.
 *
 * NOTE(review): the __randomize_layout annotation presumably lets the build
 * reorder the fields, so code should not depend on their order - confirm.
 */
struct key_preparsed_payload {
        char                *description;        /* Proposed key description (or NULL) */
        union key_payload payload;        /* Proposed payload */
        const void        *data;                /* Raw data */
        size_t                datalen;        /* Raw datalen */
        size_t                quotalen;        /* Quota length for proposed payload */
        time64_t        expiry;                /* Expiry time of key */
} __randomize_layout;

/*
 * Handler type stored in key_type::request_key.  It is passed the
 * authorisation key and an opaque auxiliary pointer and returns an int.
 */
typedef int (*request_key_actor_t)(struct key *auth_key, void *aux);

/*
 * Preparsed matching criterion.
 *
 * Bundles the comparison callback, the raw match data, any state stashed by
 * ->match_preparse() and the lookup type to use for the search.
 */
struct key_match_data {
        /* Comparison function, defaults to exact description match, but can be
         * overridden by type->match_preparse().  Should return true if a match
         * is found and false if not.
         */
        bool (*cmp)(const struct key *key,
                    const struct key_match_data *match_data);

        const void        *raw_data;        /* Raw match data */
        void                *preparsed;        /* For ->match_preparse() to stash stuff */
        unsigned        lookup_type;        /* Type of lookup for this search. */
#define KEYRING_SEARCH_LOOKUP_DIRECT        0x0000        /* Direct lookup by description. */
#define KEYRING_SEARCH_LOOKUP_ITERATE        0x0001        /* Iterative search. */
};

/*
 * kernel managed key type definition
 *
 * A table of operations plus a few attributes describing one kind of key.
 * Ops marked "(optional)" below may be left unset.
 */
struct key_type {
        /* name of the type */
        const char *name;

        /* default payload length for quota precalculation (optional)
         * - this can be used instead of calling key_payload_reserve(), that
         *   function only needs to be called if the real datalen is different
         */
        size_t def_datalen;

        /* KEY_TYPE_* flag bits, defined immediately below */
        unsigned int flags;
#define KEY_TYPE_NET_DOMAIN        0x00000001 /* Keys of this type have a net namespace domain */
#define KEY_TYPE_INSTANT_REAP        0x00000002 /* Keys of this type don't have a delay after expiring */

        /* vet a description */
        int (*vet_description)(const char *description);

        /* Preparse the data blob from userspace that is to be the payload,
         * generating a proposed description and payload that will be handed to
         * the instantiate() and update() ops.
         */
        int (*preparse)(struct key_preparsed_payload *prep);

        /* Free a preparse data structure.
         */
        void (*free_preparse)(struct key_preparsed_payload *prep);

        /* instantiate a key of this type
         * - this method should call key_payload_reserve() to determine if the
         *   user's quota will hold the payload
         */
        int (*instantiate)(struct key *key, struct key_preparsed_payload *prep);

        /* update a key of this type (optional)
         * - this method should call key_payload_reserve() to recalculate the
         *   quota consumption
         * - the key must be locked against read when modifying
         */
        int (*update)(struct key *key, struct key_preparsed_payload *prep);

        /* Preparse the data supplied to ->match() (optional).  The
         * data to be preparsed can be found in match_data->raw_data.
         * The lookup type can also be set by this function.
         */
        int (*match_preparse)(struct key_match_data *match_data);

        /* Free preparsed match data (optional).  This should be supplied if
         * ->match_preparse() is supplied. */
        void (*match_free)(struct key_match_data *match_data);

        /* clear some of the data from a key on revocation (optional)
         * - the key's semaphore will be write-locked by the caller
         */
        void (*revoke)(struct key *key);

        /* clear the data from a key (optional) */
        void (*destroy)(struct key *key);

        /* describe a key */
        void (*describe)(const struct key *key, struct seq_file *p);

        /* read a key's data (optional)
         * - permission checks will be done by the caller
         * - the key's semaphore will be readlocked by the caller
         * - should return the amount of data that could be read, no matter how
         *   much is copied into the buffer
         * - shouldn't do the copy if the buffer is NULL
         */
        long (*read)(const struct key *key, char *buffer, size_t buflen);

        /* handle request_key() for this type instead of invoking
         * /sbin/request-key (optional)
         * - key is the key to instantiate
         * - authkey is the authority to assume when instantiating this key
         * - op is the operation to be done, usually "create"
         * - the call must not return until the instantiation process has run
         *   its course
         *
         * NOTE(review): request_key_actor_t takes only (auth_key, aux); the
         * "key" and "op" bullets above do not match that signature - confirm
         * and refresh.
         */
        request_key_actor_t request_key;

        /* Look up a keyring access restriction (optional)
         *
         * - NULL is a valid return value (meaning the requested restriction
         *   is known but will never block addition of a key)
         * - should return -EINVAL if the restriction is unknown
         */
        struct key_restriction *(*lookup_restriction)(const char *params);

        /* Asymmetric key accessor functions. */
        int (*asym_query)(const struct kernel_pkey_params *params,
                          struct kernel_pkey_query *info);
        int (*asym_eds_op)(struct kernel_pkey_params *params,
                           const void *in, void *out);
        int (*asym_verify_signature)(struct kernel_pkey_params *params,
                                     const void *in, const void *in2);

        /* internal fields */
        struct list_head        link;                /* link in types list */
        struct lock_class_key        lock_class;        /* key->sem lock class */
} __randomize_layout;

/* Key type object declared here and defined elsewhere. */
extern struct key_type key_type_keyring;

/* Add a key type definition to, or remove it from, the set of known types. */
extern int register_key_type(struct key_type *ktype);
extern void unregister_key_type(struct key_type *ktype);

/*
 * key_payload_reserve() is what the instantiate() and update() ops are
 * expected to call to account @datalen against the user's quota.
 */
extern int key_payload_reserve(struct key *key, size_t datalen);
extern int key_instantiate_and_link(struct key *key,
                                    const void *data,
                                    size_t datalen,
                                    struct key *keyring,
                                    struct key *authkey);
/* Takes an explicit @error; key_negate_and_link() passes ENOKEY here. */
extern int key_reject_and_link(struct key *key,
                               unsigned timeout,
                               unsigned error,
                               struct key *keyring,
                               struct key *authkey);
extern void complete_request_key(struct key *authkey, int error);

/*
 * Shorthand for key_reject_and_link() with ENOKEY as the error.  All other
 * arguments are forwarded unchanged and the callee's return value is
 * returned as-is.
 */
static inline int key_negate_and_link(struct key *key,
                                      unsigned timeout,
                                      struct key *keyring,
                                      struct key *authkey)
{
        const unsigned error = ENOKEY;

        return key_reject_and_link(key, timeout, error, keyring, authkey);
}

/* Instantiation helper with the same signature as key_type::instantiate. */
extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep);

#endif /* CONFIG_KEYS */
#endif /* _LINUX_KEY_TYPE_H */








































































































































    1 










    1 









    3 




    3 


    3 



























    1 






    5 













































































    5 



    7 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *        int old = atomic_fetch_add_relaxed(1, &r->refs);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0)
 *                atomic_set(r, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire, for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/spinlock_types.h>

struct mutex;

/**
 * typedef refcount_t - variant of atomic_t specialized for reference counts
 * @refs: atomic_t counter field
 *
 * The counter saturates at REFCOUNT_SATURATED and will not move once
 * there. This avoids wrapping the counter and causing 'spurious'
 * use-after-free bugs.
 */
typedef struct refcount_struct {
        atomic_t refs;
} refcount_t;

/* Static initializer for a refcount_t whose counter starts at @n. */
#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
/* Largest counter value that is still non-negative when viewed as an int. */
#define REFCOUNT_MAX                INT_MAX
/* Saturation value: INT_MIN / 2, i.e. 0xc000_0000 in the unsigned view. */
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/*
 * Reason codes handed to refcount_warn_saturate() by the inline helpers in
 * this header.
 */
enum refcount_saturation_type {
        /* __refcount_add_not_zero(): old or new value is negative */
        REFCOUNT_ADD_NOT_ZERO_OVF,
        /* __refcount_add(): old or new value is negative */
        REFCOUNT_ADD_OVF,
        /* __refcount_add(): the old value was 0 */
        REFCOUNT_ADD_UAF,
        /* __refcount_sub_and_test(): old or resulting value is negative */
        REFCOUNT_SUB_UAF,
        /* __refcount_dec(): the old value was <= 1 */
        REFCOUNT_DEC_LEAK,
};

/*
 * Out-of-line slow path called when one of the checks above trips.
 * NOTE(review): presumably pins the counter at REFCOUNT_SATURATED and WARNs,
 * as the kerneldoc in this header describes - confirm against the definition.
 */
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - set a refcount's value
 * @r: the refcount
 * @n: value to which the refcount will be set
 *
 * Plain atomic_set() on the underlying counter; @n is not checked here.
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_t *counter = &r->refs;

        atomic_set(counter, n);
}

/**
 * refcount_read - get a refcount's value
 * @r: the refcount
 *
 * Return: the refcount's value
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        return atomic_read(&r->refs);
}

/*
 * Add @i to @r unless the counter is 0.  The value observed before the
 * addition (0 when nothing was added) is stored in *@oldp if @oldp is
 * non-NULL.  Reports REFCOUNT_ADD_NOT_ZERO_OVF when the old or new value is
 * negative.  Returns true unless the observed value was 0.
 */
static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int prev = refcount_read(r);

        /*
         * Keep trying the relaxed cmpxchg until it succeeds or a zero count
         * is observed.  prev is passed by pointer so that a failed attempt
         * can refresh it.
         */
        while (prev) {
                if (atomic_try_cmpxchg_relaxed(&r->refs, &prev, prev + i))
                        break;
        }

        if (oldp)
                *oldp = prev;

        if (unlikely(prev < 0 || prev + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        return prev;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Saturates at REFCOUNT_SATURATED and WARNs.
 *
 * No memory ordering is provided; the caller is assumed to have guaranteed
 * that the object memory is stable (RCU, etc.). A control dependency is
 * provided, which orders future stores. See the comment on top.
 *
 * This function is not recommended for the normal reference counting use
 * case in which references are taken and released one at a time.  There,
 * refcount_inc(), or one of its variants, should be used to increment a
 * reference count instead.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        const bool added = __refcount_add_not_zero(i, r, NULL);

        return added;
}

/*
 * Unconditionally add @i to @r with a relaxed fetch-add.  The previous value
 * is stored in *@oldp if @oldp is non-NULL.  A previous value of 0 is
 * reported as REFCOUNT_ADD_UAF; a negative previous or resulting value is
 * reported as REFCOUNT_ADD_OVF.
 */
static inline void __refcount_add(int i, refcount_t *r, int *oldp)
{
        const int prev = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = prev;

        if (unlikely(prev == 0)) {
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
                return;
        }

        if (unlikely(prev < 0 || prev + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 * Forwards to __refcount_add() without asking for the old value.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 */
static inline void refcount_add(int i, refcount_t *r)
{
        __refcount_add(i, r, NULL);
}

/*
 * Increment @r by one unless it is 0: __refcount_add_not_zero() with an
 * increment of 1.  The observed old value is stored in *@oldp if @oldp is
 * non-NULL.
 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        const bool taken = __refcount_add_not_zero(1, r, oldp);

        return taken;
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Like atomic_inc_not_zero(), except that it saturates at
 * REFCOUNT_SATURATED and WARNs.
 *
 * No memory ordering is provided; the caller is assumed to have guaranteed
 * that the object memory is stable (RCU, etc.). A control dependency is
 * provided, which orders future stores. See the comment on top.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        const bool taken = __refcount_inc_not_zero(r, NULL);

        return taken;
}

/*
 * Increment @r by one via __refcount_add(); the previous value is stored in
 * *@oldp when @oldp is non-NULL.
 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        __refcount_add(1, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
 * Forwards to __refcount_inc() without asking for the old value.
 *
 * Provides no memory ordering, it is assumed the caller already has a
 * reference on the object.
 *
 * Will WARN if the refcount is 0, as this represents a possible use-after-free
 * condition.
 */
static inline void refcount_inc(refcount_t *r)
{
        __refcount_inc(r, NULL);
}

/*
 * Subtract @i from @r with release ordering and report whether that took the
 * counter to 0.  The previous value is stored in *@oldp if @oldp is non-NULL.
 * When the previous value equals @i the function issues
 * smp_acquire__after_ctrl_dep() and returns true.  Otherwise it returns
 * false, after reporting REFCOUNT_SUB_UAF if the previous or resulting value
 * is negative.
 */
static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        const int prev = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = prev;

        if (prev != i) {
                if (unlikely(prev < 0 || prev - i < 0))
                        refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
                return false;
        }

        /* This was the final reference: upgrade to acquire before freeing. */
        smp_acquire__after_ctrl_dep();
        return true;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that on underflow it WARNs, returns
 * false and ultimately leaks, and it fails to decrement once saturated at
 * REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before; on success acquire ordering is provided as well, so free() must
 * come after.
 *
 * This function is not recommended for the normal reference counting use
 * case in which references are taken and released one at a time.  There,
 * refcount_dec(), or one of its variants, should be used to decrement a
 * reference count instead.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        const bool hit_zero = __refcount_sub_and_test(i, r, NULL);

        return hit_zero;
}

/*
 * Drop one reference: __refcount_sub_and_test() with a decrement of 1.  The
 * previous value is stored in *@oldp if @oldp is non-NULL.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        const bool hit_zero = __refcount_sub_and_test(1, r, oldp);

        return hit_zero;
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Like atomic_dec_and_test(); it WARNs on underflow and fails to decrement
 * once saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before; on success acquire ordering is provided as well, so free() must
 * come after.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        const bool hit_zero = __refcount_dec_and_test(r, NULL);

        return hit_zero;
}

/*
 * Subtract one from @r with release ordering.  The previous value is stored
 * in *@oldp if @oldp is non-NULL.  A previous value of 1 or less is reported
 * as REFCOUNT_DEC_LEAK.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        const int prev = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = prev;

        if (unlikely(prev <= 1))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED.
 * Forwards to __refcount_dec() without asking for the old value.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before.
 */
static inline void refcount_dec(refcount_t *r)
{
        __refcount_dec(r, NULL);
}

/*
 * Out-of-line decrement variants; only declared here.  All return a bool
 * that the caller must check (__must_check).
 */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags);
#endif /* _LINUX_REFCOUNT_H */





























































    1 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 */

#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H

#include <linux/linkage.h>

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/cleanup.h>

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif

/*
 * For an uncontended rwsem, count and owner are the only fields a task
 * needs to touch when acquiring the rwsem. So they are put next to each
 * other to increase the chance that they will share the same cacheline.
 *
 * In a contended rwsem, the owner is likely the most frequently accessed
 * field in the structure as the optimistic waiter that holds the osq lock
 * will spin on owner. For an embedded rwsem, other hot fields in the
 * containing structure should be moved further away from the rwsem to
 * reduce the chance that they will share the same cacheline causing
 * cacheline bouncing problem.
 */
struct rw_semaphore {
        /* Lock state; a non-zero value means locked (see rwsem_is_locked()) */
        atomic_long_t count;
        /*
         * Write owner or one of the read owners as well flags regarding
         * the current state of the rwsem. Can be used as a speculative
         * check to see if the write owner is running on the cpu.
         */
        atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
        /* NOTE(review): presumably protects wait_list - confirm */
        raw_spinlock_t wait_lock;
        /* Non-empty when rwsem_is_contended() reports contention */
        struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
        /* Set to the semaphore's own address by __RWSEM_DEBUG_INIT() */
        void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep map, initialised by __RWSEM_DEP_MAP_INIT() */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Report whether @sem is currently held.  In all implementations a non-zero
 * count means locked.
 */
static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
        long count = atomic_long_read(&sem->count);

        return count != 0;
}

/* Value of ->count for an rwsem that nobody holds. */
#define RWSEM_UNLOCKED_VALUE                0L
/* Designated-initializer fragment for ->count; @name is not used. */
#define __RWSEM_COUNT_INIT(name)        .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)

/*
 * Common initializer macros and functions
 *
 * Each helper below expands to a designated-initializer fragment (with its
 * trailing comma) when the matching config option is enabled, and to nothing
 * otherwise.
 */

/* ->dep_map: named after the lock, with a sleeping inner wait type */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname)                        \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_SLEEP,        \
        },
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

/* ->magic: points back at the semaphore itself */
#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif

/* ->osq: starts unlocked */
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif

/*
 * Static initializer for the rw_semaphore variable called @name: unlocked
 * count, zero owner, unlocked wait_lock, empty wait_list, plus whichever
 * optional fields the config enables.
 */
#define __RWSEM_INITIALIZER(name)                                \
        { __RWSEM_COUNT_INIT(name),                                \
          .owner = ATOMIC_LONG_INIT(0),                                \
          __RWSEM_OPT_INIT(name)                                \
          .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
          .wait_list = LIST_HEAD_INIT((name).wait_list),        \
          __RWSEM_DEBUG_INIT(name)                                \
          __RWSEM_DEP_MAP_INIT(name) }

/* Define an rw_semaphore variable @name initialised with the above. */
#define DECLARE_RWSEM(name) \
        struct rw_semaphore name = __RWSEM_INITIALIZER(name)

/* Run-time initialiser; takes the lock's name and its lock class key. */
extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
                         struct lock_class_key *key);

/*
 * Initialise @sem at run time.  Each expansion declares its own static
 * lock_class_key and passes the stringified @sem expression as the name.
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/*
 * This is the same regardless of which rwsem implementation is being used.
 * It is just a heuristic meant to be called by somebody already holding the
 * rwsem to see if somebody of an incompatible type is wanting access to the
 * lock.
 *
 * Returns 1 when the wait list is non-empty, 0 otherwise.
 */
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        if (list_empty(&sem->wait_list))
                return 0;

        return 1;
}

/*
 * lock for reading
 *
 * The interruptible and killable variants return an int that the caller
 * must check (__must_check).
 */
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 *
 * The killable variant returns an int that the caller must check
 * (__must_check).
 */
extern void down_write(struct rw_semaphore *sem);
extern int __must_check down_write_killable(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

/*
 * Cleanup helpers pairing down_read()/up_read() and down_write()/up_write().
 * NOTE(review): DEFINE_GUARD/DEFINE_FREE presumably come from
 * <linux/cleanup.h>, included above - confirm their scope-exit semantics
 * there.  The DEFINE_FREE variants skip the unlock for a NULL pointer.
 */
DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))

DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T))
DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T))


/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, by enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);

/*
 * Write-lock @sem via _down_write_nest_lock(), passing the address of
 * @nest_lock's dep_map member.  typecheck() rejects at compile time any
 * @nest_lock whose dep_map is not a struct lockdep_map.
 *
 * The expansion deliberately has no trailing semicolon, so the macro acts
 * as a single statement: "if (x) down_write_nest_lock(a, b); else ..."
 * parses as intended, and it matches the plain down_write(sem) expansion
 * used when CONFIG_DEBUG_LOCK_ALLOC is off.
 */
# define down_write_nest_lock(sem, nest_lock)                        \
do {                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);        \
} while (0)

/*
 * Take/release a read lock when a task other than the one that took it
 * will release it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC the nested, nest_lock and non-owner
 * variants collapse to the plain operations; the subclass and nest_lock
 * arguments are dropped.
 */
# define down_read_nested(sem, subclass)                down_read(sem)
# define down_read_killable_nested(sem, subclass)        down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock)        down_write(sem)
# define down_write_nested(sem, subclass)        down_write(sem)
# define down_write_killable_nested(sem, subclass)        down_write_killable(sem)
# define down_read_non_owner(sem)                down_read(sem)
# define up_read_non_owner(sem)                        up_read(sem)
#endif

#endif /* _LINUX_RWSEM_H */
































































    1 




































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched

#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_H

#include <linux/kthread.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread.
 *
 * Records the command name and pid of the task @t handed to the
 * tracepoint and prints them as "comm=<name> pid=<pid>".
 */
TRACE_EVENT(sched_kthread_stop,

        TP_PROTO(struct task_struct *t),

        TP_ARGS(t),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
                __entry->pid        = t->pid;
        ),

        TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);

/*
 * Tracepoint for the return value of the kthread stopping.
 *
 * The record holds only the integer @ret, printed as "ret=<value>".
 */
TRACE_EVENT(sched_kthread_stop_ret,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(        int,        ret        )
        ),

        TP_fast_assign(
                __entry->ret        = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/**
 * sched_kthread_work_queue_work - called when a work gets queued
 * @worker:        pointer to the kthread_worker
 * @work:        pointer to struct kthread_work
 *
 * This event occurs when a work item is queued immediately, or once a
 * delayed work item is actually queued (i.e. once the delay has
 * elapsed).
 *
 * The record stores three raw pointers: the work item, its callback
 * (work->func) and the worker.  The callback is printed with %ps, the
 * other two with %p.
 */
TRACE_EVENT(sched_kthread_work_queue_work,

        TP_PROTO(struct kthread_worker *worker,
                 struct kthread_work *work),

        TP_ARGS(worker, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __field( void *,        worker)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __entry->worker                = worker;
        ),

        TP_printk("work struct=%p function=%ps worker=%p",
                  __entry->work, __entry->function, __entry->worker)
);

/**
 * sched_kthread_work_execute_start - called immediately before the work callback
 * @work:        pointer to struct kthread_work
 *
 * Allows tracking of kthread work execution.
 *
 * The callback pointer is read from work->func at trace time, so the
 * record holds both the work item pointer and the function about to run.
 */
TRACE_EVENT(sched_kthread_work_execute_start,

        TP_PROTO(struct kthread_work *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * sched_kthread_work_execute_end - called immediately after the work callback
 * @work:        pointer to struct kthread_work
 * @function:   pointer to worker function
 *
 * Allows tracking of kthread work execution.
 *
 * Unlike the execute_start event, the callback is passed in explicitly
 * as @function and @work is recorded only as a pointer value; it is
 * never dereferenced here.
 */
TRACE_EVENT(sched_kthread_work_execute_end,

        TP_PROTO(struct kthread_work *work, kthread_work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/*
 * Event class shared by the task wakeup tracepoints.
 *
 * Records comm, pid and prio of task @p plus the CPU reported by
 * task_cpu(p).  The "success" field is still part of the record but is
 * unconditionally set to 1 and is not included in the printed output.
 *
 * NOTE(review): __perf_task() presumably tells perf which task the
 * sample belongs to - confirm against the trace/perf macro definitions.
 */
DECLARE_EVENT_CLASS(sched_wakeup_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(__perf_task(p)),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        int,        success                        )
                __field(        int,        target_cpu                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->success        = 1; /* rudiment, kill when possible */
                __entry->target_cpu        = task_cpu(p);
        ),

        TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->target_cpu)
);

/*
 * Tracepoint called when waking a task; this tracepoint is guaranteed to be
 * called from the waking context.  Uses the sched_wakeup_template record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_waking,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
 * It is not always called from the waking context.  Uses the
 * sched_wakeup_template record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for waking up a new task.  Uses the sched_wakeup_template
 * record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
/*
 * Compute the prev_state value stored by the sched_switch event.
 *
 * Returns TASK_REPORT_MAX for a preempted task, 0 when
 * task_state_index() reports index 0, and otherwise the single bit
 * 1 << (index - 1).
 */
static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
{
        unsigned int idx;

#ifdef CONFIG_SCHED_DEBUG
        BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */

        /*
         * A preempted task never left the RUNNING state (it would not have
         * been dequeued otherwise), so its state bits are not looked at;
         * only the TASK_REPORT_MAX marker is reported.
         */
        if (preempt)
                return TASK_REPORT_MAX;

        /*
         * task_state_index() is fls() based and yields a value in the 0-8
         * range.  Index 0 is TASK_RUNNING and is passed through untouched.
         */
        idx = task_state_index(p);
        if (!idx)
                return idx;

        /*
         * Any other index is one above the bit position of the
         * corresponding task->state flag, hence the "- 1" before shifting.
         */
        return 1U << (idx - 1);
}
#endif /* CREATE_TRACE_POINTS */

/*
 * Tracepoint for task switches, performed by the scheduler.
 *
 * Records comm/pid/prio for both @prev and @next, plus prev_state as
 * computed by __trace_sched_switch_state(preempt, prev).
 *
 * In the printed output prev_state is masked with TASK_REPORT_MAX - 1
 * and rendered as "|"-separated state letters, or "R" when no bit below
 * TASK_REPORT_MAX is set.  A "+" is appended when the TASK_REPORT_MAX
 * bit itself is set, which is the value recorded for a preempted task.
 */
TRACE_EVENT(sched_switch,

        TP_PROTO(bool preempt,
                 struct task_struct *prev,
                 struct task_struct *next),

        TP_ARGS(preempt, prev, next),

        TP_STRUCT__entry(
                __array(        char,        prev_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        prev_pid                        )
                __field(        int,        prev_prio                        )
                __field(        long,        prev_state                        )
                __array(        char,        next_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        next_pid                        )
                __field(        int,        next_prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->prev_pid        = prev->pid;
                __entry->prev_prio        = prev->prio;
                __entry->prev_state        = __trace_sched_switch_state(preempt, prev);
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid        = next->pid;
                __entry->next_prio        = next->prio;
                /* XXX SCHED_DEADLINE */
        ),

        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,

                /* first %s: state letters, or "R" when no reportable bit is set */
                (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
                  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
                                { TASK_INTERRUPTIBLE, "S" },
                                { TASK_UNINTERRUPTIBLE, "D" },
                                { __TASK_STOPPED, "T" },
                                { __TASK_TRACED, "t" },
                                { EXIT_DEAD, "X" },
                                { EXIT_ZOMBIE, "Z" },
                                { TASK_PARKED, "P" },
                                { TASK_DEAD, "I" }) :
                  "R",

                /* second %s: "+" marks a preempted prev task */
                __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
);

/*
 * Tracepoint for a task being migrated.
 *
 * orig_cpu is sampled from task_cpu(p) when the event fires; dest_cpu is
 * the value passed in by the caller.
 */
TRACE_EVENT(sched_migrate_task,

        TP_PROTO(struct task_struct *p, int dest_cpu),

        TP_ARGS(p, dest_cpu),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        int,        orig_cpu                )
                __field(        int,        dest_cpu                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->orig_cpu        = task_cpu(p);
                __entry->dest_cpu        = dest_cpu;
        ),

        TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Event class for per-task process events: records comm, pid and prio
 * of the task @p and prints them as "comm=... pid=... prio=...".
 */
DECLARE_EVENT_CLASS(sched_process_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(p),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for freeing a task.  Uses the sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_process_free,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for a task exiting.  Uses the sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_process_exit,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for waiting on a task to unschedule.  Uses the
 * sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_wait_task,
        TP_PROTO(struct task_struct *p),
        TP_ARGS(p));

/*
 * Tracepoint for a waiting task.
 *
 * Note the mixed sources: comm and prio describe the task that fires the
 * event (current), while the pid field is derived from the @pid argument
 * through pid_nr().
 */
TRACE_EVENT(sched_process_wait,

        TP_PROTO(struct pid *pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
                __entry->pid                = pid_nr(pid);
                __entry->prio                = current->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for do_fork.
 *
 * Records comm and pid of both @parent and @child.  In the printed
 * output the parent fields carry no prefix ("comm=", "pid="), only the
 * child fields are prefixed ("child_comm=", "child_pid=").
 */
TRACE_EVENT(sched_process_fork,

        TP_PROTO(struct task_struct *parent, struct task_struct *child),

        TP_ARGS(parent, child),

        TP_STRUCT__entry(
                __array(        char,        parent_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        parent_pid                        )
                __array(        char,        child_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        child_pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
                __entry->parent_pid        = parent->pid;
                memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
                __entry->child_pid        = child->pid;
        ),

        TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
                __entry->parent_comm, __entry->parent_pid,
                __entry->child_comm, __entry->child_pid)
);

/*
 * Tracepoint for exec.
 *
 * Records bprm->filename as a string field, together with the pid of
 * task @p and the caller-supplied @old_pid.
 */
TRACE_EVENT(sched_process_exec,

        TP_PROTO(struct task_struct *p, pid_t old_pid,
                 struct linux_binprm *bprm),

        TP_ARGS(p, old_pid, bprm),

        TP_STRUCT__entry(
                __string(        filename,        bprm->filename        )
                __field(        pid_t,                pid                )
                __field(        pid_t,                old_pid                )
        ),

        TP_fast_assign(
                __assign_str(filename, bprm->filename);
                __entry->pid                = p->pid;
                __entry->old_pid        = old_pid;
        ),

        TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
                  __entry->pid, __entry->old_pid)
);


/*
 * The *_SCHEDSTAT wrappers expand to the regular event macros when
 * CONFIG_SCHEDSTATS is set and to their _NOP variants otherwise.
 */
#ifdef CONFIG_SCHEDSTATS
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS
#else
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP
#endif

/*
 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
 *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
 *
 * Event class for the sched_stat_* delay events: records comm and pid of
 * @tsk plus a 64-bit @delay, which the output labels as nanoseconds.
 *
 * NOTE(review): __perf_task()/__perf_count() presumably feed the task and
 * the sample count to perf - confirm against the trace/perf macros.
 */
DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template,

        TP_PROTO(struct task_struct *tsk, u64 delay),

        TP_ARGS(__perf_task(tsk), __perf_count(delay)),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( u64,        delay                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid        = tsk->pid;
                __entry->delay        = delay;
        ),

        TP_printk("comm=%s pid=%d delay=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->delay)
);

/*
 * Tracepoint for accounting wait time (time the task is runnable
 * but not actually running due to scheduler contention).
 * Uses the sched_stat_template record.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting sleep time (time the task is not runnable,
 * including iowait, see below).
 * Uses the sched_stat_template record.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting iowait time (time the task is not runnable
 * due to waiting on IO to complete).
 * Uses the sched_stat_template record.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting blocked time (time the task is in
 * uninterruptible sleep).  Uses the sched_stat_template record.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting runtime (time the task is executing
 * on a CPU).
 *
 * Declared with the plain DECLARE_EVENT_CLASS/DEFINE_EVENT macros, not
 * the *_SCHEDSTAT wrappers used by the other sched_stat events.  The
 * class has a single event of the same name.  Both @runtime and
 * @vruntime are stored as u64 and labelled as nanoseconds in the output.
 */
DECLARE_EVENT_CLASS(sched_stat_runtime,

        TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),

        TP_ARGS(tsk, __perf_count(runtime), vruntime),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( u64,        runtime                        )
                __field( u64,        vruntime                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid                = tsk->pid;
                __entry->runtime        = runtime;
                __entry->vruntime        = vruntime;
        ),

        TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->runtime,
                        (unsigned long long)__entry->vruntime)
);

DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
             TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
             TP_ARGS(tsk, runtime, vruntime));

/*
 * Tracepoint for showing priority inheritance modifying a task's
 * priority.
 *
 * oldprio is the task's current prio.  newprio is computed here rather
 * than passed in: with a @pi_task it is the smaller of tsk->normal_prio
 * and pi_task->prio, without one (NULL) it is tsk->normal_prio.
 */
TRACE_EVENT(sched_pi_setprio,

        TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),

        TP_ARGS(tsk, pi_task),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( int,        oldprio                        )
                __field( int,        newprio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid                = tsk->pid;
                __entry->oldprio        = tsk->prio;
                __entry->newprio        = pi_task ?
                                min(tsk->normal_prio, pi_task->prio) :
                                tsk->normal_prio;
                /* XXX SCHED_DEADLINE bits missing */
        ),

        TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
                        __entry->comm, __entry->pid,
                        __entry->oldprio, __entry->newprio)
);

#ifdef CONFIG_DETECT_HUNG_TASK
/*
 * Tracepoint that records comm and pid of task @tsk; only built when
 * CONFIG_DETECT_HUNG_TASK is set.
 */
TRACE_EVENT(sched_process_hang,
        TP_PROTO(struct task_struct *tsk),
        TP_ARGS(tsk),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid = tsk->pid;
        ),

        TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);
#endif /* CONFIG_DETECT_HUNG_TASK */

/*
 * Tracks migration of tasks from one runqueue to another. Can be used to
 * detect if automatic NUMA balancing is bouncing between nodes.
 *
 * Besides pid/tgid and the NUMA group id of @tsk, the record stores both
 * CPUs together with their nodes as resolved by cpu_to_node().  Both CPU
 * numbers are passed to cpu_to_node() unchecked.
 */
TRACE_EVENT(sched_move_numa,

        TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),

        TP_ARGS(tsk, src_cpu, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        pid                        )
                __field( pid_t,        tgid                        )
                __field( pid_t,        ngid                        )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->pid                = task_pid_nr(tsk);
                __entry->tgid                = task_tgid_nr(tsk);
                __entry->ngid                = task_numa_group_id(tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                __entry->dst_cpu        = dst_cpu;
                __entry->dst_nid        = cpu_to_node(dst_cpu);
        ),

        TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
                        __entry->pid, __entry->tgid, __entry->ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/*
 * Event class describing a source/destination pair of tasks and CPUs.
 *
 * The destination side is optional: a NULL @dst_tsk is recorded as
 * dst_pid/dst_tgid/dst_ngid == 0, and a negative @dst_cpu is recorded
 * with dst_nid == -1 instead of being passed to cpu_to_node().  The
 * source side is always dereferenced.
 */
DECLARE_EVENT_CLASS(sched_numa_pair_template,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        src_pid                        )
                __field( pid_t,        src_tgid                )
                __field( pid_t,        src_ngid                )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( pid_t,        dst_pid                        )
                __field( pid_t,        dst_tgid                )
                __field( pid_t,        dst_ngid                )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->src_pid        = task_pid_nr(src_tsk);
                __entry->src_tgid        = task_tgid_nr(src_tsk);
                __entry->src_ngid        = task_numa_group_id(src_tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                __entry->dst_pid        = dst_tsk ? task_pid_nr(dst_tsk) : 0;
                __entry->dst_tgid        = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
                __entry->dst_ngid        = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
                __entry->dst_cpu        = dst_cpu;
                __entry->dst_nid        = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
        ),

        TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
                        __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/* sched_stick_numa: event instance of the sched_numa_pair_template class. */
DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

/* sched_swap_numa: event instance of the sched_numa_pair_template class. */
DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);


/*
 * Tracepoint for waking a polling cpu without an IPI.
 *
 * The record holds only the @cpu number, printed as "cpu=<n>".
 */
TRACE_EVENT(sched_wake_idle_without_ipi,

        TP_PROTO(int cpu),

        TP_ARGS(cpu),

        TP_STRUCT__entry(
                __field(        int,        cpu        )
        ),

        TP_fast_assign(
                __entry->cpu        = cpu;
        ),

        TP_printk("cpu=%d", __entry->cpu)
);

/*
 * The following tracepoints are not exported in tracefs and provide hooking
 * mechanisms only for testing and debugging purposes.
 *
 * They are suffixed with _tp to make them easily identifiable in the code.
 *
 * Each one is declared with DECLARE_TRACE(), i.e. with a prototype and
 * argument list only: no record layout and no print format.  The hooks
 * receive raw pointers to scheduler structures.
 *
 * NOTE(review): "pelt" presumably refers to per-entity load tracking -
 * confirm against the scheduler sources.
 */

/* Hook taking a struct cfs_rq pointer. */
DECLARE_TRACE(pelt_cfs_tp,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

/* Hooks taking a struct rq pointer (rt, dl, thermal and irq variants). */
DECLARE_TRACE(pelt_rt_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_dl_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_thermal_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_irq_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

/* Hook taking a struct sched_entity pointer. */
DECLARE_TRACE(pelt_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

/* Hook taking a struct rq pointer. */
DECLARE_TRACE(sched_cpu_capacity_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

/* Hook taking a root_domain pointer and a boolean overutilized flag. */
DECLARE_TRACE(sched_overutilized_tp,
        TP_PROTO(struct root_domain *rd, bool overutilized),
        TP_ARGS(rd, overutilized));

/* util_est hooks: one per cfs_rq, one per sched_entity. */
DECLARE_TRACE(sched_util_est_cfs_tp,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

DECLARE_TRACE(sched_util_est_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

/* Hook taking a struct rq pointer and an integer @change. */
DECLARE_TRACE(sched_update_nr_running_tp,
        TP_PROTO(struct rq *rq, int change),
        TP_ARGS(rq, change));

#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













1
2
3
4
5
6
7
8
9
10
11
12
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
#define _NET_CORE_SOCK_DESTRUCTOR_H
#include <net/tcp.h>

static inline bool is_skb_wmem(const struct sk_buff *skb)
{
        return skb->destructor == sock_wfree ||
               skb->destructor == __sock_wfree ||
               (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
}
#endif























































































































































































































































































    2 



    2 




    2 







    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/acl.c
 *
 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 */

#include <linux/quotaops.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"

/*
 * Convert an ACL from its filesystem (xattr) layout to the in-memory
 * posix_acl representation.
 *
 * @value: xattr payload: an ext4_acl_header followed by the entries
 * @size:  length of the payload in bytes
 *
 * Returns NULL when @value is NULL or the payload contains no entries,
 * ERR_PTR(-ENOMEM) when the ACL cannot be allocated, ERR_PTR(-EINVAL)
 * when the payload is malformed (shorter than the header, wrong version,
 * negative entry count, unknown tag, or entries that do not end exactly
 * at @value + @size), and the newly allocated ACL otherwise.
 */
static struct posix_acl *
ext4_acl_from_disk(const void *value, size_t size)
{
        const ext4_acl_header *hdr = value;
        const char *cur, *limit;
        struct posix_acl *acl;
        int nr_entries, i;

        if (!value)
                return NULL;
        if (size < sizeof(ext4_acl_header))
                return ERR_PTR(-EINVAL);
        if (hdr->a_version != cpu_to_le32(EXT4_ACL_VERSION))
                return ERR_PTR(-EINVAL);

        limit = (const char *)value + size;
        cur = (const char *)value + sizeof(ext4_acl_header);

        nr_entries = ext4_acl_count(size);
        if (nr_entries < 0)
                return ERR_PTR(-EINVAL);
        if (nr_entries == 0)
                return NULL;

        acl = posix_acl_alloc(nr_entries, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        for (i = 0; i < nr_entries; i++) {
                const ext4_acl_entry *disk = (const ext4_acl_entry *)cur;

                /* Tag and permissions live in the short part of an entry. */
                if (cur + sizeof(ext4_acl_entry_short) > limit)
                        goto fail;
                acl->a_entries[i].e_tag  = le16_to_cpu(disk->e_tag);
                acl->a_entries[i].e_perm = le16_to_cpu(disk->e_perm);

                switch (acl->a_entries[i].e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        /* These tags are stored without an id. */
                        cur += sizeof(ext4_acl_entry_short);
                        break;

                case ACL_USER:
                        /* Full-size entry: make sure the id is in bounds. */
                        cur += sizeof(ext4_acl_entry);
                        if (cur > limit)
                                goto fail;
                        acl->a_entries[i].e_uid =
                                make_kuid(&init_user_ns,
                                          le32_to_cpu(disk->e_id));
                        break;

                case ACL_GROUP:
                        cur += sizeof(ext4_acl_entry);
                        if (cur > limit)
                                goto fail;
                        acl->a_entries[i].e_gid =
                                make_kgid(&init_user_ns,
                                          le32_to_cpu(disk->e_id));
                        break;

                default:
                        goto fail;
                }
        }

        /* The entries must consume the payload exactly. */
        if (cur != limit)
                goto fail;
        return acl;

fail:
        posix_acl_release(acl);
        return ERR_PTR(-EINVAL);
}

/*
 * Convert an ACL from the in-memory posix_acl representation to its
 * filesystem (xattr) layout.
 *
 * @acl:  ACL to serialize
 * @size: receives ext4_acl_size(acl->a_count); it is written before the
 *        buffer is allocated, so it is set on the failure paths as well
 *
 * Returns a kmalloc()ed buffer, ERR_PTR(-ENOMEM) when the allocation
 * fails, or ERR_PTR(-EINVAL) when an entry carries an unknown tag.
 */
static void *
ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
{
        ext4_acl_header *buf;
        char *cursor;
        size_t idx;

        *size = ext4_acl_size(acl->a_count);

        /* Sized as if every entry were a full ext4_acl_entry. */
        buf = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
                        sizeof(ext4_acl_entry), GFP_NOFS);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        buf->a_version = cpu_to_le32(EXT4_ACL_VERSION);
        cursor = (char *)buf + sizeof(ext4_acl_header);

        for (idx = 0; idx < acl->a_count; idx++) {
                const struct posix_acl_entry *src = &acl->a_entries[idx];
                ext4_acl_entry *dst = (ext4_acl_entry *)cursor;

                dst->e_tag  = cpu_to_le16(src->e_tag);
                dst->e_perm = cpu_to_le16(src->e_perm);

                switch (src->e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        /* No id: only the short entry is emitted. */
                        cursor += sizeof(ext4_acl_entry_short);
                        continue;

                case ACL_USER:
                        dst->e_id = cpu_to_le32(
                                from_kuid(&init_user_ns, src->e_uid));
                        break;

                case ACL_GROUP:
                        dst->e_id = cpu_to_le32(
                                from_kgid(&init_user_ns, src->e_gid));
                        break;

                default:
                        kfree(buf);
                        return ERR_PTR(-EINVAL);
                }

                /* ACL_USER / ACL_GROUP occupy a full entry. */
                cursor += sizeof(ext4_acl_entry);
        }

        return (char *)buf;
}

/*
 * Inode operation get_posix_acl().
 *
 * Reads the access or default ACL xattr of @inode (selected by @type)
 * and converts it with ext4_acl_from_disk().  Any @type other than
 * ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT hits BUG().
 *
 * Returns NULL when the xattr lookup yields -ENODATA or -ENOSYS,
 * ERR_PTR(-ENOMEM) when the temporary buffer cannot be allocated,
 * ERR_PTR() of any other lookup result that is not positive, and
 * otherwise whatever ext4_acl_from_disk() returns.
 *
 * inode->i_mutex: don't care
 */
struct posix_acl *
ext4_get_acl(struct inode *inode, int type)
{
        struct posix_acl *acl;
        char *buf = NULL;
        int name_index;
        int len;

        if (type == ACL_TYPE_ACCESS)
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
        else if (type == ACL_TYPE_DEFAULT)
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
        else
                BUG();

        /* First call passes no buffer and only asks for the length. */
        len = ext4_xattr_get(inode, name_index, "", NULL, 0);
        if (len > 0) {
                buf = kmalloc(len, GFP_NOFS);
                if (!buf)
                        return ERR_PTR(-ENOMEM);
                len = ext4_xattr_get(inode, name_index, "", buf, len);
        }

        if (len > 0) {
                acl = ext4_acl_from_disk(buf, len);
        } else {
                switch (len) {
                case -ENODATA:
                case -ENOSYS:
                        acl = NULL;
                        break;
                default:
                        acl = ERR_PTR(len);
                        break;
                }
        }

        kfree(buf);
        return acl;
}

/*
 * Set the access or default ACL of an inode.
 *
 * A NULL @acl is passed to ext4_xattr_set_handle() as a NULL value of
 * size 0.  Setting a default ACL on a non-directory fails with -EACCES,
 * while a NULL default ACL on a non-directory is a successful no-op.
 * The cached ACL is updated only when the xattr update succeeds.
 *
 * Returns 0 on success, -EINVAL for an unknown @type, or the error from
 * ext4_acl_to_disk() / ext4_xattr_set_handle().
 *
 * inode->i_mutex: down unless called from ext4_new_inode
 */
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
             struct posix_acl *acl, int xattr_flags)
{
        size_t len = 0;
        void *buf = NULL;
        int name_index;
        int rc;

        if (type == ACL_TYPE_ACCESS) {
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
        } else if (type == ACL_TYPE_DEFAULT) {
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EACCES : 0;
        } else {
                return -EINVAL;
        }

        if (acl) {
                buf = ext4_acl_to_disk(acl, &len);
                if (IS_ERR(buf))
                        return (int)PTR_ERR(buf);
        }

        rc = ext4_xattr_set_handle(handle, inode, name_index, "",
                                   buf, len, xattr_flags);

        /* The serialized copy is no longer needed once the xattr is set. */
        kfree(buf);
        if (rc)
                return rc;

        set_cached_acl(inode, type, acl);
        return 0;
}

/*
 * Set the ACL of @type on @inode inside a journal handle started here.
 *
 * For a non-NULL access ACL, posix_acl_update_mode() is consulted first; if
 * the mode it yields differs from inode->i_mode, the new mode and a fresh
 * ctime are stored in the inode once the xattr update has succeeded.
 *
 * Returns 0 on success or the error from the failing step.  An -ENOSPC
 * result is retried for as long as ext4_should_retry_alloc() agrees.
 */
int
ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        handle_t *handle;
        int error, credits, retries = 0;
        /* Sized once, from the ACL as passed in by the caller. */
        size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
        umode_t mode = inode->i_mode;
        int update_mode = 0;

        error = dquot_initialize(inode);
        if (error)
                return error;
retry:
        /* Ask the xattr layer how many journal credits this update needs. */
        error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */,
                                       &credits);
        if (error)
                return error;

        handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        if ((type == ACL_TYPE_ACCESS) && acl) {
                /*
                 * posix_acl_update_mode() receives pointers to both the
                 * local mode and acl, so it may modify either of them.
                 */
                error = posix_acl_update_mode(inode, &mode, &acl);
                if (error)
                        goto out_stop;
                if (mode != inode->i_mode)
                        update_mode = 1;
        }

        error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
        /* Only touch the inode's mode/ctime after the xattr write worked. */
        if (!error && update_mode) {
                inode->i_mode = mode;
                inode->i_ctime = current_time(inode);
                error = ext4_mark_inode_dirty(handle, inode);
        }
out_stop:
        /* The handle is stopped on every path that started it. */
        ext4_journal_stop(handle);
        if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
        return error;
}

/*
 * Set up the ACLs of a newly created @inode from its parent @dir.
 * Called from ext4_new_inode.
 *
 * posix_acl_create() supplies the default and access ACLs (and may adjust
 * inode->i_mode through the pointer it is given).  Each ACL that is present
 * is written with XATTR_CREATE and then released; each one that is absent
 * has the matching cached pointer in @inode set to NULL.  The access ACL is
 * not written if writing the default ACL failed, but it is still released.
 *
 * Returns 0 on success or the first error encountered.
 *
 * dir->i_mutex: down
 * inode->i_mutex: up (access to inode is still exclusive)
 */
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
        struct posix_acl *default_acl, *acl;
        int error;

        error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (error)
                return error;

        if (!default_acl) {
                inode->i_default_acl = NULL;
        } else {
                error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
                                       default_acl, XATTR_CREATE);
                posix_acl_release(default_acl);
        }

        if (!acl) {
                inode->i_acl = NULL;
        } else {
                /* Skip the write after an earlier failure, but drop the ref. */
                if (error == 0)
                        error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
                                               acl, XATTR_CREATE);
                posix_acl_release(acl);
        }

        return error;
}








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */

#ifndef __LINUX_BLK_CRYPTO_H
#define __LINUX_BLK_CRYPTO_H

#include <linux/types.h>

/*
 * Identifiers for the encryption modes named in this header.
 * BLK_ENCRYPTION_MODE_INVALID is the first enumerator (value 0) and
 * BLK_ENCRYPTION_MODE_MAX is the last, so it equals the number of
 * enumerators that precede it.
 */
enum blk_crypto_mode_num {
        BLK_ENCRYPTION_MODE_INVALID,
        BLK_ENCRYPTION_MODE_AES_256_XTS,
        BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
        BLK_ENCRYPTION_MODE_ADIANTUM,
        BLK_ENCRYPTION_MODE_MAX,
};

/* Capacity in bytes of the raw[] key array in struct blk_crypto_key. */
#define BLK_CRYPTO_MAX_KEY_SIZE                64
/**
 * struct blk_crypto_config - an inline encryption key's crypto configuration
 * @crypto_mode: encryption algorithm this key is for
 * @data_unit_size: the data unit size for all encryption/decryptions with this
 *        key.  This is the size in bytes of each individual plaintext and
 *        ciphertext.  This is always a power of 2.  It might be e.g. the
 *        filesystem block size or the disk sector size.
 * @dun_bytes: the maximum number of bytes of DUN (data unit number) used when
 *        using this key
 */
struct blk_crypto_config {
        enum blk_crypto_mode_num crypto_mode;
        unsigned int data_unit_size;
        unsigned int dun_bytes;
};

/**
 * struct blk_crypto_key - an inline encryption key
 * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this
 *                key
 * @data_unit_size_bits: log2 of data_unit_size
 * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode)
 * @raw: the raw bytes of this key.  Only the first @size bytes are used; the
 *        array itself holds BLK_CRYPTO_MAX_KEY_SIZE bytes.
 *
 * A blk_crypto_key is immutable once created, and many bios can reference it at
 * the same time.  It must not be freed until all bios using it have completed
 * and it has been evicted from all devices on which it may have been used.
 */
struct blk_crypto_key {
        struct blk_crypto_config crypto_cfg;
        unsigned int data_unit_size_bits;
        unsigned int size;
        u8 raw[BLK_CRYPTO_MAX_KEY_SIZE];
};

/* Largest IV size in bytes, and the number of u64 words needed to hold it. */
#define BLK_CRYPTO_MAX_IV_SIZE                32
#define BLK_CRYPTO_DUN_ARRAY_SIZE        (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64))

/**
 * struct bio_crypt_ctx - an inline encryption context
 * @bc_key: the key, algorithm, and data unit size to use
 * @bc_dun: the data unit number (starting IV) to use, stored as
 *        BLK_CRYPTO_DUN_ARRAY_SIZE u64 words
 *
 * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for
 * write requests) or decrypted (for read requests) inline by the storage device
 * or controller, or by the crypto API fallback.
 */
struct bio_crypt_ctx {
        const struct blk_crypto_key        *bc_key;
        u64                                bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
};

#include <linux/blk_types.h>
#include <linux/blkdev.h>

struct request;
struct request_queue;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

/* True when @bio carries a non-NULL bi_crypt_context pointer. */
static inline bool bio_has_crypt_ctx(struct bio *bio)
{
        return !!bio->bi_crypt_context;
}

/*
 * Inline-encryption entry points, declared only when
 * CONFIG_BLK_INLINE_ENCRYPTION is set.  Their bodies are not in this
 * header.
 */
void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
                       const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
                       gfp_t gfp_mask);

bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
                                 unsigned int bytes,
                                 const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]);

int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
                        enum blk_crypto_mode_num crypto_mode,
                        unsigned int dun_bytes,
                        unsigned int data_unit_size);

int blk_crypto_start_using_key(const struct blk_crypto_key *key,
                               struct request_queue *q);

void blk_crypto_evict_key(struct request_queue *q,
                          const struct blk_crypto_key *key);

bool blk_crypto_config_supported(struct request_queue *q,
                                 const struct blk_crypto_config *cfg);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/*
 * Stub used when CONFIG_BLK_INLINE_ENCRYPTION is not set: always reports
 * that @bio has no encryption context.
 */
static inline bool bio_has_crypt_ctx(struct bio *bio)
{
        return false;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */

int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
/**
 * bio_crypt_clone - clone bio encryption context
 * @dst: destination bio
 * @src: source bio
 * @gfp_mask: memory allocation flags
 *
 * When @src has no encryption context this is a no-op; otherwise the work
 * is delegated to __bio_crypt_clone().
 *
 * Return: 0 on success, -ENOMEM if out of memory.  -ENOMEM is only possible if
 *           @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM.
 */
static inline int bio_crypt_clone(struct bio *dst, struct bio *src,
                                  gfp_t gfp_mask)
{
        if (!bio_has_crypt_ctx(src))
                return 0;

        return __bio_crypt_clone(dst, src, gfp_mask);
}

#endif /* __LINUX_BLK_CRYPTO_H */















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2020 Christoph Hellwig.
 *
 * Support for "universal" pointers that can point to either kernel or userspace
 * memory.
 */
#ifndef _LINUX_SOCKPTR_H
#define _LINUX_SOCKPTR_H

#include <linux/slab.h>
#include <linux/uaccess.h>

/*
 * sockptr_t - a pointer to either kernel or userspace memory.
 *
 * The anonymous union stores the address as @kernel or @user; the one-bit
 * @is_kernel flag records which of the two members is the valid one.
 */
typedef struct {
        union {
                void                *kernel;
                void __user        *user;
        };
        bool                is_kernel : 1;
} sockptr_t;

/* Report whether @sockptr is flagged as holding a kernel pointer. */
static inline bool sockptr_is_kernel(sockptr_t sockptr)
{
        return sockptr.is_kernel;
}

static inline sockptr_t KERNEL_SOCKPTR(void *p)
{
        return (sockptr_t) { .kernel = p, .is_kernel = true };
}

/*
 * Wrap user pointer @p in a sockptr_t.  is_kernel is not named in the
 * initializer and is therefore zero (false).
 */
static inline sockptr_t USER_SOCKPTR(void __user *p)
{
        sockptr_t sp = { .user = p };

        return sp;
}

/* True when the member selected by is_kernel is a NULL pointer. */
static inline bool sockptr_is_null(sockptr_t sockptr)
{
        return sockptr_is_kernel(sockptr) ? !sockptr.kernel : !sockptr.user;
}

/*
 * Copy @size bytes starting @offset bytes into @src to @dst.
 *
 * Kernel sources are copied with memcpy() and yield 0; user sources yield
 * whatever copy_from_user() returns.
 */
static inline int copy_from_sockptr_offset(void *dst, sockptr_t src,
                size_t offset, size_t size)
{
        if (sockptr_is_kernel(src)) {
                memcpy(dst, src.kernel + offset, size);
                return 0;
        }

        return copy_from_user(dst, src.user + offset, size);
}

/* Copy @size bytes from the start of @src to @dst (offset fixed at 0). */
static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
        const size_t no_offset = 0;

        return copy_from_sockptr_offset(dst, src, no_offset, size);
}

/*
 * Copy @size bytes from @src to @offset bytes into @dst.
 *
 * Kernel destinations are written with memcpy() and yield 0; user
 * destinations yield whatever copy_to_user() returns.
 */
static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
                const void *src, size_t size)
{
        if (sockptr_is_kernel(dst)) {
                memcpy(dst.kernel + offset, src, size);
                return 0;
        }

        return copy_to_user(dst.user + offset, src, size);
}

/* Copy @size bytes from @src to the start of @dst (offset fixed at 0). */
static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
{
        const size_t no_offset = 0;

        return copy_to_sockptr_offset(dst, no_offset, src, size);
}

/*
 * Duplicate @len bytes from @src into a new buffer allocated with
 * GFP_USER | __GFP_NOWARN.
 *
 * Returns the buffer, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy reports failure (the buffer is freed first).
 */
static inline void *memdup_sockptr(sockptr_t src, size_t len)
{
        void *buf = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);

        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (copy_from_sockptr(buf, src, len) == 0)
                return buf;

        kfree(buf);
        return ERR_PTR(-EFAULT);
}

/*
 * Duplicate @len bytes from @src into a new GFP_KERNEL buffer of @len + 1
 * bytes and store a NUL byte after the copied data.
 *
 * Returns the buffer, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy reports failure (the buffer is freed first).
 */
static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
{
        char *buf = kmalloc_track_caller(len + 1, GFP_KERNEL);

        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (copy_from_sockptr(buf, src, len) == 0) {
                buf[len] = '\0';
                return buf;
        }

        kfree(buf);
        return ERR_PTR(-EFAULT);
}

/*
 * Copy a string of at most @count bytes from @src to @dst.
 *
 * User sources are handed to strncpy_from_user() and its result is returned
 * unchanged.  For kernel sources the number of bytes copied is returned:
 * the string length plus its terminating NUL, capped at @count (in which
 * case no NUL is copied).
 */
static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
{
        size_t len;

        if (!sockptr_is_kernel(src))
                return strncpy_from_user(dst, src.user, count);

        len = min(strnlen(src.kernel, count - 1) + 1, count);
        memcpy(dst, src.kernel, len);
        return len;
}

#endif /* _LINUX_SOCKPTR_H */







    6 




1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>

/*
 * Check @addr against the boot CPU's physical address width.
 *
 * With CONFIG_PHYS_ADDR_T_64BIT, returns 1 when no bit at or above
 * boot_cpu_data.x86_phys_bits is set in @addr and 0 otherwise.  Without
 * that option every address is reported valid.
 */
static inline int phys_addr_valid(resource_size_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
        return (addr >> boot_cpu_data.x86_phys_bits) == 0;
#else
        return 1;
#endif
}

























































    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Asymmetric public-key cryptography key subtype
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _KEYS_ASYMMETRIC_SUBTYPE_H
#define _KEYS_ASYMMETRIC_SUBTYPE_H

#include <linux/seq_file.h>
#include <keys/asymmetric-type.h>

struct kernel_pkey_query;
struct kernel_pkey_params;
struct public_key_signature;

/*
 * Keys of this type declare a subtype that indicates the handlers and
 * capabilities.
 *
 * The struct carries the owning module, the subtype's name and its length,
 * and a table of operation callbacks.
 */
struct asymmetric_key_subtype {
        struct module                *owner;
        const char                *name;
        unsigned short                name_len;        /* length of name */

        /* Describe a key of this subtype for /proc/keys */
        void (*describe)(const struct key *key, struct seq_file *m);

        /* Destroy a key of this subtype */
        void (*destroy)(void *payload_crypto, void *payload_auth);

        /*
         * Query callback: takes the operation parameters and fills in @info.
         * NOTE(review): exact contents of the result are defined by
         * struct kernel_pkey_query, which is only forward-declared here.
         */
        int (*query)(const struct kernel_pkey_params *params,
                     struct kernel_pkey_query *info);

        /* Encrypt/decrypt/sign data */
        int (*eds_op)(struct kernel_pkey_params *params,
                      const void *in, void *out);

        /* Verify the signature on a key of this subtype (optional) */
        int (*verify_signature)(const struct key *key,
                                const struct public_key_signature *sig);
};

/**
 * asymmetric_key_subtype - Get the subtype from an asymmetric key
 * @key: The key of interest.
 *
 * Reads the payload data slot indexed by asym_subtype and returns it as the
 * key's subtype pointer.
 */
static inline
struct asymmetric_key_subtype *asymmetric_key_subtype(const struct key *key)
{
        struct asymmetric_key_subtype *subtype;

        subtype = key->payload.data[asym_subtype];
        return subtype;
}

#endif /* _KEYS_ASYMMETRIC_SUBTYPE_H */

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Internal header to deal with irq_desc->status which will be renamed
 * to irq_desc->settings.
 */
/*
 * Underscore-prefixed copies of the public IRQ_* flag values.  Each
 * enumerator takes the value its public counterpart has at this point in
 * the translation unit; the accessors in this header use only these names.
 */
enum {
        _IRQ_DEFAULT_INIT_FLAGS        = IRQ_DEFAULT_INIT_FLAGS,
        _IRQ_PER_CPU                = IRQ_PER_CPU,
        _IRQ_LEVEL                = IRQ_LEVEL,
        _IRQ_NOPROBE                = IRQ_NOPROBE,
        _IRQ_NOREQUEST                = IRQ_NOREQUEST,
        _IRQ_NOTHREAD                = IRQ_NOTHREAD,
        _IRQ_NOAUTOEN                = IRQ_NOAUTOEN,
        _IRQ_MOVE_PCNTXT        = IRQ_MOVE_PCNTXT,
        _IRQ_NO_BALANCING        = IRQ_NO_BALANCING,
        _IRQ_NESTED_THREAD        = IRQ_NESTED_THREAD,
        _IRQ_PER_CPU_DEVID        = IRQ_PER_CPU_DEVID,
        _IRQ_IS_POLLED                = IRQ_IS_POLLED,
        _IRQ_DISABLE_UNLAZY        = IRQ_DISABLE_UNLAZY,
        _IRQ_HIDDEN                = IRQ_HIDDEN,
        _IRQF_MODIFY_MASK        = IRQF_MODIFY_MASK,
};

/*
 * From here on the public flag names expand to GOT_YOU_MORON.
 * NOTE(review): GOT_YOU_MORON is not defined in this header, so presumably
 * any later direct use of these names fails to compile, steering code
 * towards the irq_settings_*() accessors - confirm.
 * IRQF_MODIFY_MASK is #undef'd first, which indicates it is already a macro.
 */
#define IRQ_PER_CPU                GOT_YOU_MORON
#define IRQ_NO_BALANCING        GOT_YOU_MORON
#define IRQ_LEVEL                GOT_YOU_MORON
#define IRQ_NOPROBE                GOT_YOU_MORON
#define IRQ_NOREQUEST                GOT_YOU_MORON
#define IRQ_NOTHREAD                GOT_YOU_MORON
#define IRQ_NOAUTOEN                GOT_YOU_MORON
#define IRQ_NESTED_THREAD        GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID        GOT_YOU_MORON
#define IRQ_IS_POLLED                GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY        GOT_YOU_MORON
#define IRQ_HIDDEN                GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK        GOT_YOU_MORON

/*
 * Clear the bits in @clr, then set the bits in @set, in the descriptor's
 * settings word.  Both masks are first restricted to _IRQF_MODIFY_MASK.
 */
static inline void
irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~(clr & _IRQF_MODIFY_MASK);
        desc->status_use_accessors =
                desc->status_use_accessors | (set & _IRQF_MODIFY_MASK);
}

/* True when _IRQ_PER_CPU is set in the descriptor's settings word. */
static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_PER_CPU) != 0;
}

/* True when _IRQ_PER_CPU_DEVID is set in the descriptor's settings word. */
static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_PER_CPU_DEVID) != 0;
}

/* Set _IRQ_PER_CPU in the descriptor's settings word. */
static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
{
        desc->status_use_accessors = desc->status_use_accessors | _IRQ_PER_CPU;
}

/* Set _IRQ_NO_BALANCING in the descriptor's settings word. */
static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors | _IRQ_NO_BALANCING;
}

/* True when _IRQ_NO_BALANCING is set in the descriptor's settings word. */
static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NO_BALANCING) != 0;
}

/* Return the IRQ_TYPE_SENSE_MASK portion of the descriptor's settings word. */
static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
{
        u32 trigger = desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;

        return trigger;
}

/*
 * Replace the IRQ_TYPE_SENSE_MASK portion of the settings word with the
 * corresponding bits of @mask; bits of @mask outside it are ignored.
 */
static inline void
irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~IRQ_TYPE_SENSE_MASK;
        desc->status_use_accessors =
                desc->status_use_accessors | (mask & IRQ_TYPE_SENSE_MASK);
}

/* True when _IRQ_LEVEL is set in the descriptor's settings word. */
static inline bool irq_settings_is_level(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_LEVEL) != 0;
}

/* Clear _IRQ_LEVEL in the descriptor's settings word. */
static inline void irq_settings_clr_level(struct irq_desc *desc)
{
        desc->status_use_accessors = desc->status_use_accessors & ~_IRQ_LEVEL;
}

/* Set _IRQ_LEVEL in the descriptor's settings word. */
static inline void irq_settings_set_level(struct irq_desc *desc)
{
        desc->status_use_accessors = desc->status_use_accessors | _IRQ_LEVEL;
}

/* True when _IRQ_NOREQUEST is NOT set in the descriptor's settings word. */
static inline bool irq_settings_can_request(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NOREQUEST) == 0;
}

/* Clear _IRQ_NOREQUEST in the descriptor's settings word. */
static inline void irq_settings_clr_norequest(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~_IRQ_NOREQUEST;
}

/* Set _IRQ_NOREQUEST in the descriptor's settings word. */
static inline void irq_settings_set_norequest(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors | _IRQ_NOREQUEST;
}

/* True when _IRQ_NOTHREAD is NOT set in the descriptor's settings word. */
static inline bool irq_settings_can_thread(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NOTHREAD) == 0;
}

/* Clear _IRQ_NOTHREAD in the descriptor's settings word. */
static inline void irq_settings_clr_nothread(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~_IRQ_NOTHREAD;
}

/* Set _IRQ_NOTHREAD in the descriptor's settings word. */
static inline void irq_settings_set_nothread(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors | _IRQ_NOTHREAD;
}

/* True when _IRQ_NOPROBE is NOT set in the descriptor's settings word. */
static inline bool irq_settings_can_probe(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NOPROBE) == 0;
}

/* Clear _IRQ_NOPROBE in the descriptor's settings word. */
static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~_IRQ_NOPROBE;
}

/* Set _IRQ_NOPROBE in the descriptor's settings word. */
static inline void irq_settings_set_noprobe(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors | _IRQ_NOPROBE;
}

/* True when _IRQ_MOVE_PCNTXT is set in the descriptor's settings word. */
static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_MOVE_PCNTXT) != 0;
}

/* True when _IRQ_NOAUTOEN is NOT set in the descriptor's settings word. */
static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NOAUTOEN) == 0;
}

/* True when _IRQ_NESTED_THREAD is set in the descriptor's settings word. */
static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_NESTED_THREAD) != 0;
}

/* True when _IRQ_IS_POLLED is set in the descriptor's settings word. */
static inline bool irq_settings_is_polled(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_IS_POLLED) != 0;
}

/* True when _IRQ_DISABLE_UNLAZY is set in the descriptor's settings word. */
static inline bool irq_settings_disable_unlazy(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_DISABLE_UNLAZY) != 0;
}

/* Clear _IRQ_DISABLE_UNLAZY in the descriptor's settings word. */
static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
{
        desc->status_use_accessors =
                desc->status_use_accessors & ~_IRQ_DISABLE_UNLAZY;
}

/* True when _IRQ_HIDDEN is set in the descriptor's settings word. */
static inline bool irq_settings_is_hidden(struct irq_desc *desc)
{
        return (desc->status_use_accessors & _IRQ_HIDDEN) != 0;
}























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_FLOW_DISSECTOR_H
#define _NET_FLOW_DISSECTOR_H

#include <linux/types.h>
#include <linux/in6.h>
#include <linux/siphash.h>
#include <linux/string.h>
#include <uapi/linux/if_ether.h>

struct bpf_prog;
struct net;
struct sk_buff;

/**
 * struct flow_dissector_key_control:
 * @thoff: Transport header offset
 * @addr_type: address type.  NOTE(review): the set of valid values is not
 *        stated in this header - presumably one of the
 *        FLOW_DISSECTOR_KEY_*_ADDRS ids; confirm.
 * @flags: flag bits.  NOTE(review): presumably the FLOW_DIS_* values
 *        defined in this header; confirm.
 */
struct flow_dissector_key_control {
        u16        thoff;
        u16        addr_type;
        u32        flags;
};

/*
 * Single-bit flags (bits 0-2).
 * NOTE(review): presumably stored in flow_dissector_key_control.flags -
 * confirm against the dissector implementation.
 */
#define FLOW_DIS_IS_FRAGMENT        BIT(0)
#define FLOW_DIS_FIRST_FRAG        BIT(1)
#define FLOW_DIS_ENCAPSULATION        BIT(2)

/*
 * Result codes for a dissection step.
 * NOTE(review): the meaning of each code is defined by the dissector
 * implementation, which is not part of this header; the names suggest
 * "finished OK", "finished with error", "re-run protocol/IP-protocol
 * parsing" and "continue" - confirm there.
 */
enum flow_dissect_ret {
        FLOW_DISSECT_RET_OUT_GOOD,
        FLOW_DISSECT_RET_OUT_BAD,
        FLOW_DISSECT_RET_PROTO_AGAIN,
        FLOW_DISSECT_RET_IPPROTO_AGAIN,
        FLOW_DISSECT_RET_CONTINUE,
};

/**
 * struct flow_dissector_key_basic:
 * @n_proto: Network header protocol (eg. IPv4/IPv6)
 * @ip_proto: Transport header protocol (eg. TCP/UDP)
 * @padding: explicit pad byte (judging by its name); it rounds the struct
 *        up to four bytes
 */
struct flow_dissector_key_basic {
        __be16        n_proto;
        u8        ip_proto;
        u8        padding;
};

/**
 * struct flow_dissector_key_tags:
 * @flow_label: flow label value.  NOTE(review): presumably the IPv6 flow
 *        label, given FLOW_DISSECTOR_KEY_FLOW_LABEL maps to this struct;
 *        confirm.
 */
struct flow_dissector_key_tags {
        u32        flow_label;
};

/**
 * struct flow_dissector_key_vlan:
 * @vlan_id: 12-bit field
 * @vlan_dei: 1-bit field
 * @vlan_priority: 3-bit field
 * @vlan_tci: 16-bit value sharing storage with the three bit-fields above
 *        (they are members of the same anonymous union)
 * @vlan_tpid: tag protocol identifier
 * @vlan_eth_type: NOTE(review): presumably the EtherType following the
 *        tag; confirm.
 * @padding: explicit padding (judging by its name)
 */
struct flow_dissector_key_vlan {
        union {
                struct {
                        u16        vlan_id:12,
                                vlan_dei:1,
                                vlan_priority:3;
                };
                __be16        vlan_tci;
        };
        __be16        vlan_tpid;
        __be16        vlan_eth_type;
        u16        padding;
};

/**
 * struct flow_dissector_mpls_lse: one MPLS Label Stack Entry
 * @mpls_ttl: 8-bit field
 * @mpls_bos: 1-bit field
 * @mpls_tc: 3-bit field
 * @mpls_label: 20-bit field
 *
 * The four bit-fields together occupy one u32.
 */
struct flow_dissector_mpls_lse {
        u32        mpls_ttl:8,
                mpls_bos:1,
                mpls_tc:3,
                mpls_label:20;
};

/* Number of Label Stack Entries held by struct flow_dissector_key_mpls. */
#define FLOW_DIS_MPLS_MAX 7
/**
 * struct flow_dissector_key_mpls:
 * @ls: the Label Stack, FLOW_DIS_MPLS_MAX entries
 * @used_lses: bitmap with one bit per entry of @ls that is in use
 */
struct flow_dissector_key_mpls {
        struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */
        u8 used_lses; /* One bit set for each Label Stack Entry in use */
};

/* Mark Label Stack Entry @lse_index as in use in @mpls->used_lses. */
static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls,
                                          int lse_index)
{
        mpls->used_lses = mpls->used_lses | (1 << lse_index);
}

/* Size in bytes of the option buffer in struct flow_dissector_key_enc_opts. */
#define FLOW_DIS_TUN_OPTS_MAX 255
/**
 * struct flow_dissector_key_enc_opts:
 * @data: tunnel option data (buffer of FLOW_DIS_TUN_OPTS_MAX bytes)
 * @len: length of tunnel option data
 * @dst_opt_type: tunnel option type
 */
struct flow_dissector_key_enc_opts {
        u8 data[FLOW_DIS_TUN_OPTS_MAX];        /* Using IP_TUNNEL_OPTS_MAX is desired
                                         * here but seems difficult to #include
                                         */
        u8 len;
        __be16 dst_opt_type;
};

/**
 * struct flow_dissector_key_keyid:
 * @keyid: 32-bit big-endian key identifier.  This struct is named as the
 *        payload of the GRE_KEYID, MPLS_ENTROPY and ENC_KEYID key ids in
 *        enum flow_dissector_key_id.
 */
struct flow_dissector_key_keyid {
        __be32        keyid;
};

/**
 * struct flow_dissector_key_ipv4_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv4_addrs {
        /* (src,dst) must stay adjacent and in this order, as in the IP header */
        __be32 src;
        __be32 dst;
};

/**
 * struct flow_dissector_key_ipv6_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv6_addrs {
        /* (src,dst) must stay adjacent and in this order, as in the IP header */
        struct in6_addr src;
        struct in6_addr dst;
};

/**
 * struct flow_dissector_key_tipc:
 * @key: source node address combined with selector (32-bit, big-endian)
 */
struct flow_dissector_key_tipc {
        __be32 key;
};

/**
 * struct flow_dissector_key_addrs:
 * @v4addrs: IPv4 addresses
 * @v6addrs: IPv6 addresses
 * @tipckey: TIPC key
 *
 * The three members share storage (anonymous union), so only one of them
 * is meaningful at a time.
 */
struct flow_dissector_key_addrs {
        union {
                struct flow_dissector_key_ipv4_addrs v4addrs;
                struct flow_dissector_key_ipv6_addrs v6addrs;
                struct flow_dissector_key_tipc tipckey;
        };
};

/**
 * struct flow_dissector_key_arp:
 * @sip: Sender IP address
 * @tip: Target IP address
 * @op:  Operation
 * @sha: Sender hardware address
 * @tha: Target hardware address
 *
 * Operation, source and target addresses for an ARP header
 * for Ethernet hardware addresses and IPv4 protocol addresses.
 */
struct flow_dissector_key_arp {
        __u32 sip;
        __u32 tip;
        __u8 op;
        unsigned char sha[ETH_ALEN];
        unsigned char tha[ETH_ALEN];
};

/**
 * struct flow_dissector_key_ports:
 * @ports: port numbers of Transport header, as one 32-bit value that shares
 *        storage with @src and @dst (anonymous union)
 * @src: source port number
 * @dst: destination port number
 */
struct flow_dissector_key_ports {
        union {
                __be32 ports;
                struct {
                        __be16 src;
                        __be16 dst;
                };
        };
};

/**
 * struct flow_dissector_key_ports_range
 * @tp: port number from packet
 * @tp_min: min port number in range
 * @tp_max: max port number in range
 *
 * @tp shares storage with the (@tp_min, @tp_max) pair through the anonymous
 * union, so @tp and @tp_min start at the same address.
 */
struct flow_dissector_key_ports_range {
        union {
                struct flow_dissector_key_ports tp;
                struct {
                        struct flow_dissector_key_ports tp_min;
                        struct flow_dissector_key_ports tp_max;
                };
        };
};

/**
 * struct flow_dissector_key_icmp:
 * @type: ICMP type
 * @code: ICMP code
 * @id:   session identifier
 */
struct flow_dissector_key_icmp {
        struct {
                u8 type;
                u8 code;
        };
        u16 id;
};

/**
 * struct flow_dissector_key_eth_addrs:
 * @dst: destination Ethernet address
 * @src: source Ethernet address
 */
struct flow_dissector_key_eth_addrs {
        /* (dst,src) must stay adjacent and in this order, as in the ETH header */
        unsigned char dst[ETH_ALEN];
        unsigned char src[ETH_ALEN];
};

/**
 * struct flow_dissector_key_tcp:
 * @flags: TCP flags, stored as a 16-bit big-endian value
 */
struct flow_dissector_key_tcp {
        __be16 flags;
};

/**
 * struct flow_dissector_key_ip:
 * @tos: tos (one byte)
 * @ttl: ttl (one byte)
 */
struct flow_dissector_key_ip {
        __u8        tos;
        __u8        ttl;
};

/**
 * struct flow_dissector_key_meta:
 * @ingress_ifindex: ingress ifindex
 * @ingress_iftype: ingress interface type
 *
 * Metadata key; this struct is named as the payload of
 * FLOW_DISSECTOR_KEY_META in enum flow_dissector_key_id.
 */
struct flow_dissector_key_meta {
        int ingress_ifindex;
        u16 ingress_iftype;
};

/**
 * struct flow_dissector_key_ct:
 * @ct_state: conntrack state after converting with map
 * @ct_zone: conntrack zone
 * @ct_mark: conntrack mark
 * @ct_labels: conntrack labels (four u32 words)
 */
struct flow_dissector_key_ct {
        u16        ct_state;
        u16        ct_zone;
        u32        ct_mark;
        u32        ct_labels[4];
};

/**
 * struct flow_dissector_key_hash:
 * @hash: hash value (32 bits)
 */
struct flow_dissector_key_hash {
        u32 hash;
};

/*
 * Identifiers for the individual dissector keys.  The trailing comment on
 * each enumerator names the struct associated with that key.
 * FLOW_DISSECTOR_KEY_MAX comes last and therefore equals the number of key
 * ids; it sizes the offset[] array in struct flow_dissector.
 */
enum flow_dissector_key_id {
        FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
        FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
        FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
        FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
        FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
        FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */
        FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
        FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
        FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
        FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
        FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
        FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */

        FLOW_DISSECTOR_KEY_MAX,
};

/*
 * Single-bit flags (bits 0-2).
 * NOTE(review): presumably option flags passed by callers to the dissector
 * entry points, which are not declared in this part of the header - confirm.
 */
#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG                BIT(0)
#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL        BIT(1)
#define FLOW_DISSECTOR_F_STOP_AT_ENCAP                BIT(2)

/**
 * struct flow_dissector_key: pairs a key id with its location
 * @key_id: which dissector key this entry describes
 * @offset: offset of the matching struct flow_dissector_key_* within the
 *        target struct
 */
struct flow_dissector_key {
        enum flow_dissector_key_id key_id;
        size_t offset; /* offset of struct flow_dissector_key_*
                          in the target struct */
};

/**
 * struct flow_dissector:
 * @used_keys: bitmap of key ids in use; bit N corresponds to key id N
 *        (tested as 1 << key_id by dissector_uses_key())
 * @offset: one entry per key id, indexed by enum flow_dissector_key_id.
 *        NOTE(review): presumably the offset of each key's struct in the
 *        target container; confirm against the code that fills it in.
 */
struct flow_dissector {
        unsigned int used_keys; /* each bit represents presence of one key id */
        unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
};

/**
 * struct flow_keys_basic: container holding only the control and basic keys
 * @control: control key
 * @basic: basic (protocol) key
 */
struct flow_keys_basic {
        struct flow_dissector_key_control control;
        struct flow_dissector_key_basic basic;
};

/*
 * Full flow key container.
 *
 * FLOW_KEYS_HASH_START_FIELD names the member (basic) whose offset is
 * reported by FLOW_KEYS_HASH_OFFSET; that member is aligned to
 * SIPHASH_ALIGNMENT. Member order is significant: 'addrs' must stay last.
 */
struct flow_keys {
        struct flow_dissector_key_control control;
#define FLOW_KEYS_HASH_START_FIELD basic
        struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT);
        struct flow_dissector_key_tags tags;
        struct flow_dissector_key_vlan vlan;
        struct flow_dissector_key_vlan cvlan;
        struct flow_dissector_key_keyid keyid;
        struct flow_dissector_key_ports ports;
        struct flow_dissector_key_icmp icmp;
        /* 'addrs' must be the last member */
        struct flow_dissector_key_addrs addrs;
};

/*
 * Byte offset, within struct flow_keys, of the member named by
 * FLOW_KEYS_HASH_START_FIELD.
 */
#define FLOW_KEYS_HASH_OFFSET                \
        offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)

__be32 flow_get_u32_src(const struct flow_keys *flow);
__be32 flow_get_u32_dst(const struct flow_keys *flow);

extern struct flow_dissector flow_keys_dissector;
extern struct flow_dissector flow_keys_basic_dissector;

/* struct flow_keys_digest:
 *
 * This structure is used to hold a digest of the full flow keys. This is a
 * larger "hash" of a flow to allow definitively matching specific flows where
 * the 32-bit skb->hash is not large enough. The size is limited to 16 bytes so
 * that it can be used in the CB of an skb (see sch_choke for an example).
 */
#define FLOW_KEYS_DIGEST_LEN        16
struct flow_keys_digest {
        /* Opaque digest bytes; exactly FLOW_KEYS_DIGEST_LEN of them. */
        u8        data[FLOW_KEYS_DIGEST_LEN];
};

void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow);

/*
 * Report whether @keys carries either a non-zero ports value or a
 * non-zero flow label.
 */
static inline bool flow_keys_have_l4(const struct flow_keys *keys)
{
        if (keys->ports.ports)
                return true;

        return keys->tags.flow_label != 0;
}

u32 flow_hash_from_keys(struct flow_keys *keys);
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           void *data, int thoff, int hlen);

/**
 * dissector_uses_key - test whether a dissector has a given key enabled
 * @flow_dissector: dissector whose used_keys bitmask is tested
 * @key_id: key id; selects the bit position inside used_keys
 *
 * The mask is built from an unsigned constant so that a key id of 31 does
 * not shift a 1 into the sign bit of a signed int, which is undefined
 * behaviour in C. used_keys is an unsigned int, so the mask type now
 * matches it.
 *
 * Return: true if the bit for @key_id is set in @flow_dissector->used_keys.
 */
static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
                                      enum flow_dissector_key_id key_id)
{
        return flow_dissector->used_keys & (1U << key_id);
}

/*
 * Locate the storage for @key_id inside @target_container.
 *
 * The result is @target_container advanced by the byte offset that
 * @flow_dissector records for @key_id. No check is made that the key is
 * actually in use or that @key_id is within the offset[] array.
 */
static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
                                              enum flow_dissector_key_id key_id,
                                              void *target_container)
{
        char *base = target_container;

        return base + flow_dissector->offset[key_id];
}

/*
 * Bundle of pointers describing one BPF flow-dissection request.
 * NOTE(review): data/data_end presumably delimit the packet bytes being
 * dissected - confirm against the users of this struct.
 */
struct bpf_flow_dissector {
        struct bpf_flow_keys        *flow_keys;
        const struct sk_buff        *skb;
        void                        *data;
        void                        *data_end;
};

/*
 * Clear both key structures to all-zero bytes.
 *
 * @key_control: control key to clear
 * @key_basic:   basic key to clear
 */
static inline void
flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
                         struct flow_dissector_key_basic *key_basic)
{
        /* The two clears are independent; memset also zeroes any padding. */
        memset(key_basic, 0, sizeof(*key_basic));
        memset(key_control, 0, sizeof(*key_control));
}

#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog);
#endif /* CONFIG_BPF_SYSCALL */

#endif




















    1 





















    2 
    2 






    2 






    2 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr_security.c
 * Handler for storing security labels as extended attributes.
 */

#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"

/*
 * ->get callback of the security xattr handler.
 *
 * Forwards to ext4_xattr_get() with the EXT4_XATTR_INDEX_SECURITY name
 * index and returns its result unchanged. @handler and @unused are not
 * consulted.
 */
static int
ext4_xattr_security_get(const struct xattr_handler *handler,
                        struct dentry *unused, struct inode *inode,
                        const char *name, void *buffer, size_t size)
{
        int ret;

        ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY,
                             name, buffer, size);
        return ret;
}

/*
 * ->set callback of the security xattr handler.
 *
 * Forwards to ext4_xattr_set() with the EXT4_XATTR_INDEX_SECURITY name
 * index, passing @value, @size and @flags through, and returns its result
 * unchanged. @handler and @unused are not consulted.
 */
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
                        struct dentry *unused, struct inode *inode,
                        const char *name, const void *value,
                        size_t size, int flags)
{
        int ret;

        ret = ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
                             name, value, size, flags);
        return ret;
}

/*
 * Callback handed to security_inode_init_security() by
 * ext4_init_security().
 *
 * Walks @xattr_array up to the entry whose name is NULL and creates each
 * attribute (XATTR_CREATE) in the security index using the journal handle
 * passed through @fs_info. Stops at the first negative result.
 *
 * Returns the result of the last ext4_xattr_set_handle() call, or 0 when
 * the array is empty.
 */
static int
ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
                void *fs_info)
{
        handle_t *handle = fs_info;
        const struct xattr *cur = xattr_array;
        int err = 0;

        while (cur->name != NULL) {
                err = ext4_xattr_set_handle(handle, inode,
                                            EXT4_XATTR_INDEX_SECURITY,
                                            cur->name, cur->value,
                                            cur->value_len, XATTR_CREATE);
                if (err < 0)
                        return err;
                cur++;
        }

        return err;
}

/*
 * Ask the security layer to initialise security attributes for @inode
 * (created in @dir under the name @qstr). The attributes it supplies are
 * written by ext4_initxattrs(), which receives @handle as its fs_info
 * argument.
 *
 * Returns whatever security_inode_init_security() returns.
 */
int
ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
                   const struct qstr *qstr)
{
        int err;

        err = security_inode_init_security(inode, dir, qstr,
                                           ext4_initxattrs, handle);
        return err;
}

/*
 * Handler for attributes under XATTR_SECURITY_PREFIX; reads and writes go
 * through the get/set helpers defined in this file.
 */
const struct xattr_handler ext4_xattr_security_handler = {
        .set    = ext4_xattr_security_set,
        .get    = ext4_xattr_security_get,
        .prefix = XATTR_SECURITY_PREFIX,
};



























    3 
    3 





    3 
    3 
    3 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

/*
 * Local lock type. It only has members when CONFIG_DEBUG_LOCK_ALLOC is
 * enabled: a lockdep map and the task recorded as owner by
 * local_lock_acquire(). Otherwise the struct is empty.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
} local_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Designated-initializer fragment for the debug members of local_lock_t:
 * names the lockdep map after @lockname, marks it as an LD_WAIT_CONFIG /
 * LD_LOCK_PERCPU lock and clears the owner. Ends with a trailing comma so
 * it can be dropped straight into a brace initializer.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/*
 * Debug bookkeeping for taking @l: report the acquisition to lockdep,
 * warn if an owner is already recorded, then record the current task as
 * owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug bookkeeping for dropping @l: warn if the recorded owner is not the
 * current task, clear the owner, then report the release to lockdep.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/* Reset the debug owner field of @l; used by __local_lock_init(). */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no debug state: the initializer
 * fragment expands to nothing and the helpers are empty.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Static initializer for a local_lock_t; the braces contain only the debug
 * fragment, so they are empty when CONFIG_DEBUG_LOCK_ALLOC is off.
 */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }

/*
 * Runtime initializer for the local_lock_t pointed to by @lock.
 *
 * Gives each expansion site its own static lock class key, checks that no
 * held lock lives in the memory being initialized, sets up the lockdep map
 * (named after the @lock expression text) and clears the debug owner.
 *
 * @lock is parenthesized wherever it is used as an expression so that an
 * argument such as "p + 1" is cast and dereferenced as a whole. Note that
 * @lock is still evaluated more than once; pass an expression without side
 * effects.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Lock side. Each variant first establishes its protection (preemption
 * off, interrupts off, or interrupts off with the previous state saved in
 * @flags) and only then runs the acquire bookkeeping on this CPU's
 * instance of @lock.
 */

/* Disable preemption, then acquire. */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/* Disable interrupts, then acquire. */
#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/* Save interrupt state into @flags and disable interrupts, then acquire. */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * Unlock side, mirroring the lock variants in reverse order: the release
 * bookkeeping on this CPU's instance of @lock runs first, then the
 * protection is dropped.
 */

/* Release, then re-enable preemption. */
#define __local_unlock(lock)                                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                preempt_enable();                                \
        } while (0)

/* Release, then enable interrupts. */
#define __local_unlock_irq(lock)                                \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_enable();                                \
        } while (0)

/* Release, then restore the interrupt state saved in @flags. */
#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_restore(flags);                        \
        } while (0)





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UTSNAME_H
#define _LINUX_UTSNAME_H


#include <linux/sched.h>
#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
#include <uapi/linux/utsname.h>

/*
 * Selects one uts field; this is the argument type of uts_proc_notify().
 * NOTE(review): the enumerators presumably map one-to-one onto the
 * ostype/osrelease/version/hostname/domainname sysctl entries - confirm.
 */
enum uts_proc {
        UTS_PROC_OSTYPE,
        UTS_PROC_OSRELEASE,
        UTS_PROC_VERSION,
        UTS_PROC_HOSTNAME,
        UTS_PROC_DOMAINNAME,
};

struct user_namespace;
extern struct user_namespace init_user_ns;

/*
 * A UTS namespace.
 *
 * kref is the reference count taken/dropped by get_uts_ns()/put_uts_ns();
 * name is the utsname data returned by utsname() for the current task.
 * The layout is randomized (__randomize_layout), so do not rely on member
 * order.
 */
struct uts_namespace {
        struct kref kref;
        struct new_utsname name;
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        struct ns_common ns;
} __randomize_layout;
extern struct uts_namespace init_uts_ns;

#ifdef CONFIG_UTS_NS
/* Take a reference on @ns by incrementing its kref. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
        kref_get(&ns->kref);
}

extern struct uts_namespace *copy_utsname(unsigned long flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns);
extern void free_uts_ns(struct kref *kref);

/*
 * Drop a reference on @ns; free_uts_ns() is passed as the kref release
 * callback.
 */
static inline void put_uts_ns(struct uts_namespace *ns)
{
        kref_put(&ns->kref, free_uts_ns);
}

void uts_ns_init(void);
#else
/* !CONFIG_UTS_NS: no reference counting is done; both helpers are no-ops. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
}

static inline void put_uts_ns(struct uts_namespace *ns)
{
}

/*
 * !CONFIG_UTS_NS variant: a new namespace cannot be created.
 *
 * Returns @old_ns unchanged unless @flags asks for CLONE_NEWUTS, in which
 * case ERR_PTR(-EINVAL) is returned. @user_ns is unused.
 */
static inline struct uts_namespace *copy_utsname(unsigned long flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
        if (!(flags & CLONE_NEWUTS))
                return old_ns;

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_UTS_NS: nothing to initialise. */
static inline void uts_ns_init(void)
{
}
#endif

#ifdef CONFIG_PROC_SYSCTL
extern void uts_proc_notify(enum uts_proc proc);
#else
/* !CONFIG_PROC_SYSCTL: notification is a no-op; @proc is ignored. */
static inline void uts_proc_notify(enum uts_proc proc)
{
}
#endif

/*
 * Return the utsname data of the current task's UTS namespace, reached
 * through current->nsproxy->uts_ns. Neither pointer on that path is
 * checked for NULL here.
 */
static inline struct new_utsname *utsname(void)
{
        return &current->nsproxy->uts_ns->name;
}

/* Return the utsname data of the initial UTS namespace (init_uts_ns). */
static inline struct new_utsname *init_utsname(void)
{
        return &init_uts_ns.name;
}

extern struct rw_semaphore uts_sem;

#endif /* _LINUX_UTSNAME_H */
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KERNEL_STAT_H
#define _LINUX_KERNEL_STAT_H

#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/vtime.h>
#include <asm/irq.h>

/*
 * 'kernel_stat.h' contains the definitions needed for doing
 * some kernel statistics (CPU usage, context switches ...),
 * used by rstatd/perfmeter
 */

/*
 * Index into kernel_cpustat.cpustat[]. NR_STATS is not a statistic; it is
 * the number of entries and sizes that array, so it must stay last.
 */
enum cpu_usage_stat {
        CPUTIME_USER,
        CPUTIME_NICE,
        CPUTIME_SYSTEM,
        CPUTIME_SOFTIRQ,
        CPUTIME_IRQ,
        CPUTIME_IDLE,
        CPUTIME_IOWAIT,
        CPUTIME_STEAL,
        CPUTIME_GUEST,
        CPUTIME_GUEST_NICE,
        NR_STATS,
};

/*
 * CPU time counters, one u64 per enum cpu_usage_stat value.
 * NOTE(review): the time unit is not visible here - confirm.
 */
struct kernel_cpustat {
        u64 cpustat[NR_STATS];
};

/*
 * Interrupt counters: a running total of interrupts plus one counter per
 * softirq number (indexed 0..NR_SOFTIRQS-1).
 */
struct kernel_stat {
        unsigned long irqs_sum;
        unsigned int softirqs[NR_SOFTIRQS];
};

/* Per-CPU instances of the two statistics structures. */
DECLARE_PER_CPU(struct kernel_stat, kstat);
DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

/*
 * The *_this_cpu forms yield a pointer to the executing CPU's instance and
 * must have preemption disabled for this to be meaningful. The *_cpu(cpu)
 * forms name the instance belonging to an explicitly given CPU.
 */
#define kstat_this_cpu this_cpu_ptr(&kstat)
#define kcpustat_this_cpu this_cpu_ptr(&kernel_cpustat)
#define kstat_cpu(cpu) per_cpu(kstat, cpu)
#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)

extern unsigned long long nr_context_switches(void);

extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
extern void kstat_incr_irq_this_cpu(unsigned int irq);

/*
 * Increment the executing CPU's counter for softirq number @irq.
 * @irq indexes kstat.softirqs[] and is not range-checked here.
 */
static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
{
        __this_cpu_inc(kstat.softirqs[irq]);
}

/*
 * Read the counter for softirq number @irq on CPU @cpu.
 * @irq indexes the softirqs[] array and is not range-checked here.
 */
static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
{
        return kstat_cpu(cpu).softirqs[irq];
}

/*
 * Number of interrupts per specific IRQ source, since bootup
 */
extern unsigned int kstat_irqs(unsigned int irq);
extern unsigned int kstat_irqs_usr(unsigned int irq);

/*
 * Number of interrupts per cpu, since bootup.
 * Returns the irqs_sum field of @cpu's kernel_stat instance.
 */
static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu)
{
        return kstat_cpu(cpu).irqs_sum;
}

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern u64 kcpustat_field(struct kernel_cpustat *kcpustat,
                          enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
#else
/*
 * !CONFIG_VIRT_CPU_ACCOUNTING_GEN variant: return the @usage entry of
 * @kcpustat directly. @cpu is unused in this variant.
 */
static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat,
                                 enum cpu_usage_stat usage, int cpu)
{
        return kcpustat->cpustat[usage];
}

/*
 * !CONFIG_VIRT_CPU_ACCOUNTING_GEN variant: copy @cpu's whole
 * kernel_cpustat instance into @dst with a plain struct assignment.
 */
static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
        *dst = kcpustat_cpu(cpu);
}

#endif

extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
extern void account_system_time(struct task_struct *, int, u64);
extern void account_system_index_time(struct task_struct *, u64,
                                      enum cpu_usage_stat);
extern void account_steal_time(u64);
extern void account_idle_time(u64);
extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE variant: the tick only flushes the
 * vtime state of @tsk. @user is unused in this variant.
 */
static inline void account_process_tick(struct task_struct *tsk, int user)
{
        vtime_flush(tsk);
}
#else
extern void account_process_tick(struct task_struct *, int user);
#endif

extern void account_idle_ticks(unsigned long ticks);

#endif /* _LINUX_KERNEL_STAT_H */






















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X_TABLES_H
#define _X_TABLES_H


#include <linux/netdevice.h>
#include <linux/static_key.h>
#include <linux/netfilter.h>
#include <uapi/linux/netfilter/x_tables.h>

/*
 * Test a struct->invflags and a boolean for inequality: XOR @boolean with
 * "is @flag set in (ptr)->invflags" (normalised to 0/1 by !!). The result
 * is only a clean 0/1 if @boolean itself is 0 or 1.
 */
#define NF_INVF(ptr, flag, boolean)                                        \
        ((boolean) ^ !!((ptr)->invflags & (flag)))

/**
 * struct xt_action_param - parameters for matches/targets
 *
 * @match:        the match extension
 * @target:        the target extension
 * @matchinfo:        per-match data
 * @targinfo:        per-target data
 * @state:        pointer to hook state this packet came from
 * @fragoff:        packet is a fragment, this is the data offset
 * @thoff:        position of transport header relative to skb->data
 *
 * @match/@target share one union and @matchinfo/@targinfo share another,
 * so only one member of each pair is meaningful at a time.
 *
 * Fields written to by extensions:
 *
 * @hotdrop:        drop packet if we had inspection problems
 */
struct xt_action_param {
        union {
                const struct xt_match *match;
                const struct xt_target *target;
        };
        union {
                const void *matchinfo, *targinfo;
        };
        const struct nf_hook_state *state;
        int fragoff;
        unsigned int thoff;
        bool hotdrop;
};

/* Return the net pointer stored in the hook state of @par. */
static inline struct net *xt_net(const struct xt_action_param *par)
{
        return par->state->net;
}

/* Return the 'in' device pointer from the hook state of @par. */
static inline struct net_device *xt_in(const struct xt_action_param *par)
{
        return par->state->in;
}

/*
 * Return the name of the 'in' device from the hook state of @par.
 * state->in is dereferenced unconditionally, so the caller must know it
 * is non-NULL.
 */
static inline const char *xt_inname(const struct xt_action_param *par)
{
        return par->state->in->name;
}

/* Return the 'out' device pointer from the hook state of @par. */
static inline struct net_device *xt_out(const struct xt_action_param *par)
{
        return par->state->out;
}

/*
 * Return the name of the 'out' device from the hook state of @par.
 * state->out is dereferenced unconditionally, so the caller must know it
 * is non-NULL.
 */
static inline const char *xt_outname(const struct xt_action_param *par)
{
        return par->state->out->name;
}

/* Return the hook number stored in the hook state of @par. */
static inline unsigned int xt_hooknum(const struct xt_action_param *par)
{
        return par->state->hook;
}

/* Return the pf (protocol family) field of the hook state of @par. */
static inline u_int8_t xt_family(const struct xt_action_param *par)
{
        return par->state->pf;
}

/**
 * struct xt_mtchk_param - parameters for match extensions'
 * checkentry functions
 *
 * @net:        network namespace through which the check was invoked
 * @table:        name of the table into which the rule is being inserted
 * @entryinfo:        the family-specific rule data
 *                 (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry)
 * @match:        struct xt_match through which this function was invoked
 * @matchinfo:        per-match data
 * @hook_mask:        via which hooks the new rule is reachable
 * @family:        NOTE(review): presumably the address/protocol family of
 *                 the rule - confirm
 * @nft_compat:        NOTE(review): presumably set when invoked through the
 *                 nftables compat layer - confirm
 */
struct xt_mtchk_param {
        struct net *net;
        const char *table;
        const void *entryinfo;
        const struct xt_match *match;
        void *matchinfo;
        unsigned int hook_mask;
        u_int8_t family;
        bool nft_compat;
};

/**
 * struct xt_mtdtor_param - match destructor parameters
 * Fields have the same meaning as in struct xt_mtchk_param.
 */
struct xt_mtdtor_param {
        struct net *net;
        const struct xt_match *match;
        void *matchinfo;
        u_int8_t family;
};

/**
 * struct xt_tgchk_param - parameters for target extensions'
 * checkentry functions
 *
 * @entryinfo:        the family-specific rule data
 *                 (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry)
 * @target:        struct xt_target whose checkentry is being invoked
 * @targinfo:        per-target data
 *
 * The remaining fields mirror struct xt_mtchk_param.
 */
struct xt_tgchk_param {
        struct net *net;
        const char *table;
        const void *entryinfo;
        const struct xt_target *target;
        void *targinfo;
        unsigned int hook_mask;
        u_int8_t family;
        bool nft_compat;
};

/*
 * Target destructor parameters; this is the argument type of
 * xt_target.destroy. Fields mirror struct xt_tgchk_param.
 */
struct xt_tgdtor_param {
        struct net *net;
        const struct xt_target *target;
        void *targinfo;
        u_int8_t family;
};

/* Registration record for a match extension (see xt_register_match()). */
struct xt_match {
        struct list_head list;

        const char name[XT_EXTENSION_MAXNAMELEN];
        u_int8_t revision;

        /* Return true or false: return false and set hotdrop in the
           struct xt_action_param to force immediate packet drop. */
        /* Arguments changed since 2.6.9, as this must now handle
           non-linear skb, using skb_header_pointer and
           skb_ip_make_writable. */
        bool (*match)(const struct sk_buff *skb,
                      struct xt_action_param *);

        /* Called when user tries to insert an entry of this type. */
        int (*checkentry)(const struct xt_mtchk_param *);

        /* Called when entry of this type deleted. */
        void (*destroy)(const struct xt_mtdtor_param *);
#ifdef CONFIG_COMPAT
        /* Called when userspace align differs from kernel space one */
        void (*compat_from_user)(void *dst, const void *src);
        int (*compat_to_user)(void __user *dst, const void *src);
#endif
        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        const char *table;
        unsigned int matchsize;
        unsigned int usersize;
#ifdef CONFIG_COMPAT
        unsigned int compatsize;
#endif
        unsigned int hooks;
        unsigned short proto;

        unsigned short family;
};

/* Registration record for a target extension (see xt_register_target()). */
struct xt_target {
        struct list_head list;

        const char name[XT_EXTENSION_MAXNAMELEN];
        u_int8_t revision;

        /* Returns verdict. Argument order changed since 2.6.9, as this
           must now handle non-linear skbs, using skb_copy_bits and
           skb_ip_make_writable. */
        unsigned int (*target)(struct sk_buff *skb,
                               const struct xt_action_param *);

        /* Called when user tries to insert an entry of this type:
           hook_mask is a bitmask of hooks from which it can be
           called. */
        /* Should return 0 on success or an error code otherwise (-Exxxx). */
        int (*checkentry)(const struct xt_tgchk_param *);

        /* Called when entry of this type deleted. */
        void (*destroy)(const struct xt_tgdtor_param *);
#ifdef CONFIG_COMPAT
        /* Called when userspace align differs from kernel space one */
        void (*compat_from_user)(void *dst, const void *src);
        int (*compat_to_user)(void __user *dst, const void *src);
#endif
        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        const char *table;
        unsigned int targetsize;
        unsigned int usersize;
#ifdef CONFIG_COMPAT
        unsigned int compatsize;
#endif
        unsigned int hooks;
        unsigned short proto;

        unsigned short family;
};

/* Descriptor of one table (see xt_register_table()). */
struct xt_table {
        struct list_head list;

        /* What hooks you will enter on */
        unsigned int valid_hooks;

        /* The table contents and bookkeeping (struct xt_table_info) */
        struct xt_table_info *private;

        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        u_int8_t af;                /* address/protocol family */
        int priority;                /* hook order */

        /* called when table is needed in the given netns */
        int (*table_init)(struct net *net);

        /* A unique name... */
        const char name[XT_TABLE_MAXNAMELEN];
};

#include <linux/netfilter_ipv4.h>

/*
 * The table itself: a fixed header followed by the rule data in the
 * flexible array member entries[], which is 8-byte aligned.
 */
struct xt_table_info {
        /* Size per table */
        unsigned int size;
        /* Number of entries: FIXME. --RR */
        unsigned int number;
        /* Initial number of entries. Needed for module usage count */
        unsigned int initial_entries;

        /* Entry points and underflows, one slot per NF_INET hook */
        unsigned int hook_entry[NF_INET_NUMHOOKS];
        unsigned int underflow[NF_INET_NUMHOOKS];

        /*
         * Number of user chains. Since tables cannot have loops, at most
         * @stacksize jumps (number of user chains) can possibly be made.
         */
        unsigned int stacksize;
        void ***jumpstack;

        unsigned char entries[] __aligned(8);
};

int xt_register_target(struct xt_target *target);
void xt_unregister_target(struct xt_target *target);
int xt_register_targets(struct xt_target *target, unsigned int n);
void xt_unregister_targets(struct xt_target *target, unsigned int n);

/*
 * Match registration. The single-object forms take one struct xt_match;
 * the plural forms take an array of @n of them. The parameter is named
 * 'match' throughout, consistent with the plural prototypes (prototype
 * parameter names do not affect callers).
 */
int xt_register_match(struct xt_match *match);
void xt_unregister_match(struct xt_match *match);
int xt_register_matches(struct xt_match *match, unsigned int n);
void xt_unregister_matches(struct xt_match *match, unsigned int n);

int xt_check_entry_offsets(const void *base, const char *elems,
                           unsigned int target_offset,
                           unsigned int next_offset);

int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks);

unsigned int *xt_alloc_entry_offsets(unsigned int size);
bool xt_find_jump_offset(const unsigned int *offsets,
                         unsigned int target, unsigned int size);

int xt_check_proc_name(const char *name, unsigned int size);

int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto,
                   bool inv_proto);
int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto,
                    bool inv_proto);

int xt_match_to_user(const struct xt_entry_match *m,
                     struct xt_entry_match __user *u);
int xt_target_to_user(const struct xt_entry_target *t,
                      struct xt_entry_target __user *u);
int xt_data_to_user(void __user *dst, const void *src,
                    int usersize, int size, int aligned_size);

void *xt_copy_counters(sockptr_t arg, unsigned int len,
                       struct xt_counters_info *info);
struct xt_counters *xt_counters_alloc(unsigned int counters);

struct xt_table *xt_register_table(struct net *net,
                                   const struct xt_table *table,
                                   struct xt_table_info *bootstrap,
                                   struct xt_table_info *newinfo);
void *xt_unregister_table(struct xt_table *table);

struct xt_table_info *xt_replace_table(struct xt_table *table,
                                       unsigned int num_counters,
                                       struct xt_table_info *newinfo,
                                       int *error);

struct xt_match *xt_find_match(u8 af, const char *name, u8 revision);
struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision);
struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision);
int xt_find_revision(u8 af, const char *name, u8 revision, int target,
                     int *err);

struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
                                    const char *name);
struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
                                            const char *name);
void xt_table_unlock(struct xt_table *t);

int xt_proto_init(struct net *net, u_int8_t af);
void xt_proto_fini(struct net *net, u_int8_t af);

struct xt_table_info *xt_alloc_table_info(unsigned int size);
void xt_free_table_info(struct xt_table_info *info);

/**
 * xt_recseq - recursive seqcount for netfilter use
 *
 * Packet processing changes the seqcount only if no recursion happened
 * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
 * because we use the normal seqcount convention :
 * Low order bit set to 1 if a writer is active.
 */
DECLARE_PER_CPU(seqcount_t, xt_recseq);

/* xt_tee_enabled - true if x_tables needs to handle reentrancy
 *
 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
 */
extern struct static_key xt_tee_enabled;

/**
 * xt_write_recseq_begin - start of a write section
 *
 * Begin packet processing: all readers must wait for the end.
 * 1) Must be called with preemption disabled
 * 2) softirqs must be disabled too (or we should use this_cpu_add())
 *
 * Returns:
 *  1 if no recursion on this cpu
 *  0 if recursion detected
 *
 * The return value must be handed to xt_write_recseq_end().
 */
static inline unsigned int xt_write_recseq_begin(void)
{
        unsigned int addend;

        /*
         * Low order bit of sequence is set if we already
         * called xt_write_recseq_begin().
         * So addend is 1 when the sequence is even and 0 when it is odd.
         */
        addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;

        /*
         * This is kind of a write_seqcount_begin(), but addend is 0 or 1.
         * We don't check the addend value to avoid a test and conditional
         * jump, since addend is most likely 1.
         */
        __this_cpu_add(xt_recseq.sequence, addend);
        /* Full barrier after the sequence update and before the caller's writes. */
        smp_mb();

        return addend;
}

/**
 * xt_write_recseq_end - end of a write section
 * @addend: return value from previous xt_write_recseq_begin()
 *
 * End packet processing: all readers can proceed.
 * 1) Must be called with preemption disabled
 * 2) softirqs must be disabled too (or we should use this_cpu_add())
 *
 * Adding @addend again makes the sequence even when this call pairs with
 * the outermost begin, and leaves it untouched for a nested one (0).
 */
static inline void xt_write_recseq_end(unsigned int addend)
{
        /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
        smp_wmb();
        __this_cpu_add(xt_recseq.sequence, addend);
}

/*
 * Masked comparison of two interface-name buffers, one unsigned long at a
 * time. This helper is performance critical and must be inlined.
 *
 * @_a, @_b: name buffers; @_mask: mask applied to their bitwise difference.
 * All three are read as arrays of unsigned long covering IFNAMSIZ bytes,
 * so they need to be suitably aligned and at least that large.
 *
 * Returns 0 when the buffers agree in every bit selected by the mask, and
 * a non-zero value otherwise.
 */
static inline unsigned long ifname_compare_aligned(const char *_a,
                                                   const char *_b,
                                                   const char *_mask)
{
        const unsigned long *lhs = (const unsigned long *)_a;
        const unsigned long *rhs = (const unsigned long *)_b;
        const unsigned long *msk = (const unsigned long *)_mask;
        unsigned long diff = (lhs[0] ^ rhs[0]) & msk[0];

        /* The unrolled steps below handle at most four words. */
        BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));

        /* Each condition is a compile-time constant. */
        if (IFNAMSIZ > sizeof(unsigned long))
                diff |= (lhs[1] ^ rhs[1]) & msk[1];
        if (IFNAMSIZ > 2 * sizeof(unsigned long))
                diff |= (lhs[2] ^ rhs[2]) & msk[2];
        if (IFNAMSIZ > 3 * sizeof(unsigned long))
                diff |= (lhs[3] ^ rhs[3]) & msk[3];

        return diff;
}

/*
 * Allocation state handed to xt_percpu_counter_alloc().
 * NOTE(review): @off and @mem presumably track an offset into a per-cpu
 * memory chunk that is carved up across successive calls - confirm against
 * the allocator implementation.
 */
struct xt_percpu_counter_alloc_state {
        unsigned int off;
        const char __percpu *mem;
};

bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
                             struct xt_counters *counter);
void xt_percpu_counter_free(struct xt_counters *cnt);

/*
 * Return the counter instance belonging to the current CPU.
 *
 * With a single possible CPU, @cnt itself is the counter.  Otherwise
 * cnt->pcnt holds a per-cpu address (stored as an integer) that is resolved
 * for the running CPU.
 */
static inline struct xt_counters *
xt_get_this_cpu_counter(struct xt_counters *cnt)
{
        if (nr_cpu_ids <= 1)
                return cnt;

        return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt);
}

/*
 * Return the counter instance belonging to CPU @cpu.
 *
 * With a single possible CPU, @cnt itself is the counter.  Otherwise
 * cnt->pcnt holds a per-cpu address (stored as an integer) that is resolved
 * for the requested CPU.
 */
static inline struct xt_counters *
xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
{
        if (nr_cpu_ids <= 1)
                return cnt;

        return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu);
}

struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);

#ifdef CONFIG_COMPAT
#include <net/compat.h>

/*
 * Compat layout of a match entry header.
 *
 * The union offers three views that all start with a 16-bit match_size:
 * "user" carries the extension name and revision, "kernel" carries a
 * compat_uptr_t, and the bare member exposes only the size.  The
 * variable-length match payload follows in data[].
 *
 * NOTE(review): presumably mirrors struct xt_entry_match as seen by 32-bit
 * userspace - confirm against the native definition.
 */
struct compat_xt_entry_match {
        union {
                struct {
                        u_int16_t match_size;
                        char name[XT_FUNCTION_MAXNAMELEN - 1];
                        u_int8_t revision;
                } user;
                struct {
                        u_int16_t match_size;
                        compat_uptr_t match;
                } kernel;
                u_int16_t match_size;
        } u;
        unsigned char data[];
};

/*
 * Compat layout of a target entry header.
 *
 * The union offers three views that all start with a 16-bit target_size:
 * "user" carries the extension name and revision, "kernel" carries a
 * compat_uptr_t, and the bare member exposes only the size.  The
 * variable-length target payload follows in data[].
 *
 * NOTE(review): presumably mirrors struct xt_entry_target as seen by 32-bit
 * userspace - confirm against the native definition.
 */
struct compat_xt_entry_target {
        union {
                struct {
                        u_int16_t target_size;
                        char name[XT_FUNCTION_MAXNAMELEN - 1];
                        u_int8_t revision;
                } user;
                struct {
                        u_int16_t target_size;
                        compat_uptr_t target;
                } kernel;
                u_int16_t target_size;
        } u;
        unsigned char data[];
};

/* FIXME: this works only for 32-bit tasks.
 * The whole approach needs to change so that the alignment is calculated
 * as a function of the current task's alignment. */

/* Packet/byte counter pair using compat_u64 for both members. */
struct compat_xt_counters {
        compat_u64 pcnt, bcnt;                        /* Packet and byte counters */
};

/*
 * Compat counters request/response header: a table name, the number of
 * counter entries, and then the entries themselves as a flexible array.
 */
struct compat_xt_counters_info {
        char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t num_counters;
        struct compat_xt_counters counters[];
};

/*
 * Helper type holding one member of each basic width (8/16/32 bit and
 * compat_u64).  Its alignment requirement is what COMPAT_XT_ALIGN() rounds
 * sizes up to.
 */
struct _compat_xt_align {
        __u8 u8;
        __u16 u16;
        __u32 u32;
        compat_u64 u64;
};

/* Round @s up to a multiple of the alignment of struct _compat_xt_align. */
#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))

void xt_compat_lock(u_int8_t af);
void xt_compat_unlock(u_int8_t af);

int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta);
void xt_compat_flush_offsets(u_int8_t af);
int xt_compat_init_offsets(u8 af, unsigned int number);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset);

int xt_compat_match_offset(const struct xt_match *match);
void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
                              unsigned int *size);
int xt_compat_match_to_user(const struct xt_entry_match *m,
                            void __user **dstptr, unsigned int *size);

int xt_compat_target_offset(const struct xt_target *target);
void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
                                unsigned int *size);
int xt_compat_target_to_user(const struct xt_entry_target *t,
                             void __user **dstptr, unsigned int *size);
int xt_compat_check_entry_offsets(const void *base, const char *elems,
                                  unsigned int target_offset,
                                  unsigned int next_offset);

#endif /* CONFIG_COMPAT */
#endif /* _X_TABLES_H */












































































































    1 


    1 
    1 


    1 

















    1 









    1 







    1 

















    1 












    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/fsync.c
 *
 *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
 *  from
 *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
 *                      Laboratoire MASI - Institut Blaise Pascal
 *                      Universite Pierre et Marie Curie (Paris VI)
 *  from
 *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4fs fsync primitive
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *
 *  Removed unnecessary code duplication for little endian machines
 *  and excessive __inline__s.
 *        Andi Kleen, 1997
 *
 * Major simplifications and cleanup - we only need to do the metadata, because
 * we can depend on generic_block_fdatasync() to sync the data blocks.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>

#include "ext4.h"
#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

/*
 * Without a journal, a just-created file is not safe on disk until its
 * parent directory (if that was freshly created too) has been written;
 * otherwise only writeback would flush it, leaving a large window in which
 * a crash may lose the file.  The same reasoning applies to the parent's
 * parent and so on, so walk upwards for as long as the inode under
 * inspection still carries EXT4_STATE_NEWENTRY.
 *
 * Returns 0 on success or the first error reported while syncing.
 */
static int ext4_sync_parent(struct inode *inode)
{
        struct dentry *cur, *parent;
        int err = 0;

        if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
                return 0;

        cur = d_find_any_alias(inode);
        if (!cur)
                return 0;

        for (;;) {
                if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
                        break;
                ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);

                /* Move one level up, trading our dentry reference for the parent's. */
                parent = dget_parent(cur);
                dput(cur);
                cur = parent;
                inode = cur->d_inode;

                /*
                 * The directory may have been removed by rmdir in the
                 * meantime.  Because we hold a reference to the inode
                 * through its dentry, it has not gone through
                 * ext4_evict_inode(): the inode and its blocks are still
                 * allocated, so flushing the metadata blocks and the inode
                 * is safe.
                 */
                err = sync_mapping_buffers(inode->i_mapping);
                if (!err)
                        err = sync_inode_metadata(inode, 1);
                if (err)
                        break;
        }

        dput(cur);
        return err;
}

/*
 * fsync path for a filesystem without a journal.
 *
 * Writes the inode's mapping buffers, then - if the inode is dirty (for
 * datasync: dirty in a way that matters for datasync) - the inode metadata
 * and finally any freshly created parent directories.  The first error
 * encountered is the one returned.  *@needs_barrier is set when the BARRIER
 * mount option is enabled and the inode metadata was written.
 */
static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
                                bool *needs_barrier)
{
        int ret, err;

        ret = sync_mapping_buffers(inode->i_mapping);

        /* Nothing more to write if the inode is clean enough for this call. */
        if (!(inode->i_state & I_DIRTY_ALL))
                return ret;
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return ret;

        /* The metadata is always written; an earlier error takes precedence. */
        err = sync_inode_metadata(inode, 1);
        if (ret == 0) {
                ret = err;
                if (ret == 0)
                        ret = ext4_sync_parent(inode);
        }

        if (test_opt(inode->i_sb, BARRIER))
                *needs_barrier = true;

        return ret;
}

/*
 * fsync path for a journalled filesystem (data not journalled).
 *
 * Picks the transaction id to wait for (the datasync tid for fdatasync, the
 * sync tid otherwise) and commits it via ext4_fc_commit().
 * *@needs_barrier is set when the journal uses barriers but
 * jbd2_trans_will_send_data_barrier() reports that this commit will not
 * send one.
 */
static int ext4_fsync_journal(struct inode *inode, bool datasync,
                             bool *needs_barrier)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        struct ext4_inode_info *ei = EXT4_I(inode);
        tid_t commit_tid;

        if (datasync)
                commit_tid = ei->i_datasync_tid;
        else
                commit_tid = ei->i_sync_tid;

        if (journal->j_flags & JBD2_BARRIER) {
                if (!jbd2_trans_will_send_data_barrier(journal, commit_tid))
                        *needs_barrier = true;
        }

        return ext4_fc_commit(journal, commit_tid);
}

/*
 * akpm: A new design for ext4_sync_file().
 *
 * Only sys_fsync(), sys_fdatasync() and sys_msync() end up here, and the
 * calling task cannot have a transaction open.  Some other task may have
 * dirtied this inode, so its data can be in any state within the
 * journalling system.
 *
 * The approach is simply to kick off a commit and wait for it, which
 * snapshots the inode to disk.
 *
 * Returns 0 on success or a negative errno.
 */
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        bool needs_barrier = false;
        int ret = 0;
        int err;

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

        J_ASSERT(ext4_journal_current_handle() == NULL);

        trace_ext4_sync_file_enter(file, datasync);

        if (sb_rdonly(inode->i_sb)) {
                /* Ensure we observe the updated s_mount_flags value */
                smp_rmb();
                if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
                        ret = -EROFS;
                goto out;
        }

        ret = file_write_and_wait_range(file, start, end);
        if (ret != 0)
                goto out;

        /*
         * data=writeback,ordered:
         *  The data has been synced by the caller's
         *  filemap_fdatawrite()/wait.  Metadata lives in the journal, so
         *  we wait here for the right transaction to commit.
         *
         * data=journal:
         *  filemap_fdatawrite has nothing to do (the buffers are clean).
         *  ext4_force_commit writes the file data into the journal and
         *  waits for it.  filemap_fdatawait() will then see many pages
         *  that the commit re-dirtied, which is fine: the blocks are safe
         *  in the journal, and that is all fsync() has to guarantee.
         */
        if (sbi->s_journal == NULL) {
                ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
        } else if (ext4_should_journal_data(inode)) {
                ret = ext4_force_commit(inode->i_sb);
        } else {
                ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
        }

        if (needs_barrier) {
                err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
                if (ret == 0)
                        ret = err;
        }
out:
        /* Always consume the writeback error; report it unless we already failed. */
        err = file_check_and_advance_wb_err(file);
        if (!ret)
                ret = err;
        trace_ext4_sync_file_exit(inode, ret);
        return ret;
}

































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IPV6_H
#define _IPV6_H

#include <uapi/linux/ipv6.h>

/*
 * Lengths derived from the hdrlen field of *p:
 *   ipv6_optlen():  (hdrlen + 1) * 8
 *   ipv6_authlen(): (hdrlen + 2) * 4
 */
#define ipv6_optlen(p)  (((p)->hdrlen+1) << 3)
#define ipv6_authlen(p) (((p)->hdrlen+2) << 2)
/*
 * This structure contains configuration options per IPv6 link.
 *
 * Several members exist only when the corresponding CONFIG_IPV6_* option is
 * enabled, so the layout depends on the kernel configuration.
 */
struct ipv6_devconf {
        __s32                forwarding;
        __s32                hop_limit;
        __s32                mtu6;
        __s32                accept_ra;
        __s32                accept_redirects;
        __s32                autoconf;
        __s32                dad_transmits;
        __s32                rtr_solicits;
        __s32                rtr_solicit_interval;
        __s32                rtr_solicit_max_interval;
        __s32                rtr_solicit_delay;
        __s32                force_mld_version;
        __s32                mldv1_unsolicited_report_interval;
        __s32                mldv2_unsolicited_report_interval;
        __s32                use_tempaddr;
        __s32                temp_valid_lft;
        __s32                temp_prefered_lft;
        __s32                regen_max_retry;
        __s32                max_desync_factor;
        __s32                max_addresses;
        __s32                accept_ra_defrtr;
        __s32                accept_ra_min_hop_limit;
        __s32                accept_ra_min_lft;
        __s32                accept_ra_pinfo;
        __s32                ignore_routes_with_linkdown;
        /* Router-preference members; the route-info pair needs both options. */
#ifdef CONFIG_IPV6_ROUTER_PREF
        __s32                accept_ra_rtr_pref;
        __s32                rtr_probe_interval;
#ifdef CONFIG_IPV6_ROUTE_INFO
        __s32                accept_ra_rt_info_min_plen;
        __s32                accept_ra_rt_info_max_plen;
#endif
#endif
        __s32                proxy_ndp;
        __s32                accept_source_route;
        __s32                accept_ra_from_local;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        __s32                optimistic_dad;
        __s32                use_optimistic;
#endif
        /* Unlike most other members this one is an atomic_t. */
#ifdef CONFIG_IPV6_MROUTE
        atomic_t        mc_forwarding;
#endif
        __s32                disable_ipv6;
        __s32                drop_unicast_in_l2_multicast;
        __s32                accept_dad;
        __s32                force_tllao;
        __s32           ndisc_notify;
        __s32                suppress_frag_ndisc;
        __s32                accept_ra_mtu;
        __s32                drop_unsolicited_na;
        /* A secret value plus a flag recording whether it has been set. */
        struct ipv6_stable_secret {
                bool initialized;
                struct in6_addr secret;
        } stable_secret;
        __s32                use_oif_addrs_only;
        __s32                keep_addr_on_down;
        __s32                seg6_enabled;
#ifdef CONFIG_IPV6_SEG6_HMAC
        __s32                seg6_require_hmac;
#endif
        __u32                enhanced_dad;
        __u32                addr_gen_mode;
        __s32                disable_policy;
        __s32           ndisc_tclass;
        __s32                rpl_seg_enabled;

        /* NOTE(review): presumably the sysctl registration handle - confirm. */
        struct ctl_table_header *sysctl_header;
};

/*
 * Pair of settings (disable_ipv6, autoconf); a single instance is exposed
 * as ipv6_defaults.
 * NOTE(review): presumably module-wide defaults applied to new devices -
 * confirm against the users of ipv6_defaults.
 */
struct ipv6_params {
        __s32 disable_ipv6;
        __s32 autoconf;
};
extern struct ipv6_params ipv6_defaults;
#include <linux/tcp.h>
#include <linux/udp.h>

#include <net/inet_sock.h>

/* Return the skb's network header interpreted as an IPv6 header. */
static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_network_header(skb);
}

/* Return the skb's inner network header interpreted as an IPv6 header. */
static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_inner_network_header(skb);
}

/*
 * Return the skb's transport header interpreted as an IPv6 header, i.e. for
 * packets whose transport-header position holds an IPv6 header.
 */
static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_transport_header(skb);
}

/*
 * Length of the transport part of an IPv6 packet: the payload length from
 * the IPv6 header plus the fixed header size, minus the skb's network
 * header length.
 */
static inline unsigned int ipv6_transport_len(const struct sk_buff *skb)
{
        unsigned int total = ntohs(ipv6_hdr(skb)->payload_len) +
                             sizeof(struct ipv6hdr);

        return total - skb_network_header_len(skb);
}

/*
 * This structure contains results of exthdrs parsing
 * as offsets from skb->nh.
 *
 * It is accessed through IP6CB(), which overlays it on skb->cb.
 */

struct inet6_skb_parm {
        int                        iif;
        __be16                        ra;
        __u16                        dst0;
        __u16                        srcrt;
        __u16                        dst1;
        __u16                        lastopt;
        __u16                        nhoff;
        __u16                        flags;
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
        __u16                        dsthao;
#endif
        __u16                        frag_max_size;

/* Bit values for the flags member above. */
#define IP6SKB_XFRM_TRANSFORMED        1
#define IP6SKB_FORWARDED        2
#define IP6SKB_REROUTED                4
#define IP6SKB_ROUTERALERT        8
#define IP6SKB_FRAGMENTED      16
#define IP6SKB_HOPBYHOP        32
#define IP6SKB_L3SLAVE         64
#define IP6SKB_JUMBOGRAM      128
};

/*
 * Report whether the IP6SKB_L3SLAVE bit is set in @flags.  Without
 * CONFIG_NET_L3_MASTER_DEV the answer is always false.
 */
#if defined(CONFIG_NET_L3_MASTER_DEV)
static inline bool ipv6_l3mdev_skb(__u16 flags)
{
        return (flags & IP6SKB_L3SLAVE) != 0;
}
#else
static inline bool ipv6_l3mdev_skb(__u16 flags)
{
        return false;
}
#endif

/*
 * View the skb control buffer (skb->cb) as IPv6 per-packet state:
 * IP6CB() as struct inet6_skb_parm, IP6CBMTU() as struct ip6_mtuinfo.
 * Both overlay the same storage.
 */
#define IP6CB(skb)        ((struct inet6_skb_parm*)((skb)->cb))
#define IP6CBMTU(skb)        ((struct ip6_mtuinfo *)((skb)->cb))

/*
 * Incoming interface index for @skb: skb->skb_iif when the packet is marked
 * as an L3 slave packet, otherwise the iif recorded in the control buffer.
 */
static inline int inet6_iif(const struct sk_buff *skb)
{
        if (ipv6_l3mdev_skb(IP6CB(skb)->flags))
                return skb->skb_iif;

        return IP6CB(skb)->iif;
}

/* True when IP6SKB_JUMBOGRAM is set in the skb's IPv6 control-buffer flags. */
static inline bool inet6_is_jumbogram(const struct sk_buff *skb)
{
        return (IP6CB(skb)->flags & IP6SKB_JUMBOGRAM) != 0;
}

/*
 * Return the control-buffer iif for an L3 slave packet, 0 otherwise (also
 * for a NULL @skb or when CONFIG_NET_L3_MASTER_DEV is disabled).
 *
 * Cannot be used in the TCP layer after tcp_v6_fill_cb.
 */
static inline int inet6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (!skb)
                return 0;
        if (ipv6_l3mdev_skb(IP6CB(skb)->flags))
                return IP6CB(skb)->iif;
#endif
        return 0;
}

/* IPv6 TCP request sock: wraps struct tcp_request_sock and adds no members. */
struct tcp6_request_sock {
        struct tcp_request_sock          tcp6rsk_tcp;
};

struct ipv6_mc_socklist;
struct ipv6_ac_socklist;
struct ipv6_fl_socklist;

/*
 * IPv6 cork state: a tx-options pointer plus hop limit and traffic class.
 * Embedded in struct ipv6_pinfo as the cork member.
 */
struct inet6_cork {
        struct ipv6_txoptions *opt;
        u8 hop_limit;
        u8 tclass;
};

/**
 * struct ipv6_pinfo - ipv6 private area
 *
 * In the struct sock hierarchy (tcp6_sock, udp6_sock, etc)
 * this _must_ be the last member, so that inet6_sk_generic
 * is able to calculate its offset from the base struct sock
 * by using the struct proto->slab_obj_size member. -acme
 */
struct ipv6_pinfo {
        struct in6_addr         saddr;
        struct in6_pktinfo        sticky_pktinfo;
        const struct in6_addr                *daddr_cache;
#ifdef CONFIG_IPV6_SUBTREES
        const struct in6_addr                *saddr_cache;
#endif

        __be32                        flow_label;
        __u32                        frag_size;

        /*
         * Packed in 16bits.
         * Omit one shift by putting the signed field at MSB.
         * (9-bit signed hop_limit + 7 unused bits.)
         */
#if defined(__BIG_ENDIAN_BITFIELD)
        __s16                        hop_limit:9;
        __u16                        __unused_1:7;
#else
        __u16                        __unused_1:7;
        __s16                        hop_limit:9;
#endif

        /* 9-bit signed mcast_hops + 6 unused bits + 1-bit mc_loop = 16 bits. */
#if defined(__BIG_ENDIAN_BITFIELD)
        /* Packed in 16bits. */
        __s16                        mcast_hops:9;
        __u16                        __unused_2:6,
                                mc_loop:1;
#else
        __u16                        mc_loop:1,
                                __unused_2:6;
        __s16                        mcast_hops:9;
#endif
        int                        ucast_oif;
        int                        mcast_oif;

        /*
         * pktoption flags: individually addressable through .bits, or as a
         * single 16-bit value through .all.
         */
        union {
                struct {
                        __u16        srcrt:1,
                                osrcrt:1,
                                rxinfo:1,
                                rxoinfo:1,
                                rxhlim:1,
                                rxohlim:1,
                                hopopts:1,
                                ohopopts:1,
                                dstopts:1,
                                odstopts:1,
                                rxflow:1,
                                rxtclass:1,
                                rxpmtu:1,
                                rxorigdstaddr:1,
                                recvfragsize:1;
                                /* 1 bits hole */
                } bits;
                __u16                all;
        } rxopt;

        /* sockopt flags (the bit-fields below add up to 16 bits) */
        __u16                        recverr:1,
                                sndflow:1,
                                repflow:1,
                                pmtudisc:3,
                                padding:1,        /* 1 bit hole */
                                srcprefs:3,        /* 001: prefer temporary address
                                                 * 010: prefer public address
                                                 * 100: prefer care-of address
                                                 */
                                dontfrag:1,
                                autoflowlabel:1,
                                autoflowlabel_set:1,
                                mc_all:1,
                                recverr_rfc4884:1,
                                rtalert_isolate:1;
        __u8                        min_hopcount;
        __u8                        tclass;
        __be32                        rcv_flowinfo;

        __u32                        dst_cookie;
        __u32                        rx_dst_cookie;

        /* Per-socket lists; the __rcu-annotated ones are RCU-managed pointers. */
        struct ipv6_mc_socklist        __rcu *ipv6_mc_list;
        struct ipv6_ac_socklist        *ipv6_ac_list;
        struct ipv6_fl_socklist __rcu *ipv6_fl_list;

        struct ipv6_txoptions __rcu        *opt;
        struct sk_buff                *pktoptions;
        struct sk_buff                *rxpmtu;
        struct inet6_cork        cork;
};

/* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
/*
 * Raw IPv6 socket: an inet_sock followed by raw-specific state (checksum
 * settings, ICMPv6 filter, ip6mr table) and the IPv6 private area.
 */
struct raw6_sock {
        /* inet_sock has to be the first member of raw6_sock */
        struct inet_sock        inet;
        __u32                        checksum;        /* perform checksum */
        __u32                        offset;                /* checksum offset  */
        struct icmp6_filter        filter;
        __u32                        ip6mr_table;
        /* ipv6_pinfo has to be the last member of raw6_sock, see inet6_sk_generic */
        struct ipv6_pinfo        inet6;
};

/* UDP over IPv6 socket: a udp_sock followed by the IPv6 private area. */
struct udp6_sock {
        struct udp_sock          udp;
        /* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */
        struct ipv6_pinfo inet6;
};

/* TCP over IPv6 socket: a tcp_sock followed by the IPv6 private area. */
struct tcp6_sock {
        struct tcp_sock          tcp;
        /* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */
        struct ipv6_pinfo inet6;
};

extern int inet6_sk_rebuild_header(struct sock *sk);

/* IPv6 TCP timewait sock: wraps struct tcp_timewait_sock and adds no members. */
struct tcp6_timewait_sock {
        struct tcp_timewait_sock   tcp6tw_tcp;
};

#if IS_ENABLED(CONFIG_IPV6)
bool ipv6_mod_enabled(void);

/*
 * Return the IPv6 private area of @__sk (inet_sk(__sk)->pinet6), or NULL
 * when @__sk is not a full socket.
 */
static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
{
        if (!sk_fullsock(__sk))
                return NULL;

        return inet_sk(__sk)->pinet6;
}

/*
 * Reinterpret @sk as a raw6_sock.  This is a plain pointer cast (it also
 * drops the const qualifier); no check is made that @sk really is a raw
 * IPv6 socket.
 */
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
        return (struct raw6_sock *)sk;
}

/*
 * __ipv6_only_sock()/ipv6_only_sock(): value of the socket's sk_ipv6only
 * member.  The argument is parenthesized so that any pointer expression
 * (e.g. a conditional expression) can be passed safely.
 *
 * ipv6_sk_rxinfo(): true when @sk is a PF_INET6 socket whose rxinfo
 * pktoption bit is set.  The result of inet6_sk() is dereferenced without a
 * NULL check, so for PF_INET6 sockets @sk must be a full socket.
 */
#define __ipv6_only_sock(sk)        ((sk)->sk_ipv6only)
#define ipv6_only_sock(sk)        (__ipv6_only_sock(sk))
#define ipv6_sk_rxinfo(sk)        ((sk)->sk_family == PF_INET6 && \
                                 inet6_sk(sk)->rxopt.bits.rxinfo)

/*
 * Return a pointer to the socket's sk_v6_rcv_saddr for AF_INET6 sockets,
 * NULL for any other family.
 */
static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
{
        return sk->sk_family == AF_INET6 ? &sk->sk_v6_rcv_saddr : NULL;
}

/* Return the socket's ipv6only setting, as read by ipv6_only_sock(). */
static inline int inet_v6_ipv6only(const struct sock *sk)
{
        /* ipv6only field is at same position for timewait and other sockets */
        return ipv6_only_sock(sk);
}
#else
/*
 * Stubs used when CONFIG_IPV6 is not enabled: the predicates evaluate to 0
 * or false and the accessors return NULL.
 */
#define __ipv6_only_sock(sk)        0
#define ipv6_only_sock(sk)        0
#define ipv6_sk_rxinfo(sk)        0

/* IPv6 support is compiled out, so it is never enabled. */
static inline bool ipv6_mod_enabled(void)
{
        return false;
}

/* No socket has an IPv6 private area in this configuration. */
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
        return NULL;
}

/* Stub: always NULL. */
static inline struct inet6_request_sock *
                        inet6_rsk(const struct request_sock *rsk)
{
        return NULL;
}

/* Stub: always NULL. */
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
        return NULL;
}

#define inet6_rcv_saddr(__sk)        NULL
#define tcp_twsk_ipv6only(__sk)                0
#define inet_v6_ipv6only(__sk)                0
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif /* _IPV6_H */






















































































    1 
    2 


































































    1 
    2 





























    1 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/rculist.h>
#include <linux/wait.h>
#include <linux/refcount.h>

/*
 * The kinds of id a struct pid can be used as.  PIDTYPE_MAX is not a type
 * itself; it is the number of types and sizes the per-type arrays (e.g.
 * struct pid's tasks[]).
 * NOTE(review): PID/TGID/PGID/SID presumably stand for task, thread group,
 * process group and session - confirm.
 */
enum pid_type
{
        PIDTYPE_PID,
        PIDTYPE_TGID,
        PIDTYPE_PGID,
        PIDTYPE_SID,
        PIDTYPE_MAX,
};

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

struct upid {
        int nr;                        /* numeric id as seen in @ns */
        struct pid_namespace *ns;        /* namespace in which @nr is valid */
};

/*
 * Kernel-internal, reference-counted process identifier.
 *
 * numbers[] holds one struct upid per namespace level: numbers[0] is the id
 * reported by pid_nr() and numbers[level] is the entry used by ns_of_pid()
 * and is_child_reaper().  The array is declared with one element but is
 * indexed up to @level.
 */
struct pid
{
        refcount_t count;        /* reference count, see get_pid()/put_pid() */
        unsigned int level;        /* index of the last valid numbers[] entry */
        spinlock_t lock;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        struct upid numbers[1];
};

extern struct pid init_struct_pid;

extern const struct file_operations pidfd_fops;

struct file;

extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
int pidfd_create(struct pid *pid, unsigned int flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);

/*
 * Take a reference on @pid and return it.  A NULL @pid is passed through
 * untouched.
 */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return pid;

        refcount_inc(&pid->count);
        return pid;
}

extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True when at least one task is attached to @pid under the given @type. */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        return hlist_empty(&pid->tasks[type]) == 0;
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
                        struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

struct pid_namespace;
extern struct pid_namespace init_pid_ns;

extern int pid_max;
extern int pid_max_min, pid_max_max;

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Look up a PID in the hash table and return it with its count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated, i.e. the namespace recorded at the pid's own level.
 *
 * NOTE:
 *         ns_of_pid() is meant to be called for a process (task) that has an
 *         attached 'struct pid' (see attach_pid(), detach_pid()), so @pid
 *         is expected to be non-NULL. For a NULL @pid the result is NULL
 *         and the caller has to cope with that.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        return pid ? pid->numbers[pid->level].ns : NULL;
}

/*
 * is_child_reaper returns true if the pid is the init process of its
 * namespace. Because this may be checked before pid_ns->child_reaper is
 * assigned in copy_process, the test is done on the pid number (id 1 at
 * the pid's own level) instead. @pid must not be NULL.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        int own_nr = pid->numbers[pid->level].nr;

        return own_nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Global id of @pid (the number stored at level 0); a NULL @pid yields 0. */
static inline pid_t pid_nr(struct pid *pid)
{
        return pid ? pid->numbers[0].nr : 0;
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * do_each_pid_task() and while_each_pid_task() are the opening and closing
 * halves of one loop construct and must always be used as a pair around the
 * loop body.
 *
 * The opening half starts a do { ... } while (0) block and, when @pid is not
 * NULL, walks pid->tasks[type] with hlist_for_each_entry_rcu(), binding each
 * entry (linked through pid_links[type]) to @task. It deliberately leaves
 * the loop's brace open; the closing half supplies the matching braces.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         * For PIDTYPE_PID the walk therefore stops after
                         * the first entry.
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if (type == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

/*
 * do_each_pid_thread() and while_each_pid_thread() form a paired loop
 * construct layered on do_each_pid_task()/while_each_pid_task().
 *
 * For every task found by the outer pid walk, the task is saved in tg___
 * and for_each_thread(tg___, task) runs the loop body with @task bound to
 * each thread in turn. The closing half ends the thread loop and restores
 * @task from tg___ before handing control back to the outer pid walk.
 */
#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)
#endif /* _LINUX_PID_H */




















































































































































































































































































































































    1 










    1 





































































    1 









































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 








    1 






































    1 





    1 


    1 















    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

/* The readable and writable event bits taken together */
#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

/*
 * NOTE(review): presumably the set of bits that may accompany
 * EPOLLEXCLUSIVE in a request - confirm against the epoll_ctl() path.
 */
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
                                EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

/* Largest number of struct epoll_event entries whose total size fits in INT_MAX bytes */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Sentinel pointer value. ep_events_available() does not count ->ovflist
 * as holding events while it is equal to this value.
 */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Combined size of one struct epitem and one struct eppoll_entry */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * (file, fd) pair naming a watched descriptor. It is the key of the
 * eventpoll RB tree: ep_set_ffd() fills it in and ep_cmp_ffd() orders it,
 * first by file pointer and then by fd.
 */
struct epoll_filefd {
        struct file *file;
        int fd;
} __packed;

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles. One node describes one call in progress inside
 * ep_call_nested().
 */
struct nested_call_node {
        /* Links this node into "struct nested_calls"->tasks_call_list */
        struct list_head llink;
        /* Cookie identifying the nested call; an equal cookie means a loop */
        void *cookie;
        /* Context of the instance that issued the call */
        void *ctx;
};

/*
 * This structure is used as collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
        /* List of "struct nested_call_node" for the calls currently running */
        struct list_head tasks_call_list;
        /* Taken by ep_call_nested() around every walk/update of the list */
        spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
        union {
                /* RB tree node links this structure to the eventpoll RB tree */
                struct rb_node rbn;
                /* Used to free the struct epitem */
                struct rcu_head rcu;
        };

        /* List header used to link this structure to the eventpoll ready list */
        struct list_head rdllink;

        /*
         * Works together with "struct eventpoll"->ovflist in keeping the
         * singly linked chain of items.
         */
        struct epitem *next;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /* Number of active wait queues attached to poll operations */
        int nwait;

        /* List containing poll wait queues */
        struct list_head pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* List header used to link this item to the "struct file" items list */
        struct list_head fllink;

        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;

        /* The structure that describes the interested events and the source fd */
        struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        struct mutex mtx;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors */
        struct list_head rdllist;

        /* Lock which protects rdllist and ovflist */
        rwlock_t lock;

        /* RB tree root used to store monitored fd structs */
        struct rb_root_cached rbr;

        /*
         * This is a singly linked list that chains all the "struct epitem"
         * whose events happened while transferring ready events to
         * userspace w/out holding ->lock. It holds EP_UNACTIVE_PTR when
         * no such chain is being kept.
         */
        struct epitem *ovflist;

        /* wakeup_source used when ep_scan_ready_list is running */
        struct wakeup_source *ws;

        /* The user that created the eventpoll descriptor */
        struct user_struct *user;

        /* NOTE(review): presumably the struct file backing this eventpoll - confirm */
        struct file *file;

        /* used to optimize loop detection check */
        u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
        /* used to track busy poll napi_id */
        unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* tracks wakeup nests for lockdep validation */
        u8 nests;
#endif
};

/*
 * Wait structure used by the poll hooks. Given a pointer to the embedded
 * "wait" entry, ep_pwq_from_wait() recovers this structure and
 * ep_item_from_wait() recovers its "base" item.
 */
struct eppoll_entry {
        /* List header used to link this structure to the "struct epitem" */
        struct list_head llink;

        /* The "base" pointer is set to the container "struct epitem" */
        struct epitem *base;

        /*
         * Wait queue item that will be linked to the target file wait
         * queue head.
         */
        wait_queue_entry_t wait;

        /* The wait queue head that linked the "wait" wait queue item */
        wait_queue_head_t *whead;
};

/*
 * Wrapper struct used by poll queueing. It pairs a poll_table with the
 * item being queued, so that ep_item_from_epqueue() can get from the
 * poll_table pointer back to the item.
 */
struct ep_pqueue {
        /* The poll_table embedded in this wrapper */
        poll_table pt;
        /* The item this poll_table is queueing for */
        struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
        /* NOTE(review): presumably the capacity of @events, in entries - confirm */
        int maxevents;
        /* Destination event array; the pointer refers to user space */
        struct epoll_event __user *events;
        /* NOTE(review): presumably the callback's result (count or error) - confirm */
        int res;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Bounds handed to the sysctl handler below through .extra1 / .extra2 */
static long long_zero;
static long long_max = LONG_MAX;

/*
 * Sysctl table exposing max_user_watches (see the "Configuration options
 * available inside /proc/sys/fs/epoll/" note above). The entry is mode
 * 0644 and is handled by proc_doulongvec_minmax with long_zero and
 * long_max as its limits; the empty entry terminates the table.
 */
struct ctl_table epoll_table[] = {
        {
                .procname        = "max_user_watches",
                .data                = &max_user_watches,
                .maxlen                = sizeof(max_user_watches),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = &long_zero,
                .extra2                = &long_max,
        },
        { }
};
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

/* Non-zero when @f is an eventpoll file, recognised by its f_op table */
static inline int is_file_epoll(struct file *f)
{
        const struct file_operations *fops = f->f_op;

        return fops == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
        ffd->file = file;
        ffd->fd = fd;
}

/*
 * Compare RB tree keys: order by file pointer first and, for the same
 * file, by descriptor number. The result is positive, negative or zero.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
        if (p1->file > p2->file)
                return +1;
        if (p1->file < p2->file)
                return -1;

        return p1->fd - p2->fd;
}

/* Tells us if the item is currently linked, i.e. its rdllink is not empty */
static inline int ep_is_linked(struct epitem *epi)
{
        if (list_empty(&epi->rdllink))
                return 0;

        return 1;
}

/* Get the "struct eppoll_entry" that embeds the wait queue entry @p */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
        struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

        return pwq;
}

/* Get the "struct epitem" behind a wait queue pointer, via its eppoll_entry */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
        struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

        return pwq->base;
}

/* Get the "struct epitem" carried by the ep_pqueue wrapping poll_table @p */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
        struct ep_pqueue *epq = container_of(p, struct ep_pqueue, pt);

        return epq->epi;
}

/* Prepare a nested_calls collector: initialise its lock and empty its list */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
        spin_lock_init(&ncalls->lock);
        INIT_LIST_HEAD(&ncalls->tasks_call_list);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Events count as available when the ready list is not empty or when
 * ->ovflist holds something other than EP_UNACTIVE_PTR.
 *
 * Returns: Returns a value different than zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
        if (!list_empty_careful(&ep->rdllist))
                return 1;

        return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Loop-end predicate given to napi_busy_loop(): stop once the eventpoll
 * passed in @p has events available or the busy loop has timed out.
 */
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
        struct eventpoll *ep = p;

        if (ep_events_available(ep))
                return true;

        return busy_loop_timeout(start_time);
}

/*
 * Busy poll if globally on and a usable NAPI ID has been recorded for this
 * eventpoll; the busy loop will return if need_resched or
 * ep_events_available. A non-blocking caller passes no loop-end callback.
 *
 * we must do our busy polling with irqs enabled
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);

        if (napi_id < MIN_NAPI_ID)
                return;
        if (!net_busy_loop_on())
                return;

        if (nonblock)
                napi_busy_loop(napi_id, NULL, ep);
        else
                napi_busy_loop(napi_id, ep_busy_loop_end, ep);
}

/*
 * Forget the NAPI ID recorded for busy polling. The store is only issued
 * when the field is currently non-zero.
 * NOTE(review): presumably this avoids a needless write to the shared
 * eventpoll structure - confirm.
 */
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
        if (ep->napi_id)
                ep->napi_id = 0;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 *
 * Nothing is recorded when busy looping is globally off, when the item's
 * file is not a socket, or when the socket has no sk attached.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
        struct socket *sock;
        struct sock *sk;
        struct eventpoll *ep;
        unsigned int napi_id;
        int err;

        if (!net_busy_loop_on())
                return;

        sock = sock_from_file(epi->ffd.file, &err);
        if (!sock)
                return;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);
        ep = epi->ep;

        /*
         * Only a real NAPI ID that differs from the one already held is
         * worth recording for use in the next busy poll.
         */
        if (napi_id >= MIN_NAPI_ID && napi_id != ep->napi_id)
                ep->napi_id = napi_id;
}

#else

/*
 * Without CONFIG_NET_RX_BUSY_POLL the three busy-poll helpers below are
 * empty, so their callers need no #ifdef of their own.
 */

/* No busy polling: do nothing */
static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}

/* No NAPI ID is tracked: nothing to reset */
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
}

/* No NAPI ID is tracked: nothing to record */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (by the meaning of same cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @nproc: Nested call core function pointer. It is invoked as
 *         (*nproc)(@priv, @cookie, call_nests), where call_nests is the
 *         number of calls already in progress for the same @ctx.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded or the same
 *          cookie is already being processed for @ctx (a loop).
 */
static int ep_call_nested(struct nested_calls *ncalls,
                          int (*nproc)(void *, void *, int), void *priv,
                          void *cookie, void *ctx)
{
        int error, call_nests = 0;
        unsigned long flags;
        struct list_head *lsthead = &ncalls->tasks_call_list;
        struct nested_call_node *tncur;
        /* Lives on this stack frame; linked in the list only during the call */
        struct nested_call_node tnode;

        spin_lock_irqsave(&ncalls->lock, flags);

        /*
         * Try to see if the current task is already inside this wakeup call.
         * We use a list here, since the population inside this set is always
         * very much limited.
         * Only entries belonging to @ctx are considered: a matching cookie
         * means a loop, any other entry bumps the nesting count.
         */
        list_for_each_entry(tncur, lsthead, llink) {
                if (tncur->ctx == ctx &&
                    (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
                        /*
                         * Ops ... loop detected or maximum nest level reached.
                         * We abort this wake by breaking the cycle itself.
                         */
                        error = -1;
                        goto out_unlock;
                }
        }

        /* Add the current task and cookie to the list */
        tnode.ctx = ctx;
        tnode.cookie = cookie;
        list_add(&tnode.llink, lsthead);

        /* The lock is not held across the callback */
        spin_unlock_irqrestore(&ncalls->lock, flags);

        /* Call the nested function */
        error = (*nproc)(priv, cookie, call_nests);

        /* Remove the current task from the list */
        spin_lock_irqsave(&ncalls->lock, flags);
        list_del(&tnode.llink);
out_unlock:
        spin_unlock_irqrestore(&ncalls->lock, flags);

        return error;
}

/*
 * As described in commit 0ccf831cb lockdep: annotate epoll
 * the use of wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked about the
 * recursion constraints. That is, no more than EP_MAX_POLLWAKE_NESTS, to
 * avoid stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Lockdep-aware wakeup of the waiters on ep->poll_wait.
 *
 * @ep:        eventpoll whose ->poll() waiters are woken.
 * @epi:       item that triggered the wakeup, or NULL when not called from
 *             ep_poll_callback().
 * @pollflags: extra poll bits OR-ed with EPOLLIN in the wakeup key.
 */
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             unsigned pollflags)
{
        unsigned long flags;
        u8 nests;

        /*
         * Pick the lockdep subclass for ep->poll_wait.lock. A per-cpu nest
         * counter would be the obvious choice, but we can recurse on
         * ep->poll_wait.lock and a non-raw spinlock may schedule() on -rt,
         * so a per-cpu value would not be protected. A per-eventpoll
         * "nests" field is used instead:
         *
         *  - no epi (not called from ep_poll_callback()): first level, 0;
         *  - wakeup source is not an epoll file: the source is depth 0,
         *    so we are at depth 1;
         *  - wakeup source is itself an epoll file: reuse the value it
         *    recorded, which is stable because that epoll still holds its
         *    own poll_wait.lock.
         *
         * While the lock is held we publish nests + 1 for the next level.
         */
        if (!epi) {
                nests = 0;
        } else if (is_file_epoll(epi->ffd.file)) {
                struct eventpoll *ep_src = epi->ffd.file->private_data;

                nests = ep_src->nests;
        } else {
                nests = 1;
        }

        spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
        ep->nests = nests + 1;
        wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
        ep->nests = 0;
        spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

/*
 * Variant used when CONFIG_DEBUG_LOCK_ALLOC is not set: no lockdep nesting
 * annotation is needed, so simply wake the waiters on ep->poll_wait with
 * EPOLLIN plus the caller supplied @pollflags. @epi is not used here.
 */
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             unsigned pollflags)
{
        wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
        wait_queue_head_t *whead;

        rcu_read_lock();
        /*
         * If it is cleared by POLLFREE, it should be rcu-safe.
         * If we read NULL we need a barrier paired with
         * smp_store_release() in ep_poll_callback(), otherwise
         * we rely on whead->lock.
         */
        whead = smp_load_acquire(&pwq->whead);
        if (whead)
                remove_wait_queue(whead, &pwq->wait);
        rcu_read_unlock();
}

/*
 * Drop every poll callback that @epi registered on its target file:
 * each entry of epi->pwqlist is unlinked, removed from its wait queue and
 * returned to pwq_cache. @ep is not used by the body.
 *
 * Must be called with "mtx" held (or "epmutex" if called from ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
        struct list_head *entries = &epi->pwqlist;

        while (!list_empty(entries)) {
                struct eppoll_entry *victim =
                        list_first_entry(entries, struct eppoll_entry, llink);

                list_del(&victim->llink);
                ep_remove_wait_queue(victim);
                kmem_cache_free(pwq_cache, victim);
        }
}

/*
 * Fetch the wakeup source attached to @epi (may be NULL).
 * Call only when ep->mtx is held; lockdep verifies this.
 */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
        struct wakeup_source *ws;

        ws = rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
        return ws;
}

/*
 * Mark @epi's wakeup source, if it has one, as active.
 * Call only when ep->mtx is held.
 */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
        struct wakeup_source *ws = ep_wakeup_source(epi);

        if (!ws)
                return;

        __pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
        return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
        struct wakeup_source *ws;

        rcu_read_lock();
        ws = rcu_dereference(epi->ws);
        if (ws)
                __pm_stay_awake(ws);
        rcu_read_unlock();
}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback. It is invoked with @ep, a private
 *         list holding the items stolen from ep->rdllist, and @priv.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls; used as the
 *         mutex_lock_nested() subclass when ep->mtx is taken here.
 * @ep_locked: caller already holds ep->mtx, so it is neither taken nor
 *             released by this function.
 *
 * Must be called with interrupts enabled (asserted via lockdep).
 *
 * Returns: The __poll_t value returned by the @sproc callback, unmodified.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
                              __poll_t (*sproc)(struct eventpoll *,
                                           struct list_head *, void *),
                              void *priv, int depth, bool ep_locked)
{
        __poll_t res;
        struct epitem *epi, *nepi;
        LIST_HEAD(txlist);

        lockdep_assert_irqs_enabled();

        /*
         * We need to lock this because we could be hit by
         * eventpoll_release_file() and epoll_ctl().
         */

        if (!ep_locked)
                mutex_lock_nested(&ep->mtx, depth);

        /*
         * Steal the ready list, and re-init the original one to the
         * empty list. Also, set ep->ovflist to NULL so that events
         * happening while looping w/out locks, are not lost. We cannot
         * have the poll callback to queue directly on ep->rdllist,
         * because we want the "sproc" callback to be able to do it
         * in a lockless way.
         */
        write_lock_irq(&ep->lock);
        list_splice_init(&ep->rdllist, &txlist);
        WRITE_ONCE(ep->ovflist, NULL);
        write_unlock_irq(&ep->lock);

        /*
         * Now call the callback function. ep->lock is not held here; the
         * callback works on the private "txlist" only.
         */
        res = (*sproc)(ep, &txlist, priv);

        write_lock_irq(&ep->lock);
        /*
         * During the time we spent inside the "sproc" callback, some
         * other events might have been queued by the poll callback.
         * We re-insert them inside the main ready-list here. Each visited
         * item gets its ->next reset to EP_UNACTIVE_PTR so that it can be
         * chained again later.
         */
        for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
                /*
                 * We need to check if the item is already in the list.
                 * During the "sproc" callback execution time, items are
                 * queued into ->ovflist but the "txlist" might already
                 * contain them, and the list_splice() below takes care of them.
                 */
                if (!ep_is_linked(epi)) {
                        /*
                         * ->ovflist is LIFO, so we have to reverse it in order
                         * to keep in FIFO: adding each item at the head of
                         * ->rdllist while walking newest-to-oldest does that.
                         */
                        list_add(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        /*
         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
         * releasing the lock, events will be queued in the normal way inside
         * ep->rdllist.
         */
        WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

        /*
         * Quickly re-inject items left on "txlist". They go in front of
         * the items that were just moved over from ->ovflist.
         */
        list_splice(&txlist, &ep->rdllist);
        __pm_relax(ep->ws);

        /* Something is (still) ready: wake up any task sleeping on ep->wq. */
        if (!list_empty(&ep->rdllist)) {
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
        }

        write_unlock_irq(&ep->lock);

        if (!ep_locked)
                mutex_unlock(&ep->mtx);

        return res;
}

/*
 * RCU callback: give the epitem that embeds @head back to epi_cache.
 */
static void epi_rcu_free(struct rcu_head *head)
{
        kmem_cache_free(epi_cache, container_of(head, struct epitem, rcu));
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held and with
 * interrupts enabled (asserted via lockdep).
 *
 * @ep:  eventpoll instance that owns @epi.
 * @epi: item to remove; it is freed after an RCU grace period.
 *
 * Always returns 0.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
        struct file *file = epi->ffd.file;

        lockdep_assert_irqs_enabled();

        /*
         * Removes poll wait queue hooks.
         */
        ep_unregister_pollwait(ep, epi);

        /* Remove the current item from the list of epoll hooks */
        spin_lock(&file->f_lock);
        list_del_rcu(&epi->fllink);
        spin_unlock(&file->f_lock);

        /* Unlink the item from this eventpoll's RB tree. */
        rb_erase_cached(&epi->rbn, &ep->rbr);

        /* If the item is queued on the ready list, take it off. */
        write_lock_irq(&ep->lock);
        if (ep_is_linked(epi))
                list_del_init(&epi->rdllink);
        write_unlock_irq(&ep->lock);

        wakeup_source_unregister(ep_wakeup_source(epi));
        /*
         * At this point it is safe to free the eventpoll item. Use the union
         * field epi->rcu, since we are trying to minimize the size of
         * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
         * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
         * use of the rbn field.
         */
        call_rcu(&epi->rcu, epi_rcu_free);

        /* One watch less charged to the owning user. */
        atomic_long_dec(&ep->user->epoll_watches);

        return 0;
}

/*
 * Tear down an eventpoll instance: wake pending ->poll() waiters, detach
 * every poll callback, remove every item and finally release @ep itself.
 */
static void ep_free(struct eventpoll *ep)
{
        struct rb_node *node;

        /* We need to release all tasks waiting for this file */
        if (waitqueue_active(&ep->poll_wait))
                ep_poll_safewake(ep, NULL, 0);

        /*
         * eventpoll_release_file() may run concurrently while the
         * "struct eventpoll" is being freed, so serialize against it with
         * "epmutex". "ep->mtx" is not needed for that: the epoll file is
         * on its way out and nobody else holds a reference to it.
         */
        mutex_lock(&epmutex);

        /*
         * First pass: walk the whole tree and unregister the poll
         * callbacks of each item.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                ep_unregister_pollwait(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = rb_next(node);
        }

        /*
         * Second pass: free each "struct epitem". No poll callback can be
         * lingering around any more, and "epmutex" keeps the file cleanup
         * code away, so "ep->lock" could be avoided. ep->mtx is not needed
         * either; it is taken only to prevent a lockdep warning.
         */
        mutex_lock(&ep->mtx);
        for (;;) {
                node = rb_first_cached(&ep->rbr);
                if (!node)
                        break;
                ep_remove(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
        }
        mutex_unlock(&ep->mtx);

        mutex_unlock(&epmutex);
        mutex_destroy(&ep->mtx);
        free_uid(ep->user);
        wakeup_source_unregister(ep->ws);
        kfree(ep);
}

/*
 * ->release() handler of the eventpoll file: frees the eventpoll stored in
 * file->private_data, if there is one. Always returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
        struct eventpoll *ep = file->private_data;

        if (!ep)
                return 0;

        ep_free(ep);
        return 0;
}

/*
 * Forward declarations: ep_item_poll() below refers to both of these
 * before their definitions appear.
 */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
                               void *priv);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt);

/*
 * Poll the file behind @epi and return the ready events, restricted to the
 * ones the item asked for.
 *
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 *
 * @epi:   item to poll.
 * @pt:    poll table; its _key is set to the item's event mask.
 * @depth: nesting depth handed to ep_scan_ready_list() when the target is
 *         itself an epoll file.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                                 int depth)
{
        struct file *target = epi->ffd.file;
        struct eventpoll *nested;
        bool mtx_held;
        __poll_t ready;

        pt->_key = epi->event.events;

        if (is_file_epoll(target)) {
                /* Nested epoll: scan the ready list of the inner instance. */
                nested = target->private_data;
                poll_wait(target, &nested->poll_wait, pt);
                /*
                 * A poll table whose queue proc is ep_ptable_queue_proc is
                 * treated as "the caller already holds the inner ep->mtx".
                 */
                mtx_held = pt && (pt->_qproc == ep_ptable_queue_proc);
                ready = ep_scan_ready_list(nested, ep_read_events_proc,
                                           &depth, depth, mtx_held);
        } else {
                ready = vfs_poll(target, pt);
        }

        return ready & epi->event.events;
}

/*
 * ep_scan_ready_list() callback reporting whether @ep is readable.
 *
 * @ep:   eventpoll being scanned.
 * @head: list of items stolen from the ready list.
 * @priv: points to an int holding the current nesting depth.
 *
 * Returns EPOLLIN | EPOLLRDNORM as soon as one item on @head reports a
 * requested event, 0 if none does. Items that turn out not to be ready
 * are dropped from the list on the way.
 */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
                               void *priv)
{
        int depth = *(int *)priv + 1;
        struct epitem *epi, *next;
        poll_table pt;

        init_poll_funcptr(&pt, NULL);

        list_for_each_entry_safe(epi, next, head, rdllink) {
                if (ep_item_poll(epi, &pt, depth))
                        return EPOLLIN | EPOLLRDNORM;

                /*
                 * The poll callback put this item on the ready list, but
                 * none of the events the caller requested is actually
                 * pending, so it can be removed here.
                 */
                __pm_relax(ep_wakeup_source(epi));
                list_del_init(&epi->rdllink);
        }

        return 0;
}

/*
 * ->poll() handler of the eventpoll file. Registers @wait on the
 * eventpoll's poll wait queue and reports whether any monitored item is
 * really ready, starting the ready-list scan at depth 0 with ep->mtx not
 * yet held.
 */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
        int depth = 0;
        struct eventpoll *ep = file->private_data;
        __poll_t mask;

        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);

        /* Check whether wanted events are really sitting on the ready list. */
        mask = ep_scan_ready_list(ep, ep_read_events_proc,
                                  &depth, depth, false);
        return mask;
}

#ifdef CONFIG_PROC_FS
/*
 * fdinfo handler: print one line per monitored item (target fd, event
 * mask, user data, target file position, inode number and device) into
 * @m. The walk is done under ep->mtx and stops once @m has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventpoll *ep = f->private_data;
        struct rb_node *node;

        mutex_lock(&ep->mtx);
        node = rb_first_cached(&ep->rbr);
        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);
                struct file *tfile = epi->ffd.file;
                struct inode *inode = file_inode(tfile);

                seq_printf(m, "tfd: %8d events: %8x data: %16llx "
                           " pos:%lli ino:%lx sdev:%x\n",
                           epi->ffd.fd, epi->event.events,
                           (long long)epi->event.data,
                           (long long)tfile->f_pos,
                           inode->i_ino, inode->i_sb->s_dev);
                if (seq_has_overflowed(m))
                        break;

                node = rb_next(node);
        }
        mutex_unlock(&ep->mtx);
}
#endif

/*
 * File callbacks that implement the eventpoll file behaviour: a no-op
 * llseek, poll, release and, when CONFIG_PROC_FS is enabled, fdinfo output.
 */
static const struct file_operations eventpoll_fops = {
        .llseek                = noop_llseek,
        .poll                = ep_eventpoll_poll,
        .release        = ep_eventpoll_release,
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = ep_show_fdinfo,
#endif
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. It cleans up after files that are closed without having been
 * removed from the eventpoll interface first: every epitem linked on
 * file->f_ep_links is removed from its owning eventpoll.
 */
void eventpoll_release_file(struct file *file)
{
        struct epitem *epi, *tmp;

        /*
         * "file->f_lock" is deliberately not taken. We are in the
         * "struct file" cleanup path, so nobody is using this file any
         * more: epoll_ctl() cannot get here because the file counter
         * already dropped to zero and fget() would fail. The only
         * concurrent hit might come from ep_free(), and holding "epmutex"
         * serializes against it. "ep->mtx" must be taken after "epmutex"
         * because ep_remove() requires it when called from anywhere but
         * ep_free().
         *
         * Besides, ep_remove() acquires file->f_lock itself, so it must
         * not be held here.
         */
        mutex_lock(&epmutex);
        list_for_each_entry_safe(epi, tmp, &file->f_ep_links, fllink) {
                struct eventpoll *owner = epi->ep;

                mutex_lock_nested(&owner->mtx, 0);
                ep_remove(owner, epi);
                mutex_unlock(&owner->mtx);
        }
        mutex_unlock(&epmutex);
}

/*
 * Allocate and initialise a new eventpoll instance.
 *
 * @pep: where to store the new instance on success.
 *
 * A reference on the current user is taken and kept in ep->user; it is
 * dropped again if the allocation fails.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed (in which case
 * *@pep is left untouched).
 */
static int ep_alloc(struct eventpoll **pep)
{
        struct user_struct *user = get_current_user();
        struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

        if (unlikely(!ep)) {
                free_uid(user);
                return -ENOMEM;
        }

        mutex_init(&ep->mtx);
        rwlock_init(&ep->lock);
        init_waitqueue_head(&ep->wq);
        init_waitqueue_head(&ep->poll_wait);
        INIT_LIST_HEAD(&ep->rdllist);
        ep->rbr = RB_ROOT_CACHED;
        ep->ovflist = EP_UNACTIVE_PTR;
        ep->user = user;

        *pep = ep;
        return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
        int kcmp;
        struct rb_node *rbp;
        struct epitem *epi, *epir = NULL;
        struct epoll_filefd ffd;

        ep_set_ffd(&ffd, file, fd);
        for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
                epi = rb_entry(rbp, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
                if (kcmp > 0)
                        rbp = rbp->rb_right;
                else if (kcmp < 0)
                        rbp = rbp->rb_left;
                else {
                        epir = epi;
                        break;
                }
        }

        return epir;
}

#ifdef CONFIG_KCMP
/*
 * Walk the RB tree in order and return the item whose target fd number is
 * @tfd, skipping the first @toff matches. Returns NULL if there are not
 * enough matching items.
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
        struct rb_node *node = rb_first_cached(&ep->rbr);

        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);

                if (epi->ffd.fd == tfd) {
                        if (!toff)
                                return epi;
                        toff--;
                }
                cond_resched();
                node = rb_next(node);
        }

        return NULL;
}

/*
 * Return the raw "struct file" pointer of a target monitored by the epoll
 * file @file, selected by target fd number @tfd and match offset @toff.
 * No reference is taken on the returned file by this function.
 *
 * Returns ERR_PTR(-EINVAL) if @file is not an epoll file and
 * ERR_PTR(-ENOENT) if no such target exists.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
                                     unsigned long toff)
{
        struct eventpoll *ep;
        struct epitem *epi;
        struct file *found;

        if (!is_file_epoll(file))
                return ERR_PTR(-EINVAL);

        ep = file->private_data;

        mutex_lock(&ep->mtx);
        epi = ep_find_tfd(ep, tfd, toff);
        found = epi ? epi->ffd.file : ERR_PTR(-ENOENT);
        mutex_unlock(&ep->mtx);

        return found;
}
#endif /* CONFIG_KCMP */

/**
 * list_add_tail_lockless - add an entry to the tail of a list without a lock
 * @new:  entry to add; it is only added if new->next still points to @new
 *        itself (the state checked by the cmpxchg() below).
 * @head: list head to add the entry before.
 *
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Beware: it is necessary to prevent any other modifications of the
 *         existing list until all changes are completed, in other words
 *         concurrent list_add_tail_lockless() calls should be protected
 *         with a read lock, where write lock acts as a barrier which
 *         makes sure all list_add_tail_lockless() calls are fully
 *         completed.
 *
 *        Also an element can be locklessly added to the list only in one
 *        direction i.e. either to the tail or to the head, otherwise
 *        concurrent access will corrupt the list.
 *
 * Returns %false if element has been already added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
                                          struct list_head *head)
{
        struct list_head *prev;

        /*
         * This is simple 'new->next = head' operation, but cmpxchg()
         * is used in order to detect that same element has been just
         * added to the list from another CPU: the winner observes
         * new->next == new.
         */
        if (cmpxchg(&new->next, new, head) != new)
                return false;

        /*
         * Initially ->next of a new element must be updated with the head
         * (we are inserting to the tail) and only then pointers are atomically
         * exchanged.  XCHG guarantees memory ordering, thus ->next should be
         * updated before pointers are actually swapped and pointers are
         * swapped before prev->next is updated.
         */

        prev = xchg(&head->prev, new);

        /*
         * It is safe to modify prev->next and new->prev, because a new element
         * is added only to the tail and new->next is updated before XCHG.
         */

        prev->next = new;
        new->prev = prev;

        return true;
}

/**
 * chain_epi_lockless - push an epi onto ep->ovflist without a lock
 * @epi: item to chain; it is chained only if epi->next is currently
 *       EP_UNACTIVE_PTR.
 *
 * Chains the epi entry onto the singly linked ep->ovflist in a lockless way,
 * i.e. multiple CPUs are allowed to call this function concurrently. The
 * entry becomes the new first element of the list (ep->ovflist is made to
 * point at it and its ->next takes the previous first element).
 *
 * Returns %false if epi element has been already chained, %true otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;

        /* Fast preliminary check */
        if (epi->next != EP_UNACTIVE_PTR)
                return false;

        /* Check that the same epi has not been just chained from another CPU */
        if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
                return false;

        /*
         * Atomically make epi the new list head and link the previous head
         * behind it.
         */
        epi->next = xchg(&ep->ovflist, epi);

        return true;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * This callback takes a read lock in order not to contend with concurrent
 * events from other file descriptors, thus all modifications to ->rdllist
 * or ->ovflist are lockless.  Read lock is paired with the write lock from
 * ep_scan_ready_list(), which stops all list modifications and guarantees
 * that lists state is seen correctly.
 *
 * Another thing worth mentioning is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if poll table was inited
 * with several wait queues entries.  Plural wakeup from different CPUs of a
 * single wait queue is serialized by wq.lock, but the case when multiple wait
 * queues are used should be detected accordingly.  This is detected using
 * cmpxchg() operation.
 *
 * @wait: wait queue entry embedded in the eppoll_entry of the item.
 * @mode: not used by this function.
 * @sync: when non-zero, ep->wq is woken with wake_up_sync() instead of
 *        wake_up().
 * @key:  wakeup key, converted to a poll mask with key_to_poll().
 *
 * Returns 1 for items without EPOLLEXCLUSIVE. For EPOLLEXCLUSIVE items it
 * returns 1 only if ep->wq had waiters and the reported EPOLLIN/EPOLLOUT
 * bits matched what the item asked for (or none were reported), else 0.
 * NOTE(review): presumably the wait queue core uses this to decide whether
 * an exclusive wakeup was consumed - confirm against the waitqueue code.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        int pwake = 0;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
        __poll_t pollflags = key_to_poll(key);
        unsigned long flags;
        int ewake = 0;

        read_lock_irqsave(&ep->lock, flags);

        ep_set_busy_poll_napi_id(epi);

        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
                goto out_unlock;

        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test that "pollflags" is non-zero before the event match test.
         */
        if (pollflags && !(pollflags & epi->event.events))
                goto out_unlock;

        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
                if (chain_epi_lockless(epi))
                        ep_pm_stay_awake_rcu(epi);
        } else if (!ep_is_linked(epi)) {
                /* In the usual case, add event to ready list. */
                if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
                        ep_pm_stay_awake_rcu(epi);
        }

        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
                /*
                 * For an exclusive item (and not on POLLFREE), report the
                 * wakeup as taken only when the reported direction matches
                 * one the item is interested in, or when no direction bit
                 * was reported at all.
                 */
                if ((epi->event.events & EPOLLEXCLUSIVE) &&
                                        !(pollflags & POLLFREE)) {
                        switch (pollflags & EPOLLINOUT_BITS) {
                        case EPOLLIN:
                                if (epi->event.events & EPOLLIN)
                                        ewake = 1;
                                break;
                        case EPOLLOUT:
                                if (epi->event.events & EPOLLOUT)
                                        ewake = 1;
                                break;
                        case 0:
                                ewake = 1;
                                break;
                        }
                }
                if (sync)
                        wake_up_sync(&ep->wq);
                else
                        wake_up(&ep->wq);
        }
        /* Remember that ->poll() waiters exist; they are woken after unlock. */
        if (waitqueue_active(&ep->poll_wait))
                pwake++;

out_unlock:
        read_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

        /* Non-exclusive items always report the wakeup as taken. */
        if (!(epi->event.events & EPOLLEXCLUSIVE))
                ewake = 1;

        if (pollflags & POLLFREE) {
                /*
                 * If we race with ep_remove_wait_queue() it can miss
                 * ->whead = NULL and do another remove_wait_queue() after
                 * us, so we can't use __remove_wait_queue().
                 */
                list_del_init(&wait->entry);
                /*
                 * ->whead != NULL protects us from the race with ep_free()
                 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
                 * held by the caller. Once we nullify it, nothing protects
                 * ep/epi or even wait.
                 */
                smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
        }

        return ewake;
}

/*
 * Poll-table queue callback: hooks the item behind @pt onto the target
 * file's wait queue @whead.
 *
 * On success a new eppoll_entry is queued (exclusively if the item has
 * EPOLLEXCLUSIVE set), linked on epi->pwqlist, and epi->nwait is bumped.
 * If an earlier call already failed (epi->nwait < 0) or the allocation
 * fails, epi->nwait is set to -1 to signal the error. @file is not used.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
        struct epitem *epi = ep_item_from_epqueue(pt);
        struct eppoll_entry *entry;

        if (epi->nwait < 0)
                goto fail;

        entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
        if (!entry)
                goto fail;

        init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
        entry->whead = whead;
        entry->base = epi;
        if (epi->event.events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(whead, &entry->wait);
        else
                add_wait_queue(whead, &entry->wait);
        list_add_tail(&entry->llink, &epi->pwqlist);
        epi->nwait++;
        return;

fail:
        /* We have to signal that an error occurred */
        epi->nwait = -1;
}

/*
 * Insert @epi into the RB tree of @ep, ordered by ep_cmp_ffd(). An item
 * that does not compare greater than an existing node descends to the
 * left. The cached leftmost node is kept up to date.
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
        struct rb_node **link = &ep->rbr.rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct epitem *cur;

                parent = *link;
                cur = rb_entry(parent, struct epitem, rbn);
                if (ep_cmp_ffd(&epi->ffd, &cur->ffd) > 0) {
                        /* Going right even once means we are not leftmost. */
                        link = &parent->rb_right;
                        leftmost = false;
                } else {
                        link = &parent->rb_left;
                }
        }

        rb_link_node(&epi->rbn, parent, link);
        rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



#define PATH_ARR_SIZE 5
/*
 * Budget of wakeup paths that may emanate from a single file of interest,
 * indexed by nesting level, together with the running tally that is
 * checked against it. Limiting the potential wakeup paths avoids massive
 * uncontrolled wakeup storms. The common use case is a single ep connected
 * to n file sources, where each source has one path of length 1, so these
 * numbers should be more than sufficient.
 *
 * Note that nesting level 0 is never counted by path_count_inc(), so the
 * first entry of path_limits is not enforced.
 *
 * The limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epmutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Account one more path at nesting level @nests.
 *
 * Returns 0 while the tally for that level stays within its limit (and
 * always for level 0, which is not counted), -1 once the limit has been
 * exceeded.
 */
static int path_count_inc(int nests)
{
        int seen;

        /* Allow an arbitrary number of depth 1 paths */
        if (!nests)
                return 0;

        seen = ++path_count[nests];
        return (seen > path_limits[nests]) ? -1 : 0;
}

/* Reset the per-level path tallies to zero. */
static void path_count_init(void)
{
        int *slot = path_count;
        int *const end = path_count + PATH_ARR_SIZE;

        while (slot != end)
                *slot++ = 0;
}

/*
 * ep_call_nested() callback used by reverse_path_check().
 *
 * @priv:       the "struct file" whose f_ep_links are walked.
 * @cookie:     not used here.
 * @call_nests: current nesting level, used as the path_count_inc() index.
 *
 * For every eventpoll watching @priv: if nothing watches that eventpoll in
 * turn, one path is accounted at @call_nests; otherwise the walk recurses
 * into it through ep_call_nested().
 *
 * Returns 0 on success, or the first non-zero error hit (-1 when a path
 * limit is exceeded).
 */
static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
{
        struct file *file = priv;
        struct epitem *epi;
        int error = 0;

        /* CTL_DEL can remove links here, but that can't increase our count */
        rcu_read_lock();
        list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
                struct file *parent = epi->ep->file;

                if (!is_file_epoll(parent)) {
                        printk(KERN_ERR "reverse_path_check_proc: "
                                "file is not an ep!\n");
                        continue;
                }

                if (list_empty(&parent->f_ep_links))
                        error = path_count_inc(call_nests) ? -1 : 0;
                else
                        error = ep_call_nested(&poll_loop_ncalls,
                                                reverse_path_check_proc,
                                                parent, parent,
                                                current);
                if (error != 0)
                        break;
        }
        rcu_read_unlock();

        return error;
}

/**
 * reverse_path_check - The tfile_check_list is list of file *, which have
 *                      links that are proposed to be newly added. We need to
 *                      make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
 *
 * The path tallies are reset before each file on the list is checked.
 *
 * Returns: Returns zero if the proposed links don't create too many paths,
 *            otherwise the non-zero error reported by the first failing
 *            check.
 */
static int reverse_path_check(void)
{
        struct file *tfile;

        /* let's call this for all tfiles */
        list_for_each_entry(tfile, &tfile_check_list, f_tfile_llink) {
                int error;

                path_count_init();
                error = ep_call_nested(&poll_loop_ncalls,
                                        reverse_path_check_proc, tfile,
                                        tfile, current);
                if (error)
                        return error;
        }

        return 0;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
        struct name_snapshot n;
        struct wakeup_source *ws;

        if (!epi->ep->ws) {
                epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
                if (!epi->ep->ws)
                        return -ENOMEM;
        }

        take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
        ws = wakeup_source_register(NULL, n.name.name);
        release_dentry_name_snapshot(&n);

        if (!ws)
                return -ENOMEM;
        rcu_assign_pointer(epi->ws, ws);

        return 0;
}

/*
 * Detach and unregister the per-item wakeup source.
 *
 * Rare code path, only used when EPOLL_CTL_MOD removes a wakeup source.
 */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
        struct wakeup_source *old_ws;

        /* grab the current source before hiding it from new readers */
        old_ws = ep_wakeup_source(epi);
        RCU_INIT_POINTER(epi->ws, NULL);

        /*
         * Let ep_pm_stay_awake_rcu finish before the source goes away.
         * call_rcu is not an option here: wakeup_source_remove (reached
         * through wakeup_source_unregister) uses synchronize_rcu
         * internally as well.
         */
        synchronize_rcu();
        wakeup_source_unregister(old_ws);
}

/*
 * ep_insert - allocate a new item for (@tfile, @fd) and hook it into @ep.
 *
 * @ep:         eventpoll instance receiving the new item.
 * @event:      event mask and user data, copied into the item.
 * @tfile:      target file being watched.
 * @fd:         file descriptor number stored together with @tfile.
 * @full_check: when non-zero, reverse_path_check() is run after the item
 *              has been linked.
 *
 * Returns 0 on success. Failure codes produced here: -ENOSPC when the
 * per-user watch count has reached max_user_watches, -ENOMEM when the item
 * cannot be allocated or the poll wait queue installation reported failure
 * (epi->nwait < 0), the error of ep_create_wakeup_source() for
 * EPOLLWAKEUP items, and -EINVAL when the reverse path check fails.
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
{
        int error, pwake = 0;
        __poll_t revents;
        long user_watches;
        struct epitem *epi;
        struct ep_pqueue epq;

        lockdep_assert_irqs_enabled();

        /*
         * NOTE(review): the limit is only sampled here; the counter is
         * incremented much later (on success), so check and increment are
         * two separate atomic operations.
         */
        user_watches = atomic_long_read(&ep->user->epoll_watches);
        if (unlikely(user_watches >= max_user_watches))
                return -ENOSPC;
        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
                return -ENOMEM;

        /* Item initialization follows here ... */
        INIT_LIST_HEAD(&epi->rdllink);
        INIT_LIST_HEAD(&epi->fllink);
        INIT_LIST_HEAD(&epi->pwqlist);
        epi->ep = ep;
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        epi->nwait = 0;
        epi->next = EP_UNACTIVE_PTR;
        if (epi->event.events & EPOLLWAKEUP) {
                error = ep_create_wakeup_source(epi);
                if (error)
                        goto error_create_wakeup_source;
        } else {
                RCU_INIT_POINTER(epi->ws, NULL);
        }

        /* Add the current item to the list of active epoll hooks for this file */
        spin_lock(&tfile->f_lock);
        list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
        spin_unlock(&tfile->f_lock);

        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
         */
        ep_rbtree_insert(ep, epi);

        /* now check if we've created too many backpaths */
        error = -EINVAL;
        if (full_check && reverse_path_check())
                goto error_remove_epi;

        /* Initialize the poll table using the queue callback */
        epq.epi = epi;
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function. Note that after
         * this operation completes, the poll callback can start hitting
         * the new item.
         */
        revents = ep_item_poll(epi, &epq.pt, 1);

        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely an allocation for a wait queue failed due
         * to high memory pressure.
         */
        error = -ENOMEM;
        if (epi->nwait < 0)
                goto error_unregister;

        /* We have to drop the new item inside our item list to keep track of it */
        write_lock_irq(&ep->lock);

        /* record NAPI ID of new item if present */
        ep_set_busy_poll_napi_id(epi);

        /* If the file is already "ready" we drop it inside the ready list */
        if (revents && !ep_is_linked(epi)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);

                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }

        write_unlock_irq(&ep->lock);

        /* the item is fully installed: account it against the user */
        atomic_long_inc(&ep->user->epoll_watches);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;

        /*
         * Error unwinding: each label undoes the steps completed before the
         * failure and falls through to the labels below it.
         */
error_unregister:
        ep_unregister_pollwait(ep, epi);
error_remove_epi:
        spin_lock(&tfile->f_lock);
        list_del_rcu(&epi->fllink);
        spin_unlock(&tfile->f_lock);

        rb_erase_cached(&epi->rbn, &ep->rbr);

        /*
         * We need to do this because an event could have arrived on some
         * allocated wait queue. Note that we don't care about the ep->ovflist
         * list, since that is used/cleaned only inside a section bound by "mtx".
         * And ep_insert() is called with "mtx" held.
         */
        write_lock_irq(&ep->lock);
        if (ep_is_linked(epi))
                list_del_init(&epi->rdllink);
        write_unlock_irq(&ep->lock);

        wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
        /*
         * NOTE(review): on the error_remove_epi path the item was published
         * with list_add_tail_rcu() and unlinked with list_del_rcu(), yet it
         * is freed here without waiting for an RCU grace period - confirm
         * that no RCU reader of tfile->f_ep_links can still be looking at it.
         */
        kmem_cache_free(epi_cache, epi);

        return error;
}

/*
 * ep_modify - apply a new event mask and user data to an existing item.
 *
 * @ep:    eventpoll instance owning @epi.
 * @epi:   item to modify.
 * @event: new event mask and user data.
 *
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                     const struct epoll_event *event)
{
        int pwake = 0;
        poll_table pt;

        lockdep_assert_irqs_enabled();

        /* NULL queue callback: only the current state is queried below */
        init_poll_funcptr(&pt, NULL);

        /*
         * Set the new event interest mask before calling f_op->poll();
         * otherwise we might miss an event that happens between the
         * f_op->poll() call and the new event set registering.
         */
        epi->event.events = event->events; /* need barrier below */
        epi->event.data = event->data; /* protected by mtx */
        if (epi->event.events & EPOLLWAKEUP) {
                /*
                 * NOTE(review): the return value of
                 * ep_create_wakeup_source() is ignored here, so a failed
                 * registration is not reported to the caller.
                 */
                if (!ep_has_wakeup_source(epi))
                        ep_create_wakeup_source(epi);
        } else if (ep_has_wakeup_source(epi)) {
                ep_destroy_wakeup_source(epi);
        }

        /*
         * The following barrier has two effects:
         *
         * 1) Flush epi changes above to other CPUs.  This ensures
         *    we do not miss events from ep_poll_callback if an
         *    event occurs immediately after we call f_op->poll().
         *    We need this because we did not take ep->lock while
         *    changing epi above (but ep_poll_callback does take
         *    ep->lock).
         *
         * 2) We also need to ensure we do not miss _past_ events
         *    when calling f_op->poll().  This barrier also
         *    pairs with the barrier in wq_has_sleeper (see
         *    comments for wq_has_sleeper).
         *
         * This barrier will now guarantee ep_poll_callback or f_op->poll
         * (or both) will notice the readiness of an item.
         */
        smp_mb();

        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (ep_item_poll(epi, &pt, 1)) {
                write_lock_irq(&ep->lock);
                if (!ep_is_linked(epi)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);

                        /* Notify waiting tasks that events are available */
                        if (waitqueue_active(&ep->wq))
                                wake_up(&ep->wq);
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
                write_unlock_irq(&ep->lock);
        }

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * ep_send_events_proc - copy ready events from @head to the user buffer.
 *
 * @ep:   eventpoll instance being scanned; its "mtx" must be held.
 * @head: list of ready items to process.
 * @priv: struct ep_send_events_data carrying the user buffer, its capacity
 *        and the result slot.
 *
 * The outcome is reported through esed->res: the number of events stored
 * in the user buffer, or -EFAULT when the very first copy to user space
 * fails. The function's own return value is always 0.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
                               void *priv)
{
        struct ep_send_events_data *esed = priv;
        __poll_t revents;
        struct epitem *epi, *tmp;
        struct epoll_event __user *uevent = esed->events;
        struct wakeup_source *ws;
        poll_table pt;

        /* NULL queue callback: items are only re-polled, never re-queued */
        init_poll_funcptr(&pt, NULL);
        esed->res = 0;

        /*
         * We can loop without lock because we are passed a task private list.
         * Items cannot vanish during the loop because ep_scan_ready_list() is
         * holding "mtx" during this call.
         */
        lockdep_assert_held(&ep->mtx);

        list_for_each_entry_safe(epi, tmp, head, rdllink) {
                /* stop once the user buffer is full */
                if (esed->res >= esed->maxevents)
                        break;

                /*
                 * Activate ep->ws before deactivating epi->ws to prevent
                 * triggering auto-suspend here (in case we reactivate epi->ws
                 * below).
                 *
                 * This could be rearranged to delay the deactivation of epi->ws
                 * instead, but then epi->ws would temporarily be out of sync
                 * with ep_is_linked().
                 */
                ws = ep_wakeup_source(epi);
                if (ws) {
                        if (ws->active)
                                __pm_stay_awake(ep->ws);
                        __pm_relax(ws);
                }

                list_del_init(&epi->rdllink);

                /*
                 * If the event mask intersects the caller-requested one,
                 * deliver the event to userspace. Again, ep_scan_ready_list()
                 * is holding ep->mtx, so no operations coming from userspace
                 * can change the item.
                 */
                revents = ep_item_poll(epi, &pt, 1);
                if (!revents)
                        continue;

                if (__put_user(revents, &uevent->events) ||
                    __put_user(epi->event.data, &uevent->data)) {
                        /*
                         * The copy failed: put the item back at the front of
                         * the list and report -EFAULT only if nothing was
                         * delivered so far.
                         */
                        list_add(&epi->rdllink, head);
                        ep_pm_stay_awake(epi);
                        if (!esed->res)
                                esed->res = -EFAULT;
                        return 0;
                }
                esed->res++;
                uevent++;
                if (epi->event.events & EPOLLONESHOT)
                        /* one-shot: keep only the private bits of the mask */
                        epi->event.events &= EP_PRIVATE_BITS;
                else if (!(epi->event.events & EPOLLET)) {
                        /*
                         * If this file has been added with Level
                         * Trigger mode, we need to insert back inside
                         * the ready list, so that the next call to
                         * epoll_wait() will check again the events
                         * availability. At this point, no one can insert
                         * into ep->rdllist besides us. The epoll_ctl()
                         * callers are locked out by
                         * ep_scan_ready_list() holding "mtx" and the
                         * poll callback will queue them in ep->ovflist.
                         */
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }

        return 0;
}

/*
 * Hand the ready list of @ep to ep_send_events_proc() so that up to
 * @maxevents events are copied into the user buffer @events.
 *
 * Returns whatever the scan callback left in the result slot of the
 * request descriptor.
 */
static int ep_send_events(struct eventpoll *ep,
                          struct epoll_event __user *events, int maxevents)
{
        struct ep_send_events_data request = {
                .maxevents = maxevents,
                .events = events,
        };

        ep_scan_ready_list(ep, ep_send_events_proc, &request, 0, false);

        return request.res;
}

static inline struct timespec64 ep_set_mstimeout(long ms)
{
        struct timespec64 now, ts = {
                .tv_sec = ms / MSEC_PER_SEC,
                .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
        };

        ktime_get_ts64(&now);
        return timespec64_add_safe(now, ts);
}

/*
 * Wake callback installed on the ep_poll wait entry.
 *
 * Behaves like autoremove_wake_function, except that the entry is removed
 * even when the wakeup fails: default_wake_function/ttwu only fails if the
 * thread is already woken, and in that case the ep_poll loop removes the
 * entry anyway instead of trying to reuse it.
 *
 * Returns the result of default_wake_function().
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
                                       unsigned int mode, int sync, void *key)
{
        int woken;

        woken = default_wake_function(wq_entry, mode, sync, key);

        /*
         * Counterpart of the list_empty_careful check in ep_poll; makes
         * sure later loop iterations observe what caused this wakeup.
         */
        list_del_init_careful(&wq_entry->entry);

        return woken;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * The function loops between "fetch_events" (busy poll, then sleep on
 * ep->wq) and "send_events" (ep_send_events()) until events were delivered,
 * an error occurred or the timeout expired.
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error. -EINTR is returned when a signal is
 *          pending before sleeping or a fatal signal is pending before
 *          events are sent.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
        int res = 0, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;

        lockdep_assert_irqs_enabled();

        if (timeout > 0) {
                /* finite timeout: compute the absolute expiry and its slack */
                struct timespec64 end_time = ep_set_mstimeout(timeout);

                slack = select_estimate_accuracy(&end_time);
                to = &expires;
                *to = timespec64_to_ktime(end_time);
        } else if (timeout == 0) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
                 * caller specified a non blocking operation. We still need
                 * lock because we could race and not see an epi being added
                 * to the ready list while in irq callback. Thus incorrectly
                 * returning 0 back to userspace.
                 */
                timed_out = 1;

                write_lock_irq(&ep->lock);
                eavail = ep_events_available(ep);
                write_unlock_irq(&ep->lock);

                goto send_events;
        }
        /* timeout < 0: "to" stays NULL, which is what the sleep below gets */

fetch_events:

        if (!ep_events_available(ep))
                ep_busy_loop(ep, timed_out);

        eavail = ep_events_available(ep);
        if (eavail)
                goto send_events;

        /*
         * Busy poll timed out.  Drop NAPI ID for now, we can add
         * it back in when we have moved a socket with a valid NAPI
         * ID onto the ready list.
         */
        ep_reset_busy_poll_napi_id(ep);

        /* single pass; the retry happens through "goto fetch_events" below */
        do {
                /*
                 * Internally init_wait() uses autoremove_wake_function(),
                 * thus wait entry is removed from the wait queue on each
                 * wakeup. Why is it important? In case of several waiters
                 * each new wakeup will hit the next waiter, giving it the
                 * chance to harvest new event. Otherwise wakeup can be
                 * lost. This is also good performance-wise, because on
                 * normal wakeup path no need to call __remove_wait_queue()
                 * explicitly, thus ep->lock is not taken, which halts the
                 * event delivery.
                 *
                 * In fact, we now use an even more aggressive function that
                 * unconditionally removes, because we don't reuse the wait
                 * entry between loop iterations. This lets us also avoid the
                 * performance issue if a process is killed, causing all of its
                 * threads to wake up without being removed normally.
                 */
                init_wait(&wait);
                wait.func = ep_autoremove_wake_function;

                write_lock_irq(&ep->lock);
                /*
                 * Barrierless variant, waitqueue_active() is called under
                 * the same lock on wakeup ep_poll_callback() side, so it
                 * is safe to avoid an explicit barrier.
                 */
                __set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * Do the final check under the lock. ep_scan_ready_list()
                 * plays with two lists (->rdllist and ->ovflist) and there
                 * is always a race when both lists are empty for short
                 * period of time although events are pending, so lock is
                 * important.
                 */
                eavail = ep_events_available(ep);
                if (!eavail) {
                        if (signal_pending(current))
                                res = -EINTR;
                        else
                                __add_wait_queue_exclusive(&ep->wq, &wait);
                }
                write_unlock_irq(&ep->lock);

                /* sleep only if nothing is ready and no signal is pending */
                if (!eavail && !res)
                        timed_out = !schedule_hrtimeout_range(to, slack,
                                                              HRTIMER_MODE_ABS);

                /*
                 * We were woken up, thus go and try to harvest some events.
                 * If timed out and still on the wait queue, recheck eavail
                 * carefully under lock, below.
                 */
                eavail = 1;
        } while (0);

        __set_current_state(TASK_RUNNING);

        /* still queued: nobody woke us, so the entry must be removed by hand */
        if (!list_empty_careful(&wait.entry)) {
                write_lock_irq(&ep->lock);
                /*
                 * If the thread timed out and is not on the wait queue, it
                 * means that the thread was woken up after its timeout expired
                 * before it could reacquire the lock. Thus, when wait.entry is
                 * empty, it needs to harvest events.
                 */
                if (timed_out)
                        eavail = list_empty(&wait.entry);
                __remove_wait_queue(&ep->wq, &wait);
                write_unlock_irq(&ep->lock);
        }

send_events:
        if (fatal_signal_pending(current)) {
                /*
                 * Always short-circuit for fatal signals to allow
                 * threads to make a timely exit without the chance of
                 * finding more events available and fetching
                 * repeatedly.
                 */
                res = -EINTR;
        }
        /*
         * Try to transfer events to user space. In case we get 0 events and
         * there's still timeout left over, we go trying again in search of
         * more luck.
         */
        if (!res && eavail &&
            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                goto fetch_events;

        return res;
}

/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure, does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
 *          data structure pointer.
 * @call_nests: Current depth of the @ep_call_nested() call stack.
 *
 * The eventpoll behind @priv is stamped with the current loop_check_gen and
 * its items are walked with its "mtx" held. Nested epoll files that do not
 * carry the current generation yet are checked recursively; every other
 * file is queued on tfile_check_list (with a reference taken) unless it is
 * already linked on a list.
 *
 * Returns: Returns zero if no nested check failed, otherwise the non-zero
 *          value returned by the failing @ep_call_nested() call.
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
        struct file *file = priv;
        struct eventpoll *ep = file->private_data;
        struct rb_node *node;
        int ret = 0;

        mutex_lock_nested(&ep->mtx, call_nests + 1);
        ep->gen = loop_check_gen;

        for (node = rb_first_cached(&ep->rbr); node; node = rb_next(node)) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);
                struct file *child = epi->ffd.file;

                if (unlikely(is_file_epoll(child))) {
                        struct eventpoll *nested = child->private_data;

                        /* already stamped during this check generation */
                        if (nested->gen == loop_check_gen)
                                continue;

                        ret = ep_call_nested(&poll_loop_ncalls,
                                             ep_loop_check_proc, child,
                                             nested, current);
                        if (ret != 0)
                                break;
                        continue;
                }

                /*
                 * A file that is not an eventpoll: remember it on
                 * tfile_check_list, unless it is already queued, so that
                 * reverse_path_check() can later verify, during
                 * ep_insert(), that the new links do not add too many
                 * wakeup paths.
                 */
                if (!list_empty(&child->f_tfile_llink))
                        continue;
                if (get_file_rcu(child))
                        list_add(&child->f_tfile_llink, &tfile_check_list);
        }

        mutex_unlock(&ep->mtx);

        return ret;
}

/**
 * ep_loop_check - Entry point of the loop check: verifies that adding the
 *                 epoll file @file to the epoll instance @ep does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll private data structure.
 * @file: Pointer to the epoll file to be checked.
 *
 * Returns: Returns the value of the @ep_call_nested() invocation that runs
 *          ep_loop_check_proc() on @file; zero means the constraints are
 *          not violated.
 */
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
        int ret;

        ret = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc,
                             file, ep, current);

        return ret;
}

/*
 * Empty tfile_check_list: unlink every queued file, drop one reference on
 * it with fput(), and leave the list head re-initialized.
 */
static void clear_tfile_check_list(void)
{
        struct list_head *head = &tfile_check_list;

        /* always take the current first entry until nothing is left */
        while (!list_empty(head)) {
                struct file *tfile = list_first_entry(head, struct file,
                                                      f_tfile_llink);

                list_del_init(&tfile->f_tfile_llink);
                fput(tfile);
        }

        INIT_LIST_HEAD(head);
}

/*
 * Open an eventpoll file descriptor.
 *
 * Only EPOLL_CLOEXEC is accepted in @flags. Returns the new file descriptor
 * on success; otherwise -EINVAL for unknown flags, or the error reported by
 * ep_alloc(), get_unused_fd_flags() or anon_inode_getfile().
 */
static int do_epoll_create(int flags)
{
        struct eventpoll *ep = NULL;
        struct file *epfile;
        int open_flags;
        int fd, ret;

        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

        if ((flags & ~EPOLL_CLOEXEC) != 0)
                return -EINVAL;

        /* The internal data structure ("struct eventpoll") comes first. */
        ret = ep_alloc(&ep);
        if (ret < 0)
                return ret;

        /*
         * Then everything needed to expose it: a free file descriptor and
         * a file structure, both created with the same flags.
         */
        open_flags = O_RDWR | (flags & O_CLOEXEC);

        fd = get_unused_fd_flags(open_flags);
        if (fd < 0) {
                ret = fd;
                goto free_ep;
        }

        epfile = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                                    open_flags);
        if (IS_ERR(epfile)) {
                ret = PTR_ERR(epfile);
                put_unused_fd(fd);
                goto free_ep;
        }

        ep->file = epfile;
        fd_install(fd, epfile);

        return fd;

free_ep:
        ep_free(ep);
        return ret;
}

/* epoll_create1(2): forwards @flags unchanged to do_epoll_create(). */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
        int fd = do_epoll_create(flags);

        return fd;
}

/*
 * epoll_create(2): @size is only checked for being positive; the instance
 * itself is created with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
        if (size > 0)
                return do_epoll_create(0);

        return -EINVAL;
}

/*
 * Acquire @mutex either blocking or non-blocking.
 *
 * With @nonblock set only a trylock is attempted and -EAGAIN is returned
 * when the mutex is contended; otherwise the mutex is taken with
 * mutex_lock_nested() using @depth. Returns 0 once the mutex is held.
 */
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
                                   bool nonblock)
{
        if (nonblock)
                return mutex_trylock(mutex) ? 0 : -EAGAIN;

        mutex_lock_nested(mutex, depth);
        return 0;
}

/*
 * do_epoll_ctl - add, modify or remove @fd in the interest set of @epfd.
 *
 * @epfd:     file descriptor of the eventpoll instance.
 * @op:       EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd:       target file descriptor.
 * @epds:     event description; only read for operations for which
 *            ep_op_has_event() is true. EPOLLERR | EPOLLHUP are OR-ed into
 *            its event mask for ADD and MOD.
 * @nonblock: when true, mutexes are only try-locked and -EAGAIN is
 *            returned on contention.
 *
 * Returns 0 on success or a negative error: -EBADF when either descriptor
 * cannot be resolved, -EPERM when the target cannot be polled, -EINVAL for
 * an invalid combination (same file, @epfd not an eventpoll file,
 * disallowed EPOLLEXCLUSIVE usage, unknown @op), -ELOOP when the loop check
 * fails, -EEXIST / -ENOENT when the item already exists / does not exist,
 * or the error of ep_insert(), ep_remove() or ep_modify().
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock)
{
        int error;
        int full_check = 0;
        struct fd f, tf;
        struct eventpoll *ep;
        struct epitem *epi;
        struct eventpoll *tep = NULL;

        error = -EBADF;
        f = fdget(epfd);
        if (!f.file)
                goto error_return;

        /* Get the "struct file *" for the target file */
        tf = fdget(fd);
        if (!tf.file)
                goto error_fput;

        /* The target file descriptor must support poll */
        error = -EPERM;
        if (!file_can_poll(tf.file))
                goto error_tgt_fput;

        /* Check if EPOLLWAKEUP is allowed */
        if (ep_op_has_event(op))
                ep_take_care_of_epollwakeup(epds);

        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (f.file == tf.file || !is_file_epoll(f.file))
                goto error_tgt_fput;

        /*
         * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
        if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                if (op == EPOLL_CTL_MOD)
                        goto error_tgt_fput;
                if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
                                (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
                        goto error_tgt_fput;
        }

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = f.file->private_data;

        /*
         * When we insert an epoll file descriptor, inside another epoll file
         * descriptor, there is the chance of creating closed loops, which are
         * better be handled here, than in more critical paths. While we are
         * checking for loops we also determine the list of files reachable
         * and hang them on the tfile_check_list, so we can check that we
         * haven't created too many possible wakeup paths.
         *
         * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
         * the epoll file descriptor is attaching directly to a wakeup source,
         * unless the epoll file descriptor is nested. The purpose of taking the
         * 'epmutex' on add is to prevent complex topologies such as loops and
         * deep wakeup paths from forming in parallel through multiple
         * EPOLL_CTL_ADD operations.
         */
        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
        if (error)
                goto error_tgt_fput;
        if (op == EPOLL_CTL_ADD) {
                if (!list_empty(&f.file->f_ep_links) ||
                                ep->gen == loop_check_gen ||
                                                is_file_epoll(tf.file)) {
                        /*
                         * Full check needed: drop ep->mtx, take the global
                         * epmutex, and only then re-take ep->mtx below.
                         */
                        mutex_unlock(&ep->mtx);
                        error = epoll_mutex_lock(&epmutex, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                        loop_check_gen++;
                        /* from here on error_tgt_fput also releases epmutex */
                        full_check = 1;
                        if (is_file_epoll(tf.file)) {
                                error = -ELOOP;
                                if (ep_loop_check(ep, tf.file) != 0)
                                        goto error_tgt_fput;
                        } else {
                                /* reference dropped by clear_tfile_check_list() */
                                get_file(tf.file);
                                list_add(&tf.file->f_tfile_llink,
                                                        &tfile_check_list);
                        }
                        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                        if (is_file_epoll(tf.file)) {
                                /* nested epoll target: lock it too, one level deeper */
                                tep = tf.file->private_data;
                                error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
                                if (error) {
                                        mutex_unlock(&ep->mtx);
                                        goto error_tgt_fput;
                                }
                        }
                }
        }

        /*
         * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
         * above, we can be sure to be able to use the item looked up by
         * ep_find() till we release the mutex.
         */
        epi = ep_find(ep, tf.file, fd);

        /* an @op matching none of the cases below leaves -EINVAL in place */
        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
                if (!epi) {
                        epds->events |= EPOLLERR | EPOLLHUP;
                        error = ep_insert(ep, epds, tf.file, fd, full_check);
                } else
                        error = -EEXIST;
                break;
        case EPOLL_CTL_DEL:
                if (epi)
                        error = ep_remove(ep, epi);
                else
                        error = -ENOENT;
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
                        /* items added with EPOLLEXCLUSIVE cannot be modified */
                        if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                                epds->events |= EPOLLERR | EPOLLHUP;
                                error = ep_modify(ep, epi, epds);
                        }
                } else
                        error = -ENOENT;
                break;
        }
        if (tep != NULL)
                mutex_unlock(&tep->mtx);
        mutex_unlock(&ep->mtx);

error_tgt_fput:
        if (full_check) {
                /* undo the full-check state set up under epmutex */
                clear_tfile_check_list();
                loop_check_gen++;
                mutex_unlock(&epmutex);
        }

        fdput(tf);
error_fput:
        fdput(f);
error_return:

        return error;
}

/*
 * epoll_ctl(2): controller interface of the eventpoll file, used to insert,
 * remove or change file descriptors inside the interest set.
 *
 * The user supplied event is copied in only for operations that carry one
 * (ep_op_has_event()); a failing copy yields -EFAULT. The request is then
 * handed to do_epoll_ctl() in blocking mode.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
        struct epoll_event epds;

        if (ep_op_has_event(op)) {
                if (copy_from_user(&epds, event, sizeof(epds)))
                        return -EFAULT;
        }

        return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                         int maxevents, int timeout)
{
        struct fd f;
        int ret;

        /* The number of events requested must be in 1..EP_MAX_EVENTS */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                return -EINVAL;

        /* The user buffer must be able to hold maxevents events */
        if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
                return -EFAULT;

        /* Resolve epfd to its "struct file *" */
        f = fdget(epfd);
        if (!f.file)
                return -EBADF;

        /*
         * Only an eventpoll file carries our own data structure in
         * "private_data"; anything else is rejected with -EINVAL.
         */
        if (is_file_epoll(f.file))
                ret = ep_poll(f.file->private_data, events, maxevents,
                              timeout);
        else
                ret = -EINVAL;

        fdput(f);
        return ret;
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
        /* Thin syscall entry point: all checks live in do_epoll_wait(). */
        int ret = do_epoll_wait(epfd, events, maxevents, timeout);

        return ret;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        int ret;

        /* Install the caller-supplied signal mask (if any) for the wait. */
        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret != 0)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, timeout);

        /* The saved mask is restored here unless the wait ended in -EINTR. */
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                        struct epoll_event __user *, events,
                        int, maxevents, int, timeout,
                        const compat_sigset_t __user *, sigmask,
                        compat_size_t, sigsetsize)
{
        long ret;

        /* Install the caller-supplied compat signal mask (if any). */
        ret = set_compat_user_sigmask(sigmask, sigsetsize);
        if (ret != 0)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, timeout);

        /* The saved mask is restored here unless the wait ended in -EINTR. */
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}
#endif

static int __init eventpoll_init(void)
{
        struct sysinfo si;
        unsigned long lowmem_pages;

        /*
         * We can have many thousands of epitems, so make sure one never
         * needs an extra cache line on 64-bit (and smaller) CPUs.
         */
        BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

        /*
         * Let each user spend up to 1/25th (4%) of low memory on epoll
         * watches, expressed as a number of EP_ITEM_COST-sized items.
         */
        si_meminfo(&si);
        lowmem_pages = si.totalram - si.totalhigh;
        max_user_watches = ((lowmem_pages / 25) << PAGE_SHIFT) / EP_ITEM_COST;
        BUG_ON(max_user_watches < 0);

        /* Set up the structure used for epoll fd inclusion loop checks. */
        ep_nested_calls_init(&poll_loop_ncalls);

        /* Slab cache backing "struct epitem" allocations */
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Slab cache backing "struct eppoll_entry" allocations */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

        return 0;
}
fs_initcall(eventpoll_init);



















































    2 


    2 
































    1 









    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/bitmap.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 */

#include <linux/buffer_head.h>
#include "ext4.h"

/*
 * Count the zero bits in the first @numchars bytes of @bitmap: total
 * bits minus the number of set bits reported by memweight().
 */
unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
{
        unsigned int total_bits = numchars * BITS_PER_BYTE;

        return total_bits - memweight(bitmap, numchars);
}

/*
 * Check the inode bitmap in @bh (first @sz bytes) against the checksum
 * stored in group descriptor @gdp.
 *
 * Returns 1 when the checksum matches or the filesystem has no metadata
 * checksums, 0 on mismatch.
 */
int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u32 stored, computed;

        if (!ext4_has_metadata_csum(sb))
                return 1;

        stored = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
        computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);

        if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
                /* Descriptor too small for the high half: compare 16 bits. */
                computed &= 0xFFFF;
        } else {
                __u32 upper = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);

                stored |= upper << 16;
        }

        return stored == computed;
}

/*
 * Compute the checksum of the inode bitmap in @bh (first @sz bytes) and
 * store it in @gdp.  Does nothing when metadata checksums are disabled.
 */
void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u32 crc;

        if (!ext4_has_metadata_csum(sb))
                return;

        crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
        gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(crc & 0xFFFF);

        /* The high 16 bits are stored only if the descriptor has room. */
        if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END)
                return;
        gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(crc >> 16);
}

/*
 * Check the block bitmap in @bh against the checksum stored in group
 * descriptor @gdp.  The checksummed length is
 * EXT4_CLUSTERS_PER_GROUP(sb) / 8 bytes.
 *
 * Returns 1 when the checksum matches or the filesystem has no metadata
 * checksums, 0 on mismatch.
 */
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nbytes = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
        __u32 stored, computed;

        if (!ext4_has_metadata_csum(sb))
                return 1;

        stored = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
        computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data,
                               nbytes);

        if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
                /* Descriptor too small for the high half: compare 16 bits. */
                computed &= 0xFFFF;
        } else {
                __u32 upper = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);

                stored |= upper << 16;
        }

        return stored == computed ? 1 : 0;
}

/*
 * Compute the checksum of the block bitmap in @bh, covering
 * EXT4_CLUSTERS_PER_GROUP(sb) / 8 bytes, and store it in @gdp.
 * Does nothing when metadata checksums are disabled.
 */
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nbytes = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
        __u32 crc;

        if (!ext4_has_metadata_csum(sb))
                return;

        crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, nbytes);
        gdp->bg_block_bitmap_csum_lo = cpu_to_le16(crc & 0xFFFF);

        /* The high 16 bits are stored only if the descriptor has room. */
        if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
                return;
        gdp->bg_block_bitmap_csum_hi = cpu_to_le16(crc >> 16);
}
































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * fs/kernfs/kernfs-internal.h - kernfs internal header file
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
 */

#ifndef __KERNFS_INTERNAL_H
#define __KERNFS_INTERNAL_H

#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/xattr.h>

#include <linux/kernfs.h>
#include <linux/fs_context.h>

/*
 * Attribute storage for a kernfs node: ownership ids, three timestamps
 * and extended-attribute state.
 * NOTE(review): when and by whom this is allocated is not visible in
 * this header - see inode.c.
 */
struct kernfs_iattrs {
        kuid_t                        ia_uid;
        kgid_t                        ia_gid;
        struct timespec64        ia_atime;
        struct timespec64        ia_mtime;
        struct timespec64        ia_ctime;

        struct simple_xattrs        xattrs;
        /* presumably accounting for user.* xattrs - TODO confirm in inode.c */
        atomic_t                nr_user_xattrs;
        atomic_t                user_xattr_size;
};

/* +1 to avoid triggering overflow warning when negating it */
#define KN_DEACTIVATED_BIAS                (INT_MIN + 1)

/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */

/**
 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
 * @kn: kernfs_node of interest
 *
 * Return the kernfs_root @kn belongs to.
 */
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
        /*
         * If a parent exists it is always a dir; otherwise @kn itself is
         * a dir.  Either way we end up on a node whose dir.root is valid.
         */
        struct kernfs_node *dir = kn->parent ? kn->parent : kn;

        return dir->dir.root;
}

/*
 * mount.c
 */
/*
 * Per-super_block kernfs state.  Retrieved from a super_block through
 * the kernfs_info() macro, which casts sb->s_fs_info to this type.
 */
struct kernfs_super_info {
        /* the super_block this info belongs to */
        struct super_block        *sb;

        /*
         * The root associated with this super_block.  Each super_block is
         * identified by the root and ns it's associated with.
         */
        struct kernfs_root        *root;

        /*
         * Each sb is associated with one namespace tag, currently the
         * network namespace of the task which mounted this kernfs
         * instance.  If multiple tags become necessary, make the following
         * an array and compare kernfs_node tag against every entry.
         */
        const void                *ns;

        /* anchored at kernfs_root->supers, protected by kernfs_mutex */
        struct list_head        node;
};
/*
 * kernfs_info - fetch the kernfs_super_info stored in @SB->s_fs_info.
 * @SB is parenthesized so that any pointer expression (e.g. "sbs + 1")
 * can be passed safely.
 */
#define kernfs_info(SB) ((struct kernfs_super_info *)((SB)->s_fs_info))

/*
 * Map @dentry to the kernfs_node kept in its inode's i_private.
 * Returns NULL for a negative dentry (no inode attached).
 */
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
        return d_really_is_negative(dentry) ? NULL :
                                              d_inode(dentry)->i_private;
}

extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;

/*
 * inode.c
 */
extern const struct xattr_handler *kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct inode *inode, int mask);
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);

/*
 * dir.c
 */
extern struct mutex kernfs_mutex;
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;

struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
void kernfs_put_active(struct kernfs_node *kn);
int kernfs_add_one(struct kernfs_node *kn);
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                                    const char *name, umode_t mode,
                                    kuid_t uid, kgid_t gid,
                                    unsigned flags);

/*
 * file.c
 */
extern const struct file_operations kernfs_file_fops;

void kernfs_drain_open_files(struct kernfs_node *kn);

/*
 * symlink.c
 */
extern const struct inode_operations kernfs_symlink_iops;

#endif        /* __KERNFS_INTERNAL_H */































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vsyscall

#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __VSYSCALL_TRACE_H

#include <linux/tracepoint.h>

/*
 * Tracepoint "emulate_vsyscall" in the "vsyscall" trace system.
 * Takes one int argument, nr, records it as the only field of the
 * trace entry and prints it as "nr = <value>".
 */
TRACE_EVENT(emulate_vsyscall,

            TP_PROTO(int nr),

            TP_ARGS(nr),

            TP_STRUCT__entry(__field(int, nr)),

            TP_fast_assign(
                           __entry->nr = nr;
                           ),

            TP_printk("nr = %d", __entry->nr)
);

#endif

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>





































   11 


   11 

    1 
   11 










    4 
    6 



    4 


    4 





   11 










    5 








   10 









    1 












    3 
    3 
    3 















    1 
    1 
    1 










    1 
    1 



    1 

    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0-or-later
/* bit search implementation
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * Copyright (C) 2008 IBM Corporation
 * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
 * (Inspired by David Howell's find_next_bit implementation)
 *
 * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
 * size and improve performance, 2015.
 */

#include <linux/bitops.h>
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/minmax.h>

#if !defined(find_next_bit) || !defined(find_next_zero_bit) ||                        \
        !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) ||        \
        !defined(find_next_and_bit)
/*
 * This is a common helper function for find_next_bit, find_next_zero_bit, and
 * find_next_and_bit. The differences are:
 *  - The "invert" argument, which is XORed with each fetched word before
 *    searching it for one bits.
 *  - The optional "addr2", which is anded with "addr1" if present.
 */
static unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
                unsigned long start, unsigned long invert, unsigned long le)
{
        unsigned long idx, word, first_mask;

        if (unlikely(start >= nbits))
                return nbits;

        idx = start / BITS_PER_LONG;

        /*
         * First word: discard the bits below @start.  For the
         * little-endian variants the mask is byte-swapped so that it
         * lines up with the not-yet-swapped word.
         */
        first_mask = BITMAP_FIRST_WORD_MASK(start);
        if (le)
                first_mask = swab(first_mask);

        word = addr1[idx];
        if (addr2)
                word &= addr2[idx];
        word = (word ^ invert) & first_mask;

        /* Following words are examined whole until one has a candidate. */
        while (!word) {
                idx++;
                if (idx * BITS_PER_LONG >= nbits)
                        return nbits;

                word = addr1[idx];
                if (addr2)
                        word &= addr2[idx];
                word ^= invert;
        }

        if (le)
                word = swab(word);

        /* A hit in the tail of the last word past @nbits is clamped. */
        return min(idx * BITS_PER_LONG + __ffs(word), nbits);
}
#endif

#ifndef find_next_bit
/*
 * Find the next set bit in a memory region.
 */
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        return _find_next_bit(addr,
                              NULL,        /* no second bitmap to AND with */
                              size, offset,
                              0UL,         /* no inversion: look for 1 bits */
                              0);          /* words are not byte-swapped */
}
EXPORT_SYMBOL(find_next_bit);
#endif

#ifndef find_next_zero_bit
/*
 * Find the next cleared bit in a memory region.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        return _find_next_bit(addr,
                              NULL,        /* no second bitmap to AND with */
                              size, offset,
                              ~0UL,        /* invert: zero bits become hits */
                              0);          /* words are not byte-swapped */
}
EXPORT_SYMBOL(find_next_zero_bit);
#endif

#if !defined(find_next_and_bit)
/*
 * Find the next bit that is set in both @addr1 and @addr2.
 */
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        return _find_next_bit(addr1,
                              addr2,       /* ANDed with addr1 word by word */
                              size, offset,
                              0UL,         /* no inversion: look for 1 bits */
                              0);          /* words are not byte-swapped */
}
EXPORT_SYMBOL(find_next_and_bit);
#endif

#ifndef find_first_bit
/*
 * Find the first set bit in a memory region.
 */
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long base;

        /* Walk the bitmap one word at a time; @base is the word's bit offset. */
        for (base = 0; base < size; base += BITS_PER_LONG) {
                unsigned long word = addr[base / BITS_PER_LONG];

                /* A hit beyond @size in the last word is clamped to @size. */
                if (word)
                        return min(base + __ffs(word), size);
        }

        return size;
}
EXPORT_SYMBOL(find_first_bit);
#endif

#ifndef find_first_zero_bit
/*
 * Find the first cleared bit in a memory region.
 */
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long base;

        /* Walk the bitmap one word at a time; @base is the word's bit offset. */
        for (base = 0; base < size; base += BITS_PER_LONG) {
                unsigned long word = addr[base / BITS_PER_LONG];

                /* A hit beyond @size in the last word is clamped to @size. */
                if (word != ~0UL)
                        return min(base + ffz(word), size);
        }

        return size;
}
EXPORT_SYMBOL(find_first_zero_bit);
#endif

#ifndef find_last_bit
/*
 * Find the last set bit in a memory region.  Returns @size when no bit
 * is set (including when @size is 0).
 */
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long mask, idx;

        if (!size)
                return size;

        /* Only the last word is partial; every earlier word is used whole. */
        mask = BITMAP_LAST_WORD_MASK(size);
        idx = (size - 1) / BITS_PER_LONG;

        for (;;) {
                unsigned long word = addr[idx] & mask;

                if (word)
                        return idx * BITS_PER_LONG + __fls(word);
                if (idx == 0)
                        break;

                idx--;
                mask = ~0ul;
        }

        return size;
}
EXPORT_SYMBOL(find_last_bit);
#endif

#ifdef __BIG_ENDIAN

#ifndef find_next_zero_bit_le
/*
 * Find the next cleared bit in a little-endian bitmap (big-endian hosts).
 */
unsigned long find_next_zero_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        return _find_next_bit(addr,
                              NULL,        /* no second bitmap to AND with */
                              size, offset,
                              ~0UL,        /* invert: zero bits become hits */
                              1);          /* byte-swap each word */
}
EXPORT_SYMBOL(find_next_zero_bit_le);
#endif

#ifndef find_next_bit_le
/*
 * Find the next set bit in a little-endian bitmap (big-endian hosts).
 */
unsigned long find_next_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        return _find_next_bit(addr,
                              NULL,        /* no second bitmap to AND with */
                              size, offset,
                              0UL,         /* no inversion: look for 1 bits */
                              1);          /* byte-swap each word */
}
EXPORT_SYMBOL(find_next_bit_le);
#endif

#endif /* __BIG_ENDIAN */

/*
 * Locate the next 8-bit clump containing a set bit at or after @offset.
 * On success the clump's value is stored in *@clump and its 8-aligned
 * bit offset is returned; @size is returned (and *@clump is left
 * untouched) when no further bit is set.
 */
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
{
        unsigned long hit = find_next_bit(addr, size, offset);

        if (hit == size)
                return size;

        hit = round_down(hit, 8);
        *clump = bitmap_get_value8(addr, hit);

        return hit;
}
EXPORT_SYMBOL(find_next_clump8);







































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_INAT_H
#define _ASM_X86_INAT_H
/*
 * x86 instruction attributes
 *
 * Written by Masami Hiramatsu <mhiramat@redhat.com>
 */
#include <asm/inat_types.h> /* __ignore_sync_check__ */

/*
 * Internal bits. Don't use bitmasks directly, because these bits are
 * unstable. You should use checking functions.
 */

#define INAT_OPCODE_TABLE_SIZE 256
#define INAT_GROUP_TABLE_SIZE 8

/* Legacy last prefixes */
#define INAT_PFX_OPNDSZ        1        /* 0x66 */ /* LPFX1 */
#define INAT_PFX_REPE        2        /* 0xF3 */ /* LPFX2 */
#define INAT_PFX_REPNE        3        /* 0xF2 */ /* LPFX3 */
/* Other Legacy prefixes */
#define INAT_PFX_LOCK        4        /* 0xF0 */
#define INAT_PFX_CS        5        /* 0x2E */
#define INAT_PFX_DS        6        /* 0x3E */
#define INAT_PFX_ES        7        /* 0x26 */
#define INAT_PFX_FS        8        /* 0x64 */
#define INAT_PFX_GS        9        /* 0x65 */
#define INAT_PFX_SS        10        /* 0x36 */
#define INAT_PFX_ADDRSZ        11        /* 0x67 */
/* x86-64 REX prefix */
#define INAT_PFX_REX        12        /* 0x4X */
/* AVX VEX prefixes */
#define INAT_PFX_VEX2        13        /* 2-bytes VEX prefix */
#define INAT_PFX_VEX3        14        /* 3-bytes VEX prefix */
#define INAT_PFX_EVEX        15        /* EVEX prefix */

#define INAT_LSTPFX_MAX        3
#define INAT_LGCPFX_MAX        11

/* Immediate size */
#define INAT_IMM_BYTE                1
#define INAT_IMM_WORD                2
#define INAT_IMM_DWORD                3
#define INAT_IMM_QWORD                4
#define INAT_IMM_PTR                5
#define INAT_IMM_VWORD32        6
#define INAT_IMM_VWORD                7

/* Legacy prefix */
#define INAT_PFX_OFFS        0
#define INAT_PFX_BITS        4
#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
#define INAT_PFX_MASK        (INAT_PFX_MAX << INAT_PFX_OFFS)
/* Escape opcodes */
#define INAT_ESC_OFFS        (INAT_PFX_OFFS + INAT_PFX_BITS)
#define INAT_ESC_BITS        2
#define INAT_ESC_MAX        ((1 << INAT_ESC_BITS) - 1)
#define INAT_ESC_MASK        (INAT_ESC_MAX << INAT_ESC_OFFS)
/* Group opcodes (1-16) */
#define INAT_GRP_OFFS        (INAT_ESC_OFFS + INAT_ESC_BITS)
#define INAT_GRP_BITS        5
#define INAT_GRP_MAX        ((1 << INAT_GRP_BITS) - 1)
#define INAT_GRP_MASK        (INAT_GRP_MAX << INAT_GRP_OFFS)
/* Immediates */
#define INAT_IMM_OFFS        (INAT_GRP_OFFS + INAT_GRP_BITS)
#define INAT_IMM_BITS        3
#define INAT_IMM_MASK        (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
/* Flags */
#define INAT_FLAG_OFFS        (INAT_IMM_OFFS + INAT_IMM_BITS)
#define INAT_MODRM        (1 << (INAT_FLAG_OFFS))
#define INAT_FORCE64        (1 << (INAT_FLAG_OFFS + 1))
#define INAT_SCNDIMM        (1 << (INAT_FLAG_OFFS + 2))
#define INAT_MOFFSET        (1 << (INAT_FLAG_OFFS + 3))
#define INAT_VARIANT        (1 << (INAT_FLAG_OFFS + 4))
#define INAT_VEXOK        (1 << (INAT_FLAG_OFFS + 5))
#define INAT_VEXONLY        (1 << (INAT_FLAG_OFFS + 6))
#define INAT_EVEXONLY        (1 << (INAT_FLAG_OFFS + 7))
/* Attribute making macros for attribute tables */
/*
 * Each macro shifts its argument into the matching attribute field.
 * Arguments are parenthesized so that compound expressions such as
 * "a | b" are shifted as a whole.  INAT_MAKE_GROUP() additionally sets
 * INAT_MODRM.
 */
#define INAT_MAKE_PREFIX(pfx)        ((pfx) << INAT_PFX_OFFS)
#define INAT_MAKE_ESCAPE(esc)        ((esc) << INAT_ESC_OFFS)
#define INAT_MAKE_GROUP(grp)        (((grp) << INAT_GRP_OFFS) | INAT_MODRM)
#define INAT_MAKE_IMM(imm)        ((imm) << INAT_IMM_OFFS)

/* Identifiers for segment registers */
#define INAT_SEG_REG_IGNORE        0
#define INAT_SEG_REG_DEFAULT        1
#define INAT_SEG_REG_CS                2
#define INAT_SEG_REG_SS                3
#define INAT_SEG_REG_DS                4
#define INAT_SEG_REG_ES                5
#define INAT_SEG_REG_FS                6
#define INAT_SEG_REG_GS                7

/* Attribute search APIs */
extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
                                             int lpfx_id,
                                             insn_attr_t esc_attr);
extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
                                            int lpfx_id,
                                            insn_attr_t esc_attr);
extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
                                          insn_byte_t vex_m,
                                          insn_byte_t vex_pp);

/* Attribute checking functions */
/* True when the prefix field holds an id in 1..INAT_LGCPFX_MAX. */
static inline int inat_is_legacy_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx != 0 && pfx <= INAT_LGCPFX_MAX;
}

/* True when the prefix field is INAT_PFX_ADDRSZ. */
static inline int inat_is_address_size_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx == INAT_PFX_ADDRSZ;
}

/* True when the prefix field is INAT_PFX_OPNDSZ. */
static inline int inat_is_operand_size_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx == INAT_PFX_OPNDSZ;
}

/* True when the prefix field is INAT_PFX_REX. */
static inline int inat_is_rex_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx == INAT_PFX_REX;
}

/*
 * Return the prefix id when it is one of the "last prefix" ids
 * (at most INAT_LSTPFX_MAX), otherwise 0.
 */
static inline int inat_last_prefix_id(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx <= INAT_LSTPFX_MAX ? pfx : 0;
}

/* True for any of the VEX2, VEX3 or EVEX prefix ids. */
static inline int inat_is_vex_prefix(insn_attr_t attr)
{
        switch (attr & INAT_PFX_MASK) {
        case INAT_PFX_VEX2:
        case INAT_PFX_VEX3:
        case INAT_PFX_EVEX:
                return 1;
        default:
                return 0;
        }
}

/* True when the prefix field is INAT_PFX_EVEX. */
static inline int inat_is_evex_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx == INAT_PFX_EVEX;
}

/* True when the prefix field is INAT_PFX_VEX3. */
static inline int inat_is_vex3_prefix(insn_attr_t attr)
{
        insn_attr_t pfx = attr & INAT_PFX_MASK;

        return pfx == INAT_PFX_VEX3;
}

/*
 * Non-zero when an escape id is encoded.  The value is the raw masked
 * escape field, not a normalized 0/1.
 */
static inline int inat_is_escape(insn_attr_t attr)
{
        insn_attr_t esc_bits = attr & INAT_ESC_MASK;

        return esc_bits;
}

/* Extract the escape id: the escape field shifted down to bit 0. */
static inline int inat_escape_id(insn_attr_t attr)
{
        insn_attr_t esc_bits = attr & INAT_ESC_MASK;

        return esc_bits >> INAT_ESC_OFFS;
}

/*
 * Non-zero when a group id is encoded.  The value is the raw masked
 * group field, not a normalized 0/1.
 */
static inline int inat_is_group(insn_attr_t attr)
{
        insn_attr_t grp_bits = attr & INAT_GRP_MASK;

        return grp_bits;
}

/* Extract the group id: the group field shifted down to bit 0. */
static inline int inat_group_id(insn_attr_t attr)
{
        insn_attr_t grp_bits = attr & INAT_GRP_MASK;

        return grp_bits >> INAT_GRP_OFFS;
}

/* Return @attr with the group field cleared; all other bits are kept. */
static inline int inat_group_common_attribute(insn_attr_t attr)
{
        insn_attr_t without_group = attr & ~INAT_GRP_MASK;

        return without_group;
}

/*
 * Non-zero when an immediate size is encoded.  The value is the raw
 * masked immediate field, not a normalized 0/1.
 */
static inline int inat_has_immediate(insn_attr_t attr)
{
        insn_attr_t imm_bits = attr & INAT_IMM_MASK;

        return imm_bits;
}

/*
 * Extract the immediate-size code (one of the INAT_IMM_* values, or 0
 * when none is set): the immediate field shifted down to bit 0.
 */
static inline int inat_immediate_size(insn_attr_t attr)
{
        insn_attr_t imm_bits = attr & INAT_IMM_MASK;

        return imm_bits >> INAT_IMM_OFFS;
}

/* Non-zero (the raw flag bit) when INAT_MODRM is set in @attr. */
static inline int inat_has_modrm(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_MODRM;

        return flag;
}

/* Non-zero (the raw flag bit) when INAT_FORCE64 is set in @attr. */
static inline int inat_is_force64(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_FORCE64;

        return flag;
}

/* Non-zero (the raw flag bit) when INAT_SCNDIMM is set in @attr. */
static inline int inat_has_second_immediate(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_SCNDIMM;

        return flag;
}

/* Non-zero (the raw flag bit) when INAT_MOFFSET is set in @attr. */
static inline int inat_has_moffset(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_MOFFSET;

        return flag;
}

/* Non-zero (the raw flag bit) when INAT_VARIANT is set in @attr. */
static inline int inat_has_variant(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_VARIANT;

        return flag;
}

/* Non-zero (the raw flag bit) when INAT_VEXOK is set in @attr. */
static inline int inat_accept_vex(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_VEXOK;

        return flag;
}

/*
 * Non-zero when either INAT_VEXONLY or INAT_EVEXONLY is set in @attr.
 * The value is the raw masked flag bits.
 */
static inline int inat_must_vex(insn_attr_t attr)
{
        insn_attr_t flags = attr & (INAT_VEXONLY | INAT_EVEXONLY);

        return flags;
}

/* Non-zero (the raw flag bit) when INAT_EVEXONLY is set in @attr. */
static inline int inat_must_evex(insn_attr_t attr)
{
        insn_attr_t flag = attr & INAT_EVEXONLY;

        return flag;
}
#endif























































































    1 
    8 














    7 

    7 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG
#define NR_OPEN_MAX ~0U

/*
 * One file descriptor table: the fd -> struct file array plus the
 * bitmaps describing it.
 */
struct fdtable {
        /* exclusive upper bound for indexes into fd[], see files_lookup_fd_raw() */
        unsigned int max_fds;
        struct file __rcu **fd;      /* current fd array */
        /* bitmap indexed by fd, tested by close_on_exec() */
        unsigned long *close_on_exec;
        /* bitmap indexed by fd, tested by fd_is_open() */
        unsigned long *open_fds;
        /* NOTE(review): presumably a summary bitmap over open_fds - confirm in fs/file.c */
        unsigned long *full_fds_bits;
        /* NOTE(review): presumably used to defer freeing of the table via RCU - confirm */
        struct rcu_head rcu;
};

/* Report whether bit @fd is set in @fdt's close_on_exec bitmap. */
static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *cloexec_map = fdt->close_on_exec;

        return test_bit(fd, cloexec_map);
}

/* Report whether bit @fd is set in @fdt's open_fds bitmap. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *open_map = fdt->open_fds;

        return test_bit(fd, open_map);
}

/*
 * Open file table structure
 */
/*
 * Holds the current fd table pointer (fdt), an embedded fdtable (fdtab)
 * and built-in storage for NR_OPEN_DEFAULT descriptors.
 * NOTE(review): the *_init arrays and fd_array presumably back fdtab
 * until the table needs to grow - confirm in fs/file.c.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        /* dereferenced via files_fdtable() / rcu_dereference_raw() */
        struct fdtable __rcu *fdt;
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        /* named as the lockdep condition in rcu_dereference_check_fdtable() */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * Dereference the RCU-protected pointer @fdtfd, passing
 * "files->file_lock is held" to rcu_dereference_check() as the lockdep
 * condition.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/*
 * Fetch @files->fdt through rcu_dereference_check_fdtable(), i.e. with
 * the same file_lock lockdep condition.
 */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * The caller must either ensure that the fd table isn't shared, or hold
 * the RCU read lock or the file lock.
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);

        /* Descriptors at or beyond max_fds have no slot in the table. */
        if (fd >= fdt->max_fds)
                return NULL;

        /* Sanitize the index with array_index_nospec() before using it. */
        fd = array_index_nospec(fd, fdt->max_fds);
        return rcu_dereference_raw(fdt->fd[fd]);
}

/*
 * Look up @fd in @files for callers holding files->file_lock; lockdep
 * warns when the lock is not held.
 */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
                           "suspicious rcu_dereference_check() usage");
        file = files_lookup_fd_raw(files, fd);
        return file;
}

/*
 * Look up @fd in @files for callers inside an RCU read-side critical
 * section; lockdep warns when rcu_read_lock() is not held.
 */
static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                           "suspicious rcu_dereference_check() usage");
        file = files_lookup_fd_raw(files, fd);
        return file;
}

/* Look up @fd in the current task's files via files_lookup_fd_rcu(). */
static inline struct file *lookup_fd_rcu(unsigned int fd)
{
        struct files_struct *files = current->files;

        return files_lookup_fd_rcu(files, fd);
}

struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);

struct task_struct;

struct files_struct *get_files_struct(struct task_struct *);
void put_files_struct(struct files_struct *fs);
int unshare_files(void);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern int close_fd_get_file(unsigned int fd, struct file **res);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
                      struct files_struct **new_fdp);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */

































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TTY_H
#define _LINUX_TTY_H

#include <linux/fs.h>
#include <linux/major.h>
#include <linux/termios.h>
#include <linux/workqueue.h>
#include <linux/tty_driver.h>
#include <linux/tty_ldisc.h>
#include <linux/mutex.h>
#include <linux/tty_flags.h>
#include <linux/seq_file.h>
#include <uapi/linux/tty.h>
#include <linux/rwsem.h>
#include <linux/llist.h>


/*
 * (Note: the *_driver.minor_start values 1, 64, 128, 192 are
 * hardcoded at present.)
 */
#define NR_UNIX98_PTY_DEFAULT        4096      /* Default maximum for Unix98 ptys */
#define NR_UNIX98_PTY_RESERVE        1024          /* Default reserve for main devpts */
#define NR_UNIX98_PTY_MAX        (1 << MINORBITS) /* Absolute limit */

/*
 * This character is the same as _POSIX_VDISABLE: it cannot be used as
 * a c_cc[] character, but indicates that a particular special character
 * isn't in use (eg VINTR has no character etc)
 */
#define __DISABLED_CHAR '\0'

/*
 * One tty buffer: a fixed header followed by a variable-length payload in
 * data[].  char_buf_ptr() addresses the payload as bytes, and
 * flag_buf_ptr() addresses a second area that starts @size bytes after the
 * character area.
 */
struct tty_buffer {
        union {
                struct tty_buffer *next;        /* link to another buffer */
                struct llist_node free;                /* lock-less list node; presumably used
                                                   while on tty_bufhead.free - verify */
        };
        int used;        /* NOTE(review): looks like the fill level - confirm */
        int size;        /* offset from character data to flag data, see flag_buf_ptr() */
        int commit;        /* NOTE(review): semantics not visible in this header */
        int read;        /* NOTE(review): semantics not visible in this header */
        int flags;        /* TTYB_* values */
        /* Data points here */
        unsigned long data[];
};

/* Values for .flags field of tty_buffer */
#define TTYB_NORMAL        1        /* buffer has no flags buffer */

/*
 * Return a pointer to byte @ofs of the character area of buffer @b.  The
 * character area begins at b->data.
 */
static inline unsigned char *char_buf_ptr(struct tty_buffer *b, int ofs)
{
        unsigned char *chars = (unsigned char *)b->data;

        return &chars[ofs];
}

/*
 * Return a pointer to entry @ofs of the flag area of buffer @b.  The flag
 * area sits b->size bytes past the corresponding character position.
 */
static inline char *flag_buf_ptr(struct tty_buffer *b, int ofs)
{
        char *chars = (char *)char_buf_ptr(b, ofs);

        return chars + b->size;
}

/*
 * Head of a queue of tty_buffer objects plus its bookkeeping.  Embedded in
 * struct tty_port as "buf", where it is described as locked internally.
 */
struct tty_bufhead {
        struct tty_buffer *head;        /* Queue head */
        struct work_struct work;        /* NOTE(review): presumably the deferred
                                           processing work item - confirm */
        struct mutex           lock;
        atomic_t           priority;
        struct tty_buffer sentinel;        /* embedded buffer; role not visible here */
        struct llist_head free;                /* Free queue head */
        atomic_t           mem_used;    /* In-use buffers excluding free list */
        int                   mem_limit;        /* NOTE(review): presumably the cap
                                           compared against mem_used - confirm */
        struct tty_buffer *tail;        /* Active buffer */
};
/*
 * When a break, frame error, or parity error happens, these codes are
 * stuffed into the flags buffer.
 */
#define TTY_NORMAL        0
#define TTY_BREAK        1
#define TTY_FRAME        2
#define TTY_PARITY        3
#define TTY_OVERRUN        4

/*
 * Accessors for the special characters of a tty: each expands to the
 * matching slot of tty->termios.c_cc[], indexed by the V* constant.
 * They expand to lvalues, so they read the live termios settings.
 */
#define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR])
#define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT])
#define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE])
#define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL])
#define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF])
#define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME])
#define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN])
#define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC])
#define START_CHAR(tty) ((tty)->termios.c_cc[VSTART])
#define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP])
#define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP])
#define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL])
#define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT])
#define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD])
#define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE])
#define LNEXT_CHAR(tty)        ((tty)->termios.c_cc[VLNEXT])
#define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2])

/*
 * Generic termios flag tests: mask one of the four termios flag words of
 * @tty (input, output, control, local) with @f.  The result is the masked
 * value, not a normalized 0/1, so multi-bit masks yield the selected field.
 */
#define _I_FLAG(tty, f)        ((tty)->termios.c_iflag & (f))
#define _O_FLAG(tty, f)        ((tty)->termios.c_oflag & (f))
#define _C_FLAG(tty, f)        ((tty)->termios.c_cflag & (f))
#define _L_FLAG(tty, f)        ((tty)->termios.c_lflag & (f))

/* Input flags (termios.c_iflag) */
#define I_IGNBRK(tty)        _I_FLAG((tty), IGNBRK)
#define I_BRKINT(tty)        _I_FLAG((tty), BRKINT)
#define I_IGNPAR(tty)        _I_FLAG((tty), IGNPAR)
#define I_PARMRK(tty)        _I_FLAG((tty), PARMRK)
#define I_INPCK(tty)        _I_FLAG((tty), INPCK)
#define I_ISTRIP(tty)        _I_FLAG((tty), ISTRIP)
#define I_INLCR(tty)        _I_FLAG((tty), INLCR)
#define I_IGNCR(tty)        _I_FLAG((tty), IGNCR)
#define I_ICRNL(tty)        _I_FLAG((tty), ICRNL)
#define I_IUCLC(tty)        _I_FLAG((tty), IUCLC)
#define I_IXON(tty)        _I_FLAG((tty), IXON)
#define I_IXANY(tty)        _I_FLAG((tty), IXANY)
#define I_IXOFF(tty)        _I_FLAG((tty), IXOFF)
#define I_IMAXBEL(tty)        _I_FLAG((tty), IMAXBEL)
#define I_IUTF8(tty)        _I_FLAG((tty), IUTF8)

/* Output flags (termios.c_oflag) */
#define O_OPOST(tty)        _O_FLAG((tty), OPOST)
#define O_OLCUC(tty)        _O_FLAG((tty), OLCUC)
#define O_ONLCR(tty)        _O_FLAG((tty), ONLCR)
#define O_OCRNL(tty)        _O_FLAG((tty), OCRNL)
#define O_ONOCR(tty)        _O_FLAG((tty), ONOCR)
#define O_ONLRET(tty)        _O_FLAG((tty), ONLRET)
#define O_OFILL(tty)        _O_FLAG((tty), OFILL)
#define O_OFDEL(tty)        _O_FLAG((tty), OFDEL)
#define O_NLDLY(tty)        _O_FLAG((tty), NLDLY)
#define O_CRDLY(tty)        _O_FLAG((tty), CRDLY)
#define O_TABDLY(tty)        _O_FLAG((tty), TABDLY)
#define O_BSDLY(tty)        _O_FLAG((tty), BSDLY)
#define O_VTDLY(tty)        _O_FLAG((tty), VTDLY)
#define O_FFDLY(tty)        _O_FLAG((tty), FFDLY)

/* Control flags (termios.c_cflag) */
#define C_BAUD(tty)        _C_FLAG((tty), CBAUD)
#define C_CSIZE(tty)        _C_FLAG((tty), CSIZE)
#define C_CSTOPB(tty)        _C_FLAG((tty), CSTOPB)
#define C_CREAD(tty)        _C_FLAG((tty), CREAD)
#define C_PARENB(tty)        _C_FLAG((tty), PARENB)
#define C_PARODD(tty)        _C_FLAG((tty), PARODD)
#define C_HUPCL(tty)        _C_FLAG((tty), HUPCL)
#define C_CLOCAL(tty)        _C_FLAG((tty), CLOCAL)
#define C_CIBAUD(tty)        _C_FLAG((tty), CIBAUD)
#define C_CRTSCTS(tty)        _C_FLAG((tty), CRTSCTS)
#define C_CMSPAR(tty)        _C_FLAG((tty), CMSPAR)

/* Local flags (termios.c_lflag) */
#define L_ISIG(tty)        _L_FLAG((tty), ISIG)
#define L_ICANON(tty)        _L_FLAG((tty), ICANON)
#define L_XCASE(tty)        _L_FLAG((tty), XCASE)
#define L_ECHO(tty)        _L_FLAG((tty), ECHO)
#define L_ECHOE(tty)        _L_FLAG((tty), ECHOE)
#define L_ECHOK(tty)        _L_FLAG((tty), ECHOK)
#define L_ECHONL(tty)        _L_FLAG((tty), ECHONL)
#define L_NOFLSH(tty)        _L_FLAG((tty), NOFLSH)
#define L_TOSTOP(tty)        _L_FLAG((tty), TOSTOP)
#define L_ECHOCTL(tty)        _L_FLAG((tty), ECHOCTL)
#define L_ECHOPRT(tty)        _L_FLAG((tty), ECHOPRT)
#define L_ECHOKE(tty)        _L_FLAG((tty), ECHOKE)
#define L_FLUSHO(tty)        _L_FLAG((tty), FLUSHO)
#define L_PENDIN(tty)        _L_FLAG((tty), PENDIN)
#define L_IEXTEN(tty)        _L_FLAG((tty), IEXTEN)
#define L_EXTPROC(tty)        _L_FLAG((tty), EXTPROC)

struct device;
struct signal_struct;

/*
 * Port level information. Each device keeps its own port level information
 * so provide a common structure for those ports wanting to use common support
 * routines.
 *
 * The tty port has a different lifetime to the tty so must be kept apart.
 * In addition be careful as tty -> port mappings are valid for the life
 * of the tty object but in many cases port -> tty mappings are valid only
 * until a hangup so don't use the wrong path.
 */

struct tty_port;

/*
 * Callbacks a driver supplies for a tty_port; referenced from
 * tty_port::ops.  Every callback receives the port it operates on.
 */
struct tty_port_operations {
        /* Return 1 if the carrier is raised */
        int (*carrier_raised)(struct tty_port *port);
        /* Control the DTR line */
        /* NOTE(review): the name suggests RTS is driven too, with @raise
           selecting assert vs. drop - confirm against implementations */
        void (*dtr_rts)(struct tty_port *port, int raise);
        /* Called when the last close completes or a hangup finishes
           IFF the port was initialized. Do not use to free resources. Called
           under the port mutex to serialize against activate/shutdowns */
        void (*shutdown)(struct tty_port *port);
        /* Called under the port mutex from tty_port_open, serialized using
           the port mutex */
        /* FIXME: long term getting the tty argument *out* of this would be
           good for consoles */
        int (*activate)(struct tty_port *port, struct tty_struct *tty);
        /* Called on the final put of a port */
        void (*destruct)(struct tty_port *port);
};

/*
 * Callbacks of the client attached to a tty_port; referenced from
 * tty_port::client_ops.  A default instance is declared below as
 * tty_port_default_client_ops.
 */
struct tty_port_client_operations {
        /* NOTE(review): the unnamed arguments are presumably the character
           data, the per-character flags and the count - confirm */
        int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
        void (*write_wakeup)(struct tty_port *port);
};

extern const struct tty_port_client_operations tty_port_default_client_ops;

/*
 * Common per-port state.  The flag helpers further down (tty_port_active(),
 * tty_port_set_active(), ...) operate on @iflags with atomic bit ops, and
 * tty_port_get() takes references through @kref.
 */
struct tty_port {
        struct tty_bufhead        buf;                /* Locked internally */
        struct tty_struct        *tty;                /* Back pointer */
        struct tty_struct        *itty;                /* internal back ptr */
        const struct tty_port_operations *ops;        /* Port operations */
        const struct tty_port_client_operations *client_ops; /* Port client operations */
        spinlock_t                lock;                /* Lock protecting tty field */
        int                        blocked_open;        /* Waiting to open */
        int                        count;                /* Usage count */
        wait_queue_head_t        open_wait;        /* Open waiters */
        wait_queue_head_t        delta_msr_wait;        /* Modem status change */
        unsigned long                flags;                /* User TTY flags ASYNC_ */
        unsigned long                iflags;                /* Internal flags TTY_PORT_ */
        unsigned char                console:1,        /* port is a console */
                                low_latency:1;        /* optional: tune for latency */
        struct mutex                mutex;                /* Locking */
        struct mutex                buf_mutex;        /* Buffer alloc lock */
        unsigned char                *xmit_buf;        /* Optional buffer */
        unsigned int                close_delay;        /* Close port delay */
        unsigned int                closing_wait;        /* Delay for output */
        int                        drain_delay;        /* Set to zero if no pure time
                                                   based drain is needed else
                                                   set to size of fifo */
        struct kref                kref;                /* Ref counter */
        void                         *client_data;        /* NOTE(review): presumably private
                                                   data of the client_ops user - confirm */
};

/* tty_port::iflags bits -- use atomic bit ops */
#define TTY_PORT_INITIALIZED        0        /* device is initialized */
#define TTY_PORT_SUSPENDED        1        /* device is suspended */
#define TTY_PORT_ACTIVE                2        /* device is open */

/*
 * uart drivers: use the uart_port::status field and the UPSTAT_* defines
 * for s/w-based flow control steering and carrier detection status
 */
#define TTY_PORT_CTS_FLOW        3        /* h/w flow control enabled */
#define TTY_PORT_CHECK_CD        4        /* carrier detect enabled */
#define TTY_PORT_KOPENED        5        /* device exclusively opened by
                                           kernel */

/*
 * Where all of the state associated with a tty is kept while the tty
 * is open.  Since the termios state should be kept even if the tty
 * has been closed --- for things like the baud rate, etc --- it is
 * not stored here, but rather a pointer to the real state is stored
 * here.  Possibly the winsize structure should have the same
 * treatment, but (1) the default 80x24 is usually right and (2) it's
 * most often used by a windowing system, which will set the correct
 * size each time the window is created or resized anyway.
 *                                                 - TYT, 9/14/92
 */

struct tty_operations;

/*
 * Per-tty state.  Several fields carry their own locking rule in the
 * comment next to them; the layout is randomized (__randomize_layout), so
 * field order must not be relied upon.
 */
struct tty_struct {
        int        magic;                /* NOTE(review): presumably holds TTY_MAGIC - confirm */
        struct kref kref;        /* reference count, see tty_kref_get() */
        struct device *dev;
        struct tty_driver *driver;
        const struct tty_operations *ops;
        int index;

        /* Protects ldisc changes: Lock tty not pty */
        struct ld_semaphore ldisc_sem;
        struct tty_ldisc *ldisc;

        struct mutex atomic_write_lock;
        struct mutex legacy_mutex;
        struct mutex throttle_mutex;
        struct rw_semaphore termios_rwsem;
        struct mutex winsize_mutex;
        spinlock_t ctrl_lock;
        spinlock_t flow_lock;
        /* Termios values are protected by the termios rwsem */
        struct ktermios termios, termios_locked;
        char name[64];
        struct pid *pgrp;                /* Protected by ctrl lock */
        /*
         * Writes protected by both ctrl lock and legacy mutex, readers must use
         * at least one of them.
         */
        struct pid *session;
        unsigned long flags;        /* TTY_* bit numbers, accessed with bit ops */
        int count;
        struct winsize winsize;                /* winsize_mutex */
        unsigned long stopped:1,        /* flow_lock */
                      flow_stopped:1,
                      unused:BITS_PER_LONG - 2;
        int hw_stopped;
        unsigned long ctrl_status:8,        /* ctrl_lock */
                      packet:1,
                      unused_ctrl:BITS_PER_LONG - 9;
        unsigned int receive_room;        /* Bytes free for queue */
        int flow_change;

        struct tty_struct *link;
        struct fasync_struct *fasync;
        wait_queue_head_t write_wait;
        wait_queue_head_t read_wait;
        struct work_struct hangup_work;
        void *disc_data;
        void *driver_data;
        spinlock_t files_lock;                /* protects tty_files list */
        struct list_head tty_files;        /* NOTE(review): presumably links the
                                           tty_file_private entries - confirm */

#define N_TTY_BUF_SIZE 4096

        int closing;
        unsigned char *write_buf;
        int write_cnt;
        /* If the tty has a pending do_SAK, queue it here - akpm */
        struct work_struct SAK_work;
        struct tty_port *port;
} __randomize_layout;

/* Each of a tty's open files has private_data pointing to tty_file_private */
struct tty_file_private {
        struct tty_struct *tty;                /* the tty this open file refers to */
        struct file *file;                /* the open file itself */
        struct list_head list;                /* NOTE(review): presumably linked on
                                           tty->tty_files - confirm */
};

/* tty magic number */
#define TTY_MAGIC                0x5401

/*
 * These bits are used in the flags field of the tty structure.
 *
 * So that interrupts won't be able to mess up the queues,
 * copy_to_cooked must be atomic with respect to itself, as must
 * tty->write.  Thus, you must use the inline functions set_bit() and
 * clear_bit() to make things atomic.
 */
#define TTY_THROTTLED                 0        /* Call unthrottle() at threshold min */
#define TTY_IO_ERROR                 1        /* Cause an I/O error (may be no ldisc too) */
#define TTY_OTHER_CLOSED         2        /* Other side (if any) has closed */
#define TTY_EXCLUSIVE                 3        /* Exclusive open mode */
#define TTY_DO_WRITE_WAKEUP         5        /* Call write_wakeup after queuing new */
#define TTY_LDISC_OPEN                 11        /* Line discipline is open */
#define TTY_PTY_LOCK                 16        /* pty private */
#define TTY_NO_WRITE_SPLIT         17        /* Preserve write boundaries to driver */
#define TTY_HUPPED                 18        /* Post driver->hangup() */
#define TTY_HUPPING                19        /* Hangup in progress */
#define TTY_LDISC_CHANGING        20        /* Change pending - non-block IO */
#define TTY_LDISC_HALTED        22        /* Line discipline is halted */

/*
 * Decide whether I/O on @tty through @file must not block: true when the
 * file was opened O_NONBLOCK, or when the TTY_LDISC_CHANGING bit is set in
 * tty->flags.
 */
static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file)
{
        if (file->f_flags & O_NONBLOCK)
                return true;

        return test_bit(TTY_LDISC_CHANGING, &tty->flags);
}

/* Report whether the TTY_IO_ERROR bit is set in tty->flags. */
static inline bool tty_io_error(struct tty_struct *tty)
{
        unsigned long *bits = &tty->flags;

        return test_bit(TTY_IO_ERROR, bits);
}

/* Report whether the TTY_THROTTLED bit is set in tty->flags. */
static inline bool tty_throttled(struct tty_struct *tty)
{
        unsigned long *bits = &tty->flags;

        return test_bit(TTY_THROTTLED, bits);
}

#ifdef CONFIG_TTY
extern void tty_kref_put(struct tty_struct *tty);
extern struct pid *tty_get_pgrp(struct tty_struct *tty);
extern void tty_vhangup_self(void);
extern void disassociate_ctty(int priv);
extern dev_t tty_devnum(struct tty_struct *tty);
extern void proc_clear_tty(struct task_struct *p);
extern struct tty_struct *get_current_tty(void);
/* tty_io.c */
extern int __init tty_init(void);
extern const char *tty_name(const struct tty_struct *tty);
extern struct tty_struct *tty_kopen(dev_t device);
extern void tty_kclose(struct tty_struct *tty);
extern int tty_dev_name_to_number(const char *name, dev_t *number);
#else
/*
 * CONFIG_TTY is not set: provide do-nothing stand-ins so that callers still
 * build.  Each stub either has an empty body or returns a fixed value.
 */
static inline void tty_kref_put(struct tty_struct *tty)
{
}

static inline struct pid *tty_get_pgrp(struct tty_struct *tty)
{
        return NULL;
}

static inline void tty_vhangup_self(void)
{
}

static inline void disassociate_ctty(int priv)
{
}

static inline dev_t tty_devnum(struct tty_struct *tty)
{
        return 0;
}

static inline void proc_clear_tty(struct task_struct *p)
{
}

static inline struct tty_struct *get_current_tty(void)
{
        return NULL;
}

/* tty_io.c */
static inline int __init tty_init(void)
{
        return 0;
}

static inline const char *tty_name(const struct tty_struct *tty)
{
        return "(none)";
}

static inline struct tty_struct *tty_kopen(dev_t device)
{
        return ERR_PTR(-ENODEV);
}

static inline void tty_kclose(struct tty_struct *tty)
{
}

static inline int tty_dev_name_to_number(const char *name, dev_t *number)
{
        return -ENOTSUPP;
}
#endif

extern struct ktermios tty_std_termios;

extern int vcs_init(void);

extern struct class *tty_class;

/**
 *        tty_kref_get                -        get a tty reference
 *        @tty: tty device, may be NULL
 *
 *        Take an additional reference on @tty and hand the same pointer
 *        back.  A NULL @tty is returned unchanged and no count is touched.
 *        The caller must hold sufficient locks/counts to ensure that their
 *        existing reference cannot go away.
 */
static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
{
        /* Nothing to pin: pass the NULL straight through. */
        if (!tty)
                return tty;

        kref_get(&tty->kref);
        return tty;
}

extern const char *tty_driver_name(const struct tty_struct *tty);
extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
extern void stop_tty(struct tty_struct *tty);
extern void start_tty(struct tty_struct *tty);
extern int tty_register_driver(struct tty_driver *driver);
extern int tty_unregister_driver(struct tty_driver *driver);
extern struct device *tty_register_device(struct tty_driver *driver,
                                          unsigned index, struct device *dev);
extern struct device *tty_register_device_attr(struct tty_driver *driver,
                                unsigned index, struct device *device,
                                void *drvdata,
                                const struct attribute_group **attr_grp);
extern void tty_unregister_device(struct tty_driver *driver, unsigned index);
extern void tty_write_message(struct tty_struct *tty, char *msg);
extern int tty_send_xchar(struct tty_struct *tty, char ch);
extern int tty_put_char(struct tty_struct *tty, unsigned char c);
extern int tty_chars_in_buffer(struct tty_struct *tty);
extern int tty_write_room(struct tty_struct *tty);
extern void tty_driver_flush_buffer(struct tty_struct *tty);
extern void tty_throttle(struct tty_struct *tty);
extern void tty_unthrottle(struct tty_struct *tty);
extern int tty_throttle_safe(struct tty_struct *tty);
extern int tty_unthrottle_safe(struct tty_struct *tty);
extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
extern int is_current_pgrp_orphaned(void);
extern void tty_hangup(struct tty_struct *tty);
extern void tty_vhangup(struct tty_struct *tty);
extern int tty_hung_up_p(struct file *filp);
extern void do_SAK(struct tty_struct *tty);
extern void __do_SAK(struct tty_struct *tty);
extern void no_tty(void);
extern speed_t tty_termios_baud_rate(struct ktermios *termios);
extern void tty_termios_encode_baud_rate(struct ktermios *termios,
                                                speed_t ibaud, speed_t obaud);
extern void tty_encode_baud_rate(struct tty_struct *tty,
                                                speed_t ibaud, speed_t obaud);

/**
 *        tty_get_baud_rate        -        get tty bit rates
 *        @tty: tty to query
 *
 *        Returns the baud rate for this terminal, as computed by
 *        tty_termios_baud_rate() from the tty's current termios settings.
 *        The terminal bit flags may be updated in the process.
 *
 *        Locking: none is taken here; the termios lock must be held by the
 *        caller.
 */
static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
{
        struct ktermios *settings = &tty->termios;

        return tty_termios_baud_rate(settings);
}

extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);

extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
extern void tty_ldisc_deref(struct tty_ldisc *);
extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
extern const struct seq_operations tty_ldiscs_seq_ops;

extern void tty_wakeup(struct tty_struct *tty);
extern void tty_ldisc_flush(struct tty_struct *tty);

extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
                        unsigned int cmd, unsigned long arg);
extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
extern void tty_release_struct(struct tty_struct *tty, int idx);
extern void tty_init_termios(struct tty_struct *tty);
extern void tty_save_termios(struct tty_struct *tty);
extern int tty_standard_install(struct tty_driver *driver,
                struct tty_struct *tty);

extern struct mutex tty_mutex;

extern void tty_port_init(struct tty_port *port);
extern void tty_port_link_device(struct tty_port *port,
                struct tty_driver *driver, unsigned index);
extern struct device *tty_port_register_device(struct tty_port *port,
                struct tty_driver *driver, unsigned index,
                struct device *device);
extern struct device *tty_port_register_device_attr(struct tty_port *port,
                struct tty_driver *driver, unsigned index,
                struct device *device, void *drvdata,
                const struct attribute_group **attr_grp);
extern struct device *tty_port_register_device_serdev(struct tty_port *port,
                struct tty_driver *driver, unsigned index,
                struct device *device);
extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port,
                struct tty_driver *driver, unsigned index,
                struct device *device, void *drvdata,
                const struct attribute_group **attr_grp);
extern void tty_port_unregister_device(struct tty_port *port,
                struct tty_driver *driver, unsigned index);
extern int tty_port_alloc_xmit_buf(struct tty_port *port);
extern void tty_port_free_xmit_buf(struct tty_port *port);
extern void tty_port_destroy(struct tty_port *port);
extern void tty_port_put(struct tty_port *port);

/*
 * Try to take a reference on @port.  Returns @port on success, or NULL when
 * @port is NULL or its reference count has already reached zero.
 */
static inline struct tty_port *tty_port_get(struct tty_port *port)
{
        if (!port)
                return NULL;
        if (!kref_get_unless_zero(&port->kref))
                return NULL;

        return port;
}

/* Return true if CTS flow control is enabled (TTY_PORT_CTS_FLOW is set). */
static inline bool tty_port_cts_enabled(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_CTS_FLOW, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_CTS_FLOW in port->iflags. */
static inline void tty_port_set_cts_flow(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_CTS_FLOW, bits);
                return;
        }
        set_bit(TTY_PORT_CTS_FLOW, bits);
}

/* Return true if TTY_PORT_ACTIVE ("device is open") is set in port->iflags. */
static inline bool tty_port_active(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_ACTIVE, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_ACTIVE in port->iflags. */
static inline void tty_port_set_active(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_ACTIVE, bits);
                return;
        }
        set_bit(TTY_PORT_ACTIVE, bits);
}

/* Return true if TTY_PORT_CHECK_CD ("carrier detect enabled") is set. */
static inline bool tty_port_check_carrier(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_CHECK_CD, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_CHECK_CD in port->iflags. */
static inline void tty_port_set_check_carrier(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_CHECK_CD, bits);
                return;
        }
        set_bit(TTY_PORT_CHECK_CD, bits);
}

/* Return true if TTY_PORT_SUSPENDED ("device is suspended") is set. */
static inline bool tty_port_suspended(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_SUSPENDED, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_SUSPENDED in port->iflags. */
static inline void tty_port_set_suspended(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_SUSPENDED, bits);
                return;
        }
        set_bit(TTY_PORT_SUSPENDED, bits);
}

/* Return true if TTY_PORT_INITIALIZED ("device is initialized") is set. */
static inline bool tty_port_initialized(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_INITIALIZED, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_INITIALIZED in port->iflags. */
static inline void tty_port_set_initialized(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_INITIALIZED, bits);
                return;
        }
        set_bit(TTY_PORT_INITIALIZED, bits);
}

/*
 * Return true if TTY_PORT_KOPENED ("device exclusively opened by kernel")
 * is set in port->iflags.
 */
static inline bool tty_port_kopened(struct tty_port *port)
{
        unsigned long *bits = &port->iflags;

        return test_bit(TTY_PORT_KOPENED, bits);
}

/* Set (@val true) or clear (@val false) TTY_PORT_KOPENED in port->iflags. */
static inline void tty_port_set_kopened(struct tty_port *port, bool val)
{
        unsigned long *bits = &port->iflags;

        if (!val) {
                clear_bit(TTY_PORT_KOPENED, bits);
                return;
        }
        set_bit(TTY_PORT_KOPENED, bits);
}

extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
extern int tty_port_carrier_raised(struct tty_port *port);
extern void tty_port_raise_dtr_rts(struct tty_port *port);
extern void tty_port_lower_dtr_rts(struct tty_port *port);
extern void tty_port_hangup(struct tty_port *port);
extern void tty_port_tty_hangup(struct tty_port *port, bool check_clocal);
extern void tty_port_tty_wakeup(struct tty_port *port);
extern int tty_port_block_til_ready(struct tty_port *port,
                                struct tty_struct *tty, struct file *filp);
extern int tty_port_close_start(struct tty_port *port,
                                struct tty_struct *tty, struct file *filp);
extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty);
extern void tty_port_close(struct tty_port *port,
                                struct tty_struct *tty, struct file *filp);
extern int tty_port_install(struct tty_port *port, struct tty_driver *driver,
                                struct tty_struct *tty);
extern int tty_port_open(struct tty_port *port,
                                struct tty_struct *tty, struct file *filp);
/*
 * Number of users of @port: its usage count plus the number of openers
 * currently waiting (blocked_open).
 */
static inline int tty_port_users(struct tty_port *port)
{
        int opened = port->count;
        int waiting = port->blocked_open;

        return opened + waiting;
}

extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
extern int tty_unregister_ldisc(int disc);
extern int tty_set_ldisc(struct tty_struct *tty, int disc);
extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
                                 char *f, int count);

/* n_tty.c */
extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
#ifdef CONFIG_TTY
extern void __init n_tty_init(void);
#else
/* No tty layer configured: initialisation is a no-op. */
static inline void n_tty_init(void)
{
}
#endif

/* tty_audit.c */
#ifdef CONFIG_AUDIT
extern void tty_audit_exit(void);
extern void tty_audit_fork(struct signal_struct *sig);
extern int tty_audit_push(void);
#else
/* CONFIG_AUDIT is not set: tty auditing hooks collapse to no-ops. */
static inline void tty_audit_exit(void) { }
static inline void tty_audit_fork(struct signal_struct *sig) { }
/* Always reports 0 when auditing is compiled out. */
static inline int tty_audit_push(void) { return 0; }
#endif

/* tty_ioctl.c */
extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
                       unsigned int cmd, unsigned long arg);

/* vt.c */

extern int vt_ioctl(struct tty_struct *tty,
                    unsigned int cmd, unsigned long arg);

extern long vt_compat_ioctl(struct tty_struct *tty,
                     unsigned int cmd, unsigned long arg);

/* tty_mutex.c */
/* functions for preparation of BKL removal */
extern void tty_lock(struct tty_struct *tty);
extern int  tty_lock_interruptible(struct tty_struct *tty);
extern void tty_unlock(struct tty_struct *tty);
extern void tty_lock_slave(struct tty_struct *tty);
extern void tty_unlock_slave(struct tty_struct *tty);
extern void tty_set_lock_subclass(struct tty_struct *tty);

#ifdef CONFIG_PROC_FS
extern void proc_tty_register_driver(struct tty_driver *);
extern void proc_tty_unregister_driver(struct tty_driver *);
#else
/* CONFIG_PROC_FS is not set: driver (un)registration with procfs is a no-op. */
static inline void proc_tty_register_driver(struct tty_driver *d)
{
}

static inline void proc_tty_unregister_driver(struct tty_driver *d)
{
}
#endif

#endif


























































































































    1 












    1 
    1 
    1 
    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * https://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>

/* Table size for an n-bit hash, i.e. 2^n as a u32; power-of-two sizes work best */
#define jhash_size(n)   ((u32)1 << (n))
/* Mask for an n-bit table: use (value & jhash_mask(n)) instead of (value % size) */
#define jhash_mask(n)   (jhash_size(n) - 1)

/*
 * __jhash_mix -- mix 3 32-bit values reversibly.
 *
 * Expands to a brace-enclosed block that updates a, b and c in place, so
 * all three arguments must be modifiable lvalues.  Six rounds, each one a
 * subtract, an xor with a rotated word, and an add; the rotation counts
 * are 4, 6, 8, 16, 19 and 4.
 */
#define __jhash_mix(a, b, c)            \
{                                       \
        a -= c;                         \
        a ^= rol32(c, 4);               \
        c += b;                         \
                                        \
        b -= a;                         \
        b ^= rol32(a, 6);               \
        a += c;                         \
                                        \
        c -= b;                         \
        c ^= rol32(b, 8);               \
        b += a;                         \
                                        \
        a -= c;                         \
        a ^= rol32(c, 16);              \
        c += b;                         \
                                        \
        b -= a;                         \
        b ^= rol32(a, 19);              \
        a += c;                         \
                                        \
        c -= b;                         \
        c ^= rol32(b, 4);               \
        b += a;                         \
}

/*
 * __jhash_final - final mixing of 3 32-bit values (a,b,c) into c
 *
 * Expands to a brace-enclosed block that updates a, b and c in place, so
 * all three arguments must be modifiable lvalues.  Seven rounds, each one
 * an xor followed by subtracting a rotated word; the rotation counts are
 * 14, 11, 25, 16, 4, 14 and 24.  Callers read the result from c.
 */
#define __jhash_final(a, b, c)          \
{                                       \
        c ^= b;                         \
        c -= rol32(b, 14);              \
                                        \
        a ^= c;                         \
        a -= rol32(c, 11);              \
                                        \
        b ^= a;                         \
        b -= rol32(a, 25);              \
                                        \
        c ^= b;                         \
        c -= rol32(b, 16);              \
                                        \
        a ^= c;                         \
        a -= rol32(c, 4);               \
                                        \
        b ^= a;                         \
        b -= rol32(a, 14);              \
                                        \
        c ^= b;                         \
        c -= rol32(b, 24);              \
}

/*
 * An arbitrary initial parameter: every hash routine in this header adds
 * it, together with the caller's initval, into the starting internal state.
 */
#define JHASH_INITVAL                0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes as key
 * @length: the length of the key, in bytes
 * @initval: the previous hash, or an arbitrary value
 *
 * The generic version, hashes an arbitrary sequence of bytes.
 * No alignment or length assumptions are made about the input key.
 *
 * Returns the hash value of the key. The result depends on endianness.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
        const u8 *bytes = key;
        u32 a, b, c;

        /* Seed all three state words with the same value */
        a = JHASH_INITVAL + length + initval;
        b = a;
        c = a;

        /* Consume 12-byte blocks for as long as more than 12 bytes remain */
        for (; length > 12; length -= 12, bytes += 12) {
                a += __get_unaligned_cpu32(bytes);
                b += __get_unaligned_cpu32(bytes + 4);
                c += __get_unaligned_cpu32(bytes + 8);
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: the state is returned without final mixing */
        if (!length)
                return c;

        /*
         * Last block of 1..12 bytes: each remaining byte is shifted into
         * its position within a, b or c, highest index first.
         */
        switch (length) {
        case 12:
                c += (u32)bytes[11] << 24;
                fallthrough;
        case 11:
                c += (u32)bytes[10] << 16;
                fallthrough;
        case 10:
                c += (u32)bytes[9] << 8;
                fallthrough;
        case 9:
                c += bytes[8];
                fallthrough;
        case 8:
                b += (u32)bytes[7] << 24;
                fallthrough;
        case 7:
                b += (u32)bytes[6] << 16;
                fallthrough;
        case 6:
                b += (u32)bytes[5] << 8;
                fallthrough;
        case 5:
                b += bytes[4];
                fallthrough;
        case 4:
                a += (u32)bytes[3] << 24;
                fallthrough;
        case 3:
                a += (u32)bytes[2] << 16;
                fallthrough;
        case 2:
                a += (u32)bytes[1] << 8;
                fallthrough;
        case 1:
                a += bytes[0];
                break;
        }
        __jhash_final(a, b, c);

        return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
        u32 a, b, c;

        /* Seed all three state words; the length is folded in as length << 2 */
        a = JHASH_INITVAL + (length << 2) + initval;
        b = a;
        c = a;

        /* Consume three words at a time for as long as more than 3 remain */
        for (; length > 3; length -= 3, k += 3) {
                a += k[0];
                b += k[1];
                c += k[2];
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: the state is returned without final mixing */
        if (!length)
                return c;

        /* One to three words are left here; add them and do the final mix */
        if (length > 2)
                c += k[2];
        if (length > 1)
                b += k[1];
        a += k[0];
        __jhash_final(a, b, c);

        return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 *
 * Adds initval to each of the three words, runs the final mixing step
 * and returns the resulting c.
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
        c += initval;
        b += initval;
        a += initval;
        __jhash_final(a, b, c);

        return c;
}

/*
 * jhash_3words - hash three u32 words
 *
 * The seed combines initval, JHASH_INITVAL and (3 << 2), the same
 * (length << 2) term jhash2() folds in for a key of three words.
 */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (3 << 2);

        return __jhash_nwords(a, b, c, seed);
}

/*
 * jhash_2words - hash two u32 words
 *
 * The missing third word is passed as 0; the seed combines initval,
 * JHASH_INITVAL and (2 << 2).
 */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (2 << 2);

        return __jhash_nwords(a, b, 0, seed);
}

/*
 * jhash_1word - hash a single u32 word
 *
 * The two missing words are passed as 0; the seed combines initval,
 * JHASH_INITVAL and (1 << 2).
 */
static inline u32 jhash_1word(u32 a, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (1 << 2);

        return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                The User Datagram Protocol (UDP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 * Fixes:
 *                Alan Cox        :        verify_area() calls
 *                Alan Cox        :         stopped close while in use off icmp
 *                                        messages. Not a fix but a botch that
 *                                        for udp at least is 'valid'.
 *                Alan Cox        :        Fixed icmp handling properly
 *                Alan Cox        :         Correct error for oversized datagrams
 *                Alan Cox        :        Tidied select() semantics.
 *                Alan Cox        :        udp_err() fixed properly, also now
 *                                        select and read wake correctly on errors
 *                Alan Cox        :        udp_send verify_area moved to avoid mem leak
 *                Alan Cox        :        UDP can count its memory
 *                Alan Cox        :        send to an unknown connection causes
 *                                        an ECONNREFUSED off the icmp, but
 *                                        does NOT close.
 *                Alan Cox        :        Switched to new sk_buff handlers. No more backlog!
 *                Alan Cox        :        Using generic datagram code. Even smaller and the PEEK
 *                                        bug no longer crashes it.
 *                Fred Van Kempen        :         Net2e support for sk->broadcast.
 *                Alan Cox        :        Uses skb_free_datagram
 *                Alan Cox        :        Added get/set sockopt support.
 *                Alan Cox        :        Broadcasting without option set returns EACCES.
 *                Alan Cox        :        No wakeup calls. Instead we now use the callbacks.
 *                Alan Cox        :        Use ip_tos and ip_ttl
 *                Alan Cox        :        SNMP Mibs
 *                Alan Cox        :        MSG_DONTROUTE, and 0.0.0.0 support.
 *                Matt Dillon        :        UDP length checks.
 *                Alan Cox        :        Smarter af_inet used properly.
 *                Alan Cox        :        Use new kernel side addressing.
 *                Alan Cox        :        Incorrect return on truncated datagram receive.
 *        Arnt Gulbrandsen         :        New udp_send and stuff
 *                Alan Cox        :        Cache last socket
 *                Alan Cox        :        Route cache
 *                Jon Peatfield        :        Minor efficiency fix to sendto().
 *                Mike Shaver        :        RFC1122 checks.
 *                Alan Cox        :        Nonblocking error fix.
 *        Willy Konynenberg        :        Transparent proxying support.
 *                Mike McLagan        :        Routing by source
 *                David S. Miller        :        New socket lookup architecture.
 *                                        Last socket cache retained as it
 *                                        does have a high hit rate.
 *                Olaf Kirch        :        Don't linearise iovec on sendmsg.
 *                Andi Kleen        :        Some cleanups, cache destination entry
 *                                        for connect.
 *        Vitaly E. Lavrov        :        Transparent proxy revived after year coma.
 *                Melvin Smith        :        Check msg_name not msg_namelen in sendto(),
 *                                        return ENOTCONN for unconnected sockets (POSIX)
 *                Janos Farkas        :        don't deliver multi/broadcasts to a different
 *                                        bound-to-device socket
 *        Hirokazu Takahashi        :        HW checksumming for outgoing UDP
 *                                        datagrams.
 *        Hirokazu Takahashi        :        sendfile() on UDP works now.
 *                Arnaldo C. Melo :        convert /proc/net/udp to seq_file
 *        YOSHIFUJI Hideaki @USAGI and:        Support IPV6_V6ONLY socket option, which
 *        Alexey Kuznetsov:                allow both IPv4 and IPv6 sockets to bind
 *                                        a single port at the same time.
 *        Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
 *        James Chapman                :        Add L2TP encapsulation type.
 */

#define pr_fmt(fmt) "UDP: " fmt

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
#include <linux/btf_ids.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif

/* The udp_table instance defined by this file; exported to other kernel code. */
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);

/*
 * Three-entry sysctl array, exported.
 * NOTE(review): presumably the min/pressure/max memory thresholds for UDP;
 * confirm against the sysctl table that references it.
 */
long sysctl_udp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_udp_mem);

/*
 * Exported atomic counter.
 * NOTE(review): presumably the amount of memory currently charged to UDP
 * sockets; its units are not visible here - confirm.
 */
atomic_long_t udp_memory_allocated;
EXPORT_SYMBOL(udp_memory_allocated);

/* Number of distinct values a 16-bit port can take */
#define MAX_UDP_PORTS 65536
/* Port values per chain when the hash table has UDP_HTABLE_SIZE_MIN slots */
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)

/*
 * Walk the primary hash chain @hslot looking for sockets that conflict
 * with @sk.
 *
 * Without a @bitmap only port @num is considered: the result is 1 as soon
 * as a conflicting socket is seen and 0 otherwise (a compatible reuseport
 * peer ends the walk with 0).
 *
 * With a @bitmap every port on the chain is considered: for each conflicting
 * socket the bit (port >> @log) is set and the function returns 0.
 */
static int udp_lib_lport_inuse(struct net *net, __u16 num,
                               const struct udp_hslot *hslot,
                               unsigned long *bitmap,
                               struct sock *sk, unsigned int log)
{
        kuid_t uid = sock_i_uid(sk);
        struct sock *other;

        sk_for_each(other, &hslot->head) {
                if (!net_eq(sock_net(other), net) || other == sk)
                        continue;
                /* In bitmap mode the port number is not a filter. */
                if (!bitmap && udp_sk(other)->udp_port_hash != num)
                        continue;
                /* Both sides asked for address reuse: no conflict. */
                if (other->sk_reuse && sk->sk_reuse)
                        continue;
                /* Bound to two different devices: no conflict. */
                if (other->sk_bound_dev_if && sk->sk_bound_dev_if &&
                    other->sk_bound_dev_if != sk->sk_bound_dev_if)
                        continue;
                if (!inet_rcv_saddr_equal(sk, other, true))
                        continue;

                if (other->sk_reuseport && sk->sk_reuseport &&
                    !rcu_access_pointer(sk->sk_reuseport_cb) &&
                    uid_eq(uid, sock_i_uid(other))) {
                        if (!bitmap)
                                return 0;
                        continue;
                }

                if (!bitmap)
                        return 1;
                __set_bit(udp_sk(other)->udp_port_hash >> log, bitmap);
        }
        return 0;
}

/*
 * Secondary-hash variant of the "is port @num taken?" check.
 *
 * Note: we still hold spinlock of primary hash chain, so no other writer
 * can insert/delete a socket with local_port == num.  The secondary chain
 * lock is taken here for the duration of the walk.
 *
 * Returns 1 if the first matching socket on @hslot2 conflicts with @sk,
 * 0 if it is a compatible reuseport peer or if nothing matches.
 */
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
                                struct udp_hslot *hslot2,
                                struct sock *sk)
{
        kuid_t uid = sock_i_uid(sk);
        struct sock *other;
        int in_use = 0;

        spin_lock(&hslot2->lock);
        udp_portaddr_for_each_entry(other, &hslot2->head) {
                if (!net_eq(sock_net(other), net) || other == sk)
                        continue;
                if (udp_sk(other)->udp_port_hash != num)
                        continue;
                if (other->sk_reuse && sk->sk_reuse)
                        continue;
                if (other->sk_bound_dev_if && sk->sk_bound_dev_if &&
                    other->sk_bound_dev_if != sk->sk_bound_dev_if)
                        continue;
                if (!inet_rcv_saddr_equal(sk, other, true))
                        continue;

                /* The first match decides; the walk stops here. */
                in_use = !(other->sk_reuseport && sk->sk_reuseport &&
                           !rcu_access_pointer(sk->sk_reuseport_cb) &&
                           uid_eq(uid, sock_i_uid(other)));
                break;
        }
        spin_unlock(&hslot2->lock);

        return in_use;
}

/*
 * Attach @sk to the reuseport group of a matching socket on @hslot, or
 * allocate a new group for it when no such socket exists.
 *
 * Returns whatever reuseport_add_sock() / reuseport_alloc() return.
 */
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
        struct net *net = sock_net(sk);
        kuid_t uid = sock_i_uid(sk);
        struct sock *peer;

        sk_for_each(peer, &hslot->head) {
                if (!net_eq(sock_net(peer), net) || peer == sk)
                        continue;
                if (peer->sk_family != sk->sk_family ||
                    ipv6_only_sock(peer) != ipv6_only_sock(sk))
                        continue;
                if (udp_sk(peer)->udp_port_hash != udp_sk(sk)->udp_port_hash ||
                    peer->sk_bound_dev_if != sk->sk_bound_dev_if)
                        continue;
                if (!peer->sk_reuseport || !uid_eq(uid, sock_i_uid(peer)))
                        continue;
                if (!inet_rcv_saddr_equal(sk, peer, false))
                        continue;

                /* Same port, device, owner and address: join its group. */
                return reuseport_add_sock(sk, peer, inet_rcv_saddr_any(sk));
        }

        return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

/**
 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
 *
 *  @sk:          socket struct in question
 *  @snum:        port number to look up; 0 asks for a free port to be
 *                picked from the local port range
 *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
 *                   with NULL address
 *
 *  On success the chosen port is stored in @sk and, if @sk is not hashed
 *  yet, @sk is linked into the primary and secondary hash chains.
 *
 *  Return: 0 on success, 1 if no usable port was found or @sk could not be
 *  added to a reuseport group.
 */
int udp_lib_get_port(struct sock *sk, unsigned short snum,
                     unsigned int hash2_nulladdr)
{
        struct udp_hslot *hslot, *hslot2;
        struct udp_table *udptable = sk->sk_prot->h.udp_table;
        /* Pessimistic default; only cleared once everything succeeded. */
        int    error = 1;
        struct net *net = sock_net(sk);

        if (!snum) {
                /* Automatic selection: search the local port range. */
                int low, high, remaining;
                unsigned int rand;
                unsigned short first, last;
                DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);

                inet_get_local_port_range(net, &low, &high);
                remaining = (high - low) + 1;

                /* Random starting port inside [low, high]. */
                rand = prandom_u32();
                first = reciprocal_scale(rand, remaining) + low;
                /*
                 * force rand to be an odd multiple of UDP_HTABLE_SIZE
                 */
                rand = (rand | 1) * (udptable->mask + 1);
                /* The outer loop tries mask + 1 consecutive starting ports. */
                last = first + udptable->mask + 1;
                do {
                        hslot = udp_hashslot(udptable, net, first);
                        bitmap_zero(bitmap, PORTS_PER_CHAIN);
                        spin_lock_bh(&hslot->lock);
                        /* Mark every conflicting port of this chain in bitmap. */
                        udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
                                            udptable->log);

                        snum = first;
                        /*
                         * Iterate on all possible values of snum for this hash.
                         * Using steps of an odd multiple of UDP_HTABLE_SIZE
                         * give us randomization and full range coverage.
                         */
                        do {
                                /* Leaves the loop with hslot->lock still held. */
                                if (low <= snum && snum <= high &&
                                    !test_bit(snum >> udptable->log, bitmap) &&
                                    !inet_is_local_reserved_port(net, snum))
                                        goto found;
                                snum += rand;
                        } while (snum != first);
                        spin_unlock_bh(&hslot->lock);
                        cond_resched();
                } while (++first != last);
                /* Every candidate was rejected; no lock is held here. */
                goto fail;
        } else {
                /* Explicit port: check it for conflicts. */
                hslot = udp_hashslot(udptable, net, snum);
                spin_lock_bh(&hslot->lock);
                /* For longer primary chains, try the secondary hash instead. */
                if (hslot->count > 10) {
                        int exist;
                        unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;

                        slot2          &= udptable->mask;
                        hash2_nulladdr &= udptable->mask;

                        hslot2 = udp_hashslot2(udptable, slot2);
                        /* The primary chain is the shorter one: scan it. */
                        if (hslot->count < hslot2->count)
                                goto scan_primary_hash;

                        /* Check the socket's own slot, then the NULL-address slot. */
                        exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
                        if (!exist && (hash2_nulladdr != slot2)) {
                                hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
                                exist = udp_lib_lport_inuse2(net, snum, hslot2,
                                                             sk);
                        }
                        if (exist)
                                goto fail_unlock;
                        else
                                goto found;
                }
scan_primary_hash:
                if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
                        goto fail_unlock;
        }
found:
        /* Reached with hslot->lock held. */
        inet_sk(sk)->inet_num = snum;
        udp_sk(sk)->udp_port_hash = snum;
        /* Fold the port into the secondary hash value. */
        udp_sk(sk)->udp_portaddr_hash ^= snum;
        if (sk_unhashed(sk)) {
                if (sk->sk_reuseport &&
                    udp_reuseport_add_sock(sk, hslot)) {
                        /* Undo the three assignments above (XOR twice restores). */
                        inet_sk(sk)->inet_num = 0;
                        udp_sk(sk)->udp_port_hash = 0;
                        udp_sk(sk)->udp_portaddr_hash ^= snum;
                        goto fail_unlock;
                }

                sock_set_flag(sk, SOCK_RCU_FREE);

                /* Link into the primary chain. */
                sk_add_node_rcu(sk, &hslot->head);
                hslot->count++;
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

                /* Link into the secondary chain under its own lock. */
                hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
                spin_lock(&hslot2->lock);
                /* IPv6 reuseport sockets go to the tail, all others to the head. */
                if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
                    sk->sk_family == AF_INET6)
                        hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
                                           &hslot2->head);
                else
                        hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
                                           &hslot2->head);
                hslot2->count++;
                spin_unlock(&hslot2->lock);
        }

        error = 0;
fail_unlock:
        spin_unlock_bh(&hslot->lock);
fail:
        return error;
}
EXPORT_SYMBOL(udp_lib_get_port);

/*
 * IPv4 front end for udp_lib_get_port(): computes the NULL-address
 * secondary hash for @snum and stores the port-less (partial) secondary
 * hash of the socket's receive address in the socket before delegating.
 */
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
        struct net *net = sock_net(sk);
        unsigned int hash2_nulladdr, hash2_partial;

        hash2_nulladdr = ipv4_portaddr_hash(net, htonl(INADDR_ANY), snum);
        hash2_partial = ipv4_portaddr_hash(net, inet_sk(sk)->inet_rcv_saddr, 0);

        /* precompute partial secondary hash */
        udp_sk(sk)->udp_portaddr_hash = hash2_partial;

        return udp_lib_get_port(sk, snum, hash2_nulladdr);
}

/*
 * Rate how well @sk matches an incoming datagram.
 *
 * Returns -1 when @sk cannot receive the datagram at all, otherwise a
 * positive score: base 2 for PF_INET sockets (1 otherwise), +4 each for a
 * matching connected address, a matching connected port and a device
 * binding, +1 when the socket's incoming CPU is the current one.
 */
static int compute_score(struct sock *sk, struct net *net,
                         __be32 saddr, __be16 sport,
                         __be32 daddr, unsigned short hnum,
                         int dif, int sdif)
{
        struct inet_sock *inet = inet_sk(sk);
        int score;

        /* Hard requirements: namespace, port, not v6-only, local address. */
        if (!net_eq(sock_net(sk), net))
                return -1;
        if (udp_sk(sk)->udp_port_hash != hnum)
                return -1;
        if (ipv6_only_sock(sk))
                return -1;
        if (sk->sk_rcv_saddr != daddr)
                return -1;

        score = 1;
        if (sk->sk_family == PF_INET)
                score = 2;

        /* A connected remote address must match and is rewarded. */
        if (inet->inet_daddr) {
                if (inet->inet_daddr != saddr)
                        return -1;
                score += 4;
        }

        /* Likewise for a connected remote port. */
        if (inet->inet_dport) {
                if (inet->inet_dport != sport)
                        return -1;
                score += 4;
        }

        if (!udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                return -1;
        if (sk->sk_bound_dev_if)
                score += 4;

        if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
                score += 1;

        return score;
}

/*
 * Hash a UDP 4-tuple.  The seed is a lazily initialised random secret
 * combined with the per-namespace hash mix.
 */
INDIRECT_CALLABLE_SCOPE
u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
                const __be32 faddr, const __be16 fport)
{
        static u32 udp_ehash_secret __read_mostly;
        u32 initval;

        net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
        initval = udp_ehash_secret + net_hash_mix(net);

        return __inet_ehashfn(laddr, lport, faddr, fport, initval);
}

/* called with rcu_read_lock()
 *
 * Scan one secondary hash chain (@hslot2) for the socket that scores best
 * against the given addresses/ports according to compute_score().
 *
 * Returns the selected socket, NULL if nothing on the chain matches, or
 * whatever inet_lookup_reuseport() returned when a reuseport group without
 * connections made the choice (the code below allows for that being an
 * error pointer).
 */
static struct sock *udp4_lib_lookup2(struct net *net,
                                     __be32 saddr, __be16 sport,
                                     __be32 daddr, unsigned int hnum,
                                     int dif, int sdif,
                                     struct udp_hslot *hslot2,
                                     struct sk_buff *skb)
{
        struct sock *sk, *result;
        int score, badness;
        bool need_rescore;

        result = NULL;
        /* Best score so far; non-matching sockets score -1 and never win. */
        badness = 0;
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                need_rescore = false;
rescore:
                /* On the second pass score the reuseport pick, not sk. */
                score = compute_score(need_rescore ? result : sk, net, saddr,
                                      sport, daddr, hnum, dif, sdif);
                if (score > badness) {
                        badness = score;

                        /* Rescoring done: result already holds the pick. */
                        if (need_rescore)
                                continue;

                        /* Established sockets are taken as they are. */
                        if (sk->sk_state == TCP_ESTABLISHED) {
                                result = sk;
                                continue;
                        }

                        result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
                                                       saddr, sport, daddr, hnum, udp_ehashfn);
                        /* No reuseport selection was made: keep sk itself. */
                        if (!result) {
                                result = sk;
                                continue;
                        }

                        /* Fall back to scoring if group has connections */
                        if (!reuseport_has_conns(sk))
                                return result;

                        /* Reuseport logic returned an error, keep original score. */
                        if (IS_ERR(result))
                                continue;

                        /* compute_score is too long of a function to be
                         * inlined, and calling it again here yields
                         * measurable overhead for some
                         * workloads. Work around it by jumping
                         * backwards to rescore 'result'.
                         */
                        need_rescore = true;
                        goto rescore;
                }
        }
        return result;
}

/*
 * Run the BPF socket lookup for an incoming UDP datagram.
 *
 * Returns NULL for tables other than udp_table, otherwise the socket (or
 * error pointer/NULL) produced by bpf_sk_lookup_run_v4().  Unless the BPF
 * run asked to skip reuseport selection, a valid socket is replaced by the
 * reuseport pick when inet_lookup_reuseport() yields one.
 */
static struct sock *udp4_lookup_run_bpf(struct net *net,
                                        struct udp_table *udptable,
                                        struct sk_buff *skb,
                                        __be32 saddr, __be16 sport,
                                        __be32 daddr, u16 hnum)
{
        struct sock *selected, *picked;

        if (udptable != &udp_table)
                return NULL; /* only UDP is supported */

        if (bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
                                 saddr, sport, daddr, hnum, &selected))
                return selected;
        if (IS_ERR_OR_NULL(selected))
                return selected;

        picked = inet_lookup_reuseport(net, selected, skb,
                                       sizeof(struct udphdr),
                                       saddr, sport, daddr, hnum, udp_ehashfn);

        return picked ? picked : selected;
}

/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 *
 * Lookup order: sockets bound to @daddr, then an optional BPF redirect,
 * then sockets bound to INADDR_ANY.  Error pointers coming out of the
 * helpers are turned into NULL before returning.
 */
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
                __be16 sport, __be32 daddr, __be16 dport, int dif,
                int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
        const unsigned short hnum = ntohs(dport);
        struct sock *found, *redirected;
        struct udp_hslot *chain;
        unsigned int slot;

        /* Lookup connected or non-wildcard socket */
        slot = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
        chain = &udptable->hash2[slot];
        found = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif,
                                 chain, skb);
        if (!IS_ERR_OR_NULL(found) && found->sk_state == TCP_ESTABLISHED)
                return found;

        /* Lookup redirect from BPF */
        if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
                redirected = udp4_lookup_run_bpf(net, udptable, skb,
                                                 saddr, sport, daddr, hnum);
                if (redirected)
                        return IS_ERR(redirected) ? NULL : redirected;
        }

        /* Only fall through to wildcard sockets if the first pass gave
         * neither a socket nor an error.
         */
        if (!found) {
                slot = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
                       udptable->mask;
                chain = &udptable->hash2[slot];
                found = udp4_lib_lookup2(net, saddr, sport,
                                         htonl(INADDR_ANY), hnum, dif, sdif,
                                         chain, skb);
        }

        return IS_ERR(found) ? NULL : found;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);

/*
 * Look up the receiving socket for @skb in @udptable, taking addresses
 * from the skb's IP header and the namespace from its device.  The skb
 * itself is handed on to the lookup.
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
                                                 __be16 sport, __be16 dport,
                                                 struct udp_table *udptable)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int dif = inet_iif(skb);
        int sdif = inet_sdif(skb);

        return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport,
                                 dif, sdif, udptable, skb);
}

/*
 * Exported lookup of the UDP socket matching @skb's IP header and the
 * given ports.  Always searches udp_table and passes no skb to the
 * underlying lookup.
 */
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
                                 __be16 sport, __be16 dport)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int dif = inet_iif(skb);
        int sdif = inet_sdif(skb);

        return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport,
                                 dif, sdif, &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);

/* Must be called under rcu_read_lock().
 * Does increment socket refcount: the returned socket carries a reference
 * taken here.  NULL is returned when nothing matches or when the socket's
 * refcount had already dropped to zero.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                             __be32 daddr, __be16 dport, int dif)
{
        struct sock *sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
                                            dif, 0, &udp_table, NULL);

        if (!sk)
                return NULL;
        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif

/*
 * Decide whether @sk should receive a multicast datagram described by
 * the local/remote address and port pair, the incoming device indices and
 * the host-order port @hnum.  Returns true only if every socket-level
 * filter passes and ip_mc_sf_allow() permits the source.
 */
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
                                       __be16 loc_port, __be32 loc_addr,
                                       __be16 rmt_port, __be32 rmt_addr,
                                       int dif, int sdif, unsigned short hnum)
{
        struct inet_sock *inet = inet_sk(sk);

        if (!net_eq(sock_net(sk), net))
                return false;
        if (udp_sk(sk)->udp_port_hash != hnum)
                return false;
        /* A connected remote address/port, if set, must match. */
        if (inet->inet_daddr && inet->inet_daddr != rmt_addr)
                return false;
        if (inet->inet_dport != rmt_port && inet->inet_dport)
                return false;
        /* So must a bound local address. */
        if (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr)
                return false;
        if (ipv6_only_sock(sk))
                return false;
        if (!udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                return false;

        return ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif) != 0;
}

/* Static branch key, defined false.  udp_encap_enable() increments it and
 * udp_encap_disable() decrements it; __udp4_lib_err() tests it before
 * trying to match an ICMP error against UDP tunnels.
 */
DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
EXPORT_SYMBOL(udp_encap_needed_key);

#if IS_ENABLED(CONFIG_IPV6)
/* Separate key for IPv6, only defined when CONFIG_IPV6 is enabled. */
DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
EXPORT_SYMBOL(udpv6_encap_needed_key);
#endif

/* Take one reference on udp_encap_needed_key (static_branch_inc). */
void udp_encap_enable(void)
{
        static_branch_inc(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_enable);

/* Drop one reference on udp_encap_needed_key (static_branch_dec). */
void udp_encap_disable(void)
{
        static_branch_dec(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_disable);

/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
 * through error handlers in encapsulations looking for a match.
 *
 * Returns 0 as soon as one registered err_handler returns 0 for @skb,
 * -ENOENT if none does.
 */
static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
{
        int slot;

        for (slot = 0; slot < MAX_IPTUN_ENCAP_OPS; slot++) {
                const struct ip_tunnel_encap_ops *ops;
                int (*err_cb)(struct sk_buff *skb, u32 info);

                ops = rcu_dereference(iptun_encaps[slot]);
                /* Empty slots and ops without an error handler are skipped. */
                err_cb = ops ? ops->err_handler : NULL;
                if (err_cb && err_cb(skb, info) == 0)
                        return 0;
        }

        return -ENOENT;
}

/* Try to match ICMP errors to UDP tunnels by looking up a socket without
 * reversing source and destination port: this will match tunnels that force the
 * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
 * lwtunnels might actually break this assumption by being configured with
 * different destination ports on endpoints, in this case we won't be able to
 * trace ICMP messages back to them.
 *
 * If this doesn't match any socket, probe tunnels with arbitrary destination
 * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
 * we've sent packets to won't necessarily match the local destination port.
 *
 * Then ask the tunnel implementation to match the error against a valid
 * association.
 *
 * Return an error if we can't find a match, the socket if we need further
 * processing, zero otherwise.
 *
 * The skb's network and transport header offsets are changed for the
 * duration of the call and put back before returning.
 */
static struct sock *__udp4_lib_err_encap(struct net *net,
                                         const struct iphdr *iph,
                                         struct udphdr *uh,
                                         struct udp_table *udptable,
                                         struct sk_buff *skb, u32 info)
{
        const int saved_network = skb_network_offset(skb);
        const int saved_transport = skb_transport_offset(skb);
        struct sock *sk;

        /* Network header needs to point to the outer IPv4 header inside ICMP */
        skb_reset_network_header(skb);

        /* Transport header needs to point to the UDP header */
        skb_set_transport_header(skb, iph->ihl << 2);

        sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
                               iph->saddr, uh->dest, skb->dev->ifindex, 0,
                               udptable, NULL);
        if (sk) {
                int (*lookup)(struct sock *sk, struct sk_buff *skb);

                /* Keep the socket only if its tunnel claims the error. */
                lookup = READ_ONCE(udp_sk(sk)->encap_err_lookup);
                if (lookup && !lookup(sk, skb))
                        goto restore;
        }

        /* No usable socket: let the socket-less tunnel handlers decide. */
        sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));

restore:
        skb_set_transport_header(skb, saved_transport);
        skb_set_network_header(skb, saved_network);

        return sk;
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.
 * Header points to the ip header of the error packet. We move
 * on past this. Then (as it used to claim before adjustment)
 * header points to the first 8 bytes of the udp header.  We need
 * to find the appropriate port.
 *
 * NOTE(review): the "err" wording above does not correspond to any
 * parameter of the current signature.
 *
 * Returns 0, except when neither a socket nor a tunnel matches the error:
 * then the negative errno carried by the failed lookup is returned.
 */

int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
        struct inet_sock *inet;
        /* skb->data is read as the IP header; the UDP header follows it. */
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        bool tunnel = false;
        struct sock *sk;
        int harderr;
        int err;
        struct net *net = dev_net(skb->dev);

        /* The header's destination is passed as the remote side and its
         * source as the local side of the lookup.
         */
        sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
                               iph->saddr, uh->source, skb->dev->ifindex,
                               inet_sdif(skb), udptable, NULL);
        if (!sk) {
                /* No socket for error: try tunnels before discarding */
                sk = ERR_PTR(-ENOENT);
                if (static_branch_unlikely(&udp_encap_needed_key)) {
                        sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
                                                  info);
                        /* NULL: nothing further to do for this error. */
                        if (!sk)
                                return 0;
                }

                if (IS_ERR(sk)) {
                        __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                        return PTR_ERR(sk);
                }

                /* A tunnel socket was found. */
                tunnel = true;
        }

        err = 0;
        harderr = 0;
        inet = inet_sk(sk);

        /* Translate the ICMP type/code into an errno and a hard-error flag. */
        switch (type) {
        default:
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        case ICMP_SOURCE_QUENCH:
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                harderr = 1;
                break;
        case ICMP_DEST_UNREACH:
                if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
                        ipv4_sk_update_pmtu(skb, sk, info);
                        /* Only reported when pmtudisc is not IP_PMTUDISC_DONT. */
                        if (inet->pmtudisc != IP_PMTUDISC_DONT) {
                                err = EMSGSIZE;
                                harderr = 1;
                                break;
                        }
                        goto out;
                }
                err = EHOSTUNREACH;
                /* Codes in range are mapped through icmp_err_convert[]. */
                if (code <= NR_ICMP_UNREACH) {
                        harderr = icmp_err_convert[code].fatal;
                        err = icmp_err_convert[code].errno;
                }
                break;
        case ICMP_REDIRECT:
                ipv4_sk_redirect(skb, sk);
                goto out;
        }

        /*
         *      RFC1122: OK.  Passes ICMP errors back to application, as per
         *        4.1.3.3.
         */
        if (tunnel) {
                /* ...not for tunnels though: we don't have a sending socket */
                goto out;
        }
        /* With recverr unset, only hard errors on sockets in
         * TCP_ESTABLISHED state are reported; with it set, the error is
         * also passed to ip_icmp_error().
         */
        if (!inet->recverr) {
                if (!harderr || sk->sk_state != TCP_ESTABLISHED)
                        goto out;
        } else
                ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));

        /* Record the error on the socket and call its error report hook. */
        sk->sk_err = err;
        sk->sk_error_report(sk);
out:
        return 0;
}

/* ICMP error entry point for plain UDP: runs __udp4_lib_err() on udp_table. */
int udp_err(struct sk_buff *skb, u32 info)
{
        return __udp4_lib_err(skb, info, &udp_table);
}

/*
 * Throw away all pending data and cancel the corking. Socket is locked.
 * Does nothing when the socket has no pending data.
 */
void udp_flush_pending_frames(struct sock *sk)
{
        struct udp_sock *up = udp_sk(sk);

        if (!up->pending)
                return;

        up->len = 0;
        up->pending = 0;
        ip_flush_pending_frames(sk);
}
EXPORT_SYMBOL(udp_flush_pending_frames);

/**
 *         udp4_hwcsum  -  handle outgoing HW checksumming
 *         @skb:         sk_buff containing the filled-in UDP header
 *                       (checksum field must be zeroed out)
 *         @src:         source IP address
 *         @dst:         destination IP address
 */
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
        struct udphdr *uh = udp_hdr(skb);
        const int offset = skb_transport_offset(skb);
        const int len = skb->len - offset;
        struct sk_buff *frag;
        __wsum csum = 0;
        int hlen = len;

        if (!skb_has_frag_list(skb)) {
                /*
                 * Only one fragment on the socket: record where the
                 * checksum field lives and store the complemented
                 * pseudo-header sum in it.
                 */
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct udphdr, check);
                uh->check = ~csum_tcpudp_magic(src, dst, len,
                                               IPPROTO_UDP, 0);
                return;
        }

        /*
         * HW-checksum won't work as there are two or more
         * fragments on the socket so that all csums of sk_buffs
         * should be together
         */
        skb_walk_frags(skb, frag) {
                csum = csum_add(csum, frag->csum);
                hlen -= frag->len;
        }

        /* hlen is now the transport length minus the frag-list buffers. */
        csum = skb_checksum(skb, offset, hlen, csum);
        skb->ip_summed = CHECKSUM_NONE;

        uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
        if (uh->check == 0)
                uh->check = CSUM_MANGLED_0;
}
EXPORT_SYMBOL_GPL(udp4_hwcsum);

/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
 * for the simple case like when setting the checksum for a UDP tunnel.
 *
 * Four cases, tried in this order: @nocheck (field left zero), GSO skb,
 * skb already CHECKSUM_PARTIAL, and everything else (which is switched to
 * CHECKSUM_PARTIAL here).
 */
void udp_set_csum(bool nocheck, struct sk_buff *skb,
                  __be32 saddr, __be32 daddr, int len)
{
        struct udphdr *uh = udp_hdr(skb);

        if (nocheck) {
                uh->check = 0;
                return;
        }

        if (skb_is_gso(skb)) {
                uh->check = ~udp_v4_check(len, saddr, daddr, 0);
                return;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                /* The field is cleared before lco_csum() is evaluated. */
                uh->check = 0;
                uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
                if (uh->check == 0)
                        uh->check = CSUM_MANGLED_0;
                return;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct udphdr, check);
        uh->check = ~udp_v4_check(len, saddr, daddr, 0);
}
EXPORT_SYMBOL(udp_set_csum);

/*
 * Fill in the UDP header of @skb, set up its checksum (and GSO metadata
 * when @cork->gso_size is set) and pass it to ip_send_skb().
 *
 * Returns 0 on success or a negative errno.  On the GSO validation
 * failures below the skb is freed here.  -ENOBUFS from ip_send_skb() is
 * counted and reported as success unless the socket has recverr set.
 */
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
                        struct inet_cork *cork)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct udphdr *uh;
        int err = 0;
        int is_udplite = IS_UDPLITE(sk);
        int offset = skb_transport_offset(skb);
        /* len: UDP header plus payload; datalen: payload only. */
        int len = skb->len - offset;
        int datalen = len - sizeof(*uh);
        __wsum csum = 0;

        /*
         * Create a UDP header
         */
        uh = udp_hdr(skb);
        uh->source = inet->inet_sport;
        uh->dest = fl4->fl4_dport;
        uh->len = htons(len);
        uh->check = 0;

        if (cork->gso_size) {
                const int hlen = skb_network_header_len(skb) +
                                 sizeof(struct udphdr);

                /* Headers plus one segment must fit into fragsize. */
                if (hlen + min_t(int, datalen, cork->gso_size) > cork->fragsize) {
                        kfree_skb(skb);
                        return -EMSGSIZE;
                }
                /* At most UDP_MAX_SEGMENTS segments per send. */
                if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
                /* GSO is refused when the socket has sk_no_check_tx set. */
                if (sk->sk_no_check_tx) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
                /* GSO also needs CHECKSUM_PARTIAL, no UDP-Lite and no xfrm dst. */
                if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
                    dst_xfrm(skb_dst(skb))) {
                        kfree_skb(skb);
                        return -EIO;
                }

                /* GSO metadata is set only if more than one segment is needed. */
                if (datalen > cork->gso_size) {
                        skb_shinfo(skb)->gso_size = cork->gso_size;
                        skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
                        skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
                                                                 cork->gso_size);
                }
                /* Jumps into the CHECKSUM_PARTIAL arm of the chain below. */
                goto csum_partial;
        }

        if (is_udplite)                                   /*     UDP-Lite      */
                csum = udplite_csum(skb);

        else if (sk->sk_no_check_tx) {                         /* UDP csum off */

                skb->ip_summed = CHECKSUM_NONE;
                goto send;

        } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
csum_partial:

                udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
                goto send;

        } else
                csum = udp_csum(skb);

        /* add protocol-dependent pseudo-header */
        uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
                                      sk->sk_protocol, csum);
        /* A computed value of 0 is replaced by CSUM_MANGLED_0. */
        if (uh->check == 0)
                uh->check = CSUM_MANGLED_0;

send:
        err = ip_send_skb(sock_net(sk), skb);
        if (err) {
                /* -ENOBUFS without recverr: count it and report success. */
                if (err == -ENOBUFS && !inet->recverr) {
                        UDP_INC_STATS(sock_net(sk),
                                      UDP_MIB_SNDBUFERRORS, is_udplite);
                        err = 0;
                }
        } else
                UDP_INC_STATS(sock_net(sk),
                              UDP_MIB_OUTDATAGRAMS, is_udplite);
        return err;
}

/*
 * Push out all pending data as one UDP datagram. Socket is locked.
 *
 * The pending state of the socket is cleared whether or not anything was
 * sent.  Returns the result of udp_send_skb(), or 0 when there was no skb
 * to send.
 */
int udp_push_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct udp_sock *up = udp_sk(sk);
        struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
        struct sk_buff *skb = ip_finish_skb(sk, fl4);
        int err = 0;

        if (skb)
                err = udp_send_skb(skb, fl4, &inet->cork.base);

        up->len = 0;
        up->pending = 0;

        return err;
}
EXPORT_SYMBOL(udp_push_pending_frames);

/*
 * Handle a single control message for the UDP level.
 *
 * Only UDP_SEGMENT is recognised; its payload must be exactly one __u16,
 * which is copied into *gso_size.  Returns 0 on success and -EINVAL for a
 * wrong length or any other cmsg type.
 */
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
        if (cmsg->cmsg_type != UDP_SEGMENT)
                return -EINVAL;

        if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
                return -EINVAL;

        *gso_size = *(__u16 *)CMSG_DATA(cmsg);
        return 0;
}

/*
 * Walk the control messages attached to @msg and process the SOL_UDP ones.
 *
 * Returns a negative errno if a cmsg is malformed or rejected by
 * __udp_cmsg_send().  Otherwise returns 1 when at least one cmsg of
 * another level was seen (it is left untouched here) and 0 when there
 * was none.
 */
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
{
        bool need_ip = false;
        struct cmsghdr *cmsg;

        for_each_cmsghdr(cmsg, msg) {
                int err;

                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;

                if (cmsg->cmsg_level == SOL_UDP) {
                        err = __udp_cmsg_send(cmsg, gso_size);
                        if (err)
                                return err;
                } else {
                        /* not ours: remember that someone else must parse it */
                        need_ip = true;
                }
        }

        return need_ip;
}
EXPORT_SYMBOL_GPL(udp_cmsg_send);

/*
 * udp_sendmsg - send a UDP datagram over IPv4, or append to a corked one.
 * @sk:  sending socket
 * @msg: user message; msg_name, when set, supplies the destination
 *       address, and msg_control may carry control messages
 * @len: payload length in bytes (values above 0xFFFF are rejected)
 *
 * Three paths exist:
 *  - the socket already has pending (corked) data: the payload is appended
 *    under the socket lock (do_append_data);
 *  - no corking requested: the skb is built with ip_make_skb() and sent
 *    with udp_send_skb() without taking the socket lock;
 *  - corking requested: the flow is stored in inet->cork and the payload
 *    is appended under the socket lock.
 *
 * Returns @len on success or a negative errno.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct udp_sock *up = udp_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
        struct flowi4 fl4_stack;
        struct flowi4 *fl4;
        int ulen = len;
        struct ipcm_cookie ipc;
        struct rtable *rt = NULL;
        int free = 0;
        int connected = 0;
        __be32 daddr, faddr, saddr;
        __be16 dport;
        u8  tos;
        int err, is_udplite = IS_UDPLITE(sk);
        int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
        int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
        struct sk_buff *skb;
        struct ip_options_data opt_copy;

        if (len > 0xFFFF)
                return -EMSGSIZE;

        /*
         *        Check the flags.
         */

        if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
                return -EOPNOTSUPP;

        getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

        fl4 = &inet->cork.fl.u.ip4;
        if (up->pending) {
                /*
                 * There are pending frames.
                 * The socket lock must be held while it's corked.
                 */
                lock_sock(sk);
                /* re-check under the lock: pending was first read unlocked */
                if (likely(up->pending)) {
                        if (unlikely(up->pending != AF_INET)) {
                                release_sock(sk);
                                return -EINVAL;
                        }
                        goto do_append_data;
                }
                release_sock(sk);
        }
        /* from here on ulen covers the UDP header as well as the payload */
        ulen += sizeof(struct udphdr);

        /*
         *        Get and verify the address.
         */
        if (usin) {
                if (msg->msg_namelen < sizeof(*usin))
                        return -EINVAL;
                /* AF_UNSPEC is accepted in addition to AF_INET */
                if (usin->sin_family != AF_INET) {
                        if (usin->sin_family != AF_UNSPEC)
                                return -EAFNOSUPPORT;
                }

                daddr = usin->sin_addr.s_addr;
                dport = usin->sin_port;
                if (dport == 0)
                        return -EINVAL;
        } else {
                if (sk->sk_state != TCP_ESTABLISHED)
                        return -EDESTADDRREQ;
                daddr = inet->inet_daddr;
                dport = inet->inet_dport;
                /* Open fast path for connected socket.
                   Route will not be used, if at least one option is set.
                 */
                connected = 1;
        }

        ipcm_init_sk(&ipc, inet);
        ipc.gso_size = READ_ONCE(up->gso_size);

        if (msg->msg_controllen) {
                /* a positive return means non-SOL_UDP cmsgs are present and
                 * must be handed to ip_cmsg_send()
                 */
                err = udp_cmsg_send(sk, msg, &ipc.gso_size);
                if (err > 0) {
                        err = ip_cmsg_send(sk, msg, &ipc,
                                           sk->sk_family == AF_INET6);
                        connected = 0;
                }
                if (unlikely(err < 0)) {
                        kfree(ipc.opt);
                        return err;
                }
                /* options set during cmsg processing are freed on exit */
                if (ipc.opt)
                        free = 1;
        }
        if (!ipc.opt) {
                struct ip_options_rcu *inet_opt;

                /* no per-call options: snapshot the socket's options, if
                 * any, into the on-stack opt_copy while under RCU
                 */
                rcu_read_lock();
                inet_opt = rcu_dereference(inet->inet_opt);
                if (inet_opt) {
                        memcpy(&opt_copy, inet_opt,
                               sizeof(*inet_opt) + inet_opt->opt.optlen);
                        ipc.opt = &opt_copy.opt;
                }
                rcu_read_unlock();
        }

        if (cgroup_bpf_enabled && !connected) {
                err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
                                            (struct sockaddr *)usin, &ipc.addr);
                if (err)
                        goto out_free;
                /* the destination is re-read from usin after the hook ran */
                if (usin) {
                        if (usin->sin_port == 0) {
                                /* BPF program set invalid port. Reject it. */
                                err = -EINVAL;
                                goto out_free;
                        }
                        daddr = usin->sin_addr.s_addr;
                        dport = usin->sin_port;
                }
        }

        saddr = ipc.addr;
        ipc.addr = faddr = daddr;

        /* with a source route option the flow is looked up towards the
         * option's faddr instead of daddr
         */
        if (ipc.opt && ipc.opt->opt.srr) {
                if (!daddr) {
                        err = -EINVAL;
                        goto out_free;
                }
                faddr = ipc.opt->opt.faddr;
                connected = 0;
        }
        tos = get_rttos(&ipc, inet);
        if (sock_flag(sk, SOCK_LOCALROUTE) ||
            (msg->msg_flags & MSG_DONTROUTE) ||
            (ipc.opt && ipc.opt->opt.is_strictroute)) {
                tos |= RTO_ONLINK;
                connected = 0;
        }

        if (ipv4_is_multicast(daddr)) {
                if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
                        ipc.oif = inet->mc_index;
                if (!saddr)
                        saddr = inet->mc_addr;
                connected = 0;
        } else if (!ipc.oif) {
                ipc.oif = inet->uc_index;
        } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
                /* oif is set, packet is to local broadcast and
                 * uc_index is set. oif is most likely set
                 * by sk_bound_dev_if. If uc_index != oif check if the
                 * oif is an L3 master and uc_index is an L3 slave.
                 * If so, we want to allow the send using the uc_index.
                 */
                if (ipc.oif != inet->uc_index &&
                    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
                                                              inet->uc_index)) {
                        ipc.oif = inet->uc_index;
                }
        }

        /* only a still-"connected" send may use the socket's cached route */
        if (connected)
                rt = (struct rtable *)sk_dst_check(sk, 0);

        if (!rt) {
                struct net *net = sock_net(sk);
                __u8 flow_flags = inet_sk_flowi_flags(sk);

                /* no usable cached route: build the flow on the stack and
                 * do a route lookup
                 */
                fl4 = &fl4_stack;

                flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
                                   RT_SCOPE_UNIVERSE, sk->sk_protocol,
                                   flow_flags,
                                   faddr, saddr, dport, inet->inet_sport,
                                   sk->sk_uid);

                security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
                rt = ip_route_output_flow(net, fl4, sk);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        rt = NULL;
                        if (err == -ENETUNREACH)
                                IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
                        goto out;
                }

                /* broadcast routes require SOCK_BROADCAST on the socket */
                err = -EACCES;
                if ((rt->rt_flags & RTCF_BROADCAST) &&
                    !sock_flag(sk, SOCK_BROADCAST))
                        goto out;
                if (connected)
                        sk_dst_set(sk, dst_clone(&rt->dst));
        }

        if (msg->msg_flags&MSG_CONFIRM)
                goto do_confirm;
back_from_confirm:

        saddr = fl4->saddr;
        if (!ipc.addr)
                daddr = ipc.addr = fl4->daddr;

        /* Lockless fast path for the non-corking case. */
        if (!corkreq) {
                struct inet_cork cork;

                skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
                                  sizeof(struct udphdr), &ipc, &rt,
                                  &cork, msg->msg_flags);
                /* a NULL skb yields err == 0 here and nothing is sent */
                err = PTR_ERR(skb);
                if (!IS_ERR_OR_NULL(skb))
                        err = udp_send_skb(skb, fl4, &cork);
                goto out;
        }

        lock_sock(sk);
        if (unlikely(up->pending)) {
                /* The socket is already corked while preparing it. */
                /* ... which is an evident application bug. --ANK */
                release_sock(sk);

                net_dbg_ratelimited("socket already corked\n");
                err = -EINVAL;
                goto out;
        }
        /*
         *        Now cork the socket to pend data.
         */
        fl4 = &inet->cork.fl.u.ip4;
        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->fl4_dport = dport;
        fl4->fl4_sport = inet->inet_sport;
        up->pending = AF_INET;

do_append_data:
        /* reached with the socket lock held, either from the pending check
         * at the top or by falling through from the corking setup above
         */
        up->len += ulen;
        err = ip_append_data(sk, fl4, getfrag, msg, ulen,
                             sizeof(struct udphdr), &ipc, &rt,
                             corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
        if (err)
                udp_flush_pending_frames(sk);
        else if (!corkreq)
                err = udp_push_pending_frames(sk);
        else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
                up->pending = 0;
        release_sock(sk);

out:
        ip_rt_put(rt);
out_free:
        if (free)
                kfree(ipc.opt);
        if (!err)
                return len;
        /*
         * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
         * ENOBUFS might not be good (it's not tunable per se), but otherwise
         * we don't have a good statistic (IpOutDiscards but it can be too many
         * things).  We could add another new stat but at least for now that
         * seems like overkill.
         */
        if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
                UDP_INC_STATS(sock_net(sk),
                              UDP_MIB_SNDBUFERRORS, is_udplite);
        }
        return err;

do_confirm:
        /* MSG_CONFIRM: with MSG_PROBE the neighbour is confirmed; a
         * zero-length MSG_PROBE send stops here with success and sends
         * nothing, everything else resumes the normal send path.
         */
        if (msg->msg_flags & MSG_PROBE)
                dst_confirm_neigh(&rt->dst, &fl4->daddr);
        if (!(msg->msg_flags&MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
        goto out;
}
EXPORT_SYMBOL(udp_sendmsg);

/*
 * Append one page fragment to the UDP datagram being built on @sk.
 *
 * If nothing is pending yet, the cork is first set up through a
 * zero-length udp_sendmsg() call.  Returns @size on success, a negative
 * errno on failure, or whatever sock_no_sendpage() returns when
 * ip_append_page() reports -EOPNOTSUPP.
 */
int udp_sendpage(struct sock *sk, struct page *page, int offset,
                 size_t size, int flags)
{
        struct udp_sock *up = udp_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
        int ret;

        if (flags & MSG_SENDPAGE_NOTLAST)
                flags |= MSG_MORE;

        if (!up->pending) {
                /* The sendpage interface cannot pass a destination
                 * address, so udp_sendmsg() is called to specify it.
                 * This will succeed only when the socket is connected.
                 */
                struct msghdr msg = { .msg_flags = flags | MSG_MORE };

                ret = udp_sendmsg(sk, &msg, 0);
                if (ret < 0)
                        return ret;
        }

        lock_sock(sk);

        if (unlikely(!up->pending)) {
                release_sock(sk);

                net_dbg_ratelimited("cork failed\n");
                return -EINVAL;
        }

        ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
                             page, offset, size, flags);
        if (ret == -EOPNOTSUPP) {
                /* fall back to the generic copy-based implementation */
                release_sock(sk);
                return sock_no_sendpage(sk->sk_socket, page, offset,
                                        size, flags);
        }

        if (ret >= 0) {
                up->len += size;
                /* push now unless the caller asked for more data to follow */
                if (!READ_ONCE(up->corkflag) && !(flags & MSG_MORE))
                        ret = udp_push_pending_frames(sk);
                if (ret == 0)
                        ret = size;
        } else {
                udp_flush_pending_frames(sk);
        }

        release_sock(sk);
        return ret;
}

/* Flag bit kept in udp_dev_scratch::_tsize_state next to the copy of
 * skb->truesize.  It is set by udp_set_dev_scratch() when
 * udp_try_make_stateless() succeeds, masked out by udp_skb_truesize() and
 * tested by udp_skb_has_head_state().
 */
#define UDP_SKB_IS_STATELESS 0x80000000

/* udp_rcv() clears every head state (dst, sk, nf conntrack) except the
 * skb extensions.
 *
 * Of those, secpath must be preserved when present so that
 * IP_CMSG_PASSSEC can eventually be processed at recvmsg() time.  Any
 * other extension can be cleared.
 *
 * Returns true if the skb had no extensions or they were reset here,
 * false if a secpath forced them to be kept.
 */
static bool udp_try_make_stateless(struct sk_buff *skb)
{
        if (skb_has_extensions(skb)) {
                if (secpath_exists(skb))
                        return false;

                skb_ext_reset(skb);
        }

        return true;
}

/* Fill the per-skb UDP scratch area: a copy of skb->truesize and, on
 * 64-bit builds, cached len / csum_unnecessary / is_linear values.  The
 * UDP_SKB_IS_STATELESS bit is added when the skb could be made stateless.
 */
static void udp_set_dev_scratch(struct sk_buff *skb)
{
        struct udp_dev_scratch *scratch = udp_skb_scratch(skb);

        /* the scratch area must fit in a single long */
        BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));

        scratch->_tsize_state = skb->truesize;
#if BITS_PER_LONG == 64
        scratch->is_linear = !skb_is_nonlinear(skb);
        scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
        scratch->len = skb->len;
#endif
        if (!udp_try_make_stateless(skb))
                return;

        scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}

/* Record in the scratch area that the checksum needs no further check.
 *
 * Callers get here after udp_lib_checksum_complete() returned 0, which
 * means __skb_checksum_complete() might have set skb->csum_valid to 1.
 * On 64bit platforms csum_unnecessary can then be set to true, but only
 * when the skb is not shared.  On other platforms this is a no-op.
 */
static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
{
#if BITS_PER_LONG == 64
        if (skb_shared(skb))
                return;

        udp_skb_scratch(skb)->csum_unnecessary = true;
#endif
}

/* Return the truesize copy stored in the scratch area, with the
 * UDP_SKB_IS_STATELESS flag bit stripped.
 */
static int udp_skb_truesize(struct sk_buff *skb)
{
        const unsigned int tsize_mask = ~UDP_SKB_IS_STATELESS;

        return udp_skb_scratch(skb)->_tsize_state & tsize_mask;
}

/* True unless the skb was marked UDP_SKB_IS_STATELESS in its scratch area. */
static bool udp_skb_has_head_state(struct sk_buff *skb)
{
        bool stateless = udp_skb_scratch(skb)->_tsize_state &
                         UDP_SKB_IS_STATELESS;

        return !stateless;
}

/* fully reclaim rmem/fwd memory allocated for skb
 *
 * @sk: socket the memory was charged to
 * @size: number of bytes being released
 * @partial: when non-zero the release may be deferred: @size is only added
 *           to up->forward_deficit and the function returns early while
 *           the deficit is below a quarter of sk_rcvbuf and the reader
 *           queue is not empty.  When zero, @size plus any accumulated
 *           deficit is always released.
 * @rx_queue_lock_held: true if the caller already holds
 *           sk_receive_queue.lock, in which case it is not taken here
 */
static void udp_rmem_release(struct sock *sk, unsigned int size,
                             int partial, bool rx_queue_lock_held)
{
        struct udp_sock *up = udp_sk(sk);
        struct sk_buff_head *sk_queue;
        unsigned int amt;

        if (likely(partial)) {
                /* batch small releases in forward_deficit */
                up->forward_deficit += size;
                size = up->forward_deficit;
                if (size < (sk->sk_rcvbuf >> 2) &&
                    !skb_queue_empty(&up->reader_queue))
                        return;
        } else {
                size += up->forward_deficit;
        }
        up->forward_deficit = 0;

        /* acquire the sk_receive_queue lock for fwd allocated memory
         * scheduling, if the caller doesn't hold it already
         */
        sk_queue = &sk->sk_receive_queue;
        if (!rx_queue_lock_held)
                spin_lock(&sk_queue->lock);

        /* amt is the page-aligned part of what is now forward-allocated;
         * it is given back below, the remainder stays in sk_forward_alloc
         */
        amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
        sk->sk_forward_alloc += size - amt;

        if (amt)
                __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);

        atomic_sub(size, &sk->sk_rmem_alloc);

        /* this can save us from acquiring the rx queue lock on next receive */
        skb_queue_splice_tail_init(sk_queue, &up->reader_queue);

        if (!rx_queue_lock_held)
                spin_unlock(&sk_queue->lock);
}

/* Release the receive memory charged for @skb as it leaves the queue.
 *
 * Note: called with reader_queue.lock held.
 * The size comes from the copy of skb->truesize kept in skb->dev_scratch
 * rather than from skb->truesize itself, which avoids a cache line miss
 * while the receive_queue lock is held.  See __udp_enqueue_schedule_skb()
 * for where that copy is made.
 */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
        int truesize;

        prefetch(&skb->data);
        truesize = udp_skb_truesize(skb);
        udp_rmem_release(sk, truesize, 1, false);
}
EXPORT_SYMBOL(udp_skb_destructor);

/* Same job as udp_skb_destructor(), for callers that already hold the rx
 * queue lock as well: udp_rmem_release() is told not to take it again.
 */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
        int truesize;

        prefetch(&skb->data);
        truesize = udp_skb_truesize(skb);
        udp_rmem_release(sk, truesize, 1, true);
}

/* The idea of busylocks is to let producers grab an extra spinlock
 * to relieve pressure on the receive_queue spinlock shared with the consumer.
 * Under flood, this means that only one producer can be in line
 * trying to acquire the receive_queue spinlock.
 * These busylocks can be allocated in a per-cpu manner, instead of a
 * per-socket one (that would consume a cache line per socket).
 *
 * udp_busylocks is the lock array; udp_busylocks_log is the number of
 * hash bits passed to hash_ptr() when busylock_acquire() picks an entry.
 */
static int udp_busylocks_log __read_mostly;
static spinlock_t *udp_busylocks __read_mostly;

/* Pick the busylock that @ptr hashes to, take it and return it so the
 * caller can later pass it to busylock_release().
 */
static spinlock_t *busylock_acquire(void *ptr)
{
        spinlock_t *busy = &udp_busylocks[hash_ptr(ptr, udp_busylocks_log)];

        spin_lock(busy);
        return busy;
}

/* Drop a busylock taken by busylock_acquire(); a NULL @busy means no lock
 * was taken and nothing is done.
 */
static void busylock_release(spinlock_t *busy)
{
        if (!busy)
                return;

        spin_unlock(busy);
}

/* Charge @skb to @sk's receive memory and append it to sk_receive_queue.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (receive buffer full)
 * or -ENOBUFS (__sk_mem_raise_allocated() refused the charge) after
 * incrementing sk_drops; the skb is not freed here.
 */
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff_head *list = &sk->sk_receive_queue;
        int rmem, delta, amt, err = -ENOMEM;
        spinlock_t *busy = NULL;
        int size;

        /* try to avoid the costly atomic add/sub pair when the receive
         * queue is full; always allow at least a packet
         */
        rmem = atomic_read(&sk->sk_rmem_alloc);
        if (rmem > sk->sk_rcvbuf)
                goto drop;

        /* Under mem pressure, it might be helpful to help udp_recvmsg()
         * having linear skbs :
         * - Reduce memory overhead and thus increase receive queue capacity
         * - Less cache line misses at copyout() time
         * - Less work at consume_skb() (less alien page frag freeing)
         */
        if (rmem > (sk->sk_rcvbuf >> 1)) {
                skb_condense(skb);

                /* more than half full: serialize producers on a busylock */
                busy = busylock_acquire(sk);
        }
        /* sample truesize after the possible skb_condense() above */
        size = skb->truesize;
        udp_set_dev_scratch(skb);

        /* we drop only if the receive buf is full and the receive
         * queue contains some other skb
         */
        rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
        if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
                goto uncharge_drop;

        spin_lock(&list->lock);
        if (size >= sk->sk_forward_alloc) {
                /* not enough forward-allocated memory: raise the allocation
                 * by sk_mem_pages(size) quanta first
                 */
                amt = sk_mem_pages(size);
                delta = amt << SK_MEM_QUANTUM_SHIFT;
                if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
                        err = -ENOBUFS;
                        spin_unlock(&list->lock);
                        goto uncharge_drop;
                }

                sk->sk_forward_alloc += delta;
        }

        sk->sk_forward_alloc -= size;

        /* no need to setup a destructor, we will explicitly release the
         * forward allocated memory on dequeue
         */
        sock_skb_set_dropcount(sk, skb);

        __skb_queue_tail(list, skb);
        spin_unlock(&list->lock);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);

        busylock_release(busy);
        return 0;

uncharge_drop:
        /* undo the atomic_add_return() charge made above */
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);

drop:
        atomic_inc(&sk->sk_drops);
        busylock_release(busy);
        return err;
}
EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);

/* Reclaim completely the forward allocated memory: move whatever is left
 * on sk_receive_queue to the reader queue, free every skb found there and
 * release the summed truesize in one udp_rmem_release() call.
 */
void udp_destruct_common(struct sock *sk)
{
        struct udp_sock *up = udp_sk(sk);
        struct sk_buff_head *rq = &up->reader_queue;
        unsigned int total = 0;
        struct sk_buff *skb;

        skb_queue_splice_tail_init(&sk->sk_receive_queue, rq);
        for (skb = __skb_dequeue(rq); skb; skb = __skb_dequeue(rq)) {
                total += skb->truesize;
                kfree_skb(skb);
        }
        udp_rmem_release(sk, total, 0, true);
}
EXPORT_SYMBOL_GPL(udp_destruct_common);

/* Socket destructor installed by udp_init_sock(): reclaim the UDP receive
 * memory first, then run the generic inet destructor.
 */
static void udp_destruct_sock(struct sock *sk)
{
        udp_destruct_common(sk);
        inet_sock_destruct(sk);
}

/* Per-socket UDP initialisation: set up the reader_queue list head and
 * install udp_destruct_sock() as the socket destructor.  Always returns 0.
 */
int udp_init_sock(struct sock *sk)
{
        skb_queue_head_init(&udp_sk(sk)->reader_queue);
        sk->sk_destruct = udp_destruct_sock;
        return 0;
}

/* Drop a reference on a received UDP skb and free it once unreferenced.
 *
 * When a peek offset is active on the socket (sk_peek_off >= 0) it is
 * first moved back by @len under the fast socket lock.
 */
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{
        if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
                bool slow = lock_sock_fast(sk);

                sk_peek_offset_bwd(sk, len);
                unlock_sock_fast(sk, slow);
        }

        if (skb_unref(skb)) {
                /* In the more common cases the head states were cleared
                 * previously, see __udp_queue_rcv_skb().
                 */
                if (unlikely(udp_skb_has_head_state(skb)))
                        skb_release_head_state(skb);
                __consume_stateless_skb(skb);
        }
}
EXPORT_SYMBOL_GPL(skb_consume_udp);

/* Return the first skb on @rcvq that passes checksum validation, or NULL
 * if the queue runs empty.  Every skb failing the check is counted as a
 * checksum/input error and a drop, unlinked and freed; its truesize is
 * added to *total so the caller can release the memory.
 */
static struct sk_buff *__first_packet_length(struct sock *sk,
                                             struct sk_buff_head *rcvq,
                                             unsigned int *total)
{
        struct sk_buff *skb;

        for (;;) {
                skb = skb_peek(rcvq);
                if (!skb)
                        break;

                if (!udp_lib_checksum_complete(skb)) {
                        /* good packet: remember that and stop here */
                        udp_skb_csum_unnecessary_set(skb);
                        break;
                }

                __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
                                IS_UDPLITE(sk));
                __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
                                IS_UDPLITE(sk));
                atomic_inc(&sk->sk_drops);
                __skb_unlink(skb, rcvq);
                *total += skb->truesize;
                kfree_skb(skb);
        }

        return skb;
}

/**
 *        first_packet_length        - return length of first packet in receive queue
 *        @sk: socket
 *
 *        Drops all bad checksum frames, until a valid one is found.
 *        The reader queue is searched first; if it holds no valid frame,
 *        the socket receive queue is spliced onto it and the search is
 *        repeated.
 *        Returns the length of found skb, or -1 if none is found.
 */
static int first_packet_length(struct sock *sk)
{
        struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
        struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
        unsigned int total = 0;
        struct sk_buff *skb;
        int res = -1;

        spin_lock_bh(&rcvq->lock);
        skb = __first_packet_length(sk, rcvq, &total);
        if (!skb && !skb_queue_empty_lockless(sk_queue)) {
                spin_lock(&sk_queue->lock);
                skb_queue_splice_tail_init(sk_queue, rcvq);
                spin_unlock(&sk_queue->lock);

                skb = __first_packet_length(sk, rcvq, &total);
        }
        if (skb)
                res = skb->len;
        /* give back the memory of any frames dropped above */
        if (total)
                udp_rmem_release(sk, total, 1, false);
        spin_unlock_bh(&rcvq->lock);
        return res;
}

/*
 *        IOCTL requests applicable to the UDP protocol
 *
 *        SIOCOUTQ stores sk_wmem_alloc_get(sk) in the user int at @arg.
 *        SIOCINQ stores the length of the first valid packet in the
 *        receive queue, or 0 when there is none.
 *
 *        Returns the put_user() result for those two commands and
 *        -ENOIOCTLCMD for anything else.
 */

int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        int amount;

        switch (cmd) {
        case SIOCOUTQ:
                amount = sk_wmem_alloc_get(sk);
                break;

        case SIOCINQ:
                /* first_packet_length() returns -1 for an empty queue */
                amount = max_t(int, 0, first_packet_length(sk));
                break;

        default:
                return -ENOIOCTLCMD;
        }

        return put_user(amount, (int __user *)arg);
}
EXPORT_SYMBOL(udp_ioctl);

/* Fetch the next datagram for a UDP receive call.
 *
 * The reader queue is tried first under its own lock.  If that yields
 * nothing and sk_receive_queue is not empty, the latter is spliced onto
 * the reader queue (both locks held) and the lookup is repeated.  Unless
 * MSG_PEEK is set, the skb's receive memory is released before returning.
 *
 * Returns the skb, or NULL with *err set to the reason (a pending socket
 * error, -EAGAIN, or whatever __skb_wait_for_more_packets() reported).
 */
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
                               int noblock, int *off, int *err)
{
        struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
        struct sk_buff_head *queue;
        struct sk_buff *last;
        long timeo;
        int error;

        queue = &udp_sk(sk)->reader_queue;
        flags |= noblock ? MSG_DONTWAIT : 0;
        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
        do {
                struct sk_buff *skb;

                error = sock_error(sk);
                if (error)
                        break;

                error = -EAGAIN;
                do {
                        spin_lock_bh(&queue->lock);
                        skb = __skb_try_recv_from_queue(sk, queue, flags, off,
                                                        err, &last);
                        if (skb) {
                                /* only reader_queue.lock is held here */
                                if (!(flags & MSG_PEEK))
                                        udp_skb_destructor(sk, skb);
                                spin_unlock_bh(&queue->lock);
                                return skb;
                        }

                        if (skb_queue_empty_lockless(sk_queue)) {
                                spin_unlock_bh(&queue->lock);
                                goto busy_check;
                        }

                        /* refill the reader queue and walk it again
                         * keep both queues locked to avoid re-acquiring
                         * the sk_receive_queue lock if fwd memory scheduling
                         * is needed.
                         */
                        spin_lock(&sk_queue->lock);
                        skb_queue_splice_tail_init(sk_queue, queue);

                        skb = __skb_try_recv_from_queue(sk, queue, flags, off,
                                                        err, &last);
                        /* both locks held: use the _locked destructor */
                        if (skb && !(flags & MSG_PEEK))
                                udp_skb_dtor_locked(sk, skb);
                        spin_unlock(&sk_queue->lock);
                        spin_unlock_bh(&queue->lock);
                        if (skb)
                                return skb;

busy_check:
                        if (!sk_can_busy_loop(sk))
                                break;

                        sk_busy_loop(sk, flags & MSG_DONTWAIT);
                } while (!skb_queue_empty_lockless(sk_queue));

                /* sk_queue is empty, reader_queue may contain peeked packets */
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &error, &timeo,
                                              (struct sk_buff *)sk_queue));

        *err = error;
        return NULL;
}
EXPORT_SYMBOL(__skb_recv_udp);

/*
 *         This should be easy, if there is something there we
 *         return it, otherwise we block.
 *
 *         Receive one UDP datagram into @msg.  MSG_ERRQUEUE requests are
 *         passed to ip_recv_error().  Returns the number of bytes copied
 *         (or the full datagram length when MSG_TRUNC is set in @flags),
 *         or a negative errno.  A datagram that fails checksum validation
 *         is dropped and the receive is retried with the next one.
 */

int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
                int flags, int *addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
        struct sk_buff *skb;
        unsigned int ulen, copied;
        int off, err, peeking = flags & MSG_PEEK;
        int is_udplite = IS_UDPLITE(sk);
        bool checksum_valid = false;

        if (flags & MSG_ERRQUEUE)
                return ip_recv_error(sk, msg, len, addr_len);

try_again:
        off = sk_peek_offset(sk, flags);
        skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
        if (!skb)
                return err;

        /* clamp the copy to what is left past the peek offset; flag
         * truncation when the user buffer is shorter than the datagram
         */
        ulen = udp_skb_len(skb);
        copied = len;
        if (copied > ulen - off)
                copied = ulen - off;
        else if (copied < ulen)
                msg->msg_flags |= MSG_TRUNC;

        /*
         * If checksum is needed at all, try to do it while copying the
         * data.  If the data is truncated, or if we only want a partial
         * coverage checksum (UDP-Lite), do it before the copy.
         */

        if (copied < ulen || peeking ||
            (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
                checksum_valid = udp_skb_csum_unnecessary(skb) ||
                                !__udp_lib_checksum_complete(skb);
                if (!checksum_valid)
                        goto csum_copy_err;
        }

        if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
                /* checksum already known good: plain copy */
                if (udp_skb_is_linear(skb))
                        err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
                else
                        err = skb_copy_datagram_msg(skb, off, msg, copied);
        } else {
                /* verify the checksum while copying */
                err = skb_copy_and_csum_datagram_msg(skb, off, msg);

                /* -EINVAL is handled as a checksum failure */
                if (err == -EINVAL)
                        goto csum_copy_err;
        }

        if (unlikely(err)) {
                if (!peeking) {
                        atomic_inc(&sk->sk_drops);
                        UDP_INC_STATS(sock_net(sk),
                                      UDP_MIB_INERRORS, is_udplite);
                }
                kfree_skb(skb);
                return err;
        }

        if (!peeking)
                UDP_INC_STATS(sock_net(sk),
                              UDP_MIB_INDATAGRAMS, is_udplite);

        sock_recv_ts_and_drops(msg, sk, skb);

        /* Copy the address. */
        if (sin) {
                sin->sin_family = AF_INET;
                sin->sin_port = udp_hdr(skb)->source;
                sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
                memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                *addr_len = sizeof(*sin);

                if (cgroup_bpf_enabled)
                        BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
                                                        (struct sockaddr *)sin);
        }

        if (udp_sk(sk)->gro_enabled)
                udp_cmsg_recv(msg, sk, skb);

        if (inet->cmsg_flags)
                ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);

        /* with MSG_TRUNC in flags the full datagram length is reported */
        err = copied;
        if (flags & MSG_TRUNC)
                err = ulen;

        /* when peeking, the negated length is passed on */
        skb_consume_udp(sk, skb, peeking ? -err : err);
        return err;

csum_copy_err:
        /* the error stats are bumped only when the drop helper returns 0 */
        if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
                                 udp_skb_destructor)) {
                UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
                UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        }
        kfree_skb(skb);

        /* starting over for a new packet, but check if we need to yield */
        cond_resched();
        msg->msg_flags &= ~MSG_TRUNC;
        goto try_again;
}

/* Pre-connect hook: run the cgroup BPF INET4 connect program on @uaddr.
 *
 * Returns -EINVAL when @addr_len is shorter than a sockaddr_in, otherwise
 * the result of the BPF hook.
 */
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        /* The length check is replicated from __ip4_datagram_connect() and
         * is intended to keep the BPF program called here from accessing
         * bytes beyond the bound the user specified in addr_len.
         */
        if (addr_len >= sizeof(struct sockaddr_in))
                return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);

        return -EINVAL;
}
EXPORT_SYMBOL(udp_pre_connect);

/* Break the association of a UDP socket with its peer.
 *
 * Resets the state to TCP_CLOSE, clears the destination address/port, the
 * rx hash, the bound device and the cached route.  The source address and
 * source port are only reset when the matching SOCK_BINDADDR_LOCK /
 * SOCK_BINDPORT_LOCK user lock is not set.  @flags is not used here.
 * Always returns 0.
 */
int __udp_disconnect(struct sock *sk, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        /*
         *        1003.1g - break association.
         */

        sk->sk_state = TCP_CLOSE;
        inet->inet_daddr = 0;
        inet->inet_dport = 0;
        sock_rps_reset_rxhash(sk);
        sk->sk_bound_dev_if = 0;
        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
                inet_reset_saddr(sk);
                /* address changed but port stays bound: rehash if the
                 * protocol provides a rehash operation
                 */
                if (sk->sk_prot->rehash &&
                    (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
                        sk->sk_prot->rehash(sk);
        }

        if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
                sk->sk_prot->unhash(sk);
                inet->inet_sport = 0;
        }
        sk_dst_reset(sk);
        return 0;
}
EXPORT_SYMBOL(__udp_disconnect);

/*
 * Locked wrapper around __udp_disconnect(): takes the socket lock, breaks
 * the association and releases the lock again.  Always returns 0; the
 * helper's return value is not propagated.
 */
int udp_disconnect(struct sock *sk, int flags)
{
        lock_sock(sk);
        __udp_disconnect(sk, flags);
        release_sock(sk);
        return 0;
}
EXPORT_SYMBOL(udp_disconnect);

/*
 * Take @sk out of the UDP hash table of its protocol.
 *
 * Nothing is done for a socket that is not hashed.  Otherwise, under the
 * primary (port) slot lock, the socket is detached from its reuseport
 * group if it has one and removed from the primary chain.  Only when that
 * removal actually happened are the slot counter, inet_num and the
 * protocol in-use counter adjusted and the socket unlinked from the
 * secondary (port + address) chain, under that slot's own lock.
 */
void udp_lib_unhash(struct sock *sk)
{
        struct udp_hslot *port_slot, *portaddr_slot;
        struct udp_table *table;

        if (!sk_hashed(sk))
                return;

        table = sk->sk_prot->h.udp_table;
        port_slot = udp_hashslot(table, sock_net(sk),
                                 udp_sk(sk)->udp_port_hash);
        portaddr_slot = udp_hashslot2(table, udp_sk(sk)->udp_portaddr_hash);

        spin_lock_bh(&port_slot->lock);

        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_detach_sock(sk);

        if (sk_del_node_init_rcu(sk)) {
                port_slot->count--;
                inet_sk(sk)->inet_num = 0;
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);

                /* secondary chain lock nests inside the primary one */
                spin_lock(&portaddr_slot->lock);
                hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
                portaddr_slot->count--;
                spin_unlock(&portaddr_slot->lock);
        }

        spin_unlock_bh(&port_slot->lock);
}
EXPORT_SYMBOL(udp_lib_unhash);

/*
 * inet_rcv_saddr was changed, so the socket has to move to the secondary
 * (port + address) hash chain selected by @newhash.
 *
 * The new hash is always recorded in the socket.  The chains are only
 * touched when the secondary slot really changes or the socket belongs to
 * a reuseport group; in the latter case the socket is detached from that
 * group.  All list manipulation happens under the primary slot lock.
 */
void udp_lib_rehash(struct sock *sk, u16 newhash)
{
        struct udp_hslot *port_slot, *old_slot2, *new_slot2;
        struct udp_table *table;

        if (!sk_hashed(sk))
                return;

        table = sk->sk_prot->h.udp_table;
        old_slot2 = udp_hashslot2(table, udp_sk(sk)->udp_portaddr_hash);
        new_slot2 = udp_hashslot2(table, newhash);
        udp_sk(sk)->udp_portaddr_hash = newhash;

        /* Same secondary slot and no reuseport group: nothing to move */
        if (old_slot2 == new_slot2 &&
            !rcu_access_pointer(sk->sk_reuseport_cb))
                return;

        port_slot = udp_hashslot(table, sock_net(sk),
                                 udp_sk(sk)->udp_port_hash);

        /* the primary chain must be locked as well */
        spin_lock_bh(&port_slot->lock);

        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_detach_sock(sk);

        if (old_slot2 != new_slot2) {
                /* unlink from the old secondary chain ... */
                spin_lock(&old_slot2->lock);
                hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
                old_slot2->count--;
                spin_unlock(&old_slot2->lock);

                /* ... and link at the head of the new one */
                spin_lock(&new_slot2->lock);
                hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
                                   &new_slot2->head);
                new_slot2->count++;
                spin_unlock(&new_slot2->lock);
        }

        spin_unlock_bh(&port_slot->lock);
}
EXPORT_SYMBOL(udp_lib_rehash);

void udp_v4_rehash(struct sock *sk)
{
        u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
                                          inet_sk(sk)->inet_rcv_saddr,
                                          inet_sk(sk)->inet_num);
        udp_lib_rehash(sk, new_hash);
}

/*
 * Queue @skb on the receive queue of @sk.
 *
 * Returns 0 when the skb was enqueued.  On failure the skb is freed, the
 * MIB error counters are bumped, the failure tracepoint fires and -1 is
 * returned.
 */
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int is_udplite;
        int rc;

        if (!inet_sk(sk)->inet_daddr) {
                /* no remote address set: only record the NAPI id once */
                sk_mark_napi_id_once(sk, skb);
        } else {
                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                sk_incoming_cpu_update(sk);
        }

        rc = __udp_enqueue_schedule_skb(sk, skb);
        if (rc >= 0)
                return 0;

        is_udplite = IS_UDPLITE(sk);

        /* -ENOMEM is accounted twice: RCVBUFERRORS and INERRORS */
        if (rc == -ENOMEM)
                UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
                              is_udplite);
        UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        kfree_skb(skb);
        trace_udp_fail_queue_rcv_skb(rc, sk);

        return -1;
}

/* Deliver a single (non-GSO) datagram to @sk.
 *
 * Steps, in order: XFRM input policy check, conntrack reset, optional
 * hand-off to the socket's encapsulation receive hook, UDP-Lite coverage
 * checks, socket filter, header pull, pktinfo preparation and finally
 * queueing through __udp_queue_rcv_skb().
 *
 * returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
        struct udp_sock *up = udp_sk(sk);
        int is_udplite = IS_UDPLITE(sk);

        /*
         *        Charge it to the socket, dropping if the queue is full.
         */
        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto drop;
        nf_reset_ct(skb);

        if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
                int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);

                /*
                 * This is an encapsulation socket so pass the skb to
                 * the socket's udp_encap_rcv() hook. Otherwise, just
                 * fall through and pass this up the UDP socket.
                 * up->encap_rcv() returns the following value:
                 * =0 if skb was successfully passed to the encap
                 *    handler or was discarded by it.
                 * >0 if skb should be passed on to UDP.
                 * <0 if skb should be resubmitted as proto -N
                 */

                /* Read the hook pointer exactly once; it is written with
                 * WRITE_ONCE() by udp_lib_setsockopt().
                 */
                encap_rcv = READ_ONCE(up->encap_rcv);
                if (encap_rcv) {
                        int ret;

                        /* Verify checksum before giving to encap */
                        if (udp_lib_checksum_complete(skb))
                                goto csum_error;

                        ret = encap_rcv(sk, skb);
                        if (ret <= 0) {
                                /* Handled (0) or to be resubmitted (<0):
                                 * count it as delivered and hand the
                                 * negated value (0 or the protocol number)
                                 * back to the caller.
                                 */
                                __UDP_INC_STATS(sock_net(sk),
                                                UDP_MIB_INDATAGRAMS,
                                                is_udplite);
                                return -ret;
                        }
                }

                /* FALLTHROUGH -- it's a UDP Packet */
        }

        /*
         *         UDP-Lite specific tests, ignored on UDP sockets
         */
        if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {

                /*
                 * MIB statistics other than incrementing the error count are
                 * disabled for the following two types of errors: these depend
                 * on the application settings, not on the functioning of the
                 * protocol stack as such.
                 *
                 * RFC 3828 here recommends (sec 3.3): "There should also be a
                 * way ... to ... at least let the receiving application block
                 * delivery of packets with coverage values less than a value
                 * provided by the application."
                 */
                if (up->pcrlen == 0) {          /* full coverage was set  */
                        net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
                                            UDP_SKB_CB(skb)->cscov, skb->len);
                        goto drop;
                }
                /* The next case involves violating the min. coverage requested
                 * by the receiver. This is subtle: if receiver wants x and x is
                 * greater than the buffersize/MTU then receiver will complain
                 * that it wants x while sender emits packets of smaller size y.
                 * Therefore the above ...()->partial_cov statement is essential.
                 */
                if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
                        net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
                                            UDP_SKB_CB(skb)->cscov, up->pcrlen);
                        goto drop;
                }
        }

        prefetch(&sk->sk_rmem_alloc);
        /* With a socket filter attached the checksum is completed first */
        if (rcu_access_pointer(sk->sk_filter) &&
            udp_lib_checksum_complete(skb))
                        goto csum_error;

        /* Run the filter; it may trim the skb but never below the UDP header */
        if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
                goto drop;

        udp_csum_pull_header(skb);

        ipv4_pktinfo_prepare(sk, skb);
        return __udp_queue_rcv_skb(sk, skb);

csum_error:
        /* checksum failures are counted separately and then as input errors */
        __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
        __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        atomic_inc(&sk->sk_drops);
        kfree_skb(skb);
        return -1;
}

/*
 * Deliver @skb to @sk, segmenting it first when udp_unexpected_gso()
 * reports that the socket was handed a GSO packet it does not expect.
 *
 * In the common case the return value is that of udp_queue_rcv_one_skb().
 * In the segmenting case every element of the list returned by
 * udp_rcv_segment() is delivered individually, encapsulation
 * resubmissions are passed to ip_protocol_deliver_rcu() right here, and
 * 0 is returned.
 */
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff *seg_list, *seg, *following;

        if (likely(!udp_unexpected_gso(sk, skb)))
                return udp_queue_rcv_one_skb(sk, skb);

        BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);

        /* expose the headers back to the MAC header before segmenting */
        __skb_push(skb, -skb_mac_offset(skb));
        seg_list = udp_rcv_segment(sk, skb, true);

        skb_list_walk_safe(seg_list, seg, following) {
                int ret;

                __skb_pull(seg, skb_transport_offset(seg));
                ret = udp_queue_rcv_one_skb(sk, seg);
                if (ret > 0)
                        ip_protocol_deliver_rcu(dev_net(seg->dev), seg, ret);
        }

        return 0;
}

/* Cache @dst as the receive route of @sk.
 *
 * For TCP sockets sk_rx_dst is protected by the socket lock; UDP has no
 * such protection here and relies on xchg() to cope with concurrent
 * updates.
 *
 * Returns true when a reference on @dst could be taken and the cached
 * entry actually changed, false otherwise.
 */
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *prev;

        if (!dst_hold_safe(dst))
                return false;

        /* swap in the new entry and drop the reference held on the old one */
        prev = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst);
        dst_release(prev);

        return prev != dst;
}
EXPORT_SYMBOL(udp_sk_rx_dst_set);

/*
 *        Multicasts and broadcasts go to each listener.
 *
 *        Every socket accepted by __udp_is_mcast_sock() receives the
 *        datagram: the first match is remembered and gets the original
 *        skb at the very end, every further match gets a clone.  When no
 *        socket matches, the skb is freed and UDP_MIB_IGNOREDMULTI is
 *        incremented.  Always returns 0.
 *
 *        Note: called only from the BH handler context.
 */
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
                                    struct udphdr  *uh,
                                    __be32 saddr, __be32 daddr,
                                    struct udp_table *udptable,
                                    int proto)
{
        struct sock *sk, *first = NULL;
        unsigned short hnum = ntohs(uh->dest);
        struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
        /* Switch to the secondary (port + address) hash when the primary
         * port chain holds more than 10 sockets.
         */
        unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
        unsigned int offset = offsetof(typeof(*sk), sk_node);
        int dif = skb->dev->ifindex;
        int sdif = inet_sdif(skb);
        struct hlist_node *node;
        struct sk_buff *nskb;

        if (use_hash2) {
                hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
                            udptable->mask;
                hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
                /* Re-entered from below for the second pass over the
                 * INADDR_ANY chain.
                 */
start_lookup:
                hslot = &udptable->hash2[hash2];
                offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
        }

        sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
                if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
                                         uh->source, saddr, dif, sdif, hnum))
                        continue;

                /* The first listener is served last, with the original skb */
                if (!first) {
                        first = sk;
                        continue;
                }
                nskb = skb_clone(skb, GFP_ATOMIC);

                /* Clone failure is charged to this listener as a drop */
                if (unlikely(!nskb)) {
                        atomic_inc(&sk->sk_drops);
                        __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
                                        IS_UDPLITE(sk));
                        __UDP_INC_STATS(net, UDP_MIB_INERRORS,
                                        IS_UDPLITE(sk));
                        continue;
                }
                /* A positive return is a resubmission request; it is not
                 * honoured on this path and the skb is released instead.
                 */
                if (udp_queue_rcv_skb(sk, nskb) > 0)
                        consume_skb(nskb);
        }

        /* Also lookup *:port if we are using hash2 and haven't done so yet. */
        if (use_hash2 && hash2 != hash2_any) {
                hash2 = hash2_any;
                goto start_lookup;
        }

        if (first) {
                if (udp_queue_rcv_skb(first, skb) > 0)
                        consume_skb(skb);
        } else {
                kfree_skb(skb);
                __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
                                proto == IPPROTO_UDPLITE);
        }
        return 0;
}

/* Prepare checksum validation state for a received datagram.
 *
 * Returns zero on success; CHECKSUM_UNNECESSARY then means that no
 * further checks are required.  Otherwise checksum completion still has
 * to sum the packet body, including the UDP header, and fold the result
 * into skb->csum.  A non-zero return reports a bad checksum.
 */
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
                                 int proto)
{
        int err;

        /* start from "full coverage": the whole datagram is checksummed */
        UDP_SKB_CB(skb)->partial_cov = 0;
        UDP_SKB_CB(skb)->cscov = skb->len;

        if (proto == IPPROTO_UDPLITE) {
                err = udplite_checksum_init(skb, uh);
                if (err)
                        return err;

                /* partial coverage: only seed csum with the pseudo header */
                if (UDP_SKB_CB(skb)->partial_cov) {
                        skb->csum = inet_compute_pseudo(skb, proto);
                        return 0;
                }
        }

        /* Only zero versus non-zero matters here, hence the forced
         * conversion to int.
         */
        err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
                                                        inet_compute_pseudo);
        if (err)
                return err;

        if (skb->ip_summed != CHECKSUM_COMPLETE || skb->csum_valid)
                return 0;

        /* A software-computed checksum that did not validate is bad */
        if (skb->csum_complete_sw)
                return 1;

        /* Hardware claims the checksum is bad; that gets re-validated later.
         * skb->csum no longer holds the full packet checksum, so stop
         * treating it as such.
         */
        skb_checksum_complete_unset(skb);

        return 0;
}

/* Wrapper for udp_queue_rcv_skb() taking care of checksum conversion and
 * of translating the return code into what the IP layer expects.
 */
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
                               struct udphdr *uh)
{
        int ret;

        /* Conversion is attempted only for plain UDP with a non-zero
         * checksum on sockets that asked for it.
         */
        if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
                skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);

        ret = udp_queue_rcv_skb(sk, skb);

        /* A positive value asks for the input to be resubmitted; the IP
         * layer wants that expressed as -protocol.  Everything else is
         * reported as 0.
         */
        return ret > 0 ? -ret : 0;
}

/*
 *        All we need to do is get the socket, and then do a checksum.
 *
 *        Common receive routine for the given @proto (IPPROTO_UDP or
 *        IPPROTO_UDPLITE) using @udptable for socket lookup.  The skb is
 *        always consumed.  Returns 0, or the (negative protocol) value
 *        produced by udp_unicast_rcv_skb() when the datagram has to be
 *        resubmitted.
 */

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
{
        struct sock *sk;
        struct udphdr *uh;
        unsigned short ulen;
        struct rtable *rt = skb_rtable(skb);
        __be32 saddr, daddr;
        struct net *net = dev_net(skb->dev);
        bool refcounted;

        /*
         *  Validate the packet.
         */
        if (!pskb_may_pull(skb, sizeof(struct udphdr)))
                goto drop;                /* No space for header. */

        uh   = udp_hdr(skb);
        ulen = ntohs(uh->len);
        saddr = ip_hdr(skb)->saddr;
        daddr = ip_hdr(skb)->daddr;

        /* Length field claims more data than was received */
        if (ulen > skb->len)
                goto short_packet;

        if (proto == IPPROTO_UDP) {
                /* UDP validates ulen. */
                if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
                        goto short_packet;
                /* re-read the header pointer after the trim */
                uh = udp_hdr(skb);
        }

        if (udp4_csum_init(skb, uh, proto))
                goto csum_error;

        /* A socket may already be attached to the skb (see
         * udp_v4_early_demux(), which sets skb->sk); use it directly.
         */
        sk = skb_steal_sock(skb, &refcounted);
        if (sk) {
                struct dst_entry *dst = skb_dst(skb);
                int ret;

                /* keep the socket's cached receive route in sync */
                if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
                        udp_sk_rx_dst_set(sk, dst);

                ret = udp_unicast_rcv_skb(sk, skb, uh);
                if (refcounted)
                        sock_put(sk);
                return ret;
        }

        if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
                return __udp4_lib_mcast_deliver(net, skb, uh,
                                                saddr, daddr, udptable, proto);

        sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
        if (sk)
                return udp_unicast_rcv_skb(sk, skb, uh);

        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto drop;
        nf_reset_ct(skb);

        /* No socket. Drop packet silently, if checksum is wrong */
        if (udp_lib_checksum_complete(skb))
                goto csum_error;

        __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

        /*
         * Nobody listens on the destination port: the ICMP port
         * unreachable was sent above, now discard the datagram.
         */
        kfree_skb(skb);
        return 0;

short_packet:
        net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
                            proto == IPPROTO_UDPLITE ? "Lite" : "",
                            &saddr, ntohs(uh->source),
                            ulen, skb->len,
                            &daddr, ntohs(uh->dest));
        goto drop;

csum_error:
        /*
         * RFC1122: OK.  Discards the bad packet silently (as far as
         * the network is concerned, anyway) as per 4.1.3.4 (MUST).
         */
        net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
                            proto == IPPROTO_UDPLITE ? "Lite" : "",
                            &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
                            ulen);
        __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
        /* every discarded datagram is counted as an input error */
        __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
        kfree_skb(skb);
        return 0;
}

/* Multicast can only be early-demuxed when exactly one socket matches.
 *
 * Returns that socket, or NULL when there is no match, more than one
 * match, or the port chain holds more than 10 sockets and is therefore
 * not scanned at all.
 */
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
                                                  __be16 loc_port, __be32 loc_addr,
                                                  __be16 rmt_port, __be32 rmt_addr,
                                                  int dif, int sdif)
{
        unsigned short hnum = ntohs(loc_port);
        unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
        struct udp_hslot *hslot = &udp_table.hash[slot];
        struct sock *match = NULL;
        struct sock *sk;

        /* Scanning a long list is not worth it */
        if (hslot->count > 10)
                return NULL;

        sk_for_each_rcu(sk, &hslot->head) {
                if (!__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
                                         rmt_port, rmt_addr, dif, sdif, hnum))
                        continue;

                /* a second match makes the result ambiguous */
                if (match)
                        return NULL;
                match = sk;
        }

        return match;
}

/* For unicast only connected sockets should be early-demuxed, otherwise
 * forwarding setups can break.  The chains here can be long, so just the
 * first socket of the secondary hash chain is tested for an exact match;
 * if it does not match the lookup gives up and returns NULL.
 */
static struct sock *__udp4_lib_demux_lookup(struct net *net,
                                            __be16 loc_port, __be32 loc_addr,
                                            __be16 rmt_port, __be32 rmt_addr,
                                            int dif, int sdif)
{
        unsigned short hnum = ntohs(loc_port);
        unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
        struct udp_hslot *hslot2 = &udp_table.hash2[hash2 & udp_table.mask];
        INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
        const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
        struct sock *found = NULL;
        struct sock *sk;

        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                if (INET_MATCH(net, sk, acookie, ports, dif, sdif))
                        found = sk;
                /* the head of the chain is the only candidate */
                break;
        }

        return found;
}

/*
 * Early demultiplexing for IPv4 UDP: try to find the receiving socket
 * before the route lookup and, if it has a still-valid cached receive
 * route, attach that route to the skb.
 *
 * Multicast packets are looked up with __udp4_lib_mcast_demux_lookup()
 * (after checking that the group is ours), packets addressed to this
 * host with __udp4_lib_demux_lookup(); other packet types are ignored.
 *
 * Returns 0 in every case except for unconnected multicast sockets with a
 * cached route, where the result of ip_mc_validate_source() is returned.
 */
int udp_v4_early_demux(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct in_device *in_dev = NULL;
        const struct iphdr *iph;
        const struct udphdr *uh;
        struct sock *sk = NULL;
        struct dst_entry *dst;
        int dif = skb->dev->ifindex;
        int sdif = inet_sdif(skb);
        int ours;

        /* validate the packet */
        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
                return 0;

        iph = ip_hdr(skb);
        uh = udp_hdr(skb);

        if (skb->pkt_type == PACKET_MULTICAST) {
                in_dev = __in_dev_get_rcu(skb->dev);

                if (!in_dev)
                        return 0;

                /* skip groups this device has not joined */
                ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
                                       iph->protocol);
                if (!ours)
                        return 0;

                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
                                                   uh->source, iph->saddr,
                                                   dif, sdif);
        } else if (skb->pkt_type == PACKET_HOST) {
                sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
                                             uh->source, iph->saddr, dif, sdif);
        }

        /* No candidate, or its refcount already dropped to zero */
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return 0;

        /* The skb now owns the reference taken above; sock_efree is
         * installed as destructor to give it back.
         */
        skb->sk = sk;
        skb->destructor = sock_efree;
        dst = rcu_dereference(sk->sk_rx_dst);

        /* only use the cached route if dst_check() still accepts it */
        if (dst)
                dst = dst_check(dst, 0);
        if (dst) {
                u32 itag = 0;

                /* set noref for now.
                 * any place which wants to hold dst has to call
                 * dst_hold_safe()
                 */
                skb_dst_set_noref(skb, dst);

                /* for unconnected multicast sockets we need to validate
                 * the source on each packet
                 */
                if (!inet_sk(sk)->inet_daddr && in_dev)
                        return ip_mc_validate_source(skb, iph->daddr,
                                                     iph->saddr,
                                                     iph->tos & IPTOS_RT_MASK,
                                                     skb->dev, in_dev, &itag);
        }
        return 0;
}

/* IPv4 UDP receive entry point: runs the shared receive routine against
 * udp_table with the protocol fixed to IPPROTO_UDP.
 */
int udp_rcv(struct sk_buff *skb)
{
        return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}

void udp_destroy_sock(struct sock *sk)
{
        struct udp_sock *up = udp_sk(sk);
        bool slow = lock_sock_fast(sk);

        /* protects from races with udp_abort() */
        sock_set_flag(sk, SOCK_DEAD);
        udp_flush_pending_frames(sk);
        unlock_sock_fast(sk, slow);
        if (static_branch_unlikely(&udp_encap_needed_key)) {
                if (up->encap_type) {
                        void (*encap_destroy)(struct sock *sk);
                        encap_destroy = READ_ONCE(up->encap_destroy);
                        if (encap_destroy)
                                encap_destroy(sk);
                }
                if (up->encap_enabled)
                        static_branch_dec(&udp_encap_needed_key);
        }
}

/*
 *        Socket option code for UDP
 *
 *        udp_lib_setsockopt() handles the SOL_UDP / SOL_UDPLITE options
 *        shared by UDP and UDP-Lite.  The option value is read as an int
 *        from @optval; @optlen shorter than sizeof(int) yields -EINVAL and
 *        a failed copy -EFAULT.  @push_pending_frames is called (under
 *        the socket lock) when UDP_CORK is cleared.
 *
 *        Returns 0 on success, -EINVAL for an out-of-range UDP_SEGMENT
 *        value and -ENOPROTOOPT for unknown options, unknown UDP_ENCAP
 *        types and UDP-Lite options used on a plain UDP socket.
 */
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
                       sockptr_t optval, unsigned int optlen,
                       int (*push_pending_frames)(struct sock *))
{
        struct udp_sock *up = udp_sk(sk);
        int val, valbool;
        int err = 0;
        int is_udplite = IS_UDPLITE(sk);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        switch (optname) {
        case UDP_CORK:
                if (val != 0) {
                        WRITE_ONCE(up->corkflag, 1);
                } else {
                        /* uncorking pushes out whatever was held back */
                        WRITE_ONCE(up->corkflag, 0);
                        lock_sock(sk);
                        push_pending_frames(sk);
                        release_sock(sk);
                }
                break;

        case UDP_ENCAP:
                switch (val) {
                /* Note: with CONFIG_XFRM, val == 0 shares the ESP-in-UDP
                 * labels below and therefore also installs the xfrm
                 * encap_rcv hook before falling through.
                 */
                case 0:
#ifdef CONFIG_XFRM
                case UDP_ENCAP_ESPINUDP:
                case UDP_ENCAP_ESPINUDP_NON_IKE:
#if IS_ENABLED(CONFIG_IPV6)
                        if (sk->sk_family == AF_INET6)
                                WRITE_ONCE(up->encap_rcv,
                                           ipv6_stub->xfrm6_udp_encap_rcv);
                        else
#endif
                                WRITE_ONCE(up->encap_rcv,
                                           xfrm4_udp_encap_rcv);
#endif
                        fallthrough;
                case UDP_ENCAP_L2TPINUDP:
                        /* NOTE(review): plain store done before lock_sock();
                         * udp_queue_rcv_one_skb() reads up->encap_type
                         * without the socket lock - confirm whether this
                         * wants WRITE_ONCE()/READ_ONCE() annotations.
                         */
                        up->encap_type = val;
                        lock_sock(sk);
                        udp_tunnel_encap_enable(sk->sk_socket);
                        release_sock(sk);
                        break;
                default:
                        err = -ENOPROTOOPT;
                        break;
                }
                break;

        case UDP_NO_CHECK6_TX:
                up->no_check6_tx = valbool;
                break;

        case UDP_NO_CHECK6_RX:
                up->no_check6_rx = valbool;
                break;

        case UDP_SEGMENT:
                /* the segment size must fit in 16 bits */
                if (val < 0 || val > USHRT_MAX)
                        return -EINVAL;
                WRITE_ONCE(up->gso_size, val);
                break;

        case UDP_GRO:
                lock_sock(sk);

                /* when enabling GRO, accept the related GSO packet type */
                if (valbool)
                        udp_tunnel_encap_enable(sk->sk_socket);
                up->gro_enabled = valbool;
                up->accept_udp_l4 = valbool;
                release_sock(sk);
                break;

        /*
         *         UDP-Lite's partial checksum coverage (RFC 3828).
         */
        /* The sender sets actual checksum coverage length via this option.
         * The case coverage > packet length is handled by send module. */
        case UDPLITE_SEND_CSCOV:
                if (!is_udplite)         /* Disable the option on UDP sockets */
                        return -ENOPROTOOPT;
                if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
                        val = 8;
                else if (val > USHRT_MAX)
                        val = USHRT_MAX;
                up->pcslen = val;
                up->pcflag |= UDPLITE_SEND_CC;
                break;

        /* The receiver specifies a minimum checksum coverage value. To make
         * sense, this should be set to at least 8 (as done below). If zero is
         * used, this again means full checksum coverage.                     */
        case UDPLITE_RECV_CSCOV:
                if (!is_udplite)         /* Disable the option on UDP sockets */
                        return -ENOPROTOOPT;
                if (val != 0 && val < 8) /* Avoid silly minimal values.       */
                        val = 8;
                else if (val > USHRT_MAX)
                        val = USHRT_MAX;
                up->pcrlen = val;
                up->pcflag |= UDPLITE_RECV_CC;
                break;

        default:
                err = -ENOPROTOOPT;
                break;
        }

        return err;
}
EXPORT_SYMBOL(udp_lib_setsockopt);

/*
 * setsockopt() entry point for IPv4 UDP: SOL_UDP and SOL_UDPLITE options
 * are handled by udp_lib_setsockopt(), every other level is forwarded to
 * ip_setsockopt().
 */
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen)
{
        if (level != SOL_UDP && level != SOL_UDPLITE)
                return ip_setsockopt(sk, level, optname, optval, optlen);

        return udp_lib_setsockopt(sk, level, optname, optval, optlen,
                                  udp_push_pending_frames);
}

/*
 * Read one of the SOL_UDP / SOL_UDPLITE options shared by UDP and
 * UDP-Lite.
 *
 * The user supplied length is read from @optlen, rejected with -EINVAL
 * when negative and capped at sizeof(int).  The capped length is written
 * back to @optlen and that many bytes of the value are copied to @optval.
 *
 * Returns 0 on success, -EFAULT when user memory cannot be accessed and
 * -ENOPROTOOPT for an unknown option.
 */
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
                       char __user *optval, int __user *optlen)
{
        struct udp_sock *up = udp_sk(sk);
        int len;
        int val;

        if (get_user(len, optlen))
                return -EFAULT;

        /* must be tested before the unsigned clamp below */
        if (len < 0)
                return -EINVAL;

        len = min_t(unsigned int, len, sizeof(int));

        switch (optname) {
        case UDP_CORK:
                val = READ_ONCE(up->corkflag);
                break;

        case UDP_ENCAP:
                val = up->encap_type;
                break;

        case UDP_NO_CHECK6_TX:
                val = up->no_check6_tx;
                break;

        case UDP_NO_CHECK6_RX:
                val = up->no_check6_rx;
                break;

        case UDP_SEGMENT:
                val = READ_ONCE(up->gso_size);
                break;

        case UDP_GRO:
                val = up->gro_enabled;
                break;

        /* The two coverage values cannot be changed on UDP sockets, so
         * there they always read as 0 (which corresponds to the full
         * checksum coverage of UDP).
         */
        case UDPLITE_SEND_CSCOV:
                val = up->pcslen;
                break;

        case UDPLITE_RECV_CSCOV:
                val = up->pcrlen;
                break;

        default:
                return -ENOPROTOOPT;
        }

        /* report the length first, then the value itself */
        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL(udp_lib_getsockopt);

/*
 * getsockopt() entry point for IPv4 UDP: SOL_UDP and SOL_UDPLITE options
 * are handled by udp_lib_getsockopt(), every other level is forwarded to
 * ip_getsockopt().
 */
int udp_getsockopt(struct sock *sk, int level, int optname,
                   char __user *optval, int __user *optlen)
{
        if (level != SOL_UDP && level != SOL_UDPLITE)
                return ip_getsockopt(sk, level, optname, optval, optlen);

        return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}

/**
 *        udp_poll - wait for a UDP event.
 *        @file: file struct
 *        @sock: socket
 *        @wait: poll table
 *
 *        This is the same as datagram poll, except for the special case
 *        of blocking sockets.  If an application uses a blocking fd and a
 *        packet with a checksum error sits in the queue, select() could
 *        report data as available and the subsequent read would then
 *        block.  The extra check below works around these arguably broken
 *        applications.
 *
 *        Returns the poll mask.
 */
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        __poll_t mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;

        /* data parked on the reader queue is readable as well */
        if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
                mask |= EPOLLIN | EPOLLRDNORM;

        /* The false-positive check only concerns blocking descriptors
         * that look readable and were not shut down for receiving.
         */
        if (!(mask & EPOLLRDNORM))
                return mask;
        if (file->f_flags & O_NONBLOCK)
                return mask;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                return mask;

        /* Check for false positives due to checksum errors */
        if (first_packet_length(sk) == -1)
                mask &= ~(EPOLLIN | EPOLLRDNORM);

        return mask;
}
EXPORT_SYMBOL(udp_poll);

/*
 * Abort a UDP socket: record @err on the socket, notify via the error
 * report callback and disconnect it.  A socket already marked SOCK_DEAD
 * is left alone.  Always returns 0.
 */
int udp_abort(struct sock *sk, int err)
{
        lock_sock(sk);

        /* udp{v6}_destroy_sock() sets SOCK_DEAD under the sk lock; testing
         * it here avoids racing with close().
         */
        if (!sock_flag(sk, SOCK_DEAD)) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
                __udp_disconnect(sk, 0);
        }

        release_sock(sk);

        return 0;
}
EXPORT_SYMBOL_GPL(udp_abort);

/*
 * Protocol descriptor for UDP sockets (named "UDP").  It wires the generic
 * socket operations to the UDP implementations in this file, points memory
 * accounting at udp_memory_allocated / sysctl_udp_mem, and hashes sockets
 * in the global udp_table.  udp_abort() is installed as the diag_destroy
 * handler.
 */
struct proto udp_prot = {
        .name                        = "UDP",
        .owner                        = THIS_MODULE,
        .close                        = udp_lib_close,
        .pre_connect                = udp_pre_connect,
        .connect                = ip4_datagram_connect,
        .disconnect                = udp_disconnect,
        .ioctl                        = udp_ioctl,
        .init                        = udp_init_sock,
        .destroy                = udp_destroy_sock,
        .setsockopt                = udp_setsockopt,
        .getsockopt                = udp_getsockopt,
        .sendmsg                = udp_sendmsg,
        .recvmsg                = udp_recvmsg,
        .sendpage                = udp_sendpage,
        .release_cb                = ip4_datagram_release_cb,
        .hash                        = udp_lib_hash,
        .unhash                        = udp_lib_unhash,
        .rehash                        = udp_v4_rehash,
        .get_port                = udp_v4_get_port,
        .memory_allocated        = &udp_memory_allocated,
        .sysctl_mem                = sysctl_udp_mem,
        /* per-netns minimums are located by offset into struct net */
        .sysctl_wmem_offset        = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
        .sysctl_rmem_offset        = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
        .obj_size                = sizeof(struct udp_sock),
        .h.udp_table                = &udp_table,
        .diag_destroy                = udp_abort,
};
EXPORT_SYMBOL(udp_prot);

/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS

/*
 * Find the first socket to report, scanning hash buckets from @start up to
 * and including the table mask.  A socket qualifies when it belongs to the
 * seq_file's network namespace and its family matches the iterator's afinfo
 * (an afinfo family of AF_UNSPEC matches every socket family).
 *
 * The afinfo comes from state->bpf_seq_afinfo when that is set, otherwise
 * from the proc entry data of the file.
 *
 * On success the socket is returned with its bucket lock still held (taken
 * with spin_lock_bh()) and state->bucket identifying that bucket.  If no
 * socket qualifies, NULL is returned and no bucket lock is held.
 */
static struct sock *udp_get_first(struct seq_file *seq, int start)
{
        struct udp_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);
        struct udp_seq_afinfo *afinfo;
        struct sock *sk;

        afinfo = state->bpf_seq_afinfo;
        if (!afinfo)
                afinfo = PDE_DATA(file_inode(seq->file));

        state->bucket = start;
        while (state->bucket <= afinfo->udp_table->mask) {
                struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];

                /* Empty buckets are skipped without taking their lock. */
                if (!hlist_empty(&hslot->head)) {
                        spin_lock_bh(&hslot->lock);
                        sk_for_each(sk, &hslot->head) {
                                if (net_eq(sock_net(sk), net) &&
                                    (afinfo->family == AF_UNSPEC ||
                                     sk->sk_family == afinfo->family))
                                        return sk; /* bucket lock stays held */
                        }
                        spin_unlock_bh(&hslot->lock);
                }
                ++state->bucket;
        }

        return NULL;
}

/*
 * Advance from @sk to the next qualifying socket (same network namespace
 * as the seq_file, family matching afinfo or afinfo family AF_UNSPEC).
 *
 * The rest of the current bucket's chain is searched first.  When the chain
 * is exhausted, the current bucket's lock is dropped (if state->bucket is a
 * valid index) and the search continues with udp_get_first() from the
 * following bucket.
 */
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
        struct udp_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);
        struct udp_seq_afinfo *afinfo;

        afinfo = state->bpf_seq_afinfo;
        if (!afinfo)
                afinfo = PDE_DATA(file_inode(seq->file));

        for (sk = sk_next(sk); sk; sk = sk_next(sk)) {
                if (!net_eq(sock_net(sk), net))
                        continue;
                if (afinfo->family == AF_UNSPEC ||
                    sk->sk_family == afinfo->family)
                        return sk;
        }

        /* Chain exhausted: release this bucket and move on to the next. */
        if (state->bucket <= afinfo->udp_table->mask)
                spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);

        return udp_get_first(seq, state->bucket + 1);
}

/*
 * Return the qualifying socket at zero-based position @pos, walking from
 * the first bucket, or NULL if fewer than @pos + 1 sockets qualify.
 */
static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
{
        struct sock *sk = udp_get_first(seq, 0);

        /* Step forward once per remaining position, stopping at the end. */
        while (sk && pos) {
                sk = udp_get_next(seq, sk);
                if (sk)
                        --pos;
        }

        return pos ? NULL : sk;
}

/*
 * seq_file ->start: position 0 yields SEQ_START_TOKEN (the header record);
 * any later position N yields the (N - 1)th qualifying socket, or NULL when
 * there is none.
 */
void *udp_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct udp_iter_state *state = seq->private;

        /*
         * NOTE(review): MAX_UDP_PORTS is presumably larger than any valid
         * bucket index, so that udp_seq_stop() does not unlock anything
         * when no bucket has been locked yet -- confirm against the
         * definition.
         */
        state->bucket = MAX_UDP_PORTS;

        if (!*pos)
                return SEQ_START_TOKEN;

        return udp_get_idx(seq, *pos - 1);
}
EXPORT_SYMBOL(udp_seq_start);

/*
 * seq_file ->next: after the header token return the first qualifying
 * socket, otherwise the socket following @v.  *pos is always incremented.
 */
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct sock *sk;

        sk = (v == SEQ_START_TOKEN) ? udp_get_idx(seq, 0)
                                    : udp_get_next(seq, v);
        *pos += 1;

        return sk;
}
EXPORT_SYMBOL(udp_seq_next);

/*
 * seq_file ->stop: if state->bucket is a valid bucket index, release that
 * bucket's lock (the one left held by udp_get_first()/udp_get_next()).
 */
void udp_seq_stop(struct seq_file *seq, void *v)
{
        struct udp_iter_state *state = seq->private;
        struct udp_seq_afinfo *afinfo;

        afinfo = state->bpf_seq_afinfo;
        if (!afinfo)
                afinfo = PDE_DATA(file_inode(seq->file));

        /* An out-of-range bucket means no bucket lock is held. */
        if (state->bucket > afinfo->udp_table->mask)
                return;

        spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
}
EXPORT_SYMBOL(udp_seq_stop);

/* ------------------------------------------------------------------------ */
/*
 * Print one socket line into seq_file @f.  @bucket is printed as the first
 * ("sl") column.  The columns follow the header emitted by udp4_seq_show():
 * sl, local_address, rem_address, st, tx_queue, rx_queue, tr, tm->when,
 * retrnsmt, uid, timeout, inode, ref, pointer, drops.
 */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
                int bucket)
{
        struct inet_sock *inet = inet_sk(sp);
        __be32 dest = inet->inet_daddr;
        __be32 src  = inet->inet_rcv_saddr;
        /* ports are converted to host byte order for printing */
        __u16 destp          = ntohs(inet->inet_dport);
        __u16 srcp          = ntohs(inet->inet_sport);

        seq_printf(f, "%5d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
                bucket, src, srcp, dest, destp, sp->sk_state,
                sk_wmem_alloc_get(sp),
                udp_rqueue_get(sp),
                /* tr, tm->when and retrnsmt are printed as constant zeros */
                0, 0L, 0,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
                /* timeout is printed as a constant zero */
                0, sock_i_ino(sp),
                refcount_read(&sp->sk_refcnt), sp,
                atomic_read(&sp->sk_drops));
}

/*
 * seq_file ->show for the "udp" proc file: emits the column header for
 * SEQ_START_TOKEN and one formatted socket line otherwise.  Every record
 * is padded to a width of 127 characters and ends with a newline.
 * Always returns 0.
 */
int udp4_seq_show(struct seq_file *seq, void *v)
{
        seq_setwidth(seq, 127);

        if (v != SEQ_START_TOKEN) {
                struct udp_iter_state *state = seq->private;

                udp4_format_sock(v, seq, state->bucket);
        } else {
                seq_puts(seq, "   sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode ref pointer drops");
        }

        seq_pad(seq, '\n');
        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Context passed to a BPF program by the "udp" iterator; it is filled in
 * by udp_prog_seq_show().  Every member is 8-byte aligned.
 */
struct bpf_iter__udp {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);        /* iterator metadata */
        __bpf_md_ptr(struct udp_sock *, udp_sk);        /* current socket, may be NULL */
        uid_t uid __aligned(8);                                /* uid value supplied by the caller */
        int bucket __aligned(8);                        /* bucket value supplied by the caller */
};

/*
 * Build the iterator context from the arguments and run @prog on it.
 * meta->seq_num is decremented first so that the SEQ_START_TOKEN record is
 * not counted.  Returns whatever bpf_iter_run_prog() returns.
 */
static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
                             struct udp_sock *udp_sk, uid_t uid, int bucket)
{
        struct bpf_iter__udp iter_ctx;

        /* skip SEQ_START_TOKEN */
        meta->seq_num -= 1;

        iter_ctx.bucket = bucket;
        iter_ctx.uid = uid;
        iter_ctx.udp_sk = udp_sk;
        iter_ctx.meta = meta;

        return bpf_iter_run_prog(prog, &iter_ctx);
}

/*
 * seq_file ->show for the BPF udp iterator.  The header token produces no
 * output (returns 0); for a socket, the owner uid is translated into the
 * seq_file's user namespace and the BPF program is run with the socket,
 * that uid and the current bucket.
 */
static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
{
        struct udp_iter_state *iter = seq->private;
        struct sock *owner_sk = v;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
        uid_t owner;

        if (v == SEQ_START_TOKEN)
                return 0;

        owner = from_kuid_munged(seq_user_ns(seq), sock_i_uid(owner_sk));

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);

        return udp_prog_seq_show(prog, &meta, v, owner, iter->bucket);
}

static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        if (!v) {
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)udp_prog_seq_show(prog, &meta, v, 0, 0);
        }

        udp_seq_stop(seq, v);
}

/*
 * seq_file operations for the BPF udp iterator: the shared start/next
 * walkers combined with the BPF-specific stop and show handlers.
 */
static const struct seq_operations bpf_iter_udp_seq_ops = {
        .start                = udp_seq_start,
        .next                = udp_seq_next,
        .stop                = bpf_iter_udp_seq_stop,
        .show                = bpf_iter_udp_seq_show,
};
#endif

/*
 * seq_file operations behind the "udp" proc file registered in
 * udp4_proc_init_net(); exported for use outside this file.
 */
const struct seq_operations udp_seq_ops = {
        .start                = udp_seq_start,
        .next                = udp_seq_next,
        .stop                = udp_seq_stop,
        .show                = udp4_seq_show,
};
EXPORT_SYMBOL(udp_seq_ops);

/*
 * Iteration parameters for the "udp" proc file: only AF_INET sockets from
 * the global udp_table are reported.
 */
static struct udp_seq_afinfo udp4_seq_afinfo = {
        .family                = AF_INET,
        .udp_table        = &udp_table,
};

/*
 * Per-netns init: create the read-only (0444) "udp" entry under the
 * namespace's proc_net directory, backed by udp_seq_ops with
 * udp4_seq_afinfo as its data.  Returns 0 on success, -ENOMEM if the entry
 * could not be created.
 */
static int __net_init udp4_proc_init_net(struct net *net)
{
        return proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
                        sizeof(struct udp_iter_state), &udp4_seq_afinfo) ?
                0 : -ENOMEM;
}

/* Per-netns exit: remove the "udp" entry from the namespace's proc_net. */
static void __net_exit udp4_proc_exit_net(struct net *net)
{
        remove_proc_entry("udp", net->proc_net);
}

/* Per-network-namespace hooks that create and remove the "udp" proc file. */
static struct pernet_operations udp4_net_ops = {
        .init = udp4_proc_init_net,
        .exit = udp4_proc_exit_net,
};

/*
 * Register the per-netns proc hooks (udp4_net_ops).  Returns the result of
 * register_pernet_subsys().
 */
int __init udp4_proc_init(void)
{
        return register_pernet_subsys(&udp4_net_ops);
}

/* Unregister the per-netns proc hooks registered by udp4_proc_init(). */
void udp4_proc_exit(void)
{
        unregister_pernet_subsys(&udp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

static __initdata unsigned long uhash_entries;
/*
 * Handler for the "uhash_entries=" boot parameter: parse @str (base
 * auto-detected) into uhash_entries.  A non-zero value below
 * UDP_HTABLE_SIZE_MIN is raised to that minimum; zero is kept as is.
 *
 * Returns 1 when the value was parsed, 0 when @str is NULL or does not
 * parse.
 */
static int __init set_uhash_entries(char *str)
{
        if (!str || kstrtoul(str, 0, &uhash_entries))
                return 0;

        if (uhash_entries != 0 && uhash_entries < UDP_HTABLE_SIZE_MIN)
                uhash_entries = UDP_HTABLE_SIZE_MIN;

        return 1;
}
__setup("uhash_entries=", set_uhash_entries);

/*
 * Allocate and initialise a UDP hash table.
 *
 * @table: table to set up; its hash, hash2, log and mask members are
 *         written here
 * @name:  name passed to alloc_large_system_hash()
 *
 * A single allocation holds both slot arrays: each hash entry is sized as
 * two struct udp_hslot, and hash2 starts right after the first mask + 1
 * slots.  Every slot in both arrays gets an empty list head, a zero count
 * and an initialised lock.
 */
void __init udp_table_init(struct udp_table *table, const char *name)
{
        unsigned int i;

        table->hash = alloc_large_system_hash(name,
                                              2 * sizeof(struct udp_hslot),
                                              uhash_entries,
                                              21, /* one slot per 2 MB */
                                              0,
                                              &table->log,
                                              &table->mask,
                                              UDP_HTABLE_SIZE_MIN,
                                              64 * 1024);

        /* second half of the allocation serves as the secondary table */
        table->hash2 = table->hash + (table->mask + 1);
        for (i = 0; i <= table->mask; i++) {
                INIT_HLIST_HEAD(&table->hash[i].head);
                table->hash[i].count = 0;
                spin_lock_init(&table->hash[i].lock);
        }
        for (i = 0; i <= table->mask; i++) {
                INIT_HLIST_HEAD(&table->hash2[i].head);
                table->hash2[i].count = 0;
                spin_lock_init(&table->hash2[i].lock);
        }
}

/*
 * Return the value held in a function-local static, which is handed to
 * net_get_random_once() on every call before being returned.
 * NOTE(review): presumably the helper fills it with random bytes on first
 * use only, so all callers see the same seed -- confirm against
 * net_get_random_once().
 */
u32 udp_flow_hashrnd(void)
{
        static u32 hashrnd __read_mostly;

        net_get_random_once(&hashrnd, sizeof(hashrnd));

        return hashrnd;
}
EXPORT_SYMBOL(udp_flow_hashrnd);

/*
 * Set the UDP sysctl defaults of @net: both the rmem and wmem minimums
 * start at SK_MEM_QUANTUM, and (with CONFIG_NET_L3_MASTER_DEV)
 * udp_l3mdev_accept starts at 0.
 */
static void __udp_sysctl_init(struct net *net)
{
        net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
        net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;

#ifdef CONFIG_NET_L3_MASTER_DEV
        net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
}

/*
 * Per-netns init callback: apply the UDP sysctl defaults to @net.
 * Always returns 0.
 */
static int __net_init udp_sysctl_init(struct net *net)
{
        __udp_sysctl_init(net);
        return 0;
}

/* Per-netns hooks for the UDP sysctl defaults; only an init hook is set. */
static struct pernet_operations __net_initdata udp_sysctl_ops = {
        .init        = udp_sysctl_init,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declare the BPF iterator function for the "udp" target.  The argument
 * list mirrors the members of struct bpf_iter__udp: meta, udp_sk, uid and
 * bucket.
 */
DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
                     struct udp_sock *udp_sk, uid_t uid, int bucket)

static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct udp_iter_state *st = priv_data;
        struct udp_seq_afinfo *afinfo;
        int ret;

        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
        if (!afinfo)
                return -ENOMEM;

        afinfo->family = AF_UNSPEC;
        afinfo->udp_table = &udp_table;
        st->bpf_seq_afinfo = afinfo;
        ret = bpf_iter_init_seq_net(priv_data, aux);
        if (ret)
                kfree(afinfo);
        return ret;
}

/*
 * Tear down the private state of a BPF udp iterator instance: free the
 * afinfo stored by bpf_iter_init_udp(), then release the
 * network-namespace part of the state.
 */
static void bpf_iter_fini_udp(void *priv_data)
{
        struct udp_iter_state *st = priv_data;

        kfree(st->bpf_seq_afinfo);
        bpf_iter_fini_seq_net(priv_data);
}

/*
 * seq_file description of the BPF udp iterator: its operations, the
 * init/fini callbacks for the private state, and that state's size.
 */
static const struct bpf_iter_seq_info udp_seq_info = {
        .seq_ops                = &bpf_iter_udp_seq_ops,
        .init_seq_private        = bpf_iter_init_udp,
        .fini_seq_private        = bpf_iter_fini_udp,
        .seq_priv_size                = sizeof(struct udp_iter_state),
};

/*
 * Registration record for the "udp" BPF iterator target.  It describes one
 * context argument: the udp_sk member of struct bpf_iter__udp, typed as a
 * BTF pointer that may be NULL.  The btf_id of that argument is filled in
 * at boot by bpf_iter_register().
 */
static struct bpf_iter_reg udp_reg_info = {
        .target                        = "udp",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__udp, udp_sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &udp_seq_info,
};

/*
 * Fill in the BTF id of the udp_sk context argument and register the "udp"
 * iterator target.  A registration failure is only logged as a warning.
 */
static void __init bpf_iter_register(void)
{
        int err;

        udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];

        err = bpf_iter_reg_target(&udp_reg_info);
        if (err)
                pr_warn("Warning: could not register bpf iterator udp\n");
}
#endif

/*
 * Boot-time initialisation of UDP:
 *  - set up the global hash table;
 *  - derive the sysctl_udp_mem[] limits from the number of free buffer
 *    pages (one eighth of them, but at least 128);
 *  - apply the sysctl defaults to init_net and register the per-netns
 *    sysctl init hook;
 *  - allocate and initialise the udp_busylocks array;
 *  - register the BPF udp iterator when BPF syscall and procfs support are
 *    both configured.
 *
 * Failing to allocate the busylocks or to register the sysctl hook panics.
 */
void __init udp_init(void)
{
        unsigned long limit;
        unsigned int i;

        udp_table_init(&udp_table, "UDP");

        limit = nr_free_buffer_pages() / 8;
        if (limit < 128UL)
                limit = 128UL;

        sysctl_udp_mem[0] = limit / 4 * 3;
        sysctl_udp_mem[1] = limit;
        sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;

        __udp_sysctl_init(&init_net);

        /* 16 spinlocks per cpu */
        udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
        udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
                                GFP_KERNEL);
        if (udp_busylocks == NULL)
                panic("UDP: failed to alloc udp_busylocks\n");

        i = 0;
        while (i < (1U << udp_busylocks_log)) {
                spin_lock_init(udp_busylocks + i);
                i++;
        }

        if (register_pernet_subsys(&udp_sysctl_ops) != 0)
                panic("UDP: failed to init sysctl parameters.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif
}































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Released under the GPLv2 only.
 */

#include <linux/pm.h>
#include <linux/acpi.h>

struct usb_hub_descriptor;
struct usb_dev_state;

/* Functions local to drivers/usb/core/ */

extern int usb_create_sysfs_dev_files(struct usb_device *dev);
extern void usb_remove_sysfs_dev_files(struct usb_device *dev);
extern void usb_create_sysfs_intf_files(struct usb_interface *intf);
extern void usb_remove_sysfs_intf_files(struct usb_interface *intf);
extern int usb_create_ep_devs(struct device *parent,
                                struct usb_host_endpoint *endpoint,
                                struct usb_device *udev);
extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint);

extern void usb_enable_endpoint(struct usb_device *dev,
                struct usb_host_endpoint *ep, bool reset_toggle);
extern void usb_enable_interface(struct usb_device *dev,
                struct usb_interface *intf, bool reset_toggles);
extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr,
                bool reset_hardware);
extern void usb_disable_interface(struct usb_device *dev,
                struct usb_interface *intf, bool reset_hardware);
extern void usb_release_interface_cache(struct kref *ref);
extern void usb_disable_device(struct usb_device *dev, int skip_ep0);
extern int usb_deauthorize_device(struct usb_device *);
extern int usb_authorize_device(struct usb_device *);
extern void usb_deauthorize_interface(struct usb_interface *);
extern void usb_authorize_interface(struct usb_interface *);
extern void usb_detect_quirks(struct usb_device *udev);
extern void usb_detect_interface_quirks(struct usb_device *udev);
extern void usb_release_quirk_list(void);
extern bool usb_endpoint_is_ignored(struct usb_device *udev,
                struct usb_host_interface *intf,
                struct usb_endpoint_descriptor *epd);
extern int usb_remove_device(struct usb_device *udev);

extern struct usb_device_descriptor *usb_get_device_descriptor(
                struct usb_device *udev);
extern int usb_set_isoch_delay(struct usb_device *dev);
extern int usb_get_bos_descriptor(struct usb_device *dev);
extern void usb_release_bos_descriptor(struct usb_device *dev);
extern char *usb_cache_string(struct usb_device *udev, int index);
extern int usb_set_configuration(struct usb_device *dev, int configuration);
extern int usb_choose_configuration(struct usb_device *udev);
extern int usb_generic_driver_probe(struct usb_device *udev);
extern void usb_generic_driver_disconnect(struct usb_device *udev);
extern int usb_generic_driver_suspend(struct usb_device *udev,
                pm_message_t msg);
extern int usb_generic_driver_resume(struct usb_device *udev,
                pm_message_t msg);

/*
 * Return the bMaxPower value of configuration @c scaled by the unit that
 * applies to @udev's speed: 8 for SuperSpeed and faster devices, 2
 * otherwise (mA per the unit comment below).
 */
static inline unsigned usb_get_max_power(struct usb_device *udev,
                struct usb_host_config *c)
{
        unsigned mul;

        /* SuperSpeed power is in 8 mA units; others are in 2 mA units */
        if (udev->speed >= USB_SPEED_SUPER)
                mul = 8;
        else
                mul = 2;

        return c->desc.bMaxPower * mul;
}

extern void usb_kick_hub_wq(struct usb_device *dev);
extern int usb_match_one_id_intf(struct usb_device *dev,
                                 struct usb_host_interface *intf,
                                 const struct usb_device_id *id);
extern int usb_match_device(struct usb_device *dev,
                            const struct usb_device_id *id);
extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev,
                                const struct usb_device_id *id);
extern bool usb_driver_applicable(struct usb_device *udev,
                                  struct usb_device_driver *udrv);
extern void usb_forced_unbind_intf(struct usb_interface *intf);
extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev);

extern void usb_hub_release_all_ports(struct usb_device *hdev,
                struct usb_dev_state *owner);
extern bool usb_device_is_owned(struct usb_device *udev);

extern int  usb_hub_init(void);
extern void usb_hub_cleanup(void);
extern int usb_major_init(void);
extern void usb_major_cleanup(void);
extern int usb_device_supports_lpm(struct usb_device *udev);
extern int usb_port_disable(struct usb_device *udev);

#ifdef        CONFIG_PM

extern int usb_suspend(struct device *dev, pm_message_t msg);
extern int usb_resume(struct device *dev, pm_message_t msg);
extern int usb_resume_complete(struct device *dev);

extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg);
extern int usb_port_resume(struct usb_device *dev, pm_message_t msg);

extern void usb_autosuspend_device(struct usb_device *udev);
extern int usb_autoresume_device(struct usb_device *udev);
extern int usb_remote_wakeup(struct usb_device *dev);
extern int usb_runtime_suspend(struct device *dev);
extern int usb_runtime_resume(struct device *dev);
extern int usb_runtime_idle(struct device *dev);
extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev);
extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev);

extern void usbfs_notify_suspend(struct usb_device *udev);
extern void usbfs_notify_resume(struct usb_device *udev);

#else

/* Stub used when CONFIG_PM is not defined: does nothing and returns 0. */
static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg)
{
        return 0;
}

/* Stub used when CONFIG_PM is not defined: does nothing and returns 0. */
static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg)
{
        return 0;
}

/*
 * Without CONFIG_PM, autosuspend expands to an empty statement and
 * autoresume always returns 0.
 */
#define usb_autosuspend_device(udev)                do {} while (0)
static inline int usb_autoresume_device(struct usb_device *udev)
{
        return 0;
}

/* Stub used when CONFIG_PM is not defined: does nothing and returns 0. */
static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev)
{
        return 0;
}

/* Stub used when CONFIG_PM is not defined: does nothing and returns 0. */
static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
{
        return 0;
}

#endif

extern struct bus_type usb_bus_type;
extern struct mutex usb_port_peer_mutex;
extern struct device_type usb_device_type;
extern struct device_type usb_if_device_type;
extern struct device_type usb_ep_device_type;
extern struct device_type usb_port_device_type;
extern struct usb_device_driver usb_generic_driver;

/* Non-zero when @dev's device type is usb_device_type. */
static inline int is_usb_device(const struct device *dev)
{
        return dev->type == &usb_device_type;
}

/* Non-zero when @dev's device type is usb_if_device_type. */
static inline int is_usb_interface(const struct device *dev)
{
        return dev->type == &usb_if_device_type;
}

/* Non-zero when @dev's device type is usb_ep_device_type. */
static inline int is_usb_endpoint(const struct device *dev)
{
        return dev->type == &usb_ep_device_type;
}

/* Non-zero when @dev's device type is usb_port_device_type. */
static inline int is_usb_port(const struct device *dev)
{
        return dev->type == &usb_port_device_type;
}

/* Non-zero when @udev has no parent device, i.e. it is a root hub. */
static inline int is_root_hub(struct usb_device *udev)
{
        return (udev->parent == NULL);
}

/* Do the same for device drivers and interface drivers. */

static inline int is_usb_device_driver(struct device_driver *drv)
{
        return container_of(drv, struct usbdrv_wrap, driver)->
                        for_devices;
}

/* for labeling diagnostics */
extern const char *usbcore_name;

/* sysfs stuff */
extern const struct attribute_group *usb_device_groups[];
extern const struct attribute_group *usb_interface_groups[];

/* usbfs stuff */
extern struct usb_driver usbfs_driver;
extern const struct file_operations usbfs_devices_fops;
extern const struct file_operations usbdev_file_operations;

extern int usb_devio_init(void);
extern void usb_devio_cleanup(void);

/*
 * Firmware specific cookie identifying a port's location. '0' == no location
 * data available
 */
typedef u32 usb_port_location_t;

/* internal notify stuff */
extern void usb_notify_add_device(struct usb_device *udev);
extern void usb_notify_remove_device(struct usb_device *udev);
extern void usb_notify_add_bus(struct usb_bus *ubus);
extern void usb_notify_remove_bus(struct usb_bus *ubus);
extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev,
                struct usb_hub_descriptor *desc);

#ifdef CONFIG_ACPI
extern int usb_acpi_register(void);
extern void usb_acpi_unregister(void);
extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev,
        int port1);
#else
/* Stubs used when CONFIG_ACPI is not defined: register returns 0, unregister does nothing. */
static inline int usb_acpi_register(void) { return 0; }
static inline void usb_acpi_unregister(void) { }
#endif







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  ALSA sequencer Timer
 *  Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl>
 */
#ifndef __SND_SEQ_TIMER_H
#define __SND_SEQ_TIMER_H

#include <sound/timer.h>
#include <sound/seq_kernel.h>

/*
 * Tick accumulator, advanced by snd_seq_timer_update_tick(): elapsed time
 * is added to 'fraction', every whole 'resolution' contained in it bumps
 * 'cur_tick', and the remainder stays in 'fraction'.
 */
struct snd_seq_timer_tick {
        snd_seq_tick_time_t        cur_tick;        /* current tick */
        unsigned long                resolution;        /* time per tick in nsec */
        unsigned long                fraction;        /* elapsed time not yet counted as a whole tick (below resolution after an update) */
};

/*
 * State of one sequencer timer: running/initialized flags, tempo and ppq,
 * the current position in real time and in ticks, the identity of the ALSA
 * timer in use, skew settings, and a spinlock.
 */
struct snd_seq_timer {
        /* ... tempo / offset / running state */

        unsigned int                running:1,        /* running state of queue */        
                                initialized:1;        /* timer is initialized */

        unsigned int                tempo;                /* current tempo, us/tick */
        int                        ppq;                /* time resolution, ticks/quarter */

        snd_seq_real_time_t        cur_time;        /* current time */
        struct snd_seq_timer_tick        tick;        /* current tick */
        int tick_updated;        /* NOTE(review): presumably flags that 'tick' changed -- confirm in seq_timer.c */
        
        int                        type;                /* timer type */
        struct snd_timer_id        alsa_id;        /* ALSA's timer ID */
        struct snd_timer_instance        *timeri;        /* timer instance */
        unsigned int                ticks;
        unsigned long                preferred_resolution; /* timer resolution, ticks/sec */

        unsigned int skew;        /* set together with skew_base via snd_seq_timer_set_skew() */
        unsigned int skew_base;

        struct timespec64        last_update;         /* time of last clock update, used for interpolation */

        spinlock_t lock;        /* NOTE(review): presumably protects the fields above -- confirm */
};


/* create new timer (constructor) */
struct snd_seq_timer *snd_seq_timer_new(void);

/* delete timer (destructor) */
void snd_seq_timer_delete(struct snd_seq_timer **tmr);

/*
 * Advance @tick by @resolution worth of elapsed time: the time is added to
 * the pending fraction, each whole tick->resolution contained in the sum
 * increments cur_tick, and the remainder is kept as the new fraction.
 * Nothing happens when tick->resolution is zero.
 */
static inline void snd_seq_timer_update_tick(struct snd_seq_timer_tick *tick,
                                             unsigned long resolution)
{
        unsigned long pending;

        if (tick->resolution == 0)
                return;

        pending = tick->fraction + resolution;
        tick->cur_tick += (unsigned int)(pending / tick->resolution);
        tick->fraction = pending % tick->resolution;
}


/*
 * Compare two tick timestamps.
 * Returns 1 if *a >= *b, otherwise 0.
 */
static inline int snd_seq_compare_tick_time(snd_seq_tick_time_t *a, snd_seq_tick_time_t *b)
{
        if (*a < *b)
                return 0;
        return 1;
}

/*
 * Compare two real-time timestamps, seconds first and nanoseconds as the
 * tie-breaker.  Returns 1 if *a >= *b, otherwise 0.
 */
static inline int snd_seq_compare_real_time(snd_seq_real_time_t *a, snd_seq_real_time_t *b)
{
        /* differing seconds decide the comparison on their own */
        if (a->tv_sec != b->tv_sec)
                return a->tv_sec > b->tv_sec;

        return a->tv_nsec >= b->tv_nsec;
}


/*
 * Normalise @tm: while tv_nsec holds a full second (1000000000 ns) or
 * more, move one second from tv_nsec into tv_sec.
 */
static inline void snd_seq_sanity_real_time(snd_seq_real_time_t *tm)
{
        /* roll-over, one second per iteration */
        for (; tm->tv_nsec >= 1000000000; tm->tv_sec++)
                tm->tv_nsec -= 1000000000;
}


/*
 * Increment timestamp @tm by @inc: seconds and nanoseconds are added
 * field by field, then @tm is normalised with snd_seq_sanity_real_time().
 */
static inline void snd_seq_inc_real_time(snd_seq_real_time_t *tm, snd_seq_real_time_t *inc)
{
        tm->tv_sec  += inc->tv_sec;
        tm->tv_nsec += inc->tv_nsec;
        snd_seq_sanity_real_time(tm);
}

/*
 * Increment timestamp @tm by @nsec nanoseconds, then normalise it with
 * snd_seq_sanity_real_time().
 */
static inline void snd_seq_inc_time_nsec(snd_seq_real_time_t *tm, unsigned long nsec)
{
        tm->tv_nsec  += nsec;
        snd_seq_sanity_real_time(tm);
}

/* called by timer isr */
struct snd_seq_queue;
int snd_seq_timer_open(struct snd_seq_queue *q);
int snd_seq_timer_close(struct snd_seq_queue *q);
int snd_seq_timer_midi_open(struct snd_seq_queue *q);
int snd_seq_timer_midi_close(struct snd_seq_queue *q);
void snd_seq_timer_defaults(struct snd_seq_timer *tmr);
void snd_seq_timer_reset(struct snd_seq_timer *tmr);
int snd_seq_timer_stop(struct snd_seq_timer *tmr);
int snd_seq_timer_start(struct snd_seq_timer *tmr);
int snd_seq_timer_continue(struct snd_seq_timer *tmr);
int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo);
int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq);
int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position);
int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position);
int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base);
snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
                                               bool adjust_ktime);
snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr);

extern int seq_default_timer_class;
extern int seq_default_timer_sclass;
extern int seq_default_timer_card;
extern int seq_default_timer_device;
extern int seq_default_timer_subdevice;
extern int seq_default_timer_resolution;

#endif































































































































































































































































































































































































































































































































    1 









    1 

























    1 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
#include <linux/topology.h>

struct vm_area_struct;

/*
 * In case of changes, please don't forget to update
 * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
 */

/*
 * Plain integer GFP bitmasks. Do not use this directly.
 *
 * Every flag owns one distinct bit, with values ascending from 0x01.
 * ___GFP_NOLOCKDEP only takes a bit when CONFIG_LOCKDEP is enabled;
 * otherwise it is defined as 0 and so sets no bit at all.
 */
#define ___GFP_DMA                0x01u
#define ___GFP_HIGHMEM                0x02u
#define ___GFP_DMA32                0x04u
#define ___GFP_MOVABLE                0x08u
#define ___GFP_RECLAIMABLE        0x10u
#define ___GFP_HIGH                0x20u
#define ___GFP_IO                0x40u
#define ___GFP_FS                0x80u
#define ___GFP_ZERO                0x100u
#define ___GFP_ATOMIC                0x200u
#define ___GFP_DIRECT_RECLAIM        0x400u
#define ___GFP_KSWAPD_RECLAIM        0x800u
#define ___GFP_WRITE                0x1000u
#define ___GFP_NOWARN                0x2000u
#define ___GFP_RETRY_MAYFAIL        0x4000u
#define ___GFP_NOFAIL                0x8000u
#define ___GFP_NORETRY                0x10000u
#define ___GFP_MEMALLOC                0x20000u
#define ___GFP_COMP                0x40000u
#define ___GFP_NOMEMALLOC        0x80000u
#define ___GFP_HARDWALL                0x100000u
#define ___GFP_THISNODE                0x200000u
#define ___GFP_ACCOUNT                0x400000u
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP        0x800000u
#else
#define ___GFP_NOLOCKDEP        0
#endif
/* If the above are modified, __GFP_BITS_SHIFT may need updating */

/*
 * Physical address zone modifiers (see linux/mmzone.h - low four bits)
 *
 * Do not put any conditional on these. If necessary modify the definitions
 * without the underscores and use them consistently. The definitions here may
 * be used in bit comparisons.
 */
#define __GFP_DMA        ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM        ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32        ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE        ((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
/* Union of the four zone-modifier bits; gfp_zone() masks its flags with this. */
#define GFP_ZONEMASK        (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)

/**
 * DOC: Page mobility and placement hints
 *
 * Page mobility and placement hints
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * These flags provide hints about how mobile the page is. Pages with similar
 * mobility are placed within the same pageblocks to minimise problems due
 * to external fragmentation.
 *
 * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
 * moved by page migration during memory compaction or can be reclaimed.
 *
 * %__GFP_RECLAIMABLE is used for slab allocations that specify
 * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
 *
 * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
 * these pages will be spread between local zones to avoid all the dirty
 * pages being in one zone (fair zone allocation policy).
 *
 * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
 *
 * %__GFP_THISNODE forces the allocation to be satisfied from the requested
 * node with no fallbacks or placement policy enforcements.
 *
 * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
 */
/* gfp_t-typed forms of the corresponding ___GFP_* placement bits. */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE        ((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE        ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT        ((__force gfp_t)___GFP_ACCOUNT)

/**
 * DOC: Watermark modifiers
 *
 * Watermark modifiers -- controls access to emergency reserves
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * %__GFP_HIGH indicates that the caller is high-priority and that granting
 * the request is necessary before the system can make forward progress.
 * For example, creating an IO context to clean pages.
 *
 * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
 * high priority. Users are typically interrupt handlers. This may be
 * used in conjunction with %__GFP_HIGH.
 *
 * %__GFP_MEMALLOC allows access to all memory. This should only be used when
 * the caller guarantees the allocation will allow more memory to be freed
 * very shortly e.g. process exiting or swapping. Users either should
 * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
 * Users of this flag have to be extremely careful to not deplete the reserve
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory.
 * Usage of a pre-allocated pool (e.g. mempool) should be always considered
 * before using this flag.
 *
 * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
 * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
 */
/* gfp_t-typed forms of the corresponding ___GFP_* watermark bits. */
#define __GFP_ATOMIC        ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH        ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC        ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)

/**
 * DOC: Reclaim modifiers
 *
 * Reclaim modifiers
 * ~~~~~~~~~~~~~~~~~
 * Please note that all the following flags are only applicable to sleepable
 * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
 *
 * %__GFP_IO can start physical IO.
 *
 * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
 * allocator recursing into the filesystem which might already be holding
 * locks.
 *
 * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
 * This flag can be cleared to avoid unnecessary delays when a fallback
 * option is available.
 *
 * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
 * the low watermark is reached and have it reclaim pages until the high
 * watermark is reached. A caller may wish to clear this flag when fallback
 * options are available and the reclaim is likely to disrupt the system. The
 * canonical example is THP allocation where a fallback is cheap but
 * reclaim/compaction may cause indirect stalls.
 *
 * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
 *
 * The default allocator behavior depends on the request size. We have a concept
 * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
 * !costly allocations are too essential to fail so they are implicitly
 * non-failing by default (with some exceptions like OOM victims might fail so
 * the caller still has to check for failures) while costly requests try to be
 * not disruptive and back off even without invoking the OOM killer.
 * The following three modifiers might be used to override some of these
 * implicit rules:
 *
 * %__GFP_NORETRY: The VM implementation will try only very lightweight
 * memory direct reclaim to get some memory under memory pressure (thus
 * it can sleep). It will avoid disruptive actions like OOM killer. The
 * caller must handle the failure which is quite likely to happen under
 * heavy memory pressure. The flag is suitable when failure can easily be
 * handled at small cost, such as reduced throughput.
 *
 * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
 * procedures that have previously failed if there is some indication
 * that progress has been made elsewhere.  It can wait for other
 * tasks to attempt high level approaches to freeing memory such as
 * compaction (which removes fragmentation) and page-out.
 * There is still a definite limit to the number of retries, but it is
 * a larger limit than with %__GFP_NORETRY.
 * Allocations with this flag may fail, but only when there is
 * genuinely little unused memory. While these allocations do not
 * directly trigger the OOM killer, their failure indicates that
 * the system is likely to need to use the OOM killer soon.  The
 * caller must handle failure, but can reasonably do so by failing
 * a higher-level request, or completing it only in a much less
 * efficient manner.
 * If the allocation does fail, and the caller is in a position to
 * free some non-essential memory, doing so could benefit the system
 * as a whole.
 *
 * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
 * cannot handle allocation failures. The allocation could block
 * indefinitely but will never return with failure. Testing for
 * failure is pointless.
 * New users should be evaluated carefully (and the flag should be
 * used only when there is no reasonable failure policy) but it is
 * definitely preferable to use the flag rather than opencode endless
 * loop around allocator.
 * Using this flag for costly allocations is _highly_ discouraged.
 */
/* gfp_t-typed forms of the corresponding ___GFP_* reclaim bits. */
#define __GFP_IO        ((__force gfp_t)___GFP_IO)
#define __GFP_FS        ((__force gfp_t)___GFP_FS)
#define __GFP_DIRECT_RECLAIM        ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
#define __GFP_KSWAPD_RECLAIM        ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
/* Both reclaim bits (direct and kswapd) OR-ed together. */
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define __GFP_RETRY_MAYFAIL        ((__force gfp_t)___GFP_RETRY_MAYFAIL)
#define __GFP_NOFAIL        ((__force gfp_t)___GFP_NOFAIL)
#define __GFP_NORETRY        ((__force gfp_t)___GFP_NORETRY)

/**
 * DOC: Action modifiers
 *
 * Action modifiers
 * ~~~~~~~~~~~~~~~~
 *
 * %__GFP_NOWARN suppresses allocation failure reports.
 *
 * %__GFP_COMP addresses compound page metadata.
 *
 * %__GFP_ZERO returns a zeroed page on success.
 */
/* gfp_t-typed forms of the corresponding ___GFP_* action bits. */
#define __GFP_NOWARN        ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP        ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO        ((__force gfp_t)___GFP_ZERO)

/*
 * Disable lockdep for GFP context tracking.
 * Evaluates to 0 when CONFIG_LOCKDEP is off, because ___GFP_NOLOCKDEP is 0
 * in that configuration.
 */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)

/*
 * Room for N __GFP_FOO bits.
 *
 * 23 bits are always in use (0x01 through 0x400000); one more is added for
 * ___GFP_NOLOCKDEP when CONFIG_LOCKDEP is enabled. __GFP_BITS_MASK has all
 * of those low bits set.
 */
#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))

/**
 * DOC: Useful GFP flag combinations
 *
 * Useful GFP flag combinations
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Useful GFP flag combinations that are commonly used. It is recommended
 * that subsystems start with one of these combinations and then set/clear
 * %__GFP_FOO flags as necessary.
 *
 * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
 * watermark is applied to allow access to "atomic reserves".
 * The current implementation doesn't support NMI and few other strict
 * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
 *
 * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
 * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
 *
 * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
 * accounted to kmemcg.
 *
 * %GFP_NOWAIT is for kernel allocations that should not stall for direct
 * reclaim, start physical IO or use any filesystem callback.
 *
 * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
 * that do not require the starting of any physical IO.
 * Please try to avoid using this flag directly and instead use
 * memalloc_noio_{save,restore} to mark the whole scope which cannot
 * perform any IO with a short explanation why. All allocation requests
 * will inherit GFP_NOIO implicitly.
 *
 * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
 * Please try to avoid using this flag directly and instead use
 * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
 * recurse into the FS layer with a short explanation why. All allocation
 * requests will inherit GFP_NOFS implicitly.
 *
 * %GFP_USER is for userspace allocations that also need to be directly
 * accessible by the kernel or hardware. It is typically used by hardware
 * for buffers that are mapped to userspace (e.g. graphics) that hardware
 * still must DMA to. cpuset limits are enforced for these allocations.
 *
 * %GFP_DMA exists for historical reasons and should be avoided where possible.
 * The flag indicates that the caller requires that the lowest zone be
 * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
 * it would require careful auditing as some users really require it and
 * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
 * lowest zone as a type of emergency reserve.
 *
 * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
 * address.
 *
 * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
 * do not need to be directly accessible by the kernel but that cannot
 * move once in use. An example may be a hardware allocation that maps
 * data directly into userspace but has no addressing limitations.
 *
 * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
 * need direct access to but can use kmap() when access is required. They
 * are expected to be movable via page reclaim or page migration. Typically,
 * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
 *
 * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
 * are compound allocations that will generally fail quickly if memory is not
 * available and will not wake kswapd/kcompactd on failure. The _LIGHT
 * version does not attempt reclaim/compaction at all and is by default used
 * in page fault path, while the non-light is used by khugepaged.
 */
#define GFP_ATOMIC        (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL        (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT        (__GFP_KSWAPD_RECLAIM)
#define GFP_NOIO        (__GFP_RECLAIM)
#define GFP_NOFS        (__GFP_RECLAIM | __GFP_IO)
#define GFP_USER        (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_DMA                __GFP_DMA
#define GFP_DMA32        __GFP_DMA32
#define GFP_HIGHUSER        (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE        (GFP_HIGHUSER | __GFP_MOVABLE)
/*
 * GFP_TRANSHUGE_LIGHT strips both reclaim bits that GFP_HIGHUSER_MOVABLE
 * inherits from GFP_USER; GFP_TRANSHUGE puts only __GFP_DIRECT_RECLAIM back,
 * leaving __GFP_KSWAPD_RECLAIM clear.
 */
#define GFP_TRANSHUGE_LIGHT        ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
                         __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE        (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)

/*
 * Convert GFP flags to their corresponding migrate type.
 *
 * The mobility bits (__GFP_MOVABLE = 0x08, __GFP_RECLAIMABLE = 0x10) occupy
 * bits 3 and 4, so masking and shifting right by GFP_MOVABLE_SHIFT yields the
 * migrate type value directly. The two helper macros exist only for this
 * function and are undefined again right after it.
 */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        const gfp_t mobility = gfp_flags & GFP_MOVABLE_MASK;

        /* Build-time proof that the shift lines the flag bit up with the enum. */
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

        /* Warn when both mobility bits are requested together. */
        VM_WARN_ON(mobility == GFP_MOVABLE_MASK);

        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return mobility >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

/*
 * Report whether @gfp_flags has %__GFP_DIRECT_RECLAIM set; this header uses
 * that bit as the indication that the allocation may block.
 */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return (gfp_flags & __GFP_DIRECT_RECLAIM) != 0;
}

/**
 * gfpflags_normal_context - is gfp_flags a normal sleepable context?
 * @gfp_flags: gfp_flags to test
 *
 * Checks that @gfp_flags describes an allocation made from the %current
 * context that is also permitted to sleep.
 *
 * Being allowed to block is not the same as owning the %current context.
 * An allocation issued from the direct reclaim path is nested inside
 * whatever %current was doing when the original allocation was made. Such a
 * nested allocation may block, yet touching state that %current owns could
 * break the assumptions of the outer context.
 *
 * Return: %true when %__GFP_DIRECT_RECLAIM is set and %__GFP_MEMALLOC is
 * clear, meaning the allocation context can sleep and use anything that's
 * associated with %current.
 */
static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
{
        const gfp_t relevant = __GFP_DIRECT_RECLAIM | __GFP_MEMALLOC;

        return (gfp_flags & relevant) == __GFP_DIRECT_RECLAIM;
}

/*
 * OPT_ZONE_* name an optional zone when it is configured in and fall back to
 * ZONE_NORMAL when it is not, so GFP_ZONE_TABLE can reference them without
 * further conditionals.
 */
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

/* GFP_ZONES_SHIFT is the width, in bits, of one GFP_ZONE_TABLE entry. */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

/* All 16 table entries have to fit into a single long. */
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * Packed lookup table: the entry for zone-bit combination N is stored at bit
 * offset N * GFP_ZONES_SHIFT. Only the eight permitted combinations are
 * OR-ed in here; the remaining ones are listed in GFP_ZONE_BAD.
 */
#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)                                       \
        | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)               \
        | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)                       \
        | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
        | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
        | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)

/*
 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32,
 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
 * entry starting with bit 0. Bit is set if the combination is not
 * allowed.
 *
 * The bit index is the numeric value of the combined low four flag bits,
 * which is how gfp_zone() tests it.
 */
#define GFP_ZONE_BAD ( \
        1 << (___GFP_DMA | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32)                                      \
        | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
)

/*
 * Look up the zone selected by the zone-modifier bits of @flags.
 *
 * The low four bits of @flags index GFP_ZONE_TABLE, whose entries are
 * GFP_ZONES_SHIFT bits wide. Combinations marked in GFP_ZONE_BAD trip
 * VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int zone_bits = (__force int) (flags & GFP_ZONEMASK);
        const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;

        VM_BUG_ON((GFP_ZONE_BAD >> zone_bits) & 1);

        return (GFP_ZONE_TABLE >> (zone_bits * GFP_ZONES_SHIFT)) & entry_mask;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when CONFIG_NUMA is
 * enabled and __GFP_THISNODE is set, ZONELIST_FALLBACK in every other case.
 */
static inline int gfp_zonelist(gfp_t flags)
{
        int idx = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                idx = ZONELIST_NOFALLBACK;
#endif
        return idx;
}

/*
 * Return the zonelist to use for node @nid and the given gfp flags.
 *
 * A zonelist holds at most MAXNODES*MAX_NR_ZONES zones. Every node has two
 * of them: one covering all zones with memory, and one restricted to the
 * zones of the node that owns the zonelist. gfp_zonelist() decides which of
 * the two is returned.
 *
 * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
 * optimized to &contig_page_data at compile-time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        return &NODE_DATA(nid)->node_zonelists[gfp_zonelist(flags)];
}

/*
 * Default architecture hooks. Each one is compiled only when the matching
 * HAVE_ARCH_* macro is not defined; the defaults do nothing.
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order)
{
}
#endif

#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order)
{
}
#endif

#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
/* The default always returns 0. */
static inline int arch_make_page_accessible(struct page *page)
{
        return 0;
}
#endif

struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                                    int preferred_nid, nodemask_t *nodemask);

/*
 * Convenience form of __alloc_pages_nodemask() that passes NULL as the
 * nodemask.
 */
static inline struct page *__alloc_pages(gfp_t gfp_mask, unsigned int order,
                                         int preferred_nid)
{
        nodemask_t *const no_nodemask = NULL;

        return __alloc_pages_nodemask(gfp_mask, order, preferred_nid,
                                      no_nodemask);
}

/*
 * Allocate pages with @nid as the preferred node. The caller must pass a
 * valid, online node; alloc_pages_node() is the more general interface.
 */
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
        /* @nid has to lie within [0, MAX_NUMNODES). */
        VM_BUG_ON(!(nid >= 0 && nid < MAX_NUMNODES));

        /* Warn about a __GFP_THISNODE request aimed at an offline node. */
        VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));

        return __alloc_pages(gfp_mask, order, nid);
}

/*
 * Allocate pages with @nid as the preferred node. Passing NUMA_NO_NODE
 * selects numa_mem_id(), i.e. the current CPU's closest node; any other
 * value must name a valid, online node.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
{
        const int target = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return __alloc_pages_node(target, gfp_mask, order);
}

/*
 * alloc_pages() and the VMA-aware allocation helpers.
 *
 * With CONFIG_NUMA, alloc_pages() forwards to alloc_pages_current() and
 * alloc_pages_vma() is an out-of-line function. Without it, alloc_pages()
 * goes through alloc_pages_node() on numa_node_id(), and the VMA-aware
 * helpers are macros that use only @gfp_mask and @order.
 */
#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned int order);

static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_current(gfp_mask, order);
}
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr,
                        int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
        alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
/*
 * @vma, @addr, @node and @hugepage are accepted so callers look the same in
 * both configurations, but they are not expanded. The last parameter is
 * named after the NUMA prototype's argument rather than after the `false`
 * constant, so the macro no longer shadows that identifier.
 */
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, hugepage)\
        alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
        alloc_pages(gfp_mask, order)
#endif
/* Order-0 shorthands for alloc_pages() and alloc_pages_vma(). */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr)                        \
        alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)

/*
 * Allocators that hand back an address value (unsigned long) rather than a
 * struct page pointer.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
unsigned long get_zeroed_page(gfp_t gfp_mask);

/* Variants that take a size_t size instead of an order. */
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);

/* Order-0 form of __get_free_pages(). */
#define __get_free_page(gfp_mask) __get_free_pages((gfp_mask), 0)

/* __get_free_pages() with GFP_DMA OR-ed into the mask. */
#define __get_dma_pages(gfp_mask, order) \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))

/* Freeing counterparts: by struct page, by address, and by page list. */
void __free_pages(struct page *page, unsigned int order);
void free_pages(unsigned long addr, unsigned int order);
void free_unref_page(struct page *page);
void free_unref_page_list(struct list_head *list);

/* Page fragment interface; struct page_frag_cache is only forward-declared. */
struct page_frag_cache;
void __page_frag_cache_drain(struct page *page, unsigned int count);
void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz,
                      gfp_t gfp_mask);
void page_frag_free(void *addr);

/* Order-0 forms of __free_pages() and free_pages(). */
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

/* Page allocator initialisation and drain entry points. */
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);

/*
 * gfp_allowed_mask limits which GFP flags may be used at a given time:
 *
 *  - during early boot it is GFP_BOOT_MASK, restricting the flags used
 *    before interrupts are enabled;
 *  - once interrupts are enabled it is __GFP_BITS_MASK while the system is
 *    running;
 *  - during hibernation PM uses it to avoid I/O during memory allocation
 *    while devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/* PM hooks for restricting and restoring the gfp mask. */
void pm_restrict_gfp_mask(void);
void pm_restore_gfp_mask(void);

#ifndef CONFIG_PM_SLEEP
/* Stub for builds without CONFIG_PM_SLEEP: always reports false. */
static inline bool pm_suspended_storage(void)
{
        return false;
}
#else
extern bool pm_suspended_storage(void);
#endif /* !CONFIG_PM_SLEEP */

/*
 * Decide whether @gfp_mask permits compaction. GFP_NOIO is a really tricky
 * context because the migration might require IO, so compaction is allowed
 * only when CONFIG_COMPACTION is enabled and __GFP_IO is set.
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        const bool io_allowed = (gfp_mask & __GFP_IO) != 0;

        return IS_ENABLED(CONFIG_COMPACTION) && io_allowed;
}

#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
int alloc_contig_range(unsigned long start, unsigned long end,
                       unsigned int migratetype, gfp_t gfp_mask);
struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
                                int nid, nodemask_t *nodemask);
#endif
/* Declared unconditionally, unlike the allocation side above. */
void free_contig_range(unsigned long pfn, unsigned int nr_pages);

#ifdef CONFIG_CMA
/* CMA stuff */
void init_cma_reserved_pageblock(struct page *page);
#endif

#endif /* __LINUX_GFP_H */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
/*
 * Performance events x86 architecture header
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>

#include <asm/intel_ds.h>

/* To enable MSR tracing please use the generic trace points. */

/*
 *          |   NHM/WSM    |      SNB     |
 * register -------------------------------
 *          |  HT  | no HT |  HT  | no HT |
 *-----------------------------------------
 * offcore  | core | core  | cpu  | core  |
 * lbr_sel  | core | core  | cpu  | core  |
 * ld_lat   | cpu  | core  | cpu  | core  |
 *-----------------------------------------
 *
 * Given that there is a small number of shared regs,
 * we can pre-allocate their slot in the per-cpu
 * per-core reg tables.
 */
/*
 * Slot index of each extra (shared) register.
 *
 * The non-negative values are used as array indices: EXTRA_REG_MAX sizes
 * intel_shared_regs.regs[], and EVENT_EXTRA_REG() pastes one of these
 * names (minus the EXTRA_REG_ prefix) into extra_reg.idx.
 */
enum extra_reg_type {
        EXTRA_REG_NONE  = -1,        /* not used */

        EXTRA_REG_RSP_0 = 0,        /* offcore_response_0 */
        EXTRA_REG_RSP_1 = 1,        /* offcore_response_1 */
        EXTRA_REG_LBR   = 2,        /* lbr_select */
        EXTRA_REG_LDLAT = 3,        /* ld_lat_threshold */
        EXTRA_REG_FE    = 4,    /* fe_* */

        EXTRA_REG_MAX                /* number of entries needed */
};

/*
 * One event constraint: a counter bitmask plus the range of (masked)
 * event codes it applies to. Built by the *_EVENT_CONSTRAINT*() macros
 * in this file and tested with constraint_match().
 */
struct event_constraint {
        union {
                /* counter bitmask; idxmsk64 overlays the same storage as a u64 */
                unsigned long        idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
                u64                idxmsk64;
        };
        u64                code;        /* first event code of the matched range */
        u64                cmask;        /* mask applied to the event config before comparing */
        int                weight;        /* HWEIGHT() of the counter mask; -1 is the end marker */
        int                overlap;        /* set to 1 by EVENT_CONSTRAINT_OVERLAP() */
        int                flags;        /* PERF_X86_EVENT_* bits passed by the constraint macros */
        unsigned int        size;        /* last code - first code; 0 for a single-code constraint */
};

/*
 * Test whether an event code is covered by constraint @c.
 *
 * @c:     constraint supplying the mask, start code and range size
 * @ecode: event config to test
 *
 * Returns true when (ecode & c->cmask) lies in the inclusive range
 * [c->code, c->code + c->size]. The subtraction is unsigned, so a masked
 * code below c->code wraps to a large value and fails the comparison.
 */
static inline bool constraint_match(struct event_constraint *c, u64 ecode)
{
        u64 offset = (ecode & c->cmask) - c->code;

        return offset <= (u64)c->size;
}

/*
 * struct hw_perf_event.flags flags
 *
 * Each value is a distinct single bit, so several flags can be OR-ed
 * together (the *_DATALA_X* constraint macros in this file combine a
 * PEBS flag with PERF_X86_EVENT_EXCL that way).
 */
#define PERF_X86_EVENT_PEBS_LDLAT        0x0001 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST                0x0002 /* st data address sampling */
#define PERF_X86_EVENT_PEBS_ST_HSW        0x0004 /* haswell style datala, store */
#define PERF_X86_EVENT_PEBS_LD_HSW        0x0008 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW        0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL                0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC                0x0040 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED        0x0080 /* grant rdpmc permission */
#define PERF_X86_EVENT_EXCL_ACCT        0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD        0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS        0x0400 /* use large PEBS */
#define PERF_X86_EVENT_PEBS_VIA_PT        0x0800 /* use PT buffer for PEBS */
#define PERF_X86_EVENT_PAIR                0x1000 /* Large Increment per Cycle */
#define PERF_X86_EVENT_LBR_SELECT        0x2000 /* Save/Restore MSR_LBR_SELECT */
#define PERF_X86_EVENT_TOPDOWN                0x4000 /* Count Topdown slots/metrics events */

/*
 * Returns true when PERF_X86_EVENT_TOPDOWN is set in the event's
 * hw.flags, false otherwise.
 */
static inline bool is_topdown_count(struct perf_event *event)
{
        return (event->hw.flags & PERF_X86_EVENT_TOPDOWN) != 0;
}

/*
 * Returns true when the event's attr.config has no event-select bits
 * set and its INTEL_ARCH_EVENT_MASK portion lies in the inclusive range
 * [INTEL_TD_METRIC_RETIRING, INTEL_TD_METRIC_MAX].
 */
static inline bool is_metric_event(struct perf_event *event)
{
        u64 config = event->attr.config;
        u64 arch_bits;

        /* Any event-select bit set rules the config out. */
        if ((config & ARCH_PERFMON_EVENTSEL_EVENT) != 0)
                return false;

        arch_bits = config & INTEL_ARCH_EVENT_MASK;

        return arch_bits >= INTEL_TD_METRIC_RETIRING &&
               arch_bits <= INTEL_TD_METRIC_MAX;
}

static inline bool is_slots_event(struct perf_event *event)
{
        return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS;
}

/*
 * Returns true when the event is either a metric event or the slots
 * event, as decided by is_metric_event() and is_slots_event().
 */
static inline bool is_topdown_event(struct perf_event *event)
{
        if (is_metric_event(event))
                return true;

        return is_slots_event(event);
}

/*
 * AMD NorthBridge bookkeeping: a reference-counted record holding one
 * owner pointer and one event constraint per counter index
 * (both arrays have X86_PMC_IDX_MAX entries).
 */
struct amd_nb {
        int nb_id;  /* NorthBridge id */
        int refcnt; /* reference count */
        struct perf_event *owners[X86_PMC_IDX_MAX];
        struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

/*
 * PEBS bit layout helpers.
 * NOTE(review): presumably these describe the value kept in
 * cpu_hw_events.pebs_enabled - confirm against the users.
 */
/* Low MAX_PEBS_EVENTS bits set: one bit per PEBS counter. */
#define PEBS_COUNTER_MASK        ((1ULL << MAX_PEBS_EVENTS) - 1)
/* Bit 60. */
#define PEBS_PMI_AFTER_EACH_RECORD BIT_ULL(60)
/* The output selector is a 2-bit field starting at bit 61. */
#define PEBS_OUTPUT_OFFSET        61
#define PEBS_OUTPUT_MASK        (3ull << PEBS_OUTPUT_OFFSET)
/* Output selector value 1. */
#define PEBS_OUTPUT_PT                (1ull << PEBS_OUTPUT_OFFSET)
/* Both bits that are set together for PEBS-via-PT. */
#define PEBS_VIA_PT_MASK        (PEBS_OUTPUT_PT | PEBS_PMI_AFTER_EACH_RECORD)

/*
 * Sample flags PEBS can handle without a PMI.
 *
 * TID can only be handled by flushing at context switch.
 * REGS_USER can be handled for events limited to ring 3.
 *
 * The value is an OR of PERF_SAMPLE_* bits.
 */
#define LARGE_PEBS_FLAGS \
        (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
        PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
        PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
        PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
        PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
        PERF_SAMPLE_PERIOD)

/*
 * Register bitmask with one bit set per listed PERF_REG_X86_* index:
 * AX, BX, CX, DX, DI, SI, SP, BP, IP, FLAGS and R8 through R15.
 */
#define PEBS_GP_REGS                        \
        ((1ULL << PERF_REG_X86_AX)    | \
         (1ULL << PERF_REG_X86_BX)    | \
         (1ULL << PERF_REG_X86_CX)    | \
         (1ULL << PERF_REG_X86_DX)    | \
         (1ULL << PERF_REG_X86_DI)    | \
         (1ULL << PERF_REG_X86_SI)    | \
         (1ULL << PERF_REG_X86_SP)    | \
         (1ULL << PERF_REG_X86_BP)    | \
         (1ULL << PERF_REG_X86_IP)    | \
         (1ULL << PERF_REG_X86_FLAGS) | \
         (1ULL << PERF_REG_X86_R8)    | \
         (1ULL << PERF_REG_X86_R9)    | \
         (1ULL << PERF_REG_X86_R10)   | \
         (1ULL << PERF_REG_X86_R11)   | \
         (1ULL << PERF_REG_X86_R12)   | \
         (1ULL << PERF_REG_X86_R13)   | \
         (1ULL << PERF_REG_X86_R14)   | \
         (1ULL << PERF_REG_X86_R15))

/*
 * Per register state.
 *
 * One instance exists per extra_reg_type slot inside
 * struct intel_shared_regs (regs[EXTRA_REG_MAX]).
 */
struct er_account {
        raw_spinlock_t      lock;        /* per-core: protect structure */
        u64                 config;        /* extra MSR config */
        u64                 reg;        /* extra MSR number */
        atomic_t            ref;        /* reference count */
};

/*
 * Per core/cpu state
 *
 * Used to coordinate shared registers between HT threads or
 * among events on a single PMU.
 *
 * regs[] is indexed by enum extra_reg_type (EXTRA_REG_RSP_0 ...).
 */
struct intel_shared_regs {
        struct er_account       regs[EXTRA_REG_MAX];
        int                     refcnt;                /* per-core: #HT threads */
        unsigned                core_id;        /* per-core: core id */
};

/*
 * State of one counter for exclusive-access tracking; stored per counter
 * index in intel_excl_states.state[].
 */
enum intel_excl_state_type {
        INTEL_EXCL_UNUSED    = 0, /* counter is unused */
        INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
        INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
};

/*
 * Exclusive-access state for every counter index (X86_PMC_IDX_MAX
 * entries) plus a flag recording whether scheduling has begun.
 */
struct intel_excl_states {
        enum intel_excl_state_type state[X86_PMC_IDX_MAX];
        bool sched_started; /* true if scheduling has started */
};

/*
 * Exclusive counter bookkeeping, reference counted per core.
 */
struct intel_excl_cntrs {
        raw_spinlock_t        lock;

        /*
         * NOTE(review): presumably one entry per HT thread, selected by
         * cpu_hw_events.excl_thread_id (0 or 1) - confirm against users.
         */
        struct intel_excl_states states[2];

        /*
         * exclusive_present overlays both has_exclusive[] entries, so one
         * 32-bit access covers the pair.
         */
        union {
                u16        has_exclusive[2];
                u32        exclusive_present;
        };

        int                refcnt;                /* per-core: #HT threads */
        unsigned        core_id;        /* per-core: core id */
};

/* Forward declaration; the full definition appears later in this file. */
struct x86_perf_task_context;
/* Capacity of the fixed-size LBR arrays (cpu_hw_events.lbr_entries[], x86_perf_task_context.lbr[]). */
#define MAX_LBR_ENTRIES                32

/*
 * LBR record format identifiers.
 * NOTE(review): presumably these are the values reported in
 * perf_capabilities.lbr_format - confirm against the LBR code.
 */
enum {
        LBR_FORMAT_32                = 0x00,
        LBR_FORMAT_LIP                = 0x01,
        LBR_FORMAT_EIP                = 0x02,
        LBR_FORMAT_EIP_FLAGS        = 0x03,
        LBR_FORMAT_EIP_FLAGS2        = 0x04,
        LBR_FORMAT_INFO                = 0x05,
        LBR_FORMAT_TIME                = 0x06,
        /* alias of the highest format value listed above */
        LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_TIME,
};

/*
 * Indices into cpu_hw_events.kfree_on_online[]; X86_PERF_KFREE_MAX is
 * the length of that array.
 */
enum {
        X86_PERF_KFREE_SHARED = 0,
        X86_PERF_KFREE_EXCL   = 1,
        X86_PERF_KFREE_MAX
};

/*
 * Hardware event bookkeeping for the x86 PMU code: counter assignment,
 * the list of collected events, and the Intel- and AMD-specific state
 * grouped in the sections below.
 * NOTE(review): the name suggests one instance per CPU - confirm at the
 * definition site, which is outside this header.
 */
struct cpu_hw_events {
        /*
         * Generic x86 PMC bits
         */
        struct perf_event        *events[X86_PMC_IDX_MAX]; /* in counter order */
        unsigned long                active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long                running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int                        enabled;

        int                        n_events; /* the # of events in the below arrays */
        int                        n_added;  /* the # last events in the below arrays;
                                             they've never been enabled yet */
        int                        n_txn;    /* the # last events in the below arrays;
                                             added in the current transaction */
        /*
         * NOTE(review): presumably the share of n_pair / n_metric added in
         * the current transaction - confirm against the transaction code.
         */
        int                        n_txn_pair;
        int                        n_txn_metric;
        int                        assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
        u64                        tags[X86_PMC_IDX_MAX];

        struct perf_event        *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
        struct event_constraint        *event_constraint[X86_PMC_IDX_MAX];

        int                        n_excl; /* the number of exclusive events */

        unsigned int                txn_flags;
        int                        is_fake;

        /*
         * Intel DebugStore bits
         */
        struct debug_store        *ds;
        void                        *ds_pebs_vaddr;
        void                        *ds_bts_vaddr;
        u64                        pebs_enabled;
        int                        n_pebs;
        int                        n_large_pebs;
        int                        n_pebs_via_pt;
        int                        pebs_output;

        /* Current super set of events hardware configuration */
        u64                        pebs_data_cfg;
        u64                        active_pebs_data_cfg;
        int                        pebs_record_size;

        /*
         * Intel LBR bits
         */
        int                                lbr_users;
        int                                lbr_pebs_users;
        struct perf_branch_stack        lbr_stack;
        struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
        /* lbr_sel and lbr_ctl are two names for the same pointer */
        union {
                struct er_account                *lbr_sel;
                struct er_account                *lbr_ctl;
        };
        u64                                br_sel;
        void                                *last_task_ctx;
        int                                last_log_id;
        int                                lbr_select;
        void                                *lbr_xsave;

        /*
         * Intel host/guest exclude bits
         */
        u64                                intel_ctrl_guest_mask;
        u64                                intel_ctrl_host_mask;
        struct perf_guest_switch_msr        guest_switch_msrs[X86_PMC_IDX_MAX];

        /*
         * Intel checkpoint mask
         */
        u64                                intel_cp_status;

        /*
         * manage shared (per-core, per-cpu) registers
         * used on Intel NHM/WSM/SNB
         */
        struct intel_shared_regs        *shared_regs;
        /*
         * manage exclusive counter access between hyperthread
         */
        struct event_constraint *constraint_list; /* in enable order */
        struct intel_excl_cntrs                *excl_cntrs;
        int excl_thread_id; /* 0 or 1 */

        /*
         * SKL TSX_FORCE_ABORT shadow
         */
        u64                                tfa_shadow;

        /*
         * Perf Metrics
         */
        /* number of accepted metrics events */
        int                                n_metric;

        /*
         * AMD specific bits
         */
        struct amd_nb                        *amd_nb;
        /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
        u64                                perf_ctr_virt_mask;
        int                                n_pair; /* Large increment events */

        /* one slot per X86_PERF_KFREE_* index */
        void                                *kfree_on_online[X86_PERF_KFREE_MAX];

        struct pmu                        *pmu;
};

/*
 * Build a struct event_constraint initializer matching the inclusive
 * range of event codes [c, e].
 *
 * @c: first event code of the range (.code)
 * @e: last event code of the range; .size is stored as (e) - (c)
 * @n: counter bitmask, stored through .idxmsk64
 * @m: mask applied to the event config before comparing (.cmask)
 * @w: weight (.weight)
 * @o: overlap flag (.overlap)
 * @f: PERF_X86_EVENT_* flags (.flags)
 *
 * The leading unnamed brace pair initializes the anonymous union that is
 * the first member of struct event_constraint. Every parameter is
 * parenthesized in the expansion.
 */
#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) {        \
        { .idxmsk64 = (n) },                \
        .code = (c),                        \
        .size = (e) - (c),                \
        .cmask = (m),                        \
        .weight = (w),                        \
        .overlap = (o),                        \
        .flags = (f),                        \
}

/* Single-code constraint: a range whose start and end are both @c (size 0). */
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \
        __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f)

/* Single-code constraint with weight HWEIGHT(n), no overlap and no flags. */
#define EVENT_CONSTRAINT(c, n, m)        \
        __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)

/*
 * The constraint_match() function only works for 'simple' event codes
 * and not for extended (AMD64_EVENTSEL_EVENT) event codes.
 */
#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \
        __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0)

/* Constraint on the event code that also carries PERF_X86_EVENT_EXCL. */
#define INTEL_EXCLEVT_CONSTRAINT(c, n)        \
        __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
                           0, PERF_X86_EVENT_EXCL)

/*
 * The overlap flag marks event constraints with overlapping counter
 * masks. This is the case if the counter mask of such an event is not
 * a subset of any other counter mask of a constraint with an equal or
 * higher weight, e.g.:
 *
 *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
 *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
 *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
 *
 * The event scheduler may not select the correct counter in the first
 * cycle because it needs to know which subsequent events will be
 * scheduled. It may fail to schedule the events then. So we set the
 * overlap flag for such constraints to give the scheduler a hint which
 * events to select for counter rescheduling.
 *
 * Care must be taken as the rescheduling algorithm is O(n!) which
 * will increase scheduling cycles for an over-committed system
 * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
 * and their counter masks must be kept at a minimum.
 *
 * Expands like EVENT_CONSTRAINT() except that .overlap is 1.
 */
#define EVENT_CONSTRAINT_OVERLAP(c, n, m)        \
        __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)

/*
 * Constraint on the Event code.
 *
 * Only the ARCH_PERFMON_EVENTSEL_EVENT bits of the config are compared.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)        \
        EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)

/*
 * Constraint on a range of Event codes
 *
 * Matches every event code from @c to @e inclusive (see constraint_match()).
 */
#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n)                        \
        EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT)

/*
 * Constraint on the Event code + UMask + fixed-mask
 *
 * FIXED_EVENT_FLAGS is the filter mask used to validate fixed counter
 * events. The following filters disqualify an event for fixed counters:
 *  - inv
 *  - edge
 *  - cnt-mask
 *  - in_tx
 *  - in_tx_checkpointed
 *  The other filters are supported by fixed counters.
 *  The any-thread option is supported starting with v3.
 */
#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
/*
 * @c: event code to match
 * @n: fixed counter number; the counter is represented by bit (32 + n)
 *     of the counter bitmask.
 *
 * @n is parenthesized so that an expression argument (e.g. "i & 1")
 * cannot bind to the addition with the wrong precedence.
 */
#define FIXED_EVENT_CONSTRAINT(c, n)        \
        EVENT_CONSTRAINT(c, (1ULL << (32 + (n))), FIXED_EVENT_FLAGS)

/*
 * The special metric counters do not actually exist. They are calculated from
 * the combination of the FxCtr3 + MSR_PERF_METRICS.
 *
 * The special metric counters are mapped to a dummy offset for the scheduler.
 * The sharing between multiple users of the same metric without multiplexing
 * is not allowed, even though the hardware supports that in principle.
 *
 * @c: event code to match (compared under INTEL_ARCH_EVENT_MASK)
 * @n: metric number; the counter bitmask has only bit
 *     (INTEL_PMC_IDX_METRIC_BASE + n) set.
 *
 * @n is parenthesized so that an expression argument cannot bind to the
 * addition with the wrong precedence.
 */

#define METRIC_EVENT_CONSTRAINT(c, n)                                        \
        EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + (n))),        \
                         INTEL_ARCH_EVENT_MASK)

/*
 * Constraint on the Event code + UMask
 */
#define INTEL_UEVENT_CONSTRAINT(c, n)        \
        EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

/* Constraint on specific umask bit only + event */
/* (the compare mask is the event-select bits OR-ed with the bits of @c itself) */
#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)        \
        EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))

/* Like UEVENT_CONSTRAINT, but match flags too */
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)        \
        EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)

/* Like UEVENT_CONSTRAINT, and set PERF_X86_EVENT_EXCL in the constraint flags */
#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
        __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
                           HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)

/* Match event code/umask and flags, and set PERF_X86_EVENT_PEBS_LDLAT */
#define INTEL_PLD_CONSTRAINT(c, n)        \
        __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                           HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)

/* Match event code/umask and flags, and set PERF_X86_EVENT_PEBS_ST */
#define INTEL_PST_CONSTRAINT(c, n)        \
        __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)

/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
        EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)

/* Range form of INTEL_FLAGS_EVENT_CONSTRAINT: event codes @c through @e */
#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n)                        \
        EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)

/* Check only flags, but allow all event/umask */
#define INTEL_ALL_EVENT_CONSTRAINT(code, n)        \
        EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)

/*
 * DATALA constraint family. Naming used below, as visible from the
 * expansions: "UEVENT" variants compare event code + umask
 * (INTEL_ARCH_EVENT_MASK), the others compare the event code only;
 * an "X" prefix on the suffix (XLD/XST) additionally sets
 * PERF_X86_EVENT_EXCL.
 */
/* Check flags and event code, and set the HSW store flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
        __EVENT_CONSTRAINT(code, n,                         \
                          ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)

/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
        __EVENT_CONSTRAINT(code, n,                        \
                          ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)

/* Same as DATALA_LD, for the range of event codes @code through @end */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \
        __EVENT_CONSTRAINT_RANGE(code, end, n,                                \
                          ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)

/* Same as DATALA_LD, and also set PERF_X86_EVENT_EXCL */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
        __EVENT_CONSTRAINT(code, n,                        \
                          ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, \
                          PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)

/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
        __EVENT_CONSTRAINT(code, n,                         \
                          INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)

/* Same as UEVENT DATALA_ST, and also set PERF_X86_EVENT_EXCL */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
        __EVENT_CONSTRAINT(code, n,                        \
                          INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, \
                          PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)

/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
        __EVENT_CONSTRAINT(code, n,                         \
                          INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)

/* Same as UEVENT DATALA_LD, and also set PERF_X86_EVENT_EXCL */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
        __EVENT_CONSTRAINT(code, n,                        \
                          INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, \
                          PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)

/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
        __EVENT_CONSTRAINT(code, n,                         \
                          INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)


/*
 * We define the end marker as having a weight of -1
 * to enable blacklisting of events using a counter bitmask
 * of zero and thus a weight of zero.
 * The end marker has a weight that cannot possibly be
 * obtained from counting the bits in the bitmask.
 */
#define EVENT_CONSTRAINT_END { .weight = -1 }

/*
 * Check for end marker with weight == -1
 *
 * @e: iteration cursor (struct event_constraint *), assigned by the macro
 * @c: first element of a constraint table ending in EVENT_CONSTRAINT_END
 *
 * The loop only stops at the end marker, so an unterminated table is
 * walked past its end.
 */
#define for_each_event_constraint(e, c)        \
        for ((e) = (c); (e)->weight != -1; (e)++)

/*
 * Extra registers for specific events.
 *
 * Some events need large masks and require external MSRs.
 * Those extra MSRs end up being shared for all events on
 * a PMU and sometimes between PMU of sibling HT threads.
 * In either case, the kernel needs to handle conflicting
 * accesses to those extra, shared, regs. The data structure
 * to manage those registers is stored in cpu_hw_events
 * (see its shared_regs member).
 */
struct extra_reg {
        unsigned int                event;
        unsigned int                msr;
        u64                        config_mask;
        u64                        valid_mask;
        int                        idx;  /* per_xxx->regs[] reg index */
        bool                        extra_msr_access;        /* EVENT_EXTRA_REG() initializes this to true */
};

/*
 * Build a struct extra_reg initializer.
 *
 * @e:  event, @ms: MSR number, @m: config mask, @vm: valid mask
 * @i:  suffix of an enum extra_reg_type name; it is pasted after
 *      EXTRA_REG_, so pass e.g. RSP_0 or LDLAT, not the full name.
 */
#define EVENT_EXTRA_REG(e, ms, m, vm, i) {        \
        .event = (e),                        \
        .msr = (ms),                        \
        .config_mask = (m),                \
        .valid_mask = (vm),                \
        .idx = EXTRA_REG_##i,                \
        .extra_msr_access = true,        \
        }

/* Extra register keyed on the event code only. */
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)        \
        EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)

/* Extra register keyed on the event code + umask. */
#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
        EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
                        ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)

/* Event code + umask @c using MSR_PEBS_LD_LAT_THRESHOLD, valid mask 0xffff, slot LDLAT. */
#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
        INTEL_UEVENT_EXTRA_REG(c, \
                               MSR_PEBS_LD_LAT_THRESHOLD, \
                               0xffff, \
                               LDLAT)

/*
 * All-zero entry (event, msr and both masks are 0).
 * NOTE(review): presumably the terminator of extra_reg tables - confirm
 * against the code that walks x86_pmu.extra_regs.
 */
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)

/*
 * Bitfield view of a 64-bit capabilities word; .capabilities accesses
 * the same storage as a raw u64.
 * NOTE(review): presumably mirrors the IA32_PERF_CAPABILITIES MSR -
 * confirm against the code that fills x86_pmu.intel_cap.
 */
union perf_capabilities {
        struct {
                u64        lbr_format:6;
                u64        pebs_trap:1;
                u64        pebs_arch_reg:1;
                u64        pebs_format:4;
                u64        smm_freeze:1;
                /*
                 * PMU supports separate counter range for writing
                 * values > 32bit.
                 */
                u64        full_width_write:1;
                u64     pebs_baseline:1;
                u64        perf_metrics:1;
                u64        pebs_output_pt_available:1;
                u64        anythread_deprecated:1;
        };
        u64        capabilities;
};

/*
 * Node of the singly linked quirk list headed by x86_pmu.quirks;
 * x86_add_quirk() pushes nodes onto the front of that list.
 */
struct x86_pmu_quirk {
        struct x86_pmu_quirk *next;        /* next node, or the previous list head */
        void (*func)(void);                /* quirk callback */
};

/*
 * Bitfield view of a 64-bit event config; .value accesses the same
 * storage as a raw u64. The named fields cover 42 bits in total.
 */
union x86_pmu_config {
        struct {
                u64 event:8,
                    umask:8,
                    usr:1,
                    os:1,
                    edge:1,
                    pc:1,
                    interrupt:1,
                    __reserved1:1,
                    en:1,
                    inv:1,
                    cmask:8,
                    event2:4,
                    __reserved2:4,
                    go:1,
                    ho:1;
        } bits;
        u64 value;
};

/*
 * Build a raw config value from bitfield initializers, e.g.
 * X86_CONFIG(.event = 0x3c, .umask = 0x01). The arguments become the
 * initializer list of .bits in a compound literal whose .value is read.
 */
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value

/*
 * Indices into x86_pmu.lbr_exclusive[]; x86_lbr_exclusive_max is the
 * length of that array.
 */
enum {
        x86_lbr_exclusive_lbr,
        x86_lbr_exclusive_bts,
        x86_lbr_exclusive_pt,
        x86_lbr_exclusive_max,
};

/*
 * struct x86_pmu - generic x86 pmu
 *
 * Description of one PMU implementation: callbacks, counter geometry,
 * constraint tables and the vendor-specific capability fields grouped
 * in the sections below.
 */
struct x86_pmu {
        /*
         * Generic x86 PMC bits
         */
        const char        *name;
        int                version;
        int                (*handle_irq)(struct pt_regs *);
        void                (*disable_all)(void);
        void                (*enable_all)(int added);
        void                (*enable)(struct perf_event *);
        void                (*disable)(struct perf_event *);
        void                (*add)(struct perf_event *);
        void                (*del)(struct perf_event *);
        void                (*read)(struct perf_event *event);
        int                (*hw_config)(struct perf_event *event);
        int                (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
        unsigned        eventsel;
        unsigned        perfctr;
        int                (*addr_offset)(int index, bool eventsel);
        int                (*rdpmc_index)(int index);
        u64                (*event_map)(int);
        int                max_events;
        int                num_counters;
        int                num_counters_fixed;
        int                cntval_bits;
        u64                cntval_mask;
        /* events_maskl overlays the first word of events_mask[] */
        union {
                        unsigned long events_maskl;
                        unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
        };
        int                events_mask_len;
        int                apic;
        u64                max_period;
        struct event_constraint *
                        (*get_event_constraints)(struct cpu_hw_events *cpuc,
                                                 int idx,
                                                 struct perf_event *event);

        void                (*put_event_constraints)(struct cpu_hw_events *cpuc,
                                                 struct perf_event *event);

        void                (*start_scheduling)(struct cpu_hw_events *cpuc);

        void                (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);

        void                (*stop_scheduling)(struct cpu_hw_events *cpuc);

        /*
         * NOTE(review): presumably a table terminated by
         * EVENT_CONSTRAINT_END (see for_each_event_constraint) - confirm.
         */
        struct event_constraint *event_constraints;
        /* head of the list that x86_add_quirk() pushes onto */
        struct x86_pmu_quirk *quirks;
        int                perfctr_second_write;
        u64                (*limit_period)(struct perf_event *event, u64 l);

        /* PMI handler bits */
        unsigned int        late_ack                :1,
                        enabled_ack                :1,
                        counter_freezing        :1;
        /*
         * sysfs attrs
         */
        int                attr_rdpmc_broken;
        int                attr_rdpmc;
        struct attribute **format_attrs;

        ssize_t                (*events_sysfs_show)(char *page, u64 config);
        const struct attribute_group **attr_update;

        unsigned long        attr_freeze_on_smi;

        /*
         * CPU Hotplug hooks
         */
        int                (*cpu_prepare)(int cpu);
        void                (*cpu_starting)(int cpu);
        void                (*cpu_dying)(int cpu);
        void                (*cpu_dead)(int cpu);

        void                (*check_microcode)(void);
        void                (*sched_task)(struct perf_event_context *ctx,
                                      bool sched_in);

        /*
         * Intel Arch Perfmon v2+
         */
        u64                        intel_ctrl;
        union perf_capabilities intel_cap;

        /*
         * Intel DebugStore bits
         */
        unsigned int        bts                        :1,
                        bts_active                :1,
                        pebs                        :1,
                        pebs_active                :1,
                        pebs_broken                :1,
                        pebs_prec_dist                :1,
                        pebs_no_tlb                :1,
                        pebs_no_isolation        :1;
        int                pebs_record_size;
        int                pebs_buffer_size;
        int                max_pebs_events;
        void                (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
        struct event_constraint *pebs_constraints;
        void                (*pebs_aliases)(struct perf_event *event);
        unsigned long        large_pebs_flags;
        u64                rtm_abort_event;

        /*
         * Intel LBR
         */
        unsigned int        lbr_tos, lbr_from, lbr_to,
                        lbr_info, lbr_nr;           /* LBR base regs and size */
        /* in each union below, the two members name the same storage */
        union {
                u64        lbr_sel_mask;                   /* LBR_SELECT valid bits */
                u64        lbr_ctl_mask;                   /* LBR_CTL valid bits */
        };
        union {
                const int        *lbr_sel_map;           /* lbr_select mappings */
                int                *lbr_ctl_map;           /* LBR_CTL mappings */
        };
        bool                lbr_double_abort;           /* duplicated lbr aborts */
        bool                lbr_pt_coexist;                   /* (LBR|BTS) may coexist with PT */

        /*
         * Intel Architectural LBR CPUID Enumeration
         */
        unsigned int        lbr_depth_mask:8;
        unsigned int        lbr_deep_c_reset:1;
        unsigned int        lbr_lip:1;
        unsigned int        lbr_cpl:1;
        unsigned int        lbr_filter:1;
        unsigned int        lbr_call_stack:1;
        unsigned int        lbr_mispred:1;
        unsigned int        lbr_timed_lbr:1;
        unsigned int        lbr_br_type:1;

        void                (*lbr_reset)(void);
        void                (*lbr_read)(struct cpu_hw_events *cpuc);
        void                (*lbr_save)(void *ctx);
        void                (*lbr_restore)(void *ctx);

        /*
         * Intel PT/LBR/BTS are exclusive
         */
        /* one counter per x86_lbr_exclusive_* index */
        atomic_t        lbr_exclusive[x86_lbr_exclusive_max];

        /*
         * Intel perf metrics
         */
        u64                (*update_topdown_event)(struct perf_event *event);
        int                (*set_topdown_event_period)(struct perf_event *event);

        /*
         * perf task context (i.e. struct perf_event_context::task_ctx_data)
         * switch helper to bridge calls from perf/core to perf/x86.
         * See struct pmu::swap_task_ctx() usage for examples;
         */
        void                (*swap_task_ctx)(struct perf_event_context *prev,
                                         struct perf_event_context *next);

        /*
         * AMD bits
         */
        unsigned int        amd_nb_constraints : 1;
        u64                perf_ctr_pair_en;

        /*
         * Extra registers for events
         */
        struct extra_reg *extra_regs;
        unsigned int flags;        /* PMU_FL_* bits */

        /*
         * Intel host/guest support (KVM)
         */
        struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period) (struct perf_event *event, u64 period);

        int (*aux_output_match) (struct perf_event *event);
};

/*
 * Fields common to all task-context layouts in this file: it is embedded
 * as "opt" in x86_perf_task_context, x86_perf_task_context_arch_lbr and
 * x86_perf_task_context_arch_lbr_xsave.
 */
struct x86_perf_task_context_opt {
        int lbr_callstack_users;
        int lbr_stack_state;
        int log_id;
};

/*
 * Task context with a fixed-size LBR array of MAX_LBR_ENTRIES entries.
 * NOTE(review): valid_lbrs presumably counts the populated lbr[] entries
 * and tos is the saved top-of-stack index - confirm in the LBR code.
 */
struct x86_perf_task_context {
        u64 lbr_sel;
        int tos;
        int valid_lbrs;
        struct x86_perf_task_context_opt opt;
        struct lbr_entry lbr[MAX_LBR_ENTRIES];
};

/*
 * Task context whose LBR entries are a flexible array member: the
 * number of entries is fixed by the allocation size, not by this type.
 */
struct x86_perf_task_context_arch_lbr {
        struct x86_perf_task_context_opt opt;
        struct lbr_entry entries[];
};

/*
 * Add padding to guarantee the 64-byte alignment of the state buffer.
 *
 * The structure is dynamically allocated. The size of the LBR state may vary
 * based on the number of LBR registers.
 *
 * Do not put anything after the LBR state.
 */
struct x86_perf_task_context_arch_lbr_xsave {
        struct x86_perf_task_context_opt                opt;

        /*
         * Two views of the same buffer: the generic xsave area, or its
         * parts spelled out with the LBR state placed after the header.
         * The anonymous struct is packed and aligned to XSAVE_ALIGNMENT.
         */
        union {
                struct xregs_state                        xsave;
                struct {
                        struct fxregs_state                i387;
                        struct xstate_header                header;
                        struct arch_lbr_state                lbr;
                } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT)));
        };
};

/*
 * Push a quirk with callback func_ onto the head of the x86_pmu.quirks
 * list.  The list node is a function-local static marked __initdata; the
 * new node first takes the current head as its ->next and then becomes the
 * head itself.  The do { } while (0) wrapper lets the macro be used as a
 * single statement.
 */
#define x86_add_quirk(func_)                                                \
do {                                                                        \
        static struct x86_pmu_quirk __quirk __initdata = {                \
                .func = func_,                                                \
        };                                                                \
        __quirk.next = x86_pmu.quirks;                                        \
        x86_pmu.quirks = &__quirk;                                        \
} while (0)

/*
 * x86_pmu flags
 *
 * Single-bit values that are tested against x86_pmu.flags, e.g.
 * PMU_FL_EXCL_ENABLED in is_ht_workaround_enabled().
 */
#define PMU_FL_NO_HT_SHARING        0x1 /* no hyper-threading resource sharing */
#define PMU_FL_HAS_RSP_1        0x2 /* has 2 equivalent offcore_rsp regs   */
#define PMU_FL_EXCL_CNTRS        0x4 /* has exclusive counter requirements  */
#define PMU_FL_EXCL_ENABLED        0x8 /* exclusive counter active */
#define PMU_FL_PEBS_ALL                0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA                0x20 /* deal with TSX force abort */
#define PMU_FL_PAIR                0x40 /* merge counters for large incr. events */

/*
 * EVENT_VAR(_id) is the name of the event attribute object for _id.
 * EVENT_PTR(_id) is the address of that object's nested .attr.attr member.
 *
 * The EVENT_PTR() expansion is parenthesized so that it remains a single
 * operand when the macro is followed by a postfix operator such as "->".
 */
#define EVENT_VAR(_id)  event_attr_##_id
#define EVENT_PTR(_id) (&event_attr_##_id.attr.attr)

/*
 * Define a static struct perf_pmu_events_attr named EVENT_VAR(_id).
 * Its attribute is called _name, has mode 0444 and is shown through
 * events_sysfs_show() (no store callback); .id is PERF_COUNT_HW_<_id> and
 * .event_str is NULL.  The expansion already ends with ';'.
 */
#define EVENT_ATTR(_name, _id)                                                \
static struct perf_pmu_events_attr EVENT_VAR(_id) = {                        \
        .attr                = __ATTR(_name, 0444, events_sysfs_show, NULL),        \
        .id                = PERF_COUNT_HW_##_id,                                \
        .event_str        = NULL,                                                \
};

/*
 * Define a static struct perf_pmu_events_attr named event_attr_<v> whose
 * content is the string str rather than an event id: .id is 0 and
 * .event_str is str.  The attribute is called _name, has mode 0444 and is
 * shown through events_sysfs_show().  The expansion already ends with ';'.
 */
#define EVENT_ATTR_STR(_name, v, str)                                        \
static struct perf_pmu_events_attr event_attr_##v = {                        \
        .attr                = __ATTR(_name, 0444, events_sysfs_show, NULL),        \
        .id                = 0,                                                \
        .event_str        = str,                                                \
};

/*
 * Define a static struct perf_pmu_events_ht_attr named event_attr_<v>
 * carrying two strings, noht and ht, in .event_str_noht and .event_str_ht.
 * The attribute is called _name, has mode 0444 and is shown through
 * events_ht_sysfs_show().
 *
 * Unlike EVENT_ATTR() and EVENT_ATTR_STR(), this expansion does not end
 * with ';', so the user of the macro has to supply it.
 */
#define EVENT_ATTR_STR_HT(_name, v, noht, ht)                                \
static struct perf_pmu_events_ht_attr event_attr_##v = {                \
        .attr                = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\
        .id                = 0,                                                \
        .event_str_noht        = noht,                                                \
        .event_str_ht        = ht,                                                \
}

struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;

/*
 * Return the address of the "opt" member inside the task context @ctx.
 *
 * @ctx is untyped; it is treated as a struct x86_perf_task_context_arch_lbr
 * when the CPU has X86_FEATURE_ARCH_LBR and as a struct
 * x86_perf_task_context otherwise.
 */
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
{
        struct x86_perf_task_context_opt *opt;

        if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
                struct x86_perf_task_context_arch_lbr *arch_ctx = ctx;

                opt = &arch_ctx->opt;
        } else {
                struct x86_perf_task_context *legacy_ctx = ctx;

                opt = &legacy_ctx->opt;
        }

        return opt;
}

static inline bool x86_pmu_has_lbr_callstack(void)
{
        return  x86_pmu.lbr_sel_map &&
                x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
}

DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

extern u64 __read_mostly hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
extern u64 __read_mostly hw_cache_extra_regs
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];

u64 x86_perf_event_update(struct perf_event *event);

/*
 * Address derived from x86_pmu.eventsel for counter @index.
 *
 * The offset added to x86_pmu.eventsel is @index itself, unless x86_pmu
 * provides an addr_offset() callback, in which case the callback is asked
 * for the offset with its second argument set to true.
 */
static inline unsigned int x86_pmu_config_addr(int index)
{
        if (!x86_pmu.addr_offset)
                return x86_pmu.eventsel + index;

        return x86_pmu.eventsel + x86_pmu.addr_offset(index, true);
}

/*
 * Address derived from x86_pmu.perfctr for counter @index.
 *
 * The offset added to x86_pmu.perfctr is @index itself, unless x86_pmu
 * provides an addr_offset() callback, in which case the callback is asked
 * for the offset with its second argument set to false.
 */
static inline unsigned int x86_pmu_event_addr(int index)
{
        if (!x86_pmu.addr_offset)
                return x86_pmu.perfctr + index;

        return x86_pmu.perfctr + x86_pmu.addr_offset(index, false);
}

/*
 * Translate counter @index through the optional x86_pmu.rdpmc_index()
 * callback; without a callback @index is returned unchanged.
 */
static inline int x86_pmu_rdpmc_index(int index)
{
        if (!x86_pmu.rdpmc_index)
                return index;

        return x86_pmu.rdpmc_index(index);
}

int x86_add_exclusive(unsigned int what);

void x86_del_exclusive(unsigned int what);

int x86_reserve_hardware(void);

void x86_release_hardware(void);

int x86_pmu_max_precise(void);

void hw_perf_lbr_event_destroy(struct perf_event *event);

int x86_setup_perfctr(struct perf_event *event);

int x86_pmu_hw_config(struct perf_event *event);

void x86_pmu_disable_all(void);

/* True when @hwc has PERF_X86_EVENT_PAIR set in its flags. */
static inline bool is_counter_pair(struct hw_perf_event *hwc)
{
        return (hwc->flags & PERF_X86_EVENT_PAIR) != 0;
}

/*
 * Write the MSRs that enable @hwc.
 *
 * In order: the extra register (only if hwc->extra_reg.reg is non-zero),
 * the config address of counter idx + 1 for a paired event, and finally
 * hwc->config_base, which receives hwc->config with @enable_mask OR-ed in
 * and the bits of this CPU's perf_ctr_virt_mask cleared.
 */
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
                                          u64 enable_mask)
{
        u64 virt_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
        u64 config;

        if (hwc->extra_reg.reg)
                wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);

        /*
         * A large increment event on this counter also needs the Merge
         * event enabled on the next counter.
         */
        if (is_counter_pair(hwc))
                wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en);

        config = (hwc->config | enable_mask) & ~virt_mask;
        wrmsrl(hwc->config_base, config);
}

void x86_pmu_enable_all(int added);

int perf_assign_events(struct event_constraint **constraints, int n,
                        int wmin, int wmax, int gpmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);

void x86_pmu_stop(struct perf_event *event, int flags);

/*
 * Write @event's config back to its config_base MSR with the bits of this
 * CPU's perf_ctr_virt_mask cleared (no enable mask is OR-ed in), then, for
 * a paired event, write 0 to the config address of counter idx + 1.
 */
static inline void x86_pmu_disable_event(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 virt_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);

        wrmsrl(hwc->config_base, hwc->config & ~virt_mask);

        if (!is_counter_pair(hwc))
                return;

        wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);
}

void x86_pmu_enable_event(struct perf_event *event);

int x86_pmu_handle_irq(struct pt_regs *regs);

extern struct event_constraint emptyconstraint;

extern struct event_constraint unconstrained;

/*
 * Classify @ip as a kernel address.
 *
 * With CONFIG_X86_32 the address must lie above PAGE_OFFSET; otherwise it
 * must be negative when viewed as a signed long, i.e. have its top bit set.
 */
static inline bool kernel_ip(unsigned long ip)
{
#ifndef CONFIG_X86_32
        return (long)ip < 0;
#else
        return ip > PAGE_OFFSET;
#endif
}

/*
 * Not all PMUs provide the right context information to place the reported IP
 * into full context. Specifically segment registers are typically not
 * supplied.
 *
 * Assuming the address is a linear address (it is for IBS), we fake the CS and
 * vm86 mode using the known zero-based code segment and 'fix up' the registers
 * to reflect this.
 *
 * Intel PEBS/LBR appear to typically provide the effective address, nothing
 * much we can do about that but pray and treat it like a linear address.
 */
/*
 * Store @ip in @regs and fix up the surrounding state to match:
 * regs->cs becomes __KERNEL_CS or __USER_CS depending on kernel_ip(@ip),
 * and, if X86_VM_MASK is set in regs->flags, both PERF_EFLAGS_VM and
 * X86_VM_MASK are toggled.
 */
static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
{
        if (kernel_ip(ip))
                regs->cs = __KERNEL_CS;
        else
                regs->cs = __USER_CS;

        if (regs->flags & X86_VM_MASK)
                regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);

        regs->ip = ip;
}

ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
                          char *page);
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
                          char *page);

#ifdef CONFIG_CPU_SUP_AMD

int amd_pmu_init(void);

#else /* CONFIG_CPU_SUP_AMD */

/* Stand-in used when CONFIG_CPU_SUP_AMD is not set: always returns 0. */
static inline int amd_pmu_init(void)
{
        return 0;
}

#endif /* CONFIG_CPU_SUP_AMD */

/* Returns 1 if @event has PERF_X86_EVENT_PEBS_VIA_PT in hw.flags, else 0. */
static inline int is_pebs_pt(struct perf_event *event)
{
        return (event->hw.flags & PERF_X86_EVENT_PEBS_VIA_PT) != 0;
}

#ifdef CONFIG_CPU_SUP_INTEL

/*
 * Decide whether @event, sampled with @period, is a BTS candidate.
 *
 * Returns true only if the event is not in frequency mode, @period is
 * exactly 1, exclude_host is clear, and the event bits of the config
 * (masked with INTEL_ARCH_EVENT_MASK) equal what x86_pmu.event_map()
 * returns for PERF_COUNT_HW_BRANCH_INSTRUCTIONS.
 */
static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
{
        unsigned int branch_event, cfg_event;

        /* Only use BTS for fixed rate period==1 events. */
        if (period != 1 || event->attr.freq)
                return false;

        /* BTS doesn't virtualize. */
        if (event->attr.exclude_host)
                return false;

        cfg_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
        branch_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);

        return cfg_event == branch_event;
}

/*
 * Same check as intel_pmu_has_bts_period(), using the sample period that
 * is currently stored in @event's hw state.
 */
static inline bool intel_pmu_has_bts(struct perf_event *event)
{
        return intel_pmu_has_bts_period(event, event->hw.sample_period);
}

int intel_pmu_save_and_restart(struct perf_event *event);

struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                          struct perf_event *event);

extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu);
extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);

int intel_pmu_init(void);

void init_debug_store_on_cpu(int cpu);

void fini_debug_store_on_cpu(int cpu);

void release_ds_buffers(void);

void reserve_ds_buffers(void);

void release_lbr_buffers(void);

void reserve_lbr_buffers(void);

extern struct event_constraint bts_constraint;
extern struct event_constraint vlbr_constraint;

void intel_pmu_enable_bts(u64 config);

void intel_pmu_disable_bts(void);

int intel_pmu_drain_bts_buffer(void);

extern struct event_constraint intel_core2_pebs_event_constraints[];

extern struct event_constraint intel_atom_pebs_event_constraints[];

extern struct event_constraint intel_slm_pebs_event_constraints[];

extern struct event_constraint intel_glm_pebs_event_constraints[];

extern struct event_constraint intel_glp_pebs_event_constraints[];

extern struct event_constraint intel_nehalem_pebs_event_constraints[];

extern struct event_constraint intel_westmere_pebs_event_constraints[];

extern struct event_constraint intel_snb_pebs_event_constraints[];

extern struct event_constraint intel_ivb_pebs_event_constraints[];

extern struct event_constraint intel_hsw_pebs_event_constraints[];

extern struct event_constraint intel_bdw_pebs_event_constraints[];

extern struct event_constraint intel_skl_pebs_event_constraints[];

extern struct event_constraint intel_icl_pebs_event_constraints[];

struct event_constraint *intel_pebs_constraints(struct perf_event *event);

void intel_pmu_pebs_add(struct perf_event *event);

void intel_pmu_pebs_del(struct perf_event *event);

void intel_pmu_pebs_enable(struct perf_event *event);

void intel_pmu_pebs_disable(struct perf_event *event);

void intel_pmu_pebs_enable_all(void);

void intel_pmu_pebs_disable_all(void);

void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);

void intel_pmu_auto_reload_read(struct perf_event *event);

void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);

void intel_ds_init(void);

void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
                                 struct perf_event_context *next);

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);

u64 lbr_from_signext_quirk_wr(u64 val);

void intel_pmu_lbr_reset(void);

void intel_pmu_lbr_reset_32(void);

void intel_pmu_lbr_reset_64(void);

void intel_pmu_lbr_add(struct perf_event *event);

void intel_pmu_lbr_del(struct perf_event *event);

void intel_pmu_lbr_enable_all(bool pmi);

void intel_pmu_lbr_disable_all(void);

void intel_pmu_lbr_read(void);

void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);

void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);

void intel_pmu_lbr_save(void *ctx);

void intel_pmu_lbr_restore(void *ctx);

void intel_pmu_lbr_init_core(void);

void intel_pmu_lbr_init_nhm(void);

void intel_pmu_lbr_init_atom(void);

void intel_pmu_lbr_init_slm(void);

void intel_pmu_lbr_init_snb(void);

void intel_pmu_lbr_init_hsw(void);

void intel_pmu_lbr_init_skl(void);

void intel_pmu_lbr_init_knl(void);

void intel_pmu_arch_lbr_init(void);

void intel_pmu_pebs_data_source_nhm(void);

void intel_pmu_pebs_data_source_skl(bool pmem);

int intel_pmu_setup_lbr_filter(struct perf_event *event);

void intel_pt_interrupt(void);

int intel_bts_interrupt(void);

void intel_bts_enable_local(void);

void intel_bts_disable_local(void);

int p4_pmu_init(void);

int p6_pmu_init(void);

int knc_pmu_init(void);

/* Returns 1 if PMU_FL_EXCL_ENABLED is set in x86_pmu.flags, otherwise 0. */
static inline int is_ht_workaround_enabled(void)
{
        return (x86_pmu.flags & PMU_FL_EXCL_ENABLED) != 0;
}

#else /* CONFIG_CPU_SUP_INTEL */

/*
 * Stand-ins used when CONFIG_CPU_SUP_INTEL is not set.  The void ones do
 * nothing; the int ones return 0.
 */

/* DS buffer reserve/release: nothing to do. */
static inline void reserve_ds_buffers(void)
{
}

static inline void release_ds_buffers(void)
{
}

/* LBR buffer release/reserve: nothing to do. */
static inline void release_lbr_buffers(void)
{
}

static inline void reserve_lbr_buffers(void)
{
}

/* Always returns 0. */
static inline int intel_pmu_init(void)
{
        return 0;
}

/* Arguments are ignored; always returns 0. */
static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
{
        return 0;
}

/* Argument is ignored; nothing to do. */
static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
{
}

/* Always returns 0. */
static inline int is_ht_workaround_enabled(void)
{
        return 0;
}
#endif /* CONFIG_CPU_SUP_INTEL */

#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
int zhaoxin_pmu_init(void);
#else
/*
 * Stand-in used when neither CONFIG_CPU_SUP_CENTAUR nor
 * CONFIG_CPU_SUP_ZHAOXIN is defined: always returns 0.
 */
static inline int zhaoxin_pmu_init(void)
{
        return 0;
}
#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/























































































































































    1 

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM libata

#if !defined(_TRACE_LIBATA_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_LIBATA_H

#include <linux/ata.h>
#include <linux/libata.h>
#include <linux/tracepoint.h>
#include <linux/trace_seq.h>

/*
 * ata_opcode_name(opcode) builds one { value, "name" } pair by
 * stringifying its argument.
 *
 * show_opcode_name(val) passes val and the table of ATA_CMD_* pairs below
 * to __print_symbolic().  The ata_qc_issue event prints the result with %s
 * as its "cmd=" field.
 */
#define ata_opcode_name(opcode)        { opcode, #opcode }
#define show_opcode_name(val)                                        \
        __print_symbolic(val,                                        \
                 ata_opcode_name(ATA_CMD_DEV_RESET),                \
                 ata_opcode_name(ATA_CMD_CHK_POWER),                \
                 ata_opcode_name(ATA_CMD_STANDBY),                \
                 ata_opcode_name(ATA_CMD_IDLE),                        \
                 ata_opcode_name(ATA_CMD_EDD),                        \
                 ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO),        \
                 ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO_DMA),        \
                 ata_opcode_name(ATA_CMD_NOP),                        \
                 ata_opcode_name(ATA_CMD_FLUSH),                \
                 ata_opcode_name(ATA_CMD_FLUSH_EXT),                \
                 ata_opcode_name(ATA_CMD_ID_ATA),                \
                 ata_opcode_name(ATA_CMD_ID_ATAPI),                \
                 ata_opcode_name(ATA_CMD_SERVICE),                \
                 ata_opcode_name(ATA_CMD_READ),                        \
                 ata_opcode_name(ATA_CMD_READ_EXT),                \
                 ata_opcode_name(ATA_CMD_READ_QUEUED),                \
                 ata_opcode_name(ATA_CMD_READ_STREAM_EXT),        \
                 ata_opcode_name(ATA_CMD_READ_STREAM_DMA_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE),                \
                 ata_opcode_name(ATA_CMD_WRITE_EXT),                \
                 ata_opcode_name(ATA_CMD_WRITE_QUEUED),                \
                 ata_opcode_name(ATA_CMD_WRITE_STREAM_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE_STREAM_DMA_EXT), \
                 ata_opcode_name(ATA_CMD_WRITE_FUA_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE_QUEUED_FUA_EXT), \
                 ata_opcode_name(ATA_CMD_FPDMA_READ),                \
                 ata_opcode_name(ATA_CMD_FPDMA_WRITE),                \
                 ata_opcode_name(ATA_CMD_NCQ_NON_DATA),                \
                 ata_opcode_name(ATA_CMD_FPDMA_SEND),                \
                 ata_opcode_name(ATA_CMD_FPDMA_RECV),                \
                 ata_opcode_name(ATA_CMD_PIO_READ),                \
                 ata_opcode_name(ATA_CMD_PIO_READ_EXT),                \
                 ata_opcode_name(ATA_CMD_PIO_WRITE),                \
                 ata_opcode_name(ATA_CMD_PIO_WRITE_EXT),        \
                 ata_opcode_name(ATA_CMD_READ_MULTI),                \
                 ata_opcode_name(ATA_CMD_READ_MULTI_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE_MULTI),                \
                 ata_opcode_name(ATA_CMD_WRITE_MULTI_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE_MULTI_FUA_EXT),        \
                 ata_opcode_name(ATA_CMD_SET_FEATURES),                \
                 ata_opcode_name(ATA_CMD_SET_MULTI),                \
                 ata_opcode_name(ATA_CMD_PACKET),                \
                 ata_opcode_name(ATA_CMD_VERIFY),                \
                 ata_opcode_name(ATA_CMD_VERIFY_EXT),                \
                 ata_opcode_name(ATA_CMD_WRITE_UNCORR_EXT),        \
                 ata_opcode_name(ATA_CMD_STANDBYNOW1),                \
                 ata_opcode_name(ATA_CMD_IDLEIMMEDIATE),        \
                 ata_opcode_name(ATA_CMD_SLEEP),                \
                 ata_opcode_name(ATA_CMD_INIT_DEV_PARAMS),        \
                 ata_opcode_name(ATA_CMD_READ_NATIVE_MAX),        \
                 ata_opcode_name(ATA_CMD_READ_NATIVE_MAX_EXT),        \
                 ata_opcode_name(ATA_CMD_SET_MAX),                \
                 ata_opcode_name(ATA_CMD_SET_MAX_EXT),                \
                 ata_opcode_name(ATA_CMD_READ_LOG_EXT),                \
                 ata_opcode_name(ATA_CMD_WRITE_LOG_EXT),        \
                 ata_opcode_name(ATA_CMD_READ_LOG_DMA_EXT),        \
                 ata_opcode_name(ATA_CMD_WRITE_LOG_DMA_EXT),        \
                 ata_opcode_name(ATA_CMD_TRUSTED_NONDATA),        \
                 ata_opcode_name(ATA_CMD_TRUSTED_RCV),                \
                 ata_opcode_name(ATA_CMD_TRUSTED_RCV_DMA),        \
                 ata_opcode_name(ATA_CMD_TRUSTED_SND),                \
                 ata_opcode_name(ATA_CMD_TRUSTED_SND_DMA),        \
                 ata_opcode_name(ATA_CMD_PMP_READ),                \
                 ata_opcode_name(ATA_CMD_PMP_READ_DMA),                \
                 ata_opcode_name(ATA_CMD_PMP_WRITE),                \
                 ata_opcode_name(ATA_CMD_PMP_WRITE_DMA),        \
                 ata_opcode_name(ATA_CMD_CONF_OVERLAY),                \
                 ata_opcode_name(ATA_CMD_SEC_SET_PASS),                \
                 ata_opcode_name(ATA_CMD_SEC_UNLOCK),                \
                 ata_opcode_name(ATA_CMD_SEC_ERASE_PREP),        \
                 ata_opcode_name(ATA_CMD_SEC_ERASE_UNIT),        \
                 ata_opcode_name(ATA_CMD_SEC_FREEZE_LOCK),        \
                 ata_opcode_name(ATA_CMD_SEC_DISABLE_PASS),        \
                 ata_opcode_name(ATA_CMD_CONFIG_STREAM),        \
                 ata_opcode_name(ATA_CMD_SMART),                \
                 ata_opcode_name(ATA_CMD_MEDIA_LOCK),                \
                 ata_opcode_name(ATA_CMD_MEDIA_UNLOCK),                \
                 ata_opcode_name(ATA_CMD_DSM),                        \
                 ata_opcode_name(ATA_CMD_CHK_MED_CRD_TYP),        \
                 ata_opcode_name(ATA_CMD_CFA_REQ_EXT_ERR),        \
                 ata_opcode_name(ATA_CMD_CFA_WRITE_NE),                \
                 ata_opcode_name(ATA_CMD_CFA_TRANS_SECT),        \
                 ata_opcode_name(ATA_CMD_CFA_ERASE),                \
                 ata_opcode_name(ATA_CMD_CFA_WRITE_MULT_NE),        \
                 ata_opcode_name(ATA_CMD_REQ_SENSE_DATA),        \
                 ata_opcode_name(ATA_CMD_SANITIZE_DEVICE),        \
                 ata_opcode_name(ATA_CMD_ZAC_MGMT_IN),                \
                 ata_opcode_name(ATA_CMD_ZAC_MGMT_OUT),                \
                 ata_opcode_name(ATA_CMD_RESTORE),                \
                 ata_opcode_name(ATA_CMD_READ_LONG),                \
                 ata_opcode_name(ATA_CMD_READ_LONG_ONCE),        \
                 ata_opcode_name(ATA_CMD_WRITE_LONG),                \
                 ata_opcode_name(ATA_CMD_WRITE_LONG_ONCE))

/*
 * ata_error_name(result) builds one { value, "name" } pair by stringifying
 * its argument; show_error_name(val) passes val and the pairs listed below
 * to __print_symbolic().
 */
#define ata_error_name(result)        { result, #result }
#define show_error_name(val)                                \
        __print_symbolic(val,                                \
                ata_error_name(ATA_ICRC),                \
                ata_error_name(ATA_UNC),                \
                ata_error_name(ATA_MC),                        \
                ata_error_name(ATA_IDNF),                \
                ata_error_name(ATA_MCR),                \
                ata_error_name(ATA_ABORTED),                \
                ata_error_name(ATA_TRK0NF),                \
                ata_error_name(ATA_AMNF))

/*
 * ata_protocol_name(proto) builds one { value, "name" } pair by
 * stringifying its argument; show_protocol_name(val) passes val and the
 * ATA_PROT_* / ATAPI_PROT_* pairs below to __print_symbolic().  The
 * ata_qc_issue event prints the result with %s as its "proto=" field.
 */
#define ata_protocol_name(proto)        { proto, #proto }
#define show_protocol_name(val)                                \
        __print_symbolic(val,                                \
                ata_protocol_name(ATA_PROT_UNKNOWN),        \
                ata_protocol_name(ATA_PROT_NODATA),        \
                ata_protocol_name(ATA_PROT_PIO),        \
                ata_protocol_name(ATA_PROT_DMA),        \
                ata_protocol_name(ATA_PROT_NCQ),        \
                ata_protocol_name(ATA_PROT_NCQ_NODATA),        \
                ata_protocol_name(ATAPI_PROT_NODATA),        \
                ata_protocol_name(ATAPI_PROT_PIO),        \
                ata_protocol_name(ATAPI_PROT_DMA))

/*
 * String decoders used by the TP_printk() format arguments below.
 *
 * Each libata_trace_parse_*() function is only declared here; it takes a
 * struct trace_seq * plus the value(s) to decode and returns a string.
 * Each __parse_*() wrapper forwards to the matching function and passes
 * "p" as the trace_seq argument.  "p" is not defined in this header, so it
 * has to be in scope wherever a wrapper is expanded.
 */
const char *libata_trace_parse_status(struct trace_seq*, unsigned char);
#define __parse_status(s) libata_trace_parse_status(p, s)

const char *libata_trace_parse_eh_action(struct trace_seq *, unsigned int);
#define __parse_eh_action(a) libata_trace_parse_eh_action(p, a)

const char *libata_trace_parse_eh_err_mask(struct trace_seq *, unsigned int);
#define __parse_eh_err_mask(m) libata_trace_parse_eh_err_mask(p, m)

const char *libata_trace_parse_qc_flags(struct trace_seq *, unsigned int);
#define __parse_qc_flags(f) libata_trace_parse_qc_flags(p, f)

/* Takes three values: the wrapper's c, f and h arguments, in that order. */
const char *libata_trace_parse_subcmd(struct trace_seq *, unsigned char,
                                      unsigned char, unsigned char);
#define __parse_subcmd(c,f,h) libata_trace_parse_subcmd(p, c, f, h)

/*
 * ata_qc_issue - tracepoint taking a struct ata_queued_cmd
 *
 * Records the port's print_id, a device number (link pmp + devno), the tag
 * and the taskfile registers found in qc->tf.  The output line decodes the
 * protocol and the opcode symbolically, appends the string returned by
 * __parse_subcmd() for (cmd, feature, hob_nsect), and dumps the taskfile
 * bytes in hex.
 *
 * "ctl" and "flags" are part of the recorded entry but not of the output
 * line.  They are filled from qc->tf.ctl and qc->flags so that every
 * declared field of the entry holds a defined value.
 */
TRACE_EVENT(ata_qc_issue,

        TP_PROTO(struct ata_queued_cmd *qc),

        TP_ARGS(qc),

        TP_STRUCT__entry(
                __field( unsigned int,        ata_port )
                __field( unsigned int,        ata_dev        )
                __field( unsigned int,        tag        )
                __field( unsigned char,        cmd        )
                __field( unsigned char,        dev        )
                __field( unsigned char,        lbal        )
                __field( unsigned char,        lbam        )
                __field( unsigned char,        lbah        )
                __field( unsigned char,        nsect        )
                __field( unsigned char,        feature        )
                __field( unsigned char,        hob_lbal )
                __field( unsigned char,        hob_lbam )
                __field( unsigned char,        hob_lbah )
                __field( unsigned char,        hob_nsect )
                __field( unsigned char,        hob_feature )
                __field( unsigned char,        ctl )
                __field( unsigned char,        proto )
                __field( unsigned long,        flags )
        ),

        TP_fast_assign(
                __entry->ata_port        = qc->ap->print_id;
                __entry->ata_dev        = qc->dev->link->pmp + qc->dev->devno;
                __entry->tag                = qc->tag;
                __entry->proto                = qc->tf.protocol;
                __entry->cmd                = qc->tf.command;
                __entry->dev                = qc->tf.device;
                __entry->lbal                = qc->tf.lbal;
                __entry->lbam                = qc->tf.lbam;
                __entry->lbah                = qc->tf.lbah;
                __entry->hob_lbal        = qc->tf.hob_lbal;
                __entry->hob_lbam        = qc->tf.hob_lbam;
                __entry->hob_lbah        = qc->tf.hob_lbah;
                __entry->feature        = qc->tf.feature;
                __entry->hob_feature        = qc->tf.hob_feature;
                __entry->nsect                = qc->tf.nsect;
                __entry->hob_nsect        = qc->tf.hob_nsect;
                /* Declared in the entry; not printed, but must be written. */
                __entry->ctl                = qc->tf.ctl;
                __entry->flags                = qc->flags;
        ),

        TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s%s " \
                  " tf=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)",
                  __entry->ata_port, __entry->ata_dev, __entry->tag,
                  show_protocol_name(__entry->proto),
                  show_opcode_name(__entry->cmd),
                  __parse_subcmd(__entry->cmd, __entry->feature, __entry->hob_nsect),
                  __entry->cmd, __entry->feature, __entry->nsect,
                  __entry->lbal, __entry->lbam, __entry->lbah,
                  __entry->hob_feature, __entry->hob_nsect,
                  __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah,
                  __entry->dev)
);

/*
 * ata_qc_complete_template - event class taking a struct ata_queued_cmd
 *
 * Records the port's print_id, a device number (link pmp + devno), the
 * tag, qc->flags and the registers found in qc->result_tf.  In the result
 * taskfile the "command" slot is stored as "status" and the "feature" slot
 * as "error".  The output line decodes flags and status with
 * __parse_qc_flags() / __parse_status() and dumps the result bytes in hex.
 *
 * "ctl" is part of the recorded entry but not of the output line.  It is
 * filled from qc->result_tf.ctl so that every declared field of the entry
 * holds a defined value.
 */
DECLARE_EVENT_CLASS(ata_qc_complete_template,

        TP_PROTO(struct ata_queued_cmd *qc),

        TP_ARGS(qc),

        TP_STRUCT__entry(
                __field( unsigned int,        ata_port )
                __field( unsigned int,        ata_dev        )
                __field( unsigned int,        tag        )
                __field( unsigned char,        status        )
                __field( unsigned char,        dev        )
                __field( unsigned char,        lbal        )
                __field( unsigned char,        lbam        )
                __field( unsigned char,        lbah        )
                __field( unsigned char,        nsect        )
                __field( unsigned char,        error        )
                __field( unsigned char,        hob_lbal )
                __field( unsigned char,        hob_lbam )
                __field( unsigned char,        hob_lbah )
                __field( unsigned char,        hob_nsect )
                __field( unsigned char,        hob_feature )
                __field( unsigned char,        ctl )
                __field( unsigned long,        flags )
        ),

        TP_fast_assign(
                __entry->ata_port        = qc->ap->print_id;
                __entry->ata_dev        = qc->dev->link->pmp + qc->dev->devno;
                __entry->tag                = qc->tag;
                __entry->status                = qc->result_tf.command;
                __entry->dev                = qc->result_tf.device;
                __entry->lbal                = qc->result_tf.lbal;
                __entry->lbam                = qc->result_tf.lbam;
                __entry->lbah                = qc->result_tf.lbah;
                __entry->hob_lbal        = qc->result_tf.hob_lbal;
                __entry->hob_lbam        = qc->result_tf.hob_lbam;
                __entry->hob_lbah        = qc->result_tf.hob_lbah;
                __entry->error                = qc->result_tf.feature;
                __entry->hob_feature        = qc->result_tf.hob_feature;
                __entry->nsect                = qc->result_tf.nsect;
                __entry->hob_nsect        = qc->result_tf.hob_nsect;
                /* Declared in the entry; not printed, but must be written. */
                __entry->ctl                = qc->result_tf.ctl;
                __entry->flags                = qc->flags;
        ),

        TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s status=%s " \
                  " res=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)",
                  __entry->ata_port, __entry->ata_dev, __entry->tag,
                  __parse_qc_flags(__entry->flags),
                  __parse_status(__entry->status),
                  __entry->status, __entry->error, __entry->nsect,
                  __entry->lbal, __entry->lbam, __entry->lbah,
                  __entry->hob_feature, __entry->hob_nsect,
                  __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah,
                  __entry->dev)
);

/*
 * Three events instantiated from ata_qc_complete_template.  Each takes a
 * single struct ata_queued_cmd * argument; they differ only in name.
 */
DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_internal,
             TP_PROTO(struct ata_queued_cmd *qc),
             TP_ARGS(qc));

DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_failed,
             TP_PROTO(struct ata_queued_cmd *qc),
             TP_ARGS(qc));

DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_done,
             TP_PROTO(struct ata_queued_cmd *qc),
             TP_ARGS(qc));

/*
 * ata_eh_link_autopsy - tracepoint taking a device plus two EH words
 *
 * Records the print_id of the port reached through dev->link->ap, a device
 * number (link pmp + devno), and the eh_action / eh_err_mask arguments as
 * passed in.  The output line prints both words as strings decoded by
 * __parse_eh_action() and __parse_eh_err_mask().
 */
TRACE_EVENT(ata_eh_link_autopsy,

        TP_PROTO(struct ata_device *dev, unsigned int eh_action, unsigned int eh_err_mask),

        TP_ARGS(dev, eh_action, eh_err_mask),

        TP_STRUCT__entry(
                __field( unsigned int,        ata_port )
                __field( unsigned int,        ata_dev        )
                __field( unsigned int,        eh_action )
                __field( unsigned int,        eh_err_mask)
        ),

        TP_fast_assign(
                __entry->ata_port        = dev->link->ap->print_id;
                __entry->ata_dev        = dev->link->pmp + dev->devno;
                __entry->eh_action        = eh_action;
                __entry->eh_err_mask        = eh_err_mask;
        ),

        TP_printk("ata_port=%u ata_dev=%u eh_action=%s err_mask=%s",
                  __entry->ata_port, __entry->ata_dev,
                  __parse_eh_action(__entry->eh_action),
                  __parse_eh_err_mask(__entry->eh_err_mask))
);

/*
 * ata_eh_link_autopsy_qc - tracepoint taking a struct ata_queued_cmd
 *
 * Records the port's print_id, a device number (link pmp + devno), the
 * tag, qc->flags (stored as an unsigned int) and qc->err_mask.  The output
 * line prints the flags and the error mask as strings decoded by
 * __parse_qc_flags() and __parse_eh_err_mask().
 */
TRACE_EVENT(ata_eh_link_autopsy_qc,

        TP_PROTO(struct ata_queued_cmd *qc),

        TP_ARGS(qc),

        TP_STRUCT__entry(
                __field( unsigned int,        ata_port )
                __field( unsigned int,        ata_dev        )
                __field( unsigned int,        tag        )
                __field( unsigned int,        qc_flags )
                __field( unsigned int,        eh_err_mask)
        ),

        TP_fast_assign(
                __entry->ata_port        = qc->ap->print_id;
                __entry->ata_dev        = qc->dev->link->pmp + qc->dev->devno;
                __entry->tag                = qc->tag;
                __entry->qc_flags        = qc->flags;
                __entry->eh_err_mask        = qc->err_mask;
        ),

        TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s err_mask=%s",
                  __entry->ata_port, __entry->ata_dev, __entry->tag,
                  __parse_qc_flags(__entry->qc_flags),
                  __parse_eh_err_mask(__entry->eh_err_mask))
);

#endif /*  _TRACE_LIBATA_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














































































































































    2 






















































































































































































    3 








    3 






































































































































    2 



























    3 



    1 




    2 









































    1 
    1 

    1 






































































































































































































































































































































































































































































































































































    1 








    1 
















































    1 







    1 






    1 







    1 

    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record delta in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK                - records directory entry unlink
 * - EXT4_FC_TAG_LINK                - records directory entry link
 * - EXT4_FC_TAG_CREAT                - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE        - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE        - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE                - record the inode that should be replayed
 *                                  during recovery. Note that iblocks field is
 *                                  not replayed and instead derived during
 *                                  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
 *   back to full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
 *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
 *   followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at much higher
 *    routines. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

/*
 * Buffer I/O completion handler: copy the I/O outcome (@uptodate) into the
 * buffer's uptodate bit, log it, and unlock the buffer.
 */
static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate) {
                /* I/O failed: the buffer contents cannot be trusted. */
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

/* Zero the fast commit logical block range (start and length) of @inode. */
static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        info->i_fc_lblk_len = 0;
        info->i_fc_lblk_start = 0;
}

/*
 * Initialise the fast commit fields of @inode: empty list linkage, wait
 * queue, zero update count, cleared block range and cleared
 * EXT4_STATE_FC_COMMITTING state bit.
 */
void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        INIT_LIST_HEAD(&info->i_fc_list);
        init_waitqueue_head(&info->i_fc_wait);
        atomic_set(&info->i_fc_updates, 0);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
}

/*
 * Sleep on the bit waitqueue associated with the EXT4_STATE_FC_COMMITTING
 * bit of @inode.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT re-acquired before returning, so callers have to
 * take it again and re-check the inode state themselves.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

        /*
         * The word used for the wait key depends on the word size:
         * i_state_flags when BITS_PER_LONG < 64, i_flags otherwise.
         */
#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /*
         * Queue ourselves before releasing the lock - presumably so that a
         * wake-up issued right after the unlock is not missed.
         */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit code about the start of an inode update
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        if (atomic_dec_and_test(&ei->i_fc_updates))
                wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                return;
        }

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
        list_del_init(&ei->i_fc_list);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that next commit
 * operation would result in a full jbd2 commit.
 *
 * @sb:     super block of the file system to mark
 * @reason: index into s_fc_stats.fc_ineligible_reason_count[]; expected to be
 *          in the range [0, EXT4_FC_REASON_MAX)
 *
 * Does nothing when fast commits are not enabled or a fast commit replay is
 * in progress. An out-of-range @reason triggers a warning; the file system is
 * still marked ineligible, but no statistics counter is touched.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

        /*
         * A bad reason is a caller bug: warn about it, but never use it to
         * index the per-reason statistics array.
         */
        if (WARN_ON(reason < 0 || reason >= EXT4_FC_REASON_MAX))
                return;
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 *
 * @sb:     super block of the file system
 * @reason: index into s_fc_stats.fc_ineligible_reason_count[]; expected to be
 *          in the range [0, EXT4_FC_REASON_MAX)
 *
 * Does nothing when fast commits are not enabled or a fast commit replay is
 * in progress. An out-of-range @reason triggers a warning and skips only the
 * statistics update: s_fc_ineligible_updates is still incremented so that the
 * matching ext4_fc_stop_ineligible() keeps the counter balanced.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        /* Never index the statistics array with an invalid reason. */
        if (!WARN_ON(reason < 0 || reason >= EXT4_FC_REASON_MAX))
                sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
        atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/*
 * Return 1 if @sb has the EXT4_MF_FC_INELIGIBLE mount flag set or has a
 * non-zero s_fc_ineligible_updates count, 0 otherwise.
 */
static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
                return 1;

        return atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates) != 0;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct list_head *queue;
        tid_t tid;
        bool update;
        int ret;

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (sbi->s_mount_state & EXT4_FC_REPLAY))
                return -EOPNOTSUPP;

        if (ext4_fc_is_ineligible(sb))
                return -EINVAL;

        tid = handle->h_transaction->t_tid;

        mutex_lock(&ei->i_fc_lock);
        /* Same transaction as the last tracked one means "update". */
        update = (tid == ei->i_sync_tid);
        if (!update) {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                /* While a commit is running, new inodes go to staging. */
                if (ext4_test_mount_flag(sb, EXT4_MF_FC_COMMITTING))
                        queue = &sbi->s_fc_q[FC_Q_STAGING];
                else
                        queue = &sbi->s_fc_q[FC_Q_MAIN];
                list_add_tail(&ei->i_fc_list, queue);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Arguments passed to __track_dentry_update() through its void *arg. */
struct __track_dentry_update_args {
        struct dentry *dentry;  /* directory entry the operation applies to */
        int op;                 /* EXT4_FC_TAG_* value, copied into fcd_op */
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct __track_dentry_update_args *upd = arg;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct dentry *dentry = upd->dentry;
        struct inode *dir = dentry->d_parent->d_inode;
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *node;
        struct list_head *queue;
        int ret = 0;

        /*
         * i_fc_lock is dropped while the update node is built and queued;
         * every exit path goes through "relock" to take it again.
         */
        mutex_unlock(&ei->i_fc_lock);

        if (IS_ENCRYPTED(dir)) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME);
                ret = -EOPNOTSUPP;
                goto relock;
        }

        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM);
                ret = -ENOMEM;
                goto relock;
        }

        node->fcd_op = upd->op;
        node->fcd_parent = dir->i_ino;
        node->fcd_ino = inode->i_ino;

        if (dentry->d_name.len <= DNAME_INLINE_LEN) {
                /* Short name: keep it in the node's inline buffer. */
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        } else {
                /* Long name: needs a separately allocated copy. */
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM);
                        ret = -ENOMEM;
                        goto relock;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        }
        node->fcd_name.len = dentry->d_name.len;

        spin_lock(&sbi->s_fc_lock);
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_COMMITTING))
                queue = &sbi->s_fc_dentry_q[FC_Q_STAGING];
        else
                queue = &sbi->s_fc_dentry_q[FC_Q_MAIN];
        list_add_tail(&node->fcd_list, queue);
        spin_unlock(&sbi->s_fc_lock);

relock:
        mutex_lock(&ei->i_fc_lock);
        return ret;
}

/*
 * Track an EXT4_FC_TAG_UNLINK directory entry update of @dentry for @inode
 * (without enqueueing the inode) and emit the matching trace event with the
 * tracking result.
 */
void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_UNLINK,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_unlink(inode, dentry, ret);
}

/* Track an unlink of @dentry, using the inode that @dentry refers to. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        __ext4_fc_track_unlink(handle, inode, dentry);
}

/*
 * Track an EXT4_FC_TAG_LINK directory entry update of @dentry for @inode
 * (without enqueueing the inode) and emit the matching trace event with the
 * tracking result.
 */
void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_LINK,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_link(inode, dentry, ret);
}

/* Track a link of @dentry, using the inode that @dentry refers to. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        __ext4_fc_track_link(handle, inode, dentry);
}

/*
 * Track an EXT4_FC_TAG_CREAT directory entry update of @dentry for @inode
 * (without enqueueing the inode) and emit the matching trace event with the
 * tracking result.
 */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_CREAT,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_create(inode, dentry, ret);
}

/* Track a create of @dentry, using the inode that @dentry refers to. */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        __ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (!update) {
                /* First call: start with an empty tracked range length. */
                EXT4_I(inode)->i_fc_lblk_len = 0;
                return 0;
        }

        /* Called again with update set: nothing to initialise. */
        return -EEXIST;
}

/*
 * Track @inode for fast commit and enqueue it. Directories are ignored, and
 * an inode that journals its data marks the file system fast commit
 * ineligible instead of being tracked.
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (!ext4_should_journal_data(inode)) {
                ret = ext4_fc_track_template(handle, inode, __track_inode,
                                             NULL, 1);
                trace_ext4_fc_track_inode(inode, ret);
                return;
        }

        ext4_fc_mark_ineligible(inode->i_sb,
                                EXT4_FC_REASON_INODE_JOURNAL_DATA);
}

/* Arguments passed to __track_range() through its void *arg. */
struct __track_range_args {
        ext4_lblk_t start;      /* first logical block of the range */
        ext4_lblk_t end;        /* last logical block of the range (inclusive) */
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct __track_range_args *range = arg;
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t first = range->start;
        ext4_lblk_t last = range->end;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        if (update && ei->i_fc_lblk_len > 0) {
                /* Grow the already tracked range so it covers both. */
                ext4_lblk_t old_last =
                        ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;

                first = min(ei->i_fc_lblk_start, first);
                last = max(old_last, last);
        }

        ei->i_fc_lblk_start = first;
        ei->i_fc_lblk_len = last - first + 1;

        return 0;
}

/*
 * Track the logical block range [@start, @end] of @inode for fast commit and
 * enqueue the inode. Directories are ignored.
 */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args = {
                .start = start,
                .end = end,
        };
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
        trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
        int write_flags = REQ_SYNC;
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

        /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
        if (test_opt(sb, BARRIER))
                write_flags |= REQ_FUA | REQ_PREFLUSH;
        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE, write_flags, bh);
        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Zero @len bytes at @dst. If @crc is non-NULL, the zeroed bytes are folded
 * into the running checksum *@crc. Returns @dst.
 */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
                                u32 *crc)
{
        memset(dst, 0, len);
        if (!crc)
                return dst;

        *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
        return dst;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 * new block is from jbd2 and CRC is updated to reflect the padding
 * we added.
 *
 * Returns a pointer to the reserved bytes inside the current fast commit
 * buffer, or NULL if the request is too large for a block or
 * jbd2_fc_get_buf() fails. On success sbi->s_fc_bytes is advanced to account
 * for the reservation (and, on the padding path, moved to the next block).
 * @crc may be NULL, in which case no checksum is updated.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl *tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        /* off is the write position inside the current block. */
        int ret, off = sbi->s_fc_bytes % bsize;
        int pad_len;

        /*
         * After allocating len, we should have space at least for a 0 byte
         * padding.
         */
        if (len + sizeof(struct ext4_fc_tl) > bsize)
                return NULL;

        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
                /*
                 * Only allocate from current buffer if we have enough space for
                 * this request AND we have space to add a zero byte padding.
                 */
                if (!sbi->s_fc_bh) {
                        /* No buffer attached yet: get one from jbd2. */
                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                        if (ret)
                                return NULL;
                        sbi->s_fc_bh = bh;
                }
                sbi->s_fc_bytes += len;
                return sbi->s_fc_bh->b_data + off;
        }
        /*
         * Need to add PAD tag
         *
         * NOTE(review): unlike the branch above, this path dereferences
         * sbi->s_fc_bh without a NULL check - presumably a buffer is always
         * attached when we get here; confirm.
         */
        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        /* The pad value covers everything up to, but excluding, the last byte. */
        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
        tl->fc_len = cpu_to_le16(pad_len);
        if (crc)
                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
        if (pad_len > 0)
                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        /* Don't leak uninitialized memory in the unused last byte. */
        *((u8 *)(tl + 1) + pad_len) = 0;

        /* The padded block is complete: write it out and detach it. */
        ext4_fc_submit_bh(sb);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        /* Move to the start of the next block, then account for len. */
        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
        return sbi->s_fc_bh->b_data;
}

/*
 * Copy @len bytes from @src into fast commit space at @dst. If @crc is
 * non-NULL, the source bytes are folded into the running checksum *@crc
 * before the copy. Returns @dst (the return value of memcpy()).
 */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
                                int len, u32 *crc)
{
        if (crc) {
                u32 seed = *crc;

                *crc = ext4_chksum(EXT4_SB(sb), seed, src, len);
        }

        return memcpy(dst, src, len);
}

/*
 * Finish a fast commit by emitting the tail tag.
 *
 * The tail tag marks the end of a fast commit. To keep commits atomic, the
 * rest of the block after the tail must not be reused by the next commit,
 * so the tail's length field is made to span the remaining space in the
 * block and s_fc_bytes is rounded up to the next block boundary.
 *
 * @crc is the running checksum of the commit so far. The tag header and the
 * tid are folded into it; the resulting value is stored in the tail.
 *
 * Returns 0 on success or -ENOSPC if space could not be reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int bsize = sbi->s_journal->j_blocksize;
        struct ext4_fc_tail tail;
        struct ext4_fc_tl tl;
        u8 *cursor;
        int off;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's no enough space on this block for accommodating this tail.
         */
        cursor = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
        if (!cursor)
                return -ENOSPC;

        /* Offset inside the block just past the reserved tail record. */
        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        /* Tag header and tid are covered by the checksum... */
        ext4_fc_memcpy(sb, cursor, &tl, sizeof(tl), &crc);
        cursor += sizeof(tl);

        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, cursor, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        cursor += sizeof(tail.fc_tid);

        /* ...the stored checksum itself is not. */
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, cursor, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
        cursor += sizeof(tail.fc_crc);

        memset(cursor, 0, bsize - off); /* Don't leak uninitialized memory. */

        ext4_fc_submit_bh(sb);

        return 0;
}

/*
 * Append one tag-length-value record to the fast commit area and fold it
 * into the running checksum *@crc.
 *
 * Returns true if the record was written, false if space for it could not
 * be reserved.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl hdr;
        u8 *slot = ext4_fc_reserve_space(sb, sizeof(hdr) + len, crc);

        if (slot) {
                hdr.fc_tag = cpu_to_le16(tag);
                hdr.fc_len = cpu_to_le16(len);

                /* Header first, value immediately after it. */
                ext4_fc_memcpy(sb, slot, &hdr, sizeof(hdr), crc);
                ext4_fc_memcpy(sb, slot + sizeof(hdr), val, len, crc);
        }

        return slot != NULL;
}

/*
 * Append a dentry record to the fast commit area: a tag header, followed by
 * a struct ext4_fc_dentry_info (parent inode number and inode number),
 * followed by @dlen bytes of name from @dname. Everything written is folded
 * into *@crc.
 *
 * Returns true if the record was written, false if space could not be
 * reserved.
 */
static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
                                        int parent_ino, int ino, int dlen,
                                        const unsigned char *dname,
                                        u32 *crc)
{
        struct ext4_fc_dentry_info info;
        struct ext4_fc_tl hdr;
        u8 *out;

        out = ext4_fc_reserve_space(sb, sizeof(hdr) + sizeof(info) + dlen,
                                    crc);
        if (!out)
                return false;

        hdr.fc_tag = cpu_to_le16(tag);
        hdr.fc_len = cpu_to_le16(sizeof(info) + dlen);
        info.fc_parent_ino = cpu_to_le32(parent_ino);
        info.fc_ino = cpu_to_le32(ino);

        ext4_fc_memcpy(sb, out, &hdr, sizeof(hdr), crc);
        out += sizeof(hdr);
        ext4_fc_memcpy(sb, out, &info, sizeof(info), crc);
        out += sizeof(info);
        ext4_fc_memcpy(sb, out, dname, dlen, crc);

        return true;
}

/*
 * Write an EXT4_FC_TAG_INODE record for @inode into the fast commit area.
 *
 * The record value is the 32-bit inode number followed by the leading
 * inode_len bytes of the raw on-disk inode, where inode_len is
 * EXT4_GOOD_OLD_INODE_SIZE plus i_extra_isize when the filesystem uses
 * inodes larger than EXT4_GOOD_OLD_INODE_SIZE. Everything written is folded
 * into *@crc.
 *
 * Returns 0 on success, the error from ext4_get_inode_loc() if the inode
 * location cannot be obtained, or -ECANCELED if space for the record could
 * not be reserved.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
        int ret;
        struct ext4_iloc iloc;
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        u8 *dst;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len += ei->i_extra_isize;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        dst = ext4_fc_reserve_space(inode->i_sb,
                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!dst) {
                ret = -ECANCELED;
                goto err;
        }

        /*
         * ext4_fc_memcpy() returns its destination pointer, which is
         * non-NULL here, so the copies below cannot fail and need no
         * error checks.
         */
        ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc);
        dst += sizeof(tl);
        ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc);
        dst += sizeof(fc_inode);
        ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
                       inode_len, crc);
        ret = 0;
err:
        /* Drop the buffer reference taken by ext4_get_inode_loc(). */
        brelse(iloc.bh);
        return ret;
}

/*
 * Emit range records describing the tracked modified block range of @inode
 * and fold them into *@crc.
 *
 * The tracked range is snapshotted and reset under i_fc_lock. The range is
 * then walked with ext4_map_blocks(): unmapped stretches produce
 * EXT4_FC_TAG_DEL_RANGE records and mapped stretches produce
 * EXT4_FC_TAG_ADD_RANGE records carrying an extent.
 *
 * Returns 0 on success (including when nothing is tracked), -ECANCELED if
 * block mapping fails, or -ENOSPC if a record could not be added.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t first_lblk, cur_lblk, last_lblk;
        struct ext4_fc_add_range add_range;
        struct ext4_fc_del_range del_range;
        struct ext4_map_blocks map;
        struct ext4_extent *ex;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        first_lblk = ei->i_fc_lblk_start;
        last_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk = first_lblk;
        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
                  __func__, cur_lblk, last_lblk, inode->i_ino);

        for (; cur_lblk <= last_lblk; ) {
                map.m_lblk = cur_lblk;
                map.m_len = last_lblk - cur_lblk + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        /* Nothing reported for this block; step past it. */
                        cur_lblk++;
                        continue;
                }

                if (ret > 0) {
                        /* Mapped: describe it as an extent. */
                        unsigned int cap =
                                (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

                        /* Limit the number of blocks in one extent */
                        map.m_len = min(cap, map.m_len);

                        add_range.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&add_range.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                            sizeof(add_range),
                                            (u8 *)&add_range, crc))
                                return -ENOSPC;
                } else {
                        /* Not mapped: record the range as deleted. */
                        del_range.fc_ino = cpu_to_le32(inode->i_ino);
                        del_range.fc_lblk = cpu_to_le32(map.m_lblk);
                        del_range.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                            sizeof(del_range),
                                            (u8 *)&del_range, crc))
                                return -ENOSPC;
                }

                cur_lblk += map.m_len;
        }

        return 0;
}


/*
 * Submit data for all the fast commit inodes.
 *
 * Sets the EXT4_MF_FC_COMMITTING mount flag, then for every inode on the
 * FC_Q_MAIN queue: marks it EXT4_STATE_FC_COMMITTING, waits until its
 * i_fc_updates counter reads zero, and submits its data through
 * jbd2_submit_inode_data(). s_fc_lock is held while walking the list and is
 * dropped around sleeping and around the submit call.
 *
 * Returns 0 on success or the first error returned by
 * jbd2_submit_inode_data(); s_fc_lock is not held on return in either case.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        struct list_head *pos;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
                ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                /* Wait for in-flight updates on this inode to drain. */
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        /*
                         * Re-check after queueing on the wait queue and only
                         * sleep (without the spinlock) if updates are still
                         * pending.
                         */
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                /* The submit call runs without s_fc_lock held. */
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(READ_ONCE(ei->jinode));
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/*
 * Wait for data writeback of every inode on the FC_Q_MAIN queue that is
 * marked EXT4_STATE_FC_COMMITTING. s_fc_lock protects the list walk and is
 * dropped around each jbd2_wait_inode_data() call.
 *
 * Returns 0 on success or the first error from jbd2_wait_inode_data();
 * s_fc_lock is not held on return.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei, *next;
        int err = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(ei, next, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (ext4_test_inode_state(&ei->vfs_inode,
                                          EXT4_STATE_FC_COMMITTING)) {
                        spin_unlock(&sbi->s_fc_lock);

                        err = jbd2_wait_inode_data(journal,
                                                   READ_ONCE(ei->jinode));
                        if (err)
                                return err;
                        spin_lock(&sbi->s_fc_lock);
                }
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/*
 * Commit all the directory entry updates.
 *
 * Walks the FC_Q_MAIN dentry queue and writes one dentry record per entry,
 * folding everything written into *@crc. For EXT4_FC_TAG_CREAT entries the
 * inode and its data ranges are written before the dentry record.
 *
 * Per the annotations below and the lock/unlock pairing in the body, this is
 * entered with sbi->s_fc_lock held and returns with it held; the lock is
 * dropped around every write into the fast commit area.
 *
 * Returns 0 on success, -ENOSPC if a dentry record could not be added, or
 * the error from ext4_fc_write_inode() / ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry;
        struct inode *inode;
        struct list_head *pos, *n, *fcd_pos, *fcd_n;
        struct ext4_inode_info *ei;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
                fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
                                        fcd_list);
                /* Anything other than a create only needs its dentry record. */
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(
                                sb, fc_dentry->fcd_op,
                                fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                                fc_dentry->fcd_name.len,
                                fc_dentry->fcd_name.name, crc)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }

                /* Look up the created inode on the main inode queue. */
                inode = NULL;
                list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
                        ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
                                inode = &ei->vfs_inode;
                                break;
                        }
                }
                /*
                 * If we don't find inode in our list, then it was deleted,
                 * in which case, we don't need to record its create tag.
                 */
                if (!inode)
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(
                        sb, fc_dentry->fcd_op,
                        fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                        fc_dentry->fcd_name.len,
                        fc_dentry->fcd_name.name, crc)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        /* Error paths arrive here unlocked; re-take the lock for the caller. */
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

/*
 * Write out one fast commit.
 *
 * Steps, in order: submit and wait for data of all fast commit inodes,
 * optionally flush the filesystem device, write the head tag if nothing has
 * been written for this transaction yet (s_fc_bytes == 0), write the dentry
 * updates, write data ranges and inode for every inode marked
 * EXT4_STATE_FC_COMMITTING, and finally write the tail tag. A single running
 * checksum (crc) covers the records written here and is handed to
 * ext4_fc_write_tail().
 *
 * Returns 0 on success or a negative error from the first failing step.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct list_head *pos;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        /* ext4_fc_commit_dentry_updates() is entered and left with the lock held. */
        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
                iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                /* Writes happen unlocked; error exits below leave it unlocked. */
                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * The outcome is tracked in the local "reason" code, which drives the
 * statistics update, the tracepoint and the final return value:
 *  - EXT4_FC_REASON_FC_FAILED: returns jbd2_fc_end_commit_fallback().
 *  - EXT4_FC_REASON_FC_START_FAILED or EXT4_FC_REASON_INELIGIBLE: returns
 *    jbd2_complete_transaction() for commit_tid.
 *  - anything else: returns 0.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        /* Snapshot of the sub-transaction counter, used by the restart check. */
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        trace_ext4_fc_commit_start(sb);

        start_time = ktime_get();

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (ext4_fc_is_ineligible(sb))) {
                reason = EXT4_FC_REASON_INELIGIBLE;
                goto out;
        }

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
                goto out;
        } else if (ret) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_START_FAILED;
                goto out;
        }

        /* Number of fast commit blocks in use before this commit (rounded up). */
        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        /* Blocks written by this commit = blocks in use now - blocks before. */
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        atomic_inc(&sbi->s_fc_subtid);
        jbd2_fc_end_commit(journal);
out:
        /* Has any ineligible update happened since we started? */
        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_INELIGIBLE;
        }

        /*
         * Only OK and ALREADY_COMMITTED count as fast commits in the
         * statistics; every other reason counts as an ineligible commit.
         */
        spin_lock(&sbi->s_fc_lock);
        if (reason != EXT4_FC_REASON_OK &&
                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
                sbi->s_fc_stats.fc_ineligible_commits++;
        } else {
                sbi->s_fc_stats.fc_num_commits++;
                sbi->s_fc_stats.fc_numblks += nblks;
        }
        spin_unlock(&sbi->s_fc_lock);
        /* Report a block count only for a commit that actually succeeded. */
        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
        trace_ext4_fc_commit_stop(sb, nblks, reason);
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time
         */
        if (likely(sbi->s_fc_avg_commit_time))
                sbi->s_fc_avg_commit_time = (commit_time +
                                sbi->s_fc_avg_commit_time * 3) / 4;
        else
                sbi->s_fc_avg_commit_time = commit_time;
        jbd_debug(1,
                "Fast commit ended with blks = %d, reason = %d, subtid - %d",
                nblks, reason, subtid);
        if (reason == EXT4_FC_REASON_FC_FAILED)
                return jbd2_fc_end_commit_fallback(journal);
        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
                reason == EXT4_FC_REASON_INELIGIBLE)
                return jbd2_complete_transaction(journal, commit_tid);
        return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Releases the jbd2 fast commit buffers, empties the FC_Q_MAIN inode and
 * dentry queues (resetting each inode's fast commit state and freeing each
 * dentry update), moves everything queued on FC_Q_STAGING over to FC_Q_MAIN,
 * and clears the EXT4_MF_FC_COMMITTING and EXT4_MF_FC_INELIGIBLE mount flags.
 * After a full commit the current buffer pointer and s_fc_bytes are reset
 * too.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_dentry_update *fc_dentry;
        struct list_head *pos, *n;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
                iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
                /*
                 * Wake waiters on the COMMITTING bit; the word passed to
                 * wake_up_bit() differs between 32-bit and 64-bit builds.
                 */
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                /* Free the entry without holding the spinlock. */
                spin_unlock(&sbi->s_fc_lock);

                /* Only names longer than DNAME_INLINE_LEN are kfree()d here. */
                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        /* Updates staged during the commit become the next commit's input. */
        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/*
 * Decoded form of a dentry record, used by the dentry replay routines.
 * Field order is kept as-is so positional initializers keep working.
 */
struct dentry_info_args {
        int parent_ino;         /* inode number of the parent directory */
        int dname_len;          /* length of the name in bytes */
        int ino;                /* inode number the name refers to */
        int inode_len;
        char *dname;            /* name bytes; points into the record */
};

/*
 * Decode a dentry record into @darg.
 *
 * @val points at the record value: a struct ext4_fc_dentry_info header
 * followed by the name bytes. The header is copied out with memcpy() before
 * its fields are read. @darg->dname points into @val (no copy is made) and
 * the name length is the record length from @tl minus the header size.
 */
static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct  ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_dentry_info hdr;

        memcpy(&hdr, val, sizeof(hdr));

        darg->ino = le32_to_cpu(hdr.fc_ino);
        darg->parent_ino = le32_to_cpu(hdr.fc_parent_ino);
        darg->dname_len = le16_to_cpu(tl->fc_len) -
                sizeof(struct ext4_fc_dentry_info);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
}

/*
 * Replay an unlink record: remove the name described by the record from its
 * parent directory.
 *
 * A missing inode or parent directory is not an error (0 is returned), and
 * -ENOENT from __ext4_unlink() is likewise mapped to 0. Any other error from
 * __ext4_unlink() is returned.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        struct dentry_info_args darg;
        struct inode *victim, *parent;
        struct qstr name;
        int err = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        name.name = darg.dname;
        name.len = darg.dname_len;

        victim = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(victim)) {
                jbd_debug(1, "Inode %d not found", darg.ino);
                return 0;
        }

        parent = ext4_iget(sb, darg.parent_ino,
                                EXT4_IGET_NORMAL);
        if (IS_ERR(parent)) {
                jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
                goto put_victim;
        }

        err = __ext4_unlink(parent, &name, victim, NULL);
        /* -ENOENT ok coz it might not exist anymore. */
        if (err == -ENOENT)
                err = 0;
        iput(parent);
put_victim:
        iput(victim);
        return err;
}

/*
 * Link @inode into the directory described by @darg under the name in
 * @darg.
 *
 * A missing parent directory, or failure to obtain a dentry for it, is
 * treated as "nothing to do" and 0 is returned. -EEXIST from __ext4_link()
 * is also mapped to 0. Returns -ENOMEM if the child dentry cannot be
 * allocated, or any other error from __ext4_link().
 *
 * The caller keeps its own reference on @inode; this function only manages
 * the reference it takes on the parent directory.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                jbd_debug(1, "Failed to obtain dentry");
                dentry_dir = NULL;
                /*
                 * d_obtain_alias() releases the inode reference itself when
                 * it fails, so forget dir here to avoid a second iput() in
                 * the cleanup path.
                 */
                dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                jbd_debug(1, "Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that link already existed since data blocks
         * for the dir in question got persisted before we crashed OR
         * we replayed this tag and crashed before the entire replay
         * could complete.
         */
        if (ret && ret != -EEXIST) {
                jbd_debug(1, "Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                /* On success the dentry owns the reference on dir. */
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/*
 * Replay a link record: look up the inode named in the record and link it
 * into its parent directory. A missing inode is not an error (0 is
 * returned); otherwise the result of ext4_fc_replay_link_internal() is
 * returned.
 */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
                               u8 *val)
{
        struct dentry_info_args darg;
        struct inode *target;
        int err = 0;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        target = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(target)) {
                jbd_debug(1, "Inode not found.");
                return 0;
        }

        err = ext4_fc_replay_link_internal(sb, &darg, target);
        iput(target);

        return err;
}

/*
 * Remember that inode @ino was modified during replay. The list is used
 * later to set up block bitmaps correctly.
 *
 * Each inode number is stored at most once. The backing array grows by
 * EXT4_FC_REPLAY_REALLOC_INCREMENT entries whenever it is full.
 *
 * Returns 0 on success (including when @ino was already recorded) or
 * -ENOMEM if the array could not be grown; the existing array is left
 * untouched in that case.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        int idx = 0;

        /* Already recorded? */
        while (idx < state->fc_modified_inodes_used) {
                if (state->fc_modified_inodes[idx] == ino)
                        return 0;
                idx++;
        }

        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *grown;

                grown = krealloc(state->fc_modified_inodes,
                                sizeof(int) * (state->fc_modified_inodes_size +
                                EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;
                state->fc_modified_inodes = grown;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }

        state->fc_modified_inodes[state->fc_modified_inodes_used] = ino;
        state->fc_modified_inodes_used++;

        return 0;
}

/*
 * Inode replay function
 *
 * Copies the raw inode carried in the fast commit record (@val) over the
 * inode's on-disk location, writes that buffer out synchronously, marks the
 * inode as used, and finally recomputes i_blocks, the generation-derived
 * seed and the inode checksum before writing the buffer again.
 *
 * Returns 0 on success, -EFSCORRUPTED if the inode cannot be loaded after it
 * was written, or the error returned by a failing helper. When the result is
 * 0 a block device flush is issued before returning.
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
                                u8 *val)
{
        struct ext4_fc_inode fc_inode;
        struct ext4_inode *raw_inode;
        struct ext4_inode *raw_fc_inode;
        struct inode *inode = NULL;
        struct ext4_iloc iloc;
        int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
        struct ext4_extent_header *eh;

        /* Copy the record header out of the buffer before reading fields. */
        memcpy(&fc_inode, val, sizeof(fc_inode));

        ino = le32_to_cpu(fc_inode.fc_ino);
        trace_ext4_fc_replay(sb, tag, ino, 0, 0);

        /*
         * If the inode currently loads, run ext4_ext_clear_bb() on it before
         * its on-disk contents are overwritten below.
         */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(inode)) {
                ext4_ext_clear_bb(inode);
                iput(inode);
        }
        /* Reset so the iput() at "out" is harmless on the early error paths. */
        inode = NULL;

        ret = ext4_fc_record_modified_inode(sb, ino);
        if (ret)
                goto out;

        /* The raw inode image follows the ext4_fc_inode header inside @val. */
        raw_fc_inode = (struct ext4_inode *)
                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
        if (ret)
                goto out;

        /*
         * NOTE(review): inode_len is derived from the on-disk tl->fc_len and
         * is not range-checked in this function before being used as a
         * memcpy() length below - confirm it is validated elsewhere.
         */
        inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
        raw_inode = ext4_raw_inode(&iloc);

        /*
         * Copy everything before i_block and everything from i_generation
         * onwards; i_block itself is handled separately below.
         */
        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
        memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
                inode_len - offsetof(struct ext4_inode, i_generation));
        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
                /*
                 * No valid extent header in the existing i_block: set up an
                 * empty one sized to fit inside i_block.
                 */
                if (eh->eh_magic != EXT4_EXT_MAGIC) {
                        memset(eh, 0, sizeof(*eh));
                        eh->eh_magic = EXT4_EXT_MAGIC;
                        eh->eh_max = cpu_to_le16(
                                (sizeof(raw_inode->i_block) -
                                 sizeof(struct ext4_extent_header))
                                 / sizeof(struct ext4_extent));
                }
        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
                /* For inline-data inodes i_block is taken from the record. */
                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
                        sizeof(raw_inode->i_block));
        }

        /* Immediately update the inode on disk. */
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        if (ret)
                goto out_brelse;
        ret = sync_dirty_buffer(iloc.bh);
        if (ret)
                goto out_brelse;
        ret = ext4_mark_inode_used(sb, ino);
        if (ret)
                goto out_brelse;

        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode not found.");
                inode = NULL;
                ret = -EFSCORRUPTED;
                goto out_brelse;
        }

        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but until then, we recalculate
         * the number of blocks of the inode.
         */
        ext4_ext_replay_set_iblocks(inode);

        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
        ext4_reset_inode_seed(inode);

        /*
         * Recompute the checksum and write the buffer out once more. Note
         * that the return value of this final sync_dirty_buffer() is not
         * checked.
         */
        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        sync_dirty_buffer(iloc.bh);
out_brelse:
        brelse(iloc.bh);
out:
        /* inode is NULL here on every path that did not complete the iget. */
        iput(inode);
        if (!ret)
                blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

        return ret;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 *
 * Marks the inode as used, initializes the "." and ".." entries when the
 * inode is a directory, links the inode into its parent and sets its link
 * count to 1.
 *
 * Returns 0 on success, -EINVAL if the inode itself cannot be loaded, or
 * the error from ext4_mark_inode_used() / ext4_fc_replay_link_internal().
 * A missing parent directory or a failure to initialize a new directory is
 * deliberately not reported: the tag is skipped and 0 is returned.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        int ret = 0;
        struct inode *inode = NULL;
        struct inode *dir = NULL;
        struct dentry_info_args darg;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
                        darg.parent_ino, darg.dname_len);

        /* This takes care of update group descriptor and other metadata */
        ret = ext4_mark_inode_used(sb, darg.ino);
        if (ret)
                goto out;

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "inode %d not found.", darg.ino);
                /* Do not hand an ERR_PTR to iput() at "out". */
                inode = NULL;
                ret = -EINVAL;
                goto out;
        }

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are setup properly.
                 */
                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
                if (IS_ERR(dir)) {
                        /*
                         * Report the inode number that was actually looked
                         * up (the parent), not the child being created.
                         * ret is still 0 here: the tag is skipped.
                         */
                        jbd_debug(1, "Dir %d not found.", darg.parent_ino);
                        goto out;
                }
                ret = ext4_init_new_dir(NULL, dir, inode);
                iput(dir);
                if (ret) {
                        /* Best effort: skip this tag without failing replay. */
                        ret = 0;
                        goto out;
                }
        }
        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        if (ret)
                goto out;
        set_nlink(inode, 1);
        ext4_mark_inode_dirty(NULL, inode);
out:
        /* iput() accepts NULL, so no guard is needed. */
        iput(inode);
        return ret;
}

/*
 * Record a physical disk region that is in use according to the fast commit
 * area, or that is used by an inode during the replay phase. The simple
 * allocator used during replay excludes these regions from allocation.
 *
 * @replay is non-zero when called during the replay phase; in that case the
 * number of valid regions is advanced together with the number of used ones.
 *
 * Returns 0 on success, -ENOMEM if the region array could not be grown.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
        struct ext4_fc_replay_state *rs = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_fc_alloc_region *slot;

        /*
         * During the replay phase fc_regions_valid may differ from
         * fc_regions_used; bring the latter back in line before adding a
         * new entry.
         */
        if (replay && rs->fc_regions_used != rs->fc_regions_valid)
                rs->fc_regions_used = rs->fc_regions_valid;

        if (rs->fc_regions_used == rs->fc_regions_size) {
                struct ext4_fc_alloc_region *grown;

                /* Grow through a temporary so a failure keeps the old array. */
                grown = krealloc(rs->fc_regions,
                                 sizeof(*grown) *
                                 (rs->fc_regions_size +
                                  EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                 GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;
                rs->fc_regions = grown;
                rs->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }

        slot = &rs->fc_regions[rs->fc_regions_used];
        rs->fc_regions_used++;
        slot->ino = ino;
        slot->lblk = lblk;
        slot->pblk = pblk;
        slot->len = len;

        if (replay)
                rs->fc_regions_valid++;

        return 0;
}

/*
 * Replay add range tag
 *
 * Makes the extent described by the record in @val present in the inode's
 * extent tree. The logical range is walked in chunks as reported by
 * ext4_map_blocks(); each chunk is handled in one of three ways:
 *  - not mapped: a new extent pointing at the recorded physical blocks is
 *    inserted;
 *  - mapped to different physical blocks: the mapping is updated and the
 *    previously mapped blocks are marked free;
 *  - mapped to the recorded physical blocks: only the written/unwritten
 *    state is updated.
 *
 * Always returns 0: a missing inode is skipped, and an error inside the
 * function ends processing of this tag without being reported to the caller.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
                                    struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_add_range fc_add_ex;
        struct ext4_extent newex, *ex;
        struct inode *inode;
        ext4_lblk_t start, cur;
        int remaining, len;
        ext4_fsblk_t start_pblk;
        struct ext4_map_blocks map;
        struct ext4_ext_path *path = NULL;
        int ret;

        /* Work on a local copy of the record taken from the buffer. */
        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
                ext4_ext_get_actual_len(ex));

        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode not found.");
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        start = le32_to_cpu(ex->ee_block);
        start_pblk = ext4_ext_pblock(ex);
        len = ext4_ext_get_actual_len(ex);

        /* cur/remaining track the part of [start, start + len) still to do. */
        cur = start;
        remaining = len;
        jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
                  inode->i_ino);

        while (remaining > 0) {
                map.m_lblk = cur;
                map.m_len = remaining;
                map.m_pblk = 0;
                ret = ext4_map_blocks(NULL, inode, &map, 0);

                if (ret < 0)
                        goto out;

                if (ret == 0) {
                        /* Range is not mapped */
                        path = ext4_find_extent(inode, cur, NULL, 0);
                        if (IS_ERR(path))
                                goto out;
                        memset(&newex, 0, sizeof(newex));
                        newex.ee_block = cpu_to_le32(cur);
                        /* Physical start of this chunk within the recorded extent. */
                        ext4_ext_store_pblock(
                                &newex, start_pblk + cur - start);
                        newex.ee_len = cpu_to_le16(map.m_len);
                        if (ext4_ext_is_unwritten(ex))
                                ext4_ext_mark_unwritten(&newex);
                        /* The insertion is done with i_data_sem held for writing. */
                        down_write(&EXT4_I(inode)->i_data_sem);
                        ret = ext4_ext_insert_extent(
                                NULL, inode, &path, &newex, 0);
                        up_write((&EXT4_I(inode)->i_data_sem));
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        if (ret)
                                goto out;
                        goto next;
                }

                if (start_pblk + cur - start != map.m_pblk) {
                        /*
                         * Logical to physical mapping changed. This can happen
                         * if this range was removed and then reallocated to
                         * map to new physical blocks during a fast commit.
                         */
                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex),
                                        start_pblk + cur - start);
                        if (ret)
                                goto out;
                        /*
                         * Mark the old blocks as free since they aren't used
                         * anymore. We maintain an array of all the modified
                         * inodes. In case these blocks are still used at either
                         * a different logical range in the same inode or in
                         * some different inode, we will mark them as allocated
                         * at the end of the FC replay using our array of
                         * modified inodes.
                         */
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
                        goto next;
                }

                /* Range is mapped and needs a state change */
                jbd_debug(1, "Converting from %ld to %d %lld",
                                map.m_flags & EXT4_MAP_UNWRITTEN,
                        ext4_ext_is_unwritten(ex), map.m_pblk);
                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex), map.m_pblk);
                if (ret)
                        goto out;
                /*
                 * We may have split the extent tree while toggling the state.
                 * Try to shrink the extent tree now.
                 */
                ext4_ext_replay_shrink_inode(inode, start + len);
next:
                /* Advance by the chunk length reported in map.m_len. */
                cur += map.m_len;
                remaining -= map.m_len;
        }
        /* Final shrink, bounded by the inode size expressed in blocks. */
        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
                                        sb->s_blocksize_bits);
out:
        iput(inode);
        /* Errors collected in ret are intentionally not propagated here. */
        return 0;
}

/*
 * Replay DEL_RANGE tag
 *
 * For the logical range described by the record in @val, first marks every
 * currently mapped chunk of physical blocks as free, then removes the range
 * from the inode's extent tree, shrinks the tree and marks the inode dirty.
 *
 * Always returns 0: a missing inode is skipped, and an error inside the
 * function ends processing of this tag without being reported to the caller.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
                         u8 *val)
{
        struct inode *inode;
        struct ext4_fc_del_range lrange;
        struct ext4_map_blocks map;
        ext4_lblk_t cur, remaining;
        int ret;

        /* Work on a local copy of the record taken from the buffer. */
        memcpy(&lrange, val, sizeof(lrange));
        cur = le32_to_cpu(lrange.fc_lblk);
        remaining = le32_to_cpu(lrange.fc_len);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
                le32_to_cpu(lrange.fc_ino), cur, remaining);

        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
                        inode->i_ino, le32_to_cpu(lrange.fc_lblk),
                        le32_to_cpu(lrange.fc_len));
        /* Pass 1: walk the range and free the blocks that are mapped now. */
        while (remaining > 0) {
                map.m_lblk = cur;
                map.m_len = remaining;

                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        /* Mapped chunk: advance by the mapped length. */
                        remaining -= ret;
                        cur += ret;
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
                } else {
                        /* Unmapped chunk: skip it using the length in map.m_len. */
                        remaining -= map.m_len;
                        cur += map.m_len;
                }
        }

        /* Pass 2: drop the whole range from the extent tree under i_data_sem. */
        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
                                le32_to_cpu(lrange.fc_lblk) +
                                le32_to_cpu(lrange.fc_len) - 1);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (ret)
                goto out;
        ext4_ext_replay_shrink_inode(inode,
                i_size_read(inode) >> sb->s_blocksize_bits);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        /* Errors collected in ret are intentionally not propagated here. */
        return 0;
}

/*
 * Map a fast commit tag value to a printable name for debug output.
 * Unknown tags yield "TAG_ERROR".
 */
static inline const char *tag2str(u16 tag)
{
        static const struct {
                int tag;
                const char *name;
        } tag_names[] = {
                { EXT4_FC_TAG_LINK,      "TAG_ADD_ENTRY" },
                { EXT4_FC_TAG_UNLINK,    "TAG_DEL_ENTRY" },
                { EXT4_FC_TAG_ADD_RANGE, "TAG_ADD_RANGE" },
                { EXT4_FC_TAG_CREAT,     "TAG_CREAT_DENTRY" },
                { EXT4_FC_TAG_DEL_RANGE, "TAG_DEL_RANGE" },
                { EXT4_FC_TAG_INODE,     "TAG_INODE" },
                { EXT4_FC_TAG_PAD,       "TAG_PAD" },
                { EXT4_FC_TAG_TAIL,      "TAG_TAIL" },
                { EXT4_FC_TAG_HEAD,      "TAG_HEAD" },
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(tag_names) / sizeof(tag_names[0]); idx++) {
                if (tag_names[idx].tag == tag)
                        return tag_names[idx].name;
        }

        return "TAG_ERROR";
}

/*
 * For every inode recorded as modified during replay, walk its logical block
 * space from 0 to EXT_MAX_BLOCKS and mark the blocks it maps as allocated.
 * For each mapped chunk, the blocks recorded along the extent lookup path
 * are marked as allocated too.
 *
 * Inodes that cannot be loaded are skipped; a mapping error stops the walk
 * of the current inode only.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
        struct ext4_fc_replay_state *state;
        struct inode *inode;
        struct ext4_ext_path *path = NULL;
        struct ext4_map_blocks map;
        int i, ret, j;
        ext4_lblk_t cur, end;

        state = &EXT4_SB(sb)->s_fc_replay_state;
        for (i = 0; i < state->fc_modified_inodes_used; i++) {
                inode = ext4_iget(sb, state->fc_modified_inodes[i],
                        EXT4_IGET_NORMAL);
                if (IS_ERR(inode)) {
                        jbd_debug(1, "Inode %d not found.",
                                state->fc_modified_inodes[i]);
                        continue;
                }
                cur = 0;
                end = EXT_MAX_BLOCKS;
                while (cur < end) {
                        map.m_lblk = cur;
                        map.m_len = end - cur;

                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        if (ret < 0)
                                break;

                        if (ret > 0) {
                                /*
                                 * Mark one block per path level below
                                 * p_depth (presumably the extent tree's
                                 * interior blocks - TODO confirm).
                                 */
                                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
                                if (!IS_ERR(path)) {
                                        for (j = 0; j < path->p_depth; j++)
                                                ext4_mb_mark_bb(inode->i_sb,
                                                        path[j].p_block, 1, 1);
                                        ext4_ext_drop_refs(path);
                                        kfree(path);
                                }
                                cur += ret;
                                /* Mark the mapped data blocks themselves. */
                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
                                                        map.m_len, 1);
                        } else {
                                /*
                                 * Unmapped: skip by map.m_len, but always
                                 * advance by at least one block so the loop
                                 * makes progress.
                                 */
                                cur = cur + (map.m_len ? map.m_len : 1);
                        }
                }
                iput(inode);
        }
}

/*
 * Check whether block @blk lies inside one of the regions excluded from
 * block allocation. The simple allocator that runs during the replay phase
 * calls this function to see if it is okay to use a block.
 *
 * Only the first fc_regions_valid entries are considered, and entries with
 * a zero inode number or zero length are ignored.
 *
 * Returns true if @blk is excluded, false otherwise.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
        struct ext4_fc_replay_state *rs = &EXT4_SB(sb)->s_fc_replay_state;
        const struct ext4_fc_alloc_region *reg;
        int idx;

        for (idx = 0; idx < rs->fc_regions_valid; idx++) {
                reg = &rs->fc_regions[idx];
                if (reg->ino != 0 && reg->len != 0 &&
                    blk >= reg->pblk && blk < reg->pblk + reg->len)
                        return true;
        }

        return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
        kfree(sbi->s_fc_replay_state.fc_regions);
        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 *
 * @bh is the fast commit block to scan, @off its offset in the fast commit
 * area (checked against the expected offset kept in the replay state) and
 * @expected_tid the transaction ID that HEAD and TAIL tags must carry.
 */
static int ext4_fc_replay_scan(journal_t *journal,
                                struct buffer_head *bh, int off,
                                tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_replay_state *state;
        int ret = JBD2_FC_REPLAY_CONTINUE;
        struct ext4_fc_add_range ext;
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        __u8 *start, *end, *cur, *val;
        struct ext4_fc_head head;
        struct ext4_extent *ex;

        state = &sbi->s_fc_replay_state;

        /* end points at the last byte of the block, not one past it. */
        start = (u8 *)bh->b_data;
        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

        /* First block of the scan: reset the per-scan state. */
        if (state->fc_replay_expected_off == 0) {
                state->fc_cur_tag = 0;
                state->fc_replay_num_tags = 0;
                state->fc_crc = 0;
                state->fc_regions = NULL;
                state->fc_regions_valid = state->fc_regions_used =
                        state->fc_regions_size = 0;
                /* Check if we can stop early */
                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
                        != EXT4_FC_TAG_HEAD)
                        return 0;
        }

        /* Blocks must be presented in order. */
        if (off != state->fc_replay_expected_off) {
                ret = -EFSCORRUPTED;
                goto out_err;
        }

        state->fc_replay_expected_off++;
        /*
         * Walk the tag/length/value records in this block.
         * NOTE(review): tl.fc_len is read from the block and is not checked
         * here against the space remaining before val is dereferenced -
         * confirm that the on-disk format or the caller guarantees it.
         */
        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
                memcpy(&tl, cur, sizeof(tl));
                val = cur + sizeof(tl);
                jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
                          tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
                switch (le16_to_cpu(tl.fc_tag)) {
                case EXT4_FC_TAG_ADD_RANGE:
                        /*
                         * Record the extent's physical blocks as an
                         * excluded region (replay argument is 0 here).
                         */
                        memcpy(&ext, val, sizeof(ext));
                        ex = (struct ext4_extent *)&ext.fc_ex;
                        ret = ext4_fc_record_regions(sb,
                                le32_to_cpu(ext.fc_ino),
                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
                                ext4_ext_get_actual_len(ex), 0);
                        if (ret < 0)
                                break;
                        ret = JBD2_FC_REPLAY_CONTINUE;
                        fallthrough;
                case EXT4_FC_TAG_DEL_RANGE:
                case EXT4_FC_TAG_LINK:
                case EXT4_FC_TAG_UNLINK:
                case EXT4_FC_TAG_CREAT:
                case EXT4_FC_TAG_INODE:
                case EXT4_FC_TAG_PAD:
                        /* Count the tag and fold the whole record into the CRC. */
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                        sizeof(tl) + le16_to_cpu(tl.fc_len));
                        break;
                case EXT4_FC_TAG_TAIL:
                        state->fc_cur_tag++;
                        memcpy(&tail, val, sizeof(tail));
                        /* The CRC covers the tail up to, not including, fc_crc. */
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                                sizeof(tl) +
                                                offsetof(struct ext4_fc_tail,
                                                fc_crc));
                        if (le32_to_cpu(tail.fc_tid) == expected_tid &&
                                le32_to_cpu(tail.fc_crc) == state->fc_crc) {
                                /*
                                 * Valid tail: everything scanned so far is
                                 * replayable and its regions become valid.
                                 */
                                state->fc_replay_num_tags = state->fc_cur_tag;
                                state->fc_regions_valid =
                                        state->fc_regions_used;
                        } else {
                                /*
                                 * Bad tail: stop if an earlier tail was
                                 * already accepted, otherwise report a CRC
                                 * error.
                                 */
                                ret = state->fc_replay_num_tags ?
                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
                        }
                        state->fc_crc = 0;
                        break;
                case EXT4_FC_TAG_HEAD:
                        memcpy(&head, val, sizeof(head));
                        /* Refuse records using features outside the supported set. */
                        if (le32_to_cpu(head.fc_features) &
                                ~EXT4_FC_SUPPORTED_FEATURES) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                        if (le32_to_cpu(head.fc_tid) != expected_tid) {
                                ret = JBD2_FC_REPLAY_STOP;
                                break;
                        }
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                            sizeof(tl) + le16_to_cpu(tl.fc_len));
                        break;
                default:
                        /* Unknown tag: stop if something is replayable, else fail. */
                        ret = state->fc_replay_num_tags ?
                                JBD2_FC_REPLAY_STOP : -ECANCELED;
                }
                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
                        break;
        }

out_err:
        trace_ext4_fc_replay_scan(sb, ret, off);
        return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 *
 * For PASS_SCAN the block is handed to ext4_fc_replay_scan(). For any other
 * pass the tags in @bh are dispatched to their replay handlers until the
 * number of tags counted by the scan phase has been consumed, at which point
 * the block bitmaps are fixed up and JBD2_FC_REPLAY_STOP is returned.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
                                enum passtype pass, int off, tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        __u8 *start, *end, *cur, *val;
        int ret = JBD2_FC_REPLAY_CONTINUE;
        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
        struct ext4_fc_tail tail;

        if (pass == PASS_SCAN) {
                state->fc_current_pass = PASS_SCAN;
                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
        }

        /* First block of a new (non-scan) pass: flag that replay is running. */
        if (state->fc_current_pass != pass) {
                state->fc_current_pass = pass;
                sbi->s_mount_state |= EXT4_FC_REPLAY;
        }
        /* Nothing (left) to replay: fix up the bitmaps and return 0. */
        if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
                jbd_debug(1, "Replay stops\n");
                ext4_fc_set_bitmaps_and_counters(sb);
                return 0;
        }

#ifdef CONFIG_EXT4_DEBUG
        /* Debug knob: drop blocks at or beyond s_fc_debug_max_replay. */
        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
                pr_warn("Dropping fc block %d because max_replay set\n", off);
                return JBD2_FC_REPLAY_STOP;
        }
#endif

        /* end points at the last byte of the block, not one past it. */
        start = (u8 *)bh->b_data;
        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
                memcpy(&tl, cur, sizeof(tl));
                val = cur + sizeof(tl);

                /* All tags counted by the scan phase have been replayed. */
                if (state->fc_replay_num_tags == 0) {
                        ret = JBD2_FC_REPLAY_STOP;
                        ext4_fc_set_bitmaps_and_counters(sb);
                        break;
                }
                jbd_debug(3, "Replay phase, tag:%s\n",
                                tag2str(le16_to_cpu(tl.fc_tag)));
                state->fc_replay_num_tags--;
                switch (le16_to_cpu(tl.fc_tag)) {
                case EXT4_FC_TAG_LINK:
                        ret = ext4_fc_replay_link(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_UNLINK:
                        ret = ext4_fc_replay_unlink(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_ADD_RANGE:
                        ret = ext4_fc_replay_add_range(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_CREAT:
                        ret = ext4_fc_replay_create(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_DEL_RANGE:
                        ret = ext4_fc_replay_del_range(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_INODE:
                        ret = ext4_fc_replay_inode(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_PAD:
                        /* Padding carries no data; only trace it. */
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
                                             le16_to_cpu(tl.fc_len), 0);
                        break;
                case EXT4_FC_TAG_TAIL:
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
                                             le16_to_cpu(tl.fc_len), 0);
                        memcpy(&tail, val, sizeof(tail));
                        /* A tail with a different tid here only triggers a warning. */
                        WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
                        break;
                case EXT4_FC_TAG_HEAD:
                        break;
                default:
                        trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
                                             le16_to_cpu(tl.fc_len), 0);
                        ret = -ECANCELED;
                        break;
                }
                /* Only negative results abort; anything else continues. */
                if (ret < 0)
                        break;
                ret = JBD2_FC_REPLAY_CONTINUE;
        }
        return ret;
}

/*
 * Hook the fast commit callbacks into @journal.
 *
 * The replay callback is installed unconditionally, because the journal may
 * still contain fast commit blocks that need to be replayed even if fast
 * commit has since been turned off. The cleanup callback is installed only
 * when the JOURNAL_FAST_COMMIT mount option is set.
 */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
        journal->j_fc_replay_callback = ext4_fc_replay;

        if (test_opt2(sb, JOURNAL_FAST_COMMIT))
                journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

/*
 * Human-readable descriptions of the reasons a commit can be ineligible for
 * fast commit, indexed by the EXT4_FC_REASON_* value. Printed by
 * ext4_fc_info_show() next to the per-reason counters.
 */
static const char * const fc_ineligible_reasons[] = {
        [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
        [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
        [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
        [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
        [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
        [EXT4_FC_REASON_RESIZE] = "Resize",
        [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
        [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
        [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
        [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

/*
 * seq_file show handler for the fast commit statistics.
 *
 * Emits the commit counters, the average commit time (stored value divided
 * by 1000) and one line per ineligibility reason with its counter. Output
 * is produced only for the SEQ_START_TOKEN element.
 *
 * Always returns 0.
 */
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
        struct super_block *sb = seq->private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_stats *fc_stats = &sbi->s_fc_stats;
        int reason = 0;

        if (v != SEQ_START_TOKEN)
                return 0;

        seq_printf(seq,
                   "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
                   fc_stats->fc_num_commits,
                   fc_stats->fc_ineligible_commits,
                   fc_stats->fc_numblks,
                   div_u64(sbi->s_fc_avg_commit_time, 1000));

        seq_puts(seq, "Ineligible reasons:\n");
        while (reason < EXT4_FC_REASON_MAX) {
                seq_printf(seq, "\"%s\":\t%d\n",
                           fc_ineligible_reasons[reason],
                           fc_stats->fc_ineligible_reason_count[reason]);
                reason++;
        }

        return 0;
}

/*
 * Create the slab cache used for ext4_fc_dentry_update objects.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ext4_fc_init_dentry_cache(void)
{
        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
                                           SLAB_RECLAIM_ACCOUNT);

        return ext4_fc_dentry_cachep ? 0 : -ENOMEM;
}

/* Destroy the slab cache created by ext4_fc_init_dentry_cache(). */
void ext4_fc_destroy_dentry_cache(void)
{
        kmem_cache_destroy(ext4_fc_dentry_cachep);
}































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H

#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>

/*
 * The 64-bit atomic type
 *
 * ATOMIC64_INIT(i) expands to a brace initializer holding @i, for use when
 * statically initializing an atomic64_t.
 */

#define ATOMIC64_INIT(i)        { (i) }

/**
 * arch_atomic64_read - read atomic64 variable
 * @v: pointer of type atomic64_t
 *
 * Atomically reads the value of @v.
 * Doesn't imply a read memory barrier.
 *
 * Return: the current value of @v->counter, read via __READ_ONCE().
 */
static inline s64 arch_atomic64_read(const atomic64_t *v)
{
        return __READ_ONCE((v)->counter);
}

/**
 * arch_atomic64_set - set atomic64 variable
 * @v: pointer to type atomic64_t
 * @i: required value
 *
 * Atomically sets the value of @v to @i.
 * The store to @v->counter is done through __WRITE_ONCE().
 */
static inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
        __WRITE_ONCE(v->counter, i);
}

/**
 * arch_atomic64_add - add integer to atomic64 variable
 * @i: integer value to add
 * @v: pointer to type atomic64_t
 *
 * Atomically adds @i to @v.
 *
 * Emits one LOCK_PREFIX "addq" instruction: operand %0 is @v->counter in
 * memory and operand %1 is @i.  No value is returned.
 */
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
        /* v->counter is listed as both "=m" output and "m" input. */
        asm volatile(LOCK_PREFIX "addq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/**
 * arch_atomic64_sub - subtract integer from atomic64 variable
 * @i: integer value to subtract
 * @v: pointer to type atomic64_t
 *
 * Atomically subtracts @i from @v.
 *
 * Emits one LOCK_PREFIX "subq" instruction: operand %0 is @v->counter in
 * memory and operand %1 is @i.  No value is returned.
 */
static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
        /* v->counter is listed as both "=m" output and "m" input. */
        asm volatile(LOCK_PREFIX "subq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/**
 * arch_atomic64_sub_and_test - subtract value from variable and test result
 * @i: integer value to subtract
 * @v: pointer to type atomic64_t
 *
 * Atomically subtracts @i from @v and returns
 * true if the result is zero, or false for all
 * other cases.
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX "subq" with condition
 * code "e" (presumably the x86 equal/zero condition, matching the zero
 * test described above).
 */
static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test

/**
 * arch_atomic64_inc - increment atomic64 variable
 * @v: pointer to type atomic64_t
 *
 * Atomically increments @v by 1.
 *
 * Emits one LOCK_PREFIX "incq" instruction on @v->counter in memory.
 * No value is returned.
 */
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
        /* v->counter is listed as both "=m" output and "m" input. */
        asm volatile(LOCK_PREFIX "incq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_inc arch_atomic64_inc

/**
 * arch_atomic64_dec - decrement atomic64 variable
 * @v: pointer to type atomic64_t
 *
 * Atomically decrements @v by 1.
 *
 * Emits one LOCK_PREFIX "decq" instruction on @v->counter in memory.
 * No value is returned.
 */
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
        /* v->counter is listed as both "=m" output and "m" input. */
        asm volatile(LOCK_PREFIX "decq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_dec arch_atomic64_dec

/**
 * arch_atomic64_dec_and_test - decrement and test
 * @v: pointer to type atomic64_t
 *
 * Atomically decrements @v by 1 and
 * returns true if the result is 0, or false for all other
 * cases.
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX "decq" with condition
 * code "e" (presumably the x86 equal/zero condition).
 */
static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test

/**
 * arch_atomic64_inc_and_test - increment and test
 * @v: pointer to type atomic64_t
 *
 * Atomically increments @v by 1
 * and returns true if the result is zero, or false for all
 * other cases.
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX "incq" with condition
 * code "e" (presumably the x86 equal/zero condition).
 */
static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test

/**
 * arch_atomic64_add_negative - add and test if negative
 * @i: integer value to add
 * @v: pointer to type atomic64_t
 *
 * Atomically adds @i to @v and returns true
 * if the result is negative, or false when
 * result is greater than or equal to zero.
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX "addq" with condition
 * code "s" (presumably the x86 sign condition, matching the negative test
 * described above).
 */
static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
#define arch_atomic64_add_negative arch_atomic64_add_negative

/**
 * arch_atomic64_add_return - add and return
 * @i: integer value to add
 * @v: pointer to type atomic64_t
 *
 * Atomically adds @i to @v and returns @i + @v, i.e. the sum of @i and
 * whatever xadd() hands back for @v->counter.
 */
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 fetched = xadd(&v->counter, i);

        return fetched + i;
}
#define arch_atomic64_add_return arch_atomic64_add_return

/*
 * Subtract @i from @v and return the result, implemented as
 * arch_atomic64_add_return() of the negated @i.
 */
static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
        return arch_atomic64_add_return(-i, v);
}
#define arch_atomic64_sub_return arch_atomic64_sub_return

/*
 * Add @i to @v and return whatever xadd() yields for @v->counter
 * (presumably the value held before the addition; confirm against
 * xadd()).
 */
static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

/*
 * Subtract @i from @v by passing the negated @i to xadd() on @v->counter,
 * and return whatever xadd() yields (presumably the value held before
 * the subtraction; confirm against xadd()).
 */
static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        return xadd(&v->counter, -i);
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub

/*
 * Compare-and-exchange on @v->counter: forwards @old and @new to
 * arch_cmpxchg() and returns its result unchanged.
 */
static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

/*
 * Try to replace @v->counter with @new, using *@old as the expected value.
 * Forwards to try_cmpxchg() and returns its boolean result unchanged.
 * NOTE(review): callers in this file loop on failure, so try_cmpxchg()
 * presumably refreshes *@old with the current value when it fails; confirm.
 */
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        return try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg

/*
 * Exchange @v->counter with @new via arch_xchg() and return its result
 * unchanged.
 */
static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
        return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg

/*
 * Atomically AND @i into @v.
 *
 * Emits one LOCK_PREFIX "andq" instruction: operand %0 is @v->counter in
 * memory (read-write, "+m") and operand %1 is @i.  No value is returned.
 */
static inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "andq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Atomically AND @i into @v and return the value that was used as the
 * expected value of the successful exchange.
 *
 * Starts from a plain read of @v and retries
 * arch_atomic64_try_cmpxchg() until it reports success.
 * NOTE(review): the retry relies on a failed attempt refreshing 'seen'
 * with the current counter value; confirm against try_cmpxchg().
 */
static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        while (!arch_atomic64_try_cmpxchg(v, &seen, seen & i))
                ;

        return seen;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

/*
 * Atomically OR @i into @v.
 *
 * Emits one LOCK_PREFIX "orq" instruction: operand %0 is @v->counter in
 * memory (read-write, "+m") and operand %1 is @i.  No value is returned.
 */
static inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "orq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Atomically OR @i into @v and return the value that was used as the
 * expected value of the successful exchange.
 *
 * Starts from a plain read of @v and retries
 * arch_atomic64_try_cmpxchg() until it reports success.
 * NOTE(review): the retry relies on a failed attempt refreshing 'seen'
 * with the current counter value; confirm against try_cmpxchg().
 */
static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        while (!arch_atomic64_try_cmpxchg(v, &seen, seen | i))
                ;

        return seen;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

/*
 * Atomically XOR @i into @v.
 *
 * Emits one LOCK_PREFIX "xorq" instruction: operand %0 is @v->counter in
 * memory (read-write, "+m") and operand %1 is @i.  No value is returned.
 */
static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "xorq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Atomically XOR @i into @v and return the value that was used as the
 * expected value of the successful exchange.
 *
 * Starts from a plain read of @v and retries
 * arch_atomic64_try_cmpxchg() until it reports success.
 * NOTE(review): the retry relies on a failed attempt refreshing 'seen'
 * with the current counter value; confirm against try_cmpxchg().
 */
static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        while (!arch_atomic64_try_cmpxchg(v, &seen, seen ^ i))
                ;

        return seen;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

#endif /* _ASM_X86_ATOMIC64_64_H */




























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_IP6_ROUTE_H
#define _NET_IP6_ROUTE_H

/*
 * Fixed header followed by a variable-length prefix.
 * NOTE(review): presumably the RFC 4191 Route Information Option carried
 * in Router Advertisements; confirm against the parser that uses it.
 *
 * The one-byte bitfield keeps route_pref in the middle two bits; the two
 * reserved fields are declared in opposite order depending on the
 * bitfield endianness macro that is defined.
 */
struct route_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;
#if defined(__BIG_ENDIAN_BITFIELD)
        __u8                        reserved_h:3,
                                route_pref:2,
                                reserved_l:3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        __u8                        reserved_l:3,
                                route_pref:2,
                                reserved_h:3;
#endif
        __be32                        lifetime;
        __u8                        prefix[];        /* 0, 8 or 16 bytes */
};

#include <net/addrconf.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/route.h>
#include <net/nexthop.h>

#define RT6_LOOKUP_F_IFACE                0x00000001
#define RT6_LOOKUP_F_REACHABLE                0x00000002
#define RT6_LOOKUP_F_HAS_SADDR                0x00000004
#define RT6_LOOKUP_F_SRCPREF_TMP        0x00000008
#define RT6_LOOKUP_F_SRCPREF_PUBLIC        0x00000010
#define RT6_LOOKUP_F_SRCPREF_COA        0x00000020
#define RT6_LOOKUP_F_IGNORE_LINKSTATE        0x00000040
#define RT6_LOOKUP_F_DST_NOREF                0x00000080

/* We do not (yet ?) support IPv6 jumbograms (RFC 2675)
 * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header
 */
#define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr))

/*
 * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate
 * between IPV6_ADDR_PREFERENCES socket option values
 *        IPV6_PREFER_SRC_TMP    = 0x1
 *        IPV6_PREFER_SRC_PUBLIC = 0x2
 *        IPV6_PREFER_SRC_COA    = 0x4
 * and above RT6_LOOKUP_F_SRCPREF_xxx flags.
 */
static inline int rt6_srcprefs2flags(unsigned int srcprefs)
{
        /*
         * srcprefs carries only three bits, so no mask is applied; moving
         * them up by three positions gives the lookup-flag encoding.
         */
        const unsigned int lookup_bits = srcprefs << 3;

        return lookup_bits;
}

static inline unsigned int rt6_flags2srcprefs(int flags)
{
        /* Drop the three low flag bits, then keep the next three. */
        const int shifted = flags >> 3;

        return shifted & 7;
}

static inline bool rt6_need_strict(const struct in6_addr *daddr)
{
        return ipv6_addr_type(daddr) &
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}

/* fib entries using a nexthop object can not be coalesced into
 * a multipath route
 */
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
        /* the RTF_ADDRCONF flag filters out RA's */
        return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh &&
                f6i->fib6_nh->fib_nh_gw_family;
}

void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb, int flags);

struct dst_entry *ip6_route_output_flags_noref(struct net *net,
                                               const struct sock *sk,
                                               struct flowi6 *fl6, int flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags);

/*
 * Convenience wrapper: calls ip6_route_output_flags() with a flags
 * argument of 0 and returns its result unchanged.
 */
static inline struct dst_entry *ip6_route_output(struct net *net,
                                                 const struct sock *sk,
                                                 struct flowi6 *fl6)
{
        return ip6_route_output_flags(net, sk, fl6, 0);
}

/* Release @rt via ip6_rt_put() unless the lookup was done with
 * RT6_LOOKUP_F_DST_NOREF and rt->rt6i_uncached is an empty list (i.e. the
 * dst is not in uncached_list).
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
        if ((flags & RT6_LOOKUP_F_DST_NOREF) &&
            list_empty(&rt->rt6i_uncached))
                return;

        ip6_rt_put(rt);
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags);
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int ifindex, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags);

void ip6_route_init_special_entries(void);
int ip6_route_init(void);
void ip6_route_cleanup(void);

int ipv6_route_ioctl(struct net *net, unsigned int cmd,
                struct in6_rtmsg *rtmsg);

int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack);
int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify);

void rt6_flush_exceptions(struct fib6_info *f6i);
void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
                        unsigned long now);

/*
 * Choose a source address for @daddr and store it in @saddr.
 *
 * When @f6i is given and has a preferred source (non-zero
 * fib6_prefsrc.plen), that address is copied and 0 is returned.
 * Otherwise the result of ipv6_dev_get_saddr() is returned, called with
 * the device from fib6_info_nh_dev(@f6i), or NULL when @f6i is NULL.
 */
static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
                                      const struct in6_addr *daddr,
                                      unsigned int prefs,
                                      struct in6_addr *saddr)
{
        struct net_device *dev = NULL;

        if (f6i) {
                if (f6i->fib6_prefsrc.plen) {
                        *saddr = f6i->fib6_prefsrc.addr;
                        return 0;
                }
                dev = fib6_info_nh_dev(f6i);
        }

        return ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr);
}

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int flags);
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *hkeys);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);

void fib6_force_start_gc(struct net *net);

struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
                                     const struct in6_addr *addr, bool anycast,
                                     gfp_t gfp_flags);

struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags);

/*
 *        support functions for ND
 *
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev);
struct fib6_info *rt6_add_dflt_router(struct net *net,
                                     const struct in6_addr *gwaddr,
                                     struct net_device *dev, unsigned int pref);

void rt6_purge_dflt_routers(struct net *net);

int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr);

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
                     u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);

struct netlink_callback;

/*
 * Bundle of state for a route dump.
 * NOTE(review): presumably what rt6_dump_route() receives through its
 * void *p_arg parameter; confirm against the caller.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;
        struct netlink_callback *cb;
        struct net *net;
        struct fib_dump_filter filter;
};

int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip);
void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags);
void rt6_disable_ip(struct net_device *dev, unsigned long event);
void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
void rt6_multipath_rebalance(struct fib6_info *f6i);

void rt6_uncached_list_add(struct rt6_info *rt);
void rt6_uncached_list_del(struct rt6_info *rt);

/*
 * Return the rt6_info that embeds @skb's dst entry, or NULL when
 * skb_dst() returns NULL.
 */
static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
{
        const struct dst_entry *dst = skb_dst(skb);

        if (!dst)
                return NULL;

        return container_of(dst, struct rt6_info, dst);
}

/*
 *        Store a destination cache entry in a socket
 *
 * Records the cookie returned by rt6_get_cookie() for @dst in the
 * socket's ipv6_pinfo, passes @dst to sk_setup_caps(), and remembers
 * @daddr in np->daddr_cache.  @saddr is remembered in np->saddr_cache
 * only when CONFIG_IPV6_SUBTREES is defined; otherwise it is not used.
 */
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
                                 const struct in6_addr *daddr,
                                 const struct in6_addr *saddr)
{
        struct ipv6_pinfo *np = inet6_sk(sk);

        /* @dst is treated as the rt6_info it is cast to here. */
        np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst);
        sk_setup_caps(sk, dst);
        np->daddr_cache = daddr;
#ifdef CONFIG_IPV6_SUBTREES
        np->saddr_cache = saddr;
#endif
}

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6);

static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
{
        struct rt6_info *rt = (struct rt6_info *) skb_dst(skb);

        return rt->rt6i_flags & RTF_LOCAL;
}

static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
                                            const struct in6_addr *daddr)
{
        struct rt6_info *rt = (struct rt6_info *)dst;

        return rt->rt6i_flags & RTF_ANYCAST ||
                (rt->rt6i_dst.plen < 127 &&
                 !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) &&
                 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *));

/*
 * MTU to use for @skb's dst.
 *
 * The socket's ipv6_pinfo is consulted only when @skb has a socket and
 * dev_recursion_level() is zero.  If that socket's pmtudisc is at least
 * IPV6_PMTUDISC_PROBE, the device MTU minus lwtunnel_headroom() is
 * returned; in every other case dst_mtu() of the dst is returned.
 */
static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb)
{
        struct ipv6_pinfo *np = NULL;
        unsigned int mtu;

        if (skb->sk && !dev_recursion_level())
                np = inet6_sk(skb->sk);

        if (!np || np->pmtudisc < IPV6_PMTUDISC_PROBE)
                return dst_mtu(skb_dst(skb));

        mtu = READ_ONCE(skb_dst(skb)->dev->mtu);
        mtu -= lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);

        return mtu;
}

/*
 * False when the socket's pmtudisc is IPV6_PMTUDISC_INTERFACE or
 * IPV6_PMTUDISC_OMIT, true for every other setting.
 */
static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
{
        return !(inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_INTERFACE ||
                 inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT);
}

static inline bool ip6_sk_ignore_df(const struct sock *sk)
{
        return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO ||
               inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
}

/*
 * Pick the next-hop address for @rt: the route's gateway when
 * RTF_GATEWAY is set, else the route's destination address when
 * RTF_CACHE is set, else @daddr itself.
 */
static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
                                                 const struct in6_addr *daddr)
{
        if (rt->rt6i_flags & RTF_GATEWAY)
                return &rt->rt6i_gateway;

        if (unlikely(rt->rt6i_flags & RTF_CACHE))
                return &rt->rt6i_dst.addr;

        return daddr;
}

static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
{
        struct fib6_nh *nha, *nhb;

        if (a->nh || b->nh)
                return nexthop_cmp(a->nh, b->nh);

        nha = a->fib6_nh;
        nhb = b->fib6_nh;
        return nha->fib_nh_dev == nhb->fib_nh_dev &&
               ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
               !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
}

/*
 * MTU to use for @dst, minus lwtunnel_headroom().
 *
 * A locked, non-zero RTAX_MTU metric is used as is.  Otherwise the value
 * starts at IPV6_MIN_MTU and is replaced by the device's cnf.mtu6 when
 * __in6_dev_get() finds an inet6_dev (looked up under rcu_read_lock()).
 */
static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu = 0;

        if (dst_metric_locked(dst, RTAX_MTU))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (!mtu) {
                struct inet6_dev *idev;

                mtu = IPV6_MIN_MTU;
                rcu_read_lock();
                idev = __in6_dev_get(dst->dev);
                if (idev)
                        mtu = idev->cnf.mtu6;
                rcu_read_unlock();
        }

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

u32 ip6_mtu_from_fib6(const struct fib6_result *res,
                      const struct in6_addr *daddr,
                      const struct in6_addr *saddr);

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev, struct sk_buff *skb,
                                   const void *daddr);
#endif

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  ALSA sequencer Memory Manager
 *  Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl>
 */
#ifndef __SND_SEQ_MEMORYMGR_H
#define __SND_SEQ_MEMORYMGR_H

#include <sound/seq_kernel.h>
#include <linux/poll.h>

struct snd_info_buffer;

/*
 * Container for a sequencer event (internal use): the event itself, a
 * pointer to a pool, and a link to the next cell.
 */
struct snd_seq_event_cell {
        struct snd_seq_event event;
        struct snd_seq_pool *pool;                                /* pool in use for this cell */
        struct snd_seq_event_cell *next;        /* next cell */
};

/* design note: the pool is a contiguous block of memory. If we want to
   add additional cells to the pool dynamically, we had better store them
   in another pool, as we need to know the base address of the pool when
   releasing memory. */

/*
 * Bookkeeping for one pool of event cells: the cell storage and free
 * list, sizing and usage counters, statistics, a wait queue and the
 * spinlock declared as the pool lock.
 */
struct snd_seq_pool {
        struct snd_seq_event_cell *ptr;        /* pointer to first event chunk */
        struct snd_seq_event_cell *free;        /* pointer to the head of the free list */

        int total_elements;        /* pool size actually allocated */
        atomic_t counter;        /* cells in use: snd_seq_unused_cells() reports total_elements - counter */

        int size;                /* pool size to be allocated */
        int room;                /* watermark for sleep/wakeup */

        int closing;                /* NOTE(review): presumably set by snd_seq_pool_mark_closing(); confirm */

        /* statistics */
        int max_used;
        int event_alloc_nopool;
        int event_alloc_failures;
        int event_alloc_success;

        /* Write locking */
        wait_queue_head_t output_sleep;

        /* Pool lock */
        spinlock_t lock;
};

void snd_seq_cell_free(struct snd_seq_event_cell *cell);

int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event,
                      struct snd_seq_event_cell **cellp, int nonblock,
                      struct file *file, struct mutex *mutexp);

/* return number of unused (free) cells; a NULL pool has none */
static inline int snd_seq_unused_cells(struct snd_seq_pool *pool)
{
        if (!pool)
                return 0;

        return pool->total_elements - atomic_read(&pool->counter);
}

/* return total number of allocated cells; a NULL pool has none */
static inline int snd_seq_total_cells(struct snd_seq_pool *pool)
{
        if (!pool)
                return 0;

        return pool->total_elements;
}

/* init pool - allocate events */
int snd_seq_pool_init(struct snd_seq_pool *pool);

/* done pool - free events */
void snd_seq_pool_mark_closing(struct snd_seq_pool *pool);
int snd_seq_pool_done(struct snd_seq_pool *pool);

/* create pool */
struct snd_seq_pool *snd_seq_pool_new(int poolsize);

/* remove pool */
int snd_seq_pool_delete(struct snd_seq_pool **pool);

/* polling */
int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait);

void snd_seq_info_pool(struct snd_info_buffer *buffer,
                       struct snd_seq_pool *pool, char *space);

#endif


































































































    1 





















































































































    1 



    1 





    1 


    1 

    1 




































































































    1 







    1 


    1 


















    1 





































    1 
    1 







    1 
































































    2 


















    2 










    2 














































































































    1 











    1 




    1 










    1 



    1 



    1 

    1 



























































































































































































































    1 





    1 
    1 






    1 










































    1 







    1 









    1 


    1 
















    1 

















































































    1 










    1 








    1 
































    1 
















    1 
    1 

    1 

    1 




































    2 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

/*
 * Per-CPU state whose lock protects only lru_rotate.pvec, which requires
 * disabling interrupts.  The per-CPU instance below is defined with its
 * lock set up through INIT_LOCAL_LOCK().
 */
struct lru_rotate {
        local_lock_t lock;
        struct pagevec pvec;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following struct pagevec are grouped together because they are protected
 * by disabling preemption (and interrupts remain enabled).
 *
 * activate_page exists only when CONFIG_SMP is defined.  The per-CPU
 * instance below is defined with its lock set up through
 * INIT_LOCAL_LOCK().
 */
struct lru_pvecs {
        local_lock_t lock;
        struct pagevec lru_add;
        struct pagevec lru_deactivate_file;
        struct pagevec lru_deactivate;
        struct pagevec lru_lazyfree;
#ifdef CONFIG_SMP
        struct pagevec activate_page;
#endif
};
static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Detach @page from the LRU, if it is on one, and clear its PG_waiters bit
 * before the callers dispose of the page.
 *
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
        if (PageLRU(page)) {
                pg_data_t *pgdat = page_pgdat(page);
                struct lruvec *lruvec;
                unsigned long flags;

                /* The LRU list is only modified under the node's lru_lock. */
                spin_lock_irqsave(&pgdat->lru_lock, flags);
                lruvec = mem_cgroup_page_lruvec(page, pgdat);
                VM_BUG_ON_PAGE(!PageLRU(page), page);
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
        }
        __ClearPageWaiters(page);
}

/*
 * Release a non-compound page (see __put_page()): take it off the LRU,
 * uncharge it from its memory cgroup and hand it to free_unref_page().
 */
static void __put_single_page(struct page *page)
{
        __page_cache_release(page);
        mem_cgroup_uncharge(page);
        free_unref_page(page);
}

/*
 * Release a compound page: do the LRU cleanup for everything except hugetlb
 * pages, then pass the page to destroy_compound_page().
 */
static void __put_compound_page(struct page *page)
{
        /*
         * __page_cache_release() is supposed to be called for thp, not for
         * hugetlb. This is because a hugetlb page never has PageLRU set
         * (it is never put on any LRU list) and no memcg routines should
         * be called for hugetlb (it has a separate hugetlb_cgroup.)
         */
        if (!PageHuge(page))
                __page_cache_release(page);
        destroy_compound_page(page);
}

/*
 * __put_page() - dispose of @page through the path that matches its type
 *
 * ZONE_DEVICE pages only drop their pgmap reference, compound pages go
 * through __put_compound_page() and everything else through
 * __put_single_page().
 */
void __put_page(struct page *page)
{
        /*
         * A ZONE_DEVICE page belongs to the device that created its pgmap,
         * so it is never returned to the page allocator.
         */
        if (is_zone_device_page(page)) {
                put_dev_pagemap(page->pgmap);
                return;
        }

        if (unlikely(PageCompound(page))) {
                __put_compound_page(page);
                return;
        }

        __put_single_page(page);
}
EXPORT_SYMBOL(__put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Unlink each page strung on @pages through page.lru and drop one reference
 * on it with put_page().  The list is empty on return.  Currently used by
 * read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *page = lru_to_page(pages);

                list_del(&page->lru);
                put_page(page);
        }
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:        An array of struct kvec structures
 * @nr_segs:        number of segments to pin
 * @write:        pinning for read/write, currently ignored
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_segs long.
 *
 * Every segment must be exactly PAGE_SIZE long; pinning stops, with a
 * warning, at the first segment that is not.
 *
 * Returns the number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. Each page returned
 * must be released with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
                struct page **pages)
{
        int seg = 0;

        while (seg < nr_segs) {
                const struct kvec *vec = &kiov[seg];

                /* Only whole-page segments can be mapped to a single page. */
                if (WARN_ON(vec->iov_len != PAGE_SIZE))
                        break;

                pages[seg] = kmap_to_page(vec->iov_base);
                get_page(pages[seg]);
                seg++;
        }

        return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:        starting kernel address
 * @write:        pinning for read/write, currently ignored
 * @pages:        array that receives pointer to the page pinned.
 *                Must be at least nr_segs long.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
        const struct kvec kiov = {
                .iov_base = (void *)start,
                .iov_len = PAGE_SIZE
        };

        return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);

/*
 * Apply @move_fn to every page in @pvec with the page's node lru_lock held,
 * then drop the pagevec's page references and empty it.
 *
 * @pvec:    pagevec to process; reinitialised before returning
 * @move_fn: callback invoked as move_fn(page, lruvec, arg) under lru_lock
 * @arg:     opaque pointer passed through to @move_fn
 */
static void pagevec_lru_move_fn(struct pagevec *pvec,
        void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
        void *arg)
{
        int i;
        struct pglist_data *pgdat = NULL;       /* node whose lru_lock is held */
        struct lruvec *lruvec;
        unsigned long flags = 0;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct pglist_data *pagepgdat = page_pgdat(page);

                /*
                 * Keep holding the lock across consecutive pages of the same
                 * node; switch locks only when the node changes.
                 */
                if (pagepgdat != pgdat) {
                        if (pgdat)
                                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
                        pgdat = pagepgdat;
                        spin_lock_irqsave(&pgdat->lru_lock, flags);
                }

                lruvec = mem_cgroup_page_lruvec(page, pgdat);
                (*move_fn)(page, lruvec, arg);
        }
        if (pgdat)
                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
        /* Drop the references held by the pagevec, outside the lock. */
        release_pages(pvec->pages, pvec->nr);
        pagevec_reinit(pvec);
}

/*
 * pagevec_lru_move_fn() callback: move an evictable LRU page to the tail of
 * its inactive list and add its size in base pages to the counter that
 * @arg points to.
 */
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        int *nr_rotated = arg;

        /* Nothing to rotate if the page left the LRU or became unevictable. */
        if (!PageLRU(page) || PageUnevictable(page))
                return;

        del_page_from_lru_list(page, lruvec, page_lru(page));
        ClearPageActive(page);
        add_page_to_lru_list_tail(page, lruvec, page_lru(page));
        *nr_rotated += thp_nr_pages(page);
}

/*
 * Rotate every page in @pvec to the tail of its list and account the number
 * of pages actually moved as PGROTATED.
 *
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        int nr_rotated = 0;

        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &nr_rotated);
        __count_vm_events(PGROTATED, nr_rotated);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, queue it on this CPU's
 * rotate pagevec so that it gets moved to the tail of the inactive list.
 *
 * The pagevec is flushed right away when it becomes full or when @page is a
 * compound page.
 */
void rotate_reclaimable_page(struct page *page)
{
        struct pagevec *rotate_pvec;
        unsigned long irq_flags;

        /* Only unlocked, clean, evictable pages on the LRU are candidates. */
        if (PageLocked(page) || PageDirty(page) ||
            PageUnevictable(page) || !PageLRU(page))
                return;

        /* The pagevec holds its own reference on the page. */
        get_page(page);
        local_lock_irqsave(&lru_rotate.lock, irq_flags);
        rotate_pvec = this_cpu_ptr(&lru_rotate.pvec);
        if (!pagevec_add(rotate_pvec, page) || PageCompound(page))
                pagevec_move_tail(rotate_pvec);
        local_unlock_irqrestore(&lru_rotate.lock, irq_flags);
}

/*
 * lru_note_cost - record a cost event against an lruvec and its ancestors
 * @lruvec:   lruvec the event is charged to first
 * @file:     true to add to ->file_cost, false to add to ->anon_cost
 * @nr_pages: amount added to the selected cost counter
 *
 * Starting at @lruvec and following parent_lruvec() until it returns NULL,
 * add @nr_pages to the chosen counter and, whenever the two counters together
 * exceed a quarter of that lruvec's combined LRU size, halve both of them.
 */
void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
        do {
                unsigned long lrusize;

                /* Record cost event */
                if (file)
                        lruvec->file_cost += nr_pages;
                else
                        lruvec->anon_cost += nr_pages;

                /*
                 * Decay previous events
                 *
                 * Because workloads change over time (and to avoid
                 * overflow) we keep these statistics as a floating
                 * average, which ends up weighing recent refaults
                 * more than old ones.
                 */
                lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
                          lruvec_page_state(lruvec, NR_ACTIVE_FILE);

                if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
                        lruvec->file_cost /= 2;
                        lruvec->anon_cost /= 2;
                }
        } while ((lruvec = parent_lruvec(lruvec)));
}

/*
 * Record a cost event for @page: charge its size in base pages to the file
 * or anon cost of the lruvec the page belongs to (and that lruvec's
 * ancestors, see lru_note_cost()).
 */
void lru_note_cost_page(struct page *page)
{
        struct lruvec *lruvec;

        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
        lru_note_cost(lruvec, page_is_file_lru(page), thp_nr_pages(page));
}

/*
 * Move an inactive, evictable LRU page to the matching active list and
 * account the activation.  Has the pagevec_lru_move_fn() callback signature;
 * @arg is unused.
 */
static void __activate_page(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        int base_lru;
        int nr_pages;

        /* Skip pages that are off the LRU, already active or unevictable. */
        if (!PageLRU(page) || PageActive(page) || PageUnevictable(page))
                return;

        base_lru = page_lru_base_type(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, base_lru);
        SetPageActive(page);
        add_page_to_lru_list(page, lruvec, base_lru + LRU_ACTIVE);
        trace_mm_lru_activate(page);

        __count_vm_events(PGACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}

#ifdef CONFIG_SMP
/* Flush the pending activations queued on @cpu's activate_page pagevec. */
static void activate_page_drain(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);

        if (!pagevec_count(pvec))
                return;

        pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

/* Report whether @cpu has activations queued that still need draining. */
static bool need_activate_page_drain(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);

        return pagevec_count(pvec) != 0;
}

/*
 * Queue the head page of @page for activation on this CPU's pagevec.  The
 * pagevec is flushed at once when it fills up or when the page is compound.
 */
static void activate_page(struct page *page)
{
        struct pagevec *pvec;

        page = compound_head(page);
        if (!PageLRU(page) || PageActive(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.activate_page);
        /* The pagevec holds its own reference on the page. */
        get_page(page);
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, __activate_page, NULL);
        local_unlock(&lru_pvecs.lock);
}

#else
/* Without CONFIG_SMP there is no activate_page pagevec to drain. */
static inline void activate_page_drain(int cpu)
{
}

/* Without CONFIG_SMP, activate the page directly under the node's lru_lock. */
static void activate_page(struct page *page)
{
        pg_data_t *pgdat = page_pgdat(page);
        struct lruvec *lruvec;

        page = compound_head(page);
        spin_lock_irq(&pgdat->lru_lock);
        lruvec = mem_cgroup_page_lruvec(page, pgdat);
        __activate_page(page, lruvec, NULL);
        spin_unlock_irq(&pgdat->lru_lock);
}
#endif

/*
 * If @page is still sitting in this CPU's lru_add pagevec, mark it
 * PageActive so that it lands on the active list when the pagevec is drained.
 */
static void __lru_cache_activate_page(struct page *page)
{
        struct pagevec *add_pvec;
        int idx;

        local_lock(&lru_pvecs.lock);
        add_pvec = this_cpu_ptr(&lru_pvecs.lru_add);

        /*
         * Walk the pagevec from its newest entry, on the optimistic
         * assumption that the page being activated was added only a moment
         * ago. Only the local pagevec is looked at: a !PageLRU page could
         * be in the middle of being released, reclaimed or migrated, or sit
         * on a remote pagevec that is being drained right now. Besides,
         * setting PageActive on a page in a remote pagevec can race with
         * that page having just been added to the inactive list, which
         * causes accounting errors and makes BUG_ON checks trigger.
         */
        idx = pagevec_count(add_pvec);
        while (--idx >= 0) {
                if (add_pvec->pages[idx] != page)
                        continue;

                SetPageActive(page);
                break;
        }

        local_unlock(&lru_pvecs.lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced        ->        inactive,referenced
 * inactive,referenced                ->        active,unreferenced
 * active,unreferenced                ->        active,referenced
 *
 * The transition is applied to the head page of a compound page.  The idle
 * flag of the page is cleared in every case.
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
        page = compound_head(page);

        if (!PageReferenced(page)) {
                SetPageReferenced(page);
        } else if (PageUnevictable(page)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * unevictable page accessed has no effect.
                 */
        } else if (!PageActive(page)) {
                /*
                 * If the page is on the LRU, queue it for activation via
                 * lru_pvecs.activate_page. Otherwise, assume the page is on a
                 * pagevec, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
                if (PageLRU(page))
                        activate_page(page);
                else
                        __lru_cache_activate_page(page);
                ClearPageReferenced(page);
                workingset_activation(page);
        }
        if (page_is_idle(page))
                clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives the caller of lru_cache_add() a chance to
 * have the page added to the active list using mark_page_accessed().
 *
 * The pagevec takes its own reference on @page.  It is drained immediately
 * when it becomes full or when @page is a compound page.
 */
void lru_cache_add(struct page *page)
{
        struct pagevec *pvec;

        VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
        VM_BUG_ON_PAGE(PageLRU(page), page);

        get_page(page);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
        if (!pagevec_add(pvec, page) || PageCompound(page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
}
EXPORT_SYMBOL(lru_cache_add);

/**
 * lru_cache_add_inactive_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the inactive or unevictable LRU list, depending on its
 * evictability.  A page mapped in a VM_LOCKED vma that is not VM_SPECIAL
 * is marked mlocked, and accounted as such, before it is queued.
 */
void lru_cache_add_inactive_or_unevictable(struct page *page,
                                         struct vm_area_struct *vma)
{
        bool mlocked_vma;

        VM_BUG_ON_PAGE(PageLRU(page), page);

        mlocked_vma = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
        if (unlikely(mlocked_vma)) {
                /* Account only if this call is the one that set PG_mlocked. */
                if (!TestSetPageMlocked(page)) {
                        int nr_pages = thp_nr_pages(page);

                        /*
                         * We use the irq-unsafe __mod_zone_page_state()
                         * because this counter is not modified from
                         * interrupt context, and the pte lock is held
                         * (spinlock), which implies preemption disabled.
                         */
                        __mod_zone_page_state(page_zone(page), NR_MLOCK,
                                              nr_pages);
                        count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
                }
        }

        lru_cache_add(page);
}

/*
 * If the page can not be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page is not mapped but is dirty or under writeback, it is
 * flagged with PG_reclaim so that it can be reclaimed asap.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because
 * the VM expects it to be written out by the flusher threads, as this is
 * much more effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
                              void *arg)
{
        int lru;
        bool active;
        int nr_pages = thp_nr_pages(page);

        if (!PageLRU(page))
                return;

        if (PageUnevictable(page))
                return;

        /* Some processes are using the page */
        if (page_mapped(page))
                return;

        active = PageActive(page);
        lru = page_lru_base_type(page);

        /* "lru + active" selects the active list when the page is active. */
        del_page_from_lru_list(page, lruvec, lru + active);
        ClearPageActive(page);
        ClearPageReferenced(page);

        if (PageWriteback(page) || PageDirty(page)) {
                /*
                 * Setting PG_reclaim can race with end_page_writeback(),
                 * which can confuse readahead.  But the race window is
                 * _really_ small and it is a non-critical problem.
                 */
                add_page_to_lru_list(page, lruvec, lru);
                SetPageReclaim(page);
        } else {
                /*
                 * The page's writeback ended while it sat in the pagevec.
                 * Move the page to the tail of the inactive list.
                 */
                add_page_to_lru_list_tail(page, lruvec, lru);
                __count_vm_events(PGROTATED, nr_pages);
        }

        if (active) {
                __count_vm_events(PGDEACTIVATE, nr_pages);
                __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
                                     nr_pages);
        }
}

/*
 * pagevec_lru_move_fn() callback: move an active, evictable LRU page to the
 * head of the matching inactive list, clear its referenced bit and account
 * the deactivation.  @arg is unused.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        int base_lru;
        int nr_pages;

        /* Only active, evictable pages that are on the LRU are moved. */
        if (!PageLRU(page) || !PageActive(page) || PageUnevictable(page))
                return;

        base_lru = page_lru_base_type(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, base_lru + LRU_ACTIVE);
        ClearPageActive(page);
        ClearPageReferenced(page);
        add_page_to_lru_list(page, lruvec, base_lru);

        __count_vm_events(PGDEACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

/*
 * pagevec_lru_move_fn() callback: turn a swap-backed anonymous LRU page that
 * is not in the swap cache into a lazyfree page by clearing PG_swapbacked and
 * moving it to the inactive file list.  @arg is unused.
 */
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        bool was_active;
        int nr_pages;

        if (!PageLRU(page) || !PageAnon(page) || !PageSwapBacked(page) ||
            PageSwapCache(page) || PageUnevictable(page))
                return;

        was_active = PageActive(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + was_active);
        ClearPageActive(page);
        ClearPageReferenced(page);
        /*
         * Lazyfree pages are clean anonymous pages.  Their PG_swapbacked
         * flag is cleared to tell them apart from normal anonymous pages.
         */
        ClearPageSwapBacked(page);
        add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);

        __count_vm_events(PGLAZYFREE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 *
 * Every per-CPU pagevec defined in this file is flushed in turn: lru_add,
 * the rotate pagevec, lru_deactivate_file, lru_deactivate, lru_lazyfree and
 * finally (via activate_page_drain()) activate_page.
 */
void lru_add_drain_cpu(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);

        if (pagevec_count(pvec))
                __pagevec_lru_add(pvec);

        pvec = &per_cpu(lru_rotate.pvec, cpu);
        /* Disabling interrupts below acts as a compiler barrier. */
        if (data_race(pagevec_count(pvec))) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_lock_irqsave(&lru_rotate.lock, flags);
                pagevec_move_tail(pvec);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }

        pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

        pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

        pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);

        activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 *
 * Nothing is done for unevictable pages or for pages whose reference count
 * has already dropped to zero.
 */
void deactivate_file_page(struct page *page)
{
        struct pagevec *pvec;

        /*
         * In a workload with many unevictable pages such as mprotect,
         * deactivating unevictable pages to accelerate reclaim is pointless.
         */
        if (PageUnevictable(page))
                return;

        /* The pagevec needs its own reference; give up if the page is dying. */
        if (unlikely(!get_page_unless_zero(page)))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);

        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/*
 * deactivate_page - deactivate a page
 * @page: page to deactivate
 *
 * deactivate_page() queues @page for a move to the inactive list if @page is
 * on the active list and is not an unevictable page.  This is done to
 * accelerate the reclaim of @page.
 */
void deactivate_page(struct page *page)
{
        struct pagevec *pvec;

        if (!PageLRU(page) || !PageActive(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
        /* The pagevec holds its own reference on the page. */
        get_page(page);
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/**
 * mark_page_lazyfree - make an anon page lazyfree
 * @page: page to deactivate
 *
 * mark_page_lazyfree() queues @page for a move to the inactive file list.
 * This is done to accelerate the reclaim of @page.  Only swap-backed
 * anonymous LRU pages that are neither in the swap cache nor unevictable
 * are queued.
 */
void mark_page_lazyfree(struct page *page)
{
        struct pagevec *pvec;

        if (!PageLRU(page) || !PageAnon(page) || !PageSwapBacked(page) ||
            PageSwapCache(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
        /* The pagevec holds its own reference on the page. */
        get_page(page);
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/*
 * Drain all of the current CPU's pagevecs, holding lru_pvecs.lock for the
 * duration of the drain.
 */
void lru_add_drain(void)
{
        local_lock(&lru_pvecs.lock);
        lru_add_drain_cpu(smp_processor_id());
        local_unlock(&lru_pvecs.lock);
}

/*
 * Same as lru_add_drain(), and additionally call drain_local_pages() for
 * @zone while lru_pvecs.lock is still held.
 */
void lru_add_drain_cpu_zone(struct zone *zone)
{
        local_lock(&lru_pvecs.lock);
        lru_add_drain_cpu(smp_processor_id());
        drain_local_pages(zone);
        local_unlock(&lru_pvecs.lock);
}

#ifdef CONFIG_SMP

/* One work item per CPU, queued by lru_add_drain_all(). */
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/* Work handler: drain the pagevecs of the CPU the work item runs on. */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

/*
 * Drain the pagevecs of every online CPU that has pages pending, by queueing
 * a work item on each such CPU and waiting for all of them to finish.
 *
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
void lru_add_drain_all(void)
{
        /*
         * lru_drain_gen - Global pages generation number
         *
         * (A) Definition: global lru_drain_gen = x implies that all generations
         *     0 < n <= x are already *scheduled* for draining.
         *
         * This is an optimization for the highly-contended use case where a
         * user space workload keeps constantly generating a flow of pages for
         * each CPU.
         */
        static unsigned int lru_drain_gen;
        static struct cpumask has_work;
        static DEFINE_MUTEX(lock);
        unsigned cpu, this_gen;

        /*
         * Make sure nobody triggers this path before mm_percpu_wq is fully
         * initialized.
         */
        if (WARN_ON(!mm_percpu_wq))
                return;

        /*
         * Guarantee pagevec counter stores visible by this CPU are visible to
         * other CPUs before loading the current drain generation.
         */
        smp_mb();

        /*
         * (B) Locally cache global LRU draining generation number
         *
         * The read barrier ensures that the counter is loaded before the mutex
         * is taken. It pairs with smp_mb() inside the mutex critical section
         * at (D).
         */
        this_gen = smp_load_acquire(&lru_drain_gen);

        mutex_lock(&lock);

        /*
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
        if (unlikely(this_gen != lru_drain_gen))
                goto done;

        /*
         * (D) Increment global generation number
         *
         * Pairs with smp_load_acquire() at (B), outside of the critical
         * section. Use a full memory barrier to guarantee that the new global
         * drain generation number is stored before loading pagevec counters.
         *
         * This pairing must be done here, before the for_each_online_cpu loop
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
         * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
         *
         * If the paired barrier is done at any later step, e.g. after the
         * loop, CPU #x will just exit at (C) and miss flushing out all of its
         * added pages.
         */
        WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
        smp_mb();

        cpumask_clear(&has_work);
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                /* Only bother CPUs that have at least one non-empty pagevec. */
                if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
                    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
                }
        }

        /* Wait for every work item queued above to complete. */
        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
        mutex_unlock(&lock);
}
#else
/* Without CONFIG_SMP, draining everything is just a local drain. */
void lru_add_drain_all(void)
{
        lru_add_drain();
}
#endif /* CONFIG_SMP */

/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * The huge zero page is skipped, devmap-managed ZONE_DEVICE pages are handed
 * to put_devmap_managed_page(), and compound pages whose count reaches zero
 * are released one by one through __put_compound_page().  All other freed
 * pages are collected on a local list and uncharged and freed in one go at
 * the end.
 */
void release_pages(struct page **pages, int nr)
{
        int i;
        LIST_HEAD(pages_to_free);
        struct pglist_data *locked_pgdat = NULL;
        struct lruvec *lruvec;
        unsigned long flags;
        unsigned int lock_batch;

        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                /*
                 * Make sure the IRQ-safe lock-holding time does not get
                 * excessive with a continuous string of pages from the
                 * same pgdat. The lock is held only if locked_pgdat != NULL,
                 * and lock_batch is reset each time the lock is taken.
                 */
                if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
                        spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
                        locked_pgdat = NULL;
                }

                page = compound_head(page);
                if (is_huge_zero_page(page))
                        continue;

                if (is_zone_device_page(page)) {
                        if (locked_pgdat) {
                                spin_unlock_irqrestore(&locked_pgdat->lru_lock,
                                                       flags);
                                locked_pgdat = NULL;
                        }
                        /*
                         * ZONE_DEVICE pages that return 'false' from
                         * page_is_devmap_managed() do not require special
                         * processing, and instead, expect a call to
                         * put_page_testzero().
                         */
                        if (page_is_devmap_managed(page)) {
                                put_devmap_managed_page(page);
                                continue;
                        }
                }

                /* Other references remain: nothing more to do for this page. */
                if (!put_page_testzero(page))
                        continue;

                if (PageCompound(page)) {
                        /* The compound release path runs without lru_lock held here. */
                        if (locked_pgdat) {
                                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
                                locked_pgdat = NULL;
                        }
                        __put_compound_page(page);
                        continue;
                }

                if (PageLRU(page)) {
                        struct pglist_data *pgdat = page_pgdat(page);

                        /* Switch locks only when the node changes. */
                        if (pgdat != locked_pgdat) {
                                if (locked_pgdat)
                                        spin_unlock_irqrestore(&locked_pgdat->lru_lock,
                                                                        flags);
                                lock_batch = 0;
                                locked_pgdat = pgdat;
                                spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
                        }

                        lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
                        VM_BUG_ON_PAGE(!PageLRU(page), page);
                        __ClearPageLRU(page);
                        del_page_from_lru_list(page, lruvec, page_off_lru(page));
                }

                __ClearPageWaiters(page);

                list_add(&page->lru, &pages_to_free);
        }
        if (locked_pgdat)
                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

        mem_cgroup_uncharge_list(&pages_to_free);
        free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 *
 * The drain is skipped when @pvec->percpu_pvec_drained is already set; the
 * flag is set here after draining.  @pvec is empty on return.
 */
void __pagevec_release(struct pagevec *pvec)
{
        if (!pvec->percpu_pvec_drained) {
                lru_add_drain();
                pvec->percpu_pvec_drained = true;
        }
        release_pages(pvec->pages, pagevec_count(pvec));
        pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Put @page_tail, a former tail page of the head page @page, on a list.
 * Used by __split_huge_page_refcount().
 *
 * @page:      head page; must be PageHead
 * @page_tail: page to add; must not be compound and must not be on the LRU
 * @lruvec:    lruvec whose node lru_lock the caller holds
 * @list:      optional private list; when NULL, @page_tail is marked PageLRU
 *
 * If @page is on the LRU, @page_tail is linked right next to it.  Otherwise
 * it goes, with an extra reference, onto @list when one was given, or onto
 * the tail of its LRU list in @lruvec when not.
 */
void lru_add_page_tail(struct page *page, struct page *page_tail,
                       struct lruvec *lruvec, struct list_head *list)
{
        VM_BUG_ON_PAGE(!PageHead(page), page);
        VM_BUG_ON_PAGE(PageCompound(page_tail), page);
        VM_BUG_ON_PAGE(PageLRU(page_tail), page);
        lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);

        if (!list)
                SetPageLRU(page_tail);

        if (likely(PageLRU(page)))
                list_add_tail(&page_tail->lru, &page->lru);
        else if (list) {
                /* page reclaim is reclaiming a huge page */
                get_page(page_tail);
                list_add_tail(&page_tail->lru, list);
        } else {
                /*
                 * Head page has not yet been counted, as an hpage,
                 * so we must account for each subpage individually.
                 *
                 * Put page_tail on the list at the correct position
                 * so they all end up in order.
                 */
                add_page_to_lru_list_tail(page_tail, lruvec,
                                          page_lru(page_tail));
        }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Per-page callback that __pagevec_lru_add() hands to pagevec_lru_move_fn().
 *
 * Sets PG_lru on @page, chooses the list (page_lru() for an evictable page,
 * LRU_UNEVICTABLE otherwise), links the page into @lruvec and fires the
 * mm_lru_insertion tracepoint.  @arg is not used.
 */
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        enum lru_list lru;
        /* Remember (and clear) the old Unevictable flag for the vm event counts. */
        int was_unevictable = TestClearPageUnevictable(page);
        int nr_pages = thp_nr_pages(page);

        VM_BUG_ON_PAGE(PageLRU(page), page);

        /*
         * Page becomes evictable in two ways:
         * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
         * 2) Before acquiring LRU lock to put the page to correct LRU and then
         *   a) do PageLRU check with lock [check_move_unevictable_pages]
         *   b) do PageLRU check before lock [clear_page_mlock]
         *
         * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
         * following strict ordering:
         *
         * #0: __pagevec_lru_add_fn                #1: clear_page_mlock
         *
         * SetPageLRU()                                TestClearPageMlocked()
         * smp_mb() // explicit ordering        // above provides strict
         *                                        // ordering
         * PageMlocked()                        PageLRU()
         *
         *
         * if '#1' does not observe setting of PG_lru by '#0' and fails
         * isolation, the explicit barrier will make sure that page_evictable
         * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
         * can be reordered after PageMlocked check and can make '#1' to fail
         * the isolation of the page whose Mlocked bit is cleared (#0 is also
         * looking at the same page) and the evictable page will be stranded
         * in an unevictable LRU.
         */
        SetPageLRU(page);
        smp_mb__after_atomic();

        if (page_evictable(page)) {
                lru = page_lru(page);
                /* Was unevictable, is evictable now: count it as rescued. */
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
                lru = LRU_UNEVICTABLE;
                ClearPageActive(page);
                SetPageUnevictable(page);
                /* Newly unevictable: count it as culled. */
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }

        add_page_to_lru_list(page, lruvec, lru);
        trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 *
 * The work is delegated to pagevec_lru_move_fn(), which is given
 * __pagevec_lru_add_fn() as the per-page callback and NULL as its argument.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
        pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:        Where the resulting entries are placed
 * @mapping:        The address_space to search
 * @start:        The starting entry index
 * @nr_entries:        The maximum number of entries
 * @indices:        The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * Only one subpage of a Transparent Huge Page is returned in one call:
 * allowing truncate_inode_pages_range() to evict the whole THP without
 * cycling through a pagevec of extra references.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                struct address_space *mapping,
                                pgoff_t start, unsigned nr_entries,
                                pgoff_t *indices)
{
        /* find_get_entries() fills pvec->pages and @indices; its result is the count. */
        pvec->nr = find_get_entries(mapping, start, nr_entries,
                                    pvec->pages, indices);
        return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - drop non-page entries from a pagevec
 * @pvec:        The pagevec to compact
 *
 * After pagevec_lookup_entries() a pagevec may hold exceptional (value)
 * entries next to real pages.  This helper compacts @pvec in place so
 * that only the real pages remain, in their original order and with no
 * gaps, and updates the entry count to match.  The result can then be
 * handed to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
        int src;
        int kept = 0;

        for (src = 0; src < pagevec_count(pvec); src++) {
                struct page *entry = pvec->pages[src];

                /* Value entries are not pages: skip them. */
                if (xa_is_value(entry))
                        continue;

                pvec->pages[kept] = entry;
                kept++;
        }

        pvec->nr = kept;
}

/**
 * pagevec_lookup_range - gang pagecache lookup
 * @pvec:        Where the resulting pages are placed
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index
 *
 * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
 * pages in the mapping starting from index @start and up to index @end
 * (inclusive).  The pages are placed in @pvec.  pagevec_lookup_range() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages. We
 * also update @start to index the next page for the traversal.
 *
 * pagevec_lookup_range() returns the number of pages which were found. If this
 * number is smaller than PAGEVEC_SIZE, the end of specified range has been
 * reached.
 */
unsigned pagevec_lookup_range(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
        /* @start is passed by pointer so find_get_pages_range() can advance it. */
        pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
                                        pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range);

/*
 * Tagged variant of the range lookup: fills @pvec with up to PAGEVEC_SIZE
 * pages obtained from find_get_pages_range_tag() for @mapping, the range
 * [*@index, @end] and the xarray mark @tag.  @index is passed by pointer to
 * the lookup helper.  Returns the number of pages now held in @pvec.
 */
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag)
{
        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
                                        PAGEVEC_SIZE, pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_tag);

/*
 * Same lookup as pagevec_lookup_range_tag(), but the caller can ask for
 * fewer pages: the request passed to find_get_pages_range_tag() is
 * min(@max_pages, PAGEVEC_SIZE), so it never exceeds the pagevec capacity.
 * Returns the number of pages now held in @pvec.
 */
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag, unsigned max_pages)
{
        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
                min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
 * Boot-time setup for the swap system: pick the page_cluster value
 * from the amount of RAM in the machine.
 */
void __init swap_setup(void)
{
        /* Total RAM expressed in megabytes (pages >> (20 - PAGE_SHIFT)). */
        const unsigned long ram_mb = totalram_pages() >> (20 - PAGE_SHIFT);

        /*
         * Machines with less than 16MB get the smaller cluster value.
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more
         */
        page_cluster = (ram_mb < 16) ? 2 : 3;
}

#ifdef CONFIG_DEV_PAGEMAP_OPS
void put_devmap_managed_page(struct page *page)
{
        /* Only devmap-managed pages may be released through this path. */
        if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
                return;

        /*
         * devmap page refcounts are 1-based, rather than 0-based: if
         * refcount is 1, then the page is free and the refcount is
         * stable because nobody holds a reference on the page.
         */
        switch (page_ref_dec_return(page)) {
        case 1:
                free_devmap_managed_page(page);
                break;
        case 0:
                __put_page(page);
                break;
        default:
                /* Other references remain; nothing to do. */
                break;
        }
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif



























































    1 







    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NET_SCM_H
#define __LINUX_NET_SCM_H

#include <linux/limits.h>
#include <linux/net.h>
#include <linux/cred.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/sched/signal.h>

/* Well, we should have at least one descriptor open
 * to accept passed FDs 8)
 */
#define SCM_MAX_FD        253

/*
 * Credential triple stored in struct scm_cookie.  It is filled in by
 * scm_set_cred() and copied into a struct ucred for the SCM_CREDENTIALS
 * control message in scm_recv().
 */
struct scm_creds {
        u32        pid;        /* numeric pid, set from pid_vnr() in scm_set_cred() */
        kuid_t        uid;        /* kernel uid; INVALID_UID until set (see scm_send()) */
        kgid_t        gid;        /* kernel gid; INVALID_GID until set (see scm_send()) */
};

/*
 * Array of file pointers attached to a message; referenced from
 * struct scm_cookie as the "passed files".  Holds at most SCM_MAX_FD entries.
 */
struct scm_fp_list {
        short                        count;        /* presumably number of fp[] slots in use - confirm in scm.c */
        short                        max;        /* presumably capacity limit for fp[] - confirm in scm.c */
        struct user_struct        *user;        /* NOTE(review): looks like the accounting owner - confirm */
        struct file                *fp[SCM_MAX_FD];
};

/*
 * Per-message control state passed between scm_send() and scm_recv().
 * scm_send() zeroes the whole structure before filling it in.
 */
struct scm_cookie {
        struct pid                *pid;                /* Skb credentials; reference taken by scm_set_cred() */
        struct scm_fp_list        *fp;                /* Passed files                */
        struct scm_creds        creds;                /* Skb credentials        */
#ifdef CONFIG_SECURITY_NETWORK
        u32                        secid;                /* Passed security ID         */
#endif
};

void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
void __scm_destroy(struct scm_cookie *scm);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);

/*
 * Record the peer security ID for @sock in @scm->secid by calling
 * security_socket_getpeersec_dgram().  Without CONFIG_SECURITY_NETWORK
 * there is no secid field and this is an empty stub.
 */
#ifdef CONFIG_SECURITY_NETWORK
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{
        security_socket_getpeersec_dgram(sock, NULL, &scm->secid);
}
#else
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Store the given credentials in @scm.  A reference on @pid is taken via
 * get_pid() and kept in scm->pid; it is dropped again by scm_destroy_cred().
 */
static __inline__ void scm_set_cred(struct scm_cookie *scm,
                                    struct pid *pid, kuid_t uid, kgid_t gid)
{
        struct scm_creds *dst = &scm->creds;

        scm->pid = get_pid(pid);

        dst->pid = pid_vnr(pid);
        dst->uid = uid;
        dst->gid = gid;
}

/*
 * Drop the pid reference held in @scm (taken by scm_set_cred()) and clear
 * the pointer so a repeated call passes NULL to put_pid().
 */
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
        put_pid(scm->pid);
        scm->pid  = NULL;
}

/*
 * Release everything @scm holds: the pid reference always, and the
 * passed-file list (through __scm_destroy()) only when one is attached.
 */
static __inline__ void scm_destroy(struct scm_cookie *scm)
{
        scm_destroy_cred(scm);

        if (!scm->fp)
                return;

        __scm_destroy(scm);
}

/*
 * Initialise @scm for an outgoing message.
 *
 * The cookie is zeroed and its uid/gid are set to the INVALID_* markers.
 * With @forcecreds the current task's tgid, uid and gid are stored.  The
 * peer security ID is recorded, and if @msg carries control data it is
 * handed to __scm_send().
 *
 * Returns 0 when there is no control data, otherwise the result of
 * __scm_send().
 */
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
                               struct scm_cookie *scm, bool forcecreds)
{
        memset(scm, 0, sizeof(*scm));
        scm->creds.uid = INVALID_UID;
        scm->creds.gid = INVALID_GID;

        if (forcecreds)
                scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());

        unix_get_peersec_dgram(sock, scm);

        return (msg->msg_controllen > 0) ? __scm_send(sock, msg, scm) : 0;
}

#ifdef CONFIG_SECURITY_NETWORK
/*
 * If SOCK_PASSSEC is set on @sock, convert scm->secid to a security
 * context and deliver it to @msg as an SCM_SECURITY control message.
 * A failed conversion is silently skipped.
 */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{
        char *ctx;
        u32 ctx_len;

        if (!test_bit(SOCK_PASSSEC, &sock->flags))
                return;

        if (security_secid_to_secctx(scm->secid, &ctx, &ctx_len))
                return;

        put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx_len, ctx);
        security_release_secctx(ctx, ctx_len);
}

/* True when the socket asked for security data (SOCK_PASSSEC is set). */
static inline bool scm_has_secdata(struct socket *sock)
{
        return test_bit(SOCK_PASSSEC, &sock->flags);
}
#else
/* No CONFIG_SECURITY_NETWORK: there is no security context to pass. */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{ }

/* No CONFIG_SECURITY_NETWORK: never any security data. */
static inline bool scm_has_secdata(struct socket *sock)
{
        return false;
}
#endif /* CONFIG_SECURITY_NETWORK */

static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
                                struct scm_cookie *scm, int flags)
{
        if (!msg->msg_control) {
                if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp ||
                    scm_has_secdata(sock))
                        msg->msg_flags |= MSG_CTRUNC;
                scm_destroy(scm);
                return;
        }

        if (test_bit(SOCK_PASSCRED, &sock->flags)) {
                struct user_namespace *current_ns = current_user_ns();
                struct ucred ucreds = {
                        .pid = scm->creds.pid,
                        .uid = from_kuid_munged(current_ns, scm->creds.uid),
                        .gid = from_kgid_munged(current_ns, scm->creds.gid),
                };
                put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
        }

        scm_destroy_cred(scm);

        scm_passec(sock, msg, scm);

        if (!scm->fp)
                return;
        
        scm_detach_fds(msg, scm);
}


#endif /* __LINUX_NET_SCM_H */


























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);

/*
 * Copy @len bytes from @from to @to using one of three out-of-line copy
 * routines, selected by CPU feature through alternative_call_2().
 *
 * The routine's result arrives in the "a" register and is returned via a
 * 32-bit 'unsigned' local, widened to unsigned long.
 * NOTE(review): presumably the number of bytes left uncopied - confirm
 * against the assembly routines.
 */
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
{
        unsigned ret;

        /*
         * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
         * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
         * Otherwise, use copy_user_generic_unrolled.
         */
        /*
         * to/from/len are both inputs ("1", "2", "3" tie them to the outputs)
         * and outputs, so the compiler treats them as modified by the call.
         */
        alternative_call_2(copy_user_generic_unrolled,
                         copy_user_generic_string,
                         X86_FEATURE_REP_GOOD,
                         copy_user_enhanced_fast_string,
                         X86_FEATURE_ERMS,
                         ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
                                     "=d" (len)),
                         "1" (to), "2" (from), "3" (len)
                         : "memory", "rcx", "r8", "r9", "r10", "r11");
        return ret;
}

/*
 * Copy @size bytes from user pointer @src to @dst via copy_user_generic().
 * The __user annotation on @src is dropped with a __force cast.
 * NOTE: @size is unsigned long but copy_user_generic() takes 'unsigned',
 * so the length is narrowed at the call.
 */
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
        return copy_user_generic(dst, (__force void *)src, size);
}

/*
 * Copy @size bytes from @src to user pointer @dst via copy_user_generic().
 * The __user annotation on @dst is dropped with a __force cast.
 * NOTE: @size is unsigned long but copy_user_generic() takes 'unsigned',
 * so the length is narrowed at the call.
 */
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        return copy_user_generic((__force void *)dst, src, size);
}

/*
 * Copy @size bytes between two user pointers via copy_user_generic().
 * Both __user annotations are dropped with __force casts.
 * NOTE: @size is unsigned long but copy_user_generic() takes 'unsigned',
 * so the length is narrowed at the call.
 */
static __always_inline __must_check
unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
{
        return copy_user_generic((__force void *)dst,
                                 (__force void *)src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src,
                                unsigned size, int zerorest);

extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
                           size_t len);

/*
 * Copy @size bytes from user pointer @src to @dst via __copy_user_nocache(),
 * with its zerorest argument set to 0.  The destination range is first
 * reported to KASAN as a write.
 * NOTE: __copy_user_nocache() returns long; the result is narrowed to int.
 */
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
{
        kasan_check_write(dst, size);
        return __copy_user_nocache(dst, src, size, 0);
}

/*
 * Copy @size bytes from user pointer @src to @dst via
 * __copy_user_flushcache().  The destination range is first reported to
 * KASAN as a write.
 * NOTE: __copy_user_flushcache() returns long; the result is narrowed to int.
 */
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        kasan_check_write(dst, size);
        return __copy_user_flushcache(dst, src, size);
}
#endif /* _ASM_X86_UACCESS_64_H */


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM udp

#if !defined(_TRACE_UDP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_UDP_H

#include <linux/udp.h>
#include <linux/tracepoint.h>

/*
 * udp_fail_queue_rcv_skb tracepoint.
 *
 * Takes a return code @rc and a socket @sk.  The record stores @rc and the
 * socket's inet_num (as a 16-bit "lport") and is printed as
 * "rc=<rc> port=<lport>".
 */
TRACE_EVENT(udp_fail_queue_rcv_skb,

        TP_PROTO(int rc, struct sock *sk),

        TP_ARGS(rc, sk),

        TP_STRUCT__entry(
                __field(int, rc)
                __field(__u16, lport)
        ),

        TP_fast_assign(
                __entry->rc = rc;
                __entry->lport = inet_sk(sk)->inet_num;
        ),

        TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport)
);

#endif /* _TRACE_UDP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































    1 




    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Christoph Hellwig.
 *
 * DMA operations that map physical memory directly without using an IOMMU.
 */
#ifndef _KERNEL_DMA_DIRECT_H
#define _KERNEL_DMA_DIRECT_H

#include <linux/dma-direct.h>

int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs);
bool dma_direct_can_mmap(struct device *dev);
int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs);
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
                enum dma_data_direction dir, unsigned long attrs);
size_t dma_direct_max_mapping_size(struct device *dev);

/*
 * dma_direct_sync_sg_for_device() is an out-of-line function only when
 * CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE or CONFIG_SWIOTLB is enabled;
 * otherwise it is an empty inline stub.
 */
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_SWIOTLB)
void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
                int nents, enum dma_data_direction dir);
#else
static inline void dma_direct_sync_sg_for_device(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
}
#endif

/*
 * dma_direct_unmap_sg() and dma_direct_sync_sg_for_cpu() are out-of-line
 * functions only when CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU,
 * CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL or CONFIG_SWIOTLB is enabled;
 * otherwise both are empty inline stubs.
 */
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
    defined(CONFIG_SWIOTLB)
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
                int nents, enum dma_data_direction dir, unsigned long attrs);
void dma_direct_sync_sg_for_cpu(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir);
#else
static inline void dma_direct_unmap_sg(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir,
                unsigned long attrs)
{
}
static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
                struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
}
#endif

/*
 * Make a single direct mapping at DMA address @addr (@size bytes, direction
 * @dir) ready for device access.
 */
static inline void dma_direct_sync_single_for_device(struct device *dev,
                dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
        phys_addr_t paddr = dma_to_phys(dev, addr);

        /* Address lies in the swiotlb bounce buffer: sync it for the device. */
        if (unlikely(is_swiotlb_buffer(paddr)))
                swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);

        /* Non-coherent device: run the arch sync-for-device hook as well. */
        if (!dev_is_dma_coherent(dev))
                arch_sync_dma_for_device(paddr, size, dir);
}

/*
 * Make a single direct mapping at DMA address @addr (@size bytes, direction
 * @dir) ready for CPU access.  The arch hooks run before the swiotlb sync,
 * the reverse of the order used in dma_direct_sync_single_for_device().
 */
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
                dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
        phys_addr_t paddr = dma_to_phys(dev, addr);

        /* Non-coherent device: run both arch sync-for-cpu hooks first. */
        if (!dev_is_dma_coherent(dev)) {
                arch_sync_dma_for_cpu(paddr, size, dir);
                arch_sync_dma_for_cpu_all();
        }

        /* Address lies in the swiotlb bounce buffer: sync it for the CPU. */
        if (unlikely(is_swiotlb_buffer(paddr)))
                swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);

        /* Only device-to-memory transfers are passed to arch_dma_mark_clean(). */
        if (dir == DMA_FROM_DEVICE)
                arch_dma_mark_clean(paddr, size);
}

/*
 * Map @size bytes at @offset within @page for DMA by @dev.
 *
 * The mapping is bounced through swiotlb_map() when swiotlb is forced, or
 * when the device cannot address the buffer and bouncing is not disabled.
 * If the device cannot address the buffer and bouncing is disabled, a
 * one-time warning is printed and DMA_MAPPING_ERROR is returned.  Otherwise
 * the direct DMA address is returned, after an arch sync for non-coherent
 * devices unless DMA_ATTR_SKIP_CPU_SYNC is set in @attrs.
 */
static inline dma_addr_t dma_direct_map_page(struct device *dev,
                struct page *page, unsigned long offset, size_t size,
                enum dma_data_direction dir, unsigned long attrs)
{
        phys_addr_t paddr = page_to_phys(page) + offset;
        dma_addr_t bus_addr = phys_to_dma(dev, paddr);

        if (unlikely(swiotlb_force == SWIOTLB_FORCE))
                goto bounce;

        if (likely(dma_capable(dev, bus_addr, size, true))) {
                /* Directly addressable: sync for the device if needed. */
                if (!(dev_is_dma_coherent(dev) ||
                      (attrs & DMA_ATTR_SKIP_CPU_SYNC)))
                        arch_sync_dma_for_device(paddr, size, dir);
                return bus_addr;
        }

        if (swiotlb_force == SWIOTLB_NO_FORCE) {
                /* Not addressable and bouncing is disabled: report and fail. */
                dev_WARN_ONCE(dev, 1,
                             "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
                             &bus_addr, size, *dev->dma_mask, dev->bus_dma_limit);
                return DMA_MAPPING_ERROR;
        }

bounce:
        return swiotlb_map(dev, paddr, size, dir, attrs);
}

/*
 * Undo a mapping made by the direct-mapping path: sync the buffer for the
 * CPU unless DMA_ATTR_SKIP_CPU_SYNC is set, then release the swiotlb slot
 * if the address lies in the bounce buffer.
 */
static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
                size_t size, enum dma_data_direction dir, unsigned long attrs)
{
        phys_addr_t phys = dma_to_phys(dev, addr);

        if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
                dma_direct_sync_single_for_cpu(dev, addr, size, dir);

        /*
         * DMA_ATTR_SKIP_CPU_SYNC is always passed on to the swiotlb unmap;
         * any CPU sync the caller wanted was already done above.
         */
        if (unlikely(is_swiotlb_buffer(phys)))
                swiotlb_tbl_unmap_single(dev, phys, size, size, dir,
                                         attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__

#include <linux/rhashtable-types.h>
#include <linux/completion.h>

/* Per netns frag queues directory */
struct fqdir {
        /* sysctls */
        long                        high_thresh;        /* zeroed by fqdir_pre_exit() to stop new frags */
        long                        low_thresh;
        int                        timeout;
        int                        max_dist;
        struct inet_frags        *f;                /* owning struct inet_frags */
        struct net                *net;                /* owning network namespace */
        bool                        dead;                /* set by fqdir_pre_exit() */

        struct rhashtable       rhashtable ____cacheline_aligned_in_smp;

        /* Keep atomic mem on separate cachelines in structs that include it */
        atomic_long_t                mem ____cacheline_aligned_in_smp;        /* see frag_mem_limit() and friends */
        struct work_struct        destroy_work;
};

/**
 * fragment queue flags
 *
 * @INET_FRAG_FIRST_IN: first fragment has arrived
 * @INET_FRAG_LAST_IN: final fragment has arrived
 * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
 * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
 *
 * Each value is a distinct single bit, so the flags can be OR-ed together.
 * They are kept in the 8-bit inet_frag_queue::flags field.
 */
enum {
        INET_FRAG_FIRST_IN        = BIT(0),
        INET_FRAG_LAST_IN        = BIT(1),
        INET_FRAG_COMPLETE        = BIT(2),
        INET_FRAG_HASH_DEAD        = BIT(3),
};

/*
 * IPv4 member of the inet_frag_queue::key union: the fields that identify
 * an IPv4 fragment queue.
 */
struct frag_v4_compare_key {
        __be32                saddr;                /* source address (big endian) */
        __be32                daddr;                /* destination address (big endian) */
        u32                user;
        u32                vif;
        __be16                id;                /* big-endian 16-bit id */
        u16                protocol;
};

/*
 * IPv6 member of the inet_frag_queue::key union: the fields that identify
 * an IPv6 fragment queue.
 */
struct frag_v6_compare_key {
        struct in6_addr        saddr;                /* source address */
        struct in6_addr        daddr;                /* destination address */
        u32                user;
        __be32                id;                /* big-endian 32-bit id */
        u32                iif;
};

/**
 * struct inet_frag_queue - fragment queue
 *
 * @node: rhash node
 * @key: keys identifying this frag.
 * @timer: queue expiration timer
 * @lock: spinlock protecting this frag
 * @refcnt: reference count of the queue
 * @rb_fragments: received fragments rb-tree root
 * @fragments_tail: received fragments tail
 * @last_run_head: the head of the last "run". see ip_fragment.c
 * @stamp: timestamp of the last received fragment
 * @len: total length of the original datagram
 * @meat: length of received fragments so far
 * @flags: fragment queue flags
 * @max_size: maximum received fragment size
 * @fqdir: pointer to struct fqdir
 * @rcu: rcu head for deferred freeing
 */
struct inet_frag_queue {
        struct rhash_head        node;
        /* Only one of the two key layouts is in use for a given queue. */
        union {
                struct frag_v4_compare_key v4;
                struct frag_v6_compare_key v6;
        } key;
        struct timer_list        timer;
        spinlock_t                lock;
        refcount_t                refcnt;
        struct rb_root                rb_fragments;
        struct sk_buff                *fragments_tail;
        struct sk_buff                *last_run_head;
        ktime_t                        stamp;
        int                        len;
        int                        meat;
        __u8                        flags;
        u16                        max_size;
        struct fqdir                *fqdir;
        struct rcu_head                rcu;
};

/*
 * Descriptor that struct fqdir points to through its ->f member.  It groups
 * the queue callbacks, the slab cache and the rhashtable parameters, and
 * carries its own refcount and completion.
 */
struct inet_frags {
        unsigned int                qsize;        /* presumably size of one queue object - confirm */

        /* Callbacks operating on a queue; @arg of constructor is caller-defined. */
        void                        (*constructor)(struct inet_frag_queue *q,
                                               const void *arg);
        void                        (*destructor)(struct inet_frag_queue *);
        void                        (*frag_expire)(struct timer_list *t);
        struct kmem_cache        *frags_cachep;
        const char                *frags_cache_name;
        struct rhashtable_params rhash_params;
        refcount_t                refcnt;
        struct completion        completion;
};

int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);

/*
 * First teardown step for @fqdir: zero high_thresh and set the dead flag.
 * Both stores use WRITE_ONCE() because the fields are read with READ_ONCE()
 * elsewhere, as the comments below note.
 */
static inline void fqdir_pre_exit(struct fqdir *fqdir)
{
        /* Prevent creation of new frags.
         * Pairs with READ_ONCE() in inet_frag_find().
         */
        WRITE_ONCE(fqdir->high_thresh, 0);

        /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
         * and ip6frag_expire_frag_queue().
         */
        WRITE_ONCE(fqdir->dead, true);
}
void fqdir_exit(struct fqdir *fqdir);

void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);

/* Free all skbs in the queue; return the sum of their truesizes. */
unsigned int inet_frag_rbtree_purge(struct rb_root *root);

/*
 * Drop one reference on @q.  The caller that drops the last reference
 * destroys the queue via inet_frag_destroy().
 */
static inline void inet_frag_put(struct inet_frag_queue *q)
{
        if (!refcount_dec_and_test(&q->refcnt))
                return;

        inet_frag_destroy(q);
}

/* Memory Tracking Functions. */

/* Return the current value of @fqdir's atomic memory counter. */
static inline long frag_mem_limit(const struct fqdir *fqdir)
{
        return atomic_long_read(&fqdir->mem);
}

/* Atomically subtract @val from @fqdir's memory counter. */
static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val)
{
        atomic_long_sub(val, &fqdir->mem);
}

/* Atomically add @val to @fqdir's memory counter. */
static inline void add_frag_mem_limit(struct fqdir *fqdir, long val)
{
        atomic_long_add(val, &fqdir->mem);
}

/* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 */
#define        IPFRAG_ECN_NOT_ECT        0x01 /* one frag had ECN_NOT_ECT */
#define        IPFRAG_ECN_ECT_1        0x02 /* one frag had ECN_ECT_1 */
#define        IPFRAG_ECN_ECT_0        0x04 /* one frag had ECN_ECT_0 */
#define        IPFRAG_ECN_CE                0x08 /* one frag had ECN_CE */

extern const u8 ip_frag_ecn_table[16];

/* Return values of inet_frag_queue_insert() */
#define IPFRAG_OK        0
#define IPFRAG_DUP        1
#define IPFRAG_OVERLAP        2
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end);
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce);
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q);

#endif





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * mac80211 <-> driver interface
 *
 * Copyright 2002-2005, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2020 Intel Corporation
 */

#ifndef MAC80211_H
#define MAC80211_H

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/ieee80211.h>
#include <net/cfg80211.h>
#include <net/codel.h>
#include <net/ieee80211_radiotap.h>
#include <asm/unaligned.h>

/**
 * DOC: Introduction
 *
 * mac80211 is the Linux stack for 802.11 hardware that implements
 * only partial functionality in hard- or firmware. This document
 * defines the interface between mac80211 and low-level hardware
 * drivers.
 */

/**
 * DOC: Calling mac80211 from interrupts
 *
 * Only ieee80211_tx_status_irqsafe() and ieee80211_rx_irqsafe() can be
 * called in hardware interrupt context. The low-level driver must not call any
 * other functions in hardware interrupt context. If there is a need for such
 * call, the low-level driver should first ACK the interrupt and perform the
 * IEEE 802.11 code call after this, e.g. from a scheduled workqueue or even
 * tasklet function.
 *
 * NOTE: If the driver opts to use the _irqsafe() functions, it may not also
 *         use the non-IRQ-safe functions!
 */

/**
 * DOC: Warning
 *
 * If you're reading this document and not the header file itself, it will
 * be incomplete because not all documentation has been converted yet.
 */

/**
 * DOC: Frame format
 *
 * As a general rule, when frames are passed between mac80211 and the driver,
 * they start with the IEEE 802.11 header and include the same octets that are
 * sent over the air except for the FCS which should be calculated by the
 * hardware.
 *
 * There are, however, various exceptions to this rule for advanced features:
 *
 * The first exception is for hardware encryption and decryption offload
 * where the IV/ICV may or may not be generated in hardware.
 *
 * Secondly, when the hardware handles fragmentation, the frame handed to
 * the driver from mac80211 is the MSDU, not the MPDU.
 */

/**
 * DOC: mac80211 workqueue
 *
 * mac80211 provides its own workqueue for drivers and internal mac80211 use.
 * The workqueue is a single threaded workqueue and can only be accessed by
 * helpers for sanity checking. Drivers must ensure that all work added onto
 * the mac80211 workqueue is cancelled in the driver stop() callback.
 *
 * mac80211 will flush the workqueue upon interface removal and during
 * suspend.
 *
 * All work performed on the mac80211 workqueue must not acquire the RTNL lock.
 *
 */

/**
 * DOC: mac80211 software tx queueing
 *
 * mac80211 provides an optional intermediate queueing implementation designed
 * to allow the driver to keep hardware queues short and provide some fairness
 * between different stations/interfaces.
 * In this model, the driver pulls data frames from the mac80211 queue instead
 * of letting mac80211 push them via drv_tx().
 * Other frames (e.g. control or management) are still pushed using drv_tx().
 *
 * Drivers indicate that they use this model by implementing the .wake_tx_queue
 * driver operation.
 *
 * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with
 * another per-sta for non-data/non-mgmt and bufferable management frames, and
 * a single per-vif queue for multicast data frames.
 *
 * The driver is expected to initialize its private per-queue data for stations
 * and interfaces in the .add_interface and .sta_add ops.
 *
 * The driver can't access the queue directly. To dequeue a frame from a
 * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a
 * queue, it calls the .wake_tx_queue driver op.
 *
 * Drivers can optionally delegate responsibility for scheduling queues to
 * mac80211, to take advantage of airtime fairness accounting. In this case, to
 * obtain the next queue to pull frames from, the driver calls
 * ieee80211_next_txq(). The driver is then expected to return the txq using
 * ieee80211_return_txq().
 *
 * For AP powersave TIM handling, the driver only needs to indicate if it has
 * buffered packets in the driver specific data structures by calling
 * ieee80211_sta_set_buffered(). For frames buffered in the ieee80211_txq
 * struct, mac80211 sets the appropriate TIM PVB bits and calls
 * .release_buffered_frames().
 * In that callback the driver is therefore expected to release its own
 * buffered frames and afterwards also frames from the ieee80211_txq (obtained
 * via the usual ieee80211_tx_dequeue).
 */

/* Forward declaration only: struct device stays an incomplete type here. */
struct device;

/**
 * enum ieee80211_max_queues - maximum number of queues
 *
 * @IEEE80211_MAX_QUEUES: Maximum number of regular device queues.
 * @IEEE80211_MAX_QUEUE_MAP: bitmap with maximum queues set. It is computed
 *        from %IEEE80211_MAX_QUEUES, so the two values cannot get out of sync.
 */
enum ieee80211_max_queues {
        IEEE80211_MAX_QUEUES =                16,
        IEEE80211_MAX_QUEUE_MAP =        BIT(IEEE80211_MAX_QUEUES) - 1,
};

/*
 * Hardware queue number that does not refer to a valid queue; 0xff lies
 * outside the 0..IEEE80211_MAX_QUEUES-1 range of regular queue numbers.
 * NOTE(review): the meaning is inferred from the name, the users of this
 * constant are not visible here -- confirm before relying on it.
 */
#define IEEE80211_INVAL_HW_QUEUE        0xff

/**
 * enum ieee80211_ac_numbers - AC numbers as used in mac80211
 *
 * The numeric values are assigned explicitly, from 0 for voice up to 3 for
 * background.
 *
 * @IEEE80211_AC_VO: voice (0)
 * @IEEE80211_AC_VI: video (1)
 * @IEEE80211_AC_BE: best effort (2)
 * @IEEE80211_AC_BK: background (3)
 */
enum ieee80211_ac_numbers {
        IEEE80211_AC_VO = 0,
        IEEE80211_AC_VI = 1,
        IEEE80211_AC_BE = 2,
        IEEE80211_AC_BK = 3,
};

/**
 * struct ieee80211_tx_queue_params - transmit queue configuration
 *
 * The information provided in this structure is required for QoS
 * transmit queue configuration. Cf. IEEE 802.11 7.3.2.29.
 *
 * @aifs: arbitration interframe space [0..255]
 * @cw_min: minimum contention window [a value of the form
 *        2^n-1 in the range 1..32767]
 * @cw_max: maximum contention window [like @cw_min]
 * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled
 * @acm: whether mandatory admission control is required for the access
 *        category
 * @uapsd: whether U-APSD mode is enabled for the queue
 * @mu_edca: whether MU EDCA is configured
 * @mu_edca_param_rec: MU EDCA Parameter Record for HE
 */
struct ieee80211_tx_queue_params {
        u16 txop;
        u16 cw_min;
        u16 cw_max;
        u8 aifs;
        bool acm;
        bool uapsd;
        bool mu_edca;
        struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec;
};

/**
 * struct ieee80211_low_level_stats - low level frame counters
 *
 * NOTE(review): the per-field descriptions below are derived from the field
 * names only (they look like IEEE 802.11 "dot11" MIB counter names); the code
 * that fills them in is not visible here, so confirm before relying on them.
 *
 * @dot11ACKFailureCount: presumably the number of expected ACKs that were
 *        not received
 * @dot11RTSFailureCount: presumably the number of RTS frames that were not
 *        answered by a CTS
 * @dot11FCSErrorCount: presumably the number of received frames with an
 *        FCS error
 * @dot11RTSSuccessCount: presumably the number of RTS frames that were
 *        answered by a CTS
 */
struct ieee80211_low_level_stats {
        unsigned int dot11ACKFailureCount;
        unsigned int dot11RTSFailureCount;
        unsigned int dot11FCSErrorCount;
        unsigned int dot11RTSSuccessCount;
};

/**
 * enum ieee80211_chanctx_change - change flag for channel context
 *
 * Each flag is defined with BIT() on its own bit number (0 to 4).
 *
 * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed
 * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed
 * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed
 * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel,
 *        this is used only with channel switching with CSA
 * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed
 */
enum ieee80211_chanctx_change {
        IEEE80211_CHANCTX_CHANGE_WIDTH                = BIT(0),
        IEEE80211_CHANCTX_CHANGE_RX_CHAINS        = BIT(1),
        IEEE80211_CHANCTX_CHANGE_RADAR                = BIT(2),
        IEEE80211_CHANCTX_CHANGE_CHANNEL        = BIT(3),
        IEEE80211_CHANCTX_CHANGE_MIN_WIDTH        = BIT(4),
};

/**
 * struct ieee80211_chanctx_conf - channel context that vifs may be tuned to
 *
 * This is the driver-visible part. The ieee80211_chanctx
 * that contains it is visible in mac80211 only.
 *
 * @def: the channel definition
 * @min_def: the minimum channel definition currently required.
 * @rx_chains_static: The number of RX chains that must always be
 *        active on the channel to receive MIMO transmissions
 * @rx_chains_dynamic: The number of RX chains that must be enabled
 *        after RTS/CTS handshake to receive SMPS MIMO transmissions;
 *        this will always be >= @rx_chains_static.
 * @radar_enabled: whether radar detection is enabled on this channel.
 * @drv_priv: data area for driver use, will always be aligned to
 *        sizeof(void *), size is determined in hw information. This is a
 *        flexible array member, so it has to stay the last member of the
 *        structure.
 */
struct ieee80211_chanctx_conf {
        struct cfg80211_chan_def def;
        struct cfg80211_chan_def min_def;

        u8 rx_chains_static, rx_chains_dynamic;

        bool radar_enabled;

        u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_chanctx_switch_mode - channel context switch mode
 * @CHANCTX_SWMODE_REASSIGN_VIF: Both the old and the new context exist
 *        already and both keep existing; only the virtual interface has to
 *        be moved from the old one to the new one.
 * @CHANCTX_SWMODE_SWAP_CONTEXTS: The old context exists but ceases to exist
 *        with this call, while the new context does not exist yet but will
 *        be active after this call; the virtual interface switches from the
 *        old to the new one. (The driver may of course implement this as an
 *        on-the-fly chandef switch of the existing hardware context, but the
 *        mac80211 pointer for the old context will cease to exist and only
 *        the new one will later be used for changes/removal.)
 */
enum ieee80211_chanctx_switch_mode {
        CHANCTX_SWMODE_REASSIGN_VIF        = 0,
        CHANCTX_SWMODE_SWAP_CONTEXTS        = 1,
};

/**
 * struct ieee80211_vif_chanctx_switch - vif chanctx switch information
 *
 * This structure is used to pass information about a vif that
 * needs to switch from one chanctx to another.  The
 * &enum ieee80211_chanctx_switch_mode defines how the switch should be
 * done.
 *
 * @vif: the vif that should be switched from old_ctx to new_ctx
 * @old_ctx: the old context to which the vif was assigned
 * @new_ctx: the new context to which the vif must be assigned
 */
struct ieee80211_vif_chanctx_switch {
        struct ieee80211_vif *vif;
        struct ieee80211_chanctx_conf *old_ctx;
        struct ieee80211_chanctx_conf *new_ctx;
};

/**
 * enum ieee80211_bss_change - BSS change notification flags
 *
 * These flags are used with the bss_info_changed() callback
 * to indicate which BSS parameter changed.
 *
 * Every flag occupies its own bit; bits 0 to 31 are all in use.
 *
 * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated),
 *        also implies a change in the AID.
 * @BSS_CHANGED_ERP_CTS_PROT: CTS protection changed
 * @BSS_CHANGED_ERP_PREAMBLE: preamble changed
 * @BSS_CHANGED_ERP_SLOT: slot timing changed
 * @BSS_CHANGED_HT: 802.11n parameters changed
 * @BSS_CHANGED_BASIC_RATES: Basic rateset changed
 * @BSS_CHANGED_BEACON_INT: Beacon interval changed
 * @BSS_CHANGED_BSSID: BSSID changed, for whatever
 *        reason (IBSS and managed mode)
 * @BSS_CHANGED_BEACON: Beacon data changed, retrieve
 *        new beacon (beaconing modes)
 * @BSS_CHANGED_BEACON_ENABLED: Beaconing should be
 *        enabled/disabled (beaconing modes)
 * @BSS_CHANGED_CQM: Connection quality monitor config changed
 * @BSS_CHANGED_IBSS: IBSS join status changed
 * @BSS_CHANGED_ARP_FILTER: Hardware ARP filter address list or state changed.
 * @BSS_CHANGED_QOS: QoS for this association was enabled/disabled. Note
 *        that it is only ever disabled for station mode.
 * @BSS_CHANGED_IDLE: Idle changed for this BSS/interface.
 * @BSS_CHANGED_SSID: SSID changed for this BSS (AP and IBSS mode)
 * @BSS_CHANGED_AP_PROBE_RESP: Probe Response changed for this BSS (AP mode)
 * @BSS_CHANGED_PS: PS changed for this BSS (STA mode)
 * @BSS_CHANGED_TXPOWER: TX power setting changed for this interface
 * @BSS_CHANGED_P2P_PS: P2P powersave settings (CTWindow, opportunistic PS)
 *        changed
 * @BSS_CHANGED_BEACON_INFO: Data from the AP's beacon became available:
 *        currently dtim_period only is under consideration.
 * @BSS_CHANGED_BANDWIDTH: The bandwidth used by this interface changed,
 *        note that this is only called when it changes after the channel
 *        context had been assigned.
 * @BSS_CHANGED_OCB: OCB join status changed
 * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed
 * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected
 *        keep alive) changed.
 * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface
 * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder
 *        functionality changed for this BSS (AP mode).
 * @BSS_CHANGED_TWT: TWT status changed
 * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed.
 * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed
 * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed.
 * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response
 *        status changed.
 *
 */
enum ieee80211_bss_change {
        BSS_CHANGED_ASSOC                = 1<<0,
        BSS_CHANGED_ERP_CTS_PROT        = 1<<1,
        BSS_CHANGED_ERP_PREAMBLE        = 1<<2,
        BSS_CHANGED_ERP_SLOT                = 1<<3,
        BSS_CHANGED_HT                        = 1<<4,
        BSS_CHANGED_BASIC_RATES                = 1<<5,
        BSS_CHANGED_BEACON_INT                = 1<<6,
        BSS_CHANGED_BSSID                = 1<<7,
        BSS_CHANGED_BEACON                = 1<<8,
        BSS_CHANGED_BEACON_ENABLED        = 1<<9,
        BSS_CHANGED_CQM                        = 1<<10,
        BSS_CHANGED_IBSS                = 1<<11,
        BSS_CHANGED_ARP_FILTER                = 1<<12,
        BSS_CHANGED_QOS                        = 1<<13,
        BSS_CHANGED_IDLE                = 1<<14,
        BSS_CHANGED_SSID                = 1<<15,
        BSS_CHANGED_AP_PROBE_RESP        = 1<<16,
        BSS_CHANGED_PS                        = 1<<17,
        BSS_CHANGED_TXPOWER                = 1<<18,
        BSS_CHANGED_P2P_PS                = 1<<19,
        BSS_CHANGED_BEACON_INFO                = 1<<20,
        BSS_CHANGED_BANDWIDTH                = 1<<21,
        BSS_CHANGED_OCB                 = 1<<22,
        BSS_CHANGED_MU_GROUPS                = 1<<23,
        BSS_CHANGED_KEEP_ALIVE                = 1<<24,
        BSS_CHANGED_MCAST_RATE                = 1<<25,
        BSS_CHANGED_FTM_RESPONDER        = 1<<26,
        BSS_CHANGED_TWT                        = 1<<27,
        BSS_CHANGED_HE_OBSS_PD                = 1<<28,
        BSS_CHANGED_HE_BSS_COLOR        = 1<<29,
        BSS_CHANGED_FILS_DISCOVERY      = 1<<30,
        /*
         * Unsigned on purpose: shifting a signed 1 into bit 31 overflows
         * int and would yield a negative value that sign-extends when the
         * flag is widened to a 64-bit type.
         */
        BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1U<<31,

        /* when adding here, make sure to change ieee80211_reconfig */
};

/*
 * The maximum number of IPv4 addresses listed for ARP filtering. If the number
 * of addresses for an interface increases beyond this value, hardware ARP
 * filtering will be disabled.
 */
#define IEEE80211_BSS_ARP_ADDR_LIST_LEN 4

/**
 * enum ieee80211_event_type - event to be notified to the low level driver
 * @RSSI_EVENT: AP's rssi crossed a threshold set by the driver.
 * @MLME_EVENT: event related to MLME
 * @BAR_RX_EVENT: a BAR was received
 * @BA_FRAME_TIMEOUT: Frames were released from the reordering buffer because
 *        they timed out. This is not reported for every released frame, but
 *        only once each time the timeout triggers.
 */
enum ieee80211_event_type {
        RSSI_EVENT                = 0,
        MLME_EVENT                = 1,
        BAR_RX_EVENT                = 2,
        BA_FRAME_TIMEOUT        = 3,
};

/**
 * enum ieee80211_rssi_event_data - relevant when event type is %RSSI_EVENT
 * @RSSI_EVENT_HIGH: AP's rssi went below the threshold set by the driver.
 * @RSSI_EVENT_LOW: AP's rssi went above the threshold set by the driver.
 *
 * NOTE(review): "below"/"above" look swapped with respect to the HIGH/LOW
 * names. The code raising these events is not visible here; confirm the
 * direction before relying on either description.
 */
enum ieee80211_rssi_event_data {
        RSSI_EVENT_HIGH        = 0,
        RSSI_EVENT_LOW        = 1,
};

/**
 * struct ieee80211_rssi_event - data attached to an %RSSI_EVENT
 * @data: which RSSI event is being reported.
 *        See &enum ieee80211_rssi_event_data
 */
struct ieee80211_rssi_event {
        enum ieee80211_rssi_event_data data;
};

/**
 * enum ieee80211_mlme_event_data - relevant when event type is %MLME_EVENT
 * @AUTH_EVENT: the MLME operation is authentication
 * @ASSOC_EVENT: the MLME operation is association
 * @DEAUTH_RX_EVENT: deauth received.
 * @DEAUTH_TX_EVENT: deauth sent.
 */
enum ieee80211_mlme_event_data {
        AUTH_EVENT        = 0,
        ASSOC_EVENT        = 1,
        DEAUTH_RX_EVENT        = 2,
        DEAUTH_TX_EVENT        = 3,
};

/**
 * enum ieee80211_mlme_event_status - relevant when event type is %MLME_EVENT
 * @MLME_SUCCESS: the MLME operation completed successfully.
 * @MLME_DENIED: the peer denied the MLME operation.
 * @MLME_TIMEOUT: the MLME operation timed out.
 */
enum ieee80211_mlme_event_status {
        MLME_SUCCESS        = 0,
        MLME_DENIED        = 1,
        MLME_TIMEOUT        = 2,
};

/**
 * struct ieee80211_mlme_event - data attached to an %MLME_EVENT
 * @data: the MLME operation the event is about.
 *        See &enum ieee80211_mlme_event_data
 * @status: the outcome of that operation.
 *        See &enum ieee80211_mlme_event_status
 * @reason: the reason code if applicable
 */
struct ieee80211_mlme_event {
        enum ieee80211_mlme_event_data data;
        enum ieee80211_mlme_event_status status;
        u16 reason;
};

/**
 * struct ieee80211_ba_event - data attached for BlockAck related events
 * @sta: pointer to the &ieee80211_sta to which this event relates
 * @tid: the TID to which this event relates
 * @ssn: the starting sequence number (for %BAR_RX_EVENT)
 */
struct ieee80211_ba_event {
        struct ieee80211_sta *sta;
        u16 tid;
        u16 ssn;
};

/**
 * struct ieee80211_event - event to be sent to the driver
 * @type: The event itself. See &enum ieee80211_event_type.
 * @rssi: relevant if @type is %RSSI_EVENT
 * @mlme: relevant if @type is %MLME_EVENT
 * @ba: relevant if @type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT
 * @u: union holding the fields above; @type tells which member is relevant
 */
struct ieee80211_event {
        enum ieee80211_event_type type;
        union {
                struct ieee80211_rssi_event rssi;
                struct ieee80211_mlme_event mlme;
                struct ieee80211_ba_event ba;
        } u;
};

/**
 * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data
 *
 * This structure describes the group id data of VHT MU-MIMO
 *
 * @membership: 64 bits array - a bit is set if station is member of the group;
 *        stored as %WLAN_MEMBERSHIP_LEN bytes
 * @position: 2 bits per group id indicating the position in the group;
 *        stored as %WLAN_USER_POSITION_LEN bytes
 */
struct ieee80211_mu_group_data {
        u8 membership[WLAN_MEMBERSHIP_LEN];
        u8 position[WLAN_USER_POSITION_LEN];
};

/**
 * struct ieee80211_ftm_responder_params - FTM responder parameters
 *
 * Both buffers are referenced through const pointers, each paired with its
 * own length field.
 *
 * @lci: LCI subelement content
 * @civicloc: CIVIC location subelement content
 * @lci_len: LCI data length (length of @lci)
 * @civicloc_len: Civic data length (length of @civicloc)
 */
struct ieee80211_ftm_responder_params {
        const u8 *lci;
        const u8 *civicloc;
        size_t lci_len;
        size_t civicloc_len;
};

/**
 * struct ieee80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * Both intervals are expressed in TUs.
 *
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 */
struct ieee80211_fils_discovery {
        u32 min_interval;
        u32 max_interval;
};

/**
 * struct ieee80211_bss_conf - holds the BSS's changing parameters
 *
 * This structure keeps information about a BSS (and an association
 * to that BSS) that can change during the lifetime of the BSS.
 *
 * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
 * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK
 * @uora_exists: is the UORA element advertised by AP
 * @ack_enabled: indicates support to receive a multi-TID that solicits either
 *        ACK, BACK or both
 * @uora_ocw_range: UORA element's OCW Range field
 * @frame_time_rts_th: HE duration RTS threshold, in units of 32us
 * @he_support: does this BSS support HE
 * @twt_requester: does this BSS support TWT requester (relevant for managed
 *        mode only, set if the AP advertises TWT responder role)
 * @twt_responder: does this BSS support TWT requester (relevant for managed
 *        mode only, set if the AP advertises TWT responder role)
 * @twt_protected: does this BSS support protected TWT frames
 * @assoc: association status
 * @ibss_joined: indicates whether this station is part of an IBSS
 *        or not
 * @ibss_creator: indicates if a new IBSS network is being created
 * @aid: association ID number, valid only when @assoc is true
 * @use_cts_prot: use CTS protection
 * @use_short_preamble: use 802.11b short preamble
 * @use_short_slot: use short slot time (only relevant for ERP)
 * @dtim_period: num of beacons before the next DTIM, for beaconing,
 *        valid in station mode only if after the driver was notified
 *        with the %BSS_CHANGED_BEACON_INFO flag, will be non-zero then.
 * @sync_tsf: last beacon's/probe response's TSF timestamp (could be old
 *        as it may have been received during scanning long ago). If the
 *        HW flag %IEEE80211_HW_TIMING_BEACON_ONLY is set, then this can
 *        only come from a beacon, but might not become valid until after
 *        association when a beacon is received (which is notified with the
 *        %BSS_CHANGED_BEACON_INFO flag). See also sync_dtim_count important
 *        notice.
 * @sync_device_ts: the device timestamp corresponding to the sync_tsf,
 *        the driver/device can use this to calculate synchronisation
 *        (see @sync_tsf). See also sync_dtim_count important notice.
 * @sync_dtim_count: Only valid when %IEEE80211_HW_TIMING_BEACON_ONLY
 *        is requested, see @sync_tsf/@sync_device_ts.
 *        IMPORTANT: These three sync_* parameters would possibly be out of sync
 *        by the time the driver will use them. The synchronized view is currently
 *        guaranteed only in certain callbacks.
 * @beacon_int: beacon interval
 * @assoc_capability: capabilities taken from assoc resp
 * @basic_rates: bitmap of basic rates, each bit stands for an
 *        index into the rate table configured by the driver in
 *        the current band.
 * @beacon_rate: associated AP's beacon TX rate
 * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
 * @bssid: The BSSID for this BSS
 * @enable_beacon: whether beaconing should be enabled or not
 * @chandef: Channel definition for this BSS -- the hardware might be
 *        configured for a higher bandwidth than this BSS uses, for example.
 * @mu_group: VHT MU-MIMO group membership data
 * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
 *        This field is only valid when the channel is a wide HT/VHT channel.
 *        Note that with TDLS this can be the case (channel is HT, protection must
 *        be used from this field) even when the BSS association isn't using HT.
 * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value
 *        implies disabled. As with the cfg80211 callback, a change here should
 *        cause an event to be sent indicating where the current value is in
 *        relation to the newly configured threshold.
 * @cqm_rssi_low: Connection quality monitor RSSI lower threshold, a zero value
 *        implies disabled.  This is an alternative mechanism to the single
 *        threshold event and can't be enabled simultaneously with it.
 * @cqm_rssi_high: Connection quality monitor RSSI upper threshold.
 * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
 * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
 *        driver may filter ARP queries targeted for other addresses than those
 *        listed here. The driver must allow ARP queries targeted for all
 *        addresses listed here to pass through. An empty list implies no ARP
 *        queries need to pass.
 * @arp_addr_cnt: Number of addresses currently on the list. Note that this
 *        may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list
 *        array size), it's up to the driver what to do in that case.
 * @qos: This is a QoS-enabled BSS.
 * @idle: This interface is idle. There's also a global idle flag in the
 *        hardware config which may be more appropriate depending on what
 *        your driver/device needs to do.
 * @ps: power-save mode (STA only). This flag is NOT affected by
 *        offchannel/dynamic_ps operations.
 * @ssid: The SSID of the current vif. Valid in AP and IBSS mode.
 * @ssid_len: Length of SSID given in @ssid.
 * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode.
 * @txpower: TX power in dBm.  INT_MIN means not configured.
 * @txpower_type: TX power adjustment used to control per packet Transmit
 *        Power Control (TPC) in lower driver for the current vif. In particular
 *        TPC is enabled if value passed in %txpower_type is
 *        NL80211_TX_POWER_LIMITED (allow using less than specified from
 *        userspace), whereas TPC is disabled if %txpower_type is set to
 *        NL80211_TX_POWER_FIXED (use value configured from userspace)
 * @p2p_noa_attr: P2P NoA attribute for P2P powersave
 * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed
 *        to use P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS
 *        if it has associated clients without P2P PS support.
 * @max_idle_period: the time period during which the station can refrain from
 *        transmitting frames to its associated AP without being disassociated.
 *        In units of 1000 TUs. Zero value indicates that the AP did not include
 *        a (valid) BSS Max Idle Period Element.
 * @protected_keep_alive: if set, indicates that the station should send an RSN
 *        protected frame to the AP to reset the idle timer at the AP for the
 *        station.
 * @ftm_responder: whether to enable or disable fine timing measurement FTM
 *        responder functionality.
 * @ftmr_params: configurable lci/civic parameter when enabling FTM responder.
 * @nontransmitted: this BSS is a nontransmitted BSS profile
 * @transmitter_bssid: the address of transmitter AP
 * @bssid_index: index inside the multiple BSSID set
 * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set
 * @ema_ap: AP supports enhancements of discovery and advertisement of
 *        nontransmitted BSSIDs
 * @profile_periodicity: the minimum number of beacon frames that need to be
 *        received in order to discover all the nontransmitted BSSIDs in the set.
 * @he_oper: HE operation information of the AP we are connected to
 * @he_obss_pd: OBSS Packet Detection parameters.
 * @he_bss_color: BSS coloring settings, if BSS supports HE
 * @fils_discovery: FILS discovery configuration
 * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response
 *        interval.
 * @s1g: BSS is S1G BSS (affects Association Request format).
 * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed
 *        to driver when rate control is offloaded to firmware.
 */
struct ieee80211_bss_conf {
        const u8 *bssid;
        u8 htc_trig_based_pkt_ext;
        bool multi_sta_back_32bit;
        bool uora_exists;
        bool ack_enabled;
        u8 uora_ocw_range;
        u16 frame_time_rts_th;
        bool he_support;
        bool twt_requester;
        bool twt_responder;
        bool twt_protected;
        /* association related data */
        bool assoc, ibss_joined;
        bool ibss_creator;
        u16 aid;
        /* erp related data */
        bool use_cts_prot;
        bool use_short_preamble;
        bool use_short_slot;
        bool enable_beacon;
        u8 dtim_period;
        u16 beacon_int;
        u16 assoc_capability;
        u64 sync_tsf;
        u32 sync_device_ts;
        u8 sync_dtim_count;
        u32 basic_rates;
        struct ieee80211_rate *beacon_rate;
        /* one entry per band */
        int mcast_rate[NUM_NL80211_BANDS];
        u16 ht_operation_mode;
        s32 cqm_rssi_thold;
        u32 cqm_rssi_hyst;
        s32 cqm_rssi_low;
        s32 cqm_rssi_high;
        struct cfg80211_chan_def chandef;
        struct ieee80211_mu_group_data mu_group;
        __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
        int arp_addr_cnt;
        bool qos;
        bool idle;
        bool ps;
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        size_t ssid_len;
        bool hidden_ssid;
        int txpower;
        enum nl80211_tx_power_setting txpower_type;
        /* P2P NoA attribute for P2P powersave */
        struct ieee80211_p2p_noa_attr p2p_noa_attr;
        /* AP/P2P GO only: whether the P2P PS mechanism may be used */
        bool allow_p2p_go_ps;
        /* in units of 1000 TUs; 0: no (valid) BSS Max Idle Period element */
        u16 max_idle_period;
        bool protected_keep_alive;
        /* FTM responder enable flag and its lci/civic parameters */
        bool ftm_responder;
        struct ieee80211_ftm_responder_params *ftmr_params;
        /* Multiple BSSID data */
        bool nontransmitted;
        u8 transmitter_bssid[ETH_ALEN];
        u8 bssid_index;
        /* 2^bssid_indicator is the maximum number of APs in the set */
        u8 bssid_indicator;
        bool ema_ap;
        u8 profile_periodicity;
        /* HE operation information of the AP we are connected to */
        struct {
                u32 params;
                u16 nss_set;
        } he_oper;
        struct ieee80211_he_obss_pd he_obss_pd;
        struct cfg80211_he_bss_color he_bss_color;
        struct ieee80211_fils_discovery fils_discovery;
        u32 unsol_bcast_probe_resp_interval;
        /* S1G BSS; affects the Association Request format */
        bool s1g;
        /* handed to the driver when rate control is offloaded to firmware */
        struct cfg80211_bitrate_mask beacon_tx_rate;
};

/**
 * enum mac80211_tx_info_flags - flags to describe transmission information/status
 *
 * These flags are used with the @flags member of &ieee80211_tx_info.
 *
 * @IEEE80211_TX_CTL_REQ_TX_STATUS: require TX status callback for this frame.
 * @IEEE80211_TX_CTL_ASSIGN_SEQ: The driver has to assign a sequence
 *        number to this frame, taking care of not overwriting the fragment
 *        number and increasing the sequence number only when the
 *        IEEE80211_TX_CTL_FIRST_FRAGMENT flag is set. mac80211 will properly
 *        assign sequence numbers to QoS-data frames but cannot do so correctly
 *        for non-QoS-data and management frames because beacons need them from
 *        that counter as well and mac80211 cannot guarantee proper sequencing.
 *        If this flag is set, the driver should instruct the hardware to
 *        assign a sequence number to the frame or assign one itself. Cf. IEEE
 *        802.11-2007 7.1.3.4.1 paragraph 3. This flag will always be set for
 *        beacons and always be clear for frames without a sequence number field.
 * @IEEE80211_TX_CTL_NO_ACK: tell the low level not to wait for an ack
 * @IEEE80211_TX_CTL_CLEAR_PS_FILT: clear powersave filter for destination
 *        station
 * @IEEE80211_TX_CTL_FIRST_FRAGMENT: this is a first fragment of the frame
 * @IEEE80211_TX_CTL_SEND_AFTER_DTIM: send this frame after DTIM beacon
 * @IEEE80211_TX_CTL_AMPDU: this frame should be sent as part of an A-MPDU
 * @IEEE80211_TX_CTL_INJECTED: Frame was injected, internal to mac80211.
 * @IEEE80211_TX_STAT_TX_FILTERED: The frame was not transmitted
 *        because the destination STA was in powersave mode. Note that to
 *        avoid race conditions, the filter must be set by the hardware or
 *        firmware upon receiving a frame that indicates that the station
 *        went to sleep (must be done on device to filter frames already on
 *        the queue) and may only be unset after mac80211 gives the OK for
 *        that by setting the IEEE80211_TX_CTL_CLEAR_PS_FILT (see above),
 *        since only then is it guaranteed that no more frames are in the
 *        hardware queue.
 * @IEEE80211_TX_STAT_ACK: Frame was acknowledged
 * @IEEE80211_TX_STAT_AMPDU: The frame was aggregated, so status
 *         is for the whole aggregation.
 * @IEEE80211_TX_STAT_AMPDU_NO_BACK: no block ack was returned,
 *         so consider using block ack request (BAR).
 * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be
 *        set by rate control algorithms to indicate probe rate, will
 *        be cleared for fragmented frames (except on the last fragment)
 * @IEEE80211_TX_INTFL_OFFCHAN_TX_OK: Internal to mac80211. Used to indicate
 *        that a frame can be transmitted while the queues are stopped for
 *        off-channel operation.
 * @IEEE80211_TX_CTL_HW_80211_ENCAP: This frame uses hardware encapsulation
 *        (header conversion)
 * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211,
 *        used to indicate that a frame was already retried due to PS
 * @IEEE80211_TX_INTFL_DONT_ENCRYPT: completely internal to mac80211,
 *        used to indicate frame should not be encrypted
 * @IEEE80211_TX_CTL_NO_PS_BUFFER: This frame is a response to a poll
 *        frame (PS-Poll or uAPSD) or a non-bufferable MMPDU and must
 *        be sent although the station is in powersave mode.
 * @IEEE80211_TX_CTL_MORE_FRAMES: More frames will be passed to the
 *        transmit function after the current frame, this can be used
 *        by drivers to kick the DMA queue only if unset or when the
 *        queue gets full.
 * @IEEE80211_TX_INTFL_RETRANSMISSION: This frame is being retransmitted
 *        after TX status because the destination was asleep, it must not
 *        be modified again (no seqno assignment, crypto, etc.)
 * @IEEE80211_TX_INTFL_MLME_CONN_TX: This frame was transmitted by the MLME
 *        code for connection establishment, this indicates that its status
 *        should kick the MLME state machine.
 * @IEEE80211_TX_INTFL_NL80211_FRAME_TX: Frame was requested through nl80211
 *        MLME command (internal to mac80211 to figure out whether to send TX
 *        status to user space)
 * @IEEE80211_TX_CTL_LDPC: tells the driver to use LDPC for this frame
 * @IEEE80211_TX_CTL_STBC: Enables Space-Time Block Coding (STBC) for this
 *        frame and selects the maximum number of streams that it can use.
 * @IEEE80211_TX_CTL_TX_OFFCHAN: Marks this packet to be transmitted on
 *        the off-channel channel when a remain-on-channel offload is done
 *        in hardware -- normal packets still flow and are expected to be
 *        handled properly by the device.
 * @IEEE80211_TX_INTFL_TKIP_MIC_FAILURE: Marks this packet to be used for TKIP
 *        testing. It will be sent out with incorrect Michael MIC key to allow
 *        TKIP countermeasures to be tested.
 * @IEEE80211_TX_CTL_NO_CCK_RATE: This frame will be sent at a non-CCK rate.
 *        This flag is used for management frames, especially P2P frames,
 *        which are not sent at a CCK rate in the 2 GHz band.
 * @IEEE80211_TX_STATUS_EOSP: This packet marks the end of service period,
 *        when its status is reported the service period ends. For frames in
 *        an SP that mac80211 transmits, it is already set; for driver frames
 *        the driver may set this flag. It is also used to do the same for
 *        PS-Poll responses.
 * @IEEE80211_TX_CTL_USE_MINRATE: This frame will be sent at lowest rate.
 *        This flag is used to send nullfunc frame at minimum rate when
 *        the nullfunc is used for connection monitoring purpose.
 * @IEEE80211_TX_CTL_DONTFRAG: Don't fragment this packet even if it
 *        would be fragmented by size (this is optional, only used for
 *        monitor injection).
 * @IEEE80211_TX_STAT_NOACK_TRANSMITTED: A frame that was marked with
 *        IEEE80211_TX_CTL_NO_ACK has been successfully transmitted without
 *        any errors (like issues specific to the driver/HW).
 *        This flag must not be set for frames that don't request no-ack
 *        behaviour with IEEE80211_TX_CTL_NO_ACK.
 *
 * Note: If you have to add new flags to the enumeration, then don't
 *         forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary.
 */
enum mac80211_tx_info_flags {
        /* bit masks for the flags member of struct ieee80211_tx_info */
        IEEE80211_TX_CTL_REQ_TX_STATUS                = BIT(0),
        IEEE80211_TX_CTL_ASSIGN_SEQ                = BIT(1),
        IEEE80211_TX_CTL_NO_ACK                        = BIT(2),
        IEEE80211_TX_CTL_CLEAR_PS_FILT                = BIT(3),
        IEEE80211_TX_CTL_FIRST_FRAGMENT                = BIT(4),
        IEEE80211_TX_CTL_SEND_AFTER_DTIM        = BIT(5),
        IEEE80211_TX_CTL_AMPDU                        = BIT(6),
        IEEE80211_TX_CTL_INJECTED                = BIT(7),
        IEEE80211_TX_STAT_TX_FILTERED                = BIT(8),
        IEEE80211_TX_STAT_ACK                        = BIT(9),
        IEEE80211_TX_STAT_AMPDU                        = BIT(10),
        IEEE80211_TX_STAT_AMPDU_NO_BACK                = BIT(11),
        IEEE80211_TX_CTL_RATE_CTRL_PROBE        = BIT(12),
        IEEE80211_TX_INTFL_OFFCHAN_TX_OK        = BIT(13),
        IEEE80211_TX_CTL_HW_80211_ENCAP                = BIT(14),
        IEEE80211_TX_INTFL_RETRIED                = BIT(15),
        IEEE80211_TX_INTFL_DONT_ENCRYPT                = BIT(16),
        IEEE80211_TX_CTL_NO_PS_BUFFER                = BIT(17),
        IEEE80211_TX_CTL_MORE_FRAMES                = BIT(18),
        IEEE80211_TX_INTFL_RETRANSMISSION        = BIT(19),
        IEEE80211_TX_INTFL_MLME_CONN_TX                = BIT(20),
        IEEE80211_TX_INTFL_NL80211_FRAME_TX        = BIT(21),
        IEEE80211_TX_CTL_LDPC                        = BIT(22),
        /* two-bit field rather than a single flag, see IEEE80211_TX_CTL_STBC_SHIFT */
        IEEE80211_TX_CTL_STBC                        = BIT(23) | BIT(24),
        IEEE80211_TX_CTL_TX_OFFCHAN                = BIT(25),
        IEEE80211_TX_INTFL_TKIP_MIC_FAILURE        = BIT(26),
        IEEE80211_TX_CTL_NO_CCK_RATE                = BIT(27),
        IEEE80211_TX_STATUS_EOSP                = BIT(28),
        IEEE80211_TX_CTL_USE_MINRATE                = BIT(29),
        IEEE80211_TX_CTL_DONTFRAG                = BIT(30),
        /* BIT(31): all 32 bits of the flags word are in use */
        IEEE80211_TX_STAT_NOACK_TRANSMITTED        = BIT(31),
};

/* Position of the lowest bit of the two-bit IEEE80211_TX_CTL_STBC field */
#define IEEE80211_TX_CTL_STBC_SHIFT                23

/* S1G MCS rates are flagged with the same rate-control bit as VHT MCS rates */
#define IEEE80211_TX_RC_S1G_MCS IEEE80211_TX_RC_VHT_MCS

/**
 * enum mac80211_tx_control_flags - flags to describe transmit control
 *
 * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control
 *        protocol frame (e.g. EAP)
 * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
 *        frame (PS-Poll or uAPSD).
 * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
 * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
 * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
 * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup
 * @IEEE80211_TX_INTCFL_NEED_TXPROCESSING: completely internal to mac80211,
 *        used to indicate that a pending frame requires TX processing before
 *        it can be sent out.
 * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that
 *        has already been assigned to this frame.
 *
 * These flags are used in tx_info->control.flags.
 */
enum mac80211_tx_control_flags {
        /* single-bit masks stored in tx_info->control.flags */
        IEEE80211_TX_CTRL_PORT_CTRL_PROTO        = BIT(0),
        IEEE80211_TX_CTRL_PS_RESPONSE                = BIT(1),
        IEEE80211_TX_CTRL_RATE_INJECT                = BIT(2),
        IEEE80211_TX_CTRL_AMSDU                        = BIT(3),
        IEEE80211_TX_CTRL_FAST_XMIT                = BIT(4),
        IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP        = BIT(5),
        /* completely internal to mac80211 */
        IEEE80211_TX_INTCFL_NEED_TXPROCESSING        = BIT(6),
        IEEE80211_TX_CTRL_NO_SEQNO                = BIT(7),
};

/*
 * Mask of all temporary flags, i.e. the flags that are set by the tx
 * handlers for each transmission attempt by the mac80211 stack. It is used
 * to clear those flags in one go. All members are values of
 * enum mac80211_tx_info_flags, so this list has to be revisited whenever
 * flags are added to that enumeration.
 */
#define IEEE80211_TX_TEMPORARY_FLAGS (IEEE80211_TX_CTL_NO_ACK |                      \
        IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT |    \
        IEEE80211_TX_CTL_SEND_AFTER_DTIM | IEEE80211_TX_CTL_AMPDU |              \
        IEEE80211_TX_STAT_TX_FILTERED |        IEEE80211_TX_STAT_ACK |                      \
        IEEE80211_TX_STAT_AMPDU | IEEE80211_TX_STAT_AMPDU_NO_BACK |              \
        IEEE80211_TX_CTL_RATE_CTRL_PROBE | IEEE80211_TX_CTL_NO_PS_BUFFER |    \
        IEEE80211_TX_CTL_MORE_FRAMES | IEEE80211_TX_CTL_LDPC |                      \
        IEEE80211_TX_CTL_STBC | IEEE80211_TX_STATUS_EOSP)

/**
 * enum mac80211_rate_control_flags - per-rate flags set by the
 *        Rate Control algorithm.
 *
 * These flags are set by the Rate control algorithm for each rate during tx,
 * in the @flags member of struct ieee80211_tx_rate.
 *
 * @IEEE80211_TX_RC_USE_RTS_CTS: Use RTS/CTS exchange for this rate.
 * @IEEE80211_TX_RC_USE_CTS_PROTECT: CTS-to-self protection is required.
 *        This is set if the current BSS requires ERP protection.
 * @IEEE80211_TX_RC_USE_SHORT_PREAMBLE: Use short preamble.
 * @IEEE80211_TX_RC_MCS: HT rate.
 * @IEEE80211_TX_RC_VHT_MCS: VHT MCS rate, in this case the idx field is split
 *        into a higher 4 bits (Nss) and lower 4 bits (MCS number)
 * @IEEE80211_TX_RC_GREEN_FIELD: Indicates whether this rate should be used in
 *        Greenfield mode.
 * @IEEE80211_TX_RC_40_MHZ_WIDTH: Indicates if the Channel Width should be 40 MHz.
 * @IEEE80211_TX_RC_80_MHZ_WIDTH: Indicates 80 MHz transmission
 * @IEEE80211_TX_RC_160_MHZ_WIDTH: Indicates 160 MHz transmission
 *        (80+80 isn't supported yet)
 * @IEEE80211_TX_RC_DUP_DATA: The frame should be transmitted on both of the
 *        adjacent 20 MHz channels, if the current channel type is
 *        NL80211_CHAN_HT40MINUS or NL80211_CHAN_HT40PLUS.
 * @IEEE80211_TX_RC_SHORT_GI: Short Guard interval should be used for this rate.
 */
enum mac80211_rate_control_flags {
        /* highest bit used is BIT(10): the values fit the 11-bit flags
         * bit-field of struct ieee80211_tx_rate
         */
        IEEE80211_TX_RC_USE_RTS_CTS                = BIT(0),
        IEEE80211_TX_RC_USE_CTS_PROTECT                = BIT(1),
        IEEE80211_TX_RC_USE_SHORT_PREAMBLE        = BIT(2),

        /* rate index is an HT/VHT MCS instead of an index */
        IEEE80211_TX_RC_MCS                        = BIT(3),
        IEEE80211_TX_RC_GREEN_FIELD                = BIT(4),
        IEEE80211_TX_RC_40_MHZ_WIDTH                = BIT(5),
        IEEE80211_TX_RC_DUP_DATA                = BIT(6),
        IEEE80211_TX_RC_SHORT_GI                = BIT(7),
        /* idx is split into Nss (upper 4 bits) and MCS (lower 4 bits);
         * IEEE80211_TX_RC_S1G_MCS is defined as an alias of this flag
         */
        IEEE80211_TX_RC_VHT_MCS                        = BIT(8),
        IEEE80211_TX_RC_80_MHZ_WIDTH                = BIT(9),
        IEEE80211_TX_RC_160_MHZ_WIDTH                = BIT(10),
};


/*
 * Size in bytes of ieee80211_tx_info.driver_data: there are 40 bytes if
 * you don't need the rateset to be kept
 */
#define IEEE80211_TX_INFO_DRIVER_DATA_SIZE 40

/*
 * Size in bytes of ieee80211_tx_info.rate_driver_data: if you do need the
 * rateset, then you have less space
 */
#define IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE 24

/* maximum number of rate stages (length of the rates arrays in tx_info) */
#define IEEE80211_TX_MAX_RATES        4

/* maximum number of rate table entries */
#define IEEE80211_TX_RATE_TABLE_SIZE        4

/**
 * struct ieee80211_tx_rate - rate selection/status
 *
 * @idx: rate index to attempt to send with
 * @flags: rate control flags (&enum mac80211_rate_control_flags)
 * @count: number of tries in this rate before going to the next rate
 *
 * A value of -1 for @idx indicates an invalid rate and, if used
 * in an array of retry rates, that no more rates should be tried.
 *
 * When used for transmit status reporting, the driver should
 * always report the rate along with the flags it used.
 *
 * &struct ieee80211_tx_info contains an array of these structs
 * in the control information, and it will be filled by the rate
 * control algorithm according to what should be sent. For example,
 * if this array contains, in the format { <idx>, <count> } the
 * information::
 *
 *    { 3, 2 }, { 2, 2 }, { 1, 4 }, { -1, 0 }, { -1, 0 }
 *
 * then this means that the frame should be transmitted
 * up to twice at rate 3, up to twice at rate 2, and up to four
 * times at rate 1 if it doesn't get acknowledged. Say it gets
 * acknowledged by the peer after the fifth attempt, the status
 * information should then contain::
 *
 *   { 3, 2 }, { 2, 2 }, { 1, 1 }, { -1, 0 } ...
 *
 * since it was transmitted twice at rate 3, twice at rate 2
 * and once at rate 1 after which we received an acknowledgement.
 */
struct ieee80211_tx_rate {
        /* rate index, -1 marks an invalid entry / end of the retry chain */
        s8 idx;
        /* count: 5 bits, i.e. 0..31 tries (see IEEE80211_MAX_TX_RETRY);
         * flags: 11 bits, enough for BIT(0)..BIT(10) of
         * enum mac80211_rate_control_flags
         */
        u16 count:5,
            flags:11;
        /* packed: the BUILD_BUG_ON offset checks in
         * ieee80211_tx_info_clear_status() (rates at 8, ack_signal at 20)
         * only hold if each entry occupies 3 bytes
         */
} __packed;

/* largest value that fits the 5-bit count field of struct ieee80211_tx_rate */
#define IEEE80211_MAX_TX_RETRY                31

/**
 * ieee80211_rate_set_vht - encode a VHT MCS/NSS pair into a rate index
 *
 * @rate: rate entry whose idx field is written
 * @mcs: MCS number, placed in the low 4 bits of idx
 * @nss: number of spatial streams; (@nss - 1) is placed above the MCS bits
 *
 * A warning is raised if @mcs does not fit into 4 bits or if (@nss - 1)
 * does not fit into 3 bits; the value is stored in either case.
 */
static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate,
                                          u8 mcs, u8 nss)
{
        int nss_field = nss - 1;

        WARN_ON(mcs & ~0xF);
        WARN_ON(nss_field & ~0x7);

        rate->idx = (nss_field << 4) | mcs;
}

static inline u8
ieee80211_rate_get_vht_mcs(const struct ieee80211_tx_rate *rate)
{
        return rate->idx & 0xF;
}

/**
 * ieee80211_rate_get_vht_nss - extract the stream count from a VHT rate entry
 *
 * @rate: rate entry to decode
 *
 * Return: the bits of idx above the 4 MCS bits, plus one (the inverse of
 * the (nss - 1) encoding applied by ieee80211_rate_set_vht()).
 */
static inline u8
ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate)
{
        const int nss_field = rate->idx >> 4;

        return nss_field + 1;
}

/**
 * struct ieee80211_tx_info - skb transmit information
 *
 * This structure is placed in skb->cb for three uses:
 *  (1) mac80211 TX control - mac80211 tells the driver what to do
 *  (2) driver internal use (if applicable)
 *  (3) TX status information - driver tells mac80211 what happened
 *
 * @flags: transmit info flags, defined above
 * @band: the band to transmit on (use for checking for races)
 * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC
 * @ack_frame_id: internal frame ID for TX status, used internally
 * @tx_time_est: TX time estimate in units of 4us, used internally
 * @control: union part for control data
 * @control.rates: TX rates array to try
 * @control.rts_cts_rate_idx: rate for RTS or CTS
 * @control.use_rts: use RTS
 * @control.use_cts_prot: use CTS-to-self protection
 * @control.short_preamble: use short preamble (CCK only)
 * @control.skip_table: skip externally configured rate table
 * @control.jiffies: timestamp for expiry on powersave clients
 * @control.vif: virtual interface (may be NULL)
 * @control.hw_key: key to encrypt with (may be NULL)
 * @control.flags: control flags, see &enum mac80211_tx_control_flags
 * @control.enqueue_time: enqueue time (for iTXQs)
 * @driver_rates: alias to @control.rates to reserve space
 * @pad: padding
 * @rate_driver_data: driver use area if driver needs @control.rates
 * @status: union part for status data
 * @status.rates: attempted rates
 * @status.ack_signal: ACK signal
 * @status.ampdu_ack_len: AMPDU ack length
 * @status.ampdu_len: AMPDU length
 * @status.antenna: (legacy, kept only for iwlegacy)
 * @status.tx_time: airtime consumed for transmission; note this is only
 *        used for WMM AC, not for airtime fairness
 * @status.is_valid_ack_signal: ACK signal is valid
 * @status.status_driver_data: driver use area
 * @ack: union part for pure ACK data
 * @ack.cookie: cookie for the ACK
 * @driver_data: array of driver_data pointers
 * @ampdu_ack_len: number of acked aggregated frames.
 *         relevant only if IEEE80211_TX_STAT_AMPDU was set.
 * @ampdu_len: number of aggregated frames.
 *         relevant only if IEEE80211_TX_STAT_AMPDU was set.
 * @ack_signal: signal strength of the ACK frame
 */
struct ieee80211_tx_info {
        /* common information */
        /* transmit info flags, see enum mac80211_tx_info_flags */
        u32 flags;
        /* 3 + 13 + 4 + 10 = 30 bits used; tx_time_est is in units of 4us */
        u32 band:3,
            ack_frame_id:13,
            hw_queue:4,
            tx_time_est:10;
        /* 2 free bits */

        /*
         * The members of this union share their storage: control data,
         * pure ACK data, status data and the driver-private areas.
         */
        union {
                /* mac80211 TX control - mac80211 tells the driver what to do */
                struct {
                        union {
                                /* rate control */
                                struct {
                                        struct ieee80211_tx_rate rates[
                                                IEEE80211_TX_MAX_RATES];
                                        s8 rts_cts_rate_idx;
                                        u8 use_rts:1;
                                        u8 use_cts_prot:1;
                                        u8 short_preamble:1;
                                        u8 skip_table:1;
                                        /* 2 bytes free */
                                };
                                /* only needed before rate control */
                                unsigned long jiffies;
                        };
                        /* NB: vif can be NULL for injected frames */
                        struct ieee80211_vif *vif;
                        struct ieee80211_key_conf *hw_key;
                        /* control flags, see enum mac80211_tx_control_flags */
                        u32 flags;
                        codel_time_t enqueue_time;
                } control;
                /* pure ACK data */
                struct {
                        u64 cookie;
                } ack;
                /* TX status information - driver tells mac80211 what happened */
                struct {
                        struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
                        s32 ack_signal;
                        u8 ampdu_ack_len;
                        u8 ampdu_len;
                        u8 antenna;
                        u16 tx_time;
                        bool is_valid_ack_signal;
                        void *status_driver_data[19 / sizeof(void *)];
                } status;
                /* driver use area that leaves the rate array intact */
                struct {
                        /* alias to control.rates to reserve space */
                        struct ieee80211_tx_rate driver_rates[
                                IEEE80211_TX_MAX_RATES];
                        u8 pad[4];

                        void *rate_driver_data[
                                IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE / sizeof(void *)];
                };
                /* driver use area that starts where the rate array starts */
                void *driver_data[
                        IEEE80211_TX_INFO_DRIVER_DATA_SIZE / sizeof(void *)];
        };
};

/**
 * ieee80211_info_set_tx_time_est - store a TX time estimate in the TX info
 *
 * @info: TX info whose tx_time_est field is written
 * @tx_time_est: airtime estimate to store
 *
 * Return: the value as it can be read back, i.e. after clamping and
 * rounding down to the 4us granularity of the field.
 */
static inline u16
ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est)
{
        /* tx_time_est is only 10 bits wide: cap the input at 2**12-1 and
         * keep it in increments of 4us
         */
        const u16 clamped = min_t(u16, tx_time_est, 4095);

        info->tx_time_est = clamped >> 2;

        return info->tx_time_est << 2;
}

static inline u16
ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info)
{
        return info->tx_time_est << 2;
}

/**
 * struct ieee80211_tx_status - extended tx status info for rate control
 *
 * All members are pointers; this structure only references the objects
 * and does not embed them.
 *
 * @sta: Station that the packet was transmitted for
 * @info: Basic tx status information
 * @skb: Packet skb (can be NULL if not provided by the driver)
 * @rate: The TX rate that was used when sending the packet
 * @free_list: list where processed skbs are stored to be freed by the driver
 */
struct ieee80211_tx_status {
        struct ieee80211_sta *sta;
        struct ieee80211_tx_info *info;
        struct sk_buff *skb;
        struct rate_info *rate;
        struct list_head *free_list;
};

/**
 * struct ieee80211_scan_ies - descriptors for different blocks of IEs
 *
 * This structure is used to point to different blocks of IEs in HW scan
 * and scheduled scan. These blocks contain the IEs passed by userspace
 * and the ones generated by mac80211.
 *
 * @ies: pointers to band-specific IEs, one entry per band
 *        (%NUM_NL80211_BANDS entries).
 * @len: lengths of the band-specific IEs, one entry per band, parallel
 *        to @ies.
 * @common_ies: IEs for all bands (especially vendor specific ones)
 * @common_ie_len: length of the common_ies
 */
struct ieee80211_scan_ies {
        const u8 *ies[NUM_NL80211_BANDS];
        size_t len[NUM_NL80211_BANDS];
        const u8 *common_ies;
        size_t common_ie_len;
};


/**
 * IEEE80211_SKB_CB - view an skb's control buffer as TX info
 *
 * @skb: the skb whose cb area is reinterpreted
 *
 * Return: skb->cb cast to &struct ieee80211_tx_info; nothing is copied
 * or initialised.
 */
static inline struct ieee80211_tx_info *IEEE80211_SKB_CB(struct sk_buff *skb)
{
        struct ieee80211_tx_info *tx_info;

        tx_info = (struct ieee80211_tx_info *)skb->cb;

        return tx_info;
}

/**
 * IEEE80211_SKB_RXCB - view an skb's control buffer as RX status
 *
 * @skb: the skb whose cb area is reinterpreted
 *
 * Return: skb->cb cast to &struct ieee80211_rx_status; nothing is copied
 * or initialised.
 */
static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb)
{
        struct ieee80211_rx_status *rx_status;

        rx_status = (struct ieee80211_rx_status *)skb->cb;

        return rx_status;
}

/**
 * ieee80211_tx_info_clear_status - clear TX status
 *
 * @info: The &struct ieee80211_tx_info to be cleared.
 *
 * A driver that hands an skb back to mac80211 has to report a number
 * of things in the TX status. This helper prepares @info for that: the
 * count of every entry in status.rates is reset (idx and flags are
 * kept, the count has to be filled in anyway), and everything from
 * status.ampdu_ack_len up to the end of the structure is zeroed.
 * Members located before status.ampdu_ack_len, status.ack_signal
 * among them, are not touched.
 *
 * NOTE: This function may only be used by drivers that do NOT use
 *         info->driver_data! Drivers that can live with the smaller
 *         area should use info->rate_driver_data instead.
 */
static inline void
ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
{
        const size_t clear_from =
                offsetof(struct ieee80211_tx_info, status.ampdu_ack_len);
        int idx = 0;

        /* the rate array has to sit at offset 8 in every view of the union */
        BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) !=
                     offsetof(struct ieee80211_tx_info, control.rates));
        BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) !=
                     offsetof(struct ieee80211_tx_info, driver_rates));
        BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != 8);

        /* keep idx/flags of each rate, only reset the try counters */
        while (idx < IEEE80211_TX_MAX_RATES)
                info->status.rates[idx++].count = 0;

        BUILD_BUG_ON(
            offsetof(struct ieee80211_tx_info, status.ack_signal) != 20);

        /* wipe from ampdu_ack_len through the end of the structure */
        memset(&info->status.ampdu_ack_len, 0,
               sizeof(struct ieee80211_tx_info) - clear_from);
}


/**
 * enum mac80211_rx_flags - receive flags
 *
 * These flags are used with the @flag member of &struct ieee80211_rx_status.
 * @RX_FLAG_MMIC_ERROR: Michael MIC error was reported on this frame.
 *        Use together with %RX_FLAG_MMIC_STRIPPED.
 * @RX_FLAG_DECRYPTED: This frame was decrypted in hardware.
 * @RX_FLAG_MMIC_STRIPPED: the Michael MIC is stripped off this frame,
 *        verification has been done by the hardware.
 * @RX_FLAG_IV_STRIPPED: The IV and ICV are stripped from this frame.
 *        If this flag is set, the stack cannot do any replay detection
 *        hence the driver or hardware will have to do that.
 * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this
 *        flag indicates that the PN was verified for replay protection.
 *        Note that this flag is also currently only supported when a frame
 *        is also decrypted (ie. @RX_FLAG_DECRYPTED must be set)
 * @RX_FLAG_DUP_VALIDATED: The driver should set this flag if it did
 *        de-duplication by itself.
 * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on
 *        the frame.
 * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PLCP check failed on
 *        the frame.
 * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime
 *        field) is valid and contains the time the first symbol of the MPDU
 *        was received. This is useful in monitor mode and for proper IBSS
 *        merging.
 * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime
 *        field) is valid and contains the time the last symbol of the MPDU
 *        (including FCS) was received.
 * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime
 *        field) is valid and contains the time the SYNC preamble was received.
 * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present.
 *        Valid only for data frames (mainly A-MPDU)
 * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference
 *        number (@ampdu_reference) must be populated and be a distinct number for
 *        each A-MPDU
 * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all
 *        subframes of a single A-MPDU
 * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU
 * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected
 *        on this subframe
 * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC
 *        is stored in the @ampdu_delimiter_crc field)
 * @RX_FLAG_MIC_STRIPPED: The MIC was stripped off this packet. Decryption was
 *        done by the hardware
 * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without
 *        processing it in any regular way.
 *        This is useful if drivers offload some frames but still want to report
 *        them for sniffing purposes.
 * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except
 *        monitor interfaces.
 *        This is useful if drivers offload some frames but still want to report
 *        them for sniffing purposes.
 * @RX_FLAG_AMSDU_MORE: Some drivers may prefer to report separate A-MSDU
 *        subframes instead of a one huge frame for performance reasons.
 *        All, but the last MSDU from an A-MSDU should have this flag set. E.g.
 *        if an A-MSDU has 3 frames, the first 2 must have the flag set, while
 *        the 3rd (last) one must not have this flag set. The flag is used to
 *        deal with retransmission/duplication recovery properly since A-MSDU
 *        subframes share the same sequence number. Reported subframes can be
 *        either regular MSDU or singly A-MSDUs. Subframes must not be
 *        interleaved with other frames.
 * @RX_FLAG_RADIOTAP_VENDOR_DATA: This frame contains vendor-specific
 *        radiotap data in the skb->data (before the frame) as described by
 *        the &struct ieee80211_vendor_radiotap.
 * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as the previous packet.
 *        This is used for AMSDU subframes which can have the same PN as
 *        the first subframe.
 * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must
 *        be done in the hardware.
 * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this
 *        frame
 * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known
 * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present
 *        (&struct ieee80211_radiotap_he, mac80211 will fill in
 *        
 *         - DATA3_DATA_MCS
 *         - DATA3_DATA_DCM
 *         - DATA3_CODING
 *         - DATA5_GI
 *         - DATA5_DATA_BW_RU_ALLOC
 *         - DATA6_NSTS
 *         - DATA3_STBC
 *        
 *        from the RX info data, so leave those zeroed when building this data)
 * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present
 *        (&struct ieee80211_radiotap_he_mu)
 * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present
 * @RX_FLAG_NO_PSDU: use the frame only for radiotap reporting, with
 *        the "0-length PSDU" field included there.  The value for it is
 *        in &struct ieee80211_rx_status.  Note that if this value isn't
 *        known the frame shouldn't be reported.
 */
enum mac80211_rx_flags {
        /* single-bit masks for the flag member of struct ieee80211_rx_status;
         * BIT(0)..BIT(29) are assigned here
         */
        RX_FLAG_MMIC_ERROR                = BIT(0),
        RX_FLAG_DECRYPTED                = BIT(1),
        RX_FLAG_MACTIME_PLCP_START        = BIT(2),
        RX_FLAG_MMIC_STRIPPED                = BIT(3),
        RX_FLAG_IV_STRIPPED                = BIT(4),
        RX_FLAG_FAILED_FCS_CRC                = BIT(5),
        RX_FLAG_FAILED_PLCP_CRC         = BIT(6),
        RX_FLAG_MACTIME_START                = BIT(7),
        RX_FLAG_NO_SIGNAL_VAL                = BIT(8),
        RX_FLAG_AMPDU_DETAILS                = BIT(9),
        RX_FLAG_PN_VALIDATED                = BIT(10),
        RX_FLAG_DUP_VALIDATED                = BIT(11),
        RX_FLAG_AMPDU_LAST_KNOWN        = BIT(12),
        RX_FLAG_AMPDU_IS_LAST                = BIT(13),
        RX_FLAG_AMPDU_DELIM_CRC_ERROR        = BIT(14),
        RX_FLAG_AMPDU_DELIM_CRC_KNOWN        = BIT(15),
        RX_FLAG_MACTIME_END                = BIT(16),
        RX_FLAG_ONLY_MONITOR                = BIT(17),
        RX_FLAG_SKIP_MONITOR                = BIT(18),
        RX_FLAG_AMSDU_MORE                = BIT(19),
        RX_FLAG_RADIOTAP_VENDOR_DATA        = BIT(20),
        RX_FLAG_MIC_STRIPPED                = BIT(21),
        RX_FLAG_ALLOW_SAME_PN                = BIT(22),
        RX_FLAG_ICV_STRIPPED                = BIT(23),
        RX_FLAG_AMPDU_EOF_BIT                = BIT(24),
        RX_FLAG_AMPDU_EOF_BIT_KNOWN        = BIT(25),
        RX_FLAG_RADIOTAP_HE                = BIT(26),
        RX_FLAG_RADIOTAP_HE_MU                = BIT(27),
        RX_FLAG_RADIOTAP_LSIG                = BIT(28),
        RX_FLAG_NO_PSDU                        = BIT(29),
};

/**
 * enum mac80211_rx_encoding_flags - MCS & bandwidth flags
 *
 * @RX_ENC_FLAG_SHORTPRE: Short preamble was used for this frame
 * @RX_ENC_FLAG_SHORT_GI: Short guard interval was used
 * @RX_ENC_FLAG_HT_GF: This frame was received in a HT-greenfield transmission,
 *        if the driver fills this value it should add
 *        %IEEE80211_RADIOTAP_MCS_HAVE_FMT
 *        to @hw.radiotap_mcs_details to advertise that fact.
 * @RX_ENC_FLAG_LDPC: LDPC was used
 * @RX_ENC_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3
 * @RX_ENC_FLAG_BF: packet was beamformed
 *
 * Note that BIT(1) is not assigned to any flag in this enum.
 */
enum mac80211_rx_encoding_flags {
        RX_ENC_FLAG_SHORTPRE                = BIT(0),
        RX_ENC_FLAG_SHORT_GI                = BIT(2),
        RX_ENC_FLAG_HT_GF                = BIT(3),
        RX_ENC_FLAG_STBC_MASK                = BIT(4) | BIT(5),
        RX_ENC_FLAG_LDPC                = BIT(6),
        RX_ENC_FLAG_BF                        = BIT(7),
};

/* Position of the lowest bit of %RX_ENC_FLAG_STBC_MASK (BIT(4) | BIT(5)) */
#define RX_ENC_FLAG_STBC_SHIFT                4

/**
 * enum mac80211_rx_encoding - encoding of a received frame
 *
 * @RX_ENC_LEGACY: legacy encoding (value 0)
 * @RX_ENC_HT: HT
 * @RX_ENC_VHT: VHT
 * @RX_ENC_HE: HE
 */
enum mac80211_rx_encoding {
        RX_ENC_LEGACY        = 0,
        RX_ENC_HT        = 1,
        RX_ENC_VHT        = 2,
        RX_ENC_HE        = 3,
};

/**
 * struct ieee80211_rx_status - receive status
 *
 * The low-level driver should provide this information (the subset
 * supported by hardware) to the 802.11 code with each received
 * frame, in the skb's control buffer (cb).
 *
 * @mactime: value in microseconds of the 64-bit Time Synchronization Function
 *        (TSF) timer when the first data symbol (MPDU) arrived at the hardware.
 * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is
 *        needed only for beacons and probe responses that update the scan cache.
 * @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use
 *        it but can store it and pass it back to the driver for synchronisation
 * @band: the active band when this frame was received
 * @freq: frequency the radio was tuned to when receiving this frame, in MHz
 *        This field must be set for management frames, but isn't strictly needed
 *        for data (other) frames - for those it only affects radiotap reporting.
 * @freq_offset: if set, @freq has a positive offset of 500 kHz.
 * @signal: signal strength when receiving this frame, either in dBm, in dB or
 *        unspecified depending on the hardware capabilities flags
 *        @IEEE80211_HW_SIGNAL_*
 * @chains: bitmask of receive chains for which separate signal strength
 *        values were filled.
 * @chain_signal: per-chain signal strength, in dBm (unlike @signal, doesn't
 *        support dB or unspecified units)
 * @antenna: antenna used
 * @rate_idx: index of data rate into band's supported rates or MCS index if
 *        HT or VHT is used (see @encoding, %RX_ENC_HT/%RX_ENC_VHT)
 * @nss: number of streams (VHT and HE only)
 * @flag: %RX_FLAG_\*
 * @encoding: &enum mac80211_rx_encoding
 * @bw: &enum rate_info_bw
 * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags
 * @he_ru: HE RU, from &enum nl80211_he_ru_alloc
 * @he_gi: HE GI, from &enum nl80211_he_gi
 * @he_dcm: HE DCM value
 * @rx_flags: internal RX flags for mac80211
 * @ampdu_reference: A-MPDU reference number, must be a different value for
 *        each A-MPDU but the same for each subframe within one A-MPDU
 * @ampdu_delimiter_crc: A-MPDU delimiter CRC
 * @zero_length_psdu_type: radiotap type of the 0-length PSDU
 */
struct ieee80211_rx_status {
        u64 mactime;
        u64 boottime_ns;
        u32 device_timestamp;
        u32 ampdu_reference;
        u32 flag;
        u16 freq: 13, freq_offset: 1;
        u8 enc_flags;
        u8 encoding:2, bw:3, he_ru:3;
        u8 he_gi:2, he_dcm:1;
        u8 rate_idx;
        u8 nss;
        u8 rx_flags;
        u8 band;
        u8 antenna;
        s8 signal;
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
        u8 ampdu_delimiter_crc;
        u8 zero_length_psdu_type;
};

/**
 * ieee80211_rx_status_to_khz - receive frequency of a frame in kHz
 * @rx_status: receive status to read @freq and @freq_offset from
 *
 * Return: @rx_status->freq converted from MHz to kHz, plus 500 when
 * @rx_status->freq_offset is set.
 */
static inline u32
ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status)
{
        u32 khz = MHZ_TO_KHZ(rx_status->freq);

        if (rx_status->freq_offset)
                khz += 500;

        return khz;
}

/**
 * struct ieee80211_vendor_radiotap - vendor radiotap data information
 * @present: presence bitmap for this vendor namespace
 *        (this could be extended in the future if any vendor needs more
 *         bits, the radiotap spec does allow for that)
 * @align: radiotap vendor namespace alignment. This defines the needed
 *        alignment for the @data field below, not for the vendor namespace
 *        description itself (which has a fixed 2-byte alignment)
 *        Must be a power of two, and be set to at least 1!
 * @oui: radiotap vendor namespace OUI
 * @subns: radiotap vendor sub namespace
 * @len: radiotap vendor sub namespace skip length, if alignment is done
 *        then that's added to this, i.e. this is only the length of the
 *        @data field.
 * @pad: number of bytes of padding after the @data, this exists so that
 *        the skb data alignment can be preserved even if the data has odd
 *        length
 * @data: the actual vendor namespace data (flexible array member)
 *
 * This struct, including the vendor data, goes into the skb->data before
 * the 802.11 header. It's split up in mac80211 using the align/oui/subns
 * data.
 */
struct ieee80211_vendor_radiotap {
        u32 present;
        u8 align;
        u8 oui[3];
        u8 subns;
        u8 pad;
        u16 len;
        u8 data[];
} __packed;

/**
 * enum ieee80211_conf_flags - configuration flags
 *
 * Bit flags describing PHY configuration options.
 *
 * @IEEE80211_CONF_MONITOR: there's a monitor interface present -- use this
 *        to determine for example whether to calculate timestamps for packets
 *        or not, do not use instead of filter flags!
 * @IEEE80211_CONF_PS: Enable 802.11 power save mode (managed mode only).
 *        This is the power save mode defined by IEEE 802.11-2007 section 11.2,
 *        meaning that the hardware still wakes up for beacons, is able to
 *        transmit frames and receive the possible acknowledgment frames.
 *        Not to be confused with hardware specific wakeup/sleep states,
 *        driver is responsible for that. See the section "Powersave support"
 *        for more.
 * @IEEE80211_CONF_IDLE: The device is running, but idle; if the flag is set
 *        the driver should be prepared to handle configuration requests but
 *        may turn the device off as much as possible. Typically, this flag will
 *        be set when an interface is set UP but not associated or scanning, but
 *        it can also be unset in that case when monitor interfaces are active.
 * @IEEE80211_CONF_OFFCHANNEL: The device is currently not on its main
 *        operating channel.
 */
enum ieee80211_conf_flags {
        IEEE80211_CONF_MONITOR                = 0x01,
        IEEE80211_CONF_PS                = 0x02,
        IEEE80211_CONF_IDLE                = 0x04,
        IEEE80211_CONF_OFFCHANNEL        = 0x08,
};


/**
 * enum ieee80211_conf_changed - denotes which configuration changed
 *
 * @IEEE80211_CONF_CHANGE_LISTEN_INTERVAL: the listen interval changed
 * @IEEE80211_CONF_CHANGE_MONITOR: the monitor flag changed
 * @IEEE80211_CONF_CHANGE_PS: the PS flag or dynamic PS timeout changed
 * @IEEE80211_CONF_CHANGE_POWER: the TX power changed
 * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed
 * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed
 * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed
 * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed
 *        Note that this is only valid if channel contexts are not used,
 *        otherwise each channel context has the number of chains listed.
 *
 * Note that BIT(0) is not assigned to any flag in this enum; the first
 * flag starts at BIT(1).
 */
enum ieee80211_conf_changed {
        IEEE80211_CONF_CHANGE_SMPS                = BIT(1),
        IEEE80211_CONF_CHANGE_LISTEN_INTERVAL        = BIT(2),
        IEEE80211_CONF_CHANGE_MONITOR                = BIT(3),
        IEEE80211_CONF_CHANGE_PS                = BIT(4),
        IEEE80211_CONF_CHANGE_POWER                = BIT(5),
        IEEE80211_CONF_CHANGE_CHANNEL                = BIT(6),
        IEEE80211_CONF_CHANGE_RETRY_LIMITS        = BIT(7),
        IEEE80211_CONF_CHANGE_IDLE                = BIT(8),
};

/**
 * enum ieee80211_smps_mode - spatial multiplexing power save mode
 *
 * @IEEE80211_SMPS_AUTOMATIC: automatic
 * @IEEE80211_SMPS_OFF: off
 * @IEEE80211_SMPS_STATIC: static
 * @IEEE80211_SMPS_DYNAMIC: dynamic
 * @IEEE80211_SMPS_NUM_MODES: number of modes listed above; internal,
 *        don't use
 */
enum ieee80211_smps_mode {
        IEEE80211_SMPS_AUTOMATIC        = 0,
        IEEE80211_SMPS_OFF                = 1,
        IEEE80211_SMPS_STATIC                = 2,
        IEEE80211_SMPS_DYNAMIC                = 3,

        /* sentinel: must remain the final enumerator */
        IEEE80211_SMPS_NUM_MODES,
};

/**
 * struct ieee80211_conf - configuration of the device
 *
 * This struct indicates how the driver shall configure the hardware.
 *
 * @flags: configuration flags defined above (&enum ieee80211_conf_flags)
 *
 * @listen_interval: listen interval in units of beacon interval
 * @ps_dtim_period: The DTIM period of the AP we're connected to, for use
 *        in power saving. Power saving will not be enabled until a beacon
 *        has been received and the DTIM period is known.
 * @dynamic_ps_timeout: The dynamic powersave timeout (in ms), see the
 *        powersave documentation below. This variable is valid only when
 *        the %IEEE80211_CONF_PS flag is set.
 *
 * @power_level: requested transmit power (in dBm), backward compatibility
 *        value only that is set to the minimum of all interfaces
 *
 * @chandef: the channel definition to tune to
 * @radar_enabled: whether radar detection is enabled
 *
 * @long_frame_max_tx_count: Maximum number of transmissions for a "long" frame
 *        (a frame not RTS protected), called "dot11LongRetryLimit" in 802.11,
 *        but actually means the number of transmissions not the number of retries
 * @short_frame_max_tx_count: Maximum number of transmissions for a "short"
 *        frame, called "dot11ShortRetryLimit" in 802.11, but actually means the
 *        number of transmissions not the number of retries
 *
 * @smps_mode: spatial multiplexing powersave mode; note that
 *        %IEEE80211_SMPS_STATIC is used when the device is not
 *        configured for an HT channel.
 *        Note that this is only valid if channel contexts are not used,
 *        otherwise each channel context has the number of chains listed.
 */
struct ieee80211_conf {
        u32 flags;
        int power_level, dynamic_ps_timeout;

        u16 listen_interval;
        u8 ps_dtim_period;

        u8 long_frame_max_tx_count, short_frame_max_tx_count;

        struct cfg80211_chan_def chandef;
        bool radar_enabled;
        enum ieee80211_smps_mode smps_mode;
};

/**
 * struct ieee80211_channel_switch - holds the channel switch data
 *
 * The information provided in this structure is required for channel switch
 * operation.
 *
 * @timestamp: value in microseconds of the 64-bit Time Synchronization
 *        Function (TSF) timer when the frame containing the channel switch
 *        announcement was received. This is simply the rx.mactime parameter
 *        the driver passed into mac80211.
 * @device_timestamp: arbitrary timestamp for the device, this is the
 *        rx.device_timestamp parameter the driver passed to mac80211.
 * @block_tx: Indicates whether transmission must be blocked before the
 *        scheduled channel switch, as indicated by the AP.
 * @chandef: the new channel to switch to
 * @count: the number of TBTT's until the channel switch event
 * @delay: maximum delay between the time the AP transmitted the last beacon in
 *        current channel and the expected time of the first beacon in the new
 *        channel, expressed in TU.
 */
struct ieee80211_channel_switch {
        u64 timestamp;
        u32 device_timestamp;
        bool block_tx;
        struct cfg80211_chan_def chandef;
        u8 count;
        u32 delay;
};

/**
 * enum ieee80211_vif_flags - virtual interface flags
 *
 * @IEEE80211_VIF_BEACON_FILTER: the device performs beacon filtering
 *        on this virtual interface to avoid unnecessary CPU wakeups
 * @IEEE80211_VIF_SUPPORTS_CQM_RSSI: the device can do connection quality
 *        monitoring on this virtual interface -- i.e. it can monitor
 *        connection quality related parameters, such as the RSSI level and
 *        provide notifications if configured trigger levels are reached.
 * @IEEE80211_VIF_SUPPORTS_UAPSD: The device can do U-APSD for this
 *        interface. This flag should be set during interface addition,
 *        but may be set/cleared as late as authentication to an AP. It is
 *        only valid for managed/station mode interfaces.
 * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes
 *        and send P2P_PS notification to the driver if NOA changed, even
 *        if this is not a pure P2P vif.
 */
enum ieee80211_vif_flags {
        IEEE80211_VIF_BEACON_FILTER                = BIT(0),
        IEEE80211_VIF_SUPPORTS_CQM_RSSI                = BIT(1),
        IEEE80211_VIF_SUPPORTS_UAPSD                = BIT(2),
        IEEE80211_VIF_GET_NOA_UPDATE                = BIT(3),
};


/**
 * enum ieee80211_offload_flags - virtual interface offload flags
 *
 * These flags are used in the @offload_flags field of
 * &struct ieee80211_vif.
 *
 * @IEEE80211_OFFLOAD_ENCAP_ENABLED: tx encapsulation offload is enabled
 *        The driver supports sending frames passed as 802.3 frames by mac80211.
 *        It must also support sending 802.11 packets for the same interface.
 * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload
 */
enum ieee80211_offload_flags {
        IEEE80211_OFFLOAD_ENCAP_ENABLED                = BIT(0),
        IEEE80211_OFFLOAD_ENCAP_4ADDR                = BIT(1),
};

/**
 * struct ieee80211_vif - per-interface data
 *
 * Data in this structure is continually present for driver
 * use during the life of a virtual interface.
 *
 * @type: type of this virtual interface
 * @bss_conf: BSS configuration for this interface, either our own
 *        or the BSS we're associated to
 * @addr: address of this interface
 * @p2p: indicates whether this AP or STA interface is a p2p
 *        interface, i.e. a GO or p2p-sta respectively
 * @csa_active: marks whether a channel switch is going on. Internally it is
 *        write-protected by sdata_lock and local->mtx so holding either is fine
 *        for read access.
 * @mu_mimo_owner: indicates interface owns MU-MIMO capability
 * @driver_flags: flags/capabilities the driver has for this interface,
 *        these need to be set (or cleared) when the interface is added
 *        or, if supported by the driver, the interface type is changed
 *        at runtime, mac80211 will never touch this field
 * @offload_flags: hardware offload capabilities/flags for this interface
 *        (802.3 -> 802.11 encapsulation offload flags, see
 *        &enum ieee80211_offload_flags).
 *        These are initialized by mac80211 before calling .add_interface,
 *        .change_interface or .update_vif_offload and updated by the driver
 *        within these ops, based on supported features or runtime change
 *        restrictions.
 * @hw_queue: hardware queue for each AC
 * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only
 * @chanctx_conf: The channel context this interface is assigned to, or %NULL
 *        when it is not assigned. This pointer is RCU-protected due to the TX
 *        path needing to access it; even though the netdev carrier will always
 *        be off when it is %NULL there can still be races and packets could be
 *        processed after it switches back to %NULL.
 * @debugfs_dir: debugfs dentry, can be used by drivers to create own per
 *        interface debug files. Note that it will be NULL for the virtual
 *        monitor interface (if that is requested.)
 * @probe_req_reg: probe requests should be reported to mac80211 for this
 *        interface.
 * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211
 *        for this interface.
 * @drv_priv: data area for driver use, will always be aligned to
 *        sizeof(void \*).
 * @txq: the multicast data TX queue (if driver uses the TXQ abstraction)
 * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped,
 *        protected by fq->lock.
 */
struct ieee80211_vif {
        enum nl80211_iftype type;
        struct ieee80211_bss_conf bss_conf;
        u8 addr[ETH_ALEN] __aligned(2);
        bool p2p;
        bool csa_active;
        bool mu_mimo_owner;

        u8 cab_queue;
        u8 hw_queue[IEEE80211_NUM_ACS];

        struct ieee80211_txq *txq;

        struct ieee80211_chanctx_conf __rcu *chanctx_conf;

        u32 driver_flags;
        u32 offload_flags;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct dentry *debugfs_dir;
#endif

        bool probe_req_reg;
        bool rx_mcast_action_reg;

        bool txqs_stopped[IEEE80211_NUM_ACS];

        /* must be last */
        u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * ieee80211_vif_is_mesh - check whether a vif is a mesh point interface
 * @vif: the interface to check
 *
 * Return: %true if CONFIG_MAC80211_MESH is defined and @vif->type is
 * %NL80211_IFTYPE_MESH_POINT, %false otherwise.
 */
static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif)
{
#ifdef CONFIG_MAC80211_MESH
        if (vif->type == NL80211_IFTYPE_MESH_POINT)
                return true;
#endif
        return false;
}

/**
 * wdev_to_ieee80211_vif - return a vif struct from a wdev
 * @wdev: the wdev to get the vif for
 *
 * This can be used by mac80211 drivers with direct cfg80211 APIs
 * (like the vendor commands) that get a wdev.
 *
 * Note that this function may return %NULL if the given wdev isn't
 * associated with a vif that the driver knows about (e.g. monitor
 * or AP_VLAN interfaces.)
 *
 * Return: the vif for @wdev, or %NULL as described above.
 */
struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev);

/**
 * ieee80211_vif_to_wdev - return a wdev struct from a vif
 * @vif: the vif to get the wdev for
 *
 * This can be used by mac80211 drivers with direct cfg80211 APIs
 * (like the vendor commands) that need to get the wdev for a vif.
 *
 * Note that this function may return %NULL if the given wdev isn't
 * associated with a vif that the driver knows about (e.g. monitor
 * or AP_VLAN interfaces.)
 *
 * NOTE(review): the paragraph above uses the same wording as
 * wdev_to_ieee80211_vif(); for this direction the %NULL case presumably
 * concerns the given @vif -- confirm against the implementation.
 */
struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);

/**
 * enum ieee80211_key_flags - key flags
 *
 * These flags are used for communication about keys between the driver
 * and mac80211, with the @flags parameter of &struct ieee80211_key_conf.
 *
 * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the
 *        driver to indicate that it requires IV generation for this
 *        particular key. Setting this flag does not necessarily mean that SKBs
 *        will have sufficient tailroom for ICV or MIC.
 * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by
 *        the driver for a TKIP key if it requires Michael MIC
 *        generation in software.
 * @IEEE80211_KEY_FLAG_PAIRWISE: Set by mac80211, this flag indicates
 *        that the key is pairwise rather than a shared key.
 * @IEEE80211_KEY_FLAG_SW_MGMT_TX: This flag should be set by the driver for a
 *        CCMP/GCMP key if it requires CCMP/GCMP encryption of management frames
 *        (MFP) to be done in software.
 * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver
 *        if space should be prepared for the IV, but the IV
 *        itself should not be generated. Do not set together with
 *        @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does
 *        not necessarily mean that SKBs will have sufficient tailroom for ICV or
 *        MIC.
 * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received
 *        management frames. The flag can help drivers that have a hardware
 *        crypto implementation that doesn't deal with management frames
 *        properly by allowing them to not upload the keys to hardware and
 *        fall back to software crypto. Note that this flag deals only with
 *        RX, if your crypto engine can't deal with TX you can also set the
 *        %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW.
 * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the
 *        driver for a CCMP/GCMP key to indicate that it requires IV generation
 *        only for management frames (MFP).
 * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the
 *        driver for a key to indicate that sufficient tailroom must always
 *        be reserved for ICV or MIC, even when HW encryption is enabled.
 * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for
 *        a TKIP key if it only requires MIC space. Do not set together with
 *        @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key.
 * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation.
 * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver
 *        for an AES_CMAC key to indicate that it requires sequence number
 *        generation only
 */
enum ieee80211_key_flags {
        IEEE80211_KEY_FLAG_GENERATE_IV_MGMT        = BIT(0),
        IEEE80211_KEY_FLAG_GENERATE_IV                = BIT(1),
        IEEE80211_KEY_FLAG_GENERATE_MMIC        = BIT(2),
        IEEE80211_KEY_FLAG_PAIRWISE                = BIT(3),
        IEEE80211_KEY_FLAG_SW_MGMT_TX                = BIT(4),
        IEEE80211_KEY_FLAG_PUT_IV_SPACE                = BIT(5),
        IEEE80211_KEY_FLAG_RX_MGMT                = BIT(6),
        IEEE80211_KEY_FLAG_RESERVE_TAILROOM        = BIT(7),
        IEEE80211_KEY_FLAG_PUT_MIC_SPACE        = BIT(8),
        IEEE80211_KEY_FLAG_NO_AUTO_TX                = BIT(9),
        IEEE80211_KEY_FLAG_GENERATE_MMIE        = BIT(10),
};

/**
 * struct ieee80211_key_conf - key information
 *
 * This key information is given by mac80211 to the driver by
 * the set_key() callback in &struct ieee80211_ops.
 *
 * @hw_key_idx: To be set by the driver, this is the key index the driver
 *        wants to be given when a frame is transmitted and needs to be
 *        encrypted in hardware.
 * @cipher: The key's cipher suite selector.
 * @tx_pn: PN used for TX keys, may be used by the driver as well if it
 *        needs to do software PN assignment by itself (e.g. due to TSO)
 * @flags: key flags, see &enum ieee80211_key_flags.
 * @keyidx: the key index (0-3)
 * @keylen: key material length
 * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte)
 *        data block:
 *        - Temporal Encryption Key (128 bits)
 *        - Temporal Authenticator Tx MIC Key (64 bits)
 *        - Temporal Authenticator Rx MIC Key (64 bits)
 * @icv_len: The ICV length for this key type
 * @iv_len: The IV length for this key type
 */
struct ieee80211_key_conf {
        atomic64_t tx_pn;
        u32 cipher;
        u8 icv_len;
        u8 iv_len;
        u8 hw_key_idx;
        s8 keyidx;
        u16 flags;
        u8 keylen;
        /* flexible array member, so it has to stay the last field */
        u8 key[];
};

/* Size in bytes of the seq[] buffer in the hw member of struct ieee80211_key_seq */
#define IEEE80211_MAX_PN_LEN        16

/*
 * Split a TKIP packet number into its two IV parts: IV16 is bits 0-15 and
 * IV32 is bits 16-47 of @pn.
 *
 * The argument is parenthesized so that an expression argument is evaluated
 * as a whole before shifting/masking; without the parentheses an argument
 * such as "a | b" would expand to "a | b >> 16", i.e. "a | (b >> 16)".
 */
#define TKIP_PN_TO_IV16(pn) ((u16)((pn) & 0xffff))
#define TKIP_PN_TO_IV32(pn) ((u32)(((pn) >> 16) & 0xffffffff))

/**
 * struct ieee80211_key_seq - key sequence counter
 *
 * @tkip: TKIP data, containing IV32 and IV16 in host byte order
 * @ccmp: PN data, most significant byte first (big endian,
 *        reverse order than in packet)
 * @aes_cmac: PN data, most significant byte first (big endian,
 *        reverse order than in packet)
 * @aes_gmac: PN data, most significant byte first (big endian,
 *        reverse order than in packet)
 * @gcmp: PN data, most significant byte first (big endian,
 *        reverse order than in packet)
 * @hw: data for HW-only (e.g. cipher scheme) keys
 */
struct ieee80211_key_seq {
        /* anonymous union: the per-cipher members below share storage */
        union {
                struct {
                        u32 iv32;
                        u16 iv16;
                } tkip;
                struct {
                        u8 pn[6];
                } ccmp;
                struct {
                        u8 pn[6];
                } aes_cmac;
                struct {
                        u8 pn[6];
                } aes_gmac;
                struct {
                        u8 pn[6];
                } gcmp;
                struct {
                        u8 seq[IEEE80211_MAX_PN_LEN];
                        u8 seq_len;
                } hw;
        };
};

/**
 * struct ieee80211_cipher_scheme - cipher scheme
 *
 * This structure contains cipher scheme information defining
 * the secure packet crypto handling.
 *
 * @cipher: a cipher suite selector
 * @iftype: a cipher iftype bit mask indicating an allowed cipher usage
 * @hdr_len: the length of the security header used by the cipher
 * @pn_len: the length of the packet number in the security header
 * @pn_off: the offset of the pn from the beginning of the security header
 * @key_idx_off: the offset of the key index byte in the security header
 * @key_idx_mask: a bit mask of key_idx bits
 * @key_idx_shift: the bit shift needed to get key_idx
 *     key_idx value calculation:
 *      (sec_header_base[key_idx_off] & key_idx_mask) >> key_idx_shift
 * @mic_len: the mic length in bytes
 */
struct ieee80211_cipher_scheme {
        u32 cipher;
        u16 iftype;
        u8 hdr_len;
        u8 pn_len;
        u8 pn_off;
        u8 key_idx_off;
        u8 key_idx_mask;
        u8 key_idx_shift;
        u8 mic_len;
};

/**
 * enum set_key_cmd - key command
 *
 * Passed to the set_key() callback in &struct ieee80211_ops to tell
 * whether a key is being added or removed.
 *
 * @SET_KEY: a key is set
 * @DISABLE_KEY: a key must be disabled
 */
enum set_key_cmd {
        SET_KEY                = 0,
        DISABLE_KEY        = 1,
};

/**
 * enum ieee80211_sta_state - station state
 *
 * @IEEE80211_STA_NOTEXIST: station doesn't exist at all,
 *        this is a special state for add/remove transitions
 * @IEEE80211_STA_NONE: station exists without special state
 * @IEEE80211_STA_AUTH: station is authenticated
 * @IEEE80211_STA_ASSOC: station is associated
 * @IEEE80211_STA_AUTHORIZED: station is authorized (802.1X)
 */
enum ieee80211_sta_state {
        /* NOTE: These need to be ordered correctly! */
        IEEE80211_STA_NOTEXIST                = 0,
        IEEE80211_STA_NONE                = 1,
        IEEE80211_STA_AUTH                = 2,
        IEEE80211_STA_ASSOC                = 3,
        IEEE80211_STA_AUTHORIZED        = 4,
};

/**
 * enum ieee80211_sta_rx_bandwidth - station RX bandwidth
 * @IEEE80211_STA_RX_BW_20: station can only receive 20 MHz
 * @IEEE80211_STA_RX_BW_40: station can receive up to 40 MHz
 * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz
 * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz
 *        (including 80+80 MHz)
 *
 * Implementation note: the 20 MHz value must be zero so that it is
 *        initialized correctly, and the values must be sorted.
 */
enum ieee80211_sta_rx_bandwidth {
        IEEE80211_STA_RX_BW_20                = 0,
        IEEE80211_STA_RX_BW_40                = 1,
        IEEE80211_STA_RX_BW_80                = 2,
        IEEE80211_STA_RX_BW_160                = 3,
};

/**
 * struct ieee80211_sta_rates - station rate selection table
 *
 * @rcu_head: RCU head used for freeing the table on update
 * @rate: transmit rates/flags to be used by default
 *        (%IEEE80211_TX_RATE_TABLE_SIZE entries).
 *        Overriding entries per-packet is possible by using cb tx control.
 */
struct ieee80211_sta_rates {
        struct rcu_head rcu_head;
        struct {
                s8 idx;
                u8 count;
                u8 count_cts;
                u8 count_rts;
                u16 flags;
        } rate[IEEE80211_TX_RATE_TABLE_SIZE];
};

/**
 * struct ieee80211_sta_txpwr - station txpower configuration
 *
 * Used to configure txpower for station.
 *
 * @power: indicates the tx power, in dBm, to be used when sending data frames
 *        to the STA.
 * @type: the TPC type. If @type is %NL80211_TX_POWER_LIMITED then tx power
 *        will be less than or equal to specified from userspace, whereas if
 *        @type is %NL80211_TX_POWER_AUTOMATIC then it indicates default tx
 *        power. %NL80211_TX_POWER_FIXED is not a valid configuration option
 *        for per peer TPC.
 */
struct ieee80211_sta_txpwr {
        s16 power;
        enum nl80211_tx_power_setting type;
};

/**
 * struct ieee80211_sta - station table entry
 *
 * A station table entry represents a station we are possibly
 * communicating with. Since stations are RCU-managed in
 * mac80211, any ieee80211_sta pointer you get access to must
 * either be protected by rcu_read_lock() explicitly or implicitly,
 * or you must take good care to not use such a pointer after a
 * call to your sta_remove callback that removed it.
 *
 * @addr: MAC address
 * @aid: AID we assigned to the station if we're an AP
 * @supp_rates: Bitmap of supported rates (per band)
 * @ht_cap: HT capabilities of this STA; restricted to our own capabilities
 * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
 * @he_cap: HE capabilities of this STA
 * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
 * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU
 *        that this station is allowed to transmit to us.
 *        Can be modified by driver.
 * @wme: indicates whether the STA supports QoS/WME (if the local device does,
 *        otherwise always false)
 * @drv_priv: data area for driver use, will always be aligned to
 *        sizeof(void \*), size is determined in hw information.
 * @uapsd_queues: bitmap of queues configured for uapsd. Only valid
 *        if wme is supported. The bits order is like in
 *        IEEE80211_WMM_IE_STA_QOSINFO_AC_*.
 * @max_sp: max Service Period. Only valid if wme is supported.
 * @bandwidth: current bandwidth the station can receive with
 * @rx_nss: in HT/VHT, the maximum number of spatial streams the
 *        station can receive at the moment, changed by operating mode
 *        notifications and capabilities. The value is only valid after
 *        the station moves to associated state.
 * @smps_mode: current SMPS mode (off, static or dynamic)
 * @rates: rate control selection table
 * @tdls: indicates whether the STA is a TDLS peer
 * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
 *        valid if the STA is a TDLS peer in the first place.
 * @mfp: indicates whether the STA uses management frame protection or not.
 * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
 *        A-MSDU. Taken from the Extended Capabilities element. 0 means
 *        unlimited.
 * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
 * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
 * @max_tid_amsdu_len: Maximum A-MSDU size in bytes, one entry per TID
 * @txpwr: the station tx power configuration
 * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that
 *        the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames
 */
struct ieee80211_sta {
        u32 supp_rates[NUM_NL80211_BANDS];
        u8 addr[ETH_ALEN];
        u16 aid;
        struct ieee80211_sta_ht_cap ht_cap;
        struct ieee80211_sta_vht_cap vht_cap;
        struct ieee80211_sta_he_cap he_cap;
        struct ieee80211_he_6ghz_capa he_6ghz_capa;
        u16 max_rx_aggregation_subframes;
        bool wme;
        u8 uapsd_queues;
        u8 max_sp;
        u8 rx_nss;
        enum ieee80211_sta_rx_bandwidth bandwidth;
        enum ieee80211_smps_mode smps_mode;
        struct ieee80211_sta_rates __rcu *rates;
        bool tdls;
        bool tdls_initiator;
        bool mfp;
        u8 max_amsdu_subframes;

        /**
         * @max_amsdu_len:
         * indicates the maximal length of an A-MSDU in bytes.
         * This field is always valid for packets with a VHT preamble.
         * For packets with a HT preamble, additional limits apply:
         *
         * * If the skb is transmitted as part of a BA agreement, the
         *   A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
         * * If the skb is not part of a BA agreement, the A-MSDU maximal
         *   size is min(max_amsdu_len, 7935) bytes.
         *
         * Both additional HT limits must be enforced by the low level
         * driver. This is defined by the spec (IEEE 802.11-2012 section
         * 8.3.2.2 NOTE 2).
         */
        u16 max_amsdu_len;
        bool support_p2p_ps;
        u16 max_rc_amsdu_len;
        u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
        struct ieee80211_sta_txpwr txpwr;

        struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];

        /* must be last */
        u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum sta_notify_cmd - sta notify command
 *
 * Passed to the sta_notify() callback in &struct ieee80211_ops to tell
 * whether an associated station made a power state transition.
 *
 * @STA_NOTIFY_SLEEP: a station is now sleeping
 * @STA_NOTIFY_AWAKE: a sleeping station woke up
 */
enum sta_notify_cmd {
        STA_NOTIFY_SLEEP        = 0,
        STA_NOTIFY_AWAKE        = 1,
};

/**
 * struct ieee80211_tx_control - TX control data
 *
 * @sta: station table entry. This pointer may be NULL, and because of
 *         RCU it is not allowed to copy the pointer.
 */
struct ieee80211_tx_control {
        struct ieee80211_sta *sta;
};

/**
 * struct ieee80211_txq - Software intermediate tx queue
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @sta: station table entry, %NULL for per-vif queue
 * @tid: the TID for this queue (unused for per-vif queue),
 *        %IEEE80211_NUM_TIDS for non-data (if enabled)
 * @ac: the AC for this queue
 * @drv_priv: driver private area, sized by hw->txq_data_size
 *
 * The driver can obtain packets from this queue by calling
 * ieee80211_tx_dequeue().
 */
struct ieee80211_txq {
        struct ieee80211_vif *vif;
        struct ieee80211_sta *sta;
        u8 tid;
        u8 ac;

        /*
         * must be last: flexible array member for the driver private
         * area, aligned to pointer size
         */
        u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_hw_flags - hardware flags
 *
 * These flags are used to indicate hardware capabilities to
 * the stack. Generally, flags here should have their meaning
 * done in a way that the simplest hardware doesn't need setting
 * any particular flags. There are some exceptions to this rule,
 * however, so you are advised to review these flags carefully.
 *
 * @IEEE80211_HW_HAS_RATE_CONTROL:
 *        The hardware or firmware includes rate control, and cannot be
 *        controlled by the stack. As such, no rate control algorithm
 *        should be instantiated, and the TX rate reported to userspace
 *        will be taken from the TX status instead of the rate control
 *        algorithm.
 *        Note that this requires that the driver implement a number of
 *        callbacks so it has the correct information, it needs to have
 *        the @set_rts_threshold callback and must look at the BSS config
 *        @use_cts_prot for G/N protection, @use_short_slot for slot
 *        timing in 2.4 GHz and @use_short_preamble for preambles for
 *        CCK frames.
 *
 * @IEEE80211_HW_RX_INCLUDES_FCS:
 *        Indicates that received frames passed to the stack include
 *        the FCS at the end.
 *
 * @IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING:
 *        Some wireless LAN chipsets buffer broadcast/multicast frames
 *        for power saving stations in the hardware/firmware and others
 *        rely on the host system for such buffering. This option is used
 *        to configure the IEEE 802.11 upper layer to buffer broadcast and
 *        multicast frames when there are power saving stations so that
 *        the driver can fetch them with ieee80211_get_buffered_bc().
 *
 * @IEEE80211_HW_SIGNAL_UNSPEC:
 *        Hardware can provide signal values but we don't know their units. We
 *        expect values between 0 and @max_signal.
 *        If possible please provide dB or dBm instead.
 *
 * @IEEE80211_HW_SIGNAL_DBM:
 *        Hardware gives signal values in dBm, decibel difference from
 *        one milliwatt. This is the preferred method since it is standardized
 *        between different devices. @max_signal does not need to be set.
 *
 * @IEEE80211_HW_SPECTRUM_MGMT:
 *         Hardware supports spectrum management defined in 802.11h
 *         Measurement, Channel Switch, Quieting, TPC
 *
 * @IEEE80211_HW_AMPDU_AGGREGATION:
 *        Hardware supports 11n A-MPDU aggregation.
 *
 * @IEEE80211_HW_SUPPORTS_PS:
 *        Hardware has power save support (i.e. can go to sleep).
 *
 * @IEEE80211_HW_PS_NULLFUNC_STACK:
 *        Hardware requires nullfunc frame handling in stack, implies
 *        stack support for dynamic PS.
 *
 * @IEEE80211_HW_SUPPORTS_DYNAMIC_PS:
 *        Hardware has support for dynamic PS.
 *
 * @IEEE80211_HW_MFP_CAPABLE:
 *        Hardware supports management frame protection (MFP, IEEE 802.11w).
 *
 * @IEEE80211_HW_REPORTS_TX_ACK_STATUS:
 *        Hardware can provide ack status reports of Tx frames to
 *        the stack.
 *
 * @IEEE80211_HW_CONNECTION_MONITOR:
 *        The hardware performs its own connection monitoring, including
 *        periodic keep-alives to the AP and probing the AP on beacon loss.
 *
 * @IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC:
 *        This device needs to get data from beacon before association (i.e.
 *        dtim_period).
 *
 * @IEEE80211_HW_SUPPORTS_PER_STA_GTK: The device's crypto engine supports
 *        per-station GTKs as used by IBSS RSN or during fast transition. If
 *        the device doesn't support per-station GTKs, but can be asked not
 *        to decrypt group addressed frames, then IBSS RSN support is still
 *        possible but software crypto will be used. Advertise the wiphy flag
 *        only in that case.
 *
 * @IEEE80211_HW_AP_LINK_PS: When operating in AP mode the device
 *        autonomously manages the PS status of connected stations. When
 *        this flag is set mac80211 will not trigger PS mode for connected
 *        stations based on the PM bit of incoming frames.
 *        Use ieee80211_start_ps()/ieee80211_end_ps() to manually configure
 *        the PS mode of connected stations.
 *
 * @IEEE80211_HW_TX_AMPDU_SETUP_IN_HW: The device handles TX A-MPDU session
 *        setup strictly in HW. mac80211 should not attempt to do this in
 *        software.
 *
 * @IEEE80211_HW_WANT_MONITOR_VIF: The driver would like to be informed of
 *        a virtual monitor interface when monitor interfaces are the only
 *        active interfaces.
 *
 * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to
 *        be created.  It is expected user-space will create vifs as
 *        desired (and thus have them named as desired).
 *
 * @IEEE80211_HW_SW_CRYPTO_CONTROL: The driver wants to control which of the
 *        crypto algorithms can be done in software - so don't automatically
 *        try to fall back to it if hardware crypto fails, but do so only if
 *        the driver returns 1. This also forces the driver to advertise its
 *        supported cipher suites.
 *
 * @IEEE80211_HW_SUPPORT_FAST_XMIT: The driver/hardware supports fast-xmit,
 *        this currently requires only the ability to calculate the duration
 *        for frames.
 *
 * @IEEE80211_HW_QUEUE_CONTROL: The driver wants to control per-interface
 *        queue mapping in order to use different queues (not just one per AC)
 *        for different virtual interfaces. See the doc section on HW queue
 *        control for more details.
 *
 * @IEEE80211_HW_SUPPORTS_RC_TABLE: The driver supports using a rate
 *        selection table provided by the rate control algorithm.
 *
 * @IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF: Use the P2P Device address for any
 *        P2P Interface. This will be honoured even if more than one interface
 *        is supported.
 *
 * @IEEE80211_HW_TIMING_BEACON_ONLY: Use sync timing from beacon frames
 *        only, to allow getting TBTT of a DTIM beacon.
 *
 * @IEEE80211_HW_SUPPORTS_HT_CCK_RATES: Hardware supports mixing HT/CCK rates
 *        and can cope with CCK rates in an aggregation session (e.g. by not
 *        using aggregation for such frames.)
 *
 * @IEEE80211_HW_CHANCTX_STA_CSA: Support 802.11h based channel-switch (CSA)
 *        for a single active channel while using channel contexts. When support
 *        is not enabled the default action is to disconnect when getting the
 *        CSA frame.
 *
 * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload
 *        or tailroom of TX skbs without copying them first.
 *
 * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands
 *        in one command, mac80211 doesn't have to run separate scans per band.
 *
 * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth
 *        than the BSS bandwidth for a TDLS link on the base channel.
 *
 * @IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU: The driver supports receiving A-MSDUs
 *        within A-MPDU.
 *
 * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status
 *        for sent beacons.
 *
 * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each
 *        station has a unique address, i.e. each station entry can be identified
 *        by just its MAC address; this prevents, for example, the same station
 *        from connecting to two virtual AP interfaces at the same time.
 *
 * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the
 *        reordering buffer internally, guaranteeing mac80211 receives frames in
 *        order and does not need to manage its own reorder buffer or BA session
 *        timeout.
 *
 * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX,
 *        which implies using per-CPU station statistics.
 *
 * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
 *        A-MSDU frames. Requires software tx queueing and fast-xmit support.
 *        When not using minstrel/minstrel_ht rate control, the driver must
 *        limit the maximum A-MSDU size based on the current tx rate by setting
 *        max_rc_amsdu_len in struct ieee80211_sta.
 *
 * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
 *        skbs, needed for zero-copy software A-MSDU.
 *
 * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event
 *        by ieee80211_report_low_ack() based on its own algorithm. For such
 *        drivers, mac80211 packet loss mechanism will not be triggered and driver
 *        is completely depending on firmware event for station kickout.
 *
 * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself.
 *        The stack will not do fragmentation.
 *        The callback for @set_frag_threshold should be set as well.
 *
 * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
 *        TDLS links.
 *
 * @IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP: The driver requires the
 *        mgd_prepare_tx() callback to be called before transmission of a
 *        deauthentication frame in case the association was completed but no
 *        beacon was heard. This is required in multi-channel scenarios, where the
 *        virtual interface might not be given air time for the transmission of
 *        the frame, as it is not synced with the AP/P2P GO yet, and thus the
 *        deauthentication frame might not be transmitted.
 *
 * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't
 *        support QoS NDP for AP probing - that's most likely a driver bug.
 *
 * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of
 *        course requires the driver to use TXQs to start with.
 *
 * @IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW: (Hardware) rate control supports VHT
 *        extended NSS BW (dot11VHTExtendedNSSBWCapable). This flag will be set if
 *        the selected rate control algorithm sets %RATE_CTRL_CAPA_VHT_EXT_NSS_BW
 *        but if the rate control is built-in then it must be set by the driver.
 *        See also the documentation for that flag.
 *
 * @IEEE80211_HW_STA_MMPDU_TXQ: use the extra non-TID per-station TXQ for all
 *        MMPDUs on station interfaces. This of course requires the driver to use
 *        TXQs to start with.
 *
 * @IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN: Driver does not report accurate A-MPDU
 *        length in tx status information
 *
 * @IEEE80211_HW_SUPPORTS_MULTI_BSSID: Hardware supports multi BSSID
 *
 * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID
 *        only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set.
 *
 * @IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT: The card and driver is only
 *        aggregating MPDUs with the same keyid, allowing mac80211 to keep Tx
 *        A-MPDU sessions active while rekeying with Extended Key ID.
 *
 * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation
 *        offload
 *
 * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
 */
enum ieee80211_hw_flags {
        /*
         * Each enumerator is a bit number, not a mask: the helpers
         * _ieee80211_hw_check()/_ieee80211_hw_set() hand it to
         * test_bit()/__set_bit() on the flags bitmap of
         * struct ieee80211_hw.
         */
        IEEE80211_HW_HAS_RATE_CONTROL,
        IEEE80211_HW_RX_INCLUDES_FCS,
        IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING,
        IEEE80211_HW_SIGNAL_UNSPEC,
        IEEE80211_HW_SIGNAL_DBM,
        IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC,
        IEEE80211_HW_SPECTRUM_MGMT,
        IEEE80211_HW_AMPDU_AGGREGATION,
        IEEE80211_HW_SUPPORTS_PS,
        IEEE80211_HW_PS_NULLFUNC_STACK,
        IEEE80211_HW_SUPPORTS_DYNAMIC_PS,
        IEEE80211_HW_MFP_CAPABLE,
        IEEE80211_HW_WANT_MONITOR_VIF,
        IEEE80211_HW_NO_AUTO_VIF,
        IEEE80211_HW_SW_CRYPTO_CONTROL,
        IEEE80211_HW_SUPPORT_FAST_XMIT,
        IEEE80211_HW_REPORTS_TX_ACK_STATUS,
        IEEE80211_HW_CONNECTION_MONITOR,
        IEEE80211_HW_QUEUE_CONTROL,
        IEEE80211_HW_SUPPORTS_PER_STA_GTK,
        IEEE80211_HW_AP_LINK_PS,
        IEEE80211_HW_TX_AMPDU_SETUP_IN_HW,
        IEEE80211_HW_SUPPORTS_RC_TABLE,
        IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF,
        IEEE80211_HW_TIMING_BEACON_ONLY,
        IEEE80211_HW_SUPPORTS_HT_CCK_RATES,
        IEEE80211_HW_CHANCTX_STA_CSA,
        IEEE80211_HW_SUPPORTS_CLONED_SKBS,
        IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS,
        IEEE80211_HW_TDLS_WIDER_BW,
        IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU,
        IEEE80211_HW_BEACON_TX_STATUS,
        IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
        IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
        IEEE80211_HW_USES_RSS,
        IEEE80211_HW_TX_AMSDU,
        IEEE80211_HW_TX_FRAG_LIST,
        IEEE80211_HW_REPORTS_LOW_ACK,
        IEEE80211_HW_SUPPORTS_TX_FRAG,
        IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
        IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP,
        IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
        IEEE80211_HW_BUFF_MMPDU_TXQ,
        IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW,
        IEEE80211_HW_STA_MMPDU_TXQ,
        IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN,
        IEEE80211_HW_SUPPORTS_MULTI_BSSID,
        IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID,
        IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT,
        IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD,

        /*
         * keep last: this is the number of flags and sizes the flags
         * bitmap in struct ieee80211_hw
         */
        NUM_IEEE80211_HW_FLAGS
};

/**
 * struct ieee80211_hw - hardware information and state
 *
 * This structure contains the configuration and hardware
 * information for an 802.11 PHY.
 *
 * @wiphy: This points to the &struct wiphy allocated for this
 *        802.11 PHY. You must fill in the @perm_addr and @dev
 *        members of this structure using SET_IEEE80211_DEV()
 *        and SET_IEEE80211_PERM_ADDR(). Additionally, all supported
 *        bands (with channels, bitrates) are registered here.
 *
 * @conf: &struct ieee80211_conf, device configuration, don't use.
 *
 * @priv: pointer to private area that was allocated for driver use
 *        along with this structure.
 *
 * @flags: hardware flags, see &enum ieee80211_hw_flags.
 *
 * @extra_tx_headroom: headroom to reserve in each transmit skb
 *        for use by the driver (e.g. for transmit headers.)
 *
 * @extra_beacon_tailroom: tailroom to reserve in each beacon tx skb.
 *        Can be used by drivers to add extra IEs.
 *
 * @max_signal: Maximum value for signal (rssi) in RX information, used
 *        only when @IEEE80211_HW_SIGNAL_UNSPEC is set.
 *
 * @max_listen_interval: max listen interval in units of beacon interval
 *        that HW supports
 *
 * @queues: number of available hardware transmit queues for
 *        data packets. WMM/QoS requires at least four, these
 *        queues need to have configurable access parameters.
 *
 * @rate_control_algorithm: rate control algorithm for this hardware.
 *        If unset (NULL), the default algorithm will be used. Must be
 *        set before calling ieee80211_register_hw().
 *
 * @vif_data_size: size (in bytes) of the drv_priv data area
 *        within &struct ieee80211_vif.
 * @sta_data_size: size (in bytes) of the drv_priv data area
 *        within &struct ieee80211_sta.
 * @chanctx_data_size: size (in bytes) of the drv_priv data area
 *        within &struct ieee80211_chanctx_conf.
 * @txq_data_size: size (in bytes) of the drv_priv data area
 *        within &struct ieee80211_txq.
 *
 * @max_rates: maximum number of alternate rate retry stages the hw
 *        can handle.
 * @max_report_rates: maximum number of alternate rate retry stages
 *        the hw can report back.
 * @max_rate_tries: maximum number of tries for each stage
 *
 * @max_rx_aggregation_subframes: maximum buffer size (number of
 *        sub-frames) to be used for A-MPDU block ack receiver
 *        aggregation.
 *        This is only relevant if the device has restrictions on the
 *        number of subframes, if it relies on mac80211 to do reordering
 *        it shouldn't be set.
 *
 * @max_tx_aggregation_subframes: maximum number of subframes in an
 *        aggregate an HT/HE device will transmit. In HT AddBA we'll
 *        advertise a constant value of 64 as some older APs crash if
 *        the window size is smaller (an example is LinkSys WRT120N
 *        with FW v1.0.07 build 002 Jun 18 2012).
 *        For AddBA to HE capable peers this value will be used.
 *
 * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum
 *        of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list.
 *
 * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
 *        (if %IEEE80211_HW_QUEUE_CONTROL is set)
 *
 * @radiotap_mcs_details: lists which MCS information the HW
 *        reports, by default it is set to _MCS, _GI and _BW but doesn't
 *        include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_\* values, only
 *        adding _BW is supported today.
 *
 * @radiotap_vht_details: lists which VHT MCS information the HW reports,
 *        the default is _GI | _BANDWIDTH.
 *        Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values.
 *
 * @radiotap_he: HE radiotap validity flags
 *
 * @radiotap_timestamp: Information for the radiotap timestamp field; if the
 *        @units_pos member is set to a non-negative value then the timestamp
 *        field will be added and populated from the &struct ieee80211_rx_status
 *        device_timestamp.
 * @radiotap_timestamp.units_pos: Must be set to a combination of a
 *        IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a
 *        IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value.
 * @radiotap_timestamp.accuracy: If non-negative, fills the accuracy in the
 *        radiotap field and the accuracy known flag will be set.
 *
 * @netdev_features: netdev features to be set in each netdev created
 *        from this HW. Note that not all features are usable with mac80211,
 *        other features will be rejected during HW registration.
 *
 * @uapsd_queues: This bitmap is included in (re)association frame to indicate
 *        for each access category if it is uAPSD trigger-enabled and delivery-
 *        enabled. Use IEEE80211_WMM_IE_STA_QOSINFO_AC_* to set this bitmap.
 *        Each bit corresponds to different AC. Value '1' in specific bit means
 *        that corresponding AC is both trigger- and delivery-enabled. '0' means
 *        neither enabled.
 *
 * @uapsd_max_sp_len: maximum number of total buffered frames the WMM AP may
 *        deliver to a WMM STA during any Service Period triggered by the WMM STA.
 *        Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values.
 *
 * @n_cipher_schemes: the size of the array of cipher scheme definitions.
 * @cipher_schemes: a pointer to an array of cipher scheme definitions
 *        supported by HW.
 * @max_nan_de_entries: maximum number of NAN DE functions supported by the
 *        device.
 *
 * @tx_sk_pacing_shift: Pacing shift to set on TCP sockets when frames from
 *        them are encountered. The default should typically not be changed,
 *        unless the driver has good reasons for needing more buffers.
 *
 * @weight_multiplier: Driver specific airtime weight multiplier used while
 *        refilling deficit of each TXQ.
 *
 * @max_mtu: the maximum MTU that can be set.
 */
struct ieee80211_hw {
        struct ieee80211_conf conf;
        struct wiphy *wiphy;
        const char *rate_control_algorithm;
        void *priv;
        /*
         * Bitmap with one bit per &enum ieee80211_hw_flags value; read and
         * written through ieee80211_hw_check()/ieee80211_hw_set().
         */
        unsigned long flags[BITS_TO_LONGS(NUM_IEEE80211_HW_FLAGS)];
        unsigned int extra_tx_headroom;
        unsigned int extra_beacon_tailroom;
        /* sizes in bytes of the drv_priv areas of vif/sta/chanctx/txq */
        int vif_data_size;
        int sta_data_size;
        int chanctx_data_size;
        int txq_data_size;
        u16 queues;
        u16 max_listen_interval;
        s8 max_signal;
        u8 max_rates;
        u8 max_report_rates;
        u8 max_rate_tries;
        u16 max_rx_aggregation_subframes;
        u16 max_tx_aggregation_subframes;
        u8 max_tx_fragments;
        u8 offchannel_tx_hw_queue;
        u8 radiotap_mcs_details;
        u16 radiotap_vht_details;
        /*
         * NOTE(review): the kerneldoc above documents @radiotap_he, but no
         * member of that name is declared in this struct - confirm whether
         * the doc entry is stale.
         */
        struct {
                int units_pos;
                s16 accuracy;
        } radiotap_timestamp;
        netdev_features_t netdev_features;
        u8 uapsd_queues;
        u8 uapsd_max_sp_len;
        u8 n_cipher_schemes;
        const struct ieee80211_cipher_scheme *cipher_schemes;
        u8 max_nan_de_entries;
        u8 tx_sk_pacing_shift;
        u8 weight_multiplier;
        u32 max_mtu;
};

/*
 * Report whether hardware flag @flg is set in @hw's flags bitmap.
 * Callers normally go through the ieee80211_hw_check() wrapper below,
 * which pastes the IEEE80211_HW_ prefix onto the short flag name.
 */
static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
                                       enum ieee80211_hw_flags flg)
{
        const unsigned long *bitmap = hw->flags;

        return test_bit(flg, bitmap);
}
#define ieee80211_hw_check(hw, flg)        _ieee80211_hw_check(hw, IEEE80211_HW_##flg)

/*
 * Set hardware flag @flg in @hw's flags bitmap via __set_bit().
 * Callers normally go through the ieee80211_hw_set() wrapper below,
 * which pastes the IEEE80211_HW_ prefix onto the short flag name.
 *
 * NOTE(review): __set_bit() is presumably the non-atomic bit setter, so
 * flags are likely expected to be set before the hw is shared - confirm.
 */
static inline void _ieee80211_hw_set(struct ieee80211_hw *hw,
                                     enum ieee80211_hw_flags flg)
{
        /* no 'return': a void function must not return an expression */
        __set_bit(flg, hw->flags);
}
#define ieee80211_hw_set(hw, flg)        _ieee80211_hw_set(hw, IEEE80211_HW_##flg)

/**
 * struct ieee80211_scan_request - hw scan request
 *
 * @ies: pointers to different parts of IEs (in req.ie)
 * @req: cfg80211 request.
 */
struct ieee80211_scan_request {
        struct ieee80211_scan_ies ies;

        /*
         * Keep last (presumably because struct cfg80211_scan_request ends
         * in variable-length data - confirm)
         */
        struct cfg80211_scan_request req;
};

/**
 * struct ieee80211_tdls_ch_sw_params - TDLS channel switch parameters
 *
 * Bundles what describes one TDLS channel-switch request or response:
 * the peer, the channel, the timing values taken from the frame and a
 * response template.
 *
 * @sta: peer this TDLS channel-switch request/response came from
 * @chandef: channel referenced in a TDLS channel-switch request
 * @action_code: see &enum ieee80211_tdls_actioncode
 * @status: channel-switch response status
 * @timestamp: time at which the frame was received
 * @switch_time: switch-timing parameter received in the frame
 * @switch_timeout: switch-timing parameter received in the frame
 * @tmpl_skb: TDLS switch-channel response template
 * @ch_sw_tm_ie: offset of the channel-switch timing IE inside @tmpl_skb
 */
struct ieee80211_tdls_ch_sw_params {
        struct ieee80211_sta *sta;
        struct cfg80211_chan_def *chandef;
        u8 action_code;
        u32 status;
        u32 timestamp;
        u16 switch_time;
        u16 switch_timeout;
        struct sk_buff *tmpl_skb;
        u32 ch_sw_tm_ie;
};

/**
 * wiphy_to_ieee80211_hw - return a mac80211 driver hw struct from a wiphy
 *
 * @wiphy: the &struct wiphy which we want to query
 *
 * mac80211 drivers can use this to get to their respective
 * &struct ieee80211_hw. Drivers wishing to get to their own private
 * structure can then access it via hw->priv. Note that mac80211 drivers should
 * not use wiphy_priv() to try to get their private driver structure as this
 * is already used internally by mac80211.
 *
 * Return: The mac80211 driver hw struct of @wiphy.
 */
struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy);

/**
 * SET_IEEE80211_DEV - set device for 802.11 hardware
 *
 * Hands @dev to set_wiphy_dev() for the wiphy that @hw points to.
 *
 * @hw: the &struct ieee80211_hw whose device is being set
 * @dev: the &struct device belonging to this 802.11 device
 */
static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev)
{
        struct wiphy *wiphy = hw->wiphy;

        set_wiphy_dev(wiphy, dev);
}

/**
 * SET_IEEE80211_PERM_ADDR - set the permanent MAC address for 802.11 hardware
 *
 * @hw: the &struct ieee80211_hw to set the MAC address for
 * @addr: the address to set
 */
static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr)
{
        memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN);
}

/*
 * Look up the bitrate entry selected by the first control rate of @c in
 * the bitrate table of the band @c refers to. A negative index trips
 * WARN_ON_ONCE() and yields NULL.
 */
static inline struct ieee80211_rate *
ieee80211_get_tx_rate(const struct ieee80211_hw *hw,
                      const struct ieee80211_tx_info *c)
{
        struct ieee80211_rate *rates;

        if (WARN_ON_ONCE(c->control.rates[0].idx < 0))
                return NULL;

        /* only touch the band table once the index is known to be valid */
        rates = hw->wiphy->bands[c->band]->bitrates;
        return rates + c->control.rates[0].idx;
}

/*
 * Look up the bitrate entry selected by the RTS/CTS rate index of @c in
 * the bitrate table of the band @c refers to. Returns NULL when that
 * index is negative.
 */
static inline struct ieee80211_rate *
ieee80211_get_rts_cts_rate(const struct ieee80211_hw *hw,
                           const struct ieee80211_tx_info *c)
{
        struct ieee80211_rate *rates;

        if (c->control.rts_cts_rate_idx < 0)
                return NULL;

        /* only touch the band table once the index is known to be valid */
        rates = hw->wiphy->bands[c->band]->bitrates;
        return rates + c->control.rts_cts_rate_idx;
}

/*
 * Look up the bitrate entry for the control rate that follows position
 * @idx (i.e. control rate @idx + 1) in the bitrate table of the band @c
 * refers to. Returns NULL when that rate's index is negative.
 *
 * NOTE(review): @idx + 1 is not range-checked here; callers presumably
 * guarantee it stays inside the control rates array - confirm.
 */
static inline struct ieee80211_rate *
ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw,
                             const struct ieee80211_tx_info *c, int idx)
{
        const int slot = idx + 1;
        struct ieee80211_rate *rates;

        if (c->control.rates[slot].idx < 0)
                return NULL;

        /* only touch the band table once the index is known to be valid */
        rates = hw->wiphy->bands[c->band]->bitrates;
        return rates + c->control.rates[slot].idx;
}

/**
 * ieee80211_free_txskb - free TX skb
 * @hw: the hardware
 * @skb: the transmit skb to free
 *
 * Free a transmit skb. Use this function when some failure
 * to transmit happened and thus status cannot be reported.
 */
void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);

/**
 * DOC: Hardware crypto acceleration
 *
 * mac80211 is capable of taking advantage of many hardware
 * acceleration designs for encryption and decryption operations.
 *
 * The set_key() callback in the &struct ieee80211_ops for a given
 * device is called to enable hardware acceleration of encryption and
 * decryption. The callback takes a @sta parameter that will be NULL
 * for default keys or keys used for transmission only, or point to
 * the station information for the peer for individual keys.
 * Multiple transmission keys with the same key index may be used when
 * VLANs are configured for an access point.
 *
 * When transmitting, the TX control data will use the @hw_key_idx
 * selected by the driver by modifying the &struct ieee80211_key_conf
 * pointed to by the @key parameter to the set_key() function.
 *
 * The set_key() call for the %SET_KEY command should return 0 if
 * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be
 * added; if you return 0 then hw_key_idx must be assigned to the
 * hardware key index, you are free to use the full u8 range.
 *
 * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is
 * set, mac80211 will not automatically fall back to software crypto if
 * enabling hardware crypto failed. The set_key() call may also return the
 * value 1 to permit this specific key/algorithm to be done in software.
 *
 * When the cmd is %DISABLE_KEY then it must succeed.
 *
 * Note that it is permissible to not decrypt a frame even if a key
 * for it has been uploaded to hardware, the stack will not make any
 * decision based on whether a key has been uploaded or not but rather
 * based on the receive flags.
 *
 * The &struct ieee80211_key_conf structure pointed to by the @key
 * parameter is guaranteed to be valid until another call to set_key()
 * removes it, but it can only be used as a cookie to differentiate
 * keys.
 *
 * In TKIP some HW need to be provided a phase 1 key, for RX decryption
 * acceleration (i.e. iwlwifi). Those drivers should provide update_tkip_key
 * handler.
 * The update_tkip_key() call updates the driver with the new phase 1 key.
 * This happens every time the iv16 wraps around (every 65536 packets). The
 * set_key() call will happen only once for each key (unless the AP did
 * rekeying), it will not include a valid phase 1 key. The valid phase 1 key is
 * provided by update_tkip_key only. The trigger that makes mac80211 call this
 * handler is software decryption with wrap around of iv16.
 *
 * The set_default_unicast_key() call updates the default WEP key index
 * configured to the hardware for WEP encryption type. This is required
 * for devices that support offload of data packets (e.g. ARP responses).
 *
 * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag
 * when they are able to replace in-use PTK keys according to the following
 * requirements:
 * 1) They do not hand over frames decrypted with the old key to
 *    mac80211 once the call to set_key() with command %DISABLE_KEY has been
 *    completed when also setting @IEEE80211_KEY_FLAG_GENERATE_IV for any key,
 * 2) either drop or continue to use the old key for any outgoing frames queued
 *    at the time of the key deletion (including re-transmits),
 * 3) never send out a frame queued prior to the set_key() %SET_KEY command
 *    encrypted with the new key and
 * 4) never send out a frame unencrypted when it should be encrypted.
 *
 * Mac80211 will not queue any new frames for a deleted key to the driver.
 */

/**
 * DOC: Powersave support
 *
 * mac80211 has support for various powersave implementations.
 *
 * First, it can support hardware that handles all powersaving by itself,
 * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware
 * flag. In that case, it will be told about the desired powersave mode
 * with the %IEEE80211_CONF_PS flag depending on the association status.
 * The hardware must take care of sending nullfunc frames when necessary,
 * i.e. when entering and leaving powersave mode. The hardware is required
 * to look at the AID in beacons and signal to the AP that it woke up when
 * it finds traffic directed to it.
 *
 * %IEEE80211_CONF_PS flag enabled means that the powersave mode defined in
 * IEEE 802.11-2007 section 11.2 is enabled. This is not to be confused
 * with hardware wakeup and sleep states. Driver is responsible for waking
 * up the hardware before issuing commands to the hardware and putting it
 * back to sleep at appropriate times.
 *
 * When PS is enabled, hardware needs to wakeup for beacons and receive the
 * buffered multicast/broadcast frames after the beacon. Also it must be
 * possible to send frames and receive the acknowledgment frame.
 *
 * Other hardware designs cannot send nullfunc frames by themselves and also
 * need software support for parsing the TIM bitmap. This is also supported
 * by mac80211 by combining the %IEEE80211_HW_SUPPORTS_PS and
 * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still
 * required to pass up beacons. The hardware is still required to handle
 * waking up for multicast traffic; if it cannot the driver must handle that
 * as best as it can, mac80211 is too slow to do that.
 *
 * Dynamic powersave is an extension to normal powersave in which the
 * hardware stays awake for a user-specified period of time after sending a
 * frame so that reply frames need not be buffered and therefore delayed to
 * the next wakeup. It's a compromise between getting good enough latency
 * when there's data traffic and still saving significant power in idle
 * periods.
 *
 * Dynamic powersave is simply supported by mac80211 enabling and disabling
 * PS based on traffic. Driver needs to only set %IEEE80211_HW_SUPPORTS_PS
 * flag and mac80211 will handle everything automatically. Additionally,
 * hardware having support for the dynamic PS feature may set the
 * %IEEE80211_HW_SUPPORTS_DYNAMIC_PS flag to indicate that it can support
 * dynamic PS mode itself. The driver needs to look at the
 * @dynamic_ps_timeout hardware configuration value and use that value
 * whenever %IEEE80211_CONF_PS is set. In this case mac80211 will disable
 * dynamic PS feature in stack and will just keep %IEEE80211_CONF_PS
 * enabled whenever user has enabled powersave.
 *
 * Driver informs U-APSD client support by enabling
 * %IEEE80211_VIF_SUPPORTS_UAPSD flag. The mode is configured through the
 * uapsd parameter in conf_tx() operation. Hardware needs to send the QoS
 * Nullfunc frames and stay awake until the service period has ended. To
 * utilize U-APSD, dynamic powersave is disabled for voip AC and all frames
 * from that AC are transmitted with powersave enabled.
 *
 * Note: U-APSD client mode is not yet supported with
 * %IEEE80211_HW_PS_NULLFUNC_STACK.
 */

/**
 * DOC: Beacon filter support
 *
 * Some hardware has beacon filter support to reduce host cpu wakeups
 * which will reduce system power consumption. It usually works so that
 * the firmware creates a checksum of the beacon but omits all constantly
 * changing elements (TSF, TIM etc). Whenever the checksum changes the
 * beacon is forwarded to the host, otherwise it will be just dropped. That
 * way the host will only receive beacons where some relevant information
 * (for example ERP protection or WMM settings) has changed.
 *
 * Beacon filter support is advertised with the %IEEE80211_VIF_BEACON_FILTER
 * interface capability. The driver needs to enable beacon filter support
 * whenever power save is enabled, that is %IEEE80211_CONF_PS is set. When
 * power save is enabled, the stack will not check for beacon loss and the
 * driver needs to notify about loss of beacons with ieee80211_beacon_loss().
 *
 * The time (or number of beacons missed) until the firmware notifies the
 * driver of a beacon loss event (which in turn causes the driver to call
 * ieee80211_beacon_loss()) should be configurable and will be controlled
 * by mac80211 and the roaming algorithm in the future.
 *
 * Since there may be constantly changing information elements that nothing
 * in the software stack cares about, we will, in the future, have mac80211
 * tell the driver which information elements are interesting in the sense
 * that we want to see changes in them. This will include
 *
 *  - a list of information element IDs
 *  - a list of OUIs for the vendor information element
 *
 * Ideally, the hardware would filter out any beacons without changes in the
 * requested elements, but if it cannot support that it may, at the expense
 * of some efficiency, filter out only a subset. For example, if the device
 * doesn't support checking for OUIs it should pass up all changes in all
 * vendor information elements.
 *
 * Note that change, for the sake of simplification, also includes information
 * elements appearing or disappearing from the beacon.
 *
 * Some hardware supports an "ignore list" instead, just make sure nothing
 * that was requested is on the ignore list, and include commonly changing
 * information element IDs in the ignore list, for example 11 (BSS load) and
 * the various vendor-assigned IEs with unknown contents (128, 129, 133-136,
 * 149, 150, 155, 156, 173, 176, 178, 179, 219); for forward compatibility
 * it could also include some currently unused IDs.
 *
 *
 * In addition to these capabilities, hardware should support notifying the
 * host of changes in the beacon RSSI. This is relevant to implement roaming
 * when no traffic is flowing (when traffic is flowing we see the RSSI of
 * the received data packets). This can consist in notifying the host when
 * the RSSI changes significantly or when it drops below or rises above
 * configurable thresholds. In the future these thresholds will also be
 * configured by mac80211 (which gets them from userspace) to implement
 * them as the roaming algorithm requires.
 *
 * If the hardware cannot implement this, the driver should ask it to
 * periodically pass beacon frames to the host so that software can do the
 * signal strength threshold checking.
 */

/**
 * DOC: Spatial multiplexing power save
 *
 * SMPS (Spatial multiplexing power save) is a mechanism to conserve
 * power in an 802.11n implementation. For details on the mechanism
 * and rationale, please refer to 802.11 (as amended by 802.11n-2009)
 * "11.2.3 SM power save".
 *
 * The mac80211 implementation is capable of sending action frames
 * to update the AP about the station's SMPS mode, and will instruct
 * the driver to enter the specific mode. It will also announce the
 * requested SMPS mode during the association handshake. Hardware
 * support for this feature is required, and can be indicated by
 * hardware flags.
 *
 * The default mode will be "automatic", which nl80211/cfg80211
 * defines to be dynamic SMPS in (regular) powersave, and SMPS
 * turned off otherwise.
 *
 * To support this feature, the driver must set the appropriate
 * hardware support flags, and handle the SMPS flag to the config()
 * operation. It will then with this mechanism be instructed to
 * enter the requested SMPS mode while associated to an HT AP.
 */

/**
 * DOC: Frame filtering
 *
 * mac80211 requires to see many management frames for proper
 * operation, and users may want to see many more frames when
 * in monitor mode. However, for best CPU usage and power consumption,
 * having as few frames as possible percolate through the stack is
 * desirable. Hence, the hardware should filter as much as possible.
 *
 * To achieve this, mac80211 uses filter flags (see below) to tell
 * the driver's configure_filter() function which frames should be
 * passed to mac80211 and which should be filtered out.
 *
 * Before configure_filter() is invoked, the prepare_multicast()
 * callback is invoked with the parameters @mc_count and @mc_list
 * for the combined multicast address list of all virtual interfaces.
 * Its use is optional, and it returns a u64 that is passed to
 * configure_filter(). Additionally, configure_filter() has the
 * arguments @changed_flags telling which flags were changed and
 * @total_flags with the new flag states.
 *
 * If your device has no multicast address filters your driver will
 * need to check both the %FIF_ALLMULTI flag and the @mc_count
 * parameter to see whether multicast frames should be accepted
 * or dropped.
 *
 * All unsupported flags in @total_flags must be cleared.
 * Hardware does not support a flag if it is incapable of _passing_
 * the frame to the stack. Otherwise the driver must ignore
 * the flag, but not clear it.
 * You must _only_ clear the flag (announce no support for the
 * flag to mac80211) if you are not able to pass the packet type
 * to the stack (so the hardware always filters it).
 * So for example, you should clear @FIF_CONTROL, if your hardware
 * always filters control frames. If your hardware always passes
 * control frames to the kernel and is incapable of filtering them,
 * you do _not_ clear the @FIF_CONTROL flag.
 * This rule applies to all other FIF flags as well.
 */

/**
 * DOC: AP support for powersaving clients
 *
 * In order to implement AP and P2P GO modes, mac80211 has support for
 * client powersaving, both "legacy" PS (PS-Poll/null data) and uAPSD.
 * There currently is no support for sAPSD.
 *
 * There is one assumption that mac80211 makes, namely that a client
 * will not poll with PS-Poll and trigger with uAPSD at the same time.
 * Both are supported, and both can be used by the same client, but
 * they can't be used concurrently by the same client. This simplifies
 * the driver code.
 *
 * The first thing to keep in mind is that there is a flag for complete
 * driver implementation: %IEEE80211_HW_AP_LINK_PS. If this flag is set,
 * mac80211 expects the driver to handle most of the state machine for
 * powersaving clients and will ignore the PM bit in incoming frames.
 * Drivers then use ieee80211_sta_ps_transition() to inform mac80211 of
 * stations' powersave transitions. In this mode, mac80211 also doesn't
 * handle PS-Poll/uAPSD.
 *
 * In the mode without %IEEE80211_HW_AP_LINK_PS, mac80211 will check the
 * PM bit in incoming frames for client powersave transitions. When a
 * station goes to sleep, we will stop transmitting to it. There is,
 * however, a race condition: a station might go to sleep while there is
 * data buffered on hardware queues. If the device has support for this
 * it will reject frames, and the driver should give the frames back to
 * mac80211 with the %IEEE80211_TX_STAT_TX_FILTERED flag set which will
 * cause mac80211 to retry the frame when the station wakes up. The
 * driver is also notified of powersave transitions by calling its
 * @sta_notify callback.
 *
 * When the station is asleep, it has three choices: it can wake up,
 * it can PS-Poll, or it can possibly start a uAPSD service period.
 * Waking up is implemented by simply transmitting all buffered (and
 * filtered) frames to the station. This is the easiest case. When
 * the station sends a PS-Poll or a uAPSD trigger frame, mac80211
 * will inform the driver of this with the @allow_buffered_frames
 * callback; this callback is optional. mac80211 will then transmit
 * the frames as usual and set the %IEEE80211_TX_CTL_NO_PS_BUFFER
 * on each frame. The last frame in the service period (or the only
 * response to a PS-Poll) also has %IEEE80211_TX_STATUS_EOSP set to
 * indicate that it ends the service period; as this frame must have
 * TX status report it also sets %IEEE80211_TX_CTL_REQ_TX_STATUS.
 * When TX status is reported for this frame, the service period is
 * marked as having ended and a new one can be started by the peer.
 *
 * Additionally, non-bufferable MMPDUs can also be transmitted by
 * mac80211 with the %IEEE80211_TX_CTL_NO_PS_BUFFER set in them.
 *
 * Another race condition can happen on some devices like iwlwifi
 * when there are frames queued for the station and it wakes up
 * or polls; the frames that are already queued could end up being
 * transmitted first instead, causing reordering and/or wrong
 * processing of the EOSP. The cause is that allowing frames to be
 * transmitted to a certain station is out-of-band communication to
 * the device. To allow this problem to be solved, the driver can
 * call ieee80211_sta_block_awake() if frames are buffered when it
 * is notified that the station went to sleep. When all these frames
 * have been filtered (see above), it must call the function again
 * to indicate that the station is no longer blocked.
 *
 * If the driver buffers frames in the driver for aggregation in any
 * way, it must use the ieee80211_sta_set_buffered() call when it is
 * notified of the station going to sleep to inform mac80211 of any
 * TIDs that have frames buffered. Note that when a station wakes up
 * this information is reset (hence the requirement to call it when
 * informed of the station going to sleep). Then, when a service
 * period starts for any reason, @release_buffered_frames is called
 * with the number of frames to be released and which TIDs they are
 * to come from. In this case, the driver is responsible for setting
 * the EOSP (for uAPSD) and MORE_DATA bits in the released frames.
 * To help, the @more_data parameter is passed to tell the driver if
 * there is more data on other TIDs -- the TIDs to release frames
 * from are ignored since mac80211 doesn't know how many frames the
 * buffers for those TIDs contain.
 *
 * If the driver also implements GO mode, where absence periods may
 * shorten service periods (or abort PS-Poll responses), it must
 * filter those response frames except in the case of frames that
 * are buffered in the driver -- those must remain buffered to avoid
 * reordering. Because it is possible that no frames are released
 * in this case, the driver must call ieee80211_sta_eosp()
 * to indicate to mac80211 that the service period ended anyway.
 *
 * Finally, if frames from multiple TIDs are released from mac80211
 * but the driver might reorder them, it must clear & set the flags
 * appropriately (only the last frame may have %IEEE80211_TX_STATUS_EOSP)
 * and also take care of the EOSP and MORE_DATA bits in the frame.
 * The driver may also use ieee80211_sta_eosp() in this case.
 *
 * Note that if the driver ever buffers frames other than QoS-data
 * frames, it must take care to never send a non-QoS-data frame as
 * the last frame in a service period, adding a QoS-nulldata frame
 * after a non-QoS-data frame if needed.
 */

/**
 * DOC: HW queue control
 *
 * Before HW queue control was introduced, mac80211 only had a single static
 * assignment of per-interface AC software queues to hardware queues. This
 * was problematic for a few reasons:
 * 1) off-channel transmissions might get stuck behind other frames
 * 2) multiple virtual interfaces couldn't be handled correctly
 * 3) after-DTIM frames could get stuck behind other frames
 *
 * To solve this, hardware typically uses multiple different queues for all
 * the different usages, and this needs to be propagated into mac80211 so it
 * won't have the same problem with the software queues.
 *
 * Therefore, mac80211 now offers the %IEEE80211_HW_QUEUE_CONTROL capability
 * flag that tells it that the driver implements its own queue control. To do
 * so, the driver will set up the various queues in each &struct ieee80211_vif
 * and the offchannel queue in &struct ieee80211_hw. In response, mac80211 will
 * use those queue IDs in the hw_queue field of &struct ieee80211_tx_info and
 * if necessary will queue the frame on the right software queue that mirrors
 * the hardware queue.
 * Additionally, the driver has to then use these HW queue IDs for the queue
 * management functions (ieee80211_stop_queue() et al.)
 *
 * The driver is free to set up the queue mappings as needed, multiple virtual
 * interfaces may map to the same hardware queues if needed. The setup has to
 * happen during add_interface or change_interface callbacks. For example, a
 * driver supporting station+station and station+AP modes might decide to have
 * 10 hardware queues to handle different scenarios:
 *
 * 4 AC HW queues for 1st vif: 0, 1, 2, 3
 * 4 AC HW queues for 2nd vif: 4, 5, 6, 7
 * after-DTIM queue for AP:   8
 * off-channel queue:         9
 *
 * It would then set up the hardware like this:
 *   hw.offchannel_tx_hw_queue = 9
 *
 * and the first virtual interface that is added as follows:
 *   vif.hw_queue[IEEE80211_AC_VO] = 0
 *   vif.hw_queue[IEEE80211_AC_VI] = 1
 *   vif.hw_queue[IEEE80211_AC_BE] = 2
 *   vif.hw_queue[IEEE80211_AC_BK] = 3
 *   vif.cab_queue = 8 // if AP mode, otherwise %IEEE80211_INVAL_HW_QUEUE
 * and the second virtual interface with 4-7.
 *
 * If queue 6 gets full, for example, mac80211 would only stop the second
 * virtual interface's BE queue since virtual interface queues are per AC.
 *
 * Note that the vif.cab_queue value should be set to %IEEE80211_INVAL_HW_QUEUE
 * whenever the queue is not used (i.e. the interface is not in AP mode) if the
 * queue could potentially be shared since mac80211 will look at cab_queue when
 * a queue is stopped/woken even if the interface is not in AP mode.
 */

/**
 * enum ieee80211_filter_flags - hardware filter flags
 *
 * These flags determine what the filter in hardware should be
 * programmed to let through and what should not be passed to the
 * stack. It is always safe to pass more frames than requested,
 * but this has negative impact on power consumption.
 *
 * @FIF_ALLMULTI: pass all multicast frames, this is used if requested
 *        by the user or if the hardware is not capable of filtering by
 *        multicast address.
 *
 * @FIF_FCSFAIL: pass frames with failed FCS (but you need to set the
 *        %RX_FLAG_FAILED_FCS_CRC for them)
 *
 * @FIF_PLCPFAIL: pass frames with failed PLCP CRC (but you need to set
 *        the %RX_FLAG_FAILED_PLCP_CRC for them)
 *
 * @FIF_BCN_PRBRESP_PROMISC: This flag is set during scanning to indicate
 *        to the hardware that it should not filter beacons or probe responses
 *        by BSSID. Filtering them can greatly reduce the amount of processing
 *        mac80211 needs to do and the amount of CPU wakeups, so you should
 *        honour this flag if possible.
 *
 * @FIF_CONTROL: pass control frames (except for PS Poll) addressed to this
 *        station
 *
 * @FIF_OTHER_BSS: pass frames destined to other BSSes
 *
 * @FIF_PSPOLL: pass PS Poll frames
 *
 * @FIF_PROBE_REQ: pass probe request frames
 *
 * @FIF_MCAST_ACTION: pass multicast Action frames
 */
enum ieee80211_filter_flags {
        /* One bit per flag; bit 0 is not assigned to any flag here. */
        FIF_ALLMULTI            = 1 << 1,
        FIF_FCSFAIL             = 1 << 2,
        FIF_PLCPFAIL            = 1 << 3,
        FIF_BCN_PRBRESP_PROMISC = 1 << 4,
        FIF_CONTROL             = 1 << 5,
        FIF_OTHER_BSS           = 1 << 6,
        FIF_PSPOLL              = 1 << 7,
        FIF_PROBE_REQ           = 1 << 8,
        FIF_MCAST_ACTION        = 1 << 9,
};

/**
 * enum ieee80211_ampdu_mlme_action - A-MPDU actions
 *
 * These flags are used with the ampdu_action() callback in
 * &struct ieee80211_ops to indicate which action is needed.
 *
 * Note that drivers MUST be able to deal with a TX aggregation
 * session being stopped even before they OK'ed starting it by
 * calling ieee80211_start_tx_ba_cb_irqsafe, because the peer
 * might receive the addBA frame and send a delBA right away!
 *
 * @IEEE80211_AMPDU_RX_START: start RX aggregation
 * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation
 * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either
 *        call ieee80211_start_tx_ba_cb_irqsafe(), or return the special
 *        status %IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay the addBA
 *        until after ieee80211_start_tx_ba_cb_irqsafe() is called, or just
 *        return the special status %IEEE80211_AMPDU_TX_START_IMMEDIATE.
 * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational
 * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting
 *        queued packets, now unaggregated. After all packets are transmitted the
 *        driver has to call ieee80211_stop_tx_ba_cb_irqsafe().
 * @IEEE80211_AMPDU_TX_STOP_FLUSH: stop TX aggregation and flush all packets,
 *        called when the station is removed. There's no need or reason to call
 *        ieee80211_stop_tx_ba_cb_irqsafe() in this case as mac80211 assumes the
 *        session is gone and removes the station.
 * @IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: called when TX aggregation is stopped
 *        but the driver hasn't called ieee80211_stop_tx_ba_cb_irqsafe() yet and
 *        now the connection is dropped and the station will be removed. Drivers
 *        should clean up and drop remaining packets when this is called.
 */
enum ieee80211_ampdu_mlme_action {
        /* Numbering is spelled out; it matches the implicit sequence. */
        IEEE80211_AMPDU_RX_START           = 0,
        IEEE80211_AMPDU_RX_STOP            = 1,
        IEEE80211_AMPDU_TX_START           = 2,
        IEEE80211_AMPDU_TX_STOP_CONT       = 3,
        IEEE80211_AMPDU_TX_STOP_FLUSH      = 4,
        IEEE80211_AMPDU_TX_STOP_FLUSH_CONT = 5,
        IEEE80211_AMPDU_TX_OPERATIONAL     = 6,
};

/*
 * Special status values referred to by the %IEEE80211_AMPDU_TX_START
 * description in the kernel-doc of enum ieee80211_ampdu_mlme_action.
 */
#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1
#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2

/**
 * struct ieee80211_ampdu_params - AMPDU action parameters
 *
 * @action: the ampdu action, value from &enum ieee80211_ampdu_mlme_action.
 * @sta: peer of this AMPDU session
 * @tid: tid of the BA session
 * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When
 *        action is set to %IEEE80211_AMPDU_RX_START the driver passes back the
 *        actual ssn value used to start the session and writes the value here.
 * @buf_size: reorder buffer size (number of subframes). Valid only when the
 *        action is set to %IEEE80211_AMPDU_RX_START or
 *        %IEEE80211_AMPDU_TX_OPERATIONAL
 * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU.
 *        Valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL
 * @timeout: BA session timeout. Valid only when the action is set to
 *        %IEEE80211_AMPDU_RX_START
 */
struct ieee80211_ampdu_params {
        enum ieee80211_ampdu_mlme_action action;
        struct ieee80211_sta *sta;
        u16 tid;
        u16 ssn;
        u16 buf_size;
        bool amsdu;
        u16 timeout;
};

/**
 * enum ieee80211_frame_release_type - frame release reason
 * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll
 * @IEEE80211_FRAME_RELEASE_UAPSD: frame(s) released due to
 *        frame received on trigger-enabled AC
 */
enum ieee80211_frame_release_type {
        IEEE80211_FRAME_RELEASE_PSPOLL = 0,
        IEEE80211_FRAME_RELEASE_UAPSD  = 1,
};

/**
 * enum ieee80211_rate_control_changed - flags to indicate what changed
 *
 * @IEEE80211_RC_BW_CHANGED: The bandwidth that can be used to transmit
 *        to this station changed. The actual bandwidth is in the station
 *        information -- for HT20/40 the IEEE80211_HT_CAP_SUP_WIDTH_20_40
 *        flag changes, for HT and VHT the bandwidth field changes.
 * @IEEE80211_RC_SMPS_CHANGED: The SMPS state of the station changed.
 * @IEEE80211_RC_SUPP_RATES_CHANGED: The supported rate set of this peer
 *        changed (in IBSS mode) due to discovering more information about
 *        the peer.
 * @IEEE80211_RC_NSS_CHANGED: N_SS (number of spatial streams) was changed
 *        by the peer
 */
enum ieee80211_rate_control_changed {
        /* Each flag is built with BIT() on its own position, 0 through 3. */
        IEEE80211_RC_BW_CHANGED                = BIT(0),
        IEEE80211_RC_SMPS_CHANGED        = BIT(1),
        IEEE80211_RC_SUPP_RATES_CHANGED        = BIT(2),
        IEEE80211_RC_NSS_CHANGED        = BIT(3),
};

/**
 * enum ieee80211_roc_type - remain on channel type
 *
 * With the support for multi channel contexts and multi channel operations,
 * remain on channel operations might be limited/deferred/aborted by other
 * flows/operations which have higher priority (and vice versa).
 * Specifying the ROC type can be used by devices to prioritize the ROC
 * operations compared to other operations/flows.
 *
 * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC.
 * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required
 *        for sending management frames offchannel.
 */
enum ieee80211_roc_type {
        IEEE80211_ROC_TYPE_NORMAL  = 0,
        IEEE80211_ROC_TYPE_MGMT_TX = 1,
};

/**
 * enum ieee80211_reconfig_type - reconfig type
 *
 * The reconfig_complete() callback uses this enum to indicate which type
 * of reconfiguration was completed.
 *
 * @IEEE80211_RECONFIG_TYPE_RESTART: hw restart type
 *        (also due to resume() callback returning 1)
 * @IEEE80211_RECONFIG_TYPE_SUSPEND: suspend type (regardless
 *        of wowlan configuration)
 */
enum ieee80211_reconfig_type {
        IEEE80211_RECONFIG_TYPE_RESTART = 0,
        IEEE80211_RECONFIG_TYPE_SUSPEND = 1,
};

/**
 * struct ieee80211_ops - callbacks from mac80211 to the driver
 *
 * This structure contains various callbacks that the driver may
 * handle or, in some cases, must handle, for example to configure
 * the hardware to a new channel or to transmit a frame.
 *
 * @tx: Handler that 802.11 module calls for each transmitted frame.
 *        skb contains the buffer starting from the IEEE 802.11 header.
 *        The low-level driver should send the frame out based on
 *        configuration in the TX control data. This handler should,
 *        preferably, never fail and stop queues appropriately.
 *        Must be atomic.
 *
 * @start: Called before the first netdevice attached to the hardware
 *        is enabled. This should turn on the hardware and must turn on
 *        frame reception (for possibly enabled monitor interfaces.)
 *        Returns negative error codes, these may be seen in userspace,
 *        or zero.
 *        When the device is started it should not have a MAC address
 *        to avoid acknowledging frames before a non-monitor device
 *        is added.
 *        Must be implemented and can sleep.
 *
 * @stop: Called after last netdevice attached to the hardware
 *        is disabled. This should turn off the hardware (at least
 *        it must turn off frame reception.)
 *        May be called right after add_interface if that rejects
 *        an interface. If you added any work onto the mac80211 workqueue
 *        you should ensure to cancel it on this callback.
 *        Must be implemented and can sleep.
 *
 * @suspend: Suspend the device; mac80211 itself will quiesce before and
 *        stop transmitting and doing any other configuration, and then
 *        ask the device to suspend. This is only invoked when WoWLAN is
 *        configured, otherwise the device is deconfigured completely and
 *        reconfigured at resume time.
 *        The driver may also impose special conditions under which it
 *        wants to use the "normal" suspend (deconfigure), say if it only
 *        supports WoWLAN when the device is associated. In this case, it
 *        must return 1 from this function.
 *
 * @resume: If WoWLAN was configured, this indicates that mac80211 is
 *        now resuming its operation, after this the device must be fully
 *        functional again. If this returns an error, the only way out is
 *        to also unregister the device. If it returns 1, then mac80211
 *        will also go through the regular complete restart on resume.
 *
 * @set_wakeup: Enable or disable wakeup when WoWLAN configuration is
 *        modified. The reason is that device_set_wakeup_enable() is
 *        supposed to be called when the configuration changes, not only
 *        in suspend().
 *
 * @add_interface: Called when a netdevice attached to the hardware is
 *        enabled. Because it is not called for monitor mode devices, @start
 *        and @stop must be implemented.
 *        The driver should perform any initialization it needs before
 *        the device can be enabled. The initial configuration for the
 *        interface is given in the conf parameter.
 *        The callback may refuse to add an interface by returning a
 *        negative error code (which will be seen in userspace.)
 *        Must be implemented and can sleep.
 *
 * @change_interface: Called when a netdevice changes type. This callback
 *        is optional, but only if it is supported can interface types be
 *        switched while the interface is UP. The callback may sleep.
 *        Note that while an interface is being switched, it will not be
 *        found by the interface iteration callbacks.
 *
 * @remove_interface: Notifies a driver that an interface is going down.
 *        The @stop callback is called after this if it is the last interface
 *        and no monitor interfaces are present.
 *        When all interfaces are removed, the MAC address in the hardware
 *        must be cleared so the device no longer acknowledges packets,
 *        the mac_addr member of the conf structure is, however, set to the
 *        MAC address of the device going away.
 *        Hence, this callback must be implemented. It can sleep.
 *
 * @config: Handler for configuration requests. IEEE 802.11 code calls this
 *        function to change hardware configuration, e.g., channel.
 *        This function should never fail but returns a negative error code
 *        if it does. The callback can sleep.
 *
 * @bss_info_changed: Handler for configuration requests related to BSS
 *        parameters that may vary during BSS's lifespan, and may affect low
 *        level driver (e.g. assoc/disassoc status, erp parameters).
 *        This function should not be used if no BSS has been set, unless
 *        for association indication. The @changed parameter indicates which
 *        of the bss parameters has changed when a call is made. The callback
 *        can sleep.
 *
 * @prepare_multicast: Prepare for multicast filter configuration.
 *        This callback is optional, and its return value is passed
 *        to configure_filter(). This callback must be atomic.
 *
 * @configure_filter: Configure the device's RX filter.
 *        See the section "Frame filtering" for more information.
 *        This callback must be implemented and can sleep.
 *
 * @config_iface_filter: Configure the interface's RX filter.
 *        This callback is optional and is used to configure which frames
 *        should be passed to mac80211. The filter_flags is the combination
 *        of FIF_* flags. The changed_flags is a bit mask that indicates
 *        which flags are changed.
 *        This callback can sleep.
 *
 * @set_tim: Set TIM bit. mac80211 calls this function when a TIM bit
 *         must be set or cleared for a given STA. Must be atomic.
 *
 * @set_key: See the section "Hardware crypto acceleration"
 *        This callback is only called between add_interface and
 *        remove_interface calls, i.e. while the given virtual interface
 *        is enabled.
 *        Returns a negative error code if the key can't be added.
 *        The callback can sleep.
 *
 * @update_tkip_key: See the section "Hardware crypto acceleration"
 *         This callback will be called in the context of Rx. Called for drivers
 *         which set IEEE80211_KEY_FLAG_TKIP_REQ_RX_P1_KEY.
 *        The callback must be atomic.
 *
 * @set_rekey_data: If the device supports GTK rekeying, for example while the
 *        host is suspended, it can assign this callback to retrieve the data
 *        necessary to do GTK rekeying, this is the KEK, KCK and replay counter.
 *        After rekeying was done it should (for example during resume) notify
 *        userspace of the new replay counter using ieee80211_gtk_rekey_notify().
 *
 * @set_default_unicast_key: Set the default (unicast) key index, useful for
 *        WEP when the device sends data packets autonomously, e.g. for ARP
 *        offloading. The index can be 0-3, or -1 for unsetting it.
 *
 * @hw_scan: Ask the hardware to service the scan request, no need to start
 *        the scan state machine in stack. The scan must honour the channel
 *        configuration done by the regulatory agent in the wiphy's
 *        registered bands. The hardware (or the driver) needs to make sure
 *        that power save is disabled.
 *        The @req ie/ie_len members are rewritten by mac80211 to contain the
 *        entire IEs after the SSID, so that drivers need not look at these
 *        at all but just send them after the SSID -- mac80211 includes the
 *        (extended) supported rates and HT information (where applicable).
 *        When the scan finishes, ieee80211_scan_completed() must be called;
 *        note that it also must be called when the scan cannot finish due to
 *        any error unless this callback returned a negative error code.
 *        This callback is also allowed to return the special return value 1,
 *        this indicates that hardware scan isn't desirable right now and a
 *        software scan should be done instead. A driver wishing to use this
 *        capability must ensure its (hardware) scan capabilities aren't
 *        advertised as more capable than mac80211's software scan is.
 *        The callback can sleep.
 *
 * @cancel_hw_scan: Ask the low-level driver to cancel the active hw scan.
 *        The driver should ask the hardware to cancel the scan (if possible),
 *        but the scan will be completed only after the driver will call
 *        ieee80211_scan_completed().
 *        This callback is needed for wowlan, to prevent enqueueing a new
 *        scan_work after the low-level driver was already suspended.
 *        The callback can sleep.
 *
 * @sched_scan_start: Ask the hardware to start scanning repeatedly at
 *        specific intervals.  The driver must call the
 *        ieee80211_sched_scan_results() function whenever it finds results.
 *        This process will continue until sched_scan_stop is called.
 *
 * @sched_scan_stop: Tell the hardware to stop an ongoing scheduled scan.
 *        In this case, ieee80211_sched_scan_stopped() must not be called.
 *
 * @sw_scan_start: Notifier function that is called just before a software scan
 *        is started. Can be NULL, if the driver doesn't need this notification.
 *        The mac_addr parameter allows supporting NL80211_SCAN_FLAG_RANDOM_ADDR,
 *        the driver may set the NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR flag if it
 *        can use this parameter. The callback can sleep.
 *
 * @sw_scan_complete: Notifier function that is called just after a
 *        software scan finished. Can be NULL, if the driver doesn't need
 *        this notification.
 *        The callback can sleep.
 *
 * @get_stats: Return low-level statistics.
 *         Returns zero if statistics are available.
 *        The callback can sleep.
 *
 * @get_key_seq: If your device implements encryption in hardware and does
 *        IV/PN assignment then this callback should be provided to read the
 *        IV/PN for the given key from hardware.
 *        The callback must be atomic.
 *
 * @set_frag_threshold: Configuration of fragmentation threshold. Assign this
 *        if the device does fragmentation by itself. Note that to prevent the
 *        stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG
 *        should be set as well.
 *        The callback can sleep.
 *
 * @set_rts_threshold: Configuration of RTS threshold (if device needs it)
 *        The callback can sleep.
 *
 * @sta_add: Notifies low level driver about addition of an associated station,
 *        AP, IBSS/WDS/mesh peer etc. This callback can sleep.
 *
 * @sta_remove: Notifies low level driver about removal of an associated
 *        station, AP, IBSS/WDS/mesh peer etc. Note that after the callback
 *        returns it isn't safe to use the pointer, not even RCU protected;
 *        no RCU grace period is guaranteed between returning here and freeing
 *        the station. See @sta_pre_rcu_remove if needed.
 *        This callback can sleep.
 *
 * @sta_add_debugfs: Drivers can use this callback to add debugfs files
 *        when a station is added to mac80211's station list. This callback
 *        should be within a CONFIG_MAC80211_DEBUGFS conditional. This
 *        callback can sleep.
 *
 * @sta_notify: Notifies low level driver about power state transition of an
 *        associated station, AP,  IBSS/WDS/mesh peer etc. For a VIF operating
 *        in AP mode, this callback will not be called when the flag
 *        %IEEE80211_HW_AP_LINK_PS is set. Must be atomic.
 *
 * @sta_set_txpwr: Configure the station tx power. This callback sets the tx
 *        power for the station.
 *        This callback can sleep.
 *
 * @sta_state: Notifies low level driver about state transition of a
 *        station (which can be the AP, a client, IBSS/WDS/mesh peer etc.)
 *        This callback is mutually exclusive with @sta_add/@sta_remove.
 *        It must not fail for down transitions but may fail for transitions
 *        up the list of states. Also note that after the callback returns it
 *        isn't safe to use the pointer, not even RCU protected - no RCU grace
 *        period is guaranteed between returning here and freeing the station.
 *        See @sta_pre_rcu_remove if needed.
 *        The callback can sleep.
 *
 * @sta_pre_rcu_remove: Notify driver about station removal before RCU
 *        synchronisation. This is useful if a driver needs to have station
 *        pointers protected using RCU, it can then use this call to clear
 *        the pointers instead of waiting for an RCU grace period to elapse
 *        in @sta_state.
 *        The callback can sleep.
 *
 * @sta_rc_update: Notifies the driver of changes to the bitrates that can be
 *        used to transmit to the station. The changes are advertised with bits
 *        from &enum ieee80211_rate_control_changed and the values are reflected
 *        in the station data. This callback should only be used when the driver
 *        uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since
 *        otherwise the rate control algorithm is notified directly.
 *        Must be atomic.
 * @sta_rate_tbl_update: Notifies the driver that the rate table changed. This
 *        is only used if the configured rate control algorithm actually uses
 *        the new rate table API, and is therefore optional. Must be atomic.
 *
 * @sta_statistics: Get statistics for this station. For example with beacon
 *        filtering, the statistics kept by mac80211 might not be accurate, so
 *        let the driver pre-fill the statistics. The driver can fill most of
 *        the values (indicating which by setting the filled bitmap), but not
 *        all of them make sense - see the source for which ones are possible.
 *        Statistics that the driver doesn't fill will be filled by mac80211.
 *        The callback can sleep.
 *
 * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max),
 *        bursting) for a hardware TX queue.
 *        Returns a negative error code on failure.
 *        The callback can sleep.
 *
 * @get_tsf: Get the current TSF timer value from firmware/hardware. Currently,
 *        this is only used for IBSS mode BSSID merging and debugging. Is not a
 *        required function.
 *        The callback can sleep.
 *
 * @set_tsf: Set the TSF timer to the specified value in the firmware/hardware.
 *        Currently, this is only used for IBSS mode debugging. Is not a
 *        required function.
 *        The callback can sleep.
 *
 * @offset_tsf: Offset the TSF timer by the specified value in the
 *        firmware/hardware.  Preferred to set_tsf as it avoids delay between
 *        calling set_tsf() and hardware getting programmed, which will show up
 *        as TSF delay. Is not a required function.
 *        The callback can sleep.
 *
 * @reset_tsf: Reset the TSF timer and allow firmware/hardware to synchronize
 *        with other STAs in the IBSS. This is only used in IBSS mode. This
 *        function is optional if the firmware/hardware takes full care of
 *        TSF synchronization.
 *        The callback can sleep.
 *
 * @tx_last_beacon: Determine whether the last IBSS beacon was sent by us.
 *        This is needed only for IBSS mode and the result of this function is
 *        used to determine whether to reply to Probe Requests.
 *        Returns non-zero if this device sent the last beacon.
 *        The callback can sleep.
 *
 * @get_survey: Return per-channel survey information
 *
 * @rfkill_poll: Poll rfkill hardware state. If you need this, you also
 *        need to set wiphy->rfkill_poll to %true before registration,
 *        and need to call wiphy_rfkill_set_hw_state() in the callback.
 *        The callback can sleep.
 *
 * @set_coverage_class: Set slot time for given coverage class as specified
 *        in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout
 *        accordingly; a coverage class equal to -1 enables the ACK timeout
 *        estimation algorithm (dynack). To disable dynack, set a valid value
 *        for the coverage class. This callback is not required and may sleep.
 *
 * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may
 *        be %NULL. The callback can sleep.
 * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep.
 *
 * @flush: Flush all pending frames from the hardware queue, making sure
 *        that the hardware queues are empty. The @queues parameter is a bitmap
 *        of queues to flush, which is useful if different virtual interfaces
 *        use different hardware queues; it may also indicate all queues.
 *        If the parameter @drop is set to %true, pending frames may be dropped.
 *        Note that vif can be NULL.
 *        The callback can sleep.
 *
 * @channel_switch: Drivers that need (or want) to offload the channel
 *        switch operation for CSAs received from the AP may implement this
 *        callback. They must then call ieee80211_chswitch_done() to indicate
 *        completion of the channel switch.
 *
 * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device.
 *        Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may
 *        reject TX/RX mask combinations they cannot support by returning -EINVAL
 *        (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX).
 *
 * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant).
 *
 * @remain_on_channel: Starts an off-channel period on the given channel, must
 *        call back to ieee80211_ready_on_channel() when on that channel. Note
 *        that normal channel traffic is not stopped as this is intended for hw
 *        offload. Frames to transmit on the off-channel channel are transmitted
 *        normally except for the %IEEE80211_TX_CTL_TX_OFFCHAN flag. When the
 *        duration (which will always be non-zero) expires, the driver must call
 *        ieee80211_remain_on_channel_expired().
 *        Note that this callback may be called while the device is in IDLE and
 *        must be accepted in this case.
 *        This callback may sleep.
 * @cancel_remain_on_channel: Requests that an ongoing off-channel period is
 *        aborted before it expires. This callback may sleep.
 *
 * @set_ringparam: Set tx and rx ring sizes.
 *
 * @get_ringparam: Get tx and rx ring current and maximum sizes.
 *
 * @tx_frames_pending: Check if there is any pending frame in the hardware
 *        queues before entering power save.
 *
 * @set_bitrate_mask: Set a mask of rates to be used for rate control selection
 *        when transmitting a frame. Currently only legacy rates are handled.
 *        The callback can sleep.
 * @event_callback: Notify driver about any event in mac80211. See
 *        &enum ieee80211_event_type for the different types.
 *        The callback must be atomic.
 *
 * @release_buffered_frames: Release buffered frames according to the given
 *        parameters. In the case where the driver buffers some frames for
 *        sleeping stations mac80211 will use this callback to tell the driver
 *        to release some frames, either for PS-poll or uAPSD.
 *        Note that if the @more_data parameter is %false the driver must check
 *        if there are more frames on the given TIDs, and if there are more than
 *        the frames being released then it must still set the more-data bit in
 *        the frame. If the @more_data parameter is %true, then of course the
 *        more-data bit must always be set.
 *        The @tids parameter tells the driver which TIDs to release frames
 *        from, for PS-poll it will always have only a single bit set.
 *        In the case this is used for a PS-poll initiated release, the
 *        @num_frames parameter will always be 1 so code can be shared. In
 *        this case the driver must also set %IEEE80211_TX_STATUS_EOSP flag
 *        on the TX status (and must report TX status) so that the PS-poll
 *        period is properly ended. This is used to avoid sending multiple
 *        responses for a retried PS-poll frame.
 *        In the case this is used for uAPSD, the @num_frames parameter may be
 *        bigger than one, but the driver may send fewer frames (it must send
 *        at least one, however). In this case it is also responsible for
 *        setting the EOSP flag in the QoS header of the frames. Also, when the
 *        service period ends, the driver must set %IEEE80211_TX_STATUS_EOSP
 *        on the last frame in the SP. Alternatively, it may call the function
 *        ieee80211_sta_eosp() to inform mac80211 of the end of the SP.
 *        This callback must be atomic.
 * @allow_buffered_frames: Prepare device to allow the given number of frames
 *        to go out to the given station. The frames will be sent by mac80211
 *        via the usual TX path after this call. The TX information for frames
 *        released will also have the %IEEE80211_TX_CTL_NO_PS_BUFFER flag set
 *        and the last one will also have %IEEE80211_TX_STATUS_EOSP set. In case
 *        frames from multiple TIDs are released and the driver might reorder
 *        them between the TIDs, it must set the %IEEE80211_TX_STATUS_EOSP flag
 *        on the last frame and clear it on all others and also handle the EOSP
 *        bit in the QoS header correctly. Alternatively, it can also call the
 *        ieee80211_sta_eosp() function.
 *        The @tids parameter is a bitmap and tells the driver which TIDs the
 *        frames will be on; it will at most have two bits set.
 *        This callback must be atomic.
 *
 * @get_et_sset_count:  Ethtool API to get string-set count.
 *
 * @get_et_stats:  Ethtool API to get a set of u64 stats.
 *
 * @get_et_strings:  Ethtool API to get a set of strings to describe stats
 *        and perhaps other supported types of ethtool data-sets.
 *
 * @mgd_prepare_tx: Prepare for transmitting a management frame for association
 *        before associated. In multi-channel scenarios, a virtual interface is
 *        bound to a channel before it is associated, but as it isn't associated
 *        yet it need not necessarily be given airtime, in particular since any
 *        transmission to a P2P GO needs to be synchronized against the GO's
 *        powersave state. mac80211 will call this function before transmitting a
 *        management frame prior to having successfully associated to allow the
 *        driver to give it channel time for the transmission, to get a response
 *        and to be able to synchronize with the GO.
 *        For drivers that set %IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, mac80211
 *        would also call this function before transmitting a deauthentication
 *        frame in case that no beacon was heard from the AP/P2P GO.
 *        The callback will be called before each transmission and upon return
 *        mac80211 will transmit the frame right away.
 *        If duration is greater than zero, mac80211 hints to the driver the
 *        duration for which the operation is requested.
 *        The callback is optional and can (should!) sleep.
 *
 * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending
 *        a TDLS discovery-request, we expect a reply to arrive on the AP's
 *        channel. We must stay on the channel (no PSM, scan, etc.), since a TDLS
 *        setup-response is a direct packet not buffered by the AP.
 *        mac80211 will call this function just before the transmission of a TDLS
 *        discovery-request. The recommended period of protection is at least
 *        2 * (DTIM period).
 *        The callback is optional and can sleep.
 *
 * @add_chanctx: Notifies device driver about new channel context creation.
 *        This callback may sleep.
 * @remove_chanctx: Notifies device driver about channel context destruction.
 *        This callback may sleep.
 * @change_chanctx: Notifies device driver about channel context changes that
 *        may happen when combining different virtual interfaces on the same
 *        channel context with different settings
 *        This callback may sleep.
 * @assign_vif_chanctx: Notifies device driver about channel context being bound
 *        to vif. Possible use is for hw queue remapping.
 *        This callback may sleep.
 * @unassign_vif_chanctx: Notifies device driver about channel context being
 *        unbound from vif.
 *        This callback may sleep.
 * @switch_vif_chanctx: switch a number of vifs from one chanctx to
 *        another, as specified in the list of
 *        &struct ieee80211_vif_chanctx_switch passed to the driver, according
 *        to the mode defined in &enum ieee80211_chanctx_switch_mode.
 *        This callback may sleep.
 *
 * @start_ap: Start operation on the AP interface, this is called after all the
 *        information in bss_conf is set and beacon can be retrieved. A channel
 *        context is bound before this is called. Note that if the driver uses
 *        software scan or ROC, this (and @stop_ap) isn't called when the AP is
 *        just "paused" for scanning/ROC, which is indicated by the beacon being
 *        disabled/enabled via @bss_info_changed.
 * @stop_ap: Stop operation on the AP interface.
 *
 * @reconfig_complete: Called after a call to ieee80211_restart_hw() and
 *        during resume, when the reconfiguration has completed.
 *        This can help the driver implement the reconfiguration step (and
 *        indicate mac80211 is ready to receive frames).
 *        This callback may sleep.
 *
 * @ipv6_addr_change: IPv6 address assignment on the given interface changed.
 *        Currently, this is only called for managed or P2P client interfaces.
 *        This callback is optional; it must not sleep.
 *
 * @channel_switch_beacon: Starts a channel switch to a new channel.
 *        Beacons are modified to include CSA or ECSA IEs before calling this
 *        function. The corresponding count fields in these IEs must be
 *        decremented, and when they reach 1 the driver must call
 *        ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get()
 *        get the csa counter decremented by mac80211, but must check if it is
 *        1 using ieee80211_beacon_counter_is_complete() after the beacon has been
 *        transmitted and then call ieee80211_csa_finish().
 *        If the CSA count starts as zero or 1, this function will not be called,
 *        since there won't be any time to beacon before the switch anyway.
 * @pre_channel_switch: This is an optional callback that is called
 *        before a channel switch procedure is started (i.e. when a STA
 *        gets a CSA or a userspace initiated channel-switch), allowing
 *        the driver to prepare for the channel switch.
 * @post_channel_switch: This is an optional callback that is called
 *        after a channel switch procedure is completed, allowing the
 *        driver to go back to a normal configuration.
 * @abort_channel_switch: This is an optional callback that is called
 *        when channel switch procedure was aborted, allowing the
 *        driver to go back to a normal configuration.
 * @channel_switch_rx_beacon: This is an optional callback that is called
 *        when channel switch procedure is in progress and additional beacon with
 *        CSA IE was received, allowing driver to track changes in count.
 * @join_ibss: Join an IBSS (on an IBSS interface); this is called after all
 *        information in bss_conf is set up and the beacon can be retrieved. A
 *        channel context is bound before this is called.
 * @leave_ibss: Leave the IBSS again.
 *
 * @get_expected_throughput: extract the expected throughput towards the
 *        specified station. The returned value is expressed in Kbps. It returns 0
 *        if the RC algorithm does not have proper data to provide.
 *
 * @get_txpower: get current maximum tx power (in dBm) based on configuration
 *        and hardware limits.
 *
 * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver
 *        is responsible for continually initiating channel-switching operations
 *        and returning to the base channel for communication with the AP. The
 *        driver receives a channel-switch request template and the location of
 *        the switch-timing IE within the template as part of the invocation.
 *        The template is valid only within the call, and the driver can
 *        optionally copy the skb for further re-use.
 * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both
 *        peers must be on the base channel when the call completes.
 * @tdls_recv_channel_switch: a TDLS channel-switch related frame (request or
 *        response) has been received from a remote peer. The driver gets
 *        parameters parsed from the incoming frame and may use them to continue
 *        an ongoing channel-switch operation. In addition, a channel-switch
 *        response template is provided, together with the location of the
 *        switch-timing IE within the template. The skb can only be used within
 *        the function call.
 *
 * @wake_tx_queue: Called when new packets have been added to the queue.
 * @sync_rx_queues: Process all pending frames in RSS queues. This is a
 *        synchronization which is needed in case driver has in its RSS queues
 *        pending frames that were received prior to the control path action
 *        currently taken (e.g. disassociation) but are not processed yet.
 *
 * @start_nan: join an existing NAN cluster, or create a new one.
 * @stop_nan: leave the NAN cluster.
 * @nan_change_conf: change NAN configuration. The data in cfg80211_nan_conf
 *        contains full new configuration and changes specify which parameters
 *        are changed with respect to the last NAN config.
 *        The driver gets both full configuration and the changed parameters since
 *        some devices may need the full configuration while others need only the
 *        changed parameters.
 * @add_nan_func: Add a NAN function. Returns 0 on success. The data in
 *        cfg80211_nan_func must not be referenced outside the scope of
 *        this call.
 * @del_nan_func: Remove a NAN function. The driver must call
 *        ieee80211_nan_func_terminated() with
 *        NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal.
 * @can_aggregate_in_amsdu: Called in order to determine if HW supports
 *        aggregating two specific frames in the same A-MSDU. The relation
 *        between the skbs should be symmetric and transitive. Note that while
 *        skb is always a real frame, head may or may not be an A-MSDU.
 * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
 *        Statistics should be cumulative, currently no way to reset is provided.
 *
 * @start_pmsr: start peer measurement (e.g. FTM) (this call can sleep)
 * @abort_pmsr: abort peer measurement (this call can sleep)
 * @set_tid_config: Apply TID specific configurations. This callback may sleep.
 * @reset_tid_config: Reset TID specific configuration for the peer.
 *        This callback may sleep.
 * @update_vif_offload: Update virtual interface offload flags
 *        This callback may sleep.
 * @sta_set_4addr: Called to notify the driver when a station starts/stops using
 *        4-address mode
 */
struct ieee80211_ops {
        /*
         * Every member is a callback for the device; the kerneldoc comment
         * preceding this structure describes the individual members.
         */
        void (*tx)(struct ieee80211_hw *hw,
                   struct ieee80211_tx_control *control,
                   struct sk_buff *skb);
        int (*start)(struct ieee80211_hw *hw);
        void (*stop)(struct ieee80211_hw *hw);
        /* The following three members only exist when CONFIG_PM is set. */
#ifdef CONFIG_PM
        int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan);
        int (*resume)(struct ieee80211_hw *hw);
        void (*set_wakeup)(struct ieee80211_hw *hw, bool enabled);
#endif
        int (*add_interface)(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif);
        int (*change_interface)(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                enum nl80211_iftype new_type, bool p2p);
        void (*remove_interface)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif);
        int (*config)(struct ieee80211_hw *hw, u32 changed);
        void (*bss_info_changed)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif,
                                 struct ieee80211_bss_conf *info,
                                 u32 changed);

        int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
        void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);

        u64 (*prepare_multicast)(struct ieee80211_hw *hw,
                                 struct netdev_hw_addr_list *mc_list);
        void (*configure_filter)(struct ieee80211_hw *hw,
                                 unsigned int changed_flags,
                                 unsigned int *total_flags,
                                 u64 multicast);
        void (*config_iface_filter)(struct ieee80211_hw *hw,
                                    struct ieee80211_vif *vif,
                                    unsigned int filter_flags,
                                    unsigned int changed_flags);
        int (*set_tim)(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
                       bool set);
        int (*set_key)(struct ieee80211_hw *hw, enum set_key_cmd cmd,
                       struct ieee80211_vif *vif, struct ieee80211_sta *sta,
                       struct ieee80211_key_conf *key);
        void (*update_tkip_key)(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                struct ieee80211_key_conf *conf,
                                struct ieee80211_sta *sta,
                                u32 iv32, u16 *phase1key);
        void (*set_rekey_data)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               struct cfg80211_gtk_rekey_data *data);
        void (*set_default_unicast_key)(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif, int idx);
        int (*hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                       struct ieee80211_scan_request *req);
        void (*cancel_hw_scan)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif);
        int (*sched_scan_start)(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                struct cfg80211_sched_scan_request *req,
                                struct ieee80211_scan_ies *ies);
        int (*sched_scan_stop)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif);
        void (*sw_scan_start)(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif,
                              const u8 *mac_addr);
        void (*sw_scan_complete)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif);
        int (*get_stats)(struct ieee80211_hw *hw,
                         struct ieee80211_low_level_stats *stats);
        void (*get_key_seq)(struct ieee80211_hw *hw,
                            struct ieee80211_key_conf *key,
                            struct ieee80211_key_seq *seq);
        int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value);
        int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value);
        int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                       struct ieee80211_sta *sta);
        int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                          struct ieee80211_sta *sta);
        /* Only exists when CONFIG_MAC80211_DEBUGFS is set. */
#ifdef CONFIG_MAC80211_DEBUGFS
        void (*sta_add_debugfs)(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                struct ieee80211_sta *sta,
                                struct dentry *dir);
#endif
        void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                           enum sta_notify_cmd cmd, struct ieee80211_sta *sta);
        int (*sta_set_txpwr)(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             struct ieee80211_sta *sta);
        int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                         struct ieee80211_sta *sta,
                         enum ieee80211_sta_state old_state,
                         enum ieee80211_sta_state new_state);
        void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw,
                                   struct ieee80211_vif *vif,
                                   struct ieee80211_sta *sta);
        void (*sta_rc_update)(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif,
                              struct ieee80211_sta *sta,
                              u32 changed);
        void (*sta_rate_tbl_update)(struct ieee80211_hw *hw,
                                    struct ieee80211_vif *vif,
                                    struct ieee80211_sta *sta);
        void (*sta_statistics)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               struct ieee80211_sta *sta,
                               struct station_info *sinfo);
        int (*conf_tx)(struct ieee80211_hw *hw,
                       struct ieee80211_vif *vif, u16 ac,
                       const struct ieee80211_tx_queue_params *params);
        u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
        void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                        u64 tsf);
        void (*offset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                           s64 offset);
        void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
        int (*tx_last_beacon)(struct ieee80211_hw *hw);

        /**
         * @ampdu_action:
         * Perform a certain A-MPDU action.
         * The RA/TID combination determines the destination and TID we want
         * the ampdu action to be performed for. The action is defined through
         * ieee80211_ampdu_mlme_action.
         * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver
         * may neither send aggregates containing more subframes than @buf_size
         * nor send aggregates in a way that lost frames would exceed the
         * buffer size. If just limiting the aggregate size, this would be
         * possible with a buf_size of 8:
         *
         * - ``TX: 1.....7``
         * - ``RX:  2....7`` (lost frame #1)
         * - ``TX:        8..1...``
         *
         * which is invalid since #1 was now re-transmitted well past the
         * buffer size of 8. Correct ways to retransmit #1 would be:
         *
         * - ``TX:        1   or``
         * - ``TX:        18  or``
         * - ``TX:        81``
         *
         * Even ``189`` would be wrong since 1 could be lost again.
         *
         * Returns a negative error code on failure. The driver may return
         * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START
         * if the session can start immediately.
         *
         * The callback can sleep.
         */
        int (*ampdu_action)(struct ieee80211_hw *hw,
                            struct ieee80211_vif *vif,
                            struct ieee80211_ampdu_params *params);
        int (*get_survey)(struct ieee80211_hw *hw, int idx,
                          struct survey_info *survey);
        void (*rfkill_poll)(struct ieee80211_hw *hw);
        void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class);
        /* The testmode members only exist when CONFIG_NL80211_TESTMODE is set. */
#ifdef CONFIG_NL80211_TESTMODE
        int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                            void *data, int len);
        int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb,
                             struct netlink_callback *cb,
                             void *data, int len);
#endif
        void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                      u32 queues, bool drop);
        void (*channel_switch)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               struct ieee80211_channel_switch *ch_switch);
        int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
        int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);

        int (*remain_on_channel)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif,
                                 struct ieee80211_channel *chan,
                                 int duration,
                                 enum ieee80211_roc_type type);
        int (*cancel_remain_on_channel)(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif);
        int (*set_ringparam)(struct ieee80211_hw *hw, u32 tx, u32 rx);
        void (*get_ringparam)(struct ieee80211_hw *hw,
                              u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max);
        bool (*tx_frames_pending)(struct ieee80211_hw *hw);
        int (*set_bitrate_mask)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                                const struct cfg80211_bitrate_mask *mask);
        void (*event_callback)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               const struct ieee80211_event *event);

        void (*allow_buffered_frames)(struct ieee80211_hw *hw,
                                      struct ieee80211_sta *sta,
                                      u16 tids, int num_frames,
                                      enum ieee80211_frame_release_type reason,
                                      bool more_data);
        void (*release_buffered_frames)(struct ieee80211_hw *hw,
                                        struct ieee80211_sta *sta,
                                        u16 tids, int num_frames,
                                        enum ieee80211_frame_release_type reason,
                                        bool more_data);

        int (*get_et_sset_count)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif, int sset);
        void (*get_et_stats)(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             struct ethtool_stats *stats, u64 *data);
        void (*get_et_strings)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               u32 sset, u8 *data);

        void (*mgd_prepare_tx)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               u16 duration);

        void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif);

        int (*add_chanctx)(struct ieee80211_hw *hw,
                           struct ieee80211_chanctx_conf *ctx);
        void (*remove_chanctx)(struct ieee80211_hw *hw,
                               struct ieee80211_chanctx_conf *ctx);
        void (*change_chanctx)(struct ieee80211_hw *hw,
                               struct ieee80211_chanctx_conf *ctx,
                               u32 changed);
        int (*assign_vif_chanctx)(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  struct ieee80211_chanctx_conf *ctx);
        void (*unassign_vif_chanctx)(struct ieee80211_hw *hw,
                                     struct ieee80211_vif *vif,
                                     struct ieee80211_chanctx_conf *ctx);
        int (*switch_vif_chanctx)(struct ieee80211_hw *hw,
                                  struct ieee80211_vif_chanctx_switch *vifs,
                                  int n_vifs,
                                  enum ieee80211_chanctx_switch_mode mode);

        void (*reconfig_complete)(struct ieee80211_hw *hw,
                                  enum ieee80211_reconfig_type reconfig_type);

        /* Only exists when CONFIG_IPV6 is enabled. */
#if IS_ENABLED(CONFIG_IPV6)
        void (*ipv6_addr_change)(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif,
                                 struct inet6_dev *idev);
#endif
        void (*channel_switch_beacon)(struct ieee80211_hw *hw,
                                      struct ieee80211_vif *vif,
                                      struct cfg80211_chan_def *chandef);
        int (*pre_channel_switch)(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  struct ieee80211_channel_switch *ch_switch);

        int (*post_channel_switch)(struct ieee80211_hw *hw,
                                   struct ieee80211_vif *vif);
        void (*abort_channel_switch)(struct ieee80211_hw *hw,
                                     struct ieee80211_vif *vif);
        void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw,
                                         struct ieee80211_vif *vif,
                                         struct ieee80211_channel_switch *ch_switch);

        int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
        void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
        u32 (*get_expected_throughput)(struct ieee80211_hw *hw,
                                       struct ieee80211_sta *sta);
        int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                           int *dbm);

        int (*tdls_channel_switch)(struct ieee80211_hw *hw,
                                   struct ieee80211_vif *vif,
                                   struct ieee80211_sta *sta, u8 oper_class,
                                   struct cfg80211_chan_def *chandef,
                                   struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie);
        void (*tdls_cancel_channel_switch)(struct ieee80211_hw *hw,
                                           struct ieee80211_vif *vif,
                                           struct ieee80211_sta *sta);
        void (*tdls_recv_channel_switch)(struct ieee80211_hw *hw,
                                         struct ieee80211_vif *vif,
                                         struct ieee80211_tdls_ch_sw_params *params);

        void (*wake_tx_queue)(struct ieee80211_hw *hw,
                              struct ieee80211_txq *txq);
        void (*sync_rx_queues)(struct ieee80211_hw *hw);

        int (*start_nan)(struct ieee80211_hw *hw,
                         struct ieee80211_vif *vif,
                         struct cfg80211_nan_conf *conf);
        int (*stop_nan)(struct ieee80211_hw *hw,
                        struct ieee80211_vif *vif);
        int (*nan_change_conf)(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               struct cfg80211_nan_conf *conf, u32 changes);
        int (*add_nan_func)(struct ieee80211_hw *hw,
                            struct ieee80211_vif *vif,
                            const struct cfg80211_nan_func *nan_func);
        void (*del_nan_func)(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             u8 instance_id);
        bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw,
                                       struct sk_buff *head,
                                       struct sk_buff *skb);
        int (*get_ftm_responder_stats)(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       struct cfg80211_ftm_responder_stats *ftm_stats);
        int (*start_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                          struct cfg80211_pmsr_request *request);
        void (*abort_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                           struct cfg80211_pmsr_request *request);
        int (*set_tid_config)(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif,
                              struct ieee80211_sta *sta,
                              struct cfg80211_tid_config *tid_conf);
        int (*reset_tid_config)(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                struct ieee80211_sta *sta, u8 tids);
        void (*update_vif_offload)(struct ieee80211_hw *hw,
                                   struct ieee80211_vif *vif);
        void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                              struct ieee80211_sta *sta, bool enabled);
};

/**
 * ieee80211_alloc_hw_nm - Allocate a new hardware device
 *
 * This must be called once for each hardware device. The returned pointer
 * must be used to refer to this device when calling other functions.
 * mac80211 allocates a private data area for the driver pointed to by
 * @priv in &struct ieee80211_hw, the size of this area is given as
 * @priv_data_len.
 *
 * @priv_data_len: length of private data
 * @ops: callbacks for this device
 * @requested_name: Requested name for this device.
 *        NULL is a valid value and means use the default naming (phy%d).
 *
 * Return: A pointer to the new hardware device, or %NULL on error.
 */
struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
                                           const struct ieee80211_ops *ops,
                                           const char *requested_name);

/**
 * ieee80211_alloc_hw - Allocate a new hardware device
 *
 * Same as ieee80211_alloc_hw_nm(), except that no device name is requested
 * (%NULL is passed as the requested name).
 *
 * Call this exactly once per hardware device; the pointer it returns is
 * what must be handed to other functions to refer to the device.
 * mac80211 allocates a private data area for the driver pointed to by
 * @priv in &struct ieee80211_hw, the size of this area is given as
 * @priv_data_len.
 *
 * @priv_data_len: length of private data
 * @ops: callbacks for this device
 *
 * Return: A pointer to the new hardware device, or %NULL on error.
 */
static inline
struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
                                        const struct ieee80211_ops *ops)
{
        /* No particular device name is requested by this variant. */
        const char *requested_name = NULL;

        return ieee80211_alloc_hw_nm(priv_data_len, ops, requested_name);
}

/**
 * ieee80211_register_hw - Register hardware device
 *
 * You must call this function before any other functions in
 * mac80211. Note that before a hardware can be registered, you
 * need to fill the contained wiphy's information.
 *
 * @hw: the device to register as returned by ieee80211_alloc_hw()
 *
 * Return: 0 on success. An error code otherwise.
 */
int ieee80211_register_hw(struct ieee80211_hw *hw);

/**
 * struct ieee80211_tpt_blink - throughput blink description
 * @throughput: throughput in Kbit/sec
 * @blink_time: blink time in milliseconds
 *        (full cycle, i.e. one off + one on period)
 *
 * Entries of this type make up the blink table that is passed to
 * ieee80211_create_tpt_led_trigger().
 */
struct ieee80211_tpt_blink {
        int throughput;
        int blink_time;
};

/**
 * enum ieee80211_tpt_led_trigger_flags - throughput trigger flags
 * @IEEE80211_TPT_LEDTRIG_FL_RADIO: enable blinking with radio
 * @IEEE80211_TPT_LEDTRIG_FL_WORK: enable blinking when working
 * @IEEE80211_TPT_LEDTRIG_FL_CONNECTED: enable blinking when at least one
 *        interface is connected in some way, including being an AP
 *
 * Each flag is a separate bit (BIT(0) to BIT(2)). These are the values
 * referred to by the @flags argument of ieee80211_create_tpt_led_trigger().
 */
enum ieee80211_tpt_led_trigger_flags {
        IEEE80211_TPT_LEDTRIG_FL_RADIO                = BIT(0),
        IEEE80211_TPT_LEDTRIG_FL_WORK                = BIT(1),
        IEEE80211_TPT_LEDTRIG_FL_CONNECTED        = BIT(2),
};

#ifdef CONFIG_MAC80211_LEDS
/*
 * Helpers that are only declared when CONFIG_MAC80211_LEDS is set. The
 * ieee80211_get_*_led_name() and ieee80211_create_tpt_led_trigger() inline
 * wrappers in this header forward to them in that configuration and return
 * NULL otherwise.
 */
const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw);
const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw);
const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw);
const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw);
const char *
__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
                                   unsigned int flags,
                                   const struct ieee80211_tpt_blink *blink_table,
                                   unsigned int blink_table_len);
#endif
/**
 * ieee80211_get_tx_led_name - get name of TX LED
 *
 * For each wireless hardware mac80211 creates a transmit LED trigger, which
 * can be used to drive LEDs if your driver registers a LED device. The name
 * returned here (or %NULL if not configured for LEDs) identifies that
 * trigger so you can automatically link the LED device.
 *
 * @hw: the hardware to get the LED trigger name for
 *
 * Return: The name of the LED trigger. %NULL if not configured for LEDs.
 */
static inline const char *ieee80211_get_tx_led_name(struct ieee80211_hw *hw)
{
        /* Stays NULL unless LED support is compiled in. */
        const char *name = NULL;

#ifdef CONFIG_MAC80211_LEDS
        name = __ieee80211_get_tx_led_name(hw);
#endif
        return name;
}

/**
 * ieee80211_get_rx_led_name - get name of RX LED
 *
 * For each wireless hardware mac80211 creates a receive LED trigger, which
 * can be used to drive LEDs if your driver registers a LED device. The name
 * returned here (or %NULL if not configured for LEDs) identifies that
 * trigger so you can automatically link the LED device.
 *
 * @hw: the hardware to get the LED trigger name for
 *
 * Return: The name of the LED trigger. %NULL if not configured for LEDs.
 */
static inline const char *ieee80211_get_rx_led_name(struct ieee80211_hw *hw)
{
        /* Stays NULL unless LED support is compiled in. */
        const char *name = NULL;

#ifdef CONFIG_MAC80211_LEDS
        name = __ieee80211_get_rx_led_name(hw);
#endif
        return name;
}

/**
 * ieee80211_get_assoc_led_name - get name of association LED
 *
 * For each wireless hardware mac80211 creates an association LED trigger,
 * which can be used to drive LEDs if your driver registers a LED device.
 * The name returned here (or %NULL if not configured for LEDs) identifies
 * that trigger so you can automatically link the LED device.
 *
 * @hw: the hardware to get the LED trigger name for
 *
 * Return: The name of the LED trigger. %NULL if not configured for LEDs.
 */
static inline const char *ieee80211_get_assoc_led_name(struct ieee80211_hw *hw)
{
        /* Stays NULL unless LED support is compiled in. */
        const char *name = NULL;

#ifdef CONFIG_MAC80211_LEDS
        name = __ieee80211_get_assoc_led_name(hw);
#endif
        return name;
}

/**
 * ieee80211_get_radio_led_name - get name of radio LED
 *
 * For each wireless hardware mac80211 creates a radio change LED trigger,
 * which can be used to drive LEDs if your driver registers a LED device.
 * The name returned here (or %NULL if not configured for LEDs) identifies
 * that trigger so you can automatically link the LED device.
 *
 * @hw: the hardware to get the LED trigger name for
 *
 * Return: The name of the LED trigger. %NULL if not configured for LEDs.
 */
static inline const char *ieee80211_get_radio_led_name(struct ieee80211_hw *hw)
{
        /* Stays NULL unless LED support is compiled in. */
        const char *name = NULL;

#ifdef CONFIG_MAC80211_LEDS
        name = __ieee80211_get_radio_led_name(hw);
#endif
        return name;
}

/**
 * ieee80211_create_tpt_led_trigger - create throughput LED trigger
 * @hw: the hardware to create the trigger for
 * @flags: trigger flags, see &enum ieee80211_tpt_led_trigger_flags
 * @blink_table: the blink table -- needs to be ordered by throughput
 * @blink_table_len: size of the blink table
 *
 * Return: the name of the new trigger, or %NULL (in case of error, or if
 * no LED triggers are configured).
 *
 * Note: This function must be called before ieee80211_register_hw().
 */
static inline const char *
ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags,
                                 const struct ieee80211_tpt_blink *blink_table,
                                 unsigned int blink_table_len)
{
        /* Stays NULL unless LED support is compiled in. */
        const char *trigger_name = NULL;

#ifdef CONFIG_MAC80211_LEDS
        trigger_name = __ieee80211_create_tpt_led_trigger(hw, flags,
                                                          blink_table,
                                                          blink_table_len);
#endif
        return trigger_name;
}

/**
 * ieee80211_unregister_hw - Unregister a hardware device
 *
 * This function instructs mac80211 to free allocated resources
 * and unregister netdevices from the networking subsystem.
 *
 * @hw: the hardware to unregister
 */
void ieee80211_unregister_hw(struct ieee80211_hw *hw);

/**
 * ieee80211_free_hw - free hardware descriptor
 *
 * This function frees everything that was allocated, including the
 * private data for the driver. You must call ieee80211_unregister_hw()
 * before calling this function.
 *
 * @hw: the hardware to free
 */
void ieee80211_free_hw(struct ieee80211_hw *hw);

/**
 * ieee80211_restart_hw - restart hardware completely
 *
 * Call this function when the hardware was restarted for some reason
 * (hardware error, ...) and the driver is unable to restore its state
 * by itself. mac80211 assumes that at this point the driver/hardware
 * is completely uninitialised and stopped, it starts the process by
 * calling the ->start() operation. The driver will need to reset all
 * internal state that it has prior to calling this function.
 *
 * @hw: the hardware to restart
 */
void ieee80211_restart_hw(struct ieee80211_hw *hw);

/**
 * ieee80211_rx_list - receive frame and store processed skbs in a list
 *
 * Use this function to hand received frames to mac80211. The receive
 * buffer in @skb must start with an IEEE 802.11 header. In case a
 * paged @skb is used, the driver is recommended to put the ieee80211
 * header of the frame on the linear part of the @skb to avoid memory
 * allocation and/or memcpy by the stack.
 *
 * This function may not be called in IRQ context. Calls to this function
 * for a single hardware must be synchronized against each other. Calls to
 * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
 * mixed for a single hardware. Must not run concurrently with
 * ieee80211_tx_status() or ieee80211_tx_status_ni().
 *
 * This function must be called with BHs disabled and RCU read lock held.
 *
 * @hw: the hardware this frame came in on
 * @sta: the station the frame was received from, or %NULL
 * @skb: the buffer to receive, owned by mac80211 after this call
 * @list: the destination list
 */
void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
                       struct sk_buff *skb, struct list_head *list);

/**
 * ieee80211_rx_napi - receive frame from NAPI context
 *
 * Use this function to hand received frames to mac80211. The receive
 * buffer in @skb must start with an IEEE 802.11 header. In case a
 * paged @skb is used, the driver is recommended to put the ieee80211
 * header of the frame on the linear part of the @skb to avoid memory
 * allocation and/or memcpy by the stack.
 *
 * This function may not be called in IRQ context. Calls to this function
 * for a single hardware must be synchronized against each other. Calls to
 * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
 * mixed for a single hardware. Must not run concurrently with
 * ieee80211_tx_status() or ieee80211_tx_status_ni().
 *
 * This function must be called with BHs disabled.
 *
 * @hw: the hardware this frame came in on
 * @sta: the station the frame was received from, or %NULL
 * @skb: the buffer to receive, owned by mac80211 after this call
 * @napi: the NAPI context
 */
void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta,
                       struct sk_buff *skb, struct napi_struct *napi);

/**
 * ieee80211_rx - receive frame
 *
 * Hands a received frame to mac80211 by calling ieee80211_rx_napi() with
 * neither a station nor a NAPI context.
 *
 * The receive buffer in @skb must start with an IEEE 802.11 header. If a
 * paged @skb is used, the driver is recommended to put the ieee80211
 * header of the frame on the linear part of the @skb to avoid memory
 * allocation and/or memcpy by the stack.
 *
 * This function may not be called in IRQ context. Calls to this function
 * for a single hardware must be synchronized against each other. Calls to
 * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
 * mixed for a single hardware. Must not run concurrently with
 * ieee80211_tx_status() or ieee80211_tx_status_ni().
 *
 * In process context use ieee80211_rx_ni() instead.
 *
 * @hw: the hardware this frame came in on
 * @skb: the buffer to receive, owned by mac80211 after this call
 */
static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
{
        /* This entry point supplies neither a station nor a NAPI context. */
        struct ieee80211_sta *sta = NULL;
        struct napi_struct *napi = NULL;

        ieee80211_rx_napi(hw, sta, skb, napi);
}

/**
 * ieee80211_rx_irqsafe - receive frame
 *
 * Like ieee80211_rx() but can be called in IRQ context
 * (internally defers to a tasklet.)
 *
 * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not
 * be mixed for a single hardware. Must not run concurrently with
 * ieee80211_tx_status() or ieee80211_tx_status_ni().
 *
 * @hw: the hardware this frame came in on
 * @skb: the buffer to receive, owned by mac80211 after this call
 */
void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb);

/**
 * ieee80211_rx_ni - receive frame (in process context)
 *
 * Like ieee80211_rx() but can be called in process context
 * (internally disables bottom halves).
 *
 * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may
 * not be mixed for a single hardware. Must not run concurrently with
 * ieee80211_tx_status() or ieee80211_tx_status_ni().
 *
 * @hw: the hardware this frame came in on
 * @skb: the buffer to receive, owned by mac80211 after this call
 */
static inline void ieee80211_rx_ni(struct ieee80211_hw *hw,
                                   struct sk_buff *skb)
{
        /*
         * ieee80211_rx() forwards to ieee80211_rx_napi(), which must be
         * called with BHs disabled, so keep them off for the whole call.
         */
        local_bh_disable();
        ieee80211_rx(hw, skb);
        local_bh_enable();
}

/**
 * ieee80211_sta_ps_transition - PS transition for connected sta
 *
 * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS
 * flag set, use this function to inform mac80211 about a connected station
 * entering/leaving PS mode.
 *
 * This function may not be called in IRQ context or with softirqs enabled.
 *
 * Calls to this function for a single hardware must be synchronized against
 * each other.
 *
 * @sta: currently connected sta
 * @start: start or stop PS
 *
 * Return: 0 on success. -EINVAL when the requested PS mode is already set.
 */
int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start);

/**
 * ieee80211_sta_ps_transition_ni - PS transition for connected sta
 *                                  (in process context)
 *
 * Process-context variant of ieee80211_sta_ps_transition(): bottom halves
 * are disabled internally around the call. The restriction on concurrent
 * calls still applies.
 *
 * @sta: currently connected sta
 * @start: start or stop PS
 *
 * Return: Like ieee80211_sta_ps_transition().
 */
static inline int ieee80211_sta_ps_transition_ni(struct ieee80211_sta *sta,
                                                  bool start)
{
        int status;

        /* Keep BHs off only for the duration of the transition call. */
        local_bh_disable();
        status = ieee80211_sta_ps_transition(sta, start);
        local_bh_enable();

        return status;
}

/**
 * ieee80211_sta_pspoll - PS-Poll frame received
 * @sta: currently connected station
 *
 * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set,
 * use this function to inform mac80211 that a PS-Poll frame from a
 * connected station was received.
 * This must be used in conjunction with ieee80211_sta_ps_transition()
 * and possibly ieee80211_sta_uapsd_trigger(); calls to all three must
 * be serialized.
 */
void ieee80211_sta_pspoll(struct ieee80211_sta *sta);

/**
 * ieee80211_sta_uapsd_trigger - (potential) U-APSD trigger frame received
 * @sta: currently connected station
 * @tid: TID of the received (potential) trigger frame
 *
 * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set,
 * use this function to inform mac80211 that a (potential) trigger frame
 * from a connected station was received.
 * This must be used in conjunction with ieee80211_sta_ps_transition()
 * and possibly ieee80211_sta_pspoll(); calls to all three must be
 * serialized.
 * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown.
 * In this case, mac80211 will not check that this tid maps to an AC
 * that is trigger enabled and assume that the caller did the proper
 * checks.
 */
void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid);

/*
 * The TX headroom reserved by mac80211 for its own tx_status functions.
 * This is enough for the radiotap header.
 *
 * ALIGN(14, 4) rounds 14 up to the next multiple of 4, so this is 16 bytes.
 */
#define IEEE80211_TX_STATUS_HEADROOM        ALIGN(14, 4)

/**
 * ieee80211_sta_set_buffered - inform mac80211 about driver-buffered frames
 * @sta: &struct ieee80211_sta pointer for the sleeping station
 * @tid: the TID that has buffered frames
 * @buffered: indicates whether or not frames are buffered for this TID
 *
 * If a driver buffers frames for a powersave station instead of passing
 * them back to mac80211 for retransmission, the station may still need
 * to be told that there are buffered frames via the TIM bit.
 *
 * This function informs mac80211 whether or not there are frames that are
 * buffered in the driver for a given TID; mac80211 can then use this data
 * to set the TIM bit (NOTE: This may call back into the driver's set_tim
 * call! Beware of the locking!)
 *
 * If all frames are released to the station (due to PS-poll or uAPSD)
 * then the driver needs to inform mac80211 that there no longer are
 * frames buffered. However, when the station wakes up mac80211 assumes
 * that all buffered frames will be transmitted and clears this data,
 * drivers need to make sure they inform mac80211 about all buffered
 * frames on the sleep transition (sta_notify() with %STA_NOTIFY_SLEEP).
 *
 * Note that technically mac80211 only needs to know this per AC, not per
 * TID, but since driver buffering will inevitably happen per TID (since
 * it is related to aggregation) it is easier to make mac80211 map the
 * TID to the AC as required instead of keeping track in all drivers that
 * use this API.
 */
void ieee80211_sta_set_buffered(struct ieee80211_sta *sta,
                                u8 tid, bool buffered);

/**
 * ieee80211_get_tx_rates - get the selected transmit rates for a packet
 *
 * Call this function in a driver with per-packet rate selection support
 * to combine the rate info in the packet tx info with the most recent
 * rate selection table for the station entry.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @sta: the receiver station to which this packet is sent.
 * @skb: the frame to be transmitted.
 * @dest: buffer for extracted rate/retry information
 * @max_rates: maximum number of rates to fetch
 */
void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
                            struct ieee80211_sta *sta,
                            struct sk_buff *skb,
                            struct ieee80211_tx_rate *dest,
                            int max_rates);

/**
 * ieee80211_sta_set_expected_throughput - set the expected tpt for a station
 *
 * Call this function to notify mac80211 about a change in expected throughput
 * to a station. A driver for a device that does rate control in firmware can
 * call this function when the expected throughput estimate towards a station
 * changes. The information is used to tune the CoDel AQM applied to traffic
 * going towards that station (which can otherwise be too aggressive and cause
 * slow stations to starve).
 *
 * @pubsta: the station to set throughput for.
 * @thr: the current expected throughput in kbps.
 */
void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
                                           u32 thr);

/**
 * ieee80211_tx_rate_update - transmit rate update callback
 *
 * Drivers should call this function with a non-NULL pub sta.
 * This function can be used in drivers that do not have a provision
 * for updating the tx rate in the data path.
 *
 * @hw: the hardware the frame was transmitted by
 * @pubsta: the station to update the tx rate for.
 * @info: tx status information
 */
void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
                              struct ieee80211_sta *pubsta,
                              struct ieee80211_tx_info *info);

/**
 * ieee80211_tx_status - transmit status callback
 *
 * Call this function for all transmitted frames after they have been
 * transmitted. It is permissible to not call this function for
 * multicast frames but this can affect statistics.
 *
 * This function may not be called in IRQ context. Calls to this function
 * for a single hardware must be synchronized against each other. Calls
 * to this function, ieee80211_tx_status_ni() and ieee80211_tx_status_irqsafe()
 * may not be mixed for a single hardware. Must not run concurrently with
 * ieee80211_rx() or ieee80211_rx_ni().
 *
 * @hw: the hardware the frame was transmitted by
 * @skb: the frame that was transmitted, owned by mac80211 after this call
 */
void ieee80211_tx_status(struct ieee80211_hw *hw,
                         struct sk_buff *skb);

/**
 * ieee80211_tx_status_ext - extended transmit status callback
 *
 * This function can be used as a replacement for ieee80211_tx_status
 * in drivers that may want to provide extra information that does not
 * fit into &struct ieee80211_tx_info.
 *
 * Calls to this function for a single hardware must be synchronized
 * against each other. Calls to this function, ieee80211_tx_status_ni()
 * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware.
 *
 * @hw: the hardware the frame was transmitted by
 * @status: tx status information
 */
void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
                             struct ieee80211_tx_status *status);

/**
 * ieee80211_tx_status_noskb - transmit status callback without skb
 *
 * This function can be used as a replacement for ieee80211_tx_status
 * in drivers that cannot reliably map tx status information back to
 * specific skbs.
 *
 * Calls to this function for a single hardware must be synchronized
 * against each other. Calls to this function, ieee80211_tx_status_ni()
 * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware.
 *
 * @hw: the hardware the frame was transmitted by
 * @sta: the receiver station to which this packet is sent
 *        (NULL for multicast packets)
 * @info: tx status information
 */
static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
                                             struct ieee80211_sta *sta,
                                             struct ieee80211_tx_info *info)
{
        struct ieee80211_tx_status status = {
                .sta = sta,
                .info = info,
        };

        ieee80211_tx_status_ext(hw, &status);
}

/**
 * ieee80211_tx_status_ni - transmit status callback (in process context)
 *
 * Like ieee80211_tx_status() but can be called in process context
 * (internally disables bottom halves around the call).
 *
 * Calls to this function, ieee80211_tx_status() and
 * ieee80211_tx_status_irqsafe() may not be mixed
 * for a single hardware.
 *
 * @hw: the hardware the frame was transmitted by
 * @skb: the frame that was transmitted, owned by mac80211 after this call
 */
static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw,
                                          struct sk_buff *skb)
{
        /* Run ieee80211_tx_status() with bottom halves disabled. */
        local_bh_disable();
        ieee80211_tx_status(hw, skb);
        local_bh_enable();
}

/**
 * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback
 *
 * Like ieee80211_tx_status() but can be called in IRQ context
 * (internally defers to a tasklet.)
 *
 * Calls to this function, ieee80211_tx_status() and
 * ieee80211_tx_status_ni() may not be mixed for a single hardware.
 *
 * @hw: the hardware the frame was transmitted by
 * @skb: the frame that was transmitted, owned by mac80211 after this call
 */
void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw,
                                 struct sk_buff *skb);

/**
 * ieee80211_tx_status_8023 - transmit status callback for 802.3 frame format
 *
 * Call this function for all transmitted data frames after their transmit
 * completion. This callback should only be called for data frames which
 * are using driver's (or hardware's) offload capability of encap/decap
 * 802.11 frames.
 *
 * This function may not be called in IRQ context. Calls to this function
 * for a single hardware must be synchronized against each other and all
 * calls in the same tx status family.
 *
 * @hw: the hardware the frame was transmitted by
 * @vif: the interface for which the frame was transmitted
 * @skb: the frame that was transmitted, owned by mac80211 after this call
 */
void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif,
                               struct sk_buff *skb);

/**
 * ieee80211_report_low_ack - report non-responding station
 *
 * When operating in AP-mode, call this function to report a non-responding
 * connected STA.
 *
 * @sta: the non-responding connected sta
 * @num_packets: number of packets sent to @sta without a response
 */
void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets);

/* Number of entries in &struct ieee80211_mutable_offsets.cntdwn_counter_offs. */
#define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2

/**
 * struct ieee80211_mutable_offsets - mutable beacon offsets
 * @tim_offset: position of TIM element
 * @tim_length: size of TIM element
 * @cntdwn_counter_offs: array of %IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets
 *        to countdown counters.  This array can contain zero values which
 *        should be ignored.
 *
 * Filled in through the @offs argument of ieee80211_beacon_get_template().
 */
struct ieee80211_mutable_offsets {
        u16 tim_offset;
        u16 tim_length;

        u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
};

/**
 * ieee80211_beacon_get_template - beacon template generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @offs: &struct ieee80211_mutable_offsets pointer to struct that will
 *        receive the offsets that may be updated by the driver.
 *
 * If the driver implements beaconing modes, it must use this function to
 * obtain the beacon template.
 *
 * This function should be used if the beacon frames are generated by the
 * device, and then the driver must use the returned beacon as the template.
 * The driver or the device is responsible for updating the DTIM and, when
 * applicable, the CSA count.
 *
 * The driver is responsible for freeing the returned skb.
 *
 * Return: The beacon template. %NULL on error.
 */
struct sk_buff *
ieee80211_beacon_get_template(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif,
                              struct ieee80211_mutable_offsets *offs);

/**
 * ieee80211_beacon_get_tim - beacon generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @tim_offset: pointer to variable that will receive the TIM IE offset.
 *        Set to 0 if invalid (in non-AP modes).
 * @tim_length: pointer to variable that will receive the TIM IE length,
 *        (including the ID and length bytes!).
 *        Set to 0 if invalid (in non-AP modes).
 *
 * If the driver implements beaconing modes, it must use this function to
 * obtain the beacon frame.
 *
 * If the beacon frames are generated by the host system (i.e., not in
 * hardware/firmware), the driver uses this function to get each beacon
 * frame from mac80211 -- it is responsible for calling this function exactly
 * once before the beacon is needed (e.g. based on hardware interrupt).
 *
 * The driver is responsible for freeing the returned skb.
 *
 * Return: The beacon template. %NULL on error.
 */
struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
                                         struct ieee80211_vif *vif,
                                         u16 *tim_offset, u16 *tim_length);

/**
 * ieee80211_beacon_get - beacon generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Convenience wrapper around ieee80211_beacon_get_tim() that passes %NULL
 * for both the TIM offset and the TIM length output pointers. See
 * ieee80211_beacon_get_tim() for the full description.
 *
 * Return: See ieee80211_beacon_get_tim().
 */
static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
                                                   struct ieee80211_vif *vif)
{
        struct sk_buff *beacon;

        /* No TIM offset/length is requested from this wrapper. */
        beacon = ieee80211_beacon_get_tim(hw, vif, NULL, NULL);

        return beacon;
}

/**
 * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * The beacon counter should be updated after each beacon transmission.
 * This function is called implicitly when
 * ieee80211_beacon_get/ieee80211_beacon_get_tim are called, however if the
 * beacon frames are generated by the device, the driver should call this
 * function after each beacon transmission to sync mac80211's beacon countdown.
 *
 * Return: new countdown value
 */
u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif);

/**
 * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @counter: the new value for the counter
 *
 * The beacon countdown can be changed by the device, this API should be
 * used by the device driver to update csa counter in mac80211.
 *
 * It should never be used together with ieee80211_beacon_update_cntdwn(),
 * as it will cause a race condition around the counter value.
 */
void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter);

/**
 * ieee80211_csa_finish - notify mac80211 about channel switch
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * After a channel switch announcement was scheduled and the counter in this
 * announcement hits 1, this function must be called by the driver to
 * notify mac80211 that the channel can be changed.
 */
void ieee80211_csa_finish(struct ieee80211_vif *vif);

/**
 * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Return: %true if the countdown reached 1, %false otherwise.
 */
bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif);

/**
 * ieee80211_proberesp_get - retrieve a Probe Response template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Creates a Probe Response template which can, for example, be uploaded to
 * hardware. The destination address should be set by the caller.
 *
 * Can only be called in AP mode.
 *
 * Return: The Probe Response template. %NULL on error.
 */
struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif);

/**
 * ieee80211_pspoll_get - retrieve a PS Poll template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Creates a PS Poll template which can, for example, be uploaded to
 * hardware. The template must be updated after association so that the
 * correct AID, BSSID and MAC address are used.
 *
 * Note: Caller (or hardware) is responsible for setting the
 * &IEEE80211_FCTL_PM bit.
 *
 * Return: The PS Poll template. %NULL on error.
 */
struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
                                     struct ieee80211_vif *vif);

/**
 * ieee80211_nullfunc_get - retrieve a nullfunc template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @qos_ok: QoS NDP is acceptable to the caller, this should be set
 *        if at all possible
 *
 * Creates a Nullfunc template which can, for example, be uploaded to
 * hardware. The template must be updated after association so that the
 * correct BSSID and address are used.
 *
 * If @qos_ok is set and the association is to an AP with QoS/WMM, the
 * returned packet will be QoS NDP.
 *
 * Note: Caller (or hardware) is responsible for setting the
 * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields.
 *
 * Return: The nullfunc template. %NULL on error.
 */
struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       bool qos_ok);

/**
 * ieee80211_probereq_get - retrieve a Probe Request template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @src_addr: source MAC address
 * @ssid: SSID buffer
 * @ssid_len: length of SSID
 * @tailroom: tailroom to reserve at end of SKB for IEs
 *
 * Creates a Probe Request template which can, for example, be uploaded to
 * hardware.
 *
 * Return: The Probe Request template. %NULL on error.
 */
struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw,
                                       const u8 *src_addr,
                                       const u8 *ssid, size_t ssid_len,
                                       size_t tailroom);

/**
 * ieee80211_rts_get - RTS frame generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame: pointer to the frame that is going to be protected by the RTS.
 * @frame_len: the frame length (in octets).
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 * @rts: The buffer where to store the RTS frame.
 *
 * If the RTS frames are generated by the host system (i.e., not in
 * hardware/firmware), the low-level driver uses this function to receive
 * the next RTS frame from the 802.11 code. The low-level driver is
 * responsible for calling this function before an RTS frame is needed.
 */
void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                       const void *frame, size_t frame_len,
                       const struct ieee80211_tx_info *frame_txctl,
                       struct ieee80211_rts *rts);

/**
 * ieee80211_rts_duration - Get the duration field for an RTS frame
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame_len: the length of the frame that is going to be protected by the RTS.
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 *
 * If the RTS is generated in firmware, but the host system must provide
 * the duration field, the low-level driver uses this function to receive
 * the duration field value in little-endian byteorder.
 *
 * Return: The duration.
 */
__le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif, size_t frame_len,
                              const struct ieee80211_tx_info *frame_txctl);

/**
 * ieee80211_ctstoself_get - CTS-to-self frame generation function
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame: pointer to the frame that is going to be protected by the CTS-to-self.
 * @frame_len: the frame length (in octets).
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 * @cts: The buffer where to store the CTS-to-self frame.
 *
 * If the CTS-to-self frames are generated by the host system (i.e., not in
 * hardware/firmware), the low-level driver uses this function to receive
 * the next CTS-to-self frame from the 802.11 code. The low-level driver is
 * responsible for calling this function before a CTS-to-self frame is needed.
 */
void ieee80211_ctstoself_get(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             const void *frame, size_t frame_len,
                             const struct ieee80211_tx_info *frame_txctl,
                             struct ieee80211_cts *cts);

/**
 * ieee80211_ctstoself_duration - Get the duration field for a CTS-to-self frame
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @frame_len: the length of the frame that is going to be protected by the CTS-to-self.
 * @frame_txctl: &struct ieee80211_tx_info of the frame.
 *
 * If the CTS-to-self is generated in firmware, but the host system must provide
 * the duration field, the low-level driver uses this function to receive
 * the duration field value in little-endian byteorder.
 *
 * Return: The duration.
 */
__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
                                    struct ieee80211_vif *vif,
                                    size_t frame_len,
                                    const struct ieee80211_tx_info *frame_txctl);

/**
 * ieee80211_generic_frame_duration - Calculate the duration field for a frame
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @band: the band to calculate the frame duration on
 * @frame_len: the length of the frame.
 * @rate: the rate at which the frame is going to be transmitted.
 *
 * Calculate the duration field of some generic frame, given its
 * length and transmission rate (in 100kbps).
 *
 * Return: The duration.
 */
__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif,
                                        enum nl80211_band band,
                                        size_t frame_len,
                                        struct ieee80211_rate *rate);

/**
 * ieee80211_get_buffered_bc - accessing buffered broadcast and multicast frames
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Function for accessing buffered broadcast and multicast frames. If
 * hardware/firmware does not implement buffering of broadcast/multicast
 * frames when power saving is used, 802.11 code buffers them in the host
 * memory. The low-level driver uses this function to fetch next buffered
 * frame. In most cases, this is used when generating beacon frame.
 *
 * Return: A pointer to the next buffered skb or NULL if no more buffered
 * frames are available.
 *
 * Note: buffered frames are returned only after DTIM beacon frame was
 * generated with ieee80211_beacon_get() and the low-level driver must thus
 * call ieee80211_beacon_get() first. ieee80211_get_buffered_bc() returns
 * NULL if the previous generated beacon was not DTIM, so the low-level driver
 * does not need to check for DTIM beacons separately and should be able to
 * use common code for all beacons.
 */
struct sk_buff *
ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif);

/**
 * ieee80211_get_tkip_p1k_iv - get a TKIP phase 1 key for IV32
 *
 * This function returns the TKIP phase 1 key for the given IV32.
 *
 * @keyconf: the parameter passed with the set key
 * @iv32: IV32 to get the P1K for
 * @p1k: a buffer to which the key will be written, as 5 u16 values
 */
void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf,
                               u32 iv32, u16 *p1k);

/**
 * ieee80211_get_tkip_p1k - get a TKIP phase 1 key
 *
 * Reads the IV32 from the given packet and obtains the matching TKIP
 * phase 1 key through ieee80211_get_tkip_p1k_iv().
 *
 * @keyconf: the parameter passed with the set key
 * @skb: the packet to take the IV32 value from that will be encrypted
 *        with this P1K
 * @p1k: a buffer to which the key will be written, as 5 u16 values
 */
static inline void ieee80211_get_tkip_p1k(struct ieee80211_key_conf *keyconf,
                                          struct sk_buff *skb, u16 *p1k)
{
        const struct ieee80211_hdr *mac_hdr;
        const u8 *crypto_hdr;

        mac_hdr = (const struct ieee80211_hdr *)skb->data;

        /* The bytes read below sit right after the 802.11 header. */
        crypto_hdr = (const u8 *)mac_hdr +
                     ieee80211_hdrlen(mac_hdr->frame_control);

        /* IV32 is an unaligned little-endian u32 at byte offset 4. */
        ieee80211_get_tkip_p1k_iv(keyconf, get_unaligned_le32(crypto_hdr + 4),
                                  p1k);
}

/**
 * ieee80211_get_tkip_rx_p1k - get a TKIP phase 1 key for RX
 *
 * This function returns the TKIP phase 1 key for the given IV32
 * and transmitter address.
 *
 * @keyconf: the parameter passed with the set key
 * @ta: TA that will be used with the key
 * @iv32: IV32 to get the P1K for
 * @p1k: a buffer to which the key will be written, as 5 u16 values
 */
void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf,
                               const u8 *ta, u32 iv32, u16 *p1k);

/**
 * ieee80211_get_tkip_p2k - get a TKIP phase 2 key
 *
 * This function computes the TKIP RC4 key for the IV values
 * in the packet.
 *
 * @keyconf: the parameter passed with the set key
 * @skb: the packet to take the IV32/IV16 values from that will be
 *        encrypted with this key
 * @p2k: a buffer to which the key will be written, 16 bytes
 */
void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
                            struct sk_buff *skb, u8 *p2k);

/**
 * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos
 *
 * @pos: start of crypto header
 * @keyconf: the parameter passed with the set key
 * @pn: PN to add
 *
 * Returns: pointer to the octet following IVs (i.e. beginning of
 * the packet payload)
 *
 * This function writes the tkip IV value to pos (which should
 * point to the crypto header)
 */
u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);

/**
 * ieee80211_get_key_rx_seq - get key RX sequence counter
 *
 * @keyconf: the parameter passed with the set key
 * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only);
 *        the value on TID 0 is also used for non-QoS frames. For
 *        CMAC, only TID 0 is valid.
 * @seq: buffer to receive the sequence data
 *
 * This function allows a driver to retrieve the current RX IV/PNs
 * for the given key. It must not be called if IV checking is done
 * by the device and not by mac80211.
 *
 * Note that this function may only be called when no RX processing
 * can be done concurrently.
 */
void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
                              int tid, struct ieee80211_key_seq *seq);

/**
 * ieee80211_set_key_rx_seq - set key RX sequence counter
 *
 * @keyconf: the parameter passed with the set key
 * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only);
 *        the value on TID 0 is also used for non-QoS frames. For
 *        CMAC, only TID 0 is valid.
 * @seq: new sequence data
 *
 * This function allows a driver to set the current RX IV/PNs for the
 * given key. This is useful when resuming from WoWLAN sleep and GTK
 * rekey may have been done while suspended. It should not be called
 * if IV checking is done by the device and not by mac80211.
 *
 * Note that this function may only be called when no RX processing
 * can be done concurrently.
 */
void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
                              int tid, struct ieee80211_key_seq *seq);

/**
 * ieee80211_remove_key - remove the given key
 * @keyconf: the parameter passed with the set key
 *
 * Remove the given key. If the key was uploaded to the hardware at the
 * time this function is called, it is not deleted in the hardware but
 * instead assumed to have been removed already.
 *
 * Note that due to locking considerations this function can (currently)
 * only be called during key iteration (ieee80211_iter_keys().)
 */
void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);

/**
 * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
 * @vif: the virtual interface to add the key on
 * @keyconf: new key data
 *
 * When GTK rekeying was done while the system was suspended, (a) new
 * key(s) will be available. These will be needed by mac80211 for proper
 * RX processing, so this function allows setting them.
 *
 * The function returns the newly allocated key structure, which will
 * have similar contents to the passed key configuration but point to
 * mac80211-owned memory. In case of errors, the function returns an
 * ERR_PTR(), use IS_ERR() etc.
 *
 * Note that this function assumes the key isn't added to hardware
 * acceleration, so no TX will be done with the key. Since it's a GTK
 * on managed (station) networks, this is true anyway. If the driver
 * calls this function from the resume callback and subsequently uses
 * the return code 1 to reconfigure the device, this key will be part
 * of the reconfiguration.
 *
 * Note that the driver should also call ieee80211_set_key_rx_seq()
 * for the new key for each TID to set up sequence counters properly.
 *
 * IMPORTANT: If this replaces a key that is present in the hardware,
 * then it will attempt to remove it during this call. In many cases
 * this isn't what you want, so call ieee80211_remove_key() first for
 * the key that's being replaced.
 */
struct ieee80211_key_conf *
ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
                        struct ieee80211_key_conf *keyconf);

/**
 * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying
 * @vif: virtual interface the rekeying was done on
 * @bssid: The BSSID of the AP, for checking association
 * @replay_ctr: the new replay counter after GTK rekeying
 * @gfp: allocation flags
 */
void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
                                const u8 *replay_ctr, gfp_t gfp);

/**
 * ieee80211_wake_queue - wake specific queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers should use this function instead of netif_wake_queue.
 */
void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_stop_queue - stop specific queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers should use this function instead of netif_stop_queue.
 */
void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_queue_stopped - test status of the queue
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 * @queue: queue number (counted from zero).
 *
 * Drivers should use this function instead of netif_queue_stopped.
 *
 * Return: %true if the queue is stopped. %false otherwise.
 */

int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue);

/**
 * ieee80211_stop_queues - stop all queues
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 *
 * Drivers should use this function instead of netif_stop_queue.
 */
void ieee80211_stop_queues(struct ieee80211_hw *hw);

/**
 * ieee80211_wake_queues - wake all queues
 * @hw: pointer as obtained from ieee80211_alloc_hw().
 *
 * Drivers should use this function instead of netif_wake_queue.
 */
void ieee80211_wake_queues(struct ieee80211_hw *hw);

/**
 * ieee80211_scan_completed - completed hardware scan
 *
 * When hardware scan offload is used (i.e. the hw_scan() callback is
 * assigned) this function needs to be called by the driver to notify
 * mac80211 that the scan finished. This function can be called from
 * any context, including hardirq context.
 *
 * @hw: the hardware that finished the scan
 * @info: information about the completed scan
 */
void ieee80211_scan_completed(struct ieee80211_hw *hw,
                              struct cfg80211_scan_info *info);

/**
 * ieee80211_sched_scan_results - got results from scheduled scan
 *
 * When a scheduled scan is running, this function needs to be called by the
 * driver whenever there are new scan results available.
 *
 * @hw: the hardware that is performing scheduled scans
 */
void ieee80211_sched_scan_results(struct ieee80211_hw *hw);

/**
 * ieee80211_sched_scan_stopped - inform that the scheduled scan has stopped
 *
 * When a scheduled scan is running, this function can be called by
 * the driver if it needs to stop the scan to perform another task.
 * Usual scenarios are drivers that cannot continue the scheduled scan
 * while associating, for instance.
 *
 * @hw: the hardware that is performing scheduled scans
 */
void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw);

/**
 * enum ieee80211_interface_iteration_flags - interface iteration flags
 * @IEEE80211_IFACE_ITER_NORMAL: Iterate over all interfaces that have
 *        been added to the driver; however, note that during hardware
 *        reconfiguration (after restart_hw) it will iterate over a new
 *        interface and over all the existing interfaces even if they
 *        haven't been re-added to the driver yet.
 * @IEEE80211_IFACE_ITER_RESUME_ALL: During resume, iterate over all
 *        interfaces, even if they haven't been re-added to the driver yet.
 * @IEEE80211_IFACE_ITER_ACTIVE: Iterate only active interfaces (netdev is up).
 * @IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER: Skip any interfaces where SDATA
 *        is not in the driver.  This may fix crashes during firmware recovery
 *        for instance.
 *
 * Apart from %IEEE80211_IFACE_ITER_NORMAL, which is 0 (no flag set), every
 * value is a distinct bit, so the flags can be OR'ed together. They are
 * passed as the iter_flags argument of the interface iteration functions.
 */
enum ieee80211_interface_iteration_flags {
        IEEE80211_IFACE_ITER_NORMAL        = 0,
        IEEE80211_IFACE_ITER_RESUME_ALL        = BIT(0),
        IEEE80211_IFACE_ITER_ACTIVE        = BIT(1),
        IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER        = BIT(2),
};

/**
 * ieee80211_iterate_interfaces - iterate interfaces
 *
 * This function iterates over the interfaces associated with a given
 * hardware and calls the callback for them. This includes active as well as
 * inactive interfaces. This function allows the iterator function to sleep.
 * Will iterate over a new interface during add_interface().
 *
 * @hw: the hardware struct of which the interfaces should be iterated over
 * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags
 * @iterator: the iterator function to call
 * @data: first argument of the iterator function
 */
void ieee80211_iterate_interfaces(struct ieee80211_hw *hw, u32 iter_flags,
                                  void (*iterator)(void *data, u8 *mac,
                                                   struct ieee80211_vif *vif),
                                  void *data);

/**
 * ieee80211_iterate_active_interfaces - iterate active interfaces
 *
 * Walks the interfaces associated with a given hardware that are currently
 * active and invokes the callback for each of them. The iterator function
 * is allowed to sleep; if it has to be atomic,
 * ieee80211_iterate_active_interfaces_atomic() can be used instead.
 * Does not iterate over a new interface during add_interface().
 *
 * This is a thin wrapper around ieee80211_iterate_interfaces() that adds
 * %IEEE80211_IFACE_ITER_ACTIVE to the caller-supplied flags.
 *
 * @hw: the hardware struct of which the interfaces should be iterated over
 * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags
 * @iterator: the iterator function to call
 * @data: first argument of the iterator function
 */
static inline void
ieee80211_iterate_active_interfaces(struct ieee80211_hw *hw, u32 iter_flags,
                                    void (*iterator)(void *data, u8 *mac,
                                                     struct ieee80211_vif *vif),
                                    void *data)
{
        u32 active_flags = iter_flags | IEEE80211_IFACE_ITER_ACTIVE;

        ieee80211_iterate_interfaces(hw, active_flags, iterator, data);
}

/**
 * ieee80211_iterate_active_interfaces_atomic - iterate active interfaces
 *
 * This function iterates over the interfaces associated with a given
 * hardware that are currently active and calls the callback for them.
 * This function requires the iterator callback function to be atomic;
 * if that is not desired, use ieee80211_iterate_active_interfaces() instead.
 * Does not iterate over a new interface during add_interface().
 *
 * @hw: the hardware struct of which the interfaces should be iterated over
 * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags
 * @iterator: the iterator function to call, cannot sleep
 * @data: first argument of the iterator function
 */
void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw,
                                                u32 iter_flags,
                                                void (*iterator)(void *data,
                                                    u8 *mac,
                                                    struct ieee80211_vif *vif),
                                                void *data);

/**
 * ieee80211_iterate_active_interfaces_rtnl - iterate active interfaces
 *
 * This function iterates over the interfaces associated with a given
 * hardware that are currently active and calls the callback for them.
 * This version can only be used while holding the RTNL.
 *
 * @hw: the hardware struct of which the interfaces should be iterated over
 * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags
 * @iterator: the iterator function to call, cannot sleep
 * @data: first argument of the iterator function
 */
void ieee80211_iterate_active_interfaces_rtnl(struct ieee80211_hw *hw,
                                              u32 iter_flags,
                                              void (*iterator)(void *data,
                                                u8 *mac,
                                                struct ieee80211_vif *vif),
                                              void *data);

/**
 * ieee80211_iterate_stations_atomic - iterate stations
 *
 * This function iterates over all stations associated with a given
 * hardware that are currently uploaded to the driver and calls the callback
 * function for them.
 * This function requires the iterator callback function to be atomic.
 *
 * @hw: the hardware struct of which the stations should be iterated over
 * @iterator: the iterator function to call, cannot sleep
 * @data: first argument of the iterator function
 */
void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
                                       void (*iterator)(void *data,
                                                struct ieee80211_sta *sta),
                                       void *data);
/**
 * ieee80211_queue_work - add work onto the mac80211 workqueue
 *
 * Drivers and mac80211 use this to add work onto the mac80211 workqueue.
 * This helper ensures drivers are not queueing work when they should not be.
 *
 * @hw: the hardware struct for the interface we are adding work for
 * @work: the work we want to add onto the mac80211 workqueue
 */
void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work);

/**
 * ieee80211_queue_delayed_work - add work onto the mac80211 workqueue
 *
 * Drivers and mac80211 use this to queue delayed work onto the mac80211
 * workqueue.
 *
 * @hw: the hardware struct for the interface we are adding work for
 * @dwork: delayable work to queue onto the mac80211 workqueue
 * @delay: number of jiffies to wait before queueing
 */
void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
                                  struct delayed_work *dwork,
                                  unsigned long delay);

/**
 * ieee80211_start_tx_ba_session - Start a tx Block Ack session.
 * @sta: the station for which to start a BA session
 * @tid: the TID to BA on.
 * @timeout: session timeout value (in TUs)
 *
 * Return: success if addBA request was sent, failure otherwise
 *
 * Although mac80211/low level driver/user space application can estimate
 * the need to start aggregation on a certain RA/TID, the session level
 * will be managed by the mac80211.
 */
int ieee80211_start_tx_ba_session(struct ieee80211_sta *sta, u16 tid,
                                  u16 timeout);

/**
 * ieee80211_start_tx_ba_cb_irqsafe - low level driver ready to aggregate.
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @ra: receiver address of the BA session recipient.
 * @tid: the TID to BA on.
 *
 * This function must be called by low level driver once it has
 * finished with preparations for the BA session. It can be called
 * from any context.
 */
void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra,
                                      u16 tid);

/**
 * ieee80211_stop_tx_ba_session - Stop a Block Ack session.
 * @sta: the station whose BA session to stop
 * @tid: the TID to stop BA.
 *
 * Return: negative error if the TID is invalid, or no aggregation active
 *
 * Although mac80211/low level driver/user space application can estimate
 * the need to stop aggregation on a certain RA/TID, the session level
 * will be managed by the mac80211.
 */
int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid);

/**
 * ieee80211_stop_tx_ba_cb_irqsafe - low level driver ready to stop aggregate.
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @ra: receiver address of the BA session recipient.
 * @tid: the desired TID to BA on.
 *
 * This function must be called by low level driver once it has
 * finished with preparations for the BA session tear down. It
 * can be called from any context.
 */
void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra,
                                     u16 tid);

/**
 * ieee80211_find_sta - find a station
 *
 * @vif: virtual interface to look for station on
 * @addr: station's address
 *
 * Return: The station, if found. %NULL otherwise.
 *
 * Note: This function must be called under RCU lock and the
 * resulting pointer is only valid under RCU lock as well.
 */
struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
                                         const u8 *addr);

/**
 * ieee80211_find_sta_by_ifaddr - find a station on hardware
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @addr: remote station's address
 * @localaddr: local address (vif->sdata->vif.addr). Use NULL for 'any'.
 *
 * Return: The station, if found. %NULL otherwise.
 *
 * Note: This function must be called under RCU lock and the
 * resulting pointer is only valid under RCU lock as well.
 *
 * NOTE: You may pass NULL for localaddr, but then you will just get
 *      the first STA that matches the remote address 'addr'.
 *      We can have multiple STA associated with multiple
 *      logical stations (e.g. consider a station connecting to another
 *      BSSID on the same AP hardware without disconnecting first).
 *      In this case, the result of this method with localaddr NULL
 *      is not reliable.
 *
 * DO NOT USE THIS FUNCTION with localaddr NULL if at all possible.
 */
struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
                                               const u8 *addr,
                                               const u8 *localaddr);

/**
 * ieee80211_sta_block_awake - block station from waking up
 * @hw: the hardware
 * @pubsta: the station
 * @block: whether to block or unblock
 *
 * Some devices require that all frames that are on the queues
 * for a specific station that went to sleep are flushed before
 * a poll response or frames after the station woke up can be
 * delivered to it. Note that such frames must be rejected
 * by the driver as filtered, with the appropriate status flag.
 *
 * This function allows implementing this mode in a race-free
 * manner.
 *
 * To do this, a driver must keep track of the number of frames
 * still enqueued for a specific station. If this number is not
 * zero when the station goes to sleep, the driver must call
 * this function to force mac80211 to consider the station to
 * be asleep regardless of the station's actual state. Once the
 * number of outstanding frames reaches zero, the driver must
 * call this function again to unblock the station. That will
 * cause mac80211 to be able to send ps-poll responses, and if
 * the station queried in the meantime then frames will also
 * be sent out as a result of this. Additionally, the driver
 * will be notified that the station woke up some time after
 * it is unblocked, regardless of whether the station actually
 * woke up while blocked or not.
 */
void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
                               struct ieee80211_sta *pubsta, bool block);

/**
 * ieee80211_sta_eosp - notify mac80211 about end of SP
 * @pubsta: the station
 *
 * When a device transmits frames in a way that it can't tell
 * mac80211 in the TX status about the EOSP, it must clear the
 * %IEEE80211_TX_STATUS_EOSP bit and call this function instead.
 * This applies for PS-Poll as well as uAPSD.
 *
 * Note that just like with _tx_status() and _rx() drivers must
 * not mix calls to irqsafe/non-irqsafe versions, this function
 * must not be mixed with those either. Use the all irqsafe, or
 * all non-irqsafe, don't mix!
 *
 * NB: the _irqsafe version of this function doesn't exist, no
 *     driver needs it right now. Don't call this function if
 *     you'd need the _irqsafe version, look at the git history
 *     and restore the _irqsafe version!
 */
void ieee80211_sta_eosp(struct ieee80211_sta *pubsta);

/**
 * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP
 * @pubsta: the station
 * @tid: the tid of the NDP
 *
 * Sometimes the device understands that it needs to close
 * the Service Period unexpectedly. This can happen when
 * sending frames that are filling holes in the BA window.
 * In this case, the device can ask mac80211 to send a
 * Nullfunc frame with EOSP set. When that happens, the
 * driver must have called ieee80211_sta_set_buffered() to
 * let mac80211 know that there are no buffered frames any
 * more, otherwise mac80211 will get the more_data bit wrong.
 * The low level driver must have made sure that the frame
 * will be sent despite the station being in power-save.
 * Mac80211 won't call allow_buffered_frames().
 * Note that calling this function doesn't exempt the driver
 * from closing the EOSP properly, it will still have to call
 * ieee80211_sta_eosp when the NDP is sent.
 */
void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid);

/**
 * ieee80211_sta_register_airtime - register airtime usage for a sta/tid
 *
 * Register airtime usage for a given sta on a given tid. The driver must call
 * this function to notify mac80211 that a station used a certain amount of
 * airtime. This information will be used by the TXQ scheduler to schedule
 * stations in a way that ensures airtime fairness.
 *
 * The reported airtime should as a minimum include all time that is spent
 * transmitting to the remote station, including overhead and padding, but not
 * including time spent waiting for a TXOP. If the time is not reported by the
 * hardware it can in some cases be calculated from the rate and known frame
 * composition. When possible, the time should include any failed transmission
 * attempts.
 *
 * The driver can either call this function synchronously for every packet or
 * aggregate, or asynchronously as airtime usage information becomes available.
 * TX and RX airtime can be reported together, or separately by setting one of
 * them to 0.
 *
 * @pubsta: the station
 * @tid: the TID to register airtime for
 * @tx_airtime: airtime used during TX (in usec)
 * @rx_airtime: airtime used during RX (in usec)
 */
void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
                                    u32 tx_airtime, u32 rx_airtime);

/**
 * ieee80211_txq_airtime_check - check if a txq can send frame to device
 *
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 *
 * Return: %true if the AQL's airtime limit has not been reached and the txq
 * can continue to send more packets to the device, %false otherwise.
 */
bool
ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq);

/**
 * ieee80211_iter_keys - iterate keys programmed into the device
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @vif: virtual interface to iterate, may be %NULL for all
 * @iter: iterator function that will be called for each key
 * @iter_data: custom data to pass to the iterator function
 *
 * This function can be used to iterate all the keys known to
 * mac80211, even those that weren't previously programmed into
 * the device. This is intended for use in WoWLAN if the device
 * needs reprogramming of the keys during suspend. Note that due
 * to locking reasons, it is also only safe to call this at few
 * spots since it must hold the RTNL and be able to sleep.
 *
 * The order in which the keys are iterated matches the order
 * in which they were originally installed and handed to the
 * set_key callback.
 */
void ieee80211_iter_keys(struct ieee80211_hw *hw,
                         struct ieee80211_vif *vif,
                         void (*iter)(struct ieee80211_hw *hw,
                                      struct ieee80211_vif *vif,
                                      struct ieee80211_sta *sta,
                                      struct ieee80211_key_conf *key,
                                      void *data),
                         void *iter_data);

/**
 * ieee80211_iter_keys_rcu - iterate keys programmed into the device
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @vif: virtual interface to iterate, may be %NULL for all
 * @iter: iterator function that will be called for each key
 * @iter_data: custom data to pass to the iterator function
 *
 * This function can be used to iterate all the keys known to
 * mac80211, even those that weren't previously programmed into
 * the device. Note that due to locking reasons, keys of station
 * in removal process will be skipped.
 *
 * This function requires being called in an RCU critical section,
 * and thus iter must be atomic.
 */
void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             void (*iter)(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif,
                                          struct ieee80211_sta *sta,
                                          struct ieee80211_key_conf *key,
                                          void *data),
                             void *iter_data);

/**
 * ieee80211_iter_chan_contexts_atomic - iterate channel contexts
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @iter: iterator function
 * @iter_data: data passed to iterator function
 *
 * Iterate all active channel contexts. This function is atomic and
 * doesn't acquire any locks internally that might be held in other
 * places while calling into the driver.
 *
 * The iterator will not find a context that's being added (during
 * the driver callback to add it) but will find it while it's being
 * removed.
 *
 * Note that during hardware restart, all contexts that existed
 * before the restart are considered already present so will be
 * found while iterating, whether they've been re-added already
 * or not.
 */
void ieee80211_iter_chan_contexts_atomic(
        struct ieee80211_hw *hw,
        void (*iter)(struct ieee80211_hw *hw,
                     struct ieee80211_chanctx_conf *chanctx_conf,
                     void *data),
        void *iter_data);

/**
 * ieee80211_ap_probereq_get - retrieve a Probe Request template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Creates a Probe Request template which can, for example, be uploaded to
 * hardware. The template is filled with bssid, ssid and supported rate
 * information. This function must only be called from within the
 * .bss_info_changed callback function and only in managed mode. The function
 * is only useful when the interface is associated, otherwise it will return
 * %NULL.
 *
 * Return: The Probe Request template. %NULL on error.
 */
struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif);

/**
 * ieee80211_beacon_loss - inform hardware does not receive beacons
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER and
 * %IEEE80211_CONF_PS is set, the driver needs to inform whenever the
 * hardware is not receiving beacons with this function.
 */
void ieee80211_beacon_loss(struct ieee80211_vif *vif);

/**
 * ieee80211_connection_loss - inform hardware has lost connection to the AP
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER, and
 * %IEEE80211_CONF_PS and %IEEE80211_HW_CONNECTION_MONITOR are set, the driver
 * needs to inform if the connection to the AP has been lost.
 * The function may also be called if the connection needs to be terminated
 * for some other reason, even if %IEEE80211_HW_CONNECTION_MONITOR isn't set.
 *
 * This function will cause immediate change to disassociated state,
 * without connection recovery attempts.
 */
void ieee80211_connection_loss(struct ieee80211_vif *vif);

/**
 * ieee80211_resume_disconnect - disconnect from AP after resume
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Instructs mac80211 to disconnect from the AP after resume.
 * Drivers can use this after WoWLAN if they know that the
 * connection cannot be kept up, for example because keys were
 * used while the device was asleep but the replay counters or
 * similar cannot be retrieved from the device during resume.
 *
 * Note that due to implementation issues, if the driver uses
 * the reconfiguration functionality during resume the interface
 * will still be added as associated first during resume and then
 * disconnect normally later.
 *
 * This function can only be called from the resume callback and
 * the driver must not be holding any of its own locks while it
 * calls this function, or at least not any locks it needs in the
 * key configuration paths (if it supports HW crypto).
 */
void ieee80211_resume_disconnect(struct ieee80211_vif *vif);

/**
 * ieee80211_hw_restart_disconnect - disconnect from AP after
 * hardware restart
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Instructs mac80211 to disconnect from the AP after
 * hardware restart.
 */
void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif);

/**
 * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring
 *        rssi threshold triggered
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @rssi_event: the RSSI trigger event type
 * @rssi_level: new RSSI level value or 0 if not available
 * @gfp: context flags
 *
 * When the %IEEE80211_VIF_SUPPORTS_CQM_RSSI is set, and a connection quality
 * monitoring is configured with an rssi threshold, the driver will inform
 * whenever the rssi level reaches the threshold.
 */
void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
                               enum nl80211_cqm_rssi_threshold_event rssi_event,
                               s32 rssi_level,
                               gfp_t gfp);

/**
 * ieee80211_cqm_beacon_loss_notify - inform CQM of beacon loss
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @gfp: context flags
 */
void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp);

/**
 * ieee80211_radar_detected - inform that a radar was detected
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_radar_detected(struct ieee80211_hw *hw);

/**
 * ieee80211_chswitch_done - Complete channel switch process
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @success: make the channel switch successful or not
 *
 * Complete the channel switch post-process: set the new operational channel
 * and wake up the suspended queues.
 */
void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success);

/**
 * ieee80211_request_smps - request SM PS transition
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @smps_mode: new SM PS mode
 *
 * This allows the driver to request an SM PS transition in managed
 * mode. This is useful when the driver has more information than
 * the stack about possible interference, for example by bluetooth.
 */
void ieee80211_request_smps(struct ieee80211_vif *vif,
                            enum ieee80211_smps_mode smps_mode);

/**
 * ieee80211_ready_on_channel - notification of remain-on-channel start
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_ready_on_channel(struct ieee80211_hw *hw);

/**
 * ieee80211_remain_on_channel_expired - remain_on_channel duration expired
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw);

/**
 * ieee80211_stop_rx_ba_session - callback to stop existing BA sessions
 *
 * In order not to harm the system performance and user experience, the device
 * may request not to allow any rx ba session and tear down existing rx ba
 * sessions based on system constraints such as periodic BT activity that needs
 * to limit wlan activity (e.g. SCO or A2DP).
 * In such cases, the intention is to limit the duration of the rx ppdu and
 * therefore prevent the peer device from using a-mpdu aggregation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @ba_rx_bitmap: Bit map of open rx ba per tid
 * @addr: pointer to the BSSID MAC address
 */
void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
                                  const u8 *addr);

/**
 * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered
 * @pubsta: station struct
 * @tid: the session's TID
 * @ssn: starting sequence number of the bitmap, all frames before this are
 *        assumed to be out of the window after the call
 * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc.
 * @received_mpdus: number of received mpdus in firmware
 *
 * This function moves the BA window and releases all frames before @ssn, and
 * marks frames marked in the bitmap as having been filtered. Afterwards, it
 * checks if any frames in the window starting from @ssn can now be released
 * (in case they were only waiting for frames that were filtered.)
 */
void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
                                          u16 ssn, u64 filtered,
                                          u16 received_mpdus);

/**
 * ieee80211_send_bar - send a BlockAckReq frame
 *
 * can be used to flush pending frames from the peer's aggregation reorder
 * buffer.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @ra: the peer's destination address
 * @tid: the TID of the aggregation session
 * @ssn: the new starting sequence number for the receiver
 */
void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn);

/**
 * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr,
                                 unsigned int tid);

/**
 * ieee80211_start_rx_ba_session_offl - start a Rx BA session
 *
 * Some device drivers may offload part of the Rx aggregation flow including
 * AddBa/DelBa negotiation but may otherwise be incapable of full Rx
 * reordering.
 *
 * Such drivers call here once AddBa negotiation has completed so that the
 * structures responsible for reordering get created.
 *
 * The request is forwarded to ieee80211_manage_rx_ba_offl(). If @tid is not
 * below %IEEE80211_NUM_TIDS, a warning is emitted and nothing is queued.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif,
                                                      const u8 *addr, u16 tid)
{
        /* Only valid TIDs are forwarded; WARN_ON() flags anything else. */
        if (!WARN_ON(tid >= IEEE80211_NUM_TIDS))
                ieee80211_manage_rx_ba_offl(vif, addr, tid);
}

/**
 * ieee80211_stop_rx_ba_session_offl - stop a Rx BA session
 *
 * Some device drivers may offload part of the Rx aggregation flow including
 * AddBa/DelBa negotiation but may otherwise be incapable of full Rx
 * reordering.
 *
 * Such drivers call here once DelBa negotiation has completed so that the
 * structures responsible for reordering get destroyed.
 *
 * The request is forwarded to ieee80211_manage_rx_ba_offl() with @tid offset
 * by %IEEE80211_NUM_TIDS. If @tid is not below %IEEE80211_NUM_TIDS, a warning
 * is emitted and nothing is queued.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif,
                                                     const u8 *addr, u16 tid)
{
        /* Only valid TIDs are forwarded; WARN_ON() flags anything else. */
        if (!WARN_ON(tid >= IEEE80211_NUM_TIDS))
                ieee80211_manage_rx_ba_offl(vif, addr,
                                            tid + IEEE80211_NUM_TIDS);
}

/**
 * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout
 *
 * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx
 * buffer reordering internally, and therefore also handle the session timer.
 *
 * Trigger the timeout flow, which sends a DelBa.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif,
                                   const u8 *addr, unsigned int tid);

/* Rate control API */

/**
 * struct ieee80211_tx_rate_control - rate control information for/from RC algo
 *
 * A pointer to this structure is the last argument of the get_rate()
 * callback in &struct rate_control_ops.
 *
 * @hw: The hardware the algorithm is invoked for.
 * @sband: The band this frame is being transmitted on.
 * @bss_conf: the current BSS configuration
 * @skb: the skb that will be transmitted; the control information in it
 *        needs to be filled in
 * @reported_rate: The rate control algorithm can fill this in to indicate
 *        which rate should be reported to userspace as the current rate and
 *        used for rate calculations in the mesh network.
 * @rts: whether RTS will be used for this frame because it is longer than the
 *        RTS threshold
 * @short_preamble: whether mac80211 will request short-preamble transmission
 *        if the selected rate supports it
 * @rate_idx_mask: user-requested (legacy) rate mask
 * @rate_idx_mcs_mask: user-requested MCS rate mask (NULL if not in use)
 * @bss: whether this frame is sent out in AP or IBSS mode
 */
struct ieee80211_tx_rate_control {
        struct ieee80211_hw *hw;
        struct ieee80211_supported_band *sband;
        struct ieee80211_bss_conf *bss_conf;
        struct sk_buff *skb;
        struct ieee80211_tx_rate reported_rate;
        bool rts, short_preamble;
        u32 rate_idx_mask;
        u8 *rate_idx_mcs_mask;
        bool bss;
};

/**
 * enum rate_control_capabilities - rate control capabilities
 *
 * Each capability is a distinct bit, so several can be combined in one mask.
 * NOTE(review): presumably this mask is what the capa member of
 * &struct rate_control_ops holds - confirm against the users of that field.
 */
enum rate_control_capabilities {
        /**
         * @RATE_CTRL_CAPA_VHT_EXT_NSS_BW:
         * Support for extended NSS BW support (dot11VHTExtendedNSSCapable)
         * Note that this is only looked at if the minimum number of chains
         * that the AP uses is < the number of TX chains the hardware has,
         * otherwise the NSS difference doesn't bother us.
         */
        RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0),
};

/*
 * struct rate_control_ops - callbacks implemented by a rate control algorithm
 *
 * A table of this type is what ieee80211_rate_control_register() and
 * ieee80211_rate_control_unregister() take. Apart from alloc(), every
 * callback receives an opaque 'priv' and/or 'priv_sta' pointer.
 * NOTE(review): presumably 'priv' is the value returned by alloc() and
 * 'priv_sta' the value returned by alloc_sta() - confirm against the
 * mac80211 core.
 */
struct rate_control_ops {
        /* NOTE(review): looks like a mask of enum rate_control_capabilities bits - confirm */
        unsigned long capa;
        const char *name;
        /* Callbacks taking the hardware and/or the algorithm-wide 'priv' */
        void *(*alloc)(struct ieee80211_hw *hw);
        void (*add_debugfs)(struct ieee80211_hw *hw, void *priv,
                            struct dentry *debugfsdir);
        void (*free)(void *priv);

        /* Callbacks dealing with a single station and its 'priv_sta' */
        void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp);
        void (*rate_init)(void *priv, struct ieee80211_supported_band *sband,
                          struct cfg80211_chan_def *chandef,
                          struct ieee80211_sta *sta, void *priv_sta);
        void (*rate_update)(void *priv, struct ieee80211_supported_band *sband,
                            struct cfg80211_chan_def *chandef,
                            struct ieee80211_sta *sta, void *priv_sta,
                            u32 changed);
        void (*free_sta)(void *priv, struct ieee80211_sta *sta,
                         void *priv_sta);

        /* TX status feedback and rate selection (see struct ieee80211_tx_rate_control) */
        void (*tx_status_ext)(void *priv,
                              struct ieee80211_supported_band *sband,
                              void *priv_sta, struct ieee80211_tx_status *st);
        void (*tx_status)(void *priv, struct ieee80211_supported_band *sband,
                          struct ieee80211_sta *sta, void *priv_sta,
                          struct sk_buff *skb);
        void (*get_rate)(void *priv, struct ieee80211_sta *sta, void *priv_sta,
                         struct ieee80211_tx_rate_control *txrc);

        void (*add_sta_debugfs)(void *priv, void *priv_sta,
                                struct dentry *dir);

        u32 (*get_expected_throughput)(void *priv_sta);
};

/*
 * rate_supported - check whether a rate index is usable for a station
 *
 * Returns 1 when @sta is NULL (no station, nothing to restrict) or when bit
 * @index is set in @sta->supp_rates[@band]; returns 0 otherwise.
 */
static inline int rate_supported(struct ieee80211_sta *sta,
                                 enum nl80211_band band,
                                 int index)
{
        /* Without a station every index is accepted */
        if (!sta)
                return 1;

        return (sta->supp_rates[band] & BIT(index)) != 0;
}

/*
 * rate_lowest_index - find the lowest bitrate index usable for a station
 *
 * Walks the @sband bitrate indices in ascending order and returns the first
 * one accepted by rate_supported(). If none is accepted, warns once and
 * falls back to index 0.
 */
static inline s8
rate_lowest_index(struct ieee80211_supported_band *sband,
                  struct ieee80211_sta *sta)
{
        int idx = 0;

        while (idx < sband->n_bitrates) {
                if (rate_supported(sta, sband->band, idx))
                        return idx;
                idx++;
        }

        /* Nothing matched: warn once ... */
        WARN_ON_ONCE(1);

        /* ... and fall back to 0, the lowest index */
        return 0;
}

/*
 * rate_usable_index_exists - tell whether any bitrate index is usable
 *
 * Returns true as soon as one index below @sband->n_bitrates is accepted by
 * rate_supported() for @sta, false if none is.
 */
static inline
bool rate_usable_index_exists(struct ieee80211_supported_band *sband,
                              struct ieee80211_sta *sta)
{
        unsigned int idx;

        for (idx = 0; idx < sband->n_bitrates; idx++) {
                if (!rate_supported(sta, sband->band, idx))
                        continue;
                return true;
        }

        return false;
}

/**
 * rate_control_set_rates - pass the sta rate selection to mac80211/driver
 *
 * When not doing a rate control probe to test rates, rate control should pass
 * its rate selection to mac80211. If the driver supports receiving a station
 * rate table, it will use it to ensure that frames are always sent based on
 * the most recent rate control module decision.
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @pubsta: &struct ieee80211_sta pointer to the target destination.
 * @rates: new tx rate set to be used for this station.
 */
int rate_control_set_rates(struct ieee80211_hw *hw,
                           struct ieee80211_sta *pubsta,
                           struct ieee80211_sta_rates *rates);

int ieee80211_rate_control_register(const struct rate_control_ops *ops);
void ieee80211_rate_control_unregister(const struct rate_control_ops *ops);

/* True when the configured channel width is NL80211_CHAN_WIDTH_20. */
static inline bool
conf_is_ht20(struct ieee80211_conf *conf)
{
        return conf->chandef.width == NL80211_CHAN_WIDTH_20;
}

/*
 * True when the configured channel width is NL80211_CHAN_WIDTH_40 and
 * center_freq1 lies below the center frequency of the channel itself.
 */
static inline bool
conf_is_ht40_minus(struct ieee80211_conf *conf)
{
        if (conf->chandef.width != NL80211_CHAN_WIDTH_40)
                return false;

        return conf->chandef.center_freq1 < conf->chandef.chan->center_freq;
}

/*
 * True when the configured channel width is NL80211_CHAN_WIDTH_40 and
 * center_freq1 lies above the center frequency of the channel itself.
 */
static inline bool
conf_is_ht40_plus(struct ieee80211_conf *conf)
{
        if (conf->chandef.width != NL80211_CHAN_WIDTH_40)
                return false;

        return conf->chandef.center_freq1 > conf->chandef.chan->center_freq;
}

/*
 * True when the configured channel width is NL80211_CHAN_WIDTH_40, regardless
 * of where center_freq1 sits relative to the channel (compare
 * conf_is_ht40_minus() and conf_is_ht40_plus()).
 */
static inline bool
conf_is_ht40(struct ieee80211_conf *conf)
{
        return conf->chandef.width == NL80211_CHAN_WIDTH_40;
}

/*
 * True for every configured channel width except NL80211_CHAN_WIDTH_5,
 * NL80211_CHAN_WIDTH_10 and NL80211_CHAN_WIDTH_20_NOHT.
 */
static inline bool
conf_is_ht(struct ieee80211_conf *conf)
{
        return !(conf->chandef.width == NL80211_CHAN_WIDTH_5 ||
                 conf->chandef.width == NL80211_CHAN_WIDTH_10 ||
                 conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT);
}

/*
 * ieee80211_iftype_p2p - map an interface type to its P2P variant
 *
 * With @p2p set, NL80211_IFTYPE_STATION becomes NL80211_IFTYPE_P2P_CLIENT and
 * NL80211_IFTYPE_AP becomes NL80211_IFTYPE_P2P_GO. Every other type, and any
 * type when @p2p is false, is returned unchanged.
 */
static inline enum nl80211_iftype
ieee80211_iftype_p2p(enum nl80211_iftype type, bool p2p)
{
        if (!p2p)
                return type;

        if (type == NL80211_IFTYPE_STATION)
                return NL80211_IFTYPE_P2P_CLIENT;

        if (type == NL80211_IFTYPE_AP)
                return NL80211_IFTYPE_P2P_GO;

        return type;
}

/*
 * Convenience wrapper: apply ieee80211_iftype_p2p() to the type and p2p flag
 * stored in @vif.
 */
static inline enum nl80211_iftype
ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
{
        return ieee80211_iftype_p2p(vif->type, vif->p2p);
}

/**
 * ieee80211_update_mu_groups - set the VHT MU-MIMO group data
 *
 * @vif: the specified virtual interface
 * @membership: 64 bits array - a bit is set if station is member of the group
 * @position: 2 bits per group id indicating the position in the group
 *
 * Note: This function assumes that the given vif is valid and the position and
 * membership data is of the correct size and are in the same byte order as the
 * matching GroupId management frame.
 * Calls to this function need to be serialized with RX path.
 */
void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
                                const u8 *membership, const u8 *position);

void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif,
                                   int rssi_min_thold,
                                   int rssi_max_thold);

void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif);

/**
 * ieee80211_ave_rssi - report the average RSSI for the specified interface
 *
 * @vif: the specified virtual interface
 *
 * Note: This function assumes that the given vif is valid.
 *
 * Return: The average RSSI value for the requested interface, or 0 if not
 * applicable.
 */
int ieee80211_ave_rssi(struct ieee80211_vif *vif);

/**
 * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup
 * @vif: virtual interface
 * @wakeup: wakeup reason(s)
 * @gfp: allocation flags
 *
 * See cfg80211_report_wowlan_wakeup().
 */
void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif,
                                    struct cfg80211_wowlan_wakeup *wakeup,
                                    gfp_t gfp);

/**
 * ieee80211_tx_prepare_skb - prepare an 802.11 skb for transmission
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @vif: virtual interface
 * @skb: frame to be sent from within the driver
 * @band: the band to transmit on
 * @sta: optional pointer to get the station to send the frame to
 *
 * Note: must be called under RCU lock
 */
bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif, struct sk_buff *skb,
                              int band, struct ieee80211_sta **sta);

/**
 * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header
 *                                 of injected frames.
 *
 * To accurately parse and take into account rate and retransmission fields,
 * you must initialize the chandef field in the ieee80211_tx_info structure
 * of the skb before calling this function.
 *
 * @skb: packet injected by userspace
 * @dev: the &struct device of this 802.11 device
 */
bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
                                 struct net_device *dev);

/**
 * struct ieee80211_noa_data - holds temporary data for tracking P2P NoA state
 *
 * This is the tracking state that ieee80211_parse_p2p_noa() and
 * ieee80211_update_p2p_noa() take as their @data argument.
 *
 * @next_tsf: TSF timestamp of the next absent state change
 * @has_next_tsf: next absent state change event pending
 *
 * @absent: descriptor bitmask, set if GO is currently absent
 *
 * private:
 *
 * @count: count fields from the NoA descriptors
 * @desc: adjusted data from the NoA
 */
struct ieee80211_noa_data {
        u32 next_tsf;
        bool has_next_tsf;

        u8 absent;

        /* Both arrays hold IEEE80211_P2P_NOA_DESC_MAX entries */
        u8 count[IEEE80211_P2P_NOA_DESC_MAX];
        struct {
                u32 start;
                u32 duration;
                u32 interval;
        } desc[IEEE80211_P2P_NOA_DESC_MAX];
};

/**
 * ieee80211_parse_p2p_noa - initialize NoA tracking data from P2P IE
 *
 * @attr: P2P NoA IE
 * @data: NoA tracking data
 * @tsf: current TSF timestamp
 *
 * Return: number of successfully parsed descriptors
 */
int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr,
                            struct ieee80211_noa_data *data, u32 tsf);

/**
 * ieee80211_update_p2p_noa - get next pending P2P GO absent state change
 *
 * @data: NoA tracking data
 * @tsf: current TSF timestamp
 */
void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf);

/**
 * ieee80211_tdls_oper_request - request userspace to perform a TDLS operation
 * @vif: virtual interface
 * @peer: the peer's destination address
 * @oper: the requested TDLS operation
 * @reason_code: reason code for the operation, valid for TDLS teardown
 * @gfp: allocation flags
 *
 * See cfg80211_tdls_oper_request().
 */
void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
                                 enum nl80211_tdls_operation oper,
                                 u16 reason_code, gfp_t gfp);

/**
 * ieee80211_reserve_tid - request to reserve a specific TID
 *
 * There is sometimes a need (such as in TDLS) for blocking the driver from
 * using a specific TID so that the FW can use it for certain operations such
 * as sending PTI requests. To make sure that the driver doesn't use that TID,
 * this function must be called as it flushes out packets on this TID and marks
 * it as blocked, so that any transmit for the station on this TID will be
 * redirected to the alternative TID in the same AC.
 *
 * Note that this function blocks and may call back into the driver, so it
 * should be called without driver locks held. Also note this function should
 * only be called from the driver's @sta_state callback.
 *
 * @sta: the station to reserve the TID for
 * @tid: the TID to reserve
 *
 * Return: 0 on success, non-zero on failure
 */
int ieee80211_reserve_tid(struct ieee80211_sta *sta, u8 tid);

/**
 * ieee80211_unreserve_tid - request to unreserve a specific TID
 *
 * Once there is no longer any need for reserving a certain TID, this function
 * should be called, and no longer will packets have their TID modified for
 * preventing use of this TID in the driver.
 *
 * Note that this function blocks and acquires a lock, so it should be called
 * without driver locks held. Also note this function should only be called
 * from the driver's @sta_state callback.
 *
 * @sta: the station
 * @tid: the TID to unreserve
 */
void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);

/**
 * ieee80211_tx_dequeue - dequeue a packet from a software tx queue
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface, or from
 *        ieee80211_next_txq()
 *
 * Returns the skb if successful, %NULL if no frame was available.
 *
 * Note that this must be called in an rcu_read_lock() critical section,
 * which can only be released after the SKB was handled. Some pointers in
 * skb->cb, e.g. the key pointer, are protected by RCU and thus the
 * critical section must persist not just for the duration of this call
 * but for the duration of the frame handling.
 * However, also note that while in the wake_tx_queue() method,
 * rcu_read_lock() is already held.
 *
 * softirqs must also be disabled when this function is called.
 * In process context, use ieee80211_tx_dequeue_ni() instead.
 */
struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
                                     struct ieee80211_txq *txq);

/**
 * ieee80211_tx_dequeue_ni - dequeue a packet from a software tx queue
 * (in process context)
 *
 * Variant of ieee80211_tx_dequeue() that may be called from process context:
 * bottom halves are disabled around the dequeue and re-enabled afterwards.
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface, or from
 *        ieee80211_next_txq()
 *
 * Return: whatever ieee80211_tx_dequeue() returned for @hw and @txq.
 */
static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw,
                                                      struct ieee80211_txq *txq)
{
        struct sk_buff *frame;

        /* ieee80211_tx_dequeue() requires softirqs to be off */
        local_bh_disable();
        frame = ieee80211_tx_dequeue(hw, txq);
        local_bh_enable();

        return frame;
}

/**
 * ieee80211_next_txq - get next tx queue to pull packets from
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to return packets from.
 *
 * Returns the next txq if successful, %NULL if no queue is eligible. If a txq
 * is returned, it should be returned with ieee80211_return_txq() after the
 * driver has finished scheduling it.
 */
struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac);

/**
 * ieee80211_txq_schedule_start - start new scheduling round for TXQs
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to acquire locks for
 *
 * Should be called before ieee80211_next_txq() or ieee80211_return_txq().
 * The driver must not call multiple TXQ scheduling rounds concurrently.
 */
void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac);

/*
 * (deprecated)
 *
 * Intentionally empty: calling this has no effect and both arguments are
 * ignored.
 */
static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
{
}

void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
                              struct ieee80211_txq *txq, bool force);

/**
 * ieee80211_schedule_txq - schedule a TXQ for transmission
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 *
 * Schedules a TXQ for transmission if it is not already scheduled,
 * even if mac80211 does not have any packets buffered.
 *
 * The driver may call this function if it has buffered packets for
 * this TXQ internally.
 *
 * This is __ieee80211_schedule_txq() with the force argument set to true.
 */
static inline void
ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq)
{
        __ieee80211_schedule_txq(hw, txq, true);
}

/**
 * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq()
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 * @force: schedule txq even if mac80211 does not have any buffered packets.
 *
 * The driver may set force=true if it has buffered packets for this TXQ
 * internally.
 *
 * This is a thin wrapper that passes all three arguments straight through
 * to __ieee80211_schedule_txq().
 */
static inline void
ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
                     bool force)
{
        __ieee80211_schedule_txq(hw, txq, force);
}

/**
 * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit
 *
 * This function is used to check whether given txq is allowed to transmit by
 * the airtime scheduler, and can be used by drivers to access the airtime
 * fairness accounting without using the scheduling order enforced by
 * next_txq().
 *
 * Returns %true if the airtime scheduler thinks the TXQ should be allowed to
 * transmit, and %false if it should be throttled. This function can also have
 * the side effect of rotating the TXQ in the scheduler rotation, which will
 * eventually bring the deficit to positive and allow the station to transmit
 * again.
 *
 * The API ieee80211_txq_may_transmit() also ensures that TXQ list will be
 * aligned against driver's own round-robin scheduler list, i.e. it rotates
 * the TXQ list until the requested node becomes the first entry
 * in TXQ list. Thus both the TXQ list and driver's list are in sync. If this
 * function returns %true, the driver is expected to schedule packets
 * for transmission, and then return the TXQ through ieee80211_return_txq().
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 */
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
                                struct ieee80211_txq *txq);

/**
 * ieee80211_txq_get_depth - get pending frame/byte count of given txq
 *
 * The values are not guaranteed to be coherent with regard to each other, i.e.
 * txq state can change half-way of this function and the caller may end up
 * with "new" frame_cnt and "old" byte_cnt or vice-versa.
 *
 * @txq: pointer obtained from station or virtual interface
 * @frame_cnt: pointer to store frame count
 * @byte_cnt: pointer to store byte count
 */
void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
                             unsigned long *frame_cnt,
                             unsigned long *byte_cnt);

/**
 * ieee80211_nan_func_terminated - notify about NAN function termination.
 *
 * This function is used to notify mac80211 about NAN function termination.
 * Note that this function can't be called from hard irq.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @inst_id: the local instance id
 * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
 * @gfp: allocation flags
 */
void ieee80211_nan_func_terminated(struct ieee80211_vif *vif,
                                   u8 inst_id,
                                   enum nl80211_nan_func_term_reason reason,
                                   gfp_t gfp);

/**
 * ieee80211_nan_func_match - notify about NAN function match event.
 *
 * This function is used to notify mac80211 about NAN function match. The
 * cookie inside the match struct will be assigned by mac80211.
 * Note that this function can't be called from hard irq.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @match: match event information
 * @gfp: allocation flags
 */
void ieee80211_nan_func_match(struct ieee80211_vif *vif,
                              struct cfg80211_nan_match_params *match,
                              gfp_t gfp);

/**
 * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX.
 *
 * This function calculates the estimated airtime usage of a frame based on the
 * rate information in the RX status struct and the frame length.
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @status: &struct ieee80211_rx_status containing the transmission rate
 *          information.
 * @len: frame length in bytes
 */
u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
                              struct ieee80211_rx_status *status,
                              int len);

/**
 * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX.
 *
 * This function calculates the estimated airtime usage of a frame based on the
 * rate information in the TX info struct and the frame length.
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @info: &struct ieee80211_tx_info of the frame.
 * @len: frame length in bytes
 */
u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
                              struct ieee80211_tx_info *info,
                              int len);
/**
 * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading.
 *
 * This function is used to notify mac80211 that a vif can be passed raw 802.3
 * frames. The driver needs to then handle the 802.11 encapsulation inside the
 * hardware or firmware.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @enable: indicate if the feature should be turned on or off
 */
bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable);

/**
 * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template.
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * The driver is responsible for freeing the returned skb.
 *
 * Return: FILS discovery template. %NULL on error.
 */
struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
                                                  struct ieee80211_vif *vif);

/**
 * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast
 *        probe response template.
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * The driver is responsible for freeing the returned skb.
 *
 * Return: Unsolicited broadcast probe response template. %NULL on error.
 */
struct sk_buff *
ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif);
#endif /* MAC80211_H */


























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/include/linux/nmi.h
 */
#ifndef LINUX_NMI_H
#define LINUX_NMI_H

#include <linux/sched.h>
#include <asm/irq.h>
#if defined(CONFIG_HAVE_NMI_WATCHDOG)
#include <asm/nmi.h>
#endif

#ifdef CONFIG_LOCKUP_DETECTOR
void lockup_detector_init(void);
void lockup_detector_soft_poweroff(void);
void lockup_detector_cleanup(void);
bool is_hardlockup(void);

extern int watchdog_user_enabled;
extern int nmi_watchdog_user_enabled;
extern int soft_watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;

extern struct cpumask watchdog_cpumask;
extern unsigned long *watchdog_cpumask_bits;
#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif /* !CONFIG_SMP */

#else /* CONFIG_LOCKUP_DETECTOR */
/*
 * No-op stand-ins so callers need no #ifdef of their own. Only the three
 * lockup_detector_*() entry points are stubbed; is_hardlockup() and the
 * watchdog variables have no counterpart in this branch.
 */
static inline void lockup_detector_init(void) { }
static inline void lockup_detector_soft_poweroff(void) { }
static inline void lockup_detector_cleanup(void) { }
#endif /* !CONFIG_LOCKUP_DETECTOR */

#ifdef CONFIG_SOFTLOCKUP_DETECTOR
extern void touch_softlockup_watchdog_sched(void);
extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int  softlockup_panic;

extern int lockup_detector_online_cpu(unsigned int cpu);
extern int lockup_detector_offline_cpu(unsigned int cpu);
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
/* No-op stand-ins for the touch_*() helpers when the detector is not built */
static inline void touch_softlockup_watchdog_sched(void) { }
static inline void touch_softlockup_watchdog(void) { }
static inline void touch_softlockup_watchdog_sync(void) { }
static inline void touch_all_softlockup_watchdogs(void) { }

/*
 * The online/offline handlers become NULL rather than empty functions.
 * NOTE(review): presumably so they can be passed where a NULL callback means
 * "nothing to do" - confirm against the callers.
 */
#define lockup_detector_online_cpu        NULL
#define lockup_detector_offline_cpu        NULL
#endif /* CONFIG_SOFTLOCKUP_DETECTOR */

/* Real implementation with CONFIG_DETECT_HUNG_TASK, empty stub without it */
#ifdef CONFIG_DETECT_HUNG_TASK
void reset_hung_task_detector(void);
#else
static inline void reset_hung_task_detector(void) { }
#endif

/*
 * The run state of the lockup detectors is controlled by the content of the
 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
 *
 * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
 * 'soft_watchdog_user_enabled' are variables that are only used as an
 * 'interface' between the parameters in /proc/sys/kernel and the internal
 * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
 * handled differently because its value is not boolean, and the lockup
 * detectors are 'suspended' while 'watchdog_thresh' is equal to zero.
 */
#define NMI_WATCHDOG_ENABLED_BIT   0
#define SOFT_WATCHDOG_ENABLED_BIT  1
#define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)

/*
 * Without CONFIG_HARDLOCKUP_DETECTOR, hardlockup_detector_disable() is an
 * empty stub and hardlockup_panic is not declared at all.
 */
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
extern unsigned int hardlockup_panic;
#else
static inline void hardlockup_detector_disable(void) {}
#endif

#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
# define NMI_WATCHDOG_SYSCTL_PERM        0644
#else
# define NMI_WATCHDOG_SYSCTL_PERM        0444
#endif

/*
 * Perf-based hard lockup detector interface.
 *
 * With CONFIG_HARDLOCKUP_DETECTOR_PERF everything below is a real external
 * function. Without it, the stop/restart/disable/enable/cleanup calls turn
 * into no-ops, and the remaining two symbols depend on
 * CONFIG_HAVE_NMI_WATCHDOG:
 *  - not set: hardlockup_detector_perf_init() reports -ENODEV and
 *    arch_touch_nmi_watchdog() is an empty stub;
 *  - set: hardlockup_detector_perf_init() reports 0 and no
 *    arch_touch_nmi_watchdog() is defined here.
 * NOTE(review): in the latter case arch_touch_nmi_watchdog() presumably comes
 * from <asm/nmi.h>, which is included under the same option - confirm.
 */
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
extern void arch_touch_nmi_watchdog(void);
extern void hardlockup_detector_perf_stop(void);
extern void hardlockup_detector_perf_restart(void);
extern void hardlockup_detector_perf_disable(void);
extern void hardlockup_detector_perf_enable(void);
extern void hardlockup_detector_perf_cleanup(void);
extern int hardlockup_detector_perf_init(void);
#else
static inline void hardlockup_detector_perf_stop(void) { }
static inline void hardlockup_detector_perf_restart(void) { }
static inline void hardlockup_detector_perf_disable(void) { }
static inline void hardlockup_detector_perf_enable(void) { }
static inline void hardlockup_detector_perf_cleanup(void) { }
# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
static inline void arch_touch_nmi_watchdog(void) {}
# else
static inline int hardlockup_detector_perf_init(void) { return 0; }
# endif
#endif

void watchdog_nmi_stop(void);
void watchdog_nmi_start(void);
int watchdog_nmi_probe(void);
int watchdog_nmi_enable(unsigned int cpu);
void watchdog_nmi_disable(unsigned int cpu);

void lockup_detector_reconfigure(void);

/**
 * touch_nmi_watchdog - restart NMI watchdog timeout.
 *
 * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
 * may be used to reset the timeout - for code which intentionally
 * disables interrupts for a long time. This call is stateless.
 *
 * Besides the arch NMI watchdog this also touches the softlockup watchdog
 * via touch_softlockup_watchdog().
 */
static inline void touch_nmi_watchdog(void)
{
        arch_touch_nmi_watchdog();
        touch_softlockup_watchdog();
}

/*
 * Create trigger_all_cpu_backtrace() out of the arch-provided
 * base function. Return whether such support was available,
 * to allow calling code to fall back to some other mechanism:
 */
#ifdef arch_trigger_cpumask_backtrace
/*
 * Invoke the arch backtrace hook on cpu_online_mask with its second argument
 * false, then report true (support is available).
 * NOTE(review): the second argument looks like an "exclude self" flag, going
 * by nmi_trigger_cpumask_backtrace()'s signature - confirm.
 */
static inline bool trigger_all_cpu_backtrace(void)
{
        arch_trigger_cpumask_backtrace(cpu_online_mask, false);
        return true;
}

/*
 * Same as trigger_all_cpu_backtrace() except that the arch hook's second
 * argument is true; always reports true.
 * NOTE(review): presumably this makes the hook skip the calling CPU, as the
 * function name suggests - confirm.
 */
static inline bool trigger_allbutself_cpu_backtrace(void)
{
        arch_trigger_cpumask_backtrace(cpu_online_mask, true);
        return true;
}

/*
 * Invoke the arch backtrace hook on the caller-supplied @mask (second
 * argument false); always reports true.
 */
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
        arch_trigger_cpumask_backtrace(mask, false);
        return true;
}

/*
 * Invoke the arch backtrace hook on the mask produced by cpumask_of(@cpu)
 * (second argument false); always reports true.
 */
static inline bool trigger_single_cpu_backtrace(int cpu)
{
        arch_trigger_cpumask_backtrace(cpumask_of(cpu), false);
        return true;
}

/* generic implementation */
void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
                                   bool exclude_self,
                                   void (*raise)(cpumask_t *mask));
bool nmi_cpu_backtrace(struct pt_regs *regs);

#else
/*
 * Fallbacks used when arch_trigger_cpumask_backtrace is not defined: each
 * helper does nothing and returns false so the caller can fall back to some
 * other mechanism.
 */
static inline bool trigger_all_cpu_backtrace(void)
{
        return false;
}
static inline bool trigger_allbutself_cpu_backtrace(void)
{
        return false;
}
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
        return false;
}
static inline bool trigger_single_cpu_backtrace(int cpu)
{
        return false;
}
#endif

#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh);
#endif

/*
 * Only a real function when both CONFIG_HARDLOCKUP_CHECK_TIMESTAMP and
 * CONFIG_HARDLOCKUP_DETECTOR_PERF are set; otherwise an empty stub that
 * ignores @period.
 */
#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
    defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
void watchdog_update_hrtimer_threshold(u64 period);
#else
static inline void watchdog_update_hrtimer_threshold(u64 period) { }
#endif

struct ctl_table;
int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *);
int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *);
int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *);
int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *);

#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#include <asm/nmi.h>
#endif

#endif































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*
 * include/net/tipc.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2017 Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H

#include <linux/random.h>

#define KEEPALIVE_MSG_MASK 0x0e080000  /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */

/*
 * Four big-endian 32-bit header words. tipc_hdr_rps_key() tests w[0] against
 * KEEPALIVE_MSG_MASK and treats w[3] as the source node identity.
 */
struct tipc_basic_hdr {
        __be32 w[4];
};

/*
 * tipc_hdr_rps_key - derive an RPS key from a TIPC header
 *
 * For a message whose first word carries all KEEPALIVE_MSG_MASK bits, a
 * freshly generated random key is returned. Every other message is keyed by
 * header word 3, the source node identity.
 */
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
        __be32 rnd_key;

        /* Common case: return source node identity as key */
        if (likely((ntohl(hdr->w[0]) & KEEPALIVE_MSG_MASK) !=
                   KEEPALIVE_MSG_MASK))
                return hdr->w[3];

        /* Spread PROBE/PROBE_REPLY messages across the cores */
        get_random_bytes(&rnd_key, sizeof(rnd_key));

        return rnd_key;
}

#endif


































































































































































































    1 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H
#include <asm/types.h>
#include <linux/bits.h>

/*
 * aligned_byte_mask(n) - set bits in the first 'n' bytes when loaded from
 * memory.
 *
 * With __LITTLE_ENDIAN defined the mask covers the low-order 8*n bits;
 * otherwise it is built from the high-order end of the word.  'n' is not
 * range-checked by the macro.
 */
#ifdef __LITTLE_ENDIAN
#  define aligned_byte_mask(n) ((1UL << 8*(n))-1)
#else
#  define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
#endif

/* Width of @type in bits. */
#define BITS_PER_TYPE(type)        (sizeof(type) * BITS_PER_BYTE)
/*
 * Number of long / u64 / u32 / char sized units needed to hold @nr bits,
 * rounded up.
 */
#define BITS_TO_LONGS(nr)        DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define BITS_TO_U64(nr)                DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
#define BITS_TO_U32(nr)                DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
#define BITS_TO_BYTES(nr)        DIV_ROUND_UP(nr, BITS_PER_TYPE(char))

extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);

/*
 * Include this here because some architectures need generic_ffs/fls in
 * scope
 */
#include <asm/bitops.h>

/*
 * Loop headers for walking a bitmap.  @bit receives the current bit index,
 * @addr is the bitmap and @size its length in bits; the loop ends as soon
 * as the search result is no longer below @size.
 */

/* visit every set bit, starting the search with find_first_bit() */
#define for_each_set_bit(bit, addr, size) \
        for ((bit) = find_first_bit((addr), (size));                \
             (bit) < (size);                                        \
             (bit) = find_next_bit((addr), (size), (bit) + 1))

/* same as for_each_set_bit() but use bit as value to start with */
#define for_each_set_bit_from(bit, addr, size) \
        for ((bit) = find_next_bit((addr), (size), (bit));        \
             (bit) < (size);                                        \
             (bit) = find_next_bit((addr), (size), (bit) + 1))

/* visit every clear bit, starting the search with find_first_zero_bit() */
#define for_each_clear_bit(bit, addr, size) \
        for ((bit) = find_first_zero_bit((addr), (size));        \
             (bit) < (size);                                        \
             (bit) = find_next_zero_bit((addr), (size), (bit) + 1))

/* same as for_each_clear_bit() but use bit as value to start with */
#define for_each_clear_bit_from(bit, addr, size) \
        for ((bit) = find_next_zero_bit((addr), (size), (bit));        \
             (bit) < (size);                                        \
             (bit) = find_next_zero_bit((addr), (size), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Each step resumes the search 8 bits past the previous @start, and the loop
 * stops once @start is no longer below @size.
 */
#define for_each_set_clump8(start, clump, bits, size) \
        for ((start) = find_first_clump8(&(clump), (bits), (size)); \
             (start) < (size); \
             (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

/*
 * get_bitmask_order - return fls() of @count
 *
 * Note: we could be slightly more clever with -1 here...
 */
static inline int get_bitmask_order(unsigned int count)
{
        return fls(count);
}

/*
 * hweight_long - hweight of an unsigned long
 *
 * Dispatches to hweight32() when unsigned long is four bytes wide and to
 * hweight64() otherwise.
 */
static __always_inline unsigned long hweight_long(unsigned long w)
{
        if (sizeof(w) == 4)
                return hweight32(w);

        return hweight64((__u64)w);
}

/**
 * rol64 - rotate a 64-bit value left
 * @word: value to rotate
 * @shift: bits to roll; only the low six bits are used
 *
 * Both shift counts are masked to 0..63, so a @shift of zero (or any
 * multiple of 64) returns @word unchanged.
 */
static inline __u64 rol64(__u64 word, unsigned int shift)
{
        const unsigned int up = shift & 63;
        const unsigned int down = (-shift) & 63;

        return (word << up) | (word >> down);
}

/**
 * ror64 - rotate a 64-bit value right
 * @word: value to rotate
 * @shift: bits to roll; only the low six bits are used
 *
 * Both shift counts are masked to 0..63, so a @shift of zero (or any
 * multiple of 64) returns @word unchanged.
 */
static inline __u64 ror64(__u64 word, unsigned int shift)
{
        const unsigned int down = shift & 63;
        const unsigned int up = (-shift) & 63;

        return (word >> down) | (word << up);
}

/**
 * rol32 - rotate a 32-bit value left
 * @word: value to rotate
 * @shift: bits to roll; only the low five bits are used
 *
 * Both shift counts are masked to 0..31, so a @shift of zero (or any
 * multiple of 32) returns @word unchanged.
 */
static inline __u32 rol32(__u32 word, unsigned int shift)
{
        const unsigned int up = shift & 31;
        const unsigned int down = (-shift) & 31;

        return (word << up) | (word >> down);
}

/**
 * ror32 - rotate a 32-bit value right
 * @word: value to rotate
 * @shift: bits to roll; only the low five bits are used
 *
 * Both shift counts are masked to 0..31, so a @shift of zero (or any
 * multiple of 32) returns @word unchanged.
 */
static inline __u32 ror32(__u32 word, unsigned int shift)
{
        const unsigned int down = shift & 31;
        const unsigned int up = (-shift) & 31;

        return (word >> down) | (word << up);
}

/**
 * rol16 - rotate a 16-bit value left
 * @word: value to rotate
 * @shift: bits to roll; only the low four bits are used
 *
 * The combined value is narrowed back to 16 bits by the return type.
 */
static inline __u16 rol16(__u16 word, unsigned int shift)
{
        const unsigned int up = shift & 15;
        const unsigned int down = (-shift) & 15;

        return (word << up) | (word >> down);
}

/**
 * ror16 - rotate a 16-bit value right
 * @word: value to rotate
 * @shift: bits to roll; only the low four bits are used
 *
 * The combined value is narrowed back to 16 bits by the return type.
 */
static inline __u16 ror16(__u16 word, unsigned int shift)
{
        const unsigned int down = shift & 15;
        const unsigned int up = (-shift) & 15;

        return (word >> down) | (word << up);
}

/**
 * rol8 - rotate an 8-bit value left
 * @word: value to rotate
 * @shift: bits to roll; only the low three bits are used
 *
 * The combined value is narrowed back to 8 bits by the return type.
 */
static inline __u8 rol8(__u8 word, unsigned int shift)
{
        const unsigned int up = shift & 7;
        const unsigned int down = (-shift) & 7;

        return (word << up) | (word >> down);
}

/**
 * ror8 - rotate an 8-bit value right
 * @word: value to rotate
 * @shift: bits to roll; only the low three bits are used
 *
 * The combined value is narrowed back to 8 bits by the return type.
 */
static inline __u8 ror8(__u8 word, unsigned int shift)
{
        const unsigned int down = shift & 7;
        const unsigned int up = (-shift) & 7;

        return (word >> down) | (word << up);
}

/**
 * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<32) to sign bit
 *
 * The chosen bit is first moved up to bit 31, then the value is shifted
 * back down as a signed quantity.
 *
 * This is safe to use for 16- and 8-bit types as well.
 */
static __always_inline __s32 sign_extend32(__u32 value, int index)
{
        const __u8 pad = 31 - index;
        const __s32 aligned = (__s32)(value << pad);

        return aligned >> pad;
}

/**
 * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<64) to sign bit
 *
 * The chosen bit is first moved up to bit 63, then the value is shifted
 * back down as a signed quantity.
 */
static __always_inline __s64 sign_extend64(__u64 value, int index)
{
        const __u8 pad = 63 - index;
        const __s64 aligned = (__s64)(value << pad);

        return aligned >> pad;
}

/*
 * fls_long - fls() for an unsigned long
 *
 * Uses fls() when unsigned long is four bytes wide, fls64() otherwise.
 */
static inline unsigned fls_long(unsigned long l)
{
        return sizeof(l) == 4 ? fls(l) : fls64(l);
}

/*
 * get_count_order - fls() of (@count - 1)
 *
 * Returns -1 when @count is zero.
 */
static inline int get_count_order(unsigned int count)
{
        return count ? fls(count - 1) : -1;
}

/**
 * get_count_order_long - get order after rounding @l up to power of 2
 * @l: parameter
 *
 * Same as get_count_order() but with a long type parameter: returns -1
 * when @l is zero, otherwise fls_long(@l - 1).
 */
static inline int get_count_order_long(unsigned long l)
{
        return l ? (int)fls_long(l - 1) : -1;
}

/**
 * __ffs64 - find first set bit in a 64 bit word
 * @word: The 64 bit word
 *
 * On 64 bit arches this is a synonym for __ffs.  On 32 bit arches the low
 * half is examined first and the high half only if the low half is zero.
 * The result is not defined if no bits are set, so check that @word
 * is non-zero before calling this.
 */
static inline unsigned long __ffs64(u64 word)
{
#if BITS_PER_LONG == 32
        const u32 low_half = (u32)word;

        /* Nothing set below bit 32: answer comes from the upper half. */
        if (low_half == 0UL)
                return 32 + __ffs((u32)(word >> 32));
#elif BITS_PER_LONG != 64
#error BITS_PER_LONG not 32 or 64
#endif
        return __ffs((unsigned long)word);
}

/**
 * assign_bit - Assign value to a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 * @value: the value to assign
 *
 * Calls set_bit() when @value is true and clear_bit() when it is false.
 */
static __always_inline void assign_bit(long nr, volatile unsigned long *addr,
                                       bool value)
{
        if (!value) {
                clear_bit(nr, addr);
                return;
        }

        set_bit(nr, addr);
}

/*
 * __assign_bit - like assign_bit(), but built on __set_bit()/__clear_bit()
 * @nr: the bit to set
 * @addr: the address to start counting from
 * @value: the value to assign
 */
static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
                                         bool value)
{
        if (!value) {
                __clear_bit(nr, addr);
                return;
        }

        __set_bit(nr, addr);
}

#ifdef __KERNEL__

/*
 * set_mask_bits - replace the @mask bits of *@ptr with @bits
 *
 * Re-reads *@ptr and retries the cmpxchg() until the update lands on an
 * unchanged value.  The new value is (old & ~mask) | bits, i.e. @bits is
 * OR-ed in as given and is not itself limited by @mask.  @mask and @bits
 * are evaluated once each.
 *
 * Evaluates to the value *@ptr held before the successful update.
 * Only defined here if no earlier definition exists.
 */
#ifndef set_mask_bits
#define set_mask_bits(ptr, mask, bits)        \
({                                                                \
        const typeof(*(ptr)) mask__ = (mask), bits__ = (bits);        \
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        do {                                                        \
                old__ = READ_ONCE(*(ptr));                        \
                new__ = (old__ & ~mask__) | bits__;                \
        } while (cmpxchg(ptr, old__, new__) != old__);                \
                                                                \
        old__;                                                        \
})
#endif

/*
 * bit_clear_unless - clear the @clear bits of *@ptr unless a @test bit is set
 *
 * Each pass reads *@ptr; if any @test bit is set in that value the loop
 * stops without writing.  Otherwise cmpxchg() tries to store the value with
 * the @clear bits removed, and the pass is repeated if *@ptr changed in the
 * meantime.  @clear and @test are evaluated once each.
 *
 * Evaluates to true when no @test bit was set in the last value read (the
 * clear was carried out), and to false when a @test bit prevented it.
 * Only defined here if no earlier definition exists.
 */
#ifndef bit_clear_unless
#define bit_clear_unless(ptr, clear, test)        \
({                                                                \
        const typeof(*(ptr)) clear__ = (clear), test__ = (test);\
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        do {                                                        \
                old__ = READ_ONCE(*(ptr));                        \
                new__ = old__ & ~clear__;                        \
        } while (!(old__ & test__) &&                                \
                 cmpxchg(ptr, old__, new__) != old__);                \
                                                                \
        !(old__ & test__);                                        \
})
#endif

#ifndef find_last_bit
/**
 * find_last_bit - find the last set bit in a memory region
 * @addr: The address to start the search at
 * @size: The number of bits to search
 *
 * Returns the bit number of the last set bit, or size.
 */
extern unsigned long find_last_bit(const unsigned long *addr,
                                   unsigned long size);
#endif

#endif /* __KERNEL__ */
#endif

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *        Definitions for the UDP-Lite (RFC 3828) code.
 */
#ifndef _UDPLITE_H
#define _UDPLITE_H

#include <net/ip6_checksum.h>

/* UDP-Lite socket options */
#define UDPLITE_SEND_CSCOV   10 /* sender partial coverage (as sent)      */
#define UDPLITE_RECV_CSCOV   11 /* receiver partial coverage (threshold ) */

extern struct proto                 udplite_prot;
extern struct udp_table                udplite_table;

/*
 *        Checksum computation is all in software, hence simpler getfrag:
 *        just copy @len bytes from the message iterator into @to.
 *        @from is the struct msghdr being sent; @offset, @odd and @skb
 *        are not used.  Returns 0 on success or -EFAULT if the full copy
 *        could not be made.
 */
static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
                                      int len, int odd, struct sk_buff *skb)
{
        struct msghdr *msg = from;

        if (!copy_from_iter_full(to, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/*
 *         Checksumming routines
 */

/*
 * udplite_checksum_init - validate the checksum coverage of a received packet
 *
 * Reads the coverage length from the header length field and records
 * partial coverage in the skb control block when it is shorter than the
 * packet.  Returns 0 if the packet may be processed further, 1 if it must
 * be dropped.
 */
static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
{
        u16 cscov;

        /* In UDPv4 a zero checksum means that the transmitter generated no
         * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets
         * with a zero checksum field are illegal.                            */
        if (uh->check == 0) {
                net_dbg_ratelimited("UDPLite: zeroed checksum field\n");
                return 1;
        }

        cscov = ntohs(uh->len);

        /* Zero indicates that full coverage is required: nothing to set up. */
        if (cscov == 0)
                return 0;

        /* Coverage length violates RFC 3828: log and discard silently. */
        if (cscov < 8 || cscov > skb->len) {
                net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n",
                                    cscov, skb->len);
                return 1;
        }

        /* Only part of the packet is covered: remember how much. */
        if (cscov < skb->len) {
                UDP_SKB_CB(skb)->partial_cov = 1;
                UDP_SKB_CB(skb)->cscov = cscov;
                if (skb->ip_summed == CHECKSUM_COMPLETE)
                        skb->ip_summed = CHECKSUM_NONE;
                skb->csum_valid = 0;
        }

        return 0;
}

/*
 * Slow-path computation of checksum. Socket is locked.
 *
 * Walks the socket write queue and accumulates the checksum over at most
 * the coverage length, stopping once the coverage has been used up.  The
 * header length field is rewritten when a shorter coverage was requested.
 */
static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb)
{
        const struct udp_sock *up = udp_sk(skb->sk);
        __wsum csum = 0;
        int cscov = up->len;

        /*
         * Sender has set `partial coverage' option on UDP-Lite socket.
         * The special case "up->pcslen == 0" signifies full coverage.
         *
         * NOTE: Causes for the error case  `up->pcslen > up->len':
         *        (i)  Application error (will not be penalized).
         *       (ii)  Payload too big for send buffer: data is split
         *             into several packets, each with its own header.
         *             In this case (e.g. last segment), coverage may
         *             exceed packet length.
         *       Since packets with coverage length > packet length are
         *       illegal, we fall back to the defaults here.
         */
        if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < up->len) {
                if (up->pcslen > 0)
                        cscov = up->pcslen;
                udp_hdr(skb)->len = htons(up->pcslen);
        }

        skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */

        skb_queue_walk(&sk->sk_write_queue, skb) {
                const int off = skb_transport_offset(skb);
                const int len = skb->len - off;
                int chunk = cscov;

                /* Never checksum past the end of this buffer. */
                if (chunk > len)
                        chunk = len;

                csum = skb_checksum(skb, off, chunk, csum);

                cscov -= len;
                if (cscov <= 0)
                        break;
        }
        return csum;
}

/*
 * Fast-path computation of checksum. Socket may not be locked.
 *
 * Checksums a single skb from its transport offset.  When the sender asked
 * for partial coverage shorter than the payload, the header length field is
 * set to that coverage, and a non-zero coverage also limits how many bytes
 * are summed.
 */
static inline __wsum udplite_csum(struct sk_buff *skb)
{
        const struct udp_sock *up = udp_sk(skb->sk);
        const int off = skb_transport_offset(skb);
        int len = skb->len - off;

        if (up->pcflag & UDPLITE_SEND_CC) {
                if (up->pcslen < len) {
                        /* pcslen == 0 keeps the full length. */
                        if (up->pcslen > 0)
                                len = up->pcslen;
                        udp_hdr(skb)->len = htons(up->pcslen);
                }
        }

        skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */

        return skb_checksum(skb, off, len, 0);
}

void udplite4_register(void);
int udplite_get_port(struct sock *sk, unsigned short snum,
                     int (*scmp)(const struct sock *, const struct sock *));
#endif        /* _UDPLITE_H */






























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Callbacks through which the rbtree code keeps user-maintained augmented
 * data up to date.  In this header, ->rotate is handed to the insert/erase
 * rebalancing routines, while ->copy and ->propagate are invoked directly
 * by __rb_erase_augmented().
 */
struct rb_augment_callbacks {
        /* called with a starting node and the node at which to stop */
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        /* called as copy(erased node, successor taking its place) */
        void (*copy)(struct rb_node *old, struct rb_node *new);
        /* passed to __rb_insert_augmented() and __rb_erase_color() */
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Fixup the rbtree and update the augmented information when rebalancing.
 *
 * On insertion, the user must update the augmented information on the path
 * leading to the inserted node, then call rb_link_node() as usual and
 * rb_insert_augmented() instead of the usual rb_insert_color() call.
 * If rb_insert_augmented() rebalances the rbtree, it will callback into
 * a user provided function to update the augmented information on the
 * affected subtrees.
 *
 * Only the ->rotate callback of @augment is used here.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        void (*on_rotate)(struct rb_node *old, struct rb_node *new);

        on_rotate = augment->rotate;
        __rb_insert_augmented(node, root, on_rotate);
}

/*
 * rb_insert_augmented() for a cached root.  When @newleft is true, @node is
 * recorded as the cached leftmost node before the insertion is completed.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft)
                root->rb_leftmost = node;

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * Expands to three functions plus the RBNAME structure that points at them:
 *
 * RBNAME_propagate: walks from @rb towards the root via rb_parent(), calling
 *                   RBCOMPUTE(node, true) on each node; stops at @stop or as
 *                   soon as RBCOMPUTE returns true.
 * RBNAME_copy:      copies RBAUGMENTED from the old node to the new one.
 * RBNAME_rotate:    copies RBAUGMENTED from the old node to the new one, then
 *                   calls RBCOMPUTE(old, false) on the old node.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * The generated RBNAME_compute_max(node, exit) takes the largest of
 * RBCOMPUTE(node) and the RBAUGMENTED values of the left and right children
 * (when present).  If @exit is true and the node already stores that value
 * it returns true without writing; otherwise it stores the value and
 * returns false.  This helper is then used as the RBCOMPUTE argument of
 * RB_DECLARE_CALLBACKS().
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/* Node colours, kept in bit 0 of the __rb_parent_color word. */
#define RB_RED          0
#define RB_BLACK        1

/*
 * Helpers operating on a raw __rb_parent_color word ('pc').  The parent
 * pointer is the word with its two low bits masked off; the colour is bit 0.
 * The argument is parenthesised in every expansion so that compound
 * expressions (e.g. "a | b") are handled as a single value.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))

/* The same queries, taking a pointer to a node instead of a raw word. */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/* Point @rb at parent @p while keeping the colour @rb currently has. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        const unsigned long color = rb_color(rb);

        rb->__rb_parent_color = color | (unsigned long)p;
}

/* Store parent @p and @color together in @rb's __rb_parent_color word. */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long packed = (unsigned long)p;

        packed |= color;
        rb->__rb_parent_color = packed;
}

/*
 * Make @new occupy the slot currently held by @old: the matching child link
 * of @parent, or the root pointer of @root when @parent is NULL.  The link
 * is written with WRITE_ONCE().
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                WRITE_ONCE(parent->rb_left, new);
        else
                WRITE_ONCE(parent->rb_right, new);
}

/*
 * Same as __rb_change_child(), except that the link is published with
 * rcu_assign_pointer().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                rcu_assign_pointer(parent->rb_left, new);
        else
                rcu_assign_pointer(parent->rb_right, new);
}

extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * __rb_erase_augmented - unlink @node from the tree rooted at @root
 *
 * Removes @node, invoking the ->copy and ->propagate callbacks of @augment
 * so that the augmented data follows the structural change.  No rotations
 * are performed here.
 *
 * Returns the node that the caller has to hand to __rb_erase_color() for
 * rebalancing, or NULL when the colours could be fixed up locally.
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        /* child inherits node's parent and colour */
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                /* augmented data is refreshed from the parent upwards */
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        /* detach successor; its right child takes its slot */
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        augment->propagate(parent, successor);
                }

                /* successor adopts node's left subtree */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* and is linked in where node used to be */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        /* successor's own colour is read before it is overwritten */
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                tmp = successor;
        }

        /* refresh augmented data from tmp all the way up (stop == NULL) */
        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * Erase @node from @root, then rebalance through __rb_erase_color() with
 * the ->rotate callback if __rb_erase_augmented() reports that it is needed.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *fixup_from;

        fixup_from = __rb_erase_augmented(node, root, augment);
        if (!fixup_from)
                return;

        __rb_erase_color(fixup_from, root, augment->rotate);
}

/*
 * rb_erase_augmented() for a cached root.  If @node is the cached leftmost
 * node, the cache is first moved on to rb_next(@node).
 */
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (node == root->rb_leftmost)
                root->rb_leftmost = rb_next(node);

        rb_erase_augmented(node, plain_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */













































    1 




    1 














    3 




    3 





























































    4 
    4 
    4 
    4 

















































    4 

    4 

    4 

    4 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security context is a set of security attributes
 * associated with each subject and object controlled
 * by the security policy.  Security contexts are
 * externally represented as variable-length strings
 * that can be interpreted by a user or application
 * with an understanding of the security policy.
 * Internally, the security server uses a simple
 * structure.  This structure is private to the
 * security server and can be changed without affecting
 * clients of the security server.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SS_CONTEXT_H_
#define _SS_CONTEXT_H_

#include "ebitmap.h"
#include "mls_types.h"
#include "security.h"

/*
 * A security context consists of an authenticated user
 * identity, a role, a type and a MLS range.
 *
 * When the context cannot be mapped, its string form is kept in
 * @str, with @len giving the length of that string.
 */
struct context {
        u32 user;
        u32 role;
        u32 type;
        u32 len;        /* length of string in bytes */
        struct mls_range range;
        char *str;        /* string representation if context cannot be mapped. */
};

static inline void mls_context_init(struct context *c)
{
        memset(&c->range, 0, sizeof(c->range));
}

static inline int mls_context_cpy(struct context *dst, struct context *src)
{
        int rc;

        dst->range.level[0].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (rc)
                goto out;

        dst->range.level[1].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);
out:
        return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the low level of 'src'.
 */
static inline int mls_context_cpy_low(struct context *dst, struct context *src)
{
        int rc;

        dst->range.level[0].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (rc)
                goto out;

        dst->range.level[1].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);
out:
        return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the high level of 'src'.
 */
static inline int mls_context_cpy_high(struct context *dst, struct context *src)
{
        int rc;

        dst->range.level[0].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat);
        if (rc)
                goto out;

        dst->range.level[1].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);
out:
        return rc;
}


/*
 * Store in the MLS range of @dst the overlap of the ranges of @c1 and
 * @c2: the low sensitivity is the greater of the two lows, the high
 * sensitivity is the lesser of the two highs, and each level's
 * categories are computed with ebitmap_and() from the corresponding
 * levels of @c1 and @c2.
 *
 * Returns -EINVAL when the two sensitivity ranges do not overlap (in
 * which case @dst is not written), otherwise 0 or the non-zero value
 * returned by ebitmap_and().
 */
static inline int mls_context_glblub(struct context *dst,
                                     struct context *c1, struct context *c2)
{
        struct mls_range *out = &dst->range;
        struct mls_range *a = &c1->range;
        struct mls_range *b = &c2->range;
        int err;

        /* These ranges have no common sensitivities */
        if (a->level[1].sens < b->level[0].sens ||
            b->level[1].sens < a->level[0].sens)
                return -EINVAL;

        /* Low bound: the greater of the two lows. */
        out->level[0].sens = max(a->level[0].sens, b->level[0].sens);

        /* High bound: the lesser of the two highs. */
        out->level[1].sens = min(a->level[1].sens, b->level[1].sens);

        err = ebitmap_and(&out->level[0].cat,
                          &a->level[0].cat, &b->level[0].cat);
        if (err)
                return err;

        return ebitmap_and(&out->level[1].cat,
                           &a->level[1].cat, &b->level[1].cat);
}

static inline int mls_context_cmp(struct context *c1, struct context *c2)
{
        return ((c1->range.level[0].sens == c2->range.level[0].sens) &&
                ebitmap_cmp(&c1->range.level[0].cat, &c2->range.level[0].cat) &&
                (c1->range.level[1].sens == c2->range.level[1].sens) &&
                ebitmap_cmp(&c1->range.level[1].cat, &c2->range.level[1].cat));
}

static inline void mls_context_destroy(struct context *c)
{
        ebitmap_destroy(&c->range.level[0].cat);
        ebitmap_destroy(&c->range.level[1].cat);
        mls_context_init(c);
}

/* Reset every byte of @c to zero. */
static inline void context_init(struct context *c)
{
        memset(c, 0, sizeof(struct context));
}

/*
 * Duplicate @src into @dst: the user/role/type identifiers, the optional
 * string representation and the MLS range.
 *
 * The string is duplicated with kstrdup(GFP_ATOMIC).  Returns 0 on
 * success, -ENOMEM if the string cannot be duplicated, or the non-zero
 * value returned by mls_context_cpy().
 *
 * When the MLS copy fails, the duplicated string is freed and
 * @dst->str / @dst->len are reset, so that a later context_destroy()
 * on @dst does not free the same string a second time.
 */
static inline int context_cpy(struct context *dst, struct context *src)
{
        int rc;

        dst->user = src->user;
        dst->role = src->role;
        dst->type = src->type;
        if (src->str) {
                dst->str = kstrdup(src->str, GFP_ATOMIC);
                if (!dst->str)
                        return -ENOMEM;
                dst->len = src->len;
        } else {
                dst->str = NULL;
                dst->len = 0;
        }
        rc = mls_context_cpy(dst, src);
        if (rc) {
                kfree(dst->str);
                /* Do not hand a dangling pointer back to the caller. */
                dst->str = NULL;
                dst->len = 0;
                return rc;
        }
        return 0;
}

/*
 * Release what @c owns and return it to an empty state: the string is
 * freed and cleared, user/role/type are zeroed, and the MLS range is
 * torn down with mls_context_destroy().
 */
static inline void context_destroy(struct context *c)
{
        kfree(c->str);
        c->str = NULL;
        c->len = 0;

        c->user = 0;
        c->role = 0;
        c->type = 0;

        mls_context_destroy(c);
}

/*
 * Compare two contexts; returns 1 if they are equal, 0 otherwise.
 *
 * If both contexts have a non-zero len, only len and str are compared.
 * If exactly one has a non-zero len, they are never equal.  Otherwise
 * user, role, type and the MLS range must all match.
 */
static inline int context_cmp(struct context *c1, struct context *c2)
{
        if (c1->len || c2->len) {
                /* Only one side carries a string: cannot match. */
                if (!c1->len || !c2->len)
                        return 0;
                if (c1->len != c2->len)
                        return 0;
                return !strcmp(c1->str, c2->str);
        }

        if (c1->user != c2->user)
                return 0;
        if (c1->role != c2->role)
                return 0;
        if (c1->type != c2->type)
                return 0;

        return !!mls_context_cmp(c1, c2);
}

u32 context_compute_hash(const struct context *c);

#endif        /* _SS_CONTEXT_H_ */




























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMALLOC_H
#define _LINUX_VMALLOC_H

#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <asm/page.h>                /* pgprot_t */
#include <linux/rbtree.h>
#include <linux/overflow.h>

#include <asm/vmalloc.h>

struct vm_area_struct;                /* vma defining user mapping in mm_types.h */
struct notifier_block;                /* in notifier.h */

/* bits in flags of vmalloc's vm_struct below */
#define VM_IOREMAP                0x00000001        /* ioremap() and friends */
#define VM_ALLOC                0x00000002        /* vmalloc() */
#define VM_MAP                        0x00000004        /* vmap()ed pages */
#define VM_USERMAP                0x00000008        /* suitable for remap_vmalloc_range */
#define VM_DMA_COHERENT                0x00000010        /* dma_alloc_coherent */
#define VM_UNINITIALIZED        0x00000020        /* vm_struct is not fully initialized */
#define VM_NO_GUARD                0x00000040      /* don't add guard page */
#define VM_KASAN                0x00000080      /* has allocated kasan shadow memory */
#define VM_FLUSH_RESET_PERMS        0x00000100        /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
#define VM_MAP_PUT_PAGES        0x00000200        /* put pages and free array in vfree */

/*
 * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
 *
 * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
 * shadow memory has been mapped. It's used to handle allocation errors so that
 * we don't try to poison shadow on free if it was never allocated.
 *
 * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
 * determine which allocations need the module shadow freed.
 */

/* bits [20..32] reserved for arch specific ioremap internals */

/*
 * Maximum alignment for ioremap() regions.
 * Can be overridden by an arch-specific value.
 */
#ifndef IOREMAP_MAX_ORDER
#define IOREMAP_MAX_ORDER        (7 + PAGE_SHIFT)        /* 128 pages */
#endif

/*
 * vmalloc's vm_struct.  'flags' holds the VM_* bits defined above.
 * 'size' includes PAGE_SIZE bytes of guard page unless VM_NO_GUARD is
 * set in 'flags'; use get_vm_area_size() for the size without it.
 * NOTE(review): 'pages'/'nr_pages' presumably describe the backing page
 * array and its length, and 'caller' the allocation call site - confirm
 * against mm/vmalloc.c.
 */
struct vm_struct {
        struct vm_struct        *next;
        void                        *addr;
        unsigned long                size;
        unsigned long                flags;
        struct page                **pages;
        unsigned int                nr_pages;
        phys_addr_t                phys_addr;
        const void                *caller;
};

struct vmap_area {
        unsigned long va_start;
        unsigned long va_end;

        struct rb_node rb_node;         /* address sorted rbtree */
        struct list_head list;          /* address sorted list */

        /*
         * The following three variables can be packed, because
         * a vmap_area object is always one of the three states:
         *    1) in "free" tree (root is free_vmap_area_root)
         *    2) in "busy" tree (root is vmap_area_root)
         *    3) in purge list  (head is vmap_purge_list)
         *
         * NOTE(review): the root/head names live in mm/vmalloc.c, which
         * is not visible from this header - confirm them there.
         */
        union {
                unsigned long subtree_max_size; /* in "free" tree */
                struct vm_struct *vm;           /* in "busy" tree */
                struct llist_node purge_list;   /* in purge list */
        };
};

/*
 *        Highlevel APIs for driver use
 */
extern void vm_unmap_ram(const void *mem, unsigned int count);
extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);

#ifdef CONFIG_MMU
extern void __init vmalloc_init(void);
extern unsigned long vmalloc_nr_pages(void);
#else
/* !CONFIG_MMU stub: nothing to initialise. */
static inline void vmalloc_init(void) { }

/* !CONFIG_MMU stub: always reports zero pages. */
static inline unsigned long vmalloc_nr_pages(void)
{
        return 0UL;
}
#endif

extern void *vmalloc(unsigned long size);
extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller);
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
                int node, const void *caller);

extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags);
extern void *vmalloc_array(size_t n, size_t size);
extern void *__vcalloc(size_t n, size_t size, gfp_t flags);
extern void *vcalloc(size_t n, size_t size);

extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);

extern void *vmap(struct page **pages, unsigned int count,
                        unsigned long flags, pgprot_t prot);
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
extern void vunmap(const void *addr);

extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
                                       unsigned long uaddr, void *kaddr,
                                       unsigned long pgoff, unsigned long size);

extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                        unsigned long pgoff);

/*
 * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
 * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
 * needs to be called.
 */
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
#define ARCH_PAGE_TABLE_SYNC_MASK 0
#endif

/*
 * There is no default implementation for arch_sync_kernel_mappings(). It is
 * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
 * is 0.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

/*
 *        Lowlevel-APIs (not for driver use!)
 */

/*
 * Return the size of @area without its guard page: area->size minus
 * PAGE_SIZE, or area->size unchanged when VM_NO_GUARD is set in
 * area->flags.
 */
static inline size_t get_vm_area_size(const struct vm_struct *area)
{
        unsigned long usable = area->size;

        /* Without VM_NO_GUARD, size includes one guard page. */
        if (!(area->flags & VM_NO_GUARD))
                usable -= PAGE_SIZE;

        return usable;
}

extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
extern struct vm_struct *get_vm_area_caller(unsigned long size,
                                        unsigned long flags, const void *caller);
extern struct vm_struct *__get_vm_area_caller(unsigned long size,
                                        unsigned long flags,
                                        unsigned long start, unsigned long end,
                                        const void *caller);
void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);

#ifdef CONFIG_MMU
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
                                    pgprot_t prot, struct page **pages);
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
                struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
/*
 * Set VM_FLUSH_RESET_PERMS on the vm_struct that find_vm_area() returns
 * for @addr.  Does nothing when no such area is found.
 */
static inline void set_vm_flush_reset_perms(void *addr)
{
        struct vm_struct *area = find_vm_area(addr);

        if (!area)
                return;

        area->flags |= VM_FLUSH_RESET_PERMS;
}
#else
/*
 * !CONFIG_MMU stub: maps nothing and simply returns @size expressed in
 * pages (size >> PAGE_SHIFT).  @start, @prot and @pages are ignored.
 */
static inline int map_kernel_range_noflush(unsigned long start,
                                           unsigned long size,
                                           pgprot_t prot, struct page **pages)
{
        int nr_pages = size >> PAGE_SHIFT;

        return nr_pages;
}
#define map_kernel_range map_kernel_range_noflush
/* !CONFIG_MMU stub: nothing to unmap, both arguments are ignored. */
static inline void unmap_kernel_range_noflush(unsigned long addr,
                                              unsigned long size) { }
#define unmap_kernel_range unmap_kernel_range_noflush
/* !CONFIG_MMU stub: @addr is ignored and no flag is set. */
static inline void set_vm_flush_reset_perms(void *addr) { }
#endif

/* for /dev/kmem */
extern long vread(char *buf, char *addr, unsigned long count);
extern long vwrite(char *buf, char *addr, unsigned long count);

/*
 *        Internals.  Don't use.
 */
extern struct list_head vmap_area_list;
extern __init void vm_area_add_early(struct vm_struct *vm);
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);

#ifdef CONFIG_SMP
# ifdef CONFIG_MMU
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align);

void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
# else
/*
 * CONFIG_SMP without CONFIG_MMU: no vm areas can be provided, so this
 * stub ignores its arguments and always returns NULL.
 */
static inline struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                                   const size_t *sizes,
                                                   int nr_vms, size_t align)
{
        return NULL;
}

/* CONFIG_SMP without CONFIG_MMU: nothing to free, arguments are ignored. */
static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { }
# endif
#endif

#ifdef CONFIG_MMU
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
#else
#define VMALLOC_TOTAL 0UL
#endif

int register_vmap_purge_notifier(struct notifier_block *nb);
int unregister_vmap_purge_notifier(struct notifier_block *nb);

#endif /* _LINUX_VMALLOC_H */





















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _INET_ECN_H_
#define _INET_ECN_H_

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>

#include <net/inet_sock.h>
#include <net/dsfield.h>

/*
 * Values of the two-bit ECN field (the low two bits of the DS field),
 * plus the mask that extracts it.
 */
enum {
        INET_ECN_NOT_ECT = 0x0,        /* Not-ECT */
        INET_ECN_ECT_1   = 0x1,        /* ECT(1) */
        INET_ECN_ECT_0   = 0x2,        /* ECT(0) */
        INET_ECN_CE      = 0x3,        /* CE */
        INET_ECN_MASK    = 0x3,        /* both ECN bits */
};

extern int sysctl_tunnel_ecn_log;

/* Returns 1 if the ECN bits of @dsfield are CE, 0 otherwise. */
static inline int INET_ECN_is_ce(__u8 dsfield)
{
        __u8 ecn = dsfield & INET_ECN_MASK;

        return ecn == INET_ECN_CE;
}

/* Returns 1 if the ECN bits of @dsfield are Not-ECT, 0 otherwise. */
static inline int INET_ECN_is_not_ect(__u8 dsfield)
{
        __u8 ecn = dsfield & INET_ECN_MASK;

        return ecn == INET_ECN_NOT_ECT;
}

/*
 * Tests the INET_ECN_ECT_0 bit of @dsfield.  The result is non-zero
 * (INET_ECN_ECT_0, not 1) for ECT(0) and CE, and zero for Not-ECT and
 * ECT(1).
 */
static inline int INET_ECN_is_capable(__u8 dsfield)
{
        int ect0_bit = dsfield & INET_ECN_ECT_0;

        return ect0_bit;
}

/*
 * RFC 3168 9.1.1
 *  The full-functionality option for ECN encapsulation is to copy the
 *  ECN codepoint of the inside header to the outside header on
 *  encapsulation if the inside header is not-ECT or ECT, and to set the
 *  ECN codepoint of the outside header to ECT(0) if the ECN codepoint of
 *  the inside header is CE.
 *
 * Returns @outer with its ECN bits replaced accordingly; the remaining
 * bits of @outer are preserved.
 */
static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
{
        __u8 ecn;

        if (INET_ECN_is_ce(inner))
                ecn = INET_ECN_ECT_0;
        else
                ecn = inner & INET_ECN_MASK;

        return (outer & ~INET_ECN_MASK) | ecn;
}

/*
 * Set the INET_ECN_ECT_0 bit in the socket's IPv4 tos and, when
 * inet6_sk() returns non-NULL for @sk, in its IPv6 tclass as well.
 */
static inline void INET_ECN_xmit(struct sock *sk)
{
        inet_sk(sk)->tos |= INET_ECN_ECT_0;

        if (inet6_sk(sk) == NULL)
                return;

        inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
}

/*
 * Clear both ECN bits in the socket's IPv4 tos and, when inet6_sk()
 * returns non-NULL for @sk, in its IPv6 tclass as well.
 */
static inline void INET_ECN_dontxmit(struct sock *sk)
{
        inet_sk(sk)->tos &= ~INET_ECN_MASK;

        if (inet6_sk(sk) == NULL)
                return;

        inet6_sk(sk)->tclass &= ~INET_ECN_MASK;
}

/*
 * Clear the two ECN bits (INET_ECN_MASK << 20, converted with htonl())
 * in @label, which is modified in place.
 * NOTE(review): @label is presumably the first 32-bit word of the IPv6
 * header in network byte order, as in IP6_ECN_set_ce() - confirm at the
 * call sites.
 */
#define IP6_ECN_flow_init(label) do {                \
      (label) &= ~htonl(INET_ECN_MASK << 20);        \
    } while (0)

/*
 * If INET_ECN_is_capable() accepts the socket's IPv6 tclass, set the
 * INET_ECN_ECT_0 bit (shifted left by 20, converted with htonl()) in
 * @label, which is modified in place.
 * Unlike INET_ECN_xmit(), inet6_sk(sk) is dereferenced without a NULL
 * check here.
 */
#define        IP6_ECN_flow_xmit(sk, label) do {                                \
        if (INET_ECN_is_capable(inet6_sk(sk)->tclass))                        \
                (label) |= htonl(INET_ECN_ECT_0 << 20);                        \
    } while (0)

/*
 * Mark the IPv4 header @iph as CE and adjust its header checksum.
 *
 * Returns 0 if the header is Not-ECT (nothing is changed), 1 if it was
 * already CE (nothing is changed) or if it was ECT(0)/ECT(1) and has
 * now been rewritten to CE with iph->check updated to match.
 */
static inline int IP_ECN_set_ce(struct iphdr *iph)
{
        u32 check = (__force u32)iph->check;
        u32 ecn = (iph->tos + 1) & INET_ECN_MASK;

        /*
         * After the last operation we have (in binary):
         * INET_ECN_NOT_ECT => 01
         * INET_ECN_ECT_1   => 10
         * INET_ECN_ECT_0   => 11
         * INET_ECN_CE      => 00
         */
        /* Bit 1 clear means Not-ECT (ecn == 1 -> 0) or CE (ecn == 0 -> 1). */
        if (!(ecn & 2))
                return !ecn;

        /*
         * The following gives us:
         * INET_ECN_ECT_1 => check += htons(0xFFFD)
         * INET_ECN_ECT_0 => check += htons(0xFFFE)
         */
        check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn);

        /* Fold the carry out of the low 16 bits back into the checksum. */
        iph->check = (__force __sum16)(check + (check>=0xFFFF));
        iph->tos |= INET_ECN_CE;
        return 1;
}

static inline int IP_ECN_set_ect1(struct iphdr *iph)
{
        u32 check = (__force u32)iph->check;

        if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0)
                return 0;

        check += (__force u16)htons(0x1);

        iph->check = (__force __sum16)(check + (check>=0xFFFF));
        iph->tos ^= INET_ECN_MASK;
        return 1;
}

/*
 * Clear both ECN bits in the tos of @iph.  iph->check is not updated
 * here.
 */
static inline void IP_ECN_clear(struct iphdr *iph)
{
        iph->tos = iph->tos & ~INET_ECN_MASK;
}

/*
 * Pass @dscp, with its two ECN bits cleared, to ipv4_change_dsfield()
 * for @inner using INET_ECN_MASK as the mask argument.
 */
static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner)
{
        ipv4_change_dsfield(inner, INET_ECN_MASK, dscp & ~INET_ECN_MASK);
}

struct ipv6hdr;

/* Note:
 * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE,
 * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE
 * In IPv6 case, no checksum compensates the change in IPv6 header,
 * so we have to update skb->csum.
 */
static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph)
{
        __be32 from, to;

        if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph)))
                return 0;

        from = *(__be32 *)iph;
        to = from | htonl(INET_ECN_CE << 20);
        *(__be32 *)iph = to;
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
                                     (__force __wsum)to);
        return 1;
}

static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph)
{
        __be32 from, to;

        if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0)
                return 0;

        from = *(__be32 *)iph;
        to = from ^ htonl(INET_ECN_MASK << 20);
        *(__be32 *)iph = to;
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
                                     (__force __wsum)to);
        return 1;
}

/*
 * Pass @dscp, with its two ECN bits cleared, to ipv6_change_dsfield()
 * for @inner using INET_ECN_MASK as the mask argument.
 */
static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner)
{
        ipv6_change_dsfield(inner, INET_ECN_MASK, dscp & ~INET_ECN_MASK);
}

static inline int INET_ECN_set_ce(struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                if (skb_network_header(skb) + sizeof(struct iphdr) <=
                    skb_tail_pointer(skb))
                        return IP_ECN_set_ce(ip_hdr(skb));
                break;

        case cpu_to_be16(ETH_P_IPV6):
                if (skb_network_header(skb) + sizeof(struct ipv6hdr) <=
                    skb_tail_pointer(skb))
                        return IP6_ECN_set_ce(skb, ipv6_hdr(skb));
                break;
        }

        return 0;
}

static inline int INET_ECN_set_ect1(struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                if (skb_network_header(skb) + sizeof(struct iphdr) <=
                    skb_tail_pointer(skb))
                        return IP_ECN_set_ect1(ip_hdr(skb));
                break;

        case cpu_to_be16(ETH_P_IPV6):
                if (skb_network_header(skb) + sizeof(struct ipv6hdr) <=
                    skb_tail_pointer(skb))
                        return IP6_ECN_set_ect1(skb, ipv6_hdr(skb));
                break;
        }

        return 0;
}

/*
 * RFC 6040 4.2
 *  To decapsulate the inner header at the tunnel egress, a compliant
 *  tunnel egress MUST set the outgoing ECN field to the codepoint at the
 *  intersection of the appropriate arriving inner header (row) and outer
 *  header (column) in Figure 4
 *
 *      +---------+------------------------------------------------+
 *      |Arriving |            Arriving Outer Header               |
 *      |   Inner +---------+------------+------------+------------+
 *      |  Header | Not-ECT | ECT(0)     | ECT(1)     |     CE     |
 *      +---------+---------+------------+------------+------------+
 *      | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)|
 *      |  ECT(0) |  ECT(0) | ECT(0)     | ECT(1)     |     CE     |
 *      |  ECT(1) |  ECT(1) | ECT(1) (!) | ECT(1)     |     CE     |
 *      |    CE   |      CE |     CE     |     CE(!!!)|     CE     |
 *      +---------+---------+------------+------------+------------+
 *
 *             Figure 4: New IP in IP Decapsulation Behaviour
 *
 *  returns 0 on success
 *          1 if something is broken and should be logged (!!! above)
 *          2 if packet should be dropped
 */
static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce)
{
        const __u8 outer_ecn = outer & INET_ECN_MASK;

        /* Not-ECT inner header: *set_ce is left untouched on this path. */
        if (INET_ECN_is_not_ect(inner)) {
                if (outer_ecn == INET_ECN_CE)
                        return 2;        /* drop */
                if (outer_ecn != INET_ECN_NOT_ECT)
                        return 1;        /* ECT(0) or ECT(1): log */
                return 0;
        }

        /* ECN-capable inner header: propagate CE from the outer header. */
        *set_ce = INET_ECN_is_ce(outer);
        return 0;
}

/*
 * Apply the decapsulation decision of __INET_ECN_decapsulate() to @skb.
 *
 * When that helper returns 0, the inner header of @skb is set to CE if
 * the helper asked for it, or else moved to ECT(1) if the outer ECN
 * bits are ECT(1).  A non-zero result is returned unchanged and @skb is
 * not modified.
 */
static inline int INET_ECN_decapsulate(struct sk_buff *skb,
                                       __u8 outer, __u8 inner)
{
        bool mark_ce = false;
        int rc = __INET_ECN_decapsulate(outer, inner, &mark_ce);

        if (rc)
                return rc;

        if (mark_ce)
                INET_ECN_set_ce(skb);
        else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1)
                INET_ECN_set_ect1(skb);

        return 0;
}

static inline int IP_ECN_decapsulate(const struct iphdr *oiph,
                                     struct sk_buff *skb)
{
        __u8 inner;

        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                inner = ip_hdr(skb)->tos;
                break;
        case htons(ETH_P_IPV6):
                inner = ipv6_get_dsfield(ipv6_hdr(skb));
                break;
        default:
                return 0;
        }

        return INET_ECN_decapsulate(skb, oiph->tos, inner);
}

static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h,
                                      struct sk_buff *skb)
{
        __u8 inner;

        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                inner = ip_hdr(skb)->tos;
                break;
        case htons(ETH_P_IPV6):
                inner = ipv6_get_dsfield(ipv6_hdr(skb));
                break;
        default:
                return 0;
        }

        return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner);
}
#endif


























































































































































































































































































































    6 






























































    7 














    7 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_STRING_H_
#define _LINUX_STRING_H_

#include <linux/compiler.h>        /* for inline */
#include <linux/types.h>        /* for size_t */
#include <linux/stddef.h>        /* for NULL */
#include <stdarg.h>
#include <uapi/linux/string.h>

extern char *strndup_user(const char __user *, long);
extern void *memdup_user(const void __user *, size_t);
extern void *vmemdup_user(const void __user *, size_t);
extern void *memdup_user_nul(const void __user *, size_t);

/*
 * Include machine specific inline routines
 */
#include <asm/string.h>

#ifndef __HAVE_ARCH_STRCPY
extern char * strcpy(char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCPY
extern char * strncpy(char *,const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRLCPY
size_t strlcpy(char *, const char *, size_t);
#endif
#ifndef __HAVE_ARCH_STRSCPY
ssize_t strscpy(char *, const char *, size_t);
#endif

/* Wraps calls to strscpy()/memset(), no arch specific code required */
ssize_t strscpy_pad(char *dest, const char *src, size_t count);

#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNCAT
extern char * strncat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRLCAT
extern size_t strlcat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCMP
extern int strcmp(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCMP
extern int strncmp(const char *,const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCASECMP
extern int strcasecmp(const char *s1, const char *s2);
#endif
#ifndef __HAVE_ARCH_STRNCASECMP
extern int strncasecmp(const char *s1, const char *s2, size_t n);
#endif
#ifndef __HAVE_ARCH_STRCHR
extern char * strchr(const char *,int);
#endif
#ifndef __HAVE_ARCH_STRCHRNUL
extern char * strchrnul(const char *,int);
#endif
extern char * strnchrnul(const char *, size_t, int);
#ifndef __HAVE_ARCH_STRNCHR
extern char * strnchr(const char *, size_t, int);
#endif
#ifndef __HAVE_ARCH_STRRCHR
extern char * strrchr(const char *,int);
#endif
extern char * __must_check skip_spaces(const char *);

extern char *strim(char *);

/* Thin wrapper: returns whatever strim() returns for @str. */
static inline __must_check char *strstrip(char *str)
{
        char *stripped = strim(str);

        return stripped;
}

#ifndef __HAVE_ARCH_STRSTR
extern char * strstr(const char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNSTR
extern char * strnstr(const char *, const char *, size_t);
#endif
#ifndef __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);
#endif
#ifndef __HAVE_ARCH_STRNLEN
extern __kernel_size_t strnlen(const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRPBRK
extern char * strpbrk(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRSEP
extern char * strsep(char **,const char *);
#endif
#ifndef __HAVE_ARCH_STRSPN
extern __kernel_size_t strspn(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRCSPN
extern __kernel_size_t strcspn(const char *,const char *);
#endif

#ifndef __HAVE_ARCH_MEMSET
extern void * memset(void *,int,__kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET16
extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET32
extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET64
extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
#endif

/*
 * Fill an array of unsigned long with @v by dispatching to memset32()
 * when BITS_PER_LONG is 32 and to memset64() otherwise.  @n is passed
 * through unchanged; returns what the chosen helper returns.
 */
static inline void *memset_l(unsigned long *p, unsigned long v,
                __kernel_size_t n)
{
        if (BITS_PER_LONG != 32)
                return memset64((uint64_t *)p, v, n);

        return memset32((uint32_t *)p, v, n);
}

/*
 * memset_p - store a pointer pattern through memset32()/memset64().
 * @p: destination, cast to the fixed-width pointer type of the helper used
 * @v: pointer value, converted with (uintptr_t) before being handed on
 * @n: passed through to the helper unchanged
 *
 * memset32() is used when BITS_PER_LONG is 32, memset64() otherwise; the
 * helper's return value is returned as is.
 */
static inline void *memset_p(void **p, void *v, __kernel_size_t n)
{
        return (BITS_PER_LONG == 32) ? memset32((uint32_t *)p, (uintptr_t)v, n)
                                     : memset64((uint64_t *)p, (uintptr_t)v, n);
}

/*
 * memcat_p - type-checked front end for __memcat_p().
 *
 * The build fails with "type mismatch in memcat_p()" unless *(a) and *(b)
 * have the same type. Both arguments are then cast to void ** for
 * __memcat_p() and the result is cast back to typeof(*a) *.
 */
extern void **__memcat_p(void **a, void **b);
#define memcat_p(a, b) ({                                        \
        BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)),                \
                         "type mismatch in memcat_p()");        \
        (typeof(*a) *)__memcat_p((void **)(a), (void **)(b));        \
})

#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMSCAN
extern void * memscan(void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCMP
extern int memcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_BCMP
extern int bcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCHR
extern void * memchr(const void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
/*
 * Fallback memcpy_flushcache(), compiled only when the architecture does not
 * define __HAVE_ARCH_MEMCPY_FLUSHCACHE: a plain memcpy() of @cnt bytes from
 * @src to @dst and nothing else.
 */
static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
}
#endif

void *memchr_inv(const void *s, int c, size_t n);
char *strreplace(char *s, char old, char new);

extern void kfree_const(const void *x);

extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);

extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);

extern bool sysfs_streq(const char *s1, const char *s2);
int match_string(const char * const *array, size_t n, const char *string);
int __sysfs_match_string(const char * const *array, size_t n, const char *s);

/**
 * sysfs_match_string - matches given string in an array
 * @_a: array of strings
 * @_s: string to match with
 *
 * Helper for __sysfs_match_string(). Calculates the size of @_a automatically
 * with ARRAY_SIZE(), so @_a has to be a real array, not a pointer.
 */
#define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s)

#ifdef CONFIG_BINARY_PRINTF
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
#endif

extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                       const void *from, size_t available);

int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);

/**
 * strstarts - check whether one string begins with another
 * @str: string to examine
 * @prefix: prefix to look for
 *
 * Return: true when the first strlen(@prefix) characters of @str compare
 * equal to @prefix (always true for an empty @prefix), false otherwise.
 */
static inline bool strstarts(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        return !strncmp(str, prefix, prefix_len);
}

size_t memweight(const void *ptr, size_t bytes);

/**
 * memzero_explicit - Fill a region of memory (e.g. sensitive
 *                      keying data) with 0s.
 * @s: Pointer to the start of the area.
 * @count: The size of the area.
 *
 * Note: usually using memset() is just fine (!), but in cases
 * where clearing out _local_ data at the end of a scope is
 * necessary, memzero_explicit() should be used instead in
 * order to prevent the compiler from optimising away zeroing.
 *
 * memzero_explicit() doesn't need an arch-specific version:
 * it simply calls memset(), whichever implementation that is,
 * and then barrier_data() on the same pointer.
 */
static inline void memzero_explicit(void *s, size_t count)
{
        memset(s, 0, count);
        /* Must follow the memset(): this is what keeps the zeroing alive. */
        barrier_data(s);
}

/**
 * kbasename - return the last part of a pathname.
 *
 * @path: path to extract the filename from.
 *
 * Return: pointer into @path just past its last '/', or @path itself when
 * it contains no '/'. A path ending in '/' yields an empty string.
 */
static inline const char *kbasename(const char *path)
{
        const char *last_slash = strrchr(path, '/');

        if (!last_slash)
                return path;
        return last_slash + 1;
}

/*
 * Building blocks for the fortified string/memory wrappers below.
 *
 * __FORTIFY_INLINE: storage class and attributes used for every wrapper
 * (extern, always inlined, gnu_inline).
 * __RENAME(x): attaches the assembler name x to a declaration, so that a
 * differently named C identifier can refer to the symbol x.
 */
#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline))
#define __RENAME(x) __asm__(#x)

/* Called by the wrappers when a size check fails at run time. */
void fortify_panic(const char *name) __noreturn __cold;
/*
 * Declared with __compiletime_error(): the wrappers call these when a
 * compile-time constant size is larger than the known object size.
 */
void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter");
void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter");
void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter");
void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");

#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)

/*
 * Select what the fortified wrappers forward to once their checks pass.
 *
 * With CONFIG_KASAN the __underlying_*() names are extern declarations bound
 * through __RENAME() to the plain symbols (memchr, memcmp, ...). Without it
 * they are macros that expand to the matching compiler builtins.
 */
#ifdef CONFIG_KASAN
extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
#else
#define __underlying_memchr        __builtin_memchr
#define __underlying_memcmp        __builtin_memcmp
#define __underlying_memcpy        __builtin_memcpy
#define __underlying_memmove        __builtin_memmove
#define __underlying_memset        __builtin_memset
#define __underlying_strcat        __builtin_strcat
#define __underlying_strcpy        __builtin_strcpy
#define __underlying_strlen        __builtin_strlen
#define __underlying_strncat        __builtin_strncat
#define __underlying_strncpy        __builtin_strncpy
#endif

/*
 * Fortified strncpy(): compares @size with the object size the compiler
 * reports for @p before forwarding to __underlying_strncpy().
 */
__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __write_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __write_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_strncpy(p, q, size);
}

/*
 * Fortified strcat(): when the size of @p's object is known, the append is
 * done with strlcat() bounded by that size, and fortify_panic() is called if
 * strlcat() returns a value >= that size. Returns @p.
 */
__FORTIFY_INLINE char *strcat(char *p, const char *q)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* (size_t)-1: no object size available, forward without checks. */
        if (p_size == (size_t)-1)
                return __underlying_strcat(p, q);
        if (strlcat(p, q, p_size) >= p_size)
                fortify_panic(__func__);
        return p;
}

/*
 * Fortified strlen(): when the size of @p's object is known, the length is
 * measured with strnlen() bounded by that size, and fortify_panic() is called
 * if the result reaches the object size.
 */
__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
{
        __kernel_size_t ret;
        size_t p_size = __builtin_object_size(p, 0);

        /*
         * Work around gcc excess stack consumption issue: forward directly
         * when the object size is unknown, or when the last byte of the
         * object is a compile-time constant '\0'.
         */
        if (p_size == (size_t)-1 ||
            (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
                return __underlying_strlen(p);
        ret = strnlen(p, p_size);
        if (p_size <= ret)
                fortify_panic(__func__);
        return ret;
}

/*
 * Fortified strnlen(): __real_strnlen() is bound to the strnlen symbol via
 * __RENAME(). The scan limit handed to it is the smaller of @maxlen and the
 * object size the compiler reports for @p.
 */
extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
{
        size_t p_size = __builtin_object_size(p, 0);
        __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
        /*
         * The result reached the object size although the caller's own
         * limit was different.
         */
        if (p_size <= ret && maxlen != ret)
                fortify_panic(__func__);
        return ret;
}

/*
 * Fortified strlcpy(), defined after fortified strlen to reuse it.
 *
 * __real_strlcpy() is bound to the strlcpy symbol via __RENAME() and is used
 * only when neither object size is known. Otherwise the copy is done here:
 * at most @size - 1 bytes are copied, the result is NUL-terminated, and the
 * return value is strlen(@q).
 */
extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
{
        size_t ret;
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __real_strlcpy(p, q, size);
        /* Fortified strlen(): also checks the read from @q. */
        ret = strlen(q);
        if (size) {
                /* Bytes to copy, leaving room for the terminator. */
                size_t len = (ret >= size) ? size - 1 : ret;
                if (__builtin_constant_p(len) && len >= p_size)
                        __write_overflow();
                if (len >= p_size)
                        fortify_panic(__func__);
                __underlying_memcpy(p, q, len);
                p[len] = '\0';
        }
        return ret;
}

/*
 * Fortified strncat(), defined after fortified strlen and strnlen to reuse
 * them.
 *
 * When either object size is known, the append is done here: it calls
 * fortify_panic() unless @p's object can hold the existing string, up to
 * @count bytes of @q and the terminating NUL. Returns @p.
 */
__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
{
        size_t p_len, copy_len;
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __underlying_strncat(p, q, count);
        p_len = strlen(p);
        copy_len = strnlen(q, count);
        /* +1 accounts for the terminator written below. */
        if (p_size < p_len + copy_len + 1)
                fortify_panic(__func__);
        __underlying_memcpy(p + p_len, q, copy_len);
        p[p_len + copy_len] = '\0';
        return p;
}

/*
 * Fortified memset(): compares @size with the object size the compiler
 * reports for @p before forwarding to __underlying_memset().
 */
__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __write_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __write_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_memset(p, c, size);
}

/*
 * Fortified memcpy(): compares @size with the object sizes the compiler
 * reports for both @p (written) and @q (read) before forwarding to
 * __underlying_memcpy().
 */
__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        /* Constant @size: violations go to the compile-time error hooks. */
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __write_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        /* Same comparisons repeated at run time. */
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memcpy(p, q, size);
}

/*
 * Fortified memmove(): compares @size with the object sizes the compiler
 * reports for both @p (written) and @q (read) before forwarding to
 * __underlying_memmove().
 */
__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        /* Constant @size: violations go to the compile-time error hooks. */
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __write_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        /* Same comparisons repeated at run time. */
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memmove(p, q, size);
}

/*
 * Fortified memscan(): __real_memscan() is bound to the memscan symbol via
 * __RENAME(). @size is compared with the object size the compiler reports
 * for @p before the call is forwarded.
 */
extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __read_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __real_memscan(p, c, size);
}

/*
 * Fortified memcmp(): compares @size with the object sizes the compiler
 * reports for both @p and @q (both are read) before forwarding to
 * __underlying_memcmp().
 */
__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        /* Constant @size: violations go to the compile-time error hooks. */
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __read_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        /* Same comparisons repeated at run time. */
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memcmp(p, q, size);
}

/*
 * Fortified memchr(): compares @size with the object size the compiler
 * reports for @p before forwarding to __underlying_memchr().
 */
__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __read_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_memchr(p, c, size);
}

/*
 * Fortified memchr_inv(): __real_memchr_inv() is bound to the memchr_inv
 * symbol via __RENAME(). @size is compared with the object size the compiler
 * reports for @p before the call is forwarded.
 */
void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __read_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __real_memchr_inv(p, c, size);
}

/*
 * Fortified kmemdup(): __real_kmemdup() is bound to the kmemdup symbol via
 * __RENAME(). @size is compared with the object size the compiler reports
 * for the source @p before the call is forwarded.
 */
extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
{
        size_t p_size = __builtin_object_size(p, 0);
        /* Constant @size that is too large: reported via __read_overflow(). */
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        /* Same comparison repeated at run time. */
        if (p_size < size)
                fortify_panic(__func__);
        return __real_kmemdup(p, size, gfp);
}

/*
 * Fortified strcpy(), defined after fortified strlen and memcpy to reuse
 * them.
 *
 * When either object size is known, the copy is done as a memcpy() of
 * strlen(@q) + 1 bytes, so the checks in the fortified strlen() and memcpy()
 * apply. Returns @p.
 */
__FORTIFY_INLINE char *strcpy(char *p, const char *q)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        /* Neither size known: forward without checks. */
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __underlying_strcpy(p, q);
        memcpy(p, q, strlen(q) + 1);
        return p;
}

/* Don't use these outside the FORTIFY_SOURCE implementation */
#undef __underlying_memchr
#undef __underlying_memcmp
#undef __underlying_memcpy
#undef __underlying_memmove
#undef __underlying_memset
#undef __underlying_strcat
#undef __underlying_strcpy
#undef __underlying_strlen
#undef __underlying_strncat
#undef __underlying_strncpy
#endif

/**
 * memcpy_and_pad - Copy one buffer to another with padding
 * @dest: Where to copy to
 * @dest_len: The destination buffer size
 * @src: Where to copy from
 * @count: The number of bytes to copy
 * @pad: Character to use for padding if space is left in destination.
 *
 * If @dest_len is not larger than @count, exactly @dest_len bytes are copied
 * and nothing is padded. Otherwise @count bytes are copied and the remaining
 * @dest_len - @count bytes of @dest are set to @pad.
 */
static inline void memcpy_and_pad(void *dest, size_t dest_len,
                                  const void *src, size_t count, int pad)
{
        /* Source fills (or overfills) the destination: truncate, no padding. */
        if (dest_len <= count) {
                memcpy(dest, src, dest_len);
                return;
        }

        memcpy(dest, src, count);
        memset(dest + count, pad, dest_len - count);
}

/**
 * str_has_prefix - Test if a string has a given prefix
 * @str: The string to test
 * @prefix: The string to see if @str starts with
 *
 * The usual open-coded form of this test is
 *  strncmp(str, prefix, sizeof(prefix) - 1)
 * which is easy to get wrong through a typo, and which breaks when prefix
 * is a pointer rather than a constant. Use str_has_prefix() instead.
 *
 * Returns:
 * * strlen(@prefix) if @str starts with @prefix
 * * 0 if @str does not start with @prefix
 */
static __always_inline size_t str_has_prefix(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        if (strncmp(str, prefix, prefix_len) != 0)
                return 0;
        return prefix_len;
}

#endif /* _LINUX_STRING_H_ */
























































































































































    1 




    1 



































































































    1 







    1 
    1 
    1 
    1 






    1 



    1 





















    2 
























    2 
    2 
    1 

    1 


    1 














    2 























































    1 



















































    1 



































    1 
    1 









    1 






    1 





    1 











    1 

    1 




    1 






    1 

    1 




























































































































































































    1 



































    1 




















































































    1 















    1 















    1 







    1 


    1 











    1 



























































































































































    1 














    1 









































    1 


    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to segment and merge handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-rq-qos.h"

static inline bool bio_will_gap(struct request_queue *q,
                struct request *prev_rq, struct bio *prev, struct bio *next)
{
        struct bio_vec pb, nb;

        if (!bio_has_data(prev) || !queue_virt_boundary(q))
                return false;

        /*
         * Don't merge if the 1st bio starts with non-zero offset, otherwise it
         * is quite difficult to respect the sg gap limit.  We work hard to
         * merge a huge number of small single bios in case of mkfs.
         */
        if (prev_rq)
                bio_get_first_bvec(prev_rq->bio, &pb);
        else
                bio_get_first_bvec(prev, &pb);
        if (pb.bv_offset & queue_virt_boundary(q))
                return true;

        /*
         * We don't need to worry about the situation that the merged segment
         * ends in unaligned virt boundary:
         *
         * - if 'pb' ends aligned, the merged segment ends aligned
         * - if 'pb' ends unaligned, the next bio must include
         *   one single bvec of 'nb', otherwise the 'nb' can't
         *   merge with 'pb'
         */
        bio_get_last_bvec(prev, &pb);
        bio_get_first_bvec(next, &nb);
        if (biovec_phys_mergeable(q, &pb, &nb))
                return false;
        return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
}

/*
 * Gap check for appending @bio behind @req: forwards to bio_will_gap() with
 * @req as the previous request, its last bio (req->biotail) as the preceding
 * bio and @bio as the following one.
 */
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
{
        return bio_will_gap(req->q, req, req->biotail, bio);
}

/*
 * Gap check for putting @bio in front of @req: forwards to bio_will_gap()
 * with no previous request (NULL), @bio as the preceding bio and the
 * request's first bio (req->bio) as the following one.
 */
static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
{
        return bio_will_gap(req->q, NULL, bio, req->bio);
}

/*
 * blk_bio_discard_split - split a discard bio that exceeds the queue limits
 * @q:     request queue whose discard limits apply
 * @bio:   discard bio to examine
 * @bs:    bio set handed to bio_split()
 * @nsegs: always set to 1
 *
 * Returns NULL when the usable discard limit is zero or @bio already fits
 * within it; otherwise returns the result of bio_split() for the leading
 * part of @bio.
 */
static struct bio *blk_bio_discard_split(struct request_queue *q,
                                         struct bio *bio,
                                         struct bio_set *bs,
                                         unsigned *nsegs)
{
        unsigned int granularity, limit;
        unsigned split_sectors;
        sector_t next_start, misalign;
        int alignment;

        *nsegs = 1;

        /* Zero-sector (unknown) and one-sector granularities are the same.  */
        granularity = max(q->limits.discard_granularity >> 9, 1U);

        /* Usable limit: queue limit, rounded down to whole granules. */
        limit = min(q->limits.max_discard_sectors,
                        bio_allowed_max_sectors(q));
        limit -= limit % granularity;

        if (unlikely(!limit)) {
                /* XXX: warn */
                return NULL;
        }

        if (bio_sectors(bio) <= limit)
                return NULL;

        split_sectors = limit;

        /*
         * If the sector following the split would be misaligned, end the
         * discard at the previous aligned sector instead.
         */
        alignment = (q->limits.discard_alignment >> 9) % granularity;

        next_start = bio->bi_iter.bi_sector + split_sectors - alignment;
        /* sector_div() divides in place and hands back the remainder. */
        misalign = sector_div(next_start, granularity);

        if (split_sectors > misalign)
                split_sectors -= misalign;

        return bio_split(bio, split_sectors, GFP_NOIO, bs);
}

/*
 * blk_bio_write_zeroes_split - split a write-zeroes bio that is too large
 * @q:     request queue whose max_write_zeroes_sectors limit applies
 * @bio:   bio to examine
 * @bs:    bio set handed to bio_split()
 * @nsegs: always set to 0
 *
 * Returns NULL when the limit is zero or @bio fits within it; otherwise
 * returns the result of bio_split() at the limit.
 */
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
                struct bio *bio, struct bio_set *bs, unsigned *nsegs)
{
        *nsegs = 0;

        if (!q->limits.max_write_zeroes_sectors ||
            bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
                return NULL;

        return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
}

/*
 * blk_bio_write_same_split - split a write-same bio that is too large
 * @q:     request queue whose max_write_same_sectors limit applies
 * @bio:   bio to examine
 * @bs:    bio set handed to bio_split()
 * @nsegs: always set to 1
 *
 * Returns NULL when the limit is zero or @bio fits within it; otherwise
 * returns the result of bio_split() at the limit.
 */
static struct bio *blk_bio_write_same_split(struct request_queue *q,
                                            struct bio *bio,
                                            struct bio_set *bs,
                                            unsigned *nsegs)
{
        *nsegs = 1;

        if (!q->limits.max_write_same_sectors ||
            bio_sectors(bio) <= q->limits.max_write_same_sectors)
                return NULL;

        return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}

/*
 * Compute how many sectors, counted from the start of @bio, may go to the
 * block device as one request. When enough sectors are available the end of
 * that range is aligned to the physical block size; when not, it is aligned
 * to the logical block size. For a bio that does not start on a physical
 * block boundary this keeps the number of non-aligned requests submitted to
 * the device to a minimum.
 */
static inline unsigned get_max_io_size(struct request_queue *q,
                                       struct bio *bio)
{
        const unsigned limit = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
        const unsigned pbs_mask =
                (queue_physical_block_size(q) >> SECTOR_SHIFT) - 1;
        const unsigned lbs_mask =
                (queue_logical_block_size(q) >> SECTOR_SHIFT) - 1;
        /* Sectors between the preceding physical block boundary and the bio. */
        const unsigned head = bio->bi_iter.bi_sector & pbs_mask;
        /* End of the range, measured from that boundary and rounded down. */
        const unsigned aligned_end = (limit + head) & ~pbs_mask;

        if (aligned_end > head)
                return aligned_end - head;

        /* Not enough room for physical alignment: use logical alignment. */
        return limit & ~lbs_mask;
}

/*
 * get_max_segment_size - largest segment that may start at a given position
 * @q:          request queue supplying the segment boundary mask and the
 *              maximum segment size
 * @start_page: page the segment starts in
 * @offset:     byte offset added to the physical address of @start_page
 *
 * Returns the smaller of the distance from the start position to the end of
 * its segment-boundary window and the queue's maximum segment size, ignoring
 * whichever of the two is zero.
 */
static inline unsigned get_max_segment_size(const struct request_queue *q,
                                            struct page *start_page,
                                            unsigned long offset)
{
        unsigned long boundary_mask = queue_segment_boundary(q);
        unsigned long in_window, to_boundary;

        /* Position of the start address inside its boundary window. */
        in_window = boundary_mask & (page_to_phys(start_page) + offset);
        to_boundary = boundary_mask - in_window + 1;

        /*
         * The sum above may overflow to zero (zero page physical address on
         * a 32bit arch); min_not_zero() then falls back to the queue's max
         * segment size.
         */
        return min_not_zero(to_boundary,
                        (unsigned long)queue_max_segment_size(q));
}

/**
 * bvec_split_segs - decide whether a bvec has to be split in the middle
 * @q:        [in] request queue associated with the bio associated with @bv
 * @bv:       [in] bvec to examine
 * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
 *            by the number of segments from @bv that may be appended to that
 *            bio without exceeding @max_segs
 * @sectors:  [in,out] Number of sectors in the bio being built. Incremented
 *            by the number of sectors from @bv that may be appended to that
 *            bio without exceeding @max_sectors
 * @max_segs: [in] upper bound for *@nsegs
 * @max_sectors: [in] upper bound for *@sectors
 *
 * While a bio is being split, a bvec may turn up that does not fit in a
 * single segment, so that it has to be cut in the middle. This function
 * works out whether that is the case. It returns %true if and only if
 * appending all of @bv to a bio that already has *@nsegs segments and
 * *@sectors sectors would leave that bio unacceptable for the block driver.
 */
static bool bvec_split_segs(const struct request_queue *q,
                            const struct bio_vec *bv, unsigned *nsegs,
                            unsigned *sectors, unsigned max_segs,
                            unsigned max_sectors)
{
        /* Bytes still available below the sector limit. */
        unsigned budget = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
        unsigned remaining = min(bv->bv_len, budget);
        unsigned consumed = 0;

        while (remaining && *nsegs < max_segs) {
                unsigned chunk = get_max_segment_size(q, bv->bv_page,
                                                      bv->bv_offset + consumed);

                chunk = min(chunk, remaining);

                *nsegs += 1;
                consumed += chunk;
                remaining -= chunk;

                /* Stop once the consumed part ends off the virt boundary. */
                if ((bv->bv_offset + consumed) & queue_virt_boundary(q))
                        break;
        }

        *sectors += consumed >> 9;

        /* tell the caller to split the bvec if it is too big to fit */
        return remaining > 0 || bv->bv_len > budget;
}

/**
 * blk_bio_segment_split - split a bio in two bios
 * @q:    [in] request queue pointer
 * @bio:  [in] bio to be split
 * @bs:          [in] bio set to allocate the clone from
 * @segs: [out] number of segments in the bio with the first half of the sectors
 *
 * Clone @bio, update the bi_iter of the clone to represent the first sectors
 * of @bio and update @bio->bi_iter to represent the remaining sectors. The
 * following is guaranteed for the cloned bio:
 * - That it has at most get_max_io_size(@q, @bio) sectors.
 * - That it has at most queue_max_segments(@q) segments.
 *
 * Except for discard requests the cloned bio will point at the bi_io_vec of
 * the original bio. It is the responsibility of the caller to ensure that the
 * original bio is not freed before the cloned bio. The caller is also
 * responsible for ensuring that @bs is only destroyed after processing of the
 * split bio has finished.
 */
static struct bio *blk_bio_segment_split(struct request_queue *q,
                                         struct bio *bio,
                                         struct bio_set *bs,
                                         unsigned *segs)
{
        struct bio_vec bv, bvprv, *bvprvp = NULL;
        struct bvec_iter iter;
        unsigned nsegs = 0, sectors = 0;
        const unsigned max_sectors = get_max_io_size(q, bio);
        const unsigned max_segs = queue_max_segments(q);

        bio_for_each_bvec(bv, bio, iter) {
                /*
                 * If the queue doesn't support SG gaps and adding this
                 * offset would create a gap, disallow it.
                 */
                if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
                        goto split;

                if (nsegs < max_segs &&
                    sectors + (bv.bv_len >> 9) <= max_sectors &&
                    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
                        nsegs++;
                        sectors += bv.bv_len >> 9;
                } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
                                         max_sectors)) {
                        goto split;
                }

                bvprv = bv;
                bvprvp = &bvprv;
        }

        *segs = nsegs;
        return NULL;
split:
        *segs = nsegs;
        return bio_split(bio, sectors, GFP_NOIO, bs);
}

/**
 * __blk_queue_split - split a bio and submit the second half
 * @bio:     [in, out] bio to be split
 * @nr_segs: [out] number of segments in the first bio
 *
 * When *@bio needs splitting, the two resulting bios are chained together,
 * the second half is submitted and *@bio is updated to point at the first
 * half. If the second half is still too large it is split again by a
 * recursive call to this function. A new bio may be allocated from
 * @bio->bi_disk->queue->bio_split, so the caller has to make sure that
 * @bio->bi_disk->queue->bio_split is only released once processing of the
 * split bio has finished.
 */
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
{
        struct request_queue *q = (*bio)->bi_disk->queue;
        struct bio_set *bs = &q->bio_split;
        struct bio *front = NULL;

        switch (bio_op(*bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                front = blk_bio_discard_split(q, *bio, bs, nr_segs);
                break;
        case REQ_OP_WRITE_ZEROES:
                front = blk_bio_write_zeroes_split(q, *bio, bs, nr_segs);
                break;
        case REQ_OP_WRITE_SAME:
                front = blk_bio_write_same_split(q, *bio, bs, nr_segs);
                break;
        default:
                /*
                 * Every driver has to accept a single-segment bio that is no
                 * larger than PAGE_SIZE.  This cheap test relies on
                 * bi_io_vec[0] always being valid when a bio carries data.
                 * It can occasionally miss for cloned bios, but next to the
                 * cost of cloning itself the full segment walk is
                 * negligible.
                 */
                if (!q->limits.chunk_sectors &&
                    (*bio)->bi_vcnt == 1 &&
                    ((*bio)->bi_io_vec[0].bv_len +
                     (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE)
                        *nr_segs = 1;
                else
                        front = blk_bio_segment_split(q, *bio, bs, nr_segs);
                break;
        }

        /* Nothing was split off: *bio is left untouched. */
        if (!front)
                return;

        /* The split bio gets no chance to be merged. */
        front->bi_opf |= REQ_NOMERGE;

        bio_chain(front, *bio);
        trace_block_split(q, front, (*bio)->bi_iter.bi_sector);
        submit_bio_noacct(*bio);
        *bio = front;

        blk_throtl_charge_bio_split(*bio);
}

/**
 * blk_queue_split - split a bio and submit the second half
 * @bio: [in, out] bio to be split
 *
 * Convenience wrapper around __blk_queue_split() for callers that have no
 * use for the segment count of the first half. The two bios are chained,
 * the second half is submitted and *@bio is set to the first half. Because a
 * new bio may be allocated from @bio->bi_disk->queue->bio_split, the caller
 * must ensure that @bio->bi_disk->queue->bio_split is only released after
 * processing of the split bio has finished.
 */
void blk_queue_split(struct bio **bio)
{
        unsigned int ignored_segs;

        __blk_queue_split(bio, &ignored_segs);
}
EXPORT_SYMBOL(blk_queue_split);

unsigned int blk_recalc_rq_segments(struct request *rq)
{
        unsigned int nr_phys_segs = 0;
        unsigned int nr_sectors = 0;
        struct req_iterator iter;
        struct bio_vec bv;

        if (!rq->bio)
                return 0;

        switch (bio_op(rq->bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                if (queue_max_discard_segments(rq->q) > 1) {
                        struct bio *bio = rq->bio;

                        for_each_bio(bio)
                                nr_phys_segs++;
                        return nr_phys_segs;
                }
                return 1;
        case REQ_OP_WRITE_ZEROES:
                return 0;
        case REQ_OP_WRITE_SAME:
                return 1;
        }

        rq_for_each_bvec(bv, rq, iter)
                bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
                                UINT_MAX, UINT_MAX);
        return nr_phys_segs;
}

/*
 * Return the scatterlist entry to fill next: the head of @sglist when
 * nothing has been mapped yet (*@sg is NULL), otherwise the entry that
 * follows *@sg.
 */
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
                struct scatterlist *sglist)
{
        struct scatterlist *prev = *sg;

        if (!prev)
                return sglist;

        /*
         * A driver that mapped a shorter list earlier may have left a
         * termination bit behind unless it re-initialises the sg table for
         * every mapping.  More entries MUST follow here (the driver would be
         * buggy otherwise), so clear the bit by force instead of making
         * drivers call sg_init_table() for each command.
         */
        sg_unmark_end(prev);
        return sg_next(prev);
}

/*
 * Map one bvec onto as many scatterlist entries as needed, each limited by
 * get_max_segment_size().  *@sg is advanced to the last entry written.
 * Returns the number of entries used.
 */
static unsigned blk_bvec_map_sg(struct request_queue *q,
                struct bio_vec *bvec, struct scatterlist *sglist,
                struct scatterlist **sg)
{
        unsigned remaining = bvec->bv_len;
        unsigned mapped = 0;
        unsigned count = 0;

        for (; remaining > 0; count++) {
                unsigned offset = bvec->bv_offset + mapped;
                unsigned len = min(get_max_segment_size(q, bvec->bv_page,
                                        offset), remaining);
                struct page *page = bvec->bv_page;

                /*
                 * Quite a few drivers choke on scatterlists whose offset
                 * exceeds PAGE_SIZE, even though other subsystems cope with
                 * that just fine.  Until those offenders (mostly MMC/SD
                 * drivers) are fixed, keep the legacy format and never hand
                 * out such entries from the block layer; the normalisation
                 * below should go away afterwards.
                 */
                page += (offset >> PAGE_SHIFT);
                offset &= ~PAGE_MASK;

                *sg = blk_next_sg(sg, sglist);
                sg_set_page(*sg, page, len, offset);

                mapped += len;
                remaining -= len;
        }

        return count;
}

/*
 * Map @bv onto exactly one scatterlist entry and advance *@sg to it.
 * Always returns 1, the number of entries consumed.
 */
static inline int __blk_bvec_map_sg(struct bio_vec bv,
                struct scatterlist *sglist, struct scatterlist **sg)
{
        struct scatterlist *slot = blk_next_sg(sg, sglist);

        *sg = slot;
        sg_set_page(slot, bv.bv_page, bv.bv_len, bv.bv_offset);
        return 1;
}

/*
 * Try to fold @bvec into the current scatterlist entry *@sg.  Callers only
 * attempt this for bvecs that come from two different bios.
 *
 * Returns true (and grows the entry's length) when an entry exists, the
 * combined length stays within queue_max_segment_size() and @bvprv/@bvec are
 * physically mergeable; returns false and changes nothing otherwise.
 */
static inline bool
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
                           struct bio_vec *bvprv, struct scatterlist **sg)
{
        struct scatterlist *cur = *sg;
        int nbytes = bvec->bv_len;

        if (!cur)
                return false;

        if (cur->length + nbytes > queue_max_segment_size(q) ||
            !biovec_phys_mergeable(q, bvprv, bvec))
                return false;

        cur->length += nbytes;
        return true;
}

/*
 * Map every bvec of the bio chain starting at @bio onto @sglist, leaving
 * *@sg at the last entry written.  Returns the number of entries used.
 */
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
                             struct scatterlist *sglist,
                             struct scatterlist **sg)
{
        struct bio_vec bvec, bvprv = { NULL };
        struct bvec_iter iter;
        bool crossed_bio = false;
        int nsegs = 0;

        for_each_bio(bio) {
                bio_for_each_bvec(bvec, bio, iter) {
                        /*
                         * Merging is only attempted across a bio boundary:
                         * bvecs inside one bio were already merged when the
                         * pages were added to it.
                         */
                        if (!(crossed_bio &&
                              __blk_segment_map_sg_merge(q, &bvec, &bvprv,
                                                         sg))) {
                                if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
                                        nsegs += __blk_bvec_map_sg(bvec,
                                                                   sglist, sg);
                                else
                                        nsegs += blk_bvec_map_sg(q, &bvec,
                                                                 sglist, sg);
                        }
                        crossed_bio = false;
                }

                /* remember this bio's last bvec for the next bio's first */
                if (likely(bio->bi_iter.bi_size)) {
                        bvprv = bvec;
                        crossed_bio = true;
                }
        }

        return nsegs;
}

/*
 * Map request @rq onto the scatterlist @sglist and return how many sg
 * entries were set up.  The last entry used is marked as the end of the list
 * and reported through *@last_sg.  The caller must make sure @sglist can
 * hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist, struct scatterlist **last_sg)
{
        int nsegs = 0;

        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
                nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
        } else if (rq->bio) {
                if (bio_op(rq->bio) == REQ_OP_WRITE_SAME)
                        nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist,
                                                  last_sg);
                else
                        nsegs = __blk_bios_map_sg(q, rq->bio, sglist,
                                                  last_sg);
        }

        if (*last_sg)
                sg_mark_end(*last_sg);

        /*
         * Mapping more entries than the request has physical segments means
         * something went wrong.
         */
        WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

        return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);

/*
 * Segment limit that applies to @rq: the queue's discard segment limit for
 * REQ_OP_DISCARD requests, the regular segment limit for everything else.
 */
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
        return req_op(rq) == REQ_OP_DISCARD ?
                queue_max_discard_segments(rq->q) :
                queue_max_segments(rq->q);
}

/*
 * Account @nr_phys_segs additional segments from @bio to @req.
 *
 * Returns 1 when the merge may proceed.  Returns 0 and marks @req as
 * non-mergeable when the cgroup or integrity checks fail or the segment
 * limit would be exceeded.
 */
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
                unsigned int nr_phys_segs)
{
        if (!blk_cgroup_mergeable(req, bio) ||
            blk_integrity_merge_bio(req->q, req, bio) == false)
                goto no_merge;

        /* merging into a discard request adds no new segment */
        if (req_op(req) != REQ_OP_DISCARD) {
                if (req->nr_phys_segments + nr_phys_segs >
                    blk_rq_get_max_segments(req))
                        goto no_merge;

                /* the bio starts a new hw segment: bump the counter */
                req->nr_phys_segments += nr_phys_segs;
        }
        return 1;

no_merge:
        req_set_nomerge(req->q, req);
        return 0;
}

/*
 * Decide whether @bio, made of @nr_segs segments, may be appended to @req.
 * Returns 1 if so, 0 otherwise.  @req is additionally marked non-mergeable
 * when the sector limit is exceeded or ll_new_hw_segment() refuses.
 */
int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
        /* gap, integrity gap and crypto context constraints */
        if (req_gap_back_merge(req, bio) ||
            (blk_integrity_rq(req) &&
             integrity_req_gap_back_merge(req, bio)) ||
            !bio_crypt_ctx_back_mergeable(req, bio))
                return 0;

        if (blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(req->q, req);
                return 0;
        }

        return ll_new_hw_segment(req, bio, nr_segs);
}

/*
 * Decide whether @bio, made of @nr_segs segments, may be prepended to @req.
 * Returns 1 if so, 0 otherwise.  @req is additionally marked non-mergeable
 * when the sector limit is exceeded or ll_new_hw_segment() refuses.
 */
static int ll_front_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs)
{
        /* gap, integrity gap and crypto context constraints */
        if (req_gap_front_merge(req, bio) ||
            (blk_integrity_rq(req) &&
             integrity_req_gap_front_merge(req, bio)) ||
            !bio_crypt_ctx_front_mergeable(req, bio))
                return 0;

        /* the limit is evaluated at the bio's start sector */
        if (blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
                req_set_nomerge(req->q, req);
                return 0;
        }

        return ll_new_hw_segment(req, bio, nr_segs);
}

/*
 * Check whether discard request @next can be folded into @req and, if so,
 * update @req's segment count to cover both.  On refusal @req is marked
 * non-mergeable and false is returned.
 */
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
                struct request *next)
{
        unsigned short segments = blk_rq_nr_discard_segments(req);

        if (segments >= queue_max_discard_segments(q) ||
            blk_rq_sectors(req) + bio_sectors(next->bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(q, req);
                return false;
        }

        req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
        return true;
}

/*
 * Check whether @next can be merged onto the back of @req.  Returns 1 and
 * sets @req's segment count to the combined value when it can, 0 otherwise.
 */
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
                                struct request *next)
{
        int combined_segs;

        if (req_gap_back_merge(req, next->bio))
                return 0;

        /* would the merged request exceed the sector limit? */
        if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req)))
                return 0;

        /* ... or the segment limit? */
        combined_segs = req->nr_phys_segments + next->nr_phys_segments;
        if (combined_segs > blk_rq_get_max_segments(req))
                return 0;

        /* cgroup, integrity and crypto context must all agree */
        if (!blk_cgroup_mergeable(req, next->bio) ||
            blk_integrity_merge_rq(q, req, next) == false ||
            !bio_crypt_ctx_merge_rq(req, next))
                return 0;

        /* all checks passed */
        req->nr_phys_segments = combined_segs;
        return 1;
}

/**
 * blk_rq_set_mixed_merge - mark a request as mixed merge
 * @rq: request to mark as mixed merge
 *
 * Description:
 *     @rq is about to be mixed merged.  Copy the request's failfast
 *     attributes into every bio it contains and flag @rq as mixed
 *     merged.  Does nothing if @rq is already flagged.
 */
void blk_rq_set_mixed_merge(struct request *rq)
{
        unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
        struct bio *bio = rq->bio;

        if (rq->rq_flags & RQF_MIXED_MERGE)
                return;

        /*
         * From now on @rq no longer describes the mixable attributes of all
         * its bios, only those of the first one, so distribute the
         * attributes to each bio.
         */
        while (bio) {
                WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
                             (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
                bio->bi_opf |= ff;
                bio = bio->bi_next;
        }
        rq->rq_flags |= RQF_MIXED_MERGE;
}

/*
 * Account a request-into-request merge: when I/O statistics are enabled for
 * @req, bump the merge counter of its partition and drop the partition
 * reference held by @req.
 */
static void blk_account_io_merge_request(struct request *req)
{
        if (!blk_do_io_stat(req))
                return;

        part_stat_lock();
        part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
        part_stat_unlock();

        hd_struct_put(req->part);
}

/*
 * Classify how @next could be merged into @req: as a discard merge, as a
 * back merge when @next starts exactly where @req ends, or not at all.
 */
static enum elv_merge blk_try_req_merge(struct request *req,
                                        struct request *next)
{
        if (blk_discard_mergable(req))
                return ELEVATOR_DISCARD_MERGE;

        if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
                return ELEVATOR_BACK_MERGE;

        return ELEVATOR_NO_MERGE;
}

/*
 * Try to merge @next into @req.  Returns @next, stripped of its bios, for
 * the caller to free when the merge happened, or NULL when it did not.
 *
 * For non-mq, this has to be called with the request spinlock acquired.
 * For mq with scheduling, the appropriate queue wide lock should be held.
 */
static struct request *attempt_merge(struct request_queue *q,
                                     struct request *req, struct request *next)
{
        if (!rq_mergeable(req) || !rq_mergeable(next))
                return NULL;

        /* operation, data direction and disk all have to match */
        if (req_op(req) != req_op(next) ||
            rq_data_dir(req) != rq_data_dir(next) ||
            req->rq_disk != next->rq_disk)
                return NULL;

        if (req_op(req) == REQ_OP_WRITE_SAME &&
            !blk_write_same_mergeable(req->bio, next->bio))
                return NULL;

        /*
         * Requests with different write hints (including hinted vs.
         * non-hinted I/O) or different I/O priorities are never merged.
         */
        if (req->write_hint != next->write_hint ||
            req->ioprio != next->ioprio)
                return NULL;

        /*
         * If merging is allowed, next's bio list gets appended to req and
         * next is released.  The merge helpers below update the segment
         * counts; sector counts are updated further down.  DISCARDs are
         * handled separately because they have separate settings.
         */
        switch (blk_try_req_merge(req, next)) {
        case ELEVATOR_DISCARD_MERGE:
                if (req_attempt_discard_merge(q, req, next))
                        break;
                return NULL;
        case ELEVATOR_BACK_MERGE:
                if (ll_merge_requests_fn(q, req, next))
                        break;
                return NULL;
        default:
                return NULL;
        }

        /*
         * When the failfast settings differ, or either request already is a
         * mixed merge, mark both as mixed first so that every bio involved
         * carries properly set mixable attributes.
         */
        if ((req->rq_flags & RQF_MIXED_MERGE) ||
            (next->rq_flags & RQF_MIXED_MERGE) ||
            (req->cmd_flags & REQ_FAILFAST_MASK) !=
            (next->cmd_flags & REQ_FAILFAST_MASK)) {
                blk_rq_set_mixed_merge(req);
                blk_rq_set_mixed_merge(next);
        }

        /*
         * For accounting purposes the surviving request keeps the smaller
         * start_time_ns of the two merged requests.
         */
        if (next->start_time_ns < req->start_time_ns)
                req->start_time_ns = next->start_time_ns;

        /* splice next's bios onto the tail of req */
        req->biotail->bi_next = next->bio;
        req->biotail = next->biotail;

        req->__data_len += blk_rq_bytes(next);

        if (!blk_discard_mergable(req))
                elv_merge_requests(q, req, next);

        blk_crypto_rq_put_keyslot(next);

        /* 'next' is going away, so update the stats accordingly */
        blk_account_io_merge_request(next);

        trace_block_rq_merge(next);

        /*
         * The bios now belong to req; hand 'next' back so the caller can
         * free it.
         */
        next->bio = NULL;
        return next;
}

/*
 * Try to merge the request the elevator reports as following @rq into @rq.
 * Returns the absorbed request for the caller to free, or NULL.
 */
static struct request *attempt_back_merge(struct request_queue *q,
                struct request *rq)
{
        struct request *latter = elv_latter_request(q, rq);

        return latter ? attempt_merge(q, rq, latter) : NULL;
}

/*
 * Try to merge @rq into the request the elevator reports as preceding it.
 * Returns the absorbed request (@rq) for the caller to free, or NULL.
 */
static struct request *attempt_front_merge(struct request_queue *q,
                struct request *rq)
{
        struct request *former = elv_former_request(q, rq);

        return former ? attempt_merge(q, former, rq) : NULL;
}

/*
 * Try to merge @next into @rq.  On success the request handed back by
 * attempt_merge() is released with blk_put_request() and 1 is returned;
 * otherwise 0.
 */
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                          struct request *next)
{
        struct request *absorbed = attempt_merge(q, rq, next);

        if (!absorbed)
                return 0;

        blk_put_request(absorbed);
        return 1;
}

/*
 * Basic eligibility test for merging @bio into @rq.  Only attributes of the
 * two are compared here; position and size limits are checked elsewhere.
 */
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
        if (!rq_mergeable(rq) || !bio_mergeable(bio))
                return false;

        /* same operation, same data direction, same device */
        if (req_op(rq) != bio_op(bio) ||
            bio_data_dir(bio) != rq_data_dir(rq) ||
            rq->rq_disk != bio->bi_disk)
                return false;

        /*
         * Never merge across cgroup boundaries, only merge an integrity
         * protected bio into a matching rq, and require compatible crypt
         * contexts.
         */
        if (!blk_cgroup_mergeable(rq, bio) ||
            blk_integrity_merge_bio(rq->q, rq, bio) == false ||
            !bio_crypt_rq_ctx_compatible(rq, bio))
                return false;

        /* WRITE SAME must be using the same buffer */
        if (req_op(rq) == REQ_OP_WRITE_SAME &&
            !blk_write_same_mergeable(rq->bio, bio))
                return false;

        /*
         * Different write hints (including hinted vs. non-hinted I/O) or
         * different I/O priorities rule a merge out.
         */
        return rq->write_hint == bio->bi_write_hint &&
               rq->ioprio == bio_prio(bio);
}

/*
 * Classify how @bio could be merged into @rq: discard merge, back merge
 * (@bio starts where @rq ends), front merge (@bio ends where @rq starts) or
 * no merge at all.
 */
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
        if (blk_discard_mergable(rq))
                return ELEVATOR_DISCARD_MERGE;

        if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
                return ELEVATOR_BACK_MERGE;

        if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
                return ELEVATOR_FRONT_MERGE;

        return ELEVATOR_NO_MERGE;
}

/*
 * Account a bio-into-request merge: when I/O statistics are enabled for
 * @req, bump the merge counter of its partition.
 */
static void blk_account_io_merge_bio(struct request *req)
{
        if (blk_do_io_stat(req)) {
                part_stat_lock();
                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
                part_stat_unlock();
        }
}

/*
 * Outcome of trying to merge a bio into one particular request.  List
 * walkers such as blk_bio_list_merge() use it to decide whether to move on
 * to the next request or to stop.
 */
enum bio_merge_status {
        /* the bio was merged into the request */
        BIO_MERGE_OK,
        /* no merge was attempted with this request; other requests may be tried */
        BIO_MERGE_NONE,
        /* a merge was attempted and refused; blk_bio_list_merge() stops here */
        BIO_MERGE_FAILED,
};

/*
 * Append @bio (@nr_segs segments) to the tail of @req if the low-level
 * checks allow it.  Returns BIO_MERGE_OK on success and BIO_MERGE_FAILED
 * when ll_back_merge_fn() refuses.
 */
static enum bio_merge_status bio_attempt_back_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs)
{
        const int bio_ff = bio->bi_opf & REQ_FAILFAST_MASK;

        if (!ll_back_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;

        trace_block_bio_backmerge(req->q, req, bio);
        rq_qos_merge(req->q, req, bio);

        /* differing failfast bits turn this into a mixed merge */
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != bio_ff)
                blk_rq_set_mixed_merge(req);

        /* link the bio in behind the current tail */
        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bio->bi_iter.bi_size;

        bio_crypt_free_ctx(bio);

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Prepend @bio (@nr_segs segments) to the head of @req if the low-level
 * checks allow it.  Returns BIO_MERGE_OK on success and BIO_MERGE_FAILED
 * when ll_front_merge_fn() refuses.
 */
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs)
{
        const int bio_ff = bio->bi_opf & REQ_FAILFAST_MASK;

        if (!ll_front_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;

        trace_block_bio_frontmerge(req->q, req, bio);
        rq_qos_merge(req->q, req, bio);

        /* differing failfast bits turn this into a mixed merge */
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != bio_ff)
                blk_rq_set_mixed_merge(req);

        /* the bio becomes the new head, so the request now starts at it */
        bio->bi_next = req->bio;
        req->bio = bio;

        req->__sector = bio->bi_iter.bi_sector;
        req->__data_len += bio->bi_iter.bi_size;

        bio_crypt_do_front_merge(req, bio);

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Add discard @bio to @req as one more discard segment.  When the segment
 * or sector limit would be exceeded, @req is marked non-mergeable and
 * BIO_MERGE_FAILED is returned; otherwise BIO_MERGE_OK.
 */
static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
                struct request *req, struct bio *bio)
{
        unsigned short segments = blk_rq_nr_discard_segments(req);

        if (segments >= queue_max_discard_segments(q) ||
            blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(q, req);
                return BIO_MERGE_FAILED;
        }

        rq_qos_merge(q, req, bio);

        /* link the bio in behind the current tail */
        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bio->bi_iter.bi_size;
        req->nr_phys_segments = segments + 1;

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Try to merge @bio into @rq.
 *
 * Returns BIO_MERGE_NONE when the two are not merge candidates at all, and
 * otherwise the result of the matching back/front/discard attempt.  When
 * @sched_allow_merge is set, back and front merges additionally need the
 * approval of blk_mq_sched_allow_merge(); a veto yields BIO_MERGE_FAILED.
 */
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
                                                   struct request *rq,
                                                   struct bio *bio,
                                                   unsigned int nr_segs,
                                                   bool sched_allow_merge)
{
        enum elv_merge how;

        if (!blk_rq_merge_ok(rq, bio))
                return BIO_MERGE_NONE;

        how = blk_try_merge(rq, bio);

        if (how == ELEVATOR_DISCARD_MERGE)
                return bio_attempt_discard_merge(q, rq, bio);

        if (how != ELEVATOR_BACK_MERGE && how != ELEVATOR_FRONT_MERGE)
                return BIO_MERGE_NONE;

        if (sched_allow_merge && !blk_mq_sched_allow_merge(q, rq, bio))
                return BIO_MERGE_FAILED;

        if (how == ELEVATOR_BACK_MERGE)
                return bio_attempt_back_merge(rq, bio, nr_segs);

        return bio_attempt_front_merge(rq, bio, nr_segs);
}

/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 * @same_queue_rq: pointer to &struct request that gets filled in when
 * another request associated with @q is found on the plug list
 * (optional, may be %NULL)
 *
 * Check whether @bio, which is being queued on @q, can be merged into one of
 * the requests on %current's plugged list.  Returns %true if a merge took
 * place, %false otherwise.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock.  It is therefore an issuing mechanism rather
 * than a scheduling one: a plugged request may carry elvpriv data but has
 * not been added to the elevator yet, and the elevator cannot be accessed
 * reliably outside the queue lock.  Hence only the basic merge parameters
 * are checked here and the elevator is not consulted.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **same_queue_rq)
{
        struct blk_plug *plug = blk_mq_plug(q, bio);
        struct request *rq;

        if (!plug)
                return false;

        list_for_each_entry_reverse(rq, &plug->mq_list, queuelist) {
                /* requests plugged for other queues are of no interest */
                if (rq->q != q)
                        continue;

                /*
                 * Only the blk-mq multiple hardware queues case looks at the
                 * rq in the same queue; there should be only one such rq in
                 * a queue.
                 */
                if (same_queue_rq)
                        *same_queue_rq = rq;

                if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
                    BIO_MERGE_OK)
                        return true;
        }

        return false;
}

/*
 * Walk @list from its tail and try to merge @bio into one of the requests.
 * At most 8 requests are examined.  Returns true as soon as a merge
 * succeeds; a refused merge attempt ends the search with false.
 */
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs)
{
        struct request *rq;
        int budget = 8;

        list_for_each_entry_reverse(rq, list, queuelist) {
                enum bio_merge_status status;

                if (budget-- == 0)
                        break;

                status = blk_attempt_bio_merge(q, rq, bio, nr_segs, true);
                if (status == BIO_MERGE_OK)
                        return true;
                if (status == BIO_MERGE_FAILED)
                        return false;
                /* BIO_MERGE_NONE: try the next request */
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request)
{
        struct request *rq;

        switch (elv_merge(q, &rq, bio)) {
        case ELEVATOR_BACK_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
                        return false;
                *merged_request = attempt_back_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                return true;
        case ELEVATOR_FRONT_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
                        return false;
                *merged_request = attempt_front_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                return true;
        case ELEVATOR_DISCARD_MERGE:
                return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);





















































































    3 

    3 
    3 






    3 





    3 




    3 










    3 
    3 


    3 






    3 


















































    3 







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
/*
 * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
 * cleaned up code to current version of sparse and added the slicing-by-8
 * algorithm to the closely similar existing slicing-by-4 algorithm.
 *
 * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
 * Code was from the public domain, copyright abandoned.  Code was
 * subsequently included in the kernel, thus was re-licensed under the
 * GNU GPL v2.
 *
 * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Same crc32 function was used in 5 other places in the kernel.
 * I made one version, and deleted the others.
 * There are various incantations of crc32().  Some use a seed of 0 or ~0.
 * Some xor at the end with ~0.  The generic crc32() function takes
 * seed as an argument, and doesn't xor at the end.  Then individual
 * users can do whatever they need.
 *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
 *   fs/jffs2 uses seed 0, doesn't xor with ~0.
 *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

/* see: Documentation/staging/crc32.rst for a description of algorithms */

#include <linux/crc32.h>
#include <linux/crc32poly.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include "crc32defs.h"

/*
 * tole(x): when CRC_LE_BITS > 8, convert the CPU-order 32-bit value x to
 * little-endian byte order (the __force cast drops the endian annotation);
 * otherwise x is passed through unchanged.
 * NOTE(review): presumably applied to the entries of crc32table.h, which is
 * included below - confirm against the generated table.
 */
#if CRC_LE_BITS > 8
# define tole(x) ((__force u32) cpu_to_le32(x))
#else
# define tole(x) (x)
#endif

/*
 * tobe(x): big-endian counterpart of tole(), selected by CRC_BE_BITS > 8.
 */
#if CRC_BE_BITS > 8
# define tobe(x) ((__force u32) cpu_to_be32(x))
#else
# define tobe(x) (x)
#endif

#include "crc32table.h"

/* Module metadata: author, one-line description and licence. */
MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
MODULE_DESCRIPTION("Various CRC32 calculations");
MODULE_LICENSE("GPL");

#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8

/*
 * crc32_body() - implements slicing-by-4 or slicing-by-8 algorithm
 * @crc: running CRC value; the callers in this file convert it with
 *       __cpu_to_le32()/__cpu_to_be32() before passing it in and convert
 *       the result back afterwards
 * @buf: data to process
 * @len: number of bytes at @buf
 * @tab: array of 256-entry lookup tables; tab[0]..tab[3] are always read,
 *       tab[4]..tab[7] are read as well unless CRC_LE_BITS == 32
 *
 * The buffer is consumed in three phases:
 *   1. byte-at-a-time until @buf is 4-byte aligned (or @len runs out),
 *   2. whole 32-bit words, one per iteration when CRC_LE_BITS == 32 and
 *      two per iteration otherwise,
 *   3. byte-at-a-time for whatever is left.
 *
 * NOTE(review): the 4-versus-8 selection is keyed on CRC_LE_BITS only, also
 * when this function is reached from the big-endian path -- confirm that
 * CRC_LE_BITS and CRC_BE_BITS are always configured consistently.
 *
 * Returns the updated CRC, still in the byte order it was passed in.
 */
static inline u32 __pure
crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
{
/*
 * DO_CRC(x) folds a single byte x into crc using table t0.
 * DO_CRC4 / DO_CRC8 each look up the four bytes of q in four tables
 * (t0..t3 and t4..t7 respectively) and XOR the results; the byte-to-table
 * assignment is mirrored between the two host byte orders.
 */
# ifdef __LITTLE_ENDIAN
#  define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
#  define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \
                   t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255])
#  define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \
                   t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255])
# else
#  define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
#  define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \
                   t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255])
#  define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \
                   t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255])
# endif
        const u32 *b;
        size_t    rem_len;
# ifdef CONFIG_X86
        /* index for the count-up loop form used on x86 builds */
        size_t i;
# endif
        const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
# if CRC_LE_BITS != 32
        const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
# endif
        u32 q;

        /* Align it */
        if (unlikely((long)buf & 3 && len)) {
                do {
                        DO_CRC(*buf++);
                } while ((--len) && ((long)buf)&3);
        }

        /*
         * Split what remains into whole main-loop iterations (len) and a
         * byte tail (rem_len): 4 bytes per iteration for slicing-by-4,
         * 8 bytes per iteration otherwise.
         */
# if CRC_LE_BITS == 32
        rem_len = len & 3;
        len = len >> 2;
# else
        rem_len = len & 7;
        len = len >> 3;
# endif

        b = (const u32 *)buf;
        /*
         * b is moved back one word up front so that the pre-increment in
         * the loop body lands on the first word.  The two loop headers below
         * run the same body "len" times; only the counting direction differs.
         */
# ifdef CONFIG_X86
        --b;
        for (i = 0; i < len; i++) {
# else
        for (--b; len; --len) {
# endif
                q = crc ^ *++b; /* use pre increment for speed */
# if CRC_LE_BITS == 32
                crc = DO_CRC4;
# else
                /* first word goes through t4..t7, second through t0..t3 */
                crc = DO_CRC8;
                q = *++b;
                crc ^= DO_CRC4;
# endif
        }
        len = rem_len;
        /* And the last few bytes */
        if (len) {
                /*
                 * b points at the last word consumed, so (b + 1) is the
                 * first unread byte; p starts one byte before it to suit
                 * the pre-increment below.
                 */
                u8 *p = (u8 *)(b + 1) - 1;
# ifdef CONFIG_X86
                for (i = 0; i < len; i++)
                        DO_CRC(*++p); /* use pre increment for speed */
# else
                do {
                        DO_CRC(*++p); /* use pre increment for speed */
                } while (--len);
# endif
        }
        return crc;
#undef DO_CRC
#undef DO_CRC4
#undef DO_CRC8
}
#endif


/**
 * crc32_le_generic() - Calculate little-endian Ethernet AUTODIN II
 *                        CRC32/CRC32C
 * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for other
 *         uses, or the previous crc32/crc32c value if computing incrementally.
 * @p: pointer to buffer over which CRC32/CRC32C is run
 * @len: length of buffer @p
 * @tab: little-endian lookup table(s); not read when CRC_LE_BITS == 1
 * @polynomial: CRC32/CRC32c LE polynomial; only read when CRC_LE_BITS == 1
 *
 * CRC_LE_BITS picks the implementation at compile time: 1 processes the
 * input bit by bit, 2/4/8 consume that many bits per table lookup, and any
 * other value defers to crc32_body().
 *
 * Return: the updated CRC value.
 */
static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
                                          size_t len, const u32 (*tab)[256],
                                          u32 polynomial)
{
#if CRC_LE_BITS == 1
        /* One input bit per step; the polynomial replaces the table. */
        for (; len; len--, p++) {
                int bit;

                crc ^= *p;
                for (bit = 0; bit < 8; bit++) {
                        if (crc & 1)
                                crc = (crc >> 1) ^ polynomial;
                        else
                                crc >>= 1;
                }
        }
#elif CRC_LE_BITS == 2
        /* Four 2-bit lookups per input byte. */
        for (; len; len--, p++) {
                int step;

                crc ^= *p;
                for (step = 0; step < 4; step++)
                        crc = (crc >> 2) ^ tab[0][crc & 3];
        }
#elif CRC_LE_BITS == 4
        /* Two 4-bit lookups per input byte. */
        for (; len; len--, p++) {
                int step;

                crc ^= *p;
                for (step = 0; step < 2; step++)
                        crc = (crc >> 4) ^ tab[0][crc & 15];
        }
#elif CRC_LE_BITS == 8
        /* aka Sarwate algorithm: one 8-bit lookup per input byte */
        for (; len; len--, p++) {
                crc ^= *p;
                crc = (crc >> 8) ^ tab[0][crc & 255];
        }
#else
        /* crc32_body() works on the little-endian form of the value. */
        u32 state = (__force u32) __cpu_to_le32(crc);

        state = crc32_body(state, p, len, tab);
        crc = __le32_to_cpu((__force __le32)state);
#endif
        return crc;
}

/*
 * Exported little-endian entry points.  Each one is a thin wrapper that
 * forwards its arguments to crc32_le_generic() together with the lookup
 * table and polynomial for CRC32 (crc32_le) or CRC32C (__crc32c_le).  In the
 * CRC_LE_BITS == 1 configuration NULL is passed instead of a table, because
 * that branch of crc32_le_generic() only reads the polynomial.
 *
 * Both functions are declared __weak.
 * NOTE(review): presumably so that an architecture-specific implementation
 * can take their place at link time -- confirm against the arch code.
 */
#if CRC_LE_BITS == 1
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
}
#else
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_le_generic(crc, p, len,
                        (const u32 (*)[256])crc32table_le, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_le_generic(crc, p, len,
                        (const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE);
}
#endif
EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(__crc32c_le);

/*
 * crc32_le_base / __crc32c_le_base are declared as aliases of the two
 * functions defined above.
 * NOTE(review): presumably these give callers a name that still reaches the
 * generic C code when the weak symbols are overridden -- confirm.
 */
u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);

/*
 * gf2_multiply - multiply two polynomials over GF(2), reduced by @modulus.
 *
 * Operands and result use the "little-endian" CRC bit convention: the
 * least significant bit holds the coefficient of the highest power of x
 * and the most significant bit holds the coefficient of x^0.
 *
 * The bits of @x are consumed from bit 0 to bit 31.  Between two bits the
 * accumulator is advanced by one position (a right shift, with @modulus
 * XORed in whenever a set bit drops off the low end), and @y is XORed in
 * for every set bit of @x.
 */
static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus)
{
        u32 acc = 0;
        int remaining = 31;

        /* bit 0 of x is handled before the first reduction step */
        if (x & 1)
                acc = y;

        while (remaining-- > 0) {
                u32 dropped = acc & 1;

                acc >>= 1;
                if (dropped)
                        acc ^= modulus;

                x >>= 1;
                if (x & 1)
                        acc ^= y;
        }

        return acc;
}

/**
 * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time
 * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient)
 * @len: The number of bytes. @crc is multiplied by x^(8*@len)
 * @polynomial: The modulus used to reduce the result to 32 bits.
 *
 * CRCs computed over separate ranges of a buffer can be combined if the
 * CRC of the earlier range is first advanced past the later one.  This
 * function performs that advance: it produces the same value as feeding
 * @len zero bytes after @crc, but takes time proportional to log(@len).
 *
 * Return: the shifted CRC.
 */
static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len,
                                                   u32 polynomial)
{
        u32 x_pow = polynomial;                /* x^32 modulo the polynomial */
        int bits_left = 8 * (int)(len & 3);
        size_t words = len >> 2;

        /* The 0-3 odd bytes (at most 24 bits) are shifted one bit at a time */
        while (bits_left-- > 0) {
                if (crc & 1)
                        crc = (crc >> 1) ^ polynomial;
                else
                        crc >>= 1;
        }

        /*
         * The remaining whole 32-bit words are handled by square-and-
         * multiply: x_pow holds x^(32 * 2^k) for the k-th bit of "words".
         */
        while (words) {
                if (words & 1)
                        crc = gf2_multiply(crc, x_pow, polynomial);

                words >>= 1;
                /* skip the final squaring once no bits are left */
                if (words)
                        x_pow = gf2_multiply(x_pow, x_pow, polynomial);
        }

        return crc;
}

/*
 * crc32_le_shift - crc32_generic_shift() using the CRC32 polynomial
 * @crc: little-endian CRC to advance
 * @len: number of zero bytes to advance it by
 *
 * Returns the value crc32_generic_shift() computes with CRC32_POLY_LE.
 */
u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len)
{
        return crc32_generic_shift(crc, len, CRC32_POLY_LE);
}

/*
 * __crc32c_le_shift - crc32_generic_shift() using the CRC32C polynomial
 * @crc: little-endian CRC to advance
 * @len: number of zero bytes to advance it by
 *
 * Returns the value crc32_generic_shift() computes with CRC32C_POLY_LE.
 */
u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len)
{
        return crc32_generic_shift(crc, len, CRC32C_POLY_LE);
}
EXPORT_SYMBOL(crc32_le_shift);
EXPORT_SYMBOL(__crc32c_le_shift);

/**
 * crc32_be_generic() - Calculate big-endian Ethernet AUTODIN II CRC32
 * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
 *        other uses, or the previous crc32 value if computing incrementally.
 * @p: pointer to buffer over which CRC32 is run
 * @len: length of buffer @p
 * @tab: big-endian lookup table(s); not read when CRC_BE_BITS == 1
 * @polynomial: CRC32 BE polynomial; only read when CRC_BE_BITS == 1
 *
 * CRC_BE_BITS picks the implementation at compile time: 1 processes the
 * input bit by bit, 2/4/8 consume that many bits per table lookup, and any
 * other value defers to crc32_body().
 *
 * Each input byte is widened to u32 before it is moved into the top byte
 * of the CRC.  Without the cast the byte is promoted to (signed) int, and
 * shifting a value >= 0x80 left by 24 would shift into the sign bit, which
 * ISO C leaves undefined.
 *
 * Return: the updated CRC value.
 */
static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
                                          size_t len, const u32 (*tab)[256],
                                          u32 polynomial)
{
#if CRC_BE_BITS == 1
        int i;
        while (len--) {
                crc ^= (u32)*p++ << 24;
                /* one input bit per step; the polynomial replaces the table */
                for (i = 0; i < 8; i++)
                        crc =
                            (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
                                          0);
        }
# elif CRC_BE_BITS == 2
        /* four 2-bit lookups per input byte, indexed by the top bits */
        while (len--) {
                crc ^= (u32)*p++ << 24;
                crc = (crc << 2) ^ tab[0][crc >> 30];
                crc = (crc << 2) ^ tab[0][crc >> 30];
                crc = (crc << 2) ^ tab[0][crc >> 30];
                crc = (crc << 2) ^ tab[0][crc >> 30];
        }
# elif CRC_BE_BITS == 4
        /* two 4-bit lookups per input byte */
        while (len--) {
                crc ^= (u32)*p++ << 24;
                crc = (crc << 4) ^ tab[0][crc >> 28];
                crc = (crc << 4) ^ tab[0][crc >> 28];
        }
# elif CRC_BE_BITS == 8
        /* one 8-bit lookup per input byte */
        while (len--) {
                crc ^= (u32)*p++ << 24;
                crc = (crc << 8) ^ tab[0][crc >> 24];
        }
# else
        /* crc32_body() works on the big-endian form of the value */
        crc = (__force u32) __cpu_to_be32(crc);
        crc = crc32_body(crc, p, len, tab);
        crc = __be32_to_cpu((__force __be32)crc);
# endif
        return crc;
}

/*
 * crc32_be - exported big-endian CRC32 entry point
 * @crc: seed or previous CRC value
 * @p: buffer to run the CRC over
 * @len: number of bytes at @p
 *
 * Thin wrapper around crc32_be_generic() supplying CRC32_POLY_BE and, unless
 * CRC_BE_BITS == 1 (where only the polynomial is read), crc32table_be.
 * Unlike crc32_le() this function is not declared __weak.
 */
#if CRC_BE_BITS == 1
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
}
#else
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
{
        return crc32_be_generic(crc, p, len,
                        (const u32 (*)[256])crc32table_be, CRC32_POLY_BE);
}
#endif
EXPORT_SYMBOL(crc32_be);


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * This file holds USB constants and structures that are needed for
 * USB device APIs.  These are used by the USB device model, which is
 * defined in chapter 9 of the USB 2.0 specification and in the
 * Wireless USB 1.0 (spread around).  Linux has several APIs in C that
 * need these:
 *
 * - the master/host side Linux-USB kernel driver API;
 * - the "usbfs" user space API; and
 * - the Linux "gadget" slave/device/peripheral side driver API.
 *
 * USB 2.0 adds an additional "On The Go" (OTG) mode, which lets systems
 * act either as a USB master/host or as a USB slave/device.  That means
 * the master and slave side APIs benefit from working well together.
 *
 * There's also "Wireless USB", using low power short range radios for
 * peripheral interconnection but otherwise building on the USB framework.
 *
 * Note all descriptors are declared '__attribute__((packed))' so that:
 *
 * [a] they never get padded, either internally (USB spec writers
 *     probably handled that) or externally;
 *
 * [b] so that accessing bigger-than-a-byte fields will never
 *     generate bus errors on any platform, even when the location of
 *     its descriptor inside a bundle isn't "naturally aligned", and
 *
 * [c] for consistency, removing all doubt even when it appears to
 *     someone that the two other points are non-issues for that
 *     particular descriptor type.
 */

#ifndef _UAPI__LINUX_USB_CH9_H
#define _UAPI__LINUX_USB_CH9_H

#include <linux/types.h>        /* __u8 etc */
#include <asm/byteorder.h>        /* le16_to_cpu */

/*-------------------------------------------------------------------------*/

/* CONTROL REQUEST SUPPORT */

/*
 * USB directions
 *
 * This bit flag is used in endpoint descriptors' bEndpointAddress field.
 * It's also one of three fields in control requests bRequestType.
 */
#define USB_DIR_OUT                        0                /* to device */
#define USB_DIR_IN                        0x80                /* to host */

/*
 * USB types, the second of three bRequestType fields
 */
#define USB_TYPE_MASK                        (0x03 << 5)
#define USB_TYPE_STANDARD                (0x00 << 5)
#define USB_TYPE_CLASS                        (0x01 << 5)
#define USB_TYPE_VENDOR                        (0x02 << 5)
#define USB_TYPE_RESERVED                (0x03 << 5)

/*
 * USB recipients, the third of three bRequestType fields
 */
#define USB_RECIP_MASK                        0x1f
#define USB_RECIP_DEVICE                0x00
#define USB_RECIP_INTERFACE                0x01
#define USB_RECIP_ENDPOINT                0x02
#define USB_RECIP_OTHER                        0x03
/* From Wireless USB 1.0 */
#define USB_RECIP_PORT                        0x04
#define USB_RECIP_RPIPE                0x05

/*
 * Standard requests, for the bRequest field of a SETUP packet.
 *
 * These are qualified by the bRequestType field, so that for example
 * TYPE_CLASS or TYPE_VENDOR specific feature flags could be retrieved
 * by a GET_STATUS request.
 */
#define USB_REQ_GET_STATUS                0x00
#define USB_REQ_CLEAR_FEATURE                0x01
#define USB_REQ_SET_FEATURE                0x03
#define USB_REQ_SET_ADDRESS                0x05
#define USB_REQ_GET_DESCRIPTOR                0x06
#define USB_REQ_SET_DESCRIPTOR                0x07
#define USB_REQ_GET_CONFIGURATION        0x08
#define USB_REQ_SET_CONFIGURATION        0x09
#define USB_REQ_GET_INTERFACE                0x0A
#define USB_REQ_SET_INTERFACE                0x0B
#define USB_REQ_SYNCH_FRAME                0x0C
#define USB_REQ_SET_SEL                        0x30
#define USB_REQ_SET_ISOCH_DELAY                0x31

#define USB_REQ_SET_ENCRYPTION                0x0D        /* Wireless USB */
#define USB_REQ_GET_ENCRYPTION                0x0E
#define USB_REQ_RPIPE_ABORT                0x0E
#define USB_REQ_SET_HANDSHAKE                0x0F
#define USB_REQ_RPIPE_RESET                0x0F
#define USB_REQ_GET_HANDSHAKE                0x10
#define USB_REQ_SET_CONNECTION                0x11
#define USB_REQ_SET_SECURITY_DATA        0x12
#define USB_REQ_GET_SECURITY_DATA        0x13
#define USB_REQ_SET_WUSB_DATA                0x14
#define USB_REQ_LOOPBACK_DATA_WRITE        0x15
#define USB_REQ_LOOPBACK_DATA_READ        0x16
#define USB_REQ_SET_INTERFACE_DS        0x17

/*
 * specific requests for USB Power Delivery
 *
 * These values are written in decimal, unlike the hexadecimal request codes
 * above.  20..23 are numerically equal to 0x14..0x17, i.e. they coincide
 * with USB_REQ_SET_WUSB_DATA, USB_REQ_LOOPBACK_DATA_WRITE,
 * USB_REQ_LOOPBACK_DATA_READ and USB_REQ_SET_INTERFACE_DS.
 * NOTE(review): presumably disambiguated by bRequestType / device kind --
 * confirm before relying on bRequest alone.
 */
#define USB_REQ_GET_PARTNER_PDO                20
#define USB_REQ_GET_BATTERY_STATUS        21
#define USB_REQ_SET_PDO                        22
#define USB_REQ_GET_VDM                        23
#define USB_REQ_SEND_VDM                24

/* The Link Power Management (LPM) ECN defines USB_REQ_TEST_AND_SET command,
 * used by hubs to put ports into a new L1 suspend state, except that it
 * forgot to define its number ...
 */

/*
 * USB feature flags are written using USB_REQ_{CLEAR,SET}_FEATURE, and
 * are read as a bit array returned by USB_REQ_GET_STATUS.  (So there
 * are at most sixteen features of each type.)  Hubs may also support a
 * new USB_REQ_TEST_AND_SET_FEATURE to put ports into L1 suspend.
 */
#define USB_DEVICE_SELF_POWERED                0        /* (read only) */
#define USB_DEVICE_REMOTE_WAKEUP        1        /* dev may initiate wakeup */
#define USB_DEVICE_TEST_MODE                2        /* (wired high speed only) */
#define USB_DEVICE_BATTERY                2        /* (wireless) */
#define USB_DEVICE_B_HNP_ENABLE                3        /* (otg) dev may initiate HNP */
#define USB_DEVICE_WUSB_DEVICE                3        /* (wireless)*/
#define USB_DEVICE_A_HNP_SUPPORT        4        /* (otg) RH port supports HNP */
#define USB_DEVICE_A_ALT_HNP_SUPPORT        5        /* (otg) other RH port does */
#define USB_DEVICE_DEBUG_MODE                6        /* (special devices only) */

/*
 * Test Mode Selectors
 * See USB 2.0 spec Table 9-7
 */
#define        USB_TEST_J                1
#define        USB_TEST_K                2
#define        USB_TEST_SE0_NAK        3
#define        USB_TEST_PACKET                4
#define        USB_TEST_FORCE_ENABLE        5

/* Status Type */
#define USB_STATUS_TYPE_STANDARD        0
#define USB_STATUS_TYPE_PTM                1

/*
 * New Feature Selectors as added by USB 3.0
 * See USB 3.0 spec Table 9-7
 */
#define USB_DEVICE_U1_ENABLE        48        /* dev may initiate U1 transition */
#define USB_DEVICE_U2_ENABLE        49        /* dev may initiate U2 transition */
#define USB_DEVICE_LTM_ENABLE        50        /* dev may send LTM */
#define USB_INTRF_FUNC_SUSPEND        0        /* function suspend */

#define USB_INTR_FUNC_SUSPEND_OPT_MASK        0xFF00
/*
 * Suspend Options, Table 9-8 USB 3.0 spec
 */
#define USB_INTRF_FUNC_SUSPEND_LP        (1 << (8 + 0))
#define USB_INTRF_FUNC_SUSPEND_RW        (1 << (8 + 1))

/*
 * Interface status, Figure 9-5 USB 3.0 spec
 */
#define USB_INTRF_STAT_FUNC_RW_CAP     1
#define USB_INTRF_STAT_FUNC_RW         2

#define USB_ENDPOINT_HALT                0        /* IN/OUT will STALL */

/* Bit array elements as returned by the USB_REQ_GET_STATUS request. */
#define USB_DEV_STAT_U1_ENABLED                2        /* transition into U1 state */
#define USB_DEV_STAT_U2_ENABLED                3        /* transition into U2 state */
#define USB_DEV_STAT_LTM_ENABLED        4        /* Latency tolerance messages */

/*
 * Feature selectors from Table 9-8 USB Power Delivery spec
 */
#define USB_DEVICE_BATTERY_WAKE_MASK        40
#define USB_DEVICE_OS_IS_PD_AWARE        41
#define USB_DEVICE_POLICY_MODE                42
#define USB_PORT_PR_SWAP                43
#define USB_PORT_GOTO_MIN                44
#define USB_PORT_RETURN_POWER                45
#define USB_PORT_ACCEPT_PD_REQUEST        46
#define USB_PORT_REJECT_PD_REQUEST        47
#define USB_PORT_PORT_PD_RESET                48
#define USB_PORT_C_PORT_PD_CHANGE        49
#define USB_PORT_CABLE_PD_RESET                50
#define USB_DEVICE_CHARGING_POLICY        54

/**
 * struct usb_ctrlrequest - SETUP data for a USB device control request
 * @bRequestType: matches the USB bmRequestType field; combines a direction
 *	(USB_DIR_*), a type (USB_TYPE_*) and a recipient (USB_RECIP_*)
 * @bRequest: matches the USB bRequest field (e.g. a USB_REQ_* code)
 * @wValue: matches the USB wValue field (le16 byte order)
 * @wIndex: matches the USB wIndex field (le16 byte order)
 * @wLength: matches the USB wLength field (le16 byte order)
 *
 * This structure is used to send control requests to a USB device.  It matches
 * the different fields of the USB 2.0 Spec section 9.3, table 9-2.  See the
 * USB spec for a fuller description of the different fields, and what they are
 * used for.
 *
 * The structure is packed, so its five fields occupy exactly 8 bytes
 * (1 + 1 + 2 + 2 + 2) with no padding.
 *
 * Note that the driver for any interface can issue control requests.
 * For most devices, interfaces don't coordinate with each other, so
 * such requests may be made at any time.
 */
struct usb_ctrlrequest {
        __u8 bRequestType;
        __u8 bRequest;
        __le16 wValue;
        __le16 wIndex;
        __le16 wLength;
} __attribute__ ((packed));

/*-------------------------------------------------------------------------*/

/*
 * STANDARD DESCRIPTORS ... as returned by GET_DESCRIPTOR, or
 * (rarely) accepted by SET_DESCRIPTOR.
 *
 * Note that all multi-byte values here are encoded in little endian
 * byte order "on the wire".  Within the kernel and when exposed
 * through the Linux-USB APIs, they are not converted to cpu byte
 * order; it is the responsibility of the client code to do this.
 * The single exception is when device and configuration descriptors (but
 * not other descriptors) are read from character devices
 * (i.e. /dev/bus/usb/BBB/DDD);
 * in this case the fields are converted to host endianness by the kernel.
 */

/*
 * Descriptor types ... USB 2.0 spec table 9.5
 */
#define USB_DT_DEVICE                        0x01
#define USB_DT_CONFIG                        0x02
#define USB_DT_STRING                        0x03
#define USB_DT_INTERFACE                0x04
#define USB_DT_ENDPOINT                        0x05
#define USB_DT_DEVICE_QUALIFIER                0x06
#define USB_DT_OTHER_SPEED_CONFIG        0x07
#define USB_DT_INTERFACE_POWER                0x08
/* these are from a minor usb 2.0 revision (ECN) */
#define USB_DT_OTG                        0x09
#define USB_DT_DEBUG                        0x0a
#define USB_DT_INTERFACE_ASSOCIATION        0x0b
/* these are from the Wireless USB spec */
#define USB_DT_SECURITY                        0x0c
#define USB_DT_KEY                        0x0d
#define USB_DT_ENCRYPTION_TYPE                0x0e
#define USB_DT_BOS                        0x0f
#define USB_DT_DEVICE_CAPABILITY        0x10
#define USB_DT_WIRELESS_ENDPOINT_COMP        0x11
#define USB_DT_WIRE_ADAPTER                0x21
#define USB_DT_RPIPE                        0x22
#define USB_DT_CS_RADIO_CONTROL                0x23
/* From the T10 UAS specification */
#define USB_DT_PIPE_USAGE                0x24
/* From the USB 3.0 spec */
#define        USB_DT_SS_ENDPOINT_COMP                0x30
/* From the USB 3.1 spec */
#define        USB_DT_SSP_ISOC_ENDPOINT_COMP        0x31

/* Conventional codes for class-specific descriptors.  The convention is
 * defined in the USB "Common Class" Spec (3.11).  Individual class specs
 * are authoritative for their usage, not the "common class" writeup.
 */
#define USB_DT_CS_DEVICE                (USB_TYPE_CLASS | USB_DT_DEVICE)
#define USB_DT_CS_CONFIG                (USB_TYPE_CLASS | USB_DT_CONFIG)
#define USB_DT_CS_STRING                (USB_TYPE_CLASS | USB_DT_STRING)
#define USB_DT_CS_INTERFACE                (USB_TYPE_CLASS | USB_DT_INTERFACE)
#define USB_DT_CS_ENDPOINT                (USB_TYPE_CLASS | USB_DT_ENDPOINT)

/*
 * All standard descriptors have these 2 fields at the beginning.
 *
 * bLength carries the descriptor's size in bytes (the USB_DT_*_SIZE
 * constants in this file are the values used for the fixed-size
 * descriptors) and bDescriptorType carries one of the USB_DT_* codes.
 */
struct usb_descriptor_header {
        __u8  bLength;
        __u8  bDescriptorType;
} __attribute__ ((packed));


/*-------------------------------------------------------------------------*/

/*
 * USB_DT_DEVICE: Device descriptor
 *
 * Starts with the two common header fields.  The __le16 members are kept in
 * little-endian byte order, as described in the "STANDARD DESCRIPTORS"
 * comment in this file.  bDeviceClass takes one of the USB_CLASS_* codes
 * defined in this file.  The packed field sizes add up to 18 bytes, which
 * is the value of USB_DT_DEVICE_SIZE.
 *
 * NOTE(review): iManufacturer, iProduct and iSerialNumber are presumably
 * string descriptor indices -- confirm against the USB 2.0 spec.
 */
struct usb_device_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 bcdUSB;
        __u8  bDeviceClass;
        __u8  bDeviceSubClass;
        __u8  bDeviceProtocol;
        __u8  bMaxPacketSize0;
        __le16 idVendor;
        __le16 idProduct;
        __le16 bcdDevice;
        __u8  iManufacturer;
        __u8  iProduct;
        __u8  iSerialNumber;
        __u8  bNumConfigurations;
} __attribute__ ((packed));

#define USB_DT_DEVICE_SIZE                18


/*
 * Device and/or Interface Class codes
 * as found in bDeviceClass or bInterfaceClass
 * and defined by www.usb.org documents
 */
#define USB_CLASS_PER_INTERFACE                0        /* for DeviceClass */
#define USB_CLASS_AUDIO                        1
#define USB_CLASS_COMM                        2
#define USB_CLASS_HID                        3
#define USB_CLASS_PHYSICAL                5
#define USB_CLASS_STILL_IMAGE                6
#define USB_CLASS_PRINTER                7
#define USB_CLASS_MASS_STORAGE                8
#define USB_CLASS_HUB                        9
#define USB_CLASS_CDC_DATA                0x0a
#define USB_CLASS_CSCID                        0x0b        /* chip+ smart card */
#define USB_CLASS_CONTENT_SEC                0x0d        /* content security */
#define USB_CLASS_VIDEO                        0x0e
#define USB_CLASS_WIRELESS_CONTROLLER        0xe0
#define USB_CLASS_PERSONAL_HEALTHCARE        0x0f
#define USB_CLASS_AUDIO_VIDEO                0x10
#define USB_CLASS_BILLBOARD                0x11
#define USB_CLASS_USB_TYPE_C_BRIDGE        0x12
#define USB_CLASS_MISC                        0xef
#define USB_CLASS_APP_SPEC                0xfe
#define USB_CLASS_VENDOR_SPEC                0xff

#define USB_SUBCLASS_VENDOR_SPEC        0xff

/*-------------------------------------------------------------------------*/

/* USB_DT_CONFIG: Configuration descriptor information.
 *
 * USB_DT_OTHER_SPEED_CONFIG is the same descriptor, except that the
 * descriptor type is different.  Highspeed-capable devices can look
 * different depending on what speed they're currently running.  Only
 * devices with a USB_DT_DEVICE_QUALIFIER have any OTHER_SPEED_CONFIG
 * descriptors.
 *
 * wTotalLength is little-endian.  bmAttributes is a bit field built from
 * the USB_CONFIG_ATT_* flags.  The packed field sizes add up to 9 bytes,
 * which is the value of USB_DT_CONFIG_SIZE.
 */
struct usb_config_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 wTotalLength;
        __u8  bNumInterfaces;
        __u8  bConfigurationValue;
        __u8  iConfiguration;
        __u8  bmAttributes;
        __u8  bMaxPower;
} __attribute__ ((packed));

#define USB_DT_CONFIG_SIZE                9

/* from config descriptor bmAttributes */
#define USB_CONFIG_ATT_ONE                (1 << 7)        /* must be set */
#define USB_CONFIG_ATT_SELFPOWER        (1 << 6)        /* self powered */
#define USB_CONFIG_ATT_WAKEUP                (1 << 5)        /* can wakeup */
#define USB_CONFIG_ATT_BATTERY                (1 << 4)        /* battery powered */

/*-------------------------------------------------------------------------*/

/* USB String descriptors can contain at most 126 characters. */
#define USB_MAX_STRING_LEN        126

/*
 * USB_DT_STRING: String descriptor
 *
 * wData is declared with a single element, so sizeof() this structure
 * covers the two header bytes plus one UTF-16LE code unit.
 * NOTE(review): presumably the real number of code units follows from
 * bLength rather than from the declared array size -- confirm at the users.
 */
struct usb_string_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 wData[1];                /* UTF-16LE encoded */
} __attribute__ ((packed));

/* note that "string" zero is special, it holds language codes that
 * the device supports, not Unicode characters.
 */

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_INTERFACE: Interface descriptor
 *
 * Every member is a single byte; the nine of them give the 9 bytes of
 * USB_DT_INTERFACE_SIZE.  bInterfaceClass takes one of the USB_CLASS_*
 * codes defined in this file.
 */
struct usb_interface_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bInterfaceNumber;
        __u8  bAlternateSetting;
        __u8  bNumEndpoints;
        __u8  bInterfaceClass;
        __u8  bInterfaceSubClass;
        __u8  bInterfaceProtocol;
        __u8  iInterface;
} __attribute__ ((packed));

#define USB_DT_INTERFACE_SIZE                9

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_ENDPOINT: Endpoint descriptor
 *
 * bEndpointAddress holds the endpoint number (USB_ENDPOINT_NUMBER_MASK) and
 * the direction bit (USB_ENDPOINT_DIR_MASK); bmAttributes holds the transfer
 * type (USB_ENDPOINT_XFERTYPE_MASK) among other bits; wMaxPacketSize is
 * little-endian.
 *
 * The fields up to and including bInterval take 7 bytes
 * (USB_DT_ENDPOINT_SIZE); with the two audio-only trailing bytes the
 * structure is 9 bytes (USB_DT_ENDPOINT_AUDIO_SIZE).
 */
struct usb_endpoint_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bEndpointAddress;
        __u8  bmAttributes;
        __le16 wMaxPacketSize;
        __u8  bInterval;

        /* NOTE:  these two are _only_ in audio endpoints. */
        /* use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof. */
        __u8  bRefresh;
        __u8  bSynchAddress;
} __attribute__ ((packed));

#define USB_DT_ENDPOINT_SIZE                7
#define USB_DT_ENDPOINT_AUDIO_SIZE        9        /* Audio extension */


/*
 * Endpoints
 */

/* Fields of bEndpointAddress. */
#define USB_ENDPOINT_NUMBER_MASK        0x0f        /* endpoint number */
#define USB_ENDPOINT_DIR_MASK                0x80        /* direction bit */

/* Transfer type, held in the low bits of bmAttributes. */
#define USB_ENDPOINT_XFERTYPE_MASK        0x03
#define USB_ENDPOINT_XFER_CONTROL        0x00
#define USB_ENDPOINT_XFER_ISOC                0x01
#define USB_ENDPOINT_XFER_BULK                0x02
#define USB_ENDPOINT_XFER_INT                0x03
#define USB_ENDPOINT_MAX_ADJUSTABLE        0x80

/*
 * Packet size occupies bits 10:0 of the max-packet word; the two bits
 * above it (12:11) are extracted by USB_EP_MAXP_MULT().
 */
#define USB_ENDPOINT_MAXP_MASK        0x07ff
#define USB_EP_MAXP_MULT_SHIFT        11
#define USB_EP_MAXP_MULT_MASK        (0x3 << USB_EP_MAXP_MULT_SHIFT)
#define USB_EP_MAXP_MULT(m)        \
        (((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT)

/* The USB 3.0 spec redefines bits 5:4 of bmAttributes as interrupt ep type. */
#define USB_ENDPOINT_INTRTYPE                0x30
#define USB_ENDPOINT_INTR_PERIODIC        0x00
#define USB_ENDPOINT_INTR_NOTIFICATION        0x10

/* Synchronisation type: a two-bit field selected by mask 0x0c (bits 3:2). */
#define USB_ENDPOINT_SYNCTYPE                0x0c
#define USB_ENDPOINT_SYNC_NONE                0x00
#define USB_ENDPOINT_SYNC_ASYNC                0x04
#define USB_ENDPOINT_SYNC_ADAPTIVE        0x08
#define USB_ENDPOINT_SYNC_SYNC                0x0c

/* Usage type: a two-bit field selected by mask 0x30 (bits 5:4). */
#define USB_ENDPOINT_USAGE_MASK                0x30
#define USB_ENDPOINT_USAGE_DATA                0x00
#define USB_ENDPOINT_USAGE_FEEDBACK        0x10
#define USB_ENDPOINT_USAGE_IMPLICIT_FB        0x20        /* Implicit feedback Data endpoint */

/*-------------------------------------------------------------------------*/

/**
 * usb_endpoint_num - extract the endpoint number from a descriptor
 * @epd: endpoint descriptor to inspect
 *
 * Masks @epd's bEndpointAddress with USB_ENDPOINT_NUMBER_MASK.
 *
 * Returns the endpoint number, a value from 0 to 15.
 */
static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd)
{
        const int address = epd->bEndpointAddress;

        return address & USB_ENDPOINT_NUMBER_MASK;
}

/**
 * usb_endpoint_type - extract the transfer type from a descriptor
 * @epd: endpoint descriptor to inspect
 *
 * Masks @epd's bmAttributes with USB_ENDPOINT_XFERTYPE_MASK.
 *
 * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT}.
 */
static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd)
{
        const int attributes = epd->bmAttributes;

        return attributes & USB_ENDPOINT_XFERTYPE_MASK;
}

/**
 * usb_endpoint_dir_in - test whether the endpoint direction is IN
 * @epd: endpoint descriptor to inspect
 *
 * Compares the direction bit of @epd's bEndpointAddress with USB_DIR_IN.
 *
 * Returns true (1) for an IN endpoint, false (0) otherwise.
 */
static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd)
{
        const int direction = epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK;

        return direction == USB_DIR_IN;
}

/**
 * usb_endpoint_dir_out - test whether the endpoint direction is OUT
 * @epd: endpoint descriptor to inspect
 *
 * Compares the direction bit of @epd's bEndpointAddress with USB_DIR_OUT.
 *
 * Returns true (1) for an OUT endpoint, false (0) otherwise.
 */
static inline int usb_endpoint_dir_out(
                                const struct usb_endpoint_descriptor *epd)
{
        const int direction = epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK;

        return direction == USB_DIR_OUT;
}

/**
 * usb_endpoint_xfer_bulk - test for the bulk transfer type
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) when the transfer-type bits of @epd's bmAttributes equal
 * USB_ENDPOINT_XFER_BULK, false (0) otherwise.
 */
static inline int usb_endpoint_xfer_bulk(
                                const struct usb_endpoint_descriptor *epd)
{
        const int xfer_type = epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK;

        return xfer_type == USB_ENDPOINT_XFER_BULK;
}

/**
 * usb_endpoint_xfer_control - test for the control transfer type
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) when the transfer-type bits of @epd's bmAttributes equal
 * USB_ENDPOINT_XFER_CONTROL, false (0) otherwise.
 */
static inline int usb_endpoint_xfer_control(
                                const struct usb_endpoint_descriptor *epd)
{
        const int xfer_type = epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK;

        return xfer_type == USB_ENDPOINT_XFER_CONTROL;
}

/**
 * usb_endpoint_xfer_int - test for the interrupt transfer type
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) when the transfer-type bits of @epd's bmAttributes equal
 * USB_ENDPOINT_XFER_INT, false (0) otherwise.
 */
static inline int usb_endpoint_xfer_int(
                                const struct usb_endpoint_descriptor *epd)
{
        const int xfer_type = epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK;

        return xfer_type == USB_ENDPOINT_XFER_INT;
}

/**
 * usb_endpoint_xfer_isoc - test for the isochronous transfer type
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) when the transfer-type bits of @epd's bmAttributes equal
 * USB_ENDPOINT_XFER_ISOC, false (0) otherwise.
 */
static inline int usb_endpoint_xfer_isoc(
                                const struct usb_endpoint_descriptor *epd)
{
        const int xfer_type = epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK;

        return xfer_type == USB_ENDPOINT_XFER_ISOC;
}

/**
 * usb_endpoint_is_bulk_in - test for a bulk IN endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of bulk transfer type and of IN
 * direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_bulk_in(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_bulk(epd))
                return 0;

        return usb_endpoint_dir_in(epd) != 0;
}

/**
 * usb_endpoint_is_bulk_out - test for a bulk OUT endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of bulk transfer type and of OUT
 * direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_bulk_out(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_bulk(epd))
                return 0;

        return usb_endpoint_dir_out(epd) != 0;
}

/**
 * usb_endpoint_is_int_in - test for an interrupt IN endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of interrupt transfer type and of
 * IN direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_int_in(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_int(epd))
                return 0;

        return usb_endpoint_dir_in(epd) != 0;
}

/**
 * usb_endpoint_is_int_out - test for an interrupt OUT endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of interrupt transfer type and of
 * OUT direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_int_out(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_int(epd))
                return 0;

        return usb_endpoint_dir_out(epd) != 0;
}

/**
 * usb_endpoint_is_isoc_in - test for an isochronous IN endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of isochronous transfer type and
 * of IN direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_isoc_in(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_isoc(epd))
                return 0;

        return usb_endpoint_dir_in(epd) != 0;
}

/**
 * usb_endpoint_is_isoc_out - test for an isochronous OUT endpoint
 * @epd: endpoint descriptor to inspect
 *
 * Returns true (1) only when @epd is both of isochronous transfer type and
 * of OUT direction, false (0) otherwise.
 */
static inline int usb_endpoint_is_isoc_out(
                                const struct usb_endpoint_descriptor *epd)
{
        /* The direction is looked at only once the type has matched. */
        if (!usb_endpoint_xfer_isoc(epd))
                return 0;

        return usb_endpoint_dir_out(epd) != 0;
}

/**
 * usb_endpoint_maxp - read the endpoint's maximum packet size
 * @epd: endpoint descriptor to inspect
 *
 * Converts @epd's little-endian wMaxPacketSize to CPU byte order and keeps
 * only the bits covered by USB_ENDPOINT_MAXP_MASK.
 *
 * Returns bits [10:0] of wMaxPacketSize.
 */
static inline int usb_endpoint_maxp(const struct usb_endpoint_descriptor *epd)
{
        const int raw = __le16_to_cpu(epd->wMaxPacketSize);

        return raw & USB_ENDPOINT_MAXP_MASK;
}

/**
 * usb_endpoint_maxp_mult - read the endpoint's transaction opportunities
 * @epd: endpoint descriptor to inspect
 *
 * Converts @epd's little-endian wMaxPacketSize to CPU byte order and
 * extracts the two-bit field above the packet size.
 *
 * Returns wMaxPacketSize[12:11] + 1, i.e. a value from 1 to 4.
 */
static inline int
usb_endpoint_maxp_mult(const struct usb_endpoint_descriptor *epd)
{
        return 1 + USB_EP_MAXP_MULT(__le16_to_cpu(epd->wMaxPacketSize));
}

/**
 * usb_endpoint_interrupt_type - read the interrupt endpoint type bits
 * @epd: endpoint descriptor to inspect
 *
 * Returns @epd's bmAttributes masked with USB_ENDPOINT_INTRTYPE; the bits
 * are left in place, not shifted down.
 */
static inline int usb_endpoint_interrupt_type(
                const struct usb_endpoint_descriptor *epd)
{
        const int attributes = epd->bmAttributes;

        return attributes & USB_ENDPOINT_INTRTYPE;
}

/*-------------------------------------------------------------------------*/

/* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion
 * descriptor
 *
 * Packed layout of 1 + 1 + 2 + 4 bytes, the value recorded in
 * USB_DT_SSP_ISOC_EP_COMP_SIZE below.
 */
struct usb_ssp_isoc_ep_comp_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;
        /* sic: the spelling is part of the exported field name */
        __le16 wReseved;
        __le32 dwBytesPerInterval;
} __attribute__ ((packed));

#define USB_DT_SSP_ISOC_EP_COMP_SIZE                8

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor
 *
 * Packed layout of 1 + 1 + 1 + 1 + 2 bytes, the value recorded in
 * USB_DT_SS_EP_COMP_SIZE below.
 */
struct usb_ss_ep_comp_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bMaxBurst;
        /* decoded by usb_ss_max_streams(), USB_SS_MULT(), USB_SS_SSP_ISOC_COMP() */
        __u8  bmAttributes;
        __le16 wBytesPerInterval;
} __attribute__ ((packed));

#define USB_DT_SS_EP_COMP_SIZE                6

/**
 * usb_ss_max_streams - decode the stream count of a companion descriptor
 * @comp: SuperSpeed endpoint companion descriptor, may be NULL
 *
 * Bits 4:0 of bmAttributes (if this is a bulk endpoint) hold an exponent;
 * the stream count is two raised to that exponent.
 *
 * Returns 0 when @comp is NULL or the exponent field is zero, otherwise
 * 1 shifted left by the exponent.  An exponent of 31 does not fit in a
 * positive int; the conversion of that single value is left to the
 * implementation.
 */
static inline int
usb_ss_max_streams(const struct usb_ss_ep_comp_descriptor *comp)
{
        unsigned int        exponent;

        if (!comp)
                return 0;

        exponent = comp->bmAttributes & 0x1f;
        if (!exponent)
                return 0;

        /*
         * The five-bit field can hold 31, and shifting a signed 1 left by 31
         * is undefined behaviour in ISO C.  Shift an unsigned value instead
         * and convert the result afterwards.
         */
        return (int)(1U << exponent);
}

/* Isoc endpoints: bits 1:0 of bmAttributes, reported as a count from 1 to 4 */
#define USB_SS_MULT(p)                        (((p) & 0x3) + 1)
/* Bit 7 of bmAttributes if a SSP isoc endpoint companion descriptor exists */
#define USB_SS_SSP_ISOC_COMP(p)                ((p) & 0x80)

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_DEVICE_QUALIFIER: Device Qualifier descriptor
 *
 * Packed layout of ten bytes: eight single-byte fields plus the 16-bit
 * little-endian bcdUSB.
 */
struct usb_qualifier_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 bcdUSB;
        __u8  bDeviceClass;
        __u8  bDeviceSubClass;
        __u8  bDeviceProtocol;
        __u8  bMaxPacketSize0;
        __u8  bNumConfigurations;
        __u8  bRESERVED;
} __attribute__ ((packed));


/*-------------------------------------------------------------------------*/

/*
 * USB_DT_OTG (from OTG 1.0a supplement)
 *
 * Three-byte packed layout; bmAttributes carries the USB_OTG_* flag bits.
 */
struct usb_otg_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bmAttributes;        /* support for HNP, SRP, etc */
} __attribute__ ((packed));

/*
 * USB_DT_OTG (from OTG 2.0 supplement)
 *
 * Same first three bytes as struct usb_otg_descriptor, followed by a
 * 16-bit little-endian release number, five bytes in total.
 */
struct usb_otg20_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bmAttributes;        /* support for HNP, SRP and ADP, etc */
        __le16 bcdOTG;                /* OTG and EH supplement release number
                                 * in binary-coded decimal(i.e. 2.0 is 0200H)
                                 */
} __attribute__ ((packed));

/* Flag bits of usb_otg_descriptor.bmAttributes */
#define USB_OTG_SRP                0x01        /* bit 0 */
#define USB_OTG_HNP                0x02        /* bit 1: swap host/device roles */
#define USB_OTG_ADP                0x04        /* bit 2: support ADP */

#define OTG_STS_SELECTOR        0xf000                /* OTG status selector */
/*-------------------------------------------------------------------------*/

/*
 * USB_DT_DEBUG:  for special highspeed devices, replacing serial console
 *
 * Four-byte packed layout: the common two-byte header followed by one byte
 * for each of the two debug endpoints.
 */
struct usb_debug_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        /* bulk endpoints with 8 byte maxpacket */
        __u8  bDebugInEndpoint;
        __u8  bDebugOutEndpoint;
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_INTERFACE_ASSOCIATION: groups interfaces
 *
 * Eight single-byte fields in a packed struct, the value recorded in
 * USB_DT_INTERFACE_ASSOCIATION_SIZE below.
 */
struct usb_interface_assoc_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bFirstInterface;
        __u8  bInterfaceCount;
        __u8  bFunctionClass;
        __u8  bFunctionSubClass;
        __u8  bFunctionProtocol;
        __u8  iFunction;
} __attribute__ ((packed));

#define USB_DT_INTERFACE_ASSOCIATION_SIZE        8

/*-------------------------------------------------------------------------*/

/* USB_DT_SECURITY:  group of wireless security descriptors, including
 * encryption types available for setting up a CC/association.
 *
 * Five-byte packed header: the common two bytes, a 16-bit little-endian
 * total length and a one-byte count of encryption types.
 */
struct usb_security_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 wTotalLength;
        __u8  bNumEncryptionTypes;
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_DT_KEY:  used with {GET,SET}_SECURITY_DATA; only public keys
 * may be retrieved.
 *
 * A six-byte fixed header followed by a variable amount of key data.
 * The trailing data is declared as a C99 flexible array member, like
 * wPowerDataObject[] further down in this file, instead of the GNU
 * zero-length array extension; sizeof() and the field offsets are the
 * same either way.
 */
struct usb_key_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  tTKID[3];
        __u8  bReserved;
        __u8  bKeyData[];        /* key bytes, length not fixed by this struct */
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/*
 * USB_DT_ENCRYPTION_TYPE:  bundled in DT_SECURITY groups
 *
 * Five-byte packed layout.  The USB_ENC_TYPE_* constants defined inside
 * the struct body are the values listed for bEncryptionType.
 */
struct usb_encryption_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bEncryptionType;
#define        USB_ENC_TYPE_UNSECURE                0
#define        USB_ENC_TYPE_WIRED                1        /* non-wireless mode */
#define        USB_ENC_TYPE_CCM_1                2        /* aes128/cbc session */
#define        USB_ENC_TYPE_RSA_1                3        /* rsa3072/sha1 auth */
        __u8  bEncryptionValue;                /* use in SET_ENCRYPTION */
        __u8  bAuthKeyIndex;
} __attribute__((packed));


/*-------------------------------------------------------------------------*/

/*
 * USB_DT_BOS:  group of device-level capabilities
 *
 * Packed layout of 1 + 1 + 2 + 1 bytes, the value recorded in
 * USB_DT_BOS_SIZE below.
 */
struct usb_bos_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __le16 wTotalLength;
        __u8  bNumDeviceCaps;
} __attribute__((packed));

#define USB_DT_BOS_SIZE                5
/*-------------------------------------------------------------------------*/

/*
 * USB_DT_DEVICE_CAPABILITY:  grouped with BOS
 *
 * Three-byte prefix that every capability descriptor struct below starts
 * with; bDevCapabilityType is the field the *_CAP_TYPE constants apply to.
 */
struct usb_dev_cap_header {
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
} __attribute__((packed));

/* Capability type constant defined alongside the wireless descriptor below */
#define        USB_CAP_TYPE_WIRELESS_USB        1

/*
 * Packed layout of 11 bytes (USB_DT_USB_WIRELESS_CAP_SIZE).  The
 * USB_WIRELESS_* constants are defined next to the field whose bits they
 * name.
 */
struct usb_wireless_cap_descriptor {        /* Ultra Wide Band */
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;

        __u8  bmAttributes;
#define        USB_WIRELESS_P2P_DRD                (1 << 1)
#define        USB_WIRELESS_BEACON_MASK        (3 << 2)
#define        USB_WIRELESS_BEACON_SELF        (1 << 2)
#define        USB_WIRELESS_BEACON_DIRECTED        (2 << 2)
#define        USB_WIRELESS_BEACON_NONE        (3 << 2)
        __le16 wPHYRates;        /* bit rates, Mbps */
#define        USB_WIRELESS_PHY_53                (1 << 0)        /* always set */
#define        USB_WIRELESS_PHY_80                (1 << 1)
#define        USB_WIRELESS_PHY_107                (1 << 2)        /* always set */
#define        USB_WIRELESS_PHY_160                (1 << 3)
#define        USB_WIRELESS_PHY_200                (1 << 4)        /* always set */
#define        USB_WIRELESS_PHY_320                (1 << 5)
#define        USB_WIRELESS_PHY_400                (1 << 6)
#define        USB_WIRELESS_PHY_480                (1 << 7)
        __u8  bmTFITXPowerInfo;        /* TFI power levels */
        __u8  bmFFITXPowerInfo;        /* FFI power levels */
        __le16 bmBandGroup;
        __u8  bReserved;
} __attribute__((packed));

#define USB_DT_USB_WIRELESS_CAP_SIZE        11

/* USB 2.0 Extension descriptor */
#define        USB_CAP_TYPE_EXT                2

/*
 * Packed layout of 3 + 4 bytes (USB_DT_USB_EXT_CAP_SIZE).  The flag and
 * accessor macros in the struct body apply to bmAttributes: bits 1-4 are
 * single flags, bits 11:8 hold the baseline BESL value and bits 15:12 the
 * deep BESL value.
 */
struct usb_ext_cap_descriptor {                /* Link Power Management */
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
        __le32 bmAttributes;
#define USB_LPM_SUPPORT                        (1 << 1)        /* supports LPM */
#define USB_BESL_SUPPORT                (1 << 2)        /* supports BESL */
#define USB_BESL_BASELINE_VALID                (1 << 3)        /* Baseline BESL valid*/
#define USB_BESL_DEEP_VALID                (1 << 4)        /* Deep BESL valid */
#define USB_SET_BESL_BASELINE(p)        (((p) & 0xf) << 8)
#define USB_SET_BESL_DEEP(p)                (((p) & 0xf) << 12)
#define USB_GET_BESL_BASELINE(p)        (((p) & (0xf << 8)) >> 8)
#define USB_GET_BESL_DEEP(p)                (((p) & (0xf << 12)) >> 12)
} __attribute__((packed));

#define USB_DT_USB_EXT_CAP_SIZE        7

/*
 * SuperSpeed USB Capability descriptor: Defines the set of SuperSpeed USB
 * specific device level capabilities
 *
 * Packed layout of 10 bytes (USB_DT_USB_SS_CAP_SIZE).
 */
#define                USB_SS_CAP_TYPE                3
struct usb_ss_cap_descriptor {                /* SuperSpeed device capability */
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
        __u8  bmAttributes;
#define USB_LTM_SUPPORT                        (1 << 1) /* supports LTM */
        __le16 wSpeedSupported;
#define USB_LOW_SPEED_OPERATION                (1)         /* Low speed operation */
#define USB_FULL_SPEED_OPERATION        (1 << 1) /* Full speed operation */
#define USB_HIGH_SPEED_OPERATION        (1 << 2) /* High speed operation */
#define USB_5GBPS_OPERATION                (1 << 3) /* Operation at 5Gbps */
        __u8  bFunctionalitySupport;
        __u8  bU1devExitLat;
        /* 16 bits wide despite the 'b' prefix of the name */
        __le16 bU2DevExitLat;
} __attribute__((packed));

#define USB_DT_USB_SS_CAP_SIZE        10

/*
 * Container ID Capability descriptor: Defines the instance unique ID used to
 * identify the instance across all operating modes
 *
 * Packed layout of 4 + 16 bytes (USB_DT_USB_SS_CONTN_ID_SIZE).
 */
#define        CONTAINER_ID_TYPE        4
struct usb_ss_container_id_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
        __u8  bReserved;
        __u8  ContainerID[16]; /* 128-bit number */
} __attribute__((packed));

#define USB_DT_USB_SS_CONTN_ID_SIZE        20

/*
 * SuperSpeed Plus USB Capability descriptor: Defines the set of
 * SuperSpeed Plus USB specific device level capabilities
 *
 * The fields ahead of bmSublinkSpeedAttr add up to 12 bytes; each sublink
 * speed attribute entry adds 4 more.  The masks defined in the struct body
 * apply to the field they follow.
 */
#define        USB_SSP_CAP_TYPE        0xa
struct usb_ssp_cap_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
        __u8  bReserved;
        __le32 bmAttributes;
#define USB_SSP_SUBLINK_SPEED_ATTRIBS        (0x1f << 0) /* sublink speed entries */
#define USB_SSP_SUBLINK_SPEED_IDS        (0xf << 5)  /* speed ID entries */
        __le16  wFunctionalitySupport;
#define USB_SSP_MIN_SUBLINK_SPEED_ATTRIBUTE_ID        (0xf)
#define USB_SSP_MIN_RX_LANE_COUNT                (0xf << 8)
#define USB_SSP_MIN_TX_LANE_COUNT                (0xf << 12)
        __le16 wReserved;
        /*
         * Declared with one element, so sizeof() covers exactly one entry.
         * NOTE(review): presumably the real entry count comes from
         * bmAttributes[4:0] - confirm against the users of this struct.
         */
        __le32 bmSublinkSpeedAttr[1]; /* list of sublink speed attrib entries */
#define USB_SSP_SUBLINK_SPEED_SSID        (0xf)                /* sublink speed ID */
#define USB_SSP_SUBLINK_SPEED_LSE        (0x3 << 4)        /* Lanespeed exponent */
#define USB_SSP_SUBLINK_SPEED_LSE_BPS                0
#define USB_SSP_SUBLINK_SPEED_LSE_KBPS                1
#define USB_SSP_SUBLINK_SPEED_LSE_MBPS                2
#define USB_SSP_SUBLINK_SPEED_LSE_GBPS                3

#define USB_SSP_SUBLINK_SPEED_ST        (0x3 << 6)        /* Sublink type */
#define USB_SSP_SUBLINK_SPEED_ST_SYM_RX                0
#define USB_SSP_SUBLINK_SPEED_ST_ASYM_RX        1
#define USB_SSP_SUBLINK_SPEED_ST_SYM_TX                2
#define USB_SSP_SUBLINK_SPEED_ST_ASYM_TX        3

#define USB_SSP_SUBLINK_SPEED_RSVD        (0x3f << 8)        /* Reserved */
#define USB_SSP_SUBLINK_SPEED_LP        (0x3 << 14)        /* Link protocol */
#define USB_SSP_SUBLINK_SPEED_LP_SS                0
#define USB_SSP_SUBLINK_SPEED_LP_SSP                1

#define USB_SSP_SUBLINK_SPEED_LSM        (0xff << 16)        /* Lanespeed mantissa */
} __attribute__((packed));

/*
 * USB Power Delivery Capability Descriptor:
 * Defines capabilities for PD
 */
/* Defines the various PD Capabilities of this device */
#define USB_PD_POWER_DELIVERY_CAPABILITY        0x06
/* Provides information on each battery supported by the device */
#define USB_PD_BATTERY_INFO_CAPABILITY                0x07
/* The Consumer characteristics of a Port on the device */
#define USB_PD_PD_CONSUMER_PORT_CAPABILITY        0x08
/* The provider characteristics of a Port on the device */
#define USB_PD_PD_PROVIDER_PORT_CAPABILITY        0x09

/*
 * Packed layout of 18 bytes: a four-byte header, the 32-bit bmAttributes
 * and five 16-bit little-endian fields.  The USB_PD_CAP_* flags defined in
 * the struct body are bits of bmAttributes.
 */
struct usb_pd_cap_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType; /* set to USB_PD_POWER_DELIVERY_CAPABILITY */
        __u8  bReserved;
        __le32 bmAttributes;
#define USB_PD_CAP_BATTERY_CHARGING        (1 << 1) /* supports Battery Charging specification */
#define USB_PD_CAP_USB_PD                (1 << 2) /* supports USB Power Delivery specification */
#define USB_PD_CAP_PROVIDER                (1 << 3) /* can provide power */
#define USB_PD_CAP_CONSUMER                (1 << 4) /* can consume power */
#define USB_PD_CAP_CHARGING_POLICY        (1 << 5) /* supports CHARGING_POLICY feature */
#define USB_PD_CAP_TYPE_C_CURRENT        (1 << 6) /* supports power capabilities defined in the USB Type-C Specification */

#define USB_PD_CAP_PWR_AC                (1 << 8)
#define USB_PD_CAP_PWR_BAT                (1 << 9)
#define USB_PD_CAP_PWR_USE_V_BUS        (1 << 14)

        __le16 bmProviderPorts; /* Bit zero refers to the UFP of the device */
        __le16 bmConsumerPorts;
        __le16 bcdBCVersion;
        __le16 bcdPDVersion;
        __le16 bcdUSBTypeCVersion;
} __attribute__((packed));

/*
 * Battery information capability descriptor.
 *
 * Packed layout of 24 bytes: eight single-byte fields followed by four
 * 32-bit little-endian quantities, each commented as being in mWh.
 */
struct usb_pd_cap_battery_info_descriptor {
        __u8 bLength;
        __u8 bDescriptorType;
        __u8 bDevCapabilityType;
        /* Index of string descriptor shall contain the user friendly name for this battery */
        __u8 iBattery;
        /* Index of string descriptor shall contain the Serial Number String for this battery */
        __u8 iSerial;
        __u8 iManufacturer;
        __u8 bBatteryId; /* uniquely identifies this battery in status Messages */
        __u8 bReserved;
        /*
         * Shall contain the Battery Charge value above which this
         * battery is considered to be fully charged but not necessarily
         * “topped off.”
         */
        __le32 dwChargedThreshold; /* in mWh */
        /*
         * Shall contain the minimum charge level of this battery such
         * that above this threshold, a device can be assured of being
         * able to power up successfully (see Battery Charging 1.2).
         */
        __le32 dwWeakThreshold; /* in mWh */
        __le32 dwBatteryDesignCapacity; /* in mWh */
        __le32 dwBatteryLastFullchargeCapacity; /* in mWh */
} __attribute__((packed));

/*
 * Consumer port capability descriptor.
 *
 * Packed layout of 23 bytes.  The USB_PD_CAP_CONSUMER_* flags defined in
 * the struct body are bits of bmCapabilities.
 */
struct usb_pd_cap_consumer_port_descriptor {
        __u8 bLength;
        __u8 bDescriptorType;
        __u8 bDevCapabilityType;
        __u8 bReserved;
        __u8 bmCapabilities;
/* port will operate under: */
#define USB_PD_CAP_CONSUMER_BC                (1 << 0) /* BC */
#define USB_PD_CAP_CONSUMER_PD                (1 << 1) /* PD */
#define USB_PD_CAP_CONSUMER_TYPE_C        (1 << 2) /* USB Type-C Current */
        __le16 wMinVoltage; /* in 50mV units */
        __le16 wMaxVoltage; /* in 50mV units */
        /* NOTE(review): plain __u16 here, unlike the __le16 fields around it */
        __u16 wReserved;
        __le32 dwMaxOperatingPower; /* in 10 mW - operating at steady state */
        __le32 dwMaxPeakPower; /* in 10mW units - operating at peak power */
        __le32 dwMaxPeakPowerTime; /* in 100ms units - duration of peak */
/*
 * NOTE(review): a 16-bit pattern although dwMaxPeakPowerTime is 32 bits
 * wide - confirm which width the "unknown" marker is meant to have.
 */
#define USB_PD_CAP_CONSUMER_UNKNOWN_PEAK_POWER_TIME 0xffff
} __attribute__((packed));

/*
 * Provider port capability descriptor.
 *
 * Seven fixed bytes followed by a flexible array of 32-bit little-endian
 * power data objects.  The USB_PD_CAP_PROVIDER_* flags defined in the
 * struct body are bits of bmCapabilities.
 */
struct usb_pd_cap_provider_port_descriptor {
        __u8 bLength;
        __u8 bDescriptorType;
        __u8 bDevCapabilityType;
        __u8 bReserved1;
        __u8 bmCapabilities;
/* port will operate under: */
#define USB_PD_CAP_PROVIDER_BC                (1 << 0) /* BC */
#define USB_PD_CAP_PROVIDER_PD                (1 << 1) /* PD */
#define USB_PD_CAP_PROVIDER_TYPE_C        (1 << 2) /* USB Type-C Current */
        /* NOTE(review): presumably the element count of wPowerDataObject[] */
        __u8 bNumOfPDObjects;
        __u8 bReserved2;
        /* 32 bits per element despite the 'w' prefix of the name */
        __le32 wPowerDataObject[];
} __attribute__((packed));

/*
 * Precision time measurement capability descriptor: advertised by devices and
 * hubs that support PTM
 *
 * Consists of the three-byte capability header only, the value recorded in
 * USB_DT_USB_PTM_ID_SIZE below.
 */
#define        USB_PTM_CAP_TYPE        0xb
struct usb_ptm_cap_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;
        __u8  bDevCapabilityType;
} __attribute__((packed));

#define USB_DT_USB_PTM_ID_SIZE                3
/*
 * The size of the descriptor for the Sublink Speed Attribute Count
 * (SSAC) specified in bmAttributes[4:0]. SSAC is zero-based, so the
 * descriptor holds SSAC + 1 entries of 4 bytes each after the 12 bytes
 * of fixed fields in struct usb_ssp_cap_descriptor.
 *
 * The argument is parenthesized so that an expression such as
 * "attr & 0x1f" is evaluated as a whole before 1 is added.
 */
#define USB_DT_USB_SSP_CAP_SIZE(ssac)        (12 + ((ssac) + 1) * 4)

/*-------------------------------------------------------------------------*/

/* USB_DT_WIRELESS_ENDPOINT_COMP:  companion descriptor associated with
 * each endpoint descriptor for a wireless device
 *
 * Packed layout of 10 bytes.  The USB_ENDPOINT_SWITCH_* constants defined
 * in the struct body apply to the low two bits of bmCompAttributes.
 */
struct usb_wireless_ep_comp_descriptor {
        __u8  bLength;
        __u8  bDescriptorType;

        __u8  bMaxBurst;
        __u8  bMaxSequence;
        __le16 wMaxStreamDelay;
        __le16 wOverTheAirPacketSize;
        __u8  bOverTheAirInterval;
        __u8  bmCompAttributes;
#define USB_ENDPOINT_SWITCH_MASK        0x03        /* in bmCompAttributes */
#define USB_ENDPOINT_SWITCH_NO                0
#define USB_ENDPOINT_SWITCH_SWITCH        1
#define USB_ENDPOINT_SWITCH_SCALE        2
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_REQ_SET_HANDSHAKE is a four-way handshake used between a wireless
 * host and a device for connection set up, mutual authentication, and
 * exchanging short lived session keys.  The handshake depends on a CC.
 *
 * Packed layout of 46 bytes: six bytes of message header fields followed
 * by a 16-byte CDID, a 16-byte nonce and an 8-byte MIC.
 */
struct usb_handshake {
        __u8 bMessageNumber;
        __u8 bStatus;
        __u8 tTKID[3];
        __u8 bReserved;
        __u8 CDID[16];
        __u8 nonce[16];
        __u8 MIC[8];
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_REQ_SET_CONNECTION modifies or revokes a connection context (CC).
 * A CC may also be set up using non-wireless secure channels (including
 * wired USB!), and some devices may support CCs with multiple hosts.
 *
 * Three 16-byte values back to back, 48 bytes in total.
 */
struct usb_connection_context {
        __u8 CHID[16];                /* persistent host id */
        __u8 CDID[16];                /* device id (unique w/in host context) */
        __u8 CK[16];                /* connection key */
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/*
 * How Linux identifies device speeds.  USB 2.0 defines three of them
 * (low, full and high); the later entries cover wireless and USB 3.x.
 * The numeric values are spelled out so the ordering is explicit.
 */
enum usb_device_speed {
        USB_SPEED_UNKNOWN        = 0,        /* enumerating */
        USB_SPEED_LOW                = 1,        /* usb 1.1 */
        USB_SPEED_FULL                = 2,        /* usb 1.1 */
        USB_SPEED_HIGH                = 3,        /* usb 2.0 */
        USB_SPEED_WIRELESS        = 4,        /* wireless (usb 2.5) */
        USB_SPEED_SUPER                = 5,        /* usb 3.0 */
        USB_SPEED_SUPER_PLUS        = 6,        /* usb 3.1 */
};


/*
 * Device states, numbered in the order they are listed.  The values are
 * spelled out so the ordering is explicit.
 */
enum usb_device_state {
        /* NOTATTACHED isn't in the USB spec, and this state acts
         * the same as ATTACHED ... but it's clearer this way.
         */
        USB_STATE_NOTATTACHED                = 0,

        /* chapter 9 and authentication (wireless) device states */
        USB_STATE_ATTACHED                = 1,
        USB_STATE_POWERED                = 2,        /* wired */
        USB_STATE_RECONNECTING                = 3,        /* auth */
        USB_STATE_UNAUTHENTICATED        = 4,        /* auth */
        USB_STATE_DEFAULT                = 5,        /* limited function */
        USB_STATE_ADDRESS                = 6,
        USB_STATE_CONFIGURED                = 7,        /* most functions */

        USB_STATE_SUSPENDED                = 8

        /* NOTE:  there are actually four different SUSPENDED
         * states, returning to POWERED, DEFAULT, ADDRESS, or
         * CONFIGURED respectively when SOF tokens flow again.
         * At this level there's no difference between L1 and L2
         * suspend states.  (L2 being original USB 1.1 suspend.)
         */
};

/* USB3 link states U0 through U3, numbered 0 to 3. */
enum usb3_link_state {
        USB3_LPM_U0 = 0,
        USB3_LPM_U1 = 1,
        USB3_LPM_U2 = 2,
        USB3_LPM_U3 = 3
};

/*
 * U1 and U2 timeout encodings.
 *
 * Common to both states:
 *   0x00  the parent hub will reject any transition to the state.
 *   0xff  the parent hub will accept transitions to the state, but will
 *         not initiate one.
 *
 * U1: 0x01 to 0x7f also cause the hub to initiate a transition to U1 after
 * that many microseconds.  0x80 to 0xfe are reserved values.
 *
 * U2: 0x01 to 0xfe also cause the hub to initiate a transition to U2 after
 * N*256 microseconds, so 0x01 means an idle timer of 256 microseconds,
 * 0x02 means 512 microseconds and 0xfe means 65.024ms.
 */
#define USB3_LPM_DISABLED                0x00
#define USB3_LPM_U1_MAX_TIMEOUT                0x7f
#define USB3_LPM_U2_MAX_TIMEOUT                0xfe
#define USB3_LPM_DEVICE_INITIATED        0xff

/*
 * Payload of the Set System Exit Latency control transfer: six packed
 * bytes, one byte each for the U1 values and two little-endian bytes each
 * for the U2 values.
 */
struct usb_set_sel_req {
        __u8        u1_sel;
        __u8        u1_pel;
        __le16        u2_sel;
        __le16        u2_pel;
} __attribute__ ((packed));

/*
 * The Set System Exit Latency control transfer provides one byte each for
 * U1 SEL and U1 PEL, so the max exit latency is 0xFF.  U2 SEL and U2 PEL each
 * are two bytes long.
 */
#define USB3_LPM_MAX_U1_SEL_PEL                0xFF
#define USB3_LPM_MAX_U2_SEL_PEL                0xFFFF

/*-------------------------------------------------------------------------*/

/*
 * As per USB compliance update, a device that is actively drawing
 * more than 100mA from USB must report itself as bus-powered in
 * the GetStatus(DEVICE) call.
 * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34
 *
 * The constant below is that threshold, expressed in mA.
 */
#define USB_SELF_POWER_VBUS_MAX_DRAW                100

#endif /* _UAPI__LINUX_USB_CH9_H */






























































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Linux Socket Filter Data Structures
 */
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <stdarg.h>

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/compat.h>
#include <linux/skbuff.h>
#include <linux/linkage.h>
#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/sockptr.h>
#include <crypto/sha.h>

#include <net/sch_generic.h>

#include <asm/byteorder.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/bpf.h>

struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
struct ctl_table;
struct ctl_table_header;

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 *
 * The five argument aliases map onto BPF_REG_1..BPF_REG_5, the context
 * alias onto BPF_REG_6 and the frame pointer alias onto BPF_REG_10.
 */
#define BPF_REG_ARG1        BPF_REG_1
#define BPF_REG_ARG2        BPF_REG_2
#define BPF_REG_ARG3        BPF_REG_3
#define BPF_REG_ARG4        BPF_REG_4
#define BPF_REG_ARG5        BPF_REG_5
#define BPF_REG_CTX        BPF_REG_6
#define BPF_REG_FP        BPF_REG_10

/* Additional register mappings for converted user programs.
 *
 * Note that BPF_REG_TMP is the same register as BPF_REG_ARG2 (both are
 * BPF_REG_2).
 */
#define BPF_REG_A        BPF_REG_0
#define BPF_REG_X        BPF_REG_7
#define BPF_REG_TMP        BPF_REG_2        /* scratch reg */
#define BPF_REG_D        BPF_REG_8        /* data, callee-saved */
#define BPF_REG_H        BPF_REG_9        /* hlen, callee-saved */

/* Kernel hidden auxiliary/helper register.
 *
 * BPF_REG_AX uses the index MAX_BPF_REG itself, so the extended register
 * count (MAX_BPF_EXT_REG, also used as MAX_BPF_JIT_REG) is MAX_BPF_REG + 1.
 */
#define BPF_REG_AX                MAX_BPF_REG
#define MAX_BPF_EXT_REG                (MAX_BPF_REG + 1)
#define MAX_BPF_JIT_REG                MAX_BPF_EXT_REG

/* Otherwise unused opcode value that marks the special call to the
 * bpf_tail_call() helper.
 */
#define BPF_TAIL_CALL        0xf0

/* Otherwise unused opcode value that marks a special load instruction.
 * It has the same numeric value as BPF_ABS.
 */
#define BPF_PROBE_MEM        0x20

/* Otherwise unused opcode value that marks a call to the interpreter
 * with arguments.
 */
#define BPF_CALL_ARGS        0xe0

/* Otherwise unused opcode value that marks a speculation barrier for
 * mitigating Speculative Store Bypass. BPF_ST_NOSPEC() combines it with
 * the BPF_ST class.
 */
#define BPF_NOSPEC        0xc0

/* As per nm, we expose JITed images as text (code) section for
 * kallsyms. That way, tools like perf can find it to match
 * addresses.
 */
#define BPF_SYM_ELF_TYPE        't'

/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK        512

/* Helper macros for filter block array initializers.
 *
 * Each instruction macro below expands to a 'struct bpf_insn' compound
 * literal with all five fields (code, dst_reg, src_reg, off, imm) set
 * explicitly, so it can be used as an array element or as an expression.
 */

/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg
 *
 * OP is filtered through BPF_OP() and combined with the BPF_X source
 * flag; the 64 variant uses class BPF_ALU64, the 32 variant class BPF_ALU.
 * off and imm are zero.
 */

#define BPF_ALU64_REG(OP, DST, SRC)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

#define BPF_ALU32_REG(OP, DST, SRC)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32
 *
 * Same as the register forms above, but with the BPF_K source flag:
 * src_reg is zero and the operand is carried in imm.
 */

#define BPF_ALU64_IMM(OP, DST, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_ALU32_IMM(OP, DST, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Endianness conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */

/* Build a BPF_ALU | BPF_END instruction operating on DST.
 * TYPE is filtered through BPF_SRC() to select the conversion direction
 * and LEN is stored in imm (presumably the operand width - confirm
 * against the interpreter/JITs).
 */
#define BPF_ENDIAN(TYPE, DST, LEN)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_END | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = LEN })

/* Short form of mov, dst_reg = src_reg
 *
 * BPF_MOV64_REG uses class BPF_ALU64, BPF_MOV32_REG class BPF_ALU; both
 * use the BPF_X source flag and leave off and imm at zero.
 */

#define BPF_MOV64_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

#define BPF_MOV32_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Short form of mov, dst_reg = imm32
 *
 * Immediate counterparts of the macros above: BPF_K source flag,
 * src_reg zero, value carried in imm.
 */

#define BPF_MOV64_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Special form of mov32, used for doing explicit zero extension on dst.
 *
 * The instruction moves DST onto itself (dst_reg == src_reg == DST) with
 * the same opcode as a 32-bit register mov, and is distinguished from an
 * ordinary mov by imm being 1 instead of 0.
 */
#define BPF_ZEXT_REG(DST)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = DST,                                        \
                .off   = 0,                                        \
                .imm   = 1 })

/* Tell whether @insn is the explicit zero-extension marker: a 32-bit
 * register mov (BPF_ALU | BPF_MOV | BPF_X) whose imm field is 1.
 */
static inline bool insn_is_zext(const struct bpf_insn *insn)
{
        /* Anything that is not a 32-bit register mov cannot be a zext. */
        if (insn->code != (BPF_ALU | BPF_MOV | BPF_X))
                return false;

        return insn->imm == 1;
}

/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn.
 * It is BPF_LD_IMM64_RAW() with src_reg set to 0.
 */
#define BPF_LD_IMM64(DST, IMM)                                        \
        BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Expands to TWO comma-separated 'struct bpf_insn' compound literals, so
 * it occupies two consecutive slots in an instruction array:
 *   - the first has code BPF_LD | BPF_DW | BPF_IMM, the given DST/SRC and
 *     the low 32 bits of IMM in imm;
 *   - the second has code 0 and carries the upper 32 bits of IMM in imm.
 * IMM is evaluated twice.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_DW | BPF_IMM,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = (__u32) (IMM) }),                        \
        ((struct bpf_insn) {                                        \
                .code  = 0, /* zero is reserved opcode */        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = ((__u64) (IMM)) >> 32 })

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd.
 * src_reg is set to BPF_PSEUDO_MAP_FD and MAP_FD is carried as the
 * 64-bit immediate.
 */
#define BPF_LD_MAP_FD(DST, MAP_FD)                                \
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32
 *
 * TYPE is filtered through BPF_SRC(); both SRC and IMM are stored in the
 * instruction regardless of which one TYPE selects.
 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Direct packet access, R0 = *(uint *) (skb->data + imm32)
 *
 * SIZE is filtered through BPF_SIZE(); dst_reg and src_reg are both 0
 * and the offset is carried in imm.
 */

#define BPF_LD_ABS(SIZE, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32)
 *
 * Like BPF_LD_ABS(), but with mode BPF_IND and an additional register
 * operand in src_reg.
 */

#define BPF_LD_IND(SIZE, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_IND,        \
                .dst_reg = 0,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Memory load, dst_reg = *(uint *) (src_reg + off16)
 *
 * Class BPF_LDX, mode BPF_MEM; SIZE is filtered through BPF_SIZE() and
 * the displacement is carried in off. imm is 0.
 */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg
 *
 * Class BPF_STX, mode BPF_MEM; here DST is the base address register
 * and SRC the value register.
 */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg
 *
 * Same operand layout as BPF_STX_MEM(), but with mode BPF_XADD.
 */

#define BPF_STX_XADD(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = imm32
 *
 * Class BPF_ST, mode BPF_MEM; src_reg is 0 and the stored value is
 * carried in imm. Note the parameter order: OFF precedes IMM.
 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16
 *
 * Class BPF_JMP with the BPF_X source flag; OP is filtered through
 * BPF_OP() and the branch displacement is carried in off.
 */

#define BPF_JMP_REG(OP, DST, SRC, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_OP(OP) | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16
 *
 * Class BPF_JMP with the BPF_K source flag; src_reg is 0 and the
 * comparison operand is carried in imm. Note the parameter order: IMM
 * precedes OFF.
 */

#define BPF_JMP_IMM(OP, DST, IMM, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_OP(OP) | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison.
 * The only difference in the encoding is the class, BPF_JMP32.
 */

#define BPF_JMP32_REG(OP, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP32 | BPF_OP(OP) | BPF_X,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison.
 * The only difference in the encoding is the class, BPF_JMP32.
 */

#define BPF_JMP32_IMM(OP, DST, IMM, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP32 | BPF_OP(OP) | BPF_K,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Unconditional jumps, goto pc + off16
 *
 * Opcode BPF_JMP | BPF_JA; only off is meaningful, every other field
 * is 0.
 */

#define BPF_JMP_A(OFF)                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_JA,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Relative call
 *
 * A BPF_JMP | BPF_CALL instruction marked with src_reg = BPF_PSEUDO_CALL;
 * the target TGT is carried in imm.
 */

#define BPF_CALL_REL(TGT)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_CALL,                        \
                .dst_reg = 0,                                        \
                .src_reg = BPF_PSEUDO_CALL,                        \
                .off   = 0,                                        \
                .imm   = TGT })

/* Function call */

/* Cast x to a pointer to a function taking five u64 arguments and
 * returning u64.
 */
#define BPF_CAST_CALL(x)                                        \
                ((u64 (*)(u64, u64, u64, u64, u64))(x))

/* Build a BPF_JMP | BPF_CALL instruction (src_reg = 0) whose imm is the
 * difference between FUNC and __bpf_call_base.
 */
#define BPF_EMIT_CALL(FUNC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_CALL,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = ((FUNC) - __bpf_call_base) })

/* Raw code statement block
 *
 * Every field of the instruction is supplied by the caller verbatim;
 * no opcode bits are added or filtered.
 */

#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = CODE,                                        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Program exit
 *
 * Opcode BPF_JMP | BPF_EXIT with all operand fields set to 0.
 */

#define BPF_EXIT_INSN()                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_EXIT,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Speculation barrier
 *
 * Opcode BPF_ST | BPF_NOSPEC with all operand fields set to 0.
 */

#define BPF_ST_NOSPEC()                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ST | BPF_NOSPEC,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Internal classic blocks for direct assignment
 *
 * These prefix the expansion of BPF_STMT()/BPF_JUMP() (defined outside
 * this header) with a '(struct sock_filter)' type name, turning it into
 * a compound literal that can be assigned, not only used as an
 * initializer - assuming those macros expand to a brace-enclosed
 * initializer list; verify against their definition.
 */

#define __BPF_STMT(CODE, K)                                        \
        ((struct sock_filter) BPF_STMT(CODE, K))

#define __BPF_JUMP(CODE, K, JT, JF)                                \
        ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))

/* Map an access width in bytes onto the matching BPF size modifier.
 *
 * Evaluates to BPF_B, BPF_H, BPF_W or BPF_DW when @bytes equals
 * sizeof(u8), sizeof(u16), sizeof(u32) or sizeof(u64) respectively, and
 * to -EINVAL for any other width.
 *
 * @bytes is parenthesized on every use and the internal temporary has a
 * macro-private name, so any expression may be passed - including a
 * conditional expression or a variable that happens to be called
 * 'bpf_size' - without being mis-parsed or captured by the macro's own
 * local. @bytes may be evaluated up to four times, so it must be free of
 * side effects.
 */
#define bytes_to_bpf_size(bytes)                                \
({                                                                \
        int __bpf_sz = -EINVAL;                                        \
                                                                \
        if ((bytes) == sizeof(u8))                                \
                __bpf_sz = BPF_B;                                \
        else if ((bytes) == sizeof(u16))                        \
                __bpf_sz = BPF_H;                                \
        else if ((bytes) == sizeof(u32))                        \
                __bpf_sz = BPF_W;                                \
        else if ((bytes) == sizeof(u64))                        \
                __bpf_sz = BPF_DW;                                \
                                                                \
        __bpf_sz;                                                \
})

/* Map a BPF size modifier onto the access width in bytes.
 *
 * Evaluates to sizeof(u8), sizeof(u16), sizeof(u32) or sizeof(u64) (as an
 * int) when @bpf_size equals BPF_B, BPF_H, BPF_W or BPF_DW respectively,
 * and to -EINVAL for any other value.
 *
 * @bpf_size is parenthesized on every use and the internal temporary has
 * a macro-private name, so any expression may be passed - including a
 * conditional expression or a variable that happens to be called 'bytes'
 * - without being mis-parsed or captured by the macro's own local.
 * @bpf_size may be evaluated up to four times, so it must be free of
 * side effects.
 */
#define bpf_size_to_bytes(bpf_size)                                \
({                                                                \
        int __nbytes = -EINVAL;                                        \
                                                                \
        if ((bpf_size) == BPF_B)                                \
                __nbytes = sizeof(u8);                                \
        else if ((bpf_size) == BPF_H)                                \
                __nbytes = sizeof(u16);                                \
        else if ((bpf_size) == BPF_W)                                \
                __nbytes = sizeof(u32);                                \
        else if ((bpf_size) == BPF_DW)                                \
                __nbytes = sizeof(u64);                                \
                                                                \
        __nbytes;                                                \
})

/* Evaluate to the BPF size modifier matching sizeof(type), as computed by
 * bytes_to_bpf_size(). BUILD_BUG_ON() rejects a negative result, i.e. a
 * type whose size has no BPF size modifier.
 */
#define BPF_SIZEOF(type)                                        \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof(type)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Same as BPF_SIZEOF(), but for the size of member 'field' of 'type'
 * (via sizeof_field()).
 */
#define BPF_FIELD_SIZEOF(type, field)                                \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Evaluate to the access width in bytes encoded in the size bits of
 * insn->code, as computed by bpf_size_to_bytes(). A negative result
 * triggers WARN_ON() and is still returned to the caller.
 */
#define BPF_LDST_BYTES(insn)                                        \
        ({                                                        \
                const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
                WARN_ON(__size < 0);                                \
                __size;                                                \
        })

/* __BPF_MAP_n(m, v, t1, a1, t2, a2, ...) applies the two-argument macro
 * 'm' to the first n (type, name) pairs and joins the results with
 * commas; any further pairs are dropped. With n == 0 it expands to 'v'
 * instead.
 */
#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__)
#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__)
#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__)

/* __BPF_REG_n(...) maps n pairs like __BPF_MAP_n and then appends
 * __BPF_PAD(5 - n), so the resulting list always has five entries.
 */
#define __BPF_REG_0(...) __BPF_PAD(5)
#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4)
#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3)
#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2)
#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1)
#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__)

/* Dispatch on the literal count n by token pasting. */
#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__)
#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__)

/* Cast argument 'a' to type 't' in two __force steps: first to unsigned
 * long when sizeof(t) equals sizeof(unsigned long), otherwise directly to
 * t; then to t.
 */
#define __BPF_CAST(t, a)                                                       \
        (__force t)                                                               \
        (__force                                                               \
         typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long),      \
                                      (unsigned long)0, (t)0))) a
/* 'v' values for __BPF_MAP_0: an explicit 'void' or nothing at all. */
#define __BPF_V void
#define __BPF_N

/* Per-pair mappers: declare the parameter with its own type, or as u64. */
#define __BPF_DECL_ARGS(t, a) t   a
#define __BPF_DECL_REGS(t, a) u64 a

/* Expand to n unused u64 parameter declarations named __ur_1 .. __ur_n. */
#define __BPF_PAD(n)                                                               \
        __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
                  u64, __ur_3, u64, __ur_4, u64, __ur_5)

/* Define a function 'name' with x typed arguments, given as (type, name)
 * pairs in the variadic part.
 *
 * The expansion:
 *   - forward-declares the always-inline worker ____name() taking the
 *     typed arguments;
 *   - declares the function pointer type btf_name with the same typed
 *     signature;
 *   - declares and defines 'attr u64 name(...)' taking five u64
 *     parameters (the x named ones plus unused padding), which casts each
 *     named argument back to its declared type via __BPF_CAST and calls
 *     ____name() through a btf_name cast;
 *   - ends with the header of the ____name() definition, so the function
 *     body written after the macro invocation becomes the worker's body.
 *
 * 'attr' is placed in front of the declaration and definition of 'name'.
 */
#define BPF_CALL_x(x, attr, name, ...)                                               \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
        {                                                                       \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                       \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

/* BPF_CALL_n(): BPF_CALL_x() for n arguments with no extra attribute. */
#define __NOATTR
#define BPF_CALL_0(name, ...)        BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_1(name, ...)        BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_2(name, ...)        BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_3(name, ...)        BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_4(name, ...)        BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...)        BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)

/* One-argument variant whose outer function is marked 'notrace'. */
#define NOTRACE_BPF_CALL_1(name, ...)        BPF_CALL_x(1, notrace, name, __VA_ARGS__)

/* Expand to an inclusive 'first ... last' byte-offset range covering
 * MEMBER of TYPE (or MEMBER1 through MEMBER2 for the _till variant).
 * The expansion is a bare range, not a parenthesized expression, so it
 * is only usable where the compiler accepts 'lo ... hi' (e.g. GNU case
 * ranges).
 *
 * bpf_ctx_range_ptr() is the same as bpf_ctx_range() when BITS_PER_LONG
 * is 64; otherwise it covers 8 bytes starting at the member's offset,
 * regardless of the member's actual size.
 */
#define bpf_ctx_range(TYPE, MEMBER)                                                \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)                                \
        offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
#if BITS_PER_LONG == 64
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#else
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1
#endif /* BITS_PER_LONG == 64 */

/* Evaluate to offsetof(TYPE, MEMBER) and, as a side effect, store SIZE
 * through the pointer PTR_SIZE. The build fails (BUILD_BUG_ON) unless
 * SIZE equals the actual size of MEMBER. SIZE is evaluated twice.
 */
#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)                                \
        ({                                                                        \
                BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE));                \
                *(PTR_SIZE) = (SIZE);                                                \
                offsetof(TYPE, MEMBER);                                                \
        })

/* A struct sock_filter is architecture independent.
 *
 * Compat form of a classic filter program descriptor: 'len' is the
 * instruction count and 'filter' is a compat user pointer to the
 * struct sock_filter array.
 */
struct compat_sock_fprog {
        u16                len;
        compat_uptr_t        filter;        /* struct sock_filter * */
};

/* Classic filter program descriptor using a native pointer to the
 * struct sock_filter array (presumably kernel memory, going by the
 * _kern suffix - confirm against the users).
 */
struct sock_fprog_kern {
        u16                        len;
        struct sock_filter        *filter;
};

/* Some arches need doubleword alignment for their instructions and/or data */
#define BPF_IMAGE_ALIGNMENT 8

/* Header placed in front of a variable-length image: 'pages' is followed
 * by the flexible array 'image', which is aligned to BPF_IMAGE_ALIGNMENT
 * bytes.
 */
struct bpf_binary_header {
        u32 pages;
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

/* In-kernel representation of a BPF program.
 *
 * The instruction stream lives at the tail of the object: insns[0] is a
 * zero-length array declared directly before the flexible array
 * insnsi[]. bpf_prog_size() sizes the object from insns[], while
 * __BPF_PROG_RUN() passes insnsi and bpf_func to the dispatcher function.
 */
struct bpf_prog {
        u16                        pages;                /* Number of allocated pages */
        u16                        jited:1,        /* Is our filter JIT'ed? */
                                jit_requested:1,/* archs need to JIT the prog */
                                gpl_compatible:1, /* Is filter GPL compatible? */
                                cb_access:1,        /* Is control block accessed? */
                                dst_needed:1,        /* Do we need dst entry? */
                                blinded:1,        /* Was blinded */
                                is_func:1,        /* program is a bpf function */
                                kprobe_override:1, /* Do we override a kprobe? */
                                has_callchain_buf:1, /* callchain buffer allocated? */
                                enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
                                call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
        enum bpf_prog_type        type;                /* Type of BPF program */
        enum bpf_attach_type        expected_attach_type; /* For some prog types */
        u32                        len;                /* Number of filter blocks */
        u32                        jited_len;        /* Size of jited insns in bytes */
        u8                        tag[BPF_TAG_SIZE];
        struct bpf_prog_aux        *aux;                /* Auxiliary fields */
        struct sock_fprog_kern        *orig_prog;        /* Original BPF program */
        unsigned int                (*bpf_func)(const void *ctx,
                                            const struct bpf_insn *insn);
        /* Instructions for interpreter */
        struct sock_filter        insns[0];
        struct bpf_insn                insnsi[];
};

/* Reference-counted holder of a bpf_prog pointer.
 * NOTE(review): the embedded rcu_head presumably lets the object be
 * released after a grace period - confirm against the code that frees it.
 */
struct sk_filter {
        refcount_t        refcnt;
        struct rcu_head        rcu;
        struct bpf_prog        *prog;
};

DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Run @prog on @ctx through the dispatcher function @dfunc and evaluate
 * to the u32 value it returns.
 *
 * cant_migrate() is invoked first; the callers in this header disable
 * migration before using the macro. When bpf_stats_enabled_key is on,
 * the run is timed with sched_clock() and this CPU's entry of
 * prog->aux->stats gets its run counter and accumulated nanoseconds
 * updated inside a u64_stats update section.
 *
 * @prog is expanded several times, so it must be free of side effects.
 * Every expansion is parenthesized so that any pointer expression works.
 */
#define __BPF_PROG_RUN(prog, ctx, dfunc)        ({                        \
        u32 __ret;                                                        \
        cant_migrate();                                                        \
        if (static_branch_unlikely(&bpf_stats_enabled_key)) {                \
                struct bpf_prog_stats *__stats;                                \
                u64 __start = sched_clock();                                \
                __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);        \
                __stats = this_cpu_ptr((prog)->aux->stats);                \
                u64_stats_update_begin(&__stats->syncp);                \
                __stats->cnt++;                                                \
                __stats->nsecs += sched_clock() - __start;                \
                u64_stats_update_end(&__stats->syncp);                        \
        } else {                                                        \
                __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);        \
        }                                                                \
        __ret; })

/* __BPF_PROG_RUN() with bpf_dispatcher_nop_func as the dispatcher. */
#define BPF_PROG_RUN(prog, ctx)                                                \
        __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func)

/*
 * Run @prog on @ctx from a context that may be preempted and migrated,
 * keeping the whole execution on a single CPU.
 *
 * migrate_disable()/migrate_enable() are spelled out on purpose: they
 * document that running a BPF program needs no reentrancy protection
 * against another BPF program invoked from a preempting task.
 *
 * On kernels without RT, migrate_disable()/migrate_enable() map to
 * preempt_disable()/preempt_enable(), so preemption is off as well.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                          const void *ctx)
{
        u32 verdict;

        migrate_disable();
        verdict = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func);
        migrate_enable();

        return verdict;
}

/* Length of the cb[] scratch area; bpf_skb_cb() checks at build time
 * that it equals the size of the cb member of struct __sk_buff.
 */
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

/* Layout overlaid on skb->cb by bpf_compute_data_pointers(): the qdisc
 * control block comes first, followed by the cached data_meta and
 * data_end pointers.
 */
struct bpf_skb_data_end {
        struct qdisc_skb_cb qdisc_cb;
        void *data_meta;
        void *data_end;
};

/* Next-hop parameters: nh_family plus a union holding either a 32-bit
 * IPv4 value or a struct in6_addr.
 * NOTE(review): nh_family presumably tells which union member is valid -
 * confirm against the users.
 */
struct bpf_nh_params {
        u32 nh_family;
        union {
                u32 ipv4_nh;
                struct in6_addr ipv6_nh;
        };
};

/* Redirect state, declared per CPU below.
 *
 * map is reset to NULL by bpf_prog_run_xdp() before a program runs.
 * kern_flags carries the BPF_RI_F_* bits; BPF_RI_F_RF_NO_DIRECT is read,
 * set and cleared by xdp_return_frame_no_direct(),
 * xdp_set_return_frame_no_direct() and xdp_clear_return_frame_no_direct().
 */
struct bpf_redirect_info {
        u32 flags;
        u32 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 kern_flags;
        struct bpf_nh_params nh;
};

DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);

/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT        BIT(0)        /* no napi_direct on return_frame */

/* Cache the bounds of the linear packet data in skb->cb so that program
 * types with direct packet access (cls_bpf, act_bpf, lwt, ...) can use
 * the range [data, data_end). Subsystems that allow direct data access
 * must (!) make sure cb[] is writable when the BPF program is invoked;
 * otherwise cb[] has to be saved and restored around the run.
 */
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
        struct bpf_skb_data_end *ends;

        /* The overlay has to fit into the skb control block. */
        BUILD_BUG_ON(sizeof(struct bpf_skb_data_end) >
                     sizeof_field(struct sk_buff, cb));

        ends = (struct bpf_skb_data_end *)skb->cb;
        ends->data_meta = skb->data - skb_metadata_len(skb);
        ends->data_end = skb->data + skb_headlen(skb);
}

/* Similar to bpf_compute_data_pointers(), except that only cb->data_end
 * is recomputed and its previous value is first stored in
 * *saved_data_end, so that bpf_restore_data_end() can put it back.
 */
static inline void bpf_compute_and_save_data_end(
        struct sk_buff *skb, void **saved_data_end)
{
        struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;

        *saved_data_end = cb->data_end;
        cb->data_end  = skb->data + skb_headlen(skb);
}

/* Restore the cb->data_end value saved by bpf_compute_and_save_data_end(). */
static inline void bpf_restore_data_end(
        struct sk_buff *skb, void *saved_data_end)
{
        struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;

        cb->data_end = saved_data_end;
}

/* Return the data area of the skb's qdisc control block, which serves as
 * the cb[] scratch space described below.
 */
static inline u8 *bpf_skb_cb(struct sk_buff *skb)
{
        /* eBPF programs may read/write skb->cb[] area to transfer meta
         * data between tail calls. Since this also needs to work with
         * tc, that scratch memory is mapped to qdisc_skb_cb's data area.
         *
         * In some socket filter cases, the cb unfortunately needs to be
         * saved/restored so that protocol specific skb->cb[] data won't
         * be lost. In any case, due to unprivileged eBPF programs
         * attached to sockets, we need to clear the bpf_skb_cb() area
         * to not leak previous contents to user space.
         */
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                     sizeof_field(struct qdisc_skb_cb, data));

        return qdisc_skb_cb(skb)->data;
}

/* Run @prog on @skb. If the program accesses the control block, the
 * scratch area is backed up and zeroed before the run and put back
 * afterwards.
 *
 * Must be invoked with migration disabled.
 */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         struct sk_buff *skb)
{
        u8 backup[BPF_SKB_CB_LEN];
        u8 *scratch = bpf_skb_cb(skb);
        u32 verdict;

        if (unlikely(prog->cb_access)) {
                memcpy(backup, scratch, BPF_SKB_CB_LEN);
                memset(scratch, 0, BPF_SKB_CB_LEN);
        }

        verdict = BPF_PROG_RUN(prog, skb);

        if (unlikely(prog->cb_access))
                memcpy(scratch, backup, BPF_SKB_CB_LEN);

        return verdict;
}

/* __bpf_prog_run_save_cb() wrapped in migrate_disable()/migrate_enable(). */
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
        u32 verdict;

        migrate_disable();
        verdict = __bpf_prog_run_save_cb(prog, skb);
        migrate_enable();

        return verdict;
}

/* Run @prog on @skb pinned to one CPU. If the program accesses the
 * control block, the scratch area is zeroed first; nothing is saved or
 * restored.
 */
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
                                        struct sk_buff *skb)
{
        if (unlikely(prog->cb_access))
                memset(bpf_skb_cb(skb), 0, BPF_SKB_CB_LEN);

        return bpf_prog_run_pin_on_cpu(prog, skb);
}

DECLARE_BPF_DISPATCHER(xdp)

/* Run XDP program @prog on @xdp through the xdp dispatcher and return
 * the program's result.
 */
static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                                            struct xdp_buff *xdp)
{
        /* Caller needs to hold rcu_read_lock() (!), otherwise program
         * can be released while still running, or map elements could be
         * freed early while still having concurrent users. XDP fastpath
         * already takes rcu_read_lock() when fetching the program, so
         * it's not necessary here anymore.
         */
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

        /* Drop a map pointer left in this CPU's redirect info before the
         * run; the store is skipped when the pointer is already NULL.
         */
        if (ri->map)
                ri->map = NULL;
        return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
}

void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);

/* Size in bytes of prog->len instructions of type struct bpf_insn. */
static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
{
        return sizeof(prog->insnsi[0]) * prog->len;
}

/* Instruction size plus sizeof(__be64) plus one byte, rounded up to a
 * multiple of SHA1_BLOCK_SIZE.
 */
static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
{
        const size_t payload = bpf_prog_insn_size(prog) + sizeof(__be64) + 1;

        return round_up(payload, SHA1_BLOCK_SIZE);
}

/* Bytes needed for a struct bpf_prog carrying @proglen entries in
 * insns[], never less than sizeof(struct bpf_prog) itself.
 */
static inline unsigned int bpf_prog_size(unsigned int proglen)
{
        const size_t fixed = sizeof(struct bpf_prog);
        const size_t with_insns = offsetof(struct bpf_prog, insns[proglen]);

        return max(fixed, with_insns);
}

/* Returns true when @prog has the program type BPF_PROG_TYPE_UNSPEC. */
static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
{
        /* When classic BPF programs have been loaded and the arch
         * does not have a classic BPF JIT (anymore), they have been
         * converted via bpf_migrate_filter() to eBPF and thus always
         * have an unspec program type.
         */
        return prog->type == BPF_PROG_TYPE_UNSPEC;
}

/* Clamp @size to sizeof(unsigned long) when it is a larger, exact
 * multiple of that size; any other value is returned unchanged.
 */
static inline u32 bpf_ctx_off_adjust_machine(u32 size)
{
        const u32 word = sizeof(unsigned long);

        if (size <= word || size % word)
                return size;

        return word;
}

/* A narrow access is accepted when @size does not exceed @size_default
 * and has at most one bit set. @off does not take part in the decision.
 */
static inline bool
bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
{
        if (size > size_default)
                return false;

        return !(size & (size - 1));
}

/* Byte position of a @size byte access at @off inside a field of
 * @size_default bytes: the offset within the field on little endian,
 * and the distance from the end of the field to the end of the access
 * on big endian. The mask assumes @size_default is a power of two.
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
        const u8 within = off & (size_default - 1);

#ifdef __LITTLE_ENDIAN
        return within;
#else
        return size_default - (within + size);
#endif
}

/* True when an access of @size bytes at offset @off is a __u64-sized,
 * __u64-aligned access that lies entirely inside @field of @type.
 *
 * @off and @size are parenthesized so that arbitrary expressions (for
 * example "base + 8") keep their meaning. @off is expanded several
 * times and must therefore be free of side effects.
 */
#define bpf_ctx_wide_access_ok(off, size, type, field)                        \
        ((size) == sizeof(__u64) &&                                        \
         (off) >= offsetof(type, field) &&                                \
         (off) + sizeof(__u64) <= offsetofend(type, field) &&                \
         (off) % sizeof(__u64) == 0)

/* Size in bytes of fprog->len entries of fprog->filter[]. @fprog is
 * parenthesized so that any pointer expression (e.g. "&prog") works; it
 * is expanded twice, once only inside sizeof.
 */
#define bpf_classic_proglen(fprog) ((fprog)->len * sizeof((fprog)->filter[0]))

/* Make the fp->pages pages starting at @fp read-only, unless the
 * program is JITed. Compiles to nothing with CONFIG_BPF_JIT_ALWAYS_ON.
 */
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        if (fp->jited)
                return;

        set_vm_flush_reset_perms(fp);
        set_memory_ro((unsigned long)fp, fp->pages);
#endif
}

/* Mark the hdr->pages pages starting at @hdr read-only and then
 * executable, after calling set_vm_flush_reset_perms() on the header.
 */
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
        const unsigned long start = (unsigned long)hdr;

        set_vm_flush_reset_perms(hdr);
        set_memory_ro(start, hdr->pages);
        set_memory_x(start, hdr->pages);
}

/* The binary header is taken to sit at the start of the page that
 * contains fp->bpf_func: the function address is masked with PAGE_MASK.
 */
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
        return (void *)((unsigned long)fp->bpf_func & PAGE_MASK);
}

int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);

/* sk_filter_trim_cap() with a cap of 1; its result is returned as is. */
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
        return sk_filter_trim_cap(sk, skb, 1);
}

struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
void bpf_prog_free(struct bpf_prog *fp);

bool bpf_opcode_in_insntable(u8 code);

void bpf_prog_free_linfo(struct bpf_prog *prog);
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags);
void __bpf_prog_free(struct bpf_prog *fp);

/* Plain wrapper: forwards @fp to __bpf_prog_free(). */
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
        __bpf_prog_free(fp);
}

/* Callback taking a classic filter and its length @flen and returning
 * int; bpf_prog_create_from_user() accepts one as its "trans" argument.
 */
typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
                                       unsigned int flen);

int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);

int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);

bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/* __bpf_call_base cast (through void *) to a function pointer type that
 * takes a sixth argument, a const struct bpf_insn pointer.
 */
#define __bpf_call_base_args \
        ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
         (void *)__bpf_call_base)

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_helper_changes_pkt_data(void *func);

/* Returns the result of kallsyms_show_value() for @cred. */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
        /* Reconstruction of call-sites is dependent on kallsyms,
         * thus dumping is subject to the same restriction.
         */
        return kallsyms_show_value(cred);
}

struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

void bpf_clear_redirect_map(struct bpf_map *map);

static inline bool xdp_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

        return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

/* Set BPF_RI_F_RF_NO_DIRECT in this CPU's redirect info. */
static inline void xdp_set_return_frame_no_direct(void)
{
        struct bpf_redirect_info *info = this_cpu_ptr(&bpf_redirect_info);

        info->kern_flags = info->kern_flags | BPF_RI_F_RF_NO_DIRECT;
}

/* Clear BPF_RI_F_RF_NO_DIRECT in this CPU's redirect info. */
static inline void xdp_clear_return_frame_no_direct(void)
{
        struct bpf_redirect_info *info = this_cpu_ptr(&bpf_redirect_info);

        info->kern_flags = info->kern_flags & ~BPF_RI_F_RF_NO_DIRECT;
}

/* Check whether a packet of @pktlen bytes may be forwarded to @fwd.
 *
 * Returns -ENETDOWN when the device lacks IFF_UP, -EMSGSIZE when @pktlen
 * exceeds mtu + hard_header_len + VLAN_HLEN, and 0 otherwise.
 */
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
                                 unsigned int pktlen)
{
        unsigned int max_len;

        if (unlikely(!(fwd->flags & IFF_UP)))
                return -ENETDOWN;

        max_len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;

        return pktlen > max_len ? -EMSGSIZE : 0;
}

/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
 * same cpu context. Further for best results no more than a single map
 * for the do_redirect/do_flush pair should be used. This limitation is
 * because we only track one map and force a flush when the map changes.
 * This does not appear to be a real limitation for existing software.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *prog);
int xdp_do_redirect(struct net_device *dev,
                    struct xdp_buff *xdp,
                    struct bpf_prog *prog);
void xdp_do_flush(void);

/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
 * it is no longer only flushing maps. Keep this define for compatibility
 * until all drivers are updated - do not use xdp_do_flush_map() in new code!
 */
#define xdp_do_flush_map xdp_do_flush

void bpf_warn_invalid_xdp_action(u32 act);

#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  u32 hash);
#else
/* Without CONFIG_INET only this stub exists; it always returns NULL. */
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                     struct bpf_prog *prog, struct sk_buff *skb,
                     u32 hash)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_jit_binary_free(struct bpf_binary_header *hdr);
u64 bpf_jit_alloc_exec_limit(void);
void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);

int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);

int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed);

struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);

/* Log a summary line (lengths, pass, image address, current task) at
 * error level and, when @image is non-NULL, a hex dump of @proglen bytes
 * of it.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
                                u32 pass, void *image)
{
        pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen,
               proglen, pass, image, current->comm, task_pid_nr(current));

        if (!image)
                return;

        print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
                       16, 1, image, proglen, false);
}

/* Compile-time answer: true when CONFIG_HAVE_EBPF_JIT is defined. */
static inline bool bpf_jit_is_ebpf(void)
{
# ifdef CONFIG_HAVE_EBPF_JIT
        return true;
# else
        return false;
# endif
}

/* True when bpf_jit_enable is non-zero and bpf_jit_is_ebpf() holds. */
static inline bool ebpf_jit_enabled(void)
{
        return bpf_jit_enable && bpf_jit_is_ebpf();
}

/* True when @fp has its jited bit set and bpf_jit_is_ebpf() holds. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return fp->jited && bpf_jit_is_ebpf();
}

/* Decide whether constant blinding applies to @prog. */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        /* The prerequisites come first: an eBPF JIT, a program that
         * requested JITing and hardening switched on. Should someone
         * call blinding outside of them, the answer is no. With
         * bpf_jit_harden == 1 a caller for which capable(CAP_SYS_ADMIN)
         * holds is exempt; capable() is consulted only once every
         * earlier test has passed.
         */
        return bpf_jit_is_ebpf() && prog->jit_requested && bpf_jit_harden &&
               !(bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN));
}

/* True only when hardening is off and bpf_jit_kallsyms equals 1. */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        /* There are a couple of corner cases where kallsyms should
         * not be enabled, f.e. on hardening; any bpf_jit_kallsyms
         * value other than 1 disables it as well.
         */
        return !bpf_jit_harden && bpf_jit_kallsyms == 1;
}

const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym);
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym);

/* Look @addr up through __bpf_address_lookup() and return its result.
 * On a hit, *@modname is set to NULL when @modname was supplied; on a
 * miss @modname is left untouched.
 */
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        const char *name = __bpf_address_lookup(addr, size, off, sym);

        if (!name)
                return NULL;

        if (modname)
                *modname = NULL;

        return name;
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);

#else /* CONFIG_BPF_JIT */

/* Stubs used when CONFIG_BPF_JIT is not set: the predicates return
 * false, the lookups return NULL, bpf_jit_add_poke_descriptor() returns
 * -ENOTSUPP, bpf_get_kallsym() returns -ERANGE, bpf_jit_free() falls
 * back to bpf_prog_unlock_free() and the kallsyms add/del hooks are
 * empty.
 */
static inline bool ebpf_jit_enabled(void)
{
        return false;
}

static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        return false;
}

static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return false;
}

static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                            struct bpf_jit_poke_descriptor *poke)
{
        return -ENOTSUPP;
}

static inline void bpf_jit_free(struct bpf_prog *fp)
{
        bpf_prog_unlock_free(fp);
}

static inline bool bpf_jit_kallsyms_enabled(void)
{
        return false;
}

static inline const char *
__bpf_address_lookup(unsigned long addr, unsigned long *size,
                     unsigned long *off, char *sym)
{
        return NULL;
}

static inline bool is_bpf_text_address(unsigned long addr)
{
        return false;
}

static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
                                  char *type, char *sym)
{
        return -ERANGE;
}

static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        return NULL;
}

static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
}

static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

#endif /* CONFIG_BPF_JIT */

void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);

/* Bit 15: ORed into the value returned by bpf_anc_helper() for ancillary
 * loads; bpf_anc_helper() treats an input code that already carries it
 * as a bug.
 */
#define BPF_ANC                BIT(15)

/* Inspect the first instruction of a classic filter.
 *
 * Returns false for BPF_RET|BPF_K, for BPF_LD|BPF_W|BPF_LEN and for an
 * absolute load (word, half or byte) whose k is anything other than
 * SKF_AD_OFF + SKF_AD_ALU_XOR_X. Returns true in every other case.
 */
static inline bool bpf_needs_clear_a(const struct sock_filter *first)
{
        switch (first->code) {
        case BPF_RET | BPF_K:
        case BPF_LD | BPF_W | BPF_LEN:
                return false;

        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
                /* Among the absolute loads only ALU_XOR_X says yes. */
                return first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X;

        default:
                return true;
        }
}

/* Map a classic load instruction to its ancillary opcode.
 *
 * For BPF_LD|{W,H,B}|BPF_ABS instructions whose k equals SKF_AD_OFF plus
 * one of the SKF_AD_* values listed below, returns BPF_ANC | SKF_AD_*.
 * Every other instruction yields its code unchanged. An incoming code
 * that already carries the BPF_ANC bit trips BUG_ON().
 */
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
        BUG_ON(ftest->code & BPF_ANC);

        switch (ftest->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
/* Expands to one case label plus the matching return statement. */
#define BPF_ANCILLARY(CODE)        case SKF_AD_OFF + SKF_AD_##CODE:        \
                                return BPF_ANC | SKF_AD_##CODE
                switch (ftest->k) {
                BPF_ANCILLARY(PROTOCOL);
                BPF_ANCILLARY(PKTTYPE);
                BPF_ANCILLARY(IFINDEX);
                BPF_ANCILLARY(NLATTR);
                BPF_ANCILLARY(NLATTR_NEST);
                BPF_ANCILLARY(MARK);
                BPF_ANCILLARY(QUEUE);
                BPF_ANCILLARY(HATYPE);
                BPF_ANCILLARY(RXHASH);
                BPF_ANCILLARY(CPU);
                BPF_ANCILLARY(ALU_XOR_X);
                BPF_ANCILLARY(VLAN_TAG);
                BPF_ANCILLARY(VLAN_TAG_PRESENT);
                BPF_ANCILLARY(PAY_OFFSET);
                BPF_ANCILLARY(RANDOM);
                BPF_ANCILLARY(VLAN_TPID);
                }
                /* No ancillary offset matched: treat as a plain load. */
                fallthrough;
        default:
                return ftest->code;
        }
}

void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                           int k, unsigned int size);

/* Resolve offset @k of @skb to a pointer covering @size bytes. Negative
 * offsets go to bpf_internal_load_pointer_neg_helper(); all others go
 * to skb_header_pointer(), which is also given @buffer.
 */
static inline void *bpf_load_pointer(const struct sk_buff *skb, int k,
                                     unsigned int size, void *buffer)
{
        if (k < 0)
                return bpf_internal_load_pointer_neg_helper(skb, k, size);

        return skb_header_pointer(skb, k, size, buffer);
}

/* Returns SKF_AD_MAX. */
static inline int bpf_tell_extensions(void)
{
        return SKF_AD_MAX;
}

/* NOTE(review): presumably the kernel-side context for sock_addr
 * programs; only the layout is visible here - confirm against the users.
 */
struct bpf_sock_addr_kern {
        struct sock *sk;
        struct sockaddr *uaddr;
        /* Temporary "register" to make indirect stores to nested structures
         * defined above. We need three registers to make such a store, but
         * only two (src and dst) are available at convert_ctx_access time
         */
        u64 tmp_reg;
        void *t_ctx;        /* Attach type specific context. */
};

/* Context structure split in two by the temp member: as the comment on
 * temp explains, the members before it are zero-initialized before the
 * BPF program is called, while temp and anything after it are not.
 * args, reply and replylong share storage through the anonymous union.
 */
struct bpf_sock_ops_kern {
        struct        sock *sk;
        union {
                u32 args[4];
                u32 reply;
                u32 replylong[4];
        };
        struct sk_buff        *syn_skb;
        struct sk_buff        *skb;
        void        *skb_data_end;
        u8        op;
        u8        is_fullsock;
        u8        remaining_opt_len;
        u64        temp;                        /* temp and everything after is not
                                         * initialized to 0 before calling
                                         * the BPF program. New fields that
                                         * should be initialized to 0 should
                                         * be inserted before temp.
                                         * temp is scratch storage used by
                                         * sock_ops_convert_ctx_access
                                         * as temporary storage of a register.
                                         */
};

/* NOTE(review): presumably the kernel-side context for sysctl programs;
 * the cur_val/cur_len and new_val/new_len pairs look like buffer plus
 * length pairs - confirm against the code that fills them.
 */
struct bpf_sysctl_kern {
        struct ctl_table_header *head;
        struct ctl_table *table;
        void *cur_val;
        size_t cur_len;
        void *new_val;
        size_t new_len;
        int new_updated;
        int write;
        loff_t *ppos;
        /* Temporary "register" for indirect stores to ppos. */
        u64 tmp_reg;
};

/* NOTE(review): presumably the kernel-side context for sockopt programs,
 * with optval/optval_end delimiting the option buffer - confirm against
 * the code that fills this structure.
 */
struct bpf_sockopt_kern {
        struct sock        *sk;
        u8                *optval;
        u8                *optval_end;
        s32                level;
        s32                optname;
        s32                optlen;
        s32                retval;
};

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

/* Context handed to SK_LOOKUP programs.
 *
 * bpf_sk_lookup_run_v4()/bpf_sk_lookup_run_v6() fill in family, protocol,
 * the ports and the v4 or v6 address pair; note that sport is __be16
 * while dport is a plain u16. selected_sk and no_reuseport are outputs:
 * BPF_PROG_SK_LOOKUP_RUN_ARRAY() leaves the last selection in them.
 */
struct bpf_sk_lookup_kern {
        u16                family;
        u16                protocol;
        __be16                sport;
        u16                dport;
        struct {
                __be32 saddr;
                __be32 daddr;
        } v4;
        struct {
                const struct in6_addr *saddr;
                const struct in6_addr *daddr;
        } v6;
        struct sock        *selected_sk;
        bool                no_reuseport;
};

extern struct static_key_false bpf_sk_lookup_enabled;

/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
 *
 * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
 * SK_DROP. Their meaning is as follows:
 *
 *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
 *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
 *  SK_DROP                           : terminate lookup with -ECONNREFUSED
 *
 * This macro aggregates return values and selected sockets from
 * multiple BPF programs according to the following rules, in order:
 *
 *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
 *     macro result is SK_PASS and last ctx.selected_sk is used.
 *  2. If any program returned SK_DROP return value,
 *     macro result is SK_DROP.
 *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
 *
 * @array: program array; items[] is walked until a NULL prog is read.
 * @ctx:   struct bpf_sk_lookup_kern lvalue (its address is taken); on
 *         completion its selected_sk and no_reuseport hold the last
 *         selection, or NULL and false if there was none.
 * @func:  invoked as func(prog, &ctx) for each program.
 *
 * The programs run between migrate_disable() and migrate_enable().
 *
 * Caller must ensure that the prog array is non-NULL, and that the
 * array as well as the programs it contains remain valid.
 */
#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                        \
        ({                                                                \
                struct bpf_sk_lookup_kern *_ctx = &(ctx);                \
                struct bpf_prog_array_item *_item;                        \
                struct sock *_selected_sk = NULL;                        \
                bool _no_reuseport = false;                                \
                struct bpf_prog *_prog;                                        \
                bool _all_pass = true;                                        \
                u32 _ret;                                                \
                                                                        \
                migrate_disable();                                        \
                _item = &(array)->items[0];                                \
                while ((_prog = READ_ONCE(_item->prog))) {                \
                        /* restore most recent selection */                \
                        _ctx->selected_sk = _selected_sk;                \
                        _ctx->no_reuseport = _no_reuseport;                \
                                                                        \
                        _ret = func(_prog, _ctx);                        \
                        if (_ret == SK_PASS && _ctx->selected_sk) {        \
                                /* remember last non-NULL socket */        \
                                _selected_sk = _ctx->selected_sk;        \
                                _no_reuseport = _ctx->no_reuseport;        \
                        } else if (_ret == SK_DROP && _all_pass) {        \
                                _all_pass = false;                        \
                        }                                                \
                        _item++;                                        \
                }                                                        \
                _ctx->selected_sk = _selected_sk;                        \
                _ctx->no_reuseport = _no_reuseport;                        \
                migrate_enable();                                        \
                _all_pass || _selected_sk ? SK_PASS : SK_DROP;                \
         })

/* Run the SK_LOOKUP programs attached to @net for an IPv4 lookup.
 *
 * *@psk is set to NULL when no program array is attached or no program
 * selected a socket, to the selected socket when the aggregated verdict
 * is SK_PASS, and to ERR_PTR(-ECONNREFUSED) otherwise. The return value
 * is the no_reuseport flag reported alongside an SK_PASS verdict and
 * false in every other case.
 */
static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
                                        const __be32 saddr, const __be16 sport,
                                        const __be32 daddr, const u16 dport,
                                        struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *sk = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern lookup = {
                        .family         = AF_INET,
                        .protocol       = protocol,
                        .sport          = sport,
                        .dport          = dport,
                        .v4.saddr       = saddr,
                        .v4.daddr       = daddr,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, lookup,
                                                       BPF_PROG_RUN);
                if (verdict != SK_PASS) {
                        sk = ERR_PTR(-ECONNREFUSED);
                } else {
                        sk = lookup.selected_sk;
                        skip_reuseport = lookup.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = sk;
        return skip_reuseport;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Run the SK_LOOKUP BPF programs attached to @net for an IPv6 lookup.
 *
 * @saddr and @daddr are stored in the context by pointer, not copied.
 *
 * *@psk receives the outcome:
 *   - NULL if no program array is attached,
 *   - ctx.selected_sk (which may itself be NULL) if the verdict is SK_PASS,
 *   - ERR_PTR(-ECONNREFUSED) for any other verdict.
 *
 * Returns ctx.no_reuseport on SK_PASS, false otherwise.
 */
static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 dport,
                                        struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *sk = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family = AF_INET6,
                        .protocol = protocol,
                        .v6.saddr = saddr,
                        .v6.daddr = daddr,
                        .sport = sport,
                        .dport = dport,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
                if (verdict != SK_PASS) {
                        sk = ERR_PTR(-ECONNREFUSED);
                } else {
                        sk = ctx.selected_sk;
                        skip_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = sk;
        return skip_reuseport;
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

#endif /* __LINUX_FILTER_H__ */


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/* SPDX-License-Identifier: GPL-2.0+ */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rseq

#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RSEQ_H

#include <linux/tracepoint.h>
#include <linux/types.h>

/*
 * rseq_update - tracepoint taking the task @t as its only argument.
 *
 * The recorded entry holds a single field, cpu_id, filled from
 * raw_smp_processor_id() at the time the event fires; @t itself is not
 * stored in the entry.
 */
TRACE_EVENT(rseq_update,

        TP_PROTO(struct task_struct *t),

        TP_ARGS(t),

        TP_STRUCT__entry(
                __field(s32, cpu_id)
        ),

        TP_fast_assign(
                __entry->cpu_id = raw_smp_processor_id();
        ),

        TP_printk("cpu_id=%d", __entry->cpu_id)
);

/*
 * rseq_ip_fixup - tracepoint recording four unsigned long values verbatim:
 * @regs_ip, @start_ip, @post_commit_offset and @abort_ip.
 *
 * The three *_ip values are printed in hex; post_commit_offset is printed
 * in decimal.
 */
TRACE_EVENT(rseq_ip_fixup,

        TP_PROTO(unsigned long regs_ip, unsigned long start_ip,
                unsigned long post_commit_offset, unsigned long abort_ip),

        TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip),

        TP_STRUCT__entry(
                __field(unsigned long, regs_ip)
                __field(unsigned long, start_ip)
                __field(unsigned long, post_commit_offset)
                __field(unsigned long, abort_ip)
        ),

        TP_fast_assign(
                __entry->regs_ip = regs_ip;
                __entry->start_ip = start_ip;
                __entry->post_commit_offset = post_commit_offset;
                __entry->abort_ip = abort_ip;
        ),

        TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx",
                __entry->regs_ip, __entry->start_ip,
                __entry->post_commit_offset, __entry->abort_ip)
);

#endif /* _TRACE_RSEQ_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




































































































    1 

    1 







    1 
    1 
    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* delayacct.h - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#ifndef _LINUX_DELAYACCT_H
#define _LINUX_DELAYACCT_H

#include <uapi/linux/taskstats.h>

/*
 * Per-task flags relevant to delay accounting
 * maintained privately to avoid exhausting similar flags in sched.h:PF_*
 * Used to set current->delays->flags
 */
#define DELAYACCT_PF_SWAPIN        0x00000001        /* I am doing a swapin */
#define DELAYACCT_PF_BLKIO        0x00000002        /* I am waiting on IO */

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Per-task delay accounting state, reached through task_struct->delays.
 * Each tracked delay source (blkio/swapin, freepages, thrashing) has a start
 * timestamp, an accumulated delay and an event count.
 */
struct task_delay_info {
        raw_spinlock_t        lock;
        unsigned int        flags;        /* Private per-task flags */

        /* For each stat XXX, add following, aligned appropriately
         *
         * u64 XXX_start;
         * u64 XXX_delay;
         * u32 XXX_count;
         *
         * Atomicity of updates to XXX_delay, XXX_count protected by
         * single lock above (split into XXX_lock if contention is an issue).
         */

        /*
         * XXX_count is incremented on every XXX operation, the delay
         * associated with the operation is added to XXX_delay.
         * XXX_delay contains the accumulated delay time in nanoseconds.
         */
        u64 blkio_start;        /* Shared by blkio, swapin */
        u64 blkio_delay;        /* wait for sync block io completion */
        u64 swapin_delay;        /* wait for swapin block io completion */
        u32 blkio_count;        /* total count of the number of sync block */
                                /* io operations performed */
        u32 swapin_count;        /* total count of the number of swapin block */
                                /* io operations performed */

        u64 freepages_start;
        u64 freepages_delay;        /* wait for memory reclaim */

        u64 thrashing_start;
        u64 thrashing_delay;        /* wait for thrashing page */

        u32 freepages_count;        /* total count of memory reclaim */
        u32 thrashing_count;        /* total count of thrash waits */
};
#endif

#include <linux/sched.h>
#include <linux/slab.h>

#ifdef CONFIG_TASK_DELAY_ACCT
extern int delayacct_on;        /* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
extern void __delayacct_thrashing_start(void);
extern void __delayacct_thrashing_end(void);

/*
 * Return non-zero if @p has DELAYACCT_PF_BLKIO set in its delay-accounting
 * flags, 0 if the flag is clear or @p has no delay-accounting state.
 */
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
        if (!p->delays)
                return 0;

        return (p->delays->flags & DELAYACCT_PF_BLKIO);
}

/*
 * Set @flag (a DELAYACCT_PF_* bit) in the current task's delay-accounting
 * flags. Does nothing when current has no delay-accounting state.
 */
static inline void delayacct_set_flag(int flag)
{
        if (!current->delays)
                return;

        current->delays->flags |= flag;
}

/*
 * Clear @flag (a DELAYACCT_PF_* bit) in the current task's delay-accounting
 * flags. Does nothing when current has no delay-accounting state.
 */
static inline void delayacct_clear_flag(int flag)
{
        if (!current->delays)
                return;

        current->delays->flags &= ~flag;
}

/*
 * Initialize delay accounting for @tsk: ->delays is always reset to NULL,
 * and __delayacct_tsk_init() is called only when delayacct_on is non-zero.
 */
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
        /* The pointer may have been copied from the parent; never reuse it. */
        tsk->delays = NULL;

        if (!delayacct_on)
                return;

        __delayacct_tsk_init(tsk);
}

/*
 * Release @tsk's delay-accounting state back to delayacct_cache and reset
 * the pointer. Called from bad fork and __put_task_struct, where nothing
 * else can be accessing tsk->delays.
 */
static inline void delayacct_tsk_free(struct task_struct *tsk)
{
        struct task_delay_info *info = tsk->delays;

        if (info)
                kmem_cache_free(delayacct_cache, info);

        tsk->delays = NULL;
}

/*
 * Mark the current task as waiting on block I/O (DELAYACCT_PF_BLKIO) and,
 * if it has delay-accounting state, call __delayacct_blkio_start().
 */
static inline void delayacct_blkio_start(void)
{
        delayacct_set_flag(DELAYACCT_PF_BLKIO);

        if (!current->delays)
                return;

        __delayacct_blkio_start();
}

/*
 * Finish block-I/O delay accounting for task @p.
 *
 * Unlike delayacct_blkio_start(), which always acts on current, this takes
 * the task explicitly, so @p may differ from current. The accounting call
 * and the DELAYACCT_PF_BLKIO flag update must therefore both target @p;
 * clearing the flag on current would leave @p marked as waiting on I/O and
 * could clear the flag of an unrelated task.
 *
 * Does nothing when @p has no delay-accounting state.
 */
static inline void delayacct_blkio_end(struct task_struct *p)
{
        if (!p->delays)
                return;

        __delayacct_blkio_end(p);
        /* Clear the flag on the accounted task, not on current. */
        p->delays->flags &= ~DELAYACCT_PF_BLKIO;
}

/*
 * Hand @tsk to __delayacct_add_tsk() together with the taskstats buffer @d
 * and return its result. Returns 0 without calling it when delay accounting
 * is off (delayacct_on == 0) or @tsk has no delay-accounting state.
 */
static inline int delayacct_add_tsk(struct taskstats *d,
                                        struct task_struct *tsk)
{
        if (delayacct_on && tsk->delays)
                return __delayacct_add_tsk(d, tsk);

        return 0;
}

/*
 * Return __delayacct_blkio_ticks(@tsk), or 0 when @tsk has no
 * delay-accounting state.
 */
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        if (!tsk->delays)
                return 0;

        return __delayacct_blkio_ticks(tsk);
}

/*
 * Call __delayacct_freepages_start() for the current task, provided it has
 * delay-accounting state.
 */
static inline void delayacct_freepages_start(void)
{
        if (!current->delays)
                return;

        __delayacct_freepages_start();
}

/*
 * Call __delayacct_freepages_end() for the current task, provided it has
 * delay-accounting state.
 */
static inline void delayacct_freepages_end(void)
{
        if (!current->delays)
                return;

        __delayacct_freepages_end();
}

/*
 * Call __delayacct_thrashing_start() for the current task, provided it has
 * delay-accounting state.
 */
static inline void delayacct_thrashing_start(void)
{
        if (!current->delays)
                return;

        __delayacct_thrashing_start();
}

/*
 * Call __delayacct_thrashing_end() for the current task, provided it has
 * delay-accounting state.
 */
static inline void delayacct_thrashing_end(void)
{
        if (!current->delays)
                return;

        __delayacct_thrashing_end();
}

#else
/*
 * CONFIG_TASK_DELAY_ACCT is not set: every delay-accounting hook is an
 * empty stub and every query helper returns 0.
 */
static inline void delayacct_set_flag(int flag)
{
}

static inline void delayacct_clear_flag(int flag)
{
}

static inline void delayacct_init(void)
{
}

static inline void delayacct_tsk_init(struct task_struct *tsk)
{
}

static inline void delayacct_tsk_free(struct task_struct *tsk)
{
}

static inline void delayacct_blkio_start(void)
{
}

static inline void delayacct_blkio_end(struct task_struct *p)
{
}

static inline int delayacct_add_tsk(struct taskstats *d,
                                        struct task_struct *tsk)
{
        return 0;
}

static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        return 0;
}

static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
        return 0;
}

static inline void delayacct_freepages_start(void)
{
}

static inline void delayacct_freepages_end(void)
{
}

static inline void delayacct_thrashing_start(void)
{
}

static inline void delayacct_thrashing_end(void)
{
}

#endif /* CONFIG_TASK_DELAY_ACCT */

#endif






















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Hash algorithms.
 * 
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_INTERNAL_HASH_H
#define _CRYPTO_INTERNAL_HASH_H

#include <crypto/algapi.h>
#include <crypto/hash.h>

struct ahash_request;
struct scatterlist;

/*
 * Iteration state passed to crypto_hash_walk_first() and
 * crypto_hash_walk_done().
 *
 * crypto_hash_walk_last() treats the walk as finished once both @entrylen
 * and @total are zero. The remaining field meanings are not established by
 * this header; the notes below are presumptions to confirm against the
 * walk implementation.
 */
struct crypto_hash_walk {
        char *data;                /* presumably the currently mapped chunk */

        unsigned int offset;
        unsigned int alignmask;

        struct page *pg;
        unsigned int entrylen;        /* zero (with @total) means walk is done */

        unsigned int total;        /* zero (with @entrylen) means walk is done */
        struct scatterlist *sg;

        unsigned int flags;
};

/*
 * An ahash algorithm instance.
 *
 * The anonymous union overlays two views of the same storage: @alg is the
 * full struct ahash_alg, while @s pads with @s.head up to the offset of
 * halg.base inside struct ahash_alg so that @s.base (the generic
 * struct crypto_instance) begins exactly where @alg.halg.base does.
 * ahash_crypto_instance() and ahash_instance() convert between the views.
 */
struct ahash_instance {
        /* Callback taking the instance; presumably its destructor - confirm. */
        void (*free)(struct ahash_instance *inst);
        union {
                struct {
                        char head[offsetof(struct ahash_alg, halg.base)];
                        struct crypto_instance base;
                } s;
                struct ahash_alg alg;
        };
};

/*
 * A shash algorithm instance.
 *
 * The anonymous union overlays two views of the same storage: @alg is the
 * full struct shash_alg, while @s pads with @s.head up to the offset of
 * base inside struct shash_alg so that @s.base (the generic
 * struct crypto_instance) begins exactly where @alg.base does.
 * shash_crypto_instance() and shash_instance() convert between the views.
 */
struct shash_instance {
        /* Callback taking the instance; presumably its destructor - confirm. */
        void (*free)(struct shash_instance *inst);
        union {
                struct {
                        char head[offsetof(struct shash_alg, base)];
                        struct crypto_instance base;
                } s;
                struct shash_alg alg;
        };
};

/*
 * Type-safe wrapper around a generic struct crypto_spawn for ahash users;
 * the ahash spawn helpers in this header operate on @base.
 */
struct crypto_ahash_spawn {
        struct crypto_spawn base;
};

/*
 * Type-safe wrapper around a generic struct crypto_spawn for shash users;
 * the shash spawn helpers in this header operate on @base.
 */
struct crypto_shash_spawn {
        struct crypto_spawn base;
};

int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
int crypto_hash_walk_first(struct ahash_request *req,
                           struct crypto_hash_walk *walk);

/*
 * Return 1 when the walk has nothing left (both entrylen and total are
 * zero), 0 otherwise.
 */
static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk)
{
        return walk->entrylen == 0 && walk->total == 0;
}

int crypto_register_ahash(struct ahash_alg *alg);
void crypto_unregister_ahash(struct ahash_alg *alg);
int crypto_register_ahashes(struct ahash_alg *algs, int count);
void crypto_unregister_ahashes(struct ahash_alg *algs, int count);
int ahash_register_instance(struct crypto_template *tmpl,
                            struct ahash_instance *inst);

bool crypto_shash_alg_has_setkey(struct shash_alg *alg);

static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg)
{
        return crypto_shash_alg_has_setkey(alg) &&
                !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY);
}

bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg);

int crypto_grab_ahash(struct crypto_ahash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);

/* Drop the ahash spawn by passing its generic spawn to crypto_drop_spawn(). */
static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

/*
 * Return the hash_alg_common view of the algorithm referenced by @spawn
 * (spawn->base.alg).
 */
static inline struct hash_alg_common *crypto_spawn_ahash_alg(
        struct crypto_ahash_spawn *spawn)
{
        struct crypto_alg *alg = spawn->base.alg;

        return __crypto_hash_alg_common(alg);
}

int crypto_register_shash(struct shash_alg *alg);
void crypto_unregister_shash(struct shash_alg *alg);
int crypto_register_shashes(struct shash_alg *algs, int count);
void crypto_unregister_shashes(struct shash_alg *algs, int count);
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst);
void shash_free_singlespawn_instance(struct shash_instance *inst);

int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);

/* Drop the shash spawn by passing its generic spawn to crypto_drop_spawn(). */
static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

/*
 * Return the shash_alg view of the algorithm referenced by @spawn
 * (spawn->base.alg).
 */
static inline struct shash_alg *crypto_spawn_shash_alg(
        struct crypto_shash_spawn *spawn)
{
        struct crypto_alg *alg = spawn->base.alg;

        return __crypto_shash_alg(alg);
}

int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc);
int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc);
int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc);

int crypto_init_shash_ops_async(struct crypto_tfm *tfm);

/* Return the context area of the generic tfm underlying the ahash @tfm. */
static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_ctx(base);
}

/*
 * Convert a generic crypto_alg into its enclosing ahash_alg: first obtain
 * the hash_alg_common view, then step out to the ahash_alg that embeds it
 * as its halg member.
 */
static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg)
{
        struct hash_alg_common *halg = __crypto_hash_alg_common(alg);

        return container_of(halg, struct ahash_alg, halg);
}

/* Store @reqsize in @tfm->reqsize. */
static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm,
                                            unsigned int reqsize)
{
        tfm->reqsize = reqsize;
}

/* Return the generic crypto_instance embedded in @inst (its s.base view). */
static inline struct crypto_instance *ahash_crypto_instance(
        struct ahash_instance *inst)
{
        struct crypto_instance *base = &inst->s.base;

        return base;
}

/*
 * Inverse of ahash_crypto_instance(): recover the ahash_instance whose
 * s.base member is @inst.
 */
static inline struct ahash_instance *ahash_instance(
        struct crypto_instance *inst)
{
        struct ahash_instance *outer;

        outer = container_of(inst, struct ahash_instance, s.base);
        return outer;
}

/*
 * Return the ahash_instance behind @ahash: look up the crypto_instance of
 * its base tfm, then convert it with ahash_instance().
 */
static inline struct ahash_instance *ahash_alg_instance(
        struct crypto_ahash *ahash)
{
        struct crypto_instance *inst = crypto_tfm_alg_instance(&ahash->base);

        return ahash_instance(inst);
}

/* Return the context area of the crypto_instance embedded in @inst. */
static inline void *ahash_instance_ctx(struct ahash_instance *inst)
{
        struct crypto_instance *base = ahash_crypto_instance(inst);

        return crypto_instance_ctx(base);
}

static inline void ahash_request_complete(struct ahash_request *req, int err)
{
        req->base.complete(&req->base, err);
}

/* Return the flags word of @req's base request. */
static inline u32 ahash_request_flags(struct ahash_request *req)
{
        u32 flags = req->base.flags;

        return flags;
}

/*
 * Obtain an ahash transform from @spawn via crypto_spawn_tfm2() on its
 * generic spawn; the result is returned unchanged.
 */
static inline struct crypto_ahash *crypto_spawn_ahash(
        struct crypto_ahash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        return crypto_spawn_tfm2(base);
}

/*
 * Queue @request's base request on @queue and return
 * crypto_enqueue_request()'s result.
 */
static inline int ahash_enqueue_request(struct crypto_queue *queue,
                                             struct ahash_request *request)
{
        struct crypto_async_request *base = &request->base;

        return crypto_enqueue_request(queue, base);
}

/*
 * Take the next request off @queue with crypto_dequeue_request() and
 * convert it to an ahash_request with ahash_request_cast().
 */
static inline struct ahash_request *ahash_dequeue_request(
        struct crypto_queue *queue)
{
        struct crypto_async_request *base = crypto_dequeue_request(queue);

        return ahash_request_cast(base);
}

/* Return the context area of the generic tfm embedded in the shash @tfm. */
static inline void *crypto_shash_ctx(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return crypto_tfm_ctx(base);
}

/* Return the generic crypto_instance embedded in @inst (its s.base view). */
static inline struct crypto_instance *shash_crypto_instance(
        struct shash_instance *inst)
{
        struct crypto_instance *base = &inst->s.base;

        return base;
}

/*
 * Inverse of shash_crypto_instance(): recover the shash_instance whose
 * s.base member is @inst.
 */
static inline struct shash_instance *shash_instance(
        struct crypto_instance *inst)
{
        struct shash_instance *outer;

        outer = container_of(inst, struct shash_instance, s.base);
        return outer;
}

/*
 * Return the shash_instance behind @shash: look up the crypto_instance of
 * its base tfm, then convert it with shash_instance().
 */
static inline struct shash_instance *shash_alg_instance(
        struct crypto_shash *shash)
{
        struct crypto_instance *inst = crypto_tfm_alg_instance(&shash->base);

        return shash_instance(inst);
}

/* Return the context area of the crypto_instance embedded in @inst. */
static inline void *shash_instance_ctx(struct shash_instance *inst)
{
        struct crypto_instance *base = shash_crypto_instance(inst);

        return crypto_instance_ctx(base);
}

/*
 * Obtain a shash transform from @spawn via crypto_spawn_tfm2() on its
 * generic spawn; the result is returned unchanged.
 */
static inline struct crypto_shash *crypto_spawn_shash(
        struct crypto_shash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        return crypto_spawn_tfm2(base);
}

/*
 * Return crypto_tfm_ctx_aligned() of the generic tfm embedded in the shash
 * @tfm.
 */
static inline void *crypto_shash_ctx_aligned(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return crypto_tfm_ctx_aligned(base);
}

/* Recover the crypto_shash whose embedded base member is @tfm. */
static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
{
        struct crypto_shash *shash;

        shash = container_of(tfm, struct crypto_shash, base);
        return shash;
}

#endif        /* _CRYPTO_INTERNAL_HASH_H */















































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 */
#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
#include <linux/uio.h>
#include <linux/uaccess.h>

#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/sg.h>

/*
 * SCSI command permission filter: two bitmaps indexed by the first CDB byte
 * (the opcode), consulted by blk_verify_command().
 */
struct blk_cmd_filter {
        /* Opcodes allowed regardless of the open mode. */
        unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
        /* Opcodes allowed only when the device was opened with FMODE_WRITE. */
        unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
};

static struct blk_cmd_filter blk_default_cmd_filter;

/*
 * Exported table of eight command sizes (values 6, 10, 12 and 16).
 * NOTE(review): presumably CDB lengths in bytes, indexed by SCSI command
 * group; the lookup is not in this file, so confirm against the users of
 * the table.
 *
 * Command group 3 is reserved and should never be used.
 */
const unsigned char scsi_command_size_tbl[8] =
{
        6, 10, 10, 12,
        16, 12, 10, 10
};
EXPORT_SYMBOL(scsi_command_size_tbl);

/*
 * Copy the fixed sg version number (30527) to the user-space int at @p.
 * Returns the result of put_user().
 */
static int sg_get_version(int __user *p)
{
        const int version = 30527;

        return put_user(version, p);
}

/*
 * Always reports 0 to the user-space int at @p; @q is not consulted.
 * Returns the result of put_user().
 */
static int scsi_get_idlun(struct request_queue *q, int __user *p)
{
        int err = put_user(0, p);

        return err;
}

/*
 * Always reports 0 to the user-space int at @p; @q is not consulted.
 * Returns the result of put_user().
 */
static int scsi_get_bus(struct request_queue *q, int __user *p)
{
        int err = put_user(0, p);

        return err;
}

/*
 * Return the queue's SG timeout, converted from jiffies with
 * jiffies_to_clock_t().
 */
static int sg_get_timeout(struct request_queue *q)
{
        unsigned int timeout = q->sg_timeout;

        return jiffies_to_clock_t(timeout);
}

/*
 * Read an int timeout from user space at @p and store it, converted with
 * clock_t_to_jiffies(), as the queue's SG timeout.
 *
 * Returns 0 on success or the get_user() error; the queue is left untouched
 * on error.
 */
static int sg_set_timeout(struct request_queue *q, int __user *p)
{
        int timeout;
        int err;

        err = get_user(timeout, p);
        if (err)
                return err;

        q->sg_timeout = clock_t_to_jiffies(timeout);
        return 0;
}

static int max_sectors_bytes(struct request_queue *q)
{
        unsigned int max_sectors = queue_max_sectors(q);

        max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);

        return max_sectors << 9;
}

/*
 * Report the queue's reserved SG buffer size to user space at @p, limited
 * to max_sectors_bytes(@q). Both values are compared as int.
 * Returns the result of put_user().
 */
static int sg_get_reserved_size(struct request_queue *q, int __user *p)
{
        int reserved = q->sg_reserved_size;
        int limit = max_sectors_bytes(q);
        int val = reserved < limit ? reserved : limit;

        return put_user(val, p);
}

/*
 * Read an int size from user space at @p and store it as the queue's
 * reserved SG buffer size, limited to max_sectors_bytes(@q).
 *
 * Returns 0 on success, the get_user() error if the read fails, or -EINVAL
 * for a negative size.
 */
static int sg_set_reserved_size(struct request_queue *q, int __user *p)
{
        int size;
        int limit;
        int err;

        err = get_user(size, p);
        if (err)
                return err;
        if (size < 0)
                return -EINVAL;

        limit = max_sectors_bytes(q);
        q->sg_reserved_size = size < limit ? size : limit;
        return 0;
}

/*
 * Unconditionally reports 1 (i.e. "ATAPI") to user space at @p, even for a
 * real SCSI drive; @q is not consulted. The original author judged this
 * not worth distinguishing. Returns the result of put_user().
 */
static int sg_emulated_host(struct request_queue *q, int __user *p)
{
        int err = put_user(1, p);

        return err;
}

static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
{
        /* Basic read-only commands */
        __set_bit(TEST_UNIT_READY, filter->read_ok);
        __set_bit(REQUEST_SENSE, filter->read_ok);
        __set_bit(READ_6, filter->read_ok);
        __set_bit(READ_10, filter->read_ok);
        __set_bit(READ_12, filter->read_ok);
        __set_bit(READ_16, filter->read_ok);
        __set_bit(READ_BUFFER, filter->read_ok);
        __set_bit(READ_DEFECT_DATA, filter->read_ok);
        __set_bit(READ_CAPACITY, filter->read_ok);
        __set_bit(READ_LONG, filter->read_ok);
        __set_bit(INQUIRY, filter->read_ok);
        __set_bit(MODE_SENSE, filter->read_ok);
        __set_bit(MODE_SENSE_10, filter->read_ok);
        __set_bit(LOG_SENSE, filter->read_ok);
        __set_bit(START_STOP, filter->read_ok);
        __set_bit(GPCMD_VERIFY_10, filter->read_ok);
        __set_bit(VERIFY_16, filter->read_ok);
        __set_bit(REPORT_LUNS, filter->read_ok);
        __set_bit(SERVICE_ACTION_IN_16, filter->read_ok);
        __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
        __set_bit(MAINTENANCE_IN, filter->read_ok);
        __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);

        /* Audio CD commands */
        __set_bit(GPCMD_PLAY_CD, filter->read_ok);
        __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
        __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
        __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
        __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);

        /* CD/DVD data reading */
        __set_bit(GPCMD_READ_CD, filter->read_ok);
        __set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
        __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
        __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
        __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
        __set_bit(GPCMD_READ_HEADER, filter->read_ok);
        __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
        __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
        __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
        __set_bit(GPCMD_REPORT_KEY, filter->read_ok);
        __set_bit(GPCMD_SCAN, filter->read_ok);
        __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
        __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
        __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
        __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
        __set_bit(GPCMD_SEEK, filter->read_ok);
        __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);

        /* Basic writing commands */
        __set_bit(WRITE_6, filter->write_ok);
        __set_bit(WRITE_10, filter->write_ok);
        __set_bit(WRITE_VERIFY, filter->write_ok);
        __set_bit(WRITE_12, filter->write_ok);
        __set_bit(WRITE_VERIFY_12, filter->write_ok);
        __set_bit(WRITE_16, filter->write_ok);
        __set_bit(WRITE_LONG, filter->write_ok);
        __set_bit(WRITE_LONG_2, filter->write_ok);
        __set_bit(WRITE_SAME, filter->write_ok);
        __set_bit(WRITE_SAME_16, filter->write_ok);
        __set_bit(WRITE_SAME_32, filter->write_ok);
        __set_bit(ERASE, filter->write_ok);
        __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
        __set_bit(MODE_SELECT, filter->write_ok);
        __set_bit(LOG_SELECT, filter->write_ok);
        __set_bit(GPCMD_BLANK, filter->write_ok);
        __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
        __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
        __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
        __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
        __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
        __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
        __set_bit(GPCMD_SEND_EVENT, filter->write_ok);
        __set_bit(GPCMD_SEND_KEY, filter->write_ok);
        __set_bit(GPCMD_SEND_OPC, filter->write_ok);
        __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
        __set_bit(GPCMD_SET_SPEED, filter->write_ok);
        __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
        __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
        __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
        __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);

        /* ZBC Commands */
        __set_bit(ZBC_OUT, filter->write_ok);
        __set_bit(ZBC_IN, filter->read_ok);
}

/*
 * Decide whether the caller may issue the SCSI command @cmd, given the mode
 * the device was opened with.
 *
 * Only the opcode, cmd[0], is examined. The command is allowed when:
 *   - the caller has CAP_SYS_RAWIO, or
 *   - the opcode is in the default filter's read_ok set, or
 *   - the opcode is in the write_ok set and @mode includes FMODE_WRITE.
 *
 * Returns 0 if allowed, -EPERM otherwise.
 */
int blk_verify_command(unsigned char *cmd, fmode_t mode)
{
        struct blk_cmd_filter *filter = &blk_default_cmd_filter;
        unsigned char opcode = cmd[0];

        /* The capability check comes first and is always performed. */
        if (capable(CAP_SYS_RAWIO) || test_bit(opcode, filter->read_ok))
                return 0;

        /* Everything else needs a writable open plus a write-safe opcode. */
        if (!(mode & FMODE_WRITE))
                return -EPERM;

        return test_bit(opcode, filter->write_ok) ? 0 : -EPERM;
}
EXPORT_SYMBOL(blk_verify_command);

/*
 * Fill the SCSI request behind @rq from the user's SG_IO header @hdr.
 *
 * Copies hdr->cmd_len bytes of CDB from user space into req->cmd (the
 * caller must have made req->cmd large enough), checks the command with
 * blk_verify_command(), then records the CDB length and picks a timeout.
 *
 * Timeout selection: hdr->timeout (milliseconds) if it converts to a
 * non-zero jiffies value, else the queue's sg_timeout, else
 * BLK_DEFAULT_SG_TIMEOUT; the result is never below BLK_MIN_SG_TIMEOUT.
 *
 * Returns 0 on success, -EFAULT if the CDB cannot be copied, -EPERM if the
 * command is not permitted for @mode.
 */
static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
                             struct sg_io_hdr *hdr, fmode_t mode)
{
        struct scsi_request *sreq = scsi_req(rq);

        if (copy_from_user(sreq->cmd, hdr->cmdp, hdr->cmd_len))
                return -EFAULT;
        if (blk_verify_command(sreq->cmd, mode))
                return -EPERM;

        sreq->cmd_len = hdr->cmd_len;

        rq->timeout = msecs_to_jiffies(hdr->timeout);
        if (!rq->timeout) {
                /* No per-request timeout: try the queue's, then the default. */
                rq->timeout = q->sg_timeout;
                if (!rq->timeout)
                        rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
        }
        if (rq->timeout < BLK_MIN_SG_TIMEOUT)
                rq->timeout = BLK_MIN_SG_TIMEOUT;

        return 0;
}

/*
 * Copy the outcome of the finished request @rq into the SG_IO header @hdr
 * and unmap the user buffer attached through @bio.
 *
 * The status fields are decoded from req->result, SG_INFO_CHECK is set in
 * hdr->info when any of the masked, host or driver status is non-zero, and
 * the residual count is reported. If sense data exists and the user gave a
 * sense buffer, at most hdr->mx_sb_len bytes are copied out and
 * hdr->sb_len_wr records how many.
 *
 * blk_rq_unmap_user() is always called. Returns -EFAULT if the sense copy
 * failed, otherwise the unmap result.
 */
static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
                                 struct bio *bio)
{
        struct scsi_request *sreq = scsi_req(rq);
        int unmap_err;
        int ret = 0;

        hdr->status = sreq->result & 0xff;
        hdr->masked_status = status_byte(sreq->result);
        hdr->msg_status = msg_byte(sreq->result);
        hdr->host_status = host_byte(sreq->result);
        hdr->driver_status = driver_byte(sreq->result);

        hdr->info = 0;
        if (hdr->masked_status || hdr->host_status || hdr->driver_status)
                hdr->info |= SG_INFO_CHECK;

        hdr->resid = sreq->resid_len;
        hdr->sb_len_wr = 0;

        if (sreq->sense_len && hdr->sbp) {
                int len = min((unsigned int) hdr->mx_sb_len, sreq->sense_len);

                if (copy_to_user(hdr->sbp, sreq->sense, len))
                        ret = -EFAULT;
                else
                        hdr->sb_len_wr = len;
        }

        /* Unmap even when the sense copy failed; the earlier error wins. */
        unmap_err = blk_rq_unmap_user(bio);

        return ret ? ret : unmap_err;
}

/*
 * sg_io - execute one SG_IO request synchronously
 * @q:       request queue to issue the command on
 * @bd_disk: gendisk handed to blk_execute_rq()
 * @hdr:     kernel copy of the caller's sg_io_hdr; output members are
 *           filled in before returning
 * @mode:    open mode of the file, used for the command permission check
 *
 * Returns 0 or a negative errno.  -EINVAL is returned for an interface id
 * other than 'S' or an unsupported transfer direction, -EIO when the
 * transfer is larger than the queue's hardware limit.  Errors from request
 * allocation, command setup and buffer mapping are passed through.
 */
static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
                struct sg_io_hdr *hdr, fmode_t mode)
{
        unsigned long start_time;
        ssize_t ret = 0;
        int writing = 0;
        int at_head = 0;
        struct request *rq;
        struct scsi_request *req;
        struct bio *bio;

        if (hdr->interface_id != 'S')
                return -EINVAL;

        /* queue_max_hw_sectors() is in 512-byte sectors; << 9 gives bytes */
        if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
                return -EIO;

        /* the direction is only validated when there is data to transfer */
        if (hdr->dxfer_len)
                switch (hdr->dxfer_direction) {
                default:
                        return -EINVAL;
                case SG_DXFER_TO_DEV:
                        writing = 1;
                        break;
                case SG_DXFER_TO_FROM_DEV:
                case SG_DXFER_FROM_DEV:
                        break;
                }
        if (hdr->flags & SG_FLAG_Q_AT_HEAD)
                at_head = 1;

        /* preset for the CDB allocation failure path below */
        ret = -ENOMEM;
        rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        req = scsi_req(rq);

        /* CDBs longer than BLK_MAX_CDB get a separately allocated buffer */
        if (hdr->cmd_len > BLK_MAX_CDB) {
                req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
                if (!req->cmd)
                        goto out_put_request;
        }

        ret = blk_fill_sghdr_rq(q, rq, hdr, mode);
        if (ret < 0)
                goto out_free_cdb;

        ret = 0;
        if (hdr->iovec_count) {
                /* scatter/gather: hdr->dxferp points to an iovec array */
                struct iov_iter i;
                struct iovec *iov = NULL;

                ret = import_iovec(rq_data_dir(rq), hdr->dxferp,
                                   hdr->iovec_count, 0, &iov, &i);
                if (ret < 0)
                        goto out_free_cdb;

                /* SG_IO howto says that the shorter of the two wins */
                iov_iter_truncate(&i, hdr->dxfer_len);

                ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL);
                kfree(iov);
        } else if (hdr->dxfer_len)
                ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
                                      GFP_KERNEL);

        if (ret)
                goto out_free_cdb;

        /* remember the bio now; it is needed for unmapping after completion */
        bio = rq->bio;
        req->retries = 0;

        start_time = jiffies;

        /*
         * The return value is deliberately ignored: all status information
         * is passed back to the caller through @hdr, and it is the caller's
         * responsibility to check it.
         * N.B. a non-zero SCSI status is _not_ necessarily an error.
         */
        blk_execute_rq(q, bd_disk, rq, at_head);

        hdr->duration = jiffies_to_msecs(jiffies - start_time);

        ret = blk_complete_sghdr_rq(rq, hdr, bio);

out_free_cdb:
        scsi_req_free_cmd(req);
out_put_request:
        blk_put_request(rq);
        return ret;
}

/**
 * sg_scsi_ioctl  --  handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
 * @q:                request queue to send scsi commands down
 * @disk:        gendisk to operate on (optional)
 * @mode:        mode used to open the file through which the ioctl has been
 *                submitted
 * @sic:        userspace structure describing the command to perform
 *
 * Send down the scsi command described by @sic to the device below
 * the request queue @q.  @mode is passed to blk_verify_command() to
 * perform the permission checks that allow users to send down
 * non-destructive SCSI commands.  If the caller has a struct gendisk
 * available it should be passed in as @disk to allow the low level
 * driver to use the information contained in it.  A NULL @disk
 * is only allowed if the caller knows that the low level driver doesn't
 * need it (e.g. in the scsi subsystem).
 *
 * Notes:
 *   -  This interface is deprecated - users should use the SG_IO
 *      interface instead, as this is a more flexible approach to
 *      performing SCSI commands on a device.
 *   -  The SCSI command length is determined by examining the 1st byte
 *      of the given command. There is no way to override this.
 *   -  Data transfers are limited to PAGE_SIZE
 *   -  The length (x + y) must be at least OMAX_SB_LEN bytes long to
 *      accommodate the sense buffer when an error occurs.
 *      The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
 *      old code will not be surprised.
 *   -  If a Unix error occurs (e.g. ENOMEM) then the user will receive
 *      a negative return and the Unix error code in 'errno'.
 *      If the SCSI command succeeds then 0 is returned.
 *      Positive numbers returned are the compacted SCSI error codes (4
 *      bytes in one int) where the lowest byte is the SCSI status.
 */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
                struct scsi_ioctl_command __user *sic)
{
        enum { OMAX_SB_LEN = 16 };        /* For backward compatibility */
        struct request *rq;
        struct scsi_request *req;
        int err;
        unsigned int in_len, out_len, bytes, opcode, cmdlen;
        char *buffer = NULL;

        if (!sic)
                return -EINVAL;

        /*
         * get the in and out lengths, verify they don't exceed a page worth
         * of data
         */
        if (get_user(in_len, &sic->inlen))
                return -EFAULT;
        if (get_user(out_len, &sic->outlen))
                return -EFAULT;
        if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
                return -EINVAL;
        /* the first data byte is the opcode; it determines the CDB length */
        if (get_user(opcode, sic->data))
                return -EFAULT;

        /* a single bounce buffer serves both the data-out and data-in phase */
        bytes = max(in_len, out_len);
        if (bytes) {
                buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
                if (!buffer)
                        return -ENOMEM;

        }

        rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto error_free_buffer;
        }
        req = scsi_req(rq);

        cmdlen = COMMAND_SIZE(opcode);

        /*
         * get command and data to send to device, if any
         */
        err = -EFAULT;
        req->cmd_len = cmdlen;
        if (copy_from_user(req->cmd, sic->data, cmdlen))
                goto error;

        /* outgoing data follows the CDB directly in sic->data */
        if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
                goto error;

        err = blk_verify_command(req->cmd, mode);
        if (err)
                goto error;

        /* default; possibly overridden below */
        req->retries = 5;

        /* pick a per-opcode timeout (and retry count where it differs) */
        switch (opcode) {
        case SEND_DIAGNOSTIC:
        case FORMAT_UNIT:
                rq->timeout = FORMAT_UNIT_TIMEOUT;
                req->retries = 1;
                break;
        case START_STOP:
                rq->timeout = START_STOP_TIMEOUT;
                break;
        case MOVE_MEDIUM:
                rq->timeout = MOVE_MEDIUM_TIMEOUT;
                break;
        case READ_ELEMENT_STATUS:
                rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
                break;
        case READ_DEFECT_DATA:
                rq->timeout = READ_DEFECT_DATA_TIMEOUT;
                req->retries = 1;
                break;
        default:
                rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
                break;
        }

        /* a mapping failure is reported as a positive driver error code */
        if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
                err = DRIVER_ERROR << 24;
                goto error;
        }

        blk_execute_rq(q, disk, rq, 0);

        err = req->result & 0xff;        /* only 8 bit SCSI status */
        if (err) {
                /* on failure the (truncated) sense data replaces the data area */
                if (req->sense_len && req->sense) {
                        bytes = (OMAX_SB_LEN > req->sense_len) ?
                                req->sense_len : OMAX_SB_LEN;
                        if (copy_to_user(sic->data, req->sense, bytes))
                                err = -EFAULT;
                }
        } else {
                if (copy_to_user(sic->data, buffer, out_len))
                        err = -EFAULT;
        }
        
error:
        blk_put_request(rq);

error_free_buffer:
        kfree(buffer);

        return err;
}
EXPORT_SYMBOL_GPL(sg_scsi_ioctl);

/* Send basic block requests */
static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
                              int cmd, int data)
{
        struct request *rq;
        int err;

        rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
        scsi_req(rq)->cmd[0] = cmd;
        scsi_req(rq)->cmd[4] = data;
        scsi_req(rq)->cmd_len = 6;
        blk_execute_rq(q, bd_disk, rq, 0);
        err = scsi_req(rq)->result ? -EIO : 0;
        blk_put_request(rq);

        return err;
}

/*
 * Issue GPCMD_START_STOP_UNIT with @data as CDB byte 4; see
 * __blk_send_generic() for the return value.
 */
static inline int blk_send_start_stop(struct request_queue *q,
                                      struct gendisk *bd_disk, int data)
{
        const int opcode = GPCMD_START_STOP_UNIT;

        return __blk_send_generic(q, bd_disk, opcode, data);
}

/*
 * put_sg_io_hdr - copy a kernel sg_io_hdr out to user space
 * @hdr:  header to report
 * @argp: user buffer receiving the header
 *
 * For a compat syscall the header is first converted to the
 * struct compat_sg_io_hdr layout, with pointers narrowed via uintptr_t.
 *
 * Returns 0 on success or -EFAULT if the user buffer cannot be written.
 */
int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp)
{
#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sg_io_hdr compat_hdr = {
                        .interface_id    = hdr->interface_id,
                        .dxfer_direction = hdr->dxfer_direction,
                        .cmd_len         = hdr->cmd_len,
                        .mx_sb_len       = hdr->mx_sb_len,
                        .iovec_count     = hdr->iovec_count,
                        .dxfer_len       = hdr->dxfer_len,
                        .dxferp          = (uintptr_t)hdr->dxferp,
                        .cmdp            = (uintptr_t)hdr->cmdp,
                        .sbp             = (uintptr_t)hdr->sbp,
                        .timeout         = hdr->timeout,
                        .flags           = hdr->flags,
                        .pack_id         = hdr->pack_id,
                        .usr_ptr         = (uintptr_t)hdr->usr_ptr,
                        .status          = hdr->status,
                        .masked_status   = hdr->masked_status,
                        .msg_status      = hdr->msg_status,
                        .sb_len_wr       = hdr->sb_len_wr,
                        .host_status     = hdr->host_status,
                        .driver_status   = hdr->driver_status,
                        .resid           = hdr->resid,
                        .duration        = hdr->duration,
                        .info            = hdr->info,
                };

                return copy_to_user(argp, &compat_hdr, sizeof(compat_hdr)) ?
                        -EFAULT : 0;
        }
#endif

        return copy_to_user(argp, hdr, sizeof(*hdr)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(put_sg_io_hdr);

/*
 * get_sg_io_hdr - fetch an sg_io_hdr from user space
 * @hdr:  kernel structure to fill in
 * @argp: user buffer holding the header
 *
 * For a compat syscall the user buffer is read as struct compat_sg_io_hdr
 * and widened field by field, with pointers converted through compat_ptr().
 *
 * Returns 0 on success or -EFAULT if the user buffer cannot be read.
 */
int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp)
{
#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sg_io_hdr compat_hdr;

                if (copy_from_user(&compat_hdr, argp, sizeof(compat_hdr)))
                        return -EFAULT;

                *hdr = (struct sg_io_hdr) {
                        .interface_id    = compat_hdr.interface_id,
                        .dxfer_direction = compat_hdr.dxfer_direction,
                        .cmd_len         = compat_hdr.cmd_len,
                        .mx_sb_len       = compat_hdr.mx_sb_len,
                        .iovec_count     = compat_hdr.iovec_count,
                        .dxfer_len       = compat_hdr.dxfer_len,
                        .dxferp          = compat_ptr(compat_hdr.dxferp),
                        .cmdp            = compat_ptr(compat_hdr.cmdp),
                        .sbp             = compat_ptr(compat_hdr.sbp),
                        .timeout         = compat_hdr.timeout,
                        .flags           = compat_hdr.flags,
                        .pack_id         = compat_hdr.pack_id,
                        .usr_ptr         = compat_ptr(compat_hdr.usr_ptr),
                        .status          = compat_hdr.status,
                        .masked_status   = compat_hdr.masked_status,
                        .msg_status      = compat_hdr.msg_status,
                        .sb_len_wr       = compat_hdr.sb_len_wr,
                        .host_status     = compat_hdr.host_status,
                        .driver_status   = compat_hdr.driver_status,
                        .resid           = compat_hdr.resid,
                        .duration        = compat_hdr.duration,
                        .info            = compat_hdr.info,
                };

                return 0;
        }
#endif

        return copy_from_user(hdr, argp, sizeof(*hdr)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(get_sg_io_hdr);

#ifdef CONFIG_COMPAT
/*
 * Layout of struct cdrom_generic_command as passed in by a compat task.
 * Pointer members are carried as compat_caddr_t and converted with
 * compat_ptr() / uintptr_t casts by scsi_get_cdrom_generic_arg() and
 * scsi_put_cdrom_generic_arg().
 */
struct compat_cdrom_generic_command {
        unsigned char        cmd[CDROM_PACKET_SIZE];
        compat_caddr_t        buffer;
        compat_uint_t        buflen;
        compat_int_t        stat;
        compat_caddr_t        sense;
        unsigned char        data_direction;
        unsigned char        pad[3];        /* explicit padding, never copied */
        compat_int_t        quiet;
        compat_int_t        timeout;
        compat_caddr_t        unused;
};
#endif

/*
 * Read a cdrom_generic_command from user space into @cgc, converting from
 * the compat layout when called from a compat syscall.
 *
 * Returns 0 on success or -EFAULT if @arg cannot be read.
 */
static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc,
                                      const void __user *arg)
{
#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_cdrom_generic_command compat_cgc;

                if (copy_from_user(&compat_cgc, arg, sizeof(compat_cgc)))
                        return -EFAULT;

                *cgc = (struct cdrom_generic_command) {
                        .buffer         = compat_ptr(compat_cgc.buffer),
                        .buflen         = compat_cgc.buflen,
                        .stat           = compat_cgc.stat,
                        .sense          = compat_ptr(compat_cgc.sense),
                        .data_direction = compat_cgc.data_direction,
                        .quiet          = compat_cgc.quiet,
                        .timeout        = compat_cgc.timeout,
                        .unused         = compat_ptr(compat_cgc.unused),
                };
                /* the packet bytes are an array and are copied separately */
                memcpy(&cgc->cmd, &compat_cgc.cmd, CDROM_PACKET_SIZE);

                return 0;
        }
#endif
        return copy_from_user(cgc, arg, sizeof(*cgc)) ? -EFAULT : 0;
}

/*
 * Write @cgc back to user space, converting to the compat layout when
 * called from a compat syscall.
 *
 * Returns 0 on success or -EFAULT if @arg cannot be written.
 */
static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc,
                                      void __user *arg)
{
#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_cdrom_generic_command compat_cgc = {
                        .buffer         = (uintptr_t)(cgc->buffer),
                        .buflen         = cgc->buflen,
                        .stat           = cgc->stat,
                        .sense          = (uintptr_t)(cgc->sense),
                        .data_direction = cgc->data_direction,
                        .quiet          = cgc->quiet,
                        .timeout        = cgc->timeout,
                        .unused         = (uintptr_t)(cgc->unused),
                };

                /* the packet bytes are an array and are copied separately */
                memcpy(&compat_cgc.cmd, &cgc->cmd, CDROM_PACKET_SIZE);

                return copy_to_user(arg, &compat_cgc, sizeof(compat_cgc)) ?
                        -EFAULT : 0;
        }
#endif
        return copy_to_user(arg, cgc, sizeof(*cgc)) ? -EFAULT : 0;
}

/*
 * Handle CDROM_SEND_PACKET by translating the caller's
 * cdrom_generic_command into an sg_io_hdr and running it through sg_io().
 *
 * Returns -EINVAL for an unknown data direction, -EFAULT if user memory
 * cannot be accessed, -EIO if the command finished with a non-zero status,
 * and otherwise the return value of sg_io().
 */
static int scsi_cdrom_send_packet(struct request_queue *q,
                                  struct gendisk *bd_disk,
                                  fmode_t mode, void __user *arg)
{
        struct cdrom_generic_command __user *ucgc = arg;
        struct cdrom_generic_command cgc;
        struct sg_io_hdr hdr;
        int err;

        err = scsi_get_cdrom_generic_arg(&cgc, arg);
        if (err)
                return err;

        /* cgc.timeout is kept in jiffies from here on */
        cgc.timeout = clock_t_to_jiffies(cgc.timeout);

        memset(&hdr, 0, sizeof(hdr));
        hdr.interface_id = 'S';
        hdr.cmd_len = sizeof(cgc.cmd);
        hdr.dxfer_len = cgc.buflen;

        switch (cgc.data_direction) {
        case CGC_DATA_UNKNOWN:
                hdr.dxfer_direction = SG_DXFER_UNKNOWN;
                break;
        case CGC_DATA_WRITE:
                hdr.dxfer_direction = SG_DXFER_TO_DEV;
                break;
        case CGC_DATA_READ:
                hdr.dxfer_direction = SG_DXFER_FROM_DEV;
                break;
        case CGC_DATA_NONE:
                hdr.dxfer_direction = SG_DXFER_NONE;
                break;
        default:
                return -EINVAL;
        }

        hdr.dxferp = cgc.buffer;
        hdr.sbp = cgc.sense;
        if (hdr.sbp)
                hdr.mx_sb_len = sizeof(struct request_sense);
        hdr.timeout = jiffies_to_msecs(cgc.timeout);
        /* sg_io() copies the CDB from user space, so point at the user copy */
        hdr.cmdp = ucgc->cmd;

        err = sg_io(q, bd_disk, &hdr, mode);
        if (err == -EFAULT)
                return -EFAULT;

        if (hdr.status)
                return -EIO;

        /* report the outcome and the residual count back to the caller */
        cgc.stat = err;
        cgc.buflen = hdr.resid;
        if (scsi_put_cdrom_generic_arg(&cgc, arg))
                return -EFAULT;

        return err;
}

/*
 * scsi_cmd_ioctl - dispatch the sg / SCSI / CD-ROM passthrough ioctls
 * @q:       request queue to operate on
 * @bd_disk: gendisk passed on to the command helpers
 * @mode:    open mode of the file the ioctl was issued on
 * @cmd:     ioctl number
 * @arg:     user-space argument of the ioctl
 *
 * Returns -ENXIO without a queue, -ENOTTY for an ioctl that is not handled
 * here, and otherwise the result of the selected handler.
 */
int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
                   unsigned int cmd, void __user *arg)
{
        if (!q)
                return -ENXIO;

        switch (cmd) {
        /*
         * new sgv3 interface
         */
        case SG_GET_VERSION_NUM:
                return sg_get_version(arg);
        case SCSI_IOCTL_GET_IDLUN:
                return scsi_get_idlun(q, arg);
        case SCSI_IOCTL_GET_BUS_NUMBER:
                return scsi_get_bus(q, arg);
        case SG_SET_TIMEOUT:
                return sg_set_timeout(q, arg);
        case SG_GET_TIMEOUT:
                return sg_get_timeout(q);
        case SG_GET_RESERVED_SIZE:
                return sg_get_reserved_size(q, arg);
        case SG_SET_RESERVED_SIZE:
                return sg_set_reserved_size(q, arg);
        case SG_EMULATED_HOST:
                return sg_emulated_host(q, arg);
        case SG_IO: {
                struct sg_io_hdr hdr;
                int err = get_sg_io_hdr(&hdr, arg);

                if (err)
                        return err;

                err = sg_io(q, bd_disk, &hdr, mode);
                if (err == -EFAULT)
                        return err;

                /* the header is written back even if sg_io() failed */
                return put_sg_io_hdr(&hdr, arg) ? -EFAULT : err;
        }
        case CDROM_SEND_PACKET:
                return scsi_cdrom_send_packet(q, bd_disk, mode, arg);

        /*
         * old junk scsi send command ioctl
         */
        case SCSI_IOCTL_SEND_COMMAND:
                printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
                if (!arg)
                        return -EINVAL;
                return sg_scsi_ioctl(q, bd_disk, mode, arg);
        case CDROMCLOSETRAY:
                return blk_send_start_stop(q, bd_disk, 0x03);
        case CDROMEJECT:
                return blk_send_start_stop(q, bd_disk, 0x02);
        default:
                return -ENOTTY;
        }
}
EXPORT_SYMBOL(scsi_cmd_ioctl);

/*
 * scsi_verify_blk_ioctl - decide whether a SCSI ioctl may be issued on @bd
 *
 * Returns 0 if @bd is a whole device (not a partition) or the caller has
 * CAP_SYS_RAWIO, and -ENOIOCTLCMD otherwise.  @cmd is not examined.
 */
int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
{
        /* capable() is only consulted when the whole-device test fails */
        if ((bd && !bdev_is_partition(bd)) || capable(CAP_SYS_RAWIO))
                return 0;

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);

/*
 * scsi_cmd_blk_ioctl - permission-checked entry point to scsi_cmd_ioctl()
 *
 * Runs scsi_verify_blk_ioctl() on @bd and, if that succeeds, forwards the
 * ioctl to scsi_cmd_ioctl() using the queue and gendisk of @bd.
 */
int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
                       unsigned int cmd, void __user *arg)
{
        struct gendisk *disk;
        int err = scsi_verify_blk_ioctl(bd, cmd);

        if (err < 0)
                return err;

        disk = bd->bd_disk;
        return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, arg);
}
EXPORT_SYMBOL(scsi_cmd_blk_ioctl);

/**
 * scsi_req_init - initialize certain fields of a scsi_request structure
 * @req: Pointer to a scsi_request structure.
 *
 * Zeroes the embedded command buffer .__cmd[], points .cmd at it, sets
 * .cmd_len to BLK_MAX_CDB and clears .sense_len.  No other member of
 * struct scsi_request is touched.
 */
void scsi_req_init(struct scsi_request *req)
{
        req->sense_len = 0;
        req->cmd_len = BLK_MAX_CDB;

        /* start out with the embedded, zero-filled CDB buffer */
        req->cmd = req->__cmd;
        memset(req->__cmd, 0, sizeof(req->__cmd));
}
EXPORT_SYMBOL(scsi_req_init);

/*
 * Initcall: load the default entries into blk_default_cmd_filter.
 * Always succeeds.
 */
static int __init blk_scsi_ioctl_init(void)
{
        struct blk_cmd_filter *filter = &blk_default_cmd_filter;

        blk_set_cmd_filter_defaults(filter);

        return 0;
}
fs_initcall(blk_scsi_ioctl_init);



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NetLabel Network Address Lists
 *
 * This file contains network address list functions used to manage ordered
 * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
 * system manages static and dynamic label mappings for network protocols such
 * as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
 */

#ifndef _NETLABEL_ADDRLIST_H
#define _NETLABEL_ADDRLIST_H

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/in6.h>
#include <linux/audit.h>

/**
 * struct netlbl_af4list - NetLabel IPv4 address list
 * @addr: IPv4 address
 * @mask: IPv4 address mask
 * @valid: valid flag; entries with a zero value are skipped by the
 *         netlbl_af4list_foreach*() iterators
 * @list: list structure, used internally
 */
struct netlbl_af4list {
        __be32 addr;
        __be32 mask;

        u32 valid;
        struct list_head list;
};

/**
 * struct netlbl_af6list - NetLabel IPv6 address list
 * @addr: IPv6 address
 * @mask: IPv6 address mask
 * @valid: valid flag; entries with a zero value are skipped by the
 *         netlbl_af6list_foreach*() iterators
 * @list: list structure, used internally
 */
struct netlbl_af6list {
        struct in6_addr addr;
        struct in6_addr mask;

        u32 valid;
        struct list_head list;
};

/* Map an embedded list_head back to its enclosing struct netlbl_af4list. */
#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list)

/*
 * Starting at node @s, return the first entry whose valid flag is set.
 * If the list head @h is reached first, the entry computed from @h itself
 * is returned; the foreach macros use that as their end-of-list marker.
 */
static inline struct netlbl_af4list *__af4list_valid(struct list_head *s,
                                                     struct list_head *h)
{
        struct list_head *pos;

        for (pos = s; pos != h; pos = pos->next) {
                struct netlbl_af4list *entry = __af4list_entry(pos);

                if (entry->valid)
                        return entry;
        }

        /* pos == h here: hand back the sentinel without dereferencing it */
        return __af4list_entry(pos);
}

/*
 * Same as __af4list_valid(), but each step to the next node goes through
 * rcu_dereference(list_next_rcu()).
 */
static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s,
                                                         struct list_head *h)
{
        struct list_head *pos;

        for (pos = s; pos != h; pos = rcu_dereference(list_next_rcu(pos))) {
                struct netlbl_af4list *entry = __af4list_entry(pos);

                if (entry->valid)
                        return entry;
        }

        /* pos == h here: hand back the sentinel without dereferencing it */
        return __af4list_entry(pos);
}

/*
 * Visit every valid entry of the IPv4 address list @head.  @iter is a
 * struct netlbl_af4list pointer used as the loop cursor; entries whose
 * valid flag is zero are skipped by __af4list_valid().
 */
#define netlbl_af4list_foreach(iter, head)                                \
        for (iter = __af4list_valid((head)->next, head);                \
             &iter->list != (head);                                        \
             iter = __af4list_valid(iter->list.next, head))

/*
 * As netlbl_af4list_foreach(), but walks the list with
 * __af4list_valid_rcu().
 * NOTE(review): presumably the caller must be inside an RCU read-side
 * critical section - confirm against the callers.
 */
#define netlbl_af4list_foreach_rcu(iter, head)                                \
        for (iter = __af4list_valid_rcu((head)->next, head);                \
             &iter->list != (head);                                        \
             iter = __af4list_valid_rcu(iter->list.next, head))

/*
 * As netlbl_af4list_foreach(), but the next valid entry is looked up and
 * stored in @tmp before the loop body runs, so the step does not read
 * @iter again after the body has executed.
 */
#define netlbl_af4list_foreach_safe(iter, tmp, head)                        \
        for (iter = __af4list_valid((head)->next, head),                \
                     tmp = __af4list_valid(iter->list.next, head);        \
             &iter->list != (head);                                        \
             iter = tmp, tmp = __af4list_valid(iter->list.next, head))

int netlbl_af4list_add(struct netlbl_af4list *entry,
                       struct list_head *head);
struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
                                             struct list_head *head);
void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
                                             struct list_head *head);
struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
                                                   __be32 mask,
                                                   struct list_head *head);

#ifdef CONFIG_AUDIT
void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
                               int src, const char *dev,
                               __be32 addr, __be32 mask);
#else
/* Without CONFIG_AUDIT the audit helper is an empty stub. */
static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
                                             int src, const char *dev,
                                             __be32 addr, __be32 mask)
{
}
#endif

#if IS_ENABLED(CONFIG_IPV6)

/* Map an embedded list_head back to its enclosing struct netlbl_af6list. */
#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list)

/*
 * Starting at node @s, return the first entry whose valid flag is set.
 * If the list head @h is reached first, the entry computed from @h itself
 * is returned; the foreach macros use that as their end-of-list marker.
 */
static inline struct netlbl_af6list *__af6list_valid(struct list_head *s,
                                                     struct list_head *h)
{
        struct list_head *pos;

        for (pos = s; pos != h; pos = pos->next) {
                struct netlbl_af6list *entry = __af6list_entry(pos);

                if (entry->valid)
                        return entry;
        }

        /* pos == h here: hand back the sentinel without dereferencing it */
        return __af6list_entry(pos);
}

/*
 * Same as __af6list_valid(), but each step to the next node goes through
 * rcu_dereference(list_next_rcu()).
 */
static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s,
                                                         struct list_head *h)
{
        struct list_head *pos;

        for (pos = s; pos != h; pos = rcu_dereference(list_next_rcu(pos))) {
                struct netlbl_af6list *entry = __af6list_entry(pos);

                if (entry->valid)
                        return entry;
        }

        /* pos == h here: hand back the sentinel without dereferencing it */
        return __af6list_entry(pos);
}

/*
 * Visit every valid entry of the IPv6 address list @head.  @iter is a
 * struct netlbl_af6list pointer used as the loop cursor; entries whose
 * valid flag is zero are skipped by __af6list_valid().
 */
#define netlbl_af6list_foreach(iter, head)                                \
        for (iter = __af6list_valid((head)->next, head);                \
             &iter->list != (head);                                        \
             iter = __af6list_valid(iter->list.next, head))

/*
 * As netlbl_af6list_foreach(), but walks the list with
 * __af6list_valid_rcu().
 * NOTE(review): presumably the caller must be inside an RCU read-side
 * critical section - confirm against the callers.
 */
#define netlbl_af6list_foreach_rcu(iter, head)                                \
        for (iter = __af6list_valid_rcu((head)->next, head);                \
             &iter->list != (head);                                        \
             iter = __af6list_valid_rcu(iter->list.next, head))

/*
 * As netlbl_af6list_foreach(), but the next valid entry is looked up and
 * stored in @tmp before the loop body runs, so the step does not read
 * @iter again after the body has executed.
 */
#define netlbl_af6list_foreach_safe(iter, tmp, head)                        \
        for (iter = __af6list_valid((head)->next, head),                \
                     tmp = __af6list_valid(iter->list.next, head);        \
             &iter->list != (head);                                        \
             iter = tmp, tmp = __af6list_valid(iter->list.next, head))

int netlbl_af6list_add(struct netlbl_af6list *entry,
                       struct list_head *head);
struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
                                             const struct in6_addr *mask,
                                             struct list_head *head);
void netlbl_af6list_remove_entry(struct netlbl_af6list *entry);
struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
                                             struct list_head *head);
struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
                                                   const struct in6_addr *mask,
                                                   struct list_head *head);

#ifdef CONFIG_AUDIT
void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
                               int src,
                               const char *dev,
                               const struct in6_addr *addr,
                               const struct in6_addr *mask);
#else
/* Without CONFIG_AUDIT the audit helper is an empty stub. */
static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
                                             int src,
                                             const char *dev,
                                             const struct in6_addr *addr,
                                             const struct in6_addr *mask)
{
}
#endif
#endif /* IPV6 */

#endif



















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#ifndef __LINUX_NEXTHOP_H
#define __LINUX_NEXTHOP_H

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/route.h>
#include <linux/types.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/netlink.h>

/* Mask of RTNH_F_* flags treated as valid user-supplied nexthop flags;
 * currently only RTNH_F_ONLINK.  NOTE(review): the "user" meaning is taken
 * from the macro name - confirm against the code that applies this mask.
 */
#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK

struct nexthop;

/* Parameters describing one nexthop configuration request.
 *
 * NOTE(review): the notes below are derived from field names and types
 * only; confirm them against the code that fills this structure in.
 */
struct nh_config {
        u32                nh_id;

        u8                nh_family;
        u8                nh_protocol;
        u8                nh_blackhole;
        u8                nh_fdb;
        u32                nh_flags;

        int                nh_ifindex;
        struct net_device *dev;

        /* one address, stored either as IPv4 or as IPv6 (shared storage) */
        union {
                __be32                ipv4;
                struct in6_addr        ipv6;
        } gw;

        /* netlink attribute and type value for a group - TODO confirm */
        struct nlattr        *nh_grp;
        u16                nh_grp_type;

        /* netlink attribute and type value for encapsulation - TODO confirm */
        struct nlattr        *nh_encap;
        u16                nh_encap_type;

        u32                nlflags;
        struct nl_info        nlinfo;
};

/* Data for a nexthop that is not a group; reached through
 * struct nexthop::nh_info when nexthop::is_group is false.
 */
struct nh_info {
        struct hlist_node        dev_hash;    /* entry on netns devhash */
        struct nexthop                *nh_parent;

        u8                        family;      /* compared against AF_INET6 by nexthop_fib6_nh() */
        bool                        reject_nh;   /* value reported by nexthop_is_blackhole() */
        bool                        fdb_nh;      /* value reported by nexthop_is_fdb() */

        /* The three views share storage.  nexthop_fib_nhc() asserts at build
         * time that nh_common sits at offset 0 of fib_nh and fib6_nh.
         */
        union {
                struct fib_nh_common        fib_nhc;
                struct fib_nh                fib_nh;
                struct fib6_nh                fib6_nh;
        };
};

/* One slot of a nexthop group (element type of nh_group::nh_entries). */
struct nh_grp_entry {
        struct nexthop        *nh;         /* member nexthop held in this slot */
        u8                weight;      /* passed to fib_add_nexthop() by nexthop_mpath_fill_node() */
        atomic_t        upper_bound; /* NOTE(review): not used in this header - confirm meaning */

        struct list_head nh_list;
        struct nexthop        *nh_parent;  /* nexthop of group with this entry */
};

/* Group data for a nexthop; reached through struct nexthop::nh_grp when
 * nexthop::is_group is true.
 */
struct nh_group {
        struct nh_group                *spare; /* spare group for removals */
        u16                        num_nh;  /* loop bound used when walking nh_entries[] */
        bool                        mpath;   /* value reported by nexthop_is_multipath() */
        bool                        fdb_nh;  /* value reported by nexthop_is_fdb() for groups */
        bool                        has_v4;  /* value reported by nexthop_has_v4() */
        struct nh_grp_entry        nh_entries[];
};

/* A nexthop object: either a single nexthop or a group of them, as
 * selected by @is_group.
 */
struct nexthop {
        struct rb_node                rb_node;    /* entry on netns rbtree */
        struct list_head        fi_list;    /* v4 entries using nh */
        struct list_head        f6i_list;   /* v6 entries using nh */
        struct list_head        fdb_list;   /* fdb entries using this nh */
        struct list_head        grp_list;   /* nh group entries using this nh */
        struct net                *net;

        u32                        id;

        u8                        protocol;   /* app managing this nh */
        u8                        nh_flags;
        bool                        is_group;   /* true: use nh_grp; false: use nh_info */

        refcount_t                refcnt;     /* see nexthop_get() / nexthop_put() */
        struct rcu_head                rcu;        /* handed to call_rcu() by nexthop_put() */

        /* RCU-protected payload; which member is valid depends on is_group */
        union {
                struct nh_info        __rcu *nh_info;
                struct nh_group __rcu *nh_grp;
        };
};

/* Nexthop event codes.  NOTE(review): presumably delivered to notifier
 * blocks added with register_nexthop_notifier() - confirm in the
 * implementation.
 */
enum nexthop_event_type {
        NEXTHOP_EVENT_DEL,
        NEXTHOP_EVENT_REPLACE,
};

int register_nexthop_notifier(struct net *net, struct notifier_block *nb);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);

/* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
void nexthop_free_rcu(struct rcu_head *head);

/* Try to take a reference on @nh; the result of refcount_inc_not_zero()
 * on its refcnt is returned unchanged.
 */
static inline bool nexthop_get(struct nexthop *nh)
{
        bool taken = refcount_inc_not_zero(&nh->refcnt);

        return taken;
}

/* Drop a reference on @nh.  When the count reaches zero the nexthop is
 * queued for nexthop_free_rcu() via call_rcu().
 */
static inline void nexthop_put(struct nexthop *nh)
{
        if (!refcount_dec_and_test(&nh->refcnt))
                return;

        call_rcu(&nh->rcu, nexthop_free_rcu);
}

/* Two nexthops compare equal only when they are the same object. */
static inline bool nexthop_cmp(const struct nexthop *nh1,
                               const struct nexthop *nh2)
{
        if (nh1 != nh2)
                return false;

        return true;
}

/* Report the fdb_nh flag of @nh, read from the group data for a group
 * nexthop and from the nh_info otherwise.
 */
static inline bool nexthop_is_fdb(const struct nexthop *nh)
{
        const struct nh_group *grp;
        const struct nh_info *info;

        if (!nh->is_group) {
                info = rcu_dereference_rtnl(nh->nh_info);
                return info->fdb_nh;
        }

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->fdb_nh;
}

/* Report the has_v4 flag of a group nexthop; always false for a
 * non-group nexthop.
 */
static inline bool nexthop_has_v4(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->has_v4;
}

/* Report the mpath flag of a group nexthop; always false for a
 * non-group nexthop.
 */
static inline bool nexthop_is_multipath(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->mpath;
}

struct nexthop *nexthop_select_path(struct nexthop *nh, int hash);

/* Number of paths behind @nh: the group's num_nh for a multipath group,
 * otherwise 1.
 */
static inline unsigned int nexthop_num_path(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return 1;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        if (!grp->mpath)
                return 1;

        return grp->num_nh;
}

/* Return the nexthop stored in slot @nhsel of group @nhg, or NULL when
 * @nhsel does not name a valid slot.
 */
static inline
struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel)
{
        /* The for_nexthops macros in fib_semantics.c grab a pointer to
         * the nexthop before checking nhsel, so an out-of-range index must
         * be tolerated here.  Negative values are rejected as well: they
         * would otherwise pass the upper-bound test and index memory in
         * front of nh_entries[].
         */
        if (nhsel < 0 || nhsel >= nhg->num_nh)
                return NULL;

        return nhg->nh_entries[nhsel].nh;
}

/* Add every member of the group behind @nh to @skb through
 * fib_add_nexthop(), passing each entry's weight.
 *
 * The group pointer is read with rtnl_dereference().
 * Returns 0 on success or -EMSGSIZE as soon as one member cannot be added.
 */
static inline
int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
                            u8 rt_family)
{
        struct nh_group *grp = rtnl_dereference(nh->nh_grp);
        int idx = 0;

        while (idx < grp->num_nh) {
                struct nh_grp_entry *entry = &grp->nh_entries[idx];
                struct nh_info *info = rcu_dereference_rtnl(entry->nh->nh_info);
                int weight = entry->weight;
                int rc;

                rc = fib_add_nexthop(skb, &info->fib_nhc, weight,
                                     rt_family, 0);
                if (rc < 0)
                        return -EMSGSIZE;

                idx++;
        }

        return 0;
}

/* called with rcu lock
 *
 * Report the reject_nh flag of @nh.  A group with more than one entry is
 * never a blackhole; otherwise the group's first entry is examined.
 */
static inline bool nexthop_is_blackhole(const struct nexthop *nh)
{
        const struct nexthop *leaf = nh;
        const struct nh_info *info;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                if (grp->num_nh > 1)
                        return false;

                leaf = grp->nh_entries[0].nh;
        }

        info = rcu_dereference_rtnl(leaf->nh_info);
        return info->reject_nh;
}

/* Pick a path of res->fi->nh with nexthop_select_path() using @hash and
 * store that path's fib_nh_common in res->nhc.
 */
static inline void nexthop_path_fib_result(struct fib_result *res, int hash)
{
        struct nexthop *path = nexthop_select_path(res->fi->nh, hash);
        struct nh_info *info = rcu_dereference(path->nh_info);

        res->nhc = &info->fib_nhc;
}

/* called with rcu read lock or rtnl held
 *
 * Return the fib_nh_common of @nh.  For a multipath group, slot @nhsel is
 * used and NULL is returned when that slot cannot be selected.
 */
static inline
struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel)
{
        struct nexthop *leaf = nh;
        struct nh_info *info;

        /* fib_nhc aliases the start of fib_nh/fib6_nh inside nh_info */
        BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0);
        BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0);

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                if (grp->mpath)
                        leaf = nexthop_mpath_select(grp, nhsel);
                if (!leaf)
                        return NULL;
        }

        info = rcu_dereference_rtnl(leaf->nh_info);
        return &info->fib_nhc;
}

/* called from fib_table_lookup with rcu_lock
 *
 * Return the first fib_nh_common of @nh accepted by fib_lookup_good_nhc()
 * and store its index in *@nhsel (0 for a non-group nexthop).  Returns
 * NULL, leaving *@nhsel untouched, when nothing is accepted.
 */
static inline
struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh,
                                             int fib_flags,
                                             const struct flowi4 *flp,
                                             int *nhsel)
{
        struct nh_group *grp;
        struct nh_info *info;
        int idx;

        if (!nh->is_group) {
                info = rcu_dereference(nh->nh_info);
                if (!fib_lookup_good_nhc(&info->fib_nhc, fib_flags, flp))
                        return NULL;

                *nhsel = 0;
                return &info->fib_nhc;
        }

        grp = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < grp->num_nh; idx++) {
                struct nexthop *member = grp->nh_entries[idx].nh;

                info = rcu_dereference(member->nh_info);
                if (!fib_lookup_good_nhc(&info->fib_nhc, fib_flags, flp))
                        continue;

                *nhsel = idx;
                return &info->fib_nhc;
        }

        return NULL;
}

/* True when @nh, or any member of the group behind @nh, has a
 * fib_nh_common for which nhc_l3mdev_matches_dev() accepts @dev.
 */
static inline bool nexthop_uses_dev(const struct nexthop *nh,
                                    const struct net_device *dev)
{
        struct nh_group *grp;
        struct nh_info *info;
        int idx;

        if (!nh->is_group) {
                info = rcu_dereference(nh->nh_info);
                return nhc_l3mdev_matches_dev(&info->fib_nhc, dev) ? true : false;
        }

        grp = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < grp->num_nh; idx++) {
                struct nexthop *member = grp->nh_entries[idx].nh;

                info = rcu_dereference(member->nh_info);
                if (nhc_l3mdev_matches_dev(&info->fib_nhc, dev))
                        return true;
        }

        return false;
}

/* Number of paths of @fi: taken from the nexthop object when fi->nh is
 * set, otherwise from fi->fib_nhs.
 */
static inline unsigned int fib_info_num_path(const struct fib_info *fi)
{
        if (likely(!fi->nh))
                return fi->fib_nhs;

        return nexthop_num_path(fi->nh);
}

int fib_check_nexthop(struct nexthop *nh, u8 scope,
                      struct netlink_ext_ack *extack);

/* Return the fib_nh_common for path @nhsel of @fi: resolved through the
 * nexthop object when fi->nh is set, otherwise taken from fi->fib_nh[].
 */
static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
{
        if (likely(!fi->nh))
                return &fi->fib_nh[nhsel].nh_common;

        return nexthop_fib_nhc(fi->nh, nhsel);
}

/* only used when fib_nh is built into fib_info
 *
 * Return entry @nhsel of fi->fib_nh[]; warns if @fi uses a nexthop object.
 */
static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
{
        struct fib_nh *entry;

        WARN_ON(fi->nh);

        entry = &fi->fib_nh[nhsel];
        return entry;
}

/*
 * IPv6 variants
 */
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);

/* Caller should either hold rcu_read_lock(), or RTNL.
 *
 * Return the fib6_nh of @nh (slot 0 is used for a group).  Returns NULL
 * when slot 0 cannot be selected or the nexthop's family is not AF_INET6.
 */
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
        struct nexthop *leaf = nh;
        struct nh_info *info;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                leaf = nexthop_mpath_select(grp, 0);
                if (!leaf)
                        return NULL;
        }

        info = rcu_dereference_rtnl(leaf->nh_info);
        if (info->family != AF_INET6)
                return NULL;

        return &info->fib6_nh;
}

/* Variant of nexthop_fib6_nh().
 * Caller should either hold rcu_read_lock_bh(), or RTNL.
 *
 * Same result as nexthop_fib6_nh(), but the pointers are read with
 * rcu_dereference_bh_rtnl().
 */
static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
{
        struct nexthop *leaf = nh;
        struct nh_info *info;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_bh_rtnl(nh->nh_grp);

                leaf = nexthop_mpath_select(grp, 0);
                if (!leaf)
                        return NULL;
        }

        info = rcu_dereference_bh_rtnl(leaf->nh_info);
        if (info->family != AF_INET6)
                return NULL;

        return &info->fib6_nh;
}

/* Return the device of @f6i's nexthop: looked up through the nexthop
 * object when f6i->nh is set, otherwise through f6i->fib6_nh.
 */
static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
{
        struct fib6_nh *nh6;

        if (f6i->nh)
                nh6 = nexthop_fib6_nh(f6i->nh);
        else
                nh6 = f6i->fib6_nh;

        return nh6->fib_nh_dev;
}

/* Pick a path of res->f6i->nh with nexthop_select_path() using @hash and
 * fill in res->nh.  A path flagged reject_nh additionally marks the result
 * as RTN_BLACKHOLE with RTF_REJECT set.
 */
static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
{
        struct nexthop *path = nexthop_select_path(res->f6i->nh, hash);
        struct nh_info *info = rcu_dereference_rtnl(path->nh_info);

        if (!info->reject_nh) {
                res->nh = &info->fib6_nh;
                return;
        }

        res->fib6_type = RTN_BLACKHOLE;
        res->fib6_flags |= RTF_REJECT;
        res->nh = nexthop_fib6_nh(path);
}

int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg);

/* Return the family recorded in the nh_info of @nh.  The nh_info member
 * is read unconditionally, without checking nh->is_group.
 */
static inline int nexthop_get_family(struct nexthop *nh)
{
        struct nh_info *info;

        info = rcu_dereference_rtnl(nh->nh_info);
        return info->family;
}

/* Return the fib_nh_common embedded in the nh_info of @nh.  The nh_info
 * member is read unconditionally, without checking nh->is_group.
 */
static inline
struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
{
        struct nh_info *info;

        info = rcu_dereference_rtnl(nh->nh_info);
        return &info->fib_nhc;
}

/* Pick a path of @nh with nexthop_select_path() using @hash and return its
 * fib_nh_common, or NULL when no path was selected.
 */
static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
                                                            int hash)
{
        struct nexthop *path = nexthop_select_path(nh, hash);
        struct nh_info *info;

        if (unlikely(!path))
                return NULL;

        info = rcu_dereference(path->nh_info);
        return &info->fib_nhc;
}
#endif







































































    1 












    1 
    1 

    1 

    1 
    1 








    1 



    1 











    1 






    1 
    1 
    1 
    1 






    1 



    1 







    1 











    1 

























    1 








    1 



    1 




    1 


    1 


    1 



    1 

    1 













    1 


    1 
    1 
    1 
    1 












    1 
    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
// SPDX-License-Identifier: GPL-2.0-or-later
/* procfs files for key database enumeration
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/errno.h>
#include "internal.h"

static void *proc_keys_start(struct seq_file *p, loff_t *_pos);
static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos);
static void proc_keys_stop(struct seq_file *p, void *v);
static int proc_keys_show(struct seq_file *m, void *v);

/*
 * seq_file iterator callbacks for "/proc/keys"; handed to
 * proc_create_seq() in key_proc_init().
 */
static const struct seq_operations proc_keys_ops = {
        .start        = proc_keys_start,
        .next        = proc_keys_next,
        .stop        = proc_keys_stop,
        .show        = proc_keys_show,
};

static void *proc_key_users_start(struct seq_file *p, loff_t *_pos);
static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos);
static void proc_key_users_stop(struct seq_file *p, void *v);
static int proc_key_users_show(struct seq_file *m, void *v);

/*
 * seq_file iterator callbacks for "/proc/key-users"; handed to
 * proc_create_seq() in key_proc_init().
 */
static const struct seq_operations proc_key_users_ops = {
        .start        = proc_key_users_start,
        .next        = proc_key_users_next,
        .stop        = proc_key_users_stop,
        .show        = proc_key_users_show,
};

/*
 * Declare the /proc files.
 *
 * Creates "keys" and "key-users" as seq files and panics if either cannot
 * be created.  Always returns 0.
 */
static int __init key_proc_init(void)
{
        if (!proc_create_seq("keys", 0, NULL, &proc_keys_ops))
                panic("Cannot create /proc/keys\n");

        if (!proc_create_seq("key-users", 0, NULL, &proc_key_users_ops))
                panic("Cannot create /proc/key-users\n");

        return 0;
}

__initcall(key_proc_init);

/*
 * Implement "/proc/keys" to provide a list of the keys on the system that
 * grant View permission to the caller.
 */
static struct rb_node *key_serial_next(struct seq_file *p, struct rb_node *n)
{
        struct user_namespace *user_ns = seq_user_ns(p);

        n = rb_next(n);
        while (n) {
                struct key *key = rb_entry(n, struct key, serial_node);
                if (kuid_has_mapping(user_ns, key->user->uid))
                        break;
                n = rb_next(n);
        }
        return n;
}

/*
 * Find the key with the lowest serial number >= @id whose owner's UID has a
 * mapping in the seq file's user namespace.  Returns NULL if there is none.
 */
static struct key *find_ge_key(struct seq_file *p, key_serial_t id)
{
        struct user_namespace *user_ns = seq_user_ns(p);
        struct rb_node *node = key_serial_tree.rb_node;
        struct key *best = NULL;

        /* descend the tree, remembering the closest serial above @id */
        while (node) {
                struct key *key = rb_entry(node, struct key, serial_node);

                if (id == key->serial) {
                        best = key;
                        break;
                }

                if (id < key->serial) {
                        if (!best || best->serial > key->serial)
                                best = key;
                        node = node->rb_left;
                } else {
                        node = node->rb_right;
                }
        }

        /* walk forward until a key visible in this namespace turns up */
        while (best) {
                if (kuid_has_mapping(user_ns, best->user->uid))
                        return best;

                node = rb_next(&best->serial_node);
                best = node ? rb_entry(node, struct key, serial_node) : NULL;
        }

        return NULL;
}

/*
 * seq_file ->start for /proc/keys.
 *
 * Takes key_serial_lock (dropped again in proc_keys_stop()) on every path,
 * including those that return NULL.  *@_pos is read as a key serial number:
 * the first visible key at or after it is located, *@_pos is moved to that
 * key's serial and the key's tree node is returned.
 */
static void *proc_keys_start(struct seq_file *p, loff_t *_pos)
        __acquires(key_serial_lock)
{
        key_serial_t wanted = *_pos;
        struct key *found = NULL;

        spin_lock(&key_serial_lock);

        /* positions beyond INT_MAX are never searched for */
        if (*_pos <= INT_MAX)
                found = find_ge_key(p, wanted);
        if (!found)
                return NULL;

        *_pos = found->serial;
        return &found->serial_node;
}

/* Serial number of the key that embeds serial-tree node @n. */
static inline key_serial_t key_node_serial(struct rb_node *n)
{
        return rb_entry(n, struct key, serial_node)->serial;
}

/*
 * seq_file ->next for /proc/keys: move to the next visible key and set
 * *@_pos to its serial.  When no key is left, *@_pos is incremented by one
 * and NULL is returned.
 */
static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos)
{
        struct rb_node *next = key_serial_next(p, v);

        if (!next)
                (*_pos)++;
        else
                *_pos = key_node_serial(next);

        return next;
}

/*
 * seq_file ->stop for /proc/keys: release key_serial_lock, which
 * proc_keys_start() acquired.  @p and @v are unused.
 */
static void proc_keys_stop(struct seq_file *p, void *v)
        __releases(key_serial_lock)
{
        spin_unlock(&key_serial_lock);
}

/*
 * seq_file ->show for /proc/keys: print one line describing the key whose
 * serial-tree node is @v.
 *
 * The line holds the serial, a row of state/flag letters, the usage count,
 * a short expiry string, the permission mask, the owner UID/GID as seen
 * from the reader's user namespace, the key type name and, if the type
 * provides ->describe(), the type's own description.
 *
 * Keys for which key_task_permission() denies KEY_NEED_VIEW are skipped
 * without producing output.  Always returns 0.
 */
static int proc_keys_show(struct seq_file *m, void *v)
{
        struct rb_node *_p = v;
        struct key *key = rb_entry(_p, struct key, serial_node);
        unsigned long flags;
        key_ref_t key_ref, skey_ref;
        time64_t now, expiry;
        char xbuf[16];
        short state;
        u64 timo;
        int rc;

        struct keyring_search_context ctx = {
                .index_key                = key->index_key,
                .cred                        = m->file->f_cred,
                .match_data.cmp                = lookup_user_key_possessed,
                .match_data.raw_data        = key,
                .match_data.lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
                .flags                        = (KEYRING_SEARCH_NO_STATE_CHECK |
                                           KEYRING_SEARCH_RECURSE),
        };

        key_ref = make_key_ref(key, 0);

        /* determine if the key is possessed by this process (a test we can
         * skip if the key does not indicate the possessor can view it)
         */
        if (key->perm & KEY_POS_VIEW) {
                rcu_read_lock();
                skey_ref = search_cred_keyrings_rcu(&ctx);
                rcu_read_unlock();
                if (!IS_ERR(skey_ref)) {
                        key_ref_put(skey_ref);
                        key_ref = make_key_ref(key, 1);
                }
        }

        /* check whether the current task is allowed to view the key */
        rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW);
        if (rc < 0)
                return 0;

        now = ktime_get_real_seconds();

        rcu_read_lock();

        /* come up with a suitable timeout value; every write to xbuf is
         * bounded by its size so a formatting change cannot overrun it
         */
        expiry = READ_ONCE(key->expiry);
        if (expiry == TIME64_MAX) {
                memcpy(xbuf, "perm", 5);
        } else if (now >= expiry) {
                memcpy(xbuf, "expd", 5);
        } else {
                timo = expiry - now;

                if (timo < 60)
                        snprintf(xbuf, sizeof(xbuf), "%llus", timo);
                else if (timo < 60*60)
                        snprintf(xbuf, sizeof(xbuf), "%llum",
                                 div_u64(timo, 60));
                else if (timo < 60*60*24)
                        snprintf(xbuf, sizeof(xbuf), "%lluh",
                                 div_u64(timo, 60 * 60));
                else if (timo < 60*60*24*7)
                        snprintf(xbuf, sizeof(xbuf), "%llud",
                                 div_u64(timo, 60 * 60 * 24));
                else
                        snprintf(xbuf, sizeof(xbuf), "%lluw",
                                 div_u64(timo, 60 * 60 * 24 * 7));
        }

        state = key_read_state(key);

        /* flags is an unsigned long, so test the bit with a matching 1UL */
#define showflag(FLAGS, LETTER, FLAG) \
        (((FLAGS) & (1UL << (FLAG))) ? (LETTER) : '-')

        flags = READ_ONCE(key->flags);
        seq_printf(m, "%08x %c%c%c%c%c%c%c %5d %4s %08x %5d %5d %-9.9s ",
                   key->serial,
                   state != KEY_IS_UNINSTANTIATED ? 'I' : '-',
                   showflag(flags, 'R', KEY_FLAG_REVOKED),
                   showflag(flags, 'D', KEY_FLAG_DEAD),
                   showflag(flags, 'Q', KEY_FLAG_IN_QUOTA),
                   showflag(flags, 'U', KEY_FLAG_USER_CONSTRUCT),
                   state < 0 ? 'N' : '-',
                   showflag(flags, 'i', KEY_FLAG_INVALIDATED),
                   refcount_read(&key->usage),
                   xbuf,
                   key->perm,
                   from_kuid_munged(seq_user_ns(m), key->uid),
                   from_kgid_munged(seq_user_ns(m), key->gid),
                   key->type->name);

#undef showflag

        if (key->type->describe)
                key->type->describe(key, m);
        seq_putc(m, '\n');

        rcu_read_unlock();
        return 0;
}

static struct rb_node *__key_user_next(struct user_namespace *user_ns, struct rb_node *n)
{
        while (n) {
                struct key_user *user = rb_entry(n, struct key_user, node);
                if (kuid_has_mapping(user_ns, user->uid))
                        break;
                n = rb_next(n);
        }
        return n;
}

/* Next key_user node after @n that is visible in @user_ns, or NULL. */
static struct rb_node *key_user_next(struct user_namespace *user_ns, struct rb_node *n)
{
        struct rb_node *succ = rb_next(n);

        return __key_user_next(user_ns, succ);
}

/* First key_user node of tree @r that is visible in @user_ns, or NULL. */
static struct rb_node *key_user_first(struct user_namespace *user_ns, struct rb_root *r)
{
        return __key_user_next(user_ns, rb_first(r));
}

/*
 * seq_file ->start for /proc/key-users.
 *
 * Takes key_user_lock (dropped again in proc_key_users_stop()) and returns
 * the visible key_user node at index *@_pos, counting from the first
 * visible node, or NULL when there are not that many.
 */
static void *proc_key_users_start(struct seq_file *p, loff_t *_pos)
        __acquires(key_user_lock)
{
        struct rb_node *node;
        loff_t skip;

        spin_lock(&key_user_lock);

        node = key_user_first(seq_user_ns(p), &key_user_tree);
        for (skip = *_pos; skip > 0 && node; skip--)
                node = key_user_next(seq_user_ns(p), node);

        return node;
}

/*
 * seq_file ->next for /proc/key-users: bump *@_pos and return the next
 * visible key_user node after @v, or NULL at the end.
 */
static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos)
{
        struct rb_node *cur = v;

        (*_pos)++;
        return key_user_next(seq_user_ns(p), cur);
}

/*
 * seq_file ->stop for /proc/key-users: release key_user_lock, which
 * proc_key_users_start() acquired.  @p and @v are unused.
 */
static void proc_key_users_stop(struct seq_file *p, void *v)
        __releases(key_user_lock)
{
        spin_unlock(&key_user_lock);
}

/*
 * seq_file ->show for /proc/key-users: print one line for the key_user
 * whose tree node is @v.
 *
 * The line holds the UID as seen from the reader's user namespace, the
 * usage count, nkeys/nikeys, qnkeys/<max keys> and qnbytes/<max bytes>.
 * The maxima are the root quotas when the user is GLOBAL_ROOT_UID and the
 * ordinary quotas otherwise.  Always returns 0.
 */
static int proc_key_users_show(struct seq_file *m, void *v)
{
        struct rb_node *_p = v;
        struct key_user *user = rb_entry(_p, struct key_user, node);
        bool is_root = uid_eq(user->uid, GLOBAL_ROOT_UID);
        unsigned maxkeys = is_root ?
                key_quota_root_maxkeys : key_quota_maxkeys;
        unsigned maxbytes = is_root ?
                key_quota_root_maxbytes : key_quota_maxbytes;

        /* maxkeys and maxbytes are unsigned, so they are printed with %u */
        seq_printf(m, "%5u: %5d %d/%d %d/%u %d/%u\n",
                   from_kuid_munged(seq_user_ns(m), user->uid),
                   refcount_read(&user->usage),
                   atomic_read(&user->nkeys),
                   atomic_read(&user->nikeys),
                   user->qnkeys,
                   maxkeys,
                   user->qnbytes,
                   maxbytes);

        return 0;
}












































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 



























































































































































































































































































































































































    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
/* SPDX-License-Identifier: GPL-2.0-or-later */

/*
 *  Copyright 2003-2004 Red Hat, Inc.  All rights reserved.
 *  Copyright 2003-2004 Jeff Garzik
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/driver-api/libata.rst
 *
 *  Hardware documentation available from http://www.t13.org/
 */

#ifndef __LINUX_ATA_H__
#define __LINUX_ATA_H__

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/byteorder.h>

/* defines only for the constants which don't work well as enums
 *
 * ATA_DMA_BOUNDARY is 0xffff (unsigned long) and ATA_DMA_MASK is the low
 * 32 bits set (unsigned long long).  NOTE(review): presumably the DMA
 * segment boundary and address mask used by ATA drivers - confirm at the
 * points of use.
 */
#define ATA_DMA_BOUNDARY        0xffffUL
#define ATA_DMA_MASK                0xffffffffULL

/*
 * Anonymous enum holding the integer constants shared by the ATA code:
 * global limits, IDENTIFY data word indices, transfer mode masks, DMA
 * and register definitions, command opcodes and sub-commands, log page
 * numbers and offsets, SET FEATURES values, port multiplier registers,
 * cable types, SCR indices and SError bits.
 */
enum {
        /* various global constants */
        ATA_MAX_DEVICES                = 2,        /* per bus/port */
        ATA_MAX_PRD                = 256,        /* we could make these 256/256 */
        ATA_SECT_SIZE                = 512,
        ATA_MAX_SECTORS_128        = 128,
        ATA_MAX_SECTORS                = 256,
        ATA_MAX_SECTORS_1024    = 1024,
        ATA_MAX_SECTORS_LBA48        = 65535,/* avoid count to be 0000h */
        ATA_MAX_SECTORS_TAPE        = 65535,
        ATA_MAX_TRIM_RNUM        = 64,        /* 512-byte payload / (6-byte LBA + 2-byte range per entry) */

        /*
         * IDENTIFY data: ATA_ID_WORDS is the word count, the other
         * ATA_ID_* values down to ATA_ID_ROT_SPEED are indices into the
         * array of 16-bit words (used as id[ATA_ID_xxx] in this file).
         */
        ATA_ID_WORDS                = 256,
        ATA_ID_CONFIG                = 0,
        ATA_ID_CYLS                = 1,
        ATA_ID_HEADS                = 3,
        ATA_ID_SECTORS                = 6,
        ATA_ID_SERNO                = 10,
        ATA_ID_BUF_SIZE                = 21,
        ATA_ID_FW_REV                = 23,
        ATA_ID_PROD                = 27,
        ATA_ID_MAX_MULTSECT        = 47,
        ATA_ID_DWORD_IO                = 48,        /* before ATA-8 */
        ATA_ID_TRUSTED                = 48,        /* ATA-8 and later */
        ATA_ID_CAPABILITY        = 49,
        ATA_ID_OLD_PIO_MODES        = 51,
        ATA_ID_OLD_DMA_MODES        = 52,
        ATA_ID_FIELD_VALID        = 53,
        ATA_ID_CUR_CYLS                = 54,
        ATA_ID_CUR_HEADS        = 55,
        ATA_ID_CUR_SECTORS        = 56,
        ATA_ID_MULTSECT                = 59,
        ATA_ID_LBA_CAPACITY        = 60,
        ATA_ID_SWDMA_MODES        = 62,
        ATA_ID_MWDMA_MODES        = 63,
        ATA_ID_PIO_MODES        = 64,
        ATA_ID_EIDE_DMA_MIN        = 65,
        ATA_ID_EIDE_DMA_TIME        = 66,
        ATA_ID_EIDE_PIO                = 67,
        ATA_ID_EIDE_PIO_IORDY        = 68,
        ATA_ID_ADDITIONAL_SUPP        = 69,
        ATA_ID_QUEUE_DEPTH        = 75,
        ATA_ID_SATA_CAPABILITY        = 76,
        ATA_ID_SATA_CAPABILITY_2        = 77,
        ATA_ID_FEATURE_SUPP        = 78,
        ATA_ID_MAJOR_VER        = 80,
        ATA_ID_COMMAND_SET_1        = 82,
        ATA_ID_COMMAND_SET_2        = 83,
        ATA_ID_CFSSE                = 84,
        ATA_ID_CFS_ENABLE_1        = 85,
        ATA_ID_CFS_ENABLE_2        = 86,
        ATA_ID_CSF_DEFAULT        = 87,
        ATA_ID_UDMA_MODES        = 88,
        ATA_ID_HW_CONFIG        = 93,
        ATA_ID_SPG                = 98,
        ATA_ID_LBA_CAPACITY_2        = 100,
        ATA_ID_SECTOR_SIZE        = 106,
        ATA_ID_WWN                = 108,
        ATA_ID_LOGICAL_SECTOR_SIZE        = 117,        /* and 118 */
        ATA_ID_COMMAND_SET_3        = 119,
        ATA_ID_COMMAND_SET_4        = 120,
        ATA_ID_LAST_LUN                = 126,
        ATA_ID_DLF                = 128,
        ATA_ID_CSFO                = 129,
        ATA_ID_CFA_POWER        = 160,
        ATA_ID_CFA_KEY_MGMT        = 162,
        ATA_ID_CFA_MODES        = 163,
        ATA_ID_DATA_SET_MGMT        = 169,
        ATA_ID_SCT_CMD_XPORT        = 206,
        ATA_ID_ROT_SPEED        = 217,
        /* a bit mask, unlike the word indices above */
        ATA_ID_PIO4                = (1 << 1),

        /*
         * field lengths in bytes (e.g. ATA_ID_FW_REV spans words 23-26,
         * i.e. 8 bytes, before ATA_ID_PROD starts at word 27)
         */
        ATA_ID_SERNO_LEN        = 20,
        ATA_ID_FW_REV_LEN        = 8,
        ATA_ID_PROD_LEN                = 40,
        ATA_ID_WWN_LEN                = 8,

        ATA_PCI_CTL_OFS                = 2,

        /*
         * transfer mode masks: each ATA_<type><n> value contains the bit
         * for mode n and for every lower mode; the *_ONLY values contain
         * just the listed bits
         */
        ATA_PIO0                = (1 << 0),
        ATA_PIO1                = ATA_PIO0 | (1 << 1),
        ATA_PIO2                = ATA_PIO1 | (1 << 2),
        ATA_PIO3                = ATA_PIO2 | (1 << 3),
        ATA_PIO4                = ATA_PIO3 | (1 << 4),
        ATA_PIO5                = ATA_PIO4 | (1 << 5),
        ATA_PIO6                = ATA_PIO5 | (1 << 6),

        ATA_PIO4_ONLY                = (1 << 4),

        ATA_SWDMA0                = (1 << 0),
        ATA_SWDMA1                = ATA_SWDMA0 | (1 << 1),
        ATA_SWDMA2                = ATA_SWDMA1 | (1 << 2),

        ATA_SWDMA2_ONLY                = (1 << 2),

        ATA_MWDMA0                = (1 << 0),
        ATA_MWDMA1                = ATA_MWDMA0 | (1 << 1),
        ATA_MWDMA2                = ATA_MWDMA1 | (1 << 2),
        ATA_MWDMA3                = ATA_MWDMA2 | (1 << 3),
        ATA_MWDMA4                = ATA_MWDMA3 | (1 << 4),

        ATA_MWDMA12_ONLY        = (1 << 1) | (1 << 2),
        ATA_MWDMA2_ONLY                = (1 << 2),

        ATA_UDMA0                = (1 << 0),
        ATA_UDMA1                = ATA_UDMA0 | (1 << 1),
        ATA_UDMA2                = ATA_UDMA1 | (1 << 2),
        ATA_UDMA3                = ATA_UDMA2 | (1 << 3),
        ATA_UDMA4                = ATA_UDMA3 | (1 << 4),
        ATA_UDMA5                = ATA_UDMA4 | (1 << 5),
        ATA_UDMA6                = ATA_UDMA5 | (1 << 6),
        ATA_UDMA7                = ATA_UDMA6 | (1 << 7),
        /* ATA_UDMA7 is just for completeness... doesn't exist (yet?).  */

        ATA_UDMA24_ONLY                = (1 << 2) | (1 << 4),

        ATA_UDMA_MASK_40C        = ATA_UDMA2,        /* udma0-2 */

        /* DMA-related */
        ATA_PRD_SZ                = 8,
        ATA_PRD_TBL_SZ                = (ATA_MAX_PRD * ATA_PRD_SZ),
        ATA_PRD_EOT                = (1 << 31),        /* end-of-table flag */

        ATA_DMA_TABLE_OFS        = 4,
        ATA_DMA_STATUS                = 2,
        ATA_DMA_CMD                = 0,
        ATA_DMA_WR                = (1 << 3),
        ATA_DMA_START                = (1 << 0),
        ATA_DMA_INTR                = (1 << 2),
        ATA_DMA_ERR                = (1 << 1),
        ATA_DMA_ACTIVE                = (1 << 0),

        /* bits in ATA command block registers */
        ATA_HOB                        = (1 << 7),        /* LBA48 selector */
        ATA_NIEN                = (1 << 1),        /* disable-irq flag */
        ATA_LBA                        = (1 << 6),        /* LBA28 selector */
        ATA_DEV1                = (1 << 4),        /* Select Device 1 (slave) */
        ATA_DEVICE_OBS                = (1 << 7) | (1 << 5), /* obs bits in dev reg */
        ATA_DEVCTL_OBS                = (1 << 3),        /* obsolete bit in devctl reg */
        ATA_BUSY                = (1 << 7),        /* BSY status bit */
        ATA_DRDY                = (1 << 6),        /* device ready */
        ATA_DF                        = (1 << 5),        /* device fault */
        ATA_DSC                        = (1 << 4),        /* drive seek complete */
        ATA_DRQ                        = (1 << 3),        /* data request i/o */
        ATA_CORR                = (1 << 2),        /* corrected data error */
        ATA_SENSE                = (1 << 1),        /* sense code available */
        ATA_ERR                        = (1 << 0),        /* have an error */
        ATA_SRST                = (1 << 2),        /* software reset */
        ATA_ICRC                = (1 << 7),        /* interface CRC error */
        ATA_BBK                        = ATA_ICRC,        /* pre-EIDE: block marked bad */
        ATA_UNC                        = (1 << 6),        /* uncorrectable media error */
        ATA_MC                        = (1 << 5),        /* media changed */
        ATA_IDNF                = (1 << 4),        /* ID not found */
        ATA_MCR                        = (1 << 3),        /* media change requested */
        ATA_ABORTED                = (1 << 2),        /* command aborted */
        ATA_TRK0NF                = (1 << 1),        /* track 0 not found */
        ATA_AMNF                = (1 << 0),        /* address mark not found */
        ATAPI_LFS                = 0xF0,                /* last failed sense */
        ATAPI_EOM                = ATA_TRK0NF,        /* end of media */
        ATAPI_ILI                = ATA_AMNF,        /* illegal length indication */
        ATAPI_IO                = (1 << 1),
        ATAPI_COD                = (1 << 0),

        /* ATA command block registers */
        ATA_REG_DATA                = 0x00,
        ATA_REG_ERR                = 0x01,
        ATA_REG_NSECT                = 0x02,
        ATA_REG_LBAL                = 0x03,
        ATA_REG_LBAM                = 0x04,
        ATA_REG_LBAH                = 0x05,
        ATA_REG_DEVICE                = 0x06,
        ATA_REG_STATUS                = 0x07,

        ATA_REG_FEATURE                = ATA_REG_ERR, /* and their aliases */
        ATA_REG_CMD                = ATA_REG_STATUS,
        ATA_REG_BYTEL                = ATA_REG_LBAM,
        ATA_REG_BYTEH                = ATA_REG_LBAH,
        ATA_REG_DEVSEL                = ATA_REG_DEVICE,
        ATA_REG_IRQ                = ATA_REG_NSECT,

        /* ATA device commands */
        ATA_CMD_DEV_RESET        = 0x08, /* ATAPI device reset */
        ATA_CMD_CHK_POWER        = 0xE5, /* check power mode */
        ATA_CMD_STANDBY                = 0xE2, /* place in standby power mode */
        ATA_CMD_IDLE                = 0xE3, /* place in idle power mode */
        ATA_CMD_EDD                = 0x90,        /* execute device diagnostic */
        ATA_CMD_DOWNLOAD_MICRO  = 0x92,
        ATA_CMD_DOWNLOAD_MICRO_DMA = 0x93,
        ATA_CMD_NOP                = 0x00,
        ATA_CMD_FLUSH                = 0xE7,
        ATA_CMD_FLUSH_EXT        = 0xEA,
        ATA_CMD_ID_ATA                = 0xEC,
        ATA_CMD_ID_ATAPI        = 0xA1,
        ATA_CMD_SERVICE                = 0xA2,
        ATA_CMD_READ                = 0xC8,
        ATA_CMD_READ_EXT        = 0x25,
        ATA_CMD_READ_QUEUED        = 0x26,
        ATA_CMD_READ_STREAM_EXT        = 0x2B,
        ATA_CMD_READ_STREAM_DMA_EXT = 0x2A,
        ATA_CMD_WRITE                = 0xCA,
        ATA_CMD_WRITE_EXT        = 0x35,
        ATA_CMD_WRITE_QUEUED        = 0x36,
        ATA_CMD_WRITE_STREAM_EXT = 0x3B,
        ATA_CMD_WRITE_STREAM_DMA_EXT = 0x3A,
        ATA_CMD_WRITE_FUA_EXT        = 0x3D,
        ATA_CMD_WRITE_QUEUED_FUA_EXT = 0x3E,
        ATA_CMD_FPDMA_READ        = 0x60,
        ATA_CMD_FPDMA_WRITE        = 0x61,
        ATA_CMD_NCQ_NON_DATA        = 0x63,
        ATA_CMD_FPDMA_SEND        = 0x64,
        ATA_CMD_FPDMA_RECV        = 0x65,
        ATA_CMD_PIO_READ        = 0x20,
        ATA_CMD_PIO_READ_EXT        = 0x24,
        ATA_CMD_PIO_WRITE        = 0x30,
        ATA_CMD_PIO_WRITE_EXT        = 0x34,
        ATA_CMD_READ_MULTI        = 0xC4,
        ATA_CMD_READ_MULTI_EXT        = 0x29,
        ATA_CMD_WRITE_MULTI        = 0xC5,
        ATA_CMD_WRITE_MULTI_EXT        = 0x39,
        ATA_CMD_WRITE_MULTI_FUA_EXT = 0xCE,
        ATA_CMD_SET_FEATURES        = 0xEF,
        ATA_CMD_SET_MULTI        = 0xC6,
        ATA_CMD_PACKET                = 0xA0,
        ATA_CMD_VERIFY                = 0x40,
        ATA_CMD_VERIFY_EXT        = 0x42,
        ATA_CMD_WRITE_UNCORR_EXT = 0x45,
        ATA_CMD_STANDBYNOW1        = 0xE0,
        ATA_CMD_IDLEIMMEDIATE        = 0xE1,
        ATA_CMD_SLEEP                = 0xE6,
        ATA_CMD_INIT_DEV_PARAMS        = 0x91,
        ATA_CMD_READ_NATIVE_MAX        = 0xF8,
        ATA_CMD_READ_NATIVE_MAX_EXT = 0x27,
        ATA_CMD_SET_MAX                = 0xF9,
        ATA_CMD_SET_MAX_EXT        = 0x37,
        ATA_CMD_READ_LOG_EXT        = 0x2F,
        ATA_CMD_WRITE_LOG_EXT        = 0x3F,
        ATA_CMD_READ_LOG_DMA_EXT = 0x47,
        ATA_CMD_WRITE_LOG_DMA_EXT = 0x57,
        ATA_CMD_TRUSTED_NONDATA        = 0x5B,
        ATA_CMD_TRUSTED_RCV        = 0x5C,
        ATA_CMD_TRUSTED_RCV_DMA = 0x5D,
        ATA_CMD_TRUSTED_SND        = 0x5E,
        ATA_CMD_TRUSTED_SND_DMA = 0x5F,
        ATA_CMD_PMP_READ        = 0xE4,
        ATA_CMD_PMP_READ_DMA        = 0xE9,
        ATA_CMD_PMP_WRITE        = 0xE8,
        ATA_CMD_PMP_WRITE_DMA        = 0xEB,
        ATA_CMD_CONF_OVERLAY        = 0xB1,
        ATA_CMD_SEC_SET_PASS        = 0xF1,
        ATA_CMD_SEC_UNLOCK        = 0xF2,
        ATA_CMD_SEC_ERASE_PREP        = 0xF3,
        ATA_CMD_SEC_ERASE_UNIT        = 0xF4,
        ATA_CMD_SEC_FREEZE_LOCK        = 0xF5,
        ATA_CMD_SEC_DISABLE_PASS = 0xF6,
        ATA_CMD_CONFIG_STREAM        = 0x51,
        ATA_CMD_SMART                = 0xB0,
        ATA_CMD_MEDIA_LOCK        = 0xDE,
        ATA_CMD_MEDIA_UNLOCK        = 0xDF,
        ATA_CMD_DSM                = 0x06,
        ATA_CMD_CHK_MED_CRD_TYP = 0xD1,
        ATA_CMD_CFA_REQ_EXT_ERR = 0x03,
        ATA_CMD_CFA_WRITE_NE        = 0x38,
        ATA_CMD_CFA_TRANS_SECT        = 0x87,
        ATA_CMD_CFA_ERASE        = 0xC0,
        ATA_CMD_CFA_WRITE_MULT_NE = 0xCD,
        ATA_CMD_REQ_SENSE_DATA  = 0x0B,
        ATA_CMD_SANITIZE_DEVICE = 0xB4,
        ATA_CMD_ZAC_MGMT_IN        = 0x4A,
        ATA_CMD_ZAC_MGMT_OUT        = 0x9F,

        /* marked obsolete in the ATA/ATAPI-7 spec */
        ATA_CMD_RESTORE                = 0x10,

        /* Subcmds for ATA_CMD_FPDMA_RECV */
        ATA_SUBCMD_FPDMA_RECV_RD_LOG_DMA_EXT = 0x01,
        ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN    = 0x02,

        /* Subcmds for ATA_CMD_FPDMA_SEND */
        ATA_SUBCMD_FPDMA_SEND_DSM            = 0x00,
        ATA_SUBCMD_FPDMA_SEND_WR_LOG_DMA_EXT = 0x02,

        /* Subcmds for ATA_CMD_NCQ_NON_DATA */
        ATA_SUBCMD_NCQ_NON_DATA_ABORT_QUEUE  = 0x00,
        ATA_SUBCMD_NCQ_NON_DATA_SET_FEATURES = 0x05,
        ATA_SUBCMD_NCQ_NON_DATA_ZERO_EXT     = 0x06,
        ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT = 0x07,

        /* Subcmds for ATA_CMD_ZAC_MGMT_IN */
        ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES = 0x00,

        /* Subcmds for ATA_CMD_ZAC_MGMT_OUT */
        ATA_SUBCMD_ZAC_MGMT_OUT_CLOSE_ZONE = 0x01,
        ATA_SUBCMD_ZAC_MGMT_OUT_FINISH_ZONE = 0x02,
        ATA_SUBCMD_ZAC_MGMT_OUT_OPEN_ZONE = 0x03,
        ATA_SUBCMD_ZAC_MGMT_OUT_RESET_WRITE_POINTER = 0x04,

        /* READ_LOG_EXT pages */
        ATA_LOG_DIRECTORY        = 0x0,
        ATA_LOG_SATA_NCQ        = 0x10,
        ATA_LOG_NCQ_NON_DATA        = 0x12,
        ATA_LOG_NCQ_SEND_RECV        = 0x13,
        ATA_LOG_IDENTIFY_DEVICE        = 0x30,

        /* Identify device log pages: */
        ATA_LOG_SECURITY          = 0x06,
        ATA_LOG_SATA_SETTINGS          = 0x08,
        ATA_LOG_ZONED_INFORMATION = 0x09,

        /* Identify device SATA settings log:*/
        ATA_LOG_DEVSLP_OFFSET          = 0x30,
        ATA_LOG_DEVSLP_SIZE          = 0x08,
        ATA_LOG_DEVSLP_MDAT          = 0x00,
        ATA_LOG_DEVSLP_MDAT_MASK  = 0x1F,
        ATA_LOG_DEVSLP_DETO          = 0x01,
        ATA_LOG_DEVSLP_VALID          = 0x07,
        ATA_LOG_DEVSLP_VALID_MASK = 0x80,
        ATA_LOG_NCQ_PRIO_OFFSET   = 0x09,

        /* NCQ send and receive log */
        ATA_LOG_NCQ_SEND_RECV_SUBCMDS_OFFSET        = 0x00,
        ATA_LOG_NCQ_SEND_RECV_SUBCMDS_DSM        = (1 << 0),
        ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET        = 0x04,
        ATA_LOG_NCQ_SEND_RECV_DSM_TRIM                = (1 << 0),
        ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET        = 0x08,
        ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED  = (1 << 0),
        ATA_LOG_NCQ_SEND_RECV_WR_LOG_OFFSET        = 0x0C,
        ATA_LOG_NCQ_SEND_RECV_WR_LOG_SUPPORTED  = (1 << 0),
        ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET        = 0x10,
        ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OUT_SUPPORTED = (1 << 0),
        ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED = (1 << 1),
        ATA_LOG_NCQ_SEND_RECV_SIZE                = 0x14,

        /* NCQ Non-Data log */
        ATA_LOG_NCQ_NON_DATA_SUBCMDS_OFFSET        = 0x00,
        ATA_LOG_NCQ_NON_DATA_ABORT_OFFSET        = 0x00,
        ATA_LOG_NCQ_NON_DATA_ABORT_NCQ                = (1 << 0),
        ATA_LOG_NCQ_NON_DATA_ABORT_ALL                = (1 << 1),
        ATA_LOG_NCQ_NON_DATA_ABORT_STREAMING        = (1 << 2),
        ATA_LOG_NCQ_NON_DATA_ABORT_NON_STREAMING = (1 << 3),
        ATA_LOG_NCQ_NON_DATA_ABORT_SELECTED        = (1 << 4),
        ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET        = 0x1C,
        ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT        = (1 << 0),
        ATA_LOG_NCQ_NON_DATA_SIZE                = 0x40,

        /* READ/WRITE LONG (obsolete) */
        ATA_CMD_READ_LONG        = 0x22,
        ATA_CMD_READ_LONG_ONCE        = 0x23,
        ATA_CMD_WRITE_LONG        = 0x32,
        ATA_CMD_WRITE_LONG_ONCE        = 0x33,

        /* SETFEATURES stuff */
        SETFEATURES_XFER        = 0x03,
        XFER_UDMA_7                = 0x47,
        XFER_UDMA_6                = 0x46,
        XFER_UDMA_5                = 0x45,
        XFER_UDMA_4                = 0x44,
        XFER_UDMA_3                = 0x43,
        XFER_UDMA_2                = 0x42,
        XFER_UDMA_1                = 0x41,
        XFER_UDMA_0                = 0x40,
        XFER_MW_DMA_4                = 0x24,        /* CFA only */
        XFER_MW_DMA_3                = 0x23,        /* CFA only */
        XFER_MW_DMA_2                = 0x22,
        XFER_MW_DMA_1                = 0x21,
        XFER_MW_DMA_0                = 0x20,
        XFER_SW_DMA_2                = 0x12,
        XFER_SW_DMA_1                = 0x11,
        XFER_SW_DMA_0                = 0x10,
        XFER_PIO_6                = 0x0E,        /* CFA only */
        XFER_PIO_5                = 0x0D,        /* CFA only */
        XFER_PIO_4                = 0x0C,
        XFER_PIO_3                = 0x0B,
        XFER_PIO_2                = 0x0A,
        XFER_PIO_1                = 0x09,
        XFER_PIO_0                = 0x08,
        XFER_PIO_SLOW                = 0x00,

        SETFEATURES_WC_ON        = 0x02, /* Enable write cache */
        SETFEATURES_WC_OFF        = 0x82, /* Disable write cache */

        SETFEATURES_RA_ON        = 0xaa, /* Enable read look-ahead */
        SETFEATURES_RA_OFF        = 0x55, /* Disable read look-ahead */

        /* Enable/Disable Automatic Acoustic Management */
        SETFEATURES_AAM_ON        = 0x42,
        SETFEATURES_AAM_OFF        = 0xC2,

        SETFEATURES_SPINUP                = 0x07, /* Spin-up drive */
        SETFEATURES_SPINUP_TIMEOUT        = 30000, /* 30s timeout for drive spin-up from PUIS */

        SETFEATURES_SATA_ENABLE = 0x10, /* Enable use of SATA feature */
        SETFEATURES_SATA_DISABLE = 0x90, /* Disable use of SATA feature */

        /* SETFEATURE Sector counts for SATA features */
        SATA_FPDMA_OFFSET        = 0x01,        /* FPDMA non-zero buffer offsets */
        SATA_FPDMA_AA                = 0x02, /* FPDMA Setup FIS Auto-Activate */
        SATA_DIPM                = 0x03,        /* Device Initiated Power Management */
        SATA_FPDMA_IN_ORDER        = 0x04,        /* FPDMA in-order data delivery */
        SATA_AN                        = 0x05,        /* Asynchronous Notification */
        SATA_SSP                = 0x06,        /* Software Settings Preservation */
        SATA_DEVSLP                = 0x09,        /* Device Sleep */

        SETFEATURE_SENSE_DATA        = 0xC3, /* Sense Data Reporting feature */

        /* feature values for SET_MAX */
        ATA_SET_MAX_ADDR        = 0x00,
        ATA_SET_MAX_PASSWD        = 0x01,
        ATA_SET_MAX_LOCK        = 0x02,
        ATA_SET_MAX_UNLOCK        = 0x03,
        ATA_SET_MAX_FREEZE_LOCK        = 0x04,
        ATA_SET_MAX_PASSWD_DMA        = 0x05,
        ATA_SET_MAX_UNLOCK_DMA        = 0x06,

        /* feature values for DEVICE CONFIGURATION OVERLAY */
        ATA_DCO_RESTORE                = 0xC0,
        ATA_DCO_FREEZE_LOCK        = 0xC1,
        ATA_DCO_IDENTIFY        = 0xC2,
        ATA_DCO_SET                = 0xC3,

        /* feature values for SMART */
        ATA_SMART_ENABLE        = 0xD8,
        ATA_SMART_READ_VALUES        = 0xD0,
        ATA_SMART_READ_THRESHOLDS = 0xD1,

        /* feature values for Data Set Management */
        ATA_DSM_TRIM                = 0x01,

        /* password used in LBA Mid / LBA High for executing SMART commands */
        ATA_SMART_LBAM_PASS        = 0x4F,
        ATA_SMART_LBAH_PASS        = 0xC2,

        /* ATAPI stuff */
        ATAPI_PKT_DMA                = (1 << 0),
        ATAPI_DMADIR                = (1 << 2),        /* ATAPI data dir:
                                                   0=to device, 1=to host */
        ATAPI_CDB_LEN                = 16,

        /* PMP stuff */
        SATA_PMP_MAX_PORTS        = 15,
        SATA_PMP_CTRL_PORT        = 15,

        SATA_PMP_GSCR_DWORDS        = 128,
        SATA_PMP_GSCR_PROD_ID        = 0,
        SATA_PMP_GSCR_REV        = 1,
        SATA_PMP_GSCR_PORT_INFO        = 2,
        SATA_PMP_GSCR_ERROR        = 32,
        SATA_PMP_GSCR_ERROR_EN        = 33,
        SATA_PMP_GSCR_FEAT        = 64,
        SATA_PMP_GSCR_FEAT_EN        = 96,

        SATA_PMP_PSCR_STATUS        = 0,
        SATA_PMP_PSCR_ERROR        = 1,
        SATA_PMP_PSCR_CONTROL        = 2,

        SATA_PMP_FEAT_BIST        = (1 << 0),
        SATA_PMP_FEAT_PMREQ        = (1 << 1),
        SATA_PMP_FEAT_DYNSSC        = (1 << 2),
        SATA_PMP_FEAT_NOTIFY        = (1 << 3),

        /* cable types */
        ATA_CBL_NONE                = 0,
        ATA_CBL_PATA40                = 1,
        ATA_CBL_PATA80                = 2,
        ATA_CBL_PATA40_SHORT        = 3,        /* 40 wire cable to high UDMA spec */
        ATA_CBL_PATA_UNK        = 4,        /* don't know, maybe 80c? */
        ATA_CBL_PATA_IGN        = 5,        /* don't know, ignore cable handling */
        ATA_CBL_SATA                = 6,

        /* SATA Status and Control Registers */
        SCR_STATUS                = 0,
        SCR_ERROR                = 1,
        SCR_CONTROL                = 2,
        SCR_ACTIVE                = 3,
        SCR_NOTIFICATION        = 4,

        /* SError bits */
        SERR_DATA_RECOVERED        = (1 << 0), /* recovered data error */
        SERR_COMM_RECOVERED        = (1 << 1), /* recovered comm failure */
        SERR_DATA                = (1 << 8), /* unrecovered data error */
        SERR_PERSISTENT                = (1 << 9), /* persistent data/comm error */
        SERR_PROTOCOL                = (1 << 10), /* protocol violation */
        SERR_INTERNAL                = (1 << 11), /* host internal error */
        SERR_PHYRDY_CHG                = (1 << 16), /* PHY RDY changed */
        SERR_PHY_INT_ERR        = (1 << 17), /* PHY internal error */
        SERR_COMM_WAKE                = (1 << 18), /* Comm wake */
        SERR_10B_8B_ERR                = (1 << 19), /* 10b to 8b decode error */
        SERR_DISPARITY                = (1 << 20), /* Disparity */
        SERR_CRC                = (1 << 21), /* CRC error */
        SERR_HANDSHAKE                = (1 << 22), /* Handshake error */
        SERR_LINK_SEQ_ERR        = (1 << 23), /* Link sequence error */
        SERR_TRANS_ST_ERROR        = (1 << 24), /* Transport state trans. error */
        SERR_UNRECOG_FIS        = (1 << 25), /* Unrecognized FIS */
        SERR_DEV_XCHG                = (1 << 26), /* device exchanged */
};

/*
 * Taskfile protocol identifiers.  The ATA_PROT_FLAG_* values are
 * independent bits; every named protocol below (other than
 * ATA_PROT_UNKNOWN) is defined as a combination of those bits, so a
 * protocol value can be tested against a flag with a bitwise AND.
 */
enum ata_prot_flags {
        /* protocol flags */
        ATA_PROT_FLAG_PIO        = (1 << 0), /* is PIO */
        ATA_PROT_FLAG_DMA        = (1 << 1), /* is DMA */
        ATA_PROT_FLAG_NCQ        = (1 << 2), /* is NCQ */
        ATA_PROT_FLAG_ATAPI        = (1 << 3), /* is ATAPI */

        /* taskfile protocols */
        ATA_PROT_UNKNOWN        = (u8)-1,        /* all eight bits set */
        ATA_PROT_NODATA                = 0,                /* no flag set */
        ATA_PROT_PIO                = ATA_PROT_FLAG_PIO,
        ATA_PROT_DMA                = ATA_PROT_FLAG_DMA,
        ATA_PROT_NCQ_NODATA        = ATA_PROT_FLAG_NCQ,
        ATA_PROT_NCQ                = ATA_PROT_FLAG_DMA | ATA_PROT_FLAG_NCQ,
        ATAPI_PROT_NODATA        = ATA_PROT_FLAG_ATAPI,
        ATAPI_PROT_PIO                = ATA_PROT_FLAG_ATAPI | ATA_PROT_FLAG_PIO,
        ATAPI_PROT_DMA                = ATA_PROT_FLAG_ATAPI | ATA_PROT_FLAG_DMA,
};

/*
 * ioctl request numbers; each carries the same value as the HDIO_*
 * request named in its trailing comment.
 */
enum ata_ioctls {
        ATA_IOC_GET_IO32        = 0x309, /* HDIO_GET_32BIT */
        ATA_IOC_SET_IO32        = 0x324, /* HDIO_SET_32BIT */
};

/* core structures */

/*
 * Two little-endian 32-bit fields, 8 bytes in total.
 * NOTE(review): presumably one entry of the bus-master DMA PRD table
 * (ATA_PRD_SZ is also 8, and ATA_PRD_EOT is described as the
 * end-of-table flag) -- confirm against the users of this struct.
 */
struct ata_bmdma_prd {
        __le32                        addr;
        __le32                        flags_len;
};

/*
 * id tests
 *
 * Every macro below takes @id, the IDENTIFY data indexed by 16-bit word
 * (the inline helpers in this file pass a const u16 *).  Several of them
 * expand @id more than once, so pass an expression without side effects.
 * Unless noted otherwise the result is zero/non-zero, not 0/1.
 */
/* bit 15 of the configuration word is clear (result is 0 or 1) */
#define ata_id_is_ata(id)        (((id)[ATA_ID_CONFIG] & (1 << 15)) == 0)
/* capability word, bit 9 */
#define ata_id_has_lba(id)        ((id)[ATA_ID_CAPABILITY] & (1 << 9))
/* capability word, bit 8 */
#define ata_id_has_dma(id)        ((id)[ATA_ID_CAPABILITY] & (1 << 8))
/* SATA capability word, bit 8 */
#define ata_id_has_ncq(id)        ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8))
/* low five bits of the queue depth word, plus one (range 1..32) */
#define ata_id_queue_depth(id)        (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1)
/* configuration word, bit 7 */
#define ata_id_removable(id)        ((id)[ATA_ID_CONFIG] & (1 << 7))
/* bits 0-2 of word ATA_ID_DLF are all set (result is 0 or 1) */
#define ata_id_is_locked(id)        (((id)[ATA_ID_DLF] & 0x7) == 0x7)
/*
 * The next five tests share one shape: the SATA capability word must be
 * neither 0x0000 nor 0xffff, and then a single bit of the feature support
 * word is tested (bit 5, 2, 8, 7 and 3 respectively).  Result is 0 or 1.
 */
#define ata_id_has_atapi_AN(id)        \
        ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
          ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
         ((id)[ATA_ID_FEATURE_SUPP] & (1 << 5)))
#define ata_id_has_fpdma_aa(id)        \
        ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
          ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
         ((id)[ATA_ID_FEATURE_SUPP] & (1 << 2)))
#define ata_id_has_devslp(id)        \
        ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
          ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
         ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8)))
#define ata_id_has_ncq_autosense(id) \
        ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
          ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
         ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)))
#define ata_id_has_dipm(id)        \
        ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
          ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
         ((id)[ATA_ID_FEATURE_SUPP] & (1 << 3)))
/* capability word, bit 10 */
#define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10))
/* capability word, bit 11 */
#define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11))
/* 32-bit value from words n (low half) and n + 1 (high half) */
#define ata_id_u32(id,n)        \
        (((u32) (id)[(n) + 1] << 16) | ((u32) (id)[(n)]))
/* 64-bit value from words n (least significant) through n + 3 */
#define ata_id_u64(id,n)        \
        ( ((u64) (id)[(n) + 3] << 48) |        \
          ((u64) (id)[(n) + 2] << 32) |        \
          ((u64) (id)[(n) + 1] << 16) |        \
          ((u64) (id)[(n) + 0]) )

/* configuration word: bit 5 set and bit 6 clear (result is 0 or 1) */
#define ata_id_cdb_intr(id)        (((id)[ATA_ID_CONFIG] & 0x60) == 0x20)
/* second SATA capability word, bit 4 */
#define ata_id_has_da(id)        ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4))

/**
 * ata_id_has_hipm - test bit 9 of the SATA capability word
 * @id: Identify data
 *
 * Return: false when the SATA capability word is 0x0000 or 0xffff,
 * otherwise whether bit 9 of that word is set.
 */
static inline bool ata_id_has_hipm(const u16 *id)
{
        const u16 sata_caps = id[ATA_ID_SATA_CAPABILITY];

        /* an all-zeroes or all-ones word is treated as "not reported" */
        if (sata_caps == 0x0000 || sata_caps == 0xffff)
                return false;

        return (sata_caps & (1 << 9)) != 0;
}

/**
 * ata_id_has_fua - test bit 6 of word ATA_ID_CFSSE
 * @id: Identify data
 *
 * Return: true only when the word has bit 15 clear, bit 14 set and
 * bit 6 set.
 */
static inline bool ata_id_has_fua(const u16 *id)
{
        const unsigned int cfsse = id[ATA_ID_CFSSE];

        /* bits 15:14 must read 01b before bit 6 is trusted */
        return (cfsse & 0xC000) == 0x4000 && (cfsse & (1 << 6)) != 0;
}

/**
 * ata_id_has_flush - test bit 12 of word ATA_ID_COMMAND_SET_2
 * @id: Identify data
 *
 * Return: true only when the word has bit 15 clear, bit 14 set and
 * bit 12 set.
 */
static inline bool ata_id_has_flush(const u16 *id)
{
        const unsigned int cmd_set_2 = id[ATA_ID_COMMAND_SET_2];

        /* bits 15:14 must read 01b before bit 12 is trusted */
        return (cmd_set_2 & 0xC000) == 0x4000 &&
               (cmd_set_2 & (1 << 12)) != 0;
}

/**
 * ata_id_flush_enabled - test bit 12 of word ATA_ID_CFS_ENABLE_2
 * @id: Identify data
 *
 * Return: true only when ata_id_has_flush() holds, word
 * ATA_ID_CSF_DEFAULT has bit 15 clear and bit 14 set, and bit 12 of
 * word ATA_ID_CFS_ENABLE_2 is set.
 */
static inline bool ata_id_flush_enabled(const u16 *id)
{
        if (!ata_id_has_flush(id))
                return false;

        return (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000 &&
               (id[ATA_ID_CFS_ENABLE_2] & (1 << 12)) != 0;
}

/**
 * ata_id_has_flush_ext - test bit 13 of word ATA_ID_COMMAND_SET_2
 * @id: Identify data
 *
 * Return: true only when the word has bit 15 clear, bit 14 set and
 * bit 13 set.
 */
static inline bool ata_id_has_flush_ext(const u16 *id)
{
        const unsigned int cmd_set_2 = id[ATA_ID_COMMAND_SET_2];

        /* bits 15:14 must read 01b before bit 13 is trusted */
        return (cmd_set_2 & 0xC000) == 0x4000 &&
               (cmd_set_2 & (1 << 13)) != 0;
}

/**
 * ata_id_flush_ext_enabled - test bits 13 and 10 of word ATA_ID_CFS_ENABLE_2
 * @id: Identify data
 *
 * Return: true only when ata_id_has_flush_ext() holds, word
 * ATA_ID_CSF_DEFAULT has bit 15 clear and bit 14 set, and both bit 13
 * and bit 10 of word ATA_ID_CFS_ENABLE_2 are set.
 */
static inline bool ata_id_flush_ext_enabled(const u16 *id)
{
        if (!ata_id_has_flush_ext(id))
                return false;

        /*
         * 0x2400 is bit 13 plus bit 10: some Maxtor disks have bit 13
         * defined incorrectly, so bit 10 is required as well.
         */
        return (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000 &&
               (id[ATA_ID_CFS_ENABLE_2] & 0x2400) == 0x2400;
}

static inline u32 ata_id_logical_sector_size(const u16 *id)
{
        /* T13/1699-D Revision 6a, Sep 6, 2008. Page 128.
         * IDENTIFY DEVICE data, word 117-118.
         * 0xd000 ignores bit 13 (logical:physical > 1)
         */
        if ((id[ATA_ID_SECTOR_SIZE] & 0xd000) == 0x5000)
                return (((id[ATA_ID_LOGICAL_SECTOR_SIZE+1] << 16)
                         + id[ATA_ID_LOGICAL_SECTOR_SIZE]) * sizeof(u16)) ;
        return ATA_SECT_SIZE;
}

/**
 * ata_id_log2_per_physical_sector - low nibble of the sector size word
 * @id: Identify data
 *
 * T13/1699-D Revision 6a, Sep 6, 2008. Page 128.
 * IDENTIFY DEVICE data, word 106.
 *
 * Return: bits 3:0 of word ATA_ID_SECTOR_SIZE when that word has bit 15
 * clear and bits 14 and 13 set; otherwise 0.
 */
static inline u8 ata_id_log2_per_physical_sector(const u16 *id)
{
        const unsigned int sector_size_word = id[ATA_ID_SECTOR_SIZE];

        /* 0xe000 ignores bit 12 (logical sector > 512 bytes) */
        if ((sector_size_word & 0xe000) != 0x6000)
                return 0;

        return sector_size_word & 0xf;
}

/**
 * ata_id_logical_sector_offset - offset of logical sectors relative to
 *                                physical sectors
 * @id: Identify data
 * @log2_per_phys: log2 of the number of logical sectors per physical sector
 *
 * If device has more than one logical sector per physical sector
 * (aka 512 byte emulation), vendors might offset the "sector 0" address
 * so sector 63 is "naturally aligned" - e.g. FAT partition table.
 * This avoids Read/Mod/Write penalties when using FAT partition table
 * and updating "well aligned" (FS perspective) physical sectors on every
 * transaction.
 *
 * Return: (1 << @log2_per_phys) minus the low 14 bits of word 209 when
 * @log2_per_phys is greater than 1, word 209 has bit 15 clear and bit 14
 * set, and those low 14 bits are non-zero; otherwise 0.
 */
static inline u16 ata_id_logical_sector_offset(const u16 *id,
         u8 log2_per_phys)
{
        const u16 alignment_word = id[209];
        u16 first_logical;

        /* nothing to report unless word 209 is flagged valid (01b in 15:14) */
        if (log2_per_phys <= 1 || (alignment_word & 0xc000) != 0x4000)
                return 0;

        first_logical = alignment_word & 0x3fff;
        if (first_logical == 0)
                return 0;

        return (1 << log2_per_phys) - first_logical;
}

/**
 * ata_id_has_lba48 - test bit 10 of word ATA_ID_COMMAND_SET_2
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_COMMAND_SET_2 has bit 15 clear,
 * bit 14 set and bit 10 set, and the 64-bit value read by ata_id_u64()
 * at ATA_ID_LBA_CAPACITY_2 is non-zero.
 */
static inline bool ata_id_has_lba48(const u16 *id)
{
        const unsigned int cmd_set_2 = id[ATA_ID_COMMAND_SET_2];

        if ((cmd_set_2 & 0xC000) != 0x4000)
                return false;
        /* a zero capacity in words 100-103 disqualifies the device */
        if (ata_id_u64(id, ATA_ID_LBA_CAPACITY_2) == 0)
                return false;

        return (cmd_set_2 & (1 << 10)) != 0;
}

/**
 * ata_id_lba48_enabled - test bit 10 of word ATA_ID_CFS_ENABLE_2
 * @id: Identify data
 *
 * Return: true only when ata_id_has_lba48() holds, word
 * ATA_ID_CSF_DEFAULT has bit 15 clear and bit 14 set, and bit 10 of
 * word ATA_ID_CFS_ENABLE_2 is set.
 */
static inline bool ata_id_lba48_enabled(const u16 *id)
{
        if (!ata_id_has_lba48(id))
                return false;

        return (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000 &&
               (id[ATA_ID_CFS_ENABLE_2] & (1 << 10)) != 0;
}

/**
 * ata_id_hpa_enabled - test bit 10 of words ATA_ID_COMMAND_SET_1 and
 *                      ATA_ID_CFS_ENABLE_1
 * @id: Identify data
 *
 * Return: true only when words ATA_ID_COMMAND_SET_2 and
 * ATA_ID_CSF_DEFAULT both have bit 15 clear and bit 14 set, and bit 10
 * is set in both ATA_ID_CFS_ENABLE_1 and ATA_ID_COMMAND_SET_1.
 */
static inline bool ata_id_hpa_enabled(const u16 *id)
{
        /*
         * Word 83 valid bits cover word 82 data, and word 87 valid
         * bits cover words 85-87.
         */
        const bool words_valid =
                (id[ATA_ID_COMMAND_SET_2] & 0xC000) == 0x4000 &&
                (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000;

        if (!words_valid)
                return false;

        /* the command set has to be enabled as well as supported */
        return (id[ATA_ID_CFS_ENABLE_1] & (1 << 10)) != 0 &&
               (id[ATA_ID_COMMAND_SET_1] & (1 << 10)) != 0;
}

/**
 * ata_id_has_wcache - test bit 5 of word ATA_ID_COMMAND_SET_1
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_COMMAND_SET_2 has bit 15 clear and
 * bit 14 set, and bit 5 of word ATA_ID_COMMAND_SET_1 is set.
 */
static inline bool ata_id_has_wcache(const u16 *id)
{
        /* word 83 valid bits cover word 82 data */
        return (id[ATA_ID_COMMAND_SET_2] & 0xC000) == 0x4000 &&
               (id[ATA_ID_COMMAND_SET_1] & (1 << 5)) != 0;
}

/**
 * ata_id_has_pm - test bit 3 of word ATA_ID_COMMAND_SET_1
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_COMMAND_SET_2 has bit 15 clear and
 * bit 14 set, and bit 3 of word ATA_ID_COMMAND_SET_1 is set.
 */
static inline bool ata_id_has_pm(const u16 *id)
{
        return (id[ATA_ID_COMMAND_SET_2] & 0xC000) == 0x4000 &&
               (id[ATA_ID_COMMAND_SET_1] & (1 << 3)) != 0;
}

/**
 * ata_id_rahead_enabled - test bit 6 of word ATA_ID_CFS_ENABLE_1
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_CSF_DEFAULT has bit 15 clear and
 * bit 14 set, and bit 6 of word ATA_ID_CFS_ENABLE_1 is set.
 */
static inline bool ata_id_rahead_enabled(const u16 *id)
{
        return (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000 &&
               (id[ATA_ID_CFS_ENABLE_1] & (1 << 6)) != 0;
}

/**
 * ata_id_wcache_enabled - test bit 5 of word ATA_ID_CFS_ENABLE_1
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_CSF_DEFAULT has bit 15 clear and
 * bit 14 set, and bit 5 of word ATA_ID_CFS_ENABLE_1 is set.
 */
static inline bool ata_id_wcache_enabled(const u16 *id)
{
        return (id[ATA_ID_CSF_DEFAULT] & 0xC000) == 0x4000 &&
               (id[ATA_ID_CFS_ENABLE_1] & (1 << 5)) != 0;
}

/**
 * ata_id_has_read_log_dma_ext - test bit 3 of words 119 and 120
 * @id: Identify data
 *
 * Return: true when word 86 has bit 15 set and at least one of words
 * 119 and 120 has bit 15 clear, bit 14 set and bit 3 set.
 */
static inline bool ata_id_has_read_log_dma_ext(const u16 *id)
{
        const unsigned int word_119 = id[ATA_ID_COMMAND_SET_3];
        const unsigned int word_120 = id[ATA_ID_COMMAND_SET_4];

        /* Word 86 must have bit 15 set */
        if ((id[ATA_ID_CFS_ENABLE_2] & (1 << 15)) == 0)
                return false;

        /*
         * READ LOG DMA EXT support can be signaled either from word 119
         * or from word 120. The format is the same for both words: Bit
         * 15 must be cleared, bit 14 set and bit 3 set.
         */
        return (word_119 & 0xC008) == 0x4008 ||
               (word_120 & 0xC008) == 0x4008;
}

/**
 * ata_id_has_sense_reporting - test bit 6 of word ATA_ID_COMMAND_SET_3
 * @id: Identify data
 *
 * Return: true only when word ATA_ID_CFS_ENABLE_2 has bit 15 set and
 * word ATA_ID_COMMAND_SET_3 has bit 15 clear, bit 14 set and bit 6 set.
 */
static inline bool ata_id_has_sense_reporting(const u16 *id)
{
        const unsigned long word_86 = id[ATA_ID_CFS_ENABLE_2];
        const unsigned long word_119 = id[ATA_ID_COMMAND_SET_3];

        return (word_86 & BIT(15)) &&
               (word_119 & (BIT(15) | BIT(14))) == BIT(14) &&
               (word_119 & BIT(6));
}

/*
 * Report whether bit 6 of ATA_ID_COMMAND_SET_4 is set (sense data
 * reporting enabled, going by the function name).
 *
 * Always false when ata_id_has_sense_reporting() is false, or when
 * bits 15:14 of ATA_ID_COMMAND_SET_4 do not read 01b.
 */
static inline bool ata_id_sense_reporting_enabled(const u16 *id)
{
        u16 enabled;

        if (!ata_id_has_sense_reporting(id))
                return false;

        /* ata_id_has_sense_reporting() == true, word 86 must have bit 15 set */
        enabled = id[ATA_ID_COMMAND_SET_4];
        if ((enabled & (BIT(15) | BIT(14))) != BIT(14))
                return false;
        return (enabled & BIT(6)) != 0;
}

/*
 * Word 206 - SCT Command Transport
 *    15:12 - Vendor Specific
 *     11:6 - Reserved
 *        5 - SCT Command Transport Data Tables supported
 *        4 - SCT Command Transport Features Control supported
 *        3 - SCT Command Transport Error Recovery Control supported
 *        2 - SCT Command Transport Write Same supported
 *        1 - SCT Command Transport Long Sector Access supported
 *        0 - SCT Command Transport supported
 */

/* True when bit 5 (SCT Data Tables supported) is set. */
static inline bool ata_id_sct_data_tables(const u16 *id)
{
        return (id[ATA_ID_SCT_CMD_XPORT] & (1 << 5)) != 0;
}

/* True when bit 4 of ATA_ID_SCT_CMD_XPORT (SCT Features Control) is set. */
static inline bool ata_id_sct_features_ctrl(const u16 *id)
{
        return (id[ATA_ID_SCT_CMD_XPORT] & (1 << 4)) != 0;
}

/* True when bit 3 of ATA_ID_SCT_CMD_XPORT (SCT Error Recovery Control) is set. */
static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id)
{
        return (id[ATA_ID_SCT_CMD_XPORT] & (1 << 3)) != 0;
}

/* True when bit 1 of ATA_ID_SCT_CMD_XPORT (SCT Long Sector Access) is set. */
static inline bool ata_id_sct_long_sector_access(const u16 *id)
{
        return (id[ATA_ID_SCT_CMD_XPORT] & (1 << 1)) != 0;
}

/* True when bit 0 of ATA_ID_SCT_CMD_XPORT (SCT Command Transport) is set. */
static inline bool ata_id_sct_supported(const u16 *id)
{
        return (id[ATA_ID_SCT_CMD_XPORT] & (1 << 0)) != 0;
}

/**
 *        ata_id_major_version        -        get ATA level of drive
 *        @id: Identify data
 *
 *        Returns the index of the highest bit set among bits 14..1 of
 *        the major version word, or 0 when none of them is set or the
 *        word reads 0xFFFF.
 *
 *        Caveats:
 *                ATA-1 considers identify optional
 *                ATA-2 introduces mandatory identify
 *                ATA-3 introduces word 80 and accurate reporting
 *
 *        The practical impact of this is that ata_id_major_version cannot
 *        reliably report on drives below ATA3.
 */

static inline unsigned int ata_id_major_version(const u16 *id)
{
        const u16 word = id[ATA_ID_MAJOR_VER];
        unsigned int bit = 14;

        if (word == 0xFFFF)
                return 0;

        /* Walk downwards; running out of bits leaves the result at 0. */
        while (bit >= 1 && !(word & (1 << bit)))
                bit--;
        return bit;
}

/*
 * Decide from the IDENTIFY data whether the device is SATA.
 */
static inline bool ata_id_is_sata(const u16 *id)
{
        /*
         * Two conditions: word 93 is 0, and the drive is at least ATA-5
         * compatible.  The version word (word 80) is compared as a signed
         * value on purpose: that rejects the reserved values 0x0000 and
         * 0xffff together with the earlier ATA revisions.
         */
        return id[ATA_ID_HW_CONFIG] == 0 &&
               (short)id[ATA_ID_MAJOR_VER] >= 0x0020;
}

/*
 * Report whether bit 0 of word 48 is set on a device whose major version
 * is at least 8 and whose word 48 carries the 01b signature in bits 15:14.
 */
static inline bool ata_id_has_tpm(const u16 *id)
{
        u16 word48;

        /* The TPM bits are only valid on ATA8 */
        if (ata_id_major_version(id) < 8)
                return false;

        word48 = id[48];
        if ((word48 & 0xC000) != 0x4000)
                return false;
        return (word48 & (1 << 0)) != 0;
}

/*
 * Report whether bit 0 of ATA_ID_DWORD_IO is set.  Only meaningful up to
 * major version 7; always false for later versions.
 */
static inline bool ata_id_has_dword_io(const u16 *id)
{
        /* ATA 8 reuses this flag for "trusted" computing */
        if (ata_id_major_version(id) <= 7)
                return (id[ATA_ID_DWORD_IO] & (1 << 0)) != 0;
        return false;
}

/*
 * Report whether bit 0 of ATA_ID_TRUSTED is set.  Always false for
 * devices whose major version is 7 or lower.
 */
static inline bool ata_id_has_trusted(const u16 *id)
{
        if (ata_id_major_version(id) > 7)
                return (id[ATA_ID_TRUSTED] & (1 << 0)) != 0;
        return false;
}

/*
 * Report whether bit 13 of ATA_ID_CFSSE is set on a device of major
 * version 7 or later.  The word must also carry the 01b validity
 * signature in bits 15:14.
 */
static inline bool ata_id_has_unload(const u16 *id)
{
        u16 cfsse;

        if (ata_id_major_version(id) < 7)
                return false;

        cfsse = id[ATA_ID_CFSSE];
        if ((cfsse & 0xC000) != 0x4000)
                return false;
        return (cfsse & (1 << 13)) != 0;
}

/*
 * True when ATA_ID_CSF_DEFAULT has bit 15 clear, bit 14 set and bit 8 set
 * (a world wide name is present, going by the function name).
 */
static inline bool ata_id_has_wwn(const u16 *id)
{
        const u16 masked = id[ATA_ID_CSF_DEFAULT] & 0xC100;

        return masked == 0x4100;
}

/*
 * Return the low nibble of IDENTIFY word 168 when it lies in 1..5, and 0
 * in every other case: major version below 7, word equal to 0 or 0xffff,
 * or nibble outside that range.
 */
static inline int ata_id_form_factor(const u16 *id)
{
        const u16 raw = id[168];
        u16 nibble;

        if (ata_id_major_version(id) < 7)
                return 0;
        if (raw == 0 || raw == 0xffff)
                return 0;

        nibble = raw & 0xf;
        return nibble > 5 ? 0 : nibble;
}

/*
 * Return IDENTIFY word 217 when it is 1 or lies in 0x0401..0xfffe.
 * Return 0 otherwise: major version below 7, word equal to 0 or 0xffff,
 * or word in 0x0002..0x0400.
 */
static inline int ata_id_rotation_rate(const u16 *id)
{
        const u16 rate = id[217];

        if (ata_id_major_version(id) < 7)
                return 0;
        if (rate == 0 || rate == 0xffff)
                return 0;
        if (rate >= 2 && rate <= 0x400)
                return 0;

        return rate;
}

/* True when bit 6 of ATA_ID_SATA_CAPABILITY_2 is set. */
static inline bool ata_id_has_ncq_send_and_recv(const u16 *id)
{
        return (id[ATA_ID_SATA_CAPABILITY_2] & BIT(6)) != 0;
}

/* True when bit 5 of ATA_ID_SATA_CAPABILITY_2 is set. */
static inline bool ata_id_has_ncq_non_data(const u16 *id)
{
        return (id[ATA_ID_SATA_CAPABILITY_2] & BIT(5)) != 0;
}

/* True when bit 12 of ATA_ID_SATA_CAPABILITY is set. */
static inline bool ata_id_has_ncq_prio(const u16 *id)
{
        return (id[ATA_ID_SATA_CAPABILITY] & BIT(12)) != 0;
}

/*
 * True when the device reports major version 7 or later and bit 0 of
 * ATA_ID_DATA_SET_MGMT is set.
 */
static inline bool ata_id_has_trim(const u16 *id)
{
        if (ata_id_major_version(id) < 7)
                return false;
        return (id[ATA_ID_DATA_SET_MGMT] & 1) != 0;
}

/*
 * True when ata_id_has_trim() holds and both bits 14 and 5 of
 * ATA_ID_ADDITIONAL_SUPP are set.
 */
static inline bool ata_id_has_zero_after_trim(const u16 *id)
{
        /* DSM supported, deterministic read, and read zero after trim set */
        if (!ata_id_has_trim(id))
                return false;

        return (id[ATA_ID_ADDITIONAL_SUPP] & 0x4020) == 0x4020;
}

/*
 * Check that the "current translation" CHS fields of the IDENTIFY data
 * are flagged valid and hold usable values.
 */
static inline bool ata_id_current_chs_valid(const u16 *id)
{
        /*
         * For ATA-1 devices, if the INITIALIZE DEVICE PARAMETERS command
         * has not been issued to the device then the values of
         * id[ATA_ID_CUR_CYLS] to id[ATA_ID_CUR_SECTORS] are vendor specific.
         */

        /* Current translation valid */
        if (!(id[ATA_ID_FIELD_VALID] & 1))
                return false;
        /* cylinders in current translation */
        if (id[ATA_ID_CUR_CYLS] == 0)
                return false;
        /* heads in current translation: 1..16 */
        if (id[ATA_ID_CUR_HEADS] == 0 || id[ATA_ID_CUR_HEADS] > 16)
                return false;
        /* sectors in current translation */
        return id[ATA_ID_CUR_SECTORS] != 0;
}

/*
 * Decide from the IDENTIFY data whether the device is a CompactFlash
 * (CFA) device.
 */
static inline bool ata_id_is_cfa(const u16 *id)
{
        switch (id[ATA_ID_CONFIG]) {
        case 0x848A:        /* Traditional CF */
        case 0x844A:        /* Delkin Devices CF */
                return true;
        default:
                break;
        }

        /*
         * The CF specs no longer require a specific value in word 0.
         * In that case they forbid reporting the ATA version in word 80
         * and require CFA feature set support to be indicated in word 83.
         * Unfortunately, some cards follow only one of these two
         * requirements.  Cards that do not indicate CFA feature support
         * would need some sort of quirk list, whereas for the ones that do
         * a list seems impractical, hence this check.
         */
        return (id[ATA_ID_COMMAND_SET_2] & 0xC004) == 0x4004;
}

/* True when ATA_ID_ROT_SPEED holds exactly 0x01. */
static inline bool ata_id_is_ssd(const u16 *id)
{
        const u16 rot_speed = id[ATA_ID_ROT_SPEED];

        return rot_speed == 0x01;
}

/* Return the two low bits of ATA_ID_ADDITIONAL_SUPP. */
static inline u8 ata_id_zoned_cap(const u16 *id)
{
        const u16 supp = id[ATA_ID_ADDITIONAL_SUPP];

        return supp & 0x3;
}

/*
 * Decide whether IORDY should be used for PIO mode @pio on the device
 * described by @id.
 */
static inline bool ata_id_pio_need_iordy(const u16 *id, const u8 pio)
{
        /* CF spec. r4.1 Table 22 says no IORDY on PIO5 and PIO6. */
        if (pio > 4 && ata_id_is_cfa(id))
                return false;

        /*
         * For PIO3 and higher it is mandatory; below that, turn it on
         * when possible.
         */
        return pio > 2 || ata_id_has_iordy(id);
}

/*
 * Return true unless the device is SATA or bits 15:13 of
 * ATA_ID_HW_CONFIG read 011b (80 wire).
 */
static inline bool ata_drive_40wire(const u16 *dev_id)
{
        if (ata_id_is_sata(dev_id))
                return false;        /* SATA */

        /* 0x6000 in the top three bits means 80 wire */
        return (dev_id[ATA_ID_HW_CONFIG] & 0xE000) != 0x6000;
}

/*
 * Relaxed variant of the 40-wire test: only bit 13 of ATA_ID_HW_CONFIG is
 * looked at.  Bit set means 80 wire, so the function returns false.
 */
static inline bool ata_drive_40wire_relaxed(const u16 *dev_id)
{
        return (dev_id[ATA_ID_HW_CONFIG] & 0x2000) != 0x2000;
}

/*
 * Map the two low bits of ATA_ID_CONFIG to a CDB length:
 * 0 -> 12, 1 -> 16, anything else -> -1.
 */
static inline int atapi_cdb_len(const u16 *dev_id)
{
        const u16 code = dev_id[ATA_ID_CONFIG] & 0x3;

        if (code == 0)
                return 12;
        if (code == 1)
                return 16;
        return -1;
}

/* Return bits 12:8 of ATA_ID_CONFIG. */
static inline int atapi_command_packet_set(const u16 *dev_id)
{
        const u16 config = dev_id[ATA_ID_CONFIG];

        return (config >> 8) & 0x1f;
}

/*
 * True when the device reports major version 7 or later and bit 15 of
 * IDENTIFY word 62 is set.
 */
static inline bool atapi_id_dmadir(const u16 *dev_id)
{
        if (ata_id_major_version(dev_id) < 7)
                return false;
        return (dev_id[62] & 0x8000) != 0;
}

/*
 * ata_id_is_lba_capacity_ok() performs a sanity check on
 * the claimed LBA capacity value for the device.
 *
 * @id: IDENTIFY data.  May be modified: when the capacity only looks
 *      sensible with its two 16-bit halves swapped, the swapped value is
 *      written back to the ATA_ID_LBA_CAPACITY words.
 *
 * Returns true if LBA capacity looks sensible, false otherwise.
 *
 * It is called only once for each device.
 */
static inline bool ata_id_is_lba_capacity_ok(u16 *id)
{
        unsigned long lba_sects, chs_sects, head, tail;

        /* No non-LBA info .. so valid! */
        if (id[ATA_ID_CYLS] == 0)
                return true;

        lba_sects = ata_id_u32(id, ATA_ID_LBA_CAPACITY);

        /*
         * The ATA spec tells large drives to return
         * C/H/S = 16383/16/63 independent of their size.
         * Some drives can be jumpered to use 15 heads instead of 16.
         * Some drives can be jumpered to use 4092 cyls instead of 16383.
         */
        if ((id[ATA_ID_CYLS] == 16383 ||
             (id[ATA_ID_CYLS] == 4092 && id[ATA_ID_CUR_CYLS] == 16383)) &&
            id[ATA_ID_SECTORS] == 63 &&
            (id[ATA_ID_HEADS] == 15 || id[ATA_ID_HEADS] == 16) &&
            (lba_sects >= 16383 * 63 * id[ATA_ID_HEADS]))
                return true;

        /*
         * Widen before multiplying: the u16 operands would otherwise be
         * promoted to int, and bogus geometry values can push the product
         * past INT_MAX (signed overflow).
         */
        chs_sects = (unsigned long)id[ATA_ID_CYLS] * id[ATA_ID_HEADS] *
                    id[ATA_ID_SECTORS];

        /*
         * perform a rough sanity check on lba_sects: within 10% is OK.
         * The subtraction is unsigned, so lba_sects < chs_sects wraps to
         * a huge value and fails the test.
         */
        if (lba_sects - chs_sects < chs_sects/10)
                return true;

        /* some drives have the word order reversed */
        head = (lba_sects >> 16) & 0xffff;
        tail = lba_sects & 0xffff;
        lba_sects = head | (tail << 16);

        if (lba_sects - chs_sects < chs_sects/10) {
                *(__le32 *)&id[ATA_ID_LBA_CAPACITY] = __cpu_to_le32(lba_sects);
                return true;        /* LBA capacity is (now) good */
        }

        return false;        /* LBA capacity value may be bad */
}

/*
 * Rewrite selected words of the IDENTIFY data in place so that they can be
 * accessed through struct hd_driveid (see the comments below).
 *
 * Only does anything when __BIG_ENDIAN is defined; otherwise the body is
 * empty and @id is left untouched.
 */
static inline void ata_id_to_hd_driveid(u16 *id)
{
#ifdef __BIG_ENDIAN
        /* accessed in struct hd_driveid as 8-bit values */
        id[ATA_ID_MAX_MULTSECT]         = __cpu_to_le16(id[ATA_ID_MAX_MULTSECT]);
        id[ATA_ID_CAPABILITY]         = __cpu_to_le16(id[ATA_ID_CAPABILITY]);
        id[ATA_ID_OLD_PIO_MODES] = __cpu_to_le16(id[ATA_ID_OLD_PIO_MODES]);
        id[ATA_ID_OLD_DMA_MODES] = __cpu_to_le16(id[ATA_ID_OLD_DMA_MODES]);
        id[ATA_ID_MULTSECT]         = __cpu_to_le16(id[ATA_ID_MULTSECT]);

        /* as 32-bit values */
        *(u32 *)&id[ATA_ID_LBA_CAPACITY] = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
        *(u32 *)&id[ATA_ID_SPG]                 = ata_id_u32(id, ATA_ID_SPG);

        /* as 64-bit value */
        *(u64 *)&id[ATA_ID_LBA_CAPACITY_2] =
                ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
#endif
}

/*
 * True when, of the BUSY, DRDY, DF, DRQ and ERR status bits, DRDY is the
 * only one set in @status.
 */
static inline bool ata_ok(u8 status)
{
        const u8 watched = ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | ATA_ERR;

        return (status & watched) == ATA_DRDY;
}

/*
 * Check whether a request of @n_block sectors starting at @block may be
 * issued with a 28-bit LBA command.
 *
 * The request qualifies when @n_block does not exceed ATA_MAX_SECTORS and
 * block + n_block is LESS THAN 0x0fffffff.
 */
static inline bool lba_28_ok(u64 block, u32 n_block)
{
        const u64 limit = (1 << 28) - 1;

        if (n_block > ATA_MAX_SECTORS)
                return false;

        /*
         * Reject out-of-range start blocks up front.  Besides being
         * correct on its own, this keeps block + n_block from wrapping
         * around in u64 and slipping under the limit.
         */
        if (block >= limit)
                return false;

        /* check the ending block number: must be LESS THAN 0x0fffffff */
        return block + n_block < limit;
}

/*
 * Check whether a request of @n_block sectors starting at @block may be
 * issued with a 48-bit LBA command.
 *
 * The request qualifies when @n_block does not exceed
 * ATA_MAX_SECTORS_LBA48 and its last block, block + n_block - 1, is below
 * 2^48.
 */
static inline bool lba_48_ok(u64 block, u32 n_block)
{
        const u64 limit = (u64)1 << 48;

        if (n_block > ATA_MAX_SECTORS_LBA48)
                return false;

        /*
         * A start block beyond the limit can never qualify.  Rejecting it
         * here also keeps block + n_block from wrapping around in u64 and
         * slipping under the limit.
         */
        if (block > limit)
                return false;

        /* check the ending block number */
        return (block + n_block - 1) < limit;
}

/*
 * Field extractors for a GSCR array (indexed by the SATA_PMP_GSCR_*
 * constants):
 *   vendor - low 16 bits of the PROD_ID entry
 *   devid  - PROD_ID entry shifted right by 16
 *   rev    - bits 15:8 of the REV entry
 *   ports  - low 4 bits of the PORT_INFO entry
 */
#define sata_pmp_gscr_vendor(gscr)        ((gscr)[SATA_PMP_GSCR_PROD_ID] & 0xffff)
#define sata_pmp_gscr_devid(gscr)        ((gscr)[SATA_PMP_GSCR_PROD_ID] >> 16)
#define sata_pmp_gscr_rev(gscr)                (((gscr)[SATA_PMP_GSCR_REV] >> 8) & 0xff)
#define sata_pmp_gscr_ports(gscr)        ((gscr)[SATA_PMP_GSCR_PORT_INFO] & 0xf)

#endif /* __LINUX_ATA_H__ */















































































    1 

















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This header is for implementations of dma_map_ops and related code.
 * It should not be included in drivers just using the DMA API.
 */
#ifndef _LINUX_DMA_MAP_OPS_H
#define _LINUX_DMA_MAP_OPS_H

#include <linux/dma-mapping.h>
#include <linux/pgtable.h>

struct cma;

/*
 * Table of callbacks implementing the DMA operations for a device.
 * The table that applies to a given device is looked up with
 * get_dma_ops().
 */
struct dma_map_ops {
        void *(*alloc)(struct device *dev, size_t size,
                        dma_addr_t *dma_handle, gfp_t gfp,
                        unsigned long attrs);
        void (*free)(struct device *dev, size_t size, void *vaddr,
                        dma_addr_t dma_handle, unsigned long attrs);
        struct page *(*alloc_pages)(struct device *dev, size_t size,
                        dma_addr_t *dma_handle, enum dma_data_direction dir,
                        gfp_t gfp);
        void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
                        dma_addr_t dma_handle, enum dma_data_direction dir);
        void *(*alloc_noncoherent)(struct device *dev, size_t size,
                        dma_addr_t *dma_handle, enum dma_data_direction dir,
                        gfp_t gfp);
        void (*free_noncoherent)(struct device *dev, size_t size, void *vaddr,
                        dma_addr_t dma_handle, enum dma_data_direction dir);
        int (*mmap)(struct device *, struct vm_area_struct *,
                        void *, dma_addr_t, size_t, unsigned long attrs);

        int (*get_sgtable)(struct device *dev, struct sg_table *sgt,
                        void *cpu_addr, dma_addr_t dma_addr, size_t size,
                        unsigned long attrs);

        dma_addr_t (*map_page)(struct device *dev, struct page *page,
                        unsigned long offset, size_t size,
                        enum dma_data_direction dir, unsigned long attrs);
        void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
                        size_t size, enum dma_data_direction dir,
                        unsigned long attrs);
        /*
         * map_sg returns 0 on error and a value > 0 on success.
         * It should never return a value < 0.
         */
        int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
                        enum dma_data_direction dir, unsigned long attrs);
        void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents,
                        enum dma_data_direction dir, unsigned long attrs);
        dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr,
                        size_t size, enum dma_data_direction dir,
                        unsigned long attrs);
        void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle,
                        size_t size, enum dma_data_direction dir,
                        unsigned long attrs);
        void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle,
                        size_t size, enum dma_data_direction dir);
        void (*sync_single_for_device)(struct device *dev,
                        dma_addr_t dma_handle, size_t size,
                        enum dma_data_direction dir);
        void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
                        int nents, enum dma_data_direction dir);
        void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg,
                        int nents, enum dma_data_direction dir);
        void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
                        enum dma_data_direction direction);
        int (*dma_supported)(struct device *dev, u64 mask);
        u64 (*get_required_mask)(struct device *dev);
        size_t (*max_mapping_size)(struct device *dev);
        unsigned long (*get_merge_boundary)(struct device *dev);
};

#ifdef CONFIG_DMA_OPS
#include <asm/dma-mapping.h>

/*
 * Return the DMA ops for @dev: the per-device table when one is set,
 * otherwise whatever get_arch_dma_ops() yields for the device's bus.
 */
static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
{
        const struct dma_map_ops *ops = dev->dma_ops;

        return ops ? ops : get_arch_dma_ops(dev->bus);
}

/* Install @dma_ops as the per-device DMA ops table of @dev. */
static inline void set_dma_ops(struct device *dev,
                               const struct dma_map_ops *dma_ops)
{
        dev->dma_ops = dma_ops;
}
#else /* CONFIG_DMA_OPS */
/* CONFIG_DMA_OPS disabled: there is no ops table, always returns NULL. */
static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
{
        return NULL;
}
/* CONFIG_DMA_OPS disabled: nothing to store, the call is a no-op. */
static inline void set_dma_ops(struct device *dev,
                               const struct dma_map_ops *dma_ops)
{
}
#endif /* CONFIG_DMA_OPS */

#ifdef CONFIG_DMA_CMA
extern struct cma *dma_contiguous_default_area;

/*
 * Return the CMA area to use for @dev: the device's own area when @dev is
 * non-NULL and has one, dma_contiguous_default_area otherwise.
 */
static inline struct cma *dev_get_cma_area(struct device *dev)
{
        struct cma *area = dma_contiguous_default_area;

        if (dev && dev->cma_area)
                area = dev->cma_area;
        return area;
}

void dma_contiguous_reserve(phys_addr_t addr_limit);
int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
                phys_addr_t limit, struct cma **res_cma, bool fixed);

struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
                                       unsigned int order, bool no_warn);
bool dma_release_from_contiguous(struct device *dev, struct page *pages,
                                 int count);
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp);
void dma_free_contiguous(struct device *dev, struct page *page, size_t size);

void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size);
#else /* CONFIG_DMA_CMA */
/* CONFIG_DMA_CMA disabled: there is no CMA area, always returns NULL. */
static inline struct cma *dev_get_cma_area(struct device *dev)
{
        return NULL;
}
/* CONFIG_DMA_CMA disabled: nothing is reserved, the call is a no-op. */
static inline void dma_contiguous_reserve(phys_addr_t limit)
{
}
/* CONFIG_DMA_CMA disabled: always fails with -ENOSYS. */
static inline int dma_contiguous_reserve_area(phys_addr_t size,
                phys_addr_t base, phys_addr_t limit, struct cma **res_cma,
                bool fixed)
{
        return -ENOSYS;
}
/* CONFIG_DMA_CMA disabled: no pages can be handed out, returns NULL. */
static inline struct page *dma_alloc_from_contiguous(struct device *dev,
                size_t count, unsigned int order, bool no_warn)
{
        return NULL;
}
/* CONFIG_DMA_CMA disabled: nothing is released, always returns false. */
static inline bool dma_release_from_contiguous(struct device *dev,
                struct page *pages, int count)
{
        return false;
}
/*
 * Use fallback alloc() and free() when CONFIG_DMA_CMA=n.
 *
 * This allocation stub always returns NULL.
 */
static inline struct page *dma_alloc_contiguous(struct device *dev, size_t size,
                gfp_t gfp)
{
        return NULL;
}
/*
 * CONFIG_DMA_CMA disabled: release @page through __free_pages(), using
 * the order that get_order() derives from @size.
 */
static inline void dma_free_contiguous(struct device *dev, struct page *page,
                size_t size)
{
        __free_pages(page, get_order(size));
}
#endif /* CONFIG_DMA_CMA*/

#ifdef CONFIG_DMA_PERNUMA_CMA
void dma_pernuma_cma_reserve(void);
#else
/* CONFIG_DMA_PERNUMA_CMA disabled: the call is a no-op. */
static inline void dma_pernuma_cma_reserve(void) { }
#endif /* CONFIG_DMA_PERNUMA_CMA */

#ifdef CONFIG_DMA_DECLARE_COHERENT
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
                dma_addr_t device_addr, size_t size);
void dma_release_coherent_memory(struct device *dev);
int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
                dma_addr_t *dma_handle, void **ret);
int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr);
int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
                void *cpu_addr, size_t size, int *ret);

void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
                dma_addr_t *dma_handle);
int dma_release_from_global_coherent(int order, void *vaddr);
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr,
                size_t size, int *ret);

#else
/* CONFIG_DMA_DECLARE_COHERENT disabled: always fails with -ENOSYS. */
static inline int dma_declare_coherent_memory(struct device *dev,
                phys_addr_t phys_addr, dma_addr_t device_addr, size_t size)
{
        return -ENOSYS;
}

/*
 * CONFIG_DMA_DECLARE_COHERENT disabled: the per-device coherent helpers
 * collapse to the constant 0.  Being macros, they do not evaluate any of
 * their arguments.
 */
#define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
#define dma_release_from_dev_coherent(dev, order, vaddr) (0)
#define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
/* CONFIG_DMA_DECLARE_COHERENT disabled: the call is a no-op. */
static inline void dma_release_coherent_memory(struct device *dev) { }

/* CONFIG_DMA_DECLARE_COHERENT disabled: nothing to allocate from, returns NULL. */
static inline void *dma_alloc_from_global_coherent(struct device *dev,
                ssize_t size, dma_addr_t *dma_handle)
{
        return NULL;
}
/* CONFIG_DMA_DECLARE_COHERENT disabled: always returns 0. */
static inline int dma_release_from_global_coherent(int order, void *vaddr)
{
        return 0;
}
/* CONFIG_DMA_DECLARE_COHERENT disabled: always returns 0, @ret is not written. */
static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
                void *cpu_addr, size_t size, int *ret)
{
        return 0;
}
#endif /* CONFIG_DMA_DECLARE_COHERENT */

int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs);
int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs);
struct page *dma_common_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
void dma_common_free_pages(struct device *dev, size_t size, struct page *vaddr,
                dma_addr_t dma_handle, enum dma_data_direction dir);

struct page **dma_common_find_pages(void *cpu_addr);
void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot,
                const void *caller);
void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot,
                const void *caller);
void dma_common_free_remap(void *cpu_addr, size_t size);

struct page *dma_alloc_from_pool(struct device *dev, size_t size,
                void **cpu_addr, gfp_t flags,
                bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t));
bool dma_free_from_pool(struct device *dev, void *start, size_t size);

#ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H
#include <asm/dma-coherence.h>
#elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
        defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
        defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
/*
 * Variant used when at least one of the arch sync hooks is configured:
 * coherence is reported per device from dev->dma_coherent.
 */
static inline bool dev_is_dma_coherent(struct device *dev)
{
        return dev->dma_coherent;
}
#else
/*
 * Fallback variant, used when neither <asm/dma-coherence.h> nor any of the
 * arch sync hooks is configured: every device is reported as coherent.
 */
static inline bool dev_is_dma_coherent(struct device *dev)
{
        return true;
}
#endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */

void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
                gfp_t gfp, unsigned long attrs);
void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
                dma_addr_t dma_addr, unsigned long attrs);

#ifdef CONFIG_MMU
/*
 * Page protection so that devices that can't snoop CPU caches can use the
 * memory coherently.  We default to pgprot_noncached which is usually used
 * for ioremap as a safe bet, but architectures can override this with less
 * strict semantics if possible.
 *
 * The default below only applies when pgprot_dmacoherent has not already
 * been defined.
 */
#ifndef pgprot_dmacoherent
#define pgprot_dmacoherent(prot)        pgprot_noncached(prot)
#endif

pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs);
#else
/* CONFIG_MMU disabled: @prot is handed back unchanged, @dev and @attrs are ignored. */
static inline pgprot_t dma_pgprot(struct device *dev, pgprot_t prot,
                unsigned long attrs)
{
        return prot;        /* no protection bits supported without page tables */
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
                enum dma_data_direction dir);
#else
/* CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE disabled: the call is a no-op. */
static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
                enum dma_data_direction dir)
{
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
                enum dma_data_direction dir);
#else
/* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU disabled: the call is a no-op. */
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
                enum dma_data_direction dir)
{
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(void);
#else
/* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL disabled: the call is a no-op. */
static inline void arch_sync_dma_for_cpu_all(void)
{
}
#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */

#ifdef CONFIG_ARCH_HAS_DMA_PREP_COHERENT
void arch_dma_prep_coherent(struct page *page, size_t size);
#else
/* CONFIG_ARCH_HAS_DMA_PREP_COHERENT disabled: the call is a no-op. */
static inline void arch_dma_prep_coherent(struct page *page, size_t size)
{
}
#endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */

#ifdef CONFIG_ARCH_HAS_DMA_MARK_CLEAN
void arch_dma_mark_clean(phys_addr_t paddr, size_t size);
#else
/* CONFIG_ARCH_HAS_DMA_MARK_CLEAN disabled: the call is a no-op. */
static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
{
}
#endif /* ARCH_HAS_DMA_MARK_CLEAN */

void *arch_dma_set_uncached(void *addr, size_t size);
void arch_dma_clear_uncached(void *addr, size_t size);

#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
                const struct iommu_ops *iommu, bool coherent);
#else
/* CONFIG_ARCH_HAS_SETUP_DMA_OPS disabled: the call is a no-op. */
static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
                u64 size, const struct iommu_ops *iommu, bool coherent)
{
}
#endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */

#ifdef CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS
void arch_teardown_dma_ops(struct device *dev);
#else
/* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS disabled: the call is a no-op. */
static inline void arch_teardown_dma_ops(struct device *dev)
{
}
#endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */

#ifdef CONFIG_DMA_API_DEBUG
void dma_debug_add_bus(struct bus_type *bus);
void debug_dma_dump_mappings(struct device *dev);
#else
/* CONFIG_DMA_API_DEBUG disabled: the call is a no-op. */
static inline void dma_debug_add_bus(struct bus_type *bus)
{
}
/* CONFIG_DMA_API_DEBUG disabled: nothing is dumped, the call is a no-op. */
static inline void debug_dma_dump_mappings(struct device *dev)
{
}
#endif /* CONFIG_DMA_API_DEBUG */

extern const struct dma_map_ops dma_dummy_ops;

#endif /* _LINUX_DMA_MAP_OPS_H */




































































































































































































    1 

    1 

    1 

    1 


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// SPDX-License-Identifier: GPL-2.0
/*
 * High-level sync()-related operations
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
#include "internal.h"

/* Union of the three SYNC_FILE_RANGE_* flag bits. */
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
                        SYNC_FILE_RANGE_WAIT_AFTER)

/*
 * Write out and wait upon all dirty data associated with this
 * superblock.  Filesystem data as well as the underlying block
 * device.
 *
 * The caller is expected to hold sb->s_umount: this function only warns
 * (WARN_ON) when the lock is not held, it does not take it itself.
 *
 * Returns 0 on success or when the filesystem is read-only; otherwise the
 * first non-zero value returned by ->sync_fs() or __sync_blockdev(), in
 * which case the remaining steps are skipped.
 */
int sync_filesystem(struct super_block *sb)
{
        int ret = 0;

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /*
         * No point in syncing out anything if the filesystem is read-only.
         */
        if (sb_rdonly(sb))
                return 0;

        /*
         * Do the filesystem syncing work.  For simple filesystems
         * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
         * to submit I/O for these buffers via __sync_blockdev().  This also
         * speeds up the second pass below since in that case write_inode()
         * methods call sync_dirty_buffer() and thus effectively write one block
         * at a time.
         */
        writeback_inodes_sb(sb, WB_REASON_SYNC);
        if (sb->s_op->sync_fs) {
                ret = sb->s_op->sync_fs(sb, 0);
                if (ret)
                        return ret;
        }
        ret = __sync_blockdev(sb->s_bdev, 0);
        if (ret)
                return ret;

        /* Second pass: same sequence, this time with the last argument set to 1. */
        sync_inodes_sb(sb);
        if (sb->s_op->sync_fs) {
                ret = sb->s_op->sync_fs(sb, 1);
                if (ret)
                        return ret;
        }
        return __sync_blockdev(sb->s_bdev, 1);
}
EXPORT_SYMBOL(sync_filesystem);

/*
 * iterate_supers() callback: sync the inodes of @sb unless the superblock
 * is read-only.  @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
        if (sb_rdonly(sb))
                return;

        sync_inodes_sb(sb);
}

/*
 * iterate_supers() callback: invoke the ->sync_fs() method of @sb, if any.
 *
 * @arg points to an int that is forwarded as the second ->sync_fs()
 * argument.  Read-only superblocks and those flagged SB_I_SKIP_SYNC are
 * skipped.  The method's return value is ignored.
 */
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
        if (sb_rdonly(sb))
                return;
        if (sb->s_iflags & SB_I_SKIP_SYNC)
                return;
        if (!sb->s_op->sync_fs)
                return;

        sb->s_op->sync_fs(sb, *(int *)arg);
}

/*
 * iterate_bdevs() callback: call filemap_fdatawrite() on the mapping of
 * the block device's inode.  @arg is not used and the result is ignored.
 */
static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
{
        struct address_space *mapping = bdev->bd_inode->i_mapping;

        filemap_fdatawrite(mapping);
}

/*
 * iterate_bdevs() callback: wait on the mapping of the block device's
 * inode.  @arg is not used.
 */
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
        struct address_space *mapping = bdev->bd_inode->i_mapping;

        /*
         * The error status of each individual mapping is kept, so that
         * applications can still catch the writeback error using fsync(2).
         * See filemap_fdatawait_keep_errors() for details.
         */
        filemap_fdatawait_keep_errors(mapping);
}

/*
 * Sync everything. We start by waking flusher threads so that most of
 * writeback runs on all devices in parallel. Then we sync all inodes reliably
 * which effectively also waits for all flusher threads to finish doing
 * writeback. At this point all data is on disk so metadata should be stable
 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 * just write metadata (such as inodes or bitmaps) to block device page cache
 * and do not sync it on their own in ->sync_fs().
 */
void ksys_sync(void)
{
        int no_wait = 0;
        int do_wait = 1;

        /* Kick the flusher threads so writeback gets going on all devices. */
        wakeup_flusher_threads(WB_REASON_SYNC);

        /* Sync the inodes of every writable superblock. */
        iterate_supers(sync_inodes_one_sb, NULL);

        /* Two ->sync_fs() passes: first non-waiting, then waiting. */
        iterate_supers(sync_fs_one_sb, &no_wait);
        iterate_supers(sync_fs_one_sb, &do_wait);

        /* Write out every block device's page cache, then wait for it. */
        iterate_bdevs(fdatawrite_one_bdev, NULL);
        iterate_bdevs(fdatawait_one_bdev, NULL);

        if (unlikely(laptop_mode))
                laptop_sync_completion();
}

/* sync(2) entry point: runs ksys_sync() and always reports success (0). */
SYSCALL_DEFINE0(sync)
{
        ksys_sync();
        return 0;
}

/*
 * Work item handler queued by emergency_sync().  Runs the non-waiting sync
 * sequence (inodes, ->sync_fs(), block device writeout) twice, logs
 * completion and frees @work, which was allocated by emergency_sync().
 */
static void do_sync_work(struct work_struct *work)
{
        int nowait = 0;
        int pass;

        /*
         * Sync twice to reduce the possibility we skipped some inodes / pages
         * because they were temporarily locked
         */
        for (pass = 0; pass < 2; pass++) {
                iterate_supers(sync_inodes_one_sb, &nowait);
                iterate_supers(sync_fs_one_sb, &nowait);
                iterate_bdevs(fdatawrite_one_bdev, NULL);
        }

        printk("Emergency Sync complete\n");
        kfree(work);
}

/*
 * Queue an asynchronous sync via do_sync_work().  The work item is
 * allocated with GFP_ATOMIC; if that allocation fails nothing is queued and
 * the request is silently dropped.  The handler frees the work item.
 */
void emergency_sync(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        INIT_WORK(work, do_sync_work);
        schedule_work(work);
}

/*
 * syncfs(2): sync the single superblock that the file behind @fd lives on.
 *
 * Returns -EBADF if @fd does not refer to an open file.  Otherwise the
 * filesystem is synced under a read hold of s_umount; a non-zero result
 * from sync_filesystem() takes precedence, else the result of checking and
 * advancing the superblock's writeback error sequence for this file is
 * returned.
 */
SYSCALL_DEFINE1(syncfs, int, fd)
{
        struct fd f = fdget(fd);
        struct super_block *sb;
        int sync_err;
        int wb_err;

        if (!f.file)
                return -EBADF;

        sb = f.file->f_path.dentry->d_sb;

        down_read(&sb->s_umount);
        sync_err = sync_filesystem(sb);
        up_read(&sb->s_umount);

        /* Always sample the error sequence, even if the sync itself failed. */
        wb_err = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);

        fdput(f);

        if (sync_err)
                return sync_err;
        return wb_err;
}

/**
 * vfs_fsync_range - helper to sync a range of data & metadata to disk
 * @file:                file to sync
 * @start:                offset in bytes of the beginning of data range to sync
 * @end:                offset in bytes of the end of data range (inclusive)
 * @datasync:                perform only datasync
 *
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
 *
 * Return: -EINVAL if the file has no ->fsync() method, otherwise whatever
 * that method returns.
 */
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file->f_mapping->host;

        if (!file->f_op->fsync)
                return -EINVAL;

        /*
         * For a full (non-datasync) sync, an inode that has I_DIRTY_TIME
         * set is marked dirty first so that ->fsync() picks it up.
         */
        if (!datasync) {
                if (inode->i_state & I_DIRTY_TIME)
                        mark_inode_dirty_sync(inode);
        }

        return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);

/**
 * vfs_fsync - perform a fsync or fdatasync on a file
 * @file:                file to sync
 * @datasync:                only perform a fdatasync operation
 *
 * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
 *
 * Return: the result of vfs_fsync_range() called over the byte range
 * 0..LLONG_MAX.
 */
int vfs_fsync(struct file *file, int datasync)
{
        return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);

/*
 * Common body of fsync(2) and fdatasync(2): look up @fd, sync the file via
 * vfs_fsync() and drop the reference.  Returns -EBADF when @fd does not
 * refer to an open file, otherwise the result of vfs_fsync().
 */
static int do_fsync(unsigned int fd, int datasync)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = vfs_fsync(f.file, datasync);
        fdput(f);
        return ret;
}

/* fsync(2): full sync of the file behind @fd (datasync == 0). */
SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
        return do_fsync(fd, 0);
}

/* fdatasync(2): data-only sync of the file behind @fd (datasync == 1). */
SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
        return do_fsync(fd, 1);
}

/*
 * sync_file_range - start and/or wait for writeback on a byte range of @file
 *
 * The range is @offset .. @offset + @nbytes - 1; @nbytes == 0 means "up to
 * LLONG_MAX".  Depending on @flags the function, in this order, waits for
 * writeback (SYNC_FILE_RANGE_WAIT_BEFORE), starts writeback
 * (SYNC_FILE_RANGE_WRITE) and waits again (SYNC_FILE_RANGE_WAIT_AFTER).
 *
 * Returns -EINVAL for flags outside VALID_FLAGS or for a negative or
 * wrapping range, -ESPIPE if @file is not a regular file, block device,
 * directory or symlink, 0 if the range starts beyond what a 32-bit pgoff_t
 * can address, and otherwise the result of the last step that ran (0 if no
 * step was requested).  A negative result from a step ends processing.
 */
int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                    unsigned int flags)
{
        struct address_space *mapping;
        loff_t endbyte;                        /* inclusive */
        umode_t i_mode;
        int ret;

        if (flags & ~VALID_FLAGS)
                return -EINVAL;

        endbyte = offset + nbytes;

        /* Reject negative offsets/ends and ranges that wrap around. */
        if ((s64)offset < 0 || (s64)endbyte < 0 || endbyte < offset)
                return -EINVAL;

        if (sizeof(pgoff_t) == 4) {
                /*
                 * The range starts outside a 32 bit machine's
                 * pagecache addressing capabilities.  Let it "succeed"
                 */
                if (offset >= (0x100000000ULL << PAGE_SHIFT))
                        return 0;

                /* The end is not addressable either: go out to EOF. */
                if (endbyte >= (0x100000000ULL << PAGE_SHIFT))
                        nbytes = 0;
        }

        if (nbytes == 0)
                endbyte = LLONG_MAX;
        else
                endbyte--;                /* inclusive */

        i_mode = file_inode(file)->i_mode;
        if (!(S_ISREG(i_mode) || S_ISBLK(i_mode) || S_ISDIR(i_mode) ||
              S_ISLNK(i_mode)))
                return -ESPIPE;

        mapping = file->f_mapping;
        ret = 0;

        if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
                ret = file_fdatawait_range(file, offset, endbyte);
                if (ret < 0)
                        return ret;
        }

        if (flags & SYNC_FILE_RANGE_WRITE) {
                /* All three flags together request WB_SYNC_ALL writeout. */
                int sync_mode = ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
                                 SYNC_FILE_RANGE_WRITE_AND_WAIT) ?
                                        WB_SYNC_ALL : WB_SYNC_NONE;

                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
                                                 sync_mode);
                if (ret < 0)
                        return ret;
        }

        if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
                ret = file_fdatawait_range(file, offset, endbyte);

        return ret;
}

/*
 * ksys_sync_file_range() permits finely controlled syncing over a segment of
 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 * zero then ksys_sync_file_range() will operate from offset out to EOF.
 *
 * The flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 * before performing the write.
 *
 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 * range which are not presently under writeback. Note that this may block for
 * significant periods due to exhaustion of disk request structures.
 *
 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 * after performing the write.
 *
 * Useful combinations of the flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 * in the range which were dirty on entry to ksys_sync_file_range() are placed
 * under writeout.  This is a start-write-for-data-integrity operation.
 *
 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 * are not presently under writeout.  This is an asynchronous flush-to-disk
 * operation.  Not suitable for data integrity operations.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 * completion of writeout of all pages in the range.  This will be used after an
 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 * for that operation to complete and to return the result.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
 * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
 * a traditional sync() operation.  This is a write-for-data-integrity operation
 * which will ensure that all pages in the range which were dirty on entry to
 * ksys_sync_file_range() are written to disk.  It should be noted that disk
 * caches are not flushed by this call, so there are no guarantees here that the
 * data will be available on disk after a crash.
 *
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 * I/O errors or ENOSPC conditions and will return those to the caller, after
 * clearing the EIO and ENOSPC flags in the address_space.
 *
 * It should be noted that none of these operations write out the file's
 * metadata.  So unless the application is strictly performing overwrites of
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
                         unsigned int flags)
{
        struct fd f = fdget(fd);
        int ret = -EBADF;        /* reported when fd has no open file */

        if (f.file)
                ret = sync_file_range(f.file, offset, nbytes, flags);

        fdput(f);
        return ret;
}

/* sync_file_range(2) entry point: forwards straight to ksys_sync_file_range(). */
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
                                unsigned int, flags)
{
        return ksys_sync_file_range(fd, offset, nbytes, flags);
}

/* It would be nice if people remember that not all the world's an i386
   when they introduce new system calls */
/*
 * sync_file_range2(2): same operation as sync_file_range(2), but with
 * @flags placed before the two loff_t arguments.  The arguments are
 * reordered when forwarding to ksys_sync_file_range().
 */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
                                 loff_t, offset, loff_t, nbytes)
{
        return ksys_sync_file_range(fd, offset, nbytes, flags);
}









































































































































































































































































    1 





















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Node's in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * void nodes_shift_right(dst, src, n)        Shift right
 * void nodes_shift_left(dst, src, n)        Shift left
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is some node online?
 * int node_possible(node)                Is some node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/numa.h>

/* A nodemask is a bitmap of MAX_NUMNODES bits wrapped in a struct. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
/* Referenced by nodemask_of_node() to name the type through typeof(). */
extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed; may be NULL
 *
 * Expands to two comma-separated arguments (number of bits, pointer to the
 * bitmap words) that can be passed for '%*pb[l]' when printing a nodemask.
 * A NULL @maskp yields 0 and NULL respectively.  Note that @maskp is
 * evaluated twice.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;

        return MAX_NUMNODES;
}
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;

        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mis-match error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set(node, dst): turn on bit 'node' in nodemask 'dst' via set_bit(). */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
{
        set_bit(node, dstp->bits);
}

/* node_clear(node, dst): turn off bit 'node' in nodemask 'dst' via clear_bit(). */
#define node_clear(node, dst) __node_clear((node), &(dst))
static inline void __node_clear(int node, volatile nodemask_t *dstp)
{
        clear_bit(node, dstp->bits);
}

/* nodes_setall(dst): set the first MAX_NUMNODES bits of 'dst'. */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_fill(dstp->bits, nbits);
}

/* nodes_clear(dst): clear the first MAX_NUMNODES bits of 'dst'. */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_zero(dstp->bits, nbits);
}

/* No static inline type checking - see Subtlety (1) above. */
/* node_isset(node, nodemask): test bit 'node' in 'nodemask' via test_bit(). */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set(node, nodemask): set bit 'node' in 'nodemask' and
 * return the result of test_and_set_bit().
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static inline bool __node_test_and_set(int node, nodemask_t *addr)
{
        return test_and_set_bit(node, addr->bits);
}

/*
 * Logical operations on whole nodemasks.  Each macro takes nodemask_t
 * lvalues, passes their addresses together with MAX_NUMNODES to a
 * type-checked inline helper, which forwards to the matching bitmap_*()
 * routine.
 */

/* dst = src1 & src2 */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* dst = src1 | src2 */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* dst = src1 ^ src2 */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* dst = src1 & ~src2 */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* dst = ~src */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static inline void __nodes_complement(nodemask_t *dstp,
                                        const nodemask_t *srcp, unsigned int nbits)
{
        bitmap_complement(dstp->bits, srcp->bits, nbits);
}

/*
 * Predicates and counting on nodemasks.  Each macro forwards the address of
 * its nodemask_t argument(s) and MAX_NUMNODES to an inline helper that
 * returns the result of the corresponding bitmap_*() routine.
 */

/* Does src1 == src2? */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_equal(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_equal(src1p->bits, src2p->bits, nbits);
}

/* Do src1 and src2 intersect? */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_intersects(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_intersects(src1p->bits, src2p->bits, nbits);
}

/* Is src1 a subset of src2? */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_subset(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_subset(src1p->bits, src2p->bits, nbits);
}

/* Is src empty (no bits set)? */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_empty(srcp->bits, nbits);
}

/* Is nodemask full (all bits set)? */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_full(srcp->bits, nbits);
}

/* Number of set bits in nodemask. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_weight(srcp->bits, nbits);
}

/* nodes_shift_right(dst, src, n): dst = src shifted right by n bits. */
#define nodes_shift_right(dst, src, n) \
                        __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_right(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
}

/* nodes_shift_left(dst, src, n): dst = src shifted left by n bits. */
#define nodes_shift_left(dst, src, n) \
                        __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_left(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

/*
 * first_node(src): number of the lowest set bit in 'src'.  The result of
 * find_first_bit() is clamped with min_t() so it never exceeds
 * MAX_NUMNODES.
 */
#define first_node(src) __first_node(&(src))
static inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

/*
 * next_node(n, src): number of the next set bit in 'src' at a position
 * greater than 'n' (the search starts at n + 1), clamped to at most
 * MAX_NUMNODES.
 */
#define next_node(n, src) __next_node((n), &(src))
static inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * Find the next present node in src, starting after node n, wrapping around to
 * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
unsigned int __next_node_in(int node, const nodemask_t *srcp);

/* Make *mask contain exactly one set bit: bit 'node'. */
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        nodes_clear(*mask);
        node_set(node, *mask);
}

/*
 * nodemask_of_node(node): statement expression yielding a nodemask_t value
 * with only bit 'node' set.  When the mask is exactly one unsigned long
 * wide the single word is assigned directly; otherwise
 * init_nodemask_of_node() is used.
 */
#define nodemask_of_node(node)                                                \
({                                                                        \
        typeof(_unused_nodemask_arg_) m;                                \
        if (sizeof(m) == sizeof(unsigned long)) {                        \
                m.bits[0] = 1UL << (node);                                \
        } else {                                                        \
                init_nodemask_of_node(&m, (node));                        \
        }                                                                \
        m;                                                                \
})

/*
 * first_unset_node(mask): number of the lowest clear bit in 'mask', as
 * found by find_first_zero_bit() and clamped to at most MAX_NUMNODES.
 */
#define first_unset_node(mask) __first_unset_node(&(mask))
static inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Mask of the valid bits in the last word of a MAX_NUMNODES-bit bitmap. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES <= BITS_PER_LONG

/* Initializer - all bits set.  Single-word case: only the last word exists. */
#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#else

/*
 * Initializer - all bits set.  Multi-word case: every word but the last is
 * all ones, the last word carries only the valid bits.
 */
#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#endif

/* Initializer - no bits set. */
#define NODE_MASK_NONE                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                        \
} })

/* Array of unsigned longs backing the mask. */
#define nodes_addr(src) ((src).bits)

/*
 * Parsing and remapping helpers.  Each macro forwards to an inline helper
 * that calls the bitmap routine of the matching name with MAX_NUMNODES as
 * the bit count.
 */

/* Parse a user-space ascii string into 'dst'; returns bitmap_parse_user()'s result. */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static inline int __nodemask_parse_user(const char __user *buf, int len,
                                        nodemask_t *dstp, int nbits)
{
        return bitmap_parse_user(buf, len, dstp->bits, nbits);
}

/* Parse an ascii node list into 'dst'; returns bitmap_parselist()'s result. */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
{
        return bitmap_parselist(buf, dstp->bits, nbits);
}

/* newbit = map(old, new)(oldbit); see bitmap_bitremap(). */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static inline int __node_remap(int oldbit,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
}

/* dst = map(old, new)(src); see bitmap_remap(). */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
}

/* dst = orig relative to relmap; see bitmap_onto(). */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
                const nodemask_t *relmapp, int nbits)
{
        bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
}

/* dst bits = orig bits mod sz; see bitmap_fold(). */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES)
static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
                int sz, int nbits)
{
        bitmap_fold(dstp->bits, origp->bits, sz, nbits);
}

/*
 * for_each_node_mask(node, mask): for-loop header that assigns to 'node'
 * the number of every bit set in 'mask', in ascending order.  'node' must
 * be an lvalue; it is evaluated several times.  Every use of the 'node'
 * argument is parenthesized so the expansion does not depend on the
 * operator precedence of the caller's expression.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             ((node) >= 0) && (node) < MAX_NUMNODES;                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
/* Only node 0 can exist: visit it once if the mask is not empty. */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator indexes one nodemask in the node_states[] array.  Without
 * CONFIG_HIGHMEM, N_HIGH_MEMORY is an alias of N_NORMAL_MEMORY and so shares
 * its mask.  NR_NODE_STATES is the number of distinct masks.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 */

extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
/* Test whether bit @node is set in the mask for @state. */
static inline int node_state(int node, enum node_states state)
{
        return node_isset(node, node_states[state]);
}

/* Set bit @node in the mask for @state. */
static inline void node_set_state(int node, enum node_states state)
{
        __node_set(node, &node_states[state]);
}

/* Clear bit @node in the mask for @state. */
static inline void node_clear_state(int node, enum node_states state)
{
        __node_clear(node, &node_states[state]);
}

/* Number of nodes whose bit is set in the mask for @state. */
static inline int num_node_state(enum node_states state)
{
        return nodes_weight(node_states[state]);
}

/* for-loop over every node whose bit is set in node_states[__state]. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

/* Lowest-numbered node in the N_ONLINE / N_MEMORY mask (see first_node()). */
#define first_online_node        first_node(node_states[N_ONLINE])
#define first_memory_node        first_node(node_states[N_MEMORY])
/* Next node after @nid in the N_ONLINE mask (see next_node()). */
static inline unsigned int next_online_node(int nid)
{
        return next_node(nid, node_states[N_ONLINE]);
}
/* Next node after @nid in the N_MEMORY mask (see next_node()). */
static inline unsigned int next_memory_node(int nid)
{
        return next_node(nid, node_states[N_MEMORY]);
}

extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

/* Mark @nid online and recompute nr_online_nodes from the N_ONLINE mask. */
static inline void node_set_online(int nid)
{
        node_set_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

/* Mark @nid offline and recompute nr_online_nodes from the N_ONLINE mask. */
static inline void node_set_offline(int nid)
{
        node_clear_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

#else

/*
 * Single-node stubs (this is the !(MAX_NUMNODES > 1) branch): node 0 is
 * reported as being in every state, state changes are no-ops and every
 * state contains exactly one node.
 */
static inline int node_state(int node, enum node_states state)
{
        return node == 0;
}

static inline void node_set_state(int node, enum node_states state)
{
}

static inline void node_clear_state(int node, enum node_states state)
{
}

static inline int num_node_state(enum node_states state)
{
        return 1;
}

/*
 * Single-node variants: the loop body runs exactly once with node == 0,
 * regardless of __state.
 */
#define for_each_node_state(node, __state) \
        for ( (node) = 0; (node) == 0; (node) = 1)

/* Node 0 is the first online/memory node; there is never a next one. */
#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
/*
 * Counterpart of the next_memory_node() helper provided when
 * MAX_NUMNODES > 1, so code using it also builds in the single-node
 * configuration.
 */
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

/* Forward to the (no-op) single-node state helpers. */
#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * node_random(mask): random node with set bit in mask.  Implemented out of
 * line for CONFIG_NUMA with more than one node; otherwise this stub always
 * returns 0 and does not look at the mask.
 */
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
extern int node_random(const nodemask_t *maskp);
#else
static inline int node_random(const nodemask_t *mask)
{
        return 0;
}
#endif

/* Names for the N_ONLINE and N_POSSIBLE entries of node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Counting and membership shorthands built on num_node_state()/node_state(). */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* for-loops over all possible / all online nodes. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer 'name' to an
 * object of the specified type.  When NODES_SHIFT > 8 the object is
 * obtained from kmalloc() with 'gfp_flags', so 'name' may be NULL and must
 * be released with NODEMASK_FREE().  Otherwise the object is a local
 * variable ('_name'), 'gfp_flags' is ignored and NODEMASK_FREE() does
 * nothing.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/*
 * An example structure for use with NODEMASK_ALLOC (used in mempolicy):
 * two nodemask_t members bundled into a single object.
 */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x) is NODEMASK_ALLOC() specialised for
 * struct nodemask_scratch with GFP_KERNEL | __GFP_NORETRY;
 * NODEMASK_SCRATCH_FREE(x) is the matching NODEMASK_FREE().
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  linux/fs/pnode.h
 *
 * (C) Copyright IBM Corporation 2005.
 */
#ifndef _LINUX_PNODE_H
#define _LINUX_PNODE_H

#include <linux/list.h>
#include "mount.h"

/*
 * Helpers operating on a struct mount pointer @m.
 *
 * The flag tests yield the masked bits of (m)->mnt.mnt_flags (non-zero when
 * the flag is set), not a normalised 0/1.  IS_MNT_SLAVE() yields the
 * mnt_master pointer itself.  IS_MNT_NEW() is true when @m has no mnt_ns or
 * when is_anon_ns() accepts it; note that it evaluates @m twice.
 * The SET_/CLEAR_ macros modify (m)->mnt.mnt_flags in place.
 */
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
#define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)

/*
 * CL_* flag bits; each value occupies its own bit so they can be OR-ed
 * together.  NOTE(review): presumably these are the flags accepted by the
 * int argument of copy_tree() -- confirm against the callers.
 */
#define CL_EXPIRE                    0x01
#define CL_SLAVE                     0x02
#define CL_COPY_UNBINDABLE        0x04
#define CL_MAKE_SHARED                 0x08
#define CL_PRIVATE                 0x10
#define CL_SHARED_TO_SLAVE        0x20
#define CL_COPY_MNT_NS_FILE        0x40

/* Both CL_COPY_* bits combined. */
#define CL_COPY_ALL                (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)

/*
 * set_mnt_shared - mark @mnt as shared.
 *
 * Clears every bit covered by MNT_SHARED_MASK in mnt->mnt.mnt_flags and
 * then sets MNT_SHARED; all other flag bits are left as they were.
 */
static inline void set_mnt_shared(struct mount *mnt)
{
        mnt->mnt.mnt_flags = (mnt->mnt.mnt_flags & ~MNT_SHARED_MASK) |
                             MNT_SHARED;
}

/*
 * Prototypes only: none of these functions is defined in this header.
 */
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
                struct hlist_head *);
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
int mnt_get_count(struct mount *mnt);
void mnt_set_mountpoint(struct mount *, struct mountpoint *,
                        struct mount *);
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
                           struct mount *mnt);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
                         const struct path *root);
int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
#endif /* _LINUX_PNODE_H */












































































































































































































































































































































































    3 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_KPROBES_H
#define _LINUX_KPROBES_H
/*
 *  Kernel Probes (KProbes)
 *  include/linux/kprobes.h
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation ( includes suggestions from
 *                Rusty Russell).
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2005-May        Hien Nguyen <hien@us.ibm.com> and Jim Keniston
 *                <jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/compiler.h>
#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
#include <asm/kprobes.h>

#ifdef CONFIG_KPROBES

/* kprobe_status settings; each value is a distinct single bit */
#define KPROBE_HIT_ACTIVE        0x00000001
#define KPROBE_HIT_SS                0x00000002
#define KPROBE_REENTER                0x00000004
#define KPROBE_HIT_SSDONE        0x00000008

#else /* CONFIG_KPROBES */
/*
 * Without CONFIG_KPROBES, supply placeholder definitions of
 * kprobe_opcode_t and struct arch_specific_insn so that struct kprobe
 * below, which has members of both types, still compiles.
 */
#include <asm-generic/kprobes.h>
typedef int kprobe_opcode_t;
struct arch_specific_insn {
        int dummy;
};
#endif /* CONFIG_KPROBES */

/* Forward declarations so the handler typedefs can take pointers to them. */
struct kprobe;
struct pt_regs;
struct kretprobe;
struct kretprobe_instance;

/*
 * Handler signatures.  The pre, post and fault handlers receive the kprobe
 * and a struct pt_regs pointer (post additionally gets @flags, fault
 * additionally gets @trapnr); kretprobe handlers receive the
 * kretprobe_instance and a struct pt_regs pointer.
 */
typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
                                       unsigned long flags);
typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
                                       int trapnr);
typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
                                    struct pt_regs *);

/*
 * struct kprobe - description of a single probe.
 *
 * The probe location is given by @addr, or by @symbol_name plus @offset;
 * the three handler pointers are the callbacks attached to it.
 */
struct kprobe {
        /* hash-list linkage */
        struct hlist_node hlist;

        /* list of kprobes for multi-handler support */
        struct list_head list;

        /* count of the number of times this probe was temporarily disarmed */
        unsigned long nmissed;

        /* location of the probe point */
        kprobe_opcode_t *addr;

        /* Allow user to indicate symbol name of the probe point */
        const char *symbol_name;

        /* Offset into the symbol */
        unsigned int offset;

        /* Called before addr is executed. */
        kprobe_pre_handler_t pre_handler;

        /* Called after addr is executed, unless... */
        kprobe_post_handler_t post_handler;

        /*
         * ... called if executing addr causes a fault (eg. page fault).
         * Return 1 if it handled fault, otherwise kernel will see it.
         */
        kprobe_fault_handler_t fault_handler;

        /* Saved opcode (which has been replaced with breakpoint) */
        kprobe_opcode_t opcode;

        /* copy of the original instruction */
        struct arch_specific_insn ainsn;

        /*
         * Indicates various status flags (KPROBE_FLAG_* bits).
         * Protected by kprobe_mutex after this kprobe is registered.
         */
        u32 flags;
};

/* Kprobe status flags: bit values stored in struct kprobe::flags */
#define KPROBE_FLAG_GONE        1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED        2 /* probe is temporarily disabled */
#define KPROBE_FLAG_OPTIMIZED        4 /*
                                   * probe is really optimized.
                                   * NOTE:
                                   * this flag is only for optimized_kprobe.
                                   */
#define KPROBE_FLAG_FTRACE        8 /* probe is using ftrace */

/* Has this kprobe gone ? */
static inline int kprobe_gone(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_GONE;
}

/* Is this kprobe disabled ? */
static inline int kprobe_disabled(struct kprobe *p)
{
        return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
}

/* Is this kprobe really running optimized path ? */
static inline int kprobe_optimized(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_OPTIMIZED;
}

/* Is this kprobe uses ftrace ? */
static inline int kprobe_ftrace(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_FTRACE;
}

/*
 * Function-return probe.
 *
 * The user needs to provide a handler function and initialize maxactive.
 *
 * maxactive - the maximum number of instances of the probed function that
 *             can be active concurrently.
 * nmissed   - the number of times the probed function's return was
 *             ignored because maxactive was too low.
 *
 * The embedded struct kprobe is the first member; disable_kretprobe() and
 * enable_kretprobe() operate on it via &rp->kp.
 */
struct kretprobe {
        struct kprobe kp;
        kretprobe_handler_t handler;
        kretprobe_handler_t entry_handler;
        int maxactive;
        int nmissed;
        size_t data_size;
        struct hlist_head free_instances;
        raw_spinlock_t lock;
};

/* NOTE(review): presumably the upper bound for kretprobe::data_size -- confirm */
#define KRETPROBE_MAX_DATA_SIZE        4096

/*
 * struct kretprobe_instance - per-invocation record handed to
 * kretprobe_handler_t callbacks.
 *
 * @hlist and @rcu share storage through the anonymous union.  @rp points
 * back to the owning kretprobe.  @data is a flexible array member, so the
 * structure is allocated with extra trailing space.
 */
struct kretprobe_instance {
        union {
                struct hlist_node hlist;
                struct rcu_head rcu;
        };
        struct kretprobe *rp;
        kprobe_opcode_t *ret_addr;
        struct task_struct *task;
        void *fp;
        char data[];
};

/* One entry of kretprobe_blacklist[]: a symbol name and an address. */
struct kretprobe_blackpoint {
        const char *name;
        void *addr;
};

/* A list-linked address range described by @start_addr and @end_addr. */
struct kprobe_blacklist_entry {
        struct list_head list;
        unsigned long start_addr;
        unsigned long end_addr;
};

#ifdef CONFIG_KPROBES
/*
 * Per-CPU state: current_kprobe is read by kprobe_running() and cleared by
 * reset_current_kprobe(); kprobe_ctlblk is returned by get_kprobe_ctlblk().
 */
DECLARE_PER_CPU(struct kprobe *, current_kprobe);
DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

/*
 * For #ifdef avoidance: callers can test kprobes_built_in() in ordinary C
 * code.  This CONFIG_KPROBES variant always returns 1.
 */
static inline int kprobes_built_in(void)
{
        return 1;
}

/*
 * Declared here, defined elsewhere.  kretprobe_trampoline_handler() calls
 * this pair around __kretprobe_trampoline_handler().
 */
extern void kprobe_busy_begin(void);
extern void kprobe_busy_end(void);

#ifdef CONFIG_KRETPROBES
/* Arch hooks for kretprobes; declared here, defined elsewhere. */
extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
                                   struct pt_regs *regs);
extern int arch_trampoline_kprobe(struct kprobe *p);

/*
 * If the trampoline handler is called from a kprobe, use this version;
 * kretprobe_trampoline_handler() is the wrapper that adds the
 * kprobe_busy_begin()/kprobe_busy_end() bracket.
 */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                void *trampoline_address,
                                void *frame_pointer);

/*
 * kretprobe_trampoline_handler - run __kretprobe_trampoline_handler()
 * between kprobe_busy_begin() and kprobe_busy_end().
 *
 * The busy section sets a dummy kprobe to avoid kretprobe recursion.
 * Because a kretprobe never runs inside a kprobe handler, no kprobe must
 * be running when this is entered.
 *
 * Returns whatever __kretprobe_trampoline_handler() returned for the same
 * three arguments.
 */
static nokprobe_inline
unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
                                void *trampoline_address,
                                void *frame_pointer)
{
        unsigned long result;

        kprobe_busy_begin();
        result = __kretprobe_trampoline_handler(regs, trampoline_address,
                                                frame_pointer);
        kprobe_busy_end();

        return result;
}

#else /* CONFIG_KRETPROBES */
/*
 * !CONFIG_KRETPROBES stubs: arch_prepare_kretprobe() does nothing and
 * arch_trampoline_kprobe() always returns 0.
 *
 * NOTE(review): the first parameter here is struct kretprobe *, whereas the
 * CONFIG_KRETPROBES declaration takes struct kretprobe_instance * --
 * confirm whether the mismatch is intentional.
 */
static inline void arch_prepare_kretprobe(struct kretprobe *rp,
                                        struct pt_regs *regs)
{
}
static inline int arch_trampoline_kprobe(struct kprobe *p)
{
        return 0;
}
#endif /* CONFIG_KRETPROBES */

/* Array of kretprobe_blackpoint entries; defined outside this header. */
extern struct kretprobe_blackpoint kretprobe_blacklist[];

/*
 * init_test_probes() is a real function only with
 * CONFIG_KPROBES_SANITY_TEST; otherwise it is an inline stub returning 0.
 */
#ifdef CONFIG_KPROBES_SANITY_TEST
extern int init_test_probes(void);
#else
static inline int init_test_probes(void)
{
        return 0;
}
#endif /* CONFIG_KPROBES_SANITY_TEST */

/* Arch-level kprobe hooks and helpers; declared here, defined elsewhere. */
extern int arch_prepare_kprobe(struct kprobe *p);
extern void arch_arm_kprobe(struct kprobe *p);
extern void arch_disarm_kprobe(struct kprobe *p);
extern int arch_init_kprobes(void);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern bool arch_kprobe_on_func_entry(unsigned long offset);
extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);

/* Blacklist query and the two ways of adding to it (by symbol, by range). */
extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
extern int kprobe_add_area_blacklist(unsigned long start, unsigned long end);

/*
 * struct kprobe_insn_cache - a cache of instruction slots.
 *
 * Passed to __get_insn_slot(), __free_insn_slot() and
 * __is_insn_slot_addr(); DEFINE_INSN_CACHE_OPS() declares one instance per
 * name as kprobe_<name>_slots.
 */
struct kprobe_insn_cache {
        struct mutex mutex;
        void *(*alloc)(void);        /* allocate insn page */
        void (*free)(void *);        /* free insn page */
        const char *sym;        /* symbol for insn pages */
        struct list_head pages; /* list of kprobe_insn_page */
        size_t insn_size;        /* size of instruction slot */
        int nr_garbage;
};

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
extern void __free_insn_slot(struct kprobe_insn_cache *c,
                             kprobe_opcode_t *slot, int dirty);
/* sleep-less address checking routine  */
extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
                                unsigned long addr);

/*
 * DEFINE_INSN_CACHE_OPS(name) declares the cache kprobe_<name>_slots and
 * emits three inline wrappers bound to it:
 *   get_<name>_slot()              -> __get_insn_slot()
 *   free_<name>_slot(slot, dirty)  -> __free_insn_slot()
 *   is_kprobe_<name>_slot(addr)    -> __is_insn_slot_addr()
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
extern struct kprobe_insn_cache kprobe_##__name##_slots;                \
                                                                        \
static inline kprobe_opcode_t *get_##__name##_slot(void)                \
{                                                                        \
        return __get_insn_slot(&kprobe_##__name##_slots);                \
}                                                                        \
                                                                        \
static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
{                                                                        \
        __free_insn_slot(&kprobe_##__name##_slots, slot, dirty);        \
}                                                                        \
                                                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);        \
}
#define KPROBE_INSN_PAGE_SYM                "kprobe_insn_page"
#define KPROBE_OPTINSN_PAGE_SYM                "kprobe_optinsn_page"
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym);
#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
/*
 * Without insn slots only is_kprobe_<name>_slot() is generated, and it
 * always returns 0 (false).
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return 0;                                                        \
}
#endif

/* Instantiate the helpers for the "insn" cache. */
DEFINE_INSN_CACHE_OPS(insn);

#ifdef CONFIG_OPTPROBES
/*
 * Internal structure for direct jump optimized probe.
 * The embedded struct kprobe is the first member.
 */
struct optimized_kprobe {
        struct kprobe kp;
        struct list_head list;        /* list for optimizing queue */
        struct arch_optimized_insn optinsn;
};

/* Architecture dependent functions for direct jump optimization */
extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
                                         struct kprobe *orig);
extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
extern void arch_optimize_kprobes(struct list_head *oplist);
extern void arch_unoptimize_kprobes(struct list_head *oplist,
                                    struct list_head *done_list);
extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
                                        unsigned long addr);

extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);

DEFINE_INSN_CACHE_OPS(optinsn);

/* The sysctl knob and its handler are only declared with CONFIG_SYSCTL. */
#ifdef CONFIG_SYSCTL
extern int sysctl_kprobes_optimization;
extern int proc_kprobes_optimization_handler(struct ctl_table *table,
                                             int write, void *buffer,
                                             size_t *length, loff_t *ppos);
#endif
extern void wait_for_kprobe_optimizer(void);
bool optprobe_queued_unopt(struct optimized_kprobe *op);
bool kprobe_disarmed(struct kprobe *p);
#else
/* !CONFIG_OPTPROBES: waiting for the optimizer is a no-op. */
static inline void wait_for_kprobe_optimizer(void) { }
#endif /* CONFIG_OPTPROBES */
/* ftrace-based kprobe support; only declared with CONFIG_KPROBES_ON_FTRACE */
#ifdef CONFIG_KPROBES_ON_FTRACE
extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *ops, struct pt_regs *regs);
extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
#endif

/* Declared regardless of CONFIG_KPROBES_ON_FTRACE. */
int arch_check_ftrace_location(struct kprobe *p);

/* Get the kprobe at this addr (if any) - called with preemption disabled */
struct kprobe *get_kprobe(void *addr);

/* kprobe_running() reads this CPU's current_kprobe slot and returns it */
static inline struct kprobe *kprobe_running(void)
{
        struct kprobe *cur = __this_cpu_read(current_kprobe);

        return cur;
}

/* Clear this CPU's current_kprobe slot by writing NULL to it. */
static inline void reset_current_kprobe(void)
{
        __this_cpu_write(current_kprobe, NULL);
}

/* Return a pointer to this CPU's instance of the per-CPU kprobe_ctlblk. */
static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
{
        return this_cpu_ptr(&kprobe_ctlblk);
}

/*
 * CONFIG_KPROBES API prototypes; the definitions live outside this header.
 * The !CONFIG_KPROBES branch below provides inline stubs for part of this
 * set.
 */
kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
int register_kprobes(struct kprobe **kps, int num);
void unregister_kprobes(struct kprobe **kps, int num);
unsigned long arch_deref_entry_point(void *);

int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
int register_kretprobes(struct kretprobe **rps, int num);
void unregister_kretprobes(struct kretprobe **rps, int num);

void kprobe_flush_task(struct task_struct *tk);

void kprobe_free_init_mem(void);

int disable_kprobe(struct kprobe *kp);
int enable_kprobe(struct kprobe *kp);

void dump_kprobe(struct kprobe *kp);

void *alloc_insn_page(void);
void free_insn_page(void *page);

int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym);

int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                            char *type, char *sym);
#else /* !CONFIG_KPROBES: */

/*
 * !CONFIG_KPROBES stubs.  All arguments are ignored.
 */

/* Always 0: kprobes are not built in. */
static inline int kprobes_built_in(void)
{
        return 0;
}
/* Always 0. */
static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
        return 0;
}
/* Always NULL. */
static inline struct kprobe *get_kprobe(void *addr)
{
        return NULL;
}
/* Always NULL. */
static inline struct kprobe *kprobe_running(void)
{
        return NULL;
}
/*
 * !CONFIG_KPROBES registration stubs: every register_*() variant returns
 * -ENOSYS and every unregister_*() variant is a no-op.  All arguments are
 * ignored.
 */
static inline int register_kprobe(struct kprobe *p)
{
        return -ENOSYS;
}
static inline int register_kprobes(struct kprobe **kps, int num)
{
        return -ENOSYS;
}
static inline void unregister_kprobe(struct kprobe *p)
{
}
static inline void unregister_kprobes(struct kprobe **kps, int num)
{
}
static inline int register_kretprobe(struct kretprobe *rp)
{
        return -ENOSYS;
}
static inline int register_kretprobes(struct kretprobe **rps, int num)
{
        return -ENOSYS;
}
static inline void unregister_kretprobe(struct kretprobe *rp)
{
}
static inline void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
/*
 * !CONFIG_KPROBES stubs: the flush/free hooks are no-ops, and
 * disable_kprobe()/enable_kprobe() return -ENOSYS.  All arguments are
 * ignored.
 */
static inline void kprobe_flush_task(struct task_struct *tk)
{
}
static inline void kprobe_free_init_mem(void)
{
}
static inline int disable_kprobe(struct kprobe *kp)
{
        return -ENOSYS;
}
static inline int enable_kprobe(struct kprobe *kp)
{
        return -ENOSYS;
}

/* !CONFIG_KPROBES stub: reports true for every @addr. */
static inline bool within_kprobe_blacklist(unsigned long addr)
{
        return true;
}
/* !CONFIG_KPROBES stub: always returns -ERANGE; no output is written. */
static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
                                     char *type, char *sym)
{
        return -ERANGE;
}
#endif /* CONFIG_KPROBES */
/*
 * disable_kretprobe - call disable_kprobe() on the kprobe embedded in @rp
 * and return its result.
 */
static inline int disable_kretprobe(struct kretprobe *rp)
{
        struct kprobe *embedded = &rp->kp;

        return disable_kprobe(embedded);
}
/*
 * enable_kretprobe - call enable_kprobe() on the kprobe embedded in @rp
 * and return its result.
 */
static inline int enable_kretprobe(struct kretprobe *rp)
{
        struct kprobe *embedded = &rp->kp;

        return enable_kprobe(embedded);
}

/*
 * Without CONFIG_KPROBES, DEFINE_INSN_CACHE_OPS(insn) is never expanded, so
 * provide a stub that reports false for every @addr.
 */
#ifndef CONFIG_KPROBES
static inline bool is_kprobe_insn_slot(unsigned long addr)
{
        return false;
}
#endif
/*
 * Without CONFIG_OPTPROBES, DEFINE_INSN_CACHE_OPS(optinsn) is never
 * expanded, so provide a stub that reports false for every @addr.
 */
#ifndef CONFIG_OPTPROBES
static inline bool is_kprobe_optinsn_slot(unsigned long addr)
{
        return false;
}
#endif

/*
 * kprobe_page_fault - give kprobes a chance to handle a fault.
 *
 * Returns true if kprobes handled the fault.  The fault is passed to
 * kprobe_fault_handler() only when all of the following hold, checked in
 * this order:
 *   - kprobes are built in,
 *   - @regs is not in user mode,
 *   - the context is not preemptible (required both for this to possibly
 *     be a kprobe fault and for kprobe_running() to be callable),
 *   - a kprobe is currently running on this CPU.
 * Otherwise false is returned without calling the handler.
 */
static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
                                              unsigned int trap)
{
        if (!kprobes_built_in() || user_mode(regs))
                return false;
        if (preemptible() || !kprobe_running())
                return false;
        return kprobe_fault_handler(regs, trap);
}

#endif /* _LINUX_KPROBES_H */



































































































    1 

































































































    1 
















    1 













    1 

































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCATTERLIST_H
#define _LINUX_SCATTERLIST_H

#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <asm/io.h>

/*
 * struct scatterlist - one scatter/gather entry.
 *
 * @page_link holds a struct page pointer with the SG_CHAIN/SG_END marker
 * bits folded into its two low bits; use sg_page(), sg_assign_page() and
 * the sg_is_*() macros rather than touching it directly.  @offset and
 * @length describe the data within the page.  @dma_address is read through
 * sg_dma_address(); @dma_length exists only with CONFIG_NEED_SG_DMA_LENGTH
 * and is read through sg_dma_len().
 */
struct scatterlist {
        unsigned long        page_link;
        unsigned int        offset;
        unsigned int        length;
        dma_addr_t        dma_address;
#ifdef CONFIG_NEED_SG_DMA_LENGTH
        unsigned int        dma_length;
#endif
};

/*
 * Since the above length field is an unsigned int, below we define the maximum
 * length in bytes that can be stored in one scatterlist entry: UINT_MAX
 * restricted to the bits set in PAGE_MASK.
 */
#define SCATTERLIST_MAX_SEGMENT (UINT_MAX & PAGE_MASK)

/*
 * These macros should be used after a dma_map_sg call has been done
 * to get bus addresses of each of the SG entries and their lengths.
 * You should only work with the number of sg entries dma_map_sg
 * returns, or alternatively stop on the first sg_dma_len(sg) which
 * is 0.
 *
 * sg_dma_len() reads the dedicated dma_length field when
 * CONFIG_NEED_SG_DMA_LENGTH is set and falls back to the ordinary length
 * field otherwise.
 */
#define sg_dma_address(sg)        ((sg)->dma_address)

#ifdef CONFIG_NEED_SG_DMA_LENGTH
#define sg_dma_len(sg)                ((sg)->dma_length)
#else
#define sg_dma_len(sg)                ((sg)->length)
#endif

/*
 * struct sg_table - a scatterlist together with its entry counts.
 * for_each_sgtable_sg() walks @orig_nents entries and
 * for_each_sgtable_dma_sg() walks @nents entries.
 */
struct sg_table {
        struct scatterlist *sgl;        /* the list */
        unsigned int nents;                /* number of mapped entries */
        unsigned int orig_nents;        /* original size of list */
};

/*
 * Notes on SG table design.
 *
 * We use the unsigned long page_link field in the scatterlist struct to place
 * the page pointer AND encode information about the sg table as well. The two
 * lower bits are reserved for this information.
 *
 * If bit 0 is set, then the page_link contains a pointer to the next sg
 * table list. Otherwise the next entry is at sg + 1.
 *
 * If bit 1 is set, then this sg entry is the last element in a list.
 *
 * See sg_next().
 *
 */

#define SG_CHAIN        0x01UL        /* bit 0: page_link is a chain pointer */
#define SG_END                0x02UL        /* bit 1: last entry of the list */

/*
 * We overload the LSB of the page pointer to indicate whether it's
 * a valid sg entry, or whether it points to the start of a new scatterlist.
 * Those low bits are there for everyone! (thanks mason :-)
 *
 * sg_is_chain() and sg_is_last() yield the masked marker bit (non-zero when
 * set); sg_chain_ptr() strips both marker bits from page_link and casts the
 * remainder to a scatterlist pointer.
 */
#define sg_is_chain(sg)                ((sg)->page_link & SG_CHAIN)
#define sg_is_last(sg)                ((sg)->page_link & SG_END)
#define sg_chain_ptr(sg)        \
        ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))

/**
 * sg_assign_page - Assign a given page to an SG entry
 * @sg:                    SG entry
 * @page:            The page
 *
 * Description:
 *   Stores @page in @sg while keeping whatever SG_CHAIN/SG_END marker bits
 *   the entry already carries. Also see sg_set_page(), the most commonly
 *   used variant.
 *
 **/
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
        const unsigned long marker_mask = SG_CHAIN | SG_END;
        unsigned long markers = sg->page_link & marker_mask;

        /*
         * The two low bits of the pointer are borrowed for the markers, so
         * the page pointer itself must have them clear (i.e. be aligned to
         * at least a 32-bit boundary).
         */
        BUG_ON((unsigned long) page & marker_mask);
#ifdef CONFIG_DEBUG_SG
        BUG_ON(sg_is_chain(sg));
#endif
        sg->page_link = (unsigned long) page | markers;
}

/**
 * sg_set_page - Set sg entry to point at given page
 * @sg:                 SG entry
 * @page:         The page
 * @len:         Length of data
 * @offset:         Offset into page
 *
 * Description:
 *   Always go through this helper to point an sg entry at a page; never
 *   write the page pointer directly, because the low bits of the stored
 *   pointer carry sg table information. Use sg_page() to read the page
 *   back out of an entry.
 *
 **/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
                               unsigned int len, unsigned int offset)
{
        sg_assign_page(sg, page);
        sg->length = len;
        sg->offset = offset;
}

/*
 * sg_page - return the page stored in @sg.
 *
 * Masks the SG_CHAIN and SG_END marker bits out of page_link and returns
 * the rest as a struct page pointer.  With CONFIG_DEBUG_SG, calling this
 * on a chain entry triggers BUG_ON().
 */
static inline struct page *sg_page(struct scatterlist *sg)
{
        unsigned long page_bits;

#ifdef CONFIG_DEBUG_SG
        BUG_ON(sg_is_chain(sg));
#endif
        page_bits = sg->page_link & ~(SG_CHAIN | SG_END);
        return (struct page *)page_bits;
}

/**
 * sg_set_buf - Set sg entry to point at given data
 * @sg:                 SG entry
 * @buf:         Data
 * @buflen:         Data length
 *
 * Description:
 *   Resolves @buf to its page with virt_to_page() and to its in-page offset
 *   with offset_in_page(), then stores both with sg_set_page(). With
 *   CONFIG_DEBUG_SG, a @buf that fails virt_addr_valid() triggers BUG_ON().
 *
 **/
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
                              unsigned int buflen)
{
        struct page *pg;
        unsigned int off;

#ifdef CONFIG_DEBUG_SG
        BUG_ON(!virt_addr_valid(buf));
#endif
        pg = virt_to_page(buf);
        off = offset_in_page(buf);
        sg_set_page(sg, pg, buflen, off);
}

/*
 * Loop over each sg element, following the pointer to a new list if necessary.
 * @__i counts from 0 up to (but not including) @nr, and @sg is advanced
 * with sg_next() after every iteration.
 */
#define for_each_sg(sglist, sg, nr, __i)        \
        for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))

/*
 * Loop over each sg element in the given sg_table object
 * (orig_nents entries).
 */
#define for_each_sgtable_sg(sgt, sg, i)                \
        for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i)

/*
 * Loop over each sg element in the given *DMA mapped* sg_table object
 * (nents entries).
 * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses
 * of each element.
 */
#define for_each_sgtable_dma_sg(sgt, sg, i)        \
        for_each_sg((sgt)->sgl, sg, (sgt)->nents, i)

/*
 * __sg_chain - turn @chain_sg into a link entry pointing at @sgl.
 *
 * page_link is replaced by the address of @sgl with SG_CHAIN set and
 * SG_END cleared, and offset/length are zeroed because a chain entry does
 * not use them.
 */
static inline void __sg_chain(struct scatterlist *chain_sg,
                              struct scatterlist *sgl)
{
        unsigned long link = (unsigned long) sgl;

        /* Mark as a link pointer and drop any termination marker. */
        link |= SG_CHAIN;
        link &= ~SG_END;

        chain_sg->offset = 0;
        chain_sg->length = 0;
        chain_sg->page_link = link;
}

/**
 * sg_chain - Chain two sglists together
 * @prv:        First scatterlist
 * @prv_nents:        Number of entries in prv
 * @sgl:        Second scatterlist
 *
 * Description:
 *   Links @prv and @sgl together, to form a longer scatterlist, by turning
 *   entry @prv_nents - 1 of @prv into a chain entry. The index is computed
 *   in unsigned arithmetic, so @prv_nents must be at least 1.
 *
 **/
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
                            struct scatterlist *sgl)
{
        unsigned int last = prv_nents - 1;

        __sg_chain(&prv[last], sgl);
}

/**
 * sg_mark_end - Mark the end of the scatterlist
 * @sg:                 SG entry
 *
 * Description:
 *   Marks the passed in sg entry as the termination point for the sg
 *   table. A call to sg_next() on this entry will return NULL.
 *
 **/
static inline void sg_mark_end(struct scatterlist *sg)
{
        /* Termination bit on, any chain bit off. */
        sg->page_link = (sg->page_link | SG_END) & ~SG_CHAIN;
}

/**
 * sg_unmark_end - Undo setting the end of the scatterlist
 * @sg:                 SG entry
 *
 * Description:
 *   Clears the SG_END termination marker on the given scatterlist entry;
 *   all other bits of page_link are left untouched.
 *
 **/
static inline void sg_unmark_end(struct scatterlist *sg)
{
        sg->page_link = sg->page_link & ~SG_END;
}

/**
 * sg_phys - Return physical address of an sg entry
 * @sg:             SG entry
 *
 * Description:
 *   Looks up the entry's page with sg_page(), converts it with
 *   page_to_phys() and adds the sg offset. It is the caller's
 *   responsibility to know that page_to_phys() is legal for the sg page.
 *
 **/
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
        struct page *pg = sg_page(sg);

        return page_to_phys(pg) + sg->offset;
}

/**
 * sg_virt - Return virtual address of an sg entry
 * @sg:      SG entry
 *
 * Description:
 *   Looks up the entry's page with sg_page(), takes its page_address()
 *   and adds the sg offset. It is the caller's responsibility to know that
 *   the sg page has a valid virtual mapping.
 *
 **/
static inline void *sg_virt(struct scatterlist *sg)
{
        struct page *pg = sg_page(sg);

        return page_address(pg) + sg->offset;
}

/**
 * sg_init_marker - Initialize markers in sg table
 * @sgl:           The SG table
 * @nents:           Number of entries in table
 *
 * Description:
 *   Marks the last entry of the table (index @nents - 1) as the end of the
 *   list by calling sg_mark_end() on it.
 *
 **/
static inline void sg_init_marker(struct scatterlist *sgl,
                                  unsigned int nents)
{
        struct scatterlist *last = sgl + (nents - 1);

        sg_mark_end(last);
}

/*
 * Scatterlist helpers that are only declared in this header; their
 * definitions live elsewhere.
 *
 * NOTE(review): apart from sg_next(), whose NULL return on an entry marked
 * by sg_mark_end() is documented above, the behaviour of these functions is
 * not visible from here - consult their definitions before relying on it.
 */
int sg_nents(struct scatterlist *sg);
int sg_nents_for_len(struct scatterlist *sg, u64 len);
struct scatterlist *sg_next(struct scatterlist *);
struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
void sg_init_table(struct scatterlist *, unsigned int);
void sg_init_one(struct scatterlist *, const void *, unsigned int);
int sg_split(struct scatterlist *in, const int in_mapped_nents,
             const off_t skip, const int nb_splits,
             const size_t *split_sizes,
             struct scatterlist **out, int *out_mapped_nents,
             gfp_t gfp_mask);

/*
 * Function types for the allocation callbacks: an sg_alloc_fn takes an
 * unsigned int and a gfp_t and returns a scatterlist pointer, an sg_free_fn
 * takes a scatterlist pointer and an unsigned int.  __sg_alloc_table() and
 * __sg_free_table() below accept pointers to such functions.
 */
typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
typedef void (sg_free_fn)(struct scatterlist *, unsigned int);

/*
 * sg_table allocation and release, declared here and defined elsewhere.
 * NOTE(review): most parameters of the first four prototypes are unnamed;
 * check the definitions for their meaning.
 */
void __sg_free_table(struct sg_table *, unsigned int, unsigned int,
                     sg_free_fn *);
void sg_free_table(struct sg_table *);
int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int,
                     struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *);
int sg_alloc_table(struct sg_table *, unsigned int, gfp_t);
struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
                struct page **pages, unsigned int n_pages, unsigned int offset,
                unsigned long size, unsigned int max_segment,
                struct scatterlist *prv, unsigned int left_pages,
                gfp_t gfp_mask);
int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
                              unsigned int n_pages, unsigned int offset,
                              unsigned long size, gfp_t gfp_mask);

/*
 * sgl_alloc*() / sgl_free*() are only declared when CONFIG_SGL_ALLOC is
 * set.  The allocators report an entry count through @nent_p.
 * NOTE(review): presumably the sgl_free*() variants must be paired with the
 * allocator that used the same order - confirm against the definitions.
 */
#ifdef CONFIG_SGL_ALLOC
struct scatterlist *sgl_alloc_order(unsigned long long length,
                                    unsigned int order, bool chainable,
                                    gfp_t gfp, unsigned int *nent_p);
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
                              unsigned int *nent_p);
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
void sgl_free_order(struct scatterlist *sgl, int order);
void sgl_free(struct scatterlist *sgl);
#endif /* CONFIG_SGL_ALLOC */

/*
 * Helpers that move data between a scatterlist (@sgl with @nents entries)
 * and a linear buffer (@buf of @buflen bytes); each returns a size_t.
 * NOTE(review): presumably the return value is the number of bytes handled
 * and @skip is a byte offset into the scatterlist - confirm against the
 * definitions, which are not part of this header.
 */
size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
                      size_t buflen, off_t skip, bool to_buffer);

size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                           const void *buf, size_t buflen);
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                         void *buf, size_t buflen);

/* Variants that additionally take a @skip offset. */
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                            const void *buf, size_t buflen, off_t skip);
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                          void *buf, size_t buflen, off_t skip);
/* Takes no linear buffer, only a length and a @skip offset. */
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
                       size_t buflen, off_t skip);

/*
 * Maximum number of entries that will be allocated in one piece: the number
 * of struct scatterlist entries that fit in PAGE_SIZE bytes.  If a list
 * larger than this is required then chaining will be utilized.
 */
#define SG_MAX_SINGLE_ALLOC                (PAGE_SIZE / sizeof(struct scatterlist))

/*
 * The maximum number of SG segments that we will put inside a
 * scatterlist (unless chaining is used). Should ideally fit inside a
 * single page, to avoid a higher order allocation.  We could define this
 * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order.  The
 * minimum value is 32.
 */
#define SG_CHUNK_SIZE        128

/*
 * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit
 * is totally arbitrary, a setting of 2048 will get you at least 8 MB I/Os
 * (presumably counting one 4 KiB page per segment).  When
 * CONFIG_ARCH_NO_SG_CHAIN is set the limit falls back to SG_CHUNK_SIZE.
 */
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define SG_MAX_SEGMENTS        SG_CHUNK_SIZE
#else
#define SG_MAX_SEGMENTS        2048
#endif

/*
 * Chained sg_table allocation and release, declared only when
 * CONFIG_SG_POOL is set.  Both calls take the size of the first chunk
 * (@nents_first_chunk); the allocator also takes the chunk itself
 * (@first_chunk).
 * NOTE(review): presumably the same @nents_first_chunk must be passed to
 * both calls - confirm against the definitions.
 */
#ifdef CONFIG_SG_POOL
void sg_free_table_chained(struct sg_table *table,
                           unsigned nents_first_chunk);
int sg_alloc_table_chained(struct sg_table *table, int nents,
                           struct scatterlist *first_chunk,
                           unsigned nents_first_chunk);
#endif

/*
 * sg page iterator
 *
 * Iterates over sg entries page-by-page.  On each successful iteration, you
 * can call sg_page_iter_page(@piter) to get the current page.
 * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to
 * the page's page offset within the sg. The iteration will stop either when a
 * maximum number of sg entries was reached or a terminating sg entry
 * (presumably the one flagged as the end of the list, see sg_mark_end())
 * was reached.
 */
struct sg_page_iter {
        struct scatterlist        *sg;                /* sg holding the page */
        unsigned int                sg_pgoffset;        /* page offset within the sg */

        /* these are internal states, keep away */
        unsigned int                __nents;        /* remaining sg entries */
        int                        __pg_advance;        /* nr pages to advance at the
                                                 * next step */
};

/*
 * sg page iterator for DMA addresses
 *
 * This is the same as sg_page_iter however you can call
 * sg_page_iter_dma_address(@dma_iter) to get the page's DMA
 * address. sg_page_iter_page() cannot be called on this iterator.
 *
 * The wrapper holds nothing but the embedded sg_page_iter; being a distinct
 * struct type, it cannot be passed where a struct sg_page_iter pointer is
 * expected without naming ->base explicitly.
 */
struct sg_dma_page_iter {
        struct sg_page_iter base;
};

/*
 * Out-of-line page iterator primitives used by the for_each_sg_page() and
 * for_each_sg_dma_page() macros below: __sg_page_iter_start() forms the
 * loop initialiser and the two *_next() functions form the loop condition,
 * so a false return ends the loop.
 */
bool __sg_page_iter_next(struct sg_page_iter *piter);
bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter);
void __sg_page_iter_start(struct sg_page_iter *piter,
                          struct scatterlist *sglist, unsigned int nents,
                          unsigned long pgoffset);
/**
 * sg_page_iter_page - get the current page held by the page iterator
 * @piter:        page iterator holding the page
 *
 * Returns the page found @piter->sg_pgoffset pages after the first page of
 * the sg entry the iterator currently points at.
 */
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
        struct page *first = sg_page(piter->sg);

        return nth_page(first, piter->sg_pgoffset);
}

/**
 * sg_page_iter_dma_address - get the dma address of the current page held by
 * the page iterator.
 * @dma_iter:        page iterator holding the page
 */
static inline dma_addr_t
sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
{
        return sg_dma_address(dma_iter->base.sg) +
               (dma_iter->base.sg_pgoffset << PAGE_SHIFT);
}

/**
 * for_each_sg_page - iterate over the pages of the given sg list
 * @sglist:        sglist to iterate over
 * @piter:        page iterator to hold current page, sg, sg_pgoffset
 * @nents:        maximum number of sg entries to iterate over
 * @pgoffset:        starting page offset (in pages)
 *
 * Callers may use sg_page_iter_page() to get each page pointer.
 * In each loop it operates on PAGE_SIZE unit.
 *
 * Expands to a for statement whose initialiser is __sg_page_iter_start()
 * and whose condition is __sg_page_iter_next(); there is no increment
 * clause, so all advancing happens inside __sg_page_iter_next().
 */
#define for_each_sg_page(sglist, piter, nents, pgoffset)                   \
        for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
             __sg_page_iter_next(piter);)

/**
 * for_each_sg_dma_page - iterate over the pages of the given sg list
 * @sglist:        sglist to iterate over
 * @dma_iter:        DMA page iterator to hold current page
 * @dma_nents:        maximum number of sg entries to iterate over, this is the value
 *              returned from dma_map_sg
 * @pgoffset:        starting page offset (in pages)
 *
 * Callers may use sg_page_iter_dma_address() to get each page's DMA address.
 * In each loop it operates on PAGE_SIZE unit.
 *
 * Expands to a for statement that starts the embedded base iterator with
 * __sg_page_iter_start() and uses __sg_page_iter_dma_next() as its
 * condition; there is no increment clause.
 */
#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset)            \
        for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents,        \
                                  pgoffset);                                   \
             __sg_page_iter_dma_next(dma_iter);)

/**
 * for_each_sgtable_page - iterate over all pages in the sg_table object
 * @sgt:        sg_table object to iterate over
 * @piter:        page iterator to hold current page
 * @pgoffset:        starting page offset (in pages)
 *
 * Iterates over all the memory pages in the buffer described by
 * a scatterlist stored in the given sg_table object.
 * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit.
 *
 * Wraps for_each_sg_page() with @sgt->sgl as the list and @sgt->orig_nents
 * as the entry limit.  @sgt is evaluated twice in the expansion.
 */
#define for_each_sgtable_page(sgt, piter, pgoffset)        \
        for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset)

/**
 * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object
 * @sgt:        sg_table object to iterate over
 * @dma_iter:        DMA page iterator to hold current page
 * @pgoffset:        starting page offset (in pages)
 *
 * Iterates over all the DMA mapped pages in the buffer described by
 * a scatterlist stored in the given sg_table object.
 * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE
 * unit.
 *
 * Wraps for_each_sg_dma_page() with @sgt->sgl as the list and @sgt->nents
 * (not orig_nents) as the entry limit.  @sgt is evaluated twice in the
 * expansion.
 */
#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset)        \
        for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset)


/*
 * Mapping sg iterator
 *
 * Iterates over sg entries mapping page-by-page.  On each successful
 * iteration, @miter->page points to the mapped page and
 * @miter->length bytes of data can be accessed at @miter->addr.  As
 * long as an iteration is enclosed between start and stop, the user
 * is free to choose control structure and when to stop.
 *
 * @miter->consumed is set to @miter->length on each iteration.  It
 * can be adjusted if the user can't consume all the bytes in one go.
 * Also, a stopped iteration can be resumed by calling next on it.
 * This is useful when iteration needs to release all resources and
 * continue later (e.g. at the next interrupt).
 */

/*
 * Mapping iterator flag bits; each is a distinct bit so they can be OR-ed
 * together.  Presumably these are what sg_miter_start() expects in its
 * flags argument - confirm against its definition.
 */
#define SG_MITER_ATOMIC                (1 << 0)         /* use kmap_atomic */
#define SG_MITER_TO_SG                (1 << 1)        /* flush back to phys on unmap */
#define SG_MITER_FROM_SG        (1 << 2)        /* nop */

/*
 * State of a mapping sg iterator.  The fields up to and including
 * @consumed form the caller-visible part of the state; the double-underscore
 * members are private bookkeeping.
 */
struct sg_mapping_iter {
        /*
         * the following fields can be accessed directly
         * NOTE(review): the original comment said "three fields" although
         * four (page, addr, length, consumed) precede piter.
         */
        struct page                *page;                /* currently mapped page */
        void                        *addr;                /* pointer to the mapped area */
        size_t                        length;                /* length of the mapped area */
        size_t                        consumed;        /* number of consumed bytes */
        struct sg_page_iter        piter;                /* page iterator */

        /* these are internal states, keep away */
        unsigned int                __offset;        /* offset within page */
        unsigned int                __remaining;        /* remaining bytes on page */
        unsigned int                __flags;
};

/*
 * Mapping iterator entry points, declared here and defined elsewhere.
 * sg_miter_skip() and sg_miter_next() return bool.
 * NOTE(review): presumably false means there is nothing left to iterate
 * over - confirm against the definitions.
 */
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
                    unsigned int nents, unsigned int flags);
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset);
bool sg_miter_next(struct sg_mapping_iter *miter);
void sg_miter_stop(struct sg_mapping_iter *miter);

#endif /* _LINUX_SCATTERLIST_H */


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* request_key authorisation token key type
 *
 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _KEYS_REQUEST_KEY_AUTH_TYPE_H
#define _KEYS_REQUEST_KEY_AUTH_TYPE_H

#include <linux/key.h>

/*
 * Authorisation record for request_key().
 *
 * A pointer to this record is what get_request_key_auth() reads out of
 * key->payload.data[0].  The struct is tagged __randomize_layout, so code
 * should not depend on the declaration order of its members.
 *
 * NOTE(review): this header does not show how the members are used;
 * presumably callout_len is the size of the data at callout_info - confirm
 * against the code that fills the record in.
 */
struct request_key_auth {
        struct rcu_head                rcu;
        struct key                *target_key;
        struct key                *dest_keyring;
        const struct cred        *cred;
        void                        *callout_info;
        size_t                        callout_len;
        pid_t                        pid;
        char                        op[8];
} __randomize_layout;

/*
 * Return the authorisation record stored in slot 0 of @key's payload.
 * No check is made here that @key actually carries such a record.
 */
static inline struct request_key_auth *get_request_key_auth(const struct key *key)
{
        struct request_key_auth *rka = key->payload.data[0];

        return rka;
}


#endif /* _KEYS_REQUEST_KEY_AUTH_TYPE_H */












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/* SPDX-License-Identifier: GPL-2.0 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM rpm

#if !defined(_TRACE_RUNTIME_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RUNTIME_POWER_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>

struct device;

/*
 * The rpm_internal events are used for tracing some important
 * runtime pm internal functions.
 *
 * Every event of this class records the device name, the @flags argument
 * and a snapshot of dev->power state taken when the event fires:
 * usage_count and child_count (both read with atomic_read()),
 * disable_depth, runtime_auto, request_pending and irq_safe.  All of them
 * are stored as int.  @flags is printed in hex, the rest in decimal.
 */
DECLARE_EVENT_CLASS(rpm_internal,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags),

        TP_STRUCT__entry(
                __string(       name,                dev_name(dev)        )
                __field(        int,            flags           )
                __field(        int ,           usage_count        )
                __field(        int ,           disable_depth   )
                __field(        int ,           runtime_auto        )
                __field(        int ,           request_pending        )
                __field(        int ,           irq_safe        )
                __field(        int ,           child_count         )
        ),

        TP_fast_assign(
                __assign_str(name, dev_name(dev));
                __entry->flags = flags;
                __entry->usage_count = atomic_read(
                        &dev->power.usage_count);
                __entry->disable_depth = dev->power.disable_depth;
                __entry->runtime_auto = dev->power.runtime_auto;
                __entry->request_pending = dev->power.request_pending;
                __entry->irq_safe = dev->power.irq_safe;
                __entry->child_count = atomic_read(
                        &dev->power.child_count);
        ),

        TP_printk("%s flags-%x cnt-%-2d dep-%-2d auto-%-1d p-%-1d"
                        " irq-%-1d child-%d",
                        __get_str(name), __entry->flags,
                        __entry->usage_count,
                        __entry->disable_depth,
                        __entry->runtime_auto,
                        __entry->request_pending,
                        __entry->irq_safe,
                        __entry->child_count
                 )
);
/*
 * Four events instantiated from the rpm_internal class: rpm_suspend,
 * rpm_resume, rpm_idle and rpm_usage.  They share the class prototype
 * (dev, flags), its recorded fields and its output format; only the event
 * name differs.
 */
DEFINE_EVENT(rpm_internal, rpm_suspend,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_resume,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_idle,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_usage,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);

/*
 * rpm_return_int records the device name, an address @ip and an integer
 * @ret.  On output @ip is formatted with %pS, followed by the device name
 * and "ret=<ret>".
 */
TRACE_EVENT(rpm_return_int,
        TP_PROTO(struct device *dev, unsigned long ip, int ret),
        TP_ARGS(dev, ip, ret),

        TP_STRUCT__entry(
                __string(       name,                dev_name(dev))
                __field(        unsigned long,                ip        )
                __field(        int,                        ret        )
        ),

        TP_fast_assign(
                __assign_str(name, dev_name(dev));
                __entry->ip = ip;
                __entry->ret = ret;
        ),

        TP_printk("%pS:%s ret=%d", (void *)__entry->ip, __get_str(name),
                __entry->ret)
);

#endif /* _TRACE_RUNTIME_POWER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmscan

#if !defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMSCAN_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <trace/events/mmflags.h>

/*
 * Flag bits stored in the reclaim_flags field of the vmscan tracepoints
 * below and decoded for output by show_reclaim_flags().
 * RECLAIM_WB_LRU is the mask covering the two LRU-type bits (ANON | FILE).
 */
#define RECLAIM_WB_ANON                0x0001u
#define RECLAIM_WB_FILE                0x0002u
#define RECLAIM_WB_MIXED        0x0010u
#define RECLAIM_WB_SYNC                0x0004u /* Unused, all reclaim async */
#define RECLAIM_WB_ASYNC        0x0008u
#define RECLAIM_WB_LRU                (RECLAIM_WB_ANON|RECLAIM_WB_FILE)

/*
 * show_reclaim_flags - render a reclaim_flags value for TP_printk()
 * @flags:        value built from the RECLAIM_WB_* bits
 *
 * Yields the "|"-separated names of the set bits, or "RECLAIM_WB_NONE" when
 * @flags is zero.  The expansion is a bare conditional expression; it is
 * only used as a complete TP_printk() argument in this header, and its text
 * is deliberately left untouched.
 */
#define show_reclaim_flags(flags)                                \
        (flags) ? __print_flags(flags, "|",                        \
                {RECLAIM_WB_ANON,        "RECLAIM_WB_ANON"},        \
                {RECLAIM_WB_FILE,        "RECLAIM_WB_FILE"},        \
                {RECLAIM_WB_MIXED,        "RECLAIM_WB_MIXED"},        \
                {RECLAIM_WB_SYNC,        "RECLAIM_WB_SYNC"},        \
                {RECLAIM_WB_ASYNC,        "RECLAIM_WB_ASYNC"}        \
                ) : "RECLAIM_WB_NONE"

/*
 * trace_reclaim_flags - build the reclaim_flags value for a tracepoint
 * @file:        non-zero selects RECLAIM_WB_FILE, zero selects RECLAIM_WB_ANON
 *
 * The result always has RECLAIM_WB_ASYNC set in addition to the LRU-type
 * bit.  @file is parenthesised so that an argument which is itself a
 * conditional (or other low-precedence) expression is evaluated as a whole
 * before being tested.
 */
#define trace_reclaim_flags(file) ( \
        ((file) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
        (RECLAIM_WB_ASYNC) \
        )

/*
 * mm_vmscan_kswapd_sleep: records the @nid argument and prints it as
 * "nid=<nid>".
 */
TRACE_EVENT(mm_vmscan_kswapd_sleep,

        TP_PROTO(int nid),

        TP_ARGS(nid),

        TP_STRUCT__entry(
                __field(        int,        nid        )
        ),

        TP_fast_assign(
                __entry->nid        = nid;
        ),

        TP_printk("nid=%d", __entry->nid)
);

/*
 * mm_vmscan_kswapd_wake: records @nid, @zid and @order.  Only nid and order
 * appear in the formatted output; zid is stored in the event record but not
 * printed.
 */
TRACE_EVENT(mm_vmscan_kswapd_wake,

        TP_PROTO(int nid, int zid, int order),

        TP_ARGS(nid, zid, order),

        TP_STRUCT__entry(
                __field(        int,        nid        )
                __field(        int,        zid        )
                __field(        int,        order        )
        ),

        TP_fast_assign(
                __entry->nid        = nid;
                __entry->zid    = zid;
                __entry->order        = order;
        ),

        TP_printk("nid=%d order=%d",
                __entry->nid,
                __entry->order)
);

/*
 * mm_vmscan_wakeup_kswapd: records @nid, @zid, @order and @gfp_flags.  The
 * output shows nid, order and the gfp flags decoded by show_gfp_flags();
 * zid is stored in the event record but not printed.
 */
TRACE_EVENT(mm_vmscan_wakeup_kswapd,

        TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),

        TP_ARGS(nid, zid, order, gfp_flags),

        TP_STRUCT__entry(
                __field(        int,        nid                )
                __field(        int,        zid                )
                __field(        int,        order                )
                __field(        gfp_t,        gfp_flags        )
        ),

        TP_fast_assign(
                __entry->nid                = nid;
                __entry->zid                = zid;
                __entry->order                = order;
                __entry->gfp_flags        = gfp_flags;
        ),

        TP_printk("nid=%d order=%d gfp_flags=%s",
                __entry->nid,
                __entry->order,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * Event class shared by the "reclaim begin" events defined below: records
 * @order and @gfp_flags and prints them as "order=<n> gfp_flags=<names>",
 * with the flags decoded by show_gfp_flags().
 */
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,

        TP_PROTO(int order, gfp_t gfp_flags),

        TP_ARGS(order, gfp_flags),

        TP_STRUCT__entry(
                __field(        int,        order                )
                __field(        gfp_t,        gfp_flags        )
        ),

        TP_fast_assign(
                __entry->order                = order;
                __entry->gfp_flags        = gfp_flags;
        ),

        TP_printk("order=%d gfp_flags=%s",
                __entry->order,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * Events instantiated from mm_vmscan_direct_reclaim_begin_template.
 * mm_vmscan_direct_reclaim_begin is always defined; the two memcg variants
 * exist only when CONFIG_MEMCG is set.
 */
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,

        TP_PROTO(int order, gfp_t gfp_flags),

        TP_ARGS(order, gfp_flags)
);

#ifdef CONFIG_MEMCG
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,

        TP_PROTO(int order, gfp_t gfp_flags),

        TP_ARGS(order, gfp_flags)
);

DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,

        TP_PROTO(int order, gfp_t gfp_flags),

        TP_ARGS(order, gfp_flags)
);
#endif /* CONFIG_MEMCG */

/*
 * Event class shared by the "reclaim end" events: records the single
 * @nr_reclaimed argument and prints it as "nr_reclaimed=<n>".
 */
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,

        TP_PROTO(unsigned long nr_reclaimed),

        TP_ARGS(nr_reclaimed),

        TP_STRUCT__entry(
                __field(        unsigned long,        nr_reclaimed        )
        ),

        TP_fast_assign(
                __entry->nr_reclaimed        = nr_reclaimed;
        ),

        TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed)
);

/*
 * Events instantiated from mm_vmscan_direct_reclaim_end_template.
 * mm_vmscan_direct_reclaim_end is always defined; the two memcg variants
 * exist only when CONFIG_MEMCG is set.
 */
DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end,

        TP_PROTO(unsigned long nr_reclaimed),

        TP_ARGS(nr_reclaimed)
);

#ifdef CONFIG_MEMCG
DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,

        TP_PROTO(unsigned long nr_reclaimed),

        TP_ARGS(nr_reclaimed)
);

DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,

        TP_PROTO(unsigned long nr_reclaimed),

        TP_ARGS(nr_reclaimed)
);
#endif /* CONFIG_MEMCG */

/*
 * mm_shrink_slab_start: records the shrinker pointer, its scan_objects
 * callback (printed symbolically with %pS), the node id and gfp mask taken
 * from @sc, and the numeric arguments passed in.
 *
 * NOTE(review): cache_items and total_scan are unsigned long and delta is
 * unsigned long long, yet the format string prints them with %ld / %lld.
 * The format is left as-is because the output text is user-visible.
 */
TRACE_EVENT(mm_shrink_slab_start,
        TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
                long nr_objects_to_shrink, unsigned long cache_items,
                unsigned long long delta, unsigned long total_scan,
                int priority),

        TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
                priority),

        TP_STRUCT__entry(
                __field(struct shrinker *, shr)
                __field(void *, shrink)
                __field(int, nid)
                __field(long, nr_objects_to_shrink)
                __field(gfp_t, gfp_flags)
                __field(unsigned long, cache_items)
                __field(unsigned long long, delta)
                __field(unsigned long, total_scan)
                __field(int, priority)
        ),

        TP_fast_assign(
                __entry->shr = shr;
                __entry->shrink = shr->scan_objects;
                __entry->nid = sc->nid;
                __entry->nr_objects_to_shrink = nr_objects_to_shrink;
                __entry->gfp_flags = sc->gfp_mask;
                __entry->cache_items = cache_items;
                __entry->delta = delta;
                __entry->total_scan = total_scan;
                __entry->priority = priority;
        ),

        TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
                __entry->shrink,
                __entry->shr,
                __entry->nid,
                __entry->nr_objects_to_shrink,
                show_gfp_flags(__entry->gfp_flags),
                __entry->cache_items,
                __entry->delta,
                __entry->total_scan,
                __entry->priority)
);

/*
 * mm_shrink_slab_end: records the shrinker pointer, its scan_objects
 * callback (printed symbolically with %pS), the node id, the two scan
 * counts, the total scan value and the shrinker return value.  Note that
 * the return value is printed last even though it is the third argument.
 */
TRACE_EVENT(mm_shrink_slab_end,
        TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval,
                long unused_scan_cnt, long new_scan_cnt, long total_scan),

        TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt,
                total_scan),

        TP_STRUCT__entry(
                __field(struct shrinker *, shr)
                __field(int, nid)
                __field(void *, shrink)
                __field(long, unused_scan)
                __field(long, new_scan)
                __field(int, retval)
                __field(long, total_scan)
        ),

        TP_fast_assign(
                __entry->shr = shr;
                __entry->nid = nid;
                __entry->shrink = shr->scan_objects;
                __entry->unused_scan = unused_scan_cnt;
                __entry->new_scan = new_scan_cnt;
                __entry->retval = shrinker_retval;
                __entry->total_scan = total_scan;
        ),

        TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
                __entry->shrink,
                __entry->shr,
                __entry->nid,
                __entry->unused_scan,
                __entry->new_scan,
                __entry->total_scan,
                __entry->retval)
);

/*
 * mm_vmscan_lru_isolate: records the highest zone index, order, the four
 * page counters, the isolate mode (stored as unsigned int via a __force
 * cast) and the LRU list index, which is printed by name through
 * __print_symbolic() with LRU_NAMES.
 *
 * NOTE(review): isolate_mode is stored as unsigned int but printed with %d.
 */
TRACE_EVENT(mm_vmscan_lru_isolate,
        TP_PROTO(int highest_zoneidx,
                int order,
                unsigned long nr_requested,
                unsigned long nr_scanned,
                unsigned long nr_skipped,
                unsigned long nr_taken,
                isolate_mode_t isolate_mode,
                int lru),

        TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),

        TP_STRUCT__entry(
                __field(int, highest_zoneidx)
                __field(int, order)
                __field(unsigned long, nr_requested)
                __field(unsigned long, nr_scanned)
                __field(unsigned long, nr_skipped)
                __field(unsigned long, nr_taken)
                __field(unsigned int, isolate_mode)
                __field(int, lru)
        ),

        TP_fast_assign(
                __entry->highest_zoneidx = highest_zoneidx;
                __entry->order = order;
                __entry->nr_requested = nr_requested;
                __entry->nr_scanned = nr_scanned;
                __entry->nr_skipped = nr_skipped;
                __entry->nr_taken = nr_taken;
                __entry->isolate_mode = (__force unsigned int)isolate_mode;
                __entry->lru = lru;
        ),

        /*
         * classzone is previous name of the highest_zoneidx.
         * Reason not to change it is the ABI requirement of the tracepoint.
         */
        TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
                __entry->isolate_mode,
                __entry->highest_zoneidx,
                __entry->order,
                __entry->nr_requested,
                __entry->nr_scanned,
                __entry->nr_skipped,
                __entry->nr_taken,
                __print_symbolic(__entry->lru, LRU_NAMES))
);

/*
 * mm_vmscan_writepage: stores the page's pfn (not the page pointer) plus
 * reclaim flags derived from page_is_file_lru(page).  The page pointer
 * shown in the output is recomputed from the stored pfn with pfn_to_page()
 * when the event is formatted.
 */
TRACE_EVENT(mm_vmscan_writepage,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(unsigned long, pfn)
                __field(int, reclaim_flags)
        ),

        TP_fast_assign(
                __entry->pfn = page_to_pfn(page);
                __entry->reclaim_flags = trace_reclaim_flags(
                                                page_is_file_lru(page));
        ),

        TP_printk("page=%p pfn=%lu flags=%s",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_lru_shrink_inactive: records the node id, scanned and reclaimed
 * counts, a copy of the counters in @stat, the priority and the reclaim
 * flags derived from @file.  stat->nr_activate[0] and [1] are printed as
 * nr_activate_anon and nr_activate_file respectively.
 *
 * NOTE(review): the unsigned long counters are printed with %ld and the
 * unsigned int nr_activate values with %d.
 */
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,

        TP_PROTO(int nid,
                unsigned long nr_scanned, unsigned long nr_reclaimed,
                struct reclaim_stat *stat, int priority, int file),

        TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file),

        TP_STRUCT__entry(
                __field(int, nid)
                __field(unsigned long, nr_scanned)
                __field(unsigned long, nr_reclaimed)
                __field(unsigned long, nr_dirty)
                __field(unsigned long, nr_writeback)
                __field(unsigned long, nr_congested)
                __field(unsigned long, nr_immediate)
                __field(unsigned int, nr_activate0)
                __field(unsigned int, nr_activate1)
                __field(unsigned long, nr_ref_keep)
                __field(unsigned long, nr_unmap_fail)
                __field(int, priority)
                __field(int, reclaim_flags)
        ),

        TP_fast_assign(
                __entry->nid = nid;
                __entry->nr_scanned = nr_scanned;
                __entry->nr_reclaimed = nr_reclaimed;
                __entry->nr_dirty = stat->nr_dirty;
                __entry->nr_writeback = stat->nr_writeback;
                __entry->nr_congested = stat->nr_congested;
                __entry->nr_immediate = stat->nr_immediate;
                __entry->nr_activate0 = stat->nr_activate[0];
                __entry->nr_activate1 = stat->nr_activate[1];
                __entry->nr_ref_keep = stat->nr_ref_keep;
                __entry->nr_unmap_fail = stat->nr_unmap_fail;
                __entry->priority = priority;
                __entry->reclaim_flags = trace_reclaim_flags(file);
        ),

        TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
                __entry->nid,
                __entry->nr_scanned, __entry->nr_reclaimed,
                __entry->nr_dirty, __entry->nr_writeback,
                __entry->nr_congested, __entry->nr_immediate,
                __entry->nr_activate0, __entry->nr_activate1,
                __entry->nr_ref_keep, __entry->nr_unmap_fail,
                __entry->priority,
                show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_lru_shrink_active: records the node id, the four page counters
 * passed in, the priority and the reclaim flags derived from @file.
 *
 * NOTE(review): the unsigned long counters are printed with %ld.
 */
TRACE_EVENT(mm_vmscan_lru_shrink_active,

        TP_PROTO(int nid, unsigned long nr_taken,
                unsigned long nr_active, unsigned long nr_deactivated,
                unsigned long nr_referenced, int priority, int file),

        TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file),

        TP_STRUCT__entry(
                __field(int, nid)
                __field(unsigned long, nr_taken)
                __field(unsigned long, nr_active)
                __field(unsigned long, nr_deactivated)
                __field(unsigned long, nr_referenced)
                __field(int, priority)
                __field(int, reclaim_flags)
        ),

        TP_fast_assign(
                __entry->nid = nid;
                __entry->nr_taken = nr_taken;
                __entry->nr_active = nr_active;
                __entry->nr_deactivated = nr_deactivated;
                __entry->nr_referenced = nr_referenced;
                __entry->priority = priority;
                __entry->reclaim_flags = trace_reclaim_flags(file);
        ),

        TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
                __entry->nid,
                __entry->nr_taken,
                __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced,
                __entry->priority,
                show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_inactive_list_is_low: records the node id, reclaim index, the
 * inactive/active list sizes (total and per-call values), the ratio and the
 * reclaim flags.  Unlike the other events here, the flags are masked with
 * RECLAIM_WB_LRU, so only the ANON/FILE bit is kept and the ASYNC bit added
 * by trace_reclaim_flags() is dropped.
 *
 * NOTE(review): the unsigned long values are printed with %ld.
 */
TRACE_EVENT(mm_vmscan_inactive_list_is_low,

        TP_PROTO(int nid, int reclaim_idx,
                unsigned long total_inactive, unsigned long inactive,
                unsigned long total_active, unsigned long active,
                unsigned long ratio, int file),

        TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file),

        TP_STRUCT__entry(
                __field(int, nid)
                __field(int, reclaim_idx)
                __field(unsigned long, total_inactive)
                __field(unsigned long, inactive)
                __field(unsigned long, total_active)
                __field(unsigned long, active)
                __field(unsigned long, ratio)
                __field(int, reclaim_flags)
        ),

        TP_fast_assign(
                __entry->nid = nid;
                __entry->reclaim_idx = reclaim_idx;
                __entry->total_inactive = total_inactive;
                __entry->inactive = inactive;
                __entry->total_active = total_active;
                __entry->active = active;
                __entry->ratio = ratio;
                __entry->reclaim_flags = trace_reclaim_flags(file) &
                                         RECLAIM_WB_LRU;
        ),

        TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
                __entry->nid,
                __entry->reclaim_idx,
                __entry->total_inactive, __entry->inactive,
                __entry->total_active, __entry->active,
                __entry->ratio,
                show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_node_reclaim_begin: records @nid, @order and @gfp_flags and
 * prints all three, with the gfp flags decoded by show_gfp_flags().
 */
TRACE_EVENT(mm_vmscan_node_reclaim_begin,

        TP_PROTO(int nid, int order, gfp_t gfp_flags),

        TP_ARGS(nid, order, gfp_flags),

        TP_STRUCT__entry(
                __field(int, nid)
                __field(int, order)
                __field(gfp_t, gfp_flags)
        ),

        TP_fast_assign(
                __entry->nid = nid;
                __entry->order = order;
                __entry->gfp_flags = gfp_flags;
        ),

        TP_printk("nid=%d order=%d gfp_flags=%s",
                __entry->nid,
                __entry->order,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_vmscan_node_reclaim_end reuses mm_vmscan_direct_reclaim_end_template,
 * so it records and prints only nr_reclaimed (no node id).
 */
DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,

        TP_PROTO(unsigned long nr_reclaimed),

        TP_ARGS(nr_reclaimed)
);

#endif /* _TRACE_VMSCAN_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/*
 * The VGA arbiter manages VGA space routing and VGA resource decode to
 * allow multiple VGA devices to be used in a system in a safe way.
 *
 * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
 * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com>
 * (C) Copyright 2007, 2009 Tiago Vignatti <vignatti@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS
 * IN THE SOFTWARE.
 *
 */

#ifndef LINUX_VGA_H
#define LINUX_VGA_H

#include <video/vga.h>

/*
 * Resource flags. Each non-zero value is a distinct bit, so several can be
 * OR-ed together into one mask (as VGA_RSRC_LEGACY_MASK does below).
 */
/* Legacy VGA regions */
#define VGA_RSRC_NONE               0x00
#define VGA_RSRC_LEGACY_IO     0x01
#define VGA_RSRC_LEGACY_MEM    0x02
#define VGA_RSRC_LEGACY_MASK   (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM)
/* Non-legacy access */
#define VGA_RSRC_NORMAL_IO     0x04
#define VGA_RSRC_NORMAL_MEM    0x08

/*
 * Pass this instead of a pci_dev to use the system "default" device,
 * i.e. the one used by vgacon. Archs will probably have to provide
 * their own vga_default_device().
 */
#define VGA_DEFAULT_DEVICE     (NULL)

struct pci_dev;

/* For use by clients */

/**
 * vga_set_legacy_decoding - tell the arbiter which legacy regions a card decodes
 * @pdev: pci device of the VGA card
 * @decodes: bit mask of what legacy regions the card decodes
 *
 * Indicates to the arbiter if the card decodes legacy VGA IOs,
 * legacy VGA Memory, both, or none. All cards default to both,
 * the card driver (fbdev for example) should tell the arbiter
 * if it has disabled legacy decoding, so the card can be left
 * out of the arbitration process (and can be safe to take
 * interrupts at any time).
 *
 * When CONFIG_VGA_ARB is not set this is an empty inline stub.
 */
#if defined(CONFIG_VGA_ARB)
extern void vga_set_legacy_decoding(struct pci_dev *pdev,
                                    unsigned int decodes);
#else
static inline void vga_set_legacy_decoding(struct pci_dev *pdev,
                                           unsigned int decodes)
{
}
#endif

/**
 * vga_get - acquire and lock VGA resources
 * @pdev: pci device of the VGA card or NULL for the system default
 * @rsrc: bit mask of resources to acquire and lock
 * @interruptible: non-zero for the interruptible variant, zero otherwise
 *
 * On success, release the VGA resource again with vga_put().
 *
 * When CONFIG_VGA_ARB is not set there is nothing to arbitrate and the
 * inline stub simply returns 0.
 */
#ifndef CONFIG_VGA_ARB
static inline int vga_get(struct pci_dev *pdev, unsigned int rsrc,
                          int interruptible)
{
        return 0;
}
#else
extern int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible);
#endif

/**
 * vga_get_interruptible - shortcut to vga_get()
 * @pdev: pci device of the VGA card or NULL for the system default
 * @rsrc: bit mask of resources to acquire and lock
 *
 * Calls vga_get() with its interruptible argument set to true and hands
 * back whatever vga_get() returns.
 *
 * On success, release the VGA resource again with vga_put().
 */
static inline int vga_get_interruptible(struct pci_dev *pdev,
                                        unsigned int rsrc)
{
        const int interruptible = 1;

        return vga_get(pdev, rsrc, interruptible);
}

/**
 * vga_get_uninterruptible - shortcut to vga_get()
 * @pdev: pci device of the VGA card or NULL for the system default
 * @rsrc: bit mask of resources to acquire and lock
 *
 * Calls vga_get() with its interruptible argument set to false and hands
 * back whatever vga_get() returns.
 *
 * On success, release the VGA resource again with vga_put().
 */
static inline int vga_get_uninterruptible(struct pci_dev *pdev,
                                          unsigned int rsrc)
{
        const int interruptible = 0;

        return vga_get(pdev, rsrc, interruptible);
}

/**
 * vga_put - release VGA resources previously taken with vga_get()
 * @pdev: pci device of the VGA card or NULL for the system default
 * @rsrc: bit mask of resources to release
 *
 * When CONFIG_VGA_ARB is not set this is an empty inline function rather
 * than an empty macro, so that callers get the same argument type checking
 * and argument evaluation in both configurations.
 */
#if defined(CONFIG_VGA_ARB)
extern void vga_put(struct pci_dev *pdev, unsigned int rsrc);
#else
static inline void vga_put(struct pci_dev *pdev, unsigned int rsrc)
{
}
#endif


/*
 * Default-device helpers.
 *
 * vga_default_device() returns the system "default" VGA device (see
 * VGA_DEFAULT_DEVICE) and vga_set_default_device() replaces it.
 * NOTE(review): vga_remove_vgacon() presumably detaches vgacon from @pdev
 * and returns 0 on success -- confirm against the implementation.
 *
 * When CONFIG_VGA_ARB is not set the stubs report no default device (NULL),
 * ignore the setter, and return 0 from vga_remove_vgacon().
 */
#ifdef CONFIG_VGA_ARB
extern struct pci_dev *vga_default_device(void);
extern void vga_set_default_device(struct pci_dev *pdev);
extern int vga_remove_vgacon(struct pci_dev *pdev);
#else
static inline struct pci_dev *vga_default_device(void)
{
        return NULL;
}

static inline void vga_set_default_device(struct pci_dev *pdev)
{
}

static inline int vga_remove_vgacon(struct pci_dev *pdev)
{
        return 0;
}
#endif

/*
 * Generic vga_conflicts(): every pair of devices is reported as
 * conflicting (returns 1).
 *
 * Architectures that have several independent PCI domains which can
 * afford concurrent VGA decoding should define __ARCH_HAS_VGA_CONFLICT,
 * which compiles this fallback out.
 */
#if !defined(__ARCH_HAS_VGA_CONFLICT)
static inline int vga_conflicts(struct pci_dev *p1, struct pci_dev *p2)
{
        const int always_conflict = 1;

        return always_conflict;
}
#endif

/*
 * vga_client_register - attach client callbacks to a VGA device
 * @pdev: pci device of the VGA card
 * @cookie: opaque pointer handed back as the first argument of both callbacks
 * @irq_set_state: callback taking @cookie and a bool state
 * @set_vga_decode: callback taking @cookie and a bool state, returning an
 *                  unsigned int
 *
 * NOTE(review): when and with what state the arbiter invokes the callbacks
 * is not visible here -- confirm against the implementation.
 *
 * When CONFIG_VGA_ARB is not set the inline stub accepts anything and
 * returns 0.
 */
#ifndef CONFIG_VGA_ARB
static inline int vga_client_register(struct pci_dev *pdev, void *cookie,
                                      void (*irq_set_state)(void *cookie, bool state),
                                      unsigned int (*set_vga_decode)(void *cookie, bool state))
{
        const int ok = 0;

        return ok;
}
#else
int vga_client_register(struct pci_dev *pdev, void *cookie,
                        void (*irq_set_state)(void *cookie, bool state),
                        unsigned int (*set_vga_decode)(void *cookie, bool state));
#endif

#endif /* LINUX_VGA_H */



































































































































































































































































































    1 






















    1 







    1 













    1 







    1 






















    1 























    1 












    1 





    1 









    1 




































































































































    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 *   - Initialization and readiness waiting.
 *   - Fast key erasure RNG, the "crng".
 *   - Entropy accumulation and extraction routines.
 *   - Entropy collection routines.
 *   - Userspace reader/writer interfaces.
 *   - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/siphash.h>
#include <linux/uio.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init is protected by base_crng.lock, and only increases
 * its value (from empty->early->ready).
 */
static enum {
        CRNG_EMPTY = 0, /* Little to no entropy collected */
        CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
        CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
/* True once crng_init has reached CRNG_READY; hinted as the likely case. */
#define crng_ready() (likely(crng_init >= CRNG_READY))
/* Various types of waiters for crng_init->CRNG_READY transition. */
/* Sleepers in wait_for_random_bytes() wait on this queue. */
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
/* NOTE(review): presumably the SIGIO subscriber list for the device files -- confirm. */
static struct fasync_struct *fasync;
/* random_ready_chain_lock serializes every access to random_ready_chain. */
static DEFINE_SPINLOCK(random_ready_chain_lock);
static RAW_NOTIFIER_HEAD(random_ready_chain);

/* Control how we warn userspace. */
/*
 * Rate limiter named "urandom_warning", initialized with HZ and 3 and the
 * RATELIMIT_MSG_ON_RELEASE flag.
 * NOTE(review): HZ and 3 are presumably the interval and burst -- confirm
 * against RATELIMIT_STATE_INIT_FLAGS().
 */
static struct ratelimit_state urandom_warning =
        RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE);
/*
 * Defaults to 1 when CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled, else 0.
 * Exposed as the "ratelimit_disable" module parameter with mode 0644.
 */
static int ratelimit_disable __read_mostly =
        IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM);
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");

/*
 * Reports whether the input pool has been seeded, i.e. whether the RNG is
 * guaranteed to supply cryptographically secure random numbers. The answer
 * covers the /dev/urandom device, get_random_bytes(), and the
 * get_random_{u32,u64,int,long} family of functions.
 *
 * Returns: true if the input pool has been seeded.
 *          false if the input pool has not been seeded.
 */
bool rng_is_initialized(void)
{
        const bool seeded = crng_ready();

        return seeded;
}
EXPORT_SYMBOL(rng_is_initialized);

/* Used by wait_for_random_bytes(), and considered an entropy collector, below. */
static void try_to_generate_entropy(void);

/*
 * Block until the input pool has been seeded, at which point the RNG is
 * guaranteed to supply cryptographically secure random numbers. This covers
 * the /dev/urandom device, get_random_bytes(), and the
 * get_random_{u32,u64,int,long} family of functions. Calling any of those
 * without first calling this function forfeits the guarantee of security.
 *
 * While not ready, each round first tries to generate entropy itself and
 * then sleeps on crng_init_wait for up to HZ jiffies.
 *
 * Returns: 0 if the input pool has been seeded.
 *          -ERESTARTSYS if the function was interrupted by a signal.
 */
int wait_for_random_bytes(void)
{
        for (;;) {
                int rc;

                if (crng_ready())
                        break;

                try_to_generate_entropy();
                rc = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
                if (rc > 0)
                        return 0;       /* Became ready while we waited. */
                if (rc < 0)
                        return rc;      /* Interrupted. */
                /* rc == 0: timed out and still not ready, go around again. */
        }
        return 0;
}
EXPORT_SYMBOL(wait_for_random_bytes);

/*
 * Queue a notifier block to be invoked once the input pool is initialised.
 *
 * Readiness is tested once without the lock as a shortcut and again with
 * random_ready_chain_lock held, so a block is never added after the pool
 * was seen ready under the lock.
 *
 * returns: 0 if callback is successfully added
 *            -EALREADY if pool is already initialised (callback not called)
 */
int __cold register_random_ready_notifier(struct notifier_block *nb)
{
        unsigned long irqflags;
        int err;

        if (crng_ready())
                return -EALREADY;

        spin_lock_irqsave(&random_ready_chain_lock, irqflags);
        if (crng_ready())
                err = -EALREADY;
        else
                err = raw_notifier_chain_register(&random_ready_chain, nb);
        spin_unlock_irqrestore(&random_ready_chain_lock, irqflags);

        return err;
}

/*
 * Remove a readiness notifier that was added earlier with
 * register_random_ready_notifier(). Returns whatever
 * raw_notifier_chain_unregister() reports.
 */
int __cold unregister_random_ready_notifier(struct notifier_block *nb)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&random_ready_chain_lock, irqflags);
        err = raw_notifier_chain_unregister(&random_ready_chain, nb);
        spin_unlock_irqrestore(&random_ready_chain_lock, irqflags);

        return err;
}

/*
 * Invoke every notifier on random_ready_chain, passing 0 as the action and
 * NULL as the data, with random_ready_chain_lock held throughout.
 */
static void __cold process_random_ready_list(void)
{
        unsigned long irqflags;

        spin_lock_irqsave(&random_ready_chain_lock, irqflags);
        raw_notifier_call_chain(&random_ready_chain, 0, NULL);
        spin_unlock_irqrestore(&random_ready_chain_lock, irqflags);
}

/*
 * When CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled and the crng is not yet
 * ready, log the name of the calling function, its return address and the
 * current crng_init value.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * and cannot capture an "else" that follows it at the call site.
 */
#define warn_unseeded_randomness() \
        do { \
                if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \
                        printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \
                                        __func__, (void *)_RET_IP_, crng_init); \
        } while (0)


/*********************************************************************
 *
 * Fast key erasure RNG, the "crng".
 *
 * These functions expand entropy from the entropy extractor into
 * long streams for external consumption using the "fast key erasure"
 * RNG described at <https://blog.cr.yp.to/20170723-random.html>.
 *
 * There are a few exported interfaces for use by other drivers:
 *
 *        void get_random_bytes(void *buf, size_t len)
 *        u32 get_random_u32()
 *        u64 get_random_u64()
 *        unsigned int get_random_int()
 *        unsigned long get_random_long()
 *
 * These interfaces will return the requested number of random bytes
 * into the given buffer or as a return value. This is equivalent to
 * a read from /dev/urandom. The u32, u64, int, and long family of
 * functions may be higher performance for one-off random integers,
 * because they do a bit of buffering and do not invoke reseeding
 * until the buffer is emptied.
 *
 *********************************************************************/

/*
 * Reseed ages, in jiffies, used by crng_has_old_seed():
 * CRNG_RESEED_START_INTERVAL is the floor for the shorter interval applied
 * during early boot, CRNG_RESEED_INTERVAL is the interval used afterwards.
 */
enum {
        CRNG_RESEED_START_INTERVAL = HZ,
        CRNG_RESEED_INTERVAL = 60 * HZ
};

/*
 * The base crng, shared by all CPUs.
 *
 * key:        current ChaCha key; replaced by crng_reseed() and advanced by
 *             fast key erasure.
 * birth:      jiffies at the time of the last reseed.
 * generation: bumped by crng_reseed() on each reseed, skipping ULONG_MAX.
 * lock:       taken around updates of the fields above.
 */
static struct {
        u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long));
        unsigned long birth;
        unsigned long generation;
        spinlock_t lock;
} base_crng = {
        .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock)
};

/*
 * Per-CPU crng state.
 *
 * key:        this CPU's ChaCha key, derived from the base_crng.
 * generation: base_crng.generation at the time the key was derived.
 * lock:       local lock held while the per-CPU state is in use.
 */
struct crng {
        u8 key[CHACHA_KEY_SIZE];
        unsigned long generation;
        local_lock_t lock;
};

/*
 * generation starts at ULONG_MAX, a value base_crng.generation never takes,
 * so a CPU's first use always sees a mismatch and keys itself from base_crng.
 */
static DEFINE_PER_CPU(struct crng, crngs) = {
        .generation = ULONG_MAX,
        .lock = INIT_LOCAL_LOCK(crngs.lock),
};

/* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */
static void extract_entropy(void *buf, size_t len);

/*
 * Pull a fresh key out of the input pool and install it as the base_crng
 * key, bumping the generation and birth time, and marking the crng ready
 * if it was not already.
 */
static void crng_reseed(void)
{
        u8 fresh_key[CHACHA_KEY_SIZE];
        unsigned long irqflags;
        unsigned long gen;

        extract_entropy(fresh_key, sizeof(fresh_key));

        /*
         * Install the new key over the old one and advance the generation
         * counter. ULONG_MAX is never used as a generation: the per-cpu
         * crngs start out at ULONG_MAX, which guarantees that a CPU coming
         * online always sees a mismatch and initializes itself.
         */
        spin_lock_irqsave(&base_crng.lock, irqflags);
        memcpy(base_crng.key, fresh_key, sizeof(base_crng.key));
        gen = base_crng.generation + 1;
        if (gen == ULONG_MAX)
                gen = 0;        /* Same as stepping once more past ULONG_MAX. */
        WRITE_ONCE(base_crng.generation, gen);
        WRITE_ONCE(base_crng.birth, jiffies);
        if (!crng_ready())
                crng_init = CRNG_READY;
        spin_unlock_irqrestore(&base_crng.lock, irqflags);

        /* The stack copy of the key must not outlive this function. */
        memzero_explicit(fresh_key, sizeof(fresh_key));
}

/*
 * Generate one ChaCha block from the provided key and immediately
 * overwrite that key with the first half of the block. The resulting
 * ChaCha state is handed back to the caller together with up to 32 bytes
 * of random data taken from the second half of the block;
 * random_data_len may not be greater than 32.
 *
 * The returned ChaCha state still holds a copy of the old key value, at
 * index 4, so the caller should zero the state out immediately after use
 * in order to maintain forward secrecy. If the state cannot be erased in
 * a timely manner, it is safer to pass &chacha_state[4] as random_data,
 * so that this function overwrites the old key before returning.
 */
static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
                                  u32 chacha_state[CHACHA_STATE_WORDS],
                                  u8 *random_data, size_t random_data_len)
{
        u8 first_block[CHACHA_BLOCK_SIZE];
        const u8 *next_key = first_block;
        const u8 *leftover = first_block + CHACHA_KEY_SIZE;

        BUG_ON(random_data_len > 32);

        /* Constants, then the key at words 4.., then four zeroed words at 12.. */
        chacha_init_consts(chacha_state);
        memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE);
        memset(&chacha_state[12], 0, 4 * sizeof(chacha_state[0]));
        chacha20_block(chacha_state, first_block);

        /* First half replaces the key, second half goes to the caller. */
        memcpy(key, next_key, CHACHA_KEY_SIZE);
        memcpy(random_data, leftover, random_data_len);
        memzero_explicit(first_block, sizeof(first_block));
}

/*
 * Tell whether the crng seed is old enough that a reseed is due. That is
 * the case once CRNG_RESEED_INTERVAL has passed since the last reseed, or,
 * during early boot, once an interval proportional to the uptime has
 * passed (never shorter than CRNG_RESEED_START_INTERVAL).
 */
static bool crng_has_old_seed(void)
{
        static bool early_boot = true;
        unsigned long max_age = CRNG_RESEED_INTERVAL;

        if (unlikely(READ_ONCE(early_boot))) {
                const time64_t uptime = ktime_get_seconds();

                if (uptime < CRNG_RESEED_INTERVAL / HZ * 2)
                        max_age = max_t(unsigned int, CRNG_RESEED_START_INTERVAL,
                                        (unsigned int)uptime / 2 * HZ);
                else
                        WRITE_ONCE(early_boot, false);  /* Early boot is over for good. */
        }

        return time_is_before_jiffies(READ_ONCE(base_crng.birth) + max_age);
}

/*
 * This function returns a ChaCha state that you may use for generating
 * random data. It also returns up to 32 bytes on its own of random data
 * that may be used; random_data_len may not be greater than 32.
 *
 * @chacha_state:    output; filled in by crng_fast_key_erasure().
 * @random_data:     output buffer receiving random_data_len bytes.
 * @random_data_len: number of bytes wanted in random_data; values above
 *                   32 hit BUG_ON().
 *
 * Before the crng is ready, the base_crng key is used directly under
 * base_crng.lock. Once ready, the calling CPU's crng key is used under the
 * crngs local lock, after first being re-derived from base_crng if its
 * generation is stale.
 */
static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS],
                            u8 *random_data, size_t random_data_len)
{
        unsigned long flags;
        struct crng *crng;

        BUG_ON(random_data_len > 32);

        /*
         * For the fast path, we check whether we're ready, unlocked first, and
         * then re-check once locked later. In the case where we're really not
         * ready, we do fast key erasure with the base_crng directly, extracting
         * when crng_init is CRNG_EMPTY.
         */
        if (!crng_ready()) {
                bool ready;

                spin_lock_irqsave(&base_crng.lock, flags);
                ready = crng_ready();
                if (!ready) {
                        if (crng_init == CRNG_EMPTY)
                                extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_fast_key_erasure(base_crng.key, chacha_state,
                                              random_data, random_data_len);
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
                /* Still not ready: the base_crng output above is the result. */
                if (!ready)
                        return;
        }

        /*
         * If the base_crng is old enough, we reseed, which in turn bumps the
         * generation counter that we check below.
         */
        if (unlikely(crng_has_old_seed()))
                crng_reseed();

        local_lock_irqsave(&crngs.lock, flags);
        crng = raw_cpu_ptr(&crngs);

        /*
         * If our per-cpu crng is older than the base_crng, then it means
         * somebody reseeded the base_crng. In that case, we do fast key
         * erasure on the base_crng, and use its output as the new key
         * for our per-cpu crng. This brings us up to date with base_crng.
         */
        if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) {
                spin_lock(&base_crng.lock);
                crng_fast_key_erasure(base_crng.key, chacha_state,
                                      crng->key, sizeof(crng->key));
                /* Read with base_crng.lock held, the lock reseeding writes it under. */
                crng->generation = base_crng.generation;
                spin_unlock(&base_crng.lock);
        }

        /*
         * Finally, when we've made it this far, our per-cpu crng has an up
         * to date key, and we can do fast key erasure with it to produce
         * some random data and a ChaCha state for the caller. All other
         * branches of this function are "unlikely", so most of the time we
         * should wind up here immediately.
         */
        crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len);
        local_unlock_irqrestore(&crngs.lock, flags);
}

/*
 * Fill buf with len random bytes.
 *
 * The first up-to-32 bytes come straight out of crng_make_state(). Anything
 * beyond that is produced by running the returned ChaCha state: whole
 * blocks are written directly into buf, and a trailing partial block goes
 * through a scratch buffer that is wiped afterwards. The ChaCha state is
 * wiped before returning.
 */
static void _get_random_bytes(void *buf, size_t len)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 scratch[CHACHA_BLOCK_SIZE];
        size_t head_len;

        if (!len)
                return;

        head_len = len < 32 ? len : 32;
        crng_make_state(chacha_state, buf, head_len);
        buf += head_len;
        len -= head_len;

        /* Whole blocks go straight into the caller's buffer. */
        while (len >= CHACHA_BLOCK_SIZE) {
                chacha20_block(chacha_state, buf);
                /* Word 12 wrapped to zero: carry into word 13. */
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];
                buf += CHACHA_BLOCK_SIZE;
                len -= CHACHA_BLOCK_SIZE;
        }

        /* A final short piece is cut from one more block. */
        if (len) {
                chacha20_block(chacha_state, scratch);
                memcpy(buf, scratch, len);
                memzero_explicit(scratch, sizeof(scratch));
        }

        memzero_explicit(chacha_state, sizeof(chacha_state));
}

/*
 * This function is the exported kernel interface.  It returns some
 * number of good random numbers, suitable for key generation, seeding
 * TCP sequence numbers, etc.  It does not rely on the hardware random
 * number generator.  For random bytes direct from the hardware RNG
 * (when available), use get_random_bytes_arch(). In order to ensure
 * that the randomness provided by this function is okay, the function
 * wait_for_random_bytes() should be called and return 0 at least once
 * at any point prior.
 *
 * @buf: destination buffer.
 * @len: number of bytes to write into buf.
 *
 * The unseeded-randomness warning is issued from here, before the bytes
 * are generated, so that it names this function and its caller.
 */
void get_random_bytes(void *buf, size_t len)
{
        warn_unseeded_randomness();
        _get_random_bytes(buf, len);
}
EXPORT_SYMBOL(get_random_bytes);

/*
 * Copy random bytes into the iov_iter.
 *
 * Returns 0 if the iterator has no room, otherwise the number of bytes
 * copied, or -EFAULT if nothing at all could be copied. The copy stops
 * early if copy_to_iter() comes up short, or if a signal is pending when
 * the running total reaches a multiple of PAGE_SIZE.
 */
static ssize_t get_random_bytes_user(struct iov_iter *iter)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 block[CHACHA_BLOCK_SIZE];
        size_t ret = 0, copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Immediately overwrite the ChaCha key at index 4 with random
         * bytes, in case userspace causes copy_to_iter() below to sleep
         * forever, so that we still retain forward secrecy in that case.
         */
        crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE);
        /*
         * However, if we're doing a read of len <= 32, we don't need to
         * use chacha_state after, so we can simply return those bytes to
         * the user directly.
         */
        if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) {
                ret = copy_to_iter(&chacha_state[4], CHACHA_KEY_SIZE, iter);
                goto out_zero_chacha;
        }

        for (;;) {
                chacha20_block(chacha_state, block);
                /* Word 12 wrapped to zero: carry into word 13. */
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];

                copied = copy_to_iter(block, sizeof(block), iter);
                ret += copied;
                /* Done when the iterator is full or the copy came up short. */
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                /* Once per page of output, check for signals and offer to reschedule. */
                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        memzero_explicit(block, sizeof(block));
out_zero_chacha:
        memzero_explicit(chacha_state, sizeof(chacha_state));
        return ret ? ret : -EFAULT;
}

/*
 * Batched entropy returns random integers. The quality of the random
 * number is as good as /dev/urandom. In order to ensure that the randomness
 * provided by this function is okay, the function wait_for_random_bytes()
 * should be called and return 0 at least once at any point prior.
 */

#define DEFINE_BATCHED_ENTROPY(type)                                                \
struct batch_ ##type {                                                                \
        /*                                                                        \
         * We make this 1.5x a ChaCha block, so that we get the                        \
         * remaining 32 bytes from fast key erasure, plus one full                \
         * block from the detached ChaCha state. We can increase                \
         * the size of this later if needed so long as we keep the                \
         * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE.                \
         */                                                                        \
        type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];                \
        local_lock_t lock;                                                        \
        unsigned long generation;                                                \
        unsigned int position;                                                        \
};                                                                                \
                                                                                \
static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {        \
        .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock),                        \
        .position = UINT_MAX                                                        \
};                                                                                \
                                                                                \
type get_random_ ##type(void)                                                        \
{                                                                                \
        type ret;                                                                \
        unsigned long flags;                                                        \
        struct batch_ ##type *batch;                                                \
        unsigned long next_gen;                                                        \
                                                                                \
        warn_unseeded_randomness();                                                \
                                                                                \
        if  (!crng_ready()) {                                                        \
                _get_random_bytes(&ret, sizeof(ret));                                \
                return ret;                                                        \
        }                                                                        \
                                                                                \
        local_lock_irqsave(&batched_entropy_ ##type.lock, flags);                \
        batch = raw_cpu_ptr(&batched_entropy_##type);                                \
                                                                                \
        next_gen = READ_ONCE(base_crng.generation);                                \
        if (batch->position >= ARRAY_SIZE(batch->entropy) ||                        \
            next_gen != batch->generation) {                                        \
                _get_random_bytes(batch->entropy, sizeof(batch->entropy));        \
                batch->position = 0;                                                \
                batch->generation = next_gen;                                        \
        }                                                                        \
                                                                                \
        ret = batch->entropy[batch->position];                                        \
        batch->entropy[batch->position] = 0;                                        \
        ++batch->position;                                                        \
        local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);                \
        return ret;                                                                \
}                                                                                \
EXPORT_SYMBOL(get_random_ ##type);

DEFINE_BATCHED_ENTROPY(u64)
DEFINE_BATCHED_ENTROPY(u32)

#ifdef CONFIG_SMP
/*
 * CPU hotplug callback run while @cpu is coming up, with entry
 * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP.
 *
 * Always returns 0.
 */
int __cold random_prepare_cpu(unsigned int cpu)
{
        struct batch_u32 *batch32 = per_cpu_ptr(&batched_entropy_u32, cpu);
        struct batch_u64 *batch64 = per_cpu_ptr(&batched_entropy_u64, cpu);

        /*
         * Invalidate the per-cpu crng and both batches so that a CPU
         * coming back online serves fresh randomness straight away.
         */
        per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX;
        batch32->position = UINT_MAX;
        batch64->position = UINT_MAX;

        return 0;
}
#endif

/*
 * Fill @buf with up to @len bytes taken from the architecture-specific
 * hardware random number generator, if one is available. Its use is
 * not recommended; use get_random_bytes() instead.
 *
 * Returns the number of bytes actually filled in, which is less than
 * @len if arch_get_random_long() fails part-way through.
 */
size_t __must_check get_random_bytes_arch(void *buf, size_t len)
{
        u8 *dst = buf;
        size_t done = 0;

        while (done < len) {
                unsigned long word;
                size_t chunk = min_t(size_t, len - done, sizeof(word));

                if (!arch_get_random_long(&word))
                        break;

                /* The last chunk may use only part of the word. */
                memcpy(dst + done, &word, chunk);
                done += chunk;
        }

        return done;
}
EXPORT_SYMBOL(get_random_bytes_arch);


/**********************************************************************
 *
 * Entropy accumulation and extraction routines.
 *
 * Callers may add entropy via:
 *
 *     static void mix_pool_bytes(const void *buf, size_t len)
 *
 * After which, if added entropy should be credited:
 *
 *     static void credit_init_bits(size_t bits)
 *
 * Finally, extract entropy via:
 *
 *     static void extract_entropy(void *buf, size_t len)
 *
 **********************************************************************/

/*
 * Thresholds for input_pool.init_bits, all expressed in bits.
 * _credit_init_bits() saturates the counter at POOL_BITS and acts when it
 * crosses POOL_EARLY_BITS and POOL_READY_BITS.
 */
enum {
        POOL_BITS = BLAKE2S_HASH_SIZE * 8,
        POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */
        POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */
};

/*
 * The input pool: a BLAKE2s hash state that accumulates mixed-in bytes,
 * the spinlock taken around updates to it, and init_bits, the count of
 * credited bits maintained by _credit_init_bits().
 */
static struct {
        struct blake2s_state hash;
        spinlock_t lock;
        unsigned int init_bits;
} input_pool = {
        /*
         * NOTE(review): presumably the standard unkeyed BLAKE2s initial
         * state for a BLAKE2S_HASH_SIZE-byte digest; confirm against the
         * BLAKE2s parameter block definition.
         */
        .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE),
                    BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4,
                    BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 },
        .hash.outlen = BLAKE2S_HASH_SIZE,
        .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

/*
 * Feed @len bytes from @buf into the input pool's BLAKE2s state.
 *
 * This helper takes no lock itself. The callers visible in this file
 * either hold input_pool.lock around it or, in random_init(), call it
 * bare during early boot.
 */
static void _mix_pool_bytes(const void *buf, size_t len)
{
        blake2s_update(&input_pool.hash, buf, len);
}

/*
 * Mix @len bytes from @buf into the input pool while holding
 * input_pool.lock with interrupts disabled.
 *
 * The initialization bit counter is left untouched; a caller that wants
 * its input credited has to call credit_init_bits() itself.
 */
static void mix_pool_bytes(const void *buf, size_t len)
{
        unsigned long irqflags;

        spin_lock_irqsave(&input_pool.lock, irqflags);
        blake2s_update(&input_pool.hash, buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}

/*
 * This is an HKDF-like construction for using the hashed collected entropy
 * as a PRF key, that's then expanded block-by-block.
 *
 * Writes @len bytes of output to @buf. As a side effect the input pool's
 * hash state is finalized and re-initialized with a freshly derived key,
 * all under input_pool.lock.
 */
static void extract_entropy(void *buf, size_t len)
{
        unsigned long flags;
        u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE];
        struct {
                unsigned long rdseed[32 / sizeof(long)];
                size_t counter;
        } block;
        size_t i;

        /*
         * Fill the 32-byte rdseed area one long at a time, preferring
         * arch_get_random_seed_long(), then arch_get_random_long(), and
         * falling back to random_get_entropy() when both fail.
         */
        for (i = 0; i < ARRAY_SIZE(block.rdseed); ++i) {
                if (!arch_get_random_seed_long(&block.rdseed[i]) &&
                    !arch_get_random_long(&block.rdseed[i]))
                        block.rdseed[i] = random_get_entropy();
        }

        spin_lock_irqsave(&input_pool.lock, flags);

        /* seed = HASHPRF(last_key, entropy_input) */
        blake2s_final(&input_pool.hash, seed);

        /* next_key = HASHPRF(seed, RDSEED || 0) */
        block.counter = 0;
        blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
        /* Restart the pool hash keyed with next_key. */
        blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));

        spin_unlock_irqrestore(&input_pool.lock, flags);
        /* The pool now holds the key; wipe the stack copy. */
        memzero_explicit(next_key, sizeof(next_key));

        /* Emit the output in chunks of at most BLAKE2S_HASH_SIZE bytes. */
        while (len) {
                i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
                /* output = HASHPRF(seed, RDSEED || ++counter) */
                ++block.counter;
                blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
                len -= i;
                buf += i; /* arithmetic on void * (GNU C extension) */
        }

        /* Wipe the remaining secret material from the stack. */
        memzero_explicit(seed, sizeof(seed));
        memzero_explicit(&block, sizeof(block));
}

/*
 * Credit @bits to the input pool's initialization counter, but only while
 * the crng is not yet ready; once it is, the call does nothing and @bits
 * is not evaluated.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement. As a bare "if", an "else" written after a call would
 * silently attach to the macro's hidden "if" instead of the caller's.
 */
#define credit_init_bits(bits)                                  \
        do {                                                    \
                if (!crng_ready())                              \
                        _credit_init_bits(bits);                \
        } while (0)

/*
 * Add @bits to input_pool.init_bits, saturating at POOL_BITS, and act on
 * the two thresholds the counter can cross:
 *
 *  - reaching POOL_READY_BITS: reseed through crng_reseed(), process the
 *    random-ready list, wake waiters on crng_init_wait, send SIGIO and
 *    log that initialization is done;
 *  - reaching POOL_EARLY_BITS while still below POOL_READY_BITS: key
 *    base_crng straight from extract_entropy() and move to CRNG_EARLY.
 *
 * A @bits of zero is a no-op. The callers visible in this file check
 * crng_ready() first, either through the credit_init_bits() macro or
 * explicitly.
 */
static void __cold _credit_init_bits(size_t bits)
{
        unsigned int new, orig, add;
        unsigned long flags;

        if (!bits)
                return;

        /* Clamp first so that orig + add below stays small. */
        add = min_t(size_t, bits, POOL_BITS);

        /* Saturating add; retry if init_bits changed under us. */
        do {
                orig = READ_ONCE(input_pool.init_bits);
                new = min_t(unsigned int, POOL_BITS, orig + add);
        } while (cmpxchg(&input_pool.init_bits, orig, new) != orig);

        if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) {
                crng_reseed(); /* Sets crng_init to CRNG_READY under base_crng.lock. */
                process_random_ready_list();
                wake_up_interruptible(&crng_init_wait);
                kill_fasync(&fasync, SIGIO, POLL_IN);
                pr_notice("crng init done\n");
                if (urandom_warning.missed)
                        pr_notice("%d urandom warning(s) missed due to ratelimiting\n",
                                  urandom_warning.missed);
        } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) {
                spin_lock_irqsave(&base_crng.lock, flags);
                /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */
                if (crng_init == CRNG_EMPTY) {
                        extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_init = CRNG_EARLY;
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
        }
}


/**********************************************************************
 *
 * Entropy collection routines.
 *
 * The following exported functions are used for pushing entropy into
 * the above entropy accumulation routines:
 *
 *        void add_device_randomness(const void *buf, size_t len);
 *        void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy);
 *        void add_bootloader_randomness(const void *buf, size_t len);
 *        void add_interrupt_randomness(int irq);
 *        void add_input_randomness(unsigned int type, unsigned int code, unsigned int value);
 *        void add_disk_randomness(struct gendisk *disk);
 *
 * add_device_randomness() adds data to the input pool that
 * is likely to differ between two devices (or possibly even per boot).
 * This would be things like MAC addresses or serial numbers, or the
 * read-out of the RTC. This does *not* credit any actual entropy to
 * the pool, but it initializes the pool to different values for devices
 * that might otherwise be identical and have very little entropy
 * available to them (particularly common in the embedded world).
 *
 * add_hwgenerator_randomness() is for true hardware RNGs, and will credit
 * entropy as specified by the caller. If the entropy pool is full it will
 * block until more entropy is needed.
 *
 * add_bootloader_randomness() is called by bootloader drivers, such as EFI
 * and device tree, and credits its input depending on whether or not the
 * configuration option CONFIG_RANDOM_TRUST_BOOTLOADER is set.
 *
 * add_interrupt_randomness() uses the interrupt timing as random
 * inputs to the entropy pool. Using the cycle counters and the irq source
 * as inputs, it feeds the input pool roughly once a second or after 1024
 * interrupts, whichever comes first, crediting roughly 1 bit of entropy
 * per 64 interrupts accumulated (and always at least 1 bit).
 *
 * add_input_randomness() uses the input layer interrupt timing, as well
 * as the event type information from the hardware.
 *
 * add_disk_randomness() uses what amounts to the seek time of block
 * layer request events, on a per-disk_devt basis, as input to the
 * entropy pool. Note that high-speed solid state drives with very low
 * seek times do not make for good sources of entropy, as their seek
 * times are usually fairly consistent.
 *
 * The last two routines try to estimate how many bits of entropy
 * to credit. They do this by keeping track of the first and second
 * order deltas of the event timings.
 *
 **********************************************************************/

/*
 * Whether to credit entropy obtained from the CPU's RNG (consulted in
 * random_init()) and from the bootloader-provided seed (consulted in
 * add_bootloader_randomness()). The defaults come from the corresponding
 * Kconfig options and can be overridden with the early parameters
 * registered below.
 */
static bool trust_cpu __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU);
static bool trust_bootloader __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER);
/* Parse "random.trust_cpu="; returns whatever kstrtobool() returns. */
static int __init parse_trust_cpu(char *arg)
{
        return kstrtobool(arg, &trust_cpu);
}
/* Parse "random.trust_bootloader="; returns whatever kstrtobool() returns. */
static int __init parse_trust_bootloader(char *arg)
{
        return kstrtobool(arg, &trust_bootloader);
}
early_param("random.trust_cpu", parse_trust_cpu);
early_param("random.trust_bootloader", parse_trust_bootloader);

/*
 * First collection of entropy, performed at system boot while interrupts
 * are still turned off. Mixed into the pool, in this order: the latent
 * entropy seed (if the plugin is enabled), one BLAKE2s block's worth of
 * RDSEED/RDRAND output (or cycle counter reads where those fail), a
 * timestamp, utsname(), the kernel command line and add_latent_entropy().
 *
 * If trust_cpu is set, the bytes that really came from the architecture
 * RNG are credited, so RDSEED alone may be enough for initialization.
 * Much earlier setup may already have pushed entropy into the input pool
 * by the time this runs; if the crng is already ready it is reseeded.
 *
 * Always returns 0.
 */
int __init random_init(const char *command_line)
{
        ktime_t now = ktime_get_real();
        unsigned int i, arch_bytes;
        unsigned long entropy;

#if defined(LATENT_ENTROPY_PLUGIN)
        static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
        _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
#endif

        /* Assume every word comes from the arch RNG until one does not. */
        arch_bytes = BLAKE2S_BLOCK_SIZE;
        for (i = 0; i < BLAKE2S_BLOCK_SIZE; i += sizeof(entropy)) {
                bool from_arch = arch_get_random_seed_long_early(&entropy) ||
                                 arch_get_random_long_early(&entropy);

                if (!from_arch) {
                        entropy = random_get_entropy();
                        arch_bytes -= sizeof(entropy);
                }
                _mix_pool_bytes(&entropy, sizeof(entropy));
        }

        _mix_pool_bytes(&now, sizeof(now));
        _mix_pool_bytes(utsname(), sizeof(*(utsname())));
        _mix_pool_bytes(command_line, strlen(command_line));
        add_latent_entropy();

        if (crng_ready()) {
                crng_reseed();
        } else if (trust_cpu) {
                credit_init_bits(arch_bytes * 8);
        }

        return 0;
}

/*
 * Mix device- or boot-specific data (@len bytes at @buf), together with
 * a cycle counter sample, into the input pool to help initialize it.
 *
 * Nothing is credited: the point is only to keep largely identical
 * devices from starting out with similar pool state.
 */
void add_device_randomness(const void *buf, size_t len)
{
        unsigned long irqflags;
        unsigned long stamp;

        /* Sample the counter before contending for the pool lock. */
        stamp = random_get_entropy();

        spin_lock_irqsave(&input_pool.lock, irqflags);
        _mix_pool_bytes(&stamp, sizeof(stamp));
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}
EXPORT_SYMBOL(add_device_randomness);

/*
 * Interface for in-kernel drivers of true hardware RNGs: mix @len bytes
 * from @buf into the pool and credit @entropy bits.
 *
 * Such devices may produce endless random bits, so once the crng is
 * ready the caller is put to sleep for CRNG_RESEED_INTERVAL.
 */
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy)
{
        mix_pool_bytes(buf, len);
        credit_init_bits(entropy);

        /*
         * No throttling while we are still uninitialized, nor when the
         * calling kthread has been asked to stop.
         */
        if (kthread_should_stop() || !crng_ready())
                return;

        schedule_timeout_interruptible(CRNG_RESEED_INTERVAL);
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);

/*
 * Mix in a random seed handed over by the bootloader (@len bytes at
 * @buf). It is credited, at 8 bits per byte, only when trust_bootloader
 * is set.
 */
void __cold add_bootloader_randomness(const void *buf, size_t len)
{
        mix_pool_bytes(buf, len);

        if (!trust_bootloader)
                return;

        credit_init_bits(len * 8);
}
EXPORT_SYMBOL_GPL(add_bootloader_randomness);

/* Per-CPU accumulator for interrupt timing data (see irq_randomness). */
struct fast_pool {
        unsigned long pool[4];  /* mixing state updated by fast_mix() */
        unsigned long last;     /* jiffies when the pool was last flushed */
        unsigned int count;     /* events since last flush; bit 31 is MIX_INFLIGHT */
        struct timer_list mix;  /* timer that runs mix_interrupt_randomness() */
};

/* Forward declaration: referenced by the timer initializer below. */
static void mix_interrupt_randomness(struct timer_list *work);

/*
 * Per-CPU fast pool. The initial state and the permutation used by
 * fast_mix() are the SipHash ones on 64-bit kernels and the HalfSipHash
 * ones otherwise. FASTMIX_PERM is defined here as a side effect.
 */
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
        .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 },
#else
#define FASTMIX_PERM HSIPHASH_PERMUTATION
        .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 },
#endif
        .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0)
};

/*
 * [Half]SipHash-1-x starting from an empty key. With a fixed key this
 * offers no security on its own and assumes non-malicious inputs.
 *
 * @s is the four-word SipHash state; @v1 and @v2 are the two input
 * words, absorbed one after the other with one permutation round each.
 */
static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2)
{
        const unsigned long words[2] = { v1, v2 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(words); ++i) {
                s[3] ^= words[i];
                FASTMIX_PERM(s[0], s[1], s[2], s[3]);
                s[0] ^= words[i];
        }
}

#ifdef CONFIG_SMP
/*
 * CPU hotplug callback run when @cpu has just come online, with entry
 * CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
 *
 * Always returns 0.
 */
int __cold random_online_cpu(unsigned int cpu)
{
        struct fast_pool *pool = per_cpu_ptr(&irq_randomness, cpu);

        /*
         * While the CPU was going down or not yet online,
         * add_interrupt_randomness() may have scheduled
         * mix_interrupt_randomness() and set MIX_INFLIGHT in ->count.
         * If that deferred run ended up on a different CPU, the flag is
         * never cleared. Zeroing ->count here drops the stale flag and
         * also restarts the irq count, so newly accumulated irqs are
         * fresh.
         */
        pool->count = 0;

        return 0;
}
#endif

/*
 * Timer callback that flushes a CPU's fast pool into the input pool and
 * credits the interrupts counted since the previous flush.
 */
static void mix_interrupt_randomness(struct timer_list *work)
{
        struct fast_pool *fp = container_of(work, struct fast_pool, mix);
        /*
         * Deliberately only 2 longs: just half of the siphash state is
         * ingested each time, and the other half stays behind as the
         * next "key". The entropy is supposed to be dispersed well
         * enough between bits that on average none of it is "lost".
         */
        unsigned long snapshot[2];
        unsigned int events, bits;

        /* Hotplug may have landed us on a CPU that does not own @fp. */
        local_irq_disable();
        if (fp != this_cpu_ptr(&irq_randomness)) {
                local_irq_enable();
                return;
        }

        /*
         * Take a stack copy with irqs off so that the mixer below works
         * on a consistent view after irqs are enabled again.
         */
        memcpy(snapshot, fp->pool, sizeof(snapshot));
        events = fp->count;
        fp->count = 0;
        fp->last = jiffies;
        local_irq_enable();

        mix_pool_bytes(snapshot, sizeof(snapshot));

        /* One bit per 64 events, at least 1, at most the snapshot size. */
        bits = clamp_t(unsigned int, (events & U16_MAX) / 64, 1, sizeof(snapshot) * 8);
        credit_init_bits(bits);

        memzero_explicit(snapshot, sizeof(snapshot));
}

/*
 * Mix the timing of interrupt @irq into this CPU's fast pool and, once
 * enough events have accumulated or enough time has passed, arm the
 * per-CPU timer that flushes the fast pool into the input pool.
 */
void add_interrupt_randomness(int irq)
{
        enum { MIX_INFLIGHT = 1U << 31 };
        unsigned long cycles = random_get_entropy();
        struct fast_pool *fp = this_cpu_ptr(&irq_randomness);
        struct pt_regs *regs = get_irq_regs();
        unsigned long where;
        unsigned int seen;

        /* Interrupted instruction pointer, or our caller if unavailable. */
        where = regs ? instruction_pointer(regs) : _RET_IP_;
        fast_mix(fp->pool, cycles, where ^ swab(irq));
        seen = ++fp->count;

        /* A flush has already been requested. */
        if (seen & MIX_INFLIGHT)
                return;

        /* Too few events and less than HZ jiffies since the last flush. */
        if (seen < 1024 && !time_is_before_jiffies(fp->last + HZ))
                return;

        fp->count |= MIX_INFLIGHT;
        if (timer_pending(&fp->mix))
                return;

        fp->mix.expires = jiffies;
        add_timer_on(&fp->mix, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);

/*
 * There is one of these per entropy source. It holds the history that
 * add_timer_randomness() needs to compute successive timing deltas.
 */
struct timer_rand_state {
        unsigned long last_time;        /* jiffies at the previous event */
        long last_delta, last_delta2;   /* previous first- and second-order deltas */
};

/*
 * Add entropy to the entropy "pool" from the timing of an event.
 *
 * @state carries the timing history of this entropy source and is used
 * to estimate how many bits of entropy the event contributed. @num is
 * mixed in as well and should somehow describe the kind of event that
 * just happened.
 */
static void add_timer_randomness(struct timer_rand_state *state, unsigned int num)
{
        unsigned long cycles = random_get_entropy(), stamp = jiffies, irqflags;
        long d1, d2, d3, smallest;
        unsigned int bits;

        /*
         * Outside hard IRQ context, mix straight into the input pool. In
         * a hard IRQ, add_interrupt_randomness() will be called sometime
         * after, so the sample goes into the fast pool instead.
         */
        if (!in_irq()) {
                spin_lock_irqsave(&input_pool.lock, irqflags);
                _mix_pool_bytes(&cycles, sizeof(cycles));
                _mix_pool_bytes(&num, sizeof(num));
                spin_unlock_irqrestore(&input_pool.lock, irqflags);
        } else {
                fast_mix(this_cpu_ptr(&irq_randomness)->pool, cycles, num);
        }

        /* Estimating and crediting only matter before the crng is ready. */
        if (crng_ready())
                return;

        /*
         * Estimate the number of bits of randomness we probably added,
         * from the first-, second- and third-order deltas of the event
         * times.
         */
        d1 = stamp - READ_ONCE(state->last_time);
        WRITE_ONCE(state->last_time, stamp);

        d2 = d1 - READ_ONCE(state->last_delta);
        WRITE_ONCE(state->last_delta, d1);

        d3 = d2 - READ_ONCE(state->last_delta2);
        WRITE_ONCE(state->last_delta2, d2);

        /* Take magnitudes, then the smallest of the three. */
        d1 = d1 < 0 ? -d1 : d1;
        d2 = d2 < 0 ? -d2 : d2;
        d3 = d3 < 0 ? -d3 : d3;

        smallest = d1;
        if (d2 < smallest)
                smallest = d2;
        if (d3 < smallest)
                smallest = d3;

        /*
         * Round the minimum absolute delta down by 1 bit on general
         * principles, and limit the entropy estimate to 11 bits.
         */
        bits = min(fls(smallest >> 1), 11);

        /*
         * In a hard IRQ, add_interrupt_randomness() runs after this and
         * credits 1 bit per 64 interrupts. To keep its accounting close
         * to ours, count a full 64 per estimated bit and subtract the
         * one interrupt it will add itself.
         */
        if (in_irq())
                this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1;
        else
                _credit_init_bits(bits);
}

/*
 * Feed the timing of an input-layer event into the RNG, along with a
 * small value derived from @type, @code and @value.
 *
 * An event whose @value equals that of the previous accepted event is
 * ignored, which filters out autorepeat and the like.
 */
void add_input_randomness(unsigned int type, unsigned int code, unsigned int value)
{
        /*
         * Kept at the full width of @value. With a narrower type the
         * comparison below ran against a truncated copy, so a repeated
         * value above 255 was never recognised as a repeat, while two
         * different values sharing the same low byte were wrongly
         * treated as one.
         */
        static unsigned int last_value;
        static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES };

        /* Ignore autorepeat and the like. */
        if (value == last_value)
                return;

        last_value = value;
        add_timer_randomness(&input_timer_state,
                             (type << 4) ^ code ^ (code >> 4) ^ value);
}
EXPORT_SYMBOL_GPL(add_input_randomness);

#ifdef CONFIG_BLOCK
/*
 * Feed the timing of a block-layer event on @disk into the RNG. Does
 * nothing when @disk is NULL or has no timer state attached.
 */
void add_disk_randomness(struct gendisk *disk)
{
        if (!disk)
                return;
        if (!disk->random)
                return;

        /* First major is 1, so we get >= 0x200 here. */
        add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);

/*
 * Allocate the per-disk timer state used by add_disk_randomness() and
 * attach it to @disk.
 */
void __cold rand_initialize_disk(struct gendisk *disk)
{
        struct timer_rand_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

        /* Without memory we simply won't use that entropy source. */
        if (!state)
                return;

        state->last_time = INITIAL_JIFFIES;
        disk->random = state;
}
#endif

/*
 * Each time the timer fires, we expect that we got an unpredictable
 * jump in the cycle counter. Even if the timer is running on another
 * CPU, the timer activity will be touching the stack of the CPU that is
 * generating entropy.
 *
 * Note that we don't re-arm the timer in the timer itself - we are
 * happy to be scheduled away, since that just makes the load more
 * complex, but we do not want the timer to keep ticking unless the
 * entropy loop is running.
 *
 * So the re-arming always happens in the entropy loop itself.
 *
 * The callback credits a single bit per expiry; @t is unused.
 */
static void __cold entropy_timer(struct timer_list *t)
{
        credit_init_bits(1);
}

/*
 * If we have an actual cycle counter, see if we can
 * generate enough entropy with timing noise
 *
 * Loops until the crng is ready or a signal is pending. Each iteration
 * (re)arms a one-jiffy on-stack timer whose callback credits one bit,
 * mixes the most recent cycle counter sample into the pool and calls
 * schedule().
 */
static void __cold try_to_generate_entropy(void)
{
        struct {
                unsigned long entropy;
                struct timer_list timer;
        } stack;

        stack.entropy = random_get_entropy();

        /* Slow counter - or none. Don't even bother */
        if (stack.entropy == random_get_entropy())
                return;

        timer_setup_on_stack(&stack.timer, entropy_timer, 0);
        while (!crng_ready() && !signal_pending(current)) {
                /* Re-arm here, never from the timer callback itself. */
                if (!timer_pending(&stack.timer))
                        mod_timer(&stack.timer, jiffies + 1);
                mix_pool_bytes(&stack.entropy, sizeof(stack.entropy));
                schedule();
                stack.entropy = random_get_entropy();
        }

        /* The timer lives on this stack frame: stop it before returning. */
        del_timer_sync(&stack.timer);
        destroy_timer_on_stack(&stack.timer);
        /* Mix in the final sample taken by the loop (or the initial one). */
        mix_pool_bytes(&stack.entropy, sizeof(stack.entropy));
}


/**********************************************************************
 *
 * Userspace reader/writer interfaces.
 *
 * getrandom(2) is the primary modern interface into the RNG and should
 * be used in preference to anything else.
 *
 * Reading from /dev/random has the same functionality as calling
 * getrandom(2) with flags=0. In earlier versions, however, it had
 * vastly different semantics and should therefore be avoided, to
 * prevent backwards compatibility issues.
 *
 * Reading from /dev/urandom has the same functionality as calling
 * getrandom(2) with flags=GRND_INSECURE. Because it does not block
 * waiting for the RNG to be ready, it should not be used.
 *
 * Writing to either /dev/random or /dev/urandom adds entropy to
 * the input pool but does not credit it.
 *
 * Polling on /dev/random indicates when the RNG is initialized, on
 * the read side, and when it wants new entropy, on the write side.
 *
 * Both /dev/random and /dev/urandom have the same set of ioctls for
 * adding entropy, getting the entropy count, zeroing the count, and
 * reseeding the crng.
 *
 **********************************************************************/

/*
 * getrandom(2): fill @len bytes at @ubuf with random data.
 *
 * Returns -EINVAL for unknown or contradictory @flags, -EAGAIN when the
 * crng is not ready and GRND_NONBLOCK was given, any error reported by
 * wait_for_random_bytes() or import_single_range(), and otherwise the
 * result of get_random_bytes_user().
 */
SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
{
        const unsigned int known_flags = GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE;
        const unsigned int exclusive = GRND_INSECURE | GRND_RANDOM;
        struct iov_iter iter;
        struct iovec iov;
        int err;

        if (flags & ~known_flags)
                return -EINVAL;

        /*
         * Asking for insecure and blocking randomness at the same time
         * makes no sense.
         */
        if ((flags & exclusive) == exclusive)
                return -EINVAL;

        /* Unless GRND_INSECURE was given, the crng must be ready first. */
        if (!crng_ready() && !(flags & GRND_INSECURE)) {
                if (flags & GRND_NONBLOCK)
                        return -EAGAIN;
                err = wait_for_random_bytes();
                if (unlikely(err))
                        return err;
        }

        err = import_single_range(READ, ubuf, len, &iov, &iter);
        if (unlikely(err))
                return err;

        return get_random_bytes_user(&iter);
}

/*
 * poll() handler: registers on crng_init_wait, then reports the device
 * as readable once the crng is ready and as writable until then.
 */
static __poll_t random_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &crng_init_wait, wait);

        if (crng_ready())
                return EPOLLIN | EPOLLRDNORM;

        return EPOLLOUT | EPOLLWRNORM;
}

/*
 * Mix everything readable from the user-supplied @iter into the input
 * pool, one BLAKE2s block at a time. Nothing is credited.
 *
 * Returns 0 when @iter is empty, otherwise the number of bytes consumed,
 * or -EFAULT if not a single byte could be read.
 */
static ssize_t write_pool_user(struct iov_iter *iter)
{
        u8 chunk[BLAKE2S_BLOCK_SIZE];
        ssize_t total = 0;
        size_t got;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /* The page-boundary check below needs whole blocks per page. */
        BUILD_BUG_ON(PAGE_SIZE % sizeof(chunk) != 0);

        do {
                got = copy_from_iter(chunk, sizeof(chunk), iter);
                total += got;
                mix_pool_bytes(chunk, got);
                /* Stop on a short copy or once the iterator is drained. */
                if (got != sizeof(chunk) || !iov_iter_count(iter))
                        break;

                /* Signals and rescheduling are only looked at once per page. */
                if (total % PAGE_SIZE != 0)
                        continue;
                if (signal_pending(current))
                        break;
                cond_resched();
        } while (1);

        memzero_explicit(chunk, sizeof(chunk));
        return total ? total : -EFAULT;
}

/*
 * write_iter handler: hands @iter to write_pool_user() and returns its
 * result. @kiocb is not used.
 */
static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        return write_pool_user(iter);
}

static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        static int maxwarn = 10;

        if (!crng_ready()) {
                if (!ratelimit_disable && maxwarn <= 0)
                        ++urandom_warning.missed;
                else if (ratelimit_disable || __ratelimit(&urandom_warning)) {
                        --maxwarn;
                        pr_notice("%s: uninitialized urandom read (%zu bytes read)\n",
                                  current->comm, iov_iter_count(iter));
                }
        }

        return get_random_bytes_user(iter);
}

static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        int ret;

        if (!crng_ready() &&
            ((kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) ||
             (kiocb->ki_filp->f_flags & O_NONBLOCK)))
                return -EAGAIN;

        ret = wait_for_random_bytes();
        if (ret != 0)
                return ret;
        return get_random_bytes_user(iter);
}

/*
 * ioctl handler shared by random_fops and urandom_fops.
 *
 * @f:   file the ioctl was issued on (not used here)
 * @cmd: one of the RND* commands handled below
 * @arg: user-space pointer, interpreted as a pointer to int(s)
 *
 * Every command except RNDGETENTCNT requires CAP_SYS_ADMIN and fails
 * with -EPERM otherwise. Unknown commands return -EINVAL; faults while
 * accessing user memory return -EFAULT. Returns 0 on success.
 */
static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        int ent_count;

        switch (cmd) {
        case RNDGETENTCNT:
                /* Inherently racy, no point locking. */
                if (put_user(input_pool.init_bits, p))
                        return -EFAULT;
                return 0;
        case RNDADDTOENTCNT:
                /* Credit a caller-supplied, non-negative number of bits. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                credit_init_bits(ent_count);
                return 0;
        case RNDADDENTROPY: {
                /*
                 * User layout at @arg: int ent_count, int len, then the
                 * data bytes; p is advanced past each int as it is read.
                 */
                struct iov_iter iter;
                struct iovec iov;
                ssize_t ret;
                int len;

                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p++))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                /*
                 * NOTE(review): len is a signed int read from user space
                 * and is not range-checked here; a negative value is
                 * passed straight on to import_single_range() - confirm
                 * that helper's handling.
                 */
                if (get_user(len, p++))
                        return -EFAULT;
                ret = import_single_range(WRITE, p, len, &iov, &iter);
                if (unlikely(ret))
                        return ret;
                ret = write_pool_user(&iter);
                if (unlikely(ret < 0))
                        return ret;
                /* Since we're crediting, enforce that it was all written into the pool. */
                if (unlikely(ret != len))
                        return -EFAULT;
                credit_init_bits(ent_count);
                return 0;
        }
        case RNDZAPENTCNT:
        case RNDCLEARPOOL:
                /* No longer has any effect. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return 0;
        case RNDRESEEDCRNG:
                /* Force a reseed; only possible once the CRNG is ready. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (!crng_ready())
                        return -ENODATA;
                crng_reseed();
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * ->fasync handler used by both random_fops and urandom_fops; forwards
 * its arguments to fasync_helper() together with the file-level 'fasync'
 * list and returns that helper's result.
 */
static int random_fasync(int fd, struct file *filp, int on)
{
        int rc = fasync_helper(fd, filp, on, &fasync);

        return rc;
}

/*
 * File operations whose reads go through random_read_iter(), i.e. wait
 * for (or, for non-blocking callers, refuse with -EAGAIN until) the CRNG
 * is ready. This is the only one of the two tables in this file that
 * provides ->poll.
 * NOTE(review): presumably backs /dev/random - confirm where registered.
 */
const struct file_operations random_fops = {
        .read_iter = random_read_iter,
        .write_iter = random_write_iter,
        .poll = random_poll,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = generic_file_splice_read,
        .splice_write = iter_file_splice_write,
};

/*
 * File operations whose reads go through urandom_read_iter(), which never
 * waits for the CRNG. Identical to random_fops except for ->read_iter and
 * the absence of ->poll.
 * NOTE(review): presumably backs /dev/urandom - confirm where registered.
 */
const struct file_operations urandom_fops = {
        .read_iter = urandom_read_iter,
        .write_iter = random_write_iter,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = generic_file_splice_read,
        .splice_write = iter_file_splice_write,
};


/********************************************************************
 *
 * Sysctl interface.
 *
 * These are partly unused legacy knobs with dummy values to not break
 * userspace and partly still useful things. They are usually accessible
 * in /proc/sys/kernel/random/ and are as follows:
 *
 * - boot_id - a UUID representing the current boot.
 *
 * - uuid - a random UUID, different each time the file is read.
 *
 * - poolsize - the number of bits of entropy that the input pool can
 *   hold, tied to the POOL_BITS constant.
 *
 * - entropy_avail - the number of bits of entropy currently in the
 *   input pool. Always <= poolsize.
 *
 * - write_wakeup_threshold - the amount of entropy in the input pool
 *   below which write polls to /dev/random will unblock, requesting
 *   more entropy, tied to the POOL_READY_BITS constant. It is writable
 *   to avoid breaking old userspaces, but writing to it does not
 *   change any behavior of the RNG.
 *
 * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL.
 *   It is writable to avoid breaking old userspaces, but writing
 *   to it does not change any behavior of the RNG.
 *
 ********************************************************************/

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Backing value of "urandom_min_reseed_secs"; exposed through proc_do_rointvec. */
static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ;
/* Backing value of "write_wakeup_threshold"; exposed through proc_do_rointvec. */
static int sysctl_random_write_wakeup_bits = POOL_READY_BITS;
/* Backing value of the read-only "poolsize" entry. */
static int sysctl_poolsize = POOL_BITS;
/* Storage for the "boot_id" UUID; filled in on first read by proc_do_uuid(). */
static u8 sysctl_bootid[UUID_SIZE];

/*
 * sysctl handler behind both the "boot_id" and the "uuid" entries.
 *
 * When table->data is NULL a brand-new UUID is generated for every read.
 * Otherwise table->data is the persistent UUID buffer, which is filled in
 * (under a spinlock) the first time it is read and reused afterwards.
 * The UUID is formatted with "%pU" and handed to proc_dostring().
 *
 * Writes are rejected with -EPERM.
 */
static int proc_do_uuid(struct ctl_table *table, int write, void *buf,
                        size_t *lenp, loff_t *ppos)
{
        static DEFINE_SPINLOCK(bootid_spinlock);
        char text[UUID_STRING_LEN + 1];
        struct ctl_table string_table = {
                .data = text,
                .maxlen = UUID_STRING_LEN
        };
        u8 fresh_uuid[UUID_SIZE];
        u8 *uuid;

        if (write)
                return -EPERM;

        uuid = table->data;
        if (uuid) {
                /*
                 * Byte 8 doubles as the "already generated" marker.
                 * NOTE(review): presumably generate_random_uuid() never
                 * leaves this byte zero - confirm.
                 */
                spin_lock(&bootid_spinlock);
                if (!uuid[8])
                        generate_random_uuid(uuid);
                spin_unlock(&bootid_spinlock);
        } else {
                uuid = fresh_uuid;
                generate_random_uuid(uuid);
        }

        snprintf(text, sizeof(text), "%pU", uuid);
        return proc_dostring(&string_table, 0, buf, lenp, ppos);
}

/*
 * Like proc_dointvec() for reads; writes are accepted (return 0) but
 * silently discarded, so the underlying value never changes.
 */
static int proc_do_rointvec(struct ctl_table *table, int write, void *buf,
                            size_t *lenp, loff_t *ppos)
{
        if (write)
                return 0;

        return proc_dointvec(table, 0, buf, lenp, ppos);
}

/*
 * sysctl entries of the random driver, terminated by an empty entry.
 * The two 0644 entries use proc_do_rointvec, so writes to them succeed
 * without changing anything. "boot_id" and "uuid" share proc_do_uuid;
 * "uuid" leaves .data unset, which makes that handler generate a fresh
 * UUID on each read.
 */
extern struct ctl_table random_table[];
struct ctl_table random_table[] = {
        {
                .procname        = "poolsize",
                .data                = &sysctl_poolsize,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "entropy_avail",
                .data                = &input_pool.init_bits,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "write_wakeup_threshold",
                .data                = &sysctl_random_write_wakeup_bits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                .procname        = "urandom_min_reseed_secs",
                .data                = &sysctl_random_min_urandom_seed,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                .procname        = "boot_id",
                .data                = &sysctl_bootid,
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        {
                .procname        = "uuid",
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        { }
};
#endif        /* CONFIG_SYSCTL */




























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_GENERIC_NETLINK_H
#define __NET_GENERIC_NETLINK_H

#include <linux/genetlink.h>
#include <net/netlink.h>
#include <net/net_namespace.h>

/* NLMSG_DEFAULT_SIZE less the space taken by the generic netlink header. */
#define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN)

/**
 * struct genl_multicast_group - generic netlink multicast group
 * @name: name of the multicast group, names are per-family
 * @flags: group flags (NOTE(review): the set of valid flag values is not
 *        visible in this header - confirm and document)
 * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding
 */
struct genl_multicast_group {
        char                        name[GENL_NAMSIZ];
        u8                        flags;
        u8                        cap_sys_admin:1;
};

struct genl_ops;
struct genl_info;

/**
 * struct genl_family - generic netlink family
 * @id: protocol family identifier (private)
 * @hdrsize: length of user specific header in bytes
 * @name: name of family
 * @version: protocol version
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy
 * @netnsok: set to true if the family can handle network
 *        namespaces and should be presented in all of them
 * @parallel_ops: operations can be called in parallel and aren't
 *        synchronized by the core genetlink code
 * @pre_doit: called before an operation's doit callback, it may
 *        do additional, common, filtering and return an error
 * @post_doit: called after an operation's doit callback, it may
 *        undo operations done by pre_doit, for example release locks
 * @mcgrps: multicast groups used by this family
 * @n_mcgrps: number of multicast groups
 * @mcgrp_offset: starting number of multicast group IDs in this family
 *        (private)
 * @ops: the operations supported by this family
 * @n_ops: number of operations supported by this family
 * @small_ops: the small-struct operations supported by this family
 * @n_small_ops: number of small-struct operations supported by this family
 * @module: module pointer stored with the family (NOTE(review):
 *        presumably the module that owns the family - confirm)
 *
 * Note that @n_ops, @n_small_ops and @n_mcgrps are u8, which bounds each
 * of those counts to 255.
 */
struct genl_family {
        int                        id;                /* private */
        unsigned int                hdrsize;
        char                        name[GENL_NAMSIZ];
        unsigned int                version;
        unsigned int                maxattr;
        unsigned int                mcgrp_offset;        /* private */
        u8                        netnsok:1;
        u8                        parallel_ops:1;
        u8                        n_ops;
        u8                        n_small_ops;
        u8                        n_mcgrps;
        const struct nla_policy *policy;
        int                        (*pre_doit)(const struct genl_ops *ops,
                                            struct sk_buff *skb,
                                            struct genl_info *info);
        void                        (*post_doit)(const struct genl_ops *ops,
                                             struct sk_buff *skb,
                                             struct genl_info *info);
        const struct genl_ops *        ops;
        const struct genl_small_ops *small_ops;
        const struct genl_multicast_group *mcgrps;
        struct module                *module;
};

/**
 * struct genl_info - receiving information
 * @snd_seq: sending sequence number
 * @snd_portid: netlink portid of sender
 * @nlhdr: netlink message header
 * @genlhdr: generic netlink message header
 * @userhdr: user specific header
 * @attrs: netlink attributes
 * @_net: network namespace; access it through genl_info_net() and
 *        genl_info_net_set() rather than directly
 * @user_ptr: user pointers (two untyped slots)
 * @extack: extended ACK report struct
 */
struct genl_info {
        u32                        snd_seq;
        u32                        snd_portid;
        struct nlmsghdr *        nlhdr;
        struct genlmsghdr *        genlhdr;
        void *                        userhdr;
        struct nlattr **        attrs;
        possible_net_t                _net;
        void *                        user_ptr[2];
        struct netlink_ext_ack *extack;
};

static inline struct net *genl_info_net(struct genl_info *info)
{
        return read_pnet(&info->_net);
}

static inline void genl_info_net_set(struct genl_info *info, struct net *net)
{
        write_pnet(&info->_net, net);
}

/* NL_SET_ERR_MSG() applied to the extack of a struct genl_info pointer. */
#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg)

/*
 * Bit flags for the 'validate' member of struct genl_ops and
 * struct genl_small_ops. Each flag switches a validation step off.
 * NOTE(review): the exact checks each bit disables are implemented
 * outside this header - confirm before relying on the names alone.
 */
enum genl_validate_flags {
        GENL_DONT_VALIDATE_STRICT                = BIT(0),
        GENL_DONT_VALIDATE_DUMP                        = BIT(1),
        GENL_DONT_VALIDATE_DUMP_STRICT                = BIT(2),
};

/**
 * struct genl_small_ops - generic netlink operations (small version)
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: flags
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @dumpit: callback for dumpers
 *
 * This is a cut-down version of struct genl_ops for users who don't need
 * most of the ancillary infra and want to save space. Compared with
 * struct genl_ops it has no start/done dump callbacks and no per-operation
 * policy/maxattr.
 */
struct genl_small_ops {
        int        (*doit)(struct sk_buff *skb, struct genl_info *info);
        int        (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb);
        u8        cmd;
        u8        internal_flags;
        u8        flags;
        u8        validate;
};

/**
 * struct genl_ops - generic netlink operations
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: flags
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy (takes precedence over family policy)
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 *
 * struct genl_small_ops is the reduced variant of this structure; it
 * keeps @doit, @dumpit, @cmd, @internal_flags, @flags and @validate only.
 */
struct genl_ops {
        int                       (*doit)(struct sk_buff *skb,
                                       struct genl_info *info);
        int                       (*start)(struct netlink_callback *cb);
        int                       (*dumpit)(struct sk_buff *skb,
                                         struct netlink_callback *cb);
        int                       (*done)(struct netlink_callback *cb);
        const struct nla_policy *policy;
        unsigned int                maxattr;
        u8                        cmd;
        u8                        internal_flags;
        u8                        flags;
        u8                        validate;
};

/**
 * struct genl_dumpit_info - info that is available during dumpit op call
 * @family: generic netlink family - for internal genl code usage
 * @op: generic netlink ops (stored by value) - for internal genl code usage
 * @attrs: netlink attributes
 */
struct genl_dumpit_info {
        const struct genl_family *family;
        struct genl_ops op;
        struct nlattr **attrs;
};

/**
 * genl_dumpit_info - dump information attached to a netlink callback
 * @cb: netlink callback structure
 *
 * Returns @cb->data interpreted as a struct genl_dumpit_info pointer.
 */
static inline const struct genl_dumpit_info *
genl_dumpit_info(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *info = cb->data;

        return info;
}

int genl_register_family(struct genl_family *family);
int genl_unregister_family(const struct genl_family *family);
void genl_notify(const struct genl_family *family, struct sk_buff *skb,
                 struct genl_info *info, u32 group, gfp_t flags);

void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                  const struct genl_family *family, int flags, u8 cmd);

/**
 * genlmsg_nlhdr - Obtain netlink header from user specified header
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * The netlink header sits GENL_HDRLEN + NLMSG_HDRLEN bytes in front of
 * the user header.
 *
 * Returns pointer to netlink header.
 */
static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
{
        char *pos = user_hdr;

        pos = pos - GENL_HDRLEN - NLMSG_HDRLEN;
        return (struct nlmsghdr *)pos;
}

/**
 * genlmsg_parse_deprecated - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Same as genlmsg_parse() but validates with %NL_VALIDATE_LIBERAL.
 * Returns the result of __nlmsg_parse().
 */
static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
                                           const struct genl_family *family,
                                           struct nlattr *tb[], int maxtype,
                                           const struct nla_policy *policy,
                                           struct netlink_ext_ack *extack)
{
        /* Attributes start after the genetlink and family-specific headers. */
        const int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_LIBERAL, extack);
}

/**
 * genlmsg_parse - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Validates with %NL_VALIDATE_STRICT.
 * Returns the result of __nlmsg_parse().
 */
static inline int genlmsg_parse(const struct nlmsghdr *nlh,
                                const struct genl_family *family,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                struct netlink_ext_ack *extack)
{
        /* Attributes start after the genetlink and family-specific headers. */
        const int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_STRICT, extack);
}

/**
 * genl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * Generic netlink convenience wrapper around nl_dump_check_consistent():
 * the netlink header that function needs is derived from @user_hdr.
 */
static inline void genl_dump_check_consistent(struct netlink_callback *cb,
                                              void *user_hdr)
{
        struct nlmsghdr *nlh = genlmsg_nlhdr(user_hdr);

        nl_dump_check_consistent(cb, nlh);
}

/**
 * genlmsg_put_reply - Add generic netlink header to a reply message
 * @skb: socket buffer holding the message
 * @info: receiver info
 * @family: generic netlink family
 * @flags: netlink message flags
 * @cmd: generic netlink command
 *
 * Calls genlmsg_put() with the sender's portid and sequence number taken
 * from @info.
 *
 * Returns pointer to user specific header
 */
static inline void *genlmsg_put_reply(struct sk_buff *skb,
                                      struct genl_info *info,
                                      const struct genl_family *family,
                                      int flags, u8 cmd)
{
        const u32 portid = info->snd_portid;
        const u32 seq = info->snd_seq;

        return genlmsg_put(skb, portid, seq, family, flags, cmd);
}

/**
 * genlmsg_end - Finalize a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header
 *
 * Steps back from @hdr to the enclosing netlink header and passes that
 * to nlmsg_end().
 */
static inline void genlmsg_end(struct sk_buff *skb, void *hdr)
{
        struct nlmsghdr *nlh = genlmsg_nlhdr(hdr);

        nlmsg_end(skb, nlh);
}

/**
 * genlmsg_cancel - Cancel construction of a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header (the same pointer genlmsg_end() takes);
 *        may be NULL, in which case nothing is done
 */
static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
{
        if (!hdr)
                return;

        nlmsg_cancel(skb, genlmsg_nlhdr(hdr));
}

/**
 * genlmsg_multicast_netns - multicast a netlink message to a specific netns
 * @family: the generic netlink family
 * @net: the net namespace
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Returns -EINVAL (after a one-time warning) if @group is not below the
 * family's number of multicast groups, otherwise the result of
 * nlmsg_multicast().
 */
static inline int genlmsg_multicast_netns(const struct genl_family *family,
                                          struct net *net, struct sk_buff *skb,
                                          u32 portid, unsigned int group, gfp_t flags)
{
        unsigned int nl_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        /* Translate the family-relative offset into the real group ID. */
        nl_group = family->mcgrp_offset + group;
        return nlmsg_multicast(net->genl_sock, skb, portid, nl_group, flags);
}

/**
 * genlmsg_multicast - multicast a netlink message to the default netns
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Shorthand for genlmsg_multicast_netns() on &init_net.
 */
static inline int genlmsg_multicast(const struct genl_family *family,
                                    struct sk_buff *skb, u32 portid,
                                    unsigned int group, gfp_t flags)
{
        struct net *default_net = &init_net;

        return genlmsg_multicast_netns(family, default_net, skb, portid,
                                       group, flags);
}

/**
 * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 *
 * The caller must hold the RTNL or rcu_read_lock().
 */
int genlmsg_multicast_allns(const struct genl_family *family,
                            struct sk_buff *skb, u32 portid,
                            unsigned int group);

/**
 * genlmsg_unicast - unicast a netlink message
 * @net: network namespace whose generic netlink socket is used
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Returns the result of nlmsg_unicast().
 */
static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid)
{
        struct sock *genl_sk = net->genl_sock;

        return nlmsg_unicast(genl_sk, skb, portid);
}

/**
 * genlmsg_reply - reply to a request
 * @skb: netlink message to be sent back
 * @info: receiver information
 *
 * Unicasts @skb to the sender recorded in @info, within the network
 * namespace recorded in @info.
 */
static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = genl_info_net(info);

        return genlmsg_unicast(net, skb, info->snd_portid);
}

/**
 * genlmsg_data - head of message payload
 * @gnlh: genetlink message header
 *
 * Returns a pointer to the byte located GENL_HDRLEN bytes after @gnlh.
 */
static inline void *genlmsg_data(const struct genlmsghdr *gnlh)
{
        unsigned char *base = (unsigned char *) gnlh;

        return base + GENL_HDRLEN;
}

/**
 * genlmsg_len - length of message payload
 * @gnlh: genetlink message header
 *
 * The netlink header is located NLMSG_HDRLEN bytes in front of @gnlh;
 * its nlmsg_len minus both header lengths is the payload length.
 */
static inline int genlmsg_len(const struct genlmsghdr *gnlh)
{
        unsigned char *genl_start = (unsigned char *)gnlh;
        struct nlmsghdr *nlh;

        nlh = (struct nlmsghdr *)(genl_start - NLMSG_HDRLEN);
        return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN);
}

/**
 * genlmsg_msg_size - length of genetlink message not including padding
 * @payload: length of message payload
 *
 * Returns @payload plus the generic netlink header length.
 */
static inline int genlmsg_msg_size(int payload)
{
        int size = GENL_HDRLEN + payload;

        return size;
}

/**
 * genlmsg_total_size - length of genetlink message including padding
 * @payload: length of message payload
 *
 * Returns genlmsg_msg_size(@payload) rounded up by NLMSG_ALIGN().
 */
static inline int genlmsg_total_size(int payload)
{
        int unpadded = genlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * genlmsg_new - Allocate a new generic netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Returns the result of nlmsg_new() for a message of
 * genlmsg_total_size(@payload) bytes.
 */
static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
{
        int total = genlmsg_total_size(payload);

        return nlmsg_new(total, flags);
}

/**
 * genl_set_err - report error to genetlink broadcast listeners
 * @family: the generic netlink family
 * @net: the network namespace to report the error to
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 *         (this is the offset of the multicast group in the groups array)
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_RECV_NO_ENOBUFS socket option, or -EINVAL (after a one-time
 * warning) if @group is not below the family's number of multicast groups.
 */
static inline int genl_set_err(const struct genl_family *family,
                               struct net *net, u32 portid,
                               u32 group, int code)
{
        u32 nl_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        /* Translate the family-relative offset into the real group ID. */
        nl_group = family->mcgrp_offset + group;
        return netlink_set_err(net->genl_sock, portid, nl_group, code);
}

/**
 * genl_has_listeners - query listeners of a family's multicast group
 * @family: the generic netlink family
 * @net: the network namespace to look in
 * @group: offset of multicast group in groups array
 *
 * Returns -EINVAL (after a one-time warning) if @group is not below the
 * family's number of multicast groups, otherwise the result of
 * netlink_has_listeners() for the corresponding group ID.
 */
static inline int genl_has_listeners(const struct genl_family *family,
                                     struct net *net, unsigned int group)
{
        unsigned int nl_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        /* Translate the family-relative offset into the real group ID. */
        nl_group = family->mcgrp_offset + group;
        return netlink_has_listeners(net->genl_sock, nl_group);
}
#endif        /* __NET_GENERIC_NETLINK_H */
























































































































    3 

    2 


    4 
    1 










































    4 













































































































































    4 










    4 
    4 











    1 























    3 


























    2 











    2 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/buffer_head.h
 *
 * Everything to do with buffer_heads.
 */

#ifndef _LINUX_BUFFER_HEAD_H
#define _LINUX_BUFFER_HEAD_H

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/atomic.h>

#ifdef CONFIG_BLOCK

/*
 * Bit numbers within buffer_head.b_state. The accessor functions for
 * these bits are generated by the BUFFER_FNS()/TAS_BUFFER_FNS() macros.
 */
enum bh_state_bits {
        BH_Uptodate,        /* Contains valid data */
        BH_Dirty,        /* Is dirty */
        BH_Lock,        /* Is locked */
        BH_Req,                /* Has been submitted for I/O */

        BH_Mapped,        /* Has a disk mapping */
        BH_New,                /* Disk mapping was newly created by get_block */
        BH_Async_Read,        /* Is under end_buffer_async_read I/O */
        BH_Async_Write,        /* Is under end_buffer_async_write I/O */
        BH_Delay,        /* Buffer is not yet allocated on disk */
        BH_Boundary,        /* Block is followed by a discontiguity */
        BH_Write_EIO,        /* I/O error on write */
        BH_Unwritten,        /* Buffer is allocated on disk but not written */
        BH_Quiet,        /* Buffer error printks are to be kept quiet */
        BH_Meta,        /* Buffer contains metadata */
        BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
        BH_Defer_Completion, /* Defer AIO completion to workqueue */

        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
                         */
};

/*
 * Number of 512-byte units in a page.
 * NOTE(review): presumably 512 is the smallest supported buffer size,
 * making this the upper bound on buffers per page - confirm.
 */
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)

struct page;
struct buffer_head;
struct address_space;
/* Signature of a buffer_head's b_end_io I/O completion callback. */
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);

/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a page (via a page_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 *
 * The bits stored in b_state are numbered by enum bh_state_bits.
 */
struct buffer_head {
        unsigned long b_state;                /* buffer state bitmap (see above) */
        struct buffer_head *b_this_page;/* circular list of page's buffers */
        struct page *b_page;                /* the page this bh is mapped to */

        sector_t b_blocknr;                /* start block number */
        size_t b_size;                        /* size of mapping */
        char *b_data;                        /* pointer to data within the page */

        struct block_device *b_bdev;        /* NOTE(review): presumably the device
                                         * this buffer maps to - confirm */
        bh_end_io_t *b_end_io;                /* I/O completion */
         void *b_private;                /* reserved for b_end_io */
        struct list_head b_assoc_buffers; /* associated with another mapping */
        struct address_space *b_assoc_map;        /* mapping this buffer is
                                                   associated with */
        atomic_t b_count;                /* users using this buffer_head */
        spinlock_t b_uptodate_lock;        /* Used by the first bh in a page, to
                                         * serialise IO completion of other
                                         * buffers in the page */
};

/*
 * Macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions for the state bit BH_<bit>.
 *
 * set_buffer_foo() tests the flag first and only calls set_bit() when it
 * is not already set: re-setting a flag that is already set would cause
 * a costly cache line transition.
 */
#define BUFFER_FNS(bit, name)                                                \
static __always_inline void set_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        if (!test_bit(BH_##bit, &(bh)->b_state))                        \
                set_bit(BH_##bit, &(bh)->b_state);                        \
}                                                                        \
static __always_inline void clear_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        clear_bit(BH_##bit, &(bh)->b_state);                                \
}                                                                        \
static __always_inline int buffer_##name(const struct buffer_head *bh)        \
{                                                                        \
        return test_bit(BH_##bit, &(bh)->b_state);                        \
}

/*
 * TAS_BUFFER_FNS(bit, name) expands to test_set_buffer_<name>() and
 * test_clear_buffer_<name>(), which return the result of
 * test_and_set_bit() / test_and_clear_bit() on the BH_<bit> flag in
 * bh->b_state.
 *
 * Note: the last line of the macro ends in a continuation backslash, so the
 * line following the macro must stay blank.
 */
#define TAS_BUFFER_FNS(bit, name)                                        \
static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_set_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \
static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_clear_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \

/*
 * Emit the buffer bitops functions.   Note that there are also functions
 * of the form "mark_buffer_foo()".  These are higher-level functions which
 * do something in addition to setting a b_state bit.
 *
 * Only Dirty and Req also get the test_set_/test_clear_ variants.
 * BH_Uptodate is not instantiated here: its helpers are written out by hand
 * further down in this file because they add explicit memory ordering.
 */
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * Mark @bh uptodate.  The flag is only written when it is not already set,
 * and the write is preceded by smp_mb__before_atomic().
 */
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
        /*
         * Nothing to do when the bit is already set: whoever set it has
         * already issued the memory barrier, so a reader will see *some*
         * valid buffer state.
         *
         * Serialization against anything that might clear the bit (IO
         * errors or whatever) must come from other state (eg BH_Lock).
         */
        if (!test_bit(BH_Uptodate, &bh->b_state)) {
                /*
                 * Kept consistent with folio_mark_uptodate; this barrier
                 * pairs with the smp_load_acquire in buffer_uptodate.
                 */
                smp_mb__before_atomic();
                set_bit(BH_Uptodate, &bh->b_state);
        }
}

/*
 * Clear BH_Uptodate on @bh.  Unlike set_buffer_uptodate(), no explicit
 * barrier is issued here; this is a plain clear_bit().
 */
static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
{
        clear_bit(BH_Uptodate, &bh->b_state);
}

/*
 * Test BH_Uptodate on @bh.  Returns 1 when the flag is set, 0 otherwise.
 */
static __always_inline int buffer_uptodate(const struct buffer_head *bh)
{
        /*
         * Acquire-load of the whole state word, kept consistent with
         * folio_test_uptodate; pairs with the smp_mb__before_atomic in
         * set_buffer_uptodate.
         */
        unsigned long state = smp_load_acquire(&bh->b_state);

        return (state & (1UL << BH_Uptodate)) != 0;
}

/*
 * bh_offset(): the bits of bh->b_data selected by ~PAGE_MASK, as an
 * unsigned long -- presumably the byte offset of the buffer's data within
 * its page (b_data is documented as a pointer to data within the page).
 */
#define bh_offset(bh)                ((unsigned long)(bh)->b_data & ~PAGE_MASK)

/*
 * page_buffers(): page_private(page) cast to struct buffer_head *.
 * Use only if we *know* page->private refers to buffer_heads; the macro
 * hits BUG_ON() when PagePrivate(page) is not set.  @page is evaluated
 * more than once.
 *
 * page_has_buffers(): just PagePrivate(page).
 */
#define page_buffers(page)                                        \
        ({                                                        \
                BUG_ON(!PagePrivate(page));                        \
                ((struct buffer_head *)page_private(page));        \
        })
#define page_has_buffers(page)        PagePrivate(page)

void buffer_check_dirty_writeback(struct page *page,
                                     bool *dirty, bool *writeback);

/*
 * Declarations
 */

void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset);
int try_to_free_buffers(struct page *);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                bool retry);
void create_empty_buffers(struct page *, unsigned long,
                        unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
void end_buffer_async_write(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int inode_has_buffers(struct inode *);
void invalidate_inode_buffers(struct inode *);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
void clean_bdev_aliases(struct block_device *bdev, sector_t block,
                        sector_t len);
/*
 * Single-block convenience wrapper: calls clean_bdev_aliases() for exactly
 * one block, the one described by @bh (bh->b_bdev, bh->b_blocknr).
 */
static inline void clean_bdev_bh_alias(struct buffer_head *bh)
{
        clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
}

void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
                        unsigned size);
struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
                                  unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
                  gfp_t gfp);
struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
void ll_rw_block(int, int, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, int op_flags);
void write_dirty_buffer(struct buffer_head *bh, int op_flags);
int submit_bh(int, int, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int bh_submit_read(struct buffer_head *bh);

extern int buffer_heads_over_limit;

/*
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 */
void block_invalidatepage(struct page *page, unsigned int offset,
                          unsigned int length);
int block_write_full_page(struct page *page, get_block_t *get_block,
                                struct writeback_control *wbc);
int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler);
int block_read_full_page(struct page*, get_block_t*);
int block_is_partially_uptodate(struct page *page, unsigned long from,
                                unsigned long count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                unsigned flags, struct page **pagep, get_block_t *get_block);
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block);
int block_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int generic_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
void clean_page_buffers(struct page *page);
int cont_write_begin(struct file *, struct address_space *, loff_t,
                        unsigned, unsigned, struct page **, void **,
                        get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
int block_commit_write(struct page *page, unsigned from, unsigned to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                                get_block_t get_block);
/*
 * Convert errno to return value from ->page_mkwrite() call:
 *
 *   0                  -> VM_FAULT_LOCKED
 *   -EFAULT, -EAGAIN   -> VM_FAULT_NOPAGE
 *   -ENOMEM            -> VM_FAULT_OOM
 *   anything else      -> VM_FAULT_SIGBUS
 */
static inline vm_fault_t block_page_mkwrite_return(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
                                struct page **, void **, get_block_t*);
int nobh_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
int nobh_writepage(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc);

void buffer_init(void);

/*
 * inline definitions
 */

/* Take a reference on @bh: atomically increments bh->b_count. */
static inline void get_bh(struct buffer_head *bh)
{
        atomic_inc(&bh->b_count);
}

/*
 * Drop a reference on @bh: atomically decrements bh->b_count.  Nothing is
 * freed or checked here; the count is only decremented.
 */
static inline void put_bh(struct buffer_head *bh)
{
        /*
         * NOTE(review): presumably orders the caller's earlier accesses to
         * the buffer before the reference is dropped -- confirm.
         */
        smp_mb__before_atomic();
        atomic_dec(&bh->b_count);
}

/* NULL-tolerant wrapper around __brelse(): a NULL @bh is silently ignored. */
static inline void brelse(struct buffer_head *bh)
{
        if (!bh)
                return;

        __brelse(bh);
}

/* NULL-tolerant wrapper around __bforget(): a NULL @bh is silently ignored. */
static inline void bforget(struct buffer_head *bh)
{
        if (!bh)
                return;

        __bforget(bh);
}

/*
 * Superblock flavour of __bread_gfp(): reads @block from sb->s_bdev using
 * sb->s_blocksize as the size, passing __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
        return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}

/*
 * Same as sb_bread() except that the gfp argument handed to __bread_gfp()
 * is 0 instead of __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_bread_unmovable(struct super_block *sb, sector_t block)
{
        return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}

/*
 * Superblock flavour of __breadahead(): issues it for @block on sb->s_bdev
 * with sb->s_blocksize as the size.  Nothing is returned to the caller.
 */
static inline void
sb_breadahead(struct super_block *sb, sector_t block)
{
        __breadahead(sb->s_bdev, block, sb->s_blocksize);
}

/*
 * Like sb_breadahead(), but goes through __breadahead_gfp() with a gfp
 * argument of 0.
 */
static inline void
sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
        __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}

/*
 * Superblock flavour of __getblk_gfp(): looks up @block on sb->s_bdev with
 * sb->s_blocksize as the size, passing __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
        return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}


/*
 * Like sb_getblk(), but the caller supplies the @gfp value that is passed
 * through unchanged to __getblk_gfp().
 */
static inline struct buffer_head *
sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
{
        return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
}

/*
 * Superblock flavour of __find_get_block(): uses sb->s_bdev as the device
 * and sb->s_blocksize as the size.
 */
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
        return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}

/*
 * Mark @bh as mapped and point it at @block on @sb's block device:
 * sets BH_Mapped, then fills in b_bdev, b_blocknr and b_size (the latter
 * from sb->s_blocksize).
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
        set_buffer_mapped(bh);
        bh->b_bdev = sb->s_bdev;
        bh->b_blocknr = block;
        bh->b_size = sb->s_blocksize;
}

/*
 * Wait on @bh if it is currently locked.  Annotated with might_sleep()
 * unconditionally; __wait_on_buffer() is only entered when BH_Lock is
 * observed set.
 */
static inline void wait_on_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (!buffer_locked(bh))
                return;

        __wait_on_buffer(bh);
}

/*
 * Try to take the buffer lock without blocking, by test-and-setting
 * BH_Lock.  Returns nonzero when test_and_set_bit_lock() reported the bit
 * as previously clear (i.e. we now hold the lock), 0 otherwise.  Success is
 * annotated as the likely outcome.
 */
static inline int trylock_buffer(struct buffer_head *bh)
{
        return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
}

/*
 * Lock @bh.  Tries the non-blocking trylock_buffer() first and only falls
 * back to __lock_buffer() when that fails.  Annotated with might_sleep().
 */
static inline void lock_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (trylock_buffer(bh))
                return;

        __lock_buffer(bh);
}

/* __getblk_gfp() with a gfp argument of 0 (no __GFP_MOVABLE). */
static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
                                                   sector_t block,
                                                   unsigned size)
{
        return __getblk_gfp(bdev, block, size, 0);
}

/* __getblk_gfp() with __GFP_MOVABLE as the gfp argument. */
static inline struct buffer_head *__getblk(struct block_device *bdev,
                                           sector_t block,
                                           unsigned size)
{
        return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}

/**
 *  __bread() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
 *  @block: number of block
 *  @size: size (in bytes) to read
 *
 *  Reads a specified block, and returns buffer head that contains it.
 *  The page cache is allocated from movable area so that it can be migrated.
 *  It returns NULL if the block was unreadable.
 *
 *  This is __bread_gfp() called with __GFP_MOVABLE as the gfp argument.
 */
static inline struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
        return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}

extern int __set_page_dirty_buffers(struct page *page);

#else /* CONFIG_BLOCK */

/*
 * !CONFIG_BLOCK stand-ins: empty or constant-returning versions of the
 * entry points above, so callers build without the block layer.
 * try_to_free_buffers() and remove_inode_buffers() report 1,
 * inode_has_buffers() and sync_mapping_buffers() report 0, and
 * buffer_heads_over_limit is the constant 0.
 */
static inline void buffer_init(void) {}
static inline int try_to_free_buffers(struct page *page) { return 1; }
static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
#define buffer_heads_over_limit 0

#endif /* CONFIG_BLOCK */
#endif /* _LINUX_BUFFER_HEAD_H */





























































    1 

    1 



    1 


















    1 









    1 




    1 
    1 



    1 




    1 


    1 





    1 

    1 








    1 
    1 








































    1 





    1 














    1 


    1 






    1 




    1 





    1 
    1 







    1 





    1 




    1 






    1 

















    1 

    1 







    1 

    1 


    1 




    1 


    1 


    1 


    1 


    1 








































































































    1 
    1 













    1 



























































































































































    1 





    1 



    1 











































































    1 


    1 











    1 





















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/seq_file.c
 *
 * helper functions for making synthetic files from sequences of records.
 * initial implementation -- AV, Oct 2001.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/string_helpers.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <asm/page.h>

/* Cache that struct seq_file is allocated from in seq_open() and freed to in seq_release(). */
static struct kmem_cache *seq_file_cache __ro_after_init;

/*
 * Flag @m as overflowed by making count equal to size, i.e. by declaring
 * the buffer completely full.
 */
static void seq_set_overflow(struct seq_file *m)
{
        m->count = m->size;
}

/*
 * Allocate a seq_file output buffer of @size bytes with kvmalloc()
 * (GFP_KERNEL_ACCOUNT).  Requests larger than MAX_RW_COUNT are refused
 * up front.  Returns NULL on refusal or allocation failure.
 */
static void *seq_buf_alloc(unsigned long size)
{
        return unlikely(size > MAX_RW_COUNT)
                ? NULL
                : kvmalloc(size, GFP_KERNEL_ACCOUNT);
}

/**
 *        seq_open -        initialize sequential file
 *        @file: file we initialize
 *        @op: method table describing the sequence
 *
 *        seq_open() sets @file, associating it with a sequence described
 *        by @op.  @op->start() sets the iterator up and returns the first
 *        element of sequence. @op->stop() shuts it down.  @op->next()
 *        returns the next element of sequence.  @op->show() prints element
 *        into the buffer.  In case of error ->start() and ->next() return
 *        ERR_PTR(error).  At the end of the sequence they return %NULL. ->show()
 *        returns 0 in case of success and negative number in case of error.
 *        Returning SEQ_SKIP means "discard this element and move on".
 *        Note: seq_open() will allocate a struct seq_file and store its
 *        pointer in @file->private_data. This pointer should not be modified.
 */
int seq_open(struct file *file, const struct seq_operations *op)
{
        struct seq_file *p;

        WARN_ON(file->private_data);

        p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        file->private_data = p;

        mutex_init(&p->lock);
        p->op = op;

        // No refcounting: the lifetime of 'p' is constrained
        // to the lifetime of the file.
        p->file = file;

        /*
         * seq_files support lseek() and pread().  They do not implement
         * write() at all, but we clear FMODE_PWRITE here for historical
         * reasons.
         *
         * If a client of seq_files a) implements file.write() and b) wishes to
         * support pwrite() then that client will need to implement its own
         * file.open() which calls seq_open() and then sets FMODE_PWRITE.
         */
        file->f_mode &= ~FMODE_PWRITE;
        return 0;
}
EXPORT_SYMBOL(seq_open);

/*
 * Reposition @m so that its buffer state corresponds to byte @offset of the
 * generated output.
 *
 * The iterator is restarted from index 0 and every record is rendered into
 * m->buf with ->show(), accumulating the rendered lengths in 'pos' until
 * @offset is reached.  If @offset falls inside a record, that record is left
 * in the buffer with m->from / m->count describing the part at and after
 * @offset.
 *
 * Returns 0 on success (including when the sequence ends before @offset is
 * reached), a negative error from ->start()/->next()/->show(), -ENOMEM if a
 * buffer cannot be allocated, or -EAGAIN after the buffer has been replaced
 * by one twice the size because a record did not fit; callers loop while
 * -EAGAIN is returned.
 */
static int traverse(struct seq_file *m, loff_t offset)
{
        loff_t pos = 0;
        int error = 0;
        void *p;

        /* always start over from the first record with an empty buffer */
        m->index = 0;
        m->count = m->from = 0;
        if (!offset)
                return 0;

        if (!m->buf) {
                m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
                if (!m->buf)
                        return -ENOMEM;
        }
        p = m->op->start(m, &m->index);
        while (p) {
                /* p may be an ERR_PTR; error is overwritten below if not */
                error = PTR_ERR(p);
                if (IS_ERR(p))
                        break;
                error = m->op->show(m, p);
                if (error < 0)
                        break;
                /* positive return from ->show(): drop this record's output */
                if (unlikely(error)) {
                        error = 0;
                        m->count = 0;
                }
                if (seq_has_overflowed(m))
                        goto Eoverflow;
                p = m->op->next(m, p, &m->index);
                /* target offset lies inside the record just rendered */
                if (pos + m->count > offset) {
                        m->from = offset - pos;
                        m->count -= m->from;
                        break;
                }
                pos += m->count;
                m->count = 0;
                /* target offset is exactly at a record boundary */
                if (pos == offset)
                        break;
        }
        m->op->stop(m, p);
        return error;

Eoverflow:
        /* record did not fit: double the buffer and ask the caller to retry */
        m->op->stop(m, p);
        kvfree(m->buf);
        m->count = 0;
        m->buf = seq_buf_alloc(m->size <<= 1);
        return !m->buf ? -ENOMEM : -EAGAIN;
}

/**
 *        seq_read -        ->read() method for sequential files.
 *        @file: the file to read from
 *        @buf: the buffer to read to
 *        @size: the maximum number of bytes to read
 *        @ppos: the current position in the file
 *
 *        Ready-made ->f_op->read()
 */
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
        /* describe the user buffer as a one-segment iovec */
        struct iovec vec = { .iov_base = buf, .iov_len = size};
        struct iov_iter to;
        struct kiocb kio;
        ssize_t nread;

        init_sync_kiocb(&kio, file);
        iov_iter_init(&to, READ, &vec, 1, size);

        /* run seq_read_iter() at *ppos and hand the updated position back */
        kio.ki_pos = *ppos;
        nread = seq_read_iter(&kio, &to);
        *ppos = kio.ki_pos;

        return nread;
}
EXPORT_SYMBOL(seq_read);

/*
 * Ready-made ->f_op->read_iter()
 *
 * Copies rendered records from the seq_file attached to iocb->ki_filp into
 * @iter, under m->lock.  State kept in the seq_file between calls:
 *   m->buf / m->size  - output buffer and its capacity
 *   m->from, m->count - start and length of the not-yet-copied data in buf
 *   m->index          - iterator position handed to ->start()/->next()
 *   m->read_pos       - file position the above state corresponds to
 *
 * Returns the number of bytes copied and advances iocb->ki_pos by that
 * amount.  When nothing was copied the result is -EFAULT if data was pending
 * in the buffer, otherwise the last error (0 at end of sequence).
 */
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct seq_file *m = iocb->ki_filp->private_data;
        size_t copied = 0;
        size_t n;
        void *p;
        int err = 0;

        if (!iov_iter_count(iter))
                return 0;

        mutex_lock(&m->lock);

        /*
         * if request is to read from zero offset, reset iterator to first
         * record as it might have been already advanced by previous requests
         */
        if (iocb->ki_pos == 0) {
                m->index = 0;
                m->count = 0;
        }

        /* Don't assume ki_pos is where we left it */
        if (unlikely(iocb->ki_pos != m->read_pos)) {
                /* traverse() returns -EAGAIN each time it grew the buffer */
                while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
                        ;
                if (err) {
                        /* With prejudice... */
                        m->read_pos = 0;
                        m->index = 0;
                        m->count = 0;
                        goto Done;
                } else {
                        m->read_pos = iocb->ki_pos;
                }
        }

        /* grab buffer if we didn't have one */
        if (!m->buf) {
                m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
                if (!m->buf)
                        goto Enomem;
        }
        // something left in the buffer - copy it out first
        if (m->count) {
                n = copy_to_iter(m->buf + m->from, m->count, iter);
                m->count -= n;
                m->from += n;
                copied += n;
                if (m->count)        // hadn't managed to copy everything
                        goto Done;
        }
        // get a non-empty record in the buffer
        m->from = 0;
        p = m->op->start(m, &m->index);
        while (1) {
                err = PTR_ERR(p);
                if (!p || IS_ERR(p))        // EOF or an error
                        break;
                err = m->op->show(m, p);
                if (err < 0)                // hard error
                        break;
                if (unlikely(err))        // ->show() says "skip it"
                        m->count = 0;
                if (unlikely(!m->count)) { // empty record
                        p = m->op->next(m, p, &m->index);
                        continue;
                }
                if (!seq_has_overflowed(m)) // got it
                        goto Fill;
                // need a bigger buffer
                // (double it, then restart the iterator at the same index)
                m->op->stop(m, p);
                kvfree(m->buf);
                m->count = 0;
                m->buf = seq_buf_alloc(m->size <<= 1);
                if (!m->buf)
                        goto Enomem;
                p = m->op->start(m, &m->index);
        }
        // EOF or an error
        m->op->stop(m, p);
        m->count = 0;
        goto Done;
Fill:
        // one non-empty record is in the buffer; if they want more,
        // try to fit more in, but in any case we need to advance
        // the iterator once for every record shown.
        while (1) {
                // remember where this round started so that a skipped or
                // overflowing record can be rolled back
                size_t offs = m->count;
                loff_t pos = m->index;

                p = m->op->next(m, p, &m->index);
                if (pos == m->index) {
                        pr_info_ratelimited("buggy .next function %ps did not update position index\n",
                                            m->op->next);
                        m->index++;
                }
                if (!p || IS_ERR(p))        // no next record for us
                        break;
                // buffer already holds at least as much as was asked for
                if (m->count >= iov_iter_count(iter))
                        break;
                err = m->op->show(m, p);
                if (err > 0) {                // ->show() says "skip it"
                        m->count = offs;
                } else if (err || seq_has_overflowed(m)) {
                        // error or record did not fit: discard its output
                        m->count = offs;
                        break;
                }
        }
        m->op->stop(m, p);
        n = copy_to_iter(m->buf, m->count, iter);
        copied += n;
        m->count -= n;
        // whatever was not copied stays in the buffer, starting at offset n
        m->from = n;
Done:
        // copied is a size_t; a negative errno stored here reaches the
        // caller through the ssize_t return type
        if (unlikely(!copied)) {
                copied = m->count ? -EFAULT : err;
        } else {
                iocb->ki_pos += copied;
                m->read_pos += copied;
        }
        mutex_unlock(&m->lock);
        return copied;
Enomem:
        err = -ENOMEM;
        goto Done;
}
EXPORT_SYMBOL(seq_read_iter);

/**
 *        seq_lseek -        ->llseek() method for sequential files.
 *        @file: the file in question
 *        @offset: new position
 *        @whence: 0 for absolute, 1 for relative position
 *
 *        Ready-made ->f_op->llseek()
 */
loff_t seq_lseek(struct file *file, loff_t offset, int whence)
{
        struct seq_file *m = file->private_data;
        /* result for an unsupported @whence or a negative target offset */
        loff_t retval = -EINVAL;

        mutex_lock(&m->lock);
        switch (whence) {
        case SEEK_CUR:
                /* make the offset absolute, then handle it like SEEK_SET */
                offset += file->f_pos;
                fallthrough;
        case SEEK_SET:
                if (offset < 0)
                        break;
                retval = offset;
                if (offset != m->read_pos) {
                        /* traverse() returns -EAGAIN each time it grew the buffer */
                        while ((retval = traverse(m, offset)) == -EAGAIN)
                                ;
                        if (retval) {
                                /* with extreme prejudice... */
                                /* failed: rewind everything, return the error */
                                file->f_pos = 0;
                                m->read_pos = 0;
                                m->index = 0;
                                m->count = 0;
                        } else {
                                m->read_pos = offset;
                                retval = file->f_pos = offset;
                        }
                } else {
                        /* seq_file state already matches; only f_pos moves */
                        file->f_pos = offset;
                }
        }
        mutex_unlock(&m->lock);
        return retval;
}
EXPORT_SYMBOL(seq_lseek);

/**
 *        seq_release -        free the structures associated with sequential file.
 *        @file: file in question
 *        @inode: its inode
 *
 *        Frees the structures associated with sequential file; can be used
 *        as ->f_op->release() if you don't have private data to destroy.
 */
int seq_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        /* output buffer first, then the seq_file itself */
        kvfree(seq->buf);
        kmem_cache_free(seq_file_cache, seq);

        return 0;
}
EXPORT_SYMBOL(seq_release);

/**
 *        seq_escape -        print string into buffer, escaping some characters
 *        @m:        target buffer
 *        @s:        string
 *        @esc:        set of characters that need escaping
 *
 *        Puts string into buffer, replacing each occurrence of character from
 *        @esc with usual octal escape.
 *        Use seq_has_overflowed() to check for errors.
 */
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
        char *dst;
        size_t room = seq_get_buf(m, &dst);
        int written = string_escape_str(s, dst, room, ESCAPE_OCTAL, esc);

        /* commit the escaped length only if it fits; otherwise commit -1 */
        if (written < room)
                seq_commit(m, written);
        else
                seq_commit(m, -1);
}
EXPORT_SYMBOL(seq_escape);

/*
 * Escape @isz bytes at @src into the free space of @m's buffer using
 * string_escape_mem_ascii().  The result is committed only when it fits in
 * the free space; otherwise -1 is committed.
 */
void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
{
        char *dst;
        size_t room = seq_get_buf(m, &dst);
        int written = string_escape_mem_ascii(src, isz, dst, room);

        if (written < room)
                seq_commit(m, written);
        else
                seq_commit(m, -1);
}
EXPORT_SYMBOL(seq_escape_mem_ascii);

/*
 * Format into the free space of @m's buffer, va_list flavour.  The output
 * is accepted only if it leaves the buffer strictly less than full;
 * otherwise @m is marked as overflowed.
 */
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
        int written;

        /* buffer already full: nothing can be added */
        if (m->count >= m->size) {
                seq_set_overflow(m);
                return;
        }

        written = vsnprintf(m->buf + m->count, m->size - m->count, f, args);

        /* vsnprintf() reports the untruncated length; reject what doesn't fit */
        if (m->count + written >= m->size) {
                seq_set_overflow(m);
                return;
        }

        m->count += written;
}
EXPORT_SYMBOL(seq_vprintf);

/*
 * printf()-style front end for seq_vprintf(): collects the variadic
 * arguments and forwards them together with the format string @f.
 */
void seq_printf(struct seq_file *m, const char *f, ...)
{
        va_list ap;

        va_start(ap, f);
        seq_vprintf(m, f, ap);
        va_end(ap);
}
EXPORT_SYMBOL(seq_printf);

/**
 *        mangle_path -        mangle and copy path to buffer beginning
 *        @s: buffer start
 *        @p: beginning of path in above buffer
 *        @esc: set of characters that need escaping
 *
 *      Copies the NUL-terminated path at @p down to @s, replacing every
 *      character that appears in @esc with a backslash followed by its
 *      three-digit octal value.  Copying only continues while the write
 *      position has not passed the read position.
 *
 *      Returns pointer past last written character in @s, or NULL in case of
 *      failure.
 */
char *mangle_path(char *s, const char *p, const char *esc)
{
        char *out = s;
        const char *in = p;

        for (; out <= in; ) {
                const char ch = *in++;

                /* End of the path: report where the output stops. */
                if (ch == '\0')
                        return out;

                /* Ordinary character, copied through unchanged. */
                if (strchr(esc, ch) == NULL) {
                        *out++ = ch;
                        continue;
                }

                /* The four-byte escape would run past the read position. */
                if (out + 4 > in)
                        return NULL;

                *out++ = '\\';
                *out++ = '0' + ((ch & 0300) >> 6);
                *out++ = '0' + ((ch & 070) >> 3);
                *out++ = '0' + (ch & 07);
        }

        return NULL;
}
EXPORT_SYMBOL(mangle_path);

/**
 * seq_path - seq_file interface to print a pathname
 * @m: the seq_file handle
 * @path: the struct path to print
 * @esc: set of characters to escape in the output
 *
 * Builds the path of @path (dentry / mnt pair) with d_path() inside the
 * free part of the seq_file buffer, then moves it to the front of that
 * space with mangle_path(), escaping the characters in @esc.
 *
 * Returns the number of bytes produced, or -1 if there was no free space,
 * d_path() failed or mangle_path() failed.  The same value is passed to
 * seq_commit().
 */
int seq_path(struct seq_file *m, const struct path *path, const char *esc)
{
        char *buf;
        char *start;
        char *end;
        size_t size = seq_get_buf(m, &buf);
        int res = -1;

        if (!size)
                goto commit;

        start = d_path(path, buf, size);
        if (IS_ERR(start))
                goto commit;

        end = mangle_path(buf, start, esc);
        if (end)
                res = end - buf;

commit:
        seq_commit(m, res);
        return res;
}
EXPORT_SYMBOL(seq_path);

/**
 * seq_file_path - seq_file interface to print a pathname of a file
 * @m: the seq_file handle
 * @file: the struct file to print
 * @esc: set of characters to escape in the output
 *
 * Thin wrapper: hands @file's f_path to seq_path() and returns its result.
 */
int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
{
        const struct path *path = &file->f_path;

        return seq_path(m, path, esc);
}
EXPORT_SYMBOL(seq_file_path);

/*
 * Same as seq_path, but relative to supplied root.
 *
 * Returns SEQ_SKIP when __d_path() yields NULL (nothing is committed in
 * that case).  Otherwise the outcome is committed and the return value is
 * 0, except that an error other than -ENAMETOOLONG is passed back to the
 * caller.
 */
int seq_path_root(struct seq_file *m, const struct path *path,
                  const struct path *root, const char *esc)
{
        char *buf;
        size_t size = seq_get_buf(m, &buf);
        int res = -ENAMETOOLONG;

        if (size) {
                char *start = __d_path(path, root, buf, size);

                if (!start)
                        return SEQ_SKIP;

                if (IS_ERR(start)) {
                        res = PTR_ERR(start);
                } else {
                        char *end = mangle_path(buf, start, esc);

                        res = end ? end - buf : -ENAMETOOLONG;
                }
        }
        seq_commit(m, res);

        /* A committed length and -ENAMETOOLONG are both reported as 0. */
        if (res >= 0 || res == -ENAMETOOLONG)
                return 0;
        return res;
}

/*
 * returns the path of the 'dentry' from the root of its filesystem.
 *
 * The path is produced by dentry_path() in the free part of the buffer and
 * then escaped in place by mangle_path().  The return value is the number
 * of bytes produced, or -1 on any failure; it is also what gets committed.
 */
int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
{
        char *buf;
        char *start;
        char *end;
        size_t size = seq_get_buf(m, &buf);
        int res = -1;

        if (!size)
                goto commit;

        start = dentry_path(dentry, buf, size);
        if (IS_ERR(start))
                goto commit;

        end = mangle_path(buf, start, esc);
        if (end)
                res = end - buf;

commit:
        seq_commit(m, res);
        return res;
}
EXPORT_SYMBOL(seq_dentry);

/*
 * ->start() for single_open() files: yields one non-NULL token at
 * position 0 and NULL at every other position.
 *
 * SEQ_START_TOKEN is used instead of "NULL + 1": arithmetic on a null
 * void pointer is not valid ISO C and draws -Wnull-pointer-arithmetic.
 */
static void *single_start(struct seq_file *p, loff_t *pos)
{
        return *pos == 0 ? SEQ_START_TOKEN : NULL;
}

/*
 * ->next() for single_open() files: bumps the position and returns NULL,
 * i.e. there is never a second record.
 */
static void *single_next(struct seq_file *p, void *v, loff_t *pos)
{
        *pos += 1;

        return NULL;
}

/* ->stop() for single_open() files: intentionally does nothing. */
static void single_stop(struct seq_file *p, void *v)
{
}

int single_open(struct file *file, int (*show)(struct seq_file *, void *),
                void *data)
{
        struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
        int res = -ENOMEM;

        if (op) {
                op->start = single_start;
                op->next = single_next;
                op->stop = single_stop;
                op->show = show;
                res = seq_open(file, op);
                if (!res)
                        ((struct seq_file *)file->private_data)->private = data;
                else
                        kfree(op);
        }
        return res;
}
EXPORT_SYMBOL(single_open);

int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
                void *data, size_t size)
{
        char *buf = seq_buf_alloc(size);
        int ret;
        if (!buf)
                return -ENOMEM;
        ret = single_open(file, show, data);
        if (ret) {
                kvfree(buf);
                return ret;
        }
        ((struct seq_file *)file->private_data)->buf = buf;
        ((struct seq_file *)file->private_data)->size = size;
        return 0;
}
EXPORT_SYMBOL(single_open_size);

int single_release(struct inode *inode, struct file *file)
{
        const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
        int res = seq_release(inode, file);
        kfree(op);
        return res;
}
EXPORT_SYMBOL(single_release);

/*
 * Release a seq_file whose ->private was separately allocated: the
 * private area is kfree()d and cleared, then seq_release() does the rest.
 */
int seq_release_private(struct inode *inode, struct file *file)
{
        struct seq_file *m = file->private_data;
        int ret;

        kfree(m->private);
        m->private = NULL;

        ret = seq_release(inode, file);
        return ret;
}
EXPORT_SYMBOL(seq_release_private);

/*
 * Open @f as a seq_file using @ops and attach a zeroed private area of
 * @psize bytes to it.
 *
 * Returns the private area, or NULL if either the allocation or
 * seq_open() fails (the area is freed in the latter case).
 */
void *__seq_open_private(struct file *f, const struct seq_operations *ops,
                int psize)
{
        struct seq_file *m;
        void *priv;

        priv = kzalloc(psize, GFP_KERNEL_ACCOUNT);
        if (!priv)
                return NULL;

        if (seq_open(f, ops) < 0) {
                kfree(priv);
                return NULL;
        }

        m = f->private_data;
        m->private = priv;

        return priv;
}
EXPORT_SYMBOL(__seq_open_private);

/*
 * Integer-returning wrapper around __seq_open_private(): 0 on success,
 * -ENOMEM whenever the helper returns NULL.
 */
int seq_open_private(struct file *filp, const struct seq_operations *ops,
                int psize)
{
        if (!__seq_open_private(filp, ops, psize))
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(seq_open_private);

/*
 * Append the single character @c to @m.  Nothing is written when the
 * buffer is already full (count has reached size).
 */
void seq_putc(struct seq_file *m, char c)
{
        if (m->count < m->size)
                m->buf[m->count++] = c;
}
EXPORT_SYMBOL(seq_putc);

/*
 * Append the string @s (without its terminating NUL) to @m.  The string
 * must fit strictly inside the remaining space; otherwise nothing is
 * copied and the seq_file is marked as overflowed.
 */
void seq_puts(struct seq_file *m, const char *s)
{
        int n = strlen(s);

        if (m->count + n < m->size) {
                memcpy(m->buf + m->count, s, n);
                m->count += n;
                return;
        }

        seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_puts);

/**
 * seq_put_decimal_ull_width - put a decimal number without printf() formatting
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @num: the number
 * @width: a minimum field width (0 is treated as 1)
 *
 * Only 'unsigned long long' is supported.
 * This routine will put @delimiter followed by the number into the seq_file.
 * This routine is very quick when you show lots of numbers.
 * In usual cases, it will be better to use seq_printf(). It's easier to read.
 */
void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
                         unsigned long long num, unsigned int width)
{
        unsigned int min_width = width ? width : 1;
        int digits;

        /* we'll write 2 bytes at least */
        if (m->count + 2 >= m->size) {
                seq_set_overflow(m);
                return;
        }

        if (delimiter && delimiter[0]) {
                if (delimiter[1])
                        seq_puts(m, delimiter);
                else
                        seq_putc(m, delimiter[0]);
        }

        if (m->count + min_width >= m->size) {
                seq_set_overflow(m);
                return;
        }

        digits = num_to_str(m->buf + m->count, m->size - m->count, num,
                            min_width);
        if (!digits) {
                seq_set_overflow(m);
                return;
        }

        m->count += digits;
}

/*
 * Put @delimiter followed by @num in decimal into @m, with no minimum
 * field width: forwards to seq_put_decimal_ull_width() with width 0.
 */
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num)
{
        /* No "return <expr>;" here: ISO C forbids it in a void function. */
        seq_put_decimal_ull_width(m, delimiter, num, 0);
}
EXPORT_SYMBOL(seq_put_decimal_ull);

/**
 * seq_put_hex_ll - put a number in hexadecimal notation
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @v: the number
 * @width: a minimum field width
 *
 * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v)
 *
 * The digits are written from the least significant one backwards, so
 * positions beyond the significant digits end up as '0' padding.
 *
 * This routine is very quick when you show lots of numbers.
 * In usual cases, it will be better to use seq_printf(). It's easier to read.
 */
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
                                unsigned long long v, unsigned int width)
{
        unsigned int digits = 1;
        int idx;

        if (delimiter && delimiter[0]) {
                if (delimiter[1])
                        seq_puts(m, delimiter);
                else
                        seq_putc(m, delimiter[0]);
        }

        /* If x is 0, the result of __builtin_clzll is undefined */
        if (v != 0)
                digits = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4;

        if (width > digits)
                digits = width;

        if (m->count + digits > m->size) {
                seq_set_overflow(m);
                return;
        }

        idx = digits - 1;
        while (idx >= 0) {
                m->buf[m->count + idx] = hex_asc[v & 0xf];
                v >>= 4;
                idx--;
        }
        m->count += digits;
}

/*
 * Put @delimiter followed by the signed decimal representation of @num
 * into @m.  On lack of space the seq_file is marked as overflowed.
 *
 * The magnitude is kept in an unsigned variable: negating the most
 * negative long long as a signed value overflows, and with wrapping
 * arithmetic it stays negative, takes the "num < 10" shortcut and is
 * printed as "-0".
 */
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
        unsigned long long magnitude = num;
        int len;

        if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
                goto overflow;

        if (delimiter && delimiter[0]) {
                if (delimiter[1] == 0)
                        seq_putc(m, delimiter[0]);
                else
                        seq_puts(m, delimiter);
        }

        if (m->count + 2 >= m->size)
                goto overflow;

        if (num < 0) {
                m->buf[m->count++] = '-';
                /* Unsigned negation is well defined for every input. */
                magnitude = 0ULL - magnitude;
        }

        /* Single digit: no need to go through num_to_str(). */
        if (magnitude < 10) {
                m->buf[m->count++] = magnitude + '0';
                return;
        }

        len = num_to_str(m->buf + m->count, m->size - m->count, magnitude, 0);
        if (!len)
                goto overflow;

        m->count += len;
        return;

overflow:
        seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_put_decimal_ll);

/**
 * seq_write - write arbitrary data to buffer
 * @seq: seq_file identifying the buffer to which data should be written
 * @data: data address
 * @len: number of bytes
 *
 * The data must fit strictly inside the remaining space; otherwise
 * nothing is copied and the seq_file is marked as overflowed.
 *
 * Return 0 on success, non-zero (-1) otherwise.
 */
int seq_write(struct seq_file *seq, const void *data, size_t len)
{
        if (seq->count + len >= seq->size) {
                seq_set_overflow(seq);
                return -1;
        }

        memcpy(seq->buf + seq->count, data, len);
        seq->count += len;

        return 0;
}
EXPORT_SYMBOL(seq_write);

/**
 * seq_pad - write padding spaces to buffer
 * @m: seq_file identifying the buffer to which data should be written
 * @c: the byte to append after padding if non-zero
 *
 * Fills with spaces from the current position up to m->pad_until.  If
 * the padding would run past the end of the buffer, the seq_file is
 * marked as overflowed and @c is not written.
 */
void seq_pad(struct seq_file *m, char c)
{
        int gap = m->pad_until - m->count;

        if (gap > 0 && gap + m->count > m->size) {
                seq_set_overflow(m);
                return;
        }

        if (gap > 0) {
                memset(m->buf + m->count, ' ', gap);
                m->count += gap;
        }

        if (c != '\0')
                seq_putc(m, c);
}
EXPORT_SYMBOL(seq_pad);

/*
 * A complete analogue of print_hex_dump()
 *
 * Dumps @len bytes at @buf into @m, one row per output line.  Each line
 * starts with @prefix_str plus, depending on @prefix_type, the address or
 * the offset of the row.  A @rowsize other than 16 or 32 is replaced by
 * 16.  Dumping stops early once the seq_file has overflowed.
 */
void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                  int rowsize, int groupsize, const void *buf, size_t len,
                  bool ascii)
{
        const u8 *bytes = buf;
        int off, chunk, left = len;
        char *out;
        size_t avail;
        int n;

        if (rowsize != 16 && rowsize != 32)
                rowsize = 16;

        for (off = 0; off < len && !seq_has_overflowed(m); off += rowsize) {
                /* The last row may be shorter than rowsize. */
                chunk = min(left, rowsize);
                left -= rowsize;

                if (prefix_type == DUMP_PREFIX_ADDRESS)
                        seq_printf(m, "%s%p: ", prefix_str, bytes + off);
                else if (prefix_type == DUMP_PREFIX_OFFSET)
                        seq_printf(m, "%s%.8x: ", prefix_str, off);
                else
                        seq_printf(m, "%s", prefix_str);

                /* Format the row directly into the free buffer space. */
                avail = seq_get_buf(m, &out);
                n = hex_dump_to_buffer(bytes + off, chunk, rowsize, groupsize,
                                       out, avail, ascii);
                seq_commit(m, n < avail ? n : -1);

                seq_putc(m, '\n');
        }
}
EXPORT_SYMBOL(seq_hex_dump);

/*
 * Return the entry at zero-based index @pos of the list headed by @head,
 * or NULL if the list has no such entry.
 */
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
        struct list_head *entry;
        loff_t remaining = pos;

        list_for_each(entry, head) {
                if (remaining == 0)
                        return entry;
                remaining--;
        }

        return NULL;
}
EXPORT_SYMBOL(seq_list_start);

/*
 * Variant of seq_list_start() in which position 0 stands for the list
 * head itself and position N (N > 0) for entry N - 1.
 */
struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
{
        return pos ? seq_list_start(head, pos - 1) : head;
}
EXPORT_SYMBOL(seq_list_start_head);

/*
 * Step from the entry @v to its successor and increment *@ppos.  Returns
 * NULL when the successor is @head, i.e. the list has wrapped around.
 */
struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
{
        struct list_head *cur = v;
        struct list_head *next = cur->next;

        *ppos += 1;

        if (next == head)
                return NULL;
        return next;
}
EXPORT_SYMBOL(seq_list_next);

/**
 * seq_hlist_start - start an iteration of a hlist
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start().
 *
 * Returns the node at zero-based index @pos, or NULL if the hlist has no
 * such node.
 */
struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
{
        struct hlist_node *cur;
        loff_t remaining = pos;

        hlist_for_each(cur, head) {
                if (remaining == 0)
                        return cur;
                remaining--;
        }

        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start);

/**
 * seq_hlist_start_head - start an iteration of a hlist
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start(). Call this function if you want to
 * print a header at the top of the output.
 *
 * Position 0 yields SEQ_START_TOKEN; position N (N > 0) yields what
 * seq_hlist_start() returns for N - 1.
 */
struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
{
        return pos ? seq_hlist_start(head, pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(seq_hlist_start_head);

/**
 * seq_hlist_next - move to the next position of the hlist
 * @v:    the current iterator
 * @head: the head of the hlist
 * @ppos: the current position
 *
 * Called at seq_file->op->next().
 *
 * Increments *@ppos.  From SEQ_START_TOKEN the next element is the first
 * node of @head; from a node it is that node's ->next.
 */
struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
                                  loff_t *ppos)
{
        struct hlist_node *cur = v;

        *ppos += 1;

        return (v == SEQ_START_TOKEN) ? head->first : cur->next;
}
EXPORT_SYMBOL(seq_hlist_next);

/**
 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start().
 *
 * Returns the node at zero-based index @pos, or NULL if the hlist has no
 * such node.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 */
struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
                                       loff_t pos)
{
        struct hlist_node *cur;
        loff_t remaining = pos;

        __hlist_for_each_rcu(cur, head) {
                if (remaining == 0)
                        return cur;
                remaining--;
        }

        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start_rcu);

/**
 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start(). Call this function if you want to
 * print a header at the top of the output.
 *
 * Position 0 yields SEQ_START_TOKEN; position N (N > 0) yields what
 * seq_hlist_start_rcu() returns for N - 1.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 */
struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
                                            loff_t pos)
{
        return pos ? seq_hlist_start_rcu(head, pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(seq_hlist_start_head_rcu);

/**
 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
 * @v:    the current iterator
 * @head: the head of the hlist
 * @ppos: the current position
 *
 * Called at seq_file->op->next().
 *
 * Increments *@ppos.  From SEQ_START_TOKEN the next element is the first
 * node of @head; from a node it is that node's ->next.  Both pointers are
 * fetched with rcu_dereference().
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 */
struct hlist_node *seq_hlist_next_rcu(void *v,
                                      struct hlist_head *head,
                                      loff_t *ppos)
{
        struct hlist_node *cur = v;
        struct hlist_node *next;

        *ppos += 1;

        if (v != SEQ_START_TOKEN)
                next = rcu_dereference(cur->next);
        else
                next = rcu_dereference(head->first);

        return next;
}
EXPORT_SYMBOL(seq_hlist_next_rcu);

/**
 * seq_hlist_start_percpu - start an iteration of a percpu hlist array
 * @head: pointer to percpu array of struct hlist_heads
 * @cpu:  pointer to cpu "cursor"
 * @pos:  start position of sequence
 *
 * Called at seq_file->op->start().
 *
 * Walks the hlist of every possible CPU in turn, counting nodes across
 * all of them, and returns the node at zero-based index @pos.  *@cpu is
 * the loop cursor, so on success it names the CPU owning the returned
 * node.  Returns NULL if there are not enough nodes.
 */
struct hlist_node *
seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
{
        struct hlist_node *cur;
        loff_t remaining = pos;

        for_each_possible_cpu(*cpu) {
                struct hlist_head *bucket = per_cpu_ptr(head, *cpu);

                hlist_for_each(cur, bucket) {
                        if (remaining == 0)
                                return cur;
                        remaining--;
                }
        }

        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start_percpu);

/**
 * seq_hlist_next_percpu - move to the next position of the percpu hlist array
 * @v:    pointer to current hlist_node
 * @head: pointer to percpu array of struct hlist_heads
 * @cpu:  pointer to cpu "cursor"
 * @pos:  position of the sequence, incremented by this call
 *
 * Called at seq_file->op->next().
 *
 * Returns the successor of @v on the same hlist if there is one.
 * Otherwise *@cpu is advanced over the following possible CPUs and the
 * first node of the first non-empty hlist is returned, or NULL when no
 * CPU is left.
 */
struct hlist_node *
seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
                        int *cpu, loff_t *pos)
{
        struct hlist_node *cur = v;
        struct hlist_head *bucket;

        *pos += 1;

        if (cur->next)
                return cur->next;

        *cpu = cpumask_next(*cpu, cpu_possible_mask);
        while (*cpu < nr_cpu_ids) {
                bucket = per_cpu_ptr(head, *cpu);
                if (!hlist_empty(bucket))
                        return bucket->first;

                *cpu = cpumask_next(*cpu, cpu_possible_mask);
        }

        return NULL;
}
EXPORT_SYMBOL(seq_hlist_next_percpu);

/*
 * Boot-time setup: creates seq_file_cache, the slab cache that
 * seq_release() returns struct seq_file objects to.  The cache is created
 * with the SLAB_ACCOUNT and SLAB_PANIC flags.
 */
void __init seq_file_init(void)
{
        seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC);
}








































    1 





    1 





    1 
    1 
    1 















































































































































































































    1 

    1 

    1 






    1 














    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/filesystems.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  table of configured filesystems
 */

#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>

/*
 * Handling of the filesystem drivers list.
 * Rules:
 *        Insertion into, removal from and scanning of the list are protected
 *        by file_systems_lock.
 *        During unload, a module must call unregister_filesystem().
 *        We can access the fields of a list element if:
 *                1) the lock is held, or
 *                2) we hold a reference to the module.
 *        The latter can be guaranteed by a call to try_module_get(); if it
 *        returned 0 we must skip the element, otherwise we got the reference.
 *        Once the reference is obtained we can drop the lock.
 */

/* Head of the singly linked list (chained through ->next) of filesystem types. */
static struct file_system_type *file_systems;
/* Reader/writer lock taken around every walk or update of file_systems. */
static DEFINE_RWLOCK(file_systems_lock);

/*
 * Take an additional reference on the module owning @fs and return @fs.
 *
 * WARNING: This can be used only if we _already_ own a reference
 */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
        __module_get(fs->owner);
        return fs;
}

/* Drop one reference on the module owning @fs (module_put() on fs->owner). */
void put_filesystem(struct file_system_type *fs)
{
        module_put(fs->owner);
}

/*
 * Look up a filesystem type whose name is exactly the first @len bytes of
 * @name.
 *
 * Returns the address of the link that points at the matching entry, or,
 * if there is no match, the address of the terminating NULL link at the
 * end of the list (so the caller can append there).  The function takes
 * no lock itself.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
        struct file_system_type **slot = &file_systems;

        while (*slot) {
                const char *cur = (*slot)->name;

                /* Same first @len bytes and the entry's name ends there. */
                if (!strncmp(cur, name, len) && cur[len] == '\0')
                        return slot;

                slot = &(*slot)->next;
        }

        return slot;
}

/**
 *        register_filesystem - register a new filesystem
 *        @fs: the file system structure
 *
 *        Adds the file system passed to the list of file systems the kernel
 *        is aware of for mount and other syscalls. Returns 0 on success,
 *        or a negative errno code on an error: -EINVAL if the parameter
 *        description fails validation, -EBUSY if @fs is already linked or a
 *        file system of the same name is registered.
 *
 *        The &struct file_system_type that is passed is linked into the kernel
 *        structures and must not be freed until the file system has been
 *        unregistered.
 */

int register_filesystem(struct file_system_type * fs)
{
        struct file_system_type **slot;
        int err;

        if (fs->parameters &&
            !fs_validate_description(fs->name, fs->parameters))
                return -EINVAL;

        /* A '.' in the name is treated as a fatal bug. */
        BUG_ON(strchr(fs->name, '.'));

        if (fs->next)
                return -EBUSY;

        write_lock(&file_systems_lock);
        slot = find_filesystem(fs->name, strlen(fs->name));
        if (!*slot) {
                /* No entry of that name: link @fs in at the list's tail. */
                *slot = fs;
                err = 0;
        } else {
                err = -EBUSY;
        }
        write_unlock(&file_systems_lock);

        return err;
}

EXPORT_SYMBOL(register_filesystem);

/**
 *        unregister_filesystem - unregister a file system
 *        @fs: filesystem to unregister
 *
 *        Remove a file system that was previously successfully registered
 *        with the kernel. An error (-EINVAL) is returned if the file system
 *        is not found. Zero is returned on a success.
 *
 *        Once this function has returned the &struct file_system_type structure
 *        may be freed or reused.
 */

int unregister_filesystem(struct file_system_type * fs)
{
        struct file_system_type **link;

        write_lock(&file_systems_lock);
        for (link = &file_systems; *link; link = &(*link)->next) {
                if (*link != fs)
                        continue;

                /* Unlink @fs and mark it as no longer on the list. */
                *link = fs->next;
                fs->next = NULL;
                write_unlock(&file_systems_lock);
                synchronize_rcu();
                return 0;
        }
        write_unlock(&file_systems_lock);

        return -EINVAL;
}

EXPORT_SYMBOL(unregister_filesystem);

#ifdef CONFIG_SYSFS_SYSCALL
/*
 * sysfs(1, name): return the zero-based position in the filesystem list
 * of the type called @__name (a user-space string).
 *
 * Returns the index, the error from getname() if the name cannot be
 * fetched, or -EINVAL if no such filesystem is registered.
 */
static int fs_index(const char __user * __name)
{
        struct file_system_type *fs;
        struct filename *fname;
        int idx = 0;
        int ret = -EINVAL;

        fname = getname(__name);
        if (IS_ERR(fname))
                return PTR_ERR(fname);

        read_lock(&file_systems_lock);
        fs = file_systems;
        while (fs) {
                if (!strcmp(fs->name, fname->name)) {
                        ret = idx;
                        break;
                }
                fs = fs->next;
                idx++;
        }
        read_unlock(&file_systems_lock);

        putname(fname);
        return ret;
}

/*
 * sysfs(2, index, buf): copy the NUL-terminated name of the filesystem
 * at position @index in the list to the user buffer @buf.
 *
 * Returns 0 on success, -EINVAL if there is no such entry or its module
 * reference cannot be taken, -EFAULT if the copy to user space fails.
 */
static int fs_name(unsigned int index, char __user * buf)
{
        struct file_system_type *fs;
        int ret = -EINVAL;
        int nbytes;

        read_lock(&file_systems_lock);
        fs = file_systems;
        while (fs && index != 0) {
                fs = fs->next;
                index--;
        }
        /* Pin the owning module so the entry stays usable after unlock. */
        if (fs && try_module_get(fs->owner))
                ret = 0;
        read_unlock(&file_systems_lock);

        if (ret)
                return ret;

        /* OK, we got the reference, so we can safely block */
        nbytes = strlen(fs->name) + 1;
        if (copy_to_user(buf, fs->name, nbytes))
                ret = -EFAULT;
        put_filesystem(fs);

        return ret;
}

/*
 * sysfs(3): return the number of entries currently on the filesystem
 * list, counted under the read lock.
 */
static int fs_maxindex(void)
{
        struct file_system_type *fs;
        int count = 0;

        read_lock(&file_systems_lock);
        fs = file_systems;
        while (fs) {
                count++;
                fs = fs->next;
        }
        read_unlock(&file_systems_lock);

        return count;
}

/*
 * Whee.. Weird sysv syscall.
 *
 * Dispatches on @option:
 *   1 - fs_index(): @arg1 is a user pointer to a filesystem name
 *   2 - fs_name():  @arg1 is an index, @arg2 a user buffer for the name
 *   3 - fs_maxindex()
 * Any other option yields -EINVAL.
 */
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
        if (option == 1)
                return fs_index((const char __user *) arg1);

        if (option == 2)
                return fs_name(arg1, (char __user *) arg2);

        if (option == 3)
                return fs_maxindex();

        return -EINVAL;
}
#endif

/*
 * Format the filesystem list into @buf, one "<nodev-or-empty>\t<name>\n"
 * line per type, and return the number of bytes written.  Output stops
 * once PAGE_SIZE - 80 bytes or more have been produced, which suggests
 * @buf is expected to be one page long (verify against the caller).
 */
int __init get_filesystem_list(char *buf)
{
        struct file_system_type *fs;
        int used = 0;

        read_lock(&file_systems_lock);
        for (fs = file_systems; fs && used < PAGE_SIZE - 80; fs = fs->next) {
                const char *tag = (fs->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev";

                used += sprintf(buf + used, "%s\t%s\n", tag, fs->name);
        }
        read_unlock(&file_systems_lock);

        return used;
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback registered for the "filesystems" proc entry:
 * prints one line per registered type, prefixed with "nodev" when the
 * type does not have FS_REQUIRES_DEV set.  Always returns 0.
 */
static int filesystems_proc_show(struct seq_file *m, void *v)
{
        struct file_system_type *fs;

        read_lock(&file_systems_lock);
        for (fs = file_systems; fs; fs = fs->next) {
                const char *tag = (fs->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev";

                seq_printf(m, "%s\t%s\n", tag, fs->name);
        }
        read_unlock(&file_systems_lock);

        return 0;
}

/*
 * Init hook: creates the "filesystems" proc entry (mode 0, NULL parent)
 * backed by filesystems_proc_show().  The result of proc_create_single()
 * is not checked; the function always returns 0.
 */
static int __init proc_filesystems_init(void)
{
        proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
        return 0;
}
module_init(proc_filesystems_init);
#endif

/*
 * Find the filesystem type named by the first @len bytes of @name and
 * take a reference on its owning module.
 *
 * Returns the type, or NULL if it is not registered or the module
 * reference cannot be obtained.
 */
static struct file_system_type *__get_fs_type(const char *name, int len)
{
        struct file_system_type **slot;
        struct file_system_type *found;

        read_lock(&file_systems_lock);
        slot = find_filesystem(name, len);
        found = *slot;
        if (found) {
                if (!try_module_get(found->owner))
                        found = NULL;
        }
        read_unlock(&file_systems_lock);

        return found;
}

/*
 * Look up a filesystem type by @name, trying to load module
 * "fs-<type>" if it is not registered yet.  Only the part of @name
 * before the first '.' is used as the type; a name containing a '.' is
 * accepted only for types with FS_HAS_SUBTYPE set.
 *
 * Returns the type with a reference on its module held, or NULL.
 */
struct file_system_type *get_fs_type(const char *name)
{
        struct file_system_type *fs;
        const char *dot = strchr(name, '.');
        int len;

        if (dot)
                len = dot - name;
        else
                len = strlen(name);

        fs = __get_fs_type(name, len);
        if (!fs) {
                if (request_module("fs-%.*s", len, name) == 0) {
                        /* Module load reported success: look again. */
                        fs = __get_fs_type(name, len);
                        if (!fs)
                                pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
                                             len, name);
                }
        }

        if (dot && fs) {
                if (!(fs->fs_flags & FS_HAS_SUBTYPE)) {
                        put_filesystem(fs);
                        fs = NULL;
                }
        }

        return fs;
}

EXPORT_SYMBOL(get_fs_type);


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOCONTEXT_H
#define IOCONTEXT_H

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/*
 * ICQ_* flag bits (bit 2 and bit 3).
 * NOTE(review): presumably these are stored in io_cq->flags - confirm
 * against the users of these constants.
 */
enum {
        ICQ_EXITED                = 1 << 2,
        ICQ_DESTROYED                = 1 << 3,
};

/*
 * An io_cq (icq) is an association between an io_context (ioc) and a
 * request_queue (q).  This is used by elevators which need to track
 * information per ioc - q pair.
 *
 * Elevator can request use of icq by setting elevator_type->icq_size and
 * ->icq_align.  Both size and align must be larger than that of struct
 * io_cq and elevator can use the tail area for private information.  The
 * recommended way to do this is defining a struct which contains io_cq as
 * the first member followed by private members and using its size and
 * align.  For example,
 *
 *        struct snail_io_cq {
 *                struct io_cq        icq;
 *                int                poke_snail;
 *                int                feed_snail;
 *        };
 *
 *        struct elevator_type snail_elv_type {
 *                .ops =                { ... },
 *                .icq_size =        sizeof(struct snail_io_cq),
 *                .icq_align =        __alignof__(struct snail_io_cq),
 *                ...
 *        };
 *
 * If icq_size is set, block core will manage icq's.  All requests will
 * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
 * is called and be holding a reference to the associated io_context.
 *
 * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
 * called and, on destruction, ->elevator_exit_icq_fn().  Both functions
 * are called with both the associated io_context and queue locks held.
 *
 * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
 * queue lock but the returned icq is valid only until the queue lock is
 * released.  Elevators can not and should not try to create or destroy
 * icq's.
 *
 * As icq's are linked from both ioc and q, the locking rules are a bit
 * complex.
 *
 * - ioc lock nests inside q lock.
 *
 * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
 *   q->icq_list and icq->q_node by q lock.
 *
 * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
 *   itself is protected by q lock.  However, both the indexes and icq
 *   itself are also RCU managed and lookup can be performed holding only
 *   the q lock.
 *
 * - icq's are not reference counted.  They are destroyed when either the
 *   ioc or q goes away.  Each request with icq set holds an extra
 *   reference to ioc to ensure it stays until the request is completed.
 *
 * - Linking and unlinking icq's are performed while holding both ioc and q
 *   locks.  Due to the lock ordering, q exit is simple but ioc exit
 *   requires reverse-order double lock dance.
 */
struct io_cq {
        /* the request_queue and io_context this icq is linked from */
        struct request_queue        *q;
        struct io_context        *ioc;

        /*
         * q_node and ioc_node link io_cq through icq_list of q and ioc
         * respectively.  Both fields are unused once ioc_exit_icq() is
         * called and shared with __rcu_icq_cache and __rcu_head which are
         * used for RCU free of io_cq.
         */
        union {
                /* list linkage while live; storage is reused for RCU free */
                struct list_head        q_node;
                struct kmem_cache        *__rcu_icq_cache;
        };
        union {
                /* hlist linkage while live; storage is reused for RCU free */
                struct hlist_node        ioc_node;
                struct rcu_head                __rcu_head;
        };

        /* NOTE(review): the flag bit definitions are not visible in this chunk */
        unsigned int                flags;
};

/*
 * I/O subsystem state of the associated processes.  It is refcounted
 * and kmalloc'ed. These could be shared between processes.
 */
struct io_context {
        /* plain reference count; bumped together with active_ref below */
        atomic_long_t refcount;
        /* "active" references; only iocs holding one may issue new IOs */
        atomic_t active_ref;
        /* incremented by ioc_task_link() for each additional linked task */
        atomic_t nr_tasks;

        /* all the fields below are protected by this lock */
        spinlock_t lock;

        unsigned short ioprio;

        /* icq indexes: tree + lookup hint + list of all icq's of this ioc */
        struct radix_tree_root        icq_tree;
        struct io_cq __rcu        *icq_hint;
        struct hlist_head        icq_list;

        /* NOTE(review): presumably used to defer release to a workqueue — confirm */
        struct work_struct release_work;
};

/**
 * get_io_context_active - get active reference on ioc
 * @ioc: ioc of interest
 *
 * Only iocs with active reference can issue new IOs.  This function
 * acquires an active reference on @ioc.  The caller must already have an
 * active reference on @ioc.
 */
static inline void get_io_context_active(struct io_context *ioc)
{
        /* The caller must already hold both kinds of reference; warn once if not. */
        WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
        WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
        /* An active reference always comes with a plain reference as well. */
        atomic_long_inc(&ioc->refcount);
        atomic_inc(&ioc->active_ref);
}

/*
 * Account one more task on @ioc: take an active reference (which also
 * takes a plain reference) and bump nr_tasks.  @ioc must already have at
 * least one task linked; a warning is emitted once otherwise.
 */
static inline void ioc_task_link(struct io_context *ioc)
{
        get_io_context_active(ioc);

        WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
        atomic_inc(&ioc->nr_tasks);
}

struct task_struct;
#ifdef CONFIG_BLOCK
/* Out-of-line io_context helpers, declared only when CONFIG_BLOCK is set. */
void put_io_context(struct io_context *ioc);
void put_io_context_active(struct io_context *ioc);
void exit_io_context(struct task_struct *task);
struct io_context *get_task_io_context(struct task_struct *task,
                                       gfp_t gfp_flags, int node);
#else
/*
 * Without CONFIG_BLOCK only these two helpers get no-op stubs;
 * put_io_context_active() and get_task_io_context() have no stub here.
 */
struct io_context;
static inline void put_io_context(struct io_context *ioc) { }
static inline void exit_io_context(struct task_struct *task) { }
#endif

#endif













    2 


    1 























    2 






    2 


    2 


    2 

    2 





































    1 






    1 


    1 
    1 











    1 




    1 








    1 






    1 







    1 







    1 





    1 






























































    1 





    1 












    1 






















    1 
















    1 








    1 













































    1 





    1 









    1 




    1 


    1 






    1 



    1 




    1 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"

/*
 * Copy @namelen bytes of @str immediately in front of *@buffer and move
 * *@buffer back to the start of the copy.  *@buflen is the room left in
 * front of *@buffer; it is always reduced by @namelen, even on failure.
 *
 * Returns 0 on success, or -ENAMETOOLONG (with *@buffer untouched) when
 * fewer than @namelen bytes remain.
 */
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
        int remaining = *buflen - namelen;

        *buflen = remaining;
        if (remaining < 0)
                return -ENAMETOOLONG;

        /* memcpy() returns its destination, i.e. the new start of the string */
        *buffer = memcpy(*buffer - namelen, str, namelen);
        return 0;
}

/**
 * prepend_name - prepend a pathname in front of current buffer pointer
 * @buffer: buffer pointer
 * @buflen: allocated length of the buffer
 * @name:   name string and length qstr structure
 *
 * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
 * make sure that either the old or the new name pointer and length are
 * fetched. However, there may be mismatch between length and pointer.
 * The length cannot be trusted, we need to copy it byte-by-byte until
 * the length is reached or a null byte is found. It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry it again when a d_move() does happen. So any garbage in the buffer
 * due to mismatched pointer and length will be discarded.
 *
 * Load acquire is needed to make sure that we see that terminating NUL.
 */
static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
{
        const char *dname = smp_load_acquire(&name->name); /* ^^^ */
        u32 dlen = READ_ONCE(name->len);
        char *p;

        /* reserve room for '/' plus up to dlen name bytes */
        *buflen -= dlen + 1;
        if (*buflen < 0)
                return -ENAMETOOLONG;
        p = *buffer -= dlen + 1;
        *p++ = '/';
        /*
         * Byte-wise copy that stops at dlen bytes or at the first NUL,
         * whichever comes first, because dlen may not match dname.
         */
        while (dlen--) {
                char c = *dname++;
                if (!c)
                        break;
                *p++ = c;
        }
        return 0;
}

/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
 *
 * Return: 0 if the walk reached @root; 1 if it stopped at a mount that is
 * its own parent and whose namespace is valid and not anonymous; 2 if it
 * stopped at such a mount whose namespace is NULL, an error pointer or
 * anonymous; 3 if it hit an IS_ROOT() dentry that is not the root of its
 * mount (in which case only "/" is emitted); -ENAMETOOLONG if the buffer
 * is too small.  *@buffer and *@buflen are updated in all cases.
 */
static int prepend_path(const struct path *path,
                        const struct path *root,
                        char **buffer, int *buflen)
{
        struct dentry *dentry;
        struct vfsmount *vfsmnt;
        struct mount *mnt;
        int error = 0;
        unsigned seq, m_seq = 0;
        char *bptr;
        int blen;

        rcu_read_lock();
restart_mnt:
        read_seqbegin_or_lock(&mount_lock, &m_seq);
        seq = 0;
        rcu_read_lock();
restart:
        /* every (re)try starts again from the caller's untouched buffer */
        bptr = *buffer;
        blen = *buflen;
        error = 0;
        dentry = path->dentry;
        vfsmnt = path->mnt;
        mnt = real_mount(vfsmnt);
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;

                if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
                        struct mount *parent = READ_ONCE(mnt->mnt_parent);
                        struct mnt_namespace *mnt_ns;

                        /* Escaped? */
                        if (dentry != vfsmnt->mnt_root) {
                                /* discard everything prepended so far */
                                bptr = *buffer;
                                blen = *buflen;
                                error = 3;
                                break;
                        }
                        /* Global root? */
                        if (mnt != parent) {
                                /* not yet: continue from the mountpoint in the parent mount */
                                dentry = READ_ONCE(mnt->mnt_mountpoint);
                                mnt = parent;
                                vfsmnt = &mnt->mnt;
                                continue;
                        }
                        mnt_ns = READ_ONCE(mnt->mnt_ns);
                        /* open-coded is_mounted() to use local mnt_ns */
                        if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
                                error = 1;        // absolute root
                        else
                                error = 2;        // detached or not attached yet
                        break;
                }
                parent = dentry->d_parent;
                prefetch(parent);
                error = prepend_name(&bptr, &blen, &dentry->d_name);
                if (error)
                        break;

                dentry = parent;
        }
        /* an even seq means this pass was lockless: drop the inner RCU lock */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* odd seq: the retry takes rename_lock instead of racing again */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);

        /* same pattern for mount_lock and the outer RCU read lock */
        if (!(m_seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&mount_lock, m_seq)) {
                m_seq = 1;
                goto restart_mnt;
        }
        done_seqretry(&mount_lock, m_seq);

        /* nothing was prepended (path == root, or escaped): emit a lone "/" */
        if (error >= 0 && bptr == *buffer) {
                if (--blen < 0)
                        error = -ENAMETOOLONG;
                else
                        *--bptr = '/';
        }
        *buffer = bptr;
        *buflen = blen;
        return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name.
 *
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
 * "buflen" should be positive.
 *
 * If the path is not reachable from the supplied root, return %NULL.
 */
char *__d_path(const struct path *path,
               const struct path *root,
               char *buf, int buflen)
{
        char *cursor = buf + buflen;
        int rc;

        /* The string is assembled back to front, so the NUL goes in first. */
        prepend(&cursor, &buflen, "\0", 1);
        rc = prepend_path(path, root, &cursor, &buflen);

        /* rc == 0: reached @root; rc > 0: not reachable from it; rc < 0: errno */
        if (rc == 0)
                return cursor;
        if (rc > 0)
                return NULL;
        return ERR_PTR(rc);
}

/*
 * Build the path of @path against an all-NULL root, so the walk only stops
 * where prepend_path() itself gives up.  Results 0 and 1 from
 * prepend_path() are accepted; larger positive results are reported as
 * -EINVAL and negative ones are passed through as an error pointer.
 */
char *d_absolute_path(const struct path *path,
               char *buf, int buflen)
{
        struct path root = {};
        char *cursor = buf + buflen;
        int rc;

        /* terminator first: the string is assembled back to front */
        prepend(&cursor, &buflen, "\0", 1);
        rc = prepend_path(path, &root, &cursor, &buflen);

        if (rc > 1)
                return ERR_PTR(-EINVAL);
        if (rc < 0)
                return ERR_PTR(rc);
        return cursor;
}

/*
 * Like prepend_path(), but first lays down the terminating NUL and, for
 * an unlinked dentry, the " (deleted)" suffix.  Returns the
 * prepend_path() result, or the prepend() error if the suffix did not fit.
 */
static int path_with_deleted(const struct path *path,
                             const struct path *root,
                             char **buf, int *buflen)
{
        int rc = 0;

        prepend(buf, buflen, "\0", 1);
        if (d_unlinked(path->dentry))
                rc = prepend(buf, buflen, " (deleted)", 10);
        if (rc)
                return rc;

        return prepend_path(path, root, buf, buflen);
}

/* Put the "(unreachable)" marker in front of the path built so far. */
static int prepend_unreachable(char **buffer, int *buflen)
{
        static const char marker[] = "(unreachable)";

        /* sizeof - 1: the marker is copied without its terminating NUL */
        return prepend(buffer, buflen, marker, sizeof(marker) - 1);
}

/*
 * Copy fs->root into *@root, retrying until the copy was made without a
 * concurrent update of fs->seq.  No reference is taken on the copied path.
 */
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
        unsigned seq;

        do {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
        } while (read_seqcount_retry(&fs->seq, seq));
}

/**
 * d_path - return the path of a dentry
 * @path: path to report
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
 * Returns a pointer into the buffer or an error code if the path was
 * too long. Note: Callers should use the returned pointer, not the passed
 * in buffer, to use the name! The implementation often starts at an offset
 * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
char *d_path(const struct path *path, char *buf, int buflen)
{
        char *res = buf + buflen;
        struct path root;
        int error;

        /*
         * We have various synthetic filesystems that never get mounted.  On
         * these filesystems dentries are never used for lookup purposes, and
         * thus don't need to be hashed.  They also don't need a name until a
         * user wants to identify the object in /proc/pid/fd/.  The little hack
         * below allows us to generate a name for these objects on demand:
         *
         * Some pseudo inodes are mountable.  When they are mounted
         * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
         * and instead have d_path return the mounted path.
         */
        if (path->dentry->d_op && path->dentry->d_op->d_dname &&
            (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);

        rcu_read_lock();
        get_fs_root_rcu(current->fs, &root);
        error = path_with_deleted(path, &root, &res, &buflen);
        rcu_read_unlock();

        if (error < 0)
                res = ERR_PTR(error);
        return res;
}
EXPORT_SYMBOL(d_path);

/*
 * Helper for dentry_operations.d_dname() implementations: format a name
 * and place it, NUL included, at the very end of @buffer.  Returns a
 * pointer to the name inside @buffer, or ERR_PTR(-ENAMETOOLONG) when the
 * formatted name does not fit the 64-byte scratch area or @buflen.
 */
char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
                        const char *fmt, ...)
{
        char temp[64];
        va_list ap;
        int need;

        va_start(ap, fmt);
        /* vsnprintf() reports the length without the NUL; count it too */
        need = vsnprintf(temp, sizeof(temp), fmt, ap) + 1;
        va_end(ap);

        if (need > sizeof(temp) || need > buflen)
                return ERR_PTR(-ENAMETOOLONG);

        /* right-align the result so it ends exactly at buffer + buflen */
        return memcpy(buffer + (buflen - need), temp, need);
}

/*
 * Build "/<name> (deleted)" at the end of @buffer and return a pointer to
 * its first byte, or ERR_PTR(-ENAMETOOLONG) if it does not fit.
 */
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
        char *end = buffer + buflen;

        /* these dentries are never renamed, so d_lock is not needed */

        /* 11 bytes: the 10 characters of the suffix plus its NUL */
        if (prepend(&end, &buflen, " (deleted)", 11))
                return ERR_PTR(-ENAMETOOLONG);
        if (prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len))
                return ERR_PTR(-ENAMETOOLONG);
        if (prepend(&end, &buflen, "/", 1))
                return ERR_PTR(-ENAMETOOLONG);

        return end;
}

/*
 * Write full pathname from the root of the filesystem into the buffer.
 *
 * The name is built backwards from buf + buflen by following d_parent
 * until an IS_ROOT() dentry is reached; mounts are not crossed.  Returns
 * a pointer to the start of the path inside @buf, or
 * ERR_PTR(-ENAMETOOLONG) if @buflen < 2 or a component does not fit.
 */
static char *__dentry_path(struct dentry *d, char *buf, int buflen)
{
        struct dentry *dentry;
        char *end, *retval;
        int len, seq = 0;
        int error = 0;

        /* need room for at least "/" and the terminating NUL */
        if (buflen < 2)
                goto Elong;

        rcu_read_lock();
restart:
        /* every (re)try starts from scratch at the end of the buffer */
        dentry = d;
        end = buf + buflen;
        len = buflen;
        prepend(&end, &len, "\0", 1);
        /* Get '/' right */
        /* pre-store "/" as the result for the case where d itself is a root */
        retval = end-1;
        *retval = '/';
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
                struct dentry *parent = dentry->d_parent;

                prefetch(parent);
                error = prepend_name(&end, &len, &dentry->d_name);
                if (error)
                        break;

                retval = end;
                dentry = parent;
        }
        /* an even seq means the pass was lockless: drop the RCU read lock */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* odd seq: the retry takes rename_lock instead of racing again */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);
        if (error)
                goto Elong;
        return retval;
Elong:
        return ERR_PTR(-ENAMETOOLONG);
}

/*
 * Exported wrapper around __dentry_path(): path of @dentry from the root
 * of its filesystem, with no "deleted" marker added for unlinked dentries.
 */
char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
{
        return __dentry_path(dentry, buf, buflen);
}
EXPORT_SYMBOL(dentry_path_raw);

/*
 * Same as dentry_path_raw(), except that "//deleted" is appended to the
 * path of an unlinked dentry.
 */
char *dentry_path(struct dentry *dentry, char *buf, int buflen)
{
        char *suffix = NULL;
        char *retval;

        if (d_unlinked(dentry)) {
                suffix = buf + buflen;
                /* 10 bytes: "//deleted" plus its NUL */
                if (prepend(&suffix, &buflen, "//deleted", 10) != 0)
                        return ERR_PTR(-ENAMETOOLONG);
                /*
                 * Hand the suffix's first byte back to __dentry_path(),
                 * which stores the path's NUL terminator there.
                 */
                buflen++;
        }
        retval = __dentry_path(dentry, buf, buflen);
        if (suffix && !IS_ERR(retval))
                *suffix = '/';        /* restore '/' overridden with '\0' */
        return retval;
}

/*
 * Copy fs->root and fs->pwd into *@root and *@pwd as one consistent pair,
 * retrying until both were read without a concurrent update of fs->seq.
 * No references are taken on the copied paths.
 */
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
                                    struct path *pwd)
{
        unsigned seq;

        do {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                *pwd = fs->pwd;
        } while (read_seqcount_retry(&fs->seq, seq));
}

/*
 * NOTE! The user-level library version returns a
 * character pointer. The kernel system call just
 * returns the length of the buffer filled (which
 * includes the ending '\0' character), or a negative
 * error value. So libc would do something like
 *
 *        char *getcwd(char * buf, size_t size)
 *        {
 *                int retval;
 *
 *                retval = sys_getcwd(buf, size);
 *                if (retval >= 0)
 *                        return buf;
 *                errno = -retval;
 *                return NULL;
 *        }
 */
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
        int error;
        struct path pwd, root;
        char *page = __getname();

        if (!page)
                return -ENOMEM;

        /* root and pwd are copied without references, so stay under RCU */
        rcu_read_lock();
        get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);

        /* an unlinked working directory yields -ENOENT */
        error = -ENOENT;
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                /* the path is built backwards from the end of the page */
                char *cwd = page + PATH_MAX;
                int buflen = PATH_MAX;

                prepend(&cwd, &buflen, "\0", 1);
                error = prepend_path(&pwd, &root, &cwd, &buflen);
                rcu_read_unlock();

                if (error < 0)
                        goto out;

                /* Unreachable from current root */
                if (error > 0) {
                        error = prepend_unreachable(&cwd, &buflen);
                        if (error)
                                goto out;
                }

                /* len counts the terminating NUL; -ERANGE if the user buffer is smaller */
                error = -ERANGE;
                len = PATH_MAX + page - cwd;
                if (len <= size) {
                        error = len;
                        if (copy_to_user(buf, cwd, len))
                                error = -EFAULT;
                }
        } else {
                rcu_read_unlock();
        }

out:
        __putname(page);
        return error;
}



























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NEIGHBOUR_H
#define _NET_NEIGHBOUR_H

#include <linux/neighbour.h>

/*
 *        Generic neighbour manipulation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 *
 *         Changes:
 *
 *        Harald Welte:                <laforge@gnumonks.org>
 *                - Add neighbour cache statistics like rtstat
 */

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>

#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>

/*
 * NUD stands for "neighbor unreachability detection"
 */

#define NUD_IN_TIMER        (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE)
#define NUD_VALID        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
#define NUD_CONNECTED        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE)

struct neighbour;

/*
 * Indices of the per-parms tunables.  Only the entries below
 * NEIGH_VAR_DATA_MAX have a slot in neigh_parms.data[]; the later ones are
 * aliases of earlier entries or apply to the "default" only.
 */
enum {
        NEIGH_VAR_MCAST_PROBES,
        NEIGH_VAR_UCAST_PROBES,
        NEIGH_VAR_APP_PROBES,
        NEIGH_VAR_MCAST_REPROBES,
        NEIGH_VAR_RETRANS_TIME,
        NEIGH_VAR_BASE_REACHABLE_TIME,
        NEIGH_VAR_DELAY_PROBE_TIME,
        NEIGH_VAR_GC_STALETIME,
        NEIGH_VAR_QUEUE_LEN_BYTES,
        NEIGH_VAR_PROXY_QLEN,
        NEIGH_VAR_ANYCAST_DELAY,
        NEIGH_VAR_PROXY_DELAY,
        NEIGH_VAR_LOCKTIME,
        /* number of entries that own storage in neigh_parms.data[] */
#define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1)
        /* Following are used as a second way to access one of the above */
        NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */
        NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */
        NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */
        /* Following are used by "default" only */
        NEIGH_VAR_GC_INTERVAL,
        NEIGH_VAR_GC_THRESH1,
        NEIGH_VAR_GC_THRESH2,
        NEIGH_VAR_GC_THRESH3,
        NEIGH_VAR_MAX
};

/*
 * A set of neighbour tunables.  One instance is embedded in every
 * neigh_table (its ->parms), and instances can be chained via ->list.
 */
struct neigh_parms {
        possible_net_t net;
        struct net_device *dev;
        struct list_head list;
        int        (*neigh_setup)(struct neighbour *);
        /* owning table; neigh_parms_family() reads tbl->family through it */
        struct neigh_table *tbl;

        void        *sysctl_table;

        int dead;
        refcount_t refcnt;
        struct rcu_head rcu_head;

        int        reachable_time;
        /* tunable values, indexed by NEIGH_VAR_* (below NEIGH_VAR_DATA_MAX) */
        int        data[NEIGH_VAR_DATA_MAX];
        /* one bit per data[] slot; set by neigh_var_set() when it stores a value */
        DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
};

/*
 * Store @val in p->data[@index] and mark the slot in p->data_state.
 * @index must be below NEIGH_VAR_DATA_MAX (the size of both arrays); no
 * range check is done here.
 */
static inline void neigh_var_set(struct neigh_parms *p, int index, int val)
{
        set_bit(index, p->data_state);
        p->data[index] = val;
}

/* lvalue for tunable NEIGH_VAR_<attr> of parms @p */
#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])

/* In ndo_neigh_setup, NEIGH_VAR_INIT should be used.
 * In other cases, NEIGH_VAR_SET should be used.
 *
 * NEIGH_VAR_INIT only stores the value; NEIGH_VAR_SET goes through
 * neigh_var_set() and therefore also sets the slot's data_state bit.
 */
#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val)
#define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val)

/* Set the data_state bit of every data[] slot of @p. */
static inline void neigh_parms_data_state_setall(struct neigh_parms *p)
{
        bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX);
}

/* Clear the data_state bit of every data[] slot of @p. */
static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p)
{
        bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX);
}

/*
 * Neighbour cache counters.  A table keeps them as per-CPU data
 * (neigh_table.stats) and bumps them through NEIGH_CACHE_STAT_INC().
 */
struct neigh_statistics {
        unsigned long allocs;                /* number of allocated neighs */
        unsigned long destroys;                /* number of destroyed neighs */
        unsigned long hash_grows;        /* number of hash resizes */

        unsigned long res_failed;        /* number of failed resolutions */

        unsigned long lookups;                /* number of lookups */
        unsigned long hits;                /* number of hits (among lookups) */

        unsigned long rcv_probes_mcast;        /* number of received mcast ipv6 */
        unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */

        unsigned long periodic_gc_runs;        /* number of periodic GC runs */
        unsigned long forced_gc_runs;        /* number of forced GC runs */

        unsigned long unres_discards;        /* number of unresolved drops */
        unsigned long table_fulls;      /* times even gc couldn't help */
};

#define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)

/*
 * One neighbour cache entry.  Entries hang off a hash bucket of their
 * table and are identified by (dev, primary_key).
 */
struct neighbour {
        /* next entry in the same hash bucket (RCU-protected chain) */
        struct neighbour __rcu        *next;
        /* owning table; tbl->entry_size locates the private area */
        struct neigh_table        *tbl;
        struct neigh_parms        *parms;
        unsigned long                confirmed;
        unsigned long                updated;
        rwlock_t                lock;
        refcount_t                refcnt;
        unsigned int                arp_queue_len_bytes;
        struct sk_buff_head        arp_queue;
        struct timer_list        timer;
        unsigned long                used;
        atomic_t                probes;
        __u8                        flags;
        /* NOTE(review): presumably a mask of NUD_* state bits — confirm */
        __u8                        nud_state;
        __u8                        type;
        __u8                        dead;
        u8                        protocol;
        seqlock_t                ha_lock;
        /* address storage, MAX_ADDR_LEN rounded up to a multiple of sizeof(long) */
        unsigned char                ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8);
        struct hh_cache                hh;
        int                        (*output)(struct neighbour *, struct sk_buff *);
        const struct neigh_ops        *ops;
        struct list_head        gc_list;
        struct rcu_head                rcu;
        /* device this entry belongs to; part of the lookup key */
        struct net_device        *dev;
        /* protocol key bytes, compared by the table's key_eq() callback */
        u8                        primary_key[0];
} __randomize_layout;

/*
 * Operations attached to a neighbour entry through neighbour.ops.
 * NOTE(review): when each callback is invoked is not visible in this
 * chunk; see the neighbour core for the exact semantics.
 */
struct neigh_ops {
        int                        family;
        void                        (*solicit)(struct neighbour *, struct sk_buff *);
        void                        (*error_report)(struct neighbour *, struct sk_buff *);
        int                        (*output)(struct neighbour *, struct sk_buff *);
        int                        (*connected_output)(struct neighbour *, struct sk_buff *);
};

/*
 * Entry of the singly linked chains rooted in neigh_table.phash_buckets.
 * NOTE(review): presumably a proxy-neighbour entry — confirm.
 */
struct pneigh_entry {
        struct pneigh_entry        *next;
        possible_net_t                net;
        struct net_device        *dev;
        u8                        flags;
        u8                        protocol;
        /* variable-length key, stored as 32-bit words */
        u32                        key[];
};

/*
 *        neighbour table manipulation
 */

#define NEIGH_NUM_HASH_RND        4

/*
 * Hash table of neighbour entries, published via neigh_table.nht under
 * RCU.  A bucket is selected by the top hash_shift bits of the 32-bit hash.
 */
struct neigh_hash_table {
        struct neighbour __rcu        **hash_buckets;
        unsigned int                hash_shift;
        /* random values passed to the table's hash() callback */
        __u32                        hash_rnd[NEIGH_NUM_HASH_RND];
        struct rcu_head                rcu;
};


/*
 * Per-protocol neighbour table: callbacks, default parameters, GC state
 * and the hash table of entries.
 */
struct neigh_table {
        int                        family;
        /* offset of the private area within an entry, see neighbour_priv() */
        unsigned int                entry_size;
        unsigned int                key_len;
        __be16                        protocol;
        /* hash of (pkey, dev) using the table's hash_rnd values */
        __u32                        (*hash)(const void *pkey,
                                        const struct net_device *dev,
                                        __u32 *hash_rnd);
        /* true when the entry's primary_key equals pkey */
        bool                        (*key_eq)(const struct neighbour *, const void *pkey);
        int                        (*constructor)(struct neighbour *);
        int                        (*pconstructor)(struct pneigh_entry *);
        void                        (*pdestructor)(struct pneigh_entry *);
        void                        (*proxy_redo)(struct sk_buff *skb);
        int                        (*is_multicast)(const void *pkey);
        bool                        (*allow_add)(const struct net_device *dev,
                                             struct netlink_ext_ack *extack);
        char                        *id;
        /* the table's own parameter set, and the list of parameter sets */
        struct neigh_parms        parms;
        struct list_head        parms_list;
        int                        gc_interval;
        int                        gc_thresh1;
        int                        gc_thresh2;
        int                        gc_thresh3;
        unsigned long                last_flush;
        struct delayed_work        gc_work;
        struct timer_list         proxy_timer;
        struct sk_buff_head        proxy_queue;
        atomic_t                entries;
        atomic_t                gc_entries;
        struct list_head        gc_list;
        rwlock_t                lock;
        unsigned long                last_rand;
        /* per-CPU counters, updated via NEIGH_CACHE_STAT_INC() */
        struct neigh_statistics        __percpu *stats;
        /* current hash table, RCU-protected */
        struct neigh_hash_table __rcu *nht;
        struct pneigh_entry        **phash_buckets;
};

/*
 * Table indices, as passed to neigh_table_init()/neigh_table_clear().
 * NEIGH_NR_TABLES counts the real tables; NEIGH_LINK_TABLE reuses that
 * value as a pseudo index.
 */
enum {
        NEIGH_ARP_TABLE = 0,
        NEIGH_ND_TABLE = 1,
        NEIGH_DN_TABLE = 2,
        NEIGH_NR_TABLES,
        NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
};

/* Family of the table that @p belongs to; @p->tbl must be set. */
static inline int neigh_parms_family(struct neigh_parms *p)
{
        return p->tbl->family;
}

#define NEIGH_PRIV_ALIGN        sizeof(long long)
#define NEIGH_ENTRY_SIZE(size)        ALIGN((size), NEIGH_PRIV_ALIGN)

/*
 * Address tbl->entry_size bytes past the start of @n.
 * NOTE(review): presumably the per-entry private area that follows the
 * neighbour and its key — confirm against the allocation code.
 */
static inline void *neighbour_priv(const struct neighbour *n)
{
        return (char *)n + n->tbl->entry_size;
}

/* flags for neigh_update() */
#define NEIGH_UPDATE_F_OVERRIDE                        0x00000001
#define NEIGH_UPDATE_F_WEAK_OVERRIDE                0x00000002
#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER        0x00000004
#define NEIGH_UPDATE_F_USE                        0x10000000
#define NEIGH_UPDATE_F_EXT_LEARNED                0x20000000
#define NEIGH_UPDATE_F_ISROUTER                        0x40000000
#define NEIGH_UPDATE_F_ADMIN                        0x80000000

extern const struct nla_policy nda_policy[];

/*
 * Compare the first 32 bits of @n's primary key with the 32 bits at @pkey.
 * Returns true when they are equal.
 */
static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = (const u32 *)pkey;

        return *stored == *wanted;
}

/*
 * Compare the first 128 bits of @n's primary key with the 128 bits at @pkey,
 * taken as four 32-bit words.  Returns true when all four words match.
 */
static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = pkey;
        u32 diff = 0;
        int word;

        /* OR together the differing bits of every word; there is no early exit. */
        for (word = 0; word < 4; word++)
                diff |= stored[word] ^ wanted[word];

        return diff == 0;
}

/*
 * Walk one hash chain of @tbl looking for the entry that belongs to @dev and
 * whose key matches @pkey according to @key_eq.
 *
 * The bucket is chosen from the top nht->hash_shift bits of
 * @hash(pkey, dev, nht->hash_rnd).  Table, bucket head and chain links are all
 * fetched with rcu_dereference_bh().  No reference count is touched here; the
 * matching entry, or NULL, is returned as found.
 */
static inline struct neighbour *___neigh_lookup_noref(
        struct neigh_table *tbl,
        bool (*key_eq)(const struct neighbour *n, const void *pkey),
        __u32 (*hash)(const void *pkey,
                      const struct net_device *dev,
                      __u32 *hash_rnd),
        const void *pkey,
        struct net_device *dev)
{
        struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
        struct neighbour *cur;
        u32 bucket;

        bucket = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
        cur = rcu_dereference_bh(nht->hash_buckets[bucket]);
        while (cur != NULL) {
                if (cur->dev == dev && key_eq(cur, pkey))
                        return cur;
                cur = rcu_dereference_bh(cur->next);
        }

        return NULL;
}

/*
 * Look up (@pkey, @dev) in @tbl using the table's own key_eq and hash
 * callbacks.  Returns what ___neigh_lookup_noref() returns.
 */
static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl,
                                                     const void *pkey,
                                                     struct net_device *dev)
{
        struct neighbour *found;

        found = ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev);
        return found;
}

/*
 * Table setup/teardown and entry lookup/creation.  Declarations only; the
 * implementations are not part of this header.
 * NOTE(review): @index presumably takes one of the NEIGH_*_TABLE values
 * defined above - confirm against the implementation.
 */
void neigh_table_init(int index, struct neigh_table *tbl);
int neigh_table_clear(int index, struct neigh_table *tbl);
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
                               struct net_device *dev);
/* neigh_create() below calls this with @want_ref set to true. */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
                                 struct net_device *dev, bool want_ref);
/*
 * Convenience wrapper: __neigh_create() with want_ref fixed to true.
 * The return value is passed through untouched.
 */
static inline struct neighbour *neigh_create(struct neigh_table *tbl,
                                             const void *pkey,
                                             struct net_device *dev)
{
        const bool want_ref = true;

        return __neigh_create(tbl, pkey, dev, want_ref);
}
/*
 * Entry lifetime, state update and output entry points.  Declarations only;
 * the implementations are not part of this header.
 */
/* Called by neigh_release() once the reference count reaches zero. */
void neigh_destroy(struct neighbour *neigh);
/* Slow path taken by neigh_event_send(). */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb);
/* @flags takes the NEIGH_UPDATE_F_* bits defined above. */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
                 u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);
/* The three *_output() functions share the (neigh, skb) -> int signature. */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
                                                u8 *lladdr, void *saddr,
                                                struct net_device *dev);

/* Allocation and release of a neigh_parms for a (device, table) pair. */
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
                                      struct neigh_table *tbl);
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms);

/* Return the struct net obtained by read_pnet() from parms->net. */
static inline
struct net *neigh_parms_net(const struct neigh_parms *parms)
{
        struct net *owner = read_pnet(&parms->net);

        return owner;
}

/* Declarations only; the implementations are not part of this header. */
unsigned long neigh_rand_reach_time(unsigned long base);

/*
 * pneigh_* entry points operating on struct pneigh_entry.
 * NOTE(review): @creat in pneigh_lookup() presumably requests creation when
 * no entry exists, as it does for __neigh_lookup() - confirm.
 */
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
                    struct sk_buff *skb);
struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
                                   const void *key, struct net_device *dev,
                                   int creat);
struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
                                     const void *key, struct net_device *dev);
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
                  struct net_device *dev);

/* Return the struct net obtained by read_pnet() from pneigh->net. */
static inline struct net *pneigh_net(const struct pneigh_entry *pneigh)
{
        struct net *owner = read_pnet(&pneigh->net);

        return owner;
}

/* Declarations only; the implementations are not part of this header. */
void neigh_app_ns(struct neighbour *n);
/* @cb receives each entry together with the opaque @cookie. */
void neigh_for_each(struct neigh_table *tbl,
                    void (*cb)(struct neighbour *, void *), void *cookie);
/*
 * NOTE(review): the int returned by @cb presumably decides whether the entry
 * is released - confirm against the implementation.
 */
void __neigh_for_each_release(struct neigh_table *tbl,
                              int (*cb)(struct neighbour *));
int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *);
void pneigh_for_each(struct neigh_table *tbl,
                     void (*cb)(struct pneigh_entry *));

/*
 * Iteration state.
 * NOTE(review): presumably the private data behind the neigh_seq_start() /
 * neigh_seq_next() / neigh_seq_stop() declarations - confirm.
 */
struct neigh_seq_state {
        struct seq_net_private p;
        struct neigh_table *tbl;
        struct neigh_hash_table *nht;
        /* Optional callback taking the state, an entry and a position. */
        void *(*neigh_sub_iter)(struct neigh_seq_state *state,
                                struct neighbour *n, loff_t *pos);
        unsigned int bucket;
        unsigned int flags;
/* Single-bit values defined alongside @flags. */
#define NEIGH_SEQ_NEIGH_ONLY        0x00000001
#define NEIGH_SEQ_IS_PNEIGH        0x00000002
#define NEIGH_SEQ_SKIP_NOARP        0x00000004
};
/*
 * seq_file iteration entry points.  Declarations only.
 * NOTE(review): the trailing unsigned int of neigh_seq_start() presumably
 * carries NEIGH_SEQ_* flags - confirm.
 */
void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *,
                      unsigned int);
void *neigh_seq_next(struct seq_file *, void *, loff_t *);
void neigh_seq_stop(struct seq_file *, void *);

/* Handlers that all share the (ctl, write, buffer, lenp, ppos) signature. */
int neigh_proc_dointvec(struct ctl_table *ctl, int write,
                        void *buffer, size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
                                void *buffer,
                                size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
                                   void *buffer, size_t *lenp, loff_t *ppos);

/* sysctl registration for one neigh_parms, and its inverse. */
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
                          proc_handler *proc_handler);
void neigh_sysctl_unregister(struct neigh_parms *p);

static inline void __neigh_parms_put(struct neigh_parms *parms)
{
        refcount_dec(&parms->refcnt);
}

static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms)
{
        refcount_inc(&parms->refcnt);
        return parms;
}

/*
 *        Neighbour references
 */

/*
 * Drop one reference on @neigh; the caller that drops the last one also
 * triggers neigh_destroy().  @neigh is dereferenced unconditionally.
 */
static inline void neigh_release(struct neighbour *neigh)
{
        if (!refcount_dec_and_test(&neigh->refcnt))
                return;

        neigh_destroy(neigh);
}

/*
 * Take a reference on @neigh when it is non-NULL and return @neigh either
 * way, so a NULL argument yields NULL.
 */
static inline struct neighbour * neigh_clone(struct neighbour *neigh)
{
        if (!neigh)
                return NULL;

        refcount_inc(&neigh->refcnt);
        return neigh;
}

/*
 * Take a reference by incrementing (n)->refcnt.  @n is dereferenced
 * unconditionally, unlike neigh_clone(), which tolerates NULL.
 */
#define neigh_hold(n)        refcount_inc(&(n)->refcnt)

/**
 * neigh_event_send - note use of a neighbour entry, resolving it if needed
 * @neigh: neighbour entry being used
 * @skb: packet handed on to __neigh_event_send() when the slow path is taken
 *
 * Stamps @neigh->used with the current jiffies; the store is skipped when the
 * field already holds that value.  If the entry is in none of the
 * NUD_CONNECTED, NUD_DELAY or NUD_PROBE states, the work is delegated to
 * __neigh_event_send() and its result is returned.  Otherwise returns 0.
 *
 * No lock is taken here, so nud_state is sampled with READ_ONCE(), the same
 * way neigh_output() reads it.
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
        unsigned long now = jiffies;

        if (READ_ONCE(neigh->used) != now)
                WRITE_ONCE(neigh->used, now);
        if (!(READ_ONCE(neigh->nud_state) &
              (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
                return __neigh_event_send(neigh, skb);
        return 0;
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/*
 * Copy part of the cached header in @hh into the space immediately before
 * skb->data.  The destination starts HH_DATA_ALIGN(ETH_HLEN) bytes before
 * skb->data and ETH_ALEN + HH_DATA_ALIGN(ETH_HLEN) - ETH_HLEN bytes are
 * copied.  The copy is repeated until read_seqretry() reports a consistent
 * snapshot.  Always returns 0.
 */
static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
{
        const unsigned int aligned = HH_DATA_ALIGN(ETH_HLEN);
        const unsigned int copy_len = ETH_ALEN + aligned - ETH_HLEN;
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&hh->hh_lock);
                memcpy(skb->data - aligned, hh->hh_data, copy_len);
                if (!read_seqretry(&hh->hh_lock, seq))
                        break;
        }

        return 0;
}
#endif

/*
 * neigh_hh_output - prepend the cached header from @hh to @skb and transmit
 * @hh: header cache; hh_len and hh_data are read under hh_lock's seq counter
 * @skb: packet to send
 *
 * The cached bytes are copied into the headroom just before skb->data.  When
 * hh_len is at most HH_DATA_MOD, exactly HH_DATA_MOD bytes are copied;
 * otherwise HH_DATA_ALIGN(hh_len) bytes are.  The copy is skipped when the
 * headroom is too small, in which case the packet is freed after a one-time
 * warning and NET_XMIT_DROP is returned.  Otherwise the skb is extended by
 * hh_len bytes and the result of dev_queue_xmit() is returned.
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
        unsigned int hh_alen = 0;
        unsigned int seq;
        unsigned int hh_len;

        /* Repeat the snapshot until read_seqretry() says it was consistent. */
        do {
                seq = read_seqbegin(&hh->hh_lock);
                hh_len = READ_ONCE(hh->hh_len);
                if (likely(hh_len <= HH_DATA_MOD)) {
                        hh_alen = HH_DATA_MOD;

                        /* skb_push() would proceed silently if we have room for
                         * the unaligned size but not for the aligned size:
                         * check headroom explicitly.
                         */
                        if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
                                /* this is inlined by gcc */
                                memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
                                       HH_DATA_MOD);
                        }
                } else {
                        hh_alen = HH_DATA_ALIGN(hh_len);

                        if (likely(skb_headroom(skb) >= hh_alen)) {
                                memcpy(skb->data - hh_alen, hh->hh_data,
                                       hh_alen);
                        }
                }
        } while (read_seqretry(&hh->hh_lock, seq));

        /* The copy above was skipped if headroom was short: drop the packet. */
        if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        /* Only hh_len bytes are exposed, not the aligned length that was copied. */
        __skb_push(skb, hh_len);
        return dev_queue_xmit(skb);
}

/*
 * Send @skb through neighbour @n.
 *
 * The cached-header path, neigh_hh_output(), is taken only when @skip_cache
 * is false, the entry is in a NUD_CONNECTED state and the cached header has a
 * non-zero length.  In every other case the entry's own output callback is
 * invoked.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
                               bool skip_cache)
{
        const struct hh_cache *hh = &n->hh;

        if (skip_cache)
                return n->output(n, skb);

        /* n->nud_state and hh->hh_len could be changed under us.
         * neigh_hh_output() is taking care of the race later.
         */
        if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED))
                return n->output(n, skb);
        if (!READ_ONCE(hh->hh_len))
                return n->output(n, skb);

        return neigh_hh_output(hh, skb);
}

/*
 * Look up (@pkey, @dev) in @tbl and, when @creat is non-zero, create the
 * entry if it is missing.  Returns NULL when nothing was found and creation
 * was not requested, or when neigh_create() reported an error.
 */
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
        struct neighbour *found = neigh_lookup(tbl, pkey, dev);

        if (found)
                return found;
        if (!creat)
                return NULL;

        found = neigh_create(tbl, pkey, dev);
        if (IS_ERR(found))
                return NULL;
        return found;
}

/*
 * Look up (@pkey, @dev) in @tbl, creating the entry when it is missing.
 * Unlike __neigh_lookup(), whatever neigh_create() returns is passed back
 * unchanged.
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
  struct net_device *dev)
{
        struct neighbour *found = neigh_lookup(tbl, pkey, dev);

        if (!found)
                found = neigh_create(tbl, pkey, dev);

        return found;
}

/* Layout that NEIGH_CB() imposes on an skb's cb area. */
struct neighbour_cb {
        unsigned long sched_next;
        /* NOTE(review): presumably holds LOCALLY_ENQUEUED - confirm. */
        unsigned int flags;
};

/* Single-bit flag value. */
#define LOCALLY_ENQUEUED 0x1

/* View (skb)->cb as a struct neighbour_cb. */
#define NEIGH_CB(skb)        ((struct neighbour_cb *)(skb)->cb)

/*
 * Copy dev->addr_len bytes of n->ha into @dst, repeating the copy until
 * read_seqretry() on n->ha_lock reports a consistent snapshot.
 */
static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
                                     const struct net_device *dev)
{
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&n->ha_lock);
                memcpy(dst, n->ha, dev->addr_len);
                if (!read_seqretry(&n->ha_lock, seq))
                        break;
        }
}

/*
 * Make the NTF_ROUTER bit of neigh->flags follow NEIGH_UPDATE_F_ISROUTER in
 * @flags.  *@notify is set to 1 only when the bit actually changes, and is
 * left untouched otherwise.
 */
static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags,
                                          int *notify)
{
        const bool want_router = (flags & NEIGH_UPDATE_F_ISROUTER) != 0;
        const bool is_router = (neigh->flags & NTF_ROUTER) != 0;

        if (want_router == is_router)
                return;

        if (want_router)
                neigh->flags |= NTF_ROUTER;
        else
                neigh->flags &= ~NTF_ROUTER;
        *notify = 1;
}
#endif

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 */

#ifndef _NET_IPV6_H
#define _NET_IPV6_H

#include <linux/ipv6.h>
#include <linux/hardirq.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/jump_label_ratelimit.h>
#include <net/if_inet6.h>
#include <net/ndisc.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/snmp.h>
#include <net/netns/hash.h>

/* NOTE(review): presumably the sockaddr_in6 length from RFC 2133 - confirm. */
#define SIN6_LEN_RFC2133        24

/* NOTE(review): presumably the largest payload length (16-bit maximum) - confirm. */
#define IPV6_MAXPLEN                65535

/*
 *        NextHeader field of IPv6 header
 */

/* Known Next Header values; each entry carries its own description. */
#define NEXTHDR_HOP                0        /* Hop-by-hop option header. */
#define NEXTHDR_TCP                6        /* TCP segment. */
#define NEXTHDR_UDP                17        /* UDP message. */
#define NEXTHDR_IPV6                41        /* IPv6 in IPv6 */
#define NEXTHDR_ROUTING                43        /* Routing header. */
#define NEXTHDR_FRAGMENT        44        /* Fragmentation/reassembly header. */
#define NEXTHDR_GRE                47        /* GRE header. */
#define NEXTHDR_ESP                50        /* Encapsulating security payload. */
#define NEXTHDR_AUTH                51        /* Authentication header. */
#define NEXTHDR_ICMP                58        /* ICMP for IPv6. */
#define NEXTHDR_NONE                59        /* No next header */
#define NEXTHDR_DEST                60        /* Destination options header. */
#define NEXTHDR_SCTP                132        /* SCTP message. */
#define NEXTHDR_MOBILITY        135        /* Mobility header. */

/* Largest value a one-byte nexthdr field (see struct frag_hdr) can hold. */
#define NEXTHDR_MAX                255

/* Default hop-limit values; the MCASTHOPS name suggests it applies to multicast. */
#define IPV6_DEFAULT_HOPLIMIT   64
#define IPV6_DEFAULT_MCASTHOPS        1

/* Limits on Hop-by-Hop and Destination options.
 *
 * Per RFC8200 there is no limit on the maximum number or lengths of options in
 * Hop-by-Hop or Destination options other than that the packet must fit in an MTU.
 * We allow configurable limits in order to mitigate potential denial of
 * service attacks.
 *
 * There are three limits that may be set:
 *   - Limit the number of options in a Hop-by-Hop or Destination options
 *     extension header
 *   - Limit the byte length of a Hop-by-Hop or Destination options extension
 *     header
 *   - Disallow unknown options
 *
 * The limits are expressed in corresponding sysctls:
 *
 * ipv6.sysctl.max_dst_opts_cnt
 * ipv6.sysctl.max_hbh_opts_cnt
 * ipv6.sysctl.max_dst_opts_len
 * ipv6.sysctl.max_hbh_opts_len
 *
 * max_*_opts_cnt is the number of TLVs that are allowed for Destination
 * options or Hop-by-Hop options. If the number is less than zero then unknown
 * TLVs are disallowed and the number of known options that are allowed is the
 * absolute value. Setting the value to INT_MAX indicates no limit.
 *
 * max_*_opts_len is the length limit in bytes of a Destination or
 * Hop-by-Hop options extension header. Setting the value to INT_MAX
 * indicates no length limit.
 *
 * If a limit is exceeded when processing an extension header the packet is
 * silently discarded.
 */

/*
 * Default limits for Hop-by-Hop and Destination options: eight options per
 * extension header, and no limit (INT_MAX) on the header length in bytes.
 */
#define IP6_DEFAULT_MAX_DST_OPTS_CNT         8
#define IP6_DEFAULT_MAX_HBH_OPTS_CNT         8
#define IP6_DEFAULT_MAX_DST_OPTS_LEN         INT_MAX /* No limit */
#define IP6_DEFAULT_MAX_HBH_OPTS_LEN         INT_MAX /* No limit */

/*
 *        Addr type
 *        
 *        type        -        unicast | multicast
 *        scope        -        local        | site            | global
 *        v4        -        compat
 *        v4mapped
 *        any
 *        loopback
 */

/* No bits set. */
#define IPV6_ADDR_ANY                0x0000U

#define IPV6_ADDR_UNICAST        0x0001U
#define IPV6_ADDR_MULTICAST        0x0002U

#define IPV6_ADDR_LOOPBACK        0x0010U
#define IPV6_ADDR_LINKLOCAL        0x0020U
#define IPV6_ADDR_SITELOCAL        0x0040U

#define IPV6_ADDR_COMPATv4        0x0080U

/* Covers the 0x00f0 nibble: the LOOPBACK, LINKLOCAL, SITELOCAL and COMPATv4 bits. */
#define IPV6_ADDR_SCOPE_MASK        0x00f0U

#define IPV6_ADDR_MAPPED        0x1000U

/*
 *        Addr scopes
 */
/* Low four bits of byte 1 of the address that @a points to. */
#define IPV6_ADDR_MC_SCOPE(a)        \
        ((a)->s6_addr[1] & 0x0f)        /* nonstandard */
/* Parenthesized so the negative constant expands safely inside any expression. */
#define __IPV6_ADDR_SCOPE_INVALID        (-1)
/* Scope values; all fit in the four bits that IPV6_ADDR_MC_SCOPE() extracts. */
#define IPV6_ADDR_SCOPE_NODELOCAL        0x01
#define IPV6_ADDR_SCOPE_LINKLOCAL        0x02
#define IPV6_ADDR_SCOPE_SITELOCAL        0x05
#define IPV6_ADDR_SCOPE_ORGLOCAL        0x08
#define IPV6_ADDR_SCOPE_GLOBAL                0x0e

/*
 *        Addr flags
 */
/*
 * Each macro tests one bit in the upper nibble of byte 1 of the address that
 * @a points to.  The result is the masked value (0 or the bit itself), not a
 * normalized 0/1.
 */
#define IPV6_ADDR_MC_FLAG_TRANSIENT(a)        \
        ((a)->s6_addr[1] & 0x10)
#define IPV6_ADDR_MC_FLAG_PREFIX(a)        \
        ((a)->s6_addr[1] & 0x20)
#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a)        \
        ((a)->s6_addr[1] & 0x40)

/*
 *        fragmentation header
 */

/*
 * Fragment header layout: two single-byte fields followed by a big-endian
 * 16-bit field and a big-endian 32-bit field.
 */
struct frag_hdr {
        __u8        nexthdr;
        __u8        reserved;
        __be16        frag_off;
        __be32        identification;
};

/*
 * NOTE(review): presumably masks for frag_hdr.frag_off (lowest bit, and the
 * upper 13 bits) - confirm against the users and the byte order they apply.
 */
#define        IP6_MF                0x0001
#define        IP6_OFFSET        0xFFF8

/* Iterator state filled in by ip6_fraglist_init() (it takes @iter as output). */
struct ip6_fraglist_iter {
        struct ipv6hdr        *tmp_hdr;
        /* Next skb to hand out; advanced by ip6_fraglist_next(). */
        struct sk_buff        *frag;
        int                offset;
        unsigned int        hlen;
        __be32                frag_id;
        u8                nexthdr;
};

/* Declarations only; the implementations are not part of this header. */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter);
void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter);

/*
 * Detach and return the skb currently held in iter->frag, leaving iter->frag
 * pointing at the skb that followed it.  iter->frag is dereferenced
 * unconditionally.
 */
static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
{
        struct sk_buff *cur = iter->frag;
        struct sk_buff *following = cur->next;

        iter->frag = following;
        skb_mark_not_on_list(cur);

        return cur;
}

/* State filled in by ip6_frag_init() and consumed by ip6_frag_next(). */
struct ip6_frag_state {
        u8                *prevhdr;
        unsigned int        hlen;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        int                hroom;
        int                troom;
        __be32                frag_id;
        u8                nexthdr;
};

/* Declarations only; the implementations are not part of this header. */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state);
struct sk_buff *ip6_frag_next(struct sk_buff *skb,
                              struct ip6_frag_state *state);

/* Yields @mark when the netns fwmark_reflect sysctl is non-zero, else 0. */
#define IP6_REPLY_MARK(net, mark) \
        ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)

#include <net/sock.h>

/* sysctl variables; defined outside this header. */
extern int sysctl_mld_max_msf;
extern int sysctl_mld_qrv;

/*
 * Increment counter @field of the @statname MIB twice over: on the device
 * (only when @idev is non-NULL) and on the netns.  @mod is a token prefix
 * pasted in front of SNMP_INC_STATS64 (empty or "__" in the wrappers below).
 * @idev is evaluated once.
 */
#define _DEVINC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\
        mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\
})

/*
 * Like _DEVINC, but the per device counters are atomic_long_t: the device
 * side (stats.<statname>dev) is bumped with SNMP_INC_STATS_ATOMIC_LONG, while
 * the netns side still uses @mod##SNMP_INC_STATS.
 */
#define _DEVINCATOMIC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\
})

/*
 * Per device and per net counters are both atomic_long_t, so both sides are
 * bumped with SNMP_INC_STATS_ATOMIC_LONG.  The device side is skipped when
 * @idev is NULL.  There is no @mod prefix argument.
 */
#define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field)                \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\
})

/*
 * Add @val to counter @field of the @statname MIB on the device (when @idev
 * is non-NULL) and on the netns, via @mod##SNMP_ADD_STATS.  @val appears in
 * both calls, so it may be evaluated twice.
 */
#define _DEVADD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \
        mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\
})

/*
 * Apply @mod##SNMP_UPD_PO_STATS with @val on the device (when @idev is
 * non-NULL) and on the netns.  @field is deliberately passed through without
 * parentheses, unlike in the sibling macros.
 * NOTE(review): presumably SNMP_UPD_PO_STATS token-pastes @field, which would
 * make parentheses invalid - confirm before changing.
 */
#define _DEVUPD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \
        mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
})

/* MIBs */

/*
 * Thin wrappers that fix the MIB name (ipv6 or icmpv6) and the @mod prefix.
 * The plain names pass an empty prefix and the "__" names pass "__", so they
 * expand to the SNMP_* and __SNMP_* helpers respectively.
 */
#define IP6_INC_STATS(net, idev,field)                \
                _DEVINC(net, ipv6, , idev, field)
#define __IP6_INC_STATS(net, idev,field)        \
                _DEVINC(net, ipv6, __, idev, field)
#define IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, , idev, field, val)
#define __IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, __, idev, field, val)
#define IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, , idev, field, val)
#define __IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, __, idev, field, val)
#define ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, , idev, field)
#define __ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, __, idev, field)

/*
 * Counters in the icmpv6msg MIB, bumped through _DEVINC_ATOMIC_ATOMIC.  For
 * the same @field, the OUT variant addresses the slot 256 above the one used
 * by the IN variant.  @field is parenthesized before the offset is added so
 * that an argument containing a lower-precedence operator (for example
 * "a & b") is offset as a whole.
 */
#define ICMP6MSGOUT_INC_STATS(net, idev, field)                \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, (field) + 256)
#define ICMP6MSGIN_INC_STATS(net, idev, field)        \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field)

/* Node of a singly linked list (via @next) tying a socket to a selector value. */
struct ip6_ra_chain {
        struct ip6_ra_chain        *next;
        struct sock                *sk;
        int                        sel;
        /* Callback taking a socket. */
        void                        (*destructor)(struct sock *);
};

/*
 * List head and a rwlock, both defined outside this header.
 * NOTE(review): the lock presumably guards the list - confirm.
 */
extern struct ip6_ra_chain        *ip6_ra_chain;
extern rwlock_t ip6_ra_lock;

/*
   This structure is prepared by the protocol when parsing
   ancillary data, and is then passed to IPv6.
 */

/*
 * Reference-counted set of extension-header pointers.  References are taken
 * by txopt_get() and dropped by txopt_put(), which frees the structure
 * through @rcu once the count reaches zero.
 */
struct ipv6_txoptions {
        refcount_t                refcnt;
        /* Length of this structure */
        int                        tot_len;

        /* length of extension headers   */

        __u16                        opt_flen;        /* after fragment hdr */
        __u16                        opt_nflen;        /* before fragment hdr */

        struct ipv6_opt_hdr        *hopopt;
        struct ipv6_opt_hdr        *dst0opt;
        struct ipv6_rt_hdr        *srcrt;        /* Routing Header */
        struct ipv6_opt_hdr        *dst1opt;
        /* Handed to kfree_rcu() by txopt_put(). */
        struct rcu_head                rcu;
        /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
};

/* flowlabel_reflect sysctl values: independent single-bit flags */
enum flowlabel_reflect {
        FLOWLABEL_REFLECT_ESTABLISHED                = 1 << 0,
        FLOWLABEL_REFLECT_TCP_RESET                = 1 << 1,
        FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES        = 1 << 2,
};

/* One flow label entry, chained to others through the RCU-annotated @next. */
struct ip6_flowlabel {
        struct ip6_flowlabel __rcu *next;
        __be32                        label;
        /* Decremented by fl6_sock_release(). */
        atomic_t                users;
        struct in6_addr                dst;
        struct ipv6_txoptions        *opt;
        unsigned long                linger;
        struct rcu_head                rcu;
        u8                        share;
        /* NOTE(review): which member is valid presumably depends on @share - confirm. */
        union {
                struct pid *pid;
                kuid_t uid;
        } owner;
        unsigned long                lastuse;
        unsigned long                expires;
        struct net                *fl_net;
};

/*
 * Masks are given as host-order constants passed through cpu_to_be32().
 * FLOWLABEL_MASK covers the low 20 bits and FLOWINFO_MASK the low 28.
 * STATELESS_FLAG is the top bit of the 20-bit label.
 */
#define IPV6_FLOWINFO_MASK                cpu_to_be32(0x0FFFFFFF)
#define IPV6_FLOWLABEL_MASK                cpu_to_be32(0x000FFFFF)
#define IPV6_FLOWLABEL_STATELESS_FLAG        cpu_to_be32(0x00080000)

/*
 * The FLOWINFO bits that are not part of the label (0x0FF00000 before
 * conversion).  The shift of 20 matches the lowest bit of that range.
 */
#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
#define IPV6_TCLASS_SHIFT        20

/* List node (RCU-annotated @next) that references one struct ip6_flowlabel. */
struct ipv6_fl_socklist {
        struct ipv6_fl_socklist        __rcu        *next;
        struct ip6_flowlabel                *fl;
        struct rcu_head                        rcu;
};

/*
 * Per-send parameters.  ipcm6_init() sets hlimit, tclass and dontfrag to -1
 * and zeroes the rest; ipcm6_init_sk() instead copies tclass and dontfrag
 * from the socket's ipv6_pinfo.
 */
struct ipcm6_cookie {
        struct sockcm_cookie sockc;
        __s16 hlimit;
        __s16 tclass;
        __s8  dontfrag;
        struct ipv6_txoptions *opt;
        __u16 gso_size;
};

static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
{
        *ipc6 = (struct ipcm6_cookie) {
                .hlimit = -1,
                .tclass = -1,
                .dontfrag = -1,
        };
}

static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
                                 const struct ipv6_pinfo *np)
{
        *ipc6 = (struct ipcm6_cookie) {
                .hlimit = -1,
                .tclass = np->tclass,
                .dontfrag = np->dontfrag,
        };
}

/*
 * Return np->opt with an extra reference held, or NULL when there are no
 * options or their reference count had already dropped to zero.  The pointer
 * is read and the reference taken inside an RCU read-side section.
 */
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
{
        struct ipv6_txoptions *opt;

        rcu_read_lock();
        opt = rcu_dereference(np->opt);
        if (opt && refcount_inc_not_zero(&opt->refcnt))
                opt = rcu_pointer_handoff(opt);
        else
                opt = NULL;
        rcu_read_unlock();

        return opt;
}

/*
 * Drop one reference on @opt (NULL is accepted and ignored).  The structure
 * is handed to kfree_rcu() when the last reference goes away.
 */
static inline void txopt_put(struct ipv6_txoptions *opt)
{
        if (!opt)
                return;

        if (refcount_dec_and_test(&opt->refcnt))
                kfree_rcu(opt, rcu);
}

#if IS_ENABLED(CONFIG_IPV6)
struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label);

extern struct static_key_false_deferred ipv6_flowlabel_exclusive;
/*
 * Look up @label for @sk, but only when the ipv6_flowlabel_exclusive static
 * key is enabled and the socket's netns has flowlabel_has_excl set.  In that
 * case the result of __fl6_sock_lookup() is returned, with a NULL result
 * turned into ERR_PTR(-ENOENT).  Otherwise returns NULL without looking
 * anything up.
 */
static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk,
                                                    __be32 label)
{
        struct ip6_flowlabel *fl;

        if (!static_branch_unlikely(&ipv6_flowlabel_exclusive.key))
                return NULL;
        if (!READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl))
                return NULL;

        fl = __fl6_sock_lookup(sk, label);
        if (!fl)
                return ERR_PTR(-ENOENT);
        return fl;
}
#endif

/*
 * Flow label entry points. These are declarations only; the definitions
 * live outside this header.
 */
struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
                                         struct ip6_flowlabel *fl,
                                         struct ipv6_txoptions *fopt);
void fl6_free_socklist(struct sock *sk);
int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen);
int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
                           int flags);
int ip6_flowlabel_init(void);
void ip6_flowlabel_cleanup(void);
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);

/* Drop one user count on @fl; a NULL @fl is ignored. */
static inline void fl6_sock_release(struct ip6_flowlabel *fl)
{
        if (!fl)
                return;

        atomic_dec(&fl->users);
}

/* ICMPv6 helpers (declarations only). */
void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info);

void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                struct icmp6hdr *thdr, int len);

int ip6_ra_control(struct sock *sk, int sel);

int ipv6_parse_hopopts(struct sk_buff *skb);

/*
 * Transmit-option helpers. Each of the following returns a
 * struct ipv6_txoptions pointer except ipv6_opt_accepted(), which returns
 * a bool.
 */
struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
                                        struct ipv6_txoptions *opt);
struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
                                          struct ipv6_txoptions *opt,
                                          int newtype,
                                          struct ipv6_opt_hdr *newopt);
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
                                          struct ipv6_txoptions *opt);

bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
                       const struct inet6_skb_parm *opt);
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
                                           struct ipv6_txoptions *opt);

/*
 * Should Router Advertisements be accepted on @idev?
 *
 * With forwarding enabled, RAs are only accepted in the special hybrid
 * mode (accept_ra == 2); without forwarding any non-zero accept_ra
 * enables them.
 */
static inline bool ipv6_accept_ra(struct inet6_dev *idev)
{
        if (idev->cnf.forwarding)
                return idev->cnf.accept_ra == 2;

        return idev->cnf.accept_ra;
}

/*
 * Fragment tunables; the trailing comments give the expanded values.
 * NOTE(review): presumably the default reassembly memory thresholds (bytes)
 * and the reassembly timeout (jiffies) - confirm against the users.
 */
#define IPV6_FRAG_HIGH_THRESH        (4 * 1024*1024)        /* 4194304 */
#define IPV6_FRAG_LOW_THRESH        (3 * 1024*1024)        /* 3145728 */
#define IPV6_FRAG_TIMEOUT        (60 * HZ)        /* 60 seconds */

int __ipv6_addr_type(const struct in6_addr *addr);

/* Type bits of @addr: the low 16 bits of __ipv6_addr_type(). */
static inline int ipv6_addr_type(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return full & 0xffff;
}

/* Scope bits of @addr: __ipv6_addr_type() masked with IPV6_ADDR_SCOPE_MASK. */
static inline int ipv6_addr_scope(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return full & IPV6_ADDR_SCOPE_MASK;
}

/*
 * Extract the source scope from a full address type value: the upper
 * bits (type >> 16), or __IPV6_ADDR_SCOPE_INVALID for IPV6_ADDR_ANY.
 */
static inline int __ipv6_addr_src_scope(int type)
{
        if (type == IPV6_ADDR_ANY)
                return __IPV6_ADDR_SCOPE_INVALID;

        return type >> 16;
}

/* Source scope of @addr, derived from its full __ipv6_addr_type() value. */
static inline int ipv6_addr_src_scope(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return __ipv6_addr_src_scope(full);
}

/*
 * Does an address of the given @type need a scope id?
 *
 * True for link-local addresses, and for multicast addresses that also
 * carry the loopback or link-local type bit.
 */
static inline bool __ipv6_addr_needs_scope_id(int type)
{
        if (type & IPV6_ADDR_LINKLOCAL)
                return true;

        if (!(type & IPV6_ADDR_MULTICAST))
                return false;

        return (type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL)) != 0;
}

/*
 * Scope id to use for @addr: @iface when the address type needs a scope
 * id (see __ipv6_addr_needs_scope_id()), 0 otherwise.
 */
static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface)
{
        if (!__ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)))
                return 0;

        return iface;
}

/*
 * Byte-wise comparison of two addresses with memcmp() semantics:
 * 0 when equal, otherwise negative or positive according to the first
 * differing byte.
 */
static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2)
{
        const size_t nbytes = sizeof(*a1);

        return memcmp(a1, a2, nbytes);
}

/*
 * Compare @a1 and @a2 under mask @m.
 *
 * Returns true when the two addresses DIFFER in at least one bit that is
 * set in @m, false when all masked bits match.
 */
static inline bool
ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m,
                     const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *mask = (const unsigned long *)m;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff;

        diff  = (lhs[0] ^ rhs[0]) & mask[0];
        diff |= (lhs[1] ^ rhs[1]) & mask[1];

        return diff != 0;
#else
        int i;

        for (i = 0; i < 4; i++) {
                if ((a1->s6_addr32[i] ^ a2->s6_addr32[i]) & m->s6_addr32[i])
                        return true;
        }

        return false;
#endif
}

/*
 * Store the first @plen bits of @addr in @pfx and clear all remaining
 * bits of @pfx.
 *
 * The caller must guarantee 0 <= plen <= 128.
 */
static inline void ipv6_addr_prefix(struct in6_addr *pfx,
                                    const struct in6_addr *addr,
                                    int plen)
{
        const int whole_bytes = plen >> 3;
        const int extra_bits = plen & 0x7;

        memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr));
        memcpy(pfx->s6_addr, addr, whole_bytes);

        if (extra_bits) {
                /* keep only the top extra_bits of the partial byte */
                const int keep = 0xff00 >> extra_bits;

                pfx->s6_addr[whole_bytes] = addr->s6_addr[whole_bytes] & keep;
        }
}

/*
 * Overwrite the first @plen bits of @addr with the first @plen bits of
 * @pfx; the remaining bits of @addr are left untouched.
 *
 * The caller must guarantee 0 <= plen <= 128.
 */
static inline void ipv6_addr_prefix_copy(struct in6_addr *addr,
                                         const struct in6_addr *pfx,
                                         int plen)
{
        const int whole_bytes = plen >> 3;
        const int extra_bits = plen & 0x7;

        memcpy(addr->s6_addr, pfx, whole_bytes);

        if (extra_bits) {
                /* top extra_bits of the partial byte come from the prefix */
                const int from_pfx = 0xff00 >> extra_bits;

                addr->s6_addr[whole_bytes] &= ~from_pfx;
                addr->s6_addr[whole_bytes] |=
                        (pfx->s6_addr[whole_bytes] & from_pfx);
        }
}

/*
 * Store two 32-bit words at @addr: @wh lands in addr[0], @wl in addr[1].
 *
 * On 64-bit builds with efficient unaligned access, and only when both
 * words are compile-time constants, the pair is written with a single
 * 64-bit store. The two halves are combined according to the byte order
 * so that the memory layout matches the two-store fallback below.
 */
static inline void __ipv6_addr_set_half(__be32 *addr,
                                        __be32 wh, __be32 wl)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
#if defined(__BIG_ENDIAN)
        if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) {
                /* big endian: the high half of the u64 sits at the lower address */
                *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl));
                return;
        }
#elif defined(__LITTLE_ENDIAN)
        if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) {
                /* little endian: the low half of the u64 sits at the lower address */
                *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh));
                return;
        }
#endif
#endif
        /* generic path: two separate 32-bit stores */
        addr[0] = wh;
        addr[1] = wl;
}

/* Fill @addr with the four 32-bit words @w1..@w4, in that order. */
static inline void ipv6_addr_set(struct in6_addr *addr,
                                     __be32 w1, __be32 w2,
                                     __be32 w3, __be32 w4)
{
        __be32 *words = addr->s6_addr32;

        __ipv6_addr_set_half(words, w1, w2);
        __ipv6_addr_set_half(words + 2, w3, w4);
}

/* True when @a1 and @a2 hold the same 128-bit address. */
static inline bool ipv6_addr_equal(const struct in6_addr *a1,
                                   const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff;

        /* two 64-bit words cover the whole address */
        diff  = lhs[0] ^ rhs[0];
        diff |= lhs[1] ^ rhs[1];

        return diff == 0UL;
#else
        return a1->s6_addr32[0] == a2->s6_addr32[0] &&
               a1->s6_addr32[1] == a2->s6_addr32[1] &&
               a1->s6_addr32[2] == a2->s6_addr32[2] &&
               a1->s6_addr32[3] == a2->s6_addr32[3];
#endif
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/*
 * Compare the leading @len bits of the two 64-bit halves @a1 and @a2.
 * A @len of 0 always matches. Callers in this file pass len <= 64.
 */
static inline bool __ipv6_prefix_equal64_half(const __be64 *a1,
                                              const __be64 *a2,
                                              unsigned int len)
{
        __be64 mask;

        if (!len)
                return true;

        mask = cpu_to_be64((~0UL) << (64 - len));

        return !((*a1 ^ *a2) & mask);
}

/*
 * True when @addr1 and @addr2 agree in their first @prefixlen bits
 * (64-bit variant: the address is handled as two 64-bit halves).
 */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be64 *a1 = (const __be64 *)addr1;
        const __be64 *a2 = (const __be64 *)addr2;

        /* prefix ends inside the first half */
        if (prefixlen < 64)
                return __ipv6_prefix_equal64_half(a1, a2, prefixlen);

        /* first half must match completely, then check the remainder */
        if (a1[0] ^ a2[0])
                return false;

        return __ipv6_prefix_equal64_half(&a1[1], &a2[1], prefixlen - 64);
}
#else
/*
 * True when @addr1 and @addr2 agree in their first @prefixlen bits
 * (generic variant working on 32-bit words).
 */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be32 *a1 = addr1->s6_addr32;
        const __be32 *a2 = addr2->s6_addr32;
        const unsigned int whole_words = prefixlen >> 5;
        const unsigned int tail_bits = prefixlen & 0x1f;

        /* words that lie completely inside the prefix */
        if (whole_words && memcmp(a1, a2, whole_words << 2))
                return false;

        /* no partially covered word left to check */
        if (!tail_bits)
                return true;

        return !((a1[whole_words] ^ a2[whole_words]) &
                 htonl((0xffffffff) << (32 - tail_bits)));
}
#endif

/* True when every bit of @a is zero (the unspecified address). */
static inline bool ipv6_addr_any(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *halves = (const unsigned long *)a;

        return !(halves[0] | halves[1]);
#else
        return !(a->s6_addr32[0] | a->s6_addr32[1] |
                 a->s6_addr32[2] | a->s6_addr32[3]);
#endif
}

/*
 * Cheap, unkeyed hash of @a: the XOR of its four 32-bit words (computed
 * through two 64-bit loads where that is efficient).
 */
static inline u32 ipv6_addr_hash(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *halves = (const unsigned long *)a;
        unsigned long folded = halves[0] ^ halves[1];

        /* fold the upper 32 bits onto the lower 32 bits */
        folded ^= folded >> 32;

        return (u32)folded;
#else
        return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1]) ^
               (__force u32)(a->s6_addr32[2] ^ a->s6_addr32[3]);
#endif
}

/* more secured version of ipv6_addr_hash() */
static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval)
{
        return jhash2((__force const u32 *)a->s6_addr32,
                      ARRAY_SIZE(a->s6_addr32), initval);
}

/* True when @a is ::1, i.e. all bits zero except the very last one. */
static inline bool ipv6_addr_loopback(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const __be64 *halves = (const __be64 *)a;

        if (halves[0])
                return false;

        return halves[1] == cpu_to_be64(1);
#else
        if (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2])
                return false;

        return a->s6_addr32[3] == cpu_to_be32(1);
#endif
}

/*
 * True when @a is an IPv4-mapped address: the first two 32-bit words are
 * zero and the third word equals 0x0000ffff in network byte order. The
 * fourth word is not examined.
 *
 * Note that we must __force cast these to unsigned long to make sparse happy,
 * since all of the endian-annotated types are fixed size regardless of arch.
 */
static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* one 64-bit load covers the first two words; keep the const qualifier */
        const unsigned long upper = *(const unsigned long *)a;
#else
        const unsigned long upper =
                (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]);
#endif
        /* zero only when the third word is exactly the 0x0000ffff marker */
        const unsigned long marker =
                (__force unsigned long)(a->s6_addr32[2] ^
                                        cpu_to_be32(0x0000ffff));

        return (upper | marker) == 0UL;
}

/*
 * True when @a is an IPv4-mapped address whose embedded IPv4 address
 * (the last 32-bit word) satisfies ipv4_is_loopback().
 */
static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
{
        if (!ipv6_addr_v4mapped(a))
                return false;

        return ipv4_is_loopback(a->s6_addr32[3]);
}

static inline u32 ipv6_portaddr_hash(const struct net *net,
                                     const struct in6_addr *addr6,
                                     unsigned int port)
{
        unsigned int hash, mix = net_hash_mix(net);

        if (ipv6_addr_any(addr6))
                hash = jhash_1word(0, mix);
        else if (ipv6_addr_v4mapped(addr6))
                hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
        else
                hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);

        return hash ^ port;
}

/*
 * Check for a RFC 4843 ORCHID address
 * (Overlay Routable Cryptographic Hash Identifiers):
 * the top 28 bits of the address must equal 0x2001001.
 */
static inline bool ipv6_addr_orchid(const struct in6_addr *a)
{
        if ((a->s6_addr32[0] & htonl(0xfffffff0)) != htonl(0x20010010))
                return false;

        return true;
}

/* True when the top eight bits of @addr are all ones (ff00::/8). */
static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr)
{
        if ((addr->s6_addr32[0] & htonl(0xFF000000)) != htonl(0xFF000000))
                return false;

        return true;
}

/*
 * Build the IPv4-mapped IPv6 address for @addr in @v4mapped:
 * words 0 and 1 are zero, word 2 is 0x0000ffff, word 3 is @addr.
 */
static inline void ipv6_addr_set_v4mapped(const __be32 addr,
                                          struct in6_addr *v4mapped)
{
        const __be32 marker = htonl(0x0000FFFF);

        ipv6_addr_set(v4mapped, 0, 0, marker, addr);
}

/*
 * Find the first bit in which two addresses differ, scanning 32-bit words.
 * @addrlen is in bytes and must be a multiple of 4.
 *
 * Returns the position of the first differing bit, counted from the most
 * significant bit of the first word (0-based), or the address length in
 * bits when the two tokens are equal.
 */
static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen)
{
        const __be32 *a1 = token1, *a2 = token2;
        const int nwords = addrlen >> 2;
        int i;

        for (i = 0; i < nwords; i++) {
                __be32 xb = a1[i] ^ a2[i];

                if (!xb)
                        continue;

                return i * 32 + 31 - __fls(ntohl(xb));
        }

        /*
         *        We should *never* get to this point since that
         *        would mean the addrs are equal.
         *
         *        However, we do get to it 8) And exactly when the
         *        addresses are equal 8)
         *
         *        ip route add 1111::/128 via ...
         *        ip route add 1111::/64 via ...
         *        and we are here.
         *
         *        Ideally, this function should stop comparison
         *        at prefix length. It does not, but it is still OK,
         *        if returned value is greater than prefix length.
         *                                        --ANK (980803)
         */
        return nwords << 5;
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/*
 * 64-bit-word variant of __ipv6_addr_diff32(). @addrlen is in bytes and is
 * consumed in units of 8. Returns the position of the first differing bit
 * counted from the most significant bit of the first word, or the number
 * of bits scanned when no difference is found.
 */
static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen)
{
        const __be64 *a1 = token1, *a2 = token2;
        const int nwords = addrlen >> 3;
        int i;

        for (i = 0; i < nwords; i++) {
                __be64 xb = a1[i] ^ a2[i];

                if (!xb)
                        continue;

                return i * 64 + 63 - __fls(be64_to_cpu(xb));
        }

        return nwords << 6;
}
#endif

/*
 * First differing bit between two tokens of @addrlen bytes.
 *
 * Uses the 64-bit scanner when it is available and @addrlen is a
 * compile-time constant that is a multiple of 8; otherwise falls back to
 * the 32-bit scanner.
 */
static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        if (__builtin_constant_p(addrlen) && (addrlen & 7) == 0)
                return __ipv6_addr_diff64(token1, token2, addrlen);
#endif

        return __ipv6_addr_diff32(token1, token2, addrlen);
}

static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2)
{
        return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
}

/*
 * Identification selection helpers; both return a __be32 value.
 * NOTE(review): presumably the fragment header identification - confirm.
 */
__be32 ipv6_select_ident(struct net *net,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr);
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);

/* Used by ip6_sk_dst_hoplimit() when the socket value is negative. */
int ip6_dst_hoplimit(struct dst_entry *dst);

/*
 * Hop limit to use for a packet sent through @np to fl6->daddr.
 *
 * Multicast destinations use np->mcast_hops, everything else uses
 * np->hop_limit. A negative socket value defers to ip6_dst_hoplimit().
 */
static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
                                      struct dst_entry *dst)
{
        int hlimit = ipv6_addr_is_multicast(&fl6->daddr) ? np->mcast_hops
                                                         : np->hop_limit;

        return hlimit < 0 ? ip6_dst_hoplimit(dst) : hlimit;
}

/*
 * Copy the IPv6 source and destination addresses of @iph into @flow with
 * one memcpy() and tag the flow as carrying IPv6 addresses.
 *
 * Equivalent to :        flow->v6addrs.src = iph->saddr;
 *                        flow->v6addrs.dst = iph->daddr;
 *
 * The build-time check guarantees that dst directly follows src inside
 * the flow key, which is what makes the single copy valid.
 */
static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
                                            const struct ipv6hdr *iph)
{
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) !=
                     offsetof(typeof(flow->addrs), v6addrs.src) +
                     sizeof(flow->addrs.v6addrs.src));

        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        memcpy(&flow->addrs.v6addrs, &iph->addrs,
               sizeof(flow->addrs.v6addrs));
}

#if IS_ENABLED(CONFIG_IPV6)

/*
 * May a socket bind to a non-local address? True when the netns sysctl
 * ip_nonlocal_bind is set, or the socket has freebind or transparent set.
 */
static inline bool ipv6_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        if (net->ipv6.sysctl.ip_nonlocal_bind)
                return true;

        return inet->freebind || inet->transparent;
}

/*
 * Sysctl settings for net ipv6.auto_flowlabels
 *
 * As consumed in this header:
 *   OFF    - ip6_make_flowlabel() never generates a label;
 *            ip6_default_np_autolabel() returns 0.
 *   OPTOUT - ip6_default_np_autolabel() returns 1.
 *   OPTIN  - ip6_default_np_autolabel() returns 0.
 *   FORCED - ip6_make_flowlabel() generates a label even when its
 *            autolabel argument is false;
 *            ip6_default_np_autolabel() returns 1.
 */
#define IP6_AUTO_FLOW_LABEL_OFF                0
#define IP6_AUTO_FLOW_LABEL_OPTOUT        1
#define IP6_AUTO_FLOW_LABEL_OPTIN        2
#define IP6_AUTO_FLOW_LABEL_FORCED        3

/* Highest valid setting. */
#define IP6_AUTO_FLOW_LABEL_MAX                IP6_AUTO_FLOW_LABEL_FORCED

#define IP6_DEFAULT_AUTO_FLOW_LABELS        IP6_AUTO_FLOW_LABEL_OPTOUT

/*
 * Choose the flow label for an outgoing packet.
 *
 * @flowlabel is first reduced to its flow label bits. It is returned as
 * is when it is non-zero, when automatic labels are switched off, or when
 * @autolabel is false and the sysctl is not in FORCED mode. Otherwise a
 * label is derived from the skb flow hash, with
 * IPV6_FLOWLABEL_STATELESS_FLAG added when flowlabel_state_ranges is set.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        u32 hash;

        /* @flowlabel may include more than a flow label, eg, the traffic class.
         * Here we want only the flow label value.
         */
        flowlabel &= IPV6_FLOWLABEL_MASK;

        /* an explicit label always wins */
        if (flowlabel)
                return flowlabel;

        if (net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF)
                return flowlabel;

        if (!autolabel &&
            net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)
                return flowlabel;

        /* Since this is being sent on the wire obfuscate hash a bit
         * to minimize possibility that any useful information to an
         * attacker is leaked. Only lower 20 bits are relevant.
         */
        hash = rol32(skb_get_hash_flowi6(skb, fl6), 16);

        flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
        if (net->ipv6.sysctl.flowlabel_state_ranges)
                flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;

        return flowlabel;
}

/*
 * Default autolabel setting derived from the auto_flowlabels sysctl:
 * 1 for OPTOUT and FORCED, 0 for OFF, OPTIN and any other value.
 */
static inline int ip6_default_np_autolabel(struct net *net)
{
        switch (net->ipv6.sysctl.auto_flowlabels) {
        case IP6_AUTO_FLOW_LABEL_OPTOUT:
        case IP6_AUTO_FLOW_LABEL_FORCED:
                return 1;
        default:
                /* includes IP6_AUTO_FLOW_LABEL_OFF and IP6_AUTO_FLOW_LABEL_OPTIN */
                return 0;
        }
}
#else
/*
 * Stub for builds without CONFIG_IPV6: hands @flowlabel back unchanged
 * (it is not masked) and never generates a label.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        return flowlabel;
}
/* Stub for builds without CONFIG_IPV6: always returns 0. */
static inline int ip6_default_np_autolabel(struct net *net)
{
        return 0;
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* Current value of the per-netns multipath_hash_policy sysctl. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return net->ipv6.sysctl.multipath_hash_policy;
}
#else
/* Without CONFIG_IPV6 the policy is always reported as 0. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return 0;
}
#endif

/*
 *        Header manipulation
 */

/*
 * Write the first 32-bit word of @hdr: version nibble 6 (0x60000000),
 * @tclass shifted into place, and the network-order @flowlabel ORed in.
 */
static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass,
                                __be32 flowlabel)
{
        __be32 *first_word = (__be32 *)hdr;

        *first_word = htonl(0x60000000 | (tclass << 20)) | flowlabel;
}

static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWINFO_MASK;
}

static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
}

/* Extract the traffic class from a network-order flowinfo word. */
static inline u8 ip6_tclass(__be32 flowinfo)
{
        flowinfo &= IPV6_TCLASS_MASK;

        return ntohl(flowinfo) >> IPV6_TCLASS_SHIFT;
}

/*
 * Combine a host-order @tclass and a network-order @flowlabel into one
 * network-order flowinfo word.
 */
static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
        flowlabel |= htonl(tclass << IPV6_TCLASS_SHIFT);

        return flowlabel;
}

/* Flow label bits of fl6->flowlabel (masked with IPV6_FLOWLABEL_MASK). */
static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
{
        __be32 label = fl6->flowlabel;

        label &= IPV6_FLOWLABEL_MASK;

        return label;
}

/*
 *        Prototypes exported by ipv6
 */

/*
 *        rcv function (called from netdevice level)
 */

int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
             struct packet_type *pt, struct net_device *orig_dev);
/* List variant: takes a list_head instead of a single skb. */
void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
                   struct net_device *orig_dev);

int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        upper-layer output functions
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);

/*
 * ip6_append_data() and ip6_make_skb() share the same getfrag callback
 * type and the same from/length/transhdrlen/ipc6/fl6/rt/flags arguments;
 * ip6_make_skb() additionally takes a caller-supplied cork and returns an
 * skb pointer instead of an int.
 */
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, size_t length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags);

int ip6_push_pending_frames(struct sock *sk);

void ip6_flush_pending_frames(struct sock *sk);

int ip6_send_skb(struct sk_buff *skb);

/* Called by ip6_finish_skb() with the socket's write queue and corks. */
struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork);
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, size_t length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork);

/*
 * Call __ip6_make_skb() on the socket's own write queue, using the cork
 * state stored in the inet and inet6 parts of @sk, and return its result.
 */
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
{
        struct inet_cork_full *cork = &inet_sk(sk)->cork;
        struct inet6_cork *v6_cork = &inet6_sk(sk)->cork;

        return __ip6_make_skb(sk, &sk->sk_write_queue, cork, v6_cork);
}

/*
 * Route lookup helpers. ip6_dst_lookup() returns an int and takes a
 * struct dst_entry ** (presumably the output slot - confirm); the other
 * variants return the dst_entry pointer directly.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6);
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst);
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected);
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct net *net, struct socket *sock,
                                        struct in6_addr *saddr,
                                        const struct ip_tunnel_info *info,
                                        u8 protocol, bool use_cache);
struct dst_entry *ip6_blackhole_route(struct net *net,
                                      struct dst_entry *orig_dst);

/*
 *        skb processing functions
 */

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_forward(struct sk_buff *skb);
int ip6_input(struct sk_buff *skb);
int ip6_mc_input(struct sk_buff *skb);
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
                              bool have_final);

int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        Extension header (options) processing
 */

void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                          u8 *proto, struct in6_addr **daddr_p,
                          struct in6_addr *saddr);
void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                         u8 *proto);

int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
                     __be16 *frag_offp);

bool ipv6_ext_hdr(u8 nexthdr);

/*
 * IP6_FH_F_* single-bit flags.
 * NOTE(review): presumably exchanged through the @fragflg argument of
 * ipv6_find_hdr() - confirm against its implementation.
 */
enum {
        IP6_FH_F_FRAG                = (1 << 0),
        IP6_FH_F_AUTH                = (1 << 1),
        IP6_FH_F_SKIP_RH        = (1 << 2),
};

/* find specified header and get offset to it */
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
                  unsigned short *fragoff, int *fragflg);

int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);

struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
                                const struct ipv6_txoptions *opt,
                                struct in6_addr *orig);

/*
 *        socket options (ipv6_sockglue.c)
 */

/* setsockopt takes a sockptr_t, getsockopt plain __user pointers. */
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                    unsigned int optlen);
int ipv6_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen);

/* Datagram connect helpers. */
int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
                           int addr_len);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
                                 int addr_len);
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
void ip6_datagram_release_cb(struct sock *sk);

/* Error and path-MTU reporting. */
int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
                    int *addr_len);
int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
                     int *addr_len);
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                     u32 info, u8 *payload);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);

/* inet6 socket-layer operations. */
void inet6_cleanup_sock(struct sock *sk);
void inet6_sock_destruct(struct sock *sk);
int inet6_release(struct socket *sock);
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                  int peer);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet6_compat_ioctl(struct socket *sock, unsigned int cmd,
                unsigned long arg);

int inet6_hash_connect(struct inet_timewait_death_row *death_row,
                              struct sock *sk);
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                  int flags);

/*
 * reassembly.c
 *
 * NOTE(review): despite this heading, the declarations below are the
 * inet6 proto_ops tables and the ip6_mc_* source-filter helpers - confirm
 * where they are defined and retitle the section accordingly.
 */
extern const struct proto_ops inet6_stream_ops;
extern const struct proto_ops inet6_dgram_ops;
extern const struct proto_ops inet6_sockraw_ops;

/* Forward declarations: only pointers to these types are used below. */
struct group_source_req;
struct group_filter;

int ip6_mc_source(int add, int omode, struct sock *sk,
                  struct group_source_req *pgsr);
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
                  struct sockaddr_storage *list);
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
                  struct sockaddr_storage __user *p);

/*
 * procfs hooks. With CONFIG_PROC_FS disabled only ac6_proc_init/exit and
 * snmp6_register/unregister_dev get inline no-op stubs here; the other
 * *_proc_init/exit functions have no fallback in this header.
 */
#ifdef CONFIG_PROC_FS
int ac6_proc_init(struct net *net);
void ac6_proc_exit(struct net *net);
int raw6_proc_init(void);
void raw6_proc_exit(void);
int tcp6_proc_init(struct net *net);
void tcp6_proc_exit(struct net *net);
int udp6_proc_init(struct net *net);
void udp6_proc_exit(struct net *net);
int udplite6_proc_init(void);
void udplite6_proc_exit(void);
int ipv6_misc_proc_init(void);
void ipv6_misc_proc_exit(void);
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);

#else
/* Stubs: report success (0) and do nothing. */
static inline int ac6_proc_init(struct net *net) { return 0; }
static inline void ac6_proc_exit(struct net *net) { }
static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; }
static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
#endif

/* sysctl registration hooks; declared only when CONFIG_SYSCTL is set. */
#ifdef CONFIG_SYSCTL
struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
struct ctl_table *ipv6_route_sysctl_init(struct net *net);
int ipv6_sysctl_register(void);
void ipv6_sysctl_unregister(void);
#endif

/*
 * Per-socket multicast membership, keyed by interface index and group
 * address; the _ssm variant takes an additional @mode argument.
 */
int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
                          const struct in6_addr *addr, unsigned int mode);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);

/*
 * Mark @sk as IPv6-only, under the socket lock.
 *
 * Returns 0 on success, or -EINVAL (leaving the flag untouched) when the
 * socket already has a non-zero inet_num.
 */
static inline int ip6_sock_set_v6only(struct sock *sk)
{
        int ret = -EINVAL;

        lock_sock(sk);
        if (!inet_sk(sk)->inet_num) {
                sk->sk_ipv6only = true;
                ret = 0;
        }
        release_sock(sk);

        return ret;
}

static inline void ip6_sock_set_recverr(struct sock *sk)
{
        lock_sock(sk);
        inet6_sk(sk)->recverr = true;
        release_sock(sk);
}

/*
 * Apply the IPV6_PREFER_SRC_* request @val to the socket's srcprefs.
 *
 * @val is validated in three independent groups (PUBLIC/TMP/
 * PUBTMP_DEFAULT, HOME/COA, CGA/NONCGA). Selecting more than one flag
 * within a group returns -EINVAL and leaves srcprefs untouched. On
 * success the bits collected in the clear-mask are removed from srcprefs,
 * the requested bits are set, and 0 is returned.
 *
 * Takes no lock itself; ip6_sock_set_addr_preferences() is the wrapper
 * that holds the socket lock around this call.
 */
static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
{
        unsigned int prefmask = ~0;
        unsigned int pref = 0;
        int sel;

        /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
        sel = val & (IPV6_PREFER_SRC_PUBLIC |
                     IPV6_PREFER_SRC_TMP |
                     IPV6_PREFER_SRC_PUBTMP_DEFAULT);
        if (sel == IPV6_PREFER_SRC_PUBLIC) {
                pref |= IPV6_PREFER_SRC_PUBLIC;
                prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
                              IPV6_PREFER_SRC_TMP);
        } else if (sel == IPV6_PREFER_SRC_TMP) {
                pref |= IPV6_PREFER_SRC_TMP;
                prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
                              IPV6_PREFER_SRC_TMP);
        } else if (sel == IPV6_PREFER_SRC_PUBTMP_DEFAULT) {
                /* back to the default: clear both bits, set neither */
                prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
                              IPV6_PREFER_SRC_TMP);
        } else if (sel != 0) {
                return -EINVAL;
        }

        /* check HOME/COA conflicts */
        sel = val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA);
        if (sel == IPV6_PREFER_SRC_HOME)
                prefmask &= ~IPV6_PREFER_SRC_COA;
        else if (sel == IPV6_PREFER_SRC_COA)
                pref |= IPV6_PREFER_SRC_COA;
        else if (sel != 0)
                return -EINVAL;

        /* check CGA/NONCGA conflicts: only validated, nothing is stored */
        sel = val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA);
        if (sel != 0 &&
            sel != IPV6_PREFER_SRC_CGA &&
            sel != IPV6_PREFER_SRC_NONCGA)
                return -EINVAL;

        inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref;
        return 0;
}

/*
 * Locked wrapper: runs __ip6_sock_set_addr_preferences() with the socket
 * lock held and returns its result (0 or -EINVAL).
 */
static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
{
        int err;

        lock_sock(sk);
        err = __ip6_sock_set_addr_preferences(sk, val);
        release_sock(sk);

        return err;
}

static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
{
        lock_sock(sk);
        inet6_sk(sk)->rxopt.bits.rxinfo = true;
        release_sock(sk);
}

#endif /* _NET_IPV6_H */











































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
// SPDX-License-Identifier: GPL-2.0
/*
 * The class-specific portions of the driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 * Copyright (c) 2012-2019 Linux Foundation
 *
 * See Documentation/driver-api/driver-model/ for more information.
 */

#ifndef _DEVICE_CLASS_H_
#define _DEVICE_CLASS_H_

#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/pm.h>
#include <linux/device/bus.h>

struct device;
struct fwnode_handle;

/**
 * struct class - device classes
 * @name:        Name of the class.
 * @owner:        The module owner.
 * @class_groups: Default attributes of this class.
 * @dev_groups:        Default attributes of the devices that belong to the class.
 * @dev_kobj:        The kobject that represents this class and links it into the hierarchy.
 * @dev_uevent:        Called when a device is added, removed from this class, or a
 *                few other things that generate uevents to add the environment
 *                variables.
 * @devnode:        Callback to provide the devtmpfs.
 * @class_release: Called to release this class.
 * @dev_release: Called to release the device.
 * @shutdown_pre: Called at shut-down time before driver shutdown.
 * @ns_type:        Callbacks so sysfs can determine namespaces.
 * @namespace:        Namespace of the device that belongs to this class.
 * @get_ownership: Allows class to specify uid/gid of the sysfs directories
 *                for the devices belonging to the class. Usually tied to
 *                device's namespace.
 * @pm:                The default device power management operations of this class.
 * @p:                The private data of the driver core, no one other than the
 *                driver core can touch this.
 *
 * A class is a higher-level view of a device that abstracts out low-level
 * implementation details. Drivers may see a SCSI disk or an ATA disk, but,
 * at the class level, they are all simply disks. Classes allow user space
 * to work with devices based on what they do, rather than how they are
 * connected or how they work.
 */
struct class {
        const char                *name;
        struct module                *owner;

        const struct attribute_group        **class_groups;
        const struct attribute_group        **dev_groups;
        struct kobject                        *dev_kobj;

        int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env);
        char *(*devnode)(struct device *dev, umode_t *mode);

        void (*class_release)(struct class *class);
        void (*dev_release)(struct device *dev);

        int (*shutdown_pre)(struct device *dev);

        const struct kobj_ns_type_operations *ns_type;
        const void *(*namespace)(struct device *dev);

        void (*get_ownership)(struct device *dev, kuid_t *uid, kgid_t *gid);

        const struct dev_pm_ops *pm;

        struct subsys_private *p;
};

/*
 * Iteration state handed to class_dev_iter_init(), class_dev_iter_next() and
 * class_dev_iter_exit().
 * @ki:   embedded klist iterator.
 * @type: device type given to class_dev_iter_init(); presumably used to
 *        restrict the walk to devices of that type - confirm in the
 *        implementation.
 */
struct class_dev_iter {
        struct klist_iter                ki;
        const struct device_type        *type;
};

/*
 * kobjects defined outside this header.
 * NOTE(review): presumably the sysfs parents for block and character device
 * number links - confirm against the driver core.
 */
extern struct kobject *sysfs_dev_block_kobj;
extern struct kobject *sysfs_dev_char_kobj;
/*
 * Low-level registration entry point; takes the lockdep class key explicitly.
 * The result is marked __must_check, so callers have to look at it.  Normally
 * reached through the class_register() wrapper below.
 */
extern int __must_check __class_register(struct class *class,
                                         struct lock_class_key *key);
extern void class_unregister(struct class *class);

/* This is a #define to keep the compiler from merging different
 * instances of the __key variable.  Every expansion declares its own static
 * lock_class_key and passes its address to __class_register(); the value of
 * the statement expression is that call's return value. */
#define class_register(class)                        \
({                                                \
        static struct lock_class_key __key;        \
        __class_register(class, &__key);        \
})

/*
 * Compatibility class interface.  struct class_compat is only forward
 * declared here, so it is opaque to users of this header: objects are
 * obtained from class_compat_register() by name and handed back to
 * class_compat_unregister().  The link helpers take the compat class, a
 * device and a second device to link with; the create variant reports its
 * result as an int.
 */
struct class_compat;
struct class_compat *class_compat_register(const char *name);
void class_compat_unregister(struct class_compat *cls);
int class_compat_create_link(struct class_compat *cls, struct device *dev,
                             struct device *device_link);
void class_compat_remove_link(struct class_compat *cls, struct device *dev,
                              struct device *device_link);

/*
 * Open-coded iteration over the devices of a class: initialise the iterator
 * (optionally with a start device and a device type), fetch devices one at a
 * time with class_dev_iter_next(), then finish with class_dev_iter_exit().
 */
extern void class_dev_iter_init(struct class_dev_iter *iter,
                                struct class *class,
                                struct device *start,
                                const struct device_type *type);
extern struct device *class_dev_iter_next(struct class_dev_iter *iter);
extern void class_dev_iter_exit(struct class_dev_iter *iter);

/*
 * Callback-driven helpers.  class_for_each_device() invokes @fn with each
 * device and the caller's @data; class_find_device() uses @match with the
 * caller's @data to select a device.  The class_find_device_by_*() inlines
 * below are thin wrappers that pass NULL as @start and a stock matcher.
 */
extern int class_for_each_device(struct class *class, struct device *start,
                                 void *data,
                                 int (*fn)(struct device *dev, void *data));
extern struct device *class_find_device(struct class *class,
                                        struct device *start, const void *data,
                                        int (*match)(struct device *, const void *));

/**
 * class_find_device_by_name - look up a device of a class by its name
 * @class: class to search
 * @name: device name to match
 *
 * Thin wrapper around class_find_device() that starts from the beginning
 * of the class (NULL start device) and uses device_match_name() as the
 * matcher.  Returns whatever class_find_device() returns.
 */
static inline struct device *class_find_device_by_name(struct class *class,
                                                       const char *name)
{
        struct device *found;

        found = class_find_device(class, NULL, name, device_match_name);
        return found;
}

/**
 * class_find_device_by_of_node - look up a device of a class by its of_node
 * @class: class to search
 * @np: of_node the device must match
 *
 * Thin wrapper around class_find_device() that starts from the beginning
 * of the class (NULL start device) and uses device_match_of_node() as the
 * matcher.  Returns whatever class_find_device() returns.
 */
static inline struct device *
class_find_device_by_of_node(struct class *class, const struct device_node *np)
{
        struct device *found;

        found = class_find_device(class, NULL, np, device_match_of_node);
        return found;
}

/**
 * class_find_device_by_fwnode - look up a device of a class by its fwnode
 * @class: class to search
 * @fwnode: firmware node handle the device must match
 *
 * Thin wrapper around class_find_device() that starts from the beginning
 * of the class (NULL start device) and uses device_match_fwnode() as the
 * matcher.  Returns whatever class_find_device() returns.
 */
static inline struct device *
class_find_device_by_fwnode(struct class *class,
                            const struct fwnode_handle *fwnode)
{
        struct device *found;

        found = class_find_device(class, NULL, fwnode, device_match_fwnode);
        return found;
}

/**
 * class_find_device_by_devt - look up a device of a class by its dev_t
 * @class: class to search
 * @devt: dev_t value the device must match
 *
 * Thin wrapper around class_find_device() that starts from the beginning
 * of the class (NULL start device) and uses device_match_devt() as the
 * matcher.  The matcher receives a pointer to the local copy of @devt.
 * Returns whatever class_find_device() returns.
 */
static inline struct device *class_find_device_by_devt(struct class *class,
                                                       dev_t devt)
{
        struct device *found;

        found = class_find_device(class, NULL, &devt, device_match_devt);
        return found;
}

#ifdef CONFIG_ACPI
struct acpi_device;
/**
 * class_find_device_by_acpi_dev - look up a device of a class by ACPI device
 * @class: class to search
 * @adev: ACPI device the class device must match
 *
 * Thin wrapper around class_find_device() that starts from the beginning
 * of the class (NULL start device) and uses device_match_acpi_dev() as the
 * matcher.  Returns whatever class_find_device() returns.
 */
static inline struct device *
class_find_device_by_acpi_dev(struct class *class, const struct acpi_device *adev)
{
        struct device *found;

        found = class_find_device(class, NULL, adev, device_match_acpi_dev);
        return found;
}
#else
/* Without CONFIG_ACPI nothing can match: the lookup always yields NULL. */
static inline struct device *
class_find_device_by_acpi_dev(struct class *class, const void *adev)
{
        return NULL;
}
#endif

/*
 * An attribute attached to a class.
 * @attr:  the generic attribute this wraps.
 * @show:  callback that fills @buf for the given class/attribute and returns
 *         an ssize_t result.
 * @store: callback that consumes @count bytes from @buf for the given
 *         class/attribute and returns an ssize_t result.
 */
struct class_attribute {
        struct attribute attr;
        ssize_t (*show)(struct class *class, struct class_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct class *class, struct class_attribute *attr,
                        const char *buf, size_t count);
};

/*
 * Define a struct class_attribute named class_attr_<_name>, initialised with
 * the matching __ATTR_RW / __ATTR_RO / __ATTR_WO initialiser.
 */
#define CLASS_ATTR_RW(_name) \
        struct class_attribute class_attr_##_name = __ATTR_RW(_name)
#define CLASS_ATTR_RO(_name) \
        struct class_attribute class_attr_##_name = __ATTR_RO(_name)
#define CLASS_ATTR_WO(_name) \
        struct class_attribute class_attr_##_name = __ATTR_WO(_name)

/*
 * Namespace-aware creation and removal of a class attribute file.  @ns is an
 * opaque namespace tag; the class_create_file() / class_remove_file() inlines
 * below pass NULL for it.  The create result is __must_check.
 */
extern int __must_check class_create_file_ns(struct class *class,
                                             const struct class_attribute *attr,
                                             const void *ns);
extern void class_remove_file_ns(struct class *class,
                                 const struct class_attribute *attr,
                                 const void *ns);

/*
 * Create the file for class attribute @attr without a namespace tag: forwards
 * to class_create_file_ns() with a NULL @ns and hands back its result, which
 * the caller must check.
 */
static inline int __must_check class_create_file(struct class *class,
                                        const struct class_attribute *attr)
{
        int err;

        err = class_create_file_ns(class, attr, NULL);
        return err;
}

/*
 * Remove the file for class attribute @attr without a namespace tag: forwards
 * to class_remove_file_ns() with a NULL @ns.  Nothing is returned.
 */
static inline void class_remove_file(struct class *class,
                                     const struct class_attribute *attr)
{
        /* plain call: a void function must not "return" an expression */
        class_remove_file_ns(class, attr, NULL);
}

/* Simple class attribute that is just a static string:
 * @attr is the embedded class attribute, @str the string it carries. */
struct class_attribute_string {
        struct class_attribute attr;
        char *str;
};

/* Currently read-only only: the initialiser wires show_class_attr_string()
 * as the show callback and leaves the store callback NULL.
 * _CLASS_ATTR_STRING() is the bare initialiser; CLASS_ATTR_STRING() defines
 * a variable named class_attr_<_name> from it. */
#define _CLASS_ATTR_STRING(_name, _mode, _str) \
        { __ATTR(_name, _mode, show_class_attr_string, NULL), _str }
#define CLASS_ATTR_STRING(_name, _mode, _str) \
        struct class_attribute_string class_attr_##_name = \
                _CLASS_ATTR_STRING(_name, _mode, _str)

/* show callback used by the string attribute macros above */
extern ssize_t show_class_attr_string(struct class *class, struct class_attribute *attr,
                        char *buf);

/*
 * A set of per-device callbacks tied to one class.
 * @node:       list linkage.
 * @class:      the class this interface refers to.
 * @add_dev:    callback taking a device and this interface; returns an int.
 * @remove_dev: callback taking a device and this interface; returns nothing.
 * NOTE(review): presumably add_dev/remove_dev run as devices join or leave
 * @class - confirm in the driver core.
 */
struct class_interface {
        struct list_head        node;
        struct class                *class;

        int (*add_dev)                (struct device *, struct class_interface *);
        void (*remove_dev)        (struct device *, struct class_interface *);
};

/* Registration result is __must_check. */
extern int __must_check class_interface_register(struct class_interface *);
extern void class_interface_unregister(struct class_interface *);

/*
 * Low-level class creation: takes the owning module, the class name and an
 * explicit lockdep class key, and returns a struct class pointer that the
 * caller must check.  Normally reached through the class_create() wrapper
 * below.  class_destroy() is the matching teardown for a class pointer.
 */
extern struct class * __must_check __class_create(struct module *owner,
                                                  const char *name,
                                                  struct lock_class_key *key);
extern void class_destroy(struct class *cls);

/* This is a #define to keep the compiler from merging different
 * instances of the __key variable.  Every expansion declares its own static
 * lock_class_key and passes its address to __class_create(); the value of
 * the statement expression is that call's return value. */
#define class_create(owner, name)                \
({                                                \
        static struct lock_class_key __key;        \
        __class_create(owner, name, &__key);        \
})


#endif        /* _DEVICE_CLASS_H_ */



















































































































    1 
    1 



























    1 
    1 
    1 








    1 
























    1 
    1 
    1 




    1 































































































































































































































































































































































































































































































































































































































































































    1 

    1 
    1 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NetLabel Domain Hash Table
 *
 * This file manages the domain hash table that NetLabel uses to determine
 * which network labeling protocol to use for a given domain.  The NetLabel
 * system manages static and dynamic label mappings for network protocols such
 * as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
 */

#include <linux/types.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <net/calipso.h>
#include <asm/bug.h>

#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"

/*
 * The domain hash table proper.
 * @tbl:  array of @size bucket list heads (allocated and initialised in
 *        netlbl_domhsh_init()).
 * @size: number of buckets; set to a power of two by netlbl_domhsh_init(),
 *        which lets netlbl_domhsh_hash() reduce a hash with "& (size - 1)".
 */
struct netlbl_domhsh_tbl {
        struct list_head *tbl;
        u32 size;
};

/* Domain hash table */
/* updates should be so rare that having one spinlock for the entire hash table
 * should be okay */
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
/* RCU dereference that lockdep also accepts when netlbl_domhsh_lock is held,
 * so the same accessor serves both RCU readers and lock-holding updaters */
#define netlbl_domhsh_rcu_deref(p) \
        rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
/* the table itself, published by netlbl_domhsh_init() */
static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh;
/* default (domain-less) mappings, one per address family; consulted by
 * netlbl_domhsh_search_def() when no exact domain match exists */
static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4;
static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6;

/*
 * Domain Hash Table Helper Functions
 */

/**
 * netlbl_domhsh_free_entry - Frees a domain hash table entry
 * @entry: the entry's RCU field
 *
 * Description:
 * RCU callback, meant to be passed to call_rcu(), that releases the memory
 * of the mapping embedding @entry.  For address selector mappings every IPv4
 * selector (and every IPv6 selector when IPv6 is enabled) is unlinked and
 * freed, followed by the selector container; the domain string and the
 * mapping itself are freed in all cases.
 *
 */
static void netlbl_domhsh_free_entry(struct rcu_head *entry)
{
        struct netlbl_dom_map *map;
        struct netlbl_af4list *cur4;
        struct netlbl_af4list *next4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *cur6;
        struct netlbl_af6list *next6;
#endif /* IPv6 */

        map = container_of(entry, struct netlbl_dom_map, rcu);
        if (map->def.type != NETLBL_NLTYPE_ADDRSELECT)
                goto free_map;

        netlbl_af4list_foreach_safe(cur4, next4, &map->def.addrsel->list4) {
                netlbl_af4list_remove_entry(cur4);
                kfree(netlbl_domhsh_addr4_entry(cur4));
        }
#if IS_ENABLED(CONFIG_IPV6)
        netlbl_af6list_foreach_safe(cur6, next6, &map->def.addrsel->list6) {
                netlbl_af6list_remove_entry(cur6);
                kfree(netlbl_domhsh_addr6_entry(cur6));
        }
#endif /* IPv6 */
        kfree(map->def.addrsel);

free_map:
        kfree(map->domain);
        kfree(map);
}

/**
 * netlbl_domhsh_hash - Hashing function for the domain hash table
 * @key: the domain name to hash
 *
 * Description:
 * This is the hashing function for the domain hash table, it returns the
 * correct bucket number for the domain.  The caller is responsible for
 * ensuring that the hash table is protected with either a RCU read lock or the
 * hash table lock.
 *
 */
static u32 netlbl_domhsh_hash(const char *key)
{
        u32 iter;
        u32 val;
        u32 len;

        /* This is taken (with slight modification) from
         * security/selinux/ss/symtab.c:symhash() */

        for (iter = 0, val = 0, len = strlen(key); iter < len; iter++)
                val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter];
        return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
}

/* Two address families match when they are equal or when either one is the
 * AF_UNSPEC wildcard. */
static bool netlbl_family_match(u16 f1, u16 f2)
{
        if (f1 == AF_UNSPEC || f2 == AF_UNSPEC)
                return true;

        return f1 == f2;
}

/**
 * netlbl_domhsh_search - Search for a domain entry
 * @domain: the domain
 * @family: the address family
 *
 * Description:
 * Walks the hash bucket for @domain and returns the first valid entry whose
 * address family matches @family and whose domain string equals @domain;
 * returns NULL when there is no such entry or when @domain is NULL.  @family
 * may be %AF_UNSPEC which matches any address family entries.  The caller is
 * responsible for ensuring that the hash table is protected with either a
 * RCU read lock or the hash table lock.
 *
 */
static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
                                                   u16 family)
{
        struct netlbl_dom_map *map;
        struct list_head *head;
        u32 idx;

        if (domain == NULL)
                return NULL;

        idx = netlbl_domhsh_hash(domain);
        head = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[idx];
        list_for_each_entry_rcu(map, head, list,
                                lockdep_is_held(&netlbl_domhsh_lock)) {
                if (!map->valid)
                        continue;
                if (!netlbl_family_match(map->family, family))
                        continue;
                if (strcmp(map->domain, domain) == 0)
                        return map;
        }

        return NULL;
}

/**
 * netlbl_domhsh_search_def - Search for a domain entry
 * @domain: the domain
 * @family: the address family
 *
 * Description:
 * Returns the exact match for @domain and @family when the hash table has
 * one.  Otherwise falls back to the default entries: the IPv4 default is
 * tried for %AF_INET and %AF_UNSPEC, then the IPv6 default for %AF_INET6 and
 * %AF_UNSPEC, and the first one that exists and is valid is returned.  NULL
 * is returned when nothing qualifies.  The caller is responsible for
 * ensuring that the hash table is protected with either a RCU read lock or
 * the hash table lock.
 *
 */
static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain,
                                                       u16 family)
{
        struct netlbl_dom_map *map;

        map = netlbl_domhsh_search(domain, family);
        if (map)
                return map;

        /* no exact match: consult the per-family defaults, IPv4 first */
        if (family == AF_INET || family == AF_UNSPEC) {
                map = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4);
                if (map && map->valid)
                        return map;
        }
        if (family == AF_INET6 || family == AF_UNSPEC) {
                map = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6);
                if (map && map->valid)
                        return map;
        }

        return NULL;
}

/**
 * netlbl_domhsh_audit_add - Generate an audit entry for an add event
 * @entry: the entry being added
 * @addr4: the IPv4 address information
 * @addr6: the IPv6 address information
 * @result: the result code
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Emits an AUDIT_MAC_MAP_ADD record for a new NetLabel/LSM mapping.  The
 * record names the domain ("(default)" when @entry has none), the address
 * selector when @addr4 or (with IPv6 enabled) @addr6 is given, the protocol
 * with its DOI where applicable, and "res=1" when @result is zero, "res=0"
 * otherwise.  Nothing is logged if no audit buffer could be started.  Caller
 * is responsible for holding the necessary locks.
 *
 */
static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
                                    struct netlbl_af4list *addr4,
                                    struct netlbl_af6list *addr6,
                                    int result,
                                    struct netlbl_audit *audit_info)
{
        struct audit_buffer *ab;
        struct cipso_v4_doi *doi4 = NULL;
        struct calipso_doi *doi6 = NULL;
        const char *domain;
        u32 type;

        ab = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
        if (ab == NULL)
                return;

        domain = entry->domain ? entry->domain : "(default)";
        audit_log_format(ab, " nlbl_domain=%s", domain);

        /* take protocol details from the selector if one was supplied,
         * otherwise from the mapping itself */
        if (addr4 != NULL) {
                struct netlbl_domaddr4_map *sel4;

                sel4 = netlbl_domhsh_addr4_entry(addr4);
                type = sel4->def.type;
                doi4 = sel4->def.cipso;
                netlbl_af4list_audit_addr(ab, 0, NULL,
                                          addr4->addr, addr4->mask);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (addr6 != NULL) {
                struct netlbl_domaddr6_map *sel6;

                sel6 = netlbl_domhsh_addr6_entry(addr6);
                type = sel6->def.type;
                doi6 = sel6->def.calipso;
                netlbl_af6list_audit_addr(ab, 0, NULL,
                                          &addr6->addr, &addr6->mask);
#endif /* IPv6 */
        } else {
                type = entry->def.type;
                doi4 = entry->def.cipso;
                doi6 = entry->def.calipso;
        }

        switch (type) {
        case NETLBL_NLTYPE_UNLABELED:
                audit_log_format(ab, " nlbl_protocol=unlbl");
                break;
        case NETLBL_NLTYPE_CIPSOV4:
                BUG_ON(doi4 == NULL);
                audit_log_format(ab,
                                 " nlbl_protocol=cipsov4 cipso_doi=%u",
                                 doi4->doi);
                break;
        case NETLBL_NLTYPE_CALIPSO:
                BUG_ON(doi6 == NULL);
                audit_log_format(ab,
                                 " nlbl_protocol=calipso calipso_doi=%u",
                                 doi6->doi);
                break;
        }
        audit_log_format(ab, " res=%u", result == 0 ? 1 : 0);
        audit_log_end(ab);
}

/**
 * netlbl_domhsh_validate - Validate a new domain mapping entry
 * @entry: the entry to validate
 *
 * Checks that @entry is internally consistent: the family must be AF_INET or
 * AF_INET6 (AF_UNSPEC is accepted for unlabeled mappings only), the protocol
 * pointers must agree with the mapping type, and for address selector
 * mappings every selector must be either unlabeled or carry the protocol of
 * its own address family.  Returns zero on success, negative values on
 * failure.
 *
 */
static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
{
        struct netlbl_af4list *cur4;
        struct netlbl_domaddr4_map *sel4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *cur6;
        struct netlbl_domaddr6_map *sel6;
#endif /* IPv6 */

        if (entry == NULL)
                return -EINVAL;

        switch (entry->family) {
        case AF_INET:
        case AF_INET6:
                break;
        case AF_UNSPEC:
                /* the wildcard family is only meaningful without labels */
                if (entry->def.type != NETLBL_NLTYPE_UNLABELED)
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        switch (entry->def.type) {
        case NETLBL_NLTYPE_UNLABELED:
                if (entry->def.cipso != NULL || entry->def.calipso != NULL ||
                    entry->def.addrsel != NULL)
                        return -EINVAL;
                break;
        case NETLBL_NLTYPE_CIPSOV4:
                if (!(entry->family == AF_INET && entry->def.cipso != NULL))
                        return -EINVAL;
                break;
        case NETLBL_NLTYPE_CALIPSO:
                if (!(entry->family == AF_INET6 && entry->def.calipso != NULL))
                        return -EINVAL;
                break;
        case NETLBL_NLTYPE_ADDRSELECT:
                netlbl_af4list_foreach(cur4, &entry->def.addrsel->list4) {
                        sel4 = netlbl_domhsh_addr4_entry(cur4);
                        if (sel4->def.type == NETLBL_NLTYPE_UNLABELED) {
                                if (sel4->def.cipso != NULL)
                                        return -EINVAL;
                        } else if (sel4->def.type == NETLBL_NLTYPE_CIPSOV4) {
                                if (sel4->def.cipso == NULL)
                                        return -EINVAL;
                        } else {
                                return -EINVAL;
                        }
                }
#if IS_ENABLED(CONFIG_IPV6)
                netlbl_af6list_foreach(cur6, &entry->def.addrsel->list6) {
                        sel6 = netlbl_domhsh_addr6_entry(cur6);
                        if (sel6->def.type == NETLBL_NLTYPE_UNLABELED) {
                                if (sel6->def.calipso != NULL)
                                        return -EINVAL;
                        } else if (sel6->def.type == NETLBL_NLTYPE_CALIPSO) {
                                if (sel6->def.calipso == NULL)
                                        return -EINVAL;
                        } else {
                                return -EINVAL;
                        }
                }
#endif /* IPv6 */
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Domain Hash Table Functions
 */

/**
 * netlbl_domhsh_init - Init for the domain hash
 * @size: the number of bits to use for the hash buckets
 *
 * Description:
 * Initializes the domain hash table, should be called only by
 * netlbl_user_init() during initialization.  The table gets 2^@size buckets.
 * Returns zero on success, -EINVAL if @size is zero or too large for the
 * 32-bit bucket count, and -ENOMEM if an allocation fails.
 *
 */
int __init netlbl_domhsh_init(u32 size)
{
        u32 iter;
        struct netlbl_domhsh_tbl *hsh_tbl;

        /* shifting by the full width (or more) of the bucket count is
         * undefined, so refuse bit counts that cannot be represented */
        if (size == 0 || size >= 8 * sizeof(hsh_tbl->size))
                return -EINVAL;

        hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
        if (hsh_tbl == NULL)
                return -ENOMEM;
        /* unsigned shift: 1 << 31 would overflow a signed int */
        hsh_tbl->size = 1U << size;
        hsh_tbl->tbl = kcalloc(hsh_tbl->size,
                               sizeof(*hsh_tbl->tbl),
                               GFP_KERNEL);
        if (hsh_tbl->tbl == NULL) {
                kfree(hsh_tbl);
                return -ENOMEM;
        }
        for (iter = 0; iter < hsh_tbl->size; iter++)
                INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);

        /* publish the fully initialised table */
        spin_lock(&netlbl_domhsh_lock);
        rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
        spin_unlock(&netlbl_domhsh_lock);

        return 0;
}

/**
 * netlbl_domhsh_add - Adds an entry to the domain hash table
 * @entry: the entry to add
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new entry to the domain hash table and handles any updates to the
 * lower level protocol handler (i.e. CIPSO).  @entry->family may be set to
 * %AF_UNSPEC which will add an entry that matches all address families.  This
 * is only useful for the unlabelled type and will only succeed if there is no
 * existing entry for any address family with the same domain.  Returns zero
 * on success, negative on failure.
 *
 */
int netlbl_domhsh_add(struct netlbl_dom_map *entry,
                      struct netlbl_audit *audit_info)
{
        int ret_val = 0;
        struct netlbl_dom_map *entry_old, *entry_b;
        struct netlbl_af4list *iter4;
        struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *iter6;
        struct netlbl_af6list *tmp6;
#endif /* IPv6 */

        ret_val = netlbl_domhsh_validate(entry);
        if (ret_val != 0)
                return ret_val;

        /* XXX - we can remove this RCU read lock as the spinlock protects the
         *       entire function, but before we do we need to fixup the
         *       netlbl_af[4,6]list RCU functions to do "the right thing" with
         *       respect to rcu_dereference() when only a spinlock is held. */
        rcu_read_lock();
        spin_lock(&netlbl_domhsh_lock);
        if (entry->domain != NULL)
                entry_old = netlbl_domhsh_search(entry->domain, entry->family);
        else
                entry_old = netlbl_domhsh_search_def(entry->domain,
                                                     entry->family);
        if (entry_old == NULL) {
                entry->valid = 1;

                if (entry->domain != NULL) {
                        u32 bkt = netlbl_domhsh_hash(entry->domain);
                        list_add_tail_rcu(&entry->list,
                                    &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
                } else {
                        INIT_LIST_HEAD(&entry->list);
                        switch (entry->family) {
                        case AF_INET:
                                rcu_assign_pointer(netlbl_domhsh_def_ipv4,
                                                   entry);
                                break;
                        case AF_INET6:
                                rcu_assign_pointer(netlbl_domhsh_def_ipv6,
                                                   entry);
                                break;
                        case AF_UNSPEC:
                                if (entry->def.type !=
                                    NETLBL_NLTYPE_UNLABELED) {
                                        ret_val = -EINVAL;
                                        goto add_return;
                                }
                                entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC);
                                if (entry_b == NULL) {
                                        ret_val = -ENOMEM;
                                        goto add_return;
                                }
                                entry_b->family = AF_INET6;
                                entry_b->def.type = NETLBL_NLTYPE_UNLABELED;
                                entry_b->valid = 1;
                                entry->family = AF_INET;
                                rcu_assign_pointer(netlbl_domhsh_def_ipv4,
                                                   entry);
                                rcu_assign_pointer(netlbl_domhsh_def_ipv6,
                                                   entry_b);
                                break;
                        default:
                                /* Already checked in
                                 * netlbl_domhsh_validate(). */
                                ret_val = -EINVAL;
                                goto add_return;
                        }
                }

                if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
                        netlbl_af4list_foreach_rcu(iter4,
                                                   &entry->def.addrsel->list4)
                                netlbl_domhsh_audit_add(entry, iter4, NULL,
                                                        ret_val, audit_info);
#if IS_ENABLED(CONFIG_IPV6)
                        netlbl_af6list_foreach_rcu(iter6,
                                                   &entry->def.addrsel->list6)
                                netlbl_domhsh_audit_add(entry, NULL, iter6,
                                                        ret_val, audit_info);
#endif /* IPv6 */
                } else
                        netlbl_domhsh_audit_add(entry, NULL, NULL,
                                                ret_val, audit_info);
        } else if (entry_old->def.type == NETLBL_NLTYPE_ADDRSELECT &&
                   entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
                struct list_head *old_list4;
                struct list_head *old_list6;

                old_list4 = &entry_old->def.addrsel->list4;
                old_list6 = &entry_old->def.addrsel->list6;

                /* we only allow the addition of address selectors if all of
                 * the selectors do not exist in the existing domain map */
                netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4)
                        if (netlbl_af4list_search_exact(iter4->addr,
                                                        iter4->mask,
                                                        old_list4)) {
                                ret_val = -EEXIST;
                                goto add_return;
                        }
#if IS_ENABLED(CONFIG_IPV6)
                netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6)
                        if (netlbl_af6list_search_exact(&iter6->addr,
                                                        &iter6->mask,
                                                        old_list6)) {
                                ret_val = -EEXIST;
                                goto add_return;
                        }
#endif /* IPv6 */

                netlbl_af4list_foreach_safe(iter4, tmp4,
                                            &entry->def.addrsel->list4) {
                        netlbl_af4list_remove_entry(iter4);
                        iter4->valid = 1;
                        ret_val = netlbl_af4list_add(iter4, old_list4);
                        netlbl_domhsh_audit_add(entry_old, iter4, NULL,
                                                ret_val, audit_info);
                        if (ret_val != 0)
                                goto add_return;
                }
#if IS_ENABLED(CONFIG_IPV6)
                netlbl_af6list_foreach_safe(iter6, tmp6,
                                            &entry->def.addrsel->list6) {
                        netlbl_af6list_remove_entry(iter6);
                        iter6->valid = 1;
                        ret_val = netlbl_af6list_add(iter6, old_list6);
                        netlbl_domhsh_audit_add(entry_old, NULL, iter6,
                                                ret_val, audit_info);
                        if (ret_val != 0)
                                goto add_return;
                }
#endif /* IPv6 */
                /* cleanup the new entry since we've moved everything over */
                netlbl_domhsh_free_entry(&entry->rcu);
        } else
                ret_val = -EINVAL;

add_return:
        spin_unlock(&netlbl_domhsh_lock);
        rcu_read_unlock();
        return ret_val;
}

/**
 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
 * @entry: the entry to add
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new default entry to the domain hash table and handles any updates
 * to the lower level protocol handler (i.e. CIPSO).  This is a plain
 * forwarder: @entry and @audit_info are handed unchanged to
 * netlbl_domhsh_add() and its return value is passed back.  Returns zero on
 * success, negative on failure.
 *
 */
int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
                              struct netlbl_audit *audit_info)
{
        /* no extra processing here; netlbl_domhsh_add() does all the work */
        return netlbl_domhsh_add(entry, audit_info);
}

/**
 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
 * @entry: the entry to remove
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an entry from the domain hash table and handles any updates to the
 * lower level protocol handler (i.e. CIPSO).  Caller is responsible for
 * ensuring that the RCU read lock is held.  Returns zero on success, negative
 * on failure.  A NULL @entry, or an entry whose valid flag is already clear,
 * yields -ENOENT and nothing else is modified.  On success the entry itself
 * is released later through call_rcu() with netlbl_domhsh_free_entry().
 *
 */
int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
                               struct netlbl_audit *audit_info)
{
        int ret_val = 0;
        struct audit_buffer *audit_buf;
        struct netlbl_af4list *iter4;
        struct netlbl_domaddr4_map *map4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *iter6;
        struct netlbl_domaddr6_map *map6;
#endif /* IPv6 */

        if (entry == NULL)
                return -ENOENT;

        /* the valid flag is tested and cleared under the hash table lock so
         * that only one caller gets to unlink a given entry */
        spin_lock(&netlbl_domhsh_lock);
        if (entry->valid) {
                entry->valid = 0;
                /* a default entry is detached by clearing the matching
                 * default pointer, any other entry is unlinked from the
                 * list it sits on */
                if (entry == rcu_dereference(netlbl_domhsh_def_ipv4))
                        RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL);
                else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6))
                        RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL);
                else
                        list_del_rcu(&entry->list);
        } else
                ret_val = -ENOENT;
        spin_unlock(&netlbl_domhsh_lock);

        if (ret_val)
                return ret_val;

        /* audit the removal; a NULL domain is logged as "(default)" */
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
        if (audit_buf != NULL) {
                audit_log_format(audit_buf,
                                 " nlbl_domain=%s res=1",
                                 entry->domain ? entry->domain : "(default)");
                audit_log_end(audit_buf);
        }

        /* call the protocol "putdef" helper for every DOI definition the
         * entry refers to: once per address selector, or once for a plain
         * CIPSO/CALIPSO mapping; other mapping types need nothing here
         * NOTE(review): the putdef helpers presumably drop a reference taken
         * when the mapping was added - confirm against the CIPSO/CALIPSO
         * code */
        switch (entry->def.type) {
        case NETLBL_NLTYPE_ADDRSELECT:
                netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) {
                        map4 = netlbl_domhsh_addr4_entry(iter4);
                        cipso_v4_doi_putdef(map4->def.cipso);
                }
#if IS_ENABLED(CONFIG_IPV6)
                netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) {
                        map6 = netlbl_domhsh_addr6_entry(iter6);
                        calipso_doi_putdef(map6->def.calipso);
                }
#endif /* IPv6 */
                break;
        case NETLBL_NLTYPE_CIPSOV4:
                cipso_v4_doi_putdef(entry->def.cipso);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case NETLBL_NLTYPE_CALIPSO:
                calipso_doi_putdef(entry->def.calipso);
                break;
#endif /* IPv6 */
        }
        /* defer the actual free until after an RCU grace period */
        call_rcu(&entry->rcu, netlbl_domhsh_free_entry);

        return ret_val;
}

/**
 * netlbl_domhsh_remove_af4 - Removes an address selector entry
 * @domain: the domain
 * @addr: IPv4 address
 * @mask: IPv4 address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an individual address selector from a domain mapping and potentially
 * the entire mapping if it is empty.  A NULL @domain selects the default
 * mapping.  Returns zero on success and -ENOENT when no address selector
 * mapping exists for @domain or when @addr/@mask is not found in it.
 *
 */
int netlbl_domhsh_remove_af4(const char *domain,
                             const struct in_addr *addr,
                             const struct in_addr *mask,
                             struct netlbl_audit *audit_info)
{
        struct netlbl_dom_map *entry_map;
        struct netlbl_af4list *entry_addr;
        struct netlbl_af4list *iter4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *iter6;
#endif /* IPv6 */
        struct netlbl_domaddr4_map *entry;

        rcu_read_lock();

        if (domain)
                entry_map = netlbl_domhsh_search(domain, AF_INET);
        else
                entry_map = netlbl_domhsh_search_def(domain, AF_INET);
        /* only address selector mappings carry per-address entries */
        if (entry_map == NULL ||
            entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
                goto remove_af4_failure;

        /* the list is modified under the hash table lock */
        spin_lock(&netlbl_domhsh_lock);
        entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
                                           &entry_map->def.addrsel->list4);
        spin_unlock(&netlbl_domhsh_lock);

        if (entry_addr == NULL)
                goto remove_af4_failure;
        /* emptiness test: each loop body is a bare goto, so the jump is taken
         * as soon as either list yields one entry and the mapping is kept */
        netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
                goto remove_af4_single_addr;
#if IS_ENABLED(CONFIG_IPV6)
        netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
                goto remove_af4_single_addr;
#endif /* IPv6 */
        /* the domain mapping is empty so remove it from the mapping table */
        /* the return value is not checked here */
        netlbl_domhsh_remove_entry(entry_map, audit_info);

remove_af4_single_addr:
        rcu_read_unlock();
        /* yick, we can't use call_rcu here because we don't have a rcu head
         * pointer but hopefully this should be a rare case so the pause
         * shouldn't be a problem */
        synchronize_rcu();
        /* once synchronize_rcu() has returned, the removed address entry is
         * handed to cipso_v4_doi_putdef() and freed */
        entry = netlbl_domhsh_addr4_entry(entry_addr);
        cipso_v4_doi_putdef(entry->def.cipso);
        kfree(entry);
        return 0;

remove_af4_failure:
        rcu_read_unlock();
        return -ENOENT;
}

#if IS_ENABLED(CONFIG_IPV6)
/**
 * netlbl_domhsh_remove_af6 - Removes an address selector entry
 * @domain: the domain
 * @addr: IPv6 address
 * @mask: IPv6 address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an individual address selector from a domain mapping and potentially
 * the entire mapping if it is empty.  A NULL @domain selects the default
 * mapping.  Returns zero on success and -ENOENT when no address selector
 * mapping exists for @domain or when @addr/@mask is not found in it.
 *
 */
int netlbl_domhsh_remove_af6(const char *domain,
                             const struct in6_addr *addr,
                             const struct in6_addr *mask,
                             struct netlbl_audit *audit_info)
{
        struct netlbl_dom_map *entry_map;
        struct netlbl_af6list *entry_addr;
        struct netlbl_af4list *iter4;
        struct netlbl_af6list *iter6;
        struct netlbl_domaddr6_map *entry;

        rcu_read_lock();

        if (domain)
                entry_map = netlbl_domhsh_search(domain, AF_INET6);
        else
                entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
        /* only address selector mappings carry per-address entries */
        if (entry_map == NULL ||
            entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
                goto remove_af6_failure;

        /* the list is modified under the hash table lock */
        spin_lock(&netlbl_domhsh_lock);
        entry_addr = netlbl_af6list_remove(addr, mask,
                                           &entry_map->def.addrsel->list6);
        spin_unlock(&netlbl_domhsh_lock);

        if (entry_addr == NULL)
                goto remove_af6_failure;
        /* emptiness test: each loop body is a bare goto, so the jump is taken
         * as soon as either list yields one entry and the mapping is kept */
        netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
                goto remove_af6_single_addr;
        netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
                goto remove_af6_single_addr;
        /* the domain mapping is empty so remove it from the mapping table */
        /* the return value is not checked here */
        netlbl_domhsh_remove_entry(entry_map, audit_info);

remove_af6_single_addr:
        rcu_read_unlock();
        /* yick, we can't use call_rcu here because we don't have a rcu head
         * pointer but hopefully this should be a rare case so the pause
         * shouldn't be a problem */
        synchronize_rcu();
        /* once synchronize_rcu() has returned, the removed address entry is
         * handed to calipso_doi_putdef() and freed */
        entry = netlbl_domhsh_addr6_entry(entry_addr);
        calipso_doi_putdef(entry->def.calipso);
        kfree(entry);
        return 0;

remove_af6_failure:
        rcu_read_unlock();
        return -ENOENT;
}
#endif /* IPv6 */

/**
 * netlbl_domhsh_remove - Removes an entry from the domain hash table
 * @domain: the domain to remove, NULL selects the default mapping
 * @family: address family
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Looks up the mapping for @domain and hands it to
 * netlbl_domhsh_remove_entry(), first for %AF_INET and then for %AF_INET6;
 * @family may be %AF_UNSPEC to process both.  The whole operation runs
 * inside an RCU read side critical section.  Returns zero on success,
 * negative on failure; any other @family value returns -EINVAL.  For
 * %AF_UNSPEC a missing entry in one family is not an error as long as the
 * other family was removed, while a failure other than -ENOENT in the IPv4
 * pass skips the IPv6 pass.
 *
 * NOTE(review): with @family == %AF_INET6 and no matching entry the result is
 * the initial -EINVAL rather than -ENOENT, because an -ENOENT from the IPv6
 * pass never replaces the current return value.
 *
 */
int netlbl_domhsh_remove(const char *domain, u16 family,
                         struct netlbl_audit *audit_info)
{
        struct netlbl_dom_map *map;
        int rc = -EINVAL;

        rcu_read_lock();

        if (family == AF_UNSPEC || family == AF_INET) {
                if (!domain)
                        map = netlbl_domhsh_search_def(domain, AF_INET);
                else
                        map = netlbl_domhsh_search(domain, AF_INET);
                rc = netlbl_domhsh_remove_entry(map, audit_info);
                /* anything but success or "not found" ends the operation */
                if (rc != 0 && rc != -ENOENT)
                        goto remove_return;
        }
        if (family == AF_UNSPEC || family == AF_INET6) {
                int rc6;

                if (!domain)
                        map = netlbl_domhsh_search_def(domain, AF_INET6);
                else
                        map = netlbl_domhsh_search(domain, AF_INET6);
                rc6 = netlbl_domhsh_remove_entry(map, audit_info);
                /* "not found" here must not hide the IPv4 result */
                if (rc6 != -ENOENT)
                        rc = rc6;
        }

remove_return:
        rcu_read_unlock();

        return rc;
}

/**
 * netlbl_domhsh_remove_default - Removes the default entry from the table
 * @family: address family
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes/resets the default entry corresponding to @family from the domain
 * hash table and handles any updates to the lower level protocol handler
 * (i.e. CIPSO).  @family may be %AF_UNSPEC which removes all address family
 * entries.  Implemented as netlbl_domhsh_remove() with a NULL domain, so the
 * return value is whatever that function returns.  Returns zero on success,
 * negative on failure.
 *
 */
int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info)
{
        /* a NULL domain is how netlbl_domhsh_remove() selects the default */
        return netlbl_domhsh_remove(NULL, family, audit_info);
}

/**
 * netlbl_domhsh_getentry - Get an entry from the domain hash table
 * @domain: the domain name to search for
 * @family: address family
 *
 * Description:
 * Look through the domain hash table searching for an entry to match @domain
 * with address family @family.  The lookup is done by
 * netlbl_domhsh_search_def() and its result is returned as is; no copy of
 * the entry is made here.  %AF_UNSPEC is not a valid lookup family and
 * always returns NULL.  The caller is responsible for ensuring that
 * rcu_read_[un]lock() is called.
 *
 */
struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family)
{
        struct netlbl_dom_map *found = NULL;

        if (family != AF_UNSPEC)
                found = netlbl_domhsh_search_def(domain, family);

        return found;
}

/**
 * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
 * @domain: the domain name to search for
 * @addr: the IP address to search for
 *
 * Description:
 * Look through the domain hash table searching for an entry to match @domain
 * and @addr.  For a mapping that is not an address selector the mapping's
 * own definition is returned; for an address selector the IPv4 address list
 * is searched for @addr and the definition of the matching address entry is
 * returned.  The returned pointer refers to the definition stored in the
 * table, not to a copy.  Returns NULL if no mapping or no address matches.
 * The caller is responsible for ensuring that rcu_read_[un]lock() is called.
 *
 */
struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
                                                     __be32 addr)
{
        struct netlbl_dom_map *map;
        struct netlbl_af4list *match;

        map = netlbl_domhsh_search_def(domain, AF_INET);
        if (!map)
                return NULL;

        if (map->def.type == NETLBL_NLTYPE_ADDRSELECT) {
                /* per-address definitions, pick the one covering @addr */
                match = netlbl_af4list_search(addr, &map->def.addrsel->list4);
                if (!match)
                        return NULL;
                return &(netlbl_domhsh_addr4_entry(match)->def);
        }

        return &map->def;
}

#if IS_ENABLED(CONFIG_IPV6)
/**
 * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
 * @domain: the domain name to search for
 * @addr: the IP address to search for
 *
 * Description:
 * Look through the domain hash table searching for an entry to match @domain
 * and @addr.  For a mapping that is not an address selector the mapping's
 * own definition is returned; for an address selector the IPv6 address list
 * is searched for @addr and the definition of the matching address entry is
 * returned.  The returned pointer refers to the definition stored in the
 * table, not to a copy.  Returns NULL if no mapping or no address matches.
 * The caller is responsible for ensuring that rcu_read_[un]lock() is called.
 *
 */
struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
                                                   const struct in6_addr *addr)
{
        struct netlbl_dom_map *map;
        struct netlbl_af6list *match;

        map = netlbl_domhsh_search_def(domain, AF_INET6);
        if (!map)
                return NULL;

        if (map->def.type == NETLBL_NLTYPE_ADDRSELECT) {
                /* per-address definitions, pick the one covering @addr */
                match = netlbl_af6list_search(addr, &map->def.addrsel->list6);
                if (!match)
                        return NULL;
                return &(netlbl_domhsh_addr6_entry(match)->def);
        }

        return &map->def;
}
#endif /* IPv6 */

/**
 * netlbl_domhsh_walk - Iterate through the domain mapping hash table
 * @skip_bkt: the number of buckets to skip at the start
 * @skip_chain: the number of entries to skip in the first iterated bucket
 * @callback: callback for each entry
 * @cb_arg: argument for the callback function
 *
 * Description:
 * Iterate over the domain mapping hash table, skipping the first @skip_bkt
 * buckets and, in the first bucket visited only, the first @skip_chain valid
 * entries.  For each remaining valid entry call @callback, if @callback
 * returns a negative value stop 'walking' through the table and return.
 * Updates the values in @skip_bkt and @skip_chain on return so that a later
 * call resumes at the entry whose callback failed.  Returns the value of the
 * last @callback invocation, or -ENOENT if @callback was never invoked.
 *
 */
int netlbl_domhsh_walk(u32 *skip_bkt,
                     u32 *skip_chain,
                     int (*callback) (struct netlbl_dom_map *entry, void *arg),
                     void *cb_arg)
{
        int ret_val = -ENOENT;
        u32 iter_bkt;
        struct list_head *iter_list;
        struct netlbl_dom_map *iter_entry;
        u32 chain_cnt = 0;
        /* the chain offset only applies to the bucket the walk starts in;
         * it is cleared when moving on so that later buckets are walked
         * from their first entry */
        u32 chain_skip = *skip_chain;

        rcu_read_lock();
        for (iter_bkt = *skip_bkt;
             iter_bkt < rcu_dereference(netlbl_domhsh)->size;
             iter_bkt++, chain_cnt = 0, chain_skip = 0) {
                iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
                list_for_each_entry_rcu(iter_entry, iter_list, list)
                        if (iter_entry->valid) {
                                /* only valid entries count towards the
                                 * chain position */
                                if (chain_cnt++ < chain_skip)
                                        continue;
                                ret_val = callback(iter_entry, cb_arg);
                                if (ret_val < 0) {
                                        /* step back so that a resumed walk
                                         * visits this entry again */
                                        chain_cnt--;
                                        goto walk_return;
                                }
                        }
        }

walk_return:
        rcu_read_unlock();
        *skip_bkt = iter_bkt;
        *skip_chain = chain_cnt;
        return ret_val;
}








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
/*
 * Copyright 1997 Transmeta Corporation - All Rights Reserved
 * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
 * Copyright 2005-2006,2013,2017-2018 Ian Kent <raven@themaw.net>
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

#ifndef _UAPI_LINUX_AUTO_FS_H
#define _UAPI_LINUX_AUTO_FS_H

#include <linux/types.h>
#include <linux/limits.h>
#ifndef __KERNEL__
#include <sys/ioctl.h>
#endif /* __KERNEL__ */

/* Protocol version described by this header. */
#define AUTOFS_PROTO_VERSION                5
/*
 * NOTE(review): presumably the oldest and newest protocol versions that are
 * still accepted - confirm against the autofs mount option handling.
 */
#define AUTOFS_MIN_PROTO_VERSION        3
#define AUTOFS_MAX_PROTO_VERSION        5

/* NOTE(review): presumably the minor revision within version 5 - confirm. */
#define AUTOFS_PROTO_SUBVERSION                5

/*
 * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
 * back to the kernel via ioctl from userspace. On architectures where 32- and
 * 64-bit userspace binaries can be executed it's important that the size of
 * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we
 * do not break the binary ABI interface by changing the structure size.
 */
#if defined(__ia64__) || defined(__alpha__) /* pure 64bit architectures */
/* no 32-bit userspace to stay compatible with, so unsigned long is used */
typedef unsigned long autofs_wqt_t;
#else
/* unsigned int on every other architecture */
typedef unsigned int autofs_wqt_t;
#endif

/* Packet types (values carried in autofs_packet_hdr.type) */
#define autofs_ptype_missing        0        /* Missing entry (mount request) */
#define autofs_ptype_expire        1        /* Expire entry (umount request) */

/* Common header placed at the start of every autofs packet structure. */
struct autofs_packet_hdr {
        int proto_version;                /* Protocol version */
        int type;                        /* Type of packet */
};

/*
 * Packet for a missing entry (mount request).  The layout is part of the
 * userspace ABI and must not change.
 */
struct autofs_packet_missing {
        struct autofs_packet_hdr hdr;        /* common packet header */
        autofs_wqt_t wait_queue_token;        /* token passed back via ioctl */
        int len;        /* NOTE(review): presumably the length of name - confirm */
        char name[NAME_MAX+1];        /* room for NAME_MAX bytes plus a NUL */
};        

/*
 * v3 expire (via ioctl): this is the argument type of AUTOFS_IOC_EXPIRE.
 * Unlike the other packets it carries no wait queue token.
 */
struct autofs_packet_expire {
        struct autofs_packet_hdr hdr;        /* common packet header */
        int len;        /* NOTE(review): presumably the length of name - confirm */
        char name[NAME_MAX+1];        /* room for NAME_MAX bytes plus a NUL */
};

#define AUTOFS_IOCTL 0x93

enum {
        AUTOFS_IOC_READY_CMD = 0x60,
        AUTOFS_IOC_FAIL_CMD,
        AUTOFS_IOC_CATATONIC_CMD,
        AUTOFS_IOC_PROTOVER_CMD,
        AUTOFS_IOC_SETTIMEOUT_CMD,
        AUTOFS_IOC_EXPIRE_CMD,
};

/* Requests without an argument. */
#define AUTOFS_IOC_READY        _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD)
#define AUTOFS_IOC_FAIL         _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD)
#define AUTOFS_IOC_CATATONIC    _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD)
/* Reads an int from the kernel. */
#define AUTOFS_IOC_PROTOVER     _IOR(AUTOFS_IOCTL, \
                                     AUTOFS_IOC_PROTOVER_CMD, int)
/*
 * Same command number as AUTOFS_IOC_SETTIMEOUT but encoded with a
 * compat_ulong_t argument.
 * NOTE(review): compat_ulong_t is not defined by this header, so this macro
 * presumably only expands cleanly in kernel code - confirm.
 */
#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, \
                                      AUTOFS_IOC_SETTIMEOUT_CMD, \
                                      compat_ulong_t)
/* Read/write request with an unsigned long argument. */
#define AUTOFS_IOC_SETTIMEOUT   _IOWR(AUTOFS_IOCTL, \
                                      AUTOFS_IOC_SETTIMEOUT_CMD, \
                                      unsigned long)
/* Reads a struct autofs_packet_expire from the kernel. */
#define AUTOFS_IOC_EXPIRE       _IOR(AUTOFS_IOCTL, \
                                     AUTOFS_IOC_EXPIRE_CMD, \
                                     struct autofs_packet_expire)

/* autofs version 4 and later definitions */

/*
 * Mask for expire behaviour.  The non-zero values are distinct bits.
 * NOTE(review): per the kernel autofs documentation IMMEDIATE ignores the
 * last-used time, LEAVES selects a leaf rather than a top-level name and
 * FORCED expires even entries that are in use - confirm against that
 * document before relying on it.
 */
#define AUTOFS_EXP_NORMAL                0x00
#define AUTOFS_EXP_IMMEDIATE                0x01
#define AUTOFS_EXP_LEAVES                0x02
#define AUTOFS_EXP_FORCED                0x04

/* Mount type values stored and tested by the helpers below. */
#define AUTOFS_TYPE_ANY                        0U
#define AUTOFS_TYPE_INDIRECT                1U
#define AUTOFS_TYPE_DIRECT                2U
#define AUTOFS_TYPE_OFFSET                4U

/* Overwrite *type with AUTOFS_TYPE_INDIRECT. */
static inline void set_autofs_type_indirect(unsigned int *type)
{
        *type = AUTOFS_TYPE_INDIRECT;
}

/* Return 1 if type is exactly AUTOFS_TYPE_INDIRECT, 0 otherwise. */
static inline unsigned int autofs_type_indirect(unsigned int type)
{
        return type == AUTOFS_TYPE_INDIRECT ? 1U : 0U;
}

/* Overwrite *type with AUTOFS_TYPE_DIRECT. */
static inline void set_autofs_type_direct(unsigned int *type)
{
        *type = AUTOFS_TYPE_DIRECT;
}

/* Return 1 if type is exactly AUTOFS_TYPE_DIRECT, 0 otherwise. */
static inline unsigned int autofs_type_direct(unsigned int type)
{
        return type == AUTOFS_TYPE_DIRECT ? 1U : 0U;
}

/* Overwrite *type with AUTOFS_TYPE_OFFSET. */
static inline void set_autofs_type_offset(unsigned int *type)
{
        *type = AUTOFS_TYPE_OFFSET;
}

/* Return 1 if type is exactly AUTOFS_TYPE_OFFSET, 0 otherwise. */
static inline unsigned int autofs_type_offset(unsigned int type)
{
        return type == AUTOFS_TYPE_OFFSET ? 1U : 0U;
}

/*
 * Return 1 if type is exactly AUTOFS_TYPE_DIRECT or exactly
 * AUTOFS_TYPE_OFFSET, 0 otherwise (a combination of both bits is rejected).
 */
static inline unsigned int autofs_type_trigger(unsigned int type)
{
        if (type == AUTOFS_TYPE_OFFSET)
                return 1U;
        return type == AUTOFS_TYPE_DIRECT ? 1U : 0U;
}

/*
 * AUTOFS_TYPE_ANY is not a real mount type: it means "no type set" and is
 * used to ask the autofs_dev_ioctl_ismountpoint() device ioctl function to
 * search for "any" mount.
 */
static inline void set_autofs_type_any(unsigned int *type)
{
        *type = AUTOFS_TYPE_ANY;
}

/* Return 1 if type is exactly AUTOFS_TYPE_ANY, 0 otherwise. */
static inline unsigned int autofs_type_any(unsigned int type)
{
        return type == AUTOFS_TYPE_ANY ? 1U : 0U;
}

/* Daemon notification packet types */
enum autofs_notify {
        NFY_NONE,        /* 0 */
        NFY_MOUNT,        /* 1 */
        NFY_EXPIRE        /* 2 */
};

/* Kernel protocol version 4 packet types */

/* Expire entry (umount request) */
#define autofs_ptype_expire_multi        2

/* Kernel protocol version 5 packet types */

/* Indirect mount missing and expire requests. */
#define autofs_ptype_missing_indirect        3
#define autofs_ptype_expire_indirect        4

/* Direct mount missing and expire requests */
#define autofs_ptype_missing_direct        5
#define autofs_ptype_expire_direct        6

/*
 * v4 multi expire (via pipe).  Has the same members, in the same order, as
 * struct autofs_packet_missing.
 */
struct autofs_packet_expire_multi {
        struct autofs_packet_hdr hdr;        /* common packet header */
        autofs_wqt_t wait_queue_token;        /* token passed back via ioctl */
        int len;        /* NOTE(review): presumably the length of name - confirm */
        char name[NAME_MAX+1];        /* room for NAME_MAX bytes plus a NUL */
};

/*
 * Overlay of the pre-v5 packet layouts.  Every member begins with a
 * struct autofs_packet_hdr, so hdr can be inspected before picking the
 * member to use.
 */
union autofs_packet_union {
        struct autofs_packet_hdr hdr;
        struct autofs_packet_missing missing;
        struct autofs_packet_expire expire;
        struct autofs_packet_expire_multi expire_multi;
};

/*
 * autofs v5 common packet struct, shared by all four v5 packet types through
 * the typedefs that follow it.  The layout is part of the userspace ABI.
 */
struct autofs_v5_packet {
        struct autofs_packet_hdr hdr;        /* common packet header */
        autofs_wqt_t wait_queue_token;        /* token passed back via ioctl */
        /* NOTE(review): dev/ino presumably identify the autofs mount or
         * trigger the request is about - confirm against the sender */
        __u32 dev;
        __u64 ino;
        /* NOTE(review): presumably the ids of the process that caused the
         * request - confirm against the sender */
        __u32 uid;
        __u32 gid;
        __u32 pid;
        __u32 tgid;
        __u32 len;        /* NOTE(review): presumably the length of name - confirm */
        char name[NAME_MAX+1];        /* room for NAME_MAX bytes plus a NUL */
};

/* All four v5 packet types are aliases of the same structure. */
typedef struct autofs_v5_packet autofs_packet_missing_indirect_t;
typedef struct autofs_v5_packet autofs_packet_expire_indirect_t;
typedef struct autofs_v5_packet autofs_packet_missing_direct_t;
typedef struct autofs_v5_packet autofs_packet_expire_direct_t;

/*
 * Overlay of the v5 packet layouts.  Apart from hdr every member is a
 * struct autofs_v5_packet, so the member names only document which packet
 * type is being handled.
 */
union autofs_v5_packet_union {
        struct autofs_packet_hdr hdr;
        struct autofs_v5_packet v5_packet;
        autofs_packet_missing_indirect_t missing_indirect;
        autofs_packet_expire_indirect_t expire_indirect;
        autofs_packet_missing_direct_t missing_direct;
        autofs_packet_expire_direct_t expire_direct;
};

/* ioctl command numbers added for protocol version 4 and later */
enum {
        AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */
        AUTOFS_IOC_PROTOSUBVER_CMD,        /* 0x67 */
        AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */
};

/* Passes an int to the kernel. */
#define AUTOFS_IOC_EXPIRE_MULTI                _IOW(AUTOFS_IOCTL, \
                                             AUTOFS_IOC_EXPIRE_MULTI_CMD, int)
/* Reads an int from the kernel. */
#define AUTOFS_IOC_PROTOSUBVER                _IOR(AUTOFS_IOCTL, \
                                             AUTOFS_IOC_PROTOSUBVER_CMD, int)
/* Reads an int from the kernel. */
#define AUTOFS_IOC_ASKUMOUNT                _IOR(AUTOFS_IOCTL, \
                                             AUTOFS_IOC_ASKUMOUNT_CMD, int)

#endif /* _UAPI_LINUX_AUTO_FS_H */




































    1 

    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>

int delayacct_on __read_mostly = 1;        /* Delay accounting turned on/off */
EXPORT_SYMBOL_GPL(delayacct_on);
struct kmem_cache *delayacct_cache;

/*
 * Handler for the "nodelayacct" boot parameter: switches delay accounting
 * off by clearing delayacct_on.  @str is not used.  Always returns 1.
 */
static int __init delayacct_setup_disable(char *str)
{
        delayacct_on = 0;
        return 1;
}
__setup("nodelayacct", delayacct_setup_disable);

/*
 * Create the cache for struct task_delay_info objects and then set up delay
 * accounting for init_task.
 */
void delayacct_init(void)
{
        /* the result is used without a NULL check
         * NOTE(review): presumably SLAB_PANIC makes a failed creation fatal -
         * confirm */
        delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
        /* must come after the cache has been created
         * NOTE(review): presumably ends up in __delayacct_tsk_init(), which
         * allocates from delayacct_cache - confirm in the header */
        delayacct_tsk_init(&init_task);
}

/*
 * Allocate a zeroed task_delay_info for @tsk from delayacct_cache and
 * initialise its lock.  On allocation failure tsk->delays is left NULL.
 */
void __delayacct_tsk_init(struct task_struct *tsk)
{
        tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
        if (!tsk->delays)
                return;

        raw_spin_lock_init(&tsk->delays->lock);
}

/*
 * Finish delay accounting for a statistic using its timestamp (@start),
 * accumulator (@total) and @count.
 *
 * The time elapsed since *@start is added to *@total and *@count is bumped,
 * both under @lock with interrupts disabled.  A non-positive elapsed time
 * leaves the statistic untouched.
 */
static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
                          u32 *count)
{
        unsigned long irq_state;
        s64 elapsed = ktime_get_ns() - *start;

        if (elapsed <= 0)
                return;

        raw_spin_lock_irqsave(lock, irq_state);
        *total += elapsed;
        *count += 1;
        raw_spin_unlock_irqrestore(lock, irq_state);
}

/*
 * Record the current time as the start of a block I/O wait for the current
 * task.  current->delays is dereferenced without a NULL check.
 */
void __delayacct_blkio_start(void)
{
        current->delays->blkio_start = ktime_get_ns();
}

/*
 * Close the block I/O wait that was opened for @p by recording
 * blkio_start.  The elapsed time is charged to the swapin statistic when
 * DELAYACCT_PF_SWAPIN is set in the task's delay flags and to the blkio
 * statistic otherwise.
 *
 * The task is passed in explicitly because the `current` macro cannot be
 * relied on here: we have not yet switched back to the process being woken.
 */
void __delayacct_blkio_end(struct task_struct *p)
{
        struct task_delay_info *info = p->delays;

        if (info->flags & DELAYACCT_PF_SWAPIN)
                delayacct_end(&info->lock, &info->blkio_start,
                              &info->swapin_delay, &info->swapin_count);
        else
                delayacct_end(&info->lock, &info->blkio_start,
                              &info->blkio_delay, &info->blkio_count);
}

/*
 * Add the CPU time and delay statistics of @tsk to the taskstats record @d.
 *
 * Every *_total field is updated through a temporary: if the sum compares
 * lower than the previous total it is treated as an overflow and the total
 * is reset to 0.  The *_count fields are added without such a check.
 * Always returns 0.
 */
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
        u64 utime, stime, stimescaled, utimescaled;
        unsigned long long t2, t3;
        unsigned long flags, t1;
        s64 tmp;

        /* user + system time as reported by task_cputime() */
        task_cputime(tsk, &utime, &stime);
        tmp = (s64)d->cpu_run_real_total;
        tmp += utime + stime;
        d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;

        /* same for the scaled times from task_cputime_scaled() */
        task_cputime_scaled(tsk, &utimescaled, &stimescaled);
        tmp = (s64)d->cpu_scaled_run_real_total;
        tmp += utimescaled + stimescaled;
        d->cpu_scaled_run_real_total =
                (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;

        /*
         * No locking available for sched_info (and too expensive to add one)
         * Mitigate by taking snapshot of values
         */
        t1 = tsk->sched_info.pcount;
        t2 = tsk->sched_info.run_delay;
        t3 = tsk->se.sum_exec_runtime;

        d->cpu_count += t1;

        tmp = (s64)d->cpu_delay_total + t2;
        d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;

        tmp = (s64)d->cpu_run_virtual_total + t3;
        d->cpu_run_virtual_total =
                (tmp < (s64)d->cpu_run_virtual_total) ?        0 : tmp;

        /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */

        /* the per-task delay totals and counts are read under the task's
         * delay lock
         * NOTE(review): unlike the checks above, these comparisons have no
         * (s64) cast on the taskstats field - presumably the fields are
         * unsigned so the comparison is done unsigned; confirm */
        raw_spin_lock_irqsave(&tsk->delays->lock, flags);
        tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
        d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
        tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
        d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
        tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
        d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
        tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
        d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
        d->blkio_count += tsk->delays->blkio_count;
        d->swapin_count += tsk->delays->swapin_count;
        d->freepages_count += tsk->delays->freepages_count;
        d->thrashing_count += tsk->delays->thrashing_count;
        raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);

        return 0;
}

/*
 * Return the combined block I/O and swapin delay of @tsk, converted with
 * nsec_to_clock_t().  Both values are read under the task's delay lock.
 */
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
{
        unsigned long irq_state;
        __u64 ticks;

        raw_spin_lock_irqsave(&tsk->delays->lock, irq_state);
        ticks = nsec_to_clock_t(tsk->delays->swapin_delay +
                                tsk->delays->blkio_delay);
        raw_spin_unlock_irqrestore(&tsk->delays->lock, irq_state);

        return ticks;
}

void __delayacct_freepages_start(void)
{
        current->delays->freepages_start = ktime_get_ns();
}

/*
 * Close the interval opened by __delayacct_freepages_start(): pass the
 * current task's lock, start stamp, delay total and event count for the
 * freepages statistic to delayacct_end().
 */
void __delayacct_freepages_end(void)
{
        delayacct_end(&current->delays->lock,
                      &current->delays->freepages_start,
                      &current->delays->freepages_delay,
                      &current->delays->freepages_count);
}

void __delayacct_thrashing_start(void)
{
        current->delays->thrashing_start = ktime_get_ns();
}

/*
 * Close the interval opened by __delayacct_thrashing_start(): pass the
 * current task's lock, start stamp, delay total and event count for the
 * thrashing statistic to delayacct_end().
 */
void __delayacct_thrashing_end(void)
{
        delayacct_end(
                &current->delays->lock,
                &current->delays->thrashing_start,
                &current->delays->thrashing_delay,
                &current->delays->thrashing_count);
}





























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/stat.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/compat.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include "internal.h"
#include "mount.h"

/**
 * generic_fillattr - Fill in the basic attributes from the inode struct
 * @inode: Inode to use as the source
 * @stat: Where to fill in the attributes
 *
 * Copy the basic attributes held on the VFS inode into the kstat structure.
 * vfs_getattr_nosec() falls back to this when the inode supplies no getattr
 * operation.
 */
void generic_fillattr(struct inode *inode, struct kstat *stat)
{
        /* Identity: containing device, inode number, device node id */
        stat->dev = inode->i_sb->s_dev;
        stat->ino = inode->i_ino;
        stat->rdev = inode->i_rdev;

        /* Mode, link count and ownership */
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;
        stat->uid = inode->i_uid;
        stat->gid = inode->i_gid;

        /* Timestamps */
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;

        /* Size and block accounting */
        stat->size = i_size_read(inode);
        stat->blocks = inode->i_blocks;
        stat->blksize = i_blocksize(inode);
}
EXPORT_SYMBOL(generic_fillattr);

/**
 * vfs_getattr_nosec - getattr without security checks
 * @path: file to get attributes from
 * @stat: structure to return attributes in
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Get attributes without calling security_inode_getattr.
 *
 * @stat is zeroed first, then seeded with the default result mask and the
 * attribute flags the VFS itself can determine.  The inode's getattr
 * operation is used when present; otherwise generic_fillattr() supplies the
 * values.  Only the AT_STATX_SYNC_TYPE bits of @query_flags are passed on.
 *
 * Currently the only caller other than vfs_getattr is internal to the
 * filehandle lookup code, which uses only the inode number and returns no
 * attributes to any user.  Any other code probably wants vfs_getattr.
 */
int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_backing_inode(path->dentry);

        memset(stat, 0, sizeof(*stat));

        /*
         * Defaults; the fs is allowed to override these if it really wants
         * to.  SB_NOATIME means the filesystem supplies a dummy atime value,
         * so atime is not advertised in that case.
         */
        stat->result_mask = STATX_BASIC_STATS;
        if (inode->i_sb->s_flags & SB_NOATIME)
                stat->result_mask &= ~STATX_ATIME;

        /*
         * Note: If you add another clause to set an attribute flag, please
         * update attributes_mask below.
         */
        if (IS_AUTOMOUNT(inode))
                stat->attributes |= STATX_ATTR_AUTOMOUNT;
        if (IS_DAX(inode))
                stat->attributes |= STATX_ATTR_DAX;
        stat->attributes_mask = STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX;

        if (!inode->i_op->getattr) {
                generic_fillattr(inode, stat);
                return 0;
        }

        return inode->i_op->getattr(path, stat, request_mask,
                                    query_flags & AT_STATX_SYNC_TYPE);
}
EXPORT_SYMBOL(vfs_getattr_nosec);

/**
 * vfs_getattr - Get the enhanced basic attributes of a file
 * @path: The file of interest
 * @stat: Where to return the statistics
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Ask the filesystem for a file's attributes, after security_inode_getattr()
 * has approved the request.  The caller must indicate in request_mask and
 * query_flags what they want.
 *
 * If the file is remote, the filesystem can be forced to update the attributes
 * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can
 * suppress the update by passing AT_STATX_DONT_SYNC.
 *
 * Bits must have been set in request_mask to indicate which attributes the
 * caller wants retrieving.  Any such attribute not requested may be returned
 * anyway, but the value may be approximate, and, if remote, may not have been
 * synchronised with the server.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.  If
 * the security check fails, @stat is left untouched.
 */
int vfs_getattr(const struct path *path, struct kstat *stat,
                u32 request_mask, unsigned int query_flags)
{
        int err = security_inode_getattr(path);

        if (!err)
                err = vfs_getattr_nosec(path, stat, request_mask, query_flags);

        return err;
}
EXPORT_SYMBOL(vfs_getattr);

/**
 * vfs_fstat - Get the basic attributes by file descriptor
 * @fd: The file descriptor referring to the file of interest
 * @stat: The result structure to fill in.
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a file descriptor to determine the file location.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
int vfs_fstat(int fd, struct kstat *stat)
{
        struct fd f;
        int error;

        f = fdget_raw(fd);
        if (!f.file)
                return -EBADF;
        error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0);
        fdput(f);
        return error;
}

/**
 * vfs_statx - Get basic and extra attributes by filename
 * @dfd: A file descriptor representing the base dir for a relative filename
 * @filename: The name of the file of interest
 * @flags: Flags to control the query
 * @stat: The result structure to fill in.
 * @request_mask: STATX_xxx flags indicating what the caller wants
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a filename and base directory to determine the file location.
 * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
 * at the given name from being referenced.
 *
 * On success the mount ID and the STATX_ATTR_MOUNT_ROOT attribute are added
 * to the result.  On failure @stat is not annotated and its contents must
 * not be relied upon.
 *
 * -EINVAL is returned for unsupported bits in @flags.  The lookup is retried
 * once with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
static int vfs_statx(int dfd, const char __user *filename, int flags,
              struct kstat *stat, u32 request_mask)
{
        struct path path;
        unsigned lookup_flags = 0;
        int error;

        if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
                      AT_STATX_SYNC_TYPE))
                return -EINVAL;

        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                goto out;

        error = vfs_getattr(&path, stat, request_mask, flags);
        if (!error) {
                /*
                 * Only annotate a kstat that vfs_getattr() actually filled
                 * in.  If the security hook rejected the request, *stat was
                 * never initialised and OR-ing flags into it would operate
                 * on whatever the caller's buffer happened to contain.
                 */
                stat->mnt_id = real_mount(path.mnt)->mnt_id;
                stat->result_mask |= STATX_MNT_ID;
                if (path.mnt->mnt_root == path.dentry)
                        stat->attributes |= STATX_ATTR_MOUNT_ROOT;
                stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
        }
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        return error;
}

/*
 * Basic-stats lookup by name relative to @dfd: forwards to vfs_statx() with
 * AT_NO_AUTOMOUNT always added to @flags and STATX_BASIC_STATS requested.
 */
int vfs_fstatat(int dfd, const char __user *filename,
                              struct kstat *stat, int flags)
{
        const int statx_flags = flags | AT_NO_AUTOMOUNT;

        return vfs_statx(dfd, filename, statx_flags, stat, STATX_BASIC_STATS);
}

#ifdef __ARCH_WANT_OLD_STAT

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 *
 * Convert @stat into the old struct __old_kernel_stat layout and copy it to
 * @statbuf.  A warning naming the calling task is printed for a limited
 * number of calls (warncount starts at 5).  Returns 0 on success, -EOVERFLOW
 * if the inode number, link count or (on 32-bit) size does not fit, and
 * -EFAULT if the copy to userspace fails.
 */
static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
{
        static int warncount = 5;
        struct __old_kernel_stat old;

        if (warncount < 0) {
                /* it's laughable, but... */
                warncount = 0;
        } else if (warncount > 0) {
                warncount--;
                printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
                        current->comm);
        }

        memset(&old, 0, sizeof(old));

        old.st_dev = old_encode_dev(stat->dev);

        /* Detect truncation by comparing the stored value with the source. */
        old.st_ino = stat->ino;
        if (sizeof(old.st_ino) < sizeof(stat->ino) && old.st_ino != stat->ino)
                return -EOVERFLOW;

        old.st_mode = stat->mode;

        old.st_nlink = stat->nlink;
        if (old.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(old.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(old.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        old.st_rdev = old_encode_dev(stat->rdev);

#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif
        old.st_size = stat->size;

        /* The old layout carries whole seconds only. */
        old.st_atime = stat->atime.tv_sec;
        old.st_mtime = stat->mtime.tv_sec;
        old.st_ctime = stat->ctime.tv_sec;

        if (copy_to_user(statbuf, &old, sizeof(old)))
                return -EFAULT;
        return 0;
}

/*
 * Old-format stat syscall: fetch the attributes of @filename with vfs_stat()
 * and, on success, hand them to userspace through cp_old_stat().
 */
SYSCALL_DEFINE2(stat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_stat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/*
 * Old-format lstat syscall: fetch the attributes of @filename with
 * vfs_lstat() and, on success, hand them to userspace through cp_old_stat().
 */
SYSCALL_DEFINE2(lstat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/*
 * Old-format fstat syscall: fetch the attributes behind @fd with vfs_fstat()
 * and, on success, hand them to userspace through cp_old_stat().
 */
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_old_stat(&stat, statbuf);
}

#endif /* __ARCH_WANT_OLD_STAT */

#ifdef __ARCH_WANT_NEW_STAT

/*
 * choose_32_64(a, b) expands to its first argument when BITS_PER_LONG is 32
 * and to its second argument otherwise.
 */
#if BITS_PER_LONG == 32
#  define choose_32_64(a,b) a
#else
#  define choose_32_64(a,b) b
#endif

/*
 * Fallback used by cp_new_stat() to prepare its on-stack struct stat before
 * the fields are filled in: clear the entire structure.  It is only defined
 * here when nothing earlier has supplied its own INIT_STRUCT_STAT_PADDING.
 */
#ifndef INIT_STRUCT_STAT_PADDING
#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * Convert @stat into the userspace struct stat layout and copy it to
 * @statbuf.  Returns 0 on success, -EOVERFLOW when a device number, the
 * inode number, the link count or (on 32-bit) the size cannot be
 * represented, and -EFAULT if the copy to userspace fails.
 */
static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
        struct stat out;

        /* Narrow device fields can only hold old-style device numbers. */
        if ((sizeof(out.st_dev) < 4 && !old_valid_dev(stat->dev)) ||
            (sizeof(out.st_rdev) < 4 && !old_valid_dev(stat->rdev)))
                return -EOVERFLOW;
#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif

        /* Must run before any field is written. */
        INIT_STRUCT_STAT_PADDING(out);

        out.st_dev = new_encode_dev(stat->dev);
        out.st_rdev = new_encode_dev(stat->rdev);

        /* Detect truncation by comparing the stored value with the source. */
        out.st_ino = stat->ino;
        if (sizeof(out.st_ino) < sizeof(stat->ino) && out.st_ino != stat->ino)
                return -EOVERFLOW;

        out.st_nlink = stat->nlink;
        if (out.st_nlink != stat->nlink)
                return -EOVERFLOW;

        out.st_mode = stat->mode;
        SET_UID(out.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(out.st_gid, from_kgid_munged(current_user_ns(), stat->gid));

        out.st_size = stat->size;
        out.st_blocks = stat->blocks;
        out.st_blksize = stat->blksize;

        out.st_atime = stat->atime.tv_sec;
        out.st_mtime = stat->mtime.tv_sec;
        out.st_ctime = stat->ctime.tv_sec;
#ifdef STAT_HAVE_NSEC
        out.st_atime_nsec = stat->atime.tv_nsec;
        out.st_mtime_nsec = stat->mtime.tv_nsec;
        out.st_ctime_nsec = stat->ctime.tv_nsec;
#endif

        if (copy_to_user(statbuf, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * newstat syscall: fetch the attributes of @filename with vfs_stat() and,
 * on success, copy them out in struct stat form via cp_new_stat().
 */
SYSCALL_DEFINE2(newstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

/*
 * newlstat syscall: fetch the attributes of @filename with vfs_lstat() and,
 * on success, copy them out in struct stat form via cp_new_stat().
 */
SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
/*
 * newfstatat syscall: fetch the attributes of @filename relative to @dfd
 * with vfs_fstatat() and, on success, copy them out via cp_new_stat().
 */
SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
                struct stat __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}
#endif

/*
 * newfstat syscall: fetch the attributes behind @fd with vfs_fstat() and,
 * on success, copy them out in struct stat form via cp_new_stat().
 */
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_new_stat(&stat, statbuf);
}
#endif

/*
 * Common implementation of readlink() and readlinkat().
 *
 * Resolves @pathname relative to @dfd (an empty pathname is accepted by the
 * lookup) and, when the target is a symlink or its inode provides a readlink
 * operation, copies the link contents into @buf via vfs_readlink().
 *
 * Returns the value of vfs_readlink() on success, -EINVAL when @bufsiz is not
 * positive or the target cannot be read as a link, -ENOENT when the lookup
 * reported an empty pathname for such a target, or the error from the lookup
 * or security hook.  The lookup is retried with LOOKUP_REVAL when
 * retry_estale() asks for it.
 */
static int do_readlinkat(int dfd, const char __user *pathname,
                         char __user *buf, int bufsiz)
{
        unsigned int lookup_flags = LOOKUP_EMPTY;
        struct inode *inode;
        struct path path;
        int empty = 0;
        int error;

        if (bufsiz <= 0)
                return -EINVAL;

retry:
        error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
        if (error)
                return error;

        inode = d_backing_inode(path.dentry);

        /* Result if the target turns out not to be readable as a link. */
        error = empty ? -ENOENT : -EINVAL;

        /*
         * AFS mountpoints allow readlink(2) but are not symlinks
         */
        if (d_is_symlink(path.dentry) || inode->i_op->readlink) {
                error = security_inode_readlink(path.dentry);
                if (!error) {
                        touch_atime(&path);
                        error = vfs_readlink(path.dentry, buf, bufsiz);
                }
        }
        path_put(&path);

        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

        return error;
}

/* readlinkat syscall: thin entry point over do_readlinkat(). */
SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
                char __user *, buf, int, bufsiz)
{
        int ret = do_readlinkat(dfd, pathname, buf, bufsiz);

        return ret;
}

/*
 * readlink syscall: same as readlinkat with AT_FDCWD as the directory
 * descriptor.
 */
SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
                int, bufsiz)
{
        int ret = do_readlinkat(AT_FDCWD, path, buf, bufsiz);

        return ret;
}


/* ---------- LFS-64 ----------- */
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)

/*
 * Fallback used by cp_new_stat64() to prepare its on-stack struct stat64
 * before the fields are filled in: clear the entire structure.  It is only
 * defined here when nothing earlier has supplied its own
 * INIT_STRUCT_STAT64_PADDING.
 */
#ifndef INIT_STRUCT_STAT64_PADDING
#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * Convert @stat into the userspace struct stat64 layout and copy it to
 * @statbuf.  Returns 0 on success, -EOVERFLOW if the inode number does not
 * fit in st_ino, and -EFAULT if the copy to userspace fails.
 */
static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
{
        struct stat64 out;

        /* Must run before any field is written. */
        INIT_STRUCT_STAT64_PADDING(out);

#ifdef CONFIG_MIPS
        /* mips has weird padding, so we don't get 64 bits there */
        out.st_dev = new_encode_dev(stat->dev);
        out.st_rdev = new_encode_dev(stat->rdev);
#else
        out.st_dev = huge_encode_dev(stat->dev);
        out.st_rdev = huge_encode_dev(stat->rdev);
#endif

        /* Detect truncation by comparing the stored value with the source. */
        out.st_ino = stat->ino;
        if (sizeof(out.st_ino) < sizeof(stat->ino) && out.st_ino != stat->ino)
                return -EOVERFLOW;
#ifdef STAT64_HAS_BROKEN_ST_INO
        out.__st_ino = stat->ino;
#endif

        out.st_mode = stat->mode;
        out.st_nlink = stat->nlink;
        out.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
        out.st_gid = from_kgid_munged(current_user_ns(), stat->gid);

        out.st_size = stat->size;
        out.st_blocks = stat->blocks;
        out.st_blksize = stat->blksize;

        out.st_atime = stat->atime.tv_sec;
        out.st_atime_nsec = stat->atime.tv_nsec;
        out.st_mtime = stat->mtime.tv_sec;
        out.st_mtime_nsec = stat->mtime.tv_nsec;
        out.st_ctime = stat->ctime.tv_sec;
        out.st_ctime_nsec = stat->ctime.tv_nsec;

        if (copy_to_user(statbuf, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * stat64 syscall: fetch the attributes of @filename with vfs_stat() and, on
 * success, copy them out in struct stat64 form via cp_new_stat64().
 */
SYSCALL_DEFINE2(stat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (!error)
                error = cp_new_stat64(&stat, statbuf);

        return error;
}

/*
 * lstat64 syscall: fetch the attributes of @filename with vfs_lstat() and,
 * on success, copy them out in struct stat64 form via cp_new_stat64().
 */
SYSCALL_DEFINE2(lstat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_lstat(filename, &stat);
        if (!error)
                error = cp_new_stat64(&stat, statbuf);

        return error;
}

/*
 * fstat64 syscall: fetch the attributes behind @fd with vfs_fstat() and, on
 * success, copy them out in struct stat64 form via cp_new_stat64().
 */
SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (!error)
                error = cp_new_stat64(&stat, statbuf);

        return error;
}

/*
 * fstatat64 syscall: fetch the attributes of @filename relative to @dfd with
 * vfs_fstatat() and, on success, copy them out via cp_new_stat64().
 */
SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
                struct stat64 __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (error)
                return error;

        return cp_new_stat64(&stat, statbuf);
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */

/*
 * Translate @stat into a struct statx and copy it to @buffer.  The local
 * copy is fully zeroed with memset() before the known fields are set, so
 * anything not assigned below reaches userspace as zero.  Returns 0 on
 * success or -EFAULT if the copy to userspace fails.
 */
static noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
        struct statx out;

        memset(&out, 0, sizeof(out));

        /* What is being reported */
        out.stx_mask = stat->result_mask;
        out.stx_attributes = stat->attributes;
        out.stx_attributes_mask = stat->attributes_mask;

        /* Identity, ownership and size */
        out.stx_ino = stat->ino;
        out.stx_mode = stat->mode;
        out.stx_nlink = stat->nlink;
        out.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
        out.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
        out.stx_size = stat->size;
        out.stx_blocks = stat->blocks;
        out.stx_blksize = stat->blksize;

        /* Timestamps */
        out.stx_atime.tv_sec = stat->atime.tv_sec;
        out.stx_atime.tv_nsec = stat->atime.tv_nsec;
        out.stx_btime.tv_sec = stat->btime.tv_sec;
        out.stx_btime.tv_nsec = stat->btime.tv_nsec;
        out.stx_ctime.tv_sec = stat->ctime.tv_sec;
        out.stx_ctime.tv_nsec = stat->ctime.tv_nsec;
        out.stx_mtime.tv_sec = stat->mtime.tv_sec;
        out.stx_mtime.tv_nsec = stat->mtime.tv_nsec;

        /* Device numbers, split into major/minor, and the mount ID */
        out.stx_dev_major = MAJOR(stat->dev);
        out.stx_dev_minor = MINOR(stat->dev);
        out.stx_rdev_major = MAJOR(stat->rdev);
        out.stx_rdev_minor = MINOR(stat->rdev);
        out.stx_mnt_id = stat->mnt_id;

        if (copy_to_user(buffer, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * Backend of the statx syscall.  Rejects a @mask containing STATX__RESERVED
 * and @flags with every AT_STATX_SYNC_TYPE bit set (both -EINVAL), then
 * gathers the attributes with vfs_statx() and copies them to @buffer with
 * cp_statx().
 */
int do_statx(int dfd, const char __user *filename, unsigned flags,
             unsigned int mask, struct statx __user *buffer)
{
        struct kstat stat;
        int error;

        if ((mask & STATX__RESERVED) ||
            (flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
                return -EINVAL;

        error = vfs_statx(dfd, filename, flags, &stat, mask);
        if (!error)
                error = cp_statx(&stat, buffer);

        return error;
}

/**
 * sys_statx - System call to get enhanced stats
 * @dfd: Base directory to pathwalk from *or* fd to stat.
 * @filename: File to stat or "" with AT_EMPTY_PATH
 * @flags: AT_* flags to control pathwalk.
 * @mask: Parts of statx struct actually required.
 * @buffer: Result buffer.
 *
 * All of the work is done by do_statx().
 *
 * Note that fstat() can be emulated by setting dfd to the fd of interest,
 * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
 */
SYSCALL_DEFINE5(statx,
                int, dfd, const char __user *, filename, unsigned, flags,
                unsigned int, mask,
                struct statx __user *, buffer)
{
        int ret = do_statx(dfd, filename, flags, mask, buffer);

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Convert @stat into the struct compat_stat layout and copy it to @ubuf.
 * Returns 0 on success, -EOVERFLOW when a device number, the inode number,
 * the link count or the size cannot be represented, and -EFAULT if the copy
 * to userspace fails.
 */
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
        struct compat_stat out;

        /* Narrow device fields can only hold old-style device numbers. */
        if ((sizeof(out.st_dev) < 4 && !old_valid_dev(stat->dev)) ||
            (sizeof(out.st_rdev) < 4 && !old_valid_dev(stat->rdev)))
                return -EOVERFLOW;

        memset(&out, 0, sizeof(out));

        out.st_dev = new_encode_dev(stat->dev);

        /* Detect truncation by comparing the stored value with the source. */
        out.st_ino = stat->ino;
        if (sizeof(out.st_ino) < sizeof(stat->ino) && out.st_ino != stat->ino)
                return -EOVERFLOW;

        out.st_mode = stat->mode;

        out.st_nlink = stat->nlink;
        if (out.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(out.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(out.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        out.st_rdev = new_encode_dev(stat->rdev);

        if ((u64) stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
        out.st_size = stat->size;
        out.st_blocks = stat->blocks;
        out.st_blksize = stat->blksize;

        out.st_atime = stat->atime.tv_sec;
        out.st_atime_nsec = stat->atime.tv_nsec;
        out.st_mtime = stat->mtime.tv_sec;
        out.st_mtime_nsec = stat->mtime.tv_nsec;
        out.st_ctime = stat->ctime.tv_sec;
        out.st_ctime_nsec = stat->ctime.tv_nsec;

        if (copy_to_user(ubuf, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * Compat newstat syscall: fetch the attributes of @filename with vfs_stat()
 * and, on success, copy them out via cp_compat_stat().
 */
COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_stat(filename, &stat);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}

/*
 * Compat newlstat syscall: fetch the attributes of @filename with
 * vfs_lstat() and, on success, copy them out via cp_compat_stat().
 */
COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}

#ifndef __ARCH_WANT_STAT64
/*
 * Compat newfstatat syscall: fetch the attributes of @filename relative to
 * @dfd with vfs_fstatat() and, on success, copy them out via
 * cp_compat_stat().
 */
COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
                       const char __user *, filename,
                       struct compat_stat __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}
#endif

/*
 * Compat newfstat syscall: fetch the attributes behind @fd with vfs_fstat()
 * and, on success, copy them out via cp_compat_stat().
 */
COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_compat_stat(&stat, statbuf);
}
#endif

/*
 * Add @bytes to the inode's usage, which is kept as a count of 512-byte
 * blocks (i_blocks) plus a remainder below 512 (i_bytes).
 *
 * Caller is here responsible for sufficient locking (ie. inode->i_lock)
 */
void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
        const loff_t whole_blocks = bytes >> 9;
        const loff_t leftover = bytes & 511;

        inode->i_blocks += whole_blocks;
        inode->i_bytes += leftover;

        /* Carry a completed block out of the remainder. */
        if (inode->i_bytes >= 512) {
                inode->i_bytes -= 512;
                inode->i_blocks++;
        }
}
EXPORT_SYMBOL(__inode_add_bytes);

/* Locked variant of __inode_add_bytes(): takes inode->i_lock around it. */
void inode_add_bytes(struct inode *inode, loff_t bytes)
{
        spinlock_t *lock = &inode->i_lock;

        spin_lock(lock);
        __inode_add_bytes(inode, bytes);
        spin_unlock(lock);
}

EXPORT_SYMBOL(inode_add_bytes);

/*
 * Subtract @bytes from the inode's usage, which is kept as a count of
 * 512-byte blocks (i_blocks) plus a remainder below 512 (i_bytes).  No
 * locking is done here.
 */
void __inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        const loff_t whole_blocks = bytes >> 9;
        const loff_t leftover = bytes & 511;

        inode->i_blocks -= whole_blocks;

        /* Borrow one block when the remainder alone cannot cover it. */
        if (inode->i_bytes < leftover) {
                inode->i_blocks--;
                inode->i_bytes += 512;
        }
        inode->i_bytes -= leftover;
}

EXPORT_SYMBOL(__inode_sub_bytes);

/* Locked variant of __inode_sub_bytes(): takes inode->i_lock around it. */
void inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        spinlock_t *lock = &inode->i_lock;

        spin_lock(lock);
        __inode_sub_bytes(inode, bytes);
        spin_unlock(lock);
}

EXPORT_SYMBOL(inode_sub_bytes);

/*
 * Return the value of __inode_get_bytes() for @inode, sampled while holding
 * inode->i_lock.
 */
loff_t inode_get_bytes(struct inode *inode)
{
        loff_t nbytes;

        spin_lock(&inode->i_lock);
        nbytes = __inode_get_bytes(inode);
        spin_unlock(&inode->i_lock);

        return nbytes;
}

EXPORT_SYMBOL(inode_get_bytes);

/*
 * Set the inode's usage to exactly @bytes, split into whole 512-byte blocks
 * (i_blocks) and the remainder (i_bytes).
 *
 * Caller is here responsible for sufficient locking (ie. inode->i_lock)
 */
void inode_set_bytes(struct inode *inode, loff_t bytes)
{
        const loff_t whole_blocks = bytes >> 9;
        const loff_t leftover = bytes & 511;

        inode->i_blocks = whole_blocks;
        inode->i_bytes = leftover;
}

EXPORT_SYMBOL(inode_set_bytes);



























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM power

#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_POWER_H

#include <linux/cpufreq.h>
#include <linux/ktime.h>
#include <linux/pm_qos.h>
#include <linux/tracepoint.h>
#include <linux/trace_events.h>

#define TPS(x)  tracepoint_string(x)

/*
 * Event class "cpu": stores a 32-bit state value and the id of the CPU it
 * refers to, and prints both as unsigned decimals.  Events built on this
 * class (cpu_idle below, cpu_frequency further down) share this record
 * layout and output format; only the meaning of the first argument differs.
 */
DECLARE_EVENT_CLASS(cpu,

        TP_PROTO(unsigned int state, unsigned int cpu_id),

        TP_ARGS(state, cpu_id),

        TP_STRUCT__entry(
                __field(        u32,                state                )
                __field(        u32,                cpu_id                )
        ),

        TP_fast_assign(
                __entry->state = state;
                __entry->cpu_id = cpu_id;
        ),

        TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
                  (unsigned long)__entry->cpu_id)
);

/*
 * cpu_idle: plain instance of the cpu class (no extra fields, class format).
 * NOTE(review): the meaning of @state is decided by the callers, which are
 * not visible in this header - presumably an idle-state index; confirm.
 */
DEFINE_EVENT(cpu, cpu_idle,

        TP_PROTO(unsigned int state, unsigned int cpu_id),

        TP_ARGS(state, cpu_id)
);

/*
 * powernv_throttle: records a chip id, a textual reason and a Pmax value.
 * The reason is stored in the record through __string()/__assign_str() and
 * read back with __get_str().  Output: "Chip <chip_id> Pmax <pmax> <reason>"
 * (note that Pmax is printed before the reason although it is stored last).
 */
TRACE_EVENT(powernv_throttle,

        TP_PROTO(int chip_id, const char *reason, int pmax),

        TP_ARGS(chip_id, reason, pmax),

        TP_STRUCT__entry(
                __field(int, chip_id)
                __string(reason, reason)
                __field(int, pmax)
        ),

        TP_fast_assign(
                __entry->chip_id = chip_id;
                __assign_str(reason, reason);
                __entry->pmax = pmax;
        ),

        TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
                  __entry->pmax, __get_str(reason))
);

/*
 * pstate_sample: records one sample made of nine caller-supplied values.
 * Six are stored as u32 (core_busy, scaled_busy, from, to, freq, io_boost)
 * and three as u64 (mperf, aperf, tsc).  Every value is copied verbatim and
 * printed as an unsigned decimal; the scaled_busy field appears in the
 * output under the shorter label "scaled".
 */
TRACE_EVENT(pstate_sample,

        TP_PROTO(u32 core_busy,
                u32 scaled_busy,
                u32 from,
                u32 to,
                u64 mperf,
                u64 aperf,
                u64 tsc,
                u32 freq,
                u32 io_boost
                ),

        TP_ARGS(core_busy,
                scaled_busy,
                from,
                to,
                mperf,
                aperf,
                tsc,
                freq,
                io_boost
                ),

        TP_STRUCT__entry(
                __field(u32, core_busy)
                __field(u32, scaled_busy)
                __field(u32, from)
                __field(u32, to)
                __field(u64, mperf)
                __field(u64, aperf)
                __field(u64, tsc)
                __field(u32, freq)
                __field(u32, io_boost)
                ),

        TP_fast_assign(
                __entry->core_busy = core_busy;
                __entry->scaled_busy = scaled_busy;
                __entry->from = from;
                __entry->to = to;
                __entry->mperf = mperf;
                __entry->aperf = aperf;
                __entry->tsc = tsc;
                __entry->freq = freq;
                __entry->io_boost = io_boost;
                ),

        TP_printk("core_busy=%lu scaled=%lu from=%lu to=%lu mperf=%llu aperf=%llu tsc=%llu freq=%lu io_boost=%lu",
                (unsigned long)__entry->core_busy,
                (unsigned long)__entry->scaled_busy,
                (unsigned long)__entry->from,
                (unsigned long)__entry->to,
                (unsigned long long)__entry->mperf,
                (unsigned long long)__entry->aperf,
                (unsigned long long)__entry->tsc,
                (unsigned long)__entry->freq,
                (unsigned long)__entry->io_boost
                )

);

/*
 * The body of this header is read again whenever TRACE_HEADER_MULTI_READ is
 * defined (see the guard at the top of the file).  The separate guard below
 * keeps the plain macro definition from being repeated on those extra passes.
 */
#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
#define _PWR_EVENT_AVOID_DOUBLE_DEFINING

/*
 * NOTE(review): not referenced anywhere in this header; presumably a special
 * "state" value that callers pass to one of the events - confirm at the
 * call sites.
 */
#define PWR_EVENT_EXIT -1
#endif

/*
 * pm_verb_symbolic - turn a PM_EVENT_* value into a lower-case name
 * @event: value to decode
 *
 * Expands to a __print_symbolic() table covering the eight PM_EVENT_* values
 * listed below.  Used by the TP_printk() of device_pm_callback_start.
 */
#define pm_verb_symbolic(event) \
        __print_symbolic(event, \
                { PM_EVENT_SUSPEND, "suspend" }, \
                { PM_EVENT_RESUME, "resume" }, \
                { PM_EVENT_FREEZE, "freeze" }, \
                { PM_EVENT_QUIESCE, "quiesce" }, \
                { PM_EVENT_HIBERNATE, "hibernate" }, \
                { PM_EVENT_THAW, "thaw" }, \
                { PM_EVENT_RESTORE, "restore" }, \
                { PM_EVENT_RECOVER, "recover" })

/*
 * cpu_frequency: instance of the cpu event class whose first argument is a
 * frequency.  Because the class layout is reused unchanged, the frequency is
 * stored in the class's "state" field and shows up as "state=" in the output.
 */
DEFINE_EVENT(cpu, cpu_frequency,

        TP_PROTO(unsigned int frequency, unsigned int cpu_id),

        TP_ARGS(frequency, cpu_id)
);

/*
 * cpu_frequency_limits: snapshots three members of a cpufreq policy -
 * policy->min, policy->max and policy->cpu - into u32 fields and prints them
 * as "min=<min> max=<max> cpu_id=<cpu>".  @policy is dereferenced without a
 * NULL check, so callers must pass a valid pointer.
 */
TRACE_EVENT(cpu_frequency_limits,

        TP_PROTO(struct cpufreq_policy *policy),

        TP_ARGS(policy),

        TP_STRUCT__entry(
                __field(u32, min_freq)
                __field(u32, max_freq)
                __field(u32, cpu_id)
        ),

        TP_fast_assign(
                __entry->min_freq = policy->min;
                __entry->max_freq = policy->max;
                __entry->cpu_id = policy->cpu;
        ),

        TP_printk("min=%lu max=%lu cpu_id=%lu",
                  (unsigned long)__entry->min_freq,
                  (unsigned long)__entry->max_freq,
                  (unsigned long)__entry->cpu_id)
);

/*
 * device_pm_callback_start: records, as strings, the device name, its driver
 * string, the parent device's name and the name of the PM operation, plus
 * the integer PM event.
 *
 * Fallbacks: a device without a parent is recorded with parent "none"; a
 * NULL @pm_ops is recorded as "none " (the trailing blank is part of the
 * literal).  The event number is decoded with pm_verb_symbolic() at print
 * time.  Output: "<driver> <device>, parent: <parent>, <pm_ops>[<event>]".
 */
TRACE_EVENT(device_pm_callback_start,

        TP_PROTO(struct device *dev, const char *pm_ops, int event),

        TP_ARGS(dev, pm_ops, event),

        TP_STRUCT__entry(
                __string(device, dev_name(dev))
                __string(driver, dev_driver_string(dev))
                __string(parent, dev->parent ? dev_name(dev->parent) : "none")
                __string(pm_ops, pm_ops ? pm_ops : "none ")
                __field(int, event)
        ),

        TP_fast_assign(
                __assign_str(device, dev_name(dev));
                __assign_str(driver, dev_driver_string(dev));
                __assign_str(parent,
                        dev->parent ? dev_name(dev->parent) : "none");
                __assign_str(pm_ops, pm_ops ? pm_ops : "none ");
                __entry->event = event;
        ),

        TP_printk("%s %s, parent: %s, %s[%s]", __get_str(driver),
                __get_str(device), __get_str(parent), __get_str(pm_ops),
                pm_verb_symbolic(__entry->event))
);

/*
 * device_pm_callback_end: records the device name, its driver string and the
 * integer @error supplied by the caller.
 * Output: "<driver> <device>, err=<error>".
 */
TRACE_EVENT(device_pm_callback_end,

        TP_PROTO(struct device *dev, int error),

        TP_ARGS(dev, error),

        TP_STRUCT__entry(
                __string(device, dev_name(dev))
                __string(driver, dev_driver_string(dev))
                __field(int, error)
        ),

        TP_fast_assign(
                __assign_str(device, dev_name(dev));
                __assign_str(driver, dev_driver_string(dev));
                __entry->error = error;
        ),

        TP_printk("%s %s, err=%d",
                __get_str(driver), __get_str(device), __entry->error)
);

/*
 * suspend_resume: records an action name, an integer value and whether the
 * action is starting or finishing.
 *
 * Unlike the __string() based events in this file, @action is stored as the
 * bare pointer (a "const char *" field) and only dereferenced when the event
 * is printed, so the string it points to has to stay valid until then.
 * NOTE(review): this is presumably what the TPS() helper defined at the top
 * of this header is for - confirm at the call sites.
 *
 * Output: "<action>[<val>] begin" or "<action>[<val>] end"; @val is printed
 * as an unsigned number.
 */
TRACE_EVENT(suspend_resume,

        TP_PROTO(const char *action, int val, bool start),

        TP_ARGS(action, val, start),

        TP_STRUCT__entry(
                __field(const char *, action)
                __field(int, val)
                __field(bool, start)
        ),

        TP_fast_assign(
                __entry->action = action;
                __entry->val = val;
                __entry->start = start;
        ),

        TP_printk("%s[%u] %s", __entry->action, (unsigned int)__entry->val,
                (__entry->start)?"begin":"end")
);

/*
 * Event class "wakeup_source": records the name of a wakeup source and a
 * state value.  The state arrives as an unsigned int but is kept in a u64
 * field; it is printed in hexadecimal after a cast to unsigned long.
 * Output: "<name> state=0x<state>".
 */
DECLARE_EVENT_CLASS(wakeup_source,

        TP_PROTO(const char *name, unsigned int state),

        TP_ARGS(name, state),

        TP_STRUCT__entry(
                __string(       name,           name            )
                __field(        u64,            state           )
        ),

        TP_fast_assign(
                __assign_str(name, name);
                __entry->state = state;
        ),

        TP_printk("%s state=0x%lx", __get_str(name),
                (unsigned long)__entry->state)
);

/* wakeup_source_activate: instance of the wakeup_source class. */
DEFINE_EVENT(wakeup_source, wakeup_source_activate,

        TP_PROTO(const char *name, unsigned int state),

        TP_ARGS(name, state)
);

/* wakeup_source_deactivate: instance of the wakeup_source class. */
DEFINE_EVENT(wakeup_source, wakeup_source_deactivate,

        TP_PROTO(const char *name, unsigned int state),

        TP_ARGS(name, state)
);

/*
 * The clock events are used for clock enable/disable and for clock rate
 * changes.
 *
 * Event class "clock": records the clock name, a state value and a CPU id.
 * Both numbers arrive as unsigned int but are kept in u64 fields and printed
 * as unsigned decimals after a cast to unsigned long.
 * Output: "<name> state=<state> cpu_id=<cpu_id>".
 */
DECLARE_EVENT_CLASS(clock,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id),

        TP_STRUCT__entry(
                __string(       name,           name            )
                __field(        u64,            state           )
                __field(        u64,            cpu_id          )
        ),

        TP_fast_assign(
                __assign_str(name, name);
                __entry->state = state;
                __entry->cpu_id = cpu_id;
        ),

        TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
                (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
);

/* clock_enable: instance of the clock class. */
DEFINE_EVENT(clock, clock_enable,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id)
);

/* clock_disable: instance of the clock class. */
DEFINE_EVENT(clock, clock_disable,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id)
);

/*
 * clock_set_rate: instance of the clock class.
 * NOTE(review): presumably @state carries the new rate here - confirm at the
 * call sites; it is still printed under the "state=" label.
 */
DEFINE_EVENT(clock, clock_set_rate,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id)
);

/*
 * The power domain events are used for power domain transitions.
 *
 * Event class "power_domain": records the domain name, a state value and a
 * CPU id.  Both numbers arrive as unsigned int but are kept in u64 fields
 * and printed as unsigned decimals after a cast to unsigned long.
 * Output: "<name> state=<state> cpu_id=<cpu_id>".
 */
DECLARE_EVENT_CLASS(power_domain,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id),

        TP_STRUCT__entry(
                __string(       name,           name            )
                __field(        u64,            state           )
                __field(        u64,            cpu_id          )
        ),

        TP_fast_assign(
                __assign_str(name, name);
                __entry->state = state;
                __entry->cpu_id = cpu_id;
        ),

        TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
                (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
);

/* power_domain_target: instance of the power_domain class. */
DEFINE_EVENT(power_domain, power_domain_target,

        TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),

        TP_ARGS(name, state, cpu_id)
);

/*
 * CPU latency QoS events used for global CPU latency QoS list updates
 *
 * Event class "cpu_latency_qos_request": records a single signed 32-bit
 * value and prints it as "CPU_DMA_LATENCY value=<value>".
 */
DECLARE_EVENT_CLASS(cpu_latency_qos_request,

        TP_PROTO(s32 value),

        TP_ARGS(value),

        TP_STRUCT__entry(
                __field( s32,                    value          )
        ),

        TP_fast_assign(
                __entry->value = value;
        ),

        TP_printk("CPU_DMA_LATENCY value=%d",
                  __entry->value)
);

/* pm_qos_add_request: instance of the cpu_latency_qos_request class. */
DEFINE_EVENT(cpu_latency_qos_request, pm_qos_add_request,

        TP_PROTO(s32 value),

        TP_ARGS(value)
);

/* pm_qos_update_request: instance of the cpu_latency_qos_request class. */
DEFINE_EVENT(cpu_latency_qos_request, pm_qos_update_request,

        TP_PROTO(s32 value),

        TP_ARGS(value)
);

/* pm_qos_remove_request: instance of the cpu_latency_qos_request class. */
DEFINE_EVENT(cpu_latency_qos_request, pm_qos_remove_request,

        TP_PROTO(s32 value),

        TP_ARGS(value)
);

/*
 * General PM QoS events used for updates of PM QoS request lists
 *
 * Event class "pm_qos_update": records the request action together with the
 * previous and the current value.  The action is decoded at print time into
 * "ADD_REQ", "UPDATE_REQ" or "REMOVE_REQ"; both values are printed as signed
 * decimals.
 */
DECLARE_EVENT_CLASS(pm_qos_update,

        TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value),

        TP_ARGS(action, prev_value, curr_value),

        TP_STRUCT__entry(
                __field( enum pm_qos_req_action, action         )
                __field( int,                    prev_value     )
                __field( int,                    curr_value     )
        ),

        TP_fast_assign(
                __entry->action = action;
                __entry->prev_value = prev_value;
                __entry->curr_value = curr_value;
        ),

        TP_printk("action=%s prev_value=%d curr_value=%d",
                  __print_symbolic(__entry->action,
                        { PM_QOS_ADD_REQ,        "ADD_REQ" },
                        { PM_QOS_UPDATE_REQ,        "UPDATE_REQ" },
                        { PM_QOS_REMOVE_REQ,        "REMOVE_REQ" }),
                  __entry->prev_value, __entry->curr_value)
);

/* pm_qos_update_target: instance of pm_qos_update using the class format. */
DEFINE_EVENT(pm_qos_update, pm_qos_update_target,

        TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value),

        TP_ARGS(action, prev_value, curr_value)
);

/*
 * pm_qos_update_flags: same record layout as the pm_qos_update class, but
 * with its own TP_printk() so that both values are shown in hexadecimal
 * ("0x%x") instead of decimal.
 */
DEFINE_EVENT_PRINT(pm_qos_update, pm_qos_update_flags,

        TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value),

        TP_ARGS(action, prev_value, curr_value),

        TP_printk("action=%s prev_value=0x%x curr_value=0x%x",
                  __print_symbolic(__entry->action,
                        { PM_QOS_ADD_REQ,        "ADD_REQ" },
                        { PM_QOS_UPDATE_REQ,        "UPDATE_REQ" },
                        { PM_QOS_REMOVE_REQ,        "REMOVE_REQ" }),
                  __entry->prev_value, __entry->curr_value)
);

/*
 * Event class "dev_pm_qos_request": records a device name, the type of the
 * device PM QoS request and the new value.
 *
 * Only two request types have a symbolic name in the output table
 * (DEV_PM_QOS_RESUME_LATENCY and DEV_PM_QOS_FLAGS).
 * NOTE(review): any other enum value would not be decoded by the table
 * below - confirm whether the enum has further members that should be added.
 *
 * Output: "device=<name> type=<type> new_value=<new_value>".
 */
DECLARE_EVENT_CLASS(dev_pm_qos_request,

        TP_PROTO(const char *name, enum dev_pm_qos_req_type type,
                 s32 new_value),

        TP_ARGS(name, type, new_value),

        TP_STRUCT__entry(
                __string( name,                    name         )
                __field( enum dev_pm_qos_req_type, type         )
                __field( s32,                      new_value    )
        ),

        TP_fast_assign(
                __assign_str(name, name);
                __entry->type = type;
                __entry->new_value = new_value;
        ),

        TP_printk("device=%s type=%s new_value=%d",
                  __get_str(name),
                  __print_symbolic(__entry->type,
                        { DEV_PM_QOS_RESUME_LATENCY, "DEV_PM_QOS_RESUME_LATENCY" },
                        { DEV_PM_QOS_FLAGS, "DEV_PM_QOS_FLAGS" }),
                  __entry->new_value)
);

/* dev_pm_qos_add_request: instance of the dev_pm_qos_request class. */
DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_add_request,

        TP_PROTO(const char *name, enum dev_pm_qos_req_type type,
                 s32 new_value),

        TP_ARGS(name, type, new_value)
);

/* dev_pm_qos_update_request: instance of the dev_pm_qos_request class. */
DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_update_request,

        TP_PROTO(const char *name, enum dev_pm_qos_req_type type,
                 s32 new_value),

        TP_ARGS(name, type, new_value)
);

/* dev_pm_qos_remove_request: instance of the dev_pm_qos_request class. */
DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_remove_request,

        TP_PROTO(const char *name, enum dev_pm_qos_req_type type,
                 s32 new_value),

        TP_ARGS(name, type, new_value)
);
#endif /* _TRACE_POWER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __INCLUDE_LINUX_OOM_H
#define __INCLUDE_LINUX_OOM_H


#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/nodemask.h>
#include <uapi/linux/oom.h>
#include <linux/sched/coredump.h> /* MMF_* */
#include <linux/mm.h> /* VM_FAULT* */

struct zonelist;
struct notifier_block;
struct mem_cgroup;
struct task_struct;

/*
 * Constraint recorded in struct oom_control (its "constraint" member).
 * The numeric values are spelled out; they match the implicit numbering.
 */
enum oom_constraint {
        CONSTRAINT_NONE                 = 0,
        CONSTRAINT_CPUSET               = 1,
        CONSTRAINT_MEMORY_POLICY        = 2,
        CONSTRAINT_MEMCG                = 3,
};

/*
 * Details of the page allocation that triggered the oom killer that are used to
 * determine what should be killed.
 *
 * The first group of members is input supplied by whoever invokes the oom
 * killer; gfp_mask and order are const, so they can only be given a value
 * when the structure is initialised.  The second group is working state
 * owned by the oom implementation.
 */
struct oom_control {
        /* Used to determine cpuset */
        struct zonelist *zonelist;

        /* Used to determine mempolicy */
        nodemask_t *nodemask;

        /* Memory cgroup in which oom is invoked, or NULL for global oom */
        struct mem_cgroup *memcg;

        /* Used to determine cpuset and node locality requirement */
        const gfp_t gfp_mask;

        /*
         * order == -1 means the oom kill is required by sysrq, otherwise only
         * for display purposes.
         */
        const int order;

        /* Used by oom implementation, do not set */
        unsigned long totalpages;
        struct task_struct *chosen;
        long chosen_points;

        /* Used to print the constraint info. */
        enum oom_constraint constraint;
};

extern struct mutex oom_lock;
extern struct mutex oom_adj_mutex;

/*
 * Set the oom_flag_origin flag in the signal struct of the calling task
 * ("current").  The flag can be read back for any task with
 * oom_task_origin().
 * NOTE(review): how the oom killer treats flagged tasks is decided outside
 * this header - confirm before relying on it.
 */
static inline void set_current_oom_origin(void)
{
        current->signal->oom_flag_origin = true;
}

/*
 * Clear the oom_flag_origin flag in the signal struct of the calling task
 * ("current"); counterpart of set_current_oom_origin().
 */
static inline void clear_current_oom_origin(void)
{
        current->signal->oom_flag_origin = false;
}

/*
 * Report whether the oom_flag_origin flag is set in the signal struct
 * of task @p.
 */
static inline bool oom_task_origin(const struct task_struct *p)
{
        if (p->signal->oom_flag_origin)
                return true;

        return false;
}

/*
 * A task counts as an oom victim when the oom_mm pointer in its signal
 * struct is non-NULL.
 */
static inline bool tsk_is_oom_victim(struct task_struct *tsk)
{
        return tsk->signal->oom_mm != NULL;
}

/*
 * Tell whether @mm carries the MMF_OOM_VICTIM flag.
 *
 * Meant for the case where tsk->mm != mm and the victim mm needs special
 * handling.  Once the flag has been set it is guaranteed to stay set.
 */
static inline bool mm_is_oom_victim(struct mm_struct *mm)
{
        if (test_bit(MMF_OOM_VICTIM, &mm->flags))
                return true;

        return false;
}

/*
 * Tell whether a page fault on @mm can still be trusted.
 *
 * Once the oom reaper has started to reap the address space - which is
 * signalled by MMF_UNSTABLE being set in mm->flags - any !shared mapping
 * may have lost its content, and faulting it back in would hand out zero
 * pages instead of the original data (memory corruption).
 *
 * Call this before establishing a page table entry for a !shared mapping,
 * with the proper page table lock held.
 *
 * Returns 0 when the fault is safe, VM_FAULT_SIGBUS otherwise.
 */
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
{
        vm_fault_t ret = 0;

        if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
                ret = VM_FAULT_SIGBUS;

        return ret;
}

bool __oom_reap_task_mm(struct mm_struct *mm);

long oom_badness(struct task_struct *p,
                unsigned long totalpages);

extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);

extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
#endif /* __INCLUDE_LINUX_OOM_H */











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for non-atomic
 * bit operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

#include <linux/instrumented.h>

/**
 * __set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of set_bit(): when several callers operate on the
 * same region of memory concurrently, it may happen that only one of the
 * operations takes effect.  The word containing bit @nr is reported to the
 * instrumentation as written before the arch implementation runs.
 */
static inline void __set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___set_bit(nr, addr);
}

/**
 * __clear_bit - Clear a bit in memory
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of clear_bit(): when several callers operate on the
 * same region of memory concurrently, it may happen that only one of the
 * operations takes effect.  The word containing bit @nr is reported to the
 * instrumentation as written before the arch implementation runs.
 */
static inline void __clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___clear_bit(nr, addr);
}

/**
 * __change_bit - Toggle a bit in memory
 * @nr: the bit to change
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of change_bit(): when several callers operate on
 * the same region of memory concurrently, it may happen that only one of the
 * operations takes effect.  The word containing bit @nr is reported to the
 * instrumentation as written before the arch implementation runs.
 */
static inline void __change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___change_bit(nr, addr);
}

/*
 * Instrumentation shared by the non-atomic test-and-modify bitops below:
 * reports the access to the word that contains bit @nr of @addr.
 */
static inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        /* Default case: report one combined read-write access. */
        if (!IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) {
                instrument_read_write(word, sizeof(long));
                return;
        }

        /*
         * Non-atomic read-write bitops get slightly special treatment.
         * They only modify a single bit, so for certain usage it may be
         * reasonable to assume the write itself is atomic (in line with
         * the permissive assume-plain-writes-atomic rule):
         * 1. read-modify-write races are still reported -> check the read;
         * 2. races with marked readers are not reported, races with
         *    unmarked readers are -> check an "atomic" write.
         */
        kcsan_check_read(word, sizeof(long));

        /*
         * The write goes through the generic instrumentation so that other
         * sanitizers or tools enabled alongside KCSAN see it as well.
         */
        instrument_write(word, sizeof(long));
}

/**
 * __test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * Not atomic: when two instances of this operation race, one of them can
 * appear to succeed although it actually failed.
 */
static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
{
        bool old;

        __instrument_read_write_bitop(nr, addr);
        old = arch___test_and_set_bit(nr, addr);

        return old;
}

/**
 * __test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * Not atomic: when two instances of this operation race, one of them can
 * appear to succeed although it actually failed.
 */
static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        bool old;

        __instrument_read_write_bitop(nr, addr);
        old = arch___test_and_clear_bit(nr, addr);

        return old;
}

/**
 * __test_and_change_bit - Change a bit and return its old value
 * @nr: Bit to change
 * @addr: Address to count from
 *
 * Not atomic: when two instances of this operation race, one of them can
 * appear to succeed although it actually failed.
 */
static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
{
        bool old;

        __instrument_read_write_bitop(nr, addr);
        old = arch___test_and_change_bit(nr, addr);

        return old;
}

/**
 * test_bit - Determine whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * The word containing bit @nr is reported to the instrumentation as an
 * atomic read before the arch implementation is consulted.
 */
static inline bool test_bit(long nr, const volatile unsigned long *addr)
{
        const volatile unsigned long *word = addr + BIT_WORD(nr);
        bool is_set;

        instrument_atomic_read(word, sizeof(long));
        is_set = arch_test_bit(nr, addr);

        return is_set;
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IPV6_FRAG_H
#define _IPV6_FRAG_H
#include <linux/kernel.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <net/inet_frag.h>

/*
 * Identifiers for the users of IPv6 defragmentation.
 *
 * Apart from local delivery, every user owns a block of USHRT_MAX + 1
 * consecutive ids: it starts at the plain name and ends, inclusive, at the
 * name with the leading double underscore.  The next user starts right
 * after that block.
 */
enum ip6_defrag_users {
        IP6_DEFRAG_LOCAL_DELIVER = 0,

        IP6_DEFRAG_CONNTRACK_IN,
        __IP6_DEFRAG_CONNTRACK_IN =
                IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX,

        IP6_DEFRAG_CONNTRACK_OUT,
        __IP6_DEFRAG_CONNTRACK_OUT =
                IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX,

        IP6_DEFRAG_CONNTRACK_BRIDGE_IN,
        __IP6_DEFRAG_CONNTRACK_BRIDGE_IN =
                IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
};

/*
 *        Equivalent of ipv4 struct ip
 *
 *        IPv6 fragment queue: the generic inet_frag_queue plus IPv6 specific
 *        state.  The generic part must be reachable through the member named
 *        "q", because ip6frag_init() converts from it with container_of().
 */
struct frag_queue {
        struct inet_frag_queue        q;

        /* interface index; resolved with dev_get_by_index_rcu() on expiry */
        int                        iif;
        /* NOTE(review): not used in this header - presumably the offset of the next-header field; confirm */
        __u16                        nhoffset;
        /* reset to 0 by ip6frag_init() */
        u8                        ecn;
};

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Initialise a freshly created IPv6 fragment queue.
 * @q: generic queue embedded in a struct frag_queue
 * @a: lookup key, a struct frag_v6_compare_key
 *
 * Copies the key into the queue and resets the ECN state of the
 * enclosing frag_queue.
 */
static inline void ip6frag_init(struct inet_frag_queue *q, const void *a)
{
        const struct frag_v6_compare_key *key = a;
        struct frag_queue *fq;

        fq = container_of(q, struct frag_queue, q);

        q->key.v6 = *key;
        fq->ecn = 0;
}

/*
 * Hash a bare lookup key.
 * @data: the key, a struct frag_v6_compare_key
 * @len: ignored, the key size is fixed
 * @seed: initial value for the hash
 *
 * The key is hashed with jhash2() as an array of 32-bit words.
 */
static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed)
{
        const u32 nwords = sizeof(struct frag_v6_compare_key) / sizeof(u32);

        return jhash2(data, nwords, seed);
}

/*
 * Hash a stored object.
 * @data: the object, a struct inet_frag_queue
 * @len: ignored, the key size is fixed
 * @seed: initial value for the hash
 *
 * Hashes the IPv6 key embedded in the queue in the same way as
 * ip6frag_key_hashfn() hashes a bare key.
 */
static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed)
{
        const struct inet_frag_queue *fq = data;
        const u32 *words = (const u32 *)&fq->key.v6;
        const u32 nwords = sizeof(struct frag_v6_compare_key) / sizeof(u32);

        return jhash2(words, nwords, seed);
}

/*
 * Compare the key of a stored queue with a lookup key.
 * @arg: compare argument, arg->key is a struct frag_v6_compare_key
 * @ptr: the stored object, a struct inet_frag_queue
 *
 * Returns 0 when the keys are byte-wise equal and 1 when they differ.
 */
static inline int
ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
        const struct inet_frag_queue *fq = ptr;
        const struct frag_v6_compare_key *key = arg->key;

        return memcmp(&fq->key, key, sizeof(*key)) != 0;
}

/*
 * Handle the expiry of fragment queue @fq in namespace @net.
 *
 * Nothing is done if the owning fqdir is already marked dead or the queue
 * has already completed.  Otherwise the queue is killed, the REASMFAILS
 * and REASMTIMEOUT counters of the receiving device are incremented and -
 * only if the first fragment had arrived - an ICMPv6 "time exceeded,
 * fragment reassembly" error is sent for the head skb, which is freed
 * afterwards.
 *
 * The whole function runs inside an RCU read-side section.  fq->q.lock is
 * taken before the COMPLETE check and released on every path; on the ICMP
 * path it is dropped before icmpv6_send() is called.  Every path ends with
 * inet_frag_put(&fq->q).
 */
static inline void
ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
{
        struct net_device *dev = NULL;
        struct sk_buff *head;

        rcu_read_lock();
        /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
        if (READ_ONCE(fq->q.fqdir->dead))
                goto out_rcu_unlock;
        spin_lock(&fq->q.lock);

        /* Already reassembled or killed: nothing left to expire. */
        if (fq->q.flags & INET_FRAG_COMPLETE)
                goto out;

        inet_frag_kill(&fq->q);

        /* The device may be gone by now; then neither stats nor ICMP. */
        dev = dev_get_by_index_rcu(net, fq->iif);
        if (!dev)
                goto out;

        __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
        __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);

        /* Don't send error if the first segment did not arrive. */
        if (!(fq->q.flags & INET_FRAG_FIRST_IN))
                goto out;

        /* sk_buff::dev and sk_buff::rbnode are unionized. So we
         * pull the head out of the tree in order to be able to
         * deal with head->dev.
         */
        head = inet_frag_pull_head(&fq->q);
        if (!head)
                goto out;

        head->dev = dev;
        /* Drop the queue lock before calling into the ICMPv6 code. */
        spin_unlock(&fq->q.lock);

        icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
        kfree_skb(head);
        /* The lock is already released, so skip the unlock at "out". */
        goto out_rcu_unlock;

out:
        spin_unlock(&fq->q.lock);
out_rcu_unlock:
        rcu_read_unlock();
        inet_frag_put(&fq->q);
}

/*
 * Check if the upper layer header is truncated in the first fragment.
 * @skb: packet to inspect
 * @start: offset at which to start skipping extension headers
 * @nexthdrp: points to the type of the header found at @start
 *
 * Skips the extension headers and then requires a minimum amount of
 * upper-layer data: a full TCP, UDP or ICMPv6 header for those protocols,
 * and a single byte for any other protocol.  Returns true when the skb is
 * too short for that.  Returns false when the headers cannot be skipped
 * or when this is not a first fragment (non-zero fragment offset).
 */
static inline bool
ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp)
{
        u8 nexthdr = *nexthdrp;
        __be16 frag_off;
        int offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off);

        if (offset < 0)
                return false;
        if (frag_off & htons(IP6_OFFSET))
                return false;

        switch (nexthdr) {
        case NEXTHDR_TCP:
                offset += sizeof(struct tcphdr);
                break;
        case NEXTHDR_UDP:
                offset += sizeof(struct udphdr);
                break;
        case NEXTHDR_ICMP:
                offset += sizeof(struct icmp6hdr);
                break;
        default:
                offset += 1;
                break;
        }

        return offset > skb->len;
}

#endif
#endif



































































































































































    1 















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
 *  cpuset interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/jump_label.h>

#ifdef CONFIG_CPUSETS

/*
 * Static branch rewrites can happen in an arbitrary order for a given
 * key. In code paths where we need to loop with read_mems_allowed_begin() and
 * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
 * to ensure that begin() always gets rewritten before retry() in the
 * disabled -> enabled transition. If not, then if local irqs are disabled
 * around the loop, we can deadlock since retry() would always be
 * comparing the latest value of the mems_allowed seqcount against 0 as
 * begin() still would see cpusets_enabled() as false. The enabled -> disabled
 * transition should happen in reverse order for the same reasons (want to stop
 * looking at real value of mems_allowed.sequence in retry() first).
 */
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
extern struct static_key_false cpusets_insane_config_key;

/*
 * Report whether the cpusets_enabled_key static branch is currently
 * enabled.  The disabled case is treated as the likely one.
 */
static inline bool cpusets_enabled(void)
{
        if (static_branch_unlikely(&cpusets_enabled_key))
                return true;

        return false;
}

/*
 * Take one reference on both cpuset static keys.  The order is significant:
 * the pre-enable key has to be incremented before the enabled key, see the
 * comment above the key declarations.
 */
static inline void cpuset_inc(void)
{
        static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
        static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

/*
 * Drop one reference on both cpuset static keys, in the reverse order of
 * cpuset_inc(): the enabled key first, the pre-enable key second (see the
 * comment above the key declarations).
 */
static inline void cpuset_dec(void)
{
        static_branch_dec_cpuslocked(&cpusets_enabled_key);
        static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

/*
 * Report whether a cpuset configuration that is considered unsupportable
 * in general is in effect - for example a movable-only node, which cannot
 * satisfy any non-movable allocation (see update_nodemask).  The page
 * allocator has to make additional checks for such configurations; this
 * test guards those checks so that sane configurations pay no overhead.
 */
static inline bool cpusets_insane_config(void)
{
        if (static_branch_unlikely(&cpusets_insane_config_key))
                return true;

        return false;
}

extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_wait_for_hotplug(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);

/*
 * cpuset_node_allowed - may an allocation with @gfp_mask use @node?
 *
 * While cpusets are disabled every node is allowed; otherwise the decision
 * is delegated to __cpuset_node_allowed().
 */
static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        if (!cpusets_enabled())
                return true;

        return __cpuset_node_allowed(node, gfp_mask);
}

/*
 * __cpuset_zone_allowed - unconditional cpuset check for a zone.
 *
 * Maps @z to its node id and asks __cpuset_node_allowed() about that node.
 * Unlike cpuset_zone_allowed() this does not test cpusets_enabled() first.
 */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        int nid = zone_to_nid(z);

        return __cpuset_node_allowed(nid, gfp_mask);
}

/*
 * cpuset_zone_allowed - may an allocation with @gfp_mask use zone @z?
 *
 * True when cpusets are disabled; otherwise whatever
 * __cpuset_zone_allowed() decides.
 */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        /* short-circuit keeps the cpuset lookup off the disabled path */
        return !cpusets_enabled() || __cpuset_zone_allowed(z, gfp_mask);
}

extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                          const struct task_struct *tsk2);

/*
 * cpuset_memory_pressure_bump - conditionally record memory pressure.
 *
 * Calls __cpuset_memory_pressure_bump() only when the global
 * cpuset_memory_pressure_enabled flag is non-zero.  Wrapped in
 * do { } while (0) so it behaves as a single statement.  The flag and the
 * helper are declared right below; that is fine because the macro body is
 * only expanded at its point of use.
 */
#define cpuset_memory_pressure_bump()                                 \
        do {                                                        \
                if (cpuset_memory_pressure_enabled)                \
                        __cpuset_memory_pressure_bump();        \
        } while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);

extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *tsk);

extern int cpuset_mem_spread_node(void);
extern int cpuset_slab_spread_node(void);

/*
 * cpuset_do_page_mem_spread - returns task_spread_page() for the current
 * task.
 *
 * NOTE(review): presumably non-zero when page allocations of the current
 * task should be spread over its allowed nodes - confirm against the
 * definition of task_spread_page().
 */
static inline int cpuset_do_page_mem_spread(void)
{
        return task_spread_page(current);
}

/*
 * cpuset_do_slab_mem_spread - returns task_spread_slab() for the current
 * task.
 *
 * NOTE(review): presumably non-zero when slab allocations of the current
 * task should be spread over its allowed nodes - confirm against the
 * definition of task_spread_slab().
 */
static inline int cpuset_do_slab_mem_spread(void)
{
        return task_spread_slab(current);
}

extern bool current_cpuset_is_being_rebound(void);

extern void rebuild_sched_domains(void);

extern void cpuset_print_current_mems_allowed(void);

/*
 * Decisions that depend on mems_allowed (page allocation, for example) must
 * open a read section with read_mems_allowed_begin.  mems_allowed may be
 * rewritten concurrently, and depending on the new value an operation can
 * fail and take the process down with it.  Wrapping the operation in a
 * read_mems_allowed_begin / read_mems_allowed_retry loop avoids such
 * artificial failures.
 *
 * Returns the sampled sequence count of current->mems_allowed_seq, or 0
 * while cpusets_pre_enable_key is off.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
        if (static_branch_unlikely(&cpusets_pre_enable_key))
                return read_seqcount_begin(&current->mems_allowed_seq);

        return 0;
}

/*
 * A true return value means that whatever ran after
 * read_mems_allowed_begin may have failed artificially because mems_allowed
 * was updated concurrently.  Whether to retry is the caller's decision.
 *
 * @seq is the value handed out by read_mems_allowed_begin().  Always
 * returns false while cpusets_enabled_key is off.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        if (static_branch_unlikely(&cpusets_enabled_key))
                return read_seqcount_retry(&current->mems_allowed_seq, seq);

        return false;
}

/*
 * set_mems_allowed - replace the current task's mems_allowed mask.
 * @nodemask: new set of allowed memory nodes (copied by value)
 *
 * The store is bracketed by write_seqcount_begin()/write_seqcount_end() on
 * current->mems_allowed_seq, which is the same seqcount that
 * read_mems_allowed_begin()/read_mems_allowed_retry() sample, so readers
 * can detect the update.  The whole write section runs with
 * task_lock(current) held and local interrupts disabled; everything is
 * released in reverse order of acquisition.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
        unsigned long flags;

        task_lock(current);
        local_irq_save(flags);
        write_seqcount_begin(&current->mems_allowed_seq);
        current->mems_allowed = nodemask;
        write_seqcount_end(&current->mems_allowed_seq);
        local_irq_restore(flags);
        task_unlock(current);
}

#else /* !CONFIG_CPUSETS */

/*
 * !CONFIG_CPUSETS stubs: state queries report "disabled", init succeeds
 * trivially, and the rebuild/hotplug/deadline/lock hooks are no-ops.
 */

/* Cpusets are compiled out, so they are never enabled. */
static inline bool cpusets_enabled(void) { return false; }

/* No cpuset configuration exists, hence never an insane one. */
static inline bool cpusets_insane_config(void) { return false; }

/* Nothing to initialise; cpuset_init() reports success (0). */
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}

static inline void cpuset_force_rebuild(void) { }

/*
 * Without cpusets this only calls partition_sched_domains(1, NULL, NULL).
 * NOTE(review): presumably this resets to a single default sched domain -
 * confirm against partition_sched_domains().
 */
static inline void cpuset_update_active_cpus(void)
{
        partition_sched_domains(1, NULL, NULL);
}

static inline void cpuset_wait_for_hotplug(void) { }

/* Deadline-task accounting and the cpuset lock are no-ops here. */
static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
static inline void cpuset_unlock(void) { }

/*
 * !CONFIG_CPUSETS: every possible CPU is allowed; @mask receives a copy of
 * cpu_possible_mask and @p is not consulted.
 */
static inline void cpuset_cpus_allowed(struct task_struct *p,
                                       struct cpumask *mask)
{
        cpumask_copy(mask, cpu_possible_mask);
}

/* !CONFIG_CPUSETS: no fallback handling is performed. */
static inline void cpuset_cpus_allowed_fallback(struct task_struct *p)
{
}

/* !CONFIG_CPUSETS: every task may use all possible nodes. */
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
        return node_possible_map;
}

/* !CONFIG_CPUSETS: the "current" mask is the set of nodes in N_MEMORY. */
#define cpuset_current_mems_allowed (node_states[N_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}

/* !CONFIG_CPUSETS: any nodemask is reported as valid (returns 1). */
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return 1;
}

/*
 * !CONFIG_CPUSETS: node and zone checks always succeed; the arguments are
 * ignored.
 */
static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        return true;
}

static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* !CONFIG_CPUSETS: any two tasks are reported as intersecting (returns 1). */
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                                 const struct task_struct *tsk2)
{
        return 1;
}

/* !CONFIG_CPUSETS: memory pressure is not tracked. */
static inline void cpuset_memory_pressure_bump(void) {}

/* !CONFIG_CPUSETS: nothing is written to @m. */
static inline void cpuset_task_status_allowed(struct seq_file *m,
                                                struct task_struct *task)
{
}

/*
 * !CONFIG_CPUSETS: memory spreading is unavailable; the spread-node
 * helpers return node 0 and the "do spread" predicates return 0.
 */
static inline int cpuset_mem_spread_node(void)
{
        return 0;
}

static inline int cpuset_slab_spread_node(void)
{
        return 0;
}

static inline int cpuset_do_page_mem_spread(void)
{
        return 0;
}

static inline int cpuset_do_slab_mem_spread(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: there is no cpuset, so it is never being rebound. */
static inline bool current_cpuset_is_being_rebound(void)
{
        return false;
}

/*
 * !CONFIG_CPUSETS: only calls partition_sched_domains(1, NULL, NULL), the
 * same call the stub cpuset_update_active_cpus() makes.
 */
static inline void rebuild_sched_domains(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* !CONFIG_CPUSETS: prints nothing. */
static inline void cpuset_print_current_mems_allowed(void)
{
}

/* !CONFIG_CPUSETS: @nodemask is ignored; nothing is stored. */
static inline void set_mems_allowed(nodemask_t nodemask)
{
}

/*
 * !CONFIG_CPUSETS: the begin/retry pair degenerates to "sequence 0, never
 * retry", so caller retry loops run exactly once.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
        return 0;
}

static inline bool read_mems_allowed_retry(unsigned int seq)
{
        return false;
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2008 Intel Corporation
 * Author: Matthew Wilcox <willy@linux.intel.com>
 *
 * Please see kernel/locking/semaphore.c for documentation of these functions
 */
#ifndef __LINUX_SEMAPHORE_H
#define __LINUX_SEMAPHORE_H

#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Counting semaphore.
 *
 * Please don't access any members of this structure directly; use
 * sema_init()/DEFINE_SEMAPHORE() to initialise it and the down*()/up()
 * functions declared below to operate on it.
 */
struct semaphore {
        /* NOTE(review): presumably protects count and wait_list - see semaphore.c */
        raw_spinlock_t                lock;
        /* initialised to the "n"/"val" argument of the initialisers below */
        unsigned int                count;
        /* list head, initialised empty; presumably holds waiters - confirm */
        struct list_head        wait_list;
};

/*
 * __SEMAPHORE_INITIALIZER - brace initializer for a struct semaphore.
 * @name: lvalue naming the semaphore being initialised (used to build the
 *        lock and list-head initialisers, which refer to their own address)
 * @n:    initial value of the count
 *
 * Produces an unlocked raw spinlock, count == @n and an empty wait list.
 */
#define __SEMAPHORE_INITIALIZER(name, n)                                \
{                                                                        \
        .lock                = __RAW_SPIN_LOCK_UNLOCKED((name).lock),        \
        .count                = n,                                                \
        .wait_list        = LIST_HEAD_INIT((name).wait_list),                \
}

/*
 * DEFINE_SEMAPHORE - define a struct semaphore called @name with an
 * initial count of 1.
 */
#define DEFINE_SEMAPHORE(name)        \
        struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)

/*
 * sema_init - initialise a semaphore at run time.
 * @sem: semaphore to initialise
 * @val: initial count (stored into the unsigned count field)
 *
 * Overwrites *@sem with a fresh initializer and then registers the lock
 * with lockdep under the name "semaphore->lock".  The lock class key is a
 * function-local static; since this is an inline function, each expansion
 * gets its own key object.
 */
static inline void sema_init(struct semaphore *sem, int val)
{
        static struct lock_class_key __key;
        *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
        lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
}

extern void down(struct semaphore *sem);
extern int __must_check down_interruptible(struct semaphore *sem);
extern int __must_check down_killable(struct semaphore *sem);
extern int __must_check down_trylock(struct semaphore *sem);
extern int __must_check down_timeout(struct semaphore *sem, long jiffies);
extern void up(struct semaphore *sem);

#endif /* __LINUX_SEMAPHORE_H */























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the UDP module.
 *
 * Version:        @(#)udp.h        1.0.2        05/07/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *                Alan Cox        : Turned on udp checksums. I don't want to
 *                                  chase 'memory corruption' bugs that aren't!
 */
#ifndef _UDP_H
#define _UDP_H

#include <linux/list.h>
#include <linux/bug.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/indirect_call_wrapper.h>

/**
 *        struct udp_skb_cb  -  UDP(-Lite) private variables
 *
 *        @header:      private variables used by IPv4/IPv6 (union of the
 *                      inet and, when CONFIG_IPV6 is enabled, inet6 skb
 *                      parameter blocks)
 *        @cscov:       checksum coverage length (UDP-Lite only)
 *        @partial_cov: if set indicates partial csum coverage
 */
struct udp_skb_cb {
        union {
                struct inet_skb_parm        h4;
#if IS_ENABLED(CONFIG_IPV6)
                struct inet6_skb_parm        h6;
#endif
        } header;
        __u16                cscov;
        __u8                partial_cov;
};
/* View the skb control buffer (skb->cb) as a struct udp_skb_cb. */
#define UDP_SKB_CB(__skb)        ((struct udp_skb_cb *)((__skb)->cb))

/**
 *        struct udp_hslot - UDP hash slot
 *
 *        @head:        head of list of sockets
 *        @count:        number of sockets in 'head' list
 *        @lock:        spinlock protecting changes to head/count
 *
 *        The structure is aligned to 2 * sizeof(long).
 */
struct udp_hslot {
        struct hlist_head        head;
        int                        count;
        spinlock_t                lock;
} __attribute__((aligned(2 * sizeof(long))));

/**
 *        struct udp_table - UDP table
 *
 *        @hash:        hash table, sockets are hashed on (local port)
 *        @hash2:        hash table, sockets are hashed on (local port, local address)
 *        @mask:        number of slots in hash tables, minus 1; used as the
 *                index mask by udp_hashslot() and udp_hashslot2()
 *        @log:        log2(number of slots in hash table)
 */
struct udp_table {
        struct udp_hslot        *hash;
        struct udp_hslot        *hash2;
        unsigned int                mask;
        unsigned int                log;
};
extern struct udp_table udp_table;
void udp_table_init(struct udp_table *, const char *);
/*
 * Locate the primary-hash slot for local port @num in namespace @net.
 * The slot index comes from udp_hashfn(), bounded by the table mask.
 */
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
                                             struct net *net, unsigned int num)
{
        return table->hash + udp_hashfn(net, num, table->mask);
}
/*
 * Secondary-hash slot lookup.  Unlike udp_hashslot(), the caller has
 * already run net_hash_mix() into @hash before calling udp_hashslot2(),
 * so only the table mask is applied here.
 */
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
                                              unsigned int hash)
{
        return table->hash2 + (hash & table->mask);
}

extern struct proto udp_prot;

extern atomic_long_t udp_memory_allocated;

/* sysctl variables for udp */
extern long sysctl_udp_mem[3];
extern int sysctl_udp_rmem_min;
extern int sysctl_udp_wmem_min;

struct sk_buff;

/*
 *        Generic checksumming routines for UDP(-Lite) v4 and v6
 */
/*
 * Finish checksum verification of @skb.  When the cached coverage length
 * equals the packet length the whole packet is checked; otherwise only the
 * first cscov bytes are.
 */
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
{
        if (UDP_SKB_CB(skb)->cscov == skb->len)
                return __skb_checksum_complete(skb);

        return __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov);
}

/*
 * Returns 0 when no checksum work is needed or when
 * __udp_lib_checksum_complete() yields zero, and 1 when it yields a
 * non-zero value.
 */
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        /* normalise the folded checksum to a 0/1 truth value */
        return __udp_lib_checksum_complete(skb) != 0;
}

/**
 *         udp_csum_outgoing  -  compute UDPv4/v6 checksum over fragments
 *         @sk:         socket we are writing to
 *         @skb:         sk_buff containing the filled-in UDP header
 *                 (checksum field must be zeroed out)
 *
 *         Starts from the checksum of the UDP header of @skb and adds the
 *         ->csum of every buffer on @sk's write queue.  Returns the
 *         accumulated (unfolded) sum.
 */
static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb)
{
        __wsum csum = csum_partial(skb_transport_header(skb),
                                   sizeof(struct udphdr), 0);
        /* the skb parameter is reused as the queue-walk cursor from here on */
        skb_queue_walk(&sk->sk_write_queue, skb) {
                csum = csum_add(csum, skb->csum);
        }
        return csum;
}

/*
 * Checksum of a UDP packet: the UDP header summed on top of skb->csum,
 * plus the ->csum of every buffer chained on the frag_list.  Returns the
 * accumulated (unfolded) sum.
 */
static inline __wsum udp_csum(struct sk_buff *skb)
{
        struct sk_buff *frag;
        __wsum sum;

        sum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr),
                           skb->csum);

        frag = skb_shinfo(skb)->frag_list;
        while (frag) {
                sum = csum_add(sum, frag->csum);
                frag = frag->next;
        }

        return sum;
}

/*
 * udp_v4_check - fold a partial sum into the final UDPv4 checksum.
 * @len:   length passed to csum_tcpudp_magic() for the pseudo-header
 * @saddr: IPv4 source address
 * @daddr: IPv4 destination address
 * @base:  partial checksum to start from
 *
 * Thin wrapper around csum_tcpudp_magic() with the protocol fixed to
 * IPPROTO_UDP.
 */
static inline __sum16 udp_v4_check(int len, __be32 saddr,
                                   __be32 daddr, __wsum base)
{
        return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp_set_csum(bool nocheck, struct sk_buff *skb,
                  __be32 saddr, __be32 daddr, int len);

/*
 * udp_csum_pull_header - strip the UDP header while keeping the checksum
 * state of @skb consistent.
 *
 * If the checksum has not been validated (!csum_valid) and ip_summed is
 * CHECKSUM_NONE, the header bytes are first folded into skb->csum.  The
 * header is then removed with skb_pull_rcsum() and the cached coverage
 * length (cscov) is reduced by the header size.
 */
static inline void udp_csum_pull_header(struct sk_buff *skb)
{
        if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE)
                skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
                                         skb->csum);
        skb_pull_rcsum(skb, sizeof(struct udphdr));
        UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}

typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
                                     __be16 dport);

INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
                                                           struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
                                                           struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
                                struct udphdr *uh, struct sock *sk);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
void udp_v6_early_demux(struct sk_buff *skb);

struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
                                  netdev_features_t features, bool is_ipv6);

/*
 * udp_gro_udphdr - get the UDP header at the current GRO offset of @skb.
 *
 * Uses the fast header accessor first; if skb_gro_header_hard() reports
 * that @hlen bytes are not directly available, the slow accessor is used
 * instead.
 *
 * NOTE(review): the slow path presumably may return NULL - callers should
 * check the result; confirm against skb_gro_header_slow().
 */
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
        struct udphdr *uh;
        unsigned int hlen, off;

        off  = skb_gro_offset(skb);
        /* offset one past the end of the UDP header */
        hlen = off + sizeof(*uh);
        uh   = skb_gro_header_fast(skb, off);
        if (skb_gro_header_hard(skb, hlen))
                uh = skb_gro_header_slow(skb, hlen, off);

        return uh;
}

/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */

/*
 * udp_lib_hash - calling this hits BUG().  The trailing "return 0" only
 * exists to give the int-returning function a value.
 */
static inline int udp_lib_hash(struct sock *sk)
{
        BUG();
        return 0;
}

void udp_lib_unhash(struct sock *sk);
void udp_lib_rehash(struct sock *sk, u16 new_hash);

/*
 * udp_lib_close - close handler shared by the UDP variants.
 * @sk:      socket being closed
 * @timeout: unused here
 *
 * Simply hands the socket to sk_common_release().
 */
static inline void udp_lib_close(struct sock *sk, long timeout)
{
        sk_common_release(sk);
}

int udp_lib_get_port(struct sock *sk, unsigned short snum,
                     unsigned int hash2_nulladdr);

u32 udp_flow_hashrnd(void);

/*
 * udp_flow_src_port - derive a UDP source port from the flow hash of @skb.
 * @net:     network namespace (used for the default local port range)
 * @skb:     packet whose flow hash selects the port
 * @min:     lowest port of the range
 * @max:     upper bound of the range; if @min >= @max the range returned by
 *           inet_get_local_port_range() is used instead
 * @use_eth: when no flow hash is available, hash the first 2 * ETH_ALEN
 *           bytes at skb->data (seeded with skb->protocol) instead
 *
 * Returns a port in network byte order.  The 32-bit hash is scaled onto the
 * range by a 64-bit multiply and shift, so the value is at least @min and
 * below @max.
 */
static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
                                       int min, int max, bool use_eth)
{
        u32 hash;

        if (min >= max) {
                /* Use default range */
                inet_get_local_port_range(net, &min, &max);
        }

        hash = skb_get_hash(skb);
        if (unlikely(!hash)) {
                if (use_eth) {
                        /* Can't find a normal hash, caller has indicated an
                         * Ethernet packet so use that to compute a hash.
                         */
                        hash = jhash(skb->data, 2 * ETH_ALEN,
                                     (__force u32) skb->protocol);
                } else {
                        /* Can't derive any sort of hash for the packet, set
                         * to some consistent random value.
                         */
                        hash = udp_flow_hashrnd();
                }
        }

        /* Since this is being sent on the wire, obfuscate the hash a bit
         * to minimize the possibility that any information useful to an
         * attacker is leaked. Only the upper 16 bits are relevant in the
         * computation for a 16-bit port value.
         */
        hash ^= hash << 16;

        return htons((((u64) hash * (max - min)) >> 32) + min);
}

/*
 * udp_rqueue_get - receive memory allocated to @sk minus the socket's
 * forward_deficit.
 *
 * forward_deficit is read with READ_ONCE(), i.e. it may be updated
 * concurrently.
 */
static inline int udp_rqueue_get(struct sock *sk)
{
        return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
}

/*
 * udp_sk_bound_dev_eq - device-binding match for a UDP socket lookup.
 * @net:          network namespace (supplies the udp_l3mdev_accept sysctl)
 * @bound_dev_if: interface the socket is bound to
 * @dif:          incoming interface index
 * @sdif:         secondary interface index passed through to
 *                inet_bound_dev_eq()
 *
 * Wraps inet_bound_dev_eq().  With CONFIG_NET_L3_MASTER_DEV its first
 * argument follows the per-netns sysctl_udp_l3mdev_accept value; without it
 * the argument is always true.
 */
static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
                                       int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept),
                                 bound_dev_if, dif, sdif);
#else
        return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
#endif
}

/* net/ipv4/udp.c */
void udp_destruct_common(struct sock *sk);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
                               int noblock, int *off, int *err);
/*
 * skb_recv_udp - convenience wrapper around __skb_recv_udp().
 *
 * Passes a local offset initialised to 0; whatever __skb_recv_udp() stores
 * into that offset is discarded.  All other arguments and the return value
 * are forwarded unchanged.
 */
static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
                                           int noblock, int *err)
{
        int off = 0;

        return __skb_recv_udp(sk, flags, noblock, &off, err);
}

int udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_get_port(struct sock *sk, unsigned short snum,
                 int (*saddr_cmp)(const struct sock *,
                                  const struct sock *));
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
                                       netdev_features_t features,
                                       bool is_ipv6);
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
                       char __user *optval, int __user *optlen);
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
                       sockptr_t optval, unsigned int optlen,
                       int (*push_pending_frames)(struct sock *));
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                             __be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                               __be32 daddr, __be16 dport, int dif, int sdif,
                               struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
                                 __be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(struct net *net,
                             const struct in6_addr *saddr, __be16 sport,
                             const struct in6_addr *daddr, __be16 dport,
                             int dif);
struct sock *__udp6_lib_lookup(struct net *net,
                               const struct in6_addr *saddr, __be16 sport,
                               const struct in6_addr *daddr, __be16 dport,
                               int dif, int sdif, struct udp_table *tbl,
                               struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
                                 __be16 sport, __be16 dport);

/* UDP uses skb->dev_scratch to cache as much information as possible and
 * avoid possibly multiple cache misses on dequeue().
 */
struct udp_dev_scratch {
        /* skb->truesize and the stateless bit are embedded in a single field;
         * do not use a bitfield since the compiler emits better/smaller code
         * this way
         */
        u32 _tsize_state;

#if BITS_PER_LONG == 64
        /* len and the bit needed to compute skb_csum_unnecessary
         * will be on cold cache lines at recvmsg time.
         * skb->len can be stored in 16 bits since the UDP header has
         * already been validated and pulled.
         * These fields only exist on 64-bit builds; see the udp_skb_*()
         * accessors for the fallback used otherwise.
         */
        u16 len;
        bool is_linear;
        bool csum_unnecessary;
#endif
};

/* View the dev_scratch field of @skb as a struct udp_dev_scratch. */
static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
{
        return (struct udp_dev_scratch *)&skb->dev_scratch;
}

/*
 * Accessors for per-skb UDP receive state.  On 64-bit builds the values are
 * read from the copy cached in the skb scratch area; otherwise they are
 * taken directly from the skb.
 */
#if BITS_PER_LONG == 64
/* Cached packet length. */
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->len;
}

/* Cached "checksum unnecessary" flag. */
static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->csum_unnecessary;
}

/* Cached "skb is linear" flag. */
static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->is_linear;
}

#else
/* No cached copy: use skb->len. */
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
        return skb->len;
}

/* No cached copy: ask skb_csum_unnecessary(). */
static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
        return skb_csum_unnecessary(skb);
}

/* No cached copy: linear means "not nonlinear". */
static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
        return !skb_is_nonlinear(skb);
}
#endif

/*
 * copy_linear_skb - copy @len bytes at offset @off of @skb's linear data
 * into the iterator @to.
 *
 * Returns 0 when everything was copied.  On a short copy the iterator is
 * rewound by the number of bytes that did get copied and -EFAULT is
 * returned.
 */
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
                                  struct iov_iter *to)
{
        int copied = copy_to_iter(skb->data + off, len, to);

        if (copied != len) {
                /* undo the partial copy so the caller sees an untouched iterator */
                iov_iter_revert(to, copied);
                return -EFAULT;
        }

        return 0;
}

/*
 *         SNMP statistics for UDP and UDP-Lite
 *
 *         Each macro increments @field in one of two per-netns MIBs: the
 *         UDP-Lite one when the last argument is true, the plain UDP one
 *         otherwise.  The UDP6 variants use the IPv6 MIBs.  The variants
 *         with a leading double underscore call __SNMP_INC_STATS() instead
 *         of SNMP_INC_STATS().
 */
#define UDP_INC_STATS(net, field, is_udplite)                      do { \
        if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field);       \
        else                SNMP_INC_STATS((net)->mib.udp_statistics, field);  }  while(0)
#define __UDP_INC_STATS(net, field, is_udplite)               do { \
        if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field);         \
        else                __SNMP_INC_STATS((net)->mib.udp_statistics, field);    }  while(0)

#define __UDP6_INC_STATS(net, field, is_udplite)            do { \
        if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\
        else                __SNMP_INC_STATS((net)->mib.udp_stats_in6, field);  \
} while(0)
#define UDP6_INC_STATS(net, field, __lite)                    do { \
        if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);  \
        else            SNMP_INC_STATS((net)->mib.udp_stats_in6, field);      \
} while(0)

/*
 * __UDPX_MIB - pick the per-netns UDP MIB that matches a socket.
 * @sk:   socket; IS_UDPLITE(sk) selects the UDP-Lite or plain UDP MIB and
 *        sock_net(sk) supplies the namespace.  @sk is evaluated more than
 *        once, so it must not have side effects.
 * @ipv4: true selects the IPv4 MIBs, false the IPv6 ones.  Without
 *        CONFIG_IPV6 only the IPv4 MIBs exist and @ipv4 is not evaluated.
 *
 * @ipv4 is parenthesized so that an argument containing a low-precedence
 * operator (e.g. a conditional expression) cannot re-associate with the
 * selecting ?: operator.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define __UDPX_MIB(sk, ipv4)                                                \
({                                                                        \
        (ipv4) ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :        \
                                   sock_net(sk)->mib.udp_statistics) :        \
                 (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 :        \
                                   sock_net(sk)->mib.udp_stats_in6);        \
})
#else
#define __UDPX_MIB(sk, ipv4)                                                \
({                                                                        \
        IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :                \
                         sock_net(sk)->mib.udp_statistics;                \
})
#endif

/*
 * __UDPX_INC_STATS - increment @field in the MIB matching socket @sk.
 * The IPv4 MIBs are chosen when the socket family is AF_INET; the
 * UDP / UDP-Lite choice is made by __UDPX_MIB().
 */
#define __UDPX_INC_STATS(sk, field) \
        __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field)

#ifdef CONFIG_PROC_FS
/*
 * Parameters for a UDP seq_file iteration: the address family and the UDP
 * table involved.
 * NOTE(review): presumably consumed by udp_seq_start()/udp_seq_next() -
 * confirm in net/ipv4/udp.c.
 */
struct udp_seq_afinfo {
        sa_family_t                        family;
        struct udp_table                *udp_table;
};

/*
 * Per-open-file state for the UDP seq_file iterator.
 * @p must stay first if the seq_net_private helpers cast the private
 * pointer - NOTE(review): confirm against seq_file_net().
 */
struct udp_iter_state {
        struct seq_net_private  p;
        /* NOTE(review): presumably the hash bucket currently being walked */
        int                        bucket;
        /* afinfo used by the BPF iterator, judging by the name - confirm */
        struct udp_seq_afinfo        *bpf_seq_afinfo;
};

void *udp_seq_start(struct seq_file *seq, loff_t *pos);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void udp_seq_stop(struct seq_file *seq, void *v);

extern const struct seq_operations udp_seq_ops;
extern const struct seq_operations udp6_seq_ops;

int udp4_proc_init(void);
void udp4_proc_exit(void);
#endif /* CONFIG_PROC_FS */

int udpv4_offload_init(void);

void udp_init(void);

DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void);
void udp_encap_disable(void);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void);
#endif

/*
 * udp_rcv_segment - segment a GSO packet on the UDP receive path.
 * @sk:   receiving socket (drop counter, MIB selection, csum conversion
 *        setting)
 * @skb:  GSO packet to segment; always released by this function
 * @ipv4: selects the IPv4 or IPv6 MIB for error accounting
 *
 * Returns the list of segments on success, in which case @skb has been
 * released with consume_skb().  Returns NULL when the packet is dropped:
 * sk_drops and UDP_MIB_INERRORS are then increased by the number of
 * dropped packets and @skb is freed with kfree_skb().
 */
static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
                                              struct sk_buff *skb, bool ipv4)
{
        netdev_features_t features = NETIF_F_SG;
        struct sk_buff *segs;
        int drop_count;

        /*
         * Segmentation in UDP receive path is only for UDP GRO, drop udp
         * fragmentation offload (UFO) packets.
         */
        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
                drop_count = 1;
                goto drop;
        }

        /* Avoid csum recalculation by skb_segment unless userspace explicitly
         * asks for the final checksum values
         */
        if (!inet_get_convert_csum(sk))
                features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

        /* UDP segmentation expects packets of type CHECKSUM_PARTIAL or
         * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
         * packets in udp_gro_complete_segment. As does UDP GSO, verified by
         * udp_send_skb. But when those packets are looped in dev_loopback_xmit
         * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
         * Reset in this specific case, where PARTIAL is both correct and
         * required.
         */
        if (skb->pkt_type == PACKET_LOOPBACK)
                skb->ip_summed = CHECKSUM_PARTIAL;

        /* the GSO CB lies after the UDP one, no need to save and restore any
         * CB fragment
         */
        segs = __skb_gso_segment(skb, features, false);
        if (IS_ERR_OR_NULL(segs)) {
                /* account every segment the packet would have produced */
                drop_count = skb_shinfo(skb)->gso_segs;
                goto drop;
        }

        consume_skb(skb);
        return segs;

drop:
        atomic_add(drop_count, &sk->sk_drops);
        SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count);
        kfree_skb(skb);
        return NULL;
}

#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
#endif

#endif        /* _UDP_H */























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM exceptions

#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_FAULT_H

#include <linux/tracepoint.h>
#include <asm/trace/common.h>

extern int trace_pagefault_reg(void);
extern void trace_pagefault_unreg(void);

/*
 * x86_exceptions event class.
 *
 * Records three unsigned long fields per event: the address argument, the
 * instruction pointer taken from regs->ip, and the error code.  The address
 * and ip are printed with %ps, the error code in hex.
 */
DECLARE_EVENT_CLASS(x86_exceptions,

        TP_PROTO(unsigned long address, struct pt_regs *regs,
                 unsigned long error_code),

        TP_ARGS(address, regs, error_code),

        TP_STRUCT__entry(
                __field(                unsigned long, address        )
                __field(                unsigned long, ip        )
                __field(                unsigned long, error_code )
        ),

        TP_fast_assign(
                __entry->address = address;
                __entry->ip = regs->ip;
                __entry->error_code = error_code;
        ),

        TP_printk("address=%ps ip=%ps error_code=0x%lx",
                  (void *)__entry->address, (void *)__entry->ip,
                  __entry->error_code) );

/*
 * DEFINE_PAGE_FAULT_EVENT - define one event of the x86_exceptions class.
 *
 * Every event shares the class prototype and passes trace_pagefault_reg()
 * and trace_pagefault_unreg() as the register/unregister callbacks of
 * DEFINE_EVENT_FN().  The macro body already ends in a semicolon.
 */
#define DEFINE_PAGE_FAULT_EVENT(name)                                \
DEFINE_EVENT_FN(x86_exceptions, name,                                \
        TP_PROTO(unsigned long address,        struct pt_regs *regs,        \
                 unsigned long error_code),                        \
        TP_ARGS(address, regs, error_code),                        \
        trace_pagefault_reg, trace_pagefault_unreg);

/* Two events of the class: page_fault_user and page_fault_kernel. */
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);

#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE exceptions
#endif /*  _TRACE_PAGE_FAULT_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * ioport.h        Definitions of routines for detecting, reserving and
 *                allocating system resources.
 *
 * Authors:        Linus Torvalds
 */

#ifndef _LINUX_IOPORT_H
#define _LINUX_IOPORT_H

#ifndef __ASSEMBLY__
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/bits.h>
/*
 * A resource is one node of a tree, which is what allows resources to be
 * nested inside each other.
 *
 * @start and @end delimit the range; both are inclusive (resource_size()
 * computes end - start + 1).  @flags carries the IORESOURCE_* bits and
 * @desc one of the IORES_DESC_* descriptors.  @parent, @sibling and @child
 * are the tree links.
 */
struct resource {
        resource_size_t start;
        resource_size_t end;
        const char *name;
        unsigned long flags;
        unsigned long desc;
        struct resource *parent;
        struct resource *sibling;
        struct resource *child;
};

/*
 * IO resources have these defined flags.
 *
 * PCI devices expose these flags to userspace in the "resource" sysfs file,
 * so don't move them.
 */
#define IORESOURCE_BITS                0x000000ff        /* Bus-specific bits */

#define IORESOURCE_TYPE_BITS        0x00001f00        /* Resource type */
#define IORESOURCE_IO                0x00000100        /* PCI/ISA I/O ports */
#define IORESOURCE_MEM                0x00000200
#define IORESOURCE_REG                0x00000300        /* Register offsets */
#define IORESOURCE_IRQ                0x00000400
#define IORESOURCE_DMA                0x00000800
#define IORESOURCE_BUS                0x00001000

#define IORESOURCE_PREFETCH        0x00002000        /* No side effects */
#define IORESOURCE_READONLY        0x00004000
#define IORESOURCE_CACHEABLE        0x00008000
#define IORESOURCE_RANGELENGTH        0x00010000
#define IORESOURCE_SHADOWABLE        0x00020000

#define IORESOURCE_SIZEALIGN        0x00040000        /* size indicates alignment */
#define IORESOURCE_STARTALIGN        0x00080000        /* start field is alignment */

#define IORESOURCE_MEM_64        0x00100000
#define IORESOURCE_WINDOW        0x00200000        /* forwarded by bridge */
#define IORESOURCE_MUXED        0x00400000        /* Resource is software muxed */

#define IORESOURCE_EXT_TYPE_BITS 0x01000000        /* Resource extended types */
#define IORESOURCE_SYSRAM        0x01000000        /* System RAM (modifier) */

/* IORESOURCE_SYSRAM specific bits. */
#define IORESOURCE_SYSRAM_DRIVER_MANAGED        0x02000000 /* Always detected via a driver. */
#define IORESOURCE_SYSRAM_MERGEABLE                0x04000000 /* Resource can be merged. */

#define IORESOURCE_EXCLUSIVE        0x08000000        /* Userland may not map this resource */

#define IORESOURCE_DISABLED        0x10000000
#define IORESOURCE_UNSET        0x20000000        /* No address assigned yet */
#define IORESOURCE_AUTO                0x40000000
#define IORESOURCE_BUSY                0x80000000        /* Driver has marked this resource busy */

/* I/O resource extended types */
#define IORESOURCE_SYSTEM_RAM                (IORESOURCE_MEM|IORESOURCE_SYSRAM)

/* PnP IRQ specific bits (IORESOURCE_BITS) */
#define IORESOURCE_IRQ_HIGHEDGE                (1<<0)
#define IORESOURCE_IRQ_LOWEDGE                (1<<1)
#define IORESOURCE_IRQ_HIGHLEVEL        (1<<2)
#define IORESOURCE_IRQ_LOWLEVEL                (1<<3)
#define IORESOURCE_IRQ_SHAREABLE        (1<<4)
#define IORESOURCE_IRQ_OPTIONAL         (1<<5)

/* PnP DMA specific bits (IORESOURCE_BITS) */
#define IORESOURCE_DMA_TYPE_MASK        (3<<0)
#define IORESOURCE_DMA_8BIT                (0<<0)
#define IORESOURCE_DMA_8AND16BIT        (1<<0)
#define IORESOURCE_DMA_16BIT                (2<<0)

#define IORESOURCE_DMA_MASTER                (1<<2)
#define IORESOURCE_DMA_BYTE                (1<<3)
#define IORESOURCE_DMA_WORD                (1<<4)

#define IORESOURCE_DMA_SPEED_MASK        (3<<6)
#define IORESOURCE_DMA_COMPATIBLE        (0<<6)
#define IORESOURCE_DMA_TYPEA                (1<<6)
#define IORESOURCE_DMA_TYPEB                (2<<6)
#define IORESOURCE_DMA_TYPEF                (3<<6)

/* PnP memory I/O specific bits (IORESOURCE_BITS) */
#define IORESOURCE_MEM_WRITEABLE        (1<<0)        /* dup: IORESOURCE_READONLY */
#define IORESOURCE_MEM_CACHEABLE        (1<<1)        /* dup: IORESOURCE_CACHEABLE */
#define IORESOURCE_MEM_RANGELENGTH        (1<<2)        /* dup: IORESOURCE_RANGELENGTH */
#define IORESOURCE_MEM_TYPE_MASK        (3<<3)
#define IORESOURCE_MEM_8BIT                (0<<3)
#define IORESOURCE_MEM_16BIT                (1<<3)
#define IORESOURCE_MEM_8AND16BIT        (2<<3)
#define IORESOURCE_MEM_32BIT                (3<<3)
#define IORESOURCE_MEM_SHADOWABLE        (1<<5)        /* dup: IORESOURCE_SHADOWABLE */
#define IORESOURCE_MEM_EXPANSIONROM        (1<<6)

/* PnP I/O specific bits (IORESOURCE_BITS) */
#define IORESOURCE_IO_16BIT_ADDR        (1<<0)
#define IORESOURCE_IO_FIXED                (1<<1)
#define IORESOURCE_IO_SPARSE                (1<<2)

/* PCI ROM control bits (IORESOURCE_BITS) */
#define IORESOURCE_ROM_ENABLE                (1<<0)        /* ROM is enabled, same as PCI_ROM_ADDRESS_ENABLE */
#define IORESOURCE_ROM_SHADOW                (1<<1)        /* Use RAM image, not ROM BAR */

/* PCI control bits.  Shares IORESOURCE_BITS with above PCI ROM.  */
#define IORESOURCE_PCI_FIXED                (1<<4)        /* Do not move resource */
#define IORESOURCE_PCI_EA_BEI                (1<<5)        /* BAR Equivalent Indicator */

/*
 * I/O Resource Descriptors
 *
 * Descriptors are used by walk_iomem_res_desc() and region_intersects()
 * for searching a specific resource range in the iomem table.  Assign
 * a new descriptor when a resource range supports the search interfaces.
 * Otherwise, resource.desc must be set to IORES_DESC_NONE (0).
 */
enum {
        /* Default descriptor: what DEFINE_RES_NAMED() stores in .desc. */
        IORES_DESC_NONE = 0,

        /* Searchable descriptors; numbering is explicit on purpose. */
        IORES_DESC_CRASH_KERNEL = 1,
        IORES_DESC_ACPI_TABLES = 2,
        IORES_DESC_ACPI_NV_STORAGE = 3,
        IORES_DESC_PERSISTENT_MEMORY = 4,
        IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
        IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
        IORES_DESC_RESERVED = 7,
        IORES_DESC_SOFT_RESERVED = 8,
};

/*
 * Flags controlling ioremap() behavior.
 */
/* Each flag occupies its own bit, so the values can be OR-ed together. */
enum {
        IORES_MAP_SYSTEM_RAM                = BIT(0),
        IORES_MAP_ENCRYPTED                = BIT(1),
};

/*
 * Helpers to define resources.
 *
 * DEFINE_RES_NAMED() expands to a designated initializer for a
 * struct resource covering @_size addresses starting at @_start; the end
 * address is inclusive, hence the "- 1".  The descriptor is always
 * IORES_DESC_NONE.  Note that @_start is evaluated twice.
 */
#define DEFINE_RES_NAMED(_start, _size, _name, _flags)                        \
        {                                                                \
                .start = (_start),                                        \
                .end = (_start) + (_size) - 1,                                \
                .name = (_name),                                        \
                .flags = (_flags),                                        \
                .desc = IORES_DESC_NONE,                                \
        }

/* I/O port ranges (IORESOURCE_IO), with and without a name. */
#define DEFINE_RES_IO_NAMED(_start, _size, _name)                        \
        DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO)
#define DEFINE_RES_IO(_start, _size)                                        \
        DEFINE_RES_IO_NAMED((_start), (_size), NULL)

/* Memory ranges (IORESOURCE_MEM), with and without a name. */
#define DEFINE_RES_MEM_NAMED(_start, _size, _name)                        \
        DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_MEM)
#define DEFINE_RES_MEM(_start, _size)                                        \
        DEFINE_RES_MEM_NAMED((_start), (_size), NULL)

/* A single interrupt number: size is fixed to 1, so start == end == _irq. */
#define DEFINE_RES_IRQ_NAMED(_irq, _name)                                \
        DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ)
#define DEFINE_RES_IRQ(_irq)                                                \
        DEFINE_RES_IRQ_NAMED((_irq), NULL)

/* A single DMA channel: size is fixed to 1, so start == end == _dma. */
#define DEFINE_RES_DMA_NAMED(_dma, _name)                                \
        DEFINE_RES_NAMED((_dma), 1, (_name), IORESOURCE_DMA)
#define DEFINE_RES_DMA(_dma)                                                \
        DEFINE_RES_DMA_NAMED((_dma), NULL)

/* PC/ISA/whatever - the normal PC address spaces: IO and memory */
extern struct resource ioport_resource;
extern struct resource iomem_resource;

extern struct resource *request_resource_conflict(struct resource *root, struct resource *new);
extern int request_resource(struct resource *root, struct resource *new);
extern int release_resource(struct resource *new);
void release_child_resources(struct resource *new);
extern void reserve_region_with_split(struct resource *root,
                             resource_size_t start, resource_size_t end,
                             const char *name);
extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new);
extern int insert_resource(struct resource *parent, struct resource *new);
extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new);
extern int remove_resource(struct resource *old);
extern void arch_remove_reservations(struct resource *avail);
extern int allocate_resource(struct resource *root, struct resource *new,
                             resource_size_t size, resource_size_t min,
                             resource_size_t max, resource_size_t align,
                             resource_size_t (*alignf)(void *,
                                                       const struct resource *,
                                                       resource_size_t,
                                                       resource_size_t),
                             void *alignf_data);
struct resource *lookup_resource(struct resource *root, resource_size_t start);
int adjust_resource(struct resource *res, resource_size_t start,
                    resource_size_t size);
resource_size_t resource_alignment(struct resource *res);
/* Number of addresses spanned by @res; start and end are both inclusive. */
static inline resource_size_t resource_size(const struct resource *res)
{
        const resource_size_t first = res->start;
        const resource_size_t last = res->end;

        return last - first + 1;
}
/* Extract the resource type, i.e. the IORESOURCE_TYPE_BITS part of flags. */
static inline unsigned long resource_type(const struct resource *res)
{
        const unsigned long type_bits = res->flags & IORESOURCE_TYPE_BITS;

        return type_bits;
}
/* Extract the extended type, i.e. the IORESOURCE_EXT_TYPE_BITS part of flags. */
static inline unsigned long resource_ext_type(const struct resource *res)
{
        const unsigned long ext_bits = res->flags & IORESOURCE_EXT_TYPE_BITS;

        return ext_bits;
}
/* True iff r1 completely contains r2 */
static inline bool resource_contains(struct resource *r1, struct resource *r2)
{
        if (resource_type(r1) != resource_type(r2))
                return false;
        if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET)
                return false;
        return r1->start <= r2->start && r1->end >= r2->end;
}


/*
 * Convenience shorthand with allocation.
 *
 * All of these forward to __request_region(): the *_region variants pass
 * &ioport_resource as the parent, the *_mem_region variants &iomem_resource.
 * The last argument is the flags value: 0, IORESOURCE_MUXED,
 * IORESOURCE_EXCLUSIVE or the caller-supplied @excl.
 * rename_region() just replaces the name pointer of an existing region.
 */
#define request_region(start,n,name)                __request_region(&ioport_resource, (start), (n), (name), 0)
#define request_muxed_region(start,n,name)        __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED)
#define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl)
#define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0)
#define request_mem_region_exclusive(start,n,name) \
        __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_EXCLUSIVE)
#define rename_region(region, newname) do { (region)->name = (newname); } while (0)

extern struct resource * __request_region(struct resource *,
                                        resource_size_t start,
                                        resource_size_t n,
                                        const char *name, int flags);

/*
 * Compatibility cruft: release wrappers around __release_region(), with
 * &ioport_resource or &iomem_resource as the parent.
 */
#define release_region(start,n)        __release_region(&ioport_resource, (start), (n))
#define release_mem_region(start,n)        __release_region(&iomem_resource, (start), (n))

extern void __release_region(struct resource *, resource_size_t,
                                resource_size_t);
#ifdef CONFIG_MEMORY_HOTREMOVE
extern void release_mem_region_adjustable(resource_size_t, resource_size_t);
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
extern void merge_system_ram_resource(struct resource *res);
#endif

/* Wrappers for managed devices */
struct device;

extern int devm_request_resource(struct device *dev, struct resource *root,
                                 struct resource *new);
extern void devm_release_resource(struct device *dev, struct resource *new);

/*
 * Device-managed request wrappers: forward to __devm_request_region() with
 * &ioport_resource (I/O ports) or &iomem_resource (memory) as the parent.
 */
#define devm_request_region(dev,start,n,name) \
        __devm_request_region(dev, &ioport_resource, (start), (n), (name))
#define devm_request_mem_region(dev,start,n,name) \
        __devm_request_region(dev, &iomem_resource, (start), (n), (name))

extern struct resource * __devm_request_region(struct device *dev,
                                struct resource *parent, resource_size_t start,
                                resource_size_t n, const char *name);

/*
 * Device-managed release wrappers: forward to __devm_release_region() with
 * &ioport_resource (I/O ports) or &iomem_resource (memory) as the parent.
 */
#define devm_release_region(dev, start, n) \
        __devm_release_region(dev, &ioport_resource, (start), (n))
#define devm_release_mem_region(dev, start, n) \
        __devm_release_region(dev, &iomem_resource, (start), (n))

extern void __devm_release_region(struct device *dev, struct resource *parent,
                                  resource_size_t start, resource_size_t n);
extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size);
extern bool iomem_is_exclusive(u64 addr);

extern int
walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                void *arg, int (*func)(unsigned long, unsigned long, void *));
extern int
walk_mem_res(u64 start, u64 end, void *arg,
             int (*func)(struct resource *, void *));
extern int
walk_system_ram_res(u64 start, u64 end, void *arg,
                    int (*func)(struct resource *, void *));
extern int
walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
                    void *arg, int (*func)(struct resource *, void *));

/* True if any part of r1 overlaps r2 */
static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
{
       return (r1->start <= r2->end && r1->end >= r2->start);
}

struct resource *devm_request_free_mem_region(struct device *dev,
                struct resource *base, unsigned long size);
struct resource *request_free_mem_region(struct resource *base,
                unsigned long size, const char *name);

/*
 * Fill @res in as a disabled IRQ resource: start and end are both set to
 * @irq, and flags is overwritten with
 * IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET.
 */
static inline void irqresource_disabled(struct resource *res, u32 irq)
{
        const unsigned long disabled_irq_flags =
                IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET;

        res->start = res->end = irq;
        res->flags = disabled_irq_flags;
}

/*
 * revoke_devmem() has an out-of-line implementation only when
 * CONFIG_IO_STRICT_DEVMEM is set; otherwise it is an empty inline stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_IO_STRICT_DEVMEM
void revoke_devmem(struct resource *res);
#else
static inline void revoke_devmem(struct resource *res) { }
#endif

#endif /* __ASSEMBLY__ */
#endif        /* _LINUX_IOPORT_H */




















    6 


    7 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H

#include <asm/page_64_types.h>

#ifndef __ASSEMBLY__
#include <asm/alternative.h>

/* duplicates the declaration in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;

extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;

/*
 * Translate the kernel virtual address @x to a physical address, without
 * any sanity checking.
 *
 * y = x - __START_KERNEL_map wraps around when x is below
 * __START_KERNEL_map, so (x > y) holds exactly when no wrap-around happened:
 *
 *  - x >= __START_KERNEL_map: the result is y + phys_base;
 *  - otherwise: the result is y + (__START_KERNEL_map - PAGE_OFFSET), which
 *    is x - PAGE_OFFSET.
 */
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
        unsigned long y = x - __START_KERNEL_map;

        /* use the carry flag to determine if x was < __START_KERNEL_map */
        x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));

        return x;
}

#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x)                __phys_addr_nodebug(x)
#define __phys_addr_symbol(x) \
        ((unsigned long)(x) - __START_KERNEL_map + phys_base)
#endif

#define __phys_reloc_hide(x)        (x)

#ifdef CONFIG_FLATMEM
#define pfn_valid(pfn)          ((pfn) < max_pfn)
#endif

void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);

/*
 * Clear one page through alternative_call_2().
 *
 * clear_page_orig is the first (default) target; clear_page_rep is paired
 * with X86_FEATURE_REP_GOOD and clear_page_erms with X86_FEATURE_ERMS.
 * NOTE(review): presumably the alternatives mechanism picks the variant
 * matching the CPU features - confirm against asm/alternative.h.
 *
 * @page is passed in via the "D" register constraint and is tied to that
 * same operand as output ("=D" / "0").  Condition codes, memory, rax and
 * rcx are declared as clobbered.
 */
static inline void clear_page(void *page)
{
        alternative_call_2(clear_page_orig,
                           clear_page_rep, X86_FEATURE_REP_GOOD,
                           clear_page_erms, X86_FEATURE_ERMS,
                           "=D" (page),
                           "0" (page)
                           : "cc", "memory", "rax", "rcx");
}

void copy_page(void *to, void *from);

#endif        /* !__ASSEMBLY__ */

#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif

#endif /* _ASM_X86_PAGE_64_H */

































































    1 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/huge_mm.h>
#include <linux/swap.h>

/**
 * page_is_file_lru - does the page belong on a file LRU (vs. an anon LRU)?
 * @page: the page to test
 *
 * Returns 1 when @page is not swap backed: a regular filesystem backed page
 * cache page, or a lazily freed anonymous page (e.g. via MADV_FREE).
 * Returns 0 when it is swap backed: a normal anonymous page, a tmpfs page or
 * any other ram or swap backed page.  The LRU manipulation helpers use this
 * to sort a page onto the right list.
 *
 * Ideally this would not need a page flag, but the information has to stay
 * valid until the page is finally removed from the LRU, which may happen as
 * late as __page_cache_release.
 */
static inline int page_is_file_lru(struct page *page)
{
        if (PageSwapBacked(page))
                return 0;

        return 1;
}

static __always_inline void __update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                int nr_pages)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&pgdat->node_zones[zid],
                                NR_ZONE_LRU_BASE + lru, nr_pages);
}

/*
 * Account @nr_pages on list @lru of @lruvec for zone @zid.  The lruvec and
 * zone counters are always updated; with CONFIG_MEMCG the memory cgroup
 * LRU size is updated afterwards as well.
 */
static __always_inline void update_lru_size(struct lruvec *lruvec,
                                            enum lru_list lru,
                                            enum zone_type zid,
                                            int nr_pages)
{
        __update_lru_size(lruvec, lru, zid, nr_pages);

#ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

static __always_inline void add_page_to_lru_list(struct page *page,
                                struct lruvec *lruvec, enum lru_list lru)
{
        update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
        list_add(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
                                struct lruvec *lruvec, enum lru_list lru)
{
        update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
        list_add_tail(&page->lru, &lruvec->lists[lru]);
}

/*
 * Unlink @page from its LRU list first, then subtract its thp_nr_pages()
 * pages from the accounting of list @lru in the page's zone.
 */
static __always_inline void del_page_from_lru_list(struct page *page,
                                                   struct lruvec *lruvec,
                                                   enum lru_list lru)
{
        int nr;

        list_del(&page->lru);

        /* Looked up only after the unlink, keeping the original ordering. */
        nr = thp_nr_pages(page);
        update_lru_size(lruvec, lru, page_zonenum(page), -nr);
}

/**
 * page_lru_base_type - pick the base (inactive) LRU list for a page
 * @page: the page to test
 *
 * Meant for LRU list index arithmetic: callers add LRU_ACTIVE on top of the
 * value returned here.
 *
 * Returns LRU_INACTIVE_FILE if page_is_file_lru() holds for @page,
 * LRU_INACTIVE_ANON otherwise.
 */
static inline enum lru_list page_lru_base_type(struct page *page)
{
        return page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
}

/**
 * page_off_lru - report which LRU list a page was on and clear its LRU flags
 * @page: the page to test
 *
 * Returns the LRU list @page was on, as an index into the array of LRU
 * lists.  As a side effect the Unevictable flag, or else the Active flag,
 * is cleared if it was set, leaving the page ready for freeing.
 */
static __always_inline enum lru_list page_off_lru(struct page *page)
{
        enum lru_list lru;

        /* Unevictable wins; the Active flag is not looked at in that case. */
        if (PageUnevictable(page)) {
                __ClearPageUnevictable(page);
                return LRU_UNEVICTABLE;
        }

        lru = page_lru_base_type(page);
        if (PageActive(page)) {
                __ClearPageActive(page);
                lru += LRU_ACTIVE;
        }

        return lru;
}

/**
 * page_lru - compute the LRU list a page belongs on
 * @page: the page to test
 *
 * Unlike page_off_lru() this only reads the page flags.
 *
 * Returns the LRU list @page should be on, as an index into the array of
 * LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page)
{
        enum lru_list lru;

        if (PageUnevictable(page))
                return LRU_UNEVICTABLE;

        lru = page_lru_base_type(page);
        if (PageActive(page))
                lru += LRU_ACTIVE;

        return lru;
}
#endif




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions and Declarations for tuple.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack_tuple.h
 */

#ifndef _NF_CONNTRACK_TUPLE_H
#define _NF_CONNTRACK_TUPLE_H

#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/list_nulls.h>

/* A `tuple' is a structure containing the information to uniquely
  identify a connection.  ie. if two packets have the same tuple, they
  are in the same connection; if not, they are not.

  We divide the structure along "manipulatable" and
  "non-manipulatable" lines, for the benefit of the NAT code.
*/

#define NF_CT_TUPLE_L3SIZE        ARRAY_SIZE(((union nf_inet_addr *)NULL)->all)

/*
 * The manipulable part of the tuple: layer 3 address, per-protocol part and
 * layer 3 protocol number.  It is embedded as the source half of
 * struct nf_conntrack_tuple.
 */
struct nf_conntrack_man {
        union nf_inet_addr u3;
        union nf_conntrack_man_proto u;
        /* Layer 3 protocol */
        u_int16_t l3num;
};

/*
 * This contains the information to distinguish a connection: the
 * manipulable source part plus the fixed destination part.
 */
struct nf_conntrack_tuple {
        struct nf_conntrack_man src;

        /* These are the parts of the tuple which are fixed. */
        struct {
                union nf_inet_addr u3;
                union {
                        /* Add other protocols here. */
                        /* Generic view; the comparison helpers use u.all. */
                        __be16 all;

                        struct {
                                __be16 port;
                        } tcp;
                        struct {
                                __be16 port;
                        } udp;
                        struct {
                                u_int8_t type, code;
                        } icmp;
                        struct {
                                __be16 port;
                        } dccp;
                        struct {
                                __be16 port;
                        } sctp;
                        struct {
                                __be16 key;
                        } gre;
                } u;

                /* The protocol. */
                u_int8_t protonum;

                /* The direction (for tuplehash) */
                u_int8_t dir;
        } dst;
};

/*
 * Bit mask over the source address and source per-protocol part of a tuple.
 * nf_ct_tuple_src_mask_cmp() only compares the bits that are set here.
 */
struct nf_conntrack_tuple_mask {
        struct {
                union nf_inet_addr u3;
                union nf_conntrack_man_proto u;
        } src;
};

/*
 * Print an IPv4 tuple: pointer, protocol number, then source and
 * destination address with the per-protocol value converted by ntohs().
 * Compiles to nothing unless DEBUG is defined.
 */
static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t)
{
#ifdef DEBUG
        const struct nf_conntrack_man *src = &t->src;

        printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n",
               t, t->dst.protonum,
               &src->u3.ip, ntohs(src->u.all),
               &t->dst.u3.ip, ntohs(t->dst.u.all));
#endif
}

/*
 * Print an IPv6 tuple: pointer, protocol number, then source and
 * destination address with the per-protocol value converted by ntohs().
 * Compiles to nothing unless DEBUG is defined.
 */
static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t)
{
#ifdef DEBUG
        const struct nf_conntrack_man *src = &t->src;

        printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n",
               t, t->dst.protonum,
               src->u3.all, ntohs(src->u.all),
               t->dst.u3.all, ntohs(t->dst.u.all));
#endif
}

static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t)
{
        switch (t->src.l3num) {
        case AF_INET:
                nf_ct_dump_tuple_ip(t);
                break;
        case AF_INET6:
                nf_ct_dump_tuple_ipv6(t);
                break;
        }
}

/*
 * If we're the first tuple, it's the original dir.
 *
 * @h points to something with an embedded tuple (e.g. a
 * struct nf_conntrack_tuple_hash); the macro reads tuple.dst.dir and casts
 * it to enum ip_conntrack_dir.
 */
#define NF_CT_DIRECTION(h)                                                \
        ((enum ip_conntrack_dir)(h)->tuple.dst.dir)

/*
 * Connections have two entries in the hash table: one for each way.
 * Each entry pairs the hlist_nulls linkage with the tuple for that direction.
 */
struct nf_conntrack_tuple_hash {
        struct hlist_nulls_node hnnode;
        struct nf_conntrack_tuple tuple;
};

/*
 * Compare the source halves of two tuples: address, per-protocol part
 * (through u.all) and layer 3 protocol number must all match.
 */
static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1,
                                           const struct nf_conntrack_tuple *t2)
{
        if (!nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3))
                return false;
        if (t1->src.u.all != t2->src.u.all)
                return false;

        return t1->src.l3num == t2->src.l3num;
}

/*
 * Compare the destination halves of two tuples: address, per-protocol part
 * (through u.all) and protocol number must all match.  dst.dir is not
 * compared.
 */
static inline bool __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1,
                                           const struct nf_conntrack_tuple *t2)
{
        if (!nf_inet_addr_cmp(&t1->dst.u3, &t2->dst.u3))
                return false;
        if (t1->dst.u.all != t2->dst.u.all)
                return false;

        return t1->dst.protonum == t2->dst.protonum;
}

/* Two tuples are equal when both their source and destination halves match. */
static inline bool nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1,
                                     const struct nf_conntrack_tuple *t2)
{
        if (!__nf_ct_tuple_src_equal(t1, t2))
                return false;

        return __nf_ct_tuple_dst_equal(t1, t2);
}

/* Two masks are equal when their address and per-protocol parts both match. */
static inline bool
nf_ct_tuple_mask_equal(const struct nf_conntrack_tuple_mask *m1,
                       const struct nf_conntrack_tuple_mask *m2)
{
        if (!nf_inet_addr_cmp(&m1->src.u3, &m2->src.u3))
                return false;

        return m1->src.u.all == m2->src.u.all;
}

/*
 * Compare the source halves of @t1 and @t2 under @mask.
 *
 * Address words and the per-protocol value only have to agree in the bits
 * set in @mask.  The layer 3 protocol number and the (destination side)
 * protocol number are compared exactly, without any mask.
 */
static inline bool
nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1,
                         const struct nf_conntrack_tuple *t2,
                         const struct nf_conntrack_tuple_mask *mask)
{
        int i;

        /* XOR leaves the differing bits; the mask keeps the relevant ones. */
        for (i = 0; i < NF_CT_TUPLE_L3SIZE; i++) {
                if ((t1->src.u3.all[i] ^ t2->src.u3.all[i]) &
                    mask->src.u3.all[i])
                        return false;
        }

        if ((t1->src.u.all ^ t2->src.u.all) & mask->src.u.all)
                return false;

        return t1->src.l3num == t2->src.l3num &&
               t1->dst.protonum == t2->dst.protonum;
}

/*
 * Match @t against @tuple: the source half is compared under @mask, the
 * destination half has to be equal.
 */
static inline bool
nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t,
                     const struct nf_conntrack_tuple *tuple,
                     const struct nf_conntrack_tuple_mask *mask)
{
        if (!nf_ct_tuple_src_mask_cmp(t, tuple, mask))
                return false;

        return __nf_ct_tuple_dst_equal(t, tuple);
}

#endif /* _NF_CONNTRACK_TUPLE_H */

























































































































































































































    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
/*
 * cgroup_freezer.c -  control group freezer subsystem
 *
 * Copyright IBM Corporation, 2007
 *
 * Author : Cedric Le Goater <clg@fr.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>

/*
 * Bit flags kept in freezer->state; several may be set at the same time.
 *
 * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
 * for "THAWED".  FREEZING_PARENT is set if the parent freezer is FREEZING
 * for whatever reason.  IOW, a cgroup has FREEZING_PARENT set if one of
 * its ancestors has FREEZING_SELF set.
 */
enum freezer_state_flags {
        CGROUP_FREEZER_ONLINE        = (1 << 0), /* freezer is fully online */
        CGROUP_FREEZING_SELF        = (1 << 1), /* this freezer is freezing */
        CGROUP_FREEZING_PARENT        = (1 << 2), /* the parent freezer is freezing */
        CGROUP_FROZEN                = (1 << 3), /* this and its descendants frozen */

        /* mask for all FREEZING flags */
        CGROUP_FREEZING                = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};

/* Freezer controller state for one cgroup, wrapped around its css. */
struct freezer {
        struct cgroup_subsys_state        css;        /* embedded css; see css_freezer() */
        unsigned int                        state;        /* enum freezer_state_flags bits */
};

/* Taken around every freezer->state update: css online/offline, attach, fork, read and write. */
static DEFINE_MUTEX(freezer_mutex);

/* Map a css to the freezer that embeds it; a NULL css maps to NULL. */
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct freezer, css);
}

/* Look up the freezer @task currently belongs to via its freezer css. */
static inline struct freezer *task_freezer(struct task_struct *task)
{
        struct cgroup_subsys_state *css = task_css(task, freezer_cgrp_id);

        return css_freezer(css);
}

/* Return the freezer of @freezer's parent css, or NULL when there is no parent. */
static struct freezer *parent_freezer(struct freezer *freezer)
{
        struct cgroup_subsys_state *parent_css = freezer->css.parent;

        return css_freezer(parent_css);
}

/*
 * Report whether the freezer @task belongs to has any CGROUP_FREEZING flag
 * set.  The freezer is looked up and sampled under the RCU read lock only;
 * freezer_mutex is not taken.
 */
bool cgroup_freezing(struct task_struct *task)
{
        bool is_freezing;

        rcu_read_lock();
        is_freezing = (task_freezer(task)->state & CGROUP_FREEZING) != 0;
        rcu_read_unlock();

        return is_freezing;
}

/*
 * Translate a freezer state bitmask into the string shown in freezer.state.
 * FROZEN takes precedence over FREEZING; a mask with neither is "THAWED".
 */
static const char *freezer_state_strs(unsigned int state)
{
        if (state & CGROUP_FROZEN)
                return "FROZEN";
        if (state & CGROUP_FREEZING)
                return "FREEZING";
        return "THAWED";
}

/*
 * Allocate a zero-initialised freezer and hand back its embedded css.
 * @parent_css is not consulted here.  Returns ERR_PTR(-ENOMEM) when the
 * allocation fails.
 */
static struct cgroup_subsys_state *
freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct freezer *new_freezer;

        new_freezer = kzalloc(sizeof(*new_freezer), GFP_KERNEL);
        if (new_freezer)
                return &new_freezer->css;

        return ERR_PTR(-ENOMEM);
}

/**
 * freezer_css_online - commit creation of a freezer css
 * @css: css being created
 *
 * We're committing to creation of @css.  Mark it online and inherit
 * parent's freezing state while holding both parent's and our
 * freezer->lock.
 */
static int freezer_css_online(struct cgroup_subsys_state *css)
{
        struct freezer *freezer = css_freezer(css);
        struct freezer *parent = parent_freezer(freezer);

        mutex_lock(&freezer_mutex);

        freezer->state |= CGROUP_FREEZER_ONLINE;

        if (parent && (parent->state & CGROUP_FREEZING)) {
                freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
                atomic_inc(&system_freezing_cnt);
        }

        mutex_unlock(&freezer_mutex);
        return 0;
}

/**
 * freezer_css_offline - initiate destruction of a freezer css
 * @css: css being destroyed
 *
 * @css is going away.  Mark it dead and decrement system_freezing_count if
 * it was holding one.
 */
static void freezer_css_offline(struct cgroup_subsys_state *css)
{
        struct freezer *freezer = css_freezer(css);

        mutex_lock(&freezer_mutex);

        if (freezer->state & CGROUP_FREEZING)
                atomic_dec(&system_freezing_cnt);

        freezer->state = 0;

        mutex_unlock(&freezer_mutex);
}

/* Release the freezer that embeds @css. */
static void freezer_css_free(struct cgroup_subsys_state *css)
{
        struct freezer *freezer = css_freezer(css);

        kfree(freezer);
}

/*
 * Tasks can be migrated into a different freezer anytime regardless of its
 * current state.  freezer_attach() is responsible for making new tasks
 * conform to the current state.
 *
 * Freezer state changes and task migration are synchronized via
 * freezer_mutex.  freezer_attach() makes the new tasks conform to the
 * current state and all following state changes can see the new tasks.
 */
static void freezer_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *new_css;

        mutex_lock(&freezer_mutex);

        /*
         * Make the new tasks conform to the current state of @new_css.
         * For simplicity, when migrating any task to a FROZEN cgroup, we
         * revert it to FREEZING and let update_if_frozen() determine the
         * correct state later.
         *
         * Tasks in @tset are on @new_css but may not conform to its
         * current state before executing the following - !frozen tasks may
         * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
         */
        cgroup_taskset_for_each(task, new_css, tset) {
                struct freezer *freezer = css_freezer(new_css);

                if (!(freezer->state & CGROUP_FREEZING)) {
                        /* destination is not freezing: thaw the incoming task */
                        __thaw_task(task);
                } else {
                        freeze_task(task);
                        /*
                         * clear FROZEN and propagate upwards; the walk
                         * stops at the first ancestor that is not FROZEN
                         * or when the top of the hierarchy is passed
                         */
                        while (freezer && (freezer->state & CGROUP_FROZEN)) {
                                freezer->state &= ~CGROUP_FROZEN;
                                freezer = parent_freezer(freezer);
                        }
                }
        }

        mutex_unlock(&freezer_mutex);
}

/**
 * freezer_fork - cgroup post fork callback
 * @task: a task which has just been forked
 *
 * @task has just been created and should conform to the current state of
 * the cgroup_freezer it belongs to.  This function may race against
 * freezer_attach().  Losing to freezer_attach() means that we don't have
 * to do anything as freezer_attach() will put @task into the appropriate
 * state.
 */
static void freezer_fork(struct task_struct *task)
{
        struct freezer *freezer;

        /*
         * The root cgroup is non-freezable, so we can skip locking the
         * freezer.  This is safe regardless of race with task migration.
         * If we didn't race or won, skipping is obviously the right thing
         * to do.  If we lost and root is the new cgroup, noop is still the
         * right thing to do.
         */
        if (task_css_is_root(task, freezer_cgrp_id))
                return;

        mutex_lock(&freezer_mutex);
        rcu_read_lock();

        freezer = task_freezer(task);
        if (freezer->state & CGROUP_FREEZING)
                freeze_task(task);

        rcu_read_unlock();
        mutex_unlock(&freezer_mutex);
}

/**
 * update_if_frozen - update whether a cgroup finished freezing
 * @css: css of interest
 *
 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
 * calling this function.  If the current state is FREEZING but not FROZEN,
 * this function checks whether all tasks of this cgroup and the descendant
 * cgroups finished freezing and, if so, sets FROZEN.
 *
 * The caller must hold freezer_mutex (asserted below) and is responsible
 * for calling update_if_frozen() on all descendants prior to invoking this
 * function; only the direct children's FROZEN flags are inspected here.
 * The RCU read lock needed for the child walk is taken internally.
 *
 * Task states and freezer state might disagree while tasks are being
 * migrated into or out of @css, so we can't verify task states against
 * @freezer state here.  See freezer_attach() for details.
 */
static void update_if_frozen(struct cgroup_subsys_state *css)
{
        struct freezer *freezer = css_freezer(css);
        struct cgroup_subsys_state *pos;
        struct css_task_iter it;
        struct task_struct *task;

        lockdep_assert_held(&freezer_mutex);

        /* nothing to do unless we are FREEZING and not yet FROZEN */
        if (!(freezer->state & CGROUP_FREEZING) ||
            (freezer->state & CGROUP_FROZEN))
                return;

        /* are all (live) children frozen? */
        rcu_read_lock();
        css_for_each_child(pos, css) {
                struct freezer *child = css_freezer(pos);

                /* children without the ONLINE flag are ignored */
                if ((child->state & CGROUP_FREEZER_ONLINE) &&
                    !(child->state & CGROUP_FROZEN)) {
                        rcu_read_unlock();
                        return;
                }
        }
        rcu_read_unlock();

        /* are all tasks frozen? */
        css_task_iter_start(css, 0, &it);

        while ((task = css_task_iter_next(&it))) {
                if (freezing(task)) {
                        /*
                         * freezer_should_skip() indicates that the task
                         * should be skipped when determining freezing
                         * completion.  Consider it frozen in addition to
                         * the usual frozen condition.
                         */
                        if (!frozen(task) && !freezer_should_skip(task))
                                goto out_iter_end;
                }
        }

        /* every task passed the check above: the cgroup is fully frozen */
        freezer->state |= CGROUP_FROZEN;
out_iter_end:
        css_task_iter_end(&it);
}

/*
 * seq_show handler for the "state" file: refresh the lazily maintained
 * FROZEN flag for the whole subtree below the css being read, then print
 * "THAWED", "FREEZING" or "FROZEN" followed by a newline.  Always returns 0.
 */
static int freezer_read(struct seq_file *m, void *v)
{
        struct cgroup_subsys_state *css = seq_css(m), *pos;

        mutex_lock(&freezer_mutex);
        rcu_read_lock();

        /* update states bottom-up */
        css_for_each_descendant_post(pos, css) {
                /* skip descendants we cannot get an online reference on */
                if (!css_tryget_online(pos))
                        continue;
                /*
                 * The reference taken above keeps @pos around while the
                 * RCU read lock is dropped for update_if_frozen()
                 * (presumably because it may block - confirm).
                 */
                rcu_read_unlock();

                update_if_frozen(pos);

                rcu_read_lock();
                css_put(pos);
        }

        rcu_read_unlock();
        mutex_unlock(&freezer_mutex);

        /* note: the state is sampled after freezer_mutex has been released */
        seq_puts(m, freezer_state_strs(css_freezer(css)->state));
        seq_putc(m, '\n');
        return 0;
}

static void freeze_cgroup(struct freezer *freezer)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&freezer->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                freeze_task(task);
        css_task_iter_end(&it);
}

static void unfreeze_cgroup(struct freezer *freezer)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&freezer->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                __thaw_task(task);
        css_task_iter_end(&it);
}

/**
 * freezer_apply_state - apply state change to a single cgroup_freezer
 * @freezer: freezer to apply state change to
 * @freeze: whether to freeze or unfreeze
 * @state: CGROUP_FREEZING_* flag to set or clear
 *
 * Set or clear @state on @freezer according to @freeze, and perform
 * freezing or thawing as necessary.  Freezers that are not ONLINE are left
 * untouched.  system_freezing_cnt is incremented when @freezer goes from
 * "no FREEZING flag" to "some FREEZING flag" and decremented on the
 * reverse transition.  The caller must hold freezer_mutex.
 */
static void freezer_apply_state(struct freezer *freezer, bool freeze,
                                unsigned int state)
{
        /* also synchronizes against task migration, see freezer_attach() */
        lockdep_assert_held(&freezer_mutex);

        if (!(freezer->state & CGROUP_FREEZER_ONLINE))
                return;

        if (freeze) {
                /* take the global count only on the first FREEZING flag */
                if (!(freezer->state & CGROUP_FREEZING))
                        atomic_inc(&system_freezing_cnt);
                freezer->state |= state;
                freeze_cgroup(freezer);
        } else {
                bool was_freezing = freezer->state & CGROUP_FREEZING;

                freezer->state &= ~state;

                /*
                 * Thaw only once no FREEZING flag is left; the other flag
                 * (SELF vs PARENT) may still keep this cgroup freezing.
                 */
                if (!(freezer->state & CGROUP_FREEZING)) {
                        if (was_freezing)
                                atomic_dec(&system_freezing_cnt);
                        freezer->state &= ~CGROUP_FROZEN;
                        unfreeze_cgroup(freezer);
                }
        }
}

/**
 * freezer_change_state - change the freezing state of a cgroup_freezer
 * @freezer: freezer of interest
 * @freeze: whether to freeze or thaw
 *
 * Freeze or thaw @freezer according to @freeze.  The operations are
 * recursive - all descendants of @freezer will be affected.  Takes
 * freezer_mutex for the duration of the walk.
 */
static void freezer_change_state(struct freezer *freezer, bool freeze)
{
        struct cgroup_subsys_state *pos;

        /*
         * Update all its descendants in pre-order traversal.  Each
         * descendant will try to inherit its parent's FREEZING state as
         * CGROUP_FREEZING_PARENT.
         */
        mutex_lock(&freezer_mutex);
        rcu_read_lock();
        css_for_each_descendant_pre(pos, &freezer->css) {
                struct freezer *pos_f = css_freezer(pos);
                struct freezer *parent = parent_freezer(pos_f);

                /* skip descendants we cannot get an online reference on */
                if (!css_tryget_online(pos))
                        continue;
                /* @pos is pinned by the reference; drop RCU while applying */
                rcu_read_unlock();

                /*
                 * The pre-order walk visits @freezer itself first: it gets
                 * FREEZING_SELF from @freeze.  Every other node mirrors
                 * whatever FREEZING state its parent ended up with.
                 */
                if (pos_f == freezer)
                        freezer_apply_state(pos_f, freeze,
                                            CGROUP_FREEZING_SELF);
                else
                        freezer_apply_state(pos_f,
                                            parent->state & CGROUP_FREEZING,
                                            CGROUP_FREEZING_PARENT);

                rcu_read_lock();
                css_put(pos);
        }
        rcu_read_unlock();
        mutex_unlock(&freezer_mutex);
}

/*
 * Write handler for the "state" file.  After stripping surrounding
 * whitespace, "THAWED" thaws and "FROZEN" freezes the cgroup (and its
 * descendants); anything else is rejected with -EINVAL.  Returns @nbytes
 * on success.
 */
static ssize_t freezer_write(struct kernfs_open_file *of,
                             char *buf, size_t nbytes, loff_t off)
{
        const char *thawed_str = freezer_state_strs(0);
        const char *frozen_str = freezer_state_strs(CGROUP_FROZEN);
        bool freeze;

        buf = strstrip(buf);

        if (!strcmp(buf, thawed_str))
                freeze = false;
        else if (!strcmp(buf, frozen_str))
                freeze = true;
        else
                return -EINVAL;

        freezer_change_state(css_freezer(of_css(of)), freeze);
        return nbytes;
}

static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
                                      struct cftype *cft)
{
        struct freezer *freezer = css_freezer(css);

        return (bool)(freezer->state & CGROUP_FREEZING_SELF);
}

static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
                                        struct cftype *cft)
{
        struct freezer *freezer = css_freezer(css);

        return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}

/*
 * Control files provided by the freezer controller.  Every entry carries
 * CFTYPE_NOT_ON_ROOT; only "state" has a write handler.
 */
static struct cftype files[] = {
        {
                /* "THAWED" / "FREEZING" / "FROZEN"; accepts THAWED or FROZEN */
                .name = "state",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = freezer_read,
                .write = freezer_write,
        },
        {
                /* 1 when CGROUP_FREEZING_SELF is set */
                .name = "self_freezing",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = freezer_self_freezing_read,
        },
        {
                /* 1 when CGROUP_FREEZING_PARENT is set */
                .name = "parent_freezing",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = freezer_parent_freezing_read,
        },
        { }        /* terminate */
};

/* Freezer controller descriptor: wires the callbacks above into cgroup core. */
struct cgroup_subsys freezer_cgrp_subsys = {
        .css_alloc        = freezer_css_alloc,
        .css_online        = freezer_css_online,
        .css_offline        = freezer_css_offline,
        .css_free        = freezer_css_free,
        .attach                = freezer_attach,
        .fork                = freezer_fork,
        .legacy_cftypes        = files,        /* registered through the legacy cftype slot */
};














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NLS_H
#define _LINUX_NLS_H

#include <linux/init.h>

/* Unicode has changed over the years.  Unicode code points no longer
 * fit into 16 bits; as of Unicode 5 valid code points range from 0
 * to 0x10ffff (17 planes, where each plane holds 65536 code points).
 *
 * The original decision to represent Unicode characters as 16-bit
 * wchar_t values is now outdated.  But plane 0 still includes the
 * most commonly used characters, so we will retain it.  The newer
 * 32-bit unicode_t type can be used when it is necessary to
 * represent the full Unicode character set.
 */

/* Plane-0 Unicode character: 16 bits wide, so limited to 0..MAX_WCHAR_T */
typedef u16 wchar_t;
#define MAX_WCHAR_T        0xffff

/* Arbitrary Unicode character: 32 bits, enough for the full 0..0x10ffff range */
typedef u32 unicode_t;

/*
 * Description of one character set: conversion callbacks to and from
 * 16-bit Unicode plus per-byte case-mapping tables.
 */
struct nls_table {
        const char *charset;        /* presumably the canonical charset name - confirm */
        const char *alias;        /* presumably an alternative name - confirm */
        /*
         * Convert @uni into this charset, writing into @out (bounded by
         * @boundlen).  nls_nullsize() treats a positive return value as the
         * number of bytes produced.
         */
        int (*uni2char) (wchar_t uni, unsigned char *out, int boundlen);
        /*
         * Convert the character at @rawstring (bounded by @boundlen) into
         * *@uni.  NOTE(review): return convention not visible here.
         */
        int (*char2uni) (const unsigned char *rawstring, int boundlen,
                         wchar_t *uni);
        /* byte-indexed tables; a 0 entry means "no mapping", see nls_tolower() */
        const unsigned char *charset2lower;
        const unsigned char *charset2upper;
        struct module *owner;
        struct nls_table *next;
};

/* Maximum number of octets a single character may occupy in any charset */
#define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */

/*
 * Byte order for UTF-16 strings, as passed to utf8s_to_utf16s() and
 * utf16s_to_utf8s().
 */
enum utf16_endian {
        UTF16_HOST_ENDIAN,        /* presumably the CPU's native order - confirm */
        UTF16_LITTLE_ENDIAN,
        UTF16_BIG_ENDIAN
};

/* nls_base.c */
/* Table registration; use register_nls() so THIS_MODULE is filled in as the owner argument. */
extern int __register_nls(struct nls_table *, struct module *);
extern int unregister_nls(struct nls_table *);
/* Table lookup by charset name, and the matching release call. */
extern struct nls_table *load_nls(const char *charset);
extern void unload_nls(struct nls_table *);
extern struct nls_table *load_nls_default(void);
#define register_nls(nls) __register_nls((nls), THIS_MODULE)

/*
 * UTF-8 <-> UTF-32 / UTF-16 conversion helpers.
 * NOTE(review): return conventions are defined by the implementation,
 * which is not part of this header.
 */
extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu);
extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen);
extern int utf8s_to_utf16s(const u8 *s, int len,
                enum utf16_endian endian, wchar_t *pwcs, int maxlen);
extern int utf16s_to_utf8s(const wchar_t *pwcs, int len,
                enum utf16_endian endian, u8 *s, int maxlen);

/*
 * Lower-case @c using @t's charset2lower table.  A zero table entry means
 * there is no mapping, in which case @c is returned unchanged.
 */
static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c)
{
        unsigned char mapped = t->charset2lower[c];

        if (mapped)
                return mapped;
        return c;
}

/*
 * Upper-case @c using @t's charset2upper table.  A zero table entry means
 * there is no mapping, in which case @c is returned unchanged.
 */
static inline unsigned char nls_toupper(struct nls_table *t, unsigned char c)
{
        unsigned char mapped = t->charset2upper[c];

        if (mapped)
                return mapped;
        return c;
}

/*
 * Compare the first @len bytes of @s1 and @s2 ignoring case, folding each
 * byte through nls_tolower().  Returns 0 when all @len bytes match and 1
 * at the first difference; the result carries no ordering information and
 * NUL bytes are not treated specially.
 */
static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1,
                const unsigned char *s2, int len)
{
        for (; len != 0; len--, s1++, s2++) {
                if (nls_tolower(t, *s1) != nls_tolower(t, *s2))
                        return 1;
        }

        return 0;
}

/**
 * nls_nullsize - return length of null character for codepage
 * @codepage: codepage for which to return length of NULL terminator
 *
 * Since we can't guarantee that the null terminator will be a particular
 * length, we have to check against the codepage. If there's a problem
 * determining it, assume a single-byte NULL terminator.
 *
 * Return: the length reported by @codepage->uni2char() for character 0
 * when that is positive, otherwise 1.
 */
static inline int
nls_nullsize(const struct nls_table *codepage)
{
        int charlen;
        /* uni2char() writes through an unsigned char *, so use that type */
        unsigned char tmp[NLS_MAX_CHARSET_SIZE];

        charlen = codepage->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE);

        return charlen > 0 ? charlen : 1;
}

#define MODULE_ALIAS_NLS(name)        MODULE_ALIAS("nls_" __stringify(name))

#endif /* _LINUX_NLS_H */


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This lets us avoid callbacks, which would dramatically reduce performance.
  I know it's not the cleanest way, but in C (unlike C++) that is the price
  of getting both performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/rcupdate.h>

/* One node of a red-black tree; meant to be embedded in the user's own structure. */
struct rb_node {
        /*
         * Parent pointer with the two low bits reserved: rb_parent() masks
         * them off with ~3 (presumably they hold the node colour - confirm).
         */
        unsigned long  __rb_parent_color;
        struct rb_node *rb_right;
        struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
    /* The alignment might seem pointless, but allegedly CRIS needs it */

/* Tree handle: points at the top node, NULL when the tree is empty (see RB_EMPTY_ROOT()). */
struct rb_root {
        struct rb_node *rb_node;
};

/* Parent of node @r: __rb_parent_color with its two low bits cleared */
#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

/* Initializer for an empty tree */
#define RB_ROOT        (struct rb_root) { NULL, }
/* Get the structure of @type that embeds the rb_node @ptr as @member */
#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

/* True when the tree has no nodes; the root pointer is read with READ_ONCE() */
#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
/* A node is marked empty by making __rb_parent_color hold the node's own address */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


/*
 * Out-of-line tree operations; their implementation is not in this header.
 * rb_insert_color() is what rb_insert_color_cached() below calls after the
 * caller has linked the node, rb_erase() is the removal counterpart.
 */
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
/* Variant of the above carrying the _rcu suffix; see the implementation for its guarantees */
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Hook @node into the tree as a leaf below @parent: record @parent in the
 * node, clear both child pointers and store the node into the slot
 * @rb_link.  No rebalancing is done here.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        node->rb_left = NULL;
        node->rb_right = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        *rb_link = node;
}

/*
 * Same as rb_link_node(), except that the node is stored into @rb_link
 * with rcu_assign_pointer() after its fields have been initialised.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        node->rb_left = NULL;
        node->rb_right = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        rcu_assign_pointer(*rb_link, node);
}

/*
 * NULL-tolerant rb_entry(): evaluates @ptr exactly once and yields NULL
 * when it is NULL, otherwise the enclosing structure.
 */
#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:                another 'type *' to use as temporary storage
 * @root:        'rb_root *' of the rbtree.
 * @field:        the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * The successor is stored in @n as part of the loop condition, i.e. before
 * the body runs, and the cursor is advanced from @n afterwards.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/*
 * Leftmost-cached rbtrees.
 *
 * We do not cache the rightmost node based on footprint
 * size vs number of potential users that could benefit
 * from O(1) rb_last(). Just not worth it, users that want
 * this feature can always implement the logic explicitly.
 * Furthermore, users that want to cache both pointers may
 * find it a bit asymmetric, but that's ok.
 */
struct rb_root_cached {
        struct rb_root rb_root;                /* the underlying tree */
        struct rb_node *rb_leftmost;        /* cached leftmost node, see rb_first_cached() */
};

/* Initializer for an empty leftmost-cached tree */
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }

/*
 * Same as rb_first(), but O(1): simply reads the cached leftmost pointer.
 * The expansion is fully parenthesized so it stays a single operand in any
 * surrounding expression.
 */
#define rb_first_cached(root) ((root)->rb_leftmost)

/*
 * rb_insert_color() for leftmost-cached trees.  The caller says via
 * @leftmost whether @node is the new leftmost node; if so the cache is
 * updated before rb_insert_color() is run on the underlying tree.
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *tree = &root->rb_root;

        if (leftmost)
                root->rb_leftmost = node;

        rb_insert_color(node, tree);
}

/*
 * rb_erase() for leftmost-cached trees.  When @node is the cached leftmost
 * node, the cache moves on to rb_next(@node) before the node is erased.
 */
static inline void rb_erase_cached(struct rb_node *node,
                                   struct rb_root_cached *root)
{
        struct rb_root *tree = &root->rb_root;

        if (root->rb_leftmost == node)
                root->rb_leftmost = rb_next(node);

        rb_erase(node, tree);
}

/*
 * rb_replace_node() for leftmost-cached trees.  When @victim is the cached
 * leftmost node, the cache is pointed at @new before the replacement.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        struct rb_root *tree = &root->rb_root;

        if (root->rb_leftmost == victim)
                root->rb_leftmost = new;

        rb_replace_node(victim, new, tree);
}

#endif        /* _LINUX_RBTREE_H */




























    1 



















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MATH64_H
#define _LINUX_MATH64_H

#include <linux/types.h>
#include <vdso/math64.h>
#include <asm/div64.h>

#if BITS_PER_LONG == 64

/* With BITS_PER_LONG == 64, long-sized division maps to the full 64-bit helpers. */
#define div64_long(x, y) div64_s64((x), (y))
#define div64_ul(x, y)   div64_u64((x), (y))

/**
 * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder
 * @dividend: unsigned 64bit dividend
 * @divisor: unsigned 32bit divisor
 * @remainder: pointer to unsigned 32bit remainder
 *
 * Return: sets ``*remainder``, then returns dividend / divisor
 *
 * 32bit archs commonly provide their own optimized version of this 64bit
 * divide; here plain C operators are used.
 */
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
        u64 quotient = dividend / divisor;

        *remainder = dividend % divisor;
        return quotient;
}

/**
 * div_s64_rem - signed 64bit divide with 32bit divisor with remainder
 * @dividend: signed 64bit dividend
 * @divisor: signed 32bit divisor
 * @remainder: pointer to signed 32bit remainder
 *
 * Return: sets ``*remainder``, then returns dividend / divisor
 */
static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
{
        s64 quotient = dividend / divisor;

        *remainder = dividend % divisor;
        return quotient;
}

/**
 * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
 * @dividend: unsigned 64bit dividend
 * @divisor: unsigned 64bit divisor
 * @remainder: pointer to unsigned 64bit remainder
 *
 * Return: sets ``*remainder``, then returns dividend / divisor
 */
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
        u64 quotient = dividend / divisor;

        *remainder = dividend % divisor;
        return quotient;
}

/**
 * div64_u64 - unsigned 64bit divide with 64bit divisor
 * @dividend: unsigned 64bit dividend
 * @divisor: unsigned 64bit divisor
 *
 * Return: dividend / divisor
 */
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
        u64 quotient = dividend / divisor;

        return quotient;
}

/**
 * div64_s64 - signed 64bit divide with 64bit divisor
 * @dividend: signed 64bit dividend
 * @divisor: signed 64bit divisor
 *
 * Return: dividend / divisor
 */
static inline s64 div64_s64(s64 dividend, s64 divisor)
{
        s64 quotient = dividend / divisor;

        return quotient;
}

#elif BITS_PER_LONG == 32

/* With BITS_PER_LONG == 32, long-sized division maps to the 32-bit-divisor helpers. */
#define div64_long(x, y) div_s64((x), (y))
#define div64_ul(x, y)   div_u64((x), (y))

/*
 * Generic 32-bit div_u64_rem(), compiled only when no div_u64_rem macro
 * has been defined beforehand.
 */
#ifndef div_u64_rem
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
        /* do_div() yields the remainder and leaves the quotient in @dividend */
        *remainder = do_div(dividend, divisor);
        return dividend;
}
#endif

/*
 * On 32-bit the remaining 64-bit division helpers are only declared here
 * (their definitions live outside this header).  Each declaration is
 * skipped when a macro of the same name is already defined.
 */
#ifndef div_s64_rem
extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
#endif

#ifndef div64_u64_rem
extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder);
#endif

#ifndef div64_u64
extern u64 div64_u64(u64 dividend, u64 divisor);
#endif

#ifndef div64_s64
extern s64 div64_s64(s64 dividend, s64 divisor);
#endif

#endif /* BITS_PER_LONG */

/**
 * div_u64 - unsigned 64bit divide with 32bit divisor
 * @dividend: unsigned 64bit dividend
 * @divisor: unsigned 32bit divisor
 *
 * This is the most common 64bit divide and should be used if possible,
 * as many 32bit archs can optimize this variant better than a full 64bit
 * divide.
 *
 * Return: dividend / divisor; the remainder is computed and discarded.
 */
#ifndef div_u64
static inline u64 div_u64(u64 dividend, u32 divisor)
{
        u32 unused_rem;

        return div_u64_rem(dividend, divisor, &unused_rem);
}
#endif

/**
 * div_s64 - signed 64bit divide with 32bit divisor
 * @dividend: signed 64bit dividend
 * @divisor: signed 32bit divisor
 *
 * Return: dividend / divisor; the remainder is computed and discarded.
 */
#ifndef div_s64
static inline s64 div_s64(s64 dividend, s32 divisor)
{
        s32 unused_rem;

        return div_s64_rem(dividend, divisor, &unused_rem);
}
#endif

/*
 * Declared here, defined elsewhere.  NOTE(review): judging by the name this
 * divides iteratively and stores the remainder in *remainder - confirm
 * against the implementation.
 */
u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder);

#ifndef mul_u32_u32
/*
 * Full 32x32 -> 64 bit multiply: one operand is widened to u64 first so
 * the product cannot be truncated.
 *
 * Many a GCC version messes this up and generates a 64x64 mult :-(
 */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
        u64 wide_a = a;

        return wide_a * b;
}
#endif

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u32_shr
/*
 * Compute (a * mul) >> shift using a 128-bit intermediate product; the
 * result is truncated to 64 bits.
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
        unsigned __int128 product = (unsigned __int128)a * mul;

        return (u64)(product >> shift);
}
#endif /* mul_u64_u32_shr */

#ifndef mul_u64_u64_shr
/*
 * Compute (a * mul) >> shift using a 128-bit intermediate product; the
 * result is truncated to 64 bits.
 */
static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
        unsigned __int128 product = (unsigned __int128)a * mul;

        return (u64)(product >> shift);
}
#endif /* mul_u64_u64_shr */

#else

#ifndef mul_u64_u32_shr
/*
 * Compute (a * mul) >> shift without a 128-bit type, truncated to 64 bits.
 *
 * The (up to 96-bit) product is assembled from two 32x32 multiplies:
 *   a * mul == hi * 2^32 + lo,  hi = (a >> 32) * mul,  lo = (u32)a * mul
 *
 * For shift <= 32 the two halves are shifted independently and added,
 * which is exact because hi * 2^32 has no bits below bit 32.  For larger
 * shifts the low 32 bits of the product can never reach the result, so the
 * product is first reduced to its bits 32..95 (hi + (lo >> 32), which
 * cannot overflow 64 bits) and then shifted by the remainder.  A shift of
 * 96 or more moves every product bit out, giving 0.
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
        u32 ah, al;
        u64 lo, hi;

        al = a;
        ah = a >> 32;

        lo = mul_u32_u32(al, mul);
        /* skip the second multiply when the high word is zero */
        hi = ah ? mul_u32_u32(ah, mul) : 0;

        if (shift <= 32)
                return (lo >> shift) + (hi << (32 - shift));

        if (shift >= 96)
                return 0;

        return (hi + (lo >> 32)) >> (shift - 32);
}
#endif /* mul_u64_u32_shr */

#ifndef mul_u64_u64_shr
/*
 * Compute (a * b) >> shift without a 128-bit type, truncated to 64 bits.
 * The 128-bit product is built from four 32x32 partial products and kept
 * in two 64-bit halves (rl = low 64 bits, rh = high 64 bits).
 */
static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
{
        /* lets a u64 be accessed as two u32 halves independent of endianness */
        union {
                u64 ll;
                struct {
#ifdef __BIG_ENDIAN
                        u32 high, low;
#else
                        u32 low, high;
#endif
                } l;
        } rl, rm, rn, rh, a0, b0;
        u64 c;

        a0.ll = a;
        b0.ll = b;

        /* partial products: low*low, low*high, high*low, high*high */
        rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
        rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
        rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
        rh.ll = mul_u32_u32(a0.l.high, b0.l.high);

        /*
         * Each of these lines computes a 64-bit intermediate result into "c",
         * starting at bits 32-95.  The low 32-bits go into the result of the
         * multiplication, the high 32-bits are carried into the next step.
         */
        rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
        rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
        rh.l.high = (c >> 32) + rh.l.high;

        /*
         * The 128-bit result of the multiplication is in rl.ll and rh.ll,
         * shift it right and throw away the high part of the result.
         */
        /* shift == 0 is special-cased: "64 - shift" below would be a 64-bit shift */
        if (shift == 0)
                return rl.ll;
        if (shift < 64)
                return (rl.ll >> shift) | (rh.ll << (64 - shift));
        /* shift >= 64: only the high half contributes; the count is taken modulo 64 */
        return rh.ll >> (shift & 63);
}
#endif /* mul_u64_u64_shr */

#endif

#ifndef mul_u64_u32_div
/*
 * mul_u64_u32_div - compute a * mul / divisor through a 96-bit intermediate
 *
 * The product is formed from two 32x32->64 partial products and then divided
 * 64 bits at a time, long-division style.  Only the low 64 bits of the
 * quotient are returned; whatever is left in rh.l.high after the first
 * division is dropped.
 * NOTE(review): do_div() is defined elsewhere; as used here it leaves the
 * quotient in its first argument and evaluates to the remainder -- confirm.
 */
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
{
        /* View a 64-bit value as its two 32-bit halves, endian-aware. */
        union {
                u64 ll;
                struct {
#ifdef __BIG_ENDIAN
                        u32 high, low;
#else
                        u32 low, high;
#endif
                } l;
        } u, rl, rh;

        u.ll = a;
        rl.ll = mul_u32_u32(u.l.low, mul);
        /* Cannot overflow: (2^32 - 1)^2 + (2^32 - 1) < 2^64. */
        rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high;

        /* Bits 32-63 of the result will be in rh.l.low. */
        /* The remainder becomes the upper half of the next dividend. */
        rl.l.high = do_div(rh.ll, divisor);

        /* Bits 0-31 of the result will be in rl.l.low. */
        do_div(rl.ll, divisor);

        /* Stitch the two quotient halves together. */
        rl.l.high = rh.l.low;
        return rl.ll;
}
#endif /* mul_u64_u32_div */

u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);

/**
 * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
 * @ll: unsigned 64bit dividend
 * @d: unsigned 64bit divisor
 *
 * Adds (divisor - 1) to the dividend and hands the sum to div64_u64().
 * Both arguments are evaluated exactly once.  The addition is not checked
 * for wrap-around, so dividends within (divisor - 1) of the u64 maximum
 * presumably yield a wrapped result -- confirm before relying on them.
 *
 * Return: dividend / divisor rounded up
 */
#define DIV64_U64_ROUND_UP(ll, d)        \
        ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })

/**
 * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer
 * @dividend: unsigned 64bit dividend
 * @divisor: unsigned 64bit divisor
 *
 * Divide unsigned 64bit dividend by unsigned 64bit divisor
 * and round to closest integer.
 *
 * Half the divisor is added to the dividend before dividing, so exact ties
 * round up (given a truncating div64_u64()).  Both arguments are evaluated
 * exactly once.  The addition is not checked for wrap-around.
 *
 * Return: dividend / divisor rounded to nearest integer
 */
#define DIV64_U64_ROUND_CLOSEST(dividend, divisor)        \
        ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); })

/**
 * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer
 * @dividend: signed 64bit dividend
 * @divisor: signed 32bit divisor
 *
 * Divide signed 64bit dividend by signed 32bit divisor
 * and round to closest integer.
 *
 * Half the divisor is added to the dividend when both have the same sign and
 * subtracted otherwise, so the bias always points away from zero; exact ties
 * therefore round away from zero (given div_s64() truncates toward zero).
 * Both arguments are evaluated exactly once.  The bias addition is not
 * checked for signed overflow.
 *
 * Return: dividend / divisor rounded to nearest integer
 */
#define DIV_S64_ROUND_CLOSEST(dividend, divisor)(        \
{                                                        \
        s64 __x = (dividend);                                \
        s32 __d = (divisor);                                \
        ((__x > 0) == (__d > 0)) ?                        \
                div_s64((__x + (__d / 2)), __d) :        \
                div_s64((__x - (__d / 2)), __d);        \
}                                                        \
)
#endif /* _LINUX_MATH64_H */

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_KPROBES_H
#define _ASM_X86_KPROBES_H
/*
 *  Kernel Probes (KProbes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * See arch/x86/kernel/kprobes.c for x86 kprobes history.
 */

#include <asm-generic/kprobes.h>

#ifdef CONFIG_KPROBES
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <asm/text-patching.h>
#include <asm/insn.h>

#define  __ARCH_WANT_KPROBES_INSN_SLOT

struct pt_regs;
struct kprobe;

typedef u8 kprobe_opcode_t;

/* Upper bound used by MIN_STACK_SIZE() below. */
#define MAX_STACK_SIZE 64
/* Distance from ADDR up to current_top_of_stack(), as an unsigned long. */
#define CUR_STACK_SIZE(ADDR) \
        (current_top_of_stack() - (unsigned long)(ADDR))
/*
 * The smaller of MAX_STACK_SIZE and CUR_STACK_SIZE(ADDR).
 * Note that ADDR is expanded (and so evaluated) more than once.
 */
#define MIN_STACK_SIZE(ADDR)                                \
        (MAX_STACK_SIZE < CUR_STACK_SIZE(ADDR) ?        \
         MAX_STACK_SIZE : CUR_STACK_SIZE(ADDR))

/* No-op in this header: expands to an empty statement, p is not evaluated. */
#define flush_insn_slot(p)        do { } while (0)

/*
 * optinsn template addresses
 *
 * Symbols marking positions inside the optimized-probe template; they are
 * defined elsewhere and only their addresses are used in this header.
 */
extern __visible kprobe_opcode_t optprobe_template_entry[];
extern __visible kprobe_opcode_t optprobe_template_clac[];
extern __visible kprobe_opcode_t optprobe_template_val[];
extern __visible kprobe_opcode_t optprobe_template_call[];
extern __visible kprobe_opcode_t optprobe_template_end[];
/* One maximum-size instruction plus a 32-bit displacement. */
#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE)
/*
 * Template length (end - entry) plus MAX_OPTIMIZED_LENGTH plus one JMP32.
 * The value is derived from symbol addresses, so it is not a plain
 * integer literal.
 */
#define MAX_OPTINSN_SIZE                                 \
        (((unsigned long)optprobe_template_end -        \
          (unsigned long)optprobe_template_entry) +        \
         MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE)

extern const int kretprobe_blacklist_size;

void arch_remove_kprobe(struct kprobe *p);
asmlinkage void kretprobe_trampoline(void);

extern void arch_kprobe_override_function(struct pt_regs *regs);

/* Architecture-specific copy of the original instruction. */
struct arch_specific_insn {
        /* copy of the original instruction */
        kprobe_opcode_t *insn;
        /*
         * boostable = 0: This instruction type is not boostable.
         * boostable = 1: This instruction has been boosted: we have
         * added a relative jump after the instruction copy in insn,
         * so no single-step and fixup are needed (unless there's
         * a post_handler).
         */
        unsigned boostable:1;
        unsigned char size;        /* The size of insn */
        /*
         * Per-instruction-kind details; only one member is meaningful at a
         * time.  NOTE(review): presumably consumed by emulate_op -- confirm
         * against the code that fills this structure in.
         */
        union {
                unsigned char opcode;
                struct {
                        unsigned char type;
                } jcc;
                struct {
                        unsigned char type;
                        unsigned char asize;
                } loop;
                struct {
                        unsigned char reg;
                } indirect;
        };
        s32 rel32;        /* relative offset must be s32, s16, or s8 */
        /* Optional callback taking the probe and the register state. */
        void (*emulate_op)(struct kprobe *p, struct pt_regs *regs);
        /* Number of bytes of text poked */
        int tp_len;
};

/*
 * Per-probe state for an optimized probe.  A non-zero size is what
 * arch_prepared_optinsn() reports as "prepared".
 */
struct arch_optimized_insn {
        /* copy of the original instructions */
        kprobe_opcode_t copied_insn[DISP32_SIZE];
        /* detour code buffer */
        kprobe_opcode_t *insn;
        /* the size of instructions copied to detour code buffer */
        size_t size;
};

/* Return true (!0) if optinsn is prepared for optimization. */
static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
{
        return optinsn->size;
}

/*
 * Saved probe state, embedded in struct kprobe_ctlblk.
 * NOTE(review): presumably a snapshot of the probe being handled when
 * another probe is hit -- confirm against the users of this structure.
 */
struct prev_kprobe {
        struct kprobe *kp;              /* the probe this state belongs to */
        unsigned long status;           /* counterpart of kprobe_ctlblk.kprobe_status */
        unsigned long old_flags;        /* counterpart of kprobe_ctlblk.kprobe_old_flags */
        unsigned long saved_flags;      /* counterpart of kprobe_ctlblk.kprobe_saved_flags */
};

/*
 * per-cpu kprobe control block
 *
 * NOTE(review): the two *_flags fields presumably hold CPU flags values
 * saved around probe handling -- confirm against the x86 kprobes code.
 */
struct kprobe_ctlblk {
        unsigned long kprobe_status;
        unsigned long kprobe_old_flags;
        unsigned long kprobe_saved_flags;
        /* state of an earlier probe, kept alongside the current one */
        struct prev_kprobe prev_kprobe;
};

extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
                                    unsigned long val, void *data);
extern int kprobe_int3_handler(struct pt_regs *regs);

#else

/* Stub for builds without CONFIG_KPROBES: reports 0 and ignores @regs. */
static inline int kprobe_debug_handler(struct pt_regs *regs)
{
        return 0;
}

#endif /* CONFIG_KPROBES */
#endif /* _ASM_X86_KPROBES_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * IEEE 802.11 defines
 *
 * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
 * <jkmaline@cc.hut.fi>
 * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
 * Copyright (c) 2005, Devicescape Software, Inc.
 * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
 * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
 * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
 * Copyright (c) 2018 - 2020 Intel Corporation
 */

#ifndef LINUX_IEEE80211_H
#define LINUX_IEEE80211_H

#include <linux/types.h>
#include <linux/if_ether.h>
#include <linux/etherdevice.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>

/*
 * DS bit usage
 *
 * TA = transmitter address
 * RA = receiver address
 * DA = destination address
 * SA = source address
 *
 * ToDS    FromDS  A1(RA)  A2(TA)  A3      A4      Use
 * -----------------------------------------------------------------
 *  0       0       DA      SA      BSSID   -       IBSS/DLS
 *  0       1       DA      BSSID   SA      -       AP -> STA
 *  1       0       BSSID   SA      DA      -       AP <- STA
 *  1       1       RA      TA      DA      SA      unspecified (WDS)
 */

#define FCS_LEN 4

/*
 * Bit masks within the 16-bit frame control field.  VERS, FTYPE and STYPE
 * are multi-bit fields; the remaining masks are single-bit flags.
 */
#define IEEE80211_FCTL_VERS                0x0003
#define IEEE80211_FCTL_FTYPE                0x000c
#define IEEE80211_FCTL_STYPE                0x00f0
#define IEEE80211_FCTL_TODS                0x0100
#define IEEE80211_FCTL_FROMDS                0x0200
#define IEEE80211_FCTL_MOREFRAGS        0x0400
#define IEEE80211_FCTL_RETRY                0x0800
#define IEEE80211_FCTL_PM                0x1000
#define IEEE80211_FCTL_MOREDATA                0x2000
#define IEEE80211_FCTL_PROTECTED        0x4000
#define IEEE80211_FCTL_ORDER                0x8000
/* Covers the same four bits as TODS | FROMDS | MOREFRAGS | RETRY. */
#define IEEE80211_FCTL_CTL_EXT                0x0f00

/* Sequence control masks: low 4 bits fragment, upper 12 bits sequence. */
#define IEEE80211_SCTL_FRAG                0x000F
#define IEEE80211_SCTL_SEQ                0xFFF0

/* Frame types: values of the bits selected by IEEE80211_FCTL_FTYPE. */
#define IEEE80211_FTYPE_MGMT                0x0000
#define IEEE80211_FTYPE_CTL                0x0004
#define IEEE80211_FTYPE_DATA                0x0008
#define IEEE80211_FTYPE_EXT                0x000c

/*
 * Frame subtype values, already positioned under IEEE80211_FCTL_STYPE.
 * The same numeric value is reused across frame types, so a subtype is
 * only meaningful together with its IEEE80211_FTYPE_* value.
 */

/* management (used with IEEE80211_FTYPE_MGMT) */
#define IEEE80211_STYPE_ASSOC_REQ        0x0000
#define IEEE80211_STYPE_ASSOC_RESP        0x0010
#define IEEE80211_STYPE_REASSOC_REQ        0x0020
#define IEEE80211_STYPE_REASSOC_RESP        0x0030
#define IEEE80211_STYPE_PROBE_REQ        0x0040
#define IEEE80211_STYPE_PROBE_RESP        0x0050
#define IEEE80211_STYPE_BEACON                0x0080
#define IEEE80211_STYPE_ATIM                0x0090
#define IEEE80211_STYPE_DISASSOC        0x00A0
#define IEEE80211_STYPE_AUTH                0x00B0
#define IEEE80211_STYPE_DEAUTH                0x00C0
#define IEEE80211_STYPE_ACTION                0x00D0

/* control (used with IEEE80211_FTYPE_CTL) */
#define IEEE80211_STYPE_CTL_EXT                0x0060
#define IEEE80211_STYPE_BACK_REQ        0x0080
#define IEEE80211_STYPE_BACK                0x0090
#define IEEE80211_STYPE_PSPOLL                0x00A0
#define IEEE80211_STYPE_RTS                0x00B0
#define IEEE80211_STYPE_CTS                0x00C0
#define IEEE80211_STYPE_ACK                0x00D0
#define IEEE80211_STYPE_CFEND                0x00E0
#define IEEE80211_STYPE_CFENDACK        0x00F0

/*
 * data (used with IEEE80211_FTYPE_DATA); every QoS subtype has bit 0x0080
 * (IEEE80211_STYPE_QOS_DATA) set
 */
#define IEEE80211_STYPE_DATA                        0x0000
#define IEEE80211_STYPE_DATA_CFACK                0x0010
#define IEEE80211_STYPE_DATA_CFPOLL                0x0020
#define IEEE80211_STYPE_DATA_CFACKPOLL                0x0030
#define IEEE80211_STYPE_NULLFUNC                0x0040
#define IEEE80211_STYPE_CFACK                        0x0050
#define IEEE80211_STYPE_CFPOLL                        0x0060
#define IEEE80211_STYPE_CFACKPOLL                0x0070
#define IEEE80211_STYPE_QOS_DATA                0x0080
#define IEEE80211_STYPE_QOS_DATA_CFACK                0x0090
#define IEEE80211_STYPE_QOS_DATA_CFPOLL                0x00A0
#define IEEE80211_STYPE_QOS_DATA_CFACKPOLL        0x00B0
#define IEEE80211_STYPE_QOS_NULLFUNC                0x00C0
#define IEEE80211_STYPE_QOS_CFACK                0x00D0
#define IEEE80211_STYPE_QOS_CFPOLL                0x00E0
#define IEEE80211_STYPE_QOS_CFACKPOLL                0x00F0

/* extension, added by 802.11ad (used with IEEE80211_FTYPE_EXT) */
#define IEEE80211_STYPE_DMG_BEACON                0x0000
#define IEEE80211_STYPE_S1G_BEACON                0x0010

/*
 * bits unique to S1G beacon: a frame control bit, tested by
 * ieee80211_next_tbtt_present()
 */
#define IEEE80211_S1G_BCN_NEXT_TBTT        0x100

/* see 802.11ah-2016 9.9 NDP CMAC frames */
#define IEEE80211_S1G_1MHZ_NDP_BITS        25
#define IEEE80211_S1G_1MHZ_NDP_BYTES        4
#define IEEE80211_S1G_2MHZ_NDP_BITS        37
#define IEEE80211_S1G_2MHZ_NDP_BYTES        5

/*
 * NDP CMAC frame type values; note that CTS and CF_END share value 0.
 * NOTE(review): presumably these are the values carried in the
 * IEEE80211_NDP_FTYPE field - confirm against the standard.
 */
#define IEEE80211_NDP_FTYPE_CTS                        0
#define IEEE80211_NDP_FTYPE_CF_END                0
#define IEEE80211_NDP_FTYPE_PS_POLL                1
#define IEEE80211_NDP_FTYPE_ACK                        2
#define IEEE80211_NDP_FTYPE_PS_POLL_ACK                3
#define IEEE80211_NDP_FTYPE_BA                        4
#define IEEE80211_NDP_FTYPE_BF_REPORT_POLL        5
#define IEEE80211_NDP_FTYPE_PAGING                6
#define IEEE80211_NDP_FTYPE_PREQ                7

/*
 * SM64 - place a value into a field of a 64-bit word
 * @f: name of the field mask macro; a companion macro <f>_S giving the
 *     field's bit offset must also be defined
 * @v: value to store in the field (any integer expression)
 *
 * @v is evaluated as a whole, widened to u64, shifted left by <f>_S and
 * masked with @f, so bits that do not fit in the field are discarded.
 * The argument is parenthesised so that the cast applies to the complete
 * expression rather than only to its first operand.
 */
#define SM64(f, v)        ((((u64)(v)) << f##_S) & (f))

/*
 * NDP CMAC frame fields
 *
 * Each field is described by a mask and a companion <mask>_S macro holding
 * the bit offset of the field's least significant bit, as required by
 * SM64().
 */
#define IEEE80211_NDP_FTYPE                    0x0000000000000007
#define IEEE80211_NDP_FTYPE_S                  0x0000000000000000

/* 1M Probe Request 11ah 9.9.3.1.1 */
#define IEEE80211_NDP_1M_PREQ_ANO      0x0000000000000008
#define IEEE80211_NDP_1M_PREQ_ANO_S                     3
#define IEEE80211_NDP_1M_PREQ_CSSID    0x00000000000FFFF0
#define IEEE80211_NDP_1M_PREQ_CSSID_S                   4
#define IEEE80211_NDP_1M_PREQ_RTYPE    0x0000000000100000
#define IEEE80211_NDP_1M_PREQ_RTYPE_S                  20
/* reserved bits 21..24; the shift was previously missing (mask was defined twice) */
#define IEEE80211_NDP_1M_PREQ_RSV      0x0000000001E00000
#define IEEE80211_NDP_1M_PREQ_RSV_S                    21
/* 2M Probe Request 11ah 9.9.3.1.2 */
#define IEEE80211_NDP_2M_PREQ_ANO      0x0000000000000008
#define IEEE80211_NDP_2M_PREQ_ANO_S                     3
#define IEEE80211_NDP_2M_PREQ_CSSID    0x0000000FFFFFFFF0
#define IEEE80211_NDP_2M_PREQ_CSSID_S                   4
#define IEEE80211_NDP_2M_PREQ_RTYPE    0x0000001000000000
#define IEEE80211_NDP_2M_PREQ_RTYPE_S                  36

#define IEEE80211_ANO_NETTYPE_WILD              15

/*
 * bits unique to S1G beacon
 * NOTE(review): this repeats, token for token, the definition of
 * IEEE80211_S1G_BCN_NEXT_TBTT that appears earlier in this file; the
 * redefinition is benign but one of the two could be dropped.
 */
#define IEEE80211_S1G_BCN_NEXT_TBTT    0x100

/*
 * control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT
 * NOTE(review): these values occupy bits 12..15, whereas the
 * IEEE80211_FCTL_CTL_EXT mask is 0x0f00 (bits 8..11) - confirm which
 * position is intended before combining them.
 */
#define IEEE80211_CTL_EXT_POLL                0x2000
#define IEEE80211_CTL_EXT_SPR                0x3000
#define IEEE80211_CTL_EXT_GRANT        0x4000
#define IEEE80211_CTL_EXT_DMG_CTS        0x5000
#define IEEE80211_CTL_EXT_DMG_DTS        0x6000
#define IEEE80211_CTL_EXT_SSW                0x8000
#define IEEE80211_CTL_EXT_SSW_FBACK        0x9000
#define IEEE80211_CTL_EXT_SSW_ACK        0xa000


/*
 * Sequence number space, derived from the sequence control mask:
 * SN_MASK = 0xFFF0 >> 4 = 0x0FFF, MAX_SN = 4095, SN_MODULO = 4096.
 */
#define IEEE80211_SN_MASK                ((IEEE80211_SCTL_SEQ) >> 4)
#define IEEE80211_MAX_SN                IEEE80211_SN_MASK
#define IEEE80211_SN_MODULO                (IEEE80211_MAX_SN + 1)


/*
 * PV1 Layout 11ah 9.8.3.1
 *
 * Frame control bit masks for the PV1 layout.  Compared with the
 * IEEE80211_FCTL_* masks, the type field is 3 bits wide (0x001c) and the
 * subtype field 3 bits wide (0x00e0).
 * NOTE(review): IEEE80211_PV1_FCTL_CTL_EXT (0x0f00) overlaps the
 * TODS/MOREFRAGS/PM/MOREDATA bits below - confirm this is intended.
 */
#define IEEE80211_PV1_FCTL_VERS                0x0003
#define IEEE80211_PV1_FCTL_FTYPE        0x001c
#define IEEE80211_PV1_FCTL_STYPE        0x00e0
#define IEEE80211_PV1_FCTL_TODS                0x0100
#define IEEE80211_PV1_FCTL_MOREFRAGS        0x0200
#define IEEE80211_PV1_FCTL_PM                0x0400
#define IEEE80211_PV1_FCTL_MOREDATA        0x0800
#define IEEE80211_PV1_FCTL_PROTECTED        0x1000
#define IEEE80211_PV1_FCTL_END_SP       0x2000
#define IEEE80211_PV1_FCTL_RELAYED      0x4000
#define IEEE80211_PV1_FCTL_ACK_POLICY   0x8000
#define IEEE80211_PV1_FCTL_CTL_EXT        0x0f00

/*
 * Modular "less than" for sequence numbers.
 *
 * Returns true when (sn1 - sn2), reduced to the sequence number space via
 * IEEE80211_SN_MASK, is strictly greater than half of IEEE80211_SN_MODULO.
 */
static inline bool ieee80211_sn_less(u16 sn1, u16 sn2)
{
        const u16 distance = (sn1 - sn2) & IEEE80211_SN_MASK;
        const u16 half_space = IEEE80211_SN_MODULO >> 1;

        return distance > half_space;
}

/* Add two sequence numbers, wrapping the sum with IEEE80211_SN_MASK. */
static inline u16 ieee80211_sn_add(u16 sn1, u16 sn2)
{
        const unsigned int total = sn1 + sn2;

        return total & IEEE80211_SN_MASK;
}

/* Advance a sequence number by one, wrapping with IEEE80211_SN_MASK. */
static inline u16 ieee80211_sn_inc(u16 sn)
{
        const u16 step = 1;

        return ieee80211_sn_add(sn, step);
}

/* Subtract sn2 from sn1, wrapping the difference with IEEE80211_SN_MASK. */
static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
{
        const int diff = sn1 - sn2;

        return diff & IEEE80211_SN_MASK;
}

/*
 * Convert between a sequence control value and a bare sequence number:
 * SEQ_TO_SN masks with IEEE80211_SCTL_SEQ and shifts right by 4;
 * SN_TO_SEQ shifts left by 4 and masks with IEEE80211_SCTL_SEQ.
 */
#define IEEE80211_SEQ_TO_SN(seq)        (((seq) & IEEE80211_SCTL_SEQ) >> 4)
#define IEEE80211_SN_TO_SEQ(ssn)        (((ssn) << 4) & IEEE80211_SCTL_SEQ)

/* miscellaneous IEEE 802.11 constants */
#define IEEE80211_MAX_FRAG_THRESHOLD        2352
#define IEEE80211_MAX_RTS_THRESHOLD        2353
#define IEEE80211_MAX_AID                2007
#define IEEE80211_MAX_AID_S1G                8191
#define IEEE80211_MAX_TIM_LEN                251
#define IEEE80211_MAX_MESH_PEERINGS        63
/*
 * Maximum size for the MA-UNITDATA primitive, 802.11 standard section
 * 6.2.1.1.2.
 *
 * 802.11e clarifies the figure in section 7.1.2. The frame body is
 * up to 2304 octets long (maximum MSDU size) plus any crypt overhead.
 */
#define IEEE80211_MAX_DATA_LEN                2304
/* 802.11ad extends maximum MSDU size for DMG (freq > 40 GHz) networks
 * to 7920 bytes, see 8.2.3 General frame format
 */
#define IEEE80211_MAX_DATA_LEN_DMG        7920
/*
 * 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS
 * (30 + 2 + 2304 + 12 + 4 = 2352)
 */
#define IEEE80211_MAX_FRAME_LEN                2352

/* Maximal size of an A-MSDU that can be transported in a HT BA session */
#define IEEE80211_MAX_MPDU_LEN_HT_BA                4095

/* Maximal size of an A-MSDU */
#define IEEE80211_MAX_MPDU_LEN_HT_3839                3839
#define IEEE80211_MAX_MPDU_LEN_HT_7935                7935

#define IEEE80211_MAX_MPDU_LEN_VHT_3895                3895
#define IEEE80211_MAX_MPDU_LEN_VHT_7991                7991
#define IEEE80211_MAX_MPDU_LEN_VHT_11454        11454

/* maximum SSID and mesh ID lengths */
#define IEEE80211_MAX_SSID_LEN                32

#define IEEE80211_MAX_MESH_ID_LEN        32

#define IEEE80211_FIRST_TSPEC_TSID        8
#define IEEE80211_NUM_TIDS                16

/* number of user priorities 802.11 uses */
#define IEEE80211_NUM_UPS                8
/* number of ACs */
#define IEEE80211_NUM_ACS                4

/* length in bytes of the QoS control field (cf. the __le16 qos_ctrl member) */
#define IEEE80211_QOS_CTL_LEN                2
/* 1d tag mask */
#define IEEE80211_QOS_CTL_TAG1D_MASK                0x0007
/* TID mask */
#define IEEE80211_QOS_CTL_TID_MASK                0x000f
/* EOSP */
#define IEEE80211_QOS_CTL_EOSP                        0x0010
/*
 * ACK policy: values are already positioned under ACK_POLICY_MASK; note
 * that the BLOCKACK value equals the mask itself (both bits set)
 */
#define IEEE80211_QOS_CTL_ACK_POLICY_NORMAL        0x0000
#define IEEE80211_QOS_CTL_ACK_POLICY_NOACK        0x0020
#define IEEE80211_QOS_CTL_ACK_POLICY_NO_EXPL        0x0040
#define IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK        0x0060
#define IEEE80211_QOS_CTL_ACK_POLICY_MASK        0x0060
/* A-MSDU 802.11n */
#define IEEE80211_QOS_CTL_A_MSDU_PRESENT        0x0080
/* Mesh Control 802.11s */
#define IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT  0x0100

/* Mesh Power Save Level */
#define IEEE80211_QOS_CTL_MESH_PS_LEVEL                0x0200
/* Mesh Receiver Service Period Initiated */
#define IEEE80211_QOS_CTL_RSPI                        0x0400

/* U-APSD queue for WMM IEs sent by AP */
#define IEEE80211_WMM_IE_AP_QOSINFO_UAPSD        (1<<7)
#define IEEE80211_WMM_IE_AP_QOSINFO_PARAM_SET_CNT_MASK        0x0f

/* U-APSD queues for WMM IEs sent by STA (one bit per AC, bits 0..3) */
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VO        (1<<0)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VI        (1<<1)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BK        (1<<2)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BE        (1<<3)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK        0x0f

/*
 * U-APSD max SP length for WMM IEs sent by STA.
 * The SP_* values and SP_MASK are unshifted; SP_SHIFT gives the bit
 * offset (presumably within the same QoS info byte - confirm at callers).
 */
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL        0x00
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_2        0x01
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_4        0x02
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_6        0x03
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK        0x03
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT        5

/* length in bytes of the HT control field */
#define IEEE80211_HT_CTL_LEN                4

/*
 * 802.11 MAC header with all four address fields.
 *
 * Which role (RA/TA/DA/SA/BSSID) each addrN carries depends on the
 * ToDS/FromDS bits; see the "DS bit usage" table near the top of this file.
 * Multi-byte fields are little-endian.  The struct is packed (no padding
 * between members) and aligned to 2 bytes.
 */
struct ieee80211_hdr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
        u8 addr4[ETH_ALEN];
} __packed __aligned(2);

/*
 * 802.11 MAC header with three address fields: the same members, in the
 * same order, as struct ieee80211_hdr up to and including seq_ctrl, but
 * without addr4.  Packed and aligned to 2 bytes.
 */
struct ieee80211_hdr_3addr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
} __packed __aligned(2);

/*
 * Three-address 802.11 MAC header immediately followed by the 16-bit
 * little-endian QoS control field (qos_ctrl).  Packed and aligned to
 * 2 bytes.
 */
struct ieee80211_qos_hdr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
        __le16 qos_ctrl;
} __packed __aligned(2);

/**
 * ieee80211_has_tods - test the To DS bit of a frame control field
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_TODS is set in @fc, false otherwise.
 */
static inline bool ieee80211_has_tods(__le16 fc)
{
        const __le16 tods = cpu_to_le16(IEEE80211_FCTL_TODS);

        return (fc & tods) != 0;
}

/**
 * ieee80211_has_fromds - test the From DS bit of a frame control field
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_FROMDS is set in @fc, false otherwise.
 */
static inline bool ieee80211_has_fromds(__le16 fc)
{
        const __le16 fromds = cpu_to_le16(IEEE80211_FCTL_FROMDS);

        return (fc & fromds) != 0;
}

/**
 * ieee80211_has_a4 - test whether both DS bits are set
 * @fc: frame control field in little-endian byte order
 *
 * Return: true only when IEEE80211_FCTL_TODS and IEEE80211_FCTL_FROMDS
 * are both set in @fc.
 */
static inline bool ieee80211_has_a4(__le16 fc)
{
        return ieee80211_has_tods(fc) && ieee80211_has_fromds(fc);
}

/**
 * ieee80211_has_morefrags - test the More Fragments bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_MOREFRAGS is set in @fc.
 */
static inline bool ieee80211_has_morefrags(__le16 fc)
{
        const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);

        return (fc & morefrags) != 0;
}

/**
 * ieee80211_has_retry - test the Retry bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_RETRY is set in @fc.
 */
static inline bool ieee80211_has_retry(__le16 fc)
{
        const __le16 retry = cpu_to_le16(IEEE80211_FCTL_RETRY);

        return (fc & retry) != 0;
}

/**
 * ieee80211_has_pm - test the PM bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_PM is set in @fc.
 */
static inline bool ieee80211_has_pm(__le16 fc)
{
        const __le16 pm = cpu_to_le16(IEEE80211_FCTL_PM);

        return (fc & pm) != 0;
}

/**
 * ieee80211_has_moredata - test the More Data bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_MOREDATA is set in @fc.
 */
static inline bool ieee80211_has_moredata(__le16 fc)
{
        const __le16 moredata = cpu_to_le16(IEEE80211_FCTL_MOREDATA);

        return (fc & moredata) != 0;
}

/**
 * ieee80211_has_protected - test the Protected bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_PROTECTED is set in @fc.
 */
static inline bool ieee80211_has_protected(__le16 fc)
{
        const __le16 protected_bit = cpu_to_le16(IEEE80211_FCTL_PROTECTED);

        return (fc & protected_bit) != 0;
}

/**
 * ieee80211_has_order - test the Order bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when IEEE80211_FCTL_ORDER is set in @fc.
 */
static inline bool ieee80211_has_order(__le16 fc)
{
        const __le16 order = cpu_to_le16(IEEE80211_FCTL_ORDER);

        return (fc & order) != 0;
}

/**
 * ieee80211_is_mgmt - test for a management frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_MGMT.
 */
static inline bool ieee80211_is_mgmt(__le16 fc)
{
        const __le16 ftype = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE);

        return ftype == cpu_to_le16(IEEE80211_FTYPE_MGMT);
}

/**
 * ieee80211_is_ctl - test for a control frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_CTL.
 */
static inline bool ieee80211_is_ctl(__le16 fc)
{
        const __le16 ftype = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE);

        return ftype == cpu_to_le16(IEEE80211_FTYPE_CTL);
}

/**
 * ieee80211_is_data - test for a data frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_DATA.
 */
static inline bool ieee80211_is_data(__le16 fc)
{
        const __le16 ftype = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE);

        return ftype == cpu_to_le16(IEEE80211_FTYPE_DATA);
}

/**
 * ieee80211_is_ext - test for an extension frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_EXT.
 */
static inline bool ieee80211_is_ext(__le16 fc)
{
        const __le16 ftype = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE);

        return ftype == cpu_to_le16(IEEE80211_FTYPE_EXT);
}


/**
 * ieee80211_is_data_qos - test for a QoS data frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_DATA and
 * the IEEE80211_STYPE_QOS_DATA bit is set.
 */
static inline bool ieee80211_is_data_qos(__le16 fc)
{
        /*
         * Only the single QOS_DATA subtype bit matters here, so it is used
         * as the subtype mask instead of the full IEEE80211_FCTL_STYPE.
         */
        const __le16 mask = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                        IEEE80211_STYPE_QOS_DATA);
        const __le16 want = cpu_to_le16(IEEE80211_FTYPE_DATA |
                                        IEEE80211_STYPE_QOS_DATA);

        return (fc & mask) == want;
}

/**
 * ieee80211_is_data_present - test for a data frame that carries data
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type bits of @fc equal IEEE80211_FTYPE_DATA and
 * subtype bit 0x40 is clear.
 */
static inline bool ieee80211_is_data_present(__le16 fc)
{
        /*
         * Bit 0x40 is included in the mask and required to be clear, so
         * that only the data-containing subtypes match.
         */
        const __le16 mask = cpu_to_le16(IEEE80211_FCTL_FTYPE | 0x40);
        const __le16 want = cpu_to_le16(IEEE80211_FTYPE_DATA);

        return (fc & mask) == want;
}

/**
 * ieee80211_is_assoc_req - test for an association request frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ.
 */
static inline bool ieee80211_is_assoc_req(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_ASSOC_REQ);
}

/**
 * ieee80211_is_assoc_resp - test for an association response frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP.
 */
static inline bool ieee80211_is_assoc_resp(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_ASSOC_RESP);
}

/**
 * ieee80211_is_reassoc_req - test for a reassociation request frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ.
 */
static inline bool ieee80211_is_reassoc_req(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_REASSOC_REQ);
}

/**
 * ieee80211_is_reassoc_resp - test for a reassociation response frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP.
 */
static inline bool ieee80211_is_reassoc_resp(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_REASSOC_RESP);
}

/**
 * ieee80211_is_probe_req - test for a probe request frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ.
 */
static inline bool ieee80211_is_probe_req(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_PROBE_REQ);
}

/**
 * ieee80211_is_probe_resp - test for a probe response frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP.
 */
static inline bool ieee80211_is_probe_resp(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_PROBE_RESP);
}

/**
 * ieee80211_is_beacon - test for a beacon frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON.
 */
static inline bool ieee80211_is_beacon(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_BEACON);
}

/**
 * ieee80211_is_s1g_beacon - test for an S1G beacon frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON.
 */
static inline bool ieee80211_is_s1g_beacon(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_EXT |
                                   IEEE80211_STYPE_S1G_BEACON);
}

/**
 * ieee80211_next_tbtt_present - test for an S1G beacon with the next-TBTT bit
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when @fc describes an S1G beacon (IEEE80211_FTYPE_EXT |
 * IEEE80211_STYPE_S1G_BEACON) and IEEE80211_S1G_BCN_NEXT_TBTT is set.
 */
static inline bool ieee80211_next_tbtt_present(__le16 fc)
{
        if (!ieee80211_is_s1g_beacon(fc))
                return false;

        return (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)) != 0;
}

/**
 * ieee80211_is_s1g_short_beacon - test for a short S1G beacon
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when @fc is an S1G beacon and its next-TBTT-present bit
 * is set, false otherwise.
 */
static inline bool ieee80211_is_s1g_short_beacon(__le16 fc)
{
        if (!ieee80211_is_s1g_beacon(fc))
                return false;

        return ieee80211_next_tbtt_present(fc);
}

/**
 * ieee80211_is_atim - test for an ATIM frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ATIM.
 */
static inline bool ieee80211_is_atim(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_ATIM);
}

/**
 * ieee80211_is_disassoc - test for a disassociation frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_DISASSOC.
 */
static inline bool ieee80211_is_disassoc(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_DISASSOC);
}

/**
 * ieee80211_is_auth - test for an authentication frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH.
 */
static inline bool ieee80211_is_auth(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_AUTH);
}

/**
 * ieee80211_is_deauth - test for a deauthentication frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_DEAUTH.
 */
static inline bool ieee80211_is_deauth(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_DEAUTH);
}

/**
 * ieee80211_is_action - test for an action frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION.
 */
static inline bool ieee80211_is_action(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                   IEEE80211_STYPE_ACTION);
}

/**
 * ieee80211_is_back_req - test for a block ack request frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_BACK_REQ.
 */
static inline bool ieee80211_is_back_req(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_BACK_REQ);
}

/**
 * ieee80211_is_back - test for a block ack frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_BACK.
 */
static inline bool ieee80211_is_back(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_BACK);
}

/**
 * ieee80211_is_pspoll - test for a PS-Poll frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL.
 */
static inline bool ieee80211_is_pspoll(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_PSPOLL);
}

/**
 * ieee80211_is_rts - test for an RTS frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS.
 */
static inline bool ieee80211_is_rts(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_RTS);
}

/**
 * ieee80211_is_cts - test for a CTS frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS.
 */
static inline bool ieee80211_is_cts(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_CTS);
}

/**
 * ieee80211_is_ack - test for an ACK frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_ACK.
 */
static inline bool ieee80211_is_ack(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_ACK);
}

/**
 * ieee80211_is_cfend - test for a CF-End frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CFEND.
 */
static inline bool ieee80211_is_cfend(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_CFEND);
}

/**
 * ieee80211_is_cfendack - test for a CF-End + CF-Ack frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CFENDACK.
 */
static inline bool ieee80211_is_cfendack(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_CTL |
                                   IEEE80211_STYPE_CFENDACK);
}

/**
 * ieee80211_is_nullfunc - test for a regular (non-QoS) nullfunc frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC.
 */
static inline bool ieee80211_is_nullfunc(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_DATA |
                                   IEEE80211_STYPE_NULLFUNC);
}

/**
 * ieee80211_is_qos_nullfunc - test for a QoS nullfunc frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when the type and subtype bits of @fc equal
 * IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC.
 */
static inline bool ieee80211_is_qos_nullfunc(__le16 fc)
{
        const __le16 kind = fc & cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                             IEEE80211_FCTL_STYPE);

        return kind == cpu_to_le16(IEEE80211_FTYPE_DATA |
                                   IEEE80211_STYPE_QOS_NULLFUNC);
}

/**
 * ieee80211_is_any_nullfunc - test for a regular or QoS nullfunc frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when either ieee80211_is_nullfunc() or
 * ieee80211_is_qos_nullfunc() accepts @fc.
 */
static inline bool ieee80211_is_any_nullfunc(__le16 fc)
{
        if (ieee80211_is_nullfunc(fc))
                return true;

        return ieee80211_is_qos_nullfunc(fc);
}

/**
 * ieee80211_is_bufferable_mmpdu - test for a bufferable management frame
 * @fc: frame control field in little-endian byte order
 *
 * Return: true when @fc is a management frame whose subtype is action,
 * disassociation or deauthentication.
 */
static inline bool ieee80211_is_bufferable_mmpdu(__le16 fc)
{
        /*
         * Follows the IEEE 802.11-2012 definition of "bufferable management
         * frame"; the IBSS special case is deliberately not handled.
         */
        if (!ieee80211_is_mgmt(fc))
                return false;

        return ieee80211_is_action(fc) ||
               ieee80211_is_disassoc(fc) ||
               ieee80211_is_deauth(fc);
}

/**
 * ieee80211_is_first_frag - test whether the fragment number is zero
 * @seq_ctrl: sequence control field in little-endian byte order
 *
 * Return: true when no bit of IEEE80211_SCTL_FRAG is set in @seq_ctrl.
 */
static inline bool ieee80211_is_first_frag(__le16 seq_ctrl)
{
        const __le16 frag_no = seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);

        return !frag_no;
}

/**
 * ieee80211_is_frag - check if a frame is a fragment
 * @hdr: 802.11 header of the frame
 */
static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr)
{
        return ieee80211_has_morefrags(hdr->frame_control) ||
               hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);
}

/*
 * NOTE(review): presumably the 802.11s mesh header, with the MESH_FLAGS_*
 * values applying to @flags - confirm against users.
 *
 * The struct is packed, so the 32-bit little-endian seqnum sits at byte
 * offset 2 with no padding after the two u8 members.
 */
struct ieee80211s_hdr {
        u8 flags;
        u8 ttl;
        __le32 seqnum;
        u8 eaddr1[ETH_ALEN];
        u8 eaddr2[ETH_ALEN];
} __packed __aligned(2);

/*
 * Mesh flags
 * MESH_FLAGS_AE (0x3) is the mask covering both AE_A4 (0x1) and
 * AE_A5_A6 (0x2).
 */
#define MESH_FLAGS_AE_A4         0x1
#define MESH_FLAGS_AE_A5_A6        0x2
#define MESH_FLAGS_AE                0x3
#define MESH_FLAGS_PS_DEEP        0x4

/**
 * enum ieee80211_preq_flags - mesh PREQ element flags
 *
 * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield (bit 2)
 */
enum ieee80211_preq_flags {
        IEEE80211_PREQ_PROACTIVE_PREP_FLAG        = 1<<2,
};

/**
 * enum ieee80211_preq_target_flags - mesh PREQ element per target flags
 *
 * @IEEE80211_PREQ_TO_FLAG: target only subfield (bit 0)
 * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield
 *        (bit 2)
 */
enum ieee80211_preq_target_flags {
        IEEE80211_PREQ_TO_FLAG        = 1<<0,
        IEEE80211_PREQ_USN_FLAG        = 1<<2,
};

/**
 * struct ieee80211_quiet_ie - Quiet information element
 *
 * This structure refers to "Quiet information element".
 * It is packed (no padding); the 16-bit members are little-endian.
 */
struct ieee80211_quiet_ie {
        u8 count;
        u8 period;
        __le16 duration;
        __le16 offset;
} __packed;

/**
 * struct ieee80211_msrment_ie - Measurement Request/Report information element
 *
 * This structure refers to "Measurement Request/Report information element".
 * Three fixed one-byte members are followed by @request, a flexible array
 * member whose length is not recorded in the structure itself.
 */
struct ieee80211_msrment_ie {
        u8 token;
        u8 mode;
        u8 type;
        u8 request[];
} __packed;

/**
 * struct ieee80211_channel_sw_ie - Channel Switch Announcement element
 *
 * This structure refers to "Channel Switch Announcement information element".
 * Three one-byte members, packed.
 */
struct ieee80211_channel_sw_ie {
        u8 mode;
        u8 new_ch_num;
        u8 count;
} __packed;

/**
 * struct ieee80211_ext_chansw_ie - Extended Channel Switch Announcement element
 *
 * This structure represents the "Extended Channel Switch Announcement element".
 * Compared with struct ieee80211_channel_sw_ie it carries one additional
 * byte, @new_operating_class, between @mode and @new_ch_num.
 */
struct ieee80211_ext_chansw_ie {
        u8 mode;
        u8 new_operating_class;
        u8 new_ch_num;
        u8 count;
} __packed;

/**
 * struct ieee80211_sec_chan_offs_ie - secondary channel offset IE
 * @sec_chan_offs: secondary channel offset, uses IEEE80211_HT_PARAM_CHA_SEC_*
 *        values here
 *
 * This structure represents the "Secondary Channel Offset element";
 * it consists of a single byte.
 */
struct ieee80211_sec_chan_offs_ie {
        u8 sec_chan_offs;
} __packed;

/**
 * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE
 *
 * This structure represents the "Mesh Channel Switch Parameters element".
 * It is packed; the 16-bit members are little-endian.
 */
struct ieee80211_mesh_chansw_params_ie {
        u8 mesh_ttl;
        u8 mesh_flags;
        __le16 mesh_reason;
        __le16 mesh_pre_value;
} __packed;

/**
 * struct ieee80211_wide_bw_chansw_ie - wide bandwidth channel switch IE
 *
 * Three one-byte members, packed: the new channel width followed by the
 * two new center frequency segment values (segment 0, then segment 1).
 */
struct ieee80211_wide_bw_chansw_ie {
        u8 new_channel_width;
        u8 new_center_freq_seg0, new_center_freq_seg1;
} __packed;

/**
 * struct ieee80211_tim_ie - Traffic Indication Map information element
 *
 * This structure refers to "Traffic Indication Map information element".
 * @virtual_map is declared with one element but is used as a
 * variable-length trailing array, so sizeof() accounts for only its
 * first byte.
 */
struct ieee80211_tim_ie {
        u8 dtim_count;
        u8 dtim_period;
        u8 bitmap_ctrl;
        /* variable size: 1 - 251 bytes (cf. IEEE80211_MAX_TIM_LEN) */
        u8 virtual_map[1];
} __packed;

/**
 * struct ieee80211_meshconf_ie - Mesh Configuration information element
 *
 * This structure refers to "Mesh Configuration information element".
 * Seven one-byte members, packed.
 * NOTE(review): presumably @meshconf_cap holds enum mesh_config_capab_flags
 * bits and @meshconf_form holds IEEE80211_MESHCONF_FORM_* bits - confirm.
 */
struct ieee80211_meshconf_ie {
        u8 meshconf_psel;
        u8 meshconf_pmetric;
        u8 meshconf_congest;
        u8 meshconf_synch;
        u8 meshconf_auth;
        u8 meshconf_form;
        u8 meshconf_cap;
} __packed;

/**
 * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags
 *
 * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish
 *        additional mesh peerings with other mesh STAs
 * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs
 * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure
 *        is ongoing
 * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has
 *        neighbors in deep sleep mode
 *
 * The values are single-bit masks (bits 0, 3, 5 and 6); the remaining
 * bits have no enumerator here.
 */
enum mesh_config_capab_flags {
        IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS                = 0x01,
        IEEE80211_MESHCONF_CAPAB_FORWARDING                = 0x08,
        IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING                = 0x20,
        IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL        = 0x40,
};

/* bit 0 of a mesh formation info value */
#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1

/*
 * mesh channel switch parameters element's flag indicator
 *
 * NOTE(review): presumably these are the bits of mesh_flags in
 * struct ieee80211_mesh_chansw_params_ie - confirm at callers.
 */
#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0)
#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1)
#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2)

/**
 * struct ieee80211_rann_ie - Root Announcement information element
 *
 * This structure refers to "Root Announcement information element".
 * It is packed, so the 32-bit little-endian members that follow the
 * three u8 members and @rann_addr are not naturally aligned.
 */
struct ieee80211_rann_ie {
        u8 rann_flags;
        u8 rann_hopcount;
        u8 rann_ttl;
        u8 rann_addr[ETH_ALEN];
        __le32 rann_seq;
        __le32 rann_interval;
        __le32 rann_metric;
} __packed;

/*
 * Root announcement flag bits.
 * NOTE(review): presumably stored in rann_flags of
 * struct ieee80211_rann_ie - confirm at callers.
 */
enum ieee80211_rann_flags {
        RANN_FLAG_IS_GATE = 1 << 0,
};

enum ieee80211_ht_chanwidth_values {
        IEEE80211_HT_CHANWIDTH_20MHZ = 0,
        IEEE80211_HT_CHANWIDTH_ANY = 1,
};

/**
 * enum ieee80211_vht_opmode_bits - VHT operating mode field bits
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask
 *        (the NSS value is the value of this field + 1)
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU
 *        using a beamforming steering matrix
 *
 * The CHANWIDTH_* enumerators are values of the field selected by
 * CHANWIDTH_MASK, while the other non-shift entries are bit masks.
 */
enum ieee80211_vht_opmode_bits {
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK        = 0x03,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ        = 0,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ        = 1,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ        = 2,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ        = 3,
        IEEE80211_OPMODE_NOTIF_BW_160_80P80        = 0x04,
        IEEE80211_OPMODE_NOTIF_RX_NSS_MASK        = 0x70,
        IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT        = 4,
        IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF        = 0x80,
};

/**
 * enum ieee80211_s1g_chanwidth - S1G BSS channel width values
 *
 * These are defined in IEEE 802.11ah-2016 Table 10-20
 * as BSS Channel Width.  Each value equals the channel width in MHz
 * minus one (1 -> 0, 2 -> 1, 4 -> 3, 8 -> 7, 16 -> 15).
 *
 * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel
 */
enum ieee80211_s1g_chanwidth {
        IEEE80211_S1G_CHANWIDTH_1MHZ = 0,
        IEEE80211_S1G_CHANWIDTH_2MHZ = 1,
        IEEE80211_S1G_CHANWIDTH_4MHZ = 3,
        IEEE80211_S1G_CHANWIDTH_8MHZ = 7,
        IEEE80211_S1G_CHANWIDTH_16MHZ = 15,
};

/*
 * Array lengths, in bytes, of fixed-size action frame fields in
 * struct ieee80211_mgmt: sa_query.trans_id, vht_group_notif.membership
 * and vht_group_notif.position respectively.
 */
#define WLAN_SA_QUERY_TR_ID_LEN 2
#define WLAN_MEMBERSHIP_LEN 8
#define WLAN_USER_POSITION_LEN 16

/**
 * struct ieee80211_tpc_report_ie - TPC Report element body
 * @tx_power: one-byte transmit power field
 * @link_margin: one-byte link margin field
 *
 * This structure refers to "TPC Report element". It is embedded as the
 * tpc member of the tpc_report action in struct ieee80211_mgmt, where it
 * follows separate element ID and length bytes.
 */
struct ieee80211_tpc_report_ie {
        u8 tx_power;
        u8 link_margin;
} __packed;

/*
 * ADDBA extension bits: bit 0 is the no-fragmentation flag, bits 1-2 hold
 * the fragmentation level (extract with the mask, then shift right by 1).
 * NOTE(review): presumably these describe the data byte of
 * struct ieee80211_addba_ext_ie - confirm against the users.
 */
#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK        GENMASK(2, 1)
#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT        1
#define IEEE80211_ADDBA_EXT_NO_FRAG                BIT(0)

/*
 * struct ieee80211_addba_ext_ie - ADDBA extension element body.
 * Consists of a single byte, @data.
 */
struct ieee80211_addba_ext_ie {
        u8 data;
} __packed;

/**
 * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element
 * @compat_info: 16-bit little-endian compatibility information field
 * @beacon_int: 16-bit little-endian beacon interval field
 * @tsf_completion: 32-bit little-endian TSF completion field
 *
 * S1G Beacon Compatibility element. Packed, 8 bytes in total.
 */
struct ieee80211_s1g_bcn_compat_ie {
        __le16 compat_info;
        __le16 beacon_int;
        __le32 tsf_completion;
} __packed;

/**
 * struct ieee80211_s1g_oper_ie - S1G Operation element
 * @ch_width: one-byte channel width field
 * @oper_class: one-byte operating class field
 * @primary_ch: one-byte primary channel field
 * @oper_ch: one-byte operating channel field
 * @basic_mcs_nss: 16-bit little-endian basic MCS/NSS field
 *
 * S1G Operation element. Packed, 6 bytes in total.
 */
struct ieee80211_s1g_oper_ie {
        u8 ch_width;
        u8 oper_class;
        u8 primary_ch;
        u8 oper_ch;
        __le16 basic_mcs_nss;
} __packed;

/**
 * struct ieee80211_aid_response_ie - AID Response element
 * @aid: 16-bit little-endian AID field
 * @switch_count: one-byte switch count field
 * @response_int: 16-bit little-endian response interval field
 *
 * AID Response element. Because the structure is packed, no padding
 * follows @switch_count: @response_int sits at byte offset 3 and the
 * whole structure is 5 bytes.
 */
struct ieee80211_aid_response_ie {
        __le16 aid;
        u8 switch_count;
        __le16 response_int;
} __packed;

/*
 * struct ieee80211_s1g_cap - S1G capabilities.
 * Two raw byte arrays: 10 bytes of capability information followed by
 * 5 bytes of supported MCS/NSS data (15 bytes in total, packed).
 */
struct ieee80211_s1g_cap {
        u8 capab_info[10];
        u8 supp_mcs_nss[5];
} __packed;

/*
 * struct ieee80211_ext - frame layout that starts with frame_control and
 * duration and continues with one of two S1G beacon bodies.
 *
 * Both union members begin with the same sa / timestamp / change_seq
 * fields; s1g_short_beacon additionally carries a 3-byte next_tbtt before
 * its variable part. The zero-length variable[] members mark the offset
 * at which the remaining, variable-length frame content starts.
 *
 * The structure is packed and 2-byte aligned.
 */
struct ieee80211_ext {
        __le16 frame_control;
        __le16 duration;
        union {
                struct {
                        u8 sa[ETH_ALEN];
                        __le32 timestamp;
                        u8 change_seq;
                        u8 variable[0];
                } __packed s1g_beacon;
                struct {
                        u8 sa[ETH_ALEN];
                        __le32 timestamp;
                        u8 change_seq;
                        u8 next_tbtt[3];
                        u8 variable[0];
                } __packed s1g_short_beacon;
        } u;
} __packed __aligned(2);

/*
 * struct ieee80211_mgmt - layout of a management frame.
 *
 * The common header (frame_control, duration, da, sa, bssid, seq_ctrl) is
 * followed by the union @u, which overlays the fixed fields of the
 * individual frame bodies (auth, deauth, assoc_req, ..., action).
 *
 * Zero-length variable[] members mark the offset at which the
 * variable-length remainder of that frame body starts; they contribute
 * nothing to sizeof().
 *
 * The action member starts with a one-byte category followed by a nested
 * union @u.action.u with one member per action layout.
 * IEEE80211_MIN_ACTION_SIZE is the offset of that nested union.
 *
 * The structure is packed and 2-byte aligned.
 */
struct ieee80211_mgmt {
        __le16 frame_control;
        __le16 duration;
        u8 da[ETH_ALEN];
        u8 sa[ETH_ALEN];
        u8 bssid[ETH_ALEN];
        __le16 seq_ctrl;
        /* frame body; which member is valid depends on the frame type */
        union {
                struct {
                        __le16 auth_alg;
                        __le16 auth_transaction;
                        __le16 status_code;
                        /* possibly followed by Challenge text */
                        u8 variable[0];
                } __packed auth;
                struct {
                        __le16 reason_code;
                } __packed deauth;
                struct {
                        __le16 capab_info;
                        __le16 listen_interval;
                        /* followed by SSID and Supported rates */
                        u8 variable[0];
                } __packed assoc_req;
                struct {
                        __le16 capab_info;
                        __le16 status_code;
                        __le16 aid;
                        /* followed by Supported rates */
                        u8 variable[0];
                } __packed assoc_resp, reassoc_resp;
                /* same as assoc_resp/reassoc_resp but without the aid field */
                struct {
                        __le16 capab_info;
                        __le16 status_code;
                        u8 variable[0];
                } __packed s1g_assoc_resp, s1g_reassoc_resp;
                struct {
                        __le16 capab_info;
                        __le16 listen_interval;
                        u8 current_ap[ETH_ALEN];
                        /* followed by SSID and Supported rates */
                        u8 variable[0];
                } __packed reassoc_req;
                struct {
                        __le16 reason_code;
                } __packed disassoc;
                struct {
                        __le64 timestamp;
                        __le16 beacon_int;
                        __le16 capab_info;
                        /* followed by some of SSID, Supported rates,
                         * FH Params, DS Params, CF Params, IBSS Params, TIM */
                        u8 variable[0];
                } __packed beacon;
                struct {
                        /* only variable items: SSID, Supported rates */
                        u8 variable[0];
                } __packed probe_req;
                struct {
                        __le64 timestamp;
                        __le16 beacon_int;
                        __le16 capab_info;
                        /* followed by some of SSID, Supported rates,
                         * FH Params, DS Params, CF Params, IBSS Params */
                        u8 variable[0];
                } __packed probe_resp;
                struct {
                        u8 category;
                        /* action body; layout depends on the action */
                        union {
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 status_code;
                                        u8 variable[0];
                                } __packed wme_action;
                                struct{
                                        u8 action_code;
                                        u8 variable[0];
                                } __packed chan_switch;
                                struct{
                                        u8 action_code;
                                        struct ieee80211_ext_chansw_ie data;
                                        u8 variable[0];
                                } __packed ext_chan_switch;
                                struct{
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 element_id;
                                        u8 length;
                                        struct ieee80211_msrment_ie msr_elem;
                                } __packed measurement;
                                struct{
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 capab;
                                        __le16 timeout;
                                        __le16 start_seq_num;
                                        /* followed by BA Extension */
                                        u8 variable[0];
                                } __packed addba_req;
                                struct{
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 status;
                                        __le16 capab;
                                        __le16 timeout;
                                } __packed addba_resp;
                                struct{
                                        u8 action_code;
                                        __le16 params;
                                        __le16 reason_code;
                                } __packed delba;
                                struct {
                                        u8 action_code;
                                        u8 variable[0];
                                } __packed self_prot;
                                struct{
                                        u8 action_code;
                                        u8 variable[0];
                                } __packed mesh_action;
                                struct {
                                        u8 action;
                                        u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN];
                                } __packed sa_query;
                                struct {
                                        u8 action;
                                        u8 smps_control;
                                } __packed ht_smps;
                                struct {
                                        u8 action_code;
                                        u8 chanwidth;
                                } __packed ht_notify_cw;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 capability;
                                        u8 variable[0];
                                } __packed tdls_discover_resp;
                                struct {
                                        u8 action_code;
                                        u8 operating_mode;
                                } __packed vht_opmode_notif;
                                struct {
                                        u8 action_code;
                                        u8 membership[WLAN_MEMBERSHIP_LEN];
                                        u8 position[WLAN_USER_POSITION_LEN];
                                } __packed vht_group_notif;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 tpc_elem_id;
                                        u8 tpc_elem_length;
                                        struct ieee80211_tpc_report_ie tpc;
                                } __packed tpc_report;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 follow_up;
                                        u8 tod[6];
                                        u8 toa[6];
                                        __le16 tod_error;
                                        __le16 toa_error;
                                        u8 variable[0];
                                } __packed ftm;
                        } u;
                } __packed action;
        } u;
} __packed __aligned(2);

/* Supported rates membership selectors (one value per PHY type) */
#define BSS_MEMBERSHIP_SELECTOR_HT_PHY        127
#define BSS_MEMBERSHIP_SELECTOR_VHT_PHY        126
#define BSS_MEMBERSHIP_SELECTOR_HE_PHY        122

/*
 * mgmt header + 1 byte category code.
 * Computed as the offset of the per-action union inside
 * struct ieee80211_mgmt, i.e. the number of bytes that precede the
 * action-specific fields.
 */
#define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u)


/*
 * Management MIC information element (IEEE 802.11w).
 *
 * Packed layout: element header (element_id, length), a 16-bit
 * little-endian key_id, a 6-byte sequence_number and an 8-byte mic,
 * 18 bytes in total.
 */
struct ieee80211_mmie {
        u8 element_id;
        u8 length;
        __le16 key_id;
        u8 sequence_number[6];
        u8 mic[8];
} __packed;

/*
 * Management MIC information element (IEEE 802.11w) for GMAC and CMAC-256.
 *
 * Same field order as struct ieee80211_mmie, but with a 16-byte mic,
 * giving 26 bytes in total.
 */
struct ieee80211_mmie_16 {
        u8 element_id;
        u8 length;
        __le16 key_id;
        u8 sequence_number[6];
        u8 mic[16];
} __packed;

/*
 * struct ieee80211_vendor_ie - fixed leading part of a vendor IE:
 * element header (element_id, len) followed by a 3-byte OUI and a
 * one-byte OUI type. Packed, 6 bytes in total.
 */
struct ieee80211_vendor_ie {
        u8 element_id;
        u8 len;
        u8 oui[3];
        u8 oui_type;
} __packed;

/*
 * struct ieee80211_wmm_ac_param - WMM parameters of one access category.
 * Four bytes, packed; struct ieee80211_wmm_param_ie holds an array of
 * four of these.
 */
struct ieee80211_wmm_ac_param {
        u8 aci_aifsn; /* AIFSN, ACM, ACI */
        u8 cw; /* ECWmin, ECWmax (CW = 2^ECW - 1) */
        __le16 txop_limit;
} __packed;

/*
 * struct ieee80211_wmm_param_ie - WMM parameter element including its
 * two-byte element header.
 *
 * The 24 bytes that follow @len (3 + 1 + 1 + 1 + 1 + 1 + 4 * 4) match the
 * "Length: 24" noted on that field; the whole structure is 26 bytes.
 */
struct ieee80211_wmm_param_ie {
        u8 element_id; /* Element ID: 221 (0xdd); */
        u8 len; /* Length: 24 */
        /* required fields for WMM version 1 */
        u8 oui[3]; /* 00:50:f2 */
        u8 oui_type; /* 2 */
        u8 oui_subtype; /* 1 */
        u8 version; /* 1 for WMM version 1.0 */
        u8 qos_info; /* AP/STA specific QoS info */
        u8 reserved; /* 0 */
        /* AC_BE, AC_BK, AC_VI, AC_VO */
        struct ieee80211_wmm_ac_param ac[4];
} __packed;

/* Control frames */

/*
 * struct ieee80211_rts - RTS control frame layout: frame_control,
 * duration and two ETH_ALEN-byte addresses (ra, then ta).
 * Packed and 2-byte aligned.
 */
struct ieee80211_rts {
        __le16 frame_control;
        __le16 duration;
        u8 ra[ETH_ALEN];
        u8 ta[ETH_ALEN];
} __packed __aligned(2);

/*
 * struct ieee80211_cts - CTS control frame layout: frame_control,
 * duration and a single ETH_ALEN-byte address (ra).
 * Packed and 2-byte aligned.
 */
struct ieee80211_cts {
        __le16 frame_control;
        __le16 duration;
        u8 ra[ETH_ALEN];
} __packed __aligned(2);

/*
 * struct ieee80211_pspoll - PS-Poll control frame layout.
 * Unlike the RTS/CTS layouts, the second 16-bit field is @aid rather than
 * a duration; it is followed by the bssid and ta addresses.
 * Packed and 2-byte aligned.
 */
struct ieee80211_pspoll {
        __le16 frame_control;
        __le16 aid;
        u8 bssid[ETH_ALEN];
        u8 ta[ETH_ALEN];
} __packed __aligned(2);

/* TDLS */

/*
 * Channel switch timing: two 16-bit little-endian fields, switch_time
 * followed by switch_timeout (4 bytes, packed).
 */
struct ieee80211_ch_switch_timing {
        __le16 switch_time;
        __le16 switch_timeout;
} __packed;

/*
 * Link-id information element: element header (ie_type, ie_len) followed
 * by three ETH_ALEN-byte addresses in the order bssid, init_sta, resp_sta.
 */
struct ieee80211_tdls_lnkie {
        u8 ie_type; /* Link Identifier IE */
        u8 ie_len;
        u8 bssid[ETH_ALEN];
        u8 init_sta[ETH_ALEN];
        u8 resp_sta[ETH_ALEN];
} __packed;

/*
 * struct ieee80211_tdls_data - layout of a TDLS frame carried as data.
 *
 * Starts with da/sa and a big-endian ether_type (note: __be16, unlike the
 * little-endian fields used elsewhere in this structure), then
 * payload_type, category and action_code, followed by the union @u with
 * the fixed fields of each TDLS action. Zero-length variable[] members
 * mark where the variable-length remainder begins.
 */
struct ieee80211_tdls_data {
        u8 da[ETH_ALEN];
        u8 sa[ETH_ALEN];
        __be16 ether_type;
        u8 payload_type;
        u8 category;
        u8 action_code;
        union {
                struct {
                        u8 dialog_token;
                        __le16 capability;
                        u8 variable[0];
                } __packed setup_req;
                struct {
                        __le16 status_code;
                        u8 dialog_token;
                        __le16 capability;
                        u8 variable[0];
                } __packed setup_resp;
                struct {
                        __le16 status_code;
                        u8 dialog_token;
                        u8 variable[0];
                } __packed setup_cfm;
                struct {
                        __le16 reason_code;
                        u8 variable[0];
                } __packed teardown;
                struct {
                        u8 dialog_token;
                        u8 variable[0];
                } __packed discover_req;
                struct {
                        u8 target_channel;
                        u8 oper_class;
                        u8 variable[0];
                } __packed chan_switch_req;
                struct {
                        __le16 status_code;
                        u8 variable[0];
                } __packed chan_switch_resp;
        } u;
} __packed;

/*
 * Peer-to-Peer IE attribute related definitions.
 */
/**
 * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute.
 *
 * IDs 0-18 are assigned consecutively, 19-220 are reserved and 221 is the
 * vendor specific attribute. IEEE80211_P2P_ATTR_MAX is left implicit so
 * that it stays one past the last listed ID (currently 222).
 */
enum ieee80211_p2p_attr_id {
        IEEE80211_P2P_ATTR_STATUS               = 0,
        IEEE80211_P2P_ATTR_MINOR_REASON         = 1,
        IEEE80211_P2P_ATTR_CAPABILITY           = 2,
        IEEE80211_P2P_ATTR_DEVICE_ID            = 3,
        IEEE80211_P2P_ATTR_GO_INTENT            = 4,
        IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT    = 5,
        IEEE80211_P2P_ATTR_LISTEN_CHANNEL       = 6,
        IEEE80211_P2P_ATTR_GROUP_BSSID          = 7,
        IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING    = 8,
        IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR  = 9,
        IEEE80211_P2P_ATTR_MANAGABILITY         = 10,
        IEEE80211_P2P_ATTR_CHANNEL_LIST         = 11,
        IEEE80211_P2P_ATTR_ABSENCE_NOTICE       = 12,
        IEEE80211_P2P_ATTR_DEVICE_INFO          = 13,
        IEEE80211_P2P_ATTR_GROUP_INFO           = 14,
        IEEE80211_P2P_ATTR_GROUP_ID             = 15,
        IEEE80211_P2P_ATTR_INTERFACE            = 16,
        IEEE80211_P2P_ATTR_OPER_CHANNEL         = 17,
        IEEE80211_P2P_ATTR_INVITE_FLAGS         = 18,
        /* 19 - 220: Reserved */
        IEEE80211_P2P_ATTR_VENDOR_SPECIFIC      = 221,

        IEEE80211_P2P_ATTR_MAX
};

/* Notice of Absence attribute - described in P2P spec 4.1.14 */
/*
 * Typical max value used here; it sizes the desc[] array of
 * struct ieee80211_p2p_noa_attr.
 */
#define IEEE80211_P2P_NOA_DESC_MAX        4

/*
 * struct ieee80211_p2p_noa_desc - one Notice of Absence descriptor:
 * a one-byte count followed by three 32-bit little-endian fields
 * (duration, interval, start_time). Packed, so the 32-bit fields are
 * not naturally aligned and the descriptor is 13 bytes.
 */
struct ieee80211_p2p_noa_desc {
        u8 count;
        __le32 duration;
        __le32 interval;
        __le32 start_time;
} __packed;

/*
 * struct ieee80211_p2p_noa_attr - Notice of Absence attribute body:
 * index byte, oppps_ctwindow byte (see the IEEE80211_P2P_OPPPS_* masks)
 * and room for IEEE80211_P2P_NOA_DESC_MAX descriptors.
 */
struct ieee80211_p2p_noa_attr {
        u8 index;
        u8 oppps_ctwindow;
        struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX];
} __packed;

/*
 * Split of a single byte into an enable flag (bit 7) and a 7-bit
 * CT window value (bits 0-6).
 * NOTE(review): presumably applied to oppps_ctwindow of
 * struct ieee80211_p2p_noa_attr - confirm.
 */
#define IEEE80211_P2P_OPPPS_ENABLE_BIT                BIT(7)
#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK        0x7F

/**
 * struct ieee80211_bar - HT Block Ack Request
 * @frame_control: 16-bit little-endian frame control field
 * @duration: 16-bit little-endian duration field
 * @ra: first ETH_ALEN-byte address
 * @ta: second ETH_ALEN-byte address
 * @control: 16-bit little-endian BAR control field
 * @start_seq_num: 16-bit little-endian starting sequence number field
 *
 * This structure refers to "HT BlockAckReq" as
 * described in 802.11n draft section 7.2.1.7.1
 *
 * Note that, unlike the RTS/CTS/PS-Poll layouts in this file, this one is
 * packed without an explicit 2-byte alignment attribute.
 */
struct ieee80211_bar {
        __le16 frame_control;
        __le16 duration;
        __u8 ra[ETH_ALEN];
        __u8 ta[ETH_ALEN];
        __le16 control;
        __le16 start_seq_num;
} __packed;

/*
 * 802.11 BAR control masks.
 * TID_INFO is a 4-bit field in bits 12-15 (mask 0xf000, shift 12).
 * NOTE(review): presumably these apply to the control member of
 * struct ieee80211_bar - confirm.
 */
#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL        0x0000
#define IEEE80211_BAR_CTRL_MULTI_TID                0x0002
#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA        0x0004
#define IEEE80211_BAR_CTRL_TID_INFO_MASK        0xf000
#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT        12

/* Length in bytes of the rx_mask array in struct ieee80211_mcs_info */
#define IEEE80211_HT_MCS_MASK_LEN                10

/**
 * struct ieee80211_mcs_info - MCS information
 * @rx_mask: RX mask
 * @rx_highest: highest supported RX rate. If set represents
 *        the highest supported RX data rate in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest RX data rate supported.
 * @tx_params: TX parameters
 * @reserved: three reserved bytes that pad the structure to 16 bytes
 */
struct ieee80211_mcs_info {
        u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN];
        __le16 rx_highest;
        u8 tx_params;
        u8 reserved[3];
} __packed;

/*
 * 802.11n HT capability MCS set.
 * RX_HIGHEST_MASK selects the low 10 bits; the TX_* values are bits of a
 * single byte, with TX_MAX_STREAMS a 2-bit field in bits 2-3.
 */
#define IEEE80211_HT_MCS_RX_HIGHEST_MASK        0x3ff
#define IEEE80211_HT_MCS_TX_DEFINED                0x01
#define IEEE80211_HT_MCS_TX_RX_DIFF                0x02
/* value 0 == 1 stream etc */
#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK        0x0C
#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT        2
#define                IEEE80211_HT_MCS_TX_MAX_STREAMS        4
#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION        0x10

/*
 * 802.11n D5.0 20.3.5 / 20.6 says:
 * - indices 0 to 7 and 32 are single spatial stream
 * - 8 to 31 are multiple spatial streams using equal modulation
 *   [8..15 for two streams, 16..23 for three and 24..31 for four]
 * - remainder are multiple spatial streams using unequal modulation
 *
 * START_BYTE is the index of the byte that holds bit 33 when the MCS
 * indices are stored as a bitmap of 8 bits per byte (33 / 8 == 4).
 */
#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33
#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \
        (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8)

/**
 * struct ieee80211_ht_cap - HT capabilities
 * @cap_info: 16-bit little-endian capability info
 *        (see the IEEE80211_HT_CAP_* masks)
 * @ampdu_params_info: one-byte A-MPDU parameters
 *        (see the IEEE80211_HT_AMPDU_PARM_* masks)
 * @mcs: 16 bytes of MCS information
 * @extended_ht_cap_info: 16-bit little-endian extended capabilities
 *        (see the IEEE80211_HT_EXT_CAP_* masks)
 * @tx_BF_cap_info: 32-bit little-endian transmit beamforming capabilities
 * @antenna_selection_info: one-byte antenna selection capabilities
 *
 * This structure is the "HT capabilities element" as
 * described in 802.11n D5.0 7.3.2.57
 *
 * Packed, 26 bytes in total.
 */
struct ieee80211_ht_cap {
        __le16 cap_info;
        u8 ampdu_params_info;

        /* 16 bytes MCS information */
        struct ieee80211_mcs_info mcs;

        __le16 extended_ht_cap_info;
        __le32 tx_BF_cap_info;
        u8 antenna_selection_info;
} __packed;

/*
 * 802.11n HT capabilities masks (for cap_info).
 * All entries are single-bit flags except SM_PS (2-bit field, bits 2-3)
 * and RX_STBC (2-bit field, bits 8-9); their *_SHIFT values are indented.
 */
#define IEEE80211_HT_CAP_LDPC_CODING                0x0001
#define IEEE80211_HT_CAP_SUP_WIDTH_20_40        0x0002
#define IEEE80211_HT_CAP_SM_PS                        0x000C
#define                IEEE80211_HT_CAP_SM_PS_SHIFT        2
#define IEEE80211_HT_CAP_GRN_FLD                0x0010
#define IEEE80211_HT_CAP_SGI_20                        0x0020
#define IEEE80211_HT_CAP_SGI_40                        0x0040
#define IEEE80211_HT_CAP_TX_STBC                0x0080
#define IEEE80211_HT_CAP_RX_STBC                0x0300
#define                IEEE80211_HT_CAP_RX_STBC_SHIFT        8
#define IEEE80211_HT_CAP_DELAY_BA                0x0400
#define IEEE80211_HT_CAP_MAX_AMSDU                0x0800
#define IEEE80211_HT_CAP_DSSSCCK40                0x1000
#define IEEE80211_HT_CAP_RESERVED                0x2000
#define IEEE80211_HT_CAP_40MHZ_INTOLERANT        0x4000
#define IEEE80211_HT_CAP_LSIG_TXOP_PROT                0x8000

/*
 * 802.11n HT extended capabilities masks (for extended_ht_cap_info).
 * PCO_TIME (bits 1-2) and MCS_FB (bits 8-9) are 2-bit fields with the
 * indented *_SHIFT values; the rest are single-bit flags.
 */
#define IEEE80211_HT_EXT_CAP_PCO                0x0001
#define IEEE80211_HT_EXT_CAP_PCO_TIME                0x0006
#define                IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT        1
#define IEEE80211_HT_EXT_CAP_MCS_FB                0x0300
#define                IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT        8
#define IEEE80211_HT_EXT_CAP_HTC_SUP                0x0400
#define IEEE80211_HT_EXT_CAP_RD_RESPONDER        0x0800

/*
 * 802.11n HT capability AMPDU settings (for ampdu_params_info).
 * FACTOR is a 2-bit field in bits 0-1, DENSITY a 3-bit field in bits 2-4.
 */
#define IEEE80211_HT_AMPDU_PARM_FACTOR                0x03
#define IEEE80211_HT_AMPDU_PARM_DENSITY                0x1C
#define                IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT        2

/*
 * HT maximum A-MPDU length exponent: the longest A-MPDU the STA can
 * receive in high-throughput (HT) is
 *
 *     Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
 *
 * The enumerators name the resulting size (8K for exponent 0 up to 64K
 * for exponent 3).
 */
enum ieee80211_max_ampdu_length_exp {
        IEEE80211_HT_MAX_AMPDU_8K       = 0x0,
        IEEE80211_HT_MAX_AMPDU_16K      = 0x1,
        IEEE80211_HT_MAX_AMPDU_32K      = 0x2,
        IEEE80211_HT_MAX_AMPDU_64K      = 0x3,
};

/*
 * VHT maximum A-MPDU length exponent: the longest A-MPDU the STA can
 * receive in VHT is
 *
 *     Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
 *
 * The enumerators name the resulting size (8K for exponent 0 up to 1024K
 * for exponent 7).
 */
enum ieee80211_vht_max_ampdu_length_exp {
        IEEE80211_VHT_MAX_AMPDU_8K      = 0x0,
        IEEE80211_VHT_MAX_AMPDU_16K     = 0x1,
        IEEE80211_VHT_MAX_AMPDU_32K     = 0x2,
        IEEE80211_VHT_MAX_AMPDU_64K     = 0x3,
        IEEE80211_VHT_MAX_AMPDU_128K    = 0x4,
        IEEE80211_VHT_MAX_AMPDU_256K    = 0x5,
        IEEE80211_VHT_MAX_AMPDU_512K    = 0x6,
        IEEE80211_VHT_MAX_AMPDU_1024K   = 0x7,
};

/*
 * Equals the constant 13 in the A-MPDU length formula
 * 2 ^ (13 + max_ampdu_length_exp) - 1 documented on the
 * *_max_ampdu_length_exp enums.
 * NOTE(review): presumably the base exponent of that formula - confirm
 * against the users of this macro.
 */
#define IEEE80211_HT_MAX_AMPDU_FACTOR 13

/**
 * enum ieee80211_min_mpdu_spacing - Minimum MPDU start spacing
 * @IEEE80211_HT_MPDU_DENSITY_NONE: No restriction
 * @IEEE80211_HT_MPDU_DENSITY_0_25: 1/4 usec
 * @IEEE80211_HT_MPDU_DENSITY_0_5: 1/2 usec
 * @IEEE80211_HT_MPDU_DENSITY_1: 1 usec
 * @IEEE80211_HT_MPDU_DENSITY_2: 2 usec
 * @IEEE80211_HT_MPDU_DENSITY_4: 4 usec
 * @IEEE80211_HT_MPDU_DENSITY_8: 8 usec
 * @IEEE80211_HT_MPDU_DENSITY_16: 16 usec
 */
enum ieee80211_min_mpdu_spacing {
        IEEE80211_HT_MPDU_DENSITY_NONE  = 0x0,
        IEEE80211_HT_MPDU_DENSITY_0_25  = 0x1,
        IEEE80211_HT_MPDU_DENSITY_0_5   = 0x2,
        IEEE80211_HT_MPDU_DENSITY_1     = 0x3,
        IEEE80211_HT_MPDU_DENSITY_2     = 0x4,
        IEEE80211_HT_MPDU_DENSITY_4     = 0x5,
        IEEE80211_HT_MPDU_DENSITY_8     = 0x6,
        IEEE80211_HT_MPDU_DENSITY_16    = 0x7,
};

/**
 * struct ieee80211_ht_operation - HT operation IE
 * @primary_chan: one-byte primary channel field
 * @ht_param: one-byte parameter field (see the IEEE80211_HT_PARAM_* masks)
 * @operation_mode: 16-bit little-endian operation mode
 *        (see the IEEE80211_HT_OP_MODE_* masks)
 * @stbc_param: 16-bit little-endian STBC parameters
 *        (see the IEEE80211_HT_STBC_PARAM_* masks)
 * @basic_set: 16-byte basic MCS set
 *
 * This structure is the "HT operation element" as
 * described in 802.11n-2009 7.3.2.57
 *
 * Packed, 22 bytes in total.
 */
struct ieee80211_ht_operation {
        u8 primary_chan;
        u8 ht_param;
        __le16 operation_mode;
        __le16 stbc_param;
        u8 basic_set[16];
} __packed;

/*
 * for ht_param
 * CHA_SEC_OFFSET is a 2-bit field (bits 0-1); the indented CHA_SEC_*
 * entries are the values that field can take.
 */
#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET                0x03
#define                IEEE80211_HT_PARAM_CHA_SEC_NONE                0x00
#define                IEEE80211_HT_PARAM_CHA_SEC_ABOVE        0x01
#define                IEEE80211_HT_PARAM_CHA_SEC_BELOW        0x03
#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY                0x04
#define IEEE80211_HT_PARAM_RIFS_MODE                        0x08

/*
 * for operation_mode
 * PROTECTION is a 2-bit field (bits 0-1) taking the indented PROTECTION_*
 * values. CCFS2 is an 8-bit field in bits 5-12 (0x1fe0 == 0xff << 5).
 */
#define IEEE80211_HT_OP_MODE_PROTECTION                        0x0003
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONE                0
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER        1
#define                IEEE80211_HT_OP_MODE_PROTECTION_20MHZ                2
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED        3
#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT                0x0004
#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT                0x0010
#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT                5
#define IEEE80211_HT_OP_MODE_CCFS2_MASK                        0x1fe0

/* for stbc_param: single-bit flags in bits 6-11 */
#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON                0x0040
#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT                0x0080
#define IEEE80211_HT_STBC_PARAM_STBC_BEACON                0x0100
#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT        0x0200
#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE                0x0400
#define IEEE80211_HT_STBC_PARAM_PCO_PHASE                0x0800


/*
 * block-ack parameters
 * ADDBA: bit 0 A-MSDU, bit 1 policy, bits 2-5 TID, bits 6-15 buffer size.
 * DELBA: bit 11 initiator, bits 12-15 TID.
 */
#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001
#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C
#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0
#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000
#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800

/*
 * A-MPDU buffer sizes
 * According to HT size varies from 8 to 64 frames
 * HE adds the ability to have up to 256 frames.
 * (0x8 == 8, 0x40 == 64, 0x100 == 256)
 */
#define IEEE80211_MIN_AMPDU_BUF                0x8
#define IEEE80211_MAX_AMPDU_BUF_HT        0x40
#define IEEE80211_MAX_AMPDU_BUF                0x100


/* Spatial Multiplexing Power Save Modes (for capability) */
#define WLAN_HT_CAP_SM_PS_STATIC        0
#define WLAN_HT_CAP_SM_PS_DYNAMIC        1
#define WLAN_HT_CAP_SM_PS_INVALID        2
#define WLAN_HT_CAP_SM_PS_DISABLED        3

/*
 * for SM power control field lower two bits
 * Note that this encoding differs from the WLAN_HT_CAP_SM_PS_* values
 * above (e.g. DISABLED is 0 here but 3 there); value 2 is not defined.
 */
#define WLAN_HT_SMPS_CONTROL_DISABLED        0
#define WLAN_HT_SMPS_CONTROL_STATIC        1
#define WLAN_HT_SMPS_CONTROL_DYNAMIC        3

/**
 * struct ieee80211_vht_mcs_info - VHT MCS information
 * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams
 * @rx_highest: Indicates highest long GI VHT PPDU data rate
 *        STA can receive. Rate expressed in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest RX data rate supported.
 *        The top 3 bits of this field indicate the Maximum NSTS,total
 *        (a beamformee capability.)
 * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams
 * @tx_highest: Indicates highest long GI VHT PPDU data rate
 *        STA can transmit. Rate expressed in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest TX data rate supported.
 *        The top 2 bits of this field are reserved, the
 *        3rd bit from the top indicates VHT Extended NSS BW
 *        Capability.
 *
 * All four fields are 16-bit little-endian; the structure is packed and
 * 8 bytes in total.
 */
struct ieee80211_vht_mcs_info {
        __le16 rx_mcs_map;
        __le16 rx_highest;
        __le16 tx_mcs_map;
        __le16 tx_highest;
} __packed;

/* for rx_highest: 3-bit field in the top bits (13-15) */
#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT        13
#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK        (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT)

/* for tx_highest: single flag at bit 13, the 3rd bit from the top */
#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE        (1 << 13)

/**
 * enum ieee80211_vht_mcs_support - VHT MCS support definitions
 * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
 *        number of streams
 * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported
 * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported
 * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported
 *
 * Each of the @rx_mcs_map and @tx_mcs_map fields of
 * &struct ieee80211_vht_mcs_info is divided into 8 two-bit subfields,
 * one per number of streams. A subfield holds one of these values and so
 * tells which MCSes are usable with that number of streams.
 */
enum ieee80211_vht_mcs_support {
        IEEE80211_VHT_MCS_SUPPORT_0_7   = 0x0,
        IEEE80211_VHT_MCS_SUPPORT_0_8   = 0x1,
        IEEE80211_VHT_MCS_SUPPORT_0_9   = 0x2,
        IEEE80211_VHT_MCS_NOT_SUPPORTED = 0x3,
};

/**
 * struct ieee80211_vht_cap - VHT capabilities
 *
 * This structure is the "VHT capabilities element" as
 * described in 802.11ac D3.0 8.4.2.160
 * @vht_cap_info: VHT capability info (32-bit little-endian; see the
 *        IEEE80211_VHT_CAP_* masks)
 * @supp_mcs: VHT MCS supported rates
 *
 * Packed, 12 bytes in total.
 */
struct ieee80211_vht_cap {
        __le32 vht_cap_info;
        struct ieee80211_vht_mcs_info supp_mcs;
} __packed;

/**
 * enum ieee80211_vht_chanwidth - VHT channel width
 * @IEEE80211_VHT_CHANWIDTH_USE_HT: the width (20 or 40 MHz) is to be
 *        taken from the HT operation IE
 * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth
 * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth
 * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth
 */
enum ieee80211_vht_chanwidth {
        IEEE80211_VHT_CHANWIDTH_USE_HT          = 0x0,
        IEEE80211_VHT_CHANWIDTH_80MHZ           = 0x1,
        IEEE80211_VHT_CHANWIDTH_160MHZ          = 0x2,
        IEEE80211_VHT_CHANWIDTH_80P80MHZ        = 0x3,
};

/**
 * struct ieee80211_vht_operation - VHT operation IE
 *
 * This structure is the "VHT operation element" as
 * described in 802.11ac D3.0 8.4.2.161
 * @chan_width: Operating channel width
 * @center_freq_seg0_idx: center freq segment 0 index
 * @center_freq_seg1_idx: center freq segment 1 index
 * @basic_mcs_set: VHT Basic MCS rate set (16-bit little-endian)
 *
 * Packed: @basic_mcs_set sits at byte offset 3 with no padding before it,
 * giving 5 bytes in total.
 */
struct ieee80211_vht_operation {
        u8 chan_width;
        u8 center_freq_seg0_idx;
        u8 center_freq_seg1_idx;
        __le16 basic_mcs_set;
} __packed;

/**
 * struct ieee80211_he_cap_elem - HE capabilities element
 * @mac_cap_info: 6 bytes of HE MAC capabilities information
 * @phy_cap_info: 11 bytes of HE PHY capabilities information
 *
 * This structure is the "HE capabilities element" fixed fields as
 * described in P802.11ax_D4.0 section 9.4.2.242.2 and 9.4.2.242.3
 *
 * Packed, 17 bytes in total.
 */
struct ieee80211_he_cap_elem {
        u8 mac_cap_info[6];
        u8 phy_cap_info[11];
} __packed;

/*
 * Maximum length of a Tx/Rx MCS NSS descriptor, going by the name.
 * NOTE(review): the unit is presumably bytes - confirm against the
 * users of this macro.
 */
#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN        5

/**
 * enum ieee80211_he_mcs_support - HE MCS support definitions
 * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
 *        number of streams
 * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
 * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
 * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
 *
 * Each of the rx_mcs_* and tx_mcs_* fields of
 * &struct ieee80211_he_mcs_nss_supp is divided into 8 two-bit subfields,
 * one per number of streams. A subfield holds one of these values and so
 * tells which MCSes are usable with that number of streams.
 */
enum ieee80211_he_mcs_support {
        IEEE80211_HE_MCS_SUPPORT_0_7    = 0x0,
        IEEE80211_HE_MCS_SUPPORT_0_9    = 0x1,
        IEEE80211_HE_MCS_SUPPORT_0_11   = 0x2,
        IEEE80211_HE_MCS_NOT_SUPPORTED  = 0x3,
};

/**
 * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
 *
 * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
 * described in P802.11ax_D2.0 section 9.4.2.237.4
 *
 * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
 *     widths less than 80MHz.
 * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
 *     widths less than 80MHz.
 * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
 *     width 160MHz.
 * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
 *     width 160MHz.
 * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
 *     channel width 80p80MHz.
 * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
 *     channel width 80p80MHz.
 *
 * NOTE(review): the *_80 maps presumably also cover 80MHz itself (i.e.
 * widths up to and including 80MHz) - confirm against the specification.
 *
 * All six maps are 16-bit little-endian; the structure is packed and
 * 12 bytes in total.
 */
struct ieee80211_he_mcs_nss_supp {
        __le16 rx_mcs_80;
        __le16 tx_mcs_80;
        __le16 rx_mcs_160;
        __le16 tx_mcs_160;
        __le16 rx_mcs_80p80;
        __le16 tx_mcs_80p80;
} __packed;

/**
 * struct ieee80211_he_operation - HE operation element
 * @he_oper_params: 32-bit little-endian HE operation parameters
 * @he_mcs_nss_set: 16-bit little-endian HE MCS/NSS set
 * @optional: start of the optional trailing bytes, whose presence and
 *        length depend on @he_oper_params
 *
 * This structure is the "HE operation element" fields as
 * described in P802.11ax_D4.0 section 9.4.2.243
 */
struct ieee80211_he_operation {
        __le32 he_oper_params;
        __le16 he_mcs_nss_set;
        /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */
        u8 optional[];
} __packed;

/**
 * struct ieee80211_he_spr - HE spatial reuse element
 * @he_sr_control: one-byte SR control field
 * @optional: start of the optional trailing bytes, whose presence and
 *        length depend on @he_sr_control
 *
 * This structure is the "HE spatial reuse element" as
 * described in P802.11ax_D4.0 section 9.4.2.241
 */
struct ieee80211_he_spr {
        u8 he_sr_control;
        /* Optional 0 to 19 bytes: depends on @he_sr_control */
        u8 optional[];
} __packed;

/**
 * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
 * @aifsn: one-byte AIFSN field
 * @ecw_min_max: one-byte ECWmin/ECWmax field
 * @mu_edca_timer: one-byte MU EDCA timer field
 *
 * This structure is the "MU AC Parameter Record" fields as
 * described in P802.11ax_D4.0 section 9.4.2.245
 *
 * Packed, 3 bytes in total.
 */
struct ieee80211_he_mu_edca_param_ac_rec {
        u8 aifsn;
        u8 ecw_min_max;
        u8 mu_edca_timer;
} __packed;

/**
 * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
 * @mu_qos_info: one-byte QoS info field
 * @ac_be: parameter record for AC_BE
 * @ac_bk: parameter record for AC_BK
 * @ac_vi: parameter record for AC_VI
 * @ac_vo: parameter record for AC_VO
 *
 * This structure is the "MU EDCA Parameter Set element" fields as
 * described in P802.11ax_D4.0 section 9.4.2.245
 *
 * Packed, 13 bytes in total (1 + 4 records of 3 bytes).
 */
struct ieee80211_mu_edca_param_set {
        u8 mu_qos_info;
        struct ieee80211_he_mu_edca_param_ac_rec ac_be;
        struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
        struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
        struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
} __packed;

/*
 * 802.11ac VHT Capabilities
 *
 * Multi-bit fields in this list:
 * - MAX_MPDU: bits 0-1, values LENGTH_3895 / LENGTH_7991 / LENGTH_11454
 * - SUPP_CHAN_WIDTH: bits 2-3 (shift 2); the 160MHZ and 160_80PLUS80MHZ
 *   entries are field values 1 and 2 already shifted into place
 * - RXSTBC: bits 8-10 (shift 8); RXSTBC_1..RXSTBC_4 are field values 1..4
 *   already shifted into place, not independent flags
 * - BEAMFORMEE_STS: bits 13-15; SOUNDING_DIMENSIONS: bits 16-18
 * The remaining entries are single-bit flags.
 */
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895                        0x00000000
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991                        0x00000001
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454                        0x00000002
#define IEEE80211_VHT_CAP_MAX_MPDU_MASK                                0x00000003
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ                0x00000004
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ        0x00000008
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK                        0x0000000C
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT                        2
#define IEEE80211_VHT_CAP_RXLDPC                                0x00000010
#define IEEE80211_VHT_CAP_SHORT_GI_80                                0x00000020
#define IEEE80211_VHT_CAP_SHORT_GI_160                                0x00000040
#define IEEE80211_VHT_CAP_TXSTBC                                0x00000080
#define IEEE80211_VHT_CAP_RXSTBC_1                                0x00000100
#define IEEE80211_VHT_CAP_RXSTBC_2                                0x00000200
#define IEEE80211_VHT_CAP_RXSTBC_3                                0x00000300
#define IEEE80211_VHT_CAP_RXSTBC_4                                0x00000400
#define IEEE80211_VHT_CAP_RXSTBC_MASK                                0x00000700
#define IEEE80211_VHT_CAP_RXSTBC_SHIFT                                8
#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE                        0x00000800
#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE                        0x00001000
#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT                  13
#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK                        \
                (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT)
#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT                16
#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK                \
                (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT)
#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE                        0x00080000
#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE                        0x00100000
#define IEEE80211_VHT_CAP_VHT_TXOP_PS                                0x00200000
#define IEEE80211_VHT_CAP_HTC_VHT                                0x00400000
#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT        23
#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK        \
                (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT)
#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB        0x08000000
#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB        0x0c000000
#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN                        0x10000000
#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN                        0x20000000
#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT                        30
#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK                        0xc0000000

/**
 * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS
 * @cap: VHT capabilities of the peer
 * @bw: bandwidth to use
 * @mcs: MCS index to use
 * @ext_nss_bw_capable: indicates whether or not the local transmitter
 *        (rate scaling algorithm) can deal with the new logic
 *        (dot11VHTExtendedNSSBWCapable)
 * @max_vht_nss: current maximum NSS as advertised by the STA in
 *        operating mode notification, can be 0 in which case the
 *        capability data will be used to derive this (from MCS support)
 *
 * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can
 * vary for a given BW/MCS. This function parses the data.
 *
 * Note: This function is exported by cfg80211.
 */
int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
                              enum ieee80211_vht_chanwidth bw,
                              int mcs, bool ext_nss_bw_capable,
                              unsigned int max_vht_nss);

/* 802.11ax HE MAC capabilities */
#define IEEE80211_HE_MAC_CAP0_HTC_HE                                0x01
#define IEEE80211_HE_MAC_CAP0_TWT_REQ                                0x02
#define IEEE80211_HE_MAC_CAP0_TWT_RES                                0x04
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP                0x00
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1                0x08
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2                0x10
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3                0x18
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK                        0x18
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1                0x00
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2                0x20
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4                0x40
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8                0x60
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16                0x80
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32                0xa0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64                0xc0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED        0xe0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK                0xe0

#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED                0x00
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128                        0x01
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256                        0x02
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512                        0x03
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK                0x03
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US                0x00
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US                0x04
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US                0x08
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK                0x0c
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1                0x00
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2                0x10
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3                0x20
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4                0x30
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5                0x40
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6                0x50
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7                0x60
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8                0x70
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK                0x70

/* Link adaptation is split between bytes HE_MAC_CAP1 and
 * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
 * is set, in which case the following values apply:
 * 0 = No feedback.
 * 1 = reserved.
 * 2 = Unsolicited feedback.
 * 3 = both
 */
#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION                        0x80

#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION                        0x01
#define IEEE80211_HE_MAC_CAP2_ALL_ACK                                0x02
#define IEEE80211_HE_MAC_CAP2_TRS                                0x04
#define IEEE80211_HE_MAC_CAP2_BSR                                0x08
#define IEEE80211_HE_MAC_CAP2_BCAST_TWT                                0x10
#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP                        0x20
#define IEEE80211_HE_MAC_CAP2_MU_CASCADING                        0x40
#define IEEE80211_HE_MAC_CAP2_ACK_EN                                0x80

#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL                        0x02
#define IEEE80211_HE_MAC_CAP3_OFDMA_RA                                0x04

/* The maximum length of an A-MPDU is defined by the combination of the Maximum
 * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the
 * same field in the HE capabilities.
 */
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT        0x00
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1                0x08
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2                0x10
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED        0x18
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK                0x18
#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG                        0x20
#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED                        0x40
#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS                0x80

#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT                3

#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG                0x01
#define IEEE80211_HE_MAC_CAP4_QTP                                0x02
#define IEEE80211_HE_MAC_CAP4_BQR                                0x04
#define IEEE80211_HE_MAC_CAP4_SRP_RESP                                0x08
#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP                        0x10
#define IEEE80211_HE_MAC_CAP4_OPS                                0x20
#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU                        0x40
/* Multi TID agg TX is split between byte #4 and #5
 * The value is a combination of B39,B40,B41
 */
#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39                0x80

#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40                0x01
#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41                0x02
#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION        0x04
#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU                        0x08
#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX                0x10
#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS                        0x20
#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING                0x40
#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX                0x80

#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR        20
#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR        16

/* 802.11ax HE PHY capabilities */
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G                0x02
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G        0x04
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G                0x08
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G        0x10
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G        0x20
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G        0x40
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK                        0xfe

#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ        0x01
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ        0x02
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ        0x04
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ        0x08
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK                        0x0f
#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A                                0x10
#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD                        0x20
#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US                0x40
/* Midamble RX/TX Max NSTS is split between byte #1 and byte #2
 * (HE_PHY_CAP1 and HE_PHY_CAP2)
 */
#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS                        0x80

#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS                        0x01
#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US                        0x02
#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ                        0x04
#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ                        0x08
#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX                                0x10
#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX                                0x20

/* Note that the meaning of UL MU below is different between an AP and a non-AP
 * sta, where in the AP case it indicates support for Rx and in the non-AP sta
 * case it indicates support for Tx.
 */
#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO                        0x40
#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO                        0x80

#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM                        0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK                        0x01
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK                        0x02
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM                        0x03
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK                        0x03
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1                                0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2                                0x04
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM                        0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK                        0x08
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK                        0x10
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM                        0x18
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK                        0x18
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1                                0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2                                0x20
#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA                0x40
#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER                                0x80

#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE                                0x01
#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER                                0x02

/* Minimal allowed value of Max STS under 80MHz is 3 */
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4                0x0c
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5                0x10
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6                0x14
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7                0x18
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8                0x1c
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK        0x1c

/* Minimal allowed value of Max STS above 80MHz is 3 */
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4                0x60
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5                0x80
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6                0xa0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7                0xc0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8                0xe0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK        0xe0

#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1        0x00
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2        0x01
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3        0x02
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4        0x03
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5        0x04
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6        0x05
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7        0x06
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8        0x07
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK        0x07

#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1        0x00
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2        0x08
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3        0x10
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4        0x18
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5        0x20
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6        0x28
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7        0x30
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8        0x38
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK        0x38

#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK                                0x40
#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK                                0x80

#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU                        0x01
#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU                        0x02
#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB                        0x04
#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB                        0x08
#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB                                0x10
#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE                        0x20
#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO                0x40
#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT                        0x80

#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR                                0x01
#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR                        0x02
#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI                0x04
#define IEEE80211_HE_PHY_CAP7_MAX_NC_1                                        0x08
#define IEEE80211_HE_PHY_CAP7_MAX_NC_2                                        0x10
#define IEEE80211_HE_PHY_CAP7_MAX_NC_3                                        0x18
#define IEEE80211_HE_PHY_CAP7_MAX_NC_4                                        0x20
#define IEEE80211_HE_PHY_CAP7_MAX_NC_5                                        0x28
#define IEEE80211_HE_PHY_CAP7_MAX_NC_6                                        0x30
#define IEEE80211_HE_PHY_CAP7_MAX_NC_7                                        0x38
#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK                                0x38
#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ                        0x40
#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ                        0x80

#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI                0x01
#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G                0x02
#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU                        0x04
#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU                        0x08
#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI                0x10
#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF                0x20
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242                                0x00
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484                                0x40
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996                                0x80
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996                                0xc0
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK                                0xc0

#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM                0x01
#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK                0x02
#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU                0x04
#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU                0x08
#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB        0x10
#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB        0x20
#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_0US                        0x00
#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_8US                        0x40
#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_16US                        0x80
#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED                0xc0
#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK                        0xc0

/* 802.11ax HE TX/RX MCS NSS Support  */
#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS                        (3)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS                        (6)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS                        (11)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK                        0x07c0
#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK                        0xf800

/*
 * Encoding of the Highest MCS subfield of the TX/RX HE MCS Support field.
 * An encoded value of N stands for MCS (7 + N).
 */
enum ieee80211_he_highest_mcs_supported_subfield_enc {
        HIGHEST_MCS_SUPPORTED_MCS7  = 0,
        HIGHEST_MCS_SUPPORTED_MCS8  = 1,
        HIGHEST_MCS_SUPPORTED_MCS9  = 2,
        HIGHEST_MCS_SUPPORTED_MCS10 = 3,
        HIGHEST_MCS_SUPPORTED_MCS11 = 4,
};

/*
 * Size of the Tx/Rx HE MCS NSS Support field of the 802.11ax HE
 * capabilities IE: 4 bytes are always counted, and 4 more for each of the
 * 160 MHz and 80+80 MHz channel width bits set in PHY capability byte 0.
 */
static inline u8
ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
{
        const u8 width_set = he_cap->phy_cap_info[0];
        u8 size = 4;

        size += (width_set &
                 IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) ? 4 : 0;
        size += (width_set &
                 IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) ? 4 : 0;

        return size;
}

/* 802.11ax HE PPE Thresholds */
#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS                        (1)
#define IEEE80211_PPE_THRES_NSS_POS                                (0)
#define IEEE80211_PPE_THRES_NSS_MASK                                (7)
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU        \
        (BIT(5) | BIT(6))
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK                0x78
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS                (3)
#define IEEE80211_PPE_THRES_INFO_PPET_SIZE                        (3)

/*
 * Calculate the size in bytes of the PPE thresholds field of the 802.11ax
 * HE capabilities IE.
 *
 * @ppe_thres_hdr: first byte of the PPE thresholds field; bits 0-2 are the
 *        NSS field and bits 3-6 the RU index bitmask
 * @phy_cap_info: the HE capabilities IE's PHY capability bytes; byte 6 is
 *        read here
 *
 * Returns 0 when the PHY capabilities do not flag PPE thresholds as present.
 */
static inline u8
ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
{
        u8 ru_count, nss_count, bits;

        if (!(phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT))
                return 0;

        /* One threshold pair per (RU bit set, spatial stream) combination */
        ru_count = hweight8(ppe_thres_hdr &
                            IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
        nss_count = 1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
                         IEEE80211_PPE_THRES_NSS_POS);
        bits = ru_count * nss_count;

        /* 6 bits per pair (two PPET values), plus the 7 "header" bits */
        bits = (bits * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;

        /* Round the bit count up to whole bytes */
        return DIV_ROUND_UP(bits, 8);
}

/* HE Operation defines */
#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK                0x00000007
#define IEEE80211_HE_OPERATION_TWT_REQUIRED                        0x00000008
#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK                0x00003ff0
#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET                4
#define IEEE80211_HE_OPERATION_VHT_OPER_INFO                        0x00004000
#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS                        0x00008000
#define IEEE80211_HE_OPERATION_ER_SU_DISABLE                        0x00010000
#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO                        0x00020000
#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK                        0x3f000000
#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET                        24
#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR                0x40000000
#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED                0x80000000

/**
 * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
 * @primary: primary channel
 * @control: control flags, see the IEEE80211_HE_6GHZ_OPER_CTRL_* defines
 *        (bits 0-1 channel width, bit 2 duplicate beacon)
 * @ccfs0: channel center frequency segment 0
 * @ccfs1: channel center frequency segment 1
 * @minrate: minimum rate (in 1 Mbps units)
 *
 * The field is packed and five bytes long; ieee80211_he_oper_size() adds
 * sizeof() of this structure when the 6 GHz operation info is present.
 */
struct ieee80211_he_6ghz_oper {
        u8 primary;
#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH        0x3
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ        0
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ        1
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ        2
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ        3
#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON        0x4
        u8 control;
        u8 ccfs0;
        u8 ccfs1;
        u8 minrate;
} __packed;

/*
 * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
 * @he_oper_ie: byte data of the He Operations IE, stating from the byte
 *        after the ext ID byte. It is assumed that he_oper_ie has at least
 *        sizeof(struct ieee80211_he_operation) bytes, the caller must have
 *        validated this.
 * @return the actual size of the IE data (not including header), or 0 on error
 */
static inline u8
ieee80211_he_oper_size(const u8 *he_oper_ie)
{
        struct ieee80211_he_operation *he_oper = (void *)he_oper_ie;
        u8 oper_len = sizeof(struct ieee80211_he_operation);
        u32 he_oper_params;

        /* Make sure the input is not NULL */
        if (!he_oper_ie)
                return 0;

        /* Calc required length */
        he_oper_params = le32_to_cpu(he_oper->he_oper_params);
        if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
                oper_len += 3;
        if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
                oper_len++;
        if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)
                oper_len += sizeof(struct ieee80211_he_6ghz_oper);

        /* Add the first byte (extension ID) to the total length */
        oper_len++;

        return oper_len;
}

/**
 * ieee80211_he_6ghz_oper - obtain 6 GHz operation field
 * @he_oper: HE operation element (must be pre-validated for size)
 *        but may be %NULL
 *
 * The 6 GHz operation field lives in the optional part of the element,
 * after the 3-byte VHT operation info and the 1-byte co-hosted BSS field
 * whenever he_oper_params flags those as present.
 *
 * Return: a pointer to the 6 GHz operation field, or %NULL if @he_oper is
 * %NULL or the element does not carry 6 GHz operation info
 */
static inline const struct ieee80211_he_6ghz_oper *
ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
{
        const u8 *ret;
        u32 he_oper_params;

        if (!he_oper)
                return NULL;

        /*
         * Form the member address only after the NULL check, so no pointer
         * arithmetic is ever done on a NULL he_oper.
         */
        ret = (const void *)&he_oper->optional;

        he_oper_params = le32_to_cpu(he_oper->he_oper_params);

        if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
                return NULL;
        /* Skip the optional fields that precede the 6 GHz operation info */
        if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
                ret += 3;
        if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
                ret++;

        return (const void *)ret;
}

/* HE Spatial Reuse defines */
#define IEEE80211_HE_SPR_PSR_DISALLOWED                                BIT(0)
#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED                BIT(1)
#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT                        BIT(2)
#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT                BIT(3)
#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED                BIT(4)

/*
 * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size
 * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte
 *        after the ext ID byte. It is assumed that he_spr_ie has at least
 *        sizeof(struct ieee80211_he_spr) bytes, the caller must have validated
 *        this
 * @return the actual size of the IE data (not including header), or 0 on error
 */
static inline u8
ieee80211_he_spr_size(const u8 *he_spr_ie)
{
        struct ieee80211_he_spr *he_spr = (void *)he_spr_ie;
        u8 spr_len = sizeof(struct ieee80211_he_spr);
        u8 he_spr_params;

        /* Make sure the input is not NULL */
        if (!he_spr_ie)
                return 0;

        /* Calc required length */
        he_spr_params = he_spr->he_sr_control;
        if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT)
                spr_len++;
        if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT)
                spr_len += 18;

        /* Add the first byte (extension ID) to the total length */
        spr_len++;

        return spr_len;
}

/* S1G Capabilities Information field */
#define IEEE80211_S1G_CAPABILITY_LEN        15

#define S1G_CAP0_S1G_LONG        BIT(0)
#define S1G_CAP0_SGI_1MHZ        BIT(1)
#define S1G_CAP0_SGI_2MHZ        BIT(2)
#define S1G_CAP0_SGI_4MHZ        BIT(3)
#define S1G_CAP0_SGI_8MHZ        BIT(4)
#define S1G_CAP0_SGI_16MHZ        BIT(5)
#define S1G_CAP0_SUPP_CH_WIDTH        GENMASK(7, 6)

#define S1G_SUPP_CH_WIDTH_2        0
#define S1G_SUPP_CH_WIDTH_4        1
#define S1G_SUPP_CH_WIDTH_8        2
#define S1G_SUPP_CH_WIDTH_16        3
/*
 * Decode the supported channel width field of capability byte 0 into a
 * width: encoded values 0..3 (S1G_SUPP_CH_WIDTH_2/4/8/16) yield 2, 4, 8
 * and 16 (presumably MHz). @cap is parenthesized so that any pointer
 * expression can be passed, not just a plain identifier.
 */
#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \
                                                    (cap)[0])) << 1)

#define S1G_CAP1_RX_LDPC        BIT(0)
#define S1G_CAP1_TX_STBC        BIT(1)
#define S1G_CAP1_RX_STBC        BIT(2)
#define S1G_CAP1_SU_BFER        BIT(3)
#define S1G_CAP1_SU_BFEE        BIT(4)
#define S1G_CAP1_BFEE_STS        GENMASK(7, 5)

#define S1G_CAP2_SOUNDING_DIMENSIONS        GENMASK(2, 0)
#define S1G_CAP2_MU_BFER                BIT(3)
#define S1G_CAP2_MU_BFEE                BIT(4)
#define S1G_CAP2_PLUS_HTC_VHT                BIT(5)
#define S1G_CAP2_TRAVELING_PILOT        GENMASK(7, 6)

#define S1G_CAP3_RD_RESPONDER                BIT(0)
#define S1G_CAP3_HT_DELAYED_BA                BIT(1)
#define S1G_CAP3_MAX_MPDU_LEN                BIT(2)
#define S1G_CAP3_MAX_AMPDU_LEN_EXP        GENMASK(4, 3)
#define S1G_CAP3_MIN_MPDU_START                GENMASK(7, 5)

#define S1G_CAP4_UPLINK_SYNC        BIT(0)
#define S1G_CAP4_DYNAMIC_AID        BIT(1)
#define S1G_CAP4_BAT                BIT(2)
#define S1G_CAP4_TIME_ADE        BIT(3)
#define S1G_CAP4_NON_TIM        BIT(4)
#define S1G_CAP4_GROUP_AID        BIT(5)
#define S1G_CAP4_STA_TYPE        GENMASK(7, 6)

#define S1G_CAP5_CENT_AUTH_CONTROL        BIT(0)
#define S1G_CAP5_DIST_AUTH_CONTROL        BIT(1)
#define S1G_CAP5_AMSDU                        BIT(2)
#define S1G_CAP5_AMPDU                        BIT(3)
#define S1G_CAP5_ASYMMETRIC_BA                BIT(4)
#define S1G_CAP5_FLOW_CONTROL                BIT(5)
#define S1G_CAP5_SECTORIZED_BEAM        GENMASK(7, 6)

#define S1G_CAP6_OBSS_MITIGATION        BIT(0)
#define S1G_CAP6_FRAGMENT_BA                BIT(1)
#define S1G_CAP6_NDP_PS_POLL                BIT(2)
#define S1G_CAP6_RAW_OPERATION                BIT(3)
#define S1G_CAP6_PAGE_SLICING                BIT(4)
#define S1G_CAP6_TXOP_SHARING_IMP_ACK        BIT(5)
#define S1G_CAP6_VHT_LINK_ADAPT                GENMASK(7, 6)

#define S1G_CAP7_TACK_AS_PS_POLL                BIT(0)
#define S1G_CAP7_DUP_1MHZ                        BIT(1)
#define S1G_CAP7_MCS_NEGOTIATION                BIT(2)
#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE        BIT(3)
#define S1G_CAP7_NDP_BFING_REPORT_POLL                BIT(4)
#define S1G_CAP7_UNSOLICITED_DYN_AID                BIT(5)
#define S1G_CAP7_SECTOR_TRAINING_OPERATION        BIT(6)
#define S1G_CAP7_TEMP_PS_MODE_SWITCH                BIT(7)

#define S1G_CAP8_TWT_GROUPING        BIT(0)
#define S1G_CAP8_BDT                BIT(1)
#define S1G_CAP8_COLOR                GENMASK(4, 2)
#define S1G_CAP8_TWT_REQUEST        BIT(5)
#define S1G_CAP8_TWT_RESPOND        BIT(6)
#define S1G_CAP8_PV1_FRAME        BIT(7)

#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0)

#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ        BIT(0)
#define S1G_OPER_CH_WIDTH_OPER                GENMASK(4, 1)


#define LISTEN_INT_USF        GENMASK(15, 14)
#define LISTEN_INT_UI        GENMASK(13, 0)

#define IEEE80211_MAX_USF        FIELD_MAX(LISTEN_INT_USF)
#define IEEE80211_MAX_UI        FIELD_MAX(LISTEN_INT_UI)

/* Authentication algorithms */
#define WLAN_AUTH_OPEN 0
#define WLAN_AUTH_SHARED_KEY 1
#define WLAN_AUTH_FT 2
#define WLAN_AUTH_SAE 3
#define WLAN_AUTH_FILS_SK 4
#define WLAN_AUTH_FILS_SK_PFS 5
#define WLAN_AUTH_FILS_PK 6
#define WLAN_AUTH_LEAP 128

#define WLAN_AUTH_CHALLENGE_LEN 128

#define WLAN_CAPABILITY_ESS                (1<<0)
#define WLAN_CAPABILITY_IBSS                (1<<1)

/*
 * True when neither the ESS nor the IBSS capability bit is set in @cap.
 * A mesh STA sets both bits to zero; note, however, that the same holds
 * for p2p probe responses (in the p2p_find phase).
 */
#define WLAN_CAPABILITY_IS_STA_BSS(cap)        \
        (((cap) & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)) == 0)

/* Capability bits 2..7 */
#define WLAN_CAPABILITY_CF_POLLABLE     (1 << 2)
#define WLAN_CAPABILITY_CF_POLL_REQUEST (1 << 3)
#define WLAN_CAPABILITY_PRIVACY         (1 << 4)
#define WLAN_CAPABILITY_SHORT_PREAMBLE  (1 << 5)
#define WLAN_CAPABILITY_PBCC            (1 << 6)
#define WLAN_CAPABILITY_CHANNEL_AGILITY (1 << 7)

/* 802.11h */
#define WLAN_CAPABILITY_SPECTRUM_MGMT   (1 << 8)
#define WLAN_CAPABILITY_QOS             (1 << 9)
#define WLAN_CAPABILITY_SHORT_SLOT_TIME (1 << 10)
#define WLAN_CAPABILITY_APSD            (1 << 11)
#define WLAN_CAPABILITY_RADIO_MEASURE   (1 << 12)
#define WLAN_CAPABILITY_DSSS_OFDM       (1 << 13)
#define WLAN_CAPABILITY_DEL_BACK        (1 << 14)
#define WLAN_CAPABILITY_IMM_BACK        (1 << 15)

/* DMG (60 GHz) 802.11ad */
/* type - a two-bit field in bits 0..1 */
#define WLAN_CAPABILITY_DMG_TYPE_MASK     (3 << 0)
#define WLAN_CAPABILITY_DMG_TYPE_IBSS     (1 << 0) /* Tx by: STA */
#define WLAN_CAPABILITY_DMG_TYPE_PBSS     (2 << 0) /* Tx by: PCP */
#define WLAN_CAPABILITY_DMG_TYPE_AP       (3 << 0) /* Tx by: AP */

/* single-bit DMG flags */
#define WLAN_CAPABILITY_DMG_CBAP_ONLY     (1 << 2)
#define WLAN_CAPABILITY_DMG_CBAP_SOURCE   (1 << 3)
#define WLAN_CAPABILITY_DMG_PRIVACY       (1 << 4)
#define WLAN_CAPABILITY_DMG_ECPAC         (1 << 5)

#define WLAN_CAPABILITY_DMG_SPECTRUM_MGMT (1 << 8)
#define WLAN_CAPABILITY_DMG_RADIO_MEASURE (1 << 12)

/* measurement report: mode flags (one bit each) */
#define IEEE80211_SPCT_MSR_RPRT_MODE_LATE      (1 << 0)
#define IEEE80211_SPCT_MSR_RPRT_MODE_INCAPABLE (1 << 1)
#define IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED   (1 << 2)

/* measurement report: type numbers */
#define IEEE80211_SPCT_MSR_RPRT_TYPE_BASIC     0
#define IEEE80211_SPCT_MSR_RPRT_TYPE_CCA       1
#define IEEE80211_SPCT_MSR_RPRT_TYPE_RPI       2
#define IEEE80211_SPCT_MSR_RPRT_TYPE_LCI       8
#define IEEE80211_SPCT_MSR_RPRT_TYPE_CIVIC     11

/* 802.11g ERP information element: flag bits */
#define WLAN_ERP_NON_ERP_PRESENT (1 << 0)
#define WLAN_ERP_USE_PROTECTION  (1 << 1)
#define WLAN_ERP_BARKER_PREAMBLE (1 << 2)

/* Values taken by the WLAN_ERP_BARKER_PREAMBLE bit */
enum {
        WLAN_ERP_PREAMBLE_SHORT = 0,
        WLAN_ERP_PREAMBLE_LONG  = 1,
};

/* Band ID, 802.11ad #8.4.1.45 */
enum {
        /* TV white spaces */
        IEEE80211_BANDID_TV_WS = 0,
        /* Sub-1 GHz (excluding TV white spaces) */
        IEEE80211_BANDID_SUB1 = 1,
        /* 2.4 GHz */
        IEEE80211_BANDID_2G = 2,
        /* 3.6 GHz */
        IEEE80211_BANDID_3G = 3,
        /* 4.9 and 5 GHz */
        IEEE80211_BANDID_5G = 4,
        /* 60 GHz */
        IEEE80211_BANDID_60G = 5,
};

/*
 * Status codes
 *
 * Entries are grouped by the amendment named in each section comment, not
 * sorted by value.  Two values carry two names each:
 *   39 - WLAN_STATUS_CHANGE_TSPEC / WLAN_STATUS_REJECTED_WITH_SUGGESTED_CHANGES
 *   47 - WLAN_STATUS_WAIT_TS_DELAY / WLAN_STATUS_REJECTED_FOR_DELAY_PERIOD
 */
enum ieee80211_statuscode {
        WLAN_STATUS_SUCCESS = 0,
        WLAN_STATUS_UNSPECIFIED_FAILURE = 1,
        WLAN_STATUS_CAPS_UNSUPPORTED = 10,
        WLAN_STATUS_REASSOC_NO_ASSOC = 11,
        WLAN_STATUS_ASSOC_DENIED_UNSPEC = 12,
        WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG = 13,
        WLAN_STATUS_UNKNOWN_AUTH_TRANSACTION = 14,
        WLAN_STATUS_CHALLENGE_FAIL = 15,
        WLAN_STATUS_AUTH_TIMEOUT = 16,
        WLAN_STATUS_AP_UNABLE_TO_HANDLE_NEW_STA = 17,
        WLAN_STATUS_ASSOC_DENIED_RATES = 18,
        /* 802.11b */
        WLAN_STATUS_ASSOC_DENIED_NOSHORTPREAMBLE = 19,
        WLAN_STATUS_ASSOC_DENIED_NOPBCC = 20,
        WLAN_STATUS_ASSOC_DENIED_NOAGILITY = 21,
        /* 802.11h */
        WLAN_STATUS_ASSOC_DENIED_NOSPECTRUM = 22,
        WLAN_STATUS_ASSOC_REJECTED_BAD_POWER = 23,
        WLAN_STATUS_ASSOC_REJECTED_BAD_SUPP_CHAN = 24,
        /* 802.11g */
        WLAN_STATUS_ASSOC_DENIED_NOSHORTTIME = 25,
        WLAN_STATUS_ASSOC_DENIED_NODSSSOFDM = 26,
        /* 802.11w */
        WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY = 30,
        WLAN_STATUS_ROBUST_MGMT_FRAME_POLICY_VIOLATION = 31,
        /* 802.11i */
        WLAN_STATUS_INVALID_IE = 40,
        WLAN_STATUS_INVALID_GROUP_CIPHER = 41,
        WLAN_STATUS_INVALID_PAIRWISE_CIPHER = 42,
        WLAN_STATUS_INVALID_AKMP = 43,
        WLAN_STATUS_UNSUPP_RSN_VERSION = 44,
        WLAN_STATUS_INVALID_RSN_IE_CAP = 45,
        WLAN_STATUS_CIPHER_SUITE_REJECTED = 46,
        /* 802.11e */
        WLAN_STATUS_UNSPECIFIED_QOS = 32,
        WLAN_STATUS_ASSOC_DENIED_NOBANDWIDTH = 33,
        WLAN_STATUS_ASSOC_DENIED_LOWACK = 34,
        WLAN_STATUS_ASSOC_DENIED_UNSUPP_QOS = 35,
        WLAN_STATUS_REQUEST_DECLINED = 37,
        WLAN_STATUS_INVALID_QOS_PARAM = 38,
        WLAN_STATUS_CHANGE_TSPEC = 39,
        WLAN_STATUS_WAIT_TS_DELAY = 47,
        WLAN_STATUS_NO_DIRECT_LINK = 48,
        WLAN_STATUS_STA_NOT_PRESENT = 49,
        WLAN_STATUS_STA_NOT_QSTA = 50,
        /* 802.11s */
        WLAN_STATUS_ANTI_CLOG_REQUIRED = 76,
        /*
         * Finite cyclic group not supported is status code 77; it was
         * previously defined as 78, which collided with STA_NO_TBTT below.
         */
        WLAN_STATUS_FCG_NOT_SUPP = 77,
        WLAN_STATUS_STA_NO_TBTT = 78,
        /* 802.11ad */
        WLAN_STATUS_REJECTED_WITH_SUGGESTED_CHANGES = 39,
        WLAN_STATUS_REJECTED_FOR_DELAY_PERIOD = 47,
        WLAN_STATUS_REJECT_WITH_SCHEDULE = 83,
        WLAN_STATUS_PENDING_ADMITTING_FST_SESSION = 86,
        WLAN_STATUS_PERFORMING_FST_NOW = 87,
        WLAN_STATUS_PENDING_GAP_IN_BA_WINDOW = 88,
        WLAN_STATUS_REJECT_U_PID_SETTING = 89,
        WLAN_STATUS_REJECT_DSE_BAND = 96,
        WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99,
        WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103,
        /* 802.11ai */
        WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112,
        WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113,
        WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126,
        WLAN_STATUS_SAE_PK = 127,
};


/* Reason codes, listed in ascending order and grouped by amendment */
enum ieee80211_reasoncode {
        WLAN_REASON_UNSPECIFIED                    = 1,
        WLAN_REASON_PREV_AUTH_NOT_VALID            = 2,
        WLAN_REASON_DEAUTH_LEAVING                 = 3,
        WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY     = 4,
        WLAN_REASON_DISASSOC_AP_BUSY               = 5,
        WLAN_REASON_CLASS2_FRAME_FROM_NONAUTH_STA  = 6,
        WLAN_REASON_CLASS3_FRAME_FROM_NONASSOC_STA = 7,
        WLAN_REASON_DISASSOC_STA_HAS_LEFT          = 8,
        WLAN_REASON_STA_REQ_ASSOC_WITHOUT_AUTH     = 9,

        /* 802.11h */
        WLAN_REASON_DISASSOC_BAD_POWER             = 10,
        WLAN_REASON_DISASSOC_BAD_SUPP_CHAN         = 11,

        /* 802.11i */
        WLAN_REASON_INVALID_IE                     = 13,
        WLAN_REASON_MIC_FAILURE                    = 14,
        WLAN_REASON_4WAY_HANDSHAKE_TIMEOUT         = 15,
        WLAN_REASON_GROUP_KEY_HANDSHAKE_TIMEOUT    = 16,
        WLAN_REASON_IE_DIFFERENT                   = 17,
        WLAN_REASON_INVALID_GROUP_CIPHER           = 18,
        WLAN_REASON_INVALID_PAIRWISE_CIPHER        = 19,
        WLAN_REASON_INVALID_AKMP                   = 20,
        WLAN_REASON_UNSUPP_RSN_VERSION             = 21,
        WLAN_REASON_INVALID_RSN_IE_CAP             = 22,
        WLAN_REASON_IEEE8021X_FAILED               = 23,
        WLAN_REASON_CIPHER_SUITE_REJECTED          = 24,

        /* TDLS (802.11z) */
        WLAN_REASON_TDLS_TEARDOWN_UNREACHABLE      = 25,
        WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED      = 26,

        /* 802.11e */
        WLAN_REASON_DISASSOC_UNSPECIFIED_QOS       = 32,
        WLAN_REASON_DISASSOC_QAP_NO_BANDWIDTH      = 33,
        WLAN_REASON_DISASSOC_LOW_ACK               = 34,
        WLAN_REASON_DISASSOC_QAP_EXCEED_TXOP       = 35,
        WLAN_REASON_QSTA_LEAVE_QBSS                = 36,
        WLAN_REASON_QSTA_NOT_USE                   = 37,
        WLAN_REASON_QSTA_REQUIRE_SETUP             = 38,
        WLAN_REASON_QSTA_TIMEOUT                   = 39,
        WLAN_REASON_QSTA_CIPHER_NOT_SUPP           = 45,

        /* 802.11s */
        WLAN_REASON_MESH_PEER_CANCELED             = 52,
        WLAN_REASON_MESH_MAX_PEERS                 = 53,
        WLAN_REASON_MESH_CONFIG                    = 54,
        WLAN_REASON_MESH_CLOSE                     = 55,
        WLAN_REASON_MESH_MAX_RETRIES               = 56,
        WLAN_REASON_MESH_CONFIRM_TIMEOUT           = 57,
        WLAN_REASON_MESH_INVALID_GTK               = 58,
        WLAN_REASON_MESH_INCONSISTENT_PARAM        = 59,
        WLAN_REASON_MESH_INVALID_SECURITY          = 60,
        WLAN_REASON_MESH_PATH_ERROR                = 61,
        WLAN_REASON_MESH_PATH_NOFORWARD            = 62,
        WLAN_REASON_MESH_PATH_DEST_UNREACHABLE     = 63,
        WLAN_REASON_MAC_EXISTS_IN_MBSS             = 64,
        WLAN_REASON_MESH_CHAN_REGULATORY           = 65,
        WLAN_REASON_MESH_CHAN                      = 66,
};


/*
 * Information Element IDs
 *
 * Listed in ascending numeric order; gaps are unassigned here or reserved
 * as noted inline.  Two historical misspellings (WLAN_EID_TSF_RESPOSNE and
 * WLAN_EID_CLISTER_TIME_OFF) are kept as aliases of the correctly spelled
 * names so that existing users keep compiling.
 */
enum ieee80211_eid {
        WLAN_EID_SSID = 0,
        WLAN_EID_SUPP_RATES = 1,
        WLAN_EID_FH_PARAMS = 2, /* reserved now */
        WLAN_EID_DS_PARAMS = 3,
        WLAN_EID_CF_PARAMS = 4,
        WLAN_EID_TIM = 5,
        WLAN_EID_IBSS_PARAMS = 6,
        WLAN_EID_COUNTRY = 7,
        /* 8, 9 reserved */
        WLAN_EID_REQUEST = 10,
        WLAN_EID_QBSS_LOAD = 11,
        WLAN_EID_EDCA_PARAM_SET = 12,
        WLAN_EID_TSPEC = 13,
        WLAN_EID_TCLAS = 14,
        WLAN_EID_SCHEDULE = 15,
        WLAN_EID_CHALLENGE = 16,
        /* 17-31 reserved for challenge text extension */
        WLAN_EID_PWR_CONSTRAINT = 32,
        WLAN_EID_PWR_CAPABILITY = 33,
        WLAN_EID_TPC_REQUEST = 34,
        WLAN_EID_TPC_REPORT = 35,
        WLAN_EID_SUPPORTED_CHANNELS = 36,
        WLAN_EID_CHANNEL_SWITCH = 37,
        WLAN_EID_MEASURE_REQUEST = 38,
        WLAN_EID_MEASURE_REPORT = 39,
        WLAN_EID_QUIET = 40,
        WLAN_EID_IBSS_DFS = 41,
        WLAN_EID_ERP_INFO = 42,
        WLAN_EID_TS_DELAY = 43,
        WLAN_EID_TCLAS_PROCESSING = 44,
        WLAN_EID_HT_CAPABILITY = 45,
        WLAN_EID_QOS_CAPA = 46,
        /* 47 reserved for Broadcom */
        WLAN_EID_RSN = 48,
        WLAN_EID_802_15_COEX = 49,
        WLAN_EID_EXT_SUPP_RATES = 50,
        WLAN_EID_AP_CHAN_REPORT = 51,
        WLAN_EID_NEIGHBOR_REPORT = 52,
        WLAN_EID_RCPI = 53,
        WLAN_EID_MOBILITY_DOMAIN = 54,
        WLAN_EID_FAST_BSS_TRANSITION = 55,
        WLAN_EID_TIMEOUT_INTERVAL = 56,
        WLAN_EID_RIC_DATA = 57,
        WLAN_EID_DSE_REGISTERED_LOCATION = 58,
        WLAN_EID_SUPPORTED_REGULATORY_CLASSES = 59,
        WLAN_EID_EXT_CHANSWITCH_ANN = 60,
        WLAN_EID_HT_OPERATION = 61,
        WLAN_EID_SECONDARY_CHANNEL_OFFSET = 62,
        WLAN_EID_BSS_AVG_ACCESS_DELAY = 63,
        WLAN_EID_ANTENNA_INFO = 64,
        WLAN_EID_RSNI = 65,
        WLAN_EID_MEASUREMENT_PILOT_TX_INFO = 66,
        WLAN_EID_BSS_AVAILABLE_CAPACITY = 67,
        WLAN_EID_BSS_AC_ACCESS_DELAY = 68,
        WLAN_EID_TIME_ADVERTISEMENT = 69,
        WLAN_EID_RRM_ENABLED_CAPABILITIES = 70,
        WLAN_EID_MULTIPLE_BSSID = 71,
        WLAN_EID_BSS_COEX_2040 = 72,
        WLAN_EID_BSS_INTOLERANT_CHL_REPORT = 73,
        WLAN_EID_OVERLAP_BSS_SCAN_PARAM = 74,
        WLAN_EID_RIC_DESCRIPTOR = 75,
        WLAN_EID_MMIE = 76,
        WLAN_EID_ASSOC_COMEBACK_TIME = 77,
        WLAN_EID_EVENT_REQUEST = 78,
        WLAN_EID_EVENT_REPORT = 79,
        WLAN_EID_DIAGNOSTIC_REQUEST = 80,
        WLAN_EID_DIAGNOSTIC_REPORT = 81,
        WLAN_EID_LOCATION_PARAMS = 82,
        WLAN_EID_NON_TX_BSSID_CAP = 83,
        WLAN_EID_SSID_LIST = 84,
        WLAN_EID_MULTI_BSSID_IDX = 85,
        WLAN_EID_FMS_DESCRIPTOR = 86,
        WLAN_EID_FMS_REQUEST = 87,
        WLAN_EID_FMS_RESPONSE = 88,
        WLAN_EID_QOS_TRAFFIC_CAPA = 89,
        WLAN_EID_BSS_MAX_IDLE_PERIOD = 90,
        WLAN_EID_TSF_REQUEST = 91,
        WLAN_EID_TSF_RESPONSE = 92,
        /* misspelled legacy name, kept for existing users */
        WLAN_EID_TSF_RESPOSNE = WLAN_EID_TSF_RESPONSE,
        WLAN_EID_WNM_SLEEP_MODE = 93,
        WLAN_EID_TIM_BCAST_REQ = 94,
        WLAN_EID_TIM_BCAST_RESP = 95,
        WLAN_EID_COLL_IF_REPORT = 96,
        WLAN_EID_CHANNEL_USAGE = 97,
        WLAN_EID_TIME_ZONE = 98,
        WLAN_EID_DMS_REQUEST = 99,
        WLAN_EID_DMS_RESPONSE = 100,
        WLAN_EID_LINK_ID = 101,
        /* distinct from WLAN_EID_WAKEUP_SCHEDULE (143) further down */
        WLAN_EID_WAKEUP_SCHEDUL = 102,
        /* 103 reserved */
        WLAN_EID_CHAN_SWITCH_TIMING = 104,
        WLAN_EID_PTI_CONTROL = 105,
        WLAN_EID_PU_BUFFER_STATUS = 106,
        WLAN_EID_INTERWORKING = 107,
        WLAN_EID_ADVERTISEMENT_PROTOCOL = 108,
        WLAN_EID_EXPEDITED_BW_REQ = 109,
        WLAN_EID_QOS_MAP_SET = 110,
        WLAN_EID_ROAMING_CONSORTIUM = 111,
        WLAN_EID_EMERGENCY_ALERT = 112,
        WLAN_EID_MESH_CONFIG = 113,
        WLAN_EID_MESH_ID = 114,
        WLAN_EID_LINK_METRIC_REPORT = 115,
        WLAN_EID_CONGESTION_NOTIFICATION = 116,
        WLAN_EID_PEER_MGMT = 117,
        WLAN_EID_CHAN_SWITCH_PARAM = 118,
        WLAN_EID_MESH_AWAKE_WINDOW = 119,
        WLAN_EID_BEACON_TIMING = 120,
        WLAN_EID_MCCAOP_SETUP_REQ = 121,
        WLAN_EID_MCCAOP_SETUP_RESP = 122,
        WLAN_EID_MCCAOP_ADVERT = 123,
        WLAN_EID_MCCAOP_TEARDOWN = 124,
        WLAN_EID_GANN = 125,
        WLAN_EID_RANN = 126,
        WLAN_EID_EXT_CAPABILITY = 127,
        /* 128, 129 reserved for Agere */
        WLAN_EID_PREQ = 130,
        WLAN_EID_PREP = 131,
        WLAN_EID_PERR = 132,
        /* 133-136 reserved for Cisco */
        WLAN_EID_PXU = 137,
        WLAN_EID_PXUC = 138,
        WLAN_EID_AUTH_MESH_PEER_EXCH = 139,
        WLAN_EID_MIC = 140,
        WLAN_EID_DESTINATION_URI = 141,
        WLAN_EID_UAPSD_COEX = 142,
        WLAN_EID_WAKEUP_SCHEDULE = 143,
        WLAN_EID_EXT_SCHEDULE = 144,
        WLAN_EID_STA_AVAILABILITY = 145,
        WLAN_EID_DMG_TSPEC = 146,
        WLAN_EID_DMG_AT = 147,
        WLAN_EID_DMG_CAP = 148,
        /* 149 reserved for Cisco */
        WLAN_EID_CISCO_VENDOR_SPECIFIC = 150,
        WLAN_EID_DMG_OPERATION = 151,
        WLAN_EID_DMG_BSS_PARAM_CHANGE = 152,
        WLAN_EID_DMG_BEAM_REFINEMENT = 153,
        WLAN_EID_CHANNEL_MEASURE_FEEDBACK = 154,
        /* 155-156 reserved for Cisco */
        WLAN_EID_AWAKE_WINDOW = 157,
        WLAN_EID_MULTI_BAND = 158,
        WLAN_EID_ADDBA_EXT = 159,
        WLAN_EID_NEXT_PCP_LIST = 160,
        WLAN_EID_PCP_HANDOVER = 161,
        WLAN_EID_DMG_LINK_MARGIN = 162,
        WLAN_EID_SWITCHING_STREAM = 163,
        WLAN_EID_SESSION_TRANSITION = 164,
        WLAN_EID_DYN_TONE_PAIRING_REPORT = 165,
        WLAN_EID_CLUSTER_REPORT = 166,
        WLAN_EID_RELAY_CAP = 167,
        WLAN_EID_RELAY_XFER_PARAM_SET = 168,
        WLAN_EID_BEAM_LINK_MAINT = 169,
        WLAN_EID_MULTIPLE_MAC_ADDR = 170,
        WLAN_EID_U_PID = 171,
        WLAN_EID_DMG_LINK_ADAPT_ACK = 172,
        /* 173 reserved for Symbol */
        WLAN_EID_MCCAOP_ADV_OVERVIEW = 174,
        WLAN_EID_QUIET_PERIOD_REQ = 175,
        /* 176 reserved for Symbol */
        WLAN_EID_QUIET_PERIOD_RESP = 177,
        /* 178-179 reserved for Symbol */
        /* 180 reserved for ISO/IEC 20011 */
        WLAN_EID_EPAC_POLICY = 182,
        WLAN_EID_CLUSTER_TIME_OFF = 183,
        /* misspelled legacy name, kept for existing users */
        WLAN_EID_CLISTER_TIME_OFF = WLAN_EID_CLUSTER_TIME_OFF,
        WLAN_EID_INTER_AC_PRIO = 184,
        WLAN_EID_SCS_DESCRIPTOR = 185,
        WLAN_EID_QLOAD_REPORT = 186,
        WLAN_EID_HCCA_TXOP_UPDATE_COUNT = 187,
        WLAN_EID_HL_STREAM_ID = 188,
        WLAN_EID_GCR_GROUP_ADDR = 189,
        WLAN_EID_ANTENNA_SECTOR_ID_PATTERN = 190,
        WLAN_EID_VHT_CAPABILITY = 191,
        WLAN_EID_VHT_OPERATION = 192,
        WLAN_EID_EXTENDED_BSS_LOAD = 193,
        WLAN_EID_WIDE_BW_CHANNEL_SWITCH = 194,
        WLAN_EID_VHT_TX_POWER_ENVELOPE = 195,
        WLAN_EID_CHANNEL_SWITCH_WRAPPER = 196,
        WLAN_EID_AID = 197,
        WLAN_EID_QUIET_CHANNEL = 198,
        WLAN_EID_OPMODE_NOTIF = 199,

        WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201,

        WLAN_EID_AID_REQUEST = 210,
        WLAN_EID_AID_RESPONSE = 211,
        WLAN_EID_S1G_BCN_COMPAT = 213,
        WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214,
        WLAN_EID_S1G_CAPABILITIES = 217,
        WLAN_EID_VENDOR_SPECIFIC = 221,
        WLAN_EID_QOS_PARAMETER = 222,
        WLAN_EID_S1G_OPERATION = 232,
        WLAN_EID_CAG_NUMBER = 237,
        WLAN_EID_AP_CSN = 239,
        WLAN_EID_FILS_INDICATION = 240,
        WLAN_EID_DILS = 241,
        WLAN_EID_FRAGMENT = 242,
        WLAN_EID_RSNX = 244,
        /* 255 introduces an extended element; see enum ieee80211_eid_ext */
        WLAN_EID_EXTENSION = 255
};

/* Element ID Extension values, used together with Element ID 255 */
enum ieee80211_eid_ext {
        WLAN_EID_EXT_ASSOC_DELAY_INFO             = 1,
        WLAN_EID_EXT_FILS_REQ_PARAMS              = 2,
        WLAN_EID_EXT_FILS_KEY_CONFIRM             = 3,
        WLAN_EID_EXT_FILS_SESSION                 = 4,
        WLAN_EID_EXT_FILS_HLP_CONTAINER           = 5,
        WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN          = 6,
        WLAN_EID_EXT_KEY_DELIVERY                 = 7,
        WLAN_EID_EXT_FILS_WRAPPED_DATA            = 8,
        WLAN_EID_EXT_FILS_PUBLIC_KEY              = 12,
        WLAN_EID_EXT_FILS_NONCE                   = 13,
        WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE         = 14,
        WLAN_EID_EXT_HE_CAPABILITY                = 35,
        WLAN_EID_EXT_HE_OPERATION                 = 36,
        WLAN_EID_EXT_UORA                         = 37,
        WLAN_EID_EXT_HE_MU_EDCA                   = 38,
        WLAN_EID_EXT_HE_SPR                       = 39,
        WLAN_EID_EXT_NDP_FEEDBACK_REPORT_PARAMSET = 41,
        WLAN_EID_EXT_BSS_COLOR_CHG_ANN            = 42,
        WLAN_EID_EXT_QUIET_TIME_PERIOD_SETUP      = 43,
        WLAN_EID_EXT_ESS_REPORT                   = 45,
        WLAN_EID_EXT_OPS                          = 46,
        WLAN_EID_EXT_HE_BSS_LOAD                  = 47,
        WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME      = 52,
        WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55,
        WLAN_EID_EXT_NON_INHERITANCE              = 56,
        WLAN_EID_EXT_KNOWN_BSSID                  = 57,
        WLAN_EID_EXT_SHORT_SSID_LIST              = 58,
        WLAN_EID_EXT_HE_6GHZ_CAPA                 = 59,
        WLAN_EID_EXT_UL_MU_POWER_CAPA             = 60,
};

/* Action frame category codes */
enum ieee80211_category {
        WLAN_CATEGORY_SPECTRUM_MGMT             = 0,
        WLAN_CATEGORY_QOS                       = 1,
        WLAN_CATEGORY_DLS                       = 2,
        WLAN_CATEGORY_BACK                      = 3,
        WLAN_CATEGORY_PUBLIC                    = 4,
        WLAN_CATEGORY_RADIO_MEASUREMENT         = 5,
        WLAN_CATEGORY_HT                        = 7,
        WLAN_CATEGORY_SA_QUERY                  = 8,
        WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION  = 9,
        WLAN_CATEGORY_WNM                       = 10,
        WLAN_CATEGORY_WNM_UNPROTECTED           = 11,
        WLAN_CATEGORY_TDLS                      = 12,
        WLAN_CATEGORY_MESH_ACTION               = 13,
        WLAN_CATEGORY_MULTIHOP_ACTION           = 14,
        WLAN_CATEGORY_SELF_PROTECTED            = 15,
        WLAN_CATEGORY_DMG                       = 16,
        WLAN_CATEGORY_WMM                       = 17,
        WLAN_CATEGORY_FST                       = 18,
        WLAN_CATEGORY_UNPROT_DMG                = 20,
        WLAN_CATEGORY_VHT                       = 21,
        WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126,
        WLAN_CATEGORY_VENDOR_SPECIFIC           = 127,
};

/* Action codes within the SPECTRUM_MGMT category */
enum ieee80211_spectrum_mgmt_actioncode {
        WLAN_ACTION_SPCT_MSR_REQ    = 0,
        WLAN_ACTION_SPCT_MSR_RPRT   = 1,
        WLAN_ACTION_SPCT_TPC_REQ    = 2,
        WLAN_ACTION_SPCT_TPC_RPRT   = 3,
        WLAN_ACTION_SPCT_CHL_SWITCH = 4,
};

/* Action codes within the HT category */
enum ieee80211_ht_actioncode {
        WLAN_HT_ACTION_NOTIFY_CHANWIDTH  = 0,
        WLAN_HT_ACTION_SMPS              = 1,
        WLAN_HT_ACTION_PSMP              = 2,
        WLAN_HT_ACTION_PCO_PHASE         = 3,
        WLAN_HT_ACTION_CSI               = 4,
        WLAN_HT_ACTION_NONCOMPRESSED_BF  = 5,
        WLAN_HT_ACTION_COMPRESSED_BF     = 6,
        WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7,
};

/* Action codes within the VHT category */
enum ieee80211_vht_actioncode {
        WLAN_VHT_ACTION_COMPRESSED_BF = 0,
        WLAN_VHT_ACTION_GROUPID_MGMT  = 1,
        WLAN_VHT_ACTION_OPMODE_NOTIF  = 2,
};

/* Action codes within the Self Protected category */
enum ieee80211_self_protected_actioncode {
        WLAN_SP_RESERVED             = 0,
        WLAN_SP_MESH_PEERING_OPEN    = 1,
        WLAN_SP_MESH_PEERING_CONFIRM = 2,
        WLAN_SP_MESH_PEERING_CLOSE   = 3,
        WLAN_SP_MGK_INFORM           = 4,
        WLAN_SP_MGK_ACK              = 5,
};

/*
 * Action codes within the Mesh category.  Values are consecutive from 0;
 * they are spelled out so each code can be read off directly.
 */
enum ieee80211_mesh_actioncode {
        WLAN_MESH_ACTION_LINK_METRIC_REPORT = 0,
        WLAN_MESH_ACTION_HWMP_PATH_SELECTION = 1,
        WLAN_MESH_ACTION_GATE_ANNOUNCEMENT = 2,
        WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION = 3,
        WLAN_MESH_ACTION_MCCA_SETUP_REQUEST = 4,
        WLAN_MESH_ACTION_MCCA_SETUP_REPLY = 5,
        WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST = 6,
        WLAN_MESH_ACTION_MCCA_ADVERTISEMENT = 7,
        WLAN_MESH_ACTION_MCCA_TEARDOWN = 8,
        WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST = 9,
        WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE = 10,
};

/* Security key lengths (NOTE(review): presumably in octets - confirm) */
enum ieee80211_key_len {
        WLAN_KEY_LEN_WEP40        = 5,
        WLAN_KEY_LEN_WEP104       = 13,
        WLAN_KEY_LEN_CCMP         = 16,
        WLAN_KEY_LEN_CCMP_256     = 32,
        WLAN_KEY_LEN_TKIP         = 32,
        WLAN_KEY_LEN_AES_CMAC     = 16,
        WLAN_KEY_LEN_SMS4         = 32,
        WLAN_KEY_LEN_GCMP         = 16,
        WLAN_KEY_LEN_GCMP_256     = 32,
        WLAN_KEY_LEN_BIP_CMAC_256 = 32,
        WLAN_KEY_LEN_BIP_GMAC_128 = 16,
        WLAN_KEY_LEN_BIP_GMAC_256 = 32,
};

/*
 * Per-cipher header, IV, ICV, MIC and packet-number sizes.
 * NOTE(review): presumably all in octets - confirm.
 */
#define IEEE80211_WEP_IV_LEN       4
#define IEEE80211_WEP_ICV_LEN      4
#define IEEE80211_CCMP_HDR_LEN     8
#define IEEE80211_CCMP_MIC_LEN     8
#define IEEE80211_CCMP_PN_LEN      6
#define IEEE80211_CCMP_256_HDR_LEN 8
#define IEEE80211_CCMP_256_MIC_LEN 16
#define IEEE80211_CCMP_256_PN_LEN  6
#define IEEE80211_TKIP_IV_LEN      8
#define IEEE80211_TKIP_ICV_LEN     4
#define IEEE80211_CMAC_PN_LEN      6
#define IEEE80211_GMAC_PN_LEN      6
#define IEEE80211_GCMP_HDR_LEN     8
#define IEEE80211_GCMP_MIC_LEN     16
#define IEEE80211_GCMP_PN_LEN      6

/* FILS nonce and key-encryption-key size limits */
#define FILS_NONCE_LEN            16
#define FILS_MAX_KEK_LEN          64

/* FILS ERP field size limits */
#define FILS_ERP_MAX_USERNAME_LEN 16
#define FILS_ERP_MAX_REALM_LEN    253
#define FILS_ERP_MAX_RRK_LEN      64

/* Upper bounds for a PMK and an SAE password */
#define PMK_MAX_LEN               64
#define SAE_PASSWORD_MAX_LEN      128

/* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */
enum ieee80211_pub_actioncode {
        WLAN_PUB_ACTION_20_40_BSS_COEX          = 0,
        WLAN_PUB_ACTION_DSE_ENABLEMENT          = 1,
        WLAN_PUB_ACTION_DSE_DEENABLEMENT        = 2,
        WLAN_PUB_ACTION_DSE_REG_LOC_ANN         = 3,
        WLAN_PUB_ACTION_EXT_CHANSW_ANN          = 4,
        WLAN_PUB_ACTION_DSE_MSMT_REQ            = 5,
        WLAN_PUB_ACTION_DSE_MSMT_RESP           = 6,
        WLAN_PUB_ACTION_MSMT_PILOT              = 7,
        WLAN_PUB_ACTION_DSE_PC                  = 8,
        WLAN_PUB_ACTION_VENDOR_SPECIFIC         = 9,
        WLAN_PUB_ACTION_GAS_INITIAL_REQ         = 10,
        WLAN_PUB_ACTION_GAS_INITIAL_RESP        = 11,
        WLAN_PUB_ACTION_GAS_COMEBACK_REQ        = 12,
        WLAN_PUB_ACTION_GAS_COMEBACK_RESP       = 13,
        WLAN_PUB_ACTION_TDLS_DISCOVER_RES       = 14,
        WLAN_PUB_ACTION_LOC_TRACK_NOTI          = 15,
        WLAN_PUB_ACTION_QAB_REQUEST_FRAME       = 16,
        WLAN_PUB_ACTION_QAB_RESPONSE_FRAME      = 17,
        WLAN_PUB_ACTION_QMF_POLICY              = 18,
        WLAN_PUB_ACTION_QMF_POLICY_CHANGE       = 19,
        WLAN_PUB_ACTION_QLOAD_REQUEST           = 20,
        WLAN_PUB_ACTION_QLOAD_REPORT            = 21,
        WLAN_PUB_ACTION_HCCA_TXOP_ADVERT        = 22,
        WLAN_PUB_ACTION_HCCA_TXOP_RESPONSE      = 23,
        WLAN_PUB_ACTION_PUBLIC_KEY              = 24,
        WLAN_PUB_ACTION_CHANNEL_AVAIL_QUERY     = 25,
        WLAN_PUB_ACTION_CHANNEL_SCHEDULE_MGMT   = 26,
        WLAN_PUB_ACTION_CONTACT_VERI_SIGNAL     = 27,
        WLAN_PUB_ACTION_GDD_ENABLEMENT_REQ      = 28,
        WLAN_PUB_ACTION_GDD_ENABLEMENT_RESP     = 29,
        WLAN_PUB_ACTION_NETWORK_CHANNEL_CONTROL = 30,
        WLAN_PUB_ACTION_WHITE_SPACE_MAP_ANN     = 31,
        WLAN_PUB_ACTION_FTM_REQUEST             = 32,
        WLAN_PUB_ACTION_FTM                     = 33,
        WLAN_PUB_ACTION_FILS_DISCOVERY          = 34,
};

/* Action codes within the TDLS category */
enum ieee80211_tdls_actioncode {
        WLAN_TDLS_SETUP_REQUEST           = 0,
        WLAN_TDLS_SETUP_RESPONSE          = 1,
        WLAN_TDLS_SETUP_CONFIRM           = 2,
        WLAN_TDLS_TEARDOWN                = 3,
        WLAN_TDLS_PEER_TRAFFIC_INDICATION = 4,
        WLAN_TDLS_CHANNEL_SWITCH_REQUEST  = 5,
        WLAN_TDLS_CHANNEL_SWITCH_RESPONSE = 6,
        WLAN_TDLS_PEER_PSM_REQUEST        = 7,
        WLAN_TDLS_PEER_PSM_RESPONSE       = 8,
        WLAN_TDLS_PEER_TRAFFIC_RESPONSE   = 9,
        WLAN_TDLS_DISCOVERY_REQUEST       = 10,
};

/*
 * Extended Channel Switching capability, set in the 1st byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING  BIT(2)

/*
 * Multiple BSSID capability, set in bit 6 of the 3rd byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT    BIT(6)

/* TDLS capabilities in the 4th byte of @WLAN_EID_EXT_CAPABILITY */
#define WLAN_EXT_CAPA4_TDLS_BUFFER_STA        BIT(4)
#define WLAN_EXT_CAPA4_TDLS_PEER_PSM          BIT(5)
#define WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH       BIT(6)

/*
 * Interworking capability, set in bit 7 of the 4th byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA4_INTERWORKING_ENABLED   BIT(7)

/*
 * TDLS capabilities to be enabled in the 5th byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA5_TDLS_ENABLED           BIT(5)
#define WLAN_EXT_CAPA5_TDLS_PROHIBITED        BIT(6)
#define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED  BIT(7)

/* CAPA8 bits (8th byte, following the CAPA<n> naming used above) */
#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED   BIT(5)
#define WLAN_EXT_CAPA8_OPMODE_NOTIF           BIT(6)

/*
 * Maximal number of MSDUs in an A-MSDU: a two-bit value whose low bit is
 * the CAPA8 define and whose high bit is the CAPA9 define.
 */
#define WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB  BIT(7)
#define WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB  BIT(0)

/*
 * Fine Timing Measurement Initiator - bit 71 of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA9_FTM_INITIATOR          BIT(7)

/* Support for TWT Requester and TWT Responder */
#define WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT BIT(5)
#define WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT BIT(6)

/*
 * When set, indicates that the AP is able to tolerate 26-tone RU UL
 * OFDMA transmissions using HE TB PPDU from OBSS (not falsely classify the
 * 26-tone RU UL OFDMA transmissions as radar pulses).
 */
#define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7)

/* Support for enhanced multi-BSSID advertisement */
#define WLAN_EXT_CAPA11_EMA_SUPPORT           BIT(3)

/* TDLS-specific payload type carried in the LLC/SNAP header */
#define WLAN_TDLS_SNAP_RFTYPE        0x2

/* BSS Coex IE information field bits (only bit 0 is named here) */
#define WLAN_BSS_COEX_INFORMATION_REQUEST        BIT(0)

/**
 * enum ieee80211_mesh_sync_method - mesh synchronization method identifier
 *
 * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method
 * @IEEE80211_SYNC_METHOD_VENDOR: a vendor-specific synchronization method,
 *        described by a vendor specific information element
 */
enum ieee80211_mesh_sync_method {
        IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1,
        IEEE80211_SYNC_METHOD_VENDOR          = 255,
};

/**
 * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier
 *
 * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol
 * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor-specific protocol, described
 *        by a vendor specific information element
 */
enum ieee80211_mesh_path_protocol {
        IEEE80211_PATH_PROTOCOL_HWMP   = 1,
        IEEE80211_PATH_PROTOCOL_VENDOR = 255,
};

/**
 * enum ieee80211_mesh_path_metric - mesh path selection metric identifier
 *
 * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric
 * @IEEE80211_PATH_METRIC_VENDOR: a vendor-specific metric, described by a
 *        vendor specific information element
 */
enum ieee80211_mesh_path_metric {
        IEEE80211_PATH_METRIC_AIRTIME = 1,
        IEEE80211_PATH_METRIC_VENDOR  = 255,
};

/**
 * enum ieee80211_root_mode_identifier - root mesh STA mode identifier
 *
 * These attributes are used by dot11MeshHWMPRootMode to set the root mesh
 * STA mode.
 *
 * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default)
 * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if the mode is
 *        greater than this value
 * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA that
 *        supports the proactive PREQ with proactive PREP subfield set to 0
 * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA that
 *        supports the proactive PREQ with proactive PREP subfield set to 1
 * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA that supports
 *        the proactive RANN
 */
enum ieee80211_root_mode_identifier {
        IEEE80211_ROOTMODE_NO_ROOT         = 0,
        IEEE80211_ROOTMODE_ROOT            = 1,
        IEEE80211_PROACTIVE_PREQ_NO_PREP   = 2,
        IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3,
        IEEE80211_PROACTIVE_RANN           = 4,
};

/*
 * IEEE 802.11-2007 7.3.2.9 Country information element
 *
 * Minimum length is 8 octets, i.e. len must be evenly
 * divisible by 2
 */

/*
 * Although the spec says 8, elements of length 6 are seen in practice,
 * so the minimum accepted here is 6 rather than 8.
 */
#define IEEE80211_COUNTRY_IE_MIN_LEN        6

/* The Country String field of the element shall be 3 octets in length */
#define IEEE80211_COUNTRY_STRING_LEN        3

/*
 * For regulatory extension stuff see IEEE 802.11-2007
 * Annex I (page 1141) and Annex J (page 1147). Also
 * review 7.3.2.9.
 *
 * When dot11RegulatoryClassesRequired is true and the
 * first_channel/reg_extension_id is >= 201 then the IE
 * comprises the 'ext' struct represented below:
 *
 *  - Regulatory extension ID - when generating IE this just needs
 *    to be monotonically increasing for each triplet passed in
 *    the IE
 *  - Regulatory class - index into set of rules
 *  - Coverage class - index into air propagation time (Table 7-27),
 *    in microseconds, you can compute the air propagation time from
 *    the index by multiplying by 3, so index 10 yields a propagation
 *    of 30 us. Valid values are 0-31, values 32-255 are not defined
 *    yet. A value of 0 indicates air propagation of <= 1 us.
 *
 *  See also Table I.2 for Emission limit sets and table
 *  I.3 for Behavior limit sets. Table J.1 indicates how to map
 *  a reg_class to an emission limit set and behavior limit set.
 */
#define IEEE80211_COUNTRY_EXTENSION_ID 201

/*
 *  Channel numbers in the IE must be monotonically increasing
 *  if dot11RegulatoryClassesRequired is not true.
 *
 *  If dot11RegulatoryClassesRequired is true consecutive
 *  subband triplets following a regulatory triplet shall
 *  have monotonically increasing first_channel number fields.
 *
 *  Channel numbers shall not overlap.
 *
 *  Note that max_power is signed.
 *
 *  The anonymous union offers two views of the same three octets:
 *  'chans' (first_channel, num_channels, max_power) and
 *  'ext' (reg_extension_id, reg_class, coverage_class).
 */
struct ieee80211_country_ie_triplet {
        union {
                /* subband triplet view */
                struct {
                        u8 first_channel;
                        u8 num_channels;
                        s8 max_power;
                } __packed chans;
                /* regulatory extension triplet view */
                struct {
                        u8 reg_extension_id;
                        u8 reg_class;
                        u8 coverage_class;
                } __packed ext;
        };
} __packed;

/* Values for the type field of the Timeout Interval element */
enum ieee80211_timeout_interval_type {
        WLAN_TIMEOUT_REASSOC_DEADLINE = 1,      /* 802.11r */
        WLAN_TIMEOUT_KEY_LIFETIME     = 2,      /* 802.11r */
        WLAN_TIMEOUT_ASSOC_COMEBACK   = 3,      /* 802.11w */
};

/**
 * struct ieee80211_timeout_interval_ie - Timeout Interval element
 * @type: type, see &enum ieee80211_timeout_interval_type
 * @value: timeout interval value, stored little-endian
 *
 * The structure is packed, so @value follows the one-octet @type
 * without padding.
 */
struct ieee80211_timeout_interval_ie {
        u8 type;
        __le32 value;
} __packed;

/**
 * enum ieee80211_idle_options - BSS idle options
 * @WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE: the station should send an RSN
 *        protected frame to the AP to reset the idle timer at the AP for
 *        the station.
 *
 * The single option defined here is a flag in bit 0.
 */
enum ieee80211_idle_options {
        WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE = BIT(0),
};

/**
 * struct ieee80211_bss_max_idle_period_ie - BSS Max idle period element
 *
 * This structure refers to "BSS Max idle period element"
 *
 * @max_idle_period: indicates the time period during which a station can
 *        refrain from transmitting frames to its associated AP without being
 *        disassociated. In units of 1000 TUs. Stored little-endian.
 * @idle_options: indicates the options associated with the BSS idle capability
 *        as specified in &enum ieee80211_idle_options.
 */
struct ieee80211_bss_max_idle_period_ie {
        __le16 max_idle_period;
        u8 idle_options;
} __packed;

/* Action codes within the BACK (block-ack) category */
enum ieee80211_back_actioncode {
        WLAN_ACTION_ADDBA_REQ  = 0,
        WLAN_ACTION_ADDBA_RESP = 1,
        WLAN_ACTION_DELBA      = 2,
};

/* BACK (block-ack) parties: the two roles in a block-ack agreement */
enum ieee80211_back_parties {
        WLAN_BACK_RECIPIENT = 0,
        WLAN_BACK_INITIATOR = 1,
};

/* Action codes within the SA Query category */
enum ieee80211_sa_query_action {
        WLAN_ACTION_SA_QUERY_REQUEST  = 0,
        WLAN_ACTION_SA_QUERY_RESPONSE = 1,
};

/**
 * struct ieee80211_bssid_index - Multiple BSSID-index element
 *
 * This structure refers to "Multiple BSSID-index element"
 *
 * @bssid_index: BSSID index
 * @dtim_period: optional, overrides transmitted BSS dtim period
 * @dtim_count: optional, overrides transmitted BSS dtim count
 *
 * All three members are single octets.  Since the last two are documented
 * as optional, a received element may be shorter than this structure.
 */
struct ieee80211_bssid_index {
        u8 bssid_index;
        u8 dtim_period;
        u8 dtim_count;
};

/**
 * struct ieee80211_multiple_bssid_configuration - Multiple BSSID
 *        Configuration element
 *
 * This structure refers to "Multiple BSSID Configuration element"
 *
 * @bssid_count: total number of active BSSIDs in the set
 * @profile_periodicity: the least number of beacon frames that need to be
 *        received in order to discover all the nontransmitted BSSIDs in
 *        the set.
 */
struct ieee80211_multiple_bssid_configuration {
        u8 bssid_count;
        u8 profile_periodicity;
};

/*
 * Build a suite selector value: @oui is shifted left by 8 bits and @id is
 * ORed into the low bits.
 */
#define SUITE(oui, id)        (((oui) << 8) | (id))

/* cipher suite selectors */
#define WLAN_CIPHER_SUITE_USE_GROUP        SUITE(0x000FAC, 0)
#define WLAN_CIPHER_SUITE_WEP40                SUITE(0x000FAC, 1)
#define WLAN_CIPHER_SUITE_TKIP                SUITE(0x000FAC, 2)
/* reserved:                                 SUITE(0x000FAC, 3) */
#define WLAN_CIPHER_SUITE_CCMP                SUITE(0x000FAC, 4)
#define WLAN_CIPHER_SUITE_WEP104        SUITE(0x000FAC, 5)
#define WLAN_CIPHER_SUITE_AES_CMAC        SUITE(0x000FAC, 6)
/* id 7 is not defined in this list */
#define WLAN_CIPHER_SUITE_GCMP                SUITE(0x000FAC, 8)
#define WLAN_CIPHER_SUITE_GCMP_256        SUITE(0x000FAC, 9)
#define WLAN_CIPHER_SUITE_CCMP_256        SUITE(0x000FAC, 10)
#define WLAN_CIPHER_SUITE_BIP_GMAC_128        SUITE(0x000FAC, 11)
#define WLAN_CIPHER_SUITE_BIP_GMAC_256        SUITE(0x000FAC, 12)
#define WLAN_CIPHER_SUITE_BIP_CMAC_256        SUITE(0x000FAC, 13)

/* the only cipher selector here that uses a different OUI (0x001472) */
#define WLAN_CIPHER_SUITE_SMS4                SUITE(0x001472, 1)

/*
 * AKM suite selectors
 *
 * All selectors below use OUI 0x000FAC with ids 1 through 20; the numeric
 * values overlap with the cipher suite selectors, so the two sets are only
 * meaningful in their own context.
 */
#define WLAN_AKM_SUITE_8021X                        SUITE(0x000FAC, 1)
#define WLAN_AKM_SUITE_PSK                        SUITE(0x000FAC, 2)
#define WLAN_AKM_SUITE_FT_8021X                        SUITE(0x000FAC, 3)
#define WLAN_AKM_SUITE_FT_PSK                        SUITE(0x000FAC, 4)
#define WLAN_AKM_SUITE_8021X_SHA256                SUITE(0x000FAC, 5)
#define WLAN_AKM_SUITE_PSK_SHA256                SUITE(0x000FAC, 6)
#define WLAN_AKM_SUITE_TDLS                        SUITE(0x000FAC, 7)
#define WLAN_AKM_SUITE_SAE                        SUITE(0x000FAC, 8)
#define WLAN_AKM_SUITE_FT_OVER_SAE                SUITE(0x000FAC, 9)
#define WLAN_AKM_SUITE_AP_PEER_KEY                SUITE(0x000FAC, 10)
#define WLAN_AKM_SUITE_8021X_SUITE_B                SUITE(0x000FAC, 11)
#define WLAN_AKM_SUITE_8021X_SUITE_B_192        SUITE(0x000FAC, 12)
#define WLAN_AKM_SUITE_FT_8021X_SHA384                SUITE(0x000FAC, 13)
#define WLAN_AKM_SUITE_FILS_SHA256                SUITE(0x000FAC, 14)
#define WLAN_AKM_SUITE_FILS_SHA384                SUITE(0x000FAC, 15)
#define WLAN_AKM_SUITE_FT_FILS_SHA256                SUITE(0x000FAC, 16)
#define WLAN_AKM_SUITE_FT_FILS_SHA384                SUITE(0x000FAC, 17)
#define WLAN_AKM_SUITE_OWE                        SUITE(0x000FAC, 18)
#define WLAN_AKM_SUITE_FT_PSK_SHA384                SUITE(0x000FAC, 19)
#define WLAN_AKM_SUITE_PSK_SHA384                SUITE(0x000FAC, 20)

/* NOTE(review): key and PMK sizes below are presumably in octets - confirm */
#define WLAN_MAX_KEY_LEN                32

#define WLAN_PMK_NAME_LEN                16
#define WLAN_PMKID_LEN                        16
#define WLAN_PMK_LEN_EAP_LEAP                16
#define WLAN_PMK_LEN                        32
#define WLAN_PMK_LEN_SUITE_B_192        48

/* 24-bit OUI values and the vendor-specific OUI types defined under them */
#define WLAN_OUI_WFA                        0x506f9a
#define WLAN_OUI_TYPE_WFA_P2P                9
#define WLAN_OUI_MICROSOFT                0x0050f2
#define WLAN_OUI_TYPE_MICROSOFT_WPA        1
#define WLAN_OUI_TYPE_MICROSOFT_WMM        2
#define WLAN_OUI_TYPE_MICROSOFT_WPS        4
#define WLAN_OUI_TYPE_MICROSOFT_TPC        8

/*
 * WMM/802.11e Tspec Element
 */
#define IEEE80211_WMM_IE_TSPEC_TID_MASK                0x0F
#define IEEE80211_WMM_IE_TSPEC_TID_SHIFT        1

enum ieee80211_tspec_status_code {
        IEEE80211_TSPEC_STATUS_ADMISS_ACCEPTED = 0,
        IEEE80211_TSPEC_STATUS_ADDTS_INVAL_PARAMS = 0x1,
};

/*
 * TSPEC element layout, including the element header (id, length) and the
 * vendor OUI prefix. The structure is packed and every multi-octet field is
 * stored little-endian.
 */
struct ieee80211_tspec_ie {
        /* element header */
        u8 element_id;
        u8 len;
        /* vendor-specific prefix */
        u8 oui[3];
        u8 oui_type;
        u8 oui_subtype;
        u8 version;
        /* traffic stream info: 16-bit field followed by one reserved octet */
        __le16 tsinfo;
        u8 tsinfo_resvd;
        /* traffic specification parameters */
        __le16 nominal_msdu;
        __le16 max_msdu;
        __le32 min_service_int;
        __le32 max_service_int;
        __le32 inactivity_int;
        __le32 suspension_int;
        __le32 service_start_time;
        __le32 min_data_rate;
        __le32 mean_data_rate;
        __le32 peak_data_rate;
        __le32 max_burst_size;
        __le32 delay_bound;
        __le32 min_phy_rate;
        __le16 sba;
        __le16 medium_time;
} __packed;

/* HE 6 GHz band capabilities: a single packed little-endian 16-bit bitfield */
struct ieee80211_he_6ghz_capa {
        /* uses IEEE80211_HE_6GHZ_CAP_* below */
        __le16 capa;
} __packed;

/* HE 6 GHz band capabilities */
/* bits 0-2; uses enum ieee80211_min_mpdu_spacing values */
#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START        0x0007
/* bits 3-5; uses enum ieee80211_vht_max_ampdu_length_exp values */
#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP        0x0038
/* bits 6-7; uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */
#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN        0x00c0
/* bits 9-10 (no mask is defined here for bit 8); WLAN_HT_CAP_SM_PS_* values */
#define IEEE80211_HE_6GHZ_CAP_SM_PS                0x0600
/* single-bit flags: bits 11, 12 and 13 */
#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER        0x0800
#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS        0x1000
#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS        0x2000

/**
 * ieee80211_get_qos_ctl - locate the qos control bytes of a frame
 * @hdr: the frame
 *
 * The qos control bytes sit right behind frame_control, duration, seq_num
 * and the 3 or 4 addresses (ETH_ALEN octets each):
 * 3 addr: 2 + 2 + 2 + 3*6 = 24
 * 4 addr: 2 + 2 + 2 + 4*6 = 30
 *
 * Return: pointer to the byte at offset 30 when the frame carries a fourth
 * address, otherwise to the byte at offset 24.
 */
static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr)
{
        unsigned int qos_offset;

        qos_offset = ieee80211_has_a4(hdr->frame_control) ? 30 : 24;

        return (u8 *)hdr + qos_offset;
}

/**
 * ieee80211_get_tid - get qos TID
 * @hdr: the frame
 */
static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr)
{
        u8 *qc = ieee80211_get_qos_ctl(hdr);

        return qc[0] & IEEE80211_QOS_CTL_TID_MASK;
}

/**
 * ieee80211_get_SA - get pointer to SA
 * @hdr: the frame
 *
 * Picks the address field of an 802.11 header that holds the source
 * address (SA): addr4 for four-address frames, addr3 for frames with
 * FromDS set, addr2 otherwise. No check is made that the header is long
 * enough to contain the selected address; it only has to be long enough
 * to contain the frame control field.
 *
 * Return: pointer to the selected address field inside @hdr.
 */
static inline u8 *ieee80211_get_SA(struct ieee80211_hdr *hdr)
{
        u8 *sa;

        if (ieee80211_has_a4(hdr->frame_control))
                sa = hdr->addr4;
        else if (ieee80211_has_fromds(hdr->frame_control))
                sa = hdr->addr3;
        else
                sa = hdr->addr2;

        return sa;
}

/**
 * ieee80211_get_DA - get pointer to DA
 * @hdr: the frame
 *
 * Picks the address field of an 802.11 header that holds the destination
 * address (DA): addr3 for frames with ToDS set, addr1 otherwise. No check
 * is made that the header is long enough to contain the selected address;
 * it only has to be long enough to contain the frame control field.
 *
 * Return: pointer to the selected address field inside @hdr.
 */
static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr)
{
        return ieee80211_has_tods(hdr->frame_control) ? hdr->addr3
                                                      : hdr->addr1;
}

/**
 * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame
 * @hdr: the frame (buffer must include at least the first octet of payload)
 *
 * Return: %true for disassociation and deauthentication frames, for
 * protected action frames, and for unprotected action frames whose category
 * octet is not one of the categories excluded below; %false otherwise.
 */
static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr)
{
        const u8 *category;

        if (ieee80211_is_disassoc(hdr->frame_control) ||
            ieee80211_is_deauth(hdr->frame_control))
                return true;

        /* Of the remaining frame types only action frames can be robust */
        if (!ieee80211_is_action(hdr->frame_control))
                return false;

        /*
         * A Protected frame may have an encrypted body, so its category
         * cannot be inspected; the other end has already classified it as
         * a Robust Management Frame.
         */
        if (ieee80211_has_protected(hdr->frame_control))
                return true;

        /* The category is the octet at offset 24 from the start of the frame */
        category = (const u8 *)hdr + 24;

        /* Public Action and the other categories listed here are not robust */
        return !(*category == WLAN_CATEGORY_PUBLIC ||
                 *category == WLAN_CATEGORY_HT ||
                 *category == WLAN_CATEGORY_WNM_UNPROTECTED ||
                 *category == WLAN_CATEGORY_SELF_PROTECTED ||
                 *category == WLAN_CATEGORY_UNPROT_DMG ||
                 *category == WLAN_CATEGORY_VHT ||
                 *category == WLAN_CATEGORY_VENDOR_SPECIFIC);
}

/**
 * ieee80211_is_robust_mgmt_frame - check if skb contains a robust mgmt frame
 * @skb: the skb containing the frame, length will be checked
 *
 * Return: %false when the skb is shorter than IEEE80211_MIN_ACTION_SIZE,
 * otherwise the result of _ieee80211_is_robust_mgmt_frame() on the skb data.
 */
static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb)
{
        bool long_enough = skb->len >= IEEE80211_MIN_ACTION_SIZE;

        return long_enough &&
               _ieee80211_is_robust_mgmt_frame((void *)skb->data);
}

/**
 * ieee80211_is_public_action - check if frame is a public action frame
 * @hdr: the frame
 * @len: length of the frame
 *
 * Return: %true only if @len is at least IEEE80211_MIN_ACTION_SIZE, the
 * frame is an action frame and its category is WLAN_CATEGORY_PUBLIC.
 */
static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr,
                                              size_t len)
{
        const struct ieee80211_mgmt *action_frame;

        /* Length is validated before any header field is read */
        if (len < IEEE80211_MIN_ACTION_SIZE ||
            !ieee80211_is_action(hdr->frame_control))
                return false;

        action_frame = (const void *)hdr;

        return action_frame->u.action.category == WLAN_CATEGORY_PUBLIC;
}

/**
 * _ieee80211_is_group_privacy_action - check if frame is a group addressed
 * privacy action frame
 * @hdr: the frame
 */
static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr)
{
        struct ieee80211_mgmt *mgmt = (void *)hdr;

        if (!ieee80211_is_action(hdr->frame_control) ||
            !is_multicast_ether_addr(hdr->addr1))
                return false;

        return mgmt->u.action.category == WLAN_CATEGORY_MESH_ACTION ||
               mgmt->u.action.category == WLAN_CATEGORY_MULTIHOP_ACTION;
}

/**
 * ieee80211_is_group_privacy_action - check if frame is a group addressed
 * privacy action frame
 * @skb: the skb containing the frame, length will be checked
 *
 * Return: %false when the skb is shorter than IEEE80211_MIN_ACTION_SIZE,
 * otherwise the result of _ieee80211_is_group_privacy_action() on the skb
 * data.
 */
static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb)
{
        bool long_enough = skb->len >= IEEE80211_MIN_ACTION_SIZE;

        return long_enough &&
               _ieee80211_is_group_privacy_action((void *)skb->data);
}

/**
 * ieee80211_tu_to_usec - convert time units (TU) to microseconds
 * @tu: the duration in TUs
 *
 * Return: @tu multiplied by 1024.
 */
static inline unsigned long ieee80211_tu_to_usec(unsigned long tu)
{
        const unsigned long usec_per_tu = 1024;

        return tu * usec_per_tu;
}

/**
 * ieee80211_check_tim - check if AID bit is set in TIM
 * @tim: the TIM IE
 * @tim_len: length of the TIM IE
 * @aid: the AID to look for
 *
 * Return: %true if the bit for @aid is set in the bitmap octets carried by
 * @tim; %false if it is clear, if the octet holding it is not part of the
 * element, or if @tim is NULL or @tim_len is smaller than sizeof(*tim).
 */
static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
                                       u8 tim_len, u16 aid)
{
        int index, first, last;
        u8 mask;

        if (unlikely(!tim || tim_len < sizeof(*tim)))
                return false;

        aid &= 0x3fff;

        /*
         * The octet indices are kept in int on purpose: aid / 8 can be as
         * large as 2047, and tim_len + first - 4 can exceed 255 or become
         * negative. With u8 arithmetic those values wrap silently, which can
         * select the wrong bitmap octet or defeat the range check below.
         */
        index = aid / 8;
        mask  = 1 << (aid & 7);

        /* first and last bitmap octet index covered by this element */
        first = tim->bitmap_ctrl & 0xfe;
        last = tim_len + first - 4;

        if (index < first || index > last)
                return false;

        return !!(tim->virtual_map[index - first] & mask);
}

/**
 * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet)
 * @skb: the skb containing the frame, length will not be checked
 * @hdr_size: the size of the ieee80211_hdr that starts at skb->data
 *            (not used by the function body)
 *
 * This function assumes the frame is a data frame, and that the network header
 * is in the correct place.
 *
 * Return: the action octet if the skb is linear, extends more than two
 * octets past the network header offset and carries the TDLS ethertype,
 * SNAP RF type and TDLS category; -1 otherwise.
 */
static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
{
        const u8 *tdls;

        if (skb_is_nonlinear(skb))
                return -1;

        if (skb->len <= (skb_network_offset(skb) + 2))
                return -1;

        /* The TDLS indication starts two octets before the network header */
        tdls = skb_network_header(skb) - 2;

        if (get_unaligned_be16(tdls) != ETH_P_TDLS)
                return -1;
        if (tdls[2] != WLAN_TDLS_SNAP_RFTYPE)
                return -1;
        if (tdls[3] != WLAN_CATEGORY_TDLS)
                return -1;

        return tdls[4];
}

/*
 * convert time units
 *
 * TU_TO_JIFFIES() scales @x by 1024 and passes the result to
 * usecs_to_jiffies(); TU_TO_EXP_TIME() adds that to the current jiffies.
 */
#define TU_TO_JIFFIES(x)        (usecs_to_jiffies((x) * 1024))
#define TU_TO_EXP_TIME(x)        (jiffies + TU_TO_JIFFIES(x))

/*
 * convert frequencies
 *
 * MHZ_TO_KHZ()/KHZ_TO_MHZ() scale by 1000 (the division truncates).
 * PR_KHZ() expands to two comma-separated printf arguments, the whole MHz
 * part and the kHz remainder, matching the KHZ_F format string. Its argument
 * is evaluated twice, so it must not have side effects.
 */
#define MHZ_TO_KHZ(freq) ((freq) * 1000)
#define KHZ_TO_MHZ(freq) ((freq) / 1000)
/* (f) is parenthesized so that expressions such as a + b expand correctly */
#define PR_KHZ(f) KHZ_TO_MHZ(f), (f) % 1000
#define KHZ_F "%d.%03d"

/*
 * convert powers
 *
 * The *_TO_MB* macros multiply by 100 and the MB*_TO_* macros divide by 100
 * (integer division, so the fractional part is dropped).
 */
#define DBI_TO_MBI(gain) ((gain) * 100)
#define MBI_TO_DBI(gain) ((gain) / 100)
#define DBM_TO_MBM(gain) ((gain) * 100)
#define MBM_TO_DBM(gain) ((gain) / 100)

/**
 * ieee80211_action_contains_tpc - checks if the frame contains TPC element
 * @skb: the skb containing the frame, length will be checked
 *
 * This function checks if it's either TPC report action frame or Link
 * Measurement report action frame as defined in IEEE Std. 802.11-2012 8.5.2.5
 * and 8.5.7.5 accordingly.
 *
 * Return: %true if all checks described in the body pass, %false otherwise.
 */
static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb)
{
        struct ieee80211_mgmt *frame = (void *)skb->data;

        if (!ieee80211_is_action(frame->frame_control))
                return false;

        /* The frame must be long enough to hold a complete TPC report */
        if (skb->len < IEEE80211_MIN_ACTION_SIZE +
                       sizeof(frame->u.action.u.tpc_report))
                return false;

        /*
         * TPC report - check that:
         * category = 0 (Spectrum Management) or 5 (Radio Measurement)
         * spectrum management action = 3 (TPC/Link Measurement report)
         * TPC report EID = 35
         * TPC report element length = 2
         *
         * The tpc_report layout of spectrum management is used for both the
         * TPC report and the radio measurement link measurement report,
         * since the relevant part is identical in both frames.
         */
        if (!(frame->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT ||
              frame->u.action.category == WLAN_CATEGORY_RADIO_MEASUREMENT))
                return false;

        /* both spectrum mgmt and link measurement have same action code */
        if (frame->u.action.u.tpc_report.action_code !=
            WLAN_ACTION_SPCT_TPC_RPRT)
                return false;

        return frame->u.action.u.tpc_report.tpc_elem_id ==
                        WLAN_EID_TPC_REPORT &&
               frame->u.action.u.tpc_report.tpc_elem_length ==
                        sizeof(struct ieee80211_tpc_report_ie);
}

/*
 * Generic element header: an id octet, a length octet and the payload, which
 * the for_each_element() helpers treat as @datalen octets long.
 */
struct element {
        u8 id;
        u8 datalen;
        u8 data[];
} __packed;

/* element iteration helpers */

/*
 * for_each_element - walk the elements stored in a buffer
 *
 * Starts at _data and continues while the bytes remaining up to
 * _data + _datalen can hold both an element header and that element's
 * payload (header size plus datalen). Each step advances to the byte
 * following the current payload. _elem must be an lvalue of type
 * const struct element *; the macro arguments are evaluated repeatedly.
 */
#define for_each_element(_elem, _data, _datalen)                        \
        for (_elem = (const struct element *)(_data);                        \
             (const u8 *)(_data) + (_datalen) - (const u8 *)_elem >=        \
                (int)sizeof(*_elem) &&                                        \
             (const u8 *)(_data) + (_datalen) - (const u8 *)_elem >=        \
                (int)sizeof(*_elem) + _elem->datalen;                        \
             _elem = (const struct element *)(_elem->data + _elem->datalen))

/*
 * for_each_element_id - like for_each_element(), but the loop body only runs
 * for elements whose id equals _id. The expansion ends in an if statement, so
 * an else following the loop body would bind to it.
 */
#define for_each_element_id(element, _id, data, datalen)                \
        for_each_element(element, data, datalen)                        \
                if (element->id == (_id))

/*
 * for_each_element_extid - like for_each_element(), but the loop body only
 * runs for elements with id WLAN_EID_EXTENSION that have a non-empty payload
 * whose first octet equals extid.
 */
#define for_each_element_extid(element, extid, _data, _datalen)                \
        for_each_element(element, _data, _datalen)                        \
                if (element->id == WLAN_EID_EXTENSION &&                \
                    element->datalen > 0 &&                                \
                    element->data[0] == (extid))

/* The subelement variants walk the payload of an enclosing element */
#define for_each_subelement(sub, element)                                \
        for_each_element(sub, (element)->data, (element)->datalen)

#define for_each_subelement_id(sub, id, element)                        \
        for_each_element_id(sub, id, (element)->data, (element)->datalen)

#define for_each_subelement_extid(sub, extid, element)                        \
        for_each_element_extid(sub, extid, (element)->data, (element)->datalen)

/**
 * for_each_element_completed - determine if element parsing consumed all data
 * @element: element pointer after for_each_element() or friends
 * @data: same data pointer as passed to for_each_element() or friends
 * @datalen: same data length as passed to for_each_element() or friends
 *
 * Only meaningful when the for_each_element() loop ran to its natural end;
 * if the loop was left early the pointer will not be at the end of the
 * buffer and the result is %false.
 *
 * Malformed data also yields %false, because the last element that could be
 * parsed does not reach the end of the buffer.
 *
 * Return: %true if @element points exactly at @data + @datalen.
 */
static inline bool for_each_element_completed(const struct element *element,
                                              const void *data, size_t datalen)
{
        const u8 *buf_end = (const u8 *)data + datalen;

        return (const u8 *)element == buf_end;
}

/*
 * RSNX Capabilities:
 * bits 0-3: Field length (n-1)
 * bits 4 and 5: the capability flags defined below
 */
#define WLAN_RSNX_CAPA_PROTECTED_TWT BIT(4)
#define WLAN_RSNX_CAPA_SAE_H2E BIT(5)

/*
 * reduced neighbor report, based on Draft P802.11ax_D5.0,
 * section 9.4.2.170
 */
/* bit masks; presumably for ieee80211_neighbor_ap_info.tbtt_info_hdr - confirm */
#define IEEE80211_AP_INFO_TBTT_HDR_TYPE                                0x03
#define IEEE80211_AP_INFO_TBTT_HDR_FILTERED                        0x04
#define IEEE80211_AP_INFO_TBTT_HDR_COLOC                        0x08
#define IEEE80211_AP_INFO_TBTT_HDR_COUNT                        0xF0
/* NOTE(review): presumably octet offsets inside a TBTT info entry - confirm */
#define IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM                8
#define IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM        12

/* single-bit flags, bits 0 through 6 */
#define IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED                0x01
#define IEEE80211_RNR_TBTT_PARAMS_SAME_SSID                        0x02
#define IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID                        0x04
#define IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID                0x08
#define IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS                        0x10
#define IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE                        0x20
#define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP                        0x40

/*
 * Neighbor AP information header: four single-octet fields, packed.
 * NOTE(review): presumably each instance is followed by the TBTT info
 * entries it describes - confirm against the parser.
 */
struct ieee80211_neighbor_ap_info {
       u8 tbtt_info_hdr;
       u8 tbtt_info_len;
       u8 op_class;
       u8 channel;
} __packed;

#endif /* LINUX_IEEE80211_H */






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_RTNETLINK_H
#define __NET_RTNETLINK_H

#include <linux/rtnetlink.h>
#include <net/netlink.h>

/*
 * Handler types accepted by rtnl_register() and rtnl_register_module():
 * a "doit" handler receives the skb, the netlink message header and an
 * extended ack; a "dumpit" handler receives the skb and a netlink callback.
 */
typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *);
typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);

/*
 * Bit flags.
 * NOTE(review): presumably the values for the "flags" argument of
 * rtnl_register() and rtnl_register_module() - confirm.
 */
enum rtnl_link_flags {
        RTNL_FLAG_DOIT_UNLOCKED                = BIT(0),
        RTNL_FLAG_BULK_DEL_SUPPORTED        = BIT(1),
};

/* Kind of operation encoded in the two low bits of an rtnetlink message type */
enum rtnl_kinds {
        RTNL_KIND_NEW,
        RTNL_KIND_DEL,
        RTNL_KIND_GET,
        RTNL_KIND_SET
};
#define RTNL_KIND_MASK 0x3

/*
 * rtnl_msgtype_kind - extract the operation kind from a message type
 *
 * Returns @msgtype masked with RTNL_KIND_MASK, as an enum rtnl_kinds value.
 */
static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype)
{
        int kind_bits = msgtype & RTNL_KIND_MASK;

        return (enum rtnl_kinds)kind_bits;
}

/*
 * Registration and removal of rtnetlink message handlers, keyed by protocol
 * and message type. Only declared here; the definitions live elsewhere.
 * rtnl_register_module() additionally takes the owning module.
 */
void rtnl_register(int protocol, int msgtype,
                   rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
int rtnl_register_module(struct module *owner, int protocol, int msgtype,
                         rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
int rtnl_unregister(int protocol, int msgtype);
void rtnl_unregister_all(int protocol);

static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
{
        if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg))
                return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family;
        else
                return AF_UNSPEC;
}

/**
 *        struct rtnl_link_ops - rtnetlink link operations
 *
 *        @list: Used internally
 *        @kind: Identifier
 *        @netns_refund: Physical device, move to init_net on netns exit
 *        @maxtype: Highest device specific netlink attribute number
 *        @policy: Netlink policy for device specific attribute validation
 *        @validate: Optional validation function for netlink/changelink parameters
 *        @priv_size: sizeof net_device private space
 *        @setup: net_device setup function
 *        @newlink: Function for configuring and registering a new device
 *        @changelink: Function for changing parameters of an existing device
 *        @dellink: Function to remove a device
 *        @get_size: Function to calculate required room for dumping device
 *                   specific netlink attributes
 *        @fill_info: Function to dump device specific netlink attributes
 *        @get_xstats_size: Function to calculate required room for dumping device
 *                          specific statistics
 *        @fill_xstats: Function to dump device specific statistics
 *        @get_num_tx_queues: Function to determine number of transmit queues
 *                            to create when creating a new device.
 *        @get_num_rx_queues: Function to determine number of receive queues
 *                            to create when creating a new device.
 *        @slave_maxtype: NOTE(review): presumably the counterpart of @maxtype
 *                        for slave device attributes - confirm
 *        @slave_policy: NOTE(review): presumably the counterpart of @policy
 *                       for slave device attributes - confirm
 *        @slave_changelink: NOTE(review): presumably the counterpart of
 *                           @changelink for a slave of the device - confirm
 *        @get_slave_size: NOTE(review): presumably the counterpart of
 *                         @get_size for slave attributes - confirm
 *        @fill_slave_info: NOTE(review): presumably the counterpart of
 *                          @fill_info for slave attributes - confirm
 *        @get_link_net: Function to get the i/o netns of the device
 *        @get_linkxstats_size: Function to calculate the required room for
 *                              dumping device-specific extended link stats
 *        @fill_linkxstats: Function to dump device-specific extended link stats
 */
struct rtnl_link_ops {
        struct list_head        list;

        const char                *kind;

        size_t                        priv_size;
        void                        (*setup)(struct net_device *dev);

        bool                        netns_refund;
        unsigned int                maxtype;
        const struct nla_policy        *policy;
        int                        (*validate)(struct nlattr *tb[],
                                            struct nlattr *data[],
                                            struct netlink_ext_ack *extack);

        int                        (*newlink)(struct net *src_net,
                                           struct net_device *dev,
                                           struct nlattr *tb[],
                                           struct nlattr *data[],
                                           struct netlink_ext_ack *extack);
        int                        (*changelink)(struct net_device *dev,
                                              struct nlattr *tb[],
                                              struct nlattr *data[],
                                              struct netlink_ext_ack *extack);
        void                        (*dellink)(struct net_device *dev,
                                           struct list_head *head);

        size_t                        (*get_size)(const struct net_device *dev);
        int                        (*fill_info)(struct sk_buff *skb,
                                             const struct net_device *dev);

        size_t                        (*get_xstats_size)(const struct net_device *dev);
        int                        (*fill_xstats)(struct sk_buff *skb,
                                               const struct net_device *dev);
        unsigned int                (*get_num_tx_queues)(void);
        unsigned int                (*get_num_rx_queues)(void);

        unsigned int                slave_maxtype;
        const struct nla_policy        *slave_policy;
        int                        (*slave_changelink)(struct net_device *dev,
                                                    struct net_device *slave_dev,
                                                    struct nlattr *tb[],
                                                    struct nlattr *data[],
                                                    struct netlink_ext_ack *extack);
        size_t                        (*get_slave_size)(const struct net_device *dev,
                                                  const struct net_device *slave_dev);
        int                        (*fill_slave_info)(struct sk_buff *skb,
                                                   const struct net_device *dev,
                                                   const struct net_device *slave_dev);
        struct net                *(*get_link_net)(const struct net_device *dev);
        size_t                        (*get_linkxstats_size)(const struct net_device *dev,
                                                       int attr);
        int                        (*fill_linkxstats)(struct sk_buff *skb,
                                                   const struct net_device *dev,
                                                   int *prividx, int attr);
};

/*
 * Registration and removal of a struct rtnl_link_ops instance.
 * NOTE(review): the double-underscore variants presumably expect the caller
 * to already hold the required lock - confirm against the definitions.
 */
int __rtnl_link_register(struct rtnl_link_ops *ops);
void __rtnl_link_unregister(struct rtnl_link_ops *ops);

int rtnl_link_register(struct rtnl_link_ops *ops);
void rtnl_link_unregister(struct rtnl_link_ops *ops);

/**
 *        struct rtnl_af_ops - rtnetlink address family operations
 *
 *        @list: Used internally
 *        @family: Address family
 *        @fill_link_af: Function to fill IFLA_AF_SPEC with address family
 *                       specific netlink attributes.
 *        @get_link_af_size: Function to calculate size of address family specific
 *                           netlink attributes.
 *        @validate_link_af: Validate a IFLA_AF_SPEC attribute, must check attr
 *                           for invalid configuration settings.
 *        @set_link_af: Function to parse a IFLA_AF_SPEC attribute and modify
 *                      net_device accordingly.
 *        @fill_stats_af: NOTE(review): presumably dumps address family
 *                        specific statistics into the skb - confirm
 *        @get_stats_af_size: NOTE(review): presumably calculates the room
 *                            needed by @fill_stats_af - confirm
 */
struct rtnl_af_ops {
        struct list_head        list;
        int                        family;

        int                        (*fill_link_af)(struct sk_buff *skb,
                                                const struct net_device *dev,
                                                u32 ext_filter_mask);
        size_t                        (*get_link_af_size)(const struct net_device *dev,
                                                    u32 ext_filter_mask);

        int                        (*validate_link_af)(const struct net_device *dev,
                                                    const struct nlattr *attr);
        int                        (*set_link_af)(struct net_device *dev,
                                               const struct nlattr *attr);

        int                        (*fill_stats_af)(struct sk_buff *skb,
                                                 const struct net_device *dev);
        size_t                        (*get_stats_af_size)(const struct net_device *dev);
};

void rtnl_af_register(struct rtnl_af_ops *ops);
void rtnl_af_unregister(struct rtnl_af_ops *ops);

struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
struct net_device *rtnl_create_link(struct net *net, const char *ifname,
                                    unsigned char name_assign_type,
                                    const struct rtnl_link_ops *ops,
                                    struct nlattr *tb[],
                                    struct netlink_ext_ack *extack);
int rtnl_delete_link(struct net_device *dev);
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);

int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
                             struct netlink_ext_ack *exterr);
struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);

#define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)

#endif




























    1 



































    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

/* Bits of the flags word recorded by the mm_lru_insertion trace event */
#define        PAGEMAP_MAPPED                0x0001u
#define PAGEMAP_ANONYMOUS        0x0002u
#define PAGEMAP_FILE                0x0004u
#define PAGEMAP_SWAPCACHE        0x0008u
#define PAGEMAP_SWAPBACKED        0x0010u
#define PAGEMAP_MAPPEDDISK        0x0020u
#define PAGEMAP_BUFFERS                0x0040u

/*
 * Build the PAGEMAP_* flags word for a page. Exactly one of
 * PAGEMAP_ANONYMOUS and PAGEMAP_FILE is always set, depending on PageAnon();
 * each remaining bit is set when the corresponding page test is true.
 * The page argument is evaluated several times.
 */
#define trace_pagemap_flags(page) ( \
        (PageAnon(page)                ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
        (page_mapped(page)        ? PAGEMAP_MAPPED     : 0) | \
        (PageSwapCache(page)        ? PAGEMAP_SWAPCACHE  : 0) | \
        (PageSwapBacked(page)        ? PAGEMAP_SWAPBACKED : 0) | \
        (PageMappedToDisk(page)        ? PAGEMAP_MAPPEDDISK : 0) | \
        (page_has_private(page) ? PAGEMAP_BUFFERS    : 0) \
        )

/*
 * mm_lru_insertion: records the page pointer, its pfn, the lru value passed
 * by the caller and the PAGEMAP_* flags computed by trace_pagemap_flags().
 */
TRACE_EVENT(mm_lru_insertion,

        TP_PROTO(
                struct page *page,
                int lru
        ),

        TP_ARGS(page, lru),

        TP_STRUCT__entry(
                __field(struct page *,        page        )
                __field(unsigned long,        pfn        )
                __field(int,                lru        )
                __field(unsigned long,        flags        )
        ),

        TP_fast_assign(
                __entry->page        = page;
                __entry->pfn        = page_to_pfn(page);
                __entry->lru        = lru;
                __entry->flags        = trace_pagemap_flags(page);
        ),

        /*
         * Flag format is based on page-types.c formatting for pagemap:
         * one character per flag, with a blank for a clear flag, except
         * that "a"/"f" distinguishes anonymous from file pages.
         */
        TP_printk("page=%p pfn=%lu lru=%d flags=%s%s%s%s%s%s",
                        __entry->page,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED                ? "M" : " ",
                        __entry->flags & PAGEMAP_ANONYMOUS        ? "a" : "f",
                        __entry->flags & PAGEMAP_SWAPCACHE        ? "s" : " ",
                        __entry->flags & PAGEMAP_SWAPBACKED        ? "b" : " ",
                        __entry->flags & PAGEMAP_MAPPEDDISK        ? "d" : " ",
                        __entry->flags & PAGEMAP_BUFFERS        ? "B" : " ")
);

/* mm_lru_activate: records only the page pointer and its pfn */
TRACE_EVENT(mm_lru_activate,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(struct page *,        page        )
                __field(unsigned long,        pfn        )
        ),

        TP_fast_assign(
                __entry->page        = page;
                __entry->pfn        = page_to_pfn(page);
        ),

        /* No flags are recorded for this event, only page and pfn */
        TP_printk("page=%p pfn=%lu", __entry->page, __entry->pfn)

);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

































    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * This function does a binary search on the given array.  The
 * contents of the array should already be in ascending sorted order
 * under the provided comparison function.
 *
 * Note that the key need not have the same type as the elements in
 * the array, e.g. key could be a string and the comparison function
 * could compare the string with the struct's name field.  However, if
 * the key and elements in the array are of the same type, you can use
 * the same comparison function for both sort() and bsearch().
 *
 * The search itself is delegated to __inline_bsearch(); this function is
 * the out-of-line, exported entry point and forwards all arguments
 * unchanged.
 *
 * Return: whatever __inline_bsearch() returns.
 * NOTE(review): presumably a pointer to the matching element, or NULL when
 * nothing matches - confirm against <linux/bsearch.h>.
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);

























    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * This is an implementation of the notion of "decrement a
 * reference count, and return locked if it decremented to zero".
 *
 * NOTE NOTE NOTE! This is _not_ equivalent to
 *
 *        if (atomic_dec_and_test(&atomic)) {
 *                spin_lock(&lock);
 *                return 1;
 *        }
 *        return 0;
 *
 * because the spin-lock and the decrement must be
 * "atomic".
 */
int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
        /*
         * Fast path: subtract 1 without the lock unless the counter is 1,
         * i.e. unless this decrement is the one that would reach zero.
         */
        if (!atomic_add_unless(atomic, -1, 1)) {
                /*
                 * Slow path: the counter was 1 when sampled, so do the
                 * decrement under the lock and keep the lock held only if
                 * the counter really dropped to zero.
                 */
                spin_lock(lock);
                if (atomic_dec_and_test(atomic))
                        return 1;
                spin_unlock(lock);
        }

        return 0;
}

EXPORT_SYMBOL(_atomic_dec_and_lock);

/*
 * Same contract as _atomic_dec_and_lock(), but the slow path takes the lock
 * with spin_lock_irqsave(), storing the saved state in *@flags.  Returns 1
 * with the lock held when the counter dropped to zero, 0 (lock not held)
 * otherwise.
 */
int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                 unsigned long *flags)
{
        /* Lockless decrement as long as the counter is not 1 */
        if (!atomic_add_unless(atomic, -1, 1)) {
                /* Counter was 1 when sampled: decrement under the lock */
                spin_lock_irqsave(lock, *flags);
                if (atomic_dec_and_test(atomic))
                        return 1;
                spin_unlock_irqrestore(lock, *flags);
        }

        return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_USER_H
#define _LINUX_SCHED_USER_H

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

/*
 * Some day this will be a full-fledged user tracking system..
 */
/*
 * Per-user bookkeeping record, identified by @uid.  Which counters exist
 * depends on the kernel configuration (see the #ifdef blocks below).
 */
struct user_struct {
        refcount_t __count;        /* reference count; taken by get_uid() */
        atomic_t processes;        /* How many processes does this user have? */
        atomic_t sigpending;        /* How many pending signals does this user have? */
#ifdef CONFIG_EPOLL
        atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
#endif
#ifdef CONFIG_POSIX_MQUEUE
        /* protected by mq_lock        */
        unsigned long mq_bytes;        /* How many bytes can be allocated to mqueue? */
#endif
        unsigned long locked_shm; /* How many pages of mlocked shm ? */
        unsigned long unix_inflight;        /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */

        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        kuid_t uid;                /* the user id this record describes */

#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
        /* NOTE(review): presumably locked memory charged to this user - confirm units with the users of this field */
        atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
        atomic_t nr_watches;        /* The number of watches this user currently has */
#endif

        /* Miscellaneous per-user rate limit */
        struct ratelimit_state ratelimit;
};

extern int uids_sysfs_init(void);

extern struct user_struct *find_user(kuid_t);

extern struct user_struct root_user;
#define INIT_USER (&root_user)


/* per-UID process charging. */
extern struct user_struct * alloc_uid(kuid_t);
/*
 * get_uid - take an additional reference on a user_struct
 * @u: the user_struct to reference; dereferenced unconditionally, so it
 *     must not be NULL
 *
 * Increments u->__count and returns @u so the call can be used directly in
 * an assignment or expression.
 */
static inline struct user_struct *get_uid(struct user_struct *u)
{
        refcount_inc(&u->__count);
        return u;
}
extern void free_uid(struct user_struct *);

#endif /* _LINUX_SCHED_USER_H */



















































































































































































    1 







    1 























    1 



















    1 
    1 






















































































































































































































    2 






































    2 






    2 






    2 



    1 











    1 
    1 
    1 


    2 


    2 


























































































    1 














    1 


    1 



















































































































































































































































    2 
    1 




    2 























    2 




    2 
























    2 




    2 

    2 





















    1 






    1 




    1 

























    1 





















    1 

    1 









    1 

    1 


    1 













































































    1 




    1 






    1 

    1 




































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"

/*
 * Test patch to inline a certain number of bi_io_vec's inside the bio
 * itself, to shrink a bio data allocation from two mempool calls to one
 */
#define BIO_INLINE_VECS                4

/*
 * Table of biovec slab descriptors, ordered by increasing number of vecs.
 *
 * If you change this list, also change bvec_alloc() (which maps a requested
 * vec count onto an index into this table) or things will break badly!
 * nr_vecs cannot be bigger than what you can fit into an unsigned short.
 *
 * BV(x, n) builds one entry that holds x vecs and is named "biovec-<n>".
 */
#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
        BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
};
#undef BV

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.  The symbol is exported
 * so that code outside this file can allocate from it.
 */
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

/*
 * Our slab pool management
 */
struct bio_slab {
        struct kmem_cache *slab;
        unsigned int slab_ref;
        unsigned int slab_size;
        char name[12];
};
static DEFINE_MUTEX(bio_slab_lock);
static struct bio_slab *bio_slabs;
static unsigned int bio_slab_nr, bio_slab_max;

/*
 * bio_find_or_create_slab - get a kmem_cache for bios with extra room
 * @extra_size: bytes required in addition to sizeof(struct bio)
 *
 * Scans bio_slabs[] for a live cache whose object size already matches and
 * takes a reference on it.  If none exists, a new cache is created in the
 * first released slot, or in a fresh slot at the end of the table (the
 * table is doubled when it is full).
 *
 * Return: the kmem_cache to allocate from, or NULL if the table could not
 * be grown or the cache could not be created.
 */
static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
        unsigned int sz = sizeof(struct bio) + extra_size;
        struct kmem_cache *slab = NULL;
        struct bio_slab *bslab, *new_bio_slabs;
        unsigned int new_bio_slab_max;
        unsigned int i, entry = -1;        /* -1: no reusable slot seen yet */

        mutex_lock(&bio_slab_lock);

        for (i = 0; i < bio_slab_nr; i++) {
                bslab = &bio_slabs[i];

                if (!bslab->slab) {
                        /*
                         * Released slot: remember the first one for reuse,
                         * but never match on it.  Its slab_size is stale, and
                         * matching it would bump the refcount of a dead entry
                         * and stop the scan before a live cache is found.
                         */
                        if (entry == -1)
                                entry = i;
                        continue;
                }

                if (bslab->slab_size == sz) {
                        slab = bslab->slab;
                        bslab->slab_ref++;
                        break;
                }
        }

        if (slab)
                goto out_unlock;

        /* No free slot and the table is full: double its capacity */
        if (bio_slab_nr == bio_slab_max && entry == -1) {
                new_bio_slab_max = bio_slab_max << 1;
                new_bio_slabs = krealloc(bio_slabs,
                                         new_bio_slab_max * sizeof(struct bio_slab),
                                         GFP_KERNEL);
                if (!new_bio_slabs)
                        goto out_unlock;
                bio_slab_max = new_bio_slab_max;
                bio_slabs = new_bio_slabs;
        }
        if (entry == -1)
                entry = bio_slab_nr++;

        bslab = &bio_slabs[entry];

        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
        slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
                                 SLAB_HWCACHE_ALIGN, NULL);
        if (!slab)
                goto out_unlock;

        bslab->slab = slab;
        bslab->slab_ref = 1;
        bslab->slab_size = sz;
out_unlock:
        mutex_unlock(&bio_slab_lock);
        return slab;
}

/*
 * bio_put_slab - drop @bs's reference on its shared bio kmem_cache
 * @bs: bio_set whose ->bio_slab is being released
 *
 * Finds the bio_slabs[] entry backing @bs->bio_slab and decrements its
 * reference count; the cache is destroyed and the slot marked free when the
 * last reference goes away.  Warns if no entry matches.
 */
static void bio_put_slab(struct bio_set *bs)
{
        struct bio_slab *found = NULL;
        unsigned int idx = 0;

        mutex_lock(&bio_slab_lock);

        /* Stop at the first slot that holds this bio_set's cache */
        while (idx < bio_slab_nr && !found) {
                if (bio_slabs[idx].slab == bs->bio_slab)
                        found = &bio_slabs[idx];
                idx++;
        }

        if (!WARN(!found, KERN_ERR "bio: unable to find slab!\n")) {
                WARN_ON(!found->slab_ref);

                found->slab_ref--;
                if (!found->slab_ref) {
                        /* Last user gone: tear the cache down, free the slot */
                        kmem_cache_destroy(found->slab);
                        found->slab = NULL;
                }
        }

        mutex_unlock(&bio_slab_lock);
}

/*
 * bvec_nr_vecs - number of vecs provided by a biovec pool
 * @idx: pool index as produced by bvec_alloc(), i.e. the bvec_slabs[] index
 *       plus one; it is decremented before the table lookup
 *
 * No range check is done here: @idx must be between 1 and BVEC_POOL_NR.
 */
unsigned int bvec_nr_vecs(unsigned short idx)
{
        return bvec_slabs[--idx].nr_vecs;
}

/*
 * bvec_free - give a biovec array back to where it was allocated from
 * @pool: mempool used for the BVEC_POOL_MAX class
 * @bv:   the biovec array to free
 * @idx:  pool index as stored by bvec_alloc() (slab table index plus one);
 *        nothing is freed when it is 0
 */
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
{
        unsigned int slot;

        if (idx == 0)
                return;

        /* Convert the stored index back to a bvec_slabs[] index */
        slot = idx - 1;

        BIO_BUG_ON(slot >= BVEC_POOL_NR);

        if (slot != BVEC_POOL_MAX) {
                kmem_cache_free(bvec_slabs[slot].slab, bv);
                return;
        }

        mempool_free(bv, pool);
}

/*
 * bvec_alloc - allocate a biovec array able to hold @nr entries
 * @gfp_mask: allocation flags
 * @nr:       number of bio_vecs needed (1 .. BIO_MAX_PAGES)
 * @idx:      out: index of the pool used, plus one (left untouched when @nr
 *            is out of range)
 * @pool:     mempool backing the BVEC_POOL_MAX size class
 *
 * Return: the biovec array, or NULL if @nr is out of range or the
 * allocation failed.
 */
struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
                           mempool_t *pool)
{
        struct bio_vec *bvl;

        if (nr < 1 || nr > BIO_MAX_PAGES)
                return NULL;

        /*
         * Pick the smallest size class that fits; the boundaries must stay
         * in sync with the bvec_slabs[] table.
         */
        if (nr == 1)
                *idx = 0;
        else if (nr <= 4)
                *idx = 1;
        else if (nr <= 16)
                *idx = 2;
        else if (nr <= 64)
                *idx = 3;
        else if (nr <= 128)
                *idx = 4;
        else
                *idx = 5;

        /*
         * idx now selects the pool to allocate from.  Only the BVEC_POOL_MAX
         * class is mempool backed; every other class comes from its slab.
         */
        if (*idx != BVEC_POOL_MAX) {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t slab_gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);

                /*
                 * Keep the slab attempt cheap and quiet: it may not dip into
                 * reserves, retry or warn, because the mempool below serves
                 * as the fallback.
                 */
                slab_gfp |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

                bvl = kmem_cache_alloc(bvs->slab, slab_gfp);

                /*
                 * Slab allocation failed and the caller allowed direct
                 * reclaim: fall back to the mempool and record that the
                 * array now belongs to the BVEC_POOL_MAX class.
                 */
                if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
                        *idx = BVEC_POOL_MAX;
                        bvl = mempool_alloc(pool, gfp_mask);
                }
        } else {
                bvl = mempool_alloc(pool, gfp_mask);
        }

        /* Reported index is offset by one so that 0 can mean "no pool" */
        (*idx)++;
        return bvl;
}

/*
 * bio_uninit - release the auxiliary state attached to a bio
 * @bio: bio to clean up
 *
 * Drops the blkcg reference held in ->bi_blkg (when CONFIG_BLK_CGROUP is
 * enabled), frees the integrity payload if one is attached and frees the
 * crypto context.  The bio itself and its bvec table are not freed here.
 */
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
        if (bio->bi_blkg) {
                blkg_put(bio->bi_blkg);
                /* clear the pointer so a repeated call does not put twice */
                bio->bi_blkg = NULL;
        }
#endif
        if (bio_integrity(bio))
                bio_integrity_free(bio);

        bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);

/*
 * bio_free - release a bio and the memory backing it
 * @bio: bio to free
 *
 * A bio without a bio_set came from bio_kmalloc() and is simply kfree()d.
 * Otherwise its bvec table and the bio itself go back to the pools of the
 * bio_set recorded in ->bi_pool.
 */
static void bio_free(struct bio *bio)
{
        struct bio_set *bs = bio->bi_pool;

        bio_uninit(bio);

        if (!bs) {
                /* Bio was allocated by bio_kmalloc() */
                kfree(bio);
                return;
        }

        bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));

        /*
         * The mempool element starts front_pad bytes before the bio, so
         * step back by that amount before handing it to the pool.
         */
        mempool_free((void *)bio - bs->front_pad, &bs->bio_pool);
}

/*
 * bio_init - initialize a caller-allocated bio
 * @bio:      bio to initialize; its previous contents are wiped
 * @table:    bio_vec array the bio should use (may be NULL)
 * @max_vecs: number of entries available in @table
 *
 * Users of this function have their own bio allocation. Subsequently,
 * they must remember to pair any call to bio_init() with bio_uninit()
 * when IO has completed, or when the bio is released.
 */
void bio_init(struct bio *bio, struct bio_vec *table,
              unsigned short max_vecs)
{
        memset(bio, 0, sizeof(*bio));
        /* both counters start at one */
        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);

        bio->bi_io_vec = table;
        bio->bi_max_vecs = max_vecs;
}
EXPORT_SYMBOL(bio_init);

/**
 * bio_reset - reinitialize a bio
 * @bio:        bio to reset
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned by bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio)
{
        /* Keep only the flag bits at or above BIO_RESET_BITS */
        unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

        bio_uninit(bio);

        /* Wipe the first BIO_RESET_BYTES bytes of the bio, then restore the kept flags */
        memset(bio, 0, BIO_RESET_BYTES);
        bio->bi_flags = flags;
        atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);

/*
 * __bio_chain_endio - completion step for a chained bio
 * @bio: the chained bio that has completed; ->bi_private holds its parent
 *
 * Copies @bio's error status to the parent unless the parent already
 * carries one, drops the reference on @bio and returns the parent so the
 * caller can complete it.
 */
static struct bio *__bio_chain_endio(struct bio *bio)
{
        struct bio *parent = bio->bi_private;

        /* the first error recorded on the parent is kept */
        if (bio->bi_status && !parent->bi_status)
                parent->bi_status = bio->bi_status;
        bio_put(bio);
        return parent;
}

/*
 * ->bi_end_io handler installed by bio_chain(): finish the chained bio and
 * then signal completion on its parent.
 */
static void bio_chain_endio(struct bio *bio)
{
        bio_endio(__bio_chain_endio(bio));
}

/**
 * bio_chain - chain bio completions
 * @bio: the target bio
 * @parent: the parent bio of @bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
 * completed; the chained bio will also be freed when it completes.
 *
 * The caller must not set bi_private or bi_end_io in @bio.
 */
void bio_chain(struct bio *bio, struct bio *parent)
{
        /* both fields are claimed by the chaining mechanism below */
        BUG_ON(bio->bi_private || bio->bi_end_io);

        bio->bi_private = parent;
        bio->bi_end_io        = bio_chain_endio;
        /* @parent now has one more completion to wait for */
        bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);

/*
 * Work function of a bio_set's rescuer: submit every bio that
 * punt_bios_to_rescuer() parked on ->rescue_list, one at a time.  The list
 * lock is only held while popping, never across the submission.
 */
static void bio_alloc_rescue(struct work_struct *work)
{
        struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
        struct bio *next;

        for (;;) {
                spin_lock(&bs->rescue_lock);
                next = bio_list_pop(&bs->rescue_list);
                spin_unlock(&bs->rescue_lock);

                /* list drained, nothing left to resubmit */
                if (!next)
                        return;

                submit_bio_noacct(next);
        }
}

static void punt_bios_to_rescuer(struct bio_set *bs)
{
        struct bio_list punt, nopunt;
        struct bio *bio;

        if (WARN_ON_ONCE(!bs->rescue_workqueue))
                return;
        /*
         * In order to guarantee forward progress we must punt only bios that
         * were allocated from this bio_set; otherwise, if there was a bio on
         * there for a stacking driver higher up in the stack, processing it
         * could require allocating bios from this bio_set, and doing that from
         * our own rescuer would be bad.
         *
         * Since bio lists are singly linked, pop them all instead of trying to
         * remove from the middle of the list:
         */

        bio_list_init(&punt);
        bio_list_init(&nopunt);

        while ((bio = bio_list_pop(&current->bio_list[0])))
                bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
        current->bio_list[0] = nopunt;

        bio_list_init(&nopunt);
        while ((bio = bio_list_pop(&current->bio_list[1])))
                bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
        current->bio_list[1] = nopunt;

        spin_lock(&bs->rescue_lock);
        bio_list_merge(&bs->rescue_list, &punt);
        spin_unlock(&bs->rescue_lock);

        queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @nr_iovecs:        number of iovecs to pre-allocate
 * @bs:                the bio_set to allocate from.
 *
 * Description:
 *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
 *   backed by the @bs's mempool.
 *
 *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
 *   always be able to allocate a bio. This is due to the mempool guarantees.
 *   To make this work, callers must never allocate more than 1 bio at a time
 *   from this pool. Callers that need to allocate more than 1 bio must always
 *   submit the previously allocated bio for IO before attempting to allocate
 *   a new one. Failure to do so can cause deadlocks under memory pressure.
 *
 *   Note that when running under submit_bio_noacct() (i.e. any block
 *   driver), bios are not submitted until after you return - see the code in
 *   submit_bio_noacct() that converts recursion into iteration, to prevent
 *   stack overflows.
 *
 *   This would normally mean allocating multiple bios under
 *   submit_bio_noacct() would be susceptible to deadlocks, but we have
 *   deadlock avoidance code that resubmits any blocked bios from a rescuer
 *   thread.
 *
 *   However, we do not guarantee forward progress for allocations from other
 *   mempools. Doing multiple allocations from the same mempool under
 *   submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
 *   for per bio allocations.
 *
 *   RETURNS:
 *   Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
                             struct bio_set *bs)
{
        gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
        unsigned inline_vecs;
        struct bio_vec *bvl = NULL;
        struct bio *bio;
        void *p;

        if (!bs) {
                if (nr_iovecs > UIO_MAXIOV)
                        return NULL;

                /*
                 * kmalloc path: the bio and all of its vecs live in a single
                 * allocation, so every requested vec counts as "inline".
                 */
                p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
                front_pad = 0;
                inline_vecs = nr_iovecs;
        } else {
                /* should not use nobvec bioset for nr_iovecs > 0 */
                if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
                                 nr_iovecs > 0))
                        return NULL;
                /*
                 * submit_bio_noacct() converts recursion to iteration; this
                 * means if we're running beneath it, any bios we allocate and
                 * submit will not be submitted (and thus freed) until after we
                 * return.
                 *
                 * This exposes us to a potential deadlock if we allocate
                 * multiple bios from the same bio_set() while running
                 * underneath submit_bio_noacct(). If we were to allocate
                 * multiple bios (say a stacking block driver that was splitting
                 * bios), we would deadlock if we exhausted the mempool's
                 * reserve.
                 *
                 * We solve this, and guarantee forward progress, with a rescuer
                 * workqueue per bio_set. If we go to allocate and there are
                 * bios on current->bio_list, we first try the allocation
                 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
                 * bios we would be blocking to the rescuer workqueue before
                 * we retry with the original gfp_flags.
                 */

                if (current->bio_list &&
                    (!bio_list_empty(&current->bio_list[0]) ||
                     !bio_list_empty(&current->bio_list[1])) &&
                    bs->rescue_workqueue)
                        gfp_mask &= ~__GFP_DIRECT_RECLAIM;

                p = mempool_alloc(&bs->bio_pool, gfp_mask);
                /* gfp_mask differs from saved_gfp only if reclaim was masked off above */
                if (!p && gfp_mask != saved_gfp) {
                        punt_bios_to_rescuer(bs);
                        gfp_mask = saved_gfp;
                        p = mempool_alloc(&bs->bio_pool, gfp_mask);
                }

                front_pad = bs->front_pad;
                inline_vecs = BIO_INLINE_VECS;
        }

        if (unlikely(!p))
                return NULL;

        /* the bio proper starts after the bio_set's front padding */
        bio = p + front_pad;
        bio_init(bio, NULL, 0);

        if (nr_iovecs > inline_vecs) {
                unsigned long idx = 0;

                /* too many vecs for the inline array: get a separate table */
                bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
                if (!bvl && gfp_mask != saved_gfp) {
                        punt_bios_to_rescuer(bs);
                        gfp_mask = saved_gfp;
                        bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
                }

                if (unlikely(!bvl))
                        goto err_free;

                /* remember which bvec pool the table came from, for bvec_free() */
                bio->bi_flags |= idx << BVEC_POOL_OFFSET;
        } else if (nr_iovecs) {
                bvl = bio->bi_inline_vecs;
        }

        bio->bi_pool = bs;
        bio->bi_max_vecs = nr_iovecs;
        bio->bi_io_vec = bvl;
        return bio;

err_free:
        /*
         * Only reachable with a non-NULL @bs: on the kmalloc path inline_vecs
         * equals nr_iovecs, so the bvec_alloc() branch above is never taken.
         */
        mempool_free(p, &bs->bio_pool);
        return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);

/*
 * zero_fill_bio_iter - zero the data of a bio starting at a given position
 * @bio:   bio whose segments are to be cleared
 * @start: iterator position to start from
 *
 * Walks the segments of @bio from @start and, for each one, maps it with
 * bvec_kmap_irq(), clears bv_len bytes, flushes the dcache for the page and
 * unmaps it again.
 */
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
        unsigned long flags;
        struct bio_vec bv;
        struct bvec_iter iter;

        __bio_for_each_segment(bv, bio, iter, start) {
                char *data = bvec_kmap_irq(&bv, &flags);
                memset(data, 0, bv.bv_len);
                flush_dcache_page(bv.bv_page);
                bvec_kunmap_irq(data, &flags);
        }
}
EXPORT_SYMBOL(zero_fill_bio_iter);

/**
 * bio_truncate - shrink a bio to @new_size bytes
 * @bio:        the bio to be truncated
 * @new_size:        size in bytes the bio should be cut down to
 *
 * Description:
 *   Does nothing if @bio is not larger than @new_size. Otherwise the bio is
 *   shortened to @new_size and, if bio_op(bio) is REQ_OP_READ, the part that
 *   is cut off is zeroed first. This function should only be used for
 *   handling corner cases, such as bio eod.
 */
void bio_truncate(struct bio *bio, unsigned new_size)
{
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int seen = 0;
        bool zeroing = false;

        if (new_size >= bio->bi_iter.bi_size)
                return;

        if (bio_op(bio) == REQ_OP_READ) {
                bio_for_each_segment(bv, bio, iter) {
                        if (seen + bv.bv_len > new_size) {
                                /*
                                 * The segment that straddles new_size is
                                 * zeroed from the cut point on; every later
                                 * segment is zeroed completely.
                                 */
                                unsigned skip = zeroing ? 0 : new_size - seen;

                                zero_user(bv.bv_page, bv.bv_offset + skip,
                                          bv.bv_len - skip);
                                zeroing = true;
                        }
                        seen += bv.bv_len;
                }
        }

        /*
         * Don't touch bvec table here and make it really immutable, since
         * fs bio user has to retrieve all pages via bio_for_each_segment_all
         * in its .end_bio() callback.
         *
         * It is enough to truncate bio by updating .bi_size since we can make
         * correct bvec with the updated .bi_size for drivers.
         */
        bio->bi_iter.bi_size = new_size;
}

/**
 * guard_bio_eod - truncate a BIO to fit the block device
 * @bio:        bio to truncate
 *
 * Makes I/O to the odd last sectors of a device possible even when the
 * block size is some multiple of the physical sector size.
 *
 * A bio that starts inside the device (or partition) but runs past its end
 * is truncated to the size of the device, and the end of the buffer is
 * cleared manually.  Truly out-of-range accesses are left alone and will
 * turn into actual I/O errors; only the "we need to be able to do I/O at
 * the final sector" case is handled here.
 */
void guard_bio_eod(struct bio *bio)
{
        struct hd_struct *part;
        sector_t limit, room;

        rcu_read_lock();
        part = __disk_get_part(bio->bi_disk, bio->bi_partno);
        limit = part ? part_nr_sects_read(part) : get_capacity(bio->bi_disk);
        rcu_read_unlock();

        /*
         * Nothing to do for a zero limit. If the *whole* IO is past the end
         * of the device, let it through, and the IO layer will turn it into
         * an EIO.
         */
        if (!limit || unlikely(bio->bi_iter.bi_sector >= limit))
                return;

        /* sectors available between the start of the bio and the limit */
        room = limit - bio->bi_iter.bi_sector;
        if (unlikely((bio->bi_iter.bi_size >> 9) > room))
                bio_truncate(bio, room << 9);
}

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Drop a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone_*. A bio that is not flagged BIO_REFFED
 *   is freed right away; otherwise the bio is freed by the put that brings
 *   its reference count to zero.
 **/
void bio_put(struct bio *bio)
{
        if (bio_flagged(bio, BIO_REFFED)) {
                BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));

                /* references remain: only the last put frees the bio */
                if (!atomic_dec_and_test(&bio->__bi_cnt))
                        return;
        }

        bio_free(bio);
}
EXPORT_SYMBOL(bio_put);

/**
 * __bio_clone_fast - clone a bio that shares the original bio's biovec
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not the actual data
 * it points to. Reference count of returned bio will be one.
 *
 * Caller must ensure that @bio_src is not freed before @bio.
 */
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
        /* a destination from a bio_set must not carry a bvec pool index */
        BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));

        /* share the data: same bvec table, same iterator position */
        bio->bi_io_vec = bio_src->bi_io_vec;
        bio->bi_iter = bio_src->bi_iter;

        /*
         * most users will be overriding ->bi_disk with a new target,
         * so we don't set nor calculate new physical/hw segment counts here
         */
        bio->bi_disk = bio_src->bi_disk;
        bio->bi_partno = bio_src->bi_partno;

        bio->bi_opf = bio_src->bi_opf;
        bio->bi_ioprio = bio_src->bi_ioprio;
        bio->bi_write_hint = bio_src->bi_write_hint;

        bio_set_flag(bio, BIO_CLONED);
        if (bio_flagged(bio_src, BIO_THROTTLED))
                bio_set_flag(bio, BIO_THROTTLED);

        bio_clone_blkg_association(bio, bio_src);
        blkcg_bio_issue_init(bio);
}
EXPORT_SYMBOL(__bio_clone_fast);

/**
 * bio_clone_fast - clone a bio that shares the original bio's biovec
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 * @bs: bio_set to allocate from
 *
 * Like __bio_clone_fast, only also allocates the returned bio. Returns
 * NULL when the allocation fails or when cloning the crypt context or the
 * integrity payload fails; in the latter cases the new bio is put again.
 */
struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
{
        struct bio *clone = bio_alloc_bioset(gfp_mask, 0, bs);

        if (!clone)
                return NULL;

        __bio_clone_fast(clone, bio);

        if (bio_crypt_clone(clone, bio, gfp_mask) < 0 ||
            (bio_integrity(bio) &&
             bio_integrity_clone(clone, bio, gfp_mask) < 0)) {
                bio_put(clone);
                return NULL;
        }

        return clone;
}
EXPORT_SYMBOL(bio_clone_fast);

/*
 * Return what disk_name() produces for the disk and partition number
 * recorded in @bio, handing it @buf as its buffer argument.
 */
const char *bio_devname(struct bio *bio, char *buf)
{
        const char *name = disk_name(bio->bi_disk, bio->bi_partno, buf);

        return name;
}
EXPORT_SYMBOL(bio_devname);

/*
 * Decide whether data at @page + @off can be appended to the segment @bv.
 * The data must start at the physical address right after the end of @bv
 * and, in a Xen domain, pass xen_biovec_phys_mergeable(). *@same_page is
 * only written once those checks have passed; it tells whether the last
 * byte of @bv lies in @page itself. @len is not used by the checks.
 */
static inline bool page_is_mergeable(const struct bio_vec *bv,
                struct page *page, unsigned int len, unsigned int off,
                bool *same_page)
{
        size_t seg_end = bv->bv_offset + bv->bv_len;
        phys_addr_t last_byte = page_to_phys(bv->bv_page) + seg_end - 1;
        phys_addr_t new_base = page_to_phys(page);

        /* not physically contiguous with the end of the segment */
        if (last_byte + 1 != new_base + off)
                return false;
        if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
                return false;

        if ((last_byte & PAGE_MASK) == new_base) {
                *same_page = true;
                return true;
        }

        *same_page = false;
        /* different page: the struct page pointers have to line up as well */
        return bv->bv_page + seg_end / PAGE_SIZE == page + off / PAGE_SIZE;
}

/*
 * Try to merge a page into the last segment of @bio while obeying the
 * hardware limits of @q: the merged segment must not cross the queue's
 * segment boundary and must respect the maximum segment size.  This is not
 * for normal read/write bios, but for passthrough or Zone Append operations
 * that we can't split.
 */
static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
                                 struct page *page, unsigned len,
                                 unsigned offset, bool *same_page)
{
        struct bio_vec *last = bio->bi_io_vec + (bio->bi_vcnt - 1);
        unsigned long boundary = queue_segment_boundary(q);
        phys_addr_t seg_start = page_to_phys(last->bv_page) + last->bv_offset;
        phys_addr_t new_end = page_to_phys(page) + offset + len - 1;
        bool crosses = (seg_start | boundary) != (new_end | boundary);

        if (crosses || len > queue_max_segment_size(q) - last->bv_len)
                return false;

        return __bio_try_merge_page(bio, page, len, offset, same_page);
}

/**
 * bio_add_hw_page - attempt to add a page to a bio with hw constraints
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 * @max_sectors: maximum number of sectors that can be added
 * @same_page: return if the segment has been merged inside the same page
 *
 * Add a page to a bio while respecting the hardware max_sectors, max_segment
 * and gap limitations. Returns @len when the data was merged into the last
 * bvec or stored in a new one, and 0 when it could not be added.
 */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page)
{
        struct bio_vec *slot;

        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                return 0;

        /* the grown bio would exceed the sector limit */
        if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
                return 0;

        if (bio->bi_vcnt > 0) {
                if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
                        return len;

                /*
                 * If the queue doesn't support SG gaps and adding this segment
                 * would create a gap, disallow it.
                 */
                if (bvec_gap_to_prev(q, &bio->bi_io_vec[bio->bi_vcnt - 1],
                                     offset))
                        return 0;
        }

        /* a new bvec is needed: is there room in the bio and in the queue? */
        if (bio_full(bio, len) || bio->bi_vcnt >= queue_max_segments(q))
                return 0;

        slot = &bio->bi_io_vec[bio->bi_vcnt++];
        slot->bv_page = page;
        slot->bv_offset = offset;
        slot->bv_len = len;
        bio->bi_iter.bi_size += len;

        return len;
}

/**
 * bio_add_pc_page - attempt to add page to passthrough bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist, using the queue's
 * max_hw_sectors as the size limit. This can fail for a number of reasons,
 * such as the bio being full or target block device limitations. The target
 * block device must allow bio's up to PAGE_SIZE, so it is always possible to
 * add a single page to an empty bio.
 *
 * This should only be used by passthrough bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset)
{
        unsigned int limit = queue_max_hw_sectors(q);
        bool merged_in_page = false;    /* result is not reported to callers */

        return bio_add_hw_page(q, bio, page, len, offset, limit,
                        &merged_in_page);
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * __bio_try_merge_page - try appending data to an existing bvec.
 * @bio: destination bio
 * @page: start page to add
 * @len: length of the data to add
 * @off: offset of the data relative to @page
 * @same_page: return if the segment has been merged inside the same page
 *
 * Try to add the data at @page + @off to the last bvec of @bio.  This is a
 * useful optimisation for file systems with a block size smaller than the
 * page size.
 *
 * The merge is refused, and *@same_page reset to false, when it would push
 * the size of @bio beyond UINT_MAX. A cloned bio triggers a one-time warning
 * and is never merged into.
 *
 * Return %true on success or %false on failure.
 */
bool __bio_try_merge_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off, bool *same_page)
{
        struct bio_vec *tail;

        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                return false;

        /* an empty bio has no bvec to merge into */
        if (bio->bi_vcnt == 0)
                return false;

        tail = &bio->bi_io_vec[bio->bi_vcnt - 1];
        if (!page_is_mergeable(tail, page, len, off, same_page))
                return false;

        if (bio->bi_iter.bi_size > UINT_MAX - len) {
                *same_page = false;
                return false;
        }

        tail->bv_len += len;
        bio->bi_iter.bi_size += len;
        return true;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);

/**
 * __bio_add_page - add page(s) to a bio in a new segment
 * @bio: destination bio
 * @page: start page to add
 * @len: length of the data to add, may cross pages
 * @off: offset of the data relative to @page, may cross pages
 *
 * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
 * that @bio has space for another bvec; a cloned or full bio only produces a
 * one-time warning here. BIO_WORKINGSET is set on @bio when @page is a
 * workingset page and the flag is not set yet.
 */
void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off)
{
        struct bio_vec *slot;

        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
        WARN_ON_ONCE(bio_full(bio, len));

        slot = bio->bi_io_vec + bio->bi_vcnt;
        slot->bv_page = page;
        slot->bv_len = len;
        slot->bv_offset = off;

        bio->bi_vcnt++;
        bio->bi_iter.bi_size += len;

        if (bio_flagged(bio, BIO_WORKINGSET))
                return;
        if (unlikely(PageWorkingset(page)))
                bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);

/**
 * bio_add_page - attempt to add page(s) to bio
 * @bio: destination bio
 * @page: start page to add
 * @len: vec entry length, may cross pages
 * @offset: vec entry offset relative to @page, may cross pages
 *
 * Attempt to add page(s) to the bio_vec maplist: first by merging the data
 * into the last bvec, then by placing it in a new bvec. Returns @len on
 * success and 0 when the data could not be merged and bio_full() reports
 * that there is no room for it.
 */
int bio_add_page(struct bio *bio, struct page *page,
                 unsigned int len, unsigned int offset)
{
        bool same_page = false;

        if (__bio_try_merge_page(bio, page, len, offset, &same_page))
                return len;

        if (bio_full(bio, len))
                return 0;

        __bio_add_page(bio, page, len, offset);
        return len;
}
EXPORT_SYMBOL(bio_add_page);

/*
 * Drop one page reference for every segment of @bio, marking each page
 * dirty first when @mark_dirty is set. Nothing is done for a bio flagged
 * BIO_NO_PAGE_REF.
 */
void bio_release_pages(struct bio *bio, bool mark_dirty)
{
        struct bvec_iter_all iter_all;
        struct bio_vec *seg;

        if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
                bio_for_each_segment_all(seg, bio, iter_all) {
                        struct page *pg = seg->bv_page;

                        if (mark_dirty)
                                set_page_dirty_lock(pg);
                        put_page(pg);
                }
        }
}
EXPORT_SYMBOL_GPL(bio_release_pages);

/*
 * Add the rest of the current bvec of the bvec-type iterator @iter (limited
 * to iter->count bytes) to @bio and advance @iter by the amount added.
 * Returns 0 on success, -EINVAL if the iterator offset lies beyond the bvec
 * or bio_add_page() did not take the whole length.
 */
static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
{
        const struct bio_vec *src = iter->bvec;
        unsigned int chunk, start;
        size_t added;

        if (WARN_ON_ONCE(iter->iov_offset > src->bv_len))
                return -EINVAL;

        chunk = min_t(size_t, src->bv_len - iter->iov_offset, iter->count);
        start = src->bv_offset + iter->iov_offset;

        added = bio_add_page(bio, src->bv_page, chunk, start);
        if (likely(added == chunk)) {
                iov_iter_advance(iter, added);
                return 0;
        }

        return -EINVAL;
}

/*
 * Drop one reference on each page of @pages that backs @size bytes of data
 * starting at in-page offset @off (only the low, sub-page bits of @off are
 * taken into account).
 */
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
        size_t remaining = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
        struct page **cur = pages;

        while (remaining--)
                put_page(*cur++);
}

/*
 * Number of struct page pointers that fit into the storage of one
 * struct bio_vec. The get_pages helpers below use it to place a temporary
 * page pointer array inside the still unused part of a bio's bvec table.
 */
#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Pins pages from *iter and appends them to @bio's bvec array. The
 * pages will have to be released using put_page() when done.
 * For multi-segment *iter, this function only adds pages from the
 * next non-empty segment of the iov iterator.
 *
 * Returns 0 on success, in which case @iter has been advanced by the number
 * of bytes added. Returns the error from iov_iter_get_pages() (or -EFAULT if
 * it yielded nothing), or -EINVAL if @bio unexpectedly turns out to be full;
 * in that last case the references on the pages not yet added are dropped
 * and @iter is not advanced.
 */
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        /* the unused tail of the bvec table doubles as the page pointer array */
        struct page **pages = (struct page **)bv;
        bool same_page = false;
        ssize_t size, left;
        unsigned len, i;
        size_t offset;

        /*
         * Move page array up in the allocated memory for the bio vecs as far as
         * possible so that we can start filling biovecs from the beginning
         * without overwriting the temporary page array.
         */
        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

        /*
         * The result is used below as a byte count (@size) plus the offset of
         * the data within the first page (@offset).
         */
        size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;

        /* hand the pages to the bio one by one; @left counts remaining bytes */
        for (left = size, i = 0; left > 0; left -= len, i++) {
                struct page *page = pages[i];

                len = min_t(size_t, PAGE_SIZE - offset, left);

                if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
                        /*
                         * Merged into a bvec that already ends in this page:
                         * drop the reference just taken (presumably the bio
                         * already holds one for that page - confirm).
                         */
                        if (same_page)
                                put_page(page);
                } else {
                        if (WARN_ON_ONCE(bio_full(bio, len))) {
                                /* release this page and all that follow it */
                                bio_put_pages(pages + i, left, offset);
                                return -EINVAL;
                        }
                        __bio_add_page(bio, page, len, offset);
                }
                /* only the first page starts at a non-zero offset */
                offset = 0;
        }

        iov_iter_advance(iter, size);
        return 0;
}

/*
 * Counterpart of __bio_iov_iter_get_pages() that adds the pages with
 * bio_add_hw_page(), limited by the queue's max_zone_append_sectors.
 *
 * Returns 0 when all pages obtained from @iter were added. Returns the error
 * from iov_iter_get_pages() (or -EFAULT if it yielded nothing), or -EINVAL
 * as soon as one page cannot be added in full; the references on that page
 * and the following ones are then dropped. In every case past the
 * iov_iter_get_pages() call, @iter is advanced by the bytes actually added.
 */
static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
        struct request_queue *q = bio->bi_disk->queue;
        unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        /* the unused tail of the bvec table doubles as the page pointer array */
        struct page **pages = (struct page **)bv;
        ssize_t size, left;
        unsigned len, i;
        size_t offset;
        int ret = 0;

        /*
         * Move page array up in the allocated memory for the bio vecs as far as
         * possible so that we can start filling biovecs from the beginning
         * without overwriting the temporary page array.
         */
        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

        /*
         * The result is used below as a byte count (@size) plus the offset of
         * the data within the first page (@offset).
         */
        size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;

        for (left = size, i = 0; left > 0; left -= len, i++) {
                struct page *page = pages[i];
                bool same_page = false;

                len = min_t(size_t, PAGE_SIZE - offset, left);
                if (bio_add_hw_page(q, bio, page, len, offset,
                                max_append_sectors, &same_page) != len) {
                        /* release this page and all that follow it */
                        bio_put_pages(pages + i, left, offset);
                        ret = -EINVAL;
                        break;
                }
                /*
                 * Merged into a bvec that already ends in this page: drop the
                 * reference just taken (presumably the bio already holds one
                 * for that page - confirm).
                 */
                if (same_page)
                        put_page(page);
                /* only the first page starts at a non-zero offset */
                offset = 0;
        }

        /* @left still includes the page that failed, if any */
        iov_iter_advance(iter, size - left);
        return ret;
}

/**
 * bio_iov_iter_get_pages - add user or kernel pages to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be added
 *
 * This takes either an iterator pointing to user memory, or one pointing to
 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
 * map them into the kernel. On IO completion, the caller should put those
 * pages. If we're adding kernel pages, and the caller told us it's safe to
 * do so, we just have to add the pages to the bio directly. We don't grab an
 * extra reference to those pages (the user should already have that), and we
 * don't put the page on IO completion. The caller needs to check if the bio is
 * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
 * released.
 *
 * @bio must not contain any bvec yet, and a REQ_OP_ZONE_APPEND bio cannot be
 * filled from a BVEC iterator; both cases warn once and return -EINVAL.
 *
 * The function tries, but does not guarantee, to pin as many pages as
 * fit into the bio, or are requested in @iter, whatever is smaller. If
 * MM encounters an error pinning the requested pages, it stops. Error
 * is returned only if 0 pages could be pinned.
 */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
        const bool from_bvec = iov_iter_is_bvec(iter);
        int err;

        if (WARN_ON_ONCE(bio->bi_vcnt))
                return -EINVAL;

        for (;;) {
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        if (WARN_ON_ONCE(from_bvec))
                                return -EINVAL;
                        err = __bio_iov_append_get_pages(bio, iter);
                } else if (from_bvec) {
                        err = __bio_iov_bvec_add_pages(bio, iter);
                } else {
                        err = __bio_iov_iter_get_pages(bio, iter);
                }

                /* stop on error, when @iter is drained or when @bio is full */
                if (err || !iov_iter_count(iter) || bio_full(bio, 0))
                        break;
        }

        if (from_bvec)
                bio_set_flag(bio, BIO_NO_PAGE_REF);

        /* a partially filled bio counts as success */
        return bio->bi_vcnt ? 0 : err;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

/*
 * Completion handler installed by submit_bio_wait(): signal the completion
 * that was stored in ->bi_private.
 */
static void submit_bio_wait_endio(struct bio *bio)
{
        struct completion *waiter = bio->bi_private;

        complete(waiter);
}

/**
 * submit_bio_wait - submit a bio, and wait until it completes
 * @bio: The &struct bio which describes the I/O
 *
 * Simple wrapper around submit_bio(). The bio is marked REQ_SYNC and its
 * ->bi_private and ->bi_end_io are overwritten for the duration of the wait.
 * Returns 0 on success, or the error from bio_endio() on failure.
 *
 * WARNING: Unlike to how submit_bio() is usually used, this function does not
 * result in bio reference to be consumed. The caller must drop the reference
 * on his own.
 */
int submit_bio_wait(struct bio *bio)
{
        DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
        unsigned long hang_check;

        bio->bi_private = &done;
        bio->bi_end_io = submit_bio_wait_endio;
        bio->bi_opf |= REQ_SYNC;
        submit_bio(bio);

        hang_check = sysctl_hung_task_timeout_secs;
        if (!hang_check) {
                wait_for_completion_io(&done);
        } else {
                /*
                 * Prevent hang_check timer from firing at us during very long
                 * I/O: wait in slices and simply start over on each timeout.
                 */
                unsigned long slice = hang_check * (HZ/2);

                while (!wait_for_completion_io_timeout(&done, slice))
                        ;
        }

        return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);

/**
 * bio_advance - increment/complete a bio by some number of bytes
 * @bio:        bio to advance
 * @bytes:      number of bytes to complete
 *
 * Advances the integrity payload of @bio (if it has one), its crypt context
 * and finally its own iterator by @bytes.
 *
 * @bio will then represent the remaining, uncompleted portion of the io.
 */
void bio_advance(struct bio *bio, unsigned bytes)
{
        struct bvec_iter *pos = &bio->bi_iter;

        if (bio_integrity(bio))
                bio_integrity_advance(bio, bytes);
        bio_crypt_advance(bio, bytes);

        bio_advance_iter(bio, pos, bytes);
}
EXPORT_SYMBOL(bio_advance);

void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                        struct bio *src, struct bvec_iter *src_iter)
{
        struct bio_vec src_bv, dst_bv;
        void *src_p, *dst_p;
        unsigned bytes;

        while (src_iter->bi_size && dst_iter->bi_size) {
                src_bv = bio_iter_iovec(src, *src_iter);
                dst_bv = bio_iter_iovec(dst, *dst_iter);

                bytes = min(src_bv.bv_len, dst_bv.bv_len);

                src_p = kmap_atomic(src_bv.bv_page);
                dst_p = kmap_atomic(dst_bv.bv_page);

                memcpy(dst_p + dst_bv.bv_offset,
                       src_p + src_bv.bv_offset,
                       bytes);

                kunmap_atomic(dst_p);
                kunmap_atomic(src_p);

                flush_dcache_page(dst_bv.bv_page);

                bio_advance_iter(src, src_iter, bytes);
                bio_advance_iter(dst, dst_iter, bytes);
        }
}
EXPORT_SYMBOL(bio_copy_data_iter);

/**
 * bio_copy_data - copy contents of data buffers from one bio to another
 * @src: source bio
 * @dst: destination bio
 *
 * Stops when it reaches the end of either @src or @dst - that is, copies
 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
 */
void bio_copy_data(struct bio *dst, struct bio *src)
{
        struct bvec_iter src_iter = src->bi_iter;
        struct bvec_iter dst_iter = dst->bi_iter;

        bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
EXPORT_SYMBOL(bio_copy_data);

/**
 * bio_list_copy_data - copy contents of data buffers from one chain of bios to
 * another
 * @src: source bio list
 * @dst: destination bio list
 *
 * Stops when it reaches the end of either the @src list or @dst list - that is,
 * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
 * bios).
 */
void bio_list_copy_data(struct bio *dst, struct bio *src)
{
        struct bvec_iter src_iter = src->bi_iter;
        struct bvec_iter dst_iter = dst->bi_iter;

        while (1) {
                if (!src_iter.bi_size) {
                        src = src->bi_next;
                        if (!src)
                                break;

                        src_iter = src->bi_iter;
                }

                if (!dst_iter.bi_size) {
                        dst = dst->bi_next;
                        if (!dst)
                                break;

                        dst_iter = dst->bi_iter;
                }

                bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
        }
}
EXPORT_SYMBOL(bio_list_copy_data);

void bio_free_pages(struct bio *bio)
{
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bvec, bio, iter_all)
                __free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.   If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bvec, bio, iter_all) {
                set_page_dirty_lock(bvec->bv_page);
        }
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one put_page() against each page and will run one
 * bio_put() against the BIO.
 */

/* work function that processes the bios queued on bio_dirty_list */
static void bio_dirty_fn(struct work_struct *work);

/* work item scheduled by bio_check_pages_dirty() to run bio_dirty_fn() */
static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
/* protects bio_dirty_list */
static DEFINE_SPINLOCK(bio_dirty_lock);
/* bios awaiting bio_dirty_fn(), singly linked through ->bi_private */
static struct bio *bio_dirty_list;

/*
 * This runs in process context.
 *
 * Takes over the whole list of deferred bios and, for each of them, marks
 * its pages dirty while releasing them and then puts the bio.
 */
static void bio_dirty_fn(struct work_struct *work)
{
        struct bio *pending, *bio;

        /* detach the complete list while holding the lock */
        spin_lock_irq(&bio_dirty_lock);
        pending = bio_dirty_list;
        bio_dirty_list = NULL;
        spin_unlock_irq(&bio_dirty_lock);

        while (pending) {
                bio = pending;
                /* fetch the link before the bio is put */
                pending = bio->bi_private;

                bio_release_pages(bio, true);
                bio_put(bio);
        }
}

/*
 * If every page of @bio is still dirty, release the pages and put the bio
 * right here. Otherwise queue the bio on bio_dirty_list and schedule
 * bio_dirty_work, which re-dirties and releases the pages later.
 */
void bio_check_pages_dirty(struct bio *bio)
{
        struct bvec_iter_all iter_all;
        struct bio_vec *seg;
        unsigned long irq_flags;
        bool all_dirty = true;

        bio_for_each_segment_all(seg, bio, iter_all) {
                if (!PageDirty(seg->bv_page)) {
                        all_dirty = false;
                        break;
                }
        }

        if (all_dirty) {
                bio_release_pages(bio, false);
                bio_put(bio);
                return;
        }

        /* push the bio onto the head of the deferred list */
        spin_lock_irqsave(&bio_dirty_lock, irq_flags);
        bio->bi_private = bio_dirty_list;
        bio_dirty_list = bio;
        spin_unlock_irqrestore(&bio_dirty_lock, irq_flags);

        schedule_work(&bio_dirty_work);
}

/*
 * Tell whether this completion is the final one for @bio. A bio without
 * BIO_CHAIN is always done; a chained bio is done once ->__bi_remaining
 * drops to zero, at which point BIO_CHAIN is cleared.
 */
static inline bool bio_remaining_done(struct bio *bio)
{
        /*
         * If we're not chaining, then ->__bi_remaining is always 1 and
         * we always end io on the first invocation.
         */
        if (!bio_flagged(bio, BIO_CHAIN))
                return true;

        BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);

        /* other completions are still outstanding */
        if (!atomic_dec_and_test(&bio->__bi_remaining))
                return false;

        bio_clear_flag(bio, BIO_CHAIN);
        return true;
}

/**
 * bio_endio - end I/O on a bio
 * @bio:        bio
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
 *   way to end I/O on a bio. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io function.
 *
 *   bio_endio() can be called several times on a bio that has been chained
 *   using bio_chain().  The ->bi_end_io() function will only be called the
 *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
 *   generated if BIO_TRACE_COMPLETION is set.
 **/
void bio_endio(struct bio *bio)
{
again:
        /* a chained bio with completions still outstanding is not ended yet */
        if (!bio_remaining_done(bio))
                return;
        /*
         * bio_integrity_endio() returned false: stop here (presumably the
         * integrity code finishes the completion later - confirm).
         */
        if (!bio_integrity_endio(bio))
                return;

        if (bio->bi_disk && bio_flagged(bio, BIO_TRACKED))
                rq_qos_done_bio(bio->bi_disk->queue, bio);

        /*
         * Need to have a real endio function for chained bios, otherwise
         * various corner cases will break (like stacking block devices that
         * save/restore bi_end_io) - however, we want to avoid unbounded
         * recursion and blowing the stack. Tail call optimization would
         * handle this, but compiling with frame pointers also disables
         * gcc's sibling call optimization.
         */
        if (bio->bi_end_io == bio_chain_endio) {
                /* continue with the bio returned by __bio_chain_endio() */
                bio = __bio_chain_endio(bio);
                goto again;
        }

        /* emit the completion trace event at most once per bio */
        if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
                trace_block_bio_complete(bio->bi_disk->queue, bio);
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
        }

        blk_throtl_bio_endio(bio);
        /* release cgroup info */
        bio_uninit(bio);
        /* a bio without an end_io function simply ends here */
        if (bio->bi_end_io)
                bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);

/**
 * bio_split - split a bio
 * @bio:        bio to split
 * @sectors:    number of sectors to split from the front of @bio
 * @gfp:        gfp mask
 * @bs:         bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors. @sectors must be
 * positive and smaller than the size of @bio. Returns NULL if the new bio
 * cannot be allocated or if @bio is a zone append command.
 *
 * Unless this is a discard request the newly allocated bio will point
 * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
 * neither @bio nor @bs are freed before the split bio.
 */
struct bio *bio_split(struct bio *bio, int sectors,
                      gfp_t gfp, struct bio_set *bs)
{
        struct bio *front;

        BUG_ON(sectors <= 0);
        BUG_ON(sectors >= bio_sectors(bio));

        /* Zone append commands cannot be split */
        if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
                return NULL;

        front = bio_clone_fast(bio, gfp, bs);
        if (front) {
                /* the clone covers only the leading part ... */
                front->bi_iter.bi_size = sectors << 9;
                if (bio_integrity(front))
                        bio_integrity_trim(front);

                /* ... and the original skips over exactly that part */
                bio_advance(bio, front->bi_iter.bi_size);

                if (bio_flagged(bio, BIO_TRACE_COMPLETION))
                        bio_set_flag(front, BIO_TRACE_COMPLETION);
        }

        return front;
}
EXPORT_SYMBOL(bio_split);

/**
 * bio_trim - trim a bio
 * @bio:        bio to trim
 * @offset:     number of sectors to trim from the front of @bio
 * @size:       size we want to trim @bio to, in sectors
 *
 * 'bio' is a cloned bio which we need to trim to match the given offset and
 * size. Nothing is changed when @offset is zero and @size already equals the
 * size of @bio.
 */
void bio_trim(struct bio *bio, int offset, int size)
{
        int nbytes = size << 9;

        if (offset != 0 || nbytes != bio->bi_iter.bi_size) {
                bio_advance(bio, offset << 9);
                bio->bi_iter.bi_size = nbytes;

                if (bio_integrity(bio))
                        bio_integrity_trim(bio);
        }
}
EXPORT_SYMBOL_GPL(bio_trim);

/*
 * Set up @pool as a slab-backed mempool holding @pool_entries biovec
 * arrays.  No new slab is created: the pool draws from the global biovec
 * slab stored at index BVEC_POOL_MAX of bvec_slabs.
 *
 * Returns whatever mempool_init_slab_pool() returns.
 */
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
        return mempool_init_slab_pool(pool, pool_entries,
                                      bvec_slabs[BVEC_POOL_MAX].slab);
}

/*
 * bioset_exit - exit a bioset initialized with bioset_init()
 *
 * Destroys the rescuer workqueue if one exists, exits the bio and bvec
 * mempools, calls bioset_integrity_free() and releases the bio slab through
 * bio_put_slab().  The workqueue and slab pointers are NULL afterwards.
 *
 * May be called on a zeroed but uninitialized bioset (i.e. allocated with
 * kzalloc()).
 */
void bioset_exit(struct bio_set *bs)
{
        if (bs->rescue_workqueue) {
                destroy_workqueue(bs->rescue_workqueue);
                bs->rescue_workqueue = NULL;
        }

        mempool_exit(&bs->bio_pool);
        mempool_exit(&bs->bvec_pool);
        bioset_integrity_free(bs);

        if (bs->bio_slab) {
                bio_put_slab(bs);
                bs->bio_slab = NULL;
        }
}
EXPORT_SYMBOL(bioset_exit);

/**
 * bioset_init - Initialize a bio_set
 * @bs:                pool to initialize
 * @pool_size:        Number of bio and bio_vecs to cache in the mempool
 * @front_pad:        Number of bytes to allocate in front of the returned bio
 * @flags:        Flags to modify behavior, currently %BIOSET_NEED_BVECS
 *              and %BIOSET_NEED_RESCUER
 *
 * Description:
 *    Prepares a bio_set for use with bio_alloc_bioset().  @front_pad lets
 *    the caller reserve a number of bytes in front of every bio, which is
 *    handy for embedding the bio in another structure instead of allocating
 *    extra data to go with it.  Note that the bio must be embedded at the
 *    END of that structure always, or things will break badly.
 *    With %BIOSET_NEED_BVECS in @flags a separate pool for iovecs is set up;
 *    that pool is not needed e.g. for bio_clone_fast().
 *    With %BIOSET_NEED_RESCUER a workqueue is created which can be used to
 *    dispatch queued requests when the mempool runs out of space.
 *
 * Return: 0 on success, -ENOMEM if the slab, a mempool or the workqueue
 * could not be set up.  Once the slab has been obtained, a later failure
 * is undone with bioset_exit() before returning.
 */
int bioset_init(struct bio_set *bs,
                unsigned int pool_size,
                unsigned int front_pad,
                int flags)
{
        /* Each slab object is front_pad + back_pad bytes, see below. */
        unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);

        bs->front_pad = front_pad;

        /* Rescuer bookkeeping is initialized whether or not it is used. */
        spin_lock_init(&bs->rescue_lock);
        bio_list_init(&bs->rescue_list);
        INIT_WORK(&bs->rescue_work, bio_alloc_rescue);

        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab)
                return -ENOMEM;

        if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
                goto err_exit;

        if (flags & BIOSET_NEED_BVECS) {
                if (biovec_init_pool(&bs->bvec_pool, pool_size))
                        goto err_exit;
        }

        if (flags & BIOSET_NEED_RESCUER) {
                bs->rescue_workqueue = alloc_workqueue("bioset",
                                                       WQ_MEM_RECLAIM, 0);
                if (!bs->rescue_workqueue)
                        goto err_exit;
        }

        return 0;

err_exit:
        bioset_exit(bs);
        return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);

/*
 * Initialize @bs with the same settings as the existing bio_set @src.
 *
 * The flags are reconstructed from @src's state: a bvec pool with a
 * non-zero min_nr means %BIOSET_NEED_BVECS, an existing rescuer workqueue
 * means %BIOSET_NEED_RESCUER.  Pool size and front pad are copied as is.
 * Returns the result of bioset_init().
 */
int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
{
        int flags = (src->bvec_pool.min_nr ? BIOSET_NEED_BVECS : 0) |
                    (src->rescue_workqueue ? BIOSET_NEED_RESCUER : 0);

        return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
}
EXPORT_SYMBOL(bioset_init_from_src);

/*
 * Create the slab cache for every bvec_slabs entry at boot.  Entries with
 * at most BIO_INLINE_VECS vecs get no cache and have their slab pointer
 * set to NULL.
 */
static void __init biovec_init_slabs(void)
{
        int idx;

        for (idx = 0; idx < BVEC_POOL_NR; idx++) {
                struct biovec_slab *entry = &bvec_slabs[idx];

                if (entry->nr_vecs > BIO_INLINE_VECS) {
                        int size = entry->nr_vecs * sizeof(struct bio_vec);

                        /* SLAB_PANIC: creation failure is fatal. */
                        entry->slab = kmem_cache_create(entry->name, size, 0,
                                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
                } else {
                        entry->slab = NULL;
                }
        }
}

/*
 * Boot-time setup of the bio layer: allocate the initial bio_slabs table,
 * initialize bio integrity support and the biovec slabs, then set up the
 * global fs_bio_set and its integrity pool.  Every failure panics.
 */
static int __init init_bio(void)
{
        /* Compile-time check only; generates no code. */
        BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);

        bio_slab_nr = 0;
        bio_slab_max = 2;
        bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
                            GFP_KERNEL);
        if (!bio_slabs)
                panic("bio: can't allocate bios\n");

        bio_integrity_init();
        biovec_init_slabs();

        if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS) != 0)
                panic("bio: can't allocate bios\n");

        if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE) != 0)
                panic("bio: can't create integrity pool\n");

        return 0;
}
subsys_initcall(init_bio);







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include "../blk.h"

/*
 * add_gd_partition adds a partition's details to the device's partition
 * description.
 */
/* State handed to every partition-table detection routine declared below. */
struct parsed_partitions {
        struct block_device *bdev;        /* NOTE(review): presumably the device being scanned */
        char name[BDEVNAME_SIZE];        /* prefix put_partition() prints before the slot number */
        struct {
                sector_t from;                /* recorded by put_partition() */
                sector_t size;                /* recorded by put_partition() */
                int flags;
                bool has_info;                /* NOTE(review): presumably "info is valid" - confirm */
                struct partition_meta_info info;
        } *parts;                        /* array indexed by slot number */
        int next;
        int limit;                        /* put_partition() ignores slots >= limit */
        bool access_beyond_eod;
        char *pp_buf;                        /* text buffer; put_partition() appends with a PAGE_SIZE bound */
};

/* Handle for a sector that was read; holds the page put_dev_sector() releases. */
typedef struct {
        struct page *v;
} Sector;

/*
 * Defined elsewhere.  NOTE(review): presumably reads sector @n of the device
 * described by @state, stores the backing page in @p and returns a pointer
 * to the data - confirm against the implementation.
 */
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p);
/* Drop the reference on the page held by the sector handle @p. */
static inline void put_dev_sector(Sector p)
{
        struct page *page = p.v;

        put_page(page);
}

/*
 * Record partition slot @n as starting at sector @from and spanning @size
 * sectors, and append " <name><n>" to the state's text buffer.  Slots at or
 * beyond p->limit are silently ignored.
 */
static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
        /* Presumably: leading space + name + decimal slot number + NUL. */
        char label[1 + BDEVNAME_SIZE + 10 + 1];

        if (n >= p->limit)
                return;

        p->parts[n].from = from;
        p->parts[n].size = size;

        snprintf(label, sizeof(label), " %s%d", p->name, n);
        strlcat(p->pp_buf, label, PAGE_SIZE);
}

/*
 * detection routines go here in alphabetical order; each one takes the
 * shared parse state and is implemented in its own source file:
 */
int adfspart_check_ADFS(struct parsed_partitions *state);
int adfspart_check_CUMANA(struct parsed_partitions *state);
int adfspart_check_EESOX(struct parsed_partitions *state);
int adfspart_check_ICS(struct parsed_partitions *state);
int adfspart_check_POWERTEC(struct parsed_partitions *state);
int aix_partition(struct parsed_partitions *state);
int amiga_partition(struct parsed_partitions *state);
int atari_partition(struct parsed_partitions *state);
int cmdline_partition(struct parsed_partitions *state);
int efi_partition(struct parsed_partitions *state);
int ibm_partition(struct parsed_partitions *state);
int karma_partition(struct parsed_partitions *state);
int ldm_partition(struct parsed_partitions *state);
int mac_partition(struct parsed_partitions *state);
int msdos_partition(struct parsed_partitions *state);
int osf_partition(struct parsed_partitions *state);
int sgi_partition(struct parsed_partitions *state);
int sun_partition(struct parsed_partitions *state);
int sysv68_partition(struct parsed_partitions *state);
int ultrix_partition(struct parsed_partitions *state);







































    1 












































































    1 

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

/*
 * show_fl_flags - for use in TP_printk(): renders the FL_* bits set in @val
 * as their names, joined with "|", via __print_flags().
 */
#define show_fl_flags(val)                                                \
        __print_flags(val, "|",                                         \
                { FL_POSIX,                "FL_POSIX" },                        \
                { FL_FLOCK,                "FL_FLOCK" },                        \
                { FL_DELEG,                "FL_DELEG" },                        \
                { FL_ACCESS,                "FL_ACCESS" },                        \
                { FL_EXISTS,                "FL_EXISTS" },                        \
                { FL_LEASE,                "FL_LEASE" },                        \
                { FL_CLOSE,                "FL_CLOSE" },                        \
                { FL_SLEEP,                "FL_SLEEP" },                        \
                { FL_DOWNGRADE_PENDING,        "FL_DOWNGRADE_PENDING" },        \
                { FL_UNLOCK_PENDING,        "FL_UNLOCK_PENDING" },                \
                { FL_OFDLCK,                "FL_OFDLCK" },                        \
                { FL_RECLAIM,                "FL_RECLAIM"})

/*
 * show_fl_type - for use in TP_printk(): maps the lock type @val to the
 * name F_RDLCK, F_WRLCK or F_UNLCK via __print_symbolic().
 */
#define show_fl_type(val)                                \
        __print_symbolic(val,                                \
                        { F_RDLCK, "F_RDLCK" },                \
                        { F_WRLCK, "F_WRLCK" },                \
                        { F_UNLCK, "F_UNLCK" })

/*
 * locks_get_lock_context - records the device and inode number of @inode,
 * the lock @type and the lock context pointer @ctx.
 */
TRACE_EVENT(locks_get_lock_context,
        TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

        TP_ARGS(inode, type, ctx),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                /* the int @type is stored narrowed to one byte */
                __field(unsigned char, type)
                __field(struct file_lock_context *, ctx)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->type = type;
                __entry->ctx = ctx;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p",
                  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                  __entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

/*
 * filelock_lock - event class shared by the posix_lock_inode, fcntl_setlk,
 * locks_remove_posix and flock_lock_inode events defined below.
 *
 * Records the device and inode number of @inode, a snapshot of the fields
 * of @fl and the result code @ret.  @fl may be NULL; every field derived
 * from it is then recorded as 0 / NULL.  @inode is always dereferenced.
 */
DECLARE_EVENT_CLASS(filelock_lock,
        TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

        TP_ARGS(inode, fl, ret),

        TP_STRUCT__entry(
                __field(struct file_lock *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock *, fl_blocker)
                __field(fl_owner_t, fl_owner)
                __field(unsigned int, fl_pid)
                __field(unsigned int, fl_flags)
                __field(unsigned char, fl_type)
                __field(loff_t, fl_start)
                __field(loff_t, fl_end)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->fl_blocker = fl ? fl->fl_blocker : NULL;
                __entry->fl_owner = fl ? fl->fl_owner : NULL;
                __entry->fl_pid = fl ? fl->fl_pid : 0;
                __entry->fl_flags = fl ? fl->fl_flags : 0;
                __entry->fl_type = fl ? fl->fl_type : 0;
                __entry->fl_start = fl ? fl->fl_start : 0;
                __entry->fl_end = fl ? fl->fl_end : 0;
                __entry->ret = ret;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->fl_blocker, __entry->fl_owner,
                __entry->fl_pid, show_fl_flags(__entry->fl_flags),
                show_fl_type(__entry->fl_type),
                __entry->fl_start, __entry->fl_end, __entry->ret)
);

/* Events of class filelock_lock; they differ only in name. */
DEFINE_EVENT(filelock_lock, posix_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, fcntl_setlk,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, locks_remove_posix,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, flock_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

/*
 * filelock_lease - event class shared by the break_lease_noblock,
 * break_lease_block, break_lease_unblock, generic_delete_lease and
 * time_out_leases events defined below.
 *
 * Records the device and inode number of @inode and a snapshot of the lease
 * fields of @fl, including its break and downgrade times.  @fl may be NULL;
 * every field derived from it is then recorded as 0 / NULL.
 */
DECLARE_EVENT_CLASS(filelock_lease,
        TP_PROTO(struct inode *inode, struct file_lock *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(struct file_lock *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock *, fl_blocker)
                __field(fl_owner_t, fl_owner)
                __field(unsigned int, fl_flags)
                __field(unsigned char, fl_type)
                __field(unsigned long, fl_break_time)
                __field(unsigned long, fl_downgrade_time)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->fl_blocker = fl ? fl->fl_blocker : NULL;
                __entry->fl_owner = fl ? fl->fl_owner : NULL;
                __entry->fl_flags = fl ? fl->fl_flags : 0;
                __entry->fl_type = fl ? fl->fl_type : 0;
                __entry->fl_break_time = fl ? fl->fl_break_time : 0;
                __entry->fl_downgrade_time = fl ? fl->fl_downgrade_time : 0;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->fl_blocker, __entry->fl_owner,
                show_fl_flags(__entry->fl_flags),
                show_fl_type(__entry->fl_type),
                __entry->fl_break_time, __entry->fl_downgrade_time)
);

/* Events of class filelock_lease; they differ only in name. */
DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl),
                TP_ARGS(inode, fl));

/*
 * generic_add_lease - records the device and inode number of @inode, its
 * current write, read and reference counts, and the owner, flags and type
 * of the lease @fl.
 *
 * Both @inode and @fl are dereferenced without a NULL check, so callers
 * must pass valid pointers.
 */
TRACE_EVENT(generic_add_lease,
        TP_PROTO(struct inode *inode, struct file_lock *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(int, wcount)
                __field(int, rcount)
                __field(int, icount)
                __field(dev_t, s_dev)
                __field(fl_owner_t, fl_owner)
                __field(unsigned int, fl_flags)
                __field(unsigned char, fl_type)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                /* the three counters are sampled individually, not as one atomic snapshot */
                __entry->wcount = atomic_read(&inode->i_writecount);
                __entry->rcount = atomic_read(&inode->i_readcount);
                __entry->icount = atomic_read(&inode->i_count);
                __entry->fl_owner = fl->fl_owner;
                __entry->fl_flags = fl->fl_flags;
                __entry->fl_type = fl->fl_type;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->wcount, __entry->rcount,
                __entry->icount, __entry->fl_owner,
                show_fl_flags(__entry->fl_flags),
                show_fl_type(__entry->fl_type))
);

/*
 * leases_conflict - records the @conflict verdict passed in by the caller
 * together with the pointer, flags and type of both the existing @lease and
 * the @breaker lock.
 *
 * @lease and @breaker are dereferenced without a NULL check, so callers
 * must pass valid pointers.
 */
TRACE_EVENT(leases_conflict,
        TP_PROTO(bool conflict, struct file_lock *lease, struct file_lock *breaker),

        TP_ARGS(conflict, lease, breaker),

        TP_STRUCT__entry(
                __field(void *, lease)
                __field(void *, breaker)
                __field(unsigned int, l_fl_flags)
                __field(unsigned int, b_fl_flags)
                __field(unsigned char, l_fl_type)
                __field(unsigned char, b_fl_type)
                __field(bool, conflict)
        ),

        TP_fast_assign(
                __entry->lease = lease;
                __entry->l_fl_flags = lease->fl_flags;
                __entry->l_fl_type = lease->fl_type;
                __entry->breaker = breaker;
                __entry->b_fl_flags = breaker->fl_flags;
                __entry->b_fl_type = breaker->fl_type;
                __entry->conflict = conflict;
        ),

        TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s",
                __entry->conflict,
                __entry->lease,
                show_fl_flags(__entry->l_fl_flags),
                show_fl_type(__entry->l_fl_type),
                __entry->breaker,
                show_fl_flags(__entry->b_fl_flags),
                show_fl_type(__entry->b_fl_type))
);

#endif /* _TRACE_FILELOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>












































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KERNEL_H
#define _LINUX_KERNEL_H


#include <stdarg.h>
#include <linux/limits.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/kstrtox.h>
#include <linux/log2.h>
#include <linux/minmax.h>
#include <linux/typecheck.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <asm/byteorder.h>
#include <asm/div64.h>
#include <uapi/linux/kernel.h>

/* Magic marker value; its users are outside this header. */
#define STACK_MAGIC        0xdeadbeef

/**
 * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value
 * @x: value to repeat
 *
 * (~0ul / 0xff) is an unsigned long with every byte equal to 0x01, so the
 * multiplication copies @x into each byte of the result.
 *
 * NOTE: @x is not checked for > 0xff; larger values produce odd results.
 */
#define REPEAT_BYTE(x)        ((~0ul / 0xff) * (x))

/*
 * Alignment helpers.  @a is a power of 2 value.
 *
 * __ALIGN_KERNEL() and __ALIGN_KERNEL_MASK() are defined outside this
 * header; NOTE(review): presumably they round their first argument up to
 * the given alignment / mask - confirm in the uapi header.
 *
 * ALIGN_DOWN() is built on the same helper by first subtracting (a - 1)
 * from @x.  PTR_ALIGN() and PTR_ALIGN_DOWN() do the arithmetic on the
 * pointer cast to unsigned long and cast the result back to the pointer's
 * own type.  IS_ALIGNED() is true when all bits of @x below @a are zero.
 */
#define ALIGN(x, a)                __ALIGN_KERNEL((x), (a))
#define ALIGN_DOWN(x, a)        __ALIGN_KERNEL((x) - ((a) - 1), (a))
#define __ALIGN_MASK(x, mask)        __ALIGN_KERNEL_MASK((x), (mask))
#define PTR_ALIGN(p, a)                ((typeof(p))ALIGN((unsigned long)(p), (a)))
#define PTR_ALIGN_DOWN(p, a)        ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
#define IS_ALIGNED(x, a)                (((x) & ((typeof(x))(a) - 1)) == 0)

/* generic data direction definitions */
#define READ                        0
#define WRITE                        1

/**
 * ARRAY_SIZE - get the number of elements in array @arr
 * @arr: array to be sized
 *
 * The __must_be_array() term is added to the element count;
 * NOTE(review): presumably it evaluates to 0 and only exists to break the
 * build when @arr is a pointer rather than an array - defined elsewhere.
 */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))

/* Evaluates to @ptr when @cond is true and to NULL otherwise. */
#define PTR_IF(cond, ptr)        ((cond) ? (ptr) : NULL)

/*
 * Convert a u64 into a user-space pointer.  typecheck() insists that @x
 * really has type u64; the value is then cast through uintptr_t.
 */
#define u64_to_user_ptr(x) (                \
{                                        \
        typecheck(u64, (x));                \
        (void __user *)(uintptr_t)(x);        \
}                                        \
)

/*
 * This looks more complex than it should be. But we need to
 * get the type for the ~ right in round_down (it needs to be
 * as wide as the result!), and we want to evaluate the macro
 * arguments just once each.
 *
 * __round_mask() yields (y - 1) converted to the type of @x; @x itself is
 * only used inside __typeof__ and therefore not evaluated here.
 */
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
/**
 * round_up - round up to next specified power of 2
 * @x: the value to round
 * @y: multiple to round up to (must be a power of 2)
 *
 * Rounds @x up to next multiple of @y (which must be a power of 2).
 * To perform arbitrary rounding up, use roundup() below.
 */
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
/**
 * round_down - round down to a multiple of the specified power of 2
 * @x: the value to round
 * @y: multiple to round down to (must be a power of 2)
 *
 * Rounds @x down to next multiple of @y (which must be a power of 2).
 * To perform arbitrary rounding down, use rounddown() below.
 */
#define round_down(x, y) ((x) & ~__round_mask(x, y))

/* Type of member @m of the struct or union type @T; needs no object of type @T. */
#define typeof_member(T, m)        typeof(((T*)0)->m)

/* Alias for __KERNEL_DIV_ROUND_UP, which is defined outside this header. */
#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP

/*
 * Divide the value @ll, taken as unsigned long long, by @d using do_div()
 * on a temporary copy.  NOTE(review): relies on do_div() leaving the
 * quotient in its first argument - do_div() is defined elsewhere.
 */
#define DIV_ROUND_DOWN_ULL(ll, d) \
        ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })

/* As above, but adds (d - 1) to the dividend first so the result rounds up. */
#define DIV_ROUND_UP_ULL(ll, d) \
        DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d))

/* With 32-bit longs the do_div() based variant is used; otherwise plain DIV_ROUND_UP. */
#if BITS_PER_LONG == 32
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d)
#else
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d)
#endif

/**
 * roundup - round up to the next specified multiple
 * @x: the value to round up
 * @y: multiple to round up to
 *
 * Rounds @x up to next multiple of @y. If @y will always be a power
 * of 2, consider using the faster round_up().
 *
 * @y is copied into a local first, so it is evaluated only once.
 */
#define roundup(x, y) (                                        \
{                                                        \
        typeof(y) __y = y;                                \
        (((x) + (__y - 1)) / __y) * __y;                \
}                                                        \
)
/**
 * rounddown - round down to next specified multiple
 * @x: the value to round
 * @y: multiple to round down to
 *
 * Rounds @x down to next multiple of @y. If @y will always be a power
 * of 2, consider using the faster round_down().
 *
 * @x is copied into a local first, so it is evaluated only once.
 */
#define rounddown(x, y) (                                \
{                                                        \
        typeof(x) __x = (x);                                \
        __x - (__x % (y));                                \
}                                                        \
)

/*
 * Divide positive or negative dividend by positive or negative divisor
 * and round to closest integer. Result is undefined for negative
 * divisors if the dividend variable type is unsigned and for negative
 * dividends if the divisor variable type is unsigned.
 *
 * The "((type)-1) > 0" tests are true exactly when the type is unsigned.
 * Half the divisor is added when either type is unsigned or when both
 * operands lie on the same side of zero, and subtracted otherwise.
 */
#define DIV_ROUND_CLOSEST(x, divisor)(                        \
{                                                        \
        typeof(x) __x = x;                                \
        typeof(divisor) __d = divisor;                        \
        (((typeof(x))-1) > 0 ||                                \
         ((typeof(divisor))-1) > 0 ||                        \
         (((__x) > 0) == ((__d) > 0))) ?                \
                (((__x) + ((__d) / 2)) / (__d)) :        \
                (((__x) - ((__d) / 2)) / (__d));        \
}                                                        \
)
/*
 * Same as above but for u64 dividends. divisor must be a 32-bit
 * number.  Half the divisor is added unconditionally and the division is
 * carried out with do_div() on an unsigned long long temporary.
 */
#define DIV_ROUND_CLOSEST_ULL(x, divisor)(                \
{                                                        \
        typeof(divisor) __d = divisor;                        \
        unsigned long long _tmp = (x) + (__d) / 2;        \
        do_div(_tmp, __d);                                \
        _tmp;                                                \
}                                                        \
)

/*
 * Multiplies an integer by a fraction, while avoiding unnecessary
 * overflow or loss of precision.
 *
 * Computes x * numer / denom as (x / denom) * numer plus the contribution
 * of the remainder, (x % denom) * numer / denom.
 *
 * Every argument is evaluated exactly once, so expressions with side
 * effects are safe.  The temporaries carry a __mf_ prefix so that they
 * cannot capture a caller variable that happens to be named quot or rem.
 */
#define mult_frac(x, numer, denom)(                        \
{                                                        \
        __typeof__(x) __mf_x = (x);                        \
        __typeof__(numer) __mf_n = (numer);                \
        __typeof__(denom) __mf_d = (denom);                \
        __typeof__(__mf_x) __mf_quot = __mf_x / __mf_d;        \
        __typeof__(__mf_x) __mf_rem  = __mf_x % __mf_d;        \
        (__mf_quot * __mf_n) + ((__mf_rem * __mf_n) / __mf_d);        \
}                                                        \
)


/* Return address of the current function, as an unsigned long. */
#define _RET_IP_                (unsigned long)__builtin_return_address(0)
/* Address of a local label placed at the point of use, as an unsigned long. */
#define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })

/* Alias for do_div(), which is defined outside this header. */
#define sector_div(a, b) do_div(a, b)

/**
 * upper_32_bits - return bits 32-63 of a number
 * @n: the number we're accessing
 *
 * A basic shift-right of a 64- or 32-bit quantity.  Use this to suppress
 * the "right shift count >= width of type" warning when that quantity is
 * 32-bits.  This is why the shift is written as two shifts by 16.
 */
#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16))

/**
 * lower_32_bits - return bits 0-31 of a number
 * @n: the number we're accessing
 */
#define lower_32_bits(n) ((u32)((n) & 0xffffffff))

/* Forward declarations; the definitions live in other headers. */
struct completion;
struct pt_regs;
struct user;

/*
 * might_resched() calls _cond_resched() under CONFIG_PREEMPT_VOLUNTARY and
 * is a no-op otherwise.
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int _cond_resched(void);
# define might_resched() _cond_resched()
#else
# define might_resched() do { } while (0)
#endif

/*
 * Sleep-in-atomic debugging annotations.  With CONFIG_DEBUG_ATOMIC_SLEEP
 * they call out-of-line checkers; without it they shrink to might_resched()
 * or to nothing (see the #else branch).
 */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
extern void ___might_sleep(const char *file, int line, int preempt_offset);
extern void __might_sleep(const char *file, int line, int preempt_offset);
extern void __cant_sleep(const char *file, int line, int preempt_offset);

/**
 * might_sleep - annotation for functions that can sleep
 *
 * this macro will print a stack trace if it is executed in an atomic
 * context (spinlock, irq-handler, ...). Additional sections where blocking is
 * not allowed can be annotated with non_block_start() and non_block_end()
 * pairs.
 *
 * This is a useful debugging help to be able to catch problems early and not
 * be bitten later when the calling function happens to sleep when it is not
 * supposed to.
 */
# define might_sleep() \
        do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
/**
 * cant_sleep - annotation for functions that cannot sleep
 *
 * this macro will print a stack trace if it is executed with preemption enabled
 */
# define cant_sleep() \
        do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
/* Resets current->task_state_change to 0. */
# define sched_annotate_sleep()        (current->task_state_change = 0)
/**
 * non_block_start - annotate the start of section where sleeping is prohibited
 *
 * This is on behalf of the oom reaper, specifically when it is calling the mmu
 * notifiers. The problem is that if the notifier were to block on, for example,
 * mutex_lock() and if the process which holds that mutex were to perform a
 * sleeping memory allocation, the oom reaper is now blocked on completion of
 * that memory allocation. Other blocking calls like wait_event() pose similar
 * issues.
 */
# define non_block_start() (current->non_block_count++)
/**
 * non_block_end - annotate the end of section where sleeping is prohibited
 *
 * Closes a section opened by non_block_start().  Warns when the counter is
 * already zero, i.e. when there is no matching non_block_start().
 */
# define non_block_end() WARN_ON(current->non_block_count-- == 0)
#else
  /* Debugging disabled: empty stubs, and macros that do nothing beyond might_resched(). */
  static inline void ___might_sleep(const char *file, int line,
                                   int preempt_offset) { }
  static inline void __might_sleep(const char *file, int line,
                                   int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
# define non_block_end() do { } while (0)
#endif

/*
 * might_sleep_if - conditional might_sleep()
 * @cond: might_sleep() is only invoked when this evaluates to true.
 */
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)

/*
 * cant_migrate - without CONFIG_PREEMPT_RT this is an alias for cant_sleep();
 * with CONFIG_PREEMPT_RT it currently expands to an empty statement.
 */
#ifndef CONFIG_PREEMPT_RT
# define cant_migrate()                cant_sleep()
#else
  /* Placeholder for now */
# define cant_migrate()                do { } while (0)
#endif

/**
 * abs - return absolute value of an argument
 * @x: the value.  If it is unsigned type, it is converted to signed type first.
 *     char is treated as if it was signed (regardless of whether it really is)
 *     but the macro's return type is preserved as char.
 *
 * The type of @x is matched against long long, long, int, short and char
 * (signed or unsigned variants), then plain char. If none matches, the
 * expression is ((void)0), so using the result is a compile error.
 *
 * Return: an absolute value of x.
 */
#define abs(x)        __abs_choose_expr(x, long long,                                \
                __abs_choose_expr(x, long,                                \
                __abs_choose_expr(x, int,                                \
                __abs_choose_expr(x, short,                                \
                __abs_choose_expr(x, char,                                \
                __builtin_choose_expr(                                        \
                        __builtin_types_compatible_p(typeof(x), char),        \
                        (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
                        ((void)0)))))))

/*
 * One step of the abs() type dispatch: if typeof(@x) is compatible with
 * "signed @type" or "unsigned @type", copy @x into a "signed @type" temporary
 * (so @x is evaluated once) and yield its absolute value; otherwise yield
 * @other, which is the next step in the chain.
 */
#define __abs_choose_expr(x, type, other) __builtin_choose_expr(        \
        __builtin_types_compatible_p(typeof(x),   signed type) ||        \
        __builtin_types_compatible_p(typeof(x), unsigned type),                \
        ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)

/**
 * reciprocal_scale - "scale" a value into range [0, ep_ro)
 * @val: value
 * @ep_ro: right open interval endpoint
 *
 * Maps @val into the half-open range [0, @ep_ro) with a "reciprocal
 * multiplication": the 64-bit product @val * @ep_ro is formed and its upper
 * 32 bits are returned. This is useful, for example, for picking an index
 * into an array of @ep_ro elements. Think of it as a sort of modulus, except
 * that the result is not the remainder. Note that small values of @val
 * yield 0.
 *
 * Return: a result based on @val in interval [0, @ep_ro).
 */
static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
{
        u64 product = (u64)val * ep_ro;

        return (u32)(product >> 32);
}

/*
 * might_fault() forwards the call site (__FILE__, __LINE__) to
 * __might_fault() when CONFIG_MMU is set together with CONFIG_PROVE_LOCKING
 * or CONFIG_DEBUG_ATOMIC_SLEEP; in every other configuration it is an empty
 * inline function.
 */
#if defined(CONFIG_MMU) && \
        (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
#define might_fault() __might_fault(__FILE__, __LINE__)
void __might_fault(const char *file, int line);
#else
static inline void might_fault(void) { }
#endif

extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(int state);
__printf(1, 2)
void panic(const char *fmt, ...) __noreturn __cold;
void nmi_panic(struct pt_regs *regs, const char *msg);
void check_panic_on_warn(const char *origin);
extern void oops_enter(void);
extern void oops_exit(void);
extern bool oops_may_print(void);
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;

extern int num_to_str(char *buf, int size,
                      unsigned long long num, unsigned int width);

/* lib/printf utilities */

extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...);
extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list);
extern __printf(3, 4)
int snprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern __printf(3, 4)
int scnprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern __printf(2, 3) __malloc
char *kasprintf(gfp_t gfp, const char *fmt, ...);
extern __printf(2, 0) __malloc
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
extern __printf(2, 0)
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);

extern __scanf(2, 3)
int sscanf(const char *, const char *, ...);
extern __scanf(2, 0)
int vsscanf(const char *, const char *, va_list);

extern int get_option(char **str, int *pint);
extern char *get_options(const char *str, int nints, int *ints);
extern unsigned long long memparse(const char *ptr, char **retptr);
extern bool parse_option_str(const char *str, const char *option);
extern char *next_arg(char *args, char **param, char **val);

extern int core_kernel_text(unsigned long addr);
extern int init_kernel_text(unsigned long addr);
extern int core_kernel_data(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
extern int func_ptr_is_kernel_text(void *ptr);

u64 int_pow(u64 base, unsigned int exp);
unsigned long int_sqrt(unsigned long);

#if BITS_PER_LONG < 64
u32 int_sqrt64(u64 x);
#else
/*
 * Variant used when BITS_PER_LONG is not below 64: the u64 argument is handed
 * to int_sqrt(), which takes an unsigned long, and the result is narrowed
 * to u32.
 */
static inline u32 int_sqrt64(u64 x)
{
        unsigned long root = int_sqrt(x);

        return (u32)root;
}
#endif

extern void bust_spinlocks(int yes);
extern int panic_timeout;
extern unsigned long panic_print;
extern int panic_on_oops;
extern int panic_on_unrecovered_nmi;
extern int panic_on_io_nmi;
extern int panic_on_warn;
extern unsigned long panic_on_taint;
extern bool panic_on_taint_nousertaint;
extern int sysctl_panic_on_rcu_stall;
extern int sysctl_panic_on_stackoverflow;

extern bool crash_kexec_post_notifiers;

/*
 * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
 * holds a CPU number which is executing panic() currently. A value of
 * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
 */
extern atomic_t panic_cpu;
#define PANIC_CPU_INVALID        -1

/*
 * For arch init code only: install @timeout as the panic timeout, but only
 * while panic_timeout still equals @arch_default_timeout. Any other value
 * means the user overrode the default CONFIG_PANIC_TIMEOUT, and that choice
 * is kept.
 */
static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
{
        if (panic_timeout != arch_default_timeout)
                return;

        panic_timeout = timeout;
}
extern const char *print_tainted(void);
/*
 * Second argument of add_taint().
 * NOTE(review): going by the names, it tells add_taint() whether lockdep
 * results can still be trusted after the taint -- confirm against the
 * add_taint() implementation.
 */
enum lockdep_ok {
        LOCKDEP_STILL_OK,
        LOCKDEP_NOW_UNRELIABLE
};
extern void add_taint(unsigned flag, enum lockdep_ok);
extern int test_taint(unsigned flag);
extern unsigned long get_taint(void);
extern int root_mountflags;

extern bool early_boot_irqs_disabled;

/*
 * Values used for system_state. Ordering of the states must not be changed
 * as code checks for <, <=, >, >= STATE.
 *
 * This declaration both defines the enum and declares the extern variable
 * system_state that holds the current value.
 */
extern enum system_states {
        SYSTEM_BOOTING,
        SYSTEM_SCHEDULING,
        SYSTEM_RUNNING,
        SYSTEM_HALT,
        SYSTEM_POWER_OFF,
        SYSTEM_RESTART,
        SYSTEM_SUSPEND,
} system_state;

/*
 * Taint flag numbers. This cannot be an enum because some may be used in
 * assembly source.
 *
 * The values are consecutive indices starting at 0; TAINT_FLAGS_COUNT is the
 * number of flags and must stay one past the highest index.
 */
#define TAINT_PROPRIETARY_MODULE        0
#define TAINT_FORCED_MODULE                1
#define TAINT_CPU_OUT_OF_SPEC                2
#define TAINT_FORCED_RMMOD                3
#define TAINT_MACHINE_CHECK                4
#define TAINT_BAD_PAGE                        5
#define TAINT_USER                        6
#define TAINT_DIE                        7
#define TAINT_OVERRIDDEN_ACPI_TABLE        8
#define TAINT_WARN                        9
#define TAINT_CRAP                        10
#define TAINT_FIRMWARE_WORKAROUND        11
#define TAINT_OOT_MODULE                12
#define TAINT_UNSIGNED_MODULE                13
#define TAINT_SOFTLOCKUP                14
#define TAINT_LIVEPATCH                        15
#define TAINT_AUX                        16
#define TAINT_RANDSTRUCT                17
#define TAINT_FLAGS_COUNT                18
/* Mask with the lowest TAINT_FLAGS_COUNT bits set (one bit per flag). */
#define TAINT_FLAGS_MAX                        ((1UL << TAINT_FLAGS_COUNT) - 1)

/*
 * Printable description of one taint flag; taint_flags[] holds
 * TAINT_FLAGS_COUNT of these.
 */
struct taint_flag {
        char c_true;        /* character printed when tainted */
        char c_false;        /* character printed when not tainted */
        bool module;        /* also show as a per-module taint flag */
};

extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];

/*
 * hex_asc[] is a digit lookup table defined elsewhere; hex_asc_lo() indexes
 * it with the low nibble of x and hex_asc_hi() with the high nibble.
 */
extern const char hex_asc[];
#define hex_asc_lo(x)        hex_asc[((x) & 0x0f)]
#define hex_asc_hi(x)        hex_asc[((x) & 0xf0) >> 4]

/*
 * Store the two hex_asc[] characters for @byte at @buf, high nibble first.
 * No terminator is written. Returns the position just past the second
 * character, so calls can be chained.
 */
static inline char *hex_byte_pack(char *buf, u8 byte)
{
        buf[0] = hex_asc_hi(byte);
        buf[1] = hex_asc_lo(byte);

        return buf + 2;
}

/*
 * Same scheme as hex_asc[], but looking digits up in hex_asc_upper[]
 * (defined elsewhere).
 */
extern const char hex_asc_upper[];
#define hex_asc_upper_lo(x)        hex_asc_upper[((x) & 0x0f)]
#define hex_asc_upper_hi(x)        hex_asc_upper[((x) & 0xf0) >> 4]

/*
 * Store the two hex_asc_upper[] characters for @byte at @buf, high nibble
 * first. No terminator is written. Returns the position just past the second
 * character.
 */
static inline char *hex_byte_pack_upper(char *buf, u8 byte)
{
        buf[0] = hex_asc_upper_hi(byte);
        buf[1] = hex_asc_upper_lo(byte);

        return buf + 2;
}

extern int hex_to_bin(unsigned char ch);
extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
extern char *bin2hex(char *dst, const void *src, size_t count);

bool mac_pton(const char *s, u8 *mac);

/*
 * General tracing related utility functions - trace_printk(),
 * tracing_on/tracing_off and tracing_start()/tracing_stop
 *
 * Use tracing_on/tracing_off when you want to quickly turn on or off
 * tracing. It simply enables or disables the recording of the trace events.
 * This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on
 * file, which gives a means for the kernel and userspace to interact.
 * Place a tracing_off() in the kernel where you want tracing to end.
 * From user space, examine the trace, and then echo 1 > tracing_on
 * to continue tracing.
 *
 * tracing_stop/tracing_start has slightly more overhead. It is used
 * by things like suspend to ram where disabling the recording of the
 * trace is not enough, but tracing must actually stop because things
 * like calling smp_processor_id() may crash the system.
 *
 * Most likely, you want to use tracing_on/tracing_off.
 */

/*
 * Mode argument of ftrace_dump().
 * NOTE(review): the meaning of DUMP_ALL vs. DUMP_ORIG is not visible in this
 * header -- see the ftrace_dump() implementation.
 */
enum ftrace_dump_mode {
        DUMP_NONE,
        DUMP_ALL,
        DUMP_ORIG,
};

#ifdef CONFIG_TRACING
void tracing_on(void);
void tracing_off(void);
int tracing_is_on(void);
void tracing_snapshot(void);
void tracing_snapshot_alloc(void);

extern void tracing_start(void);
extern void tracing_stop(void);

/*
 * Compile-time format checking only. ____trace_printk_check_format() has an
 * empty body but carries the __printf(1, 2) attribute, so the compiler
 * checks @fmt against the arguments. The wrapper macro places the call under
 * "if (0)", so it is never executed.
 */
static inline __printf(1, 2)
void ____trace_printk_check_format(const char *fmt, ...)
{
}
#define __trace_printk_check_format(fmt, args...)                        \
do {                                                                        \
        if (0)                                                                \
                ____trace_printk_check_format(fmt, ##args);                \
} while (0)

/**
 * trace_printk - printf formatting in the ftrace buffer
 * @fmt: the printf format for printing
 *
 * Note: __trace_printk is an internal function for trace_printk() and
 *       the @ip is passed in via the trace_printk() macro.
 *
 * This function allows a kernel developer to debug fast path sections
 * that printk is not appropriate for. By scattering in various
 * printk like tracing in the code, a developer can quickly see
 * where problems are occurring.
 *
 * This is intended as a debugging tool for the developer only.
 * Please refrain from leaving trace_printks scattered around in
 * your code. (Extra memory is used for special buffers that are
 * allocated when trace_printk() is used.)
 *
 * A little optimization trick is done here. If there's only one
 * argument, there's no need to scan the string for printf formats.
 * The trace_puts() will suffice. But how can we take advantage of
 * using trace_puts() when trace_printk() has only one argument?
 * By stringifying the args and checking the size we can tell
 * whether or not there are args. __stringify((__VA_ARGS__)) will
 * turn into "()\0" with a size of 3 when there are no args, anything
 * else will be bigger. All we need to do is define a string to this,
 * and then take its size and compare to 3. If it's bigger, use
 * do_trace_printk() otherwise, optimize it to trace_puts(). Then just
 * let gcc optimize the rest.
 */

#define trace_printk(fmt, ...)                                \
do {                                                        \
        char _______STR[] = __stringify((__VA_ARGS__));        \
        if (sizeof(_______STR) > 3)                        \
                do_trace_printk(fmt, ##__VA_ARGS__);        \
        else                                                \
                trace_puts(fmt);                        \
} while (0)

/*
 * Back end of trace_printk() for calls that have arguments.
 *
 * A static pointer placed in the "__trace_printk_fmt" section records @fmt
 * when it is a compile-time constant (NULL otherwise). The format is
 * type-checked via __trace_printk_check_format(). A constant @fmt is then
 * passed through that static pointer to __trace_bprintk(); a non-constant
 * @fmt goes to __trace_printk() directly. Both receive _THIS_IP_ as the
 * caller address.
 */
#define do_trace_printk(fmt, args...)                                        \
do {                                                                        \
        static const char *trace_printk_fmt __used                        \
                __section("__trace_printk_fmt") =                        \
                __builtin_constant_p(fmt) ? fmt : NULL;                        \
                                                                        \
        __trace_printk_check_format(fmt, ##args);                        \
                                                                        \
        if (__builtin_constant_p(fmt))                                        \
                __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args);        \
        else                                                                \
                __trace_printk(_THIS_IP_, fmt, ##args);                        \
} while (0)

extern __printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...);

extern __printf(2, 3)
int __trace_printk(unsigned long ip, const char *fmt, ...);

/**
 * trace_puts - write a string into the ftrace buffer
 * @str: the string to record
 *
 * Note: __trace_bputs is an internal function for trace_puts and
 *       the @ip is passed in via the trace_puts macro.
 *
 * This is similar to trace_printk() but is made for those really fast
 * paths that a developer wants the least amount of "Heisenbug" effects,
 * where the processing of the print format is still too much.
 *
 * This function allows a kernel developer to debug fast path sections
 * that printk is not appropriate for. By scattering in various
 * printk like tracing in the code, a developer can quickly see
 * where problems are occurring.
 *
 * This is intended as a debugging tool for the developer only.
 * Please refrain from leaving trace_puts scattered around in
 * your code. (Extra memory is used for special buffers that are
 * allocated when trace_puts() is used.)
 *
 * Returns: 0 if nothing was written, positive # if string was.
 *  (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
 */

/*
 * A constant @str is recorded in the "__trace_printk_fmt" section and passed
 * to __trace_bputs(); a non-constant @str goes to __trace_puts() together
 * with its strlen().
 *
 * NOTE(review): the statement expression ends in an if/else statement rather
 * than an expression, so as written the macro does not yield the value
 * returned by __trace_bputs()/__trace_puts() -- confirm before relying on
 * the "Returns:" text above.
 */
#define trace_puts(str) ({                                                \
        static const char *trace_printk_fmt __used                        \
                __section("__trace_printk_fmt") =                        \
                __builtin_constant_p(str) ? str : NULL;                        \
                                                                        \
        if (__builtin_constant_p(str))                                        \
                __trace_bputs(_THIS_IP_, trace_printk_fmt);                \
        else                                                                \
                __trace_puts(_THIS_IP_, str, strlen(str));                \
})
extern int __trace_bputs(unsigned long ip, const char *str);
extern int __trace_puts(unsigned long ip, const char *str, int size);

extern void trace_dump_stack(int skip);

/*
 * va_list flavour of the trace print helpers: a constant @fmt is recorded in
 * the "__trace_printk_fmt" section and passed to __ftrace_vbprintk(), any
 * other @fmt goes to __ftrace_vprintk().
 *
 * __builtin_constant_p() is deliberately used twice. gcc reports an error
 * when the static variable is initialized from a @fmt that is not a
 * constant, even though the initializer already sits inside the outer
 * "if (__builtin_constant_p(fmt))"; the inner test makes the initializer
 * NULL in that case.
 */
#define ftrace_vprintk(fmt, vargs)                                        \
do {                                                                        \
        if (__builtin_constant_p(fmt)) {                                \
                static const char *trace_printk_fmt __used                \
                  __section("__trace_printk_fmt") =                        \
                        __builtin_constant_p(fmt) ? fmt : NULL;                \
                                                                        \
                __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs);        \
        } else                                                                \
                __ftrace_vprintk(_THIS_IP_, fmt, vargs);                \
} while (0)

extern __printf(2, 0) int
__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);

extern __printf(2, 0) int
__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);

extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
#else
/*
 * CONFIG_TRACING is not set: every tracing entry point becomes an empty
 * inline function. The ones that return a value return 0.
 */
static inline void tracing_start(void) { }
static inline void tracing_stop(void) { }
static inline void trace_dump_stack(int skip) { }

static inline void tracing_on(void) { }
static inline void tracing_off(void) { }
static inline int tracing_is_on(void) { return 0; }
static inline void tracing_snapshot(void) { }
static inline void tracing_snapshot_alloc(void) { }

/* Keeps the __printf attribute so format strings are still type-checked. */
static inline __printf(1, 2)
int trace_printk(const char *fmt, ...)
{
        return 0;
}
static __printf(1, 0) inline int
ftrace_vprintk(const char *fmt, va_list ap)
{
        return 0;
}
static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#endif /* CONFIG_TRACING */

/*
 * COUNT_ARGS() expands to the number of arguments it was given, from 0 up to
 * 12. It works by appending the list 12, 11, ..., 0 after the arguments and
 * selecting the value that lands in the fixed _n slot of __COUNT_ARGS().
 * With more than 12 arguments the result is not a count: the 13th argument
 * itself lands in the _n slot.
 */
#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)

/*
 * Token pasting in two steps: going through __CONCAT() lets the arguments of
 * CONCATENATE() be macro-expanded before they are pasted together.
 */
#define __CONCAT(a, b) a ## b
#define CONCATENATE(a, b) __CONCAT(a, b)

/**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:        the pointer to the member.
 * @type:        the type of the container struct this is embedded in.
 * @member:        the name of the member within the struct.
 *
 * The build fails unless *@ptr has the same type as @type's @member or is
 * void. The result is @ptr moved back by offsetof(@type, @member) bytes and
 * cast to a pointer to @type. @ptr is converted through (void *), so any
 * qualifiers on it are dropped.
 */
#define container_of(ptr, type, member) ({                                \
        void *__mptr = (void *)(ptr);                                        \
        BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&        \
                         !__same_type(*(ptr), void),                        \
                         "pointer type mismatch in container_of()");        \
        ((type *)(__mptr - offsetof(type, member))); })

/**
 * container_of_safe - cast a member of a structure out to the containing structure
 * @ptr:        the pointer to the member.
 * @type:        the type of the container struct this is embedded in.
 * @member:        the name of the member within the struct.
 *
 * Like container_of(), except that a NULL or error-valued @ptr is not
 * adjusted: if IS_ERR_OR_NULL(ptr), ptr is returned unchanged (via
 * ERR_CAST()). The build fails unless *@ptr has the same type as @type's
 * @member or is void.
 */
#define container_of_safe(ptr, type, member) ({                                \
        void *__mptr = (void *)(ptr);                                        \
        BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&        \
                         !__same_type(*(ptr), void),                        \
                         "pointer type mismatch in container_of_safe()");        \
        IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) :                        \
                ((type *)(__mptr - offsetof(type, member))); })

/*
 * Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD.
 *
 * The macro has no value and is defined only when the option is set.
 * NOTE(review): presumably the reference to the config symbol in this widely
 * included header is what makes the build's dependency tracking rebuild
 * everything when the option changes -- confirm against the build system.
 */
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
#endif

/*
 * Permissions on a sysfs file: you didn't miss the 0 prefix did you?
 *
 * Build-time sanity checks on a permission constant. The build breaks when
 * @perms is negative, exceeds 0777, gives group or other a read bit that a
 * more privileged class lacks, gives group a write bit the owner lacks, or
 * is writable by other. The BUILD_BUG_ON_ZERO() terms are summed with
 * (perms); presumably each contributes 0 when its check passes, so the
 * expression evaluates to @perms.
 */
#define VERIFY_OCTAL_PERMISSIONS(perms)                                                \
        (BUILD_BUG_ON_ZERO((perms) < 0) +                                        \
         BUILD_BUG_ON_ZERO((perms) > 0777) +                                        \
         /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */                \
         BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) +        \
         BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) +                \
         /* USER_WRITABLE >= GROUP_WRITABLE */                                        \
         BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) +        \
         /* OTHER_WRITABLE?  Generally considered a bad idea. */                \
         BUILD_BUG_ON_ZERO((perms) & 2) +                                        \
         (perms))
#endif










































































































































































































































































































































































































































    3 

















































    3 











































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/revoke.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
 *
 * Copyright 2000 Red Hat corp --- All Rights Reserved
 *
 * Journal revoke routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 *
 * Revoke is the mechanism used to prevent old log records for deleted
 * metadata from being replayed on top of newer data using the same
 * blocks.  The revoke mechanism is used in two separate places:
 *
 * + Commit: during commit we write the entire list of the current
 *   transaction's revoked blocks to the journal
 *
 * + Recovery: during recovery we record the transaction ID of all
 *   revoked blocks.  If there are multiple revoke records in the log
 *   for a single block, only the last one counts, and if there is a log
 *   entry for a block beyond the last revoke, then that log entry still
 *   gets replayed.
 *
 * We can get interactions between revokes and new log data within a
 * single transaction:
 *
 * Block is revoked and then journaled:
 *   The desired end result is the journaling of the new block, so we
 *   cancel the revoke before the transaction commits.
 *
 * Block is journaled and then revoked:
 *   The revoke must take precedence over the write of the block, so we
 *   need either to cancel the journal entry or to write the revoke
 *   later in the log than the log block.  In this case, we choose the
 *   latter: journaling a block cancels any revoke record for that block
 *   in the current transaction, so any revoke for that block in the
 *   transaction must have happened after the block was journaled and so
 *   the revoke must take precedence.
 *
 * Block is revoked and then written as data:
 *   The data write is allowed to succeed, but the revoke is _not_
 *   cancelled.  We still need to prevent old log records from
 *   overwriting the new data.  We don't even need to clear the revoke
 *   bit here.
 *
 * We cache revoke status of a buffer in the current transaction in b_states
 * bits.  As the name says, revokevalid flag indicates that the cached revoke
 * status of a buffer is valid and we can rely on the cached status.
 *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:        no cached revoke status, need to look it up
 * RevokeValid set, Revoked clear:
 *                        buffer has not been revoked, and cancel_revoke
 *                        need do nothing.
 * RevokeValid set, Revoked set:
 *                        buffer has been revoked.
 *
 * Locking rules:
 * We keep two hash tables of revoke records. One hashtable belongs to the
 * running transaction (is pointed to by journal->j_revoke), the other one
 * belongs to the committing transaction. Accesses to the second hash table
 * happen only from the kjournald and no other thread touches this table.  Also
 * journal_switch_revoke_table() which switches which hashtable belongs to the
 * running and which to the committing transaction is called only from
 * kjournald. Therefore we need no locks when accessing the hashtable belonging
 * to the committing transaction.
 *
 * All users operating on the hash table belonging to the running transaction
 * have a handle to the transaction. Therefore they are safe from kjournald
 * switching hash tables under them. For operations on the lists of entries in
 * the hash table j_revoke_lock is used.
 *
 * Finally, the replay code also uses the hash tables, but at that point no one
 * else can touch them (the filesystem isn't mounted yet), so no locking is
 * needed.
 */

#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/log2.h>
#include <linux/hash.h>
#endif

/* Slab cache backing struct jbd2_revoke_record_s allocations. */
static struct kmem_cache *jbd2_revoke_record_cache;
/* Slab cache backing struct jbd2_revoke_table_s allocations. */
static struct kmem_cache *jbd2_revoke_table_cache;

/* Each revoke record represents one single revoked block.  During
   journal replay, this involves recording the transaction ID of the
   last transaction to revoke this block.

   The list linkage must remain the first member: find_revoke_record()
   casts a struct list_head pointer directly to a record pointer. */

struct jbd2_revoke_record_s
{
        struct list_head  hash;                /* Link in a hash bucket list */
        tid_t                  sequence;        /* Used for recovery only */
        unsigned long long          blocknr;        /* Number of the revoked block */
};


/* The revoke table is just a simple hash table of revoke records. */
struct jbd2_revoke_table_s
{
        /* It is conceivable that we might want a larger hash table
         * for recovery.  Must be a power of two. */
        int                  hash_size;        /* Number of buckets */
        int                  hash_shift;        /* Bit count passed to hash_64() */
        struct list_head *hash_table;        /* Array of hash_size list heads */
};


#ifdef __KERNEL__
static void write_one_revoke_record(transaction_t *,
                                    struct list_head *,
                                    struct buffer_head **, int *,
                                    struct jbd2_revoke_record_s *);
static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif

/* Utility functions to maintain the revoke table */

/*
 * Map a block number to a bucket index of the journal's current revoke
 * table (journal->j_revoke), using hash_64() with that table's hash_shift.
 */
static inline int hash(journal_t *journal, unsigned long long block)
{
        int bits = journal->j_revoke->hash_shift;

        return hash_64(block, bits);
}

/*
 * Allocate a revoke record for @blocknr, tagged with sequence @seq, and add
 * it at the head of its bucket in journal->j_revoke. The list insertion is
 * done under j_revoke_lock.
 *
 * The allocation uses GFP_NOFS, plus __GFP_NOFAIL when journal_oom_retry is
 * set. Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
                              tid_t seq)
{
        gfp_t gfp_mask = journal_oom_retry ? (GFP_NOFS | __GFP_NOFAIL)
                                           : GFP_NOFS;
        struct jbd2_revoke_record_s *record;
        struct list_head *bucket;

        record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
        if (record == NULL)
                return -ENOMEM;

        record->blocknr = blocknr;
        record->sequence = seq;

        bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];
        spin_lock(&journal->j_revoke_lock);
        list_add(&record->hash, bucket);
        spin_unlock(&journal->j_revoke_lock);

        return 0;
}

/*
 * Find a revoke record in the journal's hash table.
 *
 * Walks the bucket selected by hash(@blocknr) in journal->j_revoke while
 * holding j_revoke_lock and returns the first record whose blocknr matches,
 * or NULL if there is none. The lock is dropped before returning.
 */

static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
                                                      unsigned long long blocknr)
{
        struct jbd2_revoke_record_s *found = NULL;
        struct list_head *bucket;
        struct list_head *pos;

        bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];

        spin_lock(&journal->j_revoke_lock);
        for (pos = bucket->next; pos != bucket; pos = pos->next) {
                /* The list linkage is the record's first member. */
                struct jbd2_revoke_record_s *record =
                        (struct jbd2_revoke_record_s *) pos;

                if (record->blocknr == blocknr) {
                        found = record;
                        break;
                }
        }
        spin_unlock(&journal->j_revoke_lock);

        return found;
}

/*
 * Destroy the revoke record slab cache and clear the pointer, so that
 * jbd2_journal_init_revoke_record_cache() can be called again without
 * tripping its J_ASSERT.
 */
void jbd2_journal_destroy_revoke_record_cache(void)
{
        kmem_cache_destroy(jbd2_revoke_record_cache);
        jbd2_revoke_record_cache = NULL;
}

/*
 * Destroy the revoke table slab cache and clear the pointer, so that
 * jbd2_journal_init_revoke_table_cache() can be called again without
 * tripping its J_ASSERT.
 */
void jbd2_journal_destroy_revoke_table_cache(void)
{
        kmem_cache_destroy(jbd2_revoke_table_cache);
        jbd2_revoke_table_cache = NULL;
}

/*
 * Create the slab cache for revoke records. The cache must not exist yet
 * (asserted). Returns 0 on success; on failure logs an emergency message
 * and returns -ENOMEM.
 */
int __init jbd2_journal_init_revoke_record_cache(void)
{
        J_ASSERT(!jbd2_revoke_record_cache);

        jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
                                        SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
        if (jbd2_revoke_record_cache)
                return 0;

        pr_emerg("JBD2: failed to create revoke_record cache\n");
        return -ENOMEM;
}

/*
 * Create the slab cache for revoke tables. The cache must not exist yet
 * (asserted). Returns 0 on success; on failure logs an emergency message
 * and returns -ENOMEM.
 */
int __init jbd2_journal_init_revoke_table_cache(void)
{
        J_ASSERT(!jbd2_revoke_table_cache);

        jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
                                             SLAB_TEMPORARY);
        if (jbd2_revoke_table_cache)
                return 0;

        pr_emerg("JBD2: failed to create revoke_table cache\n");
        return -ENOMEM;
}

/*
 * Allocate one revoke table with @hash_size buckets.
 *
 * The table header comes from jbd2_revoke_table_cache and the bucket array
 * from kmalloc_array(); every bucket is initialised to an empty list.
 * hash_shift is set to the number of times @hash_size can be shifted right
 * by one before reaching zero, minus one (log2 for a power of two).
 *
 * Returns the new table, or NULL if either allocation fails.
 */
static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
        struct jbd2_revoke_table_s *table;
        int log2_size = 0;
        int n;

        table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
        if (!table)
                return NULL;

        for (n = hash_size >> 1; n != 0; n >>= 1)
                log2_size++;

        table->hash_size = hash_size;
        table->hash_shift = log2_size;
        table->hash_table = kmalloc_array(hash_size, sizeof(struct list_head),
                                          GFP_KERNEL);
        if (!table->hash_table) {
                /* Bucket array failed: give the header back as well. */
                kmem_cache_free(jbd2_revoke_table_cache, table);
                return NULL;
        }

        for (n = 0; n < hash_size; n++)
                INIT_LIST_HEAD(&table->hash_table[n]);

        return table;
}

/*
 * Free a revoke table: the bucket array first, then the table header.
 * Every bucket is asserted to be empty beforehand.
 */
static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
        int bucket;

        for (bucket = 0; bucket < table->hash_size; bucket++)
                J_ASSERT(list_empty(&table->hash_table[bucket]));

        kfree(table->hash_table);
        kmem_cache_free(jbd2_revoke_table_cache, table);
}

/*
 * Initialise the revoke tables for a given journal to a given size.
 *
 * Two tables of @hash_size buckets each are allocated; j_revoke is pointed
 * at table 1 and the revoke lock is initialised.  @hash_size is asserted
 * to be a power of two, and table 0 is asserted to be unset on entry.
 *
 * Returns 0 on success or -ENOMEM if either table cannot be allocated, in
 * which case no table is left attached at j_revoke_table[0].
 */
int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
{
        J_ASSERT(journal->j_revoke_table[0] == NULL);
        J_ASSERT(is_power_of_2(hash_size));

        journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
        if (!journal->j_revoke_table[0])
                return -ENOMEM;

        journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
        if (!journal->j_revoke_table[1]) {
                /* Undo the first allocation before reporting failure. */
                jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
                journal->j_revoke_table[0] = NULL;
                return -ENOMEM;
        }

        journal->j_revoke = journal->j_revoke_table[1];
        spin_lock_init(&journal->j_revoke_lock);

        return 0;
}

/*
 * Destroy a journal's revoke tables.  The tables must already be empty!
 *
 * j_revoke is cleared first; each of the two table slots is then destroyed
 * if it is set.
 */
void jbd2_journal_destroy_revoke(journal_t *journal)
{
        struct jbd2_revoke_table_s *table;

        journal->j_revoke = NULL;

        table = journal->j_revoke_table[0];
        if (table)
                jbd2_journal_destroy_revoke_table(table);

        table = journal->j_revoke_table[1];
        if (table)
                jbd2_journal_destroy_revoke_table(table);
}


#ifdef __KERNEL__

/*
 * jbd2_journal_revoke: revoke a given buffer_head from the journal.  This
 * prevents the block from being replayed during recovery if we take a
 * crash after this current transaction commits.  Any subsequent
 * metadata writes of the buffer in this transaction cancel the
 * revoke.
 *
 * Note that this call may block --- it is up to the caller to make
 * sure that there are no further calls to journal_write_metadata
 * before the revoke is complete.  In ext3, this implies calling the
 * revoke before clearing the block bitmap when we are deleting
 * metadata.
 *
 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
 * parameter, but does _not_ forget the buffer_head if the bh was only
 * found implicitly.
 *
 * bh_in may not be a journalled buffer - it may have come off
 * the hash tables without an attached journal_head.
 *
 * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
 * by one.
 */

int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
                   struct buffer_head *bh_in)
{
        struct buffer_head *bh = NULL;
        journal_t *journal;
        struct block_device *bdev;
        int err;

        might_sleep();
        if (bh_in)
                BUFFER_TRACE(bh_in, "enter");

        journal = handle->h_transaction->t_journal;
        /* The journal must carry the REVOKE incompat feature; fail with
         * -EINVAL if it cannot be set. */
        if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
                J_ASSERT (!"Cannot set revoke feature!");
                return -EINVAL;
        }

        bdev = journal->j_fs_dev;
        bh = bh_in;

        /* No buffer passed in: look one up for this block.  A buffer found
         * this way holds a reference taken here, which every exit path
         * below releases again (the "!bh_in" cases). */
        if (!bh) {
                bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
                if (bh)
                        BUFFER_TRACE(bh, "found on hash");
        }
#ifdef JBD2_EXPENSIVE_CHECKING
        else {
                struct buffer_head *bh2;

                /* If there is a different buffer_head lying around in
                 * memory anywhere... */
                bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
                if (bh2) {
                        /* ... and it has RevokeValid status... */
                        if (bh2 != bh && buffer_revokevalid(bh2))
                                /* ...then it better be revoked too,
                                 * since it's illegal to create a revoke
                                 * record against a buffer_head which is
                                 * not marked revoked --- that would
                                 * risk missing a subsequent revoke
                                 * cancel. */
                                J_ASSERT_BH(bh2, buffer_revoked(bh2));
                        put_bh(bh2);
                }
        }
#endif

        /* The handle has no revoke credits left: warn once and fail with
         * -EIO, releasing only a buffer we looked up ourselves. */
        if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
                if (!bh_in)
                        brelse(bh);
                return -EIO;
        }
        /* We really ought not ever to revoke twice in a row without
           first having the revoke cancelled: it's illegal to free a
           block twice without allocating it in between! */
        if (bh) {
                if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
                                 "inconsistent data on disk")) {
                        if (!bh_in)
                                brelse(bh);
                        return -EIO;
                }
                set_buffer_revoked(bh);
                set_buffer_revokevalid(bh);
                /* A caller-supplied buffer is handed to
                 * jbd2_journal_forget(); one found implicitly is only
                 * released. */
                if (bh_in) {
                        BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
                        jbd2_journal_forget(handle, bh_in);
                } else {
                        BUFFER_TRACE(bh, "call brelse");
                        __brelse(bh);
                }
        }
        /* Consume one revoke credit for the record inserted below. */
        handle->h_revoke_credits--;

        jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
        /* Record the revoke against the handle's transaction id; the
         * result of the insertion is this function's return value. */
        err = insert_revoke_hash(journal, blocknr,
                                handle->h_transaction->t_tid);
        BUFFER_TRACE(bh_in, "exit");
        return err;
}

/*
 * Cancel an outstanding revoke.  For use only internally by the
 * journaling code (called from jbd2_journal_get_write_access).
 *
 * We trust buffer_revoked() on the buffer if the buffer is already
 * being journaled: if there is no revoke pending on the buffer, then we
 * don't do anything here.
 *
 * This would break if it were possible for a buffer to be revoked and
 * discarded, and then reallocated within the same transaction.  In such
 * a case we would have lost the revoked bit, but when we arrived here
 * the second time we would still have a pending revoke to cancel.  So,
 * do not trust the Revoked bit on buffers unless RevokeValid is also
 * set.
 */
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
        struct jbd2_revoke_record_s *record;
        journal_t *journal = handle->h_transaction->t_journal;
        int need_cancel;
        int did_revoke = 0;        /* akpm: debug */
        struct buffer_head *bh = jh2bh(jh);

        jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);

        /* Is the existing Revoke bit valid?  If so, we trust it, and
         * only perform the full cancel if the revoke bit is set.  If
         * not, we can't trust the revoke bit, and we need to do the
         * full search for a revoke record. */
        if (test_set_buffer_revokevalid(bh)) {
                need_cancel = test_clear_buffer_revoked(bh);
        } else {
                need_cancel = 1;
                clear_buffer_revoked(bh);
        }

        if (need_cancel) {
                /* Look the record up, then unlink it under the revoke lock
                 * and free it.  Note the lookup takes and drops the lock
                 * itself, so lookup and unlink are two separate critical
                 * sections. */
                record = find_revoke_record(journal, bh->b_blocknr);
                if (record) {
                        jbd_debug(4, "cancelled existing revoke on "
                                  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
                        spin_lock(&journal->j_revoke_lock);
                        list_del(&record->hash);
                        spin_unlock(&journal->j_revoke_lock);
                        kmem_cache_free(jbd2_revoke_record_cache, record);
                        did_revoke = 1;
                }
        }

#ifdef JBD2_EXPENSIVE_CHECKING
        /* There better not be one left behind by now! */
        record = find_revoke_record(journal, bh->b_blocknr);
        J_ASSERT_JH(jh, record == NULL);
#endif

        /* Finally, have we just cleared revoke on an unhashed
         * buffer_head?  If so, we'd better make sure we clear the
         * revoked status on any hashed alias too, otherwise the revoke
         * state machine will get very upset later on. */
        if (need_cancel) {
                struct buffer_head *bh2;
                bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
                if (bh2) {
                        if (bh2 != bh)
                                clear_buffer_revoked(bh2);
                        __brelse(bh2);
                }
        }
        /* 1 if a revoke record was found and removed, 0 otherwise. */
        return did_revoke;
}

/*
 * jbd2_clear_buffer_revoked_flags clears the revoked flag on the buffers
 * of all blocks recorded in the journal's current revoke table, to reflect
 * that there are no revoked buffers in the next transaction, which is
 * about to be started.
 *
 * For each record in journal->j_revoke the matching buffer_head is looked
 * up on the filesystem device; if one is found its revoked bit is cleared
 * and the reference taken by the lookup is dropped.  Records themselves
 * are left in the table.
 */
void jbd2_clear_buffer_revoked_flags(journal_t *journal)
{
        struct jbd2_revoke_table_s *revoke = journal->j_revoke;
        int i;

        for (i = 0; i < revoke->hash_size; i++) {
                struct jbd2_revoke_record_s *record;

                /*
                 * Iterate via the 'hash' member instead of casting the
                 * list_head pointer, which would only be valid while
                 * 'hash' is the first member of the record.
                 */
                list_for_each_entry(record, &revoke->hash_table[i], hash) {
                        struct buffer_head *bh;

                        bh = __find_get_block(journal->j_fs_dev,
                                              record->blocknr,
                                              journal->j_blocksize);
                        if (bh) {
                                clear_buffer_revoked(bh);
                                __brelse(bh);
                        }
                }
        }
}

/*
 * jbd2_journal_switch_revoke_table selects j_revoke for the next
 * transaction: j_revoke is flipped to whichever of the two tables it is
 * not currently pointing at, and every bucket of the newly selected table
 * is re-initialised as an empty list.
 *
 * We do not want to suspend any processing until all revokes are
 * written -bzzz
 */
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
        struct jbd2_revoke_table_s *next;
        int bucket;

        next = (journal->j_revoke == journal->j_revoke_table[0]) ?
                journal->j_revoke_table[1] : journal->j_revoke_table[0];
        journal->j_revoke = next;

        for (bucket = 0; bucket < next->hash_size; bucket++)
                INIT_LIST_HEAD(&next->hash_table[bucket]);
}

/*
 * Write revoke records to the journal for all entries in the committing
 * transaction's revoke hash, deleting the entries as we go.
 *
 * The table processed is the one j_revoke is NOT currently pointing at.
 * Each record is passed to write_one_revoke_record(), then unlinked and
 * freed; a descriptor still pending once the table has been drained is
 * flushed.  @log_bufs is handed through to write_one_revoke_record().
 */
void jbd2_journal_write_revoke_records(transaction_t *transaction,
                                       struct list_head *log_bufs)
{
        journal_t *journal = transaction->t_journal;
        struct buffer_head *descriptor;
        struct jbd2_revoke_record_s *record;
        struct jbd2_revoke_table_s *revoke;
        struct list_head *hash_list;
        int i, offset, count;

        descriptor = NULL;
        offset = 0;
        count = 0;

        /* select revoke table for committing transaction */
        revoke = journal->j_revoke == journal->j_revoke_table[0] ?
                journal->j_revoke_table[1] : journal->j_revoke_table[0];

        for (i = 0; i < revoke->hash_size; i++) {
                hash_list = &revoke->hash_table[i];

                while (!list_empty(hash_list)) {
                        /*
                         * list_first_entry() resolves the record through
                         * its 'hash' member, so this does not depend on
                         * 'hash' sitting at offset zero as a raw cast of
                         * hash_list->next would.
                         */
                        record = list_first_entry(hash_list,
                                        struct jbd2_revoke_record_s, hash);
                        write_one_revoke_record(transaction, log_bufs,
                                                &descriptor, &offset, record);
                        count++;
                        list_del(&record->hash);
                        kmem_cache_free(jbd2_revoke_record_cache, record);
                }
        }
        if (descriptor)
                flush_descriptor(journal, descriptor, offset);
        jbd_debug(1, "Wrote %d revoke records\n", count);
}

/*
 * Write out one revoke record.  A new descriptor block is obtained when
 * none exists yet or when the current one has no room left for another
 * entry (a full one is flushed first).
 *
 * @descriptorp and @offsetp carry the current descriptor buffer and the
 * write offset within it between calls; both are updated here.  An entry
 * is 8 bytes when the journal has the 64bit feature and 4 bytes otherwise,
 * stored big-endian.
 */

static void write_one_revoke_record(transaction_t *transaction,
                                    struct list_head *log_bufs,
                                    struct buffer_head **descriptorp,
                                    int *offsetp,
                                    struct jbd2_revoke_record_s *record)
{
        journal_t *journal = transaction->t_journal;
        struct buffer_head *descriptor;
        int csum_size = 0;
        int sz, offset;

        /* Once the journal is aborting no IO is issued here.  The caller,
           jbd2_journal_write_revoke_records, still walks every record so
           that it can free them. */
        if (is_journal_aborted(journal))
                return;

        descriptor = *descriptorp;
        offset = *offsetp;

        /* Reserve room at the end of the block for a checksum tail? */
        if (jbd2_journal_has_csum_v2or3(journal))
                csum_size = sizeof(struct jbd2_journal_block_tail);

        sz = jbd2_has_feature_64bit(journal) ? 8 : 4;

        /* A descriptor without room for one more entry gets flushed */
        if (descriptor && offset + sz > journal->j_blocksize - csum_size) {
                flush_descriptor(journal, descriptor, offset);
                descriptor = NULL;
        }

        if (!descriptor) {
                descriptor = jbd2_journal_get_descriptor_buffer(transaction,
                                                        JBD2_REVOKE_BLOCK);
                if (!descriptor)
                        return;

                /* Keep track of it so IO completion can be waited on later */
                BUFFER_TRACE(descriptor, "file in log_bufs");
                jbd2_file_log_bh(log_bufs, descriptor);

                offset = sizeof(jbd2_journal_revoke_header_t);
                *descriptorp = descriptor;
        }

        if (jbd2_has_feature_64bit(journal))
                *(__be64 *)&descriptor->b_data[offset] =
                        cpu_to_be64(record->blocknr);
        else
                *(__be32 *)&descriptor->b_data[offset] =
                        cpu_to_be32(record->blocknr);

        *offsetp = offset + sz;
}

/*
 * Flush a revoke descriptor out to the journal.  When the journal is
 * aborting nothing is done.  Otherwise @offset is stored big-endian in the
 * header's r_count, the descriptor block checksum is set, and the buffer
 * is flagged jwrite, dirtied and submitted with REQ_SYNC.  The buffer must
 * be waited for during commit, so it has to be on the appropriate journal
 * buffer list.
 */

static void flush_descriptor(journal_t *journal,
                             struct buffer_head *descriptor,
                             int offset)
{
        jbd2_journal_revoke_header_t *header =
                (jbd2_journal_revoke_header_t *)descriptor->b_data;

        if (!is_journal_aborted(journal)) {
                header->r_count = cpu_to_be32(offset);
                jbd2_descriptor_block_csum_set(journal, descriptor);

                set_buffer_jwrite(descriptor);
                BUFFER_TRACE(descriptor, "write");
                set_buffer_dirty(descriptor);
                write_dirty_buffer(descriptor, REQ_SYNC);
        }
}
#endif

/*
 * Revoke support for recovery.
 *
 * Recovery needs to be able to:
 *
 *  record all revoke records, including the tid of the latest instance
 *  of each revoke in the journal
 *
 *  check whether a given block in a given transaction should be replayed
 *  (ie. has not been revoked by a revoke record in that or a subsequent
 *  transaction)
 *
 *  empty the revoke table after recovery.
 */

/*
 * First, setting revoke records.  While scanning the log for recovery a
 * revoke record is created for every block ever revoked; when a block is
 * revoked more than once the existing record is kept and only its
 * sequence number is advanced to the latest one seen.
 *
 * Returns 0 when an existing record was found, otherwise the result of
 * insert_revoke_hash().
 */

int jbd2_journal_set_revoke(journal_t *journal,
                       unsigned long long blocknr,
                       tid_t sequence)
{
        struct jbd2_revoke_record_s *record;

        record = find_revoke_record(journal, blocknr);
        if (!record)
                return insert_revoke_hash(journal, blocknr, sequence);

        /* Already known: remember only the newest sequence number. */
        if (tid_gt(sequence, record->sequence))
                record->sequence = sequence;
        return 0;
}

/*
 * Test revoke records.  For a given block referenced in the log, has
 * that block been revoked?  A revoke record with a given transaction
 * sequence number revokes the block in that transaction and all earlier
 * ones; later transactions still need to be replayed.
 *
 * Returns 1 if a record exists for @blocknr and @sequence is not newer
 * than the record's sequence, 0 otherwise.
 */

int jbd2_journal_test_revoke(journal_t *journal,
                        unsigned long long blocknr,
                        tid_t sequence)
{
        struct jbd2_revoke_record_s *record;

        record = find_revoke_record(journal, blocknr);
        return record && !tid_gt(sequence, record->sequence);
}

/*
 * Finally, once recovery is over, we need to clear the revoke table so
 * that it can be reused by the running filesystem.
 *
 * Every record in every bucket of the journal's current revoke table
 * (journal->j_revoke) is unlinked and returned to the record cache.
 */

void jbd2_journal_clear_revoke(journal_t *journal)
{
        int i;
        struct jbd2_revoke_record_s *record, *next;
        struct jbd2_revoke_table_s *revoke;

        revoke = journal->j_revoke;

        for (i = 0; i < revoke->hash_size; i++) {
                /*
                 * The _safe iterator allows freeing the current entry, and
                 * it locates records through their 'hash' member rather
                 * than by casting the list_head pointer.
                 */
                list_for_each_entry_safe(record, next,
                                         &revoke->hash_table[i], hash) {
                        list_del(&record->hash);
                        kmem_cache_free(jbd2_revoke_record_cache, record);
                }
        }
}































































































































































































































































    2 





    2 

















    2 







    2 







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to io context handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#include "blk.h"

/*
 * For io context allocations
 */
static struct kmem_cache *iocontext_cachep;

/**
 * get_io_context - increment reference count to io_context
 * @ioc: io_context to get
 *
 * Increment reference count to @ioc.  The count must already be positive
 * on entry; a zero or negative count triggers BUG_ON().
 */
void get_io_context(struct io_context *ioc)
{
        BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
        atomic_long_inc(&ioc->refcount);
}

/*
 * RCU callback queued by ioc_destroy_icq(): recover the io_cq from its
 * embedded rcu_head and free it to the slab cache that was recorded in
 * icq->__rcu_icq_cache.
 */
static void icq_free_icq_rcu(struct rcu_head *head)
{
        struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);

        kmem_cache_free(icq->__rcu_icq_cache, icq);
}

/*
 * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
 * and queue locked for legacy.
 *
 * The elevator's exit_icq hook, if it has one, is invoked at most once per
 * icq: ICQ_EXITED is set afterwards and later calls return without doing
 * anything.
 */
static void ioc_exit_icq(struct io_cq *icq)
{
        struct elevator_type *et = icq->q->elevator->type;

        if (!(icq->flags & ICQ_EXITED)) {
                if (et->ops.exit_icq)
                        et->ops.exit_icq(icq);

                icq->flags |= ICQ_EXITED;
        }
}

/*
 * Release an icq. Called with ioc locked for blk-mq, and with both ioc
 * and queue locked for legacy.
 *
 * The icq is removed from the ioc's radix tree and icq list and from the
 * queue's icq list, the ioc's lookup hint is cleared if it points at this
 * icq, the icq is exited, and finally it is marked ICQ_DESTROYED and handed
 * to call_rcu() for freeing.
 */
static void ioc_destroy_icq(struct io_cq *icq)
{
        struct io_context *ioc = icq->ioc;
        struct request_queue *q = icq->q;
        struct elevator_type *et = q->elevator->type;

        lockdep_assert_held(&ioc->lock);

        /* Unlink from every structure through which the icq can be found. */
        radix_tree_delete(&ioc->icq_tree, icq->q->id);
        hlist_del_init(&icq->ioc_node);
        list_del_init(&icq->q_node);

        /*
         * Both setting lookup hint to and clearing it from @icq are done
         * under queue_lock.  If it's not pointing to @icq now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(ioc->icq_hint) == icq)
                rcu_assign_pointer(ioc->icq_hint, NULL);

        ioc_exit_icq(icq);

        /*
         * @icq->q might have gone away by the time RCU callback runs
         * making it impossible to determine icq_cache.  Record it in @icq.
         */
        icq->__rcu_icq_cache = et->icq_cache;
        icq->flags |= ICQ_DESTROYED;
        call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}

/*
 * Slow path for ioc release in put_io_context().  Performs double-lock
 * dancing to unlink all icq's and then frees ioc.
 *
 * Runs as the work function of ioc->release_work.
 */
static void ioc_release_fn(struct work_struct *work)
{
        struct io_context *ioc = container_of(work, struct io_context,
                                              release_work);
        spin_lock_irq(&ioc->lock);

        /* Destroy the first icq on the list until none remain. */
        while (!hlist_empty(&ioc->icq_list)) {
                struct io_cq *icq = hlist_entry(ioc->icq_list.first,
                                                struct io_cq, ioc_node);
                struct request_queue *q = icq->q;

                /*
                 * ioc->lock is already held here, so the queue lock can
                 * only be tried, not waited for, without first dropping
                 * ioc->lock.
                 */
                if (spin_trylock(&q->queue_lock)) {
                        ioc_destroy_icq(icq);
                        spin_unlock(&q->queue_lock);
                } else {
                        /* Make sure q and icq cannot be freed. */
                        rcu_read_lock();

                        /* Re-acquire the locks in the correct order. */
                        spin_unlock(&ioc->lock);
                        spin_lock(&q->queue_lock);
                        spin_lock(&ioc->lock);

                        /*
                         * The icq may have been destroyed when the ioc lock
                         * was released.
                         */
                        if (!(icq->flags & ICQ_DESTROYED))
                                ioc_destroy_icq(icq);

                        spin_unlock(&q->queue_lock);
                        rcu_read_unlock();
                }
        }

        spin_unlock_irq(&ioc->lock);

        /* No icq is linked any more; release the io_context itself. */
        kmem_cache_free(iocontext_cachep, ioc);
}

/**
 * put_io_context - put a reference of io_context
 * @ioc: io_context to put, may be %NULL
 *
 * Decrement reference count of @ioc and release it if the count reaches
 * zero.  An @ioc that has no icq's linked is freed right here; otherwise
 * the release is deferred to @ioc->release_work.
 */
void put_io_context(struct io_context *ioc)
{
        unsigned long flags;
        bool free_ioc;

        if (!ioc)
                return;

        BUG_ON(atomic_long_read(&ioc->refcount) <= 0);

        if (!atomic_long_dec_and_test(&ioc->refcount))
                return;

        /*
         * Releasing ioc requires reverse order double locking and we may
         * already be holding a queue_lock.  Do it asynchronously from wq.
         */
        spin_lock_irqsave(&ioc->lock, flags);
        free_ioc = hlist_empty(&ioc->icq_list);
        if (!free_ioc)
                queue_work(system_power_efficient_wq, &ioc->release_work);
        spin_unlock_irqrestore(&ioc->lock, flags);

        if (free_ioc)
                kmem_cache_free(iocontext_cachep, ioc);
}

/**
 * put_io_context_active - put active reference on ioc
 * @ioc: ioc of interest
 *
 * Undo get_io_context_active().  If active reference reaches zero after
 * put, @ioc can never issue further IOs and ioscheds are notified: every
 * icq on @ioc that has not been exited yet is exited under @ioc->lock.
 * In either case a regular reference on @ioc is dropped as well.
 */
void put_io_context_active(struct io_context *ioc)
{
        struct io_cq *icq;

        if (atomic_dec_and_test(&ioc->active_ref)) {
                spin_lock_irq(&ioc->lock);
                hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
                        if (!(icq->flags & ICQ_EXITED))
                                ioc_exit_icq(icq);
                }
                spin_unlock_irq(&ioc->lock);
        }

        put_io_context(ioc);
}

/*
 * Called by the exiting task.
 *
 * Detaches the io_context from @task under task_lock(), then drops the
 * task count and the active reference that @task held on it.  A task
 * without an io_context is left untouched.
 */
void exit_io_context(struct task_struct *task)
{
        struct io_context *ioc;

        task_lock(task);
        ioc = task->io_context;
        task->io_context = NULL;
        task_unlock(task);

        /* Nothing was attached, so there is nothing to release. */
        if (!ioc)
                return;

        atomic_dec(&ioc->nr_tasks);
        put_io_context_active(ioc);
}

/*
 * Destroy every icq on @icq_list.  The whole walk runs inside an RCU
 * read-side section; each icq is examined under its ioc's lock and is
 * destroyed unless it is already flagged ICQ_DESTROYED.
 */
static void __ioc_clear_queue(struct list_head *icq_list)
{
        unsigned long flags;

        rcu_read_lock();
        while (!list_empty(icq_list)) {
                struct io_cq *icq = list_entry(icq_list->next,
                                                struct io_cq, q_node);
                struct io_context *ioc = icq->ioc;

                spin_lock_irqsave(&ioc->lock, flags);
                if (!(icq->flags & ICQ_DESTROYED))
                        ioc_destroy_icq(icq);
                spin_unlock_irqrestore(&ioc->lock, flags);
        }
        rcu_read_unlock();
}

/**
 * ioc_clear_queue - break any ioc association with the specified queue
 * @q: request_queue being cleared
 *
 * Walk @q->icq_list and exit all io_cq's.  The list is first moved onto a
 * private list head under @q->queue_lock, leaving @q->icq_list empty; the
 * icq's are then destroyed from the private list after the lock has been
 * dropped.
 */
void ioc_clear_queue(struct request_queue *q)
{
        LIST_HEAD(icq_list);

        spin_lock_irq(&q->queue_lock);
        list_splice_init(&q->icq_list, &icq_list);
        spin_unlock_irq(&q->queue_lock);

        __ioc_clear_queue(&icq_list);
}

/*
 * create_task_io_context - allocate an io_context and try to install it
 * @task: task to install the io_context on
 * @gfp_flags: allocation flags (__GFP_ZERO is added)
 * @node: node to allocate from
 *
 * Allocates and initialises a new io_context (refcount, nr_tasks and
 * active_ref all start at 1) and installs it as @task->io_context unless
 * @task already has one or the install is refused; in those cases the new
 * io_context is freed again.
 *
 * Returns -ENOMEM if the allocation fails.  Otherwise returns 0 if @task
 * has an io_context when the function is done (this one or one installed
 * earlier), or -EBUSY if it has none.
 */
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
{
        struct io_context *ioc;
        int ret;

        ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
                                    node);
        if (unlikely(!ioc))
                return -ENOMEM;

        /* initialize */
        atomic_long_set(&ioc->refcount, 1);
        atomic_set(&ioc->nr_tasks, 1);
        atomic_set(&ioc->active_ref, 1);
        spin_lock_init(&ioc->lock);
        INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&ioc->icq_list);
        INIT_WORK(&ioc->release_work, ioc_release_fn);

        /*
         * Try to install.  ioc shouldn't be installed if someone else
         * already did or @task, which isn't %current, is exiting.  Note
         * that we need to allow ioc creation on exiting %current as exit
         * path may issue IOs from e.g. exit_files().  The exit path is
         * responsible for not issuing IO after exit_io_context().
         */
        task_lock(task);
        if (!task->io_context &&
            (task == current || !(task->flags & PF_EXITING)))
                task->io_context = ioc;
        else
                kmem_cache_free(iocontext_cachep, ioc);

        /* Success means @task has some io_context, not necessarily ours. */
        ret = task->io_context ? 0 : -EBUSY;

        task_unlock(task);

        return ret;
}

/**
 * get_task_io_context - get io_context of a task
 * @task: task of interest
 * @gfp_flags: allocation flags, used if allocation is necessary
 * @node: allocation node, used if allocation is necessary
 *
 * Return io_context of @task.  If it doesn't exist, it is created with
 * @gfp_flags and @node.  The returned io_context has its reference count
 * incremented.  %NULL is returned when creating the io_context fails.
 *
 * This function always goes through task_lock() and it's better to use
 * %current->io_context + get_io_context() for %current.
 */
struct io_context *get_task_io_context(struct task_struct *task,
                                       gfp_t gfp_flags, int node)
{
        struct io_context *ioc;

        might_sleep_if(gfpflags_allow_blocking(gfp_flags));

        for (;;) {
                task_lock(task);
                ioc = task->io_context;
                if (likely(ioc)) {
                        get_io_context(ioc);
                        task_unlock(task);
                        return ioc;
                }
                task_unlock(task);

                /* None yet: create one and look again, or give up. */
                if (create_task_io_context(task, gfp_flags, node))
                        return NULL;
        }
}

/**
 * ioc_lookup_icq - lookup io_cq from ioc
 * @ioc: the associated io_context
 * @q: the associated request_queue
 *
 * Look up io_cq associated with @ioc - @q pair from @ioc.  Must be called
 * with @q->queue_lock held.  Returns the io_cq, or %NULL if @ioc has none
 * for @q.
 */
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
        struct io_cq *icq;

        lockdep_assert_held(&q->queue_lock);

        /*
         * icq's are indexed from @ioc using radix tree and hint pointer,
         * both of which are protected with RCU.  All removals are done
         * holding both q and ioc locks, and we're holding q lock - if we
         * find a icq which points to us, it's guaranteed to be valid.
         */
        rcu_read_lock();
        icq = rcu_dereference(ioc->icq_hint);
        if (!icq || icq->q != q) {
                /* Hint missed: consult the radix tree and refresh the hint. */
                icq = radix_tree_lookup(&ioc->icq_tree, q->id);
                if (icq && icq->q == q)
                        rcu_assign_pointer(ioc->icq_hint, icq);        /* allowed to race */
                else
                        icq = NULL;
        }
        rcu_read_unlock();

        return icq;
}
EXPORT_SYMBOL(ioc_lookup_icq);

/**
 * ioc_create_icq - create and link io_cq
 * @ioc: io_context of interest
 * @q: request_queue of interest
 * @gfp_mask: allocation mask
 *
 * Make sure io_cq linking @ioc and @q exists.  If icq doesn't exist, they
 * will be created using @gfp_mask.
 *
 * The caller is responsible for ensuring @ioc won't go away and @q is
 * alive and will stay alive until this function returns.
 *
 * Returns the io_cq linking @ioc and @q.  %NULL is returned if the icq
 * allocation or the radix tree preload fails, or if insertion fails and a
 * subsequent lookup finds no icq either.
 */
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
                             gfp_t gfp_mask)
{
        struct elevator_type *et = q->elevator->type;
        struct io_cq *icq;

        /* allocate stuff */
        icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
                                    q->node);
        if (!icq)
                return NULL;

        /* Preload before taking the spinlocks; paired with
         * radix_tree_preload_end() at the bottom. */
        if (radix_tree_maybe_preload(gfp_mask) < 0) {
                kmem_cache_free(et->icq_cache, icq);
                return NULL;
        }

        icq->ioc = ioc;
        icq->q = q;
        INIT_LIST_HEAD(&icq->q_node);
        INIT_HLIST_NODE(&icq->ioc_node);

        /* lock both q and ioc and try to link @icq */
        spin_lock_irq(&q->queue_lock);
        spin_lock(&ioc->lock);

        if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
                hlist_add_head(&icq->ioc_node, &ioc->icq_list);
                list_add(&icq->q_node, &q->icq_list);
                if (et->ops.init_icq)
                        et->ops.init_icq(icq);
        } else {
                /* Insertion failed: drop our icq and use whatever is
                 * already linked for this ioc/q pair, if anything. */
                kmem_cache_free(et->icq_cache, icq);
                icq = ioc_lookup_icq(ioc, q);
                if (!icq)
                        printk(KERN_ERR "cfq: icq link failed!\n");
        }

        spin_unlock(&ioc->lock);
        spin_unlock_irq(&q->queue_lock);
        radix_tree_preload_end();
        return icq;
}

/*
 * Init-time setup (registered through subsys_initcall()): create the
 * "blkdev_ioc" slab cache, sized for struct io_context, and store it in
 * iocontext_cachep.  Always returns 0; the cache is created with
 * SLAB_PANIC, so the result is not checked here.
 */
static int __init blk_ioc_init(void)
{
        iocontext_cachep = kmem_cache_create("blkdev_ioc",
                        sizeof(struct io_context), 0, SLAB_PANIC, NULL);
        return 0;
}
subsys_initcall(blk_ioc_init);




























    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

/*
 * A deferred callback: a function pointer bundled with the single
 * untyped argument it will be invoked with.
 */
struct delayed_call {
        void (*fn)(void *);
        void *arg;
};

/* Define an empty delayed_call called @name: no function, no argument. */
#define DEFINE_DELAYED_CALL(name) \
        struct delayed_call name = { .fn = NULL, .arg = NULL }

/*
 * Arm @call so that do_delayed_call() invokes fn(arg).  There is no type
 * checking between @fn and @arg; the caller has to keep them consistent.
 */
static inline void set_delayed_call(struct delayed_call *call,
                void (*fn)(void *), void *arg)
{
        call->arg = arg;
        call->fn = fn;
}

/* Invoke the stored function on the stored argument, if one is set. */
static inline void do_delayed_call(struct delayed_call *call)
{
        if (!call->fn)
                return;

        call->fn(call->arg);
}

/* Disarm @call.  Only the function pointer is reset; ->arg is untouched. */
static inline void clear_delayed_call(struct delayed_call *call)
{
        call->fn = NULL;
}
#endif
































































































































































































































































    1 










































































    2 






    1 








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/dccp.h>
#include <net/netns/netfilter.h>
#include <net/netns/x_tables.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/bpf.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


/*
 * Hash sizing constants: NETDEV_HASHENTRIES is 2^NETDEV_HASHBITS (256).
 * NOTE(review): presumably sizes the per-net device name/index hash
 * tables (dev_name_head / dev_index_head) - confirm against the users.
 */
#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * struct net - state of one network namespace.
 *
 * Holds the namespace's reference counts, its links into the global
 * namespace lists, the device list/hash heads, the netlink sockets and
 * one embedded netns_* structure per protocol or subsystem that is
 * compiled in.  The structure is tagged __randomize_layout, so do not
 * rely on the declaration order of the fields.
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        refcount_t                count;                /* To decide when the network
                                                 * namespace should be shut down.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_unreg_count;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        int                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* Linked to call pernet exit
                                                 * methods on a dead net
                                                 * (pernet_ops_rwsem read locked),
                                                 * or to unregister pernet ops
                                                 * (pernet_ops_rwsem write locked).
                                                 */
        struct llist_node        defer_free_list;
        struct llist_node        cleanup_list;        /* namespaces on death row */

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;

        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions of times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
        struct netns_unix        unx;
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
        struct netns_dccp        dccp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
        struct netns_xt                xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
        struct netns_nf_frag        nf_frag;
        struct ctl_table_header *nf_frag_frags_hdr;
#endif
        struct sock                *nfnl;
        struct sock                *nfnl_stash;
#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
        struct list_head        nfnl_acct_list;
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
        struct list_head        nfct_timeout_list;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        atomic64_t                net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * !CONFIG_NET_NS variant: no new namespace can be created, so a request
 * carrying CLONE_NEWNET fails with -EINVAL and every other request simply
 * hands @old_net back.  @user_ns is unused.
 */
static inline struct net *copy_net_ns(unsigned long flags,
        struct user_namespace *user_ns, struct net *old_net)
{
        if (!(flags & CLONE_NEWNET))
                return old_net;

        return ERR_PTR(-EINVAL);
}

/*
 * !CONFIG_NET_NS variant: @net is ignored and the owner is always
 * reported as GLOBAL_ROOT_UID / GLOBAL_ROOT_GID through @uid and @gid.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;
}

/* !CONFIG_NET_NS variant: nothing to wait for, so this is an empty stub. */
static inline void net_ns_barrier(void) {}

/* !CONFIG_NET_NS variant: always fails with ERR_PTR(-EINVAL); @ns is unused. */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */


extern struct list_head net_namespace_list;

struct net *get_net_ns_by_pid(pid_t pid);
struct net *get_net_ns_by_fd(int fd);

u64 __net_gen_cookie(struct net *net);

#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
void __put_net(struct net *net);

/*
 * Take a reference on @net by incrementing net->count, and return @net so
 * the call can be used inline in an expression.
 */
static inline struct net *get_net(struct net *net)
{
        refcount_inc(&net->count);
        return net;
}

static inline struct net *maybe_get_net(struct net *net)
{
        /*
         * For callers that know the struct net itself still exists but
         * cannot be sure anybody still holds a reference on it.  The
         * reference is only taken while net->count is non-zero;
         * otherwise the attempt fails and NULL is returned.
         */
        return refcount_inc_not_zero(&net->count) ? net : NULL;
}

/*
 * Drop one reference on @net.  When refcount_dec_and_test() reports that
 * this was the final reference, hand the namespace to __put_net().
 */
static inline void put_net(struct net *net)
{
        if (!refcount_dec_and_test(&net->count))
                return;

        __put_net(net);
}

/* Two namespaces are the same iff the pointers compare equal; returns 0/1. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return net1 == net2;
}

/* Returns non-zero while net->count is non-zero (read without modifying it). */
static inline int check_net(const struct net *net)
{
        return refcount_read(&net->count) != 0;
}

void net_drop_ns(void *);

#else

/* !CONFIG_NET_NS variant: no reference counting, @net is returned as is. */
static inline struct net *get_net(struct net *net)
{
        return net;
}

/* !CONFIG_NET_NS variant: no reference counting, so there is nothing to drop. */
static inline void put_net(struct net *net)
{
}

/* !CONFIG_NET_NS variant: never fails, @net is returned as is. */
static inline struct net *maybe_get_net(struct net *net)
{
        return net;
}

/* !CONFIG_NET_NS variant: every namespace compares equal, always returns 1. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return 1;
}

/* !CONFIG_NET_NS variant: the namespace is always reported live (returns 1). */
static inline int check_net(const struct net *net)
{
        return 1;
}

#define net_drop_ns NULL
#endif


/*
 * Holder for a struct net pointer.  With CONFIG_NET_NS it wraps an
 * __rcu-annotated pointer; without it the structure has no members.
 * Access it through write_pnet() / read_pnet() / read_pnet_rcu().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net __rcu *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet using rcu_assign_pointer().  Without CONFIG_NET_NS
 * there is nothing to store and the call is a no-op.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        rcu_assign_pointer(pnet->net, net);
#endif
}

/*
 * Return the namespace stored in @pnet; without CONFIG_NET_NS this is
 * always &init_net.
 *
 * NOTE(review): the pointer is read with rcu_dereference_protected() and a
 * constant-true condition, so no lock is asserted here - callers presumably
 * guarantee the pointer cannot change under them; confirm at call sites.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
        return rcu_dereference_protected(pnet->net, true);
#else
        return &init_net;
#endif
}

/*
 * Return the namespace stored in @pnet, reading the pointer with
 * rcu_dereference(); without CONFIG_NET_NS this is always &init_net.
 * NOTE(review): presumably meant for callers inside an RCU read-side
 * critical section - confirm at call sites.
 */
static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
        return rcu_dereference(pnet->net);
#else
        return &init_net;
#endif
}

/*
 * Iterators over net_namespace_list, walking the ->list member of each
 * entry: forward, continuing in reverse from the current VAR, and an
 * RCU list variant.  The list is protected by net_rwsem.
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-namespace init/exit code and data.
 * With CONFIG_NET_NS they expand to nothing; without it they map to
 * __init, __ref, __initdata and __initconst respectively.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * struct pernet_operations - per-network-namespace callbacks.
 *
 * A set of init/exit hooks, linked through ->list, that is handed to
 * register_pernet_subsys() or register_pernet_device().
 */
struct pernet_operations {
        struct list_head list;
        /*
         * The methods below are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * has to keep in mind all other pernet_operations and
         * to introduce locking, if they share common resources.
         *
         * The only time they are called with exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys()
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of nets requires a single
         * synchronize_rcu() related to these pernet_operations,
         * instead of a separate synchronize_rcu() for every net.
         * Please avoid synchronize_rcu() altogether where possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        void (*exit_batch)(struct list_head *net_exit_list);
        /* Following method is called with RTNL held. */
        void (*exit_batch_rtnl)(struct list_head *net_exit_list,
                                struct list_head *dev_kill_list);
        unsigned int *id;
        size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device which caused kernel oops, and panics during network
 * namespace cleanup.   So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;
struct ctl_table_header;

#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl(struct net *net, const char *path,
                                             struct ctl_table *table);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
/* !CONFIG_SYSCTL stub: nothing to set up, always reports success (0). */
static inline int net_sysctl_init(void) { return 0; }
/* !CONFIG_SYSCTL stub: registers nothing and always returns NULL. */
static inline struct ctl_table_header *register_net_sysctl(struct net *net,
        const char *path, struct ctl_table *table)
{
        return NULL;
}
/* !CONFIG_SYSCTL stub: nothing was registered, so nothing to undo. */
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
}
#endif

/* Read the current value of @net's net->ipv4.rt_genid counter. */
static inline int rt_genid_ipv4(const struct net *net)
{
        return atomic_read(&net->ipv4.rt_genid);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Read the current value of @net's net->ipv6.fib6_sernum counter. */
static inline int rt_genid_ipv6(const struct net *net)
{
        return atomic_read(&net->ipv6.fib6_sernum);
}
#endif

/* Atomically increment @net's net->ipv4.rt_genid counter. */
static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&net->ipv4.rt_genid);
}

extern void (*__fib6_flush_trees)(struct net *net);
/*
 * IPv6 counterpart of rt_genid_bump_ipv4(): invoke the __fib6_flush_trees
 * hook on @net.  The hook is a function pointer that may be NULL, in which
 * case nothing happens.
 */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Accessor: address of the ieee802154_lowpan state embedded in @net. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        return &net->ieee802154_lowpan;
}
#endif

/*
 * For callers who don't really care about whether it's IPv4 or IPv6:
 * bump the IPv4 generation id, then run the IPv6 bump.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);
        rt_genid_bump_ipv6(net);
}

/* Read the current value of @net's fnhe_genid counter. */
static inline int fnhe_genid(const struct net *net)
{
        return atomic_read(&net->fnhe_genid);
}

/* Atomically increment @net's fnhe_genid counter. */
static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&net->fnhe_genid);
}

#endif /* __NET_NET_NAMESPACE_H */


































































































































































































































    1 



















































































































































































































































































































































































































    1 

    1 





























































































































































































































































































































































































































































































































































































































































    1 





    1 

















































    1 













































































































































































































    1 









    1 



    1 


    1 












































    1 









    1 

































































































    1 




































    1 










    1 











    1 




    1 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 
    1 






















    1 

    1 




























    1 


    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 























































    1 
    1 











    1 



    1 

































    1 

























    1 









    1 





    1 









































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  libata-scsi.c - helper library for ATA
 *
 *  Copyright 2003-2004 Red Hat, Inc.  All rights reserved.
 *  Copyright 2003-2004 Jeff Garzik
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/driver-api/libata.rst
 *
 *  Hardware documentation available from
 *  - http://www.t10.org/
 *  - http://www.t13.org/
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>
#include <linux/export.h>
#include <scsi/scsi.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_tcq.h>
#include <scsi/scsi_transport.h>
#include <linux/libata.h>
#include <linux/hdreg.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <asm/unaligned.h>
#include <linux/ioprio.h>
#include <linux/of.h>

#include "libata.h"
#include "libata-transport.h"

/* Size in bytes of the file-wide buffer ata_scsi_rbuf declared below. */
#define ATA_SCSI_RBUF_SIZE        576

/*
 * One statically allocated byte buffer shared by this file, together with
 * a spinlock.
 * NOTE(review): going by its name the lock serializes users of
 * ata_scsi_rbuf; those users are outside this part of the file - confirm.
 */
static DEFINE_SPINLOCK(ata_scsi_rbuf_lock);
static u8 ata_scsi_rbuf[ATA_SCSI_RBUF_SIZE];

/*
 * Pointer to a function that takes a queued command and returns an
 * unsigned int.
 * NOTE(review): by its name this is the per-command SCSI-to-ATA
 * translation callback; the meaning of the return value is defined by the
 * callers, which are not visible here - confirm.
 */
typedef unsigned int (*ata_xlat_func_t)(struct ata_queued_cmd *qc);

/* Forward declaration; the definition is outside this part of the file. */
static struct ata_device *__ata_scsi_find_dev(struct ata_port *ap,
                                        const struct scsi_device *scsidev);

/*
 * Mode page codes and the byte lengths of the default page images
 * (def_*_mpage) defined right after these macros.  Each *_MPAGE value is
 * stored in byte 0 of the matching image and *_MPAGE_LEN is the size of
 * that array.
 * NOTE(review): ALL_MPAGES / ALL_SUB_MPAGES look like the "return all
 * pages / all subpages" selector values - confirm against their users.
 */
#define RW_RECOVERY_MPAGE 0x1
#define RW_RECOVERY_MPAGE_LEN 12
#define CACHE_MPAGE 0x8
#define CACHE_MPAGE_LEN 20
#define CONTROL_MPAGE 0xa
#define CONTROL_MPAGE_LEN 12
#define ALL_MPAGES 0x3f
#define ALL_SUB_MPAGES 0xff


/*
 * Default image of the read-write error recovery mode page.  Byte 0 holds
 * the page code and byte 1 the number of bytes that follow it.  Every
 * byte not listed explicitly is zero, including the read retry count
 * (byte 3) and the write retry count (byte 8).
 */
static const u8 def_rw_recovery_mpage[RW_RECOVERY_MPAGE_LEN] = {
        [0] = RW_RECOVERY_MPAGE,
        [1] = RW_RECOVERY_MPAGE_LEN - 2,
        [2] = (1 << 7),                /* AWRE */
};

/*
 * Default image of the caching mode page.  Byte 0 holds the page code and
 * byte 1 the number of bytes that follow it; all remaining bytes are zero.
 *
 * In particular byte 2 (contains WCE) and byte 12 (contains DRA) need to
 * be 0 for logic, so they are deliberately left at their zero default.
 */
static const u8 def_cache_mpage[CACHE_MPAGE_LEN] = {
        [0] = CACHE_MPAGE,
        [1] = CACHE_MPAGE_LEN - 2,
};

/*
 * Default image of the control mode page.  Byte 0 holds the page code and
 * byte 1 the number of bytes that follow it.  Bytes that are not listed
 * (4-7 and 10) are zero.
 */
static const u8 def_control_mpage[CONTROL_MPAGE_LEN] = {
        [0]  = CONTROL_MPAGE,
        [1]  = CONTROL_MPAGE_LEN - 2,
        [2]  = 2,                /* DSENSE=0, GLTSD=1 */
        [3]  = 0,                /* [QAM+QERR may be 1, see 05-359r1] */
        [8]  = 0xff,
        [9]  = 0xff,
        [11] = 30,                /* extended self test time (bytes 10-11), see 05-359r1 */
};

static ssize_t ata_scsi_park_show(struct device *device,
                                  struct device_attribute *attr, char *buf)
{
        struct scsi_device *sdev = to_scsi_device(device);
        struct ata_port *ap;
        struct ata_link *link;
        struct ata_device *dev;
        unsigned long now;
        unsigned int msecs;
        int rc = 0;

        ap = ata_shost_to_port(sdev->host);

        spin_lock_irq(ap->lock);
        dev = ata_scsi_find_dev(ap, sdev);
        if (!dev) {
                rc = -ENODEV;
                goto unlock;
        }
        if (dev->flags & ATA_DFLAG_NO_UNLOAD) {
                rc = -EOPNOTSUPP;
                goto unlock;
        }

        link = dev->link;
        now = jiffies;
        if (ap->pflags & ATA_PFLAG_EH_IN_PROGRESS &&
            link->eh_context.unloaded_mask & (1 << dev->devno) &&
            time_after(dev->unpark_deadline, now))
                msecs = jiffies_to_msecs(dev->unpark_deadline - now);
        else
                msecs = 0;

unlock:
        spin_unlock_irq(ap->lock);

        return rc ? rc : snprintf(buf, 20, "%u\n", msecs);
}

/**
 *        ata_scsi_park_store - write handler of the unload_heads attribute
 *        @device: device embedded in the SCSI device being configured
 *        @attr: attribute being written (unused)
 *        @buf: decimal integer supplied by the writer
 *        @len: number of bytes in @buf
 *
 *        Accepted values:
 *          >= 0  set the unpark deadline that far ahead (the read handler
 *                reports the same quantity in milliseconds), flag
 *                ATA_EH_PARK for the device, schedule EH and complete
 *                ap->park_req_pending.  Values above ATA_TMOUT_MAX_PARK
 *                are clamped to it; the request is still issued but the
 *                write reports -EOVERFLOW.
 *          -1    clear ATA_DFLAG_NO_UNLOAD on the device.
 *          -2    set ATA_DFLAG_NO_UNLOAD on the device.
 *
 *        LOCKING:
 *        Takes and releases ap->lock with spin_lock_irqsave().
 *
 *        RETURNS:
 *        @len on success; the kstrtol() error for unparsable input;
 *        -EINVAL for values below -2; -EOVERFLOW after clamping; -ENODEV
 *        if no ATA device matches; -EOPNOTSUPP if the device class is
 *        neither ATA_DEV_ATA nor ATA_DEV_ZAC, or if a park is requested
 *        while ATA_DFLAG_NO_UNLOAD is set.
 */
static ssize_t ata_scsi_park_store(struct device *device,
                                   struct device_attribute *attr,
                                   const char *buf, size_t len)
{
        struct scsi_device *sdev = to_scsi_device(device);
        struct ata_port *ap;
        struct ata_device *dev;
        unsigned long irq_flags;
        long timeout;
        int ret;

        ret = kstrtol(buf, 10, &timeout);
        if (ret)
                return ret;
        if (timeout < -2)
                return -EINVAL;
        if (timeout > ATA_TMOUT_MAX_PARK) {
                /* Clamp, carry on, and report the overflow at the end. */
                timeout = ATA_TMOUT_MAX_PARK;
                ret = -EOVERFLOW;
        }

        ap = ata_shost_to_port(sdev->host);

        spin_lock_irqsave(ap->lock, irq_flags);

        dev = ata_scsi_find_dev(ap, sdev);
        if (unlikely(!dev)) {
                ret = -ENODEV;
                goto out_unlock;
        }
        if (dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC) {
                ret = -EOPNOTSUPP;
                goto out_unlock;
        }

        if (timeout < 0) {
                /* Only -1 and -2 get past the range check above. */
                if (timeout == -1)
                        dev->flags &= ~ATA_DFLAG_NO_UNLOAD;
                else if (timeout == -2)
                        dev->flags |= ATA_DFLAG_NO_UNLOAD;
        } else if (dev->flags & ATA_DFLAG_NO_UNLOAD) {
                ret = -EOPNOTSUPP;
        } else {
                dev->unpark_deadline = ata_deadline(jiffies, timeout);
                dev->link->eh_info.dev_action[dev->devno] |= ATA_EH_PARK;
                ata_port_schedule_eh(ap);
                complete(&ap->park_req_pending);
        }

out_unlock:
        spin_unlock_irqrestore(ap->lock, irq_flags);

        if (ret)
                return ret;

        return len;
}
DEVICE_ATTR(unload_heads, S_IRUGO | S_IWUSR,
            ata_scsi_park_show, ata_scsi_park_store);
EXPORT_SYMBOL_GPL(dev_attr_unload_heads);

/**
 *        ata_scsi_set_sense - mark a command CHECK CONDITION and fill its sense data
 *        @dev: ATA device; its ATA_DFLAG_D_SENSE flag is passed to
 *              scsi_build_sense_buffer() as the first argument
 *        @cmd: SCSI command to update; nothing is done when NULL
 *        @sk: sense key
 *        @asc: additional sense code
 *        @ascq: additional sense code qualifier
 *
 *        Sets @cmd->result to DRIVER_SENSE plus SAM_STAT_CHECK_CONDITION
 *        and builds the sense buffer from @sk/@asc/@ascq.  Note that @dev
 *        is dereferenced even when @cmd is NULL.
 */
void ata_scsi_set_sense(struct ata_device *dev, struct scsi_cmnd *cmd,
                        u8 sk, u8 asc, u8 ascq)
{
        const bool use_d_sense = !!(dev->flags & ATA_DFLAG_D_SENSE);

        if (cmd) {
                cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
                scsi_build_sense_buffer(use_d_sense, cmd->sense_buffer,
                                        sk, asc, ascq);
        }
}

void ata_scsi_set_sense_information(struct ata_device *dev,
                                    struct scsi_cmnd *cmd,
                                    const struct ata_taskfile *tf)
{
        u64 information;

        if (!cmd)
                return;

        information = ata_tf_read_block(tf, dev);
        if (information == U64_MAX)
                return;

        scsi_set_sense_information(cmd->sense_buffer,
                                   SCSI_SENSE_BUFFERSIZE, information);
}

/*
 * Fail @cmd with ILLEGAL REQUEST, "Invalid field in CDB" (ASC 0x24,
 * ASCQ 0x00) and record @field / @bit as the sense field pointer.
 *
 * @cmd must not be NULL: ata_scsi_set_sense() tolerates a NULL command but
 * cmd->sense_buffer is dereferenced here unconditionally.
 * NOTE(review): the final argument (1 here, 0 in
 * ata_scsi_set_invalid_parameter()) presumably tells the helper that the
 * pointer refers to the CDB rather than the parameter list - confirm.
 */
static void ata_scsi_set_invalid_field(struct ata_device *dev,
                                       struct scsi_cmnd *cmd, u16 field, u8 bit)
{
        const u8 asc_invalid_cdb_field = 0x24;
        const u8 ascq_none = 0x0;

        ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST,
                           asc_invalid_cdb_field, ascq_none);
        scsi_set_sense_field_pointer(cmd->sense_buffer,
                                     SCSI_SENSE_BUFFERSIZE,
                                     field, bit, 1);
}

/*
 * Fail @cmd with ILLEGAL REQUEST, "Invalid field in parameter list"
 * (ASC 0x26, ASCQ 0x00) and record @field as the sense field pointer.
 * The bit argument passed on is always 0xff.
 *
 * @cmd must not be NULL: ata_scsi_set_sense() tolerates a NULL command but
 * cmd->sense_buffer is dereferenced here unconditionally.
 * NOTE(review): 0xff presumably means "no valid bit pointer" - confirm
 * against scsi_set_sense_field_pointer().
 */
static void ata_scsi_set_invalid_parameter(struct ata_device *dev,
                                           struct scsi_cmnd *cmd, u16 field)
{
        const u8 asc_invalid_param_field = 0x26;
        const u8 ascq_none = 0x0;

        ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST,
                           asc_invalid_param_field, ascq_none);
        scsi_set_sense_field_pointer(cmd->sense_buffer,
                                     SCSI_SENSE_BUFFERSIZE,
                                     field, 0xff, 0);
}

/*
 * NULL-terminated array of device attributes, exported (GPL-only) for use
 * outside this file.  It currently holds only the unload_heads attribute
 * defined by DEVICE_ATTR() earlier in this file.
 */
struct device_attribute *ata_common_sdev_attrs[] = {
        &dev_attr_unload_heads,
        NULL
};
EXPORT_SYMBOL_GPL(ata_common_sdev_attrs);

/**
 *        ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd.
 *        @sdev: SCSI device for which BIOS geometry is to be determined (unused)
 *        @bdev: block device associated with @sdev (unused)
 *        @capacity: capacity of SCSI device
 *        @geom: location to which geometry will be output
 *
 *        Reports a fixed geometry of 255 heads (@geom[0]) and 63 sectors
 *        (@geom[1]); the cylinder count (@geom[2]) is @capacity divided
 *        by 255 * 63, with the remainder dropped.  Most BIOSes nowadays
 *        expect this mapping, and some situations may arise where the
 *        disk is not bootable if it is not used.
 *
 *        LOCKING:
 *        Defined by the SCSI layer.  We don't really care.
 *
 *        RETURNS:
 *        Zero.
 */
int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev,
                       sector_t capacity, int geom[])
{
        enum { BIOS_HEADS = 255, BIOS_SECTORS = 63 };

        geom[0] = BIOS_HEADS;
        geom[1] = BIOS_SECTORS;

        /* sector_div() leaves the quotient in capacity itself. */
        sector_div(capacity, BIOS_HEADS * BIOS_SECTORS);
        geom[2] = capacity;

        return 0;
}
EXPORT_SYMBOL_GPL(ata_std_bios_param);

/**
 *        ata_scsi_unlock_native_capacity - unlock native capacity
 *        @sdev: SCSI device to adjust device capacity for
 *
 *        Called when a partition on @sdev extends beyond the end of the
 *        device.  If the matching ATA device currently exposes fewer
 *        sectors than its native count, ATA_DFLAG_UNLOCK_HPA is set, a
 *        reset is requested on its link and EH is scheduled.  In every
 *        case the function then waits for EH on the port via
 *        ata_port_wait_eh().
 *
 *        LOCKING:
 *        Defined by the SCSI layer.  Might sleep.
 */
void ata_scsi_unlock_native_capacity(struct scsi_device *sdev)
{
        struct ata_port *ap = ata_shost_to_port(sdev->host);
        struct ata_device *dev;
        unsigned long irq_flags;

        spin_lock_irqsave(ap->lock, irq_flags);

        dev = ata_scsi_find_dev(ap, sdev);
        if (!dev || dev->n_sectors >= dev->n_native_sectors)
                goto out_unlock;

        dev->flags |= ATA_DFLAG_UNLOCK_HPA;
        dev->link->eh_info.action |= ATA_EH_RESET;
        ata_port_schedule_eh(ap);

out_unlock:
        spin_unlock_irqrestore(ap->lock, irq_flags);

        /* Waited for with the lock dropped, whether or not EH was queued. */
        ata_port_wait_eh(ap);
}
EXPORT_SYMBOL_GPL(ata_scsi_unlock_native_capacity);

/**
 *        ata_get_identity - Handler for HDIO_GET_IDENTITY ioctl
 *        @ap: target port
 *        @sdev: SCSI device to get identify data for
 *        @arg: User buffer area for identify data
 *
 *        Copies the device's ATA_ID_WORDS 16-bit identify words to @arg,
 *        then overwrites the product, firmware revision and serial number
 *        fields in the user buffer, in that order, with the strings
 *        produced by ata_id_string().  A failing copy stops the sequence,
 *        so the user buffer may be left partially updated.
 *
 *        LOCKING:
 *        Defined by the SCSI layer.  We don't really care.
 *
 *        RETURNS:
 *        Zero on success, -ENOMSG if no ATA device matches @sdev,
 *        -EFAULT if a copy to user space fails.
 */
static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev,
                            void __user *arg)
{
        /* String fields to rewrite, in the order they are copied out. */
        static const struct {
                unsigned int ofs;        /* offset in 16-bit words */
                unsigned int len;        /* length in bytes */
        } id_strings[] = {
                { ATA_ID_PROD,   ATA_ID_PROD_LEN },
                { ATA_ID_FW_REV, ATA_ID_FW_REV_LEN },
                { ATA_ID_SERNO,  ATA_ID_SERNO_LEN },
        };
        struct ata_device *dev = ata_scsi_find_dev(ap, sdev);
        u16 __user *dst = arg;
        char buf[40];
        unsigned int i;

        if (!dev)
                return -ENOMSG;

        if (copy_to_user(dst, dev->id, ATA_ID_WORDS * sizeof(u16)))
                return -EFAULT;

        for (i = 0; i < ARRAY_SIZE(id_strings); i++) {
                ata_id_string(dev->id, buf, id_strings[i].ofs,
                              id_strings[i].len);
                if (copy_to_user(dst + id_strings[i].ofs, buf,
                                 id_strings[i].len))
                        return -EFAULT;
        }

        return 0;
}

/**
 *        ata_cmd_ioctl - Handler for HDIO_DRIVE_CMD ioctl
 *        @scsidev: Device to which we are issuing command
 *        @arg: User provided data for issuing command
 *
 *        @arg starts with a 4-byte header.  args[0] goes into CDB byte 14
 *        and args[2] into CDB byte 4.  args[3], when non-zero, is the
 *        number of ATA_SECT_SIZE-byte units to read from the device; that
 *        data is returned to user space directly after the header.
 *        args[1] goes into CDB byte 6, except for ATA_CMD_SMART where
 *        args[3] goes into byte 6, args[1] into byte 8 and the SMART
 *        pass values into bytes 10 and 12.
 *
 *        When the command returns sense data in descriptor format with an
 *        ATA descriptor, the first three header bytes are rewritten with
 *        the returned status, error and sector count.
 *
 *        LOCKING:
 *        Defined by the SCSI layer.  We don't really care.
 *
 *        RETURNS:
 *        Zero on success, negative errno on error: -EINVAL for a NULL
 *        @arg, -EFAULT if a user copy fails, -ENOMEM if the data buffer
 *        cannot be allocated, -EIO if the command result is still
 *        non-zero after filtering.
 */
int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
{
        int rc = 0;
        u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
        u8 scsi_cmd[MAX_COMMAND_SIZE];
        u8 args[4], *argbuf = NULL;
        int argsize = 0;
        enum dma_data_direction data_dir;
        struct scsi_sense_hdr sshdr;
        int cmd_result;

        if (arg == NULL)
                return -EINVAL;

        if (copy_from_user(args, arg, sizeof(args)))
                return -EFAULT;

        memset(sensebuf, 0, sizeof(sensebuf));
        memset(scsi_cmd, 0, sizeof(scsi_cmd));

        if (args[3]) {
                argsize = ATA_SECT_SIZE * args[3];
                /*
                 * Zero-filled on purpose: on success all argsize bytes
                 * are copied to user space below, and this function never
                 * learns how many of them the device actually filled in.
                 * A short transfer must not expose stale heap contents.
                 */
                argbuf = kzalloc(argsize, GFP_KERNEL);
                if (argbuf == NULL) {
                        rc = -ENOMEM;
                        goto error;
                }

                scsi_cmd[1]  = (4 << 1); /* PIO Data-in */
                scsi_cmd[2]  = 0x0e;     /* no off.line or cc, read from dev,
                                            block count in sector count field */
                data_dir = DMA_FROM_DEVICE;
        } else {
                scsi_cmd[1]  = (3 << 1); /* Non-data */
                scsi_cmd[2]  = 0x20;     /* cc but no off.line or data xfer */
                data_dir = DMA_NONE;
        }

        scsi_cmd[0] = ATA_16;

        scsi_cmd[4] = args[2];
        if (args[0] == ATA_CMD_SMART) { /* hack -- ide driver does this too */
                scsi_cmd[6]  = args[3];
                scsi_cmd[8]  = args[1];
                scsi_cmd[10] = ATA_SMART_LBAM_PASS;
                scsi_cmd[12] = ATA_SMART_LBAH_PASS;
        } else {
                scsi_cmd[6]  = args[1];
        }
        scsi_cmd[14] = args[0];

        /* Good values for timeout and retries?  Values below
           from scsi_ioctl_send_command() for default case... */
        cmd_result = scsi_execute(scsidev, scsi_cmd, data_dir, argbuf, argsize,
                                  sensebuf, &sshdr, (10*HZ), 5, 0, 0, NULL);

        if (driver_byte(cmd_result) == DRIVER_SENSE) {/* sense data available */
                u8 *desc = sensebuf + 8;
                cmd_result &= ~(0xFF<<24); /* DRIVER_SENSE is not an error */

                /* If we set cc then ATA pass-through will cause a
                 * check condition even if no error. Filter that. */
                if (cmd_result & SAM_STAT_CHECK_CONDITION) {
                        if (sshdr.sense_key == RECOVERED_ERROR &&
                            sshdr.asc == 0 && sshdr.ascq == 0x1d)
                                cmd_result &= ~SAM_STAT_CHECK_CONDITION;
                }

                /* Send userspace a few ATA registers (same as drivers/ide) */
                if (sensebuf[0] == 0x72 &&        /* format is "descriptor" */
                    desc[0] == 0x09) {                /* code is "ATA Descriptor" */
                        args[0] = desc[13];        /* status */
                        args[1] = desc[3];        /* error */
                        args[2] = desc[5];        /* sector count (0:7) */
                        if (copy_to_user(arg, args, sizeof(args)))
                                rc = -EFAULT;
                }
        }


        /* A remaining non-zero result takes precedence over -EFAULT above. */
        if (cmd_result) {
                rc = -EIO;
                goto error;
        }

        /* Data, if any, is returned right after the 4-byte header. */
        if ((argbuf)
         && copy_to_user(arg + sizeof(args), argbuf, argsize))
                rc = -EFAULT;
error:
        kfree(argbuf);
        return rc;
}

/**
 *        ata_task_ioctl - Handler for HDIO_DRIVE_TASK ioctl
 *        @scsidev: Device to which we are issuing command
 *        @arg: User provided data for issuing command
 *
 *        @arg is a 7-byte register block.  args[0] goes into CDB byte 14
 *        and args[1] to args[5] into CDB bytes 4, 6, 8, 10 and 12;
 *        args[6], masked with 0x4f, goes into byte 13.  The command is
 *        issued as a non-data ATA_16 command.
 *
 *        When the command returns sense data in descriptor format with an
 *        ATA descriptor, all seven bytes are written back as status,
 *        error, sector count, lbal, lbam, lbah and select.
 *
 *        LOCKING:
 *        Defined by the SCSI layer.  We don't really care.
 *
 *        RETURNS:
 *        Zero on success, negative errno on error: -EINVAL for a NULL
 *        @arg, -EFAULT if a user copy fails, -EIO if the command result
 *        is still non-zero after filtering (this takes precedence over
 *        -EFAULT from the write-back).
 */
int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
{
        struct scsi_sense_hdr sshdr;
        u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
        u8 scsi_cmd[MAX_COMMAND_SIZE];
        u8 args[7];
        int cmd_result;
        int rc = 0;

        if (!arg)
                return -EINVAL;

        if (copy_from_user(args, arg, sizeof(args)))
                return -EFAULT;

        memset(sensebuf, 0, sizeof(sensebuf));
        memset(scsi_cmd, 0, sizeof(scsi_cmd));

        /* Fixed part of the CDB first, then the caller's registers. */
        scsi_cmd[0]  = ATA_16;
        scsi_cmd[1]  = (3 << 1); /* Non-data */
        scsi_cmd[2]  = 0x20;     /* cc but no off.line or data xfer */

        scsi_cmd[14] = args[0];
        scsi_cmd[4]  = args[1];
        scsi_cmd[6]  = args[2];
        scsi_cmd[8]  = args[3];
        scsi_cmd[10] = args[4];
        scsi_cmd[12] = args[5];
        scsi_cmd[13] = args[6] & 0x4f;

        /* Good values for timeout and retries?  Values below
           from scsi_ioctl_send_command() for default case... */
        cmd_result = scsi_execute(scsidev, scsi_cmd, DMA_NONE, NULL, 0,
                                  sensebuf, &sshdr, (10*HZ), 5, 0, 0, NULL);

        if (driver_byte(cmd_result) == DRIVER_SENSE) {
                /* Sense data available; descriptors start at byte 8. */
                const u8 *desc = &sensebuf[8];

                /* DRIVER_SENSE is not an error */
                cmd_result &= ~(0xFF<<24);

                /* If we set cc then ATA pass-through will cause a
                 * check condition even if no error. Filter that. */
                if ((cmd_result & SAM_STAT_CHECK_CONDITION) &&
                    sshdr.sense_key == RECOVERED_ERROR &&
                    sshdr.asc == 0 && sshdr.ascq == 0x1d)
                        cmd_result &= ~SAM_STAT_CHECK_CONDITION;

                /*
                 * Send userspace ATA registers, but only for
                 * descriptor-format sense (0x72) carrying an ATA
                 * descriptor (0x09).
                 */
                if (sensebuf[0] == 0x72 && desc[0] == 0x09) {
                        args[0] = desc[13];        /* status */
                        args[1] = desc[3];        /* error */
                        args[2] = desc[5];        /* sector count (0:7) */
                        args[3] = desc[7];        /* lbal */
                        args[4] = desc[9];        /* lbam */
                        args[5] = desc[11];        /* lbah */
                        args[6] = desc[12];        /* select */
                        if (copy_to_user(arg, args, sizeof(args)))
                                rc = -EFAULT;
                }
        }

        return cmd_result ? -EIO : rc;
}

/*
 * Helper for the HDIO_GET_32BIT / HDIO_SET_32BIT ioctls.
 *
 * Returns 1 when @ap has ATA_FLAG_PIO_DMA set in ->flags or
 * ATA_PFLAG_PIO32 set in ->pflags, and 0 otherwise.
 */
static int ata_ioc32(struct ata_port *ap)
{
        return ((ap->flags & ATA_FLAG_PIO_DMA) ||
                (ap->pflags & ATA_PFLAG_PIO32)) ? 1 : 0;
}

/*
 * ata_sas_scsi_ioctl - dispatch the HDIO ioctls handled by libata
 * @ap: port the ioctl is issued against
 * @scsidev: SCSI device the ioctl is issued against
 * @cmd: ioctl command number
 * @arg: ioctl argument; a user pointer for every command except
 *       HDIO_SET_32BIT, where the pointer value itself is the setting
 *
 * This handles both native and compat commands, so anything added
 * here must have a compatible argument, or check in_compat_syscall().
 *
 * Returns the result of the selected handler, -EACCES when the caller
 * lacks CAP_SYS_ADMIN or CAP_SYS_RAWIO for the raw command ioctls,
 * -EINVAL when HDIO_SET_32BIT asks for a setting that cannot be changed
 * and differs from the current one, and -ENOTTY for unknown commands.
 */
int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev,
                     unsigned int cmd, void __user *arg)
{
        unsigned long irq_flags;
        unsigned long pio32;
        int ret;

        switch (cmd) {
        case HDIO_GET_32BIT:
                /* sample the setting under the port lock */
                spin_lock_irqsave(ap->lock, irq_flags);
                pio32 = ata_ioc32(ap);
                spin_unlock_irqrestore(ap->lock, irq_flags);
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        return put_user(pio32, (compat_ulong_t __user *)arg);
#endif
                return put_user(pio32, (unsigned long __user *)arg);

        case HDIO_SET_32BIT:
                /* the argument is passed by value, not by reference */
                pio32 = (unsigned long) arg;
                ret = 0;

                spin_lock_irqsave(ap->lock, irq_flags);
                if (!(ap->pflags & ATA_PFLAG_PIO32CHANGE)) {
                        /* not changeable: only the current value is accepted */
                        if (pio32 != ata_ioc32(ap))
                                ret = -EINVAL;
                } else if (pio32) {
                        ap->pflags |= ATA_PFLAG_PIO32;
                } else {
                        ap->pflags &= ~ATA_PFLAG_PIO32;
                }
                spin_unlock_irqrestore(ap->lock, irq_flags);

                return ret;

        case HDIO_GET_IDENTITY:
                return ata_get_identity(ap, scsidev, arg);

        case HDIO_DRIVE_CMD:
                if (!(capable(CAP_SYS_ADMIN) && capable(CAP_SYS_RAWIO)))
                        return -EACCES;
                return ata_cmd_ioctl(scsidev, arg);

        case HDIO_DRIVE_TASK:
                if (!(capable(CAP_SYS_ADMIN) && capable(CAP_SYS_RAWIO)))
                        return -EACCES;
                return ata_task_ioctl(scsidev, arg);

        default:
                return -ENOTTY;
        }
}
EXPORT_SYMBOL_GPL(ata_sas_scsi_ioctl);

/*
 * ioctl entry point taking only the SCSI device: resolves the ATA port
 * behind @scsidev's host with ata_shost_to_port() and hands the request
 * to ata_sas_scsi_ioctl(), whose return value is passed back unchanged.
 */
int ata_scsi_ioctl(struct scsi_device *scsidev, unsigned int cmd,
                   void __user *arg)
{
        struct ata_port *port = ata_shost_to_port(scsidev->host);

        return ata_sas_scsi_ioctl(port, scsidev, cmd, arg);
}
EXPORT_SYMBOL_GPL(ata_scsi_ioctl);

/**
 *        ata_scsi_qc_new - acquire new ata_queued_cmd reference
 *        @dev: ATA device to which the new command is attached
 *        @cmd: SCSI command that originated this ATA command
 *
 *        Obtain a reference to an unused ata_queued_cmd structure,
 *        which is the basic libata structure representing a single
 *        ATA command sent to the hardware.
 *
 *        If a command was available, fill in the SCSI-specific
 *        portions of the structure with information on the
 *        current command.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Command allocated, or %NULL if none available.
 */
static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
                                              struct scsi_cmnd *cmd)
{
        struct ata_queued_cmd *qc;

        qc = ata_qc_new_init(dev, cmd->request->tag);
        if (qc) {
                qc->scsicmd = cmd;
                qc->scsidone = cmd->scsi_done;

                qc->sg = scsi_sglist(cmd);
                qc->n_elem = scsi_sg_count(cmd);

                if (cmd->request->rq_flags & RQF_QUIET)
                        qc->flags |= ATA_QCFLAG_QUIET;
        } else {
                cmd->result = (DID_OK << 16) | (QUEUE_FULL << 1);
                cmd->scsi_done(cmd);
        }

        return qc;
}

/*
 * Derive the byte counts of @qc from its SCSI command: ->extrabytes is
 * the command's extra_len, and ->nbytes is the command's buffer length
 * plus those extra bytes.
 */
static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *cmd = qc->scsicmd;

        qc->extrabytes = cmd->extra_len;
        qc->nbytes = qc->extrabytes + scsi_bufflen(cmd);
}

/**
 *        ata_dump_status - user friendly display of error info
 *        @id: id of the port in question
 *        @tf: ptr to filled out taskfile
 *
 *        Decode and dump the ATA error/status registers for the user so
 *        that they have some idea what really happened at the non
 *        make-believe layer.
 *
 *        LOCKING:
 *        inherited from caller
 */
static void ata_dump_status(unsigned id, struct ata_taskfile *tf)
{
        u8 stat = tf->command, err = tf->feature;

        pr_warn("ata%u: status=0x%02x { ", id, stat);
        if (stat & ATA_BUSY) {
                pr_cont("Busy }\n");        /* Data is not valid in this case */
        } else {
                if (stat & ATA_DRDY)        pr_cont("DriveReady ");
                if (stat & ATA_DF)        pr_cont("DeviceFault ");
                if (stat & ATA_DSC)        pr_cont("SeekComplete ");
                if (stat & ATA_DRQ)        pr_cont("DataRequest ");
                if (stat & ATA_CORR)        pr_cont("CorrectedError ");
                if (stat & ATA_SENSE)        pr_cont("Sense ");
                if (stat & ATA_ERR)        pr_cont("Error ");
                pr_cont("}\n");

                if (err) {
                        pr_warn("ata%u: error=0x%02x { ", id, err);
                        if (err & ATA_ABORTED)        pr_cont("DriveStatusError ");
                        if (err & ATA_ICRC) {
                                if (err & ATA_ABORTED)
                                                pr_cont("BadCRC ");
                                else                pr_cont("Sector ");
                        }
                        if (err & ATA_UNC)        pr_cont("UncorrectableError ");
                        if (err & ATA_IDNF)        pr_cont("SectorIdNotFound ");
                        if (err & ATA_TRK0NF)        pr_cont("TrackZeroNotFound ");
                        if (err & ATA_AMNF)        pr_cont("AddrMarkNotFound ");
                        pr_cont("}\n");
                }
        }
}

/**
 *        ata_to_sense_error - convert ATA error to SCSI error
 *        @id: ATA device number
 *        @drv_stat: value contained in ATA status register
 *        @drv_err: value contained in ATA error register
 *        @sk: the sense key we'll fill out
 *        @asc: the additional sense code we'll fill out
 *        @ascq: the additional sense code qualifier we'll fill out
 *        @verbose: be verbose
 *
 *        Converts an ATA error into a SCSI error.  The error register
 *        is looked up first (unless the status has ATA_BUSY set, in
 *        which case it is ignored); if no row matches, the status
 *        register is looked up; if that fails too, ABORTED COMMAND
 *        with zero ASC/ASCQ is reported.  The resulting SK, ASC and
 *        ASCQ are stored through the pointers for later use in fixed
 *        or descriptor format sense blocks.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk,
                               u8 *asc, u8 *ascq, int verbose)
{
        /*
         * Error register translation, based on the 3ware driver
         * translation table.  Row layout: { error bits, SK, ASC, ASCQ }.
         * Rows are tried in order, so the multi-bit patterns come first.
         */
        static const unsigned char sense_table[][4] = {
                /* BBD|ECC|ID|MAR: device busy -> aborted command */
                {0xd1,                ABORTED_COMMAND, 0x00, 0x00},
                /* BBD|ECC|ID: device busy -> aborted command */
                {0xd0,                ABORTED_COMMAND, 0x00, 0x00},
                /* ECC|MC|MARK: device fault -> hardware error */
                {0x61,                HARDWARE_ERROR, 0x00, 0x00},
                /*
                 * ICRC|ABRT: data CRC error -> SCSI parity error
                 * NB: ICRC & !ABRT is BBD
                 */
                {0x84,                ABORTED_COMMAND, 0x47, 0x00},
                /* MC|ID|ABRT|TRK0|MARK: unit offline -> not ready */
                {0x37,                NOT_READY, 0x04, 0x00},
                /* MCR|MARK: unrecovered disk error -> not ready */
                {0x09,                NOT_READY, 0x04, 0x00},
                /* Bad address mark: address mark not found for data field */
                {0x01,                MEDIUM_ERROR, 0x13, 0x00},
                /* TRK0 - Track 0 not found: hardware error */
                {0x02,                HARDWARE_ERROR, 0x00, 0x00},
                /* Abort: 0x04 is not translated here, see below */
                /* Media change request (FIXME: faking offline) */
                {0x08,                NOT_READY, 0x04, 0x00},
                /* SRV/IDNF - ID not found: logical address out of range */
                {0x10,                ILLEGAL_REQUEST, 0x21, 0x00},
                /*
                 * MC - Media Changed: not ready to ready change, medium
                 * may have changed
                 */
                {0x20,                UNIT_ATTENTION, 0x28, 0x00},
                /* ECC - Uncorrectable ECC error: unrecovered read error */
                {0x40,                MEDIUM_ERROR, 0x11, 0x04},
                /*
                 * BBD - block marked bad: medium error, unrecovered
                 * read error
                 */
                {0x80,                MEDIUM_ERROR, 0x11, 0x04},
                {0xFF, 0xFF, 0xFF, 0xFF}, /* END mark */
        };
        /* Status register translation: { status bit, SK, ASC, ASCQ } */
        static const unsigned char stat_table[][4] = {
                /* Busy: must be first because BUSY means no other bits valid */
                { ATA_BUSY,        ABORTED_COMMAND, 0x00, 0x00 },
                /* Device fault: INTERNAL TARGET FAILURE */
                { ATA_DF,        HARDWARE_ERROR,  0x44, 0x00 },
                /* Corrected data error */
                { ATA_CORR,        RECOVERED_ERROR, 0x00, 0x00 },

                { 0xFF, 0xFF, 0xFF, 0xFF }, /* END mark */
        };
        const unsigned char *row = NULL;
        int i;

        /* With BUSY set the error bits are invalid, so ignore them */
        if (drv_stat & ATA_BUSY)
                drv_err = 0;

        /* A row matches when every bit of its pattern is set in drv_err */
        if (drv_err) {
                for (i = 0; sense_table[i][0] != 0xFF; i++) {
                        if ((sense_table[i][0] & drv_err) ==
                            sense_table[i][0]) {
                                row = sense_table[i];
                                break;
                        }
                }
        }

        /*
         * Fall back to interpreting status bits.  Note that if the drv_err
         * has only the ABRT bit set, we decode drv_stat.  ABRT by itself
         * is not descriptive enough.
         */
        if (!row) {
                for (i = 0; stat_table[i][0] != 0xFF; i++) {
                        if (stat_table[i][0] & drv_stat) {
                                row = stat_table[i];
                                break;
                        }
                }
        }

        if (row) {
                *sk = row[1];
                *asc = row[2];
                *ascq = row[3];
        } else {
                /*
                 * We need a sensible error return here, which is tricky,
                 * and one that won't cause people to do things like return
                 * a disk wrongly.
                 */
                *sk = ABORTED_COMMAND;
                *asc = 0x00;
                *ascq = 0x00;
        }

        if (verbose)
                pr_err("ata%u: translated ATA stat/err 0x%02x/%02x to SCSI SK/ASC/ASCQ 0x%x/%02x/%02x\n",
                       id, drv_stat, drv_err, *sk, *asc, *ascq);
}

/**
 *        ata_gen_passthru_sense - Generate check condition sense block.
 *        @qc: Command that completed.
 *
 *        This function is specific to the sense block specified for
 *        the ATA pass through commands.  Regardless of whether the
 *        command errored or not, return a sense block.  Copy all
 *        controller registers into the sense block.  If there was no
 *        error, we get the request from an ATA passthrough command,
 *        so we use the following sense data:
 *        sk = RECOVERED ERROR
 *        asc,ascq = ATA PASS-THROUGH INFORMATION AVAILABLE
 *
 *        With descriptor format sense data the registers go into an
 *        ATA Status Return descriptor (code 0x09, 14 bytes).  With
 *        fixed format sense data they go into the INFORMATION field
 *        (bytes 3-6, flagged by the VALID bit in byte 0) and the
 *        COMMAND-SPECIFIC INFORMATION field (bytes 8-11).
 *
 *        LOCKING:
 *        None.
 */
static void ata_gen_passthru_sense(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *cmd = qc->scsicmd;
        struct ata_taskfile *tf = &qc->result_tf;
        unsigned char *sb = cmd->sense_buffer;
        unsigned char *desc;
        int verbose = qc->ap->ops->error_handler == NULL;
        u8 sense_key, asc, ascq;

        memset(sb, 0, SCSI_SENSE_BUFFERSIZE);

        cmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;

        /*
         * Use ata_to_sense_error() to map status register bits
         * onto sense key, asc & ascq.
         */
        if (qc->err_mask ||
            tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
                ata_to_sense_error(qc->ap->print_id, tf->command, tf->feature,
                                   &sense_key, &asc, &ascq, verbose);
                ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq);
        } else {
                /*
                 * ATA PASS-THROUGH INFORMATION AVAILABLE
                 * Always in descriptor format sense.
                 */
                scsi_build_sense_buffer(1, cmd->sense_buffer,
                                        RECOVERED_ERROR, 0, 0x1D);
        }

        if ((sb[0] & 0x7f) >= 0x72) {
                u8 len;

                /* descriptor format */
                len = sb[7];
                desc = (unsigned char *)scsi_sense_desc_find(sb, len + 8, 9);
                if (!desc) {
                        /* append a new descriptor if it still fits */
                        if (SCSI_SENSE_BUFFERSIZE < len + 14)
                                return;
                        sb[7] = len + 14;
                        desc = sb + 8 + len;
                }
                desc[0] = 9;
                desc[1] = 12;
                /*
                 * Copy registers into sense buffer.
                 */
                desc[2] = 0x00;
                desc[3] = tf->feature;        /* == error reg */
                desc[5] = tf->nsect;
                desc[7] = tf->lbal;
                desc[9] = tf->lbam;
                desc[11] = tf->lbah;
                desc[12] = tf->device;
                desc[13] = tf->command; /* == status reg */

                /*
                 * Fill in Extend bit, and the high order bytes
                 * if applicable.
                 */
                if (tf->flags & ATA_TFLAG_LBA48) {
                        desc[2] |= 0x01;
                        desc[4] = tf->hob_nsect;
                        desc[6] = tf->hob_lbal;
                        desc[8] = tf->hob_lbam;
                        desc[10] = tf->hob_lbah;
                }
        } else {
                /*
                 * Fixed sense format.  The registers are addressed from
                 * the start of the sense buffer: the INFORMATION field
                 * occupies bytes 3-6 and the COMMAND-SPECIFIC INFORMATION
                 * field bytes 8-11.  Writing them relative to byte 8, as
                 * this function used to, put the INFORMATION data into the
                 * COMMAND-SPECIFIC INFORMATION field and the extend/LBA
                 * bytes into bytes 16-19, outside both fields.
                 */
                sb[0] |= 0x80;                /* VALID: INFORMATION is filled in */
                sb[3] = tf->feature;        /* == error reg */
                sb[4] = tf->command;        /* == status reg */
                sb[5] = tf->device;
                sb[6] = tf->nsect;
                if (tf->flags & ATA_TFLAG_LBA48)  {
                        sb[8] |= 0x80;        /* extend */
                        if (tf->hob_nsect)
                                sb[8] |= 0x40;        /* upper count non-zero */
                        if (tf->hob_lbal || tf->hob_lbam || tf->hob_lbah)
                                sb[8] |= 0x20;        /* upper LBA non-zero */
                }
                sb[9] = tf->lbal;
                sb[10] = tf->lbam;
                sb[11] = tf->lbah;
        }
}

/**
 *        ata_gen_ata_sense - generate a SCSI sense block
 *        @qc: Command that we are erroring out
 *
 *        Generate sense block for a failed ATA command @qc.  A
 *        disabled device and a security-locked device get dedicated
 *        sense codes.  Otherwise the status/error registers of the
 *        result taskfile are translated with ata_to_sense_error() and,
 *        if ata_tf_read_block() yields a block address, it is stored
 *        as the sense information.
 *
 *        LOCKING:
 *        None.
 */
static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_device *dev = qc->dev;
        struct ata_taskfile *rtf = &qc->result_tf;
        unsigned char *sense = scmd->sense_buffer;
        int verbose = qc->ap->ops->error_handler == NULL;
        u8 sk, asc, ascq;
        u64 lba;

        memset(sense, 0, SCSI_SENSE_BUFFERSIZE);

        scmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;

        /* Device disabled after error recovery */
        if (ata_dev_disabled(dev)) {
                /* LOGICAL UNIT NOT READY, HARD RESET REQUIRED */
                ata_scsi_set_sense(dev, scmd, NOT_READY, 0x04, 0x21);
                return;
        }

        /* Security locked */
        if (ata_id_is_locked(dev->id)) {
                /* LOGICAL UNIT ACCESS NOT AUTHORIZED */
                ata_scsi_set_sense(dev, scmd, DATA_PROTECT, 0x74, 0x71);
                return;
        }

        /* Neither an error mask nor an error-ish status bit: cannot decode */
        if (!qc->err_mask &&
            !(rtf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ))) {
                ata_dev_warn(dev, "could not decode error status 0x%x err_mask 0x%x\n",
                             rtf->command, qc->err_mask);
                ata_scsi_set_sense(dev, scmd, ABORTED_COMMAND, 0, 0);
                return;
        }

        /* Map the status register bits onto sense key, asc & ascq */
        ata_to_sense_error(qc->ap->print_id, rtf->command, rtf->feature,
                           &sk, &asc, &ascq, verbose);
        ata_scsi_set_sense(dev, scmd, sk, asc, ascq);

        /* U64_MAX from ata_tf_read_block() means there is no block to report */
        lba = ata_tf_read_block(rtf, dev);
        if (lba != U64_MAX)
                scsi_set_sense_information(sense, SCSI_SENSE_BUFFERSIZE, lba);
}

/*
 * Apply the libata defaults to a SCSI device: prefer the 10-byte forms
 * for read/write and mode sense, disable WRITE SAME, and cap the
 * midlayer's device blocking.
 */
void ata_scsi_sdev_config(struct scsi_device *sdev)
{
        /*
         * Schedule policy is determined by ->qc_defer() callback and
         * it needs to see every deferred qc.  Set max_device_blocked
         * to 1 to prevent SCSI midlayer from automatically deferring
         * requests.
         */
        sdev->max_device_blocked = 1;

        sdev->no_write_same = 1;
        sdev->use_10_for_ms = 1;
        sdev->use_10_for_rw = 1;
}

/**
 *        ata_scsi_dma_need_drain - Check whether data transfer may overflow
 *        @rq: request to be checked
 *
 *        ATAPI commands which transfer variable length data to host
 *        might overflow due to application error or hardware bug.  This
 *        function checks whether overflow should be drained and ignored
 *        for @request.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        1 if ; otherwise, 0.
 */
bool ata_scsi_dma_need_drain(struct request *rq)
{
        return atapi_cmd_type(scsi_req(rq)->cmd[0]) == ATAPI_MISC;
}
EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain);

/*
 * ata_scsi_dev_config - configure a SCSI device from its ATA device
 * @sdev: SCSI device being configured
 * @dev: ATA device backing @sdev
 *
 * Sets up the request queue limits of @sdev (max sectors, DMA alignment
 * and, for ATAPI, DMA padding and a drain buffer), the sector size, the
 * queue depth for NCQ devices and a few capability flags, then links
 * @dev to @sdev through dev->sdev.
 *
 * Returns 0 on success or -ENOMEM when the ATAPI drain buffer cannot be
 * allocated; in that case dev->sdev is left untouched.
 */
int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
{
        struct request_queue *q = sdev->request_queue;

        /* remember when the IDENTIFY data does not advertise unload */
        if (!ata_id_has_unload(dev->id))
                dev->flags |= ATA_DFLAG_NO_UNLOAD;

        /* configure max sectors */
        blk_queue_max_hw_sectors(q, dev->max_sectors);

        if (dev->class == ATA_DEV_ATAPI) {
                sdev->sector_size = ATA_SECT_SIZE;

                /* set DMA padding */
                blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1);

                /* make room for appending the drain */
                blk_queue_max_segments(q, queue_max_segments(q) - 1);

                /* the drain buffer is released by ata_scsi_slave_destroy() */
                sdev->dma_drain_len = ATAPI_MAX_DRAIN;
                sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len,
                                q->bounce_gfp | GFP_KERNEL);
                if (!sdev->dma_drain_buf) {
                        ata_dev_err(dev, "drain buffer allocation failed\n");
                        return -ENOMEM;
                }
        } else {
                sdev->sector_size = ata_id_logical_sector_size(dev->id);
                sdev->manage_start_stop = 1;
        }

        /*
         * ata_pio_sectors() expects buffer for each sector to not cross
         * page boundary.  Enforce it by requiring buffers to be sector
         * aligned, which works iff sector_size is not larger than
         * PAGE_SIZE.  ATAPI devices also need the alignment as
         * IDENTIFY_PACKET is executed as ATA_PROT_PIO.
         */
        if (sdev->sector_size > PAGE_SIZE)
                ata_dev_warn(dev,
                        "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
                        sdev->sector_size);

        blk_queue_update_dma_alignment(q, sdev->sector_size - 1);

        /* ATA_DFLAG_AN set: advertise media change events on the sdev */
        if (dev->flags & ATA_DFLAG_AN)
                set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);

        if (dev->flags & ATA_DFLAG_NCQ) {
                int depth;

                /* smallest of host limit, device limit and ATA_MAX_QUEUE */
                depth = min(sdev->host->can_queue, ata_id_queue_depth(dev->id));
                depth = min(ATA_MAX_QUEUE, depth);
                scsi_change_queue_depth(sdev, depth);
        }

        if (dev->flags & ATA_DFLAG_TRUSTED)
                sdev->security_supported = 1;

        /* link the ATA device to its SCSI device only once all went well */
        dev->sdev = sdev;
        return 0;
}

/**
 *        ata_scsi_slave_config - Set SCSI device attributes
 *        @sdev: SCSI device to examine
 *
 *        This is called before we actually start reading
 *        and writing to the device, to configure certain
 *        SCSI mid-layer behaviors.  The generic settings of
 *        ata_scsi_sdev_config() are always applied; the per-device
 *        settings of ata_scsi_dev_config() only when an ATA device
 *        is found for @sdev.
 *
 *        LOCKING:
 *        Defined by SCSI layer.  We don't really care.
 *
 *        RETURNS:
 *        0 when no ATA device is found, otherwise the result of
 *        ata_scsi_dev_config().
 */
int ata_scsi_slave_config(struct scsi_device *sdev)
{
        struct ata_port *ap = ata_shost_to_port(sdev->host);
        struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);

        ata_scsi_sdev_config(sdev);

        if (!dev)
                return 0;

        return ata_scsi_dev_config(sdev, dev);
}
EXPORT_SYMBOL_GPL(ata_scsi_slave_config);

/**
 *        ata_scsi_slave_destroy - SCSI device is about to be destroyed
 *        @sdev: SCSI device to be destroyed
 *
 *        @sdev is about to be destroyed for hot/warm unplugging.  If
 *        this unplugging was initiated by libata as indicated by NULL
 *        dev->sdev, no detach has to be scheduled.  Otherwise, SCSI
 *        layer initiated warm-unplug is in progress: clear dev->sdev,
 *        schedule the device for ATA detach and invoke EH.  The detach
 *        handling is skipped for ports without ->error_handler.
 *
 *        The DMA drain buffer allocated by ata_scsi_dev_config() is
 *        released in every case, including ports without
 *        ->error_handler.
 *
 *        LOCKING:
 *        Defined by SCSI layer.  We don't really care.
 */
void ata_scsi_slave_destroy(struct scsi_device *sdev)
{
        struct ata_port *ap = ata_shost_to_port(sdev->host);
        unsigned long flags;
        struct ata_device *dev;

        if (ap->ops->error_handler) {
                spin_lock_irqsave(ap->lock, flags);
                dev = __ata_scsi_find_dev(ap, sdev);
                if (dev && dev->sdev) {
                        /*
                         * SCSI device already in CANCEL state, no need
                         * to offline it
                         */
                        dev->sdev = NULL;
                        dev->flags |= ATA_DFLAG_DETACH;
                        ata_port_schedule_eh(ap);
                }
                spin_unlock_irqrestore(ap->lock, flags);
        }

        /*
         * The drain buffer is allocated regardless of the EH model, so
         * it must not be skipped by the ->error_handler check above.
         */
        kfree(sdev->dma_drain_buf);
        sdev->dma_drain_buf = NULL;
}
EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy);

/**
 *        ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
 *        @qc: Storage for translated ATA taskfile
 *
 *        Sets up an ATA taskfile to issue STANDBY (to stop) or READ VERIFY
 *        (to start). Perhaps these commands should be preceded by
 *        CHECK POWER MODE to see what power mode the device is already in.
 *        [See SAT revision 5 at www.t10.org]
 *
 *        A CDB shorter than 5 bytes, a set LOEJ bit or a non-zero power
 *        condition field is rejected as an invalid field in byte 4.
 *        A stop request is completed with GOOD status without touching
 *        the device when the port asks for no spindown on power off or
 *        on hibernation and the system is in that transition.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, non-zero on error.
 */
static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_taskfile *tf = &qc->tf;
        const u8 *cdb = scmd->cmnd;
        u16 bad_byte = 4;        /* every rejection below points at byte 4 */
        u8 bad_bit = 0xff;

        if (scmd->cmd_len < 5)
                goto invalid_fld;

        tf->flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
        tf->protocol = ATA_PROT_NODATA;

        /* The IMMED bit (cdb[1] bit 0) is ignored, violates sat-r05 */

        if (cdb[4] & 0x2) {
                /* LOEJ bit set not supported */
                bad_bit = 1;
                goto invalid_fld;
        }
        if ((cdb[4] >> 4) & 0xf) {
                /* power conditions not supported */
                bad_bit = 3;
                goto invalid_fld;
        }

        if (!(cdb[4] & 0x1)) {
                /* Some odd clown BIOSen issue spindown on power off (ACPI S4
                 * or S5) causing some drives to spin up and down again.
                 */
                if ((qc->ap->flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) &&
                    system_state == SYSTEM_POWER_OFF)
                        goto skip;

                if ((qc->ap->flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) &&
                     system_entering_hibernation())
                        goto skip;

                /* Issue ATA STANDBY IMMEDIATE command */
                tf->command = ATA_CMD_STANDBYNOW1;

                /*
                 * Standby and Idle condition timers could be implemented
                 * but that would require libata to implement the Power
                 * condition mode page and allow the user to change it.
                 * Changing mode pages requires MODE SELECT to be
                 * implemented.
                 */
                return 0;
        }

        /* START: READ VERIFY of 1 sector, lba=0 */
        tf->nsect = 1;

        if (qc->dev->flags & ATA_DFLAG_LBA) {
                tf->flags |= ATA_TFLAG_LBA;

                tf->lbal = 0x0;
                tf->lbam = 0x0;
                tf->lbah = 0x0;
                tf->device |= ATA_LBA;
        } else {
                /* CHS */
                tf->lbal = 0x1; /* sect */
                tf->lbam = 0x0; /* cyl low */
                tf->lbah = 0x0; /* cyl high */
        }

        tf->command = ATA_CMD_VERIFY;        /* READ VERIFY */

        return 0;

 invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, bad_byte, bad_bit);
        return 1;
 skip:
        scmd->result = SAM_STAT_GOOD;
        return 1;
}


/**
 *        ata_scsi_flush_xlat - Translate SCSI SYNCHRONIZE CACHE command
 *        @qc: Storage for translated ATA taskfile
 *
 *        Sets up an ATA taskfile to issue FLUSH CACHE EXT when the
 *        device has ATA_DFLAG_FLUSH_EXT set, or FLUSH CACHE otherwise,
 *        as a non-data command.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, non-zero on error.
 */
static unsigned int ata_scsi_flush_xlat(struct ata_queued_cmd *qc)
{
        struct ata_taskfile *tf = &qc->tf;

        /* flush is critical for IO integrity, consider it an IO command */
        qc->flags |= ATA_QCFLAG_IO;

        tf->protocol = ATA_PROT_NODATA;
        tf->flags |= ATA_TFLAG_DEVICE;
        tf->command = (qc->dev->flags & ATA_DFLAG_FLUSH_EXT) ?
                        ATA_CMD_FLUSH_EXT : ATA_CMD_FLUSH;

        return 0;
}

/**
 *        scsi_6_lba_len - Get LBA and transfer length
 *        @cdb: SCSI command to translate
 *        @plba: where to store the LBA
 *        @plen: where to store the transfer length
 *
 *        Calculate LBA and transfer length for 6-byte commands.  The
 *        LBA is built from the low five bits of byte 1 followed by
 *        bytes 2 and 3; the length is byte 4, stored as is.
 *
 *        RETURNS:
 *        @plba: the LBA
 *        @plen: the transfer length
 */
static void scsi_6_lba_len(const u8 *cdb, u64 *plba, u32 *plen)
{
        const u64 lba = ((u64)(cdb[1] & 0x1f) << 16) |
                        ((u64)cdb[2] << 8) |
                        (u64)cdb[3];
        const u32 len = cdb[4];

        *plba = lba;
        *plen = len;
}

/**
 *        scsi_10_lba_len - Get LBA and transfer length
 *        @cdb: SCSI command to translate
 *        @plba: where to store the LBA
 *        @plen: where to store the transfer length
 *
 *        Calculate LBA and transfer length for 10-byte commands.  The
 *        LBA is the big-endian value of bytes 2-5 and the length the
 *        big-endian value of bytes 7-8.
 *
 *        RETURNS:
 *        @plba: the LBA
 *        @plen: the transfer length
 */
static void scsi_10_lba_len(const u8 *cdb, u64 *plba, u32 *plen)
{
        const u64 lba = ((u64)cdb[2] << 24) |
                        ((u64)cdb[3] << 16) |
                        ((u64)cdb[4] << 8) |
                        (u64)cdb[5];
        const u32 len = ((u32)cdb[7] << 8) | (u32)cdb[8];

        *plba = lba;
        *plen = len;
}

/**
 *        scsi_16_lba_len - Get LBA and transfer length
 *        @cdb: SCSI command to translate
 *        @plba: where to store the LBA
 *        @plen: where to store the transfer length
 *
 *        Calculate LBA and transfer length for 16-byte commands.  The
 *        LBA is the big-endian value of bytes 2-9 and the length the
 *        big-endian value of bytes 10-13.
 *
 *        RETURNS:
 *        @plba: the LBA
 *        @plen: the transfer length
 */
static void scsi_16_lba_len(const u8 *cdb, u64 *plba, u32 *plen)
{
        u64 lba = 0;
        u32 len = 0;
        int i;

        /* fold the bytes in most significant first */
        for (i = 2; i <= 9; i++)
                lba = (lba << 8) | cdb[i];

        for (i = 10; i <= 13; i++)
                len = (len << 8) | cdb[i];

        *plba = lba;
        *plen = len;
}

/**
 *        ata_scsi_verify_xlat - Translate SCSI VERIFY command into an ATA one
 *        @qc: Storage for translated ATA taskfile
 *
 *        Converts SCSI VERIFY (10-byte) or VERIFY_16 command to an ATA
 *        READ VERIFY command.  The taskfile is addressed in LBA28 or
 *        LBA48 mode when the device has ATA_DFLAG_LBA set, and in CHS
 *        mode otherwise.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero when the taskfile has been set up.  Non-zero otherwise:
 *        with invalid-field sense data for a short or unexpected CDB,
 *        with "LBA out of range" sense data when the request does not
 *        fit the device or the addressing mode, or with the result set
 *        to SAM_STAT_GOOD when the block count is zero.
 */
static unsigned int ata_scsi_verify_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_taskfile *tf = &qc->tf;
        struct ata_device *dev = qc->dev;
        u64 dev_sectors = qc->dev->n_sectors;
        const u8 *cdb = scmd->cmnd;
        u64 block;
        u32 n_block;
        u16 fp;

        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        tf->protocol = ATA_PROT_NODATA;

        /* fp is the CDB byte reported in the sense data on rejection */
        if (cdb[0] == VERIFY) {
                if (scmd->cmd_len < 10) {
                        fp = 9;
                        goto invalid_fld;
                }
                scsi_10_lba_len(cdb, &block, &n_block);
        } else if (cdb[0] == VERIFY_16) {
                if (scmd->cmd_len < 16) {
                        fp = 15;
                        goto invalid_fld;
                }
                scsi_16_lba_len(cdb, &block, &n_block);
        } else {
                fp = 0;
                goto invalid_fld;
        }

        /* a zero-length verify completes without issuing anything */
        if (!n_block)
                goto nothing_to_do;
        /* both the first block and the end of the range must be on the device */
        if (block >= dev_sectors)
                goto out_of_range;
        if ((block + n_block) > dev_sectors)
                goto out_of_range;

        if (dev->flags & ATA_DFLAG_LBA) {
                tf->flags |= ATA_TFLAG_LBA;

                if (lba_28_ok(block, n_block)) {
                        /* use LBA28 */
                        tf->command = ATA_CMD_VERIFY;
                        tf->device |= (block >> 24) & 0xf;
                } else if (lba_48_ok(block, n_block)) {
                        if (!(dev->flags & ATA_DFLAG_LBA48))
                                goto out_of_range;

                        /* use LBA48 */
                        tf->flags |= ATA_TFLAG_LBA48;
                        tf->command = ATA_CMD_VERIFY_EXT;

                        tf->hob_nsect = (n_block >> 8) & 0xff;

                        tf->hob_lbah = (block >> 40) & 0xff;
                        tf->hob_lbam = (block >> 32) & 0xff;
                        tf->hob_lbal = (block >> 24) & 0xff;
                } else
                        /* request too large even for LBA48 */
                        goto out_of_range;

                /* low-order registers are filled the same way for LBA28 and LBA48 */
                tf->nsect = n_block & 0xff;

                tf->lbah = (block >> 16) & 0xff;
                tf->lbam = (block >> 8) & 0xff;
                tf->lbal = block & 0xff;

                tf->device |= ATA_LBA;
        } else {
                /* CHS */
                u32 sect, head, cyl, track;

                if (!lba_28_ok(block, n_block))
                        goto out_of_range;

                /* Convert LBA to CHS */
                track = (u32)block / dev->sectors;
                cyl   = track / dev->heads;
                head  = track % dev->heads;
                sect  = (u32)block % dev->sectors + 1;

                DPRINTK("block %u track %u cyl %u head %u sect %u\n",
                        (u32)block, track, cyl, head, sect);

                /* Check whether the converted CHS can fit.
                   Cylinder: 0-65535
                   Head: 0-15
                   Sector: 1-255*/
                if ((cyl >> 16) || (head >> 4) || (sect >> 8) || (!sect))
                        goto out_of_range;

                tf->command = ATA_CMD_VERIFY;
                tf->nsect = n_block & 0xff; /* Sector count 0 means 256 sectors */
                tf->lbal = sect;
                tf->lbam = cyl;
                tf->lbah = cyl >> 8;
                tf->device |= head;
        }

        return 0;

invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
        return 1;

out_of_range:
        ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0);
        /* "Logical Block Address out of range" */
        return 1;

nothing_to_do:
        scmd->result = SAM_STAT_GOOD;
        return 1;
}

/*
 * Check a block count against the size of the request behind @scmd.
 *
 * Requests that are not passthrough are accepted without inspection.  For
 * passthrough requests, @n_blocks must not exceed the number of whole
 * sectors (of the SCSI device's sector size) in the request's byte count.
 *
 * Returns true when @n_blocks is acceptable, false otherwise.
 */
static bool ata_check_nblocks(struct scsi_cmnd *scmd, u32 n_blocks)
{
        struct request *req = scmd->request;
        u32 avail_blocks;

        if (blk_rq_is_passthrough(req)) {
                avail_blocks = blk_rq_bytes(req) / scmd->device->sector_size;
                return n_blocks <= avail_blocks;
        }

        return true;
}

/**
 *        ata_scsi_rw_xlat - Translate SCSI r/w command into an ATA one
 *        @qc: Storage for translated ATA taskfile
 *
 *        Converts any of six SCSI read/write commands into the
 *        ATA counterpart, including starting sector (LBA),
 *        sector count, and taking into account the device's LBA48
 *        support.
 *
 *        Commands %READ_6, %READ_10, %READ_16, %WRITE_6, %WRITE_10, and
 *        %WRITE_16 are currently supported.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, non-zero on error.
 */
static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        const u8 *cdb = scmd->cmnd;
        struct request *rq = scmd->request;
        int class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
        unsigned int tf_flags = 0;
        u64 block;
        u32 n_block;
        int rc;
        u16 fp = 0;

        if (cdb[0] == WRITE_10 || cdb[0] == WRITE_6 || cdb[0] == WRITE_16)
                tf_flags |= ATA_TFLAG_WRITE;

        /* Calculate the SCSI LBA, transfer length and FUA. */
        switch (cdb[0]) {
        case READ_10:
        case WRITE_10:
                if (unlikely(scmd->cmd_len < 10)) {
                        fp = 9;
                        goto invalid_fld;
                }
                scsi_10_lba_len(cdb, &block, &n_block);
                if (cdb[1] & (1 << 3))
                        tf_flags |= ATA_TFLAG_FUA;
                if (!ata_check_nblocks(scmd, n_block))
                        goto invalid_fld;
                break;
        case READ_6:
        case WRITE_6:
                if (unlikely(scmd->cmd_len < 6)) {
                        fp = 5;
                        goto invalid_fld;
                }
                scsi_6_lba_len(cdb, &block, &n_block);

                /* for 6-byte r/w commands, transfer length 0
                 * means 256 blocks of data, not 0 block.
                 */
                if (!n_block)
                        n_block = 256;
                if (!ata_check_nblocks(scmd, n_block))
                        goto invalid_fld;
                break;
        case READ_16:
        case WRITE_16:
                if (unlikely(scmd->cmd_len < 16)) {
                        fp = 15;
                        goto invalid_fld;
                }
                scsi_16_lba_len(cdb, &block, &n_block);
                if (cdb[1] & (1 << 3))
                        tf_flags |= ATA_TFLAG_FUA;
                if (!ata_check_nblocks(scmd, n_block))
                        goto invalid_fld;
                break;
        default:
                DPRINTK("no-byte command\n");
                fp = 0;
                goto invalid_fld;
        }

        /* Check and compose ATA command */
        if (!n_block)
                /* For 10-byte and 16-byte SCSI R/W commands, transfer
                 * length 0 means transfer 0 block of data.
                 * However, for ATA R/W commands, sector count 0 means
                 * 256 or 65536 sectors, not 0 sectors as in SCSI.
                 *
                 * WARNING: one or two older ATA drives treat 0 as 0...
                 */
                goto nothing_to_do;

        qc->flags |= ATA_QCFLAG_IO;
        qc->nbytes = n_block * scmd->device->sector_size;

        rc = ata_build_rw_tf(&qc->tf, qc->dev, block, n_block, tf_flags,
                             qc->hw_tag, class);

        if (likely(rc == 0))
                return 0;

        if (rc == -ERANGE)
                goto out_of_range;
        /* treat all other errors as -EINVAL, fall through */
invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
        return 1;

out_of_range:
        ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x0);
        /* "Logical Block Address out of range" */
        return 1;

nothing_to_do:
        scmd->result = SAM_STAT_GOOD;
        return 1;
}

/*
 * Free @qc and then call its SCSI completion callback.
 *
 * The SCSI command pointer and the callback are both copied out of @qc
 * before ata_qc_free() is called, so @qc is not touched afterwards.
 */
static void ata_qc_done(struct ata_queued_cmd *qc)
{
        void (*complete)(struct scsi_cmnd *);
        struct scsi_cmnd *scmd;

        scmd = qc->scsicmd;
        complete = qc->scsidone;

        ata_qc_free(qc);
        complete(scmd);
}

/*
 * Completion callback for translated SCSI commands: derive the SCSI result
 * (and sense data where needed) from the finished @qc, then complete it
 * through ata_qc_done().
 */
static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_port *port = qc->ap;
        const u8 *cdb = scmd->cmnd;
        const bool failed = (qc->err_mask != 0);
        const bool passthru = (cdb[0] == ATA_16) || (cdb[0] == ATA_12);

        /*
         * ATA pass-through (SAT) commands get a sense block either because
         * the user asked for one (CK_COND = 1, bit 5 of CDB byte 2) or
         * because the command failed.  With CK_COND set, a check condition
         * is generated and the ATA register values are returned whether or
         * not the command succeeded.  Without an error the sense data is:
         * sk = RECOVERED ERROR
         * asc,ascq = ATA PASS-THROUGH INFORMATION AVAILABLE
         */
        if (passthru && ((cdb[2] & 0x20) || failed)) {
                ata_gen_passthru_sense(qc);
        } else if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
                scmd->result = SAM_STAT_CHECK_CONDITION;
        } else if (failed) {
                ata_gen_ata_sense(qc);
        } else {
                scmd->result = SAM_STAT_GOOD;
        }

        /* Dump the result taskfile only for ports without an error handler. */
        if (failed && !port->ops->error_handler)
                ata_dump_status(port->print_id, &qc->result_tf);

        ata_qc_done(qc);
}

/*
 * Issue @qc on @ap unless the port's optional ->qc_defer() hook asks for
 * the command to be deferred.
 *
 * Returns 0 after the command has been handed to ata_qc_issue().  When the
 * hook requests deferral, @qc is freed and SCSI_MLQUEUE_DEVICE_BUSY (for
 * ATA_DEFER_LINK) or SCSI_MLQUEUE_HOST_BUSY (for ATA_DEFER_PORT and, with a
 * one-time warning, any unexpected value) is returned instead.
 */
static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc)
{
        if (ap->ops->qc_defer) {
                int busy;

                /* Check if the command needs to be deferred. */
                switch (ap->ops->qc_defer(qc)) {
                case 0:
                        busy = 0;
                        break;
                case ATA_DEFER_LINK:
                        busy = SCSI_MLQUEUE_DEVICE_BUSY;
                        break;
                case ATA_DEFER_PORT:
                        busy = SCSI_MLQUEUE_HOST_BUSY;
                        break;
                default:
                        WARN_ON_ONCE(1);
                        busy = SCSI_MLQUEUE_HOST_BUSY;
                        break;
                }

                if (busy) {
                        /* Force a requeue of the command to defer its execution. */
                        ata_qc_free(qc);
                        return busy;
                }
        }

        ata_qc_issue(qc);

        return 0;
}

/**
 *        ata_scsi_translate - Translate then issue SCSI command to ATA device
 *        @dev: ATA device to which the command is addressed
 *        @cmd: SCSI command to execute
 *        @xlat_func: Actor which translates @cmd to an ATA taskfile
 *
 *        Called once ->queuecommand() has decided that the SCSI command
 *        can be translated directly into an ATA command instead of being
 *        handled internally.
 *
 *        An ata_queued_cmd is set up for the SCSI command, its data
 *        buffer (if any) is attached, and the command is passed to the
 *        hardware.
 *
 *        The xlat_func argument (actor) returns 0 if the ATA command is
 *        ready to execute, else 1 to finish translation.  If 1 is
 *        returned then cmd->result (and possibly cmd->sense_buffer) are
 *        assumed to be set reflecting an error condition or clean
 *        (early) termination.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        0 on success, SCSI_MLQUEUE_DEVICE_BUSY or SCSI_MLQUEUE_HOST_BUSY if the
 *        command needs to be deferred.
 */
static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
                              ata_xlat_func_t xlat_func)
{
        struct ata_port *ap = dev->link->ap;
        struct ata_queued_cmd *qc;
        bool has_data;

        lockdep_assert_held(ap->lock);

        /*
         * ata_scsi_qc_new() calls scsi_done(cmd) in case of failure. So we
         * have nothing further to do when allocating a qc fails.
         */
        qc = ata_scsi_qc_new(dev, cmd);
        if (!qc)
                return 0;

        has_data = cmd->sc_data_direction == DMA_FROM_DEVICE ||
                   cmd->sc_data_direction == DMA_TO_DEVICE;
        if (has_data) {
                /* A data direction with an empty buffer is rejected. */
                if (unlikely(scsi_bufflen(cmd) < 1)) {
                        ata_dev_warn(dev, "WARNING: zero len r/w req\n");
                        cmd->result = (DID_ERROR << 16);
                        goto early_finish;
                }

                /* data is present; dma-map it */
                ata_sg_init(qc, scsi_sglist(cmd), scsi_sg_count(cmd));
                qc->dma_dir = cmd->sc_data_direction;
        }

        qc->complete_fn = ata_scsi_qc_complete;

        if (xlat_func(qc) == 0)
                return ata_scsi_qc_issue(ap, qc);

early_finish:
        ata_qc_free(qc);
        cmd->scsi_done(cmd);
        DPRINTK("EXIT - early finish (good or error)\n");
        return 0;
}

/*
 * Argument bundle handed to the simulated SCSI command handlers (the
 * "actor" callbacks invoked through ata_scsi_rbuf_fill()).
 */
struct ata_scsi_args {
        /* ATA device the command is addressed to */
        struct ata_device        *dev;
        /* device IDENTIFY data, as an array of 16-bit words */
        u16                        *id;
        /* SCSI command being simulated */
        struct scsi_cmnd        *cmd;
};

/**
 *        ata_scsi_rbuf_get - Map response buffer.
 *        @cmd: SCSI command containing buffer to be mapped.
 *        @copy_in: copy in from user buffer
 *        @flags: unsigned long variable to store irq enable status
 *
 *        Prepare buffer for simulated SCSI commands.  The shared
 *        response buffer is zeroed and, when @copy_in is set, filled
 *        from @cmd's scatterlist (at most ATA_SCSI_RBUF_SIZE bytes).
 *
 *        LOCKING:
 *        spin_lock_irqsave(ata_scsi_rbuf_lock) on success
 *
 *        RETURNS:
 *        Pointer to response buffer.
 */
static void *ata_scsi_rbuf_get(struct scsi_cmnd *cmd, bool copy_in,
                               unsigned long *flags)
{
        /* The lock stays held on return; ata_scsi_rbuf_put() releases it. */
        spin_lock_irqsave(&ata_scsi_rbuf_lock, *flags);

        memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE);
        if (!copy_in)
                return ata_scsi_rbuf;

        sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd),
                          ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE);
        return ata_scsi_rbuf;
}

/**
 *        ata_scsi_rbuf_put - Unmap response buffer.
 *        @cmd: SCSI command containing buffer to be unmapped.
 *        @copy_out: copy out result
 *        @flags: @flags passed to ata_scsi_rbuf_get()
 *
 *        Returns rbuf buffer.  The result is copied to @cmd's buffer if
 *        @copy_out is true.
 *
 *        LOCKING:
 *        Unlocks ata_scsi_rbuf_lock.
 */
static inline void ata_scsi_rbuf_put(struct scsi_cmnd *cmd, bool copy_out,
                                     unsigned long *flags)
{
        if (copy_out) {
                /* Hand the simulated response back through @cmd's scatterlist. */
                sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd),
                                    ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE);
        }

        spin_unlock_irqrestore(&ata_scsi_rbuf_lock, *flags);
}

/**
 *        ata_scsi_rbuf_fill - wrapper for SCSI command simulators
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @actor: Callback hook for desired SCSI command simulator
 *
 *        Does the common work of simulating a SCSI command: maps the
 *        (zeroed) response buffer, runs the command's handler on it and
 *        acts on the handler's return value.  A return of 0 means the
 *        handler wants the command completed successfully, so the
 *        buffer is copied back and the result set to %SAM_STAT_GOOD.
 *        Any other value means cmd->result and the sense buffer are
 *        assumed to have been set by the handler; nothing is copied.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
                unsigned int (*actor)(struct ata_scsi_args *args, u8 *rbuf))
{
        struct scsi_cmnd *scmd = args->cmd;
        unsigned long irq_flags;
        unsigned int status;
        bool succeeded;
        u8 *buf;

        buf = ata_scsi_rbuf_get(scmd, false, &irq_flags);
        status = actor(args, buf);
        succeeded = (status == 0);
        ata_scsi_rbuf_put(scmd, succeeded, &irq_flags);

        if (succeeded)
                scmd->result = SAM_STAT_GOOD;
}

/**
 *        ata_scsiop_inq_std - Simulate INQUIRY command
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Builds the standard (non-VPD) INQUIRY response: an 8-byte
 *        header, the vendor string "ATA     ", the product and firmware
 *        revision strings taken from the IDENTIFY data, and a list of
 *        version descriptors at offset 58.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
{
        static const u8 versions[] = {
                0x00,
                0x60,        /* SAM-3 (no version claimed) */

                0x03,
                0x20,        /* SBC-2 (no version claimed) */

                0x03,
                0x00        /* SPC-3 (no version claimed) */
        };
        static const u8 versions_zbc[] = {
                0x00,
                0xA0,        /* SAM-5 (no version claimed) */

                0x06,
                0x00,        /* SBC-4 (no version claimed) */

                0x05,
                0xC0,        /* SPC-5 (no version claimed) */

                0x60,
                0x24,   /* ZBC r05 */
        };
        const bool is_zac = (args->dev->class == ATA_DEV_ZAC);
        bool removable;

        /* set scsi removable (RMB) bit per ata bit, or if the
         * AHCI port says it's external (Hotplug-capable, eSATA).
         */
        removable = ata_id_removable(args->id) ||
                    (args->dev->link->ap->pflags & ATA_PFLAG_EXTERNAL);

        /* 8-byte header */
        rbuf[0] = is_zac ? TYPE_ZBC : TYPE_DISK;
        rbuf[1] = removable ? (1 << 7) : 0;
        /* claim SPC-5 version compatibility for ZAC, SPC-3 otherwise */
        rbuf[2] = is_zac ? 0x7 : 0x5;
        rbuf[3] = 2;
        rbuf[4] = 95 - 4;
        rbuf[5] = 0;
        rbuf[6] = 0;
        rbuf[7] = 2;

        memcpy(&rbuf[8], "ATA     ", 8);
        ata_id_string(args->id, &rbuf[16], ATA_ID_PROD, 16);

        /* From SAT, use last 2 words from fw rev unless they are spaces */
        ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV + 2, 4);
        if (strncmp(&rbuf[32], "    ", 4) == 0)
                ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV, 4);

        /* Still nothing usable: report a placeholder revision. */
        if (rbuf[32] == 0 || rbuf[32] == ' ')
                memcpy(&rbuf[32], "n/a ", 4);

        /* version descriptors */
        if (ata_id_zoned_cap(args->id) || is_zac)
                memcpy(&rbuf[58], versions_zbc, sizeof(versions_zbc));
        else
                memcpy(&rbuf[58], versions, sizeof(versions));

        return 0;
}

/**
 *        ata_scsiop_inq_00 - Simulate INQUIRY VPD page 0, list of pages
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Returns list of inquiry VPD pages available.  The last entry
 *        (page 0xb6) is only reported for devices with ATA_DFLAG_ZAC set.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
{
        static const u8 pages[] = {
                0x00,        /* page 0x00, this page */
                0x80,        /* page 0x80, unit serial no page */
                0x83,        /* page 0x83, device ident page */
                0x89,        /* page 0x89, ata info page */
                0xb0,        /* page 0xb0, block limits page */
                0xb1,        /* page 0xb1, block device characteristics page */
                0xb2,        /* page 0xb2, thin provisioning page */
                0xb6,        /* page 0xb6, zoned block device characteristics */
        };
        int count = sizeof(pages);

        /* 0xb6 is the final table entry, so dropping it is a length change. */
        if ((args->dev->flags & ATA_DFLAG_ZAC) == 0)
                count -= 1;

        rbuf[3] = count;        /* number of supported VPD pages */
        memcpy(&rbuf[4], pages, count);
        return 0;
}

/**
 *        ata_scsiop_inq_80 - Simulate INQUIRY VPD page 80, device serial number
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Returns ATA device serial number, preceded by a 4-byte page
 *        header.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static unsigned int ata_scsiop_inq_80(struct ata_scsi_args *args, u8 *rbuf)
{
        /* page header */
        rbuf[0] = 0;
        rbuf[1] = 0x80;                        /* this page code */
        rbuf[2] = 0;
        rbuf[3] = ATA_ID_SERNO_LEN;        /* page len */

        /* serial number from the IDENTIFY data */
        ata_id_string(args->id, &rbuf[4], ATA_ID_SERNO, ATA_ID_SERNO_LEN);
        return 0;
}

/**
 *        ata_scsiop_inq_83 - Simulate INQUIRY VPD page 83, device identity
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Yields the following logical unit device identification
 *        designators:
 *         - vendor specific ASCII containing the ATA serial number
 *         - SAT defined "t10 vendor id based" containing ASCII vendor
 *           name ("ATA     "), model and serial numbers.
 *         - when the IDENTIFY data reports a WWN, a binary NAA
 *           designator holding it.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static unsigned int ata_scsiop_inq_83(struct ata_scsi_args *args, u8 *rbuf)
{
        const int sat_model_serial_desc_len = 68;
        u8 *p = rbuf + 4;        /* write cursor, starts after the page header */

        rbuf[1] = 0x83;                        /* this page code */

        /* piv=0, assoc=lu, code_set=ASCII, designator=vendor */
        p[0] = 2;
        p[3] = ATA_ID_SERNO_LEN;
        p += 4;
        ata_id_string(args->id, p, ATA_ID_SERNO, ATA_ID_SERNO_LEN);
        p += ATA_ID_SERNO_LEN;

        /* SAT defined lu model and serial numbers descriptor */
        /* piv=0, assoc=lu, code_set=ASCII, designator=t10 vendor id */
        p[0] = 2;
        p[1] = 1;
        p[3] = sat_model_serial_desc_len;
        p += 4;
        memcpy(p, "ATA     ", 8);
        p += 8;
        ata_id_string(args->id, p, ATA_ID_PROD, ATA_ID_PROD_LEN);
        p += ATA_ID_PROD_LEN;
        ata_id_string(args->id, p, ATA_ID_SERNO, ATA_ID_SERNO_LEN);
        p += ATA_ID_SERNO_LEN;

        if (ata_id_has_wwn(args->id)) {
                /* SAT defined lu world wide name */
                /* piv=0, assoc=lu, code_set=binary, designator=NAA */
                p[0] = 1;
                p[1] = 3;
                p[3] = ATA_ID_WWN_LEN;
                p += 4;
                ata_id_string(args->id, p, ATA_ID_WWN, ATA_ID_WWN_LEN);
                p += ATA_ID_WWN_LEN;
        }

        /* page len excludes the 4-byte header (assume less than 256 bytes) */
        rbuf[3] = p - rbuf - 4;
        return 0;
}

/**
 *        ata_scsiop_inq_89 - Simulate INQUIRY VPD page 89, ATA info
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Yields SAT-specified ATA VPD page: identification strings for
 *        the translation layer, a faked device signature and a copy of
 *        the 512 bytes of IDENTIFY data.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf)
{
        const u16 page_len = 0x238;        /* page size fixed at 238h */

        rbuf[1] = 0x89;                        /* our page code */
        rbuf[2] = page_len >> 8;
        rbuf[3] = page_len & 0xff;

        /* identification of the translation layer */
        memcpy(rbuf + 8, "linux   ", 8);
        memcpy(rbuf + 16, "libata          ", 16);
        memcpy(rbuf + 32, DRV_VERSION, 4);

        rbuf[36] = 0x34;                /* force D2H Reg FIS (34h) */
        rbuf[37] = (1 << 7);                /* bit 7 indicates Command FIS */
                                        /* TODO: PMP? */

        /* we don't store the ATA device signature, so we fake it */
        rbuf[38] = ATA_DRDY;                /* really, this is Status reg */
        rbuf[40] = 0x1;
        rbuf[48] = 0x1;

        rbuf[56] = ATA_CMD_ID_ATA;

        /* raw IDENTIFY data */
        memcpy(rbuf + 60, args->id, 512);
        return 0;
}

/*
 * Simulate INQUIRY VPD page 0xb0 (block limits page).
 *
 * Always reports the optimal transfer length granularity.  The unmap
 * related fields are only filled in when the IDENTIFY data reports TRIM
 * support.  Returns 0.
 */
static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
{
        struct ata_device *dev = args->dev;
        u16 min_io_sectors;
        u64 max_blocks;

        rbuf[1] = 0xb0;
        rbuf[3] = 0x3c;                /* required VPD size with unmap support */

        /*
         * Optimal transfer length granularity.
         *
         * This is always one physical block, but for disks with a smaller
         * logical than physical sector size we need to figure out what the
         * latter is.
         */
        min_io_sectors = 1 << ata_id_log2_per_physical_sector(args->id);
        put_unaligned_be16(min_io_sectors, &rbuf[6]);

        if (!ata_id_has_trim(args->id))
                return 0;

        /*
         * Optimal unmap granularity.
         *
         * The ATA spec doesn't even know about a granularity or alignment
         * for the TRIM command.  We can leave away most of the unmap related
         * VPD page entries, but we have to specify a granularity to signal
         * that we support some form of unmap - in this case via WRITE SAME
         * with the unmap bit set.
         */
        if (dev->horkage & ATA_HORKAGE_MAX_TRIM_128M)
                max_blocks = 128 << (20 - SECTOR_SHIFT);
        else
                max_blocks = 65535 * ATA_MAX_TRIM_RNUM;

        put_unaligned_be32(1, &rbuf[28]);
        put_unaligned_be64(max_blocks, &rbuf[36]);

        return 0;
}

/*
 * Simulate INQUIRY VPD page 0xb1 (block device characteristics page).
 *
 * Reports the rotation rate, form factor and zoned capability read from
 * the IDENTIFY data.  Returns 0.
 */
static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
{
        int ff = ata_id_form_factor(args->id);
        int rotation = ata_id_rotation_rate(args->id);
        u8 zoned_cap = ata_id_zoned_cap(args->id);

        rbuf[1] = 0xb1;
        rbuf[3] = 0x3c;

        /* rotation rate as two bytes, most significant first */
        rbuf[4] = rotation >> 8;
        rbuf[5] = rotation;

        rbuf[7] = ff;

        /* byte 8 is only written when a zoned capability is reported */
        if (zoned_cap != 0)
                rbuf[8] = (zoned_cap << 4);

        return 0;
}

/*
 * Simulate INQUIRY VPD page 0xb2 (thin provisioning page).
 *
 * The response is constant: page code, a page length of 4 and the TPWS
 * bit.  @args is not used.  Returns 0.
 */
static unsigned int ata_scsiop_inq_b2(struct ata_scsi_args *args, u8 *rbuf)
{
        /* SCSI Thin Provisioning VPD page: SBC-3 rev 22 or later */
        rbuf[1] = 0xb2;                /* page code */
        rbuf[3] = 0x4;                /* page length */
        rbuf[5] = 0x40;                /* TPWS (bit 6) */

        return 0;
}

/*
 * Simulate INQUIRY VPD page 0xb6 (zbc-r05 SCSI Zoned Block device
 * characteristics VPD page) from the zone parameters stored in the ATA
 * device.  Returns 0.
 */
static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
{
        struct ata_device *dev = args->dev;

        rbuf[1] = 0xb6;
        rbuf[3] = 0x3C;

        /*
         * URSWRZ bit is only meaningful for host-managed ZAC drives
         */
        if (dev->zac_zoned_cap & 1)
                rbuf[4] |= 1;

        /* three 32-bit big-endian zone counts */
        put_unaligned_be32(dev->zac_zones_optimal_open, &rbuf[8]);
        put_unaligned_be32(dev->zac_zones_optimal_nonseq, &rbuf[12]);
        put_unaligned_be32(dev->zac_zones_max_open, &rbuf[16]);

        return 0;
}

/**
 *        modecpy - Prepare response for MODE SENSE
 *        @dest: output buffer
 *        @src: data being copied
 *        @n: length of mode page
 *        @changeable: whether changeable parameters are requested
 *
 *        Emit a mode page of @n bytes.  For current parameters this is
 *        a plain copy of @src.  For changeable parameters only the
 *        first two bytes of @src are copied and the remaining @n - 2
 *        bytes are zeroed, leaving it to the caller to set the bits
 *        that can be changed.
 *
 *        LOCKING:
 *        None.
 */
static void modecpy(u8 *dest, const u8 *src, int n, bool changeable)
{
        if (!changeable) {
                memcpy(dest, src, n);
                return;
        }

        /* Keep the two leading bytes, clear the parameter bytes. */
        memcpy(dest, src, 2);
        memset(&dest[2], 0, n - 2);
}

/**
 *        ata_msense_caching - Simulate MODE SENSE caching info page
 *        @id: device IDENTIFY data
 *        @buf: output buffer
 *        @changeable: whether changeable parameters are requested
 *
 *        Generate a caching info page from the default caching page.
 *        For current parameters the write cache enable and read ahead
 *        disable bits follow the device's IDENTIFY data; for changeable
 *        parameters only the write cache enable bit is marked.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Length of the generated page in bytes.
 */
static unsigned int ata_msense_caching(u16 *id, u8 *buf, bool changeable)
{
        const unsigned int page_len = sizeof(def_cache_mpage);

        modecpy(buf, def_cache_mpage, page_len, changeable);

        if (!changeable) {
                /* write cache enable */
                buf[2] |= (ata_id_wcache_enabled(id) << 2);
                /* disable read ahead */
                buf[12] |= (!ata_id_rahead_enabled(id) << 5);
                return page_len;
        }

        buf[2] |= (1 << 2);        /* ata_mselect_caching() */
        return page_len;
}

/**
 *        ata_msense_control - Simulate MODE SENSE control mode page
 *        @dev: ATA device of interest
 *        @buf: output buffer
 *        @changeable: whether changeable parameters are requested
 *
 *        Generate a control mode page from the default control page.
 *        Bit 2 of byte 2 (descriptor format sense data) is set for
 *        changeable parameters, and for current parameters when @dev
 *        has ATA_DFLAG_D_SENSE set.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Length of the generated page in bytes.
 */
static unsigned int ata_msense_control(struct ata_device *dev, u8 *buf,
                                        bool changeable)
{
        const unsigned int page_len = sizeof(def_control_mpage);

        modecpy(buf, def_control_mpage, page_len, changeable);

        if (changeable) {
                buf[2] |= (1 << 2);        /* ata_mselect_control() */
        } else if (dev->flags & ATA_DFLAG_D_SENSE) {
                buf[2] |= (1 << 2);        /* descriptor format sense data */
        }

        return page_len;
}

/**
 *        ata_msense_rw_recovery - Simulate MODE SENSE r/w error recovery page
 *        @buf: output buffer
 *        @changeable: whether changeable parameters are requested
 *
 *        Generate a generic MODE SENSE r/w error recovery page from the
 *        default page; no bit is adjusted afterwards.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Length of the generated page in bytes.
 */
static unsigned int ata_msense_rw_recovery(u8 *buf, bool changeable)
{
        const unsigned int page_len = sizeof(def_rw_recovery_mpage);

        modecpy(buf, def_rw_recovery_mpage, page_len, changeable);
        return page_len;
}

/*
 * Decide whether FUA may be advertised for the device described by @id.
 *
 * Returns 0 when libata_fua is off, when the IDENTIFY data does not report
 * FUA, or when the device is blacklisted; returns 1 otherwise.
 *
 * We can turn this into a real blacklist if it's needed, for now just
 * blacklist any Maxtor BANC1G10 revision firmware.
 *
 * NOTE(review): both comparisons are exact strcmp() matches, so only a
 * model string equal to "Maxtor" is caught -- confirm that is the intent.
 */
static int ata_dev_supports_fua(u16 *id)
{
        unsigned char model[ATA_ID_PROD_LEN + 1], fw[ATA_ID_FW_REV_LEN + 1];

        if (!libata_fua || !ata_id_has_fua(id))
                return 0;

        ata_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
        ata_id_c_string(id, fw, ATA_ID_FW_REV, sizeof(fw));

        if (strcmp(model, "Maxtor") == 0 && strcmp(fw, "BANC1G10") == 0)
                return 0; /* blacklisted */

        return 1;
}

/**
 *        ata_scsiop_mode_sense - Simulate MODE SENSE 6, 10 commands
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Simulate MODE SENSE commands. Assume this is invoked for direct
 *        access devices (e.g. disks) only. There should be no block
 *        descriptor for other device types.
 *
 *        The response is a 4-byte (MODE SENSE 6) or 8-byte (MODE SENSE
 *        10) header, an optional 8-byte block descriptor, and the
 *        requested mode page(s).  Saved values are not supported.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, 1 after setting sense data on error.
 */
static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
{
        static const u8 sat_blk_desc[] = {
                0, 0, 0, 0,        /* number of blocks: sat unspecified */
                0,
                0, 0x2, 0x0        /* block length: 512 bytes */
        };
        struct ata_device *dev = args->dev;
        const u8 *cdb = args->cmd->cmnd;
        const bool six_byte = (cdb[0] == MODE_SENSE);
        /* DBD bit clear means a block descriptor is wanted */
        const bool ebd = !(cdb[1] & 0x8);
        const unsigned int page_control = cdb[2] >> 6;
        const bool changeable = (page_control == 1);
        const unsigned int hdr_len = six_byte ? 4 : 8;
        const u8 pg = cdb[2] & 0x3f;
        const u8 spg = cdb[3];
        u8 *p = rbuf;
        u8 dpofua = 0;
        u8 bp = 0xff;
        u16 fp;

        /*
         * LLBA bit in msense(10) ignored (compliant)
         */

        switch (page_control) {
        case 3: /* saved */
                goto saving_not_supp;
        case 0: /* current */
        case 1: /* changeable */
        case 2: /* defaults */
                break;  /* supported */
        default:
                fp = 2;
                bp = 6;
                goto invalid_fld;
        }

        /* Mode pages start after the header and optional block descriptor. */
        p += hdr_len;
        if (ebd)
                p += sizeof(sat_blk_desc);

        /*
         * No mode subpages supported (yet) but asking for _all_
         * subpages may be valid
         */
        if (spg != 0 && spg != ALL_SUB_MPAGES) {
                fp = 3;
                goto invalid_fld;
        }

        if (pg != RW_RECOVERY_MPAGE && pg != CACHE_MPAGE &&
            pg != CONTROL_MPAGE && pg != ALL_MPAGES) {
                /* invalid page code */
                fp = 2;
                goto invalid_fld;
        }

        /* Pages are emitted in this fixed order when all are requested. */
        if (pg == RW_RECOVERY_MPAGE || pg == ALL_MPAGES)
                p += ata_msense_rw_recovery(p, changeable);
        if (pg == CACHE_MPAGE || pg == ALL_MPAGES)
                p += ata_msense_caching(args->id, p, changeable);
        if (pg == CONTROL_MPAGE || pg == ALL_MPAGES)
                p += ata_msense_control(args->dev, p, changeable);

        if (ata_dev_supports_fua(args->id) && (dev->flags & ATA_DFLAG_LBA48) &&
            (!(dev->flags & ATA_DFLAG_PIO) || dev->multi_count))
                dpofua = 1 << 4;

        /* Mode data length does not count the length field itself. */
        if (six_byte) {
                rbuf[0] = p - rbuf - 1;
                rbuf[2] |= dpofua;
        } else {
                unsigned int output_len = p - rbuf - 2;

                rbuf[0] = output_len >> 8;
                rbuf[1] = output_len;
                rbuf[3] |= dpofua;
        }

        /* The header's last byte holds the block descriptor length. */
        if (ebd) {
                rbuf[hdr_len - 1] = sizeof(sat_blk_desc);
                memcpy(rbuf + hdr_len, sat_blk_desc, sizeof(sat_blk_desc));
        }

        return 0;

invalid_fld:
        ata_scsi_set_invalid_field(dev, args->cmd, fp, bp);
        return 1;

saving_not_supp:
        /* "Saving parameters not supported" */
        ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x39, 0x0);
        return 1;
}

/**
 *        ata_scsiop_read_cap - Simulate READ CAPACITY[ 16] commands
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Build the READ CAPACITY response in @rbuf.  When the opcode is
 *        READ_CAPACITY the 8-byte form (32-bit last LBA + block size) is
 *        produced; any other opcode routed here gets the 16-byte-LBA form,
 *        which additionally carries the physical-sector exponent, the
 *        lowest aligned LBA, and the LBPME/LBPRZ/RC_BASIS bits.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Always zero.
 */
static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
{
        struct ata_device *dev = args->dev;
        u64 max_lba = dev->n_sectors - 1;        /* LBA of the last block */
        u32 blk_size = ata_id_logical_sector_size(dev->id); /* logical, bytes */
        u8 phys_exp = ata_id_log2_per_physical_sector(dev->id);
        u16 align_lba = ata_id_logical_sector_offset(dev->id, phys_exp);
        int i;

        if (args->cmd->cmnd[0] == READ_CAPACITY) {
                /* the 32-bit field saturates at 0xffffffff */
                u32 lba32 = max_lba >= 0xffffffffULL ? 0xffffffff : max_lba;

                /* big-endian: last LBA in bytes 0-3, block size in 4-7 */
                for (i = 0; i < 4; i++) {
                        rbuf[i] = lba32 >> (8 * (3 - i));
                        rbuf[4 + i] = blk_size >> (8 * (3 - i));
                }
                return 0;
        }

        /* big-endian: 64-bit last LBA in bytes 0-7 */
        for (i = 0; i < 8; i++)
                rbuf[i] = max_lba >> (8 * (7 - i));

        /* big-endian: block size in bytes 8-11 */
        for (i = 0; i < 4; i++)
                rbuf[8 + i] = blk_size >> (8 * (3 - i));

        rbuf[12] = 0;
        rbuf[13] = phys_exp;
        /* top two bits of byte 14 are reserved for LBPME/LBPRZ below */
        rbuf[14] = (align_lba >> 8) & 0x3f;
        rbuf[15] = align_lba;

        if (ata_id_has_trim(args->id) &&
            !(dev->horkage & ATA_HORKAGE_NOTRIM)) {
                rbuf[14] |= 0x80; /* LBPME */

                if (ata_id_has_zero_after_trim(args->id) &&
                    (dev->horkage & ATA_HORKAGE_ZERO_AFTER_TRIM)) {
                        ata_dev_info(dev, "Enabling discard_zeroes_data\n");
                        rbuf[14] |= 0x40; /* LBPRZ */
                }
        }

        if (ata_id_zoned_cap(args->id) || args->dev->class == ATA_DEV_ZAC)
                rbuf[12] = (1 << 4); /* RC_BASIS */

        return 0;
}

/**
 *        ata_scsiop_report_luns - Simulate REPORT LUNS command
 *        @args: device IDENTIFY data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Report exactly one logical unit, LUN 0.  Only byte 3 of @rbuf is
 *        written; every other byte is left as supplied by the caller
 *        (presumably zeroed - confirm against the caller).
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Always zero.
 */
static unsigned int ata_scsiop_report_luns(struct ata_scsi_args *args, u8 *rbuf)
{
        /* one LUN entry of 8 bytes */
        const u8 lun_list_len = 8;

        rbuf[3] = lun_list_len;

        return 0;
}

/*
 * Completion handler installed by atapi_request_sense() for the
 * REQUEST SENSE it issues.  Synthesizes sense data when the command
 * failed for a reason other than a device error, then finishes the qc.
 */
static void atapi_sense_complete(struct ata_queued_cmd *qc)
{
        unsigned int failed = qc->err_mask;

        /* FIXME: not quite right; translating taskfile registers into
         * sense descriptors is only correct for ATA, not ATAPI.
         */
        if (failed && !(failed & AC_ERR_DEV))
                ata_gen_passthru_sense(qc);

        ata_qc_done(qc);
}

/*
 * Is it pointless to prefer PIO for "safety reasons"?
 *
 * Returns non-zero when ATA_FLAG_PIO_DMA is set in the port flags, zero
 * otherwise.
 */
static inline int ata_pio_use_silly(struct ata_port *ap)
{
        return ap->flags & ATA_FLAG_PIO_DMA;
}

/*
 * Re-use @qc to issue a REQUEST SENSE packet command whose data lands
 * directly in the SCSI command's sense buffer.  Completion is routed to
 * atapi_sense_complete().
 */
static void atapi_request_sense(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_port *port = qc->ap;
        unsigned char *sense = scmd->sense_buffer;

        DPRINTK("ATAPI request sense\n");

        memset(sense, 0, SCSI_SENSE_BUFFERSIZE);

#ifdef CONFIG_ATA_SFF
        if (port->ops->sff_tf_read)
                port->ops->sff_tf_read(port, &qc->tf);
#endif

        /*
         * Defaults, in case the device does not overwrite them.  This must
         * happen before ata_qc_reinit() because it reads the current
         * taskfile's feature value.
         */
        sense[0] = 0x70;
        sense[2] = qc->tf.feature >> 4;

        ata_qc_reinit(qc);

        /* single-entry sg table over the sense buffer, device -> host */
        sg_init_one(&qc->sgent, sense, SCSI_SENSE_BUFFERSIZE);
        ata_sg_init(qc, &qc->sgent, 1);
        qc->dma_dir = DMA_FROM_DEVICE;

        /* build the REQUEST SENSE CDB */
        memset(qc->cdb, 0, qc->dev->cdb_len);
        qc->cdb[0] = REQUEST_SENSE;
        qc->cdb[4] = SCSI_SENSE_BUFFERSIZE;

        qc->tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        qc->tf.command = ATA_CMD_PACKET;

        if (!ata_pio_use_silly(port)) {
                /* PIO: lbam/lbah carry the transfer size */
                qc->tf.protocol = ATAPI_PROT_PIO;
                qc->tf.lbam = SCSI_SENSE_BUFFERSIZE;
                qc->tf.lbah = 0;
        } else {
                qc->tf.protocol = ATAPI_PROT_DMA;
                qc->tf.feature |= ATAPI_PKT_DMA;
        }
        qc->nbytes = SCSI_SENSE_BUFFERSIZE;

        qc->complete_fn = atapi_sense_complete;

        ata_qc_issue(qc);

        DPRINTK("EXIT\n");
}

/*
 * ATAPI devices typically report zero for their SCSI version, and sometimes
 * deviate from the spec WRT response data format.  If SCSI version is
 * reported as zero like normal, then we make the following fixups:
 *   1) Fake MMC-5 version, to indicate to the Linux scsi midlayer this is a
 *        modern device.
 *   2) Ensure response data format / ATAPI information are always correct.
 *
 * Only the first four bytes of the INQUIRY data are read and written back.
 * When the command's data buffer is shorter than that, fewer bytes are
 * copied in either direction.
 */
static void atapi_fixup_inquiry(struct scsi_cmnd *cmd)
{
        /*
         * Zero-initialized so that a short copy from the sg list never
         * leaves buf[2] holding uninitialized stack contents when it is
         * tested below.
         */
        u8 buf[4] = { 0 };

        sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, 4);
        if (buf[2] == 0) {
                buf[2] = 0x5;
                buf[3] = 0x32;
        }
        sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, 4);
}

/*
 * Completion handler for ATAPI packet commands set up by atapi_xlat().
 * Translates the qc outcome into a SCSI result, requesting or generating
 * sense data where needed, and completes the command.
 */
static void atapi_qc_complete(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        unsigned int errs = qc->err_mask;

        /* handle completion from new EH */
        if (unlikely(qc->ap->ops->error_handler &&
                     (errs || (qc->flags & ATA_QCFLAG_SENSE_VALID)))) {

                /* FIXME: not quite right; translating taskfile registers
                 * into sense descriptors is only correct for ATA, not
                 * ATAPI.
                 */
                if (!(qc->flags & ATA_QCFLAG_SENSE_VALID))
                        ata_gen_passthru_sense(qc);

                /* SCSI EH automatically locks the door if sdev->locked
                 * is set.  A door lock request can keep failing (for
                 * example when no media is present), which creates a
                 * loop: SCSI EH issues door lock, it fails, and EH is
                 * invoked again to acquire sense data for the failed
                 * command.
                 *
                 * Always clear sdev->locked when door lock fails to
                 * break that loop.  This may run before the SCSI scan
                 * completes, so sdev must be checked for NULL first.
                 */
                if (qc->dev->sdev && qc->cdb[0] == ALLOW_MEDIUM_REMOVAL)
                        qc->dev->sdev->locked = 0;

                scmd->result = SAM_STAT_CHECK_CONDITION;
                ata_qc_done(qc);
                return;
        }

        /* successful completion or old EH failure path */
        if (unlikely(errs & AC_ERR_DEV)) {
                /* device error: fetch real sense data; that path
                 * completes the qc itself
                 */
                scmd->result = SAM_STAT_CHECK_CONDITION;
                atapi_request_sense(qc);
                return;
        }

        if (unlikely(errs)) {
                /* FIXME: not quite right; translating taskfile registers
                 * into sense descriptors is only correct for ATA, not
                 * ATAPI.
                 */
                ata_gen_passthru_sense(qc);
        } else {
                /* standard INQUIRY (low two bits of byte 1 clear) */
                if (scmd->cmnd[0] == INQUIRY && (scmd->cmnd[1] & 0x03) == 0)
                        atapi_fixup_inquiry(scmd);
                scmd->result = SAM_STAT_GOOD;
        }

        ata_qc_done(qc);
}
/**
 *        atapi_xlat - Initialize PACKET taskfile
 *        @qc: command structure to be initialized
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, non-zero on failure.
 */
static unsigned int atapi_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_device *dev = qc->dev;
        int nodata = (scmd->sc_data_direction == DMA_NONE);
        int using_pio = !nodata && (dev->flags & ATA_DFLAG_PIO);
        unsigned int nbytes;

        memset(qc->cdb, 0, dev->cdb_len);
        memcpy(qc->cdb, scmd->cmnd, scmd->cmd_len);

        qc->complete_fn = atapi_qc_complete;

        qc->tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        if (scmd->sc_data_direction == DMA_TO_DEVICE) {
                qc->tf.flags |= ATA_TFLAG_WRITE;
                DPRINTK("direction: write\n");
        }

        qc->tf.command = ATA_CMD_PACKET;
        ata_qc_set_pc_nbytes(qc);

        /* check whether ATAPI DMA is safe */
        if (!nodata && !using_pio && atapi_check_dma(qc))
                using_pio = 1;

        /* Some controller variants snoop this value for Packet
         * transfers to do state machine and FIFO management.  Thus we
         * want to set it properly, and for DMA where it is
         * effectively meaningless.
         */
        nbytes = min(ata_qc_raw_nbytes(qc), (unsigned int)63 * 1024);

        /* Most ATAPI devices which honor transfer chunk size don't
         * behave according to the spec when odd chunk size which
         * matches the transfer length is specified.  If the number of
         * bytes to transfer is 2n+1.  According to the spec, what
         * should happen is to indicate that 2n+1 is going to be
         * transferred and transfer 2n+2 bytes where the last byte is
         * padding.
         *
         * In practice, this doesn't happen.  ATAPI devices first
         * indicate and transfer 2n bytes and then indicate and
         * transfer 2 bytes where the last byte is padding.
         *
         * This inconsistency confuses several controllers which
         * perform PIO using DMA such as Intel AHCIs and sil3124/32.
         * These controllers use actual number of transferred bytes to
         * update DMA pointer and transfer of 4n+2 bytes make those
         * controller push DMA pointer by 4n+4 bytes because SATA data
         * FISes are aligned to 4 bytes.  This causes data corruption
         * and buffer overrun.
         *
         * Always setting nbytes to even number solves this problem
         * because then ATAPI devices don't have to split data at 2n
         * boundaries.
         */
        if (nbytes & 0x1)
                nbytes++;

        qc->tf.lbam = (nbytes & 0xFF);
        qc->tf.lbah = (nbytes >> 8);

        if (nodata)
                qc->tf.protocol = ATAPI_PROT_NODATA;
        else if (using_pio)
                qc->tf.protocol = ATAPI_PROT_PIO;
        else {
                /* DMA data xfer */
                qc->tf.protocol = ATAPI_PROT_DMA;
                qc->tf.feature |= ATAPI_PKT_DMA;

                if ((dev->flags & ATA_DFLAG_DMADIR) &&
                    (scmd->sc_data_direction != DMA_TO_DEVICE))
                        /* some SATA bridges need us to indicate data xfer direction */
                        qc->tf.feature |= ATAPI_DMADIR;
        }


        /* FIXME: We need to translate 0x05 READ_BLOCK_LIMITS to a MODE_SENSE
           as ATAPI tape drives don't get this right otherwise */
        return 0;
}

/*
 * Map a device number onto the matching struct ata_device of @ap.
 * Returns NULL when @devno is out of range for the port's topology.
 */
static struct ata_device *ata_find_dev(struct ata_port *ap, unsigned int devno)
{
        int nr_devs;

        /*
         * Without a port multiplier, ata_link_max_devices() returns 1
         * (SATA case) or 2 (IDE master + slave case).  The former also
         * covers libsas hosted devices, which are numbered per scsi host:
         * devno may then be larger than 0 even though each struct
         * ata_device has its own struct ata_port and struct ata_link.
         * To accommodate these, devno is ignored for single-device links
         * and device 0 is always used.
         */
        if (likely(!sata_pmp_attached(ap))) {
                nr_devs = ata_link_max_devices(&ap->link);

                if (nr_devs != 1 && devno >= nr_devs)
                        return NULL;

                if (nr_devs == 1)
                        devno = 0;

                return &ap->link.device[devno];
        }

        /*
         * For PMP-attached devices, the device number corresponds to C
         * (channel) of SCSI [H:C:I:L], indicating the port pmp link
         * for the device.
         */
        return devno < ap->nr_pmp_links ? &ap->pmp_link[devno].device[0] : NULL;
}

/*
 * Translate a SCSI device address into an ATA device on @ap without
 * checking whether that device is enabled.  Returns NULL for addresses
 * that do not correspond to a target simulated here.
 */
static struct ata_device *__ata_scsi_find_dev(struct ata_port *ap,
                                              const struct scsi_device *scsidev)
{
        int devno;

        if (sata_pmp_attached(ap)) {
                /* PMP: the channel selects the device; id and lun must be 0 */
                if (unlikely(scsidev->id || scsidev->lun))
                        return NULL;
                devno = scsidev->channel;
        } else {
                /* no PMP: the id selects the device; channel and lun must be 0 */
                if (unlikely(scsidev->channel || scsidev->lun))
                        return NULL;
                devno = scsidev->id;
        }

        return ata_find_dev(ap, devno);
}

/**
 *        ata_scsi_find_dev - lookup ata_device from scsi_device
 *        @ap: ATA port to which the device is attached
 *        @scsidev: SCSI device from which we derive the ATA device
 *
 *        Map the addressing information in @scsidev onto the ATA bus
 *        behind @ap and return the ATA device it designates, provided
 *        that device is enabled.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Associated ATA device, or %NULL if not found or not enabled.
 */
struct ata_device *
ata_scsi_find_dev(struct ata_port *ap, const struct scsi_device *scsidev)
{
        struct ata_device *dev;

        dev = __ata_scsi_find_dev(ap, scsidev);
        if (unlikely(!dev))
                return NULL;

        if (unlikely(!ata_dev_enabled(dev)))
                return NULL;

        return dev;
}

/*
 *        ata_scsi_map_proto - Map pass-thru protocol value to taskfile value.
 *        @byte1: Byte 1 from pass-thru CDB.
 *
 *        The protocol value is taken from bits 4:1 of @byte1.
 *
 *        RETURNS:
 *        ATA_PROT_UNKNOWN if mapping failed/unimplemented, protocol otherwise.
 */
static u8
ata_scsi_map_proto(u8 byte1)
{
        u8 proto = (byte1 >> 1) & 0xf;

        switch (proto) {
        case 3:                /* Non-data */
                return ATA_PROT_NODATA;

        case 4:                /* PIO Data-in */
        case 5:                /* PIO Data-out */
                return ATA_PROT_PIO;

        case 6:                /* DMA */
        case 10:        /* UDMA Data-in */
        case 11:        /* UDMA Data-Out */
                return ATA_PROT_DMA;

        case 12:        /* FPDMA */
                return ATA_PROT_NCQ;

        case 0:                /* Hard Reset */
        case 1:                /* SRST */
        case 7:                /* DMA Queued */
        case 8:                /* Device Diagnostic */
        case 9:                /* Device Reset */
        case 15:        /* Return Response Info */
        default:        /* Reserved */
                return ATA_PROT_UNKNOWN;
        }
}

/**
 *        ata_scsi_pass_thru - convert ATA pass-thru CDB to taskfile
 *        @qc: command structure to be initialized
 *
 *        Handles either 12, 16, or 32-byte versions of the CDB.
 *
 *        Decodes the protocol and register values carried in the CDB into
 *        @qc->tf, picks the sector size used for the transfer based on the
 *        ATA command, and rejects commands that must not be passed through
 *        (unknown protocol, DMA without a DMA mode, NCQ on a non-NCQ device,
 *        SET FEATURES - XFER MODE, and TPM commands unless libata_allow_tpm
 *        is set).  On rejection the error is reported through
 *        ata_scsi_set_invalid_field().
 *
 *        RETURNS:
 *        Zero on success, non-zero on failure.
 */
static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
{
        struct ata_taskfile *tf = &(qc->tf);
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_device *dev = qc->dev;
        const u8 *cdb = scmd->cmnd;
        /* value handed to ata_scsi_set_invalid_field() on failure;
         * presumably the CDB byte offset of the offending field - confirm
         */
        u16 fp;
        /* extra offset of the protocol/flag bytes in the 32-byte form */
        u16 cdb_offset = 0;

        /* 7Fh variable length cmd means a ata pass-thru(32) */
        if (cdb[0] == VARIABLE_LENGTH_CMD)
                cdb_offset = 9;

        tf->protocol = ata_scsi_map_proto(cdb[1 + cdb_offset]);
        if (tf->protocol == ATA_PROT_UNKNOWN) {
                fp = 1;
                goto invalid_fld;
        }

        if ((cdb[2 + cdb_offset] & 0x3) == 0) {
                /*
                 * When T_LENGTH is zero (No data is transferred), dir should
                 * be DMA_NONE.
                 */
                if (scmd->sc_data_direction != DMA_NONE) {
                        fp = 2 + cdb_offset;
                        goto invalid_fld;
                }

                /* an NCQ command without data uses the no-data NCQ protocol */
                if (ata_is_ncq(tf->protocol))
                        tf->protocol = ATA_PROT_NCQ_NODATA;
        }

        /* enable LBA */
        tf->flags |= ATA_TFLAG_LBA;

        /*
         * 12 and 16 byte CDBs use different offsets to
         * provide the various register values.
         */
        if (cdb[0] == ATA_16) {
                /*
                 * 16-byte CDB - may contain extended commands.
                 *
                 * If that is the case, copy the upper byte register values.
                 */
                if (cdb[1] & 0x01) {
                        tf->hob_feature = cdb[3];
                        tf->hob_nsect = cdb[5];
                        tf->hob_lbal = cdb[7];
                        tf->hob_lbam = cdb[9];
                        tf->hob_lbah = cdb[11];
                        tf->flags |= ATA_TFLAG_LBA48;
                } else
                        tf->flags &= ~ATA_TFLAG_LBA48;

                /*
                 * Always copy low byte, device and command registers.
                 */
                tf->feature = cdb[4];
                tf->nsect = cdb[6];
                tf->lbal = cdb[8];
                tf->lbam = cdb[10];
                tf->lbah = cdb[12];
                tf->device = cdb[13];
                tf->command = cdb[14];
        } else if (cdb[0] == ATA_12) {
                /*
                 * 12-byte CDB - incapable of extended commands.
                 */
                tf->flags &= ~ATA_TFLAG_LBA48;

                tf->feature = cdb[3];
                tf->nsect = cdb[4];
                tf->lbal = cdb[5];
                tf->lbam = cdb[6];
                tf->lbah = cdb[7];
                tf->device = cdb[8];
                tf->command = cdb[9];
        } else {
                /*
                 * 32-byte CDB - may contain extended command fields.
                 *
                 * If that is the case, copy the upper byte register values.
                 */
                if (cdb[10] & 0x01) {
                        tf->hob_feature = cdb[20];
                        tf->hob_nsect = cdb[22];
                        tf->hob_lbal = cdb[16];
                        tf->hob_lbam = cdb[15];
                        tf->hob_lbah = cdb[14];
                        tf->flags |= ATA_TFLAG_LBA48;
                } else
                        tf->flags &= ~ATA_TFLAG_LBA48;

                /* low byte, device, command and auxiliary registers */
                tf->feature = cdb[21];
                tf->nsect = cdb[23];
                tf->lbal = cdb[19];
                tf->lbam = cdb[18];
                tf->lbah = cdb[17];
                tf->device = cdb[24];
                tf->command = cdb[25];
                tf->auxiliary = get_unaligned_be32(&cdb[28]);
        }

        /* For NCQ commands copy the tag value */
        if (ata_is_ncq(tf->protocol))
                tf->nsect = qc->hw_tag << 3;

        /* enforce correct master/slave bit */
        tf->device = dev->devno ?
                tf->device | ATA_DEV1 : tf->device & ~ATA_DEV1;

        switch (tf->command) {
        /* READ/WRITE LONG use a non-standard sect_size */
        case ATA_CMD_READ_LONG:
        case ATA_CMD_READ_LONG_ONCE:
        case ATA_CMD_WRITE_LONG:
        case ATA_CMD_WRITE_LONG_ONCE:
                /* only accepted as a single-sector PIO transfer */
                if (tf->protocol != ATA_PROT_PIO || tf->nsect != 1) {
                        fp = 1;
                        goto invalid_fld;
                }
                /* the whole data buffer is treated as one sector */
                qc->sect_size = scsi_bufflen(scmd);
                break;

        /* commands using reported Logical Block size (e.g. 512 or 4K) */
        case ATA_CMD_CFA_WRITE_NE:
        case ATA_CMD_CFA_TRANS_SECT:
        case ATA_CMD_CFA_WRITE_MULT_NE:
        /* XXX: case ATA_CMD_CFA_WRITE_SECTORS_WITHOUT_ERASE: */
        case ATA_CMD_READ:
        case ATA_CMD_READ_EXT:
        case ATA_CMD_READ_QUEUED:
        /* XXX: case ATA_CMD_READ_QUEUED_EXT: */
        case ATA_CMD_FPDMA_READ:
        case ATA_CMD_READ_MULTI:
        case ATA_CMD_READ_MULTI_EXT:
        case ATA_CMD_PIO_READ:
        case ATA_CMD_PIO_READ_EXT:
        case ATA_CMD_READ_STREAM_DMA_EXT:
        case ATA_CMD_READ_STREAM_EXT:
        case ATA_CMD_VERIFY:
        case ATA_CMD_VERIFY_EXT:
        case ATA_CMD_WRITE:
        case ATA_CMD_WRITE_EXT:
        case ATA_CMD_WRITE_FUA_EXT:
        case ATA_CMD_WRITE_QUEUED:
        case ATA_CMD_WRITE_QUEUED_FUA_EXT:
        case ATA_CMD_FPDMA_WRITE:
        case ATA_CMD_WRITE_MULTI:
        case ATA_CMD_WRITE_MULTI_EXT:
        case ATA_CMD_WRITE_MULTI_FUA_EXT:
        case ATA_CMD_PIO_WRITE:
        case ATA_CMD_PIO_WRITE_EXT:
        case ATA_CMD_WRITE_STREAM_DMA_EXT:
        case ATA_CMD_WRITE_STREAM_EXT:
                qc->sect_size = scmd->device->sector_size;
                break;

        /* Everything else uses 512 byte "sectors" */
        default:
                qc->sect_size = ATA_SECT_SIZE;
        }

        /*
         * Set flags so that all registers will be written, pass on
         * write indication (used for PIO/DMA setup), result TF is
         * copied back and we don't whine too much about its failure.
         */
        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
        if (scmd->sc_data_direction == DMA_TO_DEVICE)
                tf->flags |= ATA_TFLAG_WRITE;

        qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;

        /*
         * Set transfer length.
         *
         * TODO: find out if we need to do more here to
         *       cover scatter/gather case.
         */
        ata_qc_set_pc_nbytes(qc);

        /* We may not issue DMA commands if no DMA mode is set */
        if (tf->protocol == ATA_PROT_DMA && dev->dma_mode == 0) {
                fp = 1;
                goto invalid_fld;
        }

        /* We may not issue NCQ commands to devices not supporting NCQ */
        if (ata_is_ncq(tf->protocol) && !ata_ncq_enabled(dev)) {
                fp = 1;
                goto invalid_fld;
        }

        /*
         * sanity check for pio multi commands
         *
         * NOTE(review): this tests cdb[1] regardless of cdb_offset, whereas
         * the protocol above was read from cdb[1 + cdb_offset]; confirm that
         * cdb[1] is the intended byte for the 32-byte form.
         */
        if ((cdb[1] & 0xe0) && !is_multi_taskfile(tf)) {
                fp = 1;
                goto invalid_fld;
        }

        if (is_multi_taskfile(tf)) {
                /* top three bits of cdb[1] are used as a power-of-two exponent */
                unsigned int multi_count = 1 << (cdb[1] >> 5);

                /* compare the passed through multi_count
                 * with the cached multi_count of libata
                 *
                 * A mismatch is only warned about; the command proceeds.
                 */
                if (multi_count != dev->multi_count)
                        ata_dev_warn(dev, "invalid multi_count %u ignored\n",
                                     multi_count);
        }

        /*
         * Filter SET_FEATURES - XFER MODE command -- otherwise,
         * SET_FEATURES - XFER MODE must be preceded/succeeded
         * by an update to hardware-specific registers for each
         * controller (i.e. the reason for ->set_piomode(),
         * ->set_dmamode(), and ->post_set_mode() hooks).
         */
        if (tf->command == ATA_CMD_SET_FEATURES &&
            tf->feature == SETFEATURES_XFER) {
                /* NOTE(review): only ATA_16 vs. "everything else" is
                 * distinguished here, so the 32-byte form reports 3 even
                 * though its feature byte was read from cdb[21] - confirm.
                 */
                fp = (cdb[0] == ATA_16) ? 4 : 3;
                goto invalid_fld;
        }

        /*
         * Filter TPM commands by default. These provide an
         * essentially uncontrolled encrypted "back door" between
         * applications and the disk. Set libata.allow_tpm=1 if you
         * have a real reason for wanting to use them. This ensures
         * that installed software cannot easily mess stuff up without
         * user intent. DVR type users will probably ship with this enabled
         * for movie content management.
         *
         * Note that for ATA8 we can issue a DCS change and DCS freeze lock
         * for this and should do in future but that it is not sufficient as
         * DCS is an optional feature set. Thus we also do the software filter
         * so that we comply with the TC consortium stated goal that the user
         * can turn off TC features of their system.
         */
        if (tf->command >= 0x5C && tf->command <= 0x5F && !libata_allow_tpm) {
                /* NOTE(review): as above, the 32-byte form reports 9 even
                 * though its command byte was read from cdb[25] - confirm.
                 */
                fp = (cdb[0] == ATA_16) ? 14 : 9;
                goto invalid_fld;
        }

        return 0;

 invalid_fld:
        /* report the rejected field; 0xff is passed as the bit pointer argument */
        ata_scsi_set_invalid_field(dev, scmd, fp, 0xff);
        return 1;
}

/**
 * ata_format_dsm_trim_descr() - SATL Write Same to DSM Trim
 * @cmd: SCSI command being translated
 * @trmax: Maximum number of entries that will fit in sector_size bytes.
 * @sector: Starting sector
 * @count: Total Range of request in logical sectors
 *
 * Rewrite the WRITE SAME descriptor to be a DSM TRIM little-endian formatted
 * descriptor.
 *
 * Up to 64 entries of the format:
 *   63:48 Range Length
 *   47:0  LBA
 *
 *  Range Length of 0 is ignored.
 *  LBA's should be sorted order and not overlap.
 *
 * NOTE: this is the same format as ADD LBA(S) TO NV CACHE PINNED SET
 *
 * The payload is built in the shared ata_scsi_rbuf under ata_scsi_rbuf_lock
 * and then copied into the command's scatter/gather list.  At most
 * ATA_SCSI_RBUF_SIZE bytes are produced; a larger sector size triggers a
 * WARN_ON and is clamped, and the number of entries is clamped with it.
 *
 * Return: Number of bytes copied into sglist.
 */
static size_t ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 trmax,
                                        u64 sector, u32 count)
{
        struct scsi_device *sdp = cmd->device;
        size_t len = sdp->sector_size;
        size_t r;
        __le64 *buf;
        u32 i = 0;
        unsigned long flags;

        WARN_ON(len > ATA_SCSI_RBUF_SIZE);

        if (len > ATA_SCSI_RBUF_SIZE)
                len = ATA_SCSI_RBUF_SIZE;

        /*
         * Only len bytes are cleared and copied out below, so never fill
         * more entries than fit in len bytes.  Without this bound a
         * clamped len combined with the caller's trmax would let the loop
         * write past the region being used (and potentially past the
         * buffer itself).
         */
        if (trmax > len / sizeof(__le64))
                trmax = len / sizeof(__le64);

        spin_lock_irqsave(&ata_scsi_rbuf_lock, flags);
        buf = ((void *)ata_scsi_rbuf);
        memset(buf, 0, len);
        while (i < trmax) {
                /* each entry covers at most 0xffff sectors */
                u64 entry = sector |
                        ((u64)(count > 0xffff ? 0xffff : count) << 48);
                buf[i++] = __cpu_to_le64(entry);
                if (count <= 0xffff)
                        break;
                count -= 0xffff;
                sector += 0xffff;
        }
        r = sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, len);
        spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags);

        return r;
}

/**
 * ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same
 * @qc: Command to be translated
 *
 * Translate a SCSI WRITE SAME command to be either a DSM TRIM command or
 * an SCT Write Same command.
 * Based on WRITE SAME has the UNMAP flag:
 *
 *   - When set translate to DSM TRIM
 *   - When clear translate to SCT Write Same
 */
static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
{
        struct ata_taskfile *tf = &qc->tf;
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct scsi_device *sdp = scmd->device;
        size_t len = sdp->sector_size;
        struct ata_device *dev = qc->dev;
        const u8 *cdb = scmd->cmnd;
        u64 block;
        u32 n_block;
        const u32 trmax = len >> 3;
        u32 size;
        u16 fp;
        u8 bp = 0xff;
        u8 unmap = cdb[1] & 0x8;

        /* we may not issue DMA commands if no DMA mode is set */
        if (unlikely(!dev->dma_mode))
                goto invalid_opcode;

        /*
         * We only allow sending this command through the block layer,
         * as it modifies the DATA OUT buffer, which would corrupt user
         * memory for SG_IO commands.
         */
        if (unlikely(blk_rq_is_passthrough(scmd->request)))
                goto invalid_opcode;

        if (unlikely(scmd->cmd_len < 16)) {
                fp = 15;
                goto invalid_fld;
        }
        scsi_16_lba_len(cdb, &block, &n_block);

        if (!unmap ||
            (dev->horkage & ATA_HORKAGE_NOTRIM) ||
            !ata_id_has_trim(dev->id)) {
                fp = 1;
                bp = 3;
                goto invalid_fld;
        }
        /* If the request is too large the cmd is invalid */
        if (n_block > 0xffff * trmax) {
                fp = 2;
                goto invalid_fld;
        }

        /*
         * WRITE SAME always has a sector sized buffer as payload, this
         * should never be a multiple entry S/G list.
         */
        if (!scsi_sg_count(scmd))
                goto invalid_param_len;

        /*
         * size must match sector size in bytes
         * For DATA SET MANAGEMENT TRIM in ACS-2 nsect (aka count)
         * is defined as number of 512 byte blocks to be transferred.
         */

        size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
        if (size != len)
                goto invalid_param_len;

        if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
                /* Newer devices support queued TRIM commands */
                tf->protocol = ATA_PROT_NCQ;
                tf->command = ATA_CMD_FPDMA_SEND;
                tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
                tf->nsect = qc->hw_tag << 3;
                tf->hob_feature = (size / 512) >> 8;
                tf->feature = size / 512;

                tf->auxiliary = 1;
        } else {
                tf->protocol = ATA_PROT_DMA;
                tf->hob_feature = 0;
                tf->feature = ATA_DSM_TRIM;
                tf->hob_nsect = (size / 512) >> 8;
                tf->nsect = size / 512;
                tf->command = ATA_CMD_DSM;
        }

        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
                     ATA_TFLAG_WRITE;

        ata_qc_set_pc_nbytes(qc);

        return 0;

invalid_fld:
        ata_scsi_set_invalid_field(dev, scmd, fp, bp);
        return 1;
invalid_param_len:
        /* "Parameter list length error" */
        ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
        return 1;
invalid_opcode:
        /* "Invalid command operation code" */
        ata_scsi_set_sense(dev, scmd, ILLEGAL_REQUEST, 0x20, 0x0);
        return 1;
}

/**
 *        ata_scsiop_maint_in - Simulate a subset of MAINTENANCE_IN
 *        @args: device MAINTENANCE_IN data / SCSI command of interest.
 *        @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
 *
 *        Yields a subset to satisfy scsi_report_opcode()
 *
 *        Byte 2 of the CDB must be 1; byte 3 is the opcode being asked
 *        about.  Byte 1 of @rbuf is set to 3 when that opcode is handled
 *        for this device and to 0 otherwise.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero on success, 2 when byte 2 of the CDB is not 1.
 */
static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
{
        struct ata_device *dev = args->dev;
        u8 *cdb = args->cmd->cmnd;
        u8 supported;

        if (cdb[2] != 1) {
                ata_dev_warn(dev, "invalid command format %d\n", cdb[2]);
                rbuf[1] = 0; /* not supported */
                return 2;
        }

        switch (cdb[3]) {
        /* opcodes handled for every device */
        case INQUIRY:
        case MODE_SENSE:
        case MODE_SENSE_10:
        case READ_CAPACITY:
        case SERVICE_ACTION_IN_16:
        case REPORT_LUNS:
        case REQUEST_SENSE:
        case SYNCHRONIZE_CACHE:
        case SYNCHRONIZE_CACHE_16:
        case REZERO_UNIT:
        case SEEK_6:
        case SEEK_10:
        case TEST_UNIT_READY:
        case SEND_DIAGNOSTIC:
        case MAINTENANCE_IN:
        case READ_6:
        case READ_10:
        case READ_16:
        case WRITE_6:
        case WRITE_10:
        case WRITE_16:
        case ATA_12:
        case ATA_16:
        case VERIFY:
        case VERIFY_16:
        case MODE_SELECT:
        case MODE_SELECT_10:
        case START_STOP:
                supported = 3;
                break;

        /* zoned commands: only for zoned-capable or ZAC devices */
        case ZBC_IN:
        case ZBC_OUT:
                supported = (ata_id_zoned_cap(dev->id) ||
                             dev->class == ATA_DEV_ZAC) ? 3 : 0;
                break;

        /* security protocol commands: only with ATA_DFLAG_TRUSTED */
        case SECURITY_PROTOCOL_IN:
        case SECURITY_PROTOCOL_OUT:
                supported = (dev->flags & ATA_DFLAG_TRUSTED) ? 3 : 0;
                break;

        default:
                supported = 0;
                break;
        }

        rbuf[1] = supported; /* supported */
        return 0;
}

/**
 *        ata_scsi_report_zones_complete - convert ATA output
 *        @qc: command structure returning the data
 *
 *        Convert T-13 little-endian field representation into
 *        T-10 big-endian field representation.
 *        What a mess.
 *
 *        The data buffer of @qc->scsicmd is rewritten in place: the
 *        first 64 bytes are treated as the report header and every
 *        following 64 byte unit as one zone descriptor.  Once the
 *        whole scatterlist has been walked, the command is finished
 *        through ata_scsi_qc_complete().
 */
static void ata_scsi_report_zones_complete(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct sg_mapping_iter miter;
        unsigned long flags;
        unsigned int bytes = 0;         /* total bytes converted so far */

        sg_miter_start(&miter, scsi_sglist(scmd), scsi_sg_count(scmd),
                       SG_MITER_TO_SG | SG_MITER_ATOMIC);

        /* The iterator is used in atomic mode; keep local IRQs off meanwhile. */
        local_irq_save(flags);
        while (sg_miter_next(&miter)) {
                unsigned int offset = 0;        /* position inside this chunk */

                /* Nothing converted yet: this chunk starts with the header. */
                if (bytes == 0) {
                        char *hdr;
                        u32 list_length;
                        u64 max_lba, opt_lba;
                        u16 same;

                        /* Swizzle header */
                        hdr = miter.addr;
                        /* Read every LE field first, then overwrite in place. */
                        list_length = get_unaligned_le32(&hdr[0]);
                        same = get_unaligned_le16(&hdr[4]);
                        max_lba = get_unaligned_le64(&hdr[8]);
                        opt_lba = get_unaligned_le64(&hdr[16]);
                        put_unaligned_be32(list_length, &hdr[0]);
                        /* Only the low nibble of the 16 bit field is kept. */
                        hdr[4] = same & 0xf;
                        put_unaligned_be64(max_lba, &hdr[8]);
                        put_unaligned_be64(opt_lba, &hdr[16]);
                        offset += 64;
                        bytes += 64;
                }
                /* The rest of the chunk is a sequence of 64 byte descriptors. */
                while (offset < miter.length) {
                        char *rec;
                        u8 cond, type, non_seq, reset;
                        u64 size, start, wp;

                        /* Swizzle zone descriptor */
                        rec = miter.addr + offset;
                        type = rec[0] & 0xf;
                        cond = (rec[1] >> 4) & 0xf;
                        non_seq = (rec[1] & 2);
                        reset = (rec[1] & 1);
                        size = get_unaligned_le64(&rec[8]);
                        start = get_unaligned_le64(&rec[16]);
                        wp = get_unaligned_le64(&rec[24]);
                        /* Bytes 0 and 1 are rebuilt from the extracted bit fields. */
                        rec[0] = type;
                        rec[1] = (cond << 4) | non_seq | reset;
                        put_unaligned_be64(size, &rec[8]);
                        put_unaligned_be64(start, &rec[16]);
                        put_unaligned_be64(wp, &rec[24]);
                        /*
                         * NOTE(review): this only warns, and only after the
                         * descriptor was already rewritten, when a descriptor
                         * straddles the end of the mapped chunk.
                         */
                        WARN_ON(offset + 64 > miter.length);
                        offset += 64;
                        bytes += 64;
                }
        }
        sg_miter_stop(&miter);
        local_irq_restore(flags);

        ata_scsi_qc_complete(qc);
}

static unsigned int ata_scsi_zbc_in_xlat(struct ata_queued_cmd *qc)
{
        struct ata_taskfile *tf = &qc->tf;
        struct scsi_cmnd *scmd = qc->scsicmd;
        const u8 *cdb = scmd->cmnd;
        u16 sect, fp = (u16)-1;
        u8 sa, options, bp = 0xff;
        u64 block;
        u32 n_block;

        if (unlikely(scmd->cmd_len < 16)) {
                ata_dev_warn(qc->dev, "invalid cdb length %d\n",
                             scmd->cmd_len);
                fp = 15;
                goto invalid_fld;
        }
        scsi_16_lba_len(cdb, &block, &n_block);
        if (n_block != scsi_bufflen(scmd)) {
                ata_dev_warn(qc->dev, "non-matching transfer count (%d/%d)\n",
                             n_block, scsi_bufflen(scmd));
                goto invalid_param_len;
        }
        sa = cdb[1] & 0x1f;
        if (sa != ZI_REPORT_ZONES) {
                ata_dev_warn(qc->dev, "invalid service action %d\n", sa);
                fp = 1;
                goto invalid_fld;
        }
        /*
         * ZAC allows only for transfers in 512 byte blocks,
         * and uses a 16 bit value for the transfer count.
         */
        if ((n_block / 512) > 0xffff || n_block < 512 || (n_block % 512)) {
                ata_dev_warn(qc->dev, "invalid transfer count %d\n", n_block);
                goto invalid_param_len;
        }
        sect = n_block / 512;
        options = cdb[14] & 0xbf;

        if (ata_ncq_enabled(qc->dev) &&
            ata_fpdma_zac_mgmt_in_supported(qc->dev)) {
                tf->protocol = ATA_PROT_NCQ;
                tf->command = ATA_CMD_FPDMA_RECV;
                tf->hob_nsect = ATA_SUBCMD_FPDMA_RECV_ZAC_MGMT_IN & 0x1f;
                tf->nsect = qc->hw_tag << 3;
                tf->feature = sect & 0xff;
                tf->hob_feature = (sect >> 8) & 0xff;
                tf->auxiliary = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES | (options << 8);
        } else {
                tf->command = ATA_CMD_ZAC_MGMT_IN;
                tf->feature = ATA_SUBCMD_ZAC_MGMT_IN_REPORT_ZONES;
                tf->protocol = ATA_PROT_DMA;
                tf->hob_feature = options;
                tf->hob_nsect = (sect >> 8) & 0xff;
                tf->nsect = sect & 0xff;
        }
        tf->device = ATA_LBA;
        tf->lbah = (block >> 16) & 0xff;
        tf->lbam = (block >> 8) & 0xff;
        tf->lbal = block & 0xff;
        tf->hob_lbah = (block >> 40) & 0xff;
        tf->hob_lbam = (block >> 32) & 0xff;
        tf->hob_lbal = (block >> 24) & 0xff;

        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48;
        qc->flags |= ATA_QCFLAG_RESULT_TF;

        ata_qc_set_pc_nbytes(qc);

        qc->complete_fn = ata_scsi_report_zones_complete;

        return 0;

invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, fp, bp);
        return 1;

invalid_param_len:
        /* "Parameter list length error" */
        ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
        return 1;
}

static unsigned int ata_scsi_zbc_out_xlat(struct ata_queued_cmd *qc)
{
        struct ata_taskfile *tf = &qc->tf;
        struct scsi_cmnd *scmd = qc->scsicmd;
        struct ata_device *dev = qc->dev;
        const u8 *cdb = scmd->cmnd;
        u8 all, sa;
        u64 block;
        u32 n_block;
        u16 fp = (u16)-1;

        if (unlikely(scmd->cmd_len < 16)) {
                fp = 15;
                goto invalid_fld;
        }

        sa = cdb[1] & 0x1f;
        if ((sa != ZO_CLOSE_ZONE) && (sa != ZO_FINISH_ZONE) &&
            (sa != ZO_OPEN_ZONE) && (sa != ZO_RESET_WRITE_POINTER)) {
                fp = 1;
                goto invalid_fld;
        }

        scsi_16_lba_len(cdb, &block, &n_block);
        if (n_block) {
                /*
                 * ZAC MANAGEMENT OUT doesn't define any length
                 */
                goto invalid_param_len;
        }

        all = cdb[14] & 0x1;
        if (all) {
                /*
                 * Ignore the block address (zone ID) as defined by ZBC.
                 */
                block = 0;
        } else if (block >= dev->n_sectors) {
                /*
                 * Block must be a valid zone ID (a zone start LBA).
                 */
                fp = 2;
                goto invalid_fld;
        }

        if (ata_ncq_enabled(qc->dev) &&
            ata_fpdma_zac_mgmt_out_supported(qc->dev)) {
                tf->protocol = ATA_PROT_NCQ_NODATA;
                tf->command = ATA_CMD_NCQ_NON_DATA;
                tf->feature = ATA_SUBCMD_NCQ_NON_DATA_ZAC_MGMT_OUT;
                tf->nsect = qc->hw_tag << 3;
                tf->auxiliary = sa | ((u16)all << 8);
        } else {
                tf->protocol = ATA_PROT_NODATA;
                tf->command = ATA_CMD_ZAC_MGMT_OUT;
                tf->feature = sa;
                tf->hob_feature = all;
        }
        tf->lbah = (block >> 16) & 0xff;
        tf->lbam = (block >> 8) & 0xff;
        tf->lbal = block & 0xff;
        tf->hob_lbah = (block >> 40) & 0xff;
        tf->hob_lbam = (block >> 32) & 0xff;
        tf->hob_lbal = (block >> 24) & 0xff;
        tf->device = ATA_LBA;
        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48;

        return 0;

 invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
        return 1;
invalid_param_len:
        /* "Parameter list length error" */
        ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
        return 1;
}

/**
 *        ata_mselect_caching - Simulate MODE SELECT for caching info page
 *        @qc: Storage for translated ATA taskfile
 *        @buf: input buffer (page contents without the two header bytes)
 *        @len: number of valid bytes in the input buffer
 *        @fp: out parameter for the failed field on error
 *
 *        Validate the page against the one built by ata_msense_caching()
 *        and prepare a SET FEATURES taskfile that turns the write cache
 *        on or off according to bit 2 of the first byte of @buf.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Zero on success, -EINVAL with *@fp set when the length is wrong
 *        or any byte other than the first differs from the current page.
 */
static int ata_mselect_caching(struct ata_queued_cmd *qc,
                               const u8 *buf, int len, u16 *fp)
{
        struct ata_device *dev = qc->dev;
        struct ata_taskfile *tf = &qc->tf;
        u8 cur_page[CACHE_MPAGE_LEN];
        int pos;

        /*
         * cur_page carries a two byte header that @buf lacks, so byte
         * pos of @buf corresponds to cur_page[pos + 2], and the expected
         * length is two less than CACHE_MPAGE_LEN.
         */
        if (len < CACHE_MPAGE_LEN - 2) {
                *fp = len;
                return -EINVAL;
        }
        if (len > CACHE_MPAGE_LEN - 2) {
                *fp = CACHE_MPAGE_LEN - 2;
                return -EINVAL;
        }

        /*
         * Byte 0 is the only one allowed to differ; every other byte
         * must equal what the device currently reports.
         */
        ata_msense_caching(dev->id, cur_page, false);
        for (pos = 1; pos < CACHE_MPAGE_LEN - 2; pos++) {
                if (buf[pos] == cur_page[pos + 2])
                        continue;
                *fp = pos;
                return -EINVAL;
        }

        tf->command = ATA_CMD_SET_FEATURES;
        if (buf[0] & (1 << 2))
                tf->feature = SETFEATURES_WC_ON;
        else
                tf->feature = SETFEATURES_WC_OFF;
        tf->nsect = 0;
        tf->protocol = ATA_PROT_NODATA;
        tf->flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
        return 0;
}

/**
 *        ata_mselect_control - Simulate MODE SELECT for control page
 *        @qc: Command whose device is being configured
 *        @buf: input buffer (page contents without the two header bytes)
 *        @len: number of valid bytes in the input buffer
 *        @fp: out parameter for the failed field on error
 *
 *        Validate the page against the one built by ata_msense_control()
 *        and set or clear ATA_DFLAG_D_SENSE on the device according to
 *        bit 2 of the first byte of @buf.  No taskfile is prepared.
 *
 *        LOCKING:
 *        None.
 *
 *        RETURNS:
 *        Zero on success, -EINVAL with *@fp set when the length is wrong
 *        or any byte other than the first differs from the current page.
 */
static int ata_mselect_control(struct ata_queued_cmd *qc,
                               const u8 *buf, int len, u16 *fp)
{
        struct ata_device *dev = qc->dev;
        u8 cur_page[CONTROL_MPAGE_LEN];
        int pos;

        /*
         * cur_page carries a two byte header that @buf lacks, so byte
         * pos of @buf corresponds to cur_page[pos + 2], and the expected
         * length is two less than CONTROL_MPAGE_LEN.
         */
        if (len < CONTROL_MPAGE_LEN - 2) {
                *fp = len;
                return -EINVAL;
        }
        if (len > CONTROL_MPAGE_LEN - 2) {
                *fp = CONTROL_MPAGE_LEN - 2;
                return -EINVAL;
        }

        /*
         * Byte 0 is the only one allowed to differ; every other byte
         * must equal what the device currently reports.
         */
        ata_msense_control(dev, cur_page, false);
        for (pos = 1; pos < CONTROL_MPAGE_LEN - 2; pos++) {
                if (buf[pos] == cur_page[2 + pos])
                        continue;
                *fp = pos;
                return -EINVAL;
        }

        if (buf[0] & (1 << 2))
                dev->flags |= ATA_DFLAG_D_SENSE;
        else
                dev->flags &= ~ATA_DFLAG_D_SENSE;
        return 0;
}

/**
 *        ata_scsi_mode_select_xlat - Simulate MODE SELECT 6, 10 commands
 *        @qc: Storage for translated ATA taskfile
 *
 *        Converts a MODE SELECT command to an ATA SET FEATURES taskfile.
 *        Assume this is invoked for direct access devices (e.g. disks) only.
 *        There should be no block descriptor for other device types.
 *
 *        Up to 64 bytes of the parameter list are copied to a local
 *        buffer and parsed: mode parameter header, an optional 8 byte
 *        block descriptor, then a single mode page.  Only the caching
 *        and control pages are accepted.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        Zero when a taskfile has been prepared.  One otherwise: either
 *        sense data has been set on the command, or the request needed
 *        no ATA command and its result was set to SAM_STAT_GOOD.
 */
static unsigned int ata_scsi_mode_select_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        const u8 *cdb = scmd->cmnd;
        u8 pg, spg;
        unsigned six_byte, pg_len, hdr_len, bd_len;
        int len;
        u16 fp = (u16)-1;
        u8 bp = 0xff;
        u8 buffer[64];
        const u8 *p = buffer;

        /* The 6 and 10 byte CDBs differ in length field and header size. */
        six_byte = (cdb[0] == MODE_SELECT);
        if (six_byte) {
                if (scmd->cmd_len < 5) {
                        fp = 4;
                        goto invalid_fld;
                }

                len = cdb[4];
                hdr_len = 4;
        } else {
                if (scmd->cmd_len < 9) {
                        fp = 8;
                        goto invalid_fld;
                }

                len = (cdb[7] << 8) + cdb[8];
                hdr_len = 8;
        }

        /* We only support PF=1, SP=0.  */
        if ((cdb[1] & 0x11) != 0x10) {
                fp = 1;
                bp = (cdb[1] & 0x01) ? 1 : 5;
                goto invalid_fld;
        }

        /* Test early for possible overrun.  */
        if (!scsi_sg_count(scmd) || scsi_sglist(scmd)->length < len)
                goto invalid_param_len;

        /* Move past header and block descriptors.  */
        if (len < hdr_len)
                goto invalid_param_len;

        if (!sg_copy_to_buffer(scsi_sglist(scmd), scsi_sg_count(scmd),
                               buffer, sizeof(buffer)))
                goto invalid_param_len;

        if (six_byte)
                bd_len = p[3];
        else
                bd_len = (p[6] << 8) + p[7];

        len -= hdr_len;
        p += hdr_len;
        if (len < bd_len)
                goto invalid_param_len;
        if (bd_len != 0 && bd_len != 8) {
                fp = (six_byte) ? 3 : 6;
                fp += bd_len + hdr_len;
                goto invalid_param;
        }

        len -= bd_len;
        p += bd_len;
        /* No mode page present: nothing to change. */
        if (len == 0)
                goto skip;

        /* Parse both possible formats for the mode page headers.  */
        pg = p[0] & 0x3f;
        if (p[0] & 0x40) {
                /* Sub-page format: 4 byte page header. */
                if (len < 4)
                        goto invalid_param_len;

                spg = p[1];
                pg_len = (p[2] << 8) | p[3];
                p += 4;
                len -= 4;
        } else {
                /* Page_0 format: 2 byte page header, no subpage code. */
                if (len < 2)
                        goto invalid_param_len;

                spg = 0;
                pg_len = p[1];
                p += 2;
                len -= 2;
        }

        /*
         * No mode subpages supported (yet) but asking for _all_
         * subpages may be valid
         */
        if (spg && (spg != ALL_SUB_MPAGES)) {
                /*
                 * A non-zero spg can only come from the sub-page format
                 * branch above, where the subpage code is byte 1 of the
                 * page.  p already points past the page header here, so
                 * it must not be used to re-test the format bit.
                 */
                fp = 1 + hdr_len + bd_len;
                goto invalid_param;
        }
        if (pg_len > len)
                goto invalid_param_len;

        switch (pg) {
        case CACHE_MPAGE:
                if (ata_mselect_caching(qc, p, pg_len, &fp) < 0) {
                        /* fp is page relative; make it list relative. */
                        fp += hdr_len + bd_len;
                        goto invalid_param;
                }
                break;
        case CONTROL_MPAGE:
                if (ata_mselect_control(qc, p, pg_len, &fp) < 0) {
                        /* fp is page relative; make it list relative. */
                        fp += hdr_len + bd_len;
                        goto invalid_param;
                } else {
                        goto skip; /* No ATA command to send */
                }
                break;
        default:                /* invalid page code */
                fp = bd_len + hdr_len;
                goto invalid_param;
        }

        /*
         * Only one page has changeable data, so we only support setting one
         * page at a time.
         */
        if (len > pg_len)
                goto invalid_param;

        return 0;

 invalid_fld:
        ata_scsi_set_invalid_field(qc->dev, scmd, fp, bp);
        return 1;

 invalid_param:
        ata_scsi_set_invalid_parameter(qc->dev, scmd, fp);
        return 1;

 invalid_param_len:
        /* "Parameter list length error" */
        ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
        return 1;

 skip:
        scmd->result = SAM_STAT_GOOD;
        return 1;
}

/*
 * Pick the ATA trusted command opcode: the non-data command when @len
 * is zero, otherwise the send or receive command (per @send) in its DMA
 * or non-DMA form (per @dma).
 */
static u8 ata_scsi_trusted_op(u32 len, bool send, bool dma)
{
        if (!len)
                return ATA_CMD_TRUSTED_NONDATA;

        if (dma)
                return send ? ATA_CMD_TRUSTED_SND_DMA : ATA_CMD_TRUSTED_RCV_DMA;

        return send ? ATA_CMD_TRUSTED_SND : ATA_CMD_TRUSTED_RCV;
}

/*
 * Translate SECURITY PROTOCOL IN/OUT into an ATA trusted command.
 *
 * The security protocol (cdb[1]) goes into the feature register, the
 * protocol specific field (cdb[2..3]) into lbam/lbah, and the transfer
 * length, expressed in 512 byte units, into nsect/lbal.
 *
 * Returns 0 when the taskfile has been filled in, 1 after an invalid
 * field has been reported on the command.
 */
static unsigned int ata_scsi_security_inout_xlat(struct ata_queued_cmd *qc)
{
        struct scsi_cmnd *scmd = qc->scsicmd;
        const u8 *cdb = scmd->cmnd;
        struct ata_taskfile *tf = &qc->tf;
        u8 secp = cdb[1];
        bool send = (cdb[0] == SECURITY_PROTOCOL_OUT);
        u16 spsp = get_unaligned_be16(&cdb[2]);
        u32 len = get_unaligned_be32(&cdb[6]);
        bool dma = !(qc->dev->flags & ATA_DFLAG_PIO);

        /*
         * We don't support the ATA "security" protocol.
         */
        if (secp == 0xef) {
                ata_scsi_set_invalid_field(qc->dev, scmd, 1, 0);
                return 1;
        }

        /*
         * INC_512 is bit 7 of CDB byte 4; the remaining bits of that
         * byte are reserved.  When it is set the length is already in
         * 512 byte units, otherwise it is in bytes.
         */
        if (cdb[4] & (1 << 7)) { /* INC_512 */
                if (len > 0xffff) {
                        ata_scsi_set_invalid_field(qc->dev, scmd, 6, 0);
                        return 1;
                }
        } else {
                /* 0x01fffe00 == 0xffff * 512, the largest byte count. */
                if (len > 0x01fffe00) {
                        ata_scsi_set_invalid_field(qc->dev, scmd, 6, 0);
                        return 1;
                }

                /* convert to the sector-based ATA addressing */
                len = (len + 511) / 512;
        }

        tf->protocol = dma ? ATA_PROT_DMA : ATA_PROT_PIO;
        tf->flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR | ATA_TFLAG_LBA;
        if (send)
                tf->flags |= ATA_TFLAG_WRITE;
        tf->command = ata_scsi_trusted_op(len, send, dma);
        tf->feature = secp;
        tf->lbam = spsp & 0xff;
        tf->lbah = spsp >> 8;

        if (len) {
                tf->nsect = len & 0xff;
                tf->lbal = len >> 8;
        } else {
                /* Zero-length receive: lbah is overridden with bit 7 set. */
                if (!send)
                        tf->lbah = (1 << 7);
        }

        ata_qc_set_pc_nbytes(qc);
        return 0;
}

/**
 *        ata_scsi_var_len_cdb_xlat - SATL variable length CDB to Handler
 *        @qc: Command to be translated
 *
 *        Dispatch a SCSI variable length CDB on the big-endian service
 *        action stored at bytes 8-9.  ATA pass-thru(32) is the only
 *        service action handled.
 *
 *        RETURNS:
 *        Zero on success, non-zero on failure
 *
 */
static unsigned int ata_scsi_var_len_cdb_xlat(struct ata_queued_cmd *qc)
{
        const u8 *cdb = qc->scsicmd->cmnd;
        const u16 service_action = get_unaligned_be16(&cdb[8]);

        /* unsupported service action */
        if (service_action != ATA_32)
                return 1;

        /* ATA pass-thru(32) shares the handler of the shorter pass-thrus */
        return ata_scsi_pass_thru(qc);
}

/**
 *        ata_get_xlat_func - check if SCSI to ATA translation is possible
 *        @dev: ATA device
 *        @cmd: SCSI command opcode to consider
 *
 *        Map a SCSI opcode to the function that translates it into an
 *        ATA taskfile.  Opcodes without a translator are left to the
 *        caller, which simulates them instead.
 *
 *        RETURNS:
 *        Pointer to translation function if possible, %NULL if not.
 */

static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
{
        switch (cmd) {
        case READ_6:
        case READ_10:
        case READ_16:
        case WRITE_6:
        case WRITE_10:
        case WRITE_16:
                return ata_scsi_rw_xlat;

        case WRITE_SAME_16:
                return ata_scsi_write_same_xlat;

        case VERIFY:
        case VERIFY_16:
                return ata_scsi_verify_xlat;

        case SYNCHRONIZE_CACHE:
        case SYNCHRONIZE_CACHE_16:
                /* Translated only when ata_try_flush_cache() says so. */
                if (!ata_try_flush_cache(dev))
                        return NULL;
                return ata_scsi_flush_xlat;

        case START_STOP:
                return ata_scsi_start_stop_xlat;

        case MODE_SELECT:
        case MODE_SELECT_10:
                return ata_scsi_mode_select_xlat;

        case ATA_12:
        case ATA_16:
                return ata_scsi_pass_thru;

        case VARIABLE_LENGTH_CMD:
                return ata_scsi_var_len_cdb_xlat;

        case ZBC_IN:
                return ata_scsi_zbc_in_xlat;

        case ZBC_OUT:
                return ata_scsi_zbc_out_xlat;

        case SECURITY_PROTOCOL_IN:
        case SECURITY_PROTOCOL_OUT:
                /* Translated only for devices flagged as trusted. */
                if (!(dev->flags & ATA_DFLAG_TRUSTED))
                        return NULL;
                return ata_scsi_security_inout_xlat;

        default:
                return NULL;
        }
}

/*
 * Validate the CDB length of @scmd for @dev, pick a translation
 * function and either translate the command or simulate it.
 *
 * Returns SCSI_MLQUEUE_DEVICE_BUSY while error handling is pending or
 * in progress on the port, the result of ata_scsi_translate() when a
 * translation function exists, and 0 otherwise (command simulated, or
 * completed with DID_ERROR because of a bad CDB length).
 */
int __ata_scsi_queuecmd(struct scsi_cmnd *scmd, struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        u8 scsi_op = scmd->cmnd[0];
        ata_xlat_func_t xlat;

        /*
         * scsi_queue_rq() will defer commands if scsi_host_in_recovery().
         * However, this check is done without holding the ap->lock (a libata
         * specific lock), so we can have received an error irq since then,
         * therefore we must check if EH is pending, while holding ap->lock.
         */
        if (ap->pflags & (ATA_PFLAG_EH_PENDING | ATA_PFLAG_EH_IN_PROGRESS))
                return SCSI_MLQUEUE_DEVICE_BUSY;

        if (unlikely(!scmd->cmd_len))
                goto bad_cdb_len;

        if (dev->class == ATA_DEV_ATA || dev->class == ATA_DEV_ZAC) {
                /* ATA/ZAC device: bounded by the device's CDB length */
                if (unlikely(scmd->cmd_len > dev->cdb_len))
                        goto bad_cdb_len;

                xlat = ata_get_xlat_func(dev, scsi_op);
        } else if (unlikely(scsi_op == ATA_16 && atapi_passthru16)) {
                /* ATA_16 passthru, treat as an ATA command */
                if (unlikely(scmd->cmd_len > 16))
                        goto bad_cdb_len;

                xlat = ata_get_xlat_func(dev, scsi_op);
        } else {
                /* relay SCSI command to ATAPI device */
                int len = COMMAND_SIZE(scsi_op);

                if (unlikely(len > scmd->cmd_len ||
                             len > dev->cdb_len ||
                             scmd->cmd_len > ATAPI_CDB_LEN))
                        goto bad_cdb_len;

                xlat = atapi_xlat;
        }

        /* No translator: the command is emulated and completed here. */
        if (!xlat) {
                ata_scsi_simulate(dev, scmd);
                return 0;
        }

        return ata_scsi_translate(dev, scmd, xlat);

 bad_cdb_len:
        DPRINTK("bad CDB len=%u, scsi_op=0x%02x, max=%u\n",
                scmd->cmd_len, scsi_op, dev->cdb_len);
        scmd->result = DID_ERROR << 16;
        scmd->scsi_done(scmd);
        return 0;
}

/**
 *        ata_scsi_queuecmd - Issue SCSI cdb to libata-managed device
 *        @shost: SCSI host of command to be sent
 *        @cmd: SCSI command to be sent
 *
 *        Entry point that takes the port lock, looks up the ATA device
 *        behind @cmd->device and hands the command to
 *        __ata_scsi_queuecmd(), which either translates it into an ATA
 *        taskfile or simulates it.  A command for which no device is
 *        found is completed with DID_BAD_TARGET.
 *
 *        LOCKING:
 *        ATA host lock
 *
 *        RETURNS:
 *        Return value from __ata_scsi_queuecmd() if @cmd can be queued,
 *        0 otherwise.
 */
int ata_scsi_queuecmd(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
        struct ata_port *ap = ata_shost_to_port(shost);
        struct ata_device *dev;
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(ap->lock, flags);

        dev = ata_scsi_find_dev(ap, cmd->device);
        if (likely(dev)) {
                ret = __ata_scsi_queuecmd(cmd, dev);
        } else {
                /* No such device: fail the command right away. */
                cmd->result = (DID_BAD_TARGET << 16);
                cmd->scsi_done(cmd);
        }

        spin_unlock_irqrestore(ap->lock, flags);

        return ret;
}
EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);

/**
 *        ata_scsi_simulate - simulate SCSI command on ATA device
 *        @dev: the target device
 *        @cmd: SCSI command being sent to device.
 *
 *        Interprets and directly executes a select list of SCSI commands
 *        that can be handled internally.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */

void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
{
        struct ata_scsi_args args;
        const u8 *scsicmd = cmd->cmnd;
        u8 tmp8;

        args.dev = dev;
        args.id = dev->id;
        args.cmd = cmd;

        switch(scsicmd[0]) {
        case INQUIRY:
                if (scsicmd[1] & 2)                   /* is CmdDt set?  */
                        ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
                else if ((scsicmd[1] & 1) == 0)    /* is EVPD clear? */
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_std);
                else switch (scsicmd[2]) {
                case 0x00:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_00);
                        break;
                case 0x80:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_80);
                        break;
                case 0x83:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_83);
                        break;
                case 0x89:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_89);
                        break;
                case 0xb0:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b0);
                        break;
                case 0xb1:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b1);
                        break;
                case 0xb2:
                        ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
                        break;
                case 0xb6:
                        if (dev->flags & ATA_DFLAG_ZAC) {
                                ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6);
                                break;
                        }
                        fallthrough;
                default:
                        ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
                        break;
                }
                break;

        case MODE_SENSE:
        case MODE_SENSE_10:
                ata_scsi_rbuf_fill(&args, ata_scsiop_mode_sense);
                break;

        case READ_CAPACITY:
                ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap);
                break;

        case SERVICE_ACTION_IN_16:
                if ((scsicmd[1] & 0x1f) == SAI_READ_CAPACITY_16)
                        ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap);
                else
                        ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
                break;

        case REPORT_LUNS:
                ata_scsi_rbuf_fill(&args, ata_scsiop_report_luns);
                break;

        case REQUEST_SENSE:
                ata_scsi_set_sense(dev, cmd, 0, 0, 0);
                cmd->result = (DRIVER_SENSE << 24);
                break;

        /* if we reach this, then writeback caching is disabled,
         * turning this into a no-op.
         */
        case SYNCHRONIZE_CACHE:
        case SYNCHRONIZE_CACHE_16:
                fallthrough;

        /* no-op's, complete with success */
        case REZERO_UNIT:
        case SEEK_6:
        case SEEK_10:
        case TEST_UNIT_READY:
                break;

        case SEND_DIAGNOSTIC:
                tmp8 = scsicmd[1] & ~(1 << 3);
                if (tmp8 != 0x4 || scsicmd[3] || scsicmd[4])
                        ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
                break;

        case MAINTENANCE_IN:
                if ((scsicmd[1] & 0x1f) == MI_REPORT_SUPPORTED_OPERATION_CODES)
                        ata_scsi_rbuf_fill(&args, ata_scsiop_maint_in);
                else
                        ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
                break;

        /* all other commands */
        default:
                ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0);
                /* "Invalid command operation code" */
                break;
        }

        cmd->scsi_done(cmd);
}

/*
 * Allocate and register one SCSI host per port of @host, using the
 * host template @sht.  Each SCSI host stores a pointer to its port in
 * its hostdata, and the port stores a pointer back.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (allocation) or the
 * error from scsi_add_host_with_dma(), after removing the SCSI hosts
 * that were registered for the earlier ports.
 */
int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
{
        int idx, rc;

        for (idx = 0; idx < host->n_ports; idx++) {
                struct ata_port *ap = host->ports[idx];
                struct Scsi_Host *shost;

                shost = scsi_host_alloc(sht, sizeof(struct ata_port *));
                if (!shost) {
                        rc = -ENOMEM;
                        goto unwind;
                }

                /* Cross-link the SCSI host and the ATA port. */
                shost->eh_noresume = 1;
                *(struct ata_port **)&shost->hostdata[0] = ap;
                ap->scsi_host = shost;

                shost->transportt = ata_scsi_transport_template;
                shost->unique_id = ap->print_id;
                shost->max_cmd_len = 32;
                shost->max_channel = 1;
                shost->max_id = 16;
                shost->max_lun = 1;

                /* Schedule policy is determined by ->qc_defer()
                 * callback and it needs to see every deferred qc.
                 * Set host_blocked to 1 to prevent SCSI midlayer from
                 * automatically deferring requests.
                 */
                shost->max_host_blocked = 1;

                rc = scsi_add_host_with_dma(shost, &ap->tdev, ap->host->dev);
                if (rc)
                        goto unwind;
        }

        return 0;

 unwind:
        /* Undo only the ports before the failing one, newest first. */
        for (idx--; idx >= 0; idx--) {
                /* scsi_host_put() is in ata_devres_release() */
                scsi_remove_host(host->ports[idx]->scsi_host);
        }
        return rc;
}

#ifdef CONFIG_OF
/*
 * Search the available child nodes of the host device's OF node for
 * one whose "reg" property equals @dev->devno and, if found, record it
 * as the OF node of the SCSI device attached to @dev.
 */
static void ata_scsi_assign_ofnode(struct ata_device *dev, struct ata_port *ap)
{
        struct device *host_dev = ap->host->dev;
        struct device_node *parent = host_dev->of_node;
        struct scsi_device *sdev = dev->sdev;
        struct device_node *node;

        for_each_available_child_of_node(parent, node) {
                u32 reg;

                /* Children without a readable "reg" cannot match. */
                if (of_property_read_u32(node, "reg", &reg))
                        continue;
                if (reg != dev->devno)
                        continue;

                dev_dbg(host_dev, "found matching device node\n");
                sdev->sdev_gendev.of_node = node;
                return;
        }
}
#else
/* Built without CONFIG_OF: there is no OF node to assign, so do nothing. */
static void ata_scsi_assign_ofnode(struct ata_device *dev, struct ata_port *ap)
{
}
#endif

/**
 *        ata_scsi_scan_host - create SCSI devices for enabled ATA devices
 *        @ap: ATA port whose links are scanned
 *        @sync: non-zero to retry in place (sleeping) until every enabled
 *               device has a SCSI device or the retries are exhausted
 *
 *        For every enabled device on @ap that has no SCSI device yet,
 *        add one through __scsi_add_device().  Host-link devices are
 *        addressed by id (dev->devno), devices on other links by channel
 *        (link->pmp); the LUN is always 0.
 *
 *        If some device is still missing afterwards, a synchronous
 *        caller sleeps 100ms and retries; once that stops helping (or
 *        for an asynchronous caller) the port's hotplug_task is queued
 *        to try again later.
 */
void ata_scsi_scan_host(struct ata_port *ap, int sync)
{
        int tries = 5;
        struct ata_device *last_failed_dev = NULL;
        struct ata_link *link;
        struct ata_device *dev;

 repeat:
        ata_for_each_link(link, ap, EDGE) {
                ata_for_each_dev(dev, link, ENABLED) {
                        struct scsi_device *sdev;
                        int channel = 0, id = 0;

                        /* already attached on an earlier pass */
                        if (dev->sdev)
                                continue;

                        if (ata_is_host_link(link))
                                id = dev->devno;
                        else
                                channel = link->pmp;

                        sdev = __scsi_add_device(ap->scsi_host, channel, id, 0,
                                                 NULL);
                        if (!IS_ERR(sdev)) {
                                dev->sdev = sdev;
                                ata_scsi_assign_ofnode(dev, ap);
                                /* dev->sdev keeps the pointer; drop the
                                 * reference obtained from the add call.
                                 */
                                scsi_device_put(sdev);
                        } else {
                                dev->sdev = NULL;
                        }
                }
        }

        /* If we scanned while EH was in progress or allocation
         * failure occurred, scan would have failed silently.  Check
         * whether all devices are attached.
         */
        ata_for_each_link(link, ap, EDGE) {
                ata_for_each_dev(dev, link, ENABLED) {
                        if (!dev->sdev)
                                goto exit_loop;
                }
        }
 exit_loop:
        /* link is only non-NULL here when the loop above was left via
         * the goto, i.e. dev is the first device lacking a SCSI device.
         * NOTE(review): relies on the link iterator leaving link NULL
         * when it runs to completion -- confirm against the macro.
         */
        if (!link)
                return;

        /* we're missing some SCSI devices */
        if (sync) {
                /* If caller requested synchronous scan && we've made
                 * any progress, sleep briefly and repeat.  Progress is
                 * detected by the first missing device being a
                 * different one than on the previous pass.
                 */
                if (dev != last_failed_dev) {
                        msleep(100);
                        last_failed_dev = dev;
                        goto repeat;
                }

                /* We might be failing to detect boot device, give it
                 * a few more chances.
                 */
                if (--tries) {
                        msleep(100);
                        goto repeat;
                }

                ata_port_err(ap,
                             "WARNING: synchronous SCSI scan failed without making any progress, switching to async\n");
        }

        /* asynchronous caller, or synchronous retries exhausted: let
         * the hotplug work retry the scan later.
         */
        queue_delayed_work(system_long_wq, &ap->hotplug_task,
                           round_jiffies_relative(HZ));
}

/**
 *        ata_scsi_offline_dev - offline attached SCSI device
 *        @dev: ATA device whose attached SCSI device is taken offline
 *
 *        Put the SCSI device attached to @dev, if there is one, into
 *        the SDEV_OFFLINE state.  The caller holds the host lock, which
 *        keeps dev->sdev from being cleared while it is used here.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 *
 *        RETURNS:
 *        1 if attached SCSI device exists, 0 otherwise.
 */
int ata_scsi_offline_dev(struct ata_device *dev)
{
        struct scsi_device *attached = dev->sdev;

        if (!attached)
                return 0;

        scsi_device_set_state(attached, SDEV_OFFLINE);
        return 1;
}

/**
 *        ata_scsi_remove_dev - remove attached SCSI device
 *        @dev: ATA device to remove attached SCSI device for
 *
 *        Detach and remove the SCSI device attached to @dev, if any.
 *        Called from ata_scsi_handle_link_detach() on the
 *        ata_scsi_hotplug() path.
 *
 *        dev->sdev is cleared under the host lock; the SCSI device is
 *        pinned with scsi_device_get() and set offline before the locks
 *        are dropped, then removed and released outside of them.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 */
static void ata_scsi_remove_dev(struct ata_device *dev)
{
        struct ata_port *ap = dev->link->ap;
        struct scsi_device *sdev;
        unsigned long flags;

        /* Alas, we need to grab scan_mutex to ensure SCSI device
         * state doesn't change underneath us and thus
         * scsi_device_get() always succeeds.  The mutex locking can
         * be removed if there is __scsi_device_get() interface which
         * increments reference counts regardless of device state.
         */
        mutex_lock(&ap->scsi_host->scan_mutex);
        spin_lock_irqsave(ap->lock, flags);

        /* clearing dev->sdev is protected by host lock */
        sdev = dev->sdev;
        dev->sdev = NULL;

        if (sdev) {
                /* If user initiated unplug races with us, sdev can go
                 * away underneath us after the host lock and
                 * scan_mutex are released.  Hold onto it.
                 */
                if (scsi_device_get(sdev) == 0) {
                        /* The following ensures the attached sdev is
                         * offline on return from ata_scsi_offline_dev()
                         * regardless it wins or loses the race
                         * against this function.
                         */
                        scsi_device_set_state(sdev, SDEV_OFFLINE);
                } else {
                        /* not expected while scan_mutex is held; warn
                         * and skip the removal below
                         */
                        WARN_ON(1);
                        sdev = NULL;
                }
        }

        spin_unlock_irqrestore(ap->lock, flags);
        mutex_unlock(&ap->scsi_host->scan_mutex);

        /* removal is done with both locks released */
        if (sdev) {
                ata_dev_info(dev, "detaching (SCSI %s)\n",
                             dev_name(&sdev->sdev_gendev));

                scsi_remove_device(sdev);
                /* drop the reference taken by scsi_device_get() above */
                scsi_device_put(sdev);
        }
}

/*
 * Process every device on @link that is flagged ATA_DFLAG_DETACHED:
 * clear the flag under the host lock, shut down ZPODD handling if it is
 * enabled for the device, and remove the attached SCSI device.
 */
static void ata_scsi_handle_link_detach(struct ata_link *link)
{
        struct ata_port *port = link->ap;
        struct ata_device *adev;

        ata_for_each_dev(adev, link, ALL) {
                unsigned long irq_flags;

                if (adev->flags & ATA_DFLAG_DETACHED) {
                        /* the flag is modified under the host lock */
                        spin_lock_irqsave(port->lock, irq_flags);
                        adev->flags &= ~ATA_DFLAG_DETACHED;
                        spin_unlock_irqrestore(port->lock, irq_flags);

                        if (zpodd_dev_enabled(adev))
                                zpodd_exit(adev);

                        ata_scsi_remove_dev(adev);
                }
        }
}

/**
 *        ata_scsi_media_change_notify - send media change event
 *        @dev: Pointer to the disk device with media change event
 *
 *        Emit an SDEV_EVT_MEDIA_CHANGE event on the SCSI device
 *        attached to @dev.  Nothing happens when no SCSI device is
 *        attached.  The event is sent with GFP_ATOMIC.
 *
 *        LOCKING:
 *        spin_lock_irqsave(host lock)
 */
void ata_scsi_media_change_notify(struct ata_device *dev)
{
        struct scsi_device *attached = dev->sdev;

        if (!attached)
                return;

        sdev_evt_send_simple(attached, SDEV_EVT_MEDIA_CHANGE, GFP_ATOMIC);
}

/**
 *        ata_scsi_hotplug - SCSI part of hotplug
 *        @work: work item embedded in the ATA port (hotplug_task) to
 *               perform SCSI hotplug on
 *
 *        Perform SCSI part of hotplug.  It's executed from a separate
 *        workqueue after EH completes.  This is necessary because SCSI
 *        hot plugging requires working EH and hot unplugging is
 *        synchronized with hot plugging with a mutex.
 *
 *        Detached devices are removed first, then the port is scanned
 *        asynchronously for new ones; both steps run under the port's
 *        scsi_scan_mutex.  Nothing is done while the port is unloading.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 */
void ata_scsi_hotplug(struct work_struct *work)
{
        struct ata_port *port =
                container_of(work, struct ata_port, hotplug_task.work);
        int pmp_idx;

        if (port->pflags & ATA_PFLAG_UNLOADING) {
                DPRINTK("ENTER/EXIT - unloading\n");
                return;
        }

        DPRINTK("ENTER\n");
        mutex_lock(&port->scsi_scan_mutex);

        /* Unplug detached devices.  The link iterator is not usable
         * here because PMP links have to be scanned even if PMP is
         * currently not attached, so walk the host link and the PMP
         * link array by hand.
         */
        ata_scsi_handle_link_detach(&port->link);
        if (port->pmp_link) {
                for (pmp_idx = 0; pmp_idx < SATA_PMP_MAX_PORTS; pmp_idx++)
                        ata_scsi_handle_link_detach(port->pmp_link + pmp_idx);
        }

        /* scan for new ones */
        ata_scsi_scan_host(port, 0);

        mutex_unlock(&port->scsi_scan_mutex);
        DPRINTK("EXIT\n");
}

/**
 *        ata_scsi_user_scan - indication for user-initiated bus scan
 *        @shost: SCSI host to scan
 *        @channel: Channel to scan
 *        @id: ID to scan
 *        @lun: LUN to scan
 *
 *        This function is called when user explicitly requests bus
 *        scan.  Set probe pending flag and invoke EH, then wait for EH
 *        to finish.
 *
 *        Without a PMP attached the device is selected by @id and
 *        @channel must be 0 or the wildcard; with a PMP attached the
 *        device is selected by @channel and @id must be 0 or the
 *        wildcard.  @lun must be 0 or the wildcard.
 *
 *        LOCKING:
 *        SCSI layer (we don't care)
 *
 *        RETURNS:
 *        Zero on success, -EOPNOTSUPP if the port has no error handler,
 *        -EINVAL if the address is not valid or no such device exists.
 */
int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
                       unsigned int id, u64 lun)
{
        struct ata_port *ap = ata_shost_to_port(shost);
        unsigned long flags;
        int devno;
        int rc = 0;

        if (!ap->ops->error_handler)
                return -EOPNOTSUPP;

        if (lun && lun != SCAN_WILD_CARD)
                return -EINVAL;

        if (sata_pmp_attached(ap)) {
                if (id && id != SCAN_WILD_CARD)
                        return -EINVAL;
                devno = channel;
        } else {
                if (channel && channel != SCAN_WILD_CARD)
                        return -EINVAL;
                devno = id;
        }

        spin_lock_irqsave(ap->lock, flags);

        if (devno != SCAN_WILD_CARD) {
                /* a single device was named: probe only that one */
                struct ata_device *target = ata_find_dev(ap, devno);

                if (!target) {
                        rc = -EINVAL;
                } else {
                        struct ata_eh_info *info = &target->link->eh_info;

                        info->probe_mask |= 1 << target->devno;
                        info->action |= ATA_EH_RESET;
                }
        } else {
                /* wildcard: probe all devices on every link */
                struct ata_link *link;

                ata_for_each_link(link, ap, EDGE) {
                        struct ata_eh_info *info = &link->eh_info;

                        info->probe_mask |= ATA_ALL_DEVICES;
                        info->action |= ATA_EH_RESET;
                }
        }

        /* EH is scheduled under the lock and waited for without it */
        if (rc == 0)
                ata_port_schedule_eh(ap);
        spin_unlock_irqrestore(ap->lock, flags);

        if (rc == 0)
                ata_port_wait_eh(ap);

        return rc;
}

/**
 *        ata_scsi_dev_rescan - initiate scsi_rescan_device()
 *        @work: work item embedded in the ATA port (scsi_rescan_task) to
 *               perform scsi_rescan_device() on
 *
 *        After ATA pass thru (SAT) commands are executed successfully,
 *        libata need to propagate the changes to SCSI layer.
 *
 *        Every enabled device on the port that has a SCSI device
 *        attached is rescanned.  The walk runs under scsi_scan_mutex
 *        and the host lock; the host lock is released around each
 *        scsi_rescan_device() call while a reference on the SCSI device
 *        is held.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 */
void ata_scsi_dev_rescan(struct work_struct *work)
{
        struct ata_port *ap =
                container_of(work, struct ata_port, scsi_rescan_task);
        struct ata_link *link;
        struct ata_device *dev;
        unsigned long flags;

        mutex_lock(&ap->scsi_scan_mutex);
        spin_lock_irqsave(ap->lock, flags);

        ata_for_each_link(link, ap, EDGE) {
                ata_for_each_dev(dev, link, ENABLED) {
                        struct scsi_device *sdev = dev->sdev;

                        /* no SCSI device attached to this ATA device */
                        if (!sdev)
                                continue;
                        /* could not take a reference; skip this device */
                        if (scsi_device_get(sdev))
                                continue;

                        /* drop the host lock for the rescan itself; the
                         * reference taken above keeps sdev alive meanwhile
                         */
                        spin_unlock_irqrestore(ap->lock, flags);
                        scsi_rescan_device(&(sdev->sdev_gendev));
                        scsi_device_put(sdev);
                        spin_lock_irqsave(ap->lock, flags);
                }
        }

        spin_unlock_irqrestore(ap->lock, flags);
        mutex_unlock(&ap->scsi_scan_mutex);
}



































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 *        Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
 * Copyright 2011 Red Hat, Inc., Peter Zijlstra
 */
#ifndef _ASM_GENERIC__TLB_H
#define _ASM_GENERIC__TLB_H

#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or switching
 * the loaded mm.
 *
 * An architecture may define nmi_uaccess_okay() before this point; when it
 * does not, the default below evaluates to true unconditionally.
 */
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif

#ifdef CONFIG_MMU

/*
 * Generic MMU-gather implementation.
 *
 * The mmu_gather data structure is used by the mm code to implement the
 * correct and efficient ordering of freeing pages and TLB invalidations.
 *
 * This correct ordering is:
 *
 *  1) unhook page
 *  2) TLB invalidate page
 *  3) free page
 *
 * That is, we must never free a page before we have ensured there are no live
 * translations left to it. Otherwise it might be possible to observe (or
 * worse, change) the page content after it has been reused.
 *
 * The mmu_gather API consists of:
 *
 *  - tlb_gather_mmu() / tlb_gather_mmu_vma() / tlb_finish_mmu(); start and
 *    finish a mmu_gather
 *
 *    Finish in particular will issue a (final) TLB invalidate and free
 *    all (remaining) queued pages.
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there's large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / __tlb_remove_page()
 *  - tlb_remove_page_size() / __tlb_remove_page_size()
 *
 *    __tlb_remove_page_size() is the basic primitive that queues a page for
 *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
 *    boolean indicating if the queue is (now) full and a call to
 *    tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and have no return value.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                        whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
 *
 * And allows the architecture to provide and implement tlb_flush():
 *
 * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
 * use of:
 *
 *  - mmu_gather::start / mmu_gather::end
 *
 *    which provides the range that needs to be flushed to cover the pages to
 *    be freed.
 *
 *  - mmu_gather::freed_tables
 *
 *    set when we freed page table pages
 *
 *  - tlb_get_unmap_shift() / tlb_get_unmap_size()
 *
 *    returns the smallest TLB entry size unmapped in this range.
 *
 * If an architecture does not provide tlb_flush() a default implementation
 * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is
 * specified, in which case we'll default to flush_tlb_mm().
 *
 * Additionally there are a few opt-in features:
 *
 *  MMU_GATHER_PAGE_SIZE
 *
 *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
 *  changes the size and provides mmu_gather::page_size to tlb_flush().
 *
 *  This might be useful if your architecture has size specific TLB
 *  invalidation instructions.
 *
 *  MMU_GATHER_TABLE_FREE
 *
 *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
 *  for page directories (__p*_free_tlb()).
 *
 *  Useful if your architecture has non-page page directories.
 *
 *  When used, an architecture is expected to provide __tlb_remove_table()
 *  which does the actual freeing of these pages.
 *
 *  MMU_GATHER_RCU_TABLE_FREE
 *
 *  Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see
 *  comment below).
 *
 *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
 *  and therefore doesn't naturally serialize with software page-table walkers.
 *
 *  MMU_GATHER_NO_RANGE
 *
 *  Use this if your architecture lacks an efficient flush_tlb_range().
 *
 *  MMU_GATHER_NO_GATHER
 *
 *  If the option is set the mmu_gather will not track individual pages for
 *  delayed page free anymore. A platform that enables the option needs to
 *  provide its own implementation of the __tlb_remove_page_size() function to
 *  free pages.
 *
 *  This is useful if your architecture already flushes TLB entries in the
 *  various ptep_get_and_clear() functions.
 */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * A batch of page-table pointers queued through tlb_remove_table().
 *
 * @rcu:    only present with MMU_GATHER_RCU_TABLE_FREE, which adds
 *          semi-RCU semantics to the free.
 * @nr:     entry count for @tables (presumably the number of slots in
 *          use -- the code that fills the batch is not in this header).
 * @tables: trailing storage for the queued tables.  MAX_TABLE_BATCH sizes
 *          it so that this header plus the entries fit in PAGE_SIZE.
 *
 * @tables is a C99 flexible array member rather than a GNU zero-length
 * array, so it must remain the last member.
 */
struct mmu_table_batch {
#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
        struct rcu_head                rcu;
#endif
        unsigned int                nr;
        void                        *tables[];
};

#define MAX_TABLE_BATCH                \
        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based
 * page directories and we can use the normal page batching to free them.
 */
#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page))

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * This allows an architecture that does not use the linux page-tables for
 * hardware to skip the TLBI when freeing page tables.
 */
#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

void tlb_remove_table_sync_one(void);

#else

#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

static inline void tlb_remove_table_sync_one(void) { }

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


#ifndef CONFIG_MMU_GATHER_NO_GATHER
/*
 * If we can't allocate a page to make a big batch of page pointers
 * to work on, then just handle a few from the on-stack structure.
 */
#define MMU_GATHER_BUNDLE        8

/*
 * One batch of pages queued for freeing.
 *
 * Batches are chained through @next.  @pages is trailing storage;
 * MAX_GATHER_BATCH sizes it so that this header plus the entries fit in
 * PAGE_SIZE.  @nr and @max presumably hold the used and available slot
 * counts -- the code maintaining them is not in this header.
 *
 * Note that struct mmu_gather embeds one of these ("local") directly in
 * front of its __pages[] array.
 */
struct mmu_gather_batch {
        struct mmu_gather_batch        *next;
        unsigned int                nr;
        unsigned int                max;
        struct page                *pages[0];
};

#define MAX_GATHER_BATCH        \
        ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))

/*
 * Limit the maximum number of mmu_gather batches to reduce a risk of soft
 * lockups for non-preemptible kernels on huge machines when a lot of memory
 * is zapped during unmapping.
 * 10K pages freed at once should be safe even without a preemption point.
 */
#define MAX_GATHER_BATCH_COUNT        (10000UL/MAX_GATHER_BATCH)

extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                                   int page_size);
#endif

/*
 * struct mmu_gather carries the state of one gather operation: the mm being
 * operated on, the address range that needs a TLB invalidate, flags
 * describing what was cleared, and (unless MMU_GATHER_NO_GATHER) the
 * batches of pages queued for freeing.  It is passed to arch specific code
 * such as tlb_flush().
 */
struct mmu_gather {
        /* the address space this gather operates on */
        struct mm_struct        *mm;

#ifdef CONFIG_MMU_GATHER_TABLE_FREE
        /* batch of page tables queued through tlb_remove_table() */
        struct mmu_table_batch        *batch;
#endif

        /*
         * range that needs to be flushed; grown by __tlb_adjust_range()
         * and reset by __tlb_reset_range()
         */
        unsigned long                start;
        unsigned long                end;
        /*
         * we are in the middle of an operation to clear
         * a full mm and can make some optimizations
         */
        unsigned int                fullmm : 1;

        /*
         * we have performed an operation which
         * requires a complete flush of the tlb
         */
        unsigned int                need_flush_all : 1;

        /*
         * we have removed page directories
         */
        unsigned int                freed_tables : 1;

        /*
         * at which levels have we cleared entries?
         */
        unsigned int                cleared_ptes : 1;
        unsigned int                cleared_pmds : 1;
        unsigned int                cleared_puds : 1;
        unsigned int                cleared_p4ds : 1;

        /*
         * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma
         */
        unsigned int                vma_exec : 1;
        unsigned int                vma_huge : 1;

        /*
         * Did we unshare (unmap) any shared page tables? For now only
         * used for hugetlb PMD table sharing.
         */
        unsigned int                unshared_tables : 1;

        /*
         * Did we unshare any page tables such that they are now exclusive
         * and could get reused+modified by the new owner? When setting this
         * flag, "unshared_tables" will be set as well. For now only used
         * for hugetlb PMD table sharing.
         *
         * Unlike "unshared_tables", this flag is not cleared by
         * __tlb_reset_range().
         */
        unsigned int                fully_unshared_tables : 1;

        /*
         * presumably the number of mmu_gather_batch structures in use
         * (see MAX_GATHER_BATCH_COUNT) -- maintained outside this header
         */
        unsigned int                batch_count;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /*
         * page batching: "local" is the batch embedded in this structure
         * and "__pages" directly follows it, presumably serving as its
         * MMU_GATHER_BUNDLE slots of storage; "active" presumably points
         * at the batch currently being filled -- TODO confirm in the
         * gather implementation
         */
        struct mmu_gather_batch *active;
        struct mmu_gather_batch        local;
        struct page                *__pages[MMU_GATHER_BUNDLE];

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /* current page size, set by tlb_change_page_size() */
        unsigned int page_size;
#endif
#endif
};

void tlb_flush_mmu(struct mmu_gather *tlb);

/*
 * Grow the pending flush range of @tlb so that it also covers
 * [@address, @address + @range_size).  The range is only ever widened.
 */
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
                                      unsigned long address,
                                      unsigned int range_size)
{
        unsigned long range_end = address + range_size;

        if (address < tlb->start)
                tlb->start = address;
        if (range_end > tlb->end)
                tlb->end = range_end;
}

/*
 * Return @tlb to the "nothing to flush" state.
 *
 * For a full-mm gather both ends of the range are set to all-ones;
 * otherwise the range becomes the empty interval start = TASK_SIZE,
 * end = 0, which __tlb_adjust_range() can then grow.  The flags recording
 * what was cleared, freed or unshared are zeroed as well.
 */
static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
        if (!tlb->fullmm) {
                tlb->start = TASK_SIZE;
                tlb->end = 0;
        } else {
                tlb->end = ~0UL;
                tlb->start = ~0UL;
        }

        /*
         * The mmu_gather::vma_* fields are deliberately left alone:
         * tlb_start_vma() is not called again to set them after an
         * intermediate flush.
         */
        tlb->cleared_ptes = 0;
        tlb->cleared_pmds = 0;
        tlb->cleared_puds = 0;
        tlb->cleared_p4ds = 0;
        tlb->freed_tables = 0;
        tlb->unshared_tables = 0;
}

#ifdef CONFIG_MMU_GATHER_NO_RANGE

#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
#endif

/*
 * MMU_GATHER_NO_RANGE variant of tlb_flush().
 *
 * Architectures without an efficient ranged TLB flush gain nothing from
 * intermediate flushes at tlb_end_vma() that keep the range small, and page
 * granularity does not matter either.  Whenever the gathered range is
 * non-empty (end != 0) the whole mm is flushed.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (!tlb->end)
                return;

        flush_tlb_mm(tlb->mm);
}

/* MMU_GATHER_NO_RANGE: tlb_flush() ignores the VMA flags, nothing to track. */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }

/*
 * MMU_GATHER_NO_RANGE: no intermediate flush at the end of a VMA.  The
 * self-referencing #define makes the later "#ifndef tlb_end_vma" default
 * definition drop out.
 */
#define tlb_end_vma tlb_end_vma
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { }

#else /* CONFIG_MMU_GATHER_NO_RANGE */

#ifndef tlb_flush

#if defined(tlb_start_vma) || defined(tlb_end_vma)
#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
#endif

/*
 * When an architecture does not provide its own tlb_flush() implementation
 * but does have a reasonably efficient flush_vma_range() implementation
 * use that.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (tlb->fullmm || tlb->need_flush_all) {
                flush_tlb_mm(tlb->mm);
        } else if (tlb->end) {
                struct vm_area_struct vma = {
                        .vm_mm = tlb->mm,
                        .vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
                                    (tlb->vma_huge ? VM_HUGETLB : 0),
                };

                flush_tlb_range(&vma, tlb->start, tlb->end);
        }
}

/*
 * Record whether @vma is a hugetlb and/or executable mapping so that the
 * default tlb_flush() can hand matching vm_flags to flush_tlb_range().
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Some flush_tlb_range() implementations look at VM_HUGETLB (tile,
         * mips-4k) and then flush only large pages.
         *
         * Implementations that flush the I-TLB also flush the D-TLB
         * (tile, xtensa, arm), so adding VM_EXEC to an existing range is
         * harmless.
         *
         * tlb_end_vma() is relied upon to issue a flush, so the batch is
         * empty whenever these values are replaced.
         */
        tlb->vma_huge = is_vm_hugetlb_page(vma);
        tlb->vma_exec = (vma->vm_flags & VM_EXEC) ? 1 : 0;
}

#else

/*
 * The architecture supplies its own tlb_flush(); the vma_exec/vma_huge
 * tracking done for the default tlb_flush() is not performed.
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }

#endif

#endif /* CONFIG_MMU_GATHER_NO_RANGE */

/*
 * Perform the TLB invalidate for whatever @tlb has gathered, notify the
 * MMU notifiers about the range, and reset the range state.  No queued
 * pages are freed here.
 */
static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
        /*
         * Every user of __tlb_adjust_range() also sets at least one of
         * these bits, so if none is set there is nothing to flush.
         */
        if (!tlb->freed_tables && !tlb->cleared_ptes && !tlb->cleared_pmds &&
            !tlb->cleared_puds && !tlb->cleared_p4ds && !tlb->unshared_tables)
                return;

        tlb_flush(tlb);
        mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
        __tlb_reset_range(tlb);
}

/*
 * Queue @page (of size @page_size) for freeing and call tlb_flush_mmu()
 * when __tlb_remove_page_size() reports that a flush is required.
 */
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                        struct page *page, int page_size)
{
        bool need_flush = __tlb_remove_page_size(tlb, page, page_size);

        if (need_flush)
                tlb_flush_mmu(tlb);
}

/*
 * PAGE_SIZE flavour of __tlb_remove_page_size(); the return value is
 * passed through unchanged, so the caller is responsible for calling
 * tlb_flush_mmu() when it is true.
 */
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        bool need_flush = __tlb_remove_page_size(tlb, page, PAGE_SIZE);

        return need_flush;
}

/* tlb_remove_page
 *        Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
 *        required.  Queues @page with a size of PAGE_SIZE via
 *        tlb_remove_page_size() and returns nothing.
 */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        /*
         * Plain call: ISO C does not allow "return <expr>;" in a function
         * returning void, even when <expr> itself has type void.
         */
        tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/*
 * Announce the page size of the pages about to be queued.
 *
 * Only does anything with MMU_GATHER_PAGE_SIZE.  When a size was already
 * recorded and the new one differs, the gather is flushed first, unless
 * it is a full-mm gather or a complete flush is already required.  The
 * new size is then recorded in mmu_gather::page_size.
 */
static inline void tlb_change_page_size(struct mmu_gather *tlb,
                                        unsigned int page_size)
{
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        if (tlb->page_size && tlb->page_size != page_size &&
            !tlb->fullmm && !tlb->need_flush_all)
                tlb_flush_mmu(tlb);

        tlb->page_size = page_size;
#endif
}

/*
 * Return the shift of the smallest page-table level at which entries were
 * cleared in the current range.  PAGE_SHIFT is also the answer when no
 * cleared_* flag is set at all.
 */
static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
{
        if (!tlb->cleared_ptes) {
                if (tlb->cleared_pmds)
                        return PMD_SHIFT;
                if (tlb->cleared_puds)
                        return PUD_SHIFT;
                if (tlb->cleared_p4ds)
                        return P4D_SHIFT;
        }

        return PAGE_SHIFT;
}

/* Size in bytes corresponding to tlb_get_unmap_shift(): 1UL << shift. */
static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
{
        unsigned long shift = tlb_get_unmap_shift(tlb);

        return 1UL << shift;
}

/*
 * In the case of tlb vma handling, we can optimise these away in the
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 */
#ifndef tlb_start_vma
/*
 * Default hook run when the gather starts working on @vma: record the
 * VMA's flags for the later flush and flush the cache for the VMA's whole
 * address range.  Skipped entirely for a full-mm gather.
 */
static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (!tlb->fullmm) {
                tlb_update_vma_flags(tlb, vma);
                flush_cache_range(vma, vma->vm_start, vma->vm_end);
        }
}
#endif

#ifndef tlb_end_vma
/*
 * Default hook run when the gather is done with @vma.  Skipped entirely
 * for a full-mm gather.
 */
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Flush the TLB and reset the range at each VMA boundary.  That
         * keeps the range from growing across the unused space between
         * consecutive VMAs, and the mmu_gather::vma_* flags set in
         * tlb_start_vma() depend on it as well.
         */
        if (!tlb->fullmm)
                tlb_flush_mmu_tlbonly(tlb);
}
#endif

/*
 * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
 * and set corresponding cleared_*.
 */
/*
 * Hand @address/@size to __tlb_adjust_range() for @tlb and set
 * tlb->cleared_ptes.
 */
static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        __tlb_adjust_range(tlb, address, size);
        tlb->cleared_ptes = 1;
}

/*
 * Hand @address/@size to __tlb_adjust_range() for @tlb and set
 * tlb->cleared_pmds.
 */
static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        __tlb_adjust_range(tlb, address, size);
        tlb->cleared_pmds = 1;
}

/*
 * Hand @address/@size to __tlb_adjust_range() for @tlb and set
 * tlb->cleared_puds.
 */
static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        __tlb_adjust_range(tlb, address, size);
        tlb->cleared_puds = 1;
}

/*
 * Hand @address/@size to __tlb_adjust_range() for @tlb and set
 * tlb->cleared_p4ds.
 */
static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        __tlb_adjust_range(tlb, address, size);
        tlb->cleared_p4ds = 1;
}

/*
 * Hook used by the entry-removal macros in this header.  If it has not
 * been defined before this point, it defaults to an empty statement.
 */
#ifndef __tlb_remove_tlb_entry
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#endif

/**
 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
 * @tlb: the mmu_gather, passed to both tlb_flush_pte_range() and the hook
 * @ptep: forwarded to __tlb_remove_tlb_entry() only
 * @address: start of the PAGE_SIZE range handed to tlb_flush_pte_range()
 *
 * Record the fact that PTEs were really unmapped by updating the range,
 * so that the TLB invalidate can later be optimised away.  This helps when
 * userspace is unmapping already-unmapped pages, which happens quite a lot.
 *
 * Being a macro, @tlb and @address are expanded in both statements of the
 * body.
 */
#define tlb_remove_tlb_entry(tlb, ptep, address)                \
        do {                                                        \
                tlb_flush_pte_range(tlb, address, PAGE_SIZE);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/*
 * tlb_remove_huge_tlb_entry - huge-page counterpart of tlb_remove_tlb_entry().
 *
 * The size is taken from huge_page_size(h) and the range starting at
 * @address is passed to the helper for the largest level whose size it
 * reaches: tlb_flush_p4d_range() for >= P4D_SIZE, tlb_flush_pud_range()
 * for >= PUD_SIZE, tlb_flush_pmd_range() for >= PMD_SIZE, and
 * tlb_flush_pte_range() otherwise.  __tlb_remove_tlb_entry() is invoked
 * afterwards.
 *
 * The macro declares a local named _sz inside its do/while block.
 */
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)        \
        do {                                                        \
                unsigned long _sz = huge_page_size(h);                \
                if (_sz >= P4D_SIZE)                                \
                        tlb_flush_p4d_range(tlb, address, _sz);        \
                else if (_sz >= PUD_SIZE)                        \
                        tlb_flush_pud_range(tlb, address, _sz);        \
                else if (_sz >= PMD_SIZE)                        \
                        tlb_flush_pmd_range(tlb, address, _sz);        \
                else                                                \
                        tlb_flush_pte_range(tlb, address, _sz);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
 *
 * The __tlb_remove_pmd_tlb_entry() hook below is a nop so far, because only
 * x86 needs it.  It is defined here only if it has not been defined before
 * this point.
 */
#ifndef __tlb_remove_pmd_tlb_entry
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif

/*
 * Pass the HPAGE_PMD_SIZE range starting at @address to
 * tlb_flush_pmd_range(), then invoke the hook with @pmdp.
 */
#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)                        \
        do {                                                                \
                tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);        \
                __tlb_remove_pmd_tlb_entry(tlb, pmdp, address);                \
        } while (0)

/**
 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
 * invalidation.
 *
 * The __tlb_remove_pud_tlb_entry() hook below is a nop so far, because only
 * x86 needs it.  It is defined here only if it has not been defined before
 * this point.
 */
#ifndef __tlb_remove_pud_tlb_entry
#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
#endif

/*
 * Pass the HPAGE_PUD_SIZE range starting at @address to
 * tlb_flush_pud_range(), then invoke the hook with @pudp.
 */
#define tlb_remove_pud_tlb_entry(tlb, pudp, address)                        \
        do {                                                                \
                tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);        \
                __tlb_remove_pud_tlb_entry(tlb, pudp, address);                \
        } while (0)

/*
 * For things like page tables caches (ie caching addresses "inside" the
 * page tables, like x86 does), for legacy reasons, flushing an
 * individual page had better flush the page table caches behind it. This
 * is definitely how x86 works, for example. And if you have an
 * architected non-legacy page table cache (which I'm not aware of
 * anybody actually doing), you're going to have some architecturally
 * explicit flushing for that, likely *separate* from a regular TLB entry
 * flush, and thus you'd need more than just some range expansion..
 *
 * So if we ever find an architecture
 * that would want something that odd, I think it is up to that
 * architecture to do its own odd thing, not cause pain for others
 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
 *
 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
 */

#ifndef pte_free_tlb
/*
 * pte_free_tlb - hand the page table @ptep to __pte_free_tlb().
 *
 * Before calling the hook, a PAGE_SIZE range at @address is passed to
 * tlb_flush_pmd_range() and tlb->freed_tables is set.
 *
 * @tlb is parenthesized before '->' so that any pointer expression
 * (e.g. &gather) expands correctly.
 */
#define pte_free_tlb(tlb, ptep, address)                        \
        do {                                                        \
                tlb_flush_pmd_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pte_free_tlb(tlb, ptep, address);                \
        } while (0)
#endif

#ifndef pmd_free_tlb
/*
 * pmd_free_tlb - hand the page table @pmdp to __pmd_free_tlb().
 *
 * Before calling the hook, a PAGE_SIZE range at @address is passed to
 * tlb_flush_pud_range() and tlb->freed_tables is set.
 *
 * @tlb is parenthesized before '->' so that any pointer expression
 * (e.g. &gather) expands correctly.
 */
#define pmd_free_tlb(tlb, pmdp, address)                        \
        do {                                                        \
                tlb_flush_pud_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pmd_free_tlb(tlb, pmdp, address);                \
        } while (0)
#endif

#ifndef pud_free_tlb
/*
 * pud_free_tlb - hand the page table @pudp to __pud_free_tlb().
 *
 * Before calling the hook, a PAGE_SIZE range at @address is passed to
 * tlb_flush_p4d_range() and tlb->freed_tables is set.
 *
 * @tlb is parenthesized before '->' so that any pointer expression
 * (e.g. &gather) expands correctly.
 */
#define pud_free_tlb(tlb, pudp, address)                        \
        do {                                                        \
                tlb_flush_p4d_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pud_free_tlb(tlb, pudp, address);                \
        } while (0)
#endif

#ifndef p4d_free_tlb
/*
 * p4d_free_tlb - hand the page table @p4dp to __p4d_free_tlb().
 *
 * Before calling the hook, a PAGE_SIZE range at @address is passed
 * directly to __tlb_adjust_range() (no cleared_* flag is set at this
 * level) and tlb->freed_tables is set.
 *
 * @tlb is parenthesized before '->' so that any pointer expression
 * (e.g. &gather) expands correctly.
 */
#define p4d_free_tlb(tlb, p4dp, address)                        \
        do {                                                        \
                __tlb_adjust_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __p4d_free_tlb(tlb, p4dp, address);                \
        } while (0)
#endif

#if defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_HUGETLB_PAGE)
/*
 * Drop one reference from the share count of the page table @pt and record
 * the unsharing in @tlb.
 *
 * pt->pt_share_count is decremented (with a one-time warning if it was
 * already zero), the PUD_SIZE region containing @addr is passed to
 * tlb_flush_pmd_range(), and tlb->unshared_tables is always set.
 * tlb->fully_unshared_tables is additionally set when the share count
 * reads as zero after the decrement.
 */
static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct page *pt,
                                          unsigned long addr)
{
        /*
         * The caller must make sure that concurrent unsharing + exclusive
         * reuse is impossible until tlb_flush_unshared_tables() was called.
         */
        VM_WARN_ON_ONCE(!atomic_read(&pt->pt_share_count));
        atomic_dec(&pt->pt_share_count);

        /* Clearing a PUD pointing at a PMD table with PMD leaves. */
        tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);

        /*
         * If the page table is now exclusively owned, we fully unshared
         * a page table.
         */
        if (!atomic_read(&pt->pt_share_count))
                tlb->fully_unshared_tables = true;
        tlb->unshared_tables = true;
}

/*
 * Act on the unsharing recorded in @tlb by tlb_unshare_pmd_ptdesc().
 *
 * If tlb->unshared_tables is set, the TLB is flushed via
 * tlb_flush_mmu_tlbonly().  If tlb->fully_unshared_tables is set,
 * tlb_remove_table_sync_one() is called and that flag is cleared.
 *
 * NOTE(review): tlb->unshared_tables is not cleared here; presumably the
 * flush path resets it -- confirm against tlb_flush_mmu_tlbonly().
 */
static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
{
        /*
         * As soon as the caller drops locks to allow for reuse of
         * previously-shared tables, these tables could get modified and
         * even reused outside of hugetlb context, so we have to make sure that
         * any page table walkers (incl. TLB, GUP-fast) are aware of that
         * change.
         *
         * Even if we are not fully unsharing a PMD table, we must
         * flush the TLB for the unsharer now.
         */
        if (tlb->unshared_tables)
                tlb_flush_mmu_tlbonly(tlb);

        /*
         * Similarly, we must make sure that concurrent GUP-fast will not
         * walk previously-shared page tables that are getting modified+reused
         * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
         *
         * We only perform this when we are the last sharer of a page table,
         * as the IPI will reach all CPUs, and thereby any concurrent
         * GUP-fast walker.
         *
         * Note that on configs where tlb_remove_table_sync_one() is a NOP,
         * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
         * required IPIs already for us.
         */
        if (tlb->fully_unshared_tables) {
                tlb_remove_table_sync_one();
                tlb->fully_unshared_tables = false;
        }
}
#endif

#endif /* CONFIG_MMU */

#endif /* _ASM_GENERIC__TLB_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

































































































































































































    1 

    1 
    1 










    1 
    1 






    1 
    1 



    1 





    1 













    1 

    1 


    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















    1 






    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 




































































































































    2 

    2 





    2 








































































































































































































































    2 


    2 








    2 




    2 




















    2 











    2 











    2 

    2 






    2 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 






































    1 

































































































































































































    1 






















    1 
    1 
















































































    1 



    1 
    1 





    1 























    1 
















    1 



    1 







    1 



































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>

#include "internal.h"

#include <asm/irq_regs.h>

/*
 * Callback type for the cross-call helpers below: takes one opaque pointer
 * and returns an int status.  It is the type of remote_function_call::func
 * and of the @func argument of task_function_call()/cpu_function_call().
 */
typedef int (*remote_function_f)(void *);

/*
 * Argument bundle handed to remote_function() through
 * smp_call_function_single().
 *
 * @p:    task that must be current on the executing CPU for @func to be
 *        called, or NULL to skip the task checks (cpu_function_call())
 * @func: callback to invoke
 * @info: opaque argument passed to @func
 * @ret:  result slot; preset by the caller and overwritten by
 *        remote_function() with -ESRCH or with @func's return value
 */
struct remote_function_call {
        struct task_struct        *p;
        remote_function_f        func;
        void                        *info;
        int                        ret;
};

/*
 * Cross-call callback shared by task_function_call() and
 * cpu_function_call().
 *
 * @data points at a struct remote_function_call.  Without a target task the
 * wrapped function is invoked unconditionally.  With a target task it is
 * only invoked when that task is the one currently running on this CPU;
 * otherwise ->ret tells the caller why the call was skipped.
 */
static void remote_function(void *data)
{
        struct remote_function_call *call = data;
        struct task_struct *target = call->p;

        if (!target)
                goto invoke;

        /*
         * The task is accounted to some other CPU: bail out and leave ->ret
         * as the caller preset it (-EAGAIN in task_function_call()).
         */
        if (task_cpu(target) != smp_processor_id())
                return;

        /*
         * Now that we're on the right CPU with IRQs disabled, we can test
         * if we hit the right task without races.
         */
        call->ret = -ESRCH; /* No such (running) process */
        if (target != current)
                return;

invoke:
        call->ret = call->func(call->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:          the task to evaluate
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Issues a synchronous smp_call_function_single() to task_cpu(@p) with
 * remote_function() as the handler, so @func only runs if @p is found
 * running there.  The target might be the current CPU, in which case the
 * function is simply called directly.  As long as an attempt ends in
 * -EAGAIN (@p was not on the CPU that was targeted) the call is repeated,
 * with a cond_resched() between attempts.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p      = p,
                .func   = func,
                .info   = info,
                .ret    = -EAGAIN,
        };
        int err;

retry:
        err = smp_call_function_single(task_cpu(p), remote_function,
                                       &call, 1);
        /* The cross-call itself succeeded: report what the handler stored. */
        if (err == 0)
                err = call.ret;

        if (err == -EAGAIN) {
                cond_resched();
                goto retry;
        }

        return err;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:        the cpu on which to run the function
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Calls the function @func on the remote cpu and waits for it to finish.
 * No task is attached to the request, so remote_function() invokes @func
 * unconditionally once it runs on @cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p      = NULL,
                .func   = func,
                .info   = info,
                .ret    = -ENXIO, /* No such CPU */
        };

        /*
         * The return value is deliberately not consulted: if the handler
         * never ran, call.ret still holds the -ENXIO preset above.
         */
        (void)smp_call_function_single(cpu, remote_function, &call, 1);

        return call.ret;
}

/*
 * Return the executing CPU's instance of the per-CPU context that belongs
 * to @ctx's pmu (ctx->pmu->pmu_cpu_context).
 */
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        cpuctx = this_cpu_ptr(ctx->pmu->pmu_cpu_context);

        return cpuctx;
}

/*
 * Take the CPU context's lock and then, if a second context is supplied,
 * that context's lock as well.  @ctx may be NULL, in which case only
 * cpuctx->ctx.lock is taken.  Undone by perf_ctx_unlock().
 */
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);

        if (!ctx)
                return;

        /* Always nested inside cpuctx->ctx.lock. */
        raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

/*
 * Sentinel pointer value. In this file both ctx->task and event->owner are
 * compared against it (see is_kernel_event() and event_function_call()).
 */
#define TASK_TOMBSTONE ((void *)-1L)

/* An event whose owner field holds TASK_TOMBSTONE counts as a kernel event. */
static bool is_kernel_event(struct perf_event *event)
{
        void *owner = READ_ONCE(event->owner);

        return owner == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

/*
 * Callback type used by event_function_call() and event_function_local().
 * It receives the event, a CPU context (event_function_call() passes NULL
 * when it runs the callback directly under ctx->lock), the event's context
 * and the caller's opaque data pointer.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

/* Argument bundle handed to event_function() through a void pointer. */
struct event_function_struct {
        struct perf_event *event;        /* event to operate on */
        event_f func;                        /* callback to run */
        void *data;                        /* opaque argument forwarded to @func */
};

/*
 * Cross-call target used by event_function_call().
 *
 * @info points at a struct event_function_struct. The callback stored there
 * is run with the CPU context lock held, plus the lock of the CPU's current
 * task context when there is one.
 *
 * Returns 0 once the callback has run, or -ESRCH when the event's context
 * belongs to a task other than the one currently running on this CPU.
 */
static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        lockdep_assert_irqs_disabled();

        perf_ctx_lock(cpuctx, task_ctx);

        if (!ctx->task) {
                /* Not a task context: it has to be this CPU's own context. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        } else if (ctx->task != current) {
                /*
                 * The IPI was issued without holding ctx->lock, so things
                 * may have changed in the meantime; we did not hit the task
                 * we set out to hit.
                 */
                ret = -ESRCH;
        } else {
                /*
                 * event_function_call() is only used on established
                 * contexts and we only get here when the task is running
                 * (otherwise task_function_call() or the test above would
                 * have bailed), so the context must be active ...
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /* ... and an active context must be cpuctx->task_ctx. */
                WARN_ON_ONCE(task_ctx != ctx);
        }

        if (!ret)
                efs->func(event, cpuctx, ctx, efs->data);

        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Run @func for @event in the right place:
 *
 *  - context without a task: cross-call to event->cpu;
 *  - context whose task is TASK_TOMBSTONE: do nothing;
 *  - otherwise: cross-call to the task via task_function_call(); when that
 *    fails and the context turns out to be inactive, run @func directly
 *    under ctx->lock with a NULL cpuctx.
 */
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct event_function_struct efs = {
                .data = data,
                .func = func,
                .event = event,
        };

        /*
         * For a !child event ctx::mutex must be held to stabilize the
         * event->ctx relation. See perf_event_ctx_lock().
         */
        if (!event->parent)
                lockdep_assert_held(&ctx->mutex);

        if (!task) {
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

        if (task == TASK_TOMBSTONE)
                return;

        for (;;) {
                if (!task_function_call(task, event_function, &efs))
                        return;

                raw_spin_lock_irq(&ctx->lock);
                /*
                 * Pick up the task pointer again; a concurrent
                 * perf_event_context_sched_out() might have changed it.
                 */
                task = ctx->task;
                if (task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }

                /* Inactive context: safe to operate on it right here. */
                if (!ctx->is_active)
                        break;

                /* It became active again; retry the cross-call. */
                raw_spin_unlock_irq(&ctx->lock);
        }

        func(event, NULL, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Local counterpart of event_function_call() + event_function(): the caller
 * guarantees that IRQs are already disabled and that we are on the right
 * CPU, so no cross-call is ever issued from here.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;

        lockdep_assert_irqs_disabled();

        if (task == TASK_TOMBSTONE)
                return;

        /* A task context needs its own lock taken too. */
        if (task)
                task_ctx = ctx;

        perf_ctx_lock(cpuctx, task_ctx);

        /* Look at the task again now that the locks are held. */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;

        if (!task) {
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        } else if (ctx->is_active) {
                /*
                 * The context is either inactive, or active for the
                 * current task on this CPU. Anything else cannot be
                 * handled, since sending an IPI elsewhere is not an
                 * option here.
                 */
                if (WARN_ON_ONCE(task != current))
                        goto unlock;

                if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
                        goto unlock;
        }

        func(event, cpuctx, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
}

/* Union of the four PERF_FLAG_* bits listed below. */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 * (the kernel and hypervisor branch-sampling bits)
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

/*
 * Bit flags selecting which parts of a context an operation applies to.
 * Note that EVENT_ALL covers only the FLEXIBLE and PINNED bits; it does not
 * include EVENT_TIME or EVENT_CPU.
 */
enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_TIME = 0x4,
        /* see ctx_resched() for details */
        EVENT_CPU = 0x8,
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

/*
 * State behind the perf_sched_events static key: a delayed work item
 * (handler perf_sched_delayed()), a mutex and a counter.
 * NOTE(review): how these interact is defined outside this section.
 */
static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

/* Per-CPU bookkeeping. */
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

/* Global counters, presumably one per kind of event (going by the names). */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;

/* List of PMUs together with its mutex and SRCU domain, and a CPU mask. */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 *
 * DEFAULT_SAMPLE_PERIOD_NS is the time between two samples at the default
 * maximum rate; DEFAULT_CPU_TIME_MAX_PERCENT is the default value of
 * sysctl_perf_cpu_time_max_percent.
 */
#define DEFAULT_MAX_SAMPLE_RATE                100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT        25

int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;

/* Values derived from the sample rate; recomputed whenever the rate changes. */
static int max_samples_per_tick __read_mostly        = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly        = DEFAULT_SAMPLE_PERIOD_NS;

/*
 * Sample-length threshold, in ns, that perf_sample_event_took() compares
 * its running average against. A value of 0 makes perf_sample_event_took()
 * return immediately.
 */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

/*
 * Recompute perf_sample_allowed_ns as sysctl_perf_cpu_time_max_percent
 * percent of perf_sample_period_ns, never going below 1 ns.
 */
static void update_perf_cpu_limits(void)
{
        u64 allowed_ns = perf_sample_period_ns;

        allowed_ns = div_u64(allowed_ns * sysctl_perf_cpu_time_max_percent, 100);
        /* 0 is what perf_sample_event_took() treats as "no limit"; avoid it. */
        if (allowed_ns == 0)
                allowed_ns = 1;

        WRITE_ONCE(perf_sample_allowed_ns, allowed_ns);
}

static bool perf_rotate_context(struct perf_cpu_context *cpuctx);

/*
 * sysctl handler: after a successful write, refresh the values derived from
 * sysctl_perf_event_sample_rate (samples per tick, sample period and the
 * allowed sample length).
 *
 * Returns -EINVAL for a write while sysctl_perf_cpu_time_max_percent is 0
 * or 100, the error from proc_dointvec_minmax() if it fails, else 0.
 */
int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        const int percent = sysctl_perf_cpu_time_max_percent;
        int err;

        /*
         * If throttling is disabled don't allow the write:
         */
        if (write) {
                if (percent == 0 || percent == 100)
                        return -EINVAL;
        }

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;

        /* A plain read changes nothing, so there is nothing to recompute. */
        if (!write)
                return 0;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();

        return 0;
}

/*
 * Percentage used by update_perf_cpu_limits() to derive
 * perf_sample_allowed_ns. The handlers in this file treat the values 0 and
 * 100 as "throttling disabled".
 */
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

/*
 * sysctl handler for sysctl_perf_cpu_time_max_percent.
 *
 * After a successful write, a value of 0 or 100 switches dynamic throttling
 * off (perf_sample_allowed_ns becomes 0, with a warning); any other value
 * recomputes the limit through update_perf_cpu_limits().
 *
 * Returns the error from proc_dointvec_minmax() if it fails, else 0.
 */
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err || !write)
                return err;

        switch (sysctl_perf_cpu_time_max_percent) {
        case 0:
        case 100:
                printk(KERN_WARNING
                       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
                WRITE_ONCE(perf_sample_allowed_ns, 0);
                break;
        default:
                update_perf_cpu_limits();
                break;
        }

        return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor for the decaying per-CPU accumulator in perf_sample_event_took(). */
#define NR_ACCUMULATED_SAMPLES 128
/* Per-CPU decaying sum of recent sample lengths, in ns. */
static DEFINE_PER_CPU(u64, running_sample_length);

/*
 * Latest average / allowed values, stored by perf_sample_event_took() and
 * printed by it and by perf_duration_warn().
 */
static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
        printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
                sysctl_perf_event_sample_rate);
}

/* irq_work item queued by perf_sample_event_took(); runs perf_duration_warn(). */
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

/*
 * Account one sample that took @sample_len_ns and, when the per-CPU running
 * average exceeds perf_sample_allowed_ns, lower the maximum sample rate.
 *
 * Does nothing when perf_sample_allowed_ns is 0.
 */
void perf_sample_event_took(u64 sample_len_ns)
{
        const u64 max_len = READ_ONCE(perf_sample_allowed_ns);
        u64 running_len;
        u64 avg_len;
        u32 max;

        if (!max_len)
                return;

        /* Drop one average sample from the accumulator, add the new one. */
        running_len = __this_cpu_read(running_sample_length);
        running_len = running_len - running_len / NR_ACCUMULATED_SAMPLES +
                      sample_len_ns;
        __this_cpu_write(running_sample_length, running_len);

        /*
         * Note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_len = running_len / NR_ACCUMULATED_SAMPLES;
        if (avg_len <= max_len)
                return;

        /* Remember what gets reported before avg_len is padded below. */
        __report_avg = avg_len;
        __report_allowed = max_len;

        /*
         * The new allowed length is the measured average plus 25%; the
         * samples-per-tick budget is however many samples of that length
         * fit into the permitted share of a tick, but at least one.
         */
        avg_len += avg_len / 4;
        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
        max = (avg_len < max) ? max / (u32)avg_len : 1;

        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
        WRITE_ONCE(max_samples_per_tick, max);

        sysctl_perf_event_sample_rate = max * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        /* Could not queue the irq_work: print through early_printk() instead. */
        if (irq_work_queue(&perf_duration_work))
                return;

        early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                     "kernel.perf_event_max_sample_rate to %d\n",
                     __report_avg, __report_allowed,
                     sysctl_perf_event_sample_rate);
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

/* Weak default that does nothing; a strong definition elsewhere replaces it. */
void __weak perf_event_print_debug(void)        { }

/* Weak default: report the generic name "pmu". */
extern __weak const char *perf_pmu_name(void)
{
        const char *name = "pmu";

        return name;
}

/* Time source used throughout this file: a thin wrapper around local_clock(). */
static inline u64 perf_clock(void)
{
        u64 now = local_clock();

        return now;
}

/* Read the current time through @event's own clock callback. */
static inline u64 perf_event_clock(struct perf_event *event)
{
        u64 now = event->clock();

        return now;
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

/*
 * State to use for time accounting: the group leader's state when the
 * leader is at or below PERF_EVENT_STATE_OFF, else the event's own state.
 */
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        return (leader->state <= PERF_EVENT_STATE_OFF) ? leader->state
                                                       : event->state;
}

/*
 * Compute @event's enabled and running totals as of @now.
 *
 * The time since event->tstamp is added to *@enabled when the effective
 * state is at least INACTIVE, and to *@running when it is at least ACTIVE.
 * The event itself is only read here (callers may point @enabled and
 * @running at the event's own fields).
 */
static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
        const enum perf_event_state state = __perf_effective_state(event);
        const u64 delta = now - event->tstamp;

        *enabled = event->total_time_enabled +
                   ((state >= PERF_EVENT_STATE_INACTIVE) ? delta : 0);

        *running = event->total_time_running +
                   ((state >= PERF_EVENT_STATE_ACTIVE) ? delta : 0);
}

/*
 * Bring @event's enabled/running totals up to the current event time and
 * record that time in event->tstamp.
 */
static void perf_event_update_time(struct perf_event *event)
{
        u64 *enabled = &event->total_time_enabled;
        u64 *running = &event->total_time_running;
        u64 now = perf_event_time(event);

        __perf_update_times(event, now, enabled, running);
        event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
        struct perf_event *sibling;

        for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
}

/*
 * Move @event to @state. Time totals are settled under the old state first;
 * no-op when the state does not change.
 */
static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
        if (event->state == state)
                return;

        perf_event_update_time(event);

        /*
         * Crossing between a negative and a non-negative state (in either
         * direction) enables/disables a group leader, which affects all of
         * its siblings too.
         */
        if ((event->state < 0) != (state < 0))
                perf_event_update_sibling_time(event);

        WRITE_ONCE(event->state, state);
}

/*
 * UP store-release, load-acquire
 *
 * Both helpers combine a ONCE access with a compiler barrier():
 * __store_release() issues the barrier before the store, __load_acquire()
 * issues it after the load. No other barrier is used.
 */

#define __store_release(ptr, val)                                        \
do {                                                                        \
        barrier();                                                        \
        WRITE_ONCE(*(ptr), (val));                                        \
} while (0)

#define __load_acquire(ptr)                                                \
({                                                                        \
        __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
        barrier();                                                        \
        ___p;                                                                \
})

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        /* @event doesn't care about cgroup */
        if (!event->cgrp)
                return true;

        /* wants specific cgroup scope but @cpuctx isn't associated with any */
        if (!cpuctx->cgrp)
                return false;

        /*
         * Cgroup scoping is recursive.  An event enabled for a cgroup is
         * also enabled for all its descendant cgroups.  If @cpuctx's
         * cgroup is a descendant of @event's (the test covers identity
         * case), it's a match.
         */
        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

/* Returns 1 when @event has a cgroup attached, 0 otherwise. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp ? 1 : 0;
}

/* Time recorded for @event's cgroup in the per-CPU slot of event->cpu. */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return per_cpu_ptr(event->cgrp->info, event->cpu)->time;
}

/*
 * Cgroup time of @event at clock value @now: @now plus the published
 * timeoffset while the cgroup info is marked active, otherwise the time
 * stored in it.
 */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *info;

        info = per_cpu_ptr(event->cgrp->info, event->cpu);
        if (__load_acquire(&info->active))
                return now + READ_ONCE(info->timeoffset);

        return info->time;
}

/*
 * Stamp @info with @now. When @adv is set, the time accumulated in @info
 * first advances by the span since the previous stamp. The resulting
 * time - timestamp difference is then published as info->timeoffset.
 */
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
        if (adv) {
                u64 elapsed = now - info->timestamp;

                info->time += elapsed;
        }
        info->timestamp = now;

        /*
         * see update_context_time()
         */
        WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}

/*
 * Advance this CPU's cgroup time for @cpuctx's cgroup and each of its
 * ancestors. With @final set, each of them is also marked inactive.
 */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct cgroup_subsys_state *css;
        u64 now;

        if (!cgrp)
                return;

        /* One clock reading shared by the whole ancestor chain. */
        now = perf_clock();

        for (css = &cgrp->css; css; css = css->parent) {
                struct perf_cgroup *pos = container_of(css, struct perf_cgroup, css);
                struct perf_cgroup_info *info = this_cpu_ptr(pos->info);

                __update_cgrp_time(info, now, true);
                if (final)
                        __store_release(&info->active, 0);
        }
}

/*
 * Advance this CPU's time for @event's cgroup, but only when the current
 * task's cgroup is a descendant of (or the same as) that cgroup.
 */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup *cur;
        struct perf_cgroup_info *info;

        /*
         * Only touch cgroup data when it is needed and when the cgroup is
         * known to be pinned (css_get).
         */
        if (!is_cgroup_event(event))
                return;

        cur = perf_cgroup_from_task(current, event->ctx);

        /* Do not update time when the cgroup is not active. */
        if (!cgroup_is_descendant(cur->css.cgroup, event->cgrp->css.cgroup))
                return;

        info = this_cpu_ptr(event->cgrp->info);
        __update_cgrp_time(info, perf_clock(), true);
}

/*
 * Restart cgroup time accounting for @task's cgroup and all of its
 * ancestors on this CPU: stamp each with ctx->timestamp (without advancing
 * the accumulated time) and mark it active.
 *
 * Called with ctx->lock held. Does nothing without a task or when @ctx
 * has no cgroup events.
 */
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *css;
        struct perf_cgroup_info *info;
        struct perf_cgroup *cgrp;

        /*
         * Never access cgroup data unless the cgroup is pinned (css_get).
         */
        if (!task)
                return;
        if (!ctx->nr_cgroups)
                return;

        cgrp = perf_cgroup_from_task(task, ctx);

        css = &cgrp->css;
        while (css) {
                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);

                __update_cgrp_time(info, ctx->timestamp, false);
                __store_release(&info->active, 1);

                css = css->parent;
        }
}

/*
 * Per-CPU list of CPU contexts that have cgroup events, linked through
 * cgrp_cpuctx_entry (see perf_cgroup_event_enable()/_disable()).
 */
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

/* Mode bits for perf_cgroup_switch(). */
#define PERF_CGROUP_SWOUT        0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 *
 * Walks every CPU context on this CPU's cgrp_cpuctx_list. For each one the
 * context locks are taken and the PMU is disabled around the switch.
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
        struct perf_cpu_context *cpuctx, *tmp;
        struct list_head *list;
        unsigned long flags;

        /*
         * Disable interrupts and preemption to avoid this CPU's
         * cgrp_cpuctx_entry to change under us.
         */
        local_irq_save(flags);

        list = this_cpu_ptr(&cgrp_cpuctx_list);
        list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
                /* Only contexts with cgroup events should be on the list. */
                WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

                perf_ctx_lock(cpuctx, cpuctx->task_ctx);
                perf_pmu_disable(cpuctx->ctx.pmu);

                if (mode & PERF_CGROUP_SWOUT) {
                        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                        /*
                         * must not be done before ctxswout due
                         * to event_filter_match() in event_sched_out()
                         */
                        cpuctx->cgrp = NULL;
                }

                if (mode & PERF_CGROUP_SWIN) {
                        /* A switch-in expects the cgroup to be cleared already. */
                        WARN_ON_ONCE(cpuctx->cgrp);
                        /*
                         * set cgrp before ctxsw in to allow
                         * event_filter_match() to not have to pass
                         * task around
                         * we pass the cpuctx->ctx to perf_cgroup_from_task()
                         * because cgroup events are only per-cpu
                         */
                        cpuctx->cgrp = perf_cgroup_from_task(task,
                                                             &cpuctx->ctx);
                        cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                }
                perf_pmu_enable(cpuctx->ctx.pmu);
                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }

        local_irq_restore(flags);
}

/*
 * Context-switch hook for @task being switched out in favour of @next:
 * schedule out cgroup events, but only when the two tasks are in different
 * perf cgroups.
 */
static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
        struct perf_cgroup *outgoing, *incoming;

        rcu_read_lock();
        /*
         * We only get here when perf_cgroup_events > 0. No ctx needs to be
         * passed because the rcu lock is held.
         */
        outgoing = perf_cgroup_from_task(task, NULL);
        incoming = perf_cgroup_from_task(next, NULL);

        /*
         * Same cgroup on both sides of the switch: leave the cgroup events
         * alone.
         */
        if (outgoing != incoming)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

        rcu_read_unlock();
}

/*
 * Context-switch hook for @task being switched in after @prev: schedule in
 * cgroup events, but only when the two tasks are in different perf cgroups.
 */
static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
        struct perf_cgroup *incoming, *outgoing;

        rcu_read_lock();
        /*
         * We only get here when perf_cgroup_events > 0. No ctx needs to be
         * passed because the rcu lock is held.
         */
        incoming = perf_cgroup_from_task(task, NULL);
        outgoing = perf_cgroup_from_task(prev, NULL);

        /*
         * Cgroup events were only scheduled out at switch-out time when the
         * cgroup changed, so that is the only case needing a switch-in.
         */
        if (incoming != outgoing)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);

        rcu_read_unlock();
}

/*
 * Make sure the heap storage of every possible CPU's context for @event's
 * PMU is big enough for the cgroup hierarchy ending at @css.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails; CPUs already
 * handled keep their enlarged storage in that case.
 */
static int perf_cgroup_ensure_storage(struct perf_event *event,
                                struct cgroup_subsys_state *css)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event **storage;
        int cpu, heap_size;

        /*
         * Allow storage to have sufficient space for an iterator for each
         * possibly nested cgroup plus an iterator for events with no cgroup.
         */
        heap_size = 1;
        while (css) {
                heap_size++;
                css = css->parent;
        }

        for_each_possible_cpu(cpu) {
                cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
                if (cpuctx->heap_size >= heap_size)
                        continue;

                storage = kmalloc_node(heap_size * sizeof(*storage),
                                       GFP_KERNEL, cpu_to_node(cpu));
                if (!storage)
                        return -ENOMEM;

                raw_spin_lock_irq(&cpuctx->ctx.lock);
                /* Re-check under the lock; the size may have grown meanwhile. */
                if (cpuctx->heap_size < heap_size) {
                        swap(cpuctx->heap, storage);
                        /* The built-in default buffer must never be freed. */
                        if (storage == cpuctx->heap_default)
                                storage = NULL;
                        cpuctx->heap_size = heap_size;
                }
                raw_spin_unlock_irq(&cpuctx->ctx.lock);

                /* Frees the replaced buffer, or the unused new one. */
                kfree(storage);
        }

        return 0;
}

/*
 * Attach @event to the perf cgroup referred to by the cgroup directory
 * file descriptor @fd.
 *
 * @fd:           file descriptor of a cgroup directory
 * @event:        event to attach; event->cgrp is set on success
 * @attr:         not used by this function
 * @group_leader: optional leader; when given, it must monitor the same
 *                cgroup
 *
 * On success the event holds a reference on the cgroup's css, which
 * perf_detach_cgroup() drops later. On every failure event->cgrp is left
 * NULL and no css reference is retained.
 *
 * Returns 0 on success, -EBADF for a bad @fd, the error from
 * css_tryget_online_from_dir() or perf_cgroup_ensure_storage(), or -EINVAL
 * when @group_leader monitors a different cgroup.
 */
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        css = css_tryget_online_from_dir(f.file->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        /*
         * Hand the css reference over to the event right away, so that
         * every failure path below can release it the same way, through
         * perf_detach_cgroup().
         */
        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        ret = perf_cgroup_ensure_storage(event, css);
        if (ret) {
                /* Drop the css reference; nothing else would release it. */
                perf_detach_cgroup(event);
                goto out;
        }

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
out:
        fdput(f);
        return ret;
}

/*
 * Account a cgroup event being added to @ctx. The first such event also
 * puts the CPU context on the cgrp_cpuctx_list of event->cpu. Does nothing
 * for events without a cgroup.
 */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        /*
         * Cgroup events are always per-cpu events, hence
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /*
         * cpuctx->cgrp is only set when the current cgroup matches the
         * event's cgroup, so this has to be attempted for every new event:
         * if the first one mismatched and later ones did not try again,
         * cpuctx->cgrp would stay unset.
         */
        if (ctx->is_active && !cpuctx->cgrp) {
                struct perf_cgroup *cur = perf_cgroup_from_task(current, ctx);

                if (cgroup_is_descendant(cur->css.cgroup, event->cgrp->css.cgroup))
                        cpuctx->cgrp = cur;
        }

        /* Only the first cgroup event links the CPU context into the list. */
        if (ctx->nr_cgroups++ != 0)
                return;

        list_add(&cpuctx->cgrp_cpuctx_entry,
                        per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}

/*
 * Account a cgroup event being removed from @ctx. When the last one goes,
 * the CPU context's cgroup is cleared (if the context is active) and the
 * CPU context is unlinked from the cgroup list. Does nothing for events
 * without a cgroup.
 */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        /*
         * Cgroup events are always per-cpu events, hence
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /* Other cgroup events remain: nothing more to do. */
        if (--ctx->nr_cgroups != 0)
                return;

        if (ctx->is_active && cpuctx->cgrp)
                cpuctx->cgrp = NULL;

        list_del(&cpuctx->cgrp_cpuctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */

/*
 * Stubs used when CONFIG_CGROUP_PERF is not set. They make the cgroup
 * hooks no-ops: every event matches, no event is a cgroup event, and
 * connecting an event to a cgroup fails with -EINVAL.
 */

/* Without cgroup support every event matches. */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

/* No event is ever a cgroup event in this configuration. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                                                bool final)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
}

/* Attaching to a cgroup is not possible here: always fails with -EINVAL. */
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}

/*
 * NOTE(review): the CONFIG_CGROUP_PERF version of this function takes
 * (struct task_struct *task, int mode); this stub's second parameter is a
 * task pointer instead. Confirm against the callers before relying on it.
 */
static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        return 0;
}

static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 *
 * Default multiplexing interval in milliseconds (one tick); used by
 * __perf_mux_hrtimer_init() when a PMU does not provide an interval.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_context *cpuctx;
        bool rotations;

        lockdep_assert_irqs_disabled();

        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);

        raw_spin_lock(&cpuctx->hrtimer_lock);
        if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
        else
                cpuctx->hrtimer_active = 0;
        raw_spin_unlock(&cpuctx->hrtimer_lock);

        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

/*
 * Set up the multiplexing hrtimer of @cpuctx.
 *
 * Nothing is done for a PMU that uses the software context. Otherwise the
 * PMU's interval (in ms) is sanitised, converted to ktime and stored in
 * @cpuctx, and the hrtimer lock and timer are initialised. @cpu is not used.
 */
static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
        struct pmu *pmu = cpuctx->ctx.pmu;
        struct hrtimer *timer;
        u64 interval;

        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
                return;

        /*
         * An interval below 1 counts as "not set": fall back to the
         * default (1/tick) and write it back to the PMU as well.
         */
        interval = pmu->hrtimer_interval_ms;
        if (interval < 1) {
                pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
                interval = pmu->hrtimer_interval_ms;
        }

        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

        raw_spin_lock_init(&cpuctx->hrtimer_lock);

        timer = &cpuctx->hrtimer;
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
}

/*
 * Arm the multiplexing hrtimer of @cpuctx unless it is already marked
 * active. PMUs that use the software context are skipped. Always returns 0.
 */
static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
        unsigned long flags;

        /* not for SW PMU */
        if (cpuctx->ctx.pmu->task_ctx_nr == perf_sw_context)
                return 0;

        raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
        if (cpuctx->hrtimer_active)
                goto unlock;

        cpuctx->hrtimer_active = 1;
        hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
        hrtimer_start_expires(&cpuctx->hrtimer, HRTIMER_MODE_ABS_PINNED_HARD);
unlock:
        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

        return 0;
}

/*
 * void * adapter around perf_mux_hrtimer_restart(); @arg is handed on as
 * the struct perf_cpu_context pointer.
 */
static int perf_mux_hrtimer_restart_ipi(void *arg)
{
        struct perf_cpu_context *cpuctx = arg;

        return perf_mux_hrtimer_restart(cpuctx);
}

/*
 * Bump this CPU's disable count for @pmu. Only the 0 -> 1 transition
 * actually calls pmu->pmu_disable().
 */
void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        int nested = (*count)++;

        if (!nested)
                pmu->pmu_disable(pmu);
}

/*
 * Drop this CPU's disable count for @pmu. Only the transition back to 0
 * actually calls pmu->pmu_enable().
 */
void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        int remaining = --(*count);

        if (!remaining)
                pmu->pmu_enable(pmu);
}

/* Per-CPU list head; contexts are linked in via their ->active_ctx_list. */
static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate() and
 * perf_event_task_tick() are fully serialized against each other: all three
 * are strictly CPU affine, the first two are called with IRQs disabled and
 * perf_event_task_tick() is called from IRQ context.
 *
 * Link @ctx into this CPU's active list; warns if it is already on a list.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
        lockdep_assert_irqs_disabled();

        WARN_ON(!list_empty(&ctx->active_ctx_list));

        list_add(&ctx->active_ctx_list, this_cpu_ptr(&active_ctx_list));
}

/*
 * Unlink @ctx from the active context list it is on and re-initialise its
 * list entry; warns if it was not on any list. IRQs must be disabled.
 */
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
        struct list_head *entry = &ctx->active_ctx_list;

        lockdep_assert_irqs_disabled();

        WARN_ON(list_empty(entry));

        list_del_init(entry);
}

/* Take one more reference on @ctx. */
static void
get_ctx(struct perf_event_context *ctx)
{
        refcount_inc(&ctx->refcount);
}

/*
 * Allocate zeroed task context data from @pmu's cache.
 * Returns NULL when the PMU has no such cache (or the allocation fails).
 */
static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (!pmu->task_ctx_cache)
                return NULL;

        return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
}

/*
 * Give @task_ctx_data back to @pmu's cache. Does nothing when the PMU has
 * no cache or when @task_ctx_data is NULL.
 */
static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        if (!pmu->task_ctx_cache)
                return;
        if (!task_ctx_data)
                return;

        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

/*
 * RCU callback (queued by put_ctx()): release the task context data and
 * then the context that embeds @head.
 */
static void free_ctx(struct rcu_head *head)
{
        struct perf_event_context *ctx =
                container_of(head, struct perf_event_context, rcu_head);

        free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
        kfree(ctx);
}

/*
 * Drop a reference on @ctx.
 *
 * The final put also drops the reference on the parent context (if any) and
 * on the task (unless it is NULL or the tombstone), and queues the context
 * for freeing via call_rcu().
 */
static void put_ctx(struct perf_event_context *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        if (ctx->parent_ctx)
                put_ctx(ctx->parent_ctx);
        if (ctx->task && ctx->task != TASK_TOMBSTONE)
                put_task_struct(ctx->task);

        call_rcu(&ctx->rcu_head, free_ctx);
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()        [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                        [ parent, 1 ]
 *
 *  - perf_event_init_context()                [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()        [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    exec_update_lock
 *        task_struct::perf_event_mutex
 *          perf_event_context::mutex
 *            perf_event::child_mutex;
 *              perf_event_context::lock
 *            mmap_lock
 *              perf_event::mmap_mutex
 *                perf_buffer::aux_mutex
 *              perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *          cpuctx->mutex / perf_event_context::mutex
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        /*
         * Snapshot event->ctx under RCU and try to take a reference on it;
         * if its refcount already dropped to zero, start over.
         */
        rcu_read_lock();
        ctx = READ_ONCE(event->ctx);
        if (!refcount_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        /*
         * event->ctx may have changed while we were waiting for the mutex;
         * if so, release this context and retry with the current one.
         */
        mutex_lock_nested(&ctx->mutex, nesting);
        if (event->ctx != ctx) {
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        /* Returned with ctx->mutex held and a reference on ctx. */
        return ctx;
}

/* perf_event_ctx_lock_nested() with a nesting level of 0. */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        const int nesting = 0;

        return perf_event_ctx_lock_nested(event, nesting);
}

/*
 * Unlock ctx->mutex and drop one reference on @ctx. @event is not used.
 */
static void
perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * Detach @ctx from its parent context (if any) and bump its generation.
 *
 * This must be done under ctx->lock, so as to serialize against
 * context_equiv(). For that reason put_ctx() cannot be called here: it might
 * end up taking scheduler related locks, and ctx->lock nests inside those.
 * The former parent is returned so the caller can deal with it instead.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *old_parent;

        lockdep_assert_held(&ctx->lock);

        old_parent = ctx->parent_ctx;
        if (old_parent)
                ctx->parent_ctx = NULL;
        ctx->generation++;

        return old_parent;
}

/*
 * Look up @p's id of kind @type in the pid namespace recorded for @event.
 *
 * Only top level events have the pid namespace they were created in, so an
 * event with a parent uses its parent's. A result of 0 is kept as-is while
 * @p is still alive; it is turned into -1 only once pid_alive() fails.
 */
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
                                enum pid_type type)
{
        struct perf_event *top = event->parent ? event->parent : event;
        u32 nr = __task_pid_nr_ns(p, type, top->ns);

        if (!nr && !pid_alive(p))
                nr = -1;

        return nr;
}

/* perf_event_pid_type() for PIDTYPE_TGID. */
static u32
perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

/* perf_event_pid_type() for PIDTYPE_PID. */
static u32
perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * The event id to return to userspace: an inherited event reports its
 * parent's id, any other event reports its own.
 */
static u64 primary_event_id(struct perf_event *event)
{
        return event->parent ? event->parent->id : event->id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 *
 * On success the context is returned with ctx->lock held, interrupts
 * disabled (the previous state is saved in *@flags) and its refcount
 * incremented. NULL is returned, with interrupts restored and no lock held,
 * when the task has no context in slot @ctxn, when the context's task is
 * TASK_TOMBSTONE, or when its refcount already dropped to zero.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                /*
                 * A tombstoned context, or one whose last reference is
                 * already gone, is reported as "no context".
                 */
                if (ctx->task == TASK_TOMBSTONE ||
                    !refcount_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        /* Interrupts stay disabled only when a locked context is returned. */
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

/* Decrement @ctx's pin_count under ctx->lock. */
static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        ctx->pin_count--;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 *
 * With @adv set, ctx->time first advances by the time elapsed since
 * ctx->timestamp; ctx->timestamp is moved to "now" either way.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
        const u64 now = perf_clock();

        if (adv) {
                u64 delta = now - ctx->timestamp;

                ctx->time += delta;
        }
        ctx->timestamp = now;

        /*
         * time' = time + (now - timestamp) can be rewritten as
         * time' = now + (time - timestamp): a single offset is enough to
         * compute a future time without holding any lock.
         *
         * See perf_event_time_now(), which can be used from NMI context where
         * it's (obviously) not possible to acquire ctx->lock in order to read
         * both the above values in a consistent manner.
         */
        WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}

static void update_context_time(struct perf_event_context *ctx)
{
        __update_context_time(ctx, true);
}

/*
 * Current time of @event: 0 without a context, the cgroup event time for a
 * cgroup event, ctx->time otherwise.
 */
static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        return is_cgroup_event(event) ? perf_cgroup_event_time(event)
                                      : ctx->time;
}

/*
 * Time of @event at clock value @now.
 *
 * Returns 0 without a context and defers to perf_cgroup_event_time_now()
 * for cgroup events. Otherwise, if EVENT_TIME is set in ctx->is_active, the
 * result is @now plus ctx->timeoffset; if not, the stored ctx->time.
 */
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time_now(event, now);

        if (__load_acquire(&ctx->is_active) & EVENT_TIME)
                return now + READ_ONCE(ctx->timeoffset);

        return ctx->time;
}

/*
 * Event type bits for @event: EVENT_PINNED or EVENT_FLEXIBLE, plus EVENT_CPU
 * when the event's context has no task. Needs ctx->lock held.
 */
static enum event_type_t get_event_type(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        /*
         * It's 'group type', really, because if our group leader is
         * pinned, so are we.
         */
        struct perf_event *leader = event->group_leader;
        enum event_type_t event_type;

        lockdep_assert_held(&ctx->lock);

        if (leader->attr.pinned)
                event_type = EVENT_PINNED;
        else
                event_type = EVENT_FLEXIBLE;

        if (!ctx->task)
                event_type |= EVENT_CPU;

        return event_type;
}

/*
 * Reset the group tree linkage of @event: zero group index, cleared node.
 */
static void init_event_group(struct perf_event *event)
{
        event->group_index = 0;
        RB_CLEAR_NODE(&event->group_node);
}

/*
 * Pick the groups tree of @ctx that matches @event's attr.pinned bit:
 * the pinned groups when it is set, the flexible groups otherwise.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        return event->attr.pinned ? &ctx->pinned_groups
                                  : &ctx->flexible_groups;
}

/*
 * Helper function to initialize a perf_event_groups tree: empty tree,
 * index counter at zero.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
        groups->index = 0;
        groups->tree = RB_ROOT;
}

/*
 * Compare function for event groups.
 *
 * Implements a compound key: events sort by CPU first, then (with
 * CONFIG_CGROUP_PERF) by cgroup, and finally by group_index, which provides
 * the ordering used when rotating groups on the same CPU.
 */
static bool
perf_event_groups_less(struct perf_event *left, struct perf_event *right)
{
        if (left->cpu != right->cpu)
                return left->cpu < right->cpu;

#ifdef CONFIG_CGROUP_PERF
        if (left->cgrp != right->cgrp) {
                /*
                 * No cgroup comes first. The left side is tested first, so
                 * it also wins when neither side has a cgroup.
                 */
                if (!left->cgrp || !left->cgrp->css.cgroup)
                        return true;
                if (!right->cgrp || !right->cgrp->css.cgroup)
                        return false;

                /* Two dissimilar cgroups, order by id. */
                return left->cgrp->css.cgroup->kn->id <
                       right->cgrp->css.cgroup->kn->id;
        }
#endif

        return left->group_index < right->group_index;
}

/*
 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 * key (see perf_event_groups_less). This places it last inside the CPU
 * subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        struct perf_event *node_event;
        struct rb_node *parent;
        struct rb_node **node;

        event->group_index = ++groups->index;

        node = &groups->tree.rb_node;
        parent = *node;

        while (*node) {
                parent = *node;
                node_event = container_of(*node, struct perf_event, group_node);

                if (perf_event_groups_less(event, node_event))
                        node = &parent->rb_left;
                else
                        node = &parent->rb_right;
        }

        rb_link_node(&event->group_node, parent, node);
        rb_insert_color(&event->group_node, &groups->tree);
}

/*
 * Insert @event into whichever groups tree of @ctx (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}

/*
 * Delete a group from a tree and reset the event's group linkage.
 * Warns (once) if the event is not linked or the tree is empty.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        struct rb_node *victim = &event->group_node;

        WARN_ON_ONCE(RB_EMPTY_NODE(victim) ||
                     RB_EMPTY_ROOT(&groups->tree));

        rb_erase(victim, &groups->tree);
        init_event_group(event);
}

/*
 * Remove @event from whichever groups tree of @ctx (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}

/*
 * Get the leftmost event in the cpu/cgroup subtree.
 *
 * Returns NULL when the tree holds no event for @cpu (and, with
 * CONFIG_CGROUP_PERF, for the id of @cgrp; a NULL @cgrp is treated as id 0).
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct cgroup *cgrp)
{
        struct rb_node *node = groups->tree.rb_node;
        struct perf_event *match = NULL;
#ifdef CONFIG_CGROUP_PERF
        u64 cgrp_id = cgrp ? cgrp->kn->id : 0;
#endif

        while (node) {
                struct perf_event *cand =
                        container_of(node, struct perf_event, group_node);
                /* <0: key is left of cand, >0: right of cand, 0: same key */
                int cmp = 0;

                if (cpu < cand->cpu)
                        cmp = -1;
                else if (cpu > cand->cpu)
                        cmp = 1;

#ifdef CONFIG_CGROUP_PERF
                if (!cmp) {
                        u64 node_cgrp_id = 0;

                        if (cand->cgrp && cand->cgrp->css.cgroup)
                                node_cgrp_id = cand->cgrp->css.cgroup->kn->id;

                        if (cgrp_id < node_cgrp_id)
                                cmp = -1;
                        else if (cgrp_id > node_cgrp_id)
                                cmp = 1;
                }
#endif
                if (cmp > 0) {
                        node = node->rb_right;
                        continue;
                }

                /* Remember a hit, then keep looking further to the left. */
                if (!cmp)
                        match = cand;
                node = node->rb_left;
        }

        return match;
}

/*
 * Step to the in-order successor of @event, but only while it stays inside
 * the same subtree: NULL is returned at the end of the tree, when the next
 * event is for another CPU, or (with CONFIG_CGROUP_PERF) when it has a
 * different cgroup id.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
        struct perf_event *next;
#ifdef CONFIG_CGROUP_PERF
        u64 curr_cgrp_id, next_cgrp_id;
#endif

        next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
        if (!next)
                return NULL;
        if (next->cpu != event->cpu)
                return NULL;

#ifdef CONFIG_CGROUP_PERF
        /* An event without a cgroup counts as cgroup id 0. */
        curr_cgrp_id = 0;
        if (event->cgrp && event->cgrp->css.cgroup)
                curr_cgrp_id = event->cgrp->css.cgroup->kn->id;

        next_cgrp_id = 0;
        if (next->cgrp && next->cgrp->css.cgroup)
                next_cgrp_id = next->cgrp->css.cgroup->kn->id;

        if (curr_cgrp_id != next_cgrp_id)
                return NULL;
#endif
        return next;
}

/*
 * Iterate through the whole groups tree, in rb_first()/rb_next() order.
 *
 * @event:  loop cursor (struct perf_event *); NULL once the loop finishes.
 * @groups: pointer to the struct perf_event_groups to walk.
 */
#define perf_event_groups_for_each(event, groups)                        \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),                \
                                typeof(*event), group_node); event;        \
                event = rb_entry_safe(rb_next(&event->group_node),        \
                                typeof(*event), group_node))

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        lockdep_assert_held(&ctx->lock);

        /* Adding an event that is already attached is flagged (once). */
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        event->tstamp = perf_event_time(event);

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                event->group_caps = event->event_caps;
                add_event_to_groups(event, ctx);
        }

        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;

        /* Events in a state above OFF are reported to the cgroup code. */
        if (event->state > PERF_EVENT_STATE_OFF)
                perf_cgroup_event_enable(event, ctx);

        ctx->generation++;
}

/*
 * Initialize event state based on perf_event_attr::disabled: OFF when the
 * bit is set, INACTIVE otherwise.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
        if (event->attr.disabled)
                event->state = PERF_EVENT_STATE_OFF;
        else
                event->state = PERF_EVENT_STATE_INACTIVE;
}

/*
 * Size in bytes of the read() data for the given @read_format and number of
 * siblings: a fixed part plus one entry per reported event.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
        int per_event = sizeof(u64);        /* value */
        int fixed = 0;
        int nr_events = 1;

        /* Fields that are present once per event. */
        if (read_format & PERF_FORMAT_ID)
                per_event += sizeof(u64);
        if (read_format & PERF_FORMAT_LOST)
                per_event += sizeof(u64);

        /* Fields that are present once per read. */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                fixed += sizeof(u64);
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                fixed += sizeof(u64);

        /* A group read has one extra u64 and one entry per sibling. */
        if (read_format & PERF_FORMAT_GROUP) {
                fixed += sizeof(u64);
                nr_events += nr_siblings;
        }

        /*
         * Since perf_event_validate_size() limits this to 16k and inhibits
         * adding more siblings, this will never overflow.
         */
        return fixed + nr_events * per_event;
}

/*
 * Compute event->header_size: the sum of the sizes of the sample fields
 * selected in @sample_type that are handled here. PERF_SAMPLE_READ
 * contributes event->read_size, so that must be up to date.
 */
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
        struct perf_sample_data *d;        /* only used inside sizeof() */
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_IP)
                bytes += sizeof(d->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                bytes += sizeof(d->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                bytes += sizeof(d->period);

        if (sample_type & PERF_SAMPLE_WEIGHT)
                bytes += sizeof(d->weight);

        if (sample_type & PERF_SAMPLE_READ)
                bytes += event->read_size;

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                bytes += sizeof(d->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                bytes += sizeof(d->txn);

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                bytes += sizeof(d->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                bytes += sizeof(d->cgroup);

        event->header_size = bytes;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group. Recomputes event->read_size from the group leader's sibling count
 * and then event->header_size.
 */
static void perf_event__header_size(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        event->read_size = __perf_event_read_size(event->attr.read_format,
                                                  leader->nr_siblings);
        __perf_event_header_size(event, event->attr.sample_type);
}

/*
 * Compute event->id_header_size: the sum of the sizes of the id-related
 * sample fields selected in the event's sample_type.
 */
static void perf_event__id_header_size(struct perf_event *event)
{
        const u64 sample_type = event->attr.sample_type;
        struct perf_sample_data *d;        /* only used inside sizeof() */
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_TID)
                bytes += sizeof(d->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                bytes += sizeof(d->time);

        /* IDENTIFIER and ID each account for one data->id sized field. */
        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                bytes += sizeof(d->id);

        if (sample_type & PERF_SAMPLE_ID)
                bytes += sizeof(d->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                bytes += sizeof(d->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                bytes += sizeof(d->cpu_entry);

        event->id_header_size = bytes;
}

/*
 * Check that adding an event to the group does not result in anybody
 * overflowing the 64k event limit imposed by the output buffer.
 *
 * Specifically, check that the read_size does not exceed 16k once the group
 * has one more sibling; read_size is the one term that grows with group
 * size. Since read_size depends on the per-event read_format, the leader and
 * the existing siblings are (re)checked as well.
 *
 * This leaves 48k for the constant size fields and things like callchains,
 * branch stacks and register sets.
 */
static bool perf_event_validate_size(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader;
        struct perf_event *sibling;
        const int limit = 16*1024;
        int nr = group_leader->nr_siblings + 1;

        if (__perf_event_read_size(event->attr.read_format, nr) > limit)
                return false;

        if (__perf_event_read_size(group_leader->attr.read_format, nr) > limit)
                return false;

        /*
         * When creating a new group leader, group_leader->ctx is initialized
         * after the size has been validated, but we cannot safely use
         * for_each_sibling_event() until group_leader->ctx is set. A new group
         * leader cannot have any siblings yet, so we can safely skip checking
         * the non-existent siblings.
         */
        if (event == group_leader)
                return true;

        for_each_sibling_event(sibling, group_leader) {
                if (__perf_event_read_size(sibling->attr.read_format, nr) > limit)
                        return false;
        }

        return true;
}

/*
 * Attach @event to its group. A group leader only gets PERF_ATTACH_GROUP
 * set; a sibling is also appended to the leader's sibling list, after which
 * the header sizes of the leader and of every sibling are recomputed.
 * Must be called with event->ctx->lock held.
 */
static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        lockdep_assert_held(&event->ctx->lock);

        /*
         * We can have double attach due to group movement in perf_event_open.
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        if (group_leader == event)
                return;

        WARN_ON_ONCE(group_leader->ctx != event->ctx);

        /* The group keeps only the capabilities shared by all its members. */
        group_leader->group_caps &= event->event_caps;

        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
        group_leader->group_generation++;

        /* nr_siblings changed, so every member's sizes must be refreshed. */
        perf_event__header_size(group_leader);

        for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Calling this for an event that is not attached to a context is a no-op.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        /* Only stand alone events and group leaders sit in the groups tree. */
        if (event->group_leader == event)
                del_event_from_groups(event, ctx);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        }

        ctx->generation++;
}

/*
 * Ask @event's PMU whether @aux_event is acceptable for its aux output.
 * Returns 0 when @aux_event has no AUX capability or when the PMU provides
 * no ->aux_output_match() callback; otherwise the callback's result.
 */
static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
        if (!has_aux(aux_event) || !event->pmu->aux_output_match)
                return 0;

        return event->pmu->aux_output_match(aux_event);
}

/* Forward declarations: the helpers below use these before they are defined. */
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
                            struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx);

/*
 * Tear down the aux_event links that involve @event.
 *
 * If @event itself uses an aux_event, only that link is dropped, together
 * with the reference held on the aux event. Otherwise @event may be the aux
 * event of some siblings in its group: each such sibling loses its link (and
 * the reference it held on @event), is scheduled out and is put into ERROR
 * state.
 */
static void perf_put_aux_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *iter;

        /*
         * If event uses aux_event tear down the link
         */
        if (event->aux_event) {
                iter = event->aux_event;
                event->aux_event = NULL;
                put_event(iter);
                return;
        }

        /*
         * If the event is an aux_event, tear down all links to
         * it from other events.
         */
        for_each_sibling_event(iter, event->group_leader) {
                if (iter->aux_event != event)
                        continue;

                iter->aux_event = NULL;
                put_event(event);

                /*
                 * If it's ACTIVE, schedule it out and put it into ERROR
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 *
                 * The state change must hit @iter, the sibling that just
                 * lost its aux event, not @event.
                 */
                event_sched_out(iter, cpuctx, ctx);
                perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
        }
}

static bool perf_need_aux_event(struct perf_event *event)
{
        return !!event->attr.aux_output || !!event->attr.aux_sample_size;
}

static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
{
        /*
         * Our group leader must be an aux event if we want to be
         * an aux_output. This way, the aux event will precede its
         * aux_output events in the group, and therefore will always
         * schedule first.
         */
        if (!group_leader)
                return 0;

        /*
         * aux_output and aux_sample_size are mutually exclusive.
         */
        if (event->attr.aux_output && event->attr.aux_sample_size)
                return 0;

        if (event->attr.aux_output &&
            !perf_aux_output_match(event, group_leader))
                return 0;

        if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                return 0;

        if (!atomic_long_inc_not_zero(&group_leader->refcount))
                return 0;

        /*
         * Link aux_outputs to their aux event; this is undone in
         * perf_group_detach() by perf_put_aux_event(). When the
         * group in torn down, the aux_output events loose their
         * link to the aux_event and can't schedule any more.
         */
        event->aux_event = group_leader;

        return 1;
}

/*
 * Select the active list of the event's context that matches @event:
 * pinned_active for pinned events, flexible_active for all others.
 */
static inline struct list_head *get_event_list(struct perf_event *event)
{
        if (event->attr.pinned)
                return &event->ctx->pinned_active;

        return &event->ctx->flexible_active;
}

/*
 * An event with PERF_EV_CAP_SIBLING must be part of a group and cannot
 * exist on its own. When it loses its group, schedule it out and park it
 * in the ERROR state. Also see _perf_event_enable(): it will not recover
 * such a detached event from the ERROR state.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
        struct perf_cpu_context *cpuctx = __get_cpu_context(event->ctx);

        event_sched_out(event, cpuctx, event->ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

/*
 * Take @event out of its group. The caller must hold event->ctx->lock.
 *
 * A sibling is simply unlinked from its leader's sibling list. A leader
 * dissolves the group: each former sibling becomes its own group leader.
 * Either way, the header sizes of the (old) leader and of any siblings it
 * still has are recomputed before returning.
 *
 * Calling this on an event that is not attached to a group is a no-op.
 */
static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling, *tmp;
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        /* Undo the aux_event linkage set up by perf_get_aux_event(). */
        perf_put_aux_event(event);

        /*
         * If this is a sibling, remove it from its group.
         */
        if (leader != event) {
                list_del_init(&event->sibling_list);
                /* Record the departure on the leader. */
                event->group_leader->nr_siblings--;
                event->group_leader->group_generation++;
                goto out;
        }

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

                /* Siblings that cannot live without a group go to ERROR. */
                if (sibling->event_caps & PERF_EV_CAP_SIBLING)
                        perf_remove_sibling_event(sibling);

                sibling->group_leader = sibling;
                list_del_init(&sibling->sibling_list);

                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;

                /*
                 * Only when the old leader's group_node is still linked:
                 * add the new singleton to the context's groups, and put
                 * it on the matching active list if it is running.
                 */
                if (!RB_EMPTY_NODE(&event->group_node)) {
                        add_event_to_groups(sibling, event->ctx);

                        if (sibling->state == PERF_EVENT_STATE_ACTIVE)
                                list_add_tail(&sibling->active_list, get_event_list(sibling));
                }

                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }

out:
        /*
         * Group membership changed: recompute header sizes for the old
         * leader and for whatever siblings it still has.
         */
        for_each_sibling_event(tmp, leader)
                perf_event__header_size(tmp);

        perf_event__header_size(leader);
}

/* An event is considered orphaned once it is in the DEAD state. */
static bool is_orphaned_event(struct perf_event *event)
{
        const bool dead = (event->state == PERF_EVENT_STATE_DEAD);

        return dead;
}

/*
 * Ask the event's PMU whether it accepts @event. A PMU that provides no
 * filter_match() callback accepts every event.
 */
static inline int __pmu_filter_match(struct perf_event *event)
{
        if (!event->pmu->filter_match)
                return 1;

        return event->pmu->filter_match(event);
}

/*
 * Decide, based on PMU-specific filtering, whether it is worth trying to
 * schedule an event group. A group may mix HW and SW events, possibly with
 * a SW leader, so the filter of every member has to agree:
 *
 * Returns 1 if @event and all of its siblings pass their PMU filter,
 * 0 as soon as one of them does not.
 */
static inline int pmu_filter_match(struct perf_event *event)
{
        struct perf_event *member;
        int schedulable = 0;

        if (!__pmu_filter_match(event))
                goto done;

        for_each_sibling_event(member, event) {
                if (!__pmu_filter_match(member))
                        goto done;
        }

        schedulable = 1;
done:
        return schedulable;
}

/*
 * Check whether @event may run here and now: it must be unbound
 * (cpu == -1) or bound to the current CPU, it must match the current
 * cgroup, and it must pass the PMU filters of its whole group.
 */
static inline int
event_filter_match(struct perf_event *event)
{
        if (event->cpu != -1 && event->cpu != smp_processor_id())
                return 0;

        if (!perf_cgroup_match(event))
                return 0;

        return pmu_filter_match(event) ? 1 : 0;
}

/*
 * Take an ACTIVE @event off its PMU and update the bookkeeping in @ctx and
 * @cpuctx accordingly. The caller must hold ctx->lock.
 *
 * Events that are not ACTIVE are left untouched. Afterwards the event is
 * INACTIVE, or OFF if a disable request was pending (pending_disable >= 0).
 */
static void
event_sched_out(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
{
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        /*
         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
         * we can schedule events _OUT_ individually through things like
         * __perf_remove_from_context().
         */
        list_del_init(&event->active_list);

        perf_pmu_disable(event->pmu);

        event->pmu->del(event, 0);
        event->oncpu = -1;

        /*
         * perf_event_disable_inatomic() stores a CPU number in
         * pending_disable; consume such a request here and end up in OFF
         * instead of INACTIVE.
         */
        if (READ_ONCE(event->pending_disable) >= 0) {
                WRITE_ONCE(event->pending_disable, -1);
                perf_cgroup_event_disable(event, ctx);
                state = PERF_EVENT_STATE_OFF;
        }
        perf_event_set_state(event, state);

        /* Reverse the accounting done by event_sched_in(). */
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        if (!--ctx->nr_active)
                perf_event_ctx_deactivate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
        /* Exclusivity ends with the exclusive event or the last HW event. */
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;

        perf_pmu_enable(event->pmu);
}

static void
group_sched_out(struct perf_event *group_event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        struct perf_event *event;

        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        perf_pmu_disable(ctx->pmu);

        event_sched_out(group_event, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        for_each_sibling_event(event, group_event)
                event_sched_out(event, cpuctx, ctx);

        perf_pmu_enable(ctx->pmu);
}

/* Removal flag: also detach the event from its group (perf_group_detach()). */
#define DETACH_GROUP        0x01UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 *
 * @info carries the removal flags as an unsigned long; with DETACH_GROUP
 * set the event is also detached from its group.
 */
static void
__perf_remove_from_context(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           void *info)
{
        unsigned long flags = (unsigned long)info;

        /* Bring the time accounting up to date before the event goes away. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }

        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        list_del_event(event, ctx);

        /*
         * That was the last event of an active context: mark the context
         * inactive and, for a task context, unhook it from the cpu context.
         */
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);

                ctx->is_active = 0;
                ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
                }
        }
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 *
 * The caller must hold ctx->mutex. @flags may contain DETACH_GROUP to
 * also detach the event from its group.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->mutex);

        event_function_call(event, __perf_remove_from_context, (void *)flags);

        /*
         * The above event_function_call() can NO-OP when it hits
         * TASK_TOMBSTONE. In that case we must already have been detached
         * from the context (by perf_event_exit_event()) but the grouping
         * might still be intact.
         */
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        if ((flags & DETACH_GROUP) &&
            (event->attach_state & PERF_ATTACH_GROUP)) {
                /*
                 * Since in that case we cannot possibly be scheduled, simply
                 * detach now.
                 */
                raw_spin_lock_irq(&ctx->lock);
                perf_group_detach(event);
                raw_spin_unlock_irq(&ctx->lock);
        }
}

/*
 * Cross CPU call to disable a performance event
 *
 * Schedules the event out (the whole group if @event is a group leader)
 * and moves it to the OFF state. @info is unused.
 */
static void __perf_event_disable(struct perf_event *event,
                                 struct perf_cpu_context *cpuctx,
                                 struct perf_event_context *ctx,
                                 void *info)
{
        /* Events in a state below INACTIVE need no further disabling. */
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;

        /* Bring the time accounting up to date before changing state. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        /* A leader takes its whole group off; a sibling only itself. */
        if (event == event->group_leader)
                group_sched_out(event, cpuctx, ctx);
        else
                event_sched_out(event, cpuctx, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state <= PERF_EVENT_STATE_OFF) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_disable, NULL);
}

/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() instead of the event_function_call() used by
 * _perf_event_disable().
 */
void perf_event_disable_local(struct perf_event *event)
{
        event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Disable @event with its context locked.
 *
 * Strictly speaking kernel users cannot create groups, so this interface
 * would not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_disable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

/*
 * Request a deferred disable of @event: record the current CPU in
 * event->pending_disable and queue the event's irq_work. A non-negative
 * pending_disable is also what makes event_sched_out() leave the event in
 * the OFF state.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
        WRITE_ONCE(event->pending_disable, smp_processor_id());
        /* can fail, see perf_pending_event_disable() */
        irq_work_queue(&event->pending);
}

/*
 * Sentinel for hw.interrupts: event_sched_in() treats an event whose
 * interrupt count equals this value as throttled and unthrottles it.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Forward declarations; both are called from event_sched_in() below. */
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

/*
 * Put @event on its PMU on the current CPU and update the bookkeeping in
 * @ctx and @cpuctx. The caller must hold ctx->lock.
 *
 * Returns 0 on success, and also when the event is OFF or below (nothing
 * is done then). Returns -EAGAIN when pmu->add() fails; the event is then
 * put back to INACTIVE with oncpu == -1.
 */
static int
event_sched_in(struct perf_event *event,
                 struct perf_cpu_context *cpuctx,
                 struct perf_event_context *ctx)
{
        int ret = 0;

        WARN_ON_ONCE(event->ctx != ctx);

        lockdep_assert_held(&ctx->lock);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
         * Order event::oncpu write to happen before the ACTIVE state is
         * visible. This allows perf_event_{stop,read}() to observe the correct
         * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        perf_pmu_disable(event->pmu);

        perf_log_itrace_start(event);

        /* The PMU refused the event: roll the state back and report it. */
        if (event->pmu->add(event, PERF_EF_START)) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }

        /* Accounting; event_sched_out() reverses each of these. */
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
                perf_event_ctx_activate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;

        if (event->attr.exclusive)
                cpuctx->exclusive = 1;

out:
        perf_pmu_enable(event->pmu);

        return ret;
}

/*
 * Schedule @group_event and all of its siblings in as a single PMU
 * transaction.
 *
 * Returns 0 on success, and also when @group_event is OFF (nothing is
 * done then). Returns -EAGAIN when any member could not be scheduled in
 * or the transaction could not be committed; the transaction is then
 * cancelled, after scheduling out whatever had already been scheduled in.
 */
static int
group_sched_in(struct perf_event *group_event,
               struct perf_cpu_context *cpuctx,
               struct perf_event_context *ctx)
{
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = ctx->pmu;

        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

        /* Nothing was scheduled in yet, so only the txn needs cancelling. */
        if (event_sched_in(group_event, cpuctx, ctx))
                goto error;

        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
        }

        /*
         * A failed commit falls through to group_error with partial_group
         * still NULL, so every sibling gets scheduled out again.
         */
        if (!pmu->commit_txn(pmu))
                return 0;

group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
        for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;

                event_sched_out(event, cpuctx, ctx);
        }
        event_sched_out(group_event, cpuctx, ctx);

error:
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 *
 * Returns 1 for groups flagged PERF_EV_CAP_SOFTWARE, 0 when exclusivity
 * rules forbid it, and @can_add_hw in every other case.
 */
static int group_can_go_on(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           int can_add_hw)
{
        int verdict;

        if (event->group_caps & PERF_EV_CAP_SOFTWARE) {
                /* Pure software groups can always go on. */
                verdict = 1;
        } else if (cpuctx->exclusive) {
                /* An exclusive group is on: no other hardware events. */
                verdict = 0;
        } else if (event->attr.exclusive &&
                   !list_empty(get_event_list(event))) {
                /* An exclusive group cannot join events already on the CPU. */
                verdict = 0;
        } else {
                /* Otherwise it depends on whether all earlier groups fit. */
                verdict = can_add_hw;
        }

        return verdict;
}

/*
 * Link @event into @ctx: list_add_event() followed by perf_group_attach().
 * NOTE(review): every caller visible here holds a ctx lock around this;
 * confirm against the helpers' own lockdep assertions.
 */
static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
{
        list_add_event(event, ctx);
        perf_group_attach(event);
}

/*
 * Forward declarations: both functions are used below and defined
 * elsewhere in this file.
 */
static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task);

/*
 * Schedule out events of type @event_type from the task context @ctx.
 * Nothing happens when no task context is installed on @cpuctx; a @ctx
 * that differs from the installed one triggers a one-time warning and is
 * ignored.
 */
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
                               struct perf_event_context *ctx,
                               enum event_type_t event_type)
{
        if (cpuctx->task_ctx) {
                if (!WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                        ctx_sched_out(ctx, cpuctx, event_type);
        }
}

/*
 * Schedule events in, in priority order: CPU pinned, task pinned,
 * CPU flexible, task flexible. @ctx is the task context and may be NULL,
 * in which case only the CPU context is scheduled in.
 */
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                struct task_struct *task)
{
        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
        if (ctx)
                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
        if (ctx)
                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 *
 * @task_ctx may be NULL when there is no task context to reschedule. The
 * whole sequence runs with the cpu context's PMU disabled.
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
{
        enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);

        /*
         * If pinned groups are involved, flexible groups also need to be
         * scheduled out.
         */
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;

        /* Keep only the EVENT_ALL bits, dropping e.g. EVENT_CPU. */
        ctx_event_type = event_type & EVENT_ALL;

        perf_pmu_disable(cpuctx->ctx.pmu);
        if (task_ctx)
                task_ctx_sched_out(cpuctx, task_ctx, event_type);

        /*
         * Decide which cpu ctx groups to schedule out based on the types
         * of events that caused rescheduling:
         *  - EVENT_CPU: schedule out corresponding groups;
         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, ctx_event_type);
        else if (ctx_event_type & EVENT_PINNED)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

        /* Schedule everything back in, in priority order. */
        perf_event_sched_in(cpuctx, task_ctx, current);
        perf_pmu_enable(cpuctx->ctx.pmu);
}

/*
 * Reschedule all events (EVENT_ALL | EVENT_CPU) of @pmu's cpu context on
 * the current CPU, together with the task context installed there, under
 * perf_ctx_lock().
 */
void perf_pmu_resched(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;

        perf_ctx_lock(cpuctx, task_ctx);
        ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 *
 * @info is the struct perf_event to install. Returns 0 on success, or
 * -ESRCH when the event's task is currently running but is not the
 * current task here, so the event was not installed.
 */
static int  __perf_install_in_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;

        /* Lock order: cpu context first, then the task context (if any). */
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                task_ctx = ctx;

                reprogram = (ctx->task == current);

                /*
                 * If the task is running, it must be running on this CPU,
                 * otherwise we cannot reprogram things.
                 *
                 * If it's not running, we don't care, ctx->lock will
                 * serialize against it becoming runnable.
                 */
                if (task_curr(ctx->task) && !reprogram) {
                        ret = -ESRCH;
                        goto unlock;
                }

                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
        } else if (task_ctx) {
                raw_spin_lock(&task_ctx->lock);
        }

#ifdef CONFIG_CGROUP_PERF
        if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
                /*
                 * If the current cgroup doesn't match the event's
                 * cgroup, we should not try to schedule it.
                 */
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
                                        event->cgrp->css.cgroup);
        }
#endif

        /*
         * Either way the event is added to the context; only when
         * reprogramming is possible is the hardware rescheduled around it.
         */
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
                add_event_to_ctx(event, ctx);
        }

unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/* Forward declaration; used by perf_install_in_context() below. */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 *
 * The caller must hold ctx->mutex. @cpu replaces event->cpu unless the
 * event is not bound to a CPU (event->cpu == -1); for a context without
 * a task it is also the CPU the install is run on.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
{
        struct task_struct *task = READ_ONCE(ctx->task);

        lockdep_assert_held(&ctx->mutex);

        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

        if (event->cpu != -1)
                event->cpu = cpu;

        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * will be 'complete'. See perf_iterate_sb_cpu().
         */
        smp_store_release(&event->ctx, ctx);

        /*
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
         */
        if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
                raw_spin_lock_irq(&ctx->lock);
                /* The context is already dead: do not add the event. */
                if (ctx->task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }
                add_event_to_ctx(event, ctx);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /* No task: install on the given CPU. */
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
        }

        /*
         * Should not happen, we validate the ctx is still alive before calling.
         */
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
                return;

        /*
         * Installing events is tricky because we cannot rely on ctx->is_active
         * to be set in case this is the nr_events 0 -> 1 transition.
         *
         * Instead we use task_curr(), which tells us if the task is running.
         * However, since we use task_curr() outside of rq::lock, we can race
         * against the actual state. This means the result can be wrong.
         *
         * If we get a false positive, we retry, this is harmless.
         *
         * If we get a false negative, things are complicated. If we are after
         * perf_event_context_sched_in() ctx::lock will serialize us, and the
         * value must be correct. If we're before, it doesn't matter since
         * perf_event_context_sched_in() will program the counter.
         *
         * However, this hinges on the remote context switch having observed
         * our task->perf_event_ctxp[] store, such that it will in fact take
         * ctx::lock in perf_event_context_sched_in().
         *
         * We do this by task_function_call(), if the IPI fails to hit the task
         * we know any future context switch of task must see the
         * perf_event_ctxp[] store.
         */

        /*
         * This smp_mb() orders the task->perf_event_ctxp[] store with the
         * task_cpu() load, such that if the IPI then does not find the task
         * running, a future context switch of that task must observe the
         * store.
         */
        smp_mb();
again:
        /* A zero return means the install ran and succeeded: done. */
        if (!task_function_call(task, __perf_install_in_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        task = ctx->task;
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
                /*
                 * Cannot happen because we already checked above (which also
                 * cannot happen), and we hold ctx->mutex, which serializes us
                 * against perf_event_exit_task_context().
                 */
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        /*
         * If the task is not running, ctx->lock will avoid it becoming so,
         * thus we can safely install the event.
         */
        if (task_curr(task)) {
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        add_event_to_ctx(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event
 *
 * Moves the event to INACTIVE and, when the context is active and the
 * event can run here, reschedules so that it can be put on the PMU.
 * @info is unused.
 */
static void __perf_event_enable(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;

        /*
         * Only act on states strictly between ERROR and INACTIVE; anything
         * already INACTIVE or above, or at ERROR or below, is left alone.
         */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <= PERF_EVENT_STATE_ERROR)
                return;

        if (ctx->is_active)
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);

        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);

        /* Inactive context: the state change above is all there is to do. */
        if (!ctx->is_active)
                return;

        /* The event cannot run here: undo the EVENT_TIME sched-out and stop. */
        if (!event_filter_match(event)) {
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }

        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }

        task_ctx = cpuctx->task_ctx;
        if (ctx->task)
                WARN_ON_ONCE(task_ctx != ctx);

        ctx_resched(cpuctx, task_ctx, get_event_type(event));
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 *
 * Events that are already INACTIVE or above, or in a state below ERROR,
 * are left untouched.
 */
static void _perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <  PERF_EVENT_STATE_ERROR) {
                /* Shared unlock-and-return path; also a goto target below. */
out:
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /*
         * If the event is in error state, clear that first.
         *
         * That way, if we see the event in error state below, we know that it
         * has gone back into error state, as distinct from the task having
         * been scheduled away before the cross-call arrived.
         */
        if (event->state == PERF_EVENT_STATE_ERROR) {
                /*
                 * Detached SIBLING events cannot leave ERROR state.
                 */
                if (event->event_caps & PERF_EV_CAP_SIBLING &&
                    event->group_leader == event)
                        goto out;

                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_enable, NULL);
}

/*
 * Enable @event with its context locked; see perf_event_disable() for
 * the remark on the locking.
 */
void perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_enable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

/* Argument block passed from perf_event_stop() to __perf_event_stop(). */
struct stop_event_data {
        struct perf_event        *event;        /* the event to stop */
        unsigned int                restart;        /* non-zero: start it again after stopping */
};

/*
 * Stop (and optionally restart) an event on the CPU it is running on.
 * @info points to a struct stop_event_data.
 *
 * Returns 0 when the event was stopped or was not ACTIVE to begin with,
 * -EAGAIN when the event turned out to be on a different CPU.
 */
static int __perf_event_stop(void *info)
{
        struct stop_event_data *sd = info;
        struct perf_event *event = sd->event;

        /* Nothing to stop unless the event is ACTIVE. */
        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                return 0;

        /* matches smp_wmb() in event_sched_in() */
        smp_rmb();

        /*
         * There is a window with interrupts enabled before we get here,
         * so we need to check again lest we try to stop another CPU's event.
         */
        if (READ_ONCE(event->oncpu) != smp_processor_id())
                return -EAGAIN;

        event->pmu->stop(event, PERF_EF_UPDATE);

        /*
         * May race with the actual stop (through perf_pmu_output_stop()),
         * but it is only used for events with AUX ring buffer, and such
         * events will refuse to restart because of rb::aux_mmap_count==0,
         * see comments in perf_aux_output_begin().
         *
         * Since this is happening on an event-local CPU, no trace is lost
         * while restarting.
         */
        if (sd->restart)
                event->pmu->start(event, 0);

        return 0;
}

static int perf_event_stop(struct perf_event *event, int restart)
{
        struct stop_event_data sd = {
                .event                = event,
                .restart        = restart,
        };
        int ret = 0;

        do {
                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                        return 0;

                /* matches smp_wmb() in event_sched_in() */
                smp_rmb();

                /*
                 * We only want to restart ACTIVE events, so if the event goes
                 * inactive here (event->oncpu==-1), there's nothing more to do;
                 * fall through with ret==-ENXIO.
                 */
                ret = cpu_function_call(READ_ONCE(event->oncpu),
                                        __perf_event_stop, &sd);
        } while (ret == -EAGAIN);

        return ret;
}

/*
 * In order to contain the amount of racy and tricky code in the address
 * filter configuration management, it is a two-part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *        event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

        if (!has_addr_filter(event))
                return;

        raw_spin_lock(&ifh->lock);
        if (event->addr_filters_gen != event->hw.addr_filters_gen) {
                event->pmu->addr_filters_sync(event);
                event->hw.addr_filters_gen = event->addr_filters_gen;
        }
        raw_spin_unlock(&ifh->lock);
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

/*
 * Add @refresh to @event's event_limit and enable the event.
 *
 * Returns -EINVAL for inherited events and for events that are not sampling
 * events, 0 otherwise.
 */
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
        /* not supported on inherited events */
        if (event->attr.inherit)
                return -EINVAL;

        /* nor on events that do not sample */
        if (!is_sampling_event(event))
                return -EINVAL;

        atomic_add(refresh, &event->event_limit);
        _perf_event_enable(event);
        return 0;
}

/*
 * Locked wrapper around _perf_event_refresh(): the event's context is locked
 * with perf_event_ctx_lock() for the duration of the call and the helper's
 * return value is passed through.
 *
 * See perf_event_disable()
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
        struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
        int status = _perf_event_refresh(event, refresh);

        perf_event_ctx_unlock(event, locked_ctx);
        return status;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

/*
 * Apply @attr to the breakpoint event @bp. The event is disabled, handed to
 * modify_user_hw_breakpoint_check(), and enabled again unless
 * bp->attr.disabled is set afterwards.
 *
 * Returns the result of modify_user_hw_breakpoint_check().
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
                                         struct perf_event_attr *attr)
{
        int ret;

        _perf_event_disable(bp);
        ret = modify_user_hw_breakpoint_check(bp, attr, true);

        /* Re-enabling does not depend on @ret, only on the disabled flag. */
        if (bp->attr.disabled)
                return ret;

        _perf_event_enable(bp);
        return ret;
}

/*
 * Modify the attributes of @event according to @attr.
 *
 * Returns -EINVAL when the types of @event and @attr differ, -EOPNOTSUPP for
 * every event type other than PERF_TYPE_BREAKPOINT, and otherwise the result
 * of perf_event_modify_breakpoint().
 */
static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
{
        if (event->attr.type != attr->type)
                return -EINVAL;

        if (event->attr.type == PERF_TYPE_BREAKPOINT)
                return perf_event_modify_breakpoint(event, attr);

        /* Place holder for future additions. */
        return -EOPNOTSUPP;
}

/*
 * Schedule out the events of @ctx selected by @event_type.
 *
 * Updates context time when EVENT_TIME was set on entry, clears the
 * @event_type bits in ctx->is_active (dropping it to 0 once no EVENT_ALL bit
 * remains) and calls group_sched_out() for every entry of the pinned and/or
 * flexible active list whose bit actually changed.
 *
 * Must be called with ctx->lock held.
 */
static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
{
        struct perf_event *event, *tmp;
        int is_active = ctx->is_active;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events)) {
                /*
                 * See __perf_remove_from_context().
                 */
                WARN_ON_ONCE(ctx->is_active);
                if (ctx->task)
                        WARN_ON_ONCE(cpuctx->task_ctx);
                return;
        }

        /*
         * Always update time if it was set; not only when it changes.
         * Otherwise we can 'forget' to update time for any but the last
         * context we sched out. For example:
         *
         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
         *   ctx_sched_out(.event_type = EVENT_PINNED)
         *
         * would only update time for the pinned events.
         */
        if (is_active & EVENT_TIME) {
                /* update (and stop) ctx time */
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        /* Drop the requested bits; without any EVENT_ALL bit nothing stays active. */
        ctx->is_active &= ~event_type;
        if (!(ctx->is_active & EVENT_ALL))
                ctx->is_active = 0;

        /* A fully inactive task context no longer is this CPU's task_ctx. */
        if (ctx->task) {
                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                if (!ctx->is_active)
                        cpuctx->task_ctx = NULL;
        }

        is_active ^= ctx->is_active; /* changed bits */

        /* Nothing is running, or no event class changed state: done. */
        if (!ctx->nr_active || !(is_active & EVENT_ALL))
                return;

        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
                list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }

        if (is_active & EVENT_FLEXIBLE) {
                list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);

                /*
                 * Since we cleared EVENT_FLEXIBLE, also clear
                 * rotate_necessary, it will be reset by
                 * ctx_flexible_sched_in() when needed.
                 */
                ctx->rotate_necessary = 0;
        }
        perf_pmu_enable(ctx->pmu);
}

/*
 * Decide whether @ctx1 and @ctx2 are equivalent, i.e. whether both have been
 * cloned from the same version of the same context.
 *
 * A context carries a generation number that is incremented on each
 * modification to it (see unclone_ctx(), list_add_event() and
 * list_del_event()); equivalence is established by comparing it with the
 * generation recorded at clone time.
 *
 * Both context locks must be held. Returns 1 when equivalent, 0 otherwise.
 */
static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
{
        lockdep_assert_held(&ctx1->lock);
        lockdep_assert_held(&ctx2->lock);

        /* Pinning disables the swap optimization */
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;

        /* ctx2 was cloned from the current version of ctx1 */
        if (ctx2->parent_ctx == ctx1 && ctx2->parent_gen == ctx1->generation)
                return 1;

        /* ctx1 was cloned from the current version of ctx2 */
        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
                return 1;

        /*
         * Siblings: both share one parent at the same generation. The parent
         * hierarchy is flattened, see perf_event_init_context().
         */
        if (!ctx1->parent_ctx)
                return 0;
        if (ctx1->parent_ctx != ctx2->parent_ctx)
                return 0;

        return ctx1->parent_gen == ctx2->parent_gen ? 1 : 0;
}

/*
 * Exchange the statistics of @event and @next_event.
 *
 * Does nothing unless event->attr.inherit_stat is set. Otherwise the count,
 * total_time_enabled and total_time_running values of the two events are
 * swapped and both userpages are refreshed.
 */
static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
{
        u64 value;

        if (!event->attr.inherit_stat)
                return;

        /*
         * Update the event value, we cannot use perf_event_read()
         * because we're in the middle of a context switch and have IRQs
         * disabled, which upsets smp_call_function_single(), however
         * we know the event must be on the current CPU, therefore we
         * don't need to use it.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);

        perf_event_update_time(event);

        /*
         * In order to keep per-task stats reliable we need to flip the event
         * values when we flip the contexts.
         */
        /* Three-step swap: next's count goes into event, event's old count into next. */
        value = local64_read(&next_event->count);
        value = local64_xchg(&event->count, value);
        local64_set(&next_event->count, value);

        swap(event->total_time_enabled, next_event->total_time_enabled);
        swap(event->total_time_running, next_event->total_time_running);

        /*
         * Since we swizzled the values, update the user visible data too.
         */
        perf_event_update_userpage(event);
        perf_event_update_userpage(next_event);
}

/*
 * Walk the event lists of @ctx and @next_ctx in lockstep and hand each pair
 * of events to __perf_event_sync_stat(). The walk ends as soon as either list
 * is exhausted. Nothing is done when ctx->nr_stat is zero.
 */
static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
{
        struct perf_event *cur, *peer;

        if (!ctx->nr_stat)
                return;

        update_context_time(ctx);

        cur = list_first_entry(&ctx->event_list,
                               struct perf_event, event_entry);
        peer = list_first_entry(&next_ctx->event_list,
                                struct perf_event, event_entry);

        for (;
             &cur->event_entry != &ctx->event_list &&
             &peer->event_entry != &next_ctx->event_list;
             cur = list_next_entry(cur, event_entry),
             peer = list_next_entry(peer, event_entry))
                __perf_event_sync_stat(cur, peer);
}

/*
 * Schedule out task context number @ctxn of @task while switching to @next.
 *
 * If the contexts of @task and @next turn out to be clones of each other
 * (context_equiv(), checked with both context locks held), the two contexts
 * are exchanged between the tasks instead of scheduling the events out.
 * Otherwise the context is scheduled out with task_ctx_sched_out(EVENT_ALL).
 *
 * Returns early when @task has no such context or when the CPU context has
 * no task context installed.
 */
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                                         struct task_struct *next)
{
        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
        struct perf_cpu_context *cpuctx;
        int do_switch = 1;
        struct pmu *pmu;

        if (likely(!ctx))
                return;

        pmu = ctx->pmu;
        cpuctx = __get_cpu_context(ctx);
        if (!cpuctx->task_ctx)
                return;

        rcu_read_lock();
        next_ctx = next->perf_event_ctxp[ctxn];
        if (!next_ctx)
                goto unlock;

        parent = rcu_dereference(ctx->parent_ctx);
        next_parent = rcu_dereference(next_ctx->parent_ctx);

        /* If neither context have a parent context; they cannot be clones. */
        if (!parent && !next_parent)
                goto unlock;

        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime).  It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {

                        /* Exchange the owners of the two contexts. */
                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);

                        perf_pmu_disable(pmu);

                        if (cpuctx->sched_cb_usage && pmu->sched_task)
                                pmu->sched_task(ctx, false);

                        /*
                         * PMU specific parts of task perf context can require
                         * additional synchronization. As an example of such
                         * synchronization see implementation details of Intel
                         * LBR call stack data profiling;
                         */
                        if (pmu->swap_task_ctx)
                                pmu->swap_task_ctx(ctx, next_ctx);
                        else
                                swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

                        perf_pmu_enable(pmu);

                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * modified the ctx and the above modification of
                         * ctx->task and ctx->task_ctx_data are immaterial
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
                        RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
                        RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);

                        /* The swap replaces the regular sched-out below. */
                        do_switch = 0;

                        perf_event_sync_stat(ctx, next_ctx);
                }
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
unlock:
        rcu_read_unlock();

        /* Slow path: the contexts were not swapped, schedule everything out. */
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
                perf_pmu_disable(pmu);

                if (cpuctx->sched_cb_usage && pmu->sched_task)
                        pmu->sched_task(ctx, false);
                task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);

                perf_pmu_enable(pmu);
                raw_spin_unlock(&ctx->lock);
        }
}

/*
 * Per-CPU list of the CPU contexts that have at least one sched_task callback
 * user. Entries are linked through perf_cpu_context::sched_cb_entry by
 * perf_sched_cb_inc() and removed by perf_sched_cb_dec(); the list is walked
 * by perf_pmu_sched_task().
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);

/*
 * Drop one sched_task callback user of @pmu on this CPU: decrements the
 * per-CPU perf_sched_cb_usages counter and the CPU context's sched_cb_usage.
 * Once the latter reaches zero the CPU context's sched_cb_entry is unlinked.
 */
void perf_sched_cb_dec(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

        this_cpu_dec(perf_sched_cb_usages);

        cpuctx->sched_cb_usage--;
        if (cpuctx->sched_cb_usage == 0)
                list_del(&cpuctx->sched_cb_entry);
}


void perf_sched_cb_inc(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

        if (!cpuctx->sched_cb_usage++)
                list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));

        this_cpu_inc(perf_sched_cb_usages);
}

/*
 * This function provides the context switch callback to the lower code
 * layer. It is invoked ONLY when the context switch callback is enabled.
 *
 * This callback is relevant even to per-cpu events; for example multi event
 * PEBS requires this to provide PID/TID information. This requires we flush
 * all queued PEBS records before we context switch to a new task.
 *
 * @cpuctx:   CPU context whose PMU gets the callback
 * @sched_in: passed through to pmu->sched_task()
 *
 * The callback runs under perf_ctx_lock() with the PMU disabled and receives
 * cpuctx->task_ctx as its context argument. A PMU without a sched_task
 * method triggers a one-time warning and is skipped.
 */
static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
{
        struct pmu *pmu;

        pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */

        if (WARN_ON_ONCE(!pmu->sched_task))
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        pmu->sched_task(cpuctx->task_ctx, sched_in);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Invoke the sched_task callback for every CPU context on this CPU's
 * sched_cb_list that has no task context installed. Nothing happens when
 * @prev and @next are the same task. @sched_in is passed on unchanged.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
                                bool sched_in)
{
        struct perf_cpu_context *cpuctx;

        if (prev == next)
                return;

        list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
                /*
                 * Contexts with a task context installed will be handled in
                 * perf_event_context_sched_in/out.
                 */
                if (!cpuctx->task_ctx)
                        __perf_pmu_sched_task(cpuctx, sched_in);
        }
}

/*
 * Forward declaration: called from __perf_event_task_sched_out() and
 * __perf_event_task_sched_in() below; the definition lives elsewhere in this
 * file.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);

/* Iterate @ctxn over the task context indices 0 .. perf_nr_task_contexts - 1. */
#define for_each_task_context_nr(ctxn)                                        \
        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: the task being switched out
 * @next: the task being switched to
 */
void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
{
        int ctxn;

        /* PMU sched_task callbacks, only if any are registered on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);

        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);

        for_each_task_context_nr(ctxn)
                perf_event_context_sched_out(task, ctxn, next);

        /*
         * if cgroup events exist on this CPU, then we need
         * to check if we have to switch out PMU state.
         * cgroup events are system-wide mode only
         */
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_out(task, next);
}

/*
 * Schedule out the @event_type events of the CPU context's own event
 * context; a thin wrapper around ctx_sched_out().
 *
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type)
{
        struct perf_event_context *ctx = &cpuctx->ctx;

        ctx_sched_out(ctx, cpuctx, event_type);
}

/*
 * min_heap 'less' callback: @l and @r each point at a heap slot holding a
 * struct perf_event pointer; the event with the smaller group_index sorts
 * first.
 */
static bool perf_less_group_idx(const void *l, const void *r)
{
        const struct perf_event *const *lhs = l;
        const struct perf_event *const *rhs = r;

        return (*lhs)->group_index < (*rhs)->group_index;
}

/*
 * min_heap 'swp' callback: exchange the pointers stored in the two heap
 * slots that @l and @r refer to.
 */
static void swap_ptr(void *l, void *r)
{
        void **left = l;
        void **right = r;

        swap(*left, *right);
}

/*
 * Callbacks for the min-heap of event iterators used by visit_groups_merge():
 * each element is a struct perf_event pointer, ordered by group_index.
 */
static const struct min_heap_callbacks perf_min_heap = {
        .elem_size = sizeof(struct perf_event *),
        .less = perf_less_group_idx,
        .swp = swap_ptr,
};

/*
 * Append @event to the backing array of @heap and bump its element count.
 * A NULL @event is ignored. No bounds check and no heapify is done here.
 */
static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
        struct perf_event **slots = heap->data;

        if (!event)
                return;

        slots[heap->nr++] = event;
}

/*
 * Call @func(event, @data) on the events of @groups relevant to @cpu.
 *
 * The function seeds a min-heap with one iterator per event sub-list:
 *   - without @cpuctx: the first event for cpu -1, stored in an on-stack array;
 *   - always: the first event for @cpu;
 *   - with CONFIG_CGROUP_PERF and a @cpuctx that has a cgroup: the first
 *     event for @cpu of that cgroup and of each of its ancestors.
 * The heap is ordered by group_index (see perf_min_heap). The top iterator
 * is visited, advanced with perf_event_groups_next() and re-sifted, or
 * popped once it is exhausted.
 *
 * When @cpuctx is given its preallocated heap storage is used and
 * cpuctx->ctx.lock must be held.
 *
 * Returns the first non-zero value returned by @func, or 0 after all events
 * have been visited.
 */
static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
                                struct perf_event_groups *groups, int cpu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
{
#ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
#endif
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;

        if (cpuctx) {
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = cpuctx->heap_size,
                };

                lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
                if (cpuctx->cgrp)
                        css = &cpuctx->cgrp->css;
#endif
        } else {
                event_heap = (struct min_heap){
                        .data = itrs,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
                __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
        }
        /* evt aliases heap slot 0, i.e. the current minimum after heapify. */
        evt = event_heap.data;

        __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));

#ifdef CONFIG_CGROUP_PERF
        for (; css; css = css->parent)
                __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
#endif

        min_heapify_all(&event_heap, &perf_min_heap);

        while (event_heap.nr) {
                ret = func(*evt, data);
                if (ret)
                        return ret;

                /* Advance the top iterator; restore heap order or drop it. */
                *evt = perf_event_groups_next(*evt);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
                        min_heap_pop(&event_heap, &perf_min_heap);
        }

        return 0;
}

/*
 * The userpage is strictly per-event (there is no concept of context, hence
 * no context indirection is possible), so every userpage has to be updated
 * when context time starts :-(
 *
 * IOW, we must not miss EVENT_TIME edges.
 *
 * Returns false without touching anything when @event has no mmap()s
 * (mmap_count is zero); otherwise updates the event time and the userpage
 * and returns true.
 */
static inline bool event_update_userpage(struct perf_event *event)
{
        bool mapped = atomic_read(&event->mmap_count) != 0;

        if (likely(!mapped))
                return false;

        perf_event_update_time(event);
        perf_event_update_userpage(event);
        return true;
}

/*
 * Update the userpage of @group_event and, only if that update actually
 * happened, the userpages of all of its siblings as well.
 */
static inline void group_update_userpage(struct perf_event *group_event)
{
        struct perf_event *sibling;

        if (event_update_userpage(group_event)) {
                for_each_sibling_event(sibling, group_event)
                        event_update_userpage(sibling);
        }
}

/*
 * visit_groups_merge() callback that tries to schedule in the group led by
 * @event. @data points at an int that says whether hardware events may still
 * be added; it is cleared once a group fails to get on.
 *
 * A group that stays INACTIVE is either put into PERF_EVENT_STATE_ERROR
 * (pinned) or flagged for rotation (flexible). Always returns 0, so the
 * iteration is never cut short.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int *hw_allowed = data;

        /* Skip events that are off (or worse) or filtered out. */
        if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
                return 0;

        if (group_can_go_on(event, cpuctx, *hw_allowed) &&
            !group_sched_in(event, cpuctx, ctx))
                list_add_tail(&event->active_list, get_event_list(event));

        /* Anything but INACTIVE needs no further treatment. */
        if (event->state != PERF_EVENT_STATE_INACTIVE)
                return 0;

        *hw_allowed = 0;

        if (!event->attr.pinned) {
                ctx->rotate_necessary = 1;
                perf_mux_hrtimer_restart(cpuctx);
                group_update_userpage(event);
                return 0;
        }

        perf_cgroup_event_disable(event, ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        return 0;
}

/*
 * Schedule in the pinned groups of @ctx on the current CPU through
 * merge_sched_in(). The CPU context is only handed to visit_groups_merge()
 * when @ctx is that CPU context's own event context; otherwise NULL is
 * passed.
 */
static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
                    struct perf_cpu_context *cpuctx)
{
        struct perf_cpu_context *merge_cpuctx = NULL;
        int can_add_hw = 1;

        if (ctx == &cpuctx->ctx)
                merge_cpuctx = cpuctx;

        visit_groups_merge(merge_cpuctx, &ctx->pinned_groups,
                           smp_processor_id(),
                           merge_sched_in, &can_add_hw);
}

/*
 * Schedule in the flexible groups of @ctx on the current CPU through
 * merge_sched_in(). The CPU context is only handed to visit_groups_merge()
 * when @ctx is that CPU context's own event context; otherwise NULL is
 * passed.
 */
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
{
        struct perf_cpu_context *merge_cpuctx = NULL;
        int can_add_hw = 1;

        if (ctx == &cpuctx->ctx)
                merge_cpuctx = cpuctx;

        visit_groups_merge(merge_cpuctx, &ctx->flexible_groups,
                           smp_processor_id(),
                           merge_sched_in, &can_add_hw);
}

/*
 * Schedule in the events of @ctx selected by @event_type.
 *
 * Starts context time when EVENT_TIME was not yet set, marks @event_type and
 * EVENT_TIME active in ctx->is_active, installs @ctx as the CPU's task
 * context when it is a task context that was fully inactive, and then
 * schedules in the pinned groups before the flexible ones - but only for the
 * classes whose bit was newly set.
 *
 * @task is passed to perf_cgroup_set_timestamp(). Must be called with
 * ctx->lock held.
 */
static void
ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task)
{
        int is_active = ctx->is_active;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events))
                return;

        if (!(is_active & EVENT_TIME)) {
                /* start ctx time */
                __update_context_time(ctx, false);
                perf_cgroup_set_timestamp(task, ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        ctx->is_active |= (event_type | EVENT_TIME);
        if (ctx->task) {
                if (!is_active)
                        cpuctx->task_ctx = ctx;
                else
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
        }

        is_active ^= ctx->is_active; /* changed bits */

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
                ctx_pinned_sched_in(ctx, cpuctx);

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
                ctx_flexible_sched_in(ctx, cpuctx);
}

/*
 * Schedule in the @event_type events of the CPU context's own event context;
 * a thin wrapper around ctx_sched_in().
 */
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task)
{
        ctx_sched_in(&cpuctx->ctx, cpuctx, event_type, task);
}

/*
 * Schedule in the task context @ctx for @task on the current CPU.
 *
 * If @ctx already is the CPU's task context only the PMU sched_task callback
 * is run (when it has users). Otherwise, under perf_ctx_lock() and with the
 * PMU disabled, the events are scheduled in via perf_event_sched_in(); the
 * CPU's flexible events are scheduled out first when @ctx carries pinned
 * groups.
 */
static void perf_event_context_sched_in(struct perf_event_context *ctx,
                                        struct task_struct *task)
{
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu = ctx->pmu;

        cpuctx = __get_cpu_context(ctx);
        if (cpuctx->task_ctx == ctx) {
                if (cpuctx->sched_cb_usage)
                        __perf_pmu_sched_task(cpuctx, true);
                return;
        }

        perf_ctx_lock(cpuctx, ctx);
        /*
         * We must check ctx->nr_events while holding ctx->lock, such
         * that we serialize against perf_install_in_context().
         */
        if (!ctx->nr_events)
                goto unlock;

        perf_pmu_disable(pmu);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
         *
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);

        if (cpuctx->sched_cb_usage && pmu->sched_task)
                pmu->sched_task(cpuctx->task_ctx, true);

        perf_pmu_enable(pmu);

unlock:
        perf_ctx_unlock(cpuctx, ctx);
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 *
 * @prev: the task that was switched out
 * @task: the task being switched in
 */
void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
{
        struct perf_event_context *ctx;
        int ctxn;

        /*
         * If cgroup events exist on this CPU, then we need to check if we have
         * to switch in PMU state; cgroup events are system-wide mode only.
         *
         * Since cgroup events are CPU events, we must schedule these in before
         * we schedule in the task events.
         */
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);

        /* Schedule in every task context that @task actually has. */
        for_each_task_context_nr(ctxn) {
                ctx = task->perf_event_ctxp[ctxn];
                if (likely(!ctx))
                        continue;

                perf_event_context_sched_in(ctx, task);
        }

        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);

        /* PMU sched_task callbacks, only if any are registered on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(prev, task, true);
}

/*
 * Compute the sample period that matches @event's attr.sample_freq, given
 * that @count events were observed within @nsec nanoseconds.
 *
 * Both products of the fraction below may exceed 64 bits, so precision is
 * shaved off bit by bit (tracked through the *_fls bit-length variables)
 * until a plain u64/u64 division is possible.
 *
 * Returns dividend / divisor, or the dividend itself when the divisor ends
 * up being zero.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
        u64 frequency = event->attr.sample_freq;
        u64 sec = NSEC_PER_SEC;
        u64 divisor, dividend;

        int count_fls, nsec_fls, frequency_fls, sec_fls;

        count_fls = fls64(count);
        nsec_fls = fls64(nsec);
        frequency_fls = fls64(frequency);
        /* Hard-coded bit length of sec; presumably fls64(NSEC_PER_SEC), as 10^9 < 2^30. */
        sec_fls = 30;

        /*
         * We got @count in @nsec, with a target of sample_freq HZ
         * the target period becomes:
         *
         *             @count * 10^9
         * period = -------------------
         *          @nsec * sample_freq
         *
         */

        /*
         * Reduce accuracy by one bit such that @a and @b converge
         * to a similar magnitude.
         */
#define REDUCE_FLS(a, b)                \
do {                                        \
        if (a##_fls > b##_fls) {        \
                a >>= 1;                \
                a##_fls--;                \
        } else {                        \
                b >>= 1;                \
                b##_fls--;                \
        }                                \
} while (0)

        /*
         * Reduce accuracy until either term fits in a u64, then proceed with
         * the other, so that finally we can do a u64/u64 division.
         */
        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
                REDUCE_FLS(nsec, frequency);
                REDUCE_FLS(sec, count);
        }

        if (count_fls + sec_fls > 64) {
                /* The divisor fits already; keep shrinking the dividend terms. */
                divisor = nsec * frequency;

                while (count_fls + sec_fls > 64) {
                        REDUCE_FLS(count, sec);
                        divisor >>= 1;
                }

                dividend = count * sec;
        } else {
                /* The dividend fits already; keep shrinking the divisor terms. */
                dividend = count * sec;

                while (nsec_fls + frequency_fls > 64) {
                        REDUCE_FLS(nsec, frequency);
                        dividend >>= 1;
                }

                divisor = nsec * frequency;
        }

        /* Avoid dividing by zero. */
        if (!divisor)
                return dividend;

        return div64_u64(dividend, divisor);
}

/*
 * Per-CPU throttling state.
 * NOTE(review): the users of these two variables lie outside this part of
 * the file - presumably a count of throttled events and a sequence stamp;
 * confirm against the throttling/tick code.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);

/*
 * Move @event's sample period one eighth of the way (rounded away from zero)
 * towards the period perf_calculate_period() derives from @count events in
 * @nsec nanoseconds. The resulting period is never allowed to be zero.
 *
 * If the remaining period_left exceeds eight times the new period it is
 * reset to zero. In that case the event is stopped beforehand and restarted
 * with PERF_EF_RELOAD afterwards when @disable is true.
 */
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 target, new_period;
        s64 step;

        target = perf_calculate_period(event, nsec, count);

        /* low pass filter: take 1/8 of the difference, rounding away from 0 */
        step = (s64)(target - hwc->sample_period);
        step += (step >= 0) ? 7 : -7;
        step /= 8;

        new_period = hwc->sample_period + step;
        if (!new_period)
                new_period = 1;

        hwc->sample_period = new_period;

        if (local64_read(&hwc->period_left) <= 8*new_period)
                return;

        if (disable)
                event->pmu->stop(event, PERF_EF_UPDATE);

        local64_set(&hwc->period_left, 0);

        if (disable)
                event->pmu->start(event, PERF_EF_RELOAD);
}

/*
 * combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure, having freq events does not change
 * the rate of unthrottling as that would introduce bias.
 *
 * @ctx:         context whose ACTIVE events are examined
 * @needs_unthr: non-zero when there are events to unthrottle on this CPU
 *
 * Runs with ctx->lock held and the context's PMU disabled. Every period
 * adjustment assumes that TICK_NSEC nanoseconds have passed since the
 * previous one.
 */
static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                                           int needs_unthr)
{
        struct perf_event *event;
        struct hw_perf_event *hwc;
        u64 now, period = TICK_NSEC;
        s64 delta;

        /*
         * only need to iterate over all events iff:
         * - context have events in frequency mode (needs freq adjust)
         * - there are events to unthrottle on this cpu
         */
        if (!(ctx->nr_freq || needs_unthr))
                return;

        raw_spin_lock(&ctx->lock);
        perf_pmu_disable(ctx->pmu);

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;

                if (!event_filter_match(event))
                        continue;

                perf_pmu_disable(event->pmu);

                hwc = &event->hw;

                /* Unthrottle: clear the interrupt count, log it, restart. */
                if (hwc->interrupts == MAX_INTERRUPTS) {
                        hwc->interrupts = 0;
                        perf_log_throttle(event, 1);
                        event->pmu->start(event, 0);
                }

                /* Only events in frequency mode get their period adjusted. */
                if (!event->attr.freq || !event->attr.sample_freq)
                        goto next;

                /*
                 * stop the event and update event->count
                 */
                event->pmu->stop(event, PERF_EF_UPDATE);

                /* Number of events counted since the previous adjustment. */
                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
                hwc->freq_count_stamp = now;

                /*
                 * restart the event
                 * reload only if value has changed
                 * we have stopped the event so tell that
                 * to perf_adjust_period() to avoid stopping it
                 * twice.
                 */
                if (delta > 0)
                        perf_adjust_period(event, period, delta, false);

                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
        next:
                perf_pmu_enable(event->pmu);
        }

        perf_pmu_enable(ctx->pmu);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Move @event to the tail of the @ctx's eligible events by removing it from
 * the flexible groups and inserting it again.
 */
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
        struct perf_event_groups *flexible = &ctx->flexible_groups;

        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
        if (ctx->rotate_disable)
                return;

        perf_event_groups_delete(flexible, event);
        perf_event_groups_insert(flexible, event);
}

/*
 * Pick an event from the flexible_groups of @ctx to rotate: the first active
 * flexible event if there is one, otherwise the first event in the flexible
 * groups tree (which may be NULL). ctx->rotate_necessary is cleared as a
 * side effect.
 */
static inline struct perf_event *
ctx_event_to_rotate(struct perf_event_context *ctx)
{
        struct perf_event *candidate;

        candidate = list_first_entry_or_null(&ctx->flexible_active,
                                             struct perf_event, active_list);

        /* No active flexible event: fall back to the head of the tree. */
        if (!candidate)
                candidate = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
                                          struct perf_event, group_node);

        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
        ctx->rotate_necessary = 0;

        return candidate;
}

/*
 * Rotate the flexible events of @cpuctx's CPU context and of the task
 * context currently attached to it (cpuctx->task_ctx), if any.
 *
 * Returns false, without taking any lock, when neither context has
 * rotate_necessary set; returns true after a rotation pass was done.
 */
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        struct perf_event_context *task_ctx = NULL;
        int cpu_rotate, task_rotate;

        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */

        /* Sample the rotation flags before deciding whether to lock at all. */
        cpu_rotate = cpuctx->ctx.rotate_necessary;
        task_ctx = cpuctx->task_ctx;
        task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;

        if (!(cpu_rotate || task_rotate))
                return false;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);

        /* Picking an event also clears the context's rotate_necessary flag. */
        if (task_rotate)
                task_event = ctx_event_to_rotate(task_ctx);
        if (cpu_rotate)
                cpu_event = ctx_event_to_rotate(&cpuctx->ctx);

        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
        if (task_event || (task_ctx && cpu_event))
                ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

        /* With the flexible events scheduled out, move the picks to the tail. */
        if (task_event)
                rotate_ctx(task_ctx, task_event);
        if (cpu_event)
                rotate_ctx(&cpuctx->ctx, cpu_event);

        /* Schedule everything back in using the rotated order. */
        perf_event_sched_in(cpuctx, task_ctx, current);

        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

        return true;
}

void perf_event_task_tick(void)
{
        struct list_head *head = this_cpu_ptr(&active_ctx_list);
        struct perf_event_context *ctx, *tmp;
        int throttled;

        lockdep_assert_irqs_disabled();

        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);

        list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, throttled);
}

/*
 * Handle the enable_on_exec attribute of a single @event.
 *
 * The attribute is consumed (cleared) whenever it was set. Returns 1 only
 * if the event's state was below PERF_EVENT_STATE_INACTIVE and has been
 * moved to INACTIVE by this call; returns 0 otherwise. @ctx is unused.
 */
static int event_enable_on_exec(struct perf_event *event,
                                struct perf_event_context *ctx)
{
        int newly_enabled = 0;

        if (event->attr.enable_on_exec) {
                /* one-shot: the request is dropped once seen */
                event->attr.enable_on_exec = 0;

                if (event->state < PERF_EVENT_STATE_INACTIVE) {
                        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                        newly_enabled = 1;
                }
        }

        return newly_enabled;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 *
 * @ctxn indexes current->perf_event_ctxp[]. Nothing is done when that
 * slot is empty or the context holds no events.
 */
static void perf_event_enable_on_exec(int ctxn)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;

        local_irq_save(flags);
        ctx = current->perf_event_ctxp[ctxn];
        if (!ctx || !ctx->nr_events)
                goto out;

        cpuctx = __get_cpu_context(ctx);
        perf_ctx_lock(cpuctx, ctx);
        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        /*
         * Walk every event: remember whether any got enabled and collect
         * the union of their event types for the reschedule below.
         */
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
        }

        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
                /* Nothing changed: undo the EVENT_TIME sched-out from above. */
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
        }
        perf_ctx_unlock(cpuctx, ctx);

out:
        local_irq_restore(flags);

        /* Drop the reference handed back by unclone_ctx() after irqs are restored. */
        if (clone_ctx)
                put_ctx(clone_ctx);
}

/*
 * Argument block handed to __perf_event_read() through
 * smp_call_function_single().
 */
struct perf_read_data {
        /* the event to read */
        struct perf_event *event;
        /* when true, sibling events of @event are read as well */
        bool group;
        /* result: 0 for a single read, pmu->commit_txn() result for a group read */
        int ret;
};

/*
 * Select the CPU on which @event should be read.
 *
 * Normally this is @event_cpu. If the event's group advertises
 * PERF_EV_CAP_READ_ACTIVE_PKG and the local CPU is in the same physical
 * package as @event_cpu, the local CPU is returned instead.
 */
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
        u16 pkg_of_event, pkg_of_local;
        int this_cpu;

        if (!(event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG))
                return event_cpu;

        this_cpu = smp_processor_id();

        pkg_of_event = topology_physical_package_id(event_cpu);
        pkg_of_local = topology_physical_package_id(this_cpu);

        return (pkg_of_event == pkg_of_local) ? this_cpu : event_cpu;
}

/*
 * Cross CPU call to read the hardware event
 *
 * @info points to a struct perf_read_data. data->ret is written only when
 * the event is ACTIVE: 0 for a single read, or the pmu->commit_txn()
 * result for a group read. On the early-return paths it is left untouched.
 */
static void __perf_event_read(void *info)
{
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct pmu *pmu = event->pmu;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu.  If not it has been
         * scheduled out before the smp call arrived.  In that case
         * event->count would have been updated to a recent sample
         * when the event was scheduled out.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        raw_spin_lock(&ctx->lock);
        /* Context/cgroup time is only refreshed while the context is timing. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_event_update_time(event);
        if (data->group)
                perf_event_update_sibling_time(event);

        /* Only an ACTIVE event is read from the PMU. */
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;

        /* Single-event read: no transaction needed. */
        if (!data->group) {
                pmu->read(event);
                data->ret = 0;
                goto unlock;
        }

        /* Group read: wrap the leader and sibling reads in a READ transaction. */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(event);

        for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
                         * sibling could be on different (eg: software) PMU.
                         */
                        sub->pmu->read(sub);
                }
        }

        data->ret = pmu->commit_txn(pmu);

unlock:
        raw_spin_unlock(&ctx->lock);
}

/*
 * Return the sum of @event's own count and its accumulated child_count.
 */
static inline u64 perf_event_count(struct perf_event *event)
{
        u64 own = local64_read(&event->count);
        u64 from_children = atomic64_read(&event->child_count);

        return own + from_children;
}

/*
 * Take a perf_clock() timestamp, store it in *@now, and use it to compute
 * @event's enabled/running times via __perf_update_times().
 */
static void calc_timer_values(struct perf_event *event,
                                u64 *now,
                                u64 *enabled,
                                u64 *running)
{
        u64 stamp = perf_clock();

        *now = stamp;
        __perf_update_times(event, perf_event_time_now(event, stamp),
                            enabled, running);
}

/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 *
 * @value receives the event count; @enabled and @running are optional
 * (may be NULL) and receive the enabled/running times when given.
 *
 * Returns 0 on success, or:
 *   -EOPNOTSUPP  the event has attr.inherit set
 *   -EINVAL      a per-task event not targeting current, or a per-CPU
 *                event bound to a different CPU
 *   -EBUSY       a pinned event that is not running on this CPU
 * On error none of the output parameters are written.
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running)
{
        unsigned long flags;
        int ret = 0;

        /*
         * Disabling interrupts avoids all counter scheduling (context
         * switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /*
         * It must not be an event with inherit set, we cannot read
         * all child counters from atomic context.
         */
        if (event->attr.inherit) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a per-CPU event, it must be for this CPU */
        if (!(event->attach_state & PERF_ATTACH_TASK) &&
            event->cpu != smp_processor_id()) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a pinned event it must be running on this CPU */
        if (event->attr.pinned && event->oncpu != smp_processor_id()) {
                ret = -EBUSY;
                goto out;
        }

        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
         * oncpu == -1).
         */
        if (event->oncpu == smp_processor_id())
                event->pmu->read(event);

        *value = local64_read(&event->count);
        if (enabled || running) {
                u64 __enabled, __running, __now;

                /* Times are computed once and copied to whichever outputs were requested. */
                calc_timer_values(event, &__now, &__enabled, &__running);
                if (enabled)
                        *enabled = __enabled;
                if (running)
                        *running = __running;
        }
out:
        local_irq_restore(flags);

        return ret;
}

/*
 * Bring @event's count and times up to date; with @group set the sibling
 * events are covered as well.
 *
 * An ACTIVE event is read via a cross-CPU call to __perf_event_read(); an
 * INACTIVE event only has its times updated under ctx->lock. Any other
 * state is left alone.
 *
 * Returns the ret value reported by __perf_event_read() for the ACTIVE
 * case, 0 otherwise.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
        enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;

        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
again:
        if (state == PERF_EVENT_STATE_ACTIVE) {
                struct perf_read_data data;

                /*
                 * Orders the ->state and ->oncpu loads such that if we see
                 * ACTIVE we must also see the right ->oncpu.
                 *
                 * Matches the smp_wmb() from event_sched_in().
                 */
                smp_rmb();

                event_cpu = READ_ONCE(event->oncpu);
                /* The unsigned cast makes a negative oncpu fail this range check too. */
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;

                data = (struct perf_read_data){
                        .event = event,
                        .group = group,
                        .ret = 0,
                };

                /* Preemption stays off from choosing the CPU until the call returns. */
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);

                /*
                 * Purposely ignore the smp_call_function_single() return
                 * value.
                 *
                 * If event_cpu isn't a valid CPU it means the event got
                 * scheduled out and that will have updated the event count.
                 *
                 * Therefore, either way, we'll have an up-to-date event count
                 * after this.
                 */
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;

        } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;

                raw_spin_lock_irqsave(&ctx->lock, flags);
                /* Re-check under the lock; if the state changed, start over with it. */
                state = event->state;
                if (state != PERF_EVENT_STATE_INACTIVE) {
                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
                        goto again;
                }

                /*
                 * May read while context is not active (e.g., thread is
                 * blocked), in that case we cannot update context time
                 */
                if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }

                perf_event_update_time(event);
                if (group)
                        perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }

        return ret;
}

/*
 * Initialize the fields of a perf_event_context: its lock and mutex, the
 * pinned/flexible group containers, its list heads, and a reference count
 * of one.
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
        /* synchronization */
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);

        /* event group containers */
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);

        /* list heads */
        INIT_LIST_HEAD(&ctx->active_ctx_list);
        INIT_LIST_HEAD(&ctx->event_list);
        INIT_LIST_HEAD(&ctx->pinned_active);
        INIT_LIST_HEAD(&ctx->flexible_active);

        /* the initializer holds the first reference */
        refcount_set(&ctx->refcount, 1);
}

/*
 * Allocate (zeroed, GFP_KERNEL) and initialize a perf_event_context for
 * @pmu. When @task is non-NULL a task reference is taken and stored in
 * ctx->task. Returns NULL if the allocation fails.
 */
static struct perf_event_context *
alloc_perf_context(struct pmu *pmu, struct task_struct *task)
{
        struct perf_event_context *new_ctx;

        new_ctx = kzalloc(sizeof(*new_ctx), GFP_KERNEL);
        if (new_ctx) {
                __perf_event_init_context(new_ctx);
                if (task)
                        new_ctx->task = get_task_struct(task);
                new_ctx->pmu = pmu;
        }

        return new_ctx;
}

/*
 * Look up the task for @vpid (0 means current) and take a reference on
 * it. Returns the task, or ERR_PTR(-ESRCH) when no such task exists.
 */
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
        struct task_struct *found;

        /* The lookup and the reference grab both happen inside the RCU section. */
        rcu_read_lock();
        found = vpid ? find_task_by_vpid(vpid) : current;
        if (found)
                get_task_struct(found);
        rcu_read_unlock();

        if (found)
                return found;

        return ERR_PTR(-ESRCH);
}

/*
 * Returns a matching context with refcount and pincount.
 *
 * With @task == NULL the per-CPU context of @pmu for event->cpu is used,
 * after perf_allow_cpu() has approved the request. Otherwise the task's
 * context for pmu->task_ctx_nr is looked up, or allocated and installed
 * if the task has none yet.
 *
 * Returns the context, or an ERR_PTR(): the perf_allow_cpu() error,
 * -EINVAL when the pmu has no task context number, -ENOMEM on allocation
 * failure, or -ESRCH when the task is exiting.
 */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task,
                struct perf_event *event)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
        void *task_ctx_data = NULL;
        unsigned long flags;
        int ctxn, err;
        int cpu = event->cpu;

        if (!task) {
                /* Must be root to operate on a CPU event: */
                err = perf_allow_cpu(&event->attr);
                if (err)
                        return ERR_PTR(err);

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                /* pin_count is modified under ctx->lock. */
                raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                return ctx;
        }

        err = -EINVAL;
        ctxn = pmu->task_ctx_nr;
        if (ctxn < 0)
                goto errout;

        /* Pre-allocate the per-task PMU data; whoever consumes it NULLs the local. */
        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                task_ctx_data = alloc_task_ctx_data(pmu);
                if (!task_ctx_data) {
                        err = -ENOMEM;
                        goto errout;
                }
        }

retry:
        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                /* Existing context: unclone it and pin it while ctx->lock is held. */
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;

                if (task_ctx_data && !ctx->task_ctx_data) {
                        ctx->task_ctx_data = task_ctx_data;
                        task_ctx_data = NULL;
                }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
                /* No context yet: allocate one and try to install it on the task. */
                ctx = alloc_perf_context(pmu, task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;

                if (task_ctx_data) {
                        ctx->task_ctx_data = task_ctx_data;
                        task_ctx_data = NULL;
                }

                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 * If it has already passed perf_event_exit_task().
                 * we must see PF_EXITING, it takes this mutex too.
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
                else if (task->perf_event_ctxp[ctxn])
                        err = -EAGAIN;
                else {
                        /* Extra reference on top of the one set at init time. */
                        get_ctx(ctx);
                        ++ctx->pin_count;
                        rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
                }
                mutex_unlock(&task->perf_event_mutex);

                if (unlikely(err)) {
                        /* Not installed: drop the allocation's reference. */
                        put_ctx(ctx);

                        /* -EAGAIN: the task already had a context in this slot; use that one. */
                        if (err == -EAGAIN)
                                goto retry;
                        goto errout;
                }
        }

        /* task_ctx_data is NULL here if it was handed to a context above. */
        free_task_ctx_data(pmu, task_ctx_data);
        return ctx;

errout:
        free_task_ctx_data(pmu, task_ctx_data);
        return ERR_PTR(err);
}

static void perf_event_free_filter(struct perf_event *event);
static void perf_event_free_bpf_prog(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
{
        struct perf_event *event;

        event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
        kfree(event);
}

static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb);

static void detach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&pel->lock);
        list_del_rcu(&event->sb_list);
        raw_spin_unlock(&pel->lock);
}

/*
 * Tell whether @event is tracked on the per-CPU pmu_sb_events lists.
 *
 * That is the case for an event that has no parent, is not attached to a
 * task, and requests at least one of the mmap/mmap_data/mmap2, comm,
 * comm_exec, task, ksymbol, context_switch, text_poke or bpf_event records.
 */
static bool is_sb_event(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        if (event->parent || (event->attach_state & PERF_ATTACH_TASK))
                return false;

        return attr->mmap || attr->mmap_data || attr->mmap2 ||
               attr->comm || attr->comm_exec ||
               attr->task || attr->ksymbol ||
               attr->context_switch || attr->text_poke ||
               attr->bpf_event;
}

/* Remove @event from the pmu_sb_events list if it is an sb event. */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}

/*
 * Undo the per-CPU accounting of @event on @cpu: for a cgroup event
 * without a parent, decrement that CPU's perf_cgroup_events counter.
 */
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
        if (!event->parent && is_cgroup_event(event))
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Held by unaccount_freq_event_nohz() around the nr_freq_events
 * decrement-and-test and the resulting tick dependency clear.
 */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one frequency-event count under nr_freq_lock and, when the counter
 * reaches zero, clear the TICK_DEP_BIT_PERF_EVENTS tick dependency.
 * Compiles to an empty function without CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        spin_lock(&nr_freq_lock);
        /* The last frequency event is gone: the tick is no longer needed for perf. */
        if (atomic_dec_and_test(&nr_freq_events))
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Drop one frequency-event count. With nohz-full enabled this goes
 * through unaccount_freq_event_nohz(); otherwise nr_freq_events is simply
 * decremented.
 */
static void unaccount_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_dec(&nr_freq_events);
                return;
        }

        unaccount_freq_event_nohz();
}

/*
 * Undo the global accounting done for @event: the per-attribute event
 * counters, the perf_sched_count reference (when the event held one), the
 * per-CPU accounting and the pmu_sb_events membership.
 *
 * Events that have a parent are skipped entirely.
 */
static void unaccount_event(struct perf_event *event)
{
        bool drop_sched_ref = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                drop_sched_ref = true;

        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);

        if (event->attr.comm)
                atomic_dec(&nr_comm_events);

        if (event->attr.namespaces)
                atomic_dec(&nr_namespaces_events);

        if (event->attr.cgroup)
                atomic_dec(&nr_cgroup_events);

        if (event->attr.task)
                atomic_dec(&nr_task_events);

        if (event->attr.freq)
                unaccount_freq_event();

        if (event->attr.context_switch) {
                drop_sched_ref = true;
                atomic_dec(&nr_switch_events);
        }

        if (is_cgroup_event(event))
                drop_sched_ref = true;

        if (has_branch_stack(event))
                drop_sched_ref = true;

        if (event->attr.ksymbol)
                atomic_dec(&nr_ksymbol_events);

        if (event->attr.bpf_event)
                atomic_dec(&nr_bpf_events);

        if (event->attr.text_poke)
                atomic_dec(&nr_text_poke_events);

        /*
         * perf_sched_count is decremented here unless it is 1; in that
         * case the decrement is left to the delayed perf_sched_work.
         */
        if (drop_sched_ref && !atomic_add_unless(&perf_sched_count, -1, 1))
                schedule_delayed_work(&perf_sched_work, HZ);

        unaccount_event_cpu(event, event->cpu);

        unaccount_pmu_sb_event(event);
}

static void perf_sched_delayed(struct work_struct *work)
{
        mutex_lock(&perf_sched_mutex);
        if (atomic_dec_and_test(&perf_sched_count))
                static_branch_disable(&perf_sched_events);
        mutex_unlock(&perf_sched_mutex);
}

/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
static int exclusive_event_init(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return 0;

        /*
         * Prevent co-existence of per-task and cpu-wide events on the
         * same exclusive pmu.
         *
         * Negative pmu::exclusive_cnt means there are cpu-wide
         * events on this "exclusive" pmu, positive means there are
         * per-task events.
         *
         * Since this is called in perf_event_alloc() path, event::ctx
         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
         * to mean "per-task event", because unlike other attach states it
         * never gets cleared.
         */
        if (event->attach_state & PERF_ATTACH_TASK) {
                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
                        return -EBUSY;
        } else {
                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
                        return -EBUSY;
        }

        return 0;
}

/*
 * Reverse of exclusive_event_init(): give back @event's contribution to
 * pmu::exclusive_cnt (decrement for per-task events, increment for
 * cpu-wide events). Does nothing for non-exclusive pmus.
 */
static void exclusive_event_destroy(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (is_exclusive_pmu(pmu)) {
                /* sign convention: see the comment in exclusive_event_init() */
                if (event->attach_state & PERF_ATTACH_TASK)
                        atomic_dec(&pmu->exclusive_cnt);
                else
                        atomic_inc(&pmu->exclusive_cnt);
        }
}

/*
 * Two events match when they are on the same pmu and their CPUs are
 * equal or at least one of them has cpu == -1.
 */
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
        if (e1->pmu != e2->pmu)
                return false;

        return e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1;
}

/*
 * Check whether @event can be installed into @ctx without clashing with
 * an event already on ctx->event_list. Caller must hold ctx->mutex.
 *
 * Always true for non-exclusive pmus; otherwise false as soon as an
 * existing event matches @event per exclusive_event_match().
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
{
        struct perf_event *existing;
        struct pmu *pmu = event->pmu;

        lockdep_assert_held(&ctx->mutex);

        if (is_exclusive_pmu(pmu)) {
                list_for_each_entry(existing, &ctx->event_list, event_entry) {
                        if (exclusive_event_match(existing, event))
                                return false;
                }
        }

        return true;
}

static void perf_addr_filters_splice(struct perf_event *event,
                                       struct list_head *head);

/*
 * Tear down @event and queue it for freeing after an RCU grace period.
 * The order of the steps below matters; see the individual comments.
 */
static void _free_event(struct perf_event *event)
{
        /* Wait for any pending irq_work of this event before tearing it down. */
        irq_work_sync(&event->pending);

        unaccount_event(event);

        security_perf_event_free(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        /* Callchain buffers are only released for events without a parent. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }

        perf_event_free_bpf_prog(event);
        perf_addr_filters_splice(event, NULL);
        kfree(event->addr_filter_ranges);

        if (event->destroy)
                event->destroy(event);

        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
         * hw.target.
         */
        if (event->hw.target)
                put_task_struct(event->hw.target);

        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
         */
        if (event->ctx)
                put_ctx(event->ctx);

        exclusive_event_destroy(event);
        module_put(event->pmu->module);

        /* The memory itself is released from the RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Used to free events which have a known refcount of 1, such as in error paths
 * where the event isn't exposed yet and inherited events.
 *
 * If the refcount turns out not to be 1, a warning is emitted and the
 * event is deliberately leaked instead of freed.
 */
static void free_event(struct perf_event *event)
{
        /* Atomically move the refcount 1 -> 0; any other value is unexpected. */
        if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
                                "unexpected event refcount: %ld; ptr=%p\n",
                                atomic_long_read(&event->refcount), event)) {
                /* leak to avoid use-after-free */
                return;
        }

        _free_event(event);
}

/*
 * Remove user event from the owner task.
 *
 * Unlinks @event from its owner's list (event->owner_entry) under
 * owner->perf_event_mutex and clears event->owner. Does nothing if the
 * event has no owner (any more).
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because its the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Drop the reference taken under rcu_read_lock() above. */
                put_task_struct(owner);
        }
}

/*
 * Drop one reference on @event; the event is torn down via _free_event()
 * when this was the last reference.
 */
static void put_event(struct perf_event *event)
{
        if (atomic_long_dec_and_test(&event->refcount))
                _free_event(event);
}

/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * The event is removed from its owner and context, marked
 * PERF_EVENT_STATE_DEAD, and every child event on event->child_list is
 * removed from its context and freed. Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        LIST_HEAD(free_list);

        /*
         * If we got here through err_file: fput(event_file); we will not have
         * attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, DETACH_GROUP);

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        event->state = PERF_EVENT_STATE_DEAD;
        raw_spin_unlock_irq(&ctx->lock);

        perf_event_ctx_unlock(event, ctx);

        /*
         * Each pass handles at most the first child and then restarts from
         * 'again', because child_mutex is dropped inside the loop body.
         */
again:
        mutex_lock(&event->child_mutex);
        list_for_each_entry(child, &event->child_list, child_list) {
                void *var = NULL;

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        /* Park the child on the local list; it is freed after the loop. */
                        list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
                         */
                        put_event(event);
                } else {
                        /* The child went away meanwhile; remember to wake ctx waiters. */
                        var = &ctx->refcount;
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);

                if (var) {
                        /*
                         * If perf_event_free_task() has deleted all events from the
                         * ctx while the child_mutex got released above, make sure to
                         * notify about the preceding put_ctx().
                         */
                        smp_mb(); /* pairs with wait_var_event() */
                        wake_up_var(var);
                }
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        /* Free the collected children outside of child_mutex and ctx::mutex. */
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                /* Taken before free_event(), which may release the child. */
                void *var = &child->ctx->refcount;

                list_del(&child->child_list);
                free_event(child);

                /*
                 * Wake any perf_event_free_task() waiting for this event to be
                 * freed.
                 */
                smp_mb(); /* pairs with wait_var_event() */
                wake_up_var(var);
        }

no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);

/*
 * File release hook, called when the last reference to the file is gone:
 * releases the perf event stored in file->private_data. Always returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_event *event = file->private_data;

        perf_event_release_kernel(event);

        return 0;
}

/*
 * Read @event together with every event on its child_list and return the
 * summed count.
 *
 * *@enabled and *@running are reset and then filled with the summed
 * total_time_enabled / total_time_running of the event and its listed
 * children, plus the child_total_time_* accumulators kept on @event.
 *
 * event->child_mutex is held for the duration of the walk.
 */
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event *pos;
        u64 sum;

        *enabled = 0;
        *running = 0;

        mutex_lock(&event->child_mutex);

        /* The event itself goes first. */
        (void)perf_event_read(event, false);
        sum = perf_event_count(event);

        *enabled += event->total_time_enabled;
        *enabled += atomic64_read(&event->child_total_time_enabled);

        *running += event->total_time_running;
        *running += atomic64_read(&event->child_total_time_running);

        /* Then every child currently linked on child_list. */
        list_for_each_entry(pos, &event->child_list, child_list) {
                (void)perf_event_read(pos, false);

                sum += perf_event_count(pos);
                *enabled += pos->total_time_enabled;
                *running += pos->total_time_running;
        }

        mutex_unlock(&event->child_mutex);

        return sum;
}

/*
 * Locked wrapper around __perf_event_read_value(): takes the event's
 * context lock via perf_event_ctx_lock(), reads the summed count and the
 * enabled/running times into *@enabled and *@running, then drops the lock.
 *
 * Returns the summed count.
 */
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 total = __perf_event_read_value(event, enabled, running);

        perf_event_ctx_unlock(event, ctx);

        return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

/*
 * Fold the readings of one group (@leader plus its siblings) into @values.
 *
 * @leader:      group leader; perf_event_read() is called on it (second
 *               argument true) before anything is accumulated.
 * @read_format: PERF_FORMAT_* bits selecting which optional slots exist.
 * @values:      output array. Slot 0 (@nr) is left untouched here. Time and
 *               count slots are accumulated with +=, while id and lost
 *               slots are overwritten with =.
 *
 * Returns 0 on success, the non-zero result of perf_event_read(), or
 * -ECHILD when @leader no longer matches its parent's group configuration.
 */
static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
{
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;

        ret = perf_event_read(leader, true);
        if (ret)
                return ret;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Verify the grouping between the parent and child (inherited)
         * events is still intact.
         *
         * Specifically:
         *  - leader->ctx->lock pins leader->sibling_list
         *  - parent->child_mutex pins parent->child_list
         *  - parent->ctx->mutex pins parent->sibling_list
         *
         * Because parent->ctx != leader->ctx (and child_list nests inside
         * ctx->mutex), group destruction is not atomic between children, also
         * see perf_event_release_kernel(). Additionally, parent can grow the
         * group.
         *
         * Therefore it is possible to have parent and child groups in a
         * different configuration and summing over such a beast makes no sense
         * whatsoever.
         *
         * Reject this.
         */
        parent = leader->parent;
        if (parent &&
            (parent->group_generation != leader->group_generation ||
             parent->nr_siblings != leader->nr_siblings)) {
                ret = -ECHILD;
                goto unlock;
        }

        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
         * set.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                values[n++] += leader->total_time_enabled +
                        atomic64_read(&leader->child_total_time_enabled);
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
                values[n++] += leader->total_time_running +
                        atomic64_read(&leader->child_total_time_running);
        }

        /*
         * Write {count,id} tuples for every sibling.
         */
        values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&leader->lost_samples);

        /* Same slot layout as the leader, repeated once per sibling. */
        for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
                if (read_format & PERF_FORMAT_LOST)
                        values[n++] = atomic64_read(&sub->lost_samples);
        }

unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return ret;
}

/*
 * Build the PERF_FORMAT_GROUP read layout for @event's group and copy it
 * to user space.
 *
 * A zeroed buffer of event->read_size bytes is allocated; slot 0 receives
 * the member count (leader plus siblings) and the remaining slots are
 * filled by __perf_read_group_add(), first for the leader and then for
 * each event on the leader's child_list, all under leader->child_mutex.
 *
 * The caller must hold the leader's ctx->mutex.
 *
 * Returns event->read_size on success, -ENOMEM if the buffer cannot be
 * allocated, -EFAULT if the copy to @buf fails, or the error reported by
 * __perf_read_group_add().
 */
static int perf_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *pos;
        u64 *values;
        int ret;

        lockdep_assert_held(&ctx->mutex);

        values = kzalloc(event->read_size, GFP_KERNEL);
        if (!values)
                return -ENOMEM;

        values[0] = 1 + leader->nr_siblings;

        mutex_lock(&leader->child_mutex);
        ret = __perf_read_group_add(leader, read_format, values);
        if (!ret) {
                /* Stop at the first child that fails to add. */
                list_for_each_entry(pos, &leader->child_list, child_list) {
                        ret = __perf_read_group_add(pos, read_format, values);
                        if (ret)
                                break;
                }
        }
        mutex_unlock(&leader->child_mutex);

        /* Only hand data to user space when every read succeeded. */
        if (!ret) {
                ret = event->read_size;
                if (copy_to_user(buf, values, event->read_size))
                        ret = -EFAULT;
        }

        kfree(values);
        return ret;
}

/*
 * Produce the non-group read layout for @event and copy it to @buf.
 *
 * The count always comes first; enabled time, running time, id and lost
 * samples follow in that order, each only when its PERF_FORMAT_* bit is
 * set in @read_format (at most five u64 slots in total).
 *
 * Returns the number of bytes copied, or -EFAULT if the copy fails.
 */
static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
{
        u64 values[5];
        u64 enabled, running;
        size_t bytes;
        int nr = 0;

        values[nr++] = __perf_event_read_value(event, &enabled, &running);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[nr++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[nr++] = running;

        if (read_format & PERF_FORMAT_ID)
                values[nr++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                values[nr++] = atomic64_read(&event->lost_samples);

        bytes = nr * sizeof(u64);
        if (copy_to_user(buf, values, bytes))
                return -EFAULT;

        return bytes;
}

/*
 * Report whether @event should be treated as hung up: its state must be
 * PERF_EVENT_STATE_EXIT or lower, and its child_list (checked under
 * child_mutex) must be empty.
 */
static bool is_event_hup(struct perf_event *event)
{
        bool hup = false;

        if (event->state <= PERF_EVENT_STATE_EXIT) {
                mutex_lock(&event->child_mutex);
                hup = list_empty(&event->child_list);
                mutex_unlock(&event->child_mutex);
        }

        return hup;
}

/*
 * Read the performance event - simple non-blocking version for now.
 *
 * Returns 0 (end-of-file) for an event in error state, -ENOSPC when @count
 * is smaller than event->read_size, and otherwise whatever
 * perf_read_group() or perf_read_one() returns, selected by
 * PERF_FORMAT_GROUP in the event's read_format.
 */
static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
        u64 read_format = event->attr.read_format;

        /*
         * An event in error state (i.e. it was pinned but could not be
         * scheduled on to the CPU at some point) reads as end-of-file.
         */
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;

        if (count < event->read_size)
                return -ENOSPC;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        if (read_format & PERF_FORMAT_GROUP)
                return perf_read_group(event, read_format, buf);

        return perf_read_one(event, read_format, buf);
}

/*
 * read() entry point for a perf event file.
 *
 * Asks security_perf_event_read() for permission first and returns its
 * error unchanged. Otherwise runs __perf_read() with the event's context
 * locked through perf_event_ctx_lock(). @ppos is not used.
 */
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        int ret = security_perf_event_read(event);

        if (!ret) {
                ctx = perf_event_ctx_lock(event);
                ret = __perf_read(event, buf, count);
                perf_event_ctx_unlock(event, ctx);
        }

        return ret;
}

/*
 * poll() entry point for a perf event file.
 *
 * Registers on event->waitq, then reports EPOLLHUP when is_event_hup()
 * says so or when the event has no buffer attached. Otherwise the pending
 * poll bits are fetched from, and cleared in, the buffer's rb->poll.
 */
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
        struct perf_event *event = file->private_data;
        struct perf_buffer *rb;
        __poll_t events = EPOLLHUP;

        poll_wait(file, &event->waitq, wait);

        if (!is_event_hup(event)) {
                /*
                 * Taking event->mmap_mutex pins event->rb; without it
                 * perf_event_set_output() can swizzle our rb and make us
                 * miss wakeups.
                 */
                mutex_lock(&event->mmap_mutex);
                rb = event->rb;
                if (rb)
                        events = atomic_xchg(&rb->poll, 0);
                mutex_unlock(&event->mmap_mutex);
        }

        return events;
}

/*
 * Reset @event's counter: call perf_event_read() (result ignored), set
 * event->count to zero, then call perf_event_update_userpage() for the
 * event. Used as the PERF_EVENT_IOC_RESET handler in _perf_ioctl().
 */
static void _perf_event_reset(struct perf_event *event)
{
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}

/*
 * Disable @event and return the value event->count held right after the
 * disable; when @reset is true the count is zeroed afterwards.
 *
 * Assumes the event does not have attr.inherit set; a one-time warning is
 * emitted otherwise. Runs with the event's context locked.
 */
u64 perf_event_pause(struct perf_event *event, bool reset)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 snapshot;

        WARN_ON_ONCE(event->attr.inherit);

        _perf_event_disable(event);
        snapshot = local64_read(&event->count);
        if (reset)
                local64_set(&event->count, 0);

        perf_event_ctx_unlock(event, ctx);

        return snapshot;
}
EXPORT_SYMBOL_GPL(perf_event_pause);

/*
 * Apply @func to @event and then to every event on its child_list.
 *
 * The top-level event's child_mutex is held throughout. That means any
 * descendant process that has inherited this event will block in
 * perf_event_exit_event() if it goes to exit, which satisfies the task
 * existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *pos;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);

        /* The event itself first, then its children in list order. */
        func(event);
        list_for_each_entry(pos, &event->child_list, child_list)
                func(pos);

        mutex_unlock(&event->child_mutex);
}

/*
 * Apply @func to the whole group @event belongs to: the group leader and
 * each of its siblings, together with their children, by way of
 * perf_event_for_each_child().
 *
 * The caller must hold event->ctx->mutex.
 */
static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling;

        lockdep_assert_held(&event->ctx->mutex);

        perf_event_for_each_child(leader, func);
        for_each_sibling_event(sibling, leader)
                perf_event_for_each_child(sibling, func);
}

/*
 * Install a new sample period or frequency on @event.
 *
 * @info points to the u64 value to install. This function is handed to
 * event_function_call() by _perf_event_period(). @cpuctx is not used.
 *
 * For a frequency-based event only attr.sample_freq is updated; otherwise
 * both attr.sample_period and hw.sample_period are. If the event is
 * currently ACTIVE it is stopped before hw.period_left is cleared and
 * restarted with PERF_EF_RELOAD afterwards, with the context's PMU
 * disabled around the whole sequence.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(ctx->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Clear the remaining period; done while an active event is stopped. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
}

/*
 * Ask the event's PMU, through its ->check_period() callback, whether
 * @value is acceptable. _perf_event_period() treats any non-zero return
 * as a rejection.
 */
static int perf_event_check_period(struct perf_event *event, u64 value)
{
        return event->pmu->check_period(event, value);
}

/*
 * Validate @value and, if acceptable, apply it as the new sample period
 * (or frequency, for attr.freq events) via event_function_call().
 *
 * Rejected with -EINVAL when:
 *  - the event is not a sampling event, or @value is zero;
 *  - attr.freq is set and @value exceeds sysctl_perf_event_sample_rate;
 *  - attr.freq is clear and either the PMU's period check fails or bit 63
 *    of @value is set.
 *
 * Returns 0 on success.
 */
static int _perf_event_period(struct perf_event *event, u64 value)
{
        if (!is_sampling_event(event) || !value)
                return -EINVAL;

        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else if (perf_event_check_period(event, value) ||
                   (value & (1ULL << 63))) {
                return -EINVAL;
        }

        event_function_call(event, __perf_event_period, &value);

        return 0;
}

/*
 * Locked wrapper around _perf_event_period(): the update runs with the
 * event's context locked through perf_event_ctx_lock().
 *
 * Returns whatever _perf_event_period() returns.
 */
int perf_event_period(struct perf_event *event, u64 value)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        int err = _perf_event_period(event, value);

        perf_event_ctx_unlock(event, ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_period);

static const struct file_operations perf_fops;

/*
 * Look up @fd with fdget() and make sure it refers to a perf event file
 * (its f_op is perf_fops).
 *
 * On success the struct fd is stored in *@p and 0 is returned; the caller
 * is then responsible for the matching fdput(). On failure -EBADF is
 * returned and no reference is left held.
 */
static inline int perf_fget_light(int fd, struct fd *p)
{
        struct fd f = fdget(fd);

        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &perf_fops) {
                *p = f;
                return 0;
        }

        /* Some other kind of file: drop it again. */
        fdput(f);
        return -EBADF;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);

/*
 * Dispatch one perf event ioctl; perf_ioctl() calls this with the event's
 * context locked.
 *
 * ENABLE, DISABLE and RESET only select a per-event callback in the switch
 * and fall out of it; the callback is then applied either to the whole
 * group (PERF_IOC_FLAG_GROUP set in @arg) or to the event and its
 * children. Every other command is handled, and returns, inside the
 * switch.
 *
 * Returns 0 or a negative errno; -ENOTTY for an unknown @cmd.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
        {
                u64 value;

                /* @arg is a user pointer to the new u64 period/frequency. */
                if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
                        return -EFAULT;

                return _perf_event_period(event, value);
        }
        case PERF_EVENT_IOC_ID:
        {
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                int ret;
                /* @arg is a perf event fd, or -1 to pass a NULL output event. */
                if (arg != -1) {
                        struct perf_event *output_event;
                        struct fd output;
                        ret = perf_fget_light(arg, &output);
                        if (ret)
                                return ret;
                        output_event = output.file->private_data;
                        ret = perf_event_set_output(event, output_event);
                        fdput(output);
                } else {
                        ret = perf_event_set_output(event, NULL);
                }
                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
                return perf_event_set_bpf_prog(event, arg);

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;

                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                /* Rejected when there is no buffer or it has no data pages. */
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        /* Only ENABLE/DISABLE/RESET reach this point, with func set. */
        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

/*
 * ioctl() entry point for a perf event file.
 *
 * An ioctl is treated like a write, since it is likely a mutating
 * operation: security_perf_event_write() must allow it, and its error is
 * returned unchanged otherwise. The command itself is executed by
 * _perf_ioctl() with the event's context locked.
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        long ret = security_perf_event_write(event);

        if (!ret) {
                ctx = perf_event_ctx_lock(event);
                ret = _perf_ioctl(event, cmd, arg);
                perf_event_ctx_unlock(event, ctx);
        }

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: for the listed commands, rewrite the size
 * field encoded in @cmd from sizeof(compat_uptr_t) to sizeof(void *),
 * then hand everything to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
                /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
                }
                break;
        }
        return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif

/*
 * Walk current->perf_event_list and apply _perf_event_enable() to each
 * event on it and to that event's children.
 *
 * The list is walked under current->perf_event_mutex; each event is
 * handled with its context locked. Always returns 0.
 */
int perf_event_task_enable(void)
{
        struct perf_event *pos;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(pos, &current->perf_event_list, owner_entry) {
                struct perf_event_context *ctx = perf_event_ctx_lock(pos);

                perf_event_for_each_child(pos, _perf_event_enable);
                perf_event_ctx_unlock(pos, ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Walk current->perf_event_list and apply _perf_event_disable() to each
 * event on it and to that event's children.
 *
 * The list is walked under current->perf_event_mutex; each event is
 * handled with its context locked. Always returns 0.
 */
int perf_event_task_disable(void)
{
        struct perf_event *pos;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(pos, &current->perf_event_list, owner_entry) {
                struct perf_event_context *ctx = perf_event_ctx_lock(pos);

                perf_event_for_each_child(pos, _perf_event_disable);
                perf_event_ctx_unlock(pos, ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Return the index reported by the PMU's ->event_idx() for @event, or 0
 * when the event is stopped (PERF_HES_STOPPED set in hw.state) or is not
 * in PERF_EVENT_STATE_ACTIVE.
 */
static int perf_event_index(struct perf_event *event)
{
        if ((event->hw.state & PERF_HES_STOPPED) ||
            event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->pmu->event_idx(event);
}

/*
 * Fill in the static fields of the user page belonging to @event's
 * buffer: the deprecation marker for capability bit 0, the size of the
 * page structure, and the offset and size of the data area.
 *
 * Does nothing when the event has no buffer. The buffer pointer is read
 * under rcu_read_lock().
 */
static void perf_event_init_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (rb) {
                userpg = rb->user_page;

                /* Allow new userspace to detect that bit 0 is deprecated */
                userpg->cap_bit0_is_deprecated = 1;
                userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
                userpg->data_offset = PAGE_SIZE;
                userpg->data_size = perf_data_size(rb);
        }

        rcu_read_unlock();
}

/*
 * Default (weak) no-op. perf_event_update_userpage() calls this with the
 * event, its user page and the @now timestamp obtained from
 * calc_timer_values(). Being __weak, this definition is replaced when
 * another definition of the symbol is linked in.
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

/*
 * Publish @event's current index, count offset and enabled/running times
 * to the user page of its buffer. Does nothing if no buffer is attached.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * compute total_time_enabled, total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * we cannot simply call update_context_time()
         * because of locking issues as we can be called in
         * NMI context
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /* First bump of the sequence count: update in progress. */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        /* With a non-zero index, hw.prev_count is subtracted from the offset. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        /* Second bump of the sequence count: update complete. */
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);

/*
 * Fault handler for perf mmap areas; perf_mmap_vmops installs it as both
 * ->fault and ->page_mkwrite.
 *
 * A make-writable fault (FAULT_FLAG_MKWRITE) succeeds only for page
 * offset 0. For any other fault the backing page is looked up with
 * perf_mmap_to_page(), a reference is taken on it and it is returned in
 * vmf->page. Write faults on a non-zero page offset are refused.
 *
 * Returns 0 on success and VM_FAULT_SIGBUS otherwise.
 */
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        struct perf_buffer *rb;
        vm_fault_t ret = VM_FAULT_SIGBUS;

        if (vmf->flags & FAULT_FLAG_MKWRITE) {
                if (vmf->pgoff == 0)
                        ret = 0;
                return ret;
        }

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /* Only page offset 0 may take a write fault. */
        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
                goto unlock;

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (!vmf->page)
                goto unlock;

        get_page(vmf->page);
        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
        vmf->page->index   = vmf->pgoff;

        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}

/*
 * Switch @event from its current buffer (if any) to @rb.
 *
 * @rb may be NULL, in which case the event is only detached; that is how
 * perf_mmap_close() uses it. The event is unlinked from the old buffer's
 * event_list, linked onto the new one, and event->rb is then published
 * with rcu_assign_pointer(). The reference on the old buffer is dropped
 * last. Warns once if called for an event that has a parent.
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb)
{
        struct perf_buffer *old_rb = NULL;
        unsigned long flags;

        WARN_ON_ONCE(event->parent);

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /* Record the RCU state so a later attach can wait on it. */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /* Wait, if still needed, before rb_entry is linked again. */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}

/*
 * Wake every waiter on every event attached to the buffer @event writes
 * to. If @event has a parent, the parent's buffer is used. The buffer's
 * event_list is walked under rcu_read_lock().
 */
static void ring_buffer_wakeup(struct perf_event *event)
{
        struct perf_event *pos;
        struct perf_buffer *rb;

        if (event->parent)
                event = event->parent;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (rb) {
                list_for_each_entry_rcu(pos, &rb->event_list, rb_entry)
                        wake_up_all(&pos->waitq);
        }

        rcu_read_unlock();
}

/*
 * Return @event's buffer with an extra reference taken. If @event has a
 * parent, the parent's buffer is used.
 *
 * Returns NULL when no buffer is attached or when its refcount has
 * already dropped to zero. A non-NULL result must be balanced with
 * ring_buffer_put().
 */
struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
        struct perf_buffer *rb;

        if (event->parent)
                event = event->parent;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (rb && !refcount_inc_not_zero(&rb->refcount))
                rb = NULL;

        rcu_read_unlock();

        return rb;
}

/*
 * Drop one reference on @rb. When it was the last one, the buffer is
 * handed to rb_free_rcu() through call_rcu(). A one-time warning is
 * emitted if events are still linked on rb->event_list at that point.
 */
void ring_buffer_put(struct perf_buffer *rb)
{
        if (refcount_dec_and_test(&rb->refcount)) {
                WARN_ON_ONCE(!list_empty(&rb->event_list));

                call_rcu(&rb->rcu_head, rb_free_rcu);
        }
}

/*
 * vm_operations ->open hook (see perf_mmap_vmops): account one more
 * mapping on both the event and its buffer. A mapping with a non-zero
 * vm_pgoff additionally bumps the buffer's aux_mmap_count. The PMU is
 * notified through ->event_mapped() when it implements that callback.
 */
static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);

        if (vma->vm_pgoff)
                atomic_inc(&event->rb->aux_mmap_count);

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);
}

static void perf_pmu_output_stop(struct perf_event *event);

/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 *
 * Outline:
 *  1. if this is the AUX mapping and the last one, stop AUX writers and
 *     free the AUX pages, undoing their accounting;
 *  2. drop the mapping counts on the buffer and on the event; when the
 *     event's count reaches zero, detach the event from the buffer;
 *  3. when the buffer's count reached zero too, detach every other event
 *     still on rb->event_list and undo the data-area accounting.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;
        /*
         * Reference taken here is dropped at out_put.
         * NOTE(review): ring_buffer_get() can return NULL, yet rb is
         * dereferenced unconditionally below - presumably a live mapping
         * guarantees a buffer is attached; confirm.
         */
        struct perf_buffer *rb = ring_buffer_get(event);
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;

        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);

        /*
         * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
         * to avoid complications.
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
                /*
                 * Stop all AUX events that are writing to this buffer,
                 * so that we can free its AUX pages and corresponding PMU
                 * data. Note that after rb::aux_mmap_count dropped to zero,
                 * they won't start any more (see perf_aux_output_begin()).
                 */
                perf_pmu_output_stop(event);

                /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

                /* this has to be the last one */
                rb_free_aux(rb);
                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

                mutex_unlock(&rb->aux_mutex);
        }

        /* Remember whether this was the buffer's last mapping. */
        if (atomic_dec_and_test(&rb->mmap_count))
                detach_rest = true;

        /* Returns with mmap_mutex held only when the count reached zero. */
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;

        ring_buffer_attach(event, NULL);
        mutex_unlock(&event->mmap_mutex);

        /* If there's still other mmap()s of this buffer, we're done. */
        if (!detach_rest)
                goto out_put;

        /*
         * No other mmap()s, detach from all other events that might redirect
         * into the now unreachable buffer. Somewhat complicated by the
         * fact that rb::event_lock otherwise nests inside mmap_mutex.
         */
again:
        rcu_read_lock();
        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
                if (!atomic_long_inc_not_zero(&event->refcount)) {
                        /*
                         * This event is en-route to free_event() which will
                         * detach it and remove it from the list.
                         */
                        continue;
                }
                rcu_read_unlock();

                mutex_lock(&event->mmap_mutex);
                /*
                 * Check we didn't race with perf_event_set_output() which can
                 * swizzle the rb from under us while we were waiting to
                 * acquire mmap_mutex.
                 *
                 * If we find a different rb; ignore this event, a next
                 * iteration will no longer find it on the list. We have to
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
                if (event->rb == rb)
                        ring_buffer_attach(event, NULL);

                mutex_unlock(&event->mmap_mutex);
                put_event(event);

                /*
                 * Restart the iteration; either we're on the wrong list or
                 * destroyed its integrity by doing a deletion.
                 */
                goto again;
        }
        rcu_read_unlock();

        /*
         * It could be there's still a few 0-ref events on the list; they'll
         * get cleaned up by free_event() -- they'll also still have their
         * ref on the rb and will free it whenever they are done with it.
         *
         * Aside from that, this buffer is 'fully' detached and unmapped,
         * undo the VM accounting.
         */

        atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
                        &mmap_user->locked_vm);
        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
        free_uid(mmap_user);

out_put:
        ring_buffer_put(rb); /* could be last */
}

/*
 * vm_operations ->split hook (see perf_mmap_vmops): unconditionally
 * refuses with -EINVAL. Neither @vma nor @addr is examined.
 */
static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
{
        /*
         * Forbid splitting perf mappings to prevent refcount leaks due to
         * the resulting non-matching offsets and sizes. See open()/close().
         */
        return -EINVAL;
}

/*
 * VM operations for perf mmap areas. The same handler serves both ->fault
 * and ->page_mkwrite, and ->split always refuses.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .open         = perf_mmap_open,
        .close        = perf_mmap_close, /* non mergeable */
        .fault        = perf_mmap_fault,
        .page_mkwrite = perf_mmap_fault,
        .split        = perf_mmap_may_split,
};

/*
 * mmap() handler for a perf event file.
 *
 * Two kinds of mapping are distinguished by vma->vm_pgoff:
 *
 *  - vm_pgoff == 0: the regular buffer. The mapping must be exactly
 *    1 + nr_pages pages long, where nr_pages is zero or a power of two.
 *    If the event already has a buffer of matching size, only a reference
 *    is taken; otherwise a new buffer is allocated and attached.
 *
 *  - vm_pgoff != 0: the AUX area. The event must already have a buffer, and
 *    the offset/size must agree with the aux_offset/aux_size values found in
 *    the buffer's user page and with any AUX area that is already mapped.
 *
 * Newly allocated pages are charged to user->locked_vm up to a per-user
 * limit derived from sysctl_perf_event_mlock; the remainder is charged to
 * the mm's pinned_vm and checked against RLIMIT_MEMLOCK.
 *
 * Returns 0 on success, otherwise -EINVAL, -ENOMEM, -EPERM, or the error
 * reported by security_perf_event_read() or rb_alloc_aux().
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_event *event = file->private_data;
        unsigned long user_locked, user_lock_limit;
        struct user_struct *user = current_user();
        struct mutex *aux_mutex = NULL;
        struct perf_buffer *rb = NULL;
        unsigned long locked, lock_limit;
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
        int ret = 0, flags = 0;

        /*
         * Don't allow mmap() of inherited per-task counters. This would
         * create a performance issue due to all children writing to the
         * same rb.
         */
        if (event->cpu == -1 && event->attr.inherit)
                return -EINVAL;

        /* Only shared mappings are supported. */
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        ret = security_perf_event_read(event);
        if (ret)
                return ret;

        vma_size = vma->vm_end - vma->vm_start;

        if (vma->vm_pgoff == 0) {
                /* One page of the mapping is not a data page. */
                nr_pages = (vma_size / PAGE_SIZE) - 1;
        } else {
                /*
                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
                 * mapped, all subsequent mappings should have the same size
                 * and offset. Must be above the normal perf buffer.
                 */
                u64 aux_offset, aux_size;

                /* Unlocked early check; repeated under mmap_mutex below. */
                if (!event->rb)
                        return -EINVAL;

                nr_pages = vma_size / PAGE_SIZE;
                if (nr_pages > INT_MAX)
                        return -ENOMEM;

                mutex_lock(&event->mmap_mutex);
                ret = -EINVAL;

                rb = event->rb;
                if (!rb)
                        goto aux_unlock;

                aux_mutex = &rb->aux_mutex;
                mutex_lock(aux_mutex);

                aux_offset = READ_ONCE(rb->user_page->aux_offset);
                aux_size = READ_ONCE(rb->user_page->aux_size);

                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;

                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
                        goto aux_unlock;

                /* already mapped with a different offset */
                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
                        goto aux_unlock;

                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
                        goto aux_unlock;

                /* already mapped with a different size */
                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
                        goto aux_unlock;

                if (!is_power_of_2(nr_pages))
                        goto aux_unlock;

                /*
                 * From here on a reference on rb->mmap_count is held; the
                 * failure branch at the unlock label drops it again.
                 */
                if (!atomic_inc_not_zero(&rb->mmap_count))
                        goto aux_unlock;

                /* AUX area already exists: just take another reference. */
                if (rb_has_aux(rb)) {
                        atomic_inc(&rb->aux_mmap_count);
                        ret = 0;
                        goto unlock;
                }

                user_extra = nr_pages;
                goto accounting;
        }

        /*
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
        if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;

        if (vma_size != PAGE_SIZE * (1 + nr_pages))
                return -EINVAL;

        WARN_ON_ONCE(event->ctx->parent_ctx);
again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
                if (data_page_nr(event->rb) != nr_pages) {
                        ret = -EINVAL;
                        goto unlock;
                }

                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close(); remove the
                         * event and try again.
                         */
                        ring_buffer_attach(event, NULL);
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }

                /*
                 * Existing buffer reused: ret, user_extra and extra are
                 * all still zero, so nothing new is charged below.
                 */
                goto unlock;
        }

        user_extra = nr_pages + 1;

accounting:
        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm);

        /*
         * sysctl_perf_event_mlock may have changed, so that
         *     user->locked_vm > user_lock_limit
         */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += user_extra;

        if (user_locked > user_lock_limit) {
                /*
                 * charge locked_vm until it hits user_lock_limit;
                 * charge the rest from pinned_vm
                 */
                extra = user_locked - user_lock_limit;
                user_extra -= extra;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

        if ((locked > lock_limit) && perf_is_paranoid() &&
                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(!rb && event->rb);

        if (vma->vm_flags & VM_WRITE)
                flags |= RING_BUFFER_WRITABLE;

        if (!rb) {
                /* Regular buffer path: allocate and attach a new buffer. */
                rb = rb_alloc(nr_pages,
                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
                              event->cpu, flags);

                if (!rb) {
                        ret = -ENOMEM;
                        goto unlock;
                }

                atomic_set(&rb->mmap_count, 1);
                rb->mmap_user = get_current_user();
                rb->mmap_locked = extra;

                ring_buffer_attach(event, rb);

                perf_event_update_time(event);
                perf_event_init_userpage(event);
                perf_event_update_userpage(event);
        } else {
                /*
                 * AUX path: the AUX reference count is only initialised
                 * once the allocation has succeeded.
                 */
                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
                                   event->attr.aux_watermark, flags);
                if (!ret) {
                        atomic_set(&rb->aux_mmap_count, 1);
                        rb->aux_mmap_locked = extra;
                }
        }

unlock:
        if (!ret) {
                /* Commit the accounting computed above. */
                atomic_long_add(user_extra, &user->locked_vm);
                atomic64_add(extra, &vma->vm_mm->pinned_vm);

                atomic_inc(&event->mmap_count);
        } else if (rb) {
                /*
                 * Failure on the AUX path (memlock limit check or AUX
                 * allocation): drop the rb->mmap_count reference taken
                 * earlier. rb is NULL on every regular-buffer failure.
                 */
                atomic_dec(&rb->mmap_count);
        }
aux_unlock:
        if (aux_mutex)
                mutex_unlock(aux_mutex);
        mutex_unlock(&event->mmap_mutex);

        /* On failure, leave the vma untouched and skip ->event_mapped(). */
        if (ret)
                return ret;

        /*
         * Since pinned accounting is per vm we cannot allow fork() to copy our
         * vma.
         */
        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);

        return ret;
}

/*
 * ->fasync() file operation: forward the request to fasync_helper() for the
 * event's fasync state, with the inode lock held around the call.
 *
 * Returns 0 unless fasync_helper() reports a negative error, which is
 * passed through unchanged.
 */
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_event *event = filp->private_data;
        struct inode *inode = file_inode(filp);
        int err;

        inode_lock(inode);
        err = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);

        return err < 0 ? err : 0;
}

/* File operations for a perf event file. */
static const struct file_operations perf_fops = {
        .release        = perf_release,
        .llseek         = no_llseek,
        .read           = perf_read,
        .poll           = perf_poll,
        .mmap           = perf_mmap,
        .fasync         = perf_fasync,
        .unlocked_ioctl = perf_ioctl,
        .compat_ioctl   = perf_compat_ioctl,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

/*
 * Return the address of the fasync pointer to use for @event. Only the
 * parent has fasync state, so an event with a parent uses the parent's.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->fasync;
}

/*
 * Wake up the ring buffer for @event and, if a kill is pending, deliver
 * SIGIO through the fasync state and clear the pending value.
 */
void perf_event_wakeup(struct perf_event *event)
{
        ring_buffer_wakeup(event);

        if (!event->pending_kill)
                return;

        kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
        event->pending_kill = 0;
}

/*
 * Carry out a pending disable request for @event.
 *
 * event->pending_disable holds the CPU on which the disable must run, or a
 * negative value when nothing is pending. When that CPU is the current one
 * the request is cleared and the event disabled locally; otherwise the
 * irq_work is re-queued on the requested CPU.
 */
static void perf_pending_event_disable(struct perf_event *event)
{
        int target_cpu = READ_ONCE(event->pending_disable);

        /* Nothing pending. */
        if (target_cpu < 0)
                return;

        if (target_cpu != smp_processor_id()) {
                /*
                 *  CPU-A                        CPU-B
                 *
                 *  perf_event_disable_inatomic()
                 *    @pending_disable = CPU-A;
                 *    irq_work_queue();
                 *
                 *  sched-out
                 *    @pending_disable = -1;
                 *
                 *                                sched-in
                 *                                perf_event_disable_inatomic()
                 *                                  @pending_disable = CPU-B;
                 *                                  irq_work_queue(); // FAILS
                 *
                 *  irq_work_run()
                 *    perf_pending_event()
                 *
                 * But the event runs on CPU-B and wants disabling there.
                 */
                irq_work_queue_on(&event->pending, target_cpu);
                return;
        }

        /* We are on the requested CPU: consume the request and disable. */
        WRITE_ONCE(event->pending_disable, -1);
        perf_event_disable_local(event);
}

/*
 * irq_work callback for an event's ->pending work: handle a pending disable
 * first, then a pending wakeup, inside a software-event recursion context
 * when one could be obtained.
 */
static void perf_pending_event(struct irq_work *entry)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(entry, struct perf_event, pending);

        /*
         * If we 'fail' here, that's OK, it means recursion is already disabled
         * and we won't recurse 'further'.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        perf_pending_event_disable(event);

        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }

        /* Only release a context that was actually acquired. */
        if (recursion_ctx >= 0)
                perf_swevent_put_recursion_context(recursion_ctx);
}

/*
 * Currently registered guest-info callbacks, or NULL when none are
 * registered. Published and cleared with rcu_assign_pointer() by the
 * register/unregister functions below.
 *
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Install @cbs as the guest-info callbacks.
 *
 * Only one set of callbacks can be registered at a time: if one is already
 * present this warns once and returns -EBUSY. Returns 0 on success.
 */
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        struct perf_guest_info_callbacks *cur;

        cur = rcu_access_pointer(perf_guest_cbs);
        if (WARN_ON_ONCE(cur))
                return -EBUSY;

        rcu_assign_pointer(perf_guest_cbs, cbs);

        return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove @cbs as the guest-info callbacks.
 *
 * @cbs must be the set that is currently registered; otherwise this warns
 * once and returns -EINVAL. On success the pointer is cleared, an RCU grace
 * period is waited for with synchronize_rcu(), and 0 is returned.
 */
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        struct perf_guest_info_callbacks *cur;

        cur = rcu_access_pointer(perf_guest_cbs);
        if (WARN_ON_ONCE(cur != cbs))
                return -EINVAL;

        rcu_assign_pointer(perf_guest_cbs, NULL);
        synchronize_rcu();

        return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);

/*
 * Write one u64 register value to @handle for every bit set in @mask,
 * fetching each value from @regs with perf_reg_value().
 */
static void
perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
{
        DECLARE_BITMAP(reg_bits, 64);
        u64 reg_val;
        int idx;

        bitmap_from_u64(reg_bits, mask);

        for_each_set_bit(idx, reg_bits, sizeof(mask) * BITS_PER_BYTE) {
                reg_val = perf_reg_value(regs, idx);
                perf_output_put(handle, reg_val);
        }
}

/*
 * Fill @regs_user with the user-space register state to sample.
 *
 *  - @regs already in user mode: use them directly.
 *  - kernel thread (PF_KTHREAD): no user registers, ABI is NONE.
 *  - otherwise: let perf_get_regs_user() work it out.
 */
static void perf_sample_regs_user(struct perf_regs *regs_user,
                                  struct pt_regs *regs)
{
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
                return;
        }

        if (current->flags & PF_KTHREAD) {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
                return;
        }

        perf_get_regs_user(regs_user, regs);
}

/*
 * Record @regs in @regs_intr together with the register ABI of the current
 * task.
 */
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
                                  struct pt_regs *regs)
{
        regs_intr->abi = perf_reg_abi(current);
        regs_intr->regs = regs;
}


/*
 * Get remaining task size from user stack pointer.
 *
 * It'd be better to take stack vma map and limit this more
 * precisely, but there's no way to get it safely under interrupt,
 * so using TASK_SIZE as limit.
 */
/*
 * Number of bytes between the user stack pointer in @regs and TASK_SIZE.
 * Returns 0 when the stack pointer is zero or not below TASK_SIZE.
 */
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
        unsigned long sp = perf_user_stack_pointer(regs);

        if (sp && sp < TASK_SIZE)
                return TASK_SIZE - sp;

        return 0;
}

/*
 * Work out how many bytes of user stack to dump for a sample.
 *
 * @stack_size:  size requested for the dump
 * @header_size: size of the sample so far
 * @regs:        user registers, or NULL when there are none
 *
 * The requested size is first limited to what remains between the stack
 * pointer and TASK_SIZE. If adding it to the header (plus the two u64 size
 * fields) would overflow the 16-bit sample size, it is reduced to fit.
 *
 * Returns the stack dump size, or 0 when there is nothing to dump.
 */
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
                        struct pt_regs *regs)
{
        u64 limit;

        /* No regs means no stack pointer; no mm means no user stack. */
        if (!regs || !current->mm)
                return 0;

        limit      = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
        stack_size = min(stack_size, (u16) limit);

        /* Current header size plus static size and dynamic size. */
        header_size += 2 * sizeof(u64);

        /* No 16-bit wrap-around: the dump fits as is. */
        if ((u16) (header_size + stack_size) >= header_size)
                return stack_size;

        /*
         * The sample would exceed the maximum size; shrink the stack dump
         * so that it fits.
         */
        stack_size = USHRT_MAX - header_size - sizeof(u64);
        stack_size = round_up(stack_size, sizeof(u64));

        return stack_size;
}

/*
 * Emit the user stack portion of a sample.
 *
 * Without @regs only a single zero u64 size is written. Otherwise the
 * layout is:
 *
 *   static size  - @dump_size, the size requested by the user or the best
 *                  one that fits into the sample
 *   data         - @dump_size bytes reserved for the user stack contents
 *   dynamic size - the number of bytes that were actually copied
 */
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
                          struct pt_regs *regs)
{
        unsigned long sp;
        unsigned int left;
        mm_segment_t old_fs;
        u64 copied;

        /* Case of a kernel thread, nothing to dump */
        if (!regs) {
                u64 size = 0;

                perf_output_put(handle, size);
                return;
        }

        /* Static size. */
        perf_output_put(handle, dump_size);

        /* Data. */
        sp = perf_user_stack_pointer(regs);
        old_fs = force_uaccess_begin();
        left = __output_copy_user(handle, (void *) sp, dump_size);
        force_uaccess_end(old_fs);
        copied = dump_size - left;

        /* Step over the part of the data area that was not filled. */
        perf_output_skip(handle, left);

        /* Dynamic size. */
        perf_output_put(handle, copied);
}

/*
 * Decide how many bytes of AUX data to attach to a sample of @event and
 * store the result in data->aux_size.
 *
 * The size is @size limited to perf_aux_size() of the sampler's buffer and
 * aligned up to a u64. It is 0 when the event has no AUX sampler, when the
 * sampler is not active on this CPU, when it has no buffer, or when AUX
 * sampling is already in progress on that buffer.
 *
 * Returns data->aux_size.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;

        data->aux_size = 0;

        if (!sampler)
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                return 0;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return 0;

        /*
         * If this is an NMI hit inside sampling code, don't take
         * the sample. See also perf_aux_sample_output().
         */
        if (!READ_ONCE(rb->aux_in_sampling)) {
                size = min_t(size_t, size, perf_aux_size(rb));
                data->aux_size = ALIGN(size, sizeof(u64));
        } else {
                data->aux_size = 0;
        }

        ring_buffer_put(rb);

        return data->aux_size;
}

/*
 * Call the PMU's ->snapshot_aux() callback for @event with interrupts
 * disabled and rb->aux_in_sampling set for the duration of the call.
 *
 * Returns whatever ->snapshot_aux() returns.
 */
long perf_pmu_snapshot_aux(struct perf_buffer *rb,
                           struct perf_event *event,
                           struct perf_output_handle *handle,
                           unsigned long size)
{
        unsigned long irq_flags;
        long copied;

        /*
         * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
         * paths. If we start calling them in NMI context, they may race with
         * the IRQ ones, that is, for example, re-starting an event that's just
         * been stopped, which is why we're using a separate callback that
         * doesn't change the event state.
         *
         * IRQs need to be disabled to prevent IPIs from racing with us.
         */
        local_irq_save(irq_flags);

        /*
         * Guard against NMI hits inside the critical section;
         * see also perf_prepare_sample_aux().
         */
        WRITE_ONCE(rb->aux_in_sampling, 1);
        barrier();

        copied = event->pmu->snapshot_aux(event, handle, size);

        barrier();
        WRITE_ONCE(rb->aux_in_sampling, 0);

        local_irq_restore(irq_flags);

        return copied;
}

/*
 * Write the AUX data of a sample: snapshot data->aux_size bytes from the
 * sampler's buffer into @handle, then zero-pad up to data->aux_size.
 *
 * Does nothing (after a one-time warning) when the event has no AUX
 * sampler or data->aux_size is zero, and nothing when the sampler has no
 * buffer.
 */
static void perf_aux_sample_output(struct perf_event *event,
                                   struct perf_output_handle *handle,
                                   struct perf_sample_data *data)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;
        unsigned long padding;
        long copied;

        if (WARN_ON_ONCE(!sampler || !data->aux_size))
                return;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return;

        copied = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);

        /*
         * An error here means that perf_output_copy() failed (returned a
         * non-zero surplus that it didn't copy), which in its current
         * enlightened implementation is not possible. If that changes, we'd
         * like to know.
         */
        if (!WARN_ON_ONCE(copied < 0)) {
                /*
                 * The pad comes from ALIGN()ing data->aux_size up to u64 in
                 * perf_prepare_sample_aux(), so should not be more than that.
                 */
                padding = data->aux_size - copied;
                if (WARN_ON_ONCE(padding >= sizeof(u64)))
                        padding = 8;

                if (padding) {
                        u64 zero = 0;

                        perf_output_copy(handle, &zero, padding);
                }
        }

        ring_buffer_put(rb);
}

/*
 * Prepare the sample-id fields of @data for @event and grow header->size
 * by the event's id header size.
 *
 * Only the fields selected by event->attr.sample_type are filled in; the
 * sample type itself is stored in data->type.
 */
static void __perf_event_header__init_id(struct perf_event_header *header,
                                         struct perf_sample_data *data,
                                         struct perf_event *event)
{
        const u64 type = event->attr.sample_type;

        header->size += event->id_header_size;
        data->type = type;

        if (type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
                data->tid_entry.tid = perf_event_tid(event, current);
        }

        if (type & PERF_SAMPLE_TIME)
                data->time = perf_event_clock(event);

        if (type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
                data->id = primary_event_id(event);

        if (type & PERF_SAMPLE_STREAM_ID)
                data->stream_id = event->id;

        if (type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu = raw_smp_processor_id();
                data->cpu_entry.reserved = 0;
        }
}

/*
 * Initialise the sample-id part of @header and @data, but only for events
 * that have attr.sample_id_all set; otherwise this is a no-op.
 */
void perf_event_header__init_id(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event_header__init_id(header, data, event);
}

/*
 * Write the sample-id fields selected by data->type to @handle.
 *
 * The output order is fixed: TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER.
 */
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
                                           struct perf_sample_data *data)
{
        const u64 type = data->type;

        if (type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        /* The identifier is the same data->id value, written last here. */
        if (type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);
}

/*
 * Append the sample-id fields of @sample to @handle, but only for events
 * that have attr.sample_id_all set; otherwise this is a no-op.
 */
void perf_event__output_id_sample(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  struct perf_sample_data *sample)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event__output_id_sample(handle, sample);
}

/*
 * Write the read values of a single (non-group) event to @handle.
 *
 * The count always comes first; it is followed, in this order and only when
 * selected by attr.read_format, by total time enabled, total time running,
 * the event id and the lost-samples count. Child totals are added to the
 * @enabled and @running values supplied by the caller.
 */
static void perf_output_read_one(struct perf_output_handle *handle,
                                 struct perf_event *event,
                                 u64 enabled, u64 running)
{
        const u64 read_format = event->attr.read_format;
        u64 vals[5];
        int cnt = 0;

        vals[cnt++] = perf_event_count(event);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                vals[cnt++] = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                vals[cnt++] = running +
                        atomic64_read(&event->child_total_time_running);

        if (read_format & PERF_FORMAT_ID)
                vals[cnt++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                vals[cnt++] = atomic64_read(&event->lost_samples);

        __output_copy(handle, vals, cnt * sizeof(u64));
}

/*
 * Write the read values of @event's whole group to @handle.
 *
 * First comes a leader record: number of events in the group, optional
 * total time enabled/running, then the leader's count with optional id and
 * lost-samples count. One record (count, optional id, optional lost) follows
 * for each sibling. Active events other than @event itself are refreshed
 * through their PMU's ->read() before their count is taken.
 */
static void perf_output_read_group(struct perf_output_handle *handle,
                            struct perf_event *event,
                            u64 enabled, u64 running)
{
        const u64 read_format = event->attr.read_format;
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling;
        unsigned long irq_flags;
        u64 vals[6];
        int cnt = 0;

        /*
         * Disabling interrupts avoids all counter scheduling
         * (context switches, timer based rotation and IPIs).
         */
        local_irq_save(irq_flags);

        vals[cnt++] = 1 + leader->nr_siblings;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                vals[cnt++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                vals[cnt++] = running;

        if (leader != event && leader->state == PERF_EVENT_STATE_ACTIVE)
                leader->pmu->read(leader);

        vals[cnt++] = perf_event_count(leader);

        if (read_format & PERF_FORMAT_ID)
                vals[cnt++] = primary_event_id(leader);

        if (read_format & PERF_FORMAT_LOST)
                vals[cnt++] = atomic64_read(&leader->lost_samples);

        __output_copy(handle, vals, cnt * sizeof(u64));

        for_each_sibling_event(sibling, leader) {
                cnt = 0;

                if (sibling != event &&
                    sibling->state == PERF_EVENT_STATE_ACTIVE)
                        sibling->pmu->read(sibling);

                vals[cnt++] = perf_event_count(sibling);

                if (read_format & PERF_FORMAT_ID)
                        vals[cnt++] = primary_event_id(sibling);

                if (read_format & PERF_FORMAT_LOST)
                        vals[cnt++] = atomic64_read(&sibling->lost_samples);

                __output_copy(handle, vals, cnt * sizeof(u64));
        }

        local_irq_restore(irq_flags);
}

/* Both total-time read_format bits: time enabled and time running. */
#define PERF_FORMAT_TOTAL_TIMES                         \
        (PERF_FORMAT_TOTAL_TIME_ENABLED |               \
         PERF_FORMAT_TOTAL_TIME_RUNNING)

/*
 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 *
 * The problem is that it's both hard and excessively expensive to iterate the
 * child list, not to mention that it's impossible to IPI the children running
 * on another CPU, from interrupt/NMI context.
 */
/*
 * Write the PERF_SAMPLE_READ payload for @event to @handle, using the group
 * layout when PERF_FORMAT_GROUP is set in attr.read_format and the
 * single-event layout otherwise.
 */
static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
{
        const u64 read_format = event->attr.read_format;
        u64 enabled = 0, running = 0, now;

        /*
         * Compute total_time_enabled and total_time_running based on
         * snapshot values taken when the event was last scheduled in.
         *
         * We cannot simply call update_context_time() because of locking
         * issues, as we are called in NMI context.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
                calc_timer_values(event, &now, &enabled, &running);

        if (read_format & PERF_FORMAT_GROUP) {
                perf_output_read_group(handle, event, enabled, running);
                return;
        }

        perf_output_read_one(handle, event, enabled, running);
}

static inline bool perf_sample_save_hw_index(struct perf_event *event)
{
        return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
}

/*
 * Write one complete sample record to @handle.
 *
 * @handle: output handle to write into
 * @header: record header, written first
 * @data:   prepared sample data; data->type selects which fields are emitted
 * @event:  the event the sample belongs to
 *
 * The fields follow the header in the fixed order in which they are tested
 * below; each one is emitted only when its PERF_SAMPLE_* bit is set in
 * data->type. Once the record is written, and only for events that do not
 * use a watermark, the per-buffer event counter is advanced and a wakeup is
 * requested every attr.wakeup_events samples.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
        u64 sample_type = data->type;

        perf_output_put(handle, *header);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);

        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(handle, data->addr);

        if (sample_type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(handle, data->period);

        if (sample_type & PERF_SAMPLE_READ)
                perf_output_read(handle, event);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* One u64 for the entry count plus one u64 per entry. */
                int size = 1;

                size += data->callchain->nr;
                size *= sizeof(u64);
                __output_copy(handle, data->callchain, size);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;

                        /* Total size first, then every fragment in turn. */
                        perf_output_put(handle, raw->size);
                        do {
                                if (frag->copy) {
                                        __output_custom(handle, frag->copy,
                                                        frag->data, frag->size);
                                } else {
                                        __output_copy(handle, frag->data,
                                                      frag->size);
                                }
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);
                        /* Skip over the padding of the last fragment. */
                        if (frag->pad)
                                __output_skip(handle, NULL, frag->pad);
                } else {
                        /* No raw data: emit a u32 size and a zero u32. */
                        struct {
                                u32        size;
                                u32        data;
                        } raw = {
                                .size = sizeof(u32),
                                .data = 0,
                        };
                        perf_output_put(handle, raw);
                }
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        /* nr, optional hw_idx, then the entries. */
                        perf_output_put(handle, data->br_stack->nr);
                        if (perf_sample_save_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                u64 abi = data->regs_user.abi;

                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_user;
                        perf_output_sample_regs(handle,
                                                data->regs_user.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                perf_output_sample_ustack(handle,
                                          data->stack_user_size,
                                          data->regs_user.regs);
        }

        if (sample_type & PERF_SAMPLE_WEIGHT)
                perf_output_put(handle, data->weight);

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                perf_output_put(handle, data->txn);

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                u64 abi = data->regs_intr.abi;
                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_intr;

                        perf_output_sample_regs(handle,
                                                data->regs_intr.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);

        if (sample_type & PERF_SAMPLE_AUX) {
                /* The size is always written; the data only if non-zero. */
                perf_output_put(handle, data->aux_size);

                if (data->aux_size)
                        perf_aux_sample_output(event, handle, data);
        }

        /*
         * Event-count based wakeups: count this sample and, once
         * wakeup_events samples have accumulated, take them off the
         * counter and bump rb->wakeup.
         */
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;

                if (wakeup_events) {
                        struct perf_buffer *rb = handle->rb;
                        int events = local_inc_return(&rb->events);

                        if (events >= wakeup_events) {
                                local_sub(wakeup_events, &rb->events);
                                local_inc(&rb->wakeup);
                        }
                }
        }
}

/*
 * Translate the sampled virtual address @virt into a physical address.
 *
 * Returns 0 when @virt is 0 or when no translation was obtained.
 */
static u64 perf_virt_to_phys(u64 virt)
{
        struct page *page;
        u64 phys = 0;

        if (!virt)
                return 0;

        if (virt >= TASK_SIZE) {
                void *kaddr = (void *)(uintptr_t)virt;

                if (!virt_addr_valid(kaddr))
                        return 0;

                /* vmalloc()ed memory is deliberately reported as 0. */
                if (virt >= VMALLOC_START && virt < VMALLOC_END)
                        return 0;

                return (u64)virt_to_phys(kaddr);
        }

        /* No mm: leave the result as 0. */
        if (current->mm == NULL)
                return 0;

        /*
         * User address: walk the page tables.  Interrupts are disabled,
         * which prevents the page tables from being torn down under us.
         * Only the IRQ-safe get_user_page_fast_only() is attempted; if it
         * fails the result stays 0.
         */
        pagefault_disable();
        if (get_user_page_fast_only(virt, 0, &page)) {
                phys = page_to_phys(page) + virt % PAGE_SIZE;
                put_page(page);
        }
        pagefault_enable();

        return phys;
}

/* Shared zero-length callchain returned whenever there is nothing to record. */
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

/*
 * Collect the callchain for @event at @regs.
 *
 * Kernel and user parts are included unless excluded by the event
 * attributes; the user part is also dropped when current has no mm.
 * Never returns NULL: &__empty_callchain is returned when nothing is
 * wanted or when get_perf_callchain() yields no entry.
 */
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
        const u32 max_stack = event->attr.sample_max_stack;
        bool want_kernel = !event->attr.exclude_callchain_kernel;
        bool want_user = !event->attr.exclude_callchain_user;
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
        struct perf_callchain_entry *entry;

        if (!current->mm)
                want_user = false;

        if (!(want_kernel || want_user))
                return &__empty_callchain;

        entry = get_perf_callchain(regs, 0, want_kernel, want_user,
                                   max_stack, crosstask, true);
        if (!entry)
                return &__empty_callchain;

        return entry;
}

/*
 * Fill in @header for a PERF_RECORD_SAMPLE of @event and gather into @data
 * the variable parts selected by event->attr.sample_type.
 *
 * header->size starts at sizeof(*header) + event->header_size and is then
 * grown by every variably-sized sample field handled below.  header->size
 * is 16 bits wide (see the AUX handling), so the accumulated total must stay
 * within U16_MAX and, as checked at the end, be a multiple of 8.
 */
void perf_prepare_sample(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        u64 sample_type = event->attr.sample_type;

        header->type = PERF_RECORD_SAMPLE;
        header->size = sizeof(*header) + event->header_size;

        header->misc = 0;
        header->misc |= perf_misc_flags(regs);

        __perf_event_header__init_id(header, data, event);

        if (sample_type & PERF_SAMPLE_IP)
                data->ip = perf_instruction_pointer(regs);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* one u64 for the entry count, plus one per entry */
                int size = 1;

                /*
                 * With __PERF_SAMPLE_CALLCHAIN_EARLY set, data->callchain is
                 * not computed here — presumably the caller has already
                 * filled it in (TODO confirm).
                 */
                if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                        data->callchain = perf_callchain(event, regs);

                size += data->callchain->nr;

                header->size += size * sizeof(u64);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;
                int size;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;
                        u32 sum = 0;

                        /* total payload over the whole fragment chain */
                        do {
                                sum += frag->size;
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);

                        /*
                         * u32 length field + payload, rounded up to a u64
                         * multiple; the padding is recorded on the last
                         * fragment.
                         */
                        size = round_up(sum + sizeof(u32), sizeof(u64));
                        raw->size = size - sizeof(u32);
                        frag->pad = raw->size - sum;
                } else {
                        /* no raw data: a single u64 is still accounted */
                        size = sizeof(u64);
                }

                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                int size = sizeof(u64); /* nr */
                if (data->br_stack) {
                        /* optional extra u64 when the hw index is saved */
                        if (perf_sample_save_hw_index(event))
                                size += sizeof(u64);

                        size += data->br_stack->nr
                              * sizeof(struct perf_branch_entry);
                }
                header->size += size;
        }

        /* Both the user regs dump and the user stack dump need regs_user. */
        if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
                perf_sample_regs_user(&data->regs_user, regs);

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                /* one u64 per register bit set in the requested mask */
                if (data->regs_user.regs) {
                        u64 mask = event->attr.sample_regs_user;
                        size += hweight64(mask) * sizeof(u64);
                }

                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                /*
                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
                 * processed as the last one or have additional check added
                 * in case new sample type is added, because we could eat
                 * up the rest of the sample size.
                 */
                u16 stack_size = event->attr.sample_stack_user;
                u16 size = sizeof(u64);

                /* adjusted against the header size accumulated so far */
                stack_size = perf_sample_ustack_size(stack_size, header->size,
                                                     data->regs_user.regs);

                /*
                 * If there is something to dump, add space for the dump
                 * itself and for the field that tells the dynamic size,
                 * which is how many have been actually dumped.
                 */
                if (stack_size)
                        size += sizeof(u64) + stack_size;

                data->stack_user_size = stack_size;
                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                perf_sample_regs_intr(&data->regs_intr, regs);

                /* one u64 per register bit set in the requested mask */
                if (data->regs_intr.regs) {
                        u64 mask = event->attr.sample_regs_intr;

                        size += hweight64(mask) * sizeof(u64);
                }

                header->size += size;
        }

        /* No header->size adjustment here; only the value is computed. */
        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                data->phys_addr = perf_virt_to_phys(data->addr);

#ifdef CONFIG_CGROUP_PERF
        if (sample_type & PERF_SAMPLE_CGROUP) {
                struct cgroup *cgrp;

                /* protected by RCU */
                cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
                data->cgroup = cgroup_id(cgrp);
        }
#endif

        if (sample_type & PERF_SAMPLE_AUX) {
                u64 size;

                header->size += sizeof(u64); /* size */

                /*
                 * Given the 16bit nature of header::size, an AUX sample can
                 * easily overflow it, what with all the preceding sample bits.
                 * Make sure this doesn't happen by using up to U16_MAX bytes
                 * per sample in total (rounded down to 8 byte boundary).
                 */
                size = min_t(size_t, U16_MAX - header->size,
                             event->attr.aux_sample_size);
                size = rounddown(size, 8);
                /* the helper returns the AUX size actually accounted */
                size = perf_prepare_sample_aux(event, data, size);

                WARN_ON_ONCE(size + header->size > U16_MAX);
                header->size += size;
        }
        /*
         * If you're adding more sample types here, you likely need to do
         * something about the overflowing header::size, like repurpose the
         * lowest 3 bits of size, which should be always zero at the moment.
         * This raises a more important question, do we really need 512k sized
         * samples and why, so good argumentation is in order for whatever you
         * do here next.
         */
        WARN_ON_ONCE(header->size & 7);
}

static __always_inline int
__perf_event_output(struct perf_event *event,
                    struct perf_sample_data *data,
                    struct pt_regs *regs,
                    int (*output_begin)(struct perf_output_handle *,
                                        struct perf_sample_data *,
                                        struct perf_event *,
                                        unsigned int))
{
        struct perf_output_handle handle;
        struct perf_event_header header;
        int err;

        /* protect the callchain buffers */
        rcu_read_lock();

        perf_prepare_sample(&header, data, event, regs);

        err = output_begin(&handle, data, event, header.size);
        if (err)
                goto exit;

        perf_output_sample(&handle, &header, data, event);

        perf_output_end(&handle);

exit:
        rcu_read_unlock();
        return err;
}

/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin_forward().  The status returned by
 * __perf_event_output() is discarded.
 */
void
perf_event_output_forward(struct perf_event *event,
                         struct perf_sample_data *data,
                         struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_forward);
}

/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin_backward().  The status returned by
 * __perf_event_output() is discarded.
 */
void
perf_event_output_backward(struct perf_event *event,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_backward);
}

/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin().
 *
 * Returns the status of __perf_event_output(): 0 on success, otherwise the
 * non-zero value reported by perf_output_begin().
 */
int
perf_event_output(struct perf_event *event,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
{
        return __perf_event_output(event, data, regs, perf_output_begin);
}

/*
 * read event_id
 */

/* Fixed leading part of a PERF_RECORD_READ record. */
struct perf_read_event {
        struct perf_event_header        header;

        /* filled from perf_event_pid() / perf_event_tid() */
        u32                                pid;
        u32                                tid;
};

static void
perf_event_read_event(struct perf_event *event,
                        struct task_struct *task)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_read_event read_event = {
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
        };
        int ret;

        perf_event_header__init_id(&read_event.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
        if (ret)
                return;

        perf_output_put(&handle, read_event);
        perf_output_read(&handle, event);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/* Callback type invoked for each event visited by the perf_iterate_*() walkers. */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);

/*
 * Call @output(event, @data) for the events on @ctx->event_list.
 *
 * With @all set every event is visited.  Otherwise events whose state is
 * below PERF_EVENT_STATE_INACTIVE, or which fail event_filter_match(), are
 * skipped.
 */
static void
perf_iterate_ctx(struct perf_event_context *ctx,
                   perf_iterate_f output,
                   void *data, bool all)
{
        struct perf_event *ev;

        list_for_each_entry_rcu(ev, &ctx->event_list, event_entry) {
                bool skip = !all &&
                            (ev->state < PERF_EVENT_STATE_INACTIVE ||
                             !event_filter_match(ev));

                if (skip)
                        continue;

                output(ev, data);
        }
}

/*
 * Call @output(event, @data) for the eligible events on this CPU's
 * pmu_sb_events list.
 */
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
        struct pmu_event_list *sb_events = this_cpu_ptr(&pmu_sb_events);
        struct perf_event *ev;

        list_for_each_entry_rcu(ev, &sb_events->list, sb_list) {
                /*
                 * Skip events that are not fully formed yet; ensure that
                 * if we observe event->ctx, both event and ctx will be
                 * complete enough. See perf_install_in_context().
                 */
                if (!smp_load_acquire(&ev->ctx))
                        continue;

                if (ev->state >= PERF_EVENT_STATE_INACTIVE &&
                    event_filter_match(ev))
                        output(ev, data);
        }
}

/*
 * Iterate all events that need to receive side-band events and call
 * @output(event, @data) on each of them.
 *
 * For new callers; ensure that account_pmu_sb_event() includes
 * your event, otherwise it might not get delivered.
 */
static void
perf_iterate_sb(perf_iterate_f output, void *data,
               struct perf_event_context *task_ctx)
{
        int ctxn;

        rcu_read_lock();
        preempt_disable();

        if (task_ctx) {
                /*
                 * If we have task_ctx != NULL we only notify the task
                 * context itself.  The task_ctx is set only for EXIT events
                 * before releasing task context.
                 */
                perf_iterate_ctx(task_ctx, output, data, false);
        } else {
                /* This CPU's side-band events, then current's contexts. */
                perf_iterate_sb_cpu(output, data);

                for_each_task_context_nr(ctxn) {
                        struct perf_event_context *ctx;

                        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                        if (ctx)
                                perf_iterate_ctx(ctx, output, data, false);
                }
        }

        preempt_enable();
        rcu_read_unlock();
}

/*
 * Clear all file-based filters at exec, they'll have to be
 * re-instated when/if these objects are mmapped again.
 *
 * Used as a perf_iterate_f callback; @data is unused.
 */
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct perf_addr_filter *filter;
        unsigned int cleared = 0, idx = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);

        /* idx tracks the filter's position so it selects the matching range. */
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        event->addr_filter_ranges[idx].start = 0;
                        event->addr_filter_ranges[idx].size = 0;
                        cleared++;
                }

                idx++;
        }

        /* Bump the generation only if at least one range was reset. */
        if (cleared)
                event->addr_filters_gen++;

        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (cleared)
                perf_event_stop(event, 1);
}

/*
 * exec hook: for every task context of current, run
 * perf_event_enable_on_exec() and then clear the file-based address
 * filters of all events on that context.
 */
void perf_event_exec(void)
{
        int ctxn;

        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                struct perf_event_context *ctx = current->perf_event_ctxp[ctxn];

                if (ctx) {
                        perf_event_enable_on_exec(ctxn);

                        /* all == true: visit every event, no state/filter check */
                        perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
                                         NULL, true);
                }
        }
        rcu_read_unlock();
}

/* Argument/result carrier for __perf_event_output_stop(). */
struct remote_output {
        /* ring buffer whose users are to be stopped */
        struct perf_buffer        *rb;
        /* result of the last __perf_event_stop() call made for it */
        int                        err;
};

static void __perf_event_output_stop(struct perf_event *event, void *data)
{
        struct perf_event *parent = event->parent;
        struct remote_output *ro = data;
        struct perf_buffer *rb = ro->rb;
        struct stop_event_data sd = {
                .event        = event,
        };

        if (!has_aux(event))
                return;

        if (!parent)
                parent = event;

        /*
         * In case of inheritance, it will be the parent that links to the
         * ring-buffer, but it will be the child that's actually using it.
         *
         * We are using event::rb to determine if the event should be stopped,
         * however this may race with ring_buffer_attach() (through set_output),
         * which will make us skip the event that actually needs to be stopped.
         * So ring_buffer_attach() has to stop an aux event before re-assigning
         * its rb pointer.
         */
        if (rcu_dereference(parent->rb) == rb)
                ro->err = __perf_event_stop(&sd);
}

/*
 * Cross-call target (see perf_pmu_output_stop()): on this CPU, walk the
 * pmu's CPU context and, if present, its current task context, stopping
 * events that use the ring buffer of the event passed in @info.
 *
 * Returns the err recorded by the last stop attempt, 0 if none was made.
 */
static int __perf_pmu_output_stop(void *info)
{
        struct perf_event *event = info;
        struct pmu *pmu = event->ctx->pmu;
        struct perf_cpu_context *cpuctx;
        struct remote_output ro = {
                .rb        = event->rb,
        };

        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

        rcu_read_lock();

        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);

        if (cpuctx->task_ctx) {
                perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
                                 &ro, false);
        }

        rcu_read_unlock();

        return ro.err;
}

static void perf_pmu_output_stop(struct perf_event *event)
{
        struct perf_event *iter;
        int err, cpu;

restart:
        rcu_read_lock();
        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
                /*
                 * For per-CPU events, we need to make sure that neither they
                 * nor their children are running; for cpu==-1 events it's
                 * sufficient to stop the event itself if it's active, since
                 * it can't have children.
                 */
                cpu = iter->cpu;
                if (cpu == -1)
                        cpu = READ_ONCE(iter->oncpu);

                if (cpu == -1)
                        continue;

                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
                if (err == -EAGAIN) {
                        rcu_read_unlock();
                        goto restart;
                }
        }
        rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/* State shared by perf_event_task() and its per-event output callback. */
struct perf_task_event {
        /* task the FORK/EXIT record is about */
        struct task_struct                *task;
        /* when non-NULL, handed to perf_iterate_sb() as the only context to notify */
        struct perf_event_context        *task_ctx;

        /* record body copied into the buffer with perf_output_put() */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                ppid;
                u32                                tid;
                u32                                ptid;
                u64                                time;
        } event_id;
};

static int perf_event_task_match(struct perf_event *event)
{
        return event->attr.comm  || event->attr.mmap ||
               event->attr.mmap2 || event->attr.mmap_data ||
               event->attr.task;
}

/*
 * perf_iterate_f callback: write the FORK/EXIT record described by the
 * struct perf_task_event in @data into @event's buffer, if @event wants
 * task records.
 *
 * perf_event_header__init_id() works on the shared header, so its size is
 * saved up front and put back before returning, leaving the record ready
 * for the next event.
 */
static void perf_event_task_output(struct perf_event *event,
                                   void *data)
{
        struct perf_task_event *te = data;
        struct task_struct *task = te->task;
        const int saved_size = te->event_id.header.size;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_task_match(event))
                return;

        perf_event_header__init_id(&te->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               te->event_id.header.size)) {
                te->event_id.pid = perf_event_pid(event, task);
                te->event_id.tid = perf_event_tid(event, task);

                if (te->event_id.header.type == PERF_RECORD_EXIT) {
                        /*
                         * NOTE(review): ptid is filled via perf_event_pid()
                         * here, whereas the FORK branch below uses
                         * perf_event_tid() — confirm this is intended.
                         */
                        te->event_id.ppid = perf_event_pid(event,
                                                           task->real_parent);
                        te->event_id.ptid = perf_event_pid(event,
                                                           task->real_parent);
                } else {  /* PERF_RECORD_FORK */
                        te->event_id.ppid = perf_event_pid(event, current);
                        te->event_id.ptid = perf_event_tid(event, current);
                }

                te->event_id.time = perf_event_clock(event);

                perf_output_put(&handle, te->event_id);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        te->event_id.header.size = saved_size;
}

static void perf_event_task(struct task_struct *task,
                              struct perf_event_context *task_ctx,
                              int new)
{
        struct perf_task_event task_event;

        if (!atomic_read(&nr_comm_events) &&
            !atomic_read(&nr_mmap_events) &&
            !atomic_read(&nr_task_events))
                return;

        task_event = (struct perf_task_event){
                .task          = task,
                .task_ctx = task_ctx,
                .event_id    = {
                        .header = {
                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
                                .misc = 0,
                                .size = sizeof(task_event.event_id),
                        },
                        /* .pid  */
                        /* .ppid */
                        /* .tid  */
                        /* .ptid */
                        /* .time */
                },
        };

        perf_iterate_sb(perf_event_task_output,
                       &task_event,
                       task_ctx);
}

/*
 * Fork hook for @task: emit the task record with new == 1 (which
 * perf_event_task() turns into PERF_RECORD_FORK) and no task context,
 * then emit the namespaces record for @task.
 */
void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
}

/*
 * comm tracking
 */

/* State shared by perf_event_comm_event() and its per-event output callback. */
struct perf_comm_event {
        /* task whose comm is being reported */
        struct task_struct        *task;
        /* name buffer copied out after event_id, and its padded length */
        char                        *comm;
        int                        comm_size;

        /* fixed record part copied into the buffer with perf_output_put() */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
        } event_id;
};

/* Non-zero when @event has attr.comm set, i.e. wants comm records. */
static int perf_event_comm_match(struct perf_event *event)
{
        return event->attr.comm;
}

/*
 * perf_iterate_f callback: write the comm record described by the struct
 * perf_comm_event in @data into @event's buffer, if @event wants comm
 * records.
 *
 * The shared header size is saved first and restored before returning,
 * since perf_event_header__init_id() operates on the shared header.
 */
static void perf_event_comm_output(struct perf_event *event,
                                   void *data)
{
        struct perf_comm_event *ce = data;
        const int saved_size = ce->event_id.header.size;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_comm_match(event))
                return;

        perf_event_header__init_id(&ce->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ce->event_id.header.size)) {
                ce->event_id.pid = perf_event_pid(event, ce->task);
                ce->event_id.tid = perf_event_tid(event, ce->task);

                /* fixed part first, then the padded name */
                perf_output_put(&handle, ce->event_id);
                __output_copy(&handle, ce->comm, ce->comm_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ce->event_id.header.size = saved_size;
}

/*
 * Snapshot the task's comm into a zero-filled local buffer, size the
 * record accordingly and deliver it to all interested events.
 */
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
        char name[TASK_COMM_LEN] = { 0 };
        unsigned int padded_len;

        strlcpy(name, comm_event->task->comm, sizeof(name));

        /* string plus terminator, rounded up to a u64 multiple */
        padded_len = ALIGN(strlen(name) + 1, sizeof(u64));

        comm_event->comm = name;
        comm_event->comm_size = padded_len;
        comm_event->event_id.header.size =
                sizeof(comm_event->event_id) + padded_len;

        perf_iterate_sb(perf_event_comm_output, comm_event, NULL);
}

/*
 * Report a comm change of @task.  @exec selects the
 * PERF_RECORD_MISC_COMM_EXEC misc flag.  Does nothing when no comm events
 * are accounted.
 */
void perf_event_comm(struct task_struct *task, bool exec)
{
        struct perf_comm_event ce;

        if (!atomic_read(&nr_comm_events))
                return;

        /*
         * comm, comm_size and header.size are set by
         * perf_event_comm_event(); pid and tid by the output callback.
         */
        ce = (struct perf_comm_event){
                .task = task,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_COMM,
                                .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
                        },
                },
        };

        perf_event_comm_event(&ce);
}

/*
 * namespaces tracking
 */

/* State shared by perf_event_namespaces() and its per-event output callback. */
struct perf_namespaces_event {
        /* task whose namespaces are being reported */
        struct task_struct                *task;

        /* record body copied into the buffer with perf_output_put() */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                /* number of link_info[] slots, set to NR_NAMESPACES */
                u64                                nr_namespaces;
                /* one dev/ino pair per namespace type */
                struct perf_ns_link_info        link_info[NR_NAMESPACES];
        } event_id;
};

/* Non-zero when @event has attr.namespaces set, i.e. wants namespaces records. */
static int perf_event_namespaces_match(struct perf_event *event)
{
        return event->attr.namespaces;
}

/*
 * perf_iterate_f callback: write the namespaces record described by the
 * struct perf_namespaces_event in @data into @event's buffer, if @event
 * wants namespaces records.
 *
 * The shared header size is saved first and restored before returning,
 * since perf_event_header__init_id() operates on the shared header.
 */
static void perf_event_namespaces_output(struct perf_event *event,
                                         void *data)
{
        struct perf_namespaces_event *ne = data;
        const u16 saved_size = ne->event_id.header.size;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_namespaces_match(event))
                return;

        perf_event_header__init_id(&ne->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ne->event_id.header.size)) {
                ne->event_id.pid = perf_event_pid(event, ne->task);
                ne->event_id.tid = perf_event_tid(event, ne->task);

                perf_output_put(&handle, ne->event_id);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ne->event_id.header.size = saved_size;
}

static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
                                   struct task_struct *task,
                                   const struct proc_ns_operations *ns_ops)
{
        struct path ns_path;
        struct inode *ns_inode;
        int error;

        error = ns_get_path(&ns_path, task, ns_ops);
        if (!error) {
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
                path_put(&ns_path);
        }
}

void perf_event_namespaces(struct task_struct *task)
{
        struct perf_namespaces_event namespaces_event;
        struct perf_ns_link_info *ns_link_info;

        if (!atomic_read(&nr_namespaces_events))
                return;

        namespaces_event = (struct perf_namespaces_event){
                .task        = task,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_NAMESPACES,
                                .misc = 0,
                                .size = sizeof(namespaces_event.event_id),
                        },
                        /* .pid */
                        /* .tid */
                        .nr_namespaces = NR_NAMESPACES,
                        /* .link_info[NR_NAMESPACES] */
                },
        };

        ns_link_info = namespaces_event.event_id.link_info;

        perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
                               task, &mntns_operations);

#ifdef CONFIG_USER_NS
        perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
                               task, &userns_operations);
#endif
#ifdef CONFIG_NET_NS
        perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
                               task, &netns_operations);
#endif
#ifdef CONFIG_UTS_NS
        perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
                               task, &utsns_operations);
#endif
#ifdef CONFIG_IPC_NS
        perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
                               task, &ipcns_operations);
#endif
#ifdef CONFIG_PID_NS
        perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
                               task, &pidns_operations);
#endif
#ifdef CONFIG_CGROUPS
        perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
                               task, &cgroupns_operations);
#endif

        perf_iterate_sb(perf_event_namespaces_output,
                        &namespaces_event,
                        NULL);
}

/*
 * cgroup tracking
 */
#ifdef CONFIG_CGROUP_PERF

/* State shared by perf_event_cgroup() and its per-event output callback. */
struct perf_cgroup_event {
        /* path string copied out after event_id, and its padded length */
        char                                *path;
        int                                path_size;
        /* fixed record part copied into the buffer with perf_output_put() */
        struct {
                struct perf_event_header        header;
                u64                                id;
                /* flexible array member; the path bytes are copied from ->path */
                char                                path[];
        } event_id;
};

/* Non-zero when @event has attr.cgroup set, i.e. wants cgroup records. */
static int perf_event_cgroup_match(struct perf_event *event)
{
        return event->attr.cgroup;
}

/*
 * perf_iterate_f callback: write the cgroup record described by the struct
 * perf_cgroup_event in @data into @event's buffer, if @event wants cgroup
 * records.
 *
 * The shared header size is saved first and restored before returning,
 * since perf_event_header__init_id() operates on the shared header.
 */
static void perf_event_cgroup_output(struct perf_event *event, void *data)
{
        struct perf_cgroup_event *ce = data;
        const u16 saved_size = ce->event_id.header.size;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_cgroup_match(event))
                return;

        perf_event_header__init_id(&ce->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ce->event_id.header.size)) {
                /* fixed part first, then the padded path */
                perf_output_put(&handle, ce->event_id);
                __output_copy(&handle, ce->path, ce->path_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ce->event_id.header.size = saved_size;
}

/*
 * Emit a PERF_RECORD_CGROUP record carrying the id and path of @cgrp to all
 * interested events.  If the path buffer cannot be allocated, the literal
 * "//enomem" is reported as the path instead.  Does nothing when no cgroup
 * events are accounted.
 */
static void perf_event_cgroup(struct cgroup *cgrp)
{
        struct perf_cgroup_event cgroup_event;
        char path_enomem[16] = "//enomem";
        char *pathname;
        size_t len;

        if (!atomic_read(&nr_cgroup_events))
                return;

        cgroup_event = (struct perf_cgroup_event){
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_CGROUP,
                                .misc = 0,
                                .size = sizeof(cgroup_event.event_id),
                        },
                        .id = cgroup_id(cgrp),
                },
        };

        pathname = kmalloc(PATH_MAX, GFP_KERNEL);
        if (pathname != NULL) {
                /* just to be sure to have enough space for alignment */
                cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
                cgroup_event.path = pathname;
        } else {
                cgroup_event.path = path_enomem;
        }

        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        for (len = strlen(cgroup_event.path) + 1;
             !IS_ALIGNED(len, sizeof(u64)); len++)
                cgroup_event.path[len] = '\0';

        cgroup_event.event_id.header.size += len;
        cgroup_event.path_size = len;

        perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL);

        /* pathname is NULL on the fallback path */
        kfree(pathname);
}

#endif

/*
 * mmap tracking
 */

/*
 * Working state for emitting one mmap record.
 *
 * perf_event_mmap() fills in @vma and the fixed part of @event_id,
 * perf_event_mmap_event() fills in the name and the mmap2-only fields, and
 * perf_event_mmap_output() writes the result once per matching event.
 */
struct perf_mmap_event {
        /* the mapping being reported */
        struct vm_area_struct        *vma;

        /* name string and its size after padding to a multiple of sizeof(u64) */
        const char                *file_name;
        int                        file_size;
        /* the fields below are only written out for attr.mmap2 events */
        int                        maj, min;
        u64                        ino;
        u64                        ino_generation;
        u32                        prot, flags;

        /* fixed-size leading part of the record, copied out as one unit */
        struct {
                struct perf_event_header        header;

                /* pid/tid are filled in per event at output time */
                u32                                pid;
                u32                                tid;
                u64                                start;
                u64                                len;
                u64                                pgoff;
        } event_id;
};

/*
 * Decide whether @event wants the mmap record described by @data
 * (a struct perf_mmap_event).
 *
 * Executable mappings (VM_EXEC) are wanted by events with attr.mmap or
 * attr.mmap2 set; all other mappings only by events with attr.mmap_data.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int perf_event_mmap_match(struct perf_event *event,
                                 void *data)
{
        struct perf_mmap_event *mmap_event = data;

        if (mmap_event->vma->vm_flags & VM_EXEC)
                return event->attr.mmap || event->attr.mmap2;

        return event->attr.mmap_data != 0;
}

/*
 * perf_iterate_sb() callback: write one mmap record for @event.
 *
 * @data is the struct perf_mmap_event shared by all events of this
 * iteration.  Its header type and size are adjusted in place for this
 * event and put back to their entry values before returning, so the next
 * event starts from the same template.  No restore is needed (and none is
 * done) when the event does not match, because nothing was modified yet.
 */
static void perf_event_mmap_output(struct perf_event *event,
                                   void *data)
{
        struct perf_mmap_event *me = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int saved_size = me->event_id.header.size;
        u32 saved_type = me->event_id.header.type;

        if (!perf_event_mmap_match(event, data))
                return;

        if (event->attr.mmap2) {
                /* MMAP2 records carry six extra fields after event_id */
                me->event_id.header.type = PERF_RECORD_MMAP2;
                me->event_id.header.size += sizeof(me->maj) +
                                            sizeof(me->min) +
                                            sizeof(me->ino) +
                                            sizeof(me->ino_generation) +
                                            sizeof(me->prot) +
                                            sizeof(me->flags);
        }

        perf_event_header__init_id(&me->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               me->event_id.header.size)) {
                me->event_id.pid = perf_event_pid(event, current);
                me->event_id.tid = perf_event_tid(event, current);

                perf_output_put(&handle, me->event_id);

                if (event->attr.mmap2) {
                        perf_output_put(&handle, me->maj);
                        perf_output_put(&handle, me->min);
                        perf_output_put(&handle, me->ino);
                        perf_output_put(&handle, me->ino_generation);
                        perf_output_put(&handle, me->prot);
                        perf_output_put(&handle, me->flags);
                }

                /* file_size already includes the NUL padding */
                __output_copy(&handle, me->file_name, me->file_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        /* undo the per-event header changes, whether or not we wrote */
        me->event_id.header.size = saved_size;
        me->event_id.header.type = saved_type;
}

/*
 * Complete @mmap_event from its vma and emit it to all interested events
 * through perf_iterate_sb().
 *
 * Fills in:
 *  - prot/flags, derived from the vma's vm_flags;
 *  - the mapping name: the backing file's path, or for mappings without a
 *    file the vm_ops/arch supplied name, "[heap]", "[stack]" or "//anon";
 *    "//enomem" and "//toolong" are reported when the path cannot be
 *    allocated or does not fit;
 *  - device major/minor, inode number and generation of the backing file
 *    (left at 0 when there is no file or its path could not be obtained);
 *  - the final header size and the PERF_RECORD_MISC_MMAP_DATA bit for
 *    non-executable mappings.
 */
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        char *name;

        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        if (vma->vm_flags & VM_MAYSHARE)
                flags = MAP_SHARED;
        else
                flags = MAP_PRIVATE;

        /*
         * MAP_EXECUTABLE is intentionally not reported.  VM_MAYEXEC only
         * says the mapping may be made executable; it does not record that
         * MAP_EXECUTABLE was requested, so deriving the flag from it marks
         * practically every file mapping with a bogus flag.
         */
        if (vma->vm_flags & VM_DENYWRITE)
                flags |= MAP_DENYWRITE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
        if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;

        if (file) {
                struct inode *inode;
                dev_t dev;

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf) {
                        name = "//enomem";
                        goto cpy_name;
                }
                /*
                 * d_path() works from the end of the buffer backwards, so we
                 * need to leave enough room after the string for the zero
                 * bytes added by the 64bit alignment we do later.
                 */
                name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
                }
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);

                /* name points into buf, which is writable: no copy needed */
                goto got_name;
        } else {
                if (vma->vm_ops && vma->vm_ops->name) {
                        name = (char *) vma->vm_ops->name(vma);
                        if (name)
                                goto cpy_name;
                }

                name = (char *)arch_vma_name(vma);
                if (name)
                        goto cpy_name;

                if (vma->vm_start <= vma->vm_mm->start_brk &&
                                vma->vm_end >= vma->vm_mm->brk) {
                        name = "[heap]";
                        goto cpy_name;
                }
                if (vma->vm_start <= vma->vm_mm->start_stack &&
                                vma->vm_end >= vma->vm_mm->start_stack) {
                        name = "[stack]";
                        goto cpy_name;
                }

                name = "//anon";
                goto cpy_name;
        }

cpy_name:
        /*
         * The name may be a string literal or otherwise not ours to modify,
         * so copy it into tmp (truncating to fit) before padding it below.
         */
        strlcpy(tmp, name, sizeof(tmp));
        name = tmp;
got_name:
        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(name)+1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                name[size++] = '\0';

        mmap_event->file_name = name;
        mmap_event->file_size = size;
        mmap_event->maj = maj;
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
        mmap_event->prot = prot;
        mmap_event->flags = flags;

        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);

        /* buf is NULL for mappings without a file; kfree() is still called */
        kfree(buf);
}

/*
 * Check whether @file and the byte range [@offset, @offset + @size] satisfy
 * @filter: the filter must have a dentry whose inode is @file's inode, and
 * the filter's range [offset, offset + size] must touch the given range.
 *
 * Returns true on a match, false otherwise.
 */
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
                                     struct file *file, unsigned long offset,
                                     unsigned long size)
{
        /* d_inode(NULL) won't be equal to any mapped user-space file */
        if (!filter->path.dentry)
                return false;

        return d_inode(filter->path.dentry) == file_inode(file) &&
               filter->offset <= offset + size &&
               filter->offset + filter->size >= offset;
}

/*
 * Translate @filter, which is expressed as a file offset range, into the
 * address range it covers inside @vma and store that in @fr.
 *
 * Returns false (leaving @fr untouched) when the filter does not match the
 * vma's file and file range, true once @fr has been filled in.
 */
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
                                        struct vm_area_struct *vma,
                                        struct perf_addr_filter_range *fr)
{
        unsigned long map_len = vma->vm_end - vma->vm_start;
        unsigned long map_off = vma->vm_pgoff << PAGE_SHIFT;

        if (!perf_addr_filter_match(filter, vma->vm_file, map_off, map_len))
                return false;

        if (filter->offset >= map_off) {
                /* filter begins inside the mapping */
                fr->start = vma->vm_start + filter->offset - map_off;
                fr->size = min(vma->vm_end - fr->start, filter->size);
        } else {
                /* filter begins before the mapping: clip its head */
                fr->start = vma->vm_start;
                fr->size = min(map_len,
                               filter->size - (map_off - filter->offset));
        }

        return true;
}

/*
 * perf_iterate_ctx() callback: recompute @event's address filter ranges
 * against the vma passed in @data.
 *
 * Does nothing for events without address filters or for vmas without a
 * backing file.  If at least one filter matched the vma, the event's filter
 * generation is bumped and perf_event_stop(event, 1) is called.
 */
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
        struct perf_addr_filter *filter;
        /* restart: number of filters that matched; count: index of the current filter */
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        if (!vma->vm_file)
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                /* the n-th filter on the list owns the n-th range slot */
                if (perf_addr_filter_vma_adjust(filter, vma,
                                                &event->addr_filter_ranges[count]))
                        restart++;

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        /* called only after the filter lock has been dropped */
        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Adjust the address filters of all of the current task's events to the
 * new @vma, by running __perf_addr_filters_adjust() over each of current's
 * perf contexts under rcu_read_lock().
 */
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
        struct perf_event_context *ctx;
        int ctxn;

        /*
         * Data tracing isn't supported yet and as such there is no need
         * to keep track of anything that isn't related to executable code:
         */
        if ((vma->vm_flags & VM_EXEC) == 0)
                return;

        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                if (ctx)
                        perf_iterate_ctx(ctx, __perf_addr_filters_adjust,
                                         vma, true);
        }
        rcu_read_unlock();
}

/*
 * Report a mapping to perf: adjust the current task's address filters to
 * @vma and emit an mmap record for it.
 *
 * Returns immediately when nr_mmap_events is zero.  Only the vma and the
 * start/len/pgoff part of the record are set up here; the commented-out
 * designators below name the fields that are filled in later, by
 * perf_event_mmap_event() and perf_event_mmap_output().
 */
void perf_event_mmap(struct vm_area_struct *vma)
{
        struct perf_mmap_event mmap_event;

        if (!atomic_read(&nr_mmap_events))
                return;

        mmap_event = (struct perf_mmap_event){
                .vma        = vma,
                /* .file_name */
                /* .file_size */
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
                                .misc = PERF_RECORD_MISC_USER,
                                /* .size */
                        },
                        /* .pid */
                        /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
                        /* widen before shifting so the byte offset is computed in 64 bits */
                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
                /* .maj (attr_mmap2 only) */
                /* .min (attr_mmap2 only) */
                /* .ino (attr_mmap2 only) */
                /* .ino_generation (attr_mmap2 only) */
                /* .prot (attr_mmap2 only) */
                /* .flags (attr_mmap2 only) */
        };

        perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
}

/*
 * Write a PERF_RECORD_AUX record to @event's output buffer.
 *
 * @head, @size and @flags are stored verbatim as the record's offset, size
 * and flags fields.  The record is silently dropped if
 * perf_output_begin() fails.
 */
void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                offset;
                u64                                size;
                u64                                flags;
        } aux_rec = {
                .header = {
                        .type = PERF_RECORD_AUX,
                        .misc = 0,
                        .size = sizeof(aux_rec),
                },
                .offset                = head,
                .size                = size,
                .flags                = flags,
        };

        perf_event_header__init_id(&aux_rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, aux_rec.header.size))
                return;

        perf_output_put(&handle, aux_rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Lost/dropped samples logging
 *
 * Write a PERF_RECORD_LOST_SAMPLES record carrying the count @lost to
 * @event's output buffer.  The record is silently dropped if
 * perf_output_begin() fails.
 */
void perf_log_lost_samples(struct perf_event *event, u64 lost)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                lost;
        } lost_rec = {
                .header = {
                        .type = PERF_RECORD_LOST_SAMPLES,
                        .misc = 0,
                        .size = sizeof(lost_rec),
                },
                .lost                = lost,
        };

        perf_event_header__init_id(&lost_rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, lost_rec.header.size))
                return;

        perf_output_put(&handle, lost_rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

/*
 * Working state for emitting one context-switch record; set up by
 * perf_event_switch() and consumed by perf_event_switch_output().
 */
struct perf_switch_event {
        /* the two tasks handed to perf_event_switch(); pid/tid below are derived from next_prev */
        struct task_struct        *task;
        struct task_struct        *next_prev;

        /*
         * Record body.  Only the header is written for PERF_RECORD_SWITCH;
         * the whole struct is written for PERF_RECORD_SWITCH_CPU_WIDE.
         */
        struct {
                struct perf_event_header        header;
                u32                                next_prev_pid;
                u32                                next_prev_tid;
        } event_id;
};

/* Returns non-zero when @event asked for context-switch records (attr.context_switch). */
static int perf_event_switch_match(struct perf_event *event)
{
        return event->attr.context_switch;
}

/*
 * perf_iterate_sb() callback: write one context-switch record for @event.
 *
 * @data is the shared struct perf_switch_event.  Events whose context has
 * a task get the header-only PERF_RECORD_SWITCH; events whose context has
 * no task get PERF_RECORD_SWITCH_CPU_WIDE, which additionally carries the
 * pid/tid of the next/previous task.  The header type and size are set
 * afresh for every event, so nothing needs restoring afterwards.
 */
static void perf_event_switch_output(struct perf_event *event, void *data)
{
        struct perf_switch_event *se = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_switch_match(event))
                return;

        /* Only CPU-wide events are allowed to see next/prev pid/tid */
        if (!event->ctx->task) {
                se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
                se->event_id.header.size = sizeof(se->event_id);
                se->event_id.next_prev_pid =
                                        perf_event_pid(event, se->next_prev);
                se->event_id.next_prev_tid =
                                        perf_event_tid(event, se->next_prev);
        } else {
                se->event_id.header.type = PERF_RECORD_SWITCH;
                se->event_id.header.size = sizeof(se->event_id.header);
        }

        perf_event_header__init_id(&se->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              se->event_id.header.size))
                return;

        /* must mirror the size chosen above */
        if (!event->ctx->task)
                perf_output_put(&handle, se->event_id);
        else
                perf_output_put(&handle, se->event_id.header);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Emit context-switch records for @task via perf_iterate_sb().
 *
 * @next_prev: the other task involved in the switch; its pid/tid are
 *             reported to CPU-wide events.
 * @sched_in:  true when @task is being switched in; otherwise the record is
 *             flagged PERF_RECORD_MISC_SWITCH_OUT.
 *
 * The record type and size are left unset here and chosen per event in
 * perf_event_switch_output().
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in)
{
        struct perf_switch_event switch_event;

        /* N.B. caller checks nr_switch_events != 0 */

        switch_event = (struct perf_switch_event){
                .task                = task,
                .next_prev        = next_prev,
                .event_id        = {
                        .header = {
                                /* .type */
                                .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
                                /* .size */
                        },
                        /* .next_prev_pid */
                        /* .next_prev_tid */
                },
        };

        /* a task switched out while still TASK_RUNNING is reported as preempted */
        if (!sched_in && task->state == TASK_RUNNING)
                switch_event.event_id.header.misc |=
                                PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;

        perf_iterate_sb(perf_event_switch_output,
                       &switch_event,
                       NULL);
}

/*
 * IRQ throttle logging
 */

/*
 * Write a throttle state change record to @event's output buffer:
 * PERF_RECORD_UNTHROTTLE when @enable is non-zero, PERF_RECORD_THROTTLE
 * otherwise.  The record carries the event clock, the primary event id and
 * the event's own id.  It is silently dropped if perf_output_begin() fails.
 */
static void perf_log_throttle(struct perf_event *event, int enable)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                time;
                u64                                id;
                u64                                stream_id;
        } throttle_rec = {
                .header = {
                        .type = enable ? PERF_RECORD_UNTHROTTLE
                                       : PERF_RECORD_THROTTLE,
                        .misc = 0,
                        .size = sizeof(throttle_rec),
                },
                .time                = perf_event_clock(event),
                .id                = primary_event_id(event),
                .stream_id        = event->id,
        };

        perf_event_header__init_id(&throttle_rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              throttle_rec.header.size))
                return;

        perf_output_put(&handle, throttle_rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * ksymbol register/unregister tracking
 */

/*
 * Working state for emitting one PERF_RECORD_KSYMBOL record; set up by
 * perf_event_ksymbol() and consumed by perf_event_ksymbol_output().
 */
struct perf_ksymbol_event {
        /* symbol name and its length after NUL padding to a multiple of sizeof(u64) */
        const char        *name;
        int                name_len;
        /* fixed-size part of the record; the name is written right after it */
        struct {
                struct perf_event_header        header;
                u64                                addr;
                u32                                len;
                u16                                ksym_type;
                u16                                flags;
        } event_id;
};

/* Returns non-zero when @event asked for ksymbol records (attr.ksymbol). */
static int perf_event_ksymbol_match(struct perf_event *event)
{
        return event->attr.ksymbol;
}

/*
 * perf_iterate_sb() callback: write one ksymbol record for @event.
 *
 * @data is the shared struct perf_ksymbol_event.  The fixed part of the
 * record is followed by the padded symbol name.  Events that did not ask
 * for ksymbol records are skipped, and the record is dropped if
 * perf_output_begin() fails.
 */
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
{
        struct perf_ksymbol_event *ke = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_ksymbol_match(event))
                return;

        perf_event_header__init_id(&ke->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ke->event_id.header.size))
                return;

        perf_output_put(&handle, ke->event_id);
        __output_copy(&handle, ke->name, ke->name_len);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Emit a PERF_RECORD_KSYMBOL record announcing that the kernel symbol @sym
 * covering [@addr, @addr + @len) was registered, or unregistered when
 * @unregister is true.
 *
 * Nothing is done when nr_ksymbol_events is zero.  A @ksym_type that is
 * unknown or out of range triggers a one-time warning and no record.
 * @sym is copied into a local buffer (truncated to fit KSYM_NAME_LEN) and
 * NUL-padded to a multiple of sizeof(u64) before being emitted.
 */
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
                        const char *sym)
{
        struct perf_ksymbol_event ksymbol_event;
        char name[KSYM_NAME_LEN];
        u16 flags = unregister ? PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER : 0;
        int name_len;

        if (atomic_read(&nr_ksymbol_events) == 0)
                return;

        if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
            ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) {
                WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
                return;
        }

        /* the padding below stays inside name[] only if its size is a multiple of 8 */
        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

        strlcpy(name, sym, KSYM_NAME_LEN);
        for (name_len = strlen(name) + 1;
             !IS_ALIGNED(name_len, sizeof(u64));
             name_len++)
                name[name_len] = '\0';

        ksymbol_event = (struct perf_ksymbol_event){
                .name = name,
                .name_len = name_len,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_KSYMBOL,
                                .size = sizeof(ksymbol_event.event_id) +
                                        name_len,
                        },
                        .addr = addr,
                        .len = len,
                        .ksym_type = ksym_type,
                        .flags = flags,
                },
        };

        perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
}

/*
 * bpf program load/unload tracking
 */

/*
 * Working state for emitting one PERF_RECORD_BPF_EVENT record; set up by
 * perf_event_bpf_event() and consumed by perf_event_bpf_output().
 */
struct perf_bpf_event {
        /* program the event is about */
        struct bpf_prog        *prog;
        /* complete record body, written out as one unit */
        struct {
                struct perf_event_header        header;
                u16                                type;
                u16                                flags;
                u32                                id;
                u8                                tag[BPF_TAG_SIZE];
        } event_id;
};

/* Returns non-zero when @event asked for bpf event records (attr.bpf_event). */
static int perf_event_bpf_match(struct perf_event *event)
{
        return event->attr.bpf_event;
}

/*
 * perf_iterate_sb() callback: write one bpf event record for @event.
 *
 * @data is the shared struct perf_bpf_event.  Events that did not ask for
 * bpf records are skipped, and the record is dropped if
 * perf_output_begin() fails.
 */
static void perf_event_bpf_output(struct perf_event *event, void *data)
{
        struct perf_bpf_event *be = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_bpf_match(event))
                return;

        perf_event_header__init_id(&be->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              be->event_id.header.size))
                return;

        perf_output_put(&handle, be->event_id);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Emit ksymbol records for the code of @prog: one record for the program
 * itself when it has no entries in aux->func[], otherwise one record per
 * entry.  The records are unregister records when @type is
 * PERF_BPF_EVENT_PROG_UNLOAD and register records for any other @type.
 */
static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
                                         enum perf_bpf_event_type type)
{
        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
        int i;

        if (prog->aux->func_cnt == 0) {
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                                   (u64)(unsigned long)prog->bpf_func,
                                   prog->jited_len, unregister,
                                   prog->aux->ksym.name);
                return;
        }

        for (i = 0; i < prog->aux->func_cnt; i++) {
                struct bpf_prog *subprog = prog->aux->func[i];

                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                                   (u64)(unsigned long)subprog->bpf_func,
                                   subprog->jited_len, unregister,
                                   subprog->aux->ksym.name);
        }
}

/*
 * Report a bpf program event of @type for @prog.
 *
 * A @type outside the range (PERF_BPF_EVENT_UNKNOWN, PERF_BPF_EVENT_MAX) is
 * ignored.  For program load/unload, ksymbol records are emitted first if
 * anyone listens for them.  Then, if anyone listens for bpf events, a
 * PERF_RECORD_BPF_EVENT record carrying @type, @flags, the program id and
 * its tag is emitted through perf_iterate_sb().
 */
void perf_event_bpf_event(struct bpf_prog *prog,
                          enum perf_bpf_event_type type,
                          u16 flags)
{
        struct perf_bpf_event bpf_event;

        if (type <= PERF_BPF_EVENT_UNKNOWN ||
            type >= PERF_BPF_EVENT_MAX)
                return;

        if ((type == PERF_BPF_EVENT_PROG_LOAD ||
             type == PERF_BPF_EVENT_PROG_UNLOAD) &&
            atomic_read(&nr_ksymbol_events))
                perf_event_bpf_emit_ksymbols(prog, type);

        if (atomic_read(&nr_bpf_events) == 0)
                return;

        /* the tag is written out unpadded, so its size must be a multiple of 8 */
        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));

        bpf_event = (struct perf_bpf_event){
                .prog = prog,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_BPF_EVENT,
                                .size = sizeof(bpf_event.event_id),
                        },
                        .type = type,
                        .flags = flags,
                        .id = prog->aux->id,
                },
        };
        memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);

        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}

/*
 * Working state for emitting one PERF_RECORD_TEXT_POKE record; set up by
 * perf_event_text_poke() and consumed by perf_event_text_poke_output().
 */
struct perf_text_poke_event {
        /* the bytes before and after the modification */
        const void                *old_bytes;
        const void                *new_bytes;
        /* number of zero bytes appended so the payload ends on a u64 boundary */
        size_t                        pad;
        u16                        old_len;
        u16                        new_len;

        /* fixed-size leading part of the record */
        struct {
                struct perf_event_header        header;

                u64                                addr;
        } event_id;
};

/* Returns non-zero when @event asked for text poke records (attr.text_poke). */
static int perf_event_text_poke_match(struct perf_event *event)
{
        return event->attr.text_poke;
}

/*
 * perf_iterate_sb() callback: write one text poke record for @event.
 *
 * @data is the shared struct perf_text_poke_event.  Layout written: the
 * fixed event_id part, old_len, new_len, the old bytes, the new bytes and
 * finally ->pad zero bytes.  Events that did not ask for text poke records
 * are skipped, and the record is dropped if perf_output_begin() fails.
 */
static void perf_event_text_poke_output(struct perf_event *event, void *data)
{
        struct perf_text_poke_event *tp = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u64 zeroes = 0;

        if (!perf_event_text_poke_match(event))
                return;

        perf_event_header__init_id(&tp->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              tp->event_id.header.size))
                return;

        perf_output_put(&handle, tp->event_id);
        perf_output_put(&handle, tp->old_len);
        perf_output_put(&handle, tp->new_len);

        __output_copy(&handle, tp->old_bytes, tp->old_len);
        __output_copy(&handle, tp->new_bytes, tp->new_len);

        /* pad is always smaller than sizeof(u64), so one zero word suffices */
        if (tp->pad)
                __output_copy(&handle, &zeroes, tp->pad);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Emit a PERF_RECORD_TEXT_POKE record saying that the @old_len bytes
 * @old_bytes at kernel address @addr were replaced by the @new_len bytes
 * @new_bytes.
 *
 * Nothing is done when nr_text_poke_events is zero.  The variable part of
 * the record (both length fields plus both byte strings) is padded up to a
 * multiple of sizeof(u64).
 */
void perf_event_text_poke(const void *addr, const void *old_bytes,
                          size_t old_len, const void *new_bytes, size_t new_len)
{
        struct perf_text_poke_event text_poke_event;
        size_t payload, pad;

        if (atomic_read(&nr_text_poke_events) == 0)
                return;

        payload = sizeof(text_poke_event.old_len) + old_len +
                  sizeof(text_poke_event.new_len) + new_len;
        pad = ALIGN(payload, sizeof(u64)) - payload;

        text_poke_event = (struct perf_text_poke_event){
                .old_bytes    = old_bytes,
                .new_bytes    = new_bytes,
                .pad          = pad,
                .old_len      = old_len,
                .new_len      = new_len,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_TEXT_POKE,
                                .misc = PERF_RECORD_MISC_KERNEL,
                                .size = sizeof(text_poke_event.event_id) +
                                        payload + pad,
                        },
                        .addr = (unsigned long)addr,
                },
        };

        perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
}

/*
 * Set PERF_ATTACH_ITRACE in @event's attach_state.  perf_log_itrace_start()
 * checks this bit and emits nothing once it is set.
 */
void perf_event_itrace_started(struct perf_event *event)
{
        event->attach_state |= PERF_ATTACH_ITRACE;
}

/*
 * Write a PERF_RECORD_ITRACE_START record, carrying the pid/tid of the
 * current task, to the output buffer of @event (or of its parent, when it
 * has one).
 *
 * Nothing is written when the event's PMU lacks PERF_PMU_CAP_ITRACE, when
 * PERF_ATTACH_ITRACE is already set on the event, or when
 * perf_output_begin() fails.
 */
static void perf_log_itrace_start(struct perf_event *event)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u32                                pid;
                u32                                tid;
        } start_rec;

        if (event->parent)
                event = event->parent;

        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE))
                return;
        if (event->attach_state & PERF_ATTACH_ITRACE)
                return;

        start_rec.header.type = PERF_RECORD_ITRACE_START;
        start_rec.header.misc = 0;
        start_rec.header.size = sizeof(start_rec);
        start_rec.pid = perf_event_pid(event, current);
        start_rec.tid = perf_event_tid(event, current);

        perf_event_header__init_id(&start_rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              start_rec.header.size))
                return;

        perf_output_put(&handle, start_rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Account one interrupt for @event and, if @throttle is non-zero, throttle
 * the event once it reaches max_samples_per_tick interrupts.
 *
 * For frequency-based events (attr.freq) the sample period is also
 * re-adjusted from the time elapsed since the previous interrupt.
 *
 * Returns 1 if the event was throttled by this call, 0 otherwise.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
        u64 seq;

        /*
         * The interrupt count is only meaningful within one value of the
         * per-CPU perf_throttled_seq; when the sequence has moved on since
         * we last looked, start counting again from this interrupt.
         */
        seq = __this_cpu_read(perf_throttled_seq);
        if (seq != hwc->interrupts_seq) {
                hwc->interrupts_seq = seq;
                hwc->interrupts = 1;
        } else {
                hwc->interrupts++;
        }

        if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
                __this_cpu_inc(perf_throttled_count);
                tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                /* MAX_INTERRUPTS marks the event as throttled */
                hwc->interrupts = MAX_INTERRUPTS;
                perf_log_throttle(event, 0);
                ret = 1;
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 delta = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* only re-adjust for positive intervals shorter than two ticks */
                if (delta > 0 && delta < 2*TICK_NSEC)
                        perf_adjust_period(event, delta, hwc->last_period, true);
        }

        return ret;
}

/*
 * Account one interrupt for @event with throttling enabled.
 * Returns 1 if the event was throttled, 0 otherwise.
 */
int perf_event_account_interrupt(struct perf_event *event)
{
        return __perf_event_account_interrupt(event, 1);
}

/*
 * Generic event overflow handling, sampling.
 */

/*
 * Handle one overflow of @event: account the interrupt (throttling when
 * @throttle is non-zero), enforce the event limit, invoke the overflow
 * handler with @data and @regs, and queue the pending irq_work when there
 * is an fasync listener.
 *
 * Returns 0 immediately for non-sampling events.  Otherwise returns 1 when
 * the event was throttled or its event limit just hit zero, 0 otherwise.
 */
static int __perf_event_overflow(struct perf_event *event,
                                   int throttle, struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        /* sampled before the decrement below; zero means no limit is armed */
        int events = atomic_read(&event->event_limit);
        int ret = 0;

        /*
         * Non-sampling counters might still use the PMI to fold short
         * hardware counters, ignore those.
         */
        if (unlikely(!is_sampling_event(event)))
                return 0;

        ret = __perf_event_account_interrupt(event, throttle);

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        if (events && atomic_dec_and_test(&event->event_limit)) {
                /* limit exhausted: signal POLL_HUP and disable the event */
                ret = 1;
                event->pending_kill = POLL_HUP;

                perf_event_disable_inatomic(event);
        }

        /* the handler is still invoked for the overflow that exhausted the limit */
        READ_ONCE(event->overflow_handler)(event, data, regs);

        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending);
        }

        return ret;
}

/*
 * Handle one overflow of @event with throttling enabled.
 * Returns the result of __perf_event_overflow(): 1 when the event was
 * throttled or reached its event limit, 0 otherwise.
 */
int perf_event_overflow(struct perf_event *event,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        return __perf_event_overflow(event, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */

/* Per-CPU state for software events; one instance per CPU is defined below. */
struct swevent_htable {
        /* NOTE(review): presumably the hash list of software events and its refcount, with the mutex guarding both - confirm against the users of this struct */
        struct swevent_hlist                *swevent_hlist;
        struct mutex                        hlist_mutex;
        int                                hlist_refcount;

        /* Recursion avoidance, one slot per context */
        int                                recursion[PERF_NR_CONTEXTS];
};

static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * event->count is incremented directly, while event->hw.period_left is
 * a second value used to count sampling intervals. It is kept in the
 * range [-sample_period, 0] so that its sign can serve as the trigger.
 *
 * Re-arms period_left using the previously recorded period and returns
 * the number of whole periods that elapsed, or 0 when period_left is
 * still negative.
 */

u64 perf_swevent_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        const u64 period = hwc->last_period;

        hwc->last_period = hwc->sample_period;

        for (;;) {
                s64 left = local64_read(&hwc->period_left);
                u64 elapsed;
                s64 rearmed;

                if (left < 0)
                        return 0;

                elapsed = div64_u64(period + left, period);
                rearmed = left - elapsed * period;

                /* retry when period_left changed underneath us */
                if (local64_cmpxchg(&hwc->period_left, left, rearmed) == left)
                        return elapsed;
        }
}

/*
 * Deliver @overflow overflows of a software event. An @overflow of 0
 * means "compute it": the count is then taken from
 * perf_swevent_set_period(). Nothing is delivered while
 * hwc->interrupts == MAX_INTERRUPTS.
 */
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;
        int throttle;

        if (overflow == 0)
                overflow = perf_swevent_set_period(event);

        if (hwc->interrupts == MAX_INTERRUPTS)
                return;

        /* only the first delivery is made with throttle == 0 */
        throttle = 0;
        while (overflow) {
                /*
                 * We inhibit the overflow from happening when
                 * hwc->interrupts == MAX_INTERRUPTS.
                 */
                if (__perf_event_overflow(event, throttle, data, regs))
                        break;

                throttle = 1;
                overflow--;
        }
}

/*
 * Add @nr to the event count and, for sampling events with @regs
 * available, work out whether an overflow has to be delivered.
 */
static void perf_swevent_event(struct perf_event *event, u64 nr,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;
        bool fixed_period;

        local64_add(nr, &event->count);

        if (!regs || !is_sampling_event(event))
                return;

        fixed_period = !event->attr.freq;

        /* PERF_SAMPLE_PERIOD without freq: report @nr as the period */
        if (fixed_period && (event->attr.sample_type & PERF_SAMPLE_PERIOD)) {
                data->period = nr;
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        data->period = event->hw.last_period;

        /* a period of one with a single occurrence always overflows once */
        if (fixed_period && nr == 1 && hwc->sample_period == 1) {
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        /* still negative after adding @nr: the period has not elapsed */
        if (!local64_add_negative(nr, &hwc->period_left))
                perf_swevent_overflow(event, 0, data, regs);
}

/*
 * Returns 1 when @event must not see the current occurrence: either it
 * is stopped, or @regs is given and its mode (user/kernel) is excluded
 * by the event attributes. Returns 0 otherwise.
 */
static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 1;

        /* without registers there is no mode to filter on */
        if (!regs)
                return 0;

        if (user_mode(regs))
                return event->attr.exclude_user ? 1 : 0;

        return event->attr.exclude_kernel ? 1 : 0;
}

/*
 * Returns 1 when @event has the given @type and @event_id and is not
 * excluded for @regs, 0 otherwise. @data is not consulted.
 */
static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
                                u32 event_id,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        return event->attr.type == type &&
               event->attr.config == event_id &&
               !perf_exclude_event(event, regs);
}

/*
 * Hash a (type, event_id) pair: type goes in the upper 32 bits of the
 * key, event_id in the lower ones; the result is SWEVENT_HLIST_BITS wide.
 */
static inline u64 swevent_hash(u64 type, u32 event_id)
{
        u64 key = (type << 32) | event_id;

        return hash_64(key, SWEVENT_HLIST_BITS);
}

/* Pick the bucket of @hlist that (type, event_id) hashes to. */
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
        return hlist->heads + swevent_hash(type, event_id);
}

/*
 * For the read side (events when they trigger): look up the bucket
 * through an RCU dereference. Returns NULL when no hlist is installed.
 */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
        struct swevent_hlist *hlist = rcu_dereference(swhash->swevent_hlist);

        return hlist ? __find_swevent_head(hlist, type, event_id) : NULL;
}

/*
 * For the event head insertion and removal in the hlist: look up the
 * bucket for @event's type/config. Returns NULL when no hlist is
 * installed.
 */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
        struct swevent_hlist *hlist;

        /*
         * Event scheduling is always serialized against hlist allocation
         * and release. Which makes the protected version suitable here.
         * The context lock guarantees that.
         */
        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));
        if (hlist == NULL)
                return NULL;

        return __find_swevent_head(hlist, event->attr.type,
                                   event->attr.config);
}

/*
 * Deliver @nr occurrences of software event (type, event_id) to every
 * matching event hashed on this CPU. The walk runs inside an RCU
 * read-side critical section.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                             u64 nr,
                             struct perf_sample_data *data,
                             struct pt_regs *regs)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;
        struct perf_event *event;

        rcu_read_lock();

        bucket = find_swevent_head_rcu(swhash, type, event_id);
        if (bucket) {
                hlist_for_each_entry_rcu(event, bucket, hlist_entry) {
                        if (!perf_swevent_match(event, type, event_id,
                                                data, regs))
                                continue;

                        perf_swevent_event(event, nr, data, regs);
                }
        }

        rcu_read_unlock();
}

/*
 * Per-CPU array of four pt_regs.
 * NOTE(review): presumably one scratch slot per recursion context for the
 * perf_sw_event() helpers -- the users are not visible here, confirm.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Claim a recursion context from this CPU's swevent table. Returns the
 * value of get_recursion_context(); callers in this file treat a
 * negative value as failure.
 */
int perf_swevent_get_recursion_context(void)
{
        int *recursion = this_cpu_ptr(&swevent_htable)->recursion;

        return get_recursion_context(recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

/*
 * Release recursion context @rctx, as previously returned by
 * perf_swevent_get_recursion_context(), on this CPU's swevent table.
 */
void perf_swevent_put_recursion_context(int rctx)
{
        int *recursion = this_cpu_ptr(&swevent_htable)->recursion;

        put_recursion_context(recursion, rctx);
}

/*
 * Emit @nr occurrences of PERF_TYPE_SOFTWARE event @event_id with sample
 * address @addr. Warns once and does nothing when @regs is NULL.
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        struct perf_sample_data data;

        if (!WARN_ON_ONCE(!regs)) {
                perf_sample_data_init(&data, addr, 0);
                do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr,
                                 &data, regs);
        }
}

/*
 * Wrapper around ___perf_sw_event() that disables preemption and takes a
 * recursion context first. The event is dropped when no recursion
 * context can be obtained.
 */
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        int rctx;

        preempt_disable_notrace();

        rctx = perf_swevent_get_recursion_context();
        if (likely(rctx >= 0)) {
                ___perf_sw_event(event_id, nr, regs, addr);
                perf_swevent_put_recursion_context(rctx);
        }

        preempt_enable_notrace();
}

/*
 * pmu::read callback of the software-style PMUs in this file:
 * intentionally empty, there is nothing to do on read.
 */
static void perf_swevent_read(struct perf_event *event)
{
}

/*
 * pmu::add callback for software events: hash @event into this CPU's
 * swevent table.
 *
 * Returns 0 on success, or -EINVAL (after a one-time warning) when no
 * hash list is installed for this CPU.
 */
static int perf_swevent_add(struct perf_event *event, int flags)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hw_perf_event *hwc = &event->hw;
        struct hlist_head *head;

        /* sampling events get their period re-armed before going live */
        if (is_sampling_event(event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }

        /*
         * state is 0 when PERF_EF_START is set, 1 otherwise.
         * NOTE(review): relies on PERF_HES_STOPPED being 1 -- confirm.
         */
        hwc->state = !(flags & PERF_EF_START);

        head = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;

        hlist_add_head_rcu(&event->hlist_entry, head);
        perf_event_update_userpage(event);

        return 0;
}

/*
 * pmu::del callback for software events: unlink @event from its hash
 * bucket with the RCU-aware list removal. @flags is unused.
 */
static void perf_swevent_del(struct perf_event *event, int flags)
{
        hlist_del_rcu(&event->hlist_entry);
}

static void perf_swevent_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;
}

static void perf_swevent_stop(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
}

/*
 * Deref the hlist from the update side; the lockdep condition asserts
 * that swhash->hlist_mutex is held.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
        struct swevent_hlist *hlist;

        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&swhash->hlist_mutex));
        return hlist;
}

/*
 * Unpublish the hash list of @swhash, if any, and free it after an RCU
 * grace period.
 */
static void swevent_hlist_release(struct swevent_htable *swhash)
{
        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);

        if (hlist) {
                RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
                kfree_rcu(hlist, rcu_head);
        }
}

/*
 * Drop one reference on @cpu's hash list, releasing the list when the
 * count reaches zero. Serialized by the table's hlist_mutex.
 */
static void swevent_hlist_put_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);

        swhash->hlist_refcount--;
        if (swhash->hlist_refcount == 0)
                swevent_hlist_release(swhash);

        mutex_unlock(&swhash->hlist_mutex);
}

static void swevent_hlist_put(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                swevent_hlist_put_cpu(cpu);
}

/*
 * Take one reference on @cpu's hash list. The list is allocated first
 * when none is installed and @cpu is set in perf_online_mask.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails, in which
 * case no reference is taken.
 */
static int swevent_hlist_get_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *hlist;
        int err = 0;

        mutex_lock(&swhash->hlist_mutex);

        /* nothing to allocate: already present, or the CPU is not online */
        if (swevent_hlist_deref(swhash) ||
            !cpumask_test_cpu(cpu, perf_online_mask))
                goto take_ref;

        hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
        if (!hlist) {
                err = -ENOMEM;
                goto unlock;
        }
        rcu_assign_pointer(swhash->swevent_hlist, hlist);

take_ref:
        swhash->hlist_refcount++;
unlock:
        mutex_unlock(&swhash->hlist_mutex);

        return err;
}

/*
 * Take one hash list reference on every possible CPU, under pmus_lock.
 * If any CPU fails, the references taken on the CPUs visited before it
 * are dropped again and the error is returned. Returns 0 on success.
 */
static int swevent_hlist_get(void)
{
        int err = 0, cpu, failed_cpu = -1;

        mutex_lock(&pmus_lock);

        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        break;
                }
        }

        if (failed_cpu >= 0) {
                /* roll back every CPU visited before the failing one */
                for_each_possible_cpu(cpu) {
                        if (cpu == failed_cpu)
                                break;
                        swevent_hlist_put_cpu(cpu);
                }
        }

        mutex_unlock(&pmus_lock);
        return err;
}

/*
 * One static key per software event id; incremented in perf_swevent_init()
 * and decremented in sw_perf_event_destroy().
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * event->destroy callback installed by perf_swevent_init(): undo the
 * static key increment and the hash list references taken there.
 * Warns when called on an event that has a parent.
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
        struct static_key *key = &perf_swevent_enabled[event->attr.config];

        WARN_ON(event->parent);

        static_key_slow_dec(key);
        swevent_hlist_put();
}

/*
 * pmu::event_init callback for the generic software PMU.
 *
 * Returns -ENOENT for events of another type, for the CPU/task clock ids
 * and for ids >= PERF_COUNT_SW_MAX; -EOPNOTSUPP when branch sampling is
 * requested; otherwise 0 or the error from swevent_hlist_get(). Only
 * events without a parent take hash list references and install a
 * destroy callback.
 */
static int perf_swevent_init(struct perf_event *event)
{
        const u64 event_id = event->attr.config;
        int err;

        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /* the clock ids are rejected here, as are out-of-range ids */
        if (event_id == PERF_COUNT_SW_CPU_CLOCK ||
            event_id == PERF_COUNT_SW_TASK_CLOCK ||
            event_id >= PERF_COUNT_SW_MAX)
                return -ENOENT;

        if (event->parent)
                return 0;

        err = swevent_hlist_get();
        if (err)
                return err;

        static_key_slow_inc(&perf_swevent_enabled[event_id]);
        event->destroy = sw_perf_event_destroy;

        return 0;
}

/* PMU description for the generic software events. */
static struct pmu perf_swevent = {
        .capabilities        = PERF_PMU_CAP_NO_NMI,
        .task_ctx_nr        = perf_sw_context,

        /* set-up */
        .event_init        = perf_swevent_init,

        /* scheduling in and out */
        .add                = perf_swevent_add,
        .del                = perf_swevent_del,

        /* run control and read-out */
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#ifdef CONFIG_EVENT_TRACING

/*
 * Returns 1 when the raw record carried by @data passes the filter of
 * @event (or no filter is set), 0 otherwise.
 */
static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        void *record = data->raw->frag.data;
        /* only top level events have filters set */
        struct perf_event *top = event->parent ? event->parent : event;

        if (likely(!top->filter))
                return 1;

        return filter_match_preds(top->filter, record) ? 1 : 0;
}

static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;
        /*
         * If exclude_kernel, only trace user-space tracepoints (uprobes)
         */
        if (event->attr.exclude_kernel && !user_mode(regs))
                return 0;

        if (!perf_tp_filter_match(event, data))
                return 0;

        return 1;
}

/*
 * Run the BPF programs attached to @call, if any, then hand the record
 * to perf_tp_event(). When programs are attached and either they return
 * 0 or @head is empty, the record is dropped and the recursion context
 * @rctx is released here instead.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        bool deliver = true;

        if (bpf_prog_array_valid(call)) {
                /* the first pointer-sized slot of the record carries regs */
                struct pt_regs **slot = raw_data;

                *slot = regs;
                deliver = trace_call_bpf(call, raw_data) && !hlist_empty(head);
        }

        if (!deliver) {
                perf_swevent_put_recursion_context(rctx);
                return;
        }

        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

/*
 * Deliver a tracepoint record to perf.
 *
 * @record (of @entry_size bytes) is attached to the sample as raw data
 * and offered to every matching event on @head. When @task is given and
 * is not current, the events in that task's software context are scanned
 * as well. The recursion context @rctx is always released before
 * returning.
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
{
        struct perf_sample_data data;
        struct perf_event *event;

        struct perf_raw_record raw = {
                .frag = {
                        .size = entry_size,
                        .data = record,
                },
        };

        perf_sample_data_init(&data, 0, 0);
        data.raw = &raw;

        perf_trace_buf_update(record, event_type);

        hlist_for_each_entry_rcu(event, head, hlist_entry) {
                if (perf_tp_event_match(event, &data, regs))
                        perf_swevent_event(event, count, &data, regs);
        }

        /*
         * If we got specified a target task, also iterate its context and
         * deliver this event there too.
         */
        if (task && task != current) {
                struct perf_event_context *ctx;
                struct trace_entry *entry = record;

                rcu_read_lock();
                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
                if (!ctx)
                        goto unlock;

                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                        /* only events bound to the CPU we are running on */
                        if (event->cpu != smp_processor_id())
                                continue;
                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                                continue;
                        /* must be an event for this very tracepoint */
                        if (event->attr.config != entry->type)
                                continue;
                        if (perf_tp_event_match(event, &data, regs))
                                perf_swevent_event(event, count, &data, regs);
                }
unlock:
                rcu_read_unlock();
        }

        perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

/*
 * event->destroy callback installed by perf_tp_event_init(); forwards to
 * perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        perf_trace_destroy(event);
}

/*
 * pmu::event_init callback for the tracepoint PMU.
 *
 * Returns -ENOENT for events of another type, -EOPNOTSUPP when branch
 * sampling is requested, otherwise the result of perf_trace_init(). The
 * destroy callback is installed only on success.
 */
static int perf_tp_event_init(struct perf_event *event)
{
        int err;

        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;

        /*
         * no branch sampling for tracepoint events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        err = perf_trace_init(event);
        if (!err)
                event->destroy = tp_perf_event_destroy;

        return err;
}

/*
 * PMU description for tracepoint events; start/stop/read are shared with
 * the generic software PMU.
 */
static struct pmu perf_tracepoint = {
        .task_ctx_nr        = perf_sw_context,

        /* set-up */
        .event_init        = perf_tp_event_init,

        /* scheduling in and out */
        .add                = perf_trace_add,
        .del                = perf_trace_del,

        /* run control and read-out */
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags should match the following PMU_FORMAT_ATTR() definitions.
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace
 * Statically Defined Tracepoints (USDT). Currently, we use 32 bits for
 * the offset, stored in the upper half of config.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS        # of bits in config used as the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT        # of bits the offset is shifted left
 *                                        within config
 */
enum perf_probe_config {
        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* "retprobe" format attribute: bit 0 of config */
PMU_FORMAT_ATTR(retprobe, "config:0");
#endif

#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes of the kprobe PMU: only "retprobe" */
static struct attribute *kprobe_attrs[] = {
        &format_attr_retprobe.attr,
        NULL,
};

/* Attribute group named "format" holding kprobe_attrs */
static struct attribute_group kprobe_format_group = {
        .name = "format",
        .attrs = kprobe_attrs,
};

/* NULL-terminated group list referenced by perf_kprobe.attr_groups */
static const struct attribute_group *kprobe_attr_groups[] = {
        &kprobe_format_group,
        NULL,
};

static int perf_kprobe_event_init(struct perf_event *event);

/*
 * Dynamic "kprobe" PMU; add/del come from the trace event code and
 * start/stop/read are shared with the generic software PMU.
 */
static struct pmu perf_kprobe = {
        .task_ctx_nr        = perf_sw_context,
        .attr_groups        = kprobe_attr_groups,

        /* set-up */
        .event_init        = perf_kprobe_event_init,

        /* scheduling in and out */
        .add                = perf_trace_add,
        .del                = perf_trace_del,

        /* run control and read-out */
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

/*
 * pmu::event_init callback for the dynamic kprobe PMU.
 *
 * Returns -ENOENT for events of another PMU type, -EACCES when
 * perfmon_capable() fails, -EOPNOTSUPP when branch sampling is
 * requested, otherwise the result of perf_kprobe_init(). The destroy
 * callback is installed only on success.
 */
static int perf_kprobe_event_init(struct perf_event *event)
{
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_kprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /*
         * no branch sampling for probe events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = (event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE) != 0;

        err = perf_kprobe_init(event, is_retprobe);
        if (!err)
                event->destroy = perf_kprobe_destroy;

        return err;
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/* "ref_ctr_offset" format attribute: bits 32-63 of config */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes of the uprobe PMU: "retprobe" and "ref_ctr_offset" */
static struct attribute *uprobe_attrs[] = {
        &format_attr_retprobe.attr,
        &format_attr_ref_ctr_offset.attr,
        NULL,
};

/* Attribute group named "format" holding uprobe_attrs */
static struct attribute_group uprobe_format_group = {
        .name = "format",
        .attrs = uprobe_attrs,
};

/* NULL-terminated group list referenced by perf_uprobe.attr_groups */
static const struct attribute_group *uprobe_attr_groups[] = {
        &uprobe_format_group,
        NULL,
};

static int perf_uprobe_event_init(struct perf_event *event);

/*
 * Dynamic "uprobe" PMU; add/del come from the trace event code and
 * start/stop/read are shared with the generic software PMU.
 */
static struct pmu perf_uprobe = {
        .task_ctx_nr        = perf_sw_context,
        .attr_groups        = uprobe_attr_groups,

        /* set-up */
        .event_init        = perf_uprobe_event_init,

        /* scheduling in and out */
        .add                = perf_trace_add,
        .del                = perf_trace_del,

        /* run control and read-out */
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

/*
 * pmu::event_init callback for the dynamic uprobe PMU.
 *
 * Bit 0 of attr.config selects a uretprobe; the bits above
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT carry the reference counter offset.
 *
 * Returns -ENOENT for events of another PMU type, -EACCES when the
 * caller lacks the required capability, -EOPNOTSUPP when branch sampling
 * is requested, otherwise the result of perf_uprobe_init(). The destroy
 * callback is installed only on success.
 */
static int perf_uprobe_event_init(struct perf_event *event)
{
        int err;
        unsigned long ref_ctr_offset;
        bool is_retprobe;

        if (event->attr.type != perf_uprobe.type)
                return -ENOENT;

        /*
         * Use the same privilege check as perf_kprobe_event_init():
         * perfmon_capable() admits CAP_PERFMON as well as CAP_SYS_ADMIN,
         * so callers that passed the previous CAP_SYS_ADMIN-only check
         * still pass.
         */
        if (!perfmon_capable())
                return -EACCES;

        /*
         * no branch sampling for probe events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
        err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
        if (err)
                return err;

        event->destroy = perf_uprobe_destroy;

        return 0;
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Register the tracepoint PMU and, when configured in, the dynamic
 * kprobe and uprobe PMUs (registered with a type argument of -1).
 * The return values of perf_pmu_register() are not checked.
 */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
        perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}

/* Free @event's filter by forwarding to ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Overflow handler installed by perf_event_set_bpf_handler(): run the
 * attached BPF program and chain to the original overflow handler only
 * when the program returns non-zero. If the per-CPU bpf_prog_active
 * counter shows another program already active, the program is not run
 * and the sample is dropped (ret stays 0).
 */
static void bpf_overflow_handler(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs)
{
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .event = event,
        };
        int ret = 0;

        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        /* anything other than 1 means a BPF program is already running here */
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
        ret = BPF_PROG_RUN(event->prog, &ctx);
        rcu_read_unlock();
out:
        /* balances the increment above on both paths */
        __this_cpu_dec(bpf_prog_active);
        if (!ret)
                return;

        event->orig_overflow_handler(event, data, regs);
}

/*
 * Attach the BPF_PROG_TYPE_PERF_EVENT program behind @prog_fd to @event
 * by replacing its overflow handler with bpf_overflow_handler().
 *
 * Returns 0 on success, -EINVAL when the event has an overflow handler
 * context, -EEXIST when a program is already attached, the error from
 * bpf_prog_get_type(), or -EPROTO for the precise_ip/callchain case
 * described below.
 */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
        struct bpf_prog *prog;
        bool full_callchain;

        /* hw breakpoint or kernel counter */
        if (event->overflow_handler_context)
                return -EINVAL;

        if (event->prog)
                return -EEXIST;

        prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        full_callchain =
                (event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) &&
                !event->attr.exclude_callchain_kernel &&
                !event->attr.exclude_callchain_user;

        if (event->attr.precise_ip && prog->call_get_stack &&
            !full_callchain) {
                /*
                 * On perf_event with precise_ip, calling bpf_get_stack()
                 * may trigger unwinder warnings and occasional crashes.
                 * bpf_get_[stack|stackid] works around this issue by using
                 * callchain attached to perf_sample_data. If the
                 * perf_event does not have the full (kernel and user)
                 * callchain attached to perf_sample_data, do not allow
                 * attaching a BPF program that calls
                 * bpf_get_[stack|stackid].
                 */
                bpf_prog_put(prog);
                return -EPROTO;
        }

        event->prog = prog;
        event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
        WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);

        return 0;
}

/*
 * Undo perf_event_set_bpf_handler(): restore the original overflow
 * handler and drop the program reference. No-op when nothing is attached.
 */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
        struct bpf_prog *prog = event->prog;

        if (prog) {
                WRITE_ONCE(event->overflow_handler,
                           event->orig_overflow_handler);
                event->prog = NULL;
                bpf_prog_put(prog);
        }
}
#else
/* Stub for !CONFIG_BPF_SYSCALL: attaching a BPF handler is unsupported. */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
        return -EOPNOTSUPP;
}
/* Stub for !CONFIG_BPF_SYSCALL: there is never a handler to free. */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif

/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open(), i.e. if its pmu is perf_tracepoint or, when
 * configured in, perf_kprobe or perf_uprobe.
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
        if (event->pmu == &perf_tracepoint)
                return true;
#ifdef CONFIG_KPROBE_EVENTS
        if (event->pmu == &perf_kprobe)
                return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (event->pmu == &perf_uprobe)
                return true;
#endif
        return false;
}

/*
 * Attach the BPF program behind @prog_fd to @event.
 *
 * Non-tracing events are handed to perf_event_set_bpf_handler(). For
 * tracing events the program type has to fit the kind of trace event,
 * kprobe override is accepted on kprobes only, and a tracepoint program
 * must not access context beyond the event's offsets.
 *
 * Returns 0 on success, -EINVAL or -EACCES for the rejections above, or
 * the error from bpf_prog_get() / perf_event_attach_bpf_prog(). The
 * program reference is dropped on every failure after it was taken.
 */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
        bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int ret;

        if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog_fd);

        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        /* bpf programs can only be attached to u/kprobe or tracepoint */
        if (!(is_kprobe || is_tracepoint || is_syscall_tp))
                return -EINVAL;

        prog = bpf_prog_get(prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        /* valid fd, but invalid bpf program type */
        ret = -EINVAL;
        if (is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE)
                goto put_prog;
        if ((is_tracepoint || is_syscall_tp) &&
            prog->type != BPF_PROG_TYPE_TRACEPOINT)
                goto put_prog;

        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override &&
            !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
                goto put_prog;

        if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);

                if (prog->aux->max_ctx_offset > off) {
                        ret = -EACCES;
                        goto put_prog;
                }
        }

        ret = perf_event_attach_bpf_prog(event, prog);
        if (!ret)
                return 0;

put_prog:
        bpf_prog_put(prog);
        return ret;
}

/*
 * Detach whatever BPF program is attached to @event: tracing events go
 * through perf_event_detach_bpf_prog(), all others through
 * perf_event_free_bpf_handler().
 */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}

#else

/* Stub for !CONFIG_EVENT_TRACING: no tracing PMUs to register. */
static inline void perf_tp_register(void)
{
}

/* Stub for !CONFIG_EVENT_TRACING: nothing to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}

/* Stub for !CONFIG_EVENT_TRACING: always fails with -ENOENT. */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
        return -ENOENT;
}

/* Stub for !CONFIG_EVENT_TRACING: nothing to detach. */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Report one hit of breakpoint event @bp. @data is interpreted as the
 * struct pt_regs of the hit; the sample address is the breakpoint
 * address. Nothing is reported while bp->hw.state is non-zero or the
 * event is excluded for these registers.
 */
void perf_bp_event(struct perf_event *bp, void *data)
{
        struct pt_regs *regs = data;
        struct perf_sample_data sample;

        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);

        if (bp->hw.state || perf_exclude_event(bp, regs))
                return;

        perf_swevent_event(bp, 1, &sample, regs);
}
#endif

/*
 * Allocate a new, zeroed address filter on the NUMA node of @event's
 * CPU (CPU 0 when event->cpu is -1) and append it to @filters.
 * Returns the filter, or NULL when the allocation fails.
 */
static struct perf_addr_filter *
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
{
        struct perf_addr_filter *filter;
        int cpu = event->cpu;
        int node;

        if (cpu == -1)
                cpu = 0;
        node = cpu_to_node(cpu);

        filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
        if (filter) {
                INIT_LIST_HEAD(&filter->entry);
                list_add_tail(&filter->entry, filters);
        }

        return filter;
}

/*
 * Free every filter on @filters: drop its path reference, unlink it and
 * release the memory.
 */
static void free_filters_list(struct list_head *filters)
{
        struct perf_addr_filter *cur, *next;

        list_for_each_entry_safe(cur, next, filters, entry) {
                path_put(&cur->path);
                list_del(&cur->entry);
                kfree(cur);
        }
}

/*
 * Free existing address filters and optionally install new ones
 */
static void perf_addr_filters_splice(struct perf_event *event,
                                     struct list_head *head)
{
        unsigned long flags;
        LIST_HEAD(list);

        if (!has_addr_filter(event))
                return;

        /* don't bother with children, they don't have their own filters */
        if (event->parent)
                return;

        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);

        list_splice_init(&event->addr_filters.list, &list);
        if (head)
                list_splice(head, &event->addr_filters.list);

        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);

        free_filters_list(&list);
}

/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Only file-backed vmas are considered and the scan stops at the first
 * one that perf_addr_filter_vma_adjust() accepts.
 * Called with mm::mmap_lock down for reading.
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
                                   struct mm_struct *mm,
                                   struct perf_addr_filter_range *fr)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                if (vma->vm_file &&
                    perf_addr_filter_vma_adjust(filter, vma, fr))
                        break;

                vma = vma->vm_next;
        }
}

/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 *
 * Fills event->addr_filter_ranges[] with one entry per filter, in list
 * order, while holding the filter list lock, bumps addr_filters_gen and
 * finally calls perf_event_stop(event, 1). When file-based filters exist
 * the task's mm is pinned and its mmap_lock is held for reading around
 * the walk; if the task has no mm the ranges are left untouched and only
 * perf_event_stop() is called.
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct task_struct *task = READ_ONCE(event->ctx->task);
        struct perf_addr_filter *filter;
        struct mm_struct *mm = NULL;
        unsigned int count = 0;
        unsigned long flags;

        /*
         * We may observe TASK_TOMBSTONE, which means that the event tear-down
         * will stop on the parent's child_mutex that our caller is also holding
         */
        if (task == TASK_TOMBSTONE)
                return;

        /* the mm is only needed when some filter refers to a file */
        if (ifh->nr_file_filters) {
                mm = get_task_mm(task);
                if (!mm)
                        goto restart;

                mmap_read_lock(mm);
        }

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        /*
                         * Adjust base offset if the filter is associated to a
                         * binary that needs to be mapped:
                         */
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;

                        perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
                } else {
                        /* no file involved: use the filter's range as is */
                        event->addr_filter_ranges[count].start = filter->offset;
                        event->addr_filter_ranges[count].size  = filter->size;
                }

                count++;
        }

        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        /* release in the reverse order of acquisition */
        if (ifh->nr_file_filters) {
                mmap_read_unlock(mm);

                mmput(mm);
        }

restart:
        perf_event_stop(event, 1);
}

/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 */
enum {
        IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
        IF_SRC_FILE,
        IF_SRC_KERNEL,
        IF_SRC_FILEADDR,
        IF_SRC_KERNELADDR,
};

/* States of the parser in perf_event_parse_addr_filter(). */
enum {
        IF_STATE_ACTION = 0,    /* expecting an action keyword (IF_ACT_*) */
        IF_STATE_SOURCE,        /* action seen, expecting a range spec (IF_SRC_*) */
        IF_STATE_END,           /* range spec parsed, filter ready to be validated */
};

/*
 * Pattern table handed to match_token() by perf_event_parse_addr_filter();
 * the token id of the matching entry selects the parser's switch case.
 */
static const match_table_t if_tokens = {
        { IF_ACT_FILTER,        "filter" },
        { IF_ACT_START,                "start" },
        { IF_ACT_STOP,                "stop" },
        { IF_SRC_FILE,                "%u/%u@%s" },
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,        "%u@%s" },
        { IF_SRC_KERNELADDR,        "%u" },
        { IF_ACT_NONE,                NULL },
};

/*
 * Address filter string parser
 *
 * Works on a private copy of @fstr, split into words on spaces, commas and
 * newlines (empty words are skipped).  The words are walked through a small
 * state machine (IF_STATE_*): an action keyword must be followed by a range
 * specification, after which the filter is validated and the parser goes
 * back to expecting an action keyword.
 *
 * A filter is obtained from perf_addr_filter_new(@event, @filters) as soon as
 * the first word of its definition is seen.  For file-based filters the path
 * is resolved with kern_path() and event->addr_filters.nr_file_filters is
 * incremented; that counter is not rolled back here on failure.
 *
 * Returns 0 on success.  On failure free_filters_list(@filters) is called and
 * a negative errno is returned: -ENOMEM if a string copy fails, -EOPNOTSUPP
 * for a file-based filter on an event whose context has no task, the error
 * from kstrtoul()/kern_path(), or -EINVAL for everything else (including a
 * failed perf_addr_filter_new()).
 */
static int
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                             struct list_head *filters)
{
        struct perf_addr_filter *filter = NULL;
        char *start, *orig, *filename = NULL;
        substring_t args[MAX_OPT_ARGS];
        int state = IF_STATE_ACTION, token;
        unsigned int kernel = 0;
        int ret = -EINVAL;

        /* strsep() below modifies the string, so parse a copy; orig is kept for kfree() */
        orig = fstr = kstrdup(fstr, GFP_KERNEL);
        if (!fstr)
                return -ENOMEM;

        while ((start = strsep(&fstr, " ,\n")) != NULL) {
                /* maps the IF_ACT_* token id to the filter action */
                static const enum perf_addr_filter_action_t actions[] = {
                        [IF_ACT_FILTER]        = PERF_ADDR_FILTER_ACTION_FILTER,
                        [IF_ACT_START]        = PERF_ADDR_FILTER_ACTION_START,
                        [IF_ACT_STOP]        = PERF_ADDR_FILTER_ACTION_STOP,
                };
                /* default error for every "goto fail" that does not set its own */
                ret = -EINVAL;

                if (!*start)
                        continue;

                /* filter definition begins */
                if (state == IF_STATE_ACTION) {
                        filter = perf_addr_filter_new(event, filters);
                        if (!filter)
                                goto fail;
                }

                token = match_token(start, if_tokens, args);
                switch (token) {
                case IF_ACT_FILTER:
                case IF_ACT_START:
                case IF_ACT_STOP:
                        if (state != IF_STATE_ACTION)
                                goto fail;

                        filter->action = actions[token];
                        state = IF_STATE_SOURCE;
                        break;

                case IF_SRC_KERNELADDR:
                case IF_SRC_KERNEL:
                        /* range spec without "@path": kernel address filter */
                        kernel = 1;
                        fallthrough;

                case IF_SRC_FILEADDR:
                case IF_SRC_FILE:
                        if (state != IF_STATE_SOURCE)
                                goto fail;

                        /* terminate the first substring in place so kstrtoul() stops there */
                        *args[0].to = 0;
                        ret = kstrtoul(args[0].from, 0, &filter->offset);
                        if (ret)
                                goto fail;

                        /* only the "%u/%u..." forms carry a size */
                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
                                *args[1].to = 0;
                                ret = kstrtoul(args[1].from, 0, &filter->size);
                                if (ret)
                                        goto fail;
                        }

                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                /* the path is the last substring: index 2 with a size, 1 without */
                                int fpos = token == IF_SRC_FILE ? 2 : 1;

                                kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
                                }
                        }

                        state = IF_STATE_END;
                        break;

                default:
                        goto fail;
                }

                /*
                 * Filter definition is fully parsed, validate and install it.
                 * Make sure that it doesn't contradict itself or the event's
                 * attribute.
                 */
                if (state == IF_STATE_END) {
                        ret = -EINVAL;
                        if (kernel && event->attr.exclude_kernel)
                                goto fail;

                        /*
                         * ACTION "filter" must have a non-zero length region
                         * specified.
                         */
                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
                            !filter->size)
                                goto fail;

                        if (!kernel) {
                                if (!filename)
                                        goto fail;

                                /*
                                 * For now, we only support file-based filters
                                 * in per-task events; doing so for CPU-wide
                                 * events requires additional context switching
                                 * trickery, since same object code will be
                                 * mapped at different virtual addresses in
                                 * different processes.
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
                                        goto fail;

                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
                                        goto fail;

                                /* only regular files are accepted */
                                ret = -EINVAL;
                                if (!filter->path.dentry ||
                                    !S_ISREG(d_inode(filter->path.dentry)
                                             ->i_mode))
                                        goto fail;

                                event->addr_filters.nr_file_filters++;
                        }

                        /* ready to consume more filters */
                        kfree(filename);
                        filename = NULL;
                        state = IF_STATE_ACTION;
                        filter = NULL;
                        kernel = 0;
                }
        }

        /* input ended in the middle of a filter definition */
        if (state != IF_STATE_ACTION)
                goto fail;

        kfree(filename);
        kfree(orig);

        return 0;

fail:
        kfree(filename);
        free_filters_list(filters);
        kfree(orig);

        return ret;
}

/*
 * Parse @filter_str into a fresh list of address filters, have the PMU
 * validate it, then splice it into @event and apply it to the event and
 * its children.
 *
 * Must be called with event->ctx->mutex held and only on a parent-less
 * event (-EINVAL with a one-time warning otherwise).  On any parse or
 * validation error event->addr_filters.nr_file_filters is reset to 0 and
 * the error is returned; returns 0 on success.
 */
static int
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
{
        LIST_HEAD(filters);
        int err;

        /* Reached through the perf_ioctl() path, which already holds ctx::mutex. */
        lockdep_assert_held(&event->ctx->mutex);

        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;

        err = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (!err) {
                err = event->pmu->addr_filters_validate(&filters);
                if (!err) {
                        /* drop whatever filters were installed before */
                        perf_addr_filters_splice(event, &filters);

                        /* and push the new set to the event and its children */
                        perf_event_for_each_child(event, perf_event_addr_filters_apply);

                        return err;
                }

                /* the parser succeeded, so the list is still ours to free */
                free_filters_list(&filters);
        }

        event->addr_filters.nr_file_filters = 0;

        return err;
}

/*
 * Install a filter given as a user-space string.
 *
 * Copies the string (at most PAGE_SIZE bytes) from @arg and applies it
 * either as an ftrace profile filter (tracing events, only when
 * CONFIG_EVENT_TRACING is set) or as an address filter (events for which
 * has_addr_filter() is true).
 *
 * Returns the error from strndup_user(), the result of the selected
 * handler, or -EINVAL if the event supports neither kind of filter.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        int ret = -EINVAL;
        char *filter_str;

        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
        if (perf_event_is_tracing(event)) {
                struct perf_event_context *ctx = event->ctx;

                /*
                 * Beware, here be dragons!!
                 *
                 * the tracepoint muck will deadlock against ctx->mutex, but
                 * the tracepoint stuff does not actually need it. So
                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
                 * already have a reference on ctx.
                 *
                 * This can result in event getting moved to a different ctx,
                 * but that does not affect the tracepoint state.
                 */
                mutex_unlock(&ctx->mutex);
                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
                mutex_lock(&ctx->mutex);
        } else
#endif
        /* NOTE: with CONFIG_EVENT_TRACING this "if" is the body of the "else" above */
        if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);

        kfree(filter_str);
        return ret;
}

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
        struct pt_regs *regs;
        struct perf_event *event;
        u64 period;

        event = container_of(hrtimer, struct perf_event, hw.hrtimer);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return HRTIMER_NORESTART;

        event->pmu->read(event);

        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

        if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (__perf_event_overflow(event, 1, &data, regs))
                                ret = HRTIMER_NORESTART;
        }

        period = max_t(u64, 10000, event->hw.sample_period);
        hrtimer_forward_now(hrtimer, ns_to_ktime(period));

        return ret;
}

/*
 * Arm the event's hrtimer; a no-op for non-sampling events.
 *
 * A non-zero hw.period_left (as stored by perf_swevent_cancel_hrtimer()) is
 * consumed and used as the first expiry, with negative values replaced by
 * 10000ns.  Otherwise the sample period is used, with a floor of 10000ns.
 */
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 delay;

        if (!is_sampling_event(event))
                return;

        delay = local64_read(&hwc->period_left);
        if (!delay) {
                delay = max_t(u64, 10000, hwc->sample_period);
        } else {
                if (delay < 0)
                        delay = 10000;

                local64_set(&hwc->period_left, 0);
        }

        hrtimer_start(&hwc->hrtimer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
}

/*
 * Stop the event's hrtimer, first saving the time left until expiry in
 * hw.period_left.  A no-op for non-sampling events.
 */
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        ktime_t left;

        if (!is_sampling_event(event))
                return;

        left = hrtimer_get_remaining(&hwc->hrtimer);
        local64_set(&hwc->period_left, ktime_to_ns(left));

        hrtimer_cancel(&hwc->hrtimer);
}

/*
 * Set up the hrtimer of a sampling software event (no-op otherwise) with
 * perf_swevent_hrtimer() as its callback.  A frequency request is turned
 * into a fixed period here and attr.freq is cleared.
 */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        long hz;

        if (!is_sampling_event(event))
                return;

        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;

        if (!event->attr.freq)
                return;

        /*
         * Since hrtimers have a fixed rate, we can do a static freq->period
         * mapping and avoid the whole period adjust feedback stuff.
         */
        hz = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / hz;
        hwc->sample_period = event->attr.sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);
        hwc->last_period = hwc->sample_period;
        event->attr.freq = 0;
}

/*
 * Software event: cpu wall time clock
 */

/*
 * Add the local_clock() time elapsed since the previous snapshot to the
 * event count and make "now" the new snapshot.
 */
static void cpu_clock_event_update(struct perf_event *event)
{
        u64 now = local_clock();
        s64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add(now - prev, &event->count);
}

static void cpu_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the cpu-clock event: cancel the sampling hrtimer, then
 * fold the elapsed time into the count.  @flags is not used.
 */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        /* timer first, so the final update happens after it is cancelled */
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_event_update(event);
}

/*
 * pmu::add for the cpu-clock event: start counting if PERF_EF_START is
 * set in @flags and refresh the user page.  Always returns 0.
 */
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
        const int start_now = flags & PERF_EF_START;

        if (start_now)
                cpu_clock_event_start(event, flags);

        perf_event_update_userpage(event);

        return 0;
}

/* pmu::del for the cpu-clock event: removing the event just stops it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
}

/* pmu::read for the cpu-clock event: bring event->count up to date. */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
}

/*
 * pmu::event_init for the cpu-clock event.
 *
 * Returns -ENOENT unless the event is PERF_TYPE_SOFTWARE with config
 * PERF_COUNT_SW_CPU_CLOCK, -EOPNOTSUPP if branch sampling was requested,
 * and 0 after setting up the sampling hrtimer.
 */
static int cpu_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != PERF_TYPE_SOFTWARE ||
            event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;

        /* software events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/*
 * PMU for the PERF_COUNT_SW_CPU_CLOCK software event (see
 * cpu_clock_event_init()); it uses the shared software task context.
 */
static struct pmu perf_cpu_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = cpu_clock_event_init,
        .add                = cpu_clock_event_add,
        .del                = cpu_clock_event_del,
        .start                = cpu_clock_event_start,
        .stop                = cpu_clock_event_stop,
        .read                = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

/*
 * Add the time elapsed between the previous snapshot and @now to the
 * event count and make @now the new snapshot.
 */
static void task_clock_event_update(struct perf_event *event, u64 now)
{
        u64 prev = local64_xchg(&event->hw.prev_count, now);
        s64 elapsed = now - prev;

        local64_add(elapsed, &event->count);
}

static void task_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, event->ctx->time);
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the task-clock event: cancel the sampling hrtimer, then
 * account up to the current context time.  @flags is not used.
 */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
        /* timer first, so the final update happens after it is cancelled */
        perf_swevent_cancel_hrtimer(event);
        task_clock_event_update(event, event->ctx->time);
}

/*
 * pmu::add for the task-clock event: start counting if PERF_EF_START is
 * set in @flags and refresh the user page.  Always returns 0.
 */
static int task_clock_event_add(struct perf_event *event, int flags)
{
        const int start_now = flags & PERF_EF_START;

        if (start_now)
                task_clock_event_start(event, flags);

        perf_event_update_userpage(event);

        return 0;
}

/*
 * pmu::del for the task-clock event: stop it.  PERF_EF_UPDATE is passed
 * to the stop handler rather than the caller's @flags.
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
        task_clock_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pmu::read for the task-clock event: account up to the context time
 * advanced by whatever perf_clock() time has passed since ctx->timestamp.
 */
static void task_clock_event_read(struct perf_event *event)
{
        u64 clock = perf_clock();
        u64 since_stamp = clock - event->ctx->timestamp;

        task_clock_event_update(event, event->ctx->time + since_stamp);
}

/*
 * pmu::event_init for the task-clock event.
 *
 * Returns -ENOENT unless the event is PERF_TYPE_SOFTWARE with config
 * PERF_COUNT_SW_TASK_CLOCK, -EOPNOTSUPP if branch sampling was requested,
 * and 0 after setting up the sampling hrtimer.
 */
static int task_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != PERF_TYPE_SOFTWARE ||
            event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        /* software events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/*
 * PMU for the PERF_COUNT_SW_TASK_CLOCK software event (see
 * task_clock_event_init()); it uses the shared software task context.
 */
static struct pmu perf_task_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = task_clock_event_init,
        .add                = task_clock_event_add,
        .del                = task_clock_event_del,
        .start                = task_clock_event_start,
        .stop                = task_clock_event_stop,
        .read                = task_clock_event_read,
};

/*
 * Do-nothing callback; perf_pmu_register() installs it for pmu_enable,
 * pmu_disable and cancel_txn when the PMU does not provide them.
 */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}

/*
 * Do-nothing start_txn callback, installed by perf_pmu_register() for
 * PMUs that provide neither start_txn nor pmu_enable.
 */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
}

/*
 * commit_txn stub that always reports success (0); installed by
 * perf_pmu_register() for PMUs that provide neither start_txn nor
 * pmu_enable.
 */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        return 0;
}

/*
 * Default check_period callback, installed by perf_pmu_register() when
 * the PMU has none: every @value is accepted (returns 0).
 */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
        return 0;
}

/*
 * Per-CPU copy of the flags passed to perf_pmu_start_txn(); read and
 * cleared by perf_pmu_commit_txn() and perf_pmu_cancel_txn().
 */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);

/*
 * Default start_txn: remember @flags for the matching commit/cancel and,
 * unless @flags has a bit other than PERF_PMU_TXN_ADD set, disable the PMU.
 */
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
        __this_cpu_write(nop_txn_flags, flags);

        if (!(flags & ~PERF_PMU_TXN_ADD))
                perf_pmu_disable(pmu);
}

/*
 * Default commit_txn: clear the saved transaction flags and re-enable the
 * PMU, unless the saved flags had a bit other than PERF_PMU_TXN_ADD set.
 * Always returns 0.
 */
static int perf_pmu_commit_txn(struct pmu *pmu)
{
        unsigned int txn = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (!(txn & ~PERF_PMU_TXN_ADD))
                perf_pmu_enable(pmu);

        return 0;
}

/*
 * Default cancel_txn: clear the saved transaction flags and re-enable the
 * PMU, unless the saved flags had a bit other than PERF_PMU_TXN_ADD set.
 */
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
        unsigned int txn = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (!(txn & ~PERF_PMU_TXN_ADD))
                perf_pmu_enable(pmu);
}

/*
 * Default event_idx callback, installed by perf_pmu_register() when the
 * PMU has none: always reports index 0.
 */
static int perf_event_idx_default(struct perf_event *event)
{
        return 0;
}

/*
 * Look up the per-CPU context already used by a registered PMU with task
 * context number @ctxn, so that all PMUs sharing a task_ctx_nr also share
 * one pmu_cpu_context.  Returns NULL for a negative @ctxn or when no
 * registered PMU uses it.
 */
static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
        struct pmu *cur;

        if (ctxn >= 0) {
                list_for_each_entry(cur, &pmus, entry) {
                        if (cur->task_ctx_nr == ctxn)
                                return cur->pmu_cpu_context;
                }
        }

        return NULL;
}

/*
 * Release @pmu's per-CPU context, but only if it is private to the PMU.
 *
 * Static contexts such as perf_sw_context have a global lifetime and may
 * be shared between different PMUs, so they must survive a single PMU
 * going away; only task_ctx_nr values up to perf_invalid_context are freed.
 */
static void free_pmu_context(struct pmu *pmu)
{
        if (pmu->task_ctx_nr <= perf_invalid_context)
                free_percpu(pmu->pmu_cpu_context);
}

/*
 * Let userspace know that this PMU supports address range filtering:
 */
static ssize_t nr_addr_filters_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);

/*
 * Maps a PMU type id to its struct pmu: filled by perf_pmu_register()
 * via idr_alloc(), queried by perf_init_event() via idr_find().
 */
static struct idr pmu_idr;

static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);

static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
                                struct device_attribute *attr,
                                char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}

/* Serialises updates made through perf_event_mux_interval_ms_store(). */
static DEFINE_MUTEX(mux_interval_mutex);

/*
 * sysfs "perf_event_mux_interval_ms" write side.
 *
 * Parses an integer from @buf; values below 1 are rejected with -EINVAL.
 * A new value is stored in the PMU and pushed to the per-CPU context of
 * every online CPU, each of which is asked to restart its mux hrtimer.
 * Returns @count on success or a negative errno.
 */
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        int interval_ms, cpu, err;

        err = kstrtoint(buf, 0, &interval_ms);
        if (err)
                return err;

        if (interval_ms < 1)
                return -EINVAL;

        /* same value, nothing to do */
        if (pmu->hrtimer_interval_ms == interval_ms)
                return count;

        mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = interval_ms;

        /* propagate to the cpuctx of this PMU on every online CPU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);

                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval_ms);

                cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);

        return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

/*
 * Attributes of every PMU device.  pmu_dev_is_visible() refers to the
 * nr_addr_filters entry by its index (2), so keep the order in sync.
 */
static struct attribute *pmu_dev_attrs[] = {
        &dev_attr_type.attr,
        &dev_attr_perf_event_mux_interval_ms.attr,
        &dev_attr_nr_addr_filters.attr,
        NULL,
};

/*
 * is_visible callback of pmu_dev_attr_group: hides attribute number 2
 * (nr_addr_filters in pmu_dev_attrs) for PMUs without address filters;
 * every other attribute keeps its own mode.
 */
static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct pmu *pmu = dev_get_drvdata(kobj_to_dev(kobj));

        return (n == 2 && !pmu->nr_addr_filters) ? 0 : a->mode;
}

/* The pmu_dev_attrs attributes, filtered through pmu_dev_is_visible(). */
static struct attribute_group pmu_dev_attr_group = {
        .is_visible = pmu_dev_is_visible,
        .attrs = pmu_dev_attrs,
};

/* NULL-terminated group list used as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
        &pmu_dev_attr_group,
        NULL,
};

/*
 * Non-zero once PMU devices can be created: perf_pmu_register() only calls
 * pmu_dev_alloc() while this is set.
 * NOTE(review): the code that sets it is not in this part of the file.
 */
static int pmu_bus_running;
/* The "event_source" bus that all PMU devices are attached to. */
static struct bus_type pmu_bus = {
        .name                = "event_source",
        .dev_groups        = pmu_dev_groups,
};

/*
 * Release callback of a PMU device (set in pmu_dev_alloc()): frees the
 * struct device that pmu_dev_alloc() obtained from kzalloc().
 */
static void pmu_dev_release(struct device *dev)
{
        kfree(dev);
}

static int pmu_dev_alloc(struct pmu *pmu)
{
        int ret = -ENOMEM;

        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!pmu->dev)
                goto out;

        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);

        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->release = pmu_dev_release;

        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
                goto free_dev;

        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;

        if (pmu->attr_update) {
                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
                if (ret)
                        goto del_dev;
        }

out:
        return ret;

del_dev:
        device_del(pmu->dev);

free_dev:
        put_device(pmu->dev);
        goto out;
}

/*
 * lockdep classes that perf_pmu_register() assigns to the mutex and lock
 * of every per-CPU context it initialises.
 */
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

/*
 * Register @pmu with the perf core.
 *
 * @name: sysfs name of the PMU; with a NULL name no type id is allocated
 *        (pmu->type stays -1) and no device is created.
 * @type: requested type id.  PERF_TYPE_SOFTWARE is used as is; any other
 *        value gets an id from pmu_idr, starting the search at @type when
 *        it is >= 0 and at PERF_TYPE_MAX otherwise.
 *
 * Also sets up the per-CPU disable count and the per-CPU context (shared
 * with an already registered PMU using the same task_ctx_nr), and fills
 * in default callbacks for the optional methods the PMU left out.
 *
 * Returns 0 on success or a negative errno, in which case everything that
 * was set up here is undone again.
 */
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
        int cpu, ret, max = PERF_TYPE_MAX;
        /* true only once pmu_dev_alloc() has succeeded for this call */
        bool dev_added = false;

        mutex_lock(&pmus_lock);
        ret = -ENOMEM;
        pmu->pmu_disable_count = alloc_percpu(int);
        if (!pmu->pmu_disable_count)
                goto unlock;

        pmu->type = -1;
        if (!name)
                goto skip_type;
        pmu->name = name;

        if (type != PERF_TYPE_SOFTWARE) {
                if (type >= 0)
                        max = type;

                ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
                if (ret < 0)
                        goto free_pdc;

                WARN_ON(type >= 0 && ret != type);

                type = ret;
        }
        pmu->type = type;

        if (pmu_bus_running) {
                ret = pmu_dev_alloc(pmu);
                if (ret)
                        goto free_idr;

                dev_added = true;
        }

skip_type:
        if (pmu->task_ctx_nr == perf_hw_context) {
                static int hw_context_taken = 0;

                /*
                 * Other than systems with heterogeneous CPUs, it never makes
                 * sense for two PMUs to share perf_hw_context. PMUs which are
                 * uncore must use perf_invalid_context.
                 */
                if (WARN_ON_ONCE(hw_context_taken &&
                    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
                        pmu->task_ctx_nr = perf_invalid_context;

                hw_context_taken = 1;
        }

        /* reuse the context of a PMU with the same task_ctx_nr, if any */
        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
        if (pmu->pmu_cpu_context)
                goto got_cpu_context;

        ret = -ENOMEM;
        pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
        if (!pmu->pmu_cpu_context)
                goto free_dev;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_context *cpuctx;

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                __perf_event_init_context(&cpuctx->ctx);
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
                cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);

                __perf_mux_hrtimer_init(cpuctx, cpu);

                cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
                cpuctx->heap = cpuctx->heap_default;
        }

got_cpu_context:
        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
                         * If we have pmu_enable/pmu_disable calls, install
                         * transaction stubs that use that to try and batch
                         * hardware accesses.
                         */
                        pmu->start_txn  = perf_pmu_start_txn;
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
                        pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
        }

        if (!pmu->pmu_enable) {
                pmu->pmu_enable  = perf_pmu_nop_void;
                pmu->pmu_disable = perf_pmu_nop_void;
        }

        if (!pmu->check_period)
                pmu->check_period = perf_event_nop_int;

        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;

        /*
         * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
         * since these cannot be in the IDR. This way the linear search
         * is fast, provided a valid software event is provided.
         */
        if (type == PERF_TYPE_SOFTWARE || !name)
                list_add_rcu(&pmu->entry, &pmus);
        else
                list_add_tail_rcu(&pmu->entry, &pmus);

        atomic_set(&pmu->exclusive_cnt, 0);
        ret = 0;
unlock:
        mutex_unlock(&pmus_lock);

        return ret;

free_dev:
        /*
         * This label is also reached when no device was created (NULL
         * name, or the bus is not running yet); device_del() must not be
         * called on a device that was never added.
         */
        if (dev_added) {
                device_del(pmu->dev);
                put_device(pmu->dev);
                pmu->dev = NULL;
        }

free_idr:
        if (pmu->type != PERF_TYPE_SOFTWARE)
                idr_remove(&pmu_idr, pmu->type);

free_pdc:
        free_percpu(pmu->pmu_disable_count);
        goto unlock;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);

void perf_pmu_unregister(struct pmu *pmu)
{
        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);

        /*
         * We dereference the pmu list under both SRCU and regular RCU, so
         * synchronize against both of those.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        free_percpu(pmu->pmu_disable_count);
        if (pmu->type != PERF_TYPE_SOFTWARE)
                idr_remove(&pmu_idr, pmu->type);
        if (pmu_bus_running) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }
        free_pmu_context(pmu);
        mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

/*
 * True if the event asks for any register in PERF_REG_EXTENDED_MASK,
 * either in the user or in the interrupt register sample set.
 */
static inline bool has_extended_regs(struct perf_event *event)
{
        return ((event->attr.sample_regs_user | event->attr.sample_regs_intr) &
                PERF_REG_EXTENDED_MASK) != 0;
}

/*
 * Try to bind @event to @pmu by calling pmu->event_init().
 *
 * A reference on the PMU's module is held across the call and kept only
 * on success.  After a successful event_init() the event is still rejected
 * with -EOPNOTSUPP if it wants extended registers the PMU cannot provide,
 * or with -EINVAL if it sets an exclude flag on a PMU with
 * PERF_PMU_CAP_NO_EXCLUDE; in that case event->destroy() is called if set.
 *
 * Returns 0 on success, -ENODEV if the module reference cannot be taken,
 * or the error described above / returned by event_init().
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
        struct perf_event_context *ctx = NULL;
        int ret;

        if (!try_module_get(pmu->module))
                return -ENODEV;

        /*
         * A number of pmu->event_init() methods iterate the sibling_list to,
         * for example, validate if the group fits on the PMU. Therefore,
         * if this is a sibling event, acquire the ctx->mutex to protect
         * the sibling_list.
         */
        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
                 */
                ctx = perf_event_ctx_lock_nested(event->group_leader,
                                                 SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }

        /* event->pmu is assigned before event_init() runs and stays set even on failure */
        event->pmu = pmu;
        ret = pmu->event_init(event);

        if (ctx)
                perf_event_ctx_unlock(event->group_leader, ctx);

        /* capability checks that apply only after event_init() accepted the event */
        if (!ret) {
                if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
                    has_extended_regs(event))
                        ret = -EOPNOTSUPP;

                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
                    event_has_any_exclude_flag(event))
                        ret = -EINVAL;

                /* event_init() succeeded but we reject the event: undo its setup */
                if (ret && event->destroy)
                        event->destroy(event);
        }

        /* keep the module reference only for a successfully initialised event */
        if (ret)
                module_put(pmu->module);

        return ret;
}

/*
 * Find the PMU that accepts @event and initialise the event on it.
 *
 * Candidates are tried in this order, all under pmus_srcu:
 *  1. the parent event's PMU, if there is a parent;
 *  2. the PMU registered in pmu_idr for the event's type, where
 *     PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE are first looked up as
 *     PERF_TYPE_RAW and, if that PMU answers -ENOENT, retried under
 *     their own type;
 *  3. every PMU on the pmus list, stopping at the first result that is
 *     not -ENOENT.
 *
 * Returns the PMU on success or an ERR_PTR(); ERR_PTR(-ENOENT) means no
 * PMU claimed the event.
 */
static struct pmu *perf_init_event(struct perf_event *event)
{
        int idx, type, ret;
        struct pmu *pmu;

        idx = srcu_read_lock(&pmus_srcu);

        /* Try parent's PMU first: */
        if (event->parent && event->parent->pmu) {
                pmu = event->parent->pmu;
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;
        }

        /*
         * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
        if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
                type = PERF_TYPE_RAW;

again:
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, type);
        rcu_read_unlock();
        if (pmu) {
                ret = perf_try_init_event(pmu, event);
                /* the aliased (RAW) PMU did not know the event: retry with the real type */
                if (ret == -ENOENT && event->attr.type != type) {
                        type = event->attr.type;
                        goto again;
                }

                if (ret)
                        pmu = ERR_PTR(ret);

                goto unlock;
        }

        /* no PMU in the IDR for this type: fall back to asking every registered PMU */
        list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;

                /* -ENOENT means "not mine"; any other error is final */
                if (ret != -ENOENT) {
                        pmu = ERR_PTR(ret);
                        goto unlock;
                }
        }
        pmu = ERR_PTR(-ENOENT);
unlock:
        srcu_read_unlock(&pmus_srcu, idx);

        return pmu;
}

/*
 * Put @event on the pmu_sb_events list of the CPU it is bound to
 * (event->cpu), under that list's lock.
 */
static void attach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *sb = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&sb->lock);
        list_add_rcu(&event->sb_list, &sb->list);
        raw_spin_unlock(&sb->lock);
}

/*
 * All !task (and therefore per-cpu) events that need to receive side-band
 * records are kept on a dedicated list, which avoids having to scan all
 * the various PMU per-cpu contexts looking for them.
 *
 * Add @event to that list if is_sb_event() says it qualifies.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}

/*
 * Per-CPU accounting for a new event: bump perf_cgroup_events on @cpu for
 * a cgroup event.  Events with a parent are not accounted.
 */
static void account_event_cpu(struct perf_event *event, int cpu)
{
        if (!event->parent && is_cgroup_event(event))
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}

/*
 * Freq events need the tick to stay alive (see perf_event_task_tick).
 *
 * With CONFIG_NO_HZ_FULL: count one more freq event and, when it is the
 * first one, set the TICK_DEP_BIT_PERF_EVENTS tick dependency.  Without
 * CONFIG_NO_HZ_FULL the function body is empty.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        /* Lock so we don't race with concurrent unaccount */
        spin_lock(&nr_freq_lock);
        if (atomic_inc_return(&nr_freq_events) == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Count one more frequency-based event.  When nohz_full is enabled the
 * counting is left to account_freq_event_nohz(); otherwise nr_freq_events
 * is simply incremented.
 */
static void account_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_inc(&nr_freq_events);
                return;
        }

        account_freq_event_nohz();
}


static void account_event(struct perf_event *event)
{
        bool inc = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_inc(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_inc(&nr_cgroup_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
                account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
        }
        if (has_branch_stack(event))
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
        if (event->attr.ksymbol)
                atomic_inc(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_inc(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_inc(&nr_text_poke_events);

        if (inc) {
                /*
                 * We need the mutex here because static_branch_enable()
                 * must complete *before* the perf_sched_count increment
                 * becomes visible.
                 */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;

                mutex_lock(&perf_sched_mutex);
                if (!atomic_read(&perf_sched_count)) {
                        static_branch_enable(&perf_sched_events);
                        /*
                         * Guarantee that all CPUs observe they key change and
                         * call the perf scheduling hooks before proceeding to
                         * install events that need them.
                         */
                        synchronize_rcu();
                }
                /*
                 * Now that we have waited for the sync_sched(), allow further
                 * increments to by-pass the mutex.
                 */
                atomic_inc(&perf_sched_count);
                mutex_unlock(&perf_sched_mutex);
        }
enabled:

        account_event_cpu(event, event->cpu);

        account_pmu_sb_event(event);
}

/*
 * Allocate and initialize an event structure.
 *
 * @attr:             attributes; copied by value into the new event
 * @cpu:              target CPU; an out-of-range value is only accepted when
 *                    it is -1 and @task is set
 * @task:             target task or NULL; a reference is taken and stored in
 *                    event->hw.target
 * @group_leader:     group to join, or NULL to make the event its own leader
 * @parent_event:     event this one is inherited from, or NULL
 * @overflow_handler: overflow callback; when NULL the parent's handler is used
 *                    if there is a parent, otherwise one of the default
 *                    perf_event_output_{forward,backward}() handlers
 * @context:          context for @overflow_handler; ignored when
 *                    @overflow_handler is NULL
 * @cgroup_fd:        cgroup fd to connect the event to, or -1 for none
 *
 * Returns the new event, with its refcount set to 1, or an ERR_PTR() on
 * failure.  On failure the err_* labels at the bottom release whatever had
 * been set up before the failing step; each label falls through into the
 * ones below it.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct task_struct *task,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
                 void *context, int cgroup_fd)
{
        struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err = -EINVAL;

        /* An out-of-range cpu is only valid as "-1" for a task-bound event. */
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }

        event = kzalloc(sizeof(*event), GFP_KERNEL);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->active_list);
        init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);


        init_waitqueue_head(&event->waitq);
        event->pending_disable = -1;
        init_irq_work(&event->pending, perf_pending_event);

        mutex_init(&event->mmap_mutex);
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
        event->cpu                = cpu;
        event->attr                = *attr;
        event->group_leader        = group_leader;
        event->pmu                = NULL;
        event->oncpu                = -1;

        event->parent                = parent_event;

        /* Reference on the caller's pid namespace; dropped at err_ns on failure. */
        event->ns                = get_pid_ns(task_active_pid_ns(current));
        event->id                = atomic64_inc_return(&perf_event_id);

        event->state                = PERF_EVENT_STATE_INACTIVE;

        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
                 * XXX pmu::event_init needs to know what task to account to
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
                event->hw.target = get_task_struct(task);
        }

        /* Default to local_clock unless a parent dictates the clock. */
        event->clock = &local_clock;
        if (parent_event)
                event->clock = parent_event->clock;

        /* No explicit handler: inherit the parent's handler and context. */
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                if (overflow_handler == bpf_overflow_handler) {
                        struct bpf_prog *prog = parent_event->prog;

                        /*
                         * The child takes its own reference on the program.
                         * NOTE(review): none of the err_* labels below
                         * visibly drops this reference; confirm whether a
                         * failure after this point leaks it.
                         */
                        bpf_prog_inc(prog);
                        event->prog = prog;
                        event->orig_overflow_handler =
                                parent_event->orig_overflow_handler;
                }
#endif
        }

        if (overflow_handler) {
                event->overflow_handler        = overflow_handler;
                event->overflow_handler_context = context;
        } else if (is_write_backward(event)){
                event->overflow_handler = perf_event_output_backward;
                event->overflow_handler_context = NULL;
        } else {
                event->overflow_handler = perf_event_output_forward;
                event->overflow_handler_context = NULL;
        }

        perf_event__state_init(event);

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* Frequency-based sampling starts out with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        local64_set(&hwc->period_left, hwc->sample_period);

        /*
         * We currently do not support PERF_SAMPLE_READ on inherited events.
         * See perf_output_read().
         *
         * err still holds its initial -EINVAL here.
         */
        if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
                goto err_ns;

        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;

        pmu = perf_init_event(event);
        if (IS_ERR(pmu)) {
                err = PTR_ERR(pmu);
                goto err_ns;
        }

        /*
         * Disallow uncore-cgroup events, they don't make sense as the cgroup will
         * be different on other CPUs in the uncore mask.
         */
        if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
                err = -EINVAL;
                goto err_pmu;
        }

        /* aux_output needs a PMU that advertises PERF_PMU_CAP_AUX_OUTPUT. */
        if (event->attr.aux_output &&
            !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
                err = -EOPNOTSUPP;
                goto err_pmu;
        }

        if (cgroup_fd != -1) {
                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
                if (err)
                        goto err_pmu;
        }

        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;

        if (has_addr_filter(event)) {
                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
                                                    sizeof(struct perf_addr_filter_range),
                                                    GFP_KERNEL);
                if (!event->addr_filter_ranges) {
                        err = -ENOMEM;
                        goto err_per_task;
                }

                /*
                 * Clone the parent's vma offsets: they are valid until exec()
                 * even if the mm is not shared with the parent.
                 */
                if (event->parent) {
                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

                        raw_spin_lock_irq(&ifh->lock);
                        memcpy(event->addr_filter_ranges,
                               event->parent->addr_filter_ranges,
                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
                        raw_spin_unlock_irq(&ifh->lock);
                }

                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }

        /* Only events without a parent take a callchain buffer reference. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
                        err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                goto err_addr_filters;
                }
        }

        err = security_perf_event_alloc(event);
        if (err)
                goto err_callchain_buffer;

        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);

        return event;

        /* Error unwinding: every label falls through into the ones below it. */
err_callchain_buffer:
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }
err_addr_filters:
        kfree(event->addr_filter_ranges);

err_per_task:
        exclusive_event_destroy(event);

err_pmu:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
        if (event->destroy)
                event->destroy(event);
        module_put(pmu->module);
err_ns:
        if (event->ns)
                put_pid_ns(event->ns);
        if (event->hw.target)
                put_task_struct(event->hw.target);
        kfree(event);

        return ERR_PTR(err);
}

/*
 * Copy a perf_event_attr from user space into @attr and sanity check it.
 *
 * Returns 0 on success or a negative errno.  When the size advertised by user
 * space is unacceptable, sizeof(*attr) is written back to uattr->size and
 * -E2BIG is returned.
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        /* Start from an all-zero attr so that a short copy leaves a clean tail. */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        /* ABI compatibility quirk: a zero size is taken as PERF_ATTR_SIZE_VER0. */
        if (size == 0)
                size = PERF_ATTR_SIZE_VER0;

        if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
                goto err_size;

        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
        if (ret == -E2BIG)
                goto err_size;
        if (ret)
                return ret;

        attr->size = size;

        /* Reserved fields must be zero. */
        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                return -EINVAL;

        /* Reject bits beyond the known sample types and read formats. */
        if ((attr->sample_type & ~(PERF_SAMPLE_MAX-1)) ||
            (attr->read_format & ~(PERF_FORMAT_MAX-1)))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 mask = attr->branch_sample_type;

                /* only using defined bits */
                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
                        return -EINVAL;

                /* at least one branch bit must be set */
                if ((mask & ~PERF_SAMPLE_BRANCH_PLM_ALL) == 0)
                        return -EINVAL;

                /* propagate priv level, when not set for branch */
                if ((mask & PERF_SAMPLE_BRANCH_PLM_ALL) == 0) {
                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                mask |= PERF_SAMPLE_BRANCH_KERNEL;

                        if (!attr->exclude_user)
                                mask |= PERF_SAMPLE_BRANCH_USER;

                        if (!attr->exclude_hv)
                                mask |= PERF_SAMPLE_BRANCH_HV;

                        /* adjust user setting (for HW filter setup) */
                        attr->branch_sample_type = mask;
                }

                /* privileged levels capture (kernel, hv): check permissions */
                if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
                        ret = perf_allow_kernel(attr);
                        if (ret)
                                return ret;
                }
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * We have __u32 type for the size, but so far
                 * we can only use __u16 as maximum due to the
                 * __u16 sample size limit.  The size must also be a
                 * multiple of sizeof(u64).
                 */
                if (attr->sample_stack_user >= USHRT_MAX ||
                    !IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        /* No explicit callchain depth: fall back to the sysctl value. */
        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);

#ifndef CONFIG_CGROUP_PERF
        if (attr->sample_type & PERF_SAMPLE_CGROUP)
                return -EINVAL;
#endif

        return ret;

err_size:
        /* Tell user space which size we expect; the result of the write is ignored. */
        put_user(sizeof(*attr), &uattr->size);
        return -E2BIG;
}

/*
 * Take two mutexes, always locking the one at the lower address first so
 * that every caller uses the same order.
 */
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
        struct mutex *first = a;
        struct mutex *second = b;

        if (b < a) {
                first = b;
                second = a;
        }

        mutex_lock(first);
        mutex_lock_nested(second, SINGLE_DEPTH_NESTING);
}

/*
 * Attach @event to the ring buffer of @output_event.
 *
 * With a NULL @output_event only event->mmap_mutex is taken and
 * ring_buffer_attach() is called with a NULL buffer.  Otherwise the two
 * events must be compatible (see the checks below) and both mmap_mutexes
 * are held across the attach.
 *
 * Returns 0 on success and -EINVAL on any failure, including when @event
 * has an active mmap() or @output_event has no usable ring buffer.
 */
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
        struct perf_buffer *rb = NULL;
        int ret = -EINVAL;

        /* No target: skip the compatibility checks, lock only our own mutex. */
        if (!output_event) {
                mutex_lock(&event->mmap_mutex);
                goto set;
        }

        /* don't allow circular references */
        if (event == output_event)
                goto out;

        /*
         * Don't allow cross-cpu buffers
         */
        if (output_event->cpu != event->cpu)
                goto out;

        /*
         * If it's not a per-cpu rb, it must be the same task.
         */
        if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
                goto out;

        /*
         * Mixing clocks in the same buffer is trouble you don't need.
         */
        if (output_event->clock != event->clock)
                goto out;

        /*
         * Either writing ring buffer from beginning or from end.
         * Mixing is not allowed.
         */
        if (is_write_backward(output_event) != is_write_backward(event))
                goto out;

        /*
         * If both events generate aux data, they must be on the same PMU
         */
        if (has_aux(event) && has_aux(output_event) &&
            event->pmu != output_event->pmu)
                goto out;

        /*
         * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
         * output_event is already on rb->event_list, and the list iteration
         * restarts after every removal, it is guaranteed this new event is
         * observed *OR* if output_event is already removed, it's guaranteed we
         * observe !rb->mmap_count.
         */
        mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
        /* Can't redirect output if we've got an active mmap() */
        if (atomic_read(&event->mmap_count))
                goto unlock;

        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
                if (!rb)
                        goto unlock;

                /* did we race against perf_mmap_close() */
                if (!atomic_read(&rb->mmap_count)) {
                        ring_buffer_put(rb);
                        goto unlock;
                }
        }

        ring_buffer_attach(event, rb);

        ret = 0;
unlock:
        /* output_event's mutex was only taken on the mutex_lock_double() path. */
        mutex_unlock(&event->mmap_mutex);
        if (output_event)
                mutex_unlock(&output_event->mmap_mutex);

out:
        return ret;
}

/*
 * Select the clock used by @event from a clockid.
 *
 * Returns 0 on success, -EINVAL for an unknown @clk_id or when the chosen
 * clock is not flagged NMI-safe here and the event's PMU lacks
 * PERF_PMU_CAP_NO_NMI.  Note that in the latter case event->clock has
 * already been updated.
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
        bool usable_from_nmi = false;

        switch (clk_id) {
        case CLOCK_MONOTONIC:
                usable_from_nmi = true;
                event->clock = &ktime_get_mono_fast_ns;
                break;

        case CLOCK_MONOTONIC_RAW:
                usable_from_nmi = true;
                event->clock = &ktime_get_raw_fast_ns;
                break;

        case CLOCK_REALTIME:
                event->clock = &ktime_get_real_ns;
                break;

        case CLOCK_BOOTTIME:
                event->clock = &ktime_get_boottime_ns;
                break;

        case CLOCK_TAI:
                event->clock = &ktime_get_clocktai_ns;
                break;

        default:
                return -EINVAL;
        }

        if (usable_from_nmi)
                return 0;

        /* The remaining clocks are only allowed on PMUs with CAP_NO_NMI. */
        if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
                return 0;

        return -EINVAL;
}

/*
 * Variation on perf_event_ctx_lock_nested(), except we take two context
 * mutexes.
 */
static struct perf_event_context *
__perf_event_ctx_lock_double(struct perf_event *group_leader,
                             struct perf_event_context *ctx)
{
        struct perf_event_context *gctx;

again:
        rcu_read_lock();
        gctx = READ_ONCE(group_leader->ctx);
        if (!refcount_inc_not_zero(&gctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        mutex_lock_double(&gctx->mutex, &ctx->mutex);

        if (group_leader->ctx != gctx) {
                mutex_unlock(&ctx->mutex);
                mutex_unlock(&gctx->mutex);
                put_ctx(gctx);
                goto again;
        }

        return gctx;
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:  event_id type attributes for monitoring/sampling
 * @pid:        target pid, or -1 for none; with PERF_FLAG_PID_CGROUP this is
 *              instead an fd for a cgroup directory
 * @cpu:        target cpu
 * @group_fd:   group leader event fd, or -1 for none
 * @flags:      PERF_FLAG_* bits; anything outside PERF_FLAG_ALL is rejected
 *
 * Return: the file descriptor of the new event on success, a negative errno
 * otherwise.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx, *gctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        struct pmu *pmu;
        int event_fd;
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
        int cgroup_fd = -1;

        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;

        /* Copy and validate the attributes before anything looks at them. */
        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Do we allow access to perf_event_open(2) ? */
        err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
        if (err)
                return err;

        if (!attr.exclude_kernel) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        if (attr.namespaces) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else {
                /* A sample period with the top bit set is rejected. */
                if (attr.sample_period & (1ULL << 63))
                        return -EINVAL;
        }

        /* Only privileged users can get physical addresses */
        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        /* REGS_INTR can leak data, lockdown must prevent this */
        if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
                err = security_locked_down(LOCKDOWN_PERF);
                if (err)
                        return err;
        }

        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
         * designates the cpu on which to monitor threads from that
         * cgroup.
         */
        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                return -EINVAL;

        if (flags & PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;

        /* Reserve the fd now; it is only installed once nothing can fail. */
        event_fd = get_unused_fd_flags(f_flags);
        if (event_fd < 0)
                return event_fd;

        if (group_fd != -1) {
                err = perf_fget_light(group_fd, &group);
                if (err)
                        goto err_fd;
                group_leader = group.file->private_data;
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
                        err = PTR_ERR(task);
                        goto err_group_fd;
                }
        }

        /* A task event must agree with its group leader on attr.inherit. */
        if (task && group_leader &&
            group_leader->attr.inherit != attr.inherit) {
                err = -EINVAL;
                goto err_task;
        }

        if (flags & PERF_FLAG_PID_CGROUP)
                cgroup_fd = pid;

        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_task;
        }

        /* Sampling is refused on PMUs flagged PERF_PMU_CAP_NO_INTERRUPT. */
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }

        /*
         * Special case software events and allow them to be part of
         * any hardware group.
         */
        pmu = event->pmu;

        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
                        goto err_alloc;
        }

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        if (group_leader) {
                if (is_software_event(event) &&
                    !in_software_context(group_leader)) {
                        /*
                         * If the event is a sw event, but the group_leader
                         * is on hw context.
                         *
                         * Allow the addition of software events to hw
                         * groups, this is safe because software events
                         * never fail to schedule.
                         */
                        pmu = group_leader->ctx->pmu;
                } else if (!is_software_event(event) &&
                           is_software_event(group_leader) &&
                           (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * In case the group is a pure software group, and we
                         * try to add a hardware event, move the whole group to
                         * the hardware context.
                         */
                        move_group = 1;
                }
        }

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
        }

        /*
         * Look up the group leader (we will attach this event to it):
         */
        if (group_leader) {
                /* Every failed check in this block returns -EINVAL. */
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_context;

                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
                        goto err_context;

                /*
                 * Make sure we're both events for the same CPU;
                 * grouping events for different CPUs is broken; since
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
                        goto err_context;

                /*
                 * Make sure we're both on the same task, or both
                 * per-CPU events.
                 */
                if (group_leader->ctx->task != ctx->task)
                        goto err_context;

                /*
                 * Do not allow to attach to a group in a different task
                 * or CPU context. If we're moving SW events, we'll fix
                 * this up later, so allow that.
                 *
                 * Racy, not holding group_leader->ctx->mutex, see comment with
                 * perf_event_ctx_lock().
                 */
                if (!move_group && group_leader->ctx != ctx)
                        goto err_context;

                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_context;
        }

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_context;
        }

        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
                                        f_flags);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                /* Keep event_file NULL so that err_alloc frees the event itself. */
                event_file = NULL;
                goto err_context;
        }

        if (task) {
                err = down_read_interruptible(&task->signal->exec_update_lock);
                if (err)
                        goto err_file;

                /*
                 * Preserve ptrace permission check for backwards compatibility.
                 *
                 * We must hold exec_update_lock across this and any potential
                 * perf_install_in_context() call for this new event to
                 * serialize against exec() altering our credentials (and the
                 * perf_event_exit_task() that could imply).
                 */
                err = -EACCES;
                if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
                        goto err_cred;
        }

        if (move_group) {
                gctx = __perf_event_ctx_lock_double(group_leader, ctx);

                if (gctx->task == TASK_TOMBSTONE) {
                        err = -ESRCH;
                        goto err_locked;
                }

                /*
                 * Check if we raced against another sys_perf_event_open() call
                 * moving the software group underneath us.
                 */
                if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * If someone moved the group out from under us, check
                         * if this new event wound up on the same ctx, if so
                         * it's the regular !move_group case, otherwise fail.
                         */
                        if (gctx != ctx) {
                                err = -EINVAL;
                                goto err_locked;
                        } else {
                                perf_event_ctx_unlock(group_leader, gctx);
                                move_group = 0;
                                goto not_move_group;
                        }
                }

                /*
                 * Failure to create exclusive events returns -EBUSY.
                 */
                err = -EBUSY;
                if (!exclusive_event_installable(group_leader, ctx))
                        goto err_locked;

                for_each_sibling_event(sibling, group_leader) {
                        if (!exclusive_event_installable(sibling, ctx))
                                goto err_locked;
                }
        } else {
                mutex_lock(&ctx->mutex);

                /*
                 * Now that we hold ctx->mutex, (re)validate group_leader->ctx == ctx,
                 * see the group_leader && !move_group test earlier.
                 */
                if (group_leader && group_leader->ctx != ctx) {
                        err = -EINVAL;
                        goto err_locked;
                }
        }
not_move_group:

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_locked;
        }

        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_locked;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);

                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_locked;
                }
        }

        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
                goto err_locked;
        }

        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_locked;
        }

        WARN_ON_ONCE(ctx->parent_ctx);

        /*
         * This is the point of no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */

        if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
                perf_remove_from_context(group_leader, 0);
                put_ctx(gctx);

                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }

                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
                 */
                synchronize_rcu();

                /*
                 * Install the group siblings before the group leader.
                 *
                 * Because a group leader will try and install the entire group
                 * (through the sibling list, which is still intact), we can
                 * end up with siblings installed in the wrong context.
                 *
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
                }

                /*
                 * Removing from the context ends up with a disabled
                 * event. What we want here is the event in its initial
                 * startup state, ready to be added to the new context.
                 */
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
        }

        /*
         * Precalculate sample_data sizes; do while holding ctx::mutex such
         * that we're serialized against further additions and before
         * perf_install_in_context() which is the point the event is active and
         * can use these values.
         */
        perf_event__header_size(event);
        perf_event__id_header_size(event);

        event->owner = current;

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);

        if (move_group)
                perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);

        if (task) {
                up_read(&task->signal->exec_update_lock);
                put_task_struct(task);
        }

        /* Track the event on the opening task's list of owned events. */
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
         * perf_group_detach().
         */
        fdput(group);
        fd_install(event_fd, event_file);
        return event_fd;

        /* Error unwinding: every label falls through into the ones below it. */
err_locked:
        if (move_group)
                perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
err_file:
        fput(event_file);
err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_alloc:
        /*
         * If event_file is set, the fput() above will have called ->release()
         * and that will take care of freeing the event.
         */
        if (!event_file)
                free_event(event);
err_task:
        if (task)
                put_task_struct(task);
err_group_fd:
        fdput(group);
err_fd:
        put_unused_fd(event_fd);
        return err;
}

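/**
 * perf_event_create_kernel_counter
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu to which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: overflow handler to install on the counter
 * @context: context passed along with @overflow_handler
 */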
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
{
        struct perf_event_context *ctx;
        struct perf_event *event;
        int err;

        /*
         * Grouping is not supported for kernel events, neither is 'AUX',
         * make sure the caller's intentions are adjusted.
         */
        if (attr->aux_output)
                return ERR_PTR(-EINVAL);

        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
                                 overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
        }

        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(event->pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_free;
        }

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_unlock;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_unlock;
                }
        }

        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_unlock;
        }

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);

        return event;

err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_free:
        free_event(event);
err:
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
        struct perf_event_context *src_ctx;
        struct perf_event_context *dst_ctx;
        struct perf_event *event, *tmp;
        LIST_HEAD(events);

        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;

        /*
         * See perf_event_ctx_lock() for comments on the details
         * of swizzling perf_event::ctx.
         */
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
                perf_remove_from_context(event, 0);
                unaccount_event_cpu(event, src_cpu);
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
        }

        /*
         * Wait for the events to quiesce before re-instating them.
         */
        synchronize_rcu();

        /*
         * Re-instate events in 2 passes.
         *
         * Skip over group leaders and only install siblings on this first
         * pass, siblings will not get enabled without a leader, however a
         * leader will enable its siblings, even if those are still on the old
         * context.
         */
        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
                if (event->group_leader == event)
                        continue;

                list_del(&event->migrate_entry);
                if (event->state >= PERF_EVENT_STATE_OFF)
                        event->state = PERF_EVENT_STATE_INACTIVE;
                account_event_cpu(event, dst_cpu);
                perf_install_in_context(dst_ctx, event, dst_cpu);
                get_ctx(dst_ctx);
        }

        /*
         * Once all the siblings are setup properly, install the group leaders
         * to make it go.
         */
        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
                list_del(&event->migrate_entry);
                if (event->state >= PERF_EVENT_STATE_OFF)
                        event->state = PERF_EVENT_STATE_INACTIVE;
                account_event_cpu(event, dst_cpu);
                perf_install_in_context(dst_ctx, event, dst_cpu);
                get_ctx(dst_ctx);
        }
        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

/*
 * Fold the count and the enabled/running times of @child_event into the
 * child_* accumulators of its parent event. When attr.inherit_stat is
 * set, perf_event_read_event() is called for @child first.
 */
static void sync_child_event(struct perf_event *child_event,
                               struct task_struct *child)
{
        struct perf_event *parent = child_event->parent;
        u64 count;

        if (child_event->attr.inherit_stat)
                perf_event_read_event(child_event, child);

        /* Add back the child's count to the parent's count. */
        count = perf_event_count(child_event);
        atomic64_add(count, &parent->child_count);

        /* Same for the time accounting. */
        atomic64_add(child_event->total_time_enabled,
                     &parent->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent->child_total_time_running);
}

/*
 * Remove @child_event from @child_ctx, the context of the exiting task
 * @child.
 *
 * Every event is unlinked from the context and put into
 * PERF_EVENT_STATE_EXIT. Events without a ->parent are otherwise left
 * alone (only a wakeup is issued). Inherited events (->parent set) also
 * have their counts folded into the parent, are removed from the parent's
 * child_list and are freed; put_event() is then called on the parent.
 */
static void
perf_event_exit_event(struct perf_event *child_event,
                      struct perf_event_context *child_ctx,
                      struct task_struct *child)
{
        struct perf_event *parent_event = child_event->parent;

        /*
         * Do not destroy the 'original' grouping; because of the context
         * switch optimization the original events could've ended up in a
         * random child task.
         *
         * If we were to destroy the original group, all group related
         * operations would cease to function properly after this random
         * child dies.
         *
         * Do destroy all inherited groups, we don't care about those
         * and being thorough is better.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        WARN_ON_ONCE(child_ctx->is_active);

        /* Only inherited events are detached from their group. */
        if (parent_event)
                perf_group_detach(child_event);
        list_del_event(child_event, child_ctx);
        perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
        raw_spin_unlock_irq(&child_ctx->lock);

        /*
         * Parent events are governed by their filedesc, retain them.
         */
        if (!parent_event) {
                perf_event_wakeup(child_event);
                return;
        }
        /*
         * Child events can be cleaned up.
         */

        sync_child_event(child_event, child);

        /*
         * Remove this event from the parent's list
         */
        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
        mutex_lock(&parent_event->child_mutex);
        list_del_init(&child_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        /*
         * Kick perf_poll() for is_event_hup().
         */
        perf_event_wakeup(parent_event);
        free_event(child_event);
        /* Presumably pairs with the parent refcount taken in inherit_event(). */
        put_event(parent_event);
}

/*
 * Tear down the perf context in slot @ctxn of the exiting task @child.
 * @child is expected to be current (a warning is raised otherwise).
 *
 * The context is pinned, scheduled out, detached from the task and
 * marked dead (ctx->task = TASK_TOMBSTONE); then every event on it is
 * passed to perf_event_exit_event(). Nothing is done when the slot is
 * empty.
 */
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;

        WARN_ON_ONCE(child != current);

        child_ctx = perf_pin_task_context(child, ctxn);
        if (!child_ctx)
                return;

        /*
         * In order to reduce the amount of tricky in ctx tear-down, we hold
         * ctx::mutex over the entire thing. This serializes against almost
         * everything that wants to access the ctx.
         *
         * The exception is sys_perf_event_open() /
         * perf_event_create_kernel_counter() which does find_get_context()
         * without ctx::mutex (it cannot because of the move_group double mutex
         * lock thing). See the comments in perf_install_in_context().
         */
        mutex_lock(&child_ctx->mutex);

        /*
         * In a single ctx::lock section, de-schedule the events and detach the
         * context from the task such that we cannot ever get it scheduled back
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);

        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
        RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */

        /* If the context was a clone, that link is dropped outside the lock. */
        clone_ctx = unclone_ctx(child_ctx);
        raw_spin_unlock_irq(&child_ctx->lock);

        if (clone_ctx)
                put_ctx(clone_ctx);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /* _safe iteration: perf_event_exit_event() unlinks each event. */
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
                perf_event_exit_event(child_event, child_ctx, child);

        mutex_unlock(&child_ctx->mutex);

        /* Final put; presumably drops the reference from the pin above. */
        put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 *
 * Three steps are performed for @child:
 *  1. every event on child->perf_event_list is unlinked and has its
 *     ->owner cleared, under child->perf_event_mutex;
 *  2. each task context slot is torn down via
 *     perf_event_exit_task_context();
 *  3. perf_event_task() is called once more with a NULL context.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *event, *tmp;
        int ctxn;

        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
                                 owner_entry) {
                list_del_init(&event->owner_entry);

                /*
                 * Ensure the list deletion is visible before we clear
                 * the owner, closes a race against perf_release() where
                 * we need to serialize on the owner->perf_event_mutex.
                 */
                smp_store_release(&event->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);

        for_each_task_context_nr(ctxn)
                perf_event_exit_task_context(child, ctxn);

        /*
         * The perf_event_exit_task_context calls perf_event_task
         * with child's task_ctx, which generates EXIT events for
         * child contexts and sets child->perf_event_ctxp[] to NULL.
         * At this point we need to send EXIT events to cpu contexts.
         */
        perf_event_task(child, NULL, 0);
}

/*
 * Free one inherited @event that lives in @ctx.
 *
 * Only events with a parent are expected here; an event without one
 * triggers a warning and is left untouched. The event is removed from the
 * parent's child_list, put_event() is called on the parent, and the event
 * is then detached from its group and context under ctx->lock before
 * being freed.
 */
static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
{
        struct perf_event *parent = event->parent;

        if (WARN_ON_ONCE(!parent))
                return;

        mutex_lock(&parent->child_mutex);
        list_del_init(&event->child_list);
        mutex_unlock(&parent->child_mutex);

        /* Presumably pairs with the parent refcount taken in inherit_event(). */
        put_event(parent);

        raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
        free_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 *
 * For every populated context slot of @task: the slot is cleared and the
 * context marked dead, all of its events are released through
 * perf_free_event(), and the context itself is put once its refcount has
 * dropped to 1.
 */
void perf_event_free_task(struct task_struct *task)
{
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;
        int ctxn;

        for_each_task_context_nr(ctxn) {
                ctx = task->perf_event_ctxp[ctxn];
                if (!ctx)
                        continue;

                mutex_lock(&ctx->mutex);
                raw_spin_lock_irq(&ctx->lock);
                /*
                 * Destroy the task <-> ctx relation and mark the context dead.
                 *
                 * This is important because even though the task hasn't been
                 * exposed yet the context has been (through child_list).
                 */
                RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
                WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
                put_task_struct(task); /* cannot be last */
                raw_spin_unlock_irq(&ctx->lock);

                /* _safe iteration: perf_free_event() unlinks each event. */
                list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
                        perf_free_event(event, ctx);

                mutex_unlock(&ctx->mutex);

                /*
                 * perf_event_release_kernel() could've stolen some of our
                 * child events and still have them on its free_list. In that
                 * case we must wait for these events to have been freed (in
                 * particular all their references to this task must've been
                 * dropped).
                 *
                 * Without this copy_process() will unconditionally free this
                 * task (irrespective of its reference count) and
                 * _free_event()'s put_task_struct(event->hw.target) will be a
                 * use-after-free.
                 *
                 * Wait for all events to drop their context reference.
                 */
                wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
                put_ctx(ctx); /* must be last */
        }
}

void perf_event_delayed_put(struct task_struct *task)
{
        int ctxn;

        for_each_task_context_nr(ctxn)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

/*
 * Look up @fd and return its file only if it is a perf event file, i.e.
 * its f_op is perf_fops. The reference obtained by fget() is kept on
 * success and dropped again on mismatch.
 *
 * Returns the file, or ERR_PTR(-EBADF) when @fd is not open or is not a
 * perf event.
 */
struct file *perf_event_get(unsigned int fd)
{
        struct file *filp = fget(fd);

        if (filp && filp->f_op == &perf_fops)
                return filp;

        if (filp)
                fput(filp);

        return ERR_PTR(-EBADF);
}

/*
 * Return the perf event stored in @file's private_data, or
 * ERR_PTR(-EINVAL) when @file is not a perf event file (f_op is not
 * perf_fops).
 */
const struct perf_event *perf_get_event(struct file *file)
{
        return file->f_op == &perf_fops ? file->private_data
                                        : ERR_PTR(-EINVAL);
}

/*
 * Return a pointer to the attributes embedded in @event, or
 * ERR_PTR(-EINVAL) when @event is NULL.
 */
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return event ? &event->attr : ERR_PTR(-EINVAL);
}

/*
 * Inherit an event from parent task to child task.
 *
 * A new event is allocated for @child from the attributes of
 * @parent_event (or of its own parent, if it is itself inherited), linked
 * into @child_ctx and added to the original parent's child_list.
 * @group_leader is handed to perf_event_alloc() as the new event's group
 * leader. @parent and @parent_ctx are not used by the body.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
{
        /* Sampled before parent_event is possibly redirected below. */
        enum perf_event_state parent_state = parent_event->state;
        struct perf_event *child_event;
        unsigned long flags;

        /*
         * Instead of creating recursive hierarchies of events,
         * we link inherited events back to the original parent,
         * which has a filp for sure, which we use as the reference
         * count:
         */
        if (parent_event->parent)
                parent_event = parent_event->parent;

        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
                                           NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;


        /*
         * Allocate the context's task_ctx_data on first use, i.e. when the
         * event has PERF_ATTACH_TASK_DATA set and the context has none yet.
         */
        if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
            !child_ctx->task_ctx_data) {
                struct pmu *pmu = child_event->pmu;

                child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
                if (!child_ctx->task_ctx_data) {
                        free_event(child_event);
                        return ERR_PTR(-ENOMEM);
                }
        }

        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
         * must be under the same lock in order to serialize against
         * perf_event_release_kernel(), such that either we must observe
         * is_orphaned_event() or they will observe us on the child_list.
         */
        mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
                mutex_unlock(&parent_event->child_mutex);
                /* task_ctx_data is freed with child_ctx */
                free_event(child_event);
                return NULL;
        }

        /* The child event holds a reference on its context. */
        get_ctx(child_ctx);

        /*
         * Make the child state follow the state of the parent event,
         * not its attr.disabled bit.  We hold the parent's mutex,
         * so we won't race with perf_event_{en, dis}able_family.
         */
        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
                child_event->state = PERF_EVENT_STATE_INACTIVE;
        else
                child_event->state = PERF_EVENT_STATE_OFF;

        /* In freq mode, start the child from the parent's current period. */
        if (parent_event->attr.freq) {
                u64 sample_period = parent_event->hw.sample_period;
                struct hw_perf_event *hwc = &child_event->hw;

                hwc->sample_period = sample_period;
                hwc->last_period   = sample_period;

                local64_set(&hwc->period_left, sample_period);
        }

        child_event->ctx = child_ctx;
        child_event->overflow_handler = parent_event->overflow_handler;
        child_event->overflow_handler_context
                = parent_event->overflow_handler_context;

        /*
         * Precalculate sample_data sizes
         */
        perf_event__header_size(child_event);
        perf_event__id_header_size(child_event);

        /*
         * Link it up in the child's context:
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Link this into the parent event's child list
         */
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        return child_event;
}

/*
 * Inherits an event group.
 *
 * The group leader @parent_event is inherited first, then each of its
 * siblings with the freshly created leader as their group leader.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Nothing is undone here on failure; events already created stay linked
 * in @child_ctx.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event_context *child_ctx)
{
        struct perf_event *leader;
        struct perf_event *sub;
        struct perf_event *child_ctr;

        leader = inherit_event(parent_event, parent, parent_ctx,
                                 child, NULL, child_ctx);
        if (IS_ERR(leader))
                return PTR_ERR(leader);
        /*
         * @leader can be NULL here because of is_orphaned_event(). In this
         * case inherit_event() will create individual events, similar to what
         * perf_group_detach() would do anyway.
         */
        for_each_sibling_event(sub, parent_event) {
                child_ctr = inherit_event(sub, parent, parent_ctx,
                                            child, leader, child_ctx);
                if (IS_ERR(child_ctr))
                        return PTR_ERR(child_ctr);

                /*
                 * If the parent sibling used the parent leader as its
                 * aux_event, call perf_get_aux_event() for the inherited
                 * pair; a failure there is reported as -EINVAL.
                 */
                if (sub->aux_event == parent_event && child_ctr &&
                    !perf_get_aux_event(child_ctr, leader))
                        return -EINVAL;
        }
        /* Carry the parent's group generation over to the new leader. */
        if (leader)
                leader->group_generation = parent_event->group_generation;
        return 0;
}

/*
 * Inherit the group led by @event into @child's context slot @ctxn,
 * allocating that context first if the slot is still empty.
 *
 * @inherited_all is cleared when @event has !attr.inherit or when
 * inherit_group() fails. It is left set when an orphaned event is quietly
 * skipped; this is consistent with perf_event_release_kernel() removing
 * all child events.
 *
 * Returns:
 *  - 0 on success (including the !attr.inherit case)
 *  - -ENOMEM if the child context cannot be allocated
 *  - <0 as returned by inherit_group() on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child, int ctxn,
                   int *inherited_all)
{
        struct perf_event_context *child_ctx;
        int err;

        if (!event->attr.inherit) {
                *inherited_all = 0;
                return 0;
        }

        child_ctx = child->perf_event_ctxp[ctxn];
        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
                 * inherit events that have been marked for cloning.
                 * First allocate and initialize a context for the
                 * child.
                 */
                child_ctx = alloc_perf_context(parent_ctx->pmu, child);
                if (!child_ctx)
                        return -ENOMEM;

                child->perf_event_ctxp[ctxn] = child_ctx;
        }

        err = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (!err)
                return 0;

        *inherited_all = 0;
        return err;
}

/*
 * Initialize the perf_event context in task_struct
 *
 * Inherits the events of the current task's context in slot @ctxn into
 * @child. Pinned groups are inherited first, then flexible groups. If
 * every group could be inherited, the child context is additionally
 * marked as a clone of the parent context (or of whatever the parent is
 * a clone of).
 *
 * Rotation of the parent context is disabled while its flexible groups
 * are walked and is re-enabled on both the success and the error path.
 *
 * Returns 0 on success (including when the parent has no context in this
 * slot), or the negative error from inherit_task_group().
 */
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
        int inherited_all = 1;
        unsigned long flags;
        int ret = 0;

        if (likely(!parent->perf_event_ctxp[ctxn]))
                return 0;

        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent, ctxn);
        if (!parent_ctx)
                return 0;

        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
         * is if we exit, and since we're currently in the middle of
         * a fork we can't be exiting at the same time.
         */

        /*
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
        mutex_lock(&parent_ctx->mutex);

        /*
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
                        goto out_unlock;
        }

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
         * to allocations, but we need to prevent rotation because
         * rotate_ctx() will change the list from interrupt context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                /*
                 * Do not bail out straight to out_unlock: rotation was
                 * disabled above and must be re-enabled on the error path
                 * as well, otherwise rotate_disable would stay set on the
                 * parent context.
                 */
                if (ret)
                        goto out_rotate;
        }

out_rotate:
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp[ctxn];

        /* Only a fully and successfully inherited context becomes a clone. */
        if (!ret && child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
                 *
                 * Note that if the parent is a clone, the holding of
                 * parent_ctx->lock avoids it from being uncloned.
                 */
                cloned_ctx = parent_ctx->parent_ctx;
                if (cloned_ctx) {
                        child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
                        child_ctx->parent_gen = parent_ctx->generation;
                }
                get_ctx(child_ctx->parent_ctx);
        }

        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
        mutex_unlock(&parent_ctx->mutex);

        perf_unpin_context(parent_ctx);
        put_ctx(parent_ctx);

        return ret;
}

/*
 * Set up the perf state of a new task @child: clear its context slots,
 * initialise its event mutex and event list, then inherit each task
 * context from the current task.
 *
 * On the first failing context everything set up so far is released
 * through perf_event_free_task() and the error is returned. Returns 0 on
 * success.
 */
int perf_event_init_task(struct task_struct *child)
{
        int nr, err;

        memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);

        for_each_task_context_nr(nr) {
                err = perf_event_init_context(child, nr);
                if (!err)
                        continue;

                perf_event_free_task(child);
                return err;
        }

        return 0;
}

/*
 * Boot-time set-up of the per-cpu perf bookkeeping: allocate the zeroed
 * perf_online_mask and, for every possible cpu, initialise the software
 * event hash mutex and the per-cpu lists and lock.
 */
static void __init perf_event_init_all_cpus(void)
{
        int cpu;

        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

        for_each_possible_cpu(cpu) {
                struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

                mutex_init(&swhash->hlist_mutex);
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
                INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
}

/*
 * Make sure @cpu has a software event hlist when one is needed, i.e. when
 * the hash table's hlist_refcount is positive but no hlist is installed.
 * An allocation failure only triggers a warning; a NULL hlist is then
 * published.
 */
static void perf_swevent_init_cpu(unsigned int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *hlist;

        mutex_lock(&swhash->hlist_mutex);

        /* Nothing to do without users or with an hlist already in place. */
        if (swhash->hlist_refcount <= 0 || swevent_hlist_deref(swhash))
                goto unlock;

        hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
        WARN_ON(!hlist);
        rcu_assign_pointer(swhash->swevent_hlist, hlist);

unlock:
        mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
/*
 * Cross-call handler, run through smp_call_function_single() by
 * perf_event_exit_cpu_context(). @__info is the perf_event_context to
 * empty.
 *
 * Under ctx->lock, ctx_sched_out() is called with EVENT_TIME and every
 * event on the context's event_list is then removed with DETACH_GROUP.
 *
 * NOTE(review): the non-_safe list iterator is used while entries are
 * being removed; presumably the removal leaves the iteration link intact
 * - confirm against list_del_event().
 */
static void __perf_event_exit_context(void *__info)
{
        struct perf_event_context *ctx = __info;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *event;

        raw_spin_lock(&ctx->lock);
        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Take @cpu out of service for perf: for every registered pmu, empty the
 * pmu's cpu context on @cpu and mark it offline, then clear @cpu from
 * perf_online_mask. The whole walk runs under pmus_lock.
 */
static void perf_event_exit_cpu_context(int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
        struct pmu *pmu;

        mutex_lock(&pmus_lock);
        list_for_each_entry(pmu, &pmus, entry) {
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;

                /*
                 * ctx->mutex is the lock perf_event_create_kernel_counter()
                 * holds when it checks cpuctx->online.
                 */
                mutex_lock(&ctx->mutex);
                /* Last argument is the wait flag: block until the call ran. */
                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
                cpuctx->online = 0;
                mutex_unlock(&ctx->mutex);
        }
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
}
#else

/* No-op stub used when neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC_CORE is set. */
static void perf_event_exit_cpu_context(int cpu) { }

#endif

/*
 * Bring @cpu into service for perf: set up its software event hlist, set
 * it in perf_online_mask and mark the cpu context of every registered pmu
 * online. Always returns 0.
 */
int perf_event_init_cpu(unsigned int cpu)
{
        struct pmu *pmu;

        perf_swevent_init_cpu(cpu);

        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
        list_for_each_entry(pmu, &pmus, entry) {
                struct perf_cpu_context *cpuctx =
                        per_cpu_ptr(pmu->pmu_cpu_context, cpu);

                /* The online flag is only changed under the context mutex. */
                mutex_lock(&cpuctx->ctx.mutex);
                cpuctx->online = 1;
                mutex_unlock(&cpuctx->ctx.mutex);
        }
        mutex_unlock(&pmus_lock);

        return 0;
}

/*
 * Counterpart of perf_event_init_cpu(): hands @cpu to
 * perf_event_exit_cpu_context(). Always returns 0.
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);
        return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
        int cpu;

        for_each_online_cpu(cpu)
                perf_event_exit_cpu(cpu);

        return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 *
 * This is why the priority is INT_MIN. The block is registered from
 * perf_event_init().
 */
static struct notifier_block perf_reboot_notifier = {
        .notifier_call = perf_reboot,
        .priority = INT_MIN,
};

/*
 * Boot-time initialisation of the perf core.
 *
 * Sets up the pmu idr and the per-cpu bookkeeping, registers the built-in
 * software, cpu-clock and task-clock pmus plus the tracepoint pmu, brings
 * the boot cpu online for perf, installs the reboot notifier and
 * initialises hardware breakpoints. A hw_breakpoint failure only produces
 * a warning.
 */
void __init perf_event_init(void)
{
        int ret;

        idr_init(&pmu_idr);

        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        /* Return values of the registrations below are not checked. */
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
        perf_pmu_register(&perf_cpu_clock, NULL, -1);
        perf_pmu_register(&perf_task_clock, NULL, -1);
        perf_tp_register();
        perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);

        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
         */
        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
                     != 1024);
}

/*
 * sysfs show callback for pmu event attributes.
 *
 * @attr is expected to be embedded in a struct perf_pmu_events_attr. If
 * that attribute carries an event string, it is written to @page followed
 * by a newline and the number of bytes written is returned; otherwise
 * nothing is written and 0 is returned. @dev is not used.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct perf_pmu_events_attr *pmu_attr;

        pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
        if (!pmu_attr->event_str)
                return 0;

        return sprintf(page, "%s\n", pmu_attr->event_str);
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

/*
 * Initcall that registers the pmu bus and creates a device for every pmu
 * already on the pmus list that has a name and a non-negative type.
 *
 * A failing pmu_dev_alloc() only produces a warning; the walk continues
 * and the function still returns 0. Only a bus_register() failure is
 * returned to the caller.
 */
static int __init perf_event_sysfs_init(void)
{
        struct pmu *pmu;
        int ret;

        mutex_lock(&pmus_lock);

        ret = bus_register(&pmu_bus);
        if (ret)
                goto unlock;

        list_for_each_entry(pmu, &pmus, entry) {
                /* Skip pmus without a name or with a negative type. */
                if (!pmu->name || pmu->type < 0)
                        continue;

                ret = pmu_dev_alloc(pmu);
                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
        }
        pmu_bus_running = 1;
        /* Per-pmu failures were already warned about; do not propagate them. */
        ret = 0;

unlock:
        mutex_unlock(&pmus_lock);

        return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
/*
 * css_alloc callback: allocate a zeroed perf_cgroup together with its
 * per-cpu info area. @parent_css is not used.
 *
 * Returns the embedded css on success, ERR_PTR(-ENOMEM) if either
 * allocation fails.
 */
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct perf_cgroup *cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);

        if (!cgrp)
                goto err;

        cgrp->info = alloc_percpu(struct perf_cgroup_info);
        if (!cgrp->info)
                goto err_free;

        return &cgrp->css;

err_free:
        kfree(cgrp);
err:
        return ERR_PTR(-ENOMEM);
}

/*
 * Release a perf_cgroup: first its per-CPU info area, then the structure
 * that embeds @css.
 */
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct perf_cgroup *pcg;

        pcg = container_of(css, struct perf_cgroup, css);
        free_percpu(pcg->info);
        kfree(pcg);
}

/*
 * css_online callback: hands the css's cgroup to perf_event_cgroup() and
 * always reports success.
 */
static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
        perf_event_cgroup(css->cgroup);
        return 0;
}

/*
 * Callback passed to task_function_call(): @info is the task being moved.
 * Performs a cgroup switch-out followed by switch-in for that task inside
 * an RCU read-side critical section.  Always returns 0.
 */
static int __perf_cgroup_move(void *info)
{
        struct task_struct *moved = info;
        const int mode = PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN;

        rcu_read_lock();
        perf_cgroup_switch(moved, mode);
        rcu_read_unlock();

        return 0;
}

/*
 * attach callback: for every task in @tset, run __perf_cgroup_move() on
 * that task via task_function_call().
 */
static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        /* Only needed as the iteration cursor of cgroup_taskset_for_each(). */
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
}

/*
 * cgroup subsystem descriptor for perf events: wires the css allocation,
 * release, online and task-attach callbacks defined above.
 */
struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc        = perf_cgroup_css_alloc,
        .css_free        = perf_cgroup_css_free,
        .css_online        = perf_cgroup_css_online,
        .attach                = perf_cgroup_attach,
        /*
         * Implicitly enable on dfl hierarchy so that perf events can
         * always be filtered by cgroup2 path as long as perf_event
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
        .threaded        = true,
};
#endif /* CONFIG_CGROUP_PERF */













    2 
    2 





    2 
    2 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * Inlined binary search over an array of @num elements of @size bytes each,
 * starting at @base.
 *
 * @cmp is called as cmp(key, element); a return of 0 means a match, a
 * positive value continues the search in the elements after the probed one,
 * a negative value in the elements before it.
 *
 * Returns a pointer to a matching element, or NULL if none is found.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        for (; num; num >>= 1) {
                const char *mid = base + (num >> 1) * size;
                int order = cmp(key, mid);

                if (!order)
                        return (void *)mid;

                /* Key sorts before mid: keep base, halve the window. */
                if (order < 0)
                        continue;

                /* Key sorts after mid: skip past it before halving. */
                base = mid + size;
                num--;
        }

        return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DAX_H
#define _LINUX_DAX_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>

/* Flag for synchronous flush */
#define DAXDEV_F_SYNC (1UL << 0)

typedef unsigned long dax_entry_t;

struct iomap_ops;
struct iomap;
struct dax_device;
/*
 * Table of callbacks a DAX device provider supplies.  Each callback takes
 * the dax_device it operates on as its first argument.
 */
struct dax_operations {
        /*
         * direct_access: translate a device-relative
         * logical-page-offset into an absolute physical pfn. Return the
         * number of pages available for DAX at that pfn.
         */
        long (*direct_access)(struct dax_device *, pgoff_t, long,
                        void **, pfn_t *);
        /*
         * Validate whether this device is usable as an fsdax backing
         * device.
         */
        bool (*dax_supported)(struct dax_device *, struct block_device *, int,
                        sector_t, sector_t);
        /* copy_from_iter: required operation for fs-dax direct-i/o */
        size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
                        struct iov_iter *);
        /* copy_to_iter: required operation for fs-dax direct-i/o */
        size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
                        struct iov_iter *);
        /* zero_page_range: required operation; zeroes a range of pages */
        int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
};

extern struct attribute_group dax_attribute_group;

#if IS_ENABLED(CONFIG_DAX)
struct dax_device *dax_get_by_host(const char *host);
struct dax_device *alloc_dax(void *private, const char *host,
                const struct dax_operations *ops, unsigned long flags);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool __dax_synchronous(struct dax_device *dax_dev);
/* CONFIG_DAX enabled: thin inline wrapper around __dax_synchronous(). */
static inline bool dax_synchronous(struct dax_device *dax_dev)
{
        return  __dax_synchronous(dax_dev);
}
void __set_dax_synchronous(struct dax_device *dax_dev);
/* CONFIG_DAX enabled: thin inline wrapper around __set_dax_synchronous(). */
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
        __set_dax_synchronous(dax_dev);
}
bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
                int blocksize, sector_t start, sector_t len);
/*
 * Check if given mapping is supported by the file / underlying device.
 *
 * A mapping without VM_SYNC is always supported.  A VM_SYNC mapping is
 * supported only when the mapped file's inode is DAX and the device
 * reports itself as synchronous.
 */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                             struct dax_device *dax_dev)
{
        return !(vma->vm_flags & VM_SYNC) ||
               (IS_DAX(file_inode(vma->vm_file)) && dax_synchronous(dax_dev));
}
#else
/*
 * Stubs used when CONFIG_DAX is not enabled.  They keep callers compiling
 * without #ifdefs; each one does nothing or returns a fixed value.
 */

/* No DAX devices exist: lookup always yields NULL. */
static inline struct dax_device *dax_get_by_host(const char *host)
{
        return NULL;
}
/* No DAX device can be allocated: always yields NULL. */
static inline struct dax_device *alloc_dax(void *private, const char *host,
                const struct dax_operations *ops, unsigned long flags)
{
        /*
         * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
         * NULL is an error or expected.
         */
        return NULL;
}
/* No-op. */
static inline void put_dax(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void kill_dax(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
}
/* Always reports the write cache as disabled. */
static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return false;
}
/* Always reports synchronous. */
static inline bool dax_synchronous(struct dax_device *dax_dev)
{
        return true;
}
/* No-op. */
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
/* Nothing is supported without CONFIG_DAX. */
static inline bool dax_supported(struct dax_device *dax_dev,
                struct block_device *bdev, int blocksize, sector_t start,
                sector_t len)
{
        return false;
}
/* Without CONFIG_DAX only mappings that do not request VM_SYNC are supported. */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                struct dax_device *dax_dev)
{
        return !(vma->vm_flags & VM_SYNC);
}
#endif

struct writeback_control;
int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
#if IS_ENABLED(CONFIG_FS_DAX)
bool __bdev_dax_supported(struct block_device *bdev, int blocksize);
/* CONFIG_FS_DAX enabled: thin inline wrapper around __bdev_dax_supported(). */
static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize)
{
        return __bdev_dax_supported(bdev, blocksize);
}

bool __generic_fsdax_supported(struct dax_device *dax_dev,
                struct block_device *bdev, int blocksize, sector_t start,
                sector_t sectors);
/*
 * CONFIG_FS_DAX enabled: thin inline wrapper that forwards all arguments
 * unchanged to __generic_fsdax_supported().
 */
static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
                struct block_device *bdev, int blocksize, sector_t start,
                sector_t sectors)
{
        return __generic_fsdax_supported(dax_dev, bdev, blocksize, start,
                        sectors);
}

/* CONFIG_FS_DAX enabled: filesystem-side alias for put_dax(). */
static inline void fs_put_dax(struct dax_device *dax_dev)
{
        put_dax(dax_dev);
}

struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc);

struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
#else
/*
 * Stubs used when CONFIG_FS_DAX is not enabled: nothing is supported and
 * no DAX device or busy page is ever found.
 */

/* Always false: no block device supports fsdax. */
static inline bool bdev_dax_supported(struct block_device *bdev,
                int blocksize)
{
        return false;
}

/* Always false. */
static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
                struct block_device *bdev, int blocksize, sector_t start,
                sector_t sectors)
{
        return false;
}

/* No-op. */
static inline void fs_put_dax(struct dax_device *dax_dev)
{
}

/* Always NULL: no DAX device backs any block device. */
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
        return NULL;
}

/* Always NULL: there are no busy DAX pages. */
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
        return NULL;
}

static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
{
        return NULL;
}

/* !CONFIG_FS_DAX stub: DAX writeback is unavailable, always -EOPNOTSUPP. */
static inline int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_DAX stub: returns an all-ones cookie when the page's mapping
 * belongs to a DAX inode and 0 otherwise.  No lock is actually taken.
 */
static inline dax_entry_t dax_lock_page(struct page *page)
{
        return IS_DAX(page->mapping->host) ? ~0UL : 0;
}

/* !CONFIG_FS_DAX stub: nothing was locked, so nothing to release. */
static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
}
#endif

#if IS_ENABLED(CONFIG_DAX)
int dax_read_lock(void);
void dax_read_unlock(int id);
#else
/* !CONFIG_DAX stub: no lock is taken; the returned id is always 0. */
static inline int dax_read_lock(void)
{
        return 0;
}

/* !CONFIG_DAX stub: no-op counterpart of dax_read_lock(). */
static inline void dax_read_unlock(int id)
{
}
#endif /* CONFIG_DAX */
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                        size_t nr_pages);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);

ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops);
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
                    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
static inline bool dax_mapping(struct address_space *mapping)
{
        return mapping->host && IS_DAX(mapping->host);
}

#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
void hmem_register_device(int target_nid, struct resource *r);
#else
/* !CONFIG_DEV_DAX_HMEM_DEVICES stub: registration is a no-op. */
static inline void hmem_register_device(int target_nid, struct resource *r)
{
}
#endif

#endif
















































































































































































































































































































































































    2 







    2 
    2 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
// SPDX-License-Identifier: GPL-2.0
/* bounce buffer handling for block devices
 *
 * - Split from highmem.c
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>

#include <trace/events/block.h>
#include "blk.h"

#define POOL_SIZE        64
#define ISA_POOL_SIZE        16

static struct bio_set bounce_bio_set, bounce_bio_split;
static mempool_t page_pool, isa_page_pool;

static void init_bounce_bioset(void)
{
        static bool bounce_bs_setup;
        int ret;

        if (bounce_bs_setup)
                return;

        ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
        BUG_ON(ret);
        if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
                BUG_ON(1);

        ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
        BUG_ON(ret);
        bounce_bs_setup = true;
}

#if defined(CONFIG_HIGHMEM)
/*
 * Boot-time setup of the highmem bounce page pool (POOL_SIZE pages) and of
 * the bounce bio sets.  Always returns 0; a pool allocation failure is
 * fatal (BUG).
 */
static __init int init_emergency_pool(void)
{
        int err;

#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
        /* max_pfn does not exceed max_low_pfn: skip creating the pool. */
        if (max_low_pfn >= max_pfn)
                return 0;
#endif

        err = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
        BUG_ON(err);
        pr_info("pool size: %d pages\n", POOL_SIZE);

        init_bounce_bioset();

        return 0;
}

__initcall(init_emergency_pool);
#endif

#ifdef CONFIG_HIGHMEM
/*
 * highmem version, map in to vec
 */
/*
 * Copy to->bv_len bytes from @vfrom into the page described by @to, at
 * offset to->bv_offset.  The destination page is mapped with kmap_atomic()
 * for the duration of the copy.
 */
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
        unsigned char *dst_page = kmap_atomic(to->bv_page);
        unsigned char *dst = dst_page + to->bv_offset;

        memcpy(dst, vfrom, to->bv_len);
        kunmap_atomic(dst_page);
}

#else /* CONFIG_HIGHMEM */

/*
 * !CONFIG_HIGHMEM version: the destination page is addressed directly via
 * page_address(), so a plain memcpy() of (to)->bv_len bytes suffices.
 * Note that @to is evaluated more than once.
 */
#define bounce_copy_vec(to, vfrom)        \
        memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)

#endif /* CONFIG_HIGHMEM */

/*
 * allocate pages in the DMA region for the ISA pool
 */
/*
 * mempool allocation callback for the ISA pool: same as
 * mempool_alloc_pages() but with GFP_DMA added to the caller's mask.
 */
static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
        return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}

static DEFINE_MUTEX(isa_mutex);

/*
 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
 * as the max address, so check if the pool has already been created.
 */
int init_emergency_isa_pool(void)
{
        int ret;

        mutex_lock(&isa_mutex);

        if (mempool_initialized(&isa_page_pool)) {
                mutex_unlock(&isa_mutex);
                return 0;
        }

        ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
                           mempool_free_pages, (void *) 0);
        BUG_ON(ret);

        pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
        init_bounce_bioset();
        mutex_unlock(&isa_mutex);
        return 0;
}

/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 */
/*
 * Copy data from the bounce bio @from into the bio @to.
 *
 * Both bios are walked in lockstep: @to with its own iterator, @from with a
 * fresh iterator covering all of its bvecs.  Only segments whose page
 * differs between the two bios are copied.
 */
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
        unsigned char *vfrom;
        struct bio_vec tovec, fromvec;
        struct bvec_iter iter;
        /*
         * The bio of @from is created by bounce, so we can iterate
         * its bvec from start to end, but the @from->bi_iter can't be
         * trusted because it might be changed by splitting.
         */
        struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;

        bio_for_each_segment(tovec, to, iter) {
                fromvec = bio_iter_iovec(from, from_iter);
                /* Same page on both sides: this segment has no separate copy. */
                if (tovec.bv_page != fromvec.bv_page) {
                        /*
                         * fromvec->bv_offset and fromvec->bv_len might have
                         * been modified by the block layer, so use the original
                         * copy, bounce_copy_vec already uses tovec->bv_len
                         */
                        vfrom = page_address(fromvec.bv_page) +
                                tovec.bv_offset;

                        bounce_copy_vec(&tovec, vfrom);
                        flush_dcache_page(tovec.bv_page);
                }
                /* Keep the @from iterator in step with the segment just handled. */
                bio_advance_iter(from, &from_iter, tovec.bv_len);
        }
}

/*
 * Common completion path for a bounce bio.
 *
 * @bio:  the bounce bio; bio->bi_private holds the original bio.
 * @pool: the mempool the bounce pages are returned to.
 *
 * Returns every substituted page to @pool, copies the completion status to
 * the original bio, completes the original and drops the bounce bio.
 */
static void bounce_end_io(struct bio *bio, mempool_t *pool)
{
        struct bio *bio_orig = bio->bi_private;
        struct bio_vec *bvec, orig_vec;
        /* Private copy of the original's iterator; bio_orig->bi_iter is not modified. */
        struct bvec_iter orig_iter = bio_orig->bi_iter;
        struct bvec_iter_all iter_all;

        /*
         * free up bounce indirect pages used
         */
        bio_for_each_segment_all(bvec, bio, iter_all) {
                orig_vec = bio_iter_iovec(bio_orig, orig_iter);
                /* A page that differs from the original's was substituted by the bounce code. */
                if (bvec->bv_page != orig_vec.bv_page) {
                        dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
                        mempool_free(bvec->bv_page, pool);
                }
                bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
        }

        bio_orig->bi_status = bio->bi_status;
        bio_endio(bio_orig);
        bio_put(bio);
}

/* Write completion for bios bounced through the regular page pool. */
static void bounce_end_io_write(struct bio *bio)
{
        bounce_end_io(bio, &page_pool);
}

/* Write completion for bios bounced through the ISA page pool. */
static void bounce_end_io_write_isa(struct bio *bio)
{

        bounce_end_io(bio, &isa_page_pool);
}

/*
 * Read completion for a bounce bio: when the I/O finished without error,
 * copy the data into the original bio's pages, then run the common
 * completion path which frees the bounce pages back to @pool.
 */
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
{
        struct bio *orig = bio->bi_private;
        bool io_ok = !bio->bi_status;

        if (io_ok)
                copy_to_high_bio_irq(orig, bio);

        bounce_end_io(bio, pool);
}

/* Read completion for bios bounced through the regular page pool. */
static void bounce_end_io_read(struct bio *bio)
{
        __bounce_end_io_read(bio, &page_pool);
}

/* Read completion for bios bounced through the ISA page pool. */
static void bounce_end_io_read_isa(struct bio *bio)
{
        __bounce_end_io_read(bio, &isa_page_pool);
}

/*
 * Allocate a new bio from @bs and make it a clone of @bio_src with its own
 * bvec table.
 *
 * @bio_src:  bio to clone.
 * @gfp_mask: allocation flags for the bio and for the crypt/integrity clones.
 * @bs:       bio set to allocate from.
 *
 * Returns the clone, or NULL if the bio allocation or the crypt/integrity
 * cloning fails (a partially built clone is released with bio_put()).
 */
static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
                struct bio_set *bs)
{
        struct bvec_iter iter;
        struct bio_vec bv;
        struct bio *bio;

        /*
         * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
         * bio_src->bi_io_vec to bio->bi_io_vec.
         *
         * We can't do that anymore, because:
         *
         *  - The point of cloning the biovec is to produce a bio with a biovec
         *    the caller can modify: bi_idx and bi_bvec_done should be 0.
         *
         *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
         *    we tried to clone the whole thing bio_alloc_bioset() would fail.
         *    But the clone should succeed as long as the number of biovecs we
         *    actually need to allocate is fewer than BIO_MAX_PAGES.
         *
         *  - Lastly, bi_vcnt should not be looked at or relied upon by code
         *    that does not own the bio - reason being drivers don't use it for
         *    iterating over the biovec anymore, so expecting it to be kept up
         *    to date (i.e. for clones that share the parent biovec) is just
         *    asking for trouble and would force extra work on
         *    __bio_clone_fast() anyways.
         */

        bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
        if (!bio)
                return NULL;
        /* Carry over target disk, operation/flags, priority, hint and position. */
        bio->bi_disk                = bio_src->bi_disk;
        bio->bi_opf                = bio_src->bi_opf;
        bio->bi_ioprio                = bio_src->bi_ioprio;
        bio->bi_write_hint        = bio_src->bi_write_hint;
        bio->bi_iter.bi_sector        = bio_src->bi_iter.bi_sector;
        bio->bi_iter.bi_size        = bio_src->bi_iter.bi_size;

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                /* No bvecs are copied for these operations. */
                break;
        case REQ_OP_WRITE_SAME:
                /* Only the first bvec of the source is copied. */
                bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
                break;
        default:
                /* Copy each remaining segment of the source into the new table. */
                bio_for_each_segment(bv, bio_src, iter)
                        bio->bi_io_vec[bio->bi_vcnt++] = bv;
                break;
        }

        if (bio_crypt_clone(bio, bio_src, gfp_mask) < 0)
                goto err_put;

        if (bio_integrity(bio_src) &&
            bio_integrity_clone(bio, bio_src, gfp_mask) < 0)
                goto err_put;

        bio_clone_blkg_association(bio, bio_src);
        blkcg_bio_issue_init(bio);

        return bio;

err_put:
        bio_put(bio);
        return NULL;
}

/*
 * Slow path of bouncing: replace *@bio_orig with a clone in which every
 * page above the queue's bounce_pfn limit is substituted by a page from
 * @pool.
 *
 * @q:        request queue whose limits.bounce_pfn and bounce_gfp are used.
 * @bio_orig: in/out; on return points at the bounce bio when bouncing was
 *            needed, and is left untouched otherwise.
 * @pool:     mempool supplying the bounce pages.
 */
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool)
{
        struct bio *bio;
        int rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, from;
        struct bvec_iter iter;
        unsigned i = 0;
        bool bounce = false;
        int sectors = 0;
        bool passthrough = bio_is_passthrough(*bio_orig);

        /*
         * First pass: count the sectors covered by the first BIO_MAX_PAGES
         * segments and find out whether any page needs bouncing at all.
         */
        bio_for_each_segment(from, *bio_orig, iter) {
                if (i++ < BIO_MAX_PAGES)
                        sectors += from.bv_len >> 9;
                if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn)
                        bounce = true;
        }
        if (!bounce)
                return;

        /*
         * The bio covers more than the counted sectors: split off the
         * leading part, chain it to the remainder, resubmit the remainder
         * and continue bouncing with the split-off part only.
         */
        if (!passthrough && sectors < bio_sectors(*bio_orig)) {
                bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
                bio_chain(bio, *bio_orig);
                submit_bio_noacct(*bio_orig);
                *bio_orig = bio;
        }
        /* NOTE(review): the result of bounce_clone_bio() is used below without a NULL check. */
        bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
                        &bounce_bio_set);

        /*
         * Bvec table can't be updated by bio_for_each_segment_all(),
         * so retrieve bvec from the table directly. This way is safe
         * because the 'bio' is single-page bvec.
         */
        for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
                struct page *page = to->bv_page;

                /* Page is within the limit already: keep it as is. */
                if (page_to_pfn(page) <= q->limits.bounce_pfn)
                        continue;

                to->bv_page = mempool_alloc(pool, q->bounce_gfp);
                inc_zone_page_state(to->bv_page, NR_BOUNCE);

                /* For writes the data must be present in the bounce page before I/O. */
                if (rw == WRITE) {
                        char *vto, *vfrom;

                        flush_dcache_page(page);

                        vto = page_address(to->bv_page) + to->bv_offset;
                        vfrom = kmap_atomic(page) + to->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
                        kunmap_atomic(vfrom);
                }
        }

        trace_block_bio_bounce(q, *bio_orig);

        bio->bi_flags |= (1 << BIO_BOUNCED);

        /* Pick the completion handler matching the pool and the data direction. */
        if (pool == &page_pool) {
                bio->bi_end_io = bounce_end_io_write;
                if (rw == READ)
                        bio->bi_end_io = bounce_end_io_read;
        } else {
                bio->bi_end_io = bounce_end_io_write_isa;
                if (rw == READ)
                        bio->bi_end_io = bounce_end_io_read_isa;
        }

        /* The completion handlers find the original bio through bi_private. */
        bio->bi_private = *bio_orig;
        *bio_orig = bio;
}

/*
 * Entry point for bouncing: decide whether *@bio_orig can need bounce
 * pages at all and, if so, which pool serves them, then hand over to
 * __blk_queue_bounce().
 */
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
{
        mempool_t *pool;

        /* A bio that carries no data has nothing to bounce. */
        if (!bio_has_data(*bio_orig))
                return;

        if (q->bounce_gfp & GFP_DMA) {
                /* ISA bouncing requires the ISA pool to have been set up. */
                BUG_ON(!mempool_initialized(&isa_page_pool));
                pool = &isa_page_pool;
        } else {
                /*
                 * Non-ISA case: when the bounce limit is at or above the
                 * highest pfn in the system no page can exceed it, so skip
                 * walking the segments altogether.
                 */
                if (q->limits.bounce_pfn >= blk_max_pfn)
                        return;
                pool = &page_pool;
        }

        /* Slow path: inspect the segments and bounce where needed. */
        __blk_queue_bounce(q, bio_orig, pool);
}
































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_CPUTIME_H
#define _LINUX_SCHED_CPUTIME_H

#include <linux/sched/signal.h>

/*
 * cputime accounting APIs:
 */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>

/*
 * Fallback for architectures whose <asm/cputime.h> does not provide
 * cputime_to_nsecs(): convert via microseconds.
 */
#ifndef cputime_to_nsecs
# define cputime_to_nsecs(__ct)        \
        (cputime_to_usecs(__ct) * NSEC_PER_USEC)
#endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void task_cputime(struct task_struct *t,
                         u64 *utime, u64 *stime);
extern u64 task_gtime(struct task_struct *t);
#else
/*
 * !CONFIG_VIRT_CPU_ACCOUNTING_GEN: report the task's user and system time
 * by reading t->utime and t->stime directly into @utime and @stime.
 */
static inline void task_cputime(struct task_struct *t,
                                u64 *utime, u64 *stime)
{
        *utime = t->utime;
        *stime = t->stime;
}

/* !CONFIG_VIRT_CPU_ACCOUNTING_GEN: return the task's gtime field as stored. */
static inline u64 task_gtime(struct task_struct *t)
{
        return t->gtime;
}
#endif

#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
/*
 * CONFIG_ARCH_HAS_SCALED_CPUTIME: report the task's scaled user and system
 * time from the dedicated utimescaled/stimescaled fields.
 */
static inline void task_cputime_scaled(struct task_struct *t,
                                       u64 *utimescaled,
                                       u64 *stimescaled)
{
        *utimescaled = t->utimescaled;
        *stimescaled = t->stimescaled;
}
#else
/*
 * Without CONFIG_ARCH_HAS_SCALED_CPUTIME there are no separate scaled
 * values: the plain task_cputime() results are reported instead.
 */
static inline void task_cputime_scaled(struct task_struct *t,
                                       u64 *utimescaled,
                                       u64 *stimescaled)
{
        task_cputime(t, utimescaled, stimescaled);
}
#endif

extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                           u64 *ut, u64 *st);

/*
 * Thread group CPU time accounting.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples);

/*
 * The following are functions that support scheduler-internal time accounting.
 * These functions are generally called at the timer tick.  None of this depends
 * on CONFIG_SCHEDSTATS.
 */

/**
 * get_running_cputimer - return &tsk->signal->cputimer if cputimers are active
 *
 * @tsk:        Pointer to target task.
 */
#ifdef CONFIG_POSIX_TIMERS
static inline
struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
{
        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

        /*
         * Check whether posix CPU timers are active. If not the thread
         * group accounting is not active either. Lockless check.
         */
        if (!READ_ONCE(tsk->signal->posix_cputimers.timers_active))
                return NULL;

        /*
         * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
         * in __exit_signal(), we won't account to the signal struct further
         * cputime consumed by that task, even though the task can still be
         * ticking after __exit_signal().
         *
         * In order to keep a consistent behaviour between thread group cputime
         * and thread group cputimer accounting, lets also ignore the cputime
         * elapsing after __exit_signal() in any thread group timer running.
         *
         * This makes sure that POSIX CPU clocks and timers are synchronized, so
         * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
         * clock delta is behind the expiring timer value.
         */
        if (unlikely(!tsk->sighand))
                return NULL;

        return cputimer;
}
#else
/* !CONFIG_POSIX_TIMERS: there is never a running cputimer. */
static inline
struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
{
        return NULL;
}
#endif

/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @cputime:        Time value by which to increment the utime field of the
 *                thread group's cputimer.
 *
 * If thread group time is being maintained (see get_running_cputimer()),
 * atomically add @cputime to the utime field of @tsk's thread group
 * cputimer; otherwise do nothing.
 */
static inline void account_group_user_time(struct task_struct *tsk,
                                           u64 cputime)
{
        struct thread_group_cputimer *timer = get_running_cputimer(tsk);

        if (timer)
                atomic64_add(cputime, &timer->cputime_atomic.utime);
}

/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @cputime:        Time value by which to increment the stime field of the
 *                thread group's cputimer.
 *
 * If thread group time is being maintained (see get_running_cputimer()),
 * atomically add @cputime to the stime field of @tsk's thread group
 * cputimer; otherwise do nothing.
 */
static inline void account_group_system_time(struct task_struct *tsk,
                                             u64 cputime)
{
        struct thread_group_cputimer *timer = get_running_cputimer(tsk);

        if (timer)
                atomic64_add(cputime, &timer->cputime_atomic.stime);
}

/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @ns:                Time value by which to increment the sum_exec_runtime field
 *                of the thread group's cputimer.
 *
 * If thread group time is being maintained (see get_running_cputimer()),
 * atomically add @ns to the sum_exec_runtime field of @tsk's thread group
 * cputimer; otherwise do nothing.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
                                              unsigned long long ns)
{
        struct thread_group_cputimer *timer = get_running_cputimer(tsk);

        if (timer)
                atomic64_add(ns, &timer->cputime_atomic.sum_exec_runtime);
}

/*
 * Put a prev_cputime into its initial state: both stored times are
 * zeroed and the embedded raw spinlock is initialised.  The body is
 * compiled out when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is defined.
 */
static inline void prev_cputime_init(struct prev_cputime *prev)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        prev->stime = 0;
        prev->utime = 0;
        raw_spin_lock_init(&prev->lock);
#endif
}

/*
 * Declared here, defined elsewhere.  Takes a task and returns an
 * unsigned long long.
 * NOTE(review): presumably the task's accumulated scheduler runtime;
 * confirm the unit and locking rules at the definition.
 */
extern unsigned long long
task_sched_runtime(struct task_struct *task);

#endif /* _LINUX_SCHED_CPUTIME_H */


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sock

#if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SOCK_H

#include <net/sock.h>
#include <net/ipv6.h>
#include <linux/tracepoint.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>

/*
 * Address families known to the symbolic printers in this header.
 * Each entry is wrapped in EM() and the last one in EMe(); what an entry
 * expands to depends on how EM/EMe are defined at the point of use.
 */
#define family_names                        \
                EM(AF_INET)                                \
                EMe(AF_INET6)

/*
 * The protocol traced by inet_sock_set_state.
 * Same EM()/EMe() list layout as family_names; used by
 * show_inet_protocol_name() to print the protocol field symbolically.
 */
#define inet_protocol_names                \
                EM(IPPROTO_TCP)                        \
                EM(IPPROTO_DCCP)                \
                EM(IPPROTO_SCTP)                \
                EMe(IPPROTO_MPTCP)

/*
 * TCP_* state constants in EM()/EMe() list form.  show_tcp_state_name()
 * uses this list to print the oldstate/newstate fields of
 * inet_sock_set_state by name.
 */
#define tcp_state_names                        \
                EM(TCP_ESTABLISHED)                \
                EM(TCP_SYN_SENT)                \
                EM(TCP_SYN_RECV)                \
                EM(TCP_FIN_WAIT1)                \
                EM(TCP_FIN_WAIT2)                \
                EM(TCP_TIME_WAIT)                \
                EM(TCP_CLOSE)                        \
                EM(TCP_CLOSE_WAIT)                \
                EM(TCP_LAST_ACK)                \
                EM(TCP_LISTEN)                        \
                EM(TCP_CLOSING)                        \
                EMe(TCP_NEW_SYN_RECV)

/*
 * SK_MEM_* kinds in EM()/EMe() list form.  show_skmem_kind_names() uses
 * this list to print the kind field of sock_exceed_buf_limit by name.
 */
#define skmem_kind_names                        \
                EM(SK_MEM_SEND)                        \
                EMe(SK_MEM_RECV)

/*
 * enums need to be exported to user space.
 *
 * First pass: EM() and EMe() both expand to TRACE_DEFINE_ENUM(a); so that
 * expanding the four name lists below emits one TRACE_DEFINE_ENUM() per
 * constant.
 */
#undef EM
#undef EMe
#define EM(a)       TRACE_DEFINE_ENUM(a);
#define EMe(a)      TRACE_DEFINE_ENUM(a);

family_names
inet_protocol_names
tcp_state_names
skmem_kind_names

/*
 * Second pass: redefine EM()/EMe() so the same lists expand to
 * { value, "name" } initialisers.  EM() adds a trailing comma, EMe() (the
 * last entry of each list) does not.  The show_*() macros that follow
 * rely on these definitions, so the order of this sequence matters.
 */
#undef EM
#undef EMe
#define EM(a)       { a, #a },
#define EMe(a)      { a, #a }

/*
 * Symbolic printers for TP_printk(): each passes @val together with the
 * matching { value, "name" } list to __print_symbolic().
 */

/* Print an address family using the family_names list. */
#define show_family_name(val)                        \
        __print_symbolic(val, family_names)

/* Print an IP protocol using the inet_protocol_names list. */
#define show_inet_protocol_name(val)    \
        __print_symbolic(val, inet_protocol_names)

/* Print a TCP state using the tcp_state_names list. */
#define show_tcp_state_name(val)        \
        __print_symbolic(val, tcp_state_names)

/* Print a socket memory kind using the skmem_kind_names list. */
#define show_skmem_kind_names(val)        \
        __print_symbolic(val, skmem_kind_names)

/*
 * sock_rcvqueue_full - record receive-buffer accounting for a socket/skb pair.
 *
 * @sk:  socket whose sk_rmem_alloc and sk_rcvbuf are sampled
 * @skb: buffer whose truesize is recorded
 *
 * Captures sk_rmem_alloc (atomic read), skb->truesize and sk_rcvbuf
 * (READ_ONCE) and prints them as
 * "rmem_alloc=%d truesize=%u sk_rcvbuf=%d".
 * NOTE(review): presumably emitted when an skb is refused because the
 * receive queue is full; confirm at the call site.
 */
TRACE_EVENT(sock_rcvqueue_full,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(int, rmem_alloc)
                __field(unsigned int, truesize)
                __field(int, sk_rcvbuf)
        ),

        TP_fast_assign(
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->truesize   = skb->truesize;
                __entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
        ),

        TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
                __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
);

/*
 * sock_exceed_buf_limit - snapshot protocol and socket memory accounting.
 *
 * @sk:        socket whose rmem/wmem counters are sampled
 * @prot:      protocol supplying the name and the three sysctl_mem values
 * @allocated: value recorded verbatim in the "allocated" field
 * @kind:      SK_MEM_* kind, printed symbolically via show_skmem_kind_names()
 *
 * Records the protocol name, the three sysctl_mem values, @allocated, the
 * results of sk_get_rmem0()/sk_get_wmem0(), sk_rmem_alloc, sk_wmem_alloc,
 * sk_wmem_queued and @kind.
 * NOTE(review): presumably emitted when a socket memory charge exceeds
 * the protocol limits; confirm at the call site.
 */
TRACE_EVENT(sock_exceed_buf_limit,

        TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

        TP_ARGS(sk, prot, allocated, kind),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __array(long, sysctl_mem, 3)
                __field(long, allocated)
                __field(int, sysctl_rmem)
                __field(int, rmem_alloc)
                __field(int, sysctl_wmem)
                __field(int, wmem_alloc)
                __field(int, wmem_queued)
                __field(int, kind)
        ),

        TP_fast_assign(
                /*
                 * The name is printed with %s below, so it must always be
                 * NUL-terminated.  strscpy_pad() guarantees that and still
                 * zero-fills the rest of the field like strncpy() did.
                 * The size comes from the field itself, not a repeated 32.
                 */
                strscpy_pad(__entry->name, prot->name, sizeof(__entry->name));
                __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
                __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
                __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
                __entry->allocated = allocated;
                __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
                __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
                __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
                __entry->kind = kind;
        ),

        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
                __entry->name,
                __entry->sysctl_mem[0],
                __entry->sysctl_mem[1],
                __entry->sysctl_mem[2],
                __entry->allocated,
                __entry->sysctl_rmem,
                __entry->rmem_alloc,
                __entry->sysctl_wmem,
                __entry->wmem_alloc,
                __entry->wmem_queued,
                show_skmem_kind_names(__entry->kind)
        )
);

/*
 * inet_sock_set_state - record a socket state transition.
 *
 * @sk:       socket changing state; its address is stored as skaddr
 * @oldstate: state before the transition
 * @newstate: state after the transition
 *
 * Besides the two states, the event stores the socket family and
 * protocol, the source/destination ports converted with ntohs(), the
 * IPv4 source/destination addresses, and 16-byte IPv6 source/destination
 * addresses.  States, family and protocol are printed symbolically
 * through the show_*() helpers.
 */
TRACE_EVENT(inet_sock_set_state,

        TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),

        TP_ARGS(sk, oldstate, newstate),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(int, oldstate)
                __field(int, newstate)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                struct inet_sock *inet = inet_sk(sk);
                struct in6_addr *pin6;
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->oldstate = oldstate;
                __entry->newstate = newstate;

                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* Store the IPv4 addresses as raw __be32 values in the byte arrays. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /*
                 * AF_INET6 sockets (only when IPv6 is enabled) copy their
                 * own v6 addresses; every other case falls into the block
                 * after the #endif and stores the IPv4 addresses in
                 * v4-mapped IPv6 form.
                 */
#if IS_ENABLED(CONFIG_IPV6)
                if (sk->sk_family == AF_INET6) {
                        pin6 = (struct in6_addr *)__entry->saddr_v6;
                        *pin6 = sk->sk_v6_rcv_saddr;
                        pin6 = (struct in6_addr *)__entry->daddr_v6;
                        *pin6 = sk->sk_v6_daddr;
                } else
#endif
                {
                        pin6 = (struct in6_addr *)__entry->saddr_v6;
                        ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
                        pin6 = (struct in6_addr *)__entry->daddr_v6;
                        ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
                }
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
                        show_family_name(__entry->family),
                        show_inet_protocol_name(__entry->protocol),
                        __entry->sport, __entry->dport,
                        __entry->saddr, __entry->daddr,
                        __entry->saddr_v6, __entry->daddr_v6,
                        show_tcp_state_name(__entry->oldstate),
                        show_tcp_state_name(__entry->newstate))
);

#endif /* _TRACE_SOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









































































































































































































































































































































































































































































































































































































































    2 


































































    1 





























    2 


    2 

    2 
    1 

    2 



    2 






























































































    2 


    2 
















    2 









































































































































































































    2 


























































    2 
    2 
    2 











    1 
    1 



































































    1 


    1 









    1 
















    1 






































    1 










    1 




    1 







    1 



    1 









    1 









    1 


    1 



































    2 









































































    2 







    2 



    2 





    2 




    2 
    2 
    2 







    2 
    2 


    2 







    2 



    2 










    2 













    2 


    2 




    2 


    2 






    2 


    2 




















    1 

    1 
    1 



    1 




































































































    1 



    1 

    1 







    1 
    1 



    1 






    1 


























































    1 



    1 
    1 



    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002                Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>

#include "workqueue_internal.h"

/*
 * Flag bits and tunables private to the workqueue implementation.  Pool
 * flags and worker flags are kept in separate flag words, so their bit
 * positions may overlap (e.g. POOL_DISASSOCIATED and WORKER_IDLE are both
 * 1 << 2).
 */
enum {
        /*
         * worker_pool flags
         *
         * A bound pool is either associated or disassociated with its CPU.
         * While associated (!DISASSOCIATED), all workers are bound to the
         * CPU and none has %WORKER_UNBOUND set and concurrency management
         * is in effect.
         *
         * While DISASSOCIATED, the cpu may be offline and all workers have
         * %WORKER_UNBOUND set and concurrency management disabled, and may
         * be executing on any CPU.  The pool behaves as an unbound one.
         *
         * Note that DISASSOCIATED should be flipped only while holding
         * wq_pool_attach_mutex to avoid changing binding state while
         * worker_attach_to_pool() is in progress.
         */
        POOL_MANAGER_ACTIVE        = 1 << 0,        /* being managed */
        POOL_DISASSOCIATED        = 1 << 2,        /* cpu can't serve workers */

        /* worker flags */
        WORKER_DIE                = 1 << 1,        /* die die die */
        WORKER_IDLE                = 1 << 2,        /* is idle */
        WORKER_PREP                = 1 << 3,        /* preparing to run works */
        WORKER_CPU_INTENSIVE        = 1 << 6,        /* cpu intensive */
        WORKER_UNBOUND                = 1 << 7,        /* worker is unbound */
        WORKER_REBOUND                = 1 << 8,        /* worker was rebound */

        /*
         * Union of the four worker flags below.
         * NOTE(review): presumably a worker with any of these set is not
         * counted as running for concurrency management - confirm against
         * the users of this mask.
         */
        WORKER_NOT_RUNNING        = WORKER_PREP | WORKER_CPU_INTENSIVE |
                                  WORKER_UNBOUND | WORKER_REBOUND,

        NR_STD_WORKER_POOLS        = 2,                /* # standard pools per cpu */

        UNBOUND_POOL_HASH_ORDER        = 6,                /* hashed by pool->attrs */
        BUSY_WORKER_HASH_ORDER        = 6,                /* 64 pointers */

        MAX_IDLE_WORKERS_RATIO        = 4,                /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT        = 300 * HZ,        /* keep idle ones for 5 mins */

        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
                                                /* call for help after 10ms
                                                   (min two ticks) */
        MAYDAY_INTERVAL                = HZ / 10,        /* and then every 100ms */
        CREATE_COOLDOWN                = HZ,                /* time to breathe after a failure */

        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL        = MIN_NICE,
        HIGHPRI_NICE_LEVEL        = MIN_NICE,

        WQ_NAME_LEN                = 24,                /* size of workqueue_struct->name[] */
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * A pool of workers together with the work items queued on it.  The letter
 * in front of each field comment names the exclusion rule that applies to
 * the field (see the list of rules above).
 */
struct worker_pool {
        raw_spinlock_t                lock;                /* the pool lock */
        int                        cpu;                /* I: the associated cpu */
        int                        node;                /* I: the associated node ID */
        int                        id;                /* I: pool ID */
        unsigned int                flags;                /* X: flags */

        unsigned long                watchdog_ts;        /* L: watchdog timestamp */

        struct list_head        worklist;        /* L: list of pending works */

        int                        nr_workers;        /* L: total number of workers */
        int                        nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;        /* X: list of idle workers */
        struct timer_list        idle_timer;        /* L: worker idle timeout */
        struct timer_list        mayday_timer;        /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or is the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker                *manager;        /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */
        struct completion        *detach_completion; /* all workers detached */

        struct ida                worker_ida;        /* worker IDs for task name */

        struct workqueue_attrs        *attrs;                /* I: worker attributes */
        struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
        int                        refcnt;                /* PL: refcnt for unbound pools */

        /*
         * The current concurrency level.  As it's likely to be accessed
         * from other CPUs during try_to_wake_up(), put it in a separate
         * cacheline.
         */
        atomic_t                nr_running ____cacheline_aligned_in_smp;

        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head                rcu;
} ____cacheline_aligned_in_smp;

/*
 * The per-pool workqueue: links one workqueue_struct (@wq) to one
 * worker_pool (@pool).  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned to
 * 1 << WORK_STRUCT_FLAG_BITS bytes, which the __aligned() attribute at
 * the end of the definition enforces.
 */
struct pool_workqueue {
        struct worker_pool        *pool;                /* I: the associated pool */
        struct workqueue_struct *wq;                /* I: the owning workqueue */
        int                        work_color;        /* L: current color */
        int                        flush_color;        /* L: flushing color */
        int                        refcnt;                /* L: reference count */
        int                        nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works, one counter per color */
        int                        nr_active;        /* L: nr of active works */
        int                        max_active;        /* L: max active works */
        struct list_head        inactive_works;        /* L: inactive works */
        struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
        struct list_head        mayday_node;        /* MD: node on wq->maydays */

        /*
         * Release of unbound pwq is punted to system_wq.  See put_pwq()
         * and pwq_unbound_release_workfn() for details.  pool_workqueue
         * itself is also RCU protected so that the first pwq can be
         * determined without grabbing wq->mutex.
         */
        struct work_struct        unbound_release_work;
        struct rcu_head                rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

/*
 * Structure used to wait for workqueue flush.  Each flusher is linked
 * through @list, records the flush color it is waiting for and carries
 * the completion associated with that flush.
 */
struct wq_flusher {
        struct list_head        list;                /* WQ: list of flushers */
        int                        flush_color;        /* WQ: flush color waiting for */
        struct completion        done;                /* flush completion */
};

/*
 * Forward declaration: workqueue_struct only holds a pointer to it
 * (wq_dev, under CONFIG_SYSFS), so the full definition is not needed here.
 */
struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 *
 * The letter in front of each field comment names the exclusion rule that
 * applies to the field (see the list of rules above).
 */
struct workqueue_struct {
        struct list_head        pwqs;                /* WR: all pwqs of this wq */
        struct list_head        list;                /* PR: list of all workqueues */

        struct mutex                mutex;                /* protects this wq */
        int                        work_color;        /* WQ: current work color */
        int                        flush_color;        /* WQ: current flush color */
        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher        *first_flusher;        /* WQ: first flusher */
        struct list_head        flusher_queue;        /* WQ: flush waiters */
        struct list_head        flusher_overflow; /* WQ: flush overflow list */

        struct list_head        maydays;        /* MD: pwqs requesting rescue */
        struct worker                *rescuer;        /* MD: rescue worker */

        int                        nr_drainers;        /* WQ: drain in progress */
        int                        saved_max_active; /* WQ: saved pwq max_active */

        struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
        struct pool_workqueue        *dfl_pwq;        /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
        /* lockdep bookkeeping, present only when CONFIG_LOCKDEP is enabled */
        char                        *lock_name;
        struct lock_class_key        key;
        struct lockdep_map        lockdep_map;
#endif
        char                        name[WQ_NAME_LEN]; /* I: workqueue name */

        /*
         * Destruction of workqueue_struct is RCU protected to allow walking
         * the workqueues list without grabbing wq_pool_mutex.
         * This is used to dump all workqueues from sysrq.
         */
        struct rcu_head                rcu;

        /* hot fields used during command issue, aligned to cacheline */
        unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
        /* flexible array member: must remain the last field of the struct */
        struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

/*
 * NOTE(review): presumably the slab cache pool_workqueues are allocated
 * from - the allocation site is not in this part of the file, confirm.
 */
static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
                                        /* possible CPUs of each node */

/* exposed as module parameter "disable_numa", mode 0444 */
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/*
 * see the comment above the definition of WQ_POWER_EFFICIENT; the default
 * follows CONFIG_WQ_POWER_EFFICIENT_DEFAULT and the value is exposed as
 * module parameter "power_efficient", mode 0444
 */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                        /* can kworkers be created yet? */

static bool wq_numa_enabled;                /* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

/*
 * Global locks.  These are the locks the "PL"/"PR"/"PW"/"PWR", "A" and
 * "MD" exclusion rules above refer to.
 */
static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
static bool workqueue_freezing;                /* PL: have wqs started freezing? */

/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 *
 * The default is true only when CONFIG_DEBUG_WQ_FORCE_RR_CPU is set; the
 * value is exposed as module parameter "debug_force_rr_cpu", mode 0644.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/* the per-cpu worker pools: NR_STD_WORKER_POOLS pools for every CPU */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/*
 * I: attributes used when instantiating standard unbound pools on demand,
 * one entry per standard pool type
 */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: attributes used when instantiating ordered pools on demand,
 * one entry per standard pool type
 */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * The system-wide workqueue pointers.  They are defined here and exported
 * to modules: system_wq through EXPORT_SYMBOL(), all the others through
 * EXPORT_SYMBOL_GPL().  The code that assigns them is not in this part
 * of the file.
 */
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

/* forward declarations; the definitions are not in this part of the file */
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);

/*
 * CREATE_TRACE_POINTS has to be defined before the trace header is
 * included.  NOTE(review): per the kernel tracepoint convention this makes
 * this file the one that instantiates the workqueue tracepoints - confirm.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/*
 * Lockdep check: warn unless the caller is inside an RCU read-side
 * critical section or holds wq_pool_mutex.
 */
#define assert_rcu_or_pool_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                        \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU or wq_pool_mutex should be held")

/*
 * Lockdep check: warn unless the caller is inside an RCU read-side
 * critical section, holds @wq->mutex or holds wq_pool_mutex.
 *
 * @wq is parenthesized in the expansion so that any pointer expression
 * (not just a plain identifier) can be passed safely.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                        \
                         !lockdep_is_held(&(wq)->mutex) &&                \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU, wq->mutex or wq_pool_mutex should be held")

/**
 * for_each_cpu_worker_pool - iterate through the standard worker_pools of a CPU
 * @pool: iteration cursor
 * @cpu: CPU whose cpu_worker_pools[] array is walked
 *
 * Visits the NR_STD_WORKER_POOLS entries of @cpu's cpu_worker_pools[] in
 * index order.  Note that @cpu is evaluated on every iteration.
 */
#define for_each_cpu_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * Walks every entry of worker_pool_idr via idr_for_each_entry().
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  Because the macro ends in a dangling else, the statement that
 * follows it becomes the loop body.
 */
#define for_each_pool(pool, pi)                                                \
        idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * Walks @pool->workers, following each worker's node member.
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  Because the macro ends in a dangling else, the statement that
 * follows it becomes the loop body.
 */
#define for_each_pool_worker(worker, pool)                                \
        list_for_each_entry((worker), &(pool)->workers, node)                \
                if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
                else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * Walks @wq->pwqs, following each pwq's pwqs_node member.
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The locking requirement is checked by list_for_each_entry_rcu() itself:
 * holding @wq->mutex is passed to it as the lockdep condition.  @wq is
 * parenthesized everywhere in the expansion so that any pointer expression
 * can be passed.
 */
#define for_each_pwq(pwq, wq)                                                \
        list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,                \
                                 lockdep_is_held(&(wq)->mutex))

#ifdef CONFIG_DEBUG_OBJECTS_WORK

/*
 * Tentative declaration: the fixup callbacks below pass &work_debug_descr
 * to the debug-object calls before the descriptor itself is defined.
 */
static const struct debug_obj_descr work_debug_descr;

/*
 * debug_hint callback of work_debug_descr: @addr points to a work_struct
 * and the hint returned for it is the work item's ->func.
 */
static void *work_debug_hint(void *addr)
{
        struct work_struct *work = addr;

        return work->func;
}

/*
 * is_static_object callback of work_debug_descr: @addr points to a
 * work_struct; report whether WORK_STRUCT_STATIC_BIT is set in its data
 * bits.
 */
static bool work_is_static_object(void *addr)
{
        return test_bit(WORK_STRUCT_STATIC_BIT,
                        work_data_bits((struct work_struct *)addr));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        /* only an object that is still active gets fixed up */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* cancel the work first, then redo the initialization */
        cancel_work_sync(work);
        debug_object_init(work, &work_debug_descr);
        return true;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        /* only an object that is still active gets fixed up */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* cancel the work first, then let debugobjects drop the object */
        cancel_work_sync(work);
        debug_object_free(work, &work_debug_descr);
        return true;
}

/* debugobjects descriptor tying the work_struct callbacks above together */
static const struct debug_obj_descr work_debug_descr = {
        .name = "work_struct",
        .is_static_object = work_is_static_object,
        .debug_hint = work_debug_hint,
        .fixup_free = work_fixup_free,
        .fixup_init = work_fixup_init,
};

/* Report @work to debugobjects as activated, using the work_struct descriptor. */
static inline void debug_work_activate(struct work_struct *work)
{
        debug_object_activate(work, &work_debug_descr);
}

/* Report @work to debugobjects as deactivated, using the work_struct descriptor. */
static inline void debug_work_deactivate(struct work_struct *work)
{
        debug_object_deactivate(work, &work_debug_descr);
}

/*
 * Register @work with debugobjects.  A non-zero @onstack selects the
 * on-stack flavour of the init call.
 */
void __init_work(struct work_struct *work, int onstack)
{
        if (!onstack) {
                debug_object_init(work, &work_debug_descr);
                return;
        }

        debug_object_init_on_stack(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

/* Tell debugobjects that the on-stack @work is going away. */
void destroy_work_on_stack(struct work_struct *work)
{
        debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

/*
 * Tear down the debug tracking of an on-stack delayed work: first its
 * embedded timer, then the embedded work_struct.
 */
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
        struct work_struct *inner = &work->work;

        destroy_timer_on_stack(&work->timer);
        debug_object_free(inner, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
/* CONFIG_DEBUG_OBJECTS_WORK is not set: the activate/deactivate hooks are no-ops */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
        int id;

        lockdep_assert_held(&wq_pool_mutex);

        /* valid IDs live in [0, WORK_OFFQ_POOL_NONE) */
        id = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                       GFP_KERNEL);
        if (id < 0)
                return id;        /* propagate the -errno from idr_alloc() */

        pool->id = id;
        return 0;
}

/**
 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
 * @wq: the target workqueue
 * @node: the node ID
 *
 * This must be called with any of wq_pool_mutex, wq->mutex or RCU
 * read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * Return: The unbound pool_workqueue for @node.
 */
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
                                                  int node)
{
        struct pool_workqueue *pwq;

        assert_rcu_or_wq_mutex_or_pool_mutex(wq);

        /*
         * XXX: @node may be NUMA_NO_NODE when a CPU goes offline while a
         * delayed item is pending; fall back to the default pwq then.
         * The plan is to keep the CPU -> NODE mapping valid and stable
         * across CPU on/offlines, after which this workaround can go.
         */
        if (unlikely(node == NUMA_NO_NODE))
                pwq = wq->dfl_pwq;
        else
                pwq = rcu_dereference_raw(wq->numa_pwq_tbl[node]);

        return pwq;
}

/* Place @color into the color field position of the work flags. */
static unsigned int work_color_to_flags(int color)
{
        unsigned int flags = color << WORK_STRUCT_COLOR_SHIFT;

        return flags;
}

/* Extract the color field from @work's data word. */
static int get_work_color(struct work_struct *work)
{
        unsigned long data = *work_data_bits(work);
        unsigned long color_mask = (1 << WORK_STRUCT_COLOR_BITS) - 1;

        return (data >> WORK_STRUCT_COLOR_SHIFT) & color_mask;
}

/* Advance @color by one, wrapping around at WORK_NR_COLORS. */
static int work_next_color(int color)
{
        int next = color + 1;

        return next % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
                                 unsigned long flags)
{
        unsigned long new_data;

        /* callers are expected to own @work, i.e. PENDING must be set */
        WARN_ON_ONCE(!work_pending(work));

        /* whatever work_static() reports for @work is carried over */
        new_data = data | flags | work_static(work);
        atomic_long_set(&work->data, new_data);
}

/* Point @work->data at @pwq, marking it PENDING | PWQ plus @extra_flags. */
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long extra_flags)
{
        unsigned long flags = WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags;

        set_work_data(work, (unsigned long)pwq, flags);
}

/* Record @pool_id in @work->data while leaving PENDING set. */
static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id)
{
        unsigned long pool_bits = (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;

        set_work_data(work, pool_bits, WORK_STRUCT_PENDING);
}

/*
 * Record @pool_id in @work->data and pass no flag bits to set_work_data(),
 * which is what drops PENDING.  The store is bracketed by a write barrier
 * before and a full barrier after; the reasons are given inline.
 */
static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
         *   ----------------------------  --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4 }                             set_..._and_clear_pending() {
         * 5                                 set_work_data() # clear bit
         * 6                                 smp_mb()
         * 7                               work->current_func() {
         * 8                                      LOAD event_indicated
         *                                   }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1.  If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in a hope, that CPU#1 will eventually
         * finish the queued @work.  Meanwhile CPU#1 does not see
         * event_indicated is set, because speculative LOAD was executed
         * before actual STORE.
         */
        smp_mb();
}

/*
 * Reset @work->data to WORK_STRUCT_NO_POOL with no flag bits passed in.
 * Only the leading write barrier of set_work_pool_and_clear_pending() is
 * used here.
 */
static void clear_work_data(struct work_struct *work)
{
        smp_wmb();        /* see set_work_pool_and_clear_pending() */
        set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

/* Mask a work data word down to its pool_workqueue pointer bits. */
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        unsigned long ptr_bits = data & WORK_STRUCT_WQ_DATA_MASK;

        return (struct pool_workqueue *)ptr_bits;
}

/* Return the pwq encoded in @work->data, or NULL when the PWQ flag is clear. */
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (!(data & WORK_STRUCT_PWQ))
                return NULL;

        return work_struct_pwq(data);
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access is
 * allowed under RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        /* while the PWQ flag is set, the pool is reached through the pwq */
        if (data & WORK_STRUCT_PWQ)
                return work_struct_pwq(data)->pool;

        /* otherwise the upper bits hold a pool ID to look up in the idr */
        pool_id = data >> WORK_OFFQ_POOL_SHIFT;

        return pool_id == WORK_OFFQ_POOL_NONE ?
                NULL : idr_find(&worker_pool_idr, pool_id);
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        /* without the PWQ flag the ID sits directly in the upper bits */
        if (!(data & WORK_STRUCT_PWQ))
                return data >> WORK_OFFQ_POOL_SHIFT;

        return work_struct_pwq(data)->pool->id;
}

/*
 * Rewrite @work->data as "pool ID + WORK_OFFQ_CANCELING", with PENDING
 * kept set.
 */
static void mark_work_canceling(struct work_struct *work)
{
        unsigned long data = get_work_pool_id(work);

        data <<= WORK_OFFQ_POOL_SHIFT;
        data |= WORK_OFFQ_CANCELING;
        set_work_data(work, data, WORK_STRUCT_PENDING);
}

/* True when @work is off queue (PWQ flag clear) and carries WORK_OFFQ_CANCELING. */
static bool work_is_canceling(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        return (data & WORK_OFFQ_CANCELING) && !(data & WORK_STRUCT_PWQ);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

/* True when the pool's nr_running counter reads zero. */
static bool __need_more_worker(struct worker_pool *pool)
{
        return atomic_read(&pool->nr_running) == 0;
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
        bool has_work = !list_empty(&pool->worklist);

        /* nr_running is consulted only when there is something queued */
        return has_work && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
        /* true as long as the idle count is non-zero */
        return pool->nr_idle != 0;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
        bool has_work = !list_empty(&pool->worklist);

        /* continue only if work is queued and at most one worker is running */
        return has_work && atomic_read(&pool->nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
        bool want_worker = need_more_worker(pool);

        /* a new worker is needed only when no idle one can take over */
        return want_worker && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
        int idle = pool->nr_idle;
        int busy;

        /* an active manager is counted as one more idle worker */
        if (pool->flags & POOL_MANAGER_ACTIVE)
                idle++;

        busy = pool->nr_workers - idle;

        /* two idle workers are always tolerated; the rest is ratio-limited */
        return idle > 2 && (idle - 2) * MAX_IDLE_WORKERS_RATIO >= busy;
}

/*
 * Wake up functions.
 */

/* Return the first idle worker.  Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
        struct list_head *idle = &pool->idle_list;

        /* an empty idle list means there is nobody to hand back */
        if (unlikely(list_empty(idle)))
                return NULL;

        return list_first_entry(idle, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
        struct worker *idle = first_idle_worker(pool);

        /* nothing to wake when the idle list is empty */
        if (likely(idle))
                wake_up_process(idle->task);
}

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule()
 */
void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        /* ->sleeping is set by wq_worker_sleeping(); nothing to undo otherwise */
        if (!worker->sleeping)
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                atomic_inc(&worker->pool->nr_running);
        preempt_enable();
        /* cleared last, so wq_worker_sleeping() can account for the next sleep */
        worker->sleeping = 0;
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep. Preemption needs to be disabled to protect ->sleeping
 * assignment.
 */
void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *next, *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (worker->sleeping)
                return;

        /* wq_worker_running() re-increments nr_running only when this is set */
        worker->sleeping = 1;
        raw_spin_lock_irq(&pool->lock);

        /*
         * The counterpart of the following dec_and_test, implied mb,
         * worklist not empty test sequence is in insert_work().
         * Please read comment there.
         *
         * NOT_RUNNING is clear.  This means that we're bound to and
         * running on the local cpu w/ rq lock held and preemption
         * disabled, which in turn means that no one else could be
         * manipulating idle_list, so dereferencing idle_list without pool
         * lock is safe.
         *
         * NOTE(review): pool->lock is taken just above and held until the
         * unlock below, so the idle_list access here is in fact made under
         * the pool lock; the "without pool lock" reasoning looks like it
         * predates that locking - confirm and update.
         */
        if (atomic_dec_and_test(&pool->nr_running) &&
            !list_empty(&pool->worklist)) {
                /* last running worker is going to sleep with work queued */
                next = first_idle_worker(pool);
                if (next)
                        wake_up_process(next->task);
        }
        raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Determine the last function a worker executed. This is called from
 * the scheduler to get a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep. It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut-off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * As this function doesn't involve any workqueue-related locking, it
 * only returns stable values when called from inside the scheduler's
 * queuing and dequeuing paths, when @task, which must be a kworker,
 * is guaranteed to not be processing any works.
 *
 * Return:
 * The last work function %current executed as a worker, NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
        /* the kthread data of @task is its struct worker */
        return ((struct worker *)kthread_data(task))->last_func;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int old_flags = worker->flags;

        WARN_ON_ONCE(worker->task != current);

        /* first NOT_RUNNING bit being set: stop counting us as running */
        if (!(old_flags & WORKER_NOT_RUNNING) && (flags & WORKER_NOT_RUNNING))
                atomic_dec(&pool->nr_running);

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int oflags = worker->flags;

        WARN_ON_ONCE(worker->task != current);

        worker->flags &= ~flags;

        /*
         * nr_running goes up only when a NOT_RUNNING bit was requested,
         * one was set beforehand, and none remains afterwards.
         * NOT_RUNNING is a mask of several flags, so clearing part of it
         * may leave the worker still not running.
         */
        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING))
                atomic_inc(&pool->nr_running);
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                 struct work_struct *work)
{
        struct worker *worker;

        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                        return worker;

        return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head.  Work series to
 * be scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor.
 *
 * If @nextp is not NULL, it's updated to point to the next work of
 * the last scheduled work.  This allows move_linked_works() to be
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
{
        struct work_struct *n;

        /*
         * Linked worklist will always end before the end of the list,
         * use NULL for list head.
         */
        list_for_each_entry_safe_from(work, n, NULL, entry) {
                list_move_tail(&work->entry, head);
                /* the series ends at the first work without LINKED set */
                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                        break;
        }

        /*
         * If we're already inside safe list traversal and have moved
         * multiple works to the scheduled queue, the next position
         * needs to be updated.
         */
        if (nextp)
                *nextp = n;
}

/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Obtain an extra reference on @pwq.  The caller should guarantee that
 * @pwq has positive refcnt and be holding the matching pool->lock.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);
        /* taking a reference on a pwq whose count is not positive is a bug */
        WARN_ON_ONCE(pwq->refcnt <= 0);
        pwq->refcnt++;
}

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
 * destruction.  The caller should be holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        pwq->refcnt--;
        if (likely(pwq->refcnt))
                return;

        /* the last reference is only expected to drop on unbound workqueues */
        if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
                return;

        /*
         * pool->lock is held, so @pwq cannot be released right here; hand
         * it over to pwq_unbound_release_workfn() instead.  There is no
         * recursion on the same pool->lock: this path is taken only for
         * unbound workqueues, while the release work item runs on a
         * per-cpu workqueue.  To keep lockdep quiet, unbound pool->locks
         * get lockdep subclass 1 in get_unbound_pool().
         */
        schedule_work(&pwq->unbound_release_work);
}

/**
 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * put_pwq() with locking.  This function also allows %NULL @pwq.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
        /* a NULL @pwq is allowed and is a no-op */
        if (!pwq)
                return;

        /*
         * pwqs and pools are both RCU protected, which makes the lock
         * operations below safe.
         */
        raw_spin_lock_irq(&pwq->pool->lock);
        put_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
}

static void pwq_activate_inactive_work(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);

        trace_workqueue_activate_work(work);
        if (list_empty(&pwq->pool->worklist))
                pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
        pwq->nr_active++;
}

/* Activate the work at the head of @pwq->inactive_works. */
static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
        pwq_activate_inactive_work(list_first_entry(&pwq->inactive_works,
                                                    struct work_struct, entry));
}

/**
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
 * @color: color of work which left the queue
 *
 * A work either has completed or is removed from pending queue,
 * decrement nr_in_flight of its pwq and handle workqueue flushing.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
        /* uncolored items take no part in flushing or nr_active accounting */
        if (color == WORK_NO_COLOR)
                goto out_put;

        pwq->nr_in_flight[color]--;
        pwq->nr_active--;

        /* a slot opened up: promote one inactive item if the limit allows */
        if (!list_empty(&pwq->inactive_works) &&
            pwq->nr_active < pwq->max_active)
                pwq_activate_first_inactive(pwq);

        /*
         * The flush bookkeeping below applies only when @color is the one
         * being flushed and no item of that color is in flight any more.
         */
        if (likely(pwq->flush_color != color) || pwq->nr_in_flight[color])
                goto out_put;

        /* this pwq has finished the flush; reset its flush_color */
        pwq->flush_color = -1;

        /*
         * The last pwq to finish wakes the first flusher, which takes care
         * of everything else.
         */
        if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                complete(&pwq->wq->first_flusher->done);
out_put:
        put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @is_dwork: @work is a delayed_work
 * @flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========        ================================================================
 *  1                if @work was pending and we successfully stole PENDING
 *  0                if @work was idle and we claimed PENDING
 *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  -ENOENT        if someone else is canceling @work, this state may persist
 *                for arbitrarily long
 *  ========        ================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, irq must be
 * disabled on entry.  This, combined with delayed_work->timer being
 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@flags).
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
                               unsigned long *flags)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        /* irqs stay off on the success returns; the fail path restores them */
        local_irq_save(*flags);

        /* try to steal the timer if it exists */
        if (is_dwork) {
                struct delayed_work *dwork = to_delayed_work(work);

                /*
                 * dwork->timer is irqsafe.  If del_timer() fails, it's
                 * guaranteed that the timer is not queued anywhere and not
                 * running on the local CPU.
                 */
                if (likely(del_timer(&dwork->timer)))
                        return 1;
        }

        /* try to claim PENDING the normal way */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        rcu_read_lock();
        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        pool = get_work_pool(work);
        if (!pool)
                goto fail;

        raw_spin_lock(&pool->lock);
        /*
         * work->data is guaranteed to point to pwq only while the work
         * item is queued on pwq->wq, and both updating work->data to point
         * to pwq on queueing and to pool on dequeueing are done under
         * pwq->pool->lock.  This in turn guarantees that, if work->data
         * points to pwq which is associated with a locked pool, the work
         * item is currently queued on that pool.
         */
        pwq = get_work_pwq(work);
        if (pwq && pwq->pool == pool) {
                debug_work_deactivate(work);

                /*
                 * An inactive work item cannot be grabbed directly because
                 * it might have linked NO_COLOR work items which, if left
                 * on the inactive_works list, will confuse pwq->nr_active
                 * management later on and cause stall.  Make sure the work
                 * item is activated before grabbing.
                 */
                if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
                        pwq_activate_inactive_work(work);

                /* unlink the work and account for it leaving the queue */
                list_del_init(&work->entry);
                pwq_dec_nr_in_flight(pwq, get_work_color(work));

                /* work->data points to pwq iff queued, point to pool */
                set_work_pool_and_keep_pending(work, pool->id);

                raw_spin_unlock(&pool->lock);
                rcu_read_unlock();
                return 1;
        }
        raw_spin_unlock(&pool->lock);
fail:
        rcu_read_unlock();
        local_irq_restore(*flags);
        /* an ongoing cancel may last long, so report it separately from -EAGAIN */
        if (work_is_canceling(work))
                return -ENOENT;
        cpu_relax();
        return -EAGAIN;
}

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
        struct worker_pool *pool = pwq->pool;

        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
        list_add_tail(&work->entry, head);
        /* extra pwq reference; pwq_dec_nr_in_flight() ends with a put_pwq() */
        get_pwq(pwq);

        /*
         * Ensure either wq_worker_sleeping() sees the above
         * list_add_tail() or we see zero nr_running to avoid workers lying
         * around lazily while there are works to be processed.
         */
        smp_mb();

        /* nr_running reads zero: wake the first idle worker, if there is one */
        if (__need_more_worker(pool))
                wake_up_worker(pool);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
        struct worker *self = current_wq_worker();

        /*
         * True only when the current task is a worker and the work item it
         * is executing belongs to @wq.  A worker may look at its own
         * struct worker without any locking.
         */
        return self && self->current_pwq->wq == wq;
}

/*
 * When queueing an unbound work item to a wq, prefer local CPU if allowed
 * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
 * avoid perturbing sensitive tasks.
 */
static int wq_select_unbound_cpu(int cpu)
{
        static bool printed_dbg_warning;
        int last, next;

        if (unlikely(wq_debug_force_rr_cpu)) {
                /* forced round-robin: warn about it once */
                if (!printed_dbg_warning) {
                        pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
                        printed_dbg_warning = true;
                }
        } else if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) {
                /* @cpu is allowed by the unbound cpumask, keep it */
                return cpu;
        }

        /* with an empty mask there is nothing to rotate through */
        if (cpumask_empty(wq_unbound_cpumask))
                return cpu;

        /* continue from the CPU this CPU handed out last time */
        last = __this_cpu_read(wq_rr_cpu_last);
        next = cpumask_next_and(last, wq_unbound_cpumask, cpu_online_mask);
        if (unlikely(next >= nr_cpu_ids)) {
                /* ran off the end: wrap to the first allowed online CPU */
                next = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
                if (unlikely(next >= nr_cpu_ids))
                        return cpu;
        }
        __this_cpu_write(wq_rr_cpu_last, next);

        return next;
}

/*
 * Queue @work on @wq, selecting the pool_workqueue from @cpu.
 *
 * @cpu: requested CPU, or WORK_CPU_UNBOUND to let this function pick one
 * @wq: workqueue to queue on
 * @work: work item to queue
 *
 * Must be called with IRQs disabled (asserted through lockdep below).
 * rcu_read_lock() and the lock of the selected pool are taken internally
 * and both are released before returning.  If @wq is draining and the
 * caller is not a work item of @wq, a warning is issued and the function
 * returns without queueing @work.
 */
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool;
        struct list_head *worklist;
        unsigned int work_flags;
        /* remember the caller's request; @cpu may be overwritten below */
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();


        /* if draining, only works from the same workqueue are allowed */
        if (unlikely(wq->flags & __WQ_DRAINING) &&
            WARN_ON_ONCE(!is_chained_work(wq)))
                return;
        rcu_read_lock();
retry:
        /* pwq which will be used unless @work is executing elsewhere */
        if (wq->flags & WQ_UNBOUND) {
                if (req_cpu == WORK_CPU_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
        } else {
                if (req_cpu == WORK_CPU_UNBOUND)
                        cpu = raw_smp_processor_id();
                pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
        }

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pwq->pool) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        /*
                         * Still running on @last_pool for this wq: switch
                         * to that worker's pwq.  last_pool->lock stays
                         * held and is released through pwq->pool->lock at
                         * the end of the function.
                         */
                        pwq = worker->current_pwq;
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pwq->pool->lock);
                }
        } else {
                raw_spin_lock(&pwq->pool->lock);
        }

        /*
         * pwq is determined and locked.  For unbound pools, we could have
         * raced with pwq release and it could already be dead.  If its
         * refcnt is zero, repeat pwq selection.  Note that pwqs never die
         * without another pwq replacing it in the numa_pwq_tbl or while
         * work items are executing on it, so the retrying is guaranteed to
         * make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pwq->pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        /* @work is already linked on some list: warn and leave it alone */
        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        /* account @work under the pwq's current work color */
        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        if (likely(pwq->nr_active < pwq->max_active)) {
                trace_workqueue_activate_work(work);
                pwq->nr_active++;
                worklist = &pwq->pool->worklist;
                /* first item on an empty worklist: refresh watchdog_ts */
                if (list_empty(worklist))
                        pwq->pool->watchdog_ts = jiffies;
        } else {
                /* max_active reached: park @work on the inactive list */
                work_flags |= WORK_STRUCT_INACTIVE;
                worklist = &pwq->inactive_works;
        }

        debug_work_activate(work);
        insert_work(pwq, work, worklist, work_flags);

out:
        raw_spin_unlock(&pwq->pool->lock);
        rcu_read_unlock();
}

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Queue @work on the given CPU.  It is the caller's responsibility to
 * make sure that CPU can't go away.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        unsigned long irq_flags;
        bool claimed;

        local_irq_save(irq_flags);

        /* only whoever flips PENDING from clear to set gets to queue */
        claimed = !test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                    work_data_bits(work));
        if (claimed)
                __queue_work(cpu, wq, work);

        local_irq_restore(irq_flags);
        return claimed;
}
EXPORT_SYMBOL(queue_work_on);

/**
 * workqueue_select_cpu_near - Select a CPU based on NUMA node
 * @node: NUMA node ID that we want to select a CPU from
 *
 * Try to find a "random" CPU on @node.  The current CPU is used when it
 * already sits on @node; otherwise a CPU from the intersection of the
 * node's cpumask and cpu_online_mask is chosen.
 *
 * Return: the selected CPU, or WORK_CPU_UNBOUND when NUMA support for
 * workqueues is disabled, @node is invalid or offline, or @node has no
 * online CPU, meaning the work may be scheduled on any available CPU.
 */
static int workqueue_select_cpu_near(int node)
{
        int this_cpu;
        int node_cpu;

        /* nothing to do when workqueues are not NUMA aware */
        if (!wq_numa_enabled)
                return WORK_CPU_UNBOUND;

        /* defer CPU binding for an invalid or offline node */
        if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
                return WORK_CPU_UNBOUND;

        /* already running on the requested node, stay local */
        this_cpu = raw_smp_processor_id();
        if (cpu_to_node(this_cpu) == node)
                return this_cpu;

        /* "random", otherwise known as "first", online CPU of the node */
        node_cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
        if (node_cpu >= nr_cpu_ids)
                return WORK_CPU_UNBOUND;

        return node_cpu;
}

/**
 * queue_work_node - queue work on a "random" cpu for a given NUMA node
 * @node: NUMA node that we are targeting the work for
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Queue @work on a "random" CPU of @node as a way to associate the work
 * with that NUMA node.  This is best effort only: when no node is
 * requested or the requested node is offline, the behavior falls back to
 * that of standard queue_work.
 *
 * The "random" CPU is currently the first available CPU in the
 * intersection of cpu_online_mask and the node's cpumask, unless the
 * caller already runs on the node, in which case the current CPU is used.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_node(int node, struct workqueue_struct *wq,
                     struct work_struct *work)
{
        unsigned long irq_flags;
        bool claimed;

        /*
         * The implementation is specific to unbound workqueues: only the
         * first available CPU of a node is ever returned rather than
         * cycling through the node's CPUs.  Supporting per-cpu workqueues
         * would require round-robin style logic in
         * workqueue_select_cpu_near().
         */
        WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

        local_irq_save(irq_flags);

        claimed = !test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                    work_data_bits(work));
        if (claimed) {
                int target_cpu = workqueue_select_cpu_near(node);

                __queue_work(target_cpu, wq, work);
        }

        local_irq_restore(irq_flags);
        return claimed;
}
EXPORT_SYMBOL_GPL(queue_work_node);

/*
 * Timer callback of a delayed_work: queue the embedded work item on the
 * workqueue and CPU recorded in the delayed_work owning timer @t.
 */
void delayed_work_timer_fn(struct timer_list *t)
{
        struct delayed_work *expired = from_timer(expired, t, timer);

        /* expected to run from an irqsafe timer, i.e. with IRQs already off */
        __queue_work(expired->cpu, expired->wq, &expired->work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

/*
 * Arm @dwork's timer so that its work item gets queued on @wq after
 * @delay jiffies, or queue the work item right away when @delay is zero.
 * @cpu is either a specific CPU or WORK_CPU_UNBOUND.
 */
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;

        /* sanity checks on the state handed in by the caller */
        WARN_ON_ONCE(!wq);
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
        WARN_ON_ONCE(timer_pending(timer));
        WARN_ON_ONCE(!list_empty(&work->entry));

        if (delay) {
                /* record the target for delayed_work_timer_fn() */
                dwork->wq = wq;
                dwork->cpu = cpu;
                timer->expires = jiffies + delay;

                if (unlikely(cpu != WORK_CPU_UNBOUND))
                        add_timer_on(timer, cpu);
                else
                        add_timer(timer);
                return;
        }

        /*
         * Zero @delay: queue immediately.  This is both an optimization
         * and needed for correctness, as a timer can expire no earlier
         * than the next tick and delayed_work users rely on there being
         * no such delay when @delay is 0.
         */
        __queue_work(cpu, wq, work);
}

/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Return: %false if @dwork was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
 */
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                           struct delayed_work *dwork, unsigned long delay)
{
        unsigned long irq_flags;
        bool claimed;

        /* IRQs must be off here, see the comment in __queue_work() */
        local_irq_save(irq_flags);

        claimed = !test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                    work_data_bits(&dwork->work));
        if (claimed)
                __queue_delayed_work(cpu, wq, dwork, delay);

        local_irq_restore(irq_flags);
        return claimed;
}
EXPORT_SYMBOL(queue_delayed_work_on);

/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * When @dwork is idle this is equivalent to queue_delayed_work_on();
 * otherwise @dwork's timer is modified so that it expires after @delay.
 * With a zero @delay, @dwork is guaranteed to be scheduled immediately
 * regardless of its current state.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                         struct delayed_work *dwork, unsigned long delay)
{
        unsigned long irq_flags;
        int grabbed;

        /* keep trying for as long as try_to_grab_pending() says -EAGAIN */
        for (;;) {
                grabbed = try_to_grab_pending(&dwork->work, true, &irq_flags);
                if (likely(grabbed != -EAGAIN))
                        break;
        }

        /* -ENOENT from try_to_grab_pending() becomes %true */
        if (unlikely(grabbed < 0))
                return grabbed;

        __queue_delayed_work(cpu, wq, dwork, delay);
        local_irq_restore(irq_flags);

        return grabbed;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/*
 * RCU callback installed by queue_rcu_work(): queue the work item embedded
 * in the rcu_work that contains @rcu on the workqueue recorded in it.
 */
static void rcu_work_rcufn(struct rcu_head *rcu)
{
        struct rcu_work *owner = container_of(rcu, struct rcu_work, rcu);

        /* __queue_work() wants IRQs disabled, see its comment */
        local_irq_disable();
        __queue_work(WORK_CPU_UNBOUND, owner->wq, &owner->work);
        local_irq_enable();
}

/**
 * queue_rcu_work - queue work after a RCU grace period
 * @wq: workqueue to use
 * @rwork: work to queue
 *
 * Return: %false if @rwork was already pending, %true otherwise.  Note
 * that a full RCU grace period is guaranteed only after a %true return.
 * While @rwork is guaranteed to be executed after a %false return, the
 * execution may happen before a full RCU grace period has passed.
 */
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
        /* somebody else already owns PENDING, nothing to do */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                             work_data_bits(&rwork->work)))
                return false;

        /* the actual queueing is done by rcu_work_rcufn() */
        rwork->wq = wq;
        call_rcu(&rwork->rcu, rcu_work_rcufn);
        return true;
}
EXPORT_SYMBOL(queue_rcu_work);

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * Mark @worker idle, account for it in its pool, put it on the pool's
 * idle list and arm the idle timer if necessary.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* entering idle twice is a bug */
        if (WARN_ON_ONCE(worker->flags & WORKER_IDLE))
                return;

        /* neither is being linked on a list while still hashed */
        if (WARN_ON_ONCE(!list_empty(&worker->entry) &&
                         (worker->hentry.next || worker->hentry.pprev)))
                return;

        /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        pool->nr_idle++;
        worker->last_active = jiffies;

        /* newest idle worker goes to the front: idle_list is LIFO */
        list_add(&worker->entry, &pool->idle_list);

        if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);

        /*
         * Sanity check nr_running.  unbind_workers() drops pool->lock
         * between setting %WORKER_UNBOUND and zapping nr_running, so the
         * warning could trigger spuriously; check only while no unbind is
         * in progress.
         */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     pool->nr_workers == pool->nr_idle &&
                     atomic_read(&pool->nr_running));
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * Clear @worker's idle flag, drop it from the pool's idle count and
 * unlink it from the idle list.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
        struct worker_pool *owner = worker->pool;
        const unsigned int was_idle = worker->flags & WORKER_IDLE;

        /* leaving idle without being idle is a bug */
        if (WARN_ON_ONCE(!was_idle))
                return;

        worker_clr_flags(worker, WORKER_IDLE);
        owner->nr_idle--;
        list_del_init(&worker->entry);
}

/*
 * Allocate a zeroed worker on NUMA node @node and initialize its list
 * heads.  Returns the new worker, or NULL if the allocation failed.
 */
static struct worker *alloc_worker(int node)
{
        struct worker *fresh = kzalloc_node(sizeof(*fresh), GFP_KERNEL, node);

        if (!fresh)
                return fresh;

        INIT_LIST_HEAD(&fresh->entry);
        INIT_LIST_HEAD(&fresh->scheduled);
        INIT_LIST_HEAD(&fresh->node);

        /* a freshly created worker is in !idle && prep state */
        fresh->flags = WORKER_PREP;

        return fresh;
}

/**
 * worker_attach_to_pool() - attach a worker to a pool
 * @worker: worker to be attached
 * @pool: the target pool
 *
 * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
 * cpu-binding of @worker are kept coordinated with the pool across
 * cpu-[un]hotplugs.
 *
 * Takes and releases wq_pool_attach_mutex; on return @worker is linked on
 * @pool->workers and @worker->pool points to @pool.
 */
static void worker_attach_to_pool(struct worker *worker,
                                   struct worker_pool *pool)
{
        mutex_lock(&wq_pool_attach_mutex);

        /*
         * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
         * stable across this function.  See the comments above the flag
         * definition for details.
         */
        if (pool->flags & POOL_DISASSOCIATED)
                worker->flags |= WORKER_UNBOUND;

        /*
         * Workers with rescue_wq set get the pool's cpumask applied here.
         * NOTE(review): presumably rescue_wq is set only for rescuers,
         * whose affinity was not set up by create_worker() - confirm.
         */
        if (worker->rescue_wq)
                set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);

        list_add_tail(&worker->node, &pool->workers);
        worker->pool = pool;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 *
 * Undo what worker_attach_to_pool() did.  After detaching, the calling
 * worker must not access the pool any more unless it holds another
 * reference to it.
 */
static void worker_detach_from_pool(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        struct completion *done;

        mutex_lock(&wq_pool_attach_mutex);

        list_del(&worker->node);
        worker->pool = NULL;

        /* the last worker to leave picks up the pool's detach completion */
        done = list_empty(&pool->workers) ? pool->detach_completion : NULL;

        mutex_unlock(&wq_pool_attach_mutex);

        /* clear leftover flags without pool->lock after it is detached */
        worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);

        if (done)
                complete(done);
}

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create a new worker, attach it to @pool, put it into idle state and
 * wake up its kthread.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker, or %NULL if the worker ID, the
 * worker structure or the kthread could not be allocated.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker;
        char id_buf[16];
        int id;

        /* the kthread name is derived from the worker ID */
        id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
        if (id < 0)
                return NULL;

        worker = alloc_worker(pool->node);
        if (!worker)
                goto fail;

        worker->id = id;

        /* "u<pool>:<id>" without a CPU, else "<cpu>:<id>" plus "H" if nice < 0 */
        if (pool->cpu < 0)
                snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
        else
                snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
                         pool->attrs->nice < 0  ? "H" : "");

        worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                                              "kworker/%s", id_buf);
        if (IS_ERR(worker->task))
                goto fail;

        set_user_nice(worker->task, pool->attrs->nice);
        kthread_bind_mask(worker->task, pool->attrs->cpumask);

        /* the worker is complete, make it part of the pool */
        worker_attach_to_pool(worker, pool);

        /* account for the new worker, park it as idle and kick it off */
        raw_spin_lock_irq(&pool->lock);
        worker->pool->nr_workers++;
        worker_enter_idle(worker);
        wake_up_process(worker->task);
        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        /* @id is always valid here; kfree() copes with a NULL @worker */
        ida_simple_remove(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

/**
 * destroy_worker - destroy a workqueue worker
 * @worker: worker to be destroyed
 *
 * Take @worker out of its pool's accounting and idle list, flag it with
 * %WORKER_DIE and wake it up.  The worker should be idle.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void destroy_worker(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);

        /* sanity checks: no current work, nothing scheduled, must be idle */
        if (WARN_ON(worker->current_work))
                return;
        if (WARN_ON(!list_empty(&worker->scheduled)))
                return;
        if (WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;

        pool->nr_workers--;
        pool->nr_idle--;

        list_del_init(&worker->entry);

        /* flag the worker as dying and wake its task */
        worker->flags |= WORKER_DIE;
        wake_up_process(worker->task);
}

/*
 * Idle timer callback of a pool: while the pool has too many workers,
 * destroy the ones idle for at least IDLE_WORKER_TIMEOUT.  When the
 * longest-idle worker has not expired yet, re-arm the timer for its
 * expiry and stop.
 */
static void idle_worker_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, idle_timer);

        raw_spin_lock_irq(&pool->lock);

        while (too_many_workers(pool)) {
                struct worker *oldest;
                unsigned long deadline;

                /* idle_list is LIFO, so the tail has been idle the longest */
                oldest = list_entry(pool->idle_list.prev, struct worker, entry);
                deadline = oldest->last_active + IDLE_WORKER_TIMEOUT;

                if (!time_before(jiffies, deadline)) {
                        destroy_worker(oldest);
                        continue;
                }

                /* not expired yet, come back when it is */
                mod_timer(&pool->idle_timer, deadline);
                break;
        }

        raw_spin_unlock_irq(&pool->lock);
}

/*
 * Ask the rescuer of @work's workqueue for help by putting the work's pwq
 * on the workqueue's mayday list and waking the rescuer.  Nothing happens
 * when the workqueue has no rescuer or the pwq is already on the list.
 * Must be called with wq_mayday_lock held.
 */
static void send_mayday(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq_mayday_lock);

        /* no rescuer, nobody to call */
        if (!wq->rescuer)
                return;

        /* mayday already sent for this pwq */
        if (!list_empty(&pwq->mayday_node))
                return;

        /*
         * mayday mayday mayday
         *
         * The base ref of a pwq belonging to an unbound wq may be put at
         * any time due to an attribute change, so pin @pwq until the
         * rescuer is done with it.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
}

static void pool_mayday_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, mayday_timer);
        struct work_struct *work;

        raw_spin_lock_irq(&pool->lock);
        raw_spin_lock(&wq_mayday_lock);                /* for wq->maydays */

        if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
                list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }

        raw_spin_unlock(&wq_mayday_lock);
        raw_spin_unlock_irq(&pool->lock);

        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        do {
                raw_spin_unlock_irq(&pool->lock);

                /* call for help if no progress within MAYDAY_INITIAL_TIMEOUT */
                mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

                /* retry, with a cooldown, until created or no longer needed */
                while (!create_worker(pool) && need_to_create_worker(pool)) {
                        schedule_timeout_interruptible(CREATE_COOLDOWN);

                        if (!need_to_create_worker(pool))
                                break;
                }

                del_timer_sync(&pool->mayday_timer);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * Re-checking is required even after a successful
                 * creation: @pool->lock was dropped and the new worker
                 * might already have become busy.
                 */
        } while (need_to_create_worker(pool));
}

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* another worker already holds the manager role */
        if (pool->flags & POOL_MANAGER_ACTIVE)
                return false;

        /* claim the manager role for @worker */
        pool->flags |= POOL_MANAGER_ACTIVE;
        pool->manager = worker;

        /* may drop and retake pool->lock, see maybe_create_worker() */
        maybe_create_worker(pool);

        /* give the role back and wake up any waiter on manager_wait */
        pool->manager = NULL;
        pool->flags &= ~POOL_MANAGER_ACTIVE;
        rcuwait_wake_up(&manager_wait);
        return true;
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logic necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * If another worker of the same pool is already executing @work, @work is
 * moved onto that worker's scheduled list instead and the function
 * returns without running it.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        /* sampled once up front; used before and after the work function runs */
        bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
        int work_color;
        struct worker *collision;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it, this we need to
         * take into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /*
         * A single work shouldn't be executed concurrently by
         * multiple workers on a single cpu.  Check whether anyone is
         * already processing the work.  If so, defer the work to the
         * currently executing one.
         */
        collision = find_worker_executing_work(pool, work);
        if (unlikely(collision)) {
                move_linked_works(work, &collision->scheduled, NULL);
                return;
        }

        /* claim and dequeue */
        debug_work_deactivate(work);
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        /* remembered now because @work must not be touched after it ran */
        work_color = get_work_color(work);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
        if (unlikely(cpu_intensive))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Wake up another worker if necessary.  The condition is always
         * false for normal per-cpu workers since nr_running would always
         * be >= 1 at this point.  This is used to chain execution of the
         * pending work items for WORKER_NOT_RUNNING workers such as the
         * UNBOUND and CPU_INTENSIVE ones.
         */
        if (need_more_worker(pool))
                wake_up_worker(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id);

        raw_spin_unlock_irq(&pool->lock);

        lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *                A(W1)
         *                C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        /* run the work function, with pool->lock released */
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
        lock_map_release(&lockdep_map);
        lock_map_release(&pwq->wq->lockdep_map);

        /* the work function returned in atomic context or still holding a lock */
        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
                pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
                       "     last function: %ps\n",
                       current->comm, preempt_count(), task_pid_nr(current),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        cond_resched();

        raw_spin_lock_irq(&pool->lock);

        /* clear cpu intensive status */
        if (unlikely(cpu_intensive))
                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        /* uses the color saved before the work function ran */
        pwq_dec_nr_in_flight(pwq, work_color);
}

/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process every work on @worker's scheduled list.  The list may change
 * while a work is being processed, so the first entry is fetched anew
 * on each iteration until the list is empty.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
        for (;;) {
                struct work_struct *next;

                if (list_empty(&worker->scheduled))
                        break;

                next = list_first_entry(&worker->scheduled,
                                        struct work_struct, entry);
                process_one_work(worker, next);
        }
}

/*
 * Set (@val true) or clear (@val false) PF_WQ_WORKER in %current->flags.
 * The update is performed with wq_pool_attach_mutex held.
 */
static void set_pf_worker(bool val)
{
        mutex_lock(&wq_pool_attach_mutex);

        if (!val)
                current->flags &= ~PF_WQ_WORKER;
        else
                current->flags |= PF_WQ_WORKER;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        /* re-entered via "goto woke_up" after each schedule() below */
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                WARN_ON_ONCE(!list_empty(&worker->entry));
                set_pf_worker(false);

                /*
                 * Final teardown: rename the task, give the worker ID back
                 * to pool->worker_ida, detach from the pool and free
                 * @worker.  @worker must not be touched after kfree().
                 */
                set_task_comm(worker->task, "kworker/dying");
                ida_simple_remove(&pool->worker_ida, worker->id);
                worker_detach_from_pool(worker);
                kfree(worker);
                return 0;
        }

        worker_leave_idle(worker);
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /*
         * do we need to manage?  If manage_workers() returns true, start
         * over from the need_more_worker() test above.
         */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /* main processing loop: runs for as long as keep_working() says so */
        do {
                /* take the work item at the front of pool->worklist */
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                /* timestamp refreshed each time a work item is picked up */
                pool->watchdog_ts = jiffies;

                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
                        /* optimization path, not strictly necessary */
                        process_one_work(worker, work);
                        if (unlikely(!list_empty(&worker->scheduled)))
                                process_scheduled_works(worker);
                } else {
                        /*
                         * WORK_STRUCT_LINKED is set: go through
                         * ->scheduled so that @work is processed together
                         * with the works linked to it.
                         */
                        move_linked_works(work, &worker->scheduled, NULL);
                        process_scheduled_works(worker);
                }
        } while (keep_working(pool));

        /* done processing; set WORKER_PREP again before going idle */
        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        struct list_head *scheduled = &rescuer->scheduled;
        bool should_stop;

        set_user_nice(current, RESCUER_NICE_LEVEL);

        /*
         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
         * doesn't participate in concurrency management.
         */
        set_pf_worker(true);
repeat:
        /*
         * Go to TASK_IDLE before sampling the stop request and the mayday
         * list; the state is switched back to TASK_RUNNING as soon as
         * there is something to do or the thread is about to exit.
         */
        set_current_state(TASK_IDLE);

        /*
         * By the time the rescuer is requested to stop, the workqueue
         * shouldn't have any work pending, but @wq->maydays may still have
         * pwq(s) queued.  This can happen by non-rescuer workers consuming
         * all the work items before the rescuer got to them.  Go through
         * @wq->maydays processing before acting on should_stop so that the
         * list is always empty on exit.
         */
        should_stop = kthread_should_stop();

        /* see whether any pwq is asking for help */
        raw_spin_lock_irq(&wq_mayday_lock);

        /* wq_mayday_lock is held at the top of every iteration */
        while (!list_empty(&wq->maydays)) {
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;
                bool first = true;

                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);

                /* drop wq_mayday_lock before attaching and taking pool->lock */
                raw_spin_unlock_irq(&wq_mayday_lock);

                worker_attach_to_pool(rescuer, pool);

                raw_spin_lock_irq(&pool->lock);

                /*
                 * Slurp in all works issued via this workqueue and
                 * process'em.
                 *
                 * pool->watchdog_ts is refreshed only when the very first
                 * entry on pool->worklist belongs to @pwq: @first is
                 * cleared after the first iteration whether or not that
                 * entry matched.
                 */
                WARN_ON_ONCE(!list_empty(scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry) {
                        if (get_work_pwq(work) == pwq) {
                                if (first)
                                        pool->watchdog_ts = jiffies;
                                move_linked_works(work, scheduled, &n);
                        }
                        first = false;
                }

                if (!list_empty(scheduled)) {
                        process_scheduled_works(rescuer);

                        /*
                         * The above execution of rescued work items could
                         * have created more to rescue through
                         * pwq_activate_first_inactive() or chained
                         * queueing.  Let's put @pwq back on mayday list so
                         * that such back-to-back work items, which may be
                         * being used to relieve memory pressure, don't
                         * incur MAYDAY_INTERVAL delay in between.
                         */
                        if (pwq->nr_active && need_to_create_worker(pool)) {
                                /*
                                 * Nested inside pool->lock, which was taken
                                 * with raw_spin_lock_irq() above.
                                 */
                                raw_spin_lock(&wq_mayday_lock);
                                /*
                                 * Queue iff we aren't racing destruction
                                 * and somebody else hasn't queued it already.
                                 */
                                if (wq->rescuer && list_empty(&pwq->mayday_node)) {
                                        get_pwq(pwq);
                                        list_add_tail(&pwq->mayday_node, &wq->maydays);
                                }
                                raw_spin_unlock(&wq_mayday_lock);
                        }
                }

                /*
                 * Put the reference grabbed by send_mayday().  @pool won't
                 * go away while we're still attached to it.
                 */
                put_pwq(pwq);

                /*
                 * Leave this pool.  If need_more_worker() is %true, notify a
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
                if (need_more_worker(pool))
                        wake_up_worker(pool);

                raw_spin_unlock_irq(&pool->lock);

                worker_detach_from_pool(rescuer);

                /* retake wq_mayday_lock for the loop condition */
                raw_spin_lock_irq(&wq_mayday_lock);
        }

        raw_spin_unlock_irq(&wq_mayday_lock);

        /*
         * @should_stop was sampled before the mayday loop and is acted on
         * only now, after @wq->maydays has been emptied.
         */
        if (should_stop) {
                __set_current_state(TASK_RUNNING);
                set_pf_worker(false);
                return 0;
        }

        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
}

/**
 * check_flush_dependency - check for flush dependency sanity
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 *
 * %current is trying to flush the whole @target_wq or @target_work on it.
 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
 * reclaiming memory or running on a workqueue which doesn't have
 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
 * a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
                                   struct work_struct *target_work)
{
        struct worker *worker;
        work_func_t target_func = NULL;

        /* nothing to verify when the flush target has WQ_MEM_RECLAIM */
        if (target_wq->flags & WQ_MEM_RECLAIM)
                return;

        /* only used for the diagnostics below; stays NULL for wq flushes */
        if (target_work)
                target_func = target_work->func;

        worker = current_wq_worker();

        /* the flusher itself is marked PF_MEMALLOC */
        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
                  current->pid, current->comm, target_wq->name, target_func);

        /*
         * The flusher is a worker whose current workqueue has
         * WQ_MEM_RECLAIM set and __WQ_LEGACY clear.
         */
        WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
                              (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
                  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
                  worker->current_pwq->wq->name, worker->current_func,
                  target_wq->name, target_func);
}

/*
 * Barrier used by the flush paths.  It is set up by insert_wq_barrier()
 * with INIT_WORK_ONSTACK(); when ->work runs, wq_barrier_func() completes
 * ->done, which is what the flusher waits on.
 */
struct wq_barrier {
        struct work_struct        work;        /* barrier work item, runs wq_barrier_func() */
        struct completion        done;        /* completed by wq_barrier_func() */
        struct task_struct        *task;        /* purely informational; set to %current on insert */
};

/*
 * Work function of a barrier work item: signal ->done of the wq_barrier
 * that embeds @work.
 */
static void wq_barrier_func(struct work_struct *work)
{
        complete(&container_of(work, struct wq_barrier, work)->done);
}

/**
 * insert_wq_barrier - insert a barrier work
 * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear LINKED
 * flag of the previous work while there must be a valid next work
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
 * underneath us, so we can't reliably determine pwq from @target.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        unsigned int linked = 0;
        struct list_head *head;

        /*
         * Calling into debugobjects with pool->lock held is fine here:
         * none of its checks can trigger for this on-stack work, so the
         * fixup callbacks (where we might deadlock) are never reached.
         */
        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        init_completion_map(&barr->done, &target->lockdep_map);

        barr->task = current;

        if (!worker) {
                /*
                 * @target is not executing: the barrier goes right after
                 * it.  Other works may already be linked to @target, so
                 * inherit its LINKED state for the barrier and mark
                 * @target as LINKED.
                 */
                unsigned long *bits = work_data_bits(target);

                linked = *bits & WORK_STRUCT_LINKED;
                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
                head = target->entry.next;
        } else {
                /* @target is executing: schedule the barrier to @worker */
                head = worker->scheduled.next;
        }

        debug_work_activate(&barr->work);
        insert_work(pwq, &barr->work, head,
                    work_color_to_flags(WORK_NO_COLOR) | linked);
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare pwqs for workqueue flushing.
 *
 * If @flush_color is non-negative, flush_color on all pwqs should be
 * -1.  If no pwq has in-flight commands at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
 * has in flight commands, its pwq->flush_color is set to
 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
 * If @work_color is non-negative, all pwqs should have the same
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->mutex).
 *
 * Return:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
{
        bool wait = false;
        struct pool_workqueue *pwq;

        if (flush_color >= 0) {
                WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
                /*
                 * Start with a bias of 1; it is dropped by the
                 * atomic_dec_and_test() after the loop below.
                 */
                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }

        for_each_pwq(pwq, wq) {
                struct worker_pool *pool = pwq->pool;

                /* pwq->flush_color and pwq->work_color are updated under pool->lock */
                raw_spin_lock_irq(&pool->lock);

                if (flush_color >= 0) {
                        WARN_ON_ONCE(pwq->flush_color != -1);

                        /* arm only pwqs that have work in flight at @flush_color */
                        if (pwq->nr_in_flight[flush_color]) {
                                pwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }

                if (work_color >= 0) {
                        WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
                        pwq->work_color = work_color;
                }

                raw_spin_unlock_irq(&pool->lock);
        }

        /*
         * Drop the initial bias.  If the count reaches zero here, the
         * first flusher is completed right away.
         */
        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);

        return wait;
}

/**
 * flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 */
void flush_workqueue(struct workqueue_struct *wq)
{
        /* on-stack flusher descriptor; linked into @wq's flusher lists below */
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
                .flush_color = -1,
                .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
        };
        int next_color;

        /* warn and bail out if wq_online is not set */
        if (WARN_ON(!wq_online))
                return;

        /*
         * Acquire and immediately release @wq->lockdep_map: lockdep
         * annotation only, nothing stays held afterwards.
         */
        lock_map_acquire(&wq->lockdep_map);
        lock_map_release(&wq->lockdep_map);

        mutex_lock(&wq->mutex);

        /*
         * Start-to-wait phase
         */
        next_color = work_next_color(wq->work_color);

        if (next_color != wq->flush_color) {
                /*
                 * Color space is not full.  The current work_color
                 * becomes our flush_color and work_color is advanced
                 * by one.
                 */
                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = wq->work_color;
                wq->work_color = next_color;

                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

                        wq->first_flusher = &this_flusher;

                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
                                wq->first_flusher = NULL;
                                goto out_unlock;
                        }
                } else {
                        /* wait in queue */
                        WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
                        /* negative flush_color: only advance the pwqs' work_color */
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
                 * Oops, color space is full, wait on overflow queue.
                 * The next flush completion will assign us
                 * flush_color and transfer to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
        }

        /* NULL target_work: this is a whole-workqueue flush */
        check_flush_dependency(wq, NULL);

        mutex_unlock(&wq->mutex);

        wait_for_completion(&this_flusher.done);

        /*
         * Wake-up-and-cascade phase
         *
         * First flushers are responsible for cascading flushes and
         * handling overflow.  Non-first flushers can simply return.
         */
        /* lockless peek; re-checked below with wq->mutex held */
        if (READ_ONCE(wq->first_flusher) != &this_flusher)
                return;

        mutex_lock(&wq->mutex);

        /* we might have raced, check again with mutex held */
        if (wq->first_flusher != &this_flusher)
                goto out_unlock;

        WRITE_ONCE(wq->first_flusher, NULL);

        WARN_ON_ONCE(!list_empty(&this_flusher.list));
        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

        while (true) {
                struct wq_flusher *next, *tmp;

                /* complete all the flushers sharing the current flush color */
                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                        if (next->flush_color != wq->flush_color)
                                break;
                        list_del_init(&next->list);
                        complete(&next->done);
                }

                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                             wq->flush_color != work_next_color(wq->work_color));

                /* this flush_color is finished, advance by one */
                wq->flush_color = work_next_color(wq->flush_color);

                /* one color has been freed, handle overflow queue */
                if (!list_empty(&wq->flusher_overflow)) {
                        /*
                         * Assign the same color to all overflowed
                         * flushers, advance work_color and append to
                         * flusher_queue.  This is the start-to-wait
                         * phase for these overflowed flushers.
                         */
                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                tmp->flush_color = wq->work_color;

                        wq->work_color = work_next_color(wq->work_color);

                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }

                if (list_empty(&wq->flusher_queue)) {
                        WARN_ON_ONCE(wq->flush_color != wq->work_color);
                        break;
                }

                /*
                 * Need to flush more colors.  Make the next flusher
                 * the new first flusher and arm pwqs.
                 *
                 * NOTE(review): @next is the entry at which the list walk
                 * above stopped, so it is only a valid flusher if that
                 * walk ended via "break".  Presumably the color-space
                 * invariants checked by the WARN_ON_ONCE()s guarantee
                 * this whenever flusher_queue is non-empty here - confirm.
                 */
                WARN_ON_ONCE(wq->flush_color == wq->work_color);
                WARN_ON_ONCE(wq->flush_color != next->flush_color);

                list_del_init(&next->list);
                wq->first_flusher = next;

                /* pwqs armed: leave @next as first flusher and stop cascading */
                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;

                /*
                 * Meh... this color is already done, clear first
                 * flusher and repeat cascading.
                 */
                wq->first_flusher = NULL;
        }

out_unlock:
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(flush_workqueue);

/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushing is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        unsigned int flush_cnt = 0;

        /*
         * __queue_work() is much hotter than drain_workqueue(), already
         * looks at @wq->flags and must know whether drainers exist.
         * Publish that through __WQ_DRAINING so the queueing path never has
         * to look at nr_drainers.  The first drainer sets the flag.
         */
        mutex_lock(&wq->mutex);
        if (wq->nr_drainers++ == 0)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);

reflush:
        flush_workqueue(wq);

        mutex_lock(&wq->mutex);

        for_each_pwq(pwq, wq) {
                bool busy;

                /* sample the pwq's state under its pool lock */
                raw_spin_lock_irq(&pwq->pool->lock);
                busy = pwq->nr_active || !list_empty(&pwq->inactive_works);
                raw_spin_unlock_irq(&pwq->pool->lock);

                if (!busy)
                        continue;

                /* whine on the 10th try and on every 100th up to 1000 */
                flush_cnt++;
                if (flush_cnt == 10 ||
                    (flush_cnt <= 1000 && flush_cnt % 100 == 0))
                        pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
                                wq->name, flush_cnt);

                /* something is still pending on this pwq: flush again */
                mutex_unlock(&wq->mutex);
                goto reflush;
        }

        /* the last drainer to leave clears the flag */
        if (--wq->nr_drainers == 0)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);

/*
 * Queue @barr behind @work so that the caller can wait for @work to finish.
 *
 * Returns %true if the barrier was inserted (the caller must then wait on
 * @barr->done), %false if @work has no pool, its pwq belongs to a different
 * pool, or no worker of the pool is executing it.
 */
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct pool_workqueue *pwq;
        struct worker_pool *pool;
        struct worker *worker = NULL;

        might_sleep();

        rcu_read_lock();
        pool = get_work_pool(work);
        if (!pool)
                goto out_rcu;

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (!pwq) {
                /* no pwq on @work: it can only matter if it is running */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto out_unlock;
                pwq = worker->current_pwq;
        } else if (unlikely(pwq->pool != pool)) {
                goto out_unlock;
        }

        check_flush_dependency(pwq->wq, work);

        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * For single threaded workqueues the deadlock happens when the work
         * is after the work issuing the flush_work(). For rescuer equipped
         * workqueues the deadlock happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel &&
            (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
                lock_map_acquire(&pwq->wq->lockdep_map);
                lock_map_release(&pwq->wq->lockdep_map);
        }
        rcu_read_unlock();
        return true;

out_unlock:
        raw_spin_unlock_irq(&pool->lock);
out_rcu:
        rcu_read_unlock();
        return false;
}

/*
 * Common implementation behind flush_work() and the cancel path.
 *
 * Returns %true if a barrier was queued and waited for, %false if the
 * sanity checks failed or start_flush_work() had nothing to wait on.
 */
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;

        if (WARN_ON(!wq_online))
                return false;

        if (WARN_ON(!work->func))
                return false;

        /* lockdep annotation only: taken and dropped right away */
        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);

        if (!start_flush_work(work, &barr, from_cancel))
                return false;

        /* the barrier is queued; wait for it, then tear it down */
        wait_for_completion(&barr.done);
        destroy_work_on_stack(&barr.work);
        return true;
}

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
        /* plain flush: __flush_work() is told this is not a cancel (from_cancel == false) */
        return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);

/*
 * Waiter used by __cancel_work_timer() while another task is canceling the
 * same work item.
 */
struct cwt_wait {
        wait_queue_entry_t                wait;        /* entry on cancel_waitq; ->func is cwt_wakefn() */
        struct work_struct        *work;        /* work being waited on; matched against the wake-up key */
};

/*
 * Wake function for the waiters set up in __cancel_work_timer().  Wake-ups
 * whose @key is not the work this waiter is waiting on are ignored (0 is
 * returned); matching ones are handed to autoremove_wake_function().
 */
static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct cwt_wait *waiter = container_of(wait, struct cwt_wait, wait);

        if (waiter->work == key)
                return autoremove_wake_function(wait, mode, sync, key);

        return 0;
}

/*
 * Common implementation of cancel_work_sync() and
 * cancel_delayed_work_sync(): grab @work's pending state, mark it as being
 * canceled, flush it (once wq_online is set) and clear its data.
 *
 * Returns the final try_to_grab_pending() result converted to bool.
 */
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
        /*
         * One function-static waitqueue shared by all callers; cwt_wakefn()
         * filters wake-ups by work pointer.
         */
        static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
        unsigned long flags;
        int ret;

        /* loop until try_to_grab_pending() returns a non-negative value */
        do {
                ret = try_to_grab_pending(work, is_dwork, &flags);
                /*
                 * If someone else is already canceling, wait for it to
                 * finish.  flush_work() doesn't work for PREEMPT_NONE
                 * because we may get scheduled between @work's completion
                 * and the other canceling task resuming and clearing
                 * CANCELING - flush_work() will return false immediately
                 * as @work is no longer busy, try_to_grab_pending() will
                 * return -ENOENT as @work is still being canceled and the
                 * other canceling task won't be able to clear CANCELING as
                 * we're hogging the CPU.
                 *
                 * Let's wait for completion using a waitqueue.  As this
                 * may lead to the thundering herd problem, use a custom
                 * wake function which matches @work along with exclusive
                 * wait and wakeup.
                 */
                if (unlikely(ret == -ENOENT)) {
                        struct cwt_wait cwait;

                        init_wait(&cwait.wait);
                        cwait.wait.func = cwt_wakefn;
                        cwait.work = work;

                        prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
                                                  TASK_UNINTERRUPTIBLE);
                        /* re-check after queueing ourselves, then sleep */
                        if (work_is_canceling(work))
                                schedule();
                        finish_wait(&cancel_waitq, &cwait.wait);
                }
        } while (unlikely(ret < 0));

        /* tell other tasks trying to grab @work to back off */
        mark_work_canceling(work);
        /* restores the IRQ state saved in @flags by try_to_grab_pending() */
        local_irq_restore(flags);

        /*
         * This allows canceling during early boot.  We know that @work
         * isn't executing.
         */
        if (wq_online)
                __flush_work(work, true);

        clear_work_data(work);

        /*
         * Paired with prepare_to_wait() above so that either
         * waitqueue_active() is visible here or !work_is_canceling() is
         * visible there.
         */
        smp_mb();
        /* wake a single exclusive waiter, with @work as the key cwt_wakefn() matches */
        if (waitqueue_active(&cancel_waitq))
                __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);

        /* ret is >= 0 here (loop exit condition); any non-zero value becomes true */
        return ret;
}

/**
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
 *
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself or migrates to
 * another workqueue.  On return from this function, @work is
 * guaranteed to be not pending or executing on any CPU.
 *
 * cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's.  Use cancel_delayed_work_sync() instead.
 *
 * The caller must ensure that the workqueue on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return:
 * %true if @work was pending, %false otherwise.
 */
bool cancel_work_sync(struct work_struct *work)
{
        /* is_dwork == false: @work is not embedded in a delayed_work */
        return __cancel_work_timer(work, false);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);

/**
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
 *
 * Delayed timer is cancelled and the pending work is queued for
 * immediate execution.  Like flush_work(), this function only
 * considers the last queueing instance of @dwork.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /*
         * With local IRQs disabled, delete the timer; if
         * del_timer_sync() reports that it did so, queue the work for
         * immediate execution instead.
         */
        local_irq_disable();
        if (del_timer_sync(&dwork->timer))
                __queue_work(dwork->cpu, dwork->wq, work);
        local_irq_enable();

        /* from here on this is an ordinary flush of the embedded work */
        return flush_work(work);
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * flush_rcu_work - wait for a rwork to finish executing the last queueing
 * @rwork: the rcu work to flush
 *
 * Return:
 * %true if flush_rcu_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_rcu_work(struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* not pending: this is an ordinary flush and its result is ours */
        if (!test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return flush_work(work);

        /*
         * Pending: run rcu_barrier() before flushing, and report %true
         * whatever flush_work() returns.
         */
        rcu_barrier();
        flush_work(work);
        return true;
}
EXPORT_SYMBOL(flush_rcu_work);

/*
 * Common implementation of cancel_work() and cancel_delayed_work().
 *
 * Retries try_to_grab_pending() for as long as it returns -EAGAIN.  Any
 * other negative result yields %false.  Otherwise PENDING is cleared, the
 * IRQ state saved in @flags is restored and the result is %true iff
 * try_to_grab_pending() returned non-zero.
 */
static bool __cancel_work(struct work_struct *work, bool is_dwork)
{
        unsigned long flags;
        int grabbed;

        do {
                grabbed = try_to_grab_pending(work, is_dwork, &flags);
        } while (unlikely(grabbed == -EAGAIN));

        if (unlikely(grabbed < 0))
                return false;

        set_work_pool_and_clear_pending(work, get_work_pool_id(work));
        local_irq_restore(flags);

        return grabbed != 0;
}

/*
 * See cancel_delayed_work()
 */
bool cancel_work(struct work_struct *work)
{
        /* plain work item: tell the common helper it is not a delayed work */
        const bool is_dwork = false;

        return __cancel_work(work, is_dwork);
}
EXPORT_SYMBOL(cancel_work);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.
 *
 * Return: %true if @dwork was pending and canceled; %false if it wasn't
 * pending.
 *
 * Note:
 * The work callback function may still be running on return, unless
 * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
 * use cancel_delayed_work_sync() to wait on it.
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /* is_dwork == true: @work is embedded in a delayed_work */
        return __cancel_work(work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);

/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work to cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * Return:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /* second argument true: @work is embedded in a delayed_work */
        return __cancel_work_timer(work, true);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);

/**
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
 * @func: the function to call
 *
 * schedule_on_each_cpu() executes @func on each online CPU using the
 * system workqueue and blocks until all CPUs have completed.
 * schedule_on_each_cpu() is very slow.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int schedule_on_each_cpu(work_func_t func)
{
        struct work_struct __percpu *works = alloc_percpu(struct work_struct);
        int cpu;

        if (!works)
                return -ENOMEM;

        /* both walks over the online CPUs sit inside get/put_online_cpus() */
        get_online_cpus();

        /* first pass: set up and queue one work item on every online CPU */
        for_each_online_cpu(cpu) {
                struct work_struct *item = per_cpu_ptr(works, cpu);

                INIT_WORK(item, func);
                schedule_work_on(cpu, item);
        }

        /* second pass: wait for every queued item to finish */
        for_each_online_cpu(cpu) {
                flush_work(per_cpu_ptr(works, cpu));
        }

        put_online_cpus();

        free_percpu(works);
        return 0;
}

/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:                the function to execute
 * @ew:                guaranteed storage for the execute work structure (must
 *                be available when the work executes)
 *
 * Executes the function immediately if process context is available,
 * otherwise schedules the function for delayed execution.
 *
 * Return:        0 - function was executed
 *                1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
        /* in interrupt context: defer @fn through @ew's work item */
        if (in_interrupt()) {
                INIT_WORK(&ew->work, fn);
                schedule_work(&ew->work);
                return 1;
        }

        /* otherwise call @fn directly, right now */
        fn(&ew->work);
        return 0;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);

/**
 * free_workqueue_attrs - free a workqueue_attrs
 * @attrs: workqueue_attrs to free
 *
 * Undo alloc_workqueue_attrs().
 */
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
        /* NULL is accepted so that callers can free unconditionally */
        if (!attrs)
                return;

        free_cpumask_var(attrs->cpumask);
        kfree(attrs);
}

/**
 * alloc_workqueue_attrs - allocate a workqueue_attrs
 *
 * Allocate a new workqueue_attrs, initialize with default settings and
 * return it.
 *
 * Return: The allocated new workqueue_attrs on success. %NULL on failure.
 */
struct workqueue_attrs *alloc_workqueue_attrs(void)
{
        struct workqueue_attrs *attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);

        if (attrs && alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) {
                /* default setting: every possible CPU is allowed */
                cpumask_copy(attrs->cpumask, cpu_possible_mask);
                return attrs;
        }

        /*
         * Either allocation failed.  free_workqueue_attrs() accepts NULL,
         * so the same call covers both failure cases.
         */
        free_workqueue_attrs(attrs);
        return NULL;
}

/* copy every field of @from (nice, no_numa and the cpumask) into @to */
static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from)
{
        /*
         * ->no_numa is copied too, although the hash and equality helpers
         * ignore it: this function serves both pool and wq attrs.
         * get_unbound_pool() clears ->no_numa itself after copying.
         */
        to->no_numa = from->no_numa;
        to->nice = from->nice;
        cpumask_copy(to->cpumask, from->cpumask);
}

/* hash value of the content of @attrs */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
        u32 hash = 0;

        hash = jhash_1word(attrs->nice, hash);
        hash = jhash(cpumask_bits(attrs->cpumask),
                     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
        return hash;
}

/* content equality test */
static bool wqattrs_equal(const struct workqueue_attrs *a,
                          const struct workqueue_attrs *b)
{
        /* only ->nice and ->cpumask are compared; ->no_numa is not looked at */
        return a->nice == b->nice &&
               cpumask_equal(a->cpumask, b->cpumask);
}

/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
        /* scalar state: no id, CPU or NUMA node assigned yet */
        pool->id = -1;
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
        pool->refcnt = 1;
        pool->watchdog_ts = jiffies;

        /* lock and containers */
        raw_spin_lock_init(&pool->lock);
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        INIT_LIST_HEAD(&pool->workers);
        hash_init(pool->busy_hash);
        INIT_HLIST_NODE(&pool->hash_node);
        ida_init(&pool->worker_ida);

        /* timers */
        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

        /*
         * The attrs allocation is the only step that can fail, and it is
         * done last so that every other field is already initialized.
         */
        pool->attrs = alloc_workqueue_attrs();
        return pool->attrs ? 0 : -ENOMEM;
}

#ifdef CONFIG_LOCKDEP
/* register @wq's lockdep key and set up its lockdep map and lock name */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
        char *name;

        lockdep_register_key(&wq->key);

        /* preferred name is "(wq_completion)<wq name>"; fall back to wq->name */
        name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
        wq->lock_name = name ? name : wq->name;

        lockdep_init_map(&wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/* undo the key registration done by wq_init_lockdep() */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
        lockdep_unregister_key(&wq->key);
}

/* free the lock name unless it is just an alias of wq->name */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
        if (wq->lock_name == wq->name)
                return;

        kfree(wq->lock_name);
}
#else
/* without CONFIG_LOCKDEP the three helpers are empty stubs */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

/* RCU callback: release everything still owned by a workqueue_struct */
static void rcu_free_wq(struct rcu_head *rcu)
{
        struct workqueue_struct *wq;

        wq = container_of(rcu, struct workqueue_struct, rcu);

        wq_free_lockdep(wq);

        /* unbound wqs own an attrs object, all others own per-cpu pwqs */
        if (wq->flags & WQ_UNBOUND)
                free_workqueue_attrs(wq->unbound_attrs);
        else
                free_percpu(wq->cpu_pwqs);

        kfree(wq);
}

/* RCU callback: free a worker_pool together with its attrs and worker IDA */
static void rcu_free_pool(struct rcu_head *rcu)
{
        struct worker_pool *pool;

        pool = container_of(rcu, struct worker_pool, rcu);

        /* release what hangs off @pool before freeing @pool itself */
        free_workqueue_attrs(pool->attrs);
        ida_destroy(&pool->worker_ida);

        kfree(pool);
}

/* This returns with the lock held on success (pool manager is inactive). */
static bool wq_manager_inactive(struct worker_pool *pool)
{
        raw_spin_lock_irq(&pool->lock);

        /* no manager active: report success and deliberately keep the lock */
        if (!(pool->flags & POOL_MANAGER_ACTIVE))
                return true;

        /* a manager is still active: drop the lock and report failure */
        raw_spin_unlock_irq(&pool->lock);
        return false;
}

/**
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
 * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
 * safe manner.  get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
 *
 * Should be called with wq_pool_mutex held.
 */
static void put_unbound_pool(struct worker_pool *pool)
{
        /* on-stack completion handed to @pool if attached workers remain */
        DECLARE_COMPLETION_ONSTACK(detach_completion);
        struct worker *worker;

        lockdep_assert_held(&wq_pool_mutex);

        /* drop one reference; only the final put goes on to tear down */
        if (--pool->refcnt)
                return;

        /* sanity checks */
        /* refuse pools that have a CPU (cpu >= 0) or still have queued work */
        if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;

        /* release id and unhash */
        /* id is still -1 (see init_worker_pool()) if none was ever assigned */
        if (pool->id >= 0)
                idr_remove(&worker_pool_idr, pool->id);
        hash_del(&pool->hash_node);

        /*
         * Become the manager and destroy all workers.  This prevents
         * @pool's workers from blocking on attach_mutex.  We're the last
         * manager and @pool gets freed with the flag set.
         * Because of how wq_manager_inactive() works, we will hold the
         * spinlock after a successful wait.
         */
        rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
                           TASK_UNINTERRUPTIBLE);
        pool->flags |= POOL_MANAGER_ACTIVE;

        /* pool->lock is held here; destroy idle workers until none is left */
        while ((worker = first_idle_worker(pool)))
                destroy_worker(worker);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        /* pairs with the lock taken inside wq_manager_inactive() */
        raw_spin_unlock_irq(&pool->lock);

        /*
         * If workers are still on pool->workers, publish the on-stack
         * completion under wq_pool_attach_mutex and wait for it below.
         * NOTE(review): the completion is presumably signalled by the
         * detach path, which is outside this view - confirm.
         */
        mutex_lock(&wq_pool_attach_mutex);
        if (!list_empty(&pool->workers))
                pool->detach_completion = &detach_completion;
        mutex_unlock(&wq_pool_attach_mutex);

        if (pool->detach_completion)
                wait_for_completion(pool->detach_completion);

        /* shut down the timers */
        del_timer_sync(&pool->idle_timer);
        del_timer_sync(&pool->mayday_timer);

        /* RCU protected to allow dereferences from get_work_pool() */
        call_rcu(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
        /* computed once; used for both the lookup and the later hash_add() */
        u32 hash = wqattrs_hash(attrs);
        struct worker_pool *pool;
        int node;
        /* stays NUMA_NO_NODE unless a single node contains the whole cpumask */
        int target_node = NUMA_NO_NODE;

        lockdep_assert_held(&wq_pool_mutex);

        /* do we already have a matching pool? */
        /* entries in the bucket still need the full wqattrs_equal() compare */
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                if (wqattrs_equal(pool->attrs, attrs)) {
                        pool->refcnt++;
                        return pool;
                }
        }

        /* if cpumask is contained inside a NUMA node, we belong to that node */
        if (wq_numa_enabled) {
                for_each_node(node) {
                        if (cpumask_subset(attrs->cpumask,
                                           wq_numa_possible_cpumask[node])) {
                                target_node = node;
                                break;
                        }
                }
        }

        /* nope, create a new one */
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
        /*
         * Even when init_worker_pool() fails, @pool is left in a state that
         * put_unbound_pool() can release, so both failures share one path.
         */
        if (!pool || init_worker_pool(pool) < 0)
                goto fail;

        lockdep_set_subclass(&pool->lock, 1);        /* see put_pwq() */
        copy_workqueue_attrs(pool->attrs, attrs);
        pool->node = target_node;

        /*
         * no_numa isn't a worker_pool attribute, always clear it.  See
         * 'struct workqueue_attrs' comments for detail.
         */
        pool->attrs->no_numa = false;

        if (worker_pool_assign_id(pool) < 0)
                goto fail;

        /* create and start the initial worker */
        /* skipped while wq_online is false */
        if (wq_online && !create_worker(pool))
                goto fail;

        /* install */
        hash_add(unbound_pool_hash, &pool->hash_node, hash);

        return pool;
fail:
        /* @pool is NULL only when kzalloc_node() itself failed */
        if (pool)
                put_unbound_pool(pool);
        return NULL;
}

/* RCU callback: hand a pool_workqueue back to pwq_cache */
static void rcu_free_pwq(struct rcu_head *rcu)
{
        struct pool_workqueue *pwq;

        pwq = container_of(rcu, struct pool_workqueue, rcu);
        kmem_cache_free(pwq_cache, pwq);
}

/*
 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
 * and needs to be destroyed.
 */
static void pwq_unbound_release_workfn(struct work_struct *work)
{
        struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                  unbound_release_work);
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        /* set when removing @pwq leaves @wq->pwqs empty */
        bool is_last = false;

        /*
         * when @pwq is not linked, it doesn't hold any reference to the
         * @wq, and @wq is invalid to access.
         */
        /*
         * init_pwq() leaves pwqs_node as an empty list head and link_pwq()
         * adds it to wq->pwqs, so a non-empty node means @pwq was linked.
         */
        if (!list_empty(&pwq->pwqs_node)) {
                if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
                        return;

                mutex_lock(&wq->mutex);
                list_del_rcu(&pwq->pwqs_node);
                is_last = list_empty(&wq->pwqs);
                mutex_unlock(&wq->mutex);
        }

        /* put_unbound_pool() must be called with wq_pool_mutex held */
        mutex_lock(&wq_pool_mutex);
        put_unbound_pool(pool);
        mutex_unlock(&wq_pool_mutex);

        /* @pwq itself is freed by rcu_free_pwq() via call_rcu() */
        call_rcu(&pwq->rcu, rcu_free_pwq);

        /*
         * If we're the last pwq going away, @wq is already dead and no one
         * is gonna access it anymore.  Schedule RCU free.
         */
        if (is_last) {
                wq_unregister_lockdep(wq);
                call_rcu(&wq->rcu, rcu_free_wq);
        }
}

/**
 * pwq_adjust_max_active - update a pwq's max_active to the current setting
 * @pwq: target pool_workqueue
 *
 * If @pwq isn't freezing, set @pwq->max_active to the associated
 * workqueue's saved_max_active and activate inactive work items
 * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
 */
static void pwq_adjust_max_active(struct pool_workqueue *pwq)
{
        struct workqueue_struct *wq = pwq->wq;
        /* true when @wq was created with WQ_FREEZABLE */
        bool freezable = wq->flags & WQ_FREEZABLE;
        unsigned long flags;

        /* for @wq->saved_max_active */
        lockdep_assert_held(&wq->mutex);

        /* fast exit for non-freezable wqs */
        /* max_active is already in sync and freezing cannot affect this wq */
        if (!freezable && pwq->max_active == wq->saved_max_active)
                return;

        /* this function can be called during early boot w/ irq disabled */
        raw_spin_lock_irqsave(&pwq->pool->lock, flags);

        /*
         * During [un]freezing, the caller is responsible for ensuring that
         * this function is called at least once after @workqueue_freezing
         * is updated and visible.
         */
        if (!freezable || !workqueue_freezing) {
                /* becomes true once at least one inactive work was activated */
                bool kick = false;

                pwq->max_active = wq->saved_max_active;

                /* activate inactive works until the new limit is reached */
                while (!list_empty(&pwq->inactive_works) &&
                       pwq->nr_active < pwq->max_active) {
                        pwq_activate_first_inactive(pwq);
                        kick = true;
                }

                /*
                 * Need to kick a worker after thawed or an unbound wq's
                 * max_active is bumped. In realtime scenarios, always kicking a
                 * worker will cause interference on the isolated cpu cores, so
                 * let's kick iff work items were activated.
                 */
                if (kick)
                        wake_up_worker(pwq->pool);
        } else {
                /* freezable wq while freezing: allow no active work items */
                pwq->max_active = 0;
        }

        raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}

/* initialize newly alloced @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        /* the WORK_STRUCT_FLAG_MASK bits of a pwq address must all be zero */
        BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

        /* start from an all-zero pwq and fill in only the non-zero fields */
        memset(pwq, 0, sizeof(*pwq));

        /* associations */
        pwq->wq = wq;
        pwq->pool = pool;

        /* one initial reference, no flush color yet */
        pwq->refcnt = 1;
        pwq->flush_color = -1;

        /* list heads start out empty, i.e. @pwq is not linked anywhere */
        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        INIT_LIST_HEAD(&pwq->inactive_works);

        /* work item that runs pwq_unbound_release_workfn() for this pwq */
        INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}

/* sync @pwq with the current state of its associated wq and link it */
static void link_pwq(struct pool_workqueue *pwq)
{
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq->mutex);

        /*
         * Repeated calls are fine: a pwq whose pwqs_node is already on a
         * list has been linked before and is left untouched.
         */
        if (list_empty(&pwq->pwqs_node)) {
                /* pick up @wq's current work_color and max_active setting */
                pwq->work_color = wq->work_color;
                pwq_adjust_max_active(pwq);

                /* only then add @pwq to @wq->pwqs */
                list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
        }
}

/* obtain a pool matching @attrs and create a pwq associating the pool and @wq */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct pool_workqueue *pwq;
        struct worker_pool *pool;

        lockdep_assert_held(&wq_pool_mutex);

        /* takes a reference on a matching pool, creating it if necessary */
        pool = get_unbound_pool(attrs);
        if (!pool)
                return NULL;

        /* allocate the pwq on the pool's node */
        pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
        if (pwq) {
                init_pwq(pwq, wq, pool);
                return pwq;
        }

        /* no pwq: give back the pool reference taken above */
        put_unbound_pool(pool);
        return NULL;
}

/**
 * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
 * @attrs: the wq_attrs of the default pwq of the target workqueue
 * @node: the target NUMA node
 * @cpu_going_down: if >= 0, the CPU to consider as offline
 * @cpumask: outarg, the resulting cpumask
 *
 * Calculate the cpumask a workqueue with @attrs should use on @node.  If
 * @cpu_going_down is >= 0, that cpu is considered offline during
 * calculation.  The result is stored in @cpumask.
 *
 * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
 * enabled and @node has online CPUs requested by @attrs, the returned
 * cpumask is the intersection of the possible CPUs of @node and
 * @attrs->cpumask.
 *
 * The caller is responsible for ensuring that the cpumask of @node stays
 * stable.
 *
 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
 * %false if equal.
 */
static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
                                 int cpu_going_down, cpumask_t *cpumask)
{
        const bool numa_wanted = wq_numa_enabled && !attrs->no_numa;

        if (numa_wanted) {
                /* CPUs of @node that @attrs wants, minus the one going down */
                cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
                if (cpu_going_down >= 0)
                        cpumask_clear_cpu(cpu_going_down, cpumask);
        }

        /* NUMA affinity unused, or nothing usable on @node: take @attrs' mask */
        if (!numa_wanted || cpumask_empty(cpumask)) {
                cpumask_copy(cpumask, attrs->cpumask);
                return false;
        }

        /* @node qualifies: report the possible CPUs in @node that @attrs wants */
        cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
        if (!cpumask_empty(cpumask))
                return !cpumask_equal(cpumask, attrs->cpumask);

        /* the possible-CPU intersection is empty; warn once and report "equal" */
        pr_warn_once("WARNING: workqueue cpumask: online intersect > "
                        "possible intersect\n");
        return false;
}

/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
                                                   int node,
                                                   struct pool_workqueue *pwq)
{
        struct pool_workqueue *prev;

        lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);

        /* make sure @pwq is linked; link_pwq() ignores already-linked pwqs */
        link_pwq(pwq);

        /* remember what is installed for @node now, then publish @pwq there */
        prev = rcu_access_pointer(wq->numa_pwq_tbl[node]);
        rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);

        return prev;
}

/* context to store the prepared attrs & pwqs before applying */
/*
 * Built by apply_wqattrs_prepare(), consumed by apply_wqattrs_commit() and
 * released by apply_wqattrs_cleanup().  After commit, @dfl_pwq and
 * @pwq_tbl[] hold the previously installed pwqs instead of the new ones.
 */
struct apply_wqattrs_ctx {
        struct workqueue_struct        *wq;                /* target workqueue */
        struct workqueue_attrs        *attrs;                /* attrs to apply */
        struct list_head        list;                /* queued for batching commit */
        /* default pwq; swapped with wq->dfl_pwq on commit */
        struct pool_workqueue        *dfl_pwq;
        /* pwq per NUMA node, indexed by node; nr_node_ids entries are allocated */
        struct pool_workqueue        *pwq_tbl[];
};

/* free the resources after success or abort */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
        int node;

        /* nothing was prepared */
        if (!ctx)
                return;

        /* drop the references held through the per-node table and dfl_pwq */
        for_each_node(node)
                put_pwq_unlocked(ctx->pwq_tbl[node]);
        put_pwq_unlocked(ctx->dfl_pwq);

        free_workqueue_attrs(ctx->attrs);
        kfree(ctx);
}

/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs, *tmp_attrs;
        int node;

        lockdep_assert_held(&wq_pool_mutex);

        /* zeroed ctx with one pwq_tbl[] slot for each of nr_node_ids nodes */
        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);

        /* all three allocations are checked together just below */
        new_attrs = alloc_workqueue_attrs();
        tmp_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs || !tmp_attrs)
                goto out_free;

        /*
         * Calculate the attrs of the default pwq.
         * If the user configured cpumask doesn't overlap with the
         * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
        if (unlikely(cpumask_empty(new_attrs->cpumask)))
                cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);

        /*
         * We may create multiple pwqs with differing cpumasks.  Make a
         * copy of @new_attrs which will be modified and used to obtain
         * pools.
         */
        copy_workqueue_attrs(tmp_attrs, new_attrs);

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        /*
         * A node whose computed cpumask differs from the default gets its
         * own pwq; every other node shares @dfl_pwq and takes an extra
         * reference on it.
         */
        for_each_node(node) {
                if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
                        ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
                        if (!ctx->pwq_tbl[node])
                                goto out_free;
                } else {
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[node] = ctx->dfl_pwq;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        /* from here on @new_attrs is owned by @ctx */
        ctx->attrs = new_attrs;

        ctx->wq = wq;
        free_workqueue_attrs(tmp_attrs);
        return ctx;

out_free:
        /*
         * ctx->attrs has not been set on any path that reaches here, so
         * @new_attrs is freed separately.  Both helpers accept NULL.
         */
        free_workqueue_attrs(tmp_attrs);
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return NULL;
}

/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
        int node;

        /* all pwqs have been created successfully, let's install'em */
        mutex_lock(&ctx->wq->mutex);

        copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

        /* save the previous pwq and install the new one */
        for_each_node(node)
                ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
                                                          ctx->pwq_tbl[node]);

        /* @dfl_pwq might not have been used, ensure it's linked */
        link_pwq(ctx->dfl_pwq);
        swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);

        mutex_unlock(&ctx->wq->mutex);
}

/*
 * Take what is needed to prepare and commit workqueue attrs: the CPU
 * hotplug read side first, then wq_pool_mutex.  Released in the opposite
 * order by apply_wqattrs_unlock().
 */
static void apply_wqattrs_lock(void)
{
        /* CPUs should stay stable across pwq creations and installations */
        get_online_cpus();
        mutex_lock(&wq_pool_mutex);
}

/*
 * Counterpart of apply_wqattrs_lock(): drop wq_pool_mutex, then the CPU
 * hotplug read side (the reverse of the acquisition order).
 */
static void apply_wqattrs_unlock(void)
{
        mutex_unlock(&wq_pool_mutex);
        put_online_cpus();
}

static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* only unbound workqueues can change attributes */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        /* creating multiple pwqs breaks ordering guarantee */
        if (!list_empty(&wq->pwqs)) {
                if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                        return -EINVAL;

                wq->flags &= ~__WQ_ORDERED;
        }

        ctx = apply_wqattrs_prepare(wq, attrs);
        if (!ctx)
                return -ENOMEM;

        /* the ctx has been prepared successfully, let's commit it */
        apply_wqattrs_commit(ctx);
        apply_wqattrs_cleanup(ctx);

        return 0;
}

/**
 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
 * @wq: the target workqueue
 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
 *
 * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
 * machines, this function maps a separate pwq to each NUMA node with
 * possible CPUs in @attrs->cpumask so that work items are affine to the
 * NUMA node it was issued on.  Older pwqs are released as in-flight work
 * items finish.  Note that a work item which repeatedly requeues itself
 * back-to-back will stay on its current pwq.
 *
 * Performs GFP_KERNEL allocations.
 *
 * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus().
 *
 * Return: 0 on success and -errno on failure.
 */
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int err;

        /* the caller must already hold the CPU hotplug read lock */
        lockdep_assert_cpus_held();

        /* the real work is done by the _locked variant under wq_pool_mutex */
        mutex_lock(&wq_pool_mutex);
        err = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return err;
}

/**
 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
 * @wq: the target workqueue
 * @cpu: the CPU coming up or going down
 * @online: whether @cpu is coming up or going down
 *
 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
 * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
 * @wq accordingly.
 *
 * If NUMA affinity can't be adjusted due to memory allocation failure, it
 * falls back to @wq->dfl_pwq which may not be optimal but is always
 * correct.
 *
 * Note that when the last allowed CPU of a NUMA node goes offline for a
 * workqueue with a cpumask spanning multiple nodes, the workers which were
 * already executing the work items for the workqueue will lose their CPU
 * affinity and may execute on any CPU.  This is similar to how per-cpu
 * workqueues behave on CPU_DOWN.  If a workqueue user wants strict
 * affinity, it's the user's responsibility to flush the work item from
 * CPU_DOWN_PREPARE.
 */
static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
                                   bool online)
{
        int node = cpu_to_node(cpu);
        /* -1 when @cpu is coming up, so no CPU is excluded from the calculation */
        int cpu_off = online ? -1 : cpu;
        struct pool_workqueue *old_pwq = NULL, *pwq;
        struct workqueue_attrs *target_attrs;
        cpumask_t *cpumask;

        lockdep_assert_held(&wq_pool_mutex);

        /* nothing to do without NUMA affinity or for non-unbound workqueues */
        if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
            wq->unbound_attrs->no_numa)
                return;

        /*
         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
         * Let's use a preallocated one.  The following buf is protected by
         * CPU hotplug exclusion.
         */
        target_attrs = wq_update_unbound_numa_attrs_buf;
        /* alias: the calculation below writes straight into @target_attrs */
        cpumask = target_attrs->cpumask;

        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
        /* the pwq currently serving @node */
        pwq = unbound_pwq_by_node(wq, node);

        /*
         * Let's determine what needs to be done.  If the target cpumask is
         * different from the default pwq's, we need to compare it to @pwq's
         * and create a new one if they don't match.  If the target cpumask
         * equals the default pwq's, the default pwq should be used.
         */
        if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
                /* the current pwq already has the wanted cpumask; keep it */
                if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
                        return;
        } else {
                goto use_dfl_pwq;
        }

        /* create a new pwq */
        pwq = alloc_unbound_pwq(wq, target_attrs);
        if (!pwq) {
                pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
                        wq->name);
                goto use_dfl_pwq;
        }

        /* Install the new pwq. */
        mutex_lock(&wq->mutex);
        old_pwq = numa_pwq_tbl_install(wq, node, pwq);
        goto out_unlock;

use_dfl_pwq:
        mutex_lock(&wq->mutex);
        /*
         * get_pwq() is called with the default pwq's pool lock held, before
         * that pwq is installed for @node.
         * NOTE(review): get_pwq() presumably takes a reference; it is
         * defined outside this view - confirm.
         */
        raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
        get_pwq(wq->dfl_pwq);
        raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
        old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
out_unlock:
        mutex_unlock(&wq->mutex);
        /* release the pwq that was installed for @node before this call */
        put_pwq_unlocked(old_pwq);
}

/*
 * Create and link the pool_workqueues for @wq.
 *
 * Per-cpu workqueues get one pwq per possible CPU.  Unbound workqueues get
 * theirs through apply_workqueue_attrs(), using the ordered or the
 * standard unbound attrs.
 *
 * Return: 0 on success, -ENOMEM if the per-cpu allocation fails, otherwise
 * whatever apply_workqueue_attrs() returned.
 */
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        /* used as a 0/1 index to pick the normal or the highpri variant */
        bool highpri = wq->flags & WQ_HIGHPRI;
        int cpu, ret;

        if (!(wq->flags & WQ_UNBOUND)) {
                wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
                if (!wq->cpu_pwqs)
                        return -ENOMEM;

                /* bind each CPU's pwq to that CPU's worker pool and link it */
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq =
                                per_cpu_ptr(wq->cpu_pwqs, cpu);
                        struct worker_pool *cpu_pools =
                                per_cpu(cpu_worker_pools, cpu);

                        init_pwq(pwq, wq, &cpu_pools[highpri]);

                        /* link_pwq() asserts that wq->mutex is held */
                        mutex_lock(&wq->mutex);
                        link_pwq(pwq);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        /* apply_workqueue_attrs() asserts the CPU hotplug read lock is held */
        get_online_cpus();
        if (wq->flags & __WQ_ORDERED) {
                ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                /* i.e. dfl_pwq is both the first and the last entry of wq->pwqs */
                WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
        }
        put_online_cpus();

        return ret;
}

/*
 * Clamp @max_active into [1, limit], where the limit is
 * WQ_UNBOUND_MAX_ACTIVE when @flags has WQ_UNBOUND set and WQ_MAX_ACTIVE
 * otherwise.  A warning naming @name is printed when the requested value
 * lies outside that range.  Returns the clamped value.
 */
static int wq_clamp_max_active(int max_active, unsigned int flags,
                               const char *name)
{
        int upper;

        if (flags & WQ_UNBOUND)
                upper = WQ_UNBOUND_MAX_ACTIVE;
        else
                upper = WQ_MAX_ACTIVE;

        if (!(max_active >= 1 && max_active <= upper))
                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                        max_active, name, 1, upper);

        return clamp_val(max_active, 1, upper);
}

/*
 * Workqueues which may be used during memory reclaim should have a rescuer
 * to guarantee forward progress.
 *
 * Nothing is done (and 0 is returned) unless @wq has WQ_MEM_RECLAIM set.
 * Otherwise a rescuer worker and its kthread, named after @wq, are created,
 * published in @wq->rescuer and woken up.  Returns 0 on success, -ENOMEM if
 * the worker cannot be allocated, or the error encoded in the pointer
 * returned by kthread_create().
 */
static int init_rescuer(struct workqueue_struct *wq)
{
        struct worker *rescuer;
        int err;

        if (!(wq->flags & WQ_MEM_RECLAIM))
                return 0;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer)
                return -ENOMEM;

        rescuer->rescue_wq = wq;
        rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
        if (!IS_ERR(rescuer->task)) {
                wq->rescuer = rescuer;
                kthread_bind_mask(rescuer->task, cpu_possible_mask);
                wake_up_process(rescuer->task);
                return 0;
        }

        /* thread creation failed: release the worker, report the error */
        err = PTR_ERR(rescuer->task);
        kfree(rescuer);
        return err;
}

/*
 * alloc_workqueue - allocate and register a workqueue
 *
 * @fmt and the trailing variadic arguments form the printf-style name of
 * the workqueue, @flags are WQ_* flags and @max_active limits the number of
 * concurrently active work items (0 selects WQ_DFL_ACTIVE; the value is
 * clamped by wq_clamp_max_active()).
 *
 * The name is formatted into the fixed-size @wq->name buffer.  A name that
 * does not fit is truncated; a one-time warning is printed in that case so
 * that the truncation does not go unnoticed.
 *
 * Returns the new workqueue, or NULL on failure.
 */
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...)
{
        size_t tbl_size = 0;
        va_list args;
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        int name_len;

        /*
         * Unbound && max_active == 1 used to imply ordered, which is no
         * longer the case on NUMA machines due to per-node pools.  While
         * alloc_ordered_workqueue() is the right way to create an ordered
         * workqueue, keep the previous behavior to avoid subtle breakages
         * on NUMA.
         */
        if ((flags & WQ_UNBOUND) && max_active == 1)
                flags |= __WQ_ORDERED;

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
                tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);

        wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs();
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        va_start(args, max_active);
        name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
        va_end(args);

        /* vsnprintf() returns the untruncated length; report truncation */
        if (name_len >= 0 && (size_t)name_len >= sizeof(wq->name))
                pr_warn_once("workqueue: name too long, truncated to: %s\n",
                             wq->name);

        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, wq->name);

        /* init wq */
        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        wq_init_lockdep(wq);
        INIT_LIST_HEAD(&wq->list);

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_unreg_lockdep;

        /* from here on, failures are unwound through destroy_workqueue() */
        if (wq_online && init_rescuer(wq) < 0)
                goto err_destroy;

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        /*
         * wq_pool_mutex protects global freeze state and workqueues list.
         * Grab it, adjust max_active and add the new @wq to workqueues
         * list.
         */
        mutex_lock(&wq_pool_mutex);

        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq)
                pwq_adjust_max_active(pwq);
        mutex_unlock(&wq->mutex);

        list_add_tail_rcu(&wq->list, &workqueues);

        mutex_unlock(&wq_pool_mutex);

        return wq;

err_unreg_lockdep:
        wq_unregister_lockdep(wq);
        wq_free_lockdep(wq);
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

/*
 * Report whether @pwq still has anything going on: in-flight work of any
 * color, a reference count above one on a pwq that is not its workqueue's
 * dfl_pwq, active work items, or entries on its inactive_works list.
 */
static bool pwq_busy(struct pool_workqueue *pwq)
{
        int color = 0;

        while (color < WORK_NR_COLORS) {
                if (pwq->nr_in_flight[color])
                        return true;
                color++;
        }

        return (pwq != pwq->wq->dfl_pwq && pwq->refcnt > 1) ||
               pwq->nr_active || !list_empty(&pwq->inactive_works);
}

/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 *
 * The workqueue is unregistered from sysfs, drained and its rescuer (if
 * any) is stopped and freed.  If any pwq is still found busy afterwards, a
 * warning is emitted, the workqueue state is dumped and the function
 * returns early, leaving @wq on the workqueues list and not freed.
 * Otherwise @wq is removed from the list and released: via RCU for
 * per-cpu workqueues, or by dropping the base pwq references for unbound
 * ones.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        int node;

        /*
         * Remove it from sysfs first so that sanity check failure doesn't
         * lead to sysfs name conflicts.
         */
        workqueue_sysfs_unregister(wq);

        /* drain it before proceeding with destruction */
        drain_workqueue(wq);

        /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
        if (wq->rescuer) {
                struct worker *rescuer = wq->rescuer;

                /* this prevents new queueing */
                raw_spin_lock_irq(&wq_mayday_lock);
                wq->rescuer = NULL;
                raw_spin_unlock_irq(&wq_mayday_lock);

                /* rescuer will empty maydays list before exiting */
                kthread_stop(rescuer->task);
                kfree(rescuer);
        }

        /*
         * Sanity checks - grab all the locks so that we wait for all
         * in-flight operations which may do put_pwq().
         */
        mutex_lock(&wq_pool_mutex);
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);
                if (WARN_ON(pwq_busy(pwq))) {
                        pr_warn("%s: %s has the following busy pwq\n",
                                __func__, wq->name);
                        show_pwq(pwq);
                        /* drop every lock before dumping the global state */
                        raw_spin_unlock_irq(&pwq->pool->lock);
                        mutex_unlock(&wq->mutex);
                        mutex_unlock(&wq_pool_mutex);
                        show_workqueue_state();
                        return;
                }
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        mutex_unlock(&wq->mutex);

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);

        if (!(wq->flags & WQ_UNBOUND)) {
                wq_unregister_lockdep(wq);
                /*
                 * The base ref is never dropped on per-cpu pwqs.  Directly
                 * schedule RCU free.
                 */
                call_rcu(&wq->rcu, rcu_free_wq);
        } else {
                /*
                 * We're the sole accessor of @wq at this point.  Directly
                 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
                 * @wq will be freed when the last pwq is released.
                 */
                for_each_node(node) {
                        pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
                        RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
                        put_pwq_unlocked(pwq);
                }

                /*
                 * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is
                 * put.  Don't access it afterwards.
                 */
                pwq = wq->dfl_pwq;
                wq->dfl_pwq = NULL;
                put_pwq_unlocked(pwq);
        }
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Set max_active of @wq to @max_active.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
        struct pool_workqueue *pwq;

        /* disallow meddling with max_active for ordered workqueues */
        if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                return;

        max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);

        mutex_lock(&wq->mutex);

        wq->flags &= ~__WQ_ORDERED;
        wq->saved_max_active = max_active;

        for_each_pwq(pwq, wq)
                pwq_adjust_max_active(pwq);

        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);

/**
 * current_work - retrieve %current task's work struct
 *
 * Look up the workqueue worker backing %current, if there is one, and
 * report the work item it is executing.  Useful for finding out the
 * context that the %current task is running in.
 *
 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
 */
struct work_struct *current_work(void)
{
        struct worker *self = current_wq_worker();

        if (!self)
                return NULL;

        return self->current_work;
}
EXPORT_SYMBOL(current_work);

/**
 * current_is_workqueue_rescuer - is %current workqueue rescuer?
 *
 * Determine whether %current is a workqueue rescuer.  Can be used from
 * work functions to determine whether it's being run off the rescuer task.
 *
 * Return: %true if %current is a workqueue rescuer. %false otherwise.
 */
bool current_is_workqueue_rescuer(void)
{
        struct worker *worker = current_wq_worker();

        return worker && worker->rescue_wq;
}

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Test whether @wq's cpu workqueue for @cpu is congested, i.e. whether the
 * selected pool_workqueue has entries on its inactive_works list.  There is
 * no synchronization around this function and the test result is
 * unreliable and only useful as advisory hints or for debugging.
 *
 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
 * Note that both per-cpu and unbound workqueues may be associated with
 * multiple pool_workqueues which have separate congested states.  A
 * workqueue being congested on one CPU doesn't mean the workqueue is also
 * congested on other CPUs / NUMA nodes.
 *
 * Return:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool congested;

        rcu_read_lock();
        preempt_disable();

        if (cpu == WORK_CPU_UNBOUND)
                cpu = smp_processor_id();

        /* pick the pwq serving @cpu: by node for unbound, per-cpu otherwise */
        if (wq->flags & WQ_UNBOUND)
                pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
        else
                pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);

        congested = !list_empty(&pwq->inactive_works);

        preempt_enable();
        rcu_read_unlock();

        return congested;
}
EXPORT_SYMBOL_GPL(workqueue_congested);

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Test whether @work is currently pending or running.  There is no
 * synchronization around this function and the test result is
 * unreliable and only useful as advisory hints or for debugging.
 *
 * Return:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
        unsigned int busy = work_pending(work) ? WORK_BUSY_PENDING : 0;
        struct worker_pool *pool;
        unsigned long irq_flags;

        rcu_read_lock();
        pool = get_work_pool(work);
        if (pool) {
                /* a worker of the pool executing @work means it is running */
                raw_spin_lock_irqsave(&pool->lock, irq_flags);
                if (find_worker_executing_work(pool, work))
                        busy |= WORK_BUSY_RUNNING;
                raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
        }
        rcu_read_unlock();

        return busy;
}
EXPORT_SYMBOL_GPL(work_busy);

/**
 * set_worker_desc - set description for the current work item
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * This function can be called by a running work function to describe what
 * the work item is about.  If the worker task gets dumped, this
 * information will be printed out together to help debugging.  The
 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
 * Nothing happens when %current is not a workqueue worker.
 */
void set_worker_desc(const char *fmt, ...)
{
        struct worker *self = current_wq_worker();
        va_list ap;

        if (!self)
                return;

        va_start(ap, fmt);
        vsnprintf(self->desc, sizeof(self->desc), fmt, ap);
        va_end(ap);
}
EXPORT_SYMBOL_GPL(set_worker_desc);

/**
 * print_worker_info - print out worker information and description
 * @log_lvl: the log level to use when printing
 * @task: target task
 *
 * If @task is a worker and currently executing a work item, print out the
 * name of the workqueue being serviced and worker description set with
 * set_worker_desc() by the currently executing work item.
 *
 * This function can be safely called on any task as long as the
 * task_struct itself is accessible.  While safe, this function isn't
 * synchronized and may print out mixups or garbage of limited length.
 */
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
        struct workqueue_struct *wq = NULL;
        struct pool_workqueue *pwq = NULL;
        struct worker *worker;
        work_func_t *fn = NULL;
        char desc[WORKER_DESC_LEN] = { };
        char name[WQ_NAME_LEN] = { };

        if (!(task->flags & PF_WQ_WORKER))
                return;

        /*
         * No synchronization protects @task here and it may be in any
         * state, so every dereference below goes through a nofault copy.
         */
        worker = kthread_probe_data(task);

        /*
         * Fetch workfn, pwq, wq, wq name and desc in that order.  One byte
         * less than the buffer size is copied for the strings so that the
         * zero-initialized last byte terminates them even if the source
         * is garbage.
         */
        copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
        copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
        copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
        copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
        copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);

        /* nothing worth printing was recovered */
        if (!fn && !name[0] && !desc[0])
                return;

        printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
        if (strcmp(name, desc))
                pr_cont(" (%s)", desc);
        pr_cont("\n");
}

static void pr_cont_pool_info(struct worker_pool *pool)
{
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
        if (pool->node != NUMA_NO_NODE)
                pr_cont(" node=%d", pool->node);
        pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}

/*
 * Continue the current log line with a description of @work, preceded by
 * a comma when @comma is true.  A flush barrier is printed as BAR(pid of
 * the barrier's task); any other work item is printed as its function
 * symbol.
 */
static void pr_cont_work(bool comma, struct work_struct *work)
{
        const char *sep = comma ? "," : "";
        struct wq_barrier *barr;

        if (work->func != wq_barrier_func) {
                pr_cont("%s %ps", sep, work->func);
                return;
        }

        barr = container_of(work, struct wq_barrier, work);
        pr_cont("%s BAR(%d)", sep, task_pid_nr(barr->task));
}

/*
 * Dump the state of @pwq: a summary line followed by, where non-empty, the
 * in-flight work items (busy workers of the pool serving @pwq), the
 * pending ones (on the pool's worklist and belonging to @pwq) and the
 * inactive ones (on @pwq->inactive_works).
 */
static void show_pwq(struct pool_workqueue *pwq)
{
        struct worker_pool *pool = pwq->pool;
        struct work_struct *work;
        struct worker *worker;
        bool any_in_flight = false;
        bool any_pending = false;
        bool comma;
        int bkt;

        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);

        pr_cont(" active=%d/%d refcnt=%d%s\n",
                pwq->nr_active, pwq->max_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

        /* is any busy worker of the pool working for this pwq? */
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (worker->current_pwq == pwq) {
                        any_in_flight = true;
                        break;
                }
        }
        if (any_in_flight) {
                comma = false;

                pr_info("    in-flight:");
                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                        if (worker->current_pwq != pwq)
                                continue;

                        pr_cont("%s %d%s:%ps", comma ? "," : "",
                                task_pid_nr(worker->task),
                                worker->rescue_wq ? "(RESCUER)" : "",
                                worker->current_func);
                        list_for_each_entry(work, &worker->scheduled, entry)
                                pr_cont_work(false, work);
                        comma = true;
                }
                pr_cont("\n");
        }

        /* is anything on the pool's worklist queued through this pwq? */
        list_for_each_entry(work, &pool->worklist, entry) {
                if (get_work_pwq(work) == pwq) {
                        any_pending = true;
                        break;
                }
        }
        if (any_pending) {
                comma = false;

                pr_info("    pending:");
                list_for_each_entry(work, &pool->worklist, entry) {
                        if (get_work_pwq(work) != pwq)
                                continue;

                        pr_cont_work(comma, work);
                        /* no comma after an item flagged WORK_STRUCT_LINKED */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont("\n");
        }

        if (!list_empty(&pwq->inactive_works)) {
                comma = false;

                pr_info("    inactive:");
                list_for_each_entry(work, &pwq->inactive_works, entry) {
                        pr_cont_work(comma, work);
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont("\n");
        }
}

/**
 * show_workqueue_state - dump workqueue state
 *
 * Called from a sysrq handler or try_to_freeze_tasks() and prints out
 * all busy workqueues and pools.
 */
void show_workqueue_state(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        unsigned long irq_flags;
        int pi;

        rcu_read_lock();

        pr_info("Showing busy workqueues and worker pools:\n");

        list_for_each_entry_rcu(wq, &workqueues, list) {
                struct pool_workqueue *pwq;
                bool busy = false;

                /* skip workqueues with no active or inactive work at all */
                for_each_pwq(pwq, wq) {
                        if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
                                busy = true;
                                break;
                        }
                }
                if (!busy)
                        continue;

                pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

                for_each_pwq(pwq, wq) {
                        raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                        if (pwq->nr_active || !list_empty(&pwq->inactive_works))
                                show_pwq(pwq);
                        raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                        /*
                         * We could be printing a lot from atomic context, e.g.
                         * sysrq-t -> show_workqueue_state(). Avoid triggering
                         * hard lockup.
                         */
                        touch_nmi_watchdog();
                }
        }

        for_each_pool(pool, pi) {
                struct worker *worker;
                unsigned long hung_secs = 0;
                bool first_idle = true;

                raw_spin_lock_irqsave(&pool->lock, irq_flags);
                /* only pools with at least one non-idle worker are shown */
                if (pool->nr_workers != pool->nr_idle) {
                        /* How long the first pending work is waiting for a worker. */
                        if (!list_empty(&pool->worklist))
                                hung_secs = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;

                        pr_info("pool %d:", pool->id);
                        pr_cont_pool_info(pool);
                        pr_cont(" hung=%lus workers=%d", hung_secs, pool->nr_workers);
                        if (pool->manager)
                                pr_cont(" manager: %d",
                                        task_pid_nr(pool->manager->task));
                        list_for_each_entry(worker, &pool->idle_list, entry) {
                                pr_cont(" %s%d", first_idle ? "idle: " : "",
                                        task_pid_nr(worker->task));
                                first_idle = false;
                        }
                        pr_cont("\n");
                }
                raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
                /*
                 * We could be printing a lot from atomic context, e.g.
                 * sysrq-t -> show_workqueue_state(). Avoid triggering
                 * hard lockup.
                 */
                touch_nmi_watchdog();
        }

        rcu_read_unlock();
}

/*
 * Used to show worker information through /proc/PID/{comm,stat,status}.
 *
 * Copies @task->comm into @buf (at most @size bytes).  If @task is a
 * workqueue worker attached to a pool and has a non-empty description, the
 * description is appended, prefixed with '+' while a work item is being
 * executed and with '-' otherwise.
 */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
        struct worker_pool *pool;
        struct worker *worker;
        int len;

        /* always show the actual comm */
        len = strscpy(buf, task->comm, size);
        if (len < 0)
                return;

        /* stabilize PF_WQ_WORKER and worker pool association */
        mutex_lock(&wq_pool_attach_mutex);

        if (!(task->flags & PF_WQ_WORKER))
                goto out_unlock;

        worker = kthread_data(task);
        pool = worker->pool;
        if (!pool)
                goto out_unlock;

        raw_spin_lock_irq(&pool->lock);
        /*
         * ->desc tracks information (wq name or set_worker_desc()) for
         * the latest execution.  If current, prepend '+', otherwise '-'.
         */
        if (worker->desc[0] != '\0') {
                if (worker->current_work)
                        scnprintf(buf + len, size - len, "+%s",
                                  worker->desc);
                else
                        scnprintf(buf + len, size - len, "-%s",
                                  worker->desc);
        }
        raw_spin_unlock_irq(&pool->lock);

out_unlock:
        mutex_unlock(&wq_pool_attach_mutex);
}

#ifdef CONFIG_SMP

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve a mix of short, long and very long running works,
 * making blocked draining impractical.
 *
 * This is solved by allowing a pool to be disassociated from its CPU
 * and run as an unbound one, and allowing it to be reattached later if
 * the CPU comes back online.
 */

/*
 * unbind_workers - disassociate the worker pools of @cpu from the CPU
 * @cpu: CPU whose pools are visited via for_each_cpu_worker_pool()
 *
 * For each such pool: flag every worker WORKER_UNBOUND and the pool
 * POOL_DISASSOCIATED under wq_pool_attach_mutex and pool->lock, call
 * schedule(), reset pool->nr_running to zero and finally wake up a worker
 * of the pool.  The order of these steps matters; see the comments below.
 */
static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * except for the ones which are still executing works from
                 * before the last CPU down must be on the cpu.  After
                 * this, they may become diasporas.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                raw_spin_unlock_irq(&pool->lock);
                mutex_unlock(&wq_pool_attach_mutex);

                /*
                 * Call schedule() so that we cross rq->lock and thus can
                 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
                 * This is necessary as scheduler callbacks may be invoked
                 * from other cpus.
                 */
                schedule();

                /*
                 * Sched callbacks are disabled now.  Zap nr_running.
                 * After this, nr_running stays zero and need_more_worker()
                 * and keep_working() are always true as long as the
                 * worklist is not empty.  This pool now behaves as an
                 * unbound (in terms of concurrency management) pool which
                 * are served by workers tied to the pool.
                 */
                atomic_set(&pool->nr_running, 0);

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                raw_spin_lock_irq(&pool->lock);
                wake_up_worker(pool);
                raw_spin_unlock_irq(&pool->lock);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online.  Rebind all workers to the CPU.
 *
 * This first restores the CPU affinity of every worker to the pool's
 * cpumask, then, under @pool->lock, clears POOL_DISASSOCIATED, wakes up
 * idle workers and replaces WORKER_UNBOUND with WORKER_REBOUND on each
 * worker.
 *
 * CONTEXT:
 * The caller must hold wq_pool_attach_mutex (checked with lockdep).
 */
static void rebind_workers(struct worker_pool *pool)
{
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * Restore CPU affinity of all workers.  As all idle workers should
         * be on the run-queue of the associated CPU before any local
         * wake-ups for concurrency management happen, restore CPU affinity
         * of all workers first and then clear UNBOUND.  As we're called
         * from CPU_ONLINE, the following shouldn't fail.
         */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                  pool->attrs->cpumask) < 0);

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(worker, pool) {
                /* work on a snapshot; published in one go by WRITE_ONCE() below */
                unsigned int worker_flags = worker->flags;

                /*
                 * A bound idle worker should actually be on the runqueue
                 * of the associated CPU for local wake-ups targeting it to
                 * work.  Kick all idle workers so that they migrate to the
                 * associated CPU.  Doing this in the same loop as
                 * replacing UNBOUND with REBOUND is safe as no worker will
                 * be bound before @pool->lock is released.
                 */
                if (worker_flags & WORKER_IDLE)
                        wake_up_process(worker->task);

                /*
                 * We want to clear UNBOUND but can't directly call
                 * worker_clr_flags() or adjust nr_running.  Atomically
                 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
                 * @worker will clear REBOUND using worker_clr_flags() when
                 * it initiates the next execution cycle thus restoring
                 * concurrency management.  Note that when or whether
                 * @worker clears REBOUND doesn't affect correctness.
                 *
                 * WRITE_ONCE() is necessary because @worker->flags may be
                 * tested without holding any lock in
                 * wq_worker_running().  Without it, NOT_RUNNING test may
                 * fail incorrectly leading to premature concurrency
                 * management operations.
                 */
                WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
                worker_flags |= WORKER_REBOUND;
                worker_flags &= ~WORKER_UNBOUND;
                WRITE_ONCE(worker->flags, worker_flags);
        }

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs.  When a worker of such pool get scheduled, the scheduler resets
 * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
 * online CPU before, cpus_allowed of all its workers should be restored.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
        static cpumask_t cpumask;
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /* is @cpu allowed for @pool? */
        if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                return;

        cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);

        /* as we're called from CPU_ONLINE, the following shouldn't fail */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}

/*
 * Make sure each worker pool of @cpu has at least one worker: a worker is
 * created for every pool that currently has none.  Returns 0 on success
 * or -ENOMEM as soon as a worker cannot be created.
 */
int workqueue_prepare_cpu(unsigned int cpu)
{
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {
                if (!pool->nr_workers && !create_worker(pool))
                        return -ENOMEM;
        }

        return 0;
}

/*
 * @cpu is coming online.  Under wq_pool_mutex, walk all pools: the pools
 * whose ->cpu is @cpu get their workers rebound, pools with a negative
 * ->cpu get their workers' cpumask restored.  Afterwards the NUMA affinity
 * of every workqueue on the workqueues list is updated.  Always returns 0.
 */
int workqueue_online_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        mutex_lock(&wq_pool_mutex);

        for_each_pool(pool, pi) {
                mutex_lock(&wq_pool_attach_mutex);

                if (pool->cpu == cpu) {
                        rebind_workers(pool);
                } else if (pool->cpu < 0) {
                        restore_unbound_workers_cpumask(pool, cpu);
                }

                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* update NUMA affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                wq_update_unbound_numa(wq, cpu, true);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        unbind_workers(cpu);

        /* update NUMA affinity of unbound workqueues */
        mutex_lock(&wq_pool_mutex);
        list_for_each_entry(wq, &workqueues, list)
                wq_update_unbound_numa(wq, cpu, false);
        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/* Carries a function call and its result through a work item (see work_on_cpu()). */
struct work_for_cpu {
        struct work_struct work;        /* work item that runs work_for_cpu_fn() */
        long (*fn)(void *);             /* function to invoke */
        void *arg;                      /* argument passed to @fn */
        long ret;                       /* value returned by @fn */
};

/* Work function: call ->fn(->arg) of the enclosing work_for_cpu, store the result in ->ret. */
static void work_for_cpu_fn(struct work_struct *work)
{
        struct work_for_cpu *req;

        req = container_of(work, struct work_for_cpu, work);
        req->ret = req->fn(req->arg);
}

/**
 * work_on_cpu - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };

        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
        schedule_work_on(cpu, &wfc.work);
        flush_work(&wfc.work);
        destroy_work_on_stack(&wfc.work);
        return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);

/**
 * work_on_cpu_safe - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn:  the function to run
 * @arg: the function argument
 *
 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
 * any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns, or -ENODEV if @cpu is not online.
 */
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        long ret;

        get_online_cpus();
        ret = cpu_online(cpu) ? work_on_cpu(cpu, fn, arg) : -ENODEV;
        put_online_cpus();

        return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Start freezing workqueues.  After this function returns, all freezable
 * workqueues will queue new works to their inactive_works list instead of
 * pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(workqueue_freezing);
        workqueue_freezing = true;

        list_for_each_entry(wq, &workqueues, list) {
                mutex_lock(&wq->mutex);
                for_each_pwq(pwq, wq)
                        pwq_adjust_max_active(pwq);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if some freezable workqueues are still busy.  %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
        bool busy = false;
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(!workqueue_freezing);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_FREEZABLE))
                        continue;
                /*
                 * nr_active is monotonically decreasing.  It's safe
                 * to peek without lock.
                 */
                rcu_read_lock();
                for_each_pwq(pwq, wq) {
                        WARN_ON_ONCE(pwq->nr_active < 0);
                        if (pwq->nr_active) {
                                busy = true;
                                rcu_read_unlock();
                                goto out_unlock;
                        }
                }
                rcu_read_unlock();
        }
out_unlock:
        mutex_unlock(&wq_pool_mutex);
        return busy;
}

/**
 * thaw_workqueues - undo freeze_workqueues_begin()
 *
 * Clear the freezing state and re-evaluate max_active of every
 * pool_workqueue.  Normal queueing is restored and all collected frozen
 * works are transferred to their respective pool worklists.  Does nothing
 * if workqueues are not currently freezing.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
        struct pool_workqueue *pwq;
        struct workqueue_struct *wq;

        mutex_lock(&wq_pool_mutex);

        if (workqueue_freezing) {
                workqueue_freezing = false;

                /* restore max_active and repopulate worklist */
                list_for_each_entry(wq, &workqueues, list) {
                        mutex_lock(&wq->mutex);
                        for_each_pwq(pwq, wq) {
                                pwq_adjust_max_active(pwq);
                        }
                        mutex_unlock(&wq->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

static int workqueue_apply_unbound_cpumask(void)
{
        LIST_HEAD(ctxs);
        int ret = 0;
        struct workqueue_struct *wq;
        struct apply_wqattrs_ctx *ctx, *n;

        lockdep_assert_held(&wq_pool_mutex);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND))
                        continue;

                /* creating multiple pwqs breaks ordering guarantee */
                if (!list_empty(&wq->pwqs)) {
                        if (wq->flags & __WQ_ORDERED_EXPLICIT)
                                continue;
                        wq->flags &= ~__WQ_ORDERED;
                }

                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
                if (!ctx) {
                        ret = -ENOMEM;
                        break;
                }

                list_add_tail(&ctx->list, &ctxs);
        }

        list_for_each_entry_safe(ctx, n, &ctxs, list) {
                if (!ret)
                        apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
        }

        return ret;
}

/**
 *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 *  @cpumask: the cpumask to set
 *
 *  The low-level workqueue cpumask is a global cpumask that limits the
 *  affinity of all unbound workqueues.  This function restricts @cpumask
 *  to the possible CPUs (modifying it in place), applies it to all unbound
 *  workqueues and updates all of their pwqs.  If applying fails, the
 *  previous global cpumask is restored.
 *
 *  Return:        0        - Success
 *                  -EINVAL        - Invalid @cpumask
 *                  -ENOMEM        - Failed to allocate memory for attrs or pwqs.
 */
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
        cpumask_var_t old_mask;
        int ret;

        /*
         * Not excluding isolated cpus on purpose.
         * If the user wishes to include them, we allow that.
         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (cpumask_empty(cpumask))
                return -EINVAL;

        apply_wqattrs_lock();

        /* nothing to do when the mask is unchanged */
        if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
                ret = 0;
                goto out_unlock;
        }

        if (!zalloc_cpumask_var(&old_mask, GFP_KERNEL)) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        /* remember the current mask so that it can be rolled back */
        cpumask_copy(old_mask, wq_unbound_cpumask);

        /* install the new mask first, then apply it to the workqueues */
        cpumask_copy(wq_unbound_cpumask, cpumask);
        ret = workqueue_apply_unbound_cpumask();

        /* roll back to the previous mask on failure */
        if (ret < 0)
                cpumask_copy(wq_unbound_cpumask, old_mask);

        free_cpumask_var(old_mask);

out_unlock:
        apply_wqattrs_unlock();
        return ret;
}

#ifdef CONFIG_SYSFS
/*
 * Workqueues with the WQ_SYSFS flag set are visible to userland via
 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
 * following attributes.
 *
 *  per_cpu        RO bool        : whether the workqueue is per-cpu or unbound
 *  max_active        RW int        : maximum number of in-flight work items
 *
 * Unbound workqueues have the following extra attributes.
 *
 *  pool_ids        RO int        : the associated pool IDs for each node
 *  nice        RW int        : nice value of the workers
 *  cpumask        RW mask        : bitmask of allowed CPUs for the workers
 *  numa        RW bool        : whether to enable NUMA affinity
 */
/* Ties a workqueue to the struct device that represents it in sysfs. */
struct wq_device {
        /* workqueue this device stands for; returned by dev_to_wq() */
        struct workqueue_struct                *wq;
        /* embedded device; wq_device_release() frees the container */
        struct device                        dev;
};

/* Map a sysfs device embedded in a wq_device back to its workqueue. */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
        return container_of(dev, struct wq_device, dev)->wq;
}

static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

static ssize_t max_active_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/*
 * sysfs "max_active" write: parse a positive decimal integer and apply it
 * through workqueue_set_max_active().  Returns -EINVAL for unparsable or
 * non-positive input, otherwise @count.
 */
static ssize_t max_active_store(struct device *dev,
                                struct device_attribute *attr, const char *buf,
                                size_t count)
{
        int max_active;

        if (sscanf(buf, "%d", &max_active) != 1)
                return -EINVAL;
        if (max_active <= 0)
                return -EINVAL;

        workqueue_set_max_active(dev_to_wq(dev), max_active);
        return count;
}
static DEVICE_ATTR_RW(max_active);

/*
 * NULL-terminated list of the attributes defined above with
 * DEVICE_ATTR_RO()/DEVICE_ATTR_RW().  ATTRIBUTE_GROUPS() wraps it into
 * wq_sysfs_groups, which wq_subsys installs as its dev_groups.
 */
static struct attribute *wq_sysfs_attrs[] = {
        &dev_attr_per_cpu.attr,
        &dev_attr_max_active.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/*
 * sysfs "pool_ids": prints a space-separated list of "node:pool_id" pairs,
 * one per node, followed by a newline.
 */
static ssize_t wq_pool_ids_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        const char *sep = "";
        int len = 0;
        int node;

        get_online_cpus();
        rcu_read_lock();

        for_each_node(node) {
                int pool_id = unbound_pwq_by_node(wq, node)->pool->id;

                len += scnprintf(buf + len, PAGE_SIZE - len, "%s%d:%d",
                                 sep, node, pool_id);
                /* every entry after the first is preceded by a space */
                sep = " ";
        }
        len += scnprintf(buf + len, PAGE_SIZE - len, "\n");

        rcu_read_unlock();
        put_online_cpus();

        return len;
}

/* sysfs "nice" read: prints the nice value from the unbound attrs. */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* read unbound_attrs under wq->mutex */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * Allocate a workqueue_attrs and fill it with a copy of @wq's current
 * unbound attrs, for sysfs store operations to modify.  Must be called
 * with wq_pool_mutex held.  Returns NULL if the allocation fails.
 */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
        struct workqueue_attrs *copy;

        lockdep_assert_held(&wq_pool_mutex);

        copy = alloc_workqueue_attrs();
        if (copy)
                copy_workqueue_attrs(copy, wq->unbound_attrs);

        return copy;
}

/*
 * sysfs "nice" write: parse a nice value in [MIN_NICE, MAX_NICE] and apply
 * it to the workqueue.  Returns @count on success, -ENOMEM if the attrs
 * copy cannot be allocated, -EINVAL on bad input, or the error from
 * applying the attrs.
 */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        if (sscanf(buf, "%d", &attrs->nice) != 1 ||
            attrs->nice < MIN_NICE || attrs->nice > MAX_NICE)
                ret = -EINVAL;
        else
                ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/* sysfs "cpumask" read: prints the cpumask from the unbound attrs as a bitmap. */
static ssize_t wq_cpumask_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* read unbound_attrs under wq->mutex */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq->unbound_attrs->cpumask));
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "cpumask" write: parse a cpumask from @buf and apply it to the
 * workqueue.  Returns @count on success, -ENOMEM if the attrs copy cannot
 * be allocated, or the error from parsing or applying the mask.
 */
static ssize_t wq_cpumask_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        ret = cpumask_parse(buf, attrs->cpumask);
        if (ret == 0)
                ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/* sysfs "numa" read: prints 1 unless no_numa is set in the unbound attrs. */
static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* read unbound_attrs under wq->mutex */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%d\n",
                        !wq->unbound_attrs->no_numa);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "numa" write: parse an integer and set no_numa to its logical
 * negation, then apply the attrs.  Returns @count on success, -ENOMEM if
 * the attrs copy cannot be allocated, -EINVAL on unparsable input, or the
 * error from applying the attrs.
 */
static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int enable;
        int ret;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        if (sscanf(buf, "%d", &enable) != 1) {
                ret = -EINVAL;
                goto out_unlock;
        }

        attrs->no_numa = !enable;
        ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * Extra attributes for unbound workqueues.  workqueue_sysfs_register()
 * walks this table up to the __ATTR_NULL terminator and creates one file
 * per entry when WQ_UNBOUND is set.
 */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
        __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
        __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
        __ATTR_NULL,
};

/*
 * The "workqueue" bus.  Devices registered on it get the attribute
 * groups in wq_sysfs_groups.
 */
static struct bus_type wq_subsys = {
        .name                                = "workqueue",
        .dev_groups                        = wq_sysfs_groups,
};

/* Subsystem-level "cpumask" read: prints the global wq_unbound_cpumask. */
static ssize_t wq_unbound_cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        int len;

        /* read the global mask under wq_pool_mutex */
        mutex_lock(&wq_pool_mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq_unbound_cpumask));
        mutex_unlock(&wq_pool_mutex);

        return len;
}

/*
 * Subsystem-level "cpumask" write: parse a cpumask from @buf and hand it
 * to workqueue_set_unbound_cpumask().  Returns @count on success, -ENOMEM
 * if the temporary mask cannot be allocated, or the error from parsing or
 * setting the mask.
 */
static ssize_t wq_unbound_cpumask_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        cpumask_var_t new_mask;
        int ret;

        if (!zalloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        ret = cpumask_parse(buf, new_mask);
        if (ret == 0)
                ret = workqueue_set_unbound_cpumask(new_mask);

        free_cpumask_var(new_mask);

        if (ret)
                return ret;
        return count;
}

/*
 * Subsystem-wide "cpumask" attribute backed by wq_unbound_cpumask;
 * wq_sysfs_init() creates it on the subsystem's root device.
 */
static struct device_attribute wq_sysfs_cpumask_attr =
        __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
               wq_unbound_cpumask_store);

/*
 * Register the workqueue subsystem and create the subsystem-wide cpumask
 * file on its root device.  Returns 0 on success or the first error.
 */
static int __init wq_sysfs_init(void)
{
        int err = subsys_virtual_register(&wq_subsys, NULL);

        if (!err)
                err = device_create_file(wq_subsys.dev_root,
                                         &wq_sysfs_cpumask_attr);
        return err;
}
core_initcall(wq_sysfs_init);

/* Device release callback: free the wq_device that embeds @dev. */
static void wq_device_release(struct device *dev)
{
        kfree(container_of(dev, struct wq_device, dev));
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev;
        int ret;

        /*
         * Adjusting max_active or creating new pwqs by applying
         * attributes breaks ordering guarantee.  Disallow exposing ordered
         * workqueues.
         */
        if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                return -EINVAL;

        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
        if (!wq_dev)
                return -ENOMEM;

        wq_dev->wq = wq;
        wq_dev->dev.bus = &wq_subsys;
        wq_dev->dev.release = wq_device_release;
        dev_set_name(&wq_dev->dev, "%s", wq->name);

        /*
         * unbound_attrs are created separately.  Suppress uevent until
         * everything is ready.
         */
        dev_set_uevent_suppress(&wq_dev->dev, true);

        ret = device_register(&wq_dev->dev);
        if (ret) {
                put_device(&wq_dev->dev);
                wq->wq_dev = NULL;
                return ret;
        }

        if (wq->flags & WQ_UNBOUND) {
                struct device_attribute *attr;

                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                        ret = device_create_file(&wq_dev->dev, attr);
                        if (ret) {
                                device_unregister(&wq_dev->dev);
                                wq->wq_dev = NULL;
                                return ret;
                        }
                }
        }

        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * Unregister @wq's sysfs device if it has one; otherwise do nothing.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev = wq->wq_dev;

        if (wq_dev) {
                /* detach from @wq before unregistering the device */
                wq->wq_dev = NULL;
                device_unregister(&wq_dev->dev);
        }
}
#else        /* CONFIG_SYSFS */
/* Without CONFIG_SYSFS there is nothing to unregister. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
#endif        /* CONFIG_SYSFS */

/*
 * Workqueue watchdog.
 *
 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
 * flush dependency, a concurrency managed work item which stays RUNNING
 * indefinitely.  Workqueue stalls can be very difficult to debug as the
 * usual warning mechanisms don't trigger and internal workqueue state is
 * largely opaque.
 *
 * Workqueue watchdog monitors all worker pools periodically and dumps
 * state if some pools failed to make forward progress for a while where
 * forward progress is defined as the first item on ->worklist changing.
 *
 * This mechanism is controlled through the kernel parameter
 * "workqueue.watchdog_thresh" which can be updated at runtime through the
 * corresponding sysfs parameter file.
 */
#ifdef CONFIG_WQ_WATCHDOG

/* stall threshold in seconds (the timer multiplies it by HZ); 0 disables */
static unsigned long wq_watchdog_thresh = 30;
/* timer that runs wq_watchdog_timer_fn() */
static struct timer_list wq_watchdog_timer;

/* jiffies of the last touch with a negative cpu, or of the last reset */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
/* per-cpu jiffies of the last touch of that cpu, or of the last reset */
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

static void wq_watchdog_reset_touched(void)
{
        int cpu;

        wq_watchdog_touched = jiffies;
        for_each_possible_cpu(cpu)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Watchdog timer callback.  For each worker pool with a non-empty
 * worklist, take the latest of the pool's watchdog timestamp, the global
 * touched timestamp and (for per-cpu pools) the cpu's touched timestamp,
 * and report a lockup if it is more than the threshold in the past.
 * Dumps workqueue state if any lockup was found, then resets the touched
 * timestamps and re-arms the timer.
 */
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long now = jiffies;
        bool stalled = false;
        struct worker_pool *pool;
        int pi;

        /* a zero threshold means disabled; return without re-arming */
        if (!thresh)
                return;

        rcu_read_lock();

        for_each_pool(pool, pi) {
                unsigned long pool_ts, touched, latest;

                /* nothing is pending on this pool */
                if (list_empty(&pool->worklist))
                        continue;

                /*
                 * If a virtual machine is stopped by the host it can look to
                 * the watchdog like a stall.
                 */
                kvm_check_and_clear_guest_paused();

                /* get the latest of pool and touched timestamps */
                pool_ts = READ_ONCE(pool->watchdog_ts);
                touched = READ_ONCE(wq_watchdog_touched);
                latest = time_after(pool_ts, touched) ? pool_ts : touched;

                /* per-cpu pools also honour their cpu's touched timestamp */
                if (pool->cpu >= 0) {
                        unsigned long cpu_touched =
                                READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
                                                  pool->cpu));

                        if (time_after(cpu_touched, latest))
                                latest = cpu_touched;
                }

                /* still within the threshold: no stall */
                if (!time_after(now, latest + thresh))
                        continue;

                stalled = true;
                pr_emerg("BUG: workqueue lockup - pool");
                pr_cont_pool_info(pool);
                pr_cont(" stuck for %us!\n",
                        jiffies_to_msecs(now - pool_ts) / 1000);
        }

        rcu_read_unlock();

        if (stalled)
                show_workqueue_state();

        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh);
}

/*
 * Record a watchdog touch: a negative @cpu updates the global timestamp,
 * any other value updates that cpu's timestamp.
 */
notrace void wq_watchdog_touch(int cpu)
{
        if (cpu < 0) {
                wq_watchdog_touched = jiffies;
                return;
        }

        per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Change the watchdog threshold to @thresh seconds.  The timer is always
 * stopped first; it is re-armed only if @thresh is non-zero.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
        /* zero the threshold so a running callback does not re-arm itself */
        wq_watchdog_thresh = 0;
        del_timer_sync(&wq_watchdog_timer);

        if (!thresh)
                return;

        wq_watchdog_thresh = thresh;
        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}

static int wq_watchdog_param_set_thresh(const char *val,
                                        const struct kernel_param *kp)
{
        unsigned long thresh;
        int ret;

        ret = kstrtoul(val, 0, &thresh);
        if (ret)
                return ret;

        if (system_wq)
                wq_watchdog_set_thresh(thresh);
        else
                wq_watchdog_thresh = thresh;

        return 0;
}

/*
 * Parameter ops for "watchdog_thresh": writes go through
 * wq_watchdog_param_set_thresh(), reads use the generic ulong getter on
 * wq_watchdog_thresh.
 */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
        .set        = wq_watchdog_param_set_thresh,
        .get        = param_get_ulong,
};

module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
                0644);

/*
 * Set up the watchdog timer as a deferrable timer and start it with the
 * currently configured threshold.
 */
static void wq_watchdog_init(void)
{
        timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
        /* arms the timer only if the threshold is non-zero */
        wq_watchdog_set_thresh(wq_watchdog_thresh);
}

#else        /* CONFIG_WQ_WATCHDOG */

/* Without CONFIG_WQ_WATCHDOG there is no watchdog to initialize. */
static inline void wq_watchdog_init(void) { }

#endif        /* CONFIG_WQ_WATCHDOG */

/*
 * Set up NUMA affinity support for unbound workqueues.
 *
 * Does nothing on systems with at most one possible node, when NUMA
 * support was disabled via wq_disable_numa, or when some possible CPU has
 * no node mapping.  Otherwise allocates the attrs scratch buffer, builds
 * the per-node masks of possible CPUs and enables NUMA support.
 */
static void __init wq_numa_init(void)
{
        cpumask_var_t *masks;
        int node, cpu;

        if (num_possible_nodes() <= 1)
                return;

        if (wq_disable_numa) {
                pr_info("workqueue: NUMA affinity support disabled\n");
                return;
        }

        /* every possible CPU must map to a node, or NUMA support stays off */
        for_each_possible_cpu(cpu) {
                if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
                        pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
                        return;
                }
        }

        wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
        BUG_ON(!wq_update_unbound_numa_attrs_buf);

        /*
         * We want masks of possible CPUs of each node which isn't readily
         * available.  Build one from cpu_to_node() which should have been
         * fully initialized by now.
         */
        masks = kcalloc(nr_node_ids, sizeof(*masks), GFP_KERNEL);
        BUG_ON(!masks);

        for_each_node(node) {
                /* allocate on the node itself when it is online */
                int alloc_node = node_online(node) ? node : NUMA_NO_NODE;

                BUG_ON(!zalloc_cpumask_var_node(&masks[node], GFP_KERNEL,
                                                alloc_node));
        }

        for_each_possible_cpu(cpu)
                cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);

        wq_numa_possible_cpumask = masks;
        wq_numa_enabled = true;
}

/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first half of two-staged workqueue subsystem initialization
 * and invoked as soon as the bare basics - memory allocation, cpumasks and
 * idr are up.  It sets up all the data structures and system workqueues
 * and allows early boot code to create workqueues and queue/cancel work
 * items.  Actual work item execution starts only after kthreads can be
 * created and scheduled right before early initcalls.
 *
 * Any allocation or setup failure here is fatal (BUG_ON / SLAB_PANIC).
 */
void __init workqueue_init_early(void)
{
        /* nice levels, indexed in the order the standard pools are walked */
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
        int i, cpu;

        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        /* the unbound cpumask starts out as the housekeeping cpumask */
        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));

        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

        /* initialize CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                i = 0;
                for_each_cpu_worker_pool(pool, cpu) {
                        BUG_ON(init_worker_pool(pool));
                        pool->cpu = cpu;
                        cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
                        pool->attrs->nice = std_nice[i++];
                        pool->node = cpu_to_node(cpu);

                        /* alloc pool ID */
                        mutex_lock(&wq_pool_mutex);
                        BUG_ON(worker_pool_assign_id(pool));
                        mutex_unlock(&wq_pool_mutex);
                }
        }

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 * Turn off NUMA so that dfl_pwq is used for all nodes.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->no_numa = true;
                ordered_wq_attrs[i] = attrs;
        }

        /* create the system workqueues; all of them must exist */
        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                              0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq);
}

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * This is the latter half of two-staged workqueue subsystem initialization
 * and invoked as soon as kthreads can be created and scheduled.
 * Workqueues have been created and work items queued on them, but there
 * are no kworkers executing the work items yet.  Populate the worker pools
 * with the initial workers and enable future kworker creations.
 *
 * Failure to create an initial worker is fatal (BUG_ON); failure to
 * create a rescuer only triggers a warning.
 */
void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int cpu, bkt;

        /*
         * It'd be simpler to initialize NUMA in workqueue_init_early() but
         * CPU to node mapping may not be available that early on some
         * archs such as power and arm64.  As per-cpu pools created
         * previously could be missing node hint and unbound pools NUMA
         * affinity, fix them up.
         *
         * Also, while iterating workqueues, create rescuers if requested.
         */
        wq_numa_init();

        mutex_lock(&wq_pool_mutex);

        /* refresh the node hint of every per-cpu pool */
        for_each_possible_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);
                }
        }

        list_for_each_entry(wq, &workqueues, list) {
                wq_update_unbound_numa(wq, smp_processor_id(), true);
                WARN(init_rescuer(wq),
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);
        }

        mutex_unlock(&wq_pool_mutex);

        /* create the initial workers */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(pool));
                }
        }

        /* every unbound pool in the hash gets an initial worker too */
        hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
                BUG_ON(!create_worker(pool));

        wq_online = true;
        wq_watchdog_init();
}













































































































































































    1 


    1 
    1 



























































































    1 














    1 

    1 


    1 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/proc/net.c
 *
 *  Copyright (C) 2007
 *
 *  Author: Eric Biederman <ebiederm@xmission.com>
 *
 *  proc net directory handling functions
 */

#include <linux/uaccess.h>

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/uidgid.h>
#include <net/net_namespace.h>
#include <linux/seq_file.h>

#include "internal.h"

/* Return the struct net stored as ->data in the parent of @pde. */
static inline struct net *PDE_NET(struct proc_dir_entry *pde)
{
        struct proc_dir_entry *dir = pde->parent;

        return dir->data;
}

/*
 * Look up the net namespace behind @inode's proc entry and pass it
 * through maybe_get_net(); callers treat a NULL result as failure.
 */
static struct net *get_proc_net(const struct inode *inode)
{
        struct proc_dir_entry *pde = PDE(inode);

        return maybe_get_net(PDE_NET(pde));
}

static int seq_open_net(struct inode *inode, struct file *file)
{
        unsigned int state_size = PDE(inode)->state_size;
        struct seq_net_private *p;
        struct net *net;

        WARN_ON_ONCE(state_size < sizeof(*p));

        if (file->f_mode & FMODE_WRITE && !PDE(inode)->write)
                return -EACCES;

        net = get_proc_net(inode);
        if (!net)
                return -ENXIO;

        p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
        if (!p) {
                put_net(net);
                return -ENOMEM;
        }
#ifdef CONFIG_NET_NS
        p->net = net;
#endif
        return 0;
}

/*
 * Release handler used together with seq_open_net(): drop the reference on
 * the seq_file's net namespace, then hand the file to
 * seq_release_private().  Always returns 0.
 */
static int seq_release_net(struct inode *ino, struct file *f)
{
        struct seq_file *m = f->private_data;
        struct net *net = seq_file_net(m);

        put_net(net);
        seq_release_private(ino, f);

        return 0;
}

/*
 * proc_ops installed on entries created by proc_create_net_data() and
 * proc_create_net_data_write(): open/release pin and unpin the entry's net
 * namespace around the generic seq_file read and seek helpers.
 */
static const struct proc_ops proc_net_seq_ops = {
        .proc_open        = seq_open_net,
        .proc_read        = seq_read,
        .proc_write        = proc_simple_write,
        .proc_lseek        = seq_lseek,
        .proc_release        = seq_release_net,
};

/*
 * With CONFIG_NET_NS, take a reference on the current task's net namespace
 * and store it in the seq_net_private that @priv_data points to.  @aux is
 * not used.  Always returns 0.
 */
int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
{
#ifdef CONFIG_NET_NS
        struct seq_net_private *priv = priv_data;
        struct net *cur_net = current->nsproxy->net_ns;

        priv->net = get_net(cur_net);
#endif
        return 0;
}

/*
 * With CONFIG_NET_NS, drop the net namespace reference held in the
 * seq_net_private that @priv_data points to.  No-op otherwise.
 */
void bpf_iter_fini_seq_net(void *priv_data)
{
#ifdef CONFIG_NET_NS
        struct seq_net_private *priv = priv_data;
        struct net *net = priv->net;

        put_net(net);
#endif
}

/*
 * Create a net_ns-specific seq_file entry called @name under @parent.
 * @ops supplies the seq_file iterator, @state_size is the size of the
 * per-open private area and @data is handed to proc_create_reg().  No
 * ->write method is set here.  Returns the result of proc_register(), or
 * NULL if proc_create_reg() fails.
 */
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data)
{
        struct proc_dir_entry *pde = proc_create_reg(name, mode, &parent, data);

        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->state_size = state_size;
        pde->seq_ops = ops;
        pde->proc_ops = &proc_net_seq_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_data);

/**
 * proc_create_net_data_write - Create a writable net_ns-specific proc file
 * @name: The name of the file.
 * @mode: The file's access mode.
 * @parent: The parent directory in which to create.
 * @ops: The seq_file ops with which to read the file.
 * @write: The write method with which to 'modify' the file.
 * @state_size: Size of the per-open seq_file private state; the open
 *              handler warns if it is smaller than struct seq_net_private.
 * @data: Data for retrieval by PDE_DATA().
 *
 * Create a network namespaced proc file in the @parent directory with the
 * specified @name and @mode that allows reading of a file that displays a
 * series of elements and also provides for the file accepting writes that have
 * some arbitrary effect.
 *
 * The functions in the @ops table are used to iterate over items to be
 * presented and extract the readable content using the seq_file interface.
 *
 * The @write function is called with the data copied into a kernel space
 * scratch buffer and has a NUL appended for convenience.  The buffer may be
 * modified by the @write function.  @write should return 0 on success.
 *
 * The @data value is accessible from the @ops and @write functions by calling
 * PDE_DATA() on the file inode.  The network namespace must be accessed by
 * calling seq_file_net() on the seq_file struct.
 *
 * Return: the result of proc_register(), or NULL if the entry could not be
 * created.
 */
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
                                                  struct proc_dir_entry *parent,
                                                  const struct seq_operations *ops,
                                                  proc_write_t write,
                                                  unsigned int state_size, void *data)
{
        struct proc_dir_entry *pde = proc_create_reg(name, mode, &parent, data);

        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->write = write;
        pde->state_size = state_size;
        pde->seq_ops = ops;
        pde->proc_ops = &proc_net_seq_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_data_write);

static int single_open_net(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *de = PDE(inode);
        struct net *net;
        int err;

        net = get_proc_net(inode);
        if (!net)
                return -ENXIO;

        err = single_open(file, de->single_show, net);
        if (err)
                put_net(net);
        return err;
}

/*
 * Release handler used together with single_open_net(): drop the net
 * namespace reference kept in seq_file->private, then return whatever
 * single_release() returns.
 */
static int single_release_net(struct inode *ino, struct file *f)
{
        struct seq_file *m = f->private_data;
        struct net *net = m->private;

        put_net(net);

        return single_release(ino, f);
}

/*
 * proc_ops installed on entries created by proc_create_net_single() and
 * proc_create_net_single_write(): open/release pin and unpin the entry's
 * net namespace around the generic seq_file read and seek helpers.
 */
static const struct proc_ops proc_net_single_ops = {
        .proc_open        = single_open_net,
        .proc_read        = seq_read,
        .proc_write        = proc_simple_write,
        .proc_lseek        = seq_lseek,
        .proc_release        = single_release_net,
};

/*
 * Create a net_ns-specific single-record entry called @name under @parent.
 * @show produces the file content and @data is handed to
 * proc_create_reg().  No ->write method is set here.  Returns the result of
 * proc_register(), or NULL if proc_create_reg() fails.
 */
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data)
{
        struct proc_dir_entry *pde = proc_create_reg(name, mode, &parent, data);

        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->single_show = show;
        pde->proc_ops = &proc_net_single_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_single);

/**
 * proc_create_net_single_write - Create a writable net_ns-specific proc file
 * @name: The name of the file.
 * @mode: The file's access mode.
 * @parent: The parent directory in which to create.
 * @show: The seqfile show method with which to read the file.
 * @write: The write method with which to 'modify' the file.
 * @data: Data for retrieval by PDE_DATA().
 *
 * Create a network-namespaced proc file in the @parent directory with the
 * specified @name and @mode that allows reading of a file that displays a
 * single element rather than a series and also provides for the file accepting
 * writes that have some arbitrary effect.
 *
 * The @show function is called to extract the readable content via the
 * seq_file interface.
 *
 * The @write function is called with the data copied into a kernel space
 * scratch buffer and has a NUL appended for convenience.  The buffer may be
 * modified by the @write function.  @write should return 0 on success.
 *
 * The @data value is accessible from the @show and @write functions by calling
 * PDE_DATA() on the file inode.  The network namespace must be accessed by
 * calling seq_file_single_net() on the seq_file struct.
 *
 * Return: the result of proc_register(), or NULL if the entry could not be
 * created.
 */
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
                                                    struct proc_dir_entry *parent,
                                                    int (*show)(struct seq_file *, void *),
                                                    proc_write_t write,
                                                    void *data)
{
        struct proc_dir_entry *pde = proc_create_reg(name, mode, &parent, data);

        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->write = write;
        pde->single_show = show;
        pde->proc_ops = &proc_net_single_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_single_write);

/*
 * Return a referenced net namespace for the task that @dir's pid refers
 * to, or NULL when no such task is found or the task has no nsproxy.
 *
 * The task is looked up under rcu_read_lock() and its nsproxy is read
 * under task_lock().
 */
static struct net *get_proc_task_net(struct inode *dir)
{
        struct net *net = NULL;
        struct task_struct *tsk;
        struct nsproxy *nsp;

        rcu_read_lock();

        tsk = pid_task(proc_pid(dir), PIDTYPE_PID);
        if (tsk == NULL)
                goto out_rcu;

        task_lock(tsk);
        nsp = tsk->nsproxy;
        if (nsp != NULL)
                net = get_net(nsp->net_ns);
        task_unlock(tsk);

out_rcu:
        rcu_read_unlock();

        return net;
}

/*
 * ->lookup for the per-task net directory: resolve @dentry inside the
 * proc_net tree of the task's net namespace.  Returns ERR_PTR(-ENOENT)
 * when the task's namespace cannot be obtained.
 */
static struct dentry *proc_tgid_net_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
        struct net *net = get_proc_task_net(dir);
        struct dentry *found;

        if (net == NULL)
                return ERR_PTR(-ENOENT);

        found = proc_lookup_de(dir, dentry, net->proc_net);
        put_net(net);

        return found;
}

/*
 * ->getattr for the per-task net directory: fill @stat generically, then,
 * if the task's net namespace can be obtained, report the link count of
 * that namespace's proc_net directory.  Always returns 0.
 */
static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
                                 u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct net *net = get_proc_task_net(inode);

        generic_fillattr(inode, stat);
        if (net == NULL)
                return 0;

        stat->nlink = net->proc_net->nlink;
        put_net(net);

        return 0;
}

/*
 * Inode operations that resolve lookups and attributes against the proc_net
 * tree of a task's net namespace (see get_proc_task_net()).
 */
const struct inode_operations proc_net_inode_operations = {
        .lookup                = proc_tgid_net_lookup,
        .getattr        = proc_tgid_net_getattr,
};

/*
 * ->iterate_shared for the per-task net directory: list the proc_net tree
 * of the task's net namespace.  Returns -EINVAL when the namespace cannot
 * be obtained, otherwise the result of proc_readdir_de().
 */
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
{
        struct net *net = get_proc_task_net(file_inode(file));
        int rc;

        if (net == NULL)
                return -EINVAL;

        rc = proc_readdir_de(file, ctx, net->proc_net);
        put_net(net);

        return rc;
}

/*
 * Directory file operations for the per-task net directory; iteration is
 * delegated to proc_tgid_net_readdir().
 */
const struct file_operations proc_net_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = proc_tgid_net_readdir,
};

static __net_init int proc_net_ns_init(struct net *net)
{
        struct proc_dir_entry *netd, *net_statd;
        kuid_t uid;
        kgid_t gid;
        int err;

        err = -ENOMEM;
        netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
        if (!netd)
                goto out;

        netd->subdir = RB_ROOT;
        netd->data = net;
        netd->nlink = 2;
        netd->namelen = 3;
        netd->parent = &proc_root;
        netd->name = netd->inline_name;
        memcpy(netd->name, "net", 4);

        uid = make_kuid(net->user_ns, 0);
        if (!uid_valid(uid))
                uid = netd->uid;

        gid = make_kgid(net->user_ns, 0);
        if (!gid_valid(gid))
                gid = netd->gid;

        proc_set_user(netd, uid, gid);

        /* Seed dentry revalidation for /proc/${pid}/net */
        pde_force_lookup(netd);

        err = -EEXIST;
        net_statd = proc_net_mkdir(net, "stat", netd);
        if (!net_statd)
                goto free_net;

        net->proc_net = netd;
        net->proc_net_stat = net_statd;
        return 0;

free_net:
        pde_free(netd);
out:
        return err;
}

/*
 * Per-namespace teardown: remove the "stat" subdirectory from the
 * namespace's "net" entry, then free that entry.
 */
static __net_exit void proc_net_ns_exit(struct net *net)
{
        struct proc_dir_entry *netd = net->proc_net;

        remove_proc_entry("stat", netd);
        pde_free(netd);
}

/* Per-netns hooks registered by proc_net_init(). */
static struct pernet_operations __net_initdata proc_net_ns_ops = {
        .init = proc_net_ns_init,
        .exit = proc_net_ns_exit,
};

/*
 * Boot-time setup: create the "net" -> "self/net" symlink (its result is
 * not checked) and register the per-netns init/exit hooks.  Returns the
 * result of register_pernet_subsys().
 */
int __init proc_net_init(void)
{
        int ret;

        proc_symlink("net", NULL, "self/net");
        ret = register_pernet_subsys(&proc_net_ns_ops);

        return ret;
}













    1 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TASK_WORK_H
#define _LINUX_TASK_WORK_H

#include <linux/list.h>
#include <linux/sched.h>

/* Callback type stored in callback_head::func by init_task_work(). */
typedef void (*task_work_func_t)(struct callback_head *);

/*
 * Initialise @twork by setting its callback to @func.  No other field of
 * @twork is touched.
 */
static inline void
init_task_work(struct callback_head *twork, task_work_func_t func)
{
        twork->func = func;
}

/*
 * Notification modes accepted as the @mode argument of task_work_add().
 * NOTE(review): the exact semantics of each mode are defined by the
 * task_work_add() implementation, which is not part of this header.
 */
enum task_work_notify_mode {
        TWA_NONE,
        TWA_RESUME,
        TWA_SIGNAL,
};

/*
 * Task work API.  The implementations live outside this header; only what
 * the signatures show is stated here.
 *
 * task_work_add()          - takes @task, @twork and a notification @mode.
 * task_work_cancel_match() - takes a @match predicate that receives a
 *                            callback_head and the caller's @data cookie;
 *                            returns a callback_head pointer.
 * task_work_cancel_func()  - like the above, but takes a callback function
 *                            pointer @func instead of a predicate.
 * task_work_cancel()       - takes a specific callback_head @cb; returns bool.
 * task_work_run()          - takes no arguments.
 */
int task_work_add(struct task_struct *task, struct callback_head *twork,
                        enum task_work_notify_mode mode);

struct callback_head *task_work_cancel_match(struct task_struct *task,
        bool (*match)(struct callback_head *cb, void *data), void *data);
struct callback_head *task_work_cancel_func(struct task_struct *task,
                                            task_work_func_t func);
bool task_work_cancel(struct task_struct *task, struct callback_head *cb);
void task_work_run(void);

/*
 * Call task_work_run().  The @task argument is not used by this
 * implementation.
 */
static inline void exit_task_work(struct task_struct *task)
{
        task_work_run();
}

#endif        /* _LINUX_TASK_WORK_H */
































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/* SPDX-License-Identifier: GPL-2.0 */
/* Based on net/wireless/trace.h */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM cfg802154

#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __RDEV_CFG802154_OPS_TRACE

#include <linux/tracepoint.h>

#include <net/cfg802154.h>

/* Size of the fixed buffer that holds a copy of the phy name in each event. */
#define MAXNAME                32
#define WPAN_PHY_ENTRY        __array(char, wpan_phy_name, MAXNAME)
/*
 * Copy the phy name into the event record.  Expects a `wpan_phy` pointer to
 * be in scope at the expansion site.  strscpy() replaces the deprecated
 * strlcpy(): it does not need to read the source beyond MAXNAME bytes and
 * still NUL-terminates the destination; the return value is unused here.
 */
#define WPAN_PHY_ASSIGN        strscpy(__entry->wpan_phy_name,         \
                                wpan_phy_name(wpan_phy), \
                                MAXNAME)
#define WPAN_PHY_PR_FMT        "%s"
#define WPAN_PHY_PR_ARG        __entry->wpan_phy_name

/*
 * Event field and helpers for recording a wpan_dev by its numeric
 * identifier.  WPAN_DEV_ASSIGN expects a `wpan_dev` pointer to be in scope
 * at the expansion site and records 0 when that pointer is NULL or an
 * error pointer.
 */
#define WPAN_DEV_ENTRY        __field(u32, identifier)
#define WPAN_DEV_ASSIGN        (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \
                                         ? wpan_dev->identifier : 0)
#define WPAN_DEV_PR_FMT        "wpan_dev(%u)"
#define WPAN_DEV_PR_ARG        (__entry->identifier)

/*
 * Event fields and helpers for recording a CCA configuration (mode and
 * option).  WPAN_CCA_ASSIGN expects a `cca` pointer to be in scope at the
 * expansion site and dereferences it unconditionally.
 */
#define WPAN_CCA_ENTRY        __field(enum nl802154_cca_modes, cca_mode) \
                        __field(enum nl802154_cca_opts, cca_opt)
#define WPAN_CCA_ASSIGN \
        do {                                         \
                (__entry->cca_mode) = cca->mode; \
                (__entry->cca_opt) = cca->opt;         \
        } while (0)
#define WPAN_CCA_PR_FMT        "cca_mode: %d, cca_opt: %d"
#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt

/*
 * Map a truth value to the string "true" or "false".  The whole expansion
 * is parenthesised so the conditional cannot re-associate with operators
 * surrounding the macro invocation.
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

/*************************************************************
 *                        rdev->ops traces                     *
 *************************************************************/

/*
 * Event class for events whose only argument is the wpan_phy: records and
 * prints the phy name.
 */
DECLARE_EVENT_CLASS(wpan_phy_only_evt,
        TP_PROTO(struct wpan_phy *wpan_phy),
        TP_ARGS(wpan_phy),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
        ),
        TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG)
);

/* Suspend event; an instance of the wpan_phy_only_evt class. */
DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend,
        TP_PROTO(struct wpan_phy *wpan_phy),
        TP_ARGS(wpan_phy)
);

/* Resume event; an instance of the wpan_phy_only_evt class. */
DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume,
        TP_PROTO(struct wpan_phy *wpan_phy),
        TP_ARGS(wpan_phy)
);

/*
 * Add-virtual-interface event: records the phy name, the interface name
 * ("<noname>" when @name is NULL), the interface type and the extended
 * address.  The address is stored as passed (__le64) and converted to CPU
 * byte order only when printed.
 */
TRACE_EVENT(802154_rdev_add_virtual_intf,
        TP_PROTO(struct wpan_phy *wpan_phy, char *name,
                 enum nl802154_iftype type, __le64 extended_addr),
        TP_ARGS(wpan_phy, name, type, extended_addr),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                __string(vir_intf_name, name ? name : "<noname>")
                __field(enum nl802154_iftype, type)
                __field(__le64, extended_addr)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                __assign_str(vir_intf_name, name ? name : "<noname>");
                __entry->type = type;
                __entry->extended_addr = extended_addr;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx",
                  WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
                  __le64_to_cpu(__entry->extended_addr))
);

/*
 * Delete-virtual-interface event: records the phy name and the wpan_dev
 * identifier.
 */
TRACE_EVENT(802154_rdev_del_virtual_intf,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
        TP_ARGS(wpan_phy, wpan_dev),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG,
                  WPAN_DEV_PR_ARG)
);

/* Set-channel event: records the phy name and the @page/@channel arguments. */
TRACE_EVENT(802154_rdev_set_channel,
        TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel),
        TP_ARGS(wpan_phy, page, channel),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                __field(u8, page)
                __field(u8, channel)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                __entry->page = page;
                __entry->channel = channel;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG,
                  __entry->page, __entry->channel)
);

/*
 * Set-tx-power event: records the phy name and the @power argument, which
 * is printed under the label "mbm".
 */
TRACE_EVENT(802154_rdev_set_tx_power,
        TP_PROTO(struct wpan_phy *wpan_phy, s32 power),
        TP_ARGS(wpan_phy, power),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                __field(s32, power)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                __entry->power = power;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG,
                  __entry->power)
);

/*
 * Set-CCA-mode event: records the phy name plus the mode and opt members of
 * @cca.  @cca is dereferenced without a NULL check.
 */
TRACE_EVENT(802154_rdev_set_cca_mode,
        TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
        TP_ARGS(wpan_phy, cca),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_CCA_ENTRY
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_CCA_ASSIGN;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG,
                  WPAN_CCA_PR_ARG)
);

/* Set-CCA-ED-level event: records the phy name and the @ed_level argument. */
TRACE_EVENT(802154_rdev_set_cca_ed_level,
        TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level),
        TP_ARGS(wpan_phy, ed_level),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                __field(s32, ed_level)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                __entry->ed_level = ed_level;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG,
                  __entry->ed_level)
);

/*
 * Event class for events that carry a phy, a wpan_dev and one __le16
 * value.  The value is stored as passed and converted to CPU byte order
 * only when printed; the class's default print format labels it "pan id".
 */
DECLARE_EVENT_CLASS(802154_le16_template,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 __le16 le16arg),
        TP_ARGS(wpan_phy, wpan_dev, le16arg),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(__le16, le16arg)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->le16arg = le16arg;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x",
                  WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
                  __le16_to_cpu(__entry->le16arg))
);

/*
 * Set-PAN-id event; an instance of 802154_le16_template using the class's
 * default "pan id" print format.
 */
DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 __le16 le16arg),
        TP_ARGS(wpan_phy, wpan_dev, le16arg)
);

/*
 * Set-short-address event; an instance of 802154_le16_template that
 * overrides the print format so the value is labelled "short addr".
 */
DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 __le16 le16arg),
        TP_ARGS(wpan_phy, wpan_dev, le16arg),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x",
                  WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
                  __le16_to_cpu(__entry->le16arg))
);

/*
 * Set-backoff-exponent event: records the phy name, the wpan_dev identifier
 * and the @min_be/@max_be arguments.
 */
TRACE_EVENT(802154_rdev_set_backoff_exponent,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 u8 min_be, u8 max_be),
        TP_ARGS(wpan_phy, wpan_dev, min_be, max_be),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(u8, min_be)
                __field(u8, max_be)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->min_be = min_be;
                __entry->max_be = max_be;
        ),

        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
                  ", min be: %d, max be: %d", WPAN_PHY_PR_ARG,
                  WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
);

/*
 * Set-CSMA-backoffs event: records the phy name, the wpan_dev identifier
 * and the @max_csma_backoffs argument.
 */
TRACE_EVENT(802154_rdev_set_csma_backoffs,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 u8 max_csma_backoffs),
        TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(u8, max_csma_backoffs)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->max_csma_backoffs = max_csma_backoffs;
        ),

        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
                  ", max csma backoffs: %d", WPAN_PHY_PR_ARG,
                  WPAN_DEV_PR_ARG, __entry->max_csma_backoffs)
);

/*
 * Set-max-frame-retries event: records the phy name, the wpan_dev
 * identifier and the signed @max_frame_retries argument.
 */
TRACE_EVENT(802154_rdev_set_max_frame_retries,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 s8 max_frame_retries),
        TP_ARGS(wpan_phy, wpan_dev, max_frame_retries),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(s8, max_frame_retries)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->max_frame_retries = max_frame_retries;
        ),

        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
                  ", max frame retries: %d", WPAN_PHY_PR_ARG,
                  WPAN_DEV_PR_ARG, __entry->max_frame_retries)
);

/*
 * Set-LBT-mode event: records the phy name, the wpan_dev identifier and the
 * boolean @mode argument, printed as "true"/"false".
 */
TRACE_EVENT(802154_rdev_set_lbt_mode,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 bool mode),
        TP_ARGS(wpan_phy, wpan_dev, mode),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(bool, mode)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->mode = mode;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
                ", lbt mode: %s", WPAN_PHY_PR_ARG,
                WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
);

/*
 * Set-ackreq-default event: records the phy name, the wpan_dev identifier
 * and the boolean @ackreq argument, printed as "true"/"false".
 */
TRACE_EVENT(802154_rdev_set_ackreq_default,
        TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
                 bool ackreq),
        TP_ARGS(wpan_phy, wpan_dev, ackreq),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                WPAN_DEV_ENTRY
                __field(bool, ackreq)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                WPAN_DEV_ASSIGN;
                __entry->ackreq = ackreq;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
                ", ackreq default: %s", WPAN_PHY_PR_ARG,
                WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq))
);

/* Return-value event: records the phy name and the integer @ret argument. */
TRACE_EVENT(802154_rdev_return_int,
        TP_PROTO(struct wpan_phy *wpan_phy, int ret),
        TP_ARGS(wpan_phy, ret),
        TP_STRUCT__entry(
                WPAN_PHY_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WPAN_PHY_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG,
                  __entry->ret)
);

#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
































































































































    2 









    2 










    2 







    2 









    2 





    2 






























































































































































































    2 

    2 












    2 






































    2 


    2 
    2 













    2 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 































    2 



    2 










    2 



    2 


    2 






































    2 








    2 














    2 









    2 












    2 




    2 








    2 



    2 


    2 
    2 








    2 























    2 












    2 

    2 






    2 







    2 








































































































    2 






    2 







































    2 
















































































































































































































    2 







    2 

    2 




    2 

    2 





    2 











    2 

    2 






    2 

    2 
    2 
















    2 






    2 

    2 




    2 






    2 















    2 


    2 





























































































































































































    2 













    2 













    2 













    2 
















    2 












    2 































    2 










































































































































































































































































































    2 



    2 
    2 

    2 






    2 















    1 





    1 
    1 

    1 




    1 




    1 

    1 
    1 
    1 

    1 









    1 





    1 
    1 

    1 




    1 



    1 

    1 
    1 
    1 

    1 








































































































































































































































































    1 





    1 
    1 

    1 





    1 








    1 



    1 



    1 



    1 





    1 

    1 











    1 




























    1 





    1 

    1 


    1 




    1 


    1 














    1 



    1 




    1 




    1 









    1 



    1 

    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/namei.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  Directory entry file type support and forward compatibility hooks
 *        for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
 *  Hash Tree Directory indexing (c)
 *        Daniel Phillips, 2001
 *  Hash Tree Directory indexing porting
 *        Christopher Li, 2002
 *  Hash Tree Directory indexing cleanup
 *        Theodore Ts'o, 2002
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "ext4_jbd2.h"

#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>
/*
 * Define how far ahead to read directories while searching them.
 * NAMEI_RA_SIZE is the product of the chunk count and the number of
 * blocks per chunk.
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE             (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)

/*
 * Append one new block to the end of directory @inode.
 *
 * On success the logical number of the new block is stored in *@block,
 * i_size and i_disksize have been grown by one block, and the returned
 * buffer has had journal write access requested for it.  On failure an
 * ERR_PTR() is returned:
 *   -ENOSPC        the directory already reached s_max_dir_size_kb
 *   -EFSCORRUPTED  the block just past i_size is already mapped
 *   other          whatever ext4_map_blocks(), ext4_bread() or
 *                  ext4_journal_get_write_access() reported
 */
static struct buffer_head *ext4_append(handle_t *handle,
                                        struct inode *inode,
                                        ext4_lblk_t *block)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_map_blocks map;
        struct buffer_head *new_bh;
        int ret;

        /* Enforce the optional per-directory size cap (in KiB). */
        if (unlikely(sbi->s_max_dir_size_kb &&
                     (inode->i_size >> 10) >= sbi->s_max_dir_size_kb))
                return ERR_PTR(-ENOSPC);

        /* The new block is the first one past the current end of file. */
        *block = inode->i_size >> sb->s_blocksize_bits;
        map.m_len = 1;
        map.m_lblk = *block;

        /*
         * Look the block up without allocating.  It must not be mapped
         * yet; if it is, writing a fresh directory block over it would
         * corrupt the directory.
         */
        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret < 0)
                return ERR_PTR(ret);
        if (ret > 0) {
                EXT4_ERROR_INODE(inode, "Logical block already allocated");
                return ERR_PTR(-EFSCORRUPTED);
        }

        new_bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
        if (IS_ERR(new_bh))
                return new_bh;

        inode->i_size += sb->s_blocksize;
        EXT4_I(inode)->i_disksize = inode->i_size;

        BUFFER_TRACE(new_bh, "get_write_access");
        ret = ext4_journal_get_write_access(handle, new_bh);
        if (!ret)
                return new_bh;

        /* Could not get write access: drop the buffer and report it. */
        brelse(new_bh);
        ext4_std_error(sb, ret);
        return ERR_PTR(ret);
}

static int ext4_dx_csum_verify(struct inode *inode,
                               struct ext4_dir_entry *dirent);

/*
 * Hints to ext4_read_dirblock regarding whether we expect a directory
 * block being read to be an index block, or a block containing
 * directory entries (and if the latter, whether it was found via a
 * logical block in an htree index block).  This is used to control
 * what sort of sanity checking ext4_read_dirblock() will do on the
 * directory block read from the storage device.  EITHER means
 * the caller doesn't know what kind of directory block will be read,
 * so no specific verification will be done.
 */
typedef enum {
        EITHER, INDEX, DIRENT, DIRENT_HTREE
} dirblock_type_t;

/*
 * Wrapper around __ext4_read_dirblock() that supplies the calling
 * function's name and line number, which are used in its error reports.
 */
#define ext4_read_dirblock(inode, block, type) \
        __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)

/*
 * Read logical block @block of directory @inode and sanity-check it
 * according to the caller's expectation @type.  @func and @line identify
 * the caller in any error or warning that is logged.
 *
 * Returns:
 *   a buffer_head           on success
 *   NULL                    if the block is a hole and a hole is tolerated
 *                           (not block 0, and @type is EITHER or DIRENT)
 *   ERR_PTR(-EFSCORRUPTED)  block lies past i_size, an intolerable hole,
 *                           or a leaf block where INDEX was expected
 *   ERR_PTR(-EFSBADCRC)     checksum verification failed
 *   other ERR_PTR()         read error (or simulated -EIO)
 */
static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
                                                ext4_lblk_t block,
                                                dirblock_type_t type,
                                                const char *func,
                                                unsigned int line)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_dir_entry *first;
        struct buffer_head *bh;
        int index_block = 0;

        if (block >= inode->i_size >> inode->i_blkbits) {
                ext4_error_inode(inode, func, line, block,
                       "Attempting to read directory block (%u) that is past i_size (%llu)",
                       block, inode->i_size);
                return ERR_PTR(-EFSCORRUPTED);
        }

        if (!ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_EIO))
                bh = ext4_bread(NULL, inode, block, 0);
        else
                bh = ERR_PTR(-EIO);

        if (IS_ERR(bh)) {
                __ext4_warning(sb, func, line,
                               "inode #%lu: lblock %lu: comm %s: "
                               "error %ld reading directory block",
                               inode->i_ino, (unsigned long)block,
                               current->comm, PTR_ERR(bh));
                return bh;
        }

        if (!bh) {
                /*
                 * A hole.  It is tolerated only for a non-first block
                 * that the caller did not reach through the htree.
                 */
                if (type != INDEX && type != DIRENT_HTREE && block != 0)
                        return NULL;

                ext4_error_inode(inode, func, line, block,
                                 "Directory hole found for htree %s block %u",
                                 (type == INDEX) ? "index" : "leaf", block);
                return ERR_PTR(-EFSCORRUPTED);
        }

        /*
         * In an indexed directory, block 0 and any block whose first
         * record spans the whole block are treated as index blocks.
         */
        first = (struct ext4_dir_entry *) bh->b_data;
        if (is_dx(inode))
                index_block = block == 0 ||
                        ext4_rec_len_from_disk(first->rec_len,
                                               sb->s_blocksize) ==
                        sb->s_blocksize;

        if (type == INDEX && !index_block) {
                ext4_error_inode(inode, func, line, block,
                       "directory leaf block found instead of index block");
                brelse(bh);
                return ERR_PTR(-EFSCORRUPTED);
        }

        /* Nothing left to do without checksums, or if already verified. */
        if (!ext4_has_metadata_csum(sb) || buffer_verified(bh))
                return bh;

        if (index_block) {
                /*
                 * An empty leaf block can be mistaken for an index block,
                 * so the index checksum is only checked when the caller
                 * is sure that it asked for an index block.
                 */
                if (type != INDEX)
                        return bh;

                if (!ext4_dx_csum_verify(inode, first) ||
                    ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_CRC)) {
                        ext4_error_inode_err(inode, func, line, block,
                                             EFSBADCRC,
                                             "Directory index failed checksum");
                        brelse(bh);
                        return ERR_PTR(-EFSBADCRC);
                }
        } else {
                if (!ext4_dirblock_csum_verify(inode, bh) ||
                    ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_CRC)) {
                        ext4_error_inode_err(inode, func, line, block,
                                             EFSBADCRC,
                                             "Directory block failed checksum");
                        brelse(bh);
                        return ERR_PTR(-EFSBADCRC);
                }
        }

        set_buffer_verified(bh);
        return bh;
}

/* Map assert() onto J_ASSERT() unless assert is already defined. */
#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

/*
 * dxtrace(command) expands to its argument when DX_DEBUG is defined and
 * to nothing otherwise.
 */
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif

/*
 * Eight-byte directory-entry-style header (inode, rec_len, name_len,
 * file_type) embedded at the start of struct dx_root and struct dx_node.
 * NOTE(review): presumably mirrors the fixed part of an ext4 directory
 * entry so that index blocks parse as ordinary dirents - confirm in ext4.h.
 */
struct fake_dirent
{
        __le32 inode;
        __le16 rec_len;
        u8 name_len;
        u8 file_type;
};

/*
 * Little-endian limit/count pair of an index block.  get_dx_countlimit()
 * locates it 8 bytes into a node block or 32 bytes into a root block,
 * i.e. at the offset where entries[] starts in struct dx_node and
 * struct dx_root.
 * NOTE(review): presumably limit is the capacity and count the number of
 * entries in use - confirm against dx_get_limit()/dx_get_count().
 */
struct dx_countlimit
{
        __le16 limit;
        __le16 count;
};

/*
 * One element of the entries[] array in an index block: a little-endian
 * hash value and a little-endian block number.
 * NOTE(review): presumably block is the logical directory block holding
 * names whose hash is at or above hash - confirm against dx_probe().
 */
struct dx_entry
{
        __le32 hash;
        __le32 block;
};

/*
 * dx_root_info is laid out so that if it should somehow get overlaid by a
 * dirent the two low bits of the hash version will be zero.  Therefore, the
 * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
 */

/*
 * Layout of a root index block: two 12-byte dirent-shaped slots (an
 * 8-byte fake_dirent plus a 4-byte name each), then the 8-byte
 * dx_root_info, then the entries[] array at byte offset 32.  This is the
 * layout get_dx_countlimit() checks for when the first rec_len is 12.
 * NOTE(review): dot/dotdot presumably hold the "." and ".." entries.
 */
struct dx_root
{
        struct fake_dirent dot;
        char dot_name[4];
        struct fake_dirent dotdot;
        char dotdot_name[4];
        struct dx_root_info
        {
                /* get_dx_countlimit() rejects the block if this is non-zero */
                __le32 reserved_zero;
                u8 hash_version;
                /* must equal sizeof(struct dx_root_info) */
                u8 info_length; /* 8 */
                u8 indirect_levels;
                u8 unused_flags;
        }
        info;
        struct dx_entry        entries[];
};

/*
 * Layout of a non-root index block: one 8-byte fake dirent followed by
 * the entries[] array at byte offset 8.  get_dx_countlimit() selects this
 * offset when the first rec_len equals the block size.
 */
struct dx_node
{
        struct fake_dirent fake;
        struct dx_entry        entries[];
};


/*
 * In-memory record for one index block, used by dx_probe(),
 * dx_release(), dx_insert_block() and ext4_htree_next_block().
 * NOTE(review): presumably bh is the buffer holding the index block,
 * entries its entries[] array and at the currently selected entry -
 * confirm against dx_probe().
 */
struct dx_frame
{
        struct buffer_head *bh;
        struct dx_entry *entries;
        struct dx_entry *at;
};

/*
 * In-memory (CPU-endian) map element used by dx_make_map(),
 * dx_sort_map() and dx_move_dirents().
 * NOTE(review): presumably hash is the name hash of a directory entry,
 * offs its byte offset within the block and size its record size -
 * confirm against dx_make_map().
 */
struct dx_map_entry
{
        u32 hash;
        u16 offs;
        u16 size;
};

/*
 * This goes at the end of each htree block: a reserved word followed by
 * the little-endian checksum of the block.
 */
struct dx_tail {
        u32 dt_reserved;
        __le32 dt_checksum;        /* crc32c(uuid+inum+dirblock) */
};

/*
 * Forward declarations of the file-local hash-tree (dx_*) helpers, so
 * they can be used before their definitions elsewhere in this file.
 */

/* Accessors for the fields of a single index entry. */
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
/* Accessors for the count/limit pair of an entries[] array. */
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
/* Index traversal and maintenance. */
static struct dx_frame *dx_probe(struct ext4_filename *fname,
                                 struct inode *dir,
                                 struct dx_hash_info *hinfo,
                                 struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
                       struct dx_hash_info *hinfo,
                       struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
                struct dx_map_entry *offsets, int count, unsigned blocksize);
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
                                        u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
                                 __u32 *start_hash);
/* Lookup and insertion through the index. */
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
                struct ext4_filename *fname,
                struct ext4_dir_entry_2 **res_dir);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir, struct inode *inode);

/* checksumming functions */
/*
 * Set up an empty checksum tail in the last bytes of the directory block
 * held by @bh.  @blocksize is the size of that block.  The tail is zeroed
 * and then given its record length and the EXT4_FT_DIR_CSUM marker; the
 * checksum itself is left at zero.
 */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                 unsigned int blocksize)
{
        struct ext4_dir_entry_tail *tail;

        tail = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
        memset(tail, 0, sizeof(*tail));
        tail->det_reserved_ft = EXT4_FT_DIR_CSUM;
        tail->det_rec_len = ext4_rec_len_to_disk(sizeof(*tail), blocksize);
}

/*
 * Locate the checksum "dirent" at the tail of the directory block in @bh.
 *
 * With PARANOID defined the records are walked from the start of the
 * block and must land exactly on the tail position; otherwise the tail
 * position is simply computed from the block size.  Either way the
 * candidate is accepted only if its reserved fields are zero, its record
 * length equals the tail size and it carries the EXT4_FT_DIR_CSUM marker.
 *
 * Returns the tail, or NULL if the block has no valid tail.
 */
static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
                                                   struct buffer_head *bh)
{
        int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
        struct ext4_dir_entry_tail *tail;

#ifdef PARANOID
        struct ext4_dir_entry *cur, *limit;

        cur = (struct ext4_dir_entry *)bh->b_data;
        limit = (struct ext4_dir_entry *)(bh->b_data +
                (blocksize - sizeof(struct ext4_dir_entry_tail)));
        for (;;) {
                unsigned int rlen;

                if (cur >= limit)
                        break;
                rlen = ext4_rec_len_from_disk(cur->rec_len, blocksize);
                if (!rlen)
                        break;
                cur = (struct ext4_dir_entry *)(((void *)cur) + rlen);
        }

        /* The walk must stop exactly where the tail is expected. */
        if (cur != limit)
                return NULL;

        tail = (struct ext4_dir_entry_tail *)cur;
#else
        tail = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
#endif

        if (tail->det_reserved_zero1 || tail->det_reserved_zero2)
                return NULL;
        if (tail->det_reserved_ft != EXT4_FT_DIR_CSUM)
                return NULL;
        if (ext4_rec_len_from_disk(tail->det_rec_len, blocksize) !=
            sizeof(*tail))
                return NULL;

        return tail;
}

static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        __u32 csum;

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
        return cpu_to_le32(csum);
}

/*
 * Log a warning against @inode that its directory leaf block has no room
 * for a checksum.  @func and @line identify the reporting call site.
 */
static void __warn_no_space_for_csum(struct inode *inode, const char *func,
                                     unsigned int line)
{
        __ext4_warning_inode(inode, func, line,
                "No space for directory leaf checksum. Please run e2fsck -D.");
}

/* Call-site wrapper that fills in the caller's function name and line. */
#define warn_no_space_for_csum(inode) \
        __warn_no_space_for_csum((inode), __func__, __LINE__)

int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
        struct ext4_dir_entry_tail *t;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        t = get_dirent_tail(inode, bh);
        if (!t) {
                warn_no_space_for_csum(inode);
                return 0;
        }

        if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data,
                                                  (char *)t - bh->b_data))
                return 0;

        return 1;
}

/*
 * Recompute the checksum of the directory leaf block @bh and store it in
 * the block's tail.  Does nothing when the filesystem has no metadata
 * checksums; logs a warning and leaves the block alone when it has no
 * valid tail.
 */
static void ext4_dirblock_csum_set(struct inode *inode,
                                 struct buffer_head *bh)
{
        struct ext4_dir_entry_tail *tail;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        tail = get_dirent_tail(inode, bh);
        if (tail) {
                /* The checksum covers everything before the tail. */
                tail->det_checksum = ext4_dirblock_csum(inode, bh->b_data,
                                                (char *)tail - bh->b_data);
                return;
        }

        warn_no_space_for_csum(inode);
}

/*
 * Refresh the checksum of directory block @bh, then pass the buffer to
 * ext4_handle_dirty_metadata() and return that call's result.
 */
int ext4_handle_dirty_dirblock(handle_t *handle,
                               struct inode *inode,
                               struct buffer_head *bh)
{
        int err;

        ext4_dirblock_csum_set(inode, bh);
        err = ext4_handle_dirty_metadata(handle, inode, bh);

        return err;
}

/*
 * Locate the dx_countlimit header inside an index block.
 *
 * Two layouts are recognised, told apart by the record length of the first
 * directory entry at @dirent:
 *   - it spans the whole block: the header sits at byte offset 8;
 *   - it is 12 bytes long: a second entry must cover the rest of the block
 *     and be followed by a well-formed dx_root_info; the header then sits
 *     at byte offset 32.
 *
 * Returns a pointer to the header, or NULL when neither layout matches.
 * When @offset is non-NULL it receives the header's byte offset.
 */
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
                                               struct ext4_dir_entry *dirent,
                                               int *offset)
{
        int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
        unsigned int first_len = ext4_rec_len_from_disk(dirent->rec_len,
                                                        blocksize);
        int count_offset;

        if (first_len != blocksize && first_len != 12)
                return NULL;

        if (first_len == blocksize) {
                count_offset = 8;
        } else {
                struct ext4_dir_entry *second;
                struct dx_root_info *info;

                second = (struct ext4_dir_entry *)((void *)dirent + 12);
                if (ext4_rec_len_from_disk(second->rec_len, blocksize) !=
                    blocksize - 12)
                        return NULL;

                info = (struct dx_root_info *)((void *)second + 12);
                if (info->reserved_zero)
                        return NULL;
                if (info->info_length != sizeof(struct dx_root_info))
                        return NULL;

                count_offset = 32;
        }

        if (offset)
                *offset = count_offset;

        return (struct dx_countlimit *)((void *)dirent + count_offset);
}

static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
                           int count_offset, int count, struct dx_tail *t)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        __u32 csum;
        int size;
        __u32 dummy_csum = 0;
        int offset = offsetof(struct dx_tail, dt_checksum);

        size = count_offset + (count * sizeof(struct dx_entry));
        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
        csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));

        return cpu_to_le32(csum);
}

/*
 * Verify the checksum stored in the dx_tail of the index block that starts
 * at @dirent.
 *
 * Returns 1 when metadata checksums are disabled or the stored checksum
 * matches.  Returns 0 when:
 *   - the count/limit header cannot be located (an ext4 error is raised);
 *   - @limit entries plus a dx_tail do not fit in the block (a warning is
 *     logged);
 *   - the on-disk count exceeds the on-disk limit;
 *   - the checksum does not match.
 */
static int ext4_dx_csum_verify(struct inode *inode,
                               struct ext4_dir_entry *dirent)
{
        struct dx_countlimit *c;
        struct dx_tail *t;
        int count_offset, limit, count;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        c = get_dx_countlimit(inode, dirent, &count_offset);
        if (!c) {
                EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
                return 0;
        }
        limit = le16_to_cpu(c->limit);
        count = le16_to_cpu(c->count);
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
                warn_no_space_for_csum(inode);
                return 0;
        }

        /*
         * Both values come straight from the block being verified.  Only
         * limit has been bounded against the block size above; count
         * determines how many bytes ext4_dx_csum() reads, so a count larger
         * than limit would make it checksum memory beyond the entries area
         * (and potentially beyond the block).  Such a block is corrupt, so
         * report it as failing verification.
         */
        if (count > limit)
                return 0;

        /* The tail sits immediately after the last possible entry slot. */
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

        if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
                                            count, t))
                return 0;
        return 1;
}

static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
{
        struct dx_countlimit *c;
        struct dx_tail *t;
        int count_offset, limit, count;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        c = get_dx_countlimit(inode, dirent, &count_offset);
        if (!c) {
                EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
                return;
        }
        limit = le16_to_cpu(c->limit);
        count = le16_to_cpu(c->count);
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
                warn_no_space_for_csum(inode);
                return;
        }
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

        t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
}

/*
 * Refresh the index-block checksum of @bh, then pass the buffer to
 * ext4_handle_dirty_metadata() and return that call's result.
 */
static inline int ext4_handle_dirty_dx_node(handle_t *handle,
                                            struct inode *inode,
                                            struct buffer_head *bh)
{
        struct ext4_dir_entry *first = (struct ext4_dir_entry *)bh->b_data;
        int err;

        ext4_dx_csum_set(inode, first);
        err = ext4_handle_dirty_metadata(handle, inode, bh);

        return err;
}

/*
 * Step from directory entry p to the entry that follows it, i.e. advance by
 * p's decoded record length.
 *
 * p is at least 6 bytes before the end of page
 */
static inline struct ext4_dir_entry_2 *
ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
        unsigned int step = ext4_rec_len_from_disk(p->rec_len, blocksize);

        return (struct ext4_dir_entry_2 *)((char *)p + step);
}

/*
 * Return the block number stored in an index entry.
 *
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */

static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
        ext4_lblk_t raw = le32_to_cpu(entry->block);

        return raw & 0x0fffffff;
}

/* Store @value, converted to little-endian, as the entry's block number. */
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
{
        __le32 disk_block = cpu_to_le32(value);

        entry->block = disk_block;
}

/* Return the entry's hash value converted to CPU byte order. */
static inline unsigned dx_get_hash(struct dx_entry *entry)
{
        unsigned host_hash = le32_to_cpu(entry->hash);

        return host_hash;
}

static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
        entry->hash = cpu_to_le32(value);
}

/*
 * Read the "count" field of the dx_countlimit header that overlays the
 * first slot of the @entries array.
 */
static inline unsigned dx_get_count(struct dx_entry *entries)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *)entries;

        return le16_to_cpu(hdr->count);
}

/*
 * Read the "limit" field of the dx_countlimit header that overlays the
 * first slot of the @entries array.
 */
static inline unsigned dx_get_limit(struct dx_entry *entries)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *)entries;

        return le16_to_cpu(hdr->limit);
}

/*
 * Write @value (as little-endian 16-bit) into the "count" field of the
 * dx_countlimit header that overlays the first slot of @entries.
 */
static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *)entries;

        hdr->count = cpu_to_le16(value);
}

/*
 * Write @value (as little-endian 16-bit) into the "limit" field of the
 * dx_countlimit header that overlays the first slot of @entries.
 */
static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *)entries;

        hdr->limit = cpu_to_le16(value);
}

/*
 * Number of dx_entry slots available in a root index block.
 *
 * Starts from the block size and subtracts the space taken by two leading
 * directory records (name lengths 1 and 2 -- presumably "." and ".."),
 * @infosize bytes of root info, and, when metadata checksums are enabled,
 * a dx_tail.
 */
static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
        unsigned avail = dir->i_sb->s_blocksize;

        avail -= EXT4_DIR_REC_LEN(1);
        avail -= EXT4_DIR_REC_LEN(2);
        avail -= infosize;
        if (ext4_has_metadata_csum(dir->i_sb))
                avail -= sizeof(struct dx_tail);

        return avail / sizeof(struct dx_entry);
}

/*
 * Number of dx_entry slots available in a non-root index block.
 *
 * Starts from the block size and subtracts one empty-name directory record
 * and, when metadata checksums are enabled, a dx_tail.
 */
static inline unsigned dx_node_limit(struct inode *dir)
{
        unsigned avail = dir->i_sb->s_blocksize;

        avail -= EXT4_DIR_REC_LEN(0);
        if (ext4_has_metadata_csum(dir->i_sb))
                avail -= sizeof(struct dx_tail);

        return avail / sizeof(struct dx_entry);
}

/*
 * Debug
 */
#ifdef DX_DEBUG
static void dx_show_index(char * label, struct dx_entry *entries)
{
        int i, n = dx_get_count (entries);
        printk(KERN_DEBUG "%s index", label);
        for (i = 0; i < n; i++) {
                printk(KERN_CONT " %x->%lu",
                       i ? dx_get_hash(entries + i) : 0,
                       (unsigned long)dx_get_block(entries + i));
        }
        printk(KERN_CONT "\n");
}

/*
 * Totals gathered by the DX_DEBUG dump helpers.  The helpers build this
 * struct with positional initialisers, so the field order must not change.
 */
struct stats
{
        unsigned names;         /* directory entries with a non-zero inode */
        unsigned space;         /* sum of EXT4_DIR_REC_LEN(name_len) over those entries */
        unsigned bcount;        /* number of leaf blocks visited */
};

/*
 * Debug helper: walk the directory entries in [de, de + size) and count the
 * ones in use (non-zero inode).
 *
 * When @show_names is set, each in-use entry is also printed as
 * "name:hash.offset".  With CONFIG_FS_ENCRYPTION, names are tagged (U) when
 * no encryption key is available for @dir and (E) when they were converted
 * with fscrypt_fname_disk_to_usr().
 *
 * @hinfo is copied, so the caller's hash info is not modified.
 *
 * Returns the number of in-use entries, the space they need according to
 * EXT4_DIR_REC_LEN(), and a block count of 1.
 */
static struct stats dx_show_leaf(struct inode *dir,
                                struct dx_hash_info *hinfo,
                                struct ext4_dir_entry_2 *de,
                                int size, int show_names)
{
        unsigned names = 0, space = 0;
        char *base = (char *) de;
        struct dx_hash_info h = *hinfo;

        printk("names: ");
        while ((char *) de < base + size)
        {
                if (de->inode)
                {
                        if (show_names)
                        {
#ifdef CONFIG_FS_ENCRYPTION
                                int len;
                                char *name;
                                struct fscrypt_str fname_crypto_str =
                                        FSTR_INIT(NULL, 0);
                                int res = 0;

                                name  = de->name;
                                len = de->name_len;
                                if (IS_ENCRYPTED(dir))
                                        res = fscrypt_get_encryption_info(dir);
                                if (res) {
                                        printk(KERN_WARNING "Error setting up"
                                               " fname crypto: %d\n", res);
                                }
                                if (!fscrypt_has_encryption_key(dir)) {
                                        /* Directory is not encrypted */
                                        ext4fs_dirhash(dir, de->name,
                                                de->name_len, &h);
                                        /*
                                         * On-disk names are not
                                         * NUL-terminated: len must be the
                                         * precision ("%.*s"), not the field
                                         * width.
                                         */
                                        printk("%.*s:(U)%x.%u ", len,
                                               name, h.hash,
                                               (unsigned) ((char *) de
                                                           - base));
                                } else {
                                        struct fscrypt_str de_name =
                                                FSTR_INIT(name, len);

                                        /* Directory is encrypted */
                                        res = fscrypt_fname_alloc_buffer(
                                                len, &fname_crypto_str);
                                        if (res)
                                                printk(KERN_WARNING "Error "
                                                        "allocating crypto "
                                                        "buffer--skipping "
                                                        "crypto\n");
                                        res = fscrypt_fname_disk_to_usr(dir,
                                                0, 0, &de_name,
                                                &fname_crypto_str);
                                        if (res) {
                                                printk(KERN_WARNING "Error "
                                                        "converting filename "
                                                        "from disk to usr"
                                                        "\n");
                                                name = "??";
                                                len = 2;
                                        } else {
                                                name = fname_crypto_str.name;
                                                len = fname_crypto_str.len;
                                        }
                                        ext4fs_dirhash(dir, de->name,
                                                       de->name_len, &h);
                                        /* Bound the output by len, as above. */
                                        printk("%.*s:(E)%x.%u ", len, name,
                                               h.hash, (unsigned) ((char *) de
                                                                   - base));
                                        fscrypt_fname_free_buffer(
                                                        &fname_crypto_str);
                                }
#else
                                int len = de->name_len;
                                char *name = de->name;
                                ext4fs_dirhash(dir, de->name, de->name_len, &h);
                                /*
                                 * On-disk names are not NUL-terminated: len
                                 * must be the precision ("%.*s"), not the
                                 * field width.
                                 */
                                printk("%.*s:%x.%u ", len, name, h.hash,
                                       (unsigned) ((char *) de - base));
#endif
                        }
                        space += EXT4_DIR_REC_LEN(de->name_len);
                        names++;
                }
                de = ext4_next_entry(de, size);
        }
        printk(KERN_CONT "(%i)\n", names);
        return (struct stats) { names, space, 1 };
}

struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
                             struct dx_entry *entries, int levels)
{
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned count = dx_get_count(entries), names = 0, space = 0, i;
        unsigned bcount = 0;
        struct buffer_head *bh;
        printk("%i indexed blocks...\n", count);
        for (i = 0; i < count; i++, entries++)
        {
                ext4_lblk_t block = dx_get_block(entries);
                ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
                u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
                struct stats stats;
                printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
                bh = ext4_bread(NULL,dir, block, 0);
                if (!bh || IS_ERR(bh))
                        continue;
                stats = levels?
                   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
                   dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
                        bh->b_data, blocksize, 0);
                names += stats.names;
                space += stats.space;
                bcount += stats.bcount;
                brelse(bh);
        }
        if (bcount)
                printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
                       levels ? "" : "   ", names, space/bcount,
                       (space/bcount)*100/blocksize);
        return (struct stats) { names, space, bcount};
}
#endif /* DX_DEBUG */

/*
 * Probe for a directory leaf block to search.
 *
 * Reads the root index block (block 0) of @dir, validates its header, and
 * walks down the index levels, recording in @frame_in[] the buffer, entry
 * array and chosen entry for each level.  At every level the chosen entry
 * is the last one whose hash is not greater than the target hash.
 *
 * The target hash is taken from the hash info after the root's hash version
 * and the superblock seed have been filled in.  If @fname is non-NULL, its
 * embedded hash info is used instead of @hinfo, and the hash is computed
 * from the name when the name is available.
 *
 * On success returns the frame of the deepest index level; the referenced
 * entry names the leaf block to search.
 *
 * On failure every buffer acquired so far is released and an ERR_PTR is
 * returned: either the error from reading a block, or ERR_BAD_DX_DIR.
 *
 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
 * error in the directory index, and the caller should fall back to
 * searching the directory normally.  The callers of dx_probe **MUST**
 * check for this error code, and make sure it never gets reflected
 * back to userspace.
 */
static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
         struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
        unsigned count, indirect, level, i;
        struct dx_entry *at, *entries, *p, *q, *m;
        struct dx_root *root;
        struct dx_frame *frame = frame_in;
        struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
        u32 hash;
        ext4_lblk_t block;
        ext4_lblk_t blocks[EXT4_HTREE_LEVEL];

        memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
        frame->bh = ext4_read_dirblock(dir, 0, INDEX);
        if (IS_ERR(frame->bh))
                return (struct dx_frame *) frame->bh;

        root = (struct dx_root *) frame->bh->b_data;
        if (root->info.hash_version != DX_HASH_TEA &&
            root->info.hash_version != DX_HASH_HALF_MD4 &&
            root->info.hash_version != DX_HASH_LEGACY) {
                ext4_warning_inode(dir, "Unrecognised inode hash code %u",
                                   root->info.hash_version);
                goto fail;
        }
        if (fname)
                hinfo = &fname->hinfo;
        hinfo->hash_version = root->info.hash_version;
        if (hinfo->hash_version <= DX_HASH_TEA)
                hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        if (fname && fname_name(fname))
                ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
        hash = hinfo->hash;

        if (root->info.unused_flags & 1) {
                ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
                                   root->info.unused_flags);
                goto fail;
        }

        indirect = root->info.indirect_levels;
        if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
                /*
                 * The format is split across two literals; the first must
                 * end in a space or the words run together in the log.
                 */
                ext4_warning(dir->i_sb,
                             "Directory (ino: %lu) htree depth %#06x exceed "
                             "supported value", dir->i_ino,
                             ext4_dir_htree_level(dir->i_sb));
                if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
                        ext4_warning(dir->i_sb, "Enable large directory "
                                                "feature to access it");
                }
                goto fail;
        }

        /* The root's entry array starts right after its info structure. */
        entries = (struct dx_entry *)(((char *)&root->info) +
                                      root->info.info_length);

        if (dx_get_limit(entries) != dx_root_limit(dir,
                                                   root->info.info_length)) {
                ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
                                   dx_get_limit(entries),
                                   dx_root_limit(dir, root->info.info_length));
                goto fail;
        }

        dxtrace(printk("Look up %x", hash));
        level = 0;
        blocks[0] = 0;
        while (1) {
                count = dx_get_count(entries);
                if (!count || count > dx_get_limit(entries)) {
                        ext4_warning_inode(dir,
                                           "dx entry: count %u beyond limit %u",
                                           count, dx_get_limit(entries));
                        goto fail;
                }

                /*
                 * Binary search over entries[1..count-1] for the first
                 * entry whose hash is greater than the target; the entry
                 * just before it is the one to follow.  entries[0] is never
                 * compared, so it is the fallback when every hash is
                 * greater than the target.
                 */
                p = entries + 1;
                q = entries + count - 1;
                while (p <= q) {
                        m = p + (q - p) / 2;
                        dxtrace(printk(KERN_CONT "."));
                        if (dx_get_hash(m) > hash)
                                q = m - 1;
                        else
                                p = m + 1;
                }

                at = p - 1;
                dxtrace(printk(KERN_CONT " %x->%u\n",
                               at == entries ? 0 : dx_get_hash(at),
                               dx_get_block(at)));
                frame->entries = entries;
                frame->at = at;

                /*
                 * Refuse to descend into a block already visited on this
                 * path; that would mean the index contains a cycle.
                 */
                block = dx_get_block(at);
                for (i = 0; i <= level; i++) {
                        if (blocks[i] == block) {
                                ext4_warning_inode(dir,
                                        "dx entry: tree cycle block %u points back to block %u",
                                        blocks[level], block);
                                goto fail;
                        }
                }
                if (++level > indirect)
                        return frame;
                blocks[level] = block;
                frame++;
                frame->bh = ext4_read_dirblock(dir, block, INDEX);
                if (IS_ERR(frame->bh)) {
                        ret_err = (struct dx_frame *) frame->bh;
                        /* Keep the cleanup loop from releasing an ERR_PTR. */
                        frame->bh = NULL;
                        goto fail;
                }

                entries = ((struct dx_node *) frame->bh->b_data)->entries;

                if (dx_get_limit(entries) != dx_node_limit(dir)) {
                        ext4_warning_inode(dir,
                                "dx entry: limit %u != node limit %u",
                                dx_get_limit(entries), dx_node_limit(dir));
                        goto fail;
                }
        }
fail:
        /* Release every buffer from the current frame back to the root. */
        while (frame >= frame_in) {
                brelse(frame->bh);
                frame--;
        }

        if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
                ext4_warning_inode(dir,
                        "Corrupt directory, running e2fsck is recommended");
        return ret_err;
}

static void dx_release(struct dx_frame *frames)
{
        struct dx_root_info *info;
        int i;
        unsigned int indirect_levels;

        if (frames[0].bh == NULL)
                return;

        info = &((struct dx_root *)frames[0].bh->b_data)->info;
        /* save local copy, "info" may be freed after brelse() */
        indirect_levels = info->indirect_levels;
        for (i = 0; i <= indirect_levels; i++) {
                if (frames[i].bh == NULL)
                        break;
                brelse(frames[i].bh);
                frames[i].bh = NULL;
        }
}

/*
 * Advance an htree cursor to the next leaf block.
 *
 * @frame is the deepest frame of the cursor and @frames its root frame.
 * The entry pointer of @frame is incremented; when a level runs out of
 * entries the walk moves up one level and increments there, and so on.  If
 * even the root level is exhausted, 0 is returned.
 *
 * The @hash argument controls whether the search continues:
 *   - if @hash is even, the search continues only when the hash of the next
 *     entry, with its low bit cleared, equals @hash.  This is used when
 *     searching for a specific file;
 *   - if @hash is odd (e.g. HASH_NB_ALWAYS), the next block is always
 *     taken.  This is used for readdir handling.
 *
 * If @start_hash is non-NULL, it is filled in with the hash of the entry
 * the cursor moved to, including when 0 is returned because that hash did
 * not match.  It is left untouched when the root level is exhausted.
 *
 * Returns 1 if the caller should continue to search, 0 if it should not,
 * or a negative error code if one of the index blocks could not be read.
 */
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
                                 __u32 *start_hash)
{
        struct dx_frame *p = frame;
        struct buffer_head *bh;
        int num_frames = 0;
        __u32 bhash;

        /*
         * Climb towards the root until a level still has an entry left
         * after incrementing.  On exit, num_frames is the number of index
         * levels below p that have to be re-read.
         */
        for (;;) {
                p->at++;
                if (p->at < p->entries + dx_get_count(p->entries))
                        break;
                if (p == frames)
                        return 0;
                num_frames++;
                p--;
        }

        bhash = dx_get_hash(p->at);
        if (start_hash)
                *start_hash = bhash;

        /*
         * For an even hash, stop unless the next entry continues the same
         * hash value; there is no point reading further index blocks.  An
         * odd hash skips this check.
         */
        if ((hash & 1) == 0 && (bhash & ~1) != hash)
                return 0;

        /* Descend again, replacing the buffer of each lower level. */
        for (; num_frames > 0; num_frames--) {
                bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                p++;
                brelse(p->bh);
                p->bh = bh;
                p->entries = ((struct dx_node *) bh->b_data)->entries;
                p->at = p->entries;
        }
        return 1;
}


/*
 * Feed the entries of one directory block into the red-black tree kept for
 * @dir_file.
 *
 * Every entry of block @block is hashed; entries that sort before
 * (@start_hash, @start_minor_hash) or that are unused (inode 0) are skipped.
 * For encrypted directories the name is first converted with
 * fscrypt_fname_disk_to_usr().  An entry that fails ext4_check_dir_entry()
 * silently ends the scan of this block.
 *
 * Returns the number of entries stored, or a negative error code.
 */
static int htree_dirblock_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash)
{
        struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
        struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        int count = 0;
        int err = 0;

        dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
                                                        (unsigned long)block));
        bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        de = (struct ext4_dir_entry_2 *) bh->b_data;
        /* Stop once less than a minimal record remains in the block. */
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));

        /* Encrypted directories need key info and a conversion buffer. */
        if (IS_ENCRYPTED(dir)) {
                err = fscrypt_get_encryption_info(dir);
                if (err >= 0)
                        err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
                                                         &fname_crypto_str);
                if (err < 0) {
                        brelse(bh);
                        return err;
                }
        }

        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                bh->b_data, bh->b_size,
                                (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                         + ((char *)de - bh->b_data))) {
                        /* silently ignore the rest of the block */
                        break;
                }

                ext4fs_dirhash(dir, de->name, de->name_len, hinfo);

                /* Skip entries that sort before the requested start point. */
                if (hinfo->hash < start_hash)
                        continue;
                if (hinfo->hash == start_hash &&
                    hinfo->minor_hash < start_minor_hash)
                        continue;
                /* Skip unused slots. */
                if (de->inode == 0)
                        continue;

                if (IS_ENCRYPTED(dir)) {
                        int save_len = fname_crypto_str.len;
                        struct fscrypt_str de_name = FSTR_INIT(de->name,
                                                                de->name_len);

                        /* Directory is encrypted */
                        err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
                                        hinfo->minor_hash, &de_name,
                                        &fname_crypto_str);
                        if (!err) {
                                err = ext4_htree_store_dirent(dir_file,
                                           hinfo->hash, hinfo->minor_hash, de,
                                           &fname_crypto_str);
                                /* Restore the buffer length for the next entry. */
                                fname_crypto_str.len = save_len;
                        }
                } else {
                        tmp_str.name = de->name;
                        tmp_str.len = de->name_len;
                        err = ext4_htree_store_dirent(dir_file,
                                   hinfo->hash, hinfo->minor_hash, de,
                                   &tmp_str);
                }

                if (err != 0) {
                        count = err;
                        goto errout;
                }
                count++;
        }
errout:
        brelse(bh);
        fscrypt_fname_free_buffer(&fname_crypto_str);
        return count;
}


/*
 * This function fills a red-black tree with information from a
 * directory.  We start scanning the directory in hash order, starting
 * at start_hash and start_minor_hash.
 *
 * For a directory without the EXT4_INODE_INDEX flag, the inline data (if
 * any) or block 0 is loaded and *next_hash is set to ~0.  For an indexed
 * directory, leaf blocks are loaded one after another via the htree index,
 * and *next_hash receives the hash reported by ext4_htree_next_block()
 * for the block following the last one loaded.
 *
 * This function returns the number of entries inserted into the tree,
 * or a negative error code.
 */
int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                         __u32 start_minor_hash, __u32 *next_hash)
{
        struct dx_hash_info hinfo;
        struct ext4_dir_entry_2 *de;
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct inode *dir;
        ext4_lblk_t block;
        int count = 0;
        int ret, err;
        __u32 hashval;
        struct fscrypt_str tmp_str;

        dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
                       start_hash, start_minor_hash));
        dir = file_inode(dir_file);
        /* Non-indexed directory: hash with the superblock's default version. */
        if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
                hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
                if (hinfo.hash_version <= DX_HASH_TEA)
                        hinfo.hash_version +=
                                EXT4_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
                if (ext4_has_inline_data(dir)) {
                        int has_inline_data = 1;
                        count = ext4_inlinedir_to_tree(dir_file, dir, 0,
                                                       &hinfo, start_hash,
                                                       start_minor_hash,
                                                       &has_inline_data);
                        /*
                         * The helper may clear has_inline_data; in that case
                         * fall through and read block 0 instead.
                         */
                        if (has_inline_data) {
                                *next_hash = ~0;
                                return count;
                        }
                }
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
                *next_hash = ~0;
                return count;
        }
        /* Indexed directory: position the cursor on the leaf for start_hash. */
        hinfo.hash = start_hash;
        hinfo.minor_hash = 0;
        frame = dx_probe(NULL, dir, &hinfo, frames);
        if (IS_ERR(frame))
                return PTR_ERR(frame);

        /* Add '.' and '..' from the htree header */
        if (!start_hash && !start_minor_hash) {
                /* First entry of the root block, stored with hash 0. */
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
                tmp_str.name = de->name;
                tmp_str.len = de->name_len;
                err = ext4_htree_store_dirent(dir_file, 0, 0,
                                              de, &tmp_str);
                if (err != 0)
                        goto errout;
                count++;
        }
        if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
                /* Second entry of the root block, stored with hash 2. */
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
                de = ext4_next_entry(de, dir->i_sb->s_blocksize);
                tmp_str.name = de->name;
                tmp_str.len = de->name_len;
                err = ext4_htree_store_dirent(dir_file, 2, 0,
                                              de, &tmp_str);
                if (err != 0)
                        goto errout;
                count++;
        }

        while (1) {
                /* Let a fatal signal interrupt a long directory scan. */
                if (fatal_signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto errout;
                }
                cond_resched();
                block = dx_get_block(frame->at);
                ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
                                             start_hash, start_minor_hash);
                if (ret < 0) {
                        err = ret;
                        goto errout;
                }
                count += ret;
                /* Default when ext4_htree_next_block() does not set it. */
                hashval = ~0;
                ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
                                            frame, frames, &hashval);
                /* Note: stored before the error check, so also on failure. */
                *next_hash = hashval;
                if (ret < 0) {
                        err = ret;
                        goto errout;
                }
                /*
                 * Stop if:  (a) there are no more entries, or
                 * (b) we have inserted at least one entry and the
                 * next hash value is not a continuation
                 */
                if ((ret == 0) ||
                    (count && ((hashval & 1) == 0)))
                        break;
        }
        dx_release(frames);
        dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
                       "next hash: %x\n", count, *next_hash));
        return count;
errout:
        /* Error path: drop the index buffers and report err, not count. */
        dx_release(frames);
        return (err);
}

/*
 * Search the data of one directory block buffer for @fname.
 *
 * Convenience wrapper: hands bh->b_data and the filesystem block size to
 * ext4_search_dir() and passes its return value (and *res_dir) through.
 */
static inline int search_dirblock(struct buffer_head *bh,
                                  struct inode *dir,
                                  struct ext4_filename *fname,
                                  unsigned int offset,
                                  struct ext4_dir_entry_2 **res_dir)
{
        char *block_data = bh->b_data;

        return ext4_search_dir(bh, block_data, dir->i_sb->s_blocksize, dir,
                               fname, offset, res_dir);
}

/*
 * Directory block splitting, compacting
 */

/*
 * Build a map of (hash, offset, size) for every in-use entry of the
 * directory block in @bh.
 *
 * Map entries are written downward from @map_tail: the first entry found
 * lands in map_tail[-1], the next in map_tail[-2], and so on.  Offsets are
 * stored in units of four bytes.  When the filesystem has metadata
 * checksums, the space of the trailing struct ext4_dir_entry_tail is
 * excluded from the walk.
 *
 * Returns the number of entries mapped, or -EFSCORRUPTED if an entry fails
 * ext4_check_dir_entry().
 */
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
                       struct dx_hash_info *hinfo,
                       struct dx_map_entry *map_tail)
{
        char *base = bh->b_data;
        unsigned int buflen = bh->b_size;
        struct dx_hash_info h = *hinfo;
        int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
        struct ext4_dir_entry_2 *de;
        int count = 0;

        if (ext4_has_metadata_csum(dir->i_sb))
                buflen -= sizeof(struct ext4_dir_entry_tail);

        for (de = (struct ext4_dir_entry_2 *)base;
             (char *) de < base + buflen;
             de = ext4_next_entry(de, blocksize)) {
                if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
                                         ((char *)de) - base))
                        return -EFSCORRUPTED;

                /* Entries without a name or without an inode are not mapped */
                if (!de->name_len || !de->inode)
                        continue;

                ext4fs_dirhash(dir, de->name, de->name_len, &h);
                map_tail--;
                map_tail->hash = h.hash;
                map_tail->offs = ((char *) de - base)>>2;
                map_tail->size = ext4_rec_len_from_disk(de->rec_len,
                                                        blocksize);
                count++;
                cond_resched();
        }
        return count;
}

/*
 * Sort map by hash value
 *
 * Sorts the @count entries starting at @map into ascending ->hash order,
 * in place.  Two phases: a comb sort with a shrinking gap, then plain
 * bubble-sort passes until a pass performs no swap.
 */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
        struct dx_map_entry *p, *q, *top = map + count - 1;
        int more;
        /* Combsort until bubble sort doesn't suck */
        /* From here on 'count' is reused as the comb gap, not the length */
        while (count > 2) {
                /* Shrink the gap by a factor of 10/13 each pass */
                count = count*10/13;
                /*
                 * 'count' is unsigned, so this test is true only for a gap
                 * of 9 or 10 (smaller values wrap to a huge number); those
                 * gaps are replaced by 11.
                 */
                if (count - 9 < 2) /* 9, 10 -> 11 */
                        count = 11;
                /* Walk down from the top, comparing elements 'count' apart */
                for (p = top, q = p - count; q >= map; p--, q--)
                        if (p->hash < q->hash)
                                swap(*p, *q);
        }
        /* Garden variety bubble sort */
        do {
                more = 0;
                q = top;
                /* Compare each adjacent pair (q[0], q[1]) from the top down */
                while (q-- > map) {
                        if (q[1].hash >= q[0].hash)
                                continue;
                        swap(*(q+1), *q);
                        more = 1;        /* something moved: do another pass */
                }
        } while(more);
}

/*
 * Insert a new index entry mapping @hash to @block directly after
 * frame->at, shifting the following entries up by one slot and bumping
 * the stored entry count.  The asserts require a free slot and that
 * frame->at lies inside the currently used entries.
 */
static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
{
        struct dx_entry *entries = frame->entries;
        struct dx_entry *old = frame->at;
        struct dx_entry *slot = old + 1;
        struct dx_entry *used_end;
        int count = dx_get_count(entries);

        assert(count < dx_get_limit(entries));
        assert(old < entries + count);

        /* Open a hole at 'slot' by moving everything from there to the end */
        used_end = entries + count;
        memmove(slot + 1, slot, (char *)used_end - (char *)slot);

        dx_set_hash(slot, hash);
        dx_set_block(slot, block);
        dx_set_count(entries, count + 1);
}

#ifdef CONFIG_UNICODE
/*
 * Compare a looked-up name against a directory entry name in a
 * case-insensitive directory.  When @quick is true the caller asserts that
 * @name is already casefolded, and the folded comparison is used.
 *
 * If the unicode comparison reports an invalid sequence (negative result),
 * a strict-encoding filesystem gets -EINVAL; otherwise the names are
 * compared as opaque byte strings.
 *
 * Returns: 0 if the directory entry matches, more than 0 if it
 * doesn't match or less than zero on error.
 */
int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
                    const struct qstr *entry, bool quick)
{
        const struct super_block *sb = parent->i_sb;
        const struct unicode_map *um = sb->s_encoding;
        int ret = quick ? utf8_strncasecmp_folded(um, name, entry)
                        : utf8_strncasecmp(um, name, entry);

        if (ret >= 0)
                return ret;

        /* Invalid character sequence: reject, or fall back to raw bytes */
        if (sb_has_strict_encoding(sb))
                return -EINVAL;

        if (name->len != entry->len)
                return 1;

        return memcmp(name->name, entry->name, name->len) != 0;
}

/*
 * Prepare the casefolded form of @iname for lookups in @dir.
 *
 * On success cf_name->name points to a kmalloc'ed buffer of EXT4_NAME_LEN
 * bytes holding the folded name and cf_name->len is its length.  In every
 * other case (directory not casefolded, no encoding, allocation failure,
 * or utf8_casefold() returning <= 0) cf_name->name is left NULL and
 * cf_name->len is not written.
 */
void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
                                  struct fscrypt_str *cf_name)
{
        int len;

        if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding)
                goto no_cf_name;

        cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
        if (!cf_name->name)
                return;

        len = utf8_casefold(dir->i_sb->s_encoding,
                            iname, cf_name->name,
                            EXT4_NAME_LEN);
        if (len > 0) {
                cf_name->len = (unsigned) len;
                return;
        }

        /* Folding failed: drop the buffer and report "no folded name" */
        kfree(cf_name->name);
no_cf_name:
        cf_name->name = NULL;
}
#endif

/*
 * Decide whether the on-disk entry @de names the file described by @fname.
 *
 * An entry whose inode field is zero never matches.  In a casefolded
 * directory on a filesystem with an encoding, the comparison is done by
 * ext4_ci_compare(), using the pre-folded name in fname->cf_name when one
 * is present; any non-zero result from it (mismatch or error) counts as
 * "no match".  Otherwise fscrypt_match_name() performs the comparison.
 *
 * Return: %true if the directory entry matches, otherwise %false.
 */
static inline bool ext4_match(const struct inode *parent,
                              const struct ext4_filename *fname,
                              const struct ext4_dir_entry_2 *de)
{
        struct fscrypt_name f;
#ifdef CONFIG_UNICODE
        const struct qstr entry = {.name = de->name, .len = de->name_len};
#endif

        if (de->inode == 0)
                return false;

        /* Repackage the ext4 filename as the fscrypt view of it */
        f.usr_fname = fname->usr_fname;
        f.disk_name = fname->disk_name;
#ifdef CONFIG_FS_ENCRYPTION
        f.crypto_buf = fname->crypto_buf;
#endif

#ifdef CONFIG_UNICODE
        if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) {
                if (fname->cf_name.name == NULL) {
                        /* No folded copy available: fold while comparing */
                        return ext4_ci_compare(parent, fname->usr_fname,
                                               &entry, false) == 0;
                } else {
                        struct qstr cf = {.name = fname->cf_name.name,
                                          .len = fname->cf_name.len};

                        return ext4_ci_compare(parent, &cf, &entry,
                                               true) == 0;
                }
        }
#endif

        return fscrypt_match_name(&f, de->name, de->name_len);
}

/*
 * Scan the directory entries in @search_buf (@buf_size bytes) for @fname.
 *
 * @offset is the running offset handed to ext4_check_dir_entry() for the
 * entry that matched; it is advanced by each entry's record length.
 *
 * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success
 * (with *res_dir pointing at the matching entry).
 */
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
                    struct inode *dir, struct ext4_filename *fname,
                    unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
        char *dlimit = search_buf + buf_size;
        struct ext4_dir_entry_2 *de;
        int de_len;

        for (de = (struct ext4_dir_entry_2 *)search_buf;
             (char *) de < dlimit - EXT4_BASE_DIR_LEN;
             de = (struct ext4_dir_entry_2 *) ((char *) de + de_len)) {
                /*
                 * This loop runs very often, so only a cheap bounds test
                 * on the name is done before matching; the full entry
                 * check is reserved for an actual match.
                 */
                if (de->name + de->name_len <= dlimit &&
                    ext4_match(dir, fname, de)) {
                        if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
                                                 buf_size, offset))
                                return -EFSCORRUPTED;
                        *res_dir = de;
                        return 1;
                }

                /* A non-positive record length would loop forever */
                de_len = ext4_rec_len_from_disk(de->rec_len,
                                                dir->i_sb->s_blocksize);
                if (de_len <= 0)
                        return -EFSCORRUPTED;
                offset += de_len;
        }
        return 0;
}

/*
 * Report whether logical block @block of @dir, whose first entry is @de,
 * is treated as an htree index block.
 *
 * Returns 1 when @dir is indexed and either @block is 0 or the first
 * entry has inode 0 and a record length equal to the whole block;
 * returns 0 otherwise.
 */
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
                               struct ext4_dir_entry *de)
{
        struct super_block *sb = dir->i_sb;

        if (!is_dx(dir))
                return 0;
        if (block == 0)
                return 1;

        return de->inode == 0 &&
               ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
                        sb->s_blocksize;
}

/*
 *        __ext4_find_entry()
 *
 * finds an entry in the specified directory with the wanted name. It
 * returns the cache buffer in which the entry was found, and the entry
 * itself (as a parameter - res_dir). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
 * The returned buffer_head has ->b_count elevated.  The caller is expected
 * to brelse() it when appropriate.
 *
 * Return value: the buffer_head holding the entry on success, NULL when
 * the name was not found (or is longer than EXT4_NAME_LEN), or an
 * ERR_PTR() on failure.  *res_dir is set to NULL on entry.
 *
 * @inlined, when non-NULL, is written only if @dir has inline data; it
 * receives the has_inline_data value left by ext4_find_inline_entry().
 */
static struct buffer_head *__ext4_find_entry(struct inode *dir,
                                             struct ext4_filename *fname,
                                             struct ext4_dir_entry_2 **res_dir,
                                             int *inlined)
{
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block;
        const u8 *name = fname->usr_fname->name;
        size_t ra_max = 0;        /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        size_t ra_ptr = 0;        /* Current index into readahead
                                   buffer */
        ext4_lblk_t  nblocks;
        int i, namelen, retval;

        *res_dir = NULL;
        sb = dir->i_sb;
        namelen = fname->usr_fname->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;

        if (ext4_has_inline_data(dir)) {
                int has_inline_data = 1;
                ret = ext4_find_inline_entry(dir, fname, res_dir,
                                             &has_inline_data);
                if (inlined)
                        *inlined = has_inline_data;
                /*
                 * If the helper left has_inline_data set, its result is
                 * final; otherwise continue with the block-based search.
                 */
                if (has_inline_data)
                        goto cleanup_and_exit;
        }

        if ((namelen <= 2) && (name[0] == '.') &&
            (name[1] == '.' || name[1] == '\0')) {
                /*
                 * "." or ".." will only be in the first block
                 * NFS may look up ".."; "." should be handled by the VFS
                 *
                 * NOTE(review): if the name is not found in block 0 and
                 * i_size covers more than one block, the "directory has
                 * grown" check after the loop resumes the scan at block 1.
                 */
                block = start = 0;
                nblocks = 1;
                goto restart;
        }
        if (is_dx(dir)) {
                ret = ext4_dx_find_entry(dir, fname, res_dir);
                /*
                 * On success, or if the error was file not found,
                 * return.  Otherwise, fall back to doing a search the
                 * old fashioned way.
                 *
                 * Only ERR_PTR(ERR_BAD_DX_DIR) triggers the fallback; any
                 * other result (buffer, NULL, other error) is returned.
                 */
                if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
                        goto cleanup_and_exit;
                dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
                               "falling back\n"));
                ret = NULL;
        }
        /* Linear scan: number of whole blocks covered by i_size */
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        if (!nblocks) {
                ret = NULL;
                goto cleanup_and_exit;
        }
        /*
         * Begin at the block where the previous successful lookup ended
         * (i_dir_start_lookup is updated below on a hit); the scan wraps
         * around to block 0 and stops when it reaches 'start' again.
         */
        start = EXT4_I(dir)->i_dir_start_lookup;
        if (start >= nblocks)
                start = 0;
        block = start;
restart:
        do {
                /*
                 * We deal with the read-ahead logic here.
                 */
                cond_resched();
                if (ra_ptr >= ra_max) {
                        /* Refill the readahead buffer */
                        ra_ptr = 0;
                        /*
                         * Batch size: up to 'start' once the scan has
                         * wrapped, else up to the end of the directory,
                         * capped at the size of bh_use[].
                         */
                        if (block < start)
                                ra_max = start - block;
                        else
                                ra_max = nblocks - block;
                        ra_max = min(ra_max, ARRAY_SIZE(bh_use));
                        retval = ext4_bread_batch(dir, block, ra_max,
                                                  false /* wait */, bh_use);
                        if (retval) {
                                ret = ERR_PTR(retval);
                                /* ra_max = 0 so cleanup releases nothing */
                                ra_max = 0;
                                goto cleanup_and_exit;
                        }
                }
                /* A NULL slot is skipped without searching that block */
                if ((bh = bh_use[ra_ptr++]) == NULL)
                        goto next;
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        EXT4_ERROR_INODE_ERR(dir, EIO,
                                             "reading directory lblock %lu",
                                             (unsigned long) block);
                        brelse(bh);
                        ret = ERR_PTR(-EIO);
                        goto cleanup_and_exit;
                }
                /*
                 * Verify the block checksum unless the buffer is already
                 * marked verified or the block is an htree index node.
                 */
                if (!buffer_verified(bh) &&
                    !is_dx_internal_node(dir, block,
                                         (struct ext4_dir_entry *)bh->b_data) &&
                    !ext4_dirblock_csum_verify(dir, bh)) {
                        EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
                                             "checksumming directory "
                                             "block %lu", (unsigned long)block);
                        brelse(bh);
                        ret = ERR_PTR(-EFSBADCRC);
                        goto cleanup_and_exit;
                }
                set_buffer_verified(bh);
                i = search_dirblock(bh, dir, fname,
                            block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
                if (i == 1) {
                        /* Found: remember the block and keep bh referenced */
                        EXT4_I(dir)->i_dir_start_lookup = block;
                        ret = bh;
                        goto cleanup_and_exit;
                } else {
                        brelse(bh);
                        if (i < 0) {
                                ret = ERR_PTR(i);
                                goto cleanup_and_exit;
                        }
                }
        next:
                /* Advance, wrapping to block 0 past the last block */
                if (++block >= nblocks)
                        block = 0;
        } while (block != start);

        /*
         * If the directory has grown while we were searching, then
         * search the last part of the directory before giving up.
         */
        block = nblocks;
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        if (block < nblocks) {
                start = 0;
                goto restart;
        }

cleanup_and_exit:
        /* Clean up the read-ahead blocks */
        /* Only the not-yet-consumed slots [ra_ptr, ra_max) are released */
        for (; ra_ptr < ra_max; ra_ptr++)
                brelse(bh_use[ra_ptr]);
        return ret;
}

/*
 * Look up @d_name in @dir.
 *
 * Builds an ext4_filename with ext4_fname_setup_filename(), runs
 * __ext4_find_entry() and frees the filename again.  A setup result of
 * -ENOENT is reported as "not found" (NULL); any other setup error is
 * returned as an ERR_PTR().  Otherwise the result of __ext4_find_entry()
 * is returned unchanged.
 */
static struct buffer_head *ext4_find_entry(struct inode *dir,
                                           const struct qstr *d_name,
                                           struct ext4_dir_entry_2 **res_dir,
                                           int *inlined)
{
        struct ext4_filename fname;
        struct buffer_head *found;
        int err = ext4_fname_setup_filename(dir, d_name, 1, &fname);

        if (err)
                return err == -ENOENT ? NULL : ERR_PTR(err);

        found = __ext4_find_entry(dir, &fname, res_dir, inlined);
        ext4_fname_free_filename(&fname);

        return found;
}

/*
 * Look up the name of @dentry in @dir.
 *
 * Same flow as a plain entry search, but the filename is prepared with
 * ext4_fname_prepare_lookup() and no "inlined" result is requested.
 * A preparation result of -ENOENT yields NULL; any other preparation
 * error is returned as an ERR_PTR().
 */
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
                                             struct dentry *dentry,
                                             struct ext4_dir_entry_2 **res_dir)
{
        struct ext4_filename fname;
        struct buffer_head *found;
        int err = ext4_fname_prepare_lookup(dir, dentry, &fname);

        if (err)
                return err == -ENOENT ? NULL : ERR_PTR(err);

        found = __ext4_find_entry(dir, &fname, res_dir, NULL);
        ext4_fname_free_filename(&fname);

        return found;
}

/*
 * Look up @fname in an indexed directory.
 *
 * Probes the index with dx_probe(), then searches the selected leaf block
 * and, for as long as ext4_htree_next_block() returns 1, each following
 * leaf.
 *
 * Returns the buffer holding the entry (with *res_dir set) on success,
 * NULL if the name was not found, the error pointer from dx_probe() or
 * ext4_read_dirblock() if either fails, ERR_PTR(ERR_BAD_DX_DIR) if a leaf
 * search reports an error, or ERR_PTR() of the ext4_htree_next_block()
 * error.
 */
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
                        struct ext4_filename *fname,
                        struct ext4_dir_entry_2 **res_dir)
{
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct super_block *sb = dir->i_sb;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;

#ifdef CONFIG_FS_ENCRYPTION
        *res_dir = NULL;
#endif
        frame = dx_probe(fname, dir, NULL, frames);
        if (IS_ERR(frame))
                return (struct buffer_head *) frame;

        for (;;) {
                block = dx_get_block(frame->at);
                bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
                if (IS_ERR(bh))
                        goto errout;

                retval = search_dirblock(bh, dir, fname,
                                         block << EXT4_BLOCK_SIZE_BITS(sb),
                                         res_dir);
                if (retval == 1)
                        goto success;        /* bh stays referenced */
                brelse(bh);
                if (retval < 0) {
                        bh = ERR_PTR(ERR_BAD_DX_DIR);
                        goto errout;
                }

                /* Check to see if we should continue to search */
                retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning_inode(dir,
                                "error %d reading directory index block",
                                retval);
                        bh = ERR_PTR(retval);
                        goto errout;
                }
                if (retval != 1) {
                        /* No further leaf to try: not found */
                        bh = NULL;
                        break;
                }
        }
errout:
        dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
        dx_release(frames);
        return bh;
}

/*
 * Resolve @dentry in directory @dir.
 *
 * Names longer than EXT4_NAME_LEN yield -ENAMETOOLONG.  When an entry is
 * found, its inode number is validated (must be a valid inode number and
 * must not be @dir itself) before the inode is fetched; a -ESTALE result
 * from ext4_iget() is reported as corruption.  For an encrypted @dir, a
 * directory or symlink child whose encryption context is not permitted
 * yields -EPERM.  The final result comes from d_splice_alias(), except
 * that with CONFIG_UNICODE a miss in a casefolded directory returns NULL
 * without instantiating the dentry.
 */
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        struct inode *inode = NULL;
        struct ext4_dir_entry_2 *de;
        struct buffer_head *bh;
        __u32 ino;

        if (dentry->d_name.len > EXT4_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        bh = ext4_lookup_entry(dir, dentry, &de);
        if (IS_ERR(bh))
                return ERR_CAST(bh);
        if (!bh)
                goto splice;

        /* Copy the inode number out before dropping the buffer */
        ino = le32_to_cpu(de->inode);
        brelse(bh);

        if (!ext4_valid_inum(dir->i_sb, ino)) {
                EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
                return ERR_PTR(-EFSCORRUPTED);
        }
        if (unlikely(ino == dir->i_ino)) {
                EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
                                 dentry);
                return ERR_PTR(-EFSCORRUPTED);
        }

        inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
        if (inode == ERR_PTR(-ESTALE)) {
                EXT4_ERROR_INODE(dir,
                                 "deleted inode referenced: %u",
                                 ino);
                return ERR_PTR(-EFSCORRUPTED);
        }
        if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
            (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
            !fscrypt_has_permitted_context(dir, inode)) {
                ext4_warning(inode->i_sb,
                             "Inconsistent encryption contexts: %lu/%lu",
                             dir->i_ino, inode->i_ino);
                iput(inode);
                return ERR_PTR(-EPERM);
        }

splice:
#ifdef CONFIG_UNICODE
        if (!inode && IS_CASEFOLDED(dir)) {
                /* Eventually we want to call d_add_ci(dentry, NULL)
                 * for negative dentries in the encoding case as
                 * well.  For now, prevent the negative dentry
                 * from being cached.
                 */
                return NULL;
        }
#endif
        return d_splice_alias(inode, dentry);
}


/*
 * Return a dentry for the parent of @child by looking up ".." in the
 * child's inode.
 *
 * Lookup errors are passed through; a missing ".." entry yields -ENOENT
 * and an invalid inode number in it yields -EFSCORRUPTED.  Otherwise the
 * inode is fetched with ext4_iget() and handed to d_obtain_alias().
 */
struct dentry *ext4_get_parent(struct dentry *child)
{
        static const struct qstr dotdot = QSTR_INIT("..", 2);
        struct ext4_dir_entry_2 *de;
        struct buffer_head *bh;
        struct inode *parent;
        __u32 ino;

        bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
        if (IS_ERR(bh))
                return ERR_CAST(bh);
        if (bh == NULL)
                return ERR_PTR(-ENOENT);

        /* Copy the inode number out before dropping the buffer */
        ino = le32_to_cpu(de->inode);
        brelse(bh);

        if (!ext4_valid_inum(child->d_sb, ino)) {
                EXT4_ERROR_INODE(d_inode(child),
                                 "bad parent inode number: %u", ino);
                return ERR_PTR(-EFSCORRUPTED);
        }

        parent = ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL);
        return d_obtain_alias(parent);
}

/*
 * Copy the @count directory entries described by @map from the block at
 * @from into consecutive positions starting at @to.
 *
 * Each copy is trimmed to the minimal record length for its name, and the
 * source entry is marked unused by clearing its inode field.
 * Returns pointer to last entry moved.
 */
static struct ext4_dir_entry_2 *
dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
                unsigned blocksize)
{
        unsigned rec_len = 0;

        for (; count; count--, map++) {
                /* map->offs is stored in units of four bytes */
                struct ext4_dir_entry_2 *src = (struct ext4_dir_entry_2 *)
                                                (from + (map->offs<<2));
                struct ext4_dir_entry_2 *dst = (struct ext4_dir_entry_2 *) to;

                rec_len = EXT4_DIR_REC_LEN(src->name_len);
                memcpy (dst, src, rec_len);
                dst->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
                src->inode = 0;
                to += rec_len;
        }
        /* 'to' is one record past the last copy; step back onto it */
        return (struct ext4_dir_entry_2 *) (to - rec_len);
}

/*
 * Slide every in-use entry of the block at @base towards the start of
 * the block, giving each the minimal record length for its name.
 * Entries with a zero inode or zero name length are dropped.
 * Returns pointer to last entry in range (or @base if none was kept).
 */
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
        struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) base;
        struct ext4_dir_entry_2 *to = de, *prev = de, *next;
        char *end = base + blocksize;

        for (; (char*)de < end; de = next) {
                unsigned rec_len;

                /* Fetch the successor first: the move below may overwrite de */
                next = ext4_next_entry(de, blocksize);
                if (!de->inode || !de->name_len)
                        continue;

                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                if (de > to)
                        memmove(to, de, rec_len);
                to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
                prev = to;
                to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
        }
        return prev;
}

/*
 * Split a full leaf block to make room for a new dir entry.
 * Allocate a new block, and move entries so that they are approx. equally full.
 * Returns pointer to de in block into which the new entry will be inserted.
 *
 * @bh is in/out: on success *bh is the buffer that will receive the new
 * entry (it may have been swapped to the newly appended block).  On any
 * failure *bh is released and set to NULL and an error pointer is
 * returned.  @frame is the index frame that gets the new index entry and
 * @hinfo carries the hash of the name about to be inserted.
 */
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                        struct buffer_head **bh,struct dx_frame *frame,
                        struct dx_hash_info *hinfo)
{
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned continued;
        int count;
        struct buffer_head *bh2;
        ext4_lblk_t newblock;
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
        unsigned split, move, size;
        struct ext4_dir_entry_2 *de = NULL, *de2;
        int        csum_size = 0;
        int        err = 0, i;

        /* Reserve room for the checksum tail when checksums are enabled */
        if (ext4_has_metadata_csum(dir->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        /* Append the block that will receive the upper part of the entries */
        bh2 = ext4_append(handle, dir, &newblock);
        if (IS_ERR(bh2)) {
                brelse(*bh);
                *bh = NULL;
                return (struct ext4_dir_entry_2 *) bh2;
        }

        BUFFER_TRACE(*bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, *bh);
        if (err)
                goto journal_error;

        BUFFER_TRACE(frame->bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, frame->bh);
        if (err)
                goto journal_error;

        data2 = bh2->b_data;

        /* create map in the end of data2 block */
        /* dx_make_map() fills the map downward from this tail pointer */
        map = (struct dx_map_entry *) (data2 + blocksize);
        count = dx_make_map(dir, *bh, hinfo, map);
        if (count < 0) {
                err = count;
                goto journal_error;
        }
        /* Step back so that map[0..count-1] addresses the entries */
        map -= count;
        dx_sort_map(map, count);
        /* Ensure that neither split block is over half full */
        /*
         * Walk from the highest hash downward, accumulating entry sizes in
         * 'size' and counting in 'move' the entries that can go to the new
         * block before it would pass the half-block mark.
         */
        size = 0;
        move = 0;
        for (i = count-1; i >= 0; i--) {
                /* is more than half of this entry in 2nd half of the block? */
                if (size + map[i].size/2 > blocksize/2)
                        break;
                size += map[i].size;
                move++;
        }
        /*
         * map index at which we will split
         *
         * If the sum of active entries didn't exceed half the block size, just
         * split it in half by count; each resulting block will have at least
         * half the space free.
         */
        if (i >= 0)
                split = count - move;
        else
                split = count/2;

        /* hash2 is the lowest hash that moves to the new block */
        hash2 = map[split].hash;
        /*
         * 'continued' is 1 when the entry just below the split has the same
         * hash as hash2; it is added to the hash stored in the index below.
         */
        continued = split > 0 ? hash2 == map[split - 1].hash : 0;
        dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
                        (unsigned long)dx_get_block(frame->at),
                                        hash2, split, count-split));

        /* Fancy dance to stay within two buffers */
        /*
         * Move map[split..count-1] into data2, then compact what is left
         * in data1.  The last entry of each block gets its rec_len
         * stretched to the end of the usable area (block minus csum tail).
         */
        de2 = dx_move_dirents(data1, data2, map + split, count - split,
                              blocksize);
        de = dx_pack_dirents(data1, blocksize);
        de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
                                           (char *) de,
                                           blocksize);
        de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
                                            (char *) de2,
                                            blocksize);
        if (csum_size) {
                ext4_initialize_dirent_tail(*bh, blocksize);
                ext4_initialize_dirent_tail(bh2, blocksize);
        }

        dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
                        blocksize, 1));
        dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
                        blocksize, 1));

        /* Which block gets the new entry? */
        /*
         * If the new name hashes into the upper part, swap the buffers so
         * that *bh is the new block and bh2 is the old one; bh2 is then
         * the block that is dirtied and released here.
         */
        if (hinfo->hash >= hash2) {
                swap(*bh, bh2);
                de = de2;
        }
        dx_insert_block(frame, hash2 + continued, newblock);
        err = ext4_handle_dirty_dirblock(handle, dir, bh2);
        if (err)
                goto journal_error;
        err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        if (err)
                goto journal_error;
        brelse(bh2);
        dxtrace(dx_show_index("frame", frame->entries));
        return de;

journal_error:
        /* Drop both leaf buffers and tell the caller *bh is gone */
        brelse(*bh);
        brelse(bh2);
        *bh = NULL;
        ext4_std_error(dir->i_sb, err);
        return ERR_PTR(err);
}

/*
 * Find a slot in the directory data at @buf (@buf_size bytes) that can
 * hold a new entry for @fname.
 *
 * Walks the entries in order; a slot fits when an unused entry's record
 * length, or the slack behind an in-use entry's own name, is at least the
 * record length needed for @fname.  @inode is not referenced here.
 *
 * Returns 0 with *dest_de pointing at the chosen entry, -EEXIST if an
 * entry already matches @fname, -ENOSPC if no slot is large enough, or
 * -EFSCORRUPTED if an entry fails ext4_check_dir_entry().
 */
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                      struct buffer_head *bh,
                      void *buf, int buf_size,
                      struct ext4_filename *fname,
                      struct ext4_dir_entry_2 **dest_de)
{
        unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
        struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)buf;
        unsigned int offset = 0;
        /* Last position at which a record of 'reclen' bytes still fits */
        char *top = buf + buf_size - reclen;

        for (;;) {
                int nlen, rlen, avail;

                if ((char *) de > top)
                        return -ENOSPC;

                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                         buf, buf_size, offset))
                        return -EFSCORRUPTED;
                if (ext4_match(dir, fname, de))
                        return -EEXIST;

                nlen = EXT4_DIR_REC_LEN(de->name_len);
                rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
                avail = de->inode ? rlen - nlen : rlen;
                if (avail >= reclen)
                        break;

                de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
                offset += rlen;
        }

        *dest_de = de;
        return 0;
}

/*
 * Write a directory entry for @inode, named by @fname, into the slot @de.
 *
 * If @de is currently in use, it is shrunk to the minimal record length
 * for its own name and the new entry is placed in the space freed behind
 * it; otherwise @de itself is overwritten.  @buf_size is passed to the
 * rec_len conversion helpers.
 */
void ext4_insert_dentry(struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname)
{
        int nlen = EXT4_DIR_REC_LEN(de->name_len);
        int rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);

        if (de->inode) {
                /* Carve the new entry out of the tail of the existing one */
                struct ext4_dir_entry_2 *tail =
                        (struct ext4_dir_entry_2 *)((char *)de + nlen);

                tail->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
                de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
                de = tail;
        }

        de->file_type = EXT4_FT_UNKNOWN;
        de->inode = cpu_to_le32(inode->i_ino);
        ext4_set_de_type(inode->i_sb, de, inode->i_mode);
        de->name_len = fname_len(fname);
        memcpy(de->name, fname_name(fname), fname_len(fname));
}

/*
 * Add a new entry into a directory (leaf) block.
 *
 * If @de is non-NULL it points to a directory entry that the caller
 * guarantees is large enough for the new entry.  If @de is NULL, the
 * block is searched for space with ext4_find_dest_de(), whose error
 * (-ENOSPC when nothing fits, -EEXIST when the name is already present,
 * -EFSCORRUPTED on a bad entry) is returned as is.
 *
 * After the entry is written, the directory's mtime/ctime and iversion
 * are updated and both the inode and the block are marked dirty.  An
 * error from dirtying the block takes precedence over one from dirtying
 * the inode.
 */
static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir,
                             struct inode *inode, struct ext4_dir_entry_2 *de,
                             struct buffer_head *bh)
{
        unsigned int        blocksize = dir->i_sb->s_blocksize;
        int                csum_size = 0;
        int                err, inode_err;

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        if (de == NULL) {
                /* No slot supplied: look for one, excluding the csum tail */
                err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
                                        blocksize - csum_size, fname, &de);
                if (err)
                        return err;
        }

        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, bh);
        if (err) {
                ext4_std_error(dir->i_sb, err);
                return err;
        }

        /* By now the buffer is marked for journaling */
        ext4_insert_dentry(inode, de, blocksize, fname);

        /*
         * XXX shouldn't update any times until successful
         * completion of syscall, but too many callers depend
         * on this.
         *
         * XXX similarly, too many callers depend on
         * ext4_new_inode() setting the times, but error
         * recovery deletes the inode, so the worst that can
         * happen is that the times are slightly out of date
         * and/or different from the directory change time.
         */
        dir->i_mtime = dir->i_ctime = current_time(dir);
        ext4_update_dx_flag(dir);
        inode_inc_iversion(dir);
        inode_err = ext4_mark_inode_dirty(handle, dir);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_dirblock(handle, dir, bh);
        if (err) {
                ext4_std_error(dir->i_sb, err);
                return err;
        }
        return inode_err;
}

static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
{
        struct fake_dirent *fde;
        const char *error_msg;
        unsigned int rlen;
        unsigned int blocksize = dir->i_sb->s_blocksize;
        char *blockend = (char *)root + dir->i_sb->s_blocksize;

        fde = &root->dot;
        if (unlikely(fde->name_len != 1)) {
                error_msg = "invalid name_len for '.'";
                goto corrupted;
        }
        if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
                error_msg = "invalid name for '.'";
                goto corrupted;
        }
        rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
        if (unlikely((char *)fde + rlen >= blockend)) {
                error_msg = "invalid rec_len for '.'";
                goto corrupted;
        }

        fde = &root->dotdot;
        if (unlikely(fde->name_len != 2)) {
                error_msg = "invalid name_len for '..'";
                goto corrupted;
        }
        if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
                error_msg = "invalid name for '..'";
                goto corrupted;
        }
        rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
        if (unlikely((char *)fde + rlen >= blockend)) {
                error_msg = "invalid rec_len for '..'";
                goto corrupted;
        }

        return true;

corrupted:
        EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
                         error_msg);
        return false;
}

/*
 * This converts a one block unindexed directory to a 3 block indexed
 * directory, and adds the dentry to the indexed directory.
 *
 * @handle: journal handle the modifications are made under
 * @fname:  name being added; its hinfo is (re)initialised here
 * @dir:    directory being converted
 * @inode:  inode the new entry will refer to
 * @bh:     buffer of the directory's block 0
 *
 * The reference on @bh is consumed on every path: it is dropped with
 * brelse() on the early error returns and afterwards handed to
 * frames[0], which is released through dx_release().
 *
 * Returns 0 on success or a negative error value.
 */
static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
                            struct inode *dir,
                            struct inode *inode, struct buffer_head *bh)
{
        struct buffer_head *bh2;
        struct dx_root        *root;
        struct dx_frame        frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries;
        struct ext4_dir_entry_2        *de, *de2;
        char                *data2, *top;
        unsigned        len;
        int                retval;
        unsigned        blocksize;
        ext4_lblk_t  block;
        struct fake_dirent *fde;
        int csum_size = 0;

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        blocksize =  dir->i_sb->s_blocksize;
        dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
        BUFFER_TRACE(bh, "get_write_access");
        retval = ext4_journal_get_write_access(handle, bh);
        if (retval) {
                ext4_std_error(dir->i_sb, retval);
                brelse(bh);
                return retval;
        }

        /* Refuse to convert a block whose '.'/'..' entries are bogus. */
        root = (struct dx_root *) bh->b_data;
        if (!ext4_check_dx_root(dir, root)) {
                brelse(bh);
                return -EFSCORRUPTED;
        }

        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
        de = (struct ext4_dir_entry_2 *)((char *)fde +
                ext4_rec_len_from_disk(fde->rec_len, blocksize));
        len = ((char *) root) + (blocksize - csum_size) - (char *) de;

        /* Allocate new block for the 0th block's dirents */
        bh2 = ext4_append(handle, dir, &block);
        if (IS_ERR(bh2)) {
                brelse(bh);
                return PTR_ERR(bh2);
        }
        ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
        data2 = bh2->b_data;

        memcpy(data2, de, len);
        de = (struct ext4_dir_entry_2 *) data2;
        top = data2 + len;
        while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
                /*
                 * The last argument is the offset of @de inside the
                 * buffer being checked, as for the other callers of
                 * ext4_check_dir_entry(), not the space left behind it.
                 */
                if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
                                         (char *) de - data2)) {
                        brelse(bh2);
                        brelse(bh);
                        return -EFSCORRUPTED;
                }
                de = de2;
        }
        /* Stretch the last copied entry to the end of the usable area. */
        de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
                                           (char *) de, blocksize);

        if (csum_size)
                ext4_initialize_dirent_tail(bh2, blocksize);

        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
                                           blocksize);
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
        entries = root->entries;
        dx_set_block(entries, 1);
        dx_set_count(entries, 1);
        dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));

        /* Initialize as for dx_probe */
        fname->hinfo.hash_version = root->info.hash_version;
        if (fname->hinfo.hash_version <= DX_HASH_TEA)
                fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);

        /* From here on @bh is owned by frames[0]. */
        memset(frames, 0, sizeof(frames));
        frame = frames;
        frame->entries = entries;
        frame->at = entries;
        frame->bh = bh;

        retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        if (retval)
                goto out_frames;
        retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
        if (retval)
                goto out_frames;

        de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
        if (IS_ERR(de)) {
                retval = PTR_ERR(de);
                goto out_frames;
        }

        retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
out_frames:
        /*
         * Even if the block split failed, we have to properly write
         * out all the changes we did so far. Otherwise we can end up
         * with corrupted filesystem.
         */
        if (retval)
                ext4_mark_inode_dirty(handle, dir);
        dx_release(frames);
        brelse(bh2);
        return retval;
}

/*
 *        ext4_add_entry()
 *
 * Adds a directory entry for @dentry, referring to @inode, to the parent
 * directory of @dentry.
 *
 * Returns 0 on success or a negative error value.  On success the
 * EXT4_STATE_NEWENTRY state is set on @inode.
 *
 * The insertion is attempted, in order: into inline data, through the
 * htree index, into each existing directory block, and finally into a
 * newly appended block.  A single-block directory that is full is
 * converted to an indexed one when the dir_index feature is enabled.
 *
 * NOTE(review): the historical comment here warned that the inode part of
 * 'de' is left at 0, so the caller may not sleep before filling it in.
 * That cannot be confirmed from this function alone - verify against
 * ext4_insert_dentry() before relying on it.
 */
static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                          struct inode *inode)
{
        struct inode *dir = d_inode(dentry->d_parent);
        struct buffer_head *bh = NULL;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
        struct ext4_filename fname;
        int        retval;
        int        dx_fallback=0;
        unsigned blocksize;
        ext4_lblk_t block, blocks;
        int        csum_size = 0;

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        sb = dir->i_sb;
        blocksize = sb->s_blocksize;
        /* An empty name can never be added. */
        if (!dentry->d_name.len)
                return -EINVAL;

        if (fscrypt_is_nokey_name(dentry))
                return -ENOKEY;

#ifdef CONFIG_UNICODE
        /* Strict-encoding casefolded directories reject invalid UTF-8. */
        if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
            sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
                return -EINVAL;
#endif

        /* From here on every exit must go through "out" to free fname. */
        retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
        if (retval)
                return retval;

        if (ext4_has_inline_data(dir)) {
                retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
                if (retval < 0)
                        goto out;
                /* A return of 1 is treated as "entry added inline". */
                if (retval == 1) {
                        retval = 0;
                        goto out;
                }
        }

        if (is_dx(dir)) {
                retval = ext4_dx_add_entry(handle, &fname, dir, inode);
                /* Only ERR_BAD_DX_DIR falls through to the linear path. */
                if (!retval || (retval != ERR_BAD_DX_DIR))
                        goto out;
                /* Can we just ignore htree data? */
                if (ext4_has_metadata_csum(sb)) {
                        EXT4_ERROR_INODE(dir,
                                "Directory has corrupted htree index.");
                        retval = -EFSCORRUPTED;
                        goto out;
                }
                /* Drop the index flag and remember not to re-index below. */
                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
                dx_fallback++;
                retval = ext4_mark_inode_dirty(handle, dir);
                if (unlikely(retval))
                        goto out;
        }
        /* Linear scan: try every existing block of the directory. */
        blocks = dir->i_size >> sb->s_blocksize_bits;
        for (block = 0; block < blocks; block++) {
                bh = ext4_read_dirblock(dir, block, DIRENT);
                /* NULL (not an error pointer): get the block via ext4_bread(). */
                if (bh == NULL) {
                        bh = ext4_bread(handle, dir, block,
                                        EXT4_GET_BLOCKS_CREATE);
                        goto add_to_new_block;
                }
                if (IS_ERR(bh)) {
                        retval = PTR_ERR(bh);
                        bh = NULL;
                        goto out;
                }
                retval = add_dirent_to_buf(handle, &fname, dir, inode,
                                           NULL, bh);
                /* Anything but "block full" (success included) ends the scan. */
                if (retval != -ENOSPC)
                        goto out;

                /* A full one-block directory gets converted to an htree. */
                if (blocks == 1 && !dx_fallback &&
                    ext4_has_feature_dir_index(sb)) {
                        retval = make_indexed_dir(handle, &fname, dir,
                                                  inode, bh);
                        bh = NULL; /* make_indexed_dir releases bh */
                        goto out;
                }
                brelse(bh);
        }
        /* Every existing block is full: grow the directory by one block. */
        bh = ext4_append(handle, dir, &block);
add_to_new_block:
        if (IS_ERR(bh)) {
                retval = PTR_ERR(bh);
                bh = NULL;
                goto out;
        }
        /* Format the block as one empty entry spanning the usable area. */
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
        de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);

        if (csum_size)
                ext4_initialize_dirent_tail(bh, blocksize);

        retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
out:
        ext4_fname_free_filename(&fname);
        brelse(bh);
        if (retval == 0)
                ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
        return retval;
}

/*
 * Add an entry for @fname/@inode to the htree-indexed directory @dir.
 *
 * The leaf block selected by dx_probe() is tried first.  If it is full it
 * is split with do_split(); if the index node pointing at it is full as
 * well, that node is split (or a new index level is added) first and the
 * lookup is restarted from dx_probe().
 *
 * Returns 0 for success, or a negative error value
 */
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir, struct inode *inode)
{
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries, *at;
        struct buffer_head *bh;
        struct super_block *sb = dir->i_sb;
        struct ext4_dir_entry_2 *de;
        int restart;
        int err;

again:
        restart = 0;
        frame = dx_probe(fname, dir, NULL, frames);
        if (IS_ERR(frame))
                return PTR_ERR(frame);
        entries = frame->entries;
        at = frame->at;
        bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE);
        if (IS_ERR(bh)) {
                err = PTR_ERR(bh);
                bh = NULL;
                goto cleanup;
        }

        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, bh);
        if (err)
                goto journal_error;

        /* Anything but "leaf is full" (success included) ends here. */
        err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
        if (err != -ENOSPC)
                goto cleanup;

        err = 0;
        /* Block full, should compress but for now just split */
        dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
                       dx_get_count(entries), dx_get_limit(entries)));
        /* Need to split index? */
        if (dx_get_count(entries) == dx_get_limit(entries)) {
                ext4_lblk_t newblock;
                int levels = frame - frames + 1;
                unsigned int icount;
                int add_level = 1;
                struct dx_entry *entries2;
                struct dx_node *node2;
                struct buffer_head *bh2;

                /* Walk up to the lowest index node whose parent has room. */
                while (frame > frames) {
                        if (dx_get_count((frame - 1)->entries) <
                            dx_get_limit((frame - 1)->entries)) {
                                add_level = 0;
                                break;
                        }
                        frame--; /* split higher index block */
                        at = frame->at;
                        entries = frame->entries;
                        restart = 1;
                }
                if (add_level && levels == ext4_dir_htree_level(sb)) {
                        ext4_warning(sb, "Directory (ino: %lu) index full, "
                                         "reach max htree level :%d",
                                         dir->i_ino, levels);
                        if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
                                ext4_warning(sb, "Large directory feature is "
                                                 "not enabled on this "
                                                 "filesystem");
                        }
                        err = -ENOSPC;
                        goto cleanup;
                }
                icount = dx_get_count(entries);
                bh2 = ext4_append(handle, dir, &newblock);
                if (IS_ERR(bh2)) {
                        err = PTR_ERR(bh2);
                        goto cleanup;
                }
                /*
                 * From here until the brelse(bh2) calls below, bh2 is a
                 * reference held only by this scope: it is not recorded
                 * in frames[], so every error exit must drop it itself.
                 */
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
                memset(&node2->fake, 0, sizeof(struct fake_dirent));
                node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
                                                           sb->s_blocksize);
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, frame->bh);
                if (err) {
                        brelse(bh2);
                        goto journal_error;
                }
                if (!add_level) {
                        unsigned icount1 = icount/2, icount2 = icount - icount1;
                        unsigned hash2 = dx_get_hash(entries + icount1);
                        dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
                                       icount1, icount2));

                        /* The parent index node gains an entry below. */
                        BUFFER_TRACE((frame - 1)->bh, "get_write_access");
                        err = ext4_journal_get_write_access(handle,
                                                             (frame - 1)->bh);
                        if (err) {
                                brelse(bh2);
                                goto journal_error;
                        }

                        /* Move the upper half of the entries to the new node. */
                        memcpy((char *) entries2, (char *) (entries + icount1),
                               icount2 * sizeof(struct dx_entry));
                        dx_set_count(entries, icount1);
                        dx_set_count(entries2, icount2);
                        dx_set_limit(entries2, dx_node_limit(dir));

                        /* Which index block gets the new entry? */
                        if (at - entries >= icount1) {
                                frame->at = at = at - entries - icount1 + entries2;
                                frame->entries = entries = entries2;
                                /* bh2 now refers to the old index block. */
                                swap(frame->bh, bh2);
                        }
                        dx_insert_block((frame - 1), hash2, newblock);
                        dxtrace(dx_show_index("node", frame->entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
                        err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                        if (err) {
                                brelse(bh2);
                                goto journal_error;
                        }
                        brelse (bh2);
                        err = ext4_handle_dirty_dx_node(handle, dir,
                                                   (frame - 1)->bh);
                        if (err)
                                goto journal_error;
                        err = ext4_handle_dirty_dx_node(handle, dir,
                                                        frame->bh);
                        if (restart || err)
                                goto journal_error;
                } else {
                        struct dx_root *dxroot;
                        /* New level: all entries move to the new node. */
                        memcpy((char *) entries2, (char *) entries,
                               icount * sizeof(struct dx_entry));
                        dx_set_limit(entries2, dx_node_limit(dir));

                        /* Set up root */
                        dx_set_count(entries, 1);
                        dx_set_block(entries + 0, newblock);
                        dxroot = (struct dx_root *)frames[0].bh->b_data;
                        dxroot->info.indirect_levels += 1;
                        dxtrace(printk(KERN_DEBUG
                                       "Creating %d level index...\n",
                                       dxroot->info.indirect_levels));
                        err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
                        if (err) {
                                brelse(bh2);
                                goto journal_error;
                        }
                        err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                        brelse(bh2);
                        restart = 1;
                        goto journal_error;
                }
        }
        de = do_split(handle, dir, &bh, frame, &fname->hinfo);
        if (IS_ERR(de)) {
                err = PTR_ERR(de);
                goto cleanup;
        }
        err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
        goto cleanup;

journal_error:
        ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
        brelse(bh);
        dx_release(frames);
        /* @restart is true means htree-path has been changed, we need to
         * repeat dx_probe() to find out valid htree-path
         */
        if (restart && err == 0)
                goto again;
        return err;
}

/*
 * ext4_generic_delete_entry removes @de_del from the dirent area that
 * starts at @entry_buf and is @buf_size bytes long (the last @csum_size
 * bytes are not scanned).
 *
 * If the entry has a predecessor in the buffer, the predecessor's rec_len
 * is grown to swallow it; otherwise the entry is only marked unused by
 * zeroing its inode number.
 *
 * Returns 0 on success, -EFSCORRUPTED if an entry fails
 * ext4_check_dir_entry(), or -ENOENT if @de_del is not found.
 */
int ext4_generic_delete_entry(struct inode *dir,
                              struct ext4_dir_entry_2 *de_del,
                              struct buffer_head *bh,
                              void *entry_buf,
                              int buf_size,
                              int csum_size)
{
        const unsigned int bsize = dir->i_sb->s_blocksize;
        struct ext4_dir_entry_2 *cur = (struct ext4_dir_entry_2 *)entry_buf;
        struct ext4_dir_entry_2 *prev = NULL;
        int off = 0;

        while (off < buf_size - csum_size) {
                if (ext4_check_dir_entry(dir, NULL, cur, bh,
                                         entry_buf, buf_size, off))
                        return -EFSCORRUPTED;

                if (cur != de_del) {
                        /* Not the victim: step to the next entry. */
                        off += ext4_rec_len_from_disk(cur->rec_len, bsize);
                        prev = cur;
                        cur = ext4_next_entry(cur, bsize);
                        continue;
                }

                if (prev) {
                        unsigned int merged;

                        merged = ext4_rec_len_from_disk(prev->rec_len, bsize) +
                                 ext4_rec_len_from_disk(cur->rec_len, bsize);
                        prev->rec_len = ext4_rec_len_to_disk(merged, bsize);
                } else {
                        /* First entry in the buffer: nothing to merge into. */
                        cur->inode = 0;
                }
                inode_inc_iversion(dir);
                return 0;
        }

        return -ENOENT;
}

/*
 * Remove the entry @de_del, located in @bh, from directory @dir under
 * journal handle @handle.
 *
 * Inline-data directories are handed to ext4_delete_inline_entry(); its
 * result is returned unless it reports that the directory no longer has
 * inline data.  Returns 0 on success or a negative error value; every
 * error except -ENOENT is also reported through ext4_std_error().
 */
static int ext4_delete_entry(handle_t *handle,
                             struct inode *dir,
                             struct ext4_dir_entry_2 *de_del,
                             struct buffer_head *bh)
{
        int tail_len = 0;
        int ret;

        if (ext4_has_inline_data(dir)) {
                int still_inline = 1;

                ret = ext4_delete_inline_entry(handle, dir, de_del, bh,
                                               &still_inline);
                if (still_inline)
                        return ret;
        }

        if (ext4_has_metadata_csum(dir->i_sb))
                tail_len = sizeof(struct ext4_dir_entry_tail);

        BUFFER_TRACE(bh, "get_write_access");
        ret = ext4_journal_get_write_access(handle, bh);
        if (!ret)
                ret = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
                                                dir->i_sb->s_blocksize,
                                                tail_len);
        if (!ret) {
                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                ret = ext4_handle_dirty_dirblock(handle, dir, bh);
        }
        if (!ret)
                return 0;

        /* A missing entry is an expected outcome, not a filesystem error. */
        if (ret != -ENOENT)
                ext4_std_error(dir->i_sb, ret);
        return ret;
}

/*
 * Bump the link count of @inode.  For an indexed directory the count is
 * then pinned to 1 if it either exceeded EXT4_LINK_MAX or just became 2
 * (meaning it was 1 before, i.e. it was already pinned), so that the
 * 16-bit on-disk i_links_count field cannot overflow.  A directory with
 * i_nlink == 1 means subdirectory links are not being counted accurately.
 *
 * The caller has already checked for i_nlink overflow in case the DIR_LINK
 * feature is not enabled and returned -EMLINK.  The is_dx() check is a proxy
 * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
 * on regular files) and to avoid creating huge/slow non-HTREE directories.
 */
static void ext4_inc_count(struct inode *inode)
{
        inc_nlink(inode);

        if (!is_dx(inode))
                return;

        if (inode->i_nlink == 2 || inode->i_nlink > EXT4_LINK_MAX)
                set_nlink(inode, 1);
}

/*
 * Drop one link from @inode, except for a directory whose link count is
 * already 2 or less: a directory at nlink == 1 has more than
 * EXT4_LINK_MAX subdirectories and must stay at 1.
 */
static void ext4_dec_count(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode) && inode->i_nlink <= 2)
                return;

        drop_nlink(inode);
}


/*
 * Add a non-directory inode to a directory.  On success the inode
 * reference is consumed by the dentry instantiation, which is signalled to
 * the caller by setting *inodep to NULL.  On failure the link count is
 * dropped, the inode is put on the orphan list and unlocked, and the
 * caller remains responsible for dropping the inode reference in a safe
 * context.
 */
static int ext4_add_nondir(handle_t *handle,
                struct dentry *dentry, struct inode **inodep)
{
        struct inode *parent = d_inode(dentry->d_parent);
        struct inode *child = *inodep;
        int ret;

        ret = ext4_add_entry(handle, dentry, child);
        if (ret) {
                drop_nlink(child);
                ext4_orphan_add(handle, child);
                unlock_new_inode(child);
                return ret;
        }

        /* The dentry is instantiated even if marking the inode dirty fails. */
        ret = ext4_mark_inode_dirty(handle, child);
        if (IS_DIRSYNC(parent))
                ext4_handle_sync(handle);
        d_instantiate_new(dentry, child);
        *inodep = NULL;
        return ret;
}

/*
 * Create a regular file named by @dentry in @dir.
 *
 * By the time this is called the directory cache entry for the new file
 * already exists, but it is still negative - it has no inode.  If the
 * create succeeds the dentry is instantiated (via ext4_add_nondir()).
 *
 * The whole attempt is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() allows another try.  @excl is not used here.
 */
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                       bool excl)
{
        handle_t *handle;
        struct inode *inode;
        int nblocks, ret, retries = 0;

        ret = dquot_initialize(dir);
        if (ret)
                return ret;

        nblocks = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

        do {
                inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name,
                                                    0, NULL, EXT4_HT_DIR,
                                                    nblocks);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
                } else {
                        inode->i_op = &ext4_file_inode_operations;
                        inode->i_fop = &ext4_file_operations;
                        ext4_set_aops(inode);
                        ret = ext4_add_nondir(handle, dentry, &inode);
                        if (!ret)
                                ext4_fc_track_create(handle, dentry);
                }

                if (handle)
                        ext4_journal_stop(handle);
                /* inode is NULL here once the dentry took the reference. */
                if (!IS_ERR_OR_NULL(inode))
                        iput(inode);
        } while (ret == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return ret;
}

/*
 * Create a special file (set up through init_special_inode() with @mode
 * and @rdev) named by @dentry in @dir.
 *
 * The whole attempt is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() allows another try.  Returns 0 on success or
 * a negative error value.
 */
static int ext4_mknod(struct inode *dir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
{
        handle_t *handle;
        struct inode *inode;
        int nblocks, ret, retries = 0;

        ret = dquot_initialize(dir);
        if (ret)
                return ret;

        nblocks = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

        do {
                inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name,
                                                    0, NULL, EXT4_HT_DIR,
                                                    nblocks);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
                } else {
                        init_special_inode(inode, inode->i_mode, rdev);
                        inode->i_op = &ext4_special_inode_operations;
                        ret = ext4_add_nondir(handle, dentry, &inode);
                        if (!ret)
                                ext4_fc_track_create(handle, dentry);
                }

                if (handle)
                        ext4_journal_stop(handle);
                /* inode is NULL here once the dentry took the reference. */
                if (!IS_ERR_OR_NULL(inode))
                        iput(inode);
        } while (ret == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return ret;
}

/*
 * Create an unnamed regular file for @dentry: the new inode is attached
 * with d_tmpfile() and put on the orphan list rather than being entered
 * into @dir.
 *
 * Allocation is retried while it fails with -ENOSPC and
 * ext4_should_retry_alloc() allows it.  A failure of ext4_orphan_add()
 * is returned immediately, without a retry.
 */
static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        handle_t *handle;
        struct inode *inode;
        int ret, retries = 0;

        ret = dquot_initialize(dir);
        if (ret)
                return ret;

        do {
                inode = ext4_new_inode_start_handle(dir, mode,
                                                    NULL, 0, NULL,
                                                    EXT4_HT_DIR,
                                EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) +
                                  4 + EXT4_XATTR_TRANS_BLOCKS);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
                } else {
                        inode->i_op = &ext4_file_inode_operations;
                        inode->i_fop = &ext4_file_operations;
                        ext4_set_aops(inode);
                        d_tmpfile(dentry, inode);
                        ret = ext4_orphan_add(handle, inode);
                        if (ret) {
                                /* Stop the handle first, then unlock. */
                                ext4_journal_stop(handle);
                                unlock_new_inode(inode);
                                return ret;
                        }
                        mark_inode_dirty(inode);
                        unlock_new_inode(inode);
                }

                if (handle)
                        ext4_journal_stop(handle);
        } while (ret == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return ret;
}

/*
 * Write the "." and ".." entries of directory @inode starting at @de.
 *
 * "." refers to @inode itself and gets the minimal record length for a
 * one-character name.  ".." refers to @parent_ino; when @dotdot_real_len
 * is zero its record is stretched to cover the rest of the block (less
 * @csum_size), otherwise it gets the minimal length for its name.
 *
 * Returns the position following the ".." record.
 */
struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                          struct ext4_dir_entry_2 *de,
                          int blocksize, int csum_size,
                          unsigned int parent_ino, int dotdot_real_len)
{
        struct ext4_dir_entry_2 *dotdot;

        /* "." */
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
        de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
                                           blocksize);
        strcpy(de->name, ".");
        ext4_set_de_type(inode->i_sb, de, S_IFDIR);

        /* ".." sits directly behind "." */
        dotdot = ext4_next_entry(de, blocksize);
        dotdot->inode = cpu_to_le32(parent_ino);
        dotdot->name_len = 2;
        if (dotdot_real_len)
                dotdot->rec_len = ext4_rec_len_to_disk(
                                EXT4_DIR_REC_LEN(dotdot->name_len), blocksize);
        else
                dotdot->rec_len = ext4_rec_len_to_disk(blocksize -
                                        (csum_size + EXT4_DIR_REC_LEN(1)),
                                        blocksize);
        strcpy(dotdot->name, "..");
        ext4_set_de_type(inode->i_sb, dotdot, S_IFDIR);

        return ext4_next_entry(dotdot, blocksize);
}

/*
 * Give the freshly created directory @inode (child of @dir) its initial
 * contents.
 *
 * When the inode may hold inline data, an inline directory is tried
 * first; only a result of -ENOSPC or a positive value from
 * ext4_try_create_inline_dir() falls through to allocating a block.
 * Otherwise the first block is appended, "." and ".." are written into
 * it, and the link count is set to 2.
 *
 * Returns 0 on success or a negative error value.
 */
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode)
{
        const unsigned int bsize = dir->i_sb->s_blocksize;
        struct buffer_head *blk;
        ext4_lblk_t lblk = 0;
        int tail_len = 0;
        int ret;

        if (ext4_has_metadata_csum(dir->i_sb))
                tail_len = sizeof(struct ext4_dir_entry_tail);

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_try_create_inline_dir(handle, dir, inode);
                /* Done on success and on any error other than -ENOSPC. */
                if (ret <= 0 && ret != -ENOSPC)
                        return ret;
        }

        inode->i_size = 0;
        blk = ext4_append(handle, inode, &lblk);
        if (IS_ERR(blk))
                return PTR_ERR(blk);

        ext4_init_dot_dotdot(inode, (struct ext4_dir_entry_2 *)blk->b_data,
                             bsize, tail_len, dir->i_ino, 0);
        set_nlink(inode, 2);
        if (tail_len)
                ext4_initialize_dirent_tail(blk, bsize);

        BUFFER_TRACE(blk, "call ext4_handle_dirty_metadata");
        ret = ext4_handle_dirty_dirblock(handle, inode, blk);
        if (!ret)
                set_buffer_verified(blk);

        brelse(blk);
        return ret;
}

/*
 * Create the directory named by @dentry inside @dir.
 *
 * Fails with -EMLINK when EXT4_DIR_LINK_MAX(dir) is true.  The attempt
 * is repeated while it fails with -ENOSPC and ext4_should_retry_alloc()
 * allows another try.  If anything fails after the new inode has been
 * allocated, its link count is cleared and it is put on the orphan list
 * before the reference is dropped.
 *
 * Returns 0 on success or a negative error value.
 */
static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        handle_t *handle;
        struct inode *inode;
        int ret, dirty_ret = 0, nblocks, retries = 0;

        if (EXT4_DIR_LINK_MAX(dir))
                return -EMLINK;

        ret = dquot_initialize(dir);
        if (ret)
                return ret;

        nblocks = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

try_again:
        inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
                                            &dentry->d_name,
                                            0, NULL, EXT4_HT_DIR, nblocks);
        handle = ext4_journal_current_handle();
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto stop_handle;
        }

        inode->i_op = &ext4_dir_inode_operations;
        inode->i_fop = &ext4_dir_operations;

        /* Each step runs only if the previous one succeeded. */
        ret = ext4_init_new_dir(handle, dir, inode);
        if (!ret)
                ret = ext4_mark_inode_dirty(handle, inode);
        if (!ret)
                ret = ext4_add_entry(handle, dentry, inode);
        if (ret)
                goto undo_inode;

        ext4_inc_count(dir);
        ext4_update_dx_flag(dir);
        ret = ext4_mark_inode_dirty(handle, dir);
        if (ret)
                goto undo_inode;

        d_instantiate_new(dentry, inode);
        ext4_fc_track_create(handle, dentry);
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

stop_handle:
        if (handle)
                ext4_journal_stop(handle);
maybe_retry:
        if (ret == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto try_again;
        return ret;

undo_inode:
        /* Make the half-built inode go away via the orphan list. */
        clear_nlink(inode);
        ext4_orphan_add(handle, inode);
        unlock_new_inode(inode);
        dirty_ret = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(dirty_ret))
                ret = dirty_ret;
        ext4_journal_stop(handle);
        iput(inode);
        goto maybe_retry;
}

/*
 * ext4_empty_dir() - check that the specified directory is empty (for rmdir)
 * @inode: the directory to scan
 *
 * Returns true only if no live entry other than "." and ".." is found.
 * Any problem met while scanning (directory too small, unreadable block,
 * entry rejected by ext4_check_dir_entry(), missing "." or "..") makes the
 * function return false, i.e. the directory is reported as not empty.
 */
bool ext4_empty_dir(struct inode *inode)
{
        unsigned int offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;

        if (ext4_has_inline_data(inode)) {
                int has_inline_data = 1;
                int ret;

                /*
                 * has_inline_data is handed over by pointer: when it is
                 * still set afterwards the helper's verdict is final,
                 * otherwise fall through to the block-based scan below.
                 */
                ret = empty_inline_dir(inode, &has_inline_data);
                if (has_inline_data)
                        return ret;
        }

        sb = inode->i_sb;
        /* The directory must be at least as large as a "." plus a ".." record. */
        if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
                EXT4_ERROR_INODE(inode, "invalid size");
                return false;
        }
        /*
         * NOTE(review): bh is dereferenced right after the IS_ERR() check;
         * this presumes ext4_read_dirblock() never returns NULL for block 0
         * (it can for later blocks, see the loop below) -- confirm in the
         * helper.
         */
        bh = ext4_read_dirblock(inode, 0, EITHER);
        if (IS_ERR(bh))
                return false;

        /* First record must be a valid "." that points back at this inode. */
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
                                 0) ||
            le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
                ext4_warning_inode(inode, "directory missing '.'");
                brelse(bh);
                return false;
        }
        /* Second record must be a valid ".." with a non-zero inode number. */
        offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        de = ext4_next_entry(de, sb->s_blocksize);
        if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
                                 offset) ||
            le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
                ext4_warning_inode(inode, "directory missing '..'");
                brelse(bh);
                return false;
        }
        /* Walk every remaining record; offset is a byte offset in the dir. */
        offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        while (offset < inode->i_size) {
                /* Crossing into a new block: swap the buffer we hold. */
                if (!(offset & (sb->s_blocksize - 1))) {
                        unsigned int lblock;
                        brelse(bh);
                        lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
                        bh = ext4_read_dirblock(inode, lblock, EITHER);
                        /*
                         * No buffer for this block (presumably a hole):
                         * skip the whole block.
                         */
                        if (bh == NULL) {
                                offset += sb->s_blocksize;
                                continue;
                        }
                        if (IS_ERR(bh))
                                return false;
                }
                de = (struct ext4_dir_entry_2 *) (bh->b_data +
                                        (offset & (sb->s_blocksize - 1)));
                /* A malformed record or any in-use record means "not empty". */
                if (ext4_check_dir_entry(inode, NULL, de, bh,
                                         bh->b_data, bh->b_size, offset) ||
                    le32_to_cpu(de->inode)) {
                        brelse(bh);
                        return false;
                }
                offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        }
        /* bh may be NULL here if the last block was skipped. */
        brelse(bh);
        return true;
}

/*
 * ext4_orphan_add() links an unlinked or truncated inode into a list of
 * such inodes, starting at the superblock, in case we crash before the
 * file is closed/deleted, or in case the inode truncate spans multiple
 * transactions and the last transaction is not recovered after a crash.
 *
 * At filesystem recovery time, we walk this list deleting unlinked
 * inodes and truncating linked inodes in ext4_orphan_cleanup().
 *
 * Orphan list manipulation functions must be called under i_mutex unless
 * we are just creating the inode or deleting it.
 *
 * Returns 0 on success and also when there is nothing to do (no journal,
 * bad inode, or inode already on the in-memory list); otherwise the error
 * reported by the journal/inode helpers, which is also passed to
 * ext4_std_error().
 */
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_iloc iloc;
        int err = 0, rc;
        /* Set when the on-disk list (superblock + inode) was modified. */
        bool dirty = false;

        if (!sbi->s_journal || is_bad_inode(inode))
                return 0;

        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
                     !inode_is_locked(inode));
        /*
         * Exit early if inode already is on orphan list. This is a big speedup
         * since we don't have to contend on the global s_orphan_lock.
         */
        if (!list_empty(&EXT4_I(inode)->i_orphan))
                return 0;

        /*
         * Orphan handling is only valid for files with data blocks
         * being truncated, or files being unlinked. Note that we either
         * hold i_mutex, or the inode can not be referenced from outside,
         * so i_nlink should not be bumped due to race
         */
        J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);

        /*
         * Obtain write access to the superblock buffer and to the inode
         * location before taking s_orphan_lock.
         */
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
        if (err)
                goto out;

        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (err)
                goto out;

        mutex_lock(&sbi->s_orphan_lock);
        /*
         * Due to previous errors inode may be already a part of on-disk
         * orphan list. If so skip on-disk list modification.
         */
        if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
            (le32_to_cpu(sbi->s_es->s_inodes_count))) {
                /* Insert this inode at the head of the on-disk orphan list */
                NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
                sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
                dirty = true;
        }
        /* The in-memory list is updated whether or not the disk list was. */
        list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
        mutex_unlock(&sbi->s_orphan_lock);

        if (dirty) {
                /* Dirty both buffers; keep the first error seen. */
                err = ext4_handle_dirty_super(handle, sb);
                rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
                if (!err)
                        err = rc;
                if (err) {
                        /*
                         * We have to remove inode from in-memory list if
                         * addition to on disk orphan list failed. Stray orphan
                         * list entries can cause panics at unmount time.
                         */
                        mutex_lock(&sbi->s_orphan_lock);
                        list_del_init(&EXT4_I(inode)->i_orphan);
                        mutex_unlock(&sbi->s_orphan_lock);
                }
        } else
                /*
                 * Inode buffer was reserved but not dirtied, so release it
                 * here (presumably ext4_mark_iloc_dirty() does that on the
                 * other path -- confirm).
                 */
                brelse(iloc.bh);

        jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
        jbd_debug(4, "orphan inode %lu will point to %d\n",
                        inode->i_ino, NEXT_ORPHAN(inode));
out:
        ext4_std_error(sb, err);
        return err;
}

/*
 * ext4_orphan_del() removes an unlinked or truncated inode from the list
 * of such inodes stored on disk, because it is finally being cleaned up.
 *
 * @handle may be NULL: the inode is then only taken off the in-memory list
 * and the on-disk list is left untouched.
 *
 * Returns 0 when there is nothing to do or on success, otherwise the error
 * from the journal/inode helpers (also passed to ext4_std_error()).
 */
int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
        struct list_head *prev;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 ino_next;
        struct ext4_iloc iloc;
        int err = 0;

        /* Nothing to do without a journal unless EXT4_ORPHAN_FS is set. */
        if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
                return 0;

        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
                     !inode_is_locked(inode));
        /* Do this quick check before taking global s_orphan_lock. */
        if (list_empty(&ei->i_orphan))
                return 0;

        if (handle) {
                /* Grab inode buffer early before taking global s_orphan_lock */
                err = ext4_reserve_inode_write(handle, inode, &iloc);
        }

        mutex_lock(&sbi->s_orphan_lock);
        jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);

        /* Remember the predecessor before unlinking: it decides what to patch. */
        prev = ei->i_orphan.prev;
        list_del_init(&ei->i_orphan);

        /* If we're on an error path, we may not have a valid
         * transaction handle with which to update the orphan list on
         * disk, but we still need to remove the inode from the linked
         * list in memory. */
        if (!handle || err) {
                mutex_unlock(&sbi->s_orphan_lock);
                goto out_err;
        }

        ino_next = NEXT_ORPHAN(inode);
        if (prev == &sbi->s_orphan) {
                /* Inode was at the list head: the superblock points past it. */
                jbd_debug(4, "superblock will point to %u\n", ino_next);
                BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                if (err) {
                        mutex_unlock(&sbi->s_orphan_lock);
                        goto out_brelse;
                }
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
                mutex_unlock(&sbi->s_orphan_lock);
                err = ext4_handle_dirty_super(handle, inode->i_sb);
        } else {
                /* Otherwise the previous inode on the list points past it. */
                struct ext4_iloc iloc2;
                struct inode *i_prev =
                        &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;

                jbd_debug(4, "orphan inode %lu will point to %u\n",
                          i_prev->i_ino, ino_next);
                err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
                if (err) {
                        mutex_unlock(&sbi->s_orphan_lock);
                        goto out_brelse;
                }
                NEXT_ORPHAN(i_prev) = ino_next;
                err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
                mutex_unlock(&sbi->s_orphan_lock);
        }
        if (err)
                goto out_brelse;
        /* Finally clear this inode's own next-orphan link. */
        NEXT_ORPHAN(inode) = 0;
        err = ext4_mark_iloc_dirty(handle, inode, &iloc);
out_err:
        ext4_std_error(inode->i_sb, err);
        return err;

out_brelse:
        /* The inode buffer reserved at the top was never dirtied: release it. */
        brelse(iloc.bh);
        goto out_err;
}

/*
 * ext4_rmdir() - remove the directory named by @dentry from @dir
 *
 * Returns 0 on success or a negative errno: -EIO after a forced shutdown,
 * -ENOENT when the entry is not found, -EFSCORRUPTED when the entry does
 * not reference the dentry's inode, -ENOTEMPTY when ext4_empty_dir()
 * rejects the directory, or the error reported by a helper.
 */
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
        int retval;
        struct inode *inode;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        /* Stays NULL until a transaction is started; checked at end_rmdir. */
        handle_t *handle = NULL;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
                return -EIO;

        /* Initialize quotas before so that eventual writes go in
         * separate transaction */
        retval = dquot_initialize(dir);
        if (retval)
                return retval;
        retval = dquot_initialize(d_inode(dentry));
        if (retval)
                return retval;

        /* retval is preset before each check so a bare goto reports it. */
        retval = -ENOENT;
        bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
                goto end_rmdir;

        inode = d_inode(dentry);

        /* The directory entry must reference the inode behind the dentry. */
        retval = -EFSCORRUPTED;
        if (le32_to_cpu(de->inode) != inode->i_ino)
                goto end_rmdir;

        retval = -ENOTEMPTY;
        if (!ext4_empty_dir(inode))
                goto end_rmdir;

        handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                /* Make sure end_rmdir does not try to stop an error pointer. */
                handle = NULL;
                goto end_rmdir;
        }

        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

        retval = ext4_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_rmdir;
        if (!EXT4_DIR_LINK_EMPTY(inode))
                ext4_warning_inode(inode,
                             "empty directory '%.*s' has too many links (%u)",
                             dentry->d_name.len, dentry->d_name.name,
                             inode->i_nlink);
        inode_inc_iversion(inode);
        clear_nlink(inode);
        /* There's no need to set i_disksize: the fact that i_nlink is
         * zero will ensure that the right thing happens during any
         * recovery. */
        inode->i_size = 0;
        /* The return value of ext4_orphan_add() is not checked here. */
        ext4_orphan_add(handle, inode);
        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
        retval = ext4_mark_inode_dirty(handle, inode);
        if (retval)
                goto end_rmdir;
        /* The parent's link count is adjusted now that the child is gone. */
        ext4_dec_count(dir);
        ext4_update_dx_flag(dir);
        ext4_fc_track_unlink(handle, dentry);
        retval = ext4_mark_inode_dirty(handle, dir);

#ifdef CONFIG_UNICODE
        /* VFS negative dentries are incompatible with Encoding and
         * Case-insensitiveness. Eventually we'll want avoid
         * invalidating the dentries here, alongside with returning the
         * negative dentries at ext4_lookup(), when it is better
         * supported by the VFS for the CI case.
         */
        if (IS_CASEFOLDED(dir))
                d_invalidate(dentry);
#endif

end_rmdir:
        /* bh is NULL when the entry was not found. */
        brelse(bh);
        if (handle)
                ext4_journal_stop(handle);
        return retval;
}

/*
 * __ext4_unlink() - remove the entry @d_name from @dir and drop one link
 * of @inode
 * @dentry: used only for fast-commit tracking; NULL during fast_commit
 *          recovery
 *
 * Starts and stops its own transaction.  Returns 0 on success, -ENOENT
 * when the name is not found (or resolves to another inode outside of
 * fast-commit replay), or the error reported by a helper.
 */
int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                  struct inode *inode,
                  struct dentry *dentry /* NULL during fast_commit recovery */)
{
        int retval = -ENOENT;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        handle_t *handle;
        /* Set when the directory entry must be left alone (replay only). */
        int skip_remove_dentry = 0;

        /*
         * Keep this outside the transaction; it may have to set up the
         * directory's encryption key, which isn't GFP_NOFS-safe.
         */
        bh = ext4_find_entry(dir, d_name, &de, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        if (!bh)
                return -ENOENT;

        if (le32_to_cpu(de->inode) != inode->i_ino) {
                /*
                 * During fast-commit replay it is okay if the entry we
                 * found does not match the inode: the name might have been
                 * renamed to a different inode number.  Otherwise bail out
                 * with the preset -ENOENT.
                 */
                if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                        skip_remove_dentry = 1;
                else
                        goto out_bh;
        }

        handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                goto out_bh;
        }

        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

        if (!skip_remove_dentry) {
                /* Remove the name and refresh the directory's timestamps. */
                retval = ext4_delete_entry(handle, dir, de, bh);
                if (retval)
                        goto out_handle;
                dir->i_ctime = dir->i_mtime = current_time(dir);
                ext4_update_dx_flag(dir);
                retval = ext4_mark_inode_dirty(handle, dir);
                if (retval)
                        goto out_handle;
        } else {
                retval = 0;
        }
        /* Never let the link count go below zero; just warn instead. */
        if (inode->i_nlink == 0)
                ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
                                   d_name->len, d_name->name);
        else
                drop_nlink(inode);
        /* Last link gone: put the inode on the orphan list. */
        if (!inode->i_nlink)
                ext4_orphan_add(handle, inode);
        inode->i_ctime = current_time(inode);
        retval = ext4_mark_inode_dirty(handle, inode);
        if (dentry && !retval)
                ext4_fc_track_unlink(handle, dentry);
out_handle:
        ext4_journal_stop(handle);
out_bh:
        brelse(bh);
        return retval;
}

/*
 * ext4_unlink() - unlink entry point: quota setup and tracing around
 * __ext4_unlink().
 *
 * Returns -EIO after a forced shutdown (without emitting trace events),
 * otherwise the first quota-initialisation error or the result of
 * __ext4_unlink().
 */
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
        int err;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
                return -EIO;

        trace_ext4_unlink_enter(dir, dentry);

        /*
         * Quotas are initialised up front so that any writes they cause
         * land in a separate transaction.
         */
        err = dquot_initialize(dir);
        if (!err)
                err = dquot_initialize(d_inode(dentry));

        if (!err) {
                err = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry),
                                    dentry);
#ifdef CONFIG_UNICODE
                /*
                 * Negative dentries in the VFS do not mix with encoding and
                 * case-insensitive lookups, so the dentry is invalidated
                 * here until the VFS supports that combination better (and
                 * ext4_lookup() can return negative dentries for it).
                 */
                if (IS_CASEFOLDED(dir))
                        d_invalidate(dentry);
#endif
        }

        trace_ext4_unlink_exit(dentry, err);
        return err;
}

/*
 * ext4_symlink() - create the symlink @dentry in @dir with target @symname
 *
 * When the on-disk form of the target is at most EXT4_N_BLOCKS * 4 bytes it
 * is copied straight into the inode's i_data ("fast" symlink).  Longer
 * targets are written with __page_symlink(), which needs the transaction to
 * be stopped and restarted around it; the inode sits on the orphan list in
 * between.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_symlink(struct inode *dir,
                        struct dentry *dentry, const char *symname)
{
        handle_t *handle;
        struct inode *inode;
        int err, len = strlen(symname);
        int credits;
        /* On-disk form of the target; may alias symname (see the exit path). */
        struct fscrypt_str disk_link;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
                return -EIO;

        err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
                                      &disk_link);
        if (err)
                return err;

        err = dquot_initialize(dir);
        if (err)
                return err;

        if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
                /*
                 * For non-fast symlinks, we just allocate inode and put it on
                 * orphan list in the first transaction => we need bitmap,
                 * group descriptor, sb, inode block, quota blocks, and
                 * possibly selinux xattr blocks.
                 */
                credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
                          EXT4_XATTR_TRANS_BLOCKS;
        } else {
                /*
                 * Fast symlink. We have to add entry to directory
                 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
                 * allocate new inode (bitmap, group descriptor, inode block,
                 * quota blocks, sb is already counted in previous macros).
                 */
                credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                          EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
        }

        inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
                                            &dentry->d_name, 0, NULL,
                                            EXT4_HT_DIR, credits);
        /* The handle is picked up from the current task, then checked for NULL. */
        handle = ext4_journal_current_handle();
        if (IS_ERR(inode)) {
                if (handle)
                        ext4_journal_stop(handle);
                return PTR_ERR(inode);
        }

        if (IS_ENCRYPTED(inode)) {
                err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
                if (err)
                        goto err_drop_inode;
                inode->i_op = &ext4_encrypted_symlink_inode_operations;
        }

        if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
                if (!IS_ENCRYPTED(inode))
                        inode->i_op = &ext4_symlink_inode_operations;
                inode_nohighmem(inode);
                ext4_set_aops(inode);
                /*
                 * We cannot call page_symlink() with transaction started
                 * because it calls into ext4_write_begin() which can wait
                 * for transaction commit if we are running out of space
                 * and thus we deadlock. So we have to stop transaction now
                 * and restart it when symlink contents is written.
                 *
                 * To keep fs consistent in case of crash, we have to put inode
                 * to orphan list in the mean time.
                 */
                drop_nlink(inode);
                err = ext4_orphan_add(handle, inode);
                if (handle)
                        ext4_journal_stop(handle);
                /* No transaction is running from here until the restart below. */
                handle = NULL;
                if (err)
                        goto err_drop_inode;
                err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
                if (err)
                        goto err_drop_inode;
                /*
                 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
                 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
                 */
                handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        handle = NULL;
                        goto err_drop_inode;
                }
                /* Contents are written: restore the link, leave the orphan list. */
                set_nlink(inode, 1);
                err = ext4_orphan_del(handle, inode);
                if (err)
                        goto err_drop_inode;
        } else {
                /* clear the extent format for fast symlink */
                ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
                if (!IS_ENCRYPTED(inode)) {
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        inode->i_link = (char *)&EXT4_I(inode)->i_data;
                }
                memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
                       disk_link.len);
                /* presumably disk_link.len counts a trailing NUL -- TODO confirm */
                inode->i_size = disk_link.len - 1;
        }
        EXT4_I(inode)->i_disksize = inode->i_size;
        /* inode is passed by address and is re-checked for NULL afterwards. */
        err = ext4_add_nondir(handle, dentry, &inode);
        if (handle)
                ext4_journal_stop(handle);
        if (inode)
                iput(inode);
        goto out_free_encrypted_link;

err_drop_inode:
        /* Undo: stop any running handle, zero the link count, drop the inode. */
        if (handle)
                ext4_journal_stop(handle);
        clear_nlink(inode);
        unlock_new_inode(inode);
        iput(inode);
out_free_encrypted_link:
        /* Free the on-disk name only when it is not the caller's symname. */
        if (disk_link.name != (unsigned char *)symname)
                kfree(disk_link.name);
        return err;
}

/*
 * __ext4_link() - add a new name @dentry in @dir for the existing @inode
 *
 * Runs in its own transaction and repeats the whole attempt while it fails
 * with -ENOSPC and ext4_should_retry_alloc() agrees.  Returns 0 on success
 * or a negative errno.
 */
int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
        handle_t *handle;
        int err, retries = 0;

        do {
                handle = ext4_journal_start(dir, EXT4_HT_DIR,
                        (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                         EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                if (IS_DIRSYNC(dir))
                        ext4_handle_sync(handle);

                /* Take the extra link and reference before adding the name. */
                inode->i_ctime = current_time(inode);
                ext4_inc_count(inode);
                ihold(inode);

                err = ext4_add_entry(handle, dentry, inode);
                if (err) {
                        /* Adding the name failed: give both back. */
                        drop_nlink(inode);
                        iput(inode);
                } else {
                        err = ext4_mark_inode_dirty(handle, inode);
                        /*
                         * A link count of exactly one at this point is only
                         * possible for a tmpfile that is being linked for
                         * the first time.
                         */
                        if (inode->i_nlink == 1)
                                ext4_orphan_del(handle, inode);
                        d_instantiate(dentry, inode);
                        ext4_fc_track_link(handle, dentry);
                }
                ext4_journal_stop(handle);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return err;
}

/*
 * ext4_link() - hard-link entry point: validate the request, then hand
 * over to __ext4_link().
 *
 * Returns -EMLINK when the inode already has EXT4_LINK_MAX links, -EXDEV
 * when @dir has EXT4_INODE_PROJINHERIT set and its project id differs from
 * the inode's, or the error of fscrypt_prepare_link(), dquot_initialize()
 * or __ext4_link().
 */
static int ext4_link(struct dentry *old_dentry,
                     struct inode *dir, struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);
        int ret;

        if (target->i_nlink >= EXT4_LINK_MAX)
                return -EMLINK;

        ret = fscrypt_prepare_link(old_dentry, dir, dentry);
        if (ret)
                return ret;

        if (ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) {
                if (!projid_eq(EXT4_I(dir)->i_projid,
                               EXT4_I(old_dentry->d_inode)->i_projid))
                        return -EXDEV;
        }

        ret = dquot_initialize(dir);
        if (ret)
                return ret;

        return __ext4_link(dir, target, dentry);
}

/*
 * Find the buffer head that holds the directory's parent ("..") entry.
 * That is the inode block when the directory is inline, or the first
 * directory block otherwise.
 *
 * Inline case: *inlined is set to 1 and the result comes straight from
 * ext4_get_first_inline_block().
 * Block case: the first block is read and its "." and ".." records are
 * validated; on success the buffer is returned with *parent_de pointing at
 * the ".." record, on failure NULL is returned with the error in *retval.
 */
static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
                                        struct inode *inode,
                                        int *retval,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *inlined)
{
        struct ext4_dir_entry_2 *dirent;
        struct buffer_head *blk;
        unsigned int dotdot_off;

        if (ext4_has_inline_data(inode)) {
                *inlined = 1;
                return ext4_get_first_inline_block(inode, parent_de, retval);
        }

        blk = ext4_read_dirblock(inode, 0, EITHER);
        if (IS_ERR(blk)) {
                *retval = PTR_ERR(blk);
                return NULL;
        }

        /* The first record has to be "." and refer to the directory itself. */
        dirent = (struct ext4_dir_entry_2 *) blk->b_data;
        if (ext4_check_dir_entry(inode, NULL, dirent, blk, blk->b_data,
                                 blk->b_size, 0) ||
            le32_to_cpu(dirent->inode) != inode->i_ino ||
            strcmp(".", dirent->name)) {
                EXT4_ERROR_INODE(inode, "directory missing '.'");
                brelse(blk);
                *retval = -EFSCORRUPTED;
                return NULL;
        }

        /* The record after it has to be ".." with a non-zero inode number. */
        dotdot_off = ext4_rec_len_from_disk(dirent->rec_len,
                                            inode->i_sb->s_blocksize);
        dirent = ext4_next_entry(dirent, inode->i_sb->s_blocksize);
        if (ext4_check_dir_entry(inode, NULL, dirent, blk, blk->b_data,
                                 blk->b_size, dotdot_off) ||
            le32_to_cpu(dirent->inode) == 0 ||
            strcmp("..", dirent->name)) {
                EXT4_ERROR_INODE(inode, "directory missing '..'");
                brelse(blk);
                *retval = -EFSCORRUPTED;
                return NULL;
        }

        *parent_de = dirent;
        return blk;
}

/*
 * Describes one side of a rename (ext4_rename() builds one for the old name
 * and one for the new name) and is passed between the rename helpers.
 */
struct ext4_renament {
        /* directory that contains the entry */
        struct inode *dir;
        /* dentry naming the entry */
        struct dentry *dentry;
        /* inode the dentry refers to */
        struct inode *inode;
        /* presumably set when inode is a directory -- TODO confirm at users */
        bool is_dir;
        /*
         * Pending link-count change for dir, applied by
         * ext4_update_dir_count(): 0 means none, -1 drops a link, any other
         * value adds one.
         */
        int dir_nlink_delta;

        /* entry for "dentry" */
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        /* non-zero skips dirtying bh as a directory block in ext4_setent() */
        int inlined;

        /* entry for ".." in inode if it's a directory */
        struct buffer_head *dir_bh;
        struct ext4_dir_entry_2 *parent_de;
        /* non-zero when the ".." entry lives in inline data */
        int dir_inlined;
};

/*
 * Look up the ".." entry of the directory being renamed (ent->inode) and get
 * journal write access to the buffer that holds it.
 *
 * Returns 0 on success, the lookup error when no buffer was found,
 * -EFSCORRUPTED when ".." does not point at ent->dir, or the error from
 * ext4_journal_get_write_access().
 */
static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
{
        int err;

        ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &err,
                                               &ent->parent_de,
                                               &ent->dir_inlined);
        if (ent->dir_bh == NULL)
                return err;

        /* ".." must currently reference the directory we are moving out of. */
        if (ent->dir->i_ino != le32_to_cpu(ent->parent_de->inode))
                return -EFSCORRUPTED;

        BUFFER_TRACE(ent->dir_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, ent->dir_bh);
        return err;
}

/*
 * Point the ".." entry of the renamed directory at @dir_ino and mark the
 * holder of that entry dirty: the inode itself for an inline directory,
 * otherwise the directory block (as a dx node when the directory is
 * indexed).
 *
 * Returns 0 on success; any error is reported through ext4_std_error() and
 * returned.
 */
static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
                                  unsigned dir_ino)
{
        int err;

        ent->parent_de->inode = cpu_to_le32(dir_ino);
        BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");

        if (ent->dir_inlined)
                err = ext4_mark_inode_dirty(handle, ent->inode);
        else if (is_dx(ent->inode))
                err = ext4_handle_dirty_dx_node(handle, ent->inode,
                                                ent->dir_bh);
        else
                err = ext4_handle_dirty_dirblock(handle, ent->inode,
                                                 ent->dir_bh);

        if (!err)
                return 0;

        ext4_std_error(ent->dir->i_sb, err);
        return err;
}

/*
 * Rewrite the directory entry described by @ent so that it refers to inode
 * number @ino (and carries @file_type when the filetype feature is on),
 * then update the directory's version and timestamps.
 *
 * Returns the write-access error if that step fails, the error from
 * dirtying the directory block if that fails (after ext4_std_error()),
 * otherwise the result of marking the directory inode dirty.
 */
static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
                       unsigned ino, unsigned file_type)
{
        int err, bh_err;

        BUFFER_TRACE(ent->bh, "get write access");
        err = ext4_journal_get_write_access(handle, ent->bh);
        if (err)
                return err;

        ent->de->inode = cpu_to_le32(ino);
        if (ext4_has_feature_filetype(ent->dir->i_sb))
                ent->de->file_type = file_type;

        inode_inc_iversion(ent->dir);
        ent->dir->i_mtime = current_time(ent->dir);
        ent->dir->i_ctime = ent->dir->i_mtime;
        err = ext4_mark_inode_dirty(handle, ent->dir);

        BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
        /* For an inline entry the directory block is not dirtied here. */
        if (ent->inlined)
                return err;

        bh_err = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
        if (unlikely(bh_err)) {
                ext4_std_error(ent->dir->i_sb, bh_err);
                return bh_err;
        }
        return err;
}

/*
 * Put inode number @ino and @file_type back into the directory entry named
 * by @ent.  Works on a private copy of @ent whose entry is looked up again
 * first; a failed lookup is reported through ext4_std_error() and nothing
 * is rewritten.
 */
static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
                          unsigned ino, unsigned file_type)
{
        struct ext4_renament fresh = *ent;
        int err;

        /*
         * The cached entry pointer may be stale (the entry can move while a
         * directory is being converted to an indexed one), so locate the
         * entry again before touching it.
         */
        fresh.bh = ext4_find_entry(fresh.dir, &fresh.dentry->d_name,
                                   &fresh.de, &fresh.inlined);
        if (!fresh.bh)
                err = -ENOENT;
        else if (IS_ERR(fresh.bh))
                err = PTR_ERR(fresh.bh);
        else
                err = 0;

        if (err) {
                ext4_std_error(fresh.dir->i_sb, err);
                return;
        }

        ext4_setent(handle, &fresh, ino, file_type);
        brelse(fresh.bh);
}

/*
 * Look up @d_name in @dir and delete the entry that is found.
 *
 * Returns the lookup error, -ENOENT when the name does not exist, or the
 * result of ext4_delete_entry().
 */
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
                                  const struct qstr *d_name)
{
        struct ext4_dir_entry_2 *entry;
        struct buffer_head *blk;
        int err;

        blk = ext4_find_entry(dir, d_name, &entry, NULL);
        if (IS_ERR(blk))
                return PTR_ERR(blk);
        if (!blk)
                return -ENOENT;

        err = ext4_delete_entry(handle, dir, entry, blk);
        brelse(blk);
        return err;
}

/*
 * Delete the directory entry described by @ent.  The cached entry is used
 * only when it still matches the inode number and name and @force_reread is
 * zero; otherwise, or when deleting through the cached entry reports
 * -ENOENT, the entry is looked up by name and deleted.  A failure is only
 * logged.
 */
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
                               int force_reread)
{
        /* -ENOENT doubles as "fall back to a lookup by name". */
        int err = -ENOENT;

        /*
         * The entry may have moved during an htree split, and the cached
         * pointer may even reference a stale record in the unused part of
         * ent->bh, so comparing inode number and name alone is not a full
         * guarantee; hence the -ENOENT fallback below.
         */
        if (le32_to_cpu(ent->de->inode) == ent->inode->i_ino &&
            ent->de->name_len == ent->dentry->d_name.len &&
            !strncmp(ent->de->name, ent->dentry->d_name.name,
                     ent->de->name_len) &&
            !force_reread)
                err = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);

        if (err == -ENOENT)
                err = ext4_find_delete_entry(handle, ent->dir,
                                             &ent->dentry->d_name);

        if (err) {
                ext4_warning_inode(ent->dir,
                                   "Deleting old file: nlink %d, error=%d",
                                   ent->dir->i_nlink, err);
        }
}

/*
 * Apply the pending link-count change recorded in ent->dir_nlink_delta to
 * ent->dir: -1 drops a link, any other non-zero value adds one, zero does
 * nothing.  The directory inode is marked dirty after a change.
 */
static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
{
        int delta = ent->dir_nlink_delta;

        if (!delta)
                return;

        if (delta == -1)
                ext4_dec_count(ent->dir);
        else
                ext4_inc_count(ent->dir);

        ext4_mark_inode_dirty(handle, ent->dir);
}

/*
 * Allocate the whiteout inode used by a RENAME_WHITEOUT rename and start the
 * journal handle for the whole operation.
 *
 * @ent:     source side of the rename; the whiteout is created in ent->dir
 *           under ent->dentry's name
 * @credits: journal credits the caller needs for the rename itself
 * @h:       on success, receives the handle that is now running
 *
 * Returns the new inode, or an ERR_PTR() on failure.  On failure any handle
 * that was started has already been stopped and *h is left untouched.
 */
static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
                                              int credits, handle_t **h)
{
        struct inode *wh;
        handle_t *handle;
        int retries = 0;

        /*
         * On top of the caller's credits: inode block, sb block, group
         * summaries, and inode bitmap.
         */
        credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
                    EXT4_XATTR_TRANS_BLOCKS + 4);

        for (;;) {
                wh = ext4_new_inode_start_handle(ent->dir,
                                                 S_IFCHR | WHITEOUT_MODE,
                                                 &ent->dentry->d_name, 0, NULL,
                                                 EXT4_HT_DIR, credits);
                handle = ext4_journal_current_handle();

                if (!IS_ERR(wh))
                        break;

                /* Allocation failed: never leave a handle running. */
                if (handle)
                        ext4_journal_stop(handle);

                /* Only an -ENOSPC that the allocator deems retryable loops. */
                if (PTR_ERR(wh) != -ENOSPC ||
                    !ext4_should_retry_alloc(ent->dir->i_sb, &retries))
                        return wh;
        }

        *h = handle;
        init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
        wh->i_op = &ext4_special_inode_operations;

        return wh;
}

/*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
 *
 * n.b.  old_{dentry,inode} refers to the source dentry/inode
 * while new_{dentry,inode} refers to the destination dentry/inode.
 * This comes from rename(const char *oldpath, const char *newpath).
 *
 * Of @flags only RENAME_WHITEOUT is examined here: when it is set a whiteout
 * inode is allocated and the old directory entry is repointed at it instead
 * of being deleted.
 *
 * Returns 0 on success, otherwise an error code (-EFSCORRUPTED, -EXDEV,
 * -ENOENT, -ENOTEMPTY, -EMLINK, or whatever a helper reported).
 */
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                       struct inode *new_dir, struct dentry *new_dentry,
                       unsigned int flags)
{
        handle_t *handle = NULL;
        struct ext4_renament old = {
                .dir = old_dir,
                .dentry = old_dentry,
                .inode = d_inode(old_dentry),
        };
        struct ext4_renament new = {
                .dir = new_dir,
                .dentry = new_dentry,
                .inode = d_inode(new_dentry),
        };
        int force_reread;
        int retval;
        struct inode *whiteout = NULL;
        int credits;
        u8 old_file_type;

        /* An existing target with no links left is reported as corruption. */
        if (new.inode && new.inode->i_nlink == 0) {
                EXT4_ERROR_INODE(new.inode,
                                 "target of rename is already freed");
                return -EFSCORRUPTED;
        }

        /*
         * Refuse with -EXDEV when the destination directory has PROJINHERIT
         * set and its project id differs from that of the source inode.
         */
        if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
            (!projid_eq(EXT4_I(new_dir)->i_projid,
                        EXT4_I(old_dentry->d_inode)->i_projid)))
                return -EXDEV;

        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
        retval = dquot_initialize(old.inode);
        if (retval)
                return retval;
        retval = dquot_initialize(new.dir);
        if (retval)
                return retval;

        /* Initialize quotas before so that eventual writes go
         * in separate transaction */
        if (new.inode) {
                retval = dquot_initialize(new.inode);
                if (retval)
                        return retval;
        }

        old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
                                 &old.inlined);
        if (IS_ERR(old.bh))
                return PTR_ERR(old.bh);

        /*
         *  Check for inode number is _not_ due to possible IO errors.
         *  We might rmdir the source, keep it as pwd of some process
         *  and merrily kill the link to whatever was created under the
         *  same name. Goodbye sticky bit ;-<
         */
        retval = -ENOENT;
        if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
                goto release_bh;

        new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
                                 &new.de, &new.inlined);
        if (IS_ERR(new.bh)) {
                retval = PTR_ERR(new.bh);
                /* Keep the ERR_PTR away from the brelse() at release_bh. */
                new.bh = NULL;
                goto release_bh;
        }
        if (new.bh) {
                /*
                 * An entry was found but the dentry is negative: drop the
                 * buffer so that the ext4_add_entry() path is taken below.
                 */
                if (!new.inode) {
                        brelse(new.bh);
                        new.bh = NULL;
                }
        }
        /* NOTE(review): presumably allocates the source's delalloc blocks
         * before it replaces an existing file -- confirm against
         * ext4_alloc_da_blocks(). */
        if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
                ext4_alloc_da_blocks(old.inode);

        credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
        if (!(flags & RENAME_WHITEOUT)) {
                handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
                if (IS_ERR(handle)) {
                        retval = PTR_ERR(handle);
                        goto release_bh;
                }
        } else {
                /* The helper both creates the inode and starts the handle. */
                whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
                if (IS_ERR(whiteout)) {
                        retval = PTR_ERR(whiteout);
                        goto release_bh;
                }
        }

        /*
         * Remember the source entry's type: it is written into a reused
         * destination entry and used to restore the source entry on error.
         */
        old_file_type = old.de->file_type;
        if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
                ext4_handle_sync(handle);

        if (S_ISDIR(old.inode->i_mode)) {
                if (new.inode) {
                        /* A directory may only replace an empty directory. */
                        retval = -ENOTEMPTY;
                        if (!ext4_empty_dir(new.inode))
                                goto end_rename;
                } else {
                        /* Moving into another parent adds a link to it. */
                        retval = -EMLINK;
                        if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
                                goto end_rename;
                }
                retval = ext4_rename_dir_prepare(handle, &old);
                if (retval)
                        goto end_rename;
        }
        /*
         * If we're renaming a file within an inline_data dir and adding or
         * setting the new dirent causes a conversion from inline_data to
         * extents/blockmap, we need to force the dirent delete code to
         * re-read the directory, or else we end up trying to delete a dirent
         * from what is now the extent tree root (or a block map).
         */
        force_reread = (new.dir->i_ino == old.dir->i_ino &&
                        ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));

        if (whiteout) {
                /*
                 * Do this before adding a new entry, so the old entry is sure
                 * to be still pointing to the valid old entry.
                 */
                retval = ext4_setent(handle, &old, whiteout->i_ino,
                                     EXT4_FT_CHRDEV);
                if (retval)
                        goto end_rename;
                retval = ext4_mark_inode_dirty(handle, whiteout);
                if (unlikely(retval))
                        goto end_rename;

        }
        /* No destination entry yet: add one; otherwise repoint the old one. */
        if (!new.bh) {
                retval = ext4_add_entry(handle, new.dentry, old.inode);
                if (retval)
                        goto end_rename;
        } else {
                retval = ext4_setent(handle, &new,
                                     old.inode->i_ino, old_file_type);
                if (retval)
                        goto end_rename;
        }
        /*
         * The re-read is only needed if the directory really did lose its
         * inline-data flag while the destination entry was written.
         */
        if (force_reread)
                force_reread = !ext4_test_inode_flag(new.dir,
                                                     EXT4_INODE_INLINE_DATA);

        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
         */
        old.inode->i_ctime = current_time(old.inode);
        retval = ext4_mark_inode_dirty(handle, old.inode);
        if (unlikely(retval))
                goto end_rename;

        if (!whiteout) {
                /*
                 * ok, that's it
                 *
                 * (With a whiteout the source entry was already repointed at
                 * the whiteout inode above, so there is nothing to delete.)
                 */
                ext4_rename_delete(handle, &old, force_reread);
        }

        /* The replaced target loses the link held by the destination name. */
        if (new.inode) {
                ext4_dec_count(new.inode);
                new.inode->i_ctime = current_time(new.inode);
        }
        old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
        ext4_update_dx_flag(old.dir);
        /*
         * old.dir_bh is only touched in this function through
         * ext4_rename_dir_prepare(), i.e. when the source is a directory.
         */
        if (old.dir_bh) {
                retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
                if (retval)
                        goto end_rename;

                ext4_dec_count(old.dir);
                if (new.inode) {
                        /* checked ext4_empty_dir above, can't have another
                         * parent, ext4_dec_count() won't work for many-linked
                         * dirs */
                        clear_nlink(new.inode);
                } else {
                        ext4_inc_count(new.dir);
                        ext4_update_dx_flag(new.dir);
                        retval = ext4_mark_inode_dirty(handle, new.dir);
                        if (unlikely(retval))
                                goto end_rename;
                }
        }
        retval = ext4_mark_inode_dirty(handle, old.dir);
        if (unlikely(retval))
                goto end_rename;

        if (S_ISDIR(old.inode->i_mode)) {
                /*
                 * We disable fast commits here that's because the
                 * replay code is not yet capable of changing dot dot
                 * dirents in directories.
                 */
                ext4_fc_mark_ineligible(old.inode->i_sb,
                        EXT4_FC_REASON_RENAME_DIR);
        } else {
                if (new.inode)
                        ext4_fc_track_unlink(handle, new.dentry);
                __ext4_fc_track_link(handle, old.inode, new.dentry);
                __ext4_fc_track_unlink(handle, old.inode, old.dentry);
                if (whiteout)
                        __ext4_fc_track_create(handle, whiteout, old.dentry);
        }

        if (new.inode) {
                retval = ext4_mark_inode_dirty(handle, new.inode);
                if (unlikely(retval))
                        goto end_rename;
                /* A target left without links is passed to ext4_orphan_add(). */
                if (!new.inode->i_nlink)
                        ext4_orphan_add(handle, new.inode);
        }
        retval = 0;

end_rename:
        if (whiteout) {
                /*
                 * On failure: point the source entry back at the original
                 * inode and type, drop the whiteout's link and pass it to
                 * ext4_orphan_add().
                 */
                if (retval) {
                        ext4_resetent(handle, &old,
                                      old.inode->i_ino, old_file_type);
                        drop_nlink(whiteout);
                        ext4_orphan_add(handle, whiteout);
                }
                unlock_new_inode(whiteout);
                /* The handle is stopped before the final iput(). */
                ext4_journal_stop(handle);
                iput(whiteout);
        } else {
                ext4_journal_stop(handle);
        }
release_bh:
        brelse(old.dir_bh);
        brelse(old.bh);
        brelse(new.bh);

        return retval;
}

/*
 * Exchange two existing directory entries: afterwards the old name refers to
 * new's inode and the new name refers to old's inode.  ext4_rename2() calls
 * this for RENAME_EXCHANGE.
 *
 * Both names must resolve to entries whose inode number matches the
 * corresponding dentry's inode; otherwise -ENOENT is returned.
 *
 * Returns 0 on success, otherwise an error code (-EXDEV, -ENOENT, -EMLINK,
 * or whatever a helper reported).
 */
static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        handle_t *handle = NULL;
        struct ext4_renament old = {
                .dir = old_dir,
                .dentry = old_dentry,
                .inode = d_inode(old_dentry),
        };
        struct ext4_renament new = {
                .dir = new_dir,
                .dentry = new_dentry,
                .inode = d_inode(new_dentry),
        };
        u8 new_file_type;
        int retval;
        struct timespec64 ctime;

        /*
         * Each inode lands in the other directory, so the project-id check
         * is applied in both directions: a directory with PROJINHERIT set
         * must have the same project id as the inode moving into it.
         */
        if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
             !projid_eq(EXT4_I(new_dir)->i_projid,
                        EXT4_I(old_dentry->d_inode)->i_projid)) ||
            (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
             !projid_eq(EXT4_I(old_dir)->i_projid,
                        EXT4_I(new_dentry->d_inode)->i_projid)))
                return -EXDEV;

        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
        retval = dquot_initialize(new.dir);
        if (retval)
                return retval;

        old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
                                 &old.de, &old.inlined);
        if (IS_ERR(old.bh))
                return PTR_ERR(old.bh);
        /*
         *  Check for inode number is _not_ due to possible IO errors.
         *  We might rmdir the source, keep it as pwd of some process
         *  and merrily kill the link to whatever was created under the
         *  same name. Goodbye sticky bit ;-<
         */
        retval = -ENOENT;
        if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
                goto end_rename;

        new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
                                 &new.de, &new.inlined);
        if (IS_ERR(new.bh)) {
                retval = PTR_ERR(new.bh);
                /* Keep the ERR_PTR away from the brelse() at end_rename. */
                new.bh = NULL;
                goto end_rename;
        }

        /* RENAME_EXCHANGE case: old *and* new must both exist */
        /* (retval is still -ENOENT from above at this point) */
        if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
                goto end_rename;

        handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
                (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
                 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                /* end_rename only stops a non-NULL handle. */
                handle = NULL;
                goto end_rename;
        }

        if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
                ext4_handle_sync(handle);

        if (S_ISDIR(old.inode->i_mode)) {
                old.is_dir = true;
                retval = ext4_rename_dir_prepare(handle, &old);
                if (retval)
                        goto end_rename;
        }
        if (S_ISDIR(new.inode->i_mode)) {
                new.is_dir = true;
                retval = ext4_rename_dir_prepare(handle, &new);
                if (retval)
                        goto end_rename;
        }

        /*
         * Other than the special case of overwriting a directory, parents'
         * nlink only needs to be modified if this is a cross directory rename.
         *
         * That is the case only when exactly one of the two inodes is a
         * directory: the parent giving up the directory gets -1, the parent
         * receiving it gets +1, and the receiving side is checked against
         * the link limit before anything is modified.
         */
        if (old.dir != new.dir && old.is_dir != new.is_dir) {
                old.dir_nlink_delta = old.is_dir ? -1 : 1;
                new.dir_nlink_delta = -old.dir_nlink_delta;
                retval = -EMLINK;
                if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
                    (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
                        goto end_rename;
        }

        /*
         * Save new's file type first: the first ext4_setent() below rewrites
         * the new entry, and the saved value is what the old entry receives.
         */
        new_file_type = new.de->file_type;
        retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
        if (retval)
                goto end_rename;

        retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
        if (retval)
                goto end_rename;

        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
         */
        ctime = current_time(old.inode);
        old.inode->i_ctime = ctime;
        new.inode->i_ctime = ctime;
        retval = ext4_mark_inode_dirty(handle, old.inode);
        if (unlikely(retval))
                goto end_rename;
        retval = ext4_mark_inode_dirty(handle, new.inode);
        if (unlikely(retval))
                goto end_rename;
        ext4_fc_mark_ineligible(new.inode->i_sb,
                                EXT4_FC_REASON_CROSS_RENAME);
        /*
         * dir_bh is only touched in this function through
         * ext4_rename_dir_prepare(), i.e. for a side whose inode is a
         * directory; each such directory is finished with the other
         * side's parent inode number.
         */
        if (old.dir_bh) {
                retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
                if (retval)
                        goto end_rename;
        }
        if (new.dir_bh) {
                retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
                if (retval)
                        goto end_rename;
        }
        /* Apply the parent link-count deltas computed above (if any). */
        ext4_update_dir_count(handle, &old);
        ext4_update_dir_count(handle, &new);
        retval = 0;

end_rename:
        brelse(old.dir_bh);
        brelse(new.dir_bh);
        brelse(old.bh);
        brelse(new.bh);
        if (handle)
                ext4_journal_stop(handle);
        return retval;
}

/*
 * Entry point installed as ->rename in ext4_dir_inode_operations.
 *
 * Rejects the request with -EIO when ext4_forced_shutdown() is true for the
 * filesystem and with -EINVAL for any flag other than RENAME_NOREPLACE,
 * RENAME_EXCHANGE and RENAME_WHITEOUT.  After fscrypt_prepare_rename()
 * succeeds, RENAME_EXCHANGE is handed to ext4_cross_rename() and everything
 * else to ext4_rename().
 */
static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry,
                        unsigned int flags)
{
        const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE |
                                       RENAME_WHITEOUT;
        int err;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
                return -EIO;

        if (flags & ~supported)
                return -EINVAL;

        err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
                                     flags);
        if (err)
                return err;

        if (!(flags & RENAME_EXCHANGE))
                return ext4_rename(old_dir, old_dentry, new_dir, new_dentry,
                                   flags);

        return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry);
}

/*
 * Inode operations table for directories: directories can handle most
 * operations.  Entries are grouped by what they act on; with designated
 * initializers the order carries no meaning.
 */
const struct inode_operations ext4_dir_inode_operations = {
        /* name lookup */
        .lookup = ext4_lookup,

        /* adding names / inodes */
        .create = ext4_create,
        .mknod = ext4_mknod,
        .mkdir = ext4_mkdir,
        .symlink = ext4_symlink,
        .link = ext4_link,
        .tmpfile = ext4_tmpfile,

        /* removing and moving names */
        .unlink = ext4_unlink,
        .rmdir = ext4_rmdir,
        .rename = ext4_rename2,

        /* attributes, xattr listing and ACLs */
        .setattr = ext4_setattr,
        .getattr = ext4_getattr,
        .listxattr = ext4_listxattr,
        .get_acl = ext4_get_acl,
        .set_acl = ext4_set_acl,

        /* extent mapping */
        .fiemap = ext4_fiemap,
};

/*
 * Inode operations table for special inodes; ext4_whiteout_for_rename()
 * installs it on the whiteout inode it creates.  Only attribute, xattr
 * listing and ACL callbacks are provided.
 */
const struct inode_operations ext4_special_inode_operations = {
        .getattr = ext4_getattr,
        .setattr = ext4_setattr,
        .listxattr = ext4_listxattr,
        .set_acl = ext4_set_acl,
        .get_acl = ext4_get_acl,
};

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Portions of this file
* Copyright(c) 2016-2017 Intel Deutschland GmbH
* Copyright (C) 2018 - 2019 Intel Corporation
*/

#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __MAC80211_DRIVER_TRACE

#include <linux/tracepoint.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mac80211

/*
 * Per-device fields shared by every event: the wiphy name, stored in a
 * fixed buffer of MAXNAME bytes.
 *
 * LOCAL_ASSIGN expects a variable named "local" at the expansion site.
 * The buffer in LOCAL_ENTRY is sized with MAXNAME so that the declaration
 * and the bounded copy cannot drift apart.  The copy uses strscpy() instead
 * of the deprecated strlcpy(): the return value is not used, and strscpy()
 * always NUL-terminates without reading the source past the bound.
 */
#define MAXNAME                32
#define LOCAL_ENTRY        __array(char, wiphy_name, MAXNAME)
#define LOCAL_ASSIGN        strscpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME)
#define LOCAL_PR_FMT        "%s"
#define LOCAL_PR_ARG        __entry->wiphy_name

/*
 * Station address fields (ETH_ALEN bytes, printed with %pM).
 *
 * STA_ASSIGN reads a variable named "sta" at the expansion site and stores
 * an all-zero address (eth_zero_addr) when that pointer is NULL.
 * STA_NAMED_ASSIGN(s) copies from an explicit pointer and performs no NULL
 * check.
 */
#define STA_ENTRY        __array(char, sta_addr, ETH_ALEN)
#define STA_ASSIGN        (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \
                                eth_zero_addr(__entry->sta_addr))
#define STA_NAMED_ASSIGN(s)        memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN)
#define STA_PR_FMT        " sta:%pM"
#define STA_PR_ARG        __entry->sta_addr

/*
 * Virtual-interface fields.  VIF_ENTRY and VIF_ASSIGN both reference a
 * variable named "sdata" at the expansion site (dereferenced without a NULL
 * check).  The record keeps the interface type, the sdata pointer value,
 * the p2p flag and the interface name as a dynamic string.
 *
 * Output looks like " vif:<name>(<type>[/p2p])"; the sdata pointer is
 * recorded but is not part of VIF_PR_ARG.
 */
#define VIF_ENTRY        __field(enum nl80211_iftype, vif_type) __field(void *, sdata)        \
                        __field(bool, p2p)                                                \
                        __string(vif_name, sdata->name)
#define VIF_ASSIGN        __entry->vif_type = sdata->vif.type; __entry->sdata = sdata;        \
                        __entry->p2p = sdata->vif.p2p;                                        \
                        __assign_str(vif_name, sdata->name)
#define VIF_PR_FMT        " vif:%s(%d%s)"
#define VIF_PR_ARG        __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""

/*
 * Channel-definition fields.
 *
 * CHANDEF_ASSIGN(c) accepts a NULL chandef pointer and a NULL c->chan; in
 * either case the affected fields are stored as 0.  Note that the macro
 * body already ends in a semicolon, so users may expand it without one.
 *
 * The control and center1 frequencies are printed as "<freq>.<offset> MHz"
 * with the offset zero-padded to three digits.
 */
#define CHANDEF_ENTRY        __field(u32, control_freq)                                        \
                        __field(u32, freq_offset)                                        \
                        __field(u32, chan_width)                                        \
                        __field(u32, center_freq1)                                        \
                        __field(u32, freq1_offset)                                        \
                        __field(u32, center_freq2)
#define CHANDEF_ASSIGN(c)                                                        \
                        __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0;        \
                        __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0;        \
                        __entry->chan_width = (c) ? (c)->width : 0;                        \
                        __entry->center_freq1 = (c) ? (c)->center_freq1 : 0;                \
                        __entry->freq1_offset = (c) ? (c)->freq1_offset : 0;                \
                        __entry->center_freq2 = (c) ? (c)->center_freq2 : 0;
#define CHANDEF_PR_FMT        " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz"
#define CHANDEF_PR_ARG        __entry->control_freq, __entry->freq_offset, __entry->chan_width, \
                        __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2

/*
 * "min_"-prefixed twin of the CHANDEF_* fields, so that one record can hold
 * two channel definitions side by side.
 *
 * Unlike CHANDEF_ASSIGN, MIN_CHANDEF_ASSIGN(c) does not tolerate a NULL c:
 * only c->chan is checked before being dereferenced.  The macro body ends
 * in a semicolon.
 */
#define MIN_CHANDEF_ENTRY                                                                \
                        __field(u32, min_control_freq)                                        \
                        __field(u32, min_freq_offset)                                        \
                        __field(u32, min_chan_width)                                        \
                        __field(u32, min_center_freq1)                                        \
                        __field(u32, min_freq1_offset)                                        \
                        __field(u32, min_center_freq2)

#define MIN_CHANDEF_ASSIGN(c)                                                                \
                        __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0;        \
                        __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0;        \
                        __entry->min_chan_width = (c)->width;                                \
                        __entry->min_center_freq1 = (c)->center_freq1;                        \
                        __entry->min_freq1_offset = (c)->freq1_offset;                        \
                        __entry->min_center_freq2 = (c)->center_freq2;
#define MIN_CHANDEF_PR_FMT        " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz"
#define MIN_CHANDEF_PR_ARG        __entry->min_control_freq, __entry->min_freq_offset,        \
                        __entry->min_chan_width,                                        \
                        __entry->min_center_freq1, __entry->min_freq1_offset,                \
                        __entry->min_center_freq2

/*
 * Channel-context fields: ctx->conf.def (via CHANDEF_*), ctx->conf.min_def
 * (via MIN_CHANDEF_*) and the static/dynamic RX chain counts.
 *
 * CHANCTX_ASSIGN expects a variable named "ctx" at the expansion site and
 * dereferences it unconditionally.
 */
#define CHANCTX_ENTRY        CHANDEF_ENTRY                                                        \
                        MIN_CHANDEF_ENTRY                                                \
                        __field(u8, rx_chains_static)                                        \
                        __field(u8, rx_chains_dynamic)
#define CHANCTX_ASSIGN        CHANDEF_ASSIGN(&ctx->conf.def)                                        \
                        MIN_CHANDEF_ASSIGN(&ctx->conf.min_def)                                \
                        __entry->rx_chains_static = ctx->conf.rx_chains_static;                \
                        __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic
#define CHANCTX_PR_FMT        CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT " chains:%d/%d"
#define CHANCTX_PR_ARG        CHANDEF_PR_ARG,        MIN_CHANDEF_PR_ARG,                                \
                        __entry->rx_chains_static, __entry->rx_chains_dynamic

/*
 * Key-configuration fields: cipher, flags, key index and hardware key
 * index.  KEY_ASSIGN(k) dereferences k without a NULL check; its body ends
 * in a semicolon.
 */
#define KEY_ENTRY        __field(u32, cipher)                                                \
                        __field(u8, hw_key_idx)                                                \
                        __field(u8, flags)                                                \
                        __field(s8, keyidx)
#define KEY_ASSIGN(k)        __entry->cipher = (k)->cipher;                                        \
                        __entry->flags = (k)->flags;                                        \
                        __entry->keyidx = (k)->keyidx;                                        \
                        __entry->hw_key_idx = (k)->hw_key_idx;
#define KEY_PR_FMT        " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
#define KEY_PR_ARG        __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx

/*
 * A-MPDU action fields.  AMPDU_ACTION_ASSIGN expects a variable named
 * "params" at the expansion site and dereferences both it and params->sta
 * without a NULL check (the station address goes through STA_NAMED_ASSIGN).
 *
 * The record layout carries the action twice: once in the enum-typed field
 * "ieee80211_ampdu_mlme_action" and once in the u16 field "action".  Both
 * are filled from params->action so that no field of the record is left
 * holding stale ring-buffer contents.  Only "action" is printed.
 */
#define AMPDU_ACTION_ENTRY        __field(enum ieee80211_ampdu_mlme_action,                \
                                        ieee80211_ampdu_mlme_action)                        \
                                STA_ENTRY                                                \
                                __field(u16, tid)                                        \
                                __field(u16, ssn)                                        \
                                __field(u16, buf_size)                                        \
                                __field(bool, amsdu)                                        \
                                __field(u16, timeout)                                        \
                                __field(u16, action)
#define AMPDU_ACTION_ASSIGN        STA_NAMED_ASSIGN(params->sta);                                \
                                __entry->ieee80211_ampdu_mlme_action = params->action;        \
                                __entry->tid = params->tid;                                \
                                __entry->ssn = params->ssn;                                \
                                __entry->buf_size = params->buf_size;                        \
                                __entry->amsdu = params->amsdu;                                \
                                __entry->timeout = params->timeout;                        \
                                __entry->action = params->action;
#define AMPDU_ACTION_PR_FMT        STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d action %d"
#define AMPDU_ACTION_PR_ARG        STA_PR_ARG, __entry->tid, __entry->ssn,                        \
                                __entry->buf_size, __entry->amsdu, __entry->timeout,        \
                                __entry->action

/*
 * Tracing for driver callbacks.
 */

/*
 * Event class for events that carry nothing beyond the device itself:
 * the record holds only the wiphy name, which is also all that is printed.
 */
DECLARE_EVENT_CLASS(local_only_evt,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local),
        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
        ),
        TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
);

/*
 * Event class recording the wiphy name, the VIF_* fields of @sdata and a
 * copy of the interface address (sdata->vif.addr), printed with %pM.
 */
DECLARE_EVENT_CLASS(local_sdata_addr_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(char, addr, ETH_ALEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->addr, sdata->vif.addr, ETH_ALEN);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " addr:%pM",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr
        )
);

/*
 * Event class recording the wiphy name plus a single u32 argument.
 *
 * NOTE(review): the u32 is printed with "%d", so a value above INT_MAX is
 * shown as a negative number in the formatted output.
 */
DECLARE_EVENT_CLASS(local_u32_evt,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, value)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->value = value;
        ),

        TP_printk(
                LOCAL_PR_FMT " value:%d",
                LOCAL_PR_ARG, __entry->value
        )
);

/*
 * Event class recording the wiphy name and the VIF_* fields of @sdata,
 * with no additional payload.
 */
DECLARE_EVENT_CLASS(local_sdata_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/* drv_return_void: local_only_evt instance (wiphy name only, no value). */
DEFINE_EVENT(local_only_evt, drv_return_void,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/* drv_return_int: records the wiphy name and an int @ret, printed as "%d". */
TRACE_EVENT(drv_return_int,
        TP_PROTO(struct ieee80211_local *local, int ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret)
);

/*
 * drv_return_bool: records the wiphy name and a bool @ret, printed as the
 * literal text "true" or "false".
 */
TRACE_EVENT(drv_return_bool,
        TP_PROTO(struct ieee80211_local *local, bool ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ?
                  "true" : "false")
);

/* drv_return_u32: records the wiphy name and a u32 @ret, printed as "%u". */
TRACE_EVENT(drv_return_u32,
        TP_PROTO(struct ieee80211_local *local, u32 ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret)
);

/* drv_return_u64: records the wiphy name and a u64 @ret, printed as "%llu". */
TRACE_EVENT(drv_return_u64,
        TP_PROTO(struct ieee80211_local *local, u64 ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u64, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret)
);

/* drv_start: local_only_evt instance (wiphy name only). */
DEFINE_EVENT(local_only_evt, drv_start,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/* drv_get_et_strings: local_u32_evt instance; the u32 payload is @sset. */
DEFINE_EVENT(local_u32_evt, drv_get_et_strings,
             TP_PROTO(struct ieee80211_local *local, u32 sset),
             TP_ARGS(local, sset)
);

/* drv_get_et_sset_count: local_u32_evt instance; the u32 payload is @sset. */
DEFINE_EVENT(local_u32_evt, drv_get_et_sset_count,
             TP_PROTO(struct ieee80211_local *local, u32 sset),
             TP_ARGS(local, sset)
);

/* drv_get_et_stats: local_only_evt instance (wiphy name only). */
DEFINE_EVENT(local_only_evt, drv_get_et_stats,
             TP_PROTO(struct ieee80211_local *local),
             TP_ARGS(local)
);

/* drv_suspend: local_only_evt instance (wiphy name only). */
DEFINE_EVENT(local_only_evt, drv_suspend,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/* drv_resume: local_only_evt instance (wiphy name only). */
DEFINE_EVENT(local_only_evt, drv_resume,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_set_wakeup: records the wiphy name and the bool @enabled, printed
 * numerically as " enabled:%d".
 */
TRACE_EVENT(drv_set_wakeup,
        TP_PROTO(struct ieee80211_local *local, bool enabled),
        TP_ARGS(local, enabled),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(LOCAL_PR_FMT " enabled:%d", LOCAL_PR_ARG, __entry->enabled)
);

/* drv_stop: local_only_evt instance (wiphy name only). */
DEFINE_EVENT(local_only_evt, drv_stop,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_add_interface: local_sdata_addr_evt instance (wiphy, vif fields and
 * the interface address).
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_change_interface: records the interface as it currently is (via
 * VIF_ASSIGN on @sdata) together with the @type and @p2p arguments, which
 * are printed after it as " new type:<type>[/p2p]".
 */
TRACE_EVENT(drv_change_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 enum nl80211_iftype type, bool p2p),

        TP_ARGS(local, sdata, type, p2p),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, new_type)
                __field(bool, new_p2p)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->new_type = type;
                __entry->new_p2p = p2p;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " new type:%d%s",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
                __entry->new_p2p ? "/p2p" : ""
        )
);

/*
 * drv_remove_interface: local_sdata_addr_evt instance (wiphy, vif fields
 * and the interface address).
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_config: records the @changed mask together with a snapshot of
 * local->hw.conf (flags, power level, dynamic PS timeout, listen interval,
 * retry counts, channel definition and SMPS mode).
 *
 * Only the wiphy name, the changed mask and the channel definition are
 * part of the printed line; the remaining fields exist in the binary
 * record only.
 */
TRACE_EVENT(drv_config,
        TP_PROTO(struct ieee80211_local *local,
                 u32 changed),

        TP_ARGS(local, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, changed)
                __field(u32, flags)
                __field(int, power_level)
                __field(int, dynamic_ps_timeout)
                __field(u16, listen_interval)
                __field(u8, long_frame_max_tx_count)
                __field(u8, short_frame_max_tx_count)
                CHANDEF_ENTRY
                __field(int, smps)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->changed = changed;
                __entry->flags = local->hw.conf.flags;
                __entry->power_level = local->hw.conf.power_level;
                __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
                __entry->listen_interval = local->hw.conf.listen_interval;
                __entry->long_frame_max_tx_count =
                        local->hw.conf.long_frame_max_tx_count;
                __entry->short_frame_max_tx_count =
                        local->hw.conf.short_frame_max_tx_count;
                /* no trailing ';' needed: CHANDEF_ASSIGN ends with one */
                CHANDEF_ASSIGN(&local->hw.conf.chandef)
                __entry->smps = local->hw.conf.smps_mode;
        ),

        TP_printk(
                LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT,
                LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG
        )
);

/*
 * drv_bss_info_changed: records the @changed mask and a wide snapshot of
 * *@info for the interface @sdata.
 *
 * Two fields are variable-length:
 *  - arp_addr_list holds at most IEEE80211_BSS_ARP_ADDR_LIST_LEN entries;
 *    the same clamp is used when sizing the array and when copying into it,
 *    while arp_addr_cnt stores the unclamped info->arp_addr_cnt;
 *  - ssid holds info->ssid_len bytes copied from info->ssid.
 *
 * Only the wiphy name, the vif fields and the changed mask are printed;
 * everything else is available in the binary record only.
 */
TRACE_EVENT(drv_bss_info_changed,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *info,
                 u32 changed),

        TP_ARGS(local, sdata, info, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, changed)
                __field(bool, assoc)
                __field(bool, ibss_joined)
                __field(bool, ibss_creator)
                __field(u16, aid)
                __field(bool, cts)
                __field(bool, shortpre)
                __field(bool, shortslot)
                __field(bool, enable_beacon)
                __field(u8, dtimper)
                __field(u16, bcnint)
                __field(u16, assoc_cap)
                __field(u64, sync_tsf)
                __field(u32, sync_device_ts)
                __field(u8, sync_dtim_count)
                __field(u32, basic_rates)
                __array(int, mcast_rate, NUM_NL80211_BANDS)
                __field(u16, ht_operation_mode)
                __field(s32, cqm_rssi_thold)
                __field(s32, cqm_rssi_hyst)
                __field(u32, channel_width)
                __field(u32, channel_cfreq1)
                __field(u32, channel_cfreq1_offset)
                /* element count clamped to the list capacity */
                __dynamic_array(u32, arp_addr_list,
                                info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
                                        IEEE80211_BSS_ARP_ADDR_LIST_LEN :
                                        info->arp_addr_cnt)
                __field(int, arp_addr_cnt)
                __field(bool, qos)
                __field(bool, idle)
                __field(bool, ps)
                __dynamic_array(u8, ssid, info->ssid_len)
                __field(bool, hidden_ssid)
                __field(int, txpower)
                __field(u8, p2p_oppps_ctwindow)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->changed = changed;
                __entry->aid = info->aid;
                __entry->assoc = info->assoc;
                __entry->ibss_joined = info->ibss_joined;
                __entry->ibss_creator = info->ibss_creator;
                __entry->shortpre = info->use_short_preamble;
                __entry->cts = info->use_cts_prot;
                __entry->shortslot = info->use_short_slot;
                __entry->enable_beacon = info->enable_beacon;
                __entry->dtimper = info->dtim_period;
                __entry->bcnint = info->beacon_int;
                __entry->assoc_cap = info->assoc_capability;
                __entry->sync_tsf = info->sync_tsf;
                __entry->sync_device_ts = info->sync_device_ts;
                __entry->sync_dtim_count = info->sync_dtim_count;
                __entry->basic_rates = info->basic_rates;
                memcpy(__entry->mcast_rate, info->mcast_rate,
                       sizeof(__entry->mcast_rate));
                __entry->ht_operation_mode = info->ht_operation_mode;
                __entry->cqm_rssi_thold = info->cqm_rssi_thold;
                __entry->cqm_rssi_hyst = info->cqm_rssi_hyst;
                __entry->channel_width = info->chandef.width;
                __entry->channel_cfreq1 = info->chandef.center_freq1;
                __entry->channel_cfreq1_offset = info->chandef.freq1_offset;
                /* raw count, not clamped like the array below */
                __entry->arp_addr_cnt = info->arp_addr_cnt;
                /* copy length must match the clamp used in __dynamic_array */
                memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list,
                       sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
                                        IEEE80211_BSS_ARP_ADDR_LIST_LEN :
                                        info->arp_addr_cnt));
                __entry->qos = info->qos;
                __entry->idle = info->idle;
                __entry->ps = info->ps;
                memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
                __entry->hidden_ssid = info->hidden_ssid;
                __entry->txpower = info->txpower;
                __entry->p2p_oppps_ctwindow = info->p2p_noa_attr.oppps_ctwindow;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " changed:%#x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
        )
);

/*
 * drv_prepare_multicast: records the wiphy name and the int @mc_count,
 * printed as " prepare mc (<count>)".
 */
TRACE_EVENT(drv_prepare_multicast,
        TP_PROTO(struct ieee80211_local *local, int mc_count),

        TP_ARGS(local, mc_count),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, mc_count)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->mc_count = mc_count;
        ),

        TP_printk(
                LOCAL_PR_FMT " prepare mc (%d)",
                LOCAL_PR_ARG, __entry->mc_count
        )
);

/*
 * drv_configure_filter: records @changed_flags, the value @total_flags
 * points to at the time of the event, and @multicast.
 *
 * @total_flags is dereferenced without a NULL check.  @multicast is stored
 * in the record but is not part of the printed line.
 */
TRACE_EVENT(drv_configure_filter,
        TP_PROTO(struct ieee80211_local *local,
                 unsigned int changed_flags,
                 unsigned int *total_flags,
                 u64 multicast),

        TP_ARGS(local, changed_flags, total_flags, multicast),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(unsigned int, changed)
                __field(unsigned int, total)
                __field(u64, multicast)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->changed = changed_flags;
                __entry->total = *total_flags;
                __entry->multicast = multicast;
        ),

        TP_printk(
                LOCAL_PR_FMT " changed:%#x total:%#x",
                LOCAL_PR_ARG, __entry->changed, __entry->total
        )
);

/*
 * drv_config_iface_filter: records the wiphy name, the vif fields of
 * @sdata, and the @filter_flags / @changed_flags arguments (both printed
 * in hex).
 */
TRACE_EVENT(drv_config_iface_filter,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 unsigned int filter_flags,
                 unsigned int changed_flags),

        TP_ARGS(local, sdata, filter_flags, changed_flags),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(unsigned int, filter_flags)
                __field(unsigned int, changed_flags)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->filter_flags = filter_flags;
                __entry->changed_flags = changed_flags;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT
                " filter_flags: %#x changed_flags: %#x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags,
                __entry->changed_flags
        )
);

/*
 * drv_set_tim: records the wiphy name, the station address (all-zero if
 * @sta is NULL, per STA_ASSIGN) and the bool @set, printed as " set:%d".
 */
TRACE_EVENT(drv_set_tim,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta, bool set),

        TP_ARGS(local, sta, set),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(bool, set)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->set = set;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " set:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->set
        )
);

/*
 * drv_set_key: records the wiphy name, the vif fields of @sdata, the
 * station address (all-zero if @sta is NULL) and the KEY_* fields of @key.
 *
 * @cmd is part of the prototype but is neither stored nor printed.  @key
 * is dereferenced without a NULL check (KEY_ASSIGN).
 */
TRACE_EVENT(drv_set_key,
        TP_PROTO(struct ieee80211_local *local,
                 enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 struct ieee80211_key_conf *key),

        TP_ARGS(local, cmd, sdata, sta, key),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                KEY_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                KEY_ASSIGN(key);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT KEY_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, KEY_PR_ARG
        )
);

/*
 * drv_update_tkip_key: records the wiphy name, the vif fields of @sdata,
 * the station address (all-zero if @sta is NULL) and @iv32 (printed in
 * hex).  @conf is part of the prototype but is not recorded.
 */
TRACE_EVENT(drv_update_tkip_key,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_key_conf *conf,
                 struct ieee80211_sta *sta, u32 iv32),

        TP_ARGS(local, sdata, conf, sta, iv32),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, iv32)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->iv32 = iv32;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->iv32
        )
);

/* drv_hw_scan: local_sdata_evt instance (wiphy name and vif fields). */
DEFINE_EVENT(local_sdata_evt, drv_hw_scan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/* drv_cancel_hw_scan: local_sdata_evt instance (wiphy name and vif fields). */
DEFINE_EVENT(local_sdata_evt, drv_cancel_hw_scan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/* drv_sched_scan_start: local_sdata_evt instance (wiphy name and vif fields). */
DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/* drv_sched_scan_stop: local_sdata_evt instance (wiphy name and vif fields). */
DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_sw_scan_start: records the wiphy name, the vif fields of @sdata and
 * ETH_ALEN bytes copied from @mac_addr (no NULL check), printed with %pM.
 *
 * Unlike the other events here, the printed fields are separated by ", ".
 */
TRACE_EVENT(drv_sw_scan_start,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const u8 *mac_addr),

        TP_ARGS(local, sdata, mac_addr),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(char, mac_addr, ETH_ALEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->mac_addr, mac_addr, ETH_ALEN);
        ),

        TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT ", addr:%pM",
                  LOCAL_PR_ARG, VIF_PR_ARG, __entry->mac_addr)
);

/* drv_sw_scan_complete: local_sdata_evt instance (wiphy name and vif fields). */
DEFINE_EVENT(local_sdata_evt, drv_sw_scan_complete,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_get_stats: records @ret and four counters read from *@stats
 * (ACK failures, RTS failures, FCS errors, RTS successes).
 *
 * @stats is dereferenced without a NULL check.  Only the wiphy name and
 * @ret are printed; the counters exist in the binary record only.
 */
TRACE_EVENT(drv_get_stats,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_low_level_stats *stats,
                 int ret),

        TP_ARGS(local, stats, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, ret)
                __field(unsigned int, ackfail)
                __field(unsigned int, rtsfail)
                __field(unsigned int, fcserr)
                __field(unsigned int, rtssucc)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
                __entry->ackfail = stats->dot11ACKFailureCount;
                __entry->rtsfail = stats->dot11RTSFailureCount;
                __entry->fcserr = stats->dot11FCSErrorCount;
                __entry->rtssucc = stats->dot11RTSSuccessCount;
        ),

        TP_printk(
                LOCAL_PR_FMT " ret:%d",
                LOCAL_PR_ARG, __entry->ret
        )
);

/*
 * drv_get_key_seq: records the wiphy name and the KEY_* fields of @key
 * (dereferenced without a NULL check).
 */
TRACE_EVENT(drv_get_key_seq,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_key_conf *key),

        TP_ARGS(local, key),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                KEY_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                KEY_ASSIGN(key);
        ),

        TP_printk(
                LOCAL_PR_FMT KEY_PR_FMT,
                LOCAL_PR_ARG, KEY_PR_ARG
        )
);

/* drv_set_frag_threshold: local_u32_evt instance; the u32 payload is @value. */
DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value)
);

/* drv_set_rts_threshold: local_u32_evt instance; the u32 payload is @value. */
DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value)
);

/*
 * drv_set_coverage_class: records the wiphy name and the signed 16-bit
 * @value, printed as " value:%d".  A dedicated event is needed because
 * local_u32_evt stores an unsigned 32-bit value.
 */
TRACE_EVENT(drv_set_coverage_class,
        TP_PROTO(struct ieee80211_local *local, s16 value),

        TP_ARGS(local, value),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(s16, value)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->value = value;
        ),

        TP_printk(
                LOCAL_PR_FMT " value:%d",
                LOCAL_PR_ARG, __entry->value
        )
);

/*
 * drv_sta_notify: records the wiphy name, the vif fields of @sdata, the
 * station address (all-zero if @sta is NULL) and @cmd stored as a u32 and
 * printed numerically.
 */
TRACE_EVENT(drv_sta_notify,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 enum sta_notify_cmd cmd,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, cmd, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, cmd)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->cmd = cmd;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " cmd:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd
        )
);

/*
 * drv_sta_state: records the wiphy name, the vif fields of @sdata, the
 * station address (all-zero if @sta is NULL) and the @old_state /
 * @new_state pair, printed numerically as " state: <old>-><new>".
 */
TRACE_EVENT(drv_sta_state,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 enum ieee80211_sta_state old_state,
                 enum ieee80211_sta_state new_state),

        TP_ARGS(local, sdata, sta, old_state, new_state),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, old_state)
                __field(u32, new_state)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->old_state = old_state;
                __entry->new_state = new_state;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " state: %d->%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
                __entry->old_state, __entry->new_state
        )
);

/*
 * drv_sta_set_txpwr: records the wiphy name, the vif fields of @sdata, the
 * station address, and the station's txpwr.power / txpwr.type values.
 *
 * NOTE(review): STA_ASSIGN tolerates a NULL @sta, but the two txpwr reads
 * below dereference @sta unconditionally; presumably callers always pass a
 * valid station -- confirm against the call site.
 */
TRACE_EVENT(drv_sta_set_txpwr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(s16, txpwr)
                __field(u8, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->txpwr = sta->txpwr.power;
                __entry->type = sta->txpwr.type;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " txpwr: %d type %d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
                __entry->txpwr,  __entry->type
        )
);

/*
 * drv_sta_rc_update: records the wiphy name, the vif fields of @sdata, the
 * station address (all-zero if @sta is NULL) and the @changed mask,
 * printed in hex.
 */
TRACE_EVENT(drv_sta_rc_update,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 u32 changed),

        TP_ARGS(local, sdata, sta, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, changed)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->changed = changed;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " changed: 0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed
        )
);

/*
 * Event class for per-station events with no extra payload: records the
 * wiphy name, the vif fields of @sdata and the station address (all-zero
 * if @sta is NULL, per STA_ASSIGN).
 */
DECLARE_EVENT_CLASS(sta_event,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
        )
);

/* drv_sta_statistics - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sta_statistics,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sta_add - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sta_add,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sta_remove - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sta_remove,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sta_pre_rcu_remove - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sync_rx_queues - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sync_rx_queues,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sta_rate_tbl_update - instance of the sta_event class (local, sdata, sta). */
DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_conf_tx - tracepoint taking (local, sdata, ac, params).
 *
 * Records the LOCAL/VIF entries, @ac, and the txop, cw_min, cw_max, aifs
 * and uapsd members of @params.  @params is dereferenced unconditionally.
 * Only the AC value appears in the formatted output; the remaining fields
 * are stored in the record but not printed.
 */
TRACE_EVENT(drv_conf_tx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 ac, const struct ieee80211_tx_queue_params *params),

        TP_ARGS(local, sdata, ac, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u16, ac)
                __field(u16, txop)
                __field(u16, cw_min)
                __field(u16, cw_max)
                __field(u8, aifs)
                __field(bool, uapsd)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->ac = ac;
                __entry->txop = params->txop;
                __entry->cw_max = params->cw_max;
                __entry->cw_min = params->cw_min;
                __entry->aifs = params->aifs;
                __entry->uapsd = params->uapsd;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " AC:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->ac
        )
);

/*
 * drv_get_tsf - instance of the local_sdata_evt class (local, sdata).
 * The class itself is declared outside this section.
 */
DEFINE_EVENT(local_sdata_evt, drv_get_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_set_tsf - tracepoint taking (local, sdata, tsf).
 *
 * Records the LOCAL/VIF entries plus the u64 @tsf, printed as an unsigned
 * decimal ("%llu", with an explicit cast to unsigned long long).
 */
TRACE_EVENT(drv_set_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u64 tsf),

        TP_ARGS(local, sdata, tsf),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u64, tsf)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->tsf = tsf;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " tsf:%llu",
                LOCAL_PR_ARG, VIF_PR_ARG, (unsigned long long)__entry->tsf
        )
);

/*
 * drv_offset_tsf - tracepoint taking (local, sdata, offset).
 *
 * Records the LOCAL/VIF entries plus the signed 64-bit @offset (stored as
 * tsf_offset) and prints it as a signed decimal.
 */
TRACE_EVENT(drv_offset_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 s64 offset),

        TP_ARGS(local, sdata, offset),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(s64, tsf_offset)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->tsf_offset = offset;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " tsf offset:%lld",
                LOCAL_PR_ARG, VIF_PR_ARG,
                /* "%lld" expects a signed argument, so cast to long long */
                (long long)__entry->tsf_offset
        )
);

/*
 * drv_reset_tsf - instance of the local_sdata_evt class (local, sdata).
 * The class itself is declared outside this section.
 */
DEFINE_EVENT(local_sdata_evt, drv_reset_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_tx_last_beacon - instance of the local_only_evt class (local).
 * The class itself is declared outside this section.
 */
DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_ampdu_action - tracepoint taking (local, sdata, params).
 *
 * All recorded fields, assignments and output formatting come from the
 * LOCAL_*, VIF_* and AMPDU_ACTION_* helper macros, which are defined
 * outside this section; which members of @params they read is not
 * visible here.
 */
TRACE_EVENT(drv_ampdu_action,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_ampdu_params *params),

        TP_ARGS(local, sdata, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                AMPDU_ACTION_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                AMPDU_ACTION_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG
        )
);

/*
 * drv_get_survey - tracepoint taking (local, _idx, survey).
 *
 * Records the LOCAL entry and the integer index.  @survey is part of the
 * prototype but is neither read nor recorded by this tracepoint.
 */
TRACE_EVENT(drv_get_survey,
        TP_PROTO(struct ieee80211_local *local, int _idx,
                 struct survey_info *survey),

        TP_ARGS(local, _idx, survey),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, idx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->idx = _idx;
        ),

        TP_printk(
                LOCAL_PR_FMT " idx:%d",
                LOCAL_PR_ARG, __entry->idx
        )
);

/*
 * drv_flush - tracepoint taking (local, queues, drop).
 *
 * Records the LOCAL entry, the u32 @queues value (printed in hex) and the
 * boolean @drop (printed as an integer).
 */
TRACE_EVENT(drv_flush,
        TP_PROTO(struct ieee80211_local *local,
                 u32 queues, bool drop),

        TP_ARGS(local, queues, drop),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, drop)
                __field(u32, queues)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->drop = drop;
                __entry->queues = queues;
        ),

        TP_printk(
                LOCAL_PR_FMT " queues:0x%x drop:%d",
                LOCAL_PR_ARG, __entry->queues, __entry->drop
        )
);

/*
 * drv_channel_switch - tracepoint taking (local, sdata, ch_switch).
 *
 * Records the LOCAL/VIF entries, the channel definition taken from
 * &ch_switch->chandef (via the CHANDEF_* helper macros, defined outside
 * this section), and the timestamp, device_timestamp, block_tx and count
 * members of @ch_switch.  The formatted output shows the new chandef and
 * count; timestamp, device_timestamp and block_tx are stored in the
 * record but not printed.
 */
TRACE_EVENT(drv_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),

        TP_ARGS(local, sdata, ch_switch),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
                __field(u64, timestamp)
                __field(u32, device_timestamp)
                __field(bool, block_tx)
                __field(u8, count)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                /*
                 * No trailing ';' here, unlike the other *_ASSIGN uses.
                 * NOTE(review): presumably the macro expansion terminates
                 * its own statements -- confirm against its definition.
                 */
                CHANDEF_ASSIGN(&ch_switch->chandef)
                __entry->timestamp = ch_switch->timestamp;
                __entry->device_timestamp = ch_switch->device_timestamp;
                __entry->block_tx = ch_switch->block_tx;
                __entry->count = ch_switch->count;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " new " CHANDEF_PR_FMT " count:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count
        )
);

/*
 * drv_set_antenna - tracepoint taking (local, tx_ant, rx_ant, ret).
 *
 * Records the LOCAL entry, both u32 antenna values and the integer @ret.
 * NOTE(review): tx_ant/rx_ant are u32 but are printed with "%d", so values
 * above INT_MAX would show as negative -- confirm whether that matters.
 */
TRACE_EVENT(drv_set_antenna,
        TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),

        TP_ARGS(local, tx_ant, rx_ant, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx_ant)
                __field(u32, rx_ant)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx_ant = tx_ant;
                __entry->rx_ant = rx_ant;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
                LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
        )
);

/*
 * drv_get_antenna - tracepoint taking (local, tx_ant, rx_ant, ret).
 *
 * Same record layout and output format as drv_set_antenna: the LOCAL
 * entry, both u32 antenna values and the integer @ret.
 * NOTE(review): tx_ant/rx_ant are u32 but are printed with "%d", so values
 * above INT_MAX would show as negative -- confirm whether that matters.
 */
TRACE_EVENT(drv_get_antenna,
        TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),

        TP_ARGS(local, tx_ant, rx_ant, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx_ant)
                __field(u32, rx_ant)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx_ant = tx_ant;
                __entry->rx_ant = rx_ant;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
                LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
        )
);

/*
 * drv_remain_on_channel - tracepoint taking
 * (local, sdata, chan, duration, type).
 *
 * Records the LOCAL/VIF entries, chan->center_freq, chan->freq_offset,
 * @duration and @type (the enum is stored in a u32 field).  @chan is
 * dereferenced unconditionally.  The frequency is printed as
 * "<center_freq>.<freq_offset>MHz" with freq_offset zero-padded to three
 * digits.
 */
TRACE_EVENT(drv_remain_on_channel,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel *chan,
                 unsigned int duration,
                 enum ieee80211_roc_type type),

        TP_ARGS(local, sdata, chan, duration, type),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, center_freq)
                __field(int, freq_offset)
                __field(unsigned int, duration)
                __field(u32, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->center_freq = chan->center_freq;
                __entry->freq_offset = chan->freq_offset;
                __entry->duration = duration;
                __entry->type = type;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d",
                LOCAL_PR_ARG, VIF_PR_ARG,
                __entry->center_freq, __entry->freq_offset,
                __entry->duration, __entry->type
        )
);

/*
 * drv_cancel_remain_on_channel - instance of the local_sdata_evt class
 * (local, sdata).  The class itself is declared outside this section.
 */
DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_set_ringparam - tracepoint taking (local, tx, rx).
 *
 * Records the LOCAL entry and the two u32 values @tx and @rx, both printed
 * with "%d".
 */
TRACE_EVENT(drv_set_ringparam,
        TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx),

        TP_ARGS(local, tx, rx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx)
                __field(u32, rx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx = tx;
                __entry->rx = rx;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx:%d rx %d",
                LOCAL_PR_ARG, __entry->tx, __entry->rx
        )
);

/*
 * drv_get_ringparam - tracepoint taking (local, tx, tx_max, rx, rx_max).
 *
 * The four u32 arguments are passed by pointer and each is dereferenced
 * unconditionally when the event is recorded; the pointed-to values (not
 * the pointers) are stored alongside the LOCAL entry.
 */
TRACE_EVENT(drv_get_ringparam,
        TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max,
                 u32 *rx, u32 *rx_max),

        TP_ARGS(local, tx, tx_max, rx, rx_max),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx)
                __field(u32, tx_max)
                __field(u32, rx)
                __field(u32, rx_max)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx = *tx;
                __entry->tx_max = *tx_max;
                __entry->rx = *rx;
                __entry->rx_max = *rx_max;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx:%d tx_max %d rx %d rx_max %d",
                LOCAL_PR_ARG,
                __entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max
        )
);

/*
 * drv_tx_frames_pending - instance of the local_only_evt class (local).
 * The class itself is declared outside this section.
 */
DEFINE_EVENT(local_only_evt, drv_tx_frames_pending,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_offchannel_tx_cancel_wait - instance of the local_only_evt class
 * (local).  The class itself is declared outside this section.
 */
DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_set_bitrate_mask - tracepoint taking (local, sdata, mask).
 *
 * Records the LOCAL/VIF entries and the `legacy` member of
 * mask->control[] for the NL80211_BAND_2GHZ and NL80211_BAND_5GHZ indices
 * only; no other band or member of @mask is captured.  Both values are
 * printed in hex.
 */
TRACE_EVENT(drv_set_bitrate_mask,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct cfg80211_bitrate_mask *mask),

        TP_ARGS(local, sdata, mask),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, legacy_2g)
                __field(u32, legacy_5g)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy;
                __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g
        )
);

/*
 * drv_set_rekey_data - tracepoint taking (local, sdata, data).
 *
 * Copies fixed-size kek, kck and replay_ctr arrays from @data into the
 * record (NL80211_KEK_LEN, NL80211_KCK_LEN and NL80211_REPLAY_CTR_LEN
 * bytes respectively).  The formatted output prints only the LOCAL/VIF
 * entries; the copied arrays are stored in the record but not printed.
 *
 * NOTE(review): kek/kck look like key material being copied verbatim into
 * the trace record -- confirm this exposure is intended.
 */
TRACE_EVENT(drv_set_rekey_data,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_gtk_rekey_data *data),

        TP_ARGS(local, sdata, data),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(u8, kek, NL80211_KEK_LEN)
                __array(u8, kck, NL80211_KCK_LEN)
                __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->kek, data->kek, NL80211_KEK_LEN);
                memcpy(__entry->kck, data->kck, NL80211_KCK_LEN);
                memcpy(__entry->replay_ctr, data->replay_ctr,
                       NL80211_REPLAY_CTR_LEN);
        ),

        TP_printk(LOCAL_PR_FMT VIF_PR_FMT,
                  LOCAL_PR_ARG, VIF_PR_ARG)
);

/*
 * drv_event_callback - tracepoint taking (local, sdata, _event).
 *
 * Records the LOCAL/VIF entries and _event->type (stored as u32, printed
 * with "%d").  No other member of @_event is captured.
 */
TRACE_EVENT(drv_event_callback,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct ieee80211_event *_event),

        TP_ARGS(local, sdata, _event),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->type = _event->type;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " event:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
        )
);

/*
 * release_evt - event class for tracepoints taking
 * (local, sta, tids, num_frames, reason, more_data).
 *
 * Records the LOCAL/STA entries (there is no VIF entry in this class),
 * @tids (printed as 4-digit hex), @num_frames, @reason (the enum is stored
 * as int) and @more_data (printed as an integer).
 */
DECLARE_EVENT_CLASS(release_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u16, tids)
                __field(int, num_frames)
                __field(int, reason)
                __field(bool, more_data)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tids = tids;
                __entry->num_frames = num_frames;
                __entry->reason = reason;
                __entry->more_data = more_data;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT
                " TIDs:0x%.4x frames:%d reason:%d more:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tids, __entry->num_frames,
                __entry->reason, __entry->more_data
        )
);

/*
 * drv_release_buffered_frames - instance of the release_evt class
 * (local, sta, tids, num_frames, reason, more_data).
 */
DEFINE_EVENT(release_evt, drv_release_buffered_frames,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data)
);

/*
 * drv_allow_buffered_frames - instance of the release_evt class
 * (local, sta, tids, num_frames, reason, more_data).
 */
DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data)
);

/*
 * drv_mgd_prepare_tx - tracepoint taking (local, sdata, duration).
 *
 * Records the LOCAL/VIF entries and @duration.  The argument is u16 but is
 * stored in a u32 field and printed with "%u".
 */
TRACE_EVENT(drv_mgd_prepare_tx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 duration),

        TP_ARGS(local, sdata, duration),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, duration)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->duration = duration;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " duration: %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration
        )
);

/*
 * drv_mgd_protect_tdls_discover - instance of the local_sdata_evt class
 * (local, sdata).  The class itself is declared outside this section.
 */
DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),

        TP_ARGS(local, sdata)
);

/*
 * local_chanctx - event class for tracepoints taking (local, ctx).
 *
 * Records and prints the LOCAL and CHANCTX entries; both come from helper
 * macros defined outside this section, so which members of @ctx are
 * captured is not visible here.
 */
DECLARE_EVENT_CLASS(local_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),

        TP_ARGS(local, ctx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                CHANCTX_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                CHANCTX_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT CHANCTX_PR_FMT,
                LOCAL_PR_ARG, CHANCTX_PR_ARG
        )
);

/* drv_add_chanctx - instance of the local_chanctx class (local, ctx). */
DEFINE_EVENT(local_chanctx, drv_add_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, ctx)
);

/* drv_remove_chanctx - instance of the local_chanctx class (local, ctx). */
DEFINE_EVENT(local_chanctx, drv_remove_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, ctx)
);

/*
 * drv_change_chanctx - tracepoint taking (local, ctx, changed).
 *
 * Records the same LOCAL/CHANCTX entries as the local_chanctx class plus
 * the u32 @changed value, printed with "%#x".
 */
TRACE_EVENT(drv_change_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx,
                 u32 changed),

        TP_ARGS(local, ctx, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                CHANCTX_ENTRY
                __field(u32, changed)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                CHANCTX_ASSIGN;
                __entry->changed = changed;
        ),

        TP_printk(
                LOCAL_PR_FMT CHANCTX_PR_FMT " changed:%#x",
                LOCAL_PR_ARG, CHANCTX_PR_ARG, __entry->changed
        )
);

/*
 * Helper types for drv_switch_vif_chanctx.  The __TRACE_VIF_ENTRY guard
 * ensures these definitions are emitted only once even if this section is
 * processed more than once.
 */
#if !defined(__TRACE_VIF_ENTRY)
#define __TRACE_VIF_ENTRY
/* Per-interface identity stored in the trace record. */
struct trace_vif_entry {
        enum nl80211_iftype vif_type;
        bool p2p;
        char vif_name[IFNAMSIZ];
} __packed;

/* Flattened copy of one channel definition. */
struct trace_chandef_entry {
        u32 control_freq;
        u32 freq_offset;
        u32 chan_width;
        u32 center_freq1;
        u32 freq1_offset;
        u32 center_freq2;
} __packed;

/* One element of the per-vif array: interface plus old and new chandef. */
struct trace_switch_entry {
        struct trace_vif_entry vif;
        struct trace_chandef_entry old_chandef;
        struct trace_chandef_entry new_chandef;
} __packed;

/*
 * Copies vifs[i].<from> into local_vifs[i].<to>.  Relies on `local_vifs`,
 * `vifs` and `i` being in scope at the point of use.
 */
#define SWITCH_ENTRY_ASSIGN(to, from) local_vifs[i].to = vifs[i].from
#endif

/*
 * drv_switch_vif_chanctx - tracepoint taking (local, vifs, n_vifs, mode).
 *
 * Records the LOCAL entry, @n_vifs, @mode and a dynamic byte array sized
 * for @n_vifs struct trace_switch_entry elements.  Each element is filled
 * from the matching vifs[i]: interface type, p2p flag and name, followed by
 * the old and new channel definitions.  The formatted output prints only
 * n_vifs and mode; the per-vif array is stored in the record but not
 * printed.
 */
TRACE_EVENT(drv_switch_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_vif_chanctx_switch *vifs,
                 int n_vifs, enum ieee80211_chanctx_switch_mode mode),
            TP_ARGS(local, vifs, n_vifs, mode),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, n_vifs)
                __field(u32, mode)
                __dynamic_array(u8, vifs,
                                sizeof(struct trace_switch_entry) * n_vifs)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->n_vifs = n_vifs;
                __entry->mode = mode;
                {
                        /* View the raw u8 dynamic array as switch entries. */
                        struct trace_switch_entry *local_vifs =
                                __get_dynamic_array(vifs);
                        int i;

                        for (i = 0; i < n_vifs; i++) {
                                struct ieee80211_sub_if_data *sdata;

                                /* The interface name is read from the enclosing sdata. */
                                sdata = container_of(vifs[i].vif,
                                                struct ieee80211_sub_if_data,
                                                vif);

                                SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type);
                                SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p);
                                /*
                                 * strncpy() NUL-pads the rest of vif_name when
                                 * the source is shorter, but writes no
                                 * terminator when the source fills the buffer.
                                 * NOTE(review): presumably sdata->name is no
                                 * longer than vif_name -- confirm.
                                 */
                                strncpy(local_vifs[i].vif.vif_name,
                                        sdata->name,
                                        sizeof(local_vifs[i].vif.vif_name));
                                /* Old channel context definition. */
                                SWITCH_ENTRY_ASSIGN(old_chandef.control_freq,
                                                old_ctx->def.chan->center_freq);
                                SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset,
                                                old_ctx->def.chan->freq_offset);
                                SWITCH_ENTRY_ASSIGN(old_chandef.chan_width,
                                                    old_ctx->def.width);
                                SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1,
                                                    old_ctx->def.center_freq1);
                                SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset,
                                                    old_ctx->def.freq1_offset);
                                SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2,
                                                    old_ctx->def.center_freq2);
                                /* New channel context definition. */
                                SWITCH_ENTRY_ASSIGN(new_chandef.control_freq,
                                                new_ctx->def.chan->center_freq);
                                SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset,
                                                new_ctx->def.chan->freq_offset);
                                SWITCH_ENTRY_ASSIGN(new_chandef.chan_width,
                                                    new_ctx->def.width);
                                SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1,
                                                    new_ctx->def.center_freq1);
                                SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset,
                                                    new_ctx->def.freq1_offset);
                                SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2,
                                                    new_ctx->def.center_freq2);
                        }
                }
        ),

        TP_printk(
                LOCAL_PR_FMT " n_vifs:%d mode:%d",
                LOCAL_PR_ARG, __entry->n_vifs, __entry->mode
        )
);

/*
 * local_sdata_chanctx - event class for tracepoints taking
 * (local, sdata, ctx).
 *
 * Records and prints the LOCAL, VIF and CHANCTX entries, all of which come
 * from helper macros defined outside this section.
 */
DECLARE_EVENT_CLASS(local_sdata_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_chanctx *ctx),

        TP_ARGS(local, sdata, ctx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANCTX_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                CHANCTX_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT CHANCTX_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, CHANCTX_PR_ARG
        )
);

/*
 * drv_assign_vif_chanctx - instance of the local_sdata_chanctx class
 * (local, sdata, ctx).
 */
DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, sdata, ctx)
);

/*
 * drv_unassign_vif_chanctx - instance of the local_sdata_chanctx class
 * (local, sdata, ctx).
 */
DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, sdata, ctx)
);

/*
 * drv_start_ap - tracepoint taking (local, sdata, info).
 *
 * Records the LOCAL/VIF entries, info->dtim_period, info->beacon_int,
 * info->hidden_ssid and a copy of the SSID bytes (a dynamic array of
 * info->ssid_len bytes, copied without a terminating NUL).  The formatted
 * output prints only the LOCAL/VIF entries; the other fields are stored in
 * the record but not printed.
 */
TRACE_EVENT(drv_start_ap,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *info),

        TP_ARGS(local, sdata, info),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, dtimper)
                __field(u16, bcnint)
                __dynamic_array(u8, ssid, info->ssid_len)
                __field(bool, hidden_ssid)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->dtimper = info->dtim_period;
                __entry->bcnint = info->beacon_int;
                memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
                __entry->hidden_ssid = info->hidden_ssid;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_stop_ap - instance of the local_sdata_evt class (local, sdata).
 * The class itself is declared outside this section.
 */
DEFINE_EVENT(local_sdata_evt, drv_stop_ap,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_reconfig_complete - tracepoint taking (local, reconfig_type).
 *
 * Records the LOCAL entry and @reconfig_type; the enum value is stored in
 * a u8 field and printed with "%d".
 */
TRACE_EVENT(drv_reconfig_complete,
        TP_PROTO(struct ieee80211_local *local,
                 enum ieee80211_reconfig_type reconfig_type),
        TP_ARGS(local, reconfig_type),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u8, reconfig_type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->reconfig_type = reconfig_type;
        ),

        TP_printk(
                LOCAL_PR_FMT  " reconfig_type:%d",
                LOCAL_PR_ARG, __entry->reconfig_type
        )

);

/*
 * drv_ipv6_addr_change - instance of the local_sdata_evt class
 * (local, sdata), defined only when CONFIG_IPV6 is enabled.  The class
 * itself is declared outside this section.
 */
#if IS_ENABLED(CONFIG_IPV6)
DEFINE_EVENT(local_sdata_evt, drv_ipv6_addr_change,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);
#endif

/*
 * drv_join_ibss - trace event taking @local, @sdata and the BSS config @info.
 *
 * Stores info->dtim_period, info->beacon_int and a copy of info->ssid_len
 * bytes of info->ssid in addition to the LOCAL/VIF fields.  TP_printk() only
 * formats the LOCAL/VIF part; the other fields are recorded but not printed.
 */
TRACE_EVENT(drv_join_ibss,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *info),

        TP_ARGS(local, sdata, info),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, dtimper)
                __field(u16, bcnint)
                __dynamic_array(u8, ssid, info->ssid_len)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->dtimper = info->dtim_period;
                __entry->bcnint = info->beacon_int;
                /* the dynamic array was sized with info->ssid_len above */
                memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_leave_ibss - instance of the local_sdata_evt event class; takes @local
 * and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, drv_leave_ibss,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_get_expected_throughput - trace event taking only the station @sta.
 *
 * The record consists solely of the STA_ENTRY fields (macro defined outside
 * this section) and is printed with STA_PR_FMT.
 */
TRACE_EVENT(drv_get_expected_throughput,
        TP_PROTO(struct ieee80211_sta *sta),

        TP_ARGS(sta),

        TP_STRUCT__entry(
                STA_ENTRY
        ),

        TP_fast_assign(
                STA_ASSIGN;
        ),

        TP_printk(
                STA_PR_FMT, STA_PR_ARG
        )
);

/*
 * drv_start_nan - trace event taking @local, @sdata and the NAN config @conf.
 *
 * Records conf->master_pref and conf->bands (both as u8) and prints them as
 * "master preference: %u, bands: 0x%0x" after the LOCAL/VIF part.
 */
TRACE_EVENT(drv_start_nan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_nan_conf *conf),

        TP_ARGS(local, sdata, conf),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", master preference: %u, bands: 0x%0x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
                __entry->bands
        )
);

/*
 * drv_stop_nan - trace event taking @local and @sdata.
 *
 * Records and prints only the LOCAL and VIF fields.
 */
TRACE_EVENT(drv_stop_nan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),

        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_nan_change_conf - trace event taking @local, @sdata, the NAN config
 * @conf and the @changes bitmask.
 *
 * Records conf->master_pref, conf->bands and @changes, and prints all three
 * after the LOCAL/VIF part (bands and changes in hexadecimal).
 */
TRACE_EVENT(drv_nan_change_conf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_nan_conf *conf,
                 u32 changes),

        TP_ARGS(local, sdata, conf, changes),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
                __field(u32, changes)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
                __entry->changes = changes;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", master preference: %u, bands: 0x%0x, changes: 0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
                __entry->bands, __entry->changes
        )
);

/*
 * drv_add_nan_func - trace event taking @local, @sdata and the NAN function
 * @func.
 *
 * Records func->type and func->instance_id (each stored as u8) and prints
 * them as "type: %u, inst_id: %u" after the LOCAL/VIF part.
 */
TRACE_EVENT(drv_add_nan_func,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct cfg80211_nan_func *func),

        TP_ARGS(local, sdata, func),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, type)
                __field(u8, inst_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->type = func->type;
                __entry->inst_id = func->instance_id;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", type: %u, inst_id: %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id
        )
);

/*
 * drv_del_nan_func - trace event taking @local, @sdata and @instance_id.
 *
 * Records @instance_id and prints it as "instance_id: %u" after the
 * LOCAL/VIF part.
 */
TRACE_EVENT(drv_del_nan_func,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u8 instance_id),

        TP_ARGS(local, sdata, instance_id),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, instance_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->instance_id = instance_id;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", instance_id: %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id
        )
);

/*
 * drv_start_pmsr - instance of the local_sdata_evt event class; takes @local
 * and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, drv_start_pmsr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_abort_pmsr - instance of the local_sdata_evt event class; takes @local
 * and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, drv_abort_pmsr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * Tracing for API calls that drivers call.
 */

/*
 * api_start_tx_ba_session - trace event taking the station @sta and @tid.
 *
 * Records the STA fields plus @tid and prints "tid:%d" after the STA part.
 */
TRACE_EVENT(api_start_tx_ba_session,
        TP_PROTO(struct ieee80211_sta *sta, u16 tid),

        TP_ARGS(sta, tid),

        TP_STRUCT__entry(
                STA_ENTRY
                __field(u16, tid)
        ),

        TP_fast_assign(
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                STA_PR_FMT " tid:%d",
                STA_PR_ARG, __entry->tid
        )
);

/*
 * api_start_tx_ba_cb - trace event taking @sdata, the address @ra and @tid.
 *
 * Copies ETH_ALEN bytes from @ra into the record together with @tid, and
 * prints them as "ra:%pM tid:%d" after the VIF part.
 */
TRACE_EVENT(api_start_tx_ba_cb,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),

        TP_ARGS(sdata, ra, tid),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, ra, ETH_ALEN)
                __field(u16, tid)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->ra, ra, ETH_ALEN);
                __entry->tid = tid;
        ),

        TP_printk(
                VIF_PR_FMT " ra:%pM tid:%d",
                VIF_PR_ARG, __entry->ra, __entry->tid
        )
);

/*
 * api_stop_tx_ba_session - trace event taking the station @sta and @tid.
 *
 * Records the STA fields plus @tid and prints "tid:%d" after the STA part.
 */
TRACE_EVENT(api_stop_tx_ba_session,
        TP_PROTO(struct ieee80211_sta *sta, u16 tid),

        TP_ARGS(sta, tid),

        TP_STRUCT__entry(
                STA_ENTRY
                __field(u16, tid)
        ),

        TP_fast_assign(
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                STA_PR_FMT " tid:%d",
                STA_PR_ARG, __entry->tid
        )
);

/*
 * api_stop_tx_ba_cb - trace event taking @sdata, the address @ra and @tid.
 *
 * Copies ETH_ALEN bytes from @ra into the record together with @tid, and
 * prints them as "ra:%pM tid:%d" after the VIF part.
 */
TRACE_EVENT(api_stop_tx_ba_cb,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),

        TP_ARGS(sdata, ra, tid),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, ra, ETH_ALEN)
                __field(u16, tid)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->ra, ra, ETH_ALEN);
                __entry->tid = tid;
        ),

        TP_printk(
                VIF_PR_FMT " ra:%pM tid:%d",
                VIF_PR_ARG, __entry->ra, __entry->tid
        )
);

/*
 * api_restart_hw - instance of the local_only_evt event class (declared
 * outside this section); takes only @local.
 */
DEFINE_EVENT(local_only_evt, api_restart_hw,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_beacon_loss - trace event taking only @sdata.
 *
 * Records and prints just the VIF fields.
 */
TRACE_EVENT(api_beacon_loss,
        TP_PROTO(struct ieee80211_sub_if_data *sdata),

        TP_ARGS(sdata),

        TP_STRUCT__entry(
                VIF_ENTRY
        ),

        TP_fast_assign(
                VIF_ASSIGN;
        ),

        TP_printk(
                VIF_PR_FMT,
                VIF_PR_ARG
        )
);

/*
 * api_connection_loss - trace event taking only @sdata.
 *
 * Records and prints just the VIF fields.
 */
TRACE_EVENT(api_connection_loss,
        TP_PROTO(struct ieee80211_sub_if_data *sdata),

        TP_ARGS(sdata),

        TP_STRUCT__entry(
                VIF_ENTRY
        ),

        TP_fast_assign(
                VIF_ASSIGN;
        ),

        TP_printk(
                VIF_PR_FMT,
                VIF_PR_ARG
        )
);

/*
 * api_cqm_rssi_notify - trace event taking @sdata, the threshold event
 * @rssi_event and the signed level @rssi_level.
 *
 * The enum is stored as u32 and the level as s32; both are printed with %d
 * as "event:%d rssi:%d" after the VIF part.
 */
TRACE_EVENT(api_cqm_rssi_notify,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 enum nl80211_cqm_rssi_threshold_event rssi_event,
                 s32 rssi_level),

        TP_ARGS(sdata, rssi_event, rssi_level),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(u32, rssi_event)
                __field(s32, rssi_level)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->rssi_event = rssi_event;
                __entry->rssi_level = rssi_level;
        ),

        TP_printk(
                VIF_PR_FMT " event:%d rssi:%d",
                VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
        )
);

/*
 * api_cqm_beacon_loss_notify - instance of the local_sdata_evt event class;
 * takes @local and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, api_cqm_beacon_loss_notify,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * api_scan_completed - trace event taking @local and the @aborted flag.
 *
 * Records @aborted and prints it as "aborted:%d" after the LOCAL part.
 */
TRACE_EVENT(api_scan_completed,
        TP_PROTO(struct ieee80211_local *local, bool aborted),

        TP_ARGS(local, aborted),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, aborted)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->aborted = aborted;
        ),

        TP_printk(
                LOCAL_PR_FMT " aborted:%d",
                LOCAL_PR_ARG, __entry->aborted
        )
);

/*
 * api_sched_scan_results - trace event taking only @local.
 *
 * Records and prints just the LOCAL fields.
 */
TRACE_EVENT(api_sched_scan_results,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT, LOCAL_PR_ARG
        )
);

/*
 * api_sched_scan_stopped - trace event taking only @local.
 *
 * Records and prints just the LOCAL fields.
 */
TRACE_EVENT(api_sched_scan_stopped,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT, LOCAL_PR_ARG
        )
);

/*
 * api_sta_block_awake - trace event taking @local, the station @sta and the
 * @block flag.
 *
 * Records @block and prints it as "block:%d" after the LOCAL and STA parts.
 */
TRACE_EVENT(api_sta_block_awake,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta, bool block),

        TP_ARGS(local, sta, block),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(bool, block)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->block = block;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " block:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->block
        )
);

/*
 * api_chswitch_done - trace event taking @sdata and the @success flag.
 *
 * Records @success and prints it as "success=%d" after the VIF part.
 */
TRACE_EVENT(api_chswitch_done,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success),

        TP_ARGS(sdata, success),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(bool, success)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->success = success;
        ),

        TP_printk(
                VIF_PR_FMT " success=%d",
                VIF_PR_ARG, __entry->success
        )
);

/*
 * api_ready_on_channel - instance of the local_only_evt event class; takes
 * only @local.
 */
DEFINE_EVENT(local_only_evt, api_ready_on_channel,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_remain_on_channel_expired - instance of the local_only_evt event class;
 * takes only @local.
 */
DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_gtk_rekey_notify - trace event taking @sdata, @bssid and @replay_ctr.
 *
 * Copies ETH_ALEN bytes of @bssid and NL80211_REPLAY_CTR_LEN bytes of
 * @replay_ctr into the record.  TP_printk() formats only the VIF part, so
 * both arrays are stored in the record but not printed.
 */
TRACE_EVENT(api_gtk_rekey_notify,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 const u8 *bssid, const u8 *replay_ctr),

        TP_ARGS(sdata, bssid, replay_ctr),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, bssid, ETH_ALEN)
                __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->bssid, bssid, ETH_ALEN);
                memcpy(__entry->replay_ctr, replay_ctr, NL80211_REPLAY_CTR_LEN);
        ),

        TP_printk(VIF_PR_FMT, VIF_PR_ARG)
);

/*
 * api_enable_rssi_reports - trace event taking @sdata and the two thresholds
 * @rssi_min_thold and @rssi_max_thold.
 *
 * Records both thresholds as int and prints them after the VIF part.
 */
TRACE_EVENT(api_enable_rssi_reports,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 int rssi_min_thold, int rssi_max_thold),

        TP_ARGS(sdata, rssi_min_thold, rssi_max_thold),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(int, rssi_min_thold)
                __field(int, rssi_max_thold)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->rssi_min_thold = rssi_min_thold;
                __entry->rssi_max_thold = rssi_max_thold;
        ),

        TP_printk(
                VIF_PR_FMT " rssi_min_thold =%d, rssi_max_thold = %d",
                VIF_PR_ARG, __entry->rssi_min_thold, __entry->rssi_max_thold
        )
);

/*
 * api_eosp - trace event taking @local and the station @sta.
 *
 * Records and prints only the LOCAL and STA fields.
 */
TRACE_EVENT(api_eosp,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT,
                LOCAL_PR_ARG, STA_PR_ARG
        )
);

/*
 * api_send_eosp_nullfunc - trace event taking @local, the station @sta and
 * @tid.
 *
 * Records @tid and prints it as "tid:%d" after the LOCAL and STA parts.
 */
TRACE_EVENT(api_send_eosp_nullfunc,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u8 tid),

        TP_ARGS(local, sta, tid),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, tid)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " tid:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tid
        )
);

/*
 * api_sta_set_buffered - trace event taking @local, the station @sta, @tid
 * and the @buffered flag.
 *
 * Records @tid and @buffered and prints them as "tid:%d buffered:%d" after
 * the LOCAL and STA parts.
 */
TRACE_EVENT(api_sta_set_buffered,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u8 tid, bool buffered),

        TP_ARGS(local, sta, tid, buffered),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, tid)
                __field(bool, buffered)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tid = tid;
                __entry->buffered = buffered;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " tid:%d buffered:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tid, __entry->buffered
        )
);

/*
 * Tracing for internal functions
 * (which may also be called in response to driver calls)
 */

/*
 * wake_queue - trace event taking @local, the @queue number and the stop
 * @reason.
 *
 * Records @queue (u16) and @reason (enum stored as u32) and prints them as
 * "queue:%d, reason:%d" after the LOCAL part.
 */
TRACE_EVENT(wake_queue,
        TP_PROTO(struct ieee80211_local *local, u16 queue,
                 enum queue_stop_reason reason),

        TP_ARGS(local, queue, reason),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u16, queue)
                __field(u32, reason)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->queue = queue;
                __entry->reason = reason;
        ),

        TP_printk(
                LOCAL_PR_FMT " queue:%d, reason:%d",
                LOCAL_PR_ARG, __entry->queue, __entry->reason
        )
);

/*
 * stop_queue - trace event taking @local, the @queue number and the stop
 * @reason.
 *
 * Records @queue (u16) and @reason (enum stored as u32) and prints them as
 * "queue:%d, reason:%d" after the LOCAL part.
 */
TRACE_EVENT(stop_queue,
        TP_PROTO(struct ieee80211_local *local, u16 queue,
                 enum queue_stop_reason reason),

        TP_ARGS(local, queue, reason),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u16, queue)
                __field(u32, reason)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->queue = queue;
                __entry->reason = reason;
        ),

        TP_printk(
                LOCAL_PR_FMT " queue:%d, reason:%d",
                LOCAL_PR_ARG, __entry->queue, __entry->reason
        )
);

/*
 * drv_set_default_unicast_key - trace event taking @local, @sdata and
 * @key_idx.
 *
 * Records @key_idx as int and prints it as "key_idx:%d" after the LOCAL/VIF
 * part.
 */
TRACE_EVENT(drv_set_default_unicast_key,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 int key_idx),

        TP_ARGS(local, sdata, key_idx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, key_idx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->key_idx = key_idx;
        ),

        TP_printk(LOCAL_PR_FMT VIF_PR_FMT " key_idx:%d",
                  LOCAL_PR_ARG, VIF_PR_ARG, __entry->key_idx)
);

/*
 * api_radar_detected - trace event taking only @local.
 *
 * Records the LOCAL fields and prints them followed by the fixed text
 * " radar detected".
 */
TRACE_EVENT(api_radar_detected,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT " radar detected",
                LOCAL_PR_ARG
        )
);

/*
 * drv_channel_switch_beacon - trace event taking @local, @sdata and the
 * target channel definition @chandef.
 *
 * Records the CHANDEF fields (CHANDEF_ENTRY/CHANDEF_ASSIGN are macros defined
 * outside this section) from @chandef and prints them after
 * " channel switch to ".
 */
TRACE_EVENT(drv_channel_switch_beacon,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(local, sdata, chandef),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                CHANDEF_ASSIGN(chandef);
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG
        )
);

/*
 * drv_pre_channel_switch - trace event taking @local, @sdata and the channel
 * switch description @ch_switch.
 *
 * Records the CHANDEF fields of ch_switch->chandef plus ch_switch->timestamp,
 * ->device_timestamp, ->block_tx and ->count.  The printed text contains
 * count, block_tx and timestamp; device_timestamp is stored in the record
 * but not printed.
 */
TRACE_EVENT(drv_pre_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),

        TP_ARGS(local, sdata, ch_switch),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
                __field(u64, timestamp)
                __field(u32, device_timestamp)
                __field(bool, block_tx)
                __field(u8, count)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                /* NOTE(review): no ';' here - presumably the macro expansion ends in one; confirm */
                CHANDEF_ASSIGN(&ch_switch->chandef)
                __entry->timestamp = ch_switch->timestamp;
                __entry->device_timestamp = ch_switch->device_timestamp;
                __entry->block_tx = ch_switch->block_tx;
                __entry->count = ch_switch->count;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " prepare channel switch to "
                CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu",
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
                __entry->block_tx, __entry->timestamp
        )
);

/*
 * drv_post_channel_switch - instance of the local_sdata_evt event class;
 * takes @local and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch,
             TP_PROTO(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata),
             TP_ARGS(local, sdata)
);

/*
 * drv_abort_channel_switch - instance of the local_sdata_evt event class;
 * takes @local and @sdata.
 */
DEFINE_EVENT(local_sdata_evt, drv_abort_channel_switch,
             TP_PROTO(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata),
             TP_ARGS(local, sdata)
);

/*
 * drv_channel_switch_rx_beacon - trace event taking @local, @sdata and the
 * channel switch description @ch_switch.
 *
 * Records the CHANDEF fields of ch_switch->chandef plus ch_switch->timestamp,
 * ->device_timestamp, ->block_tx and ->count.  The printed text contains
 * count, block_tx and timestamp; device_timestamp is stored in the record
 * but not printed.
 */
TRACE_EVENT(drv_channel_switch_rx_beacon,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),

        TP_ARGS(local, sdata, ch_switch),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
                __field(u64, timestamp)
                __field(u32, device_timestamp)
                __field(bool, block_tx)
                __field(u8, count)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                /* NOTE(review): no ';' here - presumably the macro expansion ends in one; confirm */
                CHANDEF_ASSIGN(&ch_switch->chandef)
                __entry->timestamp = ch_switch->timestamp;
                __entry->device_timestamp = ch_switch->device_timestamp;
                __entry->block_tx = ch_switch->block_tx;
                __entry->count = ch_switch->count;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT
                " received a channel switch beacon to "
                CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu",
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
                __entry->block_tx, __entry->timestamp
        )
);

/*
 * drv_get_txpower - trace event taking @local, @sdata, the power value @dbm
 * and the return code @ret.
 *
 * Records both integers and prints them as "dbm:%d ret:%d" after the
 * LOCAL/VIF part.
 */
TRACE_EVENT(drv_get_txpower,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 int dbm, int ret),

        TP_ARGS(local, sdata, dbm, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, dbm)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->dbm = dbm;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret
        )
);

/*
 * drv_tdls_channel_switch - trace event taking @local, @sdata, the peer
 * @sta, @oper_class and the target channel definition @chandef.
 *
 * Records the STA and CHANDEF fields plus @oper_class.  The printed text puts
 * the channel definition and oper_class before the station.
 */
TRACE_EVENT(drv_tdls_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, u8 oper_class,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(local, sdata, sta, oper_class, chandef),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u8, oper_class)
                CHANDEF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->oper_class = oper_class;
                CHANDEF_ASSIGN(chandef)
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " tdls channel switch to"
                CHANDEF_PR_FMT  " oper_class:%d " STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->oper_class,
                STA_PR_ARG
        )
);

/*
 * drv_tdls_cancel_channel_switch - trace event taking @local, @sdata and the
 * peer @sta.
 *
 * Records the LOCAL, VIF and STA fields and prints the station after
 * " tdls cancel channel switch with ".
 */
TRACE_EVENT(drv_tdls_cancel_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT
                " tdls cancel channel switch with " STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
        )
);

/*
 * drv_tdls_recv_channel_switch - trace event taking @local, @sdata and the
 * received TDLS channel-switch parameters @params.
 *
 * Records, from @params: action_code, the station (via STA_NAMED_ASSIGN on
 * params->sta), the channel definition params->chandef, status,
 * params->sta->tdls_initiator, timestamp, switch_time and switch_timeout.
 *
 * status and timestamp are stored as u32 and are therefore printed with %u;
 * with %d any value of 2^31 or above would be shown as a negative number.
 */
TRACE_EVENT(drv_tdls_recv_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_tdls_ch_sw_params *params),

        TP_ARGS(local, sdata, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, action_code)
                STA_ENTRY
                CHANDEF_ENTRY
                __field(u32, status)
                __field(bool, peer_initiator)
                __field(u32, timestamp)
                __field(u16, switch_time)
                __field(u16, switch_timeout)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_NAMED_ASSIGN(params->sta);
                CHANDEF_ASSIGN(params->chandef)
                __entry->peer_initiator = params->sta->tdls_initiator;
                __entry->action_code = params->action_code;
                __entry->status = params->status;
                __entry->timestamp = params->timestamp;
                __entry->switch_time = params->switch_time;
                __entry->switch_timeout = params->switch_timeout;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " received tdls channel switch packet"
                " action:%d status:%u time:%u switch time:%d switch"
                " timeout:%d initiator: %d chan:" CHANDEF_PR_FMT STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->action_code, __entry->status,
                __entry->timestamp, __entry->switch_time,
                __entry->switch_timeout, __entry->peer_initiator,
                CHANDEF_PR_ARG, STA_PR_ARG
        )
);

/*
 * drv_wake_tx_queue - trace event taking @local, @sdata and the queue @txq.
 *
 * Records the STA fields together with txq->txq.ac and txq->txq.tid, and
 * prints "ac:%d tid:%d" after the LOCAL/VIF/STA part.
 */
TRACE_EVENT(drv_wake_tx_queue,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct txq_info *txq),

        TP_ARGS(local, sdata, txq),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u8, ac)
                __field(u8, tid)
        ),

        TP_fast_assign(
                /*
                 * There is no sta argument in this event; presumably
                 * STA_ASSIGN reads a variable named "sta", hence this local
                 * taken from the queue - confirm against the macro.
                 */
                struct ieee80211_sta *sta = txq->txq.sta;

                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->ac = txq->txq.ac;
                __entry->tid = txq->txq.tid;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ac:%d tid:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid
        )
);

/*
 * drv_get_ftm_responder_stats - trace event taking @local, @sdata and
 * @ftm_stats.
 *
 * Only the LOCAL and VIF fields are recorded and printed; @ftm_stats is part
 * of the prototype but none of its contents are stored.
 */
TRACE_EVENT(drv_get_ftm_responder_stats,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_ftm_responder_stats *ftm_stats),

        TP_ARGS(local, sdata, ftm_stats),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_update_vif_offload - instance of the local_sdata_addr_evt event class
 * (declared outside this section); takes @local and @sdata.
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_sta_set_4addr - trace event taking @local, @sdata, the station @sta and
 * the @enabled flag.
 *
 * Records @enabled and prints it as "enabled:%d" after the LOCAL/VIF/STA
 * part.
 */
TRACE_EVENT(drv_sta_set_4addr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, bool enabled),

        TP_ARGS(local, sdata, sta, enabled),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(bool, enabled)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->enabled = enabled;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " enabled:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled
        )
);

#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Checksumming functions for IP, TCP, UDP and so on
 *
 * Authors:        Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Borrows very liberally from tcp.c and ip.c, see those
 *                files for more names.
 */

#ifndef _CHECKSUM_H
#define _CHECKSUM_H

#include <linux/errno.h>
#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/uaccess.h>
#include <asm/checksum.h>

#ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
/*
 * Generic fallback: copy @len bytes from user memory @src into @dst and
 * checksum the copied data, seeding csum_partial() with ~0U.
 *
 * Returns 0 when copy_from_user() reports a failure, otherwise the value
 * returned by csum_partial().
 */
static __always_inline
__wsum csum_and_copy_from_user (const void __user *src, void *dst,
                                      int len)
{
        const int copy_failed = copy_from_user(dst, src, len) != 0;

        return copy_failed ? 0 : csum_partial(dst, len, ~0U);
}
#endif

#ifndef HAVE_CSUM_COPY_USER
/*
 * Generic fallback: checksum @len bytes at @src (seed ~0U), then copy them to
 * user memory @dst.
 *
 * Returns the checksum when copy_to_user() returns 0, and 0 otherwise.
 */
static __always_inline __wsum csum_and_copy_to_user
(const void *src, void __user *dst, int len)
{
        /* the checksum is computed before the copy is attempted */
        __wsum checksum = csum_partial(src, len, ~0U);

        if (copy_to_user(dst, src, len) != 0)
                return 0;

        return checksum;
}
#endif

#ifndef _HAVE_ARCH_CSUM_AND_COPY
/*
 * Generic fallback: memcpy() @len bytes from @src to @dst and return
 * csum_partial() of the destination buffer with a zero seed.
 */
static __always_inline __wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
        __wsum checksum;

        memcpy(dst, src, len);
        checksum = csum_partial(dst, len, 0);

        return checksum;
}
#endif

#ifndef HAVE_ARCH_CSUM_ADD
/*
 * Add two 32-bit partial checksums, feeding the carry out of bit 31 back
 * into bit 0 (end-around carry).
 */
static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
{
        const u32 rhs = (__force u32)addend;
        u32 total = (__force u32)csum + rhs;

        /* total < rhs exactly when the 32-bit addition wrapped around */
        total += (total < rhs);

        return (__force __wsum)total;
}
#endif

/*
 * Subtract @addend from @csum by adding its bitwise complement with
 * csum_add().
 */
static __always_inline __wsum csum_sub(__wsum csum, __wsum addend)
{
        __wsum complement = ~addend;

        return csum_add(csum, complement);
}

/*
 * 16-bit counterpart of csum_add(): add @addend to @csum and fold the carry
 * out of bit 15 back into bit 0.
 */
static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend)
{
        const u16 rhs = (__force u16)addend;
        u16 total = (__force u16)csum;

        total += rhs;

        /* total < rhs exactly when the 16-bit addition wrapped around */
        return (__force __sum16)(total + (total < rhs));
}

/*
 * Subtract @addend from the 16-bit checksum @csum by adding its bitwise
 * complement with csum16_add().
 */
static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
{
        __be16 complement = ~addend;

        return csum16_add(csum, complement);
}

/*
 * Add the checksum @csum2 of a block that starts at byte @offset to @csum.
 *
 * When @offset is odd, @csum2 is first rotated right by 8 bits so that it
 * lines up with a 16-bit boundary.
 */
static __always_inline __wsum
csum_block_add(__wsum csum, __wsum csum2, int offset)
{
        const u32 raw = (__force u32)csum2;
        const u32 aligned = (offset & 1) ? ror32(raw, 8) : raw;

        return csum_add(csum, (__force __wsum)aligned);
}

/*
 * Same as csum_block_add() but with an additional @len parameter, which this
 * implementation does not use.
 */
static __always_inline __wsum
csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len)
{
        __wsum combined = csum_block_add(csum, csum2, offset);

        return combined;
}

/*
 * Remove the checksum @csum2 of a block at byte @offset from @csum, done by
 * block-adding the bitwise complement of @csum2.
 */
static __always_inline __wsum
csum_block_sub(__wsum csum, __wsum csum2, int offset)
{
        __wsum complement = ~csum2;

        return csum_block_add(csum, complement, offset);
}

/* Reinterpret the 16-bit checksum @n as a __wsum through a __force cast. */
static __always_inline __wsum csum_unfold(__sum16 n)
{
        __wsum widened = (__force __wsum)n;

        return widened;
}

/* Thin wrapper that forwards all three arguments to csum_partial(). */
static __always_inline
__wsum csum_partial_ext(const void *buff, int len, __wsum sum)
{
        __wsum result = csum_partial(buff, len, sum);

        return result;
}

/* The 16-bit value 0xffff (all bits set), typed as a __sum16. */
#define CSUM_MANGLED_0 ((__force __sum16)0xffff)

/*
 * Update the folded checksum at @sum in place: add @diff to the complement of
 * the unfolded current value and store the csum_fold() of the result.
 */
static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
{
        __wsum inverted = ~csum_unfold(*sum);
        __wsum updated = csum_add(diff, inverted);

        *sum = csum_fold(updated);
}

/*
 * Update the folded checksum at @sum in place for a 32-bit field whose value
 * changes from @from to @to: @from is subtracted from the complemented
 * checksum, @to is added, and the result is folded back.
 */
static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
{
        __wsum inverted = ~csum_unfold(*sum);
        __wsum without_old = csum_sub(inverted, (__force __wsum)from);
        __wsum with_new = csum_add(without_old, (__force __wsum)to);

        *sum = csum_fold(with_new);
}

/*
 * Incremental checksum update as given in RFC 1624, section 3:
 *
 *     HC' = ~(~HC + ~m + m')
 *
 * where HC is the checksum at @sum, m (@old) is the previous value of a
 * 16-bit field and m' (@new) is its new value.  The result is written back
 * to @sum.
 */
static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new)
{
        __sum16 inverted = ~(*sum);
        __sum16 without_old = csum16_sub(inverted, old);

        *sum = ~csum16_add(without_old, new);
}

/*
 * Update the unfolded checksum at @csum in place: subtract @old, then add
 * @new.
 */
static inline void csum_replace(__wsum *csum, __wsum old, __wsum new)
{
        __wsum without_old = csum_sub(*csum, old);

        *csum = csum_add(without_old, new);
}

/* Forward declaration: this header only passes pointers to struct sk_buff. */
struct sk_buff;
/*
 * Checksum-replacement helpers that additionally take the @skb and a
 * @pseudohdr flag.  They are only declared here; their definitions live
 * outside this header.
 */
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
                              __be32 from, __be32 to, bool pseudohdr);
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
                               const __be32 *from, const __be32 *to,
                               bool pseudohdr);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
                                     __wsum diff, bool pseudohdr, bool ipv6);

/*
 * 16-bit flavour of inet_proto_csum_replace4(): both field values are
 * force-cast to __be32 and handed to the 32-bit helper unchanged.
 */
static __always_inline
void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
                              __be16 from, __be16 to, bool pseudohdr)
{
        const __be32 from32 = (__force __be32)from;
        const __be32 to32 = (__force __be32)to;

        inet_proto_csum_replace4(sum, skb, from32, to32, pseudohdr);
}

/*
 * Drop the checksum of the first @start bytes at @ptr from @csum, store
 * the folded result in the 16-bit field at @ptr + @offset, and return the
 * difference between the new folded value and the field's previous content.
 */
static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum,
                                             int start, int offset)
{
        __sum16 *field = (__sum16 *)(ptr + offset);
        __sum16 folded;
        __wsum diff;

        /* Subtract out checksum up to start */
        csum = csum_sub(csum, csum_partial(ptr, start, 0));
        folded = csum_fold(csum);

        /* Record the change, then set derived checksum in packet */
        diff = csum_sub((__force __wsum)folded, (__force __wsum)*field);
        *field = folded;

        return diff;
}

/*
 * Rewrite the 16-bit field at @psum as the fold of (@delta minus the
 * field's current value).
 */
static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta)
{
        const __wsum stored = (__force __wsum)*psum;

        *psum = csum_fold(csum_sub(delta, stored));
}

#endif
































   14 





























































































































































































































































































































































































































































































































































    4 



























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>

/*
 * Test whether the block of @size bytes starting at @addr lies within
 * @limit.  Returns false if the range is valid, true otherwise.
 */
static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
{
        unsigned long end;

        /*
         * A compile-time constant size (e.g. from "sizeof()") is known
         * not to overflow the limit, but adding it to 'addr' could still
         * wrap.  So subtract the size from the limit rather than adding
         * it to the address.
         */
        if (__builtin_constant_p(size))
                return unlikely(addr > limit - size);

        /* Arbitrary sizes: reject a wrapped sum, then compare to the limit */
        end = addr + size;
        return unlikely(end < size) || unlikely(end > limit);
}

/*
 * Wrapper around __chk_range_not_ok() for user pointers: runs
 * __chk_user_ptr() on @addr, then force-casts it to unsigned long for the
 * range test.  Evaluates to true when the range is NOT acceptable.
 */
#define __range_not_ok(addr, size, limit)                                \
({                                                                        \
        __chk_user_ptr(addr);                                                \
        __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
})

/*
 * WARN_ON_IN_IRQ() - with CONFIG_DEBUG_ATOMIC_SLEEP, warn once when
 * evaluated outside task context while page faults are not disabled.
 * Expands to nothing when the option is off.
 */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ()        \
        WARN_ON_ONCE(!in_task() && !pagefault_disabled())
#else
# define WARN_ON_IN_IRQ()
#endif

/**
 * access_ok - Checks if a user space pointer is valid
 * @addr: User space pointer to start of block to check
 * @size: Size of block to check
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Checks if a pointer to a block of memory in user space is valid.
 * WARN_ON_IN_IRQ() is evaluated first; the block is then tested against
 * TASK_SIZE_MAX via __range_not_ok().
 *
 * Note that, depending on architecture, this function probably just
 * checks that the pointer is in the user space range - after calling
 * this function, memory access functions may still return -EFAULT.
 *
 * Return: true (nonzero) if the memory block may be valid, false (zero)
 * if it is definitely invalid.
 */
#define access_ok(addr, size)                                        \
({                                                                        \
        WARN_ON_IN_IRQ();                                                \
        likely(!__range_not_ok(addr, size, TASK_SIZE_MAX));                \
})

/*
 * Out-of-line helpers reached through the "call" emitted by
 * do_get_user_call(); the numeric suffix is sizeof(*(ptr)).  The C
 * prototypes do not describe how arguments are really passed.
 * __get_user_bad() is referenced from the default: case of
 * __get_user_size().
 */
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);

/*
 * Open/close a user-access section: __uaccess_begin() is stac() and
 * __uaccess_end() is clac().  The _nospec variant additionally issues
 * barrier_nospec() right after stac().
 */
#define __uaccess_begin() stac()
#define __uaccess_end()   clac()
#define __uaccess_begin_nospec()        \
({                                        \
        stac();                                \
        barrier_nospec();                \
})

/*
 * __inttype(x) is the smallest unsigned integer type that can fit a value
 * of x's size (up to 'long long'): char, short, int and long are tried in
 * that order, falling back to unsigned long long.
 *
 * __typefits(x, type, not) yields (unsigned type)0 when
 * sizeof(x) <= sizeof(type), and the expression 'not' otherwise.
 */
#define __inttype(x) __typeof__(                \
        __typefits(x,char,                        \
          __typefits(x,short,                        \
            __typefits(x,int,                        \
              __typefits(x,long,0ULL)))))

#define __typefits(x,type,not) \
        __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

/*
 * This is used for both get_user() and __get_user() to expand to
 * the proper special function call that has odd calling conventions
 * due to returning both a value and an error, and that depends on
 * the size of the pointer passed in.
 *
 * @fn:  helper family; the call target is "__" fn "_" sizeof(*(ptr))
 * @x:   lvalue that receives the fetched value
 * @ptr: user pointer; also passed in as the "a" register input
 *
 * The macro evaluates to the error code returned in the "a" register.
 *
 * Careful: we have to cast the result to the type of the pointer
 * for sign reasons.
 *
 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.
 *
 * Clang/LLVM cares about the size of the register, but still wants
 * the base register for something that ends up being a pair.
 */
#define do_get_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_gu;                                                        \
        register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);                \
        __chk_user_ptr(ptr);                                                \
        asm volatile("call __" #fn "_%P4"                                \
                     : "=a" (__ret_gu), "=r" (__val_gu),                \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (ptr), "i" (sizeof(*(ptr))));                \
        (x) = (__force __typeof__(*(ptr))) __val_gu;                        \
        __builtin_expect(__ret_gu, 0);                                        \
})

/**
 * get_user - Get a simple variable from user space.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Expands to might_fault() followed by a call to the __get_user_<size>
 * helper through do_get_user_call().
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
 * __get_user - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Expands to a call to the __get_user_nocheck_<size> helper through
 * do_get_user_call(); unlike get_user() there is no might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)


/*
 * Store a 64-bit value @x to user memory, jumping to @label on a fault.
 *
 * 32-bit: the value sits in the "A" register pair and is written with two
 * 32-bit moves (%eax to offset 0, %edx to offset 4), each with its own
 * exception-table entry pointing at @label.
 * 64-bit: a single "q"-suffixed __put_user_goto().
 */
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label)                        \
        asm_volatile_goto("\n"                                        \
                     "1:        movl %%eax,0(%1)\n"                \
                     "2:        movl %%edx,4(%1)\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                        \
                     _ASM_EXTABLE_UA(2b, %l2)                        \
                     : : "A" (x), "r" (addr)                        \
                     : : label)

#else
#define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "er", label)
#endif

/* Referenced from the default: case of __put_user_size(). */
extern void __put_user_bad(void);

/*
 * Out-of-line helpers reached through the "call" emitted by
 * do_put_user_call(); the numeric suffix is sizeof(*(ptr)).
 *
 * Strange magic calling convention: pointer in %ecx,
 * value in %eax(:%edx), return value in %ecx. clobbers %rbx
 */
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);

/*
 * Common body of put_user() and __put_user().
 *
 * @fn:  helper family; the call target is "__" fn "_" sizeof(*(ptr))
 * @x:   value to store
 * @ptr: destination user pointer
 *
 * The pointer goes in and the result comes back through the "c" register;
 * the value is pinned to the _ASM_AX register.  "ebx" is listed as
 * clobbered.  The macro evaluates to the returned error code.
 *
 * ptr must be evaluated and assigned to the temporary __ptr_pu before
 * the assignment of x to __val_pu, to avoid any function calls
 * involved in the ptr expression (possibly implicitly generated due
 * to KASAN) from clobbering %ax.
 */
#define do_put_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_pu;                                                        \
        void __user *__ptr_pu;                                                \
        register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);                \
        __chk_user_ptr(ptr);                                                \
        __ptr_pu = (ptr);                                                \
        __val_pu = (x);                                                        \
        asm volatile("call __" #fn "_%P[size]"                                \
                     : "=c" (__ret_pu),                                        \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (__ptr_pu),                                        \
                       "r" (__val_pu),                                        \
                       [size] "i" (sizeof(*(ptr)))                        \
                     :"ebx");                                                \
        __builtin_expect(__ret_pu, 0);                                        \
})

/**
 * put_user - Write a simple value into user space.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Expands to might_fault() followed by a call to the __put_user_<size>
 * helper through do_put_user_call().
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
 * __put_user - Write a simple value into user space, with less checking.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Expands to a call to the __put_user_nocheck_<size> helper through
 * do_put_user_call(); unlike put_user() there is no might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)

/*
 * Store @x to user pointer @ptr using a @size-byte move, jumping to
 * @label on a fault.  @size selects the instruction suffix and register
 * constraint; anything other than 1, 2, 4 or 8 references
 * __put_user_bad().
 */
#define __put_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __put_user_goto(x, ptr, "b", "iq", label);                \
                break;                                                        \
        case 2:                                                                \
                __put_user_goto(x, ptr, "w", "ir", label);                \
                break;                                                        \
        case 4:                                                                \
                __put_user_goto(x, ptr, "l", "ir", label);                \
                break;                                                        \
        case 8:                                                                \
                __put_user_goto_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                __put_user_bad();                                        \
        }                                                                \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit fetch for the asm-goto-with-outputs flavour; branches to @label
 * on a fault.
 *
 * 32-bit: two 32-bit loads (low word at @ptr, high word at the following
 * unsigned int) combined into one 64-bit value.
 * 64-bit: a single "q"-suffixed __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do {                                \
        unsigned int __gu_low, __gu_high;                                \
        const unsigned int __user *__gu_ptr;                                \
        __gu_ptr = (const void __user *)(ptr);                                \
        __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);                \
        __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
        (x) = ((unsigned long long)__gu_high << 32) | __gu_low;                \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label)                                \
        __get_user_asm(x, ptr, "q", "=r", label)
#endif

/*
 * Fetch a @size-byte value from user pointer @ptr into @x, branching to
 * @label on a fault (asm-goto-with-outputs flavour).  The 1-byte case
 * loads into an unsigned char temporary with the "=q" constraint and then
 * assigns it to @x.  Sizes other than 1, 2, 4 or 8 reference
 * __get_user_bad().
 */
#define __get_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:        {                                                        \
                unsigned char x_u8__;                                        \
                __get_user_asm(x_u8__, ptr, "b", "=q", label);                \
                (x) = x_u8__;                                                \
                break;                                                        \
        }                                                                \
        case 2:                                                                \
                __get_user_asm(x, ptr, "w", "=r", label);                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, "l", "=r", label);                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * One "mov<itype>" from user memory at @addr into @x (output constraint
 * @ltype).  The exception-table entry sends a fault at label 1 to @label;
 * %l2 is the label operand (operand 0 is the output, 1 the memory input).
 */
#define __get_user_asm(x, addr, itype, ltype, label)                        \
        asm_volatile_goto("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                                \
                     : [output] ltype(x)                                \
                     : [umem] "m" (__m(addr))                                \
                     : : label)

#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit fetch for compilers without asm goto outputs; errors are
 * reported through @retval instead of a label.
 *
 * 32-bit: the low word is loaded into %eax and the high word into %edx.
 * A fault on either load runs the fixup at label 4, which stores -EFAULT
 * in @retval, zeroes both registers and resumes at label 3.
 * 64-bit: a single "q"-suffixed __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval)                                \
({                                                                        \
        __typeof__(ptr) __ptr = (ptr);                                        \
        asm volatile("\n"                                                \
                     "1:        movl %[lowbits],%%eax\n"                \
                     "2:        movl %[highbits],%%edx\n"                \
                     "3:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "4:        mov %[efault],%[errout]\n"                \
                     "        xorl %%eax,%%eax\n"                                \
                     "        xorl %%edx,%%edx\n"                                \
                     "        jmp 3b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_UA(1b, 4b)                                \
                     _ASM_EXTABLE_UA(2b, 4b)                                \
                     : [errout] "=r" (retval),                                \
                       [output] "=&A"(x)                                \
                     : [lowbits] "m" (__m(__ptr)),                        \
                       [highbits] "m" __m(((u32 __user *)(__ptr)) + 1),        \
                       [efault] "i" (-EFAULT), "0" (retval));                \
})

#else
#define __get_user_asm_u64(x, ptr, retval) \
         __get_user_asm(x, ptr, retval, "q", "=r")
#endif

/*
 * Fetch a @size-byte value from user pointer @ptr into @x for compilers
 * without asm goto outputs.  @retval is set to 0 up front; the per-size
 * helpers overwrite it with -EFAULT when the access faults.  The 1-byte
 * case loads into an unsigned char temporary with the "=q" constraint.
 * Sizes other than 1, 2, 4 or 8 reference __get_user_bad().
 */
#define __get_user_size(x, ptr, size, retval)                                \
do {                                                                        \
        unsigned char x_u8__;                                                \
                                                                        \
        retval = 0;                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __get_user_asm(x_u8__, ptr, retval, "b", "=q");                \
                (x) = x_u8__;                                                \
                break;                                                        \
        case 2:                                                                \
                __get_user_asm(x, ptr, retval, "w", "=r");                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, retval, "l", "=r");                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, retval);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * One "mov<itype>" from user memory at @addr into @x (output constraint
 * @ltype), fixup flavour.  A fault at label 1 runs the fixup at label 3,
 * which stores -EFAULT in @err, zeroes the output register and resumes at
 * label 2.  @err is also an input (tied to operand 0), so its previous
 * value survives when no fault happens.
 */
#define __get_user_asm(x, addr, err, itype, ltype)                        \
        asm volatile("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     "2:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:        mov %[efault],%[errout]\n"                \
                     "        xorl %k[output],%k[output]\n"                        \
                     "        jmp 2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_UA(1b, 3b)                                \
                     : [errout] "=r" (err),                                \
                       [output] ltype(x)                                \
                     : [umem] "m" (__m(addr)),                                \
                       [efault] "i" (-EFAULT), "0" (err))

#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * LOCK-prefixed cmpxchg on user memory at @_ptr (asm goto flavour).
 *
 * @itype/@ltype: instruction suffix and register constraint for @_new
 * @_pold:        pointer to the expected value
 * @_new:         value to store if the comparison succeeds
 * @label:        jump target when the access faults
 *
 * When the "z" condition reports failure, __old (tied to the "a" register
 * as an in/out operand) is written back through @_pold.  The macro
 * evaluates to the success flag.
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_volatile_goto("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

/*
 * 32-bit only: 64-bit compare-and-exchange on user memory using
 * LOCK cmpxchg8b (asm goto flavour).  The expected value lives in the "A"
 * register pair, the new value is split into "b" (low 32 bits) and "c"
 * (high 32 bits).  A fault jumps to @label; on comparison failure __old
 * is written back through @_pold.  Evaluates to the success flag.
 */
#ifdef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_volatile_goto("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })
#endif // CONFIG_X86_32
#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * LOCK-prefixed cmpxchg on user memory at @_ptr for compilers without
 * tied asm goto outputs.  Instead of jumping from inside the asm, a fault
 * is recorded in __err through an EX_TYPE_EFAULT_REG exception-table
 * entry and turned into "goto label" afterwards.  On comparison failure
 * __old is written back through @_pold.  Evaluates to the success flag.
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        int __err = 0;                                                        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     CC_SET(z)                                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[errout])                        \
                     : CC_OUT(z) (success),                                \
                       [errout] "+r" (__err),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory");                                        \
        if (unlikely(__err))                                                \
                goto label;                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
 * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
 * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
 * both ESI and EDI for the memory operand, compilation will fail if the error
 * is an input+output as there will be no register available for input.
 *
 * After the asm, __result is negative when the access faulted (goto
 * label), zero when the comparison failed (__old is written back through
 * @_pold) and nonzero otherwise.  The macro evaluates to __result.
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        int __result;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     "mov $0, %%ecx\n\t"                                \
                     "setz %%cl\n"                                        \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
                     : [result]"=c" (__result),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory", "cc");                                        \
        if (unlikely(__result < 0))                                        \
                goto label;                                                \
        if (unlikely(!__result))                                        \
                *_old = __old;                                                \
        likely(__result);                                        })
#endif // CONFIG_X86_32
#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT

/*
 * __m(x) reinterprets the user pointer x as an lvalue of
 * struct __large_struct (100 unsigned longs) so it can be passed as an "m"
 * operand to the inline asm in this file.
 */
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

/*
 * One "mov<itype>" of @x (input constraint @ltype) to user memory at
 * @addr; a fault at label 1 jumps to @label (%l2 is the label operand).
 *
 * Tell gcc we read from memory instead of writing: this is because
 * we do not write to any memory gcc knows about, so there are no
 * aliasing issues.
 */
#define __put_user_goto(x, addr, itype, ltype, label)                        \
        asm_volatile_goto("\n"                                                \
                "1:        mov"itype" %0,%1\n"                                \
                _ASM_EXTABLE_UA(1b, %l2)                                \
                : : ltype(x), "m" (__m(addr))                                \
                : : label)

/*
 * Out-of-line user-memory helpers; only the prototypes are visible here.
 * NOTE(review): the return-value conventions (bytes left, length, -errno)
 * are not shown in this header - check the definitions before relying on
 * them.  All but copy_from_user_nmi() are __must_check.
 */
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

unsigned long __must_check clear_user(void __user *mem, unsigned long len);
unsigned long __must_check __clear_user(void __user *mem, unsigned long len);

/*
 * Declared only when CONFIG_ARCH_HAS_COPY_MC is set.  The self-referential
 * #define makes copy_mc_to_kernel visible to the preprocessor - presumably so
 * generic code can detect the arch implementation with #ifndef (confirm).
 * Note that copy_mc_to_user() has no such #define here.
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
#define copy_mc_to_kernel copy_mc_to_kernel

unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned len);
#endif

/*
 * movsl can be slow when source and dest are not both 8-byte aligned
 *
 * movsl_mask is only declared when CONFIG_X86_INTEL_USERCOPY is set and is
 * defined elsewhere; the object is cacheline aligned on SMP builds.
 * NOTE(review): presumably .mask is an alignment mask consulted by the
 * usercopy code - confirm against its users.
 */
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
        int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif

/*
 * Feature flag for other headers.
 * NOTE(review): presumably advertises arch-provided nocache uaccess helpers -
 * confirm against the code that tests it.
 */
#define ARCH_HAS_NOCACHE_UACCESS 1

#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif

/*
 * Despite the name, the "unsafe" user accessors are not inherently unsafe.
 * The name is a warning: the caller must have passed access_ok() AND must
 * bracket the accesses with user_access_begin()/user_access_end().
 *
 * user_access_begin() does both halves of the opening step: it validates
 * [ptr, ptr + len) with access_ok() and, only if that succeeds, opens the
 * access section with __uaccess_begin_nospec().
 *
 * Return: true when the section was opened, false when access_ok() failed
 * (in which case nothing was opened and user_access_end() must not be called).
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        if (likely(access_ok(ptr, len))) {
                __uaccess_begin_nospec();
                return true;
        }

        return false;
}
/* Self-referential define: makes the arch user_access_begin() visible to the preprocessor. */
#define user_access_begin(a,b)        user_access_begin(a,b)
/* Closes the section that user_access_begin() opened with __uaccess_begin_nospec(). */
#define user_access_end()        __uaccess_end()

/* Thin aliases for smap_save() / smap_restore(); x is whatever smap_save() returned. */
#define user_access_save()        smap_save()
#define user_access_restore(x)        smap_restore(x)

/*
 * unsafe_put_user - store @x through user pointer @ptr
 *
 * @x is cast to the pointee type of @ptr and the access width is
 * sizeof(*(ptr)); the store itself is done by __put_user_size(), which
 * receives @label as its fault target.
 */
#define unsafe_put_user(x, ptr, label)        \
        __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)

/*
 * unsafe_get_user - load *@ptr from user space into @x
 *
 * Two variants, selected by CONFIG_CC_HAS_ASM_GOTO_OUTPUT:
 *  - with asm goto outputs, @err_label is passed straight to
 *    __get_user_size();
 *  - without, __get_user_size() fills a local error code and the macro jumps
 *    to @err_label itself.  In this variant @x is assigned *before* the
 *    error check, so it is written even when the access failed.
 *
 * In both variants the value travels through an __inttype() temporary and is
 * cast back to the pointee type with __force.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        int __gu_err;                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
        if (unlikely(__gu_err)) goto err_label;                                        \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Referenced from the default: arm of unsafe_try_cmpxchg_user(), i.e. for
 * operand sizes other than 1, 2, 4 or 8 bytes.
 * NOTE(review): presumably left undefined on purpose so that an unsupported
 * size fails at link time - confirm.
 */
extern void __try_cmpxchg_user_wrong_size(void);

/*
 * Outside CONFIG_X86_32 the 8-byte case maps onto the generic
 * __try_cmpxchg_user_asm() with the "q" suffix and an "r" constraint.
 */
#ifndef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)                \
        __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
#endif

/*
 * Force the pointer to u<size> to match the size expected by the asm helper.
 * clang/LLVM compiles all cases and only discards the unused paths after
 * processing errors, which breaks i386 if the pointer is an 8-byte value.
 *
 * unsafe_try_cmpxchg_user - size-dispatched compare-and-exchange on user memory
 * @_ptr:   user pointer; sizeof(*(_ptr)) selects the 1/2/4/8-byte variant
 * @_oldp:  forwarded unchanged to the selected asm helper
 * @_nval:  forwarded unchanged to the selected asm helper
 * @_label: forwarded to the asm helper as its fault target
 *
 * The 1-byte case uses the "q" register constraint, the 2- and 4-byte cases
 * use "r"; the 8-byte case goes through __try_cmpxchg64_user_asm().
 *
 * Evaluates to the bool produced by the selected helper.  For any other size
 * the macro calls __try_cmpxchg_user_wrong_size() and __ret is never
 * assigned.
 */
#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
        bool __ret;                                                                \
        __chk_user_ptr(_ptr);                                                        \
        switch (sizeof(*(_ptr))) {                                                \
        case 1:        __ret = __try_cmpxchg_user_asm("b", "q",                        \
                                               (__force u8 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 2:        __ret = __try_cmpxchg_user_asm("w", "r",                        \
                                               (__force u16 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 4:        __ret = __try_cmpxchg_user_asm("l", "r",                        \
                                               (__force u32 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 8:        __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
                                                 (_nval), _label);                \
                break;                                                                \
        default: __try_cmpxchg_user_wrong_size();                                \
        }                                                                        \
        __ret;                                                })

/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({                \
        int __ret = -EFAULT;                                                \
        __uaccess_begin_nospec();                                        \
        __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);        \
_label:                                                                        \
        __uaccess_end();                                                \
        __ret;                                                                \
                                                        })

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop - copy as many whole @type units as still fit in @len
 *
 * @dst, @src and @len are modified in place (advanced / decremented by
 * sizeof(type) on every iteration) and are not parenthesised, so callers
 * must pass plain lvalues.  Each store goes through unsafe_put_user() with
 * @label as its fault target.  The expansion is a bare while statement, not
 * a do { } while (0) block.
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * unsafe_copy_to_user - copy @_len bytes from kernel @_src to user @_dst
 *
 * Each argument is evaluated once into a local, then the copy proceeds in
 * u64, u32, u16 and finally u8 units; every narrower pass handles whatever
 * remainder the wider one left behind.  @label is passed down to
 * unsafe_copy_loop() as the fault target.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

/*
 * NOTE(review): presumably tells generic code that this architecture supplies
 * its own __get_kernel_nofault()/__put_kernel_nofault() - confirm.
 */
#define HAVE_GET_KERNEL_NOFAULT

/*
 * __get_kernel_nofault - read one @type object from @src into @dst
 *
 * Reuses __get_user_size(), force-casting @src to a __user pointer.  With
 * asm goto outputs @err_label is passed directly; otherwise a local error
 * code is filled in and the macro jumps to @err_label itself.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        int __kr_err;                                                        \
                                                                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), __kr_err);                        \
        if (unlikely(__kr_err))                                                \
                goto err_label;                                                \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __put_kernel_nofault - write the @type object at @src to @dst
 *
 * Reuses __put_user_size(), force-casting @dst to a __user pointer;
 * @err_label is handed to it as the fault target.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
        __put_user_size(*((type *)(src)), (__force type __user *)(dst),        \
                        sizeof(type), err_label)

#endif /* _ASM_X86_UACCESS_H */





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/cpu.h - generic cpu definition
 *
 * This is mainly for topological representation. We define the 
 * basic 'struct cpu' here, which can be embedded in per-arch 
 * definitions of processors.
 *
 * Basic handling of the devices is done in drivers/base/cpu.c
 *
 * CPUs are exported via sysfs in the devices/system/cpu
 * directory. 
 */
#ifndef _LINUX_CPU_H_
#define _LINUX_CPU_H_

#include <linux/node.h>
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuhotplug.h>

struct device;
struct device_node;
struct attribute_group;

/*
 * struct cpu - basic CPU descriptor
 *
 * Per the file header, intended to be embedded in per-arch processor
 * definitions.
 */
struct cpu {
        int node_id;                /* The node which contains the CPU */
        int hotpluggable;        /* creates sysfs control file if hotpluggable */
        struct device dev;        /* device object, embedded by value */
};

/* Initialisation hooks; declared here, implemented elsewhere. */
extern void boot_cpu_init(void);
extern void boot_cpu_hotplug_init(void);
extern void cpu_init(void);
extern void trap_init(void);

/* CPU device registration and lookup. */
extern int register_cpu(struct cpu *cpu, int num);
extern struct device *get_cpu_device(unsigned cpu);
extern bool cpu_is_hotpluggable(unsigned cpu);
/*
 * Arch hooks matching a logical cpu against a physical id / device-tree node.
 * NOTE(review): exact matching rules are arch-defined - see the
 * implementations.
 */
extern bool arch_match_cpu_phys_id(int cpu, u64 phys_id);
extern bool arch_find_n_match_cpu_physical_id(struct device_node *cpun,
                                              int cpu, unsigned int *thread);

/* Add/remove a single device attribute or an attribute group for CPUs. */
extern int cpu_add_dev_attr(struct device_attribute *attr);
extern void cpu_remove_dev_attr(struct device_attribute *attr);

extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);

/*
 * Show callbacks, one per reported CPU vulnerability.  Each has the
 * (dev, attr, buf) -> ssize_t shape of a device attribute show() method.
 * NOTE(review): presumably wired to sysfs files by drivers/base/cpu.c, which
 * the file header names as the device-handling code - confirm.
 */
extern ssize_t cpu_show_meltdown(struct device *dev,
                                 struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spectre_v1(struct device *dev,
                                   struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spectre_v2(struct device *dev,
                                   struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
                                          struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_l1tf(struct device *dev,
                             struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_mds(struct device *dev,
                            struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf);
extern ssize_t cpu_show_itlb_multihit(struct device *dev,
                                      struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf);
extern ssize_t cpu_show_retbleed(struct device *dev,
                                 struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev,
                                             struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_gds(struct device *dev,
                            struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev,
                                               struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_indirect_target_selection(struct device *dev,
                                                  struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf);

/*
 * cpu_device_create() takes a printf-style format: __printf(4, 5) marks
 * argument 4 (@fmt) as the format string and argument 5 as the first
 * variadic argument, so the compiler can type-check callers.
 */
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
                                 const struct attribute_group **groups,
                                 const char *fmt, ...);
/* Declared only with CONFIG_HOTPLUG_CPU; no fallbacks are provided here. */
#ifdef CONFIG_HOTPLUG_CPU
extern void unregister_cpu(struct cpu *cpu);
extern ssize_t arch_cpu_probe(const char *, size_t);
extern ssize_t arch_cpu_release(const char *, size_t);
#endif

/*
 * These states are not related to the core CPU hotplug mechanism. They are
 * used by various (sub)architectures to track internal state
 *
 * NOTE(review): the numbering has gaps (0x0002, 0x0003, 0x0007, ...), so do
 * not assume the values are contiguous or usable as bit flags.
 */
#define CPU_ONLINE                0x0002 /* CPU is up */
#define CPU_UP_PREPARE                0x0003 /* CPU coming up */
#define CPU_DEAD                0x0007 /* CPU dead */
#define CPU_DEAD_FROZEN                0x0008 /* CPU timed out on unplug */
#define CPU_POST_DEAD                0x0009 /* CPU successfully unplugged */
#define CPU_BROKEN                0x000B /* CPU did not die properly */

/*
 * SMP bring-up interface.  Without CONFIG_SMP, cpuhp_tasks_frozen becomes the
 * constant 0 and cpu_maps_update_begin()/cpu_maps_update_done() become empty
 * inlines; the remaining functions have no non-SMP fallback in this header.
 */
#ifdef CONFIG_SMP
extern bool cpuhp_tasks_frozen;
int add_cpu(unsigned int cpu);
int cpu_device_up(struct device *dev);
void notify_cpu_starting(unsigned int cpu);
extern void cpu_maps_update_begin(void);
extern void cpu_maps_update_done(void);
int bringup_hibernate_cpu(unsigned int sleep_cpu);
void bringup_nonboot_cpus(unsigned int setup_max_cpus);

#else        /* CONFIG_SMP */
#define cpuhp_tasks_frozen        0

static inline void cpu_maps_update_begin(void)
{
}

static inline void cpu_maps_update_done(void)
{
}

#endif /* CONFIG_SMP */
/* Declared unconditionally; defined elsewhere. */
extern struct bus_type cpu_subsys;

/*
 * CPU hotplug locking API.
 *
 * Without CONFIG_HOTPLUG_CPU every lock/unlock/assert helper collapses to an
 * empty inline and cpus_read_trylock() always reports success.  Note that
 * clear_tasks_mm_cpumask(), remove_cpu() and cpu_device_down() have no
 * fallback in the !CONFIG_HOTPLUG_CPU branch.
 */
#ifdef CONFIG_HOTPLUG_CPU
extern void cpus_write_lock(void);
extern void cpus_write_unlock(void);
extern void cpus_read_lock(void);
extern void cpus_read_unlock(void);
extern int  cpus_read_trylock(void);
extern void lockdep_assert_cpus_held(void);
extern void cpu_hotplug_disable(void);
extern void cpu_hotplug_enable(void);
void clear_tasks_mm_cpumask(int cpu);
int remove_cpu(unsigned int cpu);
int cpu_device_down(struct device *dev);
extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu);

#else /* CONFIG_HOTPLUG_CPU */

static inline void cpus_write_lock(void) { }
static inline void cpus_write_unlock(void) { }
static inline void cpus_read_lock(void) { }
static inline void cpus_read_unlock(void) { }
/* Returns int, not bool: 'true' converts to 1 here. */
static inline int  cpus_read_trylock(void) { return true; }
static inline void lockdep_assert_cpus_held(void) { }
static inline void cpu_hotplug_disable(void) { }
static inline void cpu_hotplug_enable(void) { }
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
#endif        /* !CONFIG_HOTPLUG_CPU */

/*
 * Legacy names, kept only until every caller has been converted to the
 * cpus_{read,write}_{lock,unlock}() API.  Each one forwards to exactly one
 * of those functions.
 */
static inline void cpu_hotplug_begin(void)
{
        cpus_write_lock();
}

static inline void cpu_hotplug_done(void)
{
        cpus_write_unlock();
}

static inline void get_online_cpus(void)
{
        cpus_read_lock();
}

static inline void put_online_cpus(void)
{
        cpus_read_unlock();
}

#ifdef CONFIG_PM_SLEEP_SMP
/* Implemented elsewhere; only declared under CONFIG_PM_SLEEP_SMP. */
extern int freeze_secondary_cpus(int primary);
extern void thaw_secondary_cpus(void);

/*
 * Calls freeze_secondary_cpus() with CPU 0 as the primary, unless
 * CONFIG_PM_SLEEP_SMP_NONZERO_CPU is enabled, in which case -1 is passed
 * instead.  Returns whatever freeze_secondary_cpus() returns.
 */
static inline int suspend_disable_secondary_cpus(void)
{
        const int primary = IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU) ? -1 : 0;

        return freeze_secondary_cpus(primary);
}
/*
 * Counterpart of suspend_disable_secondary_cpus(); simply calls
 * thaw_secondary_cpus().
 */
static inline void suspend_enable_secondary_cpus(void)
{
        /*
         * Plain call, not 'return thaw_secondary_cpus();': ISO C does not
         * allow a return statement with an expression in a void function,
         * even when the expression itself has type void.
         */
        thaw_secondary_cpus();
}

#else /* !CONFIG_PM_SLEEP_SMP */
/*
 * Fallbacks when CONFIG_PM_SLEEP_SMP is off: thawing and enabling do nothing,
 * and suspend_disable_secondary_cpus() returns 0.  There is no fallback for
 * freeze_secondary_cpus().
 */
static inline void thaw_secondary_cpus(void) {}
static inline int suspend_disable_secondary_cpus(void) { return 0; }
static inline void suspend_enable_secondary_cpus(void) { }
#endif /* !CONFIG_PM_SLEEP_SMP */

/* Idle-loop entry points; declared here, implemented elsewhere. */
void cpu_startup_entry(enum cpuhp_state state);

void cpu_idle_poll_ctrl(bool enable);

/*
 * Attach to any functions which should be considered cpuidle.
 * Expands to a section attribute placing the function in ".cpuidle.text".
 */
#define __cpuidle        __section(".cpuidle.text")

/* NOTE(review): presumably reports whether pc lies inside .cpuidle.text - confirm. */
bool cpu_in_idle(unsigned long pc);

/* Architecture idle hooks. */
void arch_cpu_idle(void);
void arch_cpu_idle_prepare(void);
void arch_cpu_idle_enter(void);
void arch_cpu_idle_exit(void);
void arch_cpu_idle_dead(void);

/*
 * Optional arch hook: an out-of-line function when
 * CONFIG_ARCH_HAS_CPU_FINALIZE_INIT is set, an empty inline otherwise, so
 * callers never need their own #ifdef.
 */
#ifdef CONFIG_ARCH_HAS_CPU_FINALIZE_INIT
void arch_cpu_finalize_init(void);
#else
static inline void arch_cpu_finalize_init(void) { }
#endif

/* Per-CPU state bookkeeping helpers; semantics live with the definitions. */
int cpu_report_state(int cpu);
int cpu_check_up_prepare(int cpu);
void cpu_set_state_online(int cpu);
/* Both arguments are in nanoseconds, as their names say; see play_idle() for a microsecond wrapper. */
void play_idle_precise(u64 duration_ns, u64 latency_ns);

static inline void play_idle(unsigned long duration_us)
{
        play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX);
}

/*
 * CPU death reporting.  Only cpuhp_report_idle_dead() has a no-op fallback
 * when CONFIG_HOTPLUG_CPU is off; cpu_wait_death() and cpu_report_death()
 * are simply not declared in that configuration.
 */
#ifdef CONFIG_HOTPLUG_CPU
bool cpu_wait_death(unsigned int cpu, int seconds);
bool cpu_report_death(void);
void cpuhp_report_idle_dead(void);
#else
static inline void cpuhp_report_idle_dead(void) { }
#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * SMT control states, as stored in cpu_smt_control.
 * NOTE(review): the individual meanings are only implied by the enumerator
 * names - confirm against the code that sets cpu_smt_control.
 */
enum cpuhp_smt_control {
        CPU_SMT_ENABLED,
        CPU_SMT_DISABLED,
        CPU_SMT_FORCE_DISABLED,
        CPU_SMT_NOT_SUPPORTED,
        /* Value cpu_smt_control is hard-wired to without CONFIG_SMP && CONFIG_HOTPLUG_SMT. */
        CPU_SMT_NOT_IMPLEMENTED,
};

/*
 * SMT control API.  It is real only when both CONFIG_SMP and
 * CONFIG_HOTPLUG_SMT are set.  Otherwise cpu_smt_control is the constant
 * CPU_SMT_NOT_IMPLEMENTED, cpu_smt_possible() is false, the enable/disable
 * calls return 0 and the remaining helpers do nothing.
 */
#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
extern enum cpuhp_smt_control cpu_smt_control;
extern void cpu_smt_disable(bool force);
extern void cpu_smt_check_topology(void);
extern bool cpu_smt_possible(void);
extern int cpuhp_smt_enable(void);
extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
#else
# define cpu_smt_control                (CPU_SMT_NOT_IMPLEMENTED)
static inline void cpu_smt_disable(bool force) { }
static inline void cpu_smt_check_topology(void) { }
static inline bool cpu_smt_possible(void) { return false; }
static inline int cpuhp_smt_enable(void) { return 0; }
static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
#endif

/*
 * Global mitigation policy queries; declared unconditionally.
 * NOTE(review): presumably driven by a boot-time setting - confirm at the
 * definitions.
 */
extern bool cpu_mitigations_off(void);
extern bool cpu_mitigations_auto_nosmt(void);

#endif /* _LINUX_CPU_H_ */















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 * Authors:        Lotsa people, from code originally in tcp
 */

#ifndef _INET6_HASHTABLES_H
#define _INET6_HASHTABLES_H


#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/jhash.h>

#include <net/inet_sock.h>

#include <net/ipv6.h>
#include <net/netns/hash.h>

struct inet_hashinfo;

static inline unsigned int __inet6_ehashfn(const u32 lhash,
                                    const u16 lport,
                                    const u32 fhash,
                                    const __be16 fport,
                                    const u32 initval)
{
        const u32 ports = (((u32)lport) << 16) | (__force u32)fport;
        return jhash_3words(lhash, fhash, ports, initval);
}

/*
 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * The sockhash lock must be held as a reader here.
 *
 * @sport is a __be16 while @hnum is a plain u16; the in-file caller
 * __inet6_lookup_skb() passes ntohs(dport) for it, i.e. host byte order.
 */
struct sock *__inet6_lookup_established(struct net *net,
                                        struct inet_hashinfo *hashinfo,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 hnum, const int dif,
                                        const int sdif);

/*
 * Function type (not pointer type) for established-hash functions taking a
 * netns, local address/port and foreign address/port.  Declaring
 * "inet6_ehashfn_t inet6_ehashfn;" therefore declares a function with that
 * signature; callers that need a pointer use "inet6_ehashfn_t *".
 */
typedef u32 (inet6_ehashfn_t)(const struct net *net,
                               const struct in6_addr *laddr, const u16 lport,
                               const struct in6_addr *faddr, const __be16 fport);

inet6_ehashfn_t inet6_ehashfn;

/* NOTE(review): presumably only declares udp6_ehashfn when indirect-call wrappers are enabled - confirm. */
INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn);

/*
 * Out-of-line lookup helpers.  inet6_lookup_reuseport() takes the hash
 * function to use as an explicit @ehashfn pointer.
 */
struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, int doff,
                                    const struct in6_addr *saddr,
                                    __be16 sport,
                                    const struct in6_addr *daddr,
                                    unsigned short hnum,
                                    inet6_ehashfn_t *ehashfn);

/* Listener lookup; used by __inet6_lookup() when no established socket matches. */
struct sock *inet6_lookup_listener(struct net *net,
                                   struct inet_hashinfo *hashinfo,
                                   struct sk_buff *skb, int doff,
                                   const struct in6_addr *saddr,
                                   const __be16 sport,
                                   const struct in6_addr *daddr,
                                   const unsigned short hnum,
                                   const int dif, const int sdif);

/*
 * Two-stage socket lookup: the established table first, listeners second.
 *
 * *@refcounted tells the caller which stage produced the result: it is true
 * when an established socket was found, and false for whatever the listener
 * lookup returned (including NULL).
 */
static inline struct sock *__inet6_lookup(struct net *net,
                                          struct inet_hashinfo *hashinfo,
                                          struct sk_buff *skb, int doff,
                                          const struct in6_addr *saddr,
                                          const __be16 sport,
                                          const struct in6_addr *daddr,
                                          const u16 hnum,
                                          const int dif, const int sdif,
                                          bool *refcounted)
{
        struct sock *established;

        established = __inet6_lookup_established(net, hashinfo, saddr, sport,
                                                 daddr, hnum, dif, sdif);
        if (established) {
                *refcounted = true;
                return established;
        }

        *refcounted = false;
        return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
                                     daddr, hnum, dif, sdif);
}

/*
 * Find the socket for an incoming skb.
 *
 * A socket already attached to the skb wins: skb_steal_sock() hands it over
 * and fills in *@refcounted.  Otherwise the addresses are taken from the
 * skb's IPv6 header, the netns from the skb's dst device, and @dport is
 * converted to host order before falling back to __inet6_lookup().
 */
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
                                              struct sk_buff *skb, int doff,
                                              const __be16 sport,
                                              const __be16 dport,
                                              int iif, int sdif,
                                              bool *refcounted)
{
        struct sock *sk = skb_steal_sock(skb, refcounted);

        if (!sk)
                sk = __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
                                    doff, &ipv6_hdr(skb)->saddr, sport,
                                    &ipv6_hdr(skb)->daddr, ntohs(dport),
                                    iif, sdif, refcounted);

        return sk;
}

/*
 * Out-of-line lookup entry point.  Unlike __inet6_lookup() it takes the
 * destination port as __be16 and has no sdif or refcounted parameters.
 */
struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
                          struct sk_buff *skb, int doff,
                          const struct in6_addr *saddr, const __be16 sport,
                          const struct in6_addr *daddr, const __be16 dport,
                          const int dif);

/* NOTE(review): presumably inserts sk into the hash tables and returns 0 or -errno - confirm. */
int inet6_hash(struct sock *sk);

/*
 * Does @sk match the given netns, address pair, port pair and interface?
 *
 * Note the crossover: the packet's @saddr is compared with the socket's
 * sk_v6_daddr and the packet's @daddr with sk_v6_rcv_saddr.  The checks run
 * cheapest-first and stop at the first mismatch; the bound-device test is
 * evaluated last.
 */
static inline bool inet6_match(struct net *net, const struct sock *sk,
                               const struct in6_addr *saddr,
                               const struct in6_addr *daddr,
                               const __portpair ports,
                               const int dif, const int sdif)
{
        if (!net_eq(sock_net(sk), net))
                return false;

        if (sk->sk_family != AF_INET6 || sk->sk_portpair != ports)
                return false;

        if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
                return false;

        if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
                return false;

        /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
        return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
                                    sdif);
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

#endif /* _INET6_HASHTABLES_H */












































































































    3 






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc.
 */

/**
 * arch_atomic_read - read atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically reads the value of @v.
 *
 * Return: the value of @v->counter, loaded through __READ_ONCE().
 */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
         * it's non-inlined function that increases binary size and stack usage.
         */
        return __READ_ONCE((v)->counter);
}

/**
 * arch_atomic_set - set atomic variable
 * @v: pointer of type atomic_t
 * @i: required value
 *
 * Atomically sets the value of @v to @i.  The store to @v->counter goes
 * through __WRITE_ONCE().
 */
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

/**
 * arch_atomic_add - add integer to atomic variable
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v.
 *
 * Emitted as a single LOCK_PREFIX addl on @v->counter ("+m": read-write
 * memory operand); @i may be an immediate or a register ("ir").  Nothing is
 * returned.
 */
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "addl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/**
 * arch_atomic_sub - subtract integer from atomic variable
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v.
 *
 * Emitted as a single LOCK_PREFIX subl on @v->counter; @i may be an
 * immediate or a register ("ir").  Nothing is returned.
 */
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "subl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/**
 * arch_atomic_sub_and_test - subtract value from variable and test result
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v and returns
 * true if the result is zero, or false for all
 * other cases.
 *
 * Built from GEN_BINARY_RMWcc() with a LOCK_PREFIX subl and the "e"
 * condition.
 */
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

/**
 * arch_atomic_inc - increment atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically increments @v by 1.
 *
 * Emitted as a single LOCK_PREFIX incl on @v->counter; nothing is returned.
 */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "incl %0"
                     : "+m" (v->counter) :: "memory");
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_inc arch_atomic_inc

/**
 * arch_atomic_dec - decrement atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically decrements @v by 1.
 *
 * Emitted as a single LOCK_PREFIX decl on @v->counter; nothing is returned.
 */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "decl %0"
                     : "+m" (v->counter) :: "memory");
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_dec arch_atomic_dec

/**
 * arch_atomic_dec_and_test - decrement and test
 * @v: pointer of type atomic_t
 *
 * Atomically decrements @v by 1 and
 * returns true if the result is 0, or false for all other
 * cases.
 *
 * Built from GEN_UNARY_RMWcc() with a LOCK_PREFIX decl and the "e"
 * condition.
 */
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

/**
 * arch_atomic_inc_and_test - increment and test
 * @v: pointer of type atomic_t
 *
 * Atomically increments @v by 1
 * and returns true if the result is zero, or false for all
 * other cases.
 *
 * Built from GEN_UNARY_RMWcc() with a LOCK_PREFIX incl and the "e"
 * condition.
 */
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

/**
 * arch_atomic_add_negative - add and test if negative
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v and returns true
 * if the result is negative, or false when
 * result is greater than or equal to zero.
 *
 * Built from GEN_BINARY_RMWcc() with a LOCK_PREFIX addl and the "s" (sign)
 * condition.
 */
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_add_negative arch_atomic_add_negative

/**
 * arch_atomic_add_return - add integer and return
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v and returns @i + @v
 *
 * Return: the new value, computed as @i plus whatever xadd() returned for
 * @v->counter.
 */
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        return i + xadd(&v->counter, i);
}
/* Self-referential define: marks this operation as provided by the arch. */
#define arch_atomic_add_return arch_atomic_add_return

/**
 * arch_atomic_sub_return - subtract integer and return
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v and returns @v - @i.  Implemented as an
 * addition of the negated operand via arch_atomic_add_return().
 */
static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
{
        int negated = -i;

        return arch_atomic_add_return(negated, v);
}
#define arch_atomic_sub_return arch_atomic_sub_return

/*
 * arch_atomic_fetch_add - add @i to @v and return whatever xadd() yields
 * (presumably the counter value from before the addition -- confirm against
 * the xadd() definition).
 */
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        int prev = xadd(&v->counter, i);

        return prev;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

/*
 * arch_atomic_fetch_sub - subtract @i from @v by xadd()-ing the negated
 * operand, and return whatever xadd() yields (presumably the counter value
 * from before the subtraction -- confirm against the xadd() definition).
 */
static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
{
        int prev = xadd(&v->counter, -i);

        return prev;
}
#define arch_atomic_fetch_sub arch_atomic_fetch_sub

/*
 * arch_atomic_cmpxchg - compare-and-exchange on @v's counter.
 * Forwards @old and @new to arch_cmpxchg() and returns its result.
 */
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int seen = arch_cmpxchg(&v->counter, old, new);

        return seen;
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

/*
 * arch_atomic_try_cmpxchg - boolean compare-and-exchange on @v's counter.
 * Forwards @old (passed by pointer) and @new to try_cmpxchg() and returns
 * its success indication.
 */
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped = try_cmpxchg(&v->counter, old, new);

        return swapped;
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg

/*
 * arch_atomic_xchg - exchange @v's counter with @new.
 * Forwards to arch_xchg() and returns its result.
 */
static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
        int prev = arch_xchg(&v->counter, new);

        return prev;
}
#define arch_atomic_xchg arch_atomic_xchg

/*
 * arch_atomic_and - AND @i into @v's counter.
 * @i: mask to AND into the counter
 * @v: pointer of type atomic_t
 *
 * Issues LOCK_PREFIX "andl" directly on v->counter; nothing is returned.
 * arch_atomic_fetch_and() is the variant that returns a value.
 */
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "andl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_and - AND @i into @v and return the value it replaced.
 *
 * Retries arch_atomic_try_cmpxchg() until it succeeds.  The loop relies on
 * the expected value being refreshed through the pointer on failure, so the
 * new value is recomputed on every attempt.
 */
static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
        int old = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &old, old & i))
                ;

        return old;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and

/*
 * arch_atomic_or - OR @i into @v's counter.
 * @i: bits to OR into the counter
 * @v: pointer of type atomic_t
 *
 * Issues LOCK_PREFIX "orl" directly on v->counter; nothing is returned.
 * arch_atomic_fetch_or() is the variant that returns a value.
 */
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "orl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_or - OR @i into @v and return the value it replaced.
 *
 * Retries arch_atomic_try_cmpxchg() until it succeeds.  The loop relies on
 * the expected value being refreshed through the pointer on failure, so the
 * new value is recomputed on every attempt.
 */
static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
        int old = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &old, old | i))
                ;

        return old;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or

/*
 * arch_atomic_xor - XOR @i into @v's counter.
 * @i: bits to XOR into the counter
 * @v: pointer of type atomic_t
 *
 * Issues LOCK_PREFIX "xorl" directly on v->counter; nothing is returned.
 * arch_atomic_fetch_xor() is the variant that returns a value.
 */
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "xorl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_xor - XOR @i into @v and return the value it replaced.
 *
 * Retries arch_atomic_try_cmpxchg() until it succeeds.  The loop relies on
 * the expected value being refreshed through the pointer on failure, so the
 * new value is recomputed on every attempt.
 */
static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int old = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &old, old ^ i))
                ;

        return old;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor

#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif

#define ARCH_ATOMIC

#endif /* _ASM_X86_ATOMIC_H */




















































































































































































































































































































    1 























    4 










    2 



    2 

    1 




























































    1 





    1 






    1 







































































































































































    9 


    9 


































    1 

















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file was closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queue overflowed */
#define FS_ERROR                0x00008000        /* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR (both are 0x00008000).  It is only used
 * internally by inotify, which does not support FS_ERROR.
 */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

#define FS_OPEN_PERM                0x00010000        /* open event in a permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permission hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_RENAME                0x10000000        /* File was renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */

#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to the directory
 * where the entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

/* All events that are raised from a permission hook */
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
                                  FS_OPEN_EXEC_PERM)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 * As defined here it is currently identical to FS_EVENTS_POSS_ON_CHILD.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/*
 * Events that can be reported to backends.
 * FS_IN_IGNORED and FS_ERROR are the same bit (0x00008000), so listing both
 * below is redundant but harmless.
 */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
                             FS_ERROR)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)

/* Every bit that may legitimately appear in an fsnotify mask */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 *                Either @inode or @dir must be non-NULL.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 * free_event - NOTE(review): presumably releases @event on behalf of @group;
 *                the exact calling context is not visible in this header.
 * free_mark - called on final put+free of a mark to free its memory
 */
struct fsnotify_ops {
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        void (*free_group_priv)(struct fsnotify_group *group);
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * All of the information about the original object that we now want to send
 * to a group.  If you want to carry more info from the accessing task to the
 * listener, this structure is where you need to be adding fields.
 */
struct fsnotify_event {
        /* List linkage; NOTE(review): presumably into group->notification_list */
        struct list_head list;
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 *
 * NOTE(review): no mask member is visible in this structure; the sentence
 * about "the mask" may be stale -- confirm.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        /*
         * Valid fsnotify group priorities.  Events are sent in order from highest
         * priority to lowest priority.  We default to the lowest priority.
         */
        #define FS_PRIO_0        0 /* normal notifiers, no permissions */
        #define FS_PRIO_1        1 /* fanotify content based access control */
        #define FS_PRIO_2        2 /* fanotify pre-content access */
        unsigned int priority;
        bool shutdown;                /* group is being shut down, don't queue more events */

#define FSNOTIFY_GROUP_USER        0x01 /* user allocated group */
#define FSNOTIFY_GROUP_DUPS        0x02 /* allow multiple marks per object */
#define FSNOTIFY_GROUP_NOFS        0x04 /* group lock is not direct reclaim safe */
        int flags;                        /* FSNOTIFY_GROUP_* bits defined above */
        unsigned int owner_flags;        /* stored flags of mark_mutex owner */

        /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */

        /* groups can define private fields here or use the void *private */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
                struct fanotify_group_private_data {
                        /* Hash table of events for merge */
                        struct hlist_head *merge_hash;
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
                        mempool_t error_events_pool;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * Take the group's mark_mutex.  For groups flagged FSNOTIFY_GROUP_NOFS the
 * value returned by memalloc_nofs_save() is stashed in group->owner_flags so
 * it can later be handed to memalloc_nofs_restore() on unlock.
 *
 * The lock/unlock helpers are used to prevent deadlock when reclaiming
 * inodes with evictable marks of the same group that is allocating a new
 * mark.
 */
static inline void fsnotify_group_lock(struct fsnotify_group *group)
{
        mutex_lock(&group->mark_mutex);

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        group->owner_flags = memalloc_nofs_save();
}

static inline void fsnotify_group_unlock(struct fsnotify_group *group)
{
        if (group->flags & FSNOTIFY_GROUP_NOFS)
                memalloc_nofs_restore(group->owner_flags);
        mutex_unlock(&group->mark_mutex);
}

/*
 * Warn (once per call site) if the group's mark_mutex is not locked, and,
 * for FSNOTIFY_GROUP_NOFS groups, if the current task does not have
 * PF_MEMALLOC_NOFS set.
 */
static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
{
        WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}

/*
 * When calling fsnotify tell it if the data is a path or inode.
 * The fsnotify_data_*() accessors in this header switch on these values to
 * decide how to interpret the opaque data pointer.
 */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,        /* not handled by any accessor: they return NULL */
        FSNOTIFY_EVENT_PATH,        /* data is a struct path */
        FSNOTIFY_EVENT_INODE,        /* data is a struct inode */
        FSNOTIFY_EVENT_DENTRY,        /* data is a struct dentry */
        FSNOTIFY_EVENT_ERROR,        /* data is a struct fs_error_report */
};

/*
 * Payload passed as event data when the data type is FSNOTIFY_EVENT_ERROR.
 * fsnotify_data_inode() and fsnotify_data_sb() read ->inode and ->sb from it.
 */
struct fs_error_report {
        int error;                /* NOTE(review): presumably an errno-style code -- confirm */
        struct inode *inode;        /* inode the report refers to */
        struct super_block *sb;        /* superblock the report refers to */
};

static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        switch (data_type) {
        case FSNOTIFY_EVENT_INODE:
                return (struct inode *)data;
        case FSNOTIFY_EVENT_DENTRY:
                return d_inode(data);
        case FSNOTIFY_EVENT_PATH:
                return d_inode(((const struct path *)data)->dentry);
        case FSNOTIFY_EVENT_ERROR:
                return ((struct fs_error_report *)data)->inode;
        default:
                return NULL;
        }
}

/*
 * Extract the dentry an event refers to from its opaque @data pointer.
 * @data:        event payload, interpreted according to @data_type
 * @data_type:        one of enum fsnotify_data_type
 *
 * Returns the dentry for DENTRY and PATH payloads, or NULL otherwise.
 */
static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
{
        /* Non const is needed for dget() */
        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return (struct dentry *)data;

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = data;

                return path->dentry;
        }

        return NULL;
}

/*
 * Return @data as a struct path when @data_type is FSNOTIFY_EVENT_PATH,
 * or NULL for every other data type.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type != FSNOTIFY_EVENT_PATH)
                return NULL;

        return data;
}

static inline struct super_block *fsnotify_data_sb(const void *data,
                                                   int data_type)
{
        switch (data_type) {
        case FSNOTIFY_EVENT_INODE:
                return ((struct inode *)data)->i_sb;
        case FSNOTIFY_EVENT_DENTRY:
                return ((struct dentry *)data)->d_sb;
        case FSNOTIFY_EVENT_PATH:
                return ((const struct path *)data)->dentry->d_sb;
        case FSNOTIFY_EVENT_ERROR:
                return ((struct fs_error_report *) data)->sb;
        default:
                return NULL;
        }
}

/*
 * Return @data as a struct fs_error_report when @data_type is
 * FSNOTIFY_EVENT_ERROR, or NULL for every other data type.
 */
static inline struct fs_error_report *fsnotify_data_error_report(
                                                        const void *data,
                                                        int data_type)
{
        if (data_type != FSNOTIFY_EVENT_ERROR)
                return NULL;

        return (struct fs_error_report *) data;
}

/*
 * Index to merged marks iterator array that correlates to a type of watch.
 * The type of watched object can be deduced from the iterator type, but not
 * the other way around, because an event can match different watched objects
 * of the same object type.
 * For example, both parent and child are watching an object of type inode.
 *
 * FSNOTIFY_ITER_TYPE_COUNT is not a real type: it sizes the marks[] array in
 * struct fsnotify_iter_info and bounds the iteration helpers.
 */
enum fsnotify_iter_type {
        FSNOTIFY_ITER_TYPE_INODE,
        FSNOTIFY_ITER_TYPE_VFSMOUNT,
        FSNOTIFY_ITER_TYPE_SB,
        FSNOTIFY_ITER_TYPE_PARENT,
        FSNOTIFY_ITER_TYPE_INODE2,
        FSNOTIFY_ITER_TYPE_COUNT
};

/*
 * The type of object that a mark is attached to.
 *
 * FSNOTIFY_OBJ_TYPE_ANY is -1 and FSNOTIFY_OBJ_TYPE_DETACHED aliases
 * FSNOTIFY_OBJ_TYPE_COUNT, so neither passes fsnotify_valid_obj_type().
 */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_ANY = -1,
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_COUNT,
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

/*
 * Return true when @obj_type is below FSNOTIFY_OBJ_TYPE_COUNT.  The
 * parameter is unsigned, so a negative enumerator passed in compares as a
 * large value and is rejected.
 */
static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
{
        const unsigned int limit = FSNOTIFY_OBJ_TYPE_COUNT;

        return obj_type < limit;
}

/*
 * Per-event iteration state handed to handle_event().
 */
struct fsnotify_iter_info {
        /* One mark slot per enum fsnotify_iter_type value */
        struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
        /* NOTE(review): presumably the group currently being notified -- confirm */
        struct fsnotify_group *current_group;
        /* Bit (1U << iter_type) set for every slot in marks[] to be reported */
        unsigned int report_mask;
        /* NOTE(review): presumably the SRCU read-side index held while iterating -- confirm */
        int srcu_idx;
};

/*
 * Return true when the bit for @iter_type is set in iter_info->report_mask.
 */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        return (iter_info->report_mask & type_bit) != 0;
}

/*
 * Set the bit for @iter_type in iter_info->report_mask.
 */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        iter_info->report_mask |= type_bit;
}

/*
 * Return the mark stored for @iter_type, but only if that type is flagged
 * in the report mask; otherwise return NULL.
 */
static inline struct fsnotify_mark *fsnotify_iter_mark(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        if (!fsnotify_iter_should_report_type(iter_info, iter_type))
                return NULL;

        return iter_info->marks[iter_type];
}

/*
 * Starting at @type, advance to the first iterator type that has a
 * reportable mark and store that mark in *@markp.
 *
 * Returns the type found, or FSNOTIFY_ITER_TYPE_COUNT (or the unchanged
 * @type if it was already out of range) when none remains.  *@markp is left
 * untouched if @type is already out of range on entry.
 */
static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
                                     struct fsnotify_mark **markp)
{
        for (; type < FSNOTIFY_ITER_TYPE_COUNT; type++) {
                *markp = fsnotify_iter_mark(iter, type);
                if (*markp)
                        break;
        }

        return type;
}

/*
 * Generate fsnotify_iter_<name>_mark(), a typed shorthand for
 * fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_<NAME>).
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
}

/* Accessors for the inode, parent, vfsmount and sb iterator slots */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/* Iterate @type over every iterator type, reportable or not */
#define fsnotify_foreach_iter_type(type) \
        for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
/*
 * Iterate only over iterator types that have a reportable mark.  The loop
 * condition uses the comma operator: fsnotify_iter_step() first advances
 * @type to the next such type (storing its mark in @mark), then the bound
 * check decides whether the body runs.
 */
#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
        for (type = 0; \
             type = fsnotify_iter_step(iter, type, &mark), \
             type < FSNOTIFY_ITER_TYPE_COUNT; \
             type++)

/*
 * fsnotify_connp_t is what we embed in objects that a connector can be
 * attached to. fsnotify_connp_t * is how we refer from a connector back to
 * its object.
 */
struct fsnotify_mark_connector;
typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 *
 * Text in brackets names the lock protecting the field.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned short type;        /* Type of object [lock] */
#define FSNOTIFY_CONN_FLAG_HAS_FSID        0x01
#define FSNOTIFY_CONN_FLAG_HAS_IREF        0x02
        unsigned short flags;        /* FSNOTIFY_CONN_FLAG_* bits [lock] */
        __kernel_fsid_t fsid;        /* fsid of filesystem containing object */
        union {
                /* Object pointer [lock] */
                fsnotify_connp_t *obj;
                /* Links connectors queued for freeing after the srcu period expires */
                struct fsnotify_mark_connector *destroy_next;
        };
        struct hlist_head list;        /* marks attached to the object */
};

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
        refcount_t refcnt;
        /* Group this mark is for. Set on mark creation, stable until last ref
         * is dropped */
        struct fsnotify_group *group;
        /* List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Events types and flags to ignore [mark->lock, group->mark_mutex] */
        __u32 ignore_mask;
        /* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE                0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x0002
        /* inotify mark flags */
#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK                0x0010
#define FSNOTIFY_MARK_FLAG_IN_ONESHOT                0x0020
        /* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF                0x0200
#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS        0x0400
        /* FSNOTIFY_MARK_FLAG_* bits defined above */
        unsigned int flags;                /* flags [mark->lock] */
};

#ifdef CONFIG_FSNOTIFY

/* called from the vfs helpers */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);

/*
 * Reduce @mask to the events that can be reported with parent/name info.
 *
 * FS_EVENT_ON_CHILD is set on marks that want parent/name info; without it
 * the answer is 0.  With it, the object might be watched by a mark that
 * cares about parent/name info, so return the subset of @mask that can
 * actually be reported that way (FS_EVENTS_POSS_TO_PARENT).
 */
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        if (mask & FS_EVENT_ON_CHILD)
                return mask & FS_EVENTS_POSS_TO_PARENT;

        return 0;
}

/*
 * Tell whether @inode is interested in events on its children.
 *
 * Reads inode->i_fsnotify_mask once.  Returns 0 unless FS_EVENT_ON_CHILD is
 * set; otherwise returns the bits of the mask that can actually happen on a
 * child (FS_EVENTS_POSS_ON_CHILD), which may still be 0.
 */
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        const __u32 watched = READ_ONCE(inode->i_fsnotify_mask);

        /* the inode may care about child events: narrow to possible ones */
        if (watched & FS_EVENT_ON_CHILD)
                return watched & FS_EVENTS_POSS_ON_CHILD;

        return 0;
}

/*
 * Refresh DCACHE_FSNOTIFY_PARENT_WATCHED on @dentry so that it reflects
 * whether the parent wants filesystem events that happen to
 * dentry->d_inode.  The caller must hold dentry->d_lock.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        assert_spin_locked(&dentry->d_lock);

        /*
         * d_lock serialises the setting of PARENT_WATCHED on dentries.  If
         * inotify_inode_watched changes after we have taken d_lock, the
         * following fsnotify_set_children_dentry_flags call will find our
         * entry, spin until we are done here, and then update us with the
         * new state.
         */
        if (!fsnotify_inode_watches_children(dentry->d_parent->d_inode)) {
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(
                                const struct fsnotify_ops *ops,
                                int flags);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_insert_event(struct fsnotify_group *group,
                                 struct fsnotify_event *event,
                                 int (*merge)(struct fsnotify_group *,
                                              struct fsnotify_event *),
                                 void (*insert)(struct fsnotify_group *,
                                                struct fsnotify_event *));

/*
 * Queue @event on @group via fsnotify_insert_event() with the caller's
 * @merge callback and no insert callback; returns that call's result.
 */
static inline int fsnotify_add_event(struct fsnotify_group *group,
                                     struct fsnotify_event *event,
                                     int (*merge)(struct fsnotify_group *,
                                                  struct fsnotify_event *))
{
        int ret = fsnotify_insert_event(group, event, merge, NULL);

        return ret;
}

/*
 * Queue @group's own overflow_event on its notification queue.  No merge
 * callback is used and the result of the add is not checked.
 */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        struct fsnotify_event *overflow = group->overflow_event;

        fsnotify_add_event(group, overflow, NULL);
}

/* Report whether the FS_Q_OVERFLOW bit is set in @mask */
static inline bool fsnotify_is_overflow_event(u32 mask)
{
        return (mask & FS_Q_OVERFLOW) != 0;
}

/*
 * Return true when @group's notification_list is empty.
 *
 * The caller must hold group->notification_lock; this is asserted.
 *
 * Note: this helper is defined inline here, so no separate extern
 * declaration is needed (a later "extern" re-declaration would merely
 * inherit the internal linkage and mislead readers into looking for an
 * out-of-line definition).
 */
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
        assert_spin_locked(&group->notification_lock);

        return list_empty(&group->notification_list);
}

/* return the first event on the notification queue without dequeuing it */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* remove @event, which is queued in @group's notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/*
 * Canonical "ignore mask" of a mark, event flags included.
 *
 * This differs subtly from the legacy ->ignored_mask: that one only said
 * which events to ignore, whereas ->ignore_mask additionally carries flags
 * describing the kind of objects on which events are ignored.
 *
 * When the mark has FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS set, the stored
 * ignore mask is returned untouched.  Otherwise legacy behavior is applied:
 * - FS_ISDIR is always set (events on directories are always ignored)
 * - FS_EVENT_ON_CHILD is copied from mark->mask (events on a child are
 *   ignored if the parent is watching children)
 */
static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
{
        const __u32 stored = mark->ignore_mask;
        __u32 legacy;

        /* The event flags in ignore mask take effect */
        if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
                return stored;

        legacy = (stored | FS_ISDIR) & ~FS_EVENT_ON_CHILD;

        return legacy | (mark->mask & FS_EVENT_ON_CHILD);
}

/*
 * Legacy ignored_mask view: the mark's ignore mask restricted to the bits
 * in ALL_FSNOTIFY_EVENTS, i.e. only the event types to ignore.
 */
static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
{
        __u32 events = mark->ignore_mask;

        events &= ALL_FSNOTIFY_EVENTS;

        return events;
}

/*
 * Decide whether @mask (a mask or an ignore mask) applies to an event.
 *
 * The mask does not apply when the victim is a directory and FS_ISDIR is
 * missing from @mask, nor when @iter_type is FSNOTIFY_ITER_TYPE_PARENT and
 * FS_EVENT_ON_CHILD is missing from @mask.  In every other case it applies.
 */
static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
                                            int iter_type)
{
        /* Should mask be applied to a directory? */
        bool dir_ok = !is_dir || (mask & FS_ISDIR);
        /* Should mask be applied to a child? */
        bool child_ok = iter_type != FSNOTIFY_ITER_TYPE_PARENT ||
                        (mask & FS_EVENT_ON_CHILD);

        return dir_ok && child_ok;
}

/*
 * Ignore mask that is actually in effect for one event, given whether the
 * victim is a directory (@is_dir) and how the mark was reached (@iter_type).
 *
 * Returns only event bits (ALL_FSNOTIFY_EVENTS), or 0 when the mark ignores
 * nothing or its ignore mask is not applicable to this event.
 */
static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
                                                   bool is_dir, int iter_type)
{
        __u32 events = fsnotify_ignored_events(mark);
        __u32 with_flags;

        /*
         * Nothing ignored at all, or a non-dir, non-child victim for which
         * the event flags need not be consulted: the plain events are final.
         */
        if (!events || (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT))
                return events;

        with_flags = fsnotify_ignore_mask(mark);

        return fsnotify_mask_applicable(with_flags, is_dir, iter_type) ?
                with_flags & ALL_FSNOTIFY_EVENTS : 0;
}

/*
 * Mask used when calculating an object's interest, ignore mask included.
 *
 * A mark that ignores no events contributes just mark->mask.  Otherwise the
 * whole ignore_mask is OR-ed in, plus FS_MODIFY unless the mark carries
 * FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY.
 */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
        __u32 interest = mark->mask;

        if (fsnotify_ignored_events(mark)) {
                /* Interest in FS_MODIFY may be needed for clearing ignore mask */
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                        interest |= FS_MODIFY;

                /*
                 * A mark that ignores events on children needs the object to
                 * show interest in them, or fsnotify_parent() won't notice.
                 */
                interest |= mark->ignore_mask;
        }

        return interest;
}

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
/* Initialize @mark for use with @group (details live in the implementation) */
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
                                                struct fsnotify_group *group);
/* Get cached fsid of filesystem containing object */
extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
                                  __kernel_fsid_t *fsid);
/*
 * attach the mark to the object
 *
 * @obj_type is one of the FSNOTIFY_OBJ_TYPE_* values; the inode wrappers
 * below pass a NULL @fsid.
 */
extern int fsnotify_add_mark(struct fsnotify_mark *mark,
                             fsnotify_connp_t *connp, unsigned int obj_type,
                             int add_flags, __kernel_fsid_t *fsid);
/* NOTE(review): presumably the variant for callers already holding the relevant lock - confirm which one */
extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                                    fsnotify_connp_t *connp,
                                    unsigned int obj_type, int add_flags,
                                    __kernel_fsid_t *fsid);

/* attach the mark to the inode */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int add_flags)
{
        return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
                                 FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL);
}
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int add_flags)
{
        return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
                                        FSNOTIFY_OBJ_TYPE_INODE, add_flags,
                                        NULL);
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/*
 * Clear all of the marks of a group attached to a given object type
 * (@obj_type is one of the FSNOTIFY_OBJ_TYPE_* values)
 */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                          unsigned int obj_type);
/* clear every mark of @group that is attached to a vfsmount */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* clear every mark of @group that is attached to an inode */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* clear every mark of @group that is attached to a super block (sb) */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_SB;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* NOTE(review): get/put presumably take and drop a reference on @mark - confirm in the implementation */
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
/*
 * Paired helpers operating on @iter_info.  NOTE(review): the bool result of
 * fsnotify_prepare_user_wait() presumably says whether the wait may proceed
 * (and thus whether fsnotify_finish_user_wait() must be called) - confirm.
 */
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/* Initialize @event by setting up its list head; no other field is touched */
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
        struct list_head *link = &event->list;

        INIT_LIST_HEAD(link);
}

#else

/*
 * Stub for the #else branch (closed by the CONFIG_FSNOTIFY #endif below):
 * ignores all arguments and always returns 0.
 */
static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        const int ret = 0;

        return ret;
}

/* Stub for the #else branch: ignores all arguments and always returns 0 */
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        const int ret = 0;

        return ret;
}

/* Stub for the #else branch: does nothing */
static inline void __fsnotify_inode_delete(struct inode *inode)
{
}

/* Stub for the #else branch: does nothing */
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
}

/* Stub for the #else branch: does nothing */
static inline void fsnotify_sb_delete(struct super_block *sb)
{
}

/* Stub for the #else branch: leaves @dentry untouched */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
}

/* Stub for the #else branch: the cookie is always 0 */
static inline u32 fsnotify_get_cookie(void)
{
        const u32 cookie = 0;

        return cookie;
}

/* Stub for the #else branch: does nothing */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{
}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */


























































































































































    4 





















    2 

    2 










    1 























    4 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/lockref.h>

#if USE_CMPXCHG_LOCKREF

/*
 * CMPXCHG_LOOP - lockless update of a lockref's combined lock+count word.
 *
 * Must be expanded in a function that has a variable named "lockref"
 * (a struct lockref pointer) in scope.
 *
 * The macro snapshots lockref->lock_count into "old" and, for as long as
 * the spinlock inside that snapshot reads as unlocked, does the following:
 *   - copies "old" into the locals "new" and "prev";
 *   - runs CODE, which is expected to adjust "new" and may leave early
 *     with "return" (leaves the calling function) or "break" (leaves the
 *     loop so that the caller's fallback code runs);
 *   - tries to install new.lock_count with cmpxchg64_relaxed();
 *   - runs SUCCESS if the value found in memory matched "prev".
 *
 * Note that the "cmpxchg()" reloads the "old" value for the
 * failure case.
 *
 * The loop gives up after 100 attempts.  BUILD_BUG_ON() enforces that a
 * struct lockref is exactly 8 bytes, as the 64-bit cmpxchg requires.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                        \
        BUILD_BUG_ON(sizeof(old) != 8);                                                \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {          \
                struct lockref new = old, prev = old;                                \
                CODE                                                                \
                old.lock_count = cmpxchg64_relaxed(&lockref->lock_count,        \
                                                   old.lock_count,                \
                                                   new.lock_count);                \
                if (likely(old.lock_count == prev.lock_count)) {                \
                        SUCCESS;                                                \
                }                                                                \
                if (!--retry)                                                        \
                        break;                                                        \
        }                                                                        \
} while (0)

#else

/*
 * Without USE_CMPXCHG_LOCKREF the macro expands to an empty statement:
 * CODE and SUCCESS are discarded and every caller falls straight through
 * to whatever follows the macro invocation.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif

/**
 * lockref_get - Increments reference count unconditionally
 * @lockref: pointer to lockref structure
 *
 * Only valid while the caller already holds a reference to the object,
 * which guarantees the count cannot be zero.
 *
 * The increment is first attempted locklessly through CMPXCHG_LOOP; if that
 * does not succeed, it is done under the spinlock.
 */
void lockref_get(struct lockref *lockref)
{
        /* Fast path: lockless increment */
        CMPXCHG_LOOP(
                new.count += 1;
        ,
                return;
        );

        /* Slow path: increment under the lock */
        spin_lock(&lockref->lock);
        lockref->count += 1;
        spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);

/**
 * lockref_get_not_zero - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 */
int lockref_get_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_zero);

/**
 * lockref_put_not_zero - Decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count would become zero
 */
int lockref_put_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 1) {
                lockref->count--;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_put_not_zero);

/**
 * lockref_get_or_lock - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 * and we got the lock instead.
 */
int lockref_get_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 0)
                return 0;
        lockref->count++;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_get_or_lock);

/**
 * lockref_put_return - Decrement reference count if possible
 * @lockref: pointer to lockref structure
 *
 * Decrement the reference count and return the new value.
 * If the lockref was dead or locked, return an error.
 */
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}
EXPORT_SYMBOL(lockref_put_return);

/**
 * lockref_put_or_lock - decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
 */
int lockref_put_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 1)
                return 0;
        lockref->count--;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_put_or_lock);

/**
 * lockref_mark_dead - mark lockref dead
 * @lockref: pointer to lockref structure
 *
 * The caller must hold the lockref's spinlock (asserted).  The count is set
 * to a negative value.
 */
void lockref_mark_dead(struct lockref *lockref)
{
        const int dead_count = -128;

        assert_spin_locked(&lockref->lock);
        lockref->count = dead_count;
}
EXPORT_SYMBOL(lockref_mark_dead);

/**
 * lockref_get_not_dead - Increments count unless the ref is dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if lockref was dead
 */
int lockref_get_not_dead(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count < 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count >= 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_dead);















































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Forwarding Information Base.
 *
 * Authors:        A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#ifndef _NET_IP_FIB_H
#define _NET_IP_FIB_H

#include <net/flow.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <net/fib_notifier.h>
#include <net/fib_rules.h>
#include <net/inetpeer.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/refcount.h>

/*
 * Parameters of one route configuration request (fc_* fields).  Passed by
 * pointer to fib_table_insert() and fib_table_delete().
 *
 * The gateway lives in an anonymous union (fc_gw4 / fc_gw6).
 * NOTE(review): presumably fc_gw_family tells which member is valid, and
 * fc_mx_len / fc_mp_len are the lengths of fc_mx / fc_mp - confirm.
 */
struct fib_config {
        u8                        fc_dst_len;
        u8                        fc_tos;
        u8                        fc_protocol;
        u8                        fc_scope;
        u8                        fc_type;
        u8                        fc_gw_family;
        /* 2 bytes unused */
        u32                        fc_table;
        __be32                        fc_dst;
        union {
                __be32                fc_gw4;
                struct in6_addr        fc_gw6;
        };
        int                        fc_oif;
        u32                        fc_flags;
        u32                        fc_priority;
        __be32                        fc_prefsrc;
        u32                        fc_nh_id;
        struct nlattr                *fc_mx;
        struct rtnexthop        *fc_mp;
        int                        fc_mx_len;
        int                        fc_mp_len;
        u32                        fc_flow;
        u32                        fc_nlflags;
        struct nl_info                fc_nlinfo;
        struct nlattr                *fc_encap;
        u16                        fc_encap_type;
};

struct fib_info;
struct rtable;

/*
 * One next-hop exception entry.  Entries form a singly linked,
 * RCU-annotated chain through fnhe_next (the chain head is in
 * struct fnhe_hash_bucket) and hold two RCU-annotated rtable pointers,
 * one for input and one for output.
 * NOTE(review): the embedded rcu_head is presumably used for deferred
 * freeing, and fnhe_expires / fnhe_stamp presumably hold jiffies - confirm.
 */
struct fib_nh_exception {
        struct fib_nh_exception __rcu        *fnhe_next;
        int                                fnhe_genid;
        __be32                                fnhe_daddr;
        u32                                fnhe_pmtu;
        bool                                fnhe_mtu_locked;
        __be32                                fnhe_gw;
        unsigned long                        fnhe_expires;
        struct rtable __rcu                *fnhe_rth_input;
        struct rtable __rcu                *fnhe_rth_output;
        unsigned long                        fnhe_stamp;
        struct rcu_head                        rcu;
};

/* Head of one RCU-annotated chain of struct fib_nh_exception entries */
struct fnhe_hash_bucket {
        struct fib_nh_exception __rcu        *chain;
};

/* FNHE_HASH_SIZE evaluates to 1 << 11 = 2048 */
#define FNHE_HASH_SHIFT                11
#define FNHE_HASH_SIZE                (1 << FNHE_HASH_SHIFT)
/* NOTE(review): presumably a chain-depth threshold for reclaiming entries - confirm against the users */
#define FNHE_RECLAIM_DEPTH        5

/*
 * Next-hop fields kept in a common structure; struct fib_nh embeds it as
 * its first member (nh_common), and the comment on the trailing members
 * indicates it is also meant to be usable from fib6_nh.
 *
 * The gateway address is the nhc_gw union (ipv4 / ipv6).
 * NOTE(review): presumably nhc_gw_family selects the valid union member -
 * confirm.
 */
struct fib_nh_common {
        struct net_device        *nhc_dev;
        int                        nhc_oif;
        unsigned char                nhc_scope;
        u8                        nhc_family;
        u8                        nhc_gw_family;
        unsigned char                nhc_flags;
        struct lwtunnel_state        *nhc_lwtstate;

        union {
                __be32          ipv4;
                struct in6_addr ipv6;
        } nhc_gw;

        int                        nhc_weight;
        atomic_t                nhc_upper_bound;

        /* v4 specific, but allows fib6_nh with v4 routes */
        struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
        struct rtable __rcu     *nhc_rth_input;
        struct fnhe_hash_bucket        __rcu *nhc_exceptions;
};

/*
 * IPv4 next hop: struct fib_nh_common (first member) plus a hash node, a
 * back pointer to the owning struct fib_info, a source address with its
 * generation id and, with CONFIG_IP_ROUTE_CLASSID, a class id.
 *
 * The fib_nh_* macros below are shorthand for members of nh_common, so
 * that e.g. nh->fib_nh_dev expands to nh->nh_common.nhc_dev.
 */
struct fib_nh {
        struct fib_nh_common        nh_common;
        struct hlist_node        nh_hash;
        struct fib_info                *nh_parent;
#ifdef CONFIG_IP_ROUTE_CLASSID
        __u32                        nh_tclassid;
#endif
        __be32                        nh_saddr;
        int                        nh_saddr_genid;
#define fib_nh_family                nh_common.nhc_family
#define fib_nh_dev                nh_common.nhc_dev
#define fib_nh_oif                nh_common.nhc_oif
#define fib_nh_flags                nh_common.nhc_flags
#define fib_nh_lws                nh_common.nhc_lwtstate
#define fib_nh_scope                nh_common.nhc_scope
#define fib_nh_gw_family        nh_common.nhc_gw_family
#define fib_nh_gw4                nh_common.nhc_gw.ipv4
#define fib_nh_gw6                nh_common.nhc_gw.ipv6
#define fib_nh_weight                nh_common.nhc_weight
#define fib_nh_upper_bound        nh_common.nhc_upper_bound
};

/*
 * This structure contains data shared by many routes.
 */

struct nexthop;

/*
 * Route information object.  It ends in a flexible array of struct fib_nh
 * (fib_nh[]) and carries two counters: a plain int fib_treeref and a
 * refcount_t fib_clntref.
 *
 * The fib_mtu / fib_window / fib_rtt / fib_advmss macros index
 * fib_metrics->metrics[] with the matching RTAX_* constant minus one.
 * NOTE(review): presumably fib_nhs is the number of elements in fib_nh[]
 * and the rcu_head is used for deferred freeing - confirm.
 */
struct fib_info {
        struct hlist_node        fib_hash;
        struct hlist_node        fib_lhash;
        struct list_head        nh_list;
        struct net                *fib_net;
        int                        fib_treeref;
        refcount_t                fib_clntref;
        unsigned int                fib_flags;
        unsigned char                fib_dead;
        unsigned char                fib_protocol;
        unsigned char                fib_scope;
        unsigned char                fib_type;
        __be32                        fib_prefsrc;
        u32                        fib_tb_id;
        u32                        fib_priority;
        struct dst_metrics        *fib_metrics;
#define fib_mtu fib_metrics->metrics[RTAX_MTU-1]
#define fib_window fib_metrics->metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
        int                        fib_nhs;
        bool                        fib_nh_is_v6;
        bool                        nh_updated;
        bool                        pfsrc_removed;
        struct nexthop                *nh;
        struct rcu_head                rcu;
        struct fib_nh                fib_nh[];
};


#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule;
#endif

struct fib_table;
/*
 * Result of a FIB lookup: this is the "res" output argument of
 * fib_table_lookup() and fib_lookup().  The FIB_RES_* macros read the
 * next-hop data through the nhc member.
 */
struct fib_result {
        __be32                        prefix;
        unsigned char                prefixlen;
        unsigned char                nh_sel;
        unsigned char                type;
        unsigned char                scope;
        u32                        tclassid;
        struct fib_nh_common        *nhc;
        struct fib_info                *fi;
        struct fib_table        *table;
        struct hlist_head        *fa_head;
};

/*
 * Lookup request/reply record: the fl_* and tb_id_in members describe what
 * to look up, the members from tb_id onwards hold the results.
 * NOTE(review): the "_nl" suffix suggests a netlink message layout -
 * confirm before changing any member type or order.
 */
struct fib_result_nl {
        __be32                fl_addr;   /* To be looked up*/
        u32                fl_mark;
        unsigned char        fl_tos;
        unsigned char   fl_scope;
        unsigned char   tb_id_in;

        unsigned char   tb_id;      /* Results */
        unsigned char        prefixlen;
        unsigned char        nh_sel;
        unsigned char        type;
        unsigned char        scope;
        int             err;
};

/*
 * FIB_TABLE_HASHSZ is 256 with CONFIG_IP_MULTIPLE_TABLES and 2 without.
 * Both values are powers of two; TABLE_LOCAL_INDEX / TABLE_MAIN_INDEX rely
 * on that when masking with (FIB_TABLE_HASHSZ - 1).
 */
#ifdef CONFIG_IP_MULTIPLE_TABLES
#define FIB_TABLE_HASHSZ 256
#else
#define FIB_TABLE_HASHSZ 2
#endif

/* NOTE(review): both presumably return an IPv4 source address (network byte order per the __be32 type) - confirm */
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
                                 unsigned char scope);
__be32 fib_result_prefsrc(struct net *net, struct fib_result *res);

/*
 * Accessors for the next hop selected in a struct fib_result.  They take
 * the structure itself, not a pointer to it (note the "(res).nhc").
 */
#define FIB_RES_NHC(res)                ((res).nhc)
#define FIB_RES_DEV(res)        (FIB_RES_NHC(res)->nhc_dev)
#define FIB_RES_OIF(res)        (FIB_RES_NHC(res)->nhc_oif)

/*
 * Description of one route: its fib_info, table id, destination and prefix
 * length, tos and type, plus single-bit "offload" and "trap" flags packed
 * into one u8 together with six unused bits.
 */
struct fib_rt_info {
        struct fib_info                *fi;
        u32                        tb_id;
        __be32                        dst;
        int                        dst_len;
        u8                        tos;
        u8                        type;
        u8                        offload:1,
                                trap:1,
                                unused:6;
};

/*
 * Notifier payload describing a FIB entry.  The generic
 * struct fib_notifier_info has to stay the first member.
 * NOTE(review): presumably so the generic pointer handed to notifier
 * callbacks can be converted back to this type - confirm.
 */
struct fib_entry_notifier_info {
        struct fib_notifier_info info; /* must be first */
        u32 dst;
        int dst_len;
        struct fib_info *fi;
        u8 tos;
        u8 type;
        u32 tb_id;
};

/*
 * Notifier payload referring to a single next hop.  As with
 * struct fib_entry_notifier_info, the generic info has to stay first.
 */
struct fib_nh_notifier_info {
        struct fib_notifier_info info; /* must be first */
        struct fib_nh *fib_nh;
};

/* Deliver @event_type with @info to the single notifier block @nb */
int call_fib4_notifier(struct notifier_block *nb,
                       enum fib_event_type event_type,
                       struct fib_notifier_info *info);
/* NOTE(review): presumably delivers @event_type to every notifier registered for @net - confirm */
int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
                        struct fib_notifier_info *info);

/* Per-netns setup and teardown of the IPv4 FIB notifier (__net_init / __net_exit) */
int __net_init fib4_notifier_init(struct net *net);
void __net_exit fib4_notifier_exit(struct net *net);

void fib_info_notify_update(struct net *net, struct nl_info *info);
int fib_notify(struct net *net, struct notifier_block *nb,
               struct netlink_ext_ack *extack);

/*
 * One FIB table.  tb_hlist is the first member, and the single-table
 * fib_get_table() in this header converts an hlist node back into a
 * struct fib_table through it.  __data[] is a flexible array member.
 * NOTE(review): presumably tb_data points either at __data or at data
 * shared with another table, and the rcu_head is used for deferred
 * freeing - confirm.
 */
struct fib_table {
        struct hlist_node        tb_hlist;
        u32                        tb_id;
        int                        tb_num_default;
        struct rcu_head                rcu;
        unsigned long                 *tb_data;
        unsigned long                __data[];
};

/*
 * Filter criteria for a table dump; passed to fib_table_dump() as its last
 * argument.
 */
struct fib_dump_filter {
        u32                        table_id;
        /* filter_set is an optimization that an entry is set */
        bool                        filter_set;
        bool                        dump_routes;
        bool                        dump_exceptions;
        unsigned char                protocol;
        unsigned char                rt_type;
        unsigned int                flags;
        struct net_device        *dev;
};

/*
 * Look up @flp in table @tb and fill in @res.  The fib_lookup() helpers in
 * this header treat 0 as success and map -EAGAIN to -ENETUNREACH.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
                     struct fib_result *res, int fib_flags);
/* Insert / delete the route described by the struct fib_config argument */
int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
                     struct netlink_ext_ack *extack);
int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
                     struct netlink_ext_ack *extack);
/* Dump @table into @skb, restricted by @filter */
int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
                   struct netlink_callback *cb, struct fib_dump_filter *filter);
int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all);
struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
void fib_table_flush_external(struct fib_table *table);
void fib_free_table(struct fib_table *tb);

#ifndef CONFIG_IP_MULTIPLE_TABLES

/*
 * Slots of the local and main tables in net->ipv4.fib_table_hash[] when
 * CONFIG_IP_MULTIPLE_TABLES is off: the table id masked with
 * (FIB_TABLE_HASHSZ - 1).
 */
#define TABLE_LOCAL_INDEX        (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1))
#define TABLE_MAIN_INDEX        (RT_TABLE_MAIN  & (FIB_TABLE_HASHSZ - 1))

/*
 * Single-table build (!CONFIG_IP_MULTIPLE_TABLES): only the local and main
 * tables exist.  RT_TABLE_LOCAL selects the local slot; every other @id
 * selects the main slot.
 *
 * The result is the first node of the chosen hash chain converted with
 * hlist_entry().  tb_hlist is the first member of struct fib_table, so the
 * conversion does not change the pointer value; callers such as
 * fib_lookup() test the result for NULL.
 */
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
        unsigned int slot = TABLE_MAIN_INDEX;
        struct hlist_head *bucket;
        struct hlist_node *first;

        if (id == RT_TABLE_LOCAL)
                slot = TABLE_LOCAL_INDEX;

        bucket = &net->ipv4.fib_table_hash[slot];
        first = rcu_dereference_rtnl(hlist_first_rcu(bucket));

        return hlist_entry(first, struct fib_table, tb_hlist);
}

/*
 * Single-table build: nothing is created here, this simply returns what
 * fib_get_table() finds for @id.
 */
static inline struct fib_table *fib_new_table(struct net *net, u32 id)
{
        struct fib_table *tb = fib_get_table(net, id);

        return tb;
}

/*
 * Single-table build: look @flp up in the main table only, under
 * rcu_read_lock() and with FIB_LOOKUP_NOREF added to @flags.
 *
 * Returns the result of fib_table_lookup(), except that -EAGAIN is reported
 * as -ENETUNREACH.  -ENETUNREACH is also returned when there is no main
 * table.
 */
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
                             struct fib_result *res, unsigned int flags)
{
        struct fib_table *main_tb;
        int err;

        rcu_read_lock();

        main_tb = fib_get_table(net, RT_TABLE_MAIN);
        if (main_tb)
                err = fib_table_lookup(main_tb, flp, res,
                                       flags | FIB_LOOKUP_NOREF);
        else
                err = -ENETUNREACH;

        if (err == -EAGAIN)
                err = -ENETUNREACH;

        rcu_read_unlock();

        return err;
}

/* Single-table build: always reports that there are no custom rules */
static inline bool fib4_has_custom_rules(const struct net *net)
{
        const bool has_custom = false;

        return has_custom;
}

/* Single-table build: every rule is reported as a default rule */
static inline bool fib4_rule_default(const struct fib_rule *rule)
{
        const bool is_default = true;

        return is_default;
}

/* Single-table build: nothing is dumped, always returns 0 */
static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb,
                                  struct netlink_ext_ack *extack)
{
        const int ret = 0;

        return ret;
}

/* Single-table build: the rules sequence number is always 0 */
static inline unsigned int fib4_rules_seq_read(struct net *net)
{
        const unsigned int seq = 0;

        return seq;
}

/*
 * Single-table build: no early flow dissection is done; @fl4 and @flkeys
 * are left untouched and false is returned.
 */
static inline bool fib4_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi4 *fl4,
                                                 struct flow_keys *flkeys)
{
        const bool dissected = false;

        return dissected;
}
#else /* CONFIG_IP_MULTIPLE_TABLES */
/* Per-netns setup and teardown of IPv4 FIB rules (__net_init / __net_exit) */
int __net_init fib4_rules_init(struct net *net);
void __net_exit fib4_rules_exit(struct net *net);

/* NOTE(review): presumably fib_new_table() creates the table if missing while fib_get_table() only looks it up - confirm */
struct fib_table *fib_new_table(struct net *net, u32 id);
struct fib_table *fib_get_table(struct net *net, u32 id);

/* Out-of-line lookup; the inline fib_lookup() calls it when net->ipv4.fib_has_custom_rules is set */
int __fib_lookup(struct net *net, struct flowi4 *flp,
                 struct fib_result *res, unsigned int flags);

/*
 * Look up @flp, always with FIB_LOOKUP_NOREF added to @flags.
 *
 * If the namespace has custom rules the whole lookup is delegated to
 * __fib_lookup().  Otherwise res->tclassid is cleared and, under
 * rcu_read_lock(), net->ipv4.fib_main is consulted first;
 * net->ipv4.fib_default is tried only when the main lookup did not
 * return 0 (or there was no main table).
 *
 * Returns 0 on success; -EAGAIN from the table lookup is reported as
 * -ENETUNREACH, which is also the result when neither table exists.
 */
static inline int fib_lookup(struct net *net, struct flowi4 *flp,
                             struct fib_result *res, unsigned int flags)
{
        struct fib_table *table;
        int rc = -ENETUNREACH;

        flags |= FIB_LOOKUP_NOREF;
        if (net->ipv4.fib_has_custom_rules)
                return __fib_lookup(net, flp, res, flags);

        rcu_read_lock();

        res->tclassid = 0;

        table = rcu_dereference_rtnl(net->ipv4.fib_main);
        if (table)
                rc = fib_table_lookup(table, flp, res, flags);

        if (rc) {
                /* Main table missed (or is absent): fall back to default. */
                table = rcu_dereference_rtnl(net->ipv4.fib_default);
                if (table)
                        rc = fib_table_lookup(table, flp, res, flags);
        }

        if (rc == -EAGAIN)
                rc = -ENETUNREACH;

        rcu_read_unlock();

        return rc;
}

/* Report the namespace's ipv4.fib_has_custom_rules flag. */
static inline bool fib4_has_custom_rules(const struct net *net)
{
        return net->ipv4.fib_has_custom_rules;
}

bool fib4_rule_default(const struct fib_rule *rule);
int fib4_rules_dump(struct net *net, struct notifier_block *nb,
                    struct netlink_ext_ack *extack);
unsigned int fib4_rules_seq_read(struct net *net);

/*
 * If the namespace's ipv4.fib_rules_require_fldissect is set, dissect @skb
 * into @flkeys with FLOW_DISSECTOR_F_STOP_AT_ENCAP and copy the source
 * port, destination port and IP protocol from @flkeys into @fl4.
 *
 * Returns true when the dissection was performed, false when it was
 * skipped (in which case @fl4 and @flkeys are left untouched).
 */
static inline bool fib4_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi4 *fl4,
                                                 struct flow_keys *flkeys)
{
        if (!net->ipv4.fib_rules_require_fldissect)
                return false;

        skb_flow_dissect_flow_keys(skb, flkeys,
                                   FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        fl4->fl4_sport = flkeys->ports.src;
        fl4->fl4_dport = flkeys->ports.dst;
        fl4->flowi4_proto = flkeys->basic.ip_proto;

        return true;
}

#endif /* CONFIG_IP_MULTIPLE_TABLES */

/* Exported by fib_frontend.c */
extern const struct nla_policy rtm_ipv4_policy[];
void ip_fib_init(void);
int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
                    struct netlink_ext_ack *extack);
__be32 fib_compute_spec_dst(struct sk_buff *skb);
bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev);
int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                        u8 tos, int oif, struct net_device *dev,
                        struct in_device *idev, u32 *itag);
#ifdef CONFIG_IP_ROUTE_CLASSID
/* CONFIG_IP_ROUTE_CLASSID variant: atomic read of the per-namespace counter. */
static inline int fib_num_tclassid_users(struct net *net)
{
        return atomic_read(&net->ipv4.fib_num_tclassid_users);
}
#else
/* Variant used when CONFIG_IP_ROUTE_CLASSID is not set: always 0. */
static inline int fib_num_tclassid_users(struct net *net)
{
        return 0;
}
#endif
int fib_unmerge(struct net *net);

/*
 * True when @nhc's device is @dev itself, or when
 * l3mdev_master_ifindex_rcu() of @nhc's device equals @dev's ifindex.
 * The second test is only evaluated when the first one fails.
 */
static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc,
                                          const struct net_device *dev)
{
        return nhc->nhc_dev == dev ||
               l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex;
}

/* Exported by fib_semantics.c */
int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
int fib_sync_down_addr(struct net_device *dev, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
                       const struct sk_buff *skb, struct flow_keys *flkeys);
#endif
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
                 struct netlink_ext_ack *extack);
void fib_select_multipath(struct fib_result *res, int hash);
void fib_select_path(struct net *net, struct fib_result *res,
                     struct flowi4 *fl4, const struct sk_buff *skb);

int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
                struct fib_config *cfg, int nh_weight,
                struct netlink_ext_ack *extack);
void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
                       struct nlattr *fc_encap, u16 fc_encap_type,
                       void *cfg, gfp_t gfp_flags,
                       struct netlink_ext_ack *extack);
void fib_nh_common_release(struct fib_nh_common *nhc);

/* Exported by fib_trie.c */
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri);
void fib_trie_init(void);
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
                         const struct flowi4 *flp);

/*
 * Build *@itag from the lookup result @res.
 *
 * Without CONFIG_IP_ROUTE_CLASSID the body is empty and *@itag is left
 * untouched.  Otherwise *@itag starts as nh_tclassid << 16 for an AF_INET
 * nexthop and 0 for any other family.  With CONFIG_IP_MULTIPLE_TABLES,
 * res->tclassid << 16 is used when *@itag is still 0, and
 * res->tclassid >> 16 is then OR-ed into *@itag.
 */
static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        struct fib_nh_common *nhc = res->nhc;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        u32 rule_tag;
#endif

        if (nhc->nhc_family != AF_INET)
                *itag = 0;
        else
                *itag = container_of(nhc, struct fib_nh,
                                     nh_common)->nh_tclassid << 16;

#ifdef CONFIG_IP_MULTIPLE_TABLES
        rule_tag = res->tclassid;
        if (!*itag)
                *itag = rule_tag << 16;
        *itag |= rule_tag >> 16;
#endif
#endif
}

void fib_flush(struct net *net);
void free_fib_info(struct fib_info *fi);

/* Take one more reference on @fi by incrementing fi->fib_clntref. */
static inline void fib_info_hold(struct fib_info *fi)
{
        refcount_inc(&fi->fib_clntref);
}

/*
 * Drop one reference on fi->fib_clntref; when refcount_dec_and_test()
 * reports true, @fi is handed to free_fib_info().
 */
static inline void fib_info_put(struct fib_info *fi)
{
        if (!refcount_dec_and_test(&fi->fib_clntref))
                return;

        free_fib_info(fi);
}

#ifdef CONFIG_PROC_FS
int __net_init fib_proc_init(struct net *net);
void __net_exit fib_proc_exit(struct net *net);
#else
/* Stub used when CONFIG_PROC_FS is not set: does nothing and returns 0. */
static inline int fib_proc_init(struct net *net)
{
        return 0;
}
/* Stub used when CONFIG_PROC_FS is not set: does nothing. */
static inline void fib_proc_exit(struct net *net)
{
}
#endif

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);

int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
                          struct fib_dump_filter *filter,
                          struct netlink_callback *cb);

int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
                     u8 rt_family, unsigned char *flags, bool skip_oif);
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
                    int nh_weight, u8 rt_family, u32 nh_tclassid);
#endif  /* _NET_FIB_H */


















































































































































































































































































































































































































































































































































































































    1 



























































































    1 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * Why is there no list_empty_rcu()?  Because list_empty() serves this
 * purpose.  The list_empty() function fetches the RCU-protected pointer
 * and compares it to the address of the list head, but neither dereferences
 * this pointer itself nor provides this pointer to the caller.  Therefore,
 * it is not necessary to use rcu_dereference(), so that list_empty() can
 * be used anywhere you would want to use a list_empty_rcu().
 */

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        /*
         * Point both links back at the head itself (the empty-list state),
         * using WRITE_ONCE() for each store.
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * Return the ->next pointer of a list_head in an RCU-safe way; it must
 * not be accessed directly.  The expansion is an lvalue typed as
 * "struct list_head __rcu *", which is the form handed to
 * rcu_assign_pointer() by the list-mutation helpers in this file.
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * The expansion is an lvalue typed as "struct list_head __rcu *" that
 * aliases @head->prev.
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Accepts exactly one argument and expands to nothing.  __list_check_rcu()
 * passes its leftover variadic arguments here, so that more than one
 * leftover argument is rejected by the preprocessor.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/*
 * Warn through RCU_LOCKDEP_WARN() when neither @cond holds nor
 * rcu_read_lock_any_held() reports a reader section.  @dummy is unused.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

/* Warn through RCU_LOCKDEP_WARN() when @cond does not hold. */
#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
/* Without CONFIG_PROVE_RCU_LIST only the argument-count check remains. */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

/* Without CONFIG_PROVE_RCU_LIST this is an empty statement expression. */
#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        /* Leave the list untouched when __list_add_valid() rejects the add. */
        if (!__list_add_valid(new, prev, next))
                return;

        /* Fill in @new's own links before it becomes reachable. */
        new->next = next;
        new->prev = prev;
        /* Publish @new through @prev's forward pointer. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        /* The backward link of @next is fixed up last, with a plain store. */
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        /* Link @new between the head and its current first entry. */
        __list_add_rcu(new, head, first);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                        struct list_head *head)
{
        struct list_head *last = head->prev;

        /* Link @new between the current last entry and the head. */
        __list_add_rcu(new, last, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        __list_del_entry(entry);
        /*
         * Only the backward pointer is poisoned; this function does not
         * write entry->next after the unlink.
         */
        entry->prev = LIST_POISON2;
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the node returns true after this. It is
 * useful for RCU based read lockfree traversal if the writer side
 * must know if the list entry is still hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list and we can only
 * zero the pprev pointer so hlist_unhashed() will return true after
 * this.
 *
 * A node that is already unhashed is left untouched.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        /* Clear only ->pprev; ->next is not written here. */
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Copy @old's neighbours into @new before @new is reachable. */
        new->next = old->next;
        new->prev = old->prev;
        /* Publish: the predecessor's forward pointer now leads to @new. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        new->next->prev = new;
        /* @old->next is not modified; only its ->prev is poisoned. */
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * "first" and "last" now track the entries of the source list, so
         * the source head itself can be reinitialized.  RCU readers have
         * access to this list, so we must use INIT_LIST_HEAD_RCU()
         * instead of INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        /*
         * NOTE(review): presumably debug annotations asserting that nothing
         * else accesses *first and *last from here on -- confirm against
         * the definition of ASSERT_EXCLUSIVE_ACCESS().
         */
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         */

        last->next = next;
        rcu_assign_pointer(list_next_rcu(prev), first);
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        /* Nothing to splice (and @sync is not called) for an empty @list. */
        if (list_empty(list))
                return;

        /* Insert right after @head, i.e. in front of its first entry. */
        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
 *                             list, designed for queues.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        /* Nothing to splice (and @sync is not called) for an empty @list. */
        if (list_empty(list))
                return;

        /* Insert right before @head, i.e. after its last entry. */
        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is loaded with READ_ONCE() and the loaded value is converted with
 * container_of().
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * Implementing those functions following their counterparts list_empty() and
 * list_first_entry() is not advisable because they lead to subtle race
 * conditions as the following snippet shows:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list may not be empty when list_empty_rcu checks it, but it may be when
 * list_first_entry_rcu rereads the ->next pointer.
 *
 * Rereading the ->next pointer is not a problem for list_empty() and
 * list_first_entry() because they would be protected by a lock that blocks
 * writers.
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * The head's ->next is sampled exactly once; that single value is used
 * both for the emptiness test and for computing the returned entry.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * @ptr->next is sampled exactly once; that single value is compared with
 * @head and, if different, converted to the returned entry.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * When @cond is supplied it is handed to __list_check_rcu(); when it is
 * omitted, the constant 0 takes its place.  The check is evaluated once,
 * in the loop initializer.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike list_for_each_entry_rcu(), @cond is a mandatory argument here;
 * it is passed to __list_check_srcu() once, in the loop initializer.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * @ptr is loaded with READ_ONCE(), cast back to typeof(@ptr), and then
 * converted with container_of().
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * The entry @pos points to on entry is not itself visited: the loop
 * initializer advances @pos to its successor before the first test.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
             &pos->member != (head);        \
             pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * The entry @pos points to on entry is the first one visited; the loop
 * has no initializer clause.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                \
                (pos) = list_entry_rcu((pos)->member.next,                \
                                       typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Only ->pprev is poisoned; ->next is not written here. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Give @new the same neighbours as @old before publishing it. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /* Publish: the slot that pointed at @old now points at @new. */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* If there is a successor, redirect its back-link to @new->next. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        /* @old->next is not modified; only its ->pprev is poisoned. */
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty: a NULL first node is moved to the other head
 * as-is and its (non-existent) back-link is not touched.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        /* Publish the exchanged first pointers. */
        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);

        /* Re-aim each moved node's back-link at its new head, if any. */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * return the first or the next element in an RCU protected hlist
 *
 * Each macro casts the address of the underlying pointer to
 * "struct hlist_node __rcu **" and dereferences it, so the result is an
 * lvalue that can be handed to rcu_dereference*() or rcu_assign_pointer().
 *
 * hlist_first_rcu(head): the head's ->first pointer.
 * hlist_next_rcu(node):  the node's ->next pointer.
 * hlist_pprev_rcu(node): the pointer that the node's ->pprev points at.
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu - add an element at the head of a hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *first = h->first;

        /* Initialise @n completely before it becomes reachable from @h. */
        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        rcu_assign_pointer(hlist_first_rcu(h), n);
        /* The previous first node (if any) now hangs off n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu - add an element at the tail of a hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The last node is found by walking the whole list from @h->first, so
 * the cost grows with the list length.  An empty list is handled by
 * falling back to hlist_add_head_rcu().
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        struct hlist_node *i, *last = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        for (i = h->first; i; i = i->next)
                last = i;

        if (last) {
                /* Initialise @n first, then publish it via last->next. */
                n->next = last->next;
                WRITE_ONCE(n->pprev, &last->next);
                rcu_assign_pointer(hlist_next_rcu(last), n);
        } else {
                hlist_add_head_rcu(n, h);
        }
}

/**
 * hlist_add_before_rcu - add an element before an existing one
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* @n takes over the slot that currently points to @next. */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /* Publish @n through that slot only after @n is initialised. */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu - add an element after an existing one
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Initialise @n completely, then publish it via prev->next. */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* The old successor of @prev (if any) now hangs off n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * __hlist_for_each_rcu - iterate over the raw nodes of an hlist
 * @pos:  the node pointer to use as a loop cursor.
 * @head: the head for your list.
 *
 * The first pointer and every next pointer are loaded with
 * rcu_dereference(); the loop ends when @pos becomes NULL.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond, when given, is forwarded to __list_check_rcu() once, before the
 * first element is loaded.  The loop ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over an SRCU-protected list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike hlist_for_each_entry_rcu(), @cond is not optional here; it is
 * passed to __list_check_srcu() once, before the first element is loaded.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing: pointers are loaded with
 * rcu_dereference_raw_check() and __list_check_rcu() is not invoked.
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock_bh().
 *
 * Pointers are loaded with rcu_dereference_bh().
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration starts with the element following @pos, so @pos must point
 * to a valid entry on entry to the loop (its @member.next is read).
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Same as hlist_for_each_entry_continue_rcu(), but pointers are loaded
 * with rcu_dereference_bh().  Iteration starts with the element
 * following @pos, so @pos must point to a valid entry on entry to the loop.
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first iteration uses @pos as it is; if @pos is NULL the loop body
 * is never executed.
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif

























   14 








    3 






    3 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#define JUMP_LABEL_NOP_SIZE 5

#ifdef CONFIG_X86_64
# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
#else
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * arch_static_branch - emit a patchable branch site that starts out as a NOP
 * @key:    the static key this site belongs to.
 * @branch: added to the address of @key in the jump-table entry.
 *
 * The asm emits the STATIC_KEY_INIT_NOP bytes at local label 1 and
 * records an entry in the __jump_table section made of two 32-bit
 * self-relative offsets (the site at label 1 and the l_yes target) plus a
 * pointer-sized self-relative value "key + branch".
 *
 * Both operands use the "i" constraint, so they must be compile-time
 * constants at the point of expansion.
 *
 * Returns false when execution falls through the emitted bytes and true
 * when control arrives at l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

/*
 * arch_static_branch_jump - emit a patchable branch site that starts out as a jump
 * @key:    the static key this site belongs to.
 * @branch: added to the address of @key in the jump-table entry.
 *
 * The asm emits opcode byte 0xe9 followed by a 32-bit displacement to
 * l_yes, measured from local label 2 (the end of the instruction), and
 * records the same kind of __jump_table entry as arch_static_branch():
 * two 32-bit self-relative offsets (site, target) plus a pointer-sized
 * self-relative "key + branch".
 *
 * Returns true when control arrives at l_yes; the "return false" path
 * is only reached if execution falls through the emitted bytes.
 */
static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
                "2:\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#else        /* __ASSEMBLY__ */

/*
 * STATIC_JUMP_IF_TRUE target, key, def
 *
 * Assembler-side branch site.  When \def is non-zero the site is emitted
 * as a 5-byte jump to \target, otherwise as STATIC_KEY_INIT_NOP.  In both
 * cases a __jump_table entry is recorded: self-relative offsets of the
 * site and of \target, followed by the self-relative address of \key.
 */
.macro STATIC_JUMP_IF_TRUE target, key, def
.Lstatic_jump_\@:
        .if \def
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .else
        .byte                STATIC_KEY_INIT_NOP
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key - .
        .popsection
.endm

/*
 * STATIC_JUMP_IF_FALSE target, key, def
 *
 * Counterpart of STATIC_JUMP_IF_TRUE with the initial encoding inverted:
 * when \def is non-zero the site is emitted as STATIC_KEY_INIT_NOP,
 * otherwise as a 5-byte jump to \target.  The __jump_table entry stores
 * "\key + 1" (self-relative) instead of "\key".
 */
.macro STATIC_JUMP_IF_FALSE target, key, def
.Lstatic_jump_\@:
        .if \def
        .byte                STATIC_KEY_INIT_NOP
        .else
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key + 1 - .
        .popsection
.endm

#endif        /* __ASSEMBLY__ */

#endif































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _KERNEL_PRINTK_RINGBUFFER_H
#define _KERNEL_PRINTK_RINGBUFFER_H

#include <linux/atomic.h>
#include <linux/dev_printk.h>

/*
 * Meta information about each stored message.
 *
 * All fields are set by the printk code except for @seq, which is
 * set by the ringbuffer code.
 *
 * @flags and @level are bitfields of 5 and 3 bits, together filling
 * one u8.
 */
struct printk_info {
        u64        seq;                /* sequence number */
        u64        ts_nsec;        /* timestamp in nanoseconds */
        u16        text_len;        /* length of text message */
        u8        facility;        /* syslog facility */
        u8        flags:5;        /* internal record flags */
        u8        level:3;        /* syslog level */
        u32        caller_id;        /* thread id or processor id */

        struct dev_printk_info        dev_info;
};

/*
 * A structure providing the buffers, used by writers and readers.
 *
 * Writers:
 * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
 * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
 * buffers reserved for that writer.
 *
 * Readers:
 * Using prb_rec_init_rd(), a reader sets all fields before calling
 * prb_read_valid(). Note that the reader provides the @info and @text_buf
 * buffers. On success, the struct pointed to by @info will be filled and
 * the char array pointed to by @text_buf will be filled with text data.
 */
struct printk_record {
        struct printk_info        *info;
        char                        *text_buf;
        unsigned int                text_buf_size;
};

/*
 * Specifies the logical position and span of a data block.
 *
 * @begin: Logical position where the block starts.
 * @next:  Logical position following the block.
 *
 * Both fields are set to FAILED_LPOS by FAILED_BLK_LPOS to describe a
 * descriptor without an associated data block.
 */
struct prb_data_blk_lpos {
        unsigned long        begin;
        unsigned long        next;
};

/*
 * A descriptor: the complete meta-data for a record.
 *
 * @state_var:     A bitwise combination of descriptor ID and descriptor
 *                 state (see DESC_SV(), DESC_ID() and DESC_STATE()).
 * @text_blk_lpos: Logical position and span of the record's text data
 *                 block.
 */
struct prb_desc {
        atomic_long_t                        state_var;
        struct prb_data_blk_lpos        text_blk_lpos;
};

/*
 * A ringbuffer of "ID + data" elements.
 *
 * @size_bits: Size of @data as a power-of-2 exponent (see _DATA_SIZE()).
 * @data:      The data buffer.
 * @head_lpos: Logical position of the head.
 * @tail_lpos: Logical position of the tail.
 */
struct prb_data_ring {
        unsigned int        size_bits;
        char                *data;
        atomic_long_t        head_lpos;
        atomic_long_t        tail_lpos;
};

/*
 * A ringbuffer of "struct prb_desc" elements.
 *
 * @count_bits: Number of descriptors as a power-of-2 exponent
 *              (see _DESCS_COUNT()).
 * @descs:      The descriptor array.
 * @infos:      The printk_info array; _DEFINE_PRINTKRB() sizes it the
 *              same as @descs.
 * @head_id:    ID of the head descriptor.
 * @tail_id:    ID of the tail descriptor.
 */
struct prb_desc_ring {
        unsigned int                count_bits;
        struct prb_desc                *descs;
        struct printk_info        *infos;
        atomic_long_t                head_id;
        atomic_long_t                tail_id;
};

/*
 * The high level structure representing the printk ringbuffer.
 *
 * @desc_ring:      The ring of record descriptors.
 * @text_data_ring: The ring holding the text data of the records.
 * @fail:           Count of failed prb_reserve() calls where not even a
 *                  data-less record was created.
 */
struct printk_ringbuffer {
        struct prb_desc_ring        desc_ring;
        struct prb_data_ring        text_data_ring;
        atomic_long_t                fail;
};

/*
 * Used by writers as a reserve/commit handle.
 *
 * @rb:         Ringbuffer where the entry is reserved.
 * @irqflags:   Saved irq flags to restore on entry commit.
 * @id:         ID of the reserved descriptor.
 * @text_space: Total occupied buffer space in the text data ring, including
 *              ID, alignment padding, and wrapping data blocks.
 *
 * This structure is an opaque handle for writers. Its contents are only
 * to be used by the ringbuffer implementation. The same handle is passed
 * to prb_reserve() or prb_reserve_in_last() and later to prb_commit() or
 * prb_final_commit().
 */
struct prb_reserved_entry {
        struct printk_ringbuffer        *rb;
        unsigned long                        irqflags;
        unsigned long                        id;
        unsigned int                        text_space;
};

/*
 * The possible responses of a descriptor state-query.
 *
 * The four real states occupy the values 0x0-0x3 and therefore fit the
 * two-bit field that DESC_STATE() extracts from a state variable.
 * desc_miss is negative and is never stored; it only reports an ID
 * mismatch to the caller.
 */
enum desc_state {
        desc_miss        =  -1,        /* ID mismatch (pseudo state) */
        desc_reserved        = 0x0,        /* reserved, in use by writer */
        desc_committed        = 0x1,        /* committed by writer, could get reopened */
        desc_finalized        = 0x2,        /* committed, no further modification allowed */
        desc_reusable        = 0x3,        /* free, not yet used by any writer */
};

/*
 * Size/count helpers and the layout of a descriptor state variable.
 *
 * A state variable is one unsigned long: the two most significant bits
 * hold the descriptor state, the remaining bits hold the descriptor ID.
 *
 * _DATA_SIZE(sz_bits):   data ring size in bytes for a given exponent.
 * _DESCS_COUNT(ct_bits): number of descriptors for a given exponent.
 * DESC_STATE(sv):        extract the 2-bit state from a state variable.
 * DESC_SV(id, state):    build a state variable from an ID and a state.
 * DESC_ID(sv):           extract the ID from a state variable.
 *
 * All macro arguments are parenthesized so that arbitrary expressions
 * may be passed.
 */
#define _DATA_SIZE(sz_bits)        (1UL << (sz_bits))
#define _DESCS_COUNT(ct_bits)        (1U << (ct_bits))
#define DESC_SV_BITS                (sizeof(unsigned long) * 8)
#define DESC_FLAGS_SHIFT        (DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK                (3UL << DESC_FLAGS_SHIFT)
#define DESC_STATE(sv)                (3UL & ((sv) >> DESC_FLAGS_SHIFT))
#define DESC_SV(id, state)        (((unsigned long)(state) << DESC_FLAGS_SHIFT) | (id))
#define DESC_ID_MASK                (~DESC_FLAGS_MASK)
#define DESC_ID(sv)                ((sv) & DESC_ID_MASK)
#define FAILED_LPOS                0x1
#define NO_LPOS                        0x3

/* Initializer for a struct prb_data_blk_lpos that has no data block. */
#define FAILED_BLK_LPOS        \
{                                \
        .begin        = FAILED_LPOS,        \
        .next        = FAILED_LPOS,        \
}

/*
 * Descriptor Bootstrap
 *
 * The descriptor array is minimally initialized to allow immediate usage
 * by readers and writers. The requirements that the descriptor array
 * initialization must satisfy:
 *
 *   Req1
 *     The tail must point to an existing (committed or reusable) descriptor.
 *     This is required by the implementation of prb_first_seq().
 *
 *   Req2
 *     Readers must see that the ringbuffer is initially empty.
 *
 *   Req3
 *     The first record reserved by a writer is assigned sequence number 0.
 *
 * To satisfy Req1, the tail initially points to a descriptor that is
 * minimally initialized (having no data block, i.e. data-less with the
 * data block's lpos @begin and @next values set to FAILED_LPOS).
 *
 * To satisfy Req2, the initial tail descriptor is initialized to the
 * reusable state. Readers recognize reusable descriptors as existing
 * records, but skip over them.
 *
 * To satisfy Req3, the last descriptor in the array is used as the initial
 * head (and tail) descriptor. This allows the first record reserved by a
 * writer (head + 1) to be the first descriptor in the array. (Only the first
 * descriptor in the array could have a valid sequence number of 0.)
 *
 * The first time a descriptor is reserved, it is assigned a sequence number
 * with the value of the array index. A "first time reserved" descriptor can
 * be recognized because it has a sequence number of 0 but does not have an
 * index of 0. (Only the first descriptor in the array could have a valid
 * sequence number of 0.) After the first reservation, all future reservations
 * (recycling) simply involve incrementing the sequence number by the array
 * count.
 *
 *   Hack #1
 *     Only the first descriptor in the array is allowed to have the sequence
 *     number 0. In this case it is not possible to recognize if it is being
 *     reserved the first time (set to index value) or has been reserved
 *     previously (increment by the array count). This is handled by _always_
 *     incrementing the sequence number by the array count when reserving the
 *     first descriptor in the array. In order to satisfy Req3, the sequence
 *     number of the first descriptor in the array is initialized to minus
 *     the array count. Then, upon the first reservation, it is incremented
 *     to 0, thus satisfying Req3.
 *
 *   Hack #2
 *     prb_first_seq() can be called at any time by readers to retrieve the
 *     sequence number of the tail descriptor. However, due to Req2 and Req3,
 *     initially there are no records to report the sequence number of
 *     (sequence numbers are u64 and there is nothing less than 0). To handle
 *     this, the sequence number of the initial tail descriptor is initialized
 *     to 0. Technically this is incorrect, because there is no record with
 *     sequence number 0 (yet) and the tail descriptor is not the first
 *     descriptor in the array. But it allows prb_read_valid() to correctly
 *     report the existence of a record for _any_ given sequence number at all
 *     times. Bootstrapping is complete when the tail is pushed the first
 *     time, thus finally pointing to the first descriptor reserved by a
 *     writer, which has the assigned sequence number 0.
 */

/*
 * Initiating Logical Value Overflows
 *
 * Both logical position (lpos) and ID values can be mapped to array indexes
 * but may experience overflows during the lifetime of the system. To ensure
 * that printk_ringbuffer can handle the overflows for these types, initial
 * values are chosen that map to the correct initial array indexes, but will
 * result in overflows soon.
 *
 *   BLK0_LPOS
 *     The initial @head_lpos and @tail_lpos for data rings. It is at index
 *     0 and the lpos value is such that it will overflow on the first wrap.
 *
 *   DESC0_ID
 *     The initial @head_id and @tail_id for the desc ring. It is at the last
 *     index of the descriptor array (see Req3 above) and the ID value is such
 *     that it will overflow on the second wrap.
 *
 *   DESC0_SV
 *     The state variable of the initial descriptor: DESC0_ID combined with
 *     the desc_reusable state.
 *
 * NOTE(review): _DESCS_COUNT() is an unsigned int expression (1U << bits),
 * so the negation in DESC0_ID() is evaluated in unsigned int before being
 * masked with the unsigned long DESC_ID_MASK. Where unsigned long is wider
 * than unsigned int the upper bits of the result are therefore zero and the
 * ID is not close to overflowing - confirm whether that is intended.
 */
#define BLK0_LPOS(sz_bits)        (-(_DATA_SIZE(sz_bits)))
#define DESC0_ID(ct_bits)        DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
#define DESC0_SV(ct_bits)        DESC_SV(DESC0_ID(ct_bits), desc_reusable)

/*
 * Define a ringbuffer with an external text data buffer. The same as
 * DEFINE_PRINTKRB() but requires specifying an external buffer for the
 * text data.
 *
 * Three static objects are defined: the descriptor array _<name>_descs,
 * the info array _<name>_infos and the ringbuffer <name> itself. Only the
 * last descriptor (the initial head and tail) and the first and last info
 * entries get explicit initializers.
 *
 * All atomic_long_t members are initialized with ATOMIC_LONG_INIT().
 *
 * Note: The specified external buffer must be of the size:
 *       2 ^ (descbits + avgtextbits)
 */
#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf)                        \
static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = {                                \
        /* the initial head and tail */                                                                \
        [_DESCS_COUNT(descbits) - 1] = {                                                        \
                /* reusable */                                                                        \
                .state_var        = ATOMIC_LONG_INIT(DESC0_SV(descbits)),                                \
                /* no associated data block */                                                        \
                .text_blk_lpos        = FAILED_BLK_LPOS,                                                \
        },                                                                                        \
};                                                                                                \
static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = {                                \
        /* this will be the first record reserved by a writer */                                \
        [0] = {                                                                                        \
                /* will be incremented to 0 on the first reservation */                                \
                .seq = -(u64)_DESCS_COUNT(descbits),                                                \
        },                                                                                        \
        /* the initial head and tail */                                                                \
        [_DESCS_COUNT(descbits) - 1] = {                                                        \
                /* reports the first seq value during the bootstrap phase */                        \
                .seq = 0,                                                                        \
        },                                                                                        \
};                                                                                                \
static struct printk_ringbuffer name = {                                                        \
        .desc_ring = {                                                                                \
                .count_bits        = descbits,                                                        \
                .descs                = &_##name##_descs[0],                                                \
                .infos                = &_##name##_infos[0],                                                \
                .head_id        = ATOMIC_LONG_INIT(DESC0_ID(descbits)),                                \
                .tail_id        = ATOMIC_LONG_INIT(DESC0_ID(descbits)),                                \
        },                                                                                        \
        .text_data_ring = {                                                                        \
                .size_bits        = (avgtextbits) + (descbits),                                        \
                .data                = text_buf,                                                        \
                .head_lpos        = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),        \
                .tail_lpos        = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),        \
        },                                                                                        \
        .fail                        = ATOMIC_LONG_INIT(0),                                                \
}

/**
 * DEFINE_PRINTKRB() - Define a ringbuffer.
 *
 * @name:        The name of the ringbuffer variable.
 * @descbits:    The number of descriptors as a power-of-2 value.
 * @avgtextbits: The average text data size per record as a power-of-2 value.
 *
 * This is a macro for defining a ringbuffer and all internal structures
 * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
 * variant where the text data buffer can be specified externally.
 *
 * The text buffer is a static char array _<name>_text of
 * 2 ^ (avgtextbits + descbits) bytes, aligned like an unsigned long.
 */
#define DEFINE_PRINTKRB(name, descbits, avgtextbits)                                \
static char _##name##_text[1U << ((avgtextbits) + (descbits))]                        \
                        __aligned(__alignof__(unsigned long));                        \
_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])

/* Writer Interface */

/**
 * prb_rec_init_wd() - Initialize a buffer for writing records.
 *
 * @r:             The record to initialize.
 * @text_buf_size: The needed text buffer size.
 */
static inline void prb_rec_init_wr(struct printk_record *r,
                                   unsigned int text_buf_size)
{
        r->info = NULL;
        r->text_buf = NULL;
        r->text_buf_size = text_buf_size;
}

/*
 * Writer-side entry points. Only the declarations are given here; the
 * definitions are not part of this header.
 *
 * prb_reserve() and prb_reserve_in_last() take a reserve/commit handle
 * @e together with the ringbuffer and a record; prb_commit() and
 * prb_final_commit() take only the handle.
 */
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                 struct printk_record *r);
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                         struct printk_record *r, u32 caller_id, unsigned int max_size);
void prb_commit(struct prb_reserved_entry *e);
void prb_final_commit(struct prb_reserved_entry *e);

/*
 * Run-time counterpart of the static definition macros: the caller passes
 * the text buffer, the descriptor array and the info array explicitly.
 * NOTE(review): presumably @text_buf_size and @descs_count_bits must match
 * the sizes of the supplied buffers - confirm against the implementation.
 */
void prb_init(struct printk_ringbuffer *rb,
              char *text_buf, unsigned int text_buf_size,
              struct prb_desc *descs, unsigned int descs_count_bits,
              struct printk_info *infos);
unsigned int prb_record_text_space(struct prb_reserved_entry *e);

/* Reader Interface */

/**
 * prb_rec_init_rd() - Prepare a record for use by a reader.
 *
 * @r:             The record to prepare.
 * @info:          Where record meta-data is to be stored.
 * @text_buf:      Where text data is to be stored.
 * @text_buf_size: The size of @text_buf.
 *
 * Copies the three caller-supplied values into @r. Every argument other
 * than @r is optional: record data is only read for arguments that are
 * non-NULL or non-zero.
 */
static inline void prb_rec_init_rd(struct printk_record *r,
                                   struct printk_info *info,
                                   char *text_buf, unsigned int text_buf_size)
{
        /* Text destination and its capacity. */
        r->text_buf_size = text_buf_size;
        r->text_buf = text_buf;

        /* Meta-data destination. */
        r->info = info;
}

/**
 * prb_for_each_record() - Walk the records of a ringbuffer.
 *
 * @from: The sequence number to start at.
 * @rb:   The ringbuffer to walk.
 * @s:    A u64 that receives the sequence number used for each lookup.
 * @r:    A printk_record that receives the record on each iteration.
 *
 * Convenience loop header: it keeps going for as long as prb_read_valid()
 * succeeds for @s, and then advances @s to @r->info->seq + 1. Because the
 * step expression dereferences @r->info, @r must have been set up with an
 * info buffer.
 *
 * Note that @s is not necessarily the sequence number of the record that
 * was read; check @r->info->seq for that.
 *
 * Context: Any context.
 */
#define prb_for_each_record(from, rb, s, r) \
for ((s) = (from); prb_read_valid((rb), (s), (r)); (s) = (r)->info->seq + 1)

/**
 * prb_for_each_info() - Walk the meta data of a ringbuffer.
 *
 * @from: The sequence number to start at.
 * @rb:   The ringbuffer to walk.
 * @s:    A u64 that receives the sequence number used for each lookup.
 * @i:    A printk_info that receives the record meta data on each iteration.
 * @lc:   An unsigned int that receives the text line count of each record.
 *
 * Convenience loop header: it keeps going for as long as
 * prb_read_valid_info() succeeds for @s, and then advances @s to
 * @i->seq + 1.
 *
 * Note that @s is not necessarily the sequence number of the record that
 * was read; check @i->seq for that.
 *
 * Context: Any context.
 */
#define prb_for_each_info(from, rb, s, i, lc) \
for ((s) = (from); prb_read_valid_info((rb), (s), (i), (lc)); (s) = (i)->seq + 1)

/*
 * Reader-side functions (declarations only). prb_read_valid() and
 * prb_read_valid_info() are the loop conditions used by
 * prb_for_each_record() and prb_for_each_info() respectively.
 */
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r);
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
                         struct printk_info *info, unsigned int *line_count);

/* Sequence-number queries on a ringbuffer (declarations only). */
u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
u64 prb_next_seq(struct printk_ringbuffer *rb);

#endif /* _KERNEL_PRINTK_RINGBUFFER_H */


































    5 

    5 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/* Structure to mediate between updaters and fastpath-using readers.  */
struct rcu_sync {
        /* Zero means GP_IDLE; rcu_sync_is_idle() samples it with READ_ONCE(). */
        int                        gp_state;
        /* NOTE(review): presumably counts updaters in progress -- confirm. */
        int                        gp_count;
        /* Wait queue head; statically set up by __RCU_SYNC_INITIALIZER(). */
        wait_queue_head_t        gp_wait;

        /* NOTE(review): presumably the head used to queue an RCU callback -- confirm. */
        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - May readers take their fastpaths right now?
 * @rsp: Pointer to the rcu_sync structure used for synchronization
 *
 * Returns true when @rsp->gp_state is zero (GP_IDLE), i.e. when readers
 * are permitted to use their fastpaths.  Must be invoked within some
 * flavor of RCU read-side critical section; a lockdep warning is raised
 * otherwise.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        int state;

        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        /* Single marked read of the state word. */
        state = READ_ONCE(rsp->gp_state);

        return state == 0; /* 0 is GP_IDLE */
}

/*
 * Out-of-line rcu_sync operations. Only the prototypes are given here;
 * each takes a pointer to the struct rcu_sync it operates on.
 */
extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter_start(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

/*
 * Static initializer for a struct rcu_sync variable called @name:
 * gp_state and gp_count start at zero and gp_wait is set up with
 * __WAIT_QUEUE_HEAD_INITIALIZER(). cb_head is not named, so it receives
 * the implicit zero initialization of an omitted member.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/* Define a struct rcu_sync variable @name, statically initialized as above. */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

/* These three hooks expand to empty statements in this header. */
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)

/*
 * tlb_flush() is declared, and the self-referential macro defined, before
 * <asm-generic/tlb.h> is included; the function body follows the include.
 * NOTE(review): presumably the macro lets the generic header detect that
 * this architecture supplies its own tlb_flush() -- confirm against
 * asm-generic/tlb.h.
 */
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

/*
 * Flush the TLB for the address space described by @tlb.
 *
 * When @tlb covers the whole mm (fullmm) or asks for a full flush
 * (need_flush_all), the range passed down is 0..TLB_FLUSH_ALL; otherwise
 * it is the [start, end) range recorded in @tlb. The stride comes from
 * tlb_get_unmap_shift() and the freed_tables flag is passed through.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        unsigned int stride_shift = tlb_get_unmap_shift(tlb);
        unsigned long start, end;

        if (tlb->fullmm || tlb->need_flush_all) {
                start = 0UL;
                end = TLB_FLUSH_ALL;
        } else {
                start = tlb->start;
                end = tlb->end;
        }

        flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we
 * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 *
 * __tlb_remove_table() itself simply hands @table to
 * free_page_and_swap_cache().
 */
static inline void __tlb_remove_table(void *table)
{
        free_page_and_swap_cache(table);
}

#endif /* _ASM_X86_TLB_H */
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Network device features.
 */
#ifndef _LINUX_NETDEV_FEATURES_H
#define _LINUX_NETDEV_FEATURES_H

#include <linux/types.h>
#include <linux/bitops.h>
#include <asm/byteorder.h>

typedef u64 netdev_features_t;

/*
 * Bit positions of the individual device features. The NETIF_F_<name>
 * masks are derived from the NETIF_F_<name>_BIT enumerators through
 * __NETIF_F().
 */
enum {
        NETIF_F_SG_BIT,                        /* Scatter/gather IO. */
        NETIF_F_IP_CSUM_BIT,                /* Can checksum TCP/UDP over IPv4. */
        __UNUSED_NETIF_F_1,
        NETIF_F_HW_CSUM_BIT,                /* Can checksum all the packets. */
        NETIF_F_IPV6_CSUM_BIT,                /* Can checksum TCP/UDP over IPV6 */
        NETIF_F_HIGHDMA_BIT,                /* Can DMA to high memory. */
        NETIF_F_FRAGLIST_BIT,                /* Scatter/gather IO. */
        NETIF_F_HW_VLAN_CTAG_TX_BIT,        /* Transmit VLAN CTAG HW acceleration */
        NETIF_F_HW_VLAN_CTAG_RX_BIT,        /* Receive VLAN CTAG HW acceleration */
        NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */
        NETIF_F_VLAN_CHALLENGED_BIT,        /* Device cannot handle VLAN packets */
        NETIF_F_GSO_BIT,                /* Enable software GSO. */
        NETIF_F_LLTX_BIT,                /* LockLess TX - deprecated. Please */
                                        /* do not use LLTX in new drivers */
        NETIF_F_NETNS_LOCAL_BIT,        /* Does not change network namespaces */
        NETIF_F_GRO_BIT,                /* Generic receive offload */
        NETIF_F_LRO_BIT,                /* large receive offload */

        /* NETIF_F_GSO_SHIFT and NETIF_F_TSO_BIT name the same bit position. */
        /**/NETIF_F_GSO_SHIFT,                /* keep the order of SKB_GSO_* bits */
        NETIF_F_TSO_BIT                        /* ... TCPv4 segmentation */
                = NETIF_F_GSO_SHIFT,
        NETIF_F_GSO_ROBUST_BIT,                /* ... ->SKB_GSO_DODGY */
        NETIF_F_TSO_ECN_BIT,                /* ... TCP ECN support */
        NETIF_F_TSO_MANGLEID_BIT,        /* ... IPV4 ID mangling allowed */
        NETIF_F_TSO6_BIT,                /* ... TCPv6 segmentation */
        NETIF_F_FSO_BIT,                /* ... FCoE segmentation */
        NETIF_F_GSO_GRE_BIT,                /* ... GRE with TSO */
        NETIF_F_GSO_GRE_CSUM_BIT,        /* ... GRE with csum with TSO */
        NETIF_F_GSO_IPXIP4_BIT,                /* ... IP4 or IP6 over IP4 with TSO */
        NETIF_F_GSO_IPXIP6_BIT,                /* ... IP4 or IP6 over IP6 with TSO */
        NETIF_F_GSO_UDP_TUNNEL_BIT,        /* ... UDP TUNNEL with TSO */
        NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
        NETIF_F_GSO_PARTIAL_BIT,        /* ... Only segment inner-most L4
                                         *     in hardware and all other
                                         *     headers in software.
                                         */
        NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
        NETIF_F_GSO_SCTP_BIT,                /* ... SCTP fragmentation */
        NETIF_F_GSO_ESP_BIT,                /* ... ESP with TSO */
        NETIF_F_GSO_UDP_BIT,                /* ... UFO, deprecated except tuntap */
        NETIF_F_GSO_UDP_L4_BIT,                /* ... UDP payload GSO (not UFO) */
        NETIF_F_GSO_FRAGLIST_BIT,                /* ... Fraglist GSO */
        /* NETIF_F_GSO_LAST is an alias of the final GSO bit, not a new bit. */
        /**/NETIF_F_GSO_LAST =                /* last bit, see GSO_MASK */
                NETIF_F_GSO_FRAGLIST_BIT,

        NETIF_F_FCOE_CRC_BIT,                /* FCoE CRC32 */
        NETIF_F_SCTP_CRC_BIT,                /* SCTP checksum offload */
        NETIF_F_FCOE_MTU_BIT,                /* Supports max FCoE MTU, 2158 bytes*/
        NETIF_F_NTUPLE_BIT,                /* N-tuple filters supported */
        NETIF_F_RXHASH_BIT,                /* Receive hashing offload */
        NETIF_F_RXCSUM_BIT,                /* Receive checksumming offload */
        NETIF_F_NOCACHE_COPY_BIT,        /* Use no-cache copyfromuser */
        NETIF_F_LOOPBACK_BIT,                /* Enable loopback */
        NETIF_F_RXFCS_BIT,                /* Append FCS to skb pkt data */
        NETIF_F_RXALL_BIT,                /* Receive errored frames too */
        NETIF_F_HW_VLAN_STAG_TX_BIT,        /* Transmit VLAN STAG HW acceleration */
        NETIF_F_HW_VLAN_STAG_RX_BIT,        /* Receive VLAN STAG HW acceleration */
        NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
        NETIF_F_HW_L2FW_DOFFLOAD_BIT,        /* Allow L2 Forwarding in Hardware */

        NETIF_F_HW_TC_BIT,                /* Offload TC infrastructure */
        NETIF_F_HW_ESP_BIT,                /* Hardware ESP transformation offload */
        NETIF_F_HW_ESP_TX_CSUM_BIT,        /* ESP with TX checksum offload */
        NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
        NETIF_F_HW_TLS_TX_BIT,                /* Hardware TLS TX offload */
        NETIF_F_HW_TLS_RX_BIT,                /* Hardware TLS RX offload */

        NETIF_F_GRO_HW_BIT,                /* Hardware Generic receive offload */
        NETIF_F_HW_TLS_RECORD_BIT,        /* Offload TLS record */
        NETIF_F_GRO_FRAGLIST_BIT,        /* Fraglist GRO */

        NETIF_F_HW_MACSEC_BIT,                /* Offload MACsec operations */
        NETIF_F_GRO_UDP_FWD_BIT,        /* Allow UDP GRO for forwarding */

        /*
         * Add your fresh new feature above and remember to update
         * netdev_features_strings[] in net/ethtool/common.c and maybe
         * some feature mask #defines below. Please also describe it
         * in Documentation/networking/netdev-features.rst.
         */

        /* Must stay last: one past the highest feature bit position. */
        /**/NETDEV_FEATURE_COUNT
};

/* copy'n'paste compression ;)
 *
 * __NETIF_F_BIT(bit) is the netdev_features_t mask with only @bit set;
 * __NETIF_F(name) applies it to the NETIF_F_<name>_BIT enumerator.
 */
#define __NETIF_F_BIT(bit)        ((netdev_features_t)1 << (bit))
#define __NETIF_F(name)                __NETIF_F_BIT(NETIF_F_##name##_BIT)

/* Single-bit feature masks, one per NETIF_F_<name>_BIT used below. */
#define NETIF_F_FCOE_CRC        __NETIF_F(FCOE_CRC)
#define NETIF_F_FCOE_MTU        __NETIF_F(FCOE_MTU)
#define NETIF_F_FRAGLIST        __NETIF_F(FRAGLIST)
#define NETIF_F_FSO                __NETIF_F(FSO)
#define NETIF_F_GRO                __NETIF_F(GRO)
#define NETIF_F_GRO_HW                __NETIF_F(GRO_HW)
#define NETIF_F_GSO                __NETIF_F(GSO)
#define NETIF_F_GSO_ROBUST        __NETIF_F(GSO_ROBUST)
#define NETIF_F_HIGHDMA                __NETIF_F(HIGHDMA)
#define NETIF_F_HW_CSUM                __NETIF_F(HW_CSUM)
#define NETIF_F_HW_VLAN_CTAG_FILTER __NETIF_F(HW_VLAN_CTAG_FILTER)
#define NETIF_F_HW_VLAN_CTAG_RX        __NETIF_F(HW_VLAN_CTAG_RX)
#define NETIF_F_HW_VLAN_CTAG_TX        __NETIF_F(HW_VLAN_CTAG_TX)
#define NETIF_F_IP_CSUM                __NETIF_F(IP_CSUM)
#define NETIF_F_IPV6_CSUM        __NETIF_F(IPV6_CSUM)
#define NETIF_F_LLTX                __NETIF_F(LLTX)
#define NETIF_F_LOOPBACK        __NETIF_F(LOOPBACK)
#define NETIF_F_LRO                __NETIF_F(LRO)
#define NETIF_F_NETNS_LOCAL        __NETIF_F(NETNS_LOCAL)
#define NETIF_F_NOCACHE_COPY        __NETIF_F(NOCACHE_COPY)
#define NETIF_F_NTUPLE                __NETIF_F(NTUPLE)
#define NETIF_F_RXCSUM                __NETIF_F(RXCSUM)
#define NETIF_F_RXHASH                __NETIF_F(RXHASH)
#define NETIF_F_SCTP_CRC        __NETIF_F(SCTP_CRC)
#define NETIF_F_SG                __NETIF_F(SG)
#define NETIF_F_TSO6                __NETIF_F(TSO6)
#define NETIF_F_TSO_ECN                __NETIF_F(TSO_ECN)
#define NETIF_F_TSO                __NETIF_F(TSO)
#define NETIF_F_VLAN_CHALLENGED        __NETIF_F(VLAN_CHALLENGED)
#define NETIF_F_RXFCS                __NETIF_F(RXFCS)
#define NETIF_F_RXALL                __NETIF_F(RXALL)
#define NETIF_F_GSO_GRE                __NETIF_F(GSO_GRE)
#define NETIF_F_GSO_GRE_CSUM        __NETIF_F(GSO_GRE_CSUM)
#define NETIF_F_GSO_IPXIP4        __NETIF_F(GSO_IPXIP4)
#define NETIF_F_GSO_IPXIP6        __NETIF_F(GSO_IPXIP6)
#define NETIF_F_GSO_UDP_TUNNEL        __NETIF_F(GSO_UDP_TUNNEL)
#define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
#define NETIF_F_TSO_MANGLEID        __NETIF_F(TSO_MANGLEID)
#define NETIF_F_GSO_PARTIAL         __NETIF_F(GSO_PARTIAL)
#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
#define NETIF_F_GSO_SCTP        __NETIF_F(GSO_SCTP)
#define NETIF_F_GSO_ESP                __NETIF_F(GSO_ESP)
#define NETIF_F_GSO_UDP                __NETIF_F(GSO_UDP)
#define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
#define NETIF_F_HW_VLAN_STAG_RX        __NETIF_F(HW_VLAN_STAG_RX)
#define NETIF_F_HW_VLAN_STAG_TX        __NETIF_F(HW_VLAN_STAG_TX)
#define NETIF_F_HW_L2FW_DOFFLOAD        __NETIF_F(HW_L2FW_DOFFLOAD)
#define NETIF_F_HW_TC                __NETIF_F(HW_TC)
#define NETIF_F_HW_ESP                __NETIF_F(HW_ESP)
#define NETIF_F_HW_ESP_TX_CSUM        __NETIF_F(HW_ESP_TX_CSUM)
#define        NETIF_F_RX_UDP_TUNNEL_PORT  __NETIF_F(RX_UDP_TUNNEL_PORT)
#define NETIF_F_HW_TLS_RECORD        __NETIF_F(HW_TLS_RECORD)
#define NETIF_F_GSO_UDP_L4        __NETIF_F(GSO_UDP_L4)
#define NETIF_F_HW_TLS_TX        __NETIF_F(HW_TLS_TX)
#define NETIF_F_HW_TLS_RX        __NETIF_F(HW_TLS_RX)
#define NETIF_F_GRO_FRAGLIST        __NETIF_F(GRO_FRAGLIST)
#define NETIF_F_GSO_FRAGLIST        __NETIF_F(GSO_FRAGLIST)
#define NETIF_F_HW_MACSEC        __NETIF_F(HW_MACSEC)
#define NETIF_F_GRO_UDP_FWD        __NETIF_F(GRO_UDP_FWD)

/* Finds the highest-numbered set feature bit in the range start-1 till 0.
 *
 * @feature: feature mask to search
 * @start:   number of low-order bits to consider; bits from @start upwards
 *           are ignored
 *
 * Returns the bit number found, or -1 when none of the considered bits is
 * set. An empty range (@start == 0) always yields -1.
 */
static inline int find_next_netdev_feature(u64 feature, unsigned long start)
{
        /* An empty range has to be caught before masking: the shift count
         * computed below is 0 for start == 0 just as it is for start == 64,
         * so every bit would be kept and the top set bit returned again.
         * for_each_netdev_feature() would then never terminate once it has
         * visited bit 0.
         */
        if (!start)
                return -1;

        /* like BITMAP_LAST_WORD_MASK() for u64
         * this sets the most significant 64 - start to 0.
         */
        feature &= ~0ULL >> (-start & ((sizeof(feature) * 8) - 1));

        return fls64(feature) - 1;
}

/* This goes from the MSB to the LSB through the set feature bits.
 * Despite its name, mask_addr is the u64 mask value itself (it is passed
 * by value to find_next_netdev_feature()) and it is evaluated again on
 * every iteration. bit should be an int: the loop ends when it goes
 * negative.
 */
#define for_each_netdev_feature(mask_addr, bit)                                \
        for ((bit) = find_next_netdev_feature((mask_addr),                \
                                              NETDEV_FEATURE_COUNT);        \
             (bit) >= 0;                                                \
             (bit) = find_next_netdev_feature((mask_addr), (bit)))

/* Features that are masked out of NETIF_F_ETHTOOL_BITS below. */
#define NETIF_F_NEVER_CHANGE        (NETIF_F_VLAN_CHALLENGED | \
                                 NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)

/* Features valid for ethtool to change
 * = all defined minus driver/device-class-related (NETIF_F_NEVER_CHANGE).
 *
 * "All defined" is built as the highest feature bit OR'ed with every bit
 * below it, rather than as (1 << COUNT) - 1:
 * remember that ((t)1 << t_BITS) is undefined in C99
 */
#define NETIF_F_ETHTOOL_BITS        ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \
                (__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) - 1)) & \
                ~NETIF_F_NEVER_CHANGE)

/* Segmentation offload feature mask: every bit from NETIF_F_GSO_SHIFT up to
 * and including NETIF_F_GSO_LAST.
 */
#define NETIF_F_GSO_MASK        (__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \
                __NETIF_F_BIT(NETIF_F_GSO_SHIFT))

/* List of IP checksum features. Note that NETIF_F_HW_CSUM should not be
 * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set--
 * this would be contradictory
 */
#define NETIF_F_CSUM_MASK        (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
                                 NETIF_F_HW_CSUM)

/* All four TSO feature bits. */
#define NETIF_F_ALL_TSO         (NETIF_F_TSO | NETIF_F_TSO6 | \
                                 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)

/* All three FCoE feature bits. */
#define NETIF_F_ALL_FCOE        (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
                                 NETIF_F_FSO)

/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE        (NETIF_F_ALL_TSO | \
                                 NETIF_F_GSO_SCTP)

/*
 * If one device supports one of these features, then enable them
 * for all in netdev_increment_features.
 */
#define NETIF_F_ONE_FOR_ALL        (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
                                 NETIF_F_SG | NETIF_F_HIGHDMA |                \
                                 NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED)

/*
 * If one device doesn't support one of these features, then disable it
 * for all in netdev_increment_features.
 */
#define NETIF_F_ALL_FOR_ALL        (NETIF_F_NOCACHE_COPY | NETIF_F_FSO)

/*
 * If upper/master device has these features disabled, they must be disabled
 * on all lower/slave devices as well.
 */
#define NETIF_F_UPPER_DISABLES        NETIF_F_LRO

/* Changeable features with no special hardware requirements. */
#define NETIF_F_SOFT_FEATURES        (NETIF_F_GSO | NETIF_F_GRO)

/* Changeable features with no special hardware requirements that default to off. */
#define NETIF_F_SOFT_FEATURES_OFF        (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)

/* All CTAG and STAG VLAN offload bits (filter, RX and TX). */
#define NETIF_F_VLAN_FEATURES        (NETIF_F_HW_VLAN_CTAG_FILTER | \
                                 NETIF_F_HW_VLAN_CTAG_RX | \
                                 NETIF_F_HW_VLAN_CTAG_TX | \
                                 NETIF_F_HW_VLAN_STAG_FILTER | \
                                 NETIF_F_HW_VLAN_STAG_RX | \
                                 NETIF_F_HW_VLAN_STAG_TX)

/* The GRE, IPXIP and UDP tunnel GSO bits. */
#define NETIF_F_GSO_ENCAP_ALL        (NETIF_F_GSO_GRE |                        \
                                 NETIF_F_GSO_GRE_CSUM |                        \
                                 NETIF_F_GSO_IPXIP4 |                        \
                                 NETIF_F_GSO_IPXIP6 |                        \
                                 NETIF_F_GSO_UDP_TUNNEL |                \
                                 NETIF_F_GSO_UDP_TUNNEL_CSUM)

#endif        /* _LINUX_NETDEV_FEATURES_H */



































































































































































































































































































































































































































































































    2 




































































































    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>

struct blk_mq_tags;
struct blk_flush_queue;

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
        /* Anonymous group, marked ____cacheline_aligned_in_smp as a whole. */
        struct {
                /** @lock: Protects the dispatch list. */
                spinlock_t                lock;
                /**
                 * @dispatch: Used for requests that are ready to be
                 * dispatched to the hardware but for some reason (e.g. lack of
                 * resources) could not be sent to the hardware. As soon as the
                 * driver can send new requests, requests at this list will
                 * be sent first for a fairer dispatch.
                 */
                struct list_head        dispatch;
                /**
                 * @state: BLK_MQ_S_* flags. Defines the state of the hw
                 * queue (active, scheduled to restart, stopped).
                 */
                unsigned long                state;
        } ____cacheline_aligned_in_smp;

        /**
         * @run_work: Used for scheduling a hardware queue run at a later time.
         */
        struct delayed_work        run_work;
        /** @cpumask: Map of available CPUs where this hctx can run. */
        cpumask_var_t                cpumask;
        /**
         * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
         * selection from @cpumask.
         */
        int                        next_cpu;
        /**
         * @next_cpu_batch: Counter of how many works left in the batch before
         * changing to the next CPU.
         */
        int                        next_cpu_batch;

        /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
        unsigned long                flags;

        /**
         * @sched_data: Pointer owned by the IO scheduler attached to a request
         * queue. It's up to the IO scheduler how to use this pointer.
         */
        void                        *sched_data;
        /**
         * @queue: Pointer to the request queue that owns this hardware context.
         */
        struct request_queue        *queue;
        /** @fq: Queue of requests that need to perform a flush operation. */
        struct blk_flush_queue        *fq;

        /**
         * @driver_data: Pointer to data owned by the block driver that created
         * this hctx
         */
        void                        *driver_data;

        /**
         * @ctx_map: Bitmap for each software queue. If bit is on, there is a
         * pending request in that software queue.
         */
        struct sbitmap                ctx_map;

        /**
         * @dispatch_from: Software queue to be used when no scheduler was
         * selected.
         */
        struct blk_mq_ctx        *dispatch_from;
        /**
         * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
         * decide if the hw_queue is busy using Exponential Weighted Moving
         * Average algorithm.
         */
        unsigned int                dispatch_busy;

        /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
        unsigned short                type;
        /** @nr_ctx: Number of software queues. */
        unsigned short                nr_ctx;
        /** @ctxs: Array of software queues. */
        struct blk_mq_ctx        **ctxs;

        /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
        spinlock_t                dispatch_wait_lock;
        /**
         * @dispatch_wait: Waitqueue to put requests when there is no tag
         * available at the moment, to wait for another try in the future.
         */
        wait_queue_entry_t        dispatch_wait;

        /**
         * @wait_index: Index of next available dispatch_wait queue to insert
         * requests.
         */
        atomic_t                wait_index;

        /**
         * @tags: Tags owned by the block driver. A tag at this set is only
         * assigned when a request is dispatched from a hardware queue.
         */
        struct blk_mq_tags        *tags;
        /**
         * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
         * scheduler associated with a request queue, a tag is assigned when
         * that request is allocated. Else, this member is not used.
         */
        struct blk_mq_tags        *sched_tags;

        /** @queued: Number of queued requests. */
        unsigned long                queued;
        /** @run: Number of dispatched requests. */
        unsigned long                run;
        /* Number of entries in the @dispatched array below. */
#define BLK_MQ_MAX_DISPATCH_ORDER        7
        /** @dispatched: Number of dispatch requests by queue. */
        unsigned long                dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

        /** @numa_node: NUMA node the storage adapter has been connected to. */
        unsigned int                numa_node;
        /** @queue_num: Index of this hardware queue. */
        unsigned int                queue_num;

        /**
         * @nr_active: Number of active requests. Only used when a tag set is
         * shared across request queues.
         */
        atomic_t                nr_active;
        /**
         * @elevator_queued: Number of queued requests on hctx.
         */
        atomic_t                elevator_queued;

        /** @cpuhp_online: List to store request if CPU is going to die. */
        struct hlist_node        cpuhp_online;
        /** @cpuhp_dead: List to store request if some CPU dies. */
        struct hlist_node        cpuhp_dead;
        /** @kobj: Kernel object for sysfs. */
        struct kobject                kobj;

        /** @poll_considered: Count times blk_poll() was called. */
        unsigned long                poll_considered;
        /** @poll_invoked: Count how many requests blk_poll() polled. */
        unsigned long                poll_invoked;
        /** @poll_success: Count how many polled requests were completed. */
        unsigned long                poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
         * as cpu<cpu_number>.
         */
        struct dentry                *debugfs_dir;
        /** @sched_debugfs_dir: debugfs directory for the scheduler. */
        struct dentry                *sched_debugfs_dir;
#endif

        /**
         * @hctx_list: if this hctx is not in use, this is an entry in
         * q->unused_hctx_list.
         */
        struct list_head        hctx_list;

        /**
         * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
         * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
         * blk_mq_hw_ctx_size(). Declared as a flexible array member, so it
         * adds no storage to sizeof(struct blk_mq_hw_ctx) by itself.
         */
        struct srcu_struct        srcu[];
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *        with nr_cpu_ids elements. Each element has a value in the range
 *        [@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *        driver to map each hardware queue type (enum hctx_type) onto a distinct
 *        set of hardware queues.
 *
 * @mq_map is only a pointer here; the array it refers to is allocated and
 * owned outside this structure definition.
 */
struct blk_mq_queue_map {
        unsigned int *mq_map;
        unsigned int nr_queues;
        unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:        All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:        Just for READ I/O.
 * @HCTX_TYPE_POLL:        Polled I/O of any kind.
 * @HCTX_MAX_TYPES:        Number of types of hctx. Must remain the last
 *                        enumerator so that it equals the count of the
 *                        types listed before it.
 */
enum hctx_type {
        HCTX_TYPE_DEFAULT,
        HCTX_TYPE_READ,
        HCTX_TYPE_POLL,

        HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:           One or more ctx -> hctx mappings. One map exists for each
 *                   hardware queue type (enum hctx_type) that the driver wishes
 *                   to support. There are no restrictions on maps being of the
 *                   same size, and it's perfectly legal to share maps between
 *                   types. The array always has HCTX_MAX_TYPES slots.
 * @nr_maps:           Number of elements in the @map array. A number in the range
 *                   [1, HCTX_MAX_TYPES].
 * @ops:           Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *                   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *                   allocations.
 * @cmd_size:           Number of additional bytes to allocate per request. The block
 *                   driver owns these additional bytes.
 * @numa_node:           NUMA node the storage adapter has been connected to.
 * @timeout:           Request processing timeout in jiffies.
 * @flags:           Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *                   tag set.
 * @active_queues_shared_sbitmap:
 *                   Number of active request queues per tag set.
 * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
 * @__breserved_tags:
 *                   A shared reserved tags sbitmap, used over all hctx's
 * @tags:           Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *                   elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:           List of the request queues that use this tag set. See also
 *                   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
        struct blk_mq_queue_map        map[HCTX_MAX_TYPES];
        unsigned int                nr_maps;
        const struct blk_mq_ops        *ops;
        unsigned int                nr_hw_queues;
        unsigned int                queue_depth;
        unsigned int                reserved_tags;
        unsigned int                cmd_size;
        int                        numa_node;
        unsigned int                timeout;
        unsigned int                flags;
        void                        *driver_data;
        atomic_t                active_queues_shared_sbitmap;

        struct sbitmap_queue        __bitmap_tags;
        struct sbitmap_queue        __breserved_tags;
        struct blk_mq_tags        **tags;

        struct mutex                tag_list_lock;
        struct list_head        tag_list;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * This is the argument bundle handed to the &blk_mq_ops.queue_rq callback.
 *
 * @rq:   Request pointer.
 * @last: True if this is the last request in the queue. A driver that uses
 *        this hint to decide when to submit requests to hardware must also
 *        implement &blk_mq_ops.commit_rqs.
 */
struct blk_mq_queue_data {
        struct request *rq;
        bool last;
};

/*
 * Iterator callback types.
 *
 * busy_iter_fn receives a hardware queue context, a request, an opaque
 * pointer and a bool. busy_tag_iter_fn is the same without the hardware
 * queue context and is the callback type taken by blk_mq_tagset_busy_iter(),
 * whose 'priv' argument is presumably what arrives as the void pointer.
 *
 * NOTE(review): the meaning of the bool argument and of the bool return
 * value is defined by the iterator implementations, which are not part of
 * this header - confirm there before relying on them.
 */
typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
                bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 *
 * A driver points &blk_mq_tag_set.ops at an instance of this structure.
 * Several call sites in this header test a callback for NULL before calling
 * it (see blk_mq_cleanup_rq()), so not every member has to be provided.
 */
struct blk_mq_ops {
        /**
         * @queue_rq: Queue a new request from block IO. The request and the
         * "last" hint are passed in a struct blk_mq_queue_data; the result
         * is reported as a blk_status_t.
         */
        blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
                                 const struct blk_mq_queue_data *);

        /**
         * @commit_rqs: If a driver uses bd->last to judge when to submit
         * requests to hardware, it must define this function. In case of errors
         * that make us stop issuing further requests, this hook serves the
         * purpose of kicking the hardware (which the last request otherwise
         * would have done).
         */
        void (*commit_rqs)(struct blk_mq_hw_ctx *);

        /**
         * @get_budget: Reserve budget before queueing a request. Once
         * .queue_rq has run, it is the driver's responsibility to release
         * the reserved budget. The failure case of .get_budget also has to
         * be handled, to avoid an I/O deadlock.
         */
        bool (*get_budget)(struct request_queue *);

        /**
         * @put_budget: Release the reserved budget.
         */
        void (*put_budget)(struct request_queue *);

        /**
         * @timeout: Called on request timeout. Returns an
         * enum blk_eh_timer_return value.
         *
         * NOTE(review): the meaning of the bool argument is not visible in
         * this header - confirm against the caller.
         */
        enum blk_eh_timer_return (*timeout)(struct request *, bool);

        /**
         * @poll: Called to poll for completion of a specific tag.
         *
         * NOTE(review): the callback is only handed the hardware queue
         * context; no tag is passed in, so the wording above may be stale.
         */
        int (*poll)(struct blk_mq_hw_ctx *);

        /**
         * @complete: Mark the request as complete.
         */
        void (*complete)(struct request *);

        /**
         * @init_hctx: Called when the block layer side of a hardware queue has
         * been set up, allowing the driver to allocate/init matching
         * structures.
         */
        int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
        /**
         * @exit_hctx: Counterpart of @init_hctx, called on exit/teardown of
         * a hardware queue.
         */
        void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

        /**
         * @init_request: Called for every command allocated by the block layer
         * to allow the driver to set up driver specific data.
         *
         * Tag greater than or equal to queue_depth is for setting up
         * flush request.
         */
        int (*init_request)(struct blk_mq_tag_set *set, struct request *,
                            unsigned int, unsigned int);
        /**
         * @exit_request: Counterpart of @init_request, called on
         * exit/teardown of a command.
         */
        void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
                             unsigned int);

        /**
         * @initialize_rq_fn: Called from inside blk_get_request().
         */
        void (*initialize_rq_fn)(struct request *rq);

        /**
         * @cleanup_rq: Called before freeing one request which isn't completed
         * yet, and usually for freeing the driver private data. Optional:
         * blk_mq_cleanup_rq() skips the call when this is NULL.
         */
        void (*cleanup_rq)(struct request *);

        /**
         * @busy: If set, returns whether or not this queue currently is busy.
         */
        bool (*busy)(struct request_queue *);

        /**
         * @map_queues: This allows drivers to specify their own queue mapping
         * by overriding the setup-time function that builds the mq_map.
         */
        int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @show_rq: Used by the debugfs implementation to show driver-specific
         * information about a request. Only present when
         * CONFIG_BLK_DEBUG_FS is enabled.
         */
        void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

/*
 * blk-mq flag and state constants.
 *
 * BLK_MQ_F_* values are bit masks for a tag set's flags word (see the
 * "Zero or more BLK_MQ_F_* flags" member of struct blk_mq_tag_set). Bit 4 is
 * not assigned here. The tag allocation policy is not a single flag: it
 * occupies BLK_MQ_F_ALLOC_POLICY_BITS bits starting at bit
 * BLK_MQ_F_ALLOC_POLICY_START_BIT and is packed/unpacked with the two helper
 * macros below.
 *
 * BLK_MQ_S_* values are small integers rather than masks.
 * NOTE(review): presumably bit numbers of a hardware queue state word -
 * confirm against their users.
 */
enum {
        BLK_MQ_F_SHOULD_MERGE        = 1 << 0,
        BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
        /*
         * Set when this device requires underlying blk-mq device for
         * completing IO:
         */
        BLK_MQ_F_STACKING        = 1 << 2,
        BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
        BLK_MQ_F_BLOCKING        = 1 << 5,
        BLK_MQ_F_NO_SCHED        = 1 << 6,
        BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
        BLK_MQ_F_ALLOC_POLICY_BITS = 1,

        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE        = 1,
        BLK_MQ_S_SCHED_RESTART        = 2,

        /* hw queue is inactive after all its CPUs become offline */
        BLK_MQ_S_INACTIVE        = 3,

        BLK_MQ_MAX_DEPTH        = 10240,

        BLK_MQ_CPU_WORK_BATCH        = 8,
};

/*
 * Extract the allocation policy field from a BLK_MQ_F_* flags word.
 *
 * The argument is parenthesized so that compound expressions such as
 * "a | b" are shifted as a whole instead of being split by operator
 * precedence.
 */
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
        (((flags) >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
                ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))

/*
 * Encode an allocation policy into its position in a BLK_MQ_F_* flags word.
 * Bits of @policy that do not fit in BLK_MQ_F_ALLOC_POLICY_BITS are dropped.
 * The argument is parenthesized for the same reason as above.
 */
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
        (((policy) & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
                << BLK_MQ_F_ALLOC_POLICY_START_BIT)

/*
 * Request queue and tag set management entry points. These are declarations
 * only; the implementations live outside this header.
 *
 * All four blk_mq_init_*() variants take the tag set and return a
 * struct request_queue pointer. They differ in the extra arguments:
 * opaque queue data, a caller-supplied queue plus an elevator_init flag, or
 * an ops table with a queue depth and set flags.
 * NOTE(review): how failure is reported (NULL vs. error pointer) is not
 * visible here - check the implementation before testing the result.
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
                void *queuedata);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                                                  struct request_queue *q,
                                                  bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
                                                const struct blk_mq_ops *ops,
                                                unsigned int queue_depth,
                                                unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

/* Tag set allocation and release; the allocator returns an int status. */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

/*
 * Request allocation flags, typed as blk_mq_req_flags_t. This is the type
 * of the 'flags' argument of blk_mq_alloc_request() and
 * blk_mq_alloc_request_hctx(). Each value is a distinct single bit.
 */
enum {
        /* return when out of requests */
        BLK_MQ_REQ_NOWAIT        = (__force blk_mq_req_flags_t)(1 << 0),
        /* allocate from reserved pool */
        BLK_MQ_REQ_RESERVED        = (__force blk_mq_req_flags_t)(1 << 1),
        /* set RQF_PM */
        BLK_MQ_REQ_PM                = (__force blk_mq_req_flags_t)(1 << 2),
};

/*
 * Request allocation and lookup (declarations only).
 *
 * Both allocators take the queue, an operation code and BLK_MQ_REQ_* flags;
 * the _hctx variant additionally takes a hardware queue index.
 * blk_mq_tag_to_rq() maps a tag within a struct blk_mq_tags to a request
 * pointer.
 * NOTE(review): the failure convention of the returned pointers is not
 * visible in this header - confirm in the implementation.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                unsigned int op, blk_mq_req_flags_t flags,
                unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

/*
 * Layout of the 32-bit "unique tag" returned by blk_mq_unique_tag(): the low
 * BLK_MQ_UNIQUE_TAG_BITS (16) bits are selected by BLK_MQ_UNIQUE_TAG_MASK and
 * decoded by blk_mq_unique_tag_to_tag(); the bits above them are decoded by
 * blk_mq_unique_tag_to_hwq().
 */
enum {
        BLK_MQ_UNIQUE_TAG_BITS = 16,
        BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

/*
 * Return the part of @unique_tag stored above the low
 * BLK_MQ_UNIQUE_TAG_BITS bits, i.e. the "hwq" half of the unique tag.
 */
static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
        u32 hwq = unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;

        return hwq;
}

/*
 * Return the low BLK_MQ_UNIQUE_TAG_BITS bits of @unique_tag, i.e. the "tag"
 * half of the unique tag.
 */
static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
        u32 tag = unique_tag & BLK_MQ_UNIQUE_TAG_MASK;

        return tag;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 *
 * The state is fetched with READ_ONCE(), so each call performs a single
 * read of @rq->state. The value is a snapshot taken at the time of the call.
 *
 * Return: the value of @rq->state as an enum mq_rq_state.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
        return READ_ONCE(rq->state);
}

/*
 * Report whether @rq has left the idle state.
 *
 * Returns 1 when blk_mq_rq_state() yields anything other than MQ_RQ_IDLE,
 * 0 otherwise.
 */
static inline int blk_mq_request_started(struct request *rq)
{
        if (blk_mq_rq_state(rq) == MQ_RQ_IDLE)
                return 0;

        return 1;
}

/*
 * Report whether @rq is in the completed state.
 *
 * Returns 1 when blk_mq_rq_state() yields MQ_RQ_COMPLETE, 0 otherwise.
 */
static inline int blk_mq_request_completed(struct request *rq)
{
        if (blk_mq_rq_state(rq) != MQ_RQ_COMPLETE)
                return 0;

        return 1;
}

/*
 * Request and queue control entry points. These are declarations only; the
 * implementations (and their locking/context rules) live outside this
 * header.
 */

/* Per-request start/end; the end variants take the blk_status_t to report. */
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

/*
 * Requeue, completion and hardware queue stop/start/run helpers.
 * Variants with a 'msecs' parameter take a delay in milliseconds; variants
 * with an 'async' parameter take a bool selecting asynchronous operation.
 * The plural *_hw_queues() forms take the request queue, the singular forms
 * a single hardware queue context.
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
/* Calls @fn (a busy_tag_iter_fn) with @priv as its opaque argument. */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
/*
 * Queue freezing. Only the _wait_timeout() variant takes a timeout and
 * returns a value.
 * NOTE(review): the unit of 'timeout' and the meaning of the int result are
 * not visible here - confirm in the implementation.
 */
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);

/* Queue mapping and hardware queue count updates. */
int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);

/*
 * Decide whether a timeout should be faked for @q.
 *
 * Returns false unless CONFIG_FAIL_IO_TIMEOUT is enabled and
 * QUEUE_FLAG_FAIL_IO is set in @q->queue_flags; in that case the decision is
 * delegated to __blk_should_fake_timeout(). The config check comes first, so
 * the flag is not tested when the option is disabled.
 */
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
        if (!IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT))
                return false;

        if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
                return false;

        return __blk_should_fake_timeout(q);
}

/**
 * blk_mq_rq_from_pdu - convert a PDU pointer back to its request
 * @pdu: the PDU (Protocol Data Unit) to convert
 *
 * The driver command data sits directly behind the request in memory, so the
 * request starts sizeof(struct request) bytes before @pdu. This is the
 * inverse of blk_mq_rq_to_pdu().
 *
 * Return: pointer to the request that precedes @pdu
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
        char *pdu_bytes = pdu;

        /* Step back over the request structure, counted in bytes. */
        return (struct request *)(pdu_bytes - sizeof(struct request));
}

/**
 * blk_mq_rq_to_pdu - convert a request pointer to its PDU
 * @rq: the request to convert
 *
 * The driver command data sits directly behind the request in memory, so the
 * PDU is the first byte past the end of @rq. This is the inverse of
 * blk_mq_rq_from_pdu().
 *
 * Return: pointer to the PDU
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
        void *pdu = rq + 1;

        return pdu;
}

/*
 * queue_for_each_hw_ctx - iterate over the hardware contexts of a queue
 * @q:    request queue; (q)->nr_hw_queues bounds the loop
 * @hctx: lvalue that receives (q)->queue_hw_ctx[i] on each iteration
 * @i:    lvalue used as the loop counter, starting at 0
 *
 * The assignment to @hctx is done inside a statement expression that always
 * evaluates to 1 and sits on the right of "&&", so it is only performed while
 * @i is in range; no element past the end is read. @q and @i are evaluated
 * on every iteration.
 */
#define queue_for_each_hw_ctx(q, hctx, i)                                \
        for ((i) = 0; (i) < (q)->nr_hw_queues &&                        \
             ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

/*
 * hctx_for_each_ctx - iterate over the contexts attached to a hardware context
 * @hctx: hardware context; (hctx)->nr_ctx bounds the loop
 * @ctx:  lvalue that receives (hctx)->ctxs[i] on each iteration
 * @i:    lvalue used as the loop counter, starting at 0
 *
 * Same construction as queue_for_each_hw_ctx(): the assignment only happens
 * while @i is in range.
 */
#define hctx_for_each_ctx(hctx, ctx, i)                                        \
        for ((i) = 0; (i) < (hctx)->nr_ctx &&                                \
             ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

/*
 * Build a blk_qc_t cookie for @rq on hardware queue @hctx.
 *
 * The cookie carries a tag in its low bits and @hctx->queue_num shifted up
 * by BLK_QC_T_SHIFT. When @rq->tag is -1 the internal tag is used instead
 * and BLK_QC_T_INTERNAL is OR-ed in to mark that.
 */
static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
                struct request *rq)
{
        if (rq->tag == -1)
                return rq->internal_tag |
                       (hctx->queue_num << BLK_QC_T_SHIFT) |
                       BLK_QC_T_INTERNAL;

        return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
}

static inline void blk_mq_cleanup_rq(struct request *rq)
{
        if (rq->q->mq_ops->cleanup_rq)
                rq->q->mq_ops->cleanup_rq(rq);
}

/* Takes a bio and returns a blk_qc_t cookie; declaration only. */
blk_qc_t blk_mq_submit_bio(struct bio *bio);

#endif













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 







































    1 


































































































































































































































































































































































































































































































    1 






















































































































































    1 


    1 
    1 













    1 


    1 













































    1 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002        Andrew Morton
 *                Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 *
 * Expressed in jiffies (HZ/5). The max() keeps the value at one jiffy or
 * more when HZ/5 would round down to zero.
 */
#define MAX_PAUSE                max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 *
 * The value is 128 shifted right by (PAGE_SHIFT - 10), i.e. 128KB worth of
 * pages: 32 pages when PAGE_SHIFT is 12.
 */
#define DIRTY_POLL_THRESH        (128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 *
 * Expressed in jiffies, with the same one-jiffy floor as MAX_PAUSE.
 */
#define BANDWIDTH_INTERVAL        max(HZ/5, 1)

/*
 * Shift count (a scale factor of 2^10) used by the ratelimit calculations.
 * NOTE(review): the users are outside this excerpt - confirm exact usage.
 */
#define RATELIMIT_CALC_SHIFT        10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 *
 * File-local; 32 is only the initial value.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

/*
 * The system-wide writeback domain.  Global dirty_throttle_control
 * instances refer to it (see GDTC_INIT() and dtc_dom()).
 */
struct wb_domain global_wb_domain;

/*
 * Consolidated parameters for balance_dirty_pages() and its subroutines.
 * One instance describes the dirty state and limits of a single writeback
 * domain, plus the per-wb counterparts for the bdi_writeback being throttled.
 */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
        /* writeback domain this dtc belongs to */
        struct wb_domain        *dom;
        struct dirty_throttle_control *gdtc;        /* only set in memcg dtc's */
#endif
        /* the bdi_writeback being considered (unset by GDTC_INIT_NO_WB) */
        struct bdi_writeback        *wb;
        /* @wb's completion counter matching this domain */
        struct fprop_local_percpu *wb_completions;

        unsigned long                avail;                /* dirtyable */
        unsigned long                dirty;                /* file_dirty + write + nfs */
        unsigned long                thresh;                /* dirty threshold */
        unsigned long                bg_thresh;        /* dirty background threshold */

        unsigned long                wb_dirty;        /* per-wb counterparts */
        unsigned long                wb_thresh;
        unsigned long                wb_bg_thresh;

        /*
         * Result of wb_position_ratio(), in fixed point where
         * 1.0 == 1 << RATELIMIT_CALC_SHIFT.
         */
        unsigned long                pos_ratio;
};

/*
 * Length of the period used for aging the writeout fractions of bdis,
 * expressed in jiffies (3*HZ).  This is an arbitrarily chosen number.
 * The longer the period, the more slowly the fractions will reflect
 * changes in the current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Designated-initializer fragments for struct dirty_throttle_control.
 *
 * GDTC_INIT(wb):        global dtc for @wb; uses global_wb_domain and the
 *                       wb's ->completions counter.
 * GDTC_INIT_NO_WB:      global dtc that is not tied to any wb.
 * MDTC_INIT(wb, gdtc):  memcg dtc for @wb; uses mem_cgroup_wb_domain(wb),
 *                       the wb's ->memcg_completions counter and links back
 *                       to the global dtc @gdtc.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                                \
                                .dom = &global_wb_domain,                \
                                .wb_completions = &(__wb)->completions

#define GDTC_INIT_NO_WB                .dom = &global_wb_domain

#define MDTC_INIT(__wb, __gdtc)        .wb = (__wb),                                \
                                .dom = mem_cgroup_wb_domain(__wb),        \
                                .wb_completions = &(__wb)->memcg_completions, \
                                .gdtc = __gdtc

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return dtc->dom;
}

/* Return the writeback domain recorded in @dtc. */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        return dtc->dom;
}

/*
 * Return the global dtc linked from @mdtc.  The ->gdtc field is only set in
 * memcg dtc's, so this is NULL for a dtc initialized without it.
 */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        return mdtc->gdtc;
}

/* Return @wb's completion counter for its memcg writeback domain. */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        return &wb->memcg_completions;
}

/*
 * Compute @wb's share of its bdi's min_ratio and max_ratio.
 *
 * When @wb's average write bandwidth is smaller than the bdi's total write
 * bandwidth, a non-zero min_ratio and a max_ratio below 100 are scaled by
 * this_bw / tot_bw.  Otherwise the bdi's values are passed through as-is.
 * The results are stored in *@minp and *@maxp.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        unsigned long wb_bw = wb->avg_write_bandwidth;
        unsigned long bdi_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
        unsigned long long lo = wb->bdi->min_ratio;
        unsigned long long hi = wb->bdi->max_ratio;

        /*
         * @wb may already be clean by the time control reaches here and
         * the total may not include its bw, hence the strict comparison.
         */
        if (wb_bw < bdi_bw) {
                if (lo)
                        lo = div64_ul(lo * wb_bw, bdi_bw);
                if (hi < 100)
                        hi = div64_ul(hi * wb_bw, bdi_bw);
        }

        *minp = lo;
        *maxp = hi;
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Without cgroup writeback there is no ->dom or ->gdtc field: GDTC_INIT()
 * only sets the wb and its completion counter, while GDTC_INIT_NO_WB and
 * MDTC_INIT() expand to nothing.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                           \
                                .wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

/* Without cgroup writeback there are no memcg dtc's, so none is ever valid. */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return false;
}

/* Without cgroup writeback every dtc belongs to the global domain. */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        return &global_wb_domain;
}

/* Without cgroup writeback no dtc has an associated global dtc. */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        return NULL;
}

/* Without cgroup writeback there is no memcg completion counter. */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        return NULL;
}

/*
 * Without cgroup writeback, report the bdi's min_ratio and max_ratio
 * unscaled through *@minp and *@maxp.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        *minp = wb->bdi->min_ratio;
        *maxp = wb->bdi->max_ratio;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value to which the
 * user-configurable dirty ratio is applied to obtain the effective
 * number of pages that are allowed to be dirtied, either per
 * individual zone or globally by using the sum of dirtyable pages
 * over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
        unsigned long nr_pages = 0;
        int z;

        for (z = 0; z < MAX_NR_ZONES; z++) {
                struct zone *zone = pgdat->node_zones + z;

                if (!populated_zone(zone))
                        continue;

                nr_pages += zone_page_state(zone, NR_FREE_PAGES);
        }

        /*
         * Pages reserved for the kernel should not be considered
         * dirtyable, to prevent a situation where reclaim has to
         * clean pages in order to balance the zones.
         */
        nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

        nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
        nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

        return nr_pages;
}

/*
 * Count the dirtyable pages that live in highmem zones.
 *
 * For every populated highmem zone on nodes with high memory, take the free
 * pages above the zone's high watermark plus the zone's inactive and active
 * file pages.  The sum is capped at @total.  Always returns 0 when
 * CONFIG_HIGHMEM is not set.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
        unsigned long highmem_pages = 0;
        int nid;
        int zidx;

        for_each_node_state(nid, N_HIGH_MEMORY) {
                for (zidx = ZONE_NORMAL + 1; zidx < MAX_NR_ZONES; zidx++) {
                        struct zone *zone = &NODE_DATA(nid)->node_zones[zidx];
                        unsigned long free;

                        if (!is_highmem_idx(zidx) || !populated_zone(zone))
                                continue;

                        free = zone_page_state(zone, NR_FREE_PAGES);
                        /* clamp so the subtraction cannot underflow */
                        free -= min(free, high_wmark_pages(zone));

                        highmem_pages += free +
                                zone_page_state(zone, NR_ZONE_INACTIVE_FILE) +
                                zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
                }
        }

        /*
         * Unreclaimable memory (kernel memory or anonymous memory
         * without swap) can bring down the dirtyable pages below
         * the zone's dirty balance reserve and the above calculation
         * will underflow.  However we still want to add in nodes
         * which are below threshold (negative values) to get a more
         * accurate calculation but make sure that the total never
         * underflows.
         */
        if ((long)highmem_pages < 0)
                highmem_pages = 0;

        /*
         * Never report more highmem pages than the total dirtyable
         * memory.  That should only be possible in very strange VM
         * situations, but guard against it anyway.
         */
        return min(highmem_pages, total);
#else
        return 0;
#endif
}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Free pages minus the kernel reserve, plus inactive and active file
 * pages.  Dirtyable highmem is subtracted again unless
 * vm_highmem_is_dirtyable is set.
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 * The result is never 0.
 */
static unsigned long global_dirtyable_memory(void)
{
        unsigned long free = global_zone_page_state(NR_FREE_PAGES);
        unsigned long dirtyable;

        /*
         * Pages reserved for the kernel should not be considered
         * dirtyable, to prevent a situation where reclaim has to
         * clean pages in order to balance the zones.
         */
        dirtyable = free - min(free, totalreserve_pages);

        dirtyable += global_node_page_state(NR_INACTIVE_FILE);
        dirtyable += global_node_page_state(NR_ACTIVE_FILE);

        if (!vm_highmem_is_dirtyable)
                dirtyable -= highmem_dirtyable_memory(dirtyable);

        /* The extra page ensures that we never return 0 */
        return dirtyable + 1;
}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for real-time tasks.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
        const unsigned long available_memory = dtc->avail;
        struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
        unsigned long bytes = vm_dirty_bytes;
        unsigned long bg_bytes = dirty_background_bytes;
        /* convert ratios to per-PAGE_SIZE for higher precision */
        unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
        unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
        unsigned long thresh;
        unsigned long bg_thresh;
        struct task_struct *tsk;

        /* gdtc is !NULL iff @dtc is for memcg domain */
        if (gdtc) {
                unsigned long global_avail = gdtc->avail;

                /*
                 * The byte settings can't be applied directly to memcg
                 * domains.  Convert them to ratios by scaling against
                 * globally available memory.  As the ratios are in
                 * per-PAGE_SIZE, they can be obtained by dividing bytes by
                 * number of pages.
                 */
                if (bytes)
                        ratio = min(DIV_ROUND_UP(bytes, global_avail),
                                    PAGE_SIZE);
                if (bg_bytes)
                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                                       PAGE_SIZE);
                bytes = bg_bytes = 0;
        }

        if (bytes)
                thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
                thresh = (ratio * available_memory) / PAGE_SIZE;

        if (bg_bytes)
                bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
                bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

        tsk = current;
        if (rt_task(tsk)) {
                bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
                thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
        }
        /*
         * Dirty throttling logic assumes the limits in page units fit into
         * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
         */
        if (thresh > UINT_MAX)
                thresh = UINT_MAX;
        /* This makes sure bg_thresh is within 32-bits as well */
        if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;
        dtc->thresh = thresh;
        dtc->bg_thresh = bg_thresh;

        /* we should eventually report the domain in the TP */
        if (!gdtc)
                trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

        gdtc.avail = global_dirtyable_memory();
        domain_dirty_limits(&gdtc);

        *pbackground = gdtc.bg_thresh;
        *pdirty = gdtc.thresh;
}

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * With vm_dirty_bytes set, the global byte limit (in pages) is scaled by
 * the node's share of global dirtyable memory; otherwise vm_dirty_ratio
 * percent of the node's dirtyable memory is used.  Real-time tasks get
 * a quarter more.
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
        unsigned long node_memory = node_dirtyable_memory(pgdat);
        unsigned long limit;

        if (!vm_dirty_bytes) {
                limit = vm_dirty_ratio * node_memory / 100;
        } else {
                limit = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
                        node_memory / global_dirtyable_memory();
        }

        if (rt_task(current))
                limit += limit / 4;

        /*
         * Dirty throttling logic assumes the limits in page units fit into
         * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
         */
        return min_t(unsigned long, limit, UINT_MAX);
}

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Compares the node's dirty plus writeback pages against
 * node_dirty_limit().
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
        unsigned long limit = node_dirty_limit(pgdat);
        unsigned long dirty;

        dirty = node_page_state(pgdat, NR_FILE_DIRTY);
        dirty += node_page_state(pgdat, NR_WRITEBACK);

        return !(dirty > limit);
}

/*
 * sysctl handler for dirty_background_ratio.  A successful write disables
 * the byte-based background limit by zeroing dirty_background_bytes.
 * Returns the result of proc_dointvec_minmax().
 */
int dirty_background_ratio_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (err || !write)
                return err;

        dirty_background_bytes = 0;
        return 0;
}

/*
 * sysctl handler for dirty_background_bytes.  On a successful write the new
 * value is rejected with -ERANGE (and the old one restored) when it would
 * exceed UINT_MAX pages; otherwise dirty_background_ratio is zeroed so that
 * the byte limit takes effect.
 */
int dirty_background_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long saved_bytes = dirty_background_bytes;
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (err || !write)
                return err;

        if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > UINT_MAX) {
                dirty_background_bytes = saved_bytes;
                return -ERANGE;
        }

        dirty_background_ratio = 0;
        return 0;
}

/*
 * sysctl handler for vm_dirty_ratio.  When a successful write actually
 * changes the ratio, the byte-based limit is disabled by zeroing
 * vm_dirty_bytes and writeback_set_ratelimit() is called.
 * Returns the result of proc_dointvec_minmax().
 */
int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int prev_ratio = vm_dirty_ratio;
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (err != 0 || !write || vm_dirty_ratio == prev_ratio)
                return err;

        vm_dirty_bytes = 0;
        writeback_set_ratelimit();
        return 0;
}

/*
 * sysctl handler for vm_dirty_bytes.  When a successful write changes the
 * value, it is rejected with -ERANGE (and the old value restored) if it
 * would exceed UINT_MAX pages.  Otherwise writeback_set_ratelimit() is
 * called and vm_dirty_ratio is zeroed so that the byte limit takes effect.
 */
int dirty_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long saved_bytes = vm_dirty_bytes;
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (err != 0 || !write || vm_dirty_bytes == saved_bytes)
                return err;

        if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) {
                vm_dirty_bytes = saved_bytes;
                return -ERANGE;
        }

        writeback_set_ratelimit();
        vm_dirty_ratio = 0;
        return 0;
}

/*
 * Return the time one completions period after @cur_time.  The value 0 has
 * a special meaning for the caller, so 1 is returned in its place.
 */
static unsigned long wp_next_time(unsigned long cur_time)
{
        unsigned long next = cur_time + VM_COMPLETIONS_PERIOD_LEN;

        return next ? next : 1;
}

/*
 * Account one writeout completion in @dom for the local counter
 * @completions, limited by @max_prop_frac, and arm the domain's period
 * timer if it is currently not running (dom->period_time == 0).
 */
static void wb_domain_writeout_inc(struct wb_domain *dom,
                                   struct fprop_local_percpu *completions,
                                   unsigned int max_prop_frac)
{
        __fprop_inc_percpu_max(&dom->completions, completions,
                               max_prop_frac);

        /* Period switching is already active: nothing more to do. */
        if (likely(dom->period_time))
                return;

        /*
         * First event after period switching was turned off.  We can race
         * with other callers here but it does not cause any harm since the
         * resulting time when the timer will fire and what ends up in
         * dom->period_time will be roughly the same.
         */
        dom->period_time = wp_next_time(jiffies);
        mod_timer(&dom->period_timer, dom->period_time);
}

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count.  If @wb has a memcg writeback domain, the completion
 * is accounted there as well.  Called from test_clear_page_writeback().
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
        struct wb_domain *memcg_dom;

        inc_wb_stat(wb, WB_WRITTEN);
        wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
                               wb->bdi->max_prop_frac);

        memcg_dom = mem_cgroup_wb_domain(wb);
        if (!memcg_dom)
                return;

        wb_domain_writeout_inc(memcg_dom, wb_memcg_completions(wb),
                               wb->bdi->max_prop_frac);
}

/*
 * Exported wrapper around __wb_writeout_inc() that performs the update
 * with local interrupts disabled.
 */
void wb_writeout_inc(struct bdi_writeback *wb)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __wb_writeout_inc(wb);
        local_irq_restore(irq_flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * Period timer callback: age the domain's completion fractions.
 *
 * On an idle system we can be called long after we were scheduled because
 * the timer is deferrable, so account for the periods that were missed.
 */
static void writeout_period(struct timer_list *t)
{
        struct wb_domain *dom = from_timer(dom, t, period_timer);
        int missed = (jiffies - dom->period_time) /
                                                 VM_COMPLETIONS_PERIOD_LEN;

        if (!fprop_new_period(&dom->completions, missed + 1)) {
                /*
                 * Aging has zeroed all fractions. Stop wasting CPU on period
                 * updates.
                 */
                dom->period_time = 0;
                return;
        }

        /* Schedule the next period relative to the last one accounted. */
        dom->period_time = wp_next_time(dom->period_time +
                        missed * VM_COMPLETIONS_PERIOD_LEN);
        mod_timer(&dom->period_timer, dom->period_time);
}

/*
 * Initialize the writeback domain @dom: zero it, set up its lock and its
 * deferrable period timer (handled by writeout_period()), stamp
 * dirty_limit_tstamp with the current jiffies and initialize the
 * completions proportion using @gfp.
 *
 * Returns the result of fprop_global_init().
 */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
        memset(dom, 0, sizeof(*dom));

        spin_lock_init(&dom->lock);

        timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);

        dom->dirty_limit_tstamp = jiffies;

        return fprop_global_init(&dom->completions, gfp);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Tear down a writeback domain set up by wb_domain_init(): synchronously
 * delete its period timer, then destroy its completions proportion.
 * Only built with cgroup writeback support.
 */
void wb_domain_exit(struct wb_domain *dom)
{
        del_timer_sync(&dom->period_timer);
        fprop_global_destroy(&dom->completions);
}
#endif

/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, can not
 * exceed 100%.  It is updated by bdi_set_min_ratio() under bdi_lock.
 */
static unsigned int bdi_min_ratio;

/*
 * Set @bdi's minimum dirty share to @min_ratio percent.
 *
 * Fails with -EINVAL when @min_ratio exceeds the bdi's max_ratio or when
 * the sum of all minimum shares would reach 100.  Returns 0 on success.
 */
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        unsigned int delta;
        int ret = -EINVAL;

        spin_lock_bh(&bdi_lock);

        if (min_ratio > bdi->max_ratio)
                goto unlock;

        /*
         * Change relative to the current setting.  This is unsigned
         * arithmetic: lowering the ratio wraps around, and the additions
         * below wrap back to the intended values.
         */
        delta = min_ratio - bdi->min_ratio;
        if (bdi_min_ratio + delta >= 100)
                goto unlock;

        bdi_min_ratio += delta;
        bdi->min_ratio += delta;
        ret = 0;

unlock:
        spin_unlock_bh(&bdi_lock);

        return ret;
}

/*
 * Set @bdi's maximum dirty share to @max_ratio percent and update
 * max_prop_frac to the matching fraction of FPROP_FRAC_BASE.
 *
 * Fails with -EINVAL when @max_ratio is above 100 or below the bdi's
 * current min_ratio.  Returns 0 on success.
 */
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
        int ret = 0;

        if (max_ratio > 100)
                return -EINVAL;

        spin_lock_bh(&bdi_lock);
        if (max_ratio >= bdi->min_ratio) {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
        } else {
                ret = -EINVAL;
        }
        spin_unlock_bh(&bdi_lock);

        return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);

/*
 * Return the midpoint between the dirty threshold @thresh and the
 * background threshold @bg_thresh.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                           unsigned long bg_thresh)
{
        unsigned long sum = thresh + bg_thresh;

        return sum >> 1;
}

/*
 * Return the hard dirty limit for @dom: the larger of the given @thresh and
 * the domain's recorded dirty_limit.
 */
static unsigned long hard_dirty_limit(struct wb_domain *dom,
                                      unsigned long thresh)
{
        return max(thresh, dom->dirty_limit);
}

/*
 * Compute @mdtc->avail for a memcg domain.
 *
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 * The result is @filepages plus the smaller of @headroom and that amount.
 * Every subtraction is clamped so that it cannot underflow.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
                            unsigned long filepages, unsigned long headroom)
{
        struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
        unsigned long memcg_clean;
        unsigned long sys_clean;
        unsigned long spare_clean;

        memcg_clean = filepages - min(filepages, mdtc->dirty);
        sys_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
        spare_clean = sys_clean - min(sys_clean, memcg_clean);

        mdtc->avail = filepages + min(headroom, spare_clean);
}

/**
 * __wb_calc_thresh - @wb's share of dirty throttling threshold
 * @dtc: dirty_throttle_context of interest
 *
 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 * when sleeping max_pause per page is not enough to keep the dirty pages under
 * control. For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty and PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
        struct wb_domain *dom = dtc_dom(dtc);
        unsigned long thresh = dtc->thresh;
        u64 wb_thresh;
        unsigned long numerator, denominator;
        unsigned long wb_min_ratio, wb_max_ratio;

        /*
         * Calculate this BDI's share of the thresh ratio.
         */
        fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
                              &numerator, &denominator);

        wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
        wb_thresh *= numerator;
        wb_thresh = div64_ul(wb_thresh, denominator);

        wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

        wb_thresh += (thresh * wb_min_ratio) / 100;
        if (wb_thresh > (thresh * wb_max_ratio) / 100)
                wb_thresh = thresh * wb_max_ratio / 100;

        return wb_thresh;
}

unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
                                               .thresh = thresh };
        return __wb_calc_thresh(&gdtc);
}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * This is a 3rd order polynomial that is subject to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0         => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 *
 * All values are in fixed point with 1.0 == 1 << RATELIMIT_CALC_SHIFT, and
 * the result is clamped to the range [0, 2.0].
 */
static long long pos_ratio_polynom(unsigned long setpoint,
                                          unsigned long dirty,
                                          unsigned long limit)
{
        long long cubed;
        long err;

        /* (setpoint - dirty) / (limit - setpoint); "| 1" avoids dividing by 0 */
        err = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                        (limit - setpoint) | 1);

        /* Raise to the third power, rescaling after each multiplication. */
        cubed = err;
        cubed = cubed * err >> RATELIMIT_CALC_SHIFT;
        cubed = cubed * err >> RATELIMIT_CALC_SHIFT;

        /* Add the constant 1.0 term. */
        cubed += 1 << RATELIMIT_CALC_SHIFT;

        return clamp(cubed, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
        /*
         * Computes @dtc->pos_ratio from the global control line and the
         * per-wb control line.  The result is in fixed point where
         * 1.0 == 1 << RATELIMIT_CALC_SHIFT.
         */
        struct bdi_writeback *wb = dtc->wb;
        unsigned long write_bw = wb->avg_write_bandwidth;
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;                /* dirty pages' target balance point */
        unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;                /* for scaling up/down the rate limit */
        long x;

        /* pos_ratio stays 0 on every early return below */
        dtc->pos_ratio = 0;

        /* at or over the hard limit: leave pos_ratio at 0 */
        if (unlikely(dtc->dirty >= limit))
                return;

        /*
         * global setpoint
         *
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
        pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
         * such filesystems balance_dirty_pages always checks wb counters
         * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without strictlimit feature, fuse writeback may
         * consume arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
         * Here, in wb_position_ratio(), we calculate pos_ratio based on
         * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
         * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
         * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
         * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
         * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
         * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                long long wb_pos_ratio;

                /*
                 * Fewer than 8 dirty pages on this wb: use twice the
                 * global pos_ratio, capped at 2.0.
                 */
                if (dtc->wb_dirty < 8) {
                        dtc->pos_ratio = min_t(long long, pos_ratio * 2,
                                           2 << RATELIMIT_CALC_SHIFT);
                        return;
                }

                /* wb is at or over its own threshold: pos_ratio stays 0 */
                if (dtc->wb_dirty >= wb_thresh)
                        return;

                wb_setpoint = dirty_freerun_ceiling(wb_thresh,
                                                    dtc->wb_bg_thresh);

                /* degenerate wb setpoint: pos_ratio stays 0 */
                if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
                        return;

                wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
                                                 wb_thresh);

                /*
                 * Typically, for strictlimit case, wb_setpoint << setpoint
                 * and pos_ratio >> wb_pos_ratio. In the other words global
                 * state ("dirty") is not limiting factor and we have to
                 * make decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
                 * wb's) while given strictlimit wb is below limit.
                 *
                 * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
                 * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
                 * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
                dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
                return;
        }

        /*
         * We have computed basic pos_ratio above based on global situation. If
         * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */

        /*
         * wb setpoint
         *
         *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
         *                        x_intercept - wb_dirty
         *                     := --------------------------
         *                        x_intercept - wb_setpoint
         *
         * The main wb control line is a linear function that subjects to
         *
         * (1) f(wb_setpoint) = 1.0
         * (2) k = - 1 / (8 * write_bw)  (in single wb case)
         *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
         * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
         *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield in a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
         * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
         * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
        if (unlikely(wb_thresh > dtc->thresh))
                wb_thresh = dtc->thresh;
        /*
         * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but that it has remained inactive for long time.
         * Honour such devices a reasonable good (hopefully IO efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can rampup the threshold quickly.
         */
        wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
         * scale global setpoint to wb's:
         *        wb_setpoint = setpoint * wb_thresh / thresh
         *
         * x holds wb_thresh / thresh with 16 fractional bits; "| 1" keeps
         * the divisor non-zero.
         */
        x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
        wb_setpoint = setpoint * (u64)x >> 16;
        /*
         * Use span=(8*write_bw) in single wb case as indicated by
         * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
         *        wb_thresh                    thresh - wb_thresh
         * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
         *         thresh                           thresh
         */
        span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
        x_intercept = wb_setpoint + span;

        /*
         * Follow the linear wb control line until it has dropped to 1/4;
         * from there on the scale factor stays flat at 1/4.
         */
        if (dtc->wb_dirty < x_intercept - span / 4) {
                pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
                                      (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;

        /*
         * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         *
         * x_intercept is reused here as half of wb_thresh; below it,
         * pos_ratio is boosted by x_intercept / wb_dirty, at most 8 times.
         */
        x_intercept = wb_thresh / 2;
        if (dtc->wb_dirty < x_intercept) {
                if (dtc->wb_dirty > x_intercept / 8)
                        pos_ratio = div_u64(pos_ratio * x_intercept,
                                            dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }

        dtc->pos_ratio = pos_ratio;
}

/*
 * Refresh wb->write_bandwidth and its smoothed companion
 * wb->avg_write_bandwidth from the pages written since the last snapshot.
 *
 * @wb:      writeback context whose estimates are updated
 * @elapsed: jiffies covered by this sample
 * @written: current value of the written-pages counter
 *
 * When @wb has dirty IO, the change of the smoothed value is also folded
 * into wb->bdi->tot_write_bandwidth.
 */
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
                                      unsigned long elapsed,
                                      unsigned long written)
{
        const unsigned long period = roundup_pow_of_two(3 * HZ);
        unsigned long smoothed = wb->avg_write_bandwidth;
        unsigned long prev = wb->write_bandwidth;
        u64 rate;

        /*
         * rate = written * HZ / elapsed
         *
         *                   rate * elapsed + write_bandwidth * (period - elapsed)
         * write_bandwidth = -----------------------------------------------------
         *                                           period
         *
         * @written may have decreased due to account_page_redirty(), so the
         * delta against written_stamp is clamped at zero instead of being
         * allowed to wrap.
         */
        rate = written - min(written, wb->written_stamp);
        rate *= HZ;

        if (unlikely(elapsed > period)) {
                /*
                 * The sample is longer than the averaging period: the old
                 * estimate carries no weight, use the raw rate for both.
                 */
                rate = div64_ul(rate, elapsed);
                smoothed = rate;
        } else {
                rate += (u64)wb->write_bandwidth * (period - elapsed);
                rate >>= ilog2(period);

                /*
                 * Second level of smoothing to filter out sudden spikes:
                 * the average only moves 1/8 of the way towards the previous
                 * estimate, and only when the new estimate lies on the same
                 * side of it.
                 */
                if (smoothed > prev && prev >= (unsigned long)rate)
                        smoothed -= (smoothed - prev) >> 3;

                if (smoothed < prev && prev <= (unsigned long)rate)
                        smoothed += (prev - smoothed) >> 3;
        }

        /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
        smoothed = max(smoothed, 1LU);
        if (wb_has_dirty_io(wb)) {
                long delta = smoothed - wb->avg_write_bandwidth;

                WARN_ON_ONCE(atomic_long_add_return(delta,
                                        &wb->bdi->tot_write_bandwidth) <= 0);
        }
        wb->write_bandwidth = rate;
        wb->avg_write_bandwidth = smoothed;
}

/*
 * Make the domain's dirty_limit track @dtc->thresh: jump up to it at once,
 * but decay towards it gradually when it lies below the current limit.
 */
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
        struct wb_domain *dom = dtc_dom(dtc);
        unsigned long target = dtc->thresh;
        unsigned long cur = dom->dirty_limit;

        /* Raising the limit: follow up in one step. */
        if (cur < target) {
                dom->dirty_limit = target;
                return;
        }

        /*
         * Lowering the limit: follow down slowly. The higher of thresh and
         * dirty is the target, because thresh may drop below dirty. This is
         * exactly the reason to introduce dom->dirty_limit, which is
         * guaranteed to lie above the dirty pages.
         */
        target = max(target, dtc->dirty);
        if (cur > target)
                dom->dirty_limit = cur - ((cur - target) >> 5);
}

/*
 * Update the dirty limit of @dtc's domain, at most once per
 * BANDWIDTH_INTERVAL.
 *
 * @dtc: throttle control whose domain is updated
 * @now: current time in jiffies, as sampled by the caller
 */
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
                                    unsigned long now)
{
        struct wb_domain *dom = dtc_dom(dtc);

        /*
         * Lockless fast path: most calls land inside the current interval
         * and never need dom->lock.
         */
        if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                return;

        spin_lock(&dom->lock);
        /* Re-read the timestamp under the lock before acting on it. */
        if (!time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
                update_dirty_limit(dtc);
                dom->dirty_limit_tstamp = now;
        }
        spin_unlock(&dom->lock);
}

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in the long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 *
 * @dtc:     throttle control supplying the wb to update together with
 *           thresh, bg_thresh, dirty and pos_ratio (plus wb_dirty, wb_thresh
 *           and wb_bg_thresh in the strictlimit case)
 * @dirtied: current value of the wb's dirtied-pages counter; the sample is
 *           (dirtied - wb->dirtied_stamp)
 * @elapsed: jiffies covered by that sample; used as a divisor, so it must be
 *           non-zero
 *
 * Stores the results in wb->dirty_ratelimit (never below 1) and
 * wb->balanced_dirty_ratelimit.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
                                      unsigned long dirtied,
                                      unsigned long elapsed)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long dirty = dtc->dirty;
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
        unsigned long write_bw = wb->avg_write_bandwidth;
        unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
        unsigned long step;
        unsigned long x;
        unsigned long shift;

        /*
         * The dirty rate will match the writeout rate in the long term,
         * except when dirty pages are truncated by userspace or re-dirtied
         * by the FS.
         */
        dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
                                        dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */

        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
         * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
         * Note that the expanded form is not a pure rate feedback:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate)                     (1)
         * but also takes pos_ratio into account:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
         *
         * (1) is not realistic because pos_ratio also takes part in balancing
         * the dirty rate.  Consider the state
         *        pos_ratio = 0.5                                                     (3)
         *        rate = 2 * (write_bw / N)                                     (4)
         * If (1) is used, it will get stuck in that state! Because each dd
         * will be throttled at
         *        task_ratelimit = pos_ratio * rate = (write_bw / N)             (5)
         * yielding
         *        dirty_rate = N * task_ratelimit = write_bw                     (6)
         * put (6) into (1) we get
         *        rate_(i+1) = rate_(i)                                             (7)
         *
         * So we end up using (2) to always keep
         *        rate_(i+1) ~= (write_bw / N)                                     (8)
         * regardless of the value of pos_ratio. As long as (8) is satisfied,
         * pos_ratio is able to drive itself to 1.0, which is not only where
         * the dirty count meets the setpoint, but also where the slope of
         * pos_ratio is most flat and hence task_ratelimit fluctuates least.
         *
         * The "| 1" keeps the divisor non-zero when no pages were dirtied
         * during the sample.
         */
        balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
                                           dirty_rate | 1);
        /*
         * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
         */
        if (unlikely(balanced_dirty_ratelimit > write_bw))
                balanced_dirty_ratelimit = write_bw;

        /*
         * We could safely do this and return immediately:
         *
         *        wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However to get a more stable dirty_ratelimit, the below elaborated
         * code makes use of task_ratelimit to filter out singular points and
         * limit the step size.
         *
         * The below code essentially only uses the relative value of
         *
         *        task_ratelimit - dirty_ratelimit
         *        = (pos_ratio - 1) * dirty_ratelimit
         *
         * which reflects the direction and size of dirty position error.
         */

        /*
         * dirty_ratelimit will follow balanced_dirty_ratelimit iff
         * task_ratelimit is on the same side of dirty_ratelimit, too.
         * For example, when
         * - dirty_ratelimit > balanced_dirty_ratelimit
         * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
         * lowering dirty_ratelimit will help meet both the position and rate
         * control targets. Otherwise, don't update dirty_ratelimit if it will
         * only help meet the rate target. After all, what the users ultimately
         * feel and care about are a stable dirty rate and a small position
         * error.
         *
         * |task_ratelimit - dirty_ratelimit| is used to limit the step size
         * and filter out the singular points of balanced_dirty_ratelimit,
         * which keeps jumping around randomly and can even leap far away at
         * times due to the small 200ms estimation period of dirty_rate (we
         * want to keep that period small to reduce time lags).
         */
        step = 0;

        /*
         * For strictlimit case, calculations above were based on wb counters
         * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
         * Hence, to calculate "step" properly, we have to use wb_dirty as
         * "dirty" and wb_setpoint as "setpoint".
         *
         * We ramp up dirty_ratelimit forcibly if wb_dirty is low because
         * it's possible that wb_thresh is close to zero due to inactivity
         * of the backing device.
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                dirty = dtc->wb_dirty;
                if (dtc->wb_dirty < 8)
                        setpoint = dtc->wb_dirty + 1;
                else
                        setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }

        /*
         * Below the setpoint only an increase is considered, bounded by the
         * smallest of the three rates; at or above it only a decrease,
         * bounded by the largest of them.
         */
        if (dirty < setpoint) {
                x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
                x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
        }

        /*
         * Don't pursue 100% rate matching. It's impossible since the balanced
         * rate itself is constantly fluctuating. So decrease the track speed
         * when it gets close to the target. Helps eliminate pointless tremors.
         *
         * The shift count is checked against BITS_PER_LONG before it is used
         * so that the right shift below stays within the width of the type.
         */
        shift = dirty_ratelimit / (2 * step + 1);
        if (shift < BITS_PER_LONG)
                step = DIV_ROUND_UP(step >> shift, 8);
        else
                step = 0;

        if (dirty_ratelimit < balanced_dirty_ratelimit)
                dirty_ratelimit += step;
        else
                dirty_ratelimit -= step;

        /* never let the base rate reach zero */
        wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

        trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}

/*
 * Periodic update of a wb's bandwidth estimates and, optionally, of its
 * dirty ratelimit.  Does nothing when less than BANDWIDTH_INTERVAL jiffies
 * have passed since wb->bw_time_stamp.
 *
 * @gdtc:             global domain throttle control; supplies the wb
 * @mdtc:             memcg domain throttle control, or NULL
 * @start_time:       jiffies value compared against wb->bw_time_stamp to
 *                    detect a quiet period
 * @update_ratelimit: also update the domain limits and dirty ratelimit
 *
 * Must be called with wb->list_lock held.
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                  struct dirty_throttle_control *mdtc,
                                  unsigned long start_time,
                                  bool update_ratelimit)
{
        struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
        unsigned long elapsed = now - wb->bw_time_stamp;
        unsigned long nr_dirtied;
        unsigned long nr_written;
        bool quiet;

        lockdep_assert_held(&wb->list_lock);

        /* rate-limit, only update once every 200ms */
        if (elapsed < BANDWIDTH_INTERVAL)
                return;

        nr_dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
        nr_written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

        /*
         * Quiet periods, when disk bandwidth is under-utilized (at least 1s
         * idle time between two flusher runs), are not fed to the
         * estimators; only the snapshot below is refreshed.
         */
        quiet = elapsed > HZ && time_before(wb->bw_time_stamp, start_time);

        if (!quiet) {
                if (update_ratelimit) {
                        domain_update_bandwidth(gdtc, now);
                        wb_update_dirty_ratelimit(gdtc, nr_dirtied, elapsed);

                        /*
                         * @mdtc is always NULL if !CGROUP_WRITEBACK but the
                         * compiler has no way to figure that out.  Help it.
                         */
                        if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
                                domain_update_bandwidth(mdtc, now);
                                wb_update_dirty_ratelimit(mdtc, nr_dirtied,
                                                          elapsed);
                        }
                }
                wb_update_write_bandwidth(wb, elapsed, nr_written);
        }

        /* take the snapshot the next update will be measured against */
        wb->dirtied_stamp = nr_dirtied;
        wb->written_stamp = nr_written;
        wb->bw_time_stamp = now;
}

void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

        __wb_update_bandwidth(&gdtc, NULL, start_time, false);
}

/*
 * Number of pages a task may dirty before balance_dirty_pages_ratelimited()
 * looks again at whether dirty throttling has to start.
 *
 * If this interval is too low, big NUMA machines will call the expensive
 * global_zone_page_state() too often. So it is scaled near-sqrt to the safety
 * margin (the number of pages that may be dirtied without exceeding the
 * dirty limits): the result is 2^(ilog2(margin) / 2).
 *
 * Returns 1 when @dirty has already reached @thresh.
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
                                         unsigned long thresh)
{
        unsigned long margin;

        if (thresh <= dirty)
                return 1;

        margin = thresh - dirty;
        return 1UL << (ilog2(margin) >> 1);
}

/*
 * Upper bound, in jiffies, for a single throttling pause on @wb.
 *
 * @wb:       writeback context; its avg_write_bandwidth is used
 * @wb_dirty: number of dirty pages accounted to the wb
 *
 * The result is at least 1 and at most MAX_PAUSE.
 */
static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                  unsigned long wb_dirty)
{
        unsigned long scaled_bw;
        unsigned long pause;

        /*
         * Limit pause time for small memory systems. If sleeping for too long
         * time, a small pool of dirty/writeback pages may go empty and disk go
         * idle.
         *
         * 8 serves as the safety ratio.
         */
        scaled_bw = wb->avg_write_bandwidth / roundup_pow_of_two(1 + HZ / 8);
        pause = wb_dirty / (1 + scaled_bw);
        pause++;

        return min_t(unsigned long, pause, MAX_PAUSE);
}

/*
 * Compute the minimal pause time and the matching nr_dirtied_pause.
 *
 * @wb:               writeback context (avg_write_bandwidth and
 *                    dirty_ratelimit are read)
 * @max_pause:        upper bound for a pause, in jiffies
 * @task_ratelimit:   the current task's throttle rate
 * @dirty_ratelimit:  the wb's base throttle rate
 * @nr_dirtied_pause: out parameter, receives the number of pages to dirty
 *                    between two pauses
 *
 * Returns the minimal pause time in jiffies.
 */
static long wb_min_pause(struct bdi_writeback *wb,
                         long max_pause,
                         unsigned long task_ratelimit,
                         unsigned long dirty_ratelimit,
                         int *nr_dirtied_pause)
{
        long bw_order = ilog2(wb->avg_write_bandwidth);
        long rate_order = ilog2(wb->dirty_ratelimit);
        long target;            /* target pause */
        long next_pause;        /* estimated next pause */
        int nr_pages;           /* target nr_dirtied_pause */

        /* target for 10ms pause on 1-dd case */
        target = max(1, HZ / 100);

        /*
         * Scale up pause time for concurrent dirtiers in order to reduce CPU
         * overheads.
         *
         * (N * 10ms) on 2^N concurrent tasks.
         */
        if (bw_order > rate_order)
                target += (bw_order - rate_order) * (10 * HZ) / 1024;

        /*
         * This is a bit convoluted. The next nr_dirtied_pause is based on the
         * much more stable dirty_ratelimit, while the next pause time will be
         * computed from task_ratelimit, and the two rate limits may depart
         * considerably at some time. Especially if task_ratelimit goes below
         * dirty_ratelimit/2 and the target pause is max_pause, the next pause
         * time will be max_pause*2 _trimmed down_ to max_pause.  As a result
         * task_ratelimit won't be executed faithfully, which could eventually
         * bring down dirty_ratelimit.
         *
         * Two rules fix it up:
         * 1) try to estimate the next pause time and if necessary, use a lower
         *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
         *    nr_dirtied_pause will be "dancing" with task_ratelimit.
         * 2) limit the target pause time to max_pause/2, so that the normal
         *    small fluctuations of task_ratelimit won't trigger rule (1) and
         *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
         */
        target = min(target, 1 + max_pause / 2);
        nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);

        /*
         * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
         * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
         * When the 16 consecutive reads are often interrupted by some dirty
         * throttling pause during the async writes, cfq will go into idles
         * (deadline is fine). So push nr_dirtied_pause as high as possible
         * until it reaches DIRTY_POLL_THRESH=32 pages.
         */
        if (nr_pages < DIRTY_POLL_THRESH) {
                target = max_pause;
                nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);
                if (nr_pages > DIRTY_POLL_THRESH) {
                        nr_pages = DIRTY_POLL_THRESH;
                        target = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
                }
        }

        /* rule (1): shrink nr_pages if the estimated pause would be too long */
        next_pause = HZ * nr_pages / (task_ratelimit + 1);
        if (next_pause > max_pause) {
                target = max_pause;
                nr_pages = task_ratelimit * target / roundup_pow_of_two(HZ);
        }

        *nr_dirtied_pause = nr_pages;

        /*
         * The minimal pause time will normally be half the target pause time.
         */
        if (nr_pages >= DIRTY_POLL_THRESH)
                return 1 + target / 2;

        return target;
}

/*
 * Fill in the per-wb fields of @dtc: wb_thresh, wb_bg_thresh and wb_dirty.
 *
 * @dtc: throttle control; ->wb, ->thresh and ->bg_thresh are read here
 *       (__wb_calc_thresh() is handed the whole structure)
 *
 * wb_bg_thresh is wb_thresh scaled by bg_thresh / thresh, or 0 when thresh
 * is 0.  wb_dirty is the sum of the wb's reclaimable and writeback counters.
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long wb_reclaimable;

        /*
         * wb_thresh is not treated as some limiting factor as
         * dirty_thresh, due to reasons
         * - in JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
         *   go into state (wb_dirty >> wb_thresh) either because
         *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
         *   dirtiers for 100 seconds until wb_dirty drops under
         *   wb_thresh. Instead the auxiliary wb control line in
         *   wb_position_ratio() will let the dirtier task progress
         *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
        dtc->wb_thresh = __wb_calc_thresh(dtc);

        /*
         * dtc->thresh is an unsigned long.  div_u64() takes a 32-bit
         * divisor, so on 64-bit it would divide by a truncated value that
         * the "dtc->thresh ?" check does not cover (and that is zero
         * whenever the low 32 bits of thresh are).  div64_u64() divides by
         * exactly the value that was checked.
         */
        dtc->wb_bg_thresh = dtc->thresh ?
                div64_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

        /*
         * In order to avoid the stacked BDI deadlock we need
         * to ensure we accurately count the 'dirty' pages when
         * the threshold is low.
         *
         * Otherwise it would be possible to get thresh+n pages
         * reported dirty, even though there are thresh-m pages
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
        if (dtc->wb_thresh < 2 * wb_stat_error()) {
                wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
                dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
        } else {
                wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
                dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
        }
}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 *
 * @wb:            writeback context the caller is dirtying pages against
 * @pages_dirtied: number of pages the caller dirtied since it was last
 *                 throttled; used to size the pause
 *
 * May sleep (TASK_KILLABLE) in io_schedule_timeout().
 */
static void balance_dirty_pages(struct bdi_writeback *wb,
                                unsigned long pages_dirtied)
{
        struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const gdtc = &gdtc_stor;
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                     &mdtc_stor : NULL;
        struct dirty_throttle_control *sdtc;
        unsigned long nr_reclaimable;        /* = file_dirty */
        long period;
        long pause;
        long max_pause;
        long min_pause;
        int nr_dirtied_pause;
        bool dirty_exceeded = false;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
        struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;

        for (;;) {
                unsigned long now = jiffies;
                unsigned long dirty, thresh, bg_thresh;
                unsigned long m_dirty = 0;        /* stop bogus uninit warnings */
                unsigned long m_thresh = 0;
                unsigned long m_bg_thresh = 0;

                /* sample the global dirty state afresh on every iteration */
                nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
                gdtc->avail = global_dirtyable_memory();
                gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

                domain_dirty_limits(gdtc);

                /*
                 * With strictlimit the freerun decision below is made on the
                 * per-wb counters and limits, otherwise on the domain-wide
                 * ones.
                 */
                if (unlikely(strictlimit)) {
                        wb_dirty_limits(gdtc);

                        dirty = gdtc->wb_dirty;
                        thresh = gdtc->wb_thresh;
                        bg_thresh = gdtc->wb_bg_thresh;
                } else {
                        dirty = gdtc->dirty;
                        thresh = gdtc->thresh;
                        bg_thresh = gdtc->bg_thresh;
                }

                if (mdtc) {
                        unsigned long filepages, headroom, writeback;

                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
                        mem_cgroup_wb_stats(wb, &filepages, &headroom,
                                            &mdtc->dirty, &writeback);
                        mdtc->dirty += writeback;
                        mdtc_calc_avail(mdtc, filepages, headroom);

                        domain_dirty_limits(mdtc);

                        if (unlikely(strictlimit)) {
                                wb_dirty_limits(mdtc);
                                m_dirty = mdtc->wb_dirty;
                                m_thresh = mdtc->wb_thresh;
                                m_bg_thresh = mdtc->wb_bg_thresh;
                        } else {
                                m_dirty = mdtc->dirty;
                                m_thresh = mdtc->thresh;
                                m_bg_thresh = mdtc->bg_thresh;
                        }
                }

                /*
                 * Throttle it only when the background writeback cannot
                 * catch-up. This avoids (excessively) small writeouts
                 * when the wb limits are ramping up in case of !strictlimit.
                 *
                 * In strictlimit case make decision based on the wb counters
                 * and limits. Small writeouts when the wb limits are ramping
                 * up are the price we consciously pay for strictlimit-ing.
                 *
                 * If memcg domain is in effect, @dirty should be under
                 * both global and memcg freerun ceilings.
                 */
                if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
                    (!mdtc ||
                     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
                        unsigned long intv;
                        unsigned long m_intv;

                        /*
                         * Also entered by goto from the PF_LOCAL_THROTTLE
                         * checks further down.  No pause: reset the task's
                         * throttling state, pick the next poll interval and
                         * leave the loop.
                         */
free_running:
                        intv = dirty_poll_interval(dirty, thresh);
                        m_intv = ULONG_MAX;

                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
                        if (mdtc)
                                m_intv = dirty_poll_interval(m_dirty, m_thresh);
                        current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }

                if (unlikely(!writeback_in_progress(wb)))
                        wb_start_background_writeback(wb);

                mem_cgroup_flush_foreign(wb);

                /*
                 * Calculate global domain's pos_ratio and select the
                 * global dtc by default.
                 */
                if (!strictlimit) {
                        /* not yet computed above in the !strictlimit case */
                        wb_dirty_limits(gdtc);

                        if ((current->flags & PF_LOCAL_THROTTLE) &&
                            gdtc->wb_dirty <
                            dirty_freerun_ceiling(gdtc->wb_thresh,
                                                  gdtc->wb_bg_thresh))
                                /*
                                 * LOCAL_THROTTLE tasks must not be throttled
                                 * when below the per-wb freerun ceiling.
                                 */
                                goto free_running;
                }

                /*
                 * Over the per-wb threshold and, unless strictlimit, also
                 * over the domain threshold.
                 */
                dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
                        ((gdtc->dirty > gdtc->thresh) || strictlimit);

                wb_position_ratio(gdtc);
                sdtc = gdtc;

                if (mdtc) {
                        /*
                         * If memcg domain is in effect, calculate its
                         * pos_ratio.  @wb should satisfy constraints from
                         * both global and memcg domains.  Choose the one
                         * w/ lower pos_ratio.
                         */
                        if (!strictlimit) {
                                wb_dirty_limits(mdtc);

                                if ((current->flags & PF_LOCAL_THROTTLE) &&
                                    mdtc->wb_dirty <
                                    dirty_freerun_ceiling(mdtc->wb_thresh,
                                                          mdtc->wb_bg_thresh))
                                        /*
                                         * LOCAL_THROTTLE tasks must not be
                                         * throttled when below the per-wb
                                         * freerun ceiling.
                                         */
                                        goto free_running;
                        }
                        dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
                                ((mdtc->dirty > mdtc->thresh) || strictlimit);

                        wb_position_ratio(mdtc);
                        if (mdtc->pos_ratio < gdtc->pos_ratio)
                                sdtc = mdtc;
                }

                /* the flag is only stored when it actually changes */
                if (dirty_exceeded && !wb->dirty_exceeded)
                        wb->dirty_exceeded = 1;

                /*
                 * Cheap unlocked check first; __wb_update_bandwidth()
                 * requires wb->list_lock and rechecks the interval itself.
                 */
                if (time_is_before_jiffies(wb->bw_time_stamp +
                                           BANDWIDTH_INTERVAL)) {
                        spin_lock(&wb->list_lock);
                        __wb_update_bandwidth(gdtc, mdtc, start_time, true);
                        spin_unlock(&wb->list_lock);
                }

                /* throttle according to the chosen dtc */
                dirty_ratelimit = wb->dirty_ratelimit;
                task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                max_pause = wb_max_pause(wb, sdtc->wb_dirty);
                min_pause = wb_min_pause(wb, max_pause,
                                         task_ratelimit, dirty_ratelimit,
                                         &nr_dirtied_pause);

                /*
                 * A zero rate cannot be used as the divisor below: sleep for
                 * max_pause and, unless one of the exit conditions after the
                 * sleep holds, go around the loop again.
                 */
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
                        pause = max_pause;
                        goto pause;
                }
                period = HZ * pages_dirtied / task_ratelimit;
                pause = period;
                /* credit the time already spent since the last pause ended */
                if (current->dirty_paused_when)
                        pause -= now - current->dirty_paused_when;
                /*
                 * For less than 1s think time (ext3/4 may block the dirtier
                 * for up to 800ms from time to time on 1-HDD; so does xfs,
                 * however at much less frequency), try to compensate it in
                 * future periods by updating the virtual time; otherwise just
                 * do a reset, as it may be a light dirtier.
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(wb,
                                                  sdtc->thresh,
                                                  sdtc->bg_thresh,
                                                  sdtc->dirty,
                                                  sdtc->wb_thresh,
                                                  sdtc->wb_dirty,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
                                                  period,
                                                  min(pause, 0L),
                                                  start_time);
                        if (pause < -HZ) {
                                current->dirty_paused_when = now;
                                current->nr_dirtied = 0;
                        } else if (period) {
                                current->dirty_paused_when += period;
                                current->nr_dirtied = 0;
                        } else if (current->nr_dirtied_pause <= pages_dirtied)
                                current->nr_dirtied_pause += pages_dirtied;
                        break;
                }
                if (unlikely(pause > max_pause)) {
                        /* for occasional dropped task_ratelimit */
                        now += min(pause - max_pause, max_pause);
                        pause = max_pause;
                }

pause:
                trace_balance_dirty_pages(wb,
                                          sdtc->thresh,
                                          sdtc->bg_thresh,
                                          sdtc->dirty,
                                          sdtc->wb_thresh,
                                          sdtc->wb_dirty,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
                                          period,
                                          pause,
                                          start_time);
                /* killable sleep; a fatal signal is checked for further down */
                __set_current_state(TASK_KILLABLE);
                wb->dirty_sleep = now;
                io_schedule_timeout(pause);

                current->dirty_paused_when = now + pause;
                current->nr_dirtied = 0;
                current->nr_dirtied_pause = nr_dirtied_pause;

                /*
                 * This is typically equal to (dirty < thresh) and can also
                 * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;

                /*
                 * In the case of an unresponding NFS server and the NFS dirty
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
                 * more page. However wb_dirty has accounting errors.  So use
                 * the larger and more IO friendly wb_stat_error.
                 */
                if (sdtc->wb_dirty <= wb_stat_error())
                        break;

                if (fatal_signal_pending(current))
                        break;
        }

        /* clear the flag once the last pass found the limits respected */
        if (!dirty_exceeded && wb->dirty_exceeded)
                wb->dirty_exceeded = 0;

        if (writeback_in_progress(wb))
                return;

        /*
         * In laptop mode, we wait until hitting the higher threshold before
         * starting background writeout, and then write out all the way down
         * to the lower threshold.  So slow writers cause minimal disk activity.
         *
         * In normal mode, we start background writeout at the lower
         * background_thresh, to keep the amount of dirty memory low.
         */
        if (laptop_mode)
                return;

        if (nr_reclaimable > gdtc->bg_thresh)
                wb_start_background_writeback(wb);
}

/*
 * Per-CPU count of pages dirtied on this CPU.  It is bumped by
 * account_page_dirtied() and reset to zero in
 * balance_dirty_pages_ratelimited().
 */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *        loop {
 *                dirty tsk->nr_dirtied_pause pages;
 *                take a snap in balance_dirty_pages();
 *        }
 * However there is a worst case. If every task exits immediately after
 * dirtying (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never
 * be called to throttle the page dirtying. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly to the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
        struct inode *inode = mapping->host;
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;
        int ratelimit;
        int *p;

        if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
                return;

        if (inode_cgwb_enabled(inode))
                wb = wb_get_create_current(bdi, GFP_KERNEL);
        if (!wb)
                wb = &bdi->wb;

        ratelimit = current->nr_dirtied_pause;
        if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

        preempt_disable();
        /*
         * This prevents one CPU to accumulate too many dirtied pages without
         * calling into balance_dirty_pages(), which can happen when there are
         * 1000+ tasks, all of them start dirtying pages at exactly the same
         * time, hence all honoured too large initial task->nr_dirtied_pause.
         */
        p =  this_cpu_ptr(&bdp_ratelimits);
        if (unlikely(current->nr_dirtied >= ratelimit))
                *p = 0;
        else if (unlikely(*p >= ratelimit_pages)) {
                *p = 0;
                ratelimit = 0;
        }
        /*
         * Pick up the dirtied pages by the exited tasks. This avoids lots of
         * short-lived tasks (eg. gcc invocations in a kernel build) escaping
         * the dirty throttling and livelock other long-run dirtiers.
         */
        p = this_cpu_ptr(&dirty_throttle_leaks);
        if (*p > 0 && current->nr_dirtied < ratelimit) {
                unsigned long nr_pages_dirtied;
                nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
                *p -= nr_pages_dirtied;
                current->nr_dirtied += nr_pages_dirtied;
        }
        preempt_enable();

        if (unlikely(current->nr_dirtied >= ratelimit))
                balance_dirty_pages(wb, current->nr_dirtied);

        wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.
 *
 * Return: %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const gdtc = &gdtc_stor;
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                     &mdtc_stor : NULL;
        unsigned long filepages, headroom, writeback;

        /*
         * Same idea as balance_dirty_pages(), except that pages already under
         * writeback are left out: the question here is whether to queue more.
         */
        gdtc->avail = global_dirtyable_memory();
        gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
        domain_dirty_limits(gdtc);

        /* Global domain: either the domain or this wb's share is over. */
        if (gdtc->dirty > gdtc->bg_thresh ||
            wb_stat(wb, WB_RECLAIMABLE) >
            wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
                return true;

        /* No valid memcg domain for this wb: the global verdict stands. */
        if (!mdtc)
                return false;

        /* Repeat the same two checks against the memcg domain. */
        mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
                            &writeback);
        mdtc_calc_avail(mdtc, filepages, headroom);
        domain_dirty_limits(mdtc);        /* writeback is ignored here too */

        return mdtc->dirty > mdtc->bg_thresh ||
               wb_stat(wb, WB_RECLAIMABLE) >
               wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
}

/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        const unsigned int prev_interval = dirty_writeback_interval;
        int err = proc_dointvec(table, write, buffer, length, ppos);

        /* Reads and failed writes need no follow-up. */
        if (err || !write)
                return err;

        /*
         * A write of 0 turns periodic writeback off, so there is nothing to
         * kick.  Any other changed value wakes the flusher threads so the new
         * interval takes effect right away.  wb_wakeup_delayed() would fit
         * better, but walking every bdi and wb to call it is a pain.
         */
        if (dirty_writeback_interval &&
            dirty_writeback_interval != prev_interval)
                wakeup_flusher_threads(WB_REASON_PERIODIC);

        return err;
}

#ifdef CONFIG_BLOCK
/*
 * Timer callback for a bdi's laptop_mode_wb_timer: wake the flusher threads
 * of the bdi that embeds timer @t.
 */
void laptop_mode_timer_fn(struct timer_list *t)
{
        struct backing_dev_info *bdi;

        bdi = from_timer(bdi, t, laptop_mode_wb_timer);
        wakeup_flusher_threads_bdi(bdi, WB_REASON_LAPTOP_TIMER);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
        /* (Re)arm the writeback timer laptop_mode jiffies from now. */
        unsigned long expires = jiffies + laptop_mode;

        mod_timer(&info->laptop_mode_wb_timer, expires);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
        struct backing_dev_info *bdi;

        rcu_read_lock();

        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                del_timer(&bdi->laptop_mode_wb_timer);

        rcu_read_unlock();
}
#endif

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */

void writeback_set_ratelimit(void)
{
        unsigned long bg_thresh;
        unsigned long thresh;

        global_dirty_limits(&bg_thresh, &thresh);
        global_wb_domain.dirty_limit = thresh;

        /* Give each online CPU 1/32 of the dirty threshold, at least 16 pages. */
        ratelimit_pages = thresh / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
}

/*
 * CPU hotplug callback (registered in page_writeback_init()): recompute
 * ratelimit_pages through writeback_set_ratelimit().  @cpu is not used and
 * the callback always returns 0.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
        writeback_set_ratelimit();
        return 0;
}

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
        /* The global writeback domain must initialise; failure is fatal. */
        BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

        /*
         * Re-run page_writeback_cpu_online() on hotplug events: it is passed
         * as the third (startup) argument for the dynamic online state and as
         * the fourth (teardown) argument for CPUHP_MM_WRITEBACK_DEAD.  The
         * return values of cpuhp_setup_state() are not checked here.
         */
        cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
                          page_writeback_cpu_online, NULL);
        cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
                          page_writeback_cpu_online);
}

/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned int nr_tagged = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, end, PAGECACHE_TAG_DIRTY) {
                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);

                /*
                 * Every XA_CHECK_SCHED entries, pause the walk, drop the lock
                 * and give the scheduler a chance before carrying on.
                 */
                if (++nr_tagged % XA_CHECK_SCHED == 0) {
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
        }
        xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 *
 * To avoid deadlocks between range_cyclic writeback and callers that hold
 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
 * we do not loop back to the start of the file. Doing so causes a page
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data)
{
        int ret = 0;
        int done = 0;
        int error;
        struct pagevec pvec;
        int nr_pages;
        pgoff_t index;
        pgoff_t end;                /* Inclusive */
        pgoff_t done_index;
        int range_whole = 0;
        xa_mark_t tag;

        pagevec_init(&pvec);
        /*
         * Pick the page range: cyclic writeback resumes from the index saved
         * in the mapping, otherwise the byte range in @wbc is converted to
         * page indices.
         */
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* prev offset */
                end = -1;
        } else {
                index = wbc->range_start >> PAGE_SHIFT;
                end = wbc->range_end >> PAGE_SHIFT;
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                        range_whole = 1;
        }
        /*
         * Integrity and tagged writeback walk the TOWRITE tags set by
         * tag_pages_for_writeback() just below; everything else walks the
         * DIRTY tags directly.
         */
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
                tag_pages_for_writeback(mapping, index, end);
                tag = PAGECACHE_TAG_TOWRITE;
        } else {
                tag = PAGECACHE_TAG_DIRTY;
        }
        done_index = index;
        while (!done && (index <= end)) {
                int i;

                /* Fetch the next batch of tagged pages; none left means done. */
                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
                                tag);
                if (nr_pages == 0)
                        break;

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];

                        done_index = page->index;

                        lock_page(page);

                        /*
                         * Page truncated or invalidated. We can freely skip it
                         * then, even for data integrity operations: the page
                         * has disappeared concurrently, so there could be no
                         * real expectation of this data integrity operation
                         * even if there is now a new, dirty page at the same
                         * pagecache address.
                         */
                        if (unlikely(page->mapping != mapping)) {
continue_unlock:
                                /* Shared skip path: drop the page lock, next page. */
                                unlock_page(page);
                                continue;
                        }

                        if (!PageDirty(page)) {
                                /* someone wrote it for us */
                                goto continue_unlock;
                        }

                        /*
                         * Page already under IO: wait for it unless this is
                         * WB_SYNC_NONE writeback, which simply skips the page.
                         */
                        if (PageWriteback(page)) {
                                if (wbc->sync_mode != WB_SYNC_NONE)
                                        wait_on_page_writeback(page);
                                else
                                        goto continue_unlock;
                        }

                        BUG_ON(PageWriteback(page));
                        /* Skip the page if it turned out not to be dirty after all. */
                        if (!clear_page_dirty_for_io(page))
                                goto continue_unlock;

                        trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
                        error = (*writepage)(page, wbc, data);
                        if (unlikely(error)) {
                                /*
                                 * Handle errors according to the type of
                                 * writeback. There's no need to continue for
                                 * background writeback. Just push done_index
                                 * past this page so media errors won't choke
                                 * writeout for the entire file. For integrity
                                 * writeback, we must process the entire dirty
                                 * set regardless of errors because the fs may
                                 * still have state to clear for each page. In
                                 * that case we continue processing and return
                                 * the first error.
                                 */
                                if (error == AOP_WRITEPAGE_ACTIVATE) {
                                        /* Not an error: unlock here and carry on. */
                                        unlock_page(page);
                                        error = 0;
                                } else if (wbc->sync_mode != WB_SYNC_ALL) {
                                        ret = error;
                                        done_index = page->index + 1;
                                        done = 1;
                                        /* Leaves only the for loop; pvec is released below. */
                                        break;
                                }
                                /* Remember the first error only. */
                                if (!ret)
                                        ret = error;
                        }

                        /*
                         * We stop writing back only if we are not doing
                         * integrity sync. In case of integrity sync we have to
                         * keep going until we have written all the pages
                         * we tagged for writeback prior to entering this loop.
                         */
                        if (--wbc->nr_to_write <= 0 &&
                            wbc->sync_mode == WB_SYNC_NONE) {
                                done = 1;
                                break;
                        }
                }
                pagevec_release(&pvec);
                cond_resched();
        }

        /*
         * If we hit the last page and there is more work to be done: wrap
         * back the index back to the start of the file for the next
         * time we are called.
         */
        if (wbc->range_cyclic && !done)
                done_index = 0;
        /* Save the resume point for the next cyclic or whole-range pass. */
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = done_index;

        return ret;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
                       void *data)
{
        /* @data carries the address_space, as passed by generic_writepages(). */
        struct address_space *mapping = data;
        int err;

        err = mapping->a_ops->writepage(page, wbc);
        /* Record the result on the mapping, then pass it on unchanged. */
        mapping_set_error(mapping, err);

        return err;
}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 *
 * Return: %0 on success, negative error code otherwise
 */
int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc)
{
        struct blk_plug plug;
        int err = 0;

        /*
         * chardevs and other special files provide no ->writepage; for those
         * there is nothing to do and 0 is returned.
         */
        if (mapping->a_ops->writepage) {
                blk_start_plug(&plug);
                err = write_cache_pages(mapping, wbc, __writepage, mapping);
                blk_finish_plug(&plug);
        }

        return err;
}

EXPORT_SYMBOL(generic_writepages);

/*
 * Write back dirty pages of @mapping through its ->writepages method, or
 * generic_writepages() when the mapping has none.  Returns 0 straight away if
 * @wbc has no write budget left.  For WB_SYNC_ALL writeback an -ENOMEM result
 * is retried after a short congestion wait; every other result is returned
 * to the caller.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;

        for (;;) {
                ret = mapping->a_ops->writepages ?
                        mapping->a_ops->writepages(mapping, wbc) :
                        generic_writepages(mapping, wbc);

                /* Only -ENOMEM during integrity writeback is worth retrying. */
                if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
                        return ret;

                cond_resched();
                congestion_wait(BLK_RW_ASYNC, HZ/50);
        }
}

/**
 * write_one_page - write out a single page and wait on I/O
 * @page: the page to write
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_one_page(struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 1,
        };
        int err;

        BUG_ON(!PageLocked(page));

        /* Let any IO already in flight on this page finish first. */
        wait_on_page_writeback(page);

        /* Page was not dirty: just unlock it and report the mapping's errors. */
        if (!clear_page_dirty_for_io(page)) {
                unlock_page(page);
                return filemap_check_errors(mapping);
        }

        /* Hold a reference across ->writepage and the wait that follows. */
        get_page(page);
        err = mapping->a_ops->writepage(page, &wbc);
        if (!err)
                wait_on_page_writeback(page);
        put_page(page);

        if (err)
                return err;
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
        /* Already dirty: skip the test-and-set and report no change. */
        if (PageDirty(page))
                return 0;

        /* Non-zero only if this call is the one that set the dirty flag. */
        return !TestSetPageDirty(page);
}

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
        struct inode *inode = mapping->host;
        struct bdi_writeback *wb;

        trace_writeback_dirty_page(page, mapping);

        /* Mappings that cannot write back take no part in dirty accounting. */
        if (!mapping_can_writeback(mapping))
                return;

        inode_attach_wb(inode, page);
        wb = inode_to_wb(inode);

        /* Page, zone, node and wb counters. */
        __inc_lruvec_page_state(page, NR_FILE_DIRTY);
        __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        __inc_node_page_state(page, NR_DIRTIED);
        inc_wb_stat(wb, WB_RECLAIMABLE);
        inc_wb_stat(wb, WB_DIRTIED);

        /* Per-task and per-CPU counters that feed the dirty throttling. */
        task_io_account_write(PAGE_SIZE);
        current->nr_dirtied++;
        this_cpu_inc(bdp_ratelimits);

        mem_cgroup_track_foreign_dirty(page, wb);
}

/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Caller must hold lock_page_memcg().
 */
void account_page_cleaned(struct page *page, struct address_space *mapping,
                          struct bdi_writeback *wb)
{
        /* Nothing was accounted for mappings that cannot write back. */
        if (!mapping_can_writeback(mapping))
                return;

        dec_lruvec_page_state(page, NR_FILE_DIRTY);
        dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        dec_wb_stat(wb, WB_RECLAIMABLE);
        task_io_account_cancelled_write(PAGE_SIZE);
}

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * the xarray.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
        struct address_space *mapping;
        unsigned long flags;

        lock_page_memcg(page);

        /* The flag was already set: nothing to account, report no change. */
        if (TestSetPageDirty(page)) {
                unlock_page_memcg(page);
                return 0;
        }

        /* Newly dirtied, but with no mapping there is nothing to tag. */
        mapping = page_mapping(page);
        if (!mapping) {
                unlock_page_memcg(page);
                return 1;
        }

        /* Account the page and tag it dirty in the xarray under the i_pages lock. */
        xa_lock_irqsave(&mapping->i_pages, flags);
        BUG_ON(page_mapping(page) != mapping);
        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
        account_page_dirtied(page, mapping);
        __xa_set_mark(&mapping->i_pages, page_index(page),
                      PAGECACHE_TAG_DIRTY);
        xa_unlock_irqrestore(&mapping->i_pages, flags);
        unlock_page_memcg(page);

        /* !PageAnon && !swapper_space */
        if (mapping->host)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        return 1;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
 * control.
 */
void account_page_redirty(struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct wb_lock_cookie cookie = {};
        struct bdi_writeback *wb;
        struct inode *inode;

        /* No mapping, or one without writeback: nothing was counted. */
        if (!mapping || !mapping_can_writeback(mapping))
                return;

        inode = mapping->host;

        /* Take back one "dirtied" event at task, node and wb level. */
        wb = unlocked_inode_to_wb_begin(inode, &cookie);
        current->nr_dirtied--;
        dec_node_page_state(page, NR_DIRTIED);
        dec_wb_stat(wb, WB_DIRTIED);
        unlocked_inode_to_wb_end(inode, &cookie);
}
EXPORT_SYMBOL(account_page_redirty);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
        int newly_dirtied;

        /* Count the skip, dirty the page again, then undo the dirtied stats. */
        wbc->pages_skipped++;
        newly_dirtied = __set_page_dirty_nobuffers(page);
        account_page_redirty(page);

        return newly_dirtied;
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int set_page_dirty(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        int (*spd)(struct page *);

        page = compound_head(page);

        /* No mapping: just set the flag and say whether this call set it. */
        if (unlikely(!mapping)) {
                if (PageDirty(page))
                        return 0;
                return !TestSetPageDirty(page);
        }

        spd = mapping->a_ops->set_page_dirty;

        /*
         * PG_readahead/PG_reclaim may be left behind by a race between
         * readahead or lru_deactivate_page and end_page_writeback.
         * For readahead this is harmless: the flag is reset once the page
         * is written.  For lru_deactivate_page the flag is reset when the
         * page is redirtied; if readahead uses the page meanwhile, it gets
         * confused and restarts its size ramp-up, which is only a trivial
         * problem.
         */
        if (PageReclaim(page))
                ClearPageReclaim(page);

#ifdef CONFIG_BLOCK
        /* No set_page_dirty a_op: assume the mapping wants buffer_heads. */
        if (!spd)
                spd = __set_page_dirty_buffers;
#endif
        return (*spd)(page);
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
        int dirtied;

        /* Run set_page_dirty() with the page lock held. */
        lock_page(page);
        dirtied = set_page_dirty(page);
        unlock_page(page);

        return dirtied;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __cancel_dirty_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        /*
         * page_mapping() may return NULL (other helpers in this file test for
         * it), so only hand a real mapping to mapping_can_writeback(), the
         * same way clear_page_dirty_for_io() and account_page_redirty() do.
         * A page without a mapping has no dirty accounting to undo and just
         * gets its flag cleared.
         */
        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};

                lock_page_memcg(page);
                wb = unlocked_inode_to_wb_begin(inode, &cookie);

                /* Undo the accounting only if the flag was actually set. */
                if (TestClearPageDirty(page))
                        account_page_cleaned(page, mapping, wb);

                unlocked_inode_to_wb_end(inode, &cookie);
                unlock_page_memcg(page);
        } else {
                ClearPageDirty(page);
        }
}
EXPORT_SYMBOL(__cancel_dirty_page);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        struct wb_lock_cookie cookie = {};
        struct bdi_writeback *wb;
        struct inode *inode;
        int cleaned = 0;

        VM_BUG_ON_PAGE(!PageLocked(page), page);

        /* No mapping, or no writeback accounting: only the flag matters. */
        if (!mapping || !mapping_can_writeback(mapping))
                return TestClearPageDirty(page);

        inode = mapping->host;

        /*
         * Yes, Virginia, this is indeed insane.
         *
         * The sequence below exists so that
         *  (a) the dirty stats are accounted properly,
         *  (b) the low-level filesystem is told to mark the whole page dirty
         *      if it was dirty in a pagetable, and only then
         *  (c) the page is cleaned again and 1 is returned to trigger the
         *      writeback.
         *
         * Doing it this way sidesteps the nasty races between dirty bits
         * living in several places and different threads clearing them
         * concurrently.
         *
         * Note that set_page_dirty(page) normally leaves the actual dirty bit
         * unchanged, since it is usually set already.  The call is made for
         * its side effects, and it can help avoid races.
         *
         * In effect the page's "master dirty bit" serves as the serialization
         * point for all the threads doing their things.
         */
        if (page_mkclean(page))
                set_page_dirty(page);

        /*
         * Fault handlers that install a dirty pte and mark the page dirty are
         * kept out at this point: they hold the page lock while dirtying, and
         * pages always arrive here locked, which gives the needed exclusion.
         */
        wb = unlocked_inode_to_wb_begin(inode, &cookie);
        if (TestClearPageDirty(page)) {
                dec_lruvec_page_state(page, NR_FILE_DIRTY);
                dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                dec_wb_stat(wb, WB_RECLAIMABLE);
                cleaned = 1;
        }
        unlocked_inode_to_wb_end(inode, &cookie);

        return cleaned;
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

/*
 * Clear the writeback flag on @page and undo the accounting that goes
 * with it.
 *
 * Returns the value of TestClearPageWriteback(): non-zero when the page
 * was under writeback before the call, zero otherwise.
 */
int test_clear_page_writeback(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
        int ret;

        /*
         * The lruvec is looked up right after locking the page's memcg;
         * it is the target of the NR_WRITEBACK decrement further down.
         */
        memcg = lock_page_memcg(page);
        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
        if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;

                /*
                 * The page flag and the PAGECACHE_TAG_WRITEBACK mark in
                 * i_pages are changed together under the i_pages lock.
                 */
                xa_lock_irqsave(&mapping->i_pages, flags);
                ret = TestClearPageWriteback(page);
                if (ret) {
                        __xa_clear_mark(&mapping->i_pages, page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        /* Per-wb statistics only for BDIs that account writeback. */
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                                struct bdi_writeback *wb = inode_to_wb(inode);

                                dec_wb_stat(wb, WB_WRITEBACK);
                                __wb_writeout_inc(wb);
                        }
                }

                /*
                 * Nothing in this mapping carries the writeback tag any
                 * more: clear the inode's writeback state as well.
                 */
                if (mapping->host && !mapping_tagged(mapping,
                                                     PAGECACHE_TAG_WRITEBACK))
                        sb_clear_inode_writeback(mapping->host);

                xa_unlock_irqrestore(&mapping->i_pages, flags);
        } else {
                /* No mapping, or a mapping without writeback tags: flag only. */
                ret = TestClearPageWriteback(page);
        }
        /* Global counters are adjusted only if the flag was actually set. */
        if (ret) {
                dec_lruvec_state(lruvec, NR_WRITEBACK);
                dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                inc_node_page_state(page, NR_WRITTEN);
        }
        /* Unlock through the memcg returned by lock_page_memcg() above. */
        __unlock_page_memcg(memcg);
        return ret;
}

/*
 * Set the writeback flag on @page and update the matching page cache
 * marks and statistics.
 *
 * @keep_write: when false, PAGECACHE_TAG_TOWRITE is cleared for the page.
 *
 * Returns the value of TestSetPageWriteback(): non-zero when the page
 * was already under writeback, zero when this call set the flag.
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
        struct address_space *mapping = page_mapping(page);
        int ret, access_ret;

        lock_page_memcg(page);
        if (mapping && mapping_use_writeback_tags(mapping)) {
                XA_STATE(xas, &mapping->i_pages, page_index(page));
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;

                /* Flag and xarray marks are updated under the xarray lock. */
                xas_lock_irqsave(&xas, flags);
                xas_load(&xas);
                ret = TestSetPageWriteback(page);
                if (!ret) {
                        bool on_wblist;

                        /*
                         * Sample the tag before setting our own mark, so
                         * on_wblist tells whether the mapping already had
                         * a page tagged for writeback.
                         */
                        on_wblist = mapping_tagged(mapping,
                                                   PAGECACHE_TAG_WRITEBACK);

                        xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
                        if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
                                inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

                        /*
                         * We can come through here when swapping anonymous
                         * pages, so we don't necessarily have an inode to track
                         * for sync.
                         */
                        if (mapping->host && !on_wblist)
                                sb_mark_inode_writeback(mapping->host);
                }
                /* These marks are adjusted whether or not the flag was newly set. */
                if (!PageDirty(page))
                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                if (!keep_write)
                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                xas_unlock_irqrestore(&xas, flags);
        } else {
                /* No mapping, or a mapping without writeback tags: flag only. */
                ret = TestSetPageWriteback(page);
        }
        /* Count the page only when this call started the writeback. */
        if (!ret) {
                inc_lruvec_page_state(page, NR_WRITEBACK);
                inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        }
        unlock_page_memcg(page);
        access_ret = arch_make_page_accessible(page);
        /*
         * If writeback has been triggered on a page that cannot be made
         * accessible, it is too late to recover here.
         */
        VM_BUG_ON_PAGE(access_ret != 0, page);

        return ret;

}
EXPORT_SYMBOL(__test_set_page_writeback);

/*
 * Block until writeback of @page has completed.
 *
 * The writeback flag is re-tested after every wait, so the function
 * returns only once PageWriteback() reports the flag as clear.
 */
void wait_on_page_writeback(struct page *page)
{
        for (;;) {
                if (!PageWriteback(page))
                        break;

                trace_wait_on_page_writeback(page, page_mapping(page));
                wait_on_page_bit(page, PG_writeback);
        }
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);

/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page:        The page to wait on.
 *
 * Checks whether the superblock behind the page's mapping has
 * SB_I_STABLE_WRITES set, i.e. whether page contents must be held stable
 * during writeback.  Only in that case does it wait for pending writeback
 * on the head page to complete; otherwise it returns immediately.
 */
void wait_for_stable_page(struct page *page)
{
        struct page *head = thp_head(page);

        /* Nothing to wait for unless the superblock demands stable writes. */
        if (!(head->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES))
                return;

        wait_on_page_writeback(head);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);







































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETFILTER_H
#define __LINUX_NETFILTER_H

#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/static_key.h>
#include <linux/netfilter_defs.h>
#include <linux/netdevice.h>
#include <linux/sockptr.h>
#include <net/net_namespace.h>

/*
 * Return the negated value of the bits of @verdict that sit above
 * NF_VERDICT_QBITS.
 * NOTE(review): presumably this is the errno encoded alongside an
 * NF_DROP verdict - confirm against the verdict encoding.
 */
static inline int NF_DROP_GETERR(int verdict)
{
        int upper = verdict >> NF_VERDICT_QBITS;

        return -upper;
}

/*
 * Compare two nf_inet_addr unions for equality.
 *
 * Returns 1 when all four 32-bit words of @a1 and @a2 match, 0 otherwise.
 * On 64-bit builds with efficient unaligned access the comparison is done
 * on two unsigned long words instead.
 */
static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
                                   const union nf_inet_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff;

        /* Any differing bit in either word leaves diff non-zero. */
        diff = lhs[0] ^ rhs[0];
        diff |= lhs[1] ^ rhs[1];

        return diff == 0UL;
#else
        unsigned int i;

        for (i = 0; i < 4; i++) {
                if (a1->all[i] != a2->all[i])
                        return 0;
        }

        return 1;
#endif
}

/*
 * Store the bitwise AND of @a1 and @mask into @result.
 *
 * All four 32-bit words are covered; on 64-bit builds with efficient
 * unaligned access the operation is done on two unsigned long words.
 */
static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
                                     union nf_inet_addr *result,
                                     const union nf_inet_addr *mask)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *src = (const unsigned long *)a1;
        const unsigned long *bits = (const unsigned long *)mask;
        unsigned long *dst = (unsigned long *)result;
        unsigned int i;

        for (i = 0; i < 2; i++)
                dst[i] = src[i] & bits[i];
#else
        unsigned int i;

        for (i = 0; i < 4; i++)
                result->all[i] = a1->all[i] & mask->all[i];
#endif
}

int netfilter_init(void);

struct sk_buff;

struct nf_hook_ops;

struct sock;

/*
 * Context passed (as a const pointer) to every hook function, see
 * nf_hookfn.  nf_hook_state_init() fills in every field from its
 * arguments.
 */
struct nf_hook_state {
        unsigned int hook;        /* hook number the packet is traversing */
        u_int8_t pf;                /* protocol family (NFPROTO_*) */
        struct net_device *in;        /* input device given by the caller */
        struct net_device *out;        /* output device given by the caller */
        struct sock *sk;        /* socket given by the caller */
        struct net *net;        /* network namespace the hooks are looked up in */
        /* continuation the NF_HOOK*() wrappers call once the hooks let the packet pass */
        int (*okfn)(struct net *, struct sock *, struct sk_buff *);
};

/*
 * Signature of a netfilter hook function.
 *
 * @priv:  opaque pointer stored alongside the hook (see struct nf_hook_entry)
 * @skb:   packet being processed
 * @state: read-only invocation context
 *
 * NOTE(review): the unsigned int return value is presumably an NF_*
 * verdict - confirm against the callers in the netfilter core.
 */
typedef unsigned int nf_hookfn(void *priv,
                               struct sk_buff *skb,
                               const struct nf_hook_state *state);
/*
 * Description of one hook as passed to nf_register_net_hook() and
 * nf_unregister_net_hook().
 */
struct nf_hook_ops {
        /* User fills in from here down. */
        nf_hookfn                *hook;                /* function to invoke */
        struct net_device        *dev;                /* NOTE(review): presumably restricts the hook to one device - confirm */
        void                        *priv;                /* opaque data for the hook function */
        u_int8_t                pf;                /* protocol family (NFPROTO_*) */
        unsigned int                hooknum;        /* hook number within the family */
        /* Hooks are ordered in ascending priority. */
        int                        priority;
};

/*
 * One slot of struct nf_hook_entries: the hook function and the private
 * pointer that nf_hook_entry_hookfn() passes to it as first argument.
 */
struct nf_hook_entry {
        nf_hookfn                        *hook;
        void                                *priv;
};

/*
 * Trailer of struct nf_hook_entries, described in that structure's
 * comment as the rcu_head and scratch space used for freeing via
 * call_rcu.
 */
struct nf_hook_entries_rcu_head {
        struct rcu_head head;
        /* NOTE(review): presumably the start of the allocation to free - confirm */
        void        *allocation;
};

/*
 * Variable-length array of hooks for one hook point, followed in the
 * same allocation by a trailer (see the comment inside the structure).
 */
struct nf_hook_entries {
        u16                                num_hook_entries;        /* number of valid elements in hooks[] */
        /* padding */
        struct nf_hook_entry                hooks[];

        /* trailer: pointers to original orig_ops of each hook,
         * followed by rcu_head and scratch space used for freeing
         * the structure via call_rcu.
         *
         *   This is not part of struct nf_hook_entry since it's only
         *   needed in slow path (hook register/unregister):
         * const struct nf_hook_ops     *orig_ops[]
         *
         *   For the same reason, we store this at end -- it's
         *   only needed when a hook is deleted, not during
         *   packet path processing:
         * struct nf_hook_entries_rcu_head     head
         */
};

#ifdef CONFIG_NETFILTER
/*
 * Return the address directly behind the last element of e->hooks[],
 * typed as an array of nf_hook_ops pointers.  Per the layout comment in
 * struct nf_hook_entries this is where the orig_ops[] trailer lives.
 */
static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e)
{
        /* One element past the used part of ->hooks[]. */
        const void *trailer = e->hooks + e->num_hook_entries;

        return (struct nf_hook_ops **)trailer;
}

static inline int
nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
                     struct nf_hook_state *state)
{
        return entry->hook(entry->priv, skb, state);
}

/*
 * Fill in every field of @p from the remaining arguments.  No field is
 * left untouched, so @p does not need to be initialised beforehand.
 */
static inline void nf_hook_state_init(struct nf_hook_state *p,
                                      unsigned int hook,
                                      u_int8_t pf,
                                      struct net_device *indev,
                                      struct net_device *outdev,
                                      struct sock *sk,
                                      struct net *net,
                                      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        /* Which hook point is being run. */
        p->pf = pf;
        p->hook = hook;

        /* Where the packet lives. */
        p->net = net;
        p->sk = sk;
        p->in = indev;
        p->out = outdev;

        /* Continuation for accepted packets. */
        p->okfn = okfn;
}



/*
 * Handler for a range of socket options, registered with
 * nf_register_sockopt() and removed with nf_unregister_sockopt().
 */
struct nf_sockopt_ops {
        struct list_head list;        /* list linkage */

        u_int8_t pf;                /* protocol family the handler applies to */

        /* Non-inclusive ranges: use 0/0/NULL to never get called. */
        int set_optmin;
        int set_optmax;
        /* setsockopt handler for options in the set range */
        int (*set)(struct sock *sk, int optval, sockptr_t arg,
                   unsigned int len);
        int get_optmin;
        int get_optmax;
        /* getsockopt handler for options in the get range */
        int (*get)(struct sock *sk, int optval, void __user *user, int *len);
        /* Use the module struct to lock set/get code in place */
        struct module *owner;
};

/* Function to register/unregister hook points. */
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                          unsigned int n);
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                             unsigned int n);

/* Functions to register get/setsockopt ranges (non-inclusive).  You
   need to check permissions yourself! */
int nf_register_sockopt(struct nf_sockopt_ops *reg);
void nf_unregister_sockopt(struct nf_sockopt_ops *reg);

#ifdef CONFIG_JUMP_LABEL
extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
#endif

int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                 const struct nf_hook_entries *e, unsigned int i);

void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
                       const struct nf_hook_entries *e);
/**
 *        nf_hook - run the netfilter hooks for a family / hook number
 *
 *        Looks up the hook entries registered in @net for @pf and @hook
 *        under rcu_read_lock() and, if there are any, hands the packet to
 *        nf_hook_slow().
 *
 *        Returns 1 if the hook has allowed the packet to pass (or no hooks
 *        are registered).  The function okfn must be invoked by the caller
 *        in this case.  Any other return value indicates the packet has
 *        been consumed by the hook.
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *entries = NULL;
        struct nf_hook_state state;
        int verdict = 1;

#ifdef CONFIG_JUMP_LABEL
        /*
         * With compile-time constant family and hook the static key lets
         * us skip everything when no hook is registered.
         */
        if (__builtin_constant_p(pf) &&
            __builtin_constant_p(hook) &&
            !static_key_false(&nf_hooks_needed[pf][hook]))
                return 1;
#endif

        rcu_read_lock();
        switch (pf) {
        case NFPROTO_IPV4:
                entries = rcu_dereference(net->nf.hooks_ipv4[hook]);
                break;
        case NFPROTO_IPV6:
                entries = rcu_dereference(net->nf.hooks_ipv6[hook]);
                break;
        case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
                /* Warn once and treat an out-of-range hook as "no hooks". */
                if (!WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
                        entries = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
                break;
        case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
                entries = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        if (entries) {
                nf_hook_state_init(&state, hook, pf, indev, outdev,
                                   sk, net, okfn);
                verdict = nf_hook_slow(skb, &state, entries, 0);
        }
        rcu_read_unlock();

        return verdict;
}

/* Activate hook; either okfn or kfree_skb called, unless a hook
   returns NF_STOLEN (in which case, it's up to the hook to deal with
   the consequences).

   Returns -ERRNO if packet dropped.  Zero means queued, stolen or
   accepted.
*/

/* RR:
   > I don't want nf_hook to return anything because people might forget
   > about async and trust the return value to mean "packet was ok".

   AK:
   Just document it clearly, then you can expect some sense from kernel
   coders :)
*/

/*
 * Conditional variant of NF_HOOK(): the hooks are only consulted when
 * @cond is true.  @okfn is called when @cond is false or when nf_hook()
 * returned 1; otherwise the nf_hook() result is returned unchanged.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        /* 1 means "pass": the default when the hooks are skipped. */
        int verdict = 1;

        if (cond)
                verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

        if (verdict != 1)
                return verdict;

        return okfn(net, sk, skb);
}

/*
 * Run the hooks for @pf / @hook and, if nf_hook() returns 1, continue
 * with @okfn and return its result.  Any other nf_hook() result is
 * returned to the caller unchanged.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

        if (verdict != 1)
                return verdict;

        return okfn(net, sk, skb);
}

/*
 * List variant of NF_HOOK(): run the hooks registered for @pf / @hook
 * over the packets on @head by means of nf_hook_slow_list().
 *
 * Only NFPROTO_IPV4 and NFPROTO_IPV6 are handled here; any other family
 * triggers a one-time warning and nothing is done with @head.
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *entries = NULL;
        struct nf_hook_state state;

#ifdef CONFIG_JUMP_LABEL
        /* Constant family and hook: the static key tells us if any hook exists. */
        if (__builtin_constant_p(pf) &&
            __builtin_constant_p(hook) &&
            !static_key_false(&nf_hooks_needed[pf][hook]))
                return;
#endif

        rcu_read_lock();
        switch (pf) {
        case NFPROTO_IPV4:
                entries = rcu_dereference(net->nf.hooks_ipv4[hook]);
                break;
        case NFPROTO_IPV6:
                entries = rcu_dereference(net->nf.hooks_ipv6[hook]);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        if (entries) {
                nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
                nf_hook_slow_list(head, &state, entries);
        }
        rcu_read_unlock();
}

/* Call setsockopt() */
int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt,
                  unsigned int len);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
                  int *len);

struct flowi;
struct nf_queue_entry;

__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
                    unsigned int dataoff, u_int8_t protocol,
                    unsigned short family);

__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
                            unsigned int dataoff, unsigned int len,
                            u_int8_t protocol, unsigned short family);
int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
             bool strict, unsigned short family);
int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);

#include <net/flow.h>

struct nf_conn;
enum nf_nat_manip_type;
struct nlattr;
enum ip_conntrack_dir;

/*
 * Callbacks reachable through the RCU-protected nf_nat_hook pointer.
 * nf_nat_decode_session() treats both the pointer and ->decode_session
 * as optional and checks them for NULL before calling.
 */
struct nf_nat_hook {
        /* NOTE(review): presumably configures NAT on @ct from a netlink attribute - confirm */
        int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
                               const struct nlattr *attr);
        /* invoked by nf_nat_decode_session() with the skb and flow */
        void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
        /* NOTE(review): presumably applies the NAT manipulation to @skb - confirm */
        unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
                                  enum nf_nat_manip_type mtype,
                                  enum ip_conntrack_dir dir);
};

extern struct nf_nat_hook __rcu *nf_nat_hook;

/*
 * Call the decode_session callback of the registered NAT hook, if any.
 *
 * Does nothing unless CONFIG_NF_NAT is enabled and both the nf_nat_hook
 * pointer and its ->decode_session member are set.  @family is not used
 * by this implementation.
 */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
#if IS_ENABLED(CONFIG_NF_NAT)
        struct nf_nat_hook *ops;

        rcu_read_lock();
        ops = rcu_dereference(nf_nat_hook);
        if (ops) {
                if (ops->decode_session)
                        ops->decode_session(skb, fl);
        }
        rcu_read_unlock();
#endif
}

#else /* !CONFIG_NETFILTER */
/*
 * !CONFIG_NETFILTER variant: there are no hooks to run, so @okfn is
 * called unconditionally and @cond is ignored.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        return okfn(net, sk, skb);
}

/*
 * !CONFIG_NETFILTER variant: there are no hooks to run, so the packet is
 * passed straight to @okfn and its result is returned.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
        struct sk_buff *skb, struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        return okfn(net, sk, skb);
}

/*
 * !CONFIG_NETFILTER variant: no hooks exist, so the list at @head is
 * left exactly as the caller passed it.
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        /* nothing to do */
}

/*
 * !CONFIG_NETFILTER variant: always returns 1, the value the
 * CONFIG_NETFILTER implementation uses for "packet may pass, caller must
 * invoke okfn".
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        return 1;
}
struct flowi;
/* !CONFIG_NETFILTER variant: no NAT hook can exist, so this is a no-op. */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
}
#endif /*CONFIG_NETFILTER*/

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_zones_common.h>

extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
struct nf_conntrack_tuple;
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                         const struct sk_buff *skb);
#else
/*
 * Conntrack is not built in: attaching conntrack state is a no-op.
 *
 * The source skb is const-qualified exactly like in the
 * CONFIG_NF_CONNTRACK prototype, so a caller that passes a
 * const struct sk_buff * compiles cleanly in both configurations.
 */
static inline void nf_ct_attach(struct sk_buff *new,
                                const struct sk_buff *skb)
{
}
struct nf_conntrack_tuple;
/*
 * Variant used when CONFIG_NF_CONNTRACK is disabled: always returns
 * false and never writes to @dst_tuple.
 */
static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                                       const struct sk_buff *skb)
{
        return false;
}
#endif

struct nf_conn;
enum ip_conntrack_info;

/*
 * Conntrack callbacks, reachable through the __rcu pointer nf_ct_hook.
 * NOTE(review): the per-member semantics below are inferred from the
 * signatures only - confirm against the conntrack core.
 */
struct nf_ct_hook {
        int (*update)(struct net *net, struct sk_buff *skb);
        void (*destroy)(struct nf_conntrack *);
        /* same signature as nf_ct_get_tuple_skb() */
        bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
                              const struct sk_buff *);
};
extern struct nf_ct_hook __rcu *nf_ct_hook;

struct nlattr;

/*
 * Callback table reachable through the __rcu pointer nfnl_ct_hook.
 * NOTE(review): judging by the name and the nlattr parameters this is the
 * conntrack interface used by nfnetlink code - confirm with the users.
 */
struct nfnl_ct_hook {
        /* returns the nf_conn for @skb and stores its state in @ctinfo */
        struct nf_conn *(*get_ct)(const struct sk_buff *skb,
                                  enum ip_conntrack_info *ctinfo);
        size_t (*build_size)(const struct nf_conn *ct);
        int (*build)(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo,
                     u_int16_t ct_attr, u_int16_t ct_info_attr);
        int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
        int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
                             u32 portid, u32 report);
        void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
                           enum ip_conntrack_info ctinfo, s32 off);
};
extern struct nfnl_ct_hook __rcu *nfnl_ct_hook;

/**
 * nf_skb_duplicated - TEE target has sent a packet
 *
 * When a xtables target sends a packet, the OUTPUT and POSTROUTING
 * hooks are traversed again, i.e. nft and xtables are invoked recursively.
 *
 * This is used by xtables TEE target to prevent the duplicated skb from
 * being duplicated again.
 */
DECLARE_PER_CPU(bool, nf_skb_duplicated);

#endif /*__LINUX_NETFILTER_H*/

































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CPUHOTPLUG_H
#define __CPUHOTPLUG_H

#include <linux/types.h>

/*
 * CPU-up                        CPU-down
 *
 * BP                AP                BP                AP
 *
 * OFFLINE                        OFFLINE
 *   |                                  ^
 *   v                                  |
 * BRINGUP_CPU->AP_OFFLINE        BRINGUP_CPU  <- AP_IDLE_DEAD (idle thread/play_dead)
 *                  |                                AP_OFFLINE
 *                  v (IRQ-off)          ,---------------^
 *                AP_ONLINE         | (stop_machine)
 *                  |                TEARDOWN_CPU <-        AP_ONLINE_IDLE
 *                  |                                  ^
 *                  v                                  |
 *              AP_ACTIVE                        AP_ACTIVE
 */

/*
 * CPU hotplug states.
 *
 * CPUHP_INVALID is -1 and CPUHP_OFFLINE is 0; every following enumerator
 * takes the next value in declaration order, up to CPUHP_ONLINE as the
 * last one.  The position of an entry in this list therefore defines its
 * numeric value relative to all others, so entries must not be reordered
 * casually.
 */
enum cpuhp_state {
        CPUHP_INVALID = -1,
        CPUHP_OFFLINE = 0,
        CPUHP_CREATE_THREADS,
        CPUHP_PERF_PREPARE,
        CPUHP_PERF_X86_PREPARE,
        CPUHP_PERF_X86_AMD_UNCORE_PREP,
        CPUHP_PERF_POWER,
        CPUHP_PERF_SUPERH,
        CPUHP_X86_HPET_DEAD,
        CPUHP_X86_APB_DEAD,
        CPUHP_X86_MCE_DEAD,
        CPUHP_VIRT_NET_DEAD,
        CPUHP_SLUB_DEAD,
        CPUHP_DEBUG_OBJ_DEAD,
        CPUHP_MM_WRITEBACK_DEAD,
        CPUHP_MM_VMSTAT_DEAD,
        CPUHP_SOFTIRQ_DEAD,
        CPUHP_NET_MVNETA_DEAD,
        CPUHP_CPUIDLE_DEAD,
        CPUHP_ARM64_FPSIMD_DEAD,
        CPUHP_ARM_OMAP_WAKE_DEAD,
        CPUHP_IRQ_POLL_DEAD,
        CPUHP_BLOCK_SOFTIRQ_DEAD,
        CPUHP_ACPI_CPUDRV_DEAD,
        CPUHP_S390_PFAULT_DEAD,
        CPUHP_BLK_MQ_DEAD,
        CPUHP_FS_BUFF_DEAD,
        CPUHP_PRINTK_DEAD,
        CPUHP_MM_MEMCQ_DEAD,
        CPUHP_PERCPU_CNT_DEAD,
        CPUHP_RADIX_DEAD,
        CPUHP_PAGE_ALLOC_DEAD,
        CPUHP_NET_DEV_DEAD,
        CPUHP_PCI_XGENE_DEAD,
        CPUHP_IOMMU_INTEL_DEAD,
        CPUHP_LUSTRE_CFS_DEAD,
        CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
        CPUHP_PADATA_DEAD,
        CPUHP_RANDOM_PREPARE,
        CPUHP_WORKQUEUE_PREP,
        CPUHP_POWER_NUMA_PREPARE,
        CPUHP_HRTIMERS_PREPARE,
        CPUHP_PROFILE_PREPARE,
        CPUHP_X2APIC_PREPARE,
        CPUHP_SMPCFD_PREPARE,
        CPUHP_RELAY_PREPARE,
        CPUHP_SLAB_PREPARE,
        CPUHP_MD_RAID5_PREPARE,
        CPUHP_RCUTREE_PREP,
        CPUHP_CPUIDLE_COUPLED_PREPARE,
        CPUHP_POWERPC_PMAC_PREPARE,
        CPUHP_POWERPC_MMU_CTX_PREPARE,
        CPUHP_XEN_PREPARE,
        CPUHP_XEN_EVTCHN_PREPARE,
        CPUHP_ARM_SHMOBILE_SCU_PREPARE,
        CPUHP_SH_SH3X_PREPARE,
        CPUHP_NET_FLOW_PREPARE,
        CPUHP_TOPOLOGY_PREPARE,
        CPUHP_NET_IUCV_PREPARE,
        CPUHP_ARM_BL_PREPARE,
        CPUHP_TRACE_RB_PREPARE,
        CPUHP_MM_ZS_PREPARE,
        CPUHP_MM_ZSWP_MEM_PREPARE,
        CPUHP_MM_ZSWP_POOL_PREPARE,
        CPUHP_KVM_PPC_BOOK3S_PREPARE,
        CPUHP_ZCOMP_PREPARE,
        CPUHP_TIMERS_PREPARE,
        CPUHP_MIPS_SOC_PREPARE,
        /*
         * The _END enumerator is 20 above the base, which leaves a gap of
         * values without named enumerators.
         * NOTE(review): presumably reserved for dynamically allocated
         * states - confirm.
         */
        CPUHP_BP_PREPARE_DYN,
        CPUHP_BP_PREPARE_DYN_END                = CPUHP_BP_PREPARE_DYN + 20,
        CPUHP_BRINGUP_CPU,
        CPUHP_AP_IDLE_DEAD,
        CPUHP_AP_OFFLINE,
        CPUHP_AP_SCHED_STARTING,
        CPUHP_AP_RCUTREE_DYING,
        CPUHP_AP_CPU_PM_STARTING,
        CPUHP_AP_IRQ_GIC_STARTING,
        CPUHP_AP_IRQ_HIP04_STARTING,
        CPUHP_AP_IRQ_ARMADA_XP_STARTING,
        CPUHP_AP_IRQ_BCM2836_STARTING,
        CPUHP_AP_IRQ_MIPS_GIC_STARTING,
        CPUHP_AP_IRQ_RISCV_STARTING,
        CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
        CPUHP_AP_ARM_MVEBU_COHERENCY,
        CPUHP_AP_MICROCODE_LOADER,
        CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
        CPUHP_AP_PERF_X86_STARTING,
        CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
        CPUHP_AP_PERF_X86_CQM_STARTING,
        CPUHP_AP_PERF_X86_CSTATE_STARTING,
        CPUHP_AP_PERF_XTENSA_STARTING,
        CPUHP_AP_MIPS_OP_LOONGSON3_STARTING,
        CPUHP_AP_ARM_VFP_STARTING,
        CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
        CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,
        CPUHP_AP_PERF_ARM_ACPI_STARTING,
        CPUHP_AP_PERF_ARM_STARTING,
        CPUHP_AP_ARM_L2X0_STARTING,
        CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
        CPUHP_AP_ARM_ARCH_TIMER_STARTING,
        CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
        CPUHP_AP_JCORE_TIMER_STARTING,
        CPUHP_AP_ARM_TWD_STARTING,
        CPUHP_AP_QCOM_TIMER_STARTING,
        CPUHP_AP_TEGRA_TIMER_STARTING,
        CPUHP_AP_ARMADA_TIMER_STARTING,
        CPUHP_AP_MARCO_TIMER_STARTING,
        CPUHP_AP_MIPS_GIC_TIMER_STARTING,
        CPUHP_AP_ARC_TIMER_STARTING,
        CPUHP_AP_RISCV_TIMER_STARTING,
        CPUHP_AP_CLINT_TIMER_STARTING,
        CPUHP_AP_CSKY_TIMER_STARTING,
        CPUHP_AP_TI_GP_TIMER_STARTING,
        CPUHP_AP_HYPERV_TIMER_STARTING,
        CPUHP_AP_KVM_STARTING,
        CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
        CPUHP_AP_KVM_ARM_VGIC_STARTING,
        CPUHP_AP_KVM_ARM_TIMER_STARTING,
        /* Must be the last timer callback */
        CPUHP_AP_DUMMY_TIMER_STARTING,
        CPUHP_AP_ARM_XEN_STARTING,
        CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
        CPUHP_AP_ARM_CORESIGHT_STARTING,
        CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
        CPUHP_AP_ARM64_ISNDEP_STARTING,
        CPUHP_AP_SMPCFD_DYING,
        CPUHP_AP_HRTIMERS_DYING,
        CPUHP_AP_X86_TBOOT_DYING,
        CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
        CPUHP_AP_ONLINE,
        CPUHP_TEARDOWN_CPU,
        CPUHP_AP_ONLINE_IDLE,
        CPUHP_AP_SMPBOOT_THREADS,
        CPUHP_AP_X86_VDSO_VMA_ONLINE,
        CPUHP_AP_IRQ_AFFINITY_ONLINE,
        CPUHP_AP_BLK_MQ_ONLINE,
        CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
        CPUHP_AP_X86_INTEL_EPB_ONLINE,
        CPUHP_AP_PERF_ONLINE,
        CPUHP_AP_PERF_X86_ONLINE,
        CPUHP_AP_PERF_X86_UNCORE_ONLINE,
        CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
        CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
        CPUHP_AP_PERF_X86_RAPL_ONLINE,
        CPUHP_AP_PERF_X86_CQM_ONLINE,
        CPUHP_AP_PERF_X86_CSTATE_ONLINE,
        CPUHP_AP_PERF_S390_CF_ONLINE,
        CPUHP_AP_PERF_S390_SF_ONLINE,
        CPUHP_AP_PERF_ARM_CCI_ONLINE,
        CPUHP_AP_PERF_ARM_CCN_ONLINE,
        CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE,
        CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE,
        CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
        CPUHP_AP_PERF_ARM_L2X0_ONLINE,
        CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
        CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
        CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE,
        CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE,
        CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
        CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
        CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
        CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
        CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
        CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
        CPUHP_AP_WATCHDOG_ONLINE,
        CPUHP_AP_WORKQUEUE_ONLINE,
        CPUHP_AP_RANDOM_ONLINE,
        CPUHP_AP_RCUTREE_ONLINE,
        CPUHP_AP_BASE_CACHEINFO_ONLINE,
        /*
         * As above, but the gap between base and _END is 30 values.
         * NOTE(review): presumably the dynamic range for online-stage
         * callbacks - confirm.
         */
        CPUHP_AP_ONLINE_DYN,
        CPUHP_AP_ONLINE_DYN_END                = CPUHP_AP_ONLINE_DYN + 30,
        CPUHP_AP_X86_HPET_ONLINE,
        CPUHP_AP_X86_KVM_CLK_ONLINE,
        CPUHP_AP_ACTIVE,
        CPUHP_ONLINE,
};

int __cpuhp_setup_state(enum cpuhp_state state,        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu), bool multi_instance);

int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
                                   bool invoke,
                                   int (*startup)(unsigned int cpu),
                                   int (*teardown)(unsigned int cpu),
                                   bool multi_instance);
/**
 * cpuhp_setup_state - Install hotplug state callbacks and run the startup one
 * @state:        The state for which the calls are installed
 * @name:        Name of the callback (will be used in debug output)
 * @startup:        startup callback function
 * @teardown:        teardown callback function
 *
 * Installs the callback functions and invokes the startup callback on
 * the present cpus which have already reached the @state.
 *
 * Return: whatever __cpuhp_setup_state() returns.
 */
static inline int cpuhp_setup_state(enum cpuhp_state state,
                                    const char *name,
                                    int (*startup)(unsigned int cpu),
                                    int (*teardown)(unsigned int cpu))
{
        const bool invoke = true;                /* call @startup during setup */
        const bool multi_instance = false;        /* plain, single-instance state */

        return __cpuhp_setup_state(state, name, invoke, startup, teardown,
                                   multi_instance);
}

/*
 * Same arguments and flags as cpuhp_setup_state() (startup callback invoked,
 * single-instance state), but routed through
 * __cpuhp_setup_state_cpuslocked().
 * NOTE(review): presumably meant for callers that already hold the CPU
 * hotplug lock - confirm against the implementation.
 */
static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
                                               const char *name,
                                               int (*startup)(unsigned int cpu),
                                               int (*teardown)(unsigned int cpu))
{
        const bool invoke = true;
        const bool multi_instance = false;

        return __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                              teardown, multi_instance);
}

/**
 * cpuhp_setup_state_nocalls - Install hotplug state callbacks without calling
 *                               them
 * @state:        The state for which the calls are installed
 * @name:        Name of the callback.
 * @startup:        startup callback function
 * @teardown:        teardown callback function
 *
 * Same as cpuhp_setup_state() except that no callbacks are invoked during
 * installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
 *
 * Return: whatever __cpuhp_setup_state() returns.
 */
static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
                                            const char *name,
                                            int (*startup)(unsigned int cpu),
                                            int (*teardown)(unsigned int cpu))
{
        const bool invoke = false;                /* install only, do not call */
        const bool multi_instance = false;

        return __cpuhp_setup_state(state, name, invoke, startup, teardown,
                                   multi_instance);
}

/*
 * Same arguments and flags as cpuhp_setup_state_nocalls() (nothing invoked,
 * single-instance state), but routed through
 * __cpuhp_setup_state_cpuslocked().
 * NOTE(review): presumably meant for callers that already hold the CPU
 * hotplug lock - confirm against the implementation.
 */
static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
                                                     const char *name,
                                                     int (*startup)(unsigned int cpu),
                                                     int (*teardown)(unsigned int cpu))
{
        const bool invoke = false;
        const bool multi_instance = false;

        return __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                              teardown, multi_instance);
}

/**
 * cpuhp_setup_state_multi - Add callbacks for multi state
 * @state:        The state for which the calls are installed
 * @name:        Name of the callback.
 * @startup:        startup callback function
 * @teardown:        teardown callback function
 *
 * Sets the internal multi_instance flag and prepares a state to work as a
 * multi instance callback. No callbacks are invoked at this point. The
 * callbacks are invoked once an instance for this state is registered via
 * cpuhp_state_add_instance() or cpuhp_state_add_instance_nocalls().
 *
 * Return: whatever __cpuhp_setup_state() returns.
 */
static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
                                          const char *name,
                                          int (*startup)(unsigned int cpu,
                                                         struct hlist_node *node),
                                          int (*teardown)(unsigned int cpu,
                                                          struct hlist_node *node))
{
        /*
         * The multi-instance callbacks take an extra hlist_node argument, so
         * their type is erased before they are handed to the common setup
         * routine, whose prototype only knows the single-argument form.
         */
        void *startup_cb = (void *) startup;
        void *teardown_cb = (void *) teardown;
        const bool invoke = false;
        const bool multi_instance = true;

        return __cpuhp_setup_state(state, name, invoke, startup_cb,
                                   teardown_cb, multi_instance);
}

int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke);
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
                                          struct hlist_node *node, bool invoke);

/**
 * cpuhp_state_add_instance - Add an instance for a state and invoke the
 *                            startup callback.
 * @state:        The state for which the instance is installed
 * @node:        The node for this individual state.
 *
 * Installs the instance for the @state and invokes the startup callback on
 * the present cpus which have already reached the @state. The @state must have
 * been earlier marked as multi-instance by cpuhp_setup_state_multi().
 *
 * Return: whatever __cpuhp_state_add_instance() returns.
 */
static inline int cpuhp_state_add_instance(enum cpuhp_state state,
                                           struct hlist_node *node)
{
        const bool invoke = true;        /* run the startup callback now */

        return __cpuhp_state_add_instance(state, node, invoke);
}

/**
 * cpuhp_state_add_instance_nocalls - Add an instance for a state without
 *                                    invoking the startup callback.
 * @state:        The state for which the instance is installed
 * @node:        The node for this individual state.
 *
 * Installs the instance for the @state. The @state must have been earlier
 * marked as multi-instance by cpuhp_setup_state_multi().
 *
 * Return: whatever __cpuhp_state_add_instance() returns.
 */
static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
                                                   struct hlist_node *node)
{
        const bool invoke = false;        /* register only */

        return __cpuhp_state_add_instance(state, node, invoke);
}

/*
 * Same as cpuhp_state_add_instance_nocalls() (no startup callback invoked),
 * but routed through __cpuhp_state_add_instance_cpuslocked().
 * NOTE(review): presumably meant for callers that already hold the CPU
 * hotplug lock - confirm against the implementation.
 */
static inline int
cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
                                            struct hlist_node *node)
{
        const bool invoke = false;

        return __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
}

void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);

/**
 * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
 * @state:        The state for which the calls are removed
 *
 * Removes the callback functions and invokes the teardown callback on
 * the present cpus which have already reached the @state.
 */
static inline void cpuhp_remove_state(enum cpuhp_state state)
{
        const bool invoke = true;        /* run the teardown callback */

        __cpuhp_remove_state(state, invoke);
}

/**
 * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
 *                                the teardown callback
 * @state:        The state for which the calls are removed
 */
static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
{
        const bool invoke = false;        /* unregister only */

        __cpuhp_remove_state(state, invoke);
}

/*
 * Same as cpuhp_remove_state_nocalls() (no teardown callback invoked), but
 * routed through __cpuhp_remove_state_cpuslocked().
 * NOTE(review): presumably meant for callers that already hold the CPU
 * hotplug lock - confirm against the implementation.
 */
static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
{
        const bool invoke = false;

        __cpuhp_remove_state_cpuslocked(state, invoke);
}

/**
 * cpuhp_remove_multi_state - Remove hotplug multi state callback
 * @state:        The state for which the calls are removed
 *
 * Removes the callback functions from a multi state. This is the reverse of
 * cpuhp_setup_state_multi(). All instances should have been removed before
 * invoking this function.
 */
static inline void cpuhp_remove_multi_state(enum cpuhp_state state)
{
        const bool invoke = false;        /* no teardown callback is run here */

        __cpuhp_remove_state(state, invoke);
}

int __cpuhp_state_remove_instance(enum cpuhp_state state,
                                  struct hlist_node *node, bool invoke);

/**
 * cpuhp_state_remove_instance - Remove hotplug instance from state and invoke
 *                               the teardown callback
 * @state:        The state from which the instance is removed
 * @node:        The node for this individual state.
 *
 * Removes the instance and invokes the teardown callback on the present cpus
 * which have already reached the @state.
 *
 * Return: whatever __cpuhp_state_remove_instance() returns.
 */
static inline int cpuhp_state_remove_instance(enum cpuhp_state state,
                                              struct hlist_node *node)
{
        const bool invoke = true;        /* run the teardown callback */

        return __cpuhp_state_remove_instance(state, node, invoke);
}

/**
 * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state
 *                                         without invoking the teardown callback
 * @state:        The state from which the instance is removed
 * @node:        The node for this individual state.
 *
 * Removes the instance without invoking the teardown callback.
 *
 * Return: whatever __cpuhp_state_remove_instance() returns.
 */
static inline int cpuhp_state_remove_instance_nocalls(enum cpuhp_state state,
                                                      struct hlist_node *node)
{
        const bool invoke = false;        /* unregister only */

        return __cpuhp_state_remove_instance(state, node, invoke);
}

/*
 * cpuhp_online_idle() has an out-of-line implementation only when CONFIG_SMP
 * is set; otherwise it collapses to an empty inline stub so callers need no
 * #ifdef of their own.
 */
#ifdef CONFIG_SMP
void cpuhp_online_idle(enum cpuhp_state state);
#else
static inline void cpuhp_online_idle(enum cpuhp_state state) { }
#endif

#endif











































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NetLabel System
 *
 * The NetLabel system manages static and dynamic label mappings for network
 * protocols such as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
 */

#ifndef _NETLABEL_H
#define _NETLABEL_H

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/netlink.h>
#include <net/request_sock.h>
#include <linux/refcount.h>

struct cipso_v4_doi;
struct calipso_doi;

/*
 * NetLabel - A management interface for maintaining network packet label
 *            mapping tables for explicit packet labeling protocols.
 *
 * Network protocols such as CIPSO and RIPSO require a label translation layer
 * to convert the label on the packet into something meaningful on the host
 * machine.  In the current Linux implementation these mapping tables live
 * inside the kernel; NetLabel provides a mechanism for user space applications
 * to manage these mapping tables.
 *
 * NetLabel makes use of the Generic NETLINK mechanism as a transport layer to
 * send messages between kernel and user space.  The general format of a
 * NetLabel message is shown below:
 *
 *  +-----------------+-------------------+--------- --- -- -
 *  | struct nlmsghdr | struct genlmsghdr | payload
 *  +-----------------+-------------------+--------- --- -- -
 *
 * The 'nlmsghdr' and 'genlmsghdr' structs should be dealt with like normal.
 * The payload is dependent on the subsystem specified in the
 * 'nlmsghdr->nlmsg_type' and should be defined below, supporting functions
 * should be defined in the corresponding net/netlabel/netlabel_<subsys>.h|c
 * file.  All of the fields in the NetLabel payload are NETLINK attributes, see
 * the include/net/netlink.h file for more information on NETLINK attributes.
 *
 */

/*
 * NetLabel NETLINK protocol
 */

/* NetLabel NETLINK protocol version
 *  1: initial version
 *  2: added static labels for unlabeled connections
 *  3: network selectors added to the NetLabel/LSM domain mapping and the
 *     CIPSO_V4_MAP_LOCAL CIPSO mapping was added
 */
#define NETLBL_PROTO_VERSION            3

/* NetLabel NETLINK types/families
 *
 * Every component gets a numeric type; all but NETLBL_NLTYPE_NONE also get a
 * matching *_NAME string.
 * NOTE(review): the strings are presumably the Generic NETLINK family names
 * registered by each component - confirm in net/netlabel/.
 */
#define NETLBL_NLTYPE_NONE              0
#define NETLBL_NLTYPE_MGMT              1
#define NETLBL_NLTYPE_MGMT_NAME         "NLBL_MGMT"
#define NETLBL_NLTYPE_RIPSO             2
#define NETLBL_NLTYPE_RIPSO_NAME        "NLBL_RIPSO"
#define NETLBL_NLTYPE_CIPSOV4           3
#define NETLBL_NLTYPE_CIPSOV4_NAME      "NLBL_CIPSOv4"
#define NETLBL_NLTYPE_CIPSOV6           4
#define NETLBL_NLTYPE_CIPSOV6_NAME      "NLBL_CIPSOv6"
#define NETLBL_NLTYPE_UNLABELED         5
#define NETLBL_NLTYPE_UNLABELED_NAME    "NLBL_UNLBL"
#define NETLBL_NLTYPE_ADDRSELECT        6
#define NETLBL_NLTYPE_ADDRSELECT_NAME   "NLBL_ADRSEL"
#define NETLBL_NLTYPE_CALIPSO           7
#define NETLBL_NLTYPE_CALIPSO_NAME      "NLBL_CALIPSO"

/*
 * NetLabel - Kernel API for accessing the network packet label mappings.
 *
 * The following functions are provided for use by other kernel modules,
 * specifically kernel LSM modules, to provide a consistent, transparent API
 * for dealing with explicit packet labeling protocols such as CIPSO and
 * RIPSO.  The functions defined here are implemented in the
 * net/netlabel/netlabel_kapi.c file.
 *
 */

/* NetLabel audit information
 *
 * Passed as the 'audit_info' argument of the netlbl_cfg_*() configuration
 * calls and of netlbl_audit_start().
 * NOTE(review): the fields presumably identify the requester (LSM secid,
 * audit login uid and audit session id) - confirm against the callers.
 */
struct netlbl_audit {
        u32 secid;
        kuid_t loginuid;
        unsigned int sessionid;
};

/*
 * LSM security attributes
 */

/**
 * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
 * @refcount: atomic reference counter
 * @free: LSM supplied function to free the cache data
 * @data: LSM supplied cache data
 *
 * Description:
 * This structure is provided for LSMs which wish to make use of the NetLabel
 * caching mechanism to store LSM specific data/attributes in the NetLabel
 * cache.  If the LSM has to perform a lot of translation from the NetLabel
 * security attributes into its own internal representation then the cache
 * mechanism can provide a way to eliminate some or all of that translation
 * overhead on a cache hit.
 *
 * netlbl_secattr_cache_alloc() creates the structure with @refcount set to
 * one; netlbl_secattr_cache_free() drops a reference and, on the last one,
 * calls @free (when set) on @data before releasing the structure.
 *
 */
struct netlbl_lsm_cache {
        refcount_t refcount;
        void (*free) (const void *data);
        void *data;
};

/**
 * struct netlbl_lsm_catmap - NetLabel LSM secattr category bitmap
 * @startbit: the value of the lowest order bit in the bitmap
 * @bitmap: the category bitmap
 * @next: pointer to the next bitmap "node" or NULL
 *
 * Description:
 * This structure is used to represent category bitmaps.  Due to the large
 * number of categories supported by most labeling protocols it is not
 * practical to transfer a full bitmap internally so NetLabel adopts a sparse
 * bitmap structure modeled after SELinux's ebitmap structure.
 * The catmap bitmap field MUST be a power of two in length and large
 * enough to hold at least 240 bits.  Special care (i.e. check the code!)
 * should be used when changing these values as the LSM implementation
 * probably has functions which rely on the sizes of these types to speed
 * processing.
 *
 * Each node carries NETLBL_CATMAP_MAPCNT words of NETLBL_CATMAP_MAPTYPE, i.e.
 * NETLBL_CATMAP_SIZE bits in total.  NETLBL_CATMAP_BIT is the value one in
 * the word type; its expansion is fully parenthesized so it stays a single
 * operand wherever it is used.
 *
 */
#define NETLBL_CATMAP_MAPTYPE           u64
#define NETLBL_CATMAP_MAPCNT            4
#define NETLBL_CATMAP_MAPSIZE           (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
#define NETLBL_CATMAP_SIZE              (NETLBL_CATMAP_MAPSIZE * \
                                         NETLBL_CATMAP_MAPCNT)
#define NETLBL_CATMAP_BIT               ((NETLBL_CATMAP_MAPTYPE)0x01)
struct netlbl_lsm_catmap {
        u32 startbit;
        NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
        struct netlbl_lsm_catmap *next;
};

/**
 * struct netlbl_lsm_secattr - NetLabel LSM security attributes
 * @flags: indicate structure attributes, see NETLBL_SECATTR_*
 * @type: indicate the NLTYPE of the attributes
 * @domain: the NetLabel LSM domain
 * @cache: NetLabel LSM specific cache
 * @attr.mls: MLS sensitivity label
 * @attr.mls.cat: MLS category bitmap
 * @attr.mls.lvl: MLS sensitivity level
 * @attr.secid: LSM specific secid token
 *
 * Description:
 * This structure is used to pass security attributes between NetLabel and the
 * LSM modules.  The flags field is used to specify which fields within the
 * struct are valid and valid values can be created by bitwise OR'ing the
 * NETLBL_SECATTR_* defines.  The domain field is typically set by the LSM to
 * specify domain specific configuration settings and is not usually used by
 * NetLabel itself when returning security attributes to the LSM.
 *
 * The flags also drive cleanup: netlbl_secattr_destroy() frees @domain when
 * NETLBL_SECATTR_FREE_DOMAIN is set, drops the @cache reference when
 * NETLBL_SECATTR_CACHE is set and frees @attr.mls.cat when
 * NETLBL_SECATTR_MLS_CAT is set.
 *
 */
struct netlbl_lsm_secattr {
        u32 flags;
        /* bitmap values for 'flags' */
#define NETLBL_SECATTR_NONE             0x00000000
#define NETLBL_SECATTR_DOMAIN           0x00000001
        /* DOMAIN plus the FREE_DOMAIN meta-flag defined further down */
#define NETLBL_SECATTR_DOMAIN_CPY       (NETLBL_SECATTR_DOMAIN | \
                                         NETLBL_SECATTR_FREE_DOMAIN)
#define NETLBL_SECATTR_CACHE            0x00000002
#define NETLBL_SECATTR_MLS_LVL          0x00000004
#define NETLBL_SECATTR_MLS_CAT          0x00000008
#define NETLBL_SECATTR_SECID            0x00000010
        /* bitmap meta-values for 'flags' */
#define NETLBL_SECATTR_FREE_DOMAIN      0x01000000
#define NETLBL_SECATTR_CACHEABLE        (NETLBL_SECATTR_MLS_LVL | \
                                         NETLBL_SECATTR_MLS_CAT | \
                                         NETLBL_SECATTR_SECID)
        u32 type;
        char *domain;
        struct netlbl_lsm_cache *cache;
        struct {
                struct {
                        struct netlbl_lsm_catmap *cat;
                        u32 lvl;
                } mls;
                u32 secid;
        } attr;
};

/**
 * struct netlbl_calipso_ops - NetLabel CALIPSO operations
 * @doi_add: add a CALIPSO DOI
 * @doi_free: free a CALIPSO DOI
 * @doi_remove: remove a CALIPSO DOI
 * @doi_getdef: returns a reference to a DOI
 * @doi_putdef: releases a reference of a DOI
 * @doi_walk: enumerate the DOI list
 * @sock_getattr: retrieve the socket's attr
 * @sock_setattr: set the socket's attr
 * @sock_delattr: remove the socket's attr
 * @req_setattr: set the req socket's attr
 * @req_delattr: remove the req socket's attr
 * @opt_getattr: retrieve attr from memory block
 * @skbuff_optptr: find option in packet
 * @skbuff_setattr: set the skbuff's attr
 * @skbuff_delattr: remove the skbuff's attr
 * @cache_invalidate: invalidate cache
 * @cache_add: add cache entry
 *
 * Description:
 * This structure is filled out by the CALIPSO engine and passed
 * to the NetLabel core via a call to netlbl_calipso_ops_register().
 * It enables the CALIPSO engine (and hence IPv6) to be compiled
 * as a module.
 */
struct netlbl_calipso_ops {
        int (*doi_add)(struct calipso_doi *doi_def,
                       struct netlbl_audit *audit_info);
        void (*doi_free)(struct calipso_doi *doi_def);
        int (*doi_remove)(u32 doi, struct netlbl_audit *audit_info);
        struct calipso_doi *(*doi_getdef)(u32 doi);
        void (*doi_putdef)(struct calipso_doi *doi_def);
        int (*doi_walk)(u32 *skip_cnt,
                        int (*callback)(struct calipso_doi *doi_def, void *arg),
                        void *cb_arg);
        int (*sock_getattr)(struct sock *sk,
                            struct netlbl_lsm_secattr *secattr);
        int (*sock_setattr)(struct sock *sk,
                            const struct calipso_doi *doi_def,
                            const struct netlbl_lsm_secattr *secattr);
        void (*sock_delattr)(struct sock *sk);
        int (*req_setattr)(struct request_sock *req,
                           const struct calipso_doi *doi_def,
                           const struct netlbl_lsm_secattr *secattr);
        void (*req_delattr)(struct request_sock *req);
        int (*opt_getattr)(const unsigned char *calipso,
                           struct netlbl_lsm_secattr *secattr);
        unsigned char *(*skbuff_optptr)(const struct sk_buff *skb);
        int (*skbuff_setattr)(struct sk_buff *skb,
                              const struct calipso_doi *doi_def,
                              const struct netlbl_lsm_secattr *secattr);
        int (*skbuff_delattr)(struct sk_buff *skb);
        void (*cache_invalidate)(void);
        int (*cache_add)(const unsigned char *calipso_ptr,
                         const struct netlbl_lsm_secattr *secattr);
};

/*
 * LSM security attribute operations (inline)
 */

/**
 * netlbl_secattr_cache_alloc - Allocate and initialize a secattr cache
 * @flags: the memory allocation flags
 *
 * Description:
 * Allocate a zeroed netlbl_lsm_cache structure and set its reference count
 * to one.  Returns a pointer on success, NULL on failure.
 *
 */
static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags)
{
        struct netlbl_lsm_cache *entry = kzalloc(sizeof(*entry), flags);

        if (!entry)
                return NULL;

        /* the caller owns the initial reference */
        refcount_set(&entry->refcount, 1);
        return entry;
}

/**
 * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct
 * @cache: the struct to free
 *
 * Description:
 * Drops one reference on @cache.  When that was the last reference the LSM
 * supplied free callback, if any, is run on the cache data and @cache itself
 * is released.
 *
 */
static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache)
{
        if (refcount_dec_and_test(&cache->refcount)) {
                /* last user gone: release the LSM data, then the wrapper */
                if (cache->free)
                        cache->free(cache->data);
                kfree(cache);
        }
}

/**
 * netlbl_catmap_alloc - Allocate a LSM secattr catmap
 * @flags: memory allocation flags
 *
 * Description:
 * Allocate zeroed memory for a single LSM secattr catmap node.  Returns a
 * pointer on success, NULL on failure.
 *
 */
static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags)
{
        struct netlbl_lsm_catmap *node = kzalloc(sizeof(*node), flags);

        return node;
}

/**
 * netlbl_catmap_free - Free a LSM secattr catmap
 * @catmap: the category bitmap
 *
 * Description:
 * Free every node of the LSM secattr catmap list starting at @catmap.  A NULL
 * @catmap is accepted and does nothing.
 *
 */
static inline void netlbl_catmap_free(struct netlbl_lsm_catmap *catmap)
{
        struct netlbl_lsm_catmap *node;
        struct netlbl_lsm_catmap *following;

        for (node = catmap; node; node = following) {
                /* fetch the link before the node goes away */
                following = node->next;
                kfree(node);
        }
}

/**
 * netlbl_secattr_init - Initialize a netlbl_lsm_secattr struct
 * @secattr: the struct to initialize
 *
 * Description:
 * Zero every byte of an already allocated netlbl_lsm_secattr struct.
 *
 */
static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
{
        memset(secattr, 0, sizeof(struct netlbl_lsm_secattr));
}

/**
 * netlbl_secattr_destroy - Clears a netlbl_lsm_secattr struct
 * @secattr: the struct to clear
 *
 * Description:
 * Destroys the @secattr struct, including freeing all of the internal buffers.
 * The struct must be reset with a call to netlbl_secattr_init() before reuse.
 *
 */
static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
{
        if (secattr->flags & NETLBL_SECATTR_FREE_DOMAIN)
                kfree(secattr->domain);
        if (secattr->flags & NETLBL_SECATTR_CACHE)
                netlbl_secattr_cache_free(secattr->cache);
        if (secattr->flags & NETLBL_SECATTR_MLS_CAT)
                netlbl_catmap_free(secattr->attr.mls.cat);
}

/**
 * netlbl_secattr_alloc - Allocate and initialize a netlbl_lsm_secattr struct
 * @flags: the memory allocation flags
 *
 * Description:
 * Allocate a zero-filled netlbl_lsm_secattr struct.  Returns a valid pointer
 * on success, or NULL on failure.
 *
 */
static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags)
{
        struct netlbl_lsm_secattr *secattr = kzalloc(sizeof(*secattr), flags);

        return secattr;
}

/**
 * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct
 * @secattr: the struct to free, may be NULL
 *
 * Description:
 * Frees @secattr including all of the internal buffers.  A NULL @secattr is
 * a no-op, as with kfree(), so error paths can call this unconditionally.
 *
 */
static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr)
{
        /* netlbl_secattr_destroy() dereferences its argument */
        if (!secattr)
                return;

        netlbl_secattr_destroy(secattr);
        kfree(secattr);
}

#ifdef CONFIG_NETLABEL
/*
 * LSM configuration operations
 */
int netlbl_cfg_map_del(const char *domain,
                       u16 family,
                       const void *addr,
                       const void *mask,
                       struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_map_add(const char *domain,
                             u16 family,
                             const void *addr,
                             const void *mask,
                             struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_static_add(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                u32 secid,
                                struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_static_del(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                struct netlbl_audit *audit_info);
int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
                           struct netlbl_audit *audit_info);
void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info);
int netlbl_cfg_cipsov4_map_add(u32 doi,
                               const char *domain,
                               const struct in_addr *addr,
                               const struct in_addr *mask,
                               struct netlbl_audit *audit_info);
int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                           struct netlbl_audit *audit_info);
void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info);
int netlbl_cfg_calipso_map_add(u32 doi,
                               const char *domain,
                               const struct in6_addr *addr,
                               const struct in6_addr *mask,
                               struct netlbl_audit *audit_info);
/*
 * LSM security attribute operations
 */
int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset);
int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset);
int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
                          u32 *offset,
                          unsigned long *bitmap);
int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
                         u32 bit,
                         gfp_t flags);
int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
                         u32 start,
                         u32 end,
                         gfp_t flags);
int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
                          u32 offset,
                          unsigned long bitmap,
                          gfp_t flags);

/* Bitmap functions
 */
int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
                       u32 offset, u8 state);
void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state);

/*
 * LSM protocol operations (NetLabel LSM/kernel API)
 */
int netlbl_enabled(void);
int netlbl_sock_setattr(struct sock *sk,
                        u16 family,
                        const struct netlbl_lsm_secattr *secattr);
void netlbl_sock_delattr(struct sock *sk);
int netlbl_sock_getattr(struct sock *sk,
                        struct netlbl_lsm_secattr *secattr);
int netlbl_conn_setattr(struct sock *sk,
                        struct sockaddr *addr,
                        const struct netlbl_lsm_secattr *secattr);
int netlbl_req_setattr(struct request_sock *req,
                       const struct netlbl_lsm_secattr *secattr);
void netlbl_req_delattr(struct request_sock *req);
int netlbl_skbuff_setattr(struct sk_buff *skb,
                          u16 family,
                          const struct netlbl_lsm_secattr *secattr);
int netlbl_skbuff_getattr(const struct sk_buff *skb,
                          u16 family,
                          struct netlbl_lsm_secattr *secattr);
void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway);

/*
 * LSM label mapping cache operations
 */
void netlbl_cache_invalidate(void);
int netlbl_cache_add(const struct sk_buff *skb, u16 family,
                     const struct netlbl_lsm_secattr *secattr);

/*
 * Protocol engine operations
 */
struct audit_buffer *netlbl_audit_start(int type,
                                        struct netlbl_audit *audit_info);
#else
/* CONFIG_NETLABEL=n stub: nothing to configure, always returns -ENOSYS. */
static inline int netlbl_cfg_map_del(const char *domain, u16 family,
                                     const void *addr, const void *mask,
                                     struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * CONFIG_NETLABEL=n stub: always returns -ENOSYS.  @addr and @mask are
 * const-qualified exactly like the real prototype, so callers passing const
 * pointers build without warnings in either configuration.
 */
static inline int netlbl_cfg_unlbl_map_add(const char *domain,
                                           u16 family,
                                           const void *addr,
                                           const void *mask,
                                           struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/* CONFIG_NETLABEL=n stub: nothing to configure, always returns -ENOSYS. */
static inline int netlbl_cfg_unlbl_static_add(struct net *net,
                                              const char *dev_name,
                                              const void *addr, const void *mask,
                                              u16 family, u32 secid,
                                              struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: nothing is
 * removed; always returns -ENOSYS.
 */
static inline int netlbl_cfg_unlbl_static_del(struct net *net,
                                              const char *dev_name,
                                              const void *addr,
                                              const void *mask,
                                              u16 family,
                                              struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @doi_def is
 * not registered; always returns -ENOSYS.
 */
static inline int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
                                         struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: there is
 * nothing to delete, so both arguments are ignored.
 */
static inline void netlbl_cfg_cipsov4_del(u32 doi,
                                          struct netlbl_audit *audit_info)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: no mapping is
 * created; always returns -ENOSYS.
 */
static inline int netlbl_cfg_cipsov4_map_add(u32 doi,
                                             const char *domain,
                                             const struct in_addr *addr,
                                             const struct in_addr *mask,
                                             struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @doi_def is
 * not registered; always returns -ENOSYS.
 */
static inline int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                                         struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: there is
 * nothing to delete, so both arguments are ignored.
 */
static inline void netlbl_cfg_calipso_del(u32 doi,
                                          struct netlbl_audit *audit_info)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: no mapping is
 * created; always returns -ENOSYS.
 */
static inline int netlbl_cfg_calipso_map_add(u32 doi,
                                             const char *domain,
                                             const struct in6_addr *addr,
                                             const struct in6_addr *mask,
                                             struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @catmap is
 * never examined; always returns -ENOENT.
 */
static inline int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap,
                                     u32 offset)
{
        return -ENOENT;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @catmap is
 * never examined; always returns -ENOENT.
 */
static inline int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap,
                                        u32 offset)
{
        return -ENOENT;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: returns 0
 * without writing to *@offset or *@bitmap.
 */
static inline int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
                                        u32 *offset,
                                        unsigned long *bitmap)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: returns 0
 * without touching *@catmap.
 */
static inline int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
                                       u32 bit,
                                       gfp_t flags)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: returns 0
 * without touching *@catmap.
 */
static inline int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
                                       u32 start,
                                       u32 end,
                                       gfp_t flags)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: returns 0
 * without touching *@catmap.
 */
static inline int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
                                        u32 offset,
                                        unsigned long bitmap,
                                        gfp_t flags)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: always
 * returns 0.
 */
static inline int netlbl_enabled(void)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @sk is left
 * unmodified; always returns -ENOSYS.
 */
static inline int netlbl_sock_setattr(struct sock *sk,
                                      u16 family,
                                      const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: does nothing
 * with @sk.
 */
static inline void netlbl_sock_delattr(struct sock *sk)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @secattr is
 * not filled in; always returns -ENOSYS.
 */
static inline int netlbl_sock_getattr(struct sock *sk,
                                      struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @sk is left
 * unmodified; always returns -ENOSYS.
 */
static inline int netlbl_conn_setattr(struct sock *sk,
                                      struct sockaddr *addr,
                                      const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @req is left
 * unmodified; always returns -ENOSYS.
 */
static inline int netlbl_req_setattr(struct request_sock *req,
                                     const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: does nothing
 * with @req.
 */
static inline void netlbl_req_delattr(struct request_sock *req)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @skb is left
 * unmodified; always returns -ENOSYS.
 */
static inline int netlbl_skbuff_setattr(struct sk_buff *skb,
                                      u16 family,
                                      const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: @secattr is
 * not filled in; always returns -ENOSYS.
 */
static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
                                        u16 family,
                                        struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: does nothing.
 *
 * The parameter list mirrors the real prototype declared on the other side
 * of the conditional, netlbl_skbuff_err(skb, family, error, gateway), so
 * that a caller compiles unchanged whichever side is built.
 */
static inline void netlbl_skbuff_err(struct sk_buff *skb,
                                     u16 family,
                                     int error,
                                     int gateway)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: there is no
 * cache to invalidate, so this does nothing.
 */
static inline void netlbl_cache_invalidate(void)
{
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: nothing is
 * cached; always returns 0.
 */
static inline int netlbl_cache_add(const struct sk_buff *skb, u16 family,
                                   const struct netlbl_lsm_secattr *secattr)
{
        return 0;
}
/*
 * Stub on the #else side of the CONFIG_NETLABEL conditional: no audit
 * buffer is started; always returns NULL.
 */
static inline struct audit_buffer *netlbl_audit_start(int type,
                                                struct netlbl_audit *audit_info)
{
        return NULL;
}
#endif /* CONFIG_NETLABEL */

const struct netlbl_calipso_ops *
netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops);

#endif /* _NETLABEL_H */








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_DST_METADATA_H
#define __NET_DST_METADATA_H 1

#include <linux/skbuff.h>
#include <net/ip_tunnels.h>
#include <net/dst.h>

/*
 * Discriminator stored in metadata_dst.type: it tells which member of the
 * metadata_dst.u union the helpers in this header read.
 */
enum metadata_type {
        METADATA_IP_TUNNEL,        /* u.tun_info is in use */
        METADATA_HW_PORT_MUX,        /* u.port_info is in use */
};

/*
 * Payload of a METADATA_HW_PORT_MUX metadata dst (metadata_dst.u.port_info).
 * NOTE(review): presumably @port_id names a hardware port reached through
 * @lower_dev -- confirm against the users of this struct.
 */
struct hw_port_info {
        struct net_device *lower_dev;
        u32 port_id;
};

/*
 * A dst_entry that carries metadata instead of routing information.
 *
 * @dst must stay the first member: skb_metadata_dst() casts the pointer
 * returned by skb_dst() straight to struct metadata_dst *.
 * @type selects the active member of @u.
 * For @u.tun_info, helpers in this header copy and compare
 * sizeof(tun_info) + options_len bytes starting at tun_info, i.e. they
 * expect the tunnel options to sit directly behind it.
 */
struct metadata_dst {
        struct dst_entry                dst;
        enum metadata_type                type;
        union {
                struct ip_tunnel_info        tun_info;
                struct hw_port_info        port_info;
        } u;
};

static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb)
{
        struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb);

        if (md_dst && md_dst->dst.flags & DST_METADATA)
                return md_dst;

        return NULL;
}

/*
 * Find the IP tunnel info attached to @skb.
 *
 * A METADATA_IP_TUNNEL metadata dst is preferred; failing that, the
 * lwtstate of the skb's dst is used when its type is LWTUNNEL_ENCAP_IP or
 * LWTUNNEL_ENCAP_IP6. Returns NULL when neither source applies.
 */
static inline struct ip_tunnel_info *
skb_tunnel_info(const struct sk_buff *skb)
{
        struct metadata_dst *md = skb_metadata_dst(skb);
        struct dst_entry *route;

        if (md && md->type == METADATA_IP_TUNNEL)
                return &md->u.tun_info;

        route = skb_dst(skb);
        if (!route || !route->lwtstate)
                return NULL;

        switch (route->lwtstate->type) {
        case LWTUNNEL_ENCAP_IP:
        case LWTUNNEL_ENCAP_IP6:
                return lwt_tun_info(route->lwtstate);
        default:
                return NULL;
        }
}

static inline bool skb_valid_dst(const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        return dst && !(dst->flags & DST_METADATA);
}

/*
 * Compare the metadata dsts of two skbs.
 *
 * Returns 0 when neither skb holds a dst reference. Returns 1 when only one
 * of them has a dst, when the metadata types differ, or when the type is
 * not one handled below. Otherwise returns the memcmp() result over the
 * type-specific payload (for IP tunnels this includes options_len bytes
 * following tun_info).
 */
static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
                                       const struct sk_buff *skb_b)
{
        const struct metadata_dst *md_a, *md_b;

        if (!skb_a->_skb_refdst && !skb_b->_skb_refdst)
                return 0;

        md_a = (const struct metadata_dst *) skb_dst(skb_a);
        md_b = (const struct metadata_dst *) skb_dst(skb_b);

        if (!md_a != !md_b)
                return 1;
        if (md_a->type != md_b->type)
                return 1;

        if (md_a->type == METADATA_HW_PORT_MUX)
                return memcmp(&md_a->u.port_info, &md_b->u.port_info,
                              sizeof(md_a->u.port_info));

        if (md_a->type == METADATA_IP_TUNNEL)
                return memcmp(&md_a->u.tun_info, &md_b->u.tun_info,
                              sizeof(md_a->u.tun_info) +
                                         md_a->u.tun_info.options_len);

        return 1;
}

void metadata_dst_free(struct metadata_dst *);
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags);
void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst);
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);

/*
 * Allocate a METADATA_IP_TUNNEL metadata dst with GFP_ATOMIC, passing
 * @md_size to metadata_dst_alloc(), and reset tun_info.options_len and
 * tun_info.mode to 0. Returns NULL if the allocation fails.
 */
static inline struct metadata_dst *tun_rx_dst(int md_size)
{
        struct metadata_dst *md;

        md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
        if (md) {
                md->u.tun_info.options_len = 0;
                md->u.tun_info.mode = 0;
        }

        return md;
}

/*
 * Replace the IP-tunnel metadata dst of @skb with a freshly allocated copy.
 *
 * Returns the new metadata dst on success. Returns ERR_PTR(-EINVAL) when
 * @skb has no metadata dst of type METADATA_IP_TUNNEL, ERR_PTR(-ENOMEM)
 * when the allocation fails, or ERR_PTR() of the dst_cache_init() error.
 * On any error the skb's current dst is left in place.
 */
static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
{
        struct metadata_dst *md_dst = skb_metadata_dst(skb);
        int md_size;
        struct metadata_dst *new_md;

        if (!md_dst || md_dst->type != METADATA_IP_TUNNEL)
                return ERR_PTR(-EINVAL);

        /* Size the copy for the options carried by the current tun_info. */
        md_size = md_dst->u.tun_info.options_len;
        new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
        if (!new_md)
                return ERR_PTR(-ENOMEM);

        /* Copy tun_info together with the md_size option bytes behind it. */
        memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
               sizeof(struct ip_tunnel_info) + md_size);
#ifdef CONFIG_DST_CACHE
        /* Unclone the dst cache if there is one */
        if (new_md->u.tun_info.dst_cache.cache) {
                int ret;

                /*
                 * The memcpy() above duplicated the cache pointer of the
                 * old metadata; re-initialise it so the copy has its own.
                 */
                ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC);
                if (ret) {
                        metadata_dst_free(new_md);
                        return ERR_PTR(ret);
                }
        }
#endif

        /* Swap the skb over to the private copy. */
        skb_dst_drop(skb);
        skb_dst_set(skb, &new_md->dst);
        return new_md;
}

/*
 * Unclone the tunnel metadata dst of @skb via tun_dst_unclone() and return
 * the tun_info of the new copy, or NULL if uncloning reported an error.
 */
static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb)
{
        struct metadata_dst *md = tun_dst_unclone(skb);

        return IS_ERR(md) ? NULL : &md->u.tun_info;
}

/*
 * Allocate a tunnel metadata dst through tun_rx_dst(@md_size) and fill its
 * tunnel key with ip_tunnel_key_init() from the given addresses, tos, ttl,
 * destination port, tunnel id and flags; the two arguments between @ttl
 * and @tp_dst are passed as 0. Returns NULL if the allocation fails.
 */
static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
                                                    __be32 daddr,
                                                    __u8 tos, __u8 ttl,
                                                    __be16 tp_dst,
                                                    __be16 flags,
                                                    __be64 tunnel_id,
                                                    int md_size)
{
        struct metadata_dst *md = tun_rx_dst(md_size);

        if (md)
                ip_tunnel_key_init(&md->u.tun_info.key,
                                   saddr, daddr, tos, ttl,
                                   0, 0, tp_dst, tunnel_id, flags);

        return md;
}

/*
 * Build a tunnel metadata dst from the header returned by ip_hdr(@skb):
 * its saddr, daddr, tos and ttl are forwarded to __ip_tun_set_dst() with a
 * destination port of 0. Returns whatever __ip_tun_set_dst() returns.
 */
static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
                                                 __be16 flags,
                                                 __be64 tunnel_id,
                                                 int md_size)
{
        const struct iphdr *hdr = ip_hdr(skb);

        return __ip_tun_set_dst(hdr->saddr, hdr->daddr,
                                hdr->tos, hdr->ttl,
                                0, flags, tunnel_id, md_size);
}

/*
 * Allocate a tunnel metadata dst through tun_rx_dst(@md_size), mark it as
 * IP_TUNNEL_INFO_IPV6 and fill the tunnel key field by field from the
 * arguments; key.tp_src is set to 0. Returns NULL if the allocation fails.
 */
static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
                                                      const struct in6_addr *daddr,
                                                      __u8 tos, __u8 ttl,
                                                      __be16 tp_dst,
                                                      __be32 label,
                                                      __be16 flags,
                                                      __be64 tunnel_id,
                                                      int md_size)
{
        struct metadata_dst *md = tun_rx_dst(md_size);
        struct ip_tunnel_info *tun;

        if (!md)
                return NULL;

        tun = &md->u.tun_info;
        tun->mode = IP_TUNNEL_INFO_IPV6;

        /* Endpoint addresses. */
        tun->key.u.ipv6.src = *saddr;
        tun->key.u.ipv6.dst = *daddr;

        /* Tunnel identity and transport ports. */
        tun->key.tun_id = tunnel_id;
        tun->key.tun_flags = flags;
        tun->key.tp_src = 0;
        tun->key.tp_dst = tp_dst;

        /* Remaining header-derived fields. */
        tun->key.tos = tos;
        tun->key.ttl = ttl;
        tun->key.label = label;

        return md;
}

/*
 * Build a tunnel metadata dst from the header returned by ipv6_hdr(@skb):
 * its addresses, DS field, hop limit and flow label are forwarded to
 * __ipv6_tun_set_dst() with a destination port of 0.
 */
static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
                                                   __be16 flags,
                                                   __be64 tunnel_id,
                                                   int md_size)
{
        const struct ipv6hdr *hdr = ipv6_hdr(skb);

        return __ipv6_tun_set_dst(&hdr->saddr, &hdr->daddr,
                                  ipv6_get_dsfield(hdr), hdr->hop_limit,
                                  0, ip6_flowlabel(hdr), flags, tunnel_id,
                                  md_size);
}
#endif /* __NET_DST_METADATA_H */

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM mmap

#if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MMAP_H

#include <linux/tracepoint.h>

/*
 * vm_unmapped_area trace event.
 *
 * Records @addr, the total_vm of current->mm, and the flags, length,
 * low/high limits, alignment mask and alignment offset of @info.
 *
 * When printed, @addr is split in two: if IS_ERR_VALUE(addr) holds, "addr"
 * is shown as 0 and "err" carries the value; otherwise "addr" carries the
 * value and "err" is 0.
 */
TRACE_EVENT(vm_unmapped_area,

        TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info),

        TP_ARGS(addr, info),

        TP_STRUCT__entry(
                __field(unsigned long,        addr)
                __field(unsigned long,        total_vm)
                __field(unsigned long,        flags)
                __field(unsigned long,        length)
                __field(unsigned long,        low_limit)
                __field(unsigned long,        high_limit)
                __field(unsigned long,        align_mask)
                __field(unsigned long,        align_offset)
        ),

        TP_fast_assign(
                __entry->addr = addr;
                __entry->total_vm = current->mm->total_vm;
                __entry->flags = info->flags;
                __entry->length = info->length;
                __entry->low_limit = info->low_limit;
                __entry->high_limit = info->high_limit;
                __entry->align_mask = info->align_mask;
                __entry->align_offset = info->align_offset;
        ),

        TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx\n",
                IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr,
                IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0,
                __entry->total_vm, __entry->flags, __entry->length,
                __entry->low_limit, __entry->high_limit, __entry->align_mask,
                __entry->align_offset)
);
#endif

/* This part must be outside protection */
#include <trace/define_trace.h>









































    4 




















    2 






















































    4 

    4 
    4 



    4 

    4 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>

#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>

/*
 * IS_UNALIGNED(src, dst) - non-zero when either pointer is not a multiple
 * of sizeof(long); always 0 when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * defined.
 *
 * Both arguments are parenthesized before the cast so that compound
 * argument expressions (e.g. a conditional or pointer arithmetic) are cast
 * as a whole rather than having the cast bind to their first operand.
 */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst)        0
#else
#define IS_UNALIGNED(src, dst)        \
        (((long) (dst) | (long) (src)) & (sizeof(long) - 1))
#endif

/*
 * Do a strncpy, return length of string without final '\0'.
 * 'count' is the user-supplied count (return 'count' if we
 * hit it), 'max' is the address space maximum (and we return
 * -EFAULT if we hit it).
 *
 * The copy runs a word at a time while at least sizeof(unsigned long)
 * bytes of 'max' remain, then finishes a byte at a time. A fault during a
 * word read only demotes the copy to the byte loop; a fault during a byte
 * read ends the copy with -EFAULT.
 */
static inline long do_strncpy_from_user(char *dst, const char __user *src,
                                        unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        /* Number of bytes copied so far; also the offset into src and dst. */
        unsigned long res = 0;

        if (IS_UNALIGNED(src, dst))
                goto byte_at_a_time;

        while (max >= sizeof(unsigned long)) {
                unsigned long c, data, mask;

                /* Fall back to byte-at-a-time if we get a page fault */
                unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);

                /*
                 * Note that we mask out the bytes following the NUL. This is
                 * important to do because string oblivious code may read past
                 * the NUL. For those routines, we don't want to give them
                 * potentially random bytes after the NUL in `src`.
                 *
                 * One example of such code is BPF map keys. BPF treats map keys
                 * as an opaque set of bytes. Without the post-NUL mask, any BPF
                 * maps keyed by strings returned from strncpy_from_user() may
                 * have multiple entries for semantically identical strings.
                 */
                if (has_zero(c, &data, &constants)) {
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        mask = zero_bytemask(data);
                        *(unsigned long *)(dst+res) = c & mask;
                        /* Length is the bytes before this word plus the NUL's offset in it. */
                        return res + find_zero(data);
                }

                /* No NUL in this word: store it whole and advance. */
                *(unsigned long *)(dst+res) = c;

                res += sizeof(unsigned long);
                max -= sizeof(unsigned long);
        }

byte_at_a_time:
        while (max) {
                char c;

                /* A fault here is final: jump to the -EFAULT exit. */
                unsafe_get_user(c,src+res, efault);
                dst[res] = c;
                if (!c)
                        return res;
                res++;
                max--;
        }

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, that's ok - we got as much as the user asked for.
         */
        if (res >= count)
                return res;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's an EFAULT.
         */
efault:
        return -EFAULT;
}

/**
 * strncpy_from_user() - Copy a NUL terminated string from userspace.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @src:   Source address, in user space.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from userspace to kernel space.
 *
 * On success, returns the length of the string (not including the trailing
 * NUL).
 *
 * If access to userspace fails, returns -EFAULT (some data may have been
 * copied). -EFAULT is also returned, before anything is copied, when
 * should_fail_usercopy() reports a failure or when @src is not below
 * user_addr_max().
 *
 * If @count is smaller than the length of the string, copies @count bytes
 * and returns @count.
 *
 * If @count is zero or negative, returns 0 without copying anything.
 */
long strncpy_from_user(char *dst, const char __user *src, long count)
{
        unsigned long max_addr, src_addr;

        might_fault();
        if (should_fail_usercopy())
                return -EFAULT;
        if (unlikely(count <= 0))
                return 0;

        max_addr = user_addr_max();
        src_addr = (unsigned long)untagged_addr(src);
        if (likely(src_addr < max_addr)) {
                /* Bytes available between src and the top of the user range. */
                unsigned long max = max_addr - src_addr;
                long retval;

                /*
                 * Truncate 'max' to the user-specified limit, so that
                 * we only have one limit we need to check in the loop
                 */
                if (max > count)
                        max = count;

                kasan_check_write(dst, count);
                check_object_size(dst, count, false);
                /* The copy runs entirely inside the begin/end access window. */
                if (user_read_access_begin(src, max)) {
                        retval = do_strncpy_from_user(dst, src, count, max);
                        user_read_access_end();
                        return retval;
                }
        }
        return -EFAULT;
}
EXPORT_SYMBOL(strncpy_from_user);








































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Fast and scalable bitmaps.
 *
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#ifndef __LINUX_SCALE_BITMAP_H
#define __LINUX_SCALE_BITMAP_H

#include <linux/kernel.h>
#include <linux/slab.h>

struct seq_file;

/**
 * struct sbitmap_word - Word in a &struct sbitmap.
 *
 * @word and @cleared are each marked ____cacheline_aligned_in_smp, as is
 * the structure as a whole.
 */
struct sbitmap_word {
        /**
         * @depth: Number of bits being used in @word/@cleared
         */
        unsigned long depth;

        /**
         * @word: word holding free bits
         */
        unsigned long word ____cacheline_aligned_in_smp;

        /**
         * @cleared: word holding cleared bits
         */
        unsigned long cleared ____cacheline_aligned_in_smp;

        /**
         * @swap_lock: Held while swapping word <-> cleared
         */
        spinlock_t swap_lock;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap - Scalable bitmap.
 *
 * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
 * trades off higher memory usage for better scalability.
 */
struct sbitmap {
        /**
         * @depth: Number of bits used in the whole bitmap.
         */
        unsigned int depth;

        /**
         * @shift: log2(number of bits used per word). A bit number is split
         * into a @map index (bitnr >> @shift) and a bit within that word
         * (the low @shift bits).
         */
        unsigned int shift;

        /**
         * @map_nr: Number of words (cachelines) being used for the bitmap.
         */
        unsigned int map_nr;

        /**
         * @map: Allocated bitmap. Released with kfree() by sbitmap_free().
         */
        struct sbitmap_word *map;
};

/*
 * NOTE(review): the users of these constants are outside this header
 * excerpt. Presumably SBQ_WAIT_QUEUES is the number of &struct
 * sbq_wait_state entries behind sbitmap_queue.ws and SBQ_WAKE_BATCH bounds
 * sbitmap_queue.wake_batch -- confirm against lib/sbitmap.c.
 */
#define SBQ_WAIT_QUEUES 8
#define SBQ_WAKE_BATCH 8

/**
 * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
 *
 * The structure is marked ____cacheline_aligned_in_smp.
 */
struct sbq_wait_state {
        /**
         * @wait_cnt: Number of frees remaining before we wake up.
         */
        atomic_t wait_cnt;

        /**
         * @wait: Wait queue.
         */
        wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
 * bits.
 *
 * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
 * avoid contention on the wait queue spinlock. This ensures that we don't hit a
 * scalability wall when we run out of free bits and have to start putting tasks
 * to sleep.
 */
struct sbitmap_queue {
        /**
         * @sb: Scalable bitmap.
         */
        struct sbitmap sb;

        /**
         * @alloc_hint: Cache of last successfully allocated or freed bit.
         *
         * This is per-cpu, which allows multiple users to stick to different
         * cachelines until the map is exhausted.
         */
        unsigned int __percpu *alloc_hint;

        /**
         * @wake_batch: Number of bits which must be freed before we wake up any
         * waiters.
         */
        unsigned int wake_batch;

        /**
         * @wake_index: Next wait queue in @ws to wake up.
         */
        atomic_t wake_index;

        /**
         * @ws: Wait queues.
         */
        struct sbq_wait_state *ws;

        /**
         * @ws_active: Count of currently active @ws wait queues.
         */
        atomic_t ws_active;

        /**
         * @round_robin: Allocate bits in strict round-robin order.
         */
        bool round_robin;

        /**
         * @min_shallow_depth: The minimum shallow depth which may be passed to
         * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
         */
        unsigned int min_shallow_depth;
};

/**
 * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
 * @sb: Bitmap to initialize.
 * @depth: Number of bits to allocate.
 * @shift: Use 2^@shift bits per word in the bitmap; if a negative number is
 *         given, a good default is chosen.
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node);

/**
 * sbitmap_free() - Release the word array of a &struct sbitmap.
 * @sb: Bitmap whose @map is freed.
 *
 * The array is handed to kfree() and @sb->map is reset to NULL afterwards.
 */
static inline void sbitmap_free(struct sbitmap *sb)
{
        struct sbitmap_word *words = sb->map;

        kfree(words);
        sb->map = NULL;
}

/**
 * sbitmap_resize() - Resize a &struct sbitmap.
 * @sb: Bitmap to resize.
 * @depth: New number of bits to resize to.
 *
 * Doesn't reallocate anything. It's up to the caller to ensure that the new
 * depth doesn't exceed the depth that the sb was initialized with.
 */
void sbitmap_resize(struct sbitmap *sb, unsigned int depth);

/**
 * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
 * @sb: Bitmap to allocate from.
 * @alloc_hint: Hint for where to start searching for a free bit.
 * @round_robin: If true, be stricter about allocation order; always allocate
 *               starting from the last allocated bit. This is less efficient
 *               than the default behavior (false).
 *
 * This operation provides acquire barrier semantics if it succeeds.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);

/**
 * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
 * limiting the depth used from each word.
 * @sb: Bitmap to allocate from.
 * @alloc_hint: Hint for where to start searching for a free bit.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 *
 * This rather specific operation allows for having multiple users with
 * different allocation limits. E.g., there can be a high-priority class that
 * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
 * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
 * class can only allocate half of the total bits in the bitmap, preventing it
 * from starving out the high-priority class.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
                        unsigned long shallow_depth);

/**
 * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
 * @sb: Bitmap to check.
 *
 * Return: true if any bit in the bitmap is set, false otherwise.
 */
bool sbitmap_any_bit_set(const struct sbitmap *sb);

/*
 * Split a bitmap-wide bit number using @sb->shift:
 * SB_NR_TO_INDEX() yields bitnr >> shift (the sb->map[] index) and
 * SB_NR_TO_BIT() yields the low 'shift' bits (the position inside that word).
 */
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))

/*
 * Callback type for __sbitmap_for_each_set(): it is called with the bitmap,
 * the number of a set bit and the caller's data pointer; returning false
 * stops the iteration.
 */
typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);

/**
 * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
 * @sb: Bitmap to iterate over.
 * @start: Where to start the iteration. Values at or beyond @sb->depth are
 *         treated as 0.
 * @fn: Callback. Should return true to continue or false to break early.
 * @data: Pointer to pass to callback.
 *
 * The scan covers @sb->depth bits in total, wrapping from the last word back
 * to the first. A bit counts as set when it is set in a word's @word and not
 * set in its @cleared.
 *
 * This is inline even though it's non-trivial so that the function calls to the
 * callback will hopefully get optimized away.
 */
static inline void __sbitmap_for_each_set(struct sbitmap *sb,
                                          unsigned int start,
                                          sb_for_each_fn fn, void *data)
{
        unsigned int index;
        unsigned int nr;
        /* Number of bit positions already accounted for. */
        unsigned int scanned = 0;

        if (start >= sb->depth)
                start = 0;
        index = SB_NR_TO_INDEX(sb, start);
        nr = SB_NR_TO_BIT(sb, start);

        while (scanned < sb->depth) {
                unsigned long word;
                /* Bits to cover in this word, capped by what is left overall. */
                unsigned int depth = min_t(unsigned int,
                                           sb->map[index].depth - nr,
                                           sb->depth - scanned);

                scanned += depth;
                word = sb->map[index].word & ~sb->map[index].cleared;
                if (!word)
                        goto next;

                /*
                 * On the first iteration of the outer loop, we need to add the
                 * bit offset back to the size of the word for find_next_bit().
                 * On all other iterations, nr is zero, so this is a noop.
                 */
                depth += nr;
                while (1) {
                        nr = find_next_bit(&word, depth, nr);
                        if (nr >= depth)
                                break;
                        /* Report the bitmap-wide bit number to the callback. */
                        if (!fn(sb, (index << sb->shift) + nr, data))
                                return;

                        nr++;
                }
next:
                nr = 0;
                /* Advance to the next word, wrapping to the first one. */
                if (++index >= sb->map_nr)
                        index = 0;
        }
}

/**
 * sbitmap_for_each_set() - Visit every set bit of a &struct sbitmap, starting
 * at bit 0.
 * @sb: Bitmap to iterate over.
 * @fn: Callback run for each set bit; return true to go on, false to stop.
 * @data: Opaque pointer handed through to @fn.
 */
static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
                                        void *data)
{
        const unsigned int first_bit = 0;

        __sbitmap_for_each_set(sb, first_bit, fn, data);
}

/* Return the address of the ->word that holds global bit number @bitnr. */
static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
                                            unsigned int bitnr)
{
        const unsigned int word_idx = SB_NR_TO_INDEX(sb, bitnr);

        return &sb->map[word_idx].word;
}

/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */

/* set_bit() on the word and in-word offset that correspond to @bitnr. */
static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);
        const unsigned int offset = SB_NR_TO_BIT(sb, bitnr);

        set_bit(offset, word);
}

/* clear_bit() on the word and in-word offset that correspond to @bitnr. */
static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);
        const unsigned int offset = SB_NR_TO_BIT(sb, bitnr);

        clear_bit(offset, word);
}

/*
 * Deferred variant of clearing a bit: the bit in ->word is left alone and the
 * matching bit in the word's ->cleared mask is set instead. It is paired with
 * the caller doing sbitmap_deferred_clear() when a given index is full, which
 * clears the previously freed entries in the corresponding ->word.
 */
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        const unsigned int word_idx = SB_NR_TO_INDEX(sb, bitnr);
        const unsigned int offset = SB_NR_TO_BIT(sb, bitnr);

        set_bit(offset, &sb->map[word_idx].cleared);
}

/* clear_bit_unlock() on the word and in-word offset that correspond to @bitnr. */
static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
                                            unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);
        const unsigned int offset = SB_NR_TO_BIT(sb, bitnr);

        clear_bit_unlock(offset, word);
}

/* Return the result of test_bit() for global bit number @bitnr. */
static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);
        const unsigned int offset = SB_NR_TO_BIT(sb, bitnr);

        return test_bit(offset, word);
}

/**
 * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_show(struct sbitmap *sb, struct seq_file *m);

/**
 * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
 * seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The output isn't guaranteed to be internally
 * consistent.
 */
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);

/**
 * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
 * memory node.
 * @sbq: Bitmap queue to initialize.
 * @depth: See sbitmap_init_node().
 * @shift: See sbitmap_init_node().
 * @round_robin: See sbitmap_get().
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
                            int shift, bool round_robin, gfp_t flags, int node);

/**
 * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
 * @sbq: Bitmap queue to free.
 *
 * Releases, in this order, @sbq->ws with kfree(), @sbq->alloc_hint with
 * free_percpu(), and the embedded bitmap with sbitmap_free().
 */
static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
{
        struct sbitmap *bitmap = &sbq->sb;

        kfree(sbq->ws);
        free_percpu(sbq->alloc_hint);
        sbitmap_free(bitmap);
}

/**
 * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
 * @sbq: Bitmap queue to resize.
 * @depth: New number of bits to resize to.
 *
 * Like sbitmap_resize(), this doesn't reallocate anything. It has to do
 * some extra work on the &struct sbitmap_queue, so it's not safe to just
 * resize the underlying &struct sbitmap.
 */
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);

/**
 * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue with preemption already disabled.
 * @sbq: Bitmap queue to allocate from.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int __sbitmap_queue_get(struct sbitmap_queue *sbq);

/**
 * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
 * sbitmap_queue, limiting the depth used from each word, with preemption
 * already disabled.
 * @sbq: Bitmap queue to allocate from.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 * See sbitmap_get_shallow().
 *
 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
 * initializing @sbq.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                                unsigned int shallow_depth);

/**
 * sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to allocate from.
 * @cpu: Output parameter; receives the value returned by get_cpu(), i.e. the
 *       CPU we ran on (e.g., to be passed to sbitmap_queue_clear()).
 *
 * The allocation itself is done by __sbitmap_queue_get(), called between
 * get_cpu() and put_cpu().
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
                                    unsigned int *cpu)
{
        int bit;

        *cpu = get_cpu();
        bit = __sbitmap_queue_get(sbq);
        put_cpu();

        return bit;
}

/**
 * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
 * sbitmap_queue, limiting the depth used from each word.
 * @sbq: Bitmap queue to allocate from.
 * @cpu: Output parameter; receives the value returned by get_cpu(), i.e. the
 *       CPU we ran on (e.g., to be passed to sbitmap_queue_clear()).
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 * See sbitmap_get_shallow().
 *
 * The allocation itself is done by __sbitmap_queue_get_shallow(), called
 * between get_cpu() and put_cpu().
 *
 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
 * initializing @sbq.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                                            unsigned int *cpu,
                                            unsigned int shallow_depth)
{
        int bit;

        *cpu = get_cpu();
        bit = __sbitmap_queue_get_shallow(sbq, shallow_depth);
        put_cpu();

        return bit;
}

/**
 * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
 * minimum shallow depth that will be used.
 * @sbq: Bitmap queue in question.
 * @min_shallow_depth: The minimum shallow depth that will be passed to
 * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
 *
 * sbitmap_queue_clear() batches wakeups as an optimization. The batch size
 * depends on the depth of the bitmap. Since the shallow allocation functions
 * effectively operate with a different depth, the shallow depth must be taken
 * into account when calculating the batch size. This function must be called
 * with the minimum shallow depth that will be used. Failure to do so can result
 * in missed wakeups.
 */
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
                                     unsigned int min_shallow_depth);

/**
 * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
 * &struct sbitmap_queue.
 * @sbq: Bitmap to free from.
 * @nr: Bit number to free.
 * @cpu: CPU the bit was allocated on.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu);

/*
 * Return @index advanced by one and masked with SBQ_WAIT_QUEUES - 1 (which
 * wraps the index around when SBQ_WAIT_QUEUES is a power of two).
 */
static inline int sbq_index_inc(int index)
{
        const int next = index + 1;

        return next & (SBQ_WAIT_QUEUES - 1);
}

/*
 * Try once to move *@index to the next slot as computed by sbq_index_inc().
 * The result of atomic_cmpxchg() is not checked and nothing is retried, so
 * the update is attempted only against the value that was just read.
 */
static inline void sbq_index_atomic_inc(atomic_t *index)
{
        const int seen = atomic_read(index);

        atomic_cmpxchg(index, seen, sbq_index_inc(seen));
}

/**
 * sbq_wait_ptr() - Get the next wait queue to use for a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wait on.
 * @wait_index: A counter per "user" of @sbq.
 *
 * Picks the entry of @sbq->ws selected by the current value of @wait_index,
 * then advances @wait_index with sbq_index_atomic_inc().
 *
 * Return: Pointer to the selected &struct sbq_wait_state.
 */
static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
                                                  atomic_t *wait_index)
{
        struct sbq_wait_state *picked = &sbq->ws[atomic_read(wait_index)];

        sbq_index_atomic_inc(wait_index);

        return picked;
}

/**
 * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 */
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);

/**
 * sbitmap_queue_wake_up() - Wake up some of the waiters in one wait queue
 * on a &struct sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 */
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);

/**
 * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
 * seq_file.
 * @sbq: Bitmap queue to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);

/*
 * Wait entry used with the sbitmap_*wait* helpers declared below: a plain
 * wait_queue_entry plus a back pointer to the queue. DEFINE_SBQ_WAIT()
 * initializes ->sbq to NULL.
 */
struct sbq_wait {
        struct sbitmap_queue *sbq;        /* if set, sbq_wait is accounted */
        struct wait_queue_entry wait;
};

/*
 * Define a struct sbq_wait variable called @name with ->sbq set to NULL and
 * ->wait set up with current as its private data, autoremove_wake_function as
 * its wake function, and an empty list head as its entry.
 */
#define DEFINE_SBQ_WAIT(name)                                                        \
        struct sbq_wait name = {                                                \
                .sbq = NULL,                                                        \
                .wait = {                                                        \
                        .private        = current,                                \
                        .func                = autoremove_wake_function,                \
                        .entry                = LIST_HEAD_INIT((name).wait.entry),        \
                }                                                                \
        }

/*
 * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
 * internal state.
 */
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
                                struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait, int state);

/*
 * Must be paired with sbitmap_prepare_to_wait().
 */
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait);

/*
 * Wrapper around add_wait_queue(), which maintains some extra internal state
 */
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
                            struct sbq_wait_state *ws,
                            struct sbq_wait *sbq_wait);

/*
 * Must be paired with sbitmap_add_wait_queue()
 */
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);

#endif /* __LINUX_SCALE_BITMAP_H */







































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef DRIVERS_PCI_H
#define DRIVERS_PCI_H

#include <linux/pci.h>

/*
 * Number of possible devfns: 0.0 to 1f.7 inclusive, i.e. 32 device numbers
 * times 8 function numbers.
 */
#define MAX_NR_DEVFNS 256

/*
 * NOTE(review): presumably the maximum number of capability-list entries
 * followed before a search gives up - confirm against the users of this
 * constant.
 */
#define PCI_FIND_CAP_TTL        48

#define PCI_VSEC_ID_INTEL_TBT        0x1234        /* Thunderbolt */

extern const unsigned char pcie_link_speed[];
extern bool pci_early_dump;

bool pcie_cap_has_lnkctl(const struct pci_dev *dev);
bool pcie_cap_has_rtctl(const struct pci_dev *dev);

/* Functions internal to the PCI core code */

int pci_create_sysfs_dev_files(struct pci_dev *pdev);
void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
/*
 * Firmware label files: declared as external functions when CONFIG_DMI or
 * CONFIG_ACPI is enabled; with neither option both calls are empty inlines.
 */
#if defined(CONFIG_DMI) || defined(CONFIG_ACPI)
void pci_create_firmware_label_files(struct pci_dev *pdev);
void pci_remove_firmware_label_files(struct pci_dev *pdev);
#else
static inline void pci_create_firmware_label_files(struct pci_dev *pdev)
{
}

static inline void pci_remove_firmware_label_files(struct pci_dev *pdev)
{
}
#endif
void pci_cleanup_rom(struct pci_dev *dev);

/* Identifies which user-visible file an mmap request is made on. */
enum pci_mmap_api {
        PCI_MMAP_SYSFS,        /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */
        PCI_MMAP_PROCFS        /* mmap on /proc/bus/pci/<BDF> */
};
/*
 * NOTE(review): presumably checks whether the mapping described by @vmai fits
 * within resource @resno of @pdev for the given @mmap_api - confirm the
 * return convention against the implementation.
 */
int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai,
                  enum pci_mmap_api mmap_api);

int pci_probe_reset_function(struct pci_dev *dev);
int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
int pci_bus_error_reset(struct pci_dev *dev);
int __pci_reset_bus(struct pci_bus *bus);

#define PCI_PM_D2_DELAY         200        /* usec; see PCIe r4.0, sec 5.9.1 */
#define PCI_PM_D3HOT_WAIT       10        /* msec */
#define PCI_PM_D3COLD_WAIT      100        /* msec */

/*
 * Following exit from Conventional Reset, devices must be ready within 1 sec
 * (PCIe r6.0 sec 6.6.1).  A D3cold to D0 transition implies a Conventional
 * Reset (PCIe r6.0 sec 5.8).
 */
#define PCI_RESET_WAIT                1000        /* msec */
/*
 * Devices may extend the 1 sec period through Request Retry Status completions
 * (PCIe r6.0 sec 2.3.1).  The spec does not provide an upper limit, but 60 sec
 * ought to be enough for any device to become responsive.
 */
#define PCIE_RESET_READY_POLL_MS 60000        /* msec */

/**
 * struct pci_platform_pm_ops - Firmware PM callbacks
 *
 * @bridge_d3: returns 'true' if the bridge allows entering into D3
 *
 * @is_manageable: returns 'true' if given device is power manageable by the
 *                   platform firmware
 *
 * @set_state: invokes the platform firmware to set the device's power state
 *
 * @get_state: queries the platform firmware for a device's current power state
 *
 * @refresh_state: asks the platform to refresh the device's power state data
 *
 * @choose_state: returns PCI power state of given device preferred by the
 *                  platform; to be used during system-wide transitions from a
 *                  sleeping state to the working state and vice versa
 *
 * @set_wakeup: enables/disables wakeup capability for the device
 *
 * @need_resume: returns 'true' if the given device (which is currently
 *                 suspended) needs to be resumed to be configured for system
 *                 wakeup.
 *
 * If given platform is generally capable of power managing PCI devices, all of
 * these callbacks are mandatory. A set of callbacks is registered with
 * pci_set_platform_pm().
 */
struct pci_platform_pm_ops {
        bool (*bridge_d3)(struct pci_dev *dev);
        bool (*is_manageable)(struct pci_dev *dev);
        int (*set_state)(struct pci_dev *dev, pci_power_t state);
        pci_power_t (*get_state)(struct pci_dev *dev);
        void (*refresh_state)(struct pci_dev *dev);
        pci_power_t (*choose_state)(struct pci_dev *dev);
        int (*set_wakeup)(struct pci_dev *dev, bool enable);
        bool (*need_resume)(struct pci_dev *dev);
};

int pci_set_platform_pm(const struct pci_platform_pm_ops *ops);
void pci_update_current_state(struct pci_dev *dev, pci_power_t state);
void pci_refresh_power_state(struct pci_dev *dev);
int pci_power_up(struct pci_dev *dev);
void pci_disable_enabled_device(struct pci_dev *dev);
int pci_finish_runtime_suspend(struct pci_dev *dev);
void pcie_clear_device_status(struct pci_dev *dev);
void pcie_clear_root_pme_status(struct pci_dev *dev);
bool pci_check_pme_status(struct pci_dev *dev);
void pci_pme_wakeup_bus(struct pci_bus *bus);
int __pci_pme_wakeup(struct pci_dev *dev, void *ign);
void pci_pme_restore(struct pci_dev *dev);
bool pci_dev_need_resume(struct pci_dev *dev);
void pci_dev_adjust_pme(struct pci_dev *dev);
void pci_dev_complete_resume(struct pci_dev *pci_dev);
void pci_config_pm_runtime_get(struct pci_dev *dev);
void pci_config_pm_runtime_put(struct pci_dev *dev);
void pci_pm_init(struct pci_dev *dev);
void pci_ea_init(struct pci_dev *dev);
void pci_allocate_cap_save_buffers(struct pci_dev *dev);
void pci_free_cap_save_buffers(struct pci_dev *dev);
bool pci_bridge_d3_possible(struct pci_dev *dev);
void pci_bridge_d3_update(struct pci_dev *dev);
int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type,
                                      int timeout);

/* Report a wakeup event for @dev through pm_wakeup_event(). */
static inline void pci_wakeup_event(struct pci_dev *dev)
{
        /* Wait 100 ms before the system can be put into a sleep state. */
        const unsigned int hold_ms = 100;

        pm_wakeup_event(&dev->dev, hold_ms);
}

/* Return true when @pci_dev has a non-NULL ->subordinate pointer. */
static inline bool pci_has_subordinate(struct pci_dev *pci_dev)
{
        if (pci_dev->subordinate)
                return true;

        return false;
}

/*
 * Return true if @pci_dev has no subordinate bus, or if it has one and its
 * bridge_d3 flag is set.
 */
static inline bool pci_power_manageable(struct pci_dev *pci_dev)
{
        /*
         * Currently we allow normal PCI devices and PCI bridges transition
         * into D3 if their bridge_d3 is set.
         */
        if (!pci_has_subordinate(pci_dev))
                return true;

        return pci_dev->bridge_d3;
}

/*
 * Return true if pci_pcie_type(@dev) is one of PCI_EXP_TYPE_ROOT_PORT,
 * PCI_EXP_TYPE_DOWNSTREAM or PCI_EXP_TYPE_PCIE_BRIDGE.
 */
static inline bool pcie_downstream_port(const struct pci_dev *dev)
{
        const int port_type = pci_pcie_type(dev);

        if (port_type == PCI_EXP_TYPE_ROOT_PORT)
                return true;
        if (port_type == PCI_EXP_TYPE_DOWNSTREAM)
                return true;

        return port_type == PCI_EXP_TYPE_PCIE_BRIDGE;
}

int pci_vpd_init(struct pci_dev *dev);
void pci_vpd_release(struct pci_dev *dev);
void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev);
void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev);

/* PCI Virtual Channel */
int pci_save_vc_state(struct pci_dev *dev);
void pci_restore_vc_state(struct pci_dev *dev);
void pci_allocate_vc_save_buffers(struct pci_dev *dev);

/*
 * PCI /proc functions. Without CONFIG_PROC_FS all three are inline stubs
 * that return 0.
 */
#ifndef CONFIG_PROC_FS
static inline int pci_proc_attach_device(struct pci_dev *dev)
{
        return 0;
}

static inline int pci_proc_detach_device(struct pci_dev *dev)
{
        return 0;
}

static inline int pci_proc_detach_bus(struct pci_bus *bus)
{
        return 0;
}
#else
int pci_proc_attach_device(struct pci_dev *dev);
int pci_proc_detach_device(struct pci_dev *dev);
int pci_proc_detach_bus(struct pci_bus *bus);
#endif

/* Functions for PCI Hotplug drivers to use */
int pci_hp_add_bridge(struct pci_dev *dev);

/*
 * Legacy files for a bus: external functions when HAVE_PCI_LEGACY is defined,
 * empty inline stubs otherwise.
 */
#ifndef HAVE_PCI_LEGACY
static inline void pci_create_legacy_files(struct pci_bus *bus)
{
}

static inline void pci_remove_legacy_files(struct pci_bus *bus)
{
}
#else
void pci_create_legacy_files(struct pci_bus *bus);
void pci_remove_legacy_files(struct pci_bus *bus);
#endif

/* Lock for read/write access to pci device and bus lists */
extern struct rw_semaphore pci_bus_sem;
extern struct mutex pci_slot_mutex;

extern raw_spinlock_t pci_lock;

extern unsigned int pci_pm_d3hot_delay;

/*
 * pci_no_msi() is an external function when CONFIG_PCI_MSI is enabled and an
 * empty inline stub otherwise.
 */
#ifndef CONFIG_PCI_MSI
static inline void pci_no_msi(void)
{
}
#else
void pci_no_msi(void);
#endif

/*
 * Read the 16-bit word at config offset @dev->msi_cap + PCI_MSI_FLAGS, set
 * PCI_MSI_FLAGS_ENABLE in it when @enable is non-zero or clear it otherwise,
 * and write the word back. The results of the config accessors are not
 * checked.
 */
static inline void pci_msi_set_enable(struct pci_dev *dev, int enable)
{
        u16 flags;

        pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &flags);

        if (enable)
                flags |= PCI_MSI_FLAGS_ENABLE;
        else
                flags &= ~PCI_MSI_FLAGS_ENABLE;

        pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, flags);
}

/*
 * Read-modify-write of the 16-bit word at config offset
 * @dev->msix_cap + PCI_MSIX_FLAGS: the bits in @clear are cleared first, then
 * the bits in @set are set, so a bit present in both ends up set. The results
 * of the config accessors are not checked.
 */
static inline void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
{
        u16 flags;

        pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &flags);
        flags = (flags & ~clear) | set;
        pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, flags);
}

void pci_realloc_get_opt(char *);

static inline int pci_no_d1d2(struct pci_dev *dev)
{
        unsigned int parent_dstates = 0;

        if (dev->bus->self)
                parent_dstates = dev->bus->self->no_d1d2;
        return (dev->no_d1d2 || parent_dstates);

}
extern const struct attribute_group *pci_dev_groups[];
extern const struct attribute_group *pcibus_groups[];
extern const struct device_type pci_dev_type;
extern const struct attribute_group *pci_bus_groups[];

extern unsigned long pci_hotplug_io_size;
extern unsigned long pci_hotplug_mmio_size;
extern unsigned long pci_hotplug_mmio_pref_size;
extern unsigned long pci_hotplug_bus_size;

/**
 * pci_match_one_device - Tell if a PCI device structure has a matching
 *                          PCI device id structure
 * @id: single PCI device id structure to match
 * @dev: the PCI device structure to match against
 *
 * Each of vendor, device, subvendor and subdevice in @id matches when it is
 * PCI_ANY_ID or equal to the corresponding field of @dev. The class matches
 * when @id->class and @dev->class agree on every bit selected by
 * @id->class_mask.
 *
 * Returns the matching pci_device_id structure or %NULL if there is no match.
 */
static inline const struct pci_device_id *
pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
{
        if (id->vendor != PCI_ANY_ID && id->vendor != dev->vendor)
                return NULL;
        if (id->device != PCI_ANY_ID && id->device != dev->device)
                return NULL;
        if (id->subvendor != PCI_ANY_ID && id->subvendor != dev->subsystem_vendor)
                return NULL;
        if (id->subdevice != PCI_ANY_ID && id->subdevice != dev->subsystem_device)
                return NULL;
        if ((id->class ^ dev->class) & id->class_mask)
                return NULL;

        return id;
}

/* PCI slot sysfs helper code */

/* Convert a pointer to the embedded ->kobj member back to its struct pci_slot. */
#define to_pci_slot(s) container_of(s, struct pci_slot, kobj)

extern struct kset *pci_slots_kset;

/*
 * An attribute of a PCI slot: @attr is the generic attribute, @show and
 * @store are callbacks that take the slot and a character buffer (@store
 * also takes a size) and return a ssize_t.
 */
struct pci_slot_attribute {
        struct attribute attr;
        ssize_t (*show)(struct pci_slot *, char *);
        ssize_t (*store)(struct pci_slot *, const char *, size_t);
};
/* Convert a pointer to the embedded ->attr member back to its pci_slot_attribute. */
#define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr)

/* Kind of BAR; passed as the @type argument of __pci_read_base(). */
enum pci_bar_type {
        pci_bar_unknown,        /* Standard PCI BAR probe */
        pci_bar_io,                /* An I/O port BAR */
        pci_bar_mem32,                /* A 32-bit memory BAR */
        pci_bar_mem64,                /* A 64-bit memory BAR */
};

struct device *pci_get_host_bridge_device(struct pci_dev *dev);
void pci_put_host_bridge_device(struct device *dev);

int pci_configure_extended_tags(struct pci_dev *dev, void *ign);
bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
                                int crs_timeout);
bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
                                        int crs_timeout);
int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout);

int pci_setup_device(struct pci_dev *dev);
int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
                    struct resource *res, unsigned int reg);
void pci_configure_ari(struct pci_dev *dev);
void __pci_bus_size_bridges(struct pci_bus *bus,
                        struct list_head *realloc_head);
void __pci_bus_assign_resources(const struct pci_bus *bus,
                                struct list_head *realloc_head,
                                struct list_head *fail_head);
bool pci_bus_clip_resource(struct pci_dev *dev, int idx);

void pci_reassigndev_resource_alignment(struct pci_dev *dev);
void pci_disable_bridge_window(struct pci_dev *dev);
struct pci_bus *pci_bus_get(struct pci_bus *bus);
void pci_bus_put(struct pci_bus *bus);

/*
 * PCIe link information from Link Capabilities 2.
 *
 * Map a Link Capabilities 2 value to a PCIE_SPEED_* constant. The supported
 * link speed bits are tested from the fastest (32.0 GT/s) down to the slowest
 * (2.5 GT/s) and the first one found set decides the result;
 * PCI_SPEED_UNKNOWN is returned when none is set. @lnkcap2 is evaluated more
 * than once.
 */
#define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
        ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \
         (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \
         (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \
         (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \
         (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
         PCI_SPEED_UNKNOWN)

/*
 * PCIe speed to Mb/s reduced by encoding overhead.
 *
 * 8.0, 16.0 and 32.0 GT/s are scaled by 128/130; 2.5 and 5.0 GT/s are scaled
 * by 8/10. The arithmetic is integer, so the results are 31507, 15753, 7876,
 * 4000 and 2000 respectively. Any other @speed yields 0. @speed is evaluated
 * more than once.
 */
#define PCIE_SPEED2MBS_ENC(speed) \
        ((speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \
         (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \
         (speed) == PCIE_SPEED_8_0GT  ?  8000*128/130 : \
         (speed) == PCIE_SPEED_5_0GT  ?  5000*8/10 : \
         (speed) == PCIE_SPEED_2_5GT  ?  2500*8/10 : \
         0)

const char *pci_speed_string(enum pci_bus_speed speed);
enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
                           enum pcie_link_width *width);
void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
void pcie_report_downtraining(struct pci_dev *dev);
void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);

/*
 * Single Root I/O Virtualization
 *
 * SR-IOV data kept for a Physical Function (PF): values describing the
 * capability, VF counts and routing, plus the identity fields (vf_device,
 * class, hdr_type, subsystem IDs) and BAR sizes of its Virtual Functions
 * (VFs). See the per-field comments.
 */
struct pci_sriov {
        int                pos;                /* Capability position */
        int                nres;                /* Number of resources */
        u32                cap;                /* SR-IOV Capabilities */
        u16                ctrl;                /* SR-IOV Control */
        u16                total_VFs;        /* Total VFs associated with the PF */
        u16                initial_VFs;        /* Initial VFs associated with the PF */
        u16                num_VFs;        /* Number of VFs available */
        u16                offset;                /* First VF Routing ID offset */
        u16                stride;                /* Following VF stride */
        u16                vf_device;        /* VF device ID */
        u32                pgsz;                /* Page size for BAR alignment */
        u8                link;                /* Function Dependency Link */
        u8                max_VF_buses;        /* Max buses consumed by VFs */
        u16                driver_max_VFs;        /* Max num VFs driver supports */
        struct pci_dev        *dev;                /* Lowest numbered PF */
        struct pci_dev        *self;                /* This PF */
        u32                class;                /* VF device */
        u8                hdr_type;        /* VF header type */
        u16                subsystem_vendor; /* VF subsystem vendor */
        u16                subsystem_device; /* VF subsystem device */
        resource_size_t        barsz[PCI_SRIOV_NUM_BARS];        /* VF BAR size */
        bool                drivers_autoprobe; /* Auto probing of VFs by driver */
};

/**
 * pci_dev_set_io_state - Set the new error state if possible.
 * @dev: PCI device whose error_state is updated
 * @new: the state we want @dev to be in
 *
 * pci_channel_io_perm_failure is always stored. pci_channel_io_frozen is
 * stored only when the current state is pci_channel_io_normal, and
 * pci_channel_io_normal only when the current state is pci_channel_io_frozen,
 * so a device that is in perm_failure remains in that state.
 *
 * Return: false if @new is none of the three states above, or if frozen or
 * normal was requested while the device was in pci_channel_io_perm_failure;
 * true otherwise.
 */
static inline bool pci_dev_set_io_state(struct pci_dev *dev,
                                        pci_channel_state_t new)
{
        pci_channel_state_t prev;

        if (new == pci_channel_io_perm_failure) {
                /* Unconditional: any state may move to perm_failure. */
                xchg(&dev->error_state, pci_channel_io_perm_failure);
                return true;
        }

        if (new == pci_channel_io_frozen) {
                prev = cmpxchg(&dev->error_state, pci_channel_io_normal,
                               pci_channel_io_frozen);
                return prev != pci_channel_io_perm_failure;
        }

        if (new == pci_channel_io_normal) {
                prev = cmpxchg(&dev->error_state, pci_channel_io_frozen,
                               pci_channel_io_normal);
                return prev != pci_channel_io_perm_failure;
        }

        return false;
}

/*
 * Put @dev into pci_channel_io_perm_failure via pci_dev_set_io_state().
 * @unused is ignored and the return value is always 0.
 */
static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
{
        (void)pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
        return 0;
}

/*
 * pci_dev priv_flags: bit numbers (not masks) within pci_dev->priv_flags,
 * used with the bit operations, e.g. assign_bit()/test_bit() on
 * PCI_DEV_ADDED.
 */
#define PCI_DEV_ADDED 0
#define PCI_DPC_RECOVERED 1
#define PCI_DPC_RECOVERING 2

/* Set (@added true) or clear (@added false) the PCI_DEV_ADDED bit in @dev->priv_flags. */
static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
{
        assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
}

/* Return true if the PCI_DEV_ADDED bit is set in @dev->priv_flags. */
static inline bool pci_dev_is_added(const struct pci_dev *dev)
{
        return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
}

/* AER definitions; only available when CONFIG_PCIEAER is enabled. */
#ifdef CONFIG_PCIEAER
#include <linux/aer.h>

#define AER_MAX_MULTI_ERR_DEVICES        5        /* Not likely to have more */

/*
 * AER error information record, passed to aer_get_device_error_info() and
 * aer_print_error() below. The bit-fields after @id form two 8-bit groups
 * (2+5+1 and 5+2+1 bits); __pad1 and __pad2 are filler.
 */
struct aer_err_info {
        /* Holds at most AER_MAX_MULTI_ERR_DEVICES device pointers. */
        struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
        /* NOTE(review): presumably the number of valid dev[] entries - confirm */
        int error_dev_num;

        unsigned int id:16;

        unsigned int severity:2;        /* 0:NONFATAL | 1:FATAL | 2:COR */
        unsigned int __pad1:5;
        unsigned int multi_error_valid:1;

        unsigned int first_error:5;
        unsigned int __pad2:2;
        unsigned int tlp_header_valid:1;

        unsigned int status;                /* COR/UNCOR Error Status */
        unsigned int mask;                /* COR/UNCOR Error Mask */
        struct aer_header_log_regs tlp;        /* TLP Header */
};

int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
#endif        /* CONFIG_PCIEAER */

#ifdef CONFIG_PCIEPORTBUS
/* Cached RCEC Endpoint Association */
struct rcec_ea {
        u8                nextbusn;
        u8                lastbusn;
        u32                bitmap;
};
#endif

/*
 * DPC entry points.  Without CONFIG_PCIE_DPC the save/restore/init helpers
 * become empty stubs and pci_dpc_recovered() reports false.  Note that
 * dpc_process_error() and dpc_reset_link() have no stub, so they are only
 * usable from code that is itself built under CONFIG_PCIE_DPC.
 */
#ifdef CONFIG_PCIE_DPC
void pci_save_dpc_state(struct pci_dev *dev);
void pci_restore_dpc_state(struct pci_dev *dev);
void pci_dpc_init(struct pci_dev *pdev);
void dpc_process_error(struct pci_dev *pdev);
pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
bool pci_dpc_recovered(struct pci_dev *pdev);
#else
static inline void pci_save_dpc_state(struct pci_dev *dev) {}
static inline void pci_restore_dpc_state(struct pci_dev *dev) {}
static inline void pci_dpc_init(struct pci_dev *pdev) {}
static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; }
#endif

/* RCEC init/exit hooks; empty stubs without CONFIG_PCIEPORTBUS. */
#ifdef CONFIG_PCIEPORTBUS
void pci_rcec_init(struct pci_dev *dev);
void pci_rcec_exit(struct pci_dev *dev);
#else
static inline void pci_rcec_init(struct pci_dev *dev) {}
static inline void pci_rcec_exit(struct pci_dev *dev) {}
#endif

/*
 * ATS, PRI and PASID each expose an init hook and a state-restore hook.
 * When the corresponding CONFIG_PCI_* option is off, both hooks are empty
 * inline stubs so callers need no #ifdef of their own.
 */
#ifdef CONFIG_PCI_ATS
/* Address Translation Service */
void pci_ats_init(struct pci_dev *dev);
void pci_restore_ats_state(struct pci_dev *dev);
#else
static inline void pci_ats_init(struct pci_dev *d) { }
static inline void pci_restore_ats_state(struct pci_dev *dev) { }
#endif /* CONFIG_PCI_ATS */

#ifdef CONFIG_PCI_PRI
void pci_pri_init(struct pci_dev *dev);
void pci_restore_pri_state(struct pci_dev *pdev);
#else
static inline void pci_pri_init(struct pci_dev *dev) { }
static inline void pci_restore_pri_state(struct pci_dev *pdev) { }
#endif

#ifdef CONFIG_PCI_PASID
void pci_pasid_init(struct pci_dev *dev);
void pci_restore_pasid_state(struct pci_dev *pdev);
#else
static inline void pci_pasid_init(struct pci_dev *dev) { }
static inline void pci_restore_pasid_state(struct pci_dev *pdev) { }
#endif

/*
 * SR-IOV entry points.  Without CONFIG_PCI_IOV only a subset has stubs:
 * pci_iov_init() fails with -ENODEV, pci_iov_bus_range() returns 0, and
 * release/remove/restore do nothing.  pci_iov_update_resource(),
 * pci_sriov_resource_alignment() and sriov_dev_attr_group have no fallback,
 * so any user must itself be guarded by CONFIG_PCI_IOV.
 */
#ifdef CONFIG_PCI_IOV
int pci_iov_init(struct pci_dev *dev);
void pci_iov_release(struct pci_dev *dev);
void pci_iov_remove(struct pci_dev *dev);
void pci_iov_update_resource(struct pci_dev *dev, int resno);
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
void pci_restore_iov_state(struct pci_dev *dev);
int pci_iov_bus_range(struct pci_bus *bus);
extern const struct attribute_group sriov_dev_attr_group;
#else
static inline int pci_iov_init(struct pci_dev *dev)
{
        return -ENODEV;
}
static inline void pci_iov_release(struct pci_dev *dev)

{
}
static inline void pci_iov_remove(struct pci_dev *dev)
{
}
static inline void pci_restore_iov_state(struct pci_dev *dev)
{
}
static inline int pci_iov_bus_range(struct pci_bus *bus)
{
        return 0;
}

#endif /* CONFIG_PCI_IOV */

/* Alignment helper used for CardBus bridges; only declared in this header. */
unsigned long pci_cardbus_resource_alignment(struct resource *);

/**
 * pci_resource_alignment - pick the alignment for one resource of a device
 * @dev: device that owns @res
 * @res: resource to query
 *
 * With CONFIG_PCI_IOV, the index of @res is computed as
 * res - dev->resource (so @res is expected to point into that array), and
 * indexes in [PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END] are answered by
 * pci_sriov_resource_alignment().  Otherwise CardBus bridges
 * (dev->class >> 8 == PCI_CLASS_BRIDGE_CARDBUS) use
 * pci_cardbus_resource_alignment() and everything else falls back to
 * resource_alignment().
 */
static inline resource_size_t pci_resource_alignment(struct pci_dev *dev,
                                                     struct resource *res)
{
#ifdef CONFIG_PCI_IOV
        int idx = res - dev->resource;

        if (idx >= PCI_IOV_RESOURCES && idx <= PCI_IOV_RESOURCE_END)
                return pci_sriov_resource_alignment(dev, idx);
#endif
        if ((dev->class >> 8) != PCI_CLASS_BRIDGE_CARDBUS)
                return resource_alignment(res);

        return pci_cardbus_resource_alignment(res);
}

void pci_acs_init(struct pci_dev *dev);
/*
 * Device-specific ACS hooks.  Without CONFIG_PCI_QUIRKS every one of them
 * is an inline stub that returns -ENOTTY.
 */
#ifdef CONFIG_PCI_QUIRKS
int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
int pci_dev_specific_enable_acs(struct pci_dev *dev);
int pci_dev_specific_disable_acs_redir(struct pci_dev *dev);
#else
static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
                                               u16 acs_flags)
{
        return -ENOTTY;
}
static inline int pci_dev_specific_enable_acs(struct pci_dev *dev)
{
        return -ENOTTY;
}
static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
{
        return -ENOTTY;
}
#endif

/* PCI error reporting and recovery */
/*
 * @reset_subordinates is a caller-supplied callback taking a device and
 * returning a pci_ers_result_t.
 */
pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
                pci_channel_state_t state,
                pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev));

bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
/* ASPM link-state hooks; all four are empty stubs without CONFIG_PCIEASPM. */
#ifdef CONFIG_PCIEASPM
void pcie_aspm_init_link_state(struct pci_dev *pdev);
void pcie_aspm_exit_link_state(struct pci_dev *pdev);
void pcie_aspm_pm_state_change(struct pci_dev *pdev);
void pcie_aspm_powersave_config_link(struct pci_dev *pdev);
#else
static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { }
static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { }
static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { }
static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { }
#endif

/* ECRC hooks; empty stubs without CONFIG_PCIE_ECRC. */
#ifdef CONFIG_PCIE_ECRC
void pcie_set_ecrc_checking(struct pci_dev *dev);
void pcie_ecrc_get_policy(char *str);
#else
static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
static inline void pcie_ecrc_get_policy(char *str) { }
#endif

/* PTM init hook; empty stub without CONFIG_PCIE_PTM. */
#ifdef CONFIG_PCIE_PTM
void pci_ptm_init(struct pci_dev *dev);
#else
static inline void pci_ptm_init(struct pci_dev *dev) { }
#endif

/*
 * Pairs a vendor/device ID with a reset callback.  The callback receives
 * the device and an integer @probe argument.
 * NOTE(review): presumably @probe != 0 means "only check whether the reset
 * is supported" — confirm against the implementations.
 */
struct pci_dev_reset_methods {
        u16 vendor;
        u16 device;
        int (*reset)(struct pci_dev *dev, int probe);
};

/* Device-specific reset; the stub without CONFIG_PCI_QUIRKS returns -ENOTTY. */
#ifdef CONFIG_PCI_QUIRKS
int pci_dev_specific_reset(struct pci_dev *dev, int probe);
#else
static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe)
{
        return -ENOTTY;
}
#endif

/*
 * Only built when both CONFIG_PCI_QUIRKS and CONFIG_ARM64 are set;
 * otherwise the stub fails with -ENODEV.
 */
#if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
                          struct resource *res);
#else
static inline int acpi_get_rc_resources(struct device *dev, const char *hid,
                                        u16 segment, struct resource *res)
{
        return -ENODEV;
}
#endif

u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
/*
 * Convert an encoded BAR size to bytes: the result is 2^(size + 20), so
 * size 0 is 1 MiB, size 1 is 2 MiB, and so on.  @size is not range-checked
 * here; the shift is only well defined for 0 <= size + 20 < 64.
 */
static inline u64 pci_rebar_size_to_bytes(int size)
{
        return 1ULL << (size + 20);
}

struct device_node;

/*
 * Device-tree (OF) helpers.  Without CONFIG_OF the stubs behave as follows:
 *   of_pci_parse_bus_range()     -> -EINVAL
 *   of_get_pci_domain_nr()       -> -1
 *   of_pci_get_max_link_speed()  -> -EINVAL
 *   pci_set/release_[bus_]of_node() -> no-op
 *   devm_of_pci_bridge_init()    -> 0 (reports success without doing anything)
 */
#ifdef CONFIG_OF
int of_pci_parse_bus_range(struct device_node *node, struct resource *res);
int of_get_pci_domain_nr(struct device_node *node);
int of_pci_get_max_link_speed(struct device_node *node);
void pci_set_of_node(struct pci_dev *dev);
void pci_release_of_node(struct pci_dev *dev);
void pci_set_bus_of_node(struct pci_bus *bus);
void pci_release_bus_of_node(struct pci_bus *bus);

int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge);

#else
static inline int
of_pci_parse_bus_range(struct device_node *node, struct resource *res)
{
        return -EINVAL;
}

static inline int
of_get_pci_domain_nr(struct device_node *node)
{
        return -1;
}

static inline int
of_pci_get_max_link_speed(struct device_node *node)
{
        return -EINVAL;
}

static inline void pci_set_of_node(struct pci_dev *dev) { }
static inline void pci_release_of_node(struct pci_dev *dev) { }
static inline void pci_set_bus_of_node(struct pci_bus *bus) { }
static inline void pci_release_bus_of_node(struct pci_bus *bus) { }

static inline int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
{
        return 0;
}

#endif /* CONFIG_OF */

/*
 * AER control and status helpers.  Without CONFIG_PCIEAER the void hooks
 * are no-ops and both *_clear_status() stubs return -EINVAL;
 * aer_stats_attr_group is not declared at all in that configuration.
 */
#ifdef CONFIG_PCIEAER
void pci_no_aer(void);
void pci_aer_init(struct pci_dev *dev);
void pci_aer_exit(struct pci_dev *dev);
extern const struct attribute_group aer_stats_attr_group;
void pci_aer_clear_fatal_status(struct pci_dev *dev);
int pci_aer_clear_status(struct pci_dev *dev);
int pci_aer_raw_clear_status(struct pci_dev *dev);
#else
static inline void pci_no_aer(void) { }
static inline void pci_aer_init(struct pci_dev *d) { }
static inline void pci_aer_exit(struct pci_dev *d) { }
static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
#endif

/* The stub used without CONFIG_ACPI fails with -ENODEV. */
#ifdef CONFIG_ACPI
int pci_acpi_program_hp_params(struct pci_dev *dev);
#else
static inline int pci_acpi_program_hp_params(struct pci_dev *dev)
{
        return -ENODEV;
}
#endif

/* Only declared when CONFIG_PCIEASPM is set; there is no fallback. */
#ifdef CONFIG_PCIEASPM
extern const struct attribute_group aspm_ctrl_attr_group;
#endif

#endif /* DRIVERS_PCI_H */




















































































































































































































































































































































































    3 






    3 
    3 






    3 












    3 












































































































    1 





    1 




    1 
    1 
    1 


































































    1 

    1 

    1 


    1 


    1 


    1 

    1 
    1 


    1 






































































































    1 







    1 


    1 












    2 





    2 
    2 
    2 












































    1 

    1 





    1 



    1 
    1 



    1 















































    1 

    1 




    1 
    1 



    1 
    1 


    1 



































    1 

    1 







    1 





    1 



    1 



















































































    1 







    1 








































    1 




    1 








































































































































    1 


    1 


    1 
    1 





    1 









    1 


    1 








    1 
    1 

    1 

















    1 
    1 














































    1 













    1 


















































































































































































































































































































    2 




    2 



    2 


    2 


    2 


    2 

    2 











    2 











    2 


    2 







    2 
    2 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * Exported file_operations table built only from generic helpers: seeking,
 * read_iter, read-only mmap and splice_read.  No write method is set.
 */
const struct file_operations generic_ro_fops = {
        .llseek                = generic_file_llseek,
        .read_iter        = generic_file_read_iter,
        .mmap                = generic_file_readonly_mmap,
        .splice_read        = generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

/*
 * True when @file has FMODE_UNSIGNED_OFFSET set in f_mode.  The seek and
 * rw_verify_area() code in this file uses it to decide whether a negative
 * loff_t is an acceptable position.
 */
static inline bool unsigned_offsets(struct file *file)
{
        return file->f_mode & FMODE_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos - commit a new file position computed by an llseek method
 * @file:        file whose position is being changed
 * @offset:        candidate position
 * @maxsize:        largest position that is accepted
 *
 * A negative @offset is refused unless the file has FMODE_UNSIGNED_OFFSET
 * set, and any @offset above @maxsize is refused.  If the position really
 * changes, f_version is reset to 0 together with f_pos; if it is already
 * equal to @offset nothing is written.
 *
 * Return: @offset on success, -EINVAL if it is not acceptable.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
        if ((offset < 0 && !unsigned_offsets(file)) || offset > maxsize)
                return -EINVAL;

        /* Avoid dirtying the file when the position does not move. */
        if (file->f_pos == offset)
                return offset;

        file->f_pos = offset;
        file->f_version = 0;
        return offset;
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @maxsize:        largest offset accepted for this file
 * @eof:        offset used for SEEK_END position
 *
 * Variant of generic_file_llseek() that lets the caller supply its own
 * maximum offset and its own end-of-file position, e.g. for hashed
 * directories.
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 *
 * Return: the new position, -ENXIO when SEEK_DATA/SEEK_HOLE is asked for
 * an offset at or beyond @eof, or -EINVAL from vfs_setpos().
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
                loff_t maxsize, loff_t eof)
{
        loff_t newpos;

        if (whence == SEEK_CUR) {
                /*
                 * lseek(fd, 0, SEEK_CUR) is a pure position query.  Do not
                 * write the "same" f_pos back, since a concurrent read(),
                 * write() or lseek() may have moved it in the meantime.
                 */
                if (!offset)
                        return file->f_pos;

                /*
                 * f_lock serialises the read/modify/write of f_pos against
                 * other SEEK_CURs.  Parallel reads and writes are not
                 * covered; they behave like SEEK_SET.
                 */
                spin_lock(&file->f_lock);
                newpos = vfs_setpos(file, file->f_pos + offset, maxsize);
                spin_unlock(&file->f_lock);
                return newpos;
        }

        if (whence == SEEK_END) {
                offset += eof;
        } else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /*
                 * Generic files are all data followed by one virtual hole
                 * at @eof: an offset before @eof is data (SEEK_DATA keeps
                 * it) and the next hole is @eof itself (SEEK_HOLE).
                 */
                if ((unsigned long long)offset >= eof)
                        return -ENXIO;
                if (whence == SEEK_HOLE)
                        offset = eof;
        }

        return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Generic implementation of ->llseek usable by all normal local
 * filesystems.  It forwards to generic_file_llseek_size() using the
 * superblock's s_maxbytes as the limit and the current i_size of the
 * mapping's host inode as the end-of-file position.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *host = file->f_mapping->host;
        loff_t limit = host->i_sb->s_maxbytes;
        loff_t eof = i_size_read(host);

        return generic_file_llseek_size(file, offset, whence, limit, eof);
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        size of the file
 *
 * Only SEEK_SET, SEEK_CUR and SEEK_END are supported; @size serves both as
 * the maximum offset and as the end-of-file position.  Any other @whence
 * yields -EINVAL.
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence == SEEK_SET || whence == SEEK_CUR || whence == SEEK_END)
                return generic_file_llseek_size(file, offset, whence,
                                                size, size);

        return -EINVAL;
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation without SEEK_END support
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Only SEEK_SET and SEEK_CUR are supported, with OFFSET_MAX as the largest
 * accepted offset.  Any other @whence yields -EINVAL.
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence == SEEK_SET || whence == SEEK_CUR)
                return generic_file_llseek_size(file, offset, whence,
                                                OFFSET_MAX, 0);

        return -EINVAL;
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - size-limited llseek without SEEK_END support
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        maximal offset allowed
 *
 * Same as no_seek_end_llseek() except that the caller chooses the largest
 * accepted offset.  Any @whence other than SEEK_SET or SEEK_CUR yields
 * -EINVAL.
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence == SEEK_SET || whence == SEEK_CUR)
                return generic_file_llseek_size(file, offset, whence,
                                                size, 0);

        return -EINVAL;
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:        file structure to seek on
 * @offset:        file offset to seek to (ignored)
 * @whence:        type of seek (ignored)
 *
 * This is an implementation of ->llseek usable for the rare special case
 * when userspace expects the seek to succeed but the (device) file is
 * actually not able to perform the seek. In this case you use noop_llseek()
 * instead of falling back to the default implementation of ->llseek.
 *
 * Return: the current, unmodified file->f_pos.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
        return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);

/*
 * llseek method for files that cannot seek: always fails with -ESPIPE.
 * vfs_llseek() falls back to this when FMODE_LSEEK is clear or the file
 * has no ->llseek method.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
        return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);

/*
 * llseek implementation that runs entirely under the inode lock.
 *
 * SEEK_END is relative to i_size and SEEK_CUR to f_pos.  SEEK_DATA and
 * SEEK_HOLE treat the file as all data followed by one virtual hole at
 * i_size and fail with -ENXIO for an offset at or beyond i_size.  A
 * resulting negative offset is rejected with -EINVAL unless the file has
 * FMODE_UNSIGNED_OFFSET.  f_version is reset whenever f_pos really changes.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        loff_t ret = -EINVAL;

        inode_lock(inode);

        if (whence == SEEK_END) {
                offset += i_size_read(inode);
        } else if (whence == SEEK_CUR) {
                /* Pure position query: report f_pos without rewriting it. */
                if (offset == 0) {
                        ret = file->f_pos;
                        goto unlock;
                }
                offset += file->f_pos;
        } else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /*
                 * Anything before i_size counts as data and the only hole
                 * is the virtual one at i_size.
                 */
                if (offset >= inode->i_size) {
                        ret = -ENXIO;
                        goto unlock;
                }
                if (whence == SEEK_HOLE)
                        offset = inode->i_size;
        }

        if (offset >= 0 || unsigned_offsets(file)) {
                if (file->f_pos != offset) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                ret = offset;
        }
unlock:
        inode_unlock(inode);
        return ret;
}
EXPORT_SYMBOL(default_llseek);

/*
 * Dispatch a seek to the file's ->llseek method.  The method is only used
 * when the file has FMODE_LSEEK set and actually provides one; in every
 * other case no_llseek() answers the request.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
        if ((file->f_mode & FMODE_LSEEK) && file->f_op->llseek)
                return file->f_op->llseek(file, offset, whence);

        return no_llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * Common body of the lseek entry points.
 *
 * Returns -EBADF for an invalid descriptor and -EINVAL when @whence is
 * above SEEK_MAX.  Otherwise the result of vfs_llseek() is returned, or
 * -EOVERFLOW when that result does not survive the narrowing to off_t.
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
        struct fd f = fdget_pos(fd);
        off_t ret = -EINVAL;
        loff_t pos;

        if (!f.file)
                return -EBADF;

        if (whence <= SEEK_MAX) {
                pos = vfs_llseek(f.file, offset, whence);
                ret = pos;
                /* LFS: should only happen on 32 bit platforms */
                if ((loff_t)ret != pos)
                        ret = -EOVERFLOW;
        }

        fdput_pos(f);
        return ret;
}

/* lseek entry point: forwards its arguments unchanged to ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
/* Compat variant: takes the offset as compat_off_t, otherwise identical. */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
        defined(__ARCH_WANT_SYS_LLSEEK)
/*
 * llseek entry point: the 64-bit offset arrives as two halves and the new
 * position is written to the user pointer @result instead of being
 * returned.
 *
 * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL when
 * @whence is above SEEK_MAX, the (negative) vfs_llseek() result when the
 * seek fails, or -EFAULT when @result cannot be written.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
{
        struct fd f = fdget_pos(fd);
        int retval = -EINVAL;
        loff_t offset;

        if (!f.file)
                return -EBADF;

        if (whence <= SEEK_MAX) {
                offset = vfs_llseek(f.file,
                                ((loff_t) offset_high << 32) | offset_low,
                                whence);

                if (offset < 0)
                        retval = (int)offset;
                else if (copy_to_user(result, &offset, sizeof(offset)))
                        retval = -EFAULT;
                else
                        retval = 0;
        }

        fdput_pos(f);
        return retval;
}
#endif

/*
 * Sanity-check an I/O request before it is handed to the filesystem.
 *
 * @read_write selects the direction (READ or anything else for write),
 * @ppos is the starting position or NULL when the file has no position,
 * @count is the requested length.
 *
 * Returns -EINVAL when @count is negative as ssize_t or when the range
 * is negative/wraps and the file lacks FMODE_UNSIGNED_OFFSET, -EOVERFLOW
 * when a negative start plus @count would reach or cross zero, a negative
 * error from locks_mandatory_area(), or otherwise the result of
 * security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
        struct inode *inode = file_inode(file);
        const bool reading = (read_write == READ);

        if (unlikely((ssize_t) count < 0))
                return -EINVAL;

        /*
         * ranged mandatory locking does not apply to streams - it makes sense
         * only for files where position has a meaning.
         */
        if (ppos) {
                loff_t pos = *ppos;

                if (unlikely(pos < 0)) {
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                        if (count >= -pos) /* both values are in 0..LLONG_MAX */
                                return -EOVERFLOW;
                } else if (unlikely((loff_t) (pos + count) < 0)) {
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                }

                if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
                        int err = locks_mandatory_area(inode, file, pos,
                                        pos + count - 1,
                                        reading ? F_RDLCK : F_WRLCK);

                        if (err < 0)
                                return err;
                }
        }

        return security_file_permission(file, reading ? MAY_READ : MAY_WRITE);
}

/*
 * Synchronous read through call_read_iter(): the user buffer is wrapped in a
 * single-segment iov_iter and the call runs on an on-stack kiocb.  When the
 * caller passed a position, it seeds the kiocb and is written back from the
 * kiocb afterwards, whatever the result was.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec uvec = { .iov_base = buf, .iov_len = len };
        struct iov_iter to;
        struct kiocb req;
        ssize_t nread;

        init_sync_kiocb(&req, filp);
        if (ppos)
                req.ki_pos = *ppos;
        else
                req.ki_pos = 0;
        iov_iter_init(&to, READ, &uvec, 1, len);

        nread = call_read_iter(filp, &req, &to);
        /* A synchronous kiocb must never come back as queued. */
        BUG_ON(nread == -EIOCBQUEUED);
        if (ppos)
                *ppos = req.ki_pos;
        return nread;
}

/*
 * Emit a rate-limited warning that names the operation @op, the file, and
 * the pid and comm of the current task, then return -EINVAL so callers can
 * use the result directly as their error code.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        pr_warn_ratelimited(
                "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                op, file, current->pid, current->comm);
        return -EINVAL;
}

/*
 * Read into the kernel buffer @buf through ->read_iter only.  The request is
 * capped at MAX_RW_COUNT bytes.  @pos may be NULL, in which case the read
 * starts at 0 and no position is reported back.  On a positive result the
 * position is written back, an access notification is raised and the bytes
 * are accounted; the read syscall counter is bumped whenever the call is
 * actually issued.
 */
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        struct kvec kv = {
                .iov_base = buf,
                .iov_len = min_t(size_t, count, MAX_RW_COUNT),
        };
        struct iov_iter to;
        struct kiocb req;
        ssize_t nread;

        /* Missing FMODE_READ additionally triggers a one-time warning. */
        if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)) ||
            !(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        /*
         * Files that wire up both ->read and ->read_iter are refused too,
         * as that implies very convoluted semantics.
         */
        if (unlikely(file->f_op->read || !file->f_op->read_iter))
                return warn_unsupported(file, "read");

        init_sync_kiocb(&req, file);
        if (pos)
                req.ki_pos = *pos;
        else
                req.ki_pos = 0;
        iov_iter_kvec(&to, READ, &kv, 1, kv.iov_len);

        nread = file->f_op->read_iter(&req, &to);
        if (nread > 0) {
                if (pos)
                        *pos = req.ki_pos;
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        inc_syscr(current);
        return nread;
}

/*
 * Checked variant of __kernel_read(): the request first has to pass
 * rw_verify_area(); any non-zero result of that check is returned as is.
 */
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        ssize_t err = rw_verify_area(READ, file, pos, count);

        if (err)
                return err;
        return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

/*
 * Read up to @count bytes (capped at MAX_RW_COUNT) from @file into the user
 * buffer @buf.  ->read is preferred, ->read_iter is the fallback, and
 * -EINVAL is the result when neither exists.
 *
 * Returns -EBADF without FMODE_READ, -EINVAL without FMODE_CAN_READ,
 * -EFAULT when access_ok() rejects the buffer, any non-zero result of
 * rw_verify_area(), otherwise the result of the read method.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nread;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nread = rw_verify_area(READ, file, pos, count);
        if (nread)
                return nread;

        if (count > MAX_RW_COUNT)
                count = MAX_RW_COUNT;

        nread = -EINVAL;
        if (file->f_op->read)
                nread = file->f_op->read(file, buf, count, pos);
        else if (file->f_op->read_iter)
                nread = new_sync_read(file, buf, count, pos);

        /* Notification and byte accounting only for reads that moved data. */
        if (nread > 0) {
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        inc_syscr(current);
        return nread;
}

/*
 * Synchronous write through call_write_iter(): the user buffer is wrapped in
 * a single-segment iov_iter and the call runs on an on-stack kiocb.  The
 * caller's position, if any, is only updated when the write returned a
 * positive byte count.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec uvec = { .iov_base = (void __user *)buf, .iov_len = len };
        struct iov_iter from;
        struct kiocb req;
        ssize_t nwritten;

        init_sync_kiocb(&req, filp);
        if (ppos)
                req.ki_pos = *ppos;
        else
                req.ki_pos = 0;
        iov_iter_init(&from, WRITE, &uvec, 1, len);

        nwritten = call_write_iter(filp, &req, &from);
        /* A synchronous kiocb must never come back as queued. */
        BUG_ON(nwritten == -EIOCBQUEUED);
        if (ppos && nwritten > 0)
                *ppos = req.ki_pos;
        return nwritten;
}

/*
 * Write the kernel buffer @buf through ->write_iter only, capped at
 * MAX_RW_COUNT bytes.  file_start_write()/file_end_write() are NOT taken
 * here; that is the caller's job.  @pos may be NULL, in which case the write
 * starts at 0 and no position is reported back.
 *
 * Returns -EBADF (with a one-time warning) without FMODE_WRITE, -EINVAL
 * without FMODE_CAN_WRITE or when the file's methods are unsuitable,
 * otherwise the ->write_iter result.
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
        struct kvec kv = {
                .iov_base = (void *)buf,
                .iov_len = min_t(size_t, count, MAX_RW_COUNT),
        };
        struct iov_iter from;
        struct kiocb req;
        ssize_t nwritten;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        /*
         * Files that wire up both ->write and ->write_iter are refused too,
         * as that implies very convoluted semantics.
         */
        if (unlikely(file->f_op->write || !file->f_op->write_iter))
                return warn_unsupported(file, "write");

        init_sync_kiocb(&req, file);
        if (pos)
                req.ki_pos = *pos;
        else
                req.ki_pos = 0;
        iov_iter_kvec(&from, WRITE, &kv, 1, kv.iov_len);

        nwritten = file->f_op->write_iter(&req, &from);
        if (nwritten > 0) {
                if (pos)
                        *pos = req.ki_pos;
                fsnotify_modify(file);
                add_wchar(current, nwritten);
        }
        inc_syscw(current);
        return nwritten;
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

/*
 * Checked variant of __kernel_write(): the request first has to pass
 * rw_verify_area(), then the write itself is bracketed by
 * file_start_write()/file_end_write().
 */
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
                     loff_t *pos)
{
        ssize_t nwritten = rw_verify_area(WRITE, file, pos, count);

        if (nwritten)
                return nwritten;

        file_start_write(file);
        nwritten = __kernel_write(file, buf, count, pos);
        file_end_write(file);

        return nwritten;
}
EXPORT_SYMBOL(kernel_write);

/*
 * Write up to @count bytes (capped at MAX_RW_COUNT) from the user buffer
 * @buf to @file.  ->write is preferred, ->write_iter is the fallback, and
 * -EINVAL is the result when neither exists.  The method call and the
 * accounting run between file_start_write() and file_end_write().
 *
 * Returns -EBADF without FMODE_WRITE, -EINVAL without FMODE_CAN_WRITE,
 * -EFAULT when access_ok() rejects the buffer, any non-zero result of
 * rw_verify_area(), otherwise the result of the write method.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nwritten;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nwritten = rw_verify_area(WRITE, file, pos, count);
        if (nwritten)
                return nwritten;

        if (count > MAX_RW_COUNT)
                count = MAX_RW_COUNT;

        file_start_write(file);

        nwritten = -EINVAL;
        if (file->f_op->write)
                nwritten = file->f_op->write(file, buf, count, pos);
        else if (file->f_op->write_iter)
                nwritten = new_sync_write(file, buf, count, pos);

        /* Notification and byte accounting only for writes that moved data. */
        if (nwritten > 0) {
                fsnotify_modify(file);
                add_wchar(current, nwritten);
        }
        inc_syscw(current);

        file_end_write(file);
        return nwritten;
}

/*
 * Position pointer to use for @file: NULL when the file is flagged
 * FMODE_STREAM, otherwise the address of its f_pos.
 */
static inline loff_t *file_ppos(struct file *file)
{
        if (file->f_mode & FMODE_STREAM)
                return NULL;

        return &file->f_pos;
}

/*
 * Read from descriptor @fd at the file's current position.  The read works
 * on a local copy of f_pos (no position at all when file_ppos() yields
 * NULL); the copy is stored back only when vfs_read() did not fail.
 * Returns -EBADF when @fd does not resolve to a file.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t cursor, *cursorp;
        ssize_t nread;

        if (!f.file)
                return -EBADF;

        cursorp = file_ppos(f.file);
        if (cursorp) {
                cursor = *cursorp;
                cursorp = &cursor;
        }

        nread = vfs_read(f.file, buf, count, cursorp);
        if (cursorp && nread >= 0)
                f.file->f_pos = cursor;

        fdput_pos(f);
        return nread;
}

/* Syscall entry for read: passes its three arguments straight to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
        return ksys_read(fd, buf, count);
}

/*
 * Write to descriptor @fd at the file's current position.  The write works
 * on a local copy of f_pos (no position at all when file_ppos() yields
 * NULL); the copy is stored back only when vfs_write() did not fail.
 * Returns -EBADF when @fd does not resolve to a file.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t cursor, *cursorp;
        ssize_t nwritten;

        if (!f.file)
                return -EBADF;

        cursorp = file_ppos(f.file);
        if (cursorp) {
                cursor = *cursorp;
                cursorp = &cursor;
        }

        nwritten = vfs_write(f.file, buf, count, cursorp);
        if (cursorp && nwritten >= 0)
                f.file->f_pos = cursor;

        fdput_pos(f);
        return nwritten;
}

/* Syscall entry for write: passes its three arguments straight to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        return ksys_write(fd, buf, count);
}

/*
 * Positional read: read from @fd at the explicit offset @pos.  The file's
 * own f_pos is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file, -ESPIPE when the file lacks FMODE_PREAD, otherwise the
 * vfs_read() result.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
                     loff_t pos)
{
        struct fd f;
        ssize_t nread;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PREAD)
                nread = vfs_read(f.file, buf, count, &pos);
        else
                nread = -ESPIPE;

        fdput(f);
        return nread;
}

/* Syscall entry for pread64: passes its arguments straight to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
                        size_t, count, loff_t, pos)
{
        return ksys_pread64(fd, buf, count, pos);
}

/*
 * Positional write: write to @fd at the explicit offset @pos.  The file's
 * own f_pos is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file, -ESPIPE when the file lacks FMODE_PWRITE, otherwise the
 * vfs_write() result.
 */
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
                      size_t count, loff_t pos)
{
        struct fd f;
        ssize_t nwritten;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PWRITE)
                nwritten = vfs_write(f.file, buf, count, &pos);
        else
                nwritten = -ESPIPE;

        fdput(f);
        return nwritten;
}

/* Syscall entry for pwrite64: passes its arguments straight to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
                         size_t, count, loff_t, pos)
{
        return ksys_pwrite64(fd, buf, count, pos);
}

/*
 * Run one synchronous ->read_iter or ->write_iter call (picked by @type) on
 * an on-stack kiocb carrying the per-call @flags.  A non-zero result of
 * kiocb_set_rw_flags() is returned before any I/O is attempted.  When @ppos
 * is given it seeds the kiocb position and receives it back afterwards.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        struct kiocb req;
        ssize_t rc;

        init_sync_kiocb(&req, filp);
        rc = kiocb_set_rw_flags(&req, flags);
        if (rc)
                return rc;

        if (ppos)
                req.ki_pos = *ppos;
        else
                req.ki_pos = 0;

        if (type != READ)
                rc = call_write_iter(filp, &req, iter);
        else
                rc = call_read_iter(filp, &req, iter);

        /* A synchronous kiocb must never come back as queued. */
        BUG_ON(rc == -EIOCBQUEUED);
        if (ppos)
                *ppos = req.ki_pos;
        return rc;
}

/*
 * Vectored I/O done by hand: walk the iterator one segment at a time and
 * call the plain ->read or ->write method (picked by @type) on each.  Any
 * flag other than RWF_HIPRI yields -EOPNOTSUPP.  The loop ends at the first
 * error or short transfer; an error is only reported when nothing had been
 * transferred before it.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        ssize_t total = 0;

        if (flags & ~RWF_HIPRI)
                return -EOPNOTSUPP;

        while (iov_iter_count(iter)) {
                struct iovec seg = iov_iter_iovec(iter);
                ssize_t done;

                if (type == READ)
                        done = filp->f_op->read(filp, seg.iov_base,
                                                seg.iov_len, ppos);
                else
                        done = filp->f_op->write(filp, seg.iov_base,
                                                 seg.iov_len, ppos);

                if (done < 0) {
                        /* Keep the partial byte count if there is one. */
                        if (!total)
                                total = done;
                        break;
                }

                total += done;
                /* A short transfer ends the loop without advancing. */
                if (done != seg.iov_len)
                        break;
                iov_iter_advance(iter, done);
        }

        return total;
}

/*
 * Read into @iter from @file, using ->read_iter when the file has one and
 * the segment-by-segment loop otherwise.  An empty iterator skips
 * rw_verify_area() and the I/O but still raises the access notification.
 *
 * Returns -EBADF without FMODE_READ, -EINVAL without FMODE_CAN_READ, a
 * negative rw_verify_area() result, otherwise the result of the read.
 */
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
                loff_t *pos, rwf_t flags)
{
        ssize_t rc = 0;
        size_t total;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        total = iov_iter_count(iter);
        if (total) {
                rc = rw_verify_area(READ, file, pos, total);
                if (rc < 0)
                        return rc;

                if (!file->f_op->read_iter)
                        rc = do_loop_readv_writev(file, iter, pos, READ, flags);
                else
                        rc = do_iter_readv_writev(file, iter, pos, READ, flags);
        }

        if (rc >= 0)
                fsnotify_access(file);
        return rc;
}

/*
 * Read into @iter using the caller-supplied @iocb; the position checked by
 * rw_verify_area() is iocb->ki_pos.  An empty iterator skips the check and
 * the I/O but still raises the access notification.
 *
 * Returns -EINVAL without ->read_iter or FMODE_CAN_READ, -EBADF without
 * FMODE_READ, a negative rw_verify_area() result, otherwise the
 * call_read_iter() result.
 */
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter)
{
        ssize_t rc = 0;
        size_t total;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        total = iov_iter_count(iter);
        if (total) {
                rc = rw_verify_area(READ, file, &iocb->ki_pos, total);
                if (rc < 0)
                        return rc;

                rc = call_read_iter(file, iocb, iter);
        }

        if (rc >= 0)
                fsnotify_access(file);
        return rc;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

/*
 * Exported front end to do_iter_read() that insists on ->read_iter: files
 * without that method get -EINVAL.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags)
{
        if (file->f_op->read_iter)
                return do_iter_read(file, iter, ppos, flags);

        return -EINVAL;
}
EXPORT_SYMBOL(vfs_iter_read);

/*
 * Write @iter to @file, using ->write_iter when the file has one and the
 * segment-by-segment loop otherwise.  An empty iterator returns 0 without
 * any check, I/O or notification; the modify notification is raised only
 * for a positive result.
 *
 * Returns -EBADF without FMODE_WRITE, -EINVAL without FMODE_CAN_WRITE, a
 * negative rw_verify_area() result, otherwise the result of the write.
 */
static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
                loff_t *pos, rwf_t flags)
{
        size_t total;
        ssize_t rc;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        total = iov_iter_count(iter);
        if (total == 0)
                return 0;

        rc = rw_verify_area(WRITE, file, pos, total);
        if (rc < 0)
                return rc;

        if (!file->f_op->write_iter)
                rc = do_loop_readv_writev(file, iter, pos, WRITE, flags);
        else
                rc = do_iter_readv_writev(file, iter, pos, WRITE, flags);

        if (rc > 0)
                fsnotify_modify(file);
        return rc;
}

/*
 * Write @iter using the caller-supplied @iocb; the position checked by
 * rw_verify_area() is iocb->ki_pos.  An empty iterator returns 0 right
 * away; the modify notification is raised only for a positive result.
 *
 * Returns -EINVAL without ->write_iter or FMODE_CAN_WRITE, -EBADF without
 * FMODE_WRITE, a negative rw_verify_area() result, otherwise the
 * call_write_iter() result.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter)
{
        size_t total;
        ssize_t rc;

        if (!file->f_op->write_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        total = iov_iter_count(iter);
        if (total == 0)
                return 0;

        rc = rw_verify_area(WRITE, file, &iocb->ki_pos, total);
        if (rc < 0)
                return rc;

        rc = call_write_iter(file, iocb, iter);
        if (rc > 0)
                fsnotify_modify(file);
        return rc;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

/*
 * Exported front end to do_iter_write() that insists on ->write_iter: files
 * without that method get -EINVAL.
 */
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags)
{
        if (file->f_op->write_iter)
                return do_iter_write(file, iter, ppos, flags);

        return -EINVAL;
}
EXPORT_SYMBOL(vfs_iter_write);

/*
 * Import the user iovec array @vec (@vlen entries) and read into it via
 * do_iter_read().  A failed import is returned unchanged; after a successful
 * import the pointer handed back by import_iovec() is released with kfree().
 */
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                  unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iovp = fast_iov;
        struct iov_iter to;
        ssize_t rc;

        rc = import_iovec(READ, vec, vlen, ARRAY_SIZE(fast_iov), &iovp, &to);
        if (rc < 0)
                return rc;

        rc = do_iter_read(file, &to, pos, flags);
        kfree(iovp);
        return rc;
}

/*
 * Import the user iovec array @vec (@vlen entries) and write it out via
 * do_iter_write(), bracketed by file_start_write()/file_end_write().  A
 * failed import is returned unchanged; after a successful import the pointer
 * handed back by import_iovec() is released with kfree().
 */
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
                   unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iovp = fast_iov;
        struct iov_iter from;
        ssize_t rc;

        rc = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(fast_iov), &iovp, &from);
        if (rc < 0)
                return rc;

        file_start_write(file);
        rc = do_iter_write(file, &from, pos, flags);
        file_end_write(file);

        kfree(iovp);
        return rc;
}

/*
 * Vectored read at the file's current position.  The read works on a local
 * copy of f_pos (no position at all when file_ppos() yields NULL) which is
 * stored back only when vfs_readv() did not fail.  The read syscall counter
 * is bumped on every call, including the -EBADF case; bytes are accounted
 * only for a positive result.
 */
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
                        unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        loff_t cursor, *cursorp;
        ssize_t nread = -EBADF;

        if (!f.file)
                goto account;

        cursorp = file_ppos(f.file);
        if (cursorp) {
                cursor = *cursorp;
                cursorp = &cursor;
        }
        nread = vfs_readv(f.file, vec, vlen, cursorp, flags);
        if (cursorp && nread >= 0)
                f.file->f_pos = cursor;
        fdput_pos(f);

account:
        if (nread > 0)
                add_rchar(current, nread);
        inc_syscr(current);
        return nread;
}

/*
 * Vectored write at the file's current position.  The write works on a
 * local copy of f_pos (no position at all when file_ppos() yields NULL)
 * which is stored back only when vfs_writev() did not fail.  The write
 * syscall counter is bumped on every call, including the -EBADF case; bytes
 * are accounted only for a positive result.
 */
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        loff_t cursor, *cursorp;
        ssize_t nwritten = -EBADF;

        if (!f.file)
                goto account;

        cursorp = file_ppos(f.file);
        if (cursorp) {
                cursor = *cursorp;
                cursorp = &cursor;
        }
        nwritten = vfs_writev(f.file, vec, vlen, cursorp, flags);
        if (cursorp && nwritten >= 0)
                f.file->f_pos = cursor;
        fdput_pos(f);

account:
        if (nwritten > 0)
                add_wchar(current, nwritten);
        inc_syscw(current);
        return nwritten;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

/*
 * Vectored read at the explicit offset @pos; the file's own f_pos is not
 * touched here.  A negative @pos returns -EINVAL before anything is
 * accounted.  Otherwise the result is -EBADF when @fd does not resolve to a
 * file, -ESPIPE when the file lacks FMODE_PREAD, or the vfs_readv() result;
 * the read syscall counter is bumped in all three cases.
 */
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t nread = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                if (f.file->f_mode & FMODE_PREAD)
                        nread = vfs_readv(f.file, vec, vlen, &pos, flags);
                else
                        nread = -ESPIPE;
                fdput(f);
        }

        if (nread > 0)
                add_rchar(current, nread);
        inc_syscr(current);
        return nread;
}

/*
 * Vectored write at the explicit offset @pos; the file's own f_pos is not
 * touched here.  A negative @pos returns -EINVAL before anything is
 * accounted.  Otherwise the result is -EBADF when @fd does not resolve to a
 * file, -ESPIPE when the file lacks FMODE_PWRITE, or the vfs_writev()
 * result; the write syscall counter is bumped in all three cases.
 */
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
                          unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t nwritten = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                if (f.file->f_mode & FMODE_PWRITE)
                        nwritten = vfs_writev(f.file, vec, vlen, &pos, flags);
                else
                        nwritten = -ESPIPE;
                fdput(f);
        }

        if (nwritten > 0)
                add_wchar(current, nwritten);
        inc_syscw(current);
        return nwritten;
}

/* Syscall entry for readv: calls do_readv() with no per-call flags (0). */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_readv(fd, vec, vlen, 0);
}

/* Syscall entry for writev: calls do_writev() with no per-call flags (0). */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_writev(fd, vec, vlen, 0);
}

/*
 * Syscall entry for preadv: the offset arrives split into @pos_l/@pos_h, is
 * reassembled by pos_from_hilo() and handed to do_preadv() with no per-call
 * flags (0).
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * Syscall entry for preadv2: like preadv, plus per-call @flags.  A
 * reassembled offset of -1 routes the call to do_readv(), i.e. the variant
 * that works at the file's current position.
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        loff_t offset = pos_from_hilo(pos_h, pos_l);

        if (offset != -1)
                return do_preadv(fd, vec, vlen, offset, flags);

        return do_readv(fd, vec, vlen, flags);
}

/*
 * Syscall entry for pwritev: the offset arrives split into @pos_l/@pos_h, is
 * reassembled by pos_from_hilo() and handed to do_pwritev() with no per-call
 * flags (0).
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * Syscall entry for pwritev2: like pwritev, plus per-call @flags.  A
 * reassembled offset of -1 routes the call to do_writev(), i.e. the variant
 * that works at the file's current position.
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        loff_t offset = pos_from_hilo(pos_h, pos_l);

        if (offset != -1)
                return do_pwritev(fd, vec, vlen, offset, flags);

        return do_writev(fd, vec, vlen, flags);
}

/*
 * Various compat syscalls.  Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat entry for preadv64: the offset arrives as one loff_t and goes
 * straight to do_preadv() with no per-call flags (0).
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat entry for preadv: the 64-bit offset is rebuilt from its two 32-bit
 * halves and handed to do_preadv() with no per-call flags (0).
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;
        return do_preadv(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat entry for preadv64v2: loff_t offset plus per-call @flags.  An
 * offset of -1 routes the call to do_readv(), i.e. the variant that works at
 * the file's current position.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        if (pos != -1)
                return do_preadv(fd, vec, vlen, pos, flags);

        return do_readv(fd, vec, vlen, flags);
}
#endif

/*
 * Compat entry for preadv2: the 64-bit offset is rebuilt from its two 32-bit
 * halves.  A rebuilt offset of -1 routes the call to do_readv(), i.e. the
 * variant that works at the file's current position.
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
                rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;
        if (offset != -1)
                return do_preadv(fd, vec, vlen, offset, flags);

        return do_readv(fd, vec, vlen, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat entry for pwritev64: the offset arrives as one loff_t and goes
 * straight to do_pwritev() with no per-call flags (0).
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat entry for pwritev: the 64-bit offset is rebuilt from its two 32-bit
 * halves and handed to do_pwritev() with no per-call flags (0).
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;
        return do_pwritev(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat entry for pwritev64v2: loff_t offset plus per-call @flags.  An
 * offset of -1 routes the call to do_writev(), i.e. the variant that works
 * at the file's current position.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        if (pos != -1)
                return do_pwritev(fd, vec, vlen, pos, flags);

        return do_writev(fd, vec, vlen, flags);
}
#endif

/*
 * Compat entry for pwritev2: the 64-bit offset is rebuilt from its two
 * 32-bit halves.  A rebuilt offset of -1 routes the call to do_writev(),
 * i.e. the variant that works at the file's current position.
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;
        if (offset != -1)
                return do_pwritev(fd, vec, vlen, offset, flags);

        return do_writev(fd, vec, vlen, flags);
}
#endif /* CONFIG_COMPAT */

/*
 * do_sendfile - common implementation behind the sendfile entry points
 * @out_fd: descriptor to write to; its file must have FMODE_WRITE
 * @in_fd:  descriptor to read from; its file must have FMODE_READ
 * @ppos:   explicit input position (requires FMODE_PREAD on the input), or
 *          NULL to read from, and afterwards update, the input file's f_pos
 * @count:  number of bytes requested; clamped to MAX_RW_COUNT
 * @max:    limit for the input position; 0 means "use the smaller
 *          s_maxbytes of the two superblocks"
 *
 * The data is moved with do_splice_direct() between file_start_write() and
 * file_end_write() on the output file.  Positions (output f_pos and either
 * *@ppos or the input f_pos) are only updated when bytes were transferred.
 *
 * Returns the do_splice_direct() result, or -EBADF for a missing/unsuitable
 * descriptor, -ESPIPE when @ppos is given but the input lacks FMODE_PREAD,
 * a negative rw_verify_area() result, or -EOVERFLOW when the input position
 * is already at/after @max or ends up beyond it.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                             size_t count, loff_t max)
{
        struct fd in, out;
        struct inode *in_inode, *out_inode;
        loff_t pos;
        loff_t out_pos;
        ssize_t retval;
        int fl;

        /*
         * Get input file, and verify that it is ok..
         */
        retval = -EBADF;
        in = fdget(in_fd);
        if (!in.file)
                goto out;
        if (!(in.file->f_mode & FMODE_READ))
                goto fput_in;
        retval = -ESPIPE;
        if (!ppos) {
                pos = in.file->f_pos;
        } else {
                pos = *ppos;
                if (!(in.file->f_mode & FMODE_PREAD))
                        goto fput_in;
        }
        retval = rw_verify_area(READ, in.file, &pos, count);
        if (retval < 0)
                goto fput_in;
        if (count > MAX_RW_COUNT)
                count =  MAX_RW_COUNT;

        /*
         * Get output file, and verify that it is ok..
         */
        retval = -EBADF;
        out = fdget(out_fd);
        if (!out.file)
                goto fput_in;
        if (!(out.file->f_mode & FMODE_WRITE))
                goto fput_out;
        in_inode = file_inode(in.file);
        out_inode = file_inode(out.file);
        out_pos = out.file->f_pos;
        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
        if (retval < 0)
                goto fput_out;

        if (!max)
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

        /* Trim the request so the input position cannot pass @max. */
        if (unlikely(pos + count > max)) {
                retval = -EOVERFLOW;
                if (pos >= max)
                        goto fput_out;
                count = max - pos;
        }

        fl = 0;
#if 0
        /*
         * We need to debate whether we can enable this or not. The
         * man page documents EAGAIN return for the output at least,
         * and the application is arguably buggy if it doesn't expect
         * EAGAIN on a non-blocking file descriptor.
         */
        if (in.file->f_flags & O_NONBLOCK)
                fl = SPLICE_F_NONBLOCK;
#endif
        file_start_write(out.file);
        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
        file_end_write(out.file);

        /* Accounting, notifications and position updates only on progress. */
        if (retval > 0) {
                add_rchar(current, retval);
                add_wchar(current, retval);
                fsnotify_access(in.file);
                fsnotify_modify(out.file);
                out.file->f_pos = out_pos;
                if (ppos)
                        *ppos = pos;
                else
                        in.file->f_pos = pos;
        }

        inc_syscr(current);
        inc_syscw(current);
        /* This overrides the transfer result, even a positive byte count. */
        if (pos > max)
                retval = -EOVERFLOW;

fput_out:
        fdput(out);
fput_in:
        fdput(in);
out:
        return retval;
}

/*
 * Syscall entry for sendfile (off_t offset).  Without @offset the transfer
 * uses the input file's own position and no explicit limit.  With @offset
 * the start position is fetched from user space, the transfer is limited to
 * MAX_NON_LFS, and the resulting position is stored back to user space;
 * -EFAULT is returned when either user access fails.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
        loff_t cursor;
        off_t start;
        ssize_t sent;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(start, offset)))
                return -EFAULT;
        cursor = start;

        sent = do_sendfile(out_fd, in_fd, &cursor, count, MAX_NON_LFS);

        /* A failed write-back replaces whatever the transfer returned. */
        if (unlikely(put_user(cursor, offset)))
                return -EFAULT;
        return sent;
}

/*
 * Syscall entry for sendfile64 (loff_t offset).  Without @offset the
 * transfer uses the input file's own position.  With @offset the start
 * position is copied in from user space and the resulting position is
 * stored back; -EFAULT is returned when either user access fails.  No
 * explicit limit is passed in either case (max == 0).
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
        loff_t cursor;
        ssize_t sent;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&cursor, offset, sizeof(loff_t))))
                return -EFAULT;

        sent = do_sendfile(out_fd, in_fd, &cursor, count, 0);

        /* A failed write-back replaces whatever the transfer returned. */
        if (unlikely(put_user(cursor, offset)))
                return -EFAULT;
        return sent;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry for sendfile (compat_off_t offset).  Without @offset the
 * transfer uses the input file's own position and no explicit limit.  With
 * @offset the start position is fetched from user space, the transfer is
 * limited to MAX_NON_LFS, and the resulting position is stored back;
 * -EFAULT is returned when either user access fails.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
                compat_off_t __user *, offset, compat_size_t, count)
{
        loff_t cursor;
        off_t start;
        ssize_t sent;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(start, offset)))
                return -EFAULT;
        cursor = start;

        sent = do_sendfile(out_fd, in_fd, &cursor, count, MAX_NON_LFS);

        /* A failed write-back replaces whatever the transfer returned. */
        if (unlikely(put_user(cursor, offset)))
                return -EFAULT;
        return sent;
}

/*
 * Compat entry for sendfile64 (compat_loff_t offset).  Without @offset the
 * transfer uses the input file's own position.  With @offset the start
 * position is copied in from user space and the resulting position is
 * stored back; -EFAULT is returned when either user access fails.  No
 * explicit limit is passed in either case (max == 0).
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
                compat_loff_t __user *, offset, compat_size_t, count)
{
        loff_t cursor;
        ssize_t sent;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&cursor, offset, sizeof(loff_t))))
                return -EFAULT;

        sent = do_sendfile(out_fd, in_fd, &cursor, count, 0);

        /* A failed write-back replaces whatever the transfer returned. */
        if (unlikely(put_user(cursor, offset)))
                return -EFAULT;
        return sent;
}
#endif

/**
 * generic_copy_file_range - copy data between two files
 * @file_in:        file structure to read from
 * @pos_in:        file offset to read from
 * @file_out:        file structure to write data to
 * @pos_out:        file offset to write data to
 * @len:        amount of data to copy
 * @flags:        copy flags
 *
 * Generic filesystem helper that copies from one file to another by way of
 * do_splice_direct().  It places no constraints on the owners of the source
 * or destination file - they may live on different superblocks and on
 * different filesystem types.  At most MAX_RW_COUNT bytes are requested per
 * call, so short copies are allowed.  @flags is not consulted here.
 *
 * This should be called from the @file_out filesystem, as per the
 * ->copy_file_range() method.
 *
 * Returns the number of bytes copied or a negative error indicating the
 * failure.
 */

ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
                                struct file *file_out, loff_t pos_out,
                                size_t len, unsigned int flags)
{
        size_t chunk = len;

        if (chunk > MAX_RW_COUNT)
                chunk = MAX_RW_COUNT;

        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
                                chunk, 0);
}
EXPORT_SYMBOL(generic_copy_file_range);

/*
 * Validate a file copy request before any data is moved.
 *
 * The byte count behind @req_count may be reduced (copy shortened to the
 * end of the source file and to the destination's write limits).
 *
 * Returns zero when the copy may proceed, with the possibly reduced count
 * stored back through @req_count, or a negative error code that the caller
 * should return as-is (in which case @req_count is left unmodified).
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    size_t *req_count, unsigned int flags)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
        uint64_t nbytes = *req_count;
        loff_t src_size;
        int err;

        err = generic_file_rw_checks(file_in, file_out);
        if (err)
                return err;

        /*
         * We allow some filesystems to handle cross sb copy, but passing
         * a file of the wrong filesystem type to filesystem driver can result
         * in an attempt to dereference the wrong type of ->private_data, so
         * avoid doing that until we really have a good reason.
         *
         * nfs and cifs define several different file_system_type structures
         * and several different sets of file_operations, but they all end up
         * using the same ->copy_file_range() function pointer.
         *
         * A COPY_FILE_SPLICE request skips this test altogether: cross sb
         * splice is allowed.
         */
        if (!(flags & COPY_FILE_SPLICE)) {
                if (file_out->f_op->copy_file_range) {
                        if (file_in->f_op->copy_file_range !=
                            file_out->f_op->copy_file_range)
                                return -EXDEV;
                } else if (inode_in->i_sb != inode_out->i_sb) {
                        return -EXDEV;
                }
        }

        /* Immutable destinations and swapfiles on either side are refused. */
        if (IS_IMMUTABLE(inode_out))
                return -EPERM;

        if (IS_SWAPFILE(inode_in))
                return -ETXTBSY;
        if (IS_SWAPFILE(inode_out))
                return -ETXTBSY;

        /* Neither offset may wrap once the byte count is added. */
        if (pos_in + nbytes < pos_in)
                return -EOVERFLOW;
        if (pos_out + nbytes < pos_out)
                return -EOVERFLOW;

        /* Do not read past the end of the source file. */
        src_size = i_size_read(inode_in);
        if (pos_in < src_size)
                nbytes = min(nbytes, src_size - (uint64_t)pos_in);
        else
                nbytes = 0;

        err = generic_write_check_limits(file_out, pos_out, &nbytes);
        if (err)
                return err;

        /* Source and destination ranges within one file must not overlap. */
        if (inode_in == inode_out) {
                if (pos_out + nbytes > pos_in && pos_out < pos_in + nbytes)
                        return -EINVAL;
        }

        *req_count = nbytes;
        return 0;
}

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows return partial success.  When it does so is up to
 * the copy_file_range method.
 *
 * The only flag accepted here is COPY_FILE_SPLICE; any other bit yields
 * -EINVAL.  Returns the number of bytes copied (zero when the checked
 * length ends up being zero) or a negative error code.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            size_t len, unsigned int flags)
{
        ssize_t ret;
        bool splice = flags & COPY_FILE_SPLICE;

        if (flags & ~COPY_FILE_SPLICE)
                return -EINVAL;

        /* May shorten len; the reduced value is used from here on. */
        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
                                       flags);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(READ, file_in, &pos_in, len);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
        if (unlikely(ret))
                return ret;

        /* Nothing left to copy after the checks: succeed without touching the files. */
        if (len == 0)
                return 0;

        /* Paired with file_end_write() on the common exit path below. */
        file_start_write(file_out);

        /*
         * Cloning is supported by more file systems, so we implement copy on
         * same sb using clone, but for filesystems where both clone and copy
         * are supported (e.g. nfs,cifs), we only call the copy method.
         */
        if (!splice && file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
                goto done;
        }

        if (!splice && file_in->f_op->remap_file_range &&
            file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,
                                file_out, pos_out,
                                min_t(loff_t, MAX_RW_COUNT, len),
                                REMAP_FILE_CAN_SHORTEN);
                /* A zero or negative result falls through to the generic copy. */
                if (ret > 0)
                        goto done;
        }

        /*
         * We can get here for same sb copy of filesystems that do not implement
         * ->copy_file_range() in case filesystem does not support clone or in
         * case filesystem supports clone but rejected the clone request (e.g.
         * because it was not block aligned).
         *
         * In both cases, fall back to kernel copy so we are able to maintain a
         * consistent story about which filesystems support copy_file_range()
         * and which filesystems do not, that will allow userspace tools to
         * make consistent decisions w.r.t using copy_file_range().
         *
         * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
         */
        ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
                                      flags);

done:
        /* Notifications and byte accounting only when data was actually copied. */
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }

        /* The read/write syscall counters are bumped whatever the outcome. */
        inc_syscr(current);
        inc_syscw(current);

        file_end_write(file_out);

        return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

/*
 * copy_file_range() system call entry point.
 *
 * For each side, a non-NULL user offset pointer supplies the starting
 * position and receives the advanced position after a successful copy; a
 * NULL pointer means the file's own f_pos is used and advanced instead.
 * @flags must be zero.  Returns what vfs_copy_file_range() returned, or
 * -EBADF / -EFAULT / -EINVAL for a bad descriptor, a faulting user pointer
 * or a non-zero @flags respectively.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        struct fd f_in;
        struct fd f_out;
        loff_t pos_in;
        loff_t pos_out;
        ssize_t ret;

        f_in = fdget(fd_in);
        if (!f_in.file)
                return -EBADF;

        f_out = fdget(fd_out);
        if (!f_out.file) {
                ret = -EBADF;
                goto put_in;
        }

        /* Starting positions: user-supplied offset or the file position. */
        if (!off_in) {
                pos_in = f_in.file->f_pos;
        } else if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) {
                ret = -EFAULT;
                goto put_out;
        }

        if (!off_out) {
                pos_out = f_out.file->f_pos;
        } else if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) {
                ret = -EFAULT;
                goto put_out;
        }

        /* No flags are accepted from userspace. */
        if (flags != 0) {
                ret = -EINVAL;
                goto put_out;
        }

        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
                                  flags);
        if (ret <= 0)
                goto put_out;

        /* Advance both positions by the number of bytes copied. */
        pos_in += ret;
        pos_out += ret;

        if (!off_in)
                f_in.file->f_pos = pos_in;
        else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
                ret = -EFAULT;

        if (!off_out)
                f_out.file->f_pos = pos_out;
        else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
                ret = -EFAULT;

put_out:
        fdput(f_out);
put_in:
        fdput(f_in);
        return ret;
}

/*
 * Clamp a write of *@count bytes at @pos to what the file may hold.
 *
 * Two ceilings apply: the RLIMIT_FSIZE resource limit (when it is not
 * RLIM_INFINITY) and the largest size the page cache / LFS rules allow
 * (the superblock's s_maxbytes, or MAX_NON_LFS without O_LARGEFILE).  A
 * position below a ceiling turns the request into a short access; a position
 * at or beyond one fails with -EFBIG, and for the resource limit SIGXFSZ is
 * sent to the current task as well.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
        struct inode *inode = file->f_mapping->host;
        loff_t max_size = inode->i_sb->s_maxbytes;
        loff_t fsize_limit = rlimit(RLIMIT_FSIZE);
        loff_t room;

        if (fsize_limit != RLIM_INFINITY) {
                if (pos >= fsize_limit) {
                        send_sig(SIGXFSZ, current, 0);
                        return -EFBIG;
                }

                room = fsize_limit - pos;
                if (room < *count)
                        *count = room;
        }

        if (!(file->f_flags & O_LARGEFILE))
                max_size = MAX_NON_LFS;

        if (unlikely(pos >= max_size))
                return -EFBIG;

        room = max_size - pos;
        if (room < *count)
                *count = room;

        return 0;
}

/*
 * Common checks performed before a write.
 *
 * May move the write position (IOCB_APPEND sets it to the current inode
 * size) and may truncate @from to the number of bytes that may be written.
 *
 * Returns the number of bytes left in @from (zero for an empty request), or
 * a negative error code that the caller should return.
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct inode *host = filp->f_mapping->host;
        loff_t nbytes;
        int err;

        if (IS_SWAPFILE(host))
                return -ETXTBSY;

        nbytes = iov_iter_count(from);
        if (!nbytes)
                return 0;

        /* FIXME: this is for backwards compatibility with 2.4 */
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(host);

        /* IOCB_NOWAIT is only accepted together with IOCB_DIRECT. */
        if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
                return -EINVAL;

        err = generic_write_check_limits(filp, iocb->ki_pos, &nbytes);
        if (err)
                return err;

        iov_iter_truncate(from, nbytes);
        return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Checks shared by file copy and clone operations going from @file_in to
 * @file_out.
 *
 * Returns 0 when both files are acceptable, -EISDIR when either one is a
 * directory, -EINVAL when either one is not a regular file, and -EBADF when
 * @file_in is not open for reading, or @file_out is not open for writing or
 * was opened with O_APPEND.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
        struct inode *src = file_inode(file_in);
        struct inode *dst = file_inode(file_out);

        /* Don't copy dirs, pipes, sockets... */
        if (S_ISDIR(src->i_mode))
                return -EISDIR;
        if (S_ISDIR(dst->i_mode))
                return -EISDIR;

        if (!S_ISREG(src->i_mode))
                return -EINVAL;
        if (!S_ISREG(dst->i_mode))
                return -EINVAL;

        /* Access mode checks on the open files themselves. */
        if (!(file_in->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file_out->f_mode & FMODE_WRITE))
                return -EBADF;
        if (file_out->f_flags & O_APPEND)
                return -EBADF;

        return 0;
}

















































































































































    1 

    1 

    1 













































































































































































    2 

    2 
    2 


    2 















    2 

































































































































































































































































































































    1 





































































































































































































































































































































    1 




    1 

    1 












    1 






    1 






    1 




















































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 


    1 
    1 





































    1 
    1 























































































































    1 



























































    1 

















    1 

















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
// SPDX-License-Identifier: GPL-2.0
/*
 *  gendisk handling
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/kobj_map.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>

#include "blk.h"

/*
 * NOTE(review): the users of block_class_lock and block_depr are not visible
 * in this part of the file - confirm what the mutex protects and what the
 * kobject represents before relying on either.
 */
static DEFINE_MUTEX(block_class_lock);
static struct kobject *block_depr;

/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT                (1 << MINORBITS)

/*
 * For extended devt allocation.  ext_devt_lock prevents look up
 * results from going away underneath its user.
 */
static DEFINE_SPINLOCK(ext_devt_lock);
static DEFINE_IDR(ext_devt_idr);

/*
 * Forward declarations of the file-local disk event helpers, so that they
 * can be referenced ahead of their definitions.
 */
static void disk_check_events(struct disk_events *ev,
                              unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);

/*
 * Update the capacity of @disk to @size sectors and revalidate the disk
 * size.  A KOBJ_CHANGE uevent carrying "RESIZE=1" is emitted only for a real
 * resize, i.e. when the capacity changes and neither the old nor the new
 * value is zero.
 *
 * Returns true if the uevent was sent, otherwise false.
 */
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
        char *envp[] = { "RESIZE=1", NULL };
        sector_t old_size = get_capacity(disk);

        set_capacity(disk, size);
        revalidate_disk_size(disk, true);

        /* Unchanged size, or a transition from/to zero: stay silent. */
        if (old_size == size || old_size == 0 || size == 0)
                return false;

        kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
        return true;
}
EXPORT_SYMBOL_GPL(set_capacity_and_notify);

/*
 * Format the device name of the indicated disk into the supplied buffer and
 * return a pointer to that same buffer for convenience.
 *
 * @hd:     disk whose ->disk_name is the base of the result
 * @partno: partition number; 0 selects the bare disk name
 * @buf:    output buffer, written with at most BDEVNAME_SIZE bytes
 *
 * For a non-zero @partno the partition number is appended to the disk name,
 * separated by a 'p' when the disk name itself ends in a digit (so that the
 * two numbers cannot run together).
 */
char *disk_name(struct gendisk *hd, int partno, char *buf)
{
        size_t len = strlen(hd->disk_name);

        if (!partno)
                snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
        /*
         * Only look at the last character when there is one: with an empty
         * disk name "len - 1" would wrap around and index far outside the
         * array.
         */
        else if (len && isdigit(hd->disk_name[len - 1]))
                snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
        else
                snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

        return buf;
}

/*
 * Format the name of @bdev into @buf by passing its disk and partition
 * number to disk_name(); returns what disk_name() returns.
 */
const char *bdevname(struct block_device *bdev, char *buf)
{
        const char *name;

        name = disk_name(bdev->bd_disk, bdev->bd_partno, buf);
        return name;
}
EXPORT_SYMBOL(bdevname);

/*
 * Sum the per-cpu statistics of @part over every possible CPU into @stat.
 * @stat is zeroed first, so any previous contents are discarded.
 */
static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
{
        int cpu, grp;

        memset(stat, 0, sizeof(*stat));

        for_each_possible_cpu(cpu) {
                struct disk_stats *cpu_stat = per_cpu_ptr(part->dkstats, cpu);

                stat->io_ticks += cpu_stat->io_ticks;

                /* Per stat-group counters. */
                for (grp = 0; grp < NR_STAT_GROUPS; grp++) {
                        stat->ios[grp] += cpu_stat->ios[grp];
                        stat->merges[grp] += cpu_stat->merges[grp];
                        stat->sectors[grp] += cpu_stat->sectors[grp];
                        stat->nsecs[grp] += cpu_stat->nsecs[grp];
                }
        }
}

/*
 * Return the combined in_flight[0] + in_flight[1] count of @part, summed
 * over every possible CPU.  A sum that is negative when viewed as a signed
 * int is reported as zero.
 */
static unsigned int part_in_flight(struct hd_struct *part)
{
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                total += part_stat_local_read_cpu(part, in_flight[0], cpu) +
                         part_stat_local_read_cpu(part, in_flight[1], cpu);
        }

        return (int)total < 0 ? 0 : total;
}

/*
 * Like part_in_flight(), but keeps the two in_flight counters of @part
 * apart: inflight[0] receives the per-cpu sum of in_flight[0] and inflight[1]
 * the sum of in_flight[1].  Each sum is clamped to zero if it is negative
 * when viewed as a signed int.
 */
static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2])
{
        unsigned int sum0 = 0;
        unsigned int sum1 = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                sum0 += part_stat_local_read_cpu(part, in_flight[0], cpu);
                sum1 += part_stat_local_read_cpu(part, in_flight[1], cpu);
        }

        inflight[0] = (int)sum0 < 0 ? 0 : sum0;
        inflight[1] = (int)sum1 < 0 ? 0 : sum1;
}

/*
 * Look up slot @partno in the partition table of @disk.  Returns the entry
 * stored there (which may be NULL), or NULL when @partno lies outside the
 * table.  No reference is taken here; both accesses go through
 * rcu_dereference().
 */
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
{
        struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);

        if (likely(partno >= 0 && partno < ptbl->len))
                return rcu_dereference(ptbl->part[partno]);

        return NULL;
}

/**
 * disk_get_part - get partition
 * @disk: disk to look partition from
 * @partno: partition number
 *
 * Look for partition @partno from @disk.  If found, increment
 * reference count and return it.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * Pointer to the found partition on success, NULL if not found.
 */
struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
{
        struct hd_struct *part;

        rcu_read_lock();
        part = __disk_get_part(disk, partno);
        if (part)
                get_device(part_to_dev(part));
        rcu_read_unlock();

        return part;
}

/**
 * disk_part_iter_init - initialize partition iterator
 * @piter: iterator to initialize
 * @disk: disk to iterate over
 * @flags: DISK_PITER_* flags
 *
 * Prepare @piter for walking the partitions of @disk.  The starting index
 * is the last table slot for DISK_PITER_REVERSE, slot 0 when either
 * DISK_PITER_INCL_PART0 or DISK_PITER_INCL_EMPTY_PART0 is set, and slot 1
 * otherwise.  No partition is held after initialization.
 *
 * CONTEXT:
 * Don't care.
 */
void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
                          unsigned int flags)
{
        const unsigned int part0_mask = DISK_PITER_INCL_PART0 |
                                        DISK_PITER_INCL_EMPTY_PART0;
        struct disk_part_tbl *ptbl;

        rcu_read_lock();
        ptbl = rcu_dereference(disk->part_tbl);

        piter->disk = disk;
        piter->flags = flags;
        piter->part = NULL;

        if (flags & DISK_PITER_REVERSE) {
                /* the table length is only needed for reverse walks */
                piter->idx = ptbl->len - 1;
        } else {
                piter->idx = (flags & part0_mask) ? 0 : 1;
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_part_iter_init);

/**
 * disk_part_iter_next - proceed iterator to the next partition and return it
 * @piter: iterator of interest
 *
 * Proceed @piter to the next partition and return it.  The partition handed
 * out by the previous call (if any) is released first.  Zero-sized
 * partitions are skipped unless DISK_PITER_INCL_EMPTY is set, or the
 * partition is at index 0 and DISK_PITER_INCL_EMPTY_PART0 is set.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * The next partition, with a reference taken through get_device(), or NULL
 * once the iteration has reached its end.
 */
struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
{
        struct disk_part_tbl *ptbl;
        int inc, end;

        /* put the last partition */
        disk_put_part(piter->part);
        piter->part = NULL;

        /* get part_tbl */
        rcu_read_lock();
        ptbl = rcu_dereference(piter->disk->part_tbl);

        /*
         * determine iteration parameters: @inc is the step direction and
         * @end the first index that must not be visited any more
         */
        if (piter->flags & DISK_PITER_REVERSE) {
                inc = -1;
                /* a reverse walk stops before index 0 unless part0 is wanted */
                if (piter->flags & (DISK_PITER_INCL_PART0 |
                                    DISK_PITER_INCL_EMPTY_PART0))
                        end = -1;
                else
                        end = 0;
        } else {
                inc = 1;
                end = ptbl->len;
        }

        /* iterate to the next partition */
        for (; piter->idx != end; piter->idx += inc) {
                struct hd_struct *part;

                part = rcu_dereference(ptbl->part[piter->idx]);
                /* unpopulated slot */
                if (!part)
                        continue;
                /* take the reference before inspecting the partition */
                get_device(part_to_dev(part));
                piter->part = part;
                if (!part_nr_sects_read(part) &&
                    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
                    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
                      piter->idx == 0)) {
                        /* empty partition the caller did not ask for: drop it */
                        put_device(part_to_dev(part));
                        piter->part = NULL;
                        continue;
                }

                /*
                 * step past the partition being returned by hand; the break
                 * below skips the loop's own increment
                 */
                piter->idx += inc;
                break;
        }

        rcu_read_unlock();

        return piter->part;
}
EXPORT_SYMBOL_GPL(disk_part_iter_next);

/**
 * disk_part_iter_exit - finish up partition iteration
 * @piter: iter of interest
 *
 * Called when iteration is over.  Releases, via disk_put_part(), whatever
 * partition @piter still holds and clears the iterator's partition pointer.
 *
 * CONTEXT:
 * Don't care.
 */
void disk_part_iter_exit(struct disk_part_iter *piter)
{
        struct hd_struct *held = piter->part;

        disk_put_part(held);
        piter->part = NULL;
}
EXPORT_SYMBOL_GPL(disk_part_iter_exit);

/*
 * Return non-zero if @sector falls inside @part, i.e. within
 * [start_sect, start_sect + number of sectors); zero otherwise.
 */
static inline int sector_in_part(struct hd_struct *part, sector_t sector)
{
        if (sector < part->start_sect)
                return 0;

        return sector < part->start_sect + part_nr_sects_read(part);
}

/**
 * disk_map_sector_rcu - map sector to partition
 * @disk: gendisk of interest
 * @sector: sector to map
 *
 * Find out which partition @sector maps to on @disk.  This is
 * primarily used for stats accounting.
 *
 * The partition table's last_lookup entry is tried first; on a miss the
 * table is scanned from index 1 and a successful match replaces
 * last_lookup.
 *
 * CONTEXT:
 * RCU read locked.  The returned partition pointer is always valid
 * because its refcount is grabbed except for part0, which lifetime
 * is same with the disk.
 * NOTE(review): the function also takes rcu_read_lock() itself around the
 * lookup - confirm whether callers are still required to hold it.
 *
 * RETURNS:
 * Found partition on success, part0 is returned if no partition matches
 * or the matched partition is being deleted.
 */
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
{
        struct disk_part_tbl *ptbl;
        struct hd_struct *part;
        int i;

        rcu_read_lock();
        ptbl = rcu_dereference(disk->part_tbl);

        /* fast path: the partition found by the previous lookup */
        part = rcu_dereference(ptbl->last_lookup);
        if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
                goto out_unlock;

        /* slow path: scan the table, skipping the whole device at index 0 */
        for (i = 1; i < ptbl->len; i++) {
                part = rcu_dereference(ptbl->part[i]);

                if (part && sector_in_part(part, sector)) {
                        /*
                         * only live partition can be cached for lookup,
                         * so use-after-free on cached & deleting partition
                         * can be avoided
                         */
                        /* no reference obtained: stop scanning, use part0 */
                        if (!hd_struct_try_get(part))
                                break;
                        rcu_assign_pointer(ptbl->last_lookup, part);
                        goto out_unlock;
                }
        }

        /* no match, or the match could not be referenced */
        part = &disk->part0;
out_unlock:
        rcu_read_unlock();
        return part;
}

/**
 * disk_has_partitions
 * @disk: gendisk of interest
 *
 * Walk through the partition table and check if valid partition exists.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * True if the gendisk has at least one valid non-zero size partition.
 * Otherwise false.
 */
bool disk_has_partitions(struct gendisk *disk)
{
        struct disk_part_tbl *ptbl;
        int i;
        bool ret = false;

        rcu_read_lock();
        ptbl = rcu_dereference(disk->part_tbl);

        /* Iterate partitions skipping the whole device at index 0 */
        for (i = 1; i < ptbl->len; i++) {
                if (rcu_dereference(ptbl->part[i])) {
                        ret = true;
                        break;
                }
        }

        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(disk_has_partitions);

/*
 * Registry of block major numbers: an array of singly linked chains of
 * (major, name) entries, with the chain picked by major_to_index().
 *
 * Can be deleted altogether. Later.
 */
#define BLKDEV_MAJOR_HASH_SIZE 255
static struct blk_major_name {
        struct blk_major_name *next;
        int major;
        char name[16];
} *major_names[BLKDEV_MAJOR_HASH_SIZE];

/*
 * Chain index in major_names[] for @major - for now: assume no multimajor
 * ranges.
 */
static inline int major_to_index(unsigned major)
{
        unsigned bucket = major % BLKDEV_MAJOR_HASH_SIZE;

        return bucket;
}

#ifdef CONFIG_PROC_FS
/*
 * Print one "<major> <name>" line to @seqf for every registered entry whose
 * major equals @offset.  The chain is walked with block_class_lock held.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
        struct blk_major_name *entry;

        mutex_lock(&block_class_lock);
        entry = major_names[major_to_index(offset)];
        while (entry) {
                if (entry->major == offset)
                        seq_printf(seqf, "%3d %s\n",
                                   entry->major, entry->name);
                entry = entry->next;
        }
        mutex_unlock(&block_class_lock);
}
#endif /* CONFIG_PROC_FS */

/**
 * register_blkdev - register a new block device
 *
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
 * @name: the name of the new block device as a zero terminated string
 *
 * The @name must be unique within the system.  It is copied into a
 * fixed-size buffer, so an overlong name is stored truncated.
 *
 * With @major = 0 the major_names[] table is searched from its highest
 * index downwards for an empty chain; index 0 is never handed out.
 *
 * The return value depends on the @major input parameter:
 *
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
 *  - if any unused major number was requested with @major = 0 parameter
 *    then the return value is the allocated major number in range
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * Error codes produced here: -EBUSY (no free major, or @major already
 * registered), -EINVAL (@major >= BLKDEV_MAJOR_MAX), -ENOMEM.
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
 */
int register_blkdev(unsigned int major, const char *name)
{
        struct blk_major_name **slot, *entry;
        int index, ret = 0;

        mutex_lock(&block_class_lock);

        /* temporary */
        if (!major) {
                index = ARRAY_SIZE(major_names) - 1;
                while (index > 0 && major_names[index])
                        index--;

                if (!index) {
                        printk("%s: failed to get major for %s\n",
                               __func__, name);
                        ret = -EBUSY;
                        goto out;
                }
                major = index;
                ret = major;
        }

        if (major >= BLKDEV_MAJOR_MAX) {
                pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
                       __func__, major, BLKDEV_MAJOR_MAX-1, name);

                ret = -EINVAL;
                goto out;
        }

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry) {
                ret = -ENOMEM;
                goto out;
        }

        entry->major = major;
        strlcpy(entry->name, name, sizeof(entry->name));
        entry->next = NULL;
        index = major_to_index(major);

        /* Stop at an existing entry for @major, or at the chain's tail. */
        slot = &major_names[index];
        while (*slot && (*slot)->major != major)
                slot = &(*slot)->next;

        if (*slot)
                ret = -EBUSY;
        else
                *slot = entry;

        if (ret < 0) {
                printk("register_blkdev: cannot get major %u for %s\n",
                       major, name);
                kfree(entry);
        }
out:
        mutex_unlock(&block_class_lock);
        return ret;
}

EXPORT_SYMBOL(register_blkdev);

/*
 * Remove the registry entry for @major.  The entry is unlinked and freed
 * only when one exists for @major and its stored name equals @name;
 * otherwise a warning is raised and nothing is removed.
 */
void unregister_blkdev(unsigned int major, const char *name)
{
        struct blk_major_name **link;
        struct blk_major_name *victim = NULL;

        mutex_lock(&block_class_lock);
        link = &major_names[major_to_index(major)];
        while (*link && (*link)->major != major)
                link = &(*link)->next;

        if (*link && !strcmp((*link)->name, name)) {
                victim = *link;
                *link = victim->next;
        } else {
                WARN_ON(1);
        }
        mutex_unlock(&block_class_lock);

        /* Freed outside the lock; kfree(NULL) is harmless on the warn path. */
        kfree(victim);
}

EXPORT_SYMBOL(unregister_blkdev);

/*
 * dev_t -> kobject map for block devices.  Created in genhd_device_init(),
 * filled and emptied by blk_register_region()/blk_unregister_region(), and
 * consulted by get_gendisk() through kobj_lookup().
 */
static struct kobj_map *bdev_map;

/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * When CONFIG_DEBUG_BLOCK_EXT_DEVT is set, mirror the low MINORBITS bits of
 * @minor: bit i trades places with bit (MINORBITS - 1 - i).  Because the
 * operation is a pure bit reversal, mangling twice gives the original
 * value.  Without that option @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
        int bit;

        for (bit = 0; bit < MINORBITS / 2; bit++) {
                int mirror = MINORBITS - 1 - bit;
                int shift = mirror - bit;
                int lo = minor & (1 << bit);
                int hi = minor & (1 << mirror);

                /* drop both bits, then put each back at the other's place */
                minor &= ~(lo | hi);
                minor |= (lo << shift) | (hi >> shift);
        }
#endif
        return minor;
}

/**
 * blk_alloc_devt - allocate a dev_t for a partition
 * @part: partition to allocate dev_t for
 * @devt: out parameter for resulting dev_t
 *
 * A partition whose number is below the disk's ->minors gets the fixed
 * dev_t (disk major, first_minor + partno).  Any other partition gets an
 * id from ext_devt_idr, which is mangled with blk_mangle_minor() and
 * combined with BLOCK_EXT_MAJOR.
 *
 * RETURNS:
 * 0 on success, allocated dev_t is returned in *@devt.  -errno on
 * failure; -ENOSPC from the idr is reported as -EBUSY.
 *
 * CONTEXT:
 * Might sleep.
 */
int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
{
        struct gendisk *disk = part_to_disk(part);
        int id;

        /* in consecutive minor range? */
        if (part->partno < disk->minors) {
                *devt = MKDEV(disk->major, disk->first_minor + part->partno);
                return 0;
        }

        /* allocate ext devt: preload first, allocate under the spinlock */
        idr_preload(GFP_KERNEL);
        spin_lock_bh(&ext_devt_lock);
        id = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
        spin_unlock_bh(&ext_devt_lock);
        idr_preload_end();

        if (id == -ENOSPC)
                return -EBUSY;
        if (id < 0)
                return id;

        *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(id));
        return 0;
}

/**
 * blk_free_devt - free a dev_t
 * @devt: dev_t to free
 *
 * Free @devt which was allocated using blk_alloc_devt().  Only a @devt
 * on BLOCK_EXT_MAJOR has an idr entry to remove; MKDEV(0, 0) and any
 * other major are left alone.
 *
 * CONTEXT:
 * Might sleep.
 */
void blk_free_devt(dev_t devt)
{
        int id;

        if (devt == MKDEV(0, 0) || MAJOR(devt) != BLOCK_EXT_MAJOR)
                return;

        /* undo the mangling applied at allocation time to get the idr id */
        id = blk_mangle_minor(MINOR(devt));

        spin_lock_bh(&ext_devt_lock);
        idr_remove(&ext_devt_idr, id);
        spin_unlock_bh(&ext_devt_lock);
}

/*
 * Invalidate @devt by replacing its pointer in ext_devt_idr with NULL.
 * The id itself stays allocated.  A @devt outside BLOCK_EXT_MAJOR has no
 * idr entry and is ignored.
 */
void blk_invalidate_devt(dev_t devt)
{
        if (MAJOR(devt) != BLOCK_EXT_MAJOR)
                return;

        spin_lock_bh(&ext_devt_lock);
        idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt)));
        spin_unlock_bh(&ext_devt_lock);
}

/*
 * Format @devt in hex into @buf (at most BDEVT_SIZE bytes) and return @buf.
 * When both major and minor fit in 8 bits the short "MMmm" form is used,
 * left-justified and padded to 9 columns; otherwise "MMM:mmmmm" is used.
 */
static char *bdevt_str(dev_t devt, char *buf)
{
        char compact[BDEVT_SIZE];

        if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
                snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
                return buf;
        }

        snprintf(compact, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
        snprintf(buf, BDEVT_SIZE, "%-9s", compact);
        return buf;
}

/*
 * Register device numbers dev..(dev+range-1) in bdev_map.
 * range must be nonzero.
 * The hash chain is sorted on range, so that subranges can override.
 *
 * All arguments are handed to kobj_map() unchanged.
 */
void blk_register_region(dev_t devt, unsigned long range, struct module *module,
                         struct kobject *(*probe)(dev_t, int *, void *),
                         int (*lock)(dev_t, void *), void *data)
{
        kobj_map(bdev_map, devt, range,
                 module, probe, lock, data);
}

EXPORT_SYMBOL(blk_register_region);

/*
 * Drop device numbers devt..(devt+range-1) from bdev_map again; the
 * counterpart of blk_register_region().
 */
void blk_unregister_region(dev_t devt, unsigned long range)
{
        kobj_unmap(bdev_map,
                   devt, range);
}

EXPORT_SYMBOL(blk_unregister_region);

/*
 * Probe callback registered by __device_add_disk(): @data is the gendisk,
 * and the kobject of its device is returned.  @devt and @partno are unused.
 */
static struct kobject *exact_match(dev_t devt, int *partno, void *data)
{
        struct gendisk *disk = data;
        struct device *ddev = disk_to_dev(disk);

        return &ddev->kobj;
}

/*
 * Lock callback registered by __device_add_disk(): @data is the gendisk.
 * Returns 0 when get_disk_and_module() succeeds and -1 when it does not.
 */
static int exact_lock(dev_t devt, void *data)
{
        struct gendisk *disk = data;

        return get_disk_and_module(disk) ? 0 : -1;
}

/*
 * Request a partition scan of @disk: set GD_NEED_PART_SCAN and then open
 * and immediately release the whole-disk block device.  Nothing is done
 * for a disk with zero capacity or with partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
        struct block_device *bdev;

        if (!get_capacity(disk))
                return;
        if (!disk_part_scan_enabled(disk))
                return;

        set_bit(GD_NEED_PART_SCAN, &disk->state);

        bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
        if (IS_ERR(bdev))
                return;

        blkdev_put(bdev, FMODE_READ);
}

/*
 * register_disk - add the disk's device and announce it
 * @parent: device to set as the parent of the disk's device
 * @disk: disk being registered
 * @groups: optional extra attribute groups for the disk's device
 *
 * Names the device after @disk->disk_name, adds it with uevents suppressed,
 * creates the link under block_depr (unless sysfs_deprecated) and the
 * "holders"/"slaves" kobjects, scans for partitions, and only then emits
 * KOBJ_ADD for the disk followed by KOBJ_ADD for each partition found.
 *
 * Disks flagged GENHD_FL_HIDDEN are neither scanned nor announced.
 *
 * No error is reported to the caller: if device_add() or the block_depr
 * link creation fails the function simply returns early.
 */
static void register_disk(struct device *parent, struct gendisk *disk,
                          const struct attribute_group **groups)
{
        struct device *ddev = disk_to_dev(disk);
        struct disk_part_iter piter;
        struct hd_struct *part;
        int err;

        ddev->parent = parent;

        dev_set_name(ddev, "%s", disk->disk_name);

        /* delay uevents, until we scanned partition table */
        dev_set_uevent_suppress(ddev, 1);

        if (groups) {
                WARN_ON(ddev->groups);
                ddev->groups = groups;
        }
        if (device_add(ddev))
                return;
        if (!sysfs_deprecated) {
                err = sysfs_create_link(block_depr, &ddev->kobj,
                                        kobject_name(&ddev->kobj));
                if (err) {
                        device_del(ddev);
                        return;
                }
        }

        /*
         * avoid probable deadlock caused by allocating memory with
         * GFP_KERNEL in runtime_resume callback of its all ancestor
         * devices
         */
        pm_runtime_set_memalloc_noio(ddev, true);

        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

        if (disk->flags & GENHD_FL_HIDDEN) {
                /*
                 * Hidden disks skip the scan and the announcement below,
                 * but the suppression set above must still be lifted;
                 * otherwise every later uevent for this device would be
                 * dropped as well.
                 */
                dev_set_uevent_suppress(ddev, 0);
                return;
        }

        disk_scan_partitions(disk);

        /* announce disk after possible partitions are created */
        dev_set_uevent_suppress(ddev, 0);
        kobject_uevent(&ddev->kobj, KOBJ_ADD);

        /* announce possible partitions */
        disk_part_iter_init(&piter, disk, 0);
        while ((part = disk_part_iter_next(&piter)))
                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
        disk_part_iter_exit(&piter);

        /* link to the backing device info's device, if it has one */
        if (disk->queue->backing_dev_info->dev) {
                err = sysfs_create_link(&ddev->kobj,
                          &disk->queue->backing_dev_info->dev->kobj,
                          "bdi");
                WARN_ON(err);
        }
}

/**
 * __device_add_disk - add disk information to kernel list
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 * @groups: Additional per-device sysfs groups
 * @register_queue: register the queue if set to true
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 *
 * When @register_queue is true the elevator is initialised up front and
 * blk_register_queue() is called after register_disk(); when false both
 * steps are skipped.
 *
 * Nothing is returned.  The only failure acted upon is blk_alloc_devt():
 * it triggers a warning and an early return.
 *
 * FIXME: error handling
 */
static void __device_add_disk(struct device *parent, struct gendisk *disk,
                              const struct attribute_group **groups,
                              bool register_queue)
{
        dev_t devt;
        int retval;

        /*
         * The disk queue should now be all set with enough information about
         * the device for the elevator code to pick an adequate default
         * elevator if one is needed, that is, for devices requesting queue
         * registration.
         */
        if (register_queue)
                elevator_init_mq(disk->queue);

        /* minors == 0 indicates to use ext devt from part0 and should
         * be accompanied with EXT_DEVT flag.  Make sure all
         * parameters make sense.
         */
        WARN_ON(disk->minors && !(disk->major || disk->first_minor));
        WARN_ON(!disk->minors &&
                !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));

        disk->flags |= GENHD_FL_UP;

        /*
         * NOTE(review): GENHD_FL_UP has already been set when this fails,
         * and it is not cleared on the early return below.
         */
        retval = blk_alloc_devt(&disk->part0, &devt);
        if (retval) {
                WARN_ON(1);
                return;
        }
        /* record the dev_t actually obtained for part0 in the disk */
        disk->major = MAJOR(devt);
        disk->first_minor = MINOR(devt);

        disk_alloc_events(disk);

        if (disk->flags & GENHD_FL_HIDDEN) {
                /*
                 * Don't let hidden disks show up in /proc/partitions,
                 * and don't bother scanning for partitions either.
                 */
                disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
                disk->flags |= GENHD_FL_NO_PART_SCAN;
        } else {
                struct backing_dev_info *bdi = disk->queue->backing_dev_info;
                struct device *dev = disk_to_dev(disk);
                int ret;

                /* Register BDI before referencing it from bdev */
                dev->devt = devt;
                ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
                WARN_ON(ret);
                bdi_set_owner(bdi, dev);
                /* make the disk's minor range resolvable via bdev_map */
                blk_register_region(disk_devt(disk), disk->minors, NULL,
                                    exact_match, exact_lock, disk);
        }
        register_disk(parent, disk, groups);
        if (register_queue)
                blk_register_queue(disk);

        /*
         * Take an extra ref on queue which will be put on disk_release()
         * so that it sticks around as long as @disk is there.
         */
        WARN_ON_ONCE(!blk_get_queue(disk->queue));

        disk_add_events(disk);
        blk_integrity_add(disk);
}

void device_add_disk(struct device *parent, struct gendisk *disk,
                     const struct attribute_group **groups)

{
        __device_add_disk(parent, disk, groups, true);
}
EXPORT_SYMBOL(device_add_disk);

void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
        __device_add_disk(parent, disk, NULL, false);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);

static void invalidate_partition(struct gendisk *disk, int partno)
{
        struct block_device *bdev;

        bdev = bdget_disk(disk, partno);
        if (!bdev)
                return;

        fsync_bdev(bdev);
        __invalidate_device(bdev, true);

        /*
         * Unhash the bdev inode for this device so that it gets evicted as soon
         * as last inode reference is dropped.
         */
        remove_inode_hash(bdev->bd_inode);
        bdput(bdev);
}

/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
 */
void del_gendisk(struct gendisk *disk)
{
        struct disk_part_iter piter;
        struct hd_struct *part;

        might_sleep();

        blk_integrity_del(disk);
        disk_del_events(disk);

        /*
         * Block lookups of the disk until all bdevs are unhashed and the
         * disk is marked as dead (GENHD_FL_UP cleared).
         */
        down_write(&disk->lookup_sem);
        /*
         * invalidate stuff: walk the partitions in reverse, empty ones
         * included, invalidating and then deleting each of them
         */
        disk_part_iter_init(&piter, disk,
                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
        while ((part = disk_part_iter_next(&piter))) {
                invalidate_partition(disk, part->partno);
                delete_partition(part);
        }
        disk_part_iter_exit(&piter);

        /* finally the whole-disk device (partition number 0) itself */
        invalidate_partition(disk, 0);
        set_capacity(disk, 0);
        disk->flags &= ~GENHD_FL_UP;
        up_write(&disk->lookup_sem);

        /* the bdi pieces below are skipped for GENHD_FL_HIDDEN disks */
        if (!(disk->flags & GENHD_FL_HIDDEN))
                sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
        if (disk->queue) {
                /*
                 * Unregister bdi before releasing device numbers (as they can
                 * get reused and we'd get clashes in sysfs).
                 */
                if (!(disk->flags & GENHD_FL_HIDDEN))
                        bdi_unregister(disk->queue->backing_dev_info);
                blk_unregister_queue(disk);
        } else {
                WARN_ON(1);
        }

        /* undo the blk_register_region() done when the disk was added */
        if (!(disk->flags & GENHD_FL_HIDDEN))
                blk_unregister_region(disk_devt(disk), disk->minors);
        /*
         * Remove gendisk pointer from idr so that it cannot be looked up
         * while RCU period before freeing gendisk is running to prevent
         * use-after-free issues. Note that the device number stays
         * "in-use" until we really free the gendisk.
         */
        blk_invalidate_devt(disk_devt(disk));

        kobject_put(disk->part0.holder_dir);
        kobject_put(disk->slave_dir);

        /* reset the whole-disk statistics */
        part_stat_set_all(&disk->part0, 0);
        disk->part0.stamp = 0;
        if (!sysfs_deprecated)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
        device_del(disk_to_dev(disk));
}
EXPORT_SYMBOL(del_gendisk);

/*
 * sysfs access to bad-blocks list.
 *
 * Show handler: a disk without a bad-blocks list prints just a newline;
 * otherwise the output comes from badblocks_show().
 */
static ssize_t disk_badblocks_show(struct device *dev,
                                        struct device_attribute *attr,
                                        char *page)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (disk->bb)
                return badblocks_show(disk->bb, page, 0);

        return sprintf(page, "\n");
}

/*
 * Store handler for the bad-blocks list: passes @page/@len on to
 * badblocks_store(), or fails with -ENXIO when the disk has no list.
 */
static ssize_t disk_badblocks_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *page, size_t len)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (disk->bb)
                return badblocks_store(disk->bb, page, len, 0);

        return -ENXIO;
}

/**
 * get_gendisk - get partitioning information for a given device
 * @devt: device to get partitioning information for
 * @partno: returned partition index
 *
 * Resolve @devt to its gendisk.  A @devt on BLOCK_EXT_MAJOR is looked up
 * in ext_devt_idr; every other major goes through bdev_map.  A disk that
 * is hidden, or no longer marked GENHD_FL_UP, is released again with
 * put_disk_and_module() and not returned.
 *
 * Returns the gendisk, or NULL when none is found or it is not usable.
 *
 * Context: can sleep
 */
struct gendisk *get_gendisk(dev_t devt, int *partno)
{
        struct gendisk *disk = NULL;
        bool usable;

        might_sleep();

        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
                struct hd_struct *part;

                spin_lock_bh(&ext_devt_lock);
                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
                if (part && get_disk_and_module(part_to_disk(part))) {
                        *partno = part->partno;
                        disk = part_to_disk(part);
                }
                spin_unlock_bh(&ext_devt_lock);
        } else {
                struct kobject *kobj = kobj_lookup(bdev_map, devt, partno);

                if (kobj)
                        disk = dev_to_disk(kobj_to_dev(kobj));
        }

        if (!disk)
                return NULL;

        /*
         * Sample the flags under lookup_sem, which del_gendisk() holds for
         * writing while it tears the disk down, so a disk that is being
         * destroyed is not handed out.
         */
        down_read(&disk->lookup_sem);
        usable = !(disk->flags & GENHD_FL_HIDDEN) &&
                 (disk->flags & GENHD_FL_UP);
        up_read(&disk->lookup_sem);

        if (unlikely(!usable)) {
                put_disk_and_module(disk);
                return NULL;
        }

        return disk;
}

/**
 * bdget_disk - do bdget() by gendisk and partition number
 * @disk: gendisk of interest
 * @partno: partition number
 *
 * Look up partition @partno of @disk with disk_get_part(), obtain its
 * block device through bdget_part(), and drop the partition again.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * Resulting block_device on success, NULL on failure.
 */
struct block_device *bdget_disk(struct gendisk *disk, int partno)
{
        struct hd_struct *part = disk_get_part(disk, partno);
        struct block_device *bdev = part ? bdget_part(part) : NULL;

        disk_put_part(part);

        return bdev;
}
EXPORT_SYMBOL(bdget_disk);

/*
 * Dump every disk and partition to the kernel log - intended for places
 * where the root filesystem can't be mounted, to give the victim some idea
 * of what went wrong.
 *
 * Each line carries the device number in hex, the size (sector count
 * shifted right by one), the name and the partition uuid when there is
 * one; the whole-disk line additionally names the parent's driver.
 */
void __init printk_all_partitions(void)
{
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct gendisk *disk = dev_to_disk(dev);
                struct disk_part_iter piter;
                struct hd_struct *part;
                char name_buf[BDEVNAME_SIZE];
                char devt_buf[BDEVT_SIZE];

                /* Skip empty devices ... */
                if (!get_capacity(disk))
                        continue;
                /* ... and those whose partition info is suppressed. */
                if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
                        continue;

                /*
                 * Note, unlike /proc/partitions, the numbers are shown
                 * in hex - the same format as the root= option takes.
                 */
                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
                while ((part = disk_part_iter_next(&piter))) {
                        bool whole = part == &disk->part0;

                        printk("%s%s %10llu %s %s", whole ? "" : "  ",
                               bdevt_str(part_devt(part), devt_buf),
                               (unsigned long long)part_nr_sects_read(part) >> 1,
                               disk_name(disk, part->partno, name_buf),
                               part->info ? part->info->uuid : "");

                        if (!whole)
                                printk("\n");
                        else if (dev->parent && dev->parent->driver)
                                printk(" driver: %s\n",
                                       dev->parent->driver->name);
                        else
                                printk(" (driver?)\n");
                }
                disk_part_iter_exit(&piter);
        }
        class_dev_iter_exit(&iter);
}

#ifdef CONFIG_PROC_FS
/*
 * iterator
 *
 * seq_file start: allocate a class_dev_iter over the disks of block_class,
 * park it in @seqf->private, and advance it to the disk at position *@pos.
 * Returns that gendisk, NULL when there are not that many disks, or
 * ERR_PTR(-ENOMEM).  The iterator is released by disk_seqf_stop().
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
        struct class_dev_iter *iter;
        struct device *dev;
        loff_t remaining = *pos;

        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter)
                return ERR_PTR(-ENOMEM);

        seqf->private = iter;
        class_dev_iter_init(iter, &block_class, NULL, &disk_type);

        /* fetch one device, then skip as many more as *pos asks for */
        for (;;) {
                dev = class_dev_iter_next(iter);
                if (!dev)
                        return NULL;
                if (!remaining--)
                        break;
        }

        return dev_to_disk(dev);
}

/*
 * seq_file next: bump *@pos and return the next gendisk from the iterator
 * kept in @seqf->private, or NULL when the iterator is exhausted.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
        struct device *dev;

        ++*pos;
        dev = class_dev_iter_next(seqf->private);

        return dev ? dev_to_disk(dev) : NULL;
}

/*
 * seq_file stop: release the iterator stored in @seqf->private, if any,
 * and clear the pointer.
 */
static void disk_seqf_stop(struct seq_file *seqf, void *v)
{
        struct class_dev_iter *iter = seqf->private;

        /* stop is called even after start failed :-( */
        if (!iter)
                return;

        class_dev_iter_exit(iter);
        kfree(iter);
        seqf->private = NULL;
}

/*
 * seq_file start for the partition listing: same as disk_seqf_start(),
 * plus the column header when a disk was found at position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
        void *first = disk_seqf_start(seqf, pos);

        if (IS_ERR_OR_NULL(first) || *pos)
                return first;

        seq_puts(seqf, "major minor  #blocks  name\n\n");
        return first;
}

/*
 * seq_file show: print one line (major, minor, size as sector count
 * shifted right by one, name) for the disk @v and for each partition the
 * iterator yields.  Always returns 0.
 */
static int show_partition(struct seq_file *seqf, void *v)
{
        struct gendisk *disk = v;
        struct disk_part_iter piter;
        struct hd_struct *part;
        char name[BDEVNAME_SIZE];

        /* Don't show empty devices ... */
        if (!get_capacity(disk))
                return 0;
        /* ... nor non-partitionable removeable devices ... */
        if (!disk_max_parts(disk) && (disk->flags & GENHD_FL_REMOVABLE))
                return 0;
        /* ... nor disks that asked to be left out. */
        if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
                return 0;

        /* show the full disk and all non-0 size partitions of it */
        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
        while ((part = disk_part_iter_next(&piter))) {
                seq_printf(seqf, "%4d  %7d %10llu %s\n",
                           MAJOR(part_devt(part)), MINOR(part_devt(part)),
                           (unsigned long long)part_nr_sects_read(part) >> 1,
                           disk_name(disk, part->partno, name));
        }
        disk_part_iter_exit(&piter);

        return 0;
}

/*
 * seq_file operations for the partition listing: the generic disk
 * iterator for next/stop, with a start wrapper that adds the header and
 * show_partition() printing one disk per step.
 */
static const struct seq_operations partitions_op = {
        .start        = show_partition_start,
        .next        = disk_seqf_next,
        .stop        = disk_seqf_stop,
        .show        = show_partition
};
#endif


/*
 * Default probe handed to kobj_map_init(): ask for the module aliased
 * "block-major-<major>-<minor>" and, when that request returns a positive
 * value, for "block-major-<major>" too.  Always returns NULL.
 */
static struct kobject *base_probe(dev_t devt, int *partno, void *data)
{
        int rc = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));

        if (rc > 0) {
                /* Make old-style 2.4 aliases work */
                request_module("block-major-%d", MAJOR(devt));
        }

        return NULL;
}

/*
 * Subsystem init: register block_class, create bdev_map, run
 * blk_dev_init(), register the "blkext" major, and create the top-level
 * "block" kobject unless sysfs_deprecated is set.
 *
 * Only a class_register() failure is propagated; the results of the
 * later steps are not checked here.
 */
static int __init genhd_device_init(void)
{
        int err;

        block_class.dev_kobj = sysfs_dev_block_kobj;

        err = class_register(&block_class);
        if (unlikely(err))
                return err;

        bdev_map = kobj_map_init(base_probe, &block_class_lock);
        blk_dev_init();

        register_blkdev(BLOCK_EXT_MAJOR, "blkext");

        if (!sysfs_deprecated) {
                /* create top-level block dir */
                block_depr = kobject_create_and_add("block", NULL);
        }

        return 0;
}

subsys_initcall(genhd_device_init);

static ssize_t disk_range_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", disk->minors);
}

/* Show handler for "ext_range": disk_max_parts() of the disk in decimal. */
static ssize_t disk_ext_range_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        return sprintf(buf, "%d\n", disk_max_parts(dev_to_disk(dev)));
}

/* Show handler for "removable": 1 if GENHD_FL_REMOVABLE is set, else 0. */
static ssize_t disk_removable_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct gendisk *gd = dev_to_disk(dev);
        int removable = !!(gd->flags & GENHD_FL_REMOVABLE);

        return sprintf(buf, "%d\n", removable);
}

/* Show handler for "hidden": 1 if GENHD_FL_HIDDEN is set, else 0. */
static ssize_t disk_hidden_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct gendisk *gd = dev_to_disk(dev);
        int hidden = !!(gd->flags & GENHD_FL_HIDDEN);

        return sprintf(buf, "%d\n", hidden);
}

/* Show handler for "ro": 1 if get_disk_ro() is non-zero, else 0. */
static ssize_t disk_ro_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        int read_only = !!get_disk_ro(dev_to_disk(dev));

        return sprintf(buf, "%d\n", read_only);
}

/* Show handler for "size": the partition's sector count in decimal. */
ssize_t part_size_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct hd_struct *part = dev_to_part(dev);
        unsigned long long nr_sects = part_nr_sects_read(part);

        return sprintf(buf, "%llu\n", nr_sects);
}

/*
 * Show handler for "stat": one line of I/O statistics for the partition.
 *
 * Fields, in order: read ios/merges/sectors/ms, write
 * ios/merges/sectors/ms, in-flight count, io_ticks in ms, summed time of
 * all four groups in ms, discard ios/merges/sectors/ms, flush ios/ms.
 * Times are kept in nanoseconds and divided down to milliseconds here.
 */
ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct hd_struct *part = dev_to_part(dev);
        struct request_queue *queue = part_to_disk(part)->queue;
        struct disk_stats stat;
        unsigned int inflight;
        unsigned int read_ms, write_ms, discard_ms, flush_ms, total_ms;

        part_stat_read_all(part, &stat);

        if (!queue_is_mq(queue))
                inflight = part_in_flight(part);
        else
                inflight = blk_mq_in_flight(queue, part);

        read_ms = div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC);
        write_ms = div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC);
        discard_ms = div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC);
        flush_ms = div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC);
        total_ms = div_u64(stat.nsecs[STAT_READ] +
                           stat.nsecs[STAT_WRITE] +
                           stat.nsecs[STAT_DISCARD] +
                           stat.nsecs[STAT_FLUSH],
                           NSEC_PER_MSEC);

        return sprintf(buf,
                "%8lu %8lu %8llu %8u "
                "%8lu %8lu %8llu %8u "
                "%8u %8u %8u "
                "%8lu %8lu %8llu %8u "
                "%8lu %8u"
                "\n",
                stat.ios[STAT_READ],
                stat.merges[STAT_READ],
                (unsigned long long)stat.sectors[STAT_READ],
                read_ms,
                stat.ios[STAT_WRITE],
                stat.merges[STAT_WRITE],
                (unsigned long long)stat.sectors[STAT_WRITE],
                write_ms,
                inflight,
                jiffies_to_msecs(stat.io_ticks),
                total_ms,
                stat.ios[STAT_DISCARD],
                stat.merges[STAT_DISCARD],
                (unsigned long long)stat.sectors[STAT_DISCARD],
                discard_ms,
                stat.ios[STAT_FLUSH],
                flush_ms);
}

/*
 * Show handler for "inflight": the two in-flight counters filled in by
 * blk_mq_in_flight_rw() or part_in_flight_rw(), depending on whether the
 * queue is a blk-mq queue.
 */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct hd_struct *part = dev_to_part(dev);
        struct request_queue *queue = part_to_disk(part)->queue;
        unsigned int counts[2];

        if (!queue_is_mq(queue))
                part_in_flight_rw(part, counts);
        else
                blk_mq_in_flight_rw(queue, part, counts);

        return sprintf(buf, "%8u %8u\n", counts[0], counts[1]);
}

static ssize_t disk_capability_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%x\n", disk->flags);
}

static ssize_t disk_alignment_offset_show(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}

static ssize_t disk_discard_alignment_show(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
}

/*
 * sysfs attributes of a disk device, collected in disk_attrs[] below.
 * All of them are read-only (0444) with no store handler, except
 * "badblocks", which is 0644 and has disk_badblocks_store().
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);

#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct hd_struct *p = dev_to_part(dev);

        return sprintf(buf, "%d\n", p->make_it_fail);
}

/*
 * sysfs store handler for the partition's make_it_fail flag.
 *
 * Parses a decimal integer from @buf; zero clears the flag and any other
 * value sets it to 1.  Empty or unparsable input leaves the flag untouched.
 * Always returns @count, i.e. the write is reported as fully consumed even
 * when nothing was parsed.
 */
ssize_t part_fail_store(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
{
        struct hd_struct *part = dev_to_part(dev);
        int val;

        if (count == 0)
                return count;
        if (sscanf(buf, "%d", &val) <= 0)
                return count;

        part->make_it_fail = (val != 0);
        return count;
}

/*
 * "make-it-fail" attribute (0644), served by part_fail_show() and
 * part_fail_store().  Spelled out with __ATTR() because the attribute name
 * contains dashes.
 */
static struct device_attribute dev_attr_fail =
        __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

#ifdef CONFIG_FAIL_IO_TIMEOUT
/*
 * "io-timeout-fail" attribute (0644), served by part_timeout_show() and
 * part_timeout_store(), which are defined outside this part of the file.
 */
static struct device_attribute dev_attr_fail_timeout =
        __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
#endif

/*
 * NULL-terminated list of the disk sysfs attributes, referenced by
 * disk_attr_group.  The two fault-injection entries are compiled in only
 * when the matching CONFIG_FAIL_* option is enabled.
 */
static struct attribute *disk_attrs[] = {
        &dev_attr_range.attr,
        &dev_attr_ext_range.attr,
        &dev_attr_removable.attr,
        &dev_attr_hidden.attr,
        &dev_attr_ro.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
        &dev_attr_discard_alignment.attr,
        &dev_attr_capability.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
        &dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
        &dev_attr_fail.attr,
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
        &dev_attr_fail_timeout.attr,
#endif
        NULL
};

static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = container_of(kobj, typeof(*dev), kobj);
        struct gendisk *disk = dev_to_disk(dev);

        if (a == &dev_attr_badblocks.attr && !disk->bb)
                return 0;
        return a->mode;
}

/* The disk attribute group: disk_attrs[] filtered through disk_visible(). */
static struct attribute_group disk_attr_group = {
        .attrs = disk_attrs,
        .is_visible = disk_visible,
};

/* NULL-terminated group list installed as disk_type.groups. */
static const struct attribute_group *disk_attr_groups[] = {
        &disk_attr_group,
        NULL
};

/**
 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
 * @disk: disk to replace part_tbl for
 * @new_ptbl: new part_tbl to install (may be NULL)
 *
 * Publish @new_ptbl as disk->part_tbl.  If a table was installed before,
 * its last_lookup pointer is cleared and the table itself is handed to
 * kfree_rcu().
 *
 * LOCKING:
 * Matching bd_mutex locked or the caller is the only user of @disk.
 */
static void disk_replace_part_tbl(struct gendisk *disk,
                                  struct disk_part_tbl *new_ptbl)
{
        struct disk_part_tbl *prev;

        prev = rcu_dereference_protected(disk->part_tbl, 1);
        rcu_assign_pointer(disk->part_tbl, new_ptbl);

        if (!prev)
                return;

        rcu_assign_pointer(prev->last_lookup, NULL);
        kfree_rcu(prev, rcu_head);
}

/**
 * disk_expand_part_tbl - expand disk->part_tbl
 * @disk: disk to expand part_tbl for
 * @partno: expand such that this partno can fit in
 *
 * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
 * uses RCU to allow unlocked dereferencing for stats and other stuff.
 *
 * LOCKING:
 * Matching bd_mutex locked or the caller is the only user of @disk.
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int disk_expand_part_tbl(struct gendisk *disk, int partno)
{
        struct disk_part_tbl *old_ptbl =
                rcu_dereference_protected(disk->part_tbl, 1);
        struct disk_part_tbl *new_ptbl;
        int len = old_ptbl ? old_ptbl->len : 0;
        int i, target;

        /*
         * check for int overflow, since we can get here from blkpg_ioctl()
         * with a user passed 'partno'.
         */
        target = partno + 1;
        if (target < 0)
                return -EINVAL;

        /* disk_max_parts() is zero during initialization, ignore if so */
        if (disk_max_parts(disk) && target > disk_max_parts(disk))
                return -EINVAL;

        if (target <= len)
                return 0;

        new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
                                disk->node_id);
        if (!new_ptbl)
                return -ENOMEM;

        new_ptbl->len = target;

        for (i = 0; i < len; i++)
                rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);

        disk_replace_part_tbl(disk, new_ptbl);
        return 0;
}

/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * The struct gendisk refcount is incremented with get_gendisk() or
 * get_disk_and_module(), and its refcount is decremented with
 * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this
 * function is called.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
 *
 * Context: can sleep
 */
static void disk_release(struct device *dev)
{
        struct gendisk *disk = dev_to_disk(dev);

        might_sleep();

        blk_free_devt(dev->devt);
        disk_release_events(disk);
        kfree(disk->random);
        /* uninstall the partition table; the old table is freed via RCU */
        disk_replace_part_tbl(disk, NULL);
        hd_free_part(&disk->part0);
        /* drop the queue reference only if a queue is attached */
        if (disk->queue)
                blk_put_queue(disk->queue);
        /* the gendisk itself goes last; nothing above may touch it after this */
        kfree(disk);
}
/*
 * The "block" device class.  __alloc_disk_node() assigns it to every
 * gendisk device, and blk_lookup_devt() iterates over its members.
 */
struct class block_class = {
        .name                = "block",
};

/*
 * devnode callback of disk_type.
 *
 * Delegates to the disk's fops->devnode(disk, mode) when the driver
 * provides one and returns NULL otherwise.  @uid and @gid are not used.
 */
static char *block_devnode(struct device *dev, umode_t *mode,
                           kuid_t *uid, kgid_t *gid)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (!disk->fops->devnode)
                return NULL;

        return disk->fops->devnode(disk, mode);
}

/*
 * Device type of whole-disk devices: supplies the sysfs attribute groups,
 * the release callback that frees the gendisk, and the devnode callback.
 */
const struct device_type disk_type = {
        .name                = "disk",
        .groups                = disk_attr_groups,
        .release        = disk_release,
        .devnode        = block_devnode,
};

#ifdef CONFIG_PROC_FS
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
        struct gendisk *gp = v;
        struct disk_part_iter piter;
        struct hd_struct *hd;
        char buf[BDEVNAME_SIZE];
        unsigned int inflight;
        struct disk_stats stat;

        /*
        if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
                seq_puts(seqf,        "major minor name"
                                "     rio rmerge rsect ruse wio wmerge "
                                "wsect wuse running use aveq"
                                "\n\n");
        */

        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
        while ((hd = disk_part_iter_next(&piter))) {
                part_stat_read_all(hd, &stat);
                if (queue_is_mq(gp->queue))
                        inflight = blk_mq_in_flight(gp->queue, hd);
                else
                        inflight = part_in_flight(hd);

                seq_printf(seqf, "%4d %7d %s "
                           "%lu %lu %lu %u "
                           "%lu %lu %lu %u "
                           "%u %u %u "
                           "%lu %lu %lu %u "
                           "%lu %u"
                           "\n",
                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
                           disk_name(gp, hd->partno, buf),
                           stat.ios[STAT_READ],
                           stat.merges[STAT_READ],
                           stat.sectors[STAT_READ],
                           (unsigned int)div_u64(stat.nsecs[STAT_READ],
                                                        NSEC_PER_MSEC),
                           stat.ios[STAT_WRITE],
                           stat.merges[STAT_WRITE],
                           stat.sectors[STAT_WRITE],
                           (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
                                                        NSEC_PER_MSEC),
                           inflight,
                           jiffies_to_msecs(stat.io_ticks),
                           (unsigned int)div_u64(stat.nsecs[STAT_READ] +
                                                 stat.nsecs[STAT_WRITE] +
                                                 stat.nsecs[STAT_DISCARD] +
                                                 stat.nsecs[STAT_FLUSH],
                                                        NSEC_PER_MSEC),
                           stat.ios[STAT_DISCARD],
                           stat.merges[STAT_DISCARD],
                           stat.sectors[STAT_DISCARD],
                           (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
                                                 NSEC_PER_MSEC),
                           stat.ios[STAT_FLUSH],
                           (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
                                                 NSEC_PER_MSEC)
                        );
        }
        disk_part_iter_exit(&piter);

        return 0;
}

/*
 * seq_file operations for the "diskstats" proc entry: iteration is done by
 * the disk_seqf_* helpers, and each item is rendered by diskstats_show().
 */
static const struct seq_operations diskstats_op = {
        .start        = disk_seqf_start,
        .next        = disk_seqf_next,
        .stop        = disk_seqf_stop,
        .show        = diskstats_show
};

/*
 * Register the "diskstats" and "partitions" seq_file entries under the proc
 * root (NULL parent).  The return values of proc_create_seq() are not
 * checked; the function always returns 0.
 */
static int __init proc_genhd_init(void)
{
        proc_create_seq("diskstats", 0, NULL, &diskstats_op);
        proc_create_seq("partitions", 0, NULL, &partitions_op);
        return 0;
}
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */

/*
 * blk_lookup_devt - translate a disk name and partition number to a dev_t
 * @name: device name to match against dev_name() of each disk
 * @partno: partition number on that disk
 *
 * Walks every disk_type device in block_class.  For a disk whose name
 * matches, a @partno below disk->minors is answered arithmetically from the
 * disk's own dev_t; otherwise the partition is looked up and its dev_t is
 * used if it exists.
 *
 * Returns the resulting dev_t, or MKDEV(0, 0) when nothing matched.
 */
dev_t blk_lookup_devt(const char *name, int partno)
{
        dev_t result = MKDEV(0, 0);
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct gendisk *disk = dev_to_disk(dev);
                struct hd_struct *part;

                if (strcmp(dev_name(dev), name) != 0)
                        continue;

                if (partno < disk->minors) {
                        /* We need to return the right devno, even
                         * if the partition doesn't exist yet.
                         */
                        result = MKDEV(MAJOR(dev->devt),
                                       MINOR(dev->devt) + partno);
                        break;
                }

                part = disk_get_part(disk, partno);
                if (!part) {
                        /*
                         * NOTE(review): called with a NULL part here, as the
                         * code always has - presumably a no-op; confirm
                         * against disk_put_part().
                         */
                        disk_put_part(part);
                        continue;
                }

                result = part_devt(part);
                disk_put_part(part);
                break;
        }
        class_dev_iter_exit(&iter);

        return result;
}

/**
 * __alloc_disk_node - allocate and initialise a gendisk
 * @minors: value to store in disk->minors; anything above DISK_MAX_PARTS
 *          is clamped to DISK_MAX_PARTS and an error is logged
 * @node_id: node passed to the node-aware allocations and stored in
 *           disk->node_id
 *
 * Allocates a zeroed gendisk together with the per-cpu statistics of part0
 * and an initial partition table that has part0 in slot 0, then initialises
 * the embedded device with block_class and disk_type.
 *
 * RETURNS:
 * The new gendisk, or NULL if an allocation or hd_ref_init() fails.  On
 * failure everything allocated so far is released again.
 */
struct gendisk *__alloc_disk_node(int minors, int node_id)
{
        struct gendisk *disk;
        struct disk_part_tbl *ptbl;

        if (minors > DISK_MAX_PARTS) {
                printk(KERN_ERR
                        "block: can't allocate more than %d partitions\n",
                        DISK_MAX_PARTS);
                minors = DISK_MAX_PARTS;
        }

        disk = kzalloc_node(sizeof(*disk), GFP_KERNEL, node_id);
        if (!disk)
                return NULL;

        disk->part0.dkstats = alloc_percpu(struct disk_stats);
        if (!disk->part0.dkstats)
                goto out_free_disk;

        init_rwsem(&disk->lookup_sem);
        disk->node_id = node_id;
        if (disk_expand_part_tbl(disk, 0)) {
                free_percpu(disk->part0.dkstats);
                goto out_free_disk;
        }

        ptbl = rcu_dereference_protected(disk->part_tbl, 1);
        rcu_assign_pointer(ptbl->part[0], &disk->part0);

        /*
         * set_capacity() and get_capacity() currently don't use
         * seqcounter to read/update the part0->nr_sects. Still init
         * the counter as we can read the sectors in IO submission
         * path using sequence counters.
         *
         * TODO: Ideally set_capacity() and get_capacity() should be
         * converted to make use of bd_mutex and sequence counters.
         */
        hd_sects_seq_init(&disk->part0);
        if (hd_ref_init(&disk->part0))
                goto out_free_part0;

        disk->minors = minors;
        rand_initialize_disk(disk);
        disk_to_dev(disk)->class = &block_class;
        disk_to_dev(disk)->type = &disk_type;
        device_initialize(disk_to_dev(disk));
        return disk;

out_free_part0:
        /*
         * The partition table installed by disk_expand_part_tbl() above is
         * a separate allocation; release it the same way disk_release()
         * does, otherwise it is leaked when the gendisk is freed.
         */
        disk_replace_part_tbl(disk, NULL);
        hd_free_part(&disk->part0);
out_free_disk:
        kfree(disk);
        return NULL;
}
EXPORT_SYMBOL(__alloc_disk_node);

/**
 * get_disk_and_module - increments the gendisk and gendisk fops module refcount
 * @disk: the struct gendisk to increment the refcount for
 *
 * Takes a reference on the module owning @disk's fops (if there is an
 * owner) and then on the gendisk's kobject.  If the kobject reference
 * cannot be taken because its count already dropped to zero, the module
 * reference is given back.
 *
 * Return: the gendisk's kobject, or NULL if @disk has no fops or either
 * reference could not be obtained.
 *
 * Context: Any context.
 */
struct kobject *get_disk_and_module(struct gendisk *disk)
{
        struct module *owner;
        struct kobject *kobj;

        if (!disk->fops)
                return NULL;

        owner = disk->fops->owner;
        if (owner && !try_module_get(owner))
                return NULL;

        kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
        if (!kobj)
                module_put(owner);

        return kobj;
}
EXPORT_SYMBOL(get_disk_and_module);

/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for (may be NULL)
 *
 * Drops one reference on the gendisk's kobject.  When the count reaches 0
 * we'll have disk_release() called.  A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
        if (!disk)
                return;

        kobject_put(&disk_to_dev(disk)->kobj);
}
EXPORT_SYMBOL(put_disk);

/**
 * put_disk_and_module - decrements the module and gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for (may be NULL)
 *
 * This is a counterpart of get_disk_and_module() and thus also of
 * get_gendisk().  A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk_and_module(struct gendisk *disk)
{
        struct module *owner;

        if (!disk)
                return;

        /* fetch the owner first: put_disk() may drop the last reference */
        owner = disk->fops->owner;

        put_disk(disk);
        module_put(owner);
}
EXPORT_SYMBOL(put_disk_and_module);

/*
 * Send a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" when it is zero.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
        char event[] = "DISK_RO=1";
        char *envp[] = { event, NULL };

        /* the digit is the last character before the terminating NUL */
        if (ro == 0)
                event[sizeof(event) - 2] = '0';

        kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

void set_device_ro(struct block_device *bdev, int flag)
{
        bdev->bd_part->policy = flag;
}

EXPORT_SYMBOL(set_device_ro);

void set_disk_ro(struct gendisk *disk, int flag)
{
        struct disk_part_iter piter;
        struct hd_struct *part;

        if (disk->part0.policy != flag) {
                set_disk_ro_uevent(disk, flag);
                disk->part0.policy = flag;
        }

        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
        while ((part = disk_part_iter_next(&piter)))
                part->policy = flag;
        disk_part_iter_exit(&piter);
}

EXPORT_SYMBOL(set_disk_ro);

/*
 * Return the policy of the partition behind @bdev, or 0 when @bdev is NULL.
 */
int bdev_read_only(struct block_device *bdev)
{
        return bdev ? bdev->bd_part->policy : 0;
}

EXPORT_SYMBOL(bdev_read_only);

/*
 * Disk events - monitor disk events like media change and eject request.
 *
 * One instance is allocated per disk by disk_alloc_events() and hangs off
 * disk->ev.
 */
struct disk_events {
        struct list_head        node;                /* all disk_event's */
        struct gendisk                *disk;                /* the associated disk */
        spinlock_t                lock;                /* taken around updates of block, pending, clearing */

        struct mutex                block_mutex;        /* protects blocking */
        int                        block;                /* event blocking depth */
        unsigned int                pending;        /* events already sent out */
        unsigned int                clearing;        /* events being cleared */

        long                        poll_msecs;        /* interval, -1 for default */
        struct delayed_work        dwork;                /* runs disk_events_workfn() */
};

/*
 * Both tables are indexed by the bit position (ilog2) of a DISK_EVENT_*
 * flag.  disk_events_strs holds the names printed by __disk_events_show();
 * disk_uevents holds the environment strings sent with KOBJ_CHANGE uevents
 * by disk_check_events().
 */
static const char *disk_events_strs[] = {
        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "media_change",
        [ilog2(DISK_EVENT_EJECT_REQUEST)]        = "eject_request",
};

static char *disk_uevents[] = {
        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "DISK_MEDIA_CHANGE=1",
        [ilog2(DISK_EVENT_EJECT_REQUEST)]        = "DISK_EJECT_REQUEST=1",
};

/*
 * list of all disk_events; disk_events_mutex is held while entries are
 * added, removed, or the list is walked
 */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/*
 * disable in-kernel polling by default (the variable starts out as 0 and is
 * exposed as the events_dfl_poll_msecs parameter)
 */
static unsigned long disk_events_dfl_poll_msecs;

/*
 * Work out the polling interval for @disk, in jiffies.
 *
 * If device-specific poll interval is set (>= 0), always use it.  If the
 * default is being used, poll with disk_events_dfl_poll_msecs only when
 * the disk has DISK_EVENT_FLAG_POLL set.  Otherwise the interval is 0.
 *
 * The caller must make sure disk->ev is non-NULL.
 */
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
        struct disk_events *ev = disk->ev;
        long msecs;

        if (ev->poll_msecs >= 0)
                msecs = ev->poll_msecs;
        else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
                msecs = disk_events_dfl_poll_msecs;
        else
                msecs = 0;

        return msecs_to_jiffies(msecs);
}

/**
 * disk_block_events - block and flush disk event checking
 * @disk: disk to block events for
 *
 * On return from this function, it is guaranteed that event checking
 * isn't in progress and won't happen until unblocked by
 * disk_unblock_events().  Events blocking is counted and the actual
 * unblocking happens after the matching number of unblocks are done.
 *
 * Note that this intentionally does not block event checking from
 * disk_clear_events().
 *
 * Does nothing when @disk has no disk_events.
 *
 * CONTEXT:
 * Might sleep.
 */
void disk_block_events(struct gendisk *disk)
{
        struct disk_events *ev = disk->ev;
        unsigned long flags;
        bool first_blocker;

        if (!ev)
                return;

        /*
         * Outer mutex ensures that the first blocker completes canceling
         * the event work before further blockers are allowed to finish.
         */
        mutex_lock(&ev->block_mutex);

        /* bump the depth; only the 0 -> 1 transition cancels the work */
        spin_lock_irqsave(&ev->lock, flags);
        first_blocker = (ev->block == 0);
        ev->block++;
        spin_unlock_irqrestore(&ev->lock, flags);

        if (first_blocker)
                cancel_delayed_work_sync(&disk->ev->dwork);

        mutex_unlock(&ev->block_mutex);
}

/*
 * Drop one level of event blocking on @disk (disk->ev must be non-NULL).
 *
 * Warns once and does nothing if the block depth is not positive.  When
 * the depth reaches zero the event work is queued: immediately if
 * @check_now is set, otherwise after the polling interval provided that
 * interval is non-zero.
 */
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
        struct disk_events *ev = disk->ev;
        unsigned long flags;

        spin_lock_irqsave(&ev->lock, flags);

        if (!WARN_ON_ONCE(ev->block <= 0) && --ev->block == 0) {
                unsigned long delay = disk_events_poll_jiffies(disk);

                if (check_now || delay)
                        queue_delayed_work(system_freezable_power_efficient_wq,
                                        &ev->dwork, check_now ? 0 : delay);
        }

        spin_unlock_irqrestore(&ev->lock, flags);
}

/**
 * disk_unblock_events - unblock disk event checking
 * @disk: disk to unblock events for
 *
 * Undo disk_block_events().  When the block count reaches zero, it
 * starts events polling if configured.  Does nothing when @disk has no
 * disk_events.
 *
 * CONTEXT:
 * Don't care.  Safe to call from irq context.
 */
void disk_unblock_events(struct gendisk *disk)
{
        if (!disk->ev)
                return;

        __disk_unblock_events(disk, false);
}

/**
 * disk_flush_events - schedule immediate event checking and flushing
 * @disk: disk to check and flush events for
 * @mask: events to flush
 *
 * Schedule immediate event checking on @disk if not blocked.  Events in
 * @mask are scheduled to be cleared from the driver.  Note that this
 * doesn't clear the events from @disk->ev.  Does nothing when @disk has
 * no disk_events.
 *
 * CONTEXT:
 * If @mask is non-zero must be called with bdev->bd_mutex held.
 */
void disk_flush_events(struct gendisk *disk, unsigned int mask)
{
        struct disk_events *ev = disk->ev;

        if (!ev)
                return;

        spin_lock_irq(&ev->lock);

        ev->clearing |= mask;
        /* only kick the work when nobody is blocking event checks */
        if (ev->block == 0)
                mod_delayed_work(system_freezable_power_efficient_wq,
                                &ev->dwork, 0);

        spin_unlock_irq(&ev->lock);
}

/**
 * disk_clear_events - synchronously check, clear and return pending events
 * @disk: disk to fetch and clear events from
 * @mask: mask of events to be fetched and cleared
 *
 * Disk events are synchronously checked and pending events in @mask
 * are cleared and returned.  This ignores the block count.
 *
 * Returns the subset of @mask that was pending, or 0 when @disk has no
 * disk_events.
 *
 * CONTEXT:
 * Might sleep.
 */
static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
        struct disk_events *ev = disk->ev;
        unsigned int pending;
        unsigned int clearing = mask;

        if (!ev)
                return 0;

        /* keep the polling work out of the way while we check by hand */
        disk_block_events(disk);

        /*
         * store the union of mask and ev->clearing on the stack so that the
         * race with disk_flush_events does not cause ambiguity (ev->clearing
         * can still be modified even if events are blocked).
         */
        spin_lock_irq(&ev->lock);
        clearing |= ev->clearing;
        ev->clearing = 0;
        spin_unlock_irq(&ev->lock);

        /* check against the on-stack copy; bits handled are cleared from it */
        disk_check_events(ev, &clearing);
        /*
         * if ev->clearing is not 0, the disk_flush_events got called in the
         * middle of this function, so we want to run the workfn without delay.
         */
        __disk_unblock_events(disk, ev->clearing ? true : false);

        /* then, fetch and clear pending events */
        spin_lock_irq(&ev->lock);
        pending = ev->pending & mask;
        ev->pending &= ~mask;
        spin_unlock_irq(&ev->lock);
        /* every bit of @mask should have been cleared from the local copy */
        WARN_ON_ONCE(clearing & mask);

        return pending;
}

/**
 * bdev_check_media_change - check if a removable media has been changed
 * @bdev: block device to check
 *
 * Check whether a removable media has been changed, and attempt to free all
 * dentries and inodes and invalidates all block device page cache entries in
 * that case.
 *
 * Returns %true if the block device changed, or %false if not.
 */
bool bdev_check_media_change(struct block_device *bdev)
{
        unsigned int events;

        events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
                                   DISK_EVENT_EJECT_REQUEST);
        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                return false;

        if (__invalidate_device(bdev, true))
                pr_warn("VFS: busy inodes on changed media %s\n",
                        bdev->bd_disk->disk_name);
        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
        return true;
}
EXPORT_SYMBOL(bdev_check_media_change);

/*
 * Delayed-work callback: checks events using ev->clearing itself as the
 * clearing mask.
 *
 * The actual checking lives in disk_check_events() so that
 * disk_clear_events() can pass in a different clearing_ptr.
 */
static void disk_events_workfn(struct work_struct *work)
{
        struct disk_events *ev = container_of(to_delayed_work(work),
                                              struct disk_events, dwork);

        disk_check_events(ev, &ev->clearing);
}

/*
 * Poll the driver for events and process the result.
 *
 * @ev: the disk_events to check
 * @clearing_ptr: in/out mask of events being cleared; the bits that were
 *                passed to the driver are removed from it under ev->lock
 *
 * Calls the disk's fops->check_events() with the current clearing mask,
 * folds the events that were not already pending into ev->pending,
 * re-arms the polling work when events are not blocked and a polling
 * interval is configured, and finally reports the new events to userland
 * through a KOBJ_CHANGE uevent.
 */
static void disk_check_events(struct disk_events *ev,
                              unsigned int *clearing_ptr)
{
        struct gendisk *disk = ev->disk;
        char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
        unsigned int clearing = *clearing_ptr;
        unsigned int fresh;
        unsigned long delay;
        int count = 0;
        int bit;

        /* check events */
        fresh = disk->fops->check_events(disk, clearing);

        /* accumulate pending events and schedule next poll if necessary */
        spin_lock_irq(&ev->lock);

        fresh &= ~ev->pending;
        ev->pending |= fresh;
        *clearing_ptr &= ~clearing;

        delay = disk_events_poll_jiffies(disk);
        if (delay && !ev->block)
                queue_delayed_work(system_freezable_power_efficient_wq,
                                &ev->dwork, delay);

        spin_unlock_irq(&ev->lock);

        /*
         * Tell userland about new events.  Only the events listed in
         * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
         * is set. Otherwise, events are processed internally but never
         * get reported to userland.
         */
        for (bit = 0; bit < ARRAY_SIZE(disk_uevents); bit++) {
                if (!(fresh & disk->events & (1 << bit)))
                        continue;
                if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
                        envp[count++] = disk_uevents[bit];
        }

        if (count)
                kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}

/*
 * A disk events enabled device has the following sysfs nodes under
 * its /sys/block/X/ directory.
 *
 * events                : list of all supported events
 * events_async                : list of events which can be detected w/o polling
 *                          (always empty, only for backwards compatibility)
 * events_poll_msecs        : polling interval, 0: disable, -1: system default
 */

/*
 * Format the names of the events set in @events into @buf, separated by
 * single spaces and followed by a newline.  Nothing at all is written when
 * no known event bit is set.  Returns the number of characters written.
 */
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
        const char *sep = "";
        ssize_t len = 0;
        int bit;

        for (bit = 0; bit < ARRAY_SIZE(disk_events_strs); bit++) {
                if (!(events & (1 << bit)))
                        continue;

                len += sprintf(buf + len, "%s%s",
                               sep, disk_events_strs[bit]);
                sep = " ";
        }

        if (len)
                len += sprintf(buf + len, "\n");

        return len;
}

/*
 * sysfs show handler for "events": lists disk->events by name, but only
 * for disks that have DISK_EVENT_FLAG_UEVENT set; otherwise the output is
 * empty.
 */
static ssize_t disk_events_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
                return __disk_events_show(disk->events, buf);

        return 0;
}

/*
 * sysfs show handler for "events_async": writes nothing and returns 0, so
 * the attribute always reads as empty.
 */
static ssize_t disk_events_async_show(struct device *dev,
                                      struct device_attribute *attr, char *buf)
{
        return 0;
}

static ssize_t disk_events_poll_msecs_show(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (!disk->ev)
                return sprintf(buf, "-1\n");

        return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}

/*
 * sysfs store handler for "events_poll_msecs".
 *
 * Accepts a decimal interval that is either non-negative or exactly -1.
 * The new value is installed while event checking is blocked, and the
 * unblock that follows requests an immediate check.
 *
 * Returns @count on success, -EINVAL for empty, unparsable or out-of-range
 * input, and -ENODEV when the disk has no disk_events.
 */
static ssize_t disk_events_poll_msecs_store(struct device *dev,
                                            struct device_attribute *attr,
                                            const char *buf, size_t count)
{
        struct gendisk *disk = dev_to_disk(dev);
        long msecs;

        if (!count || !sscanf(buf, "%ld", &msecs))
                return -EINVAL;

        /* -1 is the only negative value allowed */
        if (msecs < -1)
                return -EINVAL;

        if (!disk->ev)
                return -ENODEV;

        disk_block_events(disk);
        disk->ev->poll_msecs = msecs;
        __disk_unblock_events(disk, true);

        return count;
}

/*
 * sysfs attributes for disk events: "events" and "events_async" are
 * read-only, "events_poll_msecs" is read-write (0644).
 */
static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, 0644,
                         disk_events_poll_msecs_show,
                         disk_events_poll_msecs_store);

/*
 * NULL-terminated list handed to sysfs_create_files() and
 * sysfs_remove_files().
 */
static const struct attribute *disk_events_attrs[] = {
        &dev_attr_events.attr,
        &dev_attr_events_async.attr,
        &dev_attr_events_poll_msecs.attr,
        NULL,
};

/*
 * The default polling interval can be specified by the kernel
 * parameter block.events_dfl_poll_msecs which defaults to 0
 * (disable).  This can also be modified runtime by writing to
 * /sys/module/block/parameters/events_dfl_poll_msecs.
 *
 * Setter for that parameter: stores the new value through
 * param_set_ulong() and, on success, calls disk_flush_events() with an
 * empty mask on every registered disk.  Returns 0 or the error from
 * param_set_ulong().
 */
static int disk_events_set_dfl_poll_msecs(const char *val,
                                          const struct kernel_param *kp)
{
        struct disk_events *ev;
        int err = param_set_ulong(val, kp);

        if (err < 0)
                return err;

        mutex_lock(&disk_events_mutex);
        list_for_each_entry(ev, &disk_events, node)
                disk_flush_events(ev->disk, 0);
        mutex_unlock(&disk_events_mutex);

        return 0;
}

/*
 * Parameter ops for events_dfl_poll_msecs: custom setter (see
 * disk_events_set_dfl_poll_msecs()), stock unsigned-long getter.
 */
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
        .set        = disk_events_set_dfl_poll_msecs,
        .get        = param_get_ulong,
};

/* Override the parameter prefix so the parameter is named block.<name>. */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX        "block."

module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
                &disk_events_dfl_poll_msecs, 0644);

/*
 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
 */
/*
 * Allocate and initialise the disk_events of @disk.
 *
 * Nothing is allocated unless the driver provides fops->check_events and
 * disk->events is non-zero.  On allocation failure a warning is logged and
 * disk->ev is left as it was.  The new structure starts with a block
 * depth of 1 and poll_msecs of -1.
 */
static void disk_alloc_events(struct gendisk *disk)
{
        struct disk_events *ev;

        if (!(disk->fops->check_events && disk->events))
                return;

        ev = kzalloc(sizeof(*ev), GFP_KERNEL);
        if (!ev) {
                pr_warn("%s: failed to initialize events\n", disk->disk_name);
                return;
        }

        ev->disk = disk;
        ev->block = 1;
        ev->poll_msecs = -1;
        INIT_LIST_HEAD(&ev->node);
        spin_lock_init(&ev->lock);
        mutex_init(&ev->block_mutex);
        INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);

        disk->ev = ev;
}

/*
 * Create the event sysfs files for @disk and, if the disk has a
 * disk_events, put it on the global list and start event checking.
 *
 * A failure to create the sysfs files is only logged.
 */
static void disk_add_events(struct gendisk *disk)
{
        struct kobject *kobj = &disk_to_dev(disk)->kobj;

        /* FIXME: error handling */
        if (sysfs_create_files(kobj, disk_events_attrs) < 0)
                pr_warn("%s: failed to create sysfs files for events\n",
                        disk->disk_name);

        if (!disk->ev)
                return;

        mutex_lock(&disk_events_mutex);
        list_add_tail(&disk->ev->node, &disk_events);
        mutex_unlock(&disk_events_mutex);

        /*
         * Block count is initialized to 1 and the following initial
         * unblock kicks it into action.
         */
        __disk_unblock_events(disk, true);
}

static void disk_del_events(struct gendisk *disk)
{
        if (disk->ev) {
                disk_block_events(disk);

                mutex_lock(&disk_events_mutex);
                list_del_init(&disk->ev->node);
                mutex_unlock(&disk_events_mutex);
        }

        sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
}

static void disk_release_events(struct gendisk *disk)
{
        /* the block count should be 1 from disk_del_events() */
        WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
        kfree(disk->ev);
}

















































    1 













    4 

    1 




    5 



































    4 





    1 








    5 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>

/*
 * Reader/writer semaphore whose reader count is kept per CPU.
 *
 * Readers on the fast path only touch their own CPU's read_count; see
 * percpu_down_read() and percpu_up_read() below.
 */
struct percpu_rw_semaphore {
        /* State tested with rcu_sync_is_idle() to choose the reader fast path. */
        struct rcu_sync                rss;
        /* Per-CPU reader counter, adjusted with this_cpu_inc()/this_cpu_dec(). */
        unsigned int __percpu        *read_count;
        /* Woken via rcuwait_wake_up() from the percpu_up_read() slow path. */
        struct rcuwait                writer;
        /* NOTE(review): not used in this header; presumably slow-path waiters. */
        wait_queue_head_t        waiters;
        /* Statically initialized to 0; not otherwise used in this header. */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep tracking state, present only with CONFIG_DEBUG_LOCK_ALLOC. */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Initializer fragment for the dep_map member.  It names the lockdep map
 * after the lock when CONFIG_DEBUG_LOCK_ALLOC is set and expands to nothing
 * otherwise, matching the conditional member in the struct.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Statically define a percpu_rw_semaphore called @name.
 *
 * Also defines a static per-CPU counter __percpu_rwsem_rc_<name> and points
 * read_count at it.  @is_static is pasted in front of the semaphore
 * definition, so it is either `static` or empty.  The expansion has no
 * trailing semicolon; the user of the macro supplies it.
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static)                                \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);                \
is_static struct percpu_rw_semaphore name = {                                \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                        \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),                \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                                \
}

/*
 * Public wrappers around __DEFINE_PERCPU_RWSEM(): the first defines a
 * semaphore with external linkage, the second a file-local (static) one.
 */
#define DEFINE_PERCPU_RWSEM(name)                \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name)        \
        __DEFINE_PERCPU_RWSEM(name, static)

extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);

/*
 * Acquire @sem for reading.  Annotated with might_sleep().
 *
 * Fast path: while rcu_sync_is_idle() holds for sem->rss, only this CPU's
 * read_count is incremented, with preemption disabled.  Otherwise the
 * out-of-line __percpu_down_read() is called with its bool argument false
 * and its return value is ignored.
 */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        might_sleep();

        /* Tell lockdep about the read acquisition before taking it. */
        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): the struct has no "state" member; "sem->state" here
         * presumably refers to the sem->rss state tested by
         * rcu_sync_is_idle() below - confirm.
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/*
 * Try to acquire @sem for reading.
 *
 * Returns true when the fast path is taken or when __percpu_down_read(),
 * called with its bool argument true, returns true; returns false
 * otherwise.  Lockdep is told about the acquisition only on success.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        /* Only a successful attempt is recorded with lockdep. */
        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * Release a read hold on @sem.
 *
 * Fast path: decrement this CPU's read_count.  Slow path (sem->rss not
 * idle): issue a full barrier, decrement, then wake sem->writer.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        /* Tell lockdep the lock is being dropped before it actually is. */
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Runtime initializer for a percpu_rw_semaphore.
 *
 * Each expansion declares its own static lock_class_key and passes it,
 * with the stringified @sem expression as the name, to
 * __percpu_init_rwsem().  The statement expression evaluates to that
 * function's int return value.
 */
#define percpu_init_rwsem(sem)                                        \
({                                                                \
        static struct lock_class_key rwsem_key;                        \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);                \
})

/* Thin aliases for the generic lockdep held-state query and assertion. */
#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Lockdep-only release of @sem: calls lock_release() on sem->dep_map with
 * @ip as the caller address.  The semaphore itself is not touched, and
 * @read is not used by this helper.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_release(&sem->dep_map, ip);
}

/*
 * Lockdep-only acquisition of @sem: calls lock_acquire() on sem->dep_map,
 * passing @read and @ip through.  The semaphore itself is not touched.
 *
 * NOTE(review): going by lockdep's lock_acquire() prototype, the literal
 * arguments are presumably subclass 0, trylock 1, check 1 and no nest
 * lock - confirm against <linux/lockdep.h>.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}

#endif



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 






















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/file.c - kernfs file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>

#include "kernfs-internal.h"

/*
 * There's one kernfs_open_file for each open file and one kernfs_open_node
 * for each kernfs_node with one or more open files.
 *
 * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
 * protected by kernfs_open_node_lock.
 *
 * filp->private_data points to seq_file whose ->private points to
 * kernfs_open_file.  kernfs_open_files are chained at
 * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
 */
static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);

/* Per-kernfs_node state shared by all open files of that node. */
struct kernfs_open_node {
        /* One reference per open file; the node is freed when it hits zero. */
        atomic_t                refcnt;
        /* Event counter, starts at 1; readers snapshot it into of->event. */
        atomic_t                event;
        /* Wait queue head; NOTE(review): presumably for poll() - confirm. */
        wait_queue_head_t        poll;
        struct list_head        files; /* goes through kernfs_open_file.list */
};

/*
 * kernfs_notify() may be called from any context and bounces notifications
 * through a work item.  To minimize space overhead in kernfs_node, the
 * pending queue is implemented as a singly linked list of kernfs_nodes.
 * The list is terminated with the self pointer so that whether a
 * kernfs_node is on the list or not can be determined by testing the next
 * pointer for NULL.
 */
#define KERNFS_NOTIFY_EOL                        ((void *)&kernfs_notify_list)

static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;

static struct kernfs_open_file *kernfs_of(struct file *file)
{
        return ((struct seq_file *)file->private_data)->private;
}

/*
 * Determine the kernfs_ops for the given kernfs_node.  This function must
 * be called while holding an active reference.
 */
static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
{
        if (kn->flags & KERNFS_LOCKDEP)
                lockdep_assert_held(kn);
        return kn->attr.ops;
}

/*
 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
 * a seq_file iteration which is fully initialized with an active reference
 * or an aborted kernfs_seq_start() due to get_active failure.  The
 * position pointer is the only context for each seq_file iteration and
 * thus the stop condition should be encoded in it.  As the return value is
 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
 * choice to indicate get_active failure.
 *
 * Unfortunately, this is complicated due to the optional custom seq_file
 * operations which may return ERR_PTR(-ENODEV) too.  kernfs_seq_stop()
 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
 * custom seq_file operations and thus can't decide whether put_active
 * should be performed or not only on ERR_PTR(-ENODEV).
 *
 * This is worked around by factoring out the custom seq_stop() and
 * put_active part into kernfs_seq_stop_active(), skipping it from
 * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
 * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
 * that kernfs_seq_stop_active() is skipped only after get_active failure.
 */
static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;
        const struct kernfs_ops *ops = kernfs_ops(of->kn);

        if (ops->seq_stop)
                ops->seq_stop(sf, v);
        kernfs_put_active(of->kn);
}

/*
 * seq_file ->start handler: take @of->mutex and an active reference on the
 * node, then delegate to the node's own seq_start() if it provides one.
 *
 * Returns ERR_PTR(-ENODEV) when no active reference could be taken, the
 * custom seq_start() result when that op exists, and otherwise a non-NULL
 * token for *ppos == 0 or NULL for any later position.  @of->mutex is still
 * held on every return path; kernfs_seq_stop() releases it.
 */
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
        struct kernfs_open_file *of = sf->private;
        const struct kernfs_ops *ops;

        /*
         * @of->mutex nests outside active ref and is primarily to ensure that
         * the ops aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (!kernfs_get_active(of->kn))
                return ERR_PTR(-ENODEV);

        ops = kernfs_ops(of->kn);
        if (ops->seq_start) {
                void *next = ops->seq_start(sf, ppos);
                /* see the comment above kernfs_seq_stop_active() */
                if (next == ERR_PTR(-ENODEV))
                        kernfs_seq_stop_active(sf, next);
                return next;
        }

        /*
         * The same behavior as single_open(): return !NULL if pos is at
         * the beginning; otherwise, NULL.  The token is built with an
         * integer cast rather than "NULL + 1", because arithmetic on a
         * null void pointer is undefined in ISO C and only works as a
         * GNU extension.
         */
        return (void *)(unsigned long)!*ppos;
}

static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
        struct kernfs_open_file *of = sf->private;
        const struct kernfs_ops *ops = kernfs_ops(of->kn);
        void *next;

        if (!ops->seq_next) {
                /*
                 * No custom iterator: behave like single_open() and end
                 * the sequence after the first record.
                 */
                (*ppos)++;
                return NULL;
        }

        next = ops->seq_next(sf, v, ppos);
        /* see the comment above kernfs_seq_stop_active() */
        if (next == ERR_PTR(-ENODEV))
                kernfs_seq_stop_active(sf, next);

        return next;
}

static void kernfs_seq_stop(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;

        if (v != ERR_PTR(-ENODEV))
                kernfs_seq_stop_active(sf, v);
        mutex_unlock(&of->mutex);
}

static int kernfs_seq_show(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;

        of->event = atomic_read(&of->kn->attr.open->event);

        return of->kn->attr.ops->seq_show(sf, v);
}

/* seq_file iterator table wiring in the kernfs_seq_*() wrappers above. */
static const struct seq_operations kernfs_seq_ops = {
        .start = kernfs_seq_start,
        .next = kernfs_seq_next,
        .stop = kernfs_seq_stop,
        .show = kernfs_seq_show,
};

/*
 * As reading a bin file can have side-effects, the exact offset and bytes
 * specified in read(2) call should be passed to the read callback making
 * it difficult to use seq_file.  Implement simplistic custom buffering for
 * bin files.
 */
static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
        /* A single call never transfers more than one page. */
        ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
        const struct kernfs_ops *ops;
        char *buf;

        /*
         * Use the preallocated buffer when there is one, serialized by
         * @of->prealloc_mutex; otherwise allocate a temporary buffer.
         */
        buf = of->prealloc_buf;
        if (buf)
                mutex_lock(&of->prealloc_mutex);
        else
                buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /*
         * @of->mutex nests outside active ref and is used to ensure that
         * the ops aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (!kernfs_get_active(of->kn)) {
                len = -ENODEV;
                mutex_unlock(&of->mutex);
                goto out_free;
        }

        /* Snapshot the node's event counter into this open file. */
        of->event = atomic_read(&of->kn->attr.open->event);
        ops = kernfs_ops(of->kn);
        if (ops->read)
                len = ops->read(of, buf, len, iocb->ki_pos);
        else
                len = -EINVAL;

        kernfs_put_active(of->kn);
        mutex_unlock(&of->mutex);

        if (len < 0)
                goto out_free;

        /* A short copy to the caller's iterator is reported as -EFAULT. */
        if (copy_to_iter(buf, len, iter) != len) {
                len = -EFAULT;
                goto out_free;
        }

        iocb->ki_pos += len;

 out_free:
        /* Undo whichever buffer setup was done at the top. */
        if (buf == of->prealloc_buf)
                mutex_unlock(&of->prealloc_mutex);
        else
                kfree(buf);
        return len;
}

static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
                return seq_read_iter(iocb, iter);
        return kernfs_file_read_iter(iocb, iter);
}

/*
 * Copy data in from userland and pass it to the matching kernfs write
 * operation.
 *
 * There is no easy way for us to know if userspace is only doing a partial
 * write, so we don't support them. We expect the entire buffer to come on
 * the first write.  Hint: if you're writing a value, first read the file,
 * modify only the value you're changing, then write the entire buffer
 * back.
 */
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
        ssize_t len = iov_iter_count(iter);
        const struct kernfs_ops *ops;
        char *buf;

        /*
         * With an atomic write length set, oversized writes are rejected
         * with -E2BIG; without one, the write is truncated to one page.
         */
        if (of->atomic_write_len) {
                if (len > of->atomic_write_len)
                        return -E2BIG;
        } else {
                len = min_t(size_t, len, PAGE_SIZE);
        }

        /*
         * Use the preallocated buffer when there is one, serialized by
         * @of->prealloc_mutex; otherwise allocate len + 1 bytes so there
         * is room for the terminating NUL.
         */
        buf = of->prealloc_buf;
        if (buf)
                mutex_lock(&of->prealloc_mutex);
        else
                buf = kmalloc(len + 1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* A short copy from the caller's iterator is reported as -EFAULT. */
        if (copy_from_iter(buf, len, iter) != len) {
                len = -EFAULT;
                goto out_free;
        }
        buf[len] = '\0';        /* guarantee string termination */

        /*
         * @of->mutex nests outside active ref and is used to ensure that
         * the ops aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (!kernfs_get_active(of->kn)) {
                mutex_unlock(&of->mutex);
                len = -ENODEV;
                goto out_free;
        }

        ops = kernfs_ops(of->kn);
        if (ops->write)
                len = ops->write(of, buf, len, iocb->ki_pos);
        else
                len = -EINVAL;

        kernfs_put_active(of->kn);
        mutex_unlock(&of->mutex);

        /* Advance the file position only when something was written. */
        if (len > 0)
                iocb->ki_pos += len;

out_free:
        /* Undo whichever buffer setup was done at the top. */
        if (buf == of->prealloc_buf)
                mutex_unlock(&of->prealloc_mutex);
        else
                kfree(buf);
        return len;
}

static void kernfs_vma_open(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);

        if (!of->vm_ops)
                return;

        if (!kernfs_get_active(of->kn))
                return;

        if (of->vm_ops->open)
                of->vm_ops->open(vma);

        kernfs_put_active(of->kn);
}

static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        vm_fault_t ret;

        if (!of->vm_ops)
                return VM_FAULT_SIGBUS;

        if (!kernfs_get_active(of->kn))
                return VM_FAULT_SIGBUS;

        ret = VM_FAULT_SIGBUS;
        if (of->vm_ops->fault)
                ret = of->vm_ops->fault(vmf);

        kernfs_put_active(of->kn);
        return ret;
}

static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        vm_fault_t ret;

        if (!of->vm_ops)
                return VM_FAULT_SIGBUS;

        if (!kernfs_get_active(of->kn))
                return VM_FAULT_SIGBUS;

        ret = 0;
        if (of->vm_ops->page_mkwrite)
                ret = of->vm_ops->page_mkwrite(vmf);
        else
                file_update_time(file);

        kernfs_put_active(of->kn);
        return ret;
}

static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
                             void *buf, int len, int write)
{
        struct file *file = vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        int ret;

        if (!of->vm_ops)
                return -EINVAL;

        if (!kernfs_get_active(of->kn))
                return -EINVAL;

        ret = -EINVAL;
        if (of->vm_ops->access)
                ret = of->vm_ops->access(vma, addr, buf, len, write);

        kernfs_put_active(of->kn);
        return ret;
}

#ifdef CONFIG_NUMA
static int kernfs_vma_set_policy(struct vm_area_struct *vma,
                                 struct mempolicy *new)
{
        struct file *file = vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        int ret;

        if (!of->vm_ops)
                return 0;

        if (!kernfs_get_active(of->kn))
                return -EINVAL;

        ret = 0;
        if (of->vm_ops->set_policy)
                ret = of->vm_ops->set_policy(vma, new);

        kernfs_put_active(of->kn);
        return ret;
}

static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
                                               unsigned long addr)
{
        struct file *file = vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        struct mempolicy *pol;

        if (!of->vm_ops)
                return vma->vm_policy;

        if (!kernfs_get_active(of->kn))
                return vma->vm_policy;

        pol = vma->vm_policy;
        if (of->vm_ops->get_policy)
                pol = of->vm_ops->get_policy(vma, addr);

        kernfs_put_active(of->kn);
        return pol;
}

#endif

/*
 * vm_operations installed on the vma by kernfs_fop_mmap().  Each entry is
 * a kernfs_vma_*() wrapper that forwards to the corresponding op saved in
 * of->vm_ops.  The policy hooks exist only with CONFIG_NUMA.
 */
static const struct vm_operations_struct kernfs_vm_ops = {
        .open                = kernfs_vma_open,
        .fault                = kernfs_vma_fault,
        .page_mkwrite        = kernfs_vma_page_mkwrite,
        .access                = kernfs_vma_access,
#ifdef CONFIG_NUMA
        .set_policy        = kernfs_vma_set_policy,
        .get_policy        = kernfs_vma_get_policy,
#endif
};

/*
 * ->mmap entry point: let the node's mmap op set up @vma, then save the
 * vm_ops it installed in of->vm_ops and replace them with kernfs_vm_ops.
 *
 * Returns 0 on success, -ENODEV when the node has no mmap op or no active
 * reference can be taken, the op's own error if it fails, and -EINVAL when
 * the resulting vm_ops cannot be wrapped.
 */
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct kernfs_open_file *of = kernfs_of(file);
        const struct kernfs_ops *ops;
        int rc;

        /*
         * mmap path and of->mutex are prone to triggering spurious lockdep
         * warnings and we don't want to add spurious locking dependency
         * between the two.  Check whether mmap is actually implemented
         * without grabbing @of->mutex by testing HAS_MMAP flag.  See the
         * comment in kernfs_fop_open() for more details.
         */
        if (!(of->kn->flags & KERNFS_HAS_MMAP))
                return -ENODEV;

        mutex_lock(&of->mutex);

        rc = -ENODEV;
        if (!kernfs_get_active(of->kn))
                goto out_unlock;

        ops = kernfs_ops(of->kn);
        rc = ops->mmap(of, vma);
        if (rc)
                goto out_put;

        /*
         * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
         * to satisfy versions of X which crash if the mmap fails: that
         * substitutes a new vm_file, and we don't then want kernfs_vm_ops.
         */
        if (vma->vm_file != file)
                goto out_put;

        /*
         * A file that is already mmapped must see the same underlying
         * vm_ops again, since only one set is saved per open file.
         */
        rc = -EINVAL;
        if (of->mmapped && of->vm_ops != vma->vm_ops)
                goto out_put;

        /*
         * It is not possible to successfully wrap close.
         * So error if someone is trying to use close.
         */
        rc = -EINVAL;
        if (vma->vm_ops && vma->vm_ops->close)
                goto out_put;

        /* Save the real vm_ops and interpose the kernfs wrappers. */
        rc = 0;
        of->mmapped = true;
        of->vm_ops = vma->vm_ops;
        vma->vm_ops = &kernfs_vm_ops;
out_put:
        kernfs_put_active(of->kn);
out_unlock:
        mutex_unlock(&of->mutex);

        return rc;
}

/**
 *        kernfs_get_open_node - get or create kernfs_open_node
 *        @kn: target kernfs_node
 *        @of: kernfs_open_file for this instance of open
 *
 *        If @kn->attr.open exists, increment its reference count; otherwise,
 *        create one.  @of is chained to the files list.
 *
 *        LOCKING:
 *        Kernel thread context (may sleep).
 *
 *        RETURNS:
 *        0 on success, -errno on failure.
 */
static int kernfs_get_open_node(struct kernfs_node *kn,
                                struct kernfs_open_file *of)
{
        struct kernfs_open_node *on, *new_on = NULL;

 retry:
        /* The mutex guards the files list, the spinlock guards attr.open. */
        mutex_lock(&kernfs_open_file_mutex);
        spin_lock_irq(&kernfs_open_node_lock);

        /*
         * Second pass only: install the node allocated below, unless one
         * was installed in the meantime while the locks were dropped.
         */
        if (!kn->attr.open && new_on) {
                kn->attr.open = new_on;
                new_on = NULL;
        }

        on = kn->attr.open;
        if (on) {
                atomic_inc(&on->refcnt);
                list_add_tail(&of->list, &on->files);
        }

        spin_unlock_irq(&kernfs_open_node_lock);
        mutex_unlock(&kernfs_open_file_mutex);

        if (on) {
                /* Frees an unused allocation; new_on is NULL otherwise. */
                kfree(new_on);
                return 0;
        }

        /* not there, initialize a new one and retry */
        new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
        if (!new_on)
                return -ENOMEM;

        /* refcnt starts at 0; the retry pass takes the first reference. */
        atomic_set(&new_on->refcnt, 0);
        atomic_set(&new_on->event, 1);
        init_waitqueue_head(&new_on->poll);
        INIT_LIST_HEAD(&new_on->files);
        goto retry;
}

/**
 *        kernfs_put_open_node - put kernfs_open_node
 *        @kn: target kernfs_node
 *        @of: associated kernfs_open_file
 *
 *        Put @kn->attr.open and unlink @of from the files list.  If
 *        reference count reaches zero, disassociate and free it.
 *
 *        LOCKING:
 *        None.
 */
static void kernfs_put_open_node(struct kernfs_node *kn,
                                 struct kernfs_open_file *of)
{
        struct kernfs_open_node *on = kn->attr.open;
        unsigned long flags;

        /* Same lock order as kernfs_get_open_node(): mutex, then spinlock. */
        mutex_lock(&kernfs_open_file_mutex);
        spin_lock_irqsave(&kernfs_open_node_lock, flags);

        /* @of may be NULL, in which case only the reference is dropped. */
        if (of)
                list_del(&of->list);

        /*
         * On the last reference, detach the open node from @kn and keep
         * the pointer for freeing; otherwise clear it so that the kfree()
         * below is a no-op.
         */
        if (atomic_dec_and_test(&on->refcnt))
                kn->attr.open = NULL;
        else
                on = NULL;

        spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
        mutex_unlock(&kernfs_open_file_mutex);

        kfree(on);
}

/*
 * ->open() handler for kernfs files.
 *
 * Takes an active reference on the node for the duration of the call,
 * allocates and initializes the per-open kernfs_open_file, sets up the
 * seq_file, links the open file to the node's kernfs_open_node and finally
 * invokes the optional ops->open() callback.
 *
 * Returns 0 on success.  Returns -ENODEV if the active reference cannot be
 * taken, -EACCES if the extra permission check fails, -ENOMEM on allocation
 * failure, -EINVAL for the ->prealloc + ->seq_show combination, or the
 * error returned by seq_open(), kernfs_get_open_node() or ops->open().
 */
static int kernfs_fop_open(struct inode *inode, struct file *file)
{
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root = kernfs_root(kn);
        const struct kernfs_ops *ops;
        struct kernfs_open_file *of;
        bool has_read, has_write, has_mmap;
        int error = -EACCES;

        if (!kernfs_get_active(kn))
                return -ENODEV;

        ops = kernfs_ops(kn);

        /* an ->mmap implementation counts as both readable and writable */
        has_read = ops->seq_show || ops->read || ops->mmap;
        has_write = ops->write || ops->mmap;
        has_mmap = ops->mmap;

        /* see the flag definition for details */
        if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
                if ((file->f_mode & FMODE_WRITE) &&
                    (!(inode->i_mode & S_IWUGO) || !has_write))
                        goto err_out;

                if ((file->f_mode & FMODE_READ) &&
                    (!(inode->i_mode & S_IRUGO) || !has_read))
                        goto err_out;
        }

        /* allocate a kernfs_open_file for the file */
        error = -ENOMEM;
        of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
        if (!of)
                goto err_out;

        /*
         * The following is done to give a different lockdep key to
         * @of->mutex for files which implement mmap.  This is a rather
         * crude way to avoid false positive lockdep warning around
         * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
         * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
         * which mm->mmap_lock nests, while holding @of->mutex.  As each
         * open file has a separate mutex, it's okay as long as those don't
         * happen on the same file.  At this point, we can't easily give
         * each file a separate locking class.  Let's differentiate on
         * whether the file has mmap or not for now.
         *
         * Both paths of the branch look the same.  They're supposed to
         * look that way and give @of->mutex different static lockdep keys.
         */
        if (has_mmap)
                mutex_init(&of->mutex);
        else
                mutex_init(&of->mutex);

        of->kn = kn;
        of->file = file;

        /*
         * Write path needs to access atomic_write_len outside active
         * reference.  Cache it in open_file.  See kernfs_fop_write_iter()
         * for details.
         */
        of->atomic_write_len = ops->atomic_write_len;

        error = -EINVAL;
        /*
         * ->seq_show is incompatible with ->prealloc,
         * as seq_read does its own allocation.
         * ->read must be used instead.
         */
        if (ops->prealloc && ops->seq_show)
                goto err_free;
        if (ops->prealloc) {
                /* buffer is one byte larger than the chosen length */
                int len = of->atomic_write_len ?: PAGE_SIZE;
                of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
                error = -ENOMEM;
                if (!of->prealloc_buf)
                        goto err_free;
                mutex_init(&of->prealloc_mutex);
        }

        /*
         * Always instantiate seq_file even if read access doesn't use
         * seq_file or is not requested.  This unifies private data access
         * and readable regular files are the vast majority anyway.
         */
        if (ops->seq_show)
                error = seq_open(file, &kernfs_seq_ops);
        else
                error = seq_open(file, NULL);
        if (error)
                goto err_free;

        /* seq_open() left the seq_file in file->private_data; link it to @of */
        of->seq_file = file->private_data;
        of->seq_file->private = of;

        /* seq_file clears PWRITE unconditionally, restore it if WRITE */
        if (file->f_mode & FMODE_WRITE)
                file->f_mode |= FMODE_PWRITE;

        /* make sure we have open node struct */
        error = kernfs_get_open_node(kn, of);
        if (error)
                goto err_seq_release;

        if (ops->open) {
                /* nobody has access to @of yet, skip @of->mutex */
                error = ops->open(of);
                if (error)
                        goto err_put_node;
        }

        /* open succeeded, put active references */
        kernfs_put_active(kn);
        return 0;

        /* error unwinding: each label undoes one setup step, in reverse */
err_put_node:
        kernfs_put_open_node(kn, of);
err_seq_release:
        seq_release(inode, file);
err_free:
        /* @of came from kzalloc(), so prealloc_buf is NULL if never set */
        kfree(of->prealloc_buf);
        kfree(of);
err_out:
        kernfs_put_active(kn);
        return error;
}

/*
 * Invoke the node's ->release() callback for @of at most once.  Shared by
 * the file release path and the drain path.
 */
static void kernfs_release_file(struct kernfs_node *kn,
                                struct kernfs_open_file *of)
{
        /*
         * No other file operation can be in flight on @of, so the only
         * thing to serialize is release against drain.
         * @kernfs_open_file_mutex is sufficient for that; @of->mutex is
         * avoided because the drain path may run from places where taking
         * it would create a circular dependency.
         */
        lockdep_assert_held(&kernfs_open_file_mutex);

        /* already released by the other path, nothing left to do */
        if (of->released)
                return;

        /*
         * Files are never detached without being released, and files of
         * deactivated nodes that are being drained must still be
         * releasable, so go through kn->attr.ops directly rather than
         * kernfs_ops().
         */
        kn->attr.ops->release(of);
        of->released = true;
}

/*
 * ->release() handler for kernfs files: run the node's release callback
 * when it has one, drop the open-node reference, then tear down the
 * seq_file and free the per-open state.  Always returns 0.
 */
static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
        struct kernfs_node *node = inode->i_private;
        struct kernfs_open_file *open_file = kernfs_of(filp);
        const bool has_release = node->flags & KERNFS_HAS_RELEASE;

        if (has_release) {
                /* kernfs_release_file() requires kernfs_open_file_mutex */
                mutex_lock(&kernfs_open_file_mutex);
                kernfs_release_file(node, open_file);
                mutex_unlock(&kernfs_open_file_mutex);
        }

        kernfs_put_open_node(node, open_file);
        seq_release(inode, filp);

        kfree(open_file->prealloc_buf);
        kfree(open_file);

        return 0;
}

/*
 * Walk every open file of @kn and, depending on the node's flags, unmap
 * its mappings (KERNFS_HAS_MMAP) and/or invoke the release callback
 * (KERNFS_HAS_RELEASE).  Does nothing if neither flag is set or if @kn has
 * no open node.
 */
void kernfs_drain_open_files(struct kernfs_node *kn)
{
        struct kernfs_open_node *on;
        struct kernfs_open_file *of;

        if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
                return;

        /* pin the open node so it stays valid while the list is walked */
        spin_lock_irq(&kernfs_open_node_lock);
        on = kn->attr.open;
        if (on)
                atomic_inc(&on->refcnt);
        spin_unlock_irq(&kernfs_open_node_lock);
        if (!on)
                return;

        /* kernfs_release_file() asserts that this mutex is held */
        mutex_lock(&kernfs_open_file_mutex);

        list_for_each_entry(of, &on->files, list) {
                struct inode *inode = file_inode(of->file);

                if (kn->flags & KERNFS_HAS_MMAP)
                        unmap_mapping_range(inode->i_mapping, 0, 0, 1);

                if (kn->flags & KERNFS_HAS_RELEASE)
                        kernfs_release_file(kn, of);
        }

        mutex_unlock(&kernfs_open_file_mutex);

        /* drop the reference taken above; NULL means no file to unlink */
        kernfs_put_open_node(kn, NULL);
}

/*
 * Kernfs attribute files are pollable.  The idea is that you read
 * the content and then you use 'poll' or 'select' to wait for
 * the content to change.  When the content changes (assuming the
 * manager for the kobject supports notification), poll will
 * return EPOLLERR|EPOLLPRI, and select will return the fd whether
 * it is waiting for read, write, or exceptions.
 * Once poll/select indicates that the value has changed, you
 * need to close and re-open the file, or seek to 0 and read again.
 * Reminder: this only works for attributes which actively support
 * it, and it is not possible to test an attribute from userspace
 * to see if it supports poll (Neither 'poll' nor 'select' return
 * an appropriate error code).  When in doubt, set a suitable timeout value.
 */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
        struct kernfs_node *node = kernfs_dentry_node(of->file->f_path.dentry);
        struct kernfs_open_node *open_node = node->attr.open;
        __poll_t mask = DEFAULT_POLLMASK;

        /* register on the open node's wait queue */
        poll_wait(of->file, &open_node->poll, wait);

        /* the event counter moved since @of last recorded it */
        if (of->event != atomic_read(&open_node->event))
                mask |= EPOLLERR | EPOLLPRI;

        return mask;
}

/*
 * ->poll() handler for kernfs files.  Dispatches to the node's own poll
 * callback when it has one and to kernfs_generic_poll() otherwise.  If no
 * active reference can be taken, reports DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI
 * without calling either.
 */
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
        struct kernfs_open_file *of = kernfs_of(filp);
        struct kernfs_node *node = kernfs_dentry_node(filp->f_path.dentry);
        __poll_t mask = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;

        if (kernfs_get_active(node)) {
                if (node->attr.ops->poll)
                        mask = node->attr.ops->poll(of, wait);
                else
                        mask = kernfs_generic_poll(of, wait);

                kernfs_put_active(node);
        }

        return mask;
}

/*
 * Work function scheduled by kernfs_notify().  Pops nodes off
 * kernfs_notify_list one at a time until the KERNFS_NOTIFY_EOL terminator
 * is reached and generates FS_MODIFY fsnotify events for each of them on
 * every superblock of the node's root.
 */
static void kernfs_notify_workfn(struct work_struct *work)
{
        struct kernfs_node *kn;
        struct kernfs_super_info *info;
repeat:
        /* pop one off the notify_list */
        spin_lock_irq(&kernfs_notify_lock);
        kn = kernfs_notify_list;
        if (kn == KERNFS_NOTIFY_EOL) {
                spin_unlock_irq(&kernfs_notify_lock);
                return;
        }
        kernfs_notify_list = kn->attr.notify_next;
        /* NULL notify_next lets kernfs_notify() queue this node again */
        kn->attr.notify_next = NULL;
        spin_unlock_irq(&kernfs_notify_lock);

        /* kick fsnotify */
        mutex_lock(&kernfs_mutex);

        list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
                struct kernfs_node *parent;
                struct inode *p_inode = NULL;
                struct inode *inode;
                struct qstr name;

                /*
                 * We want fsnotify_modify() on @kn but as the
                 * modifications aren't originating from userland we don't
                 * have the matching @file available.  Look up the inodes
                 * and generate the events manually.
                 */
                inode = ilookup(info->sb, kernfs_ino(kn));
                if (!inode)
                        continue;

                name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
                parent = kernfs_get_parent(kn);
                if (parent) {
                        p_inode = ilookup(info->sb, kernfs_ino(parent));
                        if (p_inode) {
                                fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
                                         inode, FSNOTIFY_EVENT_INODE,
                                         p_inode, &name, inode, 0);
                                iput(p_inode);
                        }

                        kernfs_put(parent);
                }

                /*
                 * No parent inode was found: notify on the inode alone.
                 * @p_inode is only tested for NULL here, not dereferenced.
                 */
                if (!p_inode)
                        fsnotify_inode(inode, FS_MODIFY);

                iput(inode);
        }

        mutex_unlock(&kernfs_mutex);
        /* drop the reference kernfs_notify() took when queueing @kn */
        kernfs_put(kn);
        goto repeat;
}

/**
 * kernfs_notify - notify a kernfs file
 * @kn: file to notify
 *
 * Notify @kn such that poll(2) on @kn wakes up.  May be called from any
 * context.
 */
void kernfs_notify(struct kernfs_node *kn)
{
        static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
        unsigned long flags;
        struct kernfs_open_node *on;

        /* only regular kernfs files can be notified */
        if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
                return;

        /* kick poll immediately */
        spin_lock_irqsave(&kernfs_open_node_lock, flags);
        on = kn->attr.open;
        if (on) {
                /* bump the counter that kernfs_generic_poll() compares */
                atomic_inc(&on->event);
                wake_up_interruptible(&on->poll);
        }
        spin_unlock_irqrestore(&kernfs_open_node_lock, flags);

        /* schedule work to kick fsnotify */
        spin_lock_irqsave(&kernfs_notify_lock, flags);
        /* a non-NULL notify_next means @kn is already on the list */
        if (!kn->attr.notify_next) {
                /* reference is dropped by kernfs_notify_workfn() */
                kernfs_get(kn);
                kn->attr.notify_next = kernfs_notify_list;
                kernfs_notify_list = kn;
                schedule_work(&kernfs_notify_work);
        }
        spin_unlock_irqrestore(&kernfs_notify_lock, flags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);

/*
 * file_operations table for kernfs files: wires the VFS entry points to
 * the kernfs_fop_*() handlers and to generic helpers for llseek, fsync and
 * splice.
 */
const struct file_operations kernfs_file_fops = {
        .read_iter        = kernfs_fop_read_iter,
        .write_iter        = kernfs_fop_write_iter,
        .llseek                = generic_file_llseek,
        .mmap                = kernfs_fop_mmap,
        .open                = kernfs_fop_open,
        .release        = kernfs_fop_release,
        .poll                = kernfs_fop_poll,
        .fsync                = noop_fsync,
        .splice_read        = generic_file_splice_read,
        .splice_write        = iter_file_splice_write,
};

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @uid: uid of the file
 * @gid: gid of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Allocates a KERNFS_FILE node, fills in its attributes and adds it to the
 * hierarchy.
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                         const char *name,
                                         umode_t mode, kuid_t uid, kgid_t gid,
                                         loff_t size,
                                         const struct kernfs_ops *ops,
                                         void *priv, const void *ns,
                                         struct lock_class_key *key)
{
        struct kernfs_node *node;
        int ret;

        node = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
                               uid, gid, KERNFS_FILE);
        if (!node)
                return ERR_PTR(-ENOMEM);

        node->attr.ops = ops;
        node->attr.size = size;
        node->ns = ns;
        node->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (key) {
                lockdep_init_map(&node->dep_map, "kn->active", key, 0);
                node->flags |= KERNFS_LOCKDEP;
        }
#endif

        /*
         * The ops table is accessible only while holding an active ref,
         * yet some paths need to know which callbacks exist without one.
         * Record their presence in the node flags.
         */
        if (ops->seq_show)
                node->flags |= KERNFS_HAS_SEQ_SHOW;
        if (ops->mmap)
                node->flags |= KERNFS_HAS_MMAP;
        if (ops->release)
                node->flags |= KERNFS_HAS_RELEASE;

        ret = kernfs_add_one(node);
        if (!ret)
                return node;

        /* adding failed: drop the node and report the error */
        kernfs_put(node);
        return ERR_PTR(ret);
}

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/prandom.h
 *
 * Include file for the fast pseudo-random 32-bit
 * generation.
 */
#ifndef _LINUX_PRANDOM_H
#define _LINUX_PRANDOM_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/siphash.h>

/* Out-of-line PRNG entry points; their definitions are not in this header. */
u32 prandom_u32(void);
void prandom_bytes(void *buf, size_t nbytes);
void prandom_seed(u32 seed);
void prandom_reseed_late(void);

/* Per-CPU noise word read and updated by prandom_u32_add_noise(). */
DECLARE_PER_CPU(unsigned long, net_rand_noise);

/*
 * Convenience wrapper: casts each of the four arguments to unsigned long
 * and passes them to prandom_u32_add_noise().
 */
#define PRANDOM_ADD_NOISE(a, b, c, d) \
        prandom_u32_add_noise((unsigned long)(a), (unsigned long)(b), \
                              (unsigned long)(c), (unsigned long)(d))

/*
 * Select the permutation and the two PRND_K* constants according to the
 * width of unsigned long; any width other than 64 or 32 is a build error.
 */
#if BITS_PER_LONG == 64
/*
 * The core SipHash round function.  Each line can be executed in
 * parallel given enough CPU resources.
 */
#define PRND_SIPROUND(v0, v1, v2, v3) SIPHASH_PERMUTATION(v0, v1, v2, v3)

#define PRND_K0 (SIPHASH_CONST_0 ^ SIPHASH_CONST_2)
#define PRND_K1 (SIPHASH_CONST_1 ^ SIPHASH_CONST_3)

#elif BITS_PER_LONG == 32
/*
 * On 32-bit machines, we use HSipHash, a reduced-width version of SipHash.
 * This is weaker, but 32-bit machines are not used for high-traffic
 * applications, so there is less output for an attacker to analyze.
 */
#define PRND_SIPROUND(v0, v1, v2, v3) HSIPHASH_PERMUTATION(v0, v1, v2, v3)
#define PRND_K0 (HSIPHASH_CONST_0 ^ HSIPHASH_CONST_2)
#define PRND_K1 (HSIPHASH_CONST_1 ^ HSIPHASH_CONST_3)

#else
#error Unsupported BITS_PER_LONG
#endif /* BITS_PER_LONG */

/*
 * Mix four caller-supplied words into this CPU's net_rand_noise: @a is
 * XORed with the current noise word, one PRND_SIPROUND() is applied to
 * (a, b, c, d), and the resulting @d is stored back as the new noise word.
 */
static inline void prandom_u32_add_noise(unsigned long a, unsigned long b,
                                         unsigned long c, unsigned long d)
{
        /*
         * This is not used cryptographically; it's just
         * a convenient 4-word hash function. (3 xor, 2 add, 2 rol)
         */
        a ^= raw_cpu_read(net_rand_noise);
        /* the macro operates on the four local copies in place */
        PRND_SIPROUND(a, b, c, d);
        raw_cpu_write(net_rand_noise, d);
}

/* Generator state: four 32-bit words, seeded by prandom_seed_state(). */
struct rnd_state {
        __u32 s1, s2, s3, s4;
};

/* State-based variants operating on a caller-provided struct rnd_state. */
u32 prandom_u32_state(struct rnd_state *state);
void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);

/* Run prandom_seed_full_state() on @pcpu_state through DO_ONCE(). */
#define prandom_init_once(pcpu_state)                        \
        DO_ONCE(prandom_seed_full_state, (pcpu_state))

/**
 * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
 * @ep_ro: right open interval endpoint
 *
 * Scales one prandom_u32() output into [0, ep_ro) by forming the 64-bit
 * product with @ep_ro and keeping the upper 32 bits.  The quality of the
 * result therefore depends on prandom_u32() being well distributed over
 * the whole u32 range.  Handy for picking a random index into an array of
 * ep_ro elements, for example.
 *
 * Returns: pseudo-random number in interval [0, ep_ro)
 */
static inline u32 prandom_u32_max(u32 ep_ro)
{
        u64 product = (u64) prandom_u32() * ep_ro;

        return (u32)(product >> 32);
}

/*
 * Enforce a minimum seed value: an @x below @m is raised by @m, any other
 * @x is returned unchanged.
 */
static inline u32 __seed(u32 x, u32 m)
{
        if (x < m)
                return x + m;

        return x;
}

/**
 * prandom_seed_state - set seed for prandom_u32_state().
 * @state: pointer to state structure to receive the seed.
 * @seed: arbitrary 64-bit value to use as a seed.
 *
 * Folds @seed down to 32 bits, stores it in each of the four state words
 * after applying that word's minimum via __seed(), and feeds the state
 * pointer and the folded seed to PRANDOM_ADD_NOISE().
 */
static inline void prandom_seed_state(struct rnd_state *state, u64 seed)
{
        u32 folded = ((seed >> 32) ^ (seed << 10) ^ seed) & 0xffffffffUL;

        state->s1 = __seed(folded,   2U);
        state->s2 = __seed(folded,   8U);
        state->s3 = __seed(folded,  16U);
        state->s4 = __seed(folded, 128U);

        PRANDOM_ADD_NOISE(state, folded, 0, 0);
}

/*
 * Pseudo random number generator from numerical recipes: one step of a
 * linear congruential generator, with the arithmetic wrapping at 32 bits.
 */
static inline u32 next_pseudo_random32(u32 seed)
{
        u32 scaled = seed * 1664525;

        return scaled + 1013904223;
}

#endif

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_HELPERS_H
#define __VDSO_HELPERS_H

#ifndef __ASSEMBLY__

#include <vdso/datapage.h>

/*
 * Begin a read section on @vd: spin with cpu_relax() while the low bit of
 * vd->seq is set, then issue a read barrier and return the sequence value
 * that was observed.  The result is meant to be handed to
 * vdso_read_retry().
 */
static __always_inline u32 vdso_read_begin(const struct vdso_data *vd)
{
        u32 seq;

        /* presumably an odd count marks an update in progress — confirm */
        while (unlikely((seq = READ_ONCE(vd->seq)) & 1))
                cpu_relax();

        smp_rmb();
        return seq;
}

/*
 * End a read section on @vd: after a read barrier, re-read vd->seq and
 * return non-zero if it differs from @start (the value obtained from
 * vdso_read_begin()), zero otherwise.
 */
static __always_inline u32 vdso_read_retry(const struct vdso_data *vd,
                                           u32 start)
{
        u32 seq;

        smp_rmb();
        seq = READ_ONCE(vd->seq);
        return seq != start;
}

/*
 * Begin an update of @vd: increment the sequence counts of both the
 * CS_HRES_COARSE and CS_RAW entries, then issue a write barrier.
 */
static __always_inline void vdso_write_begin(struct vdso_data *vd)
{
        /*
         * WRITE_ONCE() is required; otherwise the compiler can validly
         * tear updates to vd[x].seq, and the value seen by the reader may
         * be inconsistent.
         */
        WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1);
        WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1);
        smp_wmb();
}

/*
 * End an update of @vd: issue a write barrier, then increment the
 * sequence counts of both the CS_HRES_COARSE and CS_RAW entries again.
 */
static __always_inline void vdso_write_end(struct vdso_data *vd)
{
        smp_wmb();
        /*
         * WRITE_ONCE() is required; otherwise the compiler can validly
         * tear updates to vd[x].seq, and the value seen by the reader may
         * be inconsistent.
         */
        WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1);
        WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1);
}

#endif /* !__ASSEMBLY__ */

#endif /* __VDSO_HELPERS_H */














































































    2 





























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        /* reference count; incremented by mpol_get() */
        atomic_t refcnt;
        unsigned short mode;         /* See MPOL_* above */
        unsigned short flags;        /* See set_mempolicy() MPOL_F_* above */
        /* mode-specific payload */
        union {
                short                  preferred_node; /* preferred */
                nodemask_t         nodes;                /* interleave/bind */
                /* undefined for default */
        } v;
        /* reference nodemask for the policy */
        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);

/* Drop a reference on @pol via __mpol_put(); a NULL @pol is ignored. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Tell whether @pol must be explicitly unreferenced after use.  That is
 * the case only for a non-NULL policy carrying MPOL_F_SHARED.
 * Returns 1 if so, 0 otherwise.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) ? 1 : 0;
}

/* Drop a reference on @pol only when mpol_needs_cond_ref() says so. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);

/*
 * Duplicate @pol through __mpol_dup().  A NULL @pol is passed straight
 * back without calling it.
 */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        if (!pol)
                return pol;

        return __mpol_dup(pol);
}

/* Accessor for the vm_policy member of @vma. */
#define vma_policy(vma) ((vma)->vm_policy)

/* Take a reference on @pol by bumping its refcnt; NULL is ignored. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);

/*
 * Compare two policies.  Identical pointers (including two NULLs) are
 * equal without further work; otherwise the comparison is delegated to
 * __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region.
 * Maintain the policies in a pseudo mm that contains vmas. The vmas
 * carry the policy. As a special twist the pseudo mm is indexed in pages, not
 * bytes, so that we can work with shared memory segments bigger than
 * unsigned long.
 */

/* One node of the shared-policy tree: a [start, end] range and its policy. */
struct sp_node {
        struct rb_node nd;
        unsigned long start, end;
        struct mempolicy *policy;
};

/* Root of a shared-policy tree together with its lock. */
struct shared_policy {
        struct rb_root root;
        rwlock_t lock;
};

/*
 * Out-of-line interfaces available under CONFIG_NUMA.  The #else branch of
 * this header provides inline stubs for part of this set.
 */

/* VMA and shared-policy handling */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *info,
                                struct vm_area_struct *vma,
                                struct mempolicy *new);
void mpol_free_shared_policy(struct shared_policy *p);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            unsigned long idx);

/* policy lookup for a task or a VMA address */
struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr);
bool vma_policy_mof(struct vm_area_struct *vma);

/* initialization and rebinding */
extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

/* node and nodemask selection */
extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
                                const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);

/*
 * Return policy_nodemask() evaluated for @gfp against the memory policy of
 * the current task.
 */
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
        return policy_nodemask(gfp, get_task_policy(current));
}

extern unsigned int mempolicy_slab_node(void);

/* Highest zone recorded so far by check_highest_zone(). */
extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is above the current value.
 * ZONE_MOVABLE never becomes the policy zone.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k <= policy_zone || k == ZONE_MOVABLE)
                return;

        policy_zone = k;
}

/* page migration between two nodemasks */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


/* string parsing is only declared when tmpfs is configured */
#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

/* format @pol into @buffer, which holds @maxlen bytes */
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
extern void mpol_put_task_policy(struct task_struct *);

#else

/*
 * !CONFIG_NUMA: empty types and no-op inline stubs so that callers compile
 * unchanged when NUMA support is configured out.
 */
struct mempolicy {};

/* without NUMA every pair of policies compares equal */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

static inline void mpol_put(struct mempolicy *p)
{
}

static inline void mpol_cond_put(struct mempolicy *pol)
{
}

static inline void mpol_get(struct mempolicy *pol)
{
}

struct shared_policy {};

static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}

/* there are no shared policies to find */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        return NULL;
}

#define vma_policy(vma) NULL

static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        return 0;
}

static inline void numa_policy_init(void)
{
}

static inline void numa_default_policy(void)
{
}

static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/* clears both output pointers and always reports node 0 */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;        /* error */
}
#endif

static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* no policy nodemask exists without NUMA */
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
        return NULL;
}
#endif /* CONFIG_NUMA */
#endif




























    2 





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Queued spinlock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <waiman.long@hpe.com>
 */
#ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <linux/atomic.h>

#ifndef queued_spin_is_locked
/**
 * queued_spin_is_locked - is the spinlock locked?
 * @lock: Pointer to queued spinlock structure
 * Return: non-zero (the raw lock word) if it is locked, 0 otherwise
 *
 * Only defined here when no queued_spin_is_locked macro already exists.
 */
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
        /*
         * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL
         * isn't immediately observable.
         */
        return atomic_read(&lock->val);
}
#endif

/**
 * queued_spin_value_unlocked - is the spinlock structure unlocked?
 * @lock: queued spinlock structure, passed by value
 * Return: 1 if it is unlocked, 0 otherwise
 *
 * N.B. The whole lock word must be zero for the lock to count as
 *      unlocked, so a lock with waiting tasks is treated as locked.  This
 *      keeps the lockref code from stealing the lock and changing things
 *      underneath it, and also allows some optimizations to be applied
 *      without conflict with lockref.
 */
static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
{
        return lock.val.counter == 0;
}

/**
 * queued_spin_is_contended - check if the lock is contended
 * @lock : Pointer to queued spinlock structure
 * Return: non-zero if any bit outside _Q_LOCKED_MASK is set in the lock
 *         word (lock contended), 0 otherwise
 */
static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
{
        int word = atomic_read(&lock->val);

        return word & ~_Q_LOCKED_MASK;
}
/**
 * queued_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 *
 * A single attempt is made: the lock word is read once and, only if it is
 * zero, one acquire-ordered compare-and-exchange to _Q_LOCKED_VAL is tried.
 */
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
        /*
         * Keep the snapshot as int: its address is handed to
         * atomic_try_cmpxchg_acquire() as the expected-value pointer, which
         * is an "int *", so a u32 local would be a pointer type mismatch.
         */
        int val = atomic_read(&lock->val);

        /* Any non-zero lock word means the lock cannot be taken right now. */
        if (unlikely(val))
                return 0;

        return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

/*
 * Out-of-line slow path, defined elsewhere.  queued_spin_lock() calls it
 * with the lock word value that its failed fast-path cmpxchg left in @val.
 */
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);

#ifndef queued_spin_lock
/**
 * queued_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Fast path: one acquire-ordered compare-and-exchange from 0 to
 * _Q_LOCKED_VAL.  If that fails, the lock word value observed by the
 * failed exchange is passed on to queued_spin_lock_slowpath().
 */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
        /*
         * int, not u32: atomic_try_cmpxchg_acquire() takes the expected
         * value through an "int *" and writes the observed value back
         * through it on failure.
         */
        int val = 0;

        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return;

        /* val now holds the lock word seen by the failed exchange. */
        queued_spin_lock_slowpath(lock, val);
}
#endif

#ifndef queued_spin_unlock
/**
 * queued_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 *
 * Stores 0 to the @lock->locked field only.  This definition is skipped
 * when queued_spin_unlock is already defined as a macro.
 */
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
}
#endif

#ifndef virt_spin_lock
/*
 * Default virt_spin_lock(), used when nothing has defined virt_spin_lock
 * before this point.  It does not touch @lock and always returns false.
 * NOTE(review): presumably "false" tells the caller that no
 * virtualization-specific lock path was taken - confirm against the callers.
 */
static __always_inline bool virt_spin_lock(struct qspinlock *lock)
{
        return false;
}
#endif

/*
 * Remapping spinlock architecture specific functions to the corresponding
 * queued spinlock functions.
 *
 * Each macro forwards its argument unchanged.  Note that
 * arch_spin_value_unlocked() ends up in queued_spin_value_unlocked(), which
 * takes the lock structure by value; all the others take a pointer.
 */
#define arch_spin_is_locked(l)                queued_spin_is_locked(l)
#define arch_spin_is_contended(l)        queued_spin_is_contended(l)
#define arch_spin_value_unlocked(l)        queued_spin_value_unlocked(l)
#define arch_spin_lock(l)                queued_spin_lock(l)
#define arch_spin_trylock(l)                queued_spin_trylock(l)
#define arch_spin_unlock(l)                queued_spin_unlock(l)

#endif /* __ASM_GENERIC_QSPINLOCK_H */



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H

#include <linux/if_vlan.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>
#include <uapi/linux/virtio_net.h>

/*
 * virtio_net_hdr_match_proto - check an ethertype against a virtio GSO type
 * @protocol: ethertype in network byte order
 * @gso_type: VIRTIO_NET_HDR_GSO_* value; the ECN flag bit is ignored
 *
 * TCPv4 matches ETH_P_IP only, TCPv6 matches ETH_P_IPV6 only, and UDP
 * matches either of the two.  Every other GSO type matches nothing.
 */
static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
{
        const unsigned int type = gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
        const bool is_ipv4 = protocol == cpu_to_be16(ETH_P_IP);
        const bool is_ipv6 = protocol == cpu_to_be16(ETH_P_IPV6);

        if (type == VIRTIO_NET_HDR_GSO_TCPV4)
                return is_ipv4;

        if (type == VIRTIO_NET_HDR_GSO_TCPV6)
                return is_ipv6;

        if (type == VIRTIO_NET_HDR_GSO_UDP)
                return is_ipv4 || is_ipv6;

        return false;
}

/*
 * virtio_net_hdr_set_proto - derive skb->protocol from the virtio GSO type
 * @skb: packet whose protocol field may be filled in
 * @hdr: virtio-net header supplying gso_type (the ECN flag bit is ignored)
 *
 * A non-zero skb->protocol is left alone.  Otherwise TCPv4 and UDP select
 * ETH_P_IP and TCPv6 selects ETH_P_IPV6.
 *
 * Return: 0 on success (including the already-set case), -EINVAL for any
 * other GSO type.
 */
static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
                                           const struct virtio_net_hdr *hdr)
{
        unsigned int type;

        if (skb->protocol)
                return 0;

        type = hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN;

        if (type == VIRTIO_NET_HDR_GSO_TCPV4 ||
            type == VIRTIO_NET_HDR_GSO_UDP) {
                skb->protocol = cpu_to_be16(ETH_P_IP);
                return 0;
        }

        if (type == VIRTIO_NET_HDR_GSO_TCPV6) {
                skb->protocol = cpu_to_be16(ETH_P_IPV6);
                return 0;
        }

        return -EINVAL;
}

/*
 * virtio_net_hdr_to_skb - validate a virtio_net_hdr and apply it to an skb
 * @skb: packet the header describes
 * @hdr: virtio-net header to interpret
 * @little_endian: byte-order selector handed to __virtio16_to_cpu() when
 *                 decoding the 16-bit header fields
 *
 * Three steps:
 *  1. Translate hdr->gso_type into an SKB_GSO_* type, the expected IP
 *     protocol and the minimum transport header length (thlen).
 *  2. With VIRTIO_NET_HDR_F_NEEDS_CSUM, set up partial checksumming from
 *     csum_start/csum_offset.  Without it, GSO packets have their transport
 *     header located by the flow dissector (or, when skb->network_header is
 *     zero, only thlen bytes are required to be pullable).
 *  3. For GSO headers, record gso_size/gso_type in the skb's shared info if
 *     the packet is larger than one segment.
 *
 * Return: 0 on success, -EINVAL when the header is unsupported or does not
 * fit the packet contents.
 */
static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                                        const struct virtio_net_hdr *hdr,
                                        bool little_endian)
{
        unsigned int gso_type = 0;
        unsigned int thlen = 0;
        unsigned int p_off = 0;
        /* Only assigned on the GSO paths below, and only read when gso_type != 0. */
        unsigned int ip_proto;

        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                        gso_type = SKB_GSO_TCPV4;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_UDP:
                        gso_type = SKB_GSO_UDP;
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
                default:
                        return -EINVAL;
                }

                if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
                        gso_type |= SKB_GSO_TCP_ECN;

                /* A GSO header with a zero segment size is rejected. */
                if (hdr->gso_size == 0)
                        return -EINVAL;
        }

        skb_reset_mac_header(skb);

        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
                u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
                /*
                 * Bytes that must be pullable: from csum_start, whichever is
                 * larger of the transport header (thlen) and the end of the
                 * checksum field (off + sizeof(__sum16)).
                 */
                u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));

                if (!pskb_may_pull(skb, needed))
                        return -EINVAL;

                if (!skb_partial_csum_set(skb, start, off))
                        return -EINVAL;

                /* p_off: offset just past the minimum transport header. */
                p_off = skb_transport_offset(skb) + thlen;
                if (!pskb_may_pull(skb, p_off))
                        return -EINVAL;
        } else {
                /* gso packets without NEEDS_CSUM do not set transport_offset.
                 * probe and drop if does not match one of the above types.
                 */
                if (gso_type && skb->network_header) {
                        struct flow_keys_basic keys;

                        if (!skb->protocol) {
                                __be16 protocol = dev_parse_header_protocol(skb);

                                /*
                                 * No protocol from the device header: fall
                                 * back to the one implied by gso_type.  The
                                 * return value is not checked here; gso_type
                                 * already passed the switch above.
                                 */
                                if (!protocol)
                                        virtio_net_hdr_set_proto(skb, hdr);
                                else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type))
                                        return -EINVAL;
                                else
                                        skb->protocol = protocol;
                        }
retry:
                        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                                              NULL, 0, 0, 0,
                                                              0)) {
                                /* UFO does not specify ipv4 or 6: try both */
                                if (gso_type & SKB_GSO_UDP &&
                                    skb->protocol == htons(ETH_P_IP)) {
                                        skb->protocol = htons(ETH_P_IPV6);
                                        goto retry;
                                }
                                return -EINVAL;
                        }

                        /*
                         * The dissected transport header must be fully
                         * pullable and carry the protocol the GSO type
                         * announced.
                         */
                        p_off = keys.control.thoff + thlen;
                        if (!pskb_may_pull(skb, p_off) ||
                            keys.basic.ip_proto != ip_proto)
                                return -EINVAL;

                        skb_set_transport_header(skb, keys.control.thoff);
                } else if (gso_type) {
                        /* network_header is zero: only require thlen bytes. */
                        p_off = thlen;
                        if (!pskb_may_pull(skb, p_off))
                                return -EINVAL;
                }
        }

        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
                unsigned int nh_off = p_off;
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* UFO may not include transport header in gso_size. */
                if (gso_type & SKB_GSO_UDP)
                        nh_off -= thlen;

                /* Kernel has a special handling for GSO_BY_FRAGS. */
                if (gso_size == GSO_BY_FRAGS)
                        return -EINVAL;

                /* Too small packets are not really GSO ones. */
                if (skb->len - nh_off > gso_size) {
                        shinfo->gso_size = gso_size;
                        shinfo->gso_type = gso_type;

                        /* Header must be checked, and gso_segs computed. */
                        shinfo->gso_type |= SKB_GSO_DODGY;
                        shinfo->gso_segs = 0;
                }
        }

        return 0;
}

/*
 * virtio_net_hdr_from_skb - fill a virtio_net_hdr from an skb's metadata
 * @skb: packet to describe
 * @hdr: header to populate; it is zeroed first
 * @little_endian: byte-order selector handed to __cpu_to_virtio16()
 * @has_data_valid: whether VIRTIO_NET_HDR_F_DATA_VALID may be reported
 * @vlan_hlen: added to the checksum start offset
 *
 * Return: 0 on success, -EINVAL for a GSO skb that is neither TCPv4 nor
 * TCPv6 (hdr_len and gso_size have already been written in that case).
 */
static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
                                          struct virtio_net_hdr *hdr,
                                          bool little_endian,
                                          bool has_data_valid,
                                          int vlan_hlen)
{
        /* Start from all zeroes so no stale bytes leak out (no info leak). */
        memset(hdr, 0, sizeof(*hdr));

        if (!skb_is_gso(skb)) {
                hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
        } else {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* This is a hint as to how much should be linear. */
                hdr->hdr_len = __cpu_to_virtio16(little_endian,
                                                 skb_headlen(skb));
                hdr->gso_size = __cpu_to_virtio16(little_endian,
                                                  shinfo->gso_size);

                /* Only the two TCP flavours can be expressed here. */
                if (!(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
                        return -EINVAL;

                /* TCPv4 takes precedence when both bits are set. */
                if (shinfo->gso_type & SKB_GSO_TCPV4)
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;

                if (shinfo->gso_type & SKB_GSO_TCP_ECN)
                        hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                hdr->csum_start = __cpu_to_virtio16(little_endian,
                        skb_checksum_start_offset(skb) + vlan_hlen);
                hdr->csum_offset = __cpu_to_virtio16(little_endian,
                                skb->csum_offset);
                return 0;
        }

        if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY)
                hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
        /* otherwise the checksum fields stay zero */

        return 0;
}

#endif /* _LINUX_VIRTIO_NET_H */




































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * IEEE802.15.4-2003 specification
 *
 * Copyright (C) 2007, 2008 Siemens AG
 *
 * Written by:
 * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
 * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
 * Maxim Osipov <maxim.osipov@siemens.com>
 * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
 */

#ifndef LINUX_IEEE802154_H
#define LINUX_IEEE802154_H

#include <linux/types.h>
#include <linux/random.h>

/*
 * Frame length constants.  ieee802154_is_valid_psdu_len() accepts a PSDU
 * of exactly IEEE802154_ACK_PSDU_LEN, or of IEEE802154_MIN_PSDU_LEN up to
 * IEEE802154_MTU.
 */
#define IEEE802154_MTU                        127
#define IEEE802154_ACK_PSDU_LEN                5
#define IEEE802154_MIN_PSDU_LEN                9
#define IEEE802154_FCS_LEN                2
#define IEEE802154_MAX_AUTH_TAG_LEN        16
#define IEEE802154_FC_LEN                2
#define IEEE802154_SEQ_LEN                1

/*  General MAC frame format:
 *  2 bytes: Frame Control
 *  1 byte:  Sequence Number
 * 20 bytes: Addressing fields
 * 14 bytes: Auxiliary Security Header
 *
 * The maximum header length is the sum of the above (37); the minimum is
 * the acknowledgment PSDU length without its FCS (5 - 2 = 3).
 */
#define IEEE802154_MAX_HEADER_LEN        (2 + 1 + 20 + 14)
#define IEEE802154_MIN_HEADER_LEN        (IEEE802154_ACK_PSDU_LEN - \
                                         IEEE802154_FCS_LEN)

/* Reserved PAN id and short address values */
#define IEEE802154_PAN_ID_BROADCAST        0xffff
#define IEEE802154_ADDR_SHORT_BROADCAST        0xffff
#define IEEE802154_ADDR_SHORT_UNSPEC        0xfffe

/* Address field sizes, in bytes */
#define IEEE802154_EXTENDED_ADDR_LEN        8
#define IEEE802154_SHORT_ADDR_LEN        2
#define IEEE802154_PAN_ID_LEN                2

/*
 * Interframe spacing parameters.
 * NOTE(review): the periods are presumably expressed in symbols and the
 * frame size in bytes - confirm against the specification.
 */
#define IEEE802154_LIFS_PERIOD                40
#define IEEE802154_SIFS_PERIOD                12
#define IEEE802154_MAX_SIFS_FRAME_SIZE        18

/* Highest channel number and channel page number */
#define IEEE802154_MAX_CHANNEL                26
#define IEEE802154_MAX_PAGE                31

/* Frame type values carried in the frame control field */
#define IEEE802154_FC_TYPE_BEACON        0x0        /* Frame is beacon */
#define IEEE802154_FC_TYPE_DATA                0x1        /* Frame is data */
#define IEEE802154_FC_TYPE_ACK                0x2        /* Frame is acknowledgment */
#define IEEE802154_FC_TYPE_MAC_CMD        0x3        /* Frame is MAC command */

/* The frame type occupies the three lowest bits of the frame control field */
#define IEEE802154_FC_TYPE_SHIFT                0
#define IEEE802154_FC_TYPE_MASK                ((1 << 3) - 1)

/*
 * IEEE802154_FC_TYPE - extract the frame type from frame control value @x.
 * @x is parenthesized so that an expression argument (e.g. "a | b") is
 * masked as a whole rather than being split by operator precedence.
 */
#define IEEE802154_FC_TYPE(x)                \
        (((x) & IEEE802154_FC_TYPE_MASK) >> IEEE802154_FC_TYPE_SHIFT)

/*
 * IEEE802154_FC_SET_TYPE - replace the frame type bits of lvalue @v with @x.
 * All other bits of @v are preserved; bits of @x outside the type mask are
 * dropped.  Note that @v is evaluated twice.
 */
#define IEEE802154_FC_SET_TYPE(v, x)        do {        \
        (v) = (((v) & ~IEEE802154_FC_TYPE_MASK) | \
            (((x) << IEEE802154_FC_TYPE_SHIFT) & IEEE802154_FC_TYPE_MASK)); \
        } while (0)

/* Single-bit flags of the frame control field: bit position and bit mask */
#define IEEE802154_FC_SECEN_SHIFT        3
#define IEEE802154_FC_SECEN                (1 << IEEE802154_FC_SECEN_SHIFT)
#define IEEE802154_FC_FRPEND_SHIFT        4
#define IEEE802154_FC_FRPEND                (1 << IEEE802154_FC_FRPEND_SHIFT)
#define IEEE802154_FC_ACK_REQ_SHIFT        5
#define IEEE802154_FC_ACK_REQ                (1 << IEEE802154_FC_ACK_REQ_SHIFT)
#define IEEE802154_FC_INTRA_PAN_SHIFT        6
#define IEEE802154_FC_INTRA_PAN                (1 << IEEE802154_FC_INTRA_PAN_SHIFT)

/* Two-bit source (bits 14-15) and destination (bits 10-11) addressing modes */
#define IEEE802154_FC_SAMODE_SHIFT        14
#define IEEE802154_FC_SAMODE_MASK        (3 << IEEE802154_FC_SAMODE_SHIFT)
#define IEEE802154_FC_DAMODE_SHIFT        10
#define IEEE802154_FC_DAMODE_MASK        (3 << IEEE802154_FC_DAMODE_SHIFT)

/* Two-bit frame version (bits 12-13) */
#define IEEE802154_FC_VERSION_SHIFT        12
#define IEEE802154_FC_VERSION_MASK        (3 << IEEE802154_FC_VERSION_SHIFT)

/*
 * Field extractors: mask frame control value @x and shift the field down to
 * bit 0.  @x is parenthesized in each of them so that an expression argument
 * is masked as a whole.
 */
#define IEEE802154_FC_VERSION(x)        \
        (((x) & IEEE802154_FC_VERSION_MASK) >> IEEE802154_FC_VERSION_SHIFT)

#define IEEE802154_FC_SAMODE(x)                \
        (((x) & IEEE802154_FC_SAMODE_MASK) >> IEEE802154_FC_SAMODE_SHIFT)

#define IEEE802154_FC_DAMODE(x)                \
        (((x) & IEEE802154_FC_DAMODE_MASK) >> IEEE802154_FC_DAMODE_SHIFT)

/*
 * Security control field (SCF): the security level sits in bits 0-2 and the
 * key identifier mode in bits 3-4.  The extractor macros parenthesize @x so
 * that an expression argument is masked as a whole.
 */
#define IEEE802154_SCF_SECLEVEL_MASK                7
#define IEEE802154_SCF_SECLEVEL_SHIFT                0
#define IEEE802154_SCF_SECLEVEL(x)                ((x) & IEEE802154_SCF_SECLEVEL_MASK)
#define IEEE802154_SCF_KEY_ID_MODE_SHIFT        3
#define IEEE802154_SCF_KEY_ID_MODE_MASK                (3 << IEEE802154_SCF_KEY_ID_MODE_SHIFT)
#define IEEE802154_SCF_KEY_ID_MODE(x)                \
        (((x) & IEEE802154_SCF_KEY_ID_MODE_MASK) >> IEEE802154_SCF_KEY_ID_MODE_SHIFT)

/* Key identifier mode values */
#define IEEE802154_SCF_KEY_IMPLICIT                0
#define IEEE802154_SCF_KEY_INDEX                1
#define IEEE802154_SCF_KEY_SHORT_INDEX                2
#define IEEE802154_SCF_KEY_HW_INDEX                3

/* Security level values */
#define IEEE802154_SCF_SECLEVEL_NONE                0
#define IEEE802154_SCF_SECLEVEL_MIC32                1
#define IEEE802154_SCF_SECLEVEL_MIC64                2
#define IEEE802154_SCF_SECLEVEL_MIC128                3
#define IEEE802154_SCF_SECLEVEL_ENC                4
#define IEEE802154_SCF_SECLEVEL_ENC_MIC32        5
#define IEEE802154_SCF_SECLEVEL_ENC_MIC64        6
#define IEEE802154_SCF_SECLEVEL_ENC_MIC128        7

/* MAC footer size (same value as IEEE802154_FCS_LEN) */
#define IEEE802154_MFR_SIZE        2 /* 2 octets */

/* MAC command frame identifiers; the values run consecutively 0x01-0x09 */
#define IEEE802154_CMD_ASSOCIATION_REQ                0x01
#define IEEE802154_CMD_ASSOCIATION_RESP                0x02
#define IEEE802154_CMD_DISASSOCIATION_NOTIFY        0x03
#define IEEE802154_CMD_DATA_REQ                        0x04
#define IEEE802154_CMD_PANID_CONFLICT_NOTIFY        0x05
#define IEEE802154_CMD_ORPHAN_NOTIFY                0x06
#define IEEE802154_CMD_BEACON_REQ                0x07
#define IEEE802154_CMD_COORD_REALIGN_NOTIFY        0x08
#define IEEE802154_CMD_GTS_REQ                        0x09

/*
 * The return values of MAC operations
 *
 * IEEE802154_SUCCESS is 0; the failure codes listed here run from 0xe0 to
 * 0xf4, plus 0xfc for IEEE802154_SCAN_IN_PROGRESS.
 *
 * Two identifiers are misspelled (IEEE802154_DENINED, IEEE802154_REALIGMENT).
 * The spelling is part of the identifier, so renaming them would change the
 * API; they are kept as they are.
 */
enum {
        /*
         * The requested operation was completed successfully.
         * For a transmission request, this value indicates
         * a successful transmission.
         */
        IEEE802154_SUCCESS = 0x0,

        /* The beacon was lost following a synchronization request. */
        IEEE802154_BEACON_LOSS = 0xe0,
        /*
         * A transmission could not take place due to activity on the
         * channel, i.e., the CSMA-CA mechanism has failed.
         */
        IEEE802154_CHNL_ACCESS_FAIL = 0xe1,
        /* The GTS request has been denied by the PAN coordinator. */
        IEEE802154_DENINED = 0xe2,
        /* The attempt to disable the transceiver has failed. */
        IEEE802154_DISABLE_TRX_FAIL = 0xe3,
        /*
         * The received frame induces a failed security check according to
         * the security suite.
         */
        IEEE802154_FAILED_SECURITY_CHECK = 0xe4,
        /*
         * The frame resulting from secure processing has a length that is
         * greater than aMACMaxFrameSize.
         */
        IEEE802154_FRAME_TOO_LONG = 0xe5,
        /*
         * The requested GTS transmission failed because the specified GTS
         * either did not have a transmit GTS direction or was not defined.
         */
        IEEE802154_INVALID_GTS = 0xe6,
        /*
         * A request to purge an MSDU from the transaction queue was made using
         * an MSDU handle that was not found in the transaction table.
         */
        IEEE802154_INVALID_HANDLE = 0xe7,
        /* A parameter in the primitive is out of the valid range. */
        IEEE802154_INVALID_PARAMETER = 0xe8,
        /* No acknowledgment was received after aMaxFrameRetries. */
        IEEE802154_NO_ACK = 0xe9,
        /* A scan operation failed to find any network beacons. */
        IEEE802154_NO_BEACON = 0xea,
        /* No response data were available following a request. */
        IEEE802154_NO_DATA = 0xeb,
        /* The operation failed because a short address was not allocated. */
        IEEE802154_NO_SHORT_ADDRESS = 0xec,
        /*
         * A receiver enable request was unsuccessful because it could not be
         * completed within the CAP.
         */
        IEEE802154_OUT_OF_CAP = 0xed,
        /*
         * A PAN identifier conflict has been detected and communicated to the
         * PAN coordinator.
         */
        IEEE802154_PANID_CONFLICT = 0xee,
        /* A coordinator realignment command has been received. */
        IEEE802154_REALIGMENT = 0xef,
        /* The transaction has expired and its information discarded. */
        IEEE802154_TRANSACTION_EXPIRED = 0xf0,
        /* There is no capacity to store the transaction. */
        IEEE802154_TRANSACTION_OVERFLOW = 0xf1,
        /*
         * The transceiver was in the transmitter enabled state when the
         * receiver was requested to be enabled.
         */
        IEEE802154_TX_ACTIVE = 0xf2,
        /* The appropriate key is not available in the ACL. */
        IEEE802154_UNAVAILABLE_KEY = 0xf3,
        /*
         * A SET/GET request was issued with the identifier of a PIB attribute
         * that is not supported.
         */
        IEEE802154_UNSUPPORTED_ATTR = 0xf4,
        /*
         * A request to perform a scan operation failed because the MLME was
         * in the process of performing a previously initiated scan operation.
         */
        IEEE802154_SCAN_IN_PROGRESS = 0xfc,
};

/*
 * Frame control handling.
 *
 * CPU-order masks for the 16-bit frame control field; the ieee802154_is_*()
 * and ieee802154_*addr_mode() helpers convert them with cpu_to_le16() before
 * applying them to a little-endian frame control value.
 *
 * These masks must agree with the IEEE802154_FC_* definitions earlier in
 * this header: the frame type is the 3-bit field in bits 0-2
 * (IEEE802154_FC_TYPE_MASK) and the security enabled flag is bit 3
 * (IEEE802154_FC_SECEN).  A 2-bit type mask would alias frame type 0x5 to
 * data (0x1), and 0x0004 would test a frame type bit instead of the
 * security flag.
 */
#define IEEE802154_FCTL_FTYPE                0x0007
#define IEEE802154_FCTL_ACKREQ                0x0020
#define IEEE802154_FCTL_SECEN                0x0008
#define IEEE802154_FCTL_INTRA_PAN        0x0040
#define IEEE802154_FCTL_DADDR                0x0c00
#define IEEE802154_FCTL_SADDR                0xc000

/* Frame type value matched by ieee802154_is_data() */
#define IEEE802154_FTYPE_DATA                0x0001

/* Addressing mode values within the DADDR/SADDR masks */
#define IEEE802154_FCTL_ADDR_NONE        0x0000
#define IEEE802154_FCTL_DADDR_SHORT        0x0800
#define IEEE802154_FCTL_DADDR_EXTENDED        0x0c00
#define IEEE802154_FCTL_SADDR_SHORT        0x8000
#define IEEE802154_FCTL_SADDR_EXTENDED        0xc000

/**
 * ieee802154_is_data - check if the frame type is IEEE802154_FTYPE_DATA
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: non-zero when the bits selected by IEEE802154_FCTL_FTYPE equal
 * IEEE802154_FTYPE_DATA, 0 otherwise.
 */
static inline int ieee802154_is_data(__le16 fc)
{
        const __le16 ftype = fc & cpu_to_le16(IEEE802154_FCTL_FTYPE);

        return ftype == cpu_to_le16(IEEE802154_FTYPE_DATA);
}

/**
 * ieee802154_is_secen - test the Security bit of the frame control field
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: true if any bit of IEEE802154_FCTL_SECEN is set in @fc.
 */
static inline bool ieee802154_is_secen(__le16 fc)
{
        const __le16 secen = cpu_to_le16(IEEE802154_FCTL_SECEN);

        return (fc & secen) != 0;
}

/**
 * ieee802154_is_ackreq - test the acknowledgment request bit
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: true if any bit of IEEE802154_FCTL_ACKREQ is set in @fc.
 */
static inline bool ieee802154_is_ackreq(__le16 fc)
{
        const __le16 ackreq = cpu_to_le16(IEEE802154_FCTL_ACKREQ);

        return (fc & ackreq) != 0;
}

/**
 * ieee802154_is_intra_pan - test the intra PAN id communication bit
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: true if any bit of IEEE802154_FCTL_INTRA_PAN is set in @fc.
 */
static inline bool ieee802154_is_intra_pan(__le16 fc)
{
        const __le16 intra_pan = cpu_to_le16(IEEE802154_FCTL_INTRA_PAN);

        return (fc & intra_pan) != 0;
}

/**
 * ieee802154_daddr_mode - get the destination addressing mode bits from fc
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: @fc masked with IEEE802154_FCTL_DADDR; the result stays in
 * little-endian byteorder and is not shifted down.
 */
static inline __le16 ieee802154_daddr_mode(__le16 fc)
{
        const __le16 daddr_mask = cpu_to_le16(IEEE802154_FCTL_DADDR);

        return fc & daddr_mask;
}

/**
 * ieee802154_saddr_mode - get the source addressing mode bits from fc
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: @fc masked with IEEE802154_FCTL_SADDR; the result stays in
 * little-endian byteorder and is not shifted down.
 */
static inline __le16 ieee802154_saddr_mode(__le16 fc)
{
        const __le16 saddr_mask = cpu_to_le16(IEEE802154_FCTL_SADDR);

        return fc & saddr_mask;
}

/**
 * ieee802154_is_valid_psdu_len - check if psdu len is valid
 * @len: psdu len with (MHR + payload + MFR)
 *
 * Length classes:
 *        0-4        Reserved
 *        5        MPDU (Acknowledgment)
 *        6-8        Reserved
 *        9-127        MPDU
 *
 * Return: true for the acknowledgment length and for the 9-127 range,
 * false for every reserved length and for anything above 127.
 */
static inline bool ieee802154_is_valid_psdu_len(u8 len)
{
        /* The acknowledgment frame is the only valid short length. */
        if (len == IEEE802154_ACK_PSDU_LEN)
                return true;

        return len >= IEEE802154_MIN_PSDU_LEN && len <= IEEE802154_MTU;
}

/**
 * ieee802154_is_valid_extended_unicast_addr - check if extended addr is valid
 * @addr: extended addr to check
 *
 * Return: false for the all-zero address and for any address with the
 * group address bit set, true otherwise.
 */
static inline bool ieee802154_is_valid_extended_unicast_addr(__le64 addr)
{
        /* The all-zero address is never valid. */
        if (addr == cpu_to_le64(0x0000000000000000ULL))
                return false;

        /* Bail out if the group address bit is set. */
        if (addr & cpu_to_le64(0x0100000000000000ULL))
                return false;

        return true;
}

/**
 * ieee802154_is_broadcast_short_addr - check if short addr is broadcast
 * @addr: short addr to check
 *
 * Return: true if @addr equals IEEE802154_ADDR_SHORT_BROADCAST.
 */
static inline bool ieee802154_is_broadcast_short_addr(__le16 addr)
{
        const __le16 broadcast = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST);

        return addr == broadcast;
}

/**
 * ieee802154_is_unspec_short_addr - check if short addr is unspecified
 * @addr: short addr to check
 *
 * Return: true if @addr equals IEEE802154_ADDR_SHORT_UNSPEC.
 */
static inline bool ieee802154_is_unspec_short_addr(__le16 addr)
{
        const __le16 unspec = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);

        return addr == unspec;
}

/**
 * ieee802154_is_valid_src_short_addr - check if source short address is valid
 * @addr: short addr to check
 *
 * Return: true unless @addr is the broadcast or the unspecified short
 * address.
 */
static inline bool ieee802154_is_valid_src_short_addr(__le16 addr)
{
        if (ieee802154_is_broadcast_short_addr(addr))
                return false;

        return !ieee802154_is_unspec_short_addr(addr);
}

/**
 * ieee802154_random_extended_addr - generates a random extended address
 * @addr: extended addr pointer to place the random address
 *
 * All IEEE802154_EXTENDED_ADDR_LEN bytes are filled with random data, then
 * the last byte in memory has bit 0x01 cleared and bit 0x02 set.
 */
static inline void ieee802154_random_extended_addr(__le64 *addr)
{
        u8 *last = (u8 *)addr + (IEEE802154_EXTENDED_ADDR_LEN - 1);

        get_random_bytes(addr, IEEE802154_EXTENDED_ADDR_LEN);

        /* clear the group bit, and set the locally administered bit */
        *last = (*last & ~0x01) | 0x02;
}

#endif /* LINUX_IEEE802154_H */






















































































































































    1 




    1 











    1 




    1 





































































































    1 






    1 
    1 


    1 


    1 












    1 


    1 




    1 

























































































































































































































































































































































































































    1 






    1 







    1 










    1 



















    1 






    1 






    1 





    1 
    1 

    1 

    1 




    1 


















    1 




















    1 

    1 




    1 



    1 







































    1 


    1 







































    1 




    1 









    1 
























































































    1 

    1 




    1 

    1 
























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
// SPDX-License-Identifier: GPL-2.0
/*
 * Basic worker thread pool for io_uring
 *
 * Copyright (C) 2019 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
#include <uapi/linux/io_uring.h>

#include "io-wq.h"

/* Timeout passed to schedule_timeout() when a worker has no work to run */
#define WORKER_IDLE_TIMEOUT        (5 * HZ)
/* worker->init_retries is compared against this before retrying thread creation */
#define WORKER_INIT_LIMIT        3

/* Bit masks stored in io_worker->flags */
enum {
        IO_WORKER_F_UP                = 1 << 0,        /* up and active */
        IO_WORKER_F_RUNNING        = 1 << 1,        /* account as running */
        IO_WORKER_F_FREE        = 1 << 2,        /* worker on free list */
        IO_WORKER_F_BOUND        = 1 << 3,        /* is doing bounded work */
};

/* Bit numbers (not masks) for io_wq->state, used with test_bit() */
enum {
        IO_WQ_BIT_EXIT,                        /* wq exiting */
};

/* Bit numbers (not masks) for io_wqe_acct->flags */
enum {
        IO_ACCT_STALLED_BIT,                /* stalled on hash */
};

/*
 * One for each thread in a wqe pool
 */
struct io_worker {
        /* taken with io_worker_get(), dropped with io_worker_release() */
        refcount_t ref;
        /* IO_WORKER_F_* bits */
        unsigned flags;
        /* link in wqe->free_list while IO_WORKER_F_FREE is set */
        struct hlist_nulls_node nulls_node;
        /* link in wqe->all_list */
        struct list_head all_list;
        /* thread backing this worker, assigned in io_init_new_worker() */
        struct task_struct *task;
        /* per-node pool this worker belongs to */
        struct io_wqe *wqe;

        /* work item currently assigned to the worker; written under ->lock */
        struct io_wq_work *cur_work;
        spinlock_t lock;

        /* completed when ->ref drops to zero; io_worker_exit() waits on it */
        struct completion ref_done;

        /* bit 0 guards ownership of create_work/create_index */
        unsigned long create_state;
        /* task_work item used to request worker creation from wq->task */
        struct callback_head create_work;
        /* acct index the queued creation request is for */
        int create_index;
        /* incremented on each thread creation retry check, see WORKER_INIT_LIMIT */
        int init_retries;

        union {
                /* used by kfree_rcu() in io_worker_exit() */
                struct rcu_head rcu;
                /* used with schedule_work() to retry thread creation */
                struct work_struct work;
        };
};

/*
 * Hash order is 6 when BITS_PER_LONG is 64 and 5 otherwise; the bucket
 * count is 2^order.
 */
#if BITS_PER_LONG != 64
#define IO_WQ_HASH_ORDER        5
#else
#define IO_WQ_HASH_ORDER        6
#endif

#define IO_WQ_NR_HASH_BUCKETS        (1u << IO_WQ_HASH_ORDER)

/*
 * Accounting for one class of workers (see IO_WQ_ACCT_*) within a wqe.
 */
struct io_wqe_acct {
        /* workers accounted to this class; modified under wqe->lock */
        unsigned nr_workers;
        /* nr_workers is not raised past this value */
        unsigned max_workers;
        /* index of this entry, passed on to create_io_worker() */
        int index;
        /* workers currently counted as running */
        atomic_t nr_running;
        /* pending work for this class */
        struct io_wq_work_list work_list;
        /* IO_ACCT_*_BIT bits */
        unsigned long flags;
};

/* Indices into io_wqe->acct[]; IO_WQ_ACCT_NR is the number of classes */
enum {
        IO_WQ_ACCT_BOUND        = 0,
        IO_WQ_ACCT_UNBOUND        = 1,
        IO_WQ_ACCT_NR                = 2,
};

/*
 * Per-node worker thread pool
 */
struct io_wqe {
        /* protects the acct work lists and nr_workers, free_list and all_list */
        raw_spinlock_t lock;
        /* indexed by IO_WQ_ACCT_*; NOTE(review): 2 equals IO_WQ_ACCT_NR */
        struct io_wqe_acct acct[2];

        /* node passed to kzalloc_node() and create_io_thread() for workers */
        int node;

        /* workers with IO_WORKER_F_FREE set */
        struct hlist_nulls_head free_list;
        /* every worker of this wqe */
        struct list_head all_list;

        /* entry queued on wq->hash->wait while stalled on hashed work */
        struct wait_queue_entry wait;

        /* owning io_wq */
        struct io_wq *wq;
        /* last queued work item for each hash value, NULL if none */
        struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];

        /* CPU mask applied to new workers via set_cpus_allowed_ptr() */
        cpumask_var_t cpu_mask;
};

/*
 * Per io_wq state
 */
struct io_wq {
        /* IO_WQ_BIT_* bits */
        unsigned long state;

        /* called after do_work(); its return value is treated as linked work */
        free_work_fn *free_work;
        /* executes one work item */
        io_wq_work_fn *do_work;

        /* hash map and wait queue used to serialize hashed work */
        struct io_wq_hash *hash;

        /* worker_done is completed when worker_refs drops to zero */
        atomic_t worker_refs;
        struct completion worker_done;

        /* NOTE(review): presumably the CPU hotplug instance link; not used in this part of the file */
        struct hlist_node cpuhp_node;

        /* task that worker creation task_work is queued to; its pid names the workers */
        struct task_struct *task;

        /* per-node pools, indexed by NUMA node id */
        struct io_wqe *wqes[];
};

/* NOTE(review): presumably the CPU hotplug state for io-wq; not referenced in this part of the file */
static enum cpuhp_state io_wq_online;

/*
 * Describes a cancellation request and collects its results.
 */
struct io_cb_cancel_data {
        /* predicate deciding whether a work item matches */
        work_cancel_fn *fn;
        /* opaque argument handed to ->fn */
        void *data;
        /* number of currently running work items that matched */
        int nr_running;
        /* number of pending work items that were cancelled */
        int nr_pending;
        /* when false, the worker walk stops after the first running match */
        bool cancel_all;
};

/* Forward declarations for helpers that are used before their definition */
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);

static bool io_worker_get(struct io_worker *worker)
{
        return refcount_inc_not_zero(&worker->ref);
}

/*
 * Drop a reference on @worker; the final put completes worker->ref_done.
 */
static void io_worker_release(struct io_worker *worker)
{
        if (!refcount_dec_and_test(&worker->ref))
                return;

        complete(&worker->ref_done);
}

/* Return the bound or unbound accounting entry of @wqe */
static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
{
        if (bound)
                return &wqe->acct[IO_WQ_ACCT_BOUND];

        return &wqe->acct[IO_WQ_ACCT_UNBOUND];
}

static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
                                                   struct io_wq_work *work)
{
        return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
}

static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
        return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
}

/*
 * Drop one wq->worker_refs reference; the final put completes
 * wq->worker_done.
 */
static void io_worker_ref_put(struct io_wq *wq)
{
        if (!atomic_dec_and_test(&wq->worker_refs))
                return;

        complete(&wq->worker_done);
}

/*
 * Report whether the io_wq owning the current worker thread is exiting.
 * Warns once and returns true when called from a non-worker context.
 */
bool io_wq_worker_stopped(void)
{
        struct io_worker *worker;

        if (WARN_ON_ONCE(!io_wq_current_is_worker()))
                return true;

        worker = current->pf_io_worker;
        return test_bit(IO_WQ_BIT_EXIT, &worker->wqe->wq->state);
}

/*
 * Undo the accounting held by a worker-creation task_work that was
 * cancelled before it ran: one nr_running count, one nr_workers count, one
 * wq worker reference, the create_state bit and the worker reference.
 */
static void io_worker_cancel_cb(struct io_worker *worker)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        atomic_dec(&acct->nr_running);

        raw_spin_lock(&wqe->lock);
        acct->nr_workers--;
        raw_spin_unlock(&wqe->lock);

        io_worker_ref_put(wqe->wq);
        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * task_work match callback: true if @cb is a create_worker_cb request
 * embedded in the worker passed as @data.
 */
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
        return cb->func == create_worker_cb &&
               container_of(cb, struct io_worker, create_work) == data;
}

/*
 * Tear down @worker. Ends in do_exit(), so it does not return.
 *
 * Pending create_worker_cb task_work items targeting this worker are
 * cancelled first. The worker then drops its own reference and waits for
 * all other references to go away before unlinking itself from the wqe
 * lists and freeing itself via RCU.
 */
static void io_worker_exit(struct io_worker *worker)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;

        /* each cancelled request still holds accounting that must be undone */
        while (task_work_cancel_match(wq->task, io_task_worker_match, worker))
                io_worker_cancel_cb(worker);

        if (refcount_dec_and_test(&worker->ref))
                complete(&worker->ref_done);
        wait_for_completion(&worker->ref_done);

        raw_spin_lock(&wqe->lock);
        if (worker->flags & IO_WORKER_F_FREE)
                hlist_nulls_del_rcu(&worker->nulls_node);
        list_del_rcu(&worker->all_list);

        preempt_disable();
        io_wqe_dec_running(worker);
        worker->flags = 0;
        current->flags &= ~PF_IO_WORKER;
        preempt_enable();

        raw_spin_unlock(&wqe->lock);

        kfree_rcu(worker, rcu);
        io_worker_ref_put(wq);
        do_exit(0);
}

/*
 * True if @acct has pending work and is not marked as stalled on hashed
 * work.
 */
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
        return !wq_list_empty(&acct->work_list) &&
               !test_bit(IO_ACCT_STALLED_BIT, &acct->flags);
}

/*
 * Walk the free list looking for an idle worker of class @acct and wake it.
 * Returns true if a worker was woken; if false, the caller has to create
 * one.
 */
static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct)
        __must_hold(RCU)
{
        struct hlist_nulls_node *n;
        struct io_worker *worker;

        hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
                bool woken = false;

                /* worker is on its way out, try the next entry */
                if (!io_worker_get(worker))
                        continue;

                /* only workers of the requested class are candidates */
                if (io_wqe_get_acct(worker) == acct)
                        woken = wake_up_process(worker->task);

                io_worker_release(worker);
                if (woken)
                        return true;
        }

        return false;
}

/*
 * Create a new worker for @acct unless the class is already at its
 * max_workers limit. Returns true if the limit was reached (nothing to do)
 * or otherwise whatever create_io_worker() returns.
 */
static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
        bool at_limit;

        /*
         * Most likely an attempt to queue unbounded work on an io_wq that
         * wasn't setup with any unbounded workers.
         */
        if (unlikely(!acct->max_workers))
                pr_warn_once("io-wq is not configured for unbound workers");

        raw_spin_lock(&wqe->lock);
        at_limit = acct->nr_workers >= acct->max_workers;
        if (!at_limit)
                acct->nr_workers++;
        raw_spin_unlock(&wqe->lock);

        if (at_limit)
                return true;

        /* these are dropped again by create_io_worker() on failure */
        atomic_inc(&acct->nr_running);
        atomic_inc(&wqe->wq->worker_refs);
        return create_io_worker(wqe->wq, wqe, acct->index);
}

static void io_wqe_inc_running(struct io_worker *worker)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        atomic_inc(&acct->nr_running);
}

/*
 * task_work callback queued by io_queue_worker_create(): create a worker
 * for the class recorded in worker->create_index if there is still room,
 * otherwise give back the nr_running count and wq reference held for the
 * request. Either way, release the create slot and the worker reference.
 */
static void create_worker_cb(struct callback_head *cb)
{
        struct io_worker *worker = container_of(cb, struct io_worker, create_work);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;
        struct io_wqe_acct *acct = &wqe->acct[worker->create_index];
        bool have_room;

        raw_spin_lock(&wqe->lock);
        have_room = acct->nr_workers < acct->max_workers;
        if (have_room)
                acct->nr_workers++;
        raw_spin_unlock(&wqe->lock);

        if (!have_room) {
                atomic_dec(&acct->nr_running);
                io_worker_ref_put(wq);
        } else {
                create_io_worker(wq, wqe, worker->create_index);
        }

        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * Queue a task_work item on wq->task asking it to run @func for @worker.
 *
 * @worker: worker whose create_work/create_index slot carries the request
 * @acct:   accounting class the request is for
 * @func:   task_work callback to run
 *
 * Returns true if the task_work was added. Every failure path (wq exiting,
 * worker refcount already zero, create slot already taken, task_work_add()
 * failing) drops one acct->nr_running count and one wq->worker_refs
 * reference before returning false; io_wqe_dec_running() takes both right
 * before calling this.
 */
static bool io_queue_worker_create(struct io_worker *worker,
                                   struct io_wqe_acct *acct,
                                   task_work_func_t func)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;

        /* raced with exit, just ignore create call */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                goto fail;
        /* this reference is dropped by the callback or on the failure paths */
        if (!io_worker_get(worker))
                goto fail;
        /*
         * create_state manages ownership of create_work/index. We should
         * only need one entry per worker, as the worker going to sleep
         * will trigger the condition, and waking will clear it once it
         * runs the task_work.
         */
        if (test_bit(0, &worker->create_state) ||
            test_and_set_bit_lock(0, &worker->create_state))
                goto fail_release;

        /* extra wq reference held only across the task_work_add() window */
        atomic_inc(&wq->worker_refs);
        init_task_work(&worker->create_work, func);
        worker->create_index = acct->index;
        if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
                /*
                 * EXIT may have been set after checking it above, check after
                 * adding the task_work and remove any creation item if it is
                 * now set. wq exit does that too, but we can have added this
                 * work item after we canceled in io_wq_exit_workers().
                 */
                if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                        io_wq_cancel_tw_create(wq);
                io_worker_ref_put(wq);
                return true;
        }
        /* task_work_add() failed: drop the window reference and the slot */
        io_worker_ref_put(wq);
        clear_bit_unlock(0, &worker->create_state);
fail_release:
        io_worker_release(worker);
fail:
        atomic_dec(&acct->nr_running);
        io_worker_ref_put(wq);
        return false;
}

/*
 * Account @worker as no longer running. If it was the last running worker
 * of its class and runnable work is pending, request creation of another
 * worker. wqe->lock is dropped and re-taken around that request.
 */
static void io_wqe_dec_running(struct io_worker *worker)
        __must_hold(wqe->lock)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;

        if (!(worker->flags & IO_WORKER_F_UP))
                return;
        if (!atomic_dec_and_test(&acct->nr_running))
                return;
        if (!io_acct_run_queue(acct))
                return;

        /* counts held for the creation request */
        atomic_inc(&acct->nr_running);
        atomic_inc(&wqe->wq->worker_refs);

        raw_spin_unlock(&wqe->lock);
        io_queue_worker_create(worker, acct, create_worker_cb);
        raw_spin_lock(&wqe->lock);
}

/*
 * Worker is about to process work: if it is currently marked free, clear
 * IO_WORKER_F_FREE and take it off the free list. @wqe and @work are not
 * used.
 */
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
                             struct io_wq_work *work)
        __must_hold(wqe->lock)
{
        if (!(worker->flags & IO_WORKER_F_FREE))
                return;

        worker->flags &= ~IO_WORKER_F_FREE;
        hlist_nulls_del_init_rcu(&worker->nulls_node);
}

/*
 * No work for this worker: unless it is already marked free, set
 * IO_WORKER_F_FREE and add it to the head of the wqe free list.
 */
static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
        __must_hold(wqe->lock)
{
        if (worker->flags & IO_WORKER_F_FREE)
                return;

        worker->flags |= IO_WORKER_F_FREE;
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}

/*
 * Extract the hash value stored in the bits of work->flags above
 * IO_WQ_HASH_SHIFT.
 */
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
        unsigned int hash = work->flags >> IO_WQ_HASH_SHIFT;

        return hash;
}

/*
 * Queue @wqe on the hash wait queue unless it is already queued. If bit
 * @hash turns out to be clear right after queueing, dequeue again, set the
 * task state back to running and return true so the caller can retry.
 * Returns false in every other case.
 */
static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
        struct io_wq *wq = wqe->wq;
        bool bit_is_free = false;

        spin_lock_irq(&wq->hash->wait.lock);

        /* already waiting */
        if (!list_empty(&wqe->wait.entry))
                goto out_unlock;

        __add_wait_queue(&wq->hash->wait, &wqe->wait);
        if (test_bit(hash, &wq->hash->map))
                goto out_unlock;

        __set_current_state(TASK_RUNNING);
        list_del_init(&wqe->wait.entry);
        bit_is_free = true;

out_unlock:
        spin_unlock_irq(&wq->hash->wait.lock);
        return bit_is_free;
}

/*
 * Pick the next runnable work item from @acct's pending list.
 *
 * Unhashed work is removed and returned directly. For hashed work the hash
 * bit in wq->hash->map must be acquired first; on success the whole run of
 * items sharing that hash is cut out of the list and its first item is
 * returned. If the bit is taken, the walk skips ahead to the run's tail.
 *
 * If nothing could be returned and at least one hash was busy, the acct is
 * marked stalled and the wqe is queued on the hash wait queue. wqe->lock is
 * dropped and re-taken around that step.
 *
 * Returns the work item to run, or NULL if none is runnable.
 */
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
                                           struct io_worker *worker)
        __must_hold(wqe->lock)
{
        struct io_wq_work_node *node, *prev;
        struct io_wq_work *work, *tail;
        /* first busy hash seen during the walk, -1U if none */
        unsigned int stall_hash = -1U;
        struct io_wqe *wqe = worker->wqe;

        wq_list_for_each(node, prev, &acct->work_list) {
                unsigned int hash;

                work = container_of(node, struct io_wq_work, list);

                /* not hashed, can run anytime */
                if (!io_wq_is_hashed(work)) {
                        wq_list_del(&acct->work_list, node, prev);
                        return work;
                }

                hash = io_get_work_hash(work);
                /* all items with this hash lie in [work, tail] */
                tail = wqe->hash_tail[hash];

                /* hashed, can run if not already running */
                if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
                        wqe->hash_tail[hash] = NULL;
                        wq_list_cut(&acct->work_list, &tail->list, prev);
                        return work;
                }
                if (stall_hash == -1U)
                        stall_hash = hash;
                /* fast forward to a next hash, for-each will fix up @prev */
                node = &tail->list;
        }

        if (stall_hash != -1U) {
                bool unstalled;

                /*
                 * Set this before dropping the lock to avoid racing with new
                 * work being added and clearing the stalled bit.
                 */
                set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                raw_spin_unlock(&wqe->lock);
                unstalled = io_wait_on_hash(wqe, stall_hash);
                raw_spin_lock(&wqe->lock);
                /* the hash bit was already free again: undo the stall */
                if (unstalled) {
                        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                        if (wq_has_sleeper(&wqe->wq->hash->wait))
                                wake_up(&wqe->wq->hash->wait);
                }
        }

        return NULL;
}

/*
 * If TIF_NOTIFY_SIGNAL is set or task_work is pending for the current
 * task, set the task state to running, call tracehook_notify_signal() and
 * return true. Otherwise return false.
 */
static bool io_flush_signals(void)
{
        if (!test_thread_flag(TIF_NOTIFY_SIGNAL) && !current->task_works)
                return false;

        __set_current_state(TASK_RUNNING);
        tracehook_notify_signal();
        return true;
}

static void io_assign_current_work(struct io_worker *worker,
                                   struct io_wq_work *work)
{
        if (work) {
                io_flush_signals();
                cond_resched();
        }

        spin_lock(&worker->lock);
        worker->cur_work = work;
        spin_unlock(&worker->lock);
}

/* defined further down; io_worker_handle_work() uses it to requeue linked work */
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);

/*
 * Run pending work for @worker's accounting class until none is runnable.
 *
 * Called with wqe->lock held, returns with it released. Each item obtained
 * from io_get_next_work() is run through wq->do_work() followed by
 * wq->free_work(). Work chained behind a hashed item and unhashed linked
 * work are run back to back; hashed linked work is requeued.
 */
static void io_worker_handle_work(struct io_worker *worker)
        __releases(wqe->lock)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;

        do {
                /* sampled once per outer iteration */
                bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
                struct io_wq_work *work;
get_next:
                /*
                 * If we got some work, mark us as busy. If we didn't, but
                 * the list isn't empty, it means we stalled on hashed work.
                 * Mark us stalled so we don't keep looking for work when we
                 * can't make progress, any work completion or insertion will
                 * clear the stalled flag.
                 */
                work = io_get_next_work(acct, worker);
                if (work)
                        __io_worker_busy(wqe, worker, work);

                raw_spin_unlock(&wqe->lock);
                if (!work)
                        break;
                io_assign_current_work(worker, work);
                __set_current_state(TASK_RUNNING);

                /* handle a whole dependent link */
                do {
                        struct io_wq_work *next_hashed, *linked;
                        /*
                         * Only hashed work holds a bit in the hash map. Use
                         * -1U for unhashed work so the release path below is
                         * skipped instead of clearing a bit that this work
                         * never acquired.
                         */
                        unsigned int hash = io_wq_is_hashed(work) ?
                                                io_get_work_hash(work) : -1U;

                        next_hashed = wq_next_work(work);

                        /* wq is exiting: unbound work is flagged for cancel */
                        if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
                                work->flags |= IO_WQ_WORK_CANCEL;
                        wq->do_work(work);
                        io_assign_current_work(worker, NULL);

                        linked = wq->free_work(work);
                        work = next_hashed;
                        /* run unhashed linked work directly, requeue the rest */
                        if (!work && linked && !io_wq_is_hashed(linked)) {
                                work = linked;
                                linked = NULL;
                        }
                        io_assign_current_work(worker, work);
                        if (linked)
                                io_wqe_enqueue(wqe, linked);

                        /* last item of a hashed run: release the hash */
                        if (hash != -1U && !next_hashed) {
                                /* serialize hash clear with wake_up() */
                                spin_lock_irq(&wq->hash->wait.lock);
                                clear_bit(hash, &wq->hash->map);
                                clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                                spin_unlock_irq(&wq->hash->wait.lock);
                                if (wq_has_sleeper(&wq->hash->wait))
                                        wake_up(&wq->hash->wait);
                                raw_spin_lock(&wqe->lock);
                                /* skip unnecessary unlock-lock wqe->lock */
                                if (!work)
                                        goto get_next;
                                raw_spin_unlock(&wqe->lock);
                        }
                } while (work);

                raw_spin_lock(&wqe->lock);
        } while (1);
}

/*
 * Thread function of an io-wq worker. @data is the struct io_worker.
 *
 * Loops until the wq is exiting: run pending work if there is any,
 * otherwise go idle and sleep for up to WORKER_IDLE_TIMEOUT. A worker whose
 * previous sleep timed out exits if it is not the last worker of its
 * class. The function ends in io_worker_exit(), which does not return.
 */
static int io_wqe_worker(void *data)
{
        struct io_worker *worker = data;
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;
        /* true if the previous sleep ran for the full idle timeout */
        bool last_timeout = false;
        char buf[TASK_COMM_LEN];

        worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);

        /* name the thread after the pid of the task owning the wq */
        snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
        set_task_comm(current, buf);

        while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                long ret;

                /* task state is set before the queue is checked under the lock */
                set_current_state(TASK_INTERRUPTIBLE);
loop:
                raw_spin_lock(&wqe->lock);
                if (io_acct_run_queue(acct)) {
                        /* returns with wqe->lock released */
                        io_worker_handle_work(worker);
                        goto loop;
                }
                /* timed out, exit unless we're the last worker */
                if (last_timeout && acct->nr_workers > 1) {
                        acct->nr_workers--;
                        raw_spin_unlock(&wqe->lock);
                        __set_current_state(TASK_RUNNING);
                        break;
                }
                last_timeout = false;
                __io_worker_idle(wqe, worker);
                raw_spin_unlock(&wqe->lock);
                if (io_flush_signals())
                        continue;
                ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
                if (signal_pending(current)) {
                        struct ksignal ksig;

                        /* get_signal() returned 0: keep running */
                        if (!get_signal(&ksig))
                                continue;
                        break;
                }
                /* schedule_timeout() returning 0 means the timeout elapsed */
                last_timeout = !ret;
        }

        /* wq is exiting: run what is still queued before exiting */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                raw_spin_lock(&wqe->lock);
                io_worker_handle_work(worker);
        }

        io_worker_exit(worker);
        return 0;
}

/*
 * Called when a worker is scheduled in. If the worker is up and not yet
 * accounted as running, mark it running and bump the running count.
 */
void io_wq_worker_running(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->pf_io_worker;
        const unsigned mask = IO_WORKER_F_UP | IO_WORKER_F_RUNNING;

        if (!worker)
                return;
        /* proceed only if UP is set and RUNNING is clear */
        if ((worker->flags & mask) != IO_WORKER_F_UP)
                return;

        worker->flags |= IO_WORKER_F_RUNNING;
        io_wqe_inc_running(worker);
}

/*
 * Called when a worker is going to sleep. If the worker is up and
 * accounted as running, clear the running flag and drop the running count
 * under wqe->lock; io_wqe_dec_running() may request a new worker.
 */
void io_wq_worker_sleeping(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->pf_io_worker;
        const unsigned mask = IO_WORKER_F_UP | IO_WORKER_F_RUNNING;

        if (!worker)
                return;
        /* proceed only if both UP and RUNNING are set */
        if ((worker->flags & mask) != mask)
                return;

        worker->flags &= ~IO_WORKER_F_RUNNING;

        raw_spin_lock(&worker->wqe->lock);
        io_wqe_dec_running(worker);
        raw_spin_unlock(&worker->wqe->lock);
}

/*
 * Bind the freshly created thread @tsk to @worker, add the worker to the
 * wqe free list and all list, and start the thread.
 */
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
                               struct task_struct *tsk)
{
        tsk->pf_io_worker = worker;
        worker->task = tsk;
        /* apply the wqe CPU mask, then set PF_NO_SETAFFINITY on the thread */
        set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
        tsk->flags |= PF_NO_SETAFFINITY;

        raw_spin_lock(&wqe->lock);
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
        list_add_tail_rcu(&worker->all_list, &wqe->all_list);
        worker->flags |= IO_WORKER_F_FREE;
        raw_spin_unlock(&wqe->lock);
        /* the worker is fully linked in before the thread is first woken */
        wake_up_new_task(tsk);
}

/* Match callback that accepts every work item; both arguments are ignored */
static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
{
        (void)work;
        (void)data;

        return true;
}

/*
 * Decide whether a failed thread creation should be retried.
 *
 * No retry if the current task has a fatal signal pending (prevents
 * perpetual task_work retry when the task or its group is exiting) or if
 * worker->init_retries has reached WORKER_INIT_LIMIT. The retry counter is
 * incremented each time that check is evaluated. Otherwise only -EAGAIN
 * and the -ERESTART* codes listed below are retried.
 */
static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{
        if (fatal_signal_pending(current))
                return false;
        if (worker->init_retries++ >= WORKER_INIT_LIMIT)
                return false;

        return err == -EAGAIN ||
               err == -ERESTARTSYS ||
               err == -ERESTARTNOINTR ||
               err == -ERESTARTNOHAND;
}

/*
 * task_work callback that retries thread creation for an already allocated
 * worker (queued from io_workqueue_create()).
 *
 * On success the worker is initialised and started. On a failure that
 * should not be retried the accounting is undone, all pending work of the
 * class is cancelled if no workers remain, and the worker is freed. On a
 * retryable failure another attempt is scheduled via worker->work.
 */
static void create_worker_cont(struct callback_head *cb)
{
        struct io_worker *worker;
        struct task_struct *tsk;
        struct io_wqe *wqe;

        worker = container_of(cb, struct io_worker, create_work);
        /* release the create slot taken in io_queue_worker_create() */
        clear_bit_unlock(0, &worker->create_state);
        wqe = worker->wqe;
        tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wqe, worker, tsk);
                io_worker_release(worker);
                return;
        } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
                struct io_wqe_acct *acct = io_wqe_get_acct(worker);

                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wqe->lock);
                acct->nr_workers--;
                if (!acct->nr_workers) {
                        struct io_cb_cancel_data match = {
                                .fn                = io_wq_work_match_all,
                                .cancel_all        = true,
                        };

                        /*
                         * io_acct_cancel_pending_work() drops wqe->lock when
                         * it returns true, so re-take it before each retry.
                         */
                        while (io_acct_cancel_pending_work(wqe, acct, &match))
                                raw_spin_lock(&wqe->lock);
                }
                raw_spin_unlock(&wqe->lock);
                io_worker_ref_put(wqe->wq);
                kfree(worker);
                return;
        }

        /* re-create attempts grab a new worker ref, drop the existing one */
        io_worker_release(worker);
        schedule_work(&worker->work);
}

/*
 * Workqueue handler for a deferred creation retry: queue
 * create_worker_cont() as task_work for the worker embedding @work. If
 * that cannot be queued, the worker is freed.
 */
static void io_workqueue_create(struct work_struct *work)
{
        struct io_worker *worker = container_of(work, struct io_worker, work);
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        if (io_queue_worker_create(worker, acct, create_worker_cont))
                return;

        kfree(worker);
}

/*
 * Allocate a worker for accounting class @index of @wqe and create its
 * thread.
 *
 * Returns true if the thread was created or a retry was scheduled. Returns
 * false if the allocation failed or thread creation failed in a way that
 * is not retried; in that case one acct->nr_running count, one
 * acct->nr_workers count and one wq worker reference are dropped.
 */
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
        struct io_wqe_acct *acct = &wqe->acct[index];
        struct io_worker *worker;
        struct task_struct *tsk;

        __set_current_state(TASK_RUNNING);

        worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
        if (!worker) {
                /* shared failure path, also entered via goto from below */
fail:
                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wqe->lock);
                acct->nr_workers--;
                raw_spin_unlock(&wqe->lock);
                io_worker_ref_put(wq);
                return false;
        }

        /* initial reference, dropped in io_worker_exit() */
        refcount_set(&worker->ref, 1);
        worker->wqe = wqe;
        spin_lock_init(&worker->lock);
        init_completion(&worker->ref_done);

        if (index == IO_WQ_ACCT_BOUND)
                worker->flags |= IO_WORKER_F_BOUND;

        tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wqe, worker, tsk);
        } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
                kfree(worker);
                goto fail;
        } else {
                /* retryable failure: try again from workqueue context */
                INIT_WORK(&worker->work, io_workqueue_create);
                schedule_work(&worker->work);
        }

        return true;
}

/*
 * Call @func on every worker of @wqe that a reference can be taken on and
 * that has a task. Stops early when @func returns true. Returns the value
 * of the last @func call, or false if it was never called.
 */
static bool io_wq_for_each_worker(struct io_wqe *wqe,
                                  bool (*func)(struct io_worker *, void *),
                                  void *data)
{
        struct io_worker *worker;
        bool stop = false;

        list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
                /* worker is exiting */
                if (!io_worker_get(worker))
                        continue;

                /* no task if node is/was offline */
                if (worker->task)
                        stop = func(worker, data);

                io_worker_release(worker);
                if (stop)
                        break;
        }

        return stop;
}

/*
 * Per-worker callback: set the notify signal on the worker's task and wake
 * it. Always returns false so a walk over the workers keeps going. @data
 * is unused.
 */
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
        struct task_struct *tsk = worker->task;

        set_notify_signal(tsk);
        wake_up_process(tsk);

        return false;
}

/*
 * Mark @work as cancelled and hand it to do_work(), then repeat for each
 * follow-up item returned by free_work() until there is none.
 */
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
        struct io_wq *wq = wqe->wq;

        for (;;) {
                work->flags |= IO_WQ_WORK_CANCEL;
                wq->do_work(work);
                work = wq->free_work(work);
                if (!work)
                        break;
        }
}

/*
 * Add @work to the pending list of its accounting class.
 *
 * Hashed work becomes the new tail for its hash and is inserted right
 * after the previous tail if there is one, so items with the same hash
 * stay adjacent. Everything else is appended to the end of the list.
 */
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);

        if (io_wq_is_hashed(work)) {
                unsigned int hash = io_get_work_hash(work);
                struct io_wq_work *tail = wqe->hash_tail[hash];

                wqe->hash_tail[hash] = work;
                if (tail) {
                        wq_list_add_after(&work->list, &tail->list,
                                          &acct->work_list);
                        return;
                }
        }

        wq_list_add_tail(&work->list, &acct->work_list);
}

static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
{
        return work == data;
}

/*
 * Queue @work on @wqe and make sure something will run it.
 *
 * The work is cancelled right away if the wq is exiting or the work is
 * already flagged IO_WQ_WORK_CANCEL. Otherwise it is inserted, the stalled
 * bit is cleared and a free worker is woken if one exists. If none could
 * be woken, a new worker is created when the work is marked concurrent or
 * no worker of the class is running. If that fails and the class has no
 * workers at all, this work item is cancelled.
 */
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
        /* snapshot taken before the work is queued */
        unsigned work_flags = work->flags;
        bool do_create;

        /*
         * If io-wq is exiting for this task, or if the request has explicitly
         * been marked as one that should not get executed, cancel it here.
         */
        if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
            (work->flags & IO_WQ_WORK_CANCEL)) {
                io_run_cancel(work, wqe);
                return;
        }

        raw_spin_lock(&wqe->lock);
        io_wqe_insert_work(wqe, work);
        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);

        rcu_read_lock();
        do_create = !io_wqe_activate_free_worker(wqe, acct);
        rcu_read_unlock();

        raw_spin_unlock(&wqe->lock);

        if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
            !atomic_read(&acct->nr_running))) {
                bool did_create;

                did_create = io_wqe_create_worker(wqe, acct);
                if (likely(did_create))
                        return;

                raw_spin_lock(&wqe->lock);
                /* fatal condition, failed to create the first worker */
                if (!acct->nr_workers) {
                        struct io_cb_cancel_data match = {
                                .fn                = io_wq_work_match_item,
                                .data                = work,
                                .cancel_all        = false,
                        };

                        /* a true return means wqe->lock was dropped: re-take it */
                        if (io_acct_cancel_pending_work(wqe, acct, &match))
                                raw_spin_lock(&wqe->lock);
                }
                raw_spin_unlock(&wqe->lock);
        }
}

/* Queue @work on the wqe that belongs to the current NUMA node */
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
        io_wqe_enqueue(wq->wqes[numa_node_id()], work);
}

/*
 * Mark @work as hashed, deriving the hash from pointer @val. Work items
 * that hash to the same value will not be done in parallel. Used to limit
 * concurrent writes, generally hashed by inode.
 */
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
        unsigned int bucket = hash_ptr(val, IO_WQ_HASH_ORDER);

        /* the hash value lives in the flag bits above IO_WQ_HASH_SHIFT */
        work->flags |= IO_WQ_WORK_HASHED | (bucket << IO_WQ_HASH_SHIFT);
}

static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
        struct io_cb_cancel_data *match = data;

        /*
         * Hold the lock to avoid ->cur_work going out of scope, caller
         * may dereference the passed in work.
         */
        spin_lock(&worker->lock);
        if (worker->cur_work &&
            match->fn(worker->cur_work, match->data)) {
                set_notify_signal(worker->task);
                match->nr_running++;
        }
        spin_unlock(&worker->lock);

        return match->nr_running && !match->cancel_all;
}

/*
 * Unlink @work from its accounting class' pending list. @prev is the list
 * node preceding @work, or NULL.
 *
 * If @work is hashed and is the recorded tail for its hash bucket, the tail
 * is moved back to the preceding work when that one shares the same hash,
 * and cleared otherwise.
 */
static inline void io_wqe_remove_pending(struct io_wqe *wqe,
                                         struct io_wq_work *work,
                                         struct io_wq_work_node *prev)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
        unsigned int hash = io_get_work_hash(work);

        if (io_wq_is_hashed(work) && wqe->hash_tail[hash] == work) {
                struct io_wq_work *new_tail = NULL;

                if (prev) {
                        struct io_wq_work *before;

                        before = container_of(prev, struct io_wq_work, list);
                        if (before && io_get_work_hash(before) == hash)
                                new_tail = before;
                }
                wqe->hash_tail[hash] = new_tail;
        }
        wq_list_del(&acct->work_list, &work->list, prev);
}

/*
 * Cancel at most one pending work item on @acct that @match selects.
 *
 * Entered with wqe->lock held. The first item on acct->work_list for which
 * match->fn() returns true is unlinked, wqe->lock is dropped, the item is
 * handed to io_run_cancel() and match->nr_pending is incremented.
 *
 * Returns true if an item was cancelled, in which case wqe->lock has been
 * released. Returns false with wqe->lock still held if nothing matched.
 */
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match)
        __releases(wqe->lock)
{
        struct io_wq_work_node *node, *prev;
        struct io_wq_work *work;

        wq_list_for_each(node, prev, &acct->work_list) {
                work = container_of(node, struct io_wq_work, list);
                if (!match->fn(work, match->data))
                        continue;
                /* unlink while still locked, then cancel outside the lock */
                io_wqe_remove_pending(wqe, work, prev);
                raw_spin_unlock(&wqe->lock);
                io_run_cancel(work, wqe);
                match->nr_pending++;
                /* not safe to continue after unlock */
                return true;
        }

        return false;
}

/*
 * Cancel pending work on @wqe that @match selects, looking at each of the
 * IO_WQ_ACCT_NR accounting classes in turn.
 *
 * For a single-item match the function returns after the first hit. For a
 * cancel_all match it restarts from the top after every hit until no class
 * has a matching item left.
 */
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
                                       struct io_cb_cancel_data *match)
{
        int i;
retry:
        raw_spin_lock(&wqe->lock);
        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);

                /* a true return means wqe->lock has already been dropped */
                if (io_acct_cancel_pending_work(wqe, acct, match)) {
                        if (match->cancel_all)
                                goto retry;
                        return;
                }
        }
        /* nothing (more) matched: the lock is still ours to release */
        raw_spin_unlock(&wqe->lock);
}

/*
 * Run io_wq_worker_cancel() with @match over the workers of @wqe, inside an
 * RCU read-side critical section.
 */
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
                                       struct io_cb_cancel_data *match)
{
        rcu_read_lock();
        io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
        rcu_read_unlock();
}

/*
 * Cancel work selected by @cancel(work, @data) on every node of @wq.
 *
 * With @cancel_all false the search stops at the first hit; with it true
 * every matching pending and running item is visited.
 *
 * Returns IO_WQ_CANCEL_OK if only pending work was removed,
 * IO_WQ_CANCEL_RUNNING if a running item was signalled, and
 * IO_WQ_CANCEL_NOTFOUND if nothing matched.
 */
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                  void *data, bool cancel_all)
{
        struct io_cb_cancel_data cd = {
                .fn                = cancel,
                .data                = data,
                .cancel_all        = cancel_all,
        };
        int nid;

        /*
         * Pending lists first: with luck the work can simply be unlinked.
         * CANCEL_OK means that the work is returned as-new, no completion
         * will be posted for it.
         */
        for_each_node(nid) {
                io_wqe_cancel_pending_work(wq->wqes[nid], &cd);
                if (!cd.cancel_all && cd.nr_pending)
                        return IO_WQ_CANCEL_OK;
        }

        /*
         * Then look for a free (going busy) or busy worker that has the
         * work as its current item. A hit there yields CANCEL_RUNNING: the
         * cancellation was signalled, and the completion will run normally.
         */
        for_each_node(nid) {
                io_wqe_cancel_running_work(wq->wqes[nid], &cd);
                if (!cd.cancel_all && cd.nr_running)
                        return IO_WQ_CANCEL_RUNNING;
        }

        if (cd.nr_running)
                return IO_WQ_CANCEL_RUNNING;

        return cd.nr_pending ? IO_WQ_CANCEL_OK : IO_WQ_CANCEL_NOTFOUND;
}

/*
 * Wait queue callback for the io_wqe embedding @wait. Takes the entry off
 * its wait list and, for every accounting class whose IO_ACCT_STALLED_BIT
 * was set, clears the bit and activates a free worker. Always returns 1.
 */
static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
                            int sync, void *key)
{
        struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
        struct io_wqe_acct *acct;
        int idx = 0;

        list_del_init(&wait->entry);

        rcu_read_lock();
        while (idx < IO_WQ_ACCT_NR) {
                acct = &wqe->acct[idx++];
                if (!test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
                        continue;
                io_wqe_activate_free_worker(wqe, acct);
        }
        rcu_read_unlock();

        return 1;
}

/*
 * Allocate and set up an io_wq with one io_wqe per NUMA node.
 *
 * @bounded: max_workers for the IO_WQ_ACCT_BOUND class on every node; must
 *           be non-zero
 * @data:    supplies the hash, the owning task and the free_work/do_work
 *           callbacks (both callbacks must be set)
 *
 * Returns the new io_wq, or an ERR_PTR(): -EINVAL for bad arguments,
 * -ENOMEM on allocation failure, or the error returned by
 * cpuhp_state_add_instance_nocalls().
 */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
        int ret, node, i;
        struct io_wq *wq;

        if (WARN_ON_ONCE(!data->free_work || !data->do_work))
                return ERR_PTR(-EINVAL);
        if (WARN_ON_ONCE(!bounded))
                return ERR_PTR(-EINVAL);

        wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
        if (!wq)
                return ERR_PTR(-ENOMEM);
        ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        if (ret)
                goto err_wq;

        /* this hash reference is dropped again on the err path below */
        refcount_inc(&data->hash->refs);
        wq->hash = data->hash;
        wq->free_work = data->free_work;
        wq->do_work = data->do_work;

        ret = -ENOMEM;
        for_each_node(node) {
                struct io_wqe *wqe;
                int alloc_node = node;

                /* no node preference for the allocation if the node is offline */
                if (!node_online(alloc_node))
                        alloc_node = NUMA_NO_NODE;
                wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
                if (!wqe)
                        goto err;
                /* recorded before the cpumask allocation so err can free it */
                wq->wqes[node] = wqe;
                if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
                        goto err;
                cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
                wqe->node = alloc_node;
                wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
                wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
                                        task_rlimit(current, RLIMIT_NPROC);
                INIT_LIST_HEAD(&wqe->wait.entry);
                wqe->wait.func = io_wqe_hash_wake;
                for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                        struct io_wqe_acct *acct = &wqe->acct[i];

                        acct->index = i;
                        atomic_set(&acct->nr_running, 0);
                        INIT_WQ_LIST(&acct->work_list);
                }
                wqe->wq = wq;
                raw_spin_lock_init(&wqe->lock);
                INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
                INIT_LIST_HEAD(&wqe->all_list);
        }

        wq->task = get_task_struct(data->task);
        atomic_set(&wq->worker_refs, 1);
        init_completion(&wq->worker_done);
        return wq;
err:
        /* undo the hash reference, the hotplug instance and any per-node state */
        io_wq_put_hash(data->hash);
        cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        for_each_node(node) {
                if (!wq->wqes[node])
                        continue;
                free_cpumask_var(wq->wqes[node]->cpu_mask);
                kfree(wq->wqes[node]);
        }
err_wq:
        kfree(wq);
        return ERR_PTR(ret);
}

/*
 * Task work match callback: true when @cb is one of the worker creation
 * callbacks (create_worker_cb or create_worker_cont) and the io_worker
 * embedding it belongs to the io_wq passed in @data.
 */
static bool io_task_work_match(struct callback_head *cb, void *data)
{
        bool is_create = cb->func == create_worker_cb ||
                         cb->func == create_worker_cont;

        if (!is_create)
                return false;

        return container_of(cb, struct io_worker, create_work)->wqe->wq == data;
}

/*
 * Mark @wq as exiting by setting IO_WQ_BIT_EXIT in wq->state.
 * io_wq_put_and_exit() warns if this bit has not been set beforehand.
 */
void io_wq_exit_start(struct io_wq *wq)
{
        set_bit(IO_WQ_BIT_EXIT, &wq->state);
}

/*
 * Cancel every worker-creation task work still queued on wq->task for @wq,
 * running io_worker_cancel_cb() on the worker each one belongs to.
 */
static void io_wq_cancel_tw_create(struct io_wq *wq)
{
        for (;;) {
                struct callback_head *cb;
                struct io_worker *worker;

                cb = task_work_cancel_match(wq->task, io_task_work_match, wq);
                if (cb == NULL)
                        break;

                worker = container_of(cb, struct io_worker, create_work);
                io_worker_cancel_cb(worker);

                /*
                 * Only the worker continuation helper has worker allocated
                 * and hence needs freeing.
                 */
                if (cb->func == create_worker_cont)
                        kfree(worker);
        }
}

/*
 * Tear down the worker side of @wq: cancel queued worker-creation task work,
 * wake all workers, wait for wq->worker_done, unhook the per-node hash wait
 * entries and drop the task reference.
 *
 * Does nothing if wq->task is already NULL; the function clears it at the
 * end.
 */
static void io_wq_exit_workers(struct io_wq *wq)
{
        int node;

        if (!wq->task)
                return;

        io_wq_cancel_tw_create(wq);

        rcu_read_lock();
        for_each_node(node) {
                struct io_wqe *wqe = wq->wqes[node];

                io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
        }
        rcu_read_unlock();
        /*
         * NOTE(review): presumably this drops the initial worker_refs
         * reference set to 1 in io_wq_create() -- confirm in
         * io_worker_ref_put().
         */
        io_worker_ref_put(wq);
        wait_for_completion(&wq->worker_done);

        /* take every node's wait entry off the hash wait queue, under its lock */
        for_each_node(node) {
                spin_lock_irq(&wq->hash->wait.lock);
                list_del_init(&wq->wqes[node]->wait.entry);
                spin_unlock_irq(&wq->hash->wait.lock);
        }
        put_task_struct(wq->task);
        wq->task = NULL;
}

/*
 * Free @wq: remove its CPU hotplug instance, cancel all work still pending
 * on each node, free the per-node state, drop the hash reference and
 * finally free the io_wq itself.
 */
static void io_wq_destroy(struct io_wq *wq)
{
        int nid;

        cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);

        for_each_node(nid) {
                /* fresh match per node: select everything, cancel everything */
                struct io_cb_cancel_data everything = {
                        .fn                = io_wq_work_match_all,
                        .cancel_all        = true,
                };
                struct io_wqe *wqe = wq->wqes[nid];

                io_wqe_cancel_pending_work(wqe, &everything);
                free_cpumask_var(wqe->cpu_mask);
                kfree(wqe);
        }

        io_wq_put_hash(wq->hash);
        kfree(wq);
}

/*
 * Shut down and free @wq: stop its workers, then destroy it.
 * Warns once if IO_WQ_BIT_EXIT is not set in wq->state, i.e. if
 * io_wq_exit_start() was not called first.
 */
void io_wq_put_and_exit(struct io_wq *wq)
{
        WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));

        io_wq_exit_workers(wq);
        io_wq_destroy(wq);
}

/* Argument bundle handed to io_wq_worker_affinity() for a CPU hotplug event. */
struct online_data {
        unsigned int cpu;       /* CPU the event is about */
        bool online;            /* true: set the CPU in the mask, false: clear it */
};

/*
 * Per-worker callback for CPU hotplug: set or clear the CPU described by the
 * struct online_data in @data in the cpu_mask of the worker's io_wqe.
 * Always returns false.
 */
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
        struct online_data *od = data;

        if (!od->online) {
                cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
                return false;
        }

        cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
        return false;
}

/*
 * Apply a CPU online/offline event to @wq by running
 * io_wq_worker_affinity() over the workers of every node, inside an RCU
 * read-side critical section. Always returns 0.
 */
static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
{
        struct online_data od;
        int nid;

        od.cpu = cpu;
        od.online = online;

        rcu_read_lock();
        for_each_node(nid) {
                io_wq_for_each_worker(wq->wqes[nid], io_wq_worker_affinity, &od);
        }
        rcu_read_unlock();

        return 0;
}

/* CPU hotplug "online" callback for the io_wq embedding @node. */
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        return __io_wq_cpu_online(hlist_entry_safe(node, struct io_wq, cpuhp_node),
                                  cpu, true);
}

/* CPU hotplug "offline" callback for the io_wq embedding @node. */
static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
        return __io_wq_cpu_online(hlist_entry_safe(node, struct io_wq, cpuhp_node),
                                  cpu, false);
}

/*
 * Set the CPU mask of every node of @wq. With @mask set it is copied into
 * each node's cpu_mask; with @mask NULL each node goes back to
 * cpumask_of_node() for that node. Always returns 0.
 */
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
{
        int nid;

        rcu_read_lock();
        for_each_node(nid) {
                struct io_wqe *wqe = wq->wqes[nid];

                cpumask_copy(wqe->cpu_mask, mask ? mask : cpumask_of_node(nid));
        }
        rcu_read_unlock();

        return 0;
}

/*
 * Set the max number of bounded and unbounded workers on every node.
 *
 * @new_count must point to IO_WQ_ACCT_NR ints, indexed by IO_WQ_BOUND and
 * IO_WQ_UNBOUND. An entry of 0 leaves that limit unchanged; an entry above
 * the caller's RLIMIT_NPROC is clamped to it.
 *
 * On return @new_count holds the previous limits, sampled from the first
 * node. Always returns 0.
 */
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
        /* sample the rlimit once so the compare and the clamp agree */
        unsigned long nproc_limit = task_rlimit(current, RLIMIT_NPROC);
        int prev[IO_WQ_ACCT_NR];
        bool first_node = true;
        int i, node;

        BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND   != (int) IO_WQ_BOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_NR      != 2);

        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                if (new_count[i] > nproc_limit)
                        new_count[i] = nproc_limit;
                prev[i] = 0;
        }

        rcu_read_lock();
        for_each_node(node) {
                struct io_wqe *wqe = wq->wqes[node];
                struct io_wqe_acct *acct;

                raw_spin_lock(&wqe->lock);
                for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                        acct = &wqe->acct[i];
                        /* only the first node's values are reported back */
                        if (first_node)
                                prev[i] = max_t(int, acct->max_workers, prev[i]);
                        if (new_count[i])
                                acct->max_workers = new_count[i];
                }
                raw_spin_unlock(&wqe->lock);
                first_node = false;
        }
        rcu_read_unlock();

        for (i = 0; i < IO_WQ_ACCT_NR; i++)
                new_count[i] = prev[i];

        return 0;
}

/*
 * Register the multi-instance "io-wq/online" CPU hotplug state and remember
 * the dynamically assigned state number in io_wq_online.
 */
static __init int io_wq_init(void)
{
        int state;

        state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
                                        io_wq_cpu_online, io_wq_cpu_offline);
        if (state < 0)
                return state;

        io_wq_online = state;
        return 0;
}
subsys_initcall(io_wq_init);

























    1 


    1 
    1 
    1 


    1 




























































































    1 


















































































































































    1 
















    1 

    1 





    1 



    1 














    1 









    1 




    1 
    1 




    1 


















































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com>
 *
 * Scatterlist handling helpers.
 */
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <linux/kmemleak.h>

/**
 * sg_next - return the next scatterlist entry in a list
 * @sg:                The current sg entry
 *
 * Description:
 *   Usually the next entry will be @sg@ + 1, but if this sg element is part
 *   of a chained scatterlist, it could jump to the start of a new
 *   scatterlist array.
 *
 **/
struct scatterlist *sg_next(struct scatterlist *sg)
{
        if (sg_is_last(sg))
                return NULL;

        sg++;
        if (unlikely(sg_is_chain(sg)))
                sg = sg_chain_ptr(sg);

        return sg;
}
EXPORT_SYMBOL(sg_next);

/**
 * sg_nents - return total count of entries in scatterlist
 * @sg:                The scatterlist
 *
 * Description:
 * Counts the entries reachable from @sg by following sg_next(), so
 * chaining is taken into account as well.
 *
 **/
int sg_nents(struct scatterlist *sg)
{
        int count = 0;

        while (sg) {
                count++;
                sg = sg_next(sg);
        }

        return count;
}
EXPORT_SYMBOL(sg_nents);

/**
 * sg_nents_for_len - return total count of entries in scatterlist
 *                    needed to satisfy the supplied length
 * @sg:                The scatterlist
 * @len:        The total required length
 *
 * Description:
 * Determines the number of entries in sg that are required to meet
 * the supplied length, taking into account chaining as well.
 *
 * Returns:
 *   the number of sg entries needed (0 if @len is 0), or -EINVAL if the
 *   list ends before @len bytes have been covered
 *
 **/
int sg_nents_for_len(struct scatterlist *sg, u64 len)
{
        u64 covered = 0;
        int count = 0;

        if (len == 0)
                return 0;

        while (sg) {
                count++;
                covered += sg->length;
                if (covered >= len)
                        return count;
                sg = sg_next(sg);
        }

        return -EINVAL;
}
EXPORT_SYMBOL(sg_nents_for_len);

/**
 * sg_last - return the last scatterlist entry in a list
 * @sgl:        First entry in the scatterlist
 * @nents:        Number of entries in the scatterlist
 *
 * Description:
 *   Should only be used casually, it (currently) scans the entire list
 *   to get the last entry.
 *
 *   Note that the @sgl@ pointer passed in need not be the first one,
 *   the important bit is that @nents@ denotes the number of entries that
 *   exist from @sgl@.
 *
 **/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
        struct scatterlist *sg, *ret = NULL;
        unsigned int i;

        for_each_sg(sgl, sg, nents, i)
                ret = sg;

        BUG_ON(!sg_is_last(ret));
        return ret;
}
EXPORT_SYMBOL(sg_last);

/**
 * sg_init_table - Initialize SG table
 * @sgl:           The SG table
 * @nents:           Number of entries in table
 *
 * Notes:
 *   If this is part of a chained sg table, sg_mark_end() should be
 *   used only on the last table part.
 *
 **/
void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
        memset(sgl, 0, sizeof(*sgl) * nents);
        sg_init_marker(sgl, nents);
}
EXPORT_SYMBOL(sg_init_table);

/**
 * sg_init_one - Initialize a single entry sg list
 * @sg:                 SG entry
 * @buf:         Virtual address for IO
 * @buflen:         IO length
 *
 * Description:
 *   Initializes @sg as a one-entry table with sg_init_table() and then
 *   points it at @buf / @buflen with sg_set_buf().
 *
 **/
void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
        sg_init_table(sg, 1);
        sg_set_buf(sg, buf, buflen);
}
EXPORT_SYMBOL(sg_init_one);

/*
 * The default behaviour of sg_alloc_table() is to use these kmalloc/kfree
 * helpers.
 *
 * A chunk of exactly SG_MAX_SINGLE_ALLOC entries comes from a single free
 * page; any other size comes from kmalloc_array().
 */
static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
{
        void *page;

        if (nents != SG_MAX_SINGLE_ALLOC)
                return kmalloc_array(nents, sizeof(struct scatterlist),
                                     gfp_mask);

        /*
         * Kmemleak doesn't track page allocations as they are not
         * commonly used (in a raw form) for kernel data structures.
         * As we chain together a list of pages and then a normal
         * kmalloc (tracked by kmemleak), kmemleak has to be told about
         * all the intermediate allocations so that the last allocation
         * does not become decoupled (and thus a false-positive).
         */
        page = (void *) __get_free_page(gfp_mask);
        kmemleak_alloc(page, PAGE_SIZE, 1, gfp_mask);
        return page;
}

/*
 * Counterpart of sg_kmalloc(): a chunk of SG_MAX_SINGLE_ALLOC entries is
 * unregistered from kmemleak and returned as a page, anything else is
 * kfree()d.
 */
static void sg_kfree(struct scatterlist *sg, unsigned int nents)
{
        if (nents != SG_MAX_SINGLE_ALLOC) {
                kfree(sg);
                return;
        }

        kmemleak_free(sg);
        free_page((unsigned long) sg);
}

/**
 * __sg_free_table - Free a previously mapped sg table
 * @table:        The sg table header to use
 * @max_ents:        The maximum number of entries per single scatterlist
 * @nents_first_chunk: Number of entries in the (preallocated) first
 *         scatterlist chunk, 0 means no such preallocated first chunk
 * @free_fn:        Free function
 *
 *  Description:
 *    Free an sg table previously allocated and setup with
 *    __sg_alloc_table().  The @max_ents value must be identical to
 *    that previously used with __sg_alloc_table().
 *
 *    Walks the chunks using table->orig_nents, which is counted down to 0
 *    as chunks are released. A preallocated first chunk is skipped rather
 *    than passed to @free_fn. table->sgl is set to NULL on completion;
 *    a table whose sgl is already NULL is left untouched.
 *
 **/
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
                     unsigned int nents_first_chunk, sg_free_fn *free_fn)
{
        struct scatterlist *sgl, *next;
        /* the first chunk may have its own size; later chunks use max_ents */
        unsigned curr_max_ents = nents_first_chunk ?: max_ents;

        if (unlikely(!table->sgl))
                return;

        sgl = table->sgl;
        while (table->orig_nents) {
                unsigned int alloc_size = table->orig_nents;
                unsigned int sg_size;

                /*
                 * If we have more than max_ents segments left,
                 * then assign 'next' to the sg table after the current one.
                 * sg_size is then one less than alloc size, since the last
                 * element is the chain pointer.
                 */
                if (alloc_size > curr_max_ents) {
                        next = sg_chain_ptr(&sgl[curr_max_ents - 1]);
                        alloc_size = curr_max_ents;
                        sg_size = alloc_size - 1;
                } else {
                        sg_size = alloc_size;
                        next = NULL;
                }

                table->orig_nents -= sg_size;
                /* a preallocated first chunk is not ours to free */
                if (nents_first_chunk)
                        nents_first_chunk = 0;
                else
                        free_fn(sgl, alloc_size);
                sgl = next;
                curr_max_ents = max_ents;
        }

        table->sgl = NULL;
}
EXPORT_SYMBOL(__sg_free_table);

/**
 * sg_free_table - Free a previously allocated sg table
 * @table:        The mapped sg table header
 *
 * Description:
 *   Releases a table through __sg_free_table() using the default
 *   sg_kfree() helper, SG_MAX_SINGLE_ALLOC entries per chunk and no
 *   preallocated first chunk.
 *
 **/
void sg_free_table(struct sg_table *table)
{
        /* nents_first_chunk is an entry count: 0 means "no first chunk" */
        __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree);
}
EXPORT_SYMBOL(sg_free_table);

/**
 * __sg_alloc_table - Allocate and initialize an sg table with given allocator
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @max_ents:        The maximum number of entries the allocator returns per call
 * @first_chunk: Preallocated scatterlist to use as the first chunk instead
 *         of calling @alloc_fn, or NULL
 * @nents_first_chunk: Number of entries in the (preallocated) first
 *         scatterlist chunk, 0 means no such preallocated chunk provided by user
 * @gfp_mask:        GFP allocation mask
 * @alloc_fn:        Allocator to use
 *
 * Description:
 *   This function returns a @table @nents long. The allocator is
 *   defined to return scatterlist chunks of maximum size @max_ents.
 *   Thus if @nents is bigger than @max_ents, the scatterlists will be
 *   chained in units of @max_ents.
 *
 * Returns:
 *   0 on success, -EINVAL if @nents is 0 (or, with CONFIG_ARCH_NO_SG_CHAIN,
 *   if @nents exceeds @max_ents), -ENOMEM if a chunk cannot be allocated.
 *
 * Notes:
 *   If this function returns non-0 (eg failure), the caller must call
 *   __sg_free_table() to cleanup any leftover allocations.
 *
 **/
int __sg_alloc_table(struct sg_table *table, unsigned int nents,
                     unsigned int max_ents, struct scatterlist *first_chunk,
                     unsigned int nents_first_chunk, gfp_t gfp_mask,
                     sg_alloc_fn *alloc_fn)
{
        struct scatterlist *sg, *prv;
        unsigned int left;
        /* the first chunk may have its own size; later chunks use max_ents */
        unsigned curr_max_ents = nents_first_chunk ?: max_ents;
        unsigned prv_max_ents;

        memset(table, 0, sizeof(*table));

        if (nents == 0)
                return -EINVAL;
#ifdef CONFIG_ARCH_NO_SG_CHAIN
        if (WARN_ON_ONCE(nents > max_ents))
                return -EINVAL;
#endif

        left = nents;
        prv = NULL;
        do {
                unsigned int sg_size, alloc_size = left;

                /*
                 * A chunk that cannot hold everything that is left gives up
                 * its last slot to the chain link, so it carries one usable
                 * entry less than it allocates.
                 */
                if (alloc_size > curr_max_ents) {
                        alloc_size = curr_max_ents;
                        sg_size = alloc_size - 1;
                } else
                        sg_size = alloc_size;

                left -= sg_size;

                /* use the caller's preallocated chunk once, then allocate */
                if (first_chunk) {
                        sg = first_chunk;
                        first_chunk = NULL;
                } else {
                        sg = alloc_fn(alloc_size, gfp_mask);
                }
                if (unlikely(!sg)) {
                        /*
                         * Adjust entry count to reflect that the last
                         * entry of the previous table won't be used for
                         * linkage.  Without this, sg_kfree() may get
                         * confused.
                         */
                        if (prv)
                                table->nents = ++table->orig_nents;

                        return -ENOMEM;
                }

                sg_init_table(sg, alloc_size);
                table->nents = table->orig_nents += sg_size;

                /*
                 * If this is the first mapping, assign the sg table header.
                 * If this is not the first mapping, chain previous part.
                 */
                if (prv)
                        sg_chain(prv, prv_max_ents, sg);
                else
                        table->sgl = sg;

                /*
                 * If no more entries after this one, mark the end
                 */
                if (!left)
                        sg_mark_end(&sg[sg_size - 1]);

                prv = sg;
                prv_max_ents = curr_max_ents;
                curr_max_ents = max_ents;
        } while (left);

        return 0;
}
EXPORT_SYMBOL(__sg_alloc_table);

/**
 * sg_alloc_table - Allocate and initialize an sg table
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @gfp_mask:        GFP allocation mask
 *
 *  Description:
 *    Allocate and initialize an sg table. If @nents is larger than
 *    SG_MAX_SINGLE_ALLOC a chained sg table will be setup. On failure
 *    any partial allocation is released before the error is returned.
 *
 **/
int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
{
        int err = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
                                   NULL, 0, gfp_mask, sg_kmalloc);

        if (likely(!err))
                return err;

        __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree);
        return err;
}
EXPORT_SYMBOL(sg_alloc_table);

/*
 * Return the scatterlist entry to fill in after @cur, allocating and
 * chaining a new chunk when needed.
 *
 * With @cur set, the entry following it is reused unless it is the
 * end-marked entry and more than one entry is still needed; in that case
 * that entry is turned into a chain link to a freshly allocated chunk.
 * With @cur NULL, a first chunk is allocated and installed as table->sgl.
 *
 * New chunks hold min(@needed_sges, SG_MAX_SINGLE_ALLOC) entries.
 * Returns the entry to use, or ERR_PTR(-ENOMEM) if the allocation fails.
 *
 * NOTE(review): sg_next(@cur) is passed to sg_is_last() without a NULL
 * check, so @cur is presumably never itself the end-marked entry --
 * confirm against the caller.
 */
static struct scatterlist *get_next_sg(struct sg_table *table,
                                       struct scatterlist *cur,
                                       unsigned long needed_sges,
                                       gfp_t gfp_mask)
{
        struct scatterlist *new_sg, *next_sg;
        unsigned int alloc_size;

        if (cur) {
                next_sg = sg_next(cur);
                /* Check if the last entry should be kept for chaining */
                if (!sg_is_last(next_sg) || needed_sges == 1)
                        return next_sg;
        }

        alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC);
        new_sg = sg_kmalloc(alloc_size, gfp_mask);
        if (!new_sg)
                return ERR_PTR(-ENOMEM);
        sg_init_table(new_sg, alloc_size);
        if (cur) {
                /* next_sg becomes the chain link, hence alloc_size - 1 */
                __sg_chain(next_sg, new_sg);
                table->orig_nents += alloc_size - 1;
        } else {
                table->sgl = new_sg;
                table->orig_nents = alloc_size;
                table->nents = 0;
        }
        return new_sg;
}

/**
 * __sg_alloc_table_from_pages - Allocate and initialize an sg table from
 *                                 an array of pages
 * @sgt:         The sg table header to use
 * @pages:       Pointer to an array of page pointers
 * @n_pages:     Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @max_segment: Maximum size of a scatterlist element in bytes
 * @prv:         Last populated sge in sgt
 * @left_pages:  Pages the caller still has to add after this call; the list
 *               is only terminated (end-marked) when this is 0
 * @gfp_mask:    GFP allocation mask
 *
 * Description:
 *    If @prv is NULL, allocate and initialize an sg table from a list of pages,
 *    else reuse the scatterlist passed in at @prv.
 *    Contiguous ranges of the pages are squashed into a single scatterlist
 *    entry up to the maximum size specified in @max_segment.  A user may
 *    provide an offset at a start and a size of valid data in a buffer
 *    specified by the page array.
 *
 *    @max_segment is rounded down to a multiple of PAGE_SIZE and must still
 *    be at least PAGE_SIZE afterwards.  When @prv is given, @offset must be 0
 *    and the kernel must not be built with CONFIG_ARCH_NO_SG_CHAIN.
 *
 * Returns:
 *   Last SGE in sgt on success, an ERR_PTR() encoded error otherwise:
 *   -EINVAL for a too small @max_segment or a non-zero @offset together with
 *   @prv, -EOPNOTSUPP for @prv with CONFIG_ARCH_NO_SG_CHAIN, or the error
 *   reported by get_next_sg().
 *   The allocation in @sgt must be released by sg_free_table.
 *
 * Notes:
 *   If this function returns an error pointer, the caller must call
 *   sg_free_table() to cleanup any leftover allocations.
 */
struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
                struct page **pages, unsigned int n_pages, unsigned int offset,
                unsigned long size, unsigned int max_segment,
                struct scatterlist *prv, unsigned int left_pages,
                gfp_t gfp_mask)
{
        unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
        unsigned int added_nents = 0;
        struct scatterlist *s = prv;

        /*
         * The algorithm below requires max_segment to be aligned to PAGE_SIZE
         * otherwise it can overshoot.
         */
        max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE);
        if (WARN_ON(max_segment < PAGE_SIZE))
                return ERR_PTR(-EINVAL);

        /* Continuing from an existing entry is refused without sg chaining */
        if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && prv)
                return ERR_PTR(-EOPNOTSUPP);

        if (prv) {
                /* Page frame number holding the first byte past @prv's data */
                unsigned long paddr = (page_to_pfn(sg_page(prv)) * PAGE_SIZE +
                                       prv->offset + prv->length) /
                                      PAGE_SIZE;

                if (WARN_ON(offset))
                        return ERR_PTR(-EINVAL);

                /* Merge contiguous pages into the last SG */
                prv_len = prv->length;
                while (n_pages && page_to_pfn(pages[0]) == paddr) {
                        /* Stop growing @prv once it would exceed the cap */
                        if (prv->length + PAGE_SIZE > max_segment)
                                break;
                        prv->length += PAGE_SIZE;
                        paddr++;
                        pages++;
                        n_pages--;
                }
                /* Every page was absorbed by @prv: no new entry is needed */
                if (!n_pages)
                        goto out;
        }

        /*
         * Compute the number of contiguous chunks: a new chunk starts when
         * the running length reaches max_segment or the pfns stop being
         * consecutive.
         */
        chunks = 1;
        seg_len = 0;
        for (i = 1; i < n_pages; i++) {
                seg_len += PAGE_SIZE;
                if (seg_len >= max_segment ||
                    page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) {
                        chunks++;
                        seg_len = 0;
                }
        }

        /* merging chunks and putting them into the scatterlist */
        cur_page = 0;
        for (i = 0; i < chunks; i++) {
                unsigned int j, chunk_size;

                /*
                 * Look for the end of the current chunk, using the same
                 * split rule as the counting loop above; j ends up as the
                 * index of the first page of the next chunk.
                 */
                seg_len = 0;
                for (j = cur_page + 1; j < n_pages; j++) {
                        seg_len += PAGE_SIZE;
                        if (seg_len >= max_segment ||
                            page_to_pfn(pages[j]) !=
                            page_to_pfn(pages[j - 1]) + 1)
                                break;
                }

                /* Pass how many chunks might be left */
                s = get_next_sg(sgt, s, chunks - i + left_pages, gfp_mask);
                if (IS_ERR(s)) {
                        /*
                         * Adjust entry length to be as before function was
                         * called.
                         * NOTE(review): entries already filled in by earlier
                         * iterations are not added to sgt->nents on this
                         * path - confirm callers rely only on
                         * sg_free_table() for cleanup.
                         */
                        if (prv)
                                prv->length = prv_len;
                        return s;
                }
                /* @offset only shortens the first chunk; it is zeroed below */
                chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
                sg_set_page(s, pages[cur_page],
                            min_t(unsigned long, size, chunk_size), offset);
                added_nents++;
                size -= chunk_size;
                offset = 0;
                cur_page = j;
        }
        sgt->nents += added_nents;
out:
        /* Only terminate the list when the caller has nothing more to add */
        if (!left_pages)
                sg_mark_end(s);
        return s;
}
EXPORT_SYMBOL(__sg_alloc_table_from_pages);

/**
 * sg_alloc_table_from_pages - Allocate and initialize an sg table from
 *                               an array of pages
 * @sgt:         The sg table header to use
 * @pages:         Pointer to an array of page pointers
 * @n_pages:         Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @gfp_mask:         GFP allocation mask
 *
 *  Description:
 *    Allocate and initialize an sg table from a list of pages. Contiguous
 *    ranges of the pages are squashed into a single scatterlist node. A user
 *    may provide an offset at a start and a size of valid data in a buffer
 *    specified by the page array. The returned sg table is released by
 *    sg_free_table.
 *
 * Returns:
 *   0 on success, negative error on failure
 */
int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
                              unsigned int n_pages, unsigned int offset,
                              unsigned long size, gfp_t gfp_mask)
{
        struct scatterlist *last;

        /*
         * One-shot build of a fresh table: largest possible segment size,
         * no previous entry to continue from, no pages left for later.
         */
        last = __sg_alloc_table_from_pages(sgt, pages, n_pages, offset, size,
                                           UINT_MAX, NULL, 0, gfp_mask);

        /* Turn the pointer-or-ERR_PTR result into 0 / negative errno. */
        return PTR_ERR_OR_ZERO(last);
}
EXPORT_SYMBOL(sg_alloc_table_from_pages);

#ifdef CONFIG_SGL_ALLOC

/**
 * sgl_alloc_order - allocate a scatterlist and its pages
 * @length: Length in bytes of the scatterlist. Must be at least one
 * @order: Second argument for alloc_pages()
 * @chainable: Whether or not to allocate an extra element in the scatterlist
 *        for scatterlist chaining purposes
 * @gfp: Memory allocation flags
 * @nent_p: [out] Number of entries in the scatterlist that have pages
 *
 * Each entry covers one allocation of 2^@order pages; the last entry may be
 * shorter when @length is not a multiple of that size.
 *
 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
 * Failure covers allocation errors, a @length too large for the entry count
 * to be represented, and a request that would need no entries at all
 * (@length of zero without @chainable).
 */
struct scatterlist *sgl_alloc_order(unsigned long long length,
                                    unsigned int order, bool chainable,
                                    gfp_t gfp, unsigned int *nent_p)
{
        struct scatterlist *sgl, *sg;
        struct page *page;
        unsigned int nent, nalloc;
        u32 elem_len;

        nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order);
        /* Check for integer overflow */
        if (length > (nent << (PAGE_SHIFT + order)))
                return NULL;
        nalloc = nent;
        if (chainable) {
                /* Check for integer overflow */
                if (nalloc + 1 < nalloc)
                        return NULL;
                nalloc++;
        }
        /*
         * A zero-entry request must not reach the allocator: kmalloc of
         * zero bytes hands back ZERO_SIZE_PTR rather than NULL, so the
         * failure check below would not catch it and sg_init_table() would
         * be called on a table with no entries.
         */
        if (!nalloc)
                return NULL;
        sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
                            gfp & ~GFP_DMA);
        if (!sgl)
                return NULL;

        sg_init_table(sgl, nalloc);
        sg = sgl;
        while (length) {
                elem_len = min_t(u64, length, PAGE_SIZE << order);
                page = alloc_pages(gfp, order);
                if (!page) {
                        /* Release the pages obtained so far and the table */
                        sgl_free_order(sgl, order);
                        return NULL;
                }

                sg_set_page(sg, page, elem_len, 0);
                length -= elem_len;
                sg = sg_next(sg);
        }
        WARN_ONCE(length, "length = %lld\n", length);
        if (nent_p)
                *nent_p = nent;
        return sgl;
}
EXPORT_SYMBOL(sgl_alloc_order);

/**
 * sgl_alloc - allocate a scatterlist and its pages
 * @length: Length in bytes of the scatterlist
 * @gfp: Memory allocation flags
 * @nent_p: [out] Number of entries in the scatterlist
 *
 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
 */
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
                              unsigned int *nent_p)
{
        /* Order-0 allocations and no spare element reserved for chaining. */
        const unsigned int order = 0;
        const bool chainable = false;

        return sgl_alloc_order(length, order, chainable, gfp, nent_p);
}
EXPORT_SYMBOL(sgl_alloc);

/**
 * sgl_free_n_order - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 * @nents: Maximum number of elements to free
 * @order: Second argument for __free_pages()
 *
 * Notes:
 * - If several scatterlists have been chained and each chain element is
 *   freed separately then it's essential to set nents correctly to avoid that a
 *   page would get freed twice.
 * - All pages in a chained scatterlist can be freed at once by setting @nents
 *   to a high number.
 */
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order)
{
        struct scatterlist *entry;
        int idx;

        for_each_sg(sgl, entry, nents, idx) {
                struct page *pg;

                /* The list ended before @nents entries were visited. */
                if (!entry)
                        break;

                /* Entries that never received a page are simply skipped. */
                pg = sg_page(entry);
                if (!pg)
                        continue;

                __free_pages(pg, order);
        }

        /* Finally release the entry array that starts at @sgl. */
        kfree(sgl);
}
EXPORT_SYMBOL(sgl_free_n_order);

/**
 * sgl_free_order - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 * @order: Second argument for __free_pages()
 */
void sgl_free_order(struct scatterlist *sgl, int order)
{
        /* Largest possible entry count: the walk stops where the list ends. */
        const int max_entries = INT_MAX;

        sgl_free_n_order(sgl, max_entries, order);
}
EXPORT_SYMBOL(sgl_free_order);

/**
 * sgl_free - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 */
void sgl_free(struct scatterlist *sgl)
{
        /* Entries are freed as order-0 allocations. */
        const int order = 0;

        sgl_free_order(sgl, order);
}
EXPORT_SYMBOL(sgl_free);

#endif /* CONFIG_SGL_ALLOC */

void __sg_page_iter_start(struct sg_page_iter *piter,
                          struct scatterlist *sglist, unsigned int nents,
                          unsigned long pgoffset)
{
        /* Starting position: first entry, @pgoffset pages into the list. */
        piter->sg = sglist;
        piter->sg_pgoffset = pgoffset;

        /*
         * Number of entries still to visit.  The advance is zero so that
         * the first "next" call lands on the starting position itself.
         */
        piter->__nents = nents;
        piter->__pg_advance = 0;
}
EXPORT_SYMBOL(__sg_page_iter_start);

static int sg_page_count(struct scatterlist *sg)
{
        /* Pages touched by the entry, counted from the start of its page. */
        int npages = PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;

        return npages;
}

bool __sg_page_iter_next(struct sg_page_iter *piter)
{
        /* Iteration is over once the entries or the list itself run out. */
        if (!(piter->__nents && piter->sg))
                return false;

        /* Step forward; every call after the first moves by one page. */
        piter->sg_pgoffset += piter->__pg_advance;
        piter->__pg_advance = 1;

        for (;;) {
                int span = sg_page_count(piter->sg);

                /* The page offset falls inside the current entry: done. */
                if (piter->sg_pgoffset < span)
                        return true;

                /* Otherwise carry the remainder over to the next entry. */
                piter->sg_pgoffset -= span;
                piter->sg = sg_next(piter->sg);
                piter->__nents--;
                if (!piter->__nents || !piter->sg)
                        return false;
        }
}
EXPORT_SYMBOL(__sg_page_iter_next);

static int sg_dma_page_count(struct scatterlist *sg)
{
        /* Same as sg_page_count(), but based on the entry's DMA length. */
        int npages = PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT;

        return npages;
}

bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter)
{
        struct sg_page_iter *it = &dma_iter->base;

        /* Iteration is over once the entries or the list itself run out. */
        if (!(it->__nents && it->sg))
                return false;

        /* Step forward; every call after the first moves by one page. */
        it->sg_pgoffset += it->__pg_advance;
        it->__pg_advance = 1;

        for (;;) {
                int span = sg_dma_page_count(it->sg);

                /* The page offset falls inside the current entry: done. */
                if (it->sg_pgoffset < span)
                        return true;

                /* Otherwise carry the remainder over to the next entry. */
                it->sg_pgoffset -= span;
                it->sg = sg_next(it->sg);
                it->__nents--;
                if (!it->__nents || !it->sg)
                        return false;
        }
}
EXPORT_SYMBOL(__sg_page_iter_dma_next);

/**
 * sg_miter_start - start mapping iteration over a sg list
 * @miter: sg mapping iter to be started
 * @sgl: sg list to iterate over
 * @nents: number of sg entries
 * @flags: sg iterator flags; at least one of SG_MITER_TO_SG and
 *         SG_MITER_FROM_SG is expected, otherwise a warning is raised
 *
 * Description:
 *   Starts mapping iterator @miter.  Any previous content of @miter is
 *   cleared.
 *
 * Context:
 *   Don't care.
 */
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
                    unsigned int nents, unsigned int flags)
{
        const unsigned int direction = SG_MITER_TO_SG | SG_MITER_FROM_SG;

        /* Begin from a fully zeroed iterator positioned at the list start. */
        memset(miter, 0, sizeof(*miter));
        __sg_page_iter_start(&miter->piter, sgl, nents, 0);

        /* A transfer direction should always be given. */
        WARN_ON((flags & direction) == 0);
        miter->__flags = flags;
}
EXPORT_SYMBOL(sg_miter_start);

static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
{
        if (!miter->__remaining) {
                struct scatterlist *sg;

                if (!__sg_page_iter_next(&miter->piter))
                        return false;

                sg = miter->piter.sg;

                miter->__offset = miter->piter.sg_pgoffset ? 0 : sg->offset;
                miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT;
                miter->__offset &= PAGE_SIZE - 1;
                miter->__remaining = sg->offset + sg->length -
                                     (miter->piter.sg_pgoffset << PAGE_SHIFT) -
                                     miter->__offset;
                miter->__remaining = min_t(unsigned long, miter->__remaining,
                                           PAGE_SIZE - miter->__offset);
        }

        return true;
}

/**
 * sg_miter_skip - reposition mapping iterator
 * @miter: sg mapping iter to be skipped
 * @offset: number of bytes to advance from the current location
 *
 * Description:
 *   Sets the offset of @miter to its current location plus @offset bytes.
 *   If mapping iterator @miter has been proceeded by sg_miter_next(), this
 *   stops @miter.
 *
 * Context:
 *   Don't care if @miter is stopped, or not proceeded yet.
 *   Otherwise, preemption disabled if the SG_MITER_ATOMIC is set.
 *
 * Returns:
 *   true if @miter contains the valid mapping.  false if end of sg
 *   list is reached.
 */
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset)
{
        off_t step;

        /* Drop any mapping held from a previous sg_miter_next(). */
        sg_miter_stop(miter);

        for (; offset; offset -= step) {
                /* Ran out of list before @offset bytes were skipped. */
                if (!sg_miter_get_next_page(miter))
                        return false;

                /* Consume as much of the current page as is still needed. */
                step = min_t(off_t, offset, miter->__remaining);
                miter->__offset += step;
                miter->__remaining -= step;
        }

        return true;
}
EXPORT_SYMBOL(sg_miter_skip);

/**
 * sg_miter_next - proceed mapping iterator to the next mapping
 * @miter: sg mapping iter to proceed
 *
 * Description:
 *   Proceeds @miter to the next mapping.  @miter should have been started
 *   using sg_miter_start().  On successful return, @miter->page,
 *   @miter->addr and @miter->length point to the current mapping.
 *
 * Context:
 *   Preemption disabled if SG_MITER_ATOMIC.  Preemption must stay disabled
 *   till @miter is stopped.  May sleep if !SG_MITER_ATOMIC.
 *
 * Returns:
 *   true if @miter contains the next mapping.  false if end of sg
 *   list is reached.
 */
bool sg_miter_next(struct sg_mapping_iter *miter)
{
        void *vaddr;

        /* Release the previous mapping; this also updates the position. */
        sg_miter_stop(miter);

        /* Move on to the next page if the current one is used up. */
        if (!sg_miter_get_next_page(miter))
                return false;

        miter->page = sg_page_iter_page(&miter->piter);

        /* By default the whole mapped length counts as consumed. */
        miter->length = miter->__remaining;
        miter->consumed = miter->length;

        /* Map the page with the flavour selected at start time. */
        if (miter->__flags & SG_MITER_ATOMIC)
                vaddr = kmap_atomic(miter->page);
        else
                vaddr = kmap(miter->page);
        miter->addr = vaddr + miter->__offset;

        return true;
}
EXPORT_SYMBOL(sg_miter_next);

/**
 * sg_miter_stop - stop mapping iteration
 * @miter: sg mapping iter to be stopped
 *
 * Description:
 *   Stops mapping iterator @miter.  @miter should have been started
 *   using sg_miter_start().  A stopped iteration can be resumed by
 *   calling sg_miter_next() on it.  This is useful when resources (kmap)
 *   need to be released during iteration.
 *
 * Context:
 *   Preemption disabled if the SG_MITER_ATOMIC is set.  Don't care
 *   otherwise.
 */
void sg_miter_stop(struct sg_mapping_iter *miter)
{
        WARN_ON(miter->consumed > miter->length);

        /* No mapping is held, so there is nothing to release. */
        if (!miter->addr)
                return;

        /* Account for what the user consumed from the last mapping. */
        miter->__offset += miter->consumed;
        miter->__remaining -= miter->consumed;

        /* Data may have been written into a non-slab page: flush it. */
        if ((miter->__flags & SG_MITER_TO_SG) && !PageSlab(miter->page))
                flush_kernel_dcache_page(miter->page);

        /* Undo the mapping with the call matching how it was created. */
        if (miter->__flags & SG_MITER_ATOMIC) {
                WARN_ON_ONCE(preemptible());
                kunmap_atomic(miter->addr);
        } else {
                kunmap(miter->page);
        }

        /* Forget the released mapping. */
        miter->page = NULL;
        miter->addr = NULL;
        miter->length = 0;
        miter->consumed = 0;
}
EXPORT_SYMBOL(sg_miter_stop);

/**
 * sg_copy_buffer - Copy data between a linear buffer and an SG list
 * @sgl:         The SG list
 * @nents:       Number of SG entries
 * @buf:         Linear buffer; destination when @to_buffer is true, source
 *               otherwise
 * @buflen:      The number of bytes to copy
 * @skip:        Number of bytes to skip in the SG list before copying
 * @to_buffer:   transfer direction (true == from an sg list to a
 *               buffer, false == from a buffer to an sg list)
 *
 * Returns the number of copied bytes, which is 0 when @skip reaches past
 * the end of the SG list.
 *
 **/
size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
                      size_t buflen, off_t skip, bool to_buffer)
{
        struct sg_mapping_iter miter;
        unsigned int copied = 0;
        unsigned int sg_flags;

        /* Atomic mappings; the direction flag follows @to_buffer. */
        sg_flags = SG_MITER_ATOMIC |
                   (to_buffer ? SG_MITER_FROM_SG : SG_MITER_TO_SG);

        sg_miter_start(&miter, sgl, nents, sg_flags);

        if (!sg_miter_skip(&miter, skip))
                return 0;

        while (copied < buflen) {
                unsigned int len;

                /* The SG list ended before @buflen bytes were handled. */
                if (!sg_miter_next(&miter))
                        break;

                len = min(miter.length, buflen - copied);

                if (to_buffer)
                        memcpy(buf + copied, miter.addr, len);
                else
                        memcpy(miter.addr, buf + copied, len);

                copied += len;
        }

        sg_miter_stop(&miter);

        return copied;
}
EXPORT_SYMBOL(sg_copy_buffer);

/**
 * sg_copy_from_buffer - Copy from a linear buffer to an SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy from
 * @buflen:                 The number of bytes to copy
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                           const void *buf, size_t buflen)
{
        /*
         * sg_copy_buffer() takes a non-const pointer for both directions;
         * with to_buffer == false it only reads from the buffer.
         */
        void *src = (void *)buf;

        return sg_copy_buffer(sgl, nents, src, buflen, 0, false);
}
EXPORT_SYMBOL(sg_copy_from_buffer);

/**
 * sg_copy_to_buffer - Copy from an SG list to a linear buffer
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy to
 * @buflen:                 The number of bytes to copy
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                         void *buf, size_t buflen)
{
        /* Start at the very beginning of the list, SG list -> buffer. */
        const off_t skip = 0;

        return sg_copy_buffer(sgl, nents, buf, buflen, skip, true);
}
EXPORT_SYMBOL(sg_copy_to_buffer);

/**
 * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy from
 * @buflen:                 The number of bytes to copy
 * @skip:                 Number of bytes to skip before copying
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                            const void *buf, size_t buflen, off_t skip)
{
        /*
         * sg_copy_buffer() takes a non-const pointer for both directions;
         * with to_buffer == false it only reads from the buffer.
         */
        void *src = (void *)buf;

        return sg_copy_buffer(sgl, nents, src, buflen, skip, false);
}
EXPORT_SYMBOL(sg_pcopy_from_buffer);

/**
 * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy to
 * @buflen:                 The number of bytes to copy
 * @skip:                 Number of bytes to skip before copying
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                          void *buf, size_t buflen, off_t skip)
{
        /* Direction: SG list -> linear buffer, after skipping @skip bytes. */
        const bool to_buffer = true;

        return sg_copy_buffer(sgl, nents, buf, buflen, skip, to_buffer);
}
EXPORT_SYMBOL(sg_pcopy_to_buffer);

/**
 * sg_zero_buffer - Zero-out a part of a SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buflen:                 The number of bytes to zero out
 * @skip:                 Number of bytes to skip before zeroing
 *
 * Returns the number of bytes zeroed.
 **/
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
                       size_t buflen, off_t skip)
{
        unsigned int offset = 0;
        struct sg_mapping_iter miter;
        unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG;

        sg_miter_start(&miter, sgl, nents, sg_flags);

        /*
         * @skip reaches past the end of the list: nothing was zeroed.  The
         * function reports a byte count, so return 0 rather than a boolean,
         * matching sg_copy_buffer().
         */
        if (!sg_miter_skip(&miter, skip))
                return 0;

        while (offset < buflen && sg_miter_next(&miter)) {
                unsigned int len;

                /* Clear at most what is left of the request in this page */
                len = min(miter.length, buflen - offset);
                memset(miter.addr, 0, len);

                offset += len;
        }

        sg_miter_stop(&miter);
        return offset;
}
EXPORT_SYMBOL(sg_zero_buffer);





































































    2 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/xattr.h

  Extended attributes handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
#ifndef _LINUX_XATTR_H
#define _LINUX_XATTR_H


#include <linux/slab.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <uapi/linux/xattr.h>

struct inode;
struct dentry;

/*
 * struct xattr_handler: When @name is set, match attributes with exactly that
 * name.  When @prefix is set instead, match attributes with that prefix and
 * with a non-empty suffix.
 *
 * The @get and @set callbacks receive the handler itself as their first
 * argument.
 * NOTE(review): the exact contract of @list, @get and @set (return values,
 * meaning of the @name argument) is not visible in this header - confirm
 * against the callers before relying on it.
 */
struct xattr_handler {
        /* Exact attribute name to match, or unset when @prefix is used */
        const char *name;
        /* Attribute name prefix to match, or unset when @name is used */
        const char *prefix;
        int flags;      /* fs private flags */
        /* Per-dentry predicate; presumably "should this be listed?" */
        bool (*list)(struct dentry *dentry);
        /* Read callback: fills @buffer of @size bytes */
        int (*get)(const struct xattr_handler *, struct dentry *dentry,
                   struct inode *inode, const char *name, void *buffer,
                   size_t size);
        /* Write callback: takes @size bytes from @buffer plus set flags */
        int (*set)(const struct xattr_handler *, struct dentry *dentry,
                   struct inode *inode, const char *name, const void *buffer,
                   size_t size, int flags);
};

const char *xattr_full_name(const struct xattr_handler *, const char *);

/*
 * A single extended attribute as a name/value pair.
 * NOTE(review): ownership of @name and @value is not visible here - confirm
 * against the users of this structure.
 */
struct xattr {
        const char *name;
        void *value;
        size_t value_len;       /* presumably the length of @value in bytes */
};

ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
int __vfs_setxattr(struct dentry *, struct inode *, const char *, const void *, size_t, int);
int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int);
int __vfs_setxattr_locked(struct dentry *, const char *, const void *, size_t, int, struct inode **);
int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
int __vfs_removexattr(struct dentry *, const char *);
int __vfs_removexattr_locked(struct dentry *, const char *, struct inode **);
int vfs_removexattr(struct dentry *, const char *);

ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
                           char **xattr_value, size_t size, gfp_t flags);

int xattr_supported_namespace(struct inode *inode, const char *prefix);

static inline const char *xattr_prefix(const struct xattr_handler *handler)
{
        /* Prefer the prefix when the handler has one ... */
        if (handler->prefix)
                return handler->prefix;

        /* ... and fall back to the exact name otherwise. */
        return handler->name;
}

/*
 * Container for a set of simple_xattr entries; set up with
 * simple_xattrs_init() and emptied with simple_xattrs_free().
 */
struct simple_xattrs {
        /* List of struct simple_xattr, linked through their ->list member */
        struct list_head head;
        /* Initialized by simple_xattrs_init(); presumably guards @head */
        spinlock_t lock;
};

/*
 * One entry on a simple_xattrs list.  The value is stored inline at the end
 * of the structure; the name is a separate allocation that
 * simple_xattrs_free() releases with kfree() before releasing the entry
 * itself with kvfree().
 */
struct simple_xattr {
        /* Linkage into simple_xattrs.head */
        struct list_head list;
        char *name;
        /* Presumably the number of bytes stored in @value - confirm */
        size_t size;
        /* Flexible array member holding the attribute value */
        char value[];
};

/*
 * Prepare a simple_xattrs container for use: its lock is initialized and
 * its list starts out empty.
 */
static inline void simple_xattrs_init(struct simple_xattrs *xattrs)
{
        spin_lock_init(&xattrs->lock);
        INIT_LIST_HEAD(&xattrs->head);
}

/*
 * Release every xattr on the list: first the separately allocated name,
 * then the entry itself.  Entries are not unlinked and the lock is not
 * taken here.
 */
static inline void simple_xattrs_free(struct simple_xattrs *xattrs)
{
        struct simple_xattr *entry, *next;

        /* The "safe" walk keeps @next valid while @entry is being freed. */
        list_for_each_entry_safe(entry, next, &xattrs->head, list) {
                kfree(entry->name);
                kvfree(entry);
        }
}

struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size);
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
                     const void *value, size_t size, int flags,
                     ssize_t *removed_size);
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
                          size_t size);
void simple_xattr_list_add(struct simple_xattrs *xattrs,
                           struct simple_xattr *new_xattr);

#endif        /* _LINUX_XATTR_H */











































































































































































































































































































































































































































































































































































   10 




    1 

















    2 












    2 



    1 





























































    9 









    2 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>


/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        ((slab_flags_t __force)0x00000100U)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE                ((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
#define SLAB_POISON                ((slab_flags_t __force)0x00000800U)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN        ((slab_flags_t __force)0x00002000U)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                ((slab_flags_t __force)0x00004000U)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        ((slab_flags_t __force)0x00008000U)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                ((slab_flags_t __force)0x00010000U)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC                ((slab_flags_t __force)0x00040000U)
/*
 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 *  rcu_read_lock()
 * again:
 *  obj = lockless_lookup(key);
 *  if (obj) {
 *    if (!try_get_ref(obj)) // might fail for free objects
 *      goto again;
 *
 *    if (obj->key != key) { // not the object we expected
 *      put_ref(obj);
 *      goto again;
 *    }
 *  }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
/* Defer freeing slabs to RCU */
#define SLAB_TYPESAFE_BY_RCU        ((slab_flags_t __force)0x00080000U)
/* Spread some memory over cpuset */
#define SLAB_MEM_SPREAD                ((slab_flags_t __force)0x00100000U)
/* Trace allocations and frees */
#define SLAB_TRACE                ((slab_flags_t __force)0x00200000U)

/* Flag to prevent checks on free */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        ((slab_flags_t __force)0x00400000U)
#else
# define SLAB_DEBUG_OBJECTS        0
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        ((slab_flags_t __force)0x00800000U)

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                ((slab_flags_t __force)0x02000000U)
#else
# define SLAB_FAILSLAB                0
#endif
/* Account to memcg */
#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT                ((slab_flags_t __force)0x04000000U)
#else
# define SLAB_ACCOUNT                0
#endif

#ifdef CONFIG_KASAN
#define SLAB_KASAN                ((slab_flags_t __force)0x08000000U)
#else
#define SLAB_KASAN                0
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT        ((slab_flags_t __force)0x00020000U)
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab deactivation flag */
#define SLAB_DEACTIVATED        ((slab_flags_t __force)0x10000000U)

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

/*
 * True when x, viewed as an unsigned long, is no greater than ZERO_SIZE_PTR.
 * A single comparison therefore covers both NULL and ZERO_SIZE_PTR itself
 * (as well as any other value in between).
 */
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
void __init kmem_cache_init(void);
bool slab_is_available(void);

extern bool usercopy_fallback;

struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
                        unsigned int align, slab_flags_t flags,
                        void (*ctor)(void *));
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
                        unsigned int size, unsigned int align,
                        slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize,
                        void (*ctor)(void *));
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The alignment of the struct determines object alignment. If you
 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 */
#define KMEM_CACHE(__struct, __flags)                                        \
                kmem_cache_create(#__struct, sizeof(struct __struct),        \
                        __alignof__(struct __struct), (__flags), NULL)

/*
 * To whitelist a single field for copying to/from usercopy, use this
 * macro instead of KMEM_CACHE() above.
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                        \
                kmem_cache_create_usercopy(#__struct,                        \
                        sizeof(struct __struct),                        \
                        __alignof__(struct __struct), (__flags),        \
                        offsetof(struct __struct, __field),                \
                        sizeof_field(struct __struct, __field), NULL)

/*
 * Common kmalloc functions provided by all allocators
 */
void * __must_check krealloc(const void *, size_t, gfp_t);
void kfree(const void *);
void kfree_sensitive(const void *);
size_t __ksize(const void *);
size_t ksize(const void *);

DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
                        bool to_user);
#else
static inline void __check_heap_object(const void *ptr, unsigned long n,
                                       struct page *page, bool to_user) { }
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
 */
#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
 * Intended for arches that get misalignment faults even for 64 bit integer
 * aligned buffers.
 */
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
 * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
 * aligned pointers.
 */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

#ifdef CONFIG_SLAB
/*
 * The largest kmalloc size supported by the SLAB allocators is
 * 32 megabyte (2^25) or the maximum allocatable page order if that is
 * less than 32 MB.
 *
 * WARNING: It's not easy to increase this value since the allocators have
 * to do various tricks to work around compiler limitations in order to
 * ensure proper constant folding.
 */
#define KMALLOC_SHIFT_HIGH        ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX        KMALLOC_SHIFT_HIGH
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        5
#endif
#endif

#ifdef CONFIG_SLUB
/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX        (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif
#endif

#ifdef CONFIG_SLOB
/*
 * SLOB passes all requests larger than one page to the page allocator.
 * No kmalloc array is necessary since objects of different sizes can
 * be allocated from the same page.
 */
#define KMALLOC_SHIFT_HIGH        PAGE_SHIFT
#define KMALLOC_SHIFT_MAX        (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use
 * byte sized index which can represent 2^8 entries, the size of the object
 * should be greater than or equal to 2^12 / 2^8 = 2^4 = 16.
 * If the minimum size of kmalloc is less than 16, we use it as the minimum
 * object size and give up on using a byte sized index.
 */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

/*
 * Whenever changing this, take care that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 */
enum kmalloc_cache_type {
        /* Chosen by kmalloc_type() when neither __GFP_DMA nor __GFP_RECLAIMABLE is set. */
        KMALLOC_NORMAL = 0,
        /* Chosen by kmalloc_type() for __GFP_RECLAIMABLE requests (unless __GFP_DMA wins). */
        KMALLOC_RECLAIM,
#ifdef CONFIG_ZONE_DMA
        /* Chosen by kmalloc_type() whenever __GFP_DMA is set; only exists with CONFIG_ZONE_DMA. */
        KMALLOC_DMA,
#endif
        /* Number of types above; used as the first dimension of kmalloc_caches[][]. */
        NR_KMALLOC_TYPES
};

#ifndef CONFIG_SLOB
extern struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];

static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
        /*
         * Map the GFP flags of a request to the row of kmalloc_caches[][]
         * that should serve it.
         */
#ifdef CONFIG_ZONE_DMA
        /*
         * KMALLOC_NORMAL is by far the most frequent answer, so both
         * flags are checked together and cost only one branch.
         */
        if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
                return KMALLOC_NORMAL;

        /*
         * From here on at least one flag is set. __GFP_DMA takes
         * precedence when both are present.
         */
        if (flags & __GFP_DMA)
                return KMALLOC_DMA;

        return KMALLOC_RECLAIM;
#else
        /* Without a DMA zone only reclaimability distinguishes the caches. */
        if (flags & __GFP_RECLAIMABLE)
                return KMALLOC_RECLAIM;

        return KMALLOC_NORMAL;
#endif
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to. The result is used as the second index into kmalloc_caches[][].
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Anything up to KMALLOC_MIN_SIZE maps to KMALLOC_SHIFT_LOW. Indices 1 and 2
 * are the non-power-of-two slots and are only considered when
 * KMALLOC_MIN_SIZE is small enough (<= 32 and <= 64 respectively).
 * Sizes above 64 MB hit BUG().
 *
 * The body is a flat chain of comparisons against literals; the callers in
 * this header (kmalloc() and kmalloc_node()) only invoke it when the size is
 * a compile-time constant.
 */
static __always_inline unsigned int kmalloc_index(size_t size)
{
        /* Zero-sized request: index 0, which callers turn into ZERO_SIZE_PTR. */
        if (!size)
                return 0;

        /* Everything at or below the minimum object size shares the smallest cache. */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /* The 96 and 192 byte slots must be tested before the power-of-two ladder. */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Power-of-two ladder: the first bound that fits gives log2 of the slot size. */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;
        if (size <=  4 * 1024 * 1024) return 22;
        if (size <=  8 * 1024 * 1024) return 23;
        if (size <=  16 * 1024 * 1024) return 24;
        if (size <=  32 * 1024 * 1024) return 25;
        if (size <=  64 * 1024 * 1024) return 26;
        /* Larger than the biggest slot handled here. */
        BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
#endif /* !CONFIG_SLOB */

void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *, void *);

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);

/*
 * Bulk counterpart for kmalloc()ed memory: forwards @size and @p to
 * kmem_cache_free_bulk() with a NULL cache pointer.
 *
 * Caller must not use kfree_bulk() on memory not originally allocated
 * by kmalloc(), because the SLOB allocator cannot handle this.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        kmem_cache_free_bulk(NULL, size, p);
}

#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
#else
/* !CONFIG_NUMA variant: @node is ignored and the request goes to __kmalloc(). */
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
        return __kmalloc(size, flags);
}

/* !CONFIG_NUMA variant: @node is ignored and the request goes to kmem_cache_alloc(). */
static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
{
        return kmem_cache_alloc(s, flags);
}
#endif

#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;

#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
                                           gfp_t gfpflags,
                                           int node, size_t size) __assume_slab_alignment __malloc;
#else
/*
 * CONFIG_TRACING without CONFIG_NUMA: @node is dropped and the call is
 * forwarded to kmem_cache_alloc_trace().
 */
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
                              gfp_t gfpflags,
                              int node, size_t size)
{
        return kmem_cache_alloc_trace(s, gfpflags, size);
}
#endif /* CONFIG_NUMA */

#else /* CONFIG_TRACING */
static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
                gfp_t flags, size_t size)
{
        /*
         * !CONFIG_TRACING fallback: take an object from the cache, then run
         * it through kasan_kmalloc() along with the requested size and
         * return whatever that call yields.
         */
        return kasan_kmalloc(s, kmem_cache_alloc(s, flags), size, flags);
}

static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
                              gfp_t gfpflags,
                              int node, size_t size)
{
        /*
         * !CONFIG_TRACING fallback: take an object from the cache on the
         * given node, then run it through kasan_kmalloc() along with the
         * requested size and return whatever that call yields.
         */
        return kasan_kmalloc(s, kmem_cache_alloc_node(s, gfpflags, node),
                             size, gfpflags);
}
#endif /* CONFIG_TRACING */

extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;

#ifdef CONFIG_TRACING
extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#else
/* !CONFIG_TRACING variant: plain pass-through to kmalloc_order(). */
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
        return kmalloc_order(size, flags, order);
}
#endif

static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
        /*
         * Derive the page order from the byte count with get_order() and
         * let kmalloc_order_trace() perform the allocation.
         */
        return kmalloc_order_trace(size, flags, get_order(size));
}

/**
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For a @size that is a power of two, the alignment is also
 * guaranteed to be at least the size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * %GFP_HIGHUSER
 *        Allocate memory from high memory on behalf of user.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
#ifndef CONFIG_SLOB
        unsigned int slot;
#endif

        /* Sizes not known at compile time always take the out-of-line path. */
        if (!__builtin_constant_p(size))
                return __kmalloc(size, flags);

        /* Constant sizes beyond the largest kmalloc cache go to kmalloc_large(). */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large(size, flags);

#ifndef CONFIG_SLOB
        /* Pick the cache slot for this constant size. */
        slot = kmalloc_index(size);
        if (slot == 0)
                return ZERO_SIZE_PTR;

        return kmem_cache_alloc_trace(kmalloc_caches[kmalloc_type(flags)][slot],
                                      flags, size);
#else
        /* SLOB has no kmalloc_caches[] array to index into. */
        return __kmalloc(size, flags);
#endif
}

/*
 * Node-aware counterpart of kmalloc(): constant sizes that fit a kmalloc
 * cache are served straight from kmalloc_caches[][], everything else is
 * handed to __kmalloc_node().
 */
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
        unsigned int slot;

        /* Only compile-time constant sizes within cache range use the fast path. */
        if (!__builtin_constant_p(size) || size > KMALLOC_MAX_CACHE_SIZE)
                return __kmalloc_node(size, flags, node);

        slot = kmalloc_index(size);
        if (slot == 0)
                return ZERO_SIZE_PTR;

        return kmem_cache_alloc_node_trace(kmalloc_caches[kmalloc_type(flags)][slot],
                                           flags, node, size);
#else
        return __kmalloc_node(size, flags, node);
#endif
}

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        /* Give up with NULL when check_mul_overflow() flags n * size. */
        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /*
         * kmalloc() is only worth calling when both factors are
         * compile-time constants; otherwise go straight to __kmalloc().
         */
        if (!__builtin_constant_p(n) || !__builtin_constant_p(size))
                return __kmalloc(total, flags);

        return kmalloc(total, flags);
}

/**
 * kcalloc - allocate memory for an array. The memory is set to zero.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Forwards to kmalloc_array() with __GFP_ZERO OR'ed into @flags, so the
 * result is NULL when kmalloc_array() detects that @n * @size overflows.
 */
static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
        return kmalloc_array(n, size, flags | __GFP_ZERO);
}

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
        __kmalloc_track_caller(size, flags, _RET_IP_)

/*
 * Array allocation on a given node: computes n * size with an overflow
 * check and allocates that many bytes via kmalloc_node()/__kmalloc_node().
 */
static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
                                       int node)
{
        size_t total;

        /* Give up with NULL when check_mul_overflow() flags n * size. */
        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /*
         * kmalloc_node() is only worth calling when both factors are
         * compile-time constants; otherwise go straight to __kmalloc_node().
         */
        if (!__builtin_constant_p(n) || !__builtin_constant_p(size))
                return __kmalloc_node(total, flags, node);

        return kmalloc_node(total, flags, node);
}

/* Same as kmalloc_array_node(), but with __GFP_ZERO OR'ed into @flags. */
static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
{
        return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
}


#ifdef CONFIG_NUMA
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
        __kmalloc_node_track_caller(size, flags, node, \
                        _RET_IP_)

#else /* CONFIG_NUMA */

#define kmalloc_node_track_caller(size, flags, node) \
        kmalloc_track_caller(size, flags)

#endif /* CONFIG_NUMA */

/*
 * Shortcuts
 */
/* Allocate one object from cache @k with __GFP_ZERO OR'ed into @flags. */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
        return kmem_cache_alloc(k, flags | __GFP_ZERO);
}

/**
 * kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Equivalent to calling kmalloc() with __GFP_ZERO OR'ed into @flags.
 */
static inline void *kzalloc(size_t size, gfp_t flags)
{
        return kmalloc(size, flags | __GFP_ZERO);
}

/**
 * kzalloc_node - allocate zeroed memory from a particular memory node.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 * @node: memory node from which to allocate
 *
 * Equivalent to calling kmalloc_node() with __GFP_ZERO OR'ed into @flags.
 */
static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
{
        return kmalloc_node(size, flags | __GFP_ZERO, node);
}

unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);

#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
int slab_prepare_cpu(unsigned int cpu);
int slab_dead_cpu(unsigned int cpu);
#else
#define slab_prepare_cpu        NULL
#define slab_dead_cpu                NULL
#endif

#endif        /* _LINUX_SLAB_H */



























































































































    1 





    1 
    1 





    1 

    1 















    1 


















    1 
























    1 







































    1 






























    1 




























    1 





    1 













    1 







    1 
    1 







    1 
    1 
    1 


    1 








    1 






    1 




    1 
    1 






    2 





    2 


































    2 








    2 























































































    2 
    2 




    2 





    2 


































    2 














































































    2 





















    2 

















    2 

    2 






































    1 


















    1 
























    1 








    1 


    1 
































































    1 






    1 












    1 


    1 






    1 







    1 










    1 





    1 





    1 










    1 































    1 

















    1 

    1 

    1 


























































































































































































































































































































































































































































































































    1 































































































































































































































    4 












    4 










    4 




























    1 










    1 








































    4 


    4 


    4 


















    4 
    4 





    4 



    1 

    1 

    1 


    1 



    1 



    1 


    1 











    1 






































    1 



    1 



    1 














    1 











    1 
































































































































































    2 







    2 



































    2 


    2 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 
















    1 









    1 














    1 

















    1 





    1 


























    1 







    1 




    1 



    1 













































    1 

    1 



    1 


















    1 






    1 




    1 






















    1 





    1 
    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem                (truncate_pagecache)
 *    ->private_lock                (__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock                (exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_mutex
 *    ->i_mmap_rwsem                (truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
 *        ->i_pages lock        (arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->lock_page                (access_process_vm)
 *
 *  ->i_mutex                        (generic_perform_write)
 *    ->mmap_lock                (fault_in_pages_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock                        (fs/fs-writeback.c)
 *    ->i_pages lock                (__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock                (vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock        (anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock                (try_to_unmap_one)
 *    ->private_lock                (try_to_unmap_one)
 *    ->i_pages lock                (try_to_unmap_one)
 *    ->pgdat->lru_lock                (follow_page->mark_page_accessed)
 *    ->pgdat->lru_lock                (check_pte_range->isolate_lru_page)
 *    ->private_lock                (page_remove_rmap->set_page_dirty)
 *    ->i_pages lock                (page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock                (page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock        (page_remove_rmap->lock_page_memcg)
 *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
 *    ->inode->i_lock                (zap_pte_range->set_page_dirty)
 *    ->private_lock                (zap_pte_range->__set_page_dirty_buffers)
 *
 * ->i_mmap_rwsem
 *   ->tasklist_lock            (memory_failure, collect_procs_ao)
 */

/*
 * page_cache_delete - unhook a page from its mapping's xarray
 * @mapping: address space the page currently belongs to
 * @page: locked, non-tail page to remove
 * @shadow: entry to store in place of the page, or NULL
 *
 * Stores @shadow over the page's entry in mapping->i_pages, reinitialises
 * the xarray marks for it, clears page->mapping and adjusts the mapping's
 * nrpages / nrexceptional counters.  The page's reference count is not
 * touched here.
 */
static void page_cache_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, page->index);
        /* Number of base pages this entry accounts for in the counters. */
        unsigned int nr = 1;

        mapping_set_update(&xas, mapping);

        /* hugetlb pages are represented by a single entry in the xarray */
        if (!PageHuge(page)) {
                /* Operate on the whole compound page, not just one index. */
                xas_set_order(&xas, page->index, compound_order(page));
                nr = compound_nr(page);
        }

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* A shadow entry may only be left behind for a single-page entry. */
        VM_BUG_ON_PAGE(nr != 1 && shadow, page);

        xas_store(&xas, shadow);
        xas_init_marks(&xas);

        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */

        if (shadow) {
                mapping->nrexceptional += nr;
                /*
                 * Make sure the nrexceptional update is committed before
                 * the nrpages update so that final truncate racing
                 * with reclaim does not see both counters 0 at the
                 * same time and miss a shadow entry.
                 */
                smp_wmb();
        }
        mapping->nrpages -= nr;
}

/*
 * unaccount_page_cache_page - undo page cache accounting for a page
 * @mapping: address space the page is being removed from
 * @page: non-tail page that is about to leave the page cache
 *
 * Hands the page to cleancache (or invalidates its cleancache entry),
 * complains if the page is still mapped, and subtracts the page from the
 * NR_FILE_PAGES / NR_SHMEM / THP statistics.  hugetlb pages skip the
 * statistics part entirely.  The xarray itself is not modified here.
 */
static void unaccount_page_cache_page(struct address_space *mapping,
                                      struct page *page)
{
        /* Number of base pages to subtract from the per-lruvec counters. */
        int nr;

        /*
         * if we're uptodate, flush out into the cleancache, otherwise
         * invalidate any existing cleancache entries.  We can't leave
         * stale data around in the cleancache once our page is gone
         */
        if (PageUptodate(page) && PageMappedToDisk(page))
                cleancache_put_page(page);
        else
                cleancache_invalidate_page(mapping, page);

        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_mapped(page), page);
        /*
         * This branch is only taken when CONFIG_DEBUG_VM is off: report the
         * still-mapped page loudly, taint the kernel and attempt a
         * best-effort recovery.
         */
        if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
                int mapcount;

                pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
                         current->comm, page_to_pfn(page));
                dump_page(page, "still mapped when deleted");
                dump_stack();
                add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

                mapcount = page_mapcount(page);
                /*
                 * NOTE(review): the "+ 2" presumably stands for the page
                 * cache's and the caller's references - confirm.
                 */
                if (mapping_exiting(mapping) &&
                    page_count(page) >= mapcount + 2) {
                        /*
                         * All vmas have already been torn down, so it's
                         * a good bet that actually the page is unmapped,
                         * and we'd prefer not to leak it: if we're wrong,
                         * some other bad page check should catch it later.
                         */
                        page_mapcount_reset(page);
                        page_ref_sub(page, mapcount);
                }
        }

        /* hugetlb pages do not participate in page cache accounting. */
        if (PageHuge(page))
                return;

        nr = thp_nr_pages(page);

        /* Every page counts as file pages; shmem and THP counters are extra. */
        __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
        if (PageSwapBacked(page)) {
                __mod_lruvec_page_state(page, NR_SHMEM, -nr);
                if (PageTransHuge(page))
                        __dec_node_page_state(page, NR_SHMEM_THPS);
        } else if (PageTransHuge(page)) {
                __dec_node_page_state(page, NR_FILE_THPS);
                filemap_nr_thps_dec(mapping);
        }

        /*
         * At this point page must be either written or cleaned by
         * truncate.  Dirty page here signals a bug and loss of
         * unwritten data.
         *
         * This fixes dirty accounting after removing the page entirely
         * but leaves PageDirty set: it has no effect for truncated
         * page and anyway will be cleared before returning page into
         * buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
                account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache.  The page cache's reference to the
 * page is not dropped here; that is left to the caller.  Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __delete_from_page_cache(struct page *page, void *shadow)
{
        struct address_space *owner;

        /* Read the mapping up front: page_cache_delete() clears page->mapping. */
        owner = page->mapping;

        trace_mm_filemap_delete_from_page_cache(page);

        /* Fix up the statistics first, then unhook the page from the xarray. */
        unaccount_page_cache_page(owner, page);
        page_cache_delete(owner, page, shadow);
}

/*
 * Drop the reference(s) a page held on behalf of the page cache, after
 * giving the address space's ->freepage hook (if any) a chance to run.
 */
static void page_cache_free_page(struct address_space *mapping,
                                struct page *page)
{
        void (*release_hook)(struct page *) = mapping->a_ops->freepage;

        if (release_hook)
                release_hook(page);

        /* Anything that is not a non-hugetlb THP gives up a single reference. */
        if (!PageTransHuge(page) || PageHuge(page)) {
                put_page(page);
                return;
        }

        /*
         * THP: drop thp_nr_pages() references in one go.  The count must
         * stay positive afterwards.
         */
        page_ref_sub(page, thp_nr_pages(page));
        VM_BUG_ON_PAGE(page_count(page) <= 0, page);
}

/**
 * delete_from_page_cache - delete page from page cache
 * @page: the page which the kernel is trying to remove from page cache
 *
 * This must be called only on pages that have been verified to be in the page
 * cache and locked.  It will never put the page into the free list, the caller
 * has a reference on the page.
 */
void delete_from_page_cache(struct page *page)
{
        unsigned long irq_state;
        struct address_space *owner;

        owner = page_mapping(page);
        BUG_ON(!PageLocked(page));

        /* The removal itself runs under the i_pages lock with IRQs saved. */
        xa_lock_irqsave(&owner->i_pages, irq_state);
        __delete_from_page_cache(page, NULL);
        xa_unlock_irqrestore(&owner->i_pages, irq_state);

        /* Release the page cache's reference(s) once the lock is dropped. */
        page_cache_free_page(owner, page);
}
EXPORT_SYMBOL(delete_from_page_cache);

/*
 * page_cache_delete_batch - delete several pages from page cache
 * @mapping: the mapping to which pages belong
 * @pvec: pagevec with pages to delete
 *
 * The function walks over mapping->i_pages and removes pages passed in @pvec
 * from the mapping. The function expects @pvec to be sorted by page index
 * and is optimised for it to be dense.
 * It tolerates holes in @pvec (mapping entries at those indices are not
 * modified). The function expects only THP head pages to be present in the
 * @pvec.
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
                             struct pagevec *pvec)
{
        /* Start the walk at the index of the first page in the batch. */
        XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
        /* Number of xarray slots cleared; subtracted from nrpages at the end. */
        int total_pages = 0;
        /* Position in @pvec of the page we expect to meet next. */
        int i = 0;
        struct page *page;

        mapping_set_update(&xas, mapping);
        xas_for_each(&xas, page, ULONG_MAX) {
                /* Every page in @pvec has been dealt with: stop walking. */
                if (i >= pagevec_count(pvec))
                        break;

                /* A swap/dax/shadow entry got inserted? Skip it. */
                if (xa_is_value(page))
                        continue;
                /*
                 * A page got inserted in our range? Skip it. We have our
                 * pages locked so they are protected from being removed.
                 * If we see a page whose index is higher than ours, it
                 * means our page has been removed, which shouldn't be
                 * possible because we're holding the PageLock.
                 */
                if (page != pvec->pages[i]) {
                        VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
                                        page);
                        continue;
                }

                WARN_ON_ONCE(!PageLocked(page));

                /*
                 * Detach the page only when visiting its first index; a
                 * compound page can be seen again at its following indices.
                 */
                if (page->index == xas.xa_index)
                        page->mapping = NULL;
                /* Leave page->index set: truncation lookup relies on it */

                /*
                 * Move to the next page in the vector if this is a regular
                 * page or the index is of the last sub-page of this compound
                 * page.
                 */
                if (page->index + compound_nr(page) - 1 == xas.xa_index)
                        i++;
                /* Clear the slot at the current index. */
                xas_store(&xas, NULL);
                total_pages++;
        }
        mapping->nrpages -= total_pages;
}

/*
 * Remove every page in @pvec from @mapping under a single acquisition of
 * the i_pages lock, then release the page cache's references to them.
 * An empty @pvec is a no-op.
 */
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct pagevec *pvec)
{
        unsigned long irq_state;
        int idx;

        if (pagevec_count(pvec) == 0)
                return;

        xa_lock_irqsave(&mapping->i_pages, irq_state);

        /* Pass 1 (locked): trace and unaccount each page. */
        for (idx = 0; idx < pagevec_count(pvec); idx++) {
                struct page *victim = pvec->pages[idx];

                trace_mm_filemap_delete_from_page_cache(victim);
                unaccount_page_cache_page(mapping, victim);
        }

        /* Pass 2 (locked): clear all of their xarray slots in one walk. */
        page_cache_delete_batch(mapping, pvec);

        xa_unlock_irqrestore(&mapping->i_pages, irq_state);

        /* Pass 3 (unlocked): drop the page cache references. */
        for (idx = 0; idx < pagevec_count(pvec); idx++) {
                struct page *victim = pvec->pages[idx];

                page_cache_free_page(mapping, victim);
        }
}

/*
 * Report and clear the AS_ENOSPC / AS_EIO flags of @mapping.
 *
 * Returns -EIO if AS_EIO was set, else -ENOSPC if AS_ENOSPC was set, else 0.
 * Both flags are cleared when found set, even though only one error is
 * returned.
 */
int filemap_check_errors(struct address_space *mapping)
{
        int status = 0;

        /* Plain test_bit() first; the atomic clear only runs if the bit is set. */
        if (test_bit(AS_ENOSPC, &mapping->flags)) {
                if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
                        status = -ENOSPC;
        }

        /* Checked last so that -EIO overrides -ENOSPC. */
        if (test_bit(AS_EIO, &mapping->flags)) {
                if (test_and_clear_bit(AS_EIO, &mapping->flags))
                        status = -EIO;
        }

        return status;
}
EXPORT_SYMBOL(filemap_check_errors);

/*
 * Report the AS_EIO / AS_ENOSPC flags of @mapping without clearing them.
 * -EIO takes precedence over -ENOSPC; 0 means neither flag is set.
 */
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
        int status = 0;

        if (test_bit(AS_EIO, &mapping->flags))
                status = -EIO;
        else if (test_bit(AS_ENOSPC, &mapping->flags))
                status = -ENOSPC;

        return status;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 * @sync_mode:        enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end, int sync_mode)
{
        struct writeback_control ctl = {
                .sync_mode = sync_mode,
                .nr_to_write = LONG_MAX,
                .range_start = start,
                .range_end = end,
        };
        int status;

        /* Nothing to do for mappings that cannot be written back ... */
        if (!mapping_can_writeback(mapping))
                return 0;
        /* ... or that carry no dirty tag at all. */
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;

        /* Bracket the writeback call with inode attach/detach on the wbc. */
        wbc_attach_fdatawrite_inode(&ctl, mapping->host);
        status = do_writepages(mapping, &ctl);
        wbc_detach_inode(&ctl);

        return status;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
        int sync_mode)
{
        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

/*
 * Start WB_SYNC_ALL writeback over the whole of @mapping.
 * Returns the result of __filemap_fdatawrite().
 */
int filemap_fdatawrite(struct address_space *mapping)
{
        int status;

        status = __filemap_fdatawrite(mapping, WB_SYNC_ALL);
        return status;
}
EXPORT_SYMBOL(filemap_fdatawrite);

/*
 * Start WB_SYNC_ALL writeback for the byte range @start..@end (inclusive)
 * of @mapping.  Returns the result of __filemap_fdatawrite_range().
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end)
{
        int status;

        status = __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
        return status;
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:        target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
        int status;

        /* WB_SYNC_NONE: the non-integrity counterpart of filemap_fdatawrite(). */
        status = __filemap_fdatawrite(mapping, WB_SYNC_NONE);
        return status;
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t last_index = end_byte >> PAGE_SHIFT;
        struct page *found;

        /* An inverted range cannot contain anything. */
        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        /*
         * Keep searching while the lookup has to be retried or the entry is
         * a value (shadow) entry, which does not count as a page.  The loop
         * ends on the first real page or when the range is exhausted (NULL).
         *
         * The page is deliberately not pinned: the RCU lock is about to be
         * released anyway, and knowing that a page was here recently is
         * sufficient.
         */
        do {
                found = xas_find(&xas, last_index);
        } while (xas_retry(&xas, found) || xa_is_value(found));
        rcu_read_unlock();

        return found != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

/*
 * Wait for writeback on every page of @mapping tagged
 * PAGECACHE_TAG_WRITEBACK within the byte range @start_byte..@end_byte
 * (inclusive), clearing each page's error flag afterwards.  Does nothing
 * for an inverted range.  Error reporting is left to the callers.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
                                     loff_t start_byte, loff_t end_byte)
{
        struct pagevec batch;
        pgoff_t cursor;
        pgoff_t last;
        int found;

        if (end_byte < start_byte)
                return;

        cursor = start_byte >> PAGE_SHIFT;
        last = end_byte >> PAGE_SHIFT;

        pagevec_init(&batch);
        while (cursor <= last) {
                unsigned int slot;

                /* Fetch the next batch of tagged pages; this advances cursor. */
                found = pagevec_lookup_range_tag(&batch, mapping, &cursor,
                                                 last, PAGECACHE_TAG_WRITEBACK);
                if (found == 0)
                        break;

                for (slot = 0; slot < found; slot++) {
                        struct page *pending = batch.pages[slot];

                        wait_on_page_writeback(pending);
                        ClearPageError(pending);
                }

                /* Release the batch and yield before looking for more. */
                pagevec_release(&batch);
                cond_resched();
        }
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                            loff_t end_byte)
{
        int status;

        /* Wait for the range, then collect (and clear) the mapping's errors. */
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        status = filemap_check_errors(mapping);

        return status;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte)
{
        int status;

        /* Wait for the range, then read the error flags without clearing them. */
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        status = filemap_check_and_keep_errors(mapping);

        return status;
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:                file pointing to address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
        int status;

        /* The wait operates on the address space behind @file ... */
        __filemap_fdatawait_range(file->f_mapping, start_byte, end_byte);

        /* ... while the error is reported through the file itself. */
        status = file_check_and_advance_wb_err(file);
        return status;
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
        int status;

        /* Whole mapping: byte 0 through LLONG_MAX. */
        __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
        status = filemap_check_and_keep_errors(mapping);

        return status;
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
        /*
         * DAX mappings are judged by their exceptional-entry count, all
         * other mappings by their page count.
         */
        if (dax_mapping(mapping))
                return mapping->nrexceptional != 0;

        return mapping->nrpages != 0;
}

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:        the address_space for the pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
                                 loff_t lstart, loff_t lend)
{
        int write_err, wait_err;

        /* Nothing to write: just report the mapping's stored errors. */
        if (!mapping_needs_writeback(mapping))
                return filemap_check_errors(mapping);

        write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                               WB_SYNC_ALL);

        /*
         * -EIO is the special case: it may indicate the worst thing
         * (e.g. a bug) happened, so do not wait; only clear any
         * previously stored errors.
         */
        if (write_err == -EIO) {
                filemap_check_errors(mapping);
                return write_err;
        }

        /*
         * Any other write error (e.g. -ENOSPC) may still have left the
         * pages partially written, so wait regardless.  A write error
         * takes precedence over the result of the wait.
         */
        wait_err = filemap_fdatawait_range(mapping, lstart, lend);

        return write_err ? write_err : wait_err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/*
 * Record @err in @mapping->wb_err through errseq_set() and emit the
 * filemap_set_wb_err trace event with the value errseq_set() returned.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
        errseq_t recorded;

        recorded = errseq_set(&mapping->wb_err, err);
        trace_filemap_set_wb_err(mapping, recorded);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) not previously
 *                                    reported and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
        int ret = 0;
        errseq_t cursor = READ_ONCE(file->f_wb_err);
        struct address_space *mapping = file->f_mapping;

        /*
         * Common case, handled without taking f_lock: a zero result from
         * errseq_check() means nothing changed since @cursor was sampled.
         */
        if (errseq_check(&mapping->wb_err, cursor) != 0) {
                /* Slow path: re-sample and advance f_wb_err under f_lock. */
                spin_lock(&file->f_lock);
                cursor = file->f_wb_err;
                ret = errseq_check_and_advance(&mapping->wb_err,
                                               &file->f_wb_err);
                /* The trace event gets the cursor value from before the advance. */
                trace_file_check_and_advance_wb_err(file, cursor);
                spin_unlock(&file->f_lock);
        }

        /*
         * This function mostly serves as a drop-in replacement for
         * filemap_check_errors(), so clear AS_EIO and AS_ENOSPC to emulate
         * the effect the legacy code would have had on these flags.
         */
        clear_bit(AS_EIO, &mapping->flags);
        clear_bit(AS_ENOSPC, &mapping->flags);

        return ret;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:        file pointing to address_space with pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = file->f_mapping;
        int write_err = 0;
        int wb_err;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * Skip the wait only for -EIO; any other outcome, including
                 * other errors, still waits on the range.  The rationale is
                 * given in the comment in filemap_write_and_wait_range().
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* The f_wb_err cursor is advanced even when the write failed. */
        wb_err = file_check_and_advance_wb_err(file);

        /* A write error takes precedence over the cursor-reported one. */
        return write_err ? write_err : wb_err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old:        page to be replaced
 * @new:        page to replace with
 * @gfp_mask:        allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 *
 * Return: %0
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
        struct address_space *mapping = old->mapping;
        /* Fetched up front: old->mapping is set to NULL further down. */
        void (*freepage)(struct page *) = mapping->a_ops->freepage;
        pgoff_t offset = old->index;
        XA_STATE(xas, &mapping->i_pages, offset);
        unsigned long flags;

        /* Both pages must be locked, and @new must not be in a mapping yet. */
        VM_BUG_ON_PAGE(!PageLocked(old), old);
        VM_BUG_ON_PAGE(!PageLocked(new), new);
        VM_BUG_ON_PAGE(new->mapping, new);

        /* Reference for the page cache; @new takes over @old's slot. */
        get_page(new);
        new->mapping = mapping;
        new->index = offset;

        mem_cgroup_migrate(old, new);

        /* The store and the statistics updates happen under the xarray lock. */
        xas_lock_irqsave(&xas, flags);
        xas_store(&xas, new);

        old->mapping = NULL;
        /* hugetlb pages do not participate in page cache accounting. */
        if (!PageHuge(old))
                __dec_lruvec_page_state(old, NR_FILE_PAGES);
        if (!PageHuge(new))
                __inc_lruvec_page_state(new, NR_FILE_PAGES);
        if (PageSwapBacked(old))
                __dec_lruvec_page_state(old, NR_SHMEM);
        if (PageSwapBacked(new))
                __inc_lruvec_page_state(new, NR_SHMEM);
        xas_unlock_irqrestore(&xas, flags);
        /* Optional a_ops->freepage callback, invoked after the lock is dropped. */
        if (freepage)
                freepage(old);
        /* Drop the page cache reference that @old held. */
        put_page(old);

        /* Note: @gfp_mask is not used anywhere in this body. */
        return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);

/*
 * Insert the locked @page into @mapping at index @offset.
 *
 * On success returns 0; the page then holds an extra reference taken here
 * and has page->mapping and page->index set.  If an existing value entry
 * was found at the slot and @shadowp is non-NULL, that entry is stored in
 * *@shadowp.  On failure returns a negative error (-EEXIST if a non-value
 * entry already occupies the slot, or whatever mem_cgroup_charge() or the
 * xarray operation reported); the reference is dropped again and
 * page->mapping is reset to NULL.
 */
noinline int __add_to_page_cache_locked(struct page *page,
                                        struct address_space *mapping,
                                        pgoff_t offset, gfp_t gfp,
                                        void **shadowp)
{
        XA_STATE(xas, &mapping->i_pages, offset);
        int huge = PageHuge(page);
        int error;
        /* Tracks whether mem_cgroup_uncharge() is needed on the error path. */
        bool charged = false;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapBacked(page), page);
        mapping_set_update(&xas, mapping);

        get_page(page);
        page->mapping = mapping;
        page->index = offset;

        /* hugetlb pages are not charged to the memory cgroup here. */
        if (!huge) {
                error = mem_cgroup_charge(page, current->mm, gfp);
                if (error)
                        goto error;
                charged = true;
        }

        /* From here on only the GFP_RECLAIM_MASK bits of @gfp are used. */
        gfp &= GFP_RECLAIM_MASK;

        /* Retried for as long as xas_nomem() returns true. */
        do {
                unsigned int order = xa_get_order(xas.xa, xas.xa_index);
                void *entry, *old = NULL;

                /* Existing entry spans a larger order than @page: prepare a split. */
                if (order > thp_order(page))
                        xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
                                        order, gfp);
                xas_lock_irq(&xas);
                /* Any conflicting entry that is not a value entry means -EEXIST. */
                xas_for_each_conflict(&xas, entry) {
                        old = entry;
                        if (!xa_is_value(entry)) {
                                xas_set_err(&xas, -EEXIST);
                                goto unlock;
                        }
                }

                if (old) {
                        /* Hand the replaced value entry back to the caller. */
                        if (shadowp)
                                *shadowp = old;
                        /* entry may have been split before we acquired lock */
                        order = xa_get_order(xas.xa, xas.xa_index);
                        if (order > thp_order(page)) {
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                }

                xas_store(&xas, page);
                if (xas_error(&xas))
                        goto unlock;

                /* A replaced value entry no longer counts as exceptional. */
                if (old)
                        mapping->nrexceptional--;
                mapping->nrpages++;

                /* hugetlb pages do not participate in page cache accounting */
                if (!huge)
                        __inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (xas_error(&xas)) {
                error = xas_error(&xas);
                /* Undo the charge taken above before bailing out. */
                if (charged)
                        mem_cgroup_uncharge(page);
                goto error;
        }

        trace_mm_filemap_add_to_page_cache(page);
        return 0;
error:
        page->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
        put_page(page);
        return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);

/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page:        page to add
 * @mapping:        the page's address_space
 * @offset:        page index
 * @gfp_mask:        page allocation mode
 *
 * This function is used to add a page to the pagecache. It must be locked.
 * This function does not add the page to the LRU.  The caller must do that.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)
{
        int ret;

        /* This entry point does not collect a shadow entry, hence the NULL. */
        ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, NULL);

        return ret;
}
EXPORT_SYMBOL(add_to_page_cache_locked);

/*
 * Lock @page, insert it into @mapping at @offset and, on success, add it
 * to the LRU.  On failure the lock bit is cleared again and the error from
 * __add_to_page_cache_locked() is returned.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t offset, gfp_t gfp_mask)
{
        void *shadow = NULL;
        int ret;

        __SetPageLocked(page);
        ret = __add_to_page_cache_locked(page, mapping, offset,
                                         gfp_mask, &shadow);
        if (unlikely(ret)) {
                __ClearPageLocked(page);
                return ret;
        }

        WARN_ON_ONCE(PageActive(page));

        /*
         * A shadow entry means the page was evicted from the cache only
         * recently, so it should be activated like any other repeatedly
         * accessed page.  Pages that are being rewritten are the exception:
         * evicting other working-set data just to cache data that will be
         * overwritten anyway would be a waste of memory.
         */
        if (shadow && !(gfp_mask & __GFP_WRITE))
                workingset_refault(page, shadow);
        lru_cache_add(page);

        return 0;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
/*
 * Allocate an order-0 page with @gfp.  Without cpuset memory spreading this
 * is a plain alloc_pages().  With spreading, the node comes from
 * cpuset_mem_spread_node() and a failed allocation is retried for as long
 * as read_mems_allowed_retry() asks for it.
 */
struct page *__page_cache_alloc(gfp_t gfp)
{
        unsigned int cpuset_mems_cookie;
        struct page *page;
        int nid;

        if (!cpuset_do_page_mem_spread())
                return alloc_pages(gfp, 0);

        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nid = cpuset_mem_spread_node();
                page = __alloc_pages_node(nid, gfp, 0);
        } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

        return page;
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
/* log2 of the number of hashed wait queues; also the hash width used by page_waitqueue() */
#define PAGE_WAIT_TABLE_BITS 8
/* number of hashed wait queues: 1 << 8 = 256 */
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
/* the hash table itself; each head is initialized in pagecache_init() */
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Map @page to its bucket in page_wait_table by hashing the page pointer. */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
        unsigned int bucket = hash_ptr(page, PAGE_WAIT_TABLE_BITS);

        return page_wait_table + bucket;
}

/*
 * Boot-time setup: initialize every wait queue head in page_wait_table,
 * then call page_writeback_init().
 */
void __init pagecache_init(void)
{
        int i = 0;

        while (i < PAGE_WAIT_TABLE_SIZE) {
                /*
                 * The argument expression is kept unchanged in case the
                 * init_waitqueue_head() macro stringifies it (e.g. as a
                 * lock class name) - TODO confirm.
                 */
                init_waitqueue_head(&page_wait_table[i]);
                i++;
        }

        page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *        We're just waiting for the bit to be released, and when a waker
 *        calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *        and remove it from the wait queue.
 *
 *        Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *        The waiter is waiting to get the lock, and only one waiter should
 *        be woken up to avoid any thundering herd behavior. We'll set the
 *        WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *        This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *        The waiter is waiting to get the bit, and additionally wants the
 *        lock to be transferred to it for fair lock behavior. If the lock
 *        cannot be taken, we stop walking the wait queue without waking
 *        the waiter.
 *
 *        This is the "fair lock handoff" case, and in addition to setting
 *        WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *        that it now has the lock.
 */
/*
 * Wake function installed on page wait queue entries.  @arg is the
 * struct wait_page_key describing the page/bit being woken.
 *
 * Returns 0 when the entry does not match the key, -1 when the bit is
 * (still) set for an exclusive waiter, and otherwise nonzero exactly when
 * the woken entry had WQ_FLAG_EXCLUSIVE set.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
        unsigned int flags;
        struct wait_page_key *key = arg;
        struct wait_page_queue *wait_page
                = container_of(wait, struct wait_page_queue, wait);

        /* Entry is waiting for something else: leave it alone. */
        if (!wake_page_match(wait_page, key))
                return 0;

        /*
         * If it's a lock handoff wait, we get the bit for it, and
         * stop walking (and do not wake it up) if we can't.
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
                /* Bit already set again: don't wake this exclusive waiter. */
                if (test_bit(key->bit_nr, &key->page->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
                        /* Handoff: take the bit on the waiter's behalf. */
                        if (test_and_set_bit(key->bit_nr, &key->page->flags))
                                return -1;
                        flags |= WQ_FLAG_DONE;
                }
        }

        /*
         * We are holding the wait-queue lock, but the waiter that
         * is waiting for this will be checking the flags without
         * any locking.
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
         * with the load-acquire in wait_on_page_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);

        /*
         * Ok, we have successfully done what we're waiting for,
         * and we can unconditionally remove the wait entry.
         *
         * Note that this pairs with the "finish_wait()" in the
         * waiter, and has to be the absolute last thing we do.
         * After this list_del_init_careful(&wait->entry) the wait
         * entry might be de-allocated and the process might even
         * have exited.
         */
        list_del_init_careful(&wait->entry);
        /* Uses the local snapshot; @wait must not be touched any more. */
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

/*
 * Wake the waiters queued for @bit_nr of @page on the page's hashed wait
 * queue, and clear PG_waiters when the queue is empty or key.page_match
 * is still zero after the walk.
 */
static void wake_up_page_bit(struct page *page, int bit_nr)
{
        wait_queue_head_t *q = page_waitqueue(page);
        struct wait_page_key key;
        unsigned long flags;
        wait_queue_entry_t bookmark;

        key.page = page;
        key.bit_nr = bit_nr;
        /* Presumably set by wake_page_match() on a page match - TODO confirm. */
        key.page_match = 0;

        /* Bookmark entry lets the walk be resumed after the lock is dropped. */
        bookmark.flags = 0;
        bookmark.private = NULL;
        bookmark.func = NULL;
        INIT_LIST_HEAD(&bookmark.entry);

        spin_lock_irqsave(&q->lock, flags);
        __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

        /* WQ_FLAG_BOOKMARK still set: the walk stopped early, continue it. */
        while (bookmark.flags & WQ_FLAG_BOOKMARK) {
                /*
                 * Take a breather from holding the lock,
                 * allow pages that finish wake up asynchronously
                 * to acquire the lock and remove themselves
                 * from wait queue
                 */
                spin_unlock_irqrestore(&q->lock, flags);
                cpu_relax();
                spin_lock_irqsave(&q->lock, flags);
                __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
        }

        /*
         * It is possible for other pages to have collided on the waitqueue
         * hash, so in that case check for a page match. That prevents a long-
         * term waiter
         * (NOTE(review): the sentence above is truncated in the original
         * comment; its intended ending is not recoverable from this file.)
         *
         * It is still possible to miss a case here, when we woke page waiters
         * and removed them from the waitqueue, but there are still other
         * page waiters.
         */
        if (!waitqueue_active(q) || !key.page_match) {
                ClearPageWaiters(page);
                /*
                 * It's possible to miss clearing Waiters here, when we woke
                 * our page waiters, but the hashed waitqueue has waiters for
                 * other pages on it.
                 *
                 * That's okay, it's a rare case. The next waker will clear it.
                 */
        }
        spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * Wake waiters on @bit of @page, but only when PG_waiters is set;
 * otherwise the wait queue is not looked at.
 */
static void wake_up_page(struct page *page, int bit)
{
        if (PageWaiters(page))
                wake_up_page_bit(page, bit);
}

/*
 * A choice of three behaviors for wait_on_page_bit_common():
 */
enum behavior {
        EXCLUSIVE,        /* Hold ref to page and take the bit when woken, like
                         * __lock_page() waiting on then setting PG_locked.
                         * The wait entry is queued with WQ_FLAG_EXCLUSIVE.
                         */
        SHARED,                /* Hold ref to page and check the bit when woken, like
                         * wait_on_page_writeback() waiting on PG_writeback.
                         * The wait entry is queued without WQ_FLAG_EXCLUSIVE.
                         */
        DROP,                /* Drop ref to page before wait, no check when woken,
                         * like put_and_wait_on_page_locked() on PG_locked.
                         * put_page() is called right after queueing.
                         */
};

/*
 * Attempt to check (or get) the page bit, and mark us done
 * if successful.
 */
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
                                        struct wait_queue_entry *wait)
{
        bool busy;

        /* Exclusive waiters grab the bit; all others only look at it. */
        if (wait->flags & WQ_FLAG_EXCLUSIVE)
                busy = test_and_set_bit(bit_nr, &page->flags);
        else
                busy = test_bit(bit_nr, &page->flags);

        if (busy)
                return false;

        /* Bit obtained (or seen clear): mark the entry as woken and done. */
        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
        return true;
}

/*
 * How many times do we accept lock stealing from under a waiter?
 *
 * wait_on_page_bit_common() starts from this value and decrements it on
 * each attempt of an EXCLUSIVE waiter; once the count drops below zero
 * the waiter additionally sets WQ_FLAG_CUSTOM (the fair lock handoff case).
 */
int sysctl_page_lock_unfairness = 5;

/*
 * Common wait loop behind the page-bit wait and page-lock slow paths.
 *
 * Sleeps in task state @state on wait queue @q until @bit_nr of @page is
 * released (SHARED, DROP) or has been acquired for us (EXCLUSIVE).
 *
 * Returns 0 on success and -EINTR when the wait ended without the entry
 * having been woken (SHARED, DROP) or without the bit having been obtained
 * (EXCLUSIVE), which can happen when a signal permitted by @state is
 * pending.
 */
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
        struct page *page, int bit_nr, int state, enum behavior behavior)
{
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        bool delayacct = false;
        unsigned long pflags;

        /*
         * Waiting for the lock of a not-uptodate page that is flagged
         * PageWorkingset is accounted as a memory stall (and, unless the
         * page is swap backed, as thrashing delay).
         */
        if (bit_nr == PG_locked &&
            !PageUptodate(page) && PageWorkingset(page)) {
                if (!PageSwapBacked(page)) {
                        delayacct_thrashing_start();
                        delayacct = true;
                }
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.page = page;
        wait_page.bit_nr = bit_nr;

repeat:
        wait->flags = 0;
        if (behavior == EXCLUSIVE) {
                wait->flags = WQ_FLAG_EXCLUSIVE;
                /* Out of unfairness budget: ask the waker for a handoff. */
                if (--unfairness < 0)
                        wait->flags |= WQ_FLAG_CUSTOM;
        }

        /*
         * Do one last check whether we can get the
         * page bit synchronously.
         *
         * Do the SetPageWaiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
         * page queue), and add ourselves to the wait
         * queue if we need to sleep.
         *
         * This part needs to be done under the queue
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
        SetPageWaiters(page);
        if (!trylock_page_bit_common(page, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * From now on, all the logic will be based on
         * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
         * see whether the page bit testing has already
         * been done by the wake function.
         *
         * We can drop our reference to the page.
         */
        if (behavior == DROP)
                put_page(page);

        /*
         * Note that until the "finish_wait()", or until
         * we see the WQ_FLAG_WOKEN flag, we need to
         * be very careful with the 'wait->flags', because
         * we may race with a waker that sets them.
         */
        for (;;) {
                unsigned int flags;

                set_current_state(state);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        if (signal_pending_state(state, current))
                                break;

                        io_schedule();
                        continue;
                }

                /* If we were non-exclusive, we're done */
                if (behavior != EXCLUSIVE)
                        break;

                /* If the waker got the lock for us, we're done */
                if (flags & WQ_FLAG_DONE)
                        break;

                /*
                 * Otherwise, if we're getting the lock, we need to
                 * try to get it ourselves.
                 *
                 * And if that fails, we'll have to retry this all.
                 */
                if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
                        goto repeat;

                wait->flags |= WQ_FLAG_DONE;
                break;
        }

        /*
         * If a signal happened, this 'finish_wait()' may remove the last
         * waiter from the wait-queues, but the PageWaiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
        finish_wait(q, wait);

        /* Close the stall accounting opened at the top of the function. */
        if (thrashing) {
                if (delayacct)
                        delayacct_thrashing_end();
                psi_memstall_leave(&pflags);
        }

        /*
         * NOTE! The wait->flags weren't stable until we've done the
         * 'finish_wait()', and we could have exited the loop above due
         * to a signal, and had a wakeup event happen after the signal
         * test but before the 'finish_wait()'.
         *
         * So only after the finish_wait() can we reliably determine
         * if we got woken up or not, so we can now figure out the final
         * return value based on that state without races.
         *
         * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
         * waiter, but an exclusive one requires WQ_FLAG_DONE.
         */
        if (behavior == EXCLUSIVE)
                return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

/*
 * Wait in TASK_UNINTERRUPTIBLE, as a SHARED waiter, on @bit_nr of @page.
 * The result of the common helper is discarded.
 */
void wait_on_page_bit(struct page *page, int bit_nr)
{
        wait_on_page_bit_common(page_waitqueue(page), page, bit_nr,
                                TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);

/*
 * Like wait_on_page_bit(), but sleeps in TASK_KILLABLE and passes the
 * common helper's result (0 or -EINTR) back to the caller.
 */
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
        return wait_on_page_bit_common(page_waitqueue(page), page, bit_nr,
                                       TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);

static int __wait_on_page_locked_async(struct page *page,
                                       struct wait_page_queue *wait, bool set)
{
        struct wait_queue_head *head = page_waitqueue(page);
        bool must_wait;

        wait->page = page;
        wait->bit_nr = PG_locked;

        spin_lock_irq(&head->lock);
        __add_wait_queue_entry_tail(head, &wait->wait);
        SetPageWaiters(page);

        /* @set: try to take the lock; otherwise only test whether it is held. */
        if (set)
                must_wait = !trylock_page(page);
        else
                must_wait = PageLocked(page);

        /*
         * If we were successful, we know we are still on the waitqueue
         * because the queue lock has been held the whole time.  That makes
         * it safe to remove the entry and return success: the callback is
         * not going to trigger.
         */
        if (!must_wait)
                __remove_wait_queue(head, &wait->wait);
        spin_unlock_irq(&head->lock);

        return must_wait ? -EIOCBQUEUED : 0;
}

static int wait_on_page_locked_async(struct page *page,
                                     struct wait_page_queue *wait)
{
        /* Only queue @wait when the page is locked right now. */
        if (PageLocked(page))
                return __wait_on_page_locked_async(compound_head(page),
                                                   wait, false);

        return 0;
}

/**
 * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
 * @page: The page to wait for.
 *
 * The caller should hold a reference on @page.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the page to
 * come unlocked.  After this function returns, the caller should not
 * dereference @page.
 */
void put_and_wait_on_page_locked(struct page *page)
{
        struct page *head = compound_head(page);

        /* DROP: the common helper puts the reference before it sleeps. */
        wait_on_page_bit_common(page_waitqueue(head), head, PG_locked,
                                TASK_UNINTERRUPTIBLE, DROP);
}

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
        wait_queue_head_t *head = page_waitqueue(page);
        unsigned long irqflags;

        spin_lock_irqsave(&head->lock, irqflags);
        /* Queue the entry, then flag the page, both under the queue lock. */
        __add_wait_queue_entry_tail(head, waiter);
        SetPageWaiters(page);
        spin_unlock_irqrestore(&head->lock, irqflags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_locked.
 *
 * On x86 (and on many other architectures), we can clear PG_locked and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
        clear_bit_unlock(nr, mem);
        /* smp_mb__after_atomic(); */
        return test_bit(PG_waiters, mem);
}

#endif

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
        BUILD_BUG_ON(PG_waiters != 7);
        page = compound_head(page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
                wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
        /*
         * TestClearPageReclaim could be used here but it is an atomic
         * operation and overkill in this particular case. Failing to
         * shuffle a page marked for immediate reclaim is too mild to
         * justify taking an atomic operation penalty at the end of
         * ever page writeback.
         */
        if (PageReclaim(page)) {
                ClearPageReclaim(page);
                rotate_reclaimable_page(page);
        }

        /*
         * Writeback does not hold a page reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
         * But here we must make sure that the page is not freed and
         * reused before the wake_up_page().
         */
        get_page(page);
        if (!test_clear_page_writeback(page))
                BUG();

        smp_mb__after_atomic();
        wake_up_page(page, PG_writeback);
        put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);

/*
 * After completing I/O on a page, call this routine to update the page
 * flags appropriately
 */
void page_endio(struct page *page, bool is_write, int err)
{
        struct address_space *mapping;

        if (is_write) {
                /* Failed write: flag the page and, if it has one, its mapping. */
                if (err) {
                        SetPageError(page);
                        mapping = page_mapping(page);
                        if (mapping)
                                mapping_set_error(mapping, err);
                }
                end_page_writeback(page);
                return;
        }

        /* Read completion: record the outcome, then release the page lock. */
        if (err) {
                ClearPageUptodate(page);
                SetPageError(page);
        } else {
                SetPageUptodate(page);
        }
        unlock_page(page);
}
EXPORT_SYMBOL_GPL(page_endio);

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @__page: the page to lock
 */
void __lock_page(struct page *__page)
{
        struct page *head = compound_head(__page);

        /* Uninterruptible, exclusive wait: returns with PG_locked held. */
        wait_on_page_bit_common(page_waitqueue(head), head, PG_locked,
                                TASK_UNINTERRUPTIBLE, EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);

/*
 * Killable variant of __lock_page(): sleeps in TASK_KILLABLE and returns
 * the common helper's result, 0 with the lock held or -EINTR without it.
 */
int __lock_page_killable(struct page *__page)
{
        struct page *head = compound_head(__page);

        return wait_on_page_bit_common(page_waitqueue(head), head, PG_locked,
                                       TASK_KILLABLE, EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

/*
 * Thin wrapper: forwards @page and @wait to __wait_on_page_locked_async()
 * and hands its return value back unchanged.
 */
int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
        int ret;

        /*
         * NOTE(review): the trailing 'true' presumably asks the helper to
         * take the page lock rather than only wait for it to be released;
         * confirm against __wait_on_page_locked_async().
         */
        ret = __wait_on_page_locked_async(page, wait, true);
        return ret;
}

/*
 * Return values:
 * 1 - page is locked; mmap_lock is still held.
 * 0 - page is not locked.
 *     mmap_lock has been released (mmap_read_unlock()), unless flags had both
 *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
 *     which case mmap_lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
 * with the page locked and the mmap_lock unperturbed.
 */
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                         unsigned int flags)
{
        if (!fault_flag_allow_retry_first(flags)) {
                /* No retry allowed on this pass: actually take the lock. */
                if (!(flags & FAULT_FLAG_KILLABLE)) {
                        __lock_page(page);
                        return 1;
                }

                /* Killable lock failed: drop mmap_lock and report 0. */
                if (__lock_page_killable(page)) {
                        mmap_read_unlock(mm);
                        return 0;
                }
                return 1;
        }

        /*
         * CAUTION! With FAULT_FLAG_RETRY_NOWAIT we return 0 while
         * mmap_lock is still held.
         */
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return 0;

        /*
         * Retry path: release mmap_lock, wait for the page to be unlocked
         * (without taking the lock ourselves) and return 0.
         */
        mmap_read_unlock(mm);
        if (flags & FAULT_FLAG_KILLABLE)
                wait_on_page_locked_killable(page);
        else
                wait_on_page_locked(page);
        return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);

        for (; max_scan; max_scan--) {
                void *slot = xas_next(&xas);

                /*
                 * Stop on the first gap (empty slot or value entry), or
                 * once the index has wrapped around to 0.
                 */
                if (!slot || xa_is_value(slot) || xas.xa_index == 0)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);

        for (; max_scan; max_scan--) {
                void *slot = xas_prev(&xas);

                /*
                 * Stop on the first gap (empty slot or value entry), or
                 * once the index has wrapped around to ULONG_MAX.
                 */
                if (!slot || xa_is_value(slot) || xas.xa_index == ULONG_MAX)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, the head page is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct page *page;

        rcu_read_lock();
        for (;;) {
                /* Every attempt restarts the walk from scratch. */
                xas_reset(&xas);
                page = xas_load(&xas);
                if (xas_retry(&xas, page))
                        continue;

                /*
                 * Empty slot, or a value entry (shadow of an evicted page,
                 * shmem/tmpfs swap entry): hand it back as-is, without
                 * touching any refcount.
                 */
                if (!page || xa_is_value(page))
                        break;

                /* Could not take a speculative reference: look up again. */
                if (!page_cache_get_speculative(page))
                        continue;

                /*
                 * Lockless pagecache protocol (see include/linux/pagemap.h):
                 * if the slot no longer holds this page it has moved or been
                 * split, so drop the reference and try again.
                 */
                if (unlikely(page != xas_reload(&xas))) {
                        put_page(page);
                        continue;
                }
                break;
        }
        rcu_read_unlock();

        return page;
}

/**
 * find_lock_entry - Locate and lock a page cache entry.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * Looks up the page at @mapping & @index.  If there is a page in the
 * cache, the head page is returned locked and with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Context: May sleep.
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        for (;;) {
                page = find_get_entry(mapping, index);

                /* Nothing there, or a value entry: returned unlocked as-is. */
                if (!page || xa_is_value(page))
                        return page;

                lock_page(page);

                /* Truncated while we slept on the lock?  Start over. */
                if (unlikely(page->mapping != mapping)) {
                        unlock_page(page);
                        put_page(page);
                        continue;
                }

                VM_BUG_ON_PAGE(!thp_contains(page, index), page);
                return page;
        }
}

/**
 * pagecache_get_page - Find and get a reference to a page.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the page is returned.
 * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * @fgp_flags can be zero or more of these flags:
 *
 * * %FGP_ACCESSED - The page will be marked accessed.
 * * %FGP_LOCK - The page is returned locked.
 * * %FGP_HEAD - If the page is present and a THP, return the head page
 *   rather than the exact page specified by the index.
 * * %FGP_CREAT - If no page is present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU list.
 *   The page is returned locked and with an increased refcount.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   page is already in cache.  If the page was allocated, unlock it before
 *   returning so the caller can do the same dance.
 * * %FGP_WRITE - The page will be written
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
 * * %FGP_NOWAIT - Don't get blocked by page lock
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: The found page or %NULL otherwise.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                int fgp_flags, gfp_t gfp_mask)
{
        struct page *page;
        int err;

repeat:
        page = find_get_entry(mapping, index);
        /* Value entries are treated the same as an empty slot here. */
        if (xa_is_value(page))
                page = NULL;

        if (page) {
                if (fgp_flags & FGP_LOCK) {
                        if (!(fgp_flags & FGP_NOWAIT)) {
                                lock_page(page);
                        } else if (!trylock_page(page)) {
                                /* Caller will not block on the page lock. */
                                put_page(page);
                                return NULL;
                        }

                        /* Has the page been truncated? */
                        if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
                                put_page(page);
                                goto repeat;
                        }
                        VM_BUG_ON_PAGE(!thp_contains(page, index), page);
                }

                if (fgp_flags & FGP_ACCESSED) {
                        mark_page_accessed(page);
                } else if ((fgp_flags & FGP_WRITE) && page_is_idle(page)) {
                        /* Clear idle flag for buffer write */
                        clear_page_idle(page);
                }

                if (!(fgp_flags & FGP_HEAD))
                        page = find_subpage(page, index);
        }

        /* Either we have a page, or the caller did not ask for creation. */
        if (page || !(fgp_flags & FGP_CREAT))
                return page;

        /* Creation path: adjust the allocation mask as the flags require. */
        if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                gfp_mask |= __GFP_WRITE;
        if (fgp_flags & FGP_NOFS)
                gfp_mask &= ~__GFP_FS;

        page = __page_cache_alloc(gfp_mask);
        if (!page)
                return NULL;

        /* FGP_CREAT without FGP_LOCK or FGP_FOR_MMAP: warn, imply FGP_LOCK. */
        if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                fgp_flags |= FGP_LOCK;

        /* Init accessed so avoid atomic mark_page_accessed later */
        if (fgp_flags & FGP_ACCESSED)
                __SetPageReferenced(page);

        err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
        if (unlikely(err)) {
                put_page(page);
                /* Somebody else inserted a page first: look it up instead. */
                if (err == -EEXIST)
                        goto repeat;
                return NULL;
        }

        /*
         * add_to_page_cache_lru locks the page, and for mmap we expect
         * an unlocked page.
         */
        if (fgp_flags & FGP_FOR_MMAP)
                unlock_page(page);

        return page;
}
EXPORT_SYMBOL(pagecache_get_page);

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:        The address_space to search
 * @start:        The starting page cache index
 * @nr_entries:        The maximum number of entries
 * @entries:        Where the resulting entries are placed
 * @indices:        The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * If it finds a Transparent Huge Page, head or tail, find_get_entries()
 * stops at that page: the caller is likely to have a better way to handle
 * the compound page as a whole, and then skip its extent, than repeatedly
 * calling find_get_entries() to return all its tails.
 *
 * Return: the number of pages and shadow entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
                          pgoff_t start, unsigned int nr_entries,
                          struct page **entries, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct page *page;
        unsigned int ret = 0;

        if (!nr_entries)
                return 0;

        rcu_read_lock();
        xas_for_each(&xas, page, ULONG_MAX) {
                if (xas_retry(&xas, page))
                        continue;

                /*
                 * Value entries (shadow of an evicted page, shmem/tmpfs
                 * swap entry, DAX entry) are exported untouched; only real
                 * pages go through the reference-taking steps below.
                 */
                if (!xa_is_value(page)) {
                        /* No speculative reference: redo this slot. */
                        if (!page_cache_get_speculative(page)) {
                                xas_reset(&xas);
                                continue;
                        }

                        /* Has the page moved or been split? */
                        if (unlikely(page != xas_reload(&xas))) {
                                put_page(page);
                                xas_reset(&xas);
                                continue;
                        }

                        /*
                         * A THP (but not hugetlbfs) ends the batch here:
                         * shrinking nr_entries makes the check below break
                         * right after this entry is stored.
                         */
                        if (PageTransHuge(page) && !PageHuge(page)) {
                                page = find_subpage(page, xas.xa_index);
                                nr_entries = ret + 1;
                        }
                }

                indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

/**
 * find_get_pages_range - gang pagecache lookup
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @nr_pages:        The maximum number of pages
 * @pages:        Where the resulting pages are placed
 *
 * find_get_pages_range() will search for and return a group of up to @nr_pages
 * pages in the mapping starting at index @start and up to index @end
 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
 * a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 * We also update @start to index the next page for the traversal.
 *
 * Return: the number of pages which were found. If this number is
 * smaller than @nr_pages, the end of specified range has been
 * reached.
 */
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                              pgoff_t end, unsigned int nr_pages,
                              struct page **pages)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct page *page;
        unsigned ret = 0;

        if (unlikely(!nr_pages))
                return 0;

        rcu_read_lock();
        xas_for_each(&xas, page, end) {
                /* Skip retry entries, and shadow, swap and DAX entries. */
                if (xas_retry(&xas, page) || xa_is_value(page))
                        continue;

                /* No speculative reference: redo this slot. */
                if (!page_cache_get_speculative(page)) {
                        xas_reset(&xas);
                        continue;
                }

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        put_page(page);
                        xas_reset(&xas);
                        continue;
                }

                pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages) {
                        /* Batch is full: resume right after this index. */
                        *start = xas.xa_index + 1;
                        rcu_read_unlock();
                        return ret;
                }
        }

        /*
         * The walk ran out of pages up to @end.  Do not let @start wrap
         * past (pgoff_t)-1, since an overflowed index confuses some
         * callers.  This breaks iteration when there is a page at index -1,
         * but that is already broken anyway.
         */
        *start = (end == (pgoff_t)-1) ? (pgoff_t)-1 : end + 1;
        rcu_read_unlock();

        return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:        The address_space to search
 * @index:        The starting page index
 * @nr_pages:        The maximum number of pages
 * @pages:        Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned pages are guaranteed to be contiguous.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                               unsigned int nr_pages, struct page **pages)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct page *page;
        unsigned int ret = 0;

        if (unlikely(!nr_pages))
                return 0;

        rcu_read_lock();
        /* Walk slot by slot; the first empty slot ends the contiguous run. */
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                if (xas_retry(&xas, page))
                        continue;

                /*
                 * If the entry has been swapped out, we can stop looking.
                 * No current caller is looking for DAX entries.
                 */
                if (xa_is_value(page))
                        break;

                /* No speculative reference: redo this slot. */
                if (!page_cache_get_speculative(page)) {
                        xas_reset(&xas);
                        continue;
                }

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        put_page(page);
                        xas_reset(&xas);
                        continue;
                }

                pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages)
                        break;
        }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_range_tag - find and return pages in given range matching @tag
 * @mapping:        the address_space to search
 * @index:        the starting page index
 * @end:        The final page index (inclusive)
 * @tag:        the tag index
 * @nr_pages:        the maximum number of pages
 * @pages:        where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.   We update @index to index the next page for the traversal.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                        pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
{
        XA_STATE(xas, &mapping->i_pages, *index);
        struct page *page;
        unsigned ret = 0;

        if (unlikely(!nr_pages))
                return 0;

        rcu_read_lock();
        xas_for_each_marked(&xas, page, end, tag) {
                /*
                 * Skip retry entries.  Shadow entries should never be
                 * tagged, but this iteration is lockless so there is a
                 * window for page reclaim to evict a page we saw tagged;
                 * skip those too.
                 */
                if (xas_retry(&xas, page) || xa_is_value(page))
                        continue;

                /* No speculative reference: redo this slot. */
                if (!page_cache_get_speculative(page)) {
                        xas_reset(&xas);
                        continue;
                }

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        put_page(page);
                        xas_reset(&xas);
                        continue;
                }

                pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages) {
                        /* Batch is full: resume right after this index. */
                        *index = xas.xa_index + 1;
                        rcu_read_unlock();
                        return ret;
                }
        }

        /*
         * The walk reached @end.  Do not let @index wrap past (pgoff_t)-1,
         * since an overflowed index confuses some callers.  This breaks
         * iteration when there is a page at index -1, but that is already
         * broken anyway.
         */
        *index = (end == (pgoff_t)-1) ? (pgoff_t)-1 : end + 1;
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
        /* Cut the readahead size to a quarter of its current value. */
        ra->ra_pages = ra->ra_pages / 4;
}

/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:        the iocb to read
 * @iter:        data destination
 * @written:        already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those that were already @written
 * * negative error code if nothing was copied
 */
ssize_t generic_file_buffered_read(struct kiocb *iocb,
                struct iov_iter *iter, ssize_t written)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct file_ra_state *ra = &filp->f_ra;
        loff_t *ppos = &iocb->ki_pos;
        /* page cache index of the page currently being read */
        pgoff_t index;
        /* index just past the last page covered by the request */
        pgoff_t last_index;
        /* page index / in-page offset decoded from ra->prev_pos */
        pgoff_t prev_index;
        unsigned long offset;      /* offset into pagecache page */
        unsigned int prev_offset;
        int error = 0;

        /* Nothing to do at or beyond s_maxbytes, or for an empty request. */
        if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

        /* Split the file position and ra->prev_pos into page index + offset. */
        index = *ppos >> PAGE_SHIFT;
        prev_index = ra->prev_pos >> PAGE_SHIFT;
        prev_offset = ra->prev_pos & (PAGE_SIZE-1);
        last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
        offset = *ppos & ~PAGE_MASK;

        /*
         * If we've already successfully copied some data, then we
         * can no longer safely return -EIOCBQUEUED. Hence mark
         * an async read NOWAIT at that point.
         */
        if (written && (iocb->ki_flags & IOCB_WAITQ))
                iocb->ki_flags |= IOCB_NOWAIT;

        /* One iteration per page copied; the labels below are retry points. */
        for (;;) {
                struct page *page;
                pgoff_t end_index;
                loff_t isize;
                unsigned long nr, ret;

                cond_resched();
find_page:
                if (fatal_signal_pending(current)) {
                        error = -EINTR;
                        goto out;
                }

                page = find_get_page(mapping, index);
                if (!page) {
                        /* Not cached and I/O is forbidden: -EAGAIN. */
                        if (iocb->ki_flags & IOCB_NOIO)
                                goto would_block;
                        page_cache_sync_readahead(mapping,
                                        ra, filp,
                                        index, last_index - index);
                        /* Look again now that readahead has been kicked off. */
                        page = find_get_page(mapping, index);
                        if (unlikely(page == NULL))
                                goto no_cached_page;
                }
                if (PageReadahead(page)) {
                        /*
                         * NOTE(review): this leaves via 'out' with error
                         * still 0, so a NOIO caller that has copied nothing
                         * gets a 0 return here, whereas the missing-page
                         * case above returns -EAGAIN -- confirm this is
                         * intended.
                         */
                        if (iocb->ki_flags & IOCB_NOIO) {
                                put_page(page);
                                goto out;
                        }
                        page_cache_async_readahead(mapping,
                                        ra, filp, page,
                                        index, last_index - index);
                }
                if (!PageUptodate(page)) {
                        /*
                         * See comment in do_read_cache_page on why
                         * wait_on_page_locked is used to avoid unnecessarily
                         * serialisations and why it's safe.
                         */
                        if (iocb->ki_flags & IOCB_WAITQ) {
                                /*
                                 * Data has already been copied: return it
                                 * rather than queueing an async wait.
                                 */
                                if (written) {
                                        put_page(page);
                                        goto out;
                                }
                                error = wait_on_page_locked_async(page,
                                                                iocb->ki_waitq);
                        } else {
                                if (iocb->ki_flags & IOCB_NOWAIT) {
                                        put_page(page);
                                        goto would_block;
                                }
                                error = wait_on_page_locked_killable(page);
                        }
                        if (unlikely(error))
                                goto readpage_error;
                        if (PageUptodate(page))
                                goto page_ok;

                        /*
                         * Try the partially-uptodate shortcut only when the
                         * block size differs from the page size and the
                         * address_space provides ->is_partially_uptodate.
                         */
                        if (inode->i_blkbits == PAGE_SHIFT ||
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
                        /* pipes can't handle partially uptodate pages */
                        if (unlikely(iov_iter_is_pipe(iter)))
                                goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
                        /* Did it get truncated before we got the lock? */
                        if (!page->mapping)
                                goto page_not_up_to_date_locked;
                        if (!mapping->a_ops->is_partially_uptodate(page,
                                                        offset, iter->count))
                                goto page_not_up_to_date_locked;
                        unlock_page(page);
                }
page_ok:
                /*
                 * i_size must be checked after we know the page is Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */

                isize = i_size_read(inode);
                /* index of the page holding the last byte of the file */
                end_index = (isize - 1) >> PAGE_SHIFT;
                if (unlikely(!isize || index > end_index)) {
                        put_page(page);
                        goto out;
                }

                /* nr is the maximum number of bytes to copy from this page */
                nr = PAGE_SIZE;
                if (index == end_index) {
                        /* Last page: only the bytes below i_size are valid. */
                        nr = ((isize - 1) & ~PAGE_MASK) + 1;
                        if (nr <= offset) {
                                put_page(page);
                                goto out;
                        }
                }
                nr = nr - offset;

                /* If users can be writing to this page using arbitrary
                 * virtual addresses, take care about potential aliasing
                 * before reading the page on the kernel side.
                 */
                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                /*
                 * When a sequential read accesses a page several times,
                 * only mark it as accessed the first time.
                 */
                if (prev_index != index || offset != prev_offset)
                        mark_page_accessed(page);
                prev_index = index;

                /*
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 */

                ret = copy_page_to_iter(page, offset, nr, iter);
                /* Advance by what was copied; carry page overflow into index. */
                offset += ret;
                index += offset >> PAGE_SHIFT;
                offset &= ~PAGE_MASK;
                prev_offset = offset;

                put_page(page);
                written += ret;
                if (!iov_iter_count(iter))
                        goto out;
                /* Short copy with room left in the iterator: -EFAULT. */
                if (ret < nr) {
                        error = -EFAULT;
                        goto out;
                }
                continue;

page_not_up_to_date:
                /* Get exclusive access to the page ... */
                if (iocb->ki_flags & IOCB_WAITQ) {
                        if (written) {
                                put_page(page);
                                goto out;
                        }
                        error = lock_page_async(page, iocb->ki_waitq);
                } else {
                        error = lock_page_killable(page);
                }
                if (unlikely(error))
                        goto readpage_error;

page_not_up_to_date_locked:
                /* Did it get truncated before we got the lock? */
                if (!page->mapping) {
                        unlock_page(page);
                        put_page(page);
                        continue;
                }

                /* Did somebody else fill it already? */
                if (PageUptodate(page)) {
                        unlock_page(page);
                        goto page_ok;
                }

readpage:
                /* Reached with the page locked; real I/O is needed from here. */
                if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
                        unlock_page(page);
                        put_page(page);
                        goto would_block;
                }
                /*
                 * A previous I/O error may have been due to temporary
                 * failures, eg. multipath errors.
                 * PG_error will be set again if readpage fails.
                 */
                ClearPageError(page);
                /* Start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);

                if (unlikely(error)) {
                        /* AOP_TRUNCATED_PAGE is not an error: look up again. */
                        if (error == AOP_TRUNCATED_PAGE) {
                                put_page(page);
                                error = 0;
                                goto find_page;
                        }
                        goto readpage_error;
                }

                if (!PageUptodate(page)) {
                        /* Take the page lock again before re-checking it. */
                        if (iocb->ki_flags & IOCB_WAITQ) {
                                if (written) {
                                        put_page(page);
                                        goto out;
                                }
                                error = lock_page_async(page, iocb->ki_waitq);
                        } else {
                                error = lock_page_killable(page);
                        }

                        if (unlikely(error))
                                goto readpage_error;
                        if (!PageUptodate(page)) {
                                if (page->mapping == NULL) {
                                        /*
                                         * invalidate_mapping_pages got it
                                         */
                                        unlock_page(page);
                                        put_page(page);
                                        goto find_page;
                                }
                                /*
                                 * Still not uptodate and still in the
                                 * mapping: report -EIO and shrink the
                                 * readahead size.
                                 */
                                unlock_page(page);
                                shrink_readahead_size_eio(ra);
                                error = -EIO;
                                goto readpage_error;
                        }
                        unlock_page(page);
                }

                goto page_ok;

readpage_error:
                /* UHHUH! A synchronous read error occurred. Report it */
                put_page(page);
                goto out;

no_cached_page:
                /*
                 * Ok, it wasn't cached, so we need to create a new
                 * page..
                 */
                page = page_cache_alloc(mapping);
                if (!page) {
                        error = -ENOMEM;
                        goto out;
                }
                error = add_to_page_cache_lru(page, mapping, index,
                                mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (error) {
                        put_page(page);
                        /* -EEXIST: a page appeared at this index meanwhile. */
                        if (error == -EEXIST) {
                                error = 0;
                                goto find_page;
                        }
                        goto out;
                }
                goto readpage;
        }

would_block:
        error = -EAGAIN;
out:
        /* Re-encode where this read stopped into ra->prev_pos. */
        ra->prev_pos = prev_index;
        ra->prev_pos <<= PAGE_SHIFT;
        ra->prev_pos |= prev_offset;

        /* Advance the file position to the first byte not yet copied. */
        *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
        file_accessed(filp);
        /* Bytes copied (including the incoming @written) win over an error. */
        return written ? written : error;
}
EXPORT_SYMBOL_GPL(generic_file_buffered_read);

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:        kernel I/O control block
 * @iter:        destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        size_t count = iov_iter_count(iter);
        ssize_t retval = 0;

        if (!count)
                goto out; /* skip atime */

        if (iocb->ki_flags & IOCB_DIRECT) {
                struct file *file = iocb->ki_filp;
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
                loff_t size;

                size = i_size_read(inode);
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        if (filemap_range_has_page(mapping, iocb->ki_pos,
                                                   iocb->ki_pos + count - 1))
                                return -EAGAIN;
                } else {
                        retval = filemap_write_and_wait_range(mapping,
                                                iocb->ki_pos,
                                                iocb->ki_pos + count - 1);
                        if (retval < 0)
                                goto out;
                }

                file_accessed(file);

                retval = mapping->a_ops->direct_IO(iocb, iter);
                if (retval >= 0) {
                        iocb->ki_pos += retval;
                        count -= retval;
                }
                iov_iter_revert(iter, count - iov_iter_count(iter));

                /*
                 * Btrfs can have a short DIO read if we encounter
                 * compressed extents, so if there was an error, or if
                 * we've already read everything we wanted to, or if
                 * there was a short read because we hit EOF, go ahead
                 * and return.  Otherwise fallthrough to buffered io for
                 * the rest of the read.  Buffered reads will not work for
                 * DAX files, so don't bother trying.
                 */
                if (retval < 0 || !count || iocb->ki_pos >= size ||
                    IS_DAX(inode))
                        goto out;
        }

        retval = generic_file_buffered_read(iocb, iter, retval);
out:
        return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @page - the page to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * Like lock_page_or_retry, this may drop the mmap_lock while waiting for
 * the page lock.  Unlike it, a return of 1 means the page is locked and a
 * return of 0 means the page could not be locked.  If the mmap_lock had to
 * be dropped, *fpin points to the pinned file, which the caller must
 * fput() later.
 */
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
                                     struct file **fpin)
{
        /* Uncontended case: got the lock without sleeping. */
        if (trylock_page(page))
                return 1;

        /*
         * NOTE! This will make us return with VM_FAULT_RETRY, but with
         * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
         * is supposed to work. We have way too many special cases..
         */
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                return 0;

        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);

        /* Non-killable waiters simply sleep until the lock is ours. */
        if (!(vmf->flags & FAULT_FLAG_KILLABLE)) {
                __lock_page(page);
                return 1;
        }

        if (!__lock_page_killable(page))
                return 1;

        /*
         * The killable wait failed.  We didn't have the right flags to
         * drop the mmap_lock, but all fault handlers only check for fatal
         * signals if we return VM_FAULT_RETRY, so drop the mmap_lock here
         * when no file was pinned and report failure.
         */
        if (!*fpin)
                mmap_read_unlock(vmf->vma->vm_mm);
        return 0;
}


/*
 * Synchronous readahead is used when the faulting page is not in the page
 * cache at all.  IO should not be done under the mmap sem, so when the
 * mmap sem has to be dropped the pinned file is returned to the caller,
 * who must fput() it when done.  NULL is returned when no file was pinned.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
        DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
        struct file *pinned = NULL;
        unsigned int misses;

        /* Readahead disabled for this vma or this file: nothing to do. */
        if ((vmf->vma->vm_flags & VM_RAND_READ) || !ra->ra_pages)
                return NULL;

        /* Sequential hint: let the regular sync readahead logic run. */
        if (vmf->vma->vm_flags & VM_SEQ_READ) {
                pinned = maybe_unlock_mmap_for_io(vmf, pinned);
                page_cache_sync_ra(&ractl, ra, ra->ra_pages);
                return pinned;
        }

        /* Avoid banging the cache line if not needed */
        misses = READ_ONCE(ra->mmap_miss);
        if (misses < MMAP_LOTSAMISS * 10) {
                misses++;
                WRITE_ONCE(ra->mmap_miss, misses);
        }

        /*
         * Missing much more often than hitting in this file?  Then
         * read-ahead only hurts, so stop bothering.
         */
        if (misses > MMAP_LOTSAMISS)
                return NULL;

        /*
         * mmap read-around: a window of ra_pages starting half a window
         * before the faulting offset (clamped at 0).
         */
        pinned = maybe_unlock_mmap_for_io(vmf, pinned);
        ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ractl._index = ra->start;
        do_page_cache_ra(&ractl, ra->size, ra->async_size);
        return pinned;
}

/*
 * Asynchronous readahead is used when the page was found and carries
 * PG_readahead, so the readahead window may be extended further.  The
 * pinned file is returned if the mmap_lock had to be dropped to do IO,
 * NULL otherwise.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
                                            struct page *page)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
        pgoff_t index = vmf->pgoff;
        struct file *pinned;
        unsigned int misses;

        /* If we don't want any read-ahead, don't bother */
        if ((vmf->vma->vm_flags & VM_RAND_READ) || !ra->ra_pages)
                return NULL;

        /* A page-cache hit: pay back one miss, never going below zero. */
        misses = READ_ONCE(ra->mmap_miss);
        if (misses)
                WRITE_ONCE(ra->mmap_miss, misses - 1);

        if (!PageReadahead(page))
                return NULL;

        pinned = maybe_unlock_mmap_for_io(vmf, NULL);
        page_cache_async_readahead(mapping, ra, file,
                                   page, index, ra->ra_pages);
        return pinned;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:        struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
        int error;
        struct file *file = vmf->vma->vm_file;
        /* Non-NULL once the mmap_lock has been dropped; fput() in out_retry. */
        struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
        pgoff_t offset = vmf->pgoff;
        pgoff_t max_off;
        struct page *page;
        vm_fault_t ret = 0;

        /* A fault at or beyond the last page covered by i_size gets SIGBUS. */
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(offset >= max_off))
                return VM_FAULT_SIGBUS;

        /*
         * Do we have something in the page cache already?
         */
        page = find_get_page(mapping, offset);
        if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
                /*
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
                fpin = do_async_mmap_readahead(vmf, page);
        } else if (!page) {
                /* No page in the page cache at all */
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
                fpin = do_sync_mmap_readahead(vmf);
                /*
                 * Also re-entered from the truncation check and from
                 * page_not_uptodate below, which both drop their page
                 * reference first and then look the page up again.
                 */
retry_find:
                page = pagecache_get_page(mapping, offset,
                                          FGP_CREAT|FGP_FOR_MMAP,
                                          vmf->gfp_mask);
                if (!page) {
                        /*
                         * With a pinned file the mmap_lock is already gone,
                         * so report a retry instead of OOM.
                         */
                        if (fpin)
                                goto out_retry;
                        return VM_FAULT_OOM;
                }
        }

        /* Returns 0 when the page could not be locked; retry the fault. */
        if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
                goto out_retry;

        /* Did it get truncated? */
        if (unlikely(compound_head(page)->mapping != mapping)) {
                unlock_page(page);
                put_page(page);
                goto retry_find;
        }
        VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

        /*
         * We have a locked page in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error.
         */
        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;

        /*
         * We've made it this far and we had to drop our mmap_lock, now is the
         * time to return to the upper layer and have it re-find the vma and
         * redo the fault.
         */
        if (fpin) {
                unlock_page(page);
                goto out_retry;
        }

        /*
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(offset >= max_off)) {
                unlock_page(page);
                put_page(page);
                return VM_FAULT_SIGBUS;
        }

        /* Success: hand the still-locked page back to the caller. */
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;

page_not_uptodate:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        ClearPageError(page);
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = mapping->a_ops->readpage(file, page);
        if (!error) {
                wait_on_page_locked(page);
                if (!PageUptodate(page))
                        error = -EIO;
        }
        /* mmap_lock was dropped: the result of the read is left to the retry. */
        if (fpin)
                goto out_retry;
        put_page(page);

        /* A clean read or a truncated page both warrant a fresh lookup. */
        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;

        shrink_readahead_size_eio(ra);
        return VM_FAULT_SIGBUS;

out_retry:
        /*
         * We dropped the mmap_lock, we need to return to the fault handler to
         * re-find the vma and come back and find our hopefully still populated
         * page.
         */
        /* page is NULL when we got here from a failed pagecache_get_page(). */
        if (page)
                put_page(page);
        if (fpin)
                fput(fpin);
        return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);

/*
 * filemap_map_pages - map already-cached pages around a fault
 * @vmf:         the vm_fault; ->address and ->pte are advanced as pages
 *               are visited
 * @start_pgoff: first page offset to consider
 * @end_pgoff:   last page offset to consider
 *
 * Walks mapping->i_pages over [start_pgoff, end_pgoff] under
 * rcu_read_lock() and calls alloc_set_pte() for every page that can be
 * used without sleeping: not locked, uptodate, not marked readahead or
 * HWPoison, trylock succeeds, still belongs to this mapping and lies
 * within i_size.  Pages failing any check are skipped.  The walk stops
 * once pmd_trans_huge(*vmf->pmd) is true.
 *
 * The file's f_ra.mmap_miss counter is sampled on entry, decremented (not
 * below zero) for each page that passes the checks, and written back on
 * exit.
 */
void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
{
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        /* Offset that vmf->address / vmf->pte currently correspond to. */
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct page *head, *page;
        unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);

        rcu_read_lock();
        xas_for_each(&xas, head, end_pgoff) {
                if (xas_retry(&xas, head))
                        continue;
                /* Value entries are not pages; nothing to map. */
                if (xa_is_value(head))
                        goto next;

                /*
                 * Check for a locked page first, as a speculative
                 * reference may adversely influence page migration.
                 */
                if (PageLocked(head))
                        goto next;
                if (!page_cache_get_speculative(head))
                        goto next;

                /* Has the page moved or been split? */
                if (unlikely(head != xas_reload(&xas)))
                        goto skip;
                page = find_subpage(head, xas.xa_index);

                if (!PageUptodate(head) ||
                                PageReadahead(page) ||
                                PageHWPoison(page))
                        goto skip;
                if (!trylock_page(head))
                        goto skip;

                /* Re-validate now that the page lock is held. */
                if (head->mapping != mapping || !PageUptodate(head))
                        goto unlock;

                max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (xas.xa_index >= max_idx)
                        goto unlock;

                if (mmap_miss > 0)
                        mmap_miss--;

                /* Advance address and pte by the distance from the last page. */
                vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                if (vmf->pte)
                        vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
                /* Non-zero return: not mapped, so drop lock and reference. */
                if (alloc_set_pte(vmf, page))
                        goto unlock;
                /* Zero return: only unlock; the reference is not dropped here. */
                unlock_page(head);
                goto next;
unlock:
                unlock_page(head);
skip:
                put_page(head);
next:
                /* Huge page is mapped? No need to proceed. */
                if (pmd_trans_huge(*vmf->pmd))
                        break;
        }
        rcu_read_unlock();
        WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);

/*
 * Handle a write fault on a page of a page-cache backed mapping.
 *
 * Returns VM_FAULT_LOCKED with vmf->page locked and dirtied, or
 * VM_FAULT_NOPAGE (page unlocked) when the page no longer belongs to
 * the inode's mapping.
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct inode *inode = file_inode(file);
        struct page *page = vmf->page;
        vm_fault_t ret;

        sb_start_pagefault(inode->i_sb);
        file_update_time(file);
        lock_page(page);

        if (page->mapping == inode->i_mapping) {
                /*
                 * We mark the page dirty already here so that when freeze
                 * is in progress, we are guaranteed that writeback during
                 * freezing will see the dirty page and writeprotect it
                 * again.
                 */
                set_page_dirty(page);
                wait_for_stable_page(page);
                ret = VM_FAULT_LOCKED;
        } else {
                unlock_page(page);
                ret = VM_FAULT_NOPAGE;
        }

        sb_end_pagefault(inode->i_sb);
        return ret;
}

/*
 * vm_operations for page-cache backed file mappings, wiring up the
 * filemap_* fault, fault-around and write-fault handlers.  Installed on
 * a vma by generic_file_mmap().
 */
const struct vm_operations_struct generic_file_vm_ops = {
        .fault                = filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite        = filemap_page_mkwrite,
};

/*
 * General mmap of a disk file: installs generic_file_vm_ops on the vma.
 *
 * Returns 0 on success, or -ENOEXEC when the mapping's a_ops provide no
 * ->readpage.
 */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        if (!file->f_mapping->a_ops->readpage)
                return -ENOEXEC;

        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
        return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 *
 * Rejects the mapping with -EINVAL when vma_is_shared_maywrite() is true;
 * otherwise behaves exactly like generic_file_mmap().
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return vma_is_shared_maywrite(vma) ? -EINVAL
                                           : generic_file_mmap(file, vma);
}
#else
/* !CONFIG_MMU stub: always reports VM_FAULT_SIGBUS. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}
/* !CONFIG_MMU stub: always fails with -ENOSYS. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        return -ENOSYS;
}
/* !CONFIG_MMU stub: always fails with -ENOSYS. */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
        return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

/*
 * Wait for @page to be unlocked and check that it ended up uptodate.
 *
 * An ERR_PTR() input is passed straight through.  If the page is not
 * uptodate after the wait, it is released with put_page() and
 * ERR_PTR(-EIO) is returned instead.
 */
static struct page *wait_on_page_read(struct page *page)
{
        if (IS_ERR(page))
                return page;

        wait_on_page_locked(page);
        if (PageUptodate(page))
                return page;

        put_page(page);
        return ERR_PTR(-EIO);
}

/*
 * do_read_cache_page - find or create the page at @index and make sure it
 * has been read in.
 * @mapping: address_space to look the page up in
 * @index:   page index within @mapping
 * @filler:  read function called as filler(data, page); when NULL,
 *           mapping->a_ops->readpage(data, page) is used instead
 * @data:    first argument passed to the read function
 * @gfp:     allocation flags for a new page and its page-cache insertion
 *
 * Returns the page after mark_page_accessed(), or an ERR_PTR():
 * -ENOMEM if the page allocation fails, the error from
 * add_to_page_cache_lru() (other than -EEXIST, which restarts the
 * lookup), a negative error from the read function, or -EIO when the page
 * is still not uptodate after the read.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
{
        struct page *page;
        int err;
repeat:
        page = find_get_page(mapping, index);
        if (!page) {
                page = __page_cache_alloc(gfp);
                if (!page)
                        return ERR_PTR(-ENOMEM);
                err = add_to_page_cache_lru(page, mapping, index, gfp);
                if (unlikely(err)) {
                        put_page(page);
                        /* Someone else inserted a page first; look it up. */
                        if (err == -EEXIST)
                                goto repeat;
                        /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }

                /*
                 * Also entered from the bottom of the function for an
                 * existing page that was found locked-in-cache but not
                 * uptodate (after lock_page() and ClearPageError()).
                 */
filler:
                if (filler)
                        err = filler(data, page);
                else
                        err = mapping->a_ops->readpage(data, page);

                if (err < 0) {
                        put_page(page);
                        return ERR_PTR(err);
                }

                /* Drops the page and yields ERR_PTR(-EIO) if not uptodate. */
                page = wait_on_page_read(page);
                if (IS_ERR(page))
                        return page;
                goto out;
        }
        if (PageUptodate(page))
                goto out;

        /*
         * Page is not up to date and may be locked due to one of the following
         * case a: Page is being filled and the page lock is held
         * case b: Read/write error clearing the page uptodate status
         * case c: Truncation in progress (page locked)
         * case d: Reclaim in progress
         *
         * Case a, the page will be up to date when the page is unlocked.
         *    There is no need to serialise on the page lock here as the page
         *    is pinned so the lock gives no additional protection. Even if the
         *    page is truncated, the data is still valid if PageUptodate as
         *    it's a race vs truncate race.
         * Case b, the page will not be up to date
         * Case c, the page may be truncated but in itself, the data may still
         *    be valid after IO completes as it's a read vs truncate race. The
         *    operation must restart if the page is not uptodate on unlock but
         *    otherwise serialising on page lock to stabilise the mapping gives
         *    no additional guarantees to the caller as the page lock is
         *    released before return.
         * Case d, similar to truncation. If reclaim holds the page lock, it
         *    will be a race with remove_mapping that determines if the mapping
         *    is valid on unlock but otherwise the data is valid and there is
         *    no need to serialise with page lock.
         *
         * As the page lock gives no additional guarantee, we optimistically
         * wait on the page to be unlocked and check if it's up to date and
         * use the page if it is. Otherwise, the page lock is required to
         * distinguish between the different cases. The motivation is that we
         * avoid spurious serialisations and wakeups when multiple processes
         * wait on the same page for IO to complete.
         */
        wait_on_page_locked(page);
        if (PageUptodate(page))
                goto out;

        /* Distinguish between all the cases under the safety of the lock */
        lock_page(page);

        /* Case c or d, restart the operation */
        if (!page->mapping) {
                unlock_page(page);
                put_page(page);
                goto repeat;
        }

        /* Someone else locked and filled the page in a very small window */
        if (PageUptodate(page)) {
                unlock_page(page);
                goto out;
        }

        /*
         * A previous I/O error may have been due to temporary
         * failures.
         * Clear page error before actual read, PG_error will be
         * set again if read page fails.
         */
        ClearPageError(page);
        /* Page is still locked here, as it is on the fresh-page path. */
        goto filler;

out:
        mark_page_accessed(page);
        return page;
}

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:        the page's address_space
 * @index:        the page index
 * @filler:        function to perform the read
 * @data:        first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache, allocating with the mapping's own gfp mask.
 * If a page already exists, and PageUptodate() is not set, try to fill
 * the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *, struct page *),
                                void *data)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_page(mapping, index, filler, data, gfp);
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 * No filler is supplied, so the mapping's ->readpage does the read.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index,
                                gfp_t gfp)
{
        int (*no_filler)(void *, struct page *) = NULL;
        void *no_data = NULL;

        return do_read_cache_page(mapping, index, no_filler, no_data, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Thin wrapper that forwards to the mapping's ->write_begin operation and
 * returns its result unchanged.
 */
int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        return mapping->a_ops->write_begin(file, mapping, pos, len, flags,
                                           pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);

/*
 * Thin wrapper that forwards to the mapping's ->write_end operation and
 * returns its result unchanged.
 */
int pagecache_write_end(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        return mapping->a_ops->write_end(file, mapping, pos, len, copied,
                                         page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 *
 * -EIO is always recorded in the mapping's wb_err; the log messages are
 * rate limited through a static ratelimit state.
 */
void dio_warn_stale_pagecache(struct file *filp)
{
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        struct inode *inode = file_inode(filp);
        char pathname[128];
        char *path;

        errseq_set(&inode->i_mapping->wb_err, -EIO);

        if (!__ratelimit(&_rs))
                return;

        path = file_path(filp, pathname, sizeof(pathname));
        if (IS_ERR(path))
                path = "(unknown)";

        pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
        pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
                current->comm);
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file        *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode        *inode = mapping->host;
        loff_t                pos = iocb->ki_pos;
        ssize_t                written;
        size_t                write_len;
        pgoff_t                end;

        write_len = iov_iter_count(from);
        end = (pos + write_len - 1) >> PAGE_SHIFT;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
                if (filemap_range_has_page(inode->i_mapping, pos,
                                           pos + write_len - 1))
                        return -EAGAIN;
        } else {
                written = filemap_write_and_wait_range(mapping, pos,
                                                        pos + write_len - 1);
                if (written)
                        goto out;
        }

        /*
         * After a write we want buffered reads to be sure to go to disk to get
         * the new data.  We invalidate clean cached page from the region we're
         * about to write.  We do this *before* the write so that we can return
         * without clobbering -EIOCBQUEUED from ->direct_IO().
         */
        written = invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_SHIFT, end);
        /*
         * If a page can not be invalidated, return 0 to fall back
         * to buffered write.
         */
        if (written) {
                if (written == -EBUSY)
                        return 0;
                goto out;
        }

        written = mapping->a_ops->direct_IO(iocb, from);

        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do,
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         *
         * Most of the time we do not need this since dio_complete() will do
         * the invalidation for us. However there are some file systems that
         * do not end up with dio_complete() being called, so let's not break
         * them by removing it completely.
         *
         * Noticeable example is a blkdev_direct_IO().
         *
         * Skip invalidation for async writes or if mapping has no pages.
         */
        if (written > 0 && mapping->nrpages &&
            invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
                dio_warn_stale_pagecache(file);

        if (written > 0) {
                pos += written;
                write_len -= written;
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
                }
                iocb->ki_pos = pos;
        }
        iov_iter_revert(from, write_len - iov_iter_count(from));
out:
        return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Find or create a page at the given pagecache position and return it
 * locked, or NULL if pagecache_get_page() yields no page.  This function
 * is specifically for buffered writes: the page is waited on with
 * wait_for_stable_page() before being returned.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
{
        int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT;
        struct page *page;

        /* Caller asked for no filesystem recursion during allocation. */
        if (flags & AOP_FLAG_NOFS)
                fgp_flags |= FGP_NOFS;

        page = pagecache_get_page(mapping, index, fgp_flags,
                                  mapping_gfp_mask(mapping));
        if (!page)
                return NULL;

        wait_for_stable_page(page);
        return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);

/*
 * generic_perform_write - copy data from an iov_iter into the page cache
 * @file: file being written
 * @i:    source of the data; advanced by the byte count each pass accepts
 * @pos:  file offset at which to start writing
 *
 * Works through @i in chunks that never cross a page boundary, bracketing
 * each copy with the mapping's ->write_begin() and ->write_end() hooks.
 * The loop stops early when the user memory cannot be faulted in (-EFAULT),
 * when a fatal signal is pending (-EINTR), or when either hook fails.
 *
 * Returns the number of bytes written if that is non-zero, otherwise the
 * last status value (zero or a negative error code).
 */
ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;
        unsigned int flags = 0;

        do {
                struct page *page;
                unsigned long offset;        /* Offset into pagecache page */
                unsigned long bytes;        /* Bytes to write to page */
                size_t copied;                /* Bytes copied from user */
                void *fsdata = NULL;

                /* Stay within the current page and within the remaining data. */
                offset = (pos & (PAGE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                                iov_iter_count(i));

again:
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 *
                 * Not only is this an optimisation, but it is also required
                 * to check that the address is actually valid, when atomic
                 * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                        break;
                }

                if (fatal_signal_pending(current)) {
                        status = -EINTR;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status < 0))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                flush_dcache_page(page);

                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
                        break;
                /* From here on, use the byte count returned by ->write_end(). */
                copied = status;

                cond_resched();

                iov_iter_advance(i, copied);
                if (unlikely(copied == 0)) {
                        /*
                         * If we were unable to copy any data at all, we must
                         * fall back to a single segment length write.
                         *
                         * If we didn't fallback here, we could livelock
                         * because not all segments in the iov can be copied at
                         * once without a pagefault.
                         */
                        bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                                iov_iter_single_seg_count(i));
                        goto again;
                }
                pos += copied;
                written += copied;

                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));

        /* A partial write wins over a later error. */
        return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:        IO state structure (file, offset, etc.)
 * @from:        iov_iter with data to write
 *
 * Performs the actual write: removes privileges (SUID) from the file,
 * updates its times and then hands the data to either the direct IO path
 * or the standard buffered path, depending on IOCB_DIRECT in @iocb.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        ssize_t written = 0;
        ssize_t buffered;
        ssize_t err;
        loff_t start, last;

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);

        err = file_remove_privs(file);
        if (!err)
                err = file_update_time(file);
        if (err)
                goto out;

        /* Plain buffered write: nothing more to do than advance the position. */
        if (!(iocb->ki_flags & IOCB_DIRECT)) {
                written = generic_perform_write(file, from, iocb->ki_pos);
                if (likely(written > 0))
                        iocb->ki_pos += written;
                goto out;
        }

        written = generic_file_direct_write(iocb, from);
        /*
         * If the write stopped short of completing, fall back to
         * buffered writes.  Some filesystems do this for writes to
         * holes, for example.  For DAX files, a buffered write will
         * not succeed (even if it did, DAX does not handle dirty
         * page-cache pages correctly).
         */
        if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
                goto out;

        start = iocb->ki_pos;
        buffered = generic_perform_write(file, from, start);
        /*
         * If generic_perform_write() returned a synchronous error
         * then we want to return the number of bytes which were
         * direct-written, or the error code if that was zero.  Note
         * that this differs from normal direct-io semantics, which
         * will return -EFOO even if some bytes were written.
         */
        if (unlikely(buffered < 0)) {
                err = buffered;
                goto out;
        }

        /*
         * We need to ensure that the page cache pages are written to
         * disk and invalidated to preserve the expected O_DIRECT
         * semantics.
         */
        last = start + buffered - 1;
        err = filemap_write_and_wait_range(mapping, start, last);
        if (err) {
                /*
                 * We don't know how much we wrote, so just return
                 * the number of bytes which were direct-written
                 */
                goto out;
        }

        iocb->ki_pos = last + 1;
        written += buffered;
        invalidate_mapping_pages(mapping,
                                 start >> PAGE_SHIFT,
                                 last >> PAGE_SHIFT);
out:
        current->backing_dev_info = NULL;
        return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:        IO state structure
 * @from:        iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes the inode lock around the write itself and, once
 * the lock has been dropped, takes care of syncing the file in case of an
 * O_SYNC file.
 *
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        ssize_t ret;

        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = __generic_file_write_iter(iocb, from);
        inode_unlock(inode);

        /* Nothing was written (or an error occurred): no sync needed. */
        if (ret <= 0)
                return ret;

        return generic_write_sync(iocb, ret);
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 *
 * @page: the page which the kernel is trying to free
 * @gfp_mask: memory allocation flags (and I/O mode)
 *
 * Asks the page's address_space to release any data it holds against the
 * page (presumably at page->private).  When the page has no mapping, or the
 * mapping supplies no ->releasepage() hook, try_to_free_buffers() is used
 * instead.  The page must be locked by the caller; a page under writeback
 * is never released.
 *
 * This may also be called if PG_fscache is set on a page, indicating that the
 * page is known to the local caching routines.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %1 if the release was successful, otherwise return zero.
 */
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
        struct address_space * const owner = page->mapping;

        BUG_ON(!PageLocked(page));

        if (PageWriteback(page))
                return 0;

        if (!owner || !owner->a_ops->releasepage)
                return try_to_free_buffers(page);

        return owner->a_ops->releasepage(page, gfp_mask);
}

EXPORT_SYMBOL(try_to_release_page);
















































































































































































































































































































































    2 
















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_TRACE_EVENT_H
#define _LINUX_TRACE_EVENT_H

#include <linux/ring_buffer.h>
#include <linux/trace_seq.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/perf_event.h>
#include <linux/tracepoint.h>

struct trace_array;
struct array_buffer;
struct tracer;
struct dentry;
struct bpf_prog;

const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
                                  unsigned long flags,
                                  const struct trace_print_flags *flag_array);

const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
                                    const struct trace_print_flags *symbol_array);

#if BITS_PER_LONG == 32
const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
                      unsigned long long flags,
                      const struct trace_print_flags_u64 *flag_array);

const char *trace_print_symbols_seq_u64(struct trace_seq *p,
                                        unsigned long long val,
                                        const struct trace_print_flags_u64
                                                                 *symbol_array);
#endif

const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
                                    unsigned int bitmask_size);

const char *trace_print_hex_seq(struct trace_seq *p,
                                const unsigned char *buf, int len,
                                bool concatenate);

const char *trace_print_array_seq(struct trace_seq *p,
                                   const void *buf, int count,
                                   size_t el_size);

const char *
trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str,
                         int prefix_type, int rowsize, int groupsize,
                         const void *buf, size_t len, bool ascii);

struct trace_iterator;
struct trace_event;

int trace_raw_output_prep(struct trace_iterator *iter,
                          struct trace_event *event);
extern __printf(2, 3)
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);

/*
 * The trace entry - the most basic unit of tracing. This is what
 * is printed in the end as a single line in the trace output, such as:
 *
 *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
 */
struct trace_entry {
        /* The width of this field determines TRACE_EVENT_TYPE_MAX. */
        unsigned short                type;
        unsigned char                flags;
        unsigned char                preempt_count;
        int                        pid;
};

/*
 * Largest value that fits in trace_entry.type: (1 << number-of-bits) - 1,
 * where the bit count is taken as sizeof(type) * 8.  The null-pointer cast
 * is only used inside sizeof(), so nothing is dereferenced.
 */
#define TRACE_EVENT_TYPE_MAX                                                \
        ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)

/*
 * Trace iterator - used by the printout routines that present trace
 * results to users; those routines might sleep, etc.
 */
struct trace_iterator {
        struct trace_array        *tr;
        struct tracer                *trace;
        struct array_buffer        *array_buffer;
        void                        *private;
        int                        cpu_file;
        struct mutex                mutex;
        struct ring_buffer_iter        **buffer_iter;
        unsigned long                iter_flags;
        void                        *temp;        /* temp holder */
        unsigned int                temp_size;
        char                        *fmt;        /* modified format holder */
        unsigned int                fmt_size;

        /* trace_seq for __print_flags() and __print_symbolic() etc. */
        struct trace_seq        tmp_seq;

        cpumask_var_t                started;

        /* True when the currently open file is the snapshot */
        bool                        snapshot;

        /* The below is zeroed out in pipe_read */
        struct trace_seq        seq;
        struct trace_entry        *ent;
        unsigned long                lost_events;
        int                        leftover;
        int                        ent_size;
        int                        cpu;
        u64                        ts;

        loff_t                        pos;
        long                        idx;

        /* Any new fields added here will be zeroed out in pipe_read */
};

/*
 * Iterator flag bits; each value is a distinct power of two so they can be
 * combined.  NOTE(review): presumably stored in trace_iterator.iter_flags -
 * confirm against the users.
 */
enum trace_iter_flags {
        TRACE_FILE_LAT_FMT        = 1,
        TRACE_FILE_ANNOTATE        = 2,
        TRACE_FILE_TIME_IN_NS        = 4,
};


/*
 * Signature of an event output callback: it is given the iterator, a flags
 * word and the trace_event, and reports the outcome as an enum print_line_t.
 */
typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
                                      int flags, struct trace_event *event);

/* One output callback per output format (trace, raw, hex, binary). */
struct trace_event_functions {
        trace_print_func        trace;
        trace_print_func        raw;
        trace_print_func        hex;
        trace_print_func        binary;
};

/*
 * An event type together with its output callbacks.  Instances are handed
 * to register_trace_event() / unregister_trace_event(), declared below.
 */
struct trace_event {
        struct hlist_node                node;
        struct list_head                list;
        int                                type;
        /* Output callbacks for this event, one per output format */
        struct trace_event_functions        *funcs;
};

extern int register_trace_event(struct trace_event *event);
extern int unregister_trace_event(struct trace_event *event);

/*
 * Return values for print_line callback.  This is also the return type of
 * the trace_print_func output callbacks and of trace_handle_return().
 */
enum print_line_t {
        TRACE_TYPE_PARTIAL_LINE        = 0,        /* Retry after flushing the seq */
        TRACE_TYPE_HANDLED        = 1,
        TRACE_TYPE_UNHANDLED        = 2,        /* Relay to other output functions */
        TRACE_TYPE_NO_CONSUME        = 3        /* Handled but ask to not consume */
};

enum print_line_t trace_handle_return(struct trace_seq *s);

void tracing_generic_entry_update(struct trace_entry *entry,
                                  unsigned short type,
                                  unsigned long flags,
                                  int pc);
struct trace_event_file;

struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer,
                                struct trace_event_file *trace_file,
                                int type, unsigned long len,
                                unsigned long flags, int pc);

/*
 * Single-bit flags (bits 0 and 1), so they can be OR-ed together.
 * NOTE(review): presumably the @flags values accepted by
 * tracing_record_taskinfo() and tracing_record_taskinfo_sched_switch(),
 * declared just below - confirm against the implementation.
 */
#define TRACE_RECORD_CMDLINE        BIT(0)
#define TRACE_RECORD_TGID        BIT(1)

void tracing_record_taskinfo(struct task_struct *task, int flags);
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
                                          struct task_struct *next, int flags);

void tracing_record_cmdline(struct task_struct *task);
void tracing_record_tgid(struct task_struct *task);

int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);

struct event_filter;

/*
 * Operation codes passed as the @type argument of the ->reg() callback in
 * struct trace_event_class and of trace_event_reg().  The PERF_* codes are
 * only defined when CONFIG_PERF_EVENTS is enabled.
 */
enum trace_reg {
        TRACE_REG_REGISTER,
        TRACE_REG_UNREGISTER,
#ifdef CONFIG_PERF_EVENTS
        TRACE_REG_PERF_REGISTER,
        TRACE_REG_PERF_UNREGISTER,
        TRACE_REG_PERF_OPEN,
        TRACE_REG_PERF_CLOSE,
        /*
         * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
         * custom action was taken and the default action is not to be
         * performed.
         */
        TRACE_REG_PERF_ADD,
        TRACE_REG_PERF_DEL,
#endif
};

struct trace_event_call;

/*
 * An all-ones pointer value used as a special marker rather than a real
 * string.  NOTE(review): it looks like a trace_event_fields.type equal to
 * this marker selects the define_fields member of the union below -
 * confirm against the code that walks the fields array.
 */
#define TRACE_FUNCTION_TYPE ((const char *)~0UL)

/*
 * Description of one event field.  The anonymous union overlays a static
 * description (name, size, align, is_signed, filter_type) with a
 * define_fields callback; only one of the two forms is meaningful for a
 * given entry.
 */
struct trace_event_fields {
        const char *type;
        union {
                struct {
                        const char *name;
                        const int  size;
                        const int  align;
                        const int  is_signed;
                        const int  filter_type;
                };
                int (*define_fields)(struct trace_event_call *);
        };
};

/*
 * State and callbacks shared through trace_event_call.class.
 */
struct trace_event_class {
        const char                *system;
        void                        *probe;
#ifdef CONFIG_PERF_EVENTS
        void                        *perf_probe;
#endif
        /* Called with an enum trace_reg operation code in @type */
        int                        (*reg)(struct trace_event_call *event,
                                       enum trace_reg type, void *data);
        struct trace_event_fields *fields_array;
        /* Optional: when NULL, trace_get_fields() returns &fields instead */
        struct list_head        *(*get_fields)(struct trace_event_call *);
        struct list_head        fields;
        int                        (*raw_init)(struct trace_event_call *);
};

extern int trace_event_reg(struct trace_event_call *event,
                            enum trace_reg type, void *data);

/*
 * Working state handed to trace_event_buffer_reserve() and
 * trace_event_buffer_commit(), both of which take a pointer to this
 * structure (see their declarations below).
 */
struct trace_event_buffer {
        struct trace_buffer                *buffer;
        struct ring_buffer_event        *event;
        struct trace_event_file                *trace_file;
        void                                *entry;
        unsigned long                        flags;
        int                                pc;
        struct pt_regs                        *regs;
};

void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
                                  struct trace_event_file *trace_file,
                                  unsigned long len);

void trace_event_buffer_commit(struct trace_event_buffer *fbuffer);

/*
 * Bit positions (0..6, in declaration order) of the trace event flags.
 * The TRACE_EVENT_FL_* masks defined below are (1 << <name>_BIT).
 */
enum {
        TRACE_EVENT_FL_FILTERED_BIT,
        TRACE_EVENT_FL_CAP_ANY_BIT,
        TRACE_EVENT_FL_NO_SET_FILTER_BIT,
        TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
        TRACE_EVENT_FL_TRACEPOINT_BIT,
        TRACE_EVENT_FL_KPROBE_BIT,
        TRACE_EVENT_FL_UPROBE_BIT,
};

/*
 * Event flags:
 *  FILTERED      - The event has a filter attached
 *  CAP_ANY       - Any user can enable for perf
 *  NO_SET_FILTER - Set when filter has error and is to be ignored
 *  IGNORE_ENABLE - For trace internal events, do not enable with debugfs file
 *  TRACEPOINT    - Event is a tracepoint
 *  KPROBE        - Event is a kprobe
 *  UPROBE        - Event is a uprobe
 */
enum {
        TRACE_EVENT_FL_FILTERED                = (1 << TRACE_EVENT_FL_FILTERED_BIT),
        TRACE_EVENT_FL_CAP_ANY                = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
        TRACE_EVENT_FL_NO_SET_FILTER        = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
        TRACE_EVENT_FL_IGNORE_ENABLE        = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
        TRACE_EVENT_FL_TRACEPOINT        = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
        TRACE_EVENT_FL_KPROBE                = (1 << TRACE_EVENT_FL_KPROBE_BIT),
        TRACE_EVENT_FL_UPROBE                = (1 << TRACE_EVENT_FL_UPROBE_BIT),
};

/* Mask that matches an event carrying either the KPROBE or the UPROBE flag */
#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)

/*
 * A single trace event.  The anonymous union holds either the event's name
 * or, when TRACE_EVENT_FL_TRACEPOINT is set in @flags, a pointer to its
 * tracepoint; use trace_event_name() to get the name in both cases.
 */
struct trace_event_call {
        struct list_head        list;
        struct trace_event_class *class;
        union {
                const char                *name;
                /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
                struct tracepoint        *tp;
        };
        struct trace_event        event;
        char                        *print_fmt;
        struct event_filter        *filter;
        void                        *mod;
        void                        *data;
        /*
         * Bit layout, matching the TRACE_EVENT_FL_*_BIT enum:
         *   bit 0:                filter_active
         *   bit 1:                allow trace by non root (cap any)
         *   bit 2:                failed to apply filter
         *   bit 3:                trace internal event (do not enable)
         *   bit 4:                Event is a tracepoint
         *   bit 5:                Event is a kprobe
         *   bit 6:                Event is a uprobe
         */
        int                        flags; /* static flags of different events */

#ifdef CONFIG_PERF_EVENTS
        int                                perf_refcount;
        struct hlist_head __percpu        *perf_events;
        struct bpf_prog_array __rcu        *prog_array;

        int        (*perf_perm)(struct trace_event_call *,
                             struct perf_event *);
#endif
};

#ifdef CONFIG_PERF_EVENTS
/*
 * Report whether @call currently has a prog_array attached.
 *
 * This is a heuristic used in various places outside of
 * rcu_read_lock/unlock to speed up execution.
 *
 * If it returns true and call->prog_array later turns out to be NULL
 * inside the rcu_read_lock/unlock region, the caller bails out then.
 * If it returns false, a few events may be missed that would have been
 * seen had the check been delayed until inside the rcu_read_lock/unlock
 * region, with call->prog_array becoming non-NULL in the meantime.
 *
 * READ_ONCE() is used instead of rcu_access_pointer() because
 * rcu_access_pointer() requires the actual definition of
 * "struct bpf_prog_array" while READ_ONCE() only needs a declaration
 * of the same type.
 */
static inline bool bpf_prog_array_valid(struct trace_event_call *call)
{
        return READ_ONCE(call->prog_array) != NULL;
}
#endif

/*
 * Return the name of @call.  Events flagged TRACE_EVENT_FL_TRACEPOINT keep
 * their name in the attached tracepoint (NULL is returned if there is no
 * tracepoint); all other events store the name directly.
 */
static inline const char *
trace_event_name(struct trace_event_call *call)
{
        if (!(call->flags & TRACE_EVENT_FL_TRACEPOINT))
                return call->name;

        if (!call->tp)
                return NULL;

        return call->tp->name;
}

static inline struct list_head *
trace_get_fields(struct trace_event_call *event_call)
{
        if (!event_call->class->get_fields)
                return &event_call->class->fields;
        return event_call->class->get_fields(event_call);
}

struct trace_array;
struct trace_subsystem_dir;

/*
 * Bit positions of the event file flags, numbered from 0 in declaration
 * order.  The EVENT_FILE_FL_* masks further down in this header are built
 * as (1 << <name>_BIT); the "Event file flags" comment there describes
 * what each flag means.
 */
enum {
        EVENT_FILE_FL_ENABLED_BIT,
        EVENT_FILE_FL_RECORDED_CMD_BIT,
        EVENT_FILE_FL_RECORDED_TGID_BIT,
        EVENT_FILE_FL_FILTERED_BIT,
        EVENT_FILE_FL_NO_SET_FILTER_BIT,
        EVENT_FILE_FL_SOFT_MODE_BIT,
        EVENT_FILE_FL_SOFT_DISABLED_BIT,
        EVENT_FILE_FL_TRIGGER_MODE_BIT,
        EVENT_FILE_FL_TRIGGER_COND_BIT,
        EVENT_FILE_FL_PID_FILTER_BIT,
        EVENT_FILE_FL_WAS_ENABLED_BIT,
        EVENT_FILE_FL_FREED_BIT,
};

extern struct trace_event_file *trace_get_event_file(const char *instance,
                                                     const char *system,
                                                     const char *event);
extern void trace_put_event_file(struct trace_event_file *file);

/*
 * NOTE(review): presumably the upper bound, in bytes, for the command
 * buffer handed to the *_event_cmd_init() helpers - confirm with callers.
 */
#define MAX_DYNEVENT_CMD_LEN        (2048)

/*
 * Kind of dynamic event a struct dynevent_cmd describes (its @type field).
 * Numbering starts at 1, so SYNTH = 1, KPROBE = 2 and NONE = 3.
 */
enum dynevent_type {
        DYNEVENT_TYPE_SYNTH = 1,
        DYNEVENT_TYPE_KPROBE,
        DYNEVENT_TYPE_NONE,
};

struct dynevent_cmd;

/* Callback that acts on a dynevent_cmd and reports the result as an int. */
typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd);

/*
 * A dynamic event command under construction.
 * NOTE(review): @run_command is presumably what dynevent_create() invokes
 * once the command text in @seq is complete - confirm in the implementation.
 */
struct dynevent_cmd {
        struct seq_buf                seq;
        const char                *event_name;
        unsigned int                n_fields;
        enum dynevent_type        type;
        dynevent_create_fn_t        run_command;
        void                        *private_data;
};

extern int dynevent_create(struct dynevent_cmd *cmd);

extern int synth_event_delete(const char *name);

extern void synth_event_cmd_init(struct dynevent_cmd *cmd,
                                 char *buf, int maxlen);

extern int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd,
                                       const char *name,
                                       struct module *mod, ...);

/*
 * Convenience wrapper around __synth_event_gen_cmd_start() that appends a
 * terminating NULL after the caller's variadic arguments, so callers do not
 * supply the end marker themselves.
 */
#define synth_event_gen_cmd_start(cmd, name, mod, ...)        \
        __synth_event_gen_cmd_start(cmd, name, mod, ## __VA_ARGS__, NULL)

/*
 * Type/name string pair describing one field; arrays of these are taken by
 * synth_event_gen_cmd_array_start(), synth_event_create() and
 * synth_event_add_fields() together with an element count.
 */
struct synth_field_desc {
        const char *type;
        const char *name;
};

extern int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd,
                                           const char *name,
                                           struct module *mod,
                                           struct synth_field_desc *fields,
                                           unsigned int n_fields);
extern int synth_event_create(const char *name,
                              struct synth_field_desc *fields,
                              unsigned int n_fields, struct module *mod);

extern int synth_event_add_field(struct dynevent_cmd *cmd,
                                 const char *type,
                                 const char *name);
extern int synth_event_add_field_str(struct dynevent_cmd *cmd,
                                     const char *type_name);
extern int synth_event_add_fields(struct dynevent_cmd *cmd,
                                  struct synth_field_desc *fields,
                                  unsigned int n_fields);

/* Finish a synthetic event command: expands to dynevent_create(cmd). */
#define synth_event_gen_cmd_end(cmd)        \
        dynevent_create(cmd)

struct synth_event;

/*
 * State carried across a piecewise synthetic event trace: it is passed to
 * synth_event_trace_start(), synth_event_add_next_val(),
 * synth_event_add_val() and synth_event_trace_end(), declared below.
 * NOTE(review): @add_next / @add_name presumably record which of the two
 * add-value styles is in use - confirm in the implementation.
 */
struct synth_event_trace_state {
        struct trace_event_buffer fbuffer;
        struct synth_trace_event *entry;
        struct trace_buffer *buffer;
        struct synth_event *event;
        unsigned int cur_field;
        unsigned int n_u64;
        bool disabled;
        bool add_next;
        bool add_name;
};

extern int synth_event_trace(struct trace_event_file *file,
                             unsigned int n_vals, ...);
extern int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
                                   unsigned int n_vals);
extern int synth_event_trace_start(struct trace_event_file *file,
                                   struct synth_event_trace_state *trace_state);
extern int synth_event_add_next_val(u64 val,
                                    struct synth_event_trace_state *trace_state);
extern int synth_event_add_val(const char *field_name, u64 val,
                               struct synth_event_trace_state *trace_state);
extern int synth_event_trace_end(struct synth_event_trace_state *trace_state);

extern int kprobe_event_delete(const char *name);

extern void kprobe_event_cmd_init(struct dynevent_cmd *cmd,
                                  char *buf, int maxlen);

#define kprobe_event_gen_cmd_start(cmd, name, loc, ...)                        \
        __kprobe_event_gen_cmd_start(cmd, false, name, loc, ## __VA_ARGS__, NULL)

#define kretprobe_event_gen_cmd_start(cmd, name, loc, ...)                \
        __kprobe_event_gen_cmd_start(cmd, true, name, loc, ## __VA_ARGS__, NULL)

extern int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd,
                                        bool kretprobe,
                                        const char *name,
                                        const char *loc, ...);

/*
 * Add fields to a kprobe event command.  kprobe_event_add_fields()
 * forwards any number of arguments, kprobe_event_add_field() exactly
 * one; in both cases the list given to __kprobe_event_add_fields() ends
 * with a NULL terminator supplied here.
 */
#define kprobe_event_add_fields(cmd, args...)                                \
        __kprobe_event_add_fields(cmd, ## args, NULL)

#define kprobe_event_add_field(cmd, field)                                \
        __kprobe_event_add_fields(cmd, field, NULL)

int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);

/* kprobe and kretprobe commands are both completed by dynevent_create(). */
#define kprobe_event_gen_cmd_end(cmd) dynevent_create(cmd)
#define kretprobe_event_gen_cmd_end(cmd) dynevent_create(cmd)

/*
 * Event file flags:
 *  ENABLED          - The event is enabled
 *  RECORDED_CMD  - The comms should be recorded at sched_switch
 *  RECORDED_TGID - The tgids should be recorded at sched_switch
 *  FILTERED          - The event has a filter attached
 *  NO_SET_FILTER - Set when filter has error and is to be ignored
 *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
 *  SOFT_DISABLED - When set, do not trace the event (even though its
 *                   tracepoint may be enabled)
 *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
 *  TRIGGER_COND  - When set, one or more triggers has an associated filter
 *  PID_FILTER    - When set, the event is filtered based on pid
 *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
 *  FREED         - File descriptor is freed, all fields should be considered invalid
 *
 * Each constant below is the mask form (1 << n) of the matching
 * EVENT_FILE_FL_*_BIT index, for testing against trace_event_file.flags.
 */
enum {
        EVENT_FILE_FL_ENABLED                = (1 << EVENT_FILE_FL_ENABLED_BIT),
        EVENT_FILE_FL_RECORDED_CMD        = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT),
        EVENT_FILE_FL_RECORDED_TGID        = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT),
        EVENT_FILE_FL_FILTERED                = (1 << EVENT_FILE_FL_FILTERED_BIT),
        EVENT_FILE_FL_NO_SET_FILTER        = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
        EVENT_FILE_FL_SOFT_MODE                = (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
        EVENT_FILE_FL_SOFT_DISABLED        = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
        EVENT_FILE_FL_TRIGGER_MODE        = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
        EVENT_FILE_FL_TRIGGER_COND        = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
        EVENT_FILE_FL_PID_FILTER        = (1 << EVENT_FILE_FL_PID_FILTER_BIT),
        EVENT_FILE_FL_WAS_ENABLED        = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
        EVENT_FILE_FL_FREED                = (1 << EVENT_FILE_FL_FREED_BIT),
};

/*
 * Per-file state of a trace event: the event call it belongs to, the
 * trace array and subsystem directory it sits in, its filter, its
 * trigger list, the flag word and three reference counters.
 */
struct trace_event_file {
        struct list_head                list;
        struct trace_event_call                *event_call;
        struct event_filter __rcu        *filter;
        struct dentry                        *dir;
        struct trace_array                *tr;
        struct trace_subsystem_dir        *system;
        struct list_head                triggers;

        /*
         * 32 bit flags:
         *   bit 0:                enabled
         *   bit 1:                enabled cmd record
         *   bit 2:                enable/disable with the soft disable bit
         *   bit 3:                soft disabled
         *   bit 4:                trigger enabled
         *
         * NOTE(review): this list names five bits, while the
         * EVENT_FILE_FL_* masks in this header cover twelve flags;
         * treat the EVENT_FILE_FL_*_BIT definitions as authoritative
         * for bit positions.
         *
         * Note: The bits must be set atomically to prevent races
         * from other writers. Reads of flags do not need to be in
         * sync as they occur in critical sections. But the way flags
         * is currently used, these changes do not affect the code
         * except that when a change is made, it may have a slight
         * delay in propagating the changes to other CPUs due to
         * caching and such. Which is mostly OK ;-)
         */
        unsigned long                flags;
        atomic_t                ref;        /* ref count for opened files */
        atomic_t                sm_ref;        /* soft-mode reference counter */
        atomic_t                tm_ref;        /* trigger-mode reference counter */
};

/*
 * Define an __init function trace_init_flags_<name>() that ORs @value
 * into event_<name>.flags and returns 0, and register it with
 * early_initcall().
 */
#define __TRACE_EVENT_FLAGS(name, value)                                \
        static int __init trace_init_flags_##name(void)                        \
        {                                                                \
                event_##name.flags |= value;                                \
                return 0;                                                \
        }                                                                \
        early_initcall(trace_init_flags_##name);

/*
 * Define perf_perm_<name>(tp_event, p_event), which returns the value of
 * @expr evaluated as a statement expression (so @expr may refer to both
 * parameters), plus an early initcall trace_init_perf_perm_<name>() that
 * stores its address in event_<name>.perf_perm.
 */
#define __TRACE_EVENT_PERF_PERM(name, expr...)                                \
        static int perf_perm_##name(struct trace_event_call *tp_event, \
                                    struct perf_event *p_event)                \
        {                                                                \
                return ({ expr; });                                        \
        }                                                                \
        static int __init trace_init_perf_perm_##name(void)                \
        {                                                                \
                event_##name.perf_perm = &perf_perm_##name;                \
                return 0;                                                \
        }                                                                \
        early_initcall(trace_init_perf_perm_##name);

/* NOTE(review): presumably the size limit, in bytes, of one perf trace record - confirm at the users. */
#define PERF_MAX_TRACE_SIZE        8192

#define MAX_FILTER_STR_VAL        256U        /* Should handle KSYM_SYMBOL_LEN */

/*
 * Event trigger types.  Apart from ETT_NONE every type has a distinct
 * single bit, so several types can be held in one value.
 */
enum event_trigger_type {
        ETT_NONE                = 0x00,
        ETT_TRACE_ONOFF                = 0x01,
        ETT_SNAPSHOT                = 0x02,
        ETT_STACKTRACE                = 0x04,
        ETT_EVENT_ENABLE        = 0x08,
        ETT_EVENT_HIST                = 0x10,
        ETT_HIST_ENABLE                = 0x20,
};

/* Event filter and trigger hooks; the implementations are not in this header. */
extern int filter_match_preds(struct event_filter *filter, void *rec);

/*
 * event_triggers_call() returns an enum event_trigger_type value and
 * event_triggers_post_call() accepts one as @tt.
 * NOTE(review): presumably the value returned by the first is meant to be
 * passed to the second - confirm against the callers.
 */
extern enum event_trigger_type
event_triggers_call(struct trace_event_file *file, void *rec,
                    struct ring_buffer_event *event);
extern void
event_triggers_post_call(struct trace_event_file *file,
                         enum event_trigger_type tt);

/* Called by trace_trigger_soft_disabled() when EVENT_FILE_FL_PID_FILTER is set. */
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);

/**
 * trace_trigger_soft_disabled - do triggers and test if soft disabled
 * @file: The file pointer of the event to test
 *
 * Works on a single snapshot of @file->flags.  When TRIGGER_COND is set
 * nothing is done here and false is returned.  Otherwise, triggers are
 * invoked (with no record and no ring buffer event) if TRIGGER_MODE is
 * set; then true is returned for a soft disabled event, and for a pid
 * filtered event the answer of trace_event_ignore_this_pid() is
 * returned.  In every other case the result is false.
 */
static inline bool
trace_trigger_soft_disabled(struct trace_event_file *file)
{
        const unsigned long snapshot = file->flags;

        /* Triggers with filters need the record; leave everything to the caller. */
        if (snapshot & EVENT_FILE_FL_TRIGGER_COND)
                return false;

        if (snapshot & EVENT_FILE_FL_TRIGGER_MODE)
                event_triggers_call(file, NULL, NULL);

        if (snapshot & EVENT_FILE_FL_SOFT_DISABLED)
                return true;

        if (!(snapshot & EVENT_FILE_FL_PID_FILTER))
                return false;

        return trace_event_ignore_this_pid(file);
}

#ifdef CONFIG_BPF_EVENTS
/*
 * BPF hooks available when CONFIG_BPF_EVENTS is set.  The #else branch
 * below provides static inline stubs with the same signatures.
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
void perf_event_detach_bpf_prog(struct perf_event *event);
int perf_event_query_prog_array(struct perf_event *event, void __user *info);
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
                            u32 *fd_type, const char **buf,
                            u64 *probe_offset, u64 *probe_addr);
#else
/* !CONFIG_BPF_EVENTS stub: ignores both arguments and always returns 1. */
static inline unsigned int
trace_call_bpf(struct trace_event_call *call, void *ctx)
{
        return 1U;
}

/* !CONFIG_BPF_EVENTS stub: attaching is unsupported, fails with -EOPNOTSUPP. */
static inline int perf_event_attach_bpf_prog(struct perf_event *event,
                                             struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_EVENTS stub: intentionally does nothing. */
static inline void perf_event_detach_bpf_prog(struct perf_event *event)
{
}

/* !CONFIG_BPF_EVENTS stub: querying is unsupported, fails with -EOPNOTSUPP. */
static inline int perf_event_query_prog_array(struct perf_event *event,
                                              void __user *info)
{
        return -EOPNOTSUPP;
}
/* !CONFIG_BPF_EVENTS stub: registration is unsupported, fails with -EOPNOTSUPP. */
static inline int
bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
{
        return -EOPNOTSUPP;
}
/* !CONFIG_BPF_EVENTS stub: unregistration is unsupported, fails with -EOPNOTSUPP. */
static inline int
bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
{
        return -EOPNOTSUPP;
}
/* !CONFIG_BPF_EVENTS stub: no raw tracepoint is ever found, returns NULL. */
static inline struct bpf_raw_event_map *
bpf_get_raw_tracepoint(const char *name)
{
        return NULL;
}
/* !CONFIG_BPF_EVENTS stub: intentionally does nothing. */
static inline void
bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
}
/*
 * !CONFIG_BPF_EVENTS stub: fails with -EOPNOTSUPP and writes none of the
 * output parameters.
 */
static inline int
bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
                        u32 *fd_type, const char **buf,
                        u64 *probe_offset, u64 *probe_addr)
{
        return -EOPNOTSUPP;
}
#endif

/*
 * Filter type identifiers, numbered consecutively from 0.
 * NOTE(review): presumably the values accepted as the filter_type
 * argument of trace_define_field() - confirm at the callers.
 */
enum {
        FILTER_OTHER                = 0,
        FILTER_STATIC_STRING        = 1,
        FILTER_DYN_STRING        = 2,
        FILTER_PTR_STRING        = 3,
        FILTER_TRACE_FN                = 4,
        FILTER_COMM                = 5,
        FILTER_CPU                = 6,
};

/*
 * Event call setup, field definition and registration.  The
 * implementations are not part of this header.
 */
extern int trace_event_raw_init(struct trace_event_call *call);
extern int trace_define_field(struct trace_event_call *call, const char *type,
                              const char *name, int offset, int size,
                              int is_signed, int filter_type);
extern int trace_add_event_call(struct trace_event_call *call);
extern int trace_remove_event_call(struct trace_event_call *call);
extern int trace_event_get_offsets(struct trace_event_call *call);

/*
 * NOTE(review): @set / @enable presumably choose between enabling and
 * disabling the matching event(s) - confirm in the implementation.
 */
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
int trace_set_clr_event(const char *system, const char *event, int set);
int trace_array_set_clr_event(struct trace_array *tr, const char *system,
                const char *event, bool enable);
/*
 * event_trace_printk - printk-style output into the trace from an event
 * @ip:   passed through as the first argument of the print helper
 * @fmt:  format string
 * @args: optional format arguments
 *
 * Checks the format with __trace_printk_check_format() and records the
 * current task's cmdline.  A compile-time constant @fmt is stored in a
 * static pointer placed in the "__trace_printk_fmt" section and printed
 * with __trace_bprintk(); any other @fmt goes through __trace_printk().
 *
 * The double __builtin_constant_p is because gcc will give us an error
 * if we try to allocate the static variable to fmt if it is not a
 * constant. Even with the outer if statement optimizing out.
 */
#define event_trace_printk(ip, fmt, args...)                                \
do {                                                                        \
        __trace_printk_check_format(fmt, ##args);                        \
        tracing_record_cmdline(current);                                \
        if (__builtin_constant_p(fmt)) {                                \
                static const char *trace_printk_fmt                        \
                  __section("__trace_printk_fmt") =                        \
                        __builtin_constant_p(fmt) ? fmt : NULL;                \
                                                                        \
                __trace_bprintk(ip, trace_printk_fmt, ##args);                \
        } else                                                                \
                __trace_printk(ip, fmt, ##args);                        \
} while (0)

#ifdef CONFIG_PERF_EVENTS
/*
 * perf <-> trace event glue, only declared under CONFIG_PERF_EVENTS.
 * The implementations are not part of this header.
 */
struct perf_event;

DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);

/* Generic trace event hooks; each operates on one perf_event. */
extern int  perf_trace_init(struct perf_event *event);
extern void perf_trace_destroy(struct perf_event *event);
extern int  perf_trace_add(struct perf_event *event, int flags);
extern void perf_trace_del(struct perf_event *event, int flags);
#ifdef CONFIG_KPROBE_EVENTS
/* kprobe-backed perf events; only with CONFIG_KPROBE_EVENTS. */
extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_kprobe_destroy(struct perf_event *event);
extern int bpf_get_kprobe_info(const struct perf_event *event,
                               u32 *fd_type, const char **symbol,
                               u64 *probe_offset, u64 *probe_addr,
                               bool perf_type_tracepoint);
#endif
#ifdef CONFIG_UPROBE_EVENTS
/* uprobe-backed perf events; only with CONFIG_UPROBE_EVENTS. */
extern int  perf_uprobe_init(struct perf_event *event,
                             unsigned long ref_ctr_offset, bool is_retprobe);
extern void perf_uprobe_destroy(struct perf_event *event);
extern int bpf_get_uprobe_info(const struct perf_event *event,
                               u32 *fd_type, const char **filename,
                               u64 *probe_offset, u64 *probe_addr,
                               bool perf_type_tracepoint);
#endif
extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                     char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);

/*
 * bpf_trace_run<N>() takes a BPF program followed by exactly N u64
 * arguments; one prototype exists for every N from 1 to 12.
 */
void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3);
void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4);
void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4, u64 arg5);
void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
                    u64 arg8);
void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
                    u64 arg8, u64 arg9);
void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
                     u64 arg8, u64 arg9, u64 arg10);
void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
                     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
                     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
/*
 * NOTE(review): judging by the name, this presumably runs any attached BPF
 * program before submitting the record to perf - confirm in the
 * implementation.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task);

/*
 * Thin wrapper around perf_tp_event(): forwards every argument unchanged,
 * only in the order perf_tp_event() expects (type and count first, the
 * recursion context and task last).
 */
static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx,
                                         u16 type, u64 count,
                                         struct pt_regs *regs, void *head,
                                         struct task_struct *task)
{
        perf_tp_event(type, count,
                      raw_data, size,
                      regs, head,
                      rctx, task);
}

#endif

#endif /* _LINUX_TRACE_EVENT_H */































































































































































    3 









    1 




    1 




    1 














    1 














    3 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.h
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 */

#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H

/*
 * es_debug() reports extent status operations.  It goes through printk()
 * only when ES_DEBUG__ is defined; in all other builds it expands to
 * no_printk() with the same arguments.
 */
#ifndef ES_DEBUG__
#define es_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...)        printk(fmt, ##__VA_ARGS__)
#endif

/*
 * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
 * checked with old map_block's result.
 *
 * NOTE(review): the name defined below ends in "__", so it is not the
 * ES_AGGRESSIVE_TEST spelling mentioned above; presumably the trailing
 * underscores keep the checks switched off and have to be dropped to
 * turn them on - confirm against the #ifdef users.
 */
#define ES_AGGRESSIVE_TEST__

/*
 * Bit indices of the status flags kept in the high bits of
 * extent_status.es_pblk.  ES_FLAGS comes last and therefore equals the
 * number of flag bits.
 */
enum {
        ES_WRITTEN_B        = 0,
        ES_UNWRITTEN_B        = 1,
        ES_DELAYED_B        = 2,
        ES_HOLE_B        = 3,
        ES_REFERENCED_B        = 4,
        ES_FLAGS        = 5
};

/*
 * ES_SHIFT is the width of ext4_fsblk_t in bits minus the ES_FLAGS status
 * bits, i.e. the position of the lowest status bit inside es_pblk.
 * ES_MASK has every bit from ES_SHIFT upwards set and so selects the
 * status bits; ~ES_MASK selects the physical block number.
 */
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)

/* Status flag values as seen after shifting es_pblk down by ES_SHIFT. */
#define EXTENT_STATUS_WRITTEN        (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED        (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE        (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED        (1 << ES_REFERENCED_B)

/*
 * The four extent type flags, already shifted into their es_pblk
 * position.  EXTENT_STATUS_REFERENCED is deliberately not part of it.
 */
#define ES_TYPE_MASK        ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
                          EXTENT_STATUS_UNWRITTEN | \
                          EXTENT_STATUS_DELAYED | \
                          EXTENT_STATUS_HOLE) << ES_SHIFT)

struct ext4_sb_info;
struct ext4_extent;

/*
 * One extent in the extents status tree.  es_pblk carries two things:
 * the status flags in its top ES_FLAGS bits and the physical block
 * number in the remaining low bits (see ES_MASK / ES_SHIFT).
 */
struct extent_status {
        struct rb_node rb_node;
        ext4_lblk_t es_lblk;        /* first logical block extent covers */
        ext4_lblk_t es_len;        /* length of extent in block */
        ext4_fsblk_t es_pblk;        /* first physical block */
};

/* Red-black tree of extent_status entries plus a one-entry lookup cache. */
struct ext4_es_tree {
        struct rb_root root;
        struct extent_status *cache_es;        /* recently accessed extent */
};

/*
 * Extent status statistics.
 * NOTE(review): the member notes are inferred from the names only and the
 * time unit of the scan times is not visible here - confirm in the
 * shrinker code.
 */
struct ext4_es_stats {
        unsigned long es_stats_shrunk;        /* presumably entries reclaimed */
        struct percpu_counter es_stats_cache_hits;
        struct percpu_counter es_stats_cache_misses;
        u64 es_stats_scan_time;
        u64 es_stats_max_scan_time;
        struct percpu_counter es_stats_all_cnt;
        struct percpu_counter es_stats_shk_cnt;
};

/*
 * Pending cluster reservations for bigalloc file systems
 *
 * A cluster with a pending reservation is a logical cluster shared by at
 * least one extent in the extents status tree with delayed and unwritten
 * status and at least one other written or unwritten extent.  The
 * reservation is said to be pending because a cluster reservation would
 * have to be taken in the event all blocks in the cluster shared with
 * written or unwritten extents were deleted while the delayed and
 * unwritten blocks remained.
 *
 * The set of pending cluster reservations is an auxiliary data structure
 * used with the extents status tree to implement reserved cluster/block
 * accounting for bigalloc file systems.  The set is kept in memory and
 * records all pending cluster reservations.
 *
 * Its primary function is to avoid the need to read extents from the
 * disk when invalidating pages as a result of a truncate, punch hole, or
 * collapse range operation.  Page invalidation requires a decrease in the
 * reserved cluster count if it results in the removal of all delayed
 * and unwritten extents (blocks) from a cluster that is not shared with a
 * written or unwritten extent, and no decrease otherwise.  Determining
 * whether the cluster is shared can be done by searching for a pending
 * reservation on it.
 *
 * Secondarily, it provides a potentially faster method for determining
 * whether the reserved cluster count should be increased when a physical
 * cluster is deallocated as a result of a truncate, punch hole, or
 * collapse range operation.  The necessary information is also present
 * in the extents status tree, but might be more rapidly accessed in
 * the pending reservation set in many cases due to smaller size.
 *
 * The pending cluster reservation set is implemented as a red-black tree
 * with the goal of minimizing per page search time overhead.
 */

/* One node of the pending reservation red-black tree. */
struct pending_reservation {
        struct rb_node rb_node;
        ext4_lblk_t lclu;        /* presumably the logical cluster number - confirm */
};

/* Root of the red-black tree holding the pending cluster reservations. */
struct ext4_pending_tree {
        struct rb_root root;
};

/*
 * Extents status tree interface; the implementations are not part of this
 * header.  ext4_init_es()/ext4_exit_es() are the module-level setup and
 * teardown pair, ext4_es_init_tree() prepares one tree.
 */
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);

/*
 * Insert, cache and remove take a logical start block and a length;
 * insert and cache additionally take the physical block and a status
 * value (EXTENT_STATUS_* bits).
 */
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len, ext4_fsblk_t pblk,
                                  unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len, ext4_fsblk_t pblk,
                                 unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len);
/*
 * The search helpers take a predicate over a single extent_status, such
 * as the ext4_es_is_*() inlines in this header.
 */
extern void ext4_es_find_extent_range(struct inode *inode,
                                      int (*match_fn)(struct extent_status *es),
                                      ext4_lblk_t lblk, ext4_lblk_t end,
                                      struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t *next_lblk,
                                 struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
                             int (*matching_fn)(struct extent_status *es),
                             ext4_lblk_t lblk);

static inline unsigned int ext4_es_status(struct extent_status *es)
{
        return es->es_pblk >> ES_SHIFT;
}

static inline unsigned int ext4_es_type(struct extent_status *es)
{
        return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
}

/* Return 1 if @es has EXTENT_STATUS_WRITTEN set, 0 otherwise. */
static inline int ext4_es_is_written(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_WRITTEN);
}

/* Return 1 if @es has EXTENT_STATUS_UNWRITTEN set, 0 otherwise. */
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN);
}

/* Return 1 if @es has EXTENT_STATUS_DELAYED set, 0 otherwise. */
static inline int ext4_es_is_delayed(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_DELAYED);
}

/* Return 1 if @es has EXTENT_STATUS_HOLE set, 0 otherwise. */
static inline int ext4_es_is_hole(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_HOLE);
}

/* Return 1 if @es is written or unwritten, 0 otherwise. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
        if (ext4_es_is_written(es))
                return 1;

        return ext4_es_is_unwritten(es);
}

/* Return 1 if @es is delayed and not unwritten, 0 otherwise. */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
        if (!ext4_es_is_delayed(es))
                return 0;

        return !ext4_es_is_unwritten(es);
}

static inline void ext4_es_set_referenced(struct extent_status *es)
{
        es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
}

static inline void ext4_es_clear_referenced(struct extent_status *es)
{
        es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
}

/* Return 1 if @es has EXTENT_STATUS_REFERENCED set, 0 otherwise. */
static inline int ext4_es_is_referenced(struct extent_status *es)
{
        return !!(ext4_es_status(es) & EXTENT_STATUS_REFERENCED);
}

/* Return the physical block of @es with the status bits masked off. */
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
        const ext4_fsblk_t pblk_mask = ~ES_MASK;

        return es->es_pblk & pblk_mask;
}

/*
 * Like ext4_es_pblock(), except that the all-ones block number (~ES_MASK)
 * is reported as 0.
 */
static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
{
        ext4_fsblk_t shown = ext4_es_pblock(es);

        if (shown == ~ES_MASK)
                return 0;

        return shown;
}

/*
 * Replace the physical block of @es with @pb.  The status bits already
 * stored in es_pblk are preserved; bits of @pb that fall inside ES_MASK
 * are dropped.
 */
static inline void ext4_es_store_pblock(struct extent_status *es,
                                        ext4_fsblk_t pb)
{
        es->es_pblk = (es->es_pblk & ES_MASK) | (pb & ~ES_MASK);
}

static inline void ext4_es_store_status(struct extent_status *es,
                                        unsigned int status)
{
        es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
                      (es->es_pblk & ~ES_MASK);
}

/*
 * Overwrite es_pblk of @es with @status in the status bits and @pb in the
 * physical block bits.  Nothing of the previous value is kept.
 */
static inline void ext4_es_store_pblock_status(struct extent_status *es,
                                               ext4_fsblk_t pb,
                                               unsigned int status)
{
        const ext4_fsblk_t flag_bits =
                ((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK;
        const ext4_fsblk_t pblk_bits = pb & ~ES_MASK;

        es->es_pblk = flag_bits | pblk_bits;
}

/* Shrinker registration for one superblock (implemented elsewhere). */
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);

/* seq_file show callback; presumably prints the shrinker statistics - confirm. */
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);

/*
 * Pending cluster reservation interface (implemented elsewhere).
 * ext4_init_pending()/ext4_exit_pending() are the module-level setup and
 * teardown pair; the remaining calls operate on one inode.
 */
extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                        bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
                                        ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);

#endif /* _EXT4_EXTENTS_STATUS_H */























































































    1 





































































    1 
























    1 

    3 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

/*
 * init_srcu_struct() comes in two forms.  With CONFIG_DEBUG_LOCK_ALLOC it
 * is a macro that gives every call site its own static lock_class_key and
 * passes that, together with the stringified @ssp expression as the name,
 * to __init_srcu_struct().  Without it, init_srcu_struct() is a plain
 * function.
 *
 * __SRCU_DEP_MAP_INIT() is an initializer fragment: it names the dep_map
 * after @srcu_name under CONFIG_DEBUG_LOCK_ALLOC and expands to nothing
 * otherwise.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Pull in the header of the configured implementation: srcutiny.h for
 * CONFIG_TINY_SRCU, srcutree.h for CONFIG_TREE_SRCU.  CONFIG_SRCU without
 * either of them is a build error; with none of the three set, an empty
 * placeholder struct is declared instead.
 */
#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#elif defined(CONFIG_SRCU)
#error "Unknown SRCU implementation specified to kernel configuration"
#else
/* Dummy definition for things like notifiers.  Actual use gets link error. */
struct srcu_struct { };
#endif

/*
 * Core SRCU API, provided by the selected implementation.
 * __srcu_read_lock() returns the index that has to be handed back to
 * __srcu_read_unlock(); the srcu_read_lock()/srcu_read_unlock() wrappers
 * further down add the lockdep annotations.
 *
 * NOTE(review): the three *_poll*/get_state functions exchange an
 * unsigned long cookie; presumably a cookie obtained from the first two
 * is later tested with poll_state_synchronize_srcu() - confirm in the
 * implementation.
 */
void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * This is the CONFIG_DEBUG_LOCK_ALLOC variant.  It returns 1 whenever
 * debug_lockdep_rcu_enabled() reports false, which prevents false
 * positives during boot and while lockdep is disabled.  Otherwise the
 * result is lock_is_held() applied to the dep_map of @ssp, i.e. nonzero
 * iff lockdep sees us inside an SRCU read-side critical section.
 *
 * Without CONFIG_DEBUG_LOCK_ALLOC a second definition is used that simply
 * assumes we are in a read-side critical section.
 *
 * Note that SRCU is based on its own state machine and does not rely on
 * normal RCU, so this can be called from a CPU which is in the idle loop
 * from an RCU point of view, or offline.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return debug_lockdep_rcu_enabled() ? lock_is_held(&ssp->dep_map) : 1;
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Variant used without CONFIG_DEBUG_LOCK_ALLOC: nothing can be checked,
 * so always claim to be inside an SRCU read-side critical section.
 */
static inline int
srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return 1;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to true.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 *
 * The condition handed to __rcu_dereference_check() is
 * (@c) || srcu_read_lock_held(@ssp), so srcu_read_lock_held() is only
 * evaluated when @c is false.
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Expands to srcu_dereference_check() with the update-side condition
 * fixed to 0, so the check rests on srcu_read_lock_held(@ssp) alone.  If
 * PROVE_RCU is enabled, invoking this outside of an RCU read-side
 * critical section will result in an RCU-lockdep splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Expands to srcu_dereference_check() with the condition fixed to 1.
 * Because that condition is the left operand of ||, the
 * srcu_read_lock_held(@ssp) check is never evaluated.
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section and return the index that must
 * later be passed to the matching srcu_read_unlock().
 *
 * SRCU read-side critical sections may be nested.  It is, however,
 * illegal to call anything that waits on an SRCU grace period for the
 * same srcu_struct, whether directly or indirectly.  One indirect way of
 * waiting on an SRCU grace period is to acquire a mutex that is held
 * elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().
 *
 * srcu_read_lock() and the matching srcu_read_unlock() must occur in the
 * same context: for example, it is illegal to invoke srcu_read_unlock()
 * in an irq handler if the matching srcu_read_lock() was invoked in
 * process context.
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        const int idx = __srcu_read_lock(ssp);

        /* Tell lockdep only after the reader has been registered. */
        rcu_lock_acquire(&ssp->dep_map);

        return idx;
}

/*
 * Variant of srcu_read_lock() for use by tracing: it is itself marked
 * notrace and does not call into lockdep.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        return __srcu_read_lock(ssp);
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn (once) if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        /* Drop the lockdep annotation before unregistering the reader. */
        rcu_lock_release(&(ssp)->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 *
 * Counterpart of srcu_read_lock_notrace(): @idx is the value that call
 * returned.  Unlike srcu_read_unlock(), @idx is not sanity-checked here.
 */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 *
 * The body is intentionally empty; see the comment inside for why.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/*
 * Guard named "srcu" over a struct srcu_struct.  The lock expression
 * stores the index returned by srcu_read_lock() in the guard's extra
 * "idx" member, and the unlock expression passes that same index back to
 * srcu_read_unlock().
 * NOTE(review): presumably this is the scope-based guard from
 * <linux/cleanup.h>, usable as guard(srcu)(ssp) -- confirm.
 */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
                    _T->idx = srcu_read_lock(_T->lock),
                    srcu_read_unlock(_T->lock, _T->idx),
                    int idx)

#endif















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Access to user system call parameters and results
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
 *
 * See asm-generic/syscall.h for descriptions of what we must do here.
 */

#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H

#include <uapi/linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/thread_info.h>        /* for TS_COMPAT */
#include <asm/unistd.h>

typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];

#if defined(CONFIG_X86_32)
#define ia32_sys_call_table sys_call_table
#endif

#if defined(CONFIG_IA32_EMULATION)
extern const sys_call_ptr_t ia32_sys_call_table[];
#endif

#ifdef CONFIG_X86_X32_ABI
extern const sys_call_ptr_t x32_sys_call_table[];
#endif

/*
 * Return the system call number recorded in regs->orig_ax.  @task is not
 * used by this implementation.
 *
 * Only the low 32 bits of orig_ax are meaningful, so we return int.
 * This importantly ignores the high bits on 64-bit, so comparisons
 * sign-extend the low 32 bits.
 */
static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
        return regs->orig_ax;
}

/*
 * Restore regs->ax from the value saved in regs->orig_ax.  @task is not
 * used by this implementation.
 */
static inline void syscall_rollback(struct task_struct *task,
                                    struct pt_regs *regs)
{
        regs->ax = regs->orig_ax;
}

/*
 * Return the error code held in regs->ax if that value is in the error
 * range (as judged by IS_ERR_VALUE()), or 0 otherwise.
 */
static inline long syscall_get_error(struct task_struct *task,
                                     struct pt_regs *regs)
{
        unsigned long ax = regs->ax;

#ifdef CONFIG_IA32_EMULATION
        /*
         * TS_COMPAT is set for 32-bit syscall entries and then
         * remains set until we return to user mode.
         */
        if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) {
                /*
                 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
                 * and will match correctly in comparisons.
                 */
                ax = (long) (int) ax;
        }
#endif
        if (!IS_ERR_VALUE(ax))
                return 0;

        return ax;
}

/*
 * Return the raw contents of regs->ax.  No sign extension or error-range
 * check is applied here, unlike in syscall_get_error().  @task is not used
 * by this implementation.
 */
static inline long syscall_get_return_value(struct task_struct *task,
                                            struct pt_regs *regs)
{
        return regs->ax;
}

static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
{
        regs->ax = (long) error ?: val;
}

#ifdef CONFIG_X86_32

/*
 * Copy the six system call argument registers into @args, in the order
 * bx, cx, dx, si, di, bp.  @args must have room for six entries.  @task is
 * not used by this implementation.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
        *args++ = regs->bx;
        *args++ = regs->cx;
        *args++ = regs->dx;
        *args++ = regs->si;
        *args++ = regs->di;
        *args   = regs->bp;
}

/*
 * Write @n system call argument registers, starting at argument index @i,
 * from @args.  The values are copied with a single memcpy() whose
 * destination is &regs->bx + @i.  @task is not used by this implementation.
 *
 * BUG()s if the requested range does not fit within the six arguments.
 *
 * NOTE(review): the memcpy() relies on the six argument registers being
 * laid out contiguously in struct pt_regs starting at bx -- confirm
 * against the 32-bit pt_regs definition.
 * NOTE(review): the CONFIG_X86_64 variant of this function in this header
 * takes no @i/@n and always writes all six arguments; the two signatures
 * differ, so callers shared between both configurations should be checked.
 */
static inline void syscall_set_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned int i, unsigned int n,
                                         const unsigned long *args)
{
        /*
         * Written as two comparisons so the check cannot be defeated by
         * unsigned wrap-around of i + n.
         */
        BUG_ON(i > 6 || n > 6 - i);
        memcpy(&regs->bx + i, args, n * sizeof(args[0]));
}

/*
 * CONFIG_X86_32 build: the audit architecture is always AUDIT_ARCH_I386,
 * so @task is not consulted.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        return AUDIT_ARCH_I386;
}

#else         /* CONFIG_X86_64 */

/*
 * Copy the six system call argument registers into @args, which must have
 * room for six entries.
 *
 * When CONFIG_IA32_EMULATION is enabled and the task has TS_COMPAT set in
 * its thread_info status, the arguments are read from bx, cx, dx, si, di,
 * bp.  Otherwise they are read from di, si, dx, r10, r8, r9.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
        if (task->thread_info.status & TS_COMPAT) {
                *args++ = regs->bx;
                *args++ = regs->cx;
                *args++ = regs->dx;
                *args++ = regs->si;
                *args++ = regs->di;
                *args   = regs->bp;
        } else
# endif
        /* Without CONFIG_IA32_EMULATION this block runs unconditionally. */
        {
                *args++ = regs->di;
                *args++ = regs->si;
                *args++ = regs->dx;
                *args++ = regs->r10;
                *args++ = regs->r8;
                *args   = regs->r9;
        }
}

/*
 * Load the six system call argument registers from @args, which must
 * provide six entries.
 *
 * When CONFIG_IA32_EMULATION is enabled and the task has TS_COMPAT set in
 * its thread_info status, the values are written to bx, cx, dx, si, di,
 * bp.  Otherwise they are written to di, si, dx, r10, r8, r9.
 */
static inline void syscall_set_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
        if (task->thread_info.status & TS_COMPAT) {
                regs->bx = *args++;
                regs->cx = *args++;
                regs->dx = *args++;
                regs->si = *args++;
                regs->di = *args++;
                regs->bp = *args;
        } else
# endif
        /* Without CONFIG_IA32_EMULATION this block runs unconditionally. */
        {
                regs->di = *args++;
                regs->si = *args++;
                regs->dx = *args++;
                regs->r10 = *args++;
                regs->r8 = *args++;
                regs->r9 = *args;
        }
}

/*
 * Report the audit architecture for @task: AUDIT_ARCH_I386 when
 * CONFIG_IA32_EMULATION is enabled and the task has TS_COMPAT set,
 * AUDIT_ARCH_X86_64 in every other case.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
        if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
            (task->thread_info.status & TS_COMPAT))
                return AUDIT_ARCH_I386;

        return AUDIT_ARCH_X86_64;
}

void do_syscall_64(unsigned long nr, struct pt_regs *regs);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);

#endif        /* CONFIG_X86_32 */

#endif        /* _ASM_X86_SYSCALL_H */


























































































    3 




    9 

   10 
































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
/*
 * List of all initialized percpu counters, protected by
 * percpu_counters_lock.  Entries are added in __percpu_counter_init(),
 * removed in percpu_counter_destroy() and walked by
 * percpu_counter_cpu_dead().
 */
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * debugobjects ->fixup_free callback: if the counter at @addr is still in
 * the active state, destroy it and drop its debug object, returning true.
 * Any other state is left alone and false is returned.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *counter = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        percpu_counter_destroy(counter);
        debug_object_free(counter, &percpu_counter_debug_descr);
        return true;
}

/*
 * debugobjects descriptor for percpu counters; only the free fixup is
 * provided.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name                = "percpu_counter",
        .fixup_free        = percpu_counter_fixup_free,
};

/* Register @fbc with debugobjects and mark it active. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/* Mark @fbc inactive in debugobjects and drop its tracking object. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* No-op stubs used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is disabled. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Set the counter to @amount: under fbc->lock, zero the per-cpu delta of
 * every possible CPU and store @amount in the central count.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irq_flags;
        int cpu;

        raw_spin_lock_irqsave(&fbc->lock, irq_flags);
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(fbc->counters, cpu) = 0;
        fbc->count = amount;
        raw_spin_unlock_irqrestore(&fbc->lock, irq_flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add @amount to the counter, touching the central count only when this
 * CPU's local delta (including @amount) reaches @batch in magnitude.
 *
 * This function is both preempt and irq safe. The former is due to explicit
 * preemption disable. The latter is guaranteed by the fact that the slow path
 * is explicitly protected by an irq-safe spinlock whereas the fast path uses
 * this_cpu_add which is irq-safe by definition. Hence there is no need to muck
 * with irq state before calling this one.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 count;

        preempt_disable();
        /* Value this CPU's delta would have after adding @amount. */
        count = __this_cpu_read(*fbc->counters) + amount;
        if (abs(count) >= batch) {
                unsigned long flags;
                raw_spin_lock_irqsave(&fbc->lock, flags);
                /* Fold the whole local delta plus @amount into the central count. */
                fbc->count += count;
                /* count - amount is the local delta that was read above. */
                __this_cpu_sub(*fbc->counters, count - amount);
                raw_spin_unlock_irqrestore(&fbc->lock, flags);
        } else {
                this_cpu_add(*fbc->counters, amount);
        }
        preempt_enable();
}
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * For percpu_counter with a big batch, the deviation of its count could
 * be big, and there is a requirement to reduce the deviation, like when the
 * counter's batch could be runtime decreased to get a better accuracy,
 * which can be achieved by running this sync function on each CPU.
 *
 * Under fbc->lock, the calling CPU's local delta is added to the central
 * count and then subtracted from the per-cpu counter.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long flags;
        s64 count;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        count = __this_cpu_read(*fbc->counters);
        fbc->count += count;
        __this_cpu_sub(*fbc->counters, count);
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive()
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_online_cpu(cpu) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialize @fbc with an initial value of @amount.
 *
 * The lock is initialized and given lockdep class @key, the per-cpu
 * counters are allocated with @gfp, and the counter is registered with
 * debugobjects.  With CONFIG_HOTPLUG_CPU it is also added to the global
 * percpu_counters list.
 *
 * Returns 0 on success or -ENOMEM if the per-cpu allocation fails.
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
                          struct lock_class_key *key)
{
        /* Only used when CONFIG_HOTPLUG_CPU is enabled. */
        unsigned long flags __maybe_unused;

        raw_spin_lock_init(&fbc->lock);
        lockdep_set_class(&fbc->lock, key);
        fbc->count = amount;
        fbc->counters = alloc_percpu_gfp(s32, gfp);
        if (!fbc->counters)
                return -ENOMEM;

        debug_percpu_counter_activate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
        INIT_LIST_HEAD(&fbc->list);
        spin_lock_irqsave(&percpu_counters_lock, flags);
        list_add(&fbc->list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init);

/*
 * Tear down @fbc: unregister it from debugobjects, remove it from the
 * global percpu_counters list (CONFIG_HOTPLUG_CPU only) and free its
 * per-cpu storage.
 *
 * A counter whose ->counters pointer is NULL is ignored; since the pointer
 * is reset to NULL at the end, calling this twice on the same counter is
 * harmless.
 */
void percpu_counter_destroy(struct percpu_counter *fbc)
{
        /* Only used when CONFIG_HOTPLUG_CPU is enabled. */
        unsigned long flags __maybe_unused;

        if (!fbc->counters)
                return;

        debug_percpu_counter_deactivate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        list_del(&fbc->list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        free_percpu(fbc->counters);
        fbc->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);

/*
 * Global batch size, initially 32.  It is recomputed by
 * compute_batch_value() from the number of online CPUs.
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * Recompute percpu_counter_batch as twice the number of online CPUs, but
 * never less than 32.  @cpu is unused.  Always returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
        int online = num_online_cpus();

        percpu_counter_batch = max(32, online * 2);

        return 0;
}

/*
 * CPU hotplug callback registered in percpu_counter_startup() for
 * CPUHP_PERCPU_CNT_DEAD.
 *
 * With CONFIG_HOTPLUG_CPU, recompute the batch size and then, for every
 * counter on the percpu_counters list, fold @cpu's per-cpu delta into the
 * central count and zero that delta.  Without CONFIG_HOTPLUG_CPU this
 * does nothing.  Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *fbc;

        compute_batch_value(cpu);

        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(fbc, &percpu_counters, list) {
                s32 *pcount;

                /* Interrupts are already off via spin_lock_irq() above. */
                raw_spin_lock(&fbc->lock);
                pcount = per_cpu_ptr(fbc->counters, cpu);
                fbc->count += *pcount;
                *pcount = 0;
                raw_spin_unlock(&fbc->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less.
 *
 * The cheap approximate read is tried first; the precise sum is computed
 * only when the approximate value lies within batch * num_online_cpus()
 * of @rhs.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64 approx = percpu_counter_read(fbc);
        s64 exact;

        /* Far enough from @rhs that the rough count decides the result. */
        if (abs(approx - rhs) > (batch * num_online_cpus()))
                return approx > rhs ? 1 : -1;

        /* Too close to call: fall back to the precise count. */
        exact = percpu_counter_sum(fbc);
        if (exact == rhs)
                return 0;

        return exact > rhs ? 1 : -1;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Boot-time setup: register the CPU hotplug callbacks used by percpu
 * counters.  compute_batch_value() is registered under
 * CPUHP_AP_ONLINE_DYN and percpu_counter_cpu_dead() under
 * CPUHP_PERCPU_CNT_DEAD.
 *
 * A registration failure only triggers a WARN; the function always
 * returns 0.
 */
static int __init percpu_counter_startup(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
                                compute_batch_value, NULL);
        WARN_ON(ret < 0);
        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead", NULL,
                                        percpu_counter_cpu_dead);
        WARN_ON(ret < 0);
        return 0;
}
module_init(percpu_counter_startup);











































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). User should soft dirty the page in the end callback to make
 * sure that anyone relying on soft dirtiness catches pages that might be
 * written through non CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * migrate_pgmap_owner field matches the driver's device private pgmap owner.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
};

/*
 * NOTE(review): presumably a bit in mmu_notifier_range.flags indicating
 * that the invalidation callback is allowed to sleep -- confirm against
 * mmu_notifier_range_blockable().
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

/*
 * Callbacks a secondary-MMU user provides for a struct mmu_notifier
 * subscription.  Each member is documented individually below.
 */
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you've to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you've also to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, so leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM is
         * test-and-clearing the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * change_pte is called in cases where the pte mapping to a page is
         * changed: for example, when ksm remaps a pte to point to a new
         * shared page.
         */
        void (*change_pte)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long address,
                           pte_t pte);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start/end for the whole duration of the
         * invalidate_range_start/end critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If the range is not blockable (mmu_notifier_range_blockable(range)
         * is false) then the callback cannot sleep and has to return with
         * -EAGAIN if sleeping would be required.
         * 0 should be returned otherwise. Please note that notifiers that can
         * fail invalidate_range_start are not allowed to implement
         * invalidate_range_end, as there is no mechanism for informing the
         * notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * invalidate_range() is either called between
         * invalidate_range_start() and invalidate_range_end() when the
         * VM has to free pages that were unmapped, but before the
         * pages are actually freed, or outside of _start()/_end() when
         * a (remote) TLB flush is necessary.
         *
         * If invalidate_range() is used to manage a non-CPU TLB with
         * shared page-tables, it is not necessary to implement the
         * invalidate_range_start()/end() notifiers, as
         * invalidate_range() already catches the points in time when an
         * external TLB range needs to be flushed. For more in depth
         * discussion on this see Documentation/vm/mmu_notifier.rst
         *
         * Note that this function might be called with just a sub-range
         * of what was passed to invalidate_range_start()/end(), if
         * called between those functions.
         */
        void (*invalidate_range)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * One subscription to the notifier callbacks of an mm.
 *
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        struct hlist_node hlist;        /* presumably the link in the mm's notifier chain -- confirm */
        const struct mmu_notifier_ops *ops;        /* callbacks for this subscription */
        struct mm_struct *mm;
        struct rcu_head rcu;
        unsigned int users;        /* presumably the get/put reference count -- confirm */
};

/**
 * struct mmu_interval_notifier_ops - callbacks for an interval notifier
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 */
struct mmu_interval_notifier_ops {
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/*
 * One interval subscription. The invalidate_seq field is written by
 * mmu_interval_set_seq() and compared against a caller-held sequence by
 * mmu_interval_read_retry() / mmu_interval_check_retry().
 */
struct mmu_interval_notifier {
        /* NOTE(review): presumably keys the subscription by its VA range -- confirm. */
        struct interval_tree_node interval_tree;
        /* Callback table; see struct mmu_interval_notifier_ops. */
        const struct mmu_interval_notifier_ops *ops;
        struct mm_struct *mm;
        /* NOTE(review): looks like linkage for deferred insert/remove -- confirm. */
        struct hlist_node deferred_item;
        /* Last sequence stored by mmu_interval_set_seq(). */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
/*
 * Lockdep map that the invalidate_range_start wrappers in this header
 * acquire and release around the notifier call.
 */
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Describes one invalidation; filled in by mmu_notifier_range_init() and
 * handed to the invalidate_range_start/end wrappers.
 */
struct mmu_notifier_range {
        /* Stored verbatim from the mmu_notifier_range_init() argument. */
        struct vm_area_struct *vma;
        /* Address space whose subscriptions the wrappers consult. */
        struct mm_struct *mm;
        /* NOTE(review): presumably the half-open range [start, end) -- confirm. */
        unsigned long start;
        unsigned long end;
        /* Carries MMU_NOTIFIER_RANGE_BLOCKABLE, set or cleared by the range_start wrappers. */
        unsigned flags;
        enum mmu_notifier_event event;
        /* In this header, only written by mmu_notifier_range_init_migrate(). */
        void *migrate_pgmap_owner;
};

static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->notifier_subscriptions);
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);

/*
 * Convenience wrapper around mmu_notifier_get_locked(): takes the mmap write
 * lock of @mm, performs the lookup/creation, drops the lock again and hands
 * back whatever the locked variant returned.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
/*
 * Out-of-line entry points; their implementations are not part of this
 * header. mmu_notifier_put() is the counterpart of mmu_notifier_get().
 */
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

/* Registration of caller-owned struct mmu_notifier objects. */
extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

/*
 * Interval notifier API. mmu_interval_read_begin() returns the sequence
 * value that is later passed to mmu_interval_read_retry().
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops under the same lock that is used to call
 * mmu_interval_read_retry(). It updates the sequence number for later use by
 * mmu_interval_read_retry(). The provided cur_seq will always be odd.
 *
 * If the caller does not call mmu_interval_read_begin() or
 * mmu_interval_read_retry() then this call is not required.
 */
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
                     unsigned long cur_seq)
{
        /* WRITE_ONCE pairs with the READ_ONCE in mmu_interval_check_retry(). */
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Return: true if an invalidation collided with this critical section, and
 * the caller should retry.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        /* Plain load: the user provided lock serializes against the writer. */
        return interval_sub->invalidate_seq != seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry().  A return of true indicates an invalidation
 * has collided with this critical region and a future
 * mmu_interval_read_retry() will return true.
 *
 * False is not reliable and only suggests a collision may not have
 * occurred. It can be called many times and does not have to hold the user
 * provided lock.
 *
 * This call can be used as part of loops and other expensive operations to
 * expedite a retry.
 *
 * Return: true if a collision was observed, false if none was seen yet.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        return READ_ONCE(interval_sub->invalidate_seq) != seq;
}

/*
 * Out-of-line slow paths. The inline wrappers below call these only after
 * mm_has_notifiers() reported that the mm has subscriptions.
 */
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
                                      unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
                                  bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * Tell whether the invalidation described by @range is allowed to block,
 * i.e. whether MMU_NOTIFIER_RANGE_BLOCKABLE is set in range->flags.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
}

/*
 * Forward to __mmu_notifier_release() when @mm has notifier subscriptions;
 * otherwise do nothing.
 */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * Ask the notifiers of @mm to clear-and-flush the young state for
 * [@start, @end). Returns the result of the out-of-line helper, or 0 when
 * @mm has no subscriptions.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * Ask the notifiers of @mm to clear the young state for [@start, @end).
 * Returns the result of __mmu_notifier_clear_young(), or 0 when @mm has no
 * subscriptions.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * Query the notifiers of @mm about the young state of @address.
 * Returns the result of __mmu_notifier_test_young(), or 0 when @mm has no
 * subscriptions.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/*
 * Tell the notifiers of @mm that the pte at @address is changing to @pte.
 * Nothing happens when @mm has no subscriptions.
 */
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_change_pte(mm, address, pte);
}

/*
 * Blocking flavour of range_start. might_sleep() is asserted up front. When
 * range->mm has subscriptions the range is marked
 * MMU_NOTIFIER_RANGE_BLOCKABLE before the notifiers run; the return value of
 * __mmu_notifier_invalidate_range_start() is not propagated. The
 * lock_map_acquire()/lock_map_release() pair brackets the call for lockdep.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        might_sleep();

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * Non-blocking flavour of range_start. MMU_NOTIFIER_RANGE_BLOCKABLE is
 * cleared from range->flags before the notifiers run, and the result of
 * __mmu_notifier_invalidate_range_start() is returned to the caller.
 * Returns 0 when range->mm has no subscriptions.
 */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        int ret = 0;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                ret = __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
        return ret;
}

/*
 * Conclude an invalidation started with one of the range_start wrappers.
 * A blockable range may sleep here, which is asserted via might_sleep().
 * The out-of-line helper is invoked with only_end == false.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range, false);
}

/*
 * Variant of mmu_notifier_invalidate_range_end() that passes
 * only_end == true to the out-of-line helper. No might_sleep() check is
 * made here.
 */
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range, true);
}

/*
 * Forward an invalidation of [@start, @end) to the notifiers of @mm, if it
 * has any.
 */
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_invalidate_range(mm, start, end);
}

/*
 * Start @mm out with no notifier subscriptions, so that mm_has_notifiers()
 * reports false until something is attached.
 */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/*
 * Tear down the subscription state of @mm through
 * __mmu_notifier_subscriptions_destroy(), but only if any was attached.
 */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * Fill in @range from the individual arguments. Every field except
 * migrate_pgmap_owner is assigned; that one is left untouched here and is
 * set by mmu_notifier_range_init_migrate().
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned flags,
                                           struct vm_area_struct *vma,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        /* What is being invalidated ... */
        range->vma = vma;
        range->mm = mm;
        range->start = start;
        range->end = end;

        /* ... and how / why. */
        range->flags = flags;
        range->event = event;
}

/*
 * Initialise @range for a migration: same as mmu_notifier_range_init() with
 * the event fixed to MMU_NOTIFY_MIGRATE, and additionally records @pgmap in
 * range->migrate_pgmap_owner.
 */
static inline void mmu_notifier_range_init_migrate(
                        struct mmu_notifier_range *range, unsigned int flags,
                        struct vm_area_struct *vma, struct mm_struct *mm,
                        unsigned long start, unsigned long end, void *pgmap)
{
        mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
                                start, end);
        range->migrate_pgmap_owner = pgmap;
}

/*
 * Clear-and-flush the young bit of one pte in the primary MMU, then ask the
 * notifiers to do the same for the PAGE_SIZE range starting at __address.
 * Evaluates to the OR of both results. __vma and __address are captured in
 * locals, so each is evaluated once.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart of ptep_clear_flush_young_notify(): the notifier
 * range covers PMD_SIZE bytes starting at __address. Evaluates to the OR of
 * the primary and notifier results.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Test-and-clear the young bit of one pte in the primary MMU (no flush),
 * then call mmu_notifier_clear_young() for the PAGE_SIZE range starting at
 * __address. Evaluates to the OR of both results.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart of ptep_clear_young_notify(): the notifier range
 * covers PMD_SIZE bytes starting at __address. Evaluates to the OR of the
 * primary and notifier results.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Clear and flush one pte in the primary MMU, then invalidate the
 * page-aligned PAGE_SIZE range containing __address in the secondary MMUs.
 * Evaluates to the old pte value returned by ptep_clear_flush().
 *
 * __vma and __address are captured in locals first, so each argument is
 * evaluated exactly once and an argument expression cannot re-associate
 * with the "& PAGE_MASK" below. This matches the *_young_notify() macros
 * and the !CONFIG_MMU_NOTIFIER alias, which also evaluate them once.
 */
#define        ptep_clear_flush_notify(__vma, __address, __ptep)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__address);                                \
        unsigned long ___addr = ___address & PAGE_MASK;                        \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pte_t ___pte;                                                        \
                                                                        \
        ___pte = ptep_clear_flush(___vma, ___address, __ptep);                \
        mmu_notifier_invalidate_range(___mm, ___addr,                        \
                                        ___addr + PAGE_SIZE);                \
                                                                        \
        ___pte;                                                                \
})

/*
 * Clear and flush one huge pmd in the primary MMU, then invalidate the
 * HPAGE_PMD_SIZE range (aligned with HPAGE_PMD_MASK) containing __haddr in
 * the secondary MMUs. Evaluates to the old pmd value returned by
 * pmdp_huge_clear_flush().
 *
 * __vma and __haddr are captured in locals first, so each argument is
 * evaluated exactly once and an argument expression cannot re-associate
 * with the "& HPAGE_PMD_MASK" below.
 */
#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__haddr);                                \
        unsigned long ___haddr = ___address & HPAGE_PMD_MASK;                \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pmd_t ___pmd;                                                        \
                                                                        \
        ___pmd = pmdp_huge_clear_flush(___vma, ___address, __pmd);        \
        mmu_notifier_invalidate_range(___mm, ___haddr,                        \
                                      ___haddr + HPAGE_PMD_SIZE);        \
                                                                        \
        ___pmd;                                                                \
})

/*
 * Clear and flush one huge pud in the primary MMU, then invalidate the
 * HPAGE_PUD_SIZE range (aligned with HPAGE_PUD_MASK) containing __haddr in
 * the secondary MMUs. Evaluates to the old pud value returned by
 * pudp_huge_clear_flush().
 *
 * __vma and __haddr are captured in locals first, so each argument is
 * evaluated exactly once and an argument expression cannot re-associate
 * with the "& HPAGE_PUD_MASK" below.
 */
#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__haddr);                                \
        unsigned long ___haddr = ___address & HPAGE_PUD_MASK;                \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pud_t ___pud;                                                        \
                                                                        \
        ___pud = pudp_huge_clear_flush(___vma, ___address, __pud);        \
        mmu_notifier_invalidate_range(___mm, ___haddr,                        \
                                      ___haddr + HPAGE_PUD_SIZE);        \
                                                                        \
        ___pud;                                                                \
})

/*
 * set_pte_at_notify() sets the pte _after_ running the notifier.
 * This is safe to start by updating the secondary MMUs, because the primary MMU
 * pte invalidate must have already happened with a ptep_clear_flush() before
 * set_pte_at_notify() has been invoked.  Updating the secondary MMUs first is
 * required when we change both the protection of the mapping from read-only to
 * read-write and the pfn (like during copy on write page faults). Otherwise the
 * old page would remain mapped readonly in the secondary MMUs after the new
 * page is already writable by some CPU through the primary MMU.
 *
 * __mm, __address and __pte are captured in locals, so each is evaluated
 * once; __ptep appears once in the expansion.
 */
#define set_pte_at_notify(__mm, __address, __ptep, __pte)                \
({                                                                        \
        struct mm_struct *___mm = __mm;                                        \
        unsigned long ___address = __address;                                \
        pte_t ___pte = __pte;                                                \
                                                                        \
        mmu_notifier_change_pte(___mm, ___address, ___pte);                \
        set_pte_at(___mm, ___address, __ptep, ___pte);                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor for !CONFIG_MMU_NOTIFIER builds: only the bounds
 * are kept.
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/*
 * Backend of the mmu_notifier_range_init*() macros when notifiers are
 * compiled out: records only @start and @end in @range.
 */
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                                            unsigned long start,
                                            unsigned long end)
{
        range->start = start;
        range->end = end;
}

/*
 * With notifiers compiled out the event, flags, vma, mm (and pgmap)
 * arguments are dropped by the preprocessor and therefore never evaluated.
 */
#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
                                        pgmap) \
        _mmu_notifier_range_init(range, start, end)

/* Without notifiers every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

/* Without notifiers no mm ever has subscriptions. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

/*
 * Stub for !CONFIG_MMU_NOTIFIER: there are no secondary MMUs, so nothing was
 * young. Always returns 0.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/*
 * Stub for !CONFIG_MMU_NOTIFIER, mirroring the inline wrapper of the same
 * name in the CONFIG_MMU_NOTIFIER branch so that direct callers build in
 * both configurations. Always returns 0.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        return 0;
}

/* Stub for !CONFIG_MMU_NOTIFIER: always returns 0. */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        return 0;
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
{
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: always returns 0. */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/* Without notifiers this check is a constant false; the argument is unused. */
#define mmu_notifier_range_update_to_read_only(r) false

/*
 * Without notifiers the *_notify() helpers are plain aliases of the
 * corresponding primary-MMU operations.
 */
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define        ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at

/* No-op stub for !CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */

































































































































































































































































































































































    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/cleanup.h>
#include <linux/list.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
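 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 * - bits 16-19 are the hardirq count (max # of nested hardirqs: 16)
 * - bits 20-23 are the NMI count (max # of nested NMIs: 16)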
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:        0x000000ff
 *         SOFTIRQ_MASK:        0x0000ff00
 *         HARDIRQ_MASK:        0x000f0000
 *             NMI_MASK:        0x00f00000
 * PREEMPT_NEED_RESCHED:        0x80000000
 */
/* Builds a mask of the x least significant bits. */
#define __IRQ_MASK(x)        ((1UL << (x)) - 1)

/* Preemption depth: lowest field of preempt_count. */
#define PREEMPT_BITS        8
#define PREEMPT_SHIFT        0
#define PREEMPT_MASK        (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define PREEMPT_OFFSET        (1UL << PREEMPT_SHIFT)

/* Softirq count: stacked directly above the preemption depth. */
#define SOFTIRQ_BITS        8
#define SOFTIRQ_SHIFT        (PREEMPT_SHIFT + PREEMPT_BITS)
#define SOFTIRQ_MASK        (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define SOFTIRQ_OFFSET        (1UL << SOFTIRQ_SHIFT)

/* Hardirq count: stacked above the softirq count. */
#define HARDIRQ_BITS        4
#define HARDIRQ_SHIFT        (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define HARDIRQ_MASK        (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define HARDIRQ_OFFSET        (1UL << HARDIRQ_SHIFT)

/* NMI count: stacked above the hardirq count. */
#define NMI_BITS        4
#define NMI_SHIFT        (HARDIRQ_SHIFT + HARDIRQ_BITS)
#define NMI_MASK        (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
#define NMI_OFFSET        (1UL << NMI_SHIFT)

/* Twice SOFTIRQ_OFFSET; in_serving_softirq() tests only the SOFTIRQ_OFFSET bit. */
#define SOFTIRQ_DISABLE_OFFSET        (2 * SOFTIRQ_OFFSET)

/*
 * preempt_count value for "one preempt_disable() level on top of the enabled
 * state". PREEMPT_DISABLE_OFFSET is defined further down in this file;
 * PREEMPT_ENABLED is not defined here (presumably in <asm/preempt.h> --
 * confirm).
 */
#define PREEMPT_DISABLED        (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT        PREEMPT_OFFSET

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT        (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

/*
 * Extract the hardirq field, the softirq field, or the combined
 * hardirq + softirq + NMI fields from preempt_count(). The values are
 * masked but not shifted down.
 */
#define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
#define irq_count()        (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
                                 | NMI_MASK))

/*
 * Are we doing bottom half or hardware interrupt processing?
 *
 * in_irq()       - We're in (hard) IRQ context
 * in_softirq()   - We have BH disabled, or are processing softirqs
 * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 * in_serving_softirq() - We're in softirq context
 * in_nmi()       - We're in NMI context
 * in_task()      - We're in task context
 *
 * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
 *       should not be used in new code.
 *
 * Note: in_serving_softirq() and in_task() test only the SOFTIRQ_OFFSET bit
 *       rather than the whole SOFTIRQ_MASK.
 */
#define in_irq()                (hardirq_count())
#define in_softirq()                (softirq_count())
#define in_interrupt()                (irq_count())
#define in_serving_softirq()        (softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi()                (preempt_count() & NMI_MASK)
#define in_task()                (!(preempt_count() & \
                                   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

/*
 * The preempt_count offset after preempt_disable();
 * PREEMPT_OFFSET when preemption is counted, 0 when it is not.
 */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET        PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET        0
#endif

/*
 * The preempt_count offset after spin_lock()
 */
#define PREEMPT_LOCK_OFFSET        PREEMPT_DISABLE_OFFSET

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

/*
 * Are we running in atomic context?  WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels.  Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 *
 * True whenever any field of preempt_count() is non-zero.
 */
#define in_atomic()        (preempt_count() != 0)

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 *
 * True when preempt_count() differs from exactly one preempt_disable() level.
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

/*
 * With preempt debugging or preempt-toggle tracing enabled the counter is
 * modified through out-of-line functions; otherwise the operations map
 * straight onto the __preempt_count_*() primitives.
 */
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
#define preempt_count_dec_and_test() \
        ({ preempt_count_sub(1); should_resched(0); })
#else
#define preempt_count_add(val)        __preempt_count_add(val)
#define preempt_count_sub(val)        __preempt_count_sub(val)
#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
#endif

/* Single-step variants that always use the raw primitives. */
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

/* Single-step variants that go through preempt_count_add()/sub(). */
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)

#ifdef CONFIG_PREEMPT_COUNT

/*
 * Increment the preempt count, then issue a compiler barrier so the
 * protected region cannot be moved before the increment.
 */
#define preempt_disable() \
do { \
        preempt_count_inc(); \
        barrier(); \
} while (0)

/*
 * Compiler barrier, then decrement the preempt count; no reschedule check
 * is made.
 */
#define sched_preempt_enable_no_resched() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

/* True when the preempt count is zero and interrupts are not disabled. */
#define preemptible()        (preempt_count() == 0 && !irqs_disabled())

/*
 * preempt_enable() and friends. With CONFIG_PREEMPTION the decrement is
 * combined with a reschedule check and may call into __preempt_schedule*();
 * without it the count is simply decremented.
 */
#ifdef CONFIG_PREEMPTION
#define preempt_enable() \
do { \
        barrier(); \
        if (unlikely(preempt_count_dec_and_test())) \
                __preempt_schedule(); \
} while (0)

/* Same as preempt_enable() but built on the raw (untraced) primitives. */
#define preempt_enable_notrace() \
do { \
        barrier(); \
        if (unlikely(__preempt_count_dec_and_test())) \
                __preempt_schedule_notrace(); \
} while (0)

/* Reschedule check only; the preempt count is not modified. */
#define preempt_check_resched() \
do { \
        if (should_resched(0)) \
                __preempt_schedule(); \
} while (0)

#else /* !CONFIG_PREEMPTION */
#define preempt_enable() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

#define preempt_check_resched() do { } while (0)
#endif /* CONFIG_PREEMPTION */

/* preempt_disable() built on the raw __preempt_count_inc() primitive. */
#define preempt_disable_notrace() \
do { \
        __preempt_count_inc(); \
        barrier(); \
} while (0)

/* Raw decrement without any reschedule check. */
#define preempt_enable_no_resched_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

#else /* !CONFIG_PREEMPT_COUNT */

/*
 * Even if we don't have any preemption, we need preempt disable/enable
 * to be barriers, so that we don't have things like get_user/put_user
 * that can cause faults and scheduling migrate into our preempt-protected
 * region.
 *
 * In this configuration preemptible() is the constant 0.
 */
#define preempt_disable()                        barrier()
#define sched_preempt_enable_no_resched()        barrier()
#define preempt_enable_no_resched()                barrier()
#define preempt_enable()                        barrier()
#define preempt_check_resched()                        do { } while (0)

#define preempt_disable_notrace()                barrier()
#define preempt_enable_no_resched_notrace()        barrier()
#define preempt_enable_notrace()                barrier()
#define preemptible()                                0

#endif /* CONFIG_PREEMPT_COUNT */

#ifdef MODULE
/*
 * Modules have no business playing preemption tricks.
 *
 * The no-resched and check-resched helpers defined above are removed again
 * for module builds, so they are not available to module code.
 */
#undef sched_preempt_enable_no_resched
#undef preempt_enable_no_resched
#undef preempt_enable_no_resched_notrace
#undef preempt_check_resched
#endif

/* Unconditionally call set_preempt_need_resched(). */
#define preempt_set_need_resched() \
do { \
        set_preempt_need_resched(); \
} while (0)
/* Call set_preempt_need_resched() only when tif_need_resched() is true. */
#define preempt_fold_need_resched() \
do { \
        if (tif_need_resched()) \
                set_preempt_need_resched(); \
} while (0)

#ifdef CONFIG_PREEMPT_NOTIFIERS

struct preempt_notifier;

/**
 * struct preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and out are called under different
 * contexts.  sched_out is called with rq lock held and irq disabled
 * while sched_in is called without rq lock and irq enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
        void (*sched_in)(struct preempt_notifier *notifier, int cpu);
        void (*sched_out)(struct preempt_notifier *notifier,
                          struct task_struct *next);
};

/**
 * struct preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 * Initialise with preempt_notifier_init() before registering.
 */
struct preempt_notifier {
        struct hlist_node link;
        struct preempt_ops *ops;
};

/*
 * Out-of-line notifier API; implementations are not part of this header.
 * NOTE(review): preempt_notifier_inc()/dec() presumably bracket the period
 * during which notifiers may be registered -- confirm against the scheduler.
 */
void preempt_notifier_inc(void);
void preempt_notifier_dec(void);
void preempt_notifier_register(struct preempt_notifier *notifier);
void preempt_notifier_unregister(struct preempt_notifier *notifier);

/**
 * preempt_notifier_init - prepare a preempt notifier for use
 * @notifier: notifier to initialise
 * @ops: callback table to attach to @notifier
 *
 * Records @ops in the notifier and initialises its hlist node.  This
 * helper only fills in the structure; it does not call
 * preempt_notifier_register().
 */
static inline void preempt_notifier_init(struct preempt_notifier *notifier,
                                     struct preempt_ops *ops)
{
        notifier->ops = ops;
        INIT_HLIST_NODE(&notifier->link);
}

#endif

/**
 * migrate_disable - Prevent migration of the current task
 *
 * Maps to preempt_disable() which also disables preemption. Use
 * migrate_disable() to annotate that the intent is to prevent migration,
 * but not necessarily preemption.
 *
 * Can be invoked nested like preempt_disable() and needs the corresponding
 * number of migrate_enable() invocations.
 */
static __always_inline void migrate_disable(void)
{
        /* This variant has no separate migration control of its own. */
        preempt_disable();
}

/**
 * migrate_enable - Allow migration of the current task
 *
 * Counterpart to migrate_disable().
 *
 * As migrate_disable() can be invoked nested, only the outermost invocation
 * reenables migration.
 *
 * Currently mapped to preempt_enable().
 */
static __always_inline void migrate_enable(void)
{
        /* Undo the preempt_disable() issued by migrate_disable(). */
        preempt_enable();
}

/*
 * Guard definitions: each line names a guard and pairs a disable primitive
 * with its matching enable primitive.
 * NOTE(review): DEFINE_LOCK_GUARD_0() is defined outside this file;
 * presumably it creates scope-based guard types -- confirm against
 * linux/cleanup.h.
 */
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

#endif /* __LINUX_PREEMPT_H */






































    2 




    2 


    2 





    2 




    2 

























    1 





    1 



    1 



























    1 
    1 

    1 


    1 




    1 


    1 
    1 



















    1 



















    2 






















































































































    2 





    2 
    2 

    2 
















































































    1 







    1 



    1 

    1 


    1 




























    1 
    1 


    1 


    1 
















    1 





    1 
    1 


    1 











    1 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

/**
 * idr_alloc_u32() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @nextid: On entry, the lowest acceptable ID; on success, the ID allocated.
 * @max: The maximum ID to allocate (inclusive).
 * @gfp: Memory allocation flags.
 *
 * Finds an unused ID between *@nextid and @max and associates @ptr with it.
 * Unlike the @end parameter of idr_alloc(), @max is inclusive.  The new ID
 * is written to *@nextid before @ptr is inserted into the IDR, so if
 * @nextid points into the object that @ptr refers to, a concurrent lookup
 * will not find an object with an uninitialised ID.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.  If an error occurred,
 * @nextid is unchanged.
 */
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
                        unsigned long max, gfp_t gfp)
{
        const unsigned int offset = idr->idr_base;
        unsigned int first = *nextid;
        struct radix_tree_iter iter;
        void __rcu **slot;

        /* Warn once and set the marker if the root lacks the IDR flag. */
        if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
                idr->idr_rt.xa_flags |= IDR_RT_MARKER;

        /* The entire requested range lies below the IDR's base. */
        if (max < offset)
                return -ENOSPC;

        /* Tree indices are relative to the base; clamp requests below it. */
        if (first < offset)
                first = 0;
        else
                first -= offset;

        radix_tree_iter_init(&iter, first);
        slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - offset);
        if (IS_ERR(slot))
                return PTR_ERR(slot);

        /*
         * Report the ID before publishing the pointer; there is a memory
         * barrier inside radix_tree_iter_replace().
         */
        *nextid = iter.index + offset;
        radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
        radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

        return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);

/**
 * idr_alloc() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range [@start, @end).  If @end is <= 0, it
 * is treated as one larger than %INT_MAX.  This allows callers to use
 * @start + N as @end as long as N is within integer range.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -EINVAL (with a one-time warning) if
 * @start is negative, -ENOMEM if memory allocation failed, or -ENOSPC if
 * no free IDs could be found.
 */
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        int last, err;
        u32 id;

        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;

        /* Turn the exclusive @end into the inclusive bound idr_alloc_u32() wants. */
        if (end > 0)
                last = end - 1;
        else
                last = INT_MAX;

        id = start;
        err = idr_alloc_u32(idr, ptr, &id, last, gfp);
        if (err)
                return err;

        return id;
}
EXPORT_SYMBOL_GPL(idr_alloc);

/**
 * idr_alloc_cyclic() - Allocate an ID cyclically.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range [@start, @end).  If @end is <= 0, it
 * is treated as one larger than %INT_MAX.  This allows callers to use
 * @start + N as @end as long as N is within integer range.
 * The search begins one past the ID returned by the previous successful
 * cyclic allocation (or at @start if that is larger) and wraps around to
 * @start if no free ID is found before reaching @end.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        int last = (end > 0) ? end - 1 : INT_MAX;
        u32 id = idr->idr_next;
        int ret;

        /* Never begin the search below the caller's lower bound. */
        if ((int)id < start)
                id = start;

        ret = idr_alloc_u32(idr, ptr, &id, last, gfp);
        if (ret == -ENOSPC && id > start) {
                /* Nothing free above the cursor: wrap and rescan from @start. */
                id = start;
                ret = idr_alloc_u32(idr, ptr, &id, last, gfp);
        }
        if (ret)
                return ret;

        /* Remember where the next cyclic search should begin. */
        idr->idr_next = id + 1;
        return id;
}
EXPORT_SYMBOL(idr_alloc_cyclic);

/**
 * idr_remove() - Remove an ID from the IDR.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Drops the association for @id.  If the ID was not previously in the IDR,
 * this function returns %NULL.
 *
 * Since this function modifies the IDR, the caller should provide their
 * own locking to ensure that concurrent modification of the same IDR is
 * not possible.
 *
 * Return: The pointer formerly associated with this ID.
 */
void *idr_remove(struct idr *idr, unsigned long id)
{
        /* The underlying tree is indexed relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_delete_item(&idr->idr_rt, index, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);

/**
 * idr_find() - Return pointer for given ID.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Looks up the pointer associated with this ID.  A %NULL result may mean
 * either that @id is not allocated or that the %NULL pointer was
 * associated with this ID.
 *
 * This function can be called under rcu_read_lock(), given that the leaf
 * pointers lifetimes are correctly managed.
 *
 * Return: The pointer associated with this ID.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
        /* The underlying tree is indexed relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_lookup(&idr->idr_rt, index);
}
EXPORT_SYMBOL_GPL(idr_find);

/**
 * idr_for_each() - Iterate through all stored pointers.
 * @idr: IDR handle.
 * @fn: Function to be called for each pointer.
 * @data: Data passed to callback function.
 *
 * Invokes @fn once per entry in @idr, handing it the ID, the entry and
 * @data.
 *
 * If @fn returns anything other than %0, the iteration stops and that
 * value is returned from this function.  The walk also stops (with a
 * one-time warning) at the first ID that is larger than %INT_MAX.
 *
 * idr_for_each() can be called concurrently with idr_alloc() and
 * idr_remove() if protected by RCU.  Newly added entries may not be
 * seen and deleted entries may be seen, but adding and removing entries
 * will not cause other entries to be skipped, nor spurious ones to be seen.
 *
 * Return: %0 if the walk finished, otherwise the first non-zero value
 * returned by @fn.
 */
int idr_for_each(const struct idr *idr,
                int (*fn)(int id, void *p, void *data), void *data)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        int offset = idr->idr_base;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
                unsigned long id = iter.index + offset;
                int rc;

                /* Such an ID cannot be handed to @fn as an int. */
                if (WARN_ON_ONCE(id > INT_MAX))
                        return 0;

                rc = fn(id, rcu_dereference_raw(*slot), data);
                if (rc != 0)
                        return rc;
        }

        return 0;
}
EXPORT_SYMBOL(idr_for_each);

/**
 * idr_get_next_ul() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 *
 * Return: The entry found, or %NULL if there is none; in the %NULL case
 * @nextid is left unmodified.
 */
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        void *entry = NULL;
        unsigned long base = idr->idr_base;
        unsigned long id = *nextid;

        /* Convert the ID to a tree index; IDs below the base start at 0. */
        id = (id < base) ? 0 : id - base;
        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) {
                entry = rcu_dereference_raw(*slot);
                /* Empty slot: keep walking. */
                if (!entry)
                        continue;
                /* Not an internal entry: this is the result. */
                if (!xa_is_internal(entry))
                        break;
                /*
                 * An internal entry also ends the walk unless it sits in
                 * the head slot or is a retry entry.
                 */
                if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry))
                        break;
                /* Otherwise have the iterator retry. */
                slot = radix_tree_iter_retry(&iter);
        }
        /* The walk ended without stopping on an entry. */
        if (!slot)
                return NULL;

        /* Translate the tree index back into an external ID. */
        *nextid = iter.index + base;
        return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);

/**
 * idr_get_next() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * int-sized front end to idr_get_next_ul().  Returns the next populated
 * entry in the tree with an ID greater than or equal to the value pointed
 * to by @nextid.  On exit, @nextid is updated to the ID of the found value.
 * To use in a loop, the value pointed to by nextid must be incremented by
 * the user.
 *
 * Return: The entry found, or %NULL.  %NULL is also returned, with a
 * one-time warning and without updating @nextid, when the resulting ID
 * is larger than %INT_MAX.
 */
void *idr_get_next(struct idr *idr, int *nextid)
{
        unsigned long wide_id = *nextid;
        void *found = idr_get_next_ul(idr, &wide_id);

        /* The ID has to fit back into the caller's int. */
        if (WARN_ON_ONCE(wide_id > INT_MAX))
                return NULL;

        *nextid = wide_id;
        return found;
}
EXPORT_SYMBOL(idr_get_next);

/**
 * idr_replace() - replace pointer for given ID.
 * @idr: IDR handle.
 * @ptr: New pointer to associate with the ID.
 * @id: ID to change.
 *
 * Replace the pointer registered with an ID and return the old value.
 * This function can be called under the RCU read lock concurrently with
 * idr_alloc() and idr_remove() (as long as the ID being removed is not
 * the one being replaced!).
 *
 * @ptr is stored as given; this function performs no validation of it.
 *
 * Returns: the old value on success.  ERR_PTR(%-ENOENT) indicates that
 * @id was not found.
 */
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
        unsigned long index = id - idr->idr_base;
        struct radix_tree_node *node;
        void __rcu **slot = NULL;
        void *old;

        old = __radix_tree_lookup(&idr->idr_rt, index, &node, &slot);

        /* No slot, or a slot still tagged free, means @id is not allocated. */
        if (!slot)
                return ERR_PTR(-ENOENT);
        if (radix_tree_tag_get(&idr->idr_rt, index, IDR_FREE))
                return ERR_PTR(-ENOENT);

        __radix_tree_replace(&idr->idr_rt, node, slot, ptr);

        return old;
}
EXPORT_SYMBOL(idr_replace);

/**
 * DOC: IDA description
 *
 * The IDA is an ID allocator which does not provide the ability to
 * associate an ID with a pointer.  As such, it only needs to store one
 * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
 * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
 * then initialise it using ida_init()).  To allocate a new ID, call
 * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
 * To free an ID, call ida_free().
 *
 * ida_destroy() can be used to dispose of an IDA without needing to
 * free the individual IDs in it.  You can use ida_is_empty() to find
 * out whether the IDA has any IDs currently allocated.
 *
 * The IDA handles its own locking.  It is safe to call any of the IDA
 * functions without synchronisation in your code.
 *
 * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
 * limitation, it should be quite straightforward to raise the maximum.
 */

/*
 * Developer's notes:
 *
 * The IDA uses the functionality provided by the XArray to store bitmaps in
 * each entry.  The XA_FREE_MARK is only cleared when all bits in the bitmap
 * have been set.
 *
 * I considered telling the XArray that each slot is an order-10 node
 * and indexing by bit number, but the XArray can't allow a single multi-index
 * entry in the head, which would significantly increase memory consumption
 * for the IDA.  So instead we divide the index by the number of bits in the
 * leaf bitmap before doing a radix tree lookup.
 *
 * As an optimisation, if there are only a few low bits set in any given
 * leaf, instead of allocating a 128-byte bitmap, we store the bits
 * as a value entry.  Value entries never have the XA_FREE_MARK cleared
 * because we can always convert them into a bitmap entry.
 *
 * It would be possible to optimise further; once we've run out of a
 * single 128-byte bitmap, we currently switch to a 576-byte node, put
 * the 128-byte bitmap in the first entry and then start allocating extra
 * 128-byte entries.  We could instead use the 512 bytes of the node's
 * data as a bitmap before moving to that scheme.  I do not believe this
 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
 * users of the IDA and almost none of them use more than 1024 entries.
 * Those that do use more than the 8192 IDs that the 512 bytes would
 * provide.
 *
 * The IDA always uses a lock to alloc/free.  If we add a 'test_bit'
 * equivalent, it will still need locking.  Going to RCU lookup would require
 * using RCU to free bitmaps, and that's not trivial without embedding an
 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
 * bitmap, which is excessive.
 */

/**
 * ida_alloc_range() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and @max, inclusive.  The allocated ID will
 * not exceed %INT_MAX, even if @max is larger.
 *
 * Each XArray entry tracks %IDA_BITMAP_BITS consecutive IDs, either as a
 * value entry holding a few low bits inline or as a pointer to a
 * &struct ida_bitmap.  Memory that cannot be obtained while the lock is
 * held is allocated after dropping it, and the search is then restarted
 * from @min.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                        gfp_t gfp)
{
        /* Start at the entry containing @min; @bit is the offset inside it. */
        XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
        unsigned bit = min % IDA_BITMAP_BITS;
        unsigned long flags;
        /* @alloc: bitmap preallocated outside the lock, if any. */
        struct ida_bitmap *bitmap, *alloc = NULL;

        /* A @min above INT_MAX can never be satisfied. */
        if ((int)min < 0)
                return -ENOSPC;

        /* A @max above INT_MAX is clamped to INT_MAX. */
        if ((int)max < 0)
                max = INT_MAX;

retry:
        xas_lock_irqsave(&xas, flags);
next:
        /* Look for an entry carrying XA_FREE_MARK, up to the one holding @max. */
        bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
        /* Past the entry containing @min, every bit from 0 is acceptable. */
        if (xas.xa_index > min / IDA_BITMAP_BITS)
                bit = 0;
        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                goto nospc;

        if (xa_is_value(bitmap)) {
                /* The bits for this entry are stored inline in the entry. */
                unsigned long tmp = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE) {
                        bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
                        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                                goto nospc;
                        if (bit < BITS_PER_XA_VALUE) {
                                /* A free bit fits in the value entry: done. */
                                tmp |= 1UL << bit;
                                xas_store(&xas, xa_mk_value(tmp));
                                goto out;
                        }
                }
                /*
                 * The value entry cannot hold the bit we need; replace it
                 * with a full bitmap.  Use the preallocated one if present,
                 * else try a GFP_NOWAIT allocation while holding the lock.
                 */
                bitmap = alloc;
                if (!bitmap)
                        bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                if (!bitmap)
                        goto alloc;
                /* Carry the inline bits over into the first bitmap word. */
                bitmap->bitmap[0] = tmp;
                xas_store(&xas, bitmap);
                if (xas_error(&xas)) {
                        /* Store failed: clear the word copied in above. */
                        bitmap->bitmap[0] = 0;
                        goto out;
                }
        }

        if (bitmap) {
                /* Existing (or freshly converted) bitmap: claim a zero bit. */
                bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
                if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                        goto nospc;
                /* No zero bit at or after @bit here: try the next entry. */
                if (bit == IDA_BITMAP_BITS)
                        goto next;

                __set_bit(bit, bitmap->bitmap);
                /* A completely full bitmap no longer counts as free. */
                if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } else {
                /* No entry at this index yet: create one. */
                if (bit < BITS_PER_XA_VALUE) {
                        bitmap = xa_mk_value(1UL << bit);
                } else {
                        bitmap = alloc;
                        if (!bitmap)
                                bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                        if (!bitmap)
                                goto alloc;
                        __set_bit(bit, bitmap->bitmap);
                }
                /*
                 * NOTE(review): if this store fails and xas_nomem() below
                 * triggers a retry, a bitmap obtained from the GFP_NOWAIT
                 * kzalloc() above is no longer referenced by any local --
                 * confirm whether that path can leak it.
                 */
                xas_store(&xas, bitmap);
        }
out:
        xas_unlock_irqrestore(&xas, flags);
        /* If xas_nomem() asks for another attempt, restart from @min. */
        if (xas_nomem(&xas, gfp)) {
                xas.xa_index = min / IDA_BITMAP_BITS;
                bit = min % IDA_BITMAP_BITS;
                goto retry;
        }
        /* The preallocated bitmap was not the one stored: release it. */
        if (bitmap != alloc)
                kfree(alloc);
        if (xas_error(&xas))
                return xas_error(&xas);
        return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
        /* Drop the lock, allocate with the caller's @gfp, and start over. */
        xas_unlock_irqrestore(&xas, flags);
        alloc = kzalloc(sizeof(*bitmap), gfp);
        if (!alloc)
                return -ENOMEM;
        xas_set(&xas, min / IDA_BITMAP_BITS);
        bit = min % IDA_BITMAP_BITS;
        goto retry;
nospc:
        xas_unlock_irqrestore(&xas, flags);
        kfree(alloc);
        return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);

/**
 * ida_free() - Release an allocated ID.
 * @ida: IDA handle.
 * @id: Previously allocated ID.
 *
 * An @id larger than %INT_MAX is ignored silently.  Releasing an ID that
 * is not currently allocated triggers a WARN and changes nothing.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_free(struct ida *ida, unsigned int id)
{
        /* Entry that tracks @id, and @id's bit position inside it. */
        XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
        unsigned bit = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
        unsigned long flags;

        if ((int)id < 0)
                return;

        xas_lock_irqsave(&xas, flags);
        bitmap = xas_load(&xas);

        if (xa_is_value(bitmap)) {
                /* The bits for this entry are stored inline in the entry. */
                unsigned long v = xa_to_value(bitmap);
                /* A value entry cannot have this bit, so it was never set. */
                if (bit >= BITS_PER_XA_VALUE)
                        goto err;
                if (!(v & (1UL << bit)))
                        goto err;
                v &= ~(1UL << bit);
                /* Last bit gone: erase the entry (label is in the else arm). */
                if (!v)
                        goto delete;
                xas_store(&xas, xa_mk_value(v));
        } else {
                /* No entry at all, or the bit is clear: not allocated. */
                if (!bitmap || !test_bit(bit, bitmap->bitmap))
                        goto err;
                __clear_bit(bit, bitmap->bitmap);
                /* At least one bit is free now, so mark the entry free. */
                xas_set_mark(&xas, XA_FREE_MARK);
                if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
                        kfree(bitmap);
delete:
                        xas_store(&xas, NULL);
                }
        }
        xas_unlock_irqrestore(&xas, flags);
        return;
 err:
        xas_unlock_irqrestore(&xas, flags);
        WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);

/**
 * ida_destroy() - Free all IDs.
 * @ida: IDA handle.
 *
 * Releases every ID together with all resources the IDA holds.  When this
 * call returns, the IDA is empty and can be reused or freed.  If the IDA
 * is already empty, there is no need to call this function.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_destroy(struct ida *ida)
{
        XA_STATE(xas, &ida->xa, 0);
        struct ida_bitmap *entry;
        unsigned long irqflags;

        xas_lock_irqsave(&xas, irqflags);
        xas_for_each(&xas, entry, ULONG_MAX) {
                /* Value entries are not separate allocations; bitmaps are. */
                if (!xa_is_value(entry))
                        kfree(entry);
                xas_store(&xas, NULL);
        }
        xas_unlock_irqrestore(&xas, irqflags);
}
EXPORT_SYMBOL(ida_destroy);

/* Dump helpers below are only compiled when __KERNEL__ is not defined. */
#ifndef __KERNEL__
/* Defined elsewhere; prints an index range for the given shift. TODO confirm */
extern void xa_dump_index(unsigned long index, unsigned int shift);
/* log2 of the number of IDs tracked by one IDA bitmap. */
#define IDA_CHUNK_SHIFT                ilog2(IDA_BITMAP_BITS)

/*
 * ida_dump_entry() - print one entry of an IDA's tree.
 * @entry: the entry to print; %NULL entries are skipped.
 * @index: XArray index of @entry (multiplied by IDA_BITMAP_BITS for display).
 *
 * Interior nodes are printed and then descended into recursively; value
 * entries and bitmaps are printed as raw words.
 */
static void ida_dump_entry(void *entry, unsigned long index)
{
        unsigned long i;
        struct ida_bitmap *bitmap;

        if (!entry)
                return;

        if (xa_is_node(entry)) {
                struct xa_node *node = xa_to_node(entry);
                unsigned int shift;

                shift = node->shift + IDA_CHUNK_SHIFT + XA_CHUNK_SHIFT;
                xa_dump_index(index * IDA_BITMAP_BITS, shift);
                xa_dump_node(node);

                /* Recurse into every slot, extending the index as we go. */
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        unsigned long child = index | (i << node->shift);

                        ida_dump_entry(node->slots[i], child);
                }
                return;
        }

        if (xa_is_value(entry)) {
                xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
                pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
                return;
        }

        /* Anything else is treated as a pointer to a full bitmap. */
        bitmap = entry;
        xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
        pr_cont("bitmap: %p data", bitmap);
        for (i = 0; i < IDA_BITMAP_LONGS; i++)
                pr_cont(" %lx", bitmap->bitmap[i]);
        pr_cont("\n");
}

/*
 * ida_dump() - print a summary line for @ida, then dump its whole tree
 * starting from the head entry.
 */
static void ida_dump(struct ida *ida)
{
        pr_debug("ida: %p node %p free %d\n", ida, ida->xa.xa_head,
                 ida->xa.xa_flags >> ROOT_TAG_SHIFT);
        ida_dump_entry(ida->xa.xa_head, 0);
}
#endif


















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_SWAB_H
#define _UAPI_LINUX_SWAB_H

#include <linux/types.h>
#include <linux/stddef.h>
#include <asm/bitsperlong.h>
#include <asm/swab.h>

/*
 * casts are necessary for constants, because we never know for sure
 * how U/UL/ULL map to __u16, __u32, __u64. At least not in a portable way.
 */
/*
 * ___constant_swab16 - reverse the two bytes of a 16-bit value.
 * Pure expression; @x appears more than once in the expansion.
 */
#define ___constant_swab16(x) ((__u16)(                                \
        (((__u16)(x) & (__u16)0x00ffU) << 8) |                        \
        (((__u16)(x) & (__u16)0xff00U) >> 8)))

/*
 * ___constant_swab32 - reverse the four bytes of a 32-bit value.
 * Pure expression; @x appears more than once in the expansion.
 */
#define ___constant_swab32(x) ((__u32)(                                \
        (((__u32)(x) & (__u32)0x000000ffUL) << 24) |                \
        (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) |                \
        (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) |                \
        (((__u32)(x) & (__u32)0xff000000UL) >> 24)))

/*
 * ___constant_swab64 - reverse the eight bytes of a 64-bit value.
 * Pure expression; @x appears more than once in the expansion.
 */
#define ___constant_swab64(x) ((__u64)(                                \
        (((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) |        \
        (((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) |        \
        (((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) |        \
        (((__u64)(x) & (__u64)0x00000000ff000000ULL) <<  8) |        \
        (((__u64)(x) & (__u64)0x000000ff00000000ULL) >>  8) |        \
        (((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) |        \
        (((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) |        \
        (((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56)))

/*
 * ___constant_swahw32 - exchange the two 16-bit halves of a 32-bit value.
 */
#define ___constant_swahw32(x) ((__u32)(                        \
        (((__u32)(x) & (__u32)0x0000ffffUL) << 16) |                \
        (((__u32)(x) & (__u32)0xffff0000UL) >> 16)))

/*
 * ___constant_swahb32 - swap the two bytes inside each 16-bit half of a
 * 32-bit value; the halves themselves stay in place.
 */
#define ___constant_swahb32(x) ((__u32)(                        \
        (((__u32)(x) & (__u32)0x00ff00ffUL) << 8) |                \
        (((__u32)(x) & (__u32)0xff00ff00UL) >> 8)))

/*
 * Implement the following as inlines, but define the interface using
 * macros to allow constant folding when possible:
 * ___swab16, ___swab32, ___swab64, ___swahw32, ___swahb32
 */

/*
 * __fswab16 - function form of the 16-bit byteswap.
 *
 * Uses the architecture's __arch_swab16() when it is defined, otherwise
 * the generic ___constant_swab16() expression.
 */
static inline __attribute_const__ __u16 __fswab16(__u16 val)
{
#ifndef __arch_swab16
        return ___constant_swab16(val);
#else
        return __arch_swab16(val);
#endif
}

/*
 * __fswab32 - function form of the 32-bit byteswap.
 *
 * Uses the architecture's __arch_swab32() when it is defined, otherwise
 * the generic ___constant_swab32() expression.
 */
static inline __attribute_const__ __u32 __fswab32(__u32 val)
{
#ifndef __arch_swab32
        return ___constant_swab32(val);
#else
        return __arch_swab32(val);
#endif
}

/*
 * __fswab64 - function form of the 64-bit byteswap.
 *
 * Preference order: the architecture's __arch_swab64() if defined; else,
 * when __SWAB_64_THRU_32__ is defined, two 32-bit swaps combined; else
 * the generic ___constant_swab64() expression.
 */
static inline __attribute_const__ __u64 __fswab64(__u64 val)
{
#if defined(__arch_swab64)
        return __arch_swab64(val);
#elif !defined(__SWAB_64_THRU_32__)
        return ___constant_swab64(val);
#else
        /* Byteswap each 32-bit half, then exchange the halves. */
        __u32 hi = val >> 32;
        __u32 lo = val & ((1ULL << 32) - 1);

        return ((__u64)(__fswab32(hi))) | (((__u64)__fswab32(lo)) << 32);
#endif
}

/*
 * __fswahw32 - function form of the 32-bit halfword swap.
 *
 * Uses the architecture's __arch_swahw32() when it is defined, otherwise
 * the generic ___constant_swahw32() expression.
 */
static inline __attribute_const__ __u32 __fswahw32(__u32 val)
{
#ifndef __arch_swahw32
        return ___constant_swahw32(val);
#else
        return __arch_swahw32(val);
#endif
}

/*
 * __fswahb32 - function form of the per-halfword byteswap.
 *
 * Uses the architecture's __arch_swahb32() when it is defined, otherwise
 * the generic ___constant_swahb32() expression.
 */
static inline __attribute_const__ __u32 __fswahb32(__u32 val)
{
#ifndef __arch_swahb32
        return ___constant_swahb32(val);
#else
        return __arch_swahb32(val);
#endif
}

/**
 * __swab16 - return a byteswapped 16-bit value
 * @x: value to byteswap
 *
 * With __HAVE_BUILTIN_BSWAP16__ defined this expands to the compiler's
 * __builtin_bswap16().  Otherwise compile-time constant arguments are
 * folded through ___constant_swab16() and all other arguments are passed
 * to __fswab16().
 */
#ifdef __HAVE_BUILTIN_BSWAP16__
#define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
#else
#define __swab16(x)                                \
        (__builtin_constant_p((__u16)(x)) ?        \
        ___constant_swab16(x) :                        \
        __fswab16(x))
#endif

/**
 * __swab32 - return a byteswapped 32-bit value
 * @x: value to byteswap
 *
 * With __HAVE_BUILTIN_BSWAP32__ defined this expands to the compiler's
 * __builtin_bswap32().  Otherwise compile-time constant arguments are
 * folded through ___constant_swab32() and all other arguments are passed
 * to __fswab32().
 */
#ifdef __HAVE_BUILTIN_BSWAP32__
#define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
#else
#define __swab32(x)                                \
        (__builtin_constant_p((__u32)(x)) ?        \
        ___constant_swab32(x) :                        \
        __fswab32(x))
#endif

/**
 * __swab64 - return a byteswapped 64-bit value
 * @x: value to byteswap
 *
 * With __HAVE_BUILTIN_BSWAP64__ defined this expands to the compiler's
 * __builtin_bswap64().  Otherwise compile-time constant arguments are
 * folded through ___constant_swab64() and all other arguments are passed
 * to __fswab64().
 */
#ifdef __HAVE_BUILTIN_BSWAP64__
#define __swab64(x) (__u64)__builtin_bswap64((__u64)(x))
#else
#define __swab64(x)                                \
        (__builtin_constant_p((__u64)(x)) ?        \
        ___constant_swab64(x) :                        \
        __fswab64(x))
#endif

/*
 * __swab - byteswap an unsigned long.
 *
 * Selects the 64-bit swap when __BITS_PER_LONG is 64 and the 32-bit swap
 * otherwise.
 */
static __always_inline unsigned long __swab(const unsigned long y)
{
#if __BITS_PER_LONG != 64
        /* 32-bit long */
        return __swab32(y);
#else
        return __swab64(y);
#endif
}

/**
 * __swahw32 - return a word-swapped 32-bit value
 * @x: value to wordswap
 *
 * __swahw32(0x12340000) is 0x00001234
 *
 * Compile-time constant arguments are folded through
 * ___constant_swahw32(); all other arguments are passed to __fswahw32().
 */
#define __swahw32(x)                                \
        (__builtin_constant_p((__u32)(x)) ?        \
        ___constant_swahw32(x) :                \
        __fswahw32(x))

/**
 * __swahb32 - return a high and low byte-swapped 32-bit value
 * @x: value to byteswap
 *
 * __swahb32(0x12345678) is 0x34127856
 *
 * Compile-time constant arguments are folded through
 * ___constant_swahb32(); all other arguments are passed to __fswahb32().
 */
#define __swahb32(x)                                \
        (__builtin_constant_p((__u32)(x)) ?        \
        ___constant_swahb32(x) :                \
        __fswahb32(x))

/**
 * __swab16p - return a byteswapped 16-bit value from a pointer
 * @p: pointer to a naturally-aligned 16-bit value
 *
 * Defers to __arch_swab16p() when the architecture defines it; otherwise
 * loads *@p and byteswaps the value with __swab16().
 */
static __always_inline __u16 __swab16p(const __u16 *p)
{
#ifndef __arch_swab16p
        return __swab16(*p);
#else
        return __arch_swab16p(p);
#endif
}

/**
 * __swab32p - return a byteswapped 32-bit value from a pointer
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Defers to __arch_swab32p() when the architecture supplies one; otherwise
 * loads *@p and swaps it with __swab32().
 */
static __always_inline __u32 __swab32p(const __u32 *p)
{
#ifndef __arch_swab32p
        const __u32 raw = *p;

        return __swab32(raw);
#else
        return __arch_swab32p(p);
#endif
}

/**
 * __swab64p - return a byteswapped 64-bit value from a pointer
 * @p: pointer to a naturally-aligned 64-bit value
 *
 * Defers to __arch_swab64p() when the architecture supplies one; otherwise
 * loads *@p and swaps it with __swab64().
 */
static __always_inline __u64 __swab64p(const __u64 *p)
{
#ifndef __arch_swab64p
        const __u64 raw = *p;

        return __swab64(raw);
#else
        return __arch_swab64p(p);
#endif
}

/**
 * __swahw32p - return a wordswapped 32-bit value from a pointer
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Defers to __arch_swahw32p() when the architecture supplies one; otherwise
 * loads *@p and word-swaps it.  See __swahw32() for details of wordswapping.
 */
static inline __u32 __swahw32p(const __u32 *p)
{
#ifndef __arch_swahw32p
        const __u32 raw = *p;

        return __swahw32(raw);
#else
        return __arch_swahw32p(p);
#endif
}

/**
 * __swahb32p - return a high and low byteswapped 32-bit value from a pointer
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Defers to __arch_swahb32p() when the architecture supplies one; otherwise
 * loads *@p and swaps it.  See __swahb32() for details of high/low
 * byteswapping.
 */
static inline __u32 __swahb32p(const __u32 *p)
{
#ifndef __arch_swahb32p
        const __u32 raw = *p;

        return __swahb32(raw);
#else
        return __arch_swahb32p(p);
#endif
}

/**
 * __swab16s - byteswap a 16-bit value in-place
 * @p: pointer to a naturally-aligned 16-bit value
 */
static inline void __swab16s(__u16 *p)
{
#ifdef __arch_swab16s
        __arch_swab16s(p);
#else
        *p = __swab16p(p);
#endif
}
/**
 * __swab32s - byteswap a 32-bit value in-place
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Uses __arch_swab32s() when the architecture supplies one; otherwise the
 * swapped value comes from __swab32p() and is written back to *@p.
 */
static __always_inline void __swab32s(__u32 *p)
{
#ifndef __arch_swab32s
        const __u32 swapped = __swab32p(p);

        *p = swapped;
#else
        __arch_swab32s(p);
#endif
}

/**
 * __swab64s - byteswap a 64-bit value in-place
 * @p: pointer to a naturally-aligned 64-bit value
 *
 * Uses __arch_swab64s() when the architecture supplies one; otherwise the
 * swapped value comes from __swab64p() and is written back to *@p.
 */
static __always_inline void __swab64s(__u64 *p)
{
#ifndef __arch_swab64s
        const __u64 swapped = __swab64p(p);

        *p = swapped;
#else
        __arch_swab64s(p);
#endif
}

/**
 * __swahw32s - wordswap a 32-bit value in-place
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Uses __arch_swahw32s() when the architecture supplies one; otherwise the
 * swapped value comes from __swahw32p() and is written back to *@p.
 * See __swahw32() for details of wordswapping.
 */
static inline void __swahw32s(__u32 *p)
{
#ifndef __arch_swahw32s
        const __u32 swapped = __swahw32p(p);

        *p = swapped;
#else
        __arch_swahw32s(p);
#endif
}

/**
 * __swahb32s - high and low byteswap a 32-bit value in-place
 * @p: pointer to a naturally-aligned 32-bit value
 *
 * Uses __arch_swahb32s() when the architecture supplies one; otherwise the
 * swapped value comes from __swahb32p() and is written back to *@p.
 * See __swahb32() for details of high and low byte swapping.
 */
static inline void __swahb32s(__u32 *p)
{
#ifndef __arch_swahb32s
        const __u32 swapped = __swahb32p(p);

        *p = swapped;
#else
        __arch_swahb32s(p);
#endif
}


#endif /* _UAPI_LINUX_SWAB_H */










































































   12 





































































































































































































































   12 




   12 

























































































































































































































































































































































































































   12 
















































   12 


























































    9 

















    9 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/cpumask.h>

/*
 * Comparisons built on modular subtraction: the operands are subtracted
 * and the difference is tested against half of the unsigned range.
 *
 * ulong2long() reinterprets the storage of the unsigned long lvalue @a
 * as a long.
 */
#define ULONG_CMP_GE(a, b)        ((a) - (b) <= ULONG_MAX / 2)
#define ULONG_CMP_LT(a, b)        ((a) - (b) > ULONG_MAX / 2)
#define ulong2long(a)                (*(long *)(&(a)))
#define USHORT_CMP_GE(a, b)        ((unsigned short)((a) - (b)) <= USHRT_MAX / 2)
#define USHORT_CMP_LT(a, b)        ((unsigned short)((a) - (b)) > USHRT_MAX / 2)

/*
 * Exported common interfaces
 *
 * Only the prototypes are visible here; the definitions live elsewhere.
 * NOTE(review): call_rcu() presumably arranges for @func to be invoked on
 * @head after a grace period, and synchronize_rcu() presumably waits for
 * one to elapse -- confirm against the implementation.
 */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);

#ifndef CONFIG_PREEMPT_RCU

/* Under TINY_RCU the strict-unlock hook expands to an empty statement. */
#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/* Without PREEMPT_RCU, entering a reader only disables preemption. */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/* Re-enable preemption first, then run the strict-unlock hook. */
static inline void __rcu_read_unlock(void)
{
        preempt_enable();
        rcu_read_unlock_strict();
}

/* This configuration always reports a nesting depth of zero. */
static inline int rcu_preempt_depth(void)
{
        return 0;
}

#else /* #ifndef CONFIG_PREEMPT_RCU */

void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Kept as a macro because this very low level header is included from
 * places that do not even know about current.  It yields the
 * rcu_read_lock() nesting depth, which is meaningful only with
 * CONFIG_PREEMPT_RCU -- in other kernel builds the nesting depth is
 * unknowable.
 */
#define rcu_preempt_depth() (current->rcu_read_lock_nesting)

#endif /* #else #ifndef CONFIG_PREEMPT_RCU */

/*
 * Internal to kernel
 *
 * Declarations only; the definitions are not in this header.
 * rcu_scheduler_active is tagged __read_mostly.
 */
void rcu_init(void);
extern int rcu_scheduler_active __read_mostly;
void rcu_sched_clock_irq(int user);
void rcu_report_dead(unsigned int cpu);
void rcutree_migrate_callbacks(int cpu);

/*
 * rcu_init_tasks_generic() is an out-of-line function only when
 * CONFIG_TASKS_RCU_GENERIC is set; otherwise it is an empty inline stub.
 */
#ifndef CONFIG_TASKS_RCU_GENERIC
static inline void rcu_init_tasks_generic(void)
{
}
#else
void rcu_init_tasks_generic(void);
#endif

/*
 * The sysrq start/end hooks are out-of-line functions only when
 * CONFIG_RCU_STALL_COMMON is set; otherwise they are empty inline stubs.
 */
#ifndef CONFIG_RCU_STALL_COMMON
static inline void rcu_sysrq_start(void)
{
}

static inline void rcu_sysrq_end(void)
{
}
#else /* #ifndef CONFIG_RCU_STALL_COMMON */
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#endif /* #else #ifndef CONFIG_RCU_STALL_COMMON */

/*
 * The user enter/exit hooks are out-of-line functions only when
 * CONFIG_NO_HZ_FULL is set; otherwise they are empty inline stubs.
 */
#ifndef CONFIG_NO_HZ_FULL
static inline void rcu_user_enter(void)
{
}

static inline void rcu_user_exit(void)
{
}
#else /* CONFIG_NO_HZ_FULL */
void rcu_user_enter(void);
void rcu_user_exit(void);
#endif /* CONFIG_NO_HZ_FULL */

/*
 * The no-CBs hooks are out-of-line functions only when CONFIG_RCU_NOCB_CPU
 * is set; otherwise they are empty inline stubs.
 */
#ifndef CONFIG_RCU_NOCB_CPU
static inline void rcu_init_nohz(void)
{
}

static inline void rcu_nocb_flush_deferred_wakeup(void)
{
}
#else /* #ifndef CONFIG_RCU_NOCB_CPU */
void rcu_init_nohz(void);
void rcu_nocb_flush_deferred_wakeup(void);
#endif /* #else #ifndef CONFIG_RCU_NOCB_CPU */

/**
 * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
 * @a: Code that RCU needs to pay attention to.
 *
 * RCU read-side critical sections are forbidden in the inner idle loop,
 * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU
 * will happily ignore any such read-side critical sections.  However,
 * things like powertop need tracepoints in the inner idle loop.
 *
 * This macro provides the way out:  RCU_NONIDLE(do_something_with_RCU())
 * will tell RCU that it needs to pay attention, invoke its argument
 * (in this example, calling the do_something_with_RCU() function),
 * and then tell RCU to go back to ignoring this CPU.  It is permissible
 * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is
 * on the order of a million or so, even on 32-bit systems).  It is
 * not legal to block within RCU_NONIDLE(), nor is it permissible to
 * transfer control either into or out of RCU_NONIDLE()'s statement.
 *
 * The argument is expanded inside its own do { } while (0), bracketed by
 * rcu_irq_enter_irqson() and rcu_irq_exit_irqson(), so a stray "break" or
 * "continue" in @a leaves only that inner loop and the exit call still runs.
 */
#define RCU_NONIDLE(a) \
        do { \
                rcu_irq_enter_irqson(); \
                do { a; } while (0); \
                rcu_irq_exit_irqson(); \
        } while (0)

/*
 * Note a quasi-voluntary context switch for RCU-tasks's benefit.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

# ifdef CONFIG_TASKS_RCU
/*
 * Clear @t's rcu_tasks_holdout flag, but only when @preempt is false and
 * the flag currently reads as set, so no store is issued when the flag is
 * already clear.
 */
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
# else
/*
 * Without CONFIG_TASKS_RCU the quiescent-state hook is empty and the
 * Tasks-RCU update-side names are aliases for call_rcu() and
 * synchronize_rcu().
 */
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
/*
 * If @t's trc_reader_checked and trc_reader_nesting both read as zero,
 * set trc_reader_checked to true with a store-release and follow it with
 * a full memory barrier.  Expands to nothing without
 * CONFIG_TASKS_TRACE_RCU.
 */
# define rcu_tasks_trace_qs(t)                                                \
        do {                                                                \
                if (!likely(READ_ONCE((t)->trc_reader_checked)) &&        \
                    !unlikely(READ_ONCE((t)->trc_reader_nesting))) {        \
                        smp_store_release(&(t)->trc_reader_checked, true); \
                        smp_mb(); /* Readers partitioned by store. */        \
                }                                                        \
        } while (0)
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/*
 * Run both Tasks-RCU quiescent-state hooks for task @t: the classic one
 * (which also looks at @preempt) followed by the trace one.
 */
#define rcu_tasks_qs(t, preempt)                                        \
        do {                                                                \
                rcu_tasks_classic_qs(t, preempt);                        \
                rcu_tasks_trace_qs(t);                                        \
        } while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks_rude(void);
# endif

#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_stop(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
/*
 * Tasks-RCU is configured out: the quiescent-state hooks are empty
 * statements, the update-side names alias call_rcu()/synchronize_rcu(),
 * and the task-exit hooks are empty inline stubs.
 */
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu

static inline void exit_tasks_rcu_start(void)
{
}

static inline void exit_tasks_rcu_stop(void)
{
}

static inline void exit_tasks_rcu_finish(void)
{
}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * Today an RCU Tasks Trace grace period happens to double as an RCU grace
 * period, but only as an accident of implementation that could change at
 * any time.  Code that leans on this accident must call this function to
 * check that it still holds.
 *
 * You have been warned!
 *
 * Return: always true in this implementation.
 */
static inline bool rcu_trace_implies_rcu_gp(void)
{
        return true;
}

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * Works like cond_resched(), with one addition: a Tasks-RCU quiescent
 * state is reported for current before cond_resched() is called.  The
 * report therefore still happens even if the cond_resched() machinery
 * were to be shut off, as some advocate for PREEMPTION kernels.
 */
#define cond_resched_tasks_rcu_qs()                        \
        do {                                                \
                rcu_tasks_qs(current, false);                \
                cond_resched();                                \
        } while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument: it is reset to the current jiffies value each
 * time rcu_softirq_qs() is invoked.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.  The macro itself disables preemption only around
 * the rcu_softirq_qs() call.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. By contrast, calling only cond_resched() won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * Debug-object hooks for rcu_head structures.
 *
 * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() cover dynamic
 * initialization and destruction of an rcu_head that lives on the stack;
 * init_rcu_head()/destroy_rcu_head() do the same for statically allocated
 * rcu_head structures.  An rcu_head allocated dynamically from the heap
 * needs no initialization at all.
 *
 * Without CONFIG_DEBUG_OBJECTS_RCU_HEAD all four are empty inline stubs.
 */
#ifndef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static inline void init_rcu_head(struct rcu_head *head)
{
}

static inline void destroy_rcu_head(struct rcu_head *head)
{
}

static inline void init_rcu_head_on_stack(struct rcu_head *head)
{
}

static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
{
}
#else /* CONFIG_DEBUG_OBJECTS_RCU_HEAD */
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#endif        /* #else #ifndef CONFIG_DEBUG_OBJECTS_RCU_HEAD */

/*
 * rcu_lockdep_current_cpu_online() is a real out-of-line check only when
 * both CONFIG_HOTPLUG_CPU and CONFIG_PROVE_RCU are set.  In every other
 * configuration it is an inline stub that returns true.
 */
#if !defined(CONFIG_HOTPLUG_CPU) || !defined(CONFIG_PROVE_RCU)
static inline bool rcu_lockdep_current_cpu_online(void)
{
        return true;
}
#else /* CONFIG_HOTPLUG_CPU && CONFIG_PROVE_RCU */
bool rcu_lockdep_current_cpu_online(void);
#endif /* #else CONFIG_HOTPLUG_CPU && CONFIG_PROVE_RCU */

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that @map has been acquired, tagging the event with the
 * current instruction pointer (_THIS_IP_).
 * NOTE(review): the literal arguments (0, 0, 2, 0, NULL) are presumably
 * subclass, trylock, read, check and nest_lock -- confirm against the
 * lock_acquire() prototype.
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/*
 * Tell lockdep that @map has been released, tagging the event with the
 * current instruction pointer (_THIS_IP_).
 */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

/*
 * Lockdep maps and the lockdep-backed "is a read-side lock held?" queries.
 * These are declarations only; with CONFIG_DEBUG_LOCK_ALLOC the definitions
 * live elsewhere.
 */
extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;
int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/* Without CONFIG_DEBUG_LOCK_ALLOC the lockdep annotations are empty. */
# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/*
 * Stub used without CONFIG_DEBUG_LOCK_ALLOC: nothing is tracked in this
 * configuration, so the answer is unconditionally 1.
 */
static inline int rcu_read_lock_held(void)
{
        return 1;
}

/*
 * Stub used without CONFIG_DEBUG_LOCK_ALLOC: nothing is tracked in this
 * configuration, so the answer is unconditionally 1.
 */
static inline int rcu_read_lock_bh_held(void)
{
        return 1;
}

/*
 * Stub used without CONFIG_DEBUG_LOCK_ALLOC: reports 1 exactly when the
 * current context is not preemptible, 0 otherwise.
 */
static inline int rcu_read_lock_sched_held(void)
{
        if (preemptible())
                return 0;

        return 1;
}

/*
 * Stub used without CONFIG_DEBUG_LOCK_ALLOC: reports 1 exactly when the
 * current context is not preemptible, 0 otherwise.
 */
static inline int rcu_read_lock_any_held(void)
{
        if (preemptible())
                return 0;

        return 1;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 *
 * Each expansion carries its own static __warned flag, which is set before
 * lockdep_rcu_suspicious() is called, so a given call site reports at most
 * once.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data.unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

#if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
/*
 * Emit a lockdep warning if rcu_lock_map is held at the point of the call.
 * Compiled in only for CONFIG_PROVE_RCU kernels without CONFIG_PREEMPT_RCU;
 * otherwise this is an empty stub.
 */
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else /* #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) */
static inline void rcu_preempt_sleep_check(void) { }
#endif /* #else #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) */

/*
 * Warn about a context switch inside a read-side critical section: run
 * rcu_preempt_sleep_check(), then emit a lockdep warning if either
 * rcu_bh_lock_map or rcu_sched_lock_map is held.
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

#else /* #ifdef CONFIG_PROVE_RCU */

/* Without CONFIG_PROVE_RCU neither check expands to any code. */
#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
#define rcu_sleep_check() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointer markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

/*
 * rcu_check_sparse() only has a body under __CHECKER__, where it compares
 * @p with itself cast to the @space address space and discards the result
 * -- presumably so that sparse complains when @p is not annotated with
 * @space.  In regular builds it expands to nothing.
 */
#ifdef __CHECKER__
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 *
 * @p is read with a plain access (no READ_ONCE()) into a local, and the
 * sparse check verifies that @p carries the __rcu annotation.
 */
#define unrcu_pointer(p)                                                \
({                                                                        \
        typeof(*p) *_________p1 = (typeof(*p) *__force)(p);                \
        rcu_check_sparse(p, __rcu);                                         \
        ((typeof(*p) __force __kernel *)(_________p1));                 \
})

/*
 * Load @p once with READ_ONCE(), sparse-check that it is annotated with
 * @space, and yield the loaded value as a plain __kernel pointer.  No
 * lockdep check is made.
 */
#define __rcu_access_pointer(p, space) \
({ \
        typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(_________p1)); \
})
/*
 * Load @p once with READ_ONCE(), emit a lockdep warning if condition @c
 * is false, sparse-check that @p is annotated with @space, and yield the
 * loaded value as a plain __kernel pointer.
 */
#define __rcu_dereference_check(p, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(________p1)); \
})
/*
 * Emit a lockdep warning if condition @c is false, sparse-check that @p is
 * annotated with @space, and yield @p itself as a plain __kernel pointer.
 * Unlike __rcu_dereference_check(), @p is read without READ_ONCE().
 */
#define __rcu_dereference_protected(p, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
/*
 * Load @p once with READ_ONCE() and yield it as a plain __kernel pointer,
 * with neither a lockdep check nor a sparse address-space check.
 */
#define rcu_dereference_raw(p) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) ________p1 = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(________p1)); \
})

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 *
 * Force-casts @v to an __rcu-annotated pointer to typeof(*(v)).
 */
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 *
 * A compile-time-constant NULL value is stored with a plain WRITE_ONCE();
 * every other value is published with smp_store_release().
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - replace an RCU pointer, returning its old value
 * @rcu_ptr: RCU pointer, whose old value is returned
 * @ptr: regular pointer
 * @c: the lockdep conditions under which the dereference will take place
 *
 * Perform a replacement, where @rcu_ptr is an RCU-annotated
 * pointer and @c is the lockdep argument that is passed to the
 * rcu_dereference_protected() call used to read that pointer.  The old
 * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
 *
 * @rcu_ptr is expanded once for the read and once for the assignment, so
 * it should be an expression without side effects.  The old value is held
 * in a temporary of type typeof(@ptr).
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __tmp;                                                                \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as
 * is the case in the context of the RCU callback that is freeing up
 * the data, or after a synchronize_rcu() returns.  This can be useful
 * when tearing down multi-linked structures after a grace period
 * has elapsed.
 *
 * Implemented as __rcu_access_pointer() with the __rcu address space.
 */
#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included: the condition actually tested is
 * (@c) || rcu_read_lock_held().
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().  The
 * condition actually tested is (@c) || rcu_read_lock_bh_held().
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().  The
 * condition actually tested is (@c) || rcu_read_lock_sched_held().
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
                                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks cause tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held(), so the condition passed down is the constant 1.
 * The sparse __rcu annotation check is still performed.
 */
#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 *
 * Unlike rcu_dereference_check(), @c is handed to
 * __rcu_dereference_protected() as-is: no rcu_read_lock_held() alternative
 * is OR-ed into it.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected((p), (c), __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * This is a simple wrapper around rcu_dereference_check() that passes 0
 * as the extra condition, so only the rcu_read_lock_held() term that
 * rcu_dereference_check() adds can satisfy the check.
 */
#define rcu_dereference(p) rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_bh_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_sched_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  The macro expands to its parenthesized argument and
 * nothing else.  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 */
#define rcu_pointer_handoff(p) (p)

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /*
         * Sequence: __rcu_read_lock(), then the __acquire() and
         * rcu_lock_acquire() annotations, then the idle check.
         * NOTE(review): presumably order-sensitive -- confirm before
         * rearranging any of these statements.
         */
        __rcu_read_lock();
        __acquire(RCU);
        rcu_lock_acquire(&rcu_lock_map);
        /* Complain if rcu_is_watching() reports false at this point. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In most situations, rcu_read_unlock() is immune from deadlock.
 * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
 * is responsible for deboosting, which it does via rt_mutex_unlock().
 * Unfortunately, this function acquires the scheduler's runqueue and
 * priority-inheritance spinlocks.  This means that deadlock could result
 * if the caller of rcu_read_unlock() already holds one of these locks or
 * any lock that is ever acquired while holding them.
 *
 * That said, RCU readers are never priority boosted unless they were
 * preempted.  Therefore, one way to avoid deadlock is to make sure
 * that preemption never happens within any RCU read-side critical
 * section whose outermost rcu_read_unlock() is called with one of
 * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
 * a number of ways, for example, by invoking preempt_disable() before
 * the critical section's outermost rcu_read_lock().
 *
 * Given that the set of locks acquired by rt_mutex_unlock() might change
 * at any time, a somewhat more future-proofed approach is to make sure
 * that preemption never happens within any RCU read-side critical
 * section whose outermost rcu_read_unlock() is called with irqs disabled.
 * This approach relies on the fact that rt_mutex_unlock() currently only
 * acquires irq-disabled locks.
 *
 * The second of these two approaches is best in most situations,
 * however, the first approach can also be useful, at least to those
 * developers willing to keep abreast of the set of locks acquired by
 * rt_mutex_unlock().
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        /* The idle check runs first, before anything is released. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        __release(RCU);
        __rcu_read_unlock();
        /*
         * rcu_lock_release() comes after __rcu_read_unlock(), unlike the
         * _bh and _sched variants where it precedes the final enable.
         */
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is the equivalent of rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as
 * an RCU read-side critical section.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* local_bh_disable() comes first; the annotations follow it. */
        local_bh_disable();
        __acquire(RCU_BH);
        rcu_lock_acquire(&rcu_bh_lock_map);
        /* Complain if rcu_is_watching() reports false at this point. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        /*
         * Reverse of rcu_read_lock_bh(): idle check, then the release
         * annotations, with local_bh_enable() as the very last step.
         */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        rcu_lock_release(&rcu_bh_lock_map);
        __release(RCU_BH);
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
 *
 * This is the equivalent of rcu_read_lock(), but disables preemption.
 * Read-side critical sections can also be introduced by anything else
 * that disables preemption, including local_irq_disable() and friends.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* preempt_disable() comes first; the annotations follow it. */
        preempt_disable();
        __acquire(RCU_SCHED);
        rcu_lock_acquire(&rcu_sched_lock_map);
        /* Complain if rcu_is_watching() reports false at this point. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 * Compared with rcu_read_lock_sched(), this calls
 * preempt_disable_notrace() and leaves out the rcu_lock_acquire() and
 * RCU_LOCKDEP_WARN() steps.
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        /*
         * Reverse of rcu_read_lock_sched(): idle check, then the release
         * annotations, with preempt_enable() as the very last step.
         */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        rcu_lock_release(&rcu_sched_lock_map);
        __release(RCU_SCHED);
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 * Compared with rcu_read_unlock_sched(), this leaves out the
 * RCU_LOCKDEP_WARN() and rcu_lock_release() steps and calls
 * preempt_enable_notrace().
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  That is, the
 * structures will look OK in crash dumps, but any concurrent RCU readers
 * might see pre-initialized values of the referenced data structure.  So
 * please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
        do { \
                rcu_check_sparse(p, __rcu); \
                WRITE_ONCE(p, RCU_INITIALIZER(v)); \
        } while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The pointer to be initialized.  It is used as a designator (".p ="),
 *     so it must be the name of the structure field.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 */
#define RCU_POINTER_INITIALIZER(p, v) \
                .p = RCU_INITIALIZER(v)

/*
 * Does the specified offset indicate that the corresponding rcu_head
 * structure can be handled by kvfree_rcu()?  True for offsets strictly
 * below 4096.
 */
#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)

/*
 * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
 *
 * Fails the build when @offset does not satisfy __is_kvfree_rcu_offset(),
 * then calls kvfree_call_rcu() with @head and with @offset cast to
 * rcu_callback_t, i.e. the offset travels in the slot where a callback
 * function pointer would otherwise go.
 */
#define __kvfree_rcu(head, offset) \
        do { \
                BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
                kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
        } while (0)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr:        pointer to kfree
 * @rhf:        the name of the struct rcu_head within the type of @ptr.
 *
 * Many RCU callback functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue.  Rather than encoding a
 * function address in the embedded rcu_head structure, kfree_rcu() instead
 * encodes the offset of the rcu_head structure within the base structure.
 * Because the functions are not allowed in the low-order 4096 bytes of
 * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in __kvfree_rcu(). If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * Note that the allowable offset might decrease in the future, for example,
 * to allow something like kmem_cache_free_rcu().
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 */
#define kfree_rcu(ptr, rhf)                                                \
do {                                                                        \
        typeof (ptr) ___p = (ptr);                                        \
                                                                        \
        if (___p)                                                        \
                __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
} while (0)

/**
 * kvfree_rcu() - kvfree an object after a grace period.
 *
 * This macro consists of one or two arguments and it is
 * based on whether an object is head-less or not. If it
 * has a head then a semantic stays the same as it used
 * to be before:
 *
 *     kvfree_rcu(ptr, rhf);
 *
 * where @ptr is a pointer to kvfree(), @rhf is the name
 * of the rcu_head structure within the type of @ptr.
 *
 * When it comes to head-less variant, only one argument
 * is passed and that is just a pointer which has to be
 * freed after a grace period. Therefore the semantic is
 *
 *     kvfree_rcu(ptr);
 *
 * where @ptr is a pointer to kvfree().
 *
 * Please note, head-less way of freeing is permitted to
 * use from a context that has to follow might_sleep()
 * annotation. Otherwise, please switch and embed the
 * rcu_head structure within the type of @ptr.
 */
#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__,                \
        kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)

/* Explicitly named spellings of the one-argument (head-less) form. */
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr)

/*
 * Argument-count dispatch for kvfree_rcu(): KVFREE_GET_MACRO() yields its
 * third argument.  With two caller arguments that is kvfree_rcu_arg_2,
 * with one caller argument it is kvfree_rcu_arg_1.
 */
#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
/*
 * Head-less form: a NULL @ptr is a no-op; otherwise kvfree_call_rcu() is
 * called with a NULL head and with the pointer itself cast to
 * rcu_callback_t.
 */
#define kvfree_rcu_arg_1(ptr)                                        \
do {                                                                \
        typeof(ptr) ___p = (ptr);                                \
                                                                \
        if (___p)                                                \
                kvfree_call_rcu(NULL, (rcu_callback_t) (___p));        \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * Expands to smp_mb() when CONFIG_ARCH_WEAK_RELEASE_ACQUIRE is defined
 * and to an empty statement otherwise.
 */
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Returns @true if the @rhp has been passed to call_rcu() with @f,
 * and @false otherwise.  Emits a warning in any other case, including
 * the case where @rhp has already been invoked after a grace period.
 * Calls to this function must not race with callback invocation.  One way
 * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
 * in an RCU read-side critical section that includes a read-side fetch
 * of the pointer to the structure containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        const rcu_callback_t seen = READ_ONCE(rhp->func);
        const bool queued = (seen == f);

        /*
         * A mismatch is only expected while ->func still holds the
         * all-ones marker written by rcu_head_init(); warn otherwise.
         */
        if (!queued)
                WARN_ON_ONCE(seen != (rcu_callback_t)~0L);
        return queued;
}

/* kernel/ksysfs.c definitions */
extern int rcu_expedited;
extern int rcu_normal;

DEFINE_LOCK_GUARD_0(rcu, rcu_read_lock(), rcu_read_unlock())

#endif /* __LINUX_RCUPDATE_H */


































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NFS internal definitions
 */

#include "nfs4_fs.h"
#include <linux/fs_context.h>
#include <linux/security.h>
#include <linux/crc32.h>
#include <linux/sunrpc/addr.h>
#include <linux/nfs_page.h>
#include <linux/wait_bit.h>

#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)

extern const struct export_operations nfs_export_ops;

struct nfs_string;
struct nfs_pageio_descriptor;

static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
{
        if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
                fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
}

/*
 * Return 1 when @fattr has NFS_ATTR_FATTR_MOUNTED_ON_FILEID set together
 * with at least one of NFS_ATTR_FATTR_MOUNTPOINT or
 * NFS_ATTR_FATTR_V4_REFERRAL; return 0 otherwise.
 */
static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
{
        /* No mounted-on fileid flag: nothing to use. */
        if (!(fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID))
                return 0;
        /* Needs the mountpoint flag or the v4 referral flag as well. */
        if (fattr->valid & (NFS_ATTR_FATTR_MOUNTPOINT | NFS_ATTR_FATTR_V4_REFERRAL))
                return 1;
        return 0;
}

/*
 * True only when the superblock of @dentry has NFS_MOUNT_SOFTREVAL set,
 * @dentry is positive, and the file handle of its inode has a non-zero
 * size.  The checks run in that order.
 */
static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
{
        if (NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)
                return d_is_positive(dentry) &&
                       NFS_FH(d_inode(dentry))->size != 0;
        return false;
}

/*
 * Build an fmode_t from @flags: the FMODE_EXEC bit is carried over, and
 * the O_ACCMODE bits select FMODE_READ, FMODE_WRITE, or both.
 */
static inline fmode_t flags_to_mode(int flags)
{
        fmode_t mode = (__force fmode_t)flags & FMODE_EXEC;

        switch (flags & O_ACCMODE) {
        case O_RDONLY:
                mode |= FMODE_READ;
                break;
        case O_WRONLY:
                mode |= FMODE_WRITE;
                break;
        default:
                /* Any other access mode gets both read and write. */
                mode |= FMODE_READ | FMODE_WRITE;
                break;
        }
        return mode;
}

/*
 * Note: RFC 1813 doesn't limit the number of auth flavors that
 * a server can return, so make something up.
 */
#define NFS_MAX_SECFLAVORS        (12)

/*
 * Value used if the user did not specify a port value.
 */
#define NFS_UNSPEC_PORT                (-1)

#define NFS_UNSPEC_RETRANS        (UINT_MAX)
#define NFS_UNSPEC_TIMEO        (UINT_MAX)

/*
 * Maximum number of pages that readdir can use for creating
 * a vmapped array of pages.
 */
#define NFS_MAX_READDIR_PAGES 8

/*
 * Parameter bundle describing an NFS client.  Passed by const pointer to
 * nfs_alloc_client(), nfs_create_rpc_client() and nfs_get_client(), which
 * are declared further down in this header.
 */
struct nfs_client_initdata {
        unsigned long init_flags;
        const char *hostname;                        /* Hostname of the server */
        const struct sockaddr *addr;                /* Address of the server */
        const char *nodename;                        /* Hostname of the client */
        const char *ip_addr;                        /* IP address of the client */
        size_t addrlen;                                /* presumably the length of @addr -- confirm */
        struct nfs_subversion *nfs_mod;
        int proto;
        u32 minorversion;
        unsigned int nconnect;
        struct net *net;
        const struct rpc_timeout *timeparms;
        const struct cred *cred;
};

/*
 * In-kernel mount arguments
 *
 * An instance is reached from a struct fs_context through its fs_private
 * pointer; see nfs_fc2context().
 */
struct nfs_fs_context {
        bool                        internal;
        bool                        skip_reconfig_option_check;
        bool                        need_mount;
        bool                        sloppy;
        unsigned int                flags;                /* NFS{,4}_MOUNT_* flags */
        unsigned int                rsize, wsize;
        unsigned int                timeo, retrans;
        unsigned int                acregmin, acregmax;
        unsigned int                acdirmin, acdirmax;
        unsigned int                namlen;
        unsigned int                options;
        unsigned int                bsize;
        struct nfs_auth_info        auth_info;
        rpc_authflavor_t        selected_flavor;
        char                        *client_address;
        unsigned int                version;
        unsigned int                minorversion;
        char                        *fscache_uniq;
        unsigned short                protofamily;
        unsigned short                mountfamily;

        /* Mount server: address, host name, version, port and protocol. */
        struct {
                union {
                        struct sockaddr        address;
                        struct sockaddr_storage        _address;
                };
                size_t                        addrlen;
                char                        *hostname;
                u32                        version;
                int                        port;
                unsigned short                protocol;
        } mount_server;

        /* NFS server: address, host name, export path, port and protocol. */
        struct {
                union {
                        struct sockaddr        address;
                        struct sockaddr_storage        _address;
                };
                size_t                        addrlen;
                char                        *hostname;
                char                        *export_path;
                int                        port;
                unsigned short                protocol;
                unsigned short                nconnect;
                unsigned short                export_path_len;
        } nfs_server;

        struct nfs_fh                *mntfh;
        struct nfs_server        *server;
        struct nfs_subversion        *nfs_mod;

        /* Information for a cloned mount. */
        struct nfs_clone_mount {
                struct super_block        *sb;
                struct dentry                *dentry;
                struct nfs_fattr        *fattr;
                unsigned int                inherited_bsize;
        } clone_data;
};

/*
 * Logging helpers for code holding a struct fs_context.
 *
 * Each macro tests (fc)->log.log.  When it is set, the message goes to
 * errorf(), invalf() or warnf() with @fc.  Otherwise the message, with a
 * "\n" appended, goes to dprintk(), or to dfprintk() with facility @fac
 * for the nfs_f* variants.  In that fallback branch the nfs_invalf() and
 * nfs_finvalf() macros evaluate to -EINVAL.
 */
#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ?                \
        errorf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dprintk(fmt "\n", ## __VA_ARGS__); }))

#define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ?                \
        errorf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))

#define nfs_invalf(fc, fmt, ...) ((fc)->log.log ?                \
        invalf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dprintk(fmt "\n", ## __VA_ARGS__);  -EINVAL; }))

#define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ?                \
        invalf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__);  -EINVAL; }))

#define nfs_warnf(fc, fmt, ...) ((fc)->log.log ?                \
        warnf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dprintk(fmt "\n", ## __VA_ARGS__); }))

#define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ?                \
        warnf(fc, fmt, ## __VA_ARGS__) :                        \
        ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))

/* Return the NFS mount context stored in @fc->fs_private. */
static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc)
{
        struct nfs_fs_context *ctx = fc->fs_private;

        return ctx;
}

/* mount_clnt.c */

/*
 * Argument bundle taken by nfs_mount() and nfs_umount(), both declared
 * right after this structure.
 */
struct nfs_mount_request {
        struct sockaddr                *sap;
        size_t                        salen;                /* presumably the length of @sap -- confirm */
        char                        *hostname;
        char                        *dirpath;
        u32                        version;
        unsigned short                protocol;
        struct nfs_fh                *fh;
        int                        noresvport;
        unsigned int                *auth_flav_len;
        rpc_authflavor_t        *auth_flavs;
        struct net                *net;
};

extern int nfs_mount(struct nfs_mount_request *info);
extern void nfs_umount(const struct nfs_mount_request *info);

/* client.c */
extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern void nfs_clients_exit(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
                rpc_authflavor_t);
struct nfs_server *nfs_alloc_server(void);
void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *);

extern void nfs_put_client(struct nfs_client *);
extern void nfs_free_client(struct nfs_client *);
extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
                                struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct fs_context *);
extern void nfs_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
                                        struct sockaddr *sap, size_t salen,
                                        struct net *net);
extern void nfs_free_server(struct nfs_server *server);
extern struct nfs_server *nfs_clone_server(struct nfs_server *,
                                           struct nfs_fh *,
                                           struct nfs_fattr *,
                                           rpc_authflavor_t);
extern bool nfs_client_init_is_complete(const struct nfs_client *clp);
extern int nfs_client_init_status(const struct nfs_client *clp);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
                                             const struct sockaddr *ds_addr,
                                             int ds_addrlen, int ds_proto,
                                             unsigned int ds_timeo,
                                             unsigned int ds_retrans,
                                             u32 minor_version);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
                                                struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
                        const struct sockaddr *ds_addr, int ds_addrlen,
                        int ds_proto, unsigned int ds_timeo,
                        unsigned int ds_retrans);
#ifdef CONFIG_PROC_FS
/* Prototypes only; the definitions are not part of this header. */
int __init nfs_fs_proc_init(void);
void nfs_fs_proc_exit(void);
int nfs_fs_proc_net_init(struct net *net);
void nfs_fs_proc_net_exit(struct net *net);
#else
/*
 * Without CONFIG_PROC_FS the four hooks become inline no-ops; the two
 * init hooks report success by returning 0.
 */
static inline int nfs_fs_proc_init(void)
{
        return 0;
}

static inline void nfs_fs_proc_exit(void)
{
}

static inline int nfs_fs_proc_net_init(struct net *net)
{
        return 0;
}

static inline void nfs_fs_proc_net_exit(struct net *net)
{
}
#endif /* CONFIG_PROC_FS */

/* callback_xdr.c */
extern const struct svc_version nfs4_callback_version1;
extern const struct svc_version nfs4_callback_version4;

/* fs_context.c */
extern struct file_system_type nfs_fs_type;

/* pagelist.c */
extern int __init nfs_init_nfspagecache(void);
extern void nfs_destroy_nfspagecache(void);
extern int __init nfs_init_readpagecache(void);
extern void nfs_destroy_readpagecache(void);
extern int __init nfs_init_writepagecache(void);
extern void nfs_destroy_writepagecache(void);

extern int __init nfs_init_directcache(void);
extern void nfs_destroy_directcache(void);
extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
                              struct nfs_pgio_header *hdr,
                              void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);

extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
void nfs_pgio_header_free(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
                      const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
                      const struct rpc_call_ops *call_ops, int how, int flags);
void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);

/*
 * Two open contexts match when cred_fscmp() reports their credentials as
 * equal (returns 0) and both point at the same state object.
 * The state pointers are only compared if the credentials matched.
 */
static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
                const struct nfs_open_context *ctx2)
{
        if (cred_fscmp(ctx1->cred, ctx2->cred) != 0)
                return false;

        return ctx1->state == ctx2->state;
}

/* nfs2xdr.c */
extern const struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
                                struct nfs_entry *, bool);

/* nfs3xdr.c */
extern const struct rpc_procinfo nfs3_procedures[];
extern int nfs3_decode_dirent(struct xdr_stream *,
                                struct nfs_entry *, bool);

/* nfs4xdr.c */
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs4_decode_dirent(struct xdr_stream *,
                                struct nfs_entry *, bool);
#endif
#ifdef CONFIG_NFS_V4_1
extern const u32 nfs41_maxread_overhead;
extern const u32 nfs41_maxwrite_overhead;
extern const u32 nfs41_maxgetdevinfo_overhead;
#endif

/* nfs4proc.c */
#if IS_ENABLED(CONFIG_NFS_V4)
extern const struct rpc_procinfo nfs4_procedures[];
#endif

#ifdef CONFIG_NFS_V4_SECURITY_LABEL
extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
/*
 * Copy the lfs, pi, len fields and the label bytes from @src into @dst.
 *
 * Returns @dst on success. Returns NULL, leaving @dst untouched, when either
 * pointer is NULL or when src->len is larger than NFS4_MAXLABELLEN.
 *
 * NOTE(review): assumes dst->label already points at a buffer that can hold
 * src->len bytes -- confirm against nfs4_label_alloc().
 */
static inline struct nfs4_label *
nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
{
        if (dst == NULL || src == NULL || src->len > NFS4_MAXLABELLEN)
                return NULL;

        dst->lfs = src->lfs;
        dst->pi = src->pi;
        dst->len = src->len;
        memcpy(dst->label, src->label, src->len);

        return dst;
}
/*
 * Release a label: first the buffer it points at, then the structure itself.
 * A NULL @label is accepted and ignored.
 */
static inline void nfs4_label_free(struct nfs4_label *label)
{
        if (!label)
                return;

        kfree(label->label);
        kfree(label);
}

/*
 * Flag the cached security label of @nfsi as invalid, but only when
 * nfs_server_capable() reports NFS_CAP_SECURITY_LABEL for the inode.
 * NOTE(review): the "_locked" suffix suggests the caller already holds the
 * relevant inode lock -- confirm at the call sites.
 */
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
        if (!nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
                return;

        nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
}
#else
/*
 * Stubs used when CONFIG_NFS_V4_SECURITY_LABEL is not set: allocation and
 * copy always yield NULL, free and cache invalidation do nothing.
 */
static inline struct nfs4_label *
nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
{
        return NULL;
}

static inline void nfs4_label_free(void *label)
{
}

static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
}

static inline struct nfs4_label *
nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
{
        return NULL;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */

/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
                           const struct nfs_client_initdata *);

/* dir.c */
extern void nfs_advise_use_readdirplus(struct inode *dir);
extern void nfs_force_use_readdirplus(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc);
struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
int nfs_create(struct inode *, struct dentry *, umode_t, bool);
int nfs_mkdir(struct inode *, struct dentry *, umode_t);
int nfs_rmdir(struct inode *, struct dentry *);
int nfs_unlink(struct inode *, struct dentry *);
int nfs_symlink(struct inode *, struct dentry *, const char *);
int nfs_link(struct dentry *, struct inode *, struct dentry *);
int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
int nfs_rename(struct inode *, struct dentry *,
               struct inode *, struct dentry *, unsigned int);

/* file.c */
int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
int nfs_file_mmap(struct file *, struct vm_area_struct *);
ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);

/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
extern struct inode *nfs_alloc_inode(struct super_block *sb);
extern void nfs_free_inode(struct inode *);
extern int nfs_write_inode(struct inode *, struct writeback_control *);
extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
extern void nfs_zap_acl_cache(struct inode *inode);
extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);

/* super.c */
extern const struct super_operations nfs_sops;
bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
int nfs_try_get_tree(struct fs_context *);
int nfs_get_tree_common(struct fs_context *);
void nfs_kill_super(struct super_block *);

extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
extern int nfs_client_for_each_server(struct nfs_client *clp,
                                      int (*fn)(struct nfs_server *, void *),
                                      void *data);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
extern void nfs_start_io_write(struct inode *inode);
extern void nfs_end_io_write(struct inode *inode);
extern void nfs_start_io_direct(struct inode *inode);
extern void nfs_end_io_direct(struct inode *inode);

/*
 * Report whether the inode is in buffered-I/O mode, i.e. whether the
 * NFS_INO_ODIRECT bit in nfsi->flags is clear.
 */
static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
{
        return !test_bit(NFS_INO_ODIRECT, &nfsi->flags);
}

/* namespace.c */
/* Value for the 'flags' argument of nfs_path(); nfs_devname() passes it. */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
                      char *buffer, ssize_t buflen, unsigned flags);
extern struct vfsmount *nfs_d_automount(struct path *path);
int nfs_submount(struct fs_context *, struct nfs_server *);
int nfs_do_submount(struct fs_context *);

/* getroot.c */
extern int nfs_get_root(struct super_block *s, struct fs_context *fc);
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
#endif

struct nfs_pgio_completion_ops;
/* read.c */
extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
                        struct inode *inode, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);

/* super.c */
void nfs_umount_begin(struct super_block *);
int  nfs_statfs(struct dentry *, struct kstatfs *);
int  nfs_show_options(struct seq_file *, struct dentry *);
int  nfs_show_devname(struct seq_file *, struct dentry *);
int  nfs_show_path(struct seq_file *, struct dentry *);
int  nfs_show_stats(struct seq_file *, struct dentry *);
int  nfs_reconfigure(struct fs_context *);

/* write.c */
extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
                        struct inode *inode, int ioflags, bool force_mds,
                        const struct nfs_pgio_completion_ops *compl_ops);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_commit_free(struct nfs_commit_data *p);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
                               struct nfs_commit_data *data,
                               const struct nfs_rpc_ops *nfs_ops,
                               const struct rpc_call_ops *call_ops,
                               int how, int flags);
extern void nfs_init_commit(struct nfs_commit_data *data,
                            struct list_head *head,
                            struct pnfs_layout_segment *lseg,
                            struct nfs_commit_info *cinfo);
int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                         struct nfs_commit_info *cinfo, int max);
unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
int nfs_scan_commit(struct inode *inode, struct list_head *dst,
                    struct nfs_commit_info *cinfo);
void nfs_mark_request_commit(struct nfs_page *req,
                             struct pnfs_layout_segment *lseg,
                             struct nfs_commit_info *cinfo,
                             u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
                      struct pnfs_layout_segment *lseg,
                      struct nfs_commit_info *cinfo,
                      u32 ds_commit_idx);
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req,
                                 struct nfs_commit_info *cinfo);
void nfs_request_add_commit_list_locked(struct nfs_page *req,
                struct list_head *dst,
                struct nfs_commit_info *cinfo);
void nfs_request_remove_commit_list(struct nfs_page *req,
                                    struct nfs_commit_info *cinfo);
void nfs_init_cinfo(struct nfs_commit_info *cinfo,
                    struct inode *inode,
                    struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);

int nfs_filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);

#ifdef CONFIG_NFS_V4_1
/*
 * Set direct_verf.committed to NFS_INVALID_STABLE_HOW in each of the first
 * @nbuckets entries of @buckets. Nothing is touched when @nbuckets is 0.
 */
static inline void
pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
                unsigned int nbuckets)
{
        unsigned int idx = 0;

        while (idx < nbuckets) {
                buckets[idx].direct_verf.committed = NFS_INVALID_STABLE_HOW;
                idx++;
        }
}
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
        struct pnfs_commit_array *array;

        rcu_read_lock();
        list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
                pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
                                array->nbuckets);
        rcu_read_unlock();
}
#else
static inline void
nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
        /* Nothing to clear when CONFIG_NFS_V4_1 is not enabled. */
}
#endif

#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
                struct page *, struct page *, enum migrate_mode);
#endif

/*
 * Byte-wise comparison of two write verifiers.
 * Returns the memcmp() result over sizeof(v1->data) bytes: 0 when the
 * verifiers are identical, non-zero otherwise.
 */
static inline int
nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
                const struct nfs_write_verifier *v2)
{
        const void *lhs = v1->data;
        const void *rhs = v2->data;

        return memcmp(lhs, rhs, sizeof(v1->data));
}

/*
 * Check whether @req's write verifier matches @verf.
 * A match requires verf->committed to be greater than NFS_UNSTABLE and the
 * two verifiers to compare equal; the comparison is skipped otherwise.
 */
static inline bool
nfs_write_match_verf(const struct nfs_writeverf *verf,
                struct nfs_page *req)
{
        if (verf->committed <= NFS_UNSTABLE)
                return false;

        return nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier) == 0;
}

/*
 * Pick the allocation mask for NFS I/O paths.
 *
 * Starts from current_gfp_context(GFP_KERNEL). For a workqueue worker
 * (PF_WQ_WORKER set in current->flags) whose mask is still exactly
 * GFP_KERNEL, __GFP_NORETRY and __GFP_NOWARN are added. In every other
 * case the context mask is returned unchanged.
 */
static inline gfp_t nfs_io_gfp_mask(void)
{
        gfp_t mask = current_gfp_context(GFP_KERNEL);

        if (!(current->flags & PF_WQ_WORKER))
                return mask;

        /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */
        if (mask != GFP_KERNEL)
                return mask;

        return mask | __GFP_NORETRY | __GFP_NOWARN;
}

/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
                 struct dentry *old_dentry, struct dentry *new_dentry,
                 void (*complete)(struct rpc_task *, struct nfs_renamedata *));
extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);

/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq);
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);

/* nfs4proc.c */
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
                                struct nfs_client **result,
                                const struct cred *cred);
extern int nfs41_walk_client_list(struct nfs_client *clp,
                                struct nfs_client **result,
                                const struct cred *cred);
extern void nfs4_test_session_trunk(struct rpc_clnt *clnt,
                                struct rpc_xprt *xprt,
                                void *data);

/*
 * Take a reference on @inode together with an "active" reference on its
 * superblock.
 *
 * Returns @inode when both succeed. Returns NULL when the inode has no
 * superblock, when nfs_sb_active() fails, or when igrab() fails; in the last
 * case the superblock reference is dropped again via nfs_sb_deactive().
 */
static inline struct inode *nfs_igrab_and_active(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (!sb || !nfs_sb_active(sb))
                return NULL;

        if (igrab(inode))
                return inode;

        nfs_sb_deactive(sb);
        return NULL;
}

/*
 * Drop the inode reference with iput() and then release the superblock with
 * nfs_sb_deactive(). A NULL @inode is ignored.
 */
static inline void nfs_iput_and_deactive(struct inode *inode)
{
        struct super_block *sb;

        if (inode == NULL)
                return;

        /* Read i_sb before iput(): the inode must not be touched afterwards. */
        sb = inode->i_sb;
        iput(inode);
        nfs_sb_deactive(sb);
}

/*
 * Determine the device name for @dentry as a string.
 *
 * Thin wrapper around nfs_path() called with NFS_PATH_CANONICAL. @buffer and
 * @buflen are forwarded unchanged, and whatever nfs_path() stores through its
 * first argument is discarded. The return value is nfs_path()'s return value.
 */
static inline char *nfs_devname(struct dentry *dentry,
                                char *buffer, ssize_t buflen)
{
        char *ignored;

        return nfs_path(&ignored, dentry, buffer, buflen, NFS_PATH_CANONICAL);
}

/*
 * Determine the actual block size (and log2 thereof).
 *
 * @bsize is rounded down to the power of two given by its highest set bit
 * among bits 31..1 (bit 0 is used when none of those is set). If @nrbitsp is
 * non-NULL the chosen bit number is stored there.
 *
 * When @bsize is already a power of two (or zero) and @nrbitsp is NULL,
 * @bsize is returned untouched.
 */
static inline
unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
{
        unsigned char shift = 31;

        /* Already a power of two and nobody wants the log2: nothing to do. */
        if (!(bsize & (bsize - 1)) && !nrbitsp)
                return bsize;

        /* Scan downwards for the highest set bit, stopping at bit 0. */
        while (shift != 0 && (bsize & (1UL << shift)) == 0)
                shift--;

        if (nrbitsp)
                *nrbitsp = shift;

        return 1UL << shift;
}

/*
 * Calculate the number of 512-byte blocks used by @tsize bytes, rounding up,
 * and clamp the result to ULONG_MAX.
 *
 * The round-up is computed as "whole blocks, plus one if there is a
 * remainder" instead of "(tsize + 511) >> 9". The addition in the latter
 * wraps around for sizes within 511 bytes of the u64 maximum and would
 * report a block count close to zero. Both forms give the same result for
 * every size that does not wrap.
 */
static inline blkcnt_t nfs_calc_block_size(u64 tsize)
{
        blkcnt_t used = (tsize >> 9) + ((tsize & 511) != 0);

        return (used > ULONG_MAX) ? ULONG_MAX : used;
}

/*
 * Compute and set NFS server blocksize.
 *
 * Values below NFS_MIN_FILE_IO_SIZE are replaced by NFS_DEF_FILE_IO_SIZE
 * (the default, not the minimum). Values at or above NFS_MAX_FILE_IO_SIZE
 * are capped to it. The result is then passed through nfs_block_bits(),
 * which also fills in *@nrbitsp when that pointer is non-NULL.
 */
static inline
unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
{
        unsigned long clamped = bsize;

        if (clamped < NFS_MIN_FILE_IO_SIZE)
                clamped = NFS_DEF_FILE_IO_SIZE;
        else if (clamped >= NFS_MAX_FILE_IO_SIZE)
                clamped = NFS_MAX_FILE_IO_SIZE;

        return nfs_block_bits(clamped, nrbitsp);
}

/*
 * Determine the maximum file size for a superblock.
 *
 * Stores @maxfilesize (cast to loff_t) in sb->s_maxbytes. If the stored
 * value is not positive or exceeds MAX_LFS_FILESIZE, it is replaced by
 * MAX_LFS_FILESIZE.
 */
static inline
void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
{
        sb->s_maxbytes = (loff_t)maxfilesize;

        if (sb->s_maxbytes > 0 && sb->s_maxbytes <= MAX_LFS_FILESIZE)
                return;

        sb->s_maxbytes = MAX_LFS_FILESIZE;
}

/*
 * Record the page as unstable (an extra writeback period) and mark its
 * inode as dirty.
 *
 * Nothing is done when cinfo->dreq is set. Otherwise the page's node
 * NR_WRITEBACK counter and the backing device's WB_WRITEBACK statistic are
 * incremented, and the inode is marked dirty with I_DIRTY_DATASYNC.
 */
static inline
void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
{
        struct inode *inode;

        if (cinfo->dreq)
                return;

        inode = page_file_mapping(page)->host;

        /*
         * The page is really still under writeback; the writeback is simply
         * happening on the server now.
         */
        inc_node_page_state(page, NR_WRITEBACK);
        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}

/*
 * Determine the number of bytes of data the page contains.
 *
 * Returns PAGE_SIZE for a page lying wholly before the last byte of the
 * file, the partial byte count for the page holding the last byte, and 0
 * for a page past the end of the file or when the file size is not positive.
 */
static inline
unsigned int nfs_page_length(struct page *page)
{
        loff_t i_size = i_size_read(page_file_mapping(page)->host);
        pgoff_t index;
        pgoff_t end_index;

        if (i_size <= 0)
                return 0;

        index = page_index(page);
        end_index = (i_size - 1) >> PAGE_SHIFT;

        if (index < end_index)
                return PAGE_SIZE;
        if (index > end_index)
                return 0;

        /* Last page: offset of the final byte within the page, plus one. */
        return ((i_size - 1) & ~PAGE_MASK) + 1;
}

/*
 * Convert a umode to a dirent->d_type.
 *
 * The result is the 4-bit field held in bits 12..15 of @mode
 * (presumably the file-type bits of the mode word).
 */
static inline
unsigned char nfs_umode_to_dtype(umode_t mode)
{
        return (mode >> 12) & 0xf;
}

/*
 * Determine the number of pages in an array of length 'len' and
 * with a base offset of 'base'.
 *
 * Computed as (base + len) rounded up to a whole number of pages.
 */
static inline
unsigned int nfs_page_array_len(unsigned int base, size_t len)
{
        unsigned long span = (unsigned long)len + (unsigned long)base;

        return (span + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/*
 * Convert a struct timespec64 into a 64-bit change attribute
 *
 * This does approximately the same thing as timespec64_to_ns(),
 * but for calculation efficiency, we multiply the seconds by
 * 1024*1024*1024.
 */
static inline
u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
{
        return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
}

/*
 * Hash a stateid: run crc32_le() with seed 0xFFFFFFFF over the first
 * NFS4_STATEID_OTHER_SIZE bytes of stateid->other and return the bitwise
 * complement of the result.
 */
static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
{
        u32 crc = crc32_le(0xFFFFFFFF, &stateid->other[0],
                           NFS4_STATEID_OTHER_SIZE);

        return ~crc;
}

/*
 * Return true when @err (a negative errno value) is one of the errors this
 * file treats as fatal, false for every other value including 0.
 */
static inline bool nfs_error_is_fatal(int err)
{
        bool fatal = false;

        switch (err) {
        /* interrupted or timed out */
        case -ERESTARTSYS:
        case -EINTR:
        case -ETIMEDOUT:
        /* access and storage limits */
        case -EACCES:
        case -EDQUOT:
        case -EFBIG:
        case -ENOSPC:
        case -EROFS:
        /* everything else on the list */
        case -EIO:
        case -ESTALE:
        case -E2BIG:
        case -ENOMEM:
                fatal = true;
                break;
        default:
                break;
        }

        return fatal;
}

/*
 * Like nfs_error_is_fatal(), except that 0, -ERESTARTSYS, -EINTR and
 * -ENOMEM are never reported as fatal here.
 */
static inline bool nfs_error_is_fatal_on_server(int err)
{
        if (err == 0 || err == -ERESTARTSYS ||
            err == -EINTR || err == -ENOMEM)
                return false;

        return nfs_error_is_fatal(err);
}

/*
 * Select between a default port value and a user-specified port value.
 * If a zero value is set, then autobind will be used.
 *
 * When *@port is NFS_UNSPEC_PORT it is overwritten with @default_port.
 * The resulting *@port is then applied to @sap via rpc_set_port().
 */
static inline void nfs_set_port(struct sockaddr *sap, int *port,
                                const unsigned short default_port)
{
        const int requested = *port;

        if (requested == NFS_UNSPEC_PORT)
                *port = default_port;

        rpc_set_port(sap, *port);
}



















































































































































































































































































































































































































































































































































































































































































































    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_XFRM_H
#define _NET_XFRM_H

#include <linux/compiler.h>
#include <linux/xfrm.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/refcount.h>
#include <linux/sockptr.h>

#include <net/sock.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/flow.h>
#include <net/gro_cells.h>

#include <linux/interrupt.h>

#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif

#define XFRM_PROTO_ESP                50
#define XFRM_PROTO_AH                51
#define XFRM_PROTO_COMP                108
#define XFRM_PROTO_IPIP                4
#define XFRM_PROTO_IPV6                41
#define XFRM_PROTO_ROUTING        IPPROTO_ROUTING
#define XFRM_PROTO_DSTOPTS        IPPROTO_DSTOPTS

#define XFRM_ALIGN4(len)        (((len) + 3) & ~3)
#define XFRM_ALIGN8(len)        (((len) + 7) & ~7)
#define MODULE_ALIAS_XFRM_MODE(family, encap) \
        MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
#define MODULE_ALIAS_XFRM_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto))
#define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto))

#ifdef CONFIG_XFRM_STATISTICS
#define XFRM_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.xfrm_statistics, field)
#else
#define XFRM_INC_STATS(net, field)        ((void)(net))
#endif


/* Organization of SPD aka "XFRM rules"
   ------------------------------------

   Basic objects:
   - policy rule, struct xfrm_policy (=SPD entry)
   - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle)
   - instance of a transformer, struct xfrm_state (=SA)
   - template to clone xfrm_state, struct xfrm_tmpl

   SPD is plain linear list of xfrm_policy rules, ordered by priority.
   (To be compatible with existing pfkeyv2 implementations,
   many rules with priority of 0x7fffffff are allowed to exist and
   such rules are ordered in an unpredictable way, thanks to bsd folks.)

   Lookup is plain linear search until the first match with selector.

   If "action" is "block", then we prohibit the flow, otherwise:
   if "xfrms_nr" is zero, the flow passes untransformed. Otherwise,
   policy entry has list of up to XFRM_MAX_DEPTH transformations,
   described by templates xfrm_tmpl. Each template is resolved
   to a complete xfrm_state (see below) and we pack bundle of transformations
   to a dst_entry returned to requestor.

   dst -. xfrm  .-> xfrm_state #1
    |---. child .-> dst -. xfrm .-> xfrm_state #2
                     |---. child .-> dst -. xfrm .-> xfrm_state #3
                                      |---. child .-> NULL

   Bundles are cached at xfrm_policy struct (field ->bundles).


   Resolution of xfrm_tmpl
   -----------------------
   Template contains:
   1. ->mode                Mode: transport or tunnel
   2. ->id.proto        Protocol: AH/ESP/IPCOMP
   3. ->id.daddr        Remote tunnel endpoint, ignored for transport mode.
      Q: allow to resolve security gateway?
   4. ->id.spi          If not zero, static SPI.
   5. ->saddr                Local tunnel endpoint, ignored for transport mode.
   6. ->algos                List of allowed algos. Plain bitmask now.
      Q: ealgos, aalgos, calgos. What a mess...
   7. ->share                Sharing mode.
      Q: how to implement private sharing mode? To add struct sock* to
      flow id?

   Having this template we search through SAD searching for entries
   with appropriate mode/proto/algo, permitted by selector.
   If no appropriate entry found, it is requested from key manager.

   PROBLEMS:
   Q: How to find all the bundles referring to a physical path for
      PMTU discovery? Seems, dst should contain list of all parents...
      and enter to infinite locking hierarchy disaster.
      No! It is easier, we will not search for them, let them find us.
      We add genid to each dst plus pointer to genid of raw IP route,
      pmtu disc will update pmtu on raw IP route and increase its genid.
      dst_check() will see this for top level and trigger resyncing
      metrics. Plus, it will be made via sk->sk_dst_cache. Solved.
 */

/*
 * List node plus walker bookkeeping.  struct xfrm_state embeds one of
 * these as its "km" member.
 */
struct xfrm_state_walk {
        struct list_head        all;
        u8                        state;        /* presumably an XFRM_STATE_* value - TODO confirm */
        u8                        dying;
        u8                        proto;
        u32                        seq;
        struct xfrm_address_filter *filter;
};

/*
 * Offload direction values.  Numbering starts at 1, so 0 is not a named
 * direction.
 * NOTE(review): presumably stored in xfrm_dev_offload.dir - confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_IN = 1,
        XFRM_DEV_OFFLOAD_OUT,
};

/* Device offload data; struct xfrm_state embeds one as its "xso" member. */
struct xfrm_dev_offload {
        struct net_device        *dev;
        struct net_device        *real_dev;
        unsigned long                offload_handle;
        unsigned int                num_exthdrs;
        u8                        flags;
        /* 2-bit field; presumably XFRM_DEV_OFFLOAD_IN/OUT - TODO confirm */
        u8                        dir : 2;
};

/*
 * Compact mode descriptor.  struct xfrm_state carries three of these:
 * inner_mode, inner_mode_iaf and outer_mode.
 */
struct xfrm_mode {
        u8 encap;
        u8 family;
        u8 flags;        /* presumably XFRM_MODE_FLAG_* bits - TODO confirm */
};

/*
 * Flags for xfrm_mode.
 * NOTE(review): presumably bit values for xfrm_mode.flags - confirm.
 */
enum {
        XFRM_MODE_FLAG_TUNNEL = 1,
};

/*
 * Full description of state of transformer.
 *
 * One instance of a transformer, i.e. an SA in the terminology of the
 * overview comment at the top of this header.
 */
struct xfrm_state {
        /* Owning network namespace; read through xs_net(). */
        possible_net_t                xs_net;
        /* gclist and bydst share the same storage (anonymous union). */
        union {
                struct hlist_node        gclist;
                struct hlist_node        bydst;
        };
        struct hlist_node        bysrc;
        struct hlist_node        byspi;

        /*
         * Reference count; taken with xfrm_state_hold() and dropped with
         * xfrm_state_put() / __xfrm_state_put().
         */
        refcount_t                refcnt;
        spinlock_t                lock;

        struct xfrm_id                id;
        struct xfrm_selector        sel;
        struct xfrm_mark        mark;
        u32                        if_id;
        u32                        tfcpad;

        u32                        genid;

        /* Key manager bits */
        struct xfrm_state_walk        km;

        /* Parameters of this state. */
        struct {
                u32                reqid;
                u8                mode;
                u8                replay_window;
                u8                aalgo, ealgo, calgo;
                u8                flags;
                /* Compared against AF_INET/AF_INET6 by xfrm_ip2inner_mode(). */
                u16                family;
                xfrm_address_t        saddr;
                int                header_len;
                int                trailer_len;
                u32                extra_flags;
                struct xfrm_mark        smark;
        } props;

        struct xfrm_lifetime_cfg lft;

        /* Data for transformer */
        struct xfrm_algo_auth        *aalg;
        struct xfrm_algo        *ealg;
        struct xfrm_algo        *calg;
        struct xfrm_algo_aead        *aead;
        const char                *geniv;

        /* mapping change rate limiting */
        __be16 new_mapping_sport;
        u32 new_mapping;        /* seconds */
        u32 mapping_maxage;        /* seconds for input SA */

        /* Data for encapsulator */
        struct xfrm_encap_tmpl        *encap;
        struct sock __rcu        *encap_sk;

        /* Data for care-of address */
        xfrm_address_t        *coaddr;

        /* IPComp needs an IPIP tunnel for handling uncompressed packets */
        struct xfrm_state        *tunnel;

        /* If a tunnel, number of users + 1 */
        atomic_t                tunnel_users;

        /* State for replay detection */
        struct xfrm_replay_state replay;
        struct xfrm_replay_state_esn *replay_esn;

        /* Replay detection state at the time we sent the last notification */
        struct xfrm_replay_state preplay;
        struct xfrm_replay_state_esn *preplay_esn;

        /* The functions for replay detection. */
        const struct xfrm_replay *repl;

        /* internal flag that only holds state for delayed aevent at the
         * moment
         */
        u32                        xflags;

        /* Replay detection notification settings */
        u32                        replay_maxage;
        u32                        replay_maxdiff;

        /* Replay detection notification timer */
        struct timer_list        rtimer;

        /* Statistics */
        struct xfrm_stats        stats;

        struct xfrm_lifetime_cur curlft;
        struct hrtimer                mtimer;

        /* Device offload data. */
        struct xfrm_dev_offload xso;

        /* used to fix curlft->add_time when changing date */
        long                saved_tmo;

        /* Last used time */
        time64_t                lastused;

        struct page_frag xfrag;

        /* Reference to data common to all the instances of this
         * transformer. */
        const struct xfrm_type        *type;
        /*
         * xfrm_ip2inner_mode() returns &inner_mode when the inner protocol
         * matches props.family and &inner_mode_iaf otherwise.
         */
        struct xfrm_mode        inner_mode;
        struct xfrm_mode        inner_mode_iaf;
        struct xfrm_mode        outer_mode;

        const struct xfrm_type_offload        *type_offload;

        /* Security context */
        struct xfrm_sec_ctx        *security;

        /* Private data of this transformer, format is opaque,
         * interpreted by xfrm_type methods. */
        void                        *data;
};

/*
 * xs_net - fetch the struct net recorded in a state
 * @x: state to query
 *
 * Returns whatever read_pnet() yields for @x->xs_net.
 */
static inline struct net *xs_net(struct xfrm_state *x)
{
        struct net *owner = read_pnet(&x->xs_net);

        return owner;
}

/* xflags - make enum if more show up */
#define XFRM_TIME_DEFER        1
#define XFRM_SOFT_EXPIRE 2

/*
 * State values, numbered consecutively starting from XFRM_STATE_VOID == 0.
 * NOTE(review): presumably the values kept in xfrm_state.km.state - confirm.
 */
enum {
        XFRM_STATE_VOID,
        XFRM_STATE_ACQ,
        XFRM_STATE_VALID,
        XFRM_STATE_ERROR,
        XFRM_STATE_EXPIRED,
        XFRM_STATE_DEAD
};

/*
 * callback structure passed from either netlink or pfkey
 *
 * Handed (as a const pointer) to km_state_notify(), km_policy_notify() and
 * to the notify/notify_policy/is_alive hooks of struct xfrm_mgr.
 */
struct km_event {
        /*
         * All members overlay the same 32 bits.
         * NOTE(review): which member is meaningful presumably depends on
         * @event - confirm against the senders.
         */
        union {
                u32 hard;
                u32 proto;
                u32 byid;
                u32 aevent;
                u32 type;
        } data;

        u32        seq;
        u32        portid;
        u32        event;
        struct net *net;
};

/*
 * Table of replay-detection callbacks.  struct xfrm_state points at one of
 * these through its "repl" member.  Sequence numbers are passed as __be32.
 */
struct xfrm_replay {
        void        (*advance)(struct xfrm_state *x, __be32 net_seq);
        int        (*check)(struct xfrm_state *x,
                         struct sk_buff *skb,
                         __be32 net_seq);
        int        (*recheck)(struct xfrm_state *x,
                           struct sk_buff *skb,
                           __be32 net_seq);
        void        (*notify)(struct xfrm_state *x, int event);
        int        (*overflow)(struct xfrm_state *x, struct sk_buff *skb);
};

/*
 * Callback set installed with xfrm_if_register_cb() and removed with
 * xfrm_if_unregister_cb().
 */
struct xfrm_if_cb {
        /* Maps an skb plus address family to a struct xfrm_if pointer. */
        struct xfrm_if        *(*decode_session)(struct sk_buff *skb,
                                           unsigned short family);
};

void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
void xfrm_if_unregister_cb(void);

/*
 * Argument bundle for the dst_lookup() and get_saddr() callbacks of
 * struct xfrm_policy_afinfo.
 */
struct xfrm_dst_lookup_params {
        struct net *net;
        int tos;
        int oif;
        xfrm_address_t *saddr;
        xfrm_address_t *daddr;
        u32 mark;
        __u8 ipproto;
        union flowi_uli uli;
};

struct net_device;
struct xfrm_type;
struct xfrm_dst;
/*
 * Policy-side callbacks, registered for a family with
 * xfrm_policy_register_afinfo() and removed with
 * xfrm_policy_unregister_afinfo().
 */
struct xfrm_policy_afinfo {
        struct dst_ops                *dst_ops;
        /* Both lookups take their inputs from struct xfrm_dst_lookup_params. */
        struct dst_entry        *(*dst_lookup)(const struct xfrm_dst_lookup_params *params);
        int                        (*get_saddr)(xfrm_address_t *saddr,
                                             const struct xfrm_dst_lookup_params *params);
        int                        (*fill_dst)(struct xfrm_dst *xdst,
                                            struct net_device *dev,
                                            const struct flowi *fl);
        struct dst_entry        *(*blackhole_route)(struct net *net, struct dst_entry *orig);
};

int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family);
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
void km_policy_notify(struct xfrm_policy *xp, int dir,
                      const struct km_event *c);
void km_state_notify(struct xfrm_state *x, const struct km_event *c);

struct xfrm_tmpl;
int km_query(struct xfrm_state *x, struct xfrm_tmpl *t,
             struct xfrm_policy *pol);
void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
int __xfrm_state_delete(struct xfrm_state *x);

/*
 * State-side description, passed to xfrm_state_register_afinfo() /
 * xfrm_state_unregister_afinfo() and looked up by family through
 * xfrm_state_get_afinfo() / xfrm_state_afinfo_get_rcu().
 */
struct xfrm_state_afinfo {
        u8                                family;
        u8                                proto;

        const struct xfrm_type_offload *type_offload_esp;

        /* One xfrm_type slot per named protocol. */
        const struct xfrm_type                *type_esp;
        const struct xfrm_type                *type_ipip;
        const struct xfrm_type                *type_ipip6;
        const struct xfrm_type                *type_comp;
        const struct xfrm_type                *type_ah;
        const struct xfrm_type                *type_routing;
        const struct xfrm_type                *type_dstopts;

        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int                        (*transport_finish)(struct sk_buff *skb,
                                                    int async);
        void                        (*local_error)(struct sk_buff *skb, u32 mtu);
};

int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family);

/*
 * Input-side callback description, registered with
 * xfrm_input_register_afinfo() and removed with
 * xfrm_input_unregister_afinfo().
 */
struct xfrm_input_afinfo {
        u8                        family;
        bool                        is_ipip;
        int                        (*callback)(struct sk_buff *skb, u8 protocol,
                                            int err);
};

int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);

void xfrm_flush_gc(void);

/*
 * Description and method table of a transformer type.  Registered per
 * family with xfrm_register_type() and removed with xfrm_unregister_type();
 * struct xfrm_state refers to one through its "type" member.
 */
struct xfrm_type {
        char                        *description;
        struct module                *owner;
        u8                        proto;
        /* Bit mask built from the XFRM_TYPE_* values defined just below. */
        u8                        flags;
#define XFRM_TYPE_NON_FRAGMENT        1
#define XFRM_TYPE_REPLAY_PROT        2
#define XFRM_TYPE_LOCAL_COADDR        4
#define XFRM_TYPE_REMOTE_COADDR        8

        int                        (*init_state)(struct xfrm_state *x);
        void                        (*destructor)(struct xfrm_state *);
        int                        (*input)(struct xfrm_state *, struct sk_buff *skb);
        int                        (*output)(struct xfrm_state *, struct sk_buff *pskb);
        int                        (*reject)(struct xfrm_state *, struct sk_buff *,
                                          const struct flowi *);
        int                        (*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **);
};

int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);

/*
 * Offload counterpart of struct xfrm_type.  Registered per family with
 * xfrm_register_type_offload() and removed with
 * xfrm_unregister_type_offload(); struct xfrm_state refers to one through
 * its "type_offload" member.
 */
struct xfrm_type_offload {
        char                *description;
        struct module        *owner;
        u8                proto;
        void                (*encap)(struct xfrm_state *, struct sk_buff *pskb);
        int                (*input_tail)(struct xfrm_state *x, struct sk_buff *skb);
        int                (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features);
};

int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);

/*
 * xfrm_af2proto - map an address family to its IP-in-IP protocol number
 * @family: AF_INET or AF_INET6
 *
 * Returns IPPROTO_IPIP for AF_INET, IPPROTO_IPV6 for AF_INET6 and 0 for
 * every other family.
 */
static inline int xfrm_af2proto(unsigned int family)
{
        if (family == AF_INET)
                return IPPROTO_IPIP;

        if (family == AF_INET6)
                return IPPROTO_IPV6;

        return 0;
}

/*
 * xfrm_ip2inner_mode - pick the inner mode matching an inner protocol
 * @x: state whose modes are consulted
 * @ipproto: inner IP protocol number
 *
 * Returns &x->inner_mode when @ipproto is IPPROTO_IPIP and the state family
 * is AF_INET, or @ipproto is IPPROTO_IPV6 and the state family is AF_INET6.
 * In every other case &x->inner_mode_iaf is returned.
 */
static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
        switch (ipproto) {
        case IPPROTO_IPIP:
                if (x->props.family == AF_INET)
                        return &x->inner_mode;
                break;
        case IPPROTO_IPV6:
                if (x->props.family == AF_INET6)
                        return &x->inner_mode;
                break;
        default:
                break;
        }

        return &x->inner_mode_iaf;
}

/*
 * Template describing one transformation; struct xfrm_policy holds an array
 * of these in xfrm_vec[].
 */
struct xfrm_tmpl {
/* id in template is interpreted as:
 * daddr - destination of tunnel, may be zero for transport mode.
 * spi   - zero to acquire spi. Not zero if spi is static, then
 *           daddr must be fixed too.
 * proto - AH/ESP/IPCOMP
 */
        struct xfrm_id                id;

/* Source address of tunnel. Ignored, if it is not a tunnel. */
        xfrm_address_t                saddr;

        unsigned short                encap_family;

        u32                        reqid;

/* Mode: transport, tunnel etc. */
        u8                        mode;

/* Sharing mode: unique, this session only, this user only etc. */
        u8                        share;

/* May skip this transformation if no SA is found */
        u8                        optional;

/* Skip aalgos/ealgos/calgos checks. */
        u8                        allalgs;

/* Bit mask of algos allowed for acquisition */
        u32                        aalgos;
        u32                        ealgos;
        u32                        calgos;
};

#define XFRM_MAX_DEPTH                6
#define XFRM_MAX_OFFLOAD_DEPTH        1

/*
 * List node embedded both in struct xfrm_policy and in
 * struct xfrm_policy_walk (as their "walk" members).
 */
struct xfrm_policy_walk_entry {
        struct list_head        all;
        u8                        dead;
};

/*
 * Policy walker bookkeeping: a walk entry plus a type and sequence number.
 * NOTE(review): presumably used as a cursor while dumping policies -
 * confirm against the users.
 */
struct xfrm_policy_walk {
        struct xfrm_policy_walk_entry walk;
        u8 type;
        u32 seq;
};

/*
 * skb hold queue with its timer; struct xfrm_policy embeds one as "polq".
 */
struct xfrm_policy_queue {
        struct sk_buff_head        hold_queue;
        struct timer_list        hold_timer;
        unsigned long                timeout;
};

/*
 * Policy rule, i.e. an SPD entry in the terminology of the overview comment
 * at the top of this header.
 */
struct xfrm_policy {
        /* Owning network namespace; read through xp_net(). */
        possible_net_t                xp_net;
        struct hlist_node        bydst;
        struct hlist_node        byidx;

        /* This lock only affects elements except for entry. */
        rwlock_t                lock;
        /* Taken with xfrm_pol_hold(), dropped with xfrm_pol_put(). */
        refcount_t                refcnt;
        u32                        pos;
        struct timer_list        timer;

        atomic_t                genid;
        u32                        priority;
        u32                        index;
        u32                        if_id;
        struct xfrm_mark        mark;
        struct xfrm_selector        selector;
        struct xfrm_lifetime_cfg lft;
        struct xfrm_lifetime_cur curlft;
        struct xfrm_policy_walk_entry walk;
        struct xfrm_policy_queue polq;
        bool                    bydst_reinsert;
        u8                        type;
        u8                        action;
        u8                        flags;
        /* presumably the number of xfrm_vec[] entries in use - TODO confirm */
        u8                        xfrm_nr;
        u16                        family;
        struct xfrm_sec_ctx        *security;
        /* Room for at most XFRM_MAX_DEPTH templates. */
        struct xfrm_tmpl               xfrm_vec[XFRM_MAX_DEPTH];
        struct hlist_node        bydst_inexact_list;
        struct rcu_head                rcu;
};

/*
 * xp_net - fetch the struct net recorded in a policy
 * @xp: policy to query
 *
 * Returns whatever read_pnet() yields for @xp->xp_net.
 */
static inline struct net *xp_net(const struct xfrm_policy *xp)
{
        struct net *owner = read_pnet(&xp->xp_net);

        return owner;
}

/*
 * A local/remote address pair with its family.  Passed (as @k) to the
 * migrate hook of struct xfrm_mgr.
 */
struct xfrm_kmaddress {
        xfrm_address_t          local;
        xfrm_address_t          remote;
        u32                        reserved;
        u16                        family;
};

/*
 * One migration record: old and new source/destination addresses with their
 * families, plus proto, mode and reqid.  Passed (as @m) to the migrate hook
 * of struct xfrm_mgr.
 */
struct xfrm_migrate {
        xfrm_address_t                old_daddr;
        xfrm_address_t                old_saddr;
        xfrm_address_t                new_daddr;
        xfrm_address_t                new_saddr;
        u8                        proto;
        u8                        mode;
        u16                        reserved;
        u32                        reqid;
        u16                        old_family;
        u16                        new_family;
};

#define XFRM_KM_TIMEOUT                30
/* what happened */
#define XFRM_REPLAY_UPDATE        XFRM_AE_CR
#define XFRM_REPLAY_TIMEOUT        XFRM_AE_CE

/* default aevent timeout in units of 100ms */
#define XFRM_AE_ETIME                        10
/* Async Event timer multiplier */
#define XFRM_AE_ETH_M                        10
/* default seq threshold size */
#define XFRM_AE_SEQT_SIZE                2

/*
 * Hook table of a key manager.  Registered with xfrm_register_km() and
 * removed with xfrm_unregister_km().
 */
struct xfrm_mgr {
        struct list_head        list;
        int                        (*notify)(struct xfrm_state *x, const struct km_event *c);
        int                        (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp);
        struct xfrm_policy        *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir);
        int                        (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
        int                        (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c);
        int                        (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr);
        int                        (*migrate)(const struct xfrm_selector *sel,
                                           u8 dir, u8 type,
                                           const struct xfrm_migrate *m,
                                           int num_bundles,
                                           const struct xfrm_kmaddress *k,
                                           const struct xfrm_encap_tmpl *encap);
        bool                        (*is_alive)(const struct km_event *c);
};

int xfrm_register_km(struct xfrm_mgr *km);
int xfrm_unregister_km(struct xfrm_mgr *km);

/*
 * Layout accessed through XFRM_TUNNEL_SKB_CB(), which casts &skb->cb[0] to
 * this type.  It is also the first member ("header") of xfrm_skb_cb,
 * xfrm_mode_skb_cb and xfrm_spi_skb_cb.
 */
struct xfrm_tunnel_skb_cb {
        /* IPv4 and IPv6 skb parameter blocks share the same storage. */
        union {
                struct inet_skb_parm h4;
                struct inet6_skb_parm h6;
        } header;

        /* IPv4 and IPv6 tunnel pointers share the same storage. */
        union {
                struct ip_tunnel *ip4;
                struct ip6_tnl *ip6;
        } tunnel;
};

#define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used for the duration where packets are being
 * transformed by IPsec.  As soon as the packet leaves IPsec the
 * area beyond the generic IP part may be overwritten.
 */
struct xfrm_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /*
         * Sequence number for replay protection.  The output view uses
         * __u32 halves, the input view uses __be32 halves; both views
         * share the same storage.
         */
        union {
                struct {
                        __u32 low;
                        __u32 hi;
                } output;
                struct {
                        __be32 low;
                        __be32 hi;
                } input;
        } seq;
};

#define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the afinfo prepare_input/prepare_output functions
 * to transmit header information to the mode input/output functions.
 */
struct xfrm_mode_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* Copied from header for IPv4, always set to zero and DF for IPv6. */
        __be16 id;
        __be16 frag_off;

        /* IP header length (excluding options or extension headers). */
        u8 ihl;

        /* TOS for IPv4, class for IPv6. */
        u8 tos;

        /* TTL for IPv4, hop limit for IPv6. */
        u8 ttl;

        /* Protocol for IPv4, NH for IPv6. */
        u8 protocol;

        /* Option length for IPv4, zero for IPv6. */
        u8 optlen;

        /* Used by IPv6 only, zero for IPv4. */
        u8 flow_lbl[3];
};

#define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the input processing to locate the SPI and
 * related information.
 */
struct xfrm_spi_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* presumably the offset of the destination address - TODO confirm */
        unsigned int daddroff;
        unsigned int family;
        __be32 seq;
};

#define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0]))

#ifdef CONFIG_AUDITSYSCALL
/*
 * xfrm_audit_start - open an AUDIT_MAC_IPSEC_EVENT record
 * @op: operation name, logged as "op=<op>"
 *
 * Returns NULL when audit_enabled equals AUDIT_OFF or when
 * audit_log_start() yields no buffer.  Otherwise returns the buffer with
 * the "op=" field already written.  The buffer is requested with
 * GFP_ATOMIC.
 */
static inline struct audit_buffer *xfrm_audit_start(const char *op)
{
        struct audit_buffer *ab;

        if (audit_enabled == AUDIT_OFF)
                return NULL;

        ab = audit_log_start(audit_context(), GFP_ATOMIC,
                             AUDIT_MAC_IPSEC_EVENT);
        if (ab != NULL)
                audit_log_format(ab, "op=%s", op);

        return ab;
}

/*
 * xfrm_audit_helper_usrinfo - append user information to an audit record
 * @task_valid: whether the values are taken from "current"
 * @audit_buf: record being built
 *
 * Writes " auid=%u ses=%u" followed by the task context.  With @task_valid
 * set the login uid and session id of "current" are used; otherwise
 * INVALID_UID and AUDIT_SID_UNSET are logged instead.
 */
static inline void xfrm_audit_helper_usrinfo(bool task_valid,
                                             struct audit_buffer *audit_buf)
{
        unsigned int auid;
        unsigned int ses;

        if (task_valid) {
                auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
                ses = audit_get_sessionid(current);
        } else {
                auid = from_kuid(&init_user_ns, INVALID_UID);
                ses = AUDIT_SID_UNSET;
        }

        audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses);
        audit_log_task_context(audit_buf);
}

void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid);
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid);
void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                      struct sk_buff *skb);
void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb,
                             __be32 net_seq);
void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family);
void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi,
                               __be32 net_seq);
void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb,
                              u8 proto);
#else

/*
 * Empty versions of the xfrm_audit_*() hooks, compiled when
 * CONFIG_AUDITSYSCALL is not defined.  Each one takes the same parameters
 * as the prototype of the same name in the CONFIG_AUDITSYSCALL branch and
 * does nothing.
 */
static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
                                         bool task_valid)
{
}

static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                                            bool task_valid)
{
}

static inline void xfrm_audit_state_add(struct xfrm_state *x, int result,
                                        bool task_valid)
{
}

static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result,
                                           bool task_valid)
{
}

static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                             struct sk_buff *skb)
{
}

static inline void xfrm_audit_state_replay(struct xfrm_state *x,
                                           struct sk_buff *skb, __be32 net_seq)
{
}

static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb,
                                      u16 family)
{
}

static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
                                      __be32 net_spi, __be32 net_seq)
{
}

static inline void xfrm_audit_state_icvfail(struct xfrm_state *x,
                                     struct sk_buff *skb, u8 proto)
{
}
#endif /* CONFIG_AUDITSYSCALL */

/*
 * xfrm_pol_hold - take a reference on a policy
 * @policy: policy to pin; a NULL pointer is accepted and ignored
 *
 * Increments @policy->refcnt.  The non-NULL case is the one hinted as
 * likely.
 */
static inline void xfrm_pol_hold(struct xfrm_policy *policy)
{
        if (!likely(policy != NULL))
                return;

        refcount_inc(&policy->refcnt);
}

void xfrm_policy_destroy(struct xfrm_policy *policy);

/*
 * xfrm_pol_put - drop a reference on a policy
 * @policy: policy to release; dereferenced unconditionally
 *
 * Decrements @policy->refcnt and calls xfrm_policy_destroy() when
 * refcount_dec_and_test() reports that the count reached zero.
 */
static inline void xfrm_pol_put(struct xfrm_policy *policy)
{
        if (!refcount_dec_and_test(&policy->refcnt))
                return;

        xfrm_policy_destroy(policy);
}

/*
 * xfrm_pols_put - drop one reference on each policy of an array
 * @pols: array of policy pointers
 * @npols: number of entries in @pols
 *
 * Entries are released from the last one down to index 0 via
 * xfrm_pol_put().  Nothing happens when @npols is zero or negative.
 */
static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
{
        int remaining = npols;

        while (remaining > 0)
                xfrm_pol_put(pols[--remaining]);
}

void __xfrm_state_destroy(struct xfrm_state *);

static inline void __xfrm_state_put(struct xfrm_state *x)
{
        refcount_dec(&x->refcnt);
}

/*
 * xfrm_state_put - drop a state reference
 * @x: state to release
 *
 * Decrements @x->refcnt and calls __xfrm_state_destroy() when
 * refcount_dec_and_test() reports that the count reached zero.
 */
static inline void xfrm_state_put(struct xfrm_state *x)
{
        if (!refcount_dec_and_test(&x->refcnt))
                return;

        __xfrm_state_destroy(x);
}

static inline void xfrm_state_hold(struct xfrm_state *x)
{
        refcount_inc(&x->refcnt);
}

/*
 * addr_match - compare the leading @prefixlen bits of two addresses
 * @token1: first address, read as an array of __be32 words
 * @token2: second address, read the same way
 * @prefixlen: number of leading bits that must be equal
 *
 * The whole 32-bit words covered by the prefix are compared with memcmp().
 * Any remaining bits (prefixlen modulo 32) are compared under a mask built
 * with htonl().  Returns true when the prefixes are equal; a @prefixlen of
 * 0 always matches.  Both buffers must hold at least as many words as the
 * prefix touches; no bounds are checked here.
 */
static inline bool addr_match(const void *token1, const void *token2,
                              unsigned int prefixlen)
{
        const __be32 *lhs = token1;
        const __be32 *rhs = token2;
        const unsigned int whole_words = prefixlen / 32;
        const unsigned int tail_bits = prefixlen % 32;

        if (whole_words && memcmp(lhs, rhs, whole_words * 4) != 0)
                return false;

        /* Prefix ends on a word boundary: nothing left to compare. */
        if (!tail_bits)
                return true;

        /* tail_bits is 1..31 here, so the shift count is 1..31. */
        return !((lhs[whole_words] ^ rhs[whole_words]) &
                 htonl((0xffffffff) << (32 - tail_bits)));
}

/*
 * addr4_match - compare the leading @prefixlen bits of two IPv4 addresses
 * @a1: first address, network byte order
 * @a2: second address, network byte order
 * @prefixlen: number of leading bits that must be equal
 *
 * Returns true when the two addresses agree under the prefix mask.
 */
static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
{
        __be32 mask;

        /*
         * C99 6.5.7 (3): u32 << 32 is undefined behaviour, so when long is
         * 32 bits wide the zero-length prefix is answered without shifting.
         */
        if (sizeof(long) == 4 && !prefixlen)
                return true;

        mask = htonl(~0UL << (32 - prefixlen));

        return ((a1 ^ a2) & mask) == 0;
}

/*
 * xfrm_flowi_sport - derive the "source port" value of a flow
 * @fl: flow; only fl->flowi_proto is consulted
 * @uli: upper-layer information of the flow
 *
 * TCP, UDP, UDP-Lite and SCTP yield uli->ports.sport unchanged.  ICMP and
 * ICMPv6 yield the ICMP type, MH yields the MH type, and GRE yields the
 * upper 16 bits of the GRE key; these three are converted with htons().
 * Every other protocol yields 0.
 */
static __inline__
__be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.sport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.type);
        case IPPROTO_MH:
                return htons(uli->mht.type);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) >> 16);
        default:
                return 0;        /*XXX*/
        }
}

/*
 * xfrm_flowi_dport - derive the "destination port" value of a flow
 * @fl: flow; only fl->flowi_proto is consulted
 * @uli: upper-layer information of the flow
 *
 * TCP, UDP, UDP-Lite and SCTP yield uli->ports.dport unchanged.  ICMP and
 * ICMPv6 yield the ICMP code and GRE yields the lower 16 bits of the GRE
 * key; both are converted with htons().  Every other protocol yields 0.
 */
static __inline__
__be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.dport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.code);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) & 0xffff);
        default:
                return 0;        /*XXX*/
        }
}

bool xfrm_selector_match(const struct xfrm_selector *sel,
                         const struct flowi *fl, unsigned short family);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/*
 * xfrm_sec_ctx_match - compare two security contexts
 * @s1: first context, may be NULL
 * @s2: second context, may be NULL
 *
 * Two missing contexts match.  A missing context never matches a present
 * one.  Two present contexts match when ctx_sid, ctx_doi and ctx_alg are
 * all equal.
 */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        if (!s1 || !s2)
                return !s1 && !s2;

        if (s1->ctx_sid != s2->ctx_sid)
                return false;
        if (s1->ctx_doi != s2->ctx_doi)
                return false;

        return s1->ctx_alg == s2->ctx_alg;
}
#else
/*
 * Version used when CONFIG_SECURITY_NETWORK_XFRM is not defined: the
 * arguments are ignored and every pair of contexts is reported as matching.
 */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        return true;
}
#endif

/* A struct encoding bundle of transformations to apply to some set of flow.
 *
 * xdst->child points to the next element of bundle.
 * dst->xfrm  points to an instance of transformer.
 *
 * Due to unfortunate limitations of current routing cache, which we
 * have no time to fix, it mirrors struct rtable and bound to the same
 * routing key, including saddr,daddr. However, we can have many of
 * bundles differing by session id. All the bundles grow from a parent
 * policy rule.
 */
struct xfrm_dst {
        /*
         * Must stay the first member: xfrm_dst_path() and xfrm_dst_child()
         * cast a struct dst_entry pointer directly to struct xfrm_dst.
         */
        union {
                struct dst_entry        dst;
                struct rtable                rt;
                struct rt6_info                rt6;
        } u;
        /* Released with dst_release() in xfrm_dst_destroy(). */
        struct dst_entry *route;
        /* Returned by xfrm_dst_child(), set by xfrm_dst_set_child(). */
        struct dst_entry *child;
        /* Returned by xfrm_dst_path(). */
        struct dst_entry *path;
        /* The first num_pols entries are put in xfrm_dst_destroy(). */
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols, num_xfrms;
        u32 xfrm_genid;
        u32 policy_genid;
        u32 route_mtu_cached;
        u32 child_mtu_cached;
        u32 route_cookie;
        u32 path_cookie;
};

/*
 * xfrm_dst_path - return the path dst behind a dst entry
 * @dst: entry to inspect
 *
 * With CONFIG_XFRM, an entry that has ->xfrm set or carries the
 * DST_XFRM_QUEUE flag is treated as a struct xfrm_dst and its ->path is
 * returned.  In every other case @dst itself is returned with the const
 * qualifier dropped.
 */
static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE))
                return ((const struct xfrm_dst *)dst)->path;
#endif
        return (struct dst_entry *)dst;
}

/* Return the next element of the bundle after @dst, or NULL.
 *
 * Only a dst that carries an xfrm state or has DST_XFRM_QUEUE set is
 * reinterpreted as a struct xfrm_dst; for anything else, and whenever
 * CONFIG_XFRM is off, the result is NULL.
 */
static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        const struct xfrm_dst *bundle = (const struct xfrm_dst *)dst;

        if (!dst->xfrm && !(dst->flags & DST_XFRM_QUEUE))
                return NULL;
        return bundle->child;
#else
        return NULL;
#endif
}

#ifdef CONFIG_XFRM
/* Record @child as the next element of the bundle after @xdst.
 * Plain pointer store: no reference counting is done here.
 */
static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child)
{
        xdst->child = child;
}

/* Drop what @xdst holds on to: its policies (the first num_pols entries
 * of ->pols), its ->route dst and, when present, its xfrm state.
 * The xdst itself is not freed here.
 */
static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
{
        struct xfrm_state *state;

        xfrm_pols_put(xdst->pols, xdst->num_pols);
        dst_release(xdst->route);

        /* Read only after the releases above, like the original sequence. */
        state = xdst->u.dst.xfrm;
        if (likely(state))
                xfrm_state_put(state);
}
#endif

/* Declaration only.  NOTE(review): by its name this is the hook run for
 * bundle entry @dst when @dev goes down - confirm at the definition.
 */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);

/* Parameters of an xfrm interface; embedded in struct xfrm_if as ->p. */
struct xfrm_if_parms {
        int link;                /* ifindex of underlying L2 interface */
        u32 if_id;                /* interface identifier */
};

/* One xfrm interface.  Instances are chained through the RCU-annotated
 * ->next pointer.
 */
struct xfrm_if {
        struct xfrm_if __rcu *next;        /* next interface in list */
        struct net_device *dev;                /* virtual device associated with interface */
        struct net *net;                /* netns for packet i/o */
        struct xfrm_if_parms p;                /* interface parms */

        /* NOTE(review): presumably the GRO cells used on receive for ->dev - confirm */
        struct gro_cells gro_cells;
};

/* Per-packet offload record.  Elements of this type live in
 * sec_path::ovec and are handed out by xfrm_offload().
 */
struct xfrm_offload {
        /* Output sequence number for replay protection on offloading. */
        struct {
                __u32 low;
                __u32 hi;
        } seq;

        /* Values defined for ->flags; each is a distinct power of two. */
        __u32                        flags;
#define        SA_DELETE_REQ                1
#define        CRYPTO_DONE                2
#define        CRYPTO_NEXT_DONE        4
#define        CRYPTO_FALLBACK                8
#define        XFRM_GSO_SEGMENT        16
#define        XFRM_GRO                32
#define        XFRM_ESP_NO_TRAILER        64
#define        XFRM_DEV_RESUME                128
#define        XFRM_XMIT                256

        /* Values defined for ->status; each is a distinct power of two. */
        __u32                        status;
#define CRYPTO_SUCCESS                                1
#define CRYPTO_GENERIC_ERROR                        2
#define CRYPTO_TRANSPORT_AH_AUTH_FAILED                4
#define CRYPTO_TRANSPORT_ESP_AUTH_FAILED        8
#define CRYPTO_TUNNEL_AH_AUTH_FAILED                16
#define CRYPTO_TUNNEL_ESP_AUTH_FAILED                32
#define CRYPTO_INVALID_PACKET_SYNTAX                64
#define CRYPTO_INVALID_PROTOCOL                        128

        /* Used to keep whole l2 header for transport mode GRO */
        __u32                        orig_mac_len;

        /* NOTE(review): presumably the IP protocol number of the packet - confirm */
        __u8                        proto;
};

/* Security path attached to an skb: the xfrm states applied to the packet
 * plus per-state offload records.
 */
struct sec_path {
        int                        len;        /* entries used in xvec; xfrm_input_state() reads xvec[len - 1] */
        int                        olen;        /* entries used in ovec; xfrm_offload() reads ovec[olen - 1] */
        int                        verified_cnt;        /* NOTE(review): not used in the visible code - confirm meaning */

        struct xfrm_state        *xvec[XFRM_MAX_DEPTH];
        struct xfrm_offload        ovec[XFRM_MAX_OFFLOAD_DEPTH];
};

/* Declaration only.  NOTE(review): presumably attaches (or returns the
 * existing) sec_path of @skb; failure behaviour must be checked at the
 * definition.
 */
struct sec_path *secpath_set(struct sk_buff *skb);

/* Remove the SKB_EXT_SEC_PATH extension from @skb.
 * Compiles to an empty function when CONFIG_XFRM is not set.
 */
static inline void
secpath_reset(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        skb_ext_del(skb, SKB_EXT_SEC_PATH);
#endif
}

/* Report whether @addr is the "any" (all-zero) address of @family.
 *
 * AF_INET tests the a4 word against zero, AF_INET6 defers to
 * ipv6_addr_any(); any other family yields 0.
 */
static inline int
xfrm_addr_any(const xfrm_address_t *addr, unsigned short family)
{
        if (family == AF_INET)
                return addr->a4 == 0;
        if (family == AF_INET6)
                return ipv6_addr_any(&addr->in6);

        return 0;
}

/* IPv4 source-address comparison between a template and a state.
 *
 * Returns non-zero (mismatch) only when the template names a source
 * address and it differs from the state's; a zero template address
 * never mismatches.
 */
static inline int
__xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        if (!tmpl->saddr.a4)
                return 0;

        return tmpl->saddr.a4 != x->props.saddr.a4;
}

/* IPv6 source-address comparison between a template and a state.
 *
 * Returns non-zero (mismatch) only when the template's source address is
 * not the any-address and differs from the state's source address.
 */
static inline int
__xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        const struct in6_addr *wanted = (const struct in6_addr *)&tmpl->saddr;
        const struct in6_addr *actual = (const struct in6_addr *)&x->props.saddr;

        if (ipv6_addr_any(wanted))
                return 0;

        return !ipv6_addr_equal(wanted, actual);
}

/* Family dispatcher for the template/state source-address comparison.
 *
 * Returns 0 when the addresses are compatible and non-zero on mismatch.
 * A family other than AF_INET / AF_INET6 is always reported as a mismatch.
 */
static inline int
xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_cmp(tmpl, x);
        if (family == AF_INET6)
                return __xfrm6_state_addr_cmp(tmpl, x);

        return 1;
}

#ifdef CONFIG_XFRM
/* Declaration only: the full policy check that __xfrm_policy_check2()
 * falls back to when none of its shortcuts apply.  The callers in this
 * file treat a non-zero result as "accepted".
 */
int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
                        unsigned short family);

/* Shortcut for namespaces with no policies in direction @dir.
 *
 * Returns true only when @net has no policy for @dir, @skb carries no
 * security path, and the default policy for @dir is
 * XFRM_USERPOLICY_ACCEPT.  In every other case the caller has to continue
 * with further checks.
 */
static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
                                         int dir)
{
        if (net->xfrm.policy_count[dir])
                return false;
        if (secpath_exists(skb))
                return false;

        return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT;
}

/* Report whether policy checking is switched off for @skb.
 *
 * For IPv4 in any direction other than XFRM_POLICY_OUT the per-packet
 * IPSKB_NOPOLICY flag decides, because the same dst may be used for
 * traffic originating from devices with different policy settings.
 * Otherwise the DST_NOPOLICY flag of the skb's dst decides; an skb
 * without a dst yields false.
 */
static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb,
                                             int dir, unsigned short family)
{
        if (family == AF_INET && dir != XFRM_POLICY_OUT)
                return IPCB(skb)->flags & IPSKB_NOPOLICY;

        if (!skb_dst(skb))
                return false;

        return skb_dst(skb)->flags & DST_NOPOLICY;
}

/* Common body of the xfrm policy-check wrappers.
 *
 * @reverse selects the reversed lookup by setting the bit just above
 * XFRM_POLICY_MASK in the direction passed to __xfrm_policy_check().
 *
 * A socket with an inbound per-socket policy always takes the full check
 * and its raw result is returned.  Otherwise the two "no policy"
 * shortcuts are tried first, in order, and the full check runs only if
 * both decline; in that path the result is normalised to 0 or 1.
 */
static inline int __xfrm_policy_check2(struct sock *sk, int dir,
                                       struct sk_buff *skb,
                                       unsigned int family, int reverse)
{
        struct net *net = dev_net(skb->dev);
        int ndir = dir;

        if (reverse)
                ndir |= XFRM_POLICY_MASK + 1;

        if (sk && sk->sk_policy[XFRM_POLICY_IN])
                return __xfrm_policy_check(sk, ndir, skb, family);

        if (__xfrm_check_nopolicy(net, skb, dir))
                return 1;
        if (__xfrm_check_dev_nopolicy(skb, dir, family))
                return 1;

        return __xfrm_policy_check(sk, ndir, skb, family) != 0;
}

/* Forward (non-reversed) policy check for @skb in direction @dir;
 * thin wrapper around __xfrm_policy_check2() with reverse = 0.
 */
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        return __xfrm_policy_check2(sk, dir, skb, family, 0);
}

/* xfrm_policy_check() with the family fixed to AF_INET. */
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return xfrm_policy_check(sk, dir, skb, AF_INET);
}

/* xfrm_policy_check() with the family fixed to AF_INET6. */
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return xfrm_policy_check(sk, dir, skb, AF_INET6);
}

/* Reversed policy check for IPv4: __xfrm_policy_check2() with
 * family AF_INET and reverse = 1.
 */
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return __xfrm_policy_check2(sk, dir, skb, AF_INET, 1);
}

/* Reversed policy check for IPv6: __xfrm_policy_check2() with
 * family AF_INET6 and reverse = 1.
 */
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1);
}

/* Declaration only.  NOTE(review): presumably fills @fl from the headers
 * of @skb for @family, with @reverse selecting the reversed flow -
 * confirm at the definition.
 */
int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse);

/* __xfrm_decode_session() with reverse = 0. */
static inline int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
                                      unsigned int family)
{
        return __xfrm_decode_session(skb, fl, family, 0);
}

/* __xfrm_decode_session() with reverse = 1. */
static inline int xfrm_decode_session_reverse(struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        return __xfrm_decode_session(skb, fl, family, 1);
}

/* Declaration only: the slow path used by xfrm_route_forward() when its
 * shortcuts do not apply.  That caller treats non-zero as success.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);

/* Decide whether a forwarded @skb may proceed; returns 1 or 0.
 *
 * Two shortcuts return 1 without a lookup: the namespace has no
 * XFRM_POLICY_OUT policies and its outbound default is
 * XFRM_USERPOLICY_ACCEPT, or the skb's dst carries DST_NOXFRM.
 * Otherwise the answer comes from __xfrm_route_forward().
 */
static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);

        if (net->xfrm.policy_count[XFRM_POLICY_OUT] == 0 &&
            net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT)
                return 1;

        if (skb_dst(skb)->flags & DST_NOXFRM)
                return 1;

        return __xfrm_route_forward(skb, family) != 0;
}

/* xfrm_route_forward() with the family fixed to AF_INET. */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
        return xfrm_route_forward(skb, AF_INET);
}

/* xfrm_route_forward() with the family fixed to AF_INET6. */
static inline int xfrm6_route_forward(struct sk_buff *skb)
{
        return xfrm_route_forward(skb, AF_INET6);
}

/* Declaration only: slow path of xfrm_sk_clone_policy(), reached when the
 * original socket has at least one per-socket policy.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);

/* Give new socket @sk its own per-socket policy state, based on @osk.
 *
 * Nothing is touched when @osk is not a full socket.  Otherwise both
 * policy slots of @sk are cleared first, and the (unlikely) case of @osk
 * carrying a policy is handed to __xfrm_sk_clone_policy(), whose result is
 * returned.  Returns 0 in every other case.
 */
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        if (!sk_fullsock(osk))
                return 0;

        sk->sk_policy[0] = NULL;
        sk->sk_policy[1] = NULL;

        return unlikely(osk->sk_policy[0] || osk->sk_policy[1]) ?
                __xfrm_sk_clone_policy(sk, osk) : 0;
}

/* Declaration only.  xfrm_sk_free_policy() calls this with
 * XFRM_POLICY_MAX + slot as @dir for per-socket policies.
 */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir);

/* Delete both per-socket policies of @sk, if any, and clear the slots.
 *
 * Slot i is deleted with direction XFRM_POLICY_MAX + i.  The pointers are
 * read with rcu_dereference_protected(..., 1), i.e. the caller vouches
 * that no concurrent update can happen.
 */
static inline void xfrm_sk_free_policy(struct sock *sk)
{
        int slot;

        for (slot = 0; slot < 2; slot++) {
                struct xfrm_policy *pol;

                pol = rcu_dereference_protected(sk->sk_policy[slot], 1);
                if (unlikely(pol != NULL)) {
                        xfrm_policy_delete(pol, XFRM_POLICY_MAX + slot);
                        sk->sk_policy[slot] = NULL;
                }
        }
}

#else

/* Stubs used when CONFIG_XFRM is not set.
 *
 * Socket-policy helpers do nothing (clone reports success with 0), every
 * policy check and route-forward helper returns 1, and the reverse
 * session decode reports -ENOSYS.
 */
static inline void xfrm_sk_free_policy(struct sock *sk) {}
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        return 1;
}
/* No decoder without CONFIG_XFRM: @fl is left untouched. */
static inline int xfrm_decode_session_reverse(struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        return -ENOSYS;
}
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return 1;
}
#endif

/* Return a pointer to the destination address stored in flow @fl, viewed
 * as an xfrm_address_t (the const qualifier of @fl is dropped).
 * Yields NULL for a family other than AF_INET / AF_INET6.
 */
static __inline__
xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.daddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.daddr;

        return NULL;
}

/* Return a pointer to the source address stored in flow @fl, viewed as an
 * xfrm_address_t (the const qualifier of @fl is dropped).
 * Yields NULL for a family other than AF_INET / AF_INET6.
 */
static __inline__
xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.saddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.saddr;

        return NULL;
}

/* Copy the source and destination addresses of flow @fl into @saddr and
 * @daddr.
 *
 * For AF_INET only the a4 word of each output is written; for AF_INET6
 * the whole in6 member is assigned.  Any other family leaves both
 * outputs untouched.
 */
static __inline__
void xfrm_flowi_addr_get(const struct flowi *fl,
                         xfrm_address_t *saddr, xfrm_address_t *daddr,
                         unsigned short family)
{
        if (family == AF_INET) {
                memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4));
                memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4));
                return;
        }

        if (family == AF_INET6) {
                saddr->in6 = fl->u.ip6.saddr;
                daddr->in6 = fl->u.ip6.daddr;
        }
}

/* IPv4 address match between state @x and a (daddr, saddr) pair.
 *
 * Returns 1 when the destination equals the state's id.daddr and the
 * source either equals the state's props.saddr or one of the two source
 * addresses is zero (wildcard); returns 0 otherwise.
 */
static __inline__ int
__xfrm4_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        if (daddr->a4 != x->id.daddr.a4)
                return 0;
        if (saddr->a4 == x->props.saddr.a4)
                return 1;

        return !saddr->a4 || !x->props.saddr.a4;
}

/* IPv6 address match between state @x and a (daddr, saddr) pair.
 *
 * Returns 1 when the destination equals the state's id.daddr and the
 * source either equals the state's props.saddr or one of the two source
 * addresses is the any-address; returns 0 otherwise.
 */
static __inline__ int
__xfrm6_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        const struct in6_addr *flow_dst = (const struct in6_addr *)daddr;
        const struct in6_addr *flow_src = (const struct in6_addr *)saddr;
        const struct in6_addr *state_dst = (const struct in6_addr *)&x->id.daddr;
        const struct in6_addr *state_src = (const struct in6_addr *)&x->props.saddr;

        if (!ipv6_addr_equal(flow_dst, state_dst))
                return 0;
        if (ipv6_addr_equal(flow_src, state_src))
                return 1;

        return ipv6_addr_any(flow_src) || ipv6_addr_any(state_src);
}

/* Family dispatcher for the state address match.
 *
 * Returns 1 when state @x matches (@daddr, @saddr) under the rules of the
 * per-family helper and 0 otherwise; an unknown family never matches.
 */
static __inline__ int
xfrm_state_addr_check(const struct xfrm_state *x,
                      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                      unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_check(x, daddr, saddr);
        if (family == AF_INET6)
                return __xfrm6_state_addr_check(x, daddr, saddr);

        return 0;
}

/* Like xfrm_state_addr_check(), but takes the address pair from flow @fl.
 *
 * The flow's per-family daddr/saddr fields are viewed as xfrm_address_t
 * and passed to the per-family helper.  An unknown family never matches.
 */
static __inline__ int
xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl,
                           unsigned short family)
{
        const xfrm_address_t *dst;
        const xfrm_address_t *src;

        if (family == AF_INET) {
                dst = (const xfrm_address_t *)&fl->u.ip4.daddr;
                src = (const xfrm_address_t *)&fl->u.ip4.saddr;
                return __xfrm4_state_addr_check(x, dst, src);
        }

        if (family == AF_INET6) {
                dst = (const xfrm_address_t *)&fl->u.ip6.daddr;
                src = (const xfrm_address_t *)&fl->u.ip6.saddr;
                return __xfrm6_state_addr_check(x, dst, src);
        }

        return 0;
}

/* Return the current value of @x's tunnel_users counter.
 * NOTE(review): by its name, a non-zero value marks a state that is in use
 * by the kernel itself - confirm at the callers.
 */
static inline int xfrm_state_kern(const struct xfrm_state *x)
{
        return atomic_read(&x->tunnel_users);
}

/* Report whether @proto is a protocol number accepted in an xfrm id.
 *
 * AH, ESP and IPCOMP are always accepted; the IPv6 routing and
 * destination-options headers are accepted only when IPv6 support is
 * enabled.
 */
static inline bool xfrm_id_proto_valid(u8 proto)
{
        if (proto == IPPROTO_AH || proto == IPPROTO_ESP ||
            proto == IPPROTO_COMP)
                return true;
#if IS_ENABLED(CONFIG_IPV6)
        if (proto == IPPROTO_ROUTING || proto == IPPROTO_DSTOPTS)
                return true;
#endif
        return false;
}

/* Match @proto against the user-supplied selector @userproto.
 *
 * 0 matches every protocol and an exact value matches itself.
 * IPSEC_PROTO_ANY additionally matches only the three IPsec protocols
 * (AH, ESP, IPCOMP).  Returns 1 on match, 0 otherwise.
 */
static inline int xfrm_id_proto_match(u8 proto, u8 userproto)
{
        if (!userproto || proto == userproto)
                return 1;
        if (userproto != IPSEC_PROTO_ANY)
                return 0;

        return proto == IPPROTO_AH ||
               proto == IPPROTO_ESP ||
               proto == IPPROTO_COMP;
}

/*
 * xfrm algorithm information
 */
/* Per-class algorithm details; exactly one of these is stored in the
 * uinfo union of struct xfrm_algo_desc.
 * NOTE(review): the field meanings below are inferred from their names -
 * confirm against the algorithm tables that fill them in.
 */

/* AEAD algorithms. */
struct xfrm_algo_aead_info {
        char *geniv;                /* presumably the IV generator name */
        u16 icv_truncbits;        /* presumably the truncated ICV length in bits */
};

/* Authentication algorithms. */
struct xfrm_algo_auth_info {
        u16 icv_truncbits;        /* presumably the truncated ICV length in bits */
        u16 icv_fullbits;        /* presumably the full ICV length in bits */
};

/* Encryption algorithms. */
struct xfrm_algo_encr_info {
        char *geniv;                /* presumably the IV generator name */
        u16 blockbits;                /* presumably the block size in bits */
        u16 defkeybits;                /* presumably the default key length in bits */
};

/* Compression algorithms. */
struct xfrm_algo_comp_info {
        u16 threshold;                /* presumably the minimum size worth compressing */
};

/* Description of one algorithm known to xfrm.  Pointers to this type are
 * returned by the xfrm_*alg_get_by*() / xfrm_aead_get_byname() lookups
 * declared in this header.
 */
struct xfrm_algo_desc {
        char *name;
        char *compat;                /* NOTE(review): presumably an alternative (legacy) name - confirm */
        u8 available:1;
        u8 pfkey_supported:1;        /* see xfrm_count_pfkey_*_supported() */
        /* Class-specific details; which member is valid depends on the
         * algorithm class the descriptor belongs to.
         */
        union {
                struct xfrm_algo_aead_info aead;
                struct xfrm_algo_auth_info auth;
                struct xfrm_algo_encr_info encr;
                struct xfrm_algo_comp_info comp;
        } uinfo;
        struct sadb_alg desc;
};

/* XFRM protocol handlers.  */

/* IPv4 protocol handler, registered with xfrm4_protocol_register().
 * Handlers are chained through the RCU-annotated ->next pointer.
 */
struct xfrm4_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm4_protocol __rcu *next;
        int priority;                /* NOTE(review): presumably orders the handler chain - confirm */
};

/* IPv6 protocol handler, registered with xfrm6_protocol_register().
 * Same layout as struct xfrm4_protocol except for the signature of
 * ->err_handler.  Handlers are chained through the RCU-annotated ->next.
 */
struct xfrm6_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);

        struct xfrm6_protocol __rcu *next;
        int priority;                /* NOTE(review): presumably orders the handler chain - confirm */
};

/* XFRM tunnel handlers.  */

/* IPv4 tunnel handler, registered with xfrm4_tunnel_register().
 * Handlers are chained through the RCU-annotated ->next pointer.
 */
struct xfrm_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm_tunnel __rcu *next;
        int priority;                /* NOTE(review): presumably orders the handler chain - confirm */
};

/* IPv6 tunnel handler, registered with xfrm6_tunnel_register().
 * Handlers are chained through the RCU-annotated ->next pointer.
 */
struct xfrm6_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);
        struct xfrm6_tunnel __rcu *next;
        int priority;                /* NOTE(review): presumably orders the handler chain - confirm */
};

/* Subsystem initialisation and teardown entry points (declarations only;
 * the definitions are not in this part of the file).
 */
void xfrm_init(void);
void xfrm4_init(void);
int xfrm_state_init(struct net *net);
void xfrm_state_fini(struct net *net);
void xfrm4_state_init(void);
void xfrm4_protocol_init(void);
#ifdef CONFIG_XFRM
int xfrm6_init(void);
void xfrm6_fini(void);
int xfrm6_state_init(void);
void xfrm6_state_fini(void);
int xfrm6_protocol_init(void);
void xfrm6_protocol_fini(void);
#else
/* Without CONFIG_XFRM, xfrm6_init() reports success (0) and xfrm6_fini()
 * does nothing.
 */
static inline int xfrm6_init(void)
{
        return 0;
}
static inline void xfrm6_fini(void)
{
        ;
}
#endif

/* Per-namespace proc entries; declared only with CONFIG_XFRM_STATISTICS. */
#ifdef CONFIG_XFRM_STATISTICS
int xfrm_proc_init(struct net *net);
void xfrm_proc_fini(struct net *net);
#endif

/* Per-namespace sysctl setup.  xfrm_sysctl_init() is declared
 * unconditionally; xfrm_sysctl_fini() is an empty inline when
 * CONFIG_SYSCTL is not set.
 */
int xfrm_sysctl_init(struct net *net);
#ifdef CONFIG_SYSCTL
void xfrm_sysctl_fini(struct net *net);
#else
static inline void xfrm_sysctl_fini(struct net *net)
{
}
#endif

/* xfrm_state walking, allocation, lookup and insertion entry points.
 * Declarations only; semantics must be taken from the definitions, which
 * are not in this part of the file.
 */
void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
                          struct xfrm_address_filter *filter);
int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
                    int (*func)(struct xfrm_state *, int, void*), void *);
void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net);
struct xfrm_state *xfrm_state_alloc(struct net *net);
void xfrm_state_free(struct xfrm_state *x);
struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr,
                                   const xfrm_address_t *saddr,
                                   const struct flowi *fl,
                                   struct xfrm_tmpl *tmpl,
                                   struct xfrm_policy *pol, int *err,
                                   unsigned short family, u32 if_id);
struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
                                       xfrm_address_t *daddr,
                                       xfrm_address_t *saddr,
                                       unsigned short family,
                                       u8 mode, u8 proto, u32 reqid);
struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
                                              unsigned short family);
int xfrm_state_check_expire(struct xfrm_state *x);
void xfrm_state_insert(struct xfrm_state *x);
int xfrm_state_add(struct xfrm_state *x);
int xfrm_state_update(struct xfrm_state *x);
struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark,
                                     const xfrm_address_t *daddr, __be32 spi,
                                     u8 proto, unsigned short family);
struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark,
                                            const xfrm_address_t *daddr,
                                            const xfrm_address_t *saddr,
                                            u8 proto,
                                            unsigned short family);
/* Template / state sorting.  Real implementations exist only with
 * CONFIG_XFRM_SUB_POLICY.  Without it both helpers are empty inlines;
 * note that the stubs write nothing to the destination array.
 */
#ifdef CONFIG_XFRM_SUB_POLICY
void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
                    unsigned short family);
void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
                     unsigned short family);
#else
static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s,
                                  int n, unsigned short family)
{
}

static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s,
                                   int n, unsigned short family)
{
}
#endif

/* State-database counters; the type of the output argument of
 * xfrm_sad_getinfo().
 */
struct xfrmk_sadinfo {
        u32 sadhcnt; /* current hash bkts */
        u32 sadhmcnt; /* max allowed hash bkts */
        u32 sadcnt; /* current running count */
};

/* Policy-database counters; the type of the output argument of
 * xfrm_spd_getinfo().
 * NOTE(review): field meanings are inferred from the names (in/out/fwd
 * policy counts, their per-socket variants, hash bucket counts) - confirm
 * at the definition of xfrm_spd_getinfo().
 */
struct xfrmk_spdinfo {
        u32 incnt;
        u32 outcnt;
        u32 fwdcnt;
        u32 inscnt;
        u32 outscnt;
        u32 fwdscnt;
        u32 spdhcnt;
        u32 spdhmcnt;
};

/* State maintenance, replay, and generic input/output entry points.
 * Declarations only; the definitions are not in this part of the file.
 */
struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
int xfrm_state_delete(struct xfrm_state *x);
int xfrm_state_flush(struct net *net, u8 proto, bool task_valid);
int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
int xfrm_init_replay(struct xfrm_state *x);
u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
int xfrm_init_state(struct xfrm_state *x);
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
                         int (*finish)(struct net *, struct sock *,
                                       struct sk_buff *));
int xfrm_trans_queue(struct sk_buff *skb,
                     int (*finish)(struct net *, struct sock *,
                                   struct sk_buff *));
int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);

/* Only declared when the packet generator is built in or as a module. */
#if IS_ENABLED(CONFIG_NET_PKTGEN)
int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
#endif

/* IPv4 receive-side entry points. */
void xfrm_local_error(struct sk_buff *skb, int mtu);
int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb);
int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm4_transport_finish(struct sk_buff *skb, int async);
int xfrm4_rcv(struct sk_buff *skb);
int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq);

/* IPv4 receive entry for a packet whose SPI is already known.
 *
 * Prepares the skb control block - no ip4 tunnel, family AF_INET, and the
 * offset of the destination address inside struct iphdr - and then hands
 * the packet to xfrm_input() with encap_type 0.  Returns whatever
 * xfrm_input() returns.
 */
static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
{
        XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
        XFRM_SPI_SKB_CB(skb)->family = AF_INET;
        XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
        return xfrm_input(skb, nexthdr, spi, 0);
}

/* IPv4 output and handler (de)registration, followed by the IPv6
 * receive, registration, tunnel-SPI and output entry points.
 * Declarations only; the definitions are not in this part of the file.
 */
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb);
int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
void xfrm4_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb);
int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
                  struct ip6_tnl *t);
int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm6_transport_finish(struct sk_buff *skb, int async);
int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t);
int xfrm6_rcv(struct sk_buff *skb);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
                     xfrm_address_t *saddr, u8 proto);
void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family);
__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr);
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb);
int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
                          u8 **prevhdr);

/* Entry points that exist only with CONFIG_XFRM.  Without it,
 * xfrm_user_policy() is a stub that rejects the socket option with
 * -ENOPROTOOPT.
 */
#ifdef CONFIG_XFRM
void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu);
int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval,
                     int optlen);
#else
static inline int xfrm_user_policy(struct sock *sk, int optname,
                                   sockptr_t optval, int optlen)
{
         return -ENOPROTOOPT;
}
#endif

/* Route lookup, policy allocation/walking/lookup/insertion, and SPI /
 * acquire helpers.  Declarations only; the definitions are not in this
 * part of the file.
 */
struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params);

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);

void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type);
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *);
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net);
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net,
                                          const struct xfrm_mark *mark,
                                          u32 if_id, u8 type, int dir,
                                          struct xfrm_selector *sel,
                                          struct xfrm_sec_ctx *ctx, int delete,
                                          int *err);
struct xfrm_policy *xfrm_policy_byid(struct net *net,
                                     const struct xfrm_mark *mark, u32 if_id,
                                     u8 type, int dir, u32 id, int delete,
                                     int *err);
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
void xfrm_policy_hash_rebuild(struct net *net);
u32 xfrm_get_acqseq(void);
int verify_spi_info(u8 proto, u32 min, u32 max);
int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
                                 u8 mode, u32 reqid, u32 if_id, u8 proto,
                                 const xfrm_address_t *daddr,
                                 const xfrm_address_t *saddr, int create,
                                 unsigned short family);
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);

/* Migration entry points; declared only with CONFIG_XFRM_MIGRATE.
 * Declarations only; the definitions are not in this part of the file.
 */
#ifdef CONFIG_XFRM_MIGRATE
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
               const struct xfrm_migrate *m, int num_bundles,
               const struct xfrm_kmaddress *k,
               const struct xfrm_encap_tmpl *encap);
struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net,
                                                u32 if_id);
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
                                      struct xfrm_migrate *m,
                                      struct xfrm_encap_tmpl *encap);
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_bundles,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id);
#endif

/* Key-manager notifications (declarations only). */
int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid);
int km_report(struct net *net, u8 proto, struct xfrm_selector *sel,
              xfrm_address_t *addr);

void xfrm_input_init(void);
/* NOTE(review): xfrm_parse_spi() is also declared earlier in this header
 * with the same signature; this repeat is redundant but harmless.
 */
int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq);

/* Algorithm table probing and lookup by index, id or name; the lookups
 * return a struct xfrm_algo_desc pointer (declarations only).
 */
void xfrm_probe_algs(void);
int xfrm_count_pfkey_auth_supported(void);
int xfrm_count_pfkey_enc_supported(void);
struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len,
                                            int probe);

/* Compare @a and @b as IPv6 addresses: both are viewed as struct in6_addr
 * and handed to ipv6_addr_equal().
 */
static inline bool xfrm6_addr_equal(const xfrm_address_t *a,
                                    const xfrm_address_t *b)
{
        return ipv6_addr_equal((const struct in6_addr *)a,
                               (const struct in6_addr *)b);
}

/* Compare two xfrm addresses of the given @family.
 *
 * AF_INET6 compares the full IPv6 address.  AF_INET and every other
 * family compare only the 32-bit a4 word.
 */
static inline bool xfrm_addr_equal(const xfrm_address_t *a,
                                   const xfrm_address_t *b,
                                   sa_family_t family)
{
        if (family == AF_INET6)
                return xfrm6_addr_equal(a, b);

        /* AF_INET and the catch-all case share the a4 comparison. */
        return ((__force u32)a->a4 ^ (__force u32)b->a4) == 0;
}

/* Extract the direction from a policy index: the low three bits of @index. */
static inline int xfrm_policy_id2dir(u32 index)
{
        return index & 7;
}

#ifdef CONFIG_XFRM
/* Report whether anyone listens on the XFRMNLGRP_AEVENTS netlink group of
 * @net.
 *
 * The namespace's xfrm netlink socket is looked up inside an RCU
 * read-side section; with no socket the answer is 0, otherwise it is the
 * result of netlink_has_listeners().
 */
static inline int xfrm_aevent_is_on(struct net *net)
{
        struct sock *nlsk;
        int listening;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listening = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS) : 0;
        rcu_read_unlock();

        return listening;
}

/* Report whether anyone listens on the XFRMNLGRP_ACQUIRE netlink group of
 * @net.
 *
 * The namespace's xfrm netlink socket is looked up inside an RCU
 * read-side section; with no socket the answer is 0, otherwise it is the
 * result of netlink_has_listeners().
 */
static inline int xfrm_acquire_is_on(struct net *net)
{
        struct sock *nlsk;
        int listening;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listening = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE) : 0;
        rcu_read_unlock();

        return listening;
}
#endif

static inline unsigned int aead_len(struct xfrm_algo_aead *alg)
{
        return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
}

/* Total size in bytes of an algorithm blob: sizeof(*alg) plus
 * alg_key_len / 8 rounded up (alg_key_len is presumably a bit count).
 */
static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg)
{
        return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
}

/* Total size in bytes of an authentication algorithm blob: sizeof(*alg)
 * plus alg_key_len / 8 rounded up (alg_key_len is presumably a bit count).
 */
static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg)
{
        return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
}

static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn)
{
        return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32);
}

#ifdef CONFIG_XFRM_MIGRATE
/* Duplicate the ESN replay state of @orig into @x.
 *
 * Both orig->replay_esn and orig->preplay_esn are copied with kmemdup()
 * using GFP_KERNEL, sized by xfrm_replay_state_esn_len(); both source
 * pointers are dereferenced, so they must be non-NULL.
 *
 * Returns 0 on success or -ENOMEM.  If the second copy fails,
 * x->replay_esn stays allocated; NOTE(review): the caller is presumably
 * expected to release it together with @x - confirm.
 */
static inline int xfrm_replay_clone(struct xfrm_state *x,
                                     struct xfrm_state *orig)
{
        struct xfrm_replay_state_esn *src;

        src = orig->replay_esn;
        x->replay_esn = kmemdup(src, xfrm_replay_state_esn_len(src),
                                GFP_KERNEL);
        if (!x->replay_esn)
                return -ENOMEM;

        src = orig->preplay_esn;
        x->preplay_esn = kmemdup(src, xfrm_replay_state_esn_len(src),
                                 GFP_KERNEL);
        if (!x->preplay_esn)
                return -ENOMEM;

        return 0;
}

/* Return a kmemdup() copy (GFP_KERNEL) of @orig sized by aead_len(), or
 * NULL if the allocation fails.  @orig is dereferenced and must be non-NULL.
 */
static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig)
{
        return kmemdup(orig, aead_len(orig), GFP_KERNEL);
}


/* Return a kmemdup() copy (GFP_KERNEL) of @orig sized by xfrm_alg_len(),
 * or NULL if the allocation fails.  @orig is dereferenced and must be
 * non-NULL.
 */
static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig)
{
        return kmemdup(orig, xfrm_alg_len(orig), GFP_KERNEL);
}

/* Return a kmemdup() copy (GFP_KERNEL) of @orig sized by
 * xfrm_alg_auth_len(), or NULL if the allocation fails.  @orig is
 * dereferenced and must be non-NULL.
 */
static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig)
{
        return kmemdup(orig, xfrm_alg_auth_len(orig), GFP_KERNEL);
}

/* Call xfrm_state_put() on each of the first @n entries of @states,
 * in array order.  Does nothing when @n is zero or negative.
 */
static inline void xfrm_states_put(struct xfrm_state **states, int n)
{
        int idx;

        for (idx = 0; idx < n; idx++) {
                struct xfrm_state *x = states[idx];

                xfrm_state_put(x);
        }
}

/* Call xfrm_state_delete() on each of the first @n entries of @states,
 * in array order.  Return values are ignored.  Does nothing when @n is
 * zero or negative.
 */
static inline void xfrm_states_delete(struct xfrm_state **states, int n)
{
        int idx;

        for (idx = 0; idx < n; idx++) {
                struct xfrm_state *x = states[idx];

                xfrm_state_delete(x);
        }
}
#endif

#ifdef CONFIG_XFRM
/*
 * Return the last entry (index len - 1) of the xvec array in the skb's
 * sec_path.  The sec_path is dereferenced without a NULL check, so the skb
 * must have one.
 */
static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
{
        struct sec_path *path = skb_sec_path(skb);
        struct xfrm_state **last = &path->xvec[path->len - 1];

        return *last;
}
#endif

/*
 * Return the last offload entry (ovec[olen - 1]) of the skb's sec_path.
 *
 * NULL is returned when the skb has no sec_path, when olen is zero, when
 * len and olen differ, or when CONFIG_XFRM is not enabled.
 */
static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        struct sec_path *path = skb_sec_path(skb);

        if (path && path->olen && path->len == path->olen)
                return &path->ovec[path->olen - 1];
#endif
        return NULL;
}

/* Init-time hook, declared regardless of CONFIG_XFRM_OFFLOAD; only the prototype appears here. */
void __init xfrm_dev_init(void);

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Device-offload entry points.  Only prototypes are given here; they exist
 * when CONFIG_XFRM_OFFLOAD is set, and the #else branch of this conditional
 * provides inline stubs with the same signatures.
 */
void xfrm_dev_resume(struct sk_buff *skb);
void xfrm_dev_backlog(struct softnet_data *sd);
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo);
bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);

static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;

        if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
                xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
}

/*
 * Decide whether the xfrm state attached to @dst qualifies for offload.
 *
 * Returns false when @dst has no xfrm state or the state has no
 * type_offload.  If the state has an offload handle, its offload device
 * must equal the device of xfrm_dst_path(dst).  In every accepted case the
 * child dst must not carry an xfrm state of its own.
 */
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        struct xfrm_state *x = dst->xfrm;
        struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

        if (!x || !x->type_offload)
                return false;

        /* a hardware-offloaded state is only usable on its own device */
        if (x->xso.offload_handle && x->xso.dev != xfrm_dst_path(dst)->dev)
                return false;

        return !xdst->child->xfrm;
}

static inline void xfrm_dev_state_delete(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;

        if (xso->dev)
                xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
}

/*
 * Detach @x from its offload device.
 *
 * Does nothing unless the state has an offload device with xfrmdev_ops.
 * Otherwise the optional xdo_dev_state_free() callback runs first, then
 * x->xso.dev is cleared and the device reference is dropped with dev_put().
 */
static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
        struct net_device *netdev = x->xso.dev;

        if (!netdev || !netdev->xfrmdev_ops)
                return;

        if (netdev->xfrmdev_ops->xdo_dev_state_free)
                netdev->xfrmdev_ops->xdo_dev_state_free(x);

        x->xso.dev = NULL;
        dev_put(netdev);
}
#else
/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing. */
static inline void xfrm_dev_resume(struct sk_buff *skb)
{
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing. */
static inline void xfrm_dev_backlog(struct softnet_data *sd)
{
}

/*
 * Stub for builds without CONFIG_XFRM_OFFLOAD: returns @skb unchanged and
 * never touches @again.
 */
static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
        return skb;
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing and returns 0. */
static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo)
{
        return 0;
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing. */
static inline void xfrm_dev_state_delete(struct xfrm_state *x)
{
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing. */
static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: offload is never acceptable. */
static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
        return false;
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: does nothing. */
static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
}

/* Stub for builds without CONFIG_XFRM_OFFLOAD: always returns false. */
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        return false;
}
#endif

/*
 * Load the mark from the XFRMA_MARK netlink attribute into @m.
 *
 * If the attribute is present, sizeof(struct xfrm_mark) bytes of its payload
 * are copied into @m; otherwise both fields of @m are zeroed.
 *
 * Returns m->v & m->m.
 */
static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m)
{
        const struct nlattr *attr = attrs[XFRMA_MARK];

        if (!attr) {
                m->m = 0;
                m->v = 0;
        } else {
                memcpy(m, nla_data(attr), sizeof(*m));
        }

        return m->v & m->m;
}

/*
 * Append @m to @skb as an XFRMA_MARK attribute, unless both the mask and
 * the value are zero, in which case nothing is emitted.
 *
 * Returns 0 when nothing was emitted, otherwise the result of nla_put().
 */
static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
{
        if (!(m->m | m->v))
                return 0;

        return nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m);
}

/*
 * Combine @mark with the state's props.smark: bits selected by the smark
 * mask come from the smark value, all remaining bits come from @mark.
 */
static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
{
        const struct xfrm_mark *smark = &x->props.smark;
        __u32 from_state = smark->v & smark->m;
        __u32 from_caller = mark & ~smark->m;

        return from_state | from_caller;
}

/*
 * Append @if_id to @skb as a u32 XFRMA_IF_ID attribute; a zero @if_id is
 * not emitted.
 *
 * Returns 0 when nothing was emitted, otherwise the result of nla_put_u32().
 */
static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id)
{
        if (!if_id)
                return 0;

        return nla_put_u32(skb, XFRMA_IF_ID, if_id);
}

/*
 * Check that a tunnel recorded in the skb control block is matched by a
 * tunnel-mode state.
 *
 * For AF_INET the tunnel.ip4 member of XFRM_TUNNEL_SKB_CB(skb) is consulted,
 * for AF_INET6 the tunnel.ip6 member; any other family counts as "no tunnel".
 *
 * Returns -EINVAL if a tunnel is recorded but x->outer_mode.flags lacks
 * XFRM_MODE_FLAG_TUNNEL, and 0 otherwise.
 */
static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
                                    unsigned int family)
{
        bool has_tunnel;

        if (family == AF_INET)
                has_tunnel = !!XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
        else if (family == AF_INET6)
                has_tunnel = !!XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
        else
                has_tunnel = false;

        if (!has_tunnel)
                return 0;

        return (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) ? 0 : -EINVAL;
}

/*
 * Constant tables defined elsewhere: one int per message type
 * (XFRM_NR_MSGTYPES entries) and one netlink attribute policy per attribute
 * number 0..XFRMA_MAX.
 */
extern const int xfrm_msg_min[XFRM_NR_MSGTYPES];
extern const struct nla_policy xfrma_policy[XFRMA_MAX+1];

/*
 * Callback table for translating between 32-bit (compat) and 64-bit
 * layouts of xfrm netlink messages and user policies.
 */
struct xfrm_translator {
        /* Allocate frag_list and put compat translation there */
        int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src);

        /* Allocate nlmsg with 64-bit translation of received 32-bit message */
        struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh,
                        int maxtype, const struct nla_policy *policy,
                        struct netlink_ext_ack *extack);

        /* Translate 32-bit user_policy from sockptr */
        int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen);

        /* NOTE(review): presumably the module that provides these callbacks - confirm */
        struct module *owner;
};

#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
/*
 * Translator registration and lookup, declared only when
 * CONFIG_XFRM_USER_COMPAT is enabled.  The #else branch stubs out the
 * get/put pair only; register/unregister have no stub.
 */
extern int xfrm_register_translator(struct xfrm_translator *xtr);
extern int xfrm_unregister_translator(struct xfrm_translator *xtr);
extern struct xfrm_translator *xfrm_get_translator(void);
extern void xfrm_put_translator(struct xfrm_translator *xtr);
#else
/* Stub for builds without CONFIG_XFRM_USER_COMPAT: there is never a translator. */
static inline struct xfrm_translator *xfrm_get_translator(void)
{
        return NULL;
}
/* Stub for builds without CONFIG_XFRM_USER_COMPAT: does nothing. */
static inline void xfrm_put_translator(struct xfrm_translator *xtr)
{
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Report the IPv6 "dontfrag" setting of a local socket.
 *
 * Returns inet6_sk(sk)->dontfrag for AF_INET6 sockets whose protocol is
 * IPPROTO_UDP or IPPROTO_RAW.  Returns false for a NULL socket, a socket of
 * another family, or any other protocol.
 */
static inline bool xfrm6_local_dontfrag(const struct sock *sk)
{
        if (!sk || sk->sk_family != AF_INET6)
                return false;

        switch (sk->sk_protocol) {
        case IPPROTO_UDP:
        case IPPROTO_RAW:
                return inet6_sk(sk)->dontfrag;
        default:
                return false;
        }
}
#endif
#endif        /* _NET_XFRM_H */















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
// SPDX-License-Identifier: GPL-2.0
/*
  File: fs/ext4/acl.h

  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/

#include <linux/posix_acl_xattr.h>

/* ACL format version number; NOTE(review): presumably the value stored in ext4_acl_header.a_version - confirm */
#define EXT4_ACL_VERSION        0x0001

/*
 * Full-size ACL entry: tag, permission bits and an id.  ext4_acl_size()
 * accounts for every entry past the fourth at this size.
 */
typedef struct {
        __le16                e_tag;
        __le16                e_perm;
        __le32                e_id;
} ext4_acl_entry;

/*
 * Short ACL entry: same leading fields as ext4_acl_entry but without e_id.
 * ext4_acl_size() accounts for up to four entries at this size.
 */
typedef struct {
        __le16                e_tag;
        __le16                e_perm;
} ext4_acl_entry_short;

/* Header preceding the ACL entries; its size is included by ext4_acl_size(). */
typedef struct {
        __le32                a_version;
} ext4_acl_header;

/*
 * Byte size of an ACL holding @count entries: the header, then up to four
 * short entries, then full-size entries for everything beyond the fourth.
 */
static inline size_t ext4_acl_size(int count)
{
        size_t size = sizeof(ext4_acl_header);

        if (count > 4) {
                size += 4 * sizeof(ext4_acl_entry_short);
                size += (count - 4) * sizeof(ext4_acl_entry);
        } else {
                size += count * sizeof(ext4_acl_entry_short);
        }

        return size;
}

/*
 * Inverse of ext4_acl_size(): derive the number of ACL entries from the
 * total byte size @size (header included).
 *
 * Up to four entries are taken to be short entries; anything beyond that
 * must be a whole number of full-size entries.
 *
 * Returns the entry count, or -1 if @size cannot be produced by
 * ext4_acl_size(): either it is smaller than the header or the payload is
 * not a whole number of entries.
 */
static inline int ext4_acl_count(size_t size)
{
        ssize_t s;

        /*
         * Reject undersized input explicitly: the unsigned subtraction
         * below would otherwise wrap around and could yield a bogus
         * positive count.
         */
        if (size < sizeof(ext4_acl_header))
                return -1;

        size -= sizeof(ext4_acl_header);
        /* s < 0 means the payload holds fewer than four short entries */
        s = size - 4 * sizeof(ext4_acl_entry_short);
        if (s < 0) {
                if (size % sizeof(ext4_acl_entry_short))
                        return -1;
                return size / sizeof(ext4_acl_entry_short);
        } else {
                if (s % sizeof(ext4_acl_entry))
                        return -1;
                return s / sizeof(ext4_acl_entry) + 4;
        }
}

#ifdef CONFIG_EXT4_FS_POSIX_ACL

/*
 * acl.c
 *
 * ACL entry points, declared when CONFIG_EXT4_FS_POSIX_ACL is set; the #else
 * branch below replaces them with NULL defines and an inline
 * ext4_init_acl().
 */
struct posix_acl *ext4_get_acl(struct inode *inode, int type);
int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);

#else  /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
/*
 * Without ACL support the get/set entry points expand to NULL.
 * NOTE(review): presumably so they can be stored directly in operation
 * tables as "not implemented" - confirm against the users.
 */
#define ext4_get_acl NULL
#define ext4_set_acl NULL

/*
 * ACL-less replacement for ext4_init_acl(): strip the current umask from
 * the new inode's mode.  @handle and @dir are unused.  Always returns 0.
 */
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
        /*
         * posix_acl_create() normally applies the umask, but it is never
         * called when ext4 ACL support is compiled out, so do it here.
         */
        const int mask = current_umask();

        inode->i_mode &= ~mask;

        return 0;
}
#endif  /* CONFIG_EXT4_FS_POSIX_ACL */






















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the RAW-IP module.
 *
 * Version:        @(#)raw.h        1.0.2        05/07/93
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _RAW_H
#define _RAW_H

#include <net/inet_sock.h>
#include <net/protocol.h>
#include <linux/icmp.h>

/* RAW-IP objects and entry points; only declarations appear in this header. */
extern struct proto raw_prot;

extern struct raw_hashinfo raw_v4_hashinfo;
/* Lookup keyed on protocol number, remote/local address and the dif/sdif interface indices. */
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
                             unsigned short num, __be32 raddr,
                             __be32 laddr, int dif, int sdif);

int raw_abort(struct sock *sk, int err);
void raw_icmp_error(struct sk_buff *, int, u32);
int raw_local_deliver(struct sk_buff *, int);

int raw_rcv(struct sock *, struct sk_buff *);

/* One hash bucket per possible protocol number (MAX_INET_PROTOS). */
#define RAW_HTABLE_SIZE        MAX_INET_PROTOS

/* Hash table of raw sockets: RAW_HTABLE_SIZE list heads plus a reader/writer lock. */
struct raw_hashinfo {
        rwlock_t lock;
        struct hlist_head ht[RAW_HTABLE_SIZE];
};

#ifdef CONFIG_PROC_FS
/* procfs setup/teardown, declared only with CONFIG_PROC_FS. */
int raw_proc_init(void);
void raw_proc_exit(void);

/*
 * Per-seq_file iteration state; raw_seq_private() returns seq->private as
 * this type.  NOTE(review): 'bucket' is presumably the hash bucket being
 * walked - confirm in the seq handlers.
 */
struct raw_iter_state {
        struct seq_net_private p;
        int bucket;
};

/* Return the seq_file's private pointer as a struct raw_iter_state. */
static inline struct raw_iter_state *raw_seq_private(struct seq_file *seq)
{
        struct raw_iter_state *state = seq->private;

        return state;
}
/* seq_file iterator callbacks (start/next/stop), declared only with CONFIG_PROC_FS. */
void *raw_seq_start(struct seq_file *seq, loff_t *pos);
void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void raw_seq_stop(struct seq_file *seq, void *v);
#endif

/* Socket hash/unhash and module init; only prototypes appear in this header. */
int raw_hash_sk(struct sock *sk);
void raw_unhash_sk(struct sock *sk);
void raw_init(void);

/*
 * Raw socket: an inet_sock extended with an ICMP filter and an ipmr table
 * id.  raw_sk() casts a struct sock pointer straight to this type, which is
 * why inet must stay first.
 */
struct raw_sock {
        /* inet_sock has to be the first member */
        struct inet_sock   inet;
        struct icmp_filter filter;
        u32                   ipmr_table;
};

/*
 * Reinterpret a struct sock pointer as the struct raw_sock that contains
 * it.  This is a plain cast (which also drops the const qualifier); no check
 * is made that @sk really is a raw socket.
 */
static inline struct raw_sock *raw_sk(const struct sock *sk)
{
        struct raw_sock *rs = (struct raw_sock *)sk;

        return rs;
}

/*
 * Wrapper around inet_bound_dev_eq() for raw sockets.
 *
 * With CONFIG_NET_L3_MASTER_DEV the first argument is the per-netns
 * sysctl_raw_l3mdev_accept value (read with READ_ONCE); without it the
 * argument is always true.
 */
static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
                                       int dif, int sdif)
{
        bool l3mdev_accept = true;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        l3mdev_accept = READ_ONCE(net->ipv4.sysctl_raw_l3mdev_accept);
#endif
        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

#endif        /* _RAW_H */



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __IEEE802154_CORE_H
#define __IEEE802154_CORE_H

#include <net/cfg802154.h>

/*
 * Core-private wrapper around a struct wpan_phy.  The wpan_phy is embedded
 * as the last member, so wpan_phy_to_rdev() can recover this structure with
 * container_of().
 */
struct cfg802154_registered_device {
        const struct cfg802154_ops *ops;
        struct list_head list;

        /* wpan_phy index, internal only */
        int wpan_phy_idx;

        /* also protected by devlist_mtx */
        int opencount;
        wait_queue_head_t dev_wait;

        /* protected by RTNL only */
        int num_running_ifaces;

        /* associated wpan interfaces, protected by rtnl or RCU */
        struct list_head wpan_dev_list;
        int devlist_generation, wpan_dev_id;

        /* must be last because of the way we do wpan_phy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN
         */
        struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN);
};

/*
 * Map an embedded wpan_phy back to the cfg802154_registered_device that
 * contains it.  A NULL @wpan_phy triggers BUG_ON().
 */
static inline struct cfg802154_registered_device *
wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
{
        struct cfg802154_registered_device *rdev;

        BUG_ON(!wpan_phy);
        rdev = container_of(wpan_phy, struct cfg802154_registered_device,
                            wpan_phy);

        return rdev;
}

/* List of registered devices and its generation counter; defined elsewhere. */
extern struct list_head cfg802154_rdev_list;
extern int cfg802154_rdev_list_generation;

int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
                           struct net *net);
/* free object */
void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
/* Lookups keyed on the wpan_phy index. */
struct cfg802154_registered_device *
cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx);
struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx);

#endif /* __IEEE802154_CORE_H */





































































































































































































































    9 


    9 




















    9 










    5 




    9 







    1 





















































































































































































    1 











    1 


























   10 





   10 
















   10 

   10 










   10 


    9 







    9 


    9 
    9 


    9 



    9 


























    1 


    1 



































    9 









    9 
    9 
    9 
    9 











    1 
    1 

    1 



    1 
















    1 













































































































































































    7 

    6 

    6 






































































































    1 
    1 

    1 




    6 




    8 















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>
#include <linux/init_task.h>

#include "internal.h"

/*
 * Upper limit on fd table size; expand_files() refuses fd numbers at or
 * above it and alloc_fdtable() caps new tables to it.  Defaults to 1024*1024.
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/* NOTE(review): presumably the lowest value sysctl_nr_open may be set to - confirm in the sysctl table */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and the number of pointers that fit in a size_t,
 * rounded down to a multiple of BITS_PER_LONG by the "& -BITS_PER_LONG".
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release an fdtable built by alloc_fdtable(): the fd array, the bitmap
 * area, and the fdtable itself.  open_fds is the base of the single bitmap
 * allocation (close_on_exec and full_fds_bits point into it), so freeing it
 * frees all three bitmaps.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        void *fd_array = fdt->fd;
        void *bitmaps = fdt->open_fds;

        kvfree(fd_array);
        kvfree(bitmaps);
        kfree(fdt);
}

/* RCU callback: free the fdtable that embeds @rcu. */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(fdt);
}

/*
 * Sizing of the second-level bitmap, which has one bit per long of an
 * nr-bit bitmap: BITBIT_NR() is its length in longs, BITBIT_SIZE() in bytes.
 * alloc_fdtable() uses BITBIT_SIZE() for the full_fds_bits part.
 */
#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Fill the three bitmaps of the new table from the old one and clear the
 * extra space if any.  The first 'copy_words' words of open_fds and
 * close_on_exec are copied, and the first 'copy_words' bits of
 * full_fds_bits (one bit per open_fds word).  This does not copy the file
 * pointers.  Called with the files spinlock held for write.
 */
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int copy_words)
{
        unsigned int total_words = fdt_words(nfdt);
        unsigned int copy_bits = copy_words * BITS_PER_LONG;
        unsigned int total_bits = total_words * BITS_PER_LONG;

        bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
                               copy_bits, total_bits);
        bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
                               copy_bits, total_bits);
        /* second-level bitmap: sizes are in words of the first level */
        bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
                               copy_words, total_words);
}

/*
 * Transfer every file pointer and all bitmap state from the old table into
 * the new, larger one, zeroing the part of the new fd array that has no
 * counterpart in the old table.  The new table must be at least as large as
 * the old one (enforced with BUG_ON).  Called with the files spinlock held
 * for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        size_t old_bytes, extra_bytes;

        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        old_bytes = ofdt->max_fds * sizeof(struct file *);
        extra_bytes = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);

        /* carry over the existing slots, then zero the newly added tail */
        memcpy(nfdt->fd, ofdt->fd, old_bytes);
        memset(nfdt->fd + ofdt->max_fds, 0, extra_bytes);

        copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}

/*
 * Allocate an fdtable with room for at least 'slots_wanted' descriptors.
 *
 * Returns the new table, ERR_PTR(-EMFILE) if the required size exceeds
 * sysctl_nr_open or would need an fd array larger than INT_MAX bytes, or
 * ERR_PTR(-ENOMEM) if any allocation fails.  The fd array and bitmaps are
 * not initialised here; see copy_fdtable() / copy_fd_bitmaps().
 *
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 */
static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
        struct fdtable *fdt;
        unsigned int nr;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.  Since we are called
         * only with slots_wanted > BITS_PER_LONG (embedded instance in
         * files->fdtab already gives BITS_PER_LONG slots), the above boils
         * down to
         * 1.  use the smallest power of two large enough to give us that many
         * slots.
         * 2.  on 32bit skip 64 and 128 - the minimal capacity we want there is
         * 256 slots (i.e. 1Kb fd array).
         * 3.  on 64bit don't skip anything, 1Kb fd array means 128 slots there
         * and we are never going to be asked for 64 or less.
         */
        if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
                nr = 256;
        else
                nr = roundup_pow_of_two(slots_wanted);
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open)) {
                nr = round_down(sysctl_nr_open, BITS_PER_LONG);
                if (nr < slots_wanted)
                        return ERR_PTR(-EMFILE);
        }

        /*
         * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
         * and kvmalloc() will warn if the allocation size is greater than
         * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
         *
         * This can happen when sysctl_nr_open is set to a very high value and
         * a process tries to use a file descriptor near that limit. For example,
         * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
         * systemd typically sets it to - then trying to use a file descriptor
         * close to that value will require allocating a file descriptor table
         * that exceeds 8GB in size.
         */
        if (unlikely(nr > INT_MAX / sizeof(struct file *)))
                return ERR_PTR(-EMFILE);

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        /*
         * One allocation holds all three bitmaps back to back: open_fds and
         * close_on_exec (nr bits each), then full_fds_bits (BITBIT_SIZE(nr)
         * bytes).  The request is never smaller than L1_CACHE_BYTES.
         */
        data = kvmalloc(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                                 GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_arr;
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        data += nr / BITS_PER_BYTE;
        fdt->full_fds_bits = data;

        return fdt;

        /* error unwinding: release in reverse order of allocation */
out_arr:
        kvfree(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return ERR_PTR(-ENOMEM);
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset,
 * large enough to hold descriptor number 'nr' (nr + 1 slots are requested).
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped while the new table is allocated.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        /* drop the lock across the allocation and synchronize_rcu() */
        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr + 1);

        /* make sure all fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        /* the lock is retaken before the error check so that it is held on every return */
        spin_lock(&files->file_lock);
        if (IS_ERR(new_fdt))
                return PTR_ERR(new_fdt);
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        rcu_assign_pointer(files->fdt, new_fdt);
        /* the table embedded in files_struct is not separately allocated, so never free it */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in fd_install() */
        smp_wmb();
        return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested descriptor
 * number 'nr' does not fit in the current table and there is room for
 * expansion.
 * Return <0 error code on error (-EMFILE when nr >= sysctl_nr_open, or the
 * error from expand_fdtable()); 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *fdt;
        int expanded = 0;

repeat:
        fdt = files_fdtable(files);

        /* Do we need to expand? */
        if (nr < fdt->max_fds)
                return expanded;

        /* Can we expand? */
        if (nr >= sysctl_nr_open)
                return -EMFILE;

        /*
         * Another resize is already running: wait for it without the lock,
         * then re-check from the top.  'expanded' is set because the lock
         * was dropped and we may have slept.
         */
        if (unlikely(files->resize_in_progress)) {
                spin_unlock(&files->file_lock);
                expanded = 1;
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
                goto repeat;
        }

        /* All good, so we try */
        files->resize_in_progress = true;
        expanded = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        /* let any waiters parked above re-evaluate */
        wake_up_all(&files->resize_wait);
        return expanded;
}

/* Set bit @fd in the table's close_on_exec bitmap (non-atomic __set_bit). */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec_map = fdt->close_on_exec;

        __set_bit(fd, cloexec_map);
}

/*
 * Clear bit @fd in the table's close_on_exec bitmap.  The bit is tested
 * first, so the bitmap is only written when the bit is actually set.
 */
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        if (!test_bit(fd, fdt->close_on_exec))
                return;

        __clear_bit(fd, fdt->close_on_exec);
}

/*
 * Mark @fd as open.  If that leaves every bit of the containing open_fds
 * word set, also set the word's bit in full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);
        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark @fd as closed and clear the full_fds_bits bit of its open_fds word,
 * since that word now has at least one free slot.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);
        __clear_bit(word, fdt->full_fds_bits);
}

/*
 * Return the number of descriptor slots up to and including the last
 * open_fds word that has any bit set, i.e. a multiple of BITS_PER_LONG.
 * The result is never less than BITS_PER_LONG, even when no fd is open.
 */
static unsigned int count_open_files(struct fdtable *fdt)
{
        unsigned int words = fdt->max_fds / BITS_PER_LONG;

        /* Drop trailing all-zero words to find the last one in use */
        while (words > 0 && !fdt->open_fds[words - 1])
                words--;

        if (words == 0)
                words = 1;

        return words * BITS_PER_LONG;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() path we can end
 * up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 *
 * The result is the smaller of count_open_files(fdt) and 'max_fds' (the
 * latter raised to at least NR_OPEN_DEFAULT), aligned up to BITS_PER_LONG.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
        unsigned int in_use = count_open_files(fdt);
        unsigned int limit = max_fds < NR_OPEN_DEFAULT ? NR_OPEN_DEFAULT : max_fds;

        return ALIGN(min(in_use, limit), BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 *
 * @oldf:    files_struct to copy from; oldf->file_lock is taken and released
 *           inside this function.
 * @max_fds: cap passed to sane_fdtable_size() when working out how many
 *           descriptor slots to copy.
 * @errorp:  receives a negative errno describing the failure.
 *
 * Every file copied into the new table has a reference taken via get_file().
 * The returned files_struct starts with a reference count of 1.
 *
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        /* Default error; replaced below if alloc_fdtable() reports another. */
        *errorp = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        /* Start with the fdtable and bitmaps embedded in newf itself. */
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, max_fds);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                /* The allocation below is done with oldf->file_lock dropped. */
                spin_unlock(&oldf->file_lock);

                /* Discard a table allocated by a previous pass of this loop. */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                new_fdt = alloc_fdtable(open_files);
                if (IS_ERR(new_fdt)) {
                        *errorp = PTR_ERR(new_fdt);
                        goto out_release;
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, max_fds);
        }

        /* Third argument is a count of bitmap words, not of descriptors. */
        copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /* i counts down, so (open_files - i) is the descriptor being copied. */
        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open().  So make sure that this
                         * fd is available to the new process.
                         */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        /* Publish the table only once it is fully populated. */
        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;

out_release:
        kmem_cache_free(files_cachep, newf);
out:
        return NULL;
}

/*
 * Close every file still installed in @files and return its fdtable so the
 * caller can free it.
 *
 * It is safe to dereference the fd table without RCU or ->file_lock
 * because this is the last reference to the files structure.
 */
static struct fdtable *close_files(struct files_struct * files)
{
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word;

        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned long pending = fdt->open_fds[word];
                unsigned int fd = word * BITS_PER_LONG;

                /* Walk the set bits of this open_fds word, lowest first */
                for (; pending; pending >>= 1, fd++) {
                        struct file *victim;

                        if (!(pending & 1))
                                continue;

                        /* Detach the slot before closing the file */
                        victim = xchg(&fdt->fd[fd], NULL);
                        if (!victim)
                                continue;

                        filp_close(victim, files);
                        cond_resched();
                }
        }

        return fdt;
}

struct files_struct *get_files_struct(struct task_struct *task)
{
        struct files_struct *files;

        task_lock(task);
        files = task->files;
        if (files)
                atomic_inc(&files->count);
        task_unlock(task);

        return files;
}

/*
 * Drop a reference to @files.
 *
 * When the count reaches zero all remaining files are closed, the fdtable
 * is freed if it is not the one embedded in @files, and @files itself is
 * returned to files_cachep.
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* free the arrays if they are not embedded */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);

        kmem_cache_free(files_cachep, files);
}

/*
 * Detach @tsk from its files_struct and drop the task's reference to it.
 * Does nothing if the task has no files_struct.
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;

        if (!files)
                return;

        /* Clear the pointer under task_lock() before releasing the reference */
        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(files);
}

/*
 * Statically initialised files_struct.
 *
 * It starts with one reference and uses its own embedded fdtable, fd array
 * and bitmaps, sized for NR_OPEN_DEFAULT descriptors.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the first clear bit in @fdt->open_fds at or after @start.
 *
 * full_fds_bits is consulted first so that bitmap words known to be
 * completely full are skipped.  Returns fdt->max_fds when full_fds_bits
 * shows no word with room at or after @start.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
        unsigned int nr_words = maxfd / BITS_PER_LONG;
        unsigned int word_start;

        /* First fd of the first not-full word at or after @start's word */
        word_start = find_next_zero_bit(fdt->full_fds_bits, nr_words,
                                        start / BITS_PER_LONG) * BITS_PER_LONG;
        if (word_start >= maxfd)
                return maxfd;

        if (start < word_start)
                start = word_start;

        return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

/*
 * allocate a file descriptor, mark it busy.
 *
 * @start: lowest descriptor number the caller will accept
 * @end:   exclusive upper bound; -EMFILE is returned when the candidate
 *         descriptor would be >= @end
 * @flags: if O_CLOEXEC is set the close-on-exec bit is set for the new
 *         descriptor, otherwise it is cleared
 *
 * Operates on current->files under its ->file_lock.  Returns the new
 * descriptor number, or a negative errno (-EMFILE, or whatever
 * expand_files() failed with).
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
        struct files_struct *files = current->files;
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        /* Re-read the table on every pass: expand_files() may have replaced it. */
        fdt = files_fdtable(files);
        fd = start;
        /* next_fd is used as a hint for where the search may begin. */
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (fd < fdt->max_fds)
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fd array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        /* Only advance the hint when the search started at or below it. */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Allocate the lowest available descriptor below @nofile, applying @flags
 * as alloc_fd() does.  Returns the descriptor or a negative errno.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        const unsigned int lowest = 0;

        return alloc_fd(lowest, nofile, flags);
}

/*
 * Allocate an unused descriptor for the current task, bounded by its
 * RLIMIT_NOFILE.  Returns the descriptor or a negative errno.
 */
int get_unused_fd_flags(unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Release descriptor number @fd in @files: clear its open bit and pull
 * the next_fd hint down if @fd is below it.  No locking is done here.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/*
 * Give back descriptor number @fd of the current task, taking
 * ->file_lock around the bitmap update.
 */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;

        spin_lock(&cur->file_lock);
        __put_unused_fd(cur, fd);
        spin_unlock(&cur->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() to do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 *
 * @fd:   descriptor slot in current->files to fill; the slot must be empty
 *        (BUG_ON otherwise).
 * @file: file to install.
 */

void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        rcu_read_lock_sched();

        /* Slow path: a resize is flagged, so install under ->file_lock. */
        if (unlikely(files->resize_in_progress)) {
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                BUG_ON(fdt->fd[fd] != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* Fast path: no lock taken, only the RCU-sched read section. */
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/*
 * Remove and return the file installed at @fd in @files.
 *
 * Under ->file_lock the slot is cleared and the descriptor number released.
 * Returns NULL when @fd is out of range or the slot is empty.  The caller
 * takes over whatever reference the table held on the returned file.
 */
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
        struct file *file = NULL;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (fd < fdt->max_fds) {
                /* Clamp the index under speculation before using it */
                fd = array_index_nospec(fd, fdt->max_fds);
                file = fdt->fd[fd];
                if (file) {
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                }
        }
        spin_unlock(&files->file_lock);

        return file;
}

/*
 * Close descriptor @fd of the current task.
 *
 * Returns -EBADF if nothing is installed at @fd, otherwise the result of
 * filp_close() on the removed file.
 */
int close_fd(unsigned fd)
{
        struct files_struct *files = current->files;
        struct file *file = pick_file(files, fd);

        return file ? filp_close(file, files) : -EBADF;
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  only CLOSE_RANGE_UNSHARE is accepted; any other bit yields -EINVAL.
 *          With CLOSE_RANGE_UNSHARE the descriptor table is unshared via
 *          unshare_fd() before anything is closed.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 *
 * Return: 0 on success, -EINVAL for unknown flags or @fd > @max_fd, or the
 * error returned by unshare_fd().
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
        unsigned int cur_max;
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~CLOSE_RANGE_UNSHARE)
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        rcu_read_lock();
        cur_max = files_fdtable(cur_fds)->max_fds;
        rcu_read_unlock();

        /* cap to last valid index into fdtable */
        cur_max--;

        if (flags & CLOSE_RANGE_UNSHARE) {
                int ret;
                unsigned int max_unshare_fds = NR_OPEN_MAX;

                /*
                 * If the requested range is greater than the current maximum,
                 * we're closing everything so only copy all file descriptors
                 * beneath the lowest file descriptor.
                 */
                if (max_fd >= cur_max)
                        max_unshare_fds = fd;

                ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
                if (ret)
                        return ret;

                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 */
                /* After the swap, cur_fds is the new table and fds the old one. */
                if (fds)
                        swap(cur_fds, fds);
        }

        max_fd = min(max_fd, cur_max);
        while (fd <= max_fd) {
                struct file *file;

                file = pick_file(cur_fds, fd++);
                if (!file)
                        continue;

                filp_close(file, cur_fds);
                cond_resched();
        }

        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/*
 * See close_fd_get_file() below, this variant assumes current->files->file_lock
 * is held.
 *
 * @fd:  descriptor to remove from current->files
 * @res: receives the removed file with an extra reference taken via
 *       get_file(), or NULL on failure
 *
 * Returns 0 on success, -ENOENT if @fd is out of range or nothing is
 * installed there.
 */
int __close_fd_get_file(unsigned int fd, struct file **res)
{
        struct files_struct *files = current->files;
        struct file *file;
        struct fdtable *fdt;

        fdt = files_fdtable(files);
        if (fd >= fdt->max_fds)
                goto out_err;
        /*
         * Clamp the index under speculation, as pick_file() and do_dup2()
         * already do, so a mispredicted bounds check cannot be used to read
         * past the fd array.
         */
        fd = array_index_nospec(fd, fdt->max_fds);
        file = fdt->fd[fd];
        if (!file)
                goto out_err;
        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);
        /* Extra reference for the caller, on top of the one the table held. */
        get_file(file);
        *res = file;
        return 0;
out_err:
        *res = NULL;
        return -ENOENT;
}

/*
 * Variant of close_fd() that gets a ref on the file for a later fput().
 * The caller must ensure that filp_close() is called on the file, and
 * then an fput().
 *
 * Takes current->files->file_lock around __close_fd_get_file() and
 * returns its result.
 */
int close_fd_get_file(unsigned int fd, struct file **res)
{
        struct files_struct *cur = current->files;
        int err;

        spin_lock(&cur->file_lock);
        err = __close_fd_get_file(fd, res);
        spin_unlock(&cur->file_lock);

        return err;
}

/*
 * Close every descriptor in @files whose close-on-exec bit is set.
 *
 * The close_on_exec bitmap is processed one word at a time: the word is
 * read and zeroed, then each set bit with a file installed has its slot
 * cleared and the file closed.  ->file_lock is dropped around each
 * filp_close() and retaken afterwards.
 */
void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (i = 0; ; i++) {
                unsigned long set;
                unsigned fd = i * BITS_PER_LONG;
                /* Re-read the table each word; the lock was dropped in between. */
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                fdt->close_on_exec[i] = 0;
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        if (!file)
                                continue;
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        /* filp_close() and cond_resched() run without the lock. */
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

/*
 * Look up @fd in @files under rcu_read_lock() and take @refs references
 * on the file found there.
 *
 * @mask: f_mode bits that disqualify the file; if any is set, NULL is
 *        returned and no reference is taken.
 * @refs: number of references requested from get_file_rcu_many().
 *
 * Returns the file with @refs references held, or NULL.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask, unsigned int refs)
{
        struct file *file;

        rcu_read_lock();
loop:
        file = files_lookup_fd_rcu(files, fd);
        if (file) {
                /* File object ref couldn't be taken.
                 * dup2() atomicity guarantee is the reason
                 * we loop to catch the new file (or NULL pointer)
                 */
                if (file->f_mode & mask)
                        file = NULL;
                else if (!get_file_rcu_many(file, refs))
                        goto loop;
                /*
                 * The slot changed after we took the references: drop them
                 * and retry with whatever is installed now.
                 */
                else if (files_lookup_fd_raw(files, fd) != file) {
                        fput_many(file, refs);
                        goto loop;
                }
        }
        rcu_read_unlock();

        return file;
}

/*
 * __fget_files() applied to the current task's descriptor table.
 */
static inline struct file *__fget(unsigned int fd, fmode_t mask,
                                  unsigned int refs)
{
        struct files_struct *files = current->files;

        return __fget_files(files, fd, mask, refs);
}

struct file *fget_many(unsigned int fd, unsigned int refs)
{
        return __fget(fd, FMODE_PATH, refs);
}

struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Take one reference on the file installed at @fd in @task's descriptor
 * table.  The lookup is done under task_lock(); returns NULL if the task
 * has no files_struct or nothing is installed at @fd.
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *file = NULL;

        task_lock(task);
        files = task->files;
        if (files)
                file = __fget_files(files, fd, 0, 1);
        task_unlock(task);

        return file;
}

/*
 * Return the file installed at @fd in @task's descriptor table without
 * taking a reference, or NULL.
 *
 * Must be called with rcu_read_lock held.
 */
struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *found = NULL;

        task_lock(task);
        files = task->files;
        if (files != NULL)
                found = files_lookup_fd_rcu(files, fd);
        task_unlock(task);

        return found;
}

/*
 * Find the first installed file in @task's descriptor table at or after
 * *@ret_fd.
 *
 * On return *@ret_fd holds the descriptor of the file found, or the value
 * at which the search stopped when NULL is returned.  No reference is taken.
 *
 * Must be called with rcu_read_lock held.
 */
struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
{
        struct files_struct *files;
        struct file *file = NULL;
        unsigned int fd = *ret_fd;

        task_lock(task);
        files = task->files;
        while (files && fd < files_fdtable(files)->max_fds) {
                file = files_lookup_fd_rcu(files, fd);
                if (file)
                        break;
                fd++;
        }
        task_unlock(task);

        *ret_fd = fd;
        return file;
}

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * @fd:   descriptor to look up in current->files
 * @mask: f_mode bits that disqualify the file
 *
 * Returns 0 if no suitable file is installed at @fd; otherwise the file
 * pointer cast to unsigned long, with FDPUT_FPUT or'ed in when a reference
 * was taken and must be dropped by the caller.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        /*
         * If another thread is concurrently calling close_fd() followed
         * by put_files_struct(), we must not observe the old table
         * entry combined with the new refcount - otherwise we could
         * return a file that is concurrently being freed.
         *
         * atomic_read_acquire() pairs with atomic_dec_and_test() in
         * put_files_struct().
         */
        if (atomic_read_acquire(&files->count) == 1) {
                /* Table is not shared: borrow the table's own reference. */
                file = files_lookup_fd_raw(files, fd);
                if (!file || unlikely(file->f_mode & mask))
                        return 0;
                return (unsigned long)file;
        } else {
                /* Table is shared: take a real reference. */
                file = __fget(fd, mask, 1);
                if (!file)
                        return 0;
                return FDPUT_FPUT | (unsigned long)file;
        }
}
unsigned long __fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        return (file->f_mode & FMODE_ATOMIC_POS) &&
                (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode));
}

/*
 * __fdget() that additionally takes file->f_pos_lock when
 * file_needs_f_pos_lock() says so.  In that case FDPUT_POS_UNLOCK is set
 * in the returned value.
 */
unsigned long __fdget_pos(unsigned int fd)
{
        unsigned long v = __fdget(fd);
        /* The low two bits of v carry flags, not pointer bits */
        struct file *file = (struct file *)(v & ~3);

        if (!file || !file_needs_f_pos_lock(file))
                return v;

        mutex_lock(&file->f_pos_lock);
        return v | FDPUT_POS_UNLOCK;
}

/*
 * Release @f->f_pos_lock, the mutex taken by __fdget_pos().
 */
void __f_unlock_pos(struct file *f)
{
        struct mutex *pos_lock = &f->f_pos_lock;

        mutex_unlock(pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Set (@flag non-zero) or clear (@flag zero) the close-on-exec bit of
 * descriptor @fd in the current task, under ->file_lock.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (!flag)
                __clear_close_on_exec(fd, fdt);
        else
                __set_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);
}

/*
 * Report the close-on-exec state of descriptor @fd in the current task.
 * The table is read under rcu_read_lock().
 */
bool get_close_on_exec(unsigned int fd)
{
        struct files_struct *files = current->files;
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, files_fdtable(files));
        rcu_read_unlock();

        return cloexec;
}

/*
 * Install @file at descriptor @fd in @files, replacing whatever is there.
 *
 * Entered with files->file_lock held; the lock is released on every
 * return path.  A reference is taken on @file via get_file().  Any file
 * previously installed at @fd is closed after the lock is dropped.
 *
 * @flags: O_CLOEXEC sets the close-on-exec bit for @fd, otherwise it is
 *         cleared.
 *
 * Returns @fd on success, or -EBUSY if @fd is reserved in the bitmap but
 * has no file installed yet.
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * We need to detect attempts to do dup2() over allocated but still
         * not finished descriptor.  NB: OpenBSD avoids that at the price of
         * extra work in their equivalent of fget() - they insert struct
         * file immediately after grabbing descriptor, mark it larval if
         * more work (e.g. actual opening) is needed and make sure that
         * fget() treats larval files as absent.  Potentially interesting,
         * but while extra work in fget() is trivial, locking implications
         * and amount of surgery on open()-related paths in VFS are not.
         * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
         * deadlocks in rather amusing ways, AFAICS.  All of that is out of
         * scope of POSIX or SUS, since neither considers shared descriptor
         * tables and this condition does not arise without those.
         */
        fdt = files_fdtable(files);
        /* No bounds check here: callers in this file call expand_files() first. */
        fd = array_index_nospec(fd, fdt->max_fds);
        tofree = fdt->fd[fd];
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);

        /* Close the displaced file only after the lock has been dropped. */
        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

/*
 * Make descriptor @fd of the current task refer to @file.
 *
 * A NULL @file means "close @fd" and returns close_fd()'s result.
 * Otherwise returns 0 on success, -EBADF if @fd is at or above
 * RLIMIT_NOFILE, or the negative error from expand_files() / do_dup2().
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return close_fd(fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0)) {
                spin_unlock(&files->file_lock);
                return err;
        }

        /* do_dup2() releases ->file_lock on all of its return paths */
        err = do_dup2(files, file, fd, flags);

        return err < 0 ? err : 0;
}

/**
 * __receive_fd() - Install received file into file descriptor table
 *
 * @fd: fd to install into (if negative, a new fd will be allocated)
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Installs a received file into the file descriptor table, with appropriate
 * checks and count updates. Optionally writes the fd number to userspace, if
 * @ufd is non-NULL.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
{
        const bool allocate = fd < 0;
        int new_fd = fd;
        int error;

        error = security_file_receive(file);
        if (error)
                return error;

        if (allocate) {
                new_fd = get_unused_fd_flags(o_flags);
                if (new_fd < 0)
                        return new_fd;
        }

        if (ufd) {
                error = put_user(new_fd, ufd);
                if (error) {
                        /* Only give back a descriptor we reserved ourselves */
                        if (allocate)
                                put_unused_fd(new_fd);
                        return error;
                }
        }

        if (allocate) {
                fd_install(new_fd, get_file(file));
        } else {
                error = replace_fd(new_fd, file, o_flags);
                if (error)
                        return error;
        }

        /* Bump the sock usage counts, if any. */
        __receive_sock(file);
        return new_fd;
}

/*
 * Common implementation behind the dup3() and dup2() system calls:
 * duplicate @oldfd onto @newfd in the current task.
 *
 * @flags: only O_CLOEXEC is permitted.
 *
 * Returns the value of do_dup2() on success, -EINVAL for bad flags or
 * @oldfd == @newfd, -EBADF if @newfd is at or above RLIMIT_NOFILE, if
 * @oldfd is not open, or if expand_files() fails with -EMFILE; any other
 * expand_files() error is returned as is.
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        int err = -EBADF;
        struct file *file;
        struct files_struct *files = current->files;

        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        /* Expand first, then look up @oldfd in the table as it is now. */
        err = expand_files(files, newfd);
        file = files_lookup_fd_locked(files, oldfd);
        /* A bad @oldfd takes precedence over an expand_files() failure. */
        if (unlikely(!file))
                goto Ebadf;
        if (unlikely(err < 0)) {
                if (err == -EMFILE)
                        goto Ebadf;
                goto out_unlock;
        }
        /* do_dup2() releases ->file_lock. */
        return do_dup2(files, file, newfd, flags);

Ebadf:
        err = -EBADF;
out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

/* dup3() system call entry point: forwards its arguments to ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        return ksys_dup3(oldfd, newfd, flags);
}

/*
 * dup2() system call.
 *
 * When @oldfd == @newfd nothing is duplicated: the call returns @oldfd if
 * it is open and -EBADF otherwise.  Every other case is ksys_dup3() with
 * no flags.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        struct files_struct *files;
        int retval;

        if (likely(newfd != oldfd))
                return ksys_dup3(oldfd, newfd, 0);

        /* corner case */
        files = current->files;
        retval = oldfd;

        rcu_read_lock();
        if (!files_lookup_fd_rcu(files, oldfd))
                retval = -EBADF;
        rcu_read_unlock();

        return retval;
}

/*
 * dup() system call: install the file behind @fildes at a newly allocated
 * descriptor.
 *
 * Returns the new descriptor, -EBADF if @fildes is not open, or the error
 * from get_unused_fd_flags().
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int ret;

        if (!file)
                return -EBADF;

        ret = get_unused_fd_flags(0);
        if (ret < 0) {
                /* No descriptor available: drop the reference fget_raw() took */
                fput(file);
                return ret;
        }

        /* fd_install() takes over the reference obtained above */
        fd_install(ret, file);
        return ret;
}

/*
 * Install @file at the lowest free descriptor >= @from in the current task.
 *
 * A new reference is taken on @file for the descriptor table.  Returns the
 * new descriptor, -EINVAL if @from is at or above RLIMIT_NOFILE, or the
 * error from alloc_fd().
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);
        int new_fd;

        if (from >= nofile)
                return -EINVAL;

        new_fd = alloc_fd(from, nofile, flags);
        if (new_fd < 0)
                return new_fd;

        get_file(file);
        fd_install(new_fd, file);
        return new_fd;
}

/*
 * Call @f(@p, file, fd) for every installed file in @files, starting at
 * descriptor @n and going upwards, with files->file_lock held throughout.
 *
 * Iteration stops at the first non-zero value returned by @f, and that
 * value is returned.  Returns 0 if @files is NULL or @f never returned
 * non-zero.
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        while (n < fdt->max_fds) {
                struct file *file;

                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (file) {
                        res = f(p, file, n);
                        if (res)
                                break;
                }
                n++;
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);







































































    6 




















    7 


    7 







































    7 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Access vector cache interface for object managers.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * An entry in the AVC.
 */
struct avc_entry;

struct task_struct;
struct inode;
struct sock;
struct sk_buff;

/*
 * AVC statistics
 *
 * A set of plain unsigned counters.
 * NOTE(review): where and how these are incremented is not visible in this
 * header; the per-field notes below are read off the field names only and
 * should be confirmed against the AVC implementation.
 */
struct avc_cache_stats {
        unsigned int lookups;           /* presumably cache lookups performed */
        unsigned int misses;            /* presumably lookups that missed */
        unsigned int allocations;       /* presumably entries allocated */
        unsigned int reclaims;          /* presumably entries reclaimed */
        unsigned int frees;             /* presumably entries freed */
};

/*
 * We only need this data after we have decided to send an audit message.
 *
 * The fields mirror the arguments of slow_avc_audit() declared later in this
 * header (state, ssid, tsid, tclass, requested, audited, denied, result).
 * NOTE(review): presumably filled in by slow_avc_audit() - confirm in avc.c.
 */
struct selinux_audit_data {
        u32 ssid;       /* source security identifier */
        u32 tsid;       /* target security identifier */
        u16 tclass;     /* target security class */
        u32 requested;  /* requested permissions */
        u32 audited;    /* permissions selected for auditing */
        u32 denied;     /* permissions that were denied */
        int result;     /* result of the permission check */
        struct selinux_state *state;
};

/*
 * AVC operations
 */

void __init avc_init(void);

/*
 * Decide which of the @requested permissions need an audit record.
 *
 * @requested: requested permissions
 * @avd:       access vector decision (allowed / auditallow / auditdeny)
 * @result:    result of the permission check
 * @auditdeny: extra auditdeny mask supplied by the caller, 0 for none
 * @deniedp:   receives the set of denied permissions
 *
 * Returns the set of permissions to audit; 0 means nothing to audit.
 */
static inline u32 avc_audit_required(u32 requested,
                              struct av_decision *avd,
                              int result,
                              u32 auditdeny,
                              u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                /*
                 * auditdeny is TRICKY!  Setting a bit in
                 * this field means that ANY denials should NOT be audited if
                 * the policy contains an explicit dontaudit rule for that
                 * permission.  Take notice that this is unrelated to the
                 * actual permissions that were denied.  As an example lets
                 * assume:
                 *
                 * denied == READ
                 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
                 * auditdeny & ACCESS == 1
                 *
                 * We will NOT audit the denial even though the denied
                 * permission was READ and the auditdeny checks were for
                 * ACCESS
                 */
                if (auditdeny && !(auditdeny & avd->auditdeny))
                        audited = 0;
                else
                        audited = denied & avd->auditdeny;
        } else if (result) {
                /* Nothing denied by @avd, but the check still failed */
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
        }

        *deniedp = denied;
        return audited;
}

/*
 * Out-of-line audit path.  avc_audit() calls this only after it has found
 * a non-empty set of permissions to audit and MAY_NOT_BLOCK is not set in
 * its flags.
 */
int slow_avc_audit(struct selinux_state *state,
                   u32 ssid, u32 tsid, u16 tclass,
                   u32 requested, u32 audited, u32 denied, int result,
                   struct common_audit_data *a);

/**
 * avc_audit - Audit the granting or denial of permissions.
 * @state: SELinux state, handed through to slow_avc_audit()
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions
 * @avd: access vector decisions
 * @result: result from avc_has_perm_noaudit
 * @a:  auxiliary audit data
 * @flags: VFS walk flags
 *
 * Audit the granting or denial of permissions in accordance
 * with the policy.  This function is typically called by
 * avc_has_perm() after a permission check, but it can also be
 * called directly by callers who use avc_has_perm_noaudit()
 * in order to separate the permission check from the auditing.
 * That separation is useful, for example, when the permission
 * check must be performed under a lock, so that the lock can be
 * released before calling the auditing code.
 *
 * Returns 0 when nothing has to be audited, -ECHILD when an audit
 * record is needed but @flags contains MAY_NOT_BLOCK, and otherwise
 * the value returned by slow_avc_audit().
 */
static inline int avc_audit(struct selinux_state *state,
                            u32 ssid, u32 tsid,
                            u16 tclass, u32 requested,
                            struct av_decision *avd,
                            int result,
                            struct common_audit_data *a,
                            int flags)
{
        u32 denied_perms;
        u32 audit_perms;

        audit_perms = avc_audit_required(requested, avd, result, 0,
                                         &denied_perms);
        if (likely(!audit_perms))
                return 0;

        /* fall back to ref-walk if we have to generate audit */
        return (flags & MAY_NOT_BLOCK) ?
                -ECHILD :
                slow_avc_audit(state, ssid, tsid, tclass,
                               requested, audit_perms, denied_perms, result,
                               a);
}

/*
 * Flag bits (distinct powers of two).
 * NOTE(review): presumably OR-ed into the @flags argument of
 * avc_has_perm_noaudit() below -- confirm against the implementation.
 */
#define AVC_STRICT 1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS 2        /* update extended permissions */
#define AVC_NONBLOCKING    4        /* non blocking */
/*
 * Permission check without auditing.  The avc_audit() documentation in this
 * header describes pairing this with a later avc_audit() call so that the
 * check and the auditing can be separated.
 */
int avc_has_perm_noaudit(struct selinux_state *state,
                         u32 ssid, u32 tsid,
                         u16 tclass, u32 requested,
                         unsigned flags,
                         struct av_decision *avd);

/*
 * Permission-check entry points; only declared here.  According to the
 * avc_audit() documentation, avc_has_perm() typically calls avc_audit()
 * after the permission check.
 */
int avc_has_perm(struct selinux_state *state,
                 u32 ssid, u32 tsid,
                 u16 tclass, u32 requested,
                 struct common_audit_data *auditdata);
int avc_has_perm_flags(struct selinux_state *state,
                       u32 ssid, u32 tsid,
                       u16 tclass, u32 requested,
                       struct common_audit_data *auditdata,
                       int flags);

int avc_has_extended_perms(struct selinux_state *state,
                           u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 perm, struct common_audit_data *ad);


u32 avc_policy_seqno(struct selinux_state *state);

/*
 * Callback event identifiers, each a distinct bit.
 * NOTE(review): presumably combined into the @events mask given to
 * avc_add_callback() and reported back through the callback's @event
 * argument -- confirm against the implementation.
 */
#define AVC_CALLBACK_GRANT                1
#define AVC_CALLBACK_TRY_REVOKE                2
#define AVC_CALLBACK_REVOKE                4
#define AVC_CALLBACK_RESET                8
#define AVC_CALLBACK_AUDITALLOW_ENABLE        16
#define AVC_CALLBACK_AUDITALLOW_DISABLE        32
#define AVC_CALLBACK_AUDITDENY_ENABLE        64
#define AVC_CALLBACK_AUDITDENY_DISABLE        128
#define AVC_CALLBACK_ADD_XPERMS                256

int avc_add_callback(int (*callback)(u32 event), u32 events);

/* Exported to selinuxfs */
struct selinux_avc;     /* opaque in this header; only used through pointers */
int avc_get_hash_stats(struct selinux_avc *avc, char *page);
unsigned int avc_get_cache_threshold(struct selinux_avc *avc);
void avc_set_cache_threshold(struct selinux_avc *avc,
                             unsigned int cache_threshold);

/* Attempt to free avc node cache */
void avc_disable(void);

/* Per-CPU statistics exist only when AVC statistics are configured in. */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif

#endif /* _SELINUX_AVC_H_ */






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SOCK_DIAG_H__
#define __SOCK_DIAG_H__

#include <linux/netlink.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>

struct sk_buff;
struct nlmsghdr;
struct sock;

/*
 * Set of sock_diag callbacks, registered and removed with
 * sock_diag_register() / sock_diag_unregister().
 *
 * NOTE(review): @family presumably selects the address family the handler
 * serves; when each callback is invoked is not visible in this header.
 */
struct sock_diag_handler {
        __u8 family;
        int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
        int (*get_info)(struct sk_buff *skb, struct sock *sk);
        int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh);
};

/* Handler registration; only declared here. */
int sock_diag_register(const struct sock_diag_handler *h);
void sock_diag_unregister(const struct sock_diag_handler *h);

/* The inet-compat hook takes a bare callback instead of a handler struct. */
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));

/* Raw cookie generator; sock_gen_cookie() calls it with preemption disabled. */
u64 __sock_gen_cookie(struct sock *sk);

/*
 * Return the value produced by __sock_gen_cookie() for @sk.  The helper is
 * invoked with preemption disabled; preemption is enabled again before the
 * result is returned.
 */
static inline u64 sock_gen_cookie(struct sock *sk)
{
        u64 val;

        preempt_disable();
        val = __sock_gen_cookie(sk);
        preempt_enable();
        return val;
}

/*
 * Cookie helpers working on a caller-provided __u32 buffer.
 * NOTE(review): the buffer presumably holds the 64-bit cookie as two 32-bit
 * words -- confirm against the implementation.
 */
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);

/* Attribute emitters writing into @skb; only declared here. */
int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
                             struct sk_buff *skb, int attrtype);

/*
 * Map a socket to its destroy-notification netlink group.
 *
 * Only non-raw AF_INET and AF_INET6 sockets using IPPROTO_TCP or
 * IPPROTO_UDP have a group; every other combination yields SKNLGRP_NONE.
 */
static inline
enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
{
        const int family = sk->sk_family;
        const int is_ipv4 = (family == AF_INET);

        /* Families other than IPv4/IPv6 never have a destroy group. */
        if (!is_ipv4 && family != AF_INET6)
                return SKNLGRP_NONE;

        if (sk->sk_type == SOCK_RAW)
                return SKNLGRP_NONE;

        switch (sk->sk_protocol) {
        case IPPROTO_TCP:
                return is_ipv4 ? SKNLGRP_INET_TCP_DESTROY :
                                 SKNLGRP_INET6_TCP_DESTROY;
        case IPPROTO_UDP:
                return is_ipv4 ? SKNLGRP_INET_UDP_DESTROY :
                                 SKNLGRP_INET6_UDP_DESTROY;
        default:
                return SKNLGRP_NONE;
        }
}

static inline
bool sock_diag_has_destroy_listeners(const struct sock *sk)
{
        const struct net *n = sock_net(sk);
        const enum sknetlink_groups group = sock_diag_destroy_group(sk);

        return group != SKNLGRP_NONE && n->diag_nlsk &&
                netlink_has_listeners(n->diag_nlsk, group);
}
/* Destroy notification and destroy request; only declared here. */
void sock_diag_broadcast_destroy(struct sock *sk);

int sock_diag_destroy(struct sock *sk, int err);
#endif




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
 *  Copyright 2005-2006 Ian Kent <raven@themaw.net>
 */

/* Internal header file for autofs */

#include <linux/auto_fs.h>
#include <linux/auto_dev-ioctl.h>

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/magic.h>

/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
#define AUTOFS_IOC_COUNT     32

/* Same idea for the autofs device-node ioctls. */
#define AUTOFS_DEV_IOCTL_IOC_FIRST        (AUTOFS_DEV_IOCTL_VERSION)
#define AUTOFS_DEV_IOCTL_IOC_COUNT \
        (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)

/*
 * Replace any earlier pr_fmt so that every pr_*() message from this module
 * is prefixed with "<modname>:pid:<pid>:<function>: ".
 */
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__

extern struct file_system_type autofs_fs_type;

/*
 * Unified info structure.  This is pointed to by both the dentry and
 * inode structures.  Each file in the filesystem has an instance of this
 * structure.  It holds a reference to the dentry, so dentries are never
 * flushed while the file exists.  All name lookups are dealt with at the
 * dentry level, although the filesystem can interfere in the validation
 * process.  Readdir is implemented by traversing the dentry lists.
 */
struct autofs_info {
        struct dentry        *dentry;
        struct inode        *inode;

        int                flags;  /* presumably AUTOFS_INF_* bits -- confirm */

        struct completion expire_complete;

        /* presumably linked on autofs_sb_info.active_list -- confirm */
        struct list_head active;

        /* linked on autofs_sb_info.expiring_list by autofs_add_expiring() */
        struct list_head expiring;

        struct autofs_sb_info *sbi;
        unsigned long last_used;
        int count;

        kuid_t uid;
        kgid_t gid;
        struct rcu_head rcu;
};

#define AUTOFS_INF_EXPIRING        (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_WANT_EXPIRE        (1<<1) /* the dentry is being considered
                                        * for expiry, so RCU_walk is
                                        * not permitted.  If it progresses to
                                        * actual expiry attempt, the flag is
                                        * not cleared when EXPIRING is set -
                                        * in that case it gets cleared only
                                        * when it comes to clearing EXPIRING.
                                        */
#define AUTOFS_INF_PENDING        (1<<2) /* dentry pending mount */

/*
 * One entry in the singly linked list of waits that starts at
 * autofs_sb_info.queues and is chained through @next.
 */
struct autofs_wait_queue {
        wait_queue_head_t queue;
        struct autofs_wait_queue *next;
        autofs_wqt_t wait_queue_token;
        /* We use the following to see what we are waiting for */
        struct qstr name;
        u32 dev;
        u64 ino;
        kuid_t uid;
        kgid_t gid;
        pid_t pid;
        pid_t tgid;
        /* This is for status reporting upon return */
        int status;
        unsigned int wait_ctr;
};

#define AUTOFS_SBI_MAGIC 0x6d4a556d

/* Bits for autofs_sb_info.flags; autofs_oz_mode() tests CATATONIC there. */
#define AUTOFS_SBI_CATATONIC        0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
#define AUTOFS_SBI_IGNORE        0x0004

/*
 * Per-superblock autofs state; autofs_sbi() reads it from
 * super_block.s_fs_info.
 */
struct autofs_sb_info {
        u32 magic;      /* presumably AUTOFS_SBI_MAGIC -- confirm */
        int pipefd;
        struct file *pipe;
        struct pid *oz_pgrp;    /* compared with task_pgrp(current) in autofs_oz_mode() */
        int version;
        int sub_version;
        int min_proto;
        int max_proto;
        unsigned int flags;     /* AUTOFS_SBI_* bits */
        unsigned long exp_timeout;
        unsigned int type;
        struct super_block *sb;
        struct mutex wq_mutex;
        struct mutex pipe_mutex;
        spinlock_t fs_lock;
        struct autofs_wait_queue *queues; /* Wait queue pointer */
        spinlock_t lookup_lock; /* taken around expiring_list updates in this header */
        struct list_head active_list;
        struct list_head expiring_list;
        struct rcu_head rcu;
};

/* Fetch the autofs per-superblock data stored in @sb->s_fs_info. */
static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
        struct autofs_sb_info *info = (struct autofs_sb_info *)(sb->s_fs_info);

        return info;
}

/* Fetch the autofs_info stored in @dentry->d_fsdata. */
static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
{
        struct autofs_info *info = (struct autofs_info *)(dentry->d_fsdata);

        return info;
}

/* autofs_oz_mode(): do we see the man behind the curtain?  (The
 * processes which do manipulations for us in user space see the raw
 * filesystem without "magic".)
 *
 * Returns 1 when the mount is flagged catatonic or when the current task's
 * process group is the recorded oz_pgrp, and 0 otherwise.
 */
static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
{
        if (sbi->flags & AUTOFS_SBI_CATATONIC)
                return 1;

        return task_pgrp(current) == sbi->oz_pgrp;
}

/* Inode and autofs_info helpers; only declared here. */
struct inode *autofs_get_inode(struct super_block *, umode_t);
void autofs_free_ino(struct autofs_info *);

/* Expiration */
int is_autofs_dentry(struct dentry *);
int autofs_expire_wait(const struct path *path, int rcu_walk);
int autofs_expire_run(struct super_block *, struct vfsmount *,
                      struct autofs_sb_info *,
                      struct autofs_packet_expire __user *);
int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
                           struct autofs_sb_info *sbi, unsigned int how);
int autofs_expire_multi(struct super_block *, struct vfsmount *,
                        struct autofs_sb_info *, int __user *);

/* Device node initialization */

int autofs_dev_ioctl_init(void);
void autofs_dev_ioctl_exit(void);

/* Operations structures */

extern const struct inode_operations autofs_symlink_inode_operations;
extern const struct inode_operations autofs_dir_inode_operations;
extern const struct file_operations autofs_dir_operations;
extern const struct file_operations autofs_root_operations;
extern const struct dentry_operations autofs_dentry_operations;

/* VFS automount flags management functions */

/*
 * Set DCACHE_NEED_AUTOMOUNT and DCACHE_MANAGE_TRANSIT in @dentry->d_flags.
 * Takes no lock itself; managed_dentry_set_managed() is the variant that
 * wraps this call in dentry->d_lock.
 */
static inline void __managed_dentry_set_managed(struct dentry *dentry)
{
        dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
}

/* Locked variant: runs __managed_dentry_set_managed() under dentry->d_lock. */
static inline void managed_dentry_set_managed(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        __managed_dentry_set_managed(dentry);
        spin_unlock(&dentry->d_lock);
}

/*
 * Clear DCACHE_NEED_AUTOMOUNT and DCACHE_MANAGE_TRANSIT in @dentry->d_flags.
 * Takes no lock itself; managed_dentry_clear_managed() is the variant that
 * wraps this call in dentry->d_lock.
 */
static inline void __managed_dentry_clear_managed(struct dentry *dentry)
{
        dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
}

/* Locked variant: runs __managed_dentry_clear_managed() under dentry->d_lock. */
static inline void managed_dentry_clear_managed(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        __managed_dentry_clear_managed(dentry);
        spin_unlock(&dentry->d_lock);
}

/* Initializing function */

int autofs_fill_super(struct super_block *, void *, int);
/* autofs_info helpers; only declared here. */
struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
void autofs_clean_ino(struct autofs_info *);

/*
 * Validate and configure @pipe.
 *
 * Returns -EINVAL unless the file has FMODE_CAN_WRITE set and its inode is
 * a FIFO.  On success O_DIRECT is set and O_NONBLOCK is cleared in
 * pipe->f_flags, and 0 is returned.
 */
static inline int autofs_prepare_pipe(struct file *pipe)
{
        if (!(pipe->f_mode & FMODE_CAN_WRITE) ||
            !S_ISFIFO(file_inode(pipe)->i_mode))
                return -EINVAL;

        /* We want a packet pipe */
        pipe->f_flags |= O_DIRECT;
        /* We don't expect -EAGAIN */
        pipe->f_flags &= ~O_NONBLOCK;

        return 0;
}

/* Queue management functions */

int autofs_wait(struct autofs_sb_info *,
                 const struct path *, enum autofs_notify);
/* NOTE(review): the token presumably matches autofs_wait_queue.wait_queue_token. */
int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs_catatonic_mode(struct autofs_sb_info *);

static inline u32 autofs_get_dev(struct autofs_sb_info *sbi)
{
        return new_encode_dev(sbi->sb->s_dev);
}

static inline u64 autofs_get_ino(struct autofs_sb_info *sbi)
{
        return d_inode(sbi->sb->s_root)->i_ino;
}

/*
 * Put @dentry's autofs_info on the superblock's expiring list unless it is
 * already linked somewhere.  A dentry without autofs_info is ignored.
 * Takes no lock itself; autofs_add_expiring() performs the same update
 * under sbi->lookup_lock.
 */
static inline void __autofs_add_expiring(struct dentry *dentry)
{
        struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
        struct autofs_info *ino = autofs_dentry_ino(dentry);

        if (ino && list_empty(&ino->expiring))
                list_add(&ino->expiring, &sbi->expiring_list);
}

/*
 * Put @dentry's autofs_info on the superblock's expiring list, under
 * sbi->lookup_lock, unless it is already linked.  A dentry without
 * autofs_info is ignored and no lock is taken.
 */
static inline void autofs_add_expiring(struct dentry *dentry)
{
        struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
        struct autofs_info *ino = autofs_dentry_ino(dentry);

        if (!ino)
                return;

        spin_lock(&sbi->lookup_lock);
        if (list_empty(&ino->expiring))
                list_add(&ino->expiring, &sbi->expiring_list);
        spin_unlock(&sbi->lookup_lock);
}

/*
 * Unlink @dentry's autofs_info from the expiring list, under
 * sbi->lookup_lock, and reinitialise its list head.  Nothing happens when
 * the dentry has no autofs_info or the entry is not linked.
 */
static inline void autofs_del_expiring(struct dentry *dentry)
{
        struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
        struct autofs_info *ino = autofs_dentry_ino(dentry);

        if (!ino)
                return;

        spin_lock(&sbi->lookup_lock);
        if (!list_empty(&ino->expiring))
                list_del_init(&ino->expiring);
        spin_unlock(&sbi->lookup_lock);
}

/* Superblock teardown; only declared here. */
void autofs_kill_sb(struct super_block *);



























































































































































































   14 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H

#include <asm/processor.h>

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>

/*
 * Symbolic indices for the capability words.  The enumerators are numbered
 * consecutively from 0, so NR_CPUID_WORDS evaluates to 23 with the list
 * below.
 *
 * NOTE(review): presumably each value indexes one 32-bit word of
 * x86_capability[] and NR_CPUID_WORDS is expected to equal NCAPINTS --
 * confirm.
 */
enum cpuid_leafs
{
        CPUID_1_EDX                = 0,
        CPUID_8000_0001_EDX,
        CPUID_8086_0001_EDX,
        CPUID_LNX_1,
        CPUID_1_ECX,
        CPUID_C000_0001_EDX,
        CPUID_8000_0001_ECX,
        CPUID_LNX_2,
        CPUID_LNX_3,
        CPUID_7_0_EBX,
        CPUID_D_1_EAX,
        CPUID_LNX_4,
        CPUID_7_1_EAX,
        CPUID_8000_0008_EBX,
        CPUID_6_EAX,
        CPUID_8000_000A_EDX,
        CPUID_7_ECX,
        CPUID_8000_0007_EBX,
        CPUID_7_EDX,
        CPUID_8000_001F_EAX,
        CPUID_8000_0021_EAX,
        CPUID_LNX_5,
        CPUID_8000_0021_ECX,
        NR_CPUID_WORDS,
};

/*
 * X86_CAP_FMT and x86_cap_flag() form a printf-style pair.  With feature
 * names configured in, a flag is printed as its name ("%s"); otherwise it
 * is printed as "word:bit" ("%d:%d", i.e. flag >> 5 and flag & 31).
 */
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
#define X86_CAP_FMT "%d:%d"
/* Expands to TWO comma-separated arguments, matching the two %d above. */
#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
#endif

/*
 * In order to save room, we index into this array by doing
 * X86_BUG_<name> - NCAPINTS*32.
 */
extern const char * const x86_bug_flags[NBUGINTS*32];

/* Test @bit in (c)->x86_capability at run time, without any constant folding. */
#define test_cpu_cap(c, bit)                                                \
         arch_test_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * There are 32 bits/features in each mask word.  The high bits
 * (selected with (bit)>>5) give us the word number and the low 5
 * bits give us the bit/feature number inside the word.
 * 1UL<<((bit)&31) gives us a mask for the feature_bit so we can
 * see if it is set in the mask word.
 *
 * The mask word itself is named by token-pasting @maskname and @word
 * (e.g. REQUIRED_MASK and 0 give REQUIRED_MASK0), so @word must be a token
 * for which such an identifier is defined.
 */
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit)        \
        (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))

/*
 * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the
 * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all
 * header macros which use NCAPINTS need to be changed. The duplicated macro
 * use causes the compiler to issue errors for all headers so that all usage
 * sites can be corrected.
 *
 * REQUIRED_MASK_BIT_SET(feature_bit) is non-zero when @feature_bit is set
 * in the REQUIRED_MASK<n> word that it belongs to.
 *
 * NOTE(review): the list below covers words 0..21 while the build check
 * expects NCAPINTS == 23, i.e. words 0..22.  Confirm whether word 22 is
 * left out on purpose; bits in an unlisted word are never reported as set
 * by this macro.
 */
#define REQUIRED_MASK_BIT_SET(feature_bit)                \
         ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) ||        \
           REQUIRED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 23))

/*
 * DISABLED_MASK_BIT_SET(feature_bit) is non-zero when @feature_bit is set
 * in the DISABLED_MASK<n> word that it belongs to.
 *
 * NOTE(review): the list below covers words 0..21 while the build check
 * expects NCAPINTS == 23, i.e. words 0..22.  Confirm whether word 22 is
 * left out on purpose; bits in an unlisted word are never reported as
 * disabled by this macro.
 */
#define DISABLED_MASK_BIT_SET(feature_bit)                                \
         ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) ||        \
           DISABLED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 23))

/*
 * cpu_has(c, bit): evaluates to 1 without looking at @c when @bit is a
 * compile-time constant that REQUIRED_MASK_BIT_SET() reports as set;
 * otherwise the bit is tested in (c)->x86_capability via test_cpu_cap().
 */
#define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         test_cpu_cap(c, bit))

/*
 * this_cpu_has(bit): same shortcut as cpu_has(), but the run-time test uses
 * x86_this_cpu_test_bit() on the per-CPU cpu_info.x86_capability.
 */
#define this_cpu_has(bit)                                                \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         x86_this_cpu_test_bit(bit,                                        \
                (unsigned long __percpu *)&cpu_info.x86_capability))

/*
 * This macro is for detection of features which need kernel
 * infrastructure to be used.  It may *not* directly test the CPU
 * itself.  Use the cpu_has() family if you want true runtime
 * testing of CPU features, like in hypervisor code where you are
 * supporting a possible guest feature where host support for it
 * is not relevant.
 *
 * Evaluates to 0 when @bit is a compile-time constant that
 * DISABLED_MASK_BIT_SET() reports as set; otherwise defers to
 * static_cpu_has(bit).
 */
#define cpu_feature_enabled(bit)        \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))

/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit)        cpu_has(&boot_cpu_data, bit)

/* Set @bit in (c)->x86_capability using set_bit(). */
#define set_cpu_cap(c, bit)        set_bit(bit, (unsigned long *)((c)->x86_capability))

extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);

/*
 * Set @bit both in boot_cpu_data's capabilities and in the cpu_caps_set
 * bitmap.  NOTE(review): cpu_caps_set is presumably re-applied to CPUs
 * identified later -- confirm where it is consumed.
 */
#define setup_force_cpu_cap(bit) do { \
        set_cpu_cap(&boot_cpu_data, bit);        \
        set_bit(bit, (unsigned long *)cpu_caps_set);        \
} while (0)

/* Bug bits are forced through the same path as capability bits. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)

#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)

/*
 * Workaround for the sake of BPF compilation which utilizes kernel
 * headers, but clang does not support ASM GOTO and fails the build.
 *
 * On this path static_cpu_has() is simply boot_cpu_has(), and a build
 * warning is emitted unless __BPF_TRACING__ is defined.
 */
#ifndef __BPF_TRACING__
#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
#endif

#define static_cpu_has(bit)            boot_cpu_has(bit)

#else

/*
 * Static testing of CPU features. Used the same as boot_cpu_has(). It
 * statically patches the target code for additional performance. Use
 * static_cpu_has() only in fast paths, where every cycle counts. Which
 * means that the boot_cpu_has() variant is already fast enough for the
 * majority of cases and you should stick to using it as it is generally
 * only two instructions: a RIP-relative MOV and a TEST.
 *
 * The code placed in .altinstr_aux (label 6) is the dynamic test: it
 * checks bit (bit & 7) of byte (bit >> 3) of boot_cpu_data.x86_capability
 * and jumps to t_yes when it is set, to t_no otherwise.
 * NOTE(review): the ALTERNATIVE_TERNARY() arguments suggest that "jmp 6f"
 * is the initial instruction, later patched to either nothing (falling
 * through to t_yes) or "jmp t_no" -- confirm against the macro definition.
 */
static __always_inline bool _static_cpu_has(u16 bit)
{
        asm_volatile_goto(
                ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
                ".section .altinstr_aux,\"ax\"\n"
                "6:\n"
                " testb %[bitnum],%[cap_byte]\n"
                " jnz %l[t_yes]\n"
                " jmp %l[t_no]\n"
                ".previous\n"
                 : : [feature]  "i" (bit),
                     [bitnum]   "i" (1 << (bit & 7)),
                     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
                 : : t_yes, t_no);
t_yes:
        return true;
t_no:
        return false;
}

/*
 * When boot_cpu_has(bit) folds to a compile-time constant, use that
 * constant directly; otherwise fall back to the asm-goto based
 * _static_cpu_has().
 */
#define static_cpu_has(bit)                                        \
(                                                                \
        __builtin_constant_p(boot_cpu_has(bit)) ?                \
                boot_cpu_has(bit) :                                \
                _static_cpu_has(bit)                                \
)
#endif

/* Bug bits go through the same accessors as capability bits. */
#define cpu_has_bug(c, bit)                cpu_has(c, (bit))
#define set_cpu_bug(c, bit)                set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)                clear_cpu_cap(c, (bit))

#define static_cpu_has_bug(bit)                static_cpu_has((bit))
#define boot_cpu_has_bug(bit)                cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)                set_cpu_cap(&boot_cpu_data, (bit))

/* One feature number per capability bit: 32 bits in each of NCAPINTS words. */
#define MAX_CPU_FEATURES                (NCAPINTS * 32)
#define cpu_have_feature                boot_cpu_has

/*
 * printf-style format and matching argument list built from the boot CPU's
 * vendor, family and model fields.
 */
#define CPU_FEATURE_TYPEFMT                "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL                boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
                                        boot_cpu_data.x86_model

#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
#endif /* _ASM_X86_CPUFEATURE_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







































































































    2 

















































    2 
























































































































































































































































































































    1 



















































































































































































































































































































































































































































    1 








































































































































































































































































    1 





















































































    2 














































































































    1 

    1 




    1 
















































    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
































































    1 
































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct sk_buff' memory handlers.
 *
 *        Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Florian La Roche, <rzsfl@rz.uni-sb.de>
 */

#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H

#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>

#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <linux/net.h>
#include <linux/textsearch.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <net/flow_dissector.h>
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <net/flow.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif

/* The interface for checksum offload between the stack and networking drivers
 * is as follows...
 *
 * A. IP checksum related features
 *
 * Drivers advertise checksum offload capabilities in the features of a device.
 * From the stack's point of view these are capabilities offered by the driver.
 * A driver typically only advertises features that it is capable of offloading
 * to its device.
 *
 * The checksum related features are:
 *
 *        NETIF_F_HW_CSUM        - The driver (or its device) is able to compute one
 *                          IP (one's complement) checksum for any combination
 *                          of protocols or protocol layering. The checksum is
 *                          computed and set in a packet per the CHECKSUM_PARTIAL
 *                          interface (see below).
 *
 *        NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain
 *                          TCP or UDP packets over IPv4. These are specifically
 *                          unencapsulated packets of the form IPv4|TCP or
 *                          IPv4|UDP where the Protocol field in the IPv4 header
 *                          is TCP or UDP. The IPv4 header may contain IP options.
 *                          This feature cannot be set in features for a device
 *                          with NETIF_F_HW_CSUM also set. This feature is being
 *                          DEPRECATED (see below).
 *
 *        NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain
 *                          TCP or UDP packets over IPv6. These are specifically
 *                          unencapsulated packets of the form IPv6|TCP or
 *                          IPv6|UDP where the Next Header field in the IPv6
 *                          header is either TCP or UDP. IPv6 extension headers
 *                          are not supported with this feature. This feature
 *                          cannot be set in features for a device with
 *                          NETIF_F_HW_CSUM also set. This feature is being
 *                          DEPRECATED (see below).
 *
 *        NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
 *                         This flag is only used to disable the RX checksum
 *                         feature for a device. The stack will accept receive
 *                         checksum indication in packets received on a device
 *                         regardless of whether NETIF_F_RXCSUM is set.
 *
 * B. Checksumming of received packets by device. Indication of checksum
 *    verification is set in skb->ip_summed. Possible values are:
 *
 * CHECKSUM_NONE:
 *
 *   Device did not checksum this packet e.g. due to lack of capabilities.
 *   The packet contains full (though not verified) checksum in packet but
 *   not in skb->csum. Thus, skb->csum is undefined in this case.
 *
 * CHECKSUM_UNNECESSARY:
 *
 *   The hardware you're dealing with doesn't calculate the full checksum
 *   (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums
 *   for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY
 *   if their checksums are okay. skb->csum is still undefined in this case
 *   though. A driver or device must never modify the checksum field in the
 *   packet even if checksum is verified.
 *
 *   CHECKSUM_UNNECESSARY is applicable to the following protocols:
 *     TCP: IPv6 and IPv4.
 *     UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
 *       zero UDP checksum for either IPv4 or IPv6, the networking stack
 *       may perform further validation in this case.
 *     GRE: only if the checksum is present in the header.
 *     SCTP: indicates the CRC in SCTP header has been validated.
 *     FCOE: indicates the CRC in FC frame has been validated.
 *
 *   skb->csum_level indicates the number of consecutive checksums found in
 *   the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
 *   For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
 *   and a device is able to verify the checksums for UDP (possibly zero),
 *   GRE (checksum flag is set) and TCP, skb->csum_level would be set to
 *   two. If the device were only able to verify the UDP checksum and not
 *   GRE, either because it doesn't support GRE checksum or because GRE
 *   checksum is bad, skb->csum_level would be set to zero (TCP checksum is
 *   not considered in this case).
 *
 * CHECKSUM_COMPLETE:
 *
 *   This is the most generic way. The device supplies the checksum of the
 *   _whole_ packet as seen by netif_rx() and fills in skb->csum. This means
 *   the hardware doesn't need to parse L3/L4 headers to implement this.
 *
 *   Notes:
 *   - Even if device supports only some protocols, but is able to produce
 *     skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
 *   - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
 *
 * CHECKSUM_PARTIAL:
 *
 *   A checksum is set up to be offloaded to a device as described in the
 *   output description for CHECKSUM_PARTIAL. This may occur on a packet
 *   received directly from another Linux OS, e.g., a virtualized Linux kernel
 *   on the same host, or it may be set in the input path in GRO or remote
 *   checksum offload. For the purposes of checksum verification, the checksum
 *   referred to by skb->csum_start + skb->csum_offset and any preceding
 *   checksums in the packet are considered verified. Any checksums in the
 *   packet that are after the checksum being offloaded are not considered to
 *   be verified.
 *
 * C. Checksumming on transmit for non-GSO. The stack requests checksum offload
 *    in the skb->ip_summed for a packet. Values are:
 *
 * CHECKSUM_PARTIAL:
 *
 *   The driver is required to checksum the packet as seen by hard_start_xmit()
 *   from skb->csum_start up to the end, and to record/write the checksum at
 *   offset skb->csum_start + skb->csum_offset. A driver may verify that the
 *   csum_start and csum_offset values are valid values given the length and
 *   offset of the packet, but it should not attempt to validate that the
 *   checksum refers to a legitimate transport layer checksum -- it is the
 *   purview of the stack to validate that csum_start and csum_offset are set
 *   correctly.
 *
 *   When the stack requests checksum offload for a packet, the driver MUST
 *   ensure that the checksum is set correctly. A driver can either offload the
 *   checksum calculation to the device, or call skb_checksum_help (in the case
 *   that the device does not support offload for a particular checksum).
 *
 *   NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of
 *   NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate
 *   checksum offload capability.
 *   skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based
 *   on network device checksumming capabilities: if a packet does not match
 *   them, skb_checksum_help or skb_crc32c_csum_help (depending on the value
 *   of csum_not_inet, see item D.) is called to resolve the checksum.
 *
 * CHECKSUM_NONE:
 *
 *   The skb was already checksummed by the protocol, or a checksum is not
 *   required.
 *
 * CHECKSUM_UNNECESSARY:
 *
 *   This has the same meaning as CHECKSUM_NONE for checksum offload on
 *   output.
 *
 * CHECKSUM_COMPLETE:
 *   Not used in checksum output. If a driver observes a packet with this value
 *   set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
 *
 * D. Non-IP checksum (CRC) offloads
 *
 *   NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
 *     offloading the SCTP CRC in a packet. To perform this offload the stack
 *     will set csum_start and csum_offset accordingly, set ip_summed to
 *     CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
 *     the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
 *     A driver that supports both IP checksum offload and SCTP CRC32c offload
 *     must verify which offload is configured for a packet by testing the
 *     value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
 *     CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
 *
 *   NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
 *     offloading the FCOE CRC in a packet. To perform this offload the stack
 *     will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
 *     accordingly. Note that there is no indication in the skbuff that the
 *     CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
 *     both IP checksum offload and FCOE CRC offload must verify which offload
 *     is configured for a packet, presumably by inspecting packet headers.
 *
 * E. Checksumming on output with GSO.
 *
 * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload
 * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
 * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
 * part of the GSO operation is implied. If a checksum is being offloaded
 * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
 * csum_offset are set to refer to the outermost checksum being offloaded
 * (two offloaded checksums are possible with UDP encapsulation).
 */

/*
 * Values carried in skb->ip_summed. Their meaning differs between receive
 * and transmit; see sections B and C of the checksum offload comment above.
 *
 *   CHECKSUM_NONE        - rx: device did not checksum the packet and
 *                          skb->csum is undefined;
 *                          tx: already checksummed by the protocol, or no
 *                          checksum is required.
 *   CHECKSUM_UNNECESSARY - rx: device verified the checksum(s) for specific
 *                          protocols (see skb->csum_level);
 *                          tx: same meaning as CHECKSUM_NONE.
 *   CHECKSUM_COMPLETE    - rx: device supplied the checksum of the whole
 *                          packet in skb->csum;
 *                          tx: not used, treat as CHECKSUM_NONE.
 *   CHECKSUM_PARTIAL     - tx: the checksum from skb->csum_start to the end
 *                          must be computed and written at
 *                          skb->csum_start + skb->csum_offset;
 *                          rx: that checksum and any preceding ones are
 *                          considered verified.
 */
/* Don't change this without changing skb_csum_unnecessary! */
#define CHECKSUM_NONE                0
#define CHECKSUM_UNNECESSARY        1
#define CHECKSUM_COMPLETE        2
#define CHECKSUM_PARTIAL        3

/* Maximum value in skb->csum_level (declared as a 2-bit field, so 3 is the
 * largest value it can represent).
 */
#define SKB_MAX_CSUM_LEVEL        3

/* Align X to SMP_CACHE_BYTES using ALIGN(). */
#define SKB_DATA_ALIGN(X)        ALIGN(X, SMP_CACHE_BYTES)
/* X minus the cache-aligned size of struct skb_shared_info. */
#define SKB_WITH_OVERHEAD(X)        \
        ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
/* What remains of a (PAGE_SIZE << ORDER) byte region once X bytes and the
 * aligned skb_shared_info have been subtracted.
 */
#define SKB_MAX_ORDER(X, ORDER) \
        SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
/* SKB_MAX_ORDER() for an order-0 (single page) region. */
#define SKB_MAX_HEAD(X)                (SKB_MAX_ORDER((X), 0))
/* SKB_MAX_ORDER() for an order-2 region with nothing else subtracted. */
#define SKB_MAX_ALLOC                (SKB_MAX_ORDER(0, 2))

/* Return the minimum truesize of one skb containing X bytes of data:
 * X itself plus the cache-aligned sizes of struct sk_buff and
 * struct skb_shared_info.
 */
#define SKB_TRUESIZE(X) ((X) +                                                \
                         SKB_DATA_ALIGN(sizeof(struct sk_buff)) +        \
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Forward declarations of types this header refers to without needing their
 * full definitions (in the code visible here they are only used through
 * pointers, e.g. struct net_device, struct scatterlist, struct skb_ext).
 */
struct ahash_request;
struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Bridge netfilter state; this type only exists when
 * CONFIG_BRIDGE_NETFILTER is enabled.
 */
struct nf_bridge_info {
        /* original protocol tag (unchanged / 802.1Q / PPPoE, going by the
         * enumerator names), stored in 8 bits
         */
        enum {
                BRNF_PROTO_UNCHANGED,
                BRNF_PROTO_8021Q,
                BRNF_PROTO_PPPOE
        } orig_proto:8;
        /* single-bit state flags.
         * NOTE(review): their exact meaning is defined by the bridge
         * netfilter code, which is not visible here.
         */
        u8                        pkt_otherhost:1;
        u8                        in_prerouting:1;
        u8                        bridged_dnat:1;
        u8                        sabotage_in_done:1;
        __u16                        frag_max_size;
        struct net_device        *physindev;

        /* always valid & non-NULL from FORWARD on, for physdev match */
        struct net_device        *physoutdev;
        /* the members below share storage; which one is live depends on the
         * processing stage described in the per-member comments
         */
        union {
                /* prerouting: detect dnat in orig/reply direction */
                __be32          ipv4_daddr;
                struct in6_addr ipv6_daddr;

                /* after prerouting + nat detected: store original source
                 * mac since neigh resolution overwrites it, only used while
                 * skb is out in neigh layer.
                 */
                char neigh_header[8];
        };
};
#endif

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
 * ovs recirc_id. It will be set to the current chain by tc
 * and read by ovs to recirc_id.
 *
 * NOTE(review): @mru is not described anywhere in this header; presumably a
 * maximum receive unit handed over alongside the chain - confirm.
 */
struct tc_skb_ext {
        __u32 chain;
        __u16 mru;
};
#endif

/* Head of a list of sk_buffs, with its own lock.
 *
 * next/prev come first here just as they do in struct sk_buff, which carries
 * the same "must be first" requirement.
 */
struct sk_buff_head {
        /* These two members must be first. */
        struct sk_buff        *next;
        struct sk_buff        *prev;

        /* presumably the number of buffers on the list - confirm */
        __u32                qlen;
        spinlock_t        lock;
};

/* Forward declaration; struct sk_buff is defined further down in this file. */
struct sk_buff;

/* To allow 64K frame to be packed as single skb without frag_list we
 * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
 * buffers which do not start on a page boundary.
 *
 * Since GRO uses frags we allocate at least 16 regardless of page
 * size.
 *
 * MAX_SKB_FRAGS is also the dimension of skb_shared_info::frags[].
 */
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif
/* Only declared here; the variable itself is defined elsewhere. */
extern int sysctl_max_skb_frags;

/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
 * segment using its current segmentation instead.
 *
 * gso_size is declared as unsigned short; 0xFFFF is the all-ones 16-bit
 * value, used here as a sentinel rather than a real segment size.
 */
#define GSO_BY_FRAGS        0xFFFF

/* An skb fragment is a struct bio_vec; the size accessors below operate on
 * its bv_len member.
 */
typedef struct bio_vec skb_frag_t;

/**
 * skb_frag_size() - Report how many bytes a skb fragment holds
 * @frag: skb fragment to query
 *
 * Return: the fragment's bv_len value.
 */
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
        const unsigned int nbytes = frag->bv_len;

        return nbytes;
}

/**
 * skb_frag_size_set() - Overwrite the size of a skb fragment
 * @frag: skb fragment to update
 * @size: new size, stored in the fragment's bv_len
 */
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
        skb_frag_t *const target = frag;

        target->bv_len = size;
}

/**
 * skb_frag_size_add() - Grow the size of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount added to the fragment's bv_len
 */
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
        frag->bv_len = frag->bv_len + delta;
}

/**
 * skb_frag_size_sub() - Shrink the size of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount taken off the fragment's bv_len
 */
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
        frag->bv_len = frag->bv_len - delta;
}

/**
 * skb_frag_must_loop - Test if @p is a high memory page
 * @p: fragment's page
 *
 * Return: true only when CONFIG_HIGHMEM is defined and PageHighMem(@p) is
 * non-zero; false in every other case.
 */
static inline bool skb_frag_must_loop(struct page *p)
{
#if defined(CONFIG_HIGHMEM)
        return PageHighMem(p) ? true : false;
#else
        return false;
#endif
}

/**
 *        skb_frag_foreach_page - loop over pages in a fragment
 *
 *        @f:                skb frag to operate on
 *        @f_off:                offset from start of f->bv_page
 *        @f_len:                length from f_off to loop over
 *        @p:                (temp var) current page
 *        @p_off:                (temp var) offset from start of current page,
 *                                   non-zero only on first page.
 *        @p_len:                (temp var) length in current page,
 *                                   < PAGE_SIZE only on first and last page.
 *        @copied:        (temp var) length so far, excluding current p_len.
 *
 *        A fragment can hold a compound page, in which case per-page
 *        operations, notably kmap_atomic, must be called for each
 *        regular page.
 *
 *        This expands to a bare for (...) header, so the statement that
 *        follows the macro invocation becomes the loop body. When
 *        skb_frag_must_loop() is false for the first page, the first
 *        @p_len equals @f_len and the body therefore runs once for the
 *        whole range.
 *
 *        The arguments are expanded several times (@f_len and @f_off in
 *        particular), so pass expressions without side effects.
 */
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)        \
        for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),                \
             p_off = (f_off) & (PAGE_SIZE - 1),                                \
             p_len = skb_frag_must_loop(p) ?                                \
             min_t(u32, f_len, PAGE_SIZE - p_off) : f_len,                \
             copied = 0;                                                \
             copied < f_len;                                                \
             copied += p_len, p++, p_off = 0,                                \
             p_len = min_t(u32, f_len - copied, PAGE_SIZE))                \

/* Defined with no value, so it can only be tested with #ifdef/defined().
 * NOTE(review): presumably a feature marker for the hardware time stamp
 * declarations below - confirm who tests it.
 */
#define HAVE_HW_TIME_STAMP

/**
 * struct skb_shared_hwtstamps - hardware time stamps
 * @hwtstamp:        hardware time stamp transformed into duration
 *                since arbitrary point in time
 *
 * Software time stamps generated by ktime_get_real() are stored in
 * skb->tstamp.
 *
 * hwtstamps can only be compared against other hwtstamps from
 * the same device.
 *
 * This structure is attached to packets as part of the
 * &skb_shared_info (its hwtstamps member). Use skb_hwtstamps() to get a
 * pointer.
 */
struct skb_shared_hwtstamps {
        ktime_t        hwtstamp;
};

/* Definitions for tx_flags in struct skb_shared_info.
 *
 * Each enumerator is a distinct bit. tx_flags is declared as __u8 and bits
 * 0..6 are used here, leaving one bit unassigned.
 */
enum {
        /* generate hardware time stamp */
        SKBTX_HW_TSTAMP = 1 << 0,

        /* generate software time stamp when queueing packet to NIC */
        SKBTX_SW_TSTAMP = 1 << 1,

        /* device driver is going to provide hardware time stamp */
        SKBTX_IN_PROGRESS = 1 << 2,

        /* device driver supports TX zero-copy buffers */
        SKBTX_DEV_ZEROCOPY = 1 << 3,

        /* generate wifi status information (where possible) */
        SKBTX_WIFI_STATUS = 1 << 4,

        /* This indicates at least one fragment might be overwritten
         * (as in vmsplice(), sendfile() ...)
         * If we need to compute a TX checksum, we'll need to copy
         * all frags to avoid possible bad checksum
         */
        SKBTX_SHARED_FRAG = 1 << 5,

        /* generate software time stamp when entering packet scheduling */
        SKBTX_SCHED_TSTAMP = 1 << 6,
};

/* Combined masks over the SKBTX_* bits: */
/* either of the two zero-copy related bits */
#define SKBTX_ZEROCOPY_FRAG        (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
/* either software time stamp request (queueing to NIC or scheduling) */
#define SKBTX_ANY_SW_TSTAMP        (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP)
/* any time stamp request, hardware or software */
#define SKBTX_ANY_TSTAMP        (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)

/*
 * Zero-copy user buffer descriptor.
 *
 * The callback notifies userspace to release buffers when skb DMA is done in
 * lower device, the skb last reference should be 0 when calling this.
 * The zerocopy_success argument is true if zero copy transmit occurred,
 * false on data copy or out of memory error caused by data copy attempt.
 * The ctx field is used to track device context.
 * The desc field is used to track userspace buffer index.
 *
 * The two anonymous structs inside the union share storage, so a given
 * ubuf_info uses either desc/ctx or id/len/zerocopy/bytelen.
 * NOTE(review): which user picks which layout is not visible in this header.
 */
struct ubuf_info {
        void (*callback)(struct ubuf_info *, bool zerocopy_success);
        union {
                struct {
                        unsigned long desc;
                        void *ctx;
                };
                struct {
                        u32 id;
                        u16 len;
                        u16 zerocopy:1;
                        u32 bytelen;
                };
        };
        /* reference count; sock_zerocopy_get() increments it */
        refcount_t refcnt;

        /* pinned-page bookkeeping, the type taken by
         * mm_account_pinned_pages() / mm_unaccount_pinned_pages()
         */
        struct mmpin {
                struct user_struct *user;
                unsigned int num_pg;
        } mmp;
};

/* Read skb_shinfo(SKB)->destructor_arg (a void *) as a struct ubuf_info *. */
#define skb_uarg(SKB)        ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))

/* Pinned-page accounting helpers operating on a struct mmpin (the type of
 * ubuf_info::mmp). Implemented elsewhere.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* Obtain a ubuf_info for @sk; the realloc variant additionally takes an
 * existing @uarg. Implemented elsewhere.
 * NOTE(review): failure return value and reuse rules are not visible here.
 */
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
                                        struct ubuf_info *uarg);

static inline void sock_zerocopy_get(struct ubuf_info *uarg)
{
        refcount_inc(&uarg->refcnt);
}

/* Counterparts to sock_zerocopy_get(); implemented elsewhere.
 * NOTE(review): the meaning of @have_uref for the abort variant is not
 * visible in this header.
 */
void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

/* Has the same signature as ubuf_info::callback. */
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);

/* Zero-copy helpers that take data described by @msg for @skb, for datagram
 * and stream use respectively; implemented elsewhere.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg);

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
        __u8                __unused;
        __u8                meta_len;
        /* presumably the number of frags[] entries in use - confirm */
        __u8                nr_frags;
        /* SKBTX_* bits */
        __u8                tx_flags;
        /* may hold the GSO_BY_FRAGS sentinel instead of a real size */
        unsigned short        gso_size;
        /* Warning: this field is not always filled in (UFO)! */
        unsigned short        gso_segs;
        struct sk_buff        *frag_list;
        struct skb_shared_hwtstamps hwtstamps;
        /* SKB_GSO_* bits */
        unsigned int        gso_type;
        u32                tskey;

        /*
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        /* split into two 16-bit halves, see SKB_DATAREF_SHIFT/MASK */
        atomic_t        dataref;

        /* Intermediate layers must ensure that destructor_arg
         * remains valid until skb destructor */
        void *                destructor_arg;

        /* must be last field, see pskb_expand_head() */
        skb_frag_t        frags[MAX_SKB_FRAGS];
};

/* We divide dataref into two halves.  The higher 16 bits hold references
 * to the payload part of skb->data.  The lower 16 bits hold references to
 * the entire skb->data.  A clone of a headerless skb holds the length of
 * the header in skb->hdr_len.
 *
 * All users must obey the rule that the skb->data reference count must be
 * greater than or equal to the payload reference count.
 *
 * Holding a reference to the payload part means that the user does not
 * care about modifications to the header part of skb->data.
 *
 * SKB_DATAREF_SHIFT is the bit position where the upper half starts;
 * SKB_DATAREF_MASK (0xffff) selects the lower half.
 */
#define SKB_DATAREF_SHIFT 16
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)


/* Values of the 2-bit skb->fclone field (implicitly numbered 0, 1, 2). */
enum {
        SKB_FCLONE_UNAVAILABLE,        /* skb has no fclone (from head_cache) */
        SKB_FCLONE_ORIG,        /* orig skb (from fclone_cache) */
        SKB_FCLONE_CLONE,        /* companion fclone skb (from fclone_cache) */
};

/* SKB_GSO_* bit flags, one distinct bit each (bits 0..18). They are kept in
 * gso_type, which struct skb_shared_info declares as unsigned int.
 */
enum {
        SKB_GSO_TCPV4 = 1 << 0,

        /* This indicates the skb is from an untrusted source. */
        SKB_GSO_DODGY = 1 << 1,

        /* This indicates the tcp segment has CWR set. */
        SKB_GSO_TCP_ECN = 1 << 2,

        SKB_GSO_TCP_FIXEDID = 1 << 3,

        SKB_GSO_TCPV6 = 1 << 4,

        SKB_GSO_FCOE = 1 << 5,

        SKB_GSO_GRE = 1 << 6,

        SKB_GSO_GRE_CSUM = 1 << 7,

        SKB_GSO_IPXIP4 = 1 << 8,

        SKB_GSO_IPXIP6 = 1 << 9,

        SKB_GSO_UDP_TUNNEL = 1 << 10,

        SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,

        SKB_GSO_PARTIAL = 1 << 12,

        SKB_GSO_TUNNEL_REMCSUM = 1 << 13,

        SKB_GSO_SCTP = 1 << 14,

        SKB_GSO_ESP = 1 << 15,

        SKB_GSO_UDP = 1 << 16,

        SKB_GSO_UDP_L4 = 1 << 17,

        SKB_GSO_FRAGLIST = 1 << 18,
};

/* sk_buff_data_t is the type of skb->tail and skb->end. When BITS_PER_LONG
 * is greater than 32 it is an unsigned int (NOTE(review): presumably an
 * offset relative to skb->head - confirm); otherwise it is a plain byte
 * pointer.
 */
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif

#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif

/**
 *        struct sk_buff - socket buffer
 *        @next: Next buffer in list
 *        @prev: Previous buffer in list
 *        @tstamp: Time we arrived/left
 *        @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
 *                for retransmit timer
 *        @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *        @list: queue head
 *        @sk: Socket we are owned by
 *        @dev: Device we arrived on/are leaving by
 *        @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
 *        @cb: Control buffer. Free for use by every layer. Put private vars here
 *        @_skb_refdst: destination entry (with norefcount bit)
 *        @len: Length of actual data
 *        @data_len: Data length
 *        @mac_len: Length of link layer header
 *        @hdr_len: writable header length of cloned skb
 *        @csum: Checksum (must include start/offset pair)
 *        @csum_start: Offset from skb->head where checksumming should start
 *        @csum_offset: Offset from csum_start where checksum should be stored
 *        @priority: Packet queueing priority
 *        @ignore_df: allow local fragmentation
 *        @cloned: Head may be cloned (check refcnt to be sure)
 *        @ip_summed: Driver fed us an IP checksum
 *        @nohdr: Payload reference only, must not modify header
 *        @pkt_type: Packet class
 *        @fclone: skbuff clone status
 *        @ipvs_property: skbuff is owned by ipvs
 *        @inner_protocol_type: whether the inner protocol is
 *                ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
 *        @remcsum_offload: remote checksum offload is enabled
 *        @offload_fwd_mark: Packet was L2-forwarded in hardware
 *        @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *        @tc_skip_classify: do not classify packet. set by IFB device
 *        @tc_at_ingress: used within tc_classify to distinguish in/egress
 *        @redirected: packet was redirected by packet classifier
 *        @from_ingress: packet was redirected from the ingress path
 *        @peeked: this packet has been seen already, so stats have been
 *                done for it, don't do them again
 *        @nf_trace: netfilter packet trace flag
 *        @protocol: Packet protocol from driver
 *        @destructor: Destruct function
 *        @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *        @_nfct: Associated connection, if any (with nfctinfo bits)
 *        @skb_iif: ifindex of device we arrived on
 *        @tc_index: Traffic control index
 *        @hash: the packet hash
 *        @queue_mapping: Queue mapping for multiqueue devices
 *        @head_frag: skb was allocated from page fragments,
 *                not allocated by kmalloc() or vmalloc().
 *        @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *        @active_extensions: active extensions (skb_ext_id types)
 *        @ndisc_nodetype: router type (from link layer)
 *        @ooo_okay: allow the mapping of a socket to a queue to be changed
 *        @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *                ports.
 *        @sw_hash: indicates hash was computed in software stack
 *        @wifi_acked_valid: wifi_acked was set
 *        @wifi_acked: whether frame was acked on wifi or not
 *        @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *        @encapsulation: indicates the inner headers in the skbuff are valid
 *        @encap_hdr_csum: software checksum is needed
 *        @csum_valid: checksum is already valid
 *        @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
 *        @csum_complete_sw: checksum was completed by software
 *        @csum_level: indicates the number of consecutive checksums found in
 *                the packet minus one that have been verified as
 *                CHECKSUM_UNNECESSARY (max 3)
 *        @scm_io_uring: SKB holds io_uring registered files
 *        @dst_pending_confirm: need to confirm neighbour
 *        @decrypted: Decrypted SKB
 *        @napi_id: id of the NAPI struct this skb came from
 *        @sender_cpu: (aka @napi_id) source CPU in XPS
 *        @secmark: security marking
 *        @mark: Generic packet mark
 *        @reserved_tailroom: (aka @mark) number of bytes of free space available
 *                at the tail of an sk_buff
 *        @vlan_present: VLAN tag is present
 *        @vlan_proto: vlan encapsulation protocol
 *        @vlan_tci: vlan tag control information
 *        @inner_protocol: Protocol (encapsulation)
 *        @inner_ipproto: (aka @inner_protocol) stores ipproto when
 *                skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
 *        @inner_transport_header: Inner transport layer header (encapsulation)
 *        @inner_network_header: Network layer header (encapsulation)
 *        @inner_mac_header: Link layer header (encapsulation)
 *        @transport_header: Transport layer header
 *        @network_header: Network layer header
 *        @mac_header: Link layer header
 *        @kcov_handle: KCOV remote handle for remote coverage collection
 *        @tail: Tail pointer
 *        @end: End pointer
 *        @head: Head of buffer
 *        @data: Data head pointer
 *        @truesize: Buffer size
 *        @users: User count - see {datagram,tcp}.c
 *        @extensions: allocated extensions, valid if active_extensions is nonzero
 */

struct sk_buff {
        /* Linkage: the same storage is viewed as next/prev (+ dev), as an
         * rb_node, or as a list_head.
         */
        union {
                struct {
                        /* These two members must be first. */
                        struct sk_buff                *next;
                        struct sk_buff                *prev;

                        union {
                                struct net_device        *dev;
                                /* Some protocols might use this space to store information,
                                 * while device pointer would be NULL.
                                 * UDP receive path is one user.
                                 */
                                unsigned long                dev_scratch;
                        };
                };
                struct rb_node                rbnode; /* used in netem, ip4 defrag, and tcp stack */
                struct list_head        list;
        };

        struct sock                *sk;

        union {
                ktime_t                tstamp;
                u64                skb_mstamp_ns; /* earliest departure time */
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                        cb[48] __aligned(8);

        union {
                struct {
                        unsigned long        _skb_refdst;
                        void                (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
        };

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long                 _nfct;
#endif
        unsigned int                len,
                                data_len;
        __u16                        mac_len,
                                hdr_len;

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        __u16                        queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK        (1 << 7)
#else
#define CLONED_MASK        1
#endif
#define CLONED_OFFSET()                offsetof(struct sk_buff, __cloned_offset)

        /* zero-length marker: gives CLONED_OFFSET() the byte offset of the
         * bitfield byte that starts with 'cloned'
         */
        /* private: */
        __u8                        __cloned_offset[0];
        /* public: */
        __u8                        cloned:1,
                                nohdr:1,
                                fclone:2,
                                peeked:1,
                                head_frag:1,
                                pfmemalloc:1;
#ifdef CONFIG_SKB_EXTENSIONS
        __u8                        active_extensions;
#endif
        /* fields enclosed in headers_start/headers_end are copied
         * using a single memcpy() in __copy_skb_header()
         */
        /* private: */
        __u32                        headers_start[0];
        /* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX        (7 << 5)
#else
#define PKT_TYPE_MAX        7
#endif
#define PKT_TYPE_OFFSET()        offsetof(struct sk_buff, __pkt_type_offset)

        /* zero-length marker used by PKT_TYPE_OFFSET() */
        /* private: */
        __u8                        __pkt_type_offset[0];
        /* public: */
        __u8                        pkt_type:3;
        __u8                        ignore_df:1;
        __u8                        nf_trace:1;
        __u8                        ip_summed:2;
        __u8                        ooo_okay:1;

        __u8                        l4_hash:1;
        __u8                        sw_hash:1;
        __u8                        wifi_acked_valid:1;
        __u8                        wifi_acked:1;
        __u8                        no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                        encapsulation:1;
        __u8                        encap_hdr_csum:1;
        __u8                        csum_valid:1;

#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_VLAN_PRESENT_BIT        7
#else
#define PKT_VLAN_PRESENT_BIT        0
#endif
#define PKT_VLAN_PRESENT_OFFSET()        offsetof(struct sk_buff, __pkt_vlan_present_offset)
        /* zero-length marker used by PKT_VLAN_PRESENT_OFFSET() */
        /* private: */
        __u8                        __pkt_vlan_present_offset[0];
        /* public: */
        __u8                        vlan_present:1;
        __u8                        csum_complete_sw:1;
        __u8                        csum_level:2;
        __u8                        csum_not_inet:1;
        __u8                        dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                        ndisc_nodetype:2;
#endif

        __u8                        ipvs_property:1;
        __u8                        inner_protocol_type:1;
        __u8                        remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
        __u8                        offload_fwd_mark:1;
        __u8                        offload_l3_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
        __u8                        tc_skip_classify:1;
        __u8                        tc_at_ingress:1;
#endif
#ifdef CONFIG_NET_REDIRECT
        __u8                        redirected:1;
        __u8                        from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
        __u8                        decrypted:1;
#endif
        __u8                        scm_io_uring:1;

#ifdef CONFIG_NET_SCHED
        __u16                        tc_index;        /* traffic control index */
#endif

        /* csum overlays the csum_start/csum_offset pair */
        union {
                __wsum                csum;
                struct {
                        __u16        csum_start;
                        __u16        csum_offset;
                };
        };
        __u32                        priority;
        int                        skb_iif;
        __u32                        hash;
        __be16                        vlan_proto;
        __u16                        vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        union {
                unsigned int        napi_id;
                unsigned int        sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                secmark;
#endif

        union {
                __u32                mark;
                __u32                reserved_tailroom;
        };

        union {
                __be16                inner_protocol;
                __u8                inner_ipproto;
        };

        __u16                        inner_transport_header;
        __u16                        inner_network_header;
        __u16                        inner_mac_header;

        __be16                        protocol;
        __u16                        transport_header;
        __u16                        network_header;
        __u16                        mac_header;

#ifdef CONFIG_KCOV
        u64                        kcov_handle;
#endif

        /* private: */
        __u32                        headers_end[0];
        /* public: */

        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t                tail;
        sk_buff_data_t                end;
        unsigned char                *head,
                                *data;
        unsigned int                truesize;
        refcount_t                users;

#ifdef CONFIG_SKB_EXTENSIONS
        /* only useable after checking ->active_extensions != 0 */
        struct skb_ext                *extensions;
#endif
};

#ifdef __KERNEL__
/*
 *        Handling routines are only of interest to the kernel
 */

/* Bit flags for the 'flags' argument of __alloc_skb(); alloc_skb_fclone()
 * passes SKB_ALLOC_FCLONE and alloc_skb() passes none of them.
 * NOTE(review): who passes the RX and NAPI flags is not visible here.
 */
#define SKB_ALLOC_FCLONE        0x01
#define SKB_ALLOC_RX                0x02
#define SKB_ALLOC_NAPI                0x04

/**
 * skb_pfmemalloc - Report whether the skb came from PFMEMALLOC reserves
 * @skb: buffer to inspect
 *
 * Return: the state of @skb's pfmemalloc bit; the compiler is told to
 * expect it to be clear.
 */
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
        const bool from_reserves = skb->pfmemalloc;

        return unlikely(from_reserves);
}

/*
 * skb might have a dst pointer attached, refcounted or not.
 * _skb_refdst low order bit is set if refcount was _not_ taken
 *
 * SKB_DST_NOREF is that low order bit; SKB_DST_PTRMASK clears it so that
 * the remaining bits can be used as the dst_entry pointer.
 */
#define SKB_DST_NOREF        1UL
#define SKB_DST_PTRMASK        ~(SKB_DST_NOREF)

/**
 * skb_dst - fetch the dst_entry attached to an skb
 * @skb: buffer
 *
 * Strips the SKB_DST_NOREF bit from _skb_refdst and returns what is left as
 * a pointer, whether or not a reference is held on it.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        const unsigned long refdst = skb->_skb_refdst;

        /* A dst stored without a reference is only expected to be looked at
         * from inside an rcu_read_lock (or _bh) section; warn otherwise.
         */
        WARN_ON((refdst & SKB_DST_NOREF) &&
                !(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        return (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
}

/**
 * skb_dst_set - attach a refcounted dst to an skb
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in _skb_refdst with the SKB_DST_NOREF bit left clear. The
 * caller is expected to already hold a reference on @dst, which
 * skb_dst_drop() should release later.
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        const unsigned long refdst = (unsigned long)dst;

        skb->_skb_refdst = refdst;
}

/**
 * skb_dst_set_noref - attach a dst to an skb without taking a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in _skb_refdst with the SKB_DST_NOREF bit set. No reference
 * is taken here. A warning is raised when the caller is in neither an
 * rcu_read_lock nor an rcu_read_lock_bh section.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
        const unsigned long refdst = (unsigned long)dst | SKB_DST_NOREF;

        WARN_ON(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));
        skb->_skb_refdst = refdst;
}

/**
 * skb_dst_is_noref - Test if skb dst isn't refcounted
 * @skb: buffer
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
        return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
}

/**
 * skb_rtable - Returns the skb &rtable
 * @skb: buffer
 *
 * Return: the result of skb_dst() cast to a struct rtable pointer.
 */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        return (struct rtable *)dst;
}

/* When skb->pkt_type is mangled on behalf of user space (applications
 * such as nft, tc, etc), only a conservative subset of the possible
 * pkt_types may be set: values up to and including PACKET_OTHERHOST.
 */
static inline bool skb_pkt_type_ok(u32 ptype)
{
        return !(ptype > PACKET_OTHERHOST);
}

/**
 * skb_napi_id - Returns the skb's NAPI id
 * @skb: buffer
 *
 * Return: @skb's napi_id when CONFIG_NET_RX_BUSY_POLL is defined, else 0.
 */
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
        unsigned int id = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        id = skb->napi_id;
#endif
        return id;
}

/**
 * skb_unref - decrement the skb's reference count
 * @skb: buffer (may be NULL)
 *
 * Return: true if we can free the skb, false if @skb is NULL or the
 * decrement left the count above zero.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return false;

        /* Count already at 1: no decrement is performed, only a read
         * barrier is issued before reporting the skb as freeable.
         */
        if (likely(refcount_read(&skb->users) == 1)) {
                smp_rmb();
                return true;
        }

        /* Otherwise drop our reference; freeable only if it hit zero. */
        if (unlikely(refcount_dec_and_test(&skb->users)))
                return true;

        return false;
}

/* Release, dump and error-notification routines; implemented elsewhere.
 * kfree_skb_list() takes a chain of skbs (@segs) rather than a single one.
 */
void skb_release_head_state(struct sk_buff *skb);
void kfree_skb(struct sk_buff *skb);
void kfree_skb_list(struct sk_buff *segs);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);

/* consume_skb() - release an skb.
 *
 * With CONFIG_TRACEPOINTS it is a real function implemented elsewhere.
 * Without it, it is an inline that simply calls kfree_skb().
 */
#ifdef CONFIG_TRACEPOINTS
void consume_skb(struct sk_buff *skb);
#else
static inline void consume_skb(struct sk_buff *skb)
{
        /* Plain call: ISO C does not allow "return <expr>;" in a function
         * returning void, even when the expression itself is void.
         */
        kfree_skb(skb);
}
#endif

/* Lower-level free routines; implemented elsewhere. */
void __consume_stateless_skb(struct sk_buff *skb);
void  __kfree_skb(struct sk_buff *skb);
/* Slab cache declared here and defined elsewhere; per the SKB_FCLONE_*
 * comments, skbs without an fclone come from "head_cache".
 */
extern struct kmem_cache *skbuff_head_cache;

/* Coalescing helpers: skb_try_coalesce() reports through @fragstolen and
 * @delta_truesize, and kfree_skb_partial() takes a matching head_stolen
 * flag. NOTE(review): exact contract is not visible in this header.
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize);

/* Core allocator behind alloc_skb() and alloc_skb_fclone(); @flags takes
 * SKB_ALLOC_* bits and @node a NUMA node (the wrappers pass NUMA_NO_NODE).
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
                            int node);
/* Build an skb around caller-provided @data; build_skb_around() additionally
 * takes the sk_buff to use. Implemented elsewhere.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size);

/**
 * alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Thin wrapper: calls __alloc_skb() with no SKB_ALLOC_* flags and
 * NUMA_NO_NODE, and hands back whatever it returns.
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
{
        struct sk_buff *skb = __alloc_skb(size, priority, 0, NUMA_NO_NODE);

        return skb;
}

/* Allocate an skb given separate header and data lengths; an error code is
 * passed back through @errcode. Implemented elsewhere.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask);
/* Implemented elsewhere. NOTE(review): relationship between the returned
 * skb and @first is not visible here.
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);

/* Layout of fast clones : [skb1][skb2][fclone_ref]
 *
 * skb_fclone_busy() recovers this container from a pointer to skb1 and
 * inspects skb2 and fclone_ref.
 */
struct sk_buff_fclones {
        struct sk_buff        skb1;

        struct sk_buff        skb2;

        refcount_t        fclone_ref;
};

/**
 *        skb_fclone_busy - check if fclone is busy
 *        @sk: socket
 *        @skb: buffer
 *
 * Returns true if skb is a fast clone, and its clone is not freed.
 * Some drivers call skb_orphan() in their ndo_start_xmit(),
 * so we also check that this didnt happen.
 */
static inline bool skb_fclone_busy(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_fclones *fclones;

        fclones = container_of(skb, struct sk_buff_fclones, skb1);

        return skb->fclone == SKB_FCLONE_ORIG &&
               refcount_read(&fclones->fclone_ref) > 1 &&
               fclones->skb2.sk == sk;
}

/**
 * alloc_skb_fclone - allocate a network buffer from fclone cache
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Thin wrapper: calls __alloc_skb() with SKB_ALLOC_FCLONE and
 * NUMA_NO_NODE, and hands back whatever it returns.
 */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
{
        struct sk_buff *skb;

        skb = __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
        return skb;
}

/*
 * Clone/copy helpers.  Only the prototypes are visible in this section;
 * the definitions are the authority for ownership and failure semantics.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
/* __pskb_copy() below is this function with @fclone == false. */
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone);
/**
 * __pskb_copy - __pskb_copy_fclone() without the fclone request
 * @skb: buffer to copy
 * @headroom: forwarded unchanged
 * @gfp_mask: allocation mask, forwarded unchanged
 *
 * Returns whatever __pskb_copy_fclone() returns for @fclone == false.
 */
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
                                          gfp_t gfp_mask)
{
        const bool fclone = false;

        return __pskb_copy_fclone(skb, headroom, gfp_mask, fclone);
}

/*
 * Head/tail room manipulation, scatterlist mapping and padding helpers.
 * Only the prototypes are visible in this section.  The two skb_to_sgvec
 * variants are __must_check: callers may not ignore their return value.
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
                                     unsigned int headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
                                int newtailroom, gfp_t priority);
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                                     int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
/* skb_pad() below calls this with @free_on_error == true. */
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);

/**
 *        skb_pad                        -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *
 *        Ensure that a buffer is followed by a padding area that is zero
 *        filled. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        May return error in out of memory cases. The skb is freed on error.
 */
static inline int skb_pad(struct sk_buff *skb, int pad)
{
        return __skb_pad(skb, pad, true);
}
/* dev_kfree_skb() is a plain alias: it expands to consume_skb(). */
#define dev_kfree_skb(a)        consume_skb(a)

/*
 * Prototype only; defined outside this section.
 * NOTE(review): presumably appends @size bytes at @offset of @page as a
 * page fragment of @skb -- confirm against the definition.
 */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size);

/*
 * Iteration state handed to skb_prepare_seq_read(), skb_seq_read() and
 * skb_abort_seq_read().  The fields are maintained by those functions.
 * NOTE(review): per-field meaning is not visible in this section; see the
 * implementation of the three functions above before touching them.
 */
struct skb_seq_state {
        __u32                lower_offset;
        __u32                upper_offset;
        __u32                frag_idx;
        __u32                stepped_offset;
        struct sk_buff        *root_skb;
        struct sk_buff        *cur_skb;
        __u8                *frag_data;
};

/*
 * Sequential-read API operating on a caller-provided struct skb_seq_state.
 * Only the prototypes are visible in this section.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);

/* Text search over the range [@from, @to) of @skb using @config. TODO confirm bounds. */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config);

/*
 * Packet hash types specify the type of hash in skb_set_hash.
 *
 * Hash types refer to the protocol layer addresses which are used to
 * construct a packet's hash. The hashes are used to differentiate or identify
 * flows of the protocol layer for the hash type. Hash types are either
 * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
 *
 * Properties of hashes:
 *
 * 1) Two packets in different flows have different hash values
 * 2) Two packets in the same flow should have the same hash value
 *
 * A hash at a higher layer is considered to be more specific. A driver should
 * set the most specific hash possible.
 *
 * A driver cannot indicate a more specific hash than the layer at which a hash
 * was computed. For instance an L3 hash cannot be set as an L4 hash.
 *
 * A driver may indicate a hash level which is less specific than the
 * actual layer the hash was computed on. For instance, a hash computed
 * at L4 may be considered an L3 hash. This should only be done if the
 * driver can't unambiguously determine that the HW computed the hash at
 * the higher layer. Note that the "should" in the second property above
 * permits this.
 */
/* Hash specificity levels, ordered from least (NONE) to most specific (L4). */
enum pkt_hash_types {
        PKT_HASH_TYPE_NONE = 0,        /* Undefined type */
        PKT_HASH_TYPE_L2   = 1,        /* Input: src_MAC, dest_MAC */
        PKT_HASH_TYPE_L3   = 2,        /* Input: src_IP, dst_IP */
        PKT_HASH_TYPE_L4   = 3,        /* Input: src_IP, dst_IP, src_port, dst_port */
};

/* Reset the stored hash value together with both of its qualifier bits. */
static inline void skb_clear_hash(struct sk_buff *skb)
{
        skb->l4_hash = 0;
        skb->sw_hash = 0;
        skb->hash = 0;
}

/* Drop the stored hash unless it is flagged as an L4 hash. */
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
        if (skb->l4_hash)
                return;

        skb_clear_hash(skb);
}

/*
 * Store @hash in @skb and record its origin: @is_sw goes to ->sw_hash,
 * @is_l4 goes to ->l4_hash.
 */
static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash,
                                  bool is_sw, bool is_l4)
{
        skb->hash = hash;
        skb->sw_hash = is_sw;
        skb->l4_hash = is_l4;
}

/*
 * Used by drivers to set hash from HW: the hash is recorded as not
 * software-computed, and as L4 only when @type is %PKT_HASH_TYPE_L4.
 */
static inline void skb_set_hash(struct sk_buff *skb, __u32 hash,
                                enum pkt_hash_types type)
{
        bool is_l4 = (type == PKT_HASH_TYPE_L4);

        __skb_set_hash(skb, hash, false, is_l4);
}

/* Store a software-computed @hash; @is_l4 is recorded in ->l4_hash. */
static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash,
                                     bool is_l4)
{
        const bool is_sw = true;

        __skb_set_hash(skb, hash, is_sw, is_l4);
}

/*
 * Hash / payload-offset / port helpers.  Only the prototypes are visible
 * in this section.  skb_get_hash() calls __skb_get_hash() when the skb
 * carries neither an L4 nor a software hash.
 */
void __skb_get_hash(struct sk_buff *skb);
u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
                   const struct flow_keys_basic *keys, int hlen);
/* skb_flow_get_ports() calls this with @data == NULL and @hlen_proto == 0. */
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                            void *data, int hlen_proto);

/*
 * Convenience form of __skb_flow_get_ports() that passes no separate data
 * buffer (NULL data pointer, zero length).
 */
static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
                                        int thoff, u8 ip_proto)
{
        __be32 ports = __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);

        return ports;
}

/*
 * Flow dissector entry points.  Only the prototypes are visible in this
 * section.
 */
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count);

/* Forward declaration: only used through a pointer below. */
struct bpf_flow_dissector;
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                      __be16 proto, int nhoff, int hlen, unsigned int flags);

/*
 * Core dissector; the skb_flow_dissect*() inline wrappers below all funnel
 * into this function.
 */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container,
                        void *data, __be16 proto, int nhoff, int hlen,
                        unsigned int flags);

/*
 * Dissect @skb with @flow_dissector into @target_container.  Calls
 * __skb_flow_dissect() with a NULL net, a NULL data pointer and zero for
 * proto, nhoff and hlen.
 */
static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, unsigned int flags)
{
        bool ok;

        ok = __skb_flow_dissect(NULL, skb, flow_dissector, target_container,
                                NULL, 0, 0, 0, flags);
        return ok;
}

/*
 * Zero @flow, then dissect @skb into it using the global
 * flow_keys_dissector.  Returns the result of __skb_flow_dissect().
 */
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                                              struct flow_keys *flow,
                                              unsigned int flags)
{
        bool ok;

        memset(flow, 0, sizeof(*flow));
        ok = __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow,
                                NULL, 0, 0, 0, flags);
        return ok;
}

/*
 * Zero @flow, then dissect into it using the global
 * flow_keys_basic_dissector.  @net, @skb, @data, @proto, @nhoff, @hlen and
 * @flags are forwarded unchanged to __skb_flow_dissect(), whose result is
 * returned.
 */
static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net,
                                                    const struct sk_buff *skb,
                                                    struct flow_keys_basic *flow,
                                                    void *data, __be16 proto,
                                                    int nhoff, int hlen,
                                                    unsigned int flags)
{
        bool ok;

        memset(flow, 0, sizeof(*flow));
        ok = __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
                                data, proto, nhoff, hlen, flags);
        return ok;
}

/*
 * Auxiliary dissectors that fill parts of @target_container according to
 * @flow_dissector.  Only the prototypes are visible in this section.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Gets a skb connection tracking info, ctinfo map should be a
 * map of mapsize to translate enum ip_conntrack_info states
 * to user states.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container,
                    u16 *ctinfo_map,
                    size_t mapsize);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container);

void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

static inline __u32 skb_get_hash(struct sk_buff *skb)
{
        if (!skb->l4_hash && !skb->sw_hash)
                __skb_get_hash(skb);

        return skb->hash;
}

/*
 * Return the hash stored in @skb.  If the skb carries neither an L4 hash
 * nor a software hash, one is derived from @fl6 via
 * __get_hash_from_flowi6() and stored as a software hash first.
 */
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
        struct flow_keys keys;
        __u32 hash;

        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        hash = __get_hash_from_flowi6(fl6, &keys);
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));

        return skb->hash;
}

/*
 * Prototype only; defined outside this section.
 * NOTE(review): presumably hashes @skb keyed by the siphash key @perturb --
 * confirm against the definition.
 */
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb);

/* Return the stored hash as-is; never triggers a hash computation. */
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
        __u32 raw = skb->hash;

        return raw;
}

/*
 * Copy the hash value and both of its qualifier bits (sw_hash, l4_hash)
 * from @from to @to.
 */
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
        to->hash = from->hash;
        to->sw_hash = from->sw_hash;
        to->l4_hash = from->l4_hash;
}

/*
 * Copy the ->decrypted flag from @from to @to.  Compiles to nothing
 * unless CONFIG_TLS_DEVICE is defined.
 */
static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from)
{
#if defined(CONFIG_TLS_DEVICE)
        to->decrypted = from->decrypted;
#endif
}

/*
 * ->end is stored either as an offset from ->head (when
 * NET_SKBUFF_DATA_USES_OFFSET is defined) or as a pointer.  These two
 * accessors hide the difference.
 */

/* Return the end of the buffer as a pointer. */
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        return skb->head + skb->end;
#else
        return skb->end;
#endif
}

/* Return the end of the buffer as an offset from ->head. */
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        return skb->end;
#else
        return skb->end - skb->head;
#endif
}

/*
 * Internal: reinterpret the address returned by skb_end_pointer(SKB) as
 * the skb's struct skb_shared_info.
 */
#define skb_shinfo(SKB)        ((struct skb_shared_info *)(skb_end_pointer(SKB)))

static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
        return &skb_shinfo(skb)->hwtstamps;
}

/*
 * Return skb_uarg(@skb) if @skb is non-NULL and has %SKBTX_DEV_ZEROCOPY
 * set in its shared-info tx_flags; otherwise return NULL.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
        if (!skb)
                return NULL;

        if (!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))
                return NULL;

        return skb_uarg(skb);
}

/*
 * Attach @uarg to @skb as its zerocopy descriptor.  Does nothing when
 * @skb or @uarg is NULL, or when skb_zcopy(@skb) is already non-NULL.
 *
 * If @have_ref is non-NULL and *@have_ref is true, *@have_ref is cleared
 * and sock_zerocopy_get() is not called; otherwise sock_zerocopy_get() is
 * called on @uarg.
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
                                 bool *have_ref)
{
        if (!skb || !uarg || skb_zcopy(skb))
                return;

        if (unlikely(have_ref && *have_ref))
                *have_ref = false;
        else
                sock_zerocopy_get(uarg);

        skb_shinfo(skb)->destructor_arg = uarg;
        skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
}

/*
 * Store @val in destructor_arg with bit 0 set as a tag (tested by
 * skb_zcopy_is_nouarg()), and set %SKBTX_ZEROCOPY_FRAG in tx_flags.
 */
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
        uintptr_t tagged = (uintptr_t)val | 0x1UL;

        skb_shinfo(skb)->destructor_arg = (void *)tagged;
        skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
}

/* True when bit 0 of destructor_arg is set (see skb_zcopy_set_nouarg()). */
static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
        uintptr_t arg = (uintptr_t)skb_shinfo(skb)->destructor_arg;

        return (arg & 0x1UL) != 0;
}

/* Return destructor_arg with the bit-0 tag masked off. */
static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
        uintptr_t arg = (uintptr_t)skb_shinfo(skb)->destructor_arg;

        return (void *)(arg & ~0x1UL);
}

/*
 * Release a reference on a zerocopy structure.
 *
 * No-op when skb_zcopy(@skb) is NULL.  Otherwise:
 *  - a tagged "nouarg" value gets no notification callback;
 *  - a uarg whose callback is sock_zerocopy_callback has its ->zerocopy
 *    flag AND-ed with @zerocopy and is released via sock_zerocopy_put();
 *  - any other uarg has its callback invoked with @zerocopy.
 * In all three cases %SKBTX_ZEROCOPY_FRAG is then cleared from tx_flags.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        if (!skb_zcopy_is_nouarg(skb)) {
                if (uarg->callback == sock_zerocopy_callback) {
                        uarg->zerocopy = uarg->zerocopy && zerocopy;
                        sock_zerocopy_put(uarg);
                } else {
                        uarg->callback(uarg, zerocopy);
                }
        }

        skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/*
 * Abort a zerocopy operation and revert zckey on error in send syscall.
 *
 * No-op when skb_zcopy(@skb) is NULL; otherwise calls
 * sock_zerocopy_put_abort() and clears %SKBTX_ZEROCOPY_FRAG from tx_flags.
 */
static inline void skb_zcopy_abort(struct sk_buff *skb)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        sock_zerocopy_put_abort(uarg, false);
        skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/* Mark @skb as not being on any list by clearing its ->next pointer. */
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
        skb->next = NULL;
}

/*
 * Iterate through singly-linked GSO fragments of an skb.
 *
 * @first:    first skb of the chain (may be NULL, giving zero iterations)
 * @skb:      loop cursor
 * @next_skb: scratch variable; loaded from (skb)->next before the loop
 *            body runs, so the body may change or clear (skb)->next
 *            without breaking the walk.
 */
#define skb_list_walk_safe(first, skb, next_skb)                               \
        for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb);  \
             (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)

static inline void skb_list_del_init(struct sk_buff *skb)
{
        __list_del_entry(&skb->list);
        skb_mark_not_on_list(skb);
}

/**
 *        skb_queue_empty - check if a queue is empty
 *        @list: queue head
 *
 *        A queue is empty when the head's ->next points back at the head.
 *        Returns true if the queue is empty, false otherwise.
 */
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
        const struct sk_buff *first = list->next;

        return first == (const struct sk_buff *)list;
}

/**
 *        skb_queue_empty_lockless - check if a queue is empty
 *        @list: queue head
 *
 *        Same test as skb_queue_empty(), but ->next is loaded with
 *        READ_ONCE() so this variant can be used in lockless contexts.
 *        Returns true if the queue is empty, false otherwise.
 */
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
        const struct sk_buff *first = READ_ONCE(list->next);

        return first == (const struct sk_buff *)list;
}


/**
 *        skb_queue_is_last - check if skb is the last entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb is the last buffer on the list, i.e. its
 *        ->next points at the queue head.
 */
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
                                     const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return skb->next == head;
}

/**
 *        skb_queue_is_first - check if skb is the first entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb is the first buffer on the list, i.e. its
 *        ->prev points at the queue head.
 */
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
                                      const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return skb->prev == head;
}

/**
 *        skb_queue_next - return the next packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the next packet in @list after @skb.  It is only valid to
 *        call this if skb_queue_is_last() evaluates to false.
 */
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        /* This BUG_ON may seem severe, but if we just return then we
         * are going to dereference garbage.
         */
        BUG_ON(skb_queue_is_last(list, skb));
        return skb->next;
}

/**
 *        skb_queue_prev - return the prev packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the prev packet in @list before @skb.  It is only valid to
 *        call this if skb_queue_is_first() evaluates to false.
 */
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        /* This BUG_ON may seem severe, but if we just return then we
         * are going to dereference garbage.
         */
        BUG_ON(skb_queue_is_first(list, skb));
        return skb->prev;
}

/**
 *        skb_get - reference buffer
 *        @skb: buffer to reference
 *
 *        Makes another reference to a socket buffer and returns a pointer
 *        to the buffer.
 */
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
        refcount_inc(&skb->users);
        return skb;
}

/*
 * If users == 1, we are the only owner and can avoid redundant atomic changes.
 */

/**
 *        skb_cloned - is the buffer a clone
 *        @skb: buffer to check
 *
 *        Returns true if the buffer was generated with skb_clone() and is
 *        one of multiple shared copies of the buffer. Cloned buffers are
 *        shared data so must not be written to under normal circumstances.
 *
 *        Concretely: ->cloned is set and the %SKB_DATAREF_MASK part of the
 *        shared-info dataref is not exactly 1.
 */
static inline int skb_cloned(const struct sk_buff *skb)
{
        int dataref;

        if (!skb->cloned)
                return 0;

        dataref = atomic_read(&skb_shinfo(skb)->dataref);
        return (dataref & SKB_DATAREF_MASK) != 1;
}

/*
 * If @skb is cloned (see skb_cloned()), give it a private copy of the data
 * via pskb_expand_head(skb, 0, 0, pri) and return that call's result;
 * otherwise return 0.  May sleep if @pri allows blocking.
 */
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 *        skb_header_cloned - is the header a clone
 *        @skb: buffer to check
 *
 *        Returns true if modifying the header part of the buffer requires
 *        the data to be copied.
 *
 *        The shared-info dataref packs two counters: the part selected by
 *        %SKB_DATAREF_MASK and the part above %SKB_DATAREF_SHIFT.  The
 *        header counts as cloned when their difference is not exactly 1.
 */
static inline int skb_header_cloned(const struct sk_buff *skb)
{
        int packed, remaining;

        if (!skb->cloned)
                return 0;

        packed = atomic_read(&skb_shinfo(skb)->dataref);
        remaining = (packed & SKB_DATAREF_MASK) - (packed >> SKB_DATAREF_SHIFT);

        return remaining != 1;
}

/*
 * If the header of @skb is cloned (see skb_header_cloned()), call
 * pskb_expand_head(skb, 0, 0, pri) and return its result; otherwise
 * return 0.  May sleep if @pri allows blocking.
 */
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_header_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 *        __skb_header_release - release reference to header
 *        @skb: buffer to operate on
 *
 *        Sets ->nohdr and overwrites the shared-info dataref with
 *        1 + (1 << %SKB_DATAREF_SHIFT).
 */
static inline void __skb_header_release(struct sk_buff *skb)
{
        const int released = 1 + (1 << SKB_DATAREF_SHIFT);

        skb->nohdr = 1;
        atomic_set(&skb_shinfo(skb)->dataref, released);
}


/**
 *        skb_shared - is the buffer shared
 *        @skb: buffer to check
 *
 *        Returns true if the ->users reference count is anything other
 *        than 1, i.e. more than one person has a reference to this buffer.
 */
static inline int skb_shared(const struct sk_buff *skb)
{
        unsigned int users = refcount_read(&skb->users);

        return users != 1;
}

/**
 *        skb_share_check - check if buffer is shared and if so clone it
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        If the buffer is shared the buffer is cloned and the old copy
 *        drops a reference. A new clone with a single reference is returned.
 *        If the buffer is not shared the original buffer is returned. When
 *        being called from interrupt status or with spinlocks held pri must
 *        be GFP_ATOMIC.
 *
 *        NULL is returned on a memory allocation failure; in that case the
 *        caller's reference to @skb has been dropped with kfree_skb().
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_shared(skb))
                return skb;

        nskb = skb_clone(skb, pri);
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/*
 *        Copy shared buffers into a new sk_buff. We effectively do COW on
 *        packets to handle cases where we have a local reader and forward
 *        and a couple of other messy ones. The normal one is tcpdumping
 *        a packet that's being forwarded.
 */

/**
 *        skb_unshare - make a copy of a shared buffer
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        If the socket buffer is a clone then this function creates a new
 *        copy of the data, drops a reference count on the old copy and returns
 *        the new copy with the reference count at 1. If the buffer is not a clone
 *        the original buffer is returned. When called with a spinlock held or
 *        from interrupt state @pri must be %GFP_ATOMIC
 *
 *        %NULL is returned on a memory allocation failure; in that case the
 *        caller's reference to @skb has been dropped with kfree_skb().
 */
static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *copy;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return skb;

        copy = skb_copy(skb, pri);

        /* Either way our reference to the shared original goes away. */
        if (likely(copy))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return copy;
}

/**
 *        skb_peek - peek at the head of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        Peek an &sk_buff. Unlike most other operations you _MUST_
 *        be careful with this one. A peek leaves the buffer on the
 *        list and someone else may run off with it. You must hold
 *        the appropriate locks or have a private queue to do this.
 *
 *        Returns %NULL for an empty list or a pointer to the head element.
 *        The reference count is not incremented and the reference is therefore
 *        volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        /* ->next pointing back at the head means the queue is empty. */
        return first == (struct sk_buff *)list_ ? NULL : first;
}

/**
 *        __skb_peek - peek at the head of a non-empty &sk_buff_head
 *        @list_: list to peek at
 *
 *        Like skb_peek(), but the caller knows that the list is not empty,
 *        so ->next is returned without the empty-queue check.
 */
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first;
}

/**
 *        skb_peek_next - peek skb following the given one from a queue
 *        @skb: skb to start from
 *        @list_: list to peek at
 *
 *        Returns %NULL when the end of the list is met or a pointer to the
 *        next element. The reference count is not incremented and the
 *        reference is therefore volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
                                            const struct sk_buff_head *list_)
{
        struct sk_buff *following = skb->next;

        /* Reaching the head again means @skb was the last element. */
        return following == (struct sk_buff *)list_ ? NULL : following;
}

/**
 *        skb_peek_tail - peek at the tail of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        Peek an &sk_buff. Unlike most other operations you _MUST_
 *        be careful with this one. A peek leaves the buffer on the
 *        list and someone else may run off with it. You must hold
 *        the appropriate locks or have a private queue to do this.
 *
 *        Returns %NULL for an empty list or a pointer to the tail element.
 *        The reference count is not incremented and the reference is therefore
 *        volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
        /* Single READ_ONCE() load; pairs with the WRITE_ONCE() stores
         * done by the list insertion code.
         */
        struct sk_buff *last = READ_ONCE(list_->prev);

        return last == (struct sk_buff *)list_ ? NULL : last;
}

/**
 *        skb_queue_len        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue, as recorded in ->qlen.
 */
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
        __u32 qlen = list_->qlen;

        return qlen;
}

/**
 *        skb_queue_len_lockless        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue, as recorded in ->qlen.
 *        The load uses READ_ONCE(), so this variant can be used in
 *        lockless contexts.
 */
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
        __u32 qlen = READ_ONCE(list_->qlen);

        return qlen;
}

/**
 *        __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
 *        @list: queue to initialize
 *
 *        This initializes only the list and queue length aspects of
 *        an sk_buff_head object.  This makes it possible to initialize the
 *        list aspects of an sk_buff_head without reinitializing things like
 *        the spinlock.  It can also be used for on-stack sk_buff_head
 *        objects where the spinlock is known to not be used.
 */
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
        struct sk_buff *self = (struct sk_buff *)list;

        /* An empty queue points at itself in both directions. */
        list->next = self;
        list->prev = self;
        list->qlen = 0;
}

/*
 * Fully initialize an sk_buff_head: the spinlock via spin_lock_init() and
 * then the list pointers and length via __skb_queue_head_init().
 *
 * This function creates a split out lock class for each invocation;
 * this is needed for now since a whole lot of users of the skb-queue
 * infrastructure in drivers have different locking usage (in hardirq)
 * than the networking core (in softirq only). In the long run either the
 * network layer or drivers should need annotation to consolidate the
 * main types of usage into 3 classes.
 */
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
        spin_lock_init(&list->lock);
        __skb_queue_head_init(list);
}

/*
 * Like skb_queue_head_init(), but afterwards assigns the lockdep class
 * @class to the queue's spinlock via lockdep_set_class().
 */
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
                struct lock_class_key *class)
{
        skb_queue_head_init(list);
        lockdep_set_class(&list->lock, class);
}

/*
 *        Insert an sk_buff on a list.
 *
 *        The "__skb_xxxx()" functions are the non-atomic ones that
 *        can only be called with interrupts disabled.
 *
 *        @newsk is linked between @prev and @next, and @list's qlen is
 *        incremented.  @newsk's own pointers are written before the
 *        neighbours are made to point at it.
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        WRITE_ONCE(next->prev, newsk);
        WRITE_ONCE(prev->next, newsk);
        /* Pairs with the READ_ONCE() in skb_queue_len_lockless(). */
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

/*
 * Link every element of @list between @prev and @next.  Only the four
 * boundary pointers are rewritten; neither queue's qlen nor @list's own
 * head pointers are touched, so callers must fix those up.  The callers
 * in this file only call it for a non-empty @list.
 */
static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev,
                                      struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        /* Hook the front of the spliced run onto @prev ... */
        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        /* ... and its back onto @next. */
        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 *        skb_queue_splice - join two skb lists, this is designed for stacks
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The elements of @list are inserted at the front of @head and
 *        @head's length is updated.  @list itself is left unmodified.
 *        Does nothing if @list is empty.
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *)head, head->next);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The elements of @list are inserted at the front of @head and
 *        @head's length is updated.  The list at @list is reinitialised.
 *        Does nothing if @list is empty.
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *)head, head->next);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        skb_queue_splice_tail - join two skb lists, each list being a queue
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The elements of @list are appended at the tail of @head and
 *        @head's length is updated.  @list itself is left unmodified.
 *        Does nothing if @list is empty.
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        Each of the lists is a queue.  The elements of @list are appended
 *        at the tail of @head and @head's length is updated.
 *        The list at @list is reinitialised.
 *        Does nothing if @list is empty.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        __skb_queue_after - queue a buffer after a given buffer
 *        @list: list to use
 *        @prev: place after this buffer
 *        @newsk: buffer to queue
 *
 *        Queue a buffer in the middle of a list, directly after @prev.
 *        This function takes no locks and you must therefore hold required
 *        locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        struct sk_buff *next = prev->next;

        __skb_insert(newsk, prev, next, list);
}

/*
 * Prototype only; defined outside this section.
 * NOTE(review): presumably the locked counterpart of __skb_queue_after(),
 * queueing @newsk after @old on @list -- confirm against the definition.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

/*
 * Queue @newsk directly before @next on @list.  Takes no locks; the caller
 * must hold whatever lock protects @list.
 */
static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        struct sk_buff *prev = next->prev;

        __skb_insert(newsk, prev, next, list);
}

/**
 *        __skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of a list, i.e. directly after the
 *        queue head itself. This function takes no locks and you must
 *        therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_after(list, head, newsk);
}
/* Out-of-line variant; defined outside this section. */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);

/**
 *        __skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the end of a list, i.e. directly before the
 *        queue head itself. This function takes no locks and you must
 *        therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_before(list, head, newsk);
}
/* Out-of-line variant; defined outside this section. */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);

/*
 * Remove an sk_buff from a list. _Must_ be called atomically, and with
 * the list known.
 *
 * __skb_unlink() decrements @list's qlen, clears @skb's own next/prev
 * pointers and links its former neighbours to each other.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        struct sk_buff *next, *prev;

        /* Pairs with the READ_ONCE() in skb_queue_len_lockless(). */
        WRITE_ONCE(list->qlen, list->qlen - 1);
        /* Save the neighbours before wiping @skb's links. */
        next           = skb->next;
        prev           = skb->prev;
        skb->next  = skb->prev = NULL;
        WRITE_ONCE(next->prev, prev);
        WRITE_ONCE(prev->next, next);
}

/**
 *        __skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Remove the head of the list. This function does not take any locks
 *        so must be used with appropriate locks held only. The head item is
 *        returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *skb = skb_peek(list);
        if (skb)
                __skb_unlink(skb, list);
        return skb;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);

/**
 *        __skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Remove the tail of the list. This function does not take any locks
 *        so must be used with appropriate locks held only. The tail item is
 *        returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last = skb_peek_tail(list);

        /* Nothing to detach when the peek found no buffer. */
        if (!last)
                return NULL;

        __skb_unlink(last, list);
        return last;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);


/* True when @skb has a non-zero data_len. */
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
        return skb->data_len != 0;
}

/* Total length of @skb minus its data_len part. */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
        const unsigned int total = skb->len;

        return total - skb->data_len;
}

/* Sum of the sizes of all page fragments attached to @skb. */
static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
        unsigned int total = 0;
        int idx = skb_shinfo(skb)->nr_frags;

        /* Walk the fragment array from the last entry down to the first. */
        while (--idx >= 0)
                total += skb_frag_size(&skb_shinfo(skb)->frags[idx]);

        return total;
}

/* Head length of @skb plus the combined size of its page fragments. */
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
        const unsigned int head_bytes = skb_headlen(skb);
        const unsigned int frag_bytes = __skb_pagelen(skb);

        return head_bytes + frag_bytes;
}

/**
 * __skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * Initialises the @i'th fragment of @skb to point to @size bytes at
 * offset @off within @page.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
                                        struct page *page, int off, int size)
{
        skb_frag_t *slot = &skb_shinfo(skb)->frags[i];

        /* Describe the fragment: backing page, start offset, length. */
        slot->bv_page = page;
        slot->bv_offset = off;
        skb_frag_size_set(slot, size);

        /*
         * Propagate page pfmemalloc to the skb if we can. The problem is
         * that not all callers have unique ownership of the page but rely
         * on page_is_pfmemalloc doing the right thing(tm).
         *
         * The check is made on the compound head of @page.
         */
        if (page_is_pfmemalloc(compound_head(page)))
                skb->pfmemalloc = true;
}

/**
 * skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
 * @skb to point to @size bytes at offset @off within @page. In
 * addition updates @skb such that @i is the last fragment.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
                                      struct page *page, int off, int size)
{
        const int frag_count = i + 1;

        __skb_fill_page_desc(skb, i, page, off, size);

        /* Slot @i is now the last fragment in use. */
        skb_shinfo(skb)->nr_frags = frag_count;
}

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                     int size, unsigned int truesize);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize);

#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Offset variant: the tail is stored relative to skb->head. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return &skb->head[skb->tail];
}

/* Offset variant: make the tail coincide with the current data pointer. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->tail = from_head;
}

/* Offset variant: place the tail @offset bytes past the data pointer. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->tail = skb->data - skb->head;
        skb->tail += offset;
}

#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* Pointer variant: the tail is stored as a direct pointer. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        unsigned char *tail = skb->tail;

        return tail;
}

/* Pointer variant: make the tail coincide with the current data pointer. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        unsigned char *start = skb->data;

        skb->tail = start;
}

/* Pointer variant: place the tail @offset bytes past the data pointer. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb->tail = &skb->data[offset];
}

#endif /* NET_SKBUFF_DATA_USES_OFFSET */

/* Debug check on @skb's length; compiles to nothing unless
 * CONFIG_DEBUG_NET is defined.
 */
static inline void skb_assert_len(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        /* Warn when skb->len is zero, naming this function in the message,
         * and then hand @skb to skb_dump() through DO_ONCE_LITE().
         */
        if (WARN_ONCE(!skb->len, "%s\n", __func__))
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
#endif /* CONFIG_DEBUG_NET */
}

/*
 *        Add data to an sk_buff
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
        /* The caller gets the tail as it was before the extension. */
        void *old_tail = skb_tail_pointer(skb);

        SKB_LINEAR_ASSERT(skb);

        /* Grow the accounted length and move the tail by @len bytes. */
        skb->len += len;
        skb->tail += len;

        return old_tail;
}

/* __skb_put() @len bytes and clear them; returns the start of that area. */
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area = __skb_put(skb, len);

        /* memset() returns its destination, i.e. the start of the new area. */
        return memset(area, 0, len);
}

/* __skb_put() @len bytes and fill them from @data; returns the new area. */
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
                                   unsigned int len)
{
        void *area = __skb_put(skb, len);

        /* memcpy() returns its destination, i.e. the start of the new area. */
        return memcpy(area, data, len);
}

/* __skb_put() a single byte and store @val in it. */
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = __skb_put(skb, 1);

        *slot = val;
}

/* skb_put() @len bytes and clear them; returns the start of that area. */
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area = skb_put(skb, len);

        /* memset() returns its destination, i.e. the start of the new area. */
        return memset(area, 0, len);
}

/* skb_put() @len bytes and fill them from @data; returns the new area. */
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
                                 unsigned int len)
{
        void *area = skb_put(skb, len);

        /* memcpy() returns its destination, i.e. the start of the new area. */
        return memcpy(area, data, len);
}

/* skb_put() a single byte and store @val in it. */
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = skb_put(skb, 1);

        *slot = val;
}

void *skb_push(struct sk_buff *skb, unsigned int len);
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
        /* Account for @len more bytes, then move the data start back by
         * the same amount and hand the new start to the caller.
         */
        skb->len += len;
        return skb->data -= len;
}

void *skb_pull(struct sk_buff *skb, unsigned int len);
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
        skb->len -= len;

        /* The remaining length must still cover data_len. */
        BUG_ON(skb->len < skb->data_len);

        /* Advance the data start past the pulled bytes. */
        skb->data += len;
        return skb->data;
}

/* Length-checked __skb_pull(): NULL when @len exceeds skb->len. */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __skb_pull(skb, len);
}

void *__pskb_pull_tail(struct sk_buff *skb, int delta);

/* Pull @len bytes, first asking __pskb_pull_tail() for the part of @len
 * that exceeds the head length. Returns NULL if that request fails.
 */
static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len)
{
        const unsigned int linear = skb_headlen(skb);

        if (len > linear && !__pskb_pull_tail(skb, len - linear))
                return NULL;

        skb->len -= len;
        skb->data += len;
        return skb->data;
}

/* Length-checked __pskb_pull(): NULL when @len exceeds skb->len. */
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __pskb_pull(skb, len);
}

/* Report whether @len bytes are, or can be made, available in the head. */
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
        const unsigned int linear = skb_headlen(skb);

        /* Common case: the head already holds @len bytes. */
        if (likely(len <= linear))
                return true;

        /* More than the whole buffer was asked for. */
        if (unlikely(len > skb->len))
                return false;

        /* Ask __pskb_pull_tail() for the missing bytes. */
        return !!__pskb_pull_tail(skb, len - linear);
}

void skb_condense(struct sk_buff *skb);

/**
 *        skb_headroom - bytes at buffer head
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the head of an &sk_buff.
 */
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
        /* Distance from the buffer start to the data start. */
        const long gap = skb->data - skb->head;

        return gap;
}

/**
 *        skb_tailroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
        /* A nonlinear buffer reports no tailroom at all. */
        if (skb_is_nonlinear(skb))
                return 0;

        return skb->end - skb->tail;
}

/**
 *        skb_availroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        allocated by sk_stream_alloc()
 */
static inline int skb_availroom(const struct sk_buff *skb)
{
        /* Zero for nonlinear buffers; otherwise the space between tail and
         * end, less the reserved tailroom.
         */
        return skb_is_nonlinear(skb) ?
               0 : skb->end - skb->tail - skb->reserved_tailroom;
}

/**
 *        skb_reserve - adjust headroom
 *        @skb: buffer to alter
 *        @len: bytes to move
 *
 *        Increase the headroom of an empty &sk_buff by reducing the tail
 *        room. This is only allowed for an empty buffer.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        /* Shift the tail and the data start by the same amount. */
        skb->tail += len;
        skb->data += len;
}

/**
 *        skb_tailroom_reserve - adjust reserved_tailroom
 *        @skb: buffer to alter
 *        @mtu: maximum amount of headlen permitted
 *        @needed_tailroom: minimum amount of reserved_tailroom
 *
 *        Set reserved_tailroom so that headlen can be as large as possible but
 *        not larger than mtu and tailroom cannot be smaller than
 *        needed_tailroom.
 *        The required headroom should already have been reserved before using
 *        this function.
 */
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
                                        unsigned int needed_tailroom)
{
        unsigned int room;

        SKB_LINEAR_ASSERT(skb);

        room = skb_tailroom(skb);

        /* NOTE(review): the subtraction below is unsigned, so it wraps if
         * the tailroom is smaller than @needed_tailroom - presumably callers
         * guarantee that cannot happen; confirm.
         *
         * Either keep everything beyond @mtu in reserve (use at most mtu),
         * or reserve only @needed_tailroom (use up to all available space).
         */
        skb->reserved_tailroom = (mtu < room - needed_tailroom) ?
                                 room - mtu : needed_tailroom;
}

#define ENCAP_TYPE_ETHER        0
#define ENCAP_TYPE_IPPROTO        1

/* Record @protocol as the inner protocol, tagged ENCAP_TYPE_ETHER. */
static inline void skb_set_inner_protocol(struct sk_buff *skb,
                                          __be16 protocol)
{
        skb->inner_protocol_type = ENCAP_TYPE_ETHER;
        skb->inner_protocol = protocol;
}

/* Record @ipproto as the inner protocol, tagged ENCAP_TYPE_IPPROTO. */
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
                                         __u8 ipproto)
{
        skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
        skb->inner_ipproto = ipproto;
}

/* Copy the mac, network and transport header offsets into their
 * inner_* counterparts.
 */
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
        skb->inner_transport_header = skb->transport_header;
        skb->inner_network_header = skb->network_header;
        skb->inner_mac_header = skb->mac_header;
}

/* Set mac_len to the distance between the mac and network header offsets. */
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
        const int span = skb->network_header - skb->mac_header;

        skb->mac_len = span;
}

/* Address of the inner transport header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
                                                        *skb)
{
        return &skb->head[skb->inner_transport_header];
}

/* Position of the inner transport header relative to skb->data. */
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_inner_transport_header(skb);

        return hdr - skb->data;
}

/* Point the inner transport header at the current data start. */
static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->inner_transport_header = from_head;
}

/* Place the inner transport header @offset bytes from the data start. */
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
                                                   const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->inner_transport_header = skb->data - skb->head;
        skb->inner_transport_header += offset;
}

/* Address of the inner network header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
        return &skb->head[skb->inner_network_header];
}

/* Point the inner network header at the current data start. */
static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->inner_network_header = from_head;
}

/* Place the inner network header @offset bytes from the data start. */
static inline void skb_set_inner_network_header(struct sk_buff *skb,
                                                const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->inner_network_header = skb->data - skb->head;
        skb->inner_network_header += offset;
}

/* Address of the inner mac header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
        return &skb->head[skb->inner_mac_header];
}

/* Point the inner mac header at the current data start. */
static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->inner_mac_header = from_head;
}

/* Place the inner mac header @offset bytes from the data start. */
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
                                            const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->inner_mac_header = skb->data - skb->head;
        skb->inner_mac_header += offset;
}
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
        return skb->transport_header != (typeof(skb->transport_header))~0U;
}

/* Address of the transport header: skb->head plus the stored offset. */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
        return &skb->head[skb->transport_header];
}

/* Point the transport header at the current data start. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->transport_header = from_head;
}

/**
 * skb_reset_transport_header_careful - conditionally reset transport header
 * @skb: buffer to alter
 *
 * Hardened version of skb_reset_transport_header().
 *
 * Returns: true if the operation was a success.
 */
static inline bool __must_check
skb_reset_transport_header_careful(struct sk_buff *skb)
{
        const long distance = skb->data - skb->head;
        const typeof(skb->transport_header) narrowed = distance;
        const typeof(skb->transport_header) unset = ~0U;

        /* Refuse a distance that does not fit the header field. */
        if (unlikely(distance != narrowed))
                return false;

        /* Refuse the all-ones value, which means "not set". */
        if (unlikely(distance == unset))
                return false;

        skb->transport_header = narrowed;
        return true;
}

/* Place the transport header @offset bytes from the data start. */
static inline void skb_set_transport_header(struct sk_buff *skb,
                                            const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->transport_header = skb->data - skb->head;
        skb->transport_header += offset;
}

/* Address of the network header: skb->head plus the stored offset. */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
        return &skb->head[skb->network_header];
}

/* Point the network header at the current data start. */
static inline void skb_reset_network_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->network_header = from_head;
}

/* Place the network header @offset bytes from the data start. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->network_header = skb->data - skb->head;
        skb->network_header += offset;
}

/* Address of the mac header: skb->head plus the stored offset. */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
        return &skb->head[skb->mac_header];
}

/* Position of the mac header relative to skb->data. */
static inline int skb_mac_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_mac_header(skb);

        return hdr - skb->data;
}

/* Distance between the mac header offset and the network header offset. */
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
        const int span = skb->network_header - skb->mac_header;

        return span;
}

static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
        return skb->mac_header != (typeof(skb->mac_header))~0U;
}

static inline void skb_unset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = (typeof(skb->mac_header))~0U;
}

/* Point the mac header at the current data start. */
static inline void skb_reset_mac_header(struct sk_buff *skb)
{
        const long from_head = skb->data - skb->head;

        skb->mac_header = from_head;
}

/* Place the mac header @offset bytes from the data start. */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
        /* Rebase onto the data pointer first, then shift by @offset. */
        skb->mac_header = skb->data - skb->head;
        skb->mac_header += offset;
}

static inline void skb_pop_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->network_header;
}

/* If no transport header is set yet, run the basic flow dissector and,
 * when it succeeds, set the transport header from the dissected thoff.
 */
static inline void skb_probe_transport_header(struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        /* An already-set transport header is left alone. */
        if (skb_transport_header_was_set(skb))
                return;

        /* Leave the header unset when dissection fails. */
        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                              NULL, 0, 0, 0, 0))
                return;

        skb_set_transport_header(skb, keys.control.thoff);
}

/* Move the mac header so that it ends at the current data start. Does
 * nothing when the mac header was never set.
 */
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        /* Remember where the header lived before it is repositioned. */
        old_mac = skb_mac_header(skb);

        skb_set_mac_header(skb, -skb->mac_len);
        /* memmove(): the old and new locations may overlap. */
        memmove(skb_mac_header(skb), old_mac, skb->mac_len);
}

/* Move the full mac header up to current network_header.
 * Leaves skb->data pointing at offset skb->mac_len into the mac_header.
 * Must be provided the complete mac header length.
 */
static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len)
{
        const unsigned char *old_mac;

        /* Nothing to rebuild when the mac header was never set. */
        if (!skb_mac_header_was_set(skb))
                return;

        /* Remember where the header lived before it is repositioned. */
        old_mac = skb_mac_header(skb);

        skb_set_mac_header(skb, -full_mac_len);
        /* memmove(): the old and new locations may overlap. */
        memmove(skb_mac_header(skb), old_mac, full_mac_len);

        /* Push the part of the full header that lies beyond mac_len. */
        __skb_push(skb, full_mac_len - skb->mac_len);
}

/* csum_start with the headroom subtracted. */
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        const unsigned int headroom = skb_headroom(skb);

        return skb->csum_start - headroom;
}

/* Address at csum_start bytes from skb->head. */
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
        return &skb->head[skb->csum_start];
}

/* Position of the transport header relative to skb->data. */
static inline int skb_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_transport_header(skb);

        return hdr - skb->data;
}

/* Distance between the network and transport header offsets. */
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
        const int span = skb->transport_header - skb->network_header;

        return span;
}

/* Distance between the inner network and inner transport header offsets. */
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
        const int span = skb->inner_transport_header - skb->inner_network_header;

        return span;
}

/* Position of the network header relative to skb->data. */
static inline int skb_network_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_network_header(skb);

        return hdr - skb->data;
}

/* Position of the inner network header relative to skb->data. */
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_inner_network_header(skb);

        return hdr - skb->data;
}

/* pskb_may_pull() for @len bytes counted from the network header. */
static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
        const unsigned int needed = skb_network_offset(skb) + len;

        return pskb_may_pull(skb, needed);
}

/*
 * CPUs often take a performance hit when accessing unaligned memory
 * locations. The actual performance hit varies, it can be small if the
 * hardware handles it or large if we have to take an exception and fix it
 * in software.
 *
 * Since an ethernet header is 14 bytes network drivers often end up with
 * the IP header at an unaligned offset. The IP header can be aligned by
 * shifting the start of the packet by 2 bytes. Drivers should do this
 * with:
 *
 * skb_reserve(skb, NET_IP_ALIGN);
 *
 * The downside to this alignment of the IP header is that the DMA is now
 * unaligned. On some architectures the cost of an unaligned DMA is high
 * and this cost outweighs the gains made by aligning the IP header.
 *
 * Since this trade off varies between architectures, we allow NET_IP_ALIGN
 * to be overridden.
 */
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN        2
#endif

/*
 * The networking layer reserves some headroom in skb data (via
 * dev_alloc_skb). This is used to avoid having to reallocate skb data when
 * the header has to grow. In the default case, if the header has to grow
 * 32 bytes or less we avoid the reallocation.
 *
 * Unfortunately this headroom changes the DMA alignment of the resulting
 * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
 * on some architectures. An architecture can override this value,
 * perhaps setting it to a cacheline in size (since that will maintain
 * cacheline alignment of the DMA). It must be a power of 2.
 *
 * Various parts of the networking layer expect at least 32 bytes of
 * headroom, you should not reduce this.
 *
 * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
 * to reduce average number of cache lines per packet.
 * get_rps_cpu(), for example, only accesses one 64-byte aligned block:
 * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
 */
#ifndef NET_SKB_PAD
#define NET_SKB_PAD        max(32, L1_CACHE_BYTES)
#endif

int ___pskb_trim(struct sk_buff *skb, unsigned int len);

/* Set the length of a linear @skb to @len and move its tail to match.
 * Warns and leaves @skb untouched when it is nonlinear.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
        if (WARN_ON(skb_is_nonlinear(skb)))
                return;

        skb_set_tail_pointer(skb, len);
        skb->len = len;
}

/* Thin wrapper: trimming is done by setting the length to @len. */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
        __skb_set_length(skb, len);
}

void skb_trim(struct sk_buff *skb, unsigned int len);

/* Trim @skb to @len: directly when data_len is zero, through
 * ___pskb_trim() otherwise.
 */
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (!skb->data_len) {
                __skb_trim(skb, len);
                return 0;
        }

        return ___pskb_trim(skb, len);
}

/* Trim @skb to @len; a no-op returning 0 unless @len is below skb->len. */
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (len >= skb->len)
                return 0;

        return __pskb_trim(skb, len);
}

/**
 *        pskb_trim_unique - remove end from a paged unique (not cloned) buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        This is identical to pskb_trim except that the caller knows that
 *        the skb is not cloned so we should never get an error due to out-
 *        of-memory.
 */
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
        const int rc = pskb_trim(skb, len);

        /* Any failure here is treated as a bug. */
        BUG_ON(rc);
}

/* Set the length of @skb to @len, expanding the head first when the
 * tailroom is too small. Returns 0, or the pskb_expand_head() error.
 */
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
        const unsigned int extra = len - skb->len;
        const unsigned int room = skb_tailroom(skb);

        if (room < extra) {
                /* Ask only for the bytes that are missing at the tail. */
                int err = pskb_expand_head(skb, 0, extra - room,
                                           GFP_ATOMIC);

                if (err)
                        return err;
        }

        __skb_set_length(skb, len);
        return 0;
}

/**
 *        skb_orphan - orphan a buffer
 *        @skb: buffer to orphan
 *
 *        If a buffer currently has an owner then we call the owner's
 *        destructor function and make the @skb unowned. The buffer continues
 *        to exist but is no longer charged to its former owner.
 */
static inline void skb_orphan(struct sk_buff *skb)
{
        /* With no destructor there must be no socket attached either. */
        if (!skb->destructor) {
                BUG_ON(skb->sk);
                return;
        }

        /* Run the destructor, then drop the destructor and socket links. */
        skb->destructor(skb);
        skb->destructor = NULL;
        skb->sk = NULL;
}

/**
 *        skb_orphan_frags - orphan the frags contained in a buffer
 *        @skb: buffer to orphan frags from
 *        @gfp_mask: allocation mask for replacement pages
 *
 *        For each frag in the SKB which needs a destructor (i.e. has an
 *        owner) create a copy of that frag and release the original
 *        page by calling the destructor.
 */
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
        /* Nothing to do for buffers that are not zerocopy. */
        if (likely(!skb_zcopy(skb)))
                return 0;

        /* Copy unless the uarg exists and its callback is
         * sock_zerocopy_callback.
         */
        if (skb_zcopy_is_nouarg(skb) ||
            skb_uarg(skb)->callback != sock_zerocopy_callback)
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
        /* Non-zerocopy buffers need no work; all others are copied. */
        return likely(!skb_zcopy(skb)) ? 0 : skb_copy_ubufs(skb, gfp_mask);
}

/**
 *        __skb_queue_purge - empty a list
 *        @list: list to empty
 *
 *        Delete all buffers on an &sk_buff list. Each buffer is removed from
 *        the list and one reference dropped. This function does not take the
 *        list lock and the caller must hold the relevant locks to use it.
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
        /* Keep dequeuing from the head until nothing is left. */
        for (;;) {
                struct sk_buff *victim = __skb_dequeue(list);

                if (victim == NULL)
                        break;
                kfree_skb(victim);
        }
}
void skb_queue_purge(struct sk_buff_head *list);

unsigned int skb_rbtree_purge(struct rb_root *root);

void *netdev_alloc_frag(unsigned int fragsz);

struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
                                   gfp_t gfp_mask);

/**
 *        netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @length: length to allocate
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has unspecified headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory. Although this function
 *        allocates memory it can be called from an interrupt.
 */
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
                                               unsigned int length)
{
        /* Same as __netdev_alloc_skb() with the GFP_ATOMIC mask. */
        struct sk_buff *skb = __netdev_alloc_skb(dev, length, GFP_ATOMIC);

        return skb;
}

/* legacy helper around __netdev_alloc_skb() */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
                                              gfp_t gfp_mask)
{
        /* No device is associated with the allocation. */
        struct sk_buff *skb = __netdev_alloc_skb(NULL, length, gfp_mask);

        return skb;
}

/* legacy helper around netdev_alloc_skb() */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
        /* No device is associated with the allocation. */
        struct sk_buff *skb = netdev_alloc_skb(NULL, length);

        return skb;
}


/* Allocate @length + NET_IP_ALIGN bytes and, when NET_IP_ALIGN is non-zero,
 * reserve NET_IP_ALIGN bytes of headroom. Returns NULL on failure.
 */
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
        if (!skb)
                return NULL;

        if (NET_IP_ALIGN)
                skb_reserve(skb, NET_IP_ALIGN);

        return skb;
}

/* __netdev_alloc_skb_ip_align() with the GFP_ATOMIC mask. */
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length)
{
        struct sk_buff *skb = __netdev_alloc_skb_ip_align(dev, length,
                                                          GFP_ATOMIC);

        return skb;
}

/* Thin wrapper: hands @addr to page_frag_free(). */
static inline void skb_free_frag(void *addr)
{
        page_frag_free(addr);
}

void *napi_alloc_frag(unsigned int fragsz);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
                                 unsigned int length, gfp_t gfp_mask);
static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
                                             unsigned int length)
{
        /* Same as __napi_alloc_skb() with the GFP_ATOMIC mask. */
        struct sk_buff *skb = __napi_alloc_skb(napi, length, GFP_ATOMIC);

        return skb;
}
void napi_consume_skb(struct sk_buff *skb, int budget);

void __kfree_skb_flush(void);
void __kfree_skb_defer(struct sk_buff *skb);

/**
 * __dev_alloc_pages - allocate page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 * @order: size of the allocation
 *
 * Allocate a new page.
 *
 * %NULL is returned if there is no free memory.
*/
static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
                                             unsigned int order)
{
        /* This piece of code contains several assumptions.
         * 1.  This is for device Rx, therefore a cold page is preferred.
         * 2.  The expectation is the user wants a compound page.
         * 3.  If requesting an order-0 page it will not be compound
         *     due to the check to see if order has a value in prep_new_page
         * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
         *     code in gfp_to_alloc_flags that should be enforcing this.
         */
        const gfp_t rx_mask = gfp_mask | __GFP_COMP | __GFP_MEMALLOC;

        return alloc_pages_node(NUMA_NO_NODE, rx_mask, order);
}

static inline struct page *dev_alloc_pages(unsigned int order)
{
        return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
}

/**
 * __dev_alloc_page - allocate a page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 *
 * Allocate a new page.
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
{
        /* A single page is an order-0 request. */
        struct page *page = __dev_alloc_pages(gfp_mask, 0);

        return page;
}

/* Order-0 form of dev_alloc_pages(). */
static inline struct page *dev_alloc_page(void)
{
        struct page *page = dev_alloc_pages(0);

        return page;
}

/**
 *        skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
 *        @page: The page that was allocated from skb_alloc_page
 *        @skb: The skb that may need pfmemalloc set
 */
static inline void skb_propagate_pfmemalloc(struct page *page,
                                             struct sk_buff *skb)
{
        /* Only a pfmemalloc page marks the skb; the flag is never cleared. */
        if (!page_is_pfmemalloc(page))
                return;

        skb->pfmemalloc = true;
}

/**
 * skb_frag_off() - Returns the offset of a skb fragment
 * @frag: the paged fragment
 */
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        /* The offset lives in the fragment's bv_offset field. */
        const unsigned int off = frag->bv_offset;

        return off;
}

/**
 * skb_frag_off_add() - Increments the offset of a skb fragment by @delta
 * @frag: skb fragment
 * @delta: value to add
 */
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
        /* @delta is signed, so the offset can move in either direction. */
        frag->bv_offset = frag->bv_offset + delta;
}

/**
 * skb_frag_off_set() - Sets the offset of a skb fragment
 * @frag: skb fragment
 * @offset: offset of fragment
 */
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
        /* Overwrite the fragment's bv_offset field. */
        frag->bv_offset = offset;
}

/**
 * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
 * @fragto: skb fragment where offset is set
 * @fragfrom: skb fragment offset is copied from
 */
static inline void skb_frag_off_copy(skb_frag_t *fragto,
                                     const skb_frag_t *fragfrom)
{
        /* Only the offset is copied; the other fields of @fragto stay. */
        fragto->bv_offset = fragfrom->bv_offset;
}

/**
 * skb_frag_page - retrieve the page referred to by a paged fragment
 * @frag: the paged fragment
 *
 * Returns the &struct page associated with @frag.
 */
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
        /* The page lives in the fragment's bv_page field. */
        struct page *page = frag->bv_page;

        return page;
}

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Takes an additional reference on the paged fragment @frag.
 */
static inline void __skb_frag_ref(skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);

        /* The reference is taken on the fragment's page. */
        get_page(page);
}

/**
 * skb_frag_ref - take an additional reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment index.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
        __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Releases a reference on the paged fragment @frag.
 */
static inline void __skb_frag_unref(skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);

        /* The reference is dropped on the fragment's page. */
        put_page(page);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment index
 *
 * Releases a reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
        __skb_frag_unref(&skb_shinfo(skb)->frags[f]);
}

/**
 * skb_frag_address - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag. The page must already
 * be mapped.
 */
static inline void *skb_frag_address(const skb_frag_t *frag)
{
        /* Page address first, then step to the fragment's offset in it. */
        void *base = page_address(skb_frag_page(frag));

        return base + skb_frag_off(frag);
}

/**
 * skb_frag_address_safe - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag. Checks that the page
 * is mapped and returns %NULL otherwise.
 */
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);
        void *base;

        /* A fragment without a page has no address. */
        if (!page)
                return NULL;

        /* page_address() may yield NULL; pass that on. */
        base = page_address(page);

        return unlikely(!base) ? NULL : base + skb_frag_off(frag);
}

/**
 * skb_frag_page_copy() - sets the page in a fragment from another fragment
 * @fragto: skb fragment where page is set
 * @fragfrom: skb fragment page is copied from
 */
static inline void skb_frag_page_copy(skb_frag_t *fragto,
                                      const skb_frag_t *fragfrom)
{
        /* Only the page pointer is copied; no page_count change is made. */
        struct page *page = fragfrom->bv_page;

        fragto->bv_page = page;
}

/**
 * __skb_frag_set_page - sets the page contained in a paged fragment
 * @frag: the paged fragment
 * @page: the page to set
 *
 * Sets the fragment @frag to contain @page.
 */
static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page)
{
        /* Overwrite the fragment's bv_page field. */
        frag->bv_page = page;
}

/**
 * skb_frag_set_page - sets the page contained in a paged fragment of an skb
 * @skb: the buffer
 * @f: the fragment index
 * @page: the page to set
 *
 * Sets the @f'th fragment of @skb to contain @page.
 */
static inline void skb_frag_set_page(struct sk_buff *skb, int f,
                                     struct page *page)
{
        __skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
}

bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);

/**
 * skb_frag_dma_map - maps a paged fragment via the DMA API
 * @dev: the device to map the fragment to
 * @frag: the paged fragment to map
 * @offset: the offset within the fragment (starting at the
 *          fragment's own offset)
 * @size: the number of bytes to map
 * @dir: the direction of the mapping (``PCI_DMA_*``)
 *
 * Maps the page associated with @frag to @dev.
 */
static inline dma_addr_t skb_frag_dma_map(struct device *dev,
                                          const skb_frag_t *frag,
                                          size_t offset, size_t size,
                                          enum dma_data_direction dir)
{
        struct page *pg = skb_frag_page(frag);
        /* @offset is relative to the fragment, so add the fragment's own. */
        size_t page_off = skb_frag_off(frag) + offset;

        return dma_map_page(dev, pg, page_off, size, dir);
}

/*
 * Copy @skb through __pskb_copy(), asking for the same amount of headroom
 * the original currently has.  The result is whatever __pskb_copy() returns.
 */
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
                                        gfp_t gfp_mask)
{
        unsigned int room = skb_headroom(skb);

        return __pskb_copy(skb, room, gfp_mask);
}


/*
 * Same as pskb_copy() but goes through __pskb_copy_fclone() with its last
 * argument set to true.  The current headroom of @skb is preserved.
 */
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
                                                  gfp_t gfp_mask)
{
        unsigned int room = skb_headroom(skb);

        return __pskb_copy_fclone(skb, room, gfp_mask, true);
}


/**
 *        skb_clone_writable - is the header of a clone writable
 *        @skb: buffer to check
 *        @len: length up to which to write
 *
 *        Returns true if modifying the header part of the cloned buffer
 *        does not require the data to be copied.
 */
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
        /* A cloned header can never be written in place. */
        if (skb_header_cloned(skb))
                return 0;

        /* The write must end within the first hdr_len bytes of the buffer. */
        return skb_headroom(skb) + len <= skb->hdr_len;
}

/*
 * Make the first @write_len bytes of @skb writable if they are not already.
 *
 * Returns 0 when @skb is not cloned, when skb_clone_writable() accepts
 * @write_len, or when pskb_expand_head() succeeds.  Returns 1 (not the
 * underlying error code) when pskb_expand_head() reports a failure.
 */
static inline int skb_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        if (!skb_cloned(skb))
                return 0;

        if (skb_clone_writable(skb, write_len))
                return 0;

        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
}

/*
 * Common worker for skb_cow() and skb_cow_head().
 *
 * Reallocates the head via pskb_expand_head() (GFP_ATOMIC) when @skb has less
 * than @headroom bytes of headroom or when @cloned is non-zero.  The extra
 * headroom requested is the shortfall rounded up to NET_SKB_PAD.
 *
 * Returns 0 if nothing had to be done, otherwise the result of
 * pskb_expand_head().
 */
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                            int cloned)
{
        unsigned int avail = skb_headroom(skb);
        int shortfall = 0;

        if (avail < headroom)
                shortfall = headroom - avail;

        if (!shortfall && !cloned)
                return 0;

        return pskb_expand_head(skb, ALIGN(shortfall, NET_SKB_PAD), 0,
                                GFP_ATOMIC);
}

/**
 *        skb_cow - copy header of skb when it is required
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        If the skb passed lacks sufficient headroom or its data part
 *        is shared, data is reallocated. If reallocation fails, an error
 *        is returned and original skb is not changed.
 *
 *        The result is skb with writable area skb->head...skb->tail
 *        and at least @headroom of space at head.
 */
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
        /* Any clone of the skb forces a reallocation. */
        int shared = skb_cloned(skb);

        return __skb_cow(skb, headroom, shared);
}

/**
 *        skb_cow_head - skb_cow but only making the head writable
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        This function is identical to skb_cow except that we replace the
 *        skb_cloned check by skb_header_cloned.  It should be used when
 *        you only need to push on some header and do not need to modify
 *        the data.
 */
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
        /* Only a cloned header (not cloned payload) forces a reallocation. */
        int hdr_shared = skb_header_cloned(skb);

        return __skb_cow(skb, headroom, hdr_shared);
}

/**
 *        skb_padto        - pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
        unsigned int cur = skb->len;

        /* Short buffer: let skb_pad() supply the missing bytes. */
        if (unlikely(cur < len))
                return skb_pad(skb, len - cur);

        return 0;
}

/**
 *        __skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *        @free_on_error: free buffer on error
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
                                               unsigned int len,
                                               bool free_on_error)
{
        unsigned int cur = skb->len;
        unsigned int missing;

        /* Common case: the buffer is already at least @len bytes long. */
        if (likely(cur >= len))
                return 0;

        missing = len - cur;
        if (__skb_pad(skb, missing, free_on_error))
                return -ENOMEM;

        /* Padding succeeded; account for the new bytes in the skb length. */
        __skb_put(skb, missing);
        return 0;
}

/**
 *        skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
        /* This flavour always asks for the skb to be freed on failure. */
        const bool free_on_error = true;

        return __skb_put_padto(skb, len, free_on_error);
}

/*
 * Append @copy bytes taken from @from to the tail of @skb (via skb_put()).
 *
 * When skb->ip_summed is CHECKSUM_NONE the data is checksummed while being
 * copied and the result is folded into skb->csum at the old length offset.
 *
 * Returns 0 on success.  If the copy fails, @skb is trimmed back to the
 * length it had on entry and -EFAULT is returned.
 */
static inline int skb_add_data(struct sk_buff *skb,
                               struct iov_iter *from, int copy)
{
        const int start = skb->len;
        bool copied;

        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum partial = 0;

                copied = csum_and_copy_from_iter_full(skb_put(skb, copy), copy,
                                                      &partial, from);
                if (copied)
                        skb->csum = csum_block_add(skb->csum, partial, start);
        } else {
                copied = copy_from_iter_full(skb_put(skb, copy), copy, from);
        }

        if (copied)
                return 0;

        /* Undo the skb_put() above. */
        __skb_trim(skb, start);
        return -EFAULT;
}

/*
 * Tell whether data at (@page, @off) directly follows fragment @i - 1 of
 * @skb, i.e. lives on the same page and starts exactly where that fragment
 * ends.  Always false when skb_zcopy() is set for @skb or when @i is 0.
 */
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
{
        const skb_frag_t *prev;

        if (skb_zcopy(skb) || !i)
                return false;

        prev = &skb_shinfo(skb)->frags[i - 1];
        if (page != skb_frag_page(prev))
                return false;

        return off == skb_frag_off(prev) + skb_frag_size(prev);
}

/*
 * Pull all skb->data_len bytes into the linear area with __pskb_pull_tail().
 * Returns 0 on success, -ENOMEM if __pskb_pull_tail() returns NULL.
 */
static inline int __skb_linearize(struct sk_buff *skb)
{
        if (!__pskb_pull_tail(skb, skb->data_len))
                return -ENOMEM;

        return 0;
}

/**
 *        skb_linearize - convert paged skb to linear one
 *        @skb: buffer to linearize
 *
 *        If there is no free memory -ENOMEM is returned, otherwise zero
 *        is returned and the old skb data released.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
        /* Already linear: nothing to do. */
        if (!skb_is_nonlinear(skb))
                return 0;

        return __skb_linearize(skb);
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
        /* The flag is only consulted for skbs that are non-linear. */
        if (!skb_is_nonlinear(skb))
                return false;

        return (skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG) != 0;
}

/**
 *        skb_linearize_cow - make sure skb is linear and writable
 *        @skb: buffer to process
 *
 *        If there is no free memory -ENOMEM is returned, otherwise zero
 *        is returned and the old skb data released.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
        /* Both a paged skb and a cloned skb go through __skb_linearize(). */
        if (skb_is_nonlinear(skb) || skb_cloned(skb))
                return __skb_linearize(skb);

        return 0;
}

/*
 * Fix up checksum state after @len bytes starting at @start were pulled.
 *
 * CHECKSUM_COMPLETE: subtract the checksum of the pulled bytes from
 * skb->csum, using @off as the block offset.
 * CHECKSUM_PARTIAL: if the checksum start offset has become negative,
 * downgrade ip_summed to CHECKSUM_NONE.
 * Any other ip_summed value is left alone.
 */
static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE:
                skb->csum = csum_block_sub(skb->csum,
                                           csum_partial(start, len, 0), off);
                break;
        case CHECKSUM_PARTIAL:
                if (skb_checksum_start_offset(skb) < 0)
                        skb->ip_summed = CHECKSUM_NONE;
                break;
        default:
                break;
        }
}

/**
 *        skb_postpull_rcsum - update checksum for received skb after pull
 *        @skb: buffer to update
 *        @start: start of data before pull
 *        @len: length of data pulled
 *
 *        After doing a pull on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 *        CHECKSUM_NONE so that it can be recomputed from scratch.
 */
static inline void skb_postpull_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        /* Block offset passed to the worker is fixed at zero here. */
        const unsigned int off = 0;

        __skb_postpull_rcsum(skb, start, len, off);
}

static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_block_add(skb->csum,
                                           csum_partial(start, len, 0), off);
}

/**
 *        skb_postpush_rcsum - update checksum for received skb after push
 *        @skb: buffer to update
 *        @start: start of data after push
 *        @len: length of data pushed
 *
 *        After doing a push on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        /* Block offset passed to the worker is fixed at zero here. */
        const unsigned int off = 0;

        __skb_postpush_rcsum(skb, start, len, off);
}

/* Declaration only; the implementation is out of line. */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 *        skb_push_rcsum - push skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pushed
 *
 *        This function performs an skb_push on the packet and updates
 *        the CHECKSUM_COMPLETE checksum.  It should be used on
 *        receive path processing instead of skb_push unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
        skb_push(skb, len);

        /* Account for the @len bytes now sitting at skb->data. */
        __skb_postpush_rcsum(skb, skb->data, len, 0);

        return skb->data;
}

/* Out-of-line path used by pskb_trim_rcsum() when the skb must shrink. */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
/**
 *        pskb_trim_rcsum - trim received skb and update checksum
 *        @skb: buffer to trim
 *        @len: new length
 *
 *        This is exactly the same as pskb_trim except that it ensures the
 *        checksum of received packets is still valid after the operation.
 *        It can change skb pointers.
 */

static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        /* Only an skb longer than @len needs any work. */
        if (unlikely(len < skb->len))
                return pskb_trim_rcsum_slow(skb, len);

        return 0;
}

/*
 * Trim @skb to @len with __skb_trim().  A CHECKSUM_COMPLETE skb is first
 * downgraded to CHECKSUM_NONE.  Always returns 0.
 */
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        const bool had_complete = skb->ip_summed == CHECKSUM_COMPLETE;

        if (had_complete)
                skb->ip_summed = CHECKSUM_NONE;

        __skb_trim(skb, len);

        return 0;
}

/*
 * Grow @skb to @len with __skb_grow().  A CHECKSUM_COMPLETE skb is first
 * downgraded to CHECKSUM_NONE.  Returns the result of __skb_grow().
 */
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
        const bool had_complete = skb->ip_summed == CHECKSUM_COMPLETE;

        if (had_complete)
                skb->ip_summed = CHECKSUM_NONE;

        return __skb_grow(skb, len);
}

/*
 * rbtree helpers for sk_buffs linked through their ->rbnode member.
 *
 * rb_to_skb() maps a struct rb_node pointer to the containing sk_buff with
 * rb_entry_safe().  The first/last/next/prev wrappers apply rb_to_skb() to
 * the result of the matching rb_*() call.
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

/*
 * Queue and rbtree iteration helpers.
 *
 * The skb_queue_* loops terminate when the cursor equals @queue cast to
 * struct sk_buff *.  The *_safe variants load the following (or preceding)
 * element into @tmp before the loop body runs and advance from @tmp, so the
 * body does not need the cursor's own link to remain usable.  The *_from
 * variants do not initialise @skb; the caller supplies the starting element.
 *
 * Note: @skb and @tmp are expanded without parentheses, so they must be
 * plain lvalues.
 */

/* Forward walk over every element of @queue. */
#define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->next)

/* Forward walk over @queue; @tmp holds the next element. */
#define skb_queue_walk_safe(queue, skb, tmp)                                        \
                for (skb = (queue)->next, tmp = skb->next;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Forward walk continuing from the current value of @skb. */
#define skb_queue_walk_from(queue, skb)                                                \
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)

/* Walk the rbtree @root from skb_rb_first() until skb_rb_next() gives NULL. */
#define skb_rbtree_walk(skb, root)                                                \
                for (skb = skb_rb_first(root); skb != NULL;                        \
                     skb = skb_rb_next(skb))

/* rbtree walk continuing from the current value of @skb. */
#define skb_rbtree_walk_from(skb)                                                \
                for (; skb != NULL;                                                \
                     skb = skb_rb_next(skb))

/*
 * rbtree walk continuing from @skb; @tmp is set to skb_rb_next(skb) (or NULL
 * when @skb is NULL) before each iteration's body.
 */
#define skb_rbtree_walk_from_safe(skb, tmp)                                        \
                for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);        \
                     skb = tmp)

/* Forward walk continuing from @skb; @tmp holds the next element. */
#define skb_queue_walk_from_safe(queue, skb, tmp)                                \
                for (tmp = skb->next;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Backward walk over every element of @queue. */
#define skb_queue_reverse_walk(queue, skb) \
                for (skb = (queue)->prev;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->prev)

/* Backward walk over @queue; @tmp holds the previous element. */
#define skb_queue_reverse_walk_safe(queue, skb, tmp)                                \
                for (skb = (queue)->prev, tmp = skb->prev;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* Backward walk continuing from @skb; @tmp holds the previous element. */
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)                        \
                for (tmp = skb->prev;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* Returns true when the shared info of @skb has a non-NULL frag_list. */
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        const struct sk_buff *first = skb_shinfo(skb)->frag_list;

        return first ? true : false;
}

static inline void skb_frag_list_init(struct sk_buff *skb)
{
        skb_shinfo(skb)->frag_list = NULL;
}

/*
 * Iterate @iter over the frag_list chain of @skb, following ->next until a
 * NULL link is reached.
 */
#define skb_walk_frags(skb, iter)        \
        for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)


/*
 * Datagram receive, poll and copy entry points.
 * Declarations only; the implementations are out of line.
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
                                  int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
                           struct iov_iter *to, int size);
/*
 * Convenience wrapper: run skb_copy_datagram_iter() with the iov_iter
 * embedded in @msg as destination and return its result.
 */
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
                                        struct msghdr *msg, int size)
{
        struct iov_iter *dst = &msg->msg_iter;

        return skb_copy_datagram_iter(from, offset, dst, size);
}
/*
 * Further datagram copy/free entry points.
 * Declarations only; the implementations are out of line.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
                                   struct msghdr *msg);
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
/* Calls __skb_free_datagram_locked() with its @len argument set to 0. */
static inline void skb_free_datagram_locked(struct sock *sk,
                                            struct sk_buff *skb)
{
        const int len = 0;

        __skb_free_datagram_locked(sk, skb, len);
}
/*
 * Assorted skb manipulation entry points (bit copy/store, splice, split,
 * segmentation, VLAN/Ethernet/MPLS push and pop, ...).
 * Declarations only; the implementations are out of line.
 */
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
                              int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
                 int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
                                 unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, int write_len);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
                             gfp_t gfp);

/*
 * Copy @len bytes from the iov_iter of @msg into @data.
 * Returns 0 if copy_from_iter_full() succeeds, -EFAULT otherwise.
 */
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
        if (!copy_from_iter_full(data, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/*
 * Copy @len bytes from @data into the iov_iter of @msg.
 * Returns 0 if copy_to_iter() reports exactly @len bytes, -EFAULT otherwise.
 */
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
        if (copy_to_iter(data, len, &msg->msg_iter) != len)
                return -EFAULT;

        return 0;
}

/*
 * Pair of callbacks accepted by __skb_checksum() to plug in a checksum
 * algorithm.
 */
struct skb_checksum_ops {
        /* NOTE(review): presumably folds @len bytes at @mem into @wsum — confirm */
        __wsum (*update)(const void *mem, int len, __wsum wsum);
        /* NOTE(review): presumably merges @csum2 (covering @len bytes at @offset) into @csum — confirm */
        __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
};

/* Pointer to a checksum ops table; defined outside this header. */
extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;

/*
 * Checksum computation over a range of an skb, with (__skb_checksum) or
 * without (skb_checksum) an explicit ops table.  Declarations only.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops);
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
                    __wsum csum);

/*
 * Obtain a pointer to @len bytes of header located @offset bytes into the
 * packet.
 *
 * If the range lies within the first @hlen bytes reachable through @data, a
 * pointer straight into @data is returned.  Otherwise the bytes are copied
 * into @buffer with skb_copy_bits() and @buffer is returned.
 *
 * Returns NULL when the range is not directly reachable and either @skb is
 * NULL or skb_copy_bits() fails.
 */
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset,
                     int len, void *data, int hlen, void *buffer)
{
        /* Fast path: the requested bytes are directly addressable. */
        if (hlen - offset >= len)
                return data + offset;

        /* Slow path: gather the bytes into the caller's buffer. */
        if (skb && skb_copy_bits(skb, offset, buffer, len) >= 0)
                return buffer;

        return NULL;
}

/*
 * __skb_header_pointer() specialised for an skb: the directly addressable
 * region is skb->data with a length of skb_headlen(skb).
 */
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
        void *linear = skb->data;
        int linear_len = skb_headlen(skb);

        return __skb_header_pointer(skb, offset, len, linear, linear_len,
                                    buffer);
}

/**
 *        skb_needs_linearize - check if we need to linearize a given skb
 *                              depending on the given device features.
 *        @skb: socket buffer to check
 *        @features: net device features
 *
 *        Returns true if either:
 *        1. skb has frag_list and the device doesn't support FRAGLIST, or
 *        2. skb is fragmented and the device does not support SG.
 */
static inline bool skb_needs_linearize(struct sk_buff *skb,
                                       netdev_features_t features)
{
        /* A linear skb never needs linearizing. */
        if (!skb_is_nonlinear(skb))
                return false;

        /* Case 1: frag_list present but the device lacks FRAGLIST. */
        if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
                return true;

        /* Case 2: page fragments present but the device lacks SG. */
        return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
}

/* memcpy() @len bytes from skb->data into @to. */
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                             void *to,
                                             const unsigned int len)
{
        const void *src = skb->data;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from skb->data + @offset into @to. */
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
                                                    const int offset, void *to,
                                                    const unsigned int len)
{
        const unsigned char *src = skb->data + offset;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from @from to skb->data. */
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                           const void *from,
                                           const unsigned int len)
{
        void *dst = skb->data;

        memcpy(dst, from, len);
}

/* memcpy() @len bytes from @from to skb->data + @offset. */
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
                                                  const int offset,
                                                  const void *from,
                                                  const unsigned int len)
{
        unsigned char *dst = skb->data + offset;

        memcpy(dst, from, len);
}

/* Declaration only; the implementation is out of line. */
void skb_init(void);

/* Returns the raw skb->tstamp value of @skb. */
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
        ktime_t stamp = skb->tstamp;

        return stamp;
}

/**
 *        skb_get_timestamp - get timestamp from a skb
 *        @skb: skb to get stamp from
 *        @stamp: pointer to struct __kernel_old_timeval to store stamp in
 *
 *        Timestamps are stored in the skb as offsets to a base timestamp.
 *        This function converts the offset back to a struct timeval and stores
 *        it in stamp.
 */
static inline void skb_get_timestamp(const struct sk_buff *skb,
                                     struct __kernel_old_timeval *stamp)
{
        *stamp = ns_to_kernel_old_timeval(skb->tstamp);
}

static inline void skb_get_new_timestamp(const struct sk_buff *skb,
                                         struct __kernel_sock_timeval *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_usec = ts.tv_nsec / 1000;
}

static inline void skb_get_timestampns(const struct sk_buff *skb,
                                       struct __kernel_old_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

static inline void skb_get_new_timestampns(const struct sk_buff *skb,
                                           struct __kernel_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

static inline void __net_timestamp(struct sk_buff *skb)
{
        skb->tstamp = ktime_get_real();
}

/* Returns ktime_get_real() minus @t. */
static inline ktime_t net_timedelta(ktime_t t)
{
        ktime_t now = ktime_get_real();

        return ktime_sub(now, t);
}

/* Returns the ktime_t value 0, used here as the "invalid timestamp" marker. */
static inline ktime_t net_invalid_timestamp(void)
{
        return 0;
}

static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->meta_len;
}

/* The metadata area ends where the MAC header of @skb begins. */
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
        unsigned char *mac = skb_mac_header(skb);

        return mac;
}

/*
 * Compare the @meta_len bytes of metadata that precede skb_metadata_end()
 * in @skb_a and @skb_b.
 *
 * Returns true if the two areas differ, false if they are identical.
 *
 * On 64-bit builds with CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the lengths
 * 4, 8, ..., 32 are compared with an unrolled sequence of word loads that
 * walks backwards from the end of each area.  Every other length falls back
 * to memcmp(), so an unexpected @meta_len is never silently reported as
 * "no difference".
 */
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
                                          const struct sk_buff *skb_b,
                                          u8 meta_len)
{
        const void *a = skb_metadata_end(skb_a);
        const void *b = skb_metadata_end(skb_b);
        /* Using more efficient variant than plain call to memcmp(). */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        u64 diffs = 0;

        switch (meta_len) {
/* __it() steps the cursor back by one u<op>; __it_diff() XORs both words. */
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
        case 32: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 24: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 16: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  8: diffs |= __it_diff(a, b, 64);
                break;
        case 28: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 20: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 12: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  4: diffs |= __it_diff(a, b, 32);
                break;
        default:
                /*
                 * Length not covered by the unrolled cases.  @a and @b have
                 * not been moved yet, so compare the whole area generically.
                 */
                return memcmp(a - meta_len, b - meta_len, meta_len);
        }
        return diffs;
#else
        return memcmp(a - meta_len, b - meta_len, meta_len);
#endif
}

/*
 * Returns true if the metadata of @skb_a and @skb_b differ in length or in
 * content, false if neither has metadata or both areas are identical.
 */
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
                                        const struct sk_buff *skb_b)
{
        u8 len_a = skb_metadata_len(skb_a);
        u8 len_b = skb_metadata_len(skb_b);

        /* No metadata on either side: trivially equal. */
        if (len_a == 0 && len_b == 0)
                return false;

        /* Different lengths can never match. */
        if (len_a != len_b)
                return true;

        return __skb_metadata_differs(skb_a, skb_b, len_a);
}

/* Record @meta_len in the shared info of @skb. */
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        shinfo->meta_len = meta_len;
}

static inline void skb_metadata_clear(struct sk_buff *skb)
{
        skb_metadata_set(skb, 0);
}

/* Declaration only; the implementation is out of line. */
struct sk_buff *skb_clone_sk(struct sk_buff *skb);

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

/* With PHY timestamping enabled these are implemented out of line. */
void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);

#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */

/* Stub: does nothing when PHY timestamping is compiled out. */
static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}

/* Stub: always returns false when PHY timestamping is compiled out. */
static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
        return false;
}

#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */

/**
 * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
 *
 * PHY drivers may accept clones of transmitted packets for
 * timestamping via their phy_driver.txtstamp method. These drivers
 * must call this function to return the skb back to the stack with a
 * timestamp.
 *
 * @skb: clone of the original outgoing packet
 * @hwtstamps: hardware time stamps
 *
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps);

/*
 * NOTE(review): presumably the lower-level form of skb_tstamp_tx() taking an
 * explicit socket @sk and timestamp type @tstype — confirm against the
 * implementation.  Declaration only.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype);

/**
 * skb_tstamp_tx - queue clone of skb with send time stamps
 * @orig_skb:        the original outgoing packet
 * @hwtstamps:        hardware time stamps, may be NULL if not available
 *
 * If the skb has a socket associated, then this function clones the
 * skb (thus sharing the actual data and optional structures), stores
 * the optional hardware time stamping information (if non NULL) or
 * generates a software time stamp (otherwise), then queues the clone
 * to the error queue of the socket.  Errors are silently ignored.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps);

/**
 * skb_tx_timestamp() - Driver hook for transmit timestamping
 *
 * Ethernet MAC Drivers should call this function in their hard_xmit()
 * function immediately before giving the sk_buff to the MAC hardware.
 *
 * Specifically, one should make absolutely sure that this function is
 * called before TX completion of this packet can trigger.  Otherwise
 * the packet could potentially already be freed.
 *
 * @skb: A socket buffer.
 */
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
        skb_clone_tx_timestamp(skb);

        /* A software timestamp is generated only when the skb asks for one. */
        if (!(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
                return;

        skb_tstamp_tx(skb, NULL);
}

/**
 * skb_complete_wifi_ack - deliver skb with wifi status
 *
 * @skb: the original outgoing packet
 * @acked: ack status
 *
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);

/*
 * Out-of-line checksum verification helpers.  __skb_checksum_complete() is
 * what skb_checksum_complete() calls when the checksum cannot be skipped.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);

/*
 * Returns non-zero when the checksum of @skb does not need to be verified
 * in software: ip_summed is CHECKSUM_UNNECESSARY, csum_valid is set, or
 * ip_summed is CHECKSUM_PARTIAL with a non-negative checksum start offset.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY || skb->csum_valid)
                return 1;

        return skb->ip_summed == CHECKSUM_PARTIAL &&
               skb_checksum_start_offset(skb) >= 0;
}

/**
 *        skb_checksum_complete - Calculate checksum of an entire packet
 *        @skb: packet to process
 *
 *        This function calculates the checksum over the entire packet plus
 *        the value of skb->csum.  The latter can be used to supply the
 *        checksum of a pseudo header as used by TCP/UDP.  It returns the
 *        checksum.
 *
 *        For protocols that contain complete checksums such as ICMP/TCP/UDP,
 *        this function can be used to verify that checksum on received
 *        packets.  In that case the function should return zero if the
 *        checksum is correct.  In particular, this function will return zero
 *        if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the
 *        hardware has already verified the correctness of the checksum.
 */
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
        /* Nothing to verify: report a good (zero) checksum. */
        if (skb_csum_unnecessary(skb))
                return 0;

        return __skb_checksum_complete(skb);
}

/*
 * Consume one level of CHECKSUM_UNNECESSARY: decrement csum_level, or drop
 * to CHECKSUM_NONE when csum_level is already 0.  No effect for any other
 * ip_summed value.
 */
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level)
                skb->csum_level--;
        else
                skb->ip_summed = CHECKSUM_NONE;
}

/*
 * Add one level of CHECKSUM_UNNECESSARY: bump csum_level (capped at
 * SKB_MAX_CSUM_LEVEL) if already CHECKSUM_UNNECESSARY, or switch a
 * CHECKSUM_NONE skb to CHECKSUM_UNNECESSARY at level 0.  Other ip_summed
 * values are left untouched.
 */
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
        switch (skb->ip_summed) {
        case CHECKSUM_UNNECESSARY:
                if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
                        skb->csum_level++;
                break;
        case CHECKSUM_NONE:
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                skb->csum_level = 0;
                break;
        default:
                break;
        }
}

/*
 * Turn a CHECKSUM_UNNECESSARY skb back into CHECKSUM_NONE and clear its
 * csum_level.  No effect for any other ip_summed value.
 */
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        skb->ip_summed = CHECKSUM_NONE;
        skb->csum_level = 0;
}

/* Check if we need to perform checksum complete validation.
 *
 * Returns true if checksum complete is needed, false otherwise
 * (either checksum is unnecessary or zero checksum is allowed).
 */
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
                                                  bool zero_okay,
                                                  __sum16 check)
{
        bool skip = skb_csum_unnecessary(skb) || (zero_okay && !check);

        if (!skip)
                return true;

        /* Validation is skipped: mark valid and consume one csum level. */
        skb->csum_valid = 1;
        __skb_decr_checksum_unnecessary(skb);
        return false;
}

/* Packets whose skb->len is at most CHECKSUM_BREAK bytes get their full
 * checksum computed right away in __skb_checksum_validate_complete()
 * instead of being deferred.
 */
#define CHECKSUM_BREAK 76

/* Unset checksum-complete
 *
 * To be used when the packet is being modified (uncompressed, for
 * instance) so that a stored checksum-complete value is no longer valid:
 * CHECKSUM_COMPLETE is downgraded to CHECKSUM_NONE, any other state is
 * left as is.
 */
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/* Validate (init) checksum based on checksum complete.
 *
 * @psum is the pseudo-header checksum. With CHECKSUM_COMPLETE, the packet
 * is accepted immediately if @psum plus skb->csum folds to zero. Otherwise
 * @psum is stored in skb->csum and the full checksum is computed right away
 * when @complete is set or the packet is no longer than CHECKSUM_BREAK.
 *
 * Return values:
 *   0: checksum is validated, or validation is deferred to
 *        skb_checksum_complete. In the latter case ip_summed will not be
 *        CHECKSUM_UNNECESSARY and the pseudo checksum is stored in
 *        skb->csum for use in __skb_checksum_complete
 *   non-zero: value of invalid checksum
 *
 */
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
                                                       bool complete,
                                                       __wsum psum)
{
        __sum16 folded;

        if (skb->ip_summed == CHECKSUM_COMPLETE &&
            !csum_fold(csum_add(psum, skb->csum))) {
                skb->csum_valid = 1;
                return 0;
        }

        skb->csum = psum;

        /* Large packet and no forced completion: leave it for later. */
        if (!complete && skb->len > CHECKSUM_BREAK)
                return 0;

        folded = __skb_checksum_complete(skb);
        skb->csum_valid = !folded;
        return folded;
}

/* Pseudo-header callback that contributes nothing: both arguments are
 * ignored and the result is always 0. Used by
 * skb_checksum_simple_validate().
 */
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
        const __wsum no_pseudo = 0;

        return no_pseudo;
}

/* Perform checksum validate (init). Note that this is a macro since we only
 * want to calculate the pseudo header, which is an input function, if
 * necessary. First we try to validate without any computation (checksum
 * unnecessary) and then calculate based on checksum complete, calling the
 * function to compute the pseudo header.
 *
 * @skb is evaluated exactly once and may be any expression of type
 * struct sk_buff *. @compute_pseudo(skb, proto) is only invoked when
 * __skb_checksum_validate_needed() returns true.
 *
 * Return values:
 *   0: checksum is validated or try to in skb_checksum_complete
 *   non-zero: value of invalid checksum
 */
#define __skb_checksum_validate(skb, proto, complete,                        \
                                zero_okay, check, compute_pseudo)        \
({                                                                        \
        struct sk_buff *__csum_skb = (skb);                                \
        __sum16 __ret = 0;                                                \
        __csum_skb->csum_valid = 0;                                        \
        if (__skb_checksum_validate_needed(__csum_skb, zero_okay, check)) \
                __ret = __skb_checksum_validate_complete(__csum_skb,        \
                                complete,                                \
                                compute_pseudo(__csum_skb, proto));        \
        __ret;                                                                \
})

/* Convenience wrappers around __skb_checksum_validate(). They differ only
 * in the fixed values passed for @complete, @zero_okay and @check.
 */

/* complete = false, zero checksum not accepted. */
#define skb_checksum_init(skb, proto, compute_pseudo)                        \
        __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)

/* complete = false, a zero @check is accepted as valid. */
#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo)        \
        __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)

/* complete = true, zero checksum not accepted. */
#define skb_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)

/* complete = true, a zero @check is accepted as valid. */
#define skb_checksum_validate_zero_check(skb, proto, check,                \
                                         compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)

/* complete = true, no pseudo header (null_compute_pseudo, proto 0). */
#define skb_checksum_simple_validate(skb)                                \
        __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)

static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
        return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
}

/* Switch @skb to CHECKSUM_COMPLETE, storing the one's complement of the
 * pseudo-header checksum @pseudo in skb->csum.
 */
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum = ~pseudo;
}

/* Convert @skb to CHECKSUM_COMPLETE if __skb_checksum_convert_check()
 * allows it. @compute_pseudo(skb, proto) is only invoked in that case.
 * @skb is evaluated exactly once.
 */
#define skb_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        struct sk_buff *__conv_skb = (skb);                                \
        if (__skb_checksum_convert_check(__conv_skb))                        \
                __skb_checksum_convert(__conv_skb,                        \
                                compute_pseudo(__conv_skb, proto));        \
} while (0)

/* Mark @skb as CHECKSUM_PARTIAL for remote checksum offload: csum_start
 * is set to the offset of @ptr + @start from skb->head, and csum_offset
 * to @offset - @start.
 */
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
                                              u16 start, u16 offset)
{
        unsigned char *csum_base = (unsigned char *)ptr + start;

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = csum_base - skb->head;
        skb->csum_offset = offset - start;
}

/* Update skbuf and packet to reflect the remote checksum offload operation.
 * When called, ptr indicates the starting point for skb->csum when
 * ip_summed is CHECKSUM_COMPLETE. If we need to create checksum complete
 * here, skb_postpull_rcsum is done so that skb->csum starts at ptr.
 *
 * Unless @nopartial is set, the work is delegated to
 * skb_remcsum_adjust_partial() and nothing else is done.
 */
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
                                       int start, int offset, bool nopartial)
{
        if (nopartial) {
                __wsum adj;

                if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
                        __skb_checksum_complete(skb);
                        skb_postpull_rcsum(skb, skb->data,
                                           ptr - (void *)skb->data);
                }

                adj = remcsum_adjust(ptr, skb->csum, start, offset);

                /* Adjust skb->csum since we changed the packet */
                skb->csum = csum_add(skb->csum, adj);
                return;
        }

        skb_remcsum_adjust_partial(skb, ptr, start, offset);
}

/* Return the conntrack pointer stored in skb->_nfct (the bits selected by
 * NFCT_PTRMASK), or NULL when conntrack support is not built.
 */
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
        struct nf_conntrack *ct = NULL;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        ct = (void *)(skb->_nfct & NFCT_PTRMASK);
#endif
        return ct;
}

/* Return the raw skb->_nfct word, or 0 when conntrack support is not
 * built.
 */
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
        unsigned long raw = 0UL;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        raw = skb->_nfct;
#endif
        return raw;
}

/* Store the raw word @nfct in skb->_nfct. No-op when conntrack support is
 * not built. No reference counting is done here.
 */
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        skb->_nfct = nfct;
#endif
}

#ifdef CONFIG_SKB_EXTENSIONS
/* Identifiers of the available skb extensions.
 *
 * Each enumerator only exists when its config option is enabled, so the
 * numeric values depend on the kernel configuration. An id is used as an
 * index into skb_ext::offset[] and as a bit position in
 * skb->active_extensions.
 */
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
        SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        SKB_EXT_MPTCP,
#endif
        SKB_EXT_NUM, /* must be last */
};

/**
 *        struct skb_ext - sk_buff extensions
 *        @refcnt: 1 on allocation, deallocated on 0
 *        @offset: per-extension offset, indexed by enum skb_ext_id and
 *                 counted in 8-byte chunks from the start of this
 *                 structure (see skb_ext_find()); 0 means the extension
 *                 is not present (see __skb_ext_exist())
 *        @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
 *        @data: start of extension data, variable sized
 *
 *        Note: offsets/lengths are stored in chunks of 8 bytes, this allows
 *        to use 'u8' types while allowing up to 2kb worth of extension data.
 */
struct skb_ext {
        refcount_t refcnt;
        u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
        u8 chunks;                /* same */
        char data[] __aligned(8);
};

struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);

/* Release @skb's hold on its extension area via __skb_ext_put(), if any
 * extension is active. skb->active_extensions is not cleared here.
 */
static inline void skb_ext_put(struct sk_buff *skb)
{
        if (!skb->active_extensions)
                return;

        __skb_ext_put(skb->extensions);
}

/* Make @dst share @src's extension area.
 *
 * The active_extensions mask is always copied; when it is non-zero the
 * extension area's refcount is incremented and @dst points at the same
 * area. Whatever @dst referenced before is not released here.
 */
static inline void __skb_ext_copy(struct sk_buff *dst,
                                  const struct sk_buff *src)
{
        struct skb_ext *shared;

        dst->active_extensions = src->active_extensions;
        if (!src->active_extensions)
                return;

        shared = src->extensions;
        refcount_inc(&shared->refcnt);
        dst->extensions = shared;
}

/* Replace @dst's extensions with @src's: first put whatever @dst holds,
 * then share @src's extension area through __skb_ext_copy().
 */
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        /* Release the old area before taking over the new one. */
        skb_ext_put(dst);

        __skb_ext_copy(dst, src);
}

/* True when extension @i has a non-zero offset recorded in @ext. */
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
        return ext->offset[i] != 0;
}

/* True when bit @id is set in skb->active_extensions. */
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
        return (skb->active_extensions & (1 << id)) != 0;
}

/* Remove extension @id from @skb; does nothing if it is not active. */
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        if (!skb_ext_exist(skb, id))
                return;

        __skb_ext_del(skb, id);
}

/* Look up extension @id of @skb.
 *
 * Returns a pointer to the extension data, located offset[id] 8-byte
 * chunks past the start of the extension area, or NULL when the extension
 * is not active.
 */
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;

        if (!skb_ext_exist(skb, id))
                return NULL;

        ext = skb->extensions;
        return (void *)ext + (ext->offset[id] << 3);
}

/* Put the extension area of @skb, if any, and clear active_extensions. */
static inline void skb_ext_reset(struct sk_buff *skb)
{
        if (!unlikely(skb->active_extensions))
                return;

        __skb_ext_put(skb->extensions);
        skb->active_extensions = 0;
}

/* True when at least one extension is active on @skb; the result is
 * hinted as unlikely.
 */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return unlikely(skb->active_extensions != 0);
}
#else
/* CONFIG_SKB_EXTENSIONS disabled: the helpers below are empty stubs, and
 * skb_has_extensions() always reports false.
 */
static inline void skb_ext_put(struct sk_buff *skb) {}
static inline void skb_ext_reset(struct sk_buff *skb) {}
static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */

static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(skb));
        skb->_nfct = 0;
#endif
}

/* Clear skb->nf_trace. The field is only touched when the xt TRACE target
 * or nf_tables is enabled.
 */
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        skb->nf_trace = 0;
#endif
}

/* Clear skb->ipvs_property; no-op when IP_VS is not enabled. */
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
        skb->ipvs_property = 0;
#endif
}

/* Copy netfilter state from @src to @dst.
 *
 * With conntrack enabled, @dst receives @src's _nfct word and
 * nf_conntrack_get() is called on @src's conntrack pointer. nf_trace is
 * only copied when @copy is true.
 *
 * Note: This doesn't put any conntrack info in dst; whatever @dst held
 * before is simply overwritten (see nf_copy() for the variant that puts
 * it first).
 */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
                             bool copy)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        dst->_nfct = src->_nfct;
        nf_conntrack_get(skb_nfct(src));
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        if (copy)
                dst->nf_trace = src->nf_trace;
#endif
}

static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(dst));
#endif
        __nf_copy(dst, src, true);
}

/* Copy the security mark from @from to @to. Does nothing when
 * CONFIG_NETWORK_SECMARK is disabled.
 */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
#ifdef CONFIG_NETWORK_SECMARK
        to->secmark = from->secmark;
#endif
}

/* Reset the security mark of @skb to 0. Does nothing when
 * CONFIG_NETWORK_SECMARK is disabled.
 */
static inline void skb_init_secmark(struct sk_buff *skb)
{
#ifdef CONFIG_NETWORK_SECMARK
        skb->secmark = 0;
#endif
}

/* Non-zero when @skb carries the SKB_EXT_SEC_PATH extension; always 0
 * without CONFIG_XFRM.
 */
static inline int secpath_exists(const struct sk_buff *skb)
{
        int present = 0;

#ifdef CONFIG_XFRM
        present = skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#endif
        return present;
}

static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
        return !skb->destructor &&
                !secpath_exists(skb) &&
                !skb_nfct(skb) &&
                !skb->_skb_refdst &&
                !skb_has_frag_list(skb);
}

/* Store @queue_mapping verbatim in skb->queue_mapping. */
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
        skb->queue_mapping = queue_mapping;
}

/* Return the raw skb->queue_mapping value. */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
        u16 mapping = skb->queue_mapping;

        return mapping;
}

/* Copy queue_mapping from @from to @to. */
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
        to->queue_mapping = from->queue_mapping;
}

/* Record receive queue @rx_queue in skb->queue_mapping. The value is
 * stored biased by one, so that 0 can mean "not recorded" (see
 * skb_rx_queue_recorded()).
 */
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        skb->queue_mapping = rx_queue + 1;
}

/* Return the receive queue stored by skb_record_rx_queue(), undoing the
 * +1 bias. The result is only meaningful when skb_rx_queue_recorded().
 */
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        u16 biased = skb->queue_mapping;

        return biased - 1;
}

/* True when skb->queue_mapping is non-zero. */
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return !!skb->queue_mapping;
}

/* Assign @val to skb->dst_pending_confirm. */
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
        skb->dst_pending_confirm = val;
}

/* True when skb->dst_pending_confirm is non-zero. */
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
        return !!skb->dst_pending_confirm;
}

/* Return the SKB_EXT_SEC_PATH extension of @skb, or NULL when it is not
 * present or CONFIG_XFRM is disabled.
 */
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
        struct sec_path *sp = NULL;

#ifdef CONFIG_XFRM
        sp = skb_ext_find(skb, SKB_EXT_SEC_PATH);
#endif
        return sp;
}

/* GSO control block, stored inside skb->cb at SKB_GSO_CB_OFFSET.
 *
 * Keeps track of mac header offset relative to skb->head.
 * It is useful for TSO of Tunneling protocol. e.g. GRE.
 * For non-tunnel skb it points to skb_mac_header() and for
 * tunnel skb it points to outer mac header.
 * Keeps track of level of encapsulation of network headers.
 */
struct skb_gso_cb {
        union {
                /* offset of the (outer) mac header from skb->head */
                int        mac_offset;
                /* shares storage with mac_offset */
                int        data_offset;
        };
        /* level of encapsulation of network headers */
        int        encap_level;
        /* checksum value saved by gso_reset_checksum()/gso_make_checksum() */
        __wsum        csum;
        /* offset from skb->head at which @csum starts */
        __u16        csum_start;
};
/* Byte offset of struct skb_gso_cb within skb->cb. */
#define SKB_GSO_CB_OFFSET        32
/* Access the GSO control block of @skb. */
#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET))

/* Length of the tunnel header of @inner_skb: the distance between the
 * mac offset saved in the GSO control block and the current mac header.
 */
static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
{
        long mac_hdr_off = skb_mac_header(inner_skb) - inner_skb->head;

        return mac_hdr_off - SKB_GSO_CB(inner_skb)->mac_offset;
}

/* Expand the head of @skb by @extra bytes of headroom (GFP_ATOMIC) and
 * shift the saved GSO mac_offset by however much the headroom changed.
 *
 * Returns 0 on success, or the error from pskb_expand_head(), in which
 * case mac_offset is left unchanged.
 */
static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
{
        const int before = skb_headroom(skb);
        int after;
        int err;

        err = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
        if (err)
                return err;

        after = skb_headroom(skb);
        SKB_GSO_CB(skb)->mac_offset += (after - before);
        return 0;
}

/* Save @res and the current checksum start offset (relative to skb->head)
 * in the GSO control block of @skb.
 */
static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
{
        struct skb_gso_cb *cb;

        /* Do not update partial checksums if remote checksum is enabled. */
        if (skb->remcsum_offload)
                return;

        cb = SKB_GSO_CB(skb);
        cb->csum = res;
        cb->csum_start = skb_checksum_start(skb) - skb->head;
}

/* Compute the checksum for a gso segment. First compute the checksum value
 * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
 * then add in the checksum saved in SKB_GSO_CB(skb)->csum (checksum from
 * csum_start to end of packet). The saved csum and csum_start are then
 * updated to reflect the checksum of the resultant packet starting from the
 * transport header -- the resultant checksum is in the res argument (i.e.
 * normally zero or ~ of checksum of a pseudo header).
 *
 * Returns the folded checksum.
 */
static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
{
        struct skb_gso_cb *cb = SKB_GSO_CB(skb);
        unsigned char *th = skb_transport_header(skb);
        int plen = (skb->head + cb->csum_start) - th;
        __wsum partial = cb->csum;

        cb->csum = res;
        cb->csum_start = th - skb->head;

        return csum_fold(csum_partial(th, plen, partial));
}

/* True when the shared info of @skb has a non-zero gso_size. */
static inline bool skb_is_gso(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_size != 0;
}

/* True when gso_type has SKB_GSO_TCPV6 set.
 * Note: Should be called only if skb_is_gso(skb) is true
 */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6);
}

/* True when gso_type has SKB_GSO_SCTP set.
 * Note: Should be called only if skb_is_gso(skb) is true
 */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP);
}

/* True when gso_type has SKB_GSO_TCPV4 or SKB_GSO_TCPV6 set.
 * Note: Should be called only if skb_is_gso(skb) is true
 */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6));
}

/* Clear gso_size, gso_segs and gso_type in the shared info of @skb. */
static inline void skb_gso_reset(struct sk_buff *skb)
{
        skb_shinfo(skb)->gso_type = 0;
        skb_shinfo(skb)->gso_segs = 0;
        skb_shinfo(skb)->gso_size = 0;
}

/* Add @increment to shinfo->gso_size. A gso_size of GSO_BY_FRAGS is left
 * untouched and triggers a one-time warning.
 */
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
                                         u16 increment)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size += increment;
}

/* Subtract @decrement from shinfo->gso_size. A gso_size of GSO_BY_FRAGS
 * is left untouched and triggers a one-time warning.
 */
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
                                         u16 decrement)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size -= decrement;
}

void __skb_warn_lro_forwarding(const struct sk_buff *skb);

/* Detect an LRO-aggregated skb that is about to be forwarded.
 *
 * Returns true, after calling __skb_warn_lro_forwarding(), when @skb is
 * nonlinear and has a gso_size but no gso_type; false otherwise.
 */
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
        /* LRO sets gso_size but not gso_type, whereas if GSO is really
         * wanted then gso_type will be set. */
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (!skb_is_nonlinear(skb) || shinfo->gso_size == 0)
                return false;

        if (!unlikely(shinfo->gso_type == 0))
                return false;

        __skb_warn_lro_forwarding(skb);
        return true;
}

/* Downgrade CHECKSUM_COMPLETE to CHECKSUM_NONE; other states are kept. */
static inline void skb_forward_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        /* Unfortunately we don't support this one.  Any brave souls? */
        skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
 * @skb: skb to check
 *
 * Fresh skbs have their ip_summed set to CHECKSUM_NONE.
 * Instead of forcing ip_summed to CHECKSUM_NONE, this helper documents
 * the places where that is assumed. The check itself (a BUG_ON) is only
 * compiled in when DEBUG is defined; otherwise this is a no-op.
 */
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
#ifdef DEBUG
        BUG_ON(skb->ip_summed != CHECKSUM_NONE);
#endif
}

bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);

int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb));

/**
 * skb_head_is_locked - Determine if the skb->head is locked down
 * @skb: skb to check
 *
 * The head on skbs build around a head frag can be removed if they are
 * not cloned.  This function returns true if the skb head is locked down
 * due to either being allocated via kmalloc, or by being a clone with
 * multiple references to the head.
 */
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
        return !skb->head_frag || skb_cloned(skb);
}

/* Local Checksum Offload.
 * Compute outer checksum based on the assumption that the
 * inner checksum will be offloaded later.
 * See Documentation/networking/checksum-offloads.rst for
 * explanation of how this works.
 * Fill in outer checksum adjustment (e.g. with sum of outer
 * pseudo-header) before calling.
 * Also ensure that inner checksum is in linear data area.
 */
static inline __wsum lco_csum(struct sk_buff *skb)
{
        unsigned char *csum_start = skb_checksum_start(skb);
        unsigned char *l4_hdr = skb_transport_header(skb);
        __wsum partial;

        /* Start with complement of inner checksum adjustment */
        partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
                                                    skb->csum_offset));

        /* Add in checksum of our headers (incl. outer checksum
         * adjustment filled in by caller) and return result.
         */
        return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
}

/* True when skb->redirected is set; always false without
 * CONFIG_NET_REDIRECT.
 */
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
        bool redirected = false;

#ifdef CONFIG_NET_REDIRECT
        redirected = skb->redirected;
#endif
        return redirected;
}

/* Mark @skb as redirected and record whether it came from ingress. For
 * skbs coming from ingress the timestamp is reset to 0. No-op without
 * CONFIG_NET_REDIRECT.
 */
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
        skb->redirected = 1;

        if (skb->from_ingress)
                skb->tstamp = 0;
#endif
}

/* Clear skb->redirected; from_ingress is not touched. No-op without
 * CONFIG_NET_REDIRECT.
 */
static inline void skb_reset_redirect(struct sk_buff *skb)
{
#ifdef CONFIG_NET_REDIRECT
        skb->redirected = 0;
#endif
}

/* True when skb->csum_not_inet is set. */
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
        return !!skb->csum_not_inet;
}

/* Store @kcov_handle in skb->kcov_handle; no-op without CONFIG_KCOV. */
static inline void skb_set_kcov_handle(struct sk_buff *skb,
                                       const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
        skb->kcov_handle = kcov_handle;
#endif
}

/* Return skb->kcov_handle, or 0 without CONFIG_KCOV. */
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
        u64 handle = 0;

#ifdef CONFIG_KCOV
        handle = skb->kcov_handle;
#endif
        return handle;
}

#endif        /* __KERNEL__ */
#endif        /* _LINUX_SKBUFF_H */












































































































    1 




























    1 






























































































































































    1 






































    1 








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM io_uring

#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IO_URING_H

#include <linux/tracepoint.h>

struct io_wq_work;

/**
 * io_uring_create - called after a new io_uring context was prepared
 *
 * @fd:                corresponding file descriptor
 * @ctx:        pointer to a ring context structure
 * @sq_entries:        actual SQ size
 * @cq_entries:        actual CQ size
 * @flags:        SQ ring flags, provided to io_uring_setup(2)
 *
 * Allows tracing io_uring creation and provides a pointer to the context,
 * which can be used later to find correlated events.
 */
TRACE_EVENT(io_uring_create,

        TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),

        TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),

        TP_STRUCT__entry (
                __field(  int,                fd                        )
                __field(  void *,        ctx                        )
                __field(  u32,                sq_entries        )
                __field(  u32,                cq_entries        )
                __field(  u32,                flags                )
        ),

        TP_fast_assign(
                __entry->fd                        = fd;
                __entry->ctx                = ctx;
                __entry->sq_entries        = sq_entries;
                __entry->cq_entries        = cq_entries;
                __entry->flags                = flags;
        ),

        TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
                          __entry->ctx, __entry->fd, __entry->sq_entries,
                          __entry->cq_entries, __entry->flags)
);

/**
 * io_uring_register - called after a buffer/file/eventfd was successfully
 *                                            registered for a ring
 *
 * @ctx:                pointer to a ring context structure
 * @opcode:                describes which operation to perform
 * @nr_files:                number of registered files
 * @nr_bufs:                number of registered buffers
 * @eventfd:                whether an eventfd is registered or not
 * @ret:                return code
 *
 * Allows tracing fixed files/buffers/eventfds, which can be registered to
 * avoid the overhead of getting references to them for every operation. This
 * event, together with io_uring_file_get, can provide a full picture of how
 * much overhead one can reduce via fixing.
 */
TRACE_EVENT(io_uring_register,

        TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
                         unsigned nr_bufs, bool eventfd, long ret),

        TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),

        TP_STRUCT__entry (
                __field(  void *,        ctx                        )
                __field(  unsigned,        opcode                )
                __field(  unsigned,        nr_files        )
                __field(  unsigned,        nr_bufs                )
                __field(  bool,                eventfd                )
                __field(  long,                ret                        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->nr_files        = nr_files;
                __entry->nr_bufs        = nr_bufs;
                __entry->eventfd        = eventfd;
                __entry->ret                = ret;
        ),

        TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
                          "eventfd %d, ret %ld",
                          __entry->ctx, __entry->opcode, __entry->nr_files,
                          __entry->nr_bufs, __entry->eventfd, __entry->ret)
);

/**
 * io_uring_file_get - called before getting references to an SQE file
 *
 * @ctx:        pointer to a ring context structure
 * @fd:                SQE file descriptor
 *
 * Allows tracing how often an SQE file reference is obtained, which can
 * help to figure out whether it makes sense to use fixed files, or to check
 * that fixed files are used correctly.
 */
TRACE_EVENT(io_uring_file_get,

        TP_PROTO(void *ctx, int fd),

        TP_ARGS(ctx, fd),

        TP_STRUCT__entry (
                __field(  void *,        ctx        )
                __field(  int,                fd        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->fd                = fd;
        ),

        TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
);

/**
 * io_uring_queue_async_work - called before submitting a new async work
 *
 * @ctx:        pointer to a ring context structure
 * @rw:                type of workqueue: non-zero is reported as "hashed",
 *                zero as "normal"
 * @req:        pointer to a submitted request
 * @work:        pointer to a submitted io_wq_work
 * @flags:        flags value supplied by the caller, recorded and printed
 *                as is
 *
 * Allows tracing asynchronous work submission.
 */
TRACE_EVENT(io_uring_queue_async_work,

        TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
                         unsigned int flags),

        TP_ARGS(ctx, rw, req, work, flags),

        TP_STRUCT__entry (
                __field(  void *,                        ctx        )
                __field(  int,                                rw        )
                __field(  void *,                        req        )
                __field(  struct io_wq_work *,                work        )
                __field(  unsigned int,                        flags        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->rw        = rw;
                __entry->req        = req;
                __entry->work        = work;
                __entry->flags        = flags;
        ),

        TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
                          __entry->ctx, __entry->req, __entry->flags,
                          __entry->rw ? "hashed" : "normal", __entry->work)
);

/**
 * io_uring_defer - called when an io_uring request is deferred
 *
 * @ctx:        pointer to a ring context structure
 * @req:        pointer to a deferred request
 * @user_data:        user data associated with the request (stored in the
 *                event's "data" field)
 *
 * Allows tracking deferred requests, to get insight into which requests
 * are not started immediately.
 */
TRACE_EVENT(io_uring_defer,

        TP_PROTO(void *ctx, void *req, unsigned long long user_data),

        TP_ARGS(ctx, req, user_data),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  unsigned long long, data        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->req        = req;
                __entry->data        = user_data;
        ),

        TP_printk("ring %p, request %p user_data %llu", __entry->ctx,
                        __entry->req, __entry->data)
);

/**
 * io_uring_link - called before the io_uring request is added to the
 *                    link_list of another request
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to a linked request
 * @target_req:                pointer to a previous request, that would contain @req
 *
 * Allows tracking linked requests, to understand dependencies between
 * requests and how they influence the execution flow.
 */
TRACE_EVENT(io_uring_link,

        TP_PROTO(void *ctx, void *req, void *target_req),

        TP_ARGS(ctx, req, target_req),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  void *,        target_req        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->target_req        = target_req;
        ),

        TP_printk("ring %p, request %p linked after %p",
                          __entry->ctx, __entry->req, __entry->target_req)
);

/**
 * io_uring_cqring_wait - called before starting to wait for an available CQE
 *
 * @ctx:                pointer to a ring context structure
 * @min_events:        minimal number of events to wait for
 *
 * Allows tracking waits for CQEs, so that we can e.g. troubleshoot
 * situations where an application waits for an event that never comes.
 */
TRACE_EVENT(io_uring_cqring_wait,

        TP_PROTO(void *ctx, int min_events),

        TP_ARGS(ctx, min_events),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  int,                min_events        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->min_events        = min_events;
        ),

        TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
);

/**
 * io_uring_fail_link - called before failing a linked request
 *
 * @req:        request whose links were cancelled
 * @link:        cancelled link
 *
 * Allows tracking the cancellation of linked requests, to see not only that
 * some work was cancelled, but also which request was the reason.
 */
TRACE_EVENT(io_uring_fail_link,

        TP_PROTO(void *req, void *link),

        TP_ARGS(req, link),

        TP_STRUCT__entry (
                __field(  void *,        req        )
                __field(  void *,        link        )
        ),

        TP_fast_assign(
                __entry->req        = req;
                __entry->link        = link;
        ),

        TP_printk("request %p, link %p", __entry->req, __entry->link)
);

/**
 * io_uring_complete - called when completing an SQE
 *
 * @ctx:                pointer to a ring context structure
 * @user_data:                user data associated with the request
 * @res:                result of the request
 * @cflags:                completion flags
 *
 * Records the ring, the request's user data, its result and its completion
 * flags (the flags are printed in hex without a "0x" prefix).
 */
TRACE_EVENT(io_uring_complete,

        TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),

        TP_ARGS(ctx, user_data, res, cflags),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u64,                user_data        )
                __field(  int,                res                )
                __field(  unsigned,        cflags                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->user_data        = user_data;
                __entry->res                = res;
                __entry->cflags                = cflags;
        ),

        /* u64 is cast to unsigned long long so that %llx matches on every arch */
        TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
                          __entry->ctx, (unsigned long long)__entry->user_data,
                          __entry->res, __entry->cflags)
);

/**
 * io_uring_submit_sqe - called before submitting one SQE
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to a submitted request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @flags:                request flags
 * @force_nonblock:        whether the context is blocking or not
 * @sq_thread:                true if sq_thread has submitted this SQE
 *
 * Allows tracking of SQE submission, to understand what was the source of it:
 * the SQ thread or an io_uring_enter call.
 */
TRACE_EVENT(io_uring_submit_sqe,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
                 bool force_nonblock, bool sq_thread),

        TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  u32,                flags                )
                __field(  bool,                force_nonblock        )
                __field(  bool,                sq_thread        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->flags                = flags;
                __entry->force_nonblock        = force_nonblock;
                __entry->sq_thread        = sq_thread;
        ),

        /* Note: flags are printed in decimal (%u), the two booleans as 0/1. */
        TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
                  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
                  __entry->opcode, (unsigned long long)__entry->user_data,
                  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
);

/**
 * io_uring_poll_arm - called after arming a poll wait if successful
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to the armed request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                request poll events mask
 * @events:                registered events of interest
 *
 * Allows tracking of which requests are waiting on poll and what the events
 * of interest are.
 */
TRACE_EVENT(io_uring_poll_arm,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
                 int mask, int events),

        TP_ARGS(ctx, req, opcode, user_data, mask, events),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
                __field(  int,                events                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
                __entry->events                = events;
        ),

        TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
                  __entry->ctx, __entry->req, __entry->opcode,
                  (unsigned long long) __entry->user_data,
                  __entry->mask, __entry->events)
);

/*
 * io_uring_poll_wake - poll wake-up trace event
 *
 * @ctx:                pointer to a ring context structure
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                poll events mask (printed in hex with a "0x" prefix)
 *
 * NOTE(review): the call site is not visible from this header; presumably
 * emitted when a poll wait armed for a request is woken up -- confirm against
 * the io_uring core before relying on this description.
 */
TRACE_EVENT(io_uring_poll_wake,

        TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),

        TP_ARGS(ctx, opcode, user_data, mask),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
        ),

        TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
                          __entry->ctx, __entry->opcode,
                          (unsigned long long) __entry->user_data,
                          __entry->mask)
);

/*
 * io_uring_task_add - task work addition trace event
 *
 * @ctx:                pointer to a ring context structure
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                poll events mask
 *
 * Records the same fields as io_uring_poll_wake. Note that @mask is printed
 * in hex WITHOUT a "0x" prefix here, unlike in io_uring_poll_wake; the format
 * string is left as is because trace consumers may parse it.
 *
 * NOTE(review): the call site is not visible from this header; presumably
 * emitted when task work is queued for a request -- confirm.
 */
TRACE_EVENT(io_uring_task_add,

        TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),

        TP_ARGS(ctx, opcode, user_data, mask),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
        ),

        TP_printk("ring %p, op %d, data 0x%llx, mask %x",
                          __entry->ctx, __entry->opcode,
                          (unsigned long long) __entry->user_data,
                          __entry->mask)
);

/**
 * io_uring_task_run - called when task_work_run() executes the poll events
 *                     notification callbacks
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to the armed request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 *
 * Allows tracking of when notified poll events are processed.
 */
TRACE_EVENT(io_uring_task_run,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),

        TP_ARGS(ctx, req, opcode, user_data),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
        ),

        TP_printk("ring %p, req %p, op %d, data 0x%llx",
                  __entry->ctx, __entry->req, __entry->opcode,
                  (unsigned long long) __entry->user_data)
);

#endif /* _TRACE_IO_URING_H */

/* This part must be outside protection */
#include <trace/define_trace.h>








































































































































































































































































































































































































































































































    2 














































    1 















































    1 

    1 































































































































































































































    4 






































    2 

























    3 




















    3 









































































































































































































































































































































































































































































































































































































































































































    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/*
 * Cgroup-specific page state, on top of universal node page state.
 *
 * Numbering starts at NR_VM_NODE_STAT_ITEMS, so these items extend the node
 * stat index space instead of overlapping it. MEMCG_NR_STAT is the total
 * number of indices and sizes the per-memcg stat arrays.
 */
enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        MEMCG_PERCPU_B,        /* counted in bytes, see memcg_stat_item_in_bytes() */
        MEMCG_NR_STAT,
};

/*
 * Indices into the memory_events[] and memory_events_local[] counter arrays
 * of struct mem_cgroup. MEMCG_NR_MEMORY_EVENTS is not an event itself: it is
 * the number of events and therefore the size of those arrays.
 */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Pairs a node with a generation number.
 * NOTE(review): presumably handed to the memcg hierarchy iterator and compared
 * against mem_cgroup_reclaim_iter.generation -- the user is not visible in
 * this header chunk, confirm against the iterator implementation.
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        unsigned int generation;
};

#ifdef CONFIG_MEMCG

/* Width (in bits) and largest value of a private memcg ID */
#define MEM_CGROUP_ID_SHIFT        16
#define MEM_CGROUP_ID_MAX        USHRT_MAX

/*
 * Private memcg ID plus a reference count. Embedded in struct mem_cgroup
 * as ->id, where it is used to identify objects that outlive the cgroup.
 */
struct mem_cgroup_id {
        int id;
        refcount_t ref;
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it is incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 *
 * Each enumerator below names one such periodic event; MEM_CGROUP_NTARGETS
 * is their count and sizes memcg_vmstats_percpu.targets[].
 */
enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
        MEM_CGROUP_NTARGETS,
};

/*
 * Per-CPU statistics block; struct mem_cgroup holds two __percpu instances
 * of it (vmstats_local and vmstats_percpu).
 */
struct memcg_vmstats_percpu {
        /* indexed by stat item, up to MEMCG_NR_STAT; signed */
        long stat[MEMCG_NR_STAT];
        /* indexed by VM event item */
        unsigned long events[NR_VM_EVENT_ITEMS];
        unsigned long nr_page_events;
        /* one slot per enum mem_cgroup_events_target */
        unsigned long targets[MEM_CGROUP_NTARGETS];
};

/* Iteration state kept per node in mem_cgroup_per_node.iter. */
struct mem_cgroup_reclaim_iter {
        /* NOTE(review): presumably the memcg the last walk stopped at -- confirm */
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};

/*
 * One signed counter per node stat item. Used as the __percpu element type
 * of mem_cgroup_per_node.lruvec_stat_local and .lruvec_stat_cpu.
 */
struct lruvec_stat {
        long count[NR_VM_NODE_STAT_ITEMS];
};

/*
 * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
 * which have elements charged to this memcg.
 *
 * Referenced through the __rcu pointer mem_cgroup_per_node.shrinker_map;
 * the embedded rcu_head is there for RCU-deferred handling of the map.
 */
struct memcg_shrinker_map {
        struct rcu_head rcu;
        unsigned long map[];        /* flexible array: the bitmap words */
};

/*
 * Per-node information in the memory controller. struct mem_cgroup keeps one
 * pointer to this structure per node in its trailing nodeinfo[] array.
 */
struct mem_cgroup_per_node {
        /* LRU vector returned by mem_cgroup_lruvec() for this memcg & node */
        struct lruvec                lruvec;

        /* Legacy local VM stats */
        struct lruvec_stat __percpu *lruvec_stat_local;

        /* Subtree VM stats (batched updates) */
        struct lruvec_stat __percpu *lruvec_stat_cpu;
        atomic_long_t                lruvec_stat[NR_VM_NODE_STAT_ITEMS];

        /* Indexed by zone, then by LRU list */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

        struct mem_cgroup_reclaim_iter        iter;

        struct memcg_shrinker_map __rcu        *shrinker_map;

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded */
        bool                        on_tree;
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of           */
};

/*
 * One threshold entry: a threshold value and the eventfd context associated
 * with it.
 * NOTE(review): presumably the eventfd is signalled when usage crosses
 * @threshold -- the signalling code is not visible here, confirm.
 */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/* Variable-length array of thresholds plus the current position in it. */
struct mem_cgroup_threshold_ary {
        /* Index of the threshold that is just below or equal to usage. */
        int current_threshold;
        /* Number of elements in entries[] */
        unsigned int size;
        /* Array of thresholds (flexible array member) */
        struct mem_cgroup_threshold entries[];
};

/*
 * A primary/spare pair of threshold arrays. struct mem_cgroup embeds two of
 * these: one for memory usage and one for mem+swap usage.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Values of mem_cgroup.kmem_state (only present with CONFIG_MEMCG_KMEM).
 * NOTE(review): the names suggest a NONE -> ALLOCATED -> ONLINE lifecycle of
 * kernel-memory accounting; the transitions are not visible here, confirm.
 */
enum memcg_kmem_state {
        KMEM_NONE,
        KMEM_ALLOCATED,
        KMEM_ONLINE,
};

/*
 * MEMCG_PADDING(name) - insert a zero-size, cacheline-aligned member.
 *
 * On SMP builds the member has no storage of its own (char x[0]) but carries
 * the ____cacheline_internodealigned_in_smp alignment, so whatever follows it
 * in the enclosing struct starts on a fresh cacheline. On !SMP builds the
 * macro expands to nothing. struct mem_cgroup uses it (_pad1_, _pad2_) to
 * separate groups of fields.
 *
 * The macro already ends in ';', so "MEMCG_PADDING(x);" yields a harmless
 * extra semicolon.
 */
#if defined(CONFIG_SMP)
struct memcg_padding {
        char x[0];
} ____cacheline_internodealigned_in_smp;
#define MEMCG_PADDING(name)      struct memcg_padding name;
#else
#define MEMCG_PADDING(name)
#endif

/*
 * Remember the four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/* One remembered foreign writeback; mem_cgroup.cgwb_frn[] holds MEMCG_CGWB_FRN_CNT of them. */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of the foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        /* taken/dropped through obj_cgroup_tryget()/_get()/_put() */
        struct percpu_ref refcnt;
        /* owning memcg; read with READ_ONCE() in obj_cgroup_memcg() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        /* list and rcu share storage, so only one of them is in use at a time */
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * The structure ends in a flexible array of per-node pointers, so it must be
 * allocated with room for that array and can never be embedded by value as a
 * non-final member of another structure.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;

        /*
         * Should the OOM killer kill all belonging tasks, had it kill one?
         */
        bool oom_group;

        /* protected by memcg_oom_lock */
        bool                oom_lock;
        int                under_oom;

        int        swappiness;
        /* OOM-Killer disable */
        int                oom_kill_disable;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /* taken only while moving_account > 0 */
        spinlock_t                move_lock;
        unsigned long                move_lock_flags;

        /* start the counters below on their own cacheline (SMP only) */
        MEMCG_PADDING(_pad1_);

        /* indexed by stat item, up to MEMCG_NR_STAT */
        atomic_long_t                vmstats[MEMCG_NR_STAT];
        atomic_long_t                vmevents[NR_VM_EVENT_ITEMS];

        /* memory.events; indexed by enum memcg_memory_event */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        unsigned long                socket_pressure;

        /* Legacy tcp memory accounting */
        bool                        tcpmem_active;
        int                        tcpmem_pressure;

#ifdef CONFIG_MEMCG_KMEM
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        enum memcg_kmem_state kmem_state;
        struct obj_cgroup __rcu *objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;
#endif

        /* start the fields below on their own cacheline (SMP only) */
        MEMCG_PADDING(_pad2_);

        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        struct task_struct        *move_lock_task;

        /* Legacy local VM stats and events */
        struct memcg_vmstats_percpu __percpu *vmstats_local;

        /* Subtree VM stats and events (batched updates) */
        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

        /*
         * Per-node info pointers, indexed by node id (see
         * mem_cgroup_nodeinfo()). Declared as a C99 flexible array member
         * rather than the deprecated zero-length array so that the compiler
         * enforces it being the final member and can diagnose misuse.
         */
        struct mem_cgroup_per_node *nodeinfo[];
        /* WARNING: nodeinfo must be the last member here */
};

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define MEMCG_CHARGE_BATCH 32U

extern struct mem_cgroup *root_mem_cgroup;

/*
 * Tell whether stat item @idx is accounted in bytes. MEMCG_PERCPU_B is the
 * memcg-specific byte-counted item; every other index is delegated to
 * vmstat_item_in_bytes().
 */
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
        return idx == MEMCG_PERCPU_B || vmstat_item_in_bytes(idx);
}

/* True iff @memcg is the root memory cgroup (pointer identity with root_mem_cgroup). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return memcg == root_mem_cgroup;
}

/*
 * Report whether the memory controller is disabled, i.e. whether
 * cgroup_subsys_enabled(memory_cgrp_subsys) evaluates to false.
 */
static inline bool mem_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

/*
 * Fetch the effective min/low protection of @memcg for a reclaim rooted at
 * @root. Both outputs are zero when the controller is disabled or when @memcg
 * is itself the reclaim root; otherwise they are the memcg's current
 * memory.emin and memory.elow values.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        /* Default: no protection at all. */
        *min = *low = 0;

        /*
         * No reclaim protection is applied to the target of a targeted
         * reclaim (root == memcg). This is special-cased here because the
         * mem_cgroup_protected calculation is not robust enough to keep the
         * protection invariant for the calculated effective values when
         * reclaimers with different reclaim targets run in parallel. It is
         * especially a problem for tail memcgs (which have pages on the LRU):
         * they want effective values of 0 for targeted reclaim but a
         * different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

/*
 * Tell whether min/low protection can apply to @memcg at all: the memory
 * controller must be enabled and @memcg must not be the root memcg.
 */
static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg)
{
        if (mem_cgroup_disabled())
                return false;

        /*
         * The root memcg doesn't account charges, and doesn't support
         * protection.
         */
        return !mem_cgroup_is_root(memcg);
}

/*
 * True when @memcg supports protection and its current memory usage does not
 * exceed its effective low protection (memory.elow).
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
{
        return mem_cgroup_supports_protection(memcg) &&
               READ_ONCE(memcg->memory.elow) >=
                        page_counter_read(&memcg->memory);
}

/*
 * True when @memcg supports protection and its current memory usage does not
 * exceed its effective min protection (memory.emin).
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
{
        return mem_cgroup_supports_protection(memcg) &&
               READ_ONCE(memcg->memory.emin) >=
                        page_counter_read(&memcg->memory);
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);

void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);

void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);

/*
 * Return the per-node info of @memcg for node @nid, i.e. the pointer stored
 * at memcg->nodeinfo[nid]. Neither @memcg nor @nid is validated here.
 *
 * Marked inline like every other helper defined in this header, so that
 * translation units which include the header without calling it do not carry
 * an unused out-of-line copy.
 */
static inline struct mem_cgroup_per_node *
mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
{
        return memcg->nodeinfo[nid];
}

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec; NULL selects root_mem_cgroup
 * @pgdat: node (pglist_data) of the wanted lruvec
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. When the memory controller is disabled this is
 * the node's own lruvec (@pgdat->__lruvec) and @memcg is ignored.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                lruvec = &pgdat->__lruvec;
        } else {
                struct mem_cgroup_per_node *mz;

                if (!memcg)
                        memcg = root_mem_cgroup;

                mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
                lruvec = &mz->lruvec;
        }

        /*
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->pgdat here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;

        return lruvec;
}

struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);

/*
 * Map a cgroup_subsys_state back to the mem_cgroup that embeds it.
 * A NULL @css is passed through as NULL.
 */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct mem_cgroup, css);
}

/*
 * Try to take a reference on @objcg; returns the result of
 * percpu_ref_tryget() on objcg->refcnt.
 */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        return percpu_ref_tryget(&objcg->refcnt);
}

/* Take a reference on @objcg via percpu_ref_get() on objcg->refcnt. */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        percpu_ref_get(&objcg->refcnt);
}

/* Drop a reference on @objcg via percpu_ref_put() on objcg->refcnt. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        percpu_ref_put(&objcg->refcnt);
}

/*
 * Return a READ_ONCE() snapshot of objcg->memcg.
 *
 * After the initialization objcg->memcg is always pointing at
 * a valid memcg, but can be atomically swapped to the parent memcg.
 *
 * The caller must ensure that the returned memcg won't be released:
 * e.g. acquire the rcu_read_lock or css_set_lock.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        return READ_ONCE(objcg->memcg);
}

/* Drop a css reference on @memcg; a NULL @memcg is silently ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * Given a pointer @counter to the field @member embedded in a
 * struct mem_cgroup, return the enclosing struct mem_cgroup.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
int mem_cgroup_scan_tasks(struct mem_cgroup *,
                          int (*)(struct task_struct *, void *), void *);

/*
 * Return @memcg's id (memcg->id.id), or 0 when the memory controller
 * is disabled, in which case @memcg is not dereferenced.
 */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

/*
 * Resolve the memcg behind seq_file @m: seq_css() yields the css and
 * mem_cgroup_from_css() converts it (NULL css gives NULL).
 */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return mem_cgroup_from_css(seq_css(m));
}

/*
 * Return the memcg owning @lruvec, found through the
 * mem_cgroup_per_node that embeds it. Returns NULL when the memory
 * controller is disabled.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        if (mem_cgroup_disabled())
                return NULL;

        return container_of(lruvec, struct mem_cgroup_per_node,
                            lruvec)->memcg;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * The parent is derived from the parent of @memcg's memory counter.
 *
 * Returns the parent memcg, or NULL if this is the root or the memory
 * controller is in legacy no-hierarchy mode.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return memcg->memory.parent ?
                mem_cgroup_from_counter(memcg->memory.parent, memory) : NULL;
}

/*
 * True when @memcg is @root itself, or when @root uses hierarchy and
 * cgroup_is_descendant() reports @memcg's cgroup below @root's cgroup.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        return root == memcg ||
               (root->use_hierarchy &&
                cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup));
}

/*
 * Check whether the memcg of @mm's owner task is @memcg or one of its
 * descendants (per mem_cgroup_is_descendant()). The owner lookup and
 * the comparison both run inside an RCU read-side section. Returns
 * false when no memcg is found for the owner.
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool ret;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        ret = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return ret;
}

struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
ino_t page_cgroup_ino(struct page *page);

/*
 * True when the memory controller is disabled (without looking at
 * @memcg), otherwise true iff CSS_ONLINE is set in memcg->css.flags.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ||
               (memcg->css.flags & CSS_ONLINE) != 0;
}

/*
 * For memory reclaim.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read lru_zone_size[@zone_idx][@lru] from the mem_cgroup_per_node
 * that embeds @lruvec, using a single READ_ONCE() load.
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return READ_ONCE(container_of(lruvec, struct mem_cgroup_per_node,
                                      lruvec)->lru_zone_size[zone_idx][lru]);
}

void mem_cgroup_handle_over_high(void);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

/*
 * Mark the current task as being inside a user fault by setting
 * current->in_user_fault. Warns if the flag was already set.
 */
static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

/*
 * Clear current->in_user_fault again. Warns if the flag was not set,
 * i.e. when there was no matching mem_cgroup_enter_user_fault().
 */
static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

/* True when @p->memcg_in_oom is set (converted to bool). */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return p->memcg_in_oom;
}

bool mem_cgroup_oom_synchronize(bool wait);
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

#ifdef CONFIG_MEMCG_SWAP
extern bool cgroup_memory_noswap;
#endif

struct mem_cgroup *lock_page_memcg(struct page *page);
void __unlock_page_memcg(struct mem_cgroup *memcg);
void unlock_page_memcg(struct page *page);

/*
 * Read @memcg's vmstats counter for @idx. On SMP builds a negative
 * reading is reported as zero.
 *
 * idx can be of type enum memcg_stat_item or node_stat_item.
 * Keep in sync with memcg_exact_page_state().
 */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        long val;

        val = atomic_long_read(&memcg->vmstats[idx]);
#ifdef CONFIG_SMP
        /* Clamp a negative reading to zero. */
        val = val < 0 ? 0 : val;
#endif
        return val;
}

/*
 * idx can be of type enum memcg_stat_item or node_stat_item.
 * Keep in sync with memcg_exact_page_state().
 */
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
                                                   int idx)
{
        long x = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
#ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
#endif
        return x;
}

void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);

/*
 * Add @val to @memcg's @idx counter. Wraps __mod_memcg_state() in
 * local_irq_save()/local_irq_restore(), so the update itself runs
 * with local interrupts disabled.
 *
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_memcg_state(memcg, idx, val);
        local_irq_restore(flags);
}

/**
 * __mod_memcg_page_state - update page state statistics
 * @page: the page
 * @idx: page state item to account
 * @val: number of pages (positive or negative)
 *
 * The @page must be locked or the caller must use lock_page_memcg()
 * to prevent double accounting when the page is concurrently being
 * moved to another memcg:
 *
 *   lock_page(page) or lock_page_memcg(page)
 *   if (TestClearPageState(page))
 *     mod_memcg_page_state(page, state, -1);
 *   unlock_page(page) or unlock_page_memcg(page)
 *
 * Kernel pages are an exception to this, since they'll never move.
 *
 * Nothing is updated when @page has no memcg (page->mem_cgroup is
 * NULL). This variant calls __mod_memcg_state() directly and does not
 * disable interrupts itself.
 */
static inline void __mod_memcg_page_state(struct page *page,
                                          int idx, int val)
{
        if (page->mem_cgroup)
                __mod_memcg_state(page->mem_cgroup, idx, val);
}

/*
 * Add @val to the @idx counter of the memcg @page is charged to,
 * through mod_memcg_state(). Pages without a memcg are skipped.
 */
static inline void mod_memcg_page_state(struct page *page,
                                        int idx, int val)
{
        if (!page->mem_cgroup)
                return;

        mod_memcg_state(page->mem_cgroup, idx, val);
}

/*
 * Read the @idx statistic for @lruvec. With the memory controller
 * disabled this is the node-wide value from node_page_state();
 * otherwise it is the lruvec_stat counter of the mem_cgroup_per_node
 * embedding @lruvec, with a negative reading reported as zero on SMP.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        long val;

        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);

        val = atomic_long_read(&container_of(lruvec,
                                             struct mem_cgroup_per_node,
                                             lruvec)->lruvec_stat[idx]);
#ifdef CONFIG_SMP
        if (val < 0)
                return 0;
#endif
        return val;
}

static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        struct mem_cgroup_per_node *pn;
        long x = 0;
        int cpu;

        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);

        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        for_each_possible_cpu(cpu)
                x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
#ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
#endif
        return x;
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val);
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);

void mod_memcg_obj_state(void *p, int idx, int val);

/*
 * Wrapper around __mod_lruvec_slab_state() that runs the update
 * between local_irq_save() and local_irq_restore().
 */
static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_lruvec_slab_state(p, idx, val);
        local_irq_restore(flags);
}

/*
 * Wrapper around __mod_memcg_lruvec_state() that runs the update
 * between local_irq_save() and local_irq_restore().
 */
static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
                                          enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_memcg_lruvec_state(lruvec, idx, val);
        local_irq_restore(flags);
}

/*
 * Wrapper around __mod_lruvec_state() that runs the update between
 * local_irq_save() and local_irq_restore().
 */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(flags);
}

/*
 * Add @val to the @idx statistic on behalf of @page. The memcg is
 * taken from the compound head (rmap may pass tail pages), the node
 * from @page itself. Untracked pages have no memcg and thus no
 * lruvec; for them only the node counter is updated.
 */
static inline void __mod_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx, int val)
{
        struct page *head = compound_head(page);
        pg_data_t *pgdat = page_pgdat(page);

        if (head->mem_cgroup)
                __mod_lruvec_state(mem_cgroup_lruvec(head->mem_cgroup, pgdat),
                                   idx, val);
        else
                __mod_node_page_state(pgdat, idx, val);
}

/*
 * Wrapper around __mod_lruvec_page_state() that runs the update
 * between local_irq_save() and local_irq_restore().
 */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_lruvec_page_state(page, idx, val);
        local_irq_restore(flags);
}

unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);

void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count);

/*
 * Account @count occurrences of event @idx to @memcg. Wraps
 * __count_memcg_events() in local_irq_save()/local_irq_restore().
 */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        unsigned long flags;

        local_irq_save(flags);
        __count_memcg_events(memcg, idx, count);
        local_irq_restore(flags);
}

/*
 * Account a single @idx event to the memcg @page is charged to.
 * Pages without a memcg are skipped.
 */
static inline void count_memcg_page_event(struct page *page,
                                          enum vm_event_item idx)
{
        if (!page->mem_cgroup)
                return;

        count_memcg_events(page->mem_cgroup, idx, 1);
}

/*
 * Account a single @idx event to the memcg of @mm's owner task. The
 * lookup and the accounting run under rcu_read_lock(). Nothing is
 * done when the memory controller is disabled or no memcg is found.
 */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        struct mem_cgroup *memcg;

        if (!mem_cgroup_disabled()) {
                rcu_read_lock();
                memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (likely(memcg))
                        count_memcg_events(memcg, idx, 1);
                rcu_read_unlock();
        }
}

/*
 * Record memory event @event for @memcg and notify the matching
 * cgroup files.
 *
 * The local counter is bumped once, on @memcg only. The hierarchical
 * counter is bumped on @memcg and then on each ancestor in turn,
 * stopping before the root memcg.
 */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        /* Swap events are reported through swap_events_file instead. */
        bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
                          event == MEMCG_SWAP_FAIL;

        atomic_long_inc(&memcg->memory_events_local[event]);
        /* No local-file notification is issued for swap events. */
        if (!swap_event)
                cgroup_file_notify(&memcg->events_local_file);

        do {
                atomic_long_inc(&memcg->memory_events[event]);
                if (swap_event)
                        cgroup_file_notify(&memcg->swap_events_file);
                else
                        cgroup_file_notify(&memcg->events_file);

                /* Do not propagate upwards outside the default hierarchy. */
                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                /* Nor when the root was flagged for local-only events. */
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
}

/*
 * Record memory event @event against the memcg of @mm's owner task.
 * The lookup and memcg_memory_event() run under rcu_read_lock().
 * Nothing is done when the memory controller is disabled or no memcg
 * is found.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct mem_cgroup *memcg;

        if (!mem_cgroup_disabled()) {
                rcu_read_lock();
                memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (likely(memcg))
                        memcg_memory_event(memcg, event);
                rcu_read_unlock();
        }
}

void split_page_memcg(struct page *head, unsigned int nr);

#else /* CONFIG_MEMCG */

/* !CONFIG_MEMCG: both the memcg id shift and the maximum id are zero. */
#define MEM_CGROUP_ID_SHIFT        0
#define MEM_CGROUP_ID_MAX        0

struct mem_cgroup;

/* !CONFIG_MEMCG stub: every memcg is reported as the root. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: the memory controller always reports disabled. */
static inline bool mem_cgroup_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: memory events are not recorded. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* !CONFIG_MEMCG stub: memory events are not recorded. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/*
 * !CONFIG_MEMCG stub: no protection exists, so both *@min and *@low
 * are reported as zero. @root and @memcg are ignored.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;
}

/* !CONFIG_MEMCG stub: there is no protection to calculate. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no memcg is ever below its low protection. */
static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: no memcg is ever below its min protection. */
static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: charging does nothing and always returns 0. */
static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
                                    gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing to uncharge. */
static inline void mem_cgroup_uncharge(struct page *page)
{
}

/* !CONFIG_MEMCG stub: nothing to uncharge. */
static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
}

/* !CONFIG_MEMCG stub: no charge to move from @old to @new. */
static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}

/* !CONFIG_MEMCG stub: always the node-wide lruvec, @memcg is ignored. */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: always the node-wide lruvec, @page is ignored. */
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
                                                    struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: there is never a parent memcg. */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: every mm matches every memcg. */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: no memcg exists, always NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no memcg exists, always NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no reference to drop. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: the iteration yields nothing, always NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: nothing to clean up after an iteration. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* !CONFIG_MEMCG stub: @fn is never called; always returns 0. */
static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                int (*fn)(struct task_struct *, void *), void *arg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: the only id ever reported is 0. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * !CONFIG_MEMCG stub: always returns NULL, and warns once if asked
 * for a non-zero @id.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);
        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

/* !CONFIG_MEMCG stub: no memcg exists, always NULL. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: an lruvec has no owning memcg, always NULL. */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always reports online. */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: no per-memcg LRU sizes are kept, always 0. */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always 0. */
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always 0. */
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: takes no lock and returns NULL. */
static inline struct mem_cgroup *lock_page_memcg(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: nothing to unlock. */
static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: nothing to unlock. */
static inline void unlock_page_memcg(struct page *page)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_handle_over_high(void)
{
}

/* !CONFIG_MEMCG stub: no user-fault state is tracked. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* !CONFIG_MEMCG stub: no user-fault state is tracked. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* !CONFIG_MEMCG stub: a task is never in memcg OOM. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* !CONFIG_MEMCG stub: there is no OOM group, always NULL. */
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
        struct task_struct *victim, struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics, always 0. */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no per-memcg statistics, always 0. */
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
                                                   int idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no per-memcg statistics to update. */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     int idx,
                                     int nr)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics to update. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx,
                                   int nr)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics to update. */
static inline void __mod_memcg_page_state(struct page *page,
                                          int idx,
                                          int nr)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics to update. */
static inline void mod_memcg_page_state(struct page *page,
                                        int idx,
                                        int nr)
{
}

/* !CONFIG_MEMCG variant: report the node-wide value for @idx. */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/* !CONFIG_MEMCG variant: report the node-wide value for @idx. */
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/* !CONFIG_MEMCG stub: no memcg part of the lruvec state to update. */
static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
                                            enum node_stat_item idx, int val)
{
}

/* !CONFIG_MEMCG variant: only the node counter of @lruvec's node is updated. */
static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
{
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG variant: only the node counter of @lruvec's node is updated. */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG variant: only the node counter of @page's node is updated. */
static inline void __mod_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx, int val)
{
        __mod_node_page_state(page_pgdat(page), idx, val);
}

/* !CONFIG_MEMCG variant: only the node counter of @page's node is updated. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        mod_node_page_state(page_pgdat(page), idx, val);
}

/*
 * !CONFIG_MEMCG variant: look up the head page backing object @p and
 * update only the node counter of that page's node.
 */
static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
                                           int val)
{
        __mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/*
 * !CONFIG_MEMCG variant: look up the head page backing object @p and
 * update only the node counter of that page's node.
 */
static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
                                         int val)
{
        mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/* !CONFIG_MEMCG stub: no per-memcg statistics to update. */
static inline void mod_memcg_obj_state(void *p, int idx, int val)
{
}

/* !CONFIG_MEMCG stub: always returns 0; @total_scanned is not written. */
static inline
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no memcg state to split. */
static inline void split_page_memcg(struct page *head, unsigned int nr)
{
}

/* !CONFIG_MEMCG stub: events are not counted. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* !CONFIG_MEMCG stub: events are not counted. */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/*
 * !CONFIG_MEMCG stub: events are not counted.
 *
 * NOTE(review): @idx is declared int here but enum vm_event_item in
 * the CONFIG_MEMCG variant of this function.
 */
static inline void count_memcg_page_event(struct page *page,
                                          int idx)
{
}

/* !CONFIG_MEMCG stub: events are not counted. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
#endif /* CONFIG_MEMCG */

/*
 * Add one to @memcg's @idx counter via __mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __inc_memcg_state(struct mem_cgroup *memcg,
                                     int idx)
{
        __mod_memcg_state(memcg, idx, 1);
}

/*
 * Subtract one from @memcg's @idx counter via __mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __dec_memcg_state(struct mem_cgroup *memcg,
                                     int idx)
{
        __mod_memcg_state(memcg, idx, -1);
}

/*
 * Add one to the @idx counter for @page via __mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __inc_memcg_page_state(struct page *page,
                                          int idx)
{
        __mod_memcg_page_state(page, idx, 1);
}

/*
 * Subtract one from the @idx counter for @page via
 * __mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __dec_memcg_page_state(struct page *page,
                                          int idx)
{
        __mod_memcg_page_state(page, idx, -1);
}

/* Add one to @lruvec's @idx statistic via __mod_lruvec_state(). */
static inline void __inc_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx)
{
        __mod_lruvec_state(lruvec, idx, 1);
}

/* Subtract one from @lruvec's @idx statistic via __mod_lruvec_state(). */
static inline void __dec_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx)
{
        __mod_lruvec_state(lruvec, idx, -1);
}

/* Add one to the @idx statistic for @page via __mod_lruvec_page_state(). */
static inline void __inc_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx)
{
        __mod_lruvec_page_state(page, idx, 1);
}

/* Subtract one from the @idx statistic for @page via __mod_lruvec_page_state(). */
static inline void __dec_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx)
{
        __mod_lruvec_page_state(page, idx, -1);
}

/* Add one to the @idx statistic for object @p via __mod_lruvec_slab_state(). */
static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_slab_state(p, idx, 1);
}

/* Subtract one from the @idx statistic for object @p via __mod_lruvec_slab_state(). */
static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_slab_state(p, idx, -1);
}

/*
 * Add one to @memcg's @idx counter via mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void inc_memcg_state(struct mem_cgroup *memcg,
                                   int idx)
{
        mod_memcg_state(memcg, idx, 1);
}

/*
 * Subtract one from @memcg's @idx counter via mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void dec_memcg_state(struct mem_cgroup *memcg,
                                   int idx)
{
        mod_memcg_state(memcg, idx, -1);
}

/*
 * Add one to the @idx counter for @page via mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void inc_memcg_page_state(struct page *page,
                                        int idx)
{
        mod_memcg_page_state(page, idx, 1);
}

/*
 * Subtract one from the @idx counter for @page via
 * mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void dec_memcg_page_state(struct page *page,
                                        int idx)
{
        mod_memcg_page_state(page, idx, -1);
}

/* Add one to @lruvec's @idx statistic via mod_lruvec_state(). */
static inline void inc_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx)
{
        mod_lruvec_state(lruvec, idx, 1);
}

/* Subtract one from @lruvec's @idx statistic via mod_lruvec_state(). */
static inline void dec_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx)
{
        mod_lruvec_state(lruvec, idx, -1);
}

/* Add one to the @idx statistic for @page via mod_lruvec_page_state(). */
static inline void inc_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx)
{
        mod_lruvec_page_state(page, idx, 1);
}

/* Subtract one from the @idx statistic for @page via mod_lruvec_page_state(). */
static inline void dec_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx)
{
        mod_lruvec_page_state(page, idx, -1);
}

/*
 * Return the lruvec of the parent memcg on the same node as @lruvec.
 * Returns NULL when @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);

        if (memcg) {
                struct mem_cgroup *parent = parent_mem_cgroup(memcg);

                if (parent)
                        return mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec));
        }

        return NULL;
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
                                             struct bdi_writeback *wb);

/*
 * Hand @page to the slow path when the css of its memcg differs from
 * @wb->memcg_css. Nothing is done, and @page is not inspected, when
 * the memory controller is disabled.
 */
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
                                                  struct bdi_writeback *wb)
{
        if (!mem_cgroup_disabled() &&
            unlikely(&page->mem_cgroup->css != wb->memcg_css))
                mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}

void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: there is no memcg wb_domain, always NULL. */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/* !CONFIG_CGROUP_WRITEBACK stub: none of the output pointers is written. */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: foreign dirtying is not tracked. */
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
                                                  struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: nothing to flush. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

struct sock;
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
#ifdef CONFIG_MEMCG
extern struct static_key_false memcg_sockets_enabled_key;
/* Tests the memcg_sockets_enabled_key static branch, hinted as unlikely. */
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
/*
 * Report whether @memcg is under socket memory pressure.
 *
 * Outside the default hierarchy, a set memcg->tcpmem_pressure is
 * enough. Otherwise @memcg and then each of its ancestors is checked:
 * pressure is reported as soon as one of them has a socket_pressure
 * timestamp that jiffies has not reached yet.
 */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
                return true;

        for (;;) {
                if (time_before(jiffies, memcg->socket_pressure))
                        return true;

                memcg = parent_mem_cgroup(memcg);
                if (!memcg)
                        return false;
        }
}

extern int memcg_expand_shrinker_maps(int new_id);

extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
                                   int nid, int shrinker_id);
#else
/* !CONFIG_MEMCG: socket accounting is a compile-time constant 0. */
#define mem_cgroup_sockets_enabled 0
/* !CONFIG_MEMCG stub: sockets are not associated with a memcg. */
static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
/* !CONFIG_MEMCG stub: no memcg association to release. */
static inline void mem_cgroup_sk_free(struct sock *sk) { }
/* !CONFIG_MEMCG stub: never under socket pressure. */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: there is no shrinker bit to set. */
static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
                                          int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                        unsigned int nr_pages);
void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

struct obj_cgroup *get_obj_cgroup_from_current(void);

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_kmem_enabled_key;

extern int memcg_nr_cache_ids;
void memcg_get_cache_ids(void);
void memcg_put_cache_ids(void);

/*
 * Helper macro to loop through all memcg-specific caches: @_idx runs
 * from 0 up to, but not including, memcg_nr_cache_ids. Callers must
 * still check if the cache is valid (it is either valid or NULL).
 * The slab_mutex must be held when looping through those caches.
 */
#define for_each_memcg_cache_index(_idx)        \
        for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)

/* Tests the memcg_kmem_enabled_key static branch, hinted as likely. */
static inline bool memcg_kmem_enabled(void)
{
        return static_branch_likely(&memcg_kmem_enabled_key);
}

/*
 * Charge @page through __memcg_kmem_charge_page() and return its
 * result. Returns 0 without charging when kmem accounting is not
 * enabled.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_enabled())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/*
 * Uncharge @page through __memcg_kmem_uncharge_page(). Does nothing
 * when kmem accounting is not enabled.
 */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_enabled())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * Charge @nr_pages to @memcg through __memcg_kmem_charge() and return
 * its result. Returns 0 without charging when kmem accounting is not
 * enabled.
 */
static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                                    unsigned int nr_pages)
{
        if (!memcg_kmem_enabled())
                return 0;

        return __memcg_kmem_charge(memcg, gfp, nr_pages);
}

/*
 * Uncharge @nr_pages from @memcg through __memcg_kmem_uncharge().
 * Does nothing when kmem accounting is not enabled.
 */
static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
                                       unsigned int nr_pages)
{
        if (!memcg_kmem_enabled())
                return;

        __memcg_kmem_uncharge(memcg, nr_pages);
}

/*
 * Helper for accessing a memcg's index. It will be used as an index in
 * the child cache array in kmem_cache, and also to derive its name.
 * A NULL @memcg yields -1; otherwise memcg->kmemcg_id is returned,
 * which is -1 when this is not a kmem-limited memcg.
 */
static inline int memcg_cache_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

struct mem_cgroup *mem_cgroup_from_obj(void *p);

#else

/*
 * Stub for the #else branch of CONFIG_MEMCG_KMEM: nothing is charged and
 * success (0) is always reported.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        return 0;
}

/* Stub for the #else branch of CONFIG_MEMCG_KMEM: nothing to uncharge. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/*
 * Stub for the #else branch of CONFIG_MEMCG_KMEM: keeps direct callers of the
 * double-underscore variant compiling; always reports success (0).
 */
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                           int order)
{
        return 0;
}

/* Stub for the #else branch of CONFIG_MEMCG_KMEM: nothing to uncharge. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/*
 * Stub for the #else branch of CONFIG_MEMCG_KMEM: the loop condition is a
 * constant null pointer (false), so the loop body never runs and @_idx is
 * never touched.
 */
#define for_each_memcg_cache_index(_idx)        \
        for (; NULL; )

/* Stub for the #else branch of CONFIG_MEMCG_KMEM: accounting is never on. */
static inline bool memcg_kmem_enabled(void)
{
        return false;
}

/*
 * Stub for the #else branch of CONFIG_MEMCG_KMEM: there are no cache indices,
 * so every memcg maps to -1.
 */
static inline int memcg_cache_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* Stub for the #else branch of CONFIG_MEMCG_KMEM: no-op. */
static inline void memcg_get_cache_ids(void)
{
}

/* Stub for the #else branch of CONFIG_MEMCG_KMEM: no-op. */
static inline void memcg_put_cache_ids(void)
{
}

/*
 * Stub for the #else branch of CONFIG_MEMCG_KMEM: no object is ever associated
 * with a memcg, so the lookup always yields NULL.
 */
static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
       return NULL;
}

#endif /* CONFIG_MEMCG_KMEM */

#endif /* _LINUX_MEMCONTROL_H */


























    2 




    4 




    6 




    2 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/*
 * True when @x, viewed as an unsigned long, lies within the top MAX_ERRNO
 * values of the unsigned long range, i.e. it corresponds to one of the
 * encoded error codes -MAX_ERRNO..-1.  The result is hinted as unlikely.
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/*
 * Encode an error code as a pointer value.
 * @error: The error code to carry (normally a negative errno).
 *
 * Returns @error converted to a pointer; no range checking is done here.
 */
static inline void * __must_check ERR_PTR(long error)
{
        return (void *) error;
}

/*
 * Recover the value carried by a pointer.
 * @ptr: The pointer to convert.
 *
 * Returns @ptr converted to a long.  The result is only meaningful as an
 * error code if @ptr satisfies IS_ERR(); that is not checked here.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        return (long) ptr;
}

/*
 * Test whether a pointer holds an encoded error code.
 * @ptr: The pointer to test.
 *
 * Returns true when @ptr falls in the range tested by IS_ERR_VALUE().  NULL
 * is not in that range, so it is not reported as an error.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        return IS_ERR_VALUE((unsigned long)ptr);
}

/*
 * Test whether a pointer is NULL or holds an encoded error code.
 * @ptr: The pointer to test.
 *
 * Returns true for NULL and for any value in the IS_ERR_VALUE() range.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        if (unlikely(!ptr))
                return true;

        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type in such a
 * way as to make it clear that's what's going on.
 *
 * Return: @ptr with its const qualifier removed.  The value itself is not
 * inspected or altered.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* cast away the const */
        return (void *) ptr;
}

/*
 * Turn a pointer into an int status code.
 * @ptr: The pointer to examine.
 *
 * Returns the error code carried by @ptr if IS_ERR(@ptr), otherwise 0.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

#endif

#endif /* _LINUX_ERR_H */






































    1 




    1 




    1 






















































































































    1 
















    1 


















    1 

    1 
    1 
    1 









    1 




    1 




















































    1 


    1 




































    1 
    1 
    1 

























































































































    1 
    1 





    1 

    1 

    1 







































































































































    1 



    1 


















    1 





    1 
    1 






    1 









    1 














    1 




    1 
    1 





























    1 


    1 
    1 




    1 
    1 
    1 
    1 









    1 




    1 






    1 










    1 


    1 


















    1 








    1 











    1 
    1 





















    1 
    1 

















    1 
    1 









    1 















    1 

    1 



    1 








    1 

    1 



    1 













































    1 






    1 


    1 
    1 




    1 

    1 
    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
// SPDX-License-Identifier: GPL-2.0-or-later
/* Keyring handling
 *
 * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/user_namespace.h>
#include <linux/nsproxy.h>
#include <keys/keyring-type.h>
#include <keys/user-type.h>
#include <linux/assoc_array_priv.h>
#include <linux/uaccess.h>
#include <net/net_namespace.h>
#include "internal.h"

/*
 * When plumbing the depths of the key tree, this sets a hard limit on how
 * deep we're willing to go.
 */
#define KEYRING_SEARCH_MAX_DEPTH 6

/*
 * We mark pointers we pass to the associative array with bit 1 set if
 * they're keyrings and clear otherwise.
 */
#define KEYRING_PTR_SUBTYPE        0x2UL

/*
 * Test whether a pointer handed to the associative array carries the keyring
 * marker bit (KEYRING_PTR_SUBTYPE).
 */
static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x)
{
        unsigned long bits = (unsigned long)x;

        return (bits & KEYRING_PTR_SUBTYPE) != 0;
}
/*
 * Convert an associative array leaf pointer back into the key it refers to
 * by masking off the keyring marker bit.
 */
static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x)
{
        unsigned long leaf = (unsigned long)assoc_array_ptr_to_leaf(x);

        return (struct key *)(leaf & ~KEYRING_PTR_SUBTYPE);
}
/*
 * Produce the pointer value to store in the associative array for a key:
 * keyrings get the KEYRING_PTR_SUBTYPE bit set, other keys are passed as-is.
 */
static inline void *keyring_key_to_ptr(struct key *key)
{
        unsigned long tagged = (unsigned long)key;

        if (key->type != &key_type_keyring)
                return key;

        return (void *)(tagged | KEYRING_PTR_SUBTYPE);
}

static DEFINE_RWLOCK(keyring_name_lock);

/*
 * Clean up the bits of user_namespace that belong to us.
 *
 * Unhooks the namespace's keyring_name_list head under keyring_name_lock, then
 * drops the namespace's reference on its user keyring register and, when
 * CONFIG_PERSISTENT_KEYRINGS is set, on its persistent keyring register.
 */
void key_free_user_ns(struct user_namespace *ns)
{
        write_lock(&keyring_name_lock);
        list_del_init(&ns->keyring_name_list);
        write_unlock(&keyring_name_lock);

        key_put(ns->user_keyring_register);
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
}

/*
 * The keyring key type definition.  Keyrings are simply keys of this type and
 * can be treated as ordinary keys in addition to having their own special
 * operations.
 */
static int keyring_preparse(struct key_preparsed_payload *prep);
static void keyring_free_preparse(struct key_preparsed_payload *prep);
static int keyring_instantiate(struct key *keyring,
                               struct key_preparsed_payload *prep);
static void keyring_revoke(struct key *keyring);
static void keyring_destroy(struct key *keyring);
static void keyring_describe(const struct key *keyring, struct seq_file *m);
static long keyring_read(const struct key *keyring,
                         char __user *buffer, size_t buflen);

/*
 * Key type descriptor for keyrings: the type is named "keyring", declares a
 * default data length of zero, and routes the type operations to the
 * keyring_*() handlers declared above.
 */
struct key_type key_type_keyring = {
        .name                = "keyring",
        .def_datalen        = 0,
        .preparse        = keyring_preparse,
        .free_preparse        = keyring_free_preparse,
        .instantiate        = keyring_instantiate,
        .revoke                = keyring_revoke,
        .destroy        = keyring_destroy,
        .describe        = keyring_describe,
        .read                = keyring_read,
};
EXPORT_SYMBOL(key_type_keyring);

/*
 * Mutex to serialise link/link calls to prevent two link calls in parallel
 * introducing a cycle.
 */
static DEFINE_MUTEX(keyring_serialise_link_lock);

/*
 * Publish the name of a keyring so that it can be found by name.
 *
 * Nothing is published for a keyring that has no description, an empty
 * description, or a description that begins with a dot.  Otherwise the
 * keyring is appended to the current user namespace's keyring_name_list
 * under keyring_name_lock.
 */
static void keyring_publish_name(struct key *keyring)
{
        struct user_namespace *ns = current_user_ns();
        const char *name = keyring->description;

        if (!name || !name[0] || name[0] == '.')
                return;

        write_lock(&keyring_name_lock);
        list_add_tail(&keyring->name_link, &ns->keyring_name_list);
        write_unlock(&keyring_name_lock);
}

/*
 * Preparse a keyring payload.
 *
 * A keyring takes no payload: returns -EINVAL if any data was supplied and 0
 * otherwise.
 */
static int keyring_preparse(struct key_preparsed_payload *prep)
{
        if (prep->datalen != 0)
                return -EINVAL;

        return 0;
}

/*
 * Free a preparsed keyring payload.  keyring_preparse() attaches nothing to
 * the preparse structure, so there is nothing to release here.
 */
static void keyring_free_preparse(struct key_preparsed_payload *prep)
{
}

/*
 * Initialise a keyring.
 *
 * Sets up the keyring's associative array and publishes the keyring's name.
 * @prep is not examined here; a non-empty payload is rejected with -EINVAL
 * by keyring_preparse().
 *
 * Always returns 0.
 */
static int keyring_instantiate(struct key *keyring,
                               struct key_preparsed_payload *prep)
{
        assoc_array_init(&keyring->keys);
        /* make the keyring available by name if it has one */
        keyring_publish_name(keyring);
        return 0;
}

/*
 * Multiply a 64-bit value by a 32-bit value, giving a 96-bit product, and fold
 * that back down to 64 bits.  The two 32-bit halves of @x are multiplied by @y
 * separately; the low word of the upper partial product is shifted into the
 * top half and its high word is added into the bottom.  Ideally we'd fold the
 * carry back too, but that requires inline asm.
 */
static u64 mult_64x32_and_fold(u64 x, u32 y)
{
        u32 x_hi = (u32)(x >> 32);
        u32 x_lo = (u32)x;
        u64 hi = (u64)x_hi * y;
        u64 lo = (u64)x_lo * y;
        u64 hi_low_word = (u64)(u32)hi << 32;
        u32 hi_high_word = (u32)(hi >> 32);

        return lo + hi_low_word + hi_high_word;
}

/*
 * Hash a key type and description.
 *
 * The type pointer, the description length, the domain tag pointer (narrowed
 * to a u32) and the description bytes (consumed up to four at a time) are
 * mixed into a 64-bit accumulator with mult_64x32_and_fold().  The result,
 * adjusted so that keyrings and non-keyrings land in different parts of the
 * tree, is stored in index_key->hash.
 */
static void hash_key_type_and_desc(struct keyring_index_key *index_key)
{
        const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP;
        const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK;
        const char *description = index_key->description;
        unsigned long hash, type;
        u32 piece;
        u64 acc;
        int n, desc_len = index_key->desc_len;

        /* Seed the accumulator from the type pointer and description length */
        type = (unsigned long)index_key->type;
        acc = mult_64x32_and_fold(type, desc_len + 13);
        acc = mult_64x32_and_fold(acc, 9207);
        /* Only the bits of the domain tag pointer that fit in a u32 are used */
        piece = (unsigned long)index_key->domain_tag;
        acc = mult_64x32_and_fold(acc, piece);
        acc = mult_64x32_and_fold(acc, 9207);

        /* Mix in the description, up to four bytes per round */
        for (;;) {
                n = desc_len;
                if (n <= 0)
                        break;
                if (n > 4)
                        n = 4;
                /* Zero first so that a short final piece is zero-padded */
                piece = 0;
                memcpy(&piece, description, n);
                description += n;
                desc_len -= n;
                acc = mult_64x32_and_fold(acc, piece);
                acc = mult_64x32_and_fold(acc, 9207);
        }

        /* Fold the hash down to 32 bits if need be. */
        hash = acc;
        if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32)
                hash ^= acc >> 32;

        /* Squidge all the keyrings into a separate part of the tree to
         * ordinary keys by making sure the lowest level segment in the hash is
         * zero for keyrings and non-zero otherwise.
         */
        if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0)
                hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1;
        else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0)
                hash = (hash + (hash << level_shift)) & ~fan_mask;
        index_key->hash = hash;
}

/*
 * Finalise an index key: copy the leading part of the description into the
 * index key itself, fill in the domain tag if none has been set, and compute
 * the hash.
 *
 * When no domain tag is set, key types flagged KEY_TYPE_NET_DOMAIN take the
 * key domain of the current task's network namespace; all other types share a
 * single static default tag.
 */
void key_set_index_key(struct keyring_index_key *index_key)
{
        static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), };
        size_t n = index_key->desc_len;

        /* Copy no more than the inline desc[] array can hold */
        if (n > sizeof(index_key->desc))
                n = sizeof(index_key->desc);
        memcpy(index_key->desc, index_key->description, n);

        if (!index_key->domain_tag)
                index_key->domain_tag =
                        (index_key->type->flags & KEY_TYPE_NET_DOMAIN) ?
                        current->nsproxy->net_ns->key_domain :
                        &default_domain_tag;

        hash_key_type_and_desc(index_key);
}

/**
 * key_put_tag - Release a ref on a tag.
 * @tag: The tag to release.
 *
 * This releases a reference on the given tag.  If that ref was the last one,
 * the tag is handed to kfree_rcu() and true is returned; otherwise false is
 * returned.
 */
bool key_put_tag(struct key_tag *tag)
{
        if (!refcount_dec_and_test(&tag->usage))
                return false;

        kfree_rcu(tag, rcu);
        return true;
}

/**
 * key_remove_domain - Kill off a key domain and gc its keys
 * @domain_tag: The domain tag to release.
 *
 * This marks a domain tag as being removed and releases a ref on it.  If that
 * wasn't the last reference, the garbage collector is poked to try and delete
 * all keys that were in the domain.
 */
void key_remove_domain(struct key_tag *domain_tag)
{
        domain_tag->removed = true;

        if (key_put_tag(domain_tag))
                return;

        key_schedule_gc_links();
}

/*
 * Build the next index key chunk.
 *
 * We return it one word-sized chunk at a time.  @level is divided by
 * ASSOC_ARRAY_KEY_CHUNK_SIZE to select the chunk: chunk 0 is the hash, 1 is
 * the x word, 2 is the type pointer, 3 is the domain tag pointer, and later
 * chunks are built from the description bytes that lie beyond the part held
 * in index_key->desc.  Zero is returned for those later chunks when the
 * description is no longer than sizeof(index_key->desc).
 */
static unsigned long keyring_get_key_chunk(const void *data, int level)
{
        const struct keyring_index_key *index_key = data;
        unsigned long chunk = 0;
        const u8 *d;
        int desc_len = index_key->desc_len, n = sizeof(chunk);

        level /= ASSOC_ARRAY_KEY_CHUNK_SIZE;
        switch (level) {
        case 0:
                return index_key->hash;
        case 1:
                return index_key->x;
        case 2:
                return (unsigned long)index_key->type;
        case 3:
                return (unsigned long)index_key->domain_tag;
        default:
                /* Make level count chunks of the description tail */
                level -= 4;
                if (desc_len <= sizeof(index_key->desc))
                        return 0;

                /* Skip the inline part, then the words of earlier chunks */
                d = index_key->description + sizeof(index_key->desc);
                d += level * sizeof(long);
                /*
                 * NOTE(review): desc_len is reduced by the inline part only,
                 * not by the level * sizeof(long) bytes skipped above, so for
                 * later chunks the loop below looks able to read past the end
                 * of the description - confirm against the callers.
                 */
                desc_len -= sizeof(index_key->desc);
                if (desc_len > n)
                        desc_len = n;
                /* Pack the bytes, earliest byte ending up most significant */
                do {
                        chunk <<= 8;
                        chunk |= *d++;
                } while (--desc_len > 0);
                return chunk;
        }
}

static unsigned long keyring_get_object_key_chunk(const void *object, int level)
{
        const struct key *key = keyring_ptr_to_key(object);
        return keyring_get_key_chunk(&key->index_key, level);
}

static bool keyring_compare_object(const void *object, const void *data)
{
        const struct keyring_index_key *index_key = data;
        const struct key *key = keyring_ptr_to_key(object);

        return key->index_key.type == index_key->type &&
                key->index_key.domain_tag == index_key->domain_tag &&
                key->index_key.desc_len == index_key->desc_len &&
                memcmp(key->index_key.description, index_key->description,
                       index_key->desc_len) == 0;
}

/*
 * Compare the index keys of a pair of objects and determine the bit position
 * at which they differ - if they differ.
 *
 * The index keys are compared piece by piece in the order hash, x, type,
 * domain tag and then the description bytes beyond the inline desc[] part.
 * "level" tracks, in bytes, how much has been found equal so far.
 *
 * Returns -1 if no difference is found, otherwise level * 8 plus the index
 * of the lowest set bit in the XOR of the first pair of pieces that differ.
 */
static int keyring_diff_objects(const void *object, const void *data)
{
        const struct key *key_a = keyring_ptr_to_key(object);
        const struct keyring_index_key *a = &key_a->index_key;
        const struct keyring_index_key *b = data;
        unsigned long seg_a, seg_b;
        int level, i;

        level = 0;
        seg_a = a->hash;
        seg_b = b->hash;
        if ((seg_a ^ seg_b) != 0)
                goto differ;
        level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;

        /* The number of bits contributed by the hash is controlled by a
         * constant in the assoc_array headers.  Everything else thereafter we
         * can deal with as being machine word-size dependent.
         */
        seg_a = a->x;
        seg_b = b->x;
        if ((seg_a ^ seg_b) != 0)
                goto differ;
        level += sizeof(unsigned long);

        /* The next bit may not work on big endian */
        seg_a = (unsigned long)a->type;
        seg_b = (unsigned long)b->type;
        if ((seg_a ^ seg_b) != 0)
                goto differ;
        level += sizeof(unsigned long);

        seg_a = (unsigned long)a->domain_tag;
        seg_b = (unsigned long)b->domain_tag;
        if ((seg_a ^ seg_b) != 0)
                goto differ;
        level += sizeof(unsigned long);

        /* Only the description bytes past the inline part remain to check */
        i = sizeof(a->desc);
        if (a->desc_len <= i)
                goto same;

        /*
         * NOTE(review): the loop is bounded by a->desc_len only; presumably
         * equal x words above imply that b's description is the same length -
         * confirm against the layout of struct keyring_index_key.
         */
        for (; i < a->desc_len; i++) {
                seg_a = *(unsigned char *)(a->description + i);
                seg_b = *(unsigned char *)(b->description + i);
                if ((seg_a ^ seg_b) != 0)
                        goto differ_plus_i;
        }

same:
        return -1;

differ_plus_i:
        /* Account for the description bytes that compared equal */
        level += i;
differ:
        i = level * 8 + __ffs(seg_a ^ seg_b);
        return i;
}

/*
 * Release an object held by the associative array: recover the key from the
 * (possibly keyring-flagged) pointer and drop a reference on it.
 */
static void keyring_free_object(void *object)
{
        struct key *key = keyring_ptr_to_key(object);

        key_put(key);
}

/*
 * Operations for keyring management by the index-tree routines.  These are
 * the callbacks through which the associative array extracts index key
 * chunks, compares and diffs stored keys against an index key, and releases
 * stored keys.
 */
static const struct assoc_array_ops keyring_assoc_array_ops = {
        .get_key_chunk                = keyring_get_key_chunk,
        .get_object_key_chunk        = keyring_get_object_key_chunk,
        .compare_object                = keyring_compare_object,
        .diff_objects                = keyring_diff_objects,
        .free_object                = keyring_free_object,
};

/*
 * Clean up a keyring when it is destroyed: unpublish its name if it had one,
 * release any link restriction attached to it and dispose of its contents.
 *
 * The garbage collector detects the final key_put(), removes the keyring from
 * the serial number tree and then does RCU synchronisation before coming here,
 * so we shouldn't need to worry about code poking around here with the RCU
 * readlock held by this time.
 */
static void keyring_destroy(struct key *keyring)
{
        struct key_restriction *keyres;

        if (keyring->description) {
                write_lock(&keyring_name_lock);

                /* Only unlink if the keyring was actually put on a list */
                if (keyring->name_link.next &&
                    !list_empty(&keyring->name_link))
                        list_del(&keyring->name_link);

                write_unlock(&keyring_name_lock);
        }

        keyres = keyring->restrict_link;
        if (keyres) {
                key_put(keyres->key);
                kfree(keyres);
        }

        assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops);
}

/*
 * Describe a keyring for /proc.
 *
 * Emits the keyring's description, or "[anon]" if it has none.  For a
 * positive keyring this is followed by the number of leaves in its tree, or
 * by ": empty" when there are none.
 */
static void keyring_describe(const struct key *keyring, struct seq_file *m)
{
        const char *name = keyring->description;

        seq_puts(m, name ? name : "[anon]");

        if (!key_is_positive(keyring))
                return;

        if (keyring->keys.nr_leaves_on_tree == 0)
                seq_puts(m, ": empty");
        else
                seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree);
}

/*
 * State shared between keyring_read() and keyring_read_iterator() while key
 * serial numbers are being copied out.
 */
struct keyring_read_iterator_context {
        size_t                        buflen;         /* Size of the output buffer, in bytes */
        size_t                        count;          /* Bytes emitted so far */
        key_serial_t __user        *buffer;        /* Where the next serial number goes */
};

/*
 * Per-leaf callback for keyring_read(): copy one key's serial number out to
 * the user buffer described by the context.
 *
 * @object: Leaf pointer from the keyring's associative array.
 * @data: The struct keyring_read_iterator_context for this read.
 *
 * Returns 0 after storing a serial number, 1 once the buffer has been filled,
 * or a negative error code if the store to userspace failed.
 */
static int keyring_read_iterator(const void *object, void *data)
{
        struct keyring_read_iterator_context *ctx = data;
        const struct key *key = keyring_ptr_to_key(object);
        int ret;

        kenter("{%s,%d},,{%zu/%zu}",
               key->type->name, key->serial, ctx->count, ctx->buflen);

        /* count and buflen are both in bytes; stop once the buffer is full */
        if (ctx->count >= ctx->buflen)
                return 1;

        /* ctx->buffer is a __user pointer, so it must not be dereferenced
         * directly; put_user() does the checked store and reports a fault as
         * an error instead.
         */
        ret = put_user(key->serial, ctx->buffer);
        if (ret < 0)
                return ret;
        ctx->buffer++;
        ctx->count += sizeof(key->serial);
        return 0;
}

/*
 * Read a list of key IDs from the keyring's contents in binary form.
 *
 * @buflen must be a multiple of sizeof(key_serial_t), otherwise -EINVAL is
 * returned.  If a buffer is supplied, as many key IDs as fit are copied into
 * it, and a negative result from the iteration is passed straight back.
 * Otherwise the return value is the number of bytes needed to hold every ID
 * in the keyring, which may be larger than @buflen.
 *
 * The keyring's semaphore is read-locked by the caller.  This prevents someone
 * from modifying it under us - which could cause us to read key IDs multiple
 * times.
 */
static long keyring_read(const struct key *keyring,
                         char __user *buffer, size_t buflen)
{
        struct keyring_read_iterator_context ctx;
        long needed;

        kenter("{%d},,%zu", key_serial(keyring), buflen);

        if ((buflen & (sizeof(key_serial_t) - 1)) != 0)
                return -EINVAL;

        /* Copy as many key IDs as fit into the buffer */
        if (buffer && buflen) {
                long err;

                ctx.buffer = (key_serial_t __user *)buffer;
                ctx.buflen = buflen;
                ctx.count = 0;
                err = assoc_array_iterate(&keyring->keys,
                                          keyring_read_iterator, &ctx);
                if (err < 0) {
                        kleave(" = %ld [iterate]", err);
                        return err;
                }
        }

        /* Report the size of the buffer needed */
        needed = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t);
        if (needed > buflen)
                kleave("= %ld [buffer too small]", needed);
        else
                kleave("= %ld [ok]", needed);
        return needed;
}

/*
 * Allocate a keyring and link it into the destination keyring.
 *
 * The keyring is allocated with key_alloc() and then instantiated with an
 * empty payload and linked into @dest by key_instantiate_and_link().
 *
 * Returns the new keyring, or an ERR_PTR()-encoded error if allocation or
 * instantiation failed.  On an instantiation failure the reference on the
 * new keyring is dropped before returning.
 */
struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
                          const struct cred *cred, key_perm_t perm,
                          unsigned long flags,
                          struct key_restriction *restrict_link,
                          struct key *dest)
{
        struct key *keyring;
        int ret;

        keyring = key_alloc(&key_type_keyring, description,
                            uid, gid, cred, perm, flags, restrict_link);
        if (IS_ERR(keyring))
                return keyring;

        ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
        if (ret < 0) {
                key_put(keyring);
                return ERR_PTR(ret);
        }

        return keyring;
}
EXPORT_SYMBOL(keyring_alloc);

/**
 * restrict_link_reject - Give -EPERM to restrict link
 * @keyring: The keyring being added to.
 * @type: The type of key being added.
 * @payload: The payload of the key intended to be added.
 * @restriction_key: Keys providing additional data for evaluating restriction.
 *
 * Reject the addition of any links to a keyring.  It can be overridden by
 * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when
 * adding a key to a keyring.
 *
 * This is meant to be stored in a key_restriction structure which is passed
 * in the restrict_link parameter to keyring_alloc().
 *
 * Return: Always -EPERM; none of the arguments are examined.
 */
int restrict_link_reject(struct key *keyring,
                         const struct key_type *type,
                         const union key_payload *payload,
                         struct key *restriction_key)
{
        return -EPERM;
}

/*
 * Default match function: a key matches when its description is exactly equal
 * to the string supplied as the raw match data.
 */
bool key_default_cmp(const struct key *key,
                     const struct key_match_data *match_data)
{
        const char *wanted = match_data->raw_data;

        return !strcmp(key->description, wanted);
}

/*
 * Iteration function to consider each key found.
 */
static int keyring_search_iterator(const void *object, void *iterator_data)
{
        struct keyring_search_context *ctx = iterator_data;
        const struct key *key = keyring_ptr_to_key(object);
        unsigned long kflags = READ_ONCE(key->flags);
        short state = READ_ONCE(key->state);

        kenter("{%d}", key->serial);

        /* ignore keys not of this type */
        if (key->type != ctx->index_key.type) {
                kleave(" = 0 [!type]");
                return 0;
        }

        /* skip invalidated, revoked and expired keys */
        if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) {
                time64_t expiry = READ_ONCE(key->expiry);

                if (kflags & ((1 << KEY_FLAG_INVALIDATED) |
                              (1 << KEY_FLAG_REVOKED))) {
                        ctx->result = ERR_PTR(-EKEYREVOKED);
                        kleave(" = %d [invrev]", ctx->skipped_ret);
                        goto skipped;
                }

                if (expiry && ctx->now >= expiry) {
                        if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED))
                                ctx->result = ERR_PTR(-EKEYEXPIRED);
                        kleave(" = %d [expire]", ctx->skipped_ret);
                        goto skipped;
                }
        }

        /* keys that don't match */
        if (!ctx->match_data.cmp(key, &ctx->match_data)) {
                kleave(" = 0 [!match]");
                return 0;
        }

        /* key must have search permissions */
        if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) &&
            key_task_permission(make_key_ref(key, ctx->possessed),
                                ctx->cred, KEY_NEED_SEARCH) < 0) {
                ctx->result = ERR_PTR(-EACCES);
                kleave(" = %d [!perm]", ctx->skipped_ret);
                goto skipped;
        }

        if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) {
                /* we set a different error code if we pass a negative key */
                if (state < 0) {
                        ctx->result = ERR_PTR(state);
                        kleave(" = %d [neg]", ctx->skipped_ret);
                        goto skipped;
                }
        }

        /* Found */
        ctx->result = make_key_ref(key, ctx->possessed);
        kleave(" = 1 [found]");
        return 1;

skipped:
        return ctx->skipped_ret;
}

/*
 * Look for a key within a single keyring.  Depending on the lookup type in
 * the match data, either go straight to the entry selected by the index key
 * or run the iterator over every entry in the keyring's array.
 */
static int search_keyring(struct key *keyring, struct keyring_search_context *ctx)
{
        const void *object;

        /* Anything other than a direct lookup walks the whole array. */
        if (ctx->match_data.lookup_type != KEYRING_SEARCH_LOOKUP_DIRECT)
                return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx);

        object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops,
                                  &ctx->index_key);
        if (!object)
                return 0;

        return ctx->iterator(object, ctx);
}

/*
 * Search a tree of keyrings that point to other keyrings up to the maximum
 * depth.
 *
 * The walk is iterative: the position in each parent keyring (keyring, node
 * and slot) is saved on a local stack of KEYRING_SEARCH_MAX_DEPTH entries
 * before descending into a nested keyring, and restored when that keyring has
 * been exhausted.
 *
 * Returns true if ctx->iterator reported a hit (the "found" path), in which
 * case ctx->result is consulted for the key.  Returns false otherwise; if the
 * nesting is too deep and KEYRING_SEARCH_DETECT_TOO_DEEP is set, ctx->result
 * is set to ERR_PTR(-ELOOP) first.
 */
static bool search_nested_keyrings(struct key *keyring,
                                   struct keyring_search_context *ctx)
{
        /* Saved positions in the keyrings we have descended through */
        struct {
                struct key *keyring;
                struct assoc_array_node *node;
                int slot;
        } stack[KEYRING_SEARCH_MAX_DEPTH];

        struct assoc_array_shortcut *shortcut;
        struct assoc_array_node *node;
        struct assoc_array_ptr *ptr;
        struct key *key;
        int sp = 0, slot;

        kenter("{%d},{%s,%s}",
               keyring->serial,
               ctx->index_key.type->name,
               ctx->index_key.description);

        /* Exactly one of the two state-check flags must be set */
#define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK)
        BUG_ON((ctx->flags & STATE_CHECKS) == 0 ||
               (ctx->flags & STATE_CHECKS) == STATE_CHECKS);

        if (ctx->index_key.description)
                key_set_index_key(&ctx->index_key);

        /* Check to see if this top-level keyring is what we are looking for
         * and whether it is valid or not.
         */
        if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE ||
            keyring_compare_object(keyring, &ctx->index_key)) {
                /* Have the iterator return 2 for a skipped key so that a
                 * skipped top-level keyring can be told apart from a
                 * non-matching one and end the search here.
                 */
                ctx->skipped_ret = 2;
                switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) {
                case 1:
                        goto found;
                case 2:
                        return false;
                default:
                        break;
                }
        }

        /* From here on, a skipped key just looks like a non-match */
        ctx->skipped_ret = 0;

        /* Start processing a new keyring */
descend_to_keyring:
        kdebug("descend to %d", keyring->serial);
        if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
                              (1 << KEY_FLAG_REVOKED)))
                goto not_this_keyring;

        /* Search through the keys in this keyring before searching its
         * subtrees.
         */
        if (search_keyring(keyring, ctx))
                goto found;

        /* Then manually iterate through the keyrings nested in this one.
         *
         * Start from the root node of the index tree.  Because of the way the
         * hash function has been set up, keyrings cluster on the leftmost
         * branch of the root node (root slot 0) or in the root node itself.
         * Non-keyrings avoid the leftmost branch of the root entirely (root
         * slots 1-15).
         */
        if (!(ctx->flags & KEYRING_SEARCH_RECURSE))
                goto not_this_keyring;

        ptr = READ_ONCE(keyring->keys.root);
        if (!ptr)
                goto not_this_keyring;

        if (assoc_array_ptr_is_shortcut(ptr)) {
                /* If the root is a shortcut, either the keyring only contains
                 * keyring pointers (everything clusters behind root slot 0) or
                 * doesn't contain any keyring pointers.
                 */
                shortcut = assoc_array_ptr_to_shortcut(ptr);
                if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0)
                        goto not_this_keyring;

                ptr = READ_ONCE(shortcut->next_node);
                node = assoc_array_ptr_to_node(ptr);
                goto begin_node;
        }

        node = assoc_array_ptr_to_node(ptr);
        ptr = node->slots[0];
        if (!assoc_array_ptr_is_meta(ptr))
                goto begin_node;

descend_to_node:
        /* Descend to a more distal node in this keyring's content tree and go
         * through that.
         */
        kdebug("descend");
        if (assoc_array_ptr_is_shortcut(ptr)) {
                shortcut = assoc_array_ptr_to_shortcut(ptr);
                ptr = READ_ONCE(shortcut->next_node);
                BUG_ON(!assoc_array_ptr_is_node(ptr));
        }
        node = assoc_array_ptr_to_node(ptr);

begin_node:
        kdebug("begin_node");
        slot = 0;
ascend_to_node:
        /* Go through the slots in a node */
        for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
                ptr = READ_ONCE(node->slots[slot]);

                /* Metadata pointers are followed when we are below the root
                 * node or when the pointer is a shortcut.
                 */
                if (assoc_array_ptr_is_meta(ptr)) {
                        if (node->back_pointer ||
                            assoc_array_ptr_is_shortcut(ptr))
                                goto descend_to_node;
                }

                if (!keyring_ptr_is_keyring(ptr))
                        continue;

                key = keyring_ptr_to_key(ptr);

                /* No room left on the stack to descend any further */
                if (sp >= KEYRING_SEARCH_MAX_DEPTH) {
                        if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) {
                                ctx->result = ERR_PTR(-ELOOP);
                                return false;
                        }
                        goto not_this_keyring;
                }

                /* Search a nested keyring */
                if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) &&
                    key_task_permission(make_key_ref(key, ctx->possessed),
                                        ctx->cred, KEY_NEED_SEARCH) < 0)
                        continue;

                /* stack the current position */
                stack[sp].keyring = keyring;
                stack[sp].node = node;
                stack[sp].slot = slot;
                sp++;

                /* begin again with the new keyring */
                keyring = key;
                goto descend_to_keyring;
        }

        /* We've dealt with all the slots in the current node, so now we need
         * to ascend to the parent and continue processing there.
         */
        ptr = READ_ONCE(node->back_pointer);
        slot = node->parent_slot;

        if (ptr && assoc_array_ptr_is_shortcut(ptr)) {
                shortcut = assoc_array_ptr_to_shortcut(ptr);
                ptr = READ_ONCE(shortcut->back_pointer);
                slot = shortcut->parent_slot;
        }
        if (!ptr)
                goto not_this_keyring;
        node = assoc_array_ptr_to_node(ptr);
        slot++;

        /* If we've ascended to the root (zero backpointer), we must have just
         * finished processing the leftmost branch rather than the root slots -
         * so there can't be any more keyrings for us to find.
         */
        if (node->back_pointer) {
                kdebug("ascend %d", slot);
                goto ascend_to_node;
        }

        /* The keyring we're looking at was disqualified or didn't contain a
         * matching key.
         */
not_this_keyring:
        kdebug("not_this_keyring %d", sp);
        if (sp <= 0) {
                kleave(" = false");
                return false;
        }

        /* Resume the processing of a keyring higher up in the tree */
        sp--;
        keyring = stack[sp].keyring;
        node = stack[sp].node;
        slot = stack[sp].slot + 1;
        kdebug("ascend to %d [%d]", keyring->serial, slot);
        goto ascend_to_node;

        /* We found a viable match */
found:
        key = key_ref_to_ptr(ctx->result);
        key_check(key);
        if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) {
                /* Stamp the key, the keyring it was found in and every
                 * keyring still on the stack with the search time.
                 */
                key->last_used_at = ctx->now;
                keyring->last_used_at = ctx->now;
                while (sp > 0)
                        stack[--sp].keyring->last_used_at = ctx->now;
        }
        kleave(" = true");
        return true;
}

/**
 * keyring_search_rcu - Search a keyring tree for a matching key under RCU
 * @keyring_ref: A pointer to the keyring with possession indicator.
 * @ctx: The keyring search context.
 *
 * Search the supplied keyring tree for a key that matches the criteria given.
 * The root keyring and any linked keyrings must grant Search permission to the
 * caller to be searchable and keys can only be found if they too grant Search
 * to the caller. The possession flag on the root keyring pointer controls use
 * of the possessor bits in permissions checking of the entire tree.  In
 * addition, the LSM gets to forbid keyring searches and key matches.
 *
 * The search is performed as a breadth-then-depth search up to the prescribed
 * limit (KEYRING_SEARCH_MAX_DEPTH).  The caller must hold the RCU read lock to
 * prevent keyrings from being destroyed or rearranged whilst they are being
 * searched.
 *
 * Keys are matched to the type provided and are then filtered by the match
 * function, which is given the description to use in any way it sees fit.  The
 * match function may use any attributes of a key that it wishes to to
 * determine the match.  Normally the match function from the key type would be
 * used.
 *
 * RCU can be used to prevent the keyring key lists from disappearing without
 * the need to take lots of locks.
 *
 * Returns a pointer to the found key and increments the key usage count if
 * successful; -EAGAIN if no matching keys were found, or if expired or revoked
 * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
 * specified keyring wasn't a keyring.
 *
 * In the case of a successful return, the possession attribute from
 * @keyring_ref is propagated to the returned key reference.
 */
key_ref_t keyring_search_rcu(key_ref_t keyring_ref,
                             struct keyring_search_context *ctx)
{
        struct key *keyring = key_ref_to_ptr(keyring_ref);
        long err;

        /* Prime the context; -EAGAIN stands unless the search changes it. */
        ctx->result = ERR_PTR(-EAGAIN);
        ctx->possessed = is_key_possessed(keyring_ref);
        ctx->iterator = keyring_search_iterator;

        key_check(keyring);

        if (keyring->type != &key_type_keyring)
                return ERR_PTR(-ENOTDIR);

        /* The root keyring must grant Search unless checks are disabled. */
        if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) {
                err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH);
                if (err < 0)
                        return ERR_PTR(err);
        }

        ctx->now = ktime_get_real_seconds();
        if (!search_nested_keyrings(keyring, ctx))
                return ctx->result;

        /* Take a reference on the key being handed back to the caller. */
        __key_get(key_ref_to_ptr(ctx->result));
        return ctx->result;
}

/**
 * keyring_search - Search the supplied keyring tree for a matching key
 * @keyring: The root of the keyring tree to be searched.
 * @type: The type of key we want to find.
 * @description: The description of the key we want to find.
 * @recurse: True to search the children of @keyring also
 *
 * As keyring_search_rcu() above, but using the current task's credentials,
 * exact description matching by default and a direct lookup.  The type's
 * match_preparse() and match_free() hooks are invoked around the search if
 * the type provides them.
 */
key_ref_t keyring_search(key_ref_t keyring,
                         struct key_type *type,
                         const char *description,
                         bool recurse)
{
        struct keyring_search_context ctx = {
                .index_key.type = type,
                .index_key.description = description,
                .index_key.desc_len = strlen(description),
                .cred = current_cred(),
                .match_data.cmp = key_default_cmp,
                .match_data.raw_data = description,
                .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT,
                .flags = KEYRING_SEARCH_DO_STATE_CHECK |
                         (recurse ? KEYRING_SEARCH_RECURSE : 0),
        };
        key_ref_t found;
        int err;

        /* Give the key type the chance to adjust the match parameters. */
        if (type->match_preparse) {
                err = type->match_preparse(&ctx.match_data);
                if (err < 0)
                        return ERR_PTR(err);
        }

        rcu_read_lock();
        found = keyring_search_rcu(keyring, &ctx);
        rcu_read_unlock();

        if (type->match_free)
                type->match_free(&ctx.match_data);

        return found;
}
EXPORT_SYMBOL(keyring_search);

/*
 * Allocate a zeroed key_restriction and install @check as its checker.
 * Returns the new restriction or ERR_PTR(-ENOMEM) if allocation fails.
 */
static struct key_restriction *keyring_restriction_alloc(
        key_restrict_link_func_t check)
{
        struct key_restriction *keyres;

        keyres = kzalloc(sizeof(*keyres), GFP_KERNEL);
        if (keyres) {
                keyres->check = check;
                return keyres;
        }

        return ERR_PTR(-ENOMEM);
}

/*
 * Semaphore to serialise restriction setup to prevent reference count
 * cycles through restriction key pointers.  keyring_restrict() takes it for
 * writing around its cycle check and the assignment of the new restriction.
 */
static DECLARE_RWSEM(keyring_serialise_restrict_sem);

/*
 * Check for restriction cycles that would prevent keyring garbage collection.
 * keyring_serialise_restrict_sem must be held.
 */
static bool keyring_detect_restriction_cycle(const struct key *dest_keyring,
                                             struct key_restriction *keyres)
{
        while (keyres && keyres->key &&
               keyres->key->type == &key_type_keyring) {
                if (keyres->key == dest_keyring)
                        return true;

                keyres = keyres->key->restrict_link;
        }

        return false;
}

/**
 * keyring_restrict - Look up and apply a restriction to a keyring
 * @keyring_ref: The keyring to be restricted
 * @type: The key type that will provide the restriction checker.
 * @restriction: The restriction options to apply to the keyring
 *
 * Look up a keyring and apply a restriction to it.  The restriction is managed
 * by the specific key type, but can be configured by the options specified in
 * the restriction string.
 */
int keyring_restrict(key_ref_t keyring_ref, const char *type,
                     const char *restriction)
{
        struct key *keyring = key_ref_to_ptr(keyring_ref);
        struct key_type *restrict_type = NULL;
        struct key_restriction *restrict_link;
        int ret = 0;

        key_check(keyring);

        if (keyring->type != &key_type_keyring)
                return -ENOTDIR;

        if (type) {
                /* Ask the named key type to build the restriction. */
                restrict_type = key_type_lookup(type);
                if (IS_ERR(restrict_type))
                        return PTR_ERR(restrict_type);

                if (!restrict_type->lookup_restriction) {
                        ret = -ENOENT;
                        goto put_type;
                }

                restrict_link = restrict_type->lookup_restriction(restriction);
        } else {
                /* No type given: install a restriction rejecting all links. */
                restrict_link = keyring_restriction_alloc(restrict_link_reject);
        }

        if (IS_ERR(restrict_link)) {
                ret = PTR_ERR(restrict_link);
                goto put_type;
        }

        down_write(&keyring->sem);
        down_write(&keyring_serialise_restrict_sem);

        if (keyring->restrict_link) {
                /* A restriction, once set, is not replaced. */
                ret = -EEXIST;
        } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) {
                ret = -EDEADLK;
        } else {
                keyring->restrict_link = restrict_link;
                notify_key(keyring, NOTIFY_KEY_SETATTR, 0);
        }

        up_write(&keyring_serialise_restrict_sem);
        up_write(&keyring->sem);

        /* The restriction was not installed, so dispose of it. */
        if (ret < 0) {
                key_put(restrict_link->key);
                kfree(restrict_link);
        }

put_type:
        if (restrict_type)
                key_type_put(restrict_type);

        return ret;
}
EXPORT_SYMBOL(keyring_restrict);

/*
 * Search the given keyring for a key that might be updated.
 *
 * The caller must guarantee that the keyring is a keyring and that the
 * permission is granted to modify the keyring as no check is made here.  The
 * caller must also hold a lock on the keyring semaphore.
 *
 * Returns a pointer to the found key with usage count incremented if
 * successful and returns NULL if not found.  Revoked and invalidated keys are
 * skipped over.
 *
 * If successful, the possession indicator is propagated from the keyring ref
 * to the returned key reference.
 */
key_ref_t find_key_to_update(key_ref_t keyring_ref,
                             const struct keyring_index_key *index_key)
{
        struct key *keyring = key_ref_to_ptr(keyring_ref);
        struct key *key;
        const void *object;

        kenter("{%d},{%s,%s}",
               keyring->serial, index_key->type->name, index_key->description);

        object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops,
                                  index_key);
        if (!object) {
                kleave(" = NULL");
                return NULL;
        }

        /* An invalidated or revoked key is not a candidate for update. */
        key = keyring_ptr_to_key(object);
        if (key->flags & ((1 << KEY_FLAG_INVALIDATED) |
                          (1 << KEY_FLAG_REVOKED))) {
                kleave(" = NULL [x]");
                return NULL;
        }

        /* Hand back a counted reference carrying the keyring's possession. */
        __key_get(key);
        kleave(" = {%d}", key->serial);
        return make_key_ref(key, is_key_possessed(keyring_ref));
}

/*
 * Find a keyring with the specified name.
 *
 * Only keyrings that have nonzero refcount, are not revoked, and are owned by a
 * user in the current user namespace are considered.  If @uid_keyring is %true,
 * the keyring additionally must have been allocated as a user or user session
 * keyring; otherwise, it must grant Search permission directly to the caller.
 *
 * Returns a pointer to the keyring with the keyring's refcount having been
 * incremented on success.  -ENOKEY is returned if a key could not be found.
 */
struct key *find_keyring_by_name(const char *name, bool uid_keyring)
{
        struct user_namespace *ns = current_user_ns();
        struct key *keyring;

        if (!name)
                return ERR_PTR(-EINVAL);

        read_lock(&keyring_name_lock);

        /* Search this hash bucket for a keyring with a matching name that
         * grants Search permission and that hasn't been revoked
         */
        list_for_each_entry(keyring, &ns->keyring_name_list, name_link) {
                if (!kuid_has_mapping(ns, keyring->user->uid))
                        continue;

                if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
                        continue;

                if (strcmp(keyring->description, name) != 0)
                        continue;

                if (uid_keyring) {
                        if (!test_bit(KEY_FLAG_UID_KEYRING,
                                      &keyring->flags))
                                continue;
                } else {
                        if (key_permission(make_key_ref(keyring, 0),
                                           KEY_NEED_SEARCH) < 0)
                                continue;
                }

                /* we've got a match but we might end up racing with
                 * key_cleanup() if the keyring is currently 'dead'
                 * (ie. it has a zero usage count) */
                if (!refcount_inc_not_zero(&keyring->usage))
                        continue;
                keyring->last_used_at = ktime_get_real_seconds();
                goto out;
        }

        keyring = ERR_PTR(-ENOKEY);
out:
        read_unlock(&keyring_name_lock);
        return keyring;
}

/*
 * Iterator used by cycle detection: report a hit (1, with ctx->result set to
 * -EDEADLK) only if the object is the very keyring passed in the raw match
 * data; otherwise return 0.
 */
static int keyring_detect_cycle_iterator(const void *object,
                                         void *iterator_data)
{
        struct keyring_search_context *ctx = iterator_data;
        const struct key *candidate = keyring_ptr_to_key(object);

        kenter("{%d}", candidate->serial);

        /* A keyring with a matching index-key may still be a different
         * keyring, so compare by pointer identity.
         */
        if (candidate == ctx->match_data.raw_data) {
                ctx->result = ERR_PTR(-EDEADLK);
                return 1;
        }

        return 0;
}

/*
 * See if a cycle will be created by inserting acyclic tree B in acyclic
 * tree A at the topmost level (ie: as a direct child of A).
 *
 * Since we are adding B to A at the top level, checking for cycles should just
 * be a matter of seeing if node A is somewhere in tree B.
 *
 * Returns 0 if the search result decodes to 0 or -EAGAIN; any other error
 * left in the search result is returned as-is.
 */
static int keyring_detect_cycle(struct key *A, struct key *B)
{
        struct keyring_search_context ctx = {
                .index_key = A->index_key,
                .match_data.raw_data = A,
                .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT,
                .iterator = keyring_detect_cycle_iterator,
                .flags = (KEYRING_SEARCH_NO_STATE_CHECK |
                          KEYRING_SEARCH_NO_UPDATE_TIME |
                          KEYRING_SEARCH_NO_CHECK_PERM |
                          KEYRING_SEARCH_DETECT_TOO_DEEP |
                          KEYRING_SEARCH_RECURSE),
        };
        long outcome;

        rcu_read_lock();
        search_nested_keyrings(B, &ctx);
        rcu_read_unlock();

        outcome = PTR_ERR(ctx.result);
        if (outcome == -EAGAIN)
                return 0;

        return outcome;
}

/*
 * Take the locks needed to link a key into @keyring.
 *
 * Returns -ENOTDIR without locking anything if @keyring is not a keyring,
 * otherwise 0 with the keyring's semaphore write-locked and, when the key to
 * be linked is itself a keyring, the link serialisation mutex held.
 */
int __key_link_lock(struct key *keyring,
                    const struct keyring_index_key *index_key)
        __acquires(&keyring->sem)
        __acquires(&keyring_serialise_link_lock)
{
        if (keyring->type == &key_type_keyring) {
                down_write(&keyring->sem);

                /* Serialise link/link calls to prevent parallel calls
                 * causing a cycle when linking two keyrings in opposite
                 * orders.
                 */
                if (index_key->type == &key_type_keyring)
                        mutex_lock(&keyring_serialise_link_lock);

                return 0;
        }

        return -ENOTDIR;
}

/*
 * Lock keyrings for move (link/unlink combination).
 *
 * Returns -ENOTDIR without locking anything if either argument is not a
 * keyring, otherwise 0 with both keyring semaphores write-locked and, when
 * the key being moved is itself a keyring, the link serialisation mutex held.
 */
int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
                    const struct keyring_index_key *index_key)
        __acquires(&l_keyring->sem)
        __acquires(&u_keyring->sem)
        __acquires(&keyring_serialise_link_lock)
{
        struct key *first, *second;

        if (l_keyring->type != &key_type_keyring ||
            u_keyring->type != &key_type_keyring)
                return -ENOTDIR;

        /* Always take the keyring at the lower address first so that two
         * concurrent moves cannot deadlock against each other.
         */
        if (l_keyring < u_keyring) {
                first = l_keyring;
                second = u_keyring;
        } else {
                first = u_keyring;
                second = l_keyring;
        }
        down_write(&first->sem);
        down_write_nested(&second->sem, 1);

        /* Serialise link/link calls to prevent parallel calls causing a cycle
         * when linking two keyrings in opposite orders.
         */
        if (index_key->type == &key_type_keyring)
                mutex_lock(&keyring_serialise_link_lock);

        return 0;
}

/*
 * Preallocate memory so that a key can be linked into a keyring.
 *
 * On success 0 is returned and *_edit holds the pending edit script; on
 * failure a negative error is returned and *_edit is left NULL.
 */
int __key_link_begin(struct key *keyring,
                     const struct keyring_index_key *index_key,
                     struct assoc_array_edit **_edit)
{
        struct assoc_array_edit *edit;
        int ret;

        kenter("%d,%s,%s,",
               keyring->serial, index_key->type->name, index_key->description);

        BUG_ON(index_key->desc_len == 0);
        BUG_ON(*_edit != NULL);

        *_edit = NULL;

        if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) {
                ret = -EKEYREVOKED;
                goto error;
        }

        /* Build the edit script that will insert/replace the key in the
         * keyring tree.
         */
        edit = assoc_array_insert(&keyring->keys,
                                  &keyring_assoc_array_ops,
                                  index_key,
                                  NULL);
        if (IS_ERR(edit)) {
                ret = PTR_ERR(edit);
                goto error;
        }

        /* Unless a link is being replaced in-place, some extra quota has to
         * be reserved for the new link.
         */
        if (!edit->dead_leaf) {
                ret = key_payload_reserve(keyring,
                                          keyring->datalen + KEYQUOTA_LINK_BYTES);
                if (ret < 0) {
                        assoc_array_cancel_edit(edit);
                        goto error;
                }
        }

        *_edit = edit;
        kleave(" = 0");
        return 0;

error:
        kleave(" = %d", ret);
        return ret;
}

/*
 * Check already instantiated keys aren't going to be a problem.
 *
 * The caller must have called __key_link_begin(). Don't need to call this for
 * keys that were created since __key_link_begin() was called.
 */
int __key_link_check_live_key(struct key *keyring, struct key *key)
{
        /* Only the linking of one keyring to another is checked for cycles. */
        if (key->type != &key_type_keyring)
                return 0;

        return keyring_detect_cycle(keyring, key);
}

/*
 * Link a key into a keyring.
 *
 * Must be called with __key_link_begin() having been called.  Discards any
 * already extant link to matching key if there is one, so that each keyring
 * holds at most one link to any given key of a particular type+description
 * combination.
 */
void __key_link(struct key *keyring, struct key *key,
                struct assoc_array_edit **_edit)
{
        struct assoc_array_edit *edit;

        /* The keyring's link holds its own reference on the key. */
        __key_get(key);

        edit = *_edit;
        assoc_array_insert_set_object(edit, keyring_key_to_ptr(key));
        assoc_array_apply_edit(edit);

        /* The edit has been consumed; make sure the caller can't reuse it. */
        *_edit = NULL;
        notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key));
}

/*
 * Finish linking a key into a keyring.
 *
 * Must be called with __key_link_begin() having been called.  Any edit still
 * outstanding is cancelled and the locks taken for the link are released.
 */
void __key_link_end(struct key *keyring,
                    const struct keyring_index_key *index_key,
                    struct assoc_array_edit *edit)
        __releases(&keyring->sem)
        __releases(&keyring_serialise_link_lock)
{
        BUG_ON(index_key->type == NULL);
        kenter("%d,%s,", keyring->serial, index_key->type->name);

        /* Give back the quota reserved for a link that was never made. */
        if (edit && !edit->dead_leaf)
                key_payload_reserve(keyring,
                                    keyring->datalen - KEYQUOTA_LINK_BYTES);
        if (edit)
                assoc_array_cancel_edit(edit);

        up_write(&keyring->sem);

        if (index_key->type == &key_type_keyring)
                mutex_unlock(&keyring_serialise_link_lock);
}

/*
 * Check addition of keys to restricted keyrings.
 */
static int __key_link_check_restriction(struct key *keyring, struct key *key)
{
        if (!keyring->restrict_link || !keyring->restrict_link->check)
                return 0;
        return keyring->restrict_link->check(keyring, key->type, &key->payload,
                                             keyring->restrict_link->key);
}

/**
 * key_link - Link a key to a keyring
 * @keyring: The keyring to make the link in.
 * @key: The key to link to.
 *
 * Make a link in a keyring to a key, such that the keyring holds a reference
 * on that key and the key can potentially be found by searching that keyring.
 *
 * This function will write-lock the keyring's semaphore and will consume some
 * of the user's key data quota to hold the link.
 *
 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring,
 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is
 * full, -EDQUOT if there is insufficient key data quota remaining to add
 * another link or -ENOMEM if there's insufficient memory.
 *
 * It is assumed that the caller has checked that it is permitted for a link to
 * be made (the keyring should have Write permission and the key Link
 * permission).
 */
int key_link(struct key *keyring, struct key *key)
{
        struct assoc_array_edit *edit = NULL;
        int ret;

        kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage));

        key_check(keyring);
        key_check(key);

        ret = __key_link_lock(keyring, &key->index_key);
        if (ret < 0)
                goto out;

        ret = __key_link_begin(keyring, &key->index_key, &edit);
        if (ret >= 0) {
                kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage));

                /* Each check runs only if everything before it passed. */
                ret = __key_link_check_restriction(keyring, key);
                if (ret == 0)
                        ret = __key_link_check_live_key(keyring, key);
                if (ret == 0)
                        __key_link(keyring, key, &edit);
        }

        /* Unlock, cancelling the edit if it is still outstanding. */
        __key_link_end(keyring, &key->index_key, edit);
out:
        kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage));
        return ret;
}
EXPORT_SYMBOL(key_link);

/*
 * Lock a keyring for unlink.  Returns -ENOTDIR without locking if @keyring
 * is not a keyring, otherwise 0 with the keyring's semaphore write-locked.
 */
static int __key_unlink_lock(struct key *keyring)
        __acquires(&keyring->sem)
{
        if (keyring->type == &key_type_keyring) {
                down_write(&keyring->sem);
                return 0;
        }

        return -ENOTDIR;
}

/*
 * Begin the process of unlinking a key from a keyring.
 *
 * On success 0 is returned and *_edit holds the pending deletion.  Returns
 * -ENOENT if the deletion produced no edit script, or the error encoded in
 * the edit pointer.
 */
static int __key_unlink_begin(struct key *keyring, struct key *key,
                              struct assoc_array_edit **_edit)
{
        struct assoc_array_edit *pending;

        BUG_ON(*_edit != NULL);

        pending = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops,
                                     &key->index_key);

        /* A NULL edit means there was nothing to delete. */
        if (!pending)
                return -ENOENT;

        if (IS_ERR(pending))
                return PTR_ERR(pending);

        *_edit = pending;
        return 0;
}

/*
 * Commit a prepared unlink edit.
 *
 * Applies the edit, posts a NOTIFY_KEY_UNLINKED notification carrying the
 * key's serial number, clears the caller's edit pointer so it is not
 * cancelled later, and shrinks the keyring's reserved payload by
 * KEYQUOTA_LINK_BYTES.
 */
static void __key_unlink(struct key *keyring, struct key *key,
                         struct assoc_array_edit **_edit)
{
        struct assoc_array_edit *prepared = *_edit;

        assoc_array_apply_edit(prepared);
        notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key));

        /* The edit has been consumed; make sure nobody cancels it. */
        *_edit = NULL;

        key_payload_reserve(keyring,
                            keyring->datalen - KEYQUOTA_LINK_BYTES);
}

/*
 * Finish unlinking a key from a keyring.
 *
 * Cancels the edit if one is still outstanding (i.e. it was never applied)
 * and then releases the keyring's write lock.
 */
static void __key_unlink_end(struct key *keyring,
                             struct key *key,
                             struct assoc_array_edit *edit)
        __releases(&keyring->sem)
{
        if (edit != NULL)
                assoc_array_cancel_edit(edit);

        up_write(&keyring->sem);
}

/**
 * key_unlink - Unlink the first link to a key from a keyring.
 * @keyring: The keyring to remove the link from.
 * @key: The key the link is to.
 *
 * Remove a link from a keyring to a key.
 *
 * The keyring's semaphore is write-locked for the duration of the operation.
 *
 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if
 * the key isn't linked to by the keyring or -ENOMEM if there's insufficient
 * memory.
 *
 * The caller is assumed to have checked that removing the link is permitted
 * (the keyring should have Write permission; no permissions are required on
 * the key).
 */
int key_unlink(struct key *keyring, struct key *key)
{
        struct assoc_array_edit *edit = NULL;
        int ret;

        key_check(keyring);
        key_check(key);

        ret = __key_unlink_lock(keyring);
        if (ret >= 0) {
                /* Only apply the edit if it could be prepared. */
                ret = __key_unlink_begin(keyring, key, &edit);
                if (!ret)
                        __key_unlink(keyring, key, &edit);

                /* Cancels any unapplied edit and drops the lock. */
                __key_unlink_end(keyring, key, edit);
        }

        return ret;
}
EXPORT_SYMBOL(key_unlink);

/**
 * key_move - Move a key from one keyring to another
 * @key: The key to move
 * @from_keyring: The keyring to remove the link from.
 * @to_keyring: The keyring to make the link in.
 * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL.
 *
 * Make a link in @to_keyring to a key, such that the keyring holds a reference
 * on that key and the key can potentially be found by searching that keyring
 * whilst simultaneously removing a link to the key from @from_keyring.
 *
 * If @from_keyring and @to_keyring are the same keyring, nothing is changed
 * and 0 is returned.
 *
 * This function will write-lock both keyring's semaphores and will consume
 * some of the user's key data quota to hold the link on @to_keyring.
 *
 * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring,
 * -EKEYREVOKED if either keyring has been revoked, -ENOENT if @key isn't
 * linked to by @from_keyring, -ENFILE if the second keyring is full, -EDQUOT
 * if there is insufficient key data quota remaining to add another link or
 * -ENOMEM if there's insufficient memory.  If KEYCTL_MOVE_EXCL is set, then
 * -EEXIST will be returned if there's already a matching key in @to_keyring.
 *
 * It is assumed that the caller has checked that it is permitted for a link to
 * be made (the keyring should have Write permission and the key Link
 * permission).
 */
int key_move(struct key *key,
             struct key *from_keyring,
             struct key *to_keyring,
             unsigned int flags)
{
        struct assoc_array_edit *from_edit = NULL, *to_edit = NULL;
        int ret;

        kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial);

        /*
         * Moving within one keyring is a no-op.  Leave through "out" so that
         * the kenter() above is balanced by a kleave() like every other path.
         */
        if (from_keyring == to_keyring) {
                ret = 0;
                goto out;
        }

        key_check(key);
        key_check(from_keyring);
        key_check(to_keyring);

        ret = __key_move_lock(from_keyring, to_keyring, &key->index_key);
        if (ret < 0)
                goto out;

        /* Prepare both edits before committing either of them. */
        ret = __key_unlink_begin(from_keyring, key, &from_edit);
        if (ret < 0)
                goto error;
        ret = __key_link_begin(to_keyring, &key->index_key, &to_edit);
        if (ret < 0)
                goto error;

        /* An exclusive move must not displace an entry in the destination. */
        ret = -EEXIST;
        if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL))
                goto error;

        ret = __key_link_check_restriction(to_keyring, key);
        if (ret < 0)
                goto error;
        ret = __key_link_check_live_key(to_keyring, key);
        if (ret < 0)
                goto error;

        /* Nothing can fail from here on: commit both edits. */
        __key_unlink(from_keyring, key, &from_edit);
        __key_link(to_keyring, key, &to_edit);
error:
        /*
         * Shared by the success and failure paths once both keyrings are
         * locked; the edit pointers are passed on as the steps above left
         * them.
         */
        __key_link_end(to_keyring, &key->index_key, to_edit);
        __key_unlink_end(from_keyring, key, from_edit);
out:
        kleave(" = %d", ret);
        return ret;
}
EXPORT_SYMBOL(key_move);

/**
 * keyring_clear - Clear a keyring
 * @keyring: The keyring to clear.
 *
 * Clear the contents of the specified keyring.
 *
 * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring.
 */
int keyring_clear(struct key *keyring)
{
        struct assoc_array_edit *edit;
        int ret;

        if (keyring->type != &key_type_keyring)
                return -ENOTDIR;

        down_write(&keyring->sem);

        edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops);
        if (IS_ERR(edit)) {
                ret = PTR_ERR(edit);
        } else {
                if (edit)
                        assoc_array_apply_edit(edit);
                notify_key(keyring, NOTIFY_KEY_CLEARED, 0);
                key_payload_reserve(keyring, 0);
                ret = 0;
        }

        up_write(&keyring->sem);
        return ret;
}
EXPORT_SYMBOL(keyring_clear);

/*
 * Dispose of the links from a revoked keyring.
 *
 * This is called with the key sem write-locked.
 */
static void keyring_revoke(struct key *keyring)
{
        struct assoc_array_edit *edit;

        edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops);
        if (!IS_ERR(edit)) {
                if (edit)
                        assoc_array_apply_edit(edit);
                key_payload_reserve(keyring, 0);
        }
}

/*
 * Selection callback passed to assoc_array_gc() by keyring_gc().
 *
 * @iterator_data points at the time64_t limit handed to key_is_dead().
 * Returns false for a dead key; otherwise takes a reference on the key with
 * key_get() and returns true.
 */
static bool keyring_gc_select_iterator(void *object, void *iterator_data)
{
        time64_t *cutoff = iterator_data;
        struct key *candidate = keyring_ptr_to_key(object);
        bool keep = !key_is_dead(candidate, *cutoff);

        if (keep)
                key_get(candidate);
        return keep;
}

/*
 * Scan callback passed to assoc_array_iterate() by keyring_gc().
 *
 * @iterator_data points at the time64_t limit handed to key_is_dead().
 * Returns the result of key_is_dead() for the key behind @object.
 */
static int keyring_gc_check_iterator(const void *object, void *iterator_data)
{
        time64_t *cutoff = iterator_data;
        const struct key *candidate = keyring_ptr_to_key(object);

        key_check(candidate);
        return key_is_dead(candidate, *cutoff);
}

/*
 * Garbage collect pointers from a keyring.
 *
 * Not called with any locks held.  The keyring's key struct will not be
 * deallocated under us as only our caller may deallocate it.
 *
 * The keyring is first scanned under the RCU read lock; the write lock is
 * only taken, and the array only collected, if that scan reports a dead key.
 */
void keyring_gc(struct key *keyring, time64_t limit)
{
        int found_dead = 0;

        kenter("%x{%s}", keyring->serial, keyring->description ?: "");

        /* Invalidated and revoked keyrings are not scanned at all. */
        if (!(keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
                                (1 << KEY_FLAG_REVOKED)))) {
                /* scan the keyring looking for dead keys */
                rcu_read_lock();
                found_dead = assoc_array_iterate(&keyring->keys,
                                                 keyring_gc_check_iterator,
                                                 &limit);
                rcu_read_unlock();
        }

        if (found_dead != true) {
                kleave(" [no gc]");
                return;
        }

        down_write(&keyring->sem);
        assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops,
                       keyring_gc_select_iterator, &limit);
        up_write(&keyring->sem);
        kleave(" [gc]");
}

/*
 * Garbage collect restriction pointers from a keyring.
 *
 * Keyring restrictions are associated with a key type, and must be cleaned
 * up if the key type is unregistered. The restriction is altered to always
 * reject additional keys so a keyring cannot be opened up by unregistering
 * a key type.
 *
 * Not called with any keyring locks held. The keyring's key struct will not
 * be deallocated under us as only our caller may deallocate it.
 *
 * The caller is required to hold key_types_sem and dead_type->sem. This is
 * fulfilled by key_gc_keytype() holding the locks on behalf of
 * key_garbage_collector(), which it invokes on a workqueue.
 *
 * Nothing is done if @dead_type is NULL, if the keyring has no restriction,
 * or if its restriction belongs to a different key type.
 */
void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type)
{
        struct key_restriction *keyres;

        kenter("%x{%s}", keyring->serial, keyring->description ?: "");

        /*
         * keyring->restrict_link is only assigned at key allocation time
         * or with the key type locked, so the only values that could be
         * concurrently assigned to keyring->restrict_link are for key
         * types other than dead_type. Given this, it's ok to check
         * the key type before acquiring keyring->sem.
         */
        if (!dead_type || !keyring->restrict_link ||
            keyring->restrict_link->keytype != dead_type) {
                kleave(" [no restriction gc]");
                return;
        }

        /* Lock the keyring to ensure that a link is not in progress */
        down_write(&keyring->sem);

        keyres = keyring->restrict_link;

        /* Switch the restriction's check over to restrict_link_reject. */
        keyres->check = restrict_link_reject;

        /* Release the restriction's key and clear its key and type pointers. */
        key_put(keyres->key);
        keyres->key = NULL;
        keyres->keytype = NULL;

        up_write(&keyring->sem);

        kleave(" [restriction gc]");
}












































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_STAT_H
#define BLK_STAT_H

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/ktime.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>

/**
 * struct blk_stat_callback - Block statistics callback.
 *
 * A &struct blk_stat_callback is associated with a &struct request_queue. While
 * @timer is active, that queue's request completion latencies are sorted into
 * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
 * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
 */
struct blk_stat_callback {
        /**
         * @list: RCU list of callbacks for a &struct request_queue.
         */
        struct list_head list;

        /**
         * @timer: Timer for the next callback invocation.
         */
        struct timer_list timer;

        /**
         * @cpu_stat: Per-cpu statistics buckets.
         */
        struct blk_rq_stat __percpu *cpu_stat;

        /**
         * @bucket_fn: Given a request, returns which statistics bucket it
         * should be accounted under. Return -1 for no bucket for this
         * request.
         */
        int (*bucket_fn)(const struct request *);

        /**
         * @buckets: Number of statistics buckets.
         */
        unsigned int buckets;

        /**
         * @stat: Array of statistics buckets.
         */
        struct blk_rq_stat *stat;

        /**
         * @timer_fn: Callback function, invoked when @timer fires.
         */
        void (*timer_fn)(struct blk_stat_callback *);

        /**
         * @data: Private pointer for the user.
         */
        void *data;

        /**
         * @rcu: RCU head for this callback (presumably used to defer freeing
         * it; confirm against the implementation).
         */
        struct rcu_head rcu;
};

struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);

void blk_stat_add(struct request *rq, u64 now);

/* Record time/size info in requests without adding a callback. */
void blk_stat_enable_accounting(struct request_queue *q);

/**
 * blk_stat_alloc_callback() - Allocate a block statistics callback.
 * @timer_fn: Timer callback function.
 * @bucket_fn: Bucket callback function.
 * @buckets: Number of statistics buckets.
 * @data: Value for the @data field of the &struct blk_stat_callback.
 *
 * See &struct blk_stat_callback for details on the callback functions.
 *
 * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
 */
struct blk_stat_callback *
blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
                        int (*bucket_fn)(const struct request *),
                        unsigned int buckets, void *data);

/**
 * blk_stat_add_callback() - Add a block statistics callback to be run on a
 * request queue.
 * @q: The request queue.
 * @cb: The callback.
 *
 * Note that a single &struct blk_stat_callback can only be added to a single
 * &struct request_queue.
 */
void blk_stat_add_callback(struct request_queue *q,
                           struct blk_stat_callback *cb);

/**
 * blk_stat_remove_callback() - Remove a block statistics callback from a
 * request queue.
 * @q: The request queue.
 * @cb: The callback.
 *
 * When this returns, the callback is not running on any CPUs and will not be
 * called again unless readded.
 */
void blk_stat_remove_callback(struct request_queue *q,
                              struct blk_stat_callback *cb);

/**
 * blk_stat_free_callback() - Free a block statistics callback.
 * @cb: The callback.
 *
 * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
 * not be associated with a request queue. I.e., if it was previously added with
 * blk_stat_add_callback(), it must also have been removed since then with
 * blk_stat_remove_callback().
 */
void blk_stat_free_callback(struct blk_stat_callback *cb);

/**
 * blk_stat_is_active() - Check if a block statistics callback is currently
 * gathering statistics.
 * @cb: The callback.
 *
 * Return: true if @cb's timer is pending, false otherwise.
 */
static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
{
        struct timer_list *window = &cb->timer;

        return timer_pending(window);
}

/**
 * blk_stat_activate_nsecs() - Gather block statistics during a time window in
 * nanoseconds.
 * @cb: The callback.
 * @nsecs: Number of nanoseconds to gather statistics for.
 *
 * Arms @cb's timer to expire @nsecs (converted to jiffies) from now. The
 * timer callback will be called when the window expires.
 */
static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
                                           u64 nsecs)
{
        const unsigned long expires = jiffies + nsecs_to_jiffies(nsecs);

        mod_timer(&cb->timer, expires);
}

/**
 * blk_stat_deactivate() - Stop the current statistics window.
 * @cb: The callback.
 *
 * Removes @cb's timer with del_timer_sync().
 */
static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
{
        struct timer_list *window = &cb->timer;

        del_timer_sync(window);
}

/**
 * blk_stat_activate_msecs() - Gather block statistics during a time window in
 * milliseconds.
 * @cb: The callback.
 * @msecs: Number of milliseconds to gather statistics for.
 *
 * Arms @cb's timer to expire @msecs (converted to jiffies) from now. The
 * timer callback will be called when the window expires.
 */
static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
                                           unsigned int msecs)
{
        const unsigned long expires = jiffies + msecs_to_jiffies(msecs);

        mod_timer(&cb->timer, expires);
}

void blk_rq_stat_add(struct blk_rq_stat *, u64);
void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
void blk_rq_stat_init(struct blk_rq_stat *);

#endif
































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UIDGID_H
#define _LINUX_UIDGID_H

/*
 * A set of types for the internal kernel types representing uids and gids.
 *
 * The types defined in this header allow distinguishing which uids and gids in
 * the kernel are values used by userspace and which uid and gid values are
 * the internal kernel values.  With the addition of user namespaces the values
 * can be different.  Using the type system makes it possible for the compiler
 * to detect when we overlook these differences.
 *
 */
#include <linux/types.h>
#include <linux/highuid.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

/*
 * Kernel-internal uid.  The raw uid_t is wrapped in a struct so that it is a
 * distinct type from the uid_t values used by userspace.
 */
typedef struct {
        uid_t val;
} kuid_t;


/*
 * Kernel-internal gid.  The raw gid_t is wrapped in a struct so that it is a
 * distinct type from the gid_t values used by userspace.
 */
typedef struct {
        gid_t val;
} kgid_t;

/* Build a kuid_t / kgid_t compound literal whose val member is @value. */
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }

#ifdef CONFIG_MULTIUSER
/* Unwrap the raw uid_t stored in a kuid_t. */
static inline uid_t __kuid_val(kuid_t uid)
{
        const uid_t raw = uid.val;

        return raw;
}

/* Unwrap the raw gid_t stored in a kgid_t. */
static inline gid_t __kgid_val(kgid_t gid)
{
        const gid_t raw = gid.val;

        return raw;
}
#else
/* Without CONFIG_MULTIUSER the stored value is ignored: every uid reads as 0. */
static inline uid_t __kuid_val(kuid_t uid)
{
        const uid_t only_uid = 0;

        return only_uid;
}

/* Without CONFIG_MULTIUSER the stored value is ignored: every gid reads as 0. */
static inline gid_t __kgid_val(kgid_t gid)
{
        const gid_t only_gid = 0;

        return only_gid;
}
#endif

/* The id value 0 wrapped in the kernel-internal uid/gid types. */
#define GLOBAL_ROOT_UID KUIDT_INIT(0)
#define GLOBAL_ROOT_GID KGIDT_INIT(0)

/*
 * Sentinel ids built from -1.  uid_valid() and gid_valid() compare against
 * (uid_t) -1 and (gid_t) -1 respectively.
 */
#define INVALID_UID KUIDT_INIT(-1)
#define INVALID_GID KGIDT_INIT(-1)

/* Return true if @left and @right unwrap to the same uid value. */
static inline bool uid_eq(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs == rhs;
}

/* Return true if @left and @right unwrap to the same gid value. */
static inline bool gid_eq(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs == rhs;
}

/* Return true if the uid value of @left is strictly greater than @right's. */
static inline bool uid_gt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs > rhs;
}

/* Return true if the gid value of @left is strictly greater than @right's. */
static inline bool gid_gt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs > rhs;
}

/* Return true if the uid value of @left is greater than or equal to @right's. */
static inline bool uid_gte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs >= rhs;
}

/* Return true if the gid value of @left is greater than or equal to @right's. */
static inline bool gid_gte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs >= rhs;
}

/* Return true if the uid value of @left is strictly less than @right's. */
static inline bool uid_lt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs < rhs;
}

/* Return true if the gid value of @left is strictly less than @right's. */
static inline bool gid_lt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs < rhs;
}

/* Return true if the uid value of @left is less than or equal to @right's. */
static inline bool uid_lte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs <= rhs;
}

/* Return true if the gid value of @left is less than or equal to @right's. */
static inline bool gid_lte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs <= rhs;
}

/* Return true unless @uid unwraps to the invalid value (uid_t) -1. */
static inline bool uid_valid(kuid_t uid)
{
        const uid_t invalid = (uid_t) -1;

        return __kuid_val(uid) != invalid;
}

/* Return true unless @gid unwraps to the invalid value (gid_t) -1. */
static inline bool gid_valid(kgid_t gid)
{
        const gid_t invalid = (gid_t) -1;

        return __kgid_val(gid) != invalid;
}

#ifdef CONFIG_USER_NS

extern kuid_t make_kuid(struct user_namespace *from, uid_t uid);
extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);

extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);

/* Return true if from_kuid() yields something other than (uid_t) -1 for @uid in @ns. */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        const uid_t mapped = from_kuid(ns, uid);

        return mapped != (uid_t) -1;
}

/* Return true if from_kgid() yields something other than (gid_t) -1 for @gid in @ns. */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        const gid_t mapped = from_kgid(ns, gid);

        return mapped != (gid_t) -1;
}

#else

static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
{
        return KUIDT_INIT(uid);
}

static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid)
{
        return KGIDT_INIT(gid);
}

/* !CONFIG_USER_NS: unwrap @kuid directly; @to is not consulted. */
static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid)
{
        const uid_t raw = __kuid_val(kuid);

        return raw;
}

/* !CONFIG_USER_NS: unwrap @kgid directly; @to is not consulted. */
static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
{
        const gid_t raw = __kgid_val(kgid);

        return raw;
}

/*
 * Like from_kuid(), except that a result of (uid_t) -1 is replaced by
 * overflowuid.
 */
static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
{
        const uid_t mapped = from_kuid(to, kuid);

        if (mapped != (uid_t)-1)
                return mapped;
        return overflowuid;
}

/*
 * Like from_kgid(), except that a result of (gid_t) -1 is replaced by
 * overflowgid.
 */
static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid)
{
        const gid_t mapped = from_kgid(to, kgid);

        if (mapped != (gid_t)-1)
                return mapped;
        return overflowgid;
}

/* !CONFIG_USER_NS: a uid has a mapping exactly when it is valid; @ns is not consulted. */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        const bool mapped = uid_valid(uid);

        return mapped;
}

/* !CONFIG_USER_NS: a gid has a mapping exactly when it is valid; @ns is not consulted. */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        const bool mapped = gid_valid(gid);

        return mapped;
}

#endif /* CONFIG_USER_NS */

#endif /* _LINUX_UIDGID_H */

































































































































































    2 


    2 






















































    3 



    3 

    2 






    3 

    3 














    3 
    3 










    1 






















    3 
    1 




    3 






























































































































































    3 








    3 





    3 



    2 



    2 


    2 













    3 
    2 
    3 
    2 





    2 



    3 



    3 




    3 
    1 
    1 
    1 



    3 





    3 












    3 
    3 

    3 
    1 
    1 
    1 



    3 





    3 
    3 
    3 
    3 


    3 





    3 





















    1 





    1 

    1 













    3 



    3 


    1 

    1 
    1 

    1 
    1 
    1 
    1 












    3 


    3 
    3 
    3 


    3 














































    3 























    3 

























































































































































































































































    1 










    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








































    1 
















    1 





















































    4 













    4 








    4 

    3 
    4 




    4 
    4 


    4 




    4 


    1 



    2 








    4 


    2 
    4 







    4 

    1 

    1 












    4 

    2 

    2 

    2 








    4 


    1 



    2 



    1 












    2 



    2 




    2 

















    3 
    2 
    3 

    3 
    3 

    3 




    3 




    3 


    3 




















































    4 







    4 







    4 

    4 






    3 
    3 



    3 




    3 









    1 
    1 






    1 
    1 

    1 









    2 



    1 


    1 






















    2 


    2 


    1 























    2 


    3 


    3 




    4 
    4 
    4 





    4 




















































    1 




































































    2 

























































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/lib/vsprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
 * Wirzenius wrote this portably, Torvalds fucked it up :-)
 */

/*
 * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
 * - changed to provide snprintf and vsnprintf functions
 * So Feb  1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
 * - scnprintf and vscnprintf
 */

#include <stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h>        /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif

#include "../mm/internal.h"        /* For the trace_print_flags arrays */

#include <asm/page.h>                /* for PAGE_SIZE */
#include <asm/byteorder.h>        /* cpu_to_le16 */

#include <linux/string_helpers.h>
#include "kstrtox.h"

/*
 * Parse an unsigned number out of at most @max_chars characters of @startp.
 *
 * _parse_integer_fixup_radix() is given the chance to adjust @base and to
 * advance past a radix prefix (presumably "0x"/"0" handling - confirm against
 * kstrtox).  If the field is already used up by that prefix, nothing is
 * converted and the whole field is skipped.
 *
 * If @endp is not NULL, *@endp is set to the first character after the
 * parsed (or skipped) text.  Returns the converted value, or 0 when nothing
 * was converted.  Overflow is not reported: the KSTRTOX_OVERFLOW flag is
 * simply masked out of the consumed-character count.
 */
static unsigned long long simple_strntoull(const char *startp, size_t max_chars,
                                           char **endp, unsigned int base)
{
        unsigned long long value = 0ULL;
        const char *digits = _parse_integer_fixup_radix(startp, &base);
        size_t prefix_len = digits - startp;
        const char *stop;

        if (prefix_len >= max_chars) {
                /* Field too short for prefix + digit, skip over without converting */
                stop = startp + max_chars;
        } else {
                unsigned int consumed;

                consumed = _parse_integer_limit(digits, base, &value,
                                                max_chars - prefix_len);
                /* FIXME */
                stop = digits + (consumed & ~KSTRTOX_OVERFLOW);
        }

        if (endp)
                *endp = (char *)stop;

        return value;
}

/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 *        (may be NULL when the caller does not need it)
 * @base: The number base to use
 *
 * Thin wrapper around simple_strntoull() with the field length capped at
 * INT_MAX characters.
 *
 * This function has caveats. Please use kstrtoull instead.
 *
 * Return: The converted value. Overflow is not reported to the caller.
 */
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        return simple_strntoull(cp, INT_MAX, endp, base);
}
EXPORT_SYMBOL(simple_strtoull);

/**
 * simple_strtoul - convert a string to an unsigned long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoul instead.
 *
 * Return: The result of simple_strtoull() converted to unsigned long, i.e.
 * silently truncated wherever unsigned long is narrower than
 * unsigned long long.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        return simple_strtoull(cp, endp, base);
}
EXPORT_SYMBOL(simple_strtoul);

/**
 * simple_strtol - convert a string to a signed long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * Only a single leading '-' is recognised as a sign; everything after it
 * is handed to simple_strtoul().
 *
 * This function has caveats. Please use kstrtol instead.
 */
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
        int negative = (*cp == '-');
        unsigned long magnitude;

        /* Step over the sign, if there is one, and parse the rest unsigned. */
        magnitude = simple_strtoul(cp + negative, endp, base);

        return negative ? -magnitude : magnitude;
}
EXPORT_SYMBOL(simple_strtol);

/*
 * Signed counterpart of simple_strntoull(): a leading '-' consumes one
 * character of the @max_chars budget and negates the result.
 */
static long long simple_strntoll(const char *cp, size_t max_chars, char **endp,
                                 unsigned int base)
{
        unsigned long long magnitude;

        /*
         * With max_chars == 0 the sign is not honoured and the call is
         * passed straight through; simple_strntoull() copes with a zero
         * length, which is also what it receives for the input "-" with
         * max_chars == 1.
         */
        if (!(*cp == '-' && max_chars > 0))
                return simple_strntoull(cp, max_chars, endp, base);

        magnitude = simple_strntoull(cp + 1, max_chars - 1, endp, base);

        return -magnitude;
}

/**
 * simple_strtoll - convert a string to a signed long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * Thin wrapper around simple_strntoll() with the field length capped at
 * INT_MAX characters.
 *
 * This function has caveats. Please use kstrtoll instead.
 */
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
        return simple_strntoll(cp, INT_MAX, endp, base);
}
EXPORT_SYMBOL(simple_strtoll);

/*
 * Convert the run of decimal digits at *@s to an int and advance *@s past
 * it.  The first character is consumed unconditionally, so the caller must
 * already know that it is a digit.  No overflow checking is done.
 */
static noinline_for_stack
int skip_atoi(const char **s)
{
        const char *p = *s;
        int value = 0;

        do {
                value = value*10 + *p - '0';
                p++;
        } while (isdigit(*p));

        *s = p;
        return value;
}

/*
 * Decimal conversion is by far the most typical, and is used for
 * /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed by emitting
 * two characters at a time, using a 200 byte lookup table. This
 * roughly halves the number of multiplications compared to computing
 * the digits one at a time. Implementation strongly inspired by the
 * previous version, which in turn used ideas described at
 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
 * from the author, Douglas W. Jones).
 *
 * It turns out there is precisely one 26 bit fixed-point
 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[x], for 0 <= x <= 99, holds the two ASCII digits of x packed into
 * a u16: '0' + x % 10 in the low byte and '0' + x / 10 in the high byte.
 * cpu_to_le16() fixes the in-memory layout, so storing an entry through a
 * u16 pointer always puts the ones digit in the first byte and the tens
 * digit in the second - the least-significant-first order in which the
 * put_dec*() helpers fill their buffers.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * Emit the decimal digits of @r (expected range 1 <= r < 10^8) into @buf,
 * least significant digit first and without leading zeros, and return a
 * pointer just past the last digit written.
 *
 * Digits are stored two at a time through a u16 store of a decpair[]
 * entry, so @buf needs 2-byte alignment, and when the final pair has only
 * one significant digit the byte right after the returned position is
 * written as well (with '0').
 *
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
        unsigned q;

        /* 1 <= r < 10^8 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^8 */
        /* q = r / 100 via fixed-point multiply; r - 100*q is r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 1 <= q < 10^6 */
        if (q < 100)
                goto out_q;

        /*  100 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 1 <= r < 10^4 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^4 */
        /* cheaper division by 100, valid for this smaller range */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
out_q:
        /* 1 <= q < 100 */
        r = q;
out_r:
        /* 1 <= r < 100 */
        /* always stores two bytes, but only counts the tens digit if non-zero */
        *((u16 *)buf) = decpair[r];
        buf += r < 10 ? 1 : 2;
        return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * Emit exactly eight decimal digits of @r (0 <= r < 10^8) into @buf, least
 * significant digit first, zero-padded, and return @buf + 8.  The digits
 * are stored as four u16 decpair[] entries, so @buf needs 2-byte alignment.
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^8 */
        /* q = r / 100 via fixed-point multiply; r - 100*q is r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
        buf += 2;
        return buf;
}

/*
 * Emit the decimal digits of @n into @buf, least significant digit first,
 * and return a pointer just past the last digit.  The value is peeled off
 * in groups of eight digits: do_div() leaves the quotient in @n and yields
 * the remainder, which is written zero-padded by put_dec_full8(); the most
 * significant group goes through put_dec_trunc8() so that no leading zeros
 * are produced.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n <= 184467440737 (~1.8e11), i.e. (2^64 - 1) / 10^8 at most */
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * Store exactly four decimal digits of @r (0 <= r < 10^4) at @buf, least
 * significant digit first and zero-padded, as two u16 decpair[] entries.
 * Nothing is returned; the caller knows four bytes were written.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^4 */
        /* q = r / 100; r - 100*q is r % 100 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
}

/*
 * Call put_dec_full4 on x % 10000, return x / 10000.
 * The quotient is computed without a division instruction:
 * the approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
 * (second call in the put_dec code, assuming n is all-ones).
 */
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;

        /* x - q * 10000 is the remainder, i.e. the low four decimal digits */
        put_dec_full4(buf, x - q * 10000);
        return q;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * Emits the decimal digits of @n into @buf, least significant digit first,
 * and returns a pointer just past the last digit.  @n is split into four
 * 16-bit pieces d3..d0; each group of four decimal digits is then the sum
 * of the pieces weighted by the matching four-digit slice of 2^16, 2^32 and
 * 2^48, plus the carry (q) out of the previous group.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
        uint32_t d3, d2, d1, q, h;

        /* small values need none of the machinery below */
        if (n < 100*1000*1000)
                return put_dec_trunc8(buf, n);

        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
        h   = (n >> 32);
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */

        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        /* decimal digits 0..3; put_dec_helper4() returns the carry */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        q = put_dec_helper4(buf, q);

        /* decimal digits 4..7 */
        q += 7671 * d3 + 9496 * d2 + 6 * d1;
        q = put_dec_helper4(buf+4, q);

        /* decimal digits 8..11 */
        q += 4749 * d3 + 42 * d2;
        q = put_dec_helper4(buf+8, q);

        /* whatever is left forms the most significant digits */
        q += 281 * d3;
        buf += 12;
        if (q)
                buf = put_dec_trunc8(buf, q);
        /* otherwise drop the zero padding at the most significant end */
        else while (buf[-1] == '0')
                --buf;

        return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[sizeof(num) * 3] __aligned(2);
        int idx, len;

        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
        if (num <= 9) {
                tmp[0] = '0' + num;
                len = 1;
        } else {
                len = put_dec(tmp, num) - tmp;
        }

        if (len > size || width > size)
                return 0;

        if (width > len) {
                width = width - len;
                for (idx = 0; idx < width; idx++)
                        buf[idx] = ' ';
        } else {
                width = 0;
        }

        for (idx = 0; idx < len; ++idx)
                buf[idx + width] = tmp[len - idx - 1];

        return len + width;
}

/*
 * Flag bits kept in printf_spec.flags and interpreted by number() and the
 * string padding helpers.  ZEROPAD and SMALL have fixed values because
 * number() uses them arithmetically: ' ' + ZEROPAD yields '0', and ORing
 * SMALL into an upper-case hex digit lower-cases it.  The static_asserts
 * below pin those two values.
 */
#define SIGN        1                /* unsigned/signed, must be 1 */
#define LEFT        2                /* left justified */
#define PLUS        4                /* show plus */
#define SPACE        8                /* space if plus */
#define ZEROPAD        16                /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL        32                /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL        64                /* prefix hex with "0x", octal with "0" */

static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ' ');

/*
 * Classification of one piece of a format string.  The value is stored in
 * the 8-bit printf_spec.type field.  The entries from FORMAT_TYPE_LONG_LONG
 * onwards name integer argument types.
 * NOTE(review): the code that produces and consumes these values lies
 * outside this part of the file - confirm any ordering assumptions there
 * before inserting or reordering entries.
 */
enum format_type {
        FORMAT_TYPE_NONE, /* Just a string part */
        FORMAT_TYPE_WIDTH,
        FORMAT_TYPE_PRECISION,
        FORMAT_TYPE_CHAR,
        FORMAT_TYPE_STR,
        FORMAT_TYPE_PTR,
        FORMAT_TYPE_PERCENT_CHAR,
        FORMAT_TYPE_INVALID,
        FORMAT_TYPE_LONG_LONG,
        FORMAT_TYPE_ULONG,
        FORMAT_TYPE_LONG,
        FORMAT_TYPE_UBYTE,
        FORMAT_TYPE_BYTE,
        FORMAT_TYPE_USHORT,
        FORMAT_TYPE_SHORT,
        FORMAT_TYPE_UINT,
        FORMAT_TYPE_INT,
        FORMAT_TYPE_SIZE_T,
        FORMAT_TYPE_PTRDIFF
};

/*
 * One parsed conversion specification.  The bit-fields add up to 64 bits
 * and the struct is packed, so it occupies exactly 8 bytes (checked by the
 * static_assert below); the helpers in this file take it by value.
 * Helpers in this file test field_width and precision against -1 to detect
 * "not specified".
 */
struct printf_spec {
        unsigned int        type:8;                /* format_type enum */
        signed int        field_width:24;        /* width of output field */
        unsigned int        flags:8;        /* flags to number() */
        unsigned int        base:8;                /* number base, 8, 10 or 16 only */
        signed int        precision:16;        /* # of digits/chars */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/* Largest values representable in the signed 24-bit and 16-bit fields above */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * Format the integer @num according to @spec and write it at @buf.
 *
 * @buf:  current output position
 * @end:  first byte past the output buffer; nothing is stored at or
 *        beyond it
 * @num:  value to print; reinterpreted as signed when SIGN is set
 * @spec: base, flags, field width and precision
 *
 * Output order: leading spaces, sign, "0x"/"0" prefix, zero or space
 * padding, extra zeros demanded by the precision, the digits, trailing
 * spaces (left-justified case).
 *
 * Returns the output position after the number.  The position is advanced
 * for every character even when it could not be stored, so the returned
 * pointer may lie beyond @end.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
        int field_width = spec.field_width;
        int precision = spec.precision;

        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
        locase = (spec.flags & SMALL);
        /* left justification wins over zero padding */
        if (spec.flags & LEFT)
                spec.flags &= ~ZEROPAD;
        /* work out the sign character; it uses up one column of the field */
        sign = 0;
        if (spec.flags & SIGN) {
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
                        field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
                        field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
                        field_width--;
                }
        }
        /* reserve room for the prefix: "0x", or "0" for non-zero octal */
        if (need_pfx) {
                if (spec.base == 16)
                        field_width -= 2;
                else if (!is_zero)
                        field_width--;
        }

        /* generate full string in tmp[], in reverse order */
        i = 0;
        if (num < spec.base)
                tmp[i++] = hex_asc_upper[num] | locase;
        else if (spec.base != 10) { /* 8 or 16 */
                /* power-of-two bases: peel digits off with mask and shift */
                int mask = spec.base - 1;
                int shift = 3;

                if (spec.base == 16)
                        shift = 4;
                do {
                        tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
                        num >>= shift;
                } while (num);
        } else { /* base 10 */
                i = put_dec(tmp, num) - tmp;
        }

        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
                precision = i;
        /* leading space padding */
        field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
        }
        /* sign */
        if (sign) {
                if (buf < end)
                        *buf = sign;
                ++buf;
        }
        /* "0x" / "0" prefix */
        if (need_pfx) {
                if (spec.base == 16 || !is_zero) {
                        if (buf < end)
                                *buf = '0';
                        ++buf;
                }
                if (spec.base == 16) {
                        if (buf < end)
                                *buf = ('X' | locase);
                        ++buf;
                }
        }
        /* zero or space padding */
        if (!(spec.flags & LEFT)) {
                /* ZEROPAD == '0' - ' ', so this is '0' or ' ' */
                char c = ' ' + (spec.flags & ZEROPAD);

                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
        while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
        }
        /* actual digits of result */
        while (--i >= 0) {
                if (buf < end)
                        *buf = tmp[i];
                ++buf;
        }
        /* trailing space padding */
        while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
        }

        return buf;
}

static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
        struct printf_spec spec;

        spec.type = FORMAT_TYPE_PTR;
        spec.field_width = 2 + 2 * size;        /* 0x + hex */
        spec.flags = SPECIAL | SMALL | ZEROPAD;
        spec.base = 16;
        spec.precision = -1;

        return number(buf, end, num, spec);
}

/*
 * Shift the @len characters at @buf to the right by @spaces positions and
 * fill the gap with blanks, never touching anything at or beyond @end.
 * Characters that would be pushed past @end are dropped; if the padding
 * alone fills the space up to @end, only blanks are written.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
        size_t room, keep;

        /* nowhere to put anything */
        if (buf >= end)
                return;

        room = end - buf;
        if (spaces >= room) {
                memset(buf, ' ', room);
                return;
        }

        /* at most room - spaces characters survive the shift */
        keep = room - spaces;
        if (len < keep)
                keep = len;
        if (keep)
                memmove(buf + spaces, buf, keep);

        memset(buf, ' ', spaces);
}

/*
 * Pad a string that has just been written out to the requested field width.
 * @buf: current buffer position, i.e. just past the string
 * @n: length of string
 * @end: end of output buffer
 * @spec: for field width and flags
 *
 * With the LEFT flag the blanks are appended; otherwise the string is
 * shifted right and the blanks are placed in front of it.
 * Returns: new buffer position after padding.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
        unsigned pad;

        if (likely(spec.field_width <= n))
                return buf;

        /* we want to pad the sucker */
        pad = spec.field_width - n;

        if (spec.flags & LEFT) {
                for (; pad; pad--, buf++) {
                        if (buf < end)
                                *buf = ' ';
                }
                return buf;
        }

        move_right(buf - n, end, n, pad);
        return buf + pad;
}

/*
 * Handle string from a well known address: @s is copied without any
 * pointer validation.  Copying stops at the NUL terminator or once
 * spec.precision characters have been taken (a precision of -1 does not
 * limit the copy in practice).  Characters that do not fit before @end
 * are counted but not stored.  The result is then padded to the field
 * width by widen_string().
 */
static char *string_nocheck(char *buf, char *end, const char *s,
                            struct printf_spec spec)
{
        int budget = spec.precision;
        int copied = 0;

        for (; budget-- && *s; s++, buf++, copied++) {
                if (buf < end)
                        *buf = *s;
        }

        return widen_string(buf, copied, end, spec);
}

/*
 * Print the error code carried by the error pointer @ptr: as its symbolic
 * name when errname() knows one, as a signed decimal number otherwise.
 */
static char *err_ptr(char *buf, char *end, void *ptr,
                     struct printf_spec spec)
{
        int code = PTR_ERR(ptr);
        const char *name = errname(code);

        if (!name) {
                /*
                 * Somebody passed ERR_PTR(-1234) or some other non-existing
                 * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
                 * printing it as its decimal representation.
                 */
                spec.base = 10;
                spec.flags |= SIGN;
                return number(buf, end, code, spec);
        }

        return string_nocheck(buf, end, name, spec);
}

/*
 * Print the error message @s in place of a value that could not be
 * formatted.
 *
 * Be careful: error messages must fit into the given buffer.
 */
static char *error_string(char *buf, char *end, const char *s,
                          struct printf_spec spec)
{
        /*
         * Hard limit to avoid completely insane messages: when no
         * precision was given, cap the message at 2 * sizeof(void *)
         * characters. It actually works pretty well because most error
         * messages are in the many pointer format modifiers.
         */
        if (spec.precision == -1)
                spec.precision = 2 * sizeof(void *);

        return string_nocheck(buf, end, s, spec);
}

/*
 * Classify @ptr before it gets dereferenced.  Returns NULL when the pointer
 * looks usable, "(null)" for a NULL pointer and "(efault)" for an address
 * within the first page or in the error-pointer range.
 *
 * Do not call any complex external code here. Nested printk()/vsprintf()
 * might cause infinite loops. Failures might break printk() and would
 * be hard to debug.
 */
static const char *check_pointer_msg(const void *ptr)
{
        unsigned long addr = (unsigned long)ptr;

        if (!ptr)
                return "(null)";

        if (addr >= PAGE_SIZE && !IS_ERR_VALUE(ptr))
                return NULL;

        return "(efault)";
}

/*
 * Validate @ptr with check_pointer_msg().  For a bad pointer the matching
 * message is printed at *@buf, *@buf is advanced past it and -EFAULT is
 * returned; for a usable pointer nothing is printed and 0 is returned.
 */
static int check_pointer(char **buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const char *msg = check_pointer_msg(ptr);

        if (!msg)
                return 0;

        *buf = error_string(*buf, end, msg, spec);
        return -EFAULT;
}

/*
 * Print the string @s after validating the pointer; for a bad pointer the
 * message emitted by check_pointer() is all that gets printed.
 */
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
             struct printf_spec spec)
{
        if (!check_pointer(&buf, end, s, spec))
                buf = string_nocheck(buf, end, s, spec);

        return buf;
}

/*
 * Print the raw value of @ptr as lower-case hexadecimal. Without an
 * explicit field width the value is zero-padded to the full width of a
 * pointer (two hex digits per byte).
 */
static char *pointer_string(char *buf, char *end,
                            const void *ptr,
                            struct printf_spec spec)
{
        if (spec.field_width == -1) {
                spec.flags |= ZEROPAD;
                spec.field_width = 2 * sizeof(ptr);
        }
        spec.flags |= SMALL;
        spec.base = 16;

        return number(buf, end, (unsigned long int)ptr, spec);
}

/*
 * Make pointers available for printing early in the boot sequence.
 *
 * When non-zero, ptr_to_id() hashes pointers with hash_long() instead of
 * the keyed siphash, so it does not have to wait for ptr_key to be filled.
 */
static int debug_boot_weak_hash __ro_after_init;

/*
 * Handler for the "debug_boot_weak_hash" early parameter: switches the
 * weak hash on and logs that it did so. @str is not used.
 * Always returns 0.
 */
static int __init debug_boot_weak_hash_enable(char *str)
{
        debug_boot_weak_hash = 1;
        pr_info("debug_boot_weak_hash enabled\n");
        return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);

/*
 * Defined in the "true" state; __ptr_to_hashval() returns -EAGAIN for as
 * long as this key is still enabled. It is disabled once ptr_key has been
 * filled with random bytes.
 */
static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
/* siphash key used by __ptr_to_hashval() to hash pointer values. */
static siphash_key_t ptr_key __read_mostly;

/*
 * Fill ptr_key with random bytes and then flip the static key so that
 * pointer hashing starts using it. @work is not used. The key is written
 * before the branch is disabled.
 */
static void enable_ptr_key_workfn(struct work_struct *work)
{
        get_random_bytes(&ptr_key, sizeof(ptr_key));
        /* Needs to run from preemptible context */
        static_branch_disable(&not_filled_random_ptr_key);
}

/* Work item that lets fill_random_ptr_key() defer the above to a workqueue. */
static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn);

/*
 * Notifier callback registered by initialize_ptr_random() via
 * register_random_ready_notifier(). It does not fill the key itself;
 * it only queues enable_ptr_key_work on system_unbound_wq.
 * @nb, @action and @data are not used. Always returns 0.
 */
static int fill_random_ptr_key(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        /* This may be in an interrupt handler. */
        queue_work(system_unbound_wq, &enable_ptr_key_work);
        return 0;
}

/* Notifier block handed to register_random_ready_notifier(). */
static struct notifier_block random_ready = {
        .notifier_call = fill_random_ptr_key
};

/*
 * Early initcall that arranges for ptr_key to be filled.
 *
 * Preference order:
 *  1. the architecture RNG, if it can deliver the whole key right now;
 *  2. a random-ready notifier, which fills the key later from a work item;
 *  3. if the notifier reports -EALREADY, fill the key immediately.
 *
 * Returns 0 on success or the error from register_random_ready_notifier().
 */
static int __init initialize_ptr_random(void)
{
        int key_size = sizeof(ptr_key);
        int ret;

        /* Use hw RNG if available. */
        if (get_random_bytes_arch(&ptr_key, key_size) == key_size) {
                static_branch_disable(&not_filled_random_ptr_key);
                return 0;
        }

        ret = register_random_ready_notifier(&random_ready);
        if (ret == -EALREADY) {
                /* This is in preemptible context */
                enable_ptr_key_workfn(&enable_ptr_key_work);
                ret = 0;
        }

        return ret;
}
early_initcall(initialize_ptr_random);

/*
 * Maps a pointer to a 32 bit unique identifier.
 *
 * Returns -EAGAIN (leaving *@hashval_out untouched) while ptr_key has not
 * been filled yet, otherwise stores the hash and returns 0.
 */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        if (static_branch_unlikely(&not_filled_random_ptr_key))
                return -EAGAIN;

#ifdef CONFIG_64BIT
        /*
         * Keep only the low 32 bits of the hash: this makes explicit that
         * the address was modified (and 32 bits is plenty for a unique ID).
         */
        *hashval_out = (unsigned long)siphash_1u64((u64)ptr, &ptr_key) &
                       0xffffffff;
#else
        *hashval_out = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
        return 0;
}

/*
 * Non-static entry point for the pointer hash: forwards to
 * __ptr_to_hashval() and returns its result unchanged (0 with
 * *@hashval_out set, or -EAGAIN while the hash key is not ready).
 */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        return __ptr_to_hashval(ptr, hashval_out);
}

/*
 * Print a hashed identifier for @ptr instead of its real value.
 *
 * NULL and error pointers are printed as-is. With debug_boot_weak_hash
 * set, a non-cryptographic hash is used. Otherwise the keyed hash is
 * used; while its key is not ready a fixed placeholder is printed.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
                       struct printf_spec spec)
{
        unsigned long hashval;

        /*
         * Print the real pointer value for NULL and error pointers,
         * as they are not actual addresses.
         */
        if (IS_ERR_OR_NULL(ptr))
                return pointer_string(buf, end, ptr, spec);

        if (unlikely(debug_boot_weak_hash)) {
                /* When debugging early boot use non-cryptographically secure hash. */
                hashval = hash_long((unsigned long)ptr, 32);
        } else if (__ptr_to_hashval(ptr, &hashval)) {
                const char *placeholder = sizeof(ptr) == 8 ?
                                          "(____ptrval____)" : "(ptrval)";

                /* string length must be less than default_width */
                spec.field_width = 2 * sizeof(ptr);
                return error_string(buf, end, placeholder, spec);
        }

        return pointer_string(buf, end, (const void *)hashval, spec);
}

/* Policy knob consulted by restricted_pointer() for %pK. */
int kptr_restrict __read_mostly;

/*
 * Print a pointer according to kptr_restrict:
 *   0 - hashed, exactly like plain %p;
 *   1 - real value only for an unmodified-credentials CAP_SYSLOG task,
 *       zeros otherwise ("pK-error" from IRQ/softirq/NMI context);
 *   anything else - always zeros.
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const int mode = kptr_restrict;
        const struct cred *cred;

        /* Handle as %p, hash and do _not_ leak addresses. */
        if (mode == 0)
                return ptr_to_id(buf, end, ptr, spec);

        /* Always print 0's for %pK */
        if (mode != 1)
                return pointer_string(buf, end, NULL, spec);

        /*
         * kptr_restrict==1 cannot be used in IRQ context
         * because its test for CAP_SYSLOG would be meaningless.
         */
        if (in_irq() || in_serving_softirq() || in_nmi()) {
                if (spec.field_width == -1)
                        spec.field_width = 2 * sizeof(ptr);
                return error_string(buf, end, "pK-error", spec);
        }

        /*
         * Only print the real pointer value if the current
         * process has CAP_SYSLOG and is running with the
         * same credentials it started with. This is because
         * access to files is checked at open() time, but %pK
         * checks permission at read() time. We don't want to
         * leak pointer values if a binary opens a file using
         * %pK and then elevates privileges before reading it.
         */
        cred = current_cred();
        if (!has_capability_noaudit(current, CAP_SYSLOG) ||
            !uid_eq(cred->euid, cred->uid) ||
            !gid_eq(cred->egid, cred->gid))
                ptr = NULL;

        return pointer_string(buf, end, ptr, spec);
}

/*
 * Print the name of dentry @d, optionally preceded by up to three parent
 * components separated by '/'.
 *
 * fmt[1] of '2', '3' or '4' selects that many trailing path components;
 * anything else prints just the last one. The parent chain is walked and
 * the names are copied out under rcu_read_lock(). At most spec.precision
 * characters are produced, and the result is padded by widen_string().
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
{
        const char *array[4], *s;
        const struct dentry *p;
        int depth;
        int i, n;

        /* Number of path components requested (1..4). */
        switch (fmt[1]) {
                case '2': case '3': case '4':
                        depth = fmt[1] - '0';
                        break;
                default:
                        depth = 1;
        }

        rcu_read_lock();
        /* Collect names innermost-first: array[0] is @d itself. */
        for (i = 0; i < depth; i++, d = p) {
                if (check_pointer(&buf, end, d, spec)) {
                        rcu_read_unlock();
                        return buf;
                }

                p = READ_ONCE(d->d_parent);
                array[i] = READ_ONCE(d->d_name.name);
                /* A dentry that is its own parent ends the walk. */
                if (p == d) {
                        /*
                         * When reached as an ancestor, use an empty name so
                         * that only the '/' separator is printed for it.
                         */
                        if (i)
                                array[i] = "";
                        i++;
                        break;
                }
        }
        /* Emit outermost component first, walking array[] backwards. */
        s = array[--i];
        for (n = 0; n != spec.precision; n++, buf++) {
                char c = *s++;
                if (!c) {
                        /* End of the innermost name: done. */
                        if (!i)
                                break;
                        c = '/';
                        s = array[--i];
                }
                if (buf < end)
                        *buf = c;
        }
        rcu_read_unlock();
        return widen_string(buf, n, end, spec);
}

/*
 * Print the dentry name(s) of an open file: validates @f, then hands
 * its f_path.dentry to dentry_name() with the same format modifiers.
 */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
                        struct printf_spec spec, const char *fmt)
{
        const struct dentry *dentry;

        if (check_pointer(&buf, end, f, spec) != 0)
                return buf;

        dentry = f->f_path.dentry;
        return dentry_name(buf, end, dentry, spec, fmt);
}
#ifdef CONFIG_BLOCK
/*
 * Print the name of a block device: the disk name, followed for a
 * partition (non-zero bd_partno) by the partition number. If the disk
 * name ends in a digit, a 'p' is inserted before the partition number
 * so the two numbers cannot run together.
 *
 * An empty disk name is handled explicitly: there is no last character
 * to inspect, so no 'p' is inserted and nothing before the start of
 * disk_name is read.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
                struct printf_spec spec, const char *fmt)
{
        struct gendisk *hd;
        size_t name_len;

        if (check_pointer(&buf, end, bdev, spec))
                return buf;

        hd = bdev->bd_disk;
        buf = string(buf, end, hd->disk_name, spec);
        if (bdev->bd_partno) {
                name_len = strlen(hd->disk_name);
                /* Guard name_len so that [name_len - 1] cannot underflow. */
                if (name_len && isdigit(hd->disk_name[name_len - 1])) {
                        if (buf < end)
                                *buf = 'p';
                        buf++;
                }
                buf = number(buf, end, bdev->bd_partno, spec);
        }
        return buf;
}
#endif

/*
 * Print a code address symbolically.
 *
 * With CONFIG_KALLSYMS the first format character picks the flavour:
 * 'B' for backtrace style, 's' for the bare symbol name, anything else
 * for symbol plus offset. A second character of 'R' first passes the
 * pointer through __builtin_extract_return_addr(). Without
 * CONFIG_KALLSYMS the address is printed as a hex number.
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
                    struct printf_spec spec, const char *fmt)
{
#ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
#endif
        unsigned long value;

        if (fmt[1] == 'R')
                ptr = __builtin_extract_return_addr(ptr);
        value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
        switch (*fmt) {
        case 'B':
                sprint_backtrace(sym, value);
                break;
        case 's':
                sprint_symbol_no_offset(sym, value);
                break;
        default:
                sprint_symbol(sym, value);
                break;
        }

        return string_nocheck(buf, end, sym, spec);
#else
        return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/* String spec with neither a field width nor a precision set (both -1). */
static const struct printf_spec default_str_spec = {
        .field_width = -1,
        .precision = -1,
};

/*
 * Base-16 spec with the SPECIAL and SMALL flags; resource_string() uses it
 * to print the raw resource flags word.
 */
static const struct printf_spec default_flag_spec = {
        .base = 16,
        .precision = -1,
        .flags = SPECIAL | SMALL,
};

/* Plain decimal number with no padding and no precision. */
static const struct printf_spec default_dec_spec = {
        .base = 10,
        .precision = -1,
};

/* Decimal, zero-padded to two digits (month, day, hour, minute, second). */
static const struct printf_spec default_dec02_spec = {
        .base = 10,
        .field_width = 2,
        .precision = -1,
        .flags = ZEROPAD,
};

/* Decimal, zero-padded to four digits (year in date_str()). */
static const struct printf_spec default_dec04_spec = {
        .base = 10,
        .field_width = 4,
        .precision = -1,
        .flags = ZEROPAD,
};

/*
 * Print a struct resource as "[<type> <start>[-<end>] ...]".
 *
 * A first format character of 'R' selects the decoded form, where known
 * flag bits are spelled out (" 64bit", " pref", " window", " disabled")
 * and an IORESOURCE_UNSET resource is shown by size. Any other first
 * character selects the raw form, which appends " flags 0x...". A
 * resource whose type is not recognised is always printed raw.
 *
 * The text is assembled in the local buffer sym[] and then emitted with
 * the caller's @spec, so field width and precision apply to the whole
 * bracketed string.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
                      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE        6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE        10
#endif
        /* Number formats for the individual resource types. */
        static const struct printf_spec io_spec = {
                .base = 16,
                .field_width = IO_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec mem_spec = {
                .base = 16,
                .field_width = MEM_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec bus_spec = {
                .base = 16,
                .field_width = 2,
                .precision = -1,
                .flags = SMALL | ZEROPAD,
        };
        /* Used for the fixed keywords; the precision caps each at 10 chars. */
        static const struct printf_spec str_spec = {
                .field_width = -1,
                .precision = 10,
                .flags = LEFT,
        };

        /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
         * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE                ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE                (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE        sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE                sizeof("[mem - flags 0x]")
        /* Sized for the longer of the decoded and the raw representation. */
        char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
                     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

        char *p = sym, *pend = sym + sizeof(sym);
        int decode = (fmt[0] == 'R') ? 1 : 0;
        const struct printf_spec *specp;

        if (check_pointer(&buf, end, res, spec))
                return buf;

        *p++ = '[';
        /* Type keyword; also picks the number format for start/end. */
        if (res->flags & IORESOURCE_IO) {
                p = string_nocheck(p, pend, "io  ", str_spec);
                specp = &io_spec;
        } else if (res->flags & IORESOURCE_MEM) {
                p = string_nocheck(p, pend, "mem ", str_spec);
                specp = &mem_spec;
        } else if (res->flags & IORESOURCE_IRQ) {
                p = string_nocheck(p, pend, "irq ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_DMA) {
                p = string_nocheck(p, pend, "dma ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_BUS) {
                p = string_nocheck(p, pend, "bus ", str_spec);
                specp = &bus_spec;
        } else {
                /* Unknown type: fall back to the raw representation. */
                p = string_nocheck(p, pend, "??? ", str_spec);
                specp = &mem_spec;
                decode = 0;
        }
        if (decode && res->flags & IORESOURCE_UNSET) {
                p = string_nocheck(p, pend, "size ", str_spec);
                p = number(p, pend, resource_size(res), *specp);
        } else {
                p = number(p, pend, res->start, *specp);
                /* A single-address resource is printed without "-end". */
                if (res->start != res->end) {
                        *p++ = '-';
                        p = number(p, pend, res->end, *specp);
                }
        }
        if (decode) {
                if (res->flags & IORESOURCE_MEM_64)
                        p = string_nocheck(p, pend, " 64bit", str_spec);
                if (res->flags & IORESOURCE_PREFETCH)
                        p = string_nocheck(p, pend, " pref", str_spec);
                if (res->flags & IORESOURCE_WINDOW)
                        p = string_nocheck(p, pend, " window", str_spec);
                if (res->flags & IORESOURCE_DISABLED)
                        p = string_nocheck(p, pend, " disabled", str_spec);
        } else {
                p = string_nocheck(p, pend, " flags ", str_spec);
                p = number(p, pend, res->flags, default_flag_spec);
        }
        *p++ = ']';
        *p = '\0';

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Dump a small byte buffer as two-digit lower-case hex values.
 *
 * The field width is the number of bytes to print (at most 64); when it
 * is absent a single byte is printed. fmt[1] selects the separator
 * between bytes: 'C' is ':', 'D' is '-', 'N' is none, anything else is
 * a space.
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                 const char *fmt)
{
        char separator;
        int len, i;

        /* nothing to print */
        if (spec.field_width == 0)
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'C')
                separator = ':';
        else if (fmt[1] == 'D')
                separator = '-';
        else if (fmt[1] == 'N')
                separator = 0;
        else
                separator = ' ';

        /*
         * If we pass '%ph[CDN]', field width remains a negative value;
         * fall back to the default of one byte.
         */
        len = spec.field_width > 0 ? min_t(int, spec.field_width, 64) : 1;

        for (i = 0; i < len; i++) {
                const u8 byte = addr[i];

                if (buf < end)
                        *buf = hex_asc_hi(byte);
                buf++;
                if (buf < end)
                        *buf = hex_asc_lo(byte);
                buf++;

                /* No separator requested, or this was the last byte. */
                if (!separator || i == len - 1)
                        continue;

                if (buf < end)
                        *buf = separator;
                buf++;
        }

        return buf;
}

/*
 * Print a bitmap as comma-separated 32-bit hex chunks, most significant
 * chunk first. The field width carries the number of valid bits; the
 * leading chunk is shortened when that number is not a multiple of 32.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, unsigned long *bitmap,
                    struct printf_spec spec, const char *fmt)
{
        const int CHUNKSZ = 32;
        int nr_bits = max_t(int, spec.field_width, 0);
        int top, pos, width;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        /* reused to print numbers */
        spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

        /* Bits in the leading (possibly partial) chunk. */
        width = nr_bits & (CHUNKSZ - 1);
        if (!width)
                width = CHUNKSZ;

        /* Bit offset of the leading chunk; negative when nr_bits is 0. */
        top = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ;
        for (pos = top; pos >= 0; pos -= CHUNKSZ) {
                const int word = pos / BITS_PER_LONG;
                const int bit = pos % BITS_PER_LONG;
                const u32 chunkmask = ((1ULL << width) - 1);
                const u32 val = (bitmap[word] >> bit) & chunkmask;

                /* Every chunk except the first is preceded by a comma. */
                if (pos != top) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }

                spec.field_width = DIV_ROUND_UP(width, 4);
                buf = number(buf, end, val, spec);

                /* All chunks after the leading one are full-sized. */
                width = CHUNKSZ;
        }
        return buf;
}

/*
 * Print a bitmap as a comma-separated list of set bits, with runs of
 * consecutive bits collapsed into "first-last" ranges (e.g. "0,3-5").
 * The field width carries the number of valid bits.
 */
static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap,
                         struct printf_spec spec, const char *fmt)
{
        int nr_bits = max_t(int, spec.field_width, 0);
        /* current bit is 'cur', range being collected is [rbot, rtop] */
        int cur, rbot, rtop;
        bool first = true;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        cur = find_first_bit(bitmap, nr_bits);
        while (cur < nr_bits) {
                rbot = cur;

                /* Extend the range for as long as the next set bit is adjacent. */
                do {
                        rtop = cur;
                        cur = find_next_bit(bitmap, nr_bits, cur + 1);
                } while (cur < nr_bits && cur <= rtop + 1);

                if (first) {
                        first = false;
                } else {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }

                buf = number(buf, end, rbot, default_dec_spec);
                if (rbot == rtop)
                        continue;

                if (buf < end)
                        *buf = '-';
                buf++;

                buf = number(buf, end, rtop, default_dec_spec);
        }
        return buf;
}

/*
 * Print a 6-byte MAC address as lower-case hex.
 *
 * A first format character of 'M' puts a separator between the bytes;
 * otherwise they are printed back to back. fmt[1] of 'F' switches the
 * separator from ':' to '-', and fmt[1] of 'R' prints the bytes in
 * reverse order.
 */
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
                         struct printf_spec spec, const char *fmt)
{
        char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
        char *p = mac_addr;
        char separator = ':';
        bool reversed = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'F')
                separator = '-';
        else if (fmt[1] == 'R')
                reversed = true;

        for (i = 0; i < 6; i++) {
                const u8 octet = reversed ? addr[5 - i] : addr[i];

                p = hex_byte_pack(p, octet);
                if (fmt[0] == 'M' && i != 5)
                        *p++ = separator;
        }
        *p = '\0';

        return string_nocheck(buf, end, mac_addr, spec);
}

/*
 * Write a dotted-quad IPv4 address at @p and NUL-terminate it.
 *
 * A first format character of 'i' pads each quad to three digits with
 * leading zeros. fmt[2] selects the byte order of @addr: 'l' walks the
 * four bytes from last to first, 'h' does so only when __BIG_ENDIAN is
 * not defined, and everything else ('n', 'b', none) walks them first to
 * last.
 *
 * Returns a pointer to the terminating NUL. No bounds are checked here.
 */
static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
        const bool leading_zeros = (fmt[0] == 'i');
        bool reversed;
        int index, step;
        int i;

        switch (fmt[2]) {
        case 'l':
                reversed = true;
                break;
        case 'h':
#ifdef __BIG_ENDIAN
                reversed = false;
#else
                reversed = true;
#endif
                break;
        default:
                reversed = false;
                break;
        }
        index = reversed ? 3 : 0;
        step = reversed ? -1 : 1;

        for (i = 0; i < 4; i++, index += step) {
                char temp[4] __aligned(2);        /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                int j;

                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
                        if (digits < 2)
                                *p++ = '0';
                }
                /* reverse the digits in the quad */
                for (j = digits - 1; j >= 0; j--)
                        *p++ = temp[j];
                if (i < 3)
                        *p++ = '.';
        }
        *p = '\0';

        return p;
}

/*
 * Write an IPv6 address at @p in compressed text form: leading zeros of
 * each 16-bit group are dropped and the longest run of two or more zero
 * groups is replaced by "::". For v4-mapped and ISATAP addresses the last
 * four bytes are printed as a dotted quad.
 *
 * @addr points at the 16 address bytes; they are copied into a local
 * struct in6_addr before use. The output is NUL-terminated and a pointer
 * to the terminating NUL is returned. No bounds are checked here.
 */
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
        int i, j, range;
        /* zerolength[i]: number of consecutive zero groups starting at i */
        unsigned char zerolength[8];
        int longest = 1;
        int colonpos = -1;
        u16 word;
        u8 hi, lo;
        bool needcolon = false;
        bool useIPv4;
        struct in6_addr in6;

        memcpy(&in6, addr, sizeof(struct in6_addr));

        useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);

        memset(zerolength, 0, sizeof(zerolength));

        /* With an embedded IPv4 tail only the first six groups are hex. */
        if (useIPv4)
                range = 6;
        else
                range = 8;

        /* find position of longest 0 run */
        for (i = 0; i < range; i++) {
                for (j = i; j < range; j++) {
                        if (in6.s6_addr16[j] != 0)
                                break;
                        zerolength[i]++;
                }
        }
        /* Strict '>' keeps the first of several equally long runs. */
        for (i = 0; i < range; i++) {
                if (zerolength[i] > longest) {
                        longest = zerolength[i];
                        colonpos = i;
                }
        }
        if (longest == 1)                /* don't compress a single 0 */
                colonpos = -1;

        /* emit address */
        for (i = 0; i < range; i++) {
                if (i == colonpos) {
                        /* "::" needs both colons unless one is already pending. */
                        if (needcolon || i == 0)
                                *p++ = ':';
                        *p++ = ':';
                        needcolon = false;
                        /* Skip the rest of the compressed run. */
                        i += longest - 1;
                        continue;
                }
                if (needcolon) {
                        *p++ = ':';
                        needcolon = false;
                }
                /* hex u16 without leading 0s */
                word = ntohs(in6.s6_addr16[i]);
                hi = word >> 8;
                lo = word & 0xff;
                if (hi) {
                        if (hi > 0x0f)
                                p = hex_byte_pack(p, hi);
                        else
                                *p++ = hex_asc_lo(hi);
                        p = hex_byte_pack(p, lo);
                }
                else if (lo > 0x0f)
                        p = hex_byte_pack(p, lo);
                else
                        *p++ = hex_asc_lo(lo);
                needcolon = true;
        }

        /* Trailing dotted quad for v4-mapped / ISATAP addresses. */
        if (useIPv4) {
                if (needcolon)
                        *p++ = ':';
                p = ip4_string(p, &in6.s6_addr[12], "I4");
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16 bytes at @addr as eight four-digit hex groups and
 * NUL-terminate the result. The groups are separated by ':' when the
 * first format character is 'I' and printed back to back otherwise.
 *
 * Returns a pointer to the terminating NUL. No bounds are checked here.
 */
static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
        int group;

        for (group = 0; group < 8; group++) {
                p = hex_byte_pack(p, addr[2 * group]);
                p = hex_byte_pack(p, addr[2 * group + 1]);
                if (fmt[0] == 'I' && group != 7)
                        *p++ = ':';
        }
        *p = '\0';

        return p;
}

/*
 * Format a raw 16-byte IPv6 address. "%pI6c" (fmt[0] 'I', fmt[2] 'c')
 * gives the compressed notation; every other variant gives the full
 * eight-group form. The caller's @spec is applied to the finished string.
 */
static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];
        const int compressed = (fmt[0] == 'I' && fmt[2] == 'c');

        if (!compressed)
                ip6_string(ip6_addr, addr, fmt);
        else
                ip6_compressed_string(ip6_addr, addr);

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format a raw 4-byte IPv4 address: ip4_string() renders it into a local
 * buffer sized for "255.255.255.255", and the caller's @spec is then
 * applied to the finished string.
 *
 * @addr is not validated here; ip_addr_string() calls check_pointer()
 * before dispatching to this function.
 */
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip4_addr[sizeof("255.255.255.255")];

        ip4_string(ip4_addr, addr, fmt);

        return string_nocheck(buf, end, ip4_addr, spec);
}

/*
 * Format a struct sockaddr_in6 (the "%pIS" / "%piS" family).
 *
 * Option letters following "IS":
 *   p - append ":<port>"
 *   f - append "/<flowinfo>"
 *   s - append "%<scope id>"
 *   c - use the compressed address notation (only with 'I')
 * When any of p/f/s is given the address itself is wrapped in "[...]".
 *
 * The text is assembled in a fixed-size on-stack buffer that is large
 * enough for plain decimal numbers. The numbers are printed with the
 * caller's @spec, however, so an oversized field width or precision
 * makes number() return a position beyond the buffer. Every direct store
 * through @p after such a call is therefore guarded, and the terminator
 * is clamped to the last byte of the buffer, so the text is truncated
 * instead of being written past the end of ip6_addr[].
 */
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false, have_s = false, have_f = false, have_c = false;
        char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
                      sizeof(":12345") + sizeof("/123456789") +
                      sizeof("%1234567890")];
        char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr);
        const u8 *addr = (const u8 *) &sa->sin6_addr;
        /* Only fmt6[0] is inspected by ip6_string(). */
        char fmt6[2] = { fmt[0], '6' };
        u8 off = 0;

        /* Step over "IS"/"iS" and collect the option letters. */
        fmt++;
        while (isalpha(*++fmt)) {
                switch (*fmt) {
                case 'p':
                        have_p = true;
                        break;
                case 'f':
                        have_f = true;
                        break;
                case 's':
                        have_s = true;
                        break;
                case 'c':
                        have_c = true;
                        break;
                }
        }

        if (have_p || have_s || have_f) {
                *p = '[';
                off = 1;
        }

        if (fmt6[0] == 'I' && have_c)
                p = ip6_compressed_string(ip6_addr + off, addr);
        else
                p = ip6_string(ip6_addr + off, addr, fmt6);

        if (have_p || have_s || have_f)
                *p++ = ']';

        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin6_port), spec);
        }
        /*
         * From here on p may already be at or beyond pend; only append
         * further parts while there is room for the separator plus the
         * terminating NUL.
         */
        if (have_f && p < pend - 1) {
                *p++ = '/';
                p = number(p, pend, ntohl(sa->sin6_flowinfo &
                                          IPV6_FLOWINFO_MASK), spec);
        }
        if (have_s && p < pend - 1) {
                *p++ = '%';
                p = number(p, pend, sa->sin6_scope_id, spec);
        }
        /* Keep the terminator inside the buffer even after truncation. */
        if (p >= pend)
                p = pend - 1;
        *p = '\0';

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format a struct sockaddr_in (the "%pIS" / "%piS" family).
 *
 * Option letters following "IS":
 *   p          - append ":<port>"
 *   h, l, n, b - byte order of the address, passed on to ip4_string()
 *
 * The text is assembled in a fixed-size on-stack buffer that is large
 * enough for a plain decimal port. The port is printed with the caller's
 * @spec, however, so an oversized field width or precision makes
 * number() return a position beyond the buffer. The terminator is
 * therefore clamped to the last byte of the buffer, so the text is
 * truncated instead of being written past the end of ip4_addr[].
 */
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false;
        char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")];
        char *pend = ip4_addr + sizeof(ip4_addr);
        const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
        char fmt4[3] = { fmt[0], '4', 0 };

        /* Step over "IS"/"iS" and collect the option letters. */
        fmt++;
        while (isalpha(*++fmt)) {
                switch (*fmt) {
                case 'p':
                        have_p = true;
                        break;
                case 'h':
                case 'l':
                case 'n':
                case 'b':
                        fmt4[2] = *fmt;
                        break;
                }
        }

        p = ip4_string(ip4_addr, addr, fmt4);
        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin_port), spec);
        }
        /* Keep the terminator inside the buffer even after truncation. */
        if (p >= pend)
                p = pend - 1;
        *p = '\0';

        return string_nocheck(buf, end, ip4_addr, spec);
}

/*
 * Dispatcher for the IP address formats. fmt[1] selects the input type:
 *   '6' - raw IPv6 address
 *   '4' - raw IPv4 address
 *   'S' - struct sockaddr, further dispatched on sa_family
 * An unsupported address family prints "(einval)"; an unknown fmt[1]
 * prints "(%pi?)" or "(%pI?)", depending on the case of fmt[0].
 */
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
                     struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == '6')
                return ip6_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == '4')
                return ip4_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == 'S') {
                const union {
                        struct sockaddr raw;
                        struct sockaddr_in v4;
                        struct sockaddr_in6 v6;
                } *sa = ptr;

                switch (sa->raw.sa_family) {
                case AF_INET:
                        return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
                case AF_INET6:
                        return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);
                default:
                        return error_string(buf, end, "(einval)", spec);
                }
        }

        return error_string(buf, end,
                            fmt[0] == 'i' ? "(%pi?)" : "(%pI?)", spec);
}

/*
 * Print a byte buffer with escaping applied by string_escape_mem().
 *
 * The field width is the number of input bytes (one byte when absent,
 * nothing at all when it is zero). The letters following the conversion
 * character select the escape classes; with none given, ESCAPE_ANY_NP
 * is used.
 */
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                     const char *fmt)
{
        unsigned int flags = 0;
        bool more = true;
        int pos;
        int len;

        if (spec.field_width == 0)
                return buf;                                /* nothing to print */

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        /* Accumulate flags until the first character that is not one. */
        for (pos = 1; more; pos++) {
                switch (fmt[pos]) {
                case 'a':
                        flags |= ESCAPE_ANY;
                        break;
                case 'c':
                        flags |= ESCAPE_SPECIAL;
                        break;
                case 'h':
                        flags |= ESCAPE_HEX;
                        break;
                case 'n':
                        flags |= ESCAPE_NULL;
                        break;
                case 'o':
                        flags |= ESCAPE_OCTAL;
                        break;
                case 'p':
                        flags |= ESCAPE_NP;
                        break;
                case 's':
                        flags |= ESCAPE_SPACE;
                        break;
                default:
                        more = false;
                        break;
                }
        }

        if (flags == 0)
                flags = ESCAPE_ANY_NP;

        if (spec.field_width < 0)
                len = 1;
        else
                len = spec.field_width;

        /*
         * string_escape_mem() writes as many characters as it can to
         * the given buffer, and returns the total size of the output
         * had the buffer been big enough.
         */
        buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);

        return buf;
}

/*
 * Expand a nested format: runs vsnprintf() on the format string and
 * argument list carried by @va_fmt, writing into the remaining output
 * space. The argument list is used through a copy, so the caller's
 * va_list is left untouched.
 */
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
                       struct printf_spec spec, const char *fmt)
{
        va_list args;
        size_t room;

        if (check_pointer(&buf, end, va_fmt, spec))
                return buf;

        /* Nothing may be written once buf has run past end. */
        room = end > buf ? end - buf : 0;

        va_copy(args, *va_fmt->va);
        buf += vsnprintf(buf, room, va_fmt->fmt, args);
        va_end(args);

        return buf;
}

/*
 * Print a 16-byte UUID/GUID in the usual 8-4-4-4-12 hex layout.
 *
 * fmt[1] selects byte order and case:
 *   'b' or none - uuid_index order, lower case
 *   'B'         - uuid_index order, upper case
 *   'l'         - guid_index order, lower case
 *   'L'         - guid_index order, upper case
 */
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
                  struct printf_spec spec, const char *fmt)
{
        char uuid[UUID_STRING_LEN + 1];
        char *p = uuid;
        const u8 *index;
        bool uc;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        uc = (fmt[1] == 'L' || fmt[1] == 'B');
        if (fmt[1] == 'L' || fmt[1] == 'l')
                index = guid_index;
        else
                index = uuid_index;

        for (i = 0; i < 16; i++) {
                const u8 byte = addr[index[i]];

                if (uc)
                        p = hex_byte_pack_upper(p, byte);
                else
                        p = hex_byte_pack(p, byte);

                /* Dashes follow bytes 3, 5, 7 and 9. */
                if (i == 3 || i == 5 || i == 7 || i == 9)
                        *p++ = '-';
        }

        *p = 0;

        return string_nocheck(buf, end, uuid, spec);
}

static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
                  struct printf_spec spec,  const char *fmt)
{
        unsigned long long num;
        int size;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        switch (fmt[1]) {
        case 'F':
                num = *(const netdev_features_t *)addr;
                size = sizeof(netdev_features_t);
                break;
        default:
                return error_string(buf, end, "(%pN?)", spec);
        }

        return special_hex_number(buf, end, num, size);
}

/*
 * Handle "%pa[pd]".  @addr is a pointer to the value, not the value itself.
 * "%pad" reads a dma_addr_t; "%pap" and every other suffix read a
 * phys_addr_t.  The value is printed through special_hex_number() with the
 * size of the selected type.
 */
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
                  struct printf_spec spec, const char *fmt)
{
        unsigned long long value;
        int width;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'd') {
                value = *(const dma_addr_t *)addr;
                width = sizeof(dma_addr_t);
        } else {
                /* 'p' and any unrecognised suffix */
                value = *(const phys_addr_t *)addr;
                width = sizeof(phys_addr_t);
        }

        return special_hex_number(buf, end, value, width);
}

/*
 * Emit the date part of @tm as "YYYY-MM-DD".
 *
 * When @r (raw) is false, 1900 is added to tm_year and 1 to tm_mon before
 * printing; when it is true both fields are printed as stored.
 */
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        int year = tm->tm_year;
        int mon = tm->tm_mon;

        if (!r) {
                year += 1900;
                mon += 1;
        }

        buf = number(buf, end, year, default_dec04_spec);

        /* Separators are counted even when they no longer fit. */
        if (buf < end)
                *buf = '-';
        buf++;

        buf = number(buf, end, mon, default_dec02_spec);

        if (buf < end)
                *buf = '-';
        buf++;

        buf = number(buf, end, tm->tm_mday, default_dec02_spec);

        return buf;
}

/*
 * Emit the time part of @tm as "HH:MM:SS".  @r is accepted for symmetry
 * with date_str() but is not consulted here.
 */
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        const int fields[] = { tm->tm_hour, tm->tm_min, tm->tm_sec };
        int i;

        for (i = 0; i < 3; i++) {
                /* A ':' goes between fields, never before the first one. */
                if (i) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                buf = number(buf, end, fields[i], default_dec02_spec);
        }

        return buf;
}

/*
 * Print @tm as date, time, or both.
 *
 * @fmt points at the 't' of "%pt"; fmt[1] is the type letter already
 * dispatched on by the caller.  fmt[2] may be 'd' (date only) or 't' (time
 * only); the character after that optional selector may be 'r' to request
 * raw output, which is forwarded to date_str()/time_str().
 * When both parts are printed they are joined with 'T'.
 */
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
              struct printf_spec spec, const char *fmt)
{
        const char *mod = fmt + 2;
        bool want_date = *mod != 't';
        bool want_time = *mod != 'd';
        bool raw;

        /* Step over the selector, if there was one, to reach the 'r' slot. */
        if (!want_date || !want_time)
                mod++;

        raw = *mod == 'r';

        if (want_date) {
                buf = date_str(buf, end, tm, raw);

                if (want_time) {
                        /* Respect ISO 8601 */
                        if (buf < end)
                                *buf = 'T';
                        buf++;
                }
        }

        if (want_time)
                buf = time_str(buf, end, tm, raw);

        return buf;
}

static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
                 struct printf_spec spec, const char *fmt)
{
        struct rtc_time rtc_time;
        struct tm tm;

        time64_to_tm(time, 0, &tm);

        rtc_time.tm_sec = tm.tm_sec;
        rtc_time.tm_min = tm.tm_min;
        rtc_time.tm_hour = tm.tm_hour;
        rtc_time.tm_mday = tm.tm_mday;
        rtc_time.tm_mon = tm.tm_mon;
        rtc_time.tm_year = tm.tm_year;
        rtc_time.tm_wday = tm.tm_wday;
        rtc_time.tm_yday = tm.tm_yday;

        rtc_time.tm_isdst = 0;

        return rtc_str(buf, end, &rtc_time, spec, fmt);
}

/*
 * Entry point for "%pt<type>": 'R' treats @ptr as a struct rtc_time,
 * 'T' as a pointer to time64_t.  Any other type letter produces the
 * "(%pt?)" error string.
 */
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
                    const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == 'R')
                return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);

        if (fmt[1] == 'T')
                return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);

        return error_string(buf, end, "(%pt?)", spec);
}

/*
 * Handle "%pC" / "%pCn".
 *
 * Without CONFIG_HAVE_CLK the "(%pC?)" error string is printed.  Otherwise
 * every sub-specifier is treated alike: with CONFIG_COMMON_CLK the name from
 * __clk_get_name() is printed, else the pointer goes through ptr_to_id().
 */
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
            const char *fmt)
{
        if (!IS_ENABLED(CONFIG_HAVE_CLK))
                return error_string(buf, end, "(%pC?)", spec);

        if (check_pointer(&buf, end, clk, spec))
                return buf;

        /* 'n' and every other fmt[1] value take the same path. */
#ifdef CONFIG_COMMON_CLK
        return string(buf, end, __clk_get_name(clk), spec);
#else
        return ptr_to_id(buf, end, clk, spec);
#endif
}

/*
 * Print @flags symbolically using the @names table, which ends at the first
 * entry whose ->name is NULL.
 *
 * An entry matches only when all bits of its mask are set in what is left of
 * @flags; its name is printed and those bits are cleared.  Names are joined
 * with '|'.  Bits that no entry accounts for are printed at the end as a
 * number using default_flag_spec.
 */
static
char *format_flags(char *buf, char *end, unsigned long flags,
                                        const struct trace_print_flags *names)
{
        const struct trace_print_flags *entry = names;
        unsigned long remaining = flags;

        while (remaining && entry->name) {
                unsigned long bits = entry->mask;

                if ((remaining & bits) == bits) {
                        buf = string(buf, end, entry->name, default_str_spec);

                        remaining &= ~bits;
                        /* Separator only when something else will follow. */
                        if (remaining) {
                                if (buf < end)
                                        *buf = '|';
                                buf++;
                        }
                }

                entry++;
        }

        if (remaining)
                buf = number(buf, end, remaining, default_flag_spec);

        return buf;
}

/*
 * Handle "%pG<x>": load a flags word through @flags_ptr and print it
 * symbolically with format_flags().
 *
 *   'p' - unsigned long, limited to the low NR_PAGEFLAGS bits, pageflag_names
 *   'v' - unsigned long, vmaflag_names
 *   'g' - gfp_t, gfpflag_names
 *
 * Any other selector produces the "(%pG?)" error string.
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
                   struct printf_spec spec, const char *fmt)
{
        const struct trace_print_flags *names;
        unsigned long flags;
        const char kind = fmt[1];

        if (check_pointer(&buf, end, flags_ptr, spec))
                return buf;

        if (kind == 'p') {
                /* Remove zone id */
                flags = *(unsigned long *)flags_ptr & ((1UL << NR_PAGEFLAGS) - 1);
                names = pageflag_names;
        } else if (kind == 'v') {
                flags = *(unsigned long *)flags_ptr;
                names = vmaflag_names;
        } else if (kind == 'g') {
                flags = (__force unsigned long)(*(gfp_t *)flags_ptr);
                names = gfpflag_names;
        } else {
                return error_string(buf, end, "(%pG?)", spec);
        }

        return format_flags(buf, end, flags, names);
}

/*
 * Print the full name of @fwnode by walking from the outermost ancestor down
 * to the node itself, emitting each node's name prefix followed by its name.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
                              char *end)
{
        int level = fwnode_count_parents(fwnode);

        /* level counts ancestors still above us; 0 is @fwnode itself. */
        while (level >= 0) {
                struct fwnode_handle *node;

                /*
                 * A reference is taken (and dropped below) only for parent
                 * nodes; the refcount of @fwnode itself may be 0 here, so it
                 * is used directly.
                 */
                if (level)
                        node = fwnode_get_nth_parent(fwnode, level);
                else
                        node = fwnode;

                buf = string(buf, end, fwnode_get_name_prefix(node),
                             default_str_spec);
                buf = string(buf, end, fwnode_get_name(node),
                             default_str_spec);

                if (level)
                        fwnode_handle_put(node);

                level--;
        }

        return buf;
}

/*
 * Handle "%pOF[fnpPFcC...]" for a struct device_node.
 *
 * @fmt arrives pointing at the character after the 'O' and must be 'F'.
 * Every following character from the set "fnpPFcC" selects one field to
 * print; fields are separated by ':'.  With no modifier, or when the first
 * character after "OF" is not in the set, the full name is printed.
 *
 * The individual pieces are printed without field width (str_spec); the
 * caller's width is applied once to the whole output by widen_string().
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
                         struct printf_spec spec, const char *fmt)
{
        char tbuf[sizeof("xxxx") + 1];
        const char *p;
        int ret;
        char *buf_start = buf;
        struct property *prop;
        bool has_mult, pass;

        /* Copy of the caller's spec with padding disabled for sub-strings. */
        struct printf_spec str_spec = spec;
        str_spec.field_width = -1;

        if (fmt[0] != 'F')
                return error_string(buf, end, "(%pO?)", spec);

        if (!IS_ENABLED(CONFIG_OF))
                return error_string(buf, end, "(%pOF?)", spec);

        if (check_pointer(&buf, end, dn, spec))
                return buf;

        /* Simple case: no (recognised) modifier follows, print the full name. */
        fmt++;
        if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
                fmt = "f";

        /* One iteration per modifier; 'pass' is true from the second one on. */
        for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
                int precision;
                if (pass) {
                        /* ':' between consecutive fields */
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                switch (*fmt) {
                case 'f':        /* full_name */
                        buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
                                                      end);
                        break;
                case 'n':        /* name */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /*
                         * Temporarily set the precision so that only the part
                         * before '@' is printed, then restore it.
                         */
                        precision = str_spec.precision;
                        str_spec.precision = strchrnul(p, '@') - p;
                        buf = string(buf, end, p, str_spec);
                        str_spec.precision = precision;
                        break;
                case 'p':        /* phandle */
                        buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
                        break;
                case 'P':        /* path-spec */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* A name of at most one character is printed as "/". */
                        if (!p[1])
                                p = "/";
                        buf = string(buf, end, p, str_spec);
                        break;
                case 'F':        /* flags */
                        /* One letter per flag, '-' when the flag is clear. */
                        tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
                        tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
                        tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
                        tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
                        tbuf[4] = 0;
                        buf = string_nocheck(buf, end, tbuf, str_spec);
                        break;
                case 'c':        /* major compatible string */
                        /* Nothing is printed if the property cannot be read. */
                        ret = of_property_read_string(dn, "compatible", &p);
                        if (!ret)
                                buf = string(buf, end, p, str_spec);
                        break;
                case 'C':        /* full compatible string */
                        /* Each entry is double-quoted; entries are comma-separated. */
                        has_mult = false;
                        of_property_for_each_string(dn, "compatible", prop, p) {
                                if (has_mult)
                                        buf = string_nocheck(buf, end, ",", str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);
                                buf = string(buf, end, p, str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);

                                has_mult = true;
                        }
                        break;
                default:
                        break;
                }
        }

        /* Apply the caller's field width to everything emitted above. */
        return widen_string(buf, buf - buf_start, end, spec);
}

static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
                    struct printf_spec spec, const char *fmt)
{
        struct printf_spec str_spec = spec;
        char *buf_start = buf;

        str_spec.field_width = -1;

        if (*fmt != 'w')
                return error_string(buf, end, "(%pf?)", spec);

        if (check_pointer(&buf, end, fwnode, spec))
                return buf;

        fmt++;

        switch (*fmt) {
        case 'P':        /* name */
                buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
                break;
        case 'f':        /* full_name */
        default:
                buf = fwnode_full_name_string(fwnode, buf, end);
                break;
        }

        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Show a '%p' thing.  A kernel extension is that the '%p' is followed
 * by an extra set of alphanumeric characters that are extended format
 * specifiers.
 *
 * Please update scripts/checkpatch.pl when adding/removing conversion
 * characters.  (Search for "check for vsprintf extension").
 *
 * Right now we handle:
 *
 * - 'S' For symbolic direct pointers (or function descriptors) with offset
 * - 's' For symbolic direct pointers (or function descriptors) without offset
 * - '[Ss]R' as above with __builtin_extract_return_addr() translation
 * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
 *            %ps and %pS. Be careful when re-using these specifiers.
 * - 'B' For backtraced symbolic direct pointers with offset
 * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
 * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
 * - 'b[l]' For a bitmap, the number of bits is determined by the field
 *       width which must be explicitly specified either as part of the
 *       format string '%32b[l]' or through '%*b[l]', [l] selects
 *       range-list format instead of hex format
 * - 'M' For a 6-byte MAC address, it prints the address in the
 *       usual colon-separated hex notation
 * - 'm' For a 6-byte MAC address, it prints the hex address without colons
 * - 'MF' For a 6-byte MAC FDDI address, it prints the address
 *       with a dash-separated hex notation
 * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
 * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
 *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
 *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - 'i' [46] for 'raw' IPv4/IPv6 addresses
 *       IPv6 omits the colons (01020304...0f)
 *       IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
 * - 'I[6S]c' for IPv6 addresses printed as specified by
 *       https://tools.ietf.org/html/rfc5952
 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
 *                of the following flags (see string_escape_mem() for the
 *                details):
 *                  a - ESCAPE_ANY
 *                  c - ESCAPE_SPECIAL
 *                  h - ESCAPE_HEX
 *                  n - ESCAPE_NULL
 *                  o - ESCAPE_OCTAL
 *                  p - ESCAPE_NP
 *                  s - ESCAPE_SPACE
 *                By default ESCAPE_ANY_NP is used.
 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
 *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 *       Options for %pU are:
 *         b big endian lower case hex (default)
 *         B big endian UPPER case hex
 *         l little endian lower case hex
 *         L little endian UPPER case hex
 *           big endian output byte order is:
 *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
 *           little endian output byte order is:
 *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
 * - 'V' For a struct va_format which contains a format string * and va_list *,
 *       call vsnprintf(->format, *->va_list).
 *       Implements a "recursive vsnprintf".
 *       Do not use this feature without some mechanism to verify the
 *       correctness of the format string and va_list arguments.
 * - 'K' For a kernel pointer that should be hidden from unprivileged users
 * - 'NF' For a netdev_features_t
 * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
 *            a certain separator (' ' by default):
 *              C colon
 *              D dash
 *              N no separator
 *            The maximum supported length is 64 bytes of the input. Consider
 *            to use print_hex_dump() for the larger input.
 * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
 *           (default assumed to be phys_addr_t, passed by reference)
 * - 'd[234]' For a dentry name (optionally 2-4 last components)
 * - 'D[234]' Same as 'd' but for a struct file
 * - 'g' For block_device name (gendisk + partition number)
 * - 't[RT][dt][r]' For time and date as represented by:
 *      R    struct rtc_time
 *      T    time64_t
 * - 'C' For a clock, it prints the name (Common Clock Framework) or address
 *       (legacy clock framework) of the clock
 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
 *        (legacy clock framework) of the clock
 * - 'G' For flags to be printed as a collection of symbolic strings that would
 *       construct the specific value. Supported flags given by option:
 *       p page flags (see struct page) given as pointer to unsigned long
 *       g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
 *       v vma flags (VM_*) given as pointer to unsigned long
 * - 'OF[fnpPcCF]'  For a device tree object
 *                  Without any optional arguments prints the full_name
 *                  f device node full_name
 *                  n device node name
 *                  p device node phandle
 *                  P device node path spec (name + @unit)
 *                  F device node flags
 *                  c major compatible string
 *                  C full compatible string
 * - 'fw[fP]'        For a firmware node (struct fwnode_handle) pointer
 *                Without an option prints the full name of the node
 *                f full name
 *                P node name, including a possible unit address
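 * - 'x' For printing the address. Equivalent to "%lx".
 * - 'e' For an ERR_PTR() value, formatted by err_ptr(); a pointer that is
 *       not an error value is printed like a plain %p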
 * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
 *           bpf_trace_printk() where [ku] prefix specifies either kernel (k)
 *           or user (u) memory to probe, and:
 *              s a string, equivalent to "%s" on direct vsnprintf() use
 *
 * ** When making changes please also update:
 *        Documentation/core-api/printk-formats.rst
 *
 * Note: The default behaviour (unadorned %p) is to hash the address,
 * rendering it useful as a unique identifier.
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
{
        /*
         * Dispatch on the first extension character.  Every case returns the
         * result of its helper directly; only characters with no case (and
         * 'e' with a non-error pointer) reach the hashed default at the end.
         */
        switch (*fmt) {
        case 'S':
        case 's':
                ptr = dereference_symbol_descriptor(ptr);
                /* fall through */
        case 'B':
                return symbol_string(buf, end, ptr, spec, fmt);
        case 'R':
        case 'r':
                return resource_string(buf, end, ptr, spec, fmt);
        case 'h':
                return hex_string(buf, end, ptr, spec, fmt);
        case 'b':
                /* 'bl' selects the range-list form, plain 'b' the hex form */
                switch (fmt[1]) {
                case 'l':
                        return bitmap_list_string(buf, end, ptr, spec, fmt);
                default:
                        return bitmap_string(buf, end, ptr, spec, fmt);
                }
        case 'M':                        /* Colon separated: 00:01:02:03:04:05 */
        case 'm':                        /* Contiguous: 000102030405 */
                                        /* [mM]F (FDDI) */
                                        /* [mM]R (Reverse order; Bluetooth) */
                return mac_address_string(buf, end, ptr, spec, fmt);
        case 'I':                        /* Formatted IP supported
                                         * 4:        1.2.3.4
                                         * 6:        0001:0203:...:0708
                                         * 6c:        1::708 or 1::1.2.3.4
                                         */
        case 'i':                        /* Contiguous:
                                         * 4:        001.002.003.004
                                         * 6:   000102...0f
                                         */
                return ip_addr_string(buf, end, ptr, spec, fmt);
        case 'E':
                return escaped_string(buf, end, ptr, spec, fmt);
        case 'U':
                return uuid_string(buf, end, ptr, spec, fmt);
        case 'V':
                return va_format(buf, end, ptr, spec, fmt);
        case 'K':
                return restricted_pointer(buf, end, ptr, spec);
        case 'N':
                return netdev_bits(buf, end, ptr, spec, fmt);
        case 'a':
                return address_val(buf, end, ptr, spec, fmt);
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 't':
                return time_and_date(buf, end, ptr, spec, fmt);
        case 'C':
                return clock(buf, end, ptr, spec, fmt);
        case 'D':
                return file_dentry_name(buf, end, ptr, spec, fmt);
        /* Without CONFIG_BLOCK, 'g' has no case and gets the hashed default. */
#ifdef CONFIG_BLOCK
        case 'g':
                return bdev_name(buf, end, ptr, spec, fmt);
#endif

        case 'G':
                return flags_string(buf, end, ptr, spec, fmt);
        /* The next two helpers are handed fmt + 1, i.e. they start after 'O' / 'f'. */
        case 'O':
                return device_node_string(buf, end, ptr, spec, fmt + 1);
        case 'f':
                return fwnode_string(buf, end, ptr, spec, fmt + 1);
        case 'x':
                return pointer_string(buf, end, ptr, spec);
        case 'e':
                /* %pe with a non-ERR_PTR gets treated as plain %p */
                if (!IS_ERR(ptr))
                        break;
                return err_ptr(buf, end, ptr, spec);
        case 'u':
        case 'k':
                /* Only "%p[ku]s" is accepted; it is printed like a plain string. */
                switch (fmt[1]) {
                case 's':
                        return string(buf, end, ptr, spec);
                default:
                        return error_string(buf, end, "(einval)", spec);
                }
        }

        /* default is to _not_ leak addresses, hash before printing */
        return ptr_to_id(buf, end, ptr, spec);
}

/*
 * Helper function to decode printf style format.
 * Each call decodes one token from the format and returns the
 * number of characters read (i.e. the delta by which the caller
 * should advance the format pointer before the next call).
 * The decoded token is returned through the parameters
 *
 * 'h', 'l', or 'L' for integer fields
 * 'z' support added 23/7/1999 S.H.
 * 'z' changed to 'Z' --davidm 1/25/99
 * 'Z' changed to 'z' --adobriyan 2017-01-25
 * 't' added for ptrdiff_t
 *
 * @fmt: the format string
 * @type of the token returned
 * @flags: various flags such as +, -, # tokens..
 * @field_width: overwritten width
 * @base: base of the number (octal, hex, ...)
 * @precision: precision of a number
 * @qualifier: qualifier of a number (long, size_t, ...)
 */
static noinline_for_stack
int format_decode(const char *fmt, struct printf_spec *spec)
{
        const char *start = fmt;
        char qualifier;

        /*
         * Decoding is resumable: when a '*' width or precision is met, the
         * function returns early with spec->type set to FORMAT_TYPE_WIDTH or
         * FORMAT_TYPE_PRECISION.  The caller stores the argument value in
         * *spec and calls again; the two checks below pick up where the
         * previous call stopped.
         */

        /* we finished early by reading the field width */
        if (spec->type == FORMAT_TYPE_WIDTH) {
                /* A negative '*' width means left-justify with the absolute value. */
                if (spec->field_width < 0) {
                        spec->field_width = -spec->field_width;
                        spec->flags |= LEFT;
                }
                spec->type = FORMAT_TYPE_NONE;
                goto precision;
        }

        /* we finished early by reading the precision */
        if (spec->type == FORMAT_TYPE_PRECISION) {
                /* A negative '*' precision is treated as 0. */
                if (spec->precision < 0)
                        spec->precision = 0;

                spec->type = FORMAT_TYPE_NONE;
                goto qualifier;
        }

        /* By default */
        spec->type = FORMAT_TYPE_NONE;

        /* Skip over literal text up to the next '%' or the end of the string. */
        for (; *fmt ; ++fmt) {
                if (*fmt == '%')
                        break;
        }

        /* Return the current non-format string */
        if (fmt != start || !*fmt)
                return fmt - start;

        /* Process flags */
        spec->flags = 0;

        while (1) { /* this also skips first '%' */
                bool found = true;

                ++fmt;

                switch (*fmt) {
                case '-': spec->flags |= LEFT;    break;
                case '+': spec->flags |= PLUS;    break;
                case ' ': spec->flags |= SPACE;   break;
                case '#': spec->flags |= SPECIAL; break;
                case '0': spec->flags |= ZEROPAD; break;
                default:  found = false;
                }

                if (!found)
                        break;
        }

        /* get field width */
        spec->field_width = -1;

        if (isdigit(*fmt))
                spec->field_width = skip_atoi(&fmt);
        else if (*fmt == '*') {
                /* it's the next argument */
                spec->type = FORMAT_TYPE_WIDTH;
                return ++fmt - start;
        }

precision:
        /* get the precision */
        spec->precision = -1;
        if (*fmt == '.') {
                ++fmt;
                if (isdigit(*fmt)) {
                        spec->precision = skip_atoi(&fmt);
                        if (spec->precision < 0)
                                spec->precision = 0;
                } else if (*fmt == '*') {
                        /* it's the next argument */
                        spec->type = FORMAT_TYPE_PRECISION;
                        return ++fmt - start;
                }
        }

qualifier:
        /* get the conversion qualifier */
        qualifier = 0;
        if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
            *fmt == 'z' || *fmt == 't') {
                qualifier = *fmt++;
                /* A doubled qualifier: "ll" is recorded as 'L', "hh" as 'H'. */
                if (unlikely(qualifier == *fmt)) {
                        if (qualifier == 'l') {
                                qualifier = 'L';
                                ++fmt;
                        } else if (qualifier == 'h') {
                                qualifier = 'H';
                                ++fmt;
                        }
                }
        }

        /* default base */
        spec->base = 10;
        switch (*fmt) {
        /* Non-integer conversions: set the type and return right away. */
        case 'c':
                spec->type = FORMAT_TYPE_CHAR;
                return ++fmt - start;

        case 's':
                spec->type = FORMAT_TYPE_STR;
                return ++fmt - start;

        case 'p':
                spec->type = FORMAT_TYPE_PTR;
                return ++fmt - start;

        case '%':
                spec->type = FORMAT_TYPE_PERCENT_CHAR;
                return ++fmt - start;

        /* integer number formats - set up the flags and "break" */
        case 'o':
                spec->base = 8;
                break;

        case 'x':
                spec->flags |= SMALL;
                /* fall through */

        case 'X':
                spec->base = 16;
                break;

        case 'd':
        case 'i':
                spec->flags |= SIGN;
                /* fall through */
        case 'u':
                break;

        case 'n':
                /*
                 * Since %n poses a greater security risk than
                 * utility, treat it as any other invalid or
                 * unsupported format specifier.
                 */
                /* fall through */

        default:
                WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt);
                spec->type = FORMAT_TYPE_INVALID;
                /* The offending character is not consumed. */
                return fmt - start;
        }

        /*
         * Pick the integer type from the qualifier.  Where a signed and an
         * unsigned variant exist, the signed one is selected by adding the
         * SIGN flag bit to the unsigned enum value; the BUILD_BUG_ON()s
         * assert that the enum values are laid out to make that valid.
         */
        if (qualifier == 'L')
                spec->type = FORMAT_TYPE_LONG_LONG;
        else if (qualifier == 'l') {
                BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
                spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
        } else if (qualifier == 'z') {
                spec->type = FORMAT_TYPE_SIZE_T;
        } else if (qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
        } else if (qualifier == 'H') {
                BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
                spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
        } else if (qualifier == 'h') {
                BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
                spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
        } else {
                BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT);
                spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN);
        }

        return ++fmt - start;
}

/*
 * Store @width in @spec->field_width.  If the stored value reads back
 * different from @width, warn once and store @width clamped to
 * [-FIELD_WIDTH_MAX, FIELD_WIDTH_MAX] instead.
 */
static void
set_field_width(struct printf_spec *spec, int width)
{
        int truncated;

        spec->field_width = width;
        truncated = spec->field_width != width;

        if (WARN_ONCE(truncated, "field width %d too large", width))
                spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}

/*
 * Store @prec in @spec->precision.  If the stored value reads back
 * different from @prec, warn once and store @prec clamped to
 * [0, PRECISION_MAX] instead.
 */
static void
set_precision(struct printf_spec *spec, int prec)
{
        int truncated;

        spec->precision = prec;
        truncated = spec->precision != prec;

        if (WARN_ONCE(truncated, "precision %d too large", prec))
                spec->precision = clamp(prec, 0, PRECISION_MAX);
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        unsigned long long num;
        char *str, *end;
        struct printf_spec spec = {0};

        /* Reject out-of-range values early.  Large positive sizes are
           used for unknown buffer sizes. */
        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        /*
         * str is the running output position, end is one past the last
         * usable byte.  str is allowed to run past end: every store made
         * directly in this function is guarded by "str < end", so that
         * str - buf ends up as the would-be length even when truncated.
         */
        str = buf;
        end = buf + size;

        /* Make sure end is always >= buf */
        if (end < buf) {
                /* buf + size wrapped: use the top address and shrink size */
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                /* Decode one piece of the format; read is its length in fmt */
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /*
                         * Bytes taken verbatim from the format string: copy
                         * what fits, but account for all of them in str.
                         */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                case FORMAT_TYPE_WIDTH:
                        /* Field width is supplied as the next int argument */
                        set_field_width(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        /* Precision is supplied as the next int argument */
                        set_precision(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* Pad on the left unless left-justification was asked for */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;

                                }
                        }
                        /* char is promoted to int when passed through varargs */
                        c = (unsigned char) va_arg(args, int);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* Whatever width is still left becomes right padding */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR:
                        str = string(str, end, va_arg(args, char *), spec);
                        break;

                case FORMAT_TYPE_PTR:
                        /*
                         * pointer() is handed fmt so it can look at the
                         * characters following the specifier; the
                         * alphanumeric suffix is then skipped here.
                         */
                        str = pointer(fmt, str, end, va_arg(args, void *),
                                      spec);
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        /*
                         * Presumably the arguments passed gcc's type
                         * checking, but there is no safe or sane way
                         * for us to continue parsing the format and
                         * fetching from the va_list; the remaining
                         * specifiers and arguments would be out of
                         * sync.
                         */
                        goto out;

                default:
                        /*
                         * All remaining types are integers.  Fetch the
                         * argument with the type selected by spec.type and
                         * widen it into num.  Types narrower than int arrive
                         * as int and are cast back down to their own width.
                         */
                        switch (spec.type) {
                        case FORMAT_TYPE_LONG_LONG:
                                num = va_arg(args, long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                                num = va_arg(args, unsigned long);
                                break;
                        case FORMAT_TYPE_LONG:
                                num = va_arg(args, long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                if (spec.flags & SIGN)
                                        num = va_arg(args, ssize_t);
                                else
                                        num = va_arg(args, size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = va_arg(args, ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                                num = (unsigned char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = (signed char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = (unsigned short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = (short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_INT:
                                num = (int) va_arg(args, int);
                                break;
                        default:
                                num = va_arg(args, unsigned int);
                        }

                        str = number(str, end, num, spec);
                }
        }

out:
        /*
         * Terminate the output: at str when it is still inside the buffer,
         * otherwise in the last byte of the buffer (truncated output).
         * Nothing is written when size is 0.
         */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

        /* the trailing null byte doesn't count towards the total */
        return str-buf;

}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Where vsnprintf() reports the length the complete output would have had,
 * this function reports the number of characters that actually ended up in
 * @buf, not counting the trailing '\0'.  For truncated output that is
 * @size - 1; if @size is == 0 the function returns 0.
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int would_write = vsnprintf(buf, size, fmt, args);

        /* Common case: everything fitted, so both counts agree. */
        if (likely(would_write < size))
                return would_write;

        /* Truncated: all but the terminator slot holds output (if any room). */
        return size != 0 ? size - 1 : 0;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vsnprintf().  The return value is the number of
 * characters which would be generated for the given input, excluding the
 * trailing null, as per ISO C99.  A return value greater than or equal to
 * @size means the resulting string was truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vscnprintf().  The return value is the number of
 * characters written into @buf not including the trailing '\0'.  If @size
 * is == 0 the function returns 0.
 */
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int stored;

        va_start(ap, fmt);
        stored = vscnprintf(buf, size, fmt, ap);
        va_end(ap);

        return stored;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Formats through vsnprintf() with INT_MAX as the buffer size, i.e. without
 * a real bound on @buf.  The function returns the number of characters
 * written into @buf.  Use vsnprintf() or vscnprintf() in order to avoid
 * buffer overflows.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
        int len = vsnprintf(buf, INT_MAX, fmt, args);

        return len;
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Formats through vsnprintf() with INT_MAX as the buffer size, i.e. without
 * a real bound on @buf.  The function returns the number of characters
 * written into @buf.  Use snprintf() or scnprintf() in order to avoid
 * buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, INT_MAX, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except %n is ignored, and its argument
 * is skipped.
 *
 * The return value is the number of words(32bits) which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
{
        struct printf_spec spec = {0};
        char *str, *end;
        int width;

        /*
         * str/end are byte cursors over the word buffer; @size counts u32
         * words, so end is bin_buf + size words.  As in vsnprintf(), str
         * keeps advancing past end so the needed size can be reported.
         */
        str = (char *)bin_buf;
        end = (char *)(bin_buf + size);

/*
 * save_arg(type) - fetch the next variadic argument and append it to the
 * buffer as a value of @type.
 *
 * 8-byte types are fetched as unsigned long long, aligned only to a u32
 * boundary and stored as two separate 32-bit halves.  Every other type is
 * fetched as int, aligned to sizeof(type) and stored as @type.  The store
 * is skipped when the value would not fit before end, but str is advanced
 * by sizeof(type) either way.  The expression yields the fetched value.
 */
#define save_arg(type)                                                        \
({                                                                        \
        unsigned long long value;                                        \
        if (sizeof(type) == 8) {                                        \
                unsigned long long val8;                                \
                str = PTR_ALIGN(str, sizeof(u32));                        \
                val8 = va_arg(args, unsigned long long);                \
                if (str + sizeof(type) <= end) {                        \
                        *(u32 *)str = *(u32 *)&val8;                        \
                        *(u32 *)(str + 4) = *((u32 *)&val8 + 1);        \
                }                                                        \
                value = val8;                                                \
        } else {                                                        \
                unsigned int val4;                                        \
                str = PTR_ALIGN(str, sizeof(type));                        \
                val4 = va_arg(args, int);                                \
                if (str + sizeof(type) <= end)                                \
                        *(typeof(type) *)str = (type)(long)val4;        \
                value = (unsigned long long)val4;                        \
        }                                                                \
        str += sizeof(type);                                                \
        value;                                                                \
})

        while (*fmt) {
                /* Decode one piece of the format; read is its length in fmt */
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE:
                case FORMAT_TYPE_PERCENT_CHAR:
                        /* These consume no argument, so nothing is stored */
                        break;
                case FORMAT_TYPE_INVALID:
                        goto out;

                case FORMAT_TYPE_WIDTH:
                case FORMAT_TYPE_PRECISION:
                        /* The int argument is always recorded in the buffer */
                        width = (int)save_arg(int);
                        /* Pointers may require the width */
                        if (*fmt == 'p')
                                set_field_width(&spec, width);
                        break;

                case FORMAT_TYPE_CHAR:
                        save_arg(char);
                        break;

                case FORMAT_TYPE_STR: {
                        const char *save_str = va_arg(args, char *);
                        const char *err_msg;
                        size_t len;

                        /* A non-NULL message replaces the caller's string */
                        err_msg = check_pointer_msg(save_str);
                        if (err_msg)
                                save_str = err_msg;

                        /* Strings are stored inline, terminator included */
                        len = strlen(save_str) + 1;
                        if (str + len < end)
                                memcpy(str, save_str, len);
                        str += len;
                        break;
                }

                case FORMAT_TYPE_PTR:
                        /* Dereferenced pointers must be done now */
                        switch (*fmt) {
                        /* Dereference of functions is still OK */
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                /* Only the raw pointer value is recorded */
                                save_arg(void *);
                                break;
                        default:
                                /* No alphanumeric suffix: record the raw pointer */
                                if (!isalnum(*fmt)) {
                                        save_arg(void *);
                                        break;
                                }
                                /*
                                 * Any other suffix: render the text into the
                                 * buffer right away and terminate it, so that
                                 * the record holds a string, not a pointer.
                                 */
                                str = pointer(fmt, str, end, va_arg(args, void *),
                                              spec);
                                if (str + 1 < end)
                                        *str++ = '\0';
                                else
                                        end[-1] = '\0'; /* Must be nul terminated */
                        }
                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                default:
                        /* Remaining types are integers, stored by their size */
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                save_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                save_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                save_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                save_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                        case FORMAT_TYPE_BYTE:
                                save_arg(char);
                                break;
                        case FORMAT_TYPE_USHORT:
                        case FORMAT_TYPE_SHORT:
                                save_arg(short);
                                break;
                        default:
                                save_arg(int);
                        }
                }
        }

out:
        /* Distance from bin_buf to str, rounded up to whole u32 words */
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function works like C99 vsnprintf, with the difference that
 * vsnprintf gets its arguments from the stack, whereas bstr_printf gets
 * them from @bin_buf, a binary buffer that was generated by vbin_printf.
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 */
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
{
        struct printf_spec spec = {0};
        char *str, *end;
        /* Byte cursor over the packed argument records */
        const char *args = (const char *)bin_buf;

        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        /*
         * str may run past end; stores made directly in this function are
         * guarded by "str < end" so that str - buf is the would-be length.
         */
        str = buf;
        end = buf + size;

/*
 * get_arg(type) - read the next packed argument as a value of @type.
 *
 * Mirrors the packing done by vbin_printf(): 8-byte types are aligned to
 * a u32 boundary and read as two 32-bit halves, every other type is
 * aligned to sizeof(type) and read directly.  args is advanced past the
 * value and the expression yields it.
 */
#define get_arg(type)                                                   \
({                                                                      \
        typeof(type) value;                                             \
        if (sizeof(type) == 8) {                                        \
                args = PTR_ALIGN(args, sizeof(u32));                    \
                *(u32 *)&value = *(u32 *)args;                          \
                *((u32 *)&value + 1) = *(u32 *)(args + 4);              \
        } else {                                                        \
                args = PTR_ALIGN(args, sizeof(type));                   \
                value = *(typeof(type) *)args;                          \
        }                                                               \
        args += sizeof(type);                                           \
        value;                                                          \
})

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                /* Decode one piece of the format; read is its length in fmt */
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /*
                         * Bytes taken verbatim from the format string: copy
                         * what fits, but account for all of them in str.
                         */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                case FORMAT_TYPE_WIDTH:
                        set_field_width(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        set_precision(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* Pad on the left unless left-justification was asked for */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;
                                }
                        }
                        c = (unsigned char) get_arg(char);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* Whatever width is still left becomes right padding */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR: {
                        /* The string is stored inline, terminator included */
                        const char *str_arg = args;
                        args += strlen(str_arg) + 1;
                        str = string(str, end, (char *)str_arg, spec);
                        break;
                }

                case FORMAT_TYPE_PTR: {
                        bool process = false;
                        int copy, len;
                        /* Non function dereferences were already done */
                        switch (*fmt) {
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                process = true;
                                break;
                        default:
                                if (!isalnum(*fmt)) {
                                        process = true;
                                        break;
                                }
                                /*
                                 * Pointer dereference was already processed:
                                 * the record holds the finished text.  Copy
                                 * what fits, but always step over the whole
                                 * record and count its full length.  If this
                                 * were skipped once the output is full, args
                                 * would go out of sync with the format and
                                 * the returned length would be too small.
                                 */
                                len = copy = strlen(args);
                                if (str < end) {
                                        if (copy > end - str)
                                                copy = end - str;
                                        memcpy(str, args, copy);
                                }
                                str += len;
                                args += len + 1;
                        }
                        /* Raw pointer records are formatted now */
                        if (process)
                                str = pointer(fmt, str, end, get_arg(void *), spec);

                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;
                }

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        goto out;

                default: {
                        unsigned long long num;

                        /* Remaining types are integers, read back by their size */
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                num = get_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                num = get_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                num = get_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = get_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                                num = get_arg(unsigned char);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = get_arg(signed char);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = get_arg(unsigned short);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = get_arg(short);
                                break;
                        case FORMAT_TYPE_UINT:
                                num = get_arg(unsigned int);
                                break;
                        default:
                                num = get_arg(int);
                        }

                        str = number(str, end, num, spec);
                } /* default: */
                } /* switch(spec.type) */
        } /* while(*fmt) */

out:
        /*
         * Terminate the output: at str when it is still inside the buffer,
         * otherwise in the last byte of the buffer (truncated output).
         */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

#undef get_arg

        /* the trailing null byte doesn't count towards the total */
        return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);

/**
 * bprintf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vbin_printf().  The function returns whatever
 * vbin_printf() returns: the number of words(u32) the arguments take up
 * in @bin_buf.
 */
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int words;

        va_start(ap, fmt);
        words = vbin_printf(bin_buf, size, fmt, ap);
        va_end(ap);

        return words;
}
EXPORT_SYMBOL_GPL(bprintf);

#endif /* CONFIG_BINARY_PRINTF */

/**
 * vsscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        format of buffer
 * @args:        arguments
 *
 * Walks @fmt and @buf in parallel.  Supported conversions are %c, %s,
 * %[...] (see the restrictions noted at its case below), %o, %x, %X, %i,
 * %d, %u, %n and %%, with an optional decimal field width and an optional
 * hh, h, l, ll (also accepted as L) or z qualifier.  "%*" skips input up
 * to the next white space without storing anything.
 *
 * Return: the number of conversions whose result was stored through
 * @args.  %n and suppressed (%*) conversions are not counted.  Parsing
 * stops at the first mismatch, at an invalid format, or when either
 * string ends.
 */
int vsscanf(const char *buf, const char *fmt, va_list args)
{
        const char *str = buf;
        char *next;
        char digit;
        int num = 0;
        u8 qualifier;
        unsigned int base;
        union {
                long long s;
                unsigned long long u;
        } val;
        s16 field_width;
        bool is_sign;

        while (*fmt) {
                /* skip any white space in format */
                /* white space in format matches any amount of
                 * white space, including none, in the input.
                 */
                if (isspace(*fmt)) {
                        fmt = skip_spaces(++fmt);
                        str = skip_spaces(str);
                }

                /* anything that is not a conversion must match exactly */
                if (*fmt != '%' && *fmt) {
                        if (*fmt++ != *str++)
                                break;
                        continue;
                }

                if (!*fmt)
                        break;
                /* step over the '%' */
                ++fmt;

                /* skip this conversion.
                 * advance both strings to next white space
                 */
                if (*fmt == '*') {
                        if (!*str)
                                break;
                        while (!isspace(*fmt) && *fmt != '%' && *fmt) {
                                /* '%*[' not yet supported, invalid format */
                                if (*fmt == '[')
                                        return num;
                                fmt++;
                        }
                        while (!isspace(*str) && *str)
                                str++;
                        continue;
                }

                /* get field width */
                /* -1 means "no width given"; an explicit width must be > 0 */
                field_width = -1;
                if (isdigit(*fmt)) {
                        field_width = skip_atoi(&fmt);
                        if (field_width <= 0)
                                break;
                }

                /* get conversion qualifier */
                /* a doubled 'h' or 'l' is folded into 'H' or 'L' respectively */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
                    *fmt == 'z') {
                        qualifier = *fmt++;
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
                                        qualifier = 'H';
                                        fmt++;
                                } else if (qualifier == 'l') {
                                        qualifier = 'L';
                                        fmt++;
                                }
                        }
                }

                if (!*fmt)
                        break;

                if (*fmt == 'n') {
                        /* return number of characters read so far */
                        *va_arg(args, int *) = str - buf;
                        ++fmt;
                        continue;
                }

                /* every remaining conversion needs input to work on */
                if (!*str)
                        break;

                base = 10;
                is_sign = false;

                switch (*fmt++) {
                case 'c':
                {
                        /*
                         * Copies at least one character, then up to
                         * field_width in total or until the input ends.
                         * No terminating '\0' is stored.
                         */
                        char *s = (char *)va_arg(args, char*);
                        if (field_width == -1)
                                field_width = 1;
                        do {
                                *s++ = *str++;
                        } while (--field_width > 0 && *str);
                        num++;
                }
                continue;
                case 's':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* without an explicit width, allow up to SHRT_MAX chars */
                        if (field_width == -1)
                                field_width = SHRT_MAX;
                        /* first, skip leading white space in buffer */
                        str = skip_spaces(str);

                        /* now copy until next white space */
                        while (*str && !isspace(*str) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        num++;
                }
                continue;
                /*
                 * Warning: This implementation of the '[' conversion specifier
                 * deviates from its glibc counterpart in the following ways:
                 * (1) It does NOT support ranges i.e. '-' is NOT a special
                 *     character
                 * (2) It cannot match the closing bracket ']' itself
                 * (3) A field width is required
                 * (4) '%*[' (discard matching input) is currently not supported
                 *
                 * Example usage:
                 * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
                 *                buf1, buf2, buf3);
                 * if (ret < 3)
                 *    // etc..
                 */
                case '[':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* one bit per possible byte value in the set */
                        DECLARE_BITMAP(set, 256) = {0};
                        unsigned int len = 0;
                        bool negate = (*fmt == '^');

                        /* field width is required */
                        if (field_width == -1)
                                return num;

                        if (negate)
                                ++fmt;

                        /* collect every character up to the closing ']' */
                        for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
                                set_bit((u8)*fmt, set);

                        /* no ']' or no character set found */
                        if (!*fmt || !len)
                                return num;
                        ++fmt;

                        if (negate) {
                                bitmap_complement(set, set, 256);
                                /* exclude null '\0' byte */
                                clear_bit(0, set);
                        }

                        /* match must be non-empty */
                        if (!test_bit((u8)*str, set))
                                return num;

                        while (test_bit((u8)*str, set) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        ++num;
                }
                continue;
                case 'o':
                        base = 8;
                        break;
                case 'x':
                case 'X':
                        base = 16;
                        break;
                case 'i':
                        /* base 0 is handed to the conversion helper as is */
                        base = 0;
                        /* fall through */
                case 'd':
                        is_sign = true;
                        /* fall through */
                case 'u':
                        break;
                case '%':
                        /* looking for '%' in str */
                        if (*str++ != '%')
                                return num;
                        continue;
                default:
                        /* invalid format; stop here */
                        return num;
                }

                /* have some sort of integer conversion.
                 * first, skip white space in buffer.
                 */
                str = skip_spaces(str);

                /*
                 * Peek at the first digit (after an optional '-' for signed
                 * conversions) and stop unless it is valid for the base.
                 */
                digit = *str;
                if (is_sign && digit == '-')
                        digit = *(str + 1);

                if (!digit
                    || (base == 16 && !isxdigit(digit))
                    || (base == 10 && !isdigit(digit))
                    || (base == 8 && (!isdigit(digit) || digit > '7'))
                    || (base == 0 && !isdigit(digit)))
                        break;

                /* without an explicit width the length limit is INT_MAX */
                if (is_sign)
                        val.s = simple_strntoll(str,
                                                field_width >= 0 ? field_width : INT_MAX,
                                                &next, base);
                else
                        val.u = simple_strntoull(str,
                                                 field_width >= 0 ? field_width : INT_MAX,
                                                 &next, base);

                /* store through a pointer whose type matches the qualifier */
                switch (qualifier) {
                case 'H':        /* that's 'hh' in format */
                        if (is_sign)
                                *va_arg(args, signed char *) = val.s;
                        else
                                *va_arg(args, unsigned char *) = val.u;
                        break;
                case 'h':
                        if (is_sign)
                                *va_arg(args, short *) = val.s;
                        else
                                *va_arg(args, unsigned short *) = val.u;
                        break;
                case 'l':
                        if (is_sign)
                                *va_arg(args, long *) = val.s;
                        else
                                *va_arg(args, unsigned long *) = val.u;
                        break;
                case 'L':
                        if (is_sign)
                                *va_arg(args, long long *) = val.s;
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
                default:
                        if (is_sign)
                                *va_arg(args, int *) = val.s;
                        else
                                *va_arg(args, unsigned int *) = val.u;
                        break;
                }
                num++;

                /* continue scanning where the number conversion stopped */
                if (!next)
                        break;
                str = next;
        }

        return num;
}
EXPORT_SYMBOL(vsscanf);

/**
 * sscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        formatting of buffer
 * @...:        resulting arguments
 *
 * Variadic front end for vsscanf(); the return value is passed through
 * unchanged.
 */
int sscanf(const char *buf, const char *fmt, ...)
{
        va_list ap;
        int converted;

        va_start(ap, fmt);
        converted = vsscanf(buf, fmt, ap);
        va_end(ap);

        return converted;
}
EXPORT_SYMBOL(sscanf);











































































































































































































































































    1 








































































































































































































































































































































































































































































































































































    1 








    1 

    1 















































































































































    1 


























    2 




    1 












































































































































































































    1 

    1 













































































































































































































    1 
























    2 
    2 











    1 













































































































    1 



































































































    2 
























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BLKDEV_H
#define _LINUX_BLKDEV_H

#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/major.h>
#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
#include <linux/backing-dev-defs.h>
#include <linux/wait.h>
#include <linux/mempool.h>
#include <linux/pfn.h>
#include <linux/bio.h>
#include <linux/stringify.h>
#include <linux/gfp.h>
#include <linux/bsg.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
#include <linux/pm.h>

struct module;
struct scsi_ioctl_command;

struct request_queue;
struct elevator_queue;
struct blk_trace;
struct request;
struct sg_io_hdr;
struct bsg_job;
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
struct blk_keyslot_manager;

#define BLKDEV_MIN_RQ        4
#define BLKDEV_MAX_RQ        128        /* Default maximum */

/* Must be consistent with blk_mq_poll_stats_bkt() */
#define BLK_MQ_POLL_STATS_BKTS 16

/*
 * Doing classic polling.  The value is parenthesized so the negative
 * literal cannot combine with neighbouring tokens where the macro is
 * expanded.
 */
#define BLK_MQ_POLL_CLASSIC (-1)

/*
 * Maximum number of blkcg policies allowed to be registered concurrently.
 * Defined here to simplify include dependency.
 */
#define BLKCG_MAX_POLS                5

/*
 * blk_validate_block_size - check that a block size is acceptable
 * @bsize: candidate block size
 *
 * A size is accepted when it is at least 512, at most PAGE_SIZE, and
 * is_power_of_2() holds for it.
 *
 * Return: 0 if @bsize is acceptable, -EINVAL otherwise.
 */
static inline int blk_validate_block_size(unsigned long bsize)
{
        if (bsize >= 512 && bsize <= PAGE_SIZE && is_power_of_2(bsize))
                return 0;

        return -EINVAL;
}

typedef void (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags (RQF_*)
 */
typedef __u32 __bitwise req_flags_t;

/* elevator knows about this request */
#define RQF_SORTED                ((__force req_flags_t)(1 << 0))
/* drive may already have started this one */
#define RQF_STARTED                ((__force req_flags_t)(1 << 1))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER                ((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ                ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
/* track inflight for MQ */
#define RQF_MQ_INFLIGHT                ((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP                ((__force req_flags_t)(1 << 7))
/* vaguely specified driver internal error.  Ignored by the block layer */
#define RQF_FAILED                ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET                ((__force req_flags_t)(1 << 11))
/* elevator private data attached */
#define RQF_ELVPRIV                ((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT                ((__force req_flags_t)(1 << 13))
/* request came from our alloc pool */
#define RQF_ALLOCED                ((__force req_flags_t)(1 << 14))
/* runtime pm request */
#define RQF_PM                        ((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED                ((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS                ((__force req_flags_t)(1 << 17))
/*
 * Look at ->special_vec for the actual data payload instead of the
 * bio chain.
 */
#define RQF_SPECIAL_PAYLOAD        ((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED        ((__force req_flags_t)(1 << 19))
/* already slept for hybrid poll */
#define RQF_MQ_POLL_SLEPT        ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT                ((__force req_flags_t)(1 << 21))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
        (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

/*
 * Request state for blk-mq.
 *
 * MQ_RQ_IDLE has the value 0, so a zero-filled state field reads as idle.
 * NOTE(review): presumably a request moves IDLE -> IN_FLIGHT -> COMPLETE;
 * confirm against the blk-mq implementation.
 */
enum mq_rq_state {
        MQ_RQ_IDLE                = 0,
        MQ_RQ_IN_FLIGHT                = 1,
        MQ_RQ_COMPLETE                = 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
        /* queue and blk-mq contexts associated with this request */
        struct request_queue *q;
        struct blk_mq_ctx *mq_ctx;
        struct blk_mq_hw_ctx *mq_hctx;

        unsigned int cmd_flags;                /* op and common flags */
        /* RQF_* bits (req_flags_t) */
        req_flags_t rq_flags;

        int tag;
        int internal_tag;

        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
        sector_t __sector;                /* sector cursor */

        /* NOTE(review): presumably head and tail of the bio chain - confirm */
        struct bio *bio;
        struct bio *biotail;

        struct list_head queuelist;

        /*
         * The hash is used inside the scheduler, and killed once the
         * request reaches the dispatch list. The ipi_list is only used
         * to queue the request for softirq completion, which is long
         * after the request has been unhashed (and even removed from
         * the dispatch list).
         */
        union {
                struct hlist_node hash;        /* merge hash */
                struct list_head ipi_list;
        };

        /*
         * The rb_node is only used inside the io scheduler, requests
         * are pruned when moved to the dispatch queue. So let the
         * completion_data share space with the rb_node.
         */
        union {
                struct rb_node rb_node;        /* sort/lookup */
                struct bio_vec special_vec;
                void *completion_data;
                int error_count; /* for legacy drivers, don't use */
        };

        /*
         * Three pointers are available for the IO schedulers, if they need
         * more they have to dynamically allocate it.  Flush requests are
         * never put on the IO scheduler. So let the flush fields share
         * space with the elevator data.
         */
        union {
                struct {
                        struct io_cq                *icq;
                        void                        *priv[2];
                } elv;

                struct {
                        unsigned int                seq;
                        struct list_head        list;
                        rq_end_io_fn                *saved_end_io;
                } flush;
        };

        struct gendisk *rq_disk;
        struct hd_struct *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        /* Time that the first bio started allocating this request. */
        u64 alloc_time_ns;
#endif
        /* Time that this request was allocated for this IO. */
        u64 start_time_ns;
        /* Time that I/O was submitted to the device. */
        u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
        unsigned short wbt_flags;
#endif
        /*
         * rq sectors used for blk stats. It has the same value as
         * blk_rq_sectors(rq), except that it is never zeroed
         * by completion.
         */
        unsigned short stats_sectors;

        /*
         * Number of scatter-gather DMA addr+len pairs after
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;

#if defined(CONFIG_BLK_DEV_INTEGRITY)
        unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct bio_crypt_ctx *crypt_ctx;
        struct blk_ksm_keyslot *crypt_keyslot;
#endif

        unsigned short write_hint;
        unsigned short ioprio;

        /* one of enum mq_rq_state */
        enum mq_rq_state state;
        refcount_t ref;

        unsigned int timeout;
        unsigned long deadline;

        /*
         * csd and fifo_time occupy the same storage.
         * NOTE(review): presumably they are never needed at the same
         * time - confirm against the users of each member.
         */
        union {
                struct __call_single_data csd;
                u64 fifo_time;
        };

        /*
         * completion callback.
         */
        rq_end_io_fn *end_io;
        void *end_io_data;
};

/* True when @op is REQ_OP_SCSI_IN or REQ_OP_SCSI_OUT. */
static inline bool blk_op_is_scsi(unsigned int op)
{
        switch (op) {
        case REQ_OP_SCSI_IN:
        case REQ_OP_SCSI_OUT:
                return true;
        default:
                return false;
        }
}

/* True when @op is REQ_OP_DRV_IN or REQ_OP_DRV_OUT. */
static inline bool blk_op_is_private(unsigned int op)
{
        switch (op) {
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
                return true;
        default:
                return false;
        }
}

/* Apply blk_op_is_scsi() to the operation extracted from @rq by req_op(). */
static inline bool blk_rq_is_scsi(struct request *rq)
{
        const unsigned int opcode = req_op(rq);

        return blk_op_is_scsi(opcode);
}

/* Apply blk_op_is_private() to the operation extracted from @rq by req_op(). */
static inline bool blk_rq_is_private(struct request *rq)
{
        const unsigned int opcode = req_op(rq);

        return blk_op_is_private(opcode);
}

/*
 * A request counts as passthrough when it is either a SCSI request
 * (blk_rq_is_scsi()) or a driver-private one (blk_rq_is_private()).
 */
static inline bool blk_rq_is_passthrough(struct request *rq)
{
        if (blk_rq_is_scsi(rq))
                return true;

        return blk_rq_is_private(rq);
}

/*
 * Same test as blk_rq_is_passthrough(), but on the operation that
 * bio_op() extracts from @bio: true for SCSI or driver-private ops.
 */
static inline bool bio_is_passthrough(struct bio *bio)
{
        const unsigned int opcode = bio_op(bio);

        if (blk_op_is_scsi(opcode))
                return true;

        return blk_op_is_private(opcode);
}

/* Accessor: returns the ioprio field stored in @req. */
static inline unsigned short req_get_ioprio(struct request *req)
{
        return req->ioprio;
}

#include <linux/elevator.h>

struct blk_queue_ctx;

struct bio_vec;

/*
 * NOTE(review): judging by the name, these are the values a request
 * timeout handler reports back - confirm against the callers.
 */
enum blk_eh_timer_return {
        BLK_EH_DONE,                /* driver has completed the command */
        BLK_EH_RESET_TIMER,        /* reset timer and try again */
};

/*
 * Two-valued queue state: Queue_down is 0 and Queue_up is 1.
 * NOTE(review): presumably marks a queue as inactive/active - confirm
 * against the users of this enum.
 */
enum blk_queue_state {
        Queue_down,
        Queue_up,
};

#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

#define BLK_SCSI_MAX_CMDS        (256)
#define BLK_SCSI_CMD_PER_LONG        (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))

/*
 * Zoned block device models (zoned limit).
 *
 * Note: This needs to be ordered from the least to the most severe
 * restrictions for the inheritance in blk_stack_limits() to work.
 *
 * BLK_ZONED_NONE is 0, so a zero-filled struct queue_limits describes a
 * regular block device.
 */
enum blk_zoned_model {
        BLK_ZONED_NONE = 0,        /* Regular block device */
        BLK_ZONED_HA,                /* Host-aware zoned block device */
        BLK_ZONED_HM,                /* Host-managed zoned block device */
};

/*
 * Limits attached to a request queue (embedded as request_queue.limits).
 *
 * NOTE(review): members named *_sectors presumably count sectors and the
 * *_size / io_* members bytes - confirm the units against the setters
 * before relying on them.
 */
struct queue_limits {
        unsigned long                bounce_pfn;
        unsigned long                seg_boundary_mask;
        unsigned long                virt_boundary_mask;

        unsigned int                max_hw_sectors;
        unsigned int                max_dev_sectors;
        unsigned int                chunk_sectors;
        unsigned int                max_sectors;
        unsigned int                max_segment_size;
        unsigned int                physical_block_size;
        unsigned int                logical_block_size;
        unsigned int                alignment_offset;
        unsigned int                io_min;
        unsigned int                io_opt;
        unsigned int                max_discard_sectors;
        unsigned int                max_hw_discard_sectors;
        unsigned int                max_write_same_sectors;
        unsigned int                max_write_zeroes_sectors;
        unsigned int                max_zone_append_sectors;
        unsigned int                discard_granularity;
        unsigned int                discard_alignment;
        unsigned int                zone_write_granularity;

        unsigned short                max_segments;
        unsigned short                max_integrity_segments;
        unsigned short                max_discard_segments;

        unsigned char                misaligned;
        unsigned char                discard_misaligned;
        unsigned char                raid_partial_stripes_expensive;
        /* zoned model of the device, see enum blk_zoned_model */
        enum blk_zoned_model        zoned;
};

typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
                               void *data);

void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model);

#ifdef CONFIG_BLK_DEV_ZONED

#define BLK_ALL_ZONES  ((unsigned int)-1)
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
unsigned int blkdev_nr_zones(struct gendisk *disk);
extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
                            sector_t sectors, sector_t nr_sectors,
                            gfp_t gfp_mask);
int blk_revalidate_disk_zones(struct gendisk *disk,
                              void (*update_driver_data)(struct gendisk *disk));

extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
                                     unsigned int cmd, unsigned long arg);
extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
                                  unsigned int cmd, unsigned long arg);

#else /* CONFIG_BLK_DEV_ZONED */

/* Stub used when CONFIG_BLK_DEV_ZONED is not set: always reports 0 zones. */
static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
{
        return 0;
}

/*
 * Stub used when CONFIG_BLK_DEV_ZONED is not set: ignores its arguments
 * and always returns -ENOTTY.
 */
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
                                            fmode_t mode, unsigned int cmd,
                                            unsigned long arg)
{
        return -ENOTTY;
}

/*
 * Stub used when CONFIG_BLK_DEV_ZONED is not set: ignores its arguments
 * and always returns -ENOTTY.
 */
static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
                                         fmode_t mode, unsigned int cmd,
                                         unsigned long arg)
{
        return -ENOTTY;
}

#endif /* CONFIG_BLK_DEV_ZONED */

/*
 * struct request_queue - state kept for one block request queue.
 *
 * Among the members visible below: the elevator, the blk-mq software and
 * hardware contexts, queue flags and limits, zoned-device bookkeeping,
 * flush/requeue machinery, and the sysfs/debugfs hooks.
 */
struct request_queue {
        struct request                *last_merge;
        struct elevator_queue        *elevator;

        struct percpu_ref        q_usage_counter;

        struct blk_queue_stats        *stats;
        struct rq_qos                *rq_qos;

        const struct blk_mq_ops        *mq_ops;

        /* sw queues */
        struct blk_mq_ctx __percpu        *queue_ctx;

        unsigned int                queue_depth;

        /* hw dispatch queues */
        struct blk_mq_hw_ctx        **queue_hw_ctx;
        unsigned int                nr_hw_queues;

        struct backing_dev_info        *backing_dev_info;

        /*
         * The queue owner gets to use this for whatever they like.
         * ll_rw_blk doesn't touch it.
         */
        void                        *queuedata;

        /*
         * various queue flags, see QUEUE_FLAG_* below
         */
        unsigned long                queue_flags;
        /*
         * Number of contexts that have called blk_set_pm_only(). If this
         * counter is above zero then only RQF_PM requests are processed.
         */
        atomic_t                pm_only;

        /*
         * ida allocated id for this queue.  Used to index queues from
         * ioctx.
         */
        int                        id;

        /*
         * queue needs bounce pages for pages above this limit
         */
        gfp_t                        bounce_gfp;

        spinlock_t                queue_lock;

        /*
         * queue kobject
         */
        struct kobject kobj;

        /*
         * mq queue kobject
         */
        struct kobject *mq_kobj;

#ifdef  CONFIG_BLK_DEV_INTEGRITY
        struct blk_integrity integrity;
#endif        /* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
        struct device                *dev;
        enum rpm_status                rpm_status;
        unsigned int                nr_pending;
#endif

        /*
         * queue settings
         */
        unsigned long                nr_requests;        /* Max # of requests */

        unsigned int                dma_pad_mask;
        unsigned int                dma_alignment;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        /* Inline crypto capabilities */
        struct blk_keyslot_manager *ksm;
#endif

        unsigned int                rq_timeout;
        /*
         * NOTE(review): presumably holds BLK_MQ_POLL_CLASSIC (-1) when
         * classic polling is selected - confirm against the sysfs code.
         */
        int                        poll_nsec;

        struct blk_stat_callback        *poll_cb;
        /* one bucket per BLK_MQ_POLL_STATS_BKTS entry */
        struct blk_rq_stat        poll_stat[BLK_MQ_POLL_STATS_BKTS];

        struct timer_list        timeout;
        struct work_struct        timeout_work;

        atomic_t                nr_active_requests_shared_sbitmap;

        struct list_head        icq_list;
#ifdef CONFIG_BLK_CGROUP
        /* bitmap with one bit per possible blkcg policy (BLKCG_MAX_POLS) */
        DECLARE_BITMAP                (blkcg_pols, BLKCG_MAX_POLS);
        struct blkcg_gq                *root_blkg;
        struct list_head        blkg_list;
#endif

        struct queue_limits        limits;

        unsigned int                required_elevator_features;

#ifdef CONFIG_BLK_DEV_ZONED
        /*
         * Zoned block device information for request dispatch control.
         * nr_zones is the total number of zones of the device. This is always
         * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
         * bits which indicates if a zone is conventional (bit set) or
         * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
         * bits which indicates if a zone is write locked, that is, if a write
         * request targeting the zone was dispatched. All three fields are
         * initialized by the low level device driver (e.g. scsi/sd.c).
         * Stacking drivers (device mappers) may or may not initialize
         * these fields.
         *
         * Reads of this information must be protected with blk_queue_enter() /
         * blk_queue_exit(). Modifying this information is only allowed while
         * no requests are being processed. See also blk_mq_freeze_queue() and
         * blk_mq_unfreeze_queue().
         */
        unsigned int                nr_zones;
        unsigned long                *conv_zones_bitmap;
        unsigned long                *seq_zones_wlock;
        unsigned int                max_open_zones;
        unsigned int                max_active_zones;
#endif /* CONFIG_BLK_DEV_ZONED */

        /*
         * sg stuff
         */
        unsigned int                sg_timeout;
        unsigned int                sg_reserved_size;
        int                        node;
        struct mutex                debugfs_mutex;
#ifdef CONFIG_BLK_DEV_IO_TRACE
        struct blk_trace __rcu        *blk_trace;
#endif
        /*
         * for flush operations
         */
        struct blk_flush_queue        *fq;

        struct list_head        requeue_list;
        spinlock_t                requeue_lock;
        struct delayed_work        requeue_work;

        struct mutex                sysfs_lock;
        struct mutex                sysfs_dir_lock;

        /*
         * for reusing dead hctx instance in case of updating
         * nr_hw_queues
         */
        struct list_head        unused_hctx_list;
        spinlock_t                unused_hctx_lock;

        int                        mq_freeze_depth;

#if defined(CONFIG_BLK_DEV_BSG)
        struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
        /* Throttle data */
        struct throtl_data *td;
#endif
        struct rcu_head                rcu_head;
        wait_queue_head_t        mq_freeze_wq;
        /*
         * Protect concurrent access to q_usage_counter by
         * percpu_ref_kill() and percpu_ref_reinit().
         */
        struct mutex                mq_freeze_lock;

        struct blk_mq_tag_set        *tag_set;
        struct list_head        tag_set_list;
        struct bio_set                bio_split;

        struct dentry                *debugfs_dir;

#ifdef CONFIG_BLK_DEBUG_FS
        struct dentry                *sched_debugfs_dir;
        struct dentry                *rqos_debugfs_dir;
#endif

        bool                        mq_sysfs_init_done;

        size_t                        cmd_size;

        /* number of entries in write_hints[] */
#define BLK_MAX_WRITE_HINTS        5
        u64                        write_hints[BLK_MAX_WRITE_HINTS];
};

/*
 * Bit numbers used with the request queue's queue_flags word (see the
 * test_bit() based blk_queue_*() predicates).  Bit 2 is not assigned here,
 * and QUEUE_FLAG_VIRT is an alias for QUEUE_FLAG_NONROT.
 *
 * Keep blk_queue_flag_name[] in sync with the definitions below.
 */
#define QUEUE_FLAG_STOPPED        0        /* queue is stopped */
#define QUEUE_FLAG_DYING        1        /* queue being torn down */
#define QUEUE_FLAG_NOMERGES     3        /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP        4        /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO        5        /* fake timeout */
#define QUEUE_FLAG_NONROT        6        /* non-rotational device (SSD) */
#define QUEUE_FLAG_VIRT                QUEUE_FLAG_NONROT /* paravirt device */
#define QUEUE_FLAG_IO_STAT        7        /* do disk/partitions IO accounting */
#define QUEUE_FLAG_DISCARD        8        /* supports DISCARD */
#define QUEUE_FLAG_NOXMERGES        9        /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM        10        /* Contributes to random pool */
#define QUEUE_FLAG_SECERASE        11        /* supports secure erase */
#define QUEUE_FLAG_SAME_FORCE        12        /* force complete on same CPU */
#define QUEUE_FLAG_DEAD                13        /* queue tear-down finished */
#define QUEUE_FLAG_INIT_DONE        14        /* queue is initialized */
#define QUEUE_FLAG_STABLE_WRITES 15        /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL                16        /* IO polling enabled if set */
#define QUEUE_FLAG_WC                17        /* Write back caching */
#define QUEUE_FLAG_FUA                18        /* device supports FUA writes */
#define QUEUE_FLAG_DAX                19        /* device supports DAX */
#define QUEUE_FLAG_STATS        20        /* track IO start and completion times */
#define QUEUE_FLAG_POLL_STATS        21        /* collecting stats for hybrid polling */
#define QUEUE_FLAG_REGISTERED        22        /* queue has been registered to a disk */
#define QUEUE_FLAG_SCSI_PASSTHROUGH 23        /* queue supports SCSI commands */
#define QUEUE_FLAG_QUIESCED        24        /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA        25        /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26        /* supports Zone Reset All */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27        /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE        28        /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT       29        /* device supports NOWAIT */

/*
 * Bit mask (not a bit number) combining the IO_STAT, SAME_COMP and NOWAIT
 * flag bits.  NOTE(review): presumably the initial queue_flags value for
 * blk-mq queues, going by the name - confirm against the user of this mask.
 */
#define QUEUE_FLAG_MQ_DEFAULT        ((1 << QUEUE_FLAG_IO_STAT) |                \
                                 (1 << QUEUE_FLAG_SAME_COMP) |                \
                                 (1 << QUEUE_FLAG_NOWAIT))

/*
 * Flag helpers taking a flag number and a queue.  Prototypes only; the
 * implementations are not part of this header.
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);

/*
 * Predicates on a queue: unless noted otherwise, each one tests a single
 * QUEUE_FLAG_* bit in (q)->queue_flags with test_bit().
 */
#define blk_queue_stopped(q)        test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q)        test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
#define blk_queue_dead(q)        test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
#define blk_queue_init_done(q)        test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q)        test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q)        \
        test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q)        test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
#define blk_queue_stable_writes(q) \
        test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q)        test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q)        test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_discard(q)        test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
#define blk_queue_zone_resetall(q)        \
        test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
#define blk_queue_secure_erase(q) \
        (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
#define blk_queue_dax(q)        test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
#define blk_queue_scsi_passthrough(q)        \
        test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
#define blk_queue_pci_p2pdma(q)        \
        test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
/* Constant false when CONFIG_BLK_RQ_ALLOC_TIME is not defined. */
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
#define blk_queue_rq_alloc_time(q)        \
        test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
#else
#define blk_queue_rq_alloc_time(q)        false
#endif

/* Non-zero when any of the three REQ_FAILFAST_* bits is set in rq->cmd_flags. */
#define blk_noretry_request(rq) \
        ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
                             REQ_FAILFAST_DRIVER))
#define blk_queue_quiesced(q)        test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
/* Not a flag bit: reads the atomic (q)->pm_only counter. */
#define blk_queue_pm_only(q)        atomic_read(&(q)->pm_only)
#define blk_queue_fua(q)        test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
#define blk_queue_registered(q)        test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_nowait(q)        test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)

/*
 * Prototypes only.  NOTE(review): presumably these adjust the q->pm_only
 * counter that blk_queue_pm_only() reads - confirm in the implementation.
 */
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);

/*
 * True for a request that has RQF_STARTED set in rq_flags and is not a
 * passthrough request.
 */
static inline bool blk_account_rq(struct request *rq)
{
        if (!(rq->rq_flags & RQF_STARTED))
                return false;

        return !blk_rq_is_passthrough(rq);
}

/* Map a list node embedded as request->queuelist back to its struct request. */
#define list_entry_rq(ptr)        list_entry((ptr), struct request, queuelist)

/* WRITE if the request's operation is a write per op_is_write(), else READ. */
#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)

/* Same test as rq_data_dir(), expressed as a DMA direction constant. */
#define rq_dma_dir(rq) \
        (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

/* dma_map_page_attrs() on the page/offset/length described by bio_vec *bv. */
#define dma_map_bvec(dev, bv, dir, attrs) \
        dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
        (dir), (attrs))

/* True when the queue has a non-NULL mq_ops pointer. */
static inline bool queue_is_mq(struct request_queue *q)
{
        return !!q->mq_ops;
}

/*
 * Runtime-PM status of the queue: the stored q->rpm_status when CONFIG_PM
 * is defined, otherwise always RPM_ACTIVE.
 */
static inline enum rpm_status queue_rpm_status(struct request_queue *q)
{
#ifdef CONFIG_PM
        return q->rpm_status;
#else
        return RPM_ACTIVE;
#endif
}

/*
 * Zoned model stored in the queue limits, or BLK_ZONED_NONE when
 * CONFIG_BLK_DEV_ZONED is not enabled.
 */
static inline enum blk_zoned_model
blk_queue_zoned_model(struct request_queue *q)
{
        if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
                return BLK_ZONED_NONE;

        return q->limits.zoned;
}

static inline bool blk_queue_is_zoned(struct request_queue *q)
{
        switch (blk_queue_zoned_model(q)) {
        case BLK_ZONED_HA:
        case BLK_ZONED_HM:
                return true;
        default:
                return false;
        }
}

/* limits.chunk_sectors for a zoned queue, 0 for any other queue. */
static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
{
        if (!blk_queue_is_zoned(q))
                return 0;

        return q->limits.chunk_sectors;
}

#ifdef CONFIG_BLK_DEV_ZONED
/* q->nr_zones for a zoned queue, 0 for any other queue. */
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
{
        if (!blk_queue_is_zoned(q))
                return 0;

        return q->nr_zones;
}

/*
 * Zone index of @sector: the sector shifted right by ilog2() of
 * limits.chunk_sectors.  Returns 0 when the queue is not zoned.
 */
static inline unsigned int blk_queue_zone_no(struct request_queue *q,
                                             sector_t sector)
{
        int zone_shift;

        if (!blk_queue_is_zoned(q))
                return 0;

        zone_shift = ilog2(q->limits.chunk_sectors);
        return sector >> zone_shift;
}

/*
 * False for a non-zoned queue.  For a zoned queue the zone holding @sector
 * is reported as sequential unless its bit is set in q->conv_zones_bitmap;
 * with no bitmap at all the answer is always true.
 */
static inline bool blk_queue_zone_is_seq(struct request_queue *q,
                                         sector_t sector)
{
        if (!blk_queue_is_zoned(q))
                return false;

        return !q->conv_zones_bitmap ||
               !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
}

/* Store @max_open_zones in q->max_open_zones; no validation is done here. */
static inline void blk_queue_max_open_zones(struct request_queue *q,
                unsigned int max_open_zones)
{
        q->max_open_zones = max_open_zones;
}

/* Read back the value stored in q->max_open_zones. */
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
{
        unsigned int nr_open = q->max_open_zones;

        return nr_open;
}

/* Store @max_active_zones in q->max_active_zones; no validation is done here. */
static inline void blk_queue_max_active_zones(struct request_queue *q,
                unsigned int max_active_zones)
{
        q->max_active_zones = max_active_zones;
}

/* Read back the value stored in q->max_active_zones. */
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
{
        unsigned int nr_active = q->max_active_zones;

        return nr_active;
}
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs for builds without CONFIG_BLK_DEV_ZONED: every query returns a
 * constant (0 or false) and ignores its arguments.
 */
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
{
        return 0;
}
static inline bool blk_queue_zone_is_seq(struct request_queue *q,
                                         sector_t sector)
{
        return false;
}
static inline unsigned int blk_queue_zone_no(struct request_queue *q,
                                             sector_t sector)
{
        return 0;
}
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
{
        return 0;
}
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
{
        return 0;
}
#endif /* CONFIG_BLK_DEV_ZONED */

/* Result of op_is_sync() applied to the request's cmd_flags. */
static inline bool rq_is_sync(struct request *rq)
{
        bool sync = op_is_sync(rq->cmd_flags);

        return sync;
}

/*
 * A request may be merged unless it is a passthrough request, its operation
 * is FLUSH, WRITE_ZEROES or ZONE_APPEND, or it carries any of the
 * REQ_NOMERGE_FLAGS / RQF_NOMERGE_FLAGS bits.
 */
static inline bool rq_mergeable(struct request *rq)
{
        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_ZONE_APPEND:
                return false;
        default:
                break;
        }

        return !(rq->cmd_flags & REQ_NOMERGE_FLAGS) &&
               !(rq->rq_flags & RQF_NOMERGE_FLAGS);
}

/* True when both bios report the same bio_page() and the same bio_offset(). */
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
        return bio_page(a) == bio_page(b) &&
               bio_offset(a) == bio_offset(b);
}

/* q->queue_depth when it is non-zero, otherwise q->nr_requests. */
static inline unsigned int blk_queue_depth(struct request_queue *q)
{
        if (!q->queue_depth)
                return q->nr_requests;

        return q->queue_depth;
}

extern unsigned long blk_max_low_pfn, blk_max_pfn;

/*
 * standard bounce addresses:
 *
 * BLK_BOUNCE_HIGH        : bounce all highmem pages
 * BLK_BOUNCE_ANY        : don't bounce anything
 * BLK_BOUNCE_ISA        : bounce pages above ISA DMA boundary
 *
 * On 32-bit builds BLK_BOUNCE_HIGH is computed from blk_max_low_pfn; on all
 * other builds it is the same all-ones value as BLK_BOUNCE_ANY.  Every
 * replacement list is parenthesized so that the macros expand as a single
 * operand inside larger expressions.
 */

#if BITS_PER_LONG == 32
#define BLK_BOUNCE_HIGH                ((u64)blk_max_low_pfn << PAGE_SHIFT)
#else
#define BLK_BOUNCE_HIGH                (-1ULL)
#endif
#define BLK_BOUNCE_ANY                (-1ULL)
#define BLK_BOUNCE_ISA                (DMA_BIT_MASK(24))

/*
 * default timeout for SG_IO if none specified, and the smallest timeout
 * value (BLK_MIN_SG_TIMEOUT); both are expressed as multiples of HZ
 */
#define BLK_DEFAULT_SG_TIMEOUT        (60 * HZ)
#define BLK_MIN_SG_TIMEOUT        (7 * HZ)

/*
 * Mapping description accepted by blk_rq_map_user() and
 * blk_rq_map_user_iov().  The meaning of the individual fields is defined
 * by those implementations, which are not visible in this header.
 */
struct rq_map_data {
        struct page **pages;
        int page_order;
        int nr_entries;
        unsigned long offset;
        int null_mapped;
        int from_user;
};

/*
 * Cursor used by rq_for_each_segment() and rq_for_each_bvec(): the bio
 * currently being walked and the bvec iterator position inside it.
 */
struct req_iterator {
        struct bvec_iter iter;
        struct bio *bio;
};

/* This should not be used directly - use rq_for_each_segment */
/*
 * for_each_bio() walks a bio chain starting at the current value of _bio and
 * follows ->bi_next until NULL.  _bio must be a modifiable lvalue; it is
 * advanced in place.
 */
#define for_each_bio(_bio)                \
        for (; (_bio); (_bio) = (_bio)->bi_next)
/*
 * __rq_for_each_bio() visits every bio linked from (rq)->bio, assigning each
 * one to _bio in turn; the body is skipped entirely when the request has no
 * bio.  Both arguments are parenthesized so that expressions such as
 * "rqs + i" or "*pp" expand correctly.
 */
#define __rq_for_each_bio(_bio, rq)        \
        if ((rq)->bio)                        \
                for ((_bio) = (rq)->bio; (_bio); (_bio) = (_bio)->bi_next)

/*
 * Iterate over every segment of every bio in request _rq.  _iter is a
 * struct req_iterator lvalue; bvl receives the current element from
 * bio_for_each_segment().
 */
#define rq_for_each_segment(bvl, _rq, _iter)                        \
        __rq_for_each_bio(_iter.bio, _rq)                        \
                bio_for_each_segment(bvl, _iter.bio, _iter.iter)

/* Same walk as rq_for_each_segment(), but built on bio_for_each_bvec(). */
#define rq_for_each_bvec(bvl, _rq, _iter)                        \
        __rq_for_each_bio(_iter.bio, _rq)                        \
                bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

/*
 * True when the iterator sits on the last bio of the chain (bi_next is
 * NULL) and bio_iter_last() reports the last element within that bio.
 */
#define rq_iter_last(bvec, _iter)                                \
                (_iter.bio->bi_next == NULL &&                        \
                 bio_iter_last(bvec, _iter.iter))

/*
 * ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE must be defined (to 0 or non-zero) by
 * the architecture.  When it is non-zero rq_flush_dcache_pages() is a real
 * function defined elsewhere; when it is 0 the call compiles to nothing.
 */
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
extern void rq_flush_dcache_pages(struct request *rq);
#else
static inline void rq_flush_dcache_pages(struct request *rq)
{
}
#endif

/*
 * Prototypes only; the implementations are not part of this header.
 * The group covers queue registration, bio submission, request get/put and
 * cloning, and the SCSI/SG ioctl helpers.
 */
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
blk_qc_t submit_bio_noacct(struct bio *bio);
extern void blk_rq_init(struct request_queue *q, struct request *rq);
extern void blk_put_request(struct request *);
extern struct request *blk_get_request(struct request_queue *, unsigned int op,
                                       blk_mq_req_flags_t flags);
extern int blk_lld_busy(struct request_queue *q);
extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                             struct bio_set *bs, gfp_t gfp_mask,
                             int (*bio_ctr)(struct bio *, struct bio *, void *),
                             void *data);
extern void blk_rq_unprep_clone(struct request *rq);
extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
                                     struct request *rq);
extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
extern void blk_queue_split(struct bio **);
extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
                              unsigned int, void __user *);
extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                          unsigned int, void __user *);
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
extern int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp);
extern int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp);

/*
 * Prototypes only; the implementations are not part of this header.
 * blk_rq_map_user() and blk_rq_map_user_iov() accept a struct rq_map_data
 * pointer describing the mapping.
 */
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
extern int blk_rq_map_user(struct request_queue *, struct request *,
                           struct rq_map_data *, void __user *, unsigned long,
                           gfp_t);
extern int blk_rq_unmap_user(struct bio *);
extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
                               struct rq_map_data *, const struct iov_iter *,
                               gfp_t);
extern void blk_execute_rq(struct request_queue *, struct gendisk *,
                          struct request *, int);
extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
                                  struct request *, int, rq_end_io_fn *);

/* Helper to convert REQ_OP_XXX to its string format XXX */
extern const char *blk_op_str(unsigned int op);

/* Conversions between blk_status_t values and errno-style integers. */
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);

/* Prototype only; @cookie is a blk_qc_t such as submit_bio_noacct() returns. */
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);

/* Request queue of the disk that backs @bdev (bdev->bd_disk->queue). */
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
        struct request_queue *q = bdev->bd_disk->queue;        /* this is never NULL */

        return q;
}

/*
 * The basic unit of block I/O is a sector. It is used in a number of contexts
 * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
 * bytes. Variables of type sector_t represent an offset or size that is a
 * multiple of 512 bytes. Hence these two constants.
 *
 * Both definitions are guarded with #ifndef, so a definition that is already
 * in effect when this point is reached is left untouched.
 */
#ifndef SECTOR_SHIFT
#define SECTOR_SHIFT 9
#endif
#ifndef SECTOR_SIZE
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
#endif

/*
 * blk_rq_pos()                        : the current sector
 * blk_rq_bytes()                : bytes left in the entire request
 * blk_rq_cur_bytes()                : bytes left in the current segment
 * blk_rq_err_bytes()                : bytes left till the next error boundary
 * blk_rq_sectors()                : sectors left in the entire request
 * blk_rq_cur_sectors()                : sectors left in the current segment
 * blk_rq_stats_sectors()        : sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
        sector_t pos = rq->__sector;

        return pos;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
        unsigned int nbytes = rq->__data_len;

        return nbytes;
}

/* A request without a bio has no current segment and reports 0. */
static inline int blk_rq_cur_bytes(const struct request *rq)
{
        if (!rq->bio)
                return 0;

        return bio_cur_bytes(rq->bio);
}

/* Out-of-line counterpart of the inline accessors; defined outside this header. */
extern unsigned int blk_rq_err_bytes(const struct request *rq);

/* blk_rq_bytes() converted from bytes to sectors. */
static inline unsigned int blk_rq_sectors(const struct request *rq)
{
        unsigned int nbytes = blk_rq_bytes(rq);

        return nbytes >> SECTOR_SHIFT;
}

/* blk_rq_cur_bytes() converted from bytes to sectors. */
static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
        int cur_bytes = blk_rq_cur_bytes(rq);

        return cur_bytes >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
        unsigned int nr_sectors = rq->stats_sectors;

        return nr_sectors;
}

#ifdef CONFIG_BLK_DEV_ZONED

/* Helper to convert BLK_ZONE_COND_XXX to its string format XXX */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);

static inline unsigned int blk_rq_zone_no(struct request *rq)
{
        return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
        return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
}
#endif /* CONFIG_BLK_DEV_ZONED */

/*
 * Data transfer size of a request.  Commands such as WRITE SAME carry a
 * payload whose size differs from the size of the request; drivers that
 * support them via RQF_SPECIAL_PAYLOAD must use this helper, which then
 * reports the length of rq->special_vec instead of blk_rq_bytes().
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return blk_rq_bytes(rq);

        return rq->special_vec.bv_len;
}

/*
 * First full biovec of the request: rq->special_vec for a request flagged
 * RQF_SPECIAL_PAYLOAD, otherwise the bvec at the current iterator position
 * of rq->bio.  The caller must make sure the request has bvecs at all
 * before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);

        return rq->special_vec;
}

/*
 * Per-operation sector limit of the queue: discard and secure erase use
 * max_discard_sectors (capped at UINT_MAX >> SECTOR_SHIFT), WRITE SAME and
 * WRITE ZEROES use their dedicated limits, everything else max_sectors.
 */
static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     int op)
{
        if (unlikely(op == REQ_OP_WRITE_ZEROES))
                return q->limits.max_write_zeroes_sectors;

        if (unlikely(op == REQ_OP_WRITE_SAME))
                return q->limits.max_write_same_sectors;

        if (likely(op != REQ_OP_DISCARD && op != REQ_OP_SECURE_ERASE))
                return q->limits.max_sectors;

        return min(q->limits.max_discard_sectors,
                   UINT_MAX >> SECTOR_SHIFT);
}

/*
 * Return maximum size of a request at given offset. Only valid for
 * file system requests.
 *
 * A @chunk_sectors of 0 selects the queue's own limits.chunk_sectors; if
 * that is 0 as well, limits.max_sectors is returned unchanged.  Otherwise
 * the result is the distance from @offset to the end of its chunk, capped
 * at limits.max_sectors.
 */
static inline unsigned int blk_max_size_offset(struct request_queue *q,
                                               sector_t offset,
                                               unsigned int chunk_sectors)
{
        unsigned int boundary = chunk_sectors;
        unsigned int used;

        if (!boundary)
                boundary = q->limits.chunk_sectors;
        if (!boundary)
                return q->limits.max_sectors;

        /* Sectors already consumed inside the chunk that contains offset. */
        if (likely(is_power_of_2(boundary)))
                used = offset & (boundary - 1);
        else
                used = sector_div(offset, boundary);

        return min(q->limits.max_sectors, boundary - used);
}

/*
 * Sector limit for @rq at @offset.  Passthrough requests get
 * limits.max_hw_sectors.  Other requests get the per-operation limit,
 * additionally capped by the chunk boundary unless the queue has no
 * chunk_sectors or the operation is discard/secure erase.
 */
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
                                                  sector_t offset)
{
        struct request_queue *q = rq->q;
        unsigned int op_limit;

        if (blk_rq_is_passthrough(rq))
                return q->limits.max_hw_sectors;

        op_limit = blk_queue_get_max_sectors(q, req_op(rq));

        if (!q->limits.chunk_sectors ||
            req_op(rq) == REQ_OP_DISCARD ||
            req_op(rq) == REQ_OP_SECURE_ERASE)
                return op_limit;

        return min(blk_max_size_offset(q, offset, 0), op_limit);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
        unsigned int nr_bios = 0;
        struct bio *bio;

        __rq_for_each_bio(bio, rq)
                nr_bios++;

        return nr_bios;
}

/* Prototype only; takes a bio_list and the request to operate on. */
void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 *
 * Both functions are defined outside this header.
 */
extern bool blk_update_request(struct request *rq, blk_status_t error,
                               unsigned int nr_bytes);

extern void blk_abort_request(struct request *);

/*
 * Access functions for manipulating queue properties
 *
 * Prototypes only; the implementations are not part of this header.
 */
extern void blk_cleanup_queue(struct request_queue *);
extern void blk_queue_bounce_limit(struct request_queue *, u64);
extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
extern void blk_queue_max_segments(struct request_queue *, unsigned short);
extern void blk_queue_max_discard_segments(struct request_queue *,
                unsigned short);
extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
extern void blk_queue_max_write_same_sectors(struct request_queue *q,
                unsigned int max_write_same_sectors);
/*
 * Prototype only.  NOTE(review): presumably sets the limit that
 * blk_queue_get_max_sectors() reads as limits.max_write_zeroes_sectors -
 * confirm in the implementation.
 */
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
                unsigned int max_write_zeroes_sectors);
/*
 * Further queue-property and queue_limits entry points.  Prototypes only;
 * the implementations are not part of this header.
 */
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
                unsigned int max_zone_append_sectors);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
void blk_queue_zone_write_granularity(struct request_queue *q,
                                      unsigned int size);
extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
void blk_queue_update_readahead(struct request_queue *q);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
extern void blk_set_default_limits(struct queue_limits *lim);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                            sector_t offset);
extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
                              sector_t offset);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
extern void blk_queue_dma_alignment(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
extern void blk_queue_required_elevator_features(struct request_queue *q,
                                                 unsigned int features);
extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
                                              struct device *dev);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter.  But for a data-less command like discard there may be no
 * actual data segments submitted, while the driver may still have to add
 * its own special payload.  In that case 1 is returned so that the special
 * payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return rq->nr_phys_segments;

        return 1;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
        return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist, struct scatterlist **last_sg);

/*
 * Wrapper around __blk_rq_map_sg() for callers that have no use for the
 * last_sg output: it passes the address of a local pointer, initialised to
 * NULL, and returns the callee's result unchanged.
 */
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist)
{
        struct scatterlist *tail = NULL;
        int ret;

        ret = __blk_rq_map_sg(q, rq, sglist, &tail);
        return ret;
}
/* Prototypes only; the implementations are not part of this header. */
extern void blk_dump_rq_flags(struct request *, char *);

/* __must_check: callers are required to look at blk_get_queue()'s result. */
bool __must_check blk_get_queue(struct request_queue *);
struct request_queue *blk_alloc_queue(int node_id);
extern void blk_put_queue(struct request_queue *);
extern void blk_set_queue_dying(struct request_queue *);

#ifdef CONFIG_BLOCK
/*
 * blk_plug permits building a queue of related requests by holding the I/O
 * fragments for a short period. This allows merging of sequential requests
 * into a single larger request. As the requests are moved from a per-task
 * list to the device's request_queue in a batch, this results in improved
 * scalability as the lock contention for request_queue lock is reduced.
 *
 * It is ok not to disable preemption when adding the request to the plug list
 * or when attempting a merge, because blk_schedule_flush_plug() will only
 * flush the plug list when the task sleeps by itself. For details, please see
 * schedule() where blk_schedule_flush_plug() is called.
 */
struct blk_plug {
        struct list_head mq_list; /* blk-mq requests */
        struct list_head cb_list; /* md requires an unplug callback */
        unsigned short rq_count;
        bool multiple_queues;
        bool nowait;
};

/*
 * Unplug callback record: a list node, the function to call and an opaque
 * data pointer.  The callback receives the record itself plus a bool.
 * NOTE(review): presumably linked on blk_plug.cb_list - confirm in
 * blk_check_plugged().
 */
struct blk_plug_cb;
typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
struct blk_plug_cb {
        struct list_head list;
        blk_plug_cb_fn callback;
        void *data;
};
/*
 * Plug entry points; prototypes only.  blk_flush_plug_list() is what the
 * inline blk_flush_plug()/blk_schedule_flush_plug() helpers call, with
 * false and true respectively as the bool argument.
 */
extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
                                             void *data, int size);
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
extern void blk_flush_plug_list(struct blk_plug *, bool);

/*
 * Flush the task's plug, if it has one, through blk_flush_plug_list() with
 * the bool argument set to false.
 */
static inline void blk_flush_plug(struct task_struct *tsk)
{
        struct blk_plug *plug = tsk->plug;

        if (!plug)
                return;

        blk_flush_plug_list(plug, false);
}

/*
 * Flush the task's plug, if it has one, through blk_flush_plug_list() with
 * the bool argument set to true.
 */
static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
        struct blk_plug *plug = tsk->plug;

        if (!plug)
                return;

        blk_flush_plug_list(plug, true);
}

/*
 * True when the task has a plug and at least one of the plug's two lists
 * (mq_list, cb_list) is non-empty.
 */
static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
        struct blk_plug *plug = tsk->plug;

        if (!plug)
                return false;

        return !list_empty(&plug->mq_list) || !list_empty(&plug->cb_list);
}

/* CONFIG_BLOCK variants; prototypes only, defined outside this header. */
int blkdev_issue_flush(struct block_device *, gfp_t);
long nr_blockdev_pages(void);
#else /* CONFIG_BLOCK */
/*
 * Stubs for builds without CONFIG_BLOCK: struct blk_plug has no members,
 * the plug helpers do nothing, and the query/flush helpers return false
 * or 0.
 */
struct blk_plug {
};

static inline void blk_start_plug(struct blk_plug *plug)
{
}

static inline void blk_finish_plug(struct blk_plug *plug)
{
}

static inline void blk_flush_plug(struct task_struct *task)
{
}

static inline void blk_schedule_flush_plug(struct task_struct *task)
{
}


static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
        return false;
}

static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
{
        return 0;
}

static inline long nr_blockdev_pages(void)
{
        return 0;
}
#endif /* CONFIG_BLOCK */

/* Prototypes only; the implementations are not part of this header. */
extern void blk_io_schedule(void);

extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct page *page);

/* Bit for the 'flags' argument of the discard helpers below. */
#define BLKDEV_DISCARD_SECURE        (1 << 0)        /* issue a secure erase */

/*
 * NOTE(review): 'flags' is unsigned long in blkdev_issue_discard() but int
 * in __blkdev_issue_discard(); the difference is kept as declared.
 */
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, int flags,
                struct bio **biop);

/* Bits for the 'flags' argument of the zeroout helpers below. */
#define BLKDEV_ZERO_NOUNMAP        (1 << 0)  /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK        (1 << 1)  /* don't write explicit zeroes */

extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
                unsigned flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned flags);

/*
 * Discard @nr_blocks filesystem blocks starting at @block on the
 * superblock's device.  Block numbers are converted to sectors by shifting
 * left by (s_blocksize_bits - SECTOR_SHIFT) before blkdev_issue_discard()
 * is called.
 */
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
        const int shift = sb->s_blocksize_bits - SECTOR_SHIFT;

        return blkdev_issue_discard(sb->s_bdev, block << shift,
                                    nr_blocks << shift, gfp_mask, flags);
}
/*
 * Zero @nr_blocks filesystem blocks starting at @block on the superblock's
 * device.  Block numbers are converted to sectors by shifting left by
 * (s_blocksize_bits - SECTOR_SHIFT); blkdev_issue_zeroout() is called with
 * flags 0.
 */
static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask)
{
        const int shift = sb->s_blocksize_bits - SECTOR_SHIFT;

        return blkdev_issue_zeroout(sb->s_bdev, block << shift,
                                    nr_blocks << shift, gfp_mask, 0);
}

/* Prototype only; takes a command byte buffer and the file mode. */
extern int blk_verify_command(unsigned char *cmd, fmode_t mode);

/* True when the block device's bd_partno is non-zero. */
static inline bool bdev_is_partition(struct block_device *bdev)
{
        return bdev->bd_partno != 0;
}

/*
 * Named constants for queue limits.  NOTE(review): presumably the values
 * installed by blk_set_default_limits() - confirm in its implementation.
 */
enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
        BLK_SAFE_MAX_SECTORS        = 255,
        BLK_DEF_MAX_SECTORS        = 2560,
        BLK_MAX_SEGMENT_SIZE        = 65536,
        BLK_SEG_BOUNDARY_MASK        = 0xFFFFFFFFUL,
};

/* Accessor for limits.seg_boundary_mask. */
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
        unsigned long mask = q->limits.seg_boundary_mask;

        return mask;
}

/* Accessor for limits.virt_boundary_mask. */
static inline unsigned long queue_virt_boundary(const struct request_queue *q)
{
        unsigned long mask = q->limits.virt_boundary_mask;

        return mask;
}

/* Accessor for limits.max_sectors. */
static inline unsigned int queue_max_sectors(const struct request_queue *q)
{
        unsigned int nr_sectors = q->limits.max_sectors;

        return nr_sectors;
}

/* Accessor for limits.max_hw_sectors. */
static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
{
        unsigned int nr_sectors = q->limits.max_hw_sectors;

        return nr_sectors;
}

/* Accessor for limits.max_segments. */
static inline unsigned short queue_max_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_segments;

        return nr_segs;
}

/* Accessor for limits.max_discard_segments. */
static inline unsigned short queue_max_discard_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_discard_segments;

        return nr_segs;
}

/* Accessor for limits.max_segment_size. */
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
        unsigned int seg_size = q->limits.max_segment_size;

        return seg_size;
}

/* The smaller of limits.max_zone_append_sectors and limits.max_sectors. */
static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
{
        return min(q->limits.max_zone_append_sectors, q->limits.max_sectors);
}

/*
 * limits.logical_block_size of the queue, falling back to 512 when @q is
 * NULL or the limit is 0.
 */
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
        if (q && q->limits.logical_block_size)
                return q->limits.logical_block_size;

        return 512;
}

/* queue_logical_block_size() of the queue behind @bdev. */
static inline unsigned int bdev_logical_block_size(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_logical_block_size(q);
}

/* Accessor for limits.physical_block_size. */
static inline unsigned int queue_physical_block_size(const struct request_queue *q)
{
        unsigned int block_size = q->limits.physical_block_size;

        return block_size;
}

/* queue_physical_block_size() of the queue behind @bdev. */
static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_physical_block_size(q);
}

/* Accessor for limits.io_min. */
static inline unsigned int queue_io_min(const struct request_queue *q)
{
        unsigned int io_min = q->limits.io_min;

        return io_min;
}

/* queue_io_min() of the queue behind @bdev. */
static inline unsigned int bdev_io_min(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_io_min(q);
}

/* Accessor for limits.io_opt. */
static inline unsigned int queue_io_opt(const struct request_queue *q)
{
        unsigned int io_opt = q->limits.io_opt;

        return io_opt;
}

/*
 * queue_io_opt() of the queue behind @bdev.  Note the return type is int
 * although queue_io_opt() itself returns unsigned int.
 */
static inline int bdev_io_opt(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_io_opt(q);
}

/* Accessor for limits.zone_write_granularity. */
static inline unsigned int
queue_zone_write_granularity(const struct request_queue *q)
{
        unsigned int granularity = q->limits.zone_write_granularity;

        return granularity;
}

/* queue_zone_write_granularity() of the queue behind @bdev. */
static inline unsigned int
bdev_zone_write_granularity(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_zone_write_granularity(q);
}

/* limits.alignment_offset, or -1 when the limits are flagged misaligned. */
static inline int queue_alignment_offset(const struct request_queue *q)
{
        if (!q->limits.misaligned)
                return q->limits.alignment_offset;

        return -1;
}

/*
 * Compute the alignment offset (in bytes) that applies at @sector for the
 * limits in @lim.
 *
 * The granularity is the larger of physical_block_size and io_min.  The
 * position of @sector within one granule is converted to bytes and
 * subtracted (modulo the granularity) from lim->alignment_offset.
 *
 * NOTE(review): this relies on sector_div() yielding the remainder of
 * @sector divided by the granule size in sectors - confirm against the
 * sector_div() definition.  It also assumes the granularity is at least one
 * sector, otherwise the divisor would be zero.
 */
static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
{
        unsigned int granularity = max(lim->physical_block_size, lim->io_min);
        unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT)
                << SECTOR_SHIFT;

        /* Adding granularity first keeps the subtraction from going negative. */
        return (granularity + lim->alignment_offset - alignment) % granularity;
}

/*
 * Return the alignment offset for @bdev:
 *  - -1 when the backing queue's limits are flagged as misaligned;
 *  - for a partition, the offset computed from the partition start sector;
 *  - otherwise the queue's alignment_offset limit.
 */
static inline int bdev_alignment_offset(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);
        int offset;

        if (q->limits.misaligned)
                offset = -1;
        else if (bdev_is_partition(bdev))
                offset = queue_limit_alignment_offset(&q->limits,
                                bdev->bd_part->start_sect);
        else
                offset = q->limits.alignment_offset;

        return offset;
}

/*
 * Return the discard_alignment limit of @q, or -1 when the queue limits are
 * flagged as discard_misaligned.
 */
static inline int queue_discard_alignment(const struct request_queue *q)
{
        if (!q->limits.discard_misaligned)
                return q->limits.discard_alignment;

        return -1;
}

/*
 * Compute the discard alignment (in bytes) that applies at @sector for the
 * limits in @lim.
 *
 * Returns 0 when discards are not supported (max_discard_sectors == 0) or
 * when the discard granularity is smaller than one sector.  Otherwise the
 * byte-based discard_alignment and discard_granularity limits are converted
 * to sectors, the position of @sector within a granule is subtracted from
 * the alignment modulo the granularity, and the result is converted back to
 * bytes.
 *
 * NOTE(review): relies on sector_div() yielding the remainder of @sector
 * divided by @granularity - confirm against the sector_div() definition.
 */
static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector)
{
        unsigned int alignment, granularity, offset;

        if (!lim->max_discard_sectors)
                return 0;

        /* Why are these in bytes, not sectors? */
        alignment = lim->discard_alignment >> SECTOR_SHIFT;
        granularity = lim->discard_granularity >> SECTOR_SHIFT;
        /* A zero granularity would be a zero divisor below. */
        if (!granularity)
                return 0;

        /* Offset of the partition start in 'granularity' sectors */
        offset = sector_div(sector, granularity);

        /* And why do we do this modulus *again* in blkdev_issue_discard()? */
        offset = (granularity + alignment - offset) % granularity;

        /* Turn it back into bytes, gaah */
        return offset << SECTOR_SHIFT;
}

/*
 * There are two ways of handling DISCARD merges:
 * If max_discard_segments > 1, the driver treats every bio as a range and
 * sends them to the controller together; the ranges need not be contiguous.
 * Otherwise, the bios/requests are handled like any others and must be
 * contiguous.
 *
 * Returns true only for the first case, i.e. @req is a REQ_OP_DISCARD and
 * its queue allows more than one discard segment.
 */
static inline bool blk_discard_mergable(struct request *req)
{
        return req_op(req) == REQ_OP_DISCARD &&
               queue_max_discard_segments(req->q) > 1;
}

/*
 * Return the discard alignment for @bdev: for a partition it is derived
 * from the partition start sector, otherwise it is the queue's
 * discard_alignment limit.
 *
 * NOTE(review): unlike queue_discard_alignment(), the discard_misaligned
 * flag is not consulted here.
 */
static inline int bdev_discard_alignment(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!bdev_is_partition(bdev))
                return q->limits.discard_alignment;

        return queue_limit_discard_alignment(&q->limits,
                        bdev->bd_part->start_sect);
}

/*
 * Return the max_write_same_sectors limit of the queue backing @bdev,
 * or 0 when @bdev has no queue.
 */
static inline unsigned int bdev_write_same(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return 0;

        return q->limits.max_write_same_sectors;
}

/*
 * Return the max_write_zeroes_sectors limit of the queue backing @bdev,
 * or 0 when @bdev has no queue.
 */
static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return 0;

        return q->limits.max_write_zeroes_sectors;
}

/*
 * Return the zoned model of the queue backing @bdev, or BLK_ZONED_NONE
 * when @bdev has no queue.
 */
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return BLK_ZONED_NONE;

        return blk_queue_zoned_model(q);
}

/*
 * Report whether the queue backing @bdev is zoned; false when @bdev has
 * no queue.
 */
static inline bool bdev_is_zoned(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return false;

        return blk_queue_is_zoned(q);
}

/*
 * Return blk_queue_zone_sectors() for the queue backing @bdev, or 0 when
 * @bdev has no queue.
 */
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return 0;

        return blk_queue_zone_sectors(q);
}

/*
 * Return queue_max_open_zones() for the queue backing @bdev, or 0 when
 * @bdev has no queue.
 */
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return 0;

        return queue_max_open_zones(q);
}

/*
 * Return queue_max_active_zones() for the queue backing @bdev, or 0 when
 * @bdev has no queue.
 */
static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (!q)
                return 0;

        return queue_max_active_zones(q);
}

/* Return @q's dma_alignment mask, or 511 when @q is NULL. */
static inline int queue_dma_alignment(const struct request_queue *q)
{
        if (!q)
                return 511;

        return q->dma_alignment;
}

/*
 * Return non-zero when both @addr and @len have none of the bits set in the
 * combined mask of the queue's DMA alignment and dma_pad_mask.
 *
 * NOTE(review): queue_dma_alignment() tolerates a NULL @q, but @q is
 * dereferenced unconditionally here for dma_pad_mask.
 */
static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
{
        unsigned int mask = queue_dma_alignment(q) | q->dma_pad_mask;

        /* Any offending bit in either value shows up in their union. */
        return !((addr | len) & mask);
}

/*
 * Return the number of bits needed to express block size @size, i.e. 9 for
 * 512, 10 for 1024, 12 for 4096.
 *
 * Assumes size > 256: the result never drops below 9.
 */
static inline unsigned int blksize_bits(unsigned int size)
{
        unsigned int bits = 9;

        /* The first halving is unconditional; each further one adds a bit. */
        for (size >>= 1; size > 256; size >>= 1)
                bits++;

        return bits;
}

/* Return the block size of @bdev, computed as 1 << bd_inode->i_blkbits. */
static inline unsigned int block_size(struct block_device *bdev)
{
        unsigned int size = 1 << bdev->bd_inode->i_blkbits;

        return size;
}

int kblockd_schedule_work(struct work_struct *work);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);

/*
 * Module alias helpers for block devices.
 * MODULE_ALIAS_BLOCKDEV() emits the alias "block-major-<major>-<minor>";
 * MODULE_ALIAS_BLOCKDEV_MAJOR() emits "block-major-<major>-*", using a
 * literal '*' in place of the minor number.
 */
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
        MODULE_ALIAS("block-major-" __stringify(major) "-*")

#if defined(CONFIG_BLK_DEV_INTEGRITY)

/*
 * Integrity flag bits.  Each enumerator occupies its own bit, so values can
 * be OR-ed together.
 */
enum blk_integrity_flags {
        BLK_INTEGRITY_VERIFY                = 1 << 0,
        BLK_INTEGRITY_GENERATE                = 1 << 1,
        BLK_INTEGRITY_DEVICE_CAPABLE        = 1 << 2,
        BLK_INTEGRITY_IP_CHECKSUM        = 1 << 3,
};

/*
 * Argument block handed to integrity_processing_fn callbacks.
 *
 * NOTE(review): the field meanings below are inferred from their names
 * only - confirm against the users of this structure.
 */
struct blk_integrity_iter {
        void                        *prot_buf;        /* presumably the protection-information buffer */
        void                        *data_buf;        /* presumably the data buffer */
        sector_t                seed;
        unsigned int                data_size;
        unsigned short                interval;
        const char                *disk_name;
};

/*
 * Function types for the callbacks stored in struct blk_integrity_profile:
 * a processing function takes a blk_integrity_iter and returns a
 * blk_status_t; the prepare and complete functions take the request (and,
 * for complete, an additional unsigned int) and return nothing.
 */
typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
typedef void (integrity_prepare_fn) (struct request *);
typedef void (integrity_complete_fn) (struct request *, unsigned int);

/*
 * A named set of integrity callbacks.  generate_fn and verify_fn share the
 * integrity_processing_fn type; prepare_fn and complete_fn operate on a
 * request.
 */
struct blk_integrity_profile {
        integrity_processing_fn                *generate_fn;
        integrity_processing_fn                *verify_fn;
        integrity_prepare_fn                *prepare_fn;
        integrity_complete_fn                *complete_fn;
        const char                        *name;
};

extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
extern void blk_integrity_unregister(struct gendisk *);
extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
                                   struct scatterlist *);
extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);

/*
 * Return the integrity descriptor embedded in @disk's queue, or NULL when
 * no integrity profile is attached to it.
 */
static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
{
        struct blk_integrity *bi = &disk->queue->integrity;

        return bi->profile ? bi : NULL;
}

/* Return blk_get_integrity() for the gendisk that @bdev belongs to. */
static inline
struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
{
        struct gendisk *disk = bdev->bd_disk;

        return blk_get_integrity(disk);
}

/* Report whether @q has an integrity profile attached. */
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
{
        return !!q->integrity.profile;
}

/* Report whether @rq has the REQ_INTEGRITY flag set in its cmd_flags. */
static inline bool blk_integrity_rq(struct request *rq)
{
        return (rq->cmd_flags & REQ_INTEGRITY) != 0;
}

/* Set the max_integrity_segments limit of @q to @segs. */
static inline void blk_queue_max_integrity_segments(struct request_queue *q,
                                                    unsigned int segs)
{
        q->limits.max_integrity_segments = segs;
}

/* Return the max_integrity_segments limit stored in @q's queue limits. */
static inline unsigned short
queue_max_integrity_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_integrity_segments;

        return nr_segs;
}

/**
 * bio_integrity_intervals - Return number of integrity intervals for a bio
 * @bi:                blk_integrity profile for device
 * @sectors:        Size of the bio in 512-byte sectors
 *
 * Description: The block layer counts in 512-byte sectors, whereas
 * integrity metadata is counted in units of the device's data integrity
 * interval (2^interval_exp bytes).  Convert a sector count into the
 * corresponding number of integrity intervals.
 */
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
                                                   unsigned int sectors)
{
        /* 9 is log2 of the 512-byte sector size. */
        unsigned int intervals = sectors >> (bi->interval_exp - 9);

        return intervals;
}

/*
 * Return the number of integrity metadata bytes for @sectors sectors:
 * the number of integrity intervals multiplied by the profile's tuple_size.
 */
static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
                                               unsigned int sectors)
{
        unsigned int intervals = bio_integrity_intervals(bi, sectors);

        return intervals * bi->tuple_size;
}

/*
 * Return the first bvec holding integrity data for @rq.  This helper is
 * only meant for drivers limited to a single integrity segment: if the
 * queue allows more than one, a one-time warning is raised and NULL is
 * returned.
 */
static inline struct bio_vec *rq_integrity_vec(struct request *rq)
{
        struct bio_vec *vec = NULL;

        if (!WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
                vec = rq->bio->bi_integrity->bip_vec;

        return vec;
}

#else /* CONFIG_BLK_DEV_INTEGRITY */

struct bio;
struct block_device;
struct gendisk;
struct blk_integrity;

/*
 * Stubs used when CONFIG_BLK_DEV_INTEGRITY is disabled.  Every helper is a
 * no-op: counters and sizes are 0, lookups return NULL, predicates are
 * false/0, and the register/unregister/setter functions do nothing.
 *
 * NOTE(review): blk_integrity_rq() returns int here but bool in the
 * CONFIG_BLK_DEV_INTEGRITY variant.
 */
static inline int blk_integrity_rq(struct request *rq)
{
        return 0;
}
static inline int blk_rq_count_integrity_sg(struct request_queue *q,
                                            struct bio *b)
{
        return 0;
}
static inline int blk_rq_map_integrity_sg(struct request_queue *q,
                                          struct bio *b,
                                          struct scatterlist *s)
{
        return 0;
}
static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
{
        return NULL;
}
static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
{
        return NULL;
}
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
{
        return false;
}
static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
{
        return 0;
}
static inline void blk_integrity_register(struct gendisk *d,
                                         struct blk_integrity *b)
{
}
static inline void blk_integrity_unregister(struct gendisk *d)
{
}
static inline void blk_queue_max_integrity_segments(struct request_queue *q,
                                                    unsigned int segs)
{
}
static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
{
        return 0;
}

static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
                                                   unsigned int sectors)
{
        return 0;
}

static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
                                               unsigned int sectors)
{
        return 0;
}

static inline struct bio_vec *rq_integrity_vec(struct request *rq)
{
        return NULL;
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);

void blk_ksm_unregister(struct request_queue *q);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/*
 * Stubs used when CONFIG_BLK_INLINE_ENCRYPTION is disabled: registration
 * always reports success (true) and unregistration does nothing.
 */
static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm,
                                    struct request_queue *q)
{
        return true;
}

static inline void blk_ksm_unregister(struct request_queue *q) { }

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */


/*
 * Table of callbacks a block driver provides.  Every member is a function
 * pointer except @owner (the owning module) and @pr_ops (a pointer to a
 * separate pr_ops table).
 *
 * NOTE(review): which callbacks are optional is not visible from this
 * declaration - check the call sites before leaving one NULL.
 */
struct block_device_operations {
        blk_qc_t (*submit_bio) (struct bio *bio);
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
        int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
        int (*set_read_only)(struct block_device *bdev, bool ro);
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        int (*report_zones)(struct gendisk *, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
        char *(*devnode)(struct gendisk *disk, umode_t *mode);
        struct module *owner;
        const struct pr_ops *pr_ops;
};

#ifdef CONFIG_COMPAT
extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t,
                                      unsigned int, unsigned long);
#else
#define blkdev_compat_ptr_ioctl NULL
#endif

extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
                                 unsigned long);
extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);

#ifdef CONFIG_BLK_DEV_ZONED
bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

/* Take the zone write lock for @rq, but only if the request needs one. */
static inline void blk_req_zone_write_lock(struct request *rq)
{
        if (!blk_req_needs_zone_write_lock(rq))
                return;

        __blk_req_zone_write_lock(rq);
}

/*
 * Release the zone write lock held by @rq.  Does nothing unless the request
 * is flagged RQF_ZONE_WRITE_LOCKED.
 */
static inline void blk_req_zone_write_unlock(struct request *rq)
{
        if (!(rq->rq_flags & RQF_ZONE_WRITE_LOCKED))
                return;

        __blk_req_zone_write_unlock(rq);
}

/*
 * Report whether the zone targeted by @rq is marked in the queue's
 * seq_zones_wlock bitmap.  Returns false when the queue has no such bitmap.
 */
static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
        if (!rq->q->seq_zones_wlock)
                return false;

        return test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
}

/*
 * Report whether @rq may be dispatched: either it does not need the zone
 * write lock at all, or its target zone is currently not write-locked.
 */
static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
        return !blk_req_needs_zone_write_lock(rq) ||
               !blk_req_zone_is_write_locked(rq);
}
#else
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is disabled: no request ever needs
 * or holds a zone write lock, lock/unlock are no-ops, and every request can
 * be dispatched.
 */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
        return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}
static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
        return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
        return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

/* Wake @waiter, the task waiting for an I/O completion. */
static inline void blk_wake_io_task(struct task_struct *waiter)
{
        if (waiter != current) {
                wake_up_process(waiter);
                return;
        }

        /*
         * When polling, the waiting task is the one running the
         * completions, so no wakeup has to be signalled: marking
         * ourselves as RUNNING is sufficient.
         */
        __set_current_state(TASK_RUNNING);
}

unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
                unsigned int op);
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
                unsigned long start_time);

unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
                                 struct bio *bio);
void part_end_io_acct(struct hd_struct *part, struct bio *bio,
                      unsigned long start_time);

/**
 * bio_start_io_acct - start I/O accounting for bio based drivers
 * @bio:        bio to start account for
 *
 * Accounts the bio against its disk (bio->bi_disk) using the bio's sector
 * count and operation.
 *
 * Returns the start time that should be passed back to bio_end_io_acct().
 */
static inline unsigned long bio_start_io_acct(struct bio *bio)
{
        unsigned int nr_sectors = bio_sectors(bio);
        unsigned int op = bio_op(bio);

        return disk_start_io_acct(bio->bi_disk, nr_sectors, op);
}

/**
 * bio_end_io_acct - end I/O accounting for bio based drivers
 * @bio:        bio to end account for
 * @start_time:        start time returned by bio_start_io_acct()
 *
 * Ends the accounting on the bio's disk (bio->bi_disk) for the bio's
 * operation.
 */
static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
{
        /* void function: call the helper rather than "return"ing its result */
        disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
}

int bdev_read_only(struct block_device *bdev);
int set_blocksize(struct block_device *bdev, int size);

const char *bdevname(struct block_device *bdev, char *buffer);
struct block_device *lookup_bdev(const char *);

void blkdev_show(struct seq_file *seqf, off_t offset);

#define BDEVNAME_SIZE        32        /* Largest string for a blockdev identifier */
#define BDEVT_SIZE        10        /* Largest string for MAJ:MIN for blkdev */
/* BLKDEV_MAJOR_MAX is 512 with block layer support and 0 without it. */
#ifdef CONFIG_BLOCK
#define BLKDEV_MAJOR_MAX        512
#else
#define BLKDEV_MAJOR_MAX        0
#endif

struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                void *holder);
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
                void *holder);
void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
                void *holder);
void blkdev_put(struct block_device *bdev, fmode_t mode);

struct block_device *I_BDEV(struct inode *inode);
struct block_device *bdget_part(struct hd_struct *part);
struct block_device *bdgrab(struct block_device *bdev);
void bdput(struct block_device *);

#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
                        loff_t lend);
int sync_blockdev(struct block_device *bdev);
#else
/*
 * Stubs used when CONFIG_BLOCK is disabled: invalidation is a no-op and the
 * truncate/sync helpers report success (0) without doing anything.
 */
static inline void invalidate_bdev(struct block_device *bdev)
{
}
static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
                                      loff_t lstart, loff_t lend)
{
        return 0;
}
static inline int sync_blockdev(struct block_device *bdev)
{
        return 0;
}
#endif
int fsync_bdev(struct block_device *bdev);

struct super_block *freeze_bdev(struct block_device *bdev);
int thaw_bdev(struct block_device *bdev, struct super_block *sb);

#endif /* _LINUX_BLKDEV_H */

































































    1 






    1 



    1 

























    1 






    1 






    1 
    1 






























    1 






    1 


    1 


    1 

    1 








































































    1 

    1 



    1 
    1 






    1 












    1 



    1 


    1 










    1 






































































































































    1 





















    1 
    1 





    1 
    1 

    1 

    1 
    1 

    1 










    1 


    1 

    1 






































    1 
    1 

    1 

    1 

    1 



















































    1 






    1 




    1 

    1 







    1 








    1 


    1 
    1 



































    1 
    1 






    1 



    1 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
// SPDX-License-Identifier: GPL-2.0-or-later
/* Provide a way to create a superblock configuration context within the kernel
 * that allows a superblock to be set up prior to mounting.
 *
 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/security.h>
#include <linux/mnt_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <asm/sections.h>
#include "mount.h"
#include "internal.h"

/*
 * Records how parameters have been supplied to a legacy filesystem context
 * (see legacy_fs_context.param_type).
 *
 * NOTE(review): meanings inferred from the enumerator names - nothing set
 * yet, one monolithic data blob, or individually supplied parameters;
 * confirm against the legacy_* handlers.
 */
enum legacy_fs_param {
        LEGACY_FS_UNSET_PARAMS,
        LEGACY_FS_MONOLITHIC_PARAMS,
        LEGACY_FS_INDIVIDUAL_PARAMS,
};

/* Private context state for filesystems using the legacy mount path. */
struct legacy_fs_context {
        char                        *legacy_data;        /* Data page for legacy filesystems */
        size_t                        data_size;        /* NOTE(review): presumably bytes used in legacy_data - confirm */
        enum legacy_fs_param        param_type;        /* How the parameters were supplied */
};

static int legacy_init_fs_context(struct fs_context *fc);

/*
 * Option names that set a superblock flag.  vfs_parse_sb_flag() ORs the
 * matching SB_* value into both sb_flags and sb_flags_mask.  The empty
 * entry terminates the table.
 */
static const struct constant_table common_set_sb_flag[] = {
        { "dirsync",        SB_DIRSYNC },
        { "lazytime",        SB_LAZYTIME },
        { "mand",        SB_MANDLOCK },
        { "ro",                SB_RDONLY },
        { "sync",        SB_SYNCHRONOUS },
        { },
};

/*
 * Option names that clear a superblock flag.  vfs_parse_sb_flag() removes
 * the matching SB_* value from sb_flags and records it in sb_flags_mask.
 * The empty entry terminates the table.
 */
static const struct constant_table common_clear_sb_flag[] = {
        { "async",        SB_SYNCHRONOUS },
        { "nolazytime",        SB_LAZYTIME },
        { "nomand",        SB_MANDLOCK },
        { "rw",                SB_RDONLY },
        { },
};

/*
 * Handle a common mount option that manipulates s_flags.
 *
 * @key is looked up first among the flag-setting options and then among the
 * flag-clearing ones.  On a match the flag is set or cleared in
 * fc->sb_flags, noted in fc->sb_flags_mask, and 0 is returned.  -ENOPARAM is
 * returned when @key is in neither table.
 */
static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
{
        unsigned int flag;

        flag = lookup_constant(common_set_sb_flag, key, 0);
        if (!flag) {
                flag = lookup_constant(common_clear_sb_flag, key, 0);
                if (!flag)
                        return -ENOPARAM;

                fc->sb_flags &= ~flag;
                fc->sb_flags_mask |= flag;
                return 0;
        }

        fc->sb_flags |= flag;
        fc->sb_flags_mask |= flag;
        return 0;
}

/**
 * vfs_parse_fs_param - Add a single parameter to a superblock config
 * @fc: The filesystem context to modify
 * @param: The parameter
 *
 * A single mount option in string form is applied to the filesystem context
 * being set up.  Certain standard options (for example "ro") are translated
 * into flag bits without going to the filesystem.  The active security module
 * is allowed to observe and poach options.  Any other options are passed over
 * to the filesystem to parse.
 *
 * Each stage returns -ENOPARAM to mean "not mine"; any other result (success
 * or error) ends the processing and is returned to the caller.
 *
 * This may be called multiple times for a context.
 *
 * Returns 0 on success and a negative error code on failure.  In the event of
 * failure, supplementary error information may have been set.
 */
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
        int ret;

        if (!param->key)
                return invalf(fc, "Unnamed parameter\n");

        /* First, the common superblock flag options handled by the VFS. */
        ret = vfs_parse_sb_flag(fc, param->key);
        if (ret != -ENOPARAM)
                return ret;

        /* Next, give the security module a chance at the parameter. */
        ret = security_fs_context_parse_param(fc, param);
        if (ret != -ENOPARAM)
                /* Param belongs to the LSM or is disallowed by the LSM; so
                 * don't pass to the FS.
                 */
                return ret;

        /* Then the filesystem's own parser, if it provides one. */
        if (fc->ops->parse_param) {
                ret = fc->ops->parse_param(fc, param);
                if (ret != -ENOPARAM)
                        return ret;
        }

        /* Reached when the filesystem has no ->parse_param() or did not
         * recognise the parameter: give "source" its default handling.
         */
        if (strcmp(param->key, "source") == 0) {
                if (param->type != fs_value_is_string)
                        return invalf(fc, "VFS: Non-string source");
                if (fc->source)
                        return invalf(fc, "VFS: Multiple sources");
                /* Take over the string; clearing param->string hands
                 * ownership to the context so the caller won't free it.
                 */
                fc->source = param->string;
                param->string = NULL;
                return 0;
        }

        return invalf(fc, "%s: Unknown parameter '%s'",
                      fc->fs_type->name, param->key);
}
EXPORT_SYMBOL(vfs_parse_fs_param);

/**
 * vfs_parse_fs_string - Convenience function to just parse a string.
 * @fc: The filesystem context to modify
 * @key: The parameter name
 * @value: The parameter value, or NULL for a valueless (flag) parameter
 * @v_size: The number of bytes of @value to use
 *
 * Wraps @key and a NUL-terminated copy of @value in an fs_parameter and
 * passes it to vfs_parse_fs_param().  The copy is freed afterwards unless
 * the parser took it over (and cleared the pointer).
 *
 * Returns the result of vfs_parse_fs_param(), or -ENOMEM if @value could not
 * be duplicated.
 */
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
                        const char *value, size_t v_size)
{
        struct fs_parameter param = {
                .key        = key,
                .type        = fs_value_is_flag,
                .size        = v_size,
        };
        int ret;

        if (value) {
                char *copy = kmemdup_nul(value, v_size, GFP_KERNEL);

                if (!copy)
                        return -ENOMEM;
                param.string = copy;
                param.type = fs_value_is_string;
        }

        ret = vfs_parse_fs_param(fc, &param);
        /* NULL here if there was no value or the parser kept the string. */
        kfree(param.string);
        return ret;
}
EXPORT_SYMBOL(vfs_parse_fs_string);

/**
 * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
 * @fc: The superblock configuration to fill in.
 * @data: The data to parse
 *
 * Parse a blob of data that's in key[=val][,key[=val]]* form.  The security
 * module is given first pick of the options; each remaining option is then
 * fed to vfs_parse_fs_string().  Empty options and options with an empty key
 * ("=val") are skipped.  @data is modified in place while it is being split.
 *
 * Returns 0 if @data is NULL, otherwise the error from the security module
 * or the first negative result from parsing an option; if neither occurs,
 * the result of the last option parsed (0 on success).
 */
int generic_parse_monolithic(struct fs_context *fc, void *data)
{
        char *rest = data;
        char *key;
        int ret;

        if (!rest)
                return 0;

        ret = security_sb_eat_lsm_opts(rest, &fc->security);
        if (ret)
                return ret;

        for (key = strsep(&rest, ","); key != NULL; key = strsep(&rest, ",")) {
                size_t v_len = 0;
                char *value;

                /* Skip empty options such as the gap in "a,,b". */
                if (!*key)
                        continue;

                value = strchr(key, '=');
                /* Skip options that have a value but no key. */
                if (value == key)
                        continue;

                if (value) {
                        /* Split "key=val" into two strings at the '='. */
                        *value++ = 0;
                        v_len = strlen(value);
                }

                ret = vfs_parse_fs_string(fc, key, value, v_len);
                if (ret < 0)
                        break;
        }

        return ret;
}
EXPORT_SYMBOL(generic_parse_monolithic);

/**
 * alloc_fs_context - Create a filesystem context.
 * @fs_type: The filesystem type.
 * @reference: The dentry from which this one derives (or NULL)
 * @sb_flags: Filesystem/superblock flags (SB_*)
 * @sb_flags_mask: Applicable members of @sb_flags
 * @purpose: The purpose that this configuration shall be used for.
 *
 * Open a filesystem and create a mount context.  The mount context is
 * initialised with the supplied flags and, if a submount/automount from
 * another superblock (referred to by @reference) is supplied, may have
 * parameters such as namespaces copied across from that superblock.
 *
 * @reference is dereferenced for FS_CONTEXT_FOR_SUBMOUNT and
 * FS_CONTEXT_FOR_RECONFIGURE, so it must not be NULL for those purposes.
 *
 * Returns the new context, or an ERR_PTR() on allocation failure or when
 * the filesystem's initialisation routine fails.
 */
static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
                                      struct dentry *reference,
                                      unsigned int sb_flags,
                                      unsigned int sb_flags_mask,
                                      enum fs_context_purpose purpose)
{
        int (*init_fs_context)(struct fs_context *);
        struct fs_context *fc;
        int ret = -ENOMEM;

        fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
        if (!fc)
                return ERR_PTR(-ENOMEM);

        /* Take references on everything the context points at. */
        fc->purpose        = purpose;
        fc->sb_flags        = sb_flags;
        fc->sb_flags_mask = sb_flags_mask;
        fc->fs_type        = get_filesystem(fs_type);
        fc->cred        = get_current_cred();
        fc->net_ns        = get_net(current->nsproxy->net_ns);
        fc->log.prefix        = fs_type->name;

        mutex_init(&fc->uapi_mutex);

        /* The user namespace comes from the caller's creds for a new mount
         * and from the reference superblock otherwise.
         */
        switch (purpose) {
        case FS_CONTEXT_FOR_MOUNT:
                fc->user_ns = get_user_ns(fc->cred->user_ns);
                break;
        case FS_CONTEXT_FOR_SUBMOUNT:
                fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
                break;
        case FS_CONTEXT_FOR_RECONFIGURE:
                /* Reconfiguration also pins the superblock and its root. */
                atomic_inc(&reference->d_sb->s_active);
                fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
                fc->root = dget(reference);
                break;
        }

        /* TODO: Make all filesystems support this unconditionally */
        init_fs_context = fc->fs_type->init_fs_context;
        if (!init_fs_context)
                init_fs_context = legacy_init_fs_context;

        ret = init_fs_context(fc);
        if (ret < 0)
                goto err_fc;
        /* Only set once initialisation has succeeded. */
        fc->need_free = true;
        return fc;

err_fc:
        put_fs_context(fc);
        return ERR_PTR(ret);
}

/*
 * Create a filesystem context for mounting a new instance of @fs_type with
 * the given superblock flags.  No reference dentry is used and the flags
 * mask is 0.  Returns the context or an ERR_PTR().
 */
struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
                                        unsigned int sb_flags)
{
        struct fs_context *fc;

        fc = alloc_fs_context(fs_type, NULL, sb_flags, 0,
                              FS_CONTEXT_FOR_MOUNT);
        return fc;
}
EXPORT_SYMBOL(fs_context_for_mount);

/*
 * Create a filesystem context for reconfiguring the superblock that
 * @dentry belongs to.  The filesystem type is taken from that superblock;
 * @sb_flags_mask selects which bits of @sb_flags apply.  Returns the
 * context or an ERR_PTR().
 */
struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
                                        unsigned int sb_flags,
                                        unsigned int sb_flags_mask)
{
        struct file_system_type *fs_type = dentry->d_sb->s_type;

        return alloc_fs_context(fs_type, dentry, sb_flags, sb_flags_mask,
                                FS_CONTEXT_FOR_RECONFIGURE);
}
EXPORT_SYMBOL(fs_context_for_reconfigure);

/*
 * Create a filesystem context for a submount of type @type derived from
 * @reference.  Both the superblock flags and the flags mask start at 0.
 * Returns the context or an ERR_PTR().
 */
struct fs_context *fs_context_for_submount(struct file_system_type *type,
                                           struct dentry *reference)
{
        struct fs_context *fc;

        fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
        return fc;
}
EXPORT_SYMBOL(fs_context_for_submount);

/*
 * Drop the root dentry held by @fc and deactivate its superblock.
 *
 * NOTE(review): the use of deactivate_locked_super() suggests the caller
 * must hold the superblock lock - confirm against the callers.
 */
void fc_drop_locked(struct fs_context *fc)
{
        /* Fetch the superblock first: fc->root is cleared below. */
        struct super_block *sb = fc->root->d_sb;
        dput(fc->root);
        fc->root = NULL;
        deactivate_locked_super(sb);
}

static void legacy_fs_context_free(struct fs_context *fc);

/**
 * vfs_dup_fs_context - Duplicate a filesystem context.
 * @src_fc: The context to copy.
 *
 * Return: the new context on success.  On failure an ERR_PTR() is returned:
 * -EOPNOTSUPP if the source context's operations have no ->dup method,
 * -ENOMEM if the copy cannot be allocated, or the negative error returned by
 * ->dup() or security_fs_context_dup().
 */
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
{
        struct fs_context *fc;
        int ret;

        if (!src_fc->ops->dup)
                return ERR_PTR(-EOPNOTSUPP);

        /*
         * Start from a byte-for-byte copy of the source context.
         * NOTE(review): this also copies fc->root and fc->need_free as-is,
         * without taking a reference on the root dentry - confirm callers
         * only duplicate contexts for which that is safe.
         */
        fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
        if (!fc)
                return ERR_PTR(-ENOMEM);

        /* The copied mutex must not be used; give the new context its own. */
        mutex_init(&fc->uapi_mutex);

        /*
         * Clear the pointers that put_fs_context() would free or hand to
         * ->free(), so the copy never frees the source's objects.
         */
        fc->fs_private        = NULL;
        fc->s_fs_info        = NULL;
        fc->source        = NULL;
        fc->security        = NULL;
        /* Take our own references on everything put_fs_context() will put. */
        get_filesystem(fc->fs_type);
        get_net(fc->net_ns);
        get_user_ns(fc->user_ns);
        get_cred(fc->cred);
        if (fc->log.log)
                refcount_inc(&fc->log.log->usage);

        /* Can't call put until we've called ->dup */
        ret = fc->ops->dup(fc, src_fc);
        if (ret < 0)
                goto err_fc;

        ret = security_fs_context_dup(fc, src_fc);
        if (ret < 0)
                goto err_fc;
        return fc;

err_fc:
        put_fs_context(fc);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(vfs_dup_fs_context);

/**
 * logfc - Log a message to a filesystem context
 * @log: The log to append the message to, or NULL to send it to printk().
 * @prefix: A string to prefix the message with (followed by ": "), or NULL.
 * @level: 'w' for a warning, 'e' for an error; anything else is logged as a
 *         notice when printk() is used.
 * @fmt: The format of the buffer.
 *
 * When @log is non-NULL the formatted message is stored in the log's ring
 * buffer, discarding the oldest entry if the buffer is full.
 */
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
{
        va_list va;
        struct va_format vaf = {.fmt = fmt, .va = &va};

        va_start(va, fmt);
        if (!log) {
                /* No log attached: emit through printk() at a matching level. */
                switch (level) {
                case 'w':
                        printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                case 'e':
                        printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                default:
                        printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                }
        } else {
                unsigned int logsize = ARRAY_SIZE(log->buffer);
                u8 index;
                /* The stored line starts with the level character and a space. */
                char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level,
                                                prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);

                /* Slot the new message goes into (head modulo buffer size). */
                index = log->head & (logsize - 1);
                BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) ||
                             sizeof(log->tail) != sizeof(u8));
                if ((u8)(log->head - log->tail) == logsize) {
                        /* The buffer is full, discard the oldest message */
                        if (log->need_free & (1 << index))
                                kfree(log->buffer[index]);
                        log->tail++;
                }

                /*
                 * On allocation failure store a static string instead and
                 * clear the slot's need_free bit so it is never kfree()d.
                 */
                log->buffer[index] = q ? q : "OOM: Can't store error string";
                if (q)
                        log->need_free |= 1 << index;
                else
                        log->need_free &= ~(1 << index);
                log->head++;
        }
        va_end(va);
}
EXPORT_SYMBOL(logfc);

/*
 * Drop the context's reference on its log.  When the last reference goes
 * away, detach the log from @fc, free every message whose need_free bit is
 * set and then free the log itself.
 */
static void put_fc_log(struct fs_context *fc)
{
        struct fc_log *log = fc->log.log;
        int slot;

        if (!log)
                return;
        if (!refcount_dec_and_test(&log->usage))
                return;

        fc->log.log = NULL;
        for (slot = 0; slot < 8; slot++) {
                if (log->need_free & (1 << slot))
                        kfree(log->buffer[slot]);
        }
        kfree(log);
}

/**
 * put_fs_context - Dispose of a superblock configuration context.
 * @fc: The context to dispose of.
 */
void put_fs_context(struct fs_context *fc)
{
        struct super_block *sb;

        if (fc->root) {
                sb = fc->root->d_sb;
                dput(fc->root);
                fc->root = NULL;
                deactivate_super(sb);
        }

        if (fc->need_free && fc->ops && fc->ops->free)
                fc->ops->free(fc);

        security_free_mnt_opts(&fc->security);
        put_net(fc->net_ns);
        put_user_ns(fc->user_ns);
        put_cred(fc->cred);
        put_fc_log(fc);
        put_filesystem(fc->fs_type);
        kfree(fc->source);
        kfree(fc);
}
EXPORT_SYMBOL(put_fs_context);

/*
 * Release the private state of a legacy (non-fs_context) filesystem context.
 * The data buffer is only freed when it was built from individual
 * parameters; in the other modes it is left alone.
 */
static void legacy_fs_context_free(struct fs_context *fc)
{
        struct legacy_fs_context *ctx = fc->fs_private;

        if (!ctx)
                return;

        if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
                kfree(ctx->legacy_data);
        kfree(ctx);
}

/*
 * Duplicate a legacy config.
 */
static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        struct legacy_fs_context *ctx;
        struct legacy_fs_context *src_ctx = src_fc->fs_private;

        ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
                ctx->legacy_data = kmemdup(src_ctx->legacy_data,
                                           src_ctx->data_size, GFP_KERNEL);
                if (!ctx->legacy_data) {
                        kfree(ctx);
                        return -ENOMEM;
                }
        }

        fc->fs_private = ctx;
        return 0;
}

/*
 * Add a parameter to a legacy config.  We build up a comma-separated list of
 * options.
 *
 * "source" is handled specially: the string is moved into fc->source.  Any
 * other parameter is appended to ctx->legacy_data as "key" (flag) or
 * "key=value" (string).  Returns 0 on success, -ENOMEM if the data page cannot
 * be allocated, or the result of invalf() for rejected parameters.
 */
static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct legacy_fs_context *ctx = fc->fs_private;
        unsigned int size = ctx->data_size;
        size_t len = 0;

        if (strcmp(param->key, "source") == 0) {
                if (param->type != fs_value_is_string)
                        return invalf(fc, "VFS: Legacy: Non-string source");
                if (fc->source)
                        return invalf(fc, "VFS: Legacy: Multiple sources");
                /* Take ownership of the string so the caller won't free it. */
                fc->source = param->string;
                param->string = NULL;
                return 0;
        }

        if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
                return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");

        /* Work out how many bytes "key" or "key=value" will occupy. */
        switch (param->type) {
        case fs_value_is_string:
                len = 1 + param->size;
                fallthrough;
        case fs_value_is_flag:
                len += strlen(param->key);
                break;
        default:
                return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
                              param->key);
        }

        /* The extra 2 bytes cover the separating ',' and the trailing NUL. */
        if (size + len + 2 > PAGE_SIZE)
                return invalf(fc, "VFS: Legacy: Cumulative options too large");
        /* A comma inside a key or value would corrupt the option list. */
        if (strchr(param->key, ',') ||
            (param->type == fs_value_is_string &&
             memchr(param->string, ',', param->size)))
                return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
                              param->key);
        /* Allocate the data page lazily, on the first parameter. */
        if (!ctx->legacy_data) {
                ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
                if (!ctx->legacy_data)
                        return -ENOMEM;
        }

        if (size)
                ctx->legacy_data[size++] = ',';
        len = strlen(param->key);
        memcpy(ctx->legacy_data + size, param->key, len);
        size += len;
        if (param->type == fs_value_is_string) {
                ctx->legacy_data[size++] = '=';
                memcpy(ctx->legacy_data + size, param->string, param->size);
                size += param->size;
        }
        /* data_size records the string length, not counting the NUL. */
        ctx->legacy_data[size] = '\0';
        ctx->data_size = size;
        ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
        return 0;
}

/*
 * Record a monolithic mount data blob for a legacy filesystem.
 *
 * Fails with -EINVAL if options have already been supplied in any form.
 * Otherwise the pointer is stored as-is (no copy is made).  Unless the blob
 * is absent or the filesystem declares FS_BINARY_MOUNTDATA, the data is also
 * handed to security_sb_eat_lsm_opts(), whose result is returned.
 */
static int legacy_parse_monolithic(struct fs_context *fc, void *data)
{
        struct legacy_fs_context *ctx = fc->fs_private;

        if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
                pr_warn("VFS: Can't mix monolithic and individual options\n");
                return -EINVAL;
        }

        ctx->legacy_data = data;
        ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;

        if (!ctx->legacy_data ||
            (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA))
                return 0;

        return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
}

/*
 * Get a mountable root with the legacy mount command.
 */
static int legacy_get_tree(struct fs_context *fc)
{
        struct legacy_fs_context *ctx = fc->fs_private;
        struct super_block *sb;
        struct dentry *root;

        root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
                                      fc->source, ctx->legacy_data);
        if (IS_ERR(root))
                return PTR_ERR(root);

        sb = root->d_sb;
        BUG_ON(!sb);

        fc->root = root;
        return 0;
}

/*
 * Handle remount for a legacy filesystem by calling its ->remount_fs()
 * superblock operation, if it has one, with the context's flags and option
 * data (NULL when there is no legacy context).  Returns 0 when the operation
 * is not implemented.
 */
static int legacy_reconfigure(struct fs_context *fc)
{
        struct legacy_fs_context *ctx = fc->fs_private;
        struct super_block *sb = fc->root->d_sb;
        char *data = NULL;

        if (!sb->s_op->remount_fs)
                return 0;

        if (ctx)
                data = ctx->legacy_data;

        return sb->s_op->remount_fs(sb, &fc->sb_flags, data);
}

/*
 * Context operations installed by legacy_init_fs_context() for filesystems
 * that do not provide their own ->init_fs_context().
 */
const struct fs_context_operations legacy_fs_context_ops = {
        .free                        = legacy_fs_context_free,
        .dup                        = legacy_fs_context_dup,
        .parse_param                = legacy_parse_param,
        .parse_monolithic        = legacy_parse_monolithic,
        .get_tree                = legacy_get_tree,
        .reconfigure                = legacy_reconfigure,
};

/*
 * Set up a context for a filesystem that has no fs_context support of its
 * own: allocate a zeroed legacy_fs_context as the private data and install
 * the legacy operations.  Returns 0 or -ENOMEM.
 */
static int legacy_init_fs_context(struct fs_context *fc)
{
        struct legacy_fs_context *ctx;

        ctx = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
        fc->fs_private = ctx;
        if (!ctx)
                return -ENOMEM;

        fc->ops = &legacy_fs_context_ops;
        return 0;
}

/*
 * Parse a monolithic mount data blob using the context's ->parse_monolithic()
 * operation, falling back to generic_parse_monolithic() when the filesystem
 * does not supply one.  Returns the parser's result.
 */
int parse_monolithic_mount_data(struct fs_context *fc, void *data)
{
        if (fc->ops->parse_monolithic)
                return fc->ops->parse_monolithic(fc, data);

        return generic_parse_monolithic(fc, data);
}

/*
 * Clean up a context after performing an action on it and put it into a state
 * from where it can be used to reconfigure a superblock.
 *
 * Note that here we do only the parts that can't fail; the rest is in
 * finish_clean_context() below and in between those fs_context is marked
 * FS_CONTEXT_AWAITING_RECONF.  The reason for splitup is that after
 * successful mount or remount we need to report success to userland.
 * Trying to do full reinit (for the sake of possible subsequent remount)
 * and failing to allocate memory would've put us into a nasty situation.
 * So here we only discard the old state and reinitialization is left
 * until we actually try to reconfigure.
 */
void vfs_clean_context(struct fs_context *fc)
{
        if (fc->need_free && fc->ops && fc->ops->free)
                fc->ops->free(fc);
        fc->need_free = false;
        fc->fs_private = NULL;
        fc->s_fs_info = NULL;
        fc->sb_flags = 0;
        security_free_mnt_opts(&fc->security);
        kfree(fc->source);
        fc->source = NULL;

        fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
        fc->phase = FS_CONTEXT_AWAITING_RECONF;
}

int finish_clean_context(struct fs_context *fc)
{
        int error;

        if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
                return 0;

        if (fc->fs_type->init_fs_context)
                error = fc->fs_type->init_fs_context(fc);
        else
                error = legacy_init_fs_context(fc);
        if (unlikely(error)) {
                fc->phase = FS_CONTEXT_FAILED;
                return error;
        }
        fc->need_free = true;
        fc->phase = FS_CONTEXT_RECONF_PARAMS;
        return 0;
}
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#ifndef __NET_SCHED_CODEL_IMPL_H
#define __NET_SCHED_CODEL_IMPL_H

/*
 * Codel - The Controlled-Delay Active Queue Management algorithm
 *
 *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
 *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
 *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
 *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

/* Controlling Queue Delay (CoDel) algorithm
 * =========================================
 * Source : Kathleen Nichols and Van Jacobson
 * http://queue.acm.org/detail.cfm?id=2209336
 *
 * Implemented on linux by Dave Taht and Eric Dumazet
 */

/*
 * Load the default CoDel parameters: target of MS2TIME(5), interval of
 * MS2TIME(100), CE-threshold marking disabled and ECN off.  Other fields of
 * @params are not touched here.
 */
static void codel_params_init(struct codel_params *params)
{
        params->target = MS2TIME(5);
        params->interval = MS2TIME(100);
        params->ecn = false;
        params->ce_threshold = CODEL_DISABLED_THRESHOLD;
}

/* Reset all CoDel per-queue state variables in @vars to zero. */
static void codel_vars_init(struct codel_vars *vars)
{
        memset(vars, 0, sizeof(*vars));
}

/*
 * Initialise CoDel statistics.  Only the largest-packet-seen tracker is
 * reset; the other counters in @stats are left as they are.
 */
static void codel_stats_init(struct codel_stats *stats)
{
        stats->maxpacket = 0;
}

/*
 * Refine vars->rec_inv_sqrt, the stored approximation of 1/sqrt(count), with
 * one Newton-Raphson iteration:
 *
 * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */
static void codel_Newton_step(struct codel_vars *vars)
{
        /* Widen the stored value back to a full 32-bit fraction. */
        u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
        /* invsqrt^2, keeping the upper 32 bits of the 64-bit product. */
        u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
        /* 3 - count * invsqrt^2, with 32 fractional bits. */
        u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);

        val >>= 2; /* avoid overflow in following multiply */
        /* The shift undoes the >> 2 above and applies the division by 2. */
        val = (val * invsqrt) >> (32 - 2 + 1);

        /* Store back in the narrower rec_inv_sqrt representation. */
        vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
}

/*
 * CoDel control law: the next drop time is t + interval / sqrt(count).
 * Rather than taking a square root and dividing, the caller passes in
 * rec_inv_sqrt, the maintained reciprocal of sqrt(count), and the interval is
 * scaled by it.
 */
static codel_time_t codel_control_law(codel_time_t t,
                                      codel_time_t interval,
                                      u32 rec_inv_sqrt)
{
        u32 inv_sqrt_q32 = rec_inv_sqrt << REC_INV_SQRT_SHIFT;

        return t + reciprocal_scale(interval, inv_sqrt_q32);
}

/*
 * Decide whether the packet @skb, just taken off the queue at time @now, is
 * eligible to be dropped.
 *
 * Records the packet's sojourn time in vars->ldelay and updates
 * stats->maxpacket.  Returns true only once the sojourn time has been at or
 * above params->target, with more than params->mtu bytes of backlog, past the
 * deadline armed on first going above (now + params->interval).  A NULL @skb,
 * a short delay or a small backlog disarms that deadline.
 */
static bool codel_should_drop(const struct sk_buff *skb,
                              void *ctx,
                              struct codel_vars *vars,
                              struct codel_params *params,
                              struct codel_stats *stats,
                              codel_skb_len_t skb_len_func,
                              codel_skb_time_t skb_time_func,
                              u32 *backlog,
                              codel_time_t now)
{
        u32 pkt_len;

        if (!skb) {
                vars->first_above_time = 0;
                return false;
        }

        pkt_len = skb_len_func(skb);
        vars->ldelay = now - skb_time_func(skb);

        if (unlikely(pkt_len > stats->maxpacket))
                stats->maxpacket = pkt_len;

        if (codel_time_before(vars->ldelay, params->target) ||
            *backlog <= params->mtu) {
                /* went below - stay below for at least interval */
                vars->first_above_time = 0;
                return false;
        }

        if (vars->first_above_time == 0) {
                /* Just crossed above target: arm the deadline, no drop yet. */
                vars->first_above_time = now + params->interval;
                return false;
        }

        /* Above target already: ok to drop once the deadline has passed. */
        if (codel_time_after(now, vars->first_above_time))
                return true;
        return false;
}

/*
 * Dequeue one packet under CoDel control.
 *
 * @ctx:           opaque pointer handed back to @dequeue_func and @drop_func
 * @backlog:       current queue backlog, consulted by codel_should_drop()
 * @params:        CoDel parameters (target, interval, ecn, ce_threshold, ...)
 * @vars:          CoDel state for this queue, updated in place
 * @stats:         counters updated for drops and ECN/CE marks
 * @skb_len_func:  returns the length of a packet
 * @skb_time_func: returns a packet's enqueue time
 * @drop_func:     disposes of a packet CoDel decided to drop
 * @dequeue_func:  removes and returns the next packet, or NULL if empty
 *
 * Returns the packet to transmit, or NULL when the queue has run empty.
 * Packets may be dropped (or ECN-marked when params->ecn is set) on the way,
 * according to the dropping state kept in @vars.
 */
static struct sk_buff *codel_dequeue(void *ctx,
                                     u32 *backlog,
                                     struct codel_params *params,
                                     struct codel_vars *vars,
                                     struct codel_stats *stats,
                                     codel_skb_len_t skb_len_func,
                                     codel_skb_time_t skb_time_func,
                                     codel_skb_drop_t drop_func,
                                     codel_skb_dequeue_t dequeue_func)
{
        struct sk_buff *skb = dequeue_func(vars, ctx);
        codel_time_t now;
        bool drop;

        if (!skb) {
                /* Empty queue: leave the dropping state. */
                vars->dropping = false;
                return skb;
        }
        now = codel_get_time();
        drop = codel_should_drop(skb, ctx, vars, params, stats,
                                 skb_len_func, skb_time_func, backlog, now);
        if (vars->dropping) {
                if (!drop) {
                        /* sojourn time below target - leave dropping state */
                        vars->dropping = false;
                } else if (codel_time_after_eq(now, vars->drop_next)) {
                        /* It's time for the next drop. Drop the current
                         * packet and dequeue the next. The dequeue might
                         * take us out of dropping state.
                         * If not, schedule the next drop.
                         * A large backlog might result in drop rates so high
                         * that the next drop should happen now,
                         * hence the while loop.
                         */
                        while (vars->dropping &&
                               codel_time_after_eq(now, vars->drop_next)) {
                                vars->count++; /* don't care about a possible
                                                * wrap since there is no more
                                                * divide
                                                */
                                codel_Newton_step(vars);
                                /* With ECN, mark instead of dropping and
                                 * deliver this packet.
                                 */
                                if (params->ecn && INET_ECN_set_ce(skb)) {
                                        stats->ecn_mark++;
                                        vars->drop_next =
                                                codel_control_law(vars->drop_next,
                                                                  params->interval,
                                                                  vars->rec_inv_sqrt);
                                        goto end;
                                }
                                stats->drop_len += skb_len_func(skb);
                                drop_func(skb, ctx);
                                stats->drop_count++;
                                skb = dequeue_func(vars, ctx);
                                if (!codel_should_drop(skb, ctx,
                                                       vars, params, stats,
                                                       skb_len_func,
                                                       skb_time_func,
                                                       backlog, now)) {
                                        /* leave dropping state */
                                        vars->dropping = false;
                                } else {
                                        /* and schedule the next drop */
                                        vars->drop_next =
                                                codel_control_law(vars->drop_next,
                                                                  params->interval,
                                                                  vars->rec_inv_sqrt);
                                }
                        }
                }
        } else if (drop) {
                /* Not yet dropping, but this packet qualifies: enter the
                 * dropping state, marking or dropping this first packet.
                 */
                u32 delta;

                if (params->ecn && INET_ECN_set_ce(skb)) {
                        stats->ecn_mark++;
                } else {
                        stats->drop_len += skb_len_func(skb);
                        drop_func(skb, ctx);
                        stats->drop_count++;

                        skb = dequeue_func(vars, ctx);
                        drop = codel_should_drop(skb, ctx, vars, params,
                                                 stats, skb_len_func,
                                                 skb_time_func, backlog, now);
                }
                vars->dropping = true;
                /* if min went above target close to when we last went below it
                 * assume that the drop rate that controlled the queue on the
                 * last cycle is a good starting point to control it now.
                 */
                delta = vars->count - vars->lastcount;
                if (delta > 1 &&
                    codel_time_before(now - vars->drop_next,
                                      16 * params->interval)) {
                        vars->count = delta;
                        /* we don't care if the rec_inv_sqrt approximation
                         * is not very precise:
                         * Next Newton steps will correct it quadratically.
                         */
                        codel_Newton_step(vars);
                } else {
                        /* Start over: count of 1 and the largest
                         * representable rec_inv_sqrt.
                         */
                        vars->count = 1;
                        vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
                }
                vars->lastcount = vars->count;
                vars->drop_next = codel_control_law(now, params->interval,
                                                    vars->rec_inv_sqrt);
        }
end:
        /* Independently of the drop logic, CE-mark the delivered packet when
         * its sojourn time exceeds params->ce_threshold.
         */
        if (skb && codel_time_after(vars->ldelay, params->ce_threshold) &&
            INET_ECN_set_ce(skb))
                stats->ce_mark++;
        return skb;
}

#endif

















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H

#include <linux/const.h>
#include <asm/pgtable_64_types.h>

#ifndef __ASSEMBLY__

/*
 * This file contains the functions and defines necessary to modify and use
 * the x86-64 page table tree.
 */
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
/* Empty stub: this header gives sync_initial_page_table() nothing to do. */
static inline void sync_initial_page_table(void) { }

/*
 * Report a bad page-table entry at each level.  Each macro logs, via
 * pr_err(), the source file and line of the call site, the address of the
 * entry expression @e and the entry's raw value.  @e must be an lvalue since
 * its address is taken.
 */
#define pte_ERROR(e)                                        \
        pr_err("%s:%d: bad pte %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)                                        \
        pr_err("%s:%d: bad pmd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)                                        \
        pr_err("%s:%d: bad pud %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pud_val(e))

/* p4d_ERROR() is only defined here when five page-table levels are built in. */
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e)                                        \
        pr_err("%s:%d: bad p4d %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), p4d_val(e))
#endif

#define pgd_ERROR(e)                                        \
        pr_err("%s:%d: bad pgd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

/*
 * Defining the macro to its own name makes this override detectable with
 * #ifdef.  Returns true when pgtable_l5_enabled() reports that 5-level
 * paging is not enabled; @mm is not consulted.
 */
#define mm_p4d_folded mm_p4d_folded
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
        return !pgtable_l5_enabled();
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into the entry at @ptep with WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

/* Overwrite the PTE at @ptep with an all-zero entry; @mm and @addr are unused. */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        pte_t cleared = native_make_pte(0);

        native_set_pte(ptep, cleared);
}

/* Here the "atomic" variant is simply native_set_pte(). */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        native_set_pte(ptep, pte);
}

/* Store @pmd into the entry at @pmdp with WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        WRITE_ONCE(*pmdp, pmd);
}

/* Overwrite the PMD entry at @pmd with an all-zero entry. */
static inline void native_pmd_clear(pmd_t *pmd)
{
        pmd_t cleared = native_make_pmd(0);

        native_set_pmd(pmd, cleared);
}

/*
 * Clear the PTE at @xp and return its previous value.  SMP builds do this as
 * a single xchg(); uniprocessor builds read the entry and then clear it.
 */
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifndef CONFIG_SMP
        /*
         * Same as native_local_ptep_get_and_clear(), duplicated here
         * because of a cyclic dependency.
         */
        pte_t old = *xp;

        native_pte_clear(NULL, 0, xp);
        return old;
#else
        return native_make_pte(xchg(&xp->pte, 0));
#endif
}

/*
 * Clear the PMD entry at @xp and return its previous value.  SMP builds do
 * this as a single xchg(); uniprocessor builds read the entry and then clear
 * it.
 */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifndef CONFIG_SMP
        /*
         * Same as native_local_pmdp_get_and_clear(), duplicated here
         * because of a cyclic dependency.
         */
        pmd_t old = *xp;

        native_pmd_clear(xp);
        return old;
#else
        return native_make_pmd(xchg(&xp->pmd, 0));
#endif
}

/* Store @pud into the entry at @pudp with WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
        WRITE_ONCE(*pudp, pud);
}

/* Overwrite the PUD entry at @pud with an all-zero entry. */
static inline void native_pud_clear(pud_t *pud)
{
        pud_t cleared = native_make_pud(0);

        native_set_pud(pud, cleared);
}

/*
 * Clear the PUD entry at @xp and return its previous value.  SMP builds do
 * this as a single xchg(); uniprocessor builds read the entry and then clear
 * it.
 */
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifndef CONFIG_SMP
        /*
         * Same as native_local_pudp_get_and_clear(), duplicated here
         * because of a cyclic dependency.
         */
        pud_t old = *xp;

        native_pud_clear(xp);
        return old;
#else
        return native_make_pud(xchg(&xp->pud, 0));
#endif
}

/*
 * Store @p4d into the entry at @p4dp.
 *
 * When 5-level paging is off and CONFIG_PAGE_TABLE_ISOLATION is enabled, the
 * value is first converted to a pgd, passed through pti_set_user_pgtbl()
 * with @p4dp treated as a pgd pointer, and converted back.  Otherwise the
 * value is stored unchanged.
 */
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        if (!pgtable_l5_enabled() && IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
                pgd_t as_pgd = native_make_pgd(native_p4d_val(p4d));

                as_pgd = pti_set_user_pgtbl((pgd_t *)p4dp, as_pgd);
                p4d = native_make_p4d(native_pgd_val(as_pgd));
        }

        WRITE_ONCE(*p4dp, p4d);
}

/* Overwrite the P4D entry at @p4d with an all-zero entry via native_set_p4d(). */
static inline void native_p4d_clear(p4d_t *p4d)
{
        p4d_t cleared = native_make_p4d(0);

        native_set_p4d(p4d, cleared);
}

/*
 * Store @pgd into the entry at @pgdp, using the value returned by
 * pti_set_user_pgtbl() for that entry.
 */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        pgd_t val = pti_set_user_pgtbl(pgdp, pgd);

        WRITE_ONCE(*pgdp, val);
}

/* Overwrite the PGD entry at @pgd with an all-zero entry via native_set_pgd(). */
static inline void native_pgd_clear(pgd_t *pgd)
{
        pgd_t cleared = native_make_pgd(0);

        native_set_pgd(pgd, cleared);
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
 * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
 *
 * G (8) is aliased and used as a PROT_NONE indicator for
 * !present ptes.  We need to start storing swap entries above
 * there.  We also need to avoid using A and D because of an
 * erratum where they can be incorrectly set by hardware on
 * non-present PTEs.
 *
 * Bits 1-4 are not used in non-present format and are available for
 * special use described below:
 *
 * SD (1) in swp entry is used to store soft dirty bit, which helps us
 * remember soft dirty over page migration
 *
 * F (2) in swp entry is used to record when a pagetable is
 * writeprotected by userfaultfd WP support.
 *
 * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
 * but also L and G.
 *
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
#define SWP_TYPE_BITS                5

#define SWP_OFFSET_FIRST_BIT        (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)

#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)

/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))

/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)

/*
 * Shift the offset up "too far" by TYPE bits, then down again
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
#define __swp_entry(type, offset) ((swp_entry_t) { \
        (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })

#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x)                (__pte((x).val))
#define __swp_entry_to_pmd(x)                (__pmd((x).val))

extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);

#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

#define PAGE_AGP    PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1

/*
 * fs/proc/kcore.c
 *
 * kc_vaddr_to_offset() keeps only the bits of @v selected by
 * __VIRTUAL_MASK; kc_offset_to_vaddr() sets every bit outside that mask.
 * The two are inverses only for addresses whose bits outside the mask are
 * all ones.
 */
#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)

#define __HAVE_ARCH_PTE_SAME

#define vmemmap ((struct page *)VMEMMAP_START)

extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

#define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        /*
         * The range qualifies only if @end has no bit set at or above
         * __VIRTUAL_MASK_SHIFT.  @start is not consulted.
         */
        return (end >> __VIRTUAL_MASK_SHIFT) == 0;
}

#include <asm/pgtable-invert.h>

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0 */
/* include/net/dsfield.h - Manipulation of the Differentiated Services field */

/* Written 1998-2000 by Werner Almesberger, EPFL ICA */


#ifndef __NET_DSFIELD_H
#define __NET_DSFIELD_H

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <asm/byteorder.h>


static inline __u8 ipv4_get_dsfield(const struct iphdr *iph)
{
        /* The DS field is returned as the header's TOS byte, unmodified. */
        const __u8 dsfield = iph->tos;

        return dsfield;
}


static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h)
{
        return ntohs(*(__force const __be16 *)ipv6h) >> 4;
}


static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask,
    __u8 value)
{
        /* Host-order copy of the header checksum, widened to catch carries. */
        __u32 sum = ntohs((__force __be16)iph->check);
        const __u8 old_tos = iph->tos;
        /* Bits selected by @mask survive; @value is OR-ed on top. */
        const __u8 new_tos = (old_tos & mask) | value;

        /* Incrementally patch the checksum: add the old TOS ... */
        sum += old_tos;
        /* ... wrap back into 16 bits once 0xffff is reached or passed ... */
        if ((sum + 1) >> 16) {
                sum = (sum + 1) & 0xffff;
        }
        /* ... then take the new TOS out again. */
        sum -= new_tos;
        sum += sum >> 16; /* adjust carry */

        iph->check = (__force __sum16)htons(sum);
        iph->tos = new_tos;
}


static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask,
    __u8 value)
{
        /* The field lives in the first 16 bits of the header. */
        __be16 *first_word = (__force __be16 *)ipv6h;
        /*
         * 0xf00f always preserves the top and bottom nibble of the
         * (host-order) word; @mask, shifted into the middle eight bits,
         * selects which of those bits survive.
         */
        const __be16 keep = htons((((u16)mask << 4) | 0xf00f));
        /* @value is placed in the same middle eight bits. */
        const __be16 set = htons((u16)value << 4);

        *first_word = (*first_word & keep) | set;
}


#endif



















































































































    1 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_NULLS_H
#define _LINUX_RCULIST_NULLS_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list_nulls.h>
#include <linux/rcupdate.h>

/**
 * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on the node returns true after this. It is
 * useful for RCU based read lockfree traversal if the writer side
 * must know if the list entry is still hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list and we can only
 * zero the pprev pointer so hlist_nulls_unhashed() will return true
 * after this.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
 * hlist_nulls_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
{
        /* Already unhashed: nothing to unlink, leave the node untouched. */
        if (hlist_nulls_unhashed(n))
                return;

        __hlist_nulls_del(n);
        /* Only ->pprev is reset; ->next is not written by this function. */
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * hlist_nulls_first_rcu - returns the first element of the hash list.
 * @head: the head of the list.
 *
 * Expands to an lvalue: the @head->first field viewed through an
 * __rcu-annotated pointer type, so it can be read with the
 * rcu_dereference family or written with rcu_assign_pointer().
 */
#define hlist_nulls_first_rcu(head) \
        (*((struct hlist_nulls_node __rcu __force **)&(head)->first))

/**
 * hlist_nulls_next_rcu - returns the element of the list after @node.
 * @node: element of the list.
 *
 * Expands to an lvalue: the @node->next field viewed through an
 * __rcu-annotated pointer type.
 */
#define hlist_nulls_next_rcu(node) \
        (*((struct hlist_nulls_node __rcu __force **)&(node)->next))

/**
 * hlist_nulls_pprev_rcu - returns the dereferenced pprev of @node.
 * @node: element of the list.
 *
 * Unlike the first/next accessors this does not take the address of a
 * field: it dereferences @node->pprev itself, yielding (as an lvalue) the
 * pointer slot that @node->pprev points to, viewed through an
 * __rcu-annotated pointer type.
 */
#define hlist_nulls_pprev_rcu(node) \
        (*((struct hlist_nulls_node __rcu __force **)(node)->pprev))

/**
 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
{
        /* Unlink first: the helper is given @n while ->pprev is still intact. */
        __hlist_nulls_del(n);
        /* Poison only the back pointer; this function's own store is to ->pprev. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_add_head_rcu - add an element at the head of the hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *old_first = h->first;

        /* Set up both links of @n before it becomes reachable from @h. */
        n->next = old_first;
        WRITE_ONCE(n->pprev, &h->first);

        /* Make @n the new first element. */
        rcu_assign_pointer(hlist_nulls_first_rcu(h), n);

        /* List was empty (nulls marker): no former first node to fix up. */
        if (is_a_nulls(old_first))
                return;

        /* The former first node is now reached through @n->next. */
        WRITE_ONCE(old_first->pprev, &n->next);
}

/**
 * hlist_nulls_add_tail_rcu - add an element at the tail of the hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
                                            struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *i, *last = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        for (i = h->first; !is_a_nulls(i); i = i->next)
                last = i;

        if (last) {
                /* @last->next is the terminating nulls marker; @n inherits it. */
                n->next = last->next;
                /* Same ->pprev store discipline as hlist_nulls_add_head_rcu(). */
                WRITE_ONCE(n->pprev, &last->next);
                /*
                 * Publish @n only after both of its links are set up.  Use
                 * the nulls accessor defined in this header: hlist_next_rcu()
                 * is the plain-hlist accessor (it casts to struct hlist_node)
                 * and comes from <linux/rculist.h>, which this header does
                 * not include itself.
                 */
                rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
        } else {
                /* Empty list: adding at the tail is adding at the head. */
                hlist_nulls_add_head_rcu(n, h);
        }
}

/*
 * Make @n look like it is on a list without putting it on one: ->next
 * becomes a nulls marker and ->pprev points at @n's own ->next field.
 * After that hlist_nulls_del will work.
 */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node **own_slot = &n->next;

        n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
        n->pprev = own_slot;
}

/**
 * hlist_nulls_replace_rcu - replace an old entry by a new one
 * @old: the element to be replaced
 * @new: the new element to insert
 *
 * Description:
 * Replace the old entry with the new one in a RCU-protected hlist_nulls, while
 * permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary (such as holding
 * appropriate locks) to avoid racing with another list-mutation primitive, such
 * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same
 * list.  However, it is perfectly legal to run concurrently with the _rcu
 * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_replace_rcu(struct hlist_nulls_node *old,
                                           struct hlist_nulls_node *new)
{
        struct hlist_nulls_node *successor = old->next;

        /* Give @new the same links as @old before anything points at it. */
        WRITE_ONCE(new->next, successor);
        WRITE_ONCE(new->pprev, old->pprev);

        /* Redirect the slot that @new->pprev (copied from @old) points at. */
        rcu_assign_pointer(hlist_nulls_pprev_rcu(new), new);

        /* @old was last (nulls marker follows): no successor to re-link. */
        if (is_a_nulls(successor))
                return;

        WRITE_ONCE(successor->pprev, &new->next);
}

/**
 * hlist_nulls_replace_init_rcu - replace an old entry by a new one and
 * initialize the old
 * @old: the element to be replaced
 * @new: the new element to insert
 *
 * Description:
 * Replace the old entry with the new one in a RCU-protected hlist_nulls, while
 * permitting racing traversals, and reinitialize the old entry.
 *
 * Note: @old must be hashed.
 *
 * The caller must take whatever precautions are necessary (such as holding
 * appropriate locks) to avoid racing with another list-mutation primitive, such
 * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same
 * list. However, it is perfectly legal to run concurrently with the _rcu
 * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_replace_init_rcu(struct hlist_nulls_node *old,
                                                struct hlist_nulls_node *new)
{
        /* Splice @new into @old's position first; this reads @old's links. */
        hlist_nulls_replace_rcu(old, new);
        /*
         * Then clear @old->pprev, the same state hlist_nulls_del_init_rcu()
         * leaves a node in.  @old->next is not written here.
         */
        WRITE_ONCE(old->pprev, NULL);
}

/**
 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop stops when @pos is a nulls marker.  On every iteration @tpos is
 * derived from @pos before the body runs, and @pos is advanced to the next
 * node only after the body has finished.
 *
 * The barrier() is needed to make sure compiler doesn't cache first element [1],
 * as this loop can be restarted [2]
 * [1] Documentation/core-api/atomic_ops.rst around line 114
 * [2] Documentation/RCU/rculist_nulls.rst around line 146
 */
#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))

/**
 * hlist_nulls_for_each_entry_safe -
 *   iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is advanced to the following node inside the loop condition, right
 * after @tpos has been derived from it and before the body runs; the
 * for-statement has no increment clause.  While the body executes, @tpos is
 * the current entry and @pos already refers to the next node, so the body
 * does not need the current entry's ->next to continue.
 */
#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)                \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);        \
                   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif






















































































































































































    5 
    5 











    4 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 

    3 
















































    3 
    3 
    3 















    3 






    3 
    3 



    2 

    1 
    3 





    3 


    3 





















    1 






    2 














































































































































































































































































































    2 













































    4 










    4 









    4 







    4 


    4 
    4 





    4 




    4 









    4 

    4 


    4 
    1 





    4 

    4 


    4 

    1 


    3 




    4 







    4 














    4 
    2 



    4 



    4 






    4 




    4 





    4 







    4 





























    4 



















































































































































































































































































































































































































































































    2 











    2 
    2 










































































































































































































































































































































































































































































    2 






    2 
    1 


    2 

    2 
    2 




    2 


    2 
    2 
































    1 




    1 
    1 



























    1 











    1 



    1 

    1 

    1 












    1 





    1 



    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 
    1 


    1 




    1 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the security services.
 *
 * Authors : Stephen Smalley, <sds@tycho.nsa.gov>
 *             James Morris <jmorris@redhat.com>
 *
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *        Support for context based audit filters.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support for NetLabel
 *      Added support for the policy capability bitmap
 *
 * Updated: Chad Sellers <csellers@tresys.com>
 *
 *  Added validation of kernel classes and permissions
 *
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 *  Added support for bounds domain and audit messaged on masked permissions
 *
 * Updated: Guido Trentalancia <guido@trentalancia.com>
 *
 *  Added support for runtime switching of the policy type
 *
 * Copyright (C) 2008, 2009 NEC Corporation
 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/sched.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
#include <net/netlabel.h>

#include "flask.h"
#include "avc.h"
#include "avc_ss.h"
#include "security.h"
#include "context.h"
#include "policydb.h"
#include "sidtab.h"
#include "services.h"
#include "conditional.h"
#include "mls.h"
#include "objsec.h"
#include "netlabel.h"
#include "xfrm.h"
#include "ebitmap.h"
#include "audit.h"
#include "policycap_names.h"

/*
 * Argument bundle holding a selinux_state pointer and two policydb pointers.
 * NOTE(review): presumably handed to the context-conversion callback when
 * SIDs are carried over from one policy to another - the users of this
 * struct are outside this part of the file, so confirm there.
 */
struct convert_context_args {
        struct selinux_state *state;
        struct policydb *oldp;  /* presumably the policy being replaced - confirm */
        struct policydb *newp;  /* presumably the policy being installed - confirm */
};

/*
 * Groups a convert_context_args instance with the sidtab_convert_params
 * that accompany it, so both can be allocated and passed around as one
 * object.
 * NOTE(review): presumably sidtab_params refers back to args as its callback
 * argument - confirm at the site that fills this struct in.
 */
struct selinux_policy_convert_data {
        struct convert_context_args args;
        struct sidtab_convert_params sidtab_params;
};

/* Forward declaration. */
static int context_struct_to_string(struct policydb *policydb,
                                    struct context *context,
                                    char **scontext,
                                    u32 *scontext_len);

static int sidtab_entry_to_string(struct policydb *policydb,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext,
                                  u32 *scontext_len);

static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms);

/*
 * Build the table that translates kernel class/permission indices into
 * the values used by the loaded policy.
 *
 * @pol:     policy database used to resolve class and permission names
 * @map:     array of kernel class descriptions, terminated by a NULL name
 * @out_map: receives the newly allocated table and its size
 *
 * Slot 0 of the table is reserved for class zero, so input entry N is
 * stored in mapping[N + 1].  A class or permission name that the policy
 * does not define is left with value 0; if the policy has reject_unknown
 * set, the table is freed and the call fails instead.
 *
 * Returns 0 on success, -EINVAL for a NULL @map or a rejected unknown
 * name, and -ENOMEM when the table cannot be allocated.
 */
static int selinux_set_mapping(struct policydb *pol,
                               struct security_class_mapping *map,
                               struct selinux_map *out_map)
{
        bool saw_unknown = false;
        u16 nslots, cls;
        unsigned perm;

        if (!map)
                return -EINVAL;

        /* One slot per input class, plus the reserved slot for class zero. */
        nslots = 0;
        while (map[nslots].name)
                nslots++;
        nslots++;

        out_map->mapping = kcalloc(nslots, sizeof(*out_map->mapping),
                                   GFP_ATOMIC);
        if (!out_map->mapping)
                return -ENOMEM;

        /* Resolve every class, and each of its permissions, via the policy. */
        for (cls = 0; map[cls].name; cls++) {
                struct security_class_mapping *p_in = &map[cls];
                struct selinux_mapping *p_out = &out_map->mapping[cls + 1];

                /* An empty class name is a placeholder with no permissions. */
                if (p_in->name[0] == '\0') {
                        p_out->num_perms = 0;
                        continue;
                }

                p_out->value = string_to_security_class(pol, p_in->name);
                if (!p_out->value) {
                        pr_info("SELinux:  Class %s not defined in policy.\n",
                               p_in->name);
                        if (pol->reject_unknown)
                                goto err;
                        p_out->num_perms = 0;
                        saw_unknown = true;
                        continue;
                }

                for (perm = 0; p_in->perms[perm]; perm++) {
                        /* An empty permission name keeps its zeroed slot. */
                        if (p_in->perms[perm][0] == '\0')
                                continue;

                        p_out->perms[perm] = string_to_av_perm(pol,
                                                               p_out->value,
                                                               p_in->perms[perm]);
                        if (p_out->perms[perm])
                                continue;

                        pr_info("SELinux:  Permission %s in class %s not defined in policy.\n",
                               p_in->perms[perm], p_in->name);
                        if (pol->reject_unknown)
                                goto err;
                        saw_unknown = true;
                }
                p_out->num_perms = perm;
        }

        if (saw_unknown)
                pr_info("SELinux: the above unknown classes and permissions will be %s\n",
                       pol->allow_unknown ? "allowed" : "denied");

        out_map->size = nslots;
        return 0;
err:
        kfree(out_map->mapping);
        out_map->mapping = NULL;
        return -EINVAL;
}

/*
 * Translate a mapped (kernel) class index into the real policy value.
 * An index that lies outside the mapping table is returned unchanged.
 */
static u16 unmap_class(struct selinux_map *map, u16 tclass)
{
        if (tclass >= map->size)
                return tclass;

        return map->mapping[tclass].value;
}

/*
 * Find the kernel class index whose mapping entry holds the given policy
 * value.  The search starts at slot 1; SECCLASS_NULL is returned when no
 * entry matches.
 */
static u16 map_class(struct selinux_map *map, u16 pol_value)
{
        u16 idx = 1;

        while (idx < map->size) {
                if (map->mapping[idx].value == pol_value)
                        return idx;
                idx++;
        }

        return SECCLASS_NULL;
}

/*
 * Rewrite an access decision from policy permission bits into kernel
 * (mapped) permission bits for class @tclass.
 *
 * @map:           class/permission mapping table
 * @tclass:        mapped class index; decisions for indices outside the
 *                 table are left untouched
 * @avd:           decision to convert in place (allowed, auditallow and
 *                 auditdeny are each rewritten)
 * @allow_unknown: non-zero if permissions the policy does not define
 *                 (mapping->perms[i] == 0) are to be granted
 *
 * Bit i of each resulting vector is set when the policy bits recorded in
 * mapping->perms[i] intersect the original vector.  Undefined permissions
 * are added to "allowed" when @allow_unknown is set, and to "auditdeny"
 * otherwise.
 *
 * All shifts are done on a u32 operand: the loops reach bit 31, and
 * shifting a signed int 1 into the sign bit is undefined behaviour.
 */
static void map_decision(struct selinux_map *map,
                         u16 tclass, struct av_decision *avd,
                         int allow_unknown)
{
        if (tclass < map->size) {
                struct selinux_mapping *mapping = &map->mapping[tclass];
                unsigned int i, n = mapping->num_perms;
                u32 result;

                for (i = 0, result = 0; i < n; i++) {
                        if (avd->allowed & mapping->perms[i])
                                result |= (u32)1 << i;
                        if (allow_unknown && !mapping->perms[i])
                                result |= (u32)1 << i;
                }
                avd->allowed = result;

                for (i = 0, result = 0; i < n; i++)
                        if (avd->auditallow & mapping->perms[i])
                                result |= (u32)1 << i;
                avd->auditallow = result;

                for (i = 0, result = 0; i < n; i++) {
                        if (avd->auditdeny & mapping->perms[i])
                                result |= (u32)1 << i;
                        if (!allow_unknown && !mapping->perms[i])
                                result |= (u32)1 << i;
                }
                /*
                 * In case the kernel has a bug and requests a permission
                 * between num_perms and the maximum permission number, we
                 * should audit that denial
                 */
                for (; i < (sizeof(u32)*8); i++)
                        result |= (u32)1 << i;
                avd->auditdeny = result;
        }
}

/*
 * Report the mls_enabled setting of the currently active policy.
 *
 * Returns 0 when SELinux has not been initialized yet; otherwise the
 * mls_enabled field of the active policy's policydb, sampled inside an
 * RCU read-side critical section.
 */
int security_mls_enabled(struct selinux_state *state)
{
        struct selinux_policy *cur;
        int enabled = 0;

        if (selinux_initialized(state)) {
                rcu_read_lock();
                cur = rcu_dereference(state->policy);
                enabled = cur->policydb.mls_enabled;
                rcu_read_unlock();
        }

        return enabled;
}

/*
 * Return the boolean value of a constraint expression
 * when it is applied to the specified source and target
 * security contexts.
 *
 * The expression is a linked list of nodes (walked through ->next) that is
 * evaluated with a small value stack s[]: CEXPR_ATTR and CEXPR_NAMES nodes
 * push one result, CEXPR_NOT rewrites the top entry, and CEXPR_AND /
 * CEXPR_OR fold the top two entries into one.  Exactly one entry must be
 * left when the list ends; that entry is the return value.
 *
 * xcontext is a special beast...  It is used by the validatetrans rules
 * only.  For these rules, scontext is the context before the transition,
 * tcontext is the context after the transition, and xcontext is the context
 * of the process performing the transition.  All other callers of
 * constraint_expr_eval should pass in NULL for xcontext.
 *
 * Returns 0 early if a push would need more than CEXPR_MAXDEPTH stack
 * entries.  A malformed expression (stack underflow, unknown node type,
 * attribute or operator, or a CEXPR_XTARGET reference with a NULL xcontext)
 * hits BUG()/BUG_ON().
 */
static int constraint_expr_eval(struct policydb *policydb,
                                struct context *scontext,
                                struct context *tcontext,
                                struct context *xcontext,
                                struct constraint_expr *cexpr)
{
        u32 val1, val2;
        struct context *c;
        struct role_datum *r1, *r2;
        struct mls_level *l1, *l2;
        struct constraint_expr *e;
        int s[CEXPR_MAXDEPTH];  /* evaluation stack */
        int sp = -1;            /* index of the top entry; -1 means empty */

        for (e = cexpr; e; e = e->next) {
                switch (e->expr_type) {
                case CEXPR_NOT:
                        /* Unary: negate the top of the stack in place. */
                        BUG_ON(sp < 0);
                        s[sp] = !s[sp];
                        break;
                case CEXPR_AND:
                        /* Binary: pop one entry and AND it into the new top. */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] &= s[sp + 1];
                        break;
                case CEXPR_OR:
                        /* Binary: pop one entry and OR it into the new top. */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] |= s[sp + 1];
                        break;
                case CEXPR_ATTR:
                        /* Compares an attribute of one context with another. */
                        if (sp == (CEXPR_MAXDEPTH - 1))
                                return 0;       /* stack full */
                        switch (e->attr) {
                        case CEXPR_USER:
                                val1 = scontext->user;
                                val2 = tcontext->user;
                                break;
                        case CEXPR_TYPE:
                                val1 = scontext->type;
                                val2 = tcontext->type;
                                break;
                        case CEXPR_ROLE:
                                val1 = scontext->role;
                                val2 = tcontext->role;
                                r1 = policydb->role_val_to_struct[val1 - 1];
                                r2 = policydb->role_val_to_struct[val2 - 1];
                                /*
                                 * Dominance operators are answered here and
                                 * "continue" straight to the next node; any
                                 * other operator falls through to the generic
                                 * EQ/NEQ comparison further down.
                                 */
                                switch (e->op) {
                                case CEXPR_DOM:
                                        s[++sp] = ebitmap_get_bit(&r1->dominates,
                                                                  val2 - 1);
                                        continue;
                                case CEXPR_DOMBY:
                                        s[++sp] = ebitmap_get_bit(&r2->dominates,
                                                                  val1 - 1);
                                        continue;
                                case CEXPR_INCOMP:
                                        s[++sp] = (!ebitmap_get_bit(&r1->dominates,
                                                                    val2 - 1) &&
                                                   !ebitmap_get_bit(&r2->dominates,
                                                                    val1 - 1));
                                        continue;
                                default:
                                        break;
                                }
                                break;
                        /*
                         * MLS attributes: pick the pair of levels to compare
                         * (level[0] and level[1] of the source and/or target
                         * range) and jump to the shared comparison code.
                         */
                        case CEXPR_L1L2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_L1H2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_H1L2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_H1H2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L1H1:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(scontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L2H2:
                                l1 = &(tcontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
mls_ops:
                        /* Every valid MLS operator pushes and moves on. */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = mls_level_eq(l1, l2);
                                continue;
                        case CEXPR_NEQ:
                                s[++sp] = !mls_level_eq(l1, l2);
                                continue;
                        case CEXPR_DOM:
                                s[++sp] = mls_level_dom(l1, l2);
                                continue;
                        case CEXPR_DOMBY:
                                s[++sp] = mls_level_dom(l2, l1);
                                continue;
                        case CEXPR_INCOMP:
                                s[++sp] = mls_level_incomp(l2, l1);
                                continue;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                        default:
                                BUG();
                                return 0;
                        }

                        /*
                         * Reached for user/type attributes, and for role
                         * attributes with a non-dominance operator: plain
                         * value comparison of val1 and val2.
                         */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = (val1 == val2);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = (val1 != val2);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                case CEXPR_NAMES:
                        /* Tests one context attribute against the node's name set. */
                        if (sp == (CEXPR_MAXDEPTH-1))
                                return 0;       /* stack full */
                        /* Select the context: source unless a target flag is set. */
                        c = scontext;
                        if (e->attr & CEXPR_TARGET)
                                c = tcontext;
                        else if (e->attr & CEXPR_XTARGET) {
                                c = xcontext;
                                if (!c) {
                                        BUG();
                                        return 0;
                                }
                        }
                        /* Select which field of that context is tested. */
                        if (e->attr & CEXPR_USER)
                                val1 = c->user;
                        else if (e->attr & CEXPR_ROLE)
                                val1 = c->role;
                        else if (e->attr & CEXPR_TYPE)
                                val1 = c->type;
                        else {
                                BUG();
                                return 0;
                        }

                        /* Values are 1-based; bit positions in names are 0-based. */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                default:
                        BUG();
                        return 0;
                }
        }

        /* A well-formed expression leaves exactly one value on the stack. */
        BUG_ON(sp != 0);
        return s[0];
}

/*
 * hashtab_map() callback used by security_dump_masked_av(), which dumps
 * the permissions masked during security_compute_av due to RBAC,
 * MLS/Constraint and Type bounds.
 *
 * Stores the permission name (the hash key @k) in the name table passed
 * through @args, at the slot given by the permission's value minus one.
 * The value must lie in 1..32.  Always returns 0.
 */
static int dump_masked_av_helper(void *k, void *d, void *args)
{
        char **names = args;
        struct perm_datum *perm = d;

        BUG_ON(!(perm->value >= 1 && perm->value <= 32));

        names[perm->value - 1] = (char *)k;

        return 0;
}

/*
 * security_dump_masked_av - emit an AUDIT_SELINUX_ERR record naming the
 * permissions that were masked out of a computed access vector.
 *
 * @policydb:    policy used to resolve class, permission and context names
 * @scontext:    source context
 * @tcontext:    target context
 * @tclass:      policy class value (used 1-based: tclass - 1 is the index)
 * @permissions: bitmask of the masked permissions; nothing is logged if 0
 * @reason:      short tag written to the "reason=" field of the record
 *
 * Best effort: if collecting the names or starting the audit record fails,
 * the message is silently skipped.
 */
static void security_dump_masked_av(struct policydb *policydb,
                                    struct context *scontext,
                                    struct context *tcontext,
                                    u16 tclass,
                                    u32 permissions,
                                    const char *reason)
{
        struct common_datum *common_dat;
        struct class_datum *tclass_dat;
        struct audit_buffer *ab;
        char *tclass_name;
        char *scontext_name = NULL;
        char *tcontext_name = NULL;
        /*
         * Must start out as all-NULL: hashtab_map() only fills the slots of
         * permissions that exist, and the print loop below relies on NULL
         * to select the "????" placeholder for the rest.
         */
        char *permission_names[32] = { NULL };
        int index;
        u32 length;
        bool need_comma = false;

        if (!permissions)
                return;

        tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1);
        tclass_dat = policydb->class_val_to_struct[tclass - 1];
        common_dat = tclass_dat->comdatum;

        /* collect permission names: common permissions first, then class ones */
        if (common_dat &&
            hashtab_map(&common_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        if (hashtab_map(&tclass_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        /* get scontext/tcontext in text form */
        if (context_struct_to_string(policydb, scontext,
                                     &scontext_name, &length) < 0)
                goto out;

        if (context_struct_to_string(policydb, tcontext,
                                     &tcontext_name, &length) < 0)
                goto out;

        /* audit a message */
        ab = audit_log_start(audit_context(),
                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
        if (!ab)
                goto out;

        audit_log_format(ab, "op=security_compute_av reason=%s "
                         "scontext=%s tcontext=%s tclass=%s perms=",
                         reason, scontext_name, tcontext_name, tclass_name);

        for (index = 0; index < 32; index++) {
                /* unsigned operand: index reaches 31, the sign bit of int */
                u32 mask = (u32)1 << index;

                if ((mask & permissions) == 0)
                        continue;

                audit_log_format(ab, "%s%s",
                                 need_comma ? "," : "",
                                 permission_names[index]
                                 ? permission_names[index] : "????");
                need_comma = true;
        }
        audit_log_end(ab);
out:
        /* release scontext/tcontext */
        kfree(tcontext_name);
        kfree(scontext_name);
}

/*
 * type_attribute_bounds_av - drop permissions that violate a type bounds
 * constraint.
 *
 * Does nothing unless the source type has a bounds type.  Otherwise the
 * decision is recomputed with the source type replaced by its bounds and,
 * if the target type is bounded as well, with the target type replaced by
 * its bounds.  Every permission present in @avd->allowed but missing from
 * the recomputed decision is cleared from @avd->allowed and reported
 * through security_dump_masked_av() with reason "bounds".
 */
static void type_attribute_bounds_av(struct policydb *policydb,
                                     struct context *scontext,
                                     struct context *tcontext,
                                     u16 tclass,
                                     struct av_decision *avd)
{
        struct type_datum *src_type, *tgt_type;
        struct context bounded_src, bounded_tgt;
        struct context *eff_tgt = tcontext;
        struct av_decision bounded_avd;
        u32 violated = 0;

        src_type = policydb->type_val_to_struct[scontext->type - 1];
        BUG_ON(!src_type);

        /* An unbounded source type has nothing to enforce. */
        if (!src_type->bounds)
                return;

        tgt_type = policydb->type_val_to_struct[tcontext->type - 1];
        BUG_ON(!tgt_type);

        memset(&bounded_avd, 0, sizeof(bounded_avd));

        /* Source context with its type swapped for the bounds type. */
        memcpy(&bounded_src, scontext, sizeof(bounded_src));
        bounded_src.type = src_type->bounds;

        /* Same substitution for the target, but only if it is bounded. */
        if (tgt_type->bounds) {
                memcpy(&bounded_tgt, tcontext, sizeof(bounded_tgt));
                bounded_tgt.type = tgt_type->bounds;
                eff_tgt = &bounded_tgt;
        }

        context_struct_compute_av(policydb, &bounded_src, eff_tgt,
                                  tclass, &bounded_avd, NULL);

        violated = avd->allowed & ~bounded_avd.allowed;
        if (likely(!violated))
                return;         /* nothing exceeds the bounds */

        /* Strip the offending permissions, then report them via audit. */
        avd->allowed &= ~violated;

        security_dump_masked_av(policydb, scontext, tcontext,
                                tclass, violated, "bounds");
}

/*
 * Flag which drivers have permissions.
 * Only ioctl-based extended permissions are considered.
 *
 * An AVTAB_XPERMS_IOCTLDRIVER node ORs its whole permission bitmap into
 * @xperms->drivers; an AVTAB_XPERMS_IOCTLFUNCTION node sets just the bit of
 * its own driver.  @xperms->len is set to 1 when the node's key carries
 * AVTAB_XPERMS_ALLOWED.
 */
void services_compute_xperms_drivers(
                struct extended_perms *xperms,
                struct avtab_node *node)
{
        unsigned int word;

        switch (node->datum.u.xperms->specified) {
        case AVTAB_XPERMS_IOCTLDRIVER:
                /* one or more drivers have all of their permissions allowed */
                for (word = 0; word < ARRAY_SIZE(xperms->drivers.p); word++)
                        xperms->drivers.p[word] |=
                                node->datum.u.xperms->perms.p[word];
                break;
        case AVTAB_XPERMS_IOCTLFUNCTION:
                /* permissions are allowed within a single driver */
                security_xperm_set(xperms->drivers.p,
                                        node->datum.u.xperms->driver);
                break;
        default:
                break;
        }

        /* If no ioctl commands are allowed, ignore auditallow and auditdeny */
        if (node->key.specified & AVTAB_XPERMS_ALLOWED)
                xperms->len = 1;
}

/*
 * Compute access vectors and extended permissions based on a context
 * structure pair for the permissions in a particular class.
 *
 * @policydb: policy database to evaluate against
 * @scontext: source context
 * @tcontext: target context
 * @tclass:   policy class value (1-based); an out-of-range value logs a
 *            rate-limited warning and leaves @avd at its reset state
 * @avd:      output decision; reset first, then filled in
 * @xperms:   optional output for extended permissions (may be NULL)
 *
 * The decision is built in four ordered steps: collect the type
 * enforcement rules (including conditional ones), strip permissions whose
 * constraints fail, strip process transition permissions when the role
 * change is not allowed, and finally apply type bounds.
 */
static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms)
{
        struct constraint_node *constraint;
        struct role_allow *ra;
        struct avtab_key avkey;
        struct avtab_node *node;
        struct class_datum *tclass_datum;
        struct ebitmap *sattr, *tattr;
        struct ebitmap_node *snode, *tnode;
        unsigned int i, j;

        /*
         * Reset the outputs: nothing allowed, nothing in auditallow, and
         * auditdeny with every bit set (rules below can only clear bits).
         */
        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
        if (xperms) {
                memset(&xperms->drivers, 0, sizeof(xperms->drivers));
                xperms->len = 0;
        }

        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                if (printk_ratelimit())
                        pr_warn("SELinux:  Invalid class %hu\n", tclass);
                return;
        }

        tclass_datum = policydb->class_val_to_struct[tclass - 1];

        /*
         * If a specific type enforcement rule was defined for
         * this permission check, then use it.
         *
         * Every (i, j) pair of bits set in the source and target types'
         * attribute maps is looked up in the avtab.
         */
        avkey.target_class = tclass;
        avkey.specified = AVTAB_AV | AVTAB_XPERMS;
        sattr = &policydb->type_attr_map_array[scontext->type - 1];
        tattr = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(sattr, snode, i) {
                ebitmap_for_each_positive_bit(tattr, tnode, j) {
                        /* bitmap positions are 0-based, type values 1-based */
                        avkey.source_type = i + 1;
                        avkey.target_type = j + 1;
                        for (node = avtab_search_node(&policydb->te_avtab,
                                                      &avkey);
                             node;
                             node = avtab_search_node_next(node, avkey.specified)) {
                                /* allow rules accumulate; auditdeny rules narrow */
                                if (node->key.specified == AVTAB_ALLOWED)
                                        avd->allowed |= node->datum.u.data;
                                else if (node->key.specified == AVTAB_AUDITALLOW)
                                        avd->auditallow |= node->datum.u.data;
                                else if (node->key.specified == AVTAB_AUDITDENY)
                                        avd->auditdeny &= node->datum.u.data;
                                else if (xperms && (node->key.specified & AVTAB_XPERMS))
                                        services_compute_xperms_drivers(xperms, node);
                        }

                        /* Check conditional av table for additional permissions */
                        cond_compute_av(&policydb->te_cond_avtab, &avkey,
                                        avd, xperms);

                }
        }

        /*
         * Remove any permissions prohibited by a constraint (this includes
         * the MLS policy).
         *
         * A constraint is only evaluated if it covers at least one
         * permission that is still allowed.
         */
        constraint = tclass_datum->constraints;
        while (constraint) {
                if ((constraint->permissions & (avd->allowed)) &&
                    !constraint_expr_eval(policydb, scontext, tcontext, NULL,
                                          constraint->expr)) {
                        avd->allowed &= ~(constraint->permissions);
                }
                constraint = constraint->next;
        }

        /*
         * If checking process transition permission and the
         * role is changing, then check the (current_role, new_role)
         * pair.
         */
        if (tclass == policydb->process_class &&
            (avd->allowed & policydb->process_trans_perms) &&
            scontext->role != tcontext->role) {
                for (ra = policydb->role_allow; ra; ra = ra->next) {
                        if (scontext->role == ra->role &&
                            tcontext->role == ra->new_role)
                                break;
                }
                /* ra is NULL here iff no role_allow entry matched the pair */
                if (!ra)
                        avd->allowed &= ~policydb->process_trans_perms;
        }

        /*
         * If the given source and target types have boundary
         * constraint, lazy checks have to mask any violated
         * permission and notice it to userspace via audit.
         */
        type_attribute_bounds_av(policydb, scontext, tcontext,
                                 tclass, avd);
}

/*
 * Report a failed validatetrans constraint and choose the return value.
 *
 * The old, new and task sidtab entries are converted to context strings
 * and logged in an AUDIT_SELINUX_ERR record.  If any conversion fails,
 * no record is written.  Returns 0 when enforcing_enabled() is false for
 * @state and -EPERM otherwise.
 */
static int security_validtrans_handle_fail(struct selinux_state *state,
                                        struct selinux_policy *policy,
                                        struct sidtab_entry *oentry,
                                        struct sidtab_entry *nentry,
                                        struct sidtab_entry *tentry,
                                        u16 tclass)
{
        struct policydb *pdb = &policy->policydb;
        struct sidtab *tab = policy->sidtab;
        char *old_str = NULL, *new_str = NULL, *task_str = NULL;
        u32 old_len, new_len, task_len;

        /* Short-circuiting keeps the old -> new -> task conversion order. */
        if (!sidtab_entry_to_string(pdb, tab, oentry, &old_str, &old_len) &&
            !sidtab_entry_to_string(pdb, tab, nentry, &new_str, &new_len) &&
            !sidtab_entry_to_string(pdb, tab, tentry, &task_str, &task_len)) {
                audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "op=security_validate_transition seresult=denied"
                          " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s",
                          old_str, new_str, task_str,
                          sym_name(pdb, SYM_CLASSES, tclass-1));
        }

        /* Strings that were never produced are still NULL here. */
        kfree(old_str);
        kfree(new_str);
        kfree(task_str);

        return enforcing_enabled(state) ? -EPERM : 0;
}

/*
 * Evaluate the validatetrans constraints of a class for an
 * (old, new, task) SID triple.
 *
 * @state: SELinux state whose current policy is consulted
 * @oldsid: SID of the old context
 * @newsid: SID of the new context
 * @tasksid: SID of the task context
 * @orig_tclass: class value; passed through unmap_class() unless @user
 * @user: when true, @orig_tclass is used as-is and a failed constraint
 *        yields -EPERM directly, without auditing
 *
 * Returns 0 if the policy is not initialized yet or every constraint
 * evaluates true, -EINVAL for an out-of-range class or an unknown SID.
 * On the first failing constraint it returns -EPERM when @user is set,
 * otherwise the result of security_validtrans_handle_fail().
 */
static int security_compute_validatetrans(struct selinux_state *state,
                                          u32 oldsid, u32 newsid, u32 tasksid,
                                          u16 orig_tclass, bool user)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *oentry;
        struct sidtab_entry *nentry;
        struct sidtab_entry *tentry;
        struct class_datum *tclass_datum;
        struct constraint_node *constraint;
        u16 tclass;
        int rc = 0;

        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!user)
                tclass = unmap_class(&policy->map, orig_tclass);
        else
                tclass = orig_tclass;

        /* Class values are 1-based indexes into class_val_to_struct[]. */
        if (!tclass || tclass > policydb->p_classes.nprim) {
                rc = -EINVAL;
                goto out;
        }
        tclass_datum = policydb->class_val_to_struct[tclass - 1];

        /* SIDs are u32, so they are printed with %u. */
        oentry = sidtab_search_entry(sidtab, oldsid);
        if (!oentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, oldsid);
                rc = -EINVAL;
                goto out;
        }

        nentry = sidtab_search_entry(sidtab, newsid);
        if (!nentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, newsid);
                rc = -EINVAL;
                goto out;
        }

        tentry = sidtab_search_entry(sidtab, tasksid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, tasksid);
                rc = -EINVAL;
                goto out;
        }

        /* Stop at the first constraint that does not hold. */
        constraint = tclass_datum->validatetrans;
        while (constraint) {
                if (!constraint_expr_eval(policydb, &oentry->context,
                                          &nentry->context, &tentry->context,
                                          constraint->expr)) {
                        if (user)
                                rc = -EPERM;
                        else
                                rc = security_validtrans_handle_fail(state,
                                                                policy,
                                                                oentry,
                                                                nentry,
                                                                tentry,
                                                                tclass);
                        goto out;
                }
                constraint = constraint->next;
        }

out:
        rcu_read_unlock();
        return rc;
}

/*
 * Run the validatetrans check with the "user" flag set: @tclass is
 * handed to security_compute_validatetrans() without class unmapping.
 */
int security_validate_transition_user(struct selinux_state *state,
                                      u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass)
{
        const bool from_user = true;

        return security_compute_validatetrans(state, oldsid, newsid, tasksid,
                                              tclass, from_user);
}

/*
 * Run the validatetrans check with the "user" flag clear:
 * @orig_tclass is translated by security_compute_validatetrans()
 * before use.
 */
int security_validate_transition(struct selinux_state *state,
                                 u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 orig_tclass)
{
        const bool from_user = false;

        return security_compute_validatetrans(state, oldsid, newsid, tasksid,
                                              orig_tclass, from_user);
}

/*
 * security_bounded_transition - check whether a transition stays
 * within the bounds of the current type.
 *
 * @state   : SELinux state whose current policy is consulted
 * @old_sid : current security identifier
 * @new_sid : destination security identifier
 *
 * Returns 0 if the policy is not initialized, if both SIDs have the
 * same type, or if following the ->bounds links from the type of
 * @new_sid reaches the type of @old_sid.  Returns -EINVAL for an
 * unrecognized SID, and -EPERM (after writing an audit record) when
 * the chain of bounds ends without reaching the type of @old_sid.
 */
int security_bounded_transition(struct selinux_state *state,
                                u32 old_sid, u32 new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *old_entry, *new_entry;
        struct type_datum *type;
        char *old_name = NULL;
        char *new_name = NULL;
        u32 length;
        int index;
        int rc = -EINVAL;

        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        old_entry = sidtab_search_entry(sidtab, old_sid);
        if (!old_entry) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, old_sid);
                goto out;
        }

        new_entry = sidtab_search_entry(sidtab, new_sid);
        if (!new_entry) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, new_sid);
                goto out;
        }

        /* type/domain unchanged */
        if (old_entry->context.type == new_entry->context.type) {
                rc = 0;
                goto out;
        }

        /* Follow the ->bounds links starting at the new type. */
        for (index = new_entry->context.type; ; index = type->bounds) {
                type = policydb->type_val_to_struct[index - 1];
                BUG_ON(!type);

                /* not bounded anymore */
                if (!type->bounds) {
                        rc = -EPERM;
                        break;
                }

                /* @new_sid is bounded by @old_sid: nothing to audit */
                if (type->bounds == old_entry->context.type) {
                        rc = 0;
                        goto out;
                }
        }

        /* Only the -EPERM case reaches this point. */
        if (!sidtab_entry_to_string(policydb, sidtab, old_entry,
                                    &old_name, &length) &&
            !sidtab_entry_to_string(policydb, sidtab, new_entry,
                                    &new_name, &length)) {
                audit_log(audit_context(),
                          GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "op=security_bounded_transition "
                          "seresult=denied "
                          "oldcontext=%s newcontext=%s",
                          old_name, new_name);
        }
        kfree(new_name);
        kfree(old_name);
out:
        rcu_read_unlock();

        return rc;
}

/*
 * Reset @avd to its starting state: nothing allowed, nothing
 * audit-allowed, every auditdeny bit set, no flags.  The sequence
 * number is taken from @policy->latest_granting, or 0 when @policy
 * is NULL.
 */
static void avd_init(struct selinux_policy *policy, struct av_decision *avd)
{
        avd->seqno = policy ? policy->latest_granting : 0;
        avd->flags = 0;
        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
}

void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
                                        struct avtab_node *node)
{
        unsigned int i;

        if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
                if (xpermd->driver != node->datum.u.xperms->driver)
                        return;
        } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
                if (!security_xperm_test(node->datum.u.xperms->perms.p,
                                        xpermd->driver))
                        return;
        } else {
                pr_warn_once(
                        "SELinux: unknown extended permission (%u) will be ignored\n",
                        node->datum.u.xperms->specified);
                return;
        }

        if (node->key.specified == AVTAB_XPERMS_ALLOWED) {
                xpermd->used |= XPERMS_ALLOWED;
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
                        memset(xpermd->allowed->p, 0xff,
                                        sizeof(xpermd->allowed->p));
                }
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
                        for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++)
                                xpermd->allowed->p[i] |=
                                        node->datum.u.xperms->perms.p[i];
                }
        } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) {
                xpermd->used |= XPERMS_AUDITALLOW;
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
                        memset(xpermd->auditallow->p, 0xff,
                                        sizeof(xpermd->auditallow->p));
                }
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
                        for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++)
                                xpermd->auditallow->p[i] |=
                                        node->datum.u.xperms->perms.p[i];
                }
        } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) {
                xpermd->used |= XPERMS_DONTAUDIT;
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
                        memset(xpermd->dontaudit->p, 0xff,
                                        sizeof(xpermd->dontaudit->p));
                }
                if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
                        for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++)
                                xpermd->dontaudit->p[i] |=
                                        node->datum.u.xperms->perms.p[i];
                }
        } else {
                pr_warn_once("SELinux: unknown specified key (%u)\n",
                             node->key.specified);
        }
}

/*
 * Compute the extended permission decision for one driver value.
 *
 * @state: SELinux state whose current policy is consulted
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @orig_tclass: target class, translated with unmap_class() before use
 * @driver: driver value the decision is computed for
 * @xpermd: decision to fill in; its three bitmaps are cleared first
 *
 * Every bit of the allowed bitmap is set when the policy is not
 * initialized, or when @orig_tclass is non-zero, cannot be unmapped
 * and the policy has allow_unknown set.  An unrecognized SID or an
 * invalid class leaves the cleared decision in place.
 */
void security_compute_xperms_decision(struct selinux_state *state,
                                      u32 ssid,
                                      u32 tsid,
                                      u16 orig_tclass,
                                      u8 driver,
                                      struct extended_perms_decision *xpermd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext, *tcontext;
        struct avtab_key avkey;
        struct avtab_node *node;
        struct ebitmap *sattr, *tattr;
        struct ebitmap_node *snode, *tnode;
        unsigned int i, j;

        xpermd->driver = driver;
        xpermd->used = 0;
        memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
        memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
        memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));

        rcu_read_lock();
        if (!selinux_initialized(state))
                goto allow;

        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32, so they are printed with %u. */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }

        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                pr_warn_ratelimited("SELinux:  Invalid class %hu\n", tclass);
                goto out;
        }

        /*
         * Visit every (source, target) pair taken from the bits set in
         * the type_attr_map_array entries of the two types and merge
         * the matching unconditional and conditional xperms rules.
         */
        avkey.target_class = tclass;
        avkey.specified = AVTAB_XPERMS;
        sattr = &policydb->type_attr_map_array[scontext->type - 1];
        tattr = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(sattr, snode, i) {
                ebitmap_for_each_positive_bit(tattr, tnode, j) {
                        avkey.source_type = i + 1;
                        avkey.target_type = j + 1;
                        for (node = avtab_search_node(&policydb->te_avtab,
                                                      &avkey);
                             node;
                             node = avtab_search_node_next(node, avkey.specified))
                                services_compute_xperms_decision(xpermd, node);

                        cond_compute_xperms(&policydb->te_cond_avtab,
                                                &avkey, xpermd);
                }
        }
out:
        rcu_read_unlock();
        return;
allow:
        memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
        goto out;
}

/**
 * security_compute_av - Compute access vector decisions.
 * @state: SELinux state whose current policy is consulted
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @orig_tclass: target security class, translated with unmap_class()
 * @avd: access vector decisions
 * @xperms: extended permissions; its length is reset to 0 on entry
 *
 * Compute a set of access vector decisions based on the
 * SID pair (@ssid, @tsid) for the permissions in @orig_tclass.
 *
 * @avd->allowed is set to all ones when the policy is not initialized,
 * or when @orig_tclass is non-zero, cannot be unmapped and the policy
 * has allow_unknown set.  An unrecognized SID leaves @avd as set up by
 * avd_init(), apart from the permissive flag that may already have
 * been set for the source type.
 */
void security_compute_av(struct selinux_state *state,
                         u32 ssid,
                         u32 tsid,
                         u16 orig_tclass,
                         struct av_decision *avd,
                         struct extended_perms *xperms)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext = NULL, *tcontext = NULL;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        avd_init(policy, avd);
        xperms->len = 0;
        if (!selinux_initialized(state))
                goto allow;

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32, so they are printed with %u. */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        /* permissive domain? */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }
        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  xperms);
        map_decision(&policy->map, orig_tclass, avd,
                     policydb->allow_unknown);
out:
        rcu_read_unlock();
        return;
allow:
        avd->allowed = 0xffffffff;
        goto out;
}

/*
 * Variant of security_compute_av() that uses @tclass directly: there
 * is no unmap_class()/map_decision() step and no extended permissions
 * are computed (NULL is passed to context_struct_compute_av()).
 *
 * @avd->allowed is set to all ones when the policy is not initialized,
 * or when @tclass is 0 and the policy has allow_unknown set.  An
 * unrecognized SID leaves @avd as set up by avd_init(), apart from the
 * permissive flag that may already have been set for the source type.
 */
void security_compute_av_user(struct selinux_state *state,
                              u32 ssid,
                              u32 tsid,
                              u16 tclass,
                              struct av_decision *avd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *scontext = NULL, *tcontext = NULL;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        avd_init(policy, avd);
        if (!selinux_initialized(state))
                goto allow;

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32, so they are printed with %u. */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        /* permissive domain? */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        if (unlikely(!tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }

        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  NULL);
out:
        rcu_read_unlock();
        return;
allow:
        avd->allowed = 0xffffffff;
        goto out;
}

/*
 * Produce the string form of `context'.
 *
 * `*scontext_len' always receives the length that is needed.  When
 * `scontext' is non-NULL, `*scontext' receives a newly allocated
 * string that the caller must free; when it is NULL only the length
 * is computed.  A context that carries its own string (context->len
 * is non-zero) is duplicated as-is; otherwise the string is built as
 * "user:role:type" followed by the MLS part.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int context_struct_to_string(struct policydb *p,
                                    struct context *context,
                                    char **scontext, u32 *scontext_len)
{
        const char *user, *role, *type;
        char *cursor;

        if (scontext)
                *scontext = NULL;
        *scontext_len = 0;

        /* The context carries its own string: hand out a copy. */
        if (context->len) {
                *scontext_len = context->len;
                if (!scontext)
                        return 0;
                *scontext = kstrdup(context->str, GFP_ATOMIC);
                return *scontext ? 0 : -ENOMEM;
        }

        user = sym_name(p, SYM_USERS, context->user - 1);
        role = sym_name(p, SYM_ROLES, context->role - 1);
        type = sym_name(p, SYM_TYPES, context->type - 1);

        /* Each name is counted with one extra byte. */
        *scontext_len += strlen(user) + 1;
        *scontext_len += strlen(role) + 1;
        *scontext_len += strlen(type) + 1;
        *scontext_len += mls_compute_context_len(p, context);

        if (!scontext)
                return 0;

        /* Allocate space for the context; caller must free this space. */
        cursor = kmalloc(*scontext_len, GFP_ATOMIC);
        if (!cursor)
                return -ENOMEM;
        *scontext = cursor;

        /* Write user, role and type, then let the MLS code append. */
        cursor += sprintf(cursor, "%s:%s:%s", user, role, type);
        mls_sid_to_context(p, context, &cursor);
        *cursor = 0;

        return 0;
}

/*
 * Get the context string for a sidtab entry.
 *
 * sidtab_sid2str_get() is tried first.  Only when it returns -ENOENT
 * is the string built from the entry's context, and a successfully
 * built string is then passed to sidtab_sid2str_put() (presumably a
 * string cache kept by the sidtab; confirm against sidtab.c).
 */
static int sidtab_entry_to_string(struct policydb *p,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext, u32 *scontext_len)
{
        int rc;

        rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len);
        if (rc == -ENOENT) {
                rc = context_struct_to_string(p, &entry->context, scontext,
                                              scontext_len);
                /* A length-only request (scontext == NULL) has no string. */
                if (rc == 0 && scontext != NULL)
                        sidtab_sid2str_put(sidtab, entry, *scontext,
                                           *scontext_len);
        }

        return rc;
}

#include "initial_sid_to_string.h"

/*
 * Call sidtab_hash_stats() on the current policy's sidtab with @page
 * and return its result.  Logs an error and returns -EINVAL if no
 * policy has been loaded yet.
 */
int security_sidtab_hash_stats(struct selinux_state *state, char *page)
{
        struct selinux_policy *active;
        int ret;

        if (selinux_initialized(state)) {
                rcu_read_lock();
                active = rcu_dereference(state->policy);
                ret = sidtab_hash_stats(active->sidtab, page);
                rcu_read_unlock();

                return ret;
        }

        pr_err("SELinux: %s:  called before initial load_policy\n",
               __func__);
        return -EINVAL;
}

/*
 * Return the initial_sid_to_string[] entry for @sid, or NULL when
 * @sid is greater than SECINITSID_NUM.
 */
const char *security_get_initial_sid_context(u32 sid)
{
        return unlikely(sid > SECINITSID_NUM) ? NULL
                                              : initial_sid_to_string[sid];
}

/*
 * Common implementation of the security_sid_to_context*() helpers.
 *
 * @state: SELinux state whose current policy is consulted
 * @sid: security identifier to convert
 * @scontext: receives an allocated string; may be NULL when only the
 *            length is wanted
 * @scontext_len: receives the length of the string
 * @force: look the SID up with sidtab_search_entry_force() instead of
 *         sidtab_search_entry()
 * @only_invalid: return 0 with *@scontext NULL and *@scontext_len 0
 *                when the entry's context.len is 0
 *
 * Before the first policy load only SIDs up to SECINITSID_NUM that
 * have an initial_sid_to_string[] entry can be converted.
 *
 * Returns 0 on success, -EINVAL for an unknown SID, or -ENOMEM.
 */
static int security_sid_to_context_core(struct selinux_state *state,
                                        u32 sid, char **scontext,
                                        u32 *scontext_len, int force,
                                        int only_invalid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *entry;
        int rc = 0;

        if (scontext)
                *scontext = NULL;
        *scontext_len  = 0;

        if (!selinux_initialized(state)) {
                if (sid <= SECINITSID_NUM) {
                        char *scontextp;
                        const char *s = initial_sid_to_string[sid];

                        if (!s)
                                return -EINVAL;
                        /* The reported length includes the NUL. */
                        *scontext_len = strlen(s) + 1;
                        if (!scontext)
                                return 0;
                        scontextp = kmemdup(s, *scontext_len, GFP_ATOMIC);
                        if (!scontextp)
                                return -ENOMEM;
                        *scontext = scontextp;
                        return 0;
                }
                /* SIDs are u32, so they are printed with %u. */
                pr_err("SELinux: %s:  called before initial "
                       "load_policy on unknown SID %u\n", __func__, sid);
                return -EINVAL;
        }
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (force)
                entry = sidtab_search_entry_force(sidtab, sid);
        else
                entry = sidtab_search_entry(sidtab, sid);
        if (!entry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, sid);
                rc = -EINVAL;
                goto out_unlock;
        }
        /* rc is still 0 here: the outputs stay NULL/0 as initialized. */
        if (only_invalid && !entry->context.len)
                goto out_unlock;

        rc = sidtab_entry_to_string(policydb, sidtab, entry, scontext,
                                    scontext_len);

out_unlock:
        rcu_read_unlock();
        return rc;

}

/**
 * security_sid_to_context - Obtain a context for a given SID.
 * @state: SELinux state whose current policy is consulted
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Write the string representation of the context associated with @sid
 * into a dynamically allocated string of the correct size.  Set @scontext
 * to point to this string and set @scontext_len to the length of the string.
 */
int security_sid_to_context(struct selinux_state *state,
                            u32 sid, char **scontext, u32 *scontext_len)
{
        const int force = 0;
        const int only_invalid = 0;

        return security_sid_to_context_core(state, sid, scontext,
                                            scontext_len, force,
                                            only_invalid);
}

/*
 * Same as security_sid_to_context(), but with the force flag set, so
 * the SID is looked up with sidtab_search_entry_force().
 */
int security_sid_to_context_force(struct selinux_state *state, u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 0;

        return security_sid_to_context_core(state, sid, scontext,
                                            scontext_len, force,
                                            only_invalid);
}

/**
 * security_sid_to_context_inval - Obtain a context for a given SID if it
 *                                 is invalid.
 * @state: SELinux state whose current policy is consulted
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Write the string representation of the context associated with @sid
 * into a dynamically allocated string of the correct size, but only if the
 * context is invalid in the current policy.  Set @scontext to point to
 * this string (or NULL if the context is valid) and set @scontext_len to
 * the length of the string (or 0 if the context is valid).
 */
int security_sid_to_context_inval(struct selinux_state *state, u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 1;

        return security_sid_to_context_core(state, sid, scontext,
                                            scontext_len, force,
                                            only_invalid);
}

/*
 * Parse a "user:role:type[...]" string into `ctx'.
 *
 * Caveat:  Mutates scontext - each field separator that is consumed is
 * overwritten with a NUL.
 *
 * User and role must each be followed by ':'.  The type runs up to
 * the next ':' or the end of the string, and whatever follows is
 * handed to mls_context_to_sid().  On any failure `ctx' is destroyed
 * and a non-zero error is returned (-EINVAL for a malformed string, an
 * unknown user/role/type, a type that is an attribute, or a context
 * that is not valid in the policy).
 */
static int string_to_context_struct(struct policydb *pol,
                                    struct sidtab *sidtabp,
                                    char *scontext,
                                    struct context *ctx,
                                    u32 def_sid)
{
        struct user_datum *user_entry;
        struct role_datum *role_entry;
        struct type_datum *type_entry;
        char *field, *cursor, delim;
        int rc = -EINVAL;

        context_init(ctx);

        /* User: everything up to the first ':'. */
        field = scontext;
        for (cursor = field; *cursor && *cursor != ':'; cursor++)
                ;
        if (!*cursor)
                goto fail;
        *cursor++ = 0;

        user_entry = symtab_search(&pol->p_users, field);
        if (!user_entry)
                goto fail;
        ctx->user = user_entry->value;

        /* Role: everything up to the next ':'. */
        field = cursor;
        for (; *cursor && *cursor != ':'; cursor++)
                ;
        if (!*cursor)
                goto fail;
        *cursor++ = 0;

        role_entry = symtab_search(&pol->p_roles, field);
        if (!role_entry)
                goto fail;
        ctx->role = role_entry->value;

        /*
         * Type: may end at ':' or at the end of the string.  The
         * terminating character is remembered for the MLS parser.
         */
        field = cursor;
        for (; *cursor && *cursor != ':'; cursor++)
                ;
        delim = *cursor;
        *cursor++ = 0;

        type_entry = symtab_search(&pol->p_types, field);
        if (!type_entry || type_entry->attribute)
                goto fail;
        ctx->type = type_entry->value;

        rc = mls_context_to_sid(pol, delim, cursor, ctx, sidtabp, def_sid);
        if (rc)
                goto fail;

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(pol, ctx)) {
                rc = -EINVAL;
                goto fail;
        }

        return 0;

fail:
        context_destroy(ctx);
        return rc;
}

/*
 * Common implementation of the security_context_to_sid*() helpers.
 *
 * @state: SELinux state whose current policy is consulted
 * @scontext: context string; need not be NUL-terminated
 * @scontext_len: number of bytes of @scontext to use
 * @sid: receives the resulting SID
 * @def_sid: passed on to string_to_context_struct()
 * @gfp_flags: allocation flags for the working copies of the string
 * @force: when set, a string that fails to parse with -EINVAL is
 *         stored in uninterpreted form instead of being rejected
 *
 * Before the first policy load the string is only compared with the
 * initial SID names, and SECINITSID_KERNEL is used when none matches.
 *
 * Returns 0 on success, -EINVAL for an empty or invalid context, or
 * -ENOMEM.
 */
static int security_context_to_sid_core(struct selinux_state *state,
                                        const char *scontext, u32 scontext_len,
                                        u32 *sid, u32 def_sid, gfp_t gfp_flags,
                                        int force)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        char *scontext2, *str = NULL;
        struct context context;
        int rc = 0;

        /* An empty security context is never valid. */
        if (!scontext_len)
                return -EINVAL;

        /* Copy the string to allow changes and ensure a NUL terminator */
        scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags);
        if (!scontext2)
                return -ENOMEM;

        if (!selinux_initialized(state)) {
                int i;

                /*
                 * NOTE(review): this loop stops before SECINITSID_NUM,
                 * while security_sid_to_context_core() accepts
                 * sid <= SECINITSID_NUM - confirm that excluding the
                 * last initial SID here is intended.
                 */
                for (i = 1; i < SECINITSID_NUM; i++) {
                        const char *s = initial_sid_to_string[i];

                        if (s && !strcmp(s, scontext2)) {
                                *sid = i;
                                goto out;
                        }
                }
                *sid = SECINITSID_KERNEL;
                goto out;
        }
        *sid = SECSID_NULL;

        if (force) {
                /* Save another copy for storing in uninterpreted form */
                rc = -ENOMEM;
                str = kstrdup(scontext2, gfp_flags);
                if (!str)
                        goto out;
        }
retry:
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;
        rc = string_to_context_struct(policydb, sidtab, scontext2,
                                      &context, def_sid);
        if (rc == -EINVAL && force) {
                /* The context takes ownership of the saved copy. */
                context.str = str;
                context.len = strlen(str) + 1;
                str = NULL;
        } else if (rc)
                goto out_unlock;
        rc = sidtab_context_to_sid(sidtab, &context, sid);
        if (rc == -ESTALE) {
                rcu_read_unlock();
                /* Take the saved copy back so it survives the destroy. */
                if (context.str) {
                        str = context.str;
                        context.str = NULL;
                }
                context_destroy(&context);
                /*
                 * string_to_context_struct() overwrote the field
                 * separators in scontext2 with NULs.  Rebuild the
                 * working copy from the caller's string so the retry
                 * parses the full context again instead of a
                 * truncated one.
                 */
                memcpy(scontext2, scontext, scontext_len);
                scontext2[scontext_len] = '\0';
                goto retry;
        }
        context_destroy(&context);
out_unlock:
        rcu_read_unlock();
out:
        kfree(scontext2);
        kfree(str);
        return rc;
}

/**
 * security_context_to_sid - Obtain a SID for a given security context.
 * @state: SELinux state whose current policy is consulted
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @gfp: context for the allocation
 *
 * Obtains a SID associated with the security context that
 * has the string representation specified by @scontext.
 * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid(struct selinux_state *state,
                            const char *scontext, u32 scontext_len, u32 *sid,
                            gfp_t gfp)
{
        const u32 def_sid = SECSID_NULL;
        const int force = 0;

        return security_context_to_sid_core(state, scontext, scontext_len,
                                            sid, def_sid, gfp, force);
}

/*
 * Convenience form of security_context_to_sid() for a NUL-terminated
 * @scontext: the length passed on is strlen(@scontext).
 */
int security_context_str_to_sid(struct selinux_state *state,
                                const char *scontext, u32 *sid, gfp_t gfp)
{
        u32 scontext_len = strlen(scontext);

        return security_context_to_sid(state, scontext, scontext_len,
                                       sid, gfp);
}

/**
 * security_context_to_sid_default - Obtain a SID for a given security context,
 * falling back to specified default if needed.
 *
 * @state: SELinux state whose current policy is consulted
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @def_sid: default SID to assign on error
 * @gfp_flags: context for the allocation
 *
 * Obtains a SID associated with the security context that
 * has the string representation specified by @scontext.
 * The default SID is passed to the MLS layer to be used to allow
 * kernel labeling of the MLS field if the MLS field is not present
 * (for upgrading to MLS without full relabel).
 * Implicitly forces adding of the context even if it cannot be mapped yet.
 * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid_default(struct selinux_state *state,
                                    const char *scontext, u32 scontext_len,
                                    u32 *sid, u32 def_sid, gfp_t gfp_flags)
{
        const int force = 1;

        return security_context_to_sid_core(state, scontext, scontext_len,
                                            sid, def_sid, gfp_flags, force);
}

/*
 * Same as security_context_to_sid(), but with the force flag set and
 * GFP_KERNEL allocations, so a context string that does not parse is
 * still added in uninterpreted form.
 */
int security_context_to_sid_force(struct selinux_state *state,
                                  const char *scontext, u32 scontext_len,
                                  u32 *sid)
{
        const u32 def_sid = SECSID_NULL;
        const int force = 1;

        return security_context_to_sid_core(state, scontext, scontext_len,
                                            sid, def_sid, GFP_KERNEL, force);
}

/*
 * Report that security_compute_sid() produced a context that is not valid
 * under the current policy, and decide whether the caller may proceed.
 *
 * The source entry, target entry and the rejected context are rendered as
 * strings and written to an AUDIT_SELINUX_ERR record.  Logging is best
 * effort: if any string conversion fails, or no audit buffer is available,
 * the record is skipped.  The return value does not depend on whether the
 * record was written.
 *
 * Returns 0 when enforcing mode is off, -EACCES otherwise.
 */
static int compute_sid_handle_invalid_context(
        struct selinux_state *state,
        struct selinux_policy *policy,
        struct sidtab_entry *sentry,
        struct sidtab_entry *tentry,
        u16 tclass,
        struct context *newcontext)
{
        struct policydb *policydb = &policy->policydb;
        struct sidtab *sidtab = policy->sidtab;
        char *s = NULL, *t = NULL, *n = NULL;
        u32 slen, tlen, nlen;
        struct audit_buffer *ab;

        if (sidtab_entry_to_string(policydb, sidtab, sentry, &s, &slen))
                goto out;
        if (sidtab_entry_to_string(policydb, sidtab, tentry, &t, &tlen))
                goto out;
        if (context_struct_to_string(policydb, newcontext, &n, &nlen))
                goto out;
        ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR);
        /* No buffer (auditing off, rate limited, OOM): nothing to emit. */
        if (!ab)
                goto out;
        audit_log_format(ab,
                         "op=security_compute_sid invalid_context=");
        /* no need to record the NUL with untrusted strings */
        audit_log_n_untrustedstring(ab, n, nlen - 1);
        /*
         * NOTE(review): tclass - 1 is used as a name-table index, which
         * assumes tclass >= 1 here -- confirm against the caller.
         */
        audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s",
                         s, t, sym_name(policydb, SYM_CLASSES, tclass-1));
        audit_log_end(ab);
out:
        /* kfree(NULL) is a no-op, so partially filled state is fine. */
        kfree(s);
        kfree(t);
        kfree(n);
        if (!enforcing_enabled(state))
                return 0;
        return -EACCES;
}

/*
 * Apply a filename-based type transition to @newcontext, if one matches.
 *
 * The key (@ttype, @tclass, @objname) is looked up in the policy's
 * filename transition table and the returned datum chain is walked; the
 * first datum whose source-type bitmap has bit (@stype - 1) set provides
 * the new type.  @newcontext is left unchanged when nothing matches.
 */
static void filename_compute_type(struct policydb *policydb,
                                  struct context *newcontext,
                                  u32 stype, u32 ttype, u16 tclass,
                                  const char *objname)
{
        struct filename_trans_key key;
        struct filename_trans_datum *cur;

        /*
         * Filename transition rules tend to be concentrated in a handful
         * of directories (/dev, /var/run, ...).  The ttype bitmap lets us
         * bail out before doing any table search when this target type
         * has no such rules at all.
         */
        if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype))
                return;

        key.ttype = ttype;
        key.tclass = tclass;
        key.name = objname;

        for (cur = policydb_filenametr_search(policydb, &key); cur;
             cur = cur->next) {
                if (!ebitmap_get_bit(&cur->stypes, stype - 1))
                        continue;

                newcontext->type = cur->otype;
                return;
        }
}

/*
 * Common worker behind the transition/member/change SID computations.
 *
 * @state:       SELinux state whose current policy is consulted
 * @ssid:        source SID
 * @tsid:        target SID
 * @orig_tclass: object class as supplied by the caller; it is passed
 *               through unmap_class() when @kern is true and used as-is
 *               otherwise
 * @specified:   which rule kind to apply (AVTAB_TRANSITION, AVTAB_CHANGE
 *               or AVTAB_MEMBER)
 * @objname:     optional object name; when non-NULL the filename
 *               transition rules are consulted as well
 * @out_sid:     receives the computed SID
 * @kern:        selects how @orig_tclass is interpreted (see above)
 *
 * Before the first policy load no policy is consulted: @ssid is returned
 * for SECCLASS_PROCESS and @tsid for every other class.
 *
 * Otherwise a new context is assembled under rcu_read_lock() from the
 * source and target contexts (user, role, type, then MLS), validated and
 * converted to a SID.  If sidtab_context_to_sid() reports -ESTALE the
 * whole computation is restarted with a freshly dereferenced policy.
 *
 * Returns 0 on success, -EINVAL if @ssid or @tsid is not in the sidtab,
 * or the error reported by mls_compute_sid(),
 * compute_sid_handle_invalid_context() or sidtab_context_to_sid().
 */
static int security_compute_sid(struct selinux_state *state,
                                u32 ssid,
                                u32 tsid,
                                u16 orig_tclass,
                                u32 specified,
                                const char *objname,
                                u32 *out_sid,
                                bool kern)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct class_datum *cladatum;
        struct context *scontext, *tcontext, newcontext;
        struct sidtab_entry *sentry, *tentry;
        struct avtab_key avkey;
        struct avtab_datum *avdatum;
        struct avtab_node *node;
        u16 tclass;
        int rc = 0;
        bool sock;

        /* No policy loaded yet: pick one of the input SIDs by class. */
        if (!selinux_initialized(state)) {
                switch (orig_tclass) {
                case SECCLASS_PROCESS: /* kernel value */
                        *out_sid = ssid;
                        break;
                default:
                        *out_sid = tsid;
                        break;
                }
                goto out;
        }

retry:
        /* Everything below is recomputed from scratch on each attempt. */
        cladatum = NULL;
        context_init(&newcontext);

        rcu_read_lock();

        policy = rcu_dereference(state->policy);

        /*
         * Derive the class value used for policy lookups (tclass) and
         * whether the class is a socket class (sock).
         */
        if (kern) {
                tclass = unmap_class(&policy->map, orig_tclass);
                sock = security_is_socket_class(orig_tclass);
        } else {
                tclass = orig_tclass;
                sock = security_is_socket_class(map_class(&policy->map,
                                                          tclass));
        }

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        sentry = sidtab_search_entry(sidtab, ssid);
        if (!sentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                rc = -EINVAL;
                goto out_unlock;
        }
        tentry = sidtab_search_entry(sidtab, tsid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                rc = -EINVAL;
                goto out_unlock;
        }

        scontext = &sentry->context;
        tcontext = &tentry->context;

        /* cladatum stays NULL when tclass is 0 or beyond the known classes. */
        if (tclass && tclass <= policydb->p_classes.nprim)
                cladatum = policydb->class_val_to_struct[tclass - 1];

        /* Set the user identity. */
        switch (specified) {
        case AVTAB_TRANSITION:
        case AVTAB_CHANGE:
                if (cladatum && cladatum->default_user == DEFAULT_TARGET) {
                        newcontext.user = tcontext->user;
                } else {
                        /* notice this gets both DEFAULT_SOURCE and unset */
                        /* Use the process user identity. */
                        newcontext.user = scontext->user;
                }
                break;
        case AVTAB_MEMBER:
                /* Use the related object owner. */
                newcontext.user = tcontext->user;
                break;
        }

        /* Set the role to default values. */
        if (cladatum && cladatum->default_role == DEFAULT_SOURCE) {
                newcontext.role = scontext->role;
        } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) {
                newcontext.role = tcontext->role;
        } else {
                /* No class default: processes and sockets inherit the source role. */
                if ((tclass == policydb->process_class) || sock)
                        newcontext.role = scontext->role;
                else
                        newcontext.role = OBJECT_R_VAL;
        }

        /* Set the type to default values. */
        if (cladatum && cladatum->default_type == DEFAULT_SOURCE) {
                newcontext.type = scontext->type;
        } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) {
                newcontext.type = tcontext->type;
        } else {
                if ((tclass == policydb->process_class) || sock) {
                        /* Use the type of process. */
                        newcontext.type = scontext->type;
                } else {
                        /* Use the type of the related object. */
                        newcontext.type = tcontext->type;
                }
        }

        /* Look for a type transition/member/change rule. */
        avkey.source_type = scontext->type;
        avkey.target_type = tcontext->type;
        avkey.target_class = tclass;
        avkey.specified = specified;
        avdatum = avtab_search(&policydb->te_avtab, &avkey);

        /* If no permanent rule, also check for enabled conditional rules */
        if (!avdatum) {
                node = avtab_search_node(&policydb->te_cond_avtab, &avkey);
                for (; node; node = avtab_search_node_next(node, specified)) {
                        if (node->key.specified & AVTAB_ENABLED) {
                                avdatum = &node->datum;
                                break;
                        }
                }
        }

        if (avdatum) {
                /* Use the type from the type transition/member/change rule. */
                newcontext.type = avdatum->u.data;
        }

        /* if we have a objname this is a file trans check so check those rules */
        if (objname)
                filename_compute_type(policydb, &newcontext, scontext->type,
                                      tcontext->type, tclass, objname);

        /* Check for class-specific changes. */
        if (specified & AVTAB_TRANSITION) {
                /* Look for a role transition rule. */
                struct role_trans_datum *rtd;
                struct role_trans_key rtk = {
                        .role = scontext->role,
                        .type = tcontext->type,
                        .tclass = tclass,
                };

                rtd = policydb_roletr_search(policydb, &rtk);
                if (rtd)
                        newcontext.role = rtd->new_role;
        }

        /* Set the MLS attributes.
           This is done last because it may allocate memory. */
        rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified,
                             &newcontext, sock);
        if (rc)
                goto out_unlock;

        /* Check the validity of the context. */
        if (!policydb_context_isvalid(policydb, &newcontext)) {
                /* A zero return here means: log it, but carry on anyway. */
                rc = compute_sid_handle_invalid_context(state, policy, sentry,
                                                        tentry, tclass,
                                                        &newcontext);
                if (rc)
                        goto out_unlock;
        }
        /* Obtain the sid for the context. */
        rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
        if (rc == -ESTALE) {
                /* Drop the read lock and the half-built context, then redo. */
                rcu_read_unlock();
                context_destroy(&newcontext);
                goto retry;
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcontext);
out:
        return rc;
}

/**
 * security_transition_sid - Compute the SID for a new subject/object.
 * @state: SELinux state
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @qstr: optional name of the new object; may be NULL
 * @out_sid: security identifier for new subject/object
 *
 * Compute a SID to use for labeling a new subject or object in the
 * class @tclass based on a SID pair (@ssid, @tsid).  When @qstr is
 * given, its name is handed on as the object name.  The computation is
 * requested with the "kern" flag set to true.
 *
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the new SID was
 * computed successfully.
 */
int security_transition_sid(struct selinux_state *state,
                            u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid)
{
        const char *objname = NULL;

        if (qstr)
                objname = qstr->name;

        return security_compute_sid(state, ssid, tsid, tclass,
                                    AVTAB_TRANSITION, objname, out_sid, true);
}

/*
 * Same computation as security_transition_sid(), but the object name is
 * supplied directly as a C string and security_compute_sid() is called
 * with its "kern" flag set to false.
 */
int security_transition_sid_user(struct selinux_state *state,
                                 u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid)
{
        const bool kern = false;

        return security_compute_sid(state, ssid, tsid, tclass,
                                    AVTAB_TRANSITION, objname, out_sid, kern);
}

/**
 * security_member_sid - Compute the SID for member selection.
 * @state: SELinux state
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier for selected member
 *
 * Compute a SID to use when selecting a member of a polyinstantiated
 * object of class @tclass based on a SID pair (@ssid, @tsid).  No object
 * name is involved and the "kern" flag of the computation is false.
 *
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_member_sid(struct selinux_state *state,
                        u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;

        return security_compute_sid(state, ssid, tsid, tclass, AVTAB_MEMBER,
                                    no_objname, out_sid, false);
}

/**
 * security_change_sid - Compute the SID for object relabeling.
 * @state: SELinux state
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier for the relabeled object
 *
 * Compute a SID to use for relabeling an object of class @tclass
 * based on a SID pair (@ssid, @tsid).  No object name is involved and
 * the "kern" flag of the computation is false.
 *
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_change_sid(struct selinux_state *state,
                        u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;

        return security_compute_sid(state, ssid, tsid, tclass, AVTAB_CHANGE,
                                    no_objname, out_sid, false);
}

/*
 * Decide what to do with a context that is invalid under the new policy.
 *
 * In enforcing mode the context is rejected with -EINVAL.  Otherwise the
 * context is tolerated (0 is returned) and, if it can be rendered as a
 * string, a warning naming it is printed.
 */
static inline int convert_context_handle_invalid_context(
        struct selinux_state *state,
        struct policydb *policydb,
        struct context *context)
{
        char *ctx_str;
        u32 ctx_len;

        if (enforcing_enabled(state))
                return -EINVAL;

        /* Failing to build the string only costs us the warning. */
        if (context_struct_to_string(policydb, context, &ctx_str, &ctx_len))
                return 0;

        pr_warn("SELinux:  Context %s would be invalid if enforcing\n",
                ctx_str);
        kfree(ctx_str);
        return 0;
}

/*
 * Convert the values in the security context
 * structure `oldc' from the values specified
 * in the policy `p->oldp' to the values specified
 * in the policy `p->newp', storing the new context
 * in `newc'.  Verify that the context is valid
 * under the new policy.
 *
 * `p' points to a struct convert_context_args; `gfp_flags' is used for
 * the string duplication done here.
 *
 * Two kinds of input are handled:
 *  - `oldc' carries only a string (oldc->str set): try to parse it
 *    under the new policy; if that yields -EINVAL the string is kept
 *    in `newc' for a later attempt.
 *  - `oldc' is a structured context: user, role and type are looked up
 *    by name in the new policy and the MLS fields converted.  If any
 *    step fails, the old context is rendered as a string and stored in
 *    `newc' instead.
 *
 * Returns 0 both when the context was converted and when it was
 * preserved as a string; a negative errno is returned only when the
 * string duplication, a non-EINVAL parse error, or the string
 * rendering fails.
 */
static int convert_context(struct context *oldc, struct context *newc, void *p,
                           gfp_t gfp_flags)
{
        struct convert_context_args *args;
        struct ocontext *oc;
        struct role_datum *role;
        struct type_datum *typdatum;
        struct user_datum *usrdatum;
        char *s;
        u32 len;
        int rc;

        args = p;

        /* String-only context: see whether the new policy can map it. */
        if (oldc->str) {
                /* Work on a copy; the parser is allowed to modify it. */
                s = kstrdup(oldc->str, gfp_flags);
                if (!s)
                        return -ENOMEM;

                rc = string_to_context_struct(args->newp, NULL, s,
                                              newc, SECSID_NULL);
                if (rc == -EINVAL) {
                        /*
                         * Retain string representation for later mapping.
                         *
                         * IMPORTANT: We need to copy the contents of oldc->str
                         * back into s again because string_to_context_struct()
                         * may have garbled it.
                         */
                        memcpy(s, oldc->str, oldc->len);
                        context_init(newc);
                        /* Ownership of s moves to newc. */
                        newc->str = s;
                        newc->len = oldc->len;
                        return 0;
                }
                /* Parsed (or failed for another reason): copy no longer needed. */
                kfree(s);
                if (rc) {
                        /* Other error condition, e.g. ENOMEM. */
                        pr_err("SELinux:   Unable to map context %s, rc = %d.\n",
                               oldc->str, -rc);
                        return rc;
                }
                pr_info("SELinux:  Context %s became valid (mapped).\n",
                        oldc->str);
                return 0;
        }

        context_init(newc);

        /*
         * Each lookup below goes by symbol name: the old value is turned
         * into its name under the old policy, and that name is searched
         * in the new policy's symbol table.
         */

        /* Convert the user. */
        rc = -EINVAL;
        usrdatum = symtab_search(&args->newp->p_users,
                                 sym_name(args->oldp,
                                          SYM_USERS, oldc->user - 1));
        if (!usrdatum)
                goto bad;
        newc->user = usrdatum->value;

        /* Convert the role. */
        rc = -EINVAL;
        role = symtab_search(&args->newp->p_roles,
                             sym_name(args->oldp, SYM_ROLES, oldc->role - 1));
        if (!role)
                goto bad;
        newc->role = role->value;

        /* Convert the type. */
        rc = -EINVAL;
        typdatum = symtab_search(&args->newp->p_types,
                                 sym_name(args->oldp,
                                          SYM_TYPES, oldc->type - 1));
        if (!typdatum)
                goto bad;
        newc->type = typdatum->value;

        /* Convert the MLS fields if dealing with MLS policies */
        if (args->oldp->mls_enabled && args->newp->mls_enabled) {
                rc = mls_convert_context(args->oldp, args->newp, oldc, newc);
                if (rc)
                        goto bad;
        } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) {
                /*
                 * Switching between non-MLS and MLS policy:
                 * ensure that the MLS fields of the context for all
                 * existing entries in the sidtab are filled in with a
                 * suitable default value, likely taken from one of the
                 * initial SIDs.
                 */
                /* Find the initial-SID entry for SECINITSID_UNLABELED. */
                oc = args->newp->ocontexts[OCON_ISID];
                while (oc && oc->sid[0] != SECINITSID_UNLABELED)
                        oc = oc->next;
                rc = -EINVAL;
                if (!oc) {
                        pr_err("SELinux:  unable to look up"
                                " the initial SIDs list\n");
                        goto bad;
                }
                rc = mls_range_set(newc, &oc->context[0].range);
                if (rc)
                        goto bad;
        }

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(args->newp, newc)) {
                /* The message is built from the OLD context and policy. */
                rc = convert_context_handle_invalid_context(args->state,
                                                        args->oldp,
                                                        oldc);
                if (rc)
                        goto bad;
        }

        return 0;
bad:
        /* Map old representation to string and save it. */
        rc = context_struct_to_string(args->oldp, oldc, &s, &len);
        if (rc)
                return rc;
        /* Drop whatever was built so far before attaching the string. */
        context_destroy(newc);
        newc->str = s;
        newc->len = len;
        pr_info("SELinux:  Context %s became invalid (unmapped).\n",
                newc->str);
        return 0;
}

/*
 * Publish the policy capabilities of @policy into @state and log them.
 *
 * Every slot of state->policycap[] is written with WRITE_ONCE() from the
 * matching bit of the policy's capability bitmap.  Each known capability
 * name is then logged with its value, and every set bit that lies beyond
 * the table of known names is reported as unknown.
 */
static void security_load_policycaps(struct selinux_state *state,
                                struct selinux_policy *policy)
{
        struct ebitmap *caps = &policy->policydb.policycaps;
        struct ebitmap_node *node;
        unsigned int i;

        /* Copy the capability bits into the state. */
        for (i = 0; i < ARRAY_SIZE(state->policycap); i++) {
                WRITE_ONCE(state->policycap[i],
                        ebitmap_get_bit(caps, i));
        }

        /* Log every capability we have a name for. */
        for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) {
                pr_info("SELinux:  policy capability %s=%d\n",
                        selinux_policycap_names[i],
                        ebitmap_get_bit(caps, i));
        }

        /* Report set bits that have no name in this kernel. */
        ebitmap_for_each_positive_bit(caps, node, i) {
                if (i < ARRAY_SIZE(selinux_policycap_names))
                        continue;

                pr_info("SELinux:  unknown policy capability %u\n",
                        i);
        }
}

static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy);

/*
 * Release a policy and everything it owns: the sidtab contents, the class
 * mapping array, the policy database, the sidtab structure itself and
 * finally the policy structure.  A NULL @policy is accepted and ignored.
 */
static void selinux_policy_free(struct selinux_policy *policy)
{
        struct sidtab *sidtab;

        if (policy == NULL)
                return;

        sidtab = policy->sidtab;

        sidtab_destroy(sidtab);
        kfree(policy->map.mapping);
        policydb_destroy(&policy->policydb);
        kfree(sidtab);
        kfree(policy);
}

/*
 * Free a policy through cond_policydb_destroy_dup() and then release the
 * policy structure.  The sidtab and class mapping are not touched here.
 */
static void selinux_policy_cond_free(struct selinux_policy *policy)
{
        struct policydb *db = &policy->policydb;

        cond_policydb_destroy_dup(db);
        kfree(policy);
}

/*
 * Abandon a policy load that was prepared but will not be committed.
 *
 * The pending conversion on the currently installed policy's sidtab is
 * cancelled, then the prepared policy and the conversion data held in
 * @load_state are freed.  The current policy pointer is read with
 * rcu_dereference_protected() against state->policy_mutex.
 */
void selinux_policy_cancel(struct selinux_state *state,
                           struct selinux_load_state *load_state)
{
        struct selinux_policy *current_policy =
                rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        sidtab_cancel_convert(current_policy->sidtab);

        selinux_policy_free(load_state->policy);
        kfree(load_state->convert_data);
}

/*
 * Tell the rest of the system that a policy with sequence number @seqno
 * is now in effect.
 *
 * Calls, in this order: the AVC reset, the netlink policy-load
 * notification, the status-page update, the NetLabel cache invalidation
 * and the XFRM policy-load notification.
 */
static void selinux_notify_policy_change(struct selinux_state *state,
                                        u32 seqno)
{
        /* Flush external caches and notify userspace of policy load */
        avc_ss_reset(state->avc, seqno);
        selnl_notify_policyload(seqno);
        selinux_status_update_policyload(state, seqno);
        selinux_netlbl_cache_invalidate();
        selinux_xfrm_notify_policyload();
}

/*
 * Make the policy prepared in @load_state the active policy of @state.
 *
 * Steps, in order:
 *  1. log an MLS on/off switch and assign the new granting sequence
 *     number (old + 1, or 1 when there was no previous policy);
 *  2. publish the new policy pointer, inside a sidtab freeze of the old
 *     policy when one exists;
 *  3. load the policy capabilities;
 *  4. on the very first load, mark SELinux initialized and complete init;
 *  5. wait for an RCU grace period, then free the old policy and the
 *     conversion data;
 *  6. notify everyone of the new sequence number.
 *
 * The current policy pointer is read with rcu_dereference_protected()
 * against state->policy_mutex.
 */
void selinux_policy_commit(struct selinux_state *state,
                           struct selinux_load_state *load_state)
{
        struct selinux_policy *newpolicy = load_state->policy;
        struct selinux_policy *oldpolicy;
        unsigned long flags;
        u32 seqno;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        if (!oldpolicy) {
                /* First policy ever: start the granting sequence at 1. */
                newpolicy->latest_granting = 1;
        } else {
                /* If switching between different policy types, log MLS status */
                if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Disabling MLS support...\n");
                else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Enabling MLS support...\n");

                /* Continue the granting sequence of the previous policy. */
                newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        }
        seqno = newpolicy->latest_granting;

        /* Install the new policy. */
        if (!oldpolicy) {
                rcu_assign_pointer(state->policy, newpolicy);
        } else {
                sidtab_freeze_begin(oldpolicy->sidtab, &flags);
                rcu_assign_pointer(state->policy, newpolicy);
                sidtab_freeze_end(oldpolicy->sidtab, &flags);
        }

        /* Load the policycaps from the new policy */
        security_load_policycaps(state, newpolicy);

        /*
         * The first successful load is what turns the security server on:
         * it is marked initialized and ready to handle requests, and any
         * objects created before the load get labeled now.
         */
        if (!selinux_initialized(state)) {
                selinux_mark_initialized(state);
                selinux_complete_init();
        }

        /* Free the old policy once no RCU reader can still hold it. */
        synchronize_rcu();
        selinux_policy_free(oldpolicy);
        kfree(load_state->convert_data);

        /* Notify others of the policy change */
        selinux_notify_policy_change(state, seqno);
}

/**
 * security_load_policy - Load a security policy configuration.
 * @state: SELinux state the policy is being loaded for
 * @data: binary policy data
 * @len: length of data in bytes
 * @load_state: filled in on success with the prepared policy and the
 *              conversion data (NULL on the first load)
 *
 * Load a new set of security policy configuration data,
 * validate it and convert the SID table as necessary.
 *
 * This function only prepares the new policy: it does not replace
 * state->policy.  The result is handed back through @load_state.
 *
 * Returns 0 on success.  On failure everything allocated here is
 * released and a negative errno is returned (-ENOMEM for allocation
 * failures, otherwise the error of the step that failed).
 */
int security_load_policy(struct selinux_state *state, void *data, size_t len,
                         struct selinux_load_state *load_state)
{
        struct selinux_policy *newpolicy, *oldpolicy;
        struct selinux_policy_convert_data *convert_data;
        int rc = 0;
        struct policy_file file = { data, len }, *fp = &file;

        newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL);
        if (!newpolicy->sidtab) {
                rc = -ENOMEM;
                goto err_policy;
        }

        /* Parse the binary policy image into the policy database. */
        rc = policydb_read(&newpolicy->policydb, fp);
        if (rc)
                goto err_sidtab;

        newpolicy->policydb.len = len;
        rc = selinux_set_mapping(&newpolicy->policydb, secclass_map,
                                &newpolicy->map);
        if (rc)
                goto err_policydb;

        rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab);
        if (rc) {
                pr_err("SELinux:  unable to load the initial SIDs\n");
                goto err_mapping;
        }

        if (!selinux_initialized(state)) {
                /* First policy load, so no need to preserve state from old policy */
                load_state->policy = newpolicy;
                load_state->convert_data = NULL;
                return 0;
        }

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Preserve active boolean values from the old policy */
        rc = security_preserve_bools(oldpolicy, newpolicy);
        if (rc) {
                pr_err("SELinux:  unable to preserve booleans\n");
                goto err_free_isids;
        }

        convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL);
        if (!convert_data) {
                rc = -ENOMEM;
                goto err_free_isids;
        }

        /*
         * Convert the internal representations of contexts
         * in the new SID table.
         */
        convert_data->args.state = state;
        convert_data->args.oldp = &oldpolicy->policydb;
        convert_data->args.newp = &newpolicy->policydb;

        /* convert_context() is called per entry with &convert_data->args. */
        convert_data->sidtab_params.func = convert_context;
        convert_data->sidtab_params.args = &convert_data->args;
        convert_data->sidtab_params.target = newpolicy->sidtab;

        rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params);
        if (rc) {
                pr_err("SELinux:  unable to convert the internal"
                        " representation of contexts in the new SID"
                        " table\n");
                goto err_free_convert_data;
        }

        load_state->policy = newpolicy;
        load_state->convert_data = convert_data;
        return 0;

        /* Unwind in reverse order of construction; labels fall through. */
err_free_convert_data:
        kfree(convert_data);
err_free_isids:
        sidtab_destroy(newpolicy->sidtab);
err_mapping:
        kfree(newpolicy->map.mapping);
err_policydb:
        policydb_destroy(&newpolicy->policydb);
err_sidtab:
        kfree(newpolicy->sidtab);
err_policy:
        kfree(newpolicy);

        return rc;
}

/**
 * ocontext_to_sid - Helper to safely get sid for an ocontext
 * @sidtab: SID table
 * @c: ocontext structure
 * @index: index of the context entry (0 or 1)
 * @out_sid: pointer to the resulting SID value
 *
 * For all ocontexts except OCON_ISID the SID fields are populated
 * on-demand when needed. Since updating the SID value is an SMP-sensitive
 * operation, this helper must be used to do that safely.
 *
 * A non-zero cached SID is returned directly.  Otherwise the SID is
 * obtained from the sidtab, cached in @c and returned.  On error nothing
 * is cached and @out_sid is not written.
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
                           size_t index, u32 *out_sid)
{
        u32 sid;
        int rc;

        /*
         * Acquire pairs with the release store below, so that a thread
         * seeing the SID also sees the sidtab entry it refers to.
         */
        sid = smp_load_acquire(&c->sid[index]);
        if (sid) {
                *out_sid = sid;
                return 0;
        }

        /* Not cached yet: look the context up (or add it). */
        rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
        if (rc)
                return rc;

        /* Publish the SID only after its sidtab entry is in place. */
        smp_store_release(&c->sid[index], sid);

        *out_sid = sid;
        return 0;
}

/**
 * security_port_sid - Obtain the SID for a port.
 * @state: SELinux state
 * @protocol: protocol number
 * @port: port number
 * @out_sid: security identifier
 *
 * The first port ocontext whose protocol equals @protocol and whose
 * [low_port, high_port] range contains @port supplies the SID.  When no
 * entry matches, or no policy has been loaded yet, %SECINITSID_PORT is
 * returned.
 *
 * Returns 0 on success or the error reported by ocontext_to_sid();
 * -ESTALE from that helper is handled here by retrying.
 */
int security_port_sid(struct selinux_state *state,
                      u8 protocol, u16 port, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *c;
        int rc;

        if (!selinux_initialized(state)) {
                *out_sid = SECINITSID_PORT;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Linear search; c is NULL afterwards when nothing matched. */
        for (c = policydb->ocontexts[OCON_PORT]; c; c = c->next) {
                if (c->u.port.protocol == protocol &&
                    c->u.port.low_port <= port &&
                    c->u.port.high_port >= port)
                        break;
        }

        if (!c) {
                *out_sid = SECINITSID_PORT;
        } else {
                rc = ocontext_to_sid(sidtab, c, 0, out_sid);
                if (rc == -ESTALE) {
                        /* Policy changed under us: start over. */
                        rcu_read_unlock();
                        goto retry;
                }
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_ib_pkey_sid - Obtain the SID for a pkey.
 * @state: SELinux state
 * @subnet_prefix: Subnet Prefix
 * @pkey_num: pkey number
 * @out_sid: security identifier
 *
 * The first pkey ocontext whose [low_pkey, high_pkey] range contains
 * @pkey_num and whose subnet prefix equals @subnet_prefix supplies the
 * SID.  When no entry matches, or no policy has been loaded yet,
 * %SECINITSID_UNLABELED is returned.
 *
 * Returns 0 on success or the error reported by ocontext_to_sid();
 * -ESTALE from that helper is handled here by retrying.
 */
int security_ib_pkey_sid(struct selinux_state *state,
                         u64 subnet_prefix, u16 pkey_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *c;
        int rc;

        if (!selinux_initialized(state)) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Linear search; c is NULL afterwards when nothing matched. */
        for (c = policydb->ocontexts[OCON_IBPKEY]; c; c = c->next) {
                if (c->u.ibpkey.low_pkey <= pkey_num &&
                    c->u.ibpkey.high_pkey >= pkey_num &&
                    c->u.ibpkey.subnet_prefix == subnet_prefix)
                        break;
        }

        if (!c) {
                *out_sid = SECINITSID_UNLABELED;
        } else {
                rc = ocontext_to_sid(sidtab, c, 0, out_sid);
                if (rc == -ESTALE) {
                        /* Policy changed under us: start over. */
                        rcu_read_unlock();
                        goto retry;
                }
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_ib_endport_sid - Obtain the SID for a subnet management interface.
 * @state: SELinux state
 * @dev_name: device name
 * @port_num: port number
 * @out_sid: security identifier
 *
 * The first endport ocontext whose port equals @port_num and whose device
 * name matches @dev_name (compared over at most IB_DEVICE_NAME_MAX bytes)
 * supplies the SID.  When no entry matches, or no policy has been loaded
 * yet, %SECINITSID_UNLABELED is returned.
 *
 * Returns 0 on success or the error reported by ocontext_to_sid();
 * -ESTALE from that helper is handled here by retrying.
 */
int security_ib_endport_sid(struct selinux_state *state,
                            const char *dev_name, u8 port_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *c;
        int rc;

        if (!selinux_initialized(state)) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Linear search; c is NULL afterwards when nothing matched. */
        for (c = policydb->ocontexts[OCON_IBENDPORT]; c; c = c->next) {
                if (c->u.ibendport.port == port_num &&
                    !strncmp(c->u.ibendport.dev_name,
                             dev_name,
                             IB_DEVICE_NAME_MAX))
                        break;
        }

        if (!c) {
                *out_sid = SECINITSID_UNLABELED;
        } else {
                rc = ocontext_to_sid(sidtab, c, 0, out_sid);
                if (rc == -ESTALE) {
                        /* Policy changed under us: start over. */
                        rcu_read_unlock();
                        goto retry;
                }
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_netif_sid - Obtain the SID for a network interface.
 * @state: SELinux state
 * @name: interface name
 * @if_sid: interface SID
 *
 * The first netif ocontext whose name is equal to @name supplies the SID.
 * When no entry matches, or no policy has been loaded yet,
 * %SECINITSID_NETIF is returned.
 *
 * Returns 0 on success or the error reported by ocontext_to_sid();
 * -ESTALE from that helper is handled here by retrying.
 */
int security_netif_sid(struct selinux_state *state,
                       char *name, u32 *if_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *c;
        int rc;

        if (!selinux_initialized(state)) {
                *if_sid = SECINITSID_NETIF;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Linear search; c is NULL afterwards when nothing matched. */
        for (c = policydb->ocontexts[OCON_NETIF]; c; c = c->next) {
                if (strcmp(name, c->u.name) == 0)
                        break;
        }

        if (!c) {
                *if_sid = SECINITSID_NETIF;
        } else {
                rc = ocontext_to_sid(sidtab, c, 0, if_sid);
                if (rc == -ESTALE) {
                        /* Policy changed under us: start over. */
                        rcu_read_unlock();
                        goto retry;
                }
        }

        rcu_read_unlock();
        return rc;
}

/*
 * Compare an IPv6 address against an address/mask pair, treating each of
 * the three arguments as an array of four 32-bit words.
 *
 * Returns 1 when every word of @input, ANDed with the matching word of
 * @mask, equals the matching word of @addr; returns 0 otherwise.
 */
static int match_ipv6_addrmask(u32 *input, u32 *addr, u32 *mask)
{
        int word;

        for (word = 0; word < 4; word++) {
                if ((input[word] & mask[word]) != addr[word])
                        return 0;
        }

        return 1;
}

/**
 * security_node_sid - Obtain the SID for a node (host).
 * @state: SELinux state whose current policy is consulted
 * @domain: communication domain aka address family
 * @addrp: address
 * @addrlen: address length in bytes
 * @out_sid: security identifier
 *
 * For AF_INET the address is matched against the OCON_NODE entries, for
 * AF_INET6 against the OCON_NODE6 entries.  Any other @domain, a lookup
 * that matches nothing, or a policy that has not been initialized yet
 * yields SECINITSID_NODE in @out_sid.
 *
 * Return: 0 on success, -EINVAL when @addrlen does not fit @domain,
 * otherwise the error reported by ocontext_to_sid().  An -ESTALE result is
 * never returned; the lookup is redone instead.
 */
int security_node_sid(struct selinux_state *state,
                      u16 domain,
                      void *addrp,
                      u32 addrlen,
                      u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct ocontext *node;
        int rc;

        if (!selinux_initialized(state)) {
                *out_sid = SECINITSID_NODE;
                return 0;
        }

retry:
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;

        if (domain == AF_INET) {
                u32 addr;

                if (addrlen != sizeof(u32)) {
                        rc = -EINVAL;
                        goto out;
                }

                addr = *((u32 *)addrp);

                for (node = policydb->ocontexts[OCON_NODE]; node;
                     node = node->next) {
                        if ((addr & node->u.node.mask) == node->u.node.addr)
                                break;
                }
        } else if (domain == AF_INET6) {
                if (addrlen != sizeof(u64) * 2) {
                        rc = -EINVAL;
                        goto out;
                }

                for (node = policydb->ocontexts[OCON_NODE6]; node;
                     node = node->next) {
                        if (match_ipv6_addrmask(addrp, node->u.node6.addr,
                                                node->u.node6.mask))
                                break;
                }
        } else {
                /* other address families always get the default node SID */
                *out_sid = SECINITSID_NODE;
                rc = 0;
                goto out;
        }

        if (!node) {
                *out_sid = SECINITSID_NODE;
                rc = 0;
                goto out;
        }

        rc = ocontext_to_sid(policy->sidtab, node, 0, out_sid);
        if (rc == -ESTALE) {
                /* redo the whole lookup against a freshly read policy */
                rcu_read_unlock();
                goto retry;
        }

out:
        rcu_read_unlock();
        return rc;
}

#define SIDS_NEL 25

/**
 * security_get_user_sids - Obtain reachable SIDs for a user.
 * @state: SELinux state whose current policy is consulted
 * @fromsid: starting SID
 * @username: username
 * @sids: array of reachable SIDs for user
 * @nel: number of elements in @sids
 *
 * Generate the set of SIDs for legal security contexts
 * for a given user that can be reached by @fromsid.
 * Set *@sids to point to a dynamically allocated
 * array containing the set of SIDs.  Set *@nel to the
 * number of elements in the array.
 *
 * *@sids is NULL and *@nel is 0 on every error return, and also on the
 * successful returns taken when the policy is not initialized or when no
 * candidate SID was generated.
 *
 * Return: 0 on success, -EINVAL when @fromsid or @username is not known
 * to the policy, -ENOMEM on allocation failure, or another error passed
 * up from sidtab_context_to_sid().
 */

int security_get_user_sids(struct selinux_state *state,
                           u32 fromsid,
                           char *username,
                           u32 **sids,
                           u32 *nel)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *fromcon, usercon;
        u32 *mysids = NULL, *mysids2, sid;
        u32 i, j, mynel, maxnel = SIDS_NEL;
        struct user_datum *user;
        struct role_datum *role;
        struct ebitmap_node *rnode, *tnode;
        int rc;

        *sids = NULL;
        *nel = 0;

        if (!selinux_initialized(state))
                return 0;

        /* Initial candidate buffer; grown in SIDS_NEL steps further down. */
        mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL);
        if (!mysids)
                return -ENOMEM;

retry:
        /* A retry discards the candidates collected so far but keeps the buffer. */
        mynel = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        context_init(&usercon);

        rc = -EINVAL;
        fromcon = sidtab_search(sidtab, fromsid);
        if (!fromcon)
                goto out_unlock;

        rc = -EINVAL;
        user = symtab_search(&policydb->p_users, username);
        if (!user)
                goto out_unlock;

        usercon.user = user->value;

        /* Visit every (role, type) pair: roles set for the user, types set for the role. */
        ebitmap_for_each_positive_bit(&user->roles, rnode, i) {
                role = policydb->role_val_to_struct[i];
                usercon.role = i + 1;
                ebitmap_for_each_positive_bit(&role->types, tnode, j) {
                        usercon.type = j + 1;

                        /* Skip pairs for which no MLS range could be set up. */
                        if (mls_setup_user_range(policydb, fromcon, user,
                                                 &usercon))
                                continue;

                        rc = sidtab_context_to_sid(sidtab, &usercon, &sid);
                        if (rc == -ESTALE) {
                                /* start over with a freshly read policy pointer */
                                rcu_read_unlock();
                                goto retry;
                        }
                        if (rc)
                                goto out_unlock;
                        if (mynel < maxnel) {
                                mysids[mynel++] = sid;
                        } else {
                                /*
                                 * Buffer full: allocate a larger one and copy
                                 * the collected SIDs over.  This runs between
                                 * rcu_read_lock() and rcu_read_unlock(), and
                                 * the allocation uses GFP_ATOMIC.
                                 */
                                rc = -ENOMEM;
                                maxnel += SIDS_NEL;
                                mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC);
                                if (!mysids2)
                                        goto out_unlock;
                                memcpy(mysids2, mysids, mynel * sizeof(*mysids2));
                                kfree(mysids);
                                mysids = mysids2;
                                mysids[mynel++] = sid;
                        }
                }
        }
        rc = 0;
out_unlock:
        rcu_read_unlock();
        if (rc || !mynel) {
                /* error, or nothing collected: outputs stay NULL / 0 */
                kfree(mysids);
                return rc;
        }

        rc = -ENOMEM;
        mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL);
        if (!mysids2) {
                kfree(mysids);
                return rc;
        }
        /*
         * Second pass, outside the RCU read section: keep only the
         * candidates for which the process transition check from @fromsid
         * returns 0.
         */
        for (i = 0, j = 0; i < mynel; i++) {
                struct av_decision dummy_avd;
                rc = avc_has_perm_noaudit(state,
                                          fromsid, mysids[i],
                                          SECCLASS_PROCESS, /* kernel value */
                                          PROCESS__TRANSITION, AVC_STRICT,
                                          &dummy_avd);
                if (!rc)
                        mysids2[j++] = mysids[i];
                cond_resched();
        }
        kfree(mysids);
        /* The filtered array is handed to the caller; *nel may be 0 here. */
        *sids = mysids2;
        *nel = j;
        return 0;
}

/**
 * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem
 * @policy: policy to search
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Obtain a SID to use for a file in a filesystem that
 * cannot support xattr or use a fixed labeling behavior like
 * transition SIDs or task SIDs.
 *
 * @sid is preset to SECINITSID_UNLABELED before any lookup is made, so it
 * holds that value when -ENOENT is returned.
 *
 * Return: -ENOENT when @fstype has no genfs entry or no context of that
 * entry is a prefix of @path for the given class, otherwise the result of
 * ocontext_to_sid().
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static inline int __security_genfs_sid(struct selinux_policy *policy,
                                       const char *fstype,
                                       char *path,
                                       u16 orig_sclass,
                                       u32 *sid)
{
        struct policydb *policydb = &policy->policydb;
        struct genfs *fs;
        struct ocontext *entry;
        u16 sclass;
        int prefix_len;

        /* collapse a run of leading slashes down to a single one */
        while (path[0] == '/' && path[1] == '/')
                path++;

        sclass = unmap_class(&policy->map, orig_sclass);
        *sid = SECINITSID_UNLABELED;

        /*
         * The scan gives up at the first entry whose fstype compares
         * greater than @fstype.
         */
        fs = policydb->genfs;
        while (fs) {
                int order = strcmp(fstype, fs->fstype);

                if (order == 0)
                        break;
                if (order < 0)
                        return -ENOENT;
                fs = fs->next;
        }

        if (!fs)
                return -ENOENT;

        /* first context whose class fits and whose name is a prefix of @path */
        for (entry = fs->head; entry; entry = entry->next) {
                if (entry->v.sclass && entry->v.sclass != sclass)
                        continue;

                prefix_len = strlen(entry->u.name);
                if (strncmp(entry->u.name, path, prefix_len) == 0)
                        break;
        }

        if (!entry)
                return -ENOENT;

        return ocontext_to_sid(policy->sidtab, entry, 0, sid);
}

/**
 * security_genfs_sid - Obtain a SID for a file in a filesystem
 * @state: SELinux state whose current policy is consulted
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Calls __security_genfs_sid() on the current policy from inside an RCU
 * read-side critical section, repeating the call for as long as it
 * returns -ESTALE.  When the policy has not been initialized yet, @sid is
 * set to SECINITSID_UNLABELED and 0 is returned.
 *
 * Return: the final result of __security_genfs_sid(); never -ESTALE.
 */
int security_genfs_sid(struct selinux_state *state,
                       const char *fstype,
                       char *path,
                       u16 orig_sclass,
                       u32 *sid)
{
        struct selinux_policy *policy;
        int rc;

        if (!selinux_initialized(state)) {
                *sid = SECINITSID_UNLABELED;
                return 0;
        }

        for (;;) {
                rcu_read_lock();
                policy = rcu_dereference(state->policy);
                rc = __security_genfs_sid(policy, fstype, path,
                                          orig_sclass, sid);
                rcu_read_unlock();

                if (rc != -ESTALE)
                        return rc;
        }
}

/**
 * selinux_policy_genfs_sid - genfs SID lookup against an explicit policy
 * @policy: policy to search
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Forwards directly to __security_genfs_sid() without entering an RCU
 * read-side critical section and without retrying, so the helper's return
 * value is passed back unchanged.
 */
int selinux_policy_genfs_sid(struct selinux_policy *policy,
                        const char *fstype,
                        char *path,
                        u16 orig_sclass,
                        u32 *sid)
{
        int rc;

        /* no lock required, policy is not yet accessible by other threads */
        rc = __security_genfs_sid(policy, fstype, path, orig_sclass, sid);

        return rc;
}

/**
 * security_fs_use - Determine how to handle labeling for a filesystem.
 * @state: SELinux state whose current policy is consulted
 * @sb: superblock in question
 *
 * Fills in the behavior and sid fields of the superblock security
 * structure found at @sb->s_security:
 *
 *  - policy not initialized: SECURITY_FS_USE_NONE / SECINITSID_UNLABELED;
 *  - an OCON_FSUSE entry names the filesystem type: the entry's behavior
 *    and the SID derived from the entry;
 *  - otherwise a genfs lookup of "/" with class SECCLASS_DIR decides
 *    between SECURITY_FS_USE_GENFS (lookup succeeded) and
 *    SECURITY_FS_USE_NONE (lookup failed, which is not reported as an
 *    error).
 *
 * Return: 0 on success, or the error from ocontext_to_sid() for an
 * OCON_FSUSE match.  -ESTALE is never returned; the lookup is redone.
 */
int security_fs_use(struct selinux_state *state, struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        const char *fstype = sb->s_type->name;
        struct selinux_policy *policy;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized(state)) {
                sbsec->behavior = SECURITY_FS_USE_NONE;
                sbsec->sid = SECINITSID_UNLABELED;
                return 0;
        }

retry:
        rcu_read_lock();
        policy = rcu_dereference(state->policy);

        entry = policy->policydb.ocontexts[OCON_FSUSE];
        while (entry && strcmp(fstype, entry->u.name) != 0)
                entry = entry->next;

        if (entry) {
                sbsec->behavior = entry->v.behavior;
                rc = ocontext_to_sid(policy->sidtab, entry, 0, &sbsec->sid);
        } else {
                rc = __security_genfs_sid(policy, fstype, "/",
                                          SECCLASS_DIR, &sbsec->sid);
        }

        if (rc == -ESTALE) {
                /* redo the lookup against a freshly read policy pointer */
                rcu_read_unlock();
                goto retry;
        }

        if (!entry) {
                /* the genfs result only selects the behavior, never an error */
                if (rc)
                        sbsec->behavior = SECURITY_FS_USE_NONE;
                else
                        sbsec->behavior = SECURITY_FS_USE_GENFS;
                rc = 0;
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_get_bools - Copy out the names and states of a policy's booleans
 * @policy: policy to read
 * @len: set to the number of booleans
 * @names: set to a newly allocated array of newly allocated name strings
 * @values: set to a newly allocated array of boolean states
 *
 * When the policy has no booleans, *@len is 0, both arrays are NULL and 0
 * is returned.  On failure everything allocated here is freed again and
 * the outputs are reset to 0 / NULL / NULL.  All allocations use
 * GFP_ATOMIC.
 *
 * Return: 0 on success, -ENOMEM on allocation failure.
 */
int security_get_bools(struct selinux_policy *policy,
                       u32 *len, char ***names, int **values)
{
        struct policydb *policydb = &policy->policydb;
        char **name_tab = NULL;
        int *value_tab = NULL;
        u32 count, idx;

        *names = NULL;
        *values = NULL;

        count = policydb->p_bools.nprim;
        *len = count;
        if (count == 0)
                return 0;

        name_tab = kcalloc(count, sizeof(char *), GFP_ATOMIC);
        if (!name_tab)
                goto fail;

        value_tab = kcalloc(count, sizeof(int), GFP_ATOMIC);
        if (!value_tab)
                goto fail;

        for (idx = 0; idx < count; idx++) {
                value_tab[idx] = policydb->bool_val_to_struct[idx]->state;

                name_tab[idx] = kstrdup(sym_name(policydb, SYM_BOOLS, idx),
                                        GFP_ATOMIC);
                if (!name_tab[idx])
                        goto fail;
        }

        *names = name_tab;
        *values = value_tab;
        return 0;

fail:
        /* slots never filled are still NULL from kcalloc(), so kfree() is fine */
        if (name_tab) {
                for (idx = 0; idx < count; idx++)
                        kfree(name_tab[idx]);
                kfree(name_tab);
        }
        kfree(value_tab);
        *len = 0;
        return -ENOMEM;
}


/*
 * security_set_bools - install new boolean states as a new policy copy.
 * @state: SELinux state whose policy pointer is replaced
 * @len: number of entries in @values
 * @values: requested boolean states; any non-zero entry counts as 1
 *
 * Copies the current policy structure, duplicates the conditional parts
 * of its policydb, applies @values to that copy, re-evaluates the
 * conditional nodes and then publishes the copy through state->policy.
 * Each boolean whose state actually changes is written to the audit log.
 *
 * The current policy pointer is read with rcu_dereference_protected()
 * using lockdep_is_held(&state->policy_mutex), i.e. the caller is
 * expected to hold that mutex.
 *
 * Returns 0 on success, -EINVAL when the policy is not initialized or
 * @len differs from the policy's boolean count, -ENOMEM otherwise.
 */
int security_set_bools(struct selinux_state *state, u32 len, int *values)
{
        struct selinux_policy *newpolicy, *oldpolicy;
        int rc;
        u32 i, seqno = 0;

        if (!selinux_initialized(state))
                return -EINVAL;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Consistency check on number of booleans, should never fail */
        if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim))
                return -EINVAL;

        /* Shallow copy: the new structure shares all referenced data at first. */
        newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        /*
         * Deep copy only the parts of the policydb that might be
         * modified as a result of changing booleans.
         */
        rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb);
        if (rc) {
                /* NOTE(review): whatever rc holds, the caller sees -ENOMEM. */
                kfree(newpolicy);
                return -ENOMEM;
        }

        /* Update the boolean states in the copy */
        for (i = 0; i < len; i++) {
                int new_state = !!values[i];
                int old_state = newpolicy->policydb.bool_val_to_struct[i]->state;

                if (new_state != old_state) {
                        audit_log(audit_context(), GFP_ATOMIC,
                                AUDIT_MAC_CONFIG_CHANGE,
                                "bool=%s val=%d old_val=%d auid=%u ses=%u",
                                sym_name(&newpolicy->policydb, SYM_BOOLS, i),
                                new_state,
                                old_state,
                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                audit_get_sessionid(current));
                        newpolicy->policydb.bool_val_to_struct[i]->state = new_state;
                }
        }

        /* Re-evaluate the conditional rules in the copy */
        evaluate_cond_nodes(&newpolicy->policydb);

        /* Set latest granting seqno for new policy */
        newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        seqno = newpolicy->latest_granting;

        /* Install the new policy */
        rcu_assign_pointer(state->policy, newpolicy);

        /*
         * Free the conditional portions of the old policydb
         * that were copied for the new policy, and the oldpolicy
         * structure itself but not what it references.
         */
        /* The free only happens after synchronize_rcu() has returned. */
        synchronize_rcu();
        selinux_policy_cond_free(oldpolicy);

        /* Notify others of the policy change */
        selinux_notify_policy_change(state, seqno);
        return 0;
}

/**
 * security_get_bool_value - Read the current state of one policy boolean
 * @state: SELinux state whose current policy is consulted
 * @index: zero-based index of the boolean
 *
 * Return: the boolean's state, 0 when the policy is not initialized, or
 * -EFAULT when @index is not below the number of booleans in the policy.
 */
int security_get_bool_value(struct selinux_state *state,
                            u32 index)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        int value = -EFAULT;

        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;

        if (index < policydb->p_bools.nprim)
                value = policydb->bool_val_to_struct[index]->state;

        rcu_read_unlock();
        return value;
}

/*
 * Carry boolean states over from @oldpolicy to @newpolicy.
 *
 * The names and states of all booleans are fetched from @oldpolicy with
 * security_get_bools().  Every name that is also found in @newpolicy's
 * boolean symbol table gets the old state assigned; names that are not
 * found are skipped.  The conditional nodes of @newpolicy are then
 * re-evaluated.  The temporary name/value arrays are always freed.
 *
 * Returns 0 on success or the error from security_get_bools().
 */
static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy)
{
        struct cond_bool_datum *target;
        char **saved_names = NULL;
        int *saved_values = NULL;
        u32 count = 0, idx;
        int rc;

        rc = security_get_bools(oldpolicy, &count, &saved_names, &saved_values);
        if (!rc) {
                for (idx = 0; idx < count; idx++) {
                        target = symtab_search(&newpolicy->policydb.p_bools,
                                               saved_names[idx]);
                        if (!target)
                                continue;
                        target->state = saved_values[idx];
                }
                evaluate_cond_nodes(&newpolicy->policydb);
        }

        for (idx = 0; saved_names && idx < count; idx++)
                kfree(saved_names[idx]);
        kfree(saved_names);
        kfree(saved_values);

        return rc;
}

/**
 * security_sid_mls_copy() - computes a new sid based on the given
 * sid and the mls portion of mls_sid.
 * @state: SELinux state whose current policy is consulted
 * @sid: SID that supplies the user, role and type of the new context
 * @mls_sid: SID that supplies the MLS portion of the new context
 * @new_sid: set to the SID of the combined context
 *
 * When the policy is not initialized, or MLS is not enabled in it,
 * *@new_sid is simply set to @sid.
 *
 * If the combined context is not valid under the policy and
 * convert_context_handle_invalid_context() reports an error, the context
 * is written to the audit log (when it can be converted to a string) and
 * that error is returned.
 *
 * Return: 0 on success, -EINVAL when @sid or @mls_sid is not found in the
 * SID table, otherwise an error from one of the helpers.  -ESTALE is
 * never returned; the operation is redone instead.
 */
int security_sid_mls_copy(struct selinux_state *state,
                          u32 sid, u32 mls_sid, u32 *new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *context1;
        struct context *context2;
        struct context newcon;
        char *s;
        u32 len;
        int rc;

        if (!selinux_initialized(state)) {
                *new_sid = sid;
                return 0;
        }

retry:
        rc = 0;
        context_init(&newcon);

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!policydb->mls_enabled) {
                *new_sid = sid;
                goto out_unlock;
        }

        rc = -EINVAL;
        context1 = sidtab_search(sidtab, sid);
        if (!context1) {
                /* SIDs are u32, so print them unsigned */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, sid);
                goto out_unlock;
        }

        rc = -EINVAL;
        context2 = sidtab_search(sidtab, mls_sid);
        if (!context2) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, mls_sid);
                goto out_unlock;
        }

        /* user/role/type come from @sid, the MLS part from @mls_sid */
        newcon.user = context1->user;
        newcon.role = context1->role;
        newcon.type = context1->type;
        rc = mls_context_cpy(&newcon, context2);
        if (rc)
                goto out_unlock;

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(policydb, &newcon)) {
                rc = convert_context_handle_invalid_context(state, policydb,
                                                        &newcon);
                if (rc) {
                        if (!context_struct_to_string(policydb, &newcon, &s,
                                                      &len)) {
                                struct audit_buffer *ab;

                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                audit_log_format(ab,
                                                 "op=security_sid_mls_copy invalid_context=");
                                /* don't record NUL with untrusted strings */
                                audit_log_n_untrustedstring(ab, s, len - 1);
                                audit_log_end(ab);
                                kfree(s);
                        }
                        goto out_unlock;
                }
        }
        rc = sidtab_context_to_sid(sidtab, &newcon, new_sid);
        if (rc == -ESTALE) {
                /* release everything from this attempt before trying again */
                rcu_read_unlock();
                context_destroy(&newcon);
                goto retry;
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcon);
        return rc;
}

/**
 * security_net_peersid_resolve - Compare and resolve two network peer SIDs
 * @state: SELinux state whose current policy is consulted
 * @nlbl_sid: NetLabel SID
 * @nlbl_type: NetLabel labeling protocol type
 * @xfrm_sid: XFRM SID
 * @peer_sid: set to the resolved peer SID
 *
 * Description:
 * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
 * resolved into a single SID it is returned via @peer_sid and the function
 * returns zero.  Otherwise @peer_sid is set to SECSID_NULL and the function
 * returns a negative value.  A table summarizing the behavior is below:
 *
 *                                 | function return |    @peer_sid
 *   ------------------------------+-----------------+-----------------
 *   no peer labels                |        0        |    SECSID_NULL
 *   single peer label             |        0        |    <peer_label>
 *   multiple, consistent labels   |        0        |    <peer_label>
 *   multiple, inconsistent labels |    -<errno>     |    SECSID_NULL
 *
 * When both labels are present, @peer_sid is also left at SECSID_NULL with
 * a return value of zero if the policy is not initialized or if MLS is not
 * enabled in the policy.
 */
int security_net_peersid_resolve(struct selinux_state *state,
                                 u32 nlbl_sid, u32 nlbl_type,
                                 u32 xfrm_sid,
                                 u32 *peer_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *nlbl_ctx;
        struct context *xfrm_ctx;

        *peer_sid = SECSID_NULL;

        /* handle the common (which also happens to be the set of easy) cases
         * right away, these two if statements catch everything involving a
         * single or absent peer SID/label */
        if (xfrm_sid == SECSID_NULL) {
                *peer_sid = nlbl_sid;
                return 0;
        }
        /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
         * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
         * is present */
        if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
                *peer_sid = xfrm_sid;
                return 0;
        }

        /* two real labels but no policy to compare them with */
        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* without MLS there is nothing to compare */
        if (!policydb->mls_enabled) {
                rc = 0;
                goto out;
        }

        rc = -EINVAL;
        nlbl_ctx = sidtab_search(sidtab, nlbl_sid);
        if (!nlbl_ctx) {
                /* SIDs are u32, so print them unsigned */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, nlbl_sid);
                goto out;
        }
        rc = -EINVAL;
        xfrm_ctx = sidtab_search(sidtab, xfrm_sid);
        if (!xfrm_ctx) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, xfrm_sid);
                goto out;
        }
        rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
        if (rc)
                goto out;

        /* at present NetLabel SIDs/labels really only carry MLS
         * information so if the MLS portion of the NetLabel SID
         * matches the MLS portion of the labeled XFRM SID/label
         * then pass along the XFRM SID as it is the most
         * expressive */
        *peer_sid = xfrm_sid;
out:
        rcu_read_unlock();
        return rc;
}

/*
 * hashtab_map() callback used by security_get_classes().
 *
 * @k is the class name, @d its class_datum and @args the output array of
 * strings.  A copy of the name (allocated with GFP_ATOMIC) is stored at
 * index "datum value - 1".
 *
 * Returns 0 on success or -ENOMEM when the name could not be duplicated.
 */
static int get_classes_callback(void *k, void *d, void *args)
{
        struct class_datum *cls = d;
        char **table = args;
        char *key = k;
        int slot = cls->value - 1;
        char *copy = kstrdup(key, GFP_ATOMIC);

        table[slot] = copy;

        return copy ? 0 : -ENOMEM;
}

int security_get_classes(struct selinux_policy *policy,
                         char ***classes, int *nclasses)
{
        struct policydb *policydb;
        int rc;

        policydb = &policy->policydb;

        rc = -ENOMEM;
        *nclasses = policydb->p_classes.nprim;
        *classes = kcalloc(*nclasses, sizeof(**classes), GFP_ATOMIC);
        if (!*classes)
                goto out;

        rc = hashtab_map(&policydb->p_classes.table, get_classes_callback,
                         *classes);
        if (rc) {
                int i;
                for (i = 0; i < *nclasses; i++)
                        kfree((*classes)[i]);
                kfree(*classes);
        }

out:
        return rc;
}

/*
 * hashtab_map() callback used by security_get_permissions().
 *
 * @k is the permission name, @d its perm_datum and @args the output array
 * of strings.  A copy of the name (allocated with GFP_ATOMIC) is stored at
 * index "datum value - 1".
 *
 * Returns 0 on success or -ENOMEM when the name could not be duplicated.
 */
static int get_permissions_callback(void *k, void *d, void *args)
{
        struct perm_datum *perm = d;
        char **table = args;
        char *key = k;
        int slot = perm->value - 1;
        char *copy = kstrdup(key, GFP_ATOMIC);

        table[slot] = copy;

        return copy ? 0 : -ENOMEM;
}

int security_get_permissions(struct selinux_policy *policy,
                             char *class, char ***perms, int *nperms)
{
        struct policydb *policydb;
        int rc, i;
        struct class_datum *match;

        policydb = &policy->policydb;

        rc = -EINVAL;
        match = symtab_search(&policydb->p_classes, class);
        if (!match) {
                pr_err("SELinux: %s:  unrecognized class %s\n",
                        __func__, class);
                goto out;
        }

        rc = -ENOMEM;
        *nperms = match->permissions.nprim;
        *perms = kcalloc(*nperms, sizeof(**perms), GFP_ATOMIC);
        if (!*perms)
                goto out;

        if (match->comdatum) {
                rc = hashtab_map(&match->comdatum->permissions.table,
                                 get_permissions_callback, *perms);
                if (rc)
                        goto err;
        }

        rc = hashtab_map(&match->permissions.table, get_permissions_callback,
                         *perms);
        if (rc)
                goto err;

out:
        return rc;

err:
        for (i = 0; i < *nperms; i++)
                kfree((*perms)[i]);
        kfree(*perms);
        return rc;
}

/**
 * security_get_reject_unknown - Read the policy's reject_unknown setting
 * @state: SELinux state whose current policy is consulted
 *
 * Return: the reject_unknown value of the current policydb, or 0 when the
 * policy has not been initialized yet.
 */
int security_get_reject_unknown(struct selinux_state *state)
{
        struct selinux_policy *policy;
        int reject = 0;

        if (selinux_initialized(state)) {
                rcu_read_lock();
                policy = rcu_dereference(state->policy);
                reject = policy->policydb.reject_unknown;
                rcu_read_unlock();
        }

        return reject;
}

/**
 * security_get_allow_unknown - Read the policy's allow_unknown setting
 * @state: SELinux state whose current policy is consulted
 *
 * Return: the allow_unknown value of the current policydb, or 0 when the
 * policy has not been initialized yet.
 */
int security_get_allow_unknown(struct selinux_state *state)
{
        struct selinux_policy *policy;
        int allow = 0;

        if (selinux_initialized(state)) {
                rcu_read_lock();
                policy = rcu_dereference(state->policy);
                allow = policy->policydb.allow_unknown;
                rcu_read_unlock();
        }

        return allow;
}

/**
 * security_policycap_supported - Check for a specific policy capability
 * @state: SELinux state whose current policy is consulted
 * @req_cap: capability
 *
 * Description:
 * This function queries the currently loaded policy to see if it supports the
 * capability specified by @req_cap.  Returns true (1) if the capability is
 * supported, false (0) if it isn't supported.  Also returns 0 when the
 * policy has not been initialized yet.
 *
 */
int security_policycap_supported(struct selinux_state *state,
                                 unsigned int req_cap)
{
        struct selinux_policy *policy;
        int supported = 0;

        if (selinux_initialized(state)) {
                rcu_read_lock();
                policy = rcu_dereference(state->policy);
                supported = ebitmap_get_bit(&policy->policydb.policycaps,
                                            req_cap);
                rcu_read_unlock();
        }

        return supported;
}

/*
 * One SELinux audit rule as built by selinux_audit_rule_init() and
 * released by selinux_audit_rule_free().
 */
struct selinux_audit_rule {
        /* policy->latest_granting at the time the rule was initialized */
        u32 au_seqno;
        /* context holding the user, role, type or MLS part to match */
        struct context au_ctxt;
};

/**
 * selinux_audit_rule_free - Release a rule built by selinux_audit_rule_init()
 * @vrule: the rule, may be NULL
 *
 * Destroys the context embedded in the rule and frees the rule itself.
 * A NULL @vrule is ignored.
 */
void selinux_audit_rule_free(void *vrule)
{
        struct selinux_audit_rule *rule = vrule;

        if (!rule)
                return;

        context_destroy(&rule->au_ctxt);
        kfree(rule);
}

/**
 * selinux_audit_rule_init - Build an SELinux audit rule from a string
 * @field: audit field type (one of the AUDIT_SUBJ_* / AUDIT_OBJ_* values)
 * @op: comparison operator
 * @rulestr: user, role, type or MLS level to match, depending on @field
 * @vrule: set to the new rule on success, to NULL otherwise
 * @gfp: allocation flags for the rule structure
 *
 * User, role and type fields accept only Audit_equal and Audit_not_equal;
 * the MLS fields reject any @rulestr containing '-'.  @rulestr is resolved
 * against the current policy, and the policy's latest_granting value is
 * recorded in the rule.
 *
 * Return: 0 on success, -EOPNOTSUPP when the policy is not initialized,
 * -EINVAL for an unsupported @field/@op or a @rulestr the policy does not
 * know, -ENOMEM when the rule cannot be allocated, or the error from
 * mls_from_string().
 */
int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule,
                            gfp_t gfp)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule;
        struct selinux_audit_rule *newrule;
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct user_datum *user;
        struct role_datum *role;
        struct type_datum *type;
        int rc = 0;

        *rule = NULL;

        if (!selinux_initialized(state))
                return -EOPNOTSUPP;

        /* validate the field/operator combination before allocating */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
                /* only 'equals' and 'not equals' fit user, role, and type */
                if (op != Audit_equal && op != Audit_not_equal)
                        return -EINVAL;
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /* we do not allow a range, indicated by the presence of '-' */
                if (strchr(rulestr, '-'))
                        return -EINVAL;
                break;
        default:
                /* only the above fields are valid */
                return -EINVAL;
        }

        newrule = kzalloc(sizeof(struct selinux_audit_rule), gfp);
        if (!newrule)
                return -ENOMEM;

        context_init(&newrule->au_ctxt);

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;

        newrule->au_seqno = policy->latest_granting;

        /* resolve @rulestr; rc stays 0 unless a lookup fails */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                user = symtab_search(&policydb->p_users, rulestr);
                if (user)
                        newrule->au_ctxt.user = user->value;
                else
                        rc = -EINVAL;
                break;
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE:
                role = symtab_search(&policydb->p_roles, rulestr);
                if (role)
                        newrule->au_ctxt.role = role->value;
                else
                        rc = -EINVAL;
                break;
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE:
                type = symtab_search(&policydb->p_types, rulestr);
                if (type)
                        newrule->au_ctxt.type = type->value;
                else
                        rc = -EINVAL;
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                rc = mls_from_string(policydb, rulestr, &newrule->au_ctxt,
                                     GFP_ATOMIC);
                break;
        }

        rcu_read_unlock();

        if (rc) {
                selinux_audit_rule_free(newrule);
                newrule = NULL;
        }

        *rule = newrule;

        return rc;
}

/*
 * Check to see if the rule contains any selinux fields.
 *
 * Returns 1 as soon as one of the rule's fields has an AUDIT_SUBJ_* or
 * AUDIT_OBJ_* type listed below, 0 when none of them does.
 */
int selinux_audit_rule_known(struct audit_krule *rule)
{
        int idx = 0;

        while (idx < rule->field_count) {
                switch (rule->fields[idx].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        return 1;
                default:
                        break;
                }
                idx++;
        }

        return 0;
}

/**
 * selinux_audit_rule_match - test the context of a SID against an audit rule
 * @sid: SID whose security context is examined
 * @field: AUDIT_SUBJ_* / AUDIT_OBJ_* value selecting the context component
 * @op: Audit_* comparison operator
 * @vrule: the rule to compare against, used as a struct selinux_audit_rule
 *
 * Description:
 * Returns non-zero when the selected component of @sid's context satisfies
 * @op against the value stored in the rule, and zero otherwise.  Zero is also
 * returned when SELinux is not initialized and for any @field/@op pair that
 * is not handled below.  Returns -ENOENT when @vrule is NULL or @sid is not
 * found in the sidtab, and -ESTALE when the rule's au_seqno is lower than the
 * active policy's latest_granting.
 *
 */
int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *policy;
        struct context *ctxt;
        struct mls_level *level;
        struct mls_level *rule_level;
        struct selinux_audit_rule *rule = vrule;
        int match = 0;

        if (unlikely(!rule)) {
                WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n");
                return -ENOENT;
        }

        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(state->policy);

        /* the rule was built against an older policy than the active one */
        if (rule->au_seqno < policy->latest_granting) {
                match = -ESTALE;
                goto out;
        }

        ctxt = sidtab_search(policy->sidtab, sid);
        if (unlikely(!ctxt)) {
                /* sid is a u32, so it must be printed as unsigned */
                WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %u\n",
                          sid);
                match = -ENOENT;
                goto out;
        }

        /* a field/op pair that is not caught here will simply fall through
           without a match */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->user == rule->au_ctxt.user);
                        break;
                case Audit_not_equal:
                        match = (ctxt->user != rule->au_ctxt.user);
                        break;
                }
                break;
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->role == rule->au_ctxt.role);
                        break;
                case Audit_not_equal:
                        match = (ctxt->role != rule->au_ctxt.role);
                        break;
                }
                break;
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->type == rule->au_ctxt.type);
                        break;
                case Audit_not_equal:
                        match = (ctxt->type != rule->au_ctxt.type);
                        break;
                }
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /*
                 * SUBJ_SEN / OBJ_LEV_LOW look at the low level of the SID's
                 * range, the other two at the high level.  The rule side
                 * always uses level[0] of its own context.
                 */
                level = ((field == AUDIT_SUBJ_SEN ||
                          field == AUDIT_OBJ_LEV_LOW) ?
                         &ctxt->range.level[0] : &ctxt->range.level[1]);
                rule_level = &rule->au_ctxt.range.level[0];
                switch (op) {
                case Audit_equal:
                        match = mls_level_eq(rule_level, level);
                        break;
                case Audit_not_equal:
                        match = !mls_level_eq(rule_level, level);
                        break;
                case Audit_lt:
                        match = (mls_level_dom(rule_level, level) &&
                                 !mls_level_eq(rule_level, level));
                        break;
                case Audit_le:
                        match = mls_level_dom(rule_level, level);
                        break;
                case Audit_gt:
                        match = (mls_level_dom(level, rule_level) &&
                                 !mls_level_eq(level, rule_level));
                        break;
                case Audit_ge:
                        match = mls_level_dom(level, rule_level);
                        break;
                }
        }

out:
        rcu_read_unlock();
        return match;
}

/* Function run on an AVC reset event; initialised to audit_update_lsm_rules. */
static int (*aurule_callback)(void) = audit_update_lsm_rules;

/*
 * AVC event handler.  Only AVC_CALLBACK_RESET is acted upon: the result of
 * aurule_callback() is returned for it.  Any other event, or an unset
 * aurule_callback, yields 0.
 */
static int aurule_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET || !aurule_callback)
                return 0;

        return aurule_callback();
}

/*
 * Register aurule_avc_callback() for AVC_CALLBACK_RESET events.  A failed
 * registration is treated as fatal and triggers panic().
 */
static int __init aurule_init(void)
{
        int rc = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET);

        if (rc)
                panic("avc_add_callback() failed, error %d\n", rc);

        return rc;
}
__initcall(aurule_init);

#ifdef CONFIG_NETLABEL
/**
 * security_netlbl_cache_add - Add an entry to the NetLabel cache
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Attach a cache entry holding a copy of @sid to @secattr and set the
 * NETLBL_SECATTR_CACHE flag; the entry's free callback is kfree().  Caching
 * is best effort: if either allocation fails the function simply returns
 * without setting the flag.  This function assumes @secattr has already been
 * initialized.
 *
 */
static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
                                      u32 sid)
{
        u32 *cached_sid = kmalloc(sizeof(*cached_sid), GFP_ATOMIC);

        if (!cached_sid)
                return;

        secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
        if (!secattr->cache) {
                /* no cache entry to own the copy, so release it here */
                kfree(cached_sid);
                return;
        }

        *cached_sid = sid;
        secattr->cache->data = cached_sid;
        secattr->cache->free = kfree;
        secattr->flags |= NETLBL_SECATTR_CACHE;
}

/**
 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
 * @state: the SELinux state whose active policy is used for the conversion
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Convert the given NetLabel security attributes in @secattr into a
 * SELinux SID.  If the @secattr field does not contain a full SELinux
 * SID/context then use SECINITSID_NETMSG as the foundation.  If possible the
 * 'cache' field of @secattr is set and the CACHE flag is set; this is to
 * allow the @secattr to be used by NetLabel to cache the secattr to SID
 * conversion for future lookups.  If SELinux is not initialized, or @secattr
 * carries none of the CACHE, SECID or MLS_LVL flags, @sid is set to
 * SECSID_NULL and zero is returned.  Returns zero on success, negative values
 * on failure.
 *
 */
int security_netlbl_secattr_to_sid(struct selinux_state *state,
                                   struct netlbl_lsm_secattr *secattr,
                                   u32 *sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *ctx;
        struct context ctx_new;

        if (!selinux_initialized(state)) {
                *sid = SECSID_NULL;
                return 0;
        }

retry:
        /* every pass (including a retry) starts from a fresh policy pointer */
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (secattr->flags & NETLBL_SECATTR_CACHE)
                /* reuse the SID stored in the attached cache entry */
                *sid = *(u32 *)secattr->cache->data;
        else if (secattr->flags & NETLBL_SECATTR_SECID)
                *sid = secattr->attr.secid;
        else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
                /* rc is preset so the bare "goto out" below reports -EIDRM */
                rc = -EIDRM;
                ctx = sidtab_search(sidtab, SECINITSID_NETMSG);
                if (ctx == NULL)
                        goto out;

                /*
                 * Build a temporary context: user/role/type come from the
                 * SECINITSID_NETMSG context, the MLS part from @secattr.
                 */
                context_init(&ctx_new);
                ctx_new.user = ctx->user;
                ctx_new.role = ctx->role;
                ctx_new.type = ctx->type;
                mls_import_netlbl_lvl(policydb, &ctx_new, secattr);
                if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
                        rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr);
                        if (rc)
                                goto out;
                }
                /* rc was overwritten by the import above; preset it again */
                rc = -EIDRM;
                if (!mls_context_isvalid(policydb, &ctx_new)) {
                        ebitmap_destroy(&ctx_new.range.level[0].cat);
                        goto out;
                }

                rc = sidtab_context_to_sid(sidtab, &ctx_new, sid);
                /* the temporary category bitmap is released on every path */
                ebitmap_destroy(&ctx_new.range.level[0].cat);
                if (rc == -ESTALE) {
                        /*
                         * Drop the RCU read lock and redo the conversion from
                         * the top -- presumably the policy/sidtab was replaced
                         * while we were using it; TODO confirm against
                         * sidtab_context_to_sid().
                         */
                        rcu_read_unlock();
                        goto retry;
                }
                if (rc)
                        goto out;

                /* best effort: remember the result on @secattr */
                security_netlbl_cache_add(secattr, *sid);
        } else
                *sid = SECSID_NULL;

out:
        rcu_read_unlock();
        return rc;
}

/**
 * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr
 * @state: the SELinux state whose active policy is consulted
 * @sid: the SELinux SID
 * @secattr: the NetLabel packet security attributes
 *
 * Description:
 * Look up the context of @sid and describe it in @secattr: the domain is a
 * copy of the context's type name, the secid is @sid, and the MLS level and
 * categories are exported from the context.  Returns zero on success (and
 * when SELinux is not initialized, in which case @secattr is left untouched),
 * -ENOENT if @sid is unknown, -ENOMEM if the domain string cannot be
 * duplicated, or the error from mls_export_netlbl_cat().
 *
 */
int security_netlbl_sid_to_secattr(struct selinux_state *state,
                                   u32 sid, struct netlbl_lsm_secattr *secattr)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct context *ctx;
        int rc;

        if (!selinux_initialized(state))
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(state->policy);
        policydb = &policy->policydb;

        ctx = sidtab_search(policy->sidtab, sid);
        if (!ctx) {
                rc = -ENOENT;
                goto out;
        }

        secattr->domain = kstrdup(sym_name(policydb, SYM_TYPES, ctx->type - 1),
                                  GFP_ATOMIC);
        if (!secattr->domain) {
                rc = -ENOMEM;
                goto out;
        }

        secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID;
        secattr->attr.secid = sid;
        mls_export_netlbl_lvl(policydb, ctx, secattr);
        rc = mls_export_netlbl_cat(policydb, ctx, secattr);
out:
        rcu_read_unlock();
        return rc;
}
#endif /* CONFIG_NETLABEL */

/**
 * security_read_policy - read the policy.
 * @state: the SELinux state whose active policy is serialized
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Allocates a buffer with vmalloc_user() sized from the policydb's recorded
 * length, writes the policy into it with policydb_write() and stores the
 * buffer in @data and the number of bytes actually written in @len.
 * Returns 0 on success, -EINVAL if no policy is loaded, -ENOMEM if the
 * buffer cannot be allocated, or the policydb_write() error.  The buffer is
 * not freed here when policydb_write() fails; it stays in @data.
 *
 */
int security_read_policy(struct selinux_state *state,
                         void **data, size_t *len)
{
        struct selinux_policy *policy;
        struct policy_file fp;
        int rc;

        policy = rcu_dereference_protected(
                        state->policy, lockdep_is_held(&state->policy_mutex));
        if (!policy)
                return -EINVAL;

        *len = policy->policydb.len;
        *data = vmalloc_user(*len);
        if (!*data)
                return -ENOMEM;

        fp.data = *data;
        fp.len = *len;

        rc = policydb_write(&policy->policydb, &fp);
        if (!rc) {
                /* fp.data was advanced by the writer; report bytes written */
                *len = (unsigned long)fp.data - (unsigned long)*data;
        }

        return rc;
}

































    1 





















    1 





    1 





















    1 

    1 




    1 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
// SPDX-License-Identifier: GPL-2.0-or-later
/* Key permission checking
 *
 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/security.h>
#include "internal.h"

/**
 * key_task_permission - Check a key can be used
 * @key_ref: The key to check.
 * @cred: The credentials to use.
 * @need_perm: The permission required.
 *
 * Decide whether @cred may use the key in the way described by @need_perm.
 * The key's own permission bits are consulted first and the security
 * modules then get the final say.  For KEY_NEED_UNLINK and the override /
 * deferred-check requests the permission bits are skipped and only the LSM
 * is asked.
 *
 * The caller must hold either a ref on cred or must hold the RCU readlock.
 *
 * Returns 0 if successful, -EACCES if access is denied based on the
 * permissions bits or the LSM check.
 */
int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
                        enum key_need_perm need_perm)
{
        struct key *key;
        key_perm_t kperm, mask;

        switch (need_perm) {
        case KEY_NEED_VIEW:
                mask = KEY_OTH_VIEW;
                break;
        case KEY_NEED_READ:
                mask = KEY_OTH_READ;
                break;
        case KEY_NEED_WRITE:
                mask = KEY_OTH_WRITE;
                break;
        case KEY_NEED_SEARCH:
                mask = KEY_OTH_SEARCH;
                break;
        case KEY_NEED_LINK:
                mask = KEY_OTH_LINK;
                break;
        case KEY_NEED_SETATTR:
                mask = KEY_OTH_SETATTR;
                break;
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                /* no permission bits to test, go straight to the LSM */
                return security_key_permission(key_ref, cred, need_perm);
        default:
                WARN_ON(1);
                return -EACCES;
        }

        key = key_ref_to_ptr(key_ref);

        if (uid_eq(key->uid, cred->fsuid)) {
                /* the caller owns the key: use the second 8-bits */
                kperm = key->perm >> 16;
        } else if (gid_valid(key->gid) && (key->perm & KEY_GRP_ALL) &&
                   (gid_eq(key->gid, cred->fsgid) ||
                    groups_search(cred->group_info, key->gid))) {
                /* the caller shares a group with the key: use the third
                 * 8-bits */
                kperm = key->perm >> 8;
        } else {
                /* anyone else gets the least-significant 8-bits */
                kperm = key->perm;
        }

        /* possessor permissions (the top 8-bits) are added on top of
         * whichever set was selected above
         */
        if (is_key_possessed(key_ref))
                kperm |= key->perm >> 24;

        if ((kperm & mask) != mask)
                return -EACCES;

        /* let LSM be the final arbiter */
        return security_key_permission(key_ref, cred, need_perm);
}
EXPORT_SYMBOL(key_task_permission);

/**
 * key_validate - Validate a key.
 * @key: The key to be validated.
 *
 * Works on a single snapshot of the key's flags and expiry time.  Returns 0
 * if the key is okay, -ENOKEY if it has been invalidated, -EKEYREVOKED if it
 * is flagged as revoked or dead, or -EKEYEXPIRED if it has a non-zero expiry
 * time that the current real time has reached.
 */
int key_validate(const struct key *key)
{
        unsigned long key_flags = READ_ONCE(key->flags);
        time64_t expires_at = READ_ONCE(key->expiry);

        if (key_flags & (1 << KEY_FLAG_INVALIDATED))
                return -ENOKEY;

        /* revoked and dead keys are no longer accessible */
        if (key_flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD)))
                return -EKEYREVOKED;

        /* an expiry of zero means the key never expires */
        if (expires_at && ktime_get_real_seconds() >= expires_at)
                return -EKEYEXPIRED;

        return 0;
}
EXPORT_SYMBOL(key_validate);


























































































































































































































































































































































































































































































































































































































































































    3 
















































































































































































































































































































































































































































    1 











































































































    1 











































































































    2 












































































































































































































    1 
    1 




















    2 





















































































































































































































































































































    1 










































































































































































































































































































    3 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>

#ifdef __KERNEL__

#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;
struct pt_regs;

extern int sysctl_page_lock_unfairness;

void init_mm_internals(void);

/*
 * max_mapnr is only declared when CONFIG_NEED_MULTIPLE_NODES is not set.
 * With that option enabled set_max_mapnr() is an empty stub, so callers
 * need no #ifdef of their own.
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES        /* Don't use mapnrs, do it properly */
static inline void set_max_mapnr(unsigned long limit)
{
}
#else
extern unsigned long max_mapnr;

/* Store @limit into max_mapnr. */
static inline void set_max_mapnr(unsigned long limit)
{
        max_mapnr = limit;
}
#endif

extern atomic_long_t _totalram_pages;
/*
 * Read the _totalram_pages counter with atomic_long_read() and hand the
 * value back converted to unsigned long.
 */
static inline unsigned long totalram_pages(void)
{
        long nr_pages = atomic_long_read(&_totalram_pages);

        return (unsigned long)nr_pages;
}

/* Increment the _totalram_pages counter by one via atomic_long_inc(). */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Decrement the _totalram_pages counter by one via atomic_long_dec(). */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Add @count to the _totalram_pages counter via atomic_long_add().
 * @count is signed, so a negative value lowers the counter.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;
extern int page_cluster;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern const int mmap_rnd_bits_max;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

#include <asm/page.h>
#include <asm/processor.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 * It's defined as noop for architectures that don't support memory tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif

#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement their own version of
 * mm_zero_struct_page they should wrap the defines below in a #ifndef and
 * define their own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 80
 * or reduces below 56. The idea is that the compiler optimizes out the
 * switch() statement, and only leaves move/store instructions. Also the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
/*
 * Zero a struct page with individual unsigned long stores instead of a
 * memset() call.  The switch is on sizeof(struct page), a compile-time
 * constant, and every larger case falls through into the smaller ones, so
 * exactly sizeof(struct page) / sizeof(unsigned long) words are written.
 */
static inline void __mm_zero_struct_page(struct page *page)
{
        /* View the structure as an array of unsigned long words. */
        unsigned long *_pp = (void *)page;

         /* Check that struct page is either 56, 64, 72, or 80 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 80);

        switch (sizeof(struct page)) {
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                /* Words 0..6 exist for every permitted size. */
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas, this limits the number of vmas
 * per mm struct. Users can overwrite this number by sysctl but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, a section is created
 * per vma. In ELF, the number of sections is represented as an unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to the image of a program
 * when generating a coredump, we need some margin. The number of extra
 * sections is 1-3 now and depends on arch. We use "5" as a safe margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
 * not a hard limit any more. Although some userspace tools can be surprised by
 * that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;

int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

/*
 * Page located @n page frames away from @page: @page is converted to its
 * pfn, @n is added, and the result is converted back to a struct page.
 */
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

/*
 * struct page whose 'lru' member is the list entry just before @head,
 * i.e. the one @head->prev points at.
 */
#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE                0x00000000

#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE        0x00000800        /* ETXTBSY on write attempts.. */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE        BIT(31)                /* KSM may merge identical pages */

#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT        VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0        VM_HIGH_ARCH_0        /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1        VM_HIGH_ARCH_1        /* on x86 and 5-bit value on ppc64   */
# define VM_PKEY_BIT2        VM_HIGH_ARCH_2
# define VM_PKEY_BIT3        VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#if defined(CONFIG_X86)
# define VM_PAT                VM_ARCH_1        /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_IA64)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_0        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_1        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP        (VM_RAND_READ | VM_SEQ_READ)

#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/* Common data flag combinations */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#else
#define VM_STACK        VM_GROWSDOWN
#endif

#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergeable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK        (~(VM_LOCKED | VM_LOCKONFAULT))

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask..
 */
extern pgprot_t protection_map[16];

/**
 * Fault flag definitions.
 *
 * @FAULT_FLAG_WRITE: Fault was a write fault.
 * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED: The fault has been tried once.
 * @FAULT_FLAG_USER: The fault originated in userspace.
 * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry for multiple times,
 * in which case we'll have an initial fault with flags (a) then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 */
#define FAULT_FLAG_WRITE                0x01
#define FAULT_FLAG_MKWRITE              0x02
#define FAULT_FLAG_ALLOW_RETRY          0x04
#define FAULT_FLAG_RETRY_NOWAIT         0x08
#define FAULT_FLAG_KILLABLE             0x10
#define FAULT_FLAG_TRIED                0x20
#define FAULT_FLAG_USER                 0x40
#define FAULT_FLAG_REMOTE               0x80
#define FAULT_FLAG_INSTRUCTION          0x100
#define FAULT_FLAG_INTERRUPTIBLE        0x200

/*
 * Flag set most arch-specific page fault handlers should start from:
 * retry is allowed, and the fault is both killable and interruptible.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - is this the first try of a retryable fault?
 * @flags: FAULT_FLAG_xxx bits describing the fault
 *
 * Useful where holding the mmap_lock while waiting for some other
 * condition could take long: on the first round the caller can politely
 * drop the mmap_lock so that other processes wanting it are not starved.
 *
 * Return: true if FAULT_FLAG_ALLOW_RETRY is set and FAULT_FLAG_TRIED is
 * clear in @flags; false for every other combination.
 */
static inline bool fault_flag_allow_retry_first(unsigned int flags)
{
        const unsigned int retry_bits = FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_TRIED;

        /* Of the two bits, only ALLOW_RETRY may be set. */
        return (flags & retry_bits) == FAULT_FLAG_ALLOW_RETRY;
}

#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of address, if possible.
 */
struct vm_fault {
        struct vm_area_struct *vma;        /* Target VMA */
        unsigned int flags;                /* FAULT_FLAG_xxx flags */
        gfp_t gfp_mask;                        /* gfp mask to be used for allocations;
                                         * filled in by the MM layer, may be
                                         * altered by the fault handler
                                         */
        pgoff_t pgoff;                        /* Logical page offset based on vma */
        unsigned long address;                /* Faulting virtual address */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address'
                                         */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        pte_t orig_pte;                        /* Value of PTE at the time of fault */

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() calls
                                         * alloc_set_pte() from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/*
 * Page entry size passed as the pe_size argument of
 * vm_operations_struct::huge_fault().
 */
enum page_entry_size {
        PE_SIZE_PTE = 0,        /* explicitly 0; the next two are 1 and 2 */
        PE_SIZE_PMD,
        PE_SIZE_PUD,
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
        /* Opening and closing of an area. */
        void (*open)(struct vm_area_struct * area);
        void (*close)(struct vm_area_struct * area);
        /*
         * NOTE(review): presumably invoked when @area is about to be split
         * at @addr and when it is moved by mremap() respectively -- confirm
         * against the callers; this header does not show them.
         */
        int (*split)(struct vm_area_struct * area, unsigned long addr);
        int (*mremap)(struct vm_area_struct * area);
        /*
         * Handle the fault described by @vmf and return a bitmask of
         * VM_FAULT_xxx flags telling how it was handled.
         */
        vm_fault_t (*fault)(struct vm_fault *vmf);
        /* Fault handler that is additionally told the page entry size. */
        vm_fault_t (*huge_fault)(struct vm_fault *vmf,
                        enum page_entry_size pe_size);
        /*
         * Takes a range of page offsets.  Per the prealloc_pte comment in
         * struct vm_fault, this hook calls alloc_set_pte() from atomic
         * context.
         */
        void (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        /* NOTE(review): presumably reports the page size backing @area -- confirm. */
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /* called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs that can switch between memory and hardware
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /* Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally. */
        const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr);
#endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
         * page for @addr.  This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
                                          unsigned long addr);
};

/*
 * Initialise @vma: zero the whole structure, attach it to @mm, point
 * vm_ops at an empty operations table and initialise the anon_vma_chain
 * list head.
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /* Empty table: every callback is NULL, but the pointer itself is not. */
        static const struct vm_operations_struct dummy_vm_ops = {};

        /* Must run first; it would wipe anything assigned before it. */
        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        /* Non-NULL vm_ops: vma_is_anonymous() is false until vma_set_anonymous(). */
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
}

/*
 * Mark @vma as anonymous by clearing its vm_ops pointer; a NULL vm_ops is
 * exactly what vma_is_anonymous() tests for.
 */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

/* An anonymous VMA is one that has no vm_ops table at all. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        return vma->vm_ops == NULL;
}

/*
 * True when @vma can grow (VM_GROWSDOWN or VM_GROWSUP) and still carries
 * every bit of VM_STACK_INCOMPLETE_SETUP, the marker kept in the VMA until
 * the stack is in its final location.
 */
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
        /* A VMA that cannot grow in either direction is not a stack. */
        if ((vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) == 0)
                return false;

        /* All marker bits must be present, not just some of them. */
        return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                        VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A VMA is foreign when the current task has no mm, or when its mm is not
 * the one @vma belongs to.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True when at least one of VM_READ, VM_WRITE or VM_EXEC is set on @vma. */
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

/* True only when @vm_flags has both VM_SHARED and VM_MAYWRITE set. */
static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
        const vm_flags_t required = VM_SHARED | VM_MAYWRITE;

        return (vm_flags & required) == required;
}

/* Apply is_shared_maywrite() to the vm_flags of @vma. */
static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
{
        return is_shared_maywrite(vma->vm_flags);
}

#ifndef CONFIG_SHMEM
/* Without CONFIG_SHMEM no VMA is reported as shmem. */
static inline bool vma_is_shmem(struct vm_area_struct *vma)
{
        return false;
}
#else
/*
 * The vma_is_shmem is not inline because it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
#endif

int vma_is_stack_for_current(struct vm_area_struct *vma);

/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

struct mmu_gather;
struct inode;

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Release one reference on @page.  The return value is true when the
 * refcount fell to zero, i.e. the page has no users left.
 */
static inline int put_page_testzero(struct page *page)
{
        int hit_zero;

        /* Dropping a reference on a page that has none is a bug. */
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
        hit_zero = page_ref_dec_and_test(page);

        return hit_zero;
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 *
 * Implemented as page_ref_add_unless(page, 1, 0), whose result is returned
 * unchanged as an int.
 */
static inline int get_page_unless_zero(struct page *page)
{
        return page_ref_add_unless(page, 1, 0);
}

extern int page_is_ram(unsigned long pfn);

/*
 * Classification codes, numbered 0..2 in declaration order.
 * NOTE(review): presumably the possible results of region_intersects(),
 * declared right after this enum -- confirm against its implementation.
 */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */

#ifndef is_ioremap_addr
#define is_ioremap_addr(x) is_vmalloc_addr(x)
#endif

#ifndef CONFIG_MMU
/* Without an MMU no address is reported as a vmalloc address. */
static inline bool is_vmalloc_addr(const void *x)
{
        return false;
}

/* Without an MMU no address is reported as a vmalloc or module address. */
static inline int is_vmalloc_or_module_addr(const void *x)
{
        return 0;
}
#else
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#endif

extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
/* kvmalloc_node() with the node argument fixed to NUMA_NO_NODE. */
static inline void *kvmalloc(size_t size, gfp_t flags)
{
        return kvmalloc_node(size, flags, NUMA_NO_NODE);
}
/* kvmalloc_node() with __GFP_ZERO OR-ed into @flags. */
static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
{
        return kvmalloc_node(size, flags | __GFP_ZERO, node);
}
/*
 * Zeroing variant of kvmalloc(): ends up in kvmalloc_node() with
 * __GFP_ZERO OR-ed into @flags and NUMA_NO_NODE as the node.
 */
static inline void *kvzalloc(size_t size, gfp_t flags)
{
        return kvzalloc_node(size, flags, NUMA_NO_NODE);
}

/*
 * Allocate room for @n elements of @size bytes each with kvmalloc().
 * Returns NULL without allocating when @n * @size overflows size_t.
 */
static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;
        bool overflowed = check_mul_overflow(n, size, &total);

        return unlikely(overflowed) ? NULL : kvmalloc(total, flags);
}

/* kvmalloc_array() with __GFP_ZERO OR-ed into @flags. */
static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
{
        return kvmalloc_array(n, size, flags | __GFP_ZERO);
}

extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
                gfp_t flags);
extern void kvfree(const void *addr);
extern void kvfree_sensitive(const void *addr, size_t len);

/*
 * Read the counter that compound_mapcount_ptr() designates for @head and
 * return it plus one.
 */
static inline int head_compound_mapcount(struct page *head)
{
        int raw = atomic_read(compound_mapcount_ptr(head));

        return raw + 1;
}

/*
 * Mapcount of the compound page as a whole; mapped sub-pages are not
 * included.
 *
 * Must only be called for a compound page or any of its tail sub-pages.
 */
static inline int compound_mapcount(struct page *page)
{
        struct page *head;

        VM_BUG_ON_PAGE(!PageCompound(page), page);
        head = compound_head(page);

        return head_compound_mapcount(head);
}

/*
 * The atomic page->_mapcount starts from -1, so that transitions both away
 * from it and back to it can be tracked with atomic_inc_and_test() and
 * atomic_add_negative(-1).  This helper puts the counter back to that
 * starting value.
 */
static inline void page_mapcount_reset(struct page *page)
{
        atomic_set(&page->_mapcount, -1);
}

int __page_mapcount(struct page *page);

/*
 * Mapcount of a 0-order page; for a sub-page of a compound page the result
 * also includes compound_mapcount().
 *
 * The result is undefined for pages which cannot be mapped into userspace,
 * e.g. SLAB or special types of pages (see page_has_type()): they use this
 * place in struct page differently.
 */
static inline int page_mapcount(struct page *page)
{
        /* Common case: not compound, the page's own counter is the answer. */
        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        return __page_mapcount(page);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: the total is just page_mapcount(). */
static inline int total_mapcount(struct page *page)
{
        int count = page_mapcount(page);

        return count;
}

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE: return page_mapcount() and, when the caller
 * passed a non-NULL @total_mapcount, store the same value there as well.
 */
static inline int page_trans_huge_mapcount(struct page *page,
                                           int *total_mapcount)
{
        const int count = page_mapcount(page);

        if (total_mapcount)
                *total_mapcount = count;

        return count;
}
#endif

/* Translate @x with virt_to_page() and return compound_head() of the result. */
static inline struct page *virt_to_head_page(const void *x)
{
        return compound_head(virt_to_page(x));
}

void __put_page(struct page *page);

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);

/*
 * Compound pages have a destructor function.  Provide a
 * prototype for that function and accessor functions.
 * These are _only_ valid on the head of a compound page.
 */
typedef void compound_page_dtor(struct page *);

/*
 * Identifiers used to index the compound_page_dtors[] array.
 *
 * Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c
 */
enum compound_dtor_id {
        NULL_COMPOUND_DTOR,
        COMPOUND_PAGE_DTOR,
#ifdef CONFIG_HUGETLB_PAGE
        HUGETLB_PAGE_DTOR,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        TRANSHUGE_PAGE_DTOR,
#endif
        /* Not an id itself: number of ids above, i.e. the size of the array. */
        NR_COMPOUND_DTORS,
};
extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];

/*
 * Record @compound_dtor in the struct page that follows @page (page[1]).
 * With VM debugging enabled, an id outside the enum range is a bug.
 */
static inline void set_compound_page_dtor(struct page *page,
                enum compound_dtor_id compound_dtor)
{
        VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);

        (page + 1)->compound_dtor = compound_dtor;
}

static inline void destroy_compound_page(struct page *page)
{
        VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
        compound_page_dtors[page[1].compound_dtor](page);
}

/*
 * Order stored in page[1].compound_order when @page passes PageHead();
 * 0 for any other page.
 */
static inline unsigned int compound_order(struct page *page)
{
        return PageHead(page) ? page[1].compound_order : 0;
}

static inline bool hpage_pincount_available(struct page *page)
{
        /*
         * Can the page->hpage_pinned_refcount field be used? That field is in
         * the 3rd page of the compound page, so the smallest (2-page) compound
         * pages cannot support it.
         */
        page = compound_head(page);
        return PageCompound(page) && compound_order(page) > 1;
}

/* Read the counter that compound_pincount_ptr() designates for @head. */
static inline int head_compound_pincount(struct page *head)
{
        int pins = atomic_read(compound_pincount_ptr(head));

        return pins;
}

/*
 * Pin count of the compound page that @page belongs to.  Only valid when
 * hpage_pincount_available() holds for @page.
 */
static inline int compound_pincount(struct page *page)
{
        struct page *head;

        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
        head = compound_head(page);

        return head_compound_pincount(head);
}

/*
 * Store @order and the matching page count (1 << order) in the struct page
 * that follows @page (page[1]).
 */
static inline void set_compound_order(struct page *page, unsigned int order)
{
        struct page *first_tail = &page[1];

        first_tail->compound_order = order;
        first_tail->compound_nr = 1U << order;
}

/*
 * Number of pages in this potentially compound page: page[1].compound_nr
 * when @page passes PageHead(), otherwise 1.
 */
static inline unsigned long compound_nr(struct page *page)
{
        return PageHead(page) ? page[1].compound_nr : 1;
}

/*
 * Number of bytes in this potentially compound page: PAGE_SIZE shifted
 * left by compound_order().
 */
static inline unsigned long page_size(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SIZE << order;
}

/*
 * Number of bits needed for the number of bytes in this page:
 * PAGE_SHIFT plus compound_order().
 */
static inline unsigned int page_shift(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SHIFT + order;
}

void free_compound_page(struct page *page);

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case we always want
 * pte_mkwrite, but get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        /* Rare case: the vma is not writable, hand the pte back untouched. */
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pte;

        return pte_mkwrite(pte);
}

vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/*
 * Page flags:
 * | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | [KASAN_TAG] | ... | FLAGS |
 *
 * Each *_PGOFF is the bit offset of its field in page->flags.  The fields
 * are packed downwards from the most significant bit of an unsigned long:
 * every offset is the previous field's offset minus this field's width.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

/*
 * Define the bit shifts used to access each field of page->flags.  For a
 * non-existent field (width 0) the shift is defined as 0; that plus a 0
 * mask ensures the compiler will optimise away references to it.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/*
 * NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator.
 *
 * ZONEID_SHIFT is the combined width of the two fields that make up the id;
 * ZONEID_PGOFF is the lower of their two bit offsets in page->flags.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
/* No node in page->flags: the id is built from SECTION and ZONE. */
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF)? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
/* Node available in page->flags: the id is built from NODE and ZONE. */
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF)? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

/* As for the per-field shifts: 0 when the id has no bits at all. */
#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Value masks for the page->flags fields: all-ones in the low bits, meant
 * to be applied after the field has been shifted down by its *_PGSHIFT
 * (or shifted up by *_PGSHIFT to select the field in place).
 *
 * NOTE(review): LAST_CPUPID_MASK is built from LAST_CPUPID_SHIFT whereas
 * the other per-field masks use their *_WIDTH macro - presumably deliberate,
 * confirm against where LAST_CPUPID_SHIFT/WIDTH are defined.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/* Extract the ZONE field from page->flags. */
static inline enum zone_type page_zonenum(const struct page *page)
{
        unsigned long zone_bits;

        ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
        zone_bits = page->flags >> ZONES_PGSHIFT;

        return zone_bits & ZONES_MASK;
}

#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_DEVICE;
}
extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/* !CONFIG_ZONE_DEVICE stub: no page is ever a zone-device page. */
static inline bool is_zone_device_page(const struct page *page)
{
        return false;
}
#endif

#ifdef CONFIG_DEV_PAGEMAP_OPS
void free_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);

/*
 * True only when the devmap_managed_key static branch is enabled, @page is
 * a zone-device page, and its pgmap type is MEMORY_DEVICE_PRIVATE or
 * MEMORY_DEVICE_FS_DAX.
 */
static inline bool page_is_devmap_managed(struct page *page)
{
        if (!static_branch_unlikely(&devmap_managed_key))
                return false;

        if (!is_zone_device_page(page))
                return false;

        switch (page->pgmap->type) {
        case MEMORY_DEVICE_PRIVATE:
        case MEMORY_DEVICE_FS_DAX:
                return true;
        default:
                return false;
        }
}

void put_devmap_managed_page(struct page *page);

#else /* CONFIG_DEV_PAGEMAP_OPS */
/* !CONFIG_DEV_PAGEMAP_OPS stub: no page is devmap managed. */
static inline bool page_is_devmap_managed(struct page *page)
{
        return false;
}

/* !CONFIG_DEV_PAGEMAP_OPS stub: nothing to do. */
static inline void put_devmap_managed_page(struct page *page)
{
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

static inline bool is_device_private_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
                IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
                is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

static inline bool is_pci_p2pdma_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
                IS_ENABLED(CONFIG_PCI_P2PDMA) &&
                is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}

/*
 * True when page_ref_count(page) lies in the range [-127, 0]: after the
 * cast to unsigned int, adding 127 brings exactly those values into
 * 0..127 (the negative ones by wrapping around), so a single unsigned
 * compare covers both "zero" and "slightly negative, i.e. overflowed".
 *
 * 127: arbitrary random number, small enough to assemble well
 */
#define page_ref_zero_or_close_to_overflow(page) \
        ((unsigned int) page_ref_count(page) + 127u <= 127u)

/*
 * Take an additional reference on @page; the increment is applied to its
 * compound head.
 */
static inline void get_page(struct page *page)
{
        struct page *head = compound_head(page);

        /*
         * Getting a normal page or the head of a compound page
         * requires to already have an elevated page->_refcount.
         */
        VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(head), head);

        page_ref_inc(head);
}

bool __must_check try_grab_page(struct page *page, unsigned int flags);

/*
 * Like get_page(), but if the head page's reference count is not positive
 * this warns (once) and returns false without touching the count.
 * Returns true when the reference was taken.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        struct page *head = compound_head(page);

        if (WARN_ON_ONCE(page_ref_count(head) <= 0))
                return false;

        page_ref_inc(head);

        return true;
}

/*
 * Drop one reference on @page (applied to its compound head).  When
 * put_page_testzero() reports true the page is handed to __put_page().
 */
static inline void put_page(struct page *page)
{
        struct page *head = compound_head(page);

        /*
         * For devmap managed pages we need to catch refcount transition from
         * 2 to 1, when refcount reach one it means the page is free and we
         * need to inform the device driver through callback. See
         * include/linux/memremap.h and HMM for details.
         */
        if (page_is_devmap_managed(head))
                put_devmap_managed_page(head);
        else if (put_page_testzero(head))
                __put_page(head);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding a power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in page_cache_get_speculative()
 * and page_cache_gup_pin_speculative() provides safe operation for
 * get_user_pages and page_mkclean and other calls that race to set up page
 * table entries.
 */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);

/**
 * page_maybe_dma_pinned() - report if a page is pinned for DMA.
 * @page:        pointer to page to be queried.
 *
 * This function checks if a page has been pinned via a call to
 * pin_user_pages*().
 *
 * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal page references".
 *
 * False positives are OK, because: a) it's unlikely for a page to get that many
 * refcounts, and b) all the callers of this routine are expected to be able to
 * deal gracefully with a false positive.
 *
 * For huge pages, the result will be exactly correct. That's because we have
 * more tracking data available: the 3rd struct page in the compound page is
 * used to track the pincount (instead of using the GUP_PIN_COUNTING_BIAS
 * scheme).
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the page has been "dma-pinned".
 *         False, if the page is definitely not dma-pinned.
 */
static inline bool page_maybe_dma_pinned(struct page *page)
{
        unsigned int refs;

        /* A dedicated pin counter exists: its value is exact. */
        if (hpage_pincount_available(page))
                return compound_pincount(page) > 0;

        /*
         * page_ref_count() is signed. If that refcount overflows, then
         * page_ref_count() returns a negative value, and callers will avoid
         * further incrementing the refcount.
         *
         * Here, for that overflow case, use the signed bit to count a little
         * bit higher via unsigned math, and thus still get an accurate result.
         */
        refs = (unsigned int)page_ref_count(compound_head(page));

        return refs >= GUP_PIN_COUNTING_BIAS;
}

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * This identification function is mainly used by the buddy allocator to
 * decide whether two pages could be buddies.  It does not really identify
 * the zone: when the node id is not available in page flags, the section
 * number is used in its place.
 * The only guarantee is that two combinable pages in a zone yield the same
 * value.
 */
static inline int page_zone_id(struct page *page)
{
        unsigned long id_bits = page->flags >> ZONEID_PGSHIFT;

        return id_bits & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
/*
 * Extract the NODE field from page->flags, after passing the page through
 * PF_POISONED_CHECK().
 */
static inline int page_to_nid(const struct page *page)
{
        /* PF_POISONED_CHECK() is handed a non-const pointer, hence the cast. */
        struct page *unconst = (struct page *)page;
        unsigned long flags;

        flags = PF_POISONED_CHECK(unconst)->flags;

        return (flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

#ifdef CONFIG_NUMA_BALANCING
/*
 * Pack @cpu and @pid into one cpupid value: the masked cpu sits above
 * LAST__PID_SHIFT, the masked pid occupies the bits below it.
 */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) |
               (pid & LAST__PID_MASK);
}

/* The pid part of @cpupid (its bits under LAST__PID_MASK). */
static inline int cpupid_to_pid(int cpupid)
{
        int pid_bits = cpupid & LAST__PID_MASK;

        return pid_bits;
}

/* The cpu part of @cpupid (the bits above LAST__PID_SHIFT, masked). */
static inline int cpupid_to_cpu(int cpupid)
{
        int upper = cpupid >> LAST__PID_SHIFT;

        return upper & LAST__CPU_MASK;
}

/* cpu_to_node() of the cpu part of @cpupid. */
static inline int cpupid_to_nid(int cpupid)
{
        int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/* True when the pid part of @cpupid is all ones, i.e. -1 under the mask. */
static inline bool cpupid_pid_unset(int cpupid)
{
        int pid = cpupid_to_pid(cpupid);

        return pid == (-1 & LAST__PID_MASK);
}

/* True when the cpu part of @cpupid is all ones, i.e. -1 under the mask. */
static inline bool cpupid_cpu_unset(int cpupid)
{
        int cpu = cpupid_to_cpu(cpupid);

        return cpu == (-1 & LAST__CPU_MASK);
}

/* Compare @task_pid, masked with LAST__PID_MASK, to the pid part of @cpupid. */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        return cpupid_to_pid(cpupid) == (task_pid & LAST__PID_MASK);
}

#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
/*
 * LAST_CPUPID_NOT_IN_PAGE_FLAGS: the value is kept in page->_last_cpupid.
 * Exchange it for @cpupid (masked with LAST_CPUPID_MASK) using xchg() and
 * return what xchg() hands back.
 */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
        int previous = xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);

        return previous;
}

/* Plain read of page->_last_cpupid. */
static inline int page_cpupid_last(struct page *page)
{
        int last = page->_last_cpupid;

        return last;
}

/* Set page->_last_cpupid to all ones under LAST_CPUPID_MASK (-1 masked). */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
/* Extract the LAST_CPUPID field from page->flags. */
static inline int page_cpupid_last(struct page *page)
{
        unsigned long cpupid_bits = page->flags >> LAST_CPUPID_PGSHIFT;

        return cpupid_bits & LAST_CPUPID_MASK;
}

extern int page_cpupid_xchg_last(struct page *page, int cpupid);

/* Set every bit of the LAST_CPUPID field in page->flags. */
static inline void page_cpupid_reset_last(struct page *page)
{
        const unsigned long field = LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;

        page->flags |= field;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
#else /* !CONFIG_NUMA_BALANCING */
/*
 * !CONFIG_NUMA_BALANCING: nothing is stored; @cpupid is ignored and the
 * page's node id is returned instead.
 */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
        int nid = page_to_nid(page); /* XXX */

        return nid;
}

/* !CONFIG_NUMA_BALANCING: returns the page's node id. */
static inline int page_cpupid_last(struct page *page)
{
        int nid = page_to_nid(page); /* XXX */

        return nid;
}

/*
 * !CONFIG_NUMA_BALANCING stubs: there is no cpupid information, so every
 * decoder and the encoder return -1 regardless of their arguments.
 */
static inline int cpupid_to_nid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_pid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_cpu(int cpupid)
{
        return -1;
}

static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        return -1;
}

/* !CONFIG_NUMA_BALANCING stub: the pid part always counts as unset. */
static inline bool cpupid_pid_unset(int cpupid)
{
        return true;
}

/* !CONFIG_NUMA_BALANCING stub: nothing to reset. */
static inline void page_cpupid_reset_last(struct page *page)
{
}

/* !CONFIG_NUMA_BALANCING stub: a task never matches a cpupid. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_KASAN_SW_TAGS

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

/*
 * Read the KASAN_TAG field of page->flags and undo the xor with 0xff that
 * is applied when the tag is stored.
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        u8 stored = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;

        return stored ^ 0xff;
}

/*
 * Store @tag, xor'ed with 0xff, in the KASAN_TAG field of page->flags:
 * the field is cleared first and the new bits are then or'ed in.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        const unsigned long field = KASAN_TAG_MASK << KASAN_TAG_PGSHIFT;
        u8 stored = tag ^ 0xff;

        page->flags &= ~field;
        page->flags |= (stored & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
}

static inline void page_kasan_tag_reset(struct page *page)
{
        page_kasan_tag_set(page, 0xff);
}
#else
/* !CONFIG_KASAN_SW_TAGS stub: every page reports the tag 0xff. */
static inline u8 page_kasan_tag(const struct page *page)
{
        return 0xff;
}

/* !CONFIG_KASAN_SW_TAGS stub: there is no tag to store. */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
}

/* !CONFIG_KASAN_SW_TAGS stub: there is no tag to reset. */
static inline void page_kasan_tag_reset(struct page *page)
{
}
#endif

static inline struct zone *page_zone(const struct page *page)
{
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

/* NODE_DATA() of the node id stored for @page. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
        int nid = page_to_nid(page);

        return NODE_DATA(nid);
}

#ifdef SECTION_IN_PAGE_FLAGS
/*
 * Write @section into the SECTION field of page->flags: clear the field,
 * then or in the masked value.
 */
static inline void set_page_section(struct page *page, unsigned long section)
{
        const unsigned long field = SECTIONS_MASK << SECTIONS_PGSHIFT;

        page->flags &= ~field;
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

/* Extract the SECTION field from page->flags. */
static inline unsigned long page_to_section(const struct page *page)
{
        unsigned long section_bits = page->flags >> SECTIONS_PGSHIFT;

        return section_bits & SECTIONS_MASK;
}
#endif

/*
 * Write @zone into the ZONE field of page->flags: clear the field, then
 * or in the masked value.
 */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        const unsigned long field = ZONES_MASK << ZONES_PGSHIFT;

        page->flags &= ~field;
        page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

/*
 * Write @node into the NODE field of page->flags: clear the field, then
 * or in the masked value.
 */
static inline void set_page_node(struct page *page, unsigned long node)
{
        const unsigned long field = NODES_MASK << NODES_PGSHIFT;

        page->flags &= ~field;
        page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

/*
 * Fill in the location fields of page->flags in one go: zone, node and,
 * when SECTION_IN_PAGE_FLAGS is defined, the section number derived from
 * @pfn with pfn_to_section_nr().
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
        set_page_zone(page, zone);
        set_page_node(page, node);

#ifdef SECTION_IN_PAGE_FLAGS
        /* Only here is @pfn actually used. */
        set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

#ifdef CONFIG_MEMCG
/* Plain read of page->mem_cgroup. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        struct mem_cgroup *memcg = page->mem_cgroup;

        return memcg;
}

/*
 * READ_ONCE() of page->mem_cgroup.  Warns once if called while
 * rcu_read_lock_held() is false.
 */
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
        struct mem_cgroup *memcg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        memcg = READ_ONCE(page->mem_cgroup);

        return memcg;
}
#else
/* !CONFIG_MEMCG stub: a page never has a memory cgroup. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        return NULL;
}

/*
 * !CONFIG_MEMCG stub: still warns once if rcu_read_lock_held() is false,
 * then returns NULL.
 */
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        return NULL;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

/* Address of @page as computed by page_to_virt(). */
static __always_inline void *lowmem_page_address(const struct page *page)
{
        void *vaddr = page_to_virt(page);

        return vaddr;
}

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
/* WANT_PAGE_VIRTUAL: the address is cached in page->virtual; read it. */
static inline void *page_address(const struct page *page)
{
        void *vaddr = page->virtual;

        return vaddr;
}

/* WANT_PAGE_VIRTUAL: record @address in page->virtual. */
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

extern void *page_rmapping(struct page *page);
extern struct anon_vma *page_anon_vma(struct page *page);
extern struct address_space *page_mapping(struct page *page);

extern struct address_space *__page_file_mapping(struct page *);

/*
 * page->mapping for ordinary pages; pages with PageSwapCache() set are
 * resolved through __page_file_mapping() instead.
 */
static inline
struct address_space *page_file_mapping(struct page *page)
{
        if (likely(!PageSwapCache(page)))
                return page->mapping;

        return __page_file_mapping(page);
}

extern pgoff_t __page_file_index(struct page *page);

/*
 * Return the pagecache index of the passed page.  Regular pagecache pages
 * use ->index, whereas swapcache pages use swp_offset(->private), obtained
 * here through __page_file_index().
 */
static inline pgoff_t page_index(struct page *page)
{
        if (likely(!PageSwapCache(page)))
                return page->index;

        return __page_file_index(page);
}

bool page_mapped(struct page *page);
struct address_space *page_mapping(struct page *page);
struct address_space *page_mapping_file(struct page *page);

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(struct page *page)
{
        /*
         * The marker is an all-ones page->index.  A real page index
         * cannot be this large, so such a page must be a pfmemalloc page.
         */
        const unsigned long pfmemalloc_marker = -1UL;

        return page->index == pfmemalloc_marker;
}

/*
 * Mark @page as pfmemalloc by storing an all-ones value in page->index.
 *
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        const unsigned long pfmemalloc_marker = -1UL;

        page->index = pfmemalloc_marker;
}

/* Drop the pfmemalloc marker by putting page->index back to 0. */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->index = 0;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_thp(page, p)        ((unsigned long)(p) & (thp_size(page) - 1))

/*
 * Flags passed to show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);

#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
/* !CONFIG_MMU stub: the answer is always false. */
static inline bool can_do_mlock(void)
{
        return false;
}
#endif
extern int user_shm_lock(size_t, struct user_struct *);
extern void user_shm_unlock(size_t, struct user_struct *);

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 * Per the field notes below, it narrows an unmap to one mapping, to a
 * range of page indices, or to a single locked page.
 */
struct zap_details {
        struct address_space *check_mapping;        /* Check page->mapping if set */
        pgoff_t        first_index;                        /* Lowest page->index to unmap */
        pgoff_t last_index;                        /* Highest page->index to unmap */
        struct page *single_page;                /* Locked page to be unmapped */
};

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
                    unsigned long size);
void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long start, unsigned long end);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
                          struct mmu_notifier_range *range, pte_t **ptepp,
                          pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pte(struct mm_struct *mm, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
                unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int truncate_inode_page(struct address_space *mapping, struct page *page);
int generic_error_remove_page(struct address_space *mapping, struct page *page);
int invalidate_inode_page(struct page *page);

#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/*
 * !CONFIG_MMU stub.  Reaching it is treated as a bug; the return statement
 * after BUG() only supplies a value of the declared type.
 */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();

        return VM_FAULT_SIGBUS;
}
/*
 * !CONFIG_MMU stub.  Reaching it is treated as a bug; the return statement
 * after BUG() only supplies a value of the declared type.
 */
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();

        return -EFAULT;
}
/* !CONFIG_MMU stubs: the unmap helpers below are all empty. */
static inline void unmap_mapping_page(struct page *page)
{
}

static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows)
{
}

static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
}
#endif

/*
 * Convenience wrapper: call unmap_mapping_range() on the given hole with
 * its even_cows argument fixed to 0.
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        const int even_cows = 0;

        unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}

/*
 * Accessors for memory belonging to a target task/mm.  All variants take an
 * address, a buffer, a length and GUP flags; they differ only in how the
 * target is identified: by task, by mm, or by both.  The meaning of the int
 * return value is defined by the implementations, not visible here.
 */
extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, unsigned int gup_flags);

/*
 * get_user_pages*() / pin_user_pages*() family.  Every get_ variant below
 * has a pin_ twin with an identical signature.
 *
 * Signature differences worth knowing when switching between variants:
 *  - the _remote variants take an explicit mm; the others take none;
 *  - the _unlocked variants take @pages before @gup_flags, the reverse of
 *    every other variant;
 *  - the _fast variants use int for nr_pages and for the return value,
 *    while the rest use unsigned long / long.
 */
long get_user_pages_remote(struct mm_struct *mm,
                            unsigned long start, unsigned long nr_pages,
                            unsigned int gup_flags, struct page **pages,
                            struct vm_area_struct **vmas, int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           struct vm_area_struct **vmas, int *locked);
long get_user_pages(unsigned long start, unsigned long nr_pages,
                            unsigned int gup_flags, struct page **pages,
                            struct vm_area_struct **vmas);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages,
                    struct vm_area_struct **vmas);
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages, int *locked);
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);

int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);

/*
 * Locked-memory accounting for @mm.
 * NOTE(review): judging by the parameter names, @inc selects between
 * charging and uncharging @pages, and @bypass_rlim skips the rlimit check;
 * confirm both against the implementation.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);

/*
 * Container for pinned pfns / pages.
 *
 * ptrs[] holds either pfns or struct page pointers, as indicated by
 * is_pfns.  Do not index ptrs[] directly: frame_vector_pages() and
 * frame_vector_pfns() convert the array to the requested representation
 * before returning it.
 */
struct frame_vector {
        unsigned int nr_allocated;        /* Number of frames we have space for */
        unsigned int nr_frames;        /* Number of frames stored in ptrs array */
        bool got_ref;                /* Did we pin pages by getting page ref? */
        bool is_pfns;                /* true: array holds pfns; false: pages */
        void *ptrs[];                /* Array of pinned pfns / pages. Use
                                 * frame_vector_pages() or frame_vector_pfns()
                                 * for access */
};

/*
 * frame_vector lifecycle and conversion API (implemented elsewhere).
 *
 * frame_vector_to_pages() returns an int that frame_vector_pages() treats
 * as an error code when nonzero; frame_vector_to_pfns() returns nothing.
 */
struct frame_vector *frame_vector_create(unsigned int nr_frames);
void frame_vector_destroy(struct frame_vector *vec);
int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
                     unsigned int gup_flags, struct frame_vector *vec);
void put_vaddr_frames(struct frame_vector *vec);
int frame_vector_to_pages(struct frame_vector *vec);
void frame_vector_to_pfns(struct frame_vector *vec);

/* Return the number of frames currently stored in @vec (vec->nr_frames). */
static inline unsigned int frame_vector_count(struct frame_vector *vec)
{
        const unsigned int stored = vec->nr_frames;

        return stored;
}

/*
 * Return @vec's array viewed as struct page pointers.
 *
 * If the vector currently holds pfns it is first converted with
 * frame_vector_to_pages(); when that conversion reports an error the error
 * is returned encoded via ERR_PTR() instead of an array pointer.
 */
static inline struct page **frame_vector_pages(struct frame_vector *vec)
{
        int err;

        /* Already pages: hand out the array as is. */
        if (!vec->is_pfns)
                return (struct page **)(vec->ptrs);

        err = frame_vector_to_pages(vec);
        if (err)
                return ERR_PTR(err);

        return (struct page **)(vec->ptrs);
}

/*
 * Return @vec's array viewed as pfns, converting it in place with
 * frame_vector_to_pfns() first if it currently holds pages.
 */
static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
{
        if (vec->is_pfns)
                return (unsigned long *)(vec->ptrs);

        frame_vector_to_pfns(vec);
        return (unsigned long *)(vec->ptrs);
}

/* Forward declaration: only a pointer to struct kvec is needed below. */
struct kvec;
/*
 * Page lookup helpers implemented elsewhere.  get_kernel_pages() works on
 * a kvec array, get_kernel_page() on a single start address; both take a
 * @write flag and fill @pages.  get_dump_page() returns a single page for
 * @addr.
 */
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                        struct page **pages);
int get_kernel_page(unsigned long start, int write, struct page **pages);
struct page *get_dump_page(unsigned long addr);

/* Page release / invalidation entry points (implemented elsewhere). */
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
                              unsigned int length);

/*
 * Page dirty-state API (implemented elsewhere).  __cancel_dirty_page() is
 * the slow path behind the cancel_dirty_page() inline, which only calls it
 * when PageDirty() is set.
 */
void __set_page_dirty(struct page *, struct address_space *, int warn);
int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);
void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_cleaned(struct page *page, struct address_space *mapping,
                          struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
void __cancel_dirty_page(struct page *page);
/*
 * Cancel the dirty state of @page.  The PageDirty() test up front keeps
 * the common clean-page case free of the atomic ops, locking, etc. done by
 * __cancel_dirty_page().
 */
static inline void cancel_dirty_page(struct page *page)
{
        if (!PageDirty(page))
                return;

        __cancel_dirty_page(page);
}
int clear_page_dirty_for_io(struct page *page);

/* Works on @task with a caller-supplied @buffer of @buflen bytes. */
int get_cmdline(struct task_struct *task, char *buffer, int buflen);

/*
 * Operates on @len bytes of page tables, from @old_addr in @vma to
 * @new_addr in @new_vma.  The meaning of the unsigned long return value is
 * defined by the implementation, not visible here.
 */
extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks);

/*
 * Flags used by change_protection().  They form a bitmap so that several
 * flags can be passed at once, just like separate parameters.  For now,
 * however, every caller uses only one of the flags at a time.
 */
/* Allow dirty bit accounting for this protection change. */
#define MM_CP_DIRTY_ACCT        (1UL << 0)
/* The protection change is for NUMA hints. */
#define MM_CP_PROT_NUMA         (1UL << 1)
/* The change is for write protecting: do wp ... */
#define MM_CP_UFFD_WP           (1UL << 2)
/* ... or resolve wp. */
#define MM_CP_UFFD_WP_RESOLVE   (1UL << 3)
/* Mask covering both write-protect flags above. */
#define MM_CP_UFFD_WP_ALL       (MM_CP_UFFD_WP | MM_CP_UFFD_WP_RESOLVE)

/*
 * Apply @newprot to [@start, @end) of @vma; @cp_flags is a bitmap of the
 * MM_CP_* flags.  NOTE(review): whether @end is exclusive and what the
 * return value counts are not visible here; confirm in the implementation.
 */
extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, pgprot_t newprot,
                              unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
                          struct vm_area_struct **pprev, unsigned long start,
                          unsigned long end, unsigned long newflags);

/*
 * Fast-only GUP variants: they do not attempt to fault pages in, so they
 * may return short, i.e. with fewer pages than requested.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

/*
 * Single-page wrapper around get_user_pages_fast_only(): returns true only
 * when exactly one page was obtained for @addr and stored through @pagep.
 */
static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        int nr_got = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr_got == 1;
}
/*
 * per-process(per-mm_struct) statistics.
 */

/*
 * Read RSS counter @member of @mm.  With SPLIT_RSS_COUNTING a negative
 * value is reported as 0.
 */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        long val = atomic_long_read(&mm->rss_stat.count[member]);

#ifdef SPLIT_RSS_COUNTING
        /*
         * The counter is updated asynchronously and may go negative.
         * A negative count is never what users expect, so clamp it.
         */
        val = (val < 0) ? 0 : val;
#endif
        return (unsigned long)val;
}

/* Tracing hook; the counter helpers pass it the post-update value. */
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);

/*
 * Atomically add @value to RSS counter @member of @mm and report the
 * resulting count to mm_trace_rss_stat().
 */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        mm_trace_rss_stat(mm, member,
                          atomic_long_add_return(value,
                                                 &mm->rss_stat.count[member]));
}

/*
 * Atomically increment RSS counter @member of @mm and report the resulting
 * count to mm_trace_rss_stat().
 */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        mm_trace_rss_stat(mm, member,
                          atomic_long_inc_return(&mm->rss_stat.count[member]));
}

/*
 * Atomically decrement RSS counter @member of @mm and report the resulting
 * count to mm_trace_rss_stat().
 */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        mm_trace_rss_stat(mm, member,
                          atomic_long_dec_return(&mm->rss_stat.count[member]));
}

/*
 * Optimized variant of mm_counter() for when the page is already known not
 * to be PageAnon: swap-backed pages count as MM_SHMEMPAGES, everything
 * else as MM_FILEPAGES.
 */
static inline int mm_counter_file(struct page *page)
{
        return PageSwapBacked(page) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

/*
 * Select the RSS counter index for @page: MM_ANONPAGES for anonymous
 * pages, otherwise whatever mm_counter_file() picks.
 */
static inline int mm_counter(struct page *page)
{
        return PageAnon(page) ? MM_ANONPAGES : mm_counter_file(page);
}

/* Total RSS of @mm: the sum of its file, anon and shmem page counters. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long file_pages = get_mm_counter(mm, MM_FILEPAGES);
        unsigned long anon_pages = get_mm_counter(mm, MM_ANONPAGES);
        unsigned long shmem_pages = get_mm_counter(mm, MM_SHMEMPAGES);

        return file_pages + anon_pages + shmem_pages;
}

/*
 * High-water RSS of @mm: the larger of the recorded mm->hiwater_rss and
 * the current RSS.
 */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_rss;
        unsigned long current_rss = get_mm_rss(mm);

        return max(recorded, current_rss);
}

/*
 * High-water VM size of @mm: the larger of the recorded mm->hiwater_vm and
 * the current mm->total_vm.
 */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_vm;
        unsigned long current_vm = mm->total_vm;

        return max(recorded, current_vm);
}

/* Raise mm->hiwater_rss to the current RSS if the RSS has grown past it. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_rss(mm);

        if (rss > mm->hiwater_rss)
                mm->hiwater_rss = rss;
}

/* Raise mm->hiwater_vm to mm->total_vm if total_vm has grown past it. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        if (mm->hiwater_vm >= mm->total_vm)
                return;

        mm->hiwater_vm = mm->total_vm;
}

/* Restart high-water tracking: set mm->hiwater_rss to the current RSS. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_rss(mm);

        mm->hiwater_rss = rss;
}

/*
 * Fold @mm's high-water RSS into *@maxrss: *@maxrss is only ever raised,
 * and is left untouched when it is already at least as large.
 */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak <= *maxrss)
                return;

        *maxrss = peak;
}

/*
 * sync_mm_rss() is a real function only with SPLIT_RSS_COUNTING; otherwise
 * it is an empty stub.
 */
#ifdef SPLIT_RSS_COUNTING
void sync_mm_rss(struct mm_struct *mm);
#else /* !SPLIT_RSS_COUNTING */
static inline void sync_mm_rss(struct mm_struct *mm) { }
#endif /* SPLIT_RSS_COUNTING */

/*
 * Fallbacks used when CONFIG_ARCH_HAS_PTE_SPECIAL is not set: no pte is
 * ever reported as special, and pte_mkspecial() returns its argument
 * unchanged.
 */
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
        return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif

/*
 * Fallback used when CONFIG_ARCH_HAS_PTE_DEVMAP is not set: no pte is ever
 * reported as devmap.
 */
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
        return 0;
}
#endif

int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Wrapper around __get_locked_pte() that returns its result unchanged.
 * The call is routed through __cond_lock(*ptl, ...), presumably the static
 * checker's conditional-lock annotation (confirm), so keep the expression
 * in exactly this form.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *ptep;
        __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
        return ptep;
}

/*
 * __p4d_alloc(): with the P4D level folded this is a stub that always
 * returns 0; otherwise it is a real function implemented elsewhere.
 */
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                                unsigned long address)
{
        return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/* PUD level folded or no MMU: __pud_alloc() returns 0, no accounting. */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                              unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/*
 * Add the size of one PUD table to mm->pgtables_bytes, unless
 * mm_pud_folded() reports the level as folded for this mm.
 */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/* Counterpart of mm_inc_nr_puds(): subtract the size of one PUD table. */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/* PMD level folded or no MMU: __pmd_alloc() returns 0, no accounting. */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                              unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Add the size of one PMD table to mm->pgtables_bytes, unless
 * mm_pmd_folded() reports the level as folded for this mm.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/* Counterpart of mm_inc_nr_pmds(): subtract the size of one PMD table. */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

/*
 * Page-table memory accounting in mm->pgtables_bytes.  Without an MMU
 * nothing is tracked: the helpers are no-ops and mm_pgtables_bytes()
 * always reports 0.
 */
#ifndef CONFIG_MMU

static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
}

static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
}

#else /* CONFIG_MMU */

/* Reset the accounted page-table bytes of @mm to zero. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

/* Current value of the page-table byte counter of @mm. */
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return atomic_long_read(&mm->pgtables_bytes);
}

/* Account one more PTE table (PTRS_PER_PTE entries) to @mm. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}

/* Account one PTE table fewer to @mm. */
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#endif /* CONFIG_MMU */

/*
 * Slow paths behind the pte_alloc*() macros, which call them only when the
 * pmd is none and treat a nonzero return as failure.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * p4d_alloc(), pud_alloc() and pmd_alloc() share one pattern: if the
 * upper-level entry is none, call the matching __*_alloc() slow path, and
 * return NULL when that reports failure (nonzero).  Otherwise return the
 * next-level entry for @address.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

#if USE_SPLIT_PTE_PTLOCKS
/*
 * Two layouts for the split page-table lock:
 *  - ALLOC_SPLIT_PTLOCKS: page->ptl is itself a spinlock_t pointer, and
 *    ptlock_alloc()/ptlock_free() are real functions implemented elsewhere;
 *  - otherwise: the spinlock is embedded in struct page (ptlock_ptr()
 *    returns &page->ptl), so allocation always succeeds and the cache-init
 *    and free hooks are empty.
 */
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
extern bool ptlock_alloc(struct page *page);
extern void ptlock_free(struct page *page);

static inline spinlock_t *ptlock_ptr(struct page *page)
{
        return page->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_alloc(struct page *page)
{
        return true;
}

static inline void ptlock_free(struct page *page)
{
}

static inline spinlock_t *ptlock_ptr(struct page *page)
{
        return &page->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/*
 * Split-lock variant: the PTE lock is the one attached to the page that
 * *@pmd points to.  @mm is unused here.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct page *table_page = pmd_page(*pmd);

        return ptlock_ptr(table_page);
}

/*
 * Set up the split page-table lock of @page.  Returns false when
 * ptlock_alloc() fails, true once the spinlock has been initialised.
 */
static inline bool ptlock_init(struct page *page)
{
        /*
         * prep_new_page() initializes page->private (and therefore
         * page->ptl) to 0.  Make sure nobody has started using it in the
         * meantime.
         *
         * That can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares
         * storage with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);

        if (ptlock_alloc(page)) {
                spin_lock_init(ptlock_ptr(page));
                return true;
        }

        return false;
}

#else        /* !USE_SPLIT_PTE_PTLOCKS */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 * @pmd is therefore unused, and the ptlock setup/teardown hooks below are
 * stubs (ptlock_init() always reports success).
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */

/*
 * One-time page-table setup: runs ptlock_cache_init() and then
 * pgtable_cache_init(), in that order.
 */
static inline void pgtable_init(void)
{
        ptlock_cache_init();
        pgtable_cache_init();
}

/*
 * Constructor for a page about to be used as a PTE table: initialise its
 * page-table lock, flag it as a page-table page and bump NR_PAGETABLE.
 * Returns false, with nothing flagged or counted, if the lock cannot be
 * set up.
 */
static inline bool pgtable_pte_page_ctor(struct page *page)
{
        if (ptlock_init(page)) {
                __SetPageTable(page);
                inc_zone_page_state(page, NR_PAGETABLE);
                return true;
        }

        return false;
}

/*
 * Destructor matching pgtable_pte_page_ctor(): release the page-table
 * lock, clear the page-table flag and decrement NR_PAGETABLE.
 */
static inline void pgtable_pte_page_dtor(struct page *page)
{
        ptlock_free(page);
        __ClearPageTable(page);
        dec_zone_page_state(page, NR_PAGETABLE);
}

/*
 * Map the pte for @address under @pmd and take its page-table lock.
 *
 * Evaluates to the mapped pte pointer; the lock that was taken is stored
 * through @ptlp so the caller can release it with pte_unmap_unlock().
 * The lock pointer is looked up and the pte mapped before spin_lock() is
 * called.
 */
#define pte_offset_map_lock(mm, pmd, address, ptlp)        \
({                                                        \
        spinlock_t *__ptl = pte_lockptr(mm, pmd);        \
        pte_t *__pte = pte_offset_map(pmd, address);        \
        *(ptlp) = __ptl;                                \
        spin_lock(__ptl);                                \
        __pte;                                                \
})

/*
 * Undo pte_offset_map_lock(): drop the lock @ptl first, then unmap @pte.
 */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * pte_alloc() is true only when @pmd is none AND __pte_alloc() returned
 * nonzero, i.e. it evaluates to true on failure.  A pmd that is already
 * populated never reaches __pte_alloc().
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* As pte_alloc(), then map the pte for @address; NULL on failure. */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/*
 * As pte_alloc_map(), but also takes the page-table lock and stores it
 * through @ptlp.  On failure (NULL) no lock is taken.
 */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/* Kernel page-table variant: no mm, uses __pte_alloc_kernel(). */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if USE_SPLIT_PMD_PTLOCKS

/*
 * Return the struct page backing the PMD table that contains @pmd.
 *
 * The entry pointer is rounded down to a multiple of the table size
 * (PTRS_PER_PMD * sizeof(pmd_t)) and the result is passed to
 * virt_to_page().  This assumes PMD tables are aligned to their own size
 * (confirm for the architecture at hand).
 *
 * Declared inline like every other helper defined in this header, so that
 * a translation unit which includes the header without calling this does
 * not end up with its own unused static function.
 */
static inline struct page *pmd_to_page(pmd_t *pmd)
{
        unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);

        return virt_to_page((void *)((unsigned long) pmd & mask));
}

/*
 * Split-PMD-lock variant: the lock is the one attached to the page that
 * pmd_to_page() returns for @pmd.  @mm is unused here.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct page *table_page = pmd_to_page(pmd);

        return ptlock_ptr(table_page);
}

/*
 * Set up the split lock of a PMD table page.  With transparent hugepages
 * the page's pmd_huge_pte pointer is cleared first.  Returns what
 * ptlock_init() returns.
 */
static inline bool pmd_ptlock_init(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        page->pmd_huge_pte = NULL;
#endif
        return ptlock_init(page);
}

/*
 * Tear down the split lock of a PMD table page.  With transparent
 * hugepages, pmd_huge_pte must already be NULL at this point (checked by
 * VM_BUG_ON_PAGE).
 */
static inline void pmd_ptlock_free(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
#endif
        ptlock_free(page);
}

/* Split-lock variant: pmd_huge_pte lives in the PMD table's page; @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)

#else

/*
 * Non-split variant: every PMD of the mm is guarded by
 * mm->page_table_lock, so @pmd is unused.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

/* No per-page lock to set up or tear down in the non-split case. */
static inline bool pmd_ptlock_init(struct page *page) { return true; }
static inline void pmd_ptlock_free(struct page *page) {}

/* Non-split variant: pmd_huge_pte lives in the mm itself; @pmd is unused. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Take the lock guarding @pmd and return it, so that the caller can pass
 * it to spin_unlock() later.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock = pmd_lockptr(mm, pmd);

        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a page about to be used as a PMD table: initialise its
 * lock, flag it as a page-table page, reset pt_share_count (only with
 * CONFIG_ARCH_WANT_HUGE_PMD_SHARE) and bump NR_PAGETABLE.  Returns false,
 * with nothing flagged or counted, if the lock cannot be set up.
 */
static inline bool pgtable_pmd_page_ctor(struct page *page)
{
        if (pmd_ptlock_init(page)) {
                __SetPageTable(page);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
                atomic_set(&page->pt_share_count, 0);
#endif
                inc_zone_page_state(page, NR_PAGETABLE);
                return true;
        }

        return false;
}

/*
 * Destructor matching pgtable_pmd_page_ctor(): release the PMD lock, clear
 * the page-table flag and decrement NR_PAGETABLE.
 */
static inline void pgtable_pmd_page_dtor(struct page *page)
{
        pmd_ptlock_free(page);
        __ClearPageTable(page);
        dec_zone_page_state(page, NR_PAGETABLE);
}

/*
 * There is no scalability reason to split PUD locks yet, but these helpers
 * follow the same pattern as the PMD locks to make a later switch easier.
 * The VM should not be considered ready for split PUD locks yet; there may
 * be places which still need to be converted from page_table_lock.
 *
 * For now the PUD lock is always mm->page_table_lock and @pud is unused.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        return &mm->page_table_lock;
}

/*
 * Take the lock guarding @pud and return it, so that the caller can pass
 * it to spin_unlock() later.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = pud_lockptr(mm, pud);

        spin_lock(lock);

        return lock;
}

/* Initialisation and init-memory release hooks (implemented elsewhere). */
extern void __init pagecache_init(void);
extern void __init free_area_init_memoryless_node(int nid);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Returns the number of pages freed into the buddy system.
 *
 * @s is a label string (free_initmem_default() passes "unused kernel");
 * how it is used is not visible from this header.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

#ifdef        CONFIG_HIGHMEM
/*
 * Free a highmem page into the buddy system, adjusting totalhigh_pages
 * and totalram_pages.
 */
extern void free_highmem_page(struct page *page);
#endif

/*
 * Adjust managed-page accounting for @page by @count; callers in this
 * header pass +1 when freeing a reserved page and -1 when reserving one.
 */
extern void adjust_managed_page_count(struct page *page, long count);
extern void mem_init_print_info(const char *str);

extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);

/*
 * Free the reserved page into the buddy system, so it gets managed.
 * Clears PG_reserved, re-initialises the page count and frees the page;
 * does NOT touch the managed-page accounting (see free_reserved_page()).
 */
static inline void __free_reserved_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
}

/* As __free_reserved_page(), plus +1 on the managed-page count. */
static inline void free_reserved_page(struct page *page)
{
        __free_reserved_page(page);
        adjust_managed_page_count(page, 1);
}

/* Mark @page reserved and take it out of the managed-page count (-1). */
static inline void mark_page_reserved(struct page *page)
{
        SetPageReserved(page);
        adjust_managed_page_count(page, -1);
}

/*
 * Default method to free all the __init memory, i.e. the range between the
 * __init_begin and __init_end symbols, into the buddy system.
 * The freed pages will be poisoned with pattern "poison" if it's within
 * range [0, UCHAR_MAX].
 * Returns the number of pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];
        void *init_start = __init_begin;
        void *init_end = __init_end;

        return free_reserved_area(init_start, init_end, poison,
                                  "unused kernel");
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register range of page frames backed by
 * physical memory with memblock_add[_node]() before calling
 * free_area_init() passing in the PFN each zone ends at. At a basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *         memblock_add_node(base, size, nid)
 * free_area_init(max_zone_pfns);
 */
/* @max_zone_pfn: per-zone end PFNs, as in the max_zone_pfns[] example above. */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
/*
 * PFN-range queries (implemented elsewhere).  get_pfn_range_for_nid()
 * reports its result through @start_pfn and @end_pfn.
 */
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
                                                unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);

/*
 * early_pfn_to_nid(): map a pfn to its node id.  With multiple nodes this
 * is a real function; otherwise every pfn is on node 0.
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
/* there is a per-arch backend function. */
extern int __meminit __early_pfn_to_nid(unsigned long pfn,
                                        struct mminit_pfnnid_cache *state);
#else /* !CONFIG_NEED_MULTIPLE_NODES */
static inline int early_pfn_to_nid(unsigned long pfn)
{
        return 0;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */

/* Memory initialisation and reporting entry points (implemented elsewhere). */
extern void set_dma_reserve(unsigned long new_dma_reserve);
/*
 * NOTE(review): most parameters of memmap_init_zone() are unnamed in this
 * prototype; consult the definition for their meaning and order.
 */
extern void memmap_init_zone(unsigned long, int, unsigned long,
                unsigned long, unsigned long, enum meminit_context,
                struct vmem_altmap *, int migratetype);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
extern void show_mem(unsigned int flags, nodemask_t *nodemask);
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
/* Only declared when the architecture defines __HAVE_ARCH_RESERVED_KERNEL_PAGES. */
#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
extern unsigned long arch_reserved_kernel_pages(void);
#endif

/*
 * printf-style allocation warning: @fmt is argument 3 and the variadic
 * arguments start at argument 4, as declared by __printf(3, 4).
 */
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* page_alloc.c: globals and an arch hook, all defined elsewhere */
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
extern bool arch_has_descending_max_zone_pfns(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c: interval tree of VMAs kept in an rb_root_cached */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Visit every VMA that iter_first()/iter_next() yield for @start..@last
 * (the name suggests @last is inclusive; confirm).  @vma is the loop
 * cursor and is NULL once the loop runs to completion.  @start and @last
 * are re-evaluated on every iteration, so pass side-effect-free
 * expressions.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

/* Same interval-tree API as for VMAs, keyed on anon_vma_chain nodes. */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
/* The verify hook only exists in CONFIG_DEBUG_VM_RB builds. */
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Visit every anon_vma_chain that iter_first()/iter_next() yield for
 * @start..@last.  @avc is the loop cursor and is NULL once the loop runs
 * to completion.  @start and @last are re-evaluated on every iteration.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
        struct vm_area_struct *expand);
/* Shorthand for __vma_adjust() with @expand passed as NULL. */
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
        return __vma_adjust(vma, start, end, pgoff, insert, NULL);
}
/*
 * VMA merge/split/link primitives implemented in mmap.c.
 * NOTE(review): several parameters below are unnamed in the prototypes;
 * consult the definitions for their meaning.
 */
extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
        struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
        struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);

/*
 * Check a data-size limit.
 *
 * The size checked is (@new - @start) + (@end_data - @start_data),
 * computed in unsigned long arithmetic.  Returns -ENOSPC when that size
 * exceeds @rlim, and 0 otherwise.  A limit of RLIM_INFINITY (or above)
 * disables the check.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long brk_size, data_size;

        if (!(rlim < RLIM_INFINITY))
                return 0;

        brk_size = new - start;
        data_size = end_data - start_data;

        return (brk_size + data_size > rlim) ? -ENOSPC : 0;
}

/* Paired take/drop of all locks of @mm (semantics defined in mmap.c). */
extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

/* Accessors for the executable file associated with an mm or task. */
extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

/* Special mappings described by a struct vm_special_mapping. */
extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
                                   const struct vm_special_mapping *spec);
/* This is an obsolete alternative to _install_special_mapping. */
extern int install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags, struct page **pages);

/* Address randomisation helpers (implemented elsewhere). */
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

/* NOTE(review): all parameters are unnamed here; see the definition. */
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

/*
 * Core mmap/munmap/madvise primitives.  do_munmap() has the same
 * parameters as __do_munmap() minus @downgrade.  @uf is a list head passed
 * through to the implementation.
 */
extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        unsigned long pgoff, unsigned long *populate, struct list_head *uf);
extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
                       struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

/*
 * mm_populate(): populate [@addr, @addr + @len), ignoring any errors.
 * Without an MMU this is a no-op.
 */
#ifndef CONFIG_MMU
static inline void mm_populate(unsigned long addr, unsigned long len)
{
}
#else /* CONFIG_MMU */
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        const int ignore_errors = 1;

        /* The result is deliberately discarded: errors are ignored. */
        (void) __mm_populate(addr, len, ignore_errors);
}
#endif /* CONFIG_MMU */

/*
 * These take the mm semaphore themselves, so callers must not already
 * hold it.  The __must_check ones return a value that must not be ignored.
 * NOTE(review): parameters are unnamed here; see the definitions.
 */
extern int __must_check vm_brk(unsigned long, unsigned long);
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Argument block handed (by pointer) to vm_unmapped_area().
 *
 * NOTE(review): the field meanings below are inferred from their names only;
 * confirm against the vm_unmapped_area() implementation before relying on
 * them.
 */
struct vm_unmapped_area_info {
/* presumably a bit for the 'flags' field below -- TODO confirm */
#define VM_UNMAPPED_AREA_TOPDOWN 1
        unsigned long flags;
        unsigned long length;
        unsigned long low_limit;
        unsigned long high_limit;
        unsigned long align_mask;
        unsigned long align_offset;
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

/* mm/page-writeback.c */
int __must_check write_one_page(struct page *page);
void task_dirty_inc(struct task_struct *tsk);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);

/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
extern int expand_downwards(struct vm_area_struct *vma,
                unsigned long address);
#if VM_GROWSUP
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#else
  #define expand_upwards(vma, address) (0)
#endif

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Return the first VMA that overlaps the interval start_addr..end_addr-1,
 * or NULL when no VMA does.  The caller must pass start_addr < end_addr.
 */
static inline struct vm_area_struct *
find_vma_intersection(struct mm_struct *mm, unsigned long start_addr,
                      unsigned long end_addr)
{
        struct vm_area_struct *first = find_vma(mm, start_addr);

        /*
         * find_vma() only promises start_addr < vm_end; a VMA that begins
         * at or beyond end_addr does not intersect the interval.
         */
        if (!first || first->vm_start >= end_addr)
                return NULL;

        return first;
}

/*
 * Start address of @vma including its guard gap.  Only VM_GROWSDOWN VMAs
 * get a gap: their start is lowered by stack_guard_gap, and the result is
 * clamped to 0 if the unsigned subtraction wraps around.
 */
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
        unsigned long gap_start;

        if (!(vma->vm_flags & VM_GROWSDOWN))
                return vma->vm_start;

        gap_start = vma->vm_start - stack_guard_gap;
        /* A result above the original start means the subtraction wrapped. */
        if (gap_start > vma->vm_start)
                return 0;

        return gap_start;
}

/*
 * End address of @vma including its guard gap.  Only VM_GROWSUP VMAs get a
 * gap: their end is raised by stack_guard_gap, and the result is clamped to
 * -PAGE_SIZE if the unsigned addition wraps around.
 */
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
        unsigned long gap_end;

        if (!(vma->vm_flags & VM_GROWSUP))
                return vma->vm_end;

        gap_end = vma->vm_end + stack_guard_gap;
        /* A result below the original end means the addition wrapped. */
        if (gap_end < vma->vm_end)
                gap_end = -PAGE_SIZE;

        return gap_end;
}

/* Size of @vma, i.e. vm_end - vm_start, expressed in pages. */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
        unsigned long span_bytes = vma->vm_end - vma->vm_start;

        return span_bytes >> PAGE_SHIFT;
}

/*
 * Look up the VMA whose bounds exactly match vm_start ... vm_end.
 * Returns NULL unless the VMA returned by find_vma(mm, vm_start) has
 * precisely those bounds.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *candidate = find_vma(mm, vm_start);

        if (!candidate)
                return NULL;

        if (candidate->vm_start == vm_start && candidate->vm_end == vm_end)
                return candidate;

        return NULL;
}

/*
 * True when @vma is non-NULL and [start, end] lies within
 * [vma->vm_start, vma->vm_end].
 */
static inline bool range_in_vma(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        if (!vma)
                return false;

        return start >= vma->vm_start && end <= vma->vm_end;
}

/*
 * With CONFIG_MMU both helpers are implemented out of line.  Without it,
 * vm_get_page_prot() always yields __pgprot(0) and vma_set_page_prot()
 * simply stores that value in vma->vm_page_prot.
 */
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
/* !MMU: the flags are ignored; the protection value is always __pgprot(0). */
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        return __pgprot(0);
}
/* !MMU: set vma->vm_page_prot from vm_get_page_prot(vma->vm_flags). */
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * vm_insert_page() wrapper that reports its outcome as a vm_fault_t:
 *   -ENOMEM                        -> VM_FAULT_OOM
 *   any other error except -EBUSY  -> VM_FAULT_SIGBUS
 *   success or -EBUSY              -> VM_FAULT_NOPAGE
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        int rc = vm_insert_page(vma, addr, page);

        /* -EBUSY is deliberately reported the same way as success. */
        if (rc >= 0 || rc == -EBUSY)
                return VM_FAULT_NOPAGE;

        return rc == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

/*
 * Default io_remap_pfn_range(), used only when no macro of that name has
 * been defined earlier.  It forwards to remap_pfn_range() after passing
 * @prot through pgprot_decrypted(); the return value is remap_pfn_range()'s.
 */
#ifndef io_remap_pfn_range
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
}
#endif

/*
 * Translate an errno-style value into a fault code: -ENOMEM becomes
 * VM_FAULT_OOM, every other value becomes VM_FAULT_SIGBUS.
 */
static inline vm_fault_t vmf_error(int err)
{
        return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags);

#define FOLL_WRITE        0x01        /* check pte is writable */
#define FOLL_TOUCH        0x02        /* mark page accessed */
#define FOLL_GET        0x04        /* do get_page on page */
#define FOLL_DUMP        0x08        /* give error on hole if it would be zero */
#define FOLL_FORCE        0x10        /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT        0x20        /* if a disk transfer is needed, start the IO
                                 * and return without waiting upon it */
#define FOLL_POPULATE        0x40        /* fault in page */
#define FOLL_SPLIT        0x80        /* don't return transhuge pages, split them */
#define FOLL_HWPOISON        0x100        /* check page is hwpoisoned */
#define FOLL_NUMA        0x200        /* force NUMA hinting page fault */
#define FOLL_MIGRATION        0x400        /* wait for page to replace migration entry */
#define FOLL_TRIED        0x800        /* a retry, previous pass started an IO */
#define FOLL_MLOCK        0x1000        /* lock present pages */
#define FOLL_REMOTE        0x2000        /* we are working on non-current tsk/mm */
#define FOLL_COW        0x4000        /* internal GUP flag */
#define FOLL_ANON        0x8000        /* don't do file mappings */
#define FOLL_LONGTERM        0x10000        /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD        0x20000        /* split huge pmd before returning */
#define FOLL_PIN        0x40000        /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY        0x80000        /* gup_fast: prevent fall-back to slow gup */

/*
 * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
 * other. Here is what they mean, and how to use them:
 *
 * FOLL_LONGTERM indicates that the page will be held for an indefinite time
 * period _often_ under userspace control.  This is in contrast to
 * iov_iter_get_pages(), whose usages are transient.
 *
 * FIXME: For pages which are part of a filesystem, mappings are subject to the
 * lifetime enforced by the filesystem and we need guarantees that longterm
 * users like RDMA and V4L2 only establish mappings which coordinate usage with
 * the filesystem.  Ideas for this coordination include revoking the longterm
 * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
 * added after the problem with filesystems was found FS DAX VMAs are
 * specifically failed.  Filesystem pages are still subject to bugs and use of
 * FOLL_LONGTERM should be avoided on those pages.
 *
 * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
 * Currently only get_user_pages() and get_user_pages_fast() support this flag
 * and calls to get_user_pages_[un]locked are specifically not allowed.  This
 * is due to an incompatibility with the FS DAX check and
 * FAULT_FLAG_ALLOW_RETRY.
 *
 * In the CMA case: long term pins in a CMA region would unnecessarily fragment
 * that region.  And so, CMA attempts to migrate the page before pinning, when
 * FOLL_LONGTERM is specified.
 *
 * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
 * but an additional pin counting system) will be invoked. This is intended for
 * anything that gets a page reference and then touches page data (for example,
 * Direct IO). This lets the filesystem know that some non-file-system entity is
 * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
 * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
 * a call to unpin_user_page().
 *
 * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
 * and separate refcounting mechanisms, however, and that means that each has
 * its own acquire and release mechanisms:
 *
 *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
 *
 *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
 *
 * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
 * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
 * calls applied to them, and that's perfectly OK. This is a constraint on the
 * callers, not on the pages.)
 *
 * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
 * directly by the caller. That's in order to help avoid mismatches when
 * releasing pages: get_user_pages*() pages must be released via put_page(),
 * while pin_user_pages*() pages must be released via unpin_user_page().
 *
 * Please see Documentation/core-api/pin_user_pages.rst for more information.
 */

/*
 * Convert a vm_fault_t result into a negative errno, or 0 when none of the
 * checked error bits is set.  Checks are made in priority order:
 *   VM_FAULT_OOM                    -> -ENOMEM
 *   VM_FAULT_HWPOISON[_LARGE]       -> -EHWPOISON if FOLL_HWPOISON is set in
 *                                      @foll_flags, otherwise -EFAULT
 *   VM_FAULT_SIGBUS / _SIGSEGV      -> -EFAULT
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;

        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        return -EHWPOISON;
                return -EFAULT;
        }

        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;

        return 0;
}

typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

/*
 * Page poisoning hooks.  With CONFIG_PAGE_POISONING they are implemented
 * out of line; otherwise page_poisoning_enabled() is constant false and
 * kernel_poison_pages() does nothing.
 */
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
extern void kernel_poison_pages(struct page *page, int numpages, int enable);
#else
static inline bool page_poisoning_enabled(void) { return false; }
static inline void kernel_poison_pages(struct page *page, int numpages,
                                        int enable) { }
#endif

#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
DECLARE_STATIC_KEY_TRUE(init_on_alloc);
#else
DECLARE_STATIC_KEY_FALSE(init_on_alloc);
#endif
/*
 * Should memory be initialised at allocation time?
 *
 * Always true when the init_on_alloc static key is enabled and page
 * poisoning is not; otherwise true only if the caller passed __GFP_ZERO
 * in @flags.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        if (!static_branch_unlikely(&init_on_alloc) ||
            page_poisoning_enabled())
                return flags & __GFP_ZERO;

        return true;
}

#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
DECLARE_STATIC_KEY_TRUE(init_on_free);
#else
DECLARE_STATIC_KEY_FALSE(init_on_free);
#endif
/*
 * Should memory be initialised when it is freed?  True only when the
 * init_on_free static key is enabled and page poisoning is not.
 */
static inline bool want_init_on_free(void)
{
        if (!static_branch_unlikely(&init_on_free))
                return false;

        return !page_poisoning_enabled();
}

#ifdef CONFIG_DEBUG_PAGEALLOC
extern void init_debug_pagealloc(void);
#else
static inline void init_debug_pagealloc(void) {}
#endif
extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

/*
 * Reports _debug_pagealloc_enabled_early, but only when the kernel was
 * built with CONFIG_DEBUG_PAGEALLOC; otherwise always false.
 */
static inline bool debug_pagealloc_enabled(void)
{
        if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
                return false;

        return _debug_pagealloc_enabled_early;
}

/*
 * Static-key variant of debug_pagealloc_enabled().
 *
 * For use in fast paths after init_debug_pagealloc() has run, or when a
 * false negative result is not harmful when called too early.  Always
 * false when CONFIG_DEBUG_PAGEALLOC is not built in.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
               static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * kernel_map_pages() / kernel_page_present()
 *
 * When either CONFIG_DEBUG_PAGEALLOC or CONFIG_ARCH_HAS_SET_DIRECT_MAP is
 * set, kernel_map_pages() forwards to the out-of-line __kernel_map_pages()
 * and (under CONFIG_HIBERNATION) kernel_page_present() is provided
 * externally.  Otherwise kernel_map_pages() is an empty stub and
 * kernel_page_present() always returns true.
 */
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
extern void __kernel_map_pages(struct page *page, int numpages, int enable);

/*
 * When called in DEBUG_PAGEALLOC context, the call should most likely be
 * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
 */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
        __kernel_map_pages(page, numpages, enable);
}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
#endif        /* CONFIG_HIBERNATION */
#else        /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
/* Stub: nothing to do in this configuration. */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION
/* Stub: every page is reported as present in this configuration. */
static inline bool kernel_page_present(struct page *page) { return true; }
#endif        /* CONFIG_HIBERNATION */
#endif        /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */

/*
 * Gate-area helpers.  When __HAVE_ARCH_GATE_AREA is defined they are
 * implemented out of line; otherwise the stubs below report that there is
 * no gate VMA (NULL) and that no address lies in a gate area (0).
 */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#endif        /* __HAVE_ARCH_GATE_AREA */

extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
#endif

void drop_slab(void);
void drop_slab_node(int nid);

#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
/* print_vma_addr() is implemented out of line with CONFIG_MMU, else a no-op. */
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);

/*
 * Memory-failure flag bits.  Each enumerator occupies its own bit, so the
 * values can be OR-ed together.
 * NOTE(review): presumably these are the values accepted by the 'flags'
 * argument of memory_failure()/memory_failure_queue() -- confirm against
 * the callers.
 */
enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
        MF_ACTION_REQUIRED = 1 << 1,
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
};
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);


/*
 * Result codes for the error handlers for various types of pages.
 */
enum mf_result {
        MF_IGNORED,        /* Error: cannot be handled */
        MF_FAILED,        /* Error: handling failed */
        MF_DELAYED,        /* Will be handled later */
        MF_RECOVERED,        /* Successfully recovered */
};

/*
 * MF_MSG_* identifiers.
 * NOTE(review): presumably these name the kind of page (or the situation)
 * a memory-failure action is reported for -- the consumers are not visible
 * here; confirm against mm/memory-failure.c.
 */
enum mf_action_page_type {
        MF_MSG_KERNEL,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_SLAB,
        MF_MSG_DIFFERENT_COMPOUND,
        MF_MSG_POISONED_HUGE,
        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,
        MF_MSG_NON_PMD_HUGE,
        MF_MSG_UNMAP_FAILED,
        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,
        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_BUDDY_2ND,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_UNKNOWN,
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
extern void copy_user_huge_page(struct page *dst, struct page *src,
                                unsigned long addr_hint,
                                struct vm_area_struct *vma,
                                unsigned int pages_per_huge_page);
extern long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
                                unsigned int pages_per_huge_page,
                                bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Transhuge page-table entries are "special", in the sense used by
 * vm_normal_page(), for DAX VMAs and for file-backed VMAs that carry
 * VM_PFNMAP or VM_MIXEDMAP.
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        if (vma_is_dax(vma))
                return true;

        if (!vma->vm_file)
                return false;

        return (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) != 0;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#ifdef CONFIG_DEBUG_PAGEALLOC
extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for the _debug_guardpage_minorder variable. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/* Tests the _debug_guardpage_enabled static key (expected to be off). */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/*
 * True when guard pages are enabled and @page has the Guard page type set.
 * PageGuard() is consulted only if debug_guardpage_enabled() is true.
 */
static inline bool page_is_guard(struct page *page)
{
        return debug_guardpage_enabled() && PageGuard(page);
}
#else
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
#endif /* CONFIG_DEBUG_PAGEALLOC */

#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void) {}
#endif

extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 when memcmp_pages() reports no difference (0), otherwise 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        return memcmp_pages(page1, page2) == 0;
}

#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

extern int sysctl_nr_trim_pages;

/* True when @seals contains F_SEAL_WRITE or F_SEAL_FUTURE_WRITE. */
static inline bool is_write_sealed(int seals)
{
        const int write_seals = F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

        return (seals & write_seals) != 0;
}

/**
 * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
 *                      in which case writes should be disallowed moving
 *                      forwards.
 * @seals: the seals to check
 * @vm_flags: the VMA flags to check
 *
 * Returns whether readonly sealed, in which case writes should be disallowed
 * going forward.
 */
static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
{
        if (!is_write_sealed(seals))
                return false;

        /*
         * An F_SEAL_[FUTURE_]WRITE sealed memfd can still be mapped
         * MAP_SHARED and read-only, and mprotect must not be allowed to
         * revert the protections on such a mapping.  Only shared mappings
         * matter here: private mappings keep VM_MAYWRITE because they are
         * meant to remain COW-writable.  So match VM_SHARED set with
         * VM_WRITE clear.
         */
        return (vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED;
}

/**
 * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
 *                    handle them.
 * @seals: the seals to check
 * @vma: the vma to operate on
 *
 * If neither write seal is set in @seals the check passes.  Otherwise the
 * vma flags are inspected: a mapping that is both VM_SHARED and VM_WRITE is
 * refused.  Return 0 if the check passes, or -EPERM if it does not.
 */
static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
        const unsigned long shared_writable = VM_SHARED | VM_WRITE;

        if (!is_write_sealed(seals))
                return 0;

        /*
         * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
         * write seals are active.
         */
        if ((vma->vm_flags & shared_writable) == shared_writable)
                return -EPERM;

        return 0;
}

#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */




























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KSTRTOX_H
#define _LINUX_KSTRTOX_H

#include <linux/compiler.h>
#include <linux/types.h>

/* Internal, do not use. */
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
int __must_check _kstrtol(const char *s, unsigned int base, long *res);

int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
int __must_check kstrtoll(const char *s, unsigned int base, long long *res);

/**
 * kstrtoul - convert a string to an unsigned long
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign, but not a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtoul(). Return code must be checked.
 */
static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
        /*
         * When unsigned long and unsigned long long share size and alignment
         * we can call kstrtoull() directly.  The comparison is spelled out
         * because __builtin_types_compatible_p(unsigned long,
         * unsigned long long) is 0 even then.
         */
        if (sizeof(unsigned long) != sizeof(unsigned long long) ||
            __alignof__(unsigned long) != __alignof__(unsigned long long))
                return _kstrtoul(s, base, res);

        return kstrtoull(s, base, (unsigned long long *)res);
}

/**
 * kstrtol - convert a string to a long
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign or a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtol(). Return code must be checked.
 */
static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
{
        /*
         * When long and long long share size and alignment we can call
         * kstrtoll() directly.  The comparison is spelled out because
         * __builtin_types_compatible_p(long, long long) is 0 even then.
         */
        if (sizeof(long) != sizeof(long long) ||
            __alignof__(long) != __alignof__(long long))
                return _kstrtol(s, base, res);

        return kstrtoll(s, base, (long long *)res);
}

int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
int __must_check kstrtoint(const char *s, unsigned int base, int *res);

/* kstrtou64 - u64 flavour of kstrtoull(); arguments and result are passed straight through. */
static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
{
        return kstrtoull(s, base, res);
}

/* kstrtos64 - s64 flavour of kstrtoll(); arguments and result are passed straight through. */
static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
{
        return kstrtoll(s, base, res);
}

/* kstrtou32 - u32 flavour of kstrtouint(); arguments and result are passed straight through. */
static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
{
        return kstrtouint(s, base, res);
}

/* kstrtos32 - s32 flavour of kstrtoint(); arguments and result are passed straight through. */
static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
{
        return kstrtoint(s, base, res);
}

int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
int __must_check kstrtobool(const char *s, bool *res);

int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);

/* kstrtou64_from_user - u64 flavour of kstrtoull_from_user(); a plain pass-through. */
static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
{
        return kstrtoull_from_user(s, count, base, res);
}

/* kstrtos64_from_user - s64 flavour of kstrtoll_from_user(); a plain pass-through. */
static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
{
        return kstrtoll_from_user(s, count, base, res);
}

/* kstrtou32_from_user - u32 flavour of kstrtouint_from_user(); a plain pass-through. */
static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
{
        return kstrtouint_from_user(s, count, base, res);
}

/* kstrtos32_from_user - s32 flavour of kstrtoint_from_user(); a plain pass-through. */
static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
{
        return kstrtoint_from_user(s, count, base, res);
}

/*
 * Use kstrto<foo> instead.
 *
 * NOTE: simple_strto<foo> does not check for the range overflow and,
 *         depending on the input, may give interesting results.
 *
 * Use these functions if and only if you cannot use kstrto<foo>, because
 * the conversion ends on the first non-digit character, which may be far
 * beyond the supported range. It might be useful to parse the strings like
 * 10x50 or 12:21 without altering original string or temporary buffer in use.
 * Keep in mind above caveat.
 */

extern unsigned long simple_strtoul(const char *,char **,unsigned int);
extern long simple_strtol(const char *,char **,unsigned int);
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
extern long long simple_strtoll(const char *,char **,unsigned int);

/*
 * strtobool - alternative name for kstrtobool(); forwards both arguments and
 * returns its result unchanged.  Unlike kstrtobool(), this wrapper is not
 * marked __must_check.
 */
static inline int strtobool(const char *s, bool *res)
{
        return kstrtobool(s, res);
}

#endif        /* _LINUX_KSTRTOX_H */






































































































































































































    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/mballoc.h
 *
 *  Written by: Alex Tomas <alex@clusterfs.com>
 *
 */
#ifndef _EXT4_MBALLOC_H
#define _EXT4_MBALLOC_H

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"

/*
 * mb_debug() - dynamic printk helper for debugging the mballoc code.
 * @sb:  super block the message refers to
 * @fmt: printk-style format string, followed by its arguments
 *
 * With CONFIG_EXT4_DEBUG the message is emitted through pr_debug() and is
 * prefixed with the current task's comm and pid, the super block's s_id,
 * and the file, line and function of the call site.  Without it the call
 * expands to no_printk() on the format and its arguments only; @sb is not
 * evaluated in that configuration.
 *
 * @sb is parenthesized in the expansion so that any pointer expression can
 * be passed safely as the first argument.
 */
#ifdef CONFIG_EXT4_DEBUG
#define mb_debug(sb, fmt, ...)                                                \
        pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt,                \
                current->comm, task_pid_nr(current), (sb)->s_id,        \
               __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
#define mb_debug(sb, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Operation codes for the allocation history.
 * NOTE(review): presumably the values stored in
 * ext4_allocation_context.ac_op -- confirm against mballoc.c.
 */
#define EXT4_MB_HISTORY_ALLOC                1        /* allocation */
#define EXT4_MB_HISTORY_PREALLOC        2        /* preallocated blocks used */

/*
 * Default upper limit on how long mballoc may keep looking for a best
 * extent, counted in extents found.
 */
#define MB_DEFAULT_MAX_TO_SCAN                200

/*
 * Default lower limit on how long mballoc must keep looking for a best
 * extent.
 */
#define MB_DEFAULT_MIN_TO_SCAN                10

/*
 * With 'ext4_mb_stats' the allocator collects statistics that are shown
 * at umount.  Collecting them has a cost, so the default is 0.
 */
#define MB_DEFAULT_STATS                0

/*
 * Files smaller than MB_DEFAULT_STREAM_THRESHOLD are served by the stream
 * allocator, whose purpose is to pack requests as close to each other as
 * possible to produce smooth I/O traffic.  Stream requests use the
 * locality group prealloc space.  The threshold can be tuned via
 * /proc/fs/ext4/<partition>/stream_req.
 */
#define MB_DEFAULT_STREAM_THRESHOLD        16        /* 64K */

/*
 * Default selecting which requests use the 2^N search using buddies.
 */
#define MB_DEFAULT_ORDER2_REQS                2

/*
 * Default group prealloc size: 512 blocks.
 */
#define MB_DEFAULT_GROUP_PREALLOC        512

/*
 * Default maximum length of the inode prealloc list.
 */
#define MB_DEFAULT_MAX_INODE_PREALLOC        512

/*
 * Record of one freed block extent: the group it belongs to, its starting
 * cluster and count, and the transaction that freed it.  The record is
 * linked both from sb_info (efd_list) and from group_info (efd_node).
 */
struct ext4_free_data {
        /* this links the free block information from sb_info */
        struct list_head                efd_list;

        /* this links the free block information from group_info */
        struct rb_node                        efd_node;

        /* group to which the freed extent belongs */
        ext4_group_t                        efd_group;

        /* the freed extent: first cluster and length */
        ext4_grpblk_t                        efd_start_cluster;
        ext4_grpblk_t                        efd_count;

        /* transaction which freed this extent */
        tid_t                                efd_tid;
};

/*
 * One preallocated chunk of blocks: where it starts physically and
 * logically, how long it is and how much of it is still free.  pa_type
 * records whether it is an inode or a group preallocation.
 */
struct ext4_prealloc_space {
        /*
         * NOTE(review): presumably link this preallocation into a per-inode
         * list and a per-group list -- confirm against mballoc.c.
         */
        struct list_head        pa_inode_list;
        struct list_head        pa_group_list;
        /* pa_tmp_list and pa_rcu share storage; only one is valid at a time */
        union {
                struct list_head pa_tmp_list;
                struct rcu_head        pa_rcu;
        } u;
        spinlock_t                pa_lock;
        /* NOTE(review): presumably a reference count -- confirm */
        atomic_t                pa_count;
        /* NOTE(review): presumably non-zero once marked deleted -- confirm */
        unsigned                pa_deleted;
        ext4_fsblk_t                pa_pstart;        /* phys. block */
        ext4_lblk_t                pa_lstart;        /* log. block */
        ext4_grpblk_t                pa_len;                /* len of preallocated chunk */
        ext4_grpblk_t                pa_free;        /* how many blocks are free */
        unsigned short                pa_type;        /* pa type. inode or group */
        /* pointer to a lock owned by another object; owner not visible here */
        spinlock_t                *pa_obj_lock;
        struct inode                *pa_inode;        /* hack, for history only */
};

/*
 * Preallocation kinds.
 * NOTE(review): presumably the values of ext4_prealloc_space.pa_type, whose
 * comment reads "inode or group" -- confirm.
 */
enum {
        MB_INODE_PA = 0,
        MB_GROUP_PA = 1
};

/*
 * Describes an extent by its logical block, its block group, and its start
 * and length within that group.  fe_start and fe_len are in cluster units.
 */
struct ext4_free_extent {
        ext4_lblk_t fe_logical;
        ext4_grpblk_t fe_start;        /* In cluster units */
        ext4_group_t fe_group;
        ext4_grpblk_t fe_len;        /* In cluster units */
};

/*
 * Locality group:
 *   We try to group all related changes together so that writeback can
 *   flush/allocate them together as well.
 *
 *   The size of the lg_prealloc_list hash is determined by
 *   MB_DEFAULT_GROUP_PREALLOC (512).  Prealloc space is stored in the hash
 *   according to the order of its pa_free block count, i.e.
 *   fls(pa_free) - 1.  For 512 that is order 9, so PREALLOC_TB_SIZE (10)
 *   buckets cover orders 0..9.
 */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
        /* for allocator */
        /* to serialize allocates */
        struct mutex                lg_mutex;
        /* list of preallocations */
        struct list_head        lg_prealloc_list[PREALLOC_TB_SIZE];
        /* NOTE(review): presumably protects lg_prealloc_list -- confirm */
        spinlock_t                lg_prealloc_lock;
};

/*
 * State of a single allocation request.  It carries the request in four
 * forms (original, goal, best found, and a copy of the best found extent
 * taken before preallocation efforts), plus counters, flags and pointers
 * used while the request is processed.
 */
struct ext4_allocation_context {
        struct inode *ac_inode;
        struct super_block *ac_sb;

        /* original request */
        struct ext4_free_extent ac_o_ex;

        /* goal request (normalized ac_o_ex) */
        struct ext4_free_extent ac_g_ex;

        /* the best found extent */
        struct ext4_free_extent ac_b_ex;

        /* copy of the best found extent taken before preallocation efforts */
        struct ext4_free_extent ac_f_ex;

        __u16 ac_groups_scanned;
        __u16 ac_found;
        __u16 ac_tail;
        __u16 ac_buddy;
        __u16 ac_flags;                /* allocation hints */
        /* NOTE(review): presumably one of the AC_STATUS_* values -- confirm */
        __u8 ac_status;
        __u8 ac_criteria;
        __u8 ac_2order;                /* if request is to allocate 2^N blocks and
                                 * N > 0, the field stores N, otherwise 0 */
        __u8 ac_op;                /* operation, for history only */
        struct page *ac_bitmap_page;
        struct page *ac_buddy_page;
        struct ext4_prealloc_space *ac_pa;
        struct ext4_locality_group *ac_lg;
};

/*
 * Allocation status codes.
 * NOTE(review): presumably the values of ext4_allocation_context.ac_status
 * -- confirm against mballoc.c.
 */
#define AC_STATUS_CONTINUE        1
#define AC_STATUS_FOUND                2
#define AC_STATUS_BREAK                3

/*
 * Bundles the buddy and bitmap pages and data pointers with the group info,
 * super block, block-bits value and group number they go with.
 * NOTE(review): field roles below are inferred from the names only --
 * confirm against mballoc.c.
 */
struct ext4_buddy {
        struct page *bd_buddy_page;
        void *bd_buddy;
        struct page *bd_bitmap_page;
        void *bd_bitmap;
        struct ext4_group_info *bd_info;
        struct super_block *bd_sb;
        __u16 bd_blkbits;
        ext4_group_t bd_group;
};

/*
 * ext4_grp_offs_to_block() - filesystem block number of an extent's start.
 * @sb:  super block the extent belongs to
 * @fex: extent whose fe_group and fe_start (in cluster units) are used
 *
 * Returns the first block number of the extent's group plus fe_start
 * shifted left by the super block's s_cluster_bits.
 */
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
{
        ext4_fsblk_t blk = ext4_group_first_block_no(sb, fex->fe_group);

        blk += fex->fe_start << EXT4_SB(sb)->s_cluster_bits;

        return blk;
}

static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
                                        struct ext4_free_extent *fex)
{
        /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
        return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
}

static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
                                    struct ext4_prealloc_space *pa)
{
        /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
        return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
}

/*
 * Callback type used by ext4_mballoc_query_range().  It receives the super
 * block, a group number, a start and a length, and an opaque @priv pointer,
 * and returns an int.
 * NOTE(review): the meaning of the return value and the units of
 * @start/@len are not visible here -- confirm against the implementation.
 */
typedef int (*ext4_mballoc_query_range_fn)(
        struct super_block                *sb,
        ext4_group_t                        agno,
        ext4_grpblk_t                        start,
        ext4_grpblk_t                        len,
        void                                *priv);

/*
 * Query a range [@start, @end] of group @agno on @sb, taking two callbacks
 * of type ext4_mballoc_query_range_fn and an opaque @priv pointer.
 * NOTE(review): presumably @priv is passed through to the callbacks, and
 * @meta_formatter / @formatter are invoked for different kinds of ranges --
 * confirm against the definition, which is not in this header.
 */
int
ext4_mballoc_query_range(
        struct super_block                *sb,
        ext4_group_t                        agno,
        ext4_grpblk_t                        start,
        ext4_grpblk_t                        end,
        ext4_mballoc_query_range_fn        meta_formatter,
        ext4_mballoc_query_range_fn        formatter,
        void                                *priv);

#endif









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */

#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H

#include <linux/uaccess.h>
#include <linux/ptrace.h>

#include <asm/cpu_entry_area.h>
#include <asm/switch_to.h>

/*
 * Kinds of stack an address can be classified as.  The exception stacks
 * occupy a contiguous range of values: STACK_TYPE_EXCEPTION is the first
 * and STACK_TYPE_EXCEPTION_LAST is N_EXCEPTION_STACKS - 1 entries later.
 */
enum stack_type {
        STACK_TYPE_UNKNOWN,
        STACK_TYPE_TASK,
        STACK_TYPE_IRQ,
        STACK_TYPE_SOFTIRQ,
        STACK_TYPE_ENTRY,
        STACK_TYPE_EXCEPTION,
        STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};

/*
 * Description of one stack: its type and its bounds.  on_stack() treats
 * @begin as the first valid address and @end as one past the last.
 * NOTE(review): @next_sp is presumably the stack pointer to continue from
 * on the previous stack -- confirm against the unwinder code.
 */
struct stack_info {
        enum stack_type type;
        unsigned long *begin, *end, *next_sp;
};

/*
 * Stack classification helpers; the definitions are not in this header.
 * NOTE(review): the descriptions below are inferred from the names and
 * signatures only -- confirm against the implementations.
 */

/* Presumably: is @stack on @task's stack?  @info is an output parameter. */
bool in_task_stack(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info);

/* Presumably: is @stack on the entry stack?  @info is an output parameter. */
bool in_entry_stack(unsigned long *stack, struct stack_info *info);

/*
 * Presumably: determine which stack @stack lies on and describe it in
 * @info.  The role of @visit_mask is not visible here.
 */
int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask);
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
                            struct stack_info *info);

/* Returns a string for @type (presumably its human-readable name). */
const char *stack_type_name(enum stack_type type);

/*
 * on_stack() - test whether an object lies entirely within a known stack.
 * @info: stack description to test against
 * @addr: first byte of the object
 * @len:  size of the object in bytes
 *
 * Returns true only if @info's type is not STACK_TYPE_UNKNOWN, @addr lies
 * in [begin, end) and @addr + @len lies in (begin, end].
 */
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
{
        void *lo = info->begin;
        void *hi = info->end;

        if (info->type == STACK_TYPE_UNKNOWN)
                return false;

        /* The object must start inside the stack ... */
        if (addr < lo || addr >= hi)
                return false;

        /* ... and must also end inside it. */
        return addr + len > lo && addr + len <= hi;
}

/*
 * 8 on CONFIG_X86_32, 4 otherwise.
 * NOTE(review): presumably the number of stack words printed per line in
 * stack dumps -- confirm against the users of this macro.
 */
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
#else
#define STACKSLOTS_PER_LINE 4
#endif

#ifdef CONFIG_FRAME_POINTER
/*
 * get_frame_pointer() - pick the frame pointer to start from.
 * @task: task being examined
 * @regs: saved registers, or NULL
 *
 * Uses regs->bp when @regs is given.  Otherwise returns this function's
 * own frame address when @task is current, or the address of the bp slot
 * in the inactive_task_frame at task->thread.sp for any other task.
 */
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
        unsigned long *fp;

        if (regs) {
                fp = (unsigned long *)regs->bp;
        } else if (task == current) {
                fp = __builtin_frame_address(0);
        } else {
                struct inactive_task_frame *frame =
                        (struct inactive_task_frame *)task->thread.sp;

                fp = &frame->bp;
        }

        return fp;
}
#else
/* Without CONFIG_FRAME_POINTER there is no frame pointer; always NULL. */
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
        return NULL;
}
#endif /* CONFIG_FRAME_POINTER */

/*
 * get_stack_pointer() - pick the stack pointer to start from.
 * @task: task being examined
 * @regs: saved registers, or NULL
 *
 * Uses regs->sp when @regs is given.  Otherwise returns this function's
 * own frame address when @task is current, or task->thread.sp for any
 * other task.
 */
static inline unsigned long *
get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
{
        unsigned long *sp;

        if (regs)
                sp = (unsigned long *)regs->sp;
        else if (task == current)
                sp = __builtin_frame_address(0);
        else
                sp = (unsigned long *)task->thread.sp;

        return sp;
}

/*
 * The form of the top of the frame on the stack: a pointer to the next
 * frame followed by the return address.
 */
struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

/*
 * Same two fields as struct stack_frame, but each is a fixed 32-bit value.
 * NOTE(review): presumably describes frames of 32-bit (ia32) code --
 * confirm against the users of this struct.
 */
struct stack_frame_ia32 {
    u32 next_frame;
    u32 return_address;
};

/*
 * Output helpers taking a register set and a log level string; the
 * definitions are not in this header.
 * NOTE(review): presumably they print the opcode bytes around, and the
 * symbolic form of, regs->ip at @loglvl -- confirm.
 */
void show_opcodes(struct pt_regs *regs, const char *loglvl);
void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

/* Forward declaration of a static function defined elsewhere in this file. */
static int bprm_creds_from_file(struct linux_binprm *bprm);

/* NOTE(review): presumably backs the suid_dumpable sysctl -- confirm. */
int suid_dumpable = 0;

/*
 * List of registered binary formats.  It is modified under the write side
 * of binfmt_lock and walked under the read side.
 */
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

/*
 * __register_binfmt() - add a binary format to the list of known formats.
 * @fmt:    format to register; must not be NULL and must have load_binary
 * @insert: non-zero to add at the head of the list, zero to add at the tail
 *
 * A NULL @fmt is a BUG.  A format without a load_binary handler triggers a
 * warning and is not registered.
 */
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
        BUG_ON(!fmt);
        if (WARN_ON(!fmt->load_binary))
                return;

        write_lock(&binfmt_lock);
        if (insert)
                list_add(&fmt->lh, &formats);
        else
                list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

/*
 * unregister_binfmt() - remove a binary format from the list of formats.
 * @fmt: format to remove
 *
 * The list entry is deleted while holding binfmt_lock for writing.
 */
void unregister_binfmt(struct linux_binfmt * fmt)
{
        write_lock(&binfmt_lock);
        list_del(&fmt->lh);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/*
 * Drop the module reference held on @fmt's module; the counterpart of the
 * try_module_get(fmt->module) done before a format handler is called.
 */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
        module_put(fmt->module);
}

bool path_noexec(const struct path *path)
{
        return (path->mnt->mnt_flags & MNT_NOEXEC) ||
               (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}

#ifdef CONFIG_USELIB
/*
 * uselib() - load a shared library into the calling process.
 *
 * For security reasons a shared library must be both readable and
 * executable.  The address at which the library is loaded is taken from the
 * file itself.
 *
 * Each registered binary format that provides load_shlib is tried in turn
 * until one returns something other than -ENOEXEC.  Returns the result of
 * that handler, -EACCES if the file is not a regular file or sits on a
 * noexec mount, -ENOEXEC if no format accepted it, or the error from
 * looking up or opening @library.
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
        static const struct open_flags uselib_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_READ | MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };
        struct linux_binfmt *fmt;
        struct filename *name;
        struct file *file;
        int error;

        name = getname(library);
        if (IS_ERR(name))
                return PTR_ERR(name);

        file = do_filp_open(AT_FDCWD, name, &uselib_flags);
        putname(name);
        if (IS_ERR(file))
                return PTR_ERR(file);

        /*
         * Check do_open_execat() for an explanation.
         */
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
            path_noexec(&file->f_path)) {
                error = -EACCES;
                goto out_fput;
        }

        fsnotify_open(file);

        error = -ENOEXEC;

        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                /* Skip formats without a handler or whose module is going away. */
                if (!fmt->load_shlib || !try_module_get(fmt->module))
                        continue;

                /* The handler runs without binfmt_lock held. */
                read_unlock(&binfmt_lock);
                error = fmt->load_shlib(file);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);

                if (error != -ENOEXEC)
                        break;
        }
        read_unlock(&binfmt_lock);

out_fput:
        fput(file);
        return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap(), yet it can use a
 * lot of memory.  Account its pages temporarily in current->mm so that
 * oom_badness()->get_mm_rss() sees them.  Once exec succeeds or fails the
 * counter is changed back via acct_arg_size(0).
 *
 * @pages is the new total; only the difference from bprm->vma_pages is
 * added to the MM_ANONPAGES counter.  Nothing happens if current has no mm
 * or the total is unchanged.
 */
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
        struct mm_struct *cur_mm = current->mm;
        long delta = (long)(pages - bprm->vma_pages);

        if (cur_mm && delta) {
                bprm->vma_pages = pages;
                add_mm_counter(cur_mm, MM_ANONPAGES, delta);
        }
}

/*
 * get_arg_page() - pin the page of the new mm's stack that contains @pos.
 * @bprm:  exec state holding the new mm and its stack vma
 * @pos:   address within the new mm
 * @write: non-zero if the caller is going to write to the page
 *
 * Returns the page, or NULL on failure.  When @write is set the page is
 * requested with FOLL_WRITE and the argument size accounting is refreshed
 * from the current size of the stack vma.  The caller releases the page
 * with put_arg_page().
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        unsigned int gup_flags = write ? (FOLL_FORCE | FOLL_WRITE) : FOLL_FORCE;
        struct page *page;
        int ret;

#ifdef CONFIG_STACK_GROWSUP
        if (write) {
                ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
#endif

        /*
         * We are doing an exec().  'current' is the process
         * doing the exec and bprm->mm is the new process's mm.
         */
        ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
                        &page, NULL, NULL);
        if (ret <= 0)
                return NULL;

        if (write)
                acct_arg_size(bprm, vma_pages(bprm->vma));

        return page;
}

/* Release a page obtained from get_arg_page() by calling put_page() on it. */
static void put_arg_page(struct page *page)
{
        put_page(page);
}

/* Intentionally empty in the CONFIG_MMU build: there is nothing to free here. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/*
 * Flush the cache for @page, which is mapped at @pos in the stack vma of
 * the new mm, via flush_cache_page().
 */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

/*
 * __bprm_mm_init() - give the new mm a temporary one-page stack vma.
 * @bprm: exec state; bprm->mm must already be allocated
 *
 * Allocates an anonymous vma, places it at the top of the address space,
 * inserts it into the mm and points bprm->p just below its end.
 *
 * Returns 0 on success, -ENOMEM if the vma cannot be allocated, -EINTR if
 * taking the mmap lock is interrupted, or the error from
 * insert_vm_struct().  On failure bprm->vma is reset to NULL and the vma
 * is freed.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        struct mm_struct *mm = bprm->mm;
        struct vm_area_struct *vma;
        int err;

        vma = vm_area_alloc(mm);
        bprm->vma = vma;
        if (!vma)
                return -ENOMEM;
        vma_set_anonymous(vma);

        if (mmap_write_lock_killable(mm)) {
                err = -EINTR;
                goto out_free;
        }

        /*
         * Place the stack at the largest stack address the architecture
         * supports. Later, we'll move this to an appropriate place. We don't
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        err = insert_vm_struct(mm, vma);
        if (err)
                goto out_unlock;

        /* The mm now consists of exactly one page, and it is stack. */
        mm->total_vm = 1;
        mm->stack_vm = 1;
        mmap_write_unlock(mm);

        bprm->p = vma->vm_end - sizeof(void *);
        return 0;

out_unlock:
        mmap_write_unlock(mm);
out_free:
        bprm->vma = NULL;
        vm_area_free(vma);
        return err;
}

/*
 * CONFIG_MMU variant: a length is valid if it does not exceed
 * MAX_ARG_STRLEN.  @bprm is not used.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= MAX_ARG_STRLEN;
}

#else

/* !CONFIG_MMU variant: intentionally empty, no accounting is done. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

/*
 * get_arg_page() - !CONFIG_MMU variant: look up the page covering @pos.
 * @bprm:  exec state holding the page[] array
 * @pos:   byte offset into the argument area
 * @write: non-zero to allocate the page if it does not exist yet
 *
 * Returns the page stored in bprm->page[] for @pos.  If the slot is empty
 * and @write is set, a zeroed page is allocated and stored there first.
 * Returns NULL if the slot is empty and @write is clear, or if the
 * allocation fails.
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        struct page **slot = &bprm->page[pos / PAGE_SIZE];
        struct page *fresh;

        if (*slot || !write)
                return *slot;

        fresh = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
        if (fresh)
                *slot = fresh;

        return fresh;
}

/*
 * !CONFIG_MMU variant: intentionally empty.  Pages stay in bprm->page[]
 * until free_arg_page() releases them.
 */
static void put_arg_page(struct page *page)
{
}

/*
 * free_arg_page() - release argument page @i, if present.
 * @bprm: exec state holding the page[] array
 * @i:    index into bprm->page[]
 *
 * Frees the page and clears its slot; an empty slot is left alone.
 */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
        struct page *pg = bprm->page[i];

        if (!pg)
                return;

        __free_page(pg);
        bprm->page[i] = NULL;
}

/*
 * free_arg_pages() - !CONFIG_MMU variant: release every argument page.
 * @bprm: exec state holding the page[] array
 *
 * Calls free_arg_page() for each of the MAX_ARG_PAGES slots in order.
 */
static void free_arg_pages(struct linux_binprm *bprm)
{
        int idx = 0;

        while (idx < MAX_ARG_PAGES) {
                free_arg_page(bprm, idx);
                idx++;
        }
}

/* !CONFIG_MMU variant: intentionally empty, no cache flush is done. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
}

/*
 * !CONFIG_MMU variant: there is no vma to set up.  bprm->p is set to one
 * pointer size below the end of a MAX_ARG_PAGES-page area.  Always
 * returns 0.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
        return 0;
}

/*
 * !CONFIG_MMU variant: a length is valid if it does not exceed the current
 * value of bprm->p.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * bprm_mm_init() - create the mm for the program being exec'ed.
 * @bprm: exec state to initialise
 *
 * Creates a new mm_struct and populates it with a temporary stack
 * vm_area_struct.  There is not enough context at this point to set the
 * stack flags, permissions and offset, so temporary values are used; they
 * are updated later in setup_arg_pages().
 *
 * Returns 0 on success, -ENOMEM if the mm cannot be allocated, or the error
 * from __bprm_mm_init().  On failure bprm->mm is left NULL.
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
        struct mm_struct *mm;
        int err;

        mm = mm_alloc();
        bprm->mm = mm;
        if (!mm)
                return -ENOMEM;

        /* Save current stack limit for all calculations made during exec. */
        task_lock(current->group_leader);
        bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
        task_unlock(current->group_leader);

        err = __bprm_mm_init(bprm);
        if (!err)
                return 0;

        /* Undo the allocation; the mm never became visible to anyone else. */
        bprm->mm = NULL;
        mmdrop(mm);

        return err;
}

/*
 * A user-space pointer to an array of string pointers (argv or envp).
 *
 * With CONFIG_COMPAT the array may hold compat_uptr_t entries instead of
 * native pointers; is_compat tells get_user_arg_ptr() which union member
 * to read through.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
        bool is_compat;         /* true: use ptr.compat, false: ptr.native */
#endif
        union {
                const char __user *const __user *native;
#ifdef CONFIG_COMPAT
                const compat_uptr_t __user *compat;
#endif
        } ptr;
};

/*
 * Fetch entry @nr of the user-space pointer array described by @argv.
 *
 * Returns the user pointer stored in that slot (which may be NULL), or
 * ERR_PTR(-EFAULT) when the slot itself cannot be read.  With
 * CONFIG_COMPAT, a compat array is read as compat_uptr_t and converted
 * with compat_ptr().
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
        const char __user *entry;

#ifdef CONFIG_COMPAT
        if (unlikely(argv.is_compat)) {
                compat_uptr_t raw;

                if (get_user(raw, argv.ptr.compat + nr))
                        return ERR_PTR(-EFAULT);

                return compat_ptr(raw);
        }
#endif

        if (get_user(entry, argv.ptr.native + nr))
                return ERR_PTR(-EFAULT);

        return entry;
}

/*
 * count() counts the number of strings in array ARGV.
 */
static int count(struct user_arg_ptr argv, int max)
{
        int i = 0;

        if (argv.ptr.native != NULL) {
                for (;;) {
                        const char __user *p = get_user_arg_ptr(argv, i);

                        if (!p)
                                break;

                        if (IS_ERR(p))
                                return -EFAULT;

                        if (i >= max)
                                return -E2BIG;
                        ++i;

                        if (fatal_signal_pending(current))
                                return -ERESTARTNOHAND;
                        cond_resched();
                }
        }
        return i;
}

/*
 * Count the strings in the NULL-terminated kernel array @argv.
 *
 * Returns the count (0 for a NULL array), -E2BIG once MAX_ARG_STRINGS
 * non-NULL entries have been seen and another follows, or -ERESTARTNOHAND
 * if a fatal signal is pending.
 */
static int count_strings_kernel(const char *const *argv)
{
        int nr = 0;

        if (!argv)
                return 0;

        while (argv[nr]) {
                if (nr >= MAX_ARG_STRINGS)
                        return -E2BIG;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
                nr++;
        }

        return nr;
}

/*
 * Compute bprm->argmin, the lowest value bprm->p may reach while the
 * argv/envp strings are being copied.
 *
 * Returns 0 on success, or -E2BIG when the pointer arrays alone would use
 * up the whole budget.
 */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
        unsigned long budget, ptr_bytes;

        /*
         * The argv+env strings may use 1/4 of the max stack size or 3/4 of
         * _STK_LIM, whichever is smaller.  This ensures that:
         *  - the remaining binfmt code will not run out of stack space,
         *  - the program will have a reasonable amount of stack left
         *    to work from.
         */
        budget = _STK_LIM / 4 * 3;
        budget = min(budget, bprm->rlim_stack.rlim_cur / 4);

        /*
         * Historically up to 32 pages (ARG_MAX) of argument strings have
         * been supported even with small stacks, so never go below that.
         */
        budget = max_t(unsigned long, budget, ARG_MAX);

        /*
         * The argv and envp pointers themselves also end up on the stack,
         * but they are only stored much later, when it is no longer
         * possible to signal to the parent that the child has run out of
         * stack space.  Account for them here so the failure is graceful.
         *
         * When argc is 0, reserve room for the empty string that will be
         * added (bumping argc to 1), so that confused userspace programs
         * which assume argc can never be 0 do not start processing from
         * argv[1] and walk into envp by accident.
         * See do_execveat_common().
         */
        ptr_bytes = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
        if (ptr_bytes >= budget)
                return -E2BIG;

        bprm->argmin = bprm->p - (budget - ptr_bytes);

        return 0;
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 *
 * The strings are taken from argv[argc - 1] down to argv[0], and every
 * string is copied from its end towards its start, so bprm->p moves
 * downwards by the length of each string.
 *
 * Returns 0 on success, -EFAULT if a pointer or string cannot be read from
 * user space, -E2BIG if a string fails valid_arg_len(), if (with CONFIG_MMU)
 * bprm->p would drop below bprm->argmin, or if no destination page could be
 * obtained, and -ERESTARTNOHAND if a fatal signal is pending.
 */
static int copy_strings(int argc, struct user_arg_ptr argv,
                        struct linux_binprm *bprm)
{
        struct page *kmapped_page = NULL;       /* page currently kmap()ed */
        char *kaddr = NULL;                     /* its kernel mapping */
        unsigned long kpos = 0;                 /* page-aligned pos it covers */
        int ret;

        while (argc-- > 0) {
                const char __user *str;
                int len;
                unsigned long pos;

                ret = -EFAULT;
                str = get_user_arg_ptr(argv, argc);
                if (IS_ERR(str))
                        goto out;

                /* A zero length is reported as -EFAULT. */
                len = strnlen_user(str, MAX_ARG_STRLEN);
                if (!len)
                        goto out;

                ret = -E2BIG;
                if (!valid_arg_len(bprm, len))
                        goto out;

                /* We're going to work our way backwards. */
                pos = bprm->p;
                str += len;
                bprm->p -= len;
#ifdef CONFIG_MMU
                if (bprm->p < bprm->argmin)
                        goto out;
#endif

                while (len > 0) {
                        int offset, bytes_to_copy;

                        if (fatal_signal_pending(current)) {
                                ret = -ERESTARTNOHAND;
                                goto out;
                        }
                        cond_resched();

                        /*
                         * Bytes available below pos within its page; a
                         * page-aligned pos means a whole page is available.
                         */
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;

                        bytes_to_copy = offset;
                        if (bytes_to_copy > len)
                                bytes_to_copy = len;

                        /* Step source and destination back by one chunk. */
                        offset -= bytes_to_copy;
                        pos -= bytes_to_copy;
                        str -= bytes_to_copy;
                        len -= bytes_to_copy;

                        /* Switch mappings only when the chunk lands on a new page. */
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;

                                page = get_arg_page(bprm, pos, 1);
                                if (!page) {
                                        ret = -E2BIG;
                                        goto out;
                                }

                                if (kmapped_page) {
                                        flush_kernel_dcache_page(kmapped_page);
                                        kunmap(kmapped_page);
                                        put_arg_page(kmapped_page);
                                }
                                kmapped_page = page;
                                kaddr = kmap(kmapped_page);
                                kpos = pos & PAGE_MASK;
                                flush_arg_page(bprm, kpos, kmapped_page);
                        }
                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }
                }
        }
        ret = 0;
out:
        /* Drop the last mapping on both the success and the error path. */
        if (kmapped_page) {
                flush_kernel_dcache_page(kmapped_page);
                kunmap(kmapped_page);
                put_arg_page(kmapped_page);
        }
        return ret;
}

/*
 * Copy and argument/environment string from the kernel to the processes stack.
 */
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
        int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
        unsigned long pos = bprm->p;

        if (len == 0)
                return -EFAULT;
        if (!valid_arg_len(bprm, len))
                return -E2BIG;

        /* We're going to work our way backwards. */
        arg += len;
        bprm->p -= len;
        if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
                return -E2BIG;

        while (len > 0) {
                unsigned int bytes_to_copy = min_t(unsigned int, len,
                                min_not_zero(offset_in_page(pos), PAGE_SIZE));
                struct page *page;
                char *kaddr;

                pos -= bytes_to_copy;
                arg -= bytes_to_copy;
                len -= bytes_to_copy;

                page = get_arg_page(bprm, pos, 1);
                if (!page)
                        return -E2BIG;
                kaddr = kmap_atomic(page);
                flush_arg_page(bprm, pos & PAGE_MASK, page);
                memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
                flush_kernel_dcache_page(page);
                kunmap_atomic(kaddr);
                put_arg_page(page);
        }

        return 0;
}
EXPORT_SYMBOL(copy_string_kernel);

/*
 * Copy the first @argc strings of the kernel array @argv with
 * copy_string_kernel(), last string first.
 *
 * Returns 0 on success, the negative error from copy_string_kernel(), or
 * -ERESTARTNOHAND if a fatal signal is pending after a string was copied.
 */
static int copy_strings_kernel(int argc, const char *const *argv,
                               struct linux_binprm *bprm)
{
        int idx;

        for (idx = argc - 1; idx >= 0; idx--) {
                int rc = copy_string_kernel(argv[idx], bprm);

                if (rc < 0)
                        return rc;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
        }

        return 0;
}

#ifdef CONFIG_MMU

/*
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
 *
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
 *
 * @vma is moved downwards by @shift bytes.  Returns 0 on success, -EFAULT
 * if find_vma() at the new start address does not return @vma, or -ENOMEM
 * if extending the vma or moving the page tables fails.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        struct mmu_gather tlb;

        /* Catches a shift that makes new_start wrap around past new_end. */
        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != find_vma(mm, new_start))
                return -EFAULT;

        /*
         * cover the whole range: [new_start, old_end)
         */
        if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length, false))
                return -ENOMEM;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, old_start, old_end);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start) some architectures
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others its just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb, old_start, old_end);

        /*
         * Shrink the vma to just the new range.  Always succeeds.
         */
        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

        return 0;
}

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 *
 * @bprm:             exec state; bprm->vma is the temporary stack vma, and
 *                    bprm->p, bprm->exec and bprm->loader are rebased by
 *                    the computed stack shift.
 * @stack_top:        requested top of the final stack.
 * @executable_stack: EXSTACK_ENABLE_X, EXSTACK_DISABLE_X, or anything else
 *                    to keep the VM_EXEC setting from VM_STACK_FLAGS.
 *
 * Returns 0 on success, -ENOMEM if the argument area does not fit,
 * -EINTR if taking the mmap write lock was interrupted, the error from
 * mprotect_fixup() or shift_arg_pages(), or -EFAULT if expand_stack()
 * fails.
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
        int ret;
        unsigned long stack_shift;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = bprm->vma;
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
        unsigned long stack_size;
        unsigned long stack_expand;
        unsigned long rlim_stack;

#ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size */
        stack_base = bprm->rlim_stack.rlim_max;
        if (stack_base > STACK_SIZE_MAX)
                stack_base = STACK_SIZE_MAX;

        /* Add space for stack randomization. */
        if (current->flags & PF_RANDOMIZE)
                stack_base += (STACK_RND_MASK << PAGE_SHIFT);

        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;

        stack_base = PAGE_ALIGN(stack_top - stack_base);

        stack_shift = vma->vm_start - stack_base;
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);

        /* The stack must fit between mmap_min_addr and stack_top. */
        if (unlikely(stack_top < mmap_min_addr) ||
            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                return -ENOMEM;

        stack_shift = vma->vm_end - stack_top;

        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
#endif

        /* Rebase the other addresses that point into the argument area. */
        if (bprm->loader)
                bprm->loader -= stack_shift;
        bprm->exec -= stack_shift;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vm_flags = VM_STACK_FLAGS;

        /*
         * Adjust stack execute permissions; explicitly enable for
         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
         * (arch default) otherwise.
         */
        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                vm_flags |= VM_EXEC;
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
        vm_flags |= VM_STACK_INCOMPLETE_SETUP;

        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
        if (ret)
                goto out_unlock;
        BUG_ON(prev != vma);

        if (unlikely(vm_flags & VM_EXEC)) {
                pr_warn_once("process '%pD4' started with executable stack\n",
                             bprm->file);
        }

        /* Move stack pages down in memory. */
        if (stack_shift) {
                ret = shift_arg_pages(vma, stack_shift);
                if (ret)
                        goto out_unlock;
        }

        /* mprotect_fixup is overkill to remove the temporary stack flags */
        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;

        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
         * Align this down to a page boundary as expand_stack
         * will align it up.
         */
        rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
        /*
         * Pick the address to expand to: stack_expand bytes beyond the
         * current vma, or only as far as the stack rlimit allows when the
         * full expansion would exceed it.
         */
#ifdef CONFIG_STACK_GROWSUP
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_start + rlim_stack;
        else
                stack_base = vma->vm_end + stack_expand;
#else
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_end - rlim_stack;
        else
                stack_base = vma->vm_start - stack_expand;
#endif
        current->mm->start_stack = bprm->p;
        ret = expand_stack(vma, stack_base);
        if (ret)
                ret = -EFAULT;

out_unlock:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#else

/*
 * Transfer the program arguments and environment from the holding pages
 * onto the stack. The provided stack pointer is adjusted accordingly.
 */
int transfer_args_to_stack(struct linux_binprm *bprm,
                           unsigned long *sp_location)
{
        unsigned long index, stop, sp;
        int ret = 0;

        stop = bprm->p >> PAGE_SHIFT;
        sp = *sp_location;

        for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
                unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
                char *src = kmap(bprm->page[index]) + offset;
                sp -= PAGE_SIZE - offset;
                if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
                        ret = -EFAULT;
                kunmap(bprm->page[index]);
                if (ret)
                        goto out;
        }

        bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
        *sp_location = sp;

out:
        return ret;
}
EXPORT_SYMBOL(transfer_args_to_stack);

#endif /* CONFIG_MMU */

static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
        struct file *file;
        int err;
        struct open_flags open_exec_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
                return ERR_PTR(-EINVAL);
        if (flags & AT_SYMLINK_NOFOLLOW)
                open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                open_exec_flags.lookup_flags |= LOOKUP_EMPTY;

        file = do_filp_open(fd, name, &open_exec_flags);
        if (IS_ERR(file))
                return file;

        /*
         * In the past the regular type check was here. It moved to may_open() in
         * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
         * an invariant that all non-regular files error out before we get here.
         */
        err = -EACCES;
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
            path_noexec(&file->f_path))
                goto exit;

        err = deny_write_access(file);
        if (err)
                goto exit;

        if (name->name[0] != '\0')
                fsnotify_open(file);

        return file;

exit:
        fput(file);
        return ERR_PTR(err);
}

/*
 * Open the kernel-supplied path @name for execution, resolved relative to
 * the current working directory (AT_FDCWD) with no lookup flags.
 *
 * Returns the opened file, or an ERR_PTR() carrying the error from
 * getname_kernel() or do_open_execat().
 */
struct file *open_exec(const char *name)
{
        struct filename *fname = getname_kernel(name);
        struct file *exec_file;

        if (IS_ERR(fname))
                return ERR_CAST(fname);

        exec_file = do_open_execat(AT_FDCWD, fname, 0);
        putname(fname);

        return exec_file;
}
EXPORT_SYMBOL(open_exec);

#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
    defined(CONFIG_BINFMT_ELF_FDPIC)
/*
 * Read @len bytes of @file starting at @pos into the user address @addr.
 * When at least one byte was read, flush_icache_user_range() is called
 * for the whole [addr, addr + len) range.
 *
 * Returns whatever vfs_read() returned.
 */
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
        ssize_t nread = vfs_read(file, (void __user *)addr, len, &pos);

        if (nread <= 0)
                return nread;

        flush_icache_user_range(addr, addr + len);
        return nread;
}
EXPORT_SYMBOL(read_code);
#endif

/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with exec_update_lock
 * held for writing.
 *
 * Returns 0 on success, the error from down_write_killable() if taking
 * exec_update_lock was interrupted, or -EINTR if the old mm has
 * core_state set.  In both failure cases exec_update_lock is not held on
 * return.
 */
static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct *old_mm, *active_mm;
        int ret;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        exec_mm_release(tsk, old_mm);
        if (old_mm)
                sync_mm_rss(old_mm);

        ret = down_write_killable(&tsk->signal->exec_update_lock);
        if (ret)
                return ret;

        if (old_mm) {
                /*
                 * Make sure that if there is a core dump in progress
                 * for the old mm, we get out and die instead of going
                 * through with the exec.  We must hold mmap_lock around
                 * checking core_state and changing tsk->mm.
                 */
                mmap_read_lock(old_mm);
                if (unlikely(old_mm->core_state)) {
                        mmap_read_unlock(old_mm);
                        up_write(&tsk->signal->exec_update_lock);
                        return -EINTR;
                }
        }

        task_lock(tsk);
        membarrier_exec_mmap(mm);

        local_irq_disable();
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        /*
         * This prevents preemption while active_mm is being loaded and
         * it and mm are being updated, which could cause problems for
         * lazy tlb mm refcounting when these are updated by context
         * switches. Not all architectures can handle irqs off over
         * activate_mm yet.
         */
        if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        activate_mm(active_mm, mm);
        if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        tsk->mm->vmacache_seqnum = 0;
        vmacache_flush(tsk);
        task_unlock(tsk);
        if (old_mm) {
                /* Pairs with the mmap_read_lock() taken above. */
                mmap_read_unlock(old_mm);
                BUG_ON(active_mm != old_mm);
                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
        /* No old mm: only the previously active mm has to be dropped. */
        mmdrop(active_mm);
        return 0;
}

/*
 * Make @tsk the only thread in its thread group and its leader.
 *
 * All other threads are zapped and waited for.  If @tsk is not the group
 * leader, it additionally waits for the leader to exit, takes over the
 * leader's pids and start times, and releases the old leader.
 *
 * Returns 0 on success, or -EAGAIN if a group exit is already in progress
 * or a fatal signal is pending for @tsk while it waits.
 */
static int de_thread(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;

        if (thread_group_empty(tsk))
                goto no_thread_group;

        /*
         * Kill all other threads in the thread group.
         */
        spin_lock_irq(lock);
        if (signal_group_exit(sig)) {
                /*
                 * Another group action in progress, just
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
                return -EAGAIN;
        }

        sig->group_exit_task = tsk;
        sig->notify_count = zap_other_threads(tsk);
        /* A non-leader does not wait for the leader in the loop below. */
        if (!thread_group_leader(tsk))
                sig->notify_count--;

        /* Sleep (killable) until notify_count has dropped to zero. */
        while (sig->notify_count) {
                __set_current_state(TASK_KILLABLE);
                spin_unlock_irq(lock);
                schedule();
                if (__fatal_signal_pending(tsk))
                        goto killed;
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);

        /*
         * At this point all other threads have exited, all we have to
         * do is to wait for the thread group leader to become inactive,
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
                struct task_struct *leader = tsk->group_leader;

                /*
                 * The loop is left via 'break' with tasklist_lock held for
                 * writing and the cgroup threadgroup change still open;
                 * both are released after the pid exchange below.
                 */
                for (;;) {
                        cgroup_threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
                        /*
                         * Do this under tasklist_lock to ensure that
                         * exit_notify() can't miss ->group_exit_task
                         */
                        sig->notify_count = -1;
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
                        write_unlock_irq(&tasklist_lock);
                        cgroup_threadgroup_change_end(tsk);
                        schedule();
                        if (__fatal_signal_pending(tsk))
                                goto killed;
                }

                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
                 * All the past CPU time is accumulated in signal_struct
                 * from sister threads now dead.  But in this non-leader
                 * exec, nothing survives from the original leader thread,
                 * whose birth marks the true age of this process now.
                 * When we take on its identity by switching to its PID, we
                 * also take its birthdate (always earlier than our own).
                 */
                tsk->start_time = leader->start_time;
                tsk->start_boottime = leader->start_boottime;

                BUG_ON(!same_thread_group(leader, tsk));
                /*
                 * An exec() starts a new thread group with the
                 * TGID of the previous thread group. Rehash the
                 * two threads with a switched PID, and release
                 * the former thread group leader:
                 */

                /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                 */
                exchange_tids(tsk, leader);
                transfer_pid(leader, tsk, PIDTYPE_TGID);
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);

                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);

                tsk->group_leader = tsk;
                leader->group_leader = tsk;

                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;

                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;

                /*
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
                cgroup_threadgroup_change_end(tsk);

                release_task(leader);
        }

        sig->group_exit_task = NULL;
        sig->notify_count = 0;

no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;

        BUG_ON(!thread_group_leader(tsk));
        return 0;

killed:
        /* protects against exit_notify() and __exit_signal() */
        read_lock(&tasklist_lock);
        sig->group_exit_task = NULL;
        sig->notify_count = 0;
        read_unlock(&tasklist_lock);
        return -EAGAIN;
}


/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
 *
 * Returns 0 on success (including when the table was not shared), or
 * -ENOMEM if a private copy could not be allocated.
 */
static int unshare_sighand(struct task_struct *me)
{
        struct sighand_struct *shared = me->sighand;
        struct sighand_struct *fresh;

        /* Sole user of the table already: nothing to do. */
        if (refcount_read(&shared->count) == 1)
                return 0;

        /*
         * This ->sighand is shared with the CLONE_SIGHAND
         * but not CLONE_THREAD task, switch to the new one.
         */
        fresh = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        refcount_set(&fresh->count, 1);

        /* Copy the actions and publish the new table under both locks. */
        write_lock_irq(&tasklist_lock);
        spin_lock(&shared->siglock);
        memcpy(fresh->action, shared->action, sizeof(fresh->action));
        rcu_assign_pointer(me->sighand, fresh);
        spin_unlock(&shared->siglock);
        write_unlock_irq(&tasklist_lock);

        __cleanup_sighand(shared);

        return 0;
}

/*
 * Copy the command name of @tsk into @buf, which is @buf_size bytes long,
 * while holding the task lock.
 *
 * The result is always NUL-terminated when @buf_size is non-zero, and any
 * space after the name is zero-filled.  A name that does not fit is
 * truncated.  Returns @buf.
 */
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        task_lock(tsk);
        /* strncpy() zero-pads the tail but does not terminate on truncation. */
        strncpy(buf, tsk->comm, buf_size);
        if (buf_size)
                buf[buf_size - 1] = '\0';
        task_unlock(tsk);
        return buf;
}
EXPORT_SYMBOL_GPL(__get_task_comm);

/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */

/*
 * Set the command name of @tsk to @buf, truncated to fit tsk->comm.
 *
 * The name is replaced under the task lock; trace_task_rename() is called
 * before tsk->comm is overwritten.  After the lock is dropped,
 * perf_event_comm() is called with @exec passed through unchanged.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
        task_lock(tsk);
        trace_task_rename(tsk, buf);
        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
        task_unlock(tsk);
        perf_event_comm(tsk, exec);
}

/*
 * Calling this is the point of no return. None of the failures will be
 * seen by userspace since either the process is already taking a fatal
 * signal (via de_thread() or coredump), or will have SEGV raised
 * (after exec_mmap()) by bprm_execve() (see below).
 *
 * Returns 0 on success or the error of the step that failed.  Only a
 * failure of bprm_creds_from_file() is returned before
 * bprm->point_of_no_return is set; every later error is fatal.
 */
int begin_new_exec(struct linux_binprm * bprm)
{
        struct task_struct *me = current;
        int retval;

        /* Once we are committed compute the creds */
        retval = bprm_creds_from_file(bprm);
        if (retval)
                return retval;

        /*
         * Ensure all future errors are fatal.
         */
        bprm->point_of_no_return = true;

        /*
         * Make this the only thread in the thread group.
         */
        retval = de_thread(me);
        if (retval)
                goto out;

        /* Ensure the files table is not shared. */
        retval = unshare_files();
        if (retval)
                goto out;

        /*
         * Must be called _before_ exec_mmap() as bprm->mm is
         * not visible until then. This also enables the update
         * to be lockless.
         */
        set_mm_exe_file(bprm->mm, bprm->file);

        /* If the binary is not readable then enforce mm->dumpable=0 */
        would_dump(bprm, bprm->file);
        if (bprm->have_execfd)
                would_dump(bprm, bprm->executable);

        /*
         * Release all of the old mmap stuff
         */
        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;

        /* Cleared so that free_bprm() does not mmput() the mm just installed. */
        bprm->mm = NULL;

#ifdef CONFIG_POSIX_TIMERS
        spin_lock_irq(&me->sighand->siglock);
        posix_cpu_timers_exit(me);
        spin_unlock_irq(&me->sighand->siglock);
        exit_itimers(me);
        flush_itimer_signals();
#endif

        /*
         * Make the signal table private.
         */
        retval = unshare_sighand(me);
        if (retval)
                goto out_unlock;

        /*
         * Ensure that the uaccess routines can actually operate on userspace
         * pointers:
         */
        force_uaccess_begin();

        me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
        me->personality &= ~bprm->per_clear;

        /*
         * We have to apply CLOEXEC before we change whether the process is
         * dumpable (see set_dumpable() below) to avoid a race with a process
         * in userspace trying to access the should-be-closed file descriptors
         * of a process undergoing exec(2).
         */
        do_close_on_exec(me->files);

        if (bprm->secureexec) {
                /* Make sure parent cannot signal privileged process. */
                me->pdeath_signal = 0;

                /*
                 * For secureexec, reset the stack limit to sane default to
                 * avoid bad behavior from the prior rlimits. This has to
                 * happen before arch_pick_mmap_layout(), which examines
                 * RLIMIT_STACK, but after the point of no return to avoid
                 * needing to clean up the change on failure.
                 */
                if (bprm->rlim_stack.rlim_cur > _STK_LIM)
                        bprm->rlim_stack.rlim_cur = _STK_LIM;
        }

        /* Forget any alternate signal stack of the old image. */
        me->sas_ss_sp = me->sas_ss_size = 0;

        /*
         * Figure out dumpability. Note that this checking only of current
         * is wrong, but userspace depends on it. This should be testing
         * bprm->secureexec instead.
         */
        if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
            !(uid_eq(current_euid(), current_uid()) &&
              gid_eq(current_egid(), current_gid())))
                set_dumpable(current->mm, suid_dumpable);
        else
                set_dumpable(current->mm, SUID_DUMP_USER);

        perf_event_exec();
        /* The new command name is the last path component of bprm->filename. */
        __set_task_comm(me, kbasename(bprm->filename), true);

        /* An exec changes our domain. We are no longer part of the thread
           group */
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
        flush_signal_handlers(me, 0);

        /*
         * install the new credentials for this executable
         */
        security_bprm_committing_creds(bprm);

        commit_creds(bprm->cred);
        /* From here on free_bprm() no longer sees ->cred (see out_unlock). */
        bprm->cred = NULL;

        /*
         * Disable monitoring for regular users
         * when executing setuid binaries. Must
         * wait until new credentials are committed
         * by commit_creds() above
         */
        if (get_dumpable(me->mm) != SUID_DUMP_USER)
                perf_event_exit_task(me);
        /*
         * cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);

        /* Pass the opened binary to the interpreter. */
        if (bprm->have_execfd) {
                retval = get_unused_fd_flags(0);
                if (retval < 0)
                        goto out_unlock;
                fd_install(retval, bprm->executable);
                /* Cleared so that free_bprm() does not fput() it. */
                bprm->executable = NULL;
                bprm->execfd = retval;
        }
        return 0;

out_unlock:
        up_write(&me->signal->exec_update_lock);
        /*
         * free_bprm() drops cred_guard_mutex only while bprm->cred is set.
         * Once the creds have been committed (->cred == NULL) the mutex has
         * to be released here instead.
         */
        if (!bprm->cred)
                mutex_unlock(&me->signal->cred_guard_mutex);

out:
        return retval;
}
EXPORT_SYMBOL(begin_new_exec);

/*
 * If the caller may not read @file, flag the exec as non-dumpable and make
 * sure bprm->mm->user_ns is a namespace that is privileged over the file's
 * inode (walking up towards init_user_ns as far as necessary).
 */
void would_dump(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode = file_inode(file);
        struct user_namespace *prev_ns, *ns;

        /* A readable binary needs no special treatment. */
        if (inode_permission(inode, MAY_READ) >= 0)
                return;

        bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;

        /* Ensure mm->user_ns contains the executable */
        prev_ns = bprm->mm->user_ns;
        for (ns = prev_ns;
             ns != &init_user_ns && !privileged_wrt_inode_uidgid(ns, inode);
             ns = ns->parent)
                ;

        /* Already suitable: keep the reference we have. */
        if (ns == prev_ns)
                return;

        /* Take the new reference before dropping the old one. */
        bprm->mm->user_ns = get_user_ns(ns);
        put_user_ns(prev_ns);
}
EXPORT_SYMBOL(would_dump);

/*
 * Finish the personality-dependent setup of the new image for current and
 * then release exec_update_lock and cred_guard_mutex.
 */
void setup_new_exec(struct linux_binprm * bprm)
{
        /* Setup things that can depend upon the personality */
        struct task_struct *me = current;

        /* Uses the stack rlimit recorded in @bprm. */
        arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);

        arch_setup_new_exec();

        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
         * some architectures like powerpc
         */
        me->mm->task_size = TASK_SIZE;
        /* Same two locks that begin_new_exec()'s out_unlock path releases. */
        up_write(&me->signal->exec_update_lock);
        mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);

/*
 * Runs immediately before start_thread() takes over.
 *
 * Copies bprm->rlim_stack into the RLIMIT_STACK slot of current's signal
 * struct, holding task_lock() on the group leader around the store.
 */
void finalize_exec(struct linux_binprm *bprm)
{
        /* Store any stack rlimit changes before starting thread. */
        task_lock(current->group_leader);
        current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
        task_unlock(current->group_leader);
}
EXPORT_SYMBOL(finalize_exec);

/*
 * Lock ->cred_guard_mutex and prepare the credentials for the new program.
 *
 * On success the mutex stays held and bprm->cred is set: begin_new_exec()
 * commits the new creds and setup_new_exec() drops the lock.  If exec
 * fails before the creds are committed, free_bprm() aborts ->cred and
 * unlocks.
 *
 * Returns 0, -ERESTARTNOINTR if taking the mutex was interrupted, or
 * -ENOMEM if no credentials could be prepared (mutex released again).
 */
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                return -ERESTARTNOINTR;

        bprm->cred = prepare_exec_creds();
        if (unlikely(!bprm->cred)) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Release whatever @bprm still owns and then @bprm itself.
 *
 * Each resource is released only if its pointer is still set; the exec
 * path clears a field (->mm, ->cred, ->executable) once it has handed the
 * resource on.  Also used by alloc_bprm() to undo a partial setup.
 */
static void free_bprm(struct linux_binprm *bprm)
{
        if (bprm->mm) {
                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
        }
        free_arg_pages(bprm);
        /* Creds not committed: drop the guard mutex and discard them. */
        if (bprm->cred) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        if (bprm->file) {
                allow_write_access(bprm->file);
                fput(bprm->file);
        }
        if (bprm->executable)
                fput(bprm->executable);
        /* If a binfmt changed the interp, free it. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        kfree(bprm->fdpath);
        kfree(bprm);
}

/*
 * Allocate a zeroed linux_binprm and set up its name fields and mm.
 *
 * With AT_FDCWD or an absolute path, bprm->filename aliases filename->name.
 * Otherwise a "/dev/fd/<fd>" or "/dev/fd/<fd>/<name>" string is allocated
 * into bprm->fdpath and used as the filename.  bprm->interp starts out as
 * an alias of bprm->filename.
 *
 * Returns the new bprm, or ERR_PTR(-ENOMEM) / ERR_PTR(error from
 * bprm_mm_init()) on failure.
 */
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
{
        struct linux_binprm *bprm;
        int retval;

        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm)
                return ERR_PTR(-ENOMEM);

        if (fd == AT_FDCWD || filename->name[0] == '/') {
                bprm->filename = filename->name;
        } else {
                /* Empty name: the fd itself is the file to execute. */
                bprm->fdpath = filename->name[0] == '\0' ?
                        kasprintf(GFP_KERNEL, "/dev/fd/%d", fd) :
                        kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
                                  fd, filename->name);
                if (!bprm->fdpath) {
                        retval = -ENOMEM;
                        goto out_free;
                }

                bprm->filename = bprm->fdpath;
        }
        bprm->interp = bprm->filename;

        retval = bprm_mm_init(bprm);
        if (!retval)
                return bprm;

out_free:
        free_bprm(bprm);
        return ERR_PTR(retval);
}

/*
 * Replace bprm->interp with a freshly allocated copy of @interp.
 *
 * Returns 0 on success or -ENOMEM, in which case bprm->interp is NULL.
 */
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
        /*
         * Free an interp installed by an earlier binfmt, but never the
         * string that is merely an alias of bprm->filename.
         */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);

        bprm->interp = kstrdup(interp, GFP_KERNEL);
        return bprm->interp ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(bprm_change_interp);

/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 *
 * Sets LSM_UNSAFE_* bits in bprm->unsafe; when the fs struct is not shared
 * outside the thread group, marks it ->in_exec instead of setting
 * LSM_UNSAFE_SHARE.
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
        struct task_struct *p = current, *t;
        unsigned n_fs;

        if (p->ptrace)
                bprm->unsafe |= LSM_UNSAFE_PTRACE;

        /*
         * This isn't strictly necessary, but it makes it harder for LSMs to
         * mess up.
         */
        if (task_no_new_privs(current))
                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

        /*
         * Count the users of p->fs inside our own thread group: one for p
         * itself plus every other thread whose ->fs is the same struct.
         */
        t = p;
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
        while_each_thread(p, t) {
                if (t->fs == p->fs)
                        n_fs++;
        }
        rcu_read_unlock();

        /* More users than our own threads: someone else shares the fs. */
        if (p->fs->users > n_fs)
                bprm->unsafe |= LSM_UNSAFE_SHARE;
        else
                p->fs->in_exec = 1;
        spin_unlock(&p->fs->lock);
}

/*
 * Apply the setuid/setgid bits of @file to bprm->cred (euid/egid) and
 * record PER_CLEAR_ON_SETID in bprm->per_clear when either is honoured.
 *
 * Does nothing when the mount forbids suid, the task has no_new_privs set,
 * the inode carries neither bit, exec permission has vanished, or the
 * file's uid/gid has no mapping in the creds' user namespace.
 */
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
        /* Handle suid and sgid on files */
        struct inode *inode;
        unsigned int mode;
        kuid_t uid;
        kgid_t gid;
        int err;

        if (!mnt_may_suid(file->f_path.mnt))
                return;

        if (task_no_new_privs(current))
                return;

        /* Unlocked peek first: the common case has neither bit set. */
        inode = file->f_path.dentry->d_inode;
        mode = READ_ONCE(inode->i_mode);
        if (!(mode & (S_ISUID|S_ISGID)))
                return;

        /* Be careful if suid/sgid is set */
        inode_lock(inode);

        /* Atomically reload and check mode/uid/gid now that lock held. */
        mode = inode->i_mode;
        uid = inode->i_uid;
        gid = inode->i_gid;
        err = inode_permission(inode, MAY_EXEC);
        inode_unlock(inode);

        /* Did the exec bit vanish out from under us? Give up. */
        if (err)
                return;

        /* We ignore suid/sgid if there are no mappings for them in the ns */
        if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
                 !kgid_has_mapping(bprm->cred->user_ns, gid))
                return;

        if (mode & S_ISUID) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->euid = uid;
        }

        /* setgid is honoured only together with group-execute. */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->egid = gid;
        }
}

/*
 * Compute bprm->cred based upon the final binary.
 *
 * The credentials come from bprm->executable when bprm->execfd_creds is
 * set, otherwise from bprm->file.  Returns the result of
 * security_bprm_creds_from_file().
 */
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
        struct file *file;

        /* Compute creds based on which file? */
        if (bprm->execfd_creds)
                file = bprm->executable;
        else
                file = bprm->file;

        bprm_fill_uid(bprm, file);

        return security_bprm_creds_from_file(bprm, file);
}

/*
 * Fill the binprm structure from the inode.
 * Read the first BINPRM_BUF_SIZE bytes
 *
 * This may be called multiple times for binary chains (scripts for example).
 *
 * Returns whatever kernel_read() returns; search_binary_handler() treats a
 * negative value as an error.
 */
static int prepare_binprm(struct linux_binprm *bprm)
{
        loff_t pos = 0;

        /* Zero first, so a short read leaves the rest of the buffer zeroed. */
        memset(bprm->buf, 0, BINPRM_BUF_SIZE);
        return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
}

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 *
 * Returns 0 on success (also when bprm->argc is already 0, in which case
 * nothing is changed) or -EFAULT if an argument page cannot be obtained.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
        int ret = 0;
        unsigned long offset;
        char *kaddr;
        struct page *page;

        if (!bprm->argc)
                return 0;

        /* Scan one page at a time until the terminating '\0' is found. */
        do {
                offset = bprm->p & ~PAGE_MASK;
                page = get_arg_page(bprm, bprm->p, 0);
                if (!page) {
                        ret = -EFAULT;
                        goto out;
                }
                kaddr = kmap_atomic(page);

                /* Advance bprm->p in step with offset up to '\0' or page end. */
                for (; offset < PAGE_SIZE && kaddr[offset];
                                offset++, bprm->p++)
                        ;

                kunmap_atomic(kaddr);
                put_arg_page(page);
        } while (offset == PAGE_SIZE);  /* ran off the page: continue on the next */

        /* Step over the '\0' itself and account for the removed argument. */
        bprm->p++;
        bprm->argc--;
        ret = 0;

out:
        return ret;
}
EXPORT_SYMBOL(remove_arg_zero);

/*
 * True for TAB, LF and the ASCII range 0x20..0x7e.
 * Note: @c is evaluated more than once.
 */
#define printable(c) \
        ((c) == '\t' || (c) == '\n' || ((c) >= 0x20 && (c) <= 0x7e))
/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Returns the result of the first handler that does not answer -ENOEXEC
 * (or of any handler once bprm->point_of_no_return is set), an error from
 * prepare_binprm()/security_bprm_check(), or -ENOENT/-ENOEXEC when no
 * handler accepted the image.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
        /* A module-load retry is only possible with CONFIG_MODULES. */
        bool need_retry = IS_ENABLED(CONFIG_MODULES);
        struct linux_binfmt *fmt;
        int retval;

        retval = prepare_binprm(bprm);
        if (retval < 0)
                return retval;

        retval = security_bprm_check(bprm);
        if (retval)
                return retval;

        retval = -ENOENT;
 retry:
        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                /* Skip handlers whose module reference cannot be taken. */
                if (!try_module_get(fmt->module))
                        continue;
                /* binfmt_lock is dropped while the handler runs. */
                read_unlock(&binfmt_lock);

                retval = fmt->load_binary(bprm);

                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
                        read_unlock(&binfmt_lock);
                        return retval;
                }
        }
        read_unlock(&binfmt_lock);

        if (need_retry) {
                /* Four printable leading bytes: do not try to load a module. */
                if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
                    printable(bprm->buf[2]) && printable(bprm->buf[3]))
                        return retval;
                /* Module name is derived from bytes 2..3 of the header. */
                if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
                        return retval;
                /* Retry the handler list once only. */
                need_retry = false;
                goto retry;
        }

        return retval;
}

/*
 * Run the binfmt handlers on @bprm, following interpreter substitutions,
 * and emit the post-exec notifications (audit, tracepoint, ptrace event,
 * proc connector) on success.
 *
 * Returns 0 on success, -ELOOP when the interpreter chain is too deep,
 * -ENOEXEC when a second interpreter is requested while an execfd
 * executable is already recorded, or the error from
 * search_binary_handler().
 */
static int exec_binprm(struct linux_binprm *bprm)
{
        pid_t old_pid, old_vpid;
        int ret, depth;

        /* Need to fetch pid before load_binary changes it */
        old_pid = current->pid;
        rcu_read_lock();
        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();

        /*
         * Bound the chain of binfmt rewrites: search_binary_handler() runs
         * for depth 0..5 (at most six passes) before failing hard with
         * -ELOOP.
         */
        for (depth = 0;; depth++) {
                struct file *exec;
                if (depth > 5)
                        return -ELOOP;

                ret = search_binary_handler(bprm);
                if (ret < 0)
                        return ret;
                /* No interpreter requested: the image has been loaded. */
                if (!bprm->interpreter)
                        break;

                /* Make the interpreter the file to load on the next pass. */
                exec = bprm->file;
                bprm->file = bprm->interpreter;
                bprm->interpreter = NULL;

                allow_write_access(exec);
                if (unlikely(bprm->have_execfd)) {
                        /* Only one original executable can be kept. */
                        if (bprm->executable) {
                                fput(exec);
                                return -ENOEXEC;
                        }
                        bprm->executable = exec;
                } else
                        fput(exec);
        }

        audit_bprm(bprm);
        trace_sched_process_exec(current, old_pid, bprm);
        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
        proc_exec_connector(current);
        return 0;
}

/*
 * sys_execve() executes a new program.
 *
 * Takes a @bprm prepared by the caller, opens the file to execute and runs
 * exec_binprm() on it.  Returns the non-negative result of exec_binprm()
 * on success or a negative errno.  @bprm is not freed here; the callers in
 * this file call free_bprm() afterwards.
 */
static int bprm_execve(struct linux_binprm *bprm,
                       int fd, struct filename *filename, int flags)
{
        struct file *file;
        int retval;

        /*
         * Cancel any io_uring activity across execve
         */
        io_uring_task_cancel();

        /* On success cred_guard_mutex is held and bprm->cred is set. */
        retval = prepare_bprm_creds(bprm);
        if (retval)
                return retval;

        check_unsafe_exec(bprm);
        current->in_execve = 1;

        file = do_open_execat(fd, filename, flags);
        retval = PTR_ERR(file);
        if (IS_ERR(file))
                goto out_unmark;

        sched_exec();

        bprm->file = file;
        /*
         * Record that a name derived from an O_CLOEXEC fd will be
         * inaccessible after exec.  This allows the code in exec to
         * choose to fail when the executable is not mmaped into the
         * interpreter and an open file descriptor is not passed to
         * the interpreter.  This makes for a better user experience
         * than having the interpreter start and then immediately fail
         * when it finds the executable is inaccessible.
         */
        if (bprm->fdpath && get_close_on_exec(fd))
                bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;

        /* Set the unchanging part of bprm->cred */
        retval = security_bprm_creds_for_exec(bprm);
        if (retval)
                goto out;

        retval = exec_binprm(bprm);
        if (retval < 0)
                goto out;

        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
        rseq_execve(current);
        acct_update_integrals(current);
        task_numa_free(current, false);
        return retval;

out:
        /*
         * If past the point of no return ensure that the code never
         * returns to the userspace process.  Use an existing fatal
         * signal if present otherwise terminate the process with
         * SIGSEGV.
         */
        if (bprm->point_of_no_return && !fatal_signal_pending(current))
                force_sigsegv(SIGSEGV);

out_unmark:
        /* Undo the marks set by check_unsafe_exec() and above. */
        current->fs->in_exec = 0;
        current->in_execve = 0;

        return retval;
}

/*
 * Common implementation behind the native and compat execve()/execveat()
 * wrappers in this file: build a linux_binprm from the user-supplied
 * @argv/@envp and hand it to bprm_execve().
 *
 * @filename may be an ERR_PTR, in which case its error is returned
 * directly.  Otherwise @filename is released with putname() on every path.
 * Returns the result of bprm_execve() or a negative errno from the setup
 * steps.
 */
static int do_execveat_common(int fd, struct filename *filename,
                              struct user_arg_ptr argv,
                              struct user_arg_ptr envp,
                              int flags)
{
        struct linux_binprm *bprm;
        int retval;

        if (IS_ERR(filename))
                return PTR_ERR(filename);

        /*
         * We move the actual failure in case of RLIMIT_NPROC excess from
         * set*uid() to execve() because too many poorly written programs
         * don't check setuid() return code.  Here we additionally recheck
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
            atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
                retval = -EAGAIN;
                goto out_ret;
        }

        /* We're below the limit (still or again), so we don't want to make
         * further execve() calls fail. */
        current->flags &= ~PF_NPROC_EXCEEDED;

        bprm = alloc_bprm(fd, filename);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        /* An empty argv is only warned about here; it is patched up below. */
        retval = count(argv, MAX_ARG_STRINGS);
        if (retval == 0)
                pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
                             current->comm, bprm->filename);
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count(envp, MAX_ARG_STRINGS);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Strings are copied in the order filename, envp, argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        /* Remember bprm->p as it stands right after the filename copy. */
        bprm->exec = bprm->p;

        retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        /*
         * When argv is empty, add an empty string ("") as argv[0] to
         * ensure confused userspace programs that start processing
         * from argv[1] won't end up walking envp. See also
         * bprm_stack_limits().
         */
        if (bprm->argc == 0) {
                retval = copy_string_kernel("", bprm);
                if (retval < 0)
                        goto out_free;
                bprm->argc = 1;
        }

        retval = bprm_execve(bprm, fd, filename, flags);
out_free:
        free_bprm(bprm);

out_ret:
        putname(filename);
        return retval;
}

/*
 * Execute @kernel_filename with kernel-space @argv and @envp string
 * arrays.
 *
 * Mirrors do_execveat_common() but uses the *_kernel string helpers,
 * always resolves relative to AT_FDCWD and passes no flags.  Unlike the
 * user path, an empty @argv is rejected with -EINVAL (and a one-time
 * warning) rather than being patched up.
 *
 * Returns the result of bprm_execve() or a negative errno.
 */
int kernel_execve(const char *kernel_filename,
                  const char *const *argv, const char *const *envp)
{
        struct filename *filename;
        struct linux_binprm *bprm;
        int fd = AT_FDCWD;
        int retval;

        filename = getname_kernel(kernel_filename);
        if (IS_ERR(filename))
                return PTR_ERR(filename);

        bprm = alloc_bprm(fd, filename);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        retval = count_strings_kernel(argv);
        if (WARN_ON_ONCE(retval == 0))
                retval = -EINVAL;
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count_strings_kernel(envp);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Strings are copied in the order filename, envp, argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        /* Remember bprm->p as it stands right after the filename copy. */
        bprm->exec = bprm->p;

        retval = copy_strings_kernel(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings_kernel(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        retval = bprm_execve(bprm, fd, filename, 0);
out_free:
        free_bprm(bprm);
out_ret:
        putname(filename);
        return retval;
}

/*
 * Native execve(): wrap the user pointers as native user_arg_ptr values
 * and run do_execveat_common() relative to AT_FDCWD with no flags.
 */
static int do_execve(struct filename *filename,
        const char __user *const __user *__argv,
        const char __user *const __user *__envp)
{
        struct user_arg_ptr argv = { .ptr.native = __argv };
        struct user_arg_ptr envp = { .ptr.native = __envp };
        return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Native execveat(): like do_execve() but passes the caller's @fd and
 * @flags through to do_execveat_common().
 */
static int do_execveat(int fd, struct filename *filename,
                const char __user *const __user *__argv,
                const char __user *const __user *__envp,
                int flags)
{
        struct user_arg_ptr argv = { .ptr.native = __argv };
        struct user_arg_ptr envp = { .ptr.native = __envp };

        return do_execveat_common(fd, filename, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Compat execve(): wrap the compat_uptr_t arrays as user_arg_ptr values
 * marked ->is_compat and run do_execveat_common() relative to AT_FDCWD
 * with no flags.
 */
static int compat_do_execve(struct filename *filename,
        const compat_uptr_t __user *__argv,
        const compat_uptr_t __user *__envp)
{
        struct user_arg_ptr argv = {
                .is_compat = true,
                .ptr.compat = __argv,
        };
        struct user_arg_ptr envp = {
                .is_compat = true,
                .ptr.compat = __envp,
        };
        return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Compat execveat(): like compat_do_execve() but passes the caller's @fd
 * and @flags through to do_execveat_common().
 */
static int compat_do_execveat(int fd, struct filename *filename,
                              const compat_uptr_t __user *__argv,
                              const compat_uptr_t __user *__envp,
                              int flags)
{
        struct user_arg_ptr argv = {
                .is_compat = true,
                .ptr.compat = __argv,
        };
        struct user_arg_ptr envp = {
                .is_compat = true,
                .ptr.compat = __envp,
        };
        return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif

/*
 * Make @new the binary format of current->mm, moving the module reference
 * from the previously installed format (if any) to @new (if non-NULL).
 */
void set_binfmt(struct linux_binfmt *new)
{
        struct mm_struct *mm = current->mm;
        struct linux_binfmt *old = mm->binfmt;

        /* Drop the reference held on behalf of the old format. */
        if (old)
                module_put(old->module);

        mm->binfmt = new;
        if (!new)
                return;

        __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
 *
 * A @value above SUID_DUMP_ROOT (including any negative value, via the
 * unsigned compare) triggers a warning and leaves mm->flags untouched.
 */
void set_dumpable(struct mm_struct *mm, int value)
{
        if (!WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}

/*
 * execve() system call entry point.  The user-space path is turned into a
 * struct filename by getname(); a failed lookup reaches
 * do_execveat_common() as an ERR_PTR and is reported from there.
 */
SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp)
{
        return do_execve(getname(filename), argv, envp);
}

/*
 * execveat() system call entry point.  AT_EMPTY_PATH in @flags is
 * translated to LOOKUP_EMPTY for the name lookup; @flags itself is passed
 * on unchanged.
 */
SYSCALL_DEFINE5(execveat,
                int, fd, const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp,
                int, flags)
{
        int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

        return do_execveat(fd,
                           getname_flags(filename, lookup_flags, NULL),
                           argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/* Compat execve() entry point: same as execve() but with compat argv/envp. */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
        const compat_uptr_t __user *, argv,
        const compat_uptr_t __user *, envp)
{
        return compat_do_execve(getname(filename), argv, envp);
}

/*
 * Compat execveat() entry point: same as execveat() but with compat
 * argv/envp.  AT_EMPTY_PATH in @flags is translated to LOOKUP_EMPTY for
 * the name lookup.
 */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                       const char __user *, filename,
                       const compat_uptr_t __user *, argv,
                       const compat_uptr_t __user *, envp,
                       int,  flags)
{
        int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

        return compat_do_execveat(fd,
                                  getname_flags(filename, lookup_flags, NULL),
                                  argv, envp, flags);
}
#endif

























































    1 










    1 












    1 






    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * pages.  A pagevec is a multipage container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/xarray.h>

/* 15 pointers + header align the pagevec structure to a power of two */
#define PAGEVEC_SIZE        15

struct page;
struct address_space;

/*
 * Fixed-capacity batch of page pointers.  The helpers below treat
 * pages[0..nr-1] as the entries in use.
 */
struct pagevec {
        unsigned char nr;               /* number of entries in use */
        bool percpu_pvec_drained;       /* reset to false by pagevec_init() */
        struct page *pages[PAGEVEC_SIZE];
};

void __pagevec_release(struct pagevec *pvec);
void __pagevec_lru_add(struct pagevec *pvec);
unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                struct address_space *mapping,
                                pgoff_t start, unsigned nr_entries,
                                pgoff_t *indices);
void pagevec_remove_exceptionals(struct pagevec *pvec);
unsigned pagevec_lookup_range(struct pagevec *pvec,
                              struct address_space *mapping,
                              pgoff_t *start, pgoff_t end);
/*
 * Unbounded form of pagevec_lookup_range(): passes (pgoff_t)-1, the
 * largest pgoff_t value, as the end of the range.
 */
static inline unsigned pagevec_lookup(struct pagevec *pvec,
                                      struct address_space *mapping,
                                      pgoff_t *start)
{
        return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
}

unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag);
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag, unsigned max_pages);
/*
 * Unbounded form of pagevec_lookup_range_tag(): passes (pgoff_t)-1, the
 * largest pgoff_t value, as the end of the range.
 */
static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, xa_mark_t tag)
{
        return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
}

/*
 * Put @pvec into its initial state: no entries in use and the
 * percpu_pvec_drained flag cleared.  pages[] itself is not touched.
 */
static inline void pagevec_init(struct pagevec *pvec)
{
        pvec->percpu_pvec_drained = false;
        pvec->nr = 0;
}

/*
 * Mark @pvec empty again.  Unlike pagevec_init() this leaves
 * percpu_pvec_drained as it is.
 */
static inline void pagevec_reinit(struct pagevec *pvec)
{
        pvec->nr = 0;
}

/* Number of entries currently in use in @pvec. */
static inline unsigned pagevec_count(struct pagevec *pvec)
{
        return pvec->nr;
}

/* Number of free slots left in @pvec (PAGEVEC_SIZE minus entries in use). */
static inline unsigned pagevec_space(struct pagevec *pvec)
{
        return PAGEVEC_SIZE - pvec->nr;
}

/*
 * Add a page to a pagevec.  Returns the number of slots still available.
 *
 * No capacity check is made here; a return value of 0 tells the caller
 * that the pagevec is now full.
 */
static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
{
        unsigned char slot = pvec->nr;

        pvec->pages[slot] = page;
        pvec->nr = slot + 1;

        return pagevec_space(pvec);
}

/* Call __pagevec_release() on @pvec unless it holds no entries. */
static inline void pagevec_release(struct pagevec *pvec)
{
        if (!pagevec_count(pvec))
                return;

        __pagevec_release(pvec);
}

#endif /* _LINUX_PAGEVEC_H */


























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_COMPLETION_H
#define __LINUX_COMPLETION_H

/*
 * (C) Copyright 2001 Linus Torvalds
 *
 * Atomic wait-for-completion handler data structures.
 * See kernel/sched/completion.c for details.
 */

#include <linux/swait.h>

/*
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * reinit_completion(), and macros DECLARE_COMPLETION(),
 * DECLARE_COMPLETION_ONSTACK().
 */
struct completion {
        unsigned int done;              /* 0 after (re)initialization */
        struct swait_queue_head wait;   /* queue head for the waiters */
};

/*
 * Both initializer macros expand to __init_completion(x);
 * init_completion_map() accepts a second argument @m and ignores it.
 */
#define init_completion_map(x, m) __init_completion(x)
#define init_completion(x) __init_completion(x)
/* Empty stubs: these two hooks do nothing here. */
static inline void complete_acquire(struct completion *x) {}
static inline void complete_release(struct completion *x) {}

/* Static initializer: done = 0 plus the swait queue head initializer. */
#define COMPLETION_INITIALIZER(work) \
        { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }

/*
 * On-stack initializers: a statement expression runs init_completion*() on
 * @work and then yields @work itself, so that the result can serve as the
 * initializer in "struct completion work = ...".
 */
#define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \
        (*({ init_completion_map(&(work), &(map)); &(work); }))

#define COMPLETION_INITIALIZER_ONSTACK(work) \
        (*({ init_completion(&work); &work; }))

/**
 * DECLARE_COMPLETION - declare and initialize a completion structure
 * @work:  identifier for the completion structure
 *
 * This macro declares and initializes a completion structure. Generally used
 * for static declarations. You should use the _ONSTACK variant for automatic
 * variables.
 *
 * Expands to a definition of @work initialized with COMPLETION_INITIALIZER().
 */
#define DECLARE_COMPLETION(work) \
        struct completion work = COMPLETION_INITIALIZER(work)

/*
 * Lockdep needs to run a non-constant initializer for on-stack
 * completions - so we use the _ONSTACK() variant for those that
 * are on the kernel stack:
 */
/**
 * DECLARE_COMPLETION_ONSTACK - declare and initialize a completion structure
 * @work:  identifier for the completion structure
 *
 * This macro declares and initializes a completion structure on the kernel
 * stack.
 *
 * DECLARE_COMPLETION_ONSTACK_MAP() is the same with an additional @map
 * argument, which is only used when CONFIG_LOCKDEP is enabled.
 */
#ifdef CONFIG_LOCKDEP
# define DECLARE_COMPLETION_ONSTACK(work) \
        struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \
        struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map)
#else
/* Without lockdep both variants are plain DECLARE_COMPLETION(). */
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work)
#endif

/**
 * init_completion - Initialize a dynamically allocated completion
 * @x:  pointer to completion structure that is to be initialized
 *
 * This inline function will initialize a dynamically created completion
 * structure.
 *
 * The function itself is named __init_completion(); init_completion() and
 * init_completion_map() are macros that expand to it.
 */
static inline void __init_completion(struct completion *x)
{
        x->done = 0;
        init_swait_queue_head(&x->wait);
}

/**
 * reinit_completion - reinitialize a completion structure
 * @x:  pointer to completion structure that is to be reinitialized
 *
 * Resets @x->done to zero so that the completion can be reused; the wait
 * queue head is left untouched. This is especially important after
 * complete_all() is used.
 */
static inline void reinit_completion(struct completion *x)
{
        x->done = 0;
}

/* Out-of-line completion API; the definitions live outside this header. */
void wait_for_completion(struct completion *x);
void wait_for_completion_io(struct completion *x);
int wait_for_completion_interruptible(struct completion *x);
int wait_for_completion_killable(struct completion *x);
unsigned long wait_for_completion_timeout(struct completion *x,
                                          unsigned long timeout);
unsigned long wait_for_completion_io_timeout(struct completion *x,
                                             unsigned long timeout);
long wait_for_completion_interruptible_timeout(struct completion *x,
                                               unsigned long timeout);
long wait_for_completion_killable_timeout(struct completion *x,
                                          unsigned long timeout);
bool try_wait_for_completion(struct completion *x);
bool completion_done(struct completion *x);

void complete(struct completion *x);
void complete_all(struct completion *x);

#endif













































    3 
















































































































































    1 













    2 


































































































































































    2 

































































    2 
































    1 












































    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM block

#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BLOCK_H

#include <linux/blktrace_api.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>

/* Size of the rwbs[] character array recorded by the events in this file. */
#define RWBS_LEN        8

/*
 * block_buffer - event class shared by the buffer_head tracepoints
 *
 * Records bh->b_bdev->bd_dev as "dev", bh->b_blocknr as "sector" and
 * bh->b_size as "size". bh->b_bdev is dereferenced unconditionally.
 */
DECLARE_EVENT_CLASS(block_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh),

        TP_STRUCT__entry (
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  size_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = bh->b_bdev->bd_dev;
                __entry->sector                = bh->b_blocknr;
                __entry->size                = bh->b_size;
        ),

        TP_printk("%d,%d sector=%llu size=%zu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long long)__entry->sector, __entry->size
        )
);

/**
 * block_touch_buffer - mark a buffer accessed
 * @bh: buffer_head being touched
 *
 * Called from touch_buffer(). Instance of the block_buffer event class.
 */
DEFINE_EVENT(block_buffer, block_touch_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_dirty_buffer - mark a buffer dirty
 * @bh: buffer_head being dirtied
 *
 * Called from mark_buffer_dirty(). Instance of the block_buffer event class.
 */
DEFINE_EVENT(block_buffer, block_dirty_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_rq_requeue - place block IO request back on a queue
 * @rq: block IO operation request
 *
 * The block operation request @rq is being placed back into its queue.
 * For some reason the request was not completed and needs to be put
 * back in the queue.
 *
 * The recorded device number is 0 when @rq->rq_disk is NULL.
 */
TRACE_EVENT(block_rq_requeue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                /* cmd is a one-byte array that always holds an empty string */
                __get_str(cmd)[0] = '\0';
        ),

        /* The trailing "[%d]" is always printed as the constant 0 here. */
        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, 0)
);

/**
 * block_rq_complete - block IO operation completed by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_complete tracepoint event indicates that some portion
 * of operation request has been completed by the device driver.  If
 * the @rq->bio is %NULL, then there is absolutely no additional work to
 * do for the request. If @rq->bio is non-NULL then there is
 * additional work required to complete the request.
 *
 * The recorded sector count is @nr_bytes >> 9 and the recorded device
 * number is 0 when @rq->rq_disk is NULL.
 */
TRACE_EVENT(block_rq_complete,

        TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  int,                error                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
                __entry->error     = error;

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
                /* cmd is a one-byte array that always holds an empty string */
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * block_rq - event class shared by the request tracepoints
 *
 * Records the device number (0 when rq->rq_disk is NULL), the trace
 * sector and sector count of the request, its size in bytes, the rwbs
 * string derived from rq->cmd_flags and the comm of the current task.
 */
DECLARE_EVENT_CLASS(block_rq,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  unsigned int,        bytes                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
                __entry->bytes     = blk_rq_bytes(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                /* cmd is a one-byte array that always holds an empty string */
                __get_str(cmd)[0] = '\0';
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __entry->bytes, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_rq_insert - insert block operation request into queue
 * @rq: block IO operation request
 *
 * Called immediately before block operation request @rq is inserted
 * into its queue.  The fields in the operation request @rq struct can
 * be examined to determine which device and sectors the pending
 * operation would access.
 */
DEFINE_EVENT(block_rq, block_rq_insert,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_issue - issue pending block IO request operation to device driver
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its queue is sent to a
 * device driver for processing.
 */
DEFINE_EVENT(block_rq, block_rq_issue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_merge - merge request with another one in the elevator
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its queue is merged to
 * another request queued in the elevator.
 */
DEFINE_EVENT(block_rq, block_rq_merge,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_bio_bounce - used bounce buffer when processing block operation
 * @q: queue holding the block operation
 * @bio: block operation
 *
 * A bounce buffer was used to handle the block operation @bio in @q.
 * This occurs when hardware limitations prevent a direct transfer of
 * data between the @bio data memory area and the IO device.  Use of a
 * bounce buffer requires extra copying of data and decreases
 * performance.
 *
 * Only @bio and the current task's comm are recorded; @q is not stored
 * in the trace record.
 */
TRACE_EVENT(block_bio_bounce,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_complete - completed all work on the block operation
 * @q: queue holding the block operation
 * @bio: block operation completed
 *
 * This tracepoint indicates there is no further work to do on this
 * block IO operation @bio.
 *
 * The recorded error is blk_status_to_errno(@bio->bi_status); @q is not
 * stored in the trace record.
 */
TRACE_EVENT(block_bio_complete,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned,        nr_sector        )
                __field( int,                error                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->error                = blk_status_to_errno(bio->bi_status);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
        ),

        TP_printk("%d,%d %s %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * block_bio_merge - event class shared by the bio merge tracepoints
 *
 * Records the device, start sector, sector count and rwbs string of @bio
 * plus the comm of the current task. @q and @rq are part of the prototype
 * but are not stored in the trace record.
 */
DECLARE_EVENT_CLASS(block_bio_merge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_backmerge - merging block operation to the end of an existing operation
 * @q: queue holding operation
 * @rq: request bio is being merged into
 * @bio: new block operation to merge
 *
 * Merging block request @bio to the end of an existing block request
 * in queue @q. Instance of the block_bio_merge event class.
 */
DEFINE_EVENT(block_bio_merge, block_bio_backmerge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio)
);

/**
 * block_bio_frontmerge - merging block operation to the beginning of an existing operation
 * @q: queue holding operation
 * @rq: request bio is being merged into
 * @bio: new block operation to merge
 *
 * Merging block IO operation @bio to the beginning of an existing block
 * operation in queue @q. Instance of the block_bio_merge event class.
 */
DEFINE_EVENT(block_bio_merge, block_bio_frontmerge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio)
);

/**
 * block_bio_queue - putting new block IO operation in queue
 * @q: queue holding operation
 * @bio: new block operation
 *
 * About to place the block IO operation @bio into queue @q.
 *
 * Only @bio and the current task's comm are recorded; @q is not stored
 * in the trace record.
 */
TRACE_EVENT(block_bio_queue,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/*
 * block_get_rq - event class shared by block_getrq and block_sleeprq
 *
 * @bio may be NULL, in which case device, sector, sector count and the
 * op flags handed to blk_fill_rwbs() are all recorded as 0. @q and @rw
 * are part of the prototype but are not stored in the trace record.
 */
DECLARE_EVENT_CLASS(block_get_rq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio ? bio_dev(bio) : 0;
                __entry->sector                = bio ? bio->bi_iter.bi_sector : 0;
                __entry->nr_sector        = bio ? bio_sectors(bio) : 0;
                /*
                 * Pass the size in bytes, as every other bio event in this
                 * file does, rather than the sector count.
                 */
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_opf : 0,
                              bio ? bio->bi_iter.bi_size : 0);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_getrq - get a free request entry in queue for block IO operations
 * @q: queue for operations
 * @bio: pending block IO operation (can be %NULL)
 * @rw: low bit indicates a read (%0) or a write (%1)
 *
 * A request struct for queue @q has been allocated to handle the
 * block IO operation @bio. Instance of the block_get_rq event class.
 */
DEFINE_EVENT(block_get_rq, block_getrq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw)
);

/**
 * block_sleeprq - waiting to get a free request entry in queue for block IO operation
 * @q: queue for operation
 * @bio: pending block IO operation (can be %NULL)
 * @rw: low bit indicates a read (%0) or a write (%1)
 *
 * In the case where a request struct cannot be provided for queue @q
 * the process needs to wait for a request struct to become
 * available.  This tracepoint event is generated each time the
 * process goes to sleep waiting for a request struct to become available.
 */
DEFINE_EVENT(block_get_rq, block_sleeprq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw)
);

/**
 * block_plug - keep operations requests in request queue
 * @q: request queue to plug
 *
 * Plug the request queue @q.  Do not allow block operation requests
 * to be sent to the device driver. Instead, accumulate requests in
 * the queue to improve throughput performance of the block device.
 *
 * Only the comm of the current task is recorded; @q is not stored in
 * the trace record.
 */
TRACE_EVENT(block_plug,

        TP_PROTO(struct request_queue *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s]", __entry->comm)
);

/*
 * block_unplug - event class for the unplug tracepoint
 *
 * Records @depth in the int field nr_rq together with the comm of the
 * current task. @q and @explicit are part of the prototype but are not
 * stored in the trace record.
 */
DECLARE_EVENT_CLASS(block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit),

        TP_STRUCT__entry(
                __field( int,                nr_rq                        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->nr_rq = depth;
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
);

/**
 * block_unplug - release of operations requests in request queue
 * @q: request queue to unplug
 * @depth: number of requests just added to the queue
 * @explicit: whether this was an explicit unplug, or one from schedule()
 *
 * Unplug request queue @q because device driver is scheduled to work
 * on elements in the request queue. Instance of the block_unplug event
 * class.
 */
DEFINE_EVENT(block_unplug, block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit)
);

/**
 * block_split - split a single bio struct into two bio structs
 * @q: queue containing the bio
 * @bio: block operation being split
 * @new_sector: The starting sector for the new bio
 *
 * The bio request @bio in request queue @q needs to be split into two
 * bio requests. The newly created @bio request starts at
 * @new_sector. This split may be required due to hardware limitation
 * such as operation crossing device boundaries in a RAID system.
 *
 * NOTE(review): @new_sector is passed as unsigned int but stored in a
 * sector_t field; confirm callers never need a wider value.
 */
TRACE_EVENT(block_split,

        TP_PROTO(struct request_queue *q, struct bio *bio,
                 unsigned int new_sector),

        TP_ARGS(q, bio, new_sector),

        TP_STRUCT__entry(
                __field( dev_t,                dev                                )
                __field( sector_t,        sector                                )
                __field( sector_t,        new_sector                        )
                __array( char,                rwbs,                RWBS_LEN        )
                __array( char,                comm,                TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->new_sector        = new_sector;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu / %llu [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  (unsigned long long)__entry->new_sector,
                  __entry->comm)
);

/**
 * block_bio_remap - map request for a logical device to the raw device
 * @q: queue holding the operation
 * @bio: revised operation
 * @dev: device for the operation
 * @from: original sector for the operation
 *
 * An operation for a logical device has been mapped to the
 * raw block device.
 *
 * The current device and sector are taken from @bio, while @dev and
 * @from are recorded as old_dev and old_sector. @q is not stored in the
 * trace record.
 */
TRACE_EVENT(block_bio_remap,

        TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
                 sector_t from),

        TP_ARGS(q, bio, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector)
);

/**
 * block_rq_remap - map request for a block operation request
 * @rq: block IO operation request
 * @dev: device for the operation
 * @from: original sector for the operation
 *
 * The block operation request @rq has been remapped.  The block
 * operation request @rq holds the current information and @from holds
 * the original sector.
 *
 * NOTE(review): unlike the other request events in this file,
 * @rq->rq_disk is passed to disk_devt() without a NULL check; confirm
 * that callers always have rq_disk set.
 */
TRACE_EVENT(block_rq_remap,

        TP_PROTO(struct request *rq, dev_t dev, sector_t from),

        TP_ARGS(rq, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __field( unsigned int,        nr_bios                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = disk_devt(rq->rq_disk);
                __entry->sector                = blk_rq_pos(rq);
                __entry->nr_sector        = blk_rq_sectors(rq);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                __entry->nr_bios        = blk_rq_count_bios(rq);
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector, __entry->nr_bios)
);

#endif /* _TRACE_BLOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









    1 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM task

#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TASK_H
#include <linux/tracepoint.h>

/*
 * task_newtask - record a task together with its clone flags
 *
 * Stores @task->pid, a TASK_COMM_LEN-byte copy of @task->comm, the
 * @clone_flags value and @task->signal->oom_score_adj.
 */
TRACE_EVENT(task_newtask,

        TP_PROTO(struct task_struct *task, unsigned long clone_flags),

        TP_ARGS(task, clone_flags),

        TP_STRUCT__entry(
                __field(        pid_t,        pid)
                __array(        char,        comm, TASK_COMM_LEN)
                __field( unsigned long, clone_flags)
                __field(        short,        oom_score_adj)
        ),

        TP_fast_assign(
                __entry->pid = task->pid;
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->clone_flags = clone_flags;
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
                __entry->pid, __entry->comm,
                __entry->clone_flags, __entry->oom_score_adj)
);

/*
 * task_rename - record a task's current comm next to a new one
 *
 * Stores @task->pid, a TASK_COMM_LEN-byte copy of @task->comm as oldcomm,
 * @comm copied with strlcpy() (at most TASK_COMM_LEN bytes including the
 * terminator) as newcomm, and @task->signal->oom_score_adj.
 */
TRACE_EVENT(task_rename,

        TP_PROTO(struct task_struct *task, const char *comm),

        TP_ARGS(task, comm),

        TP_STRUCT__entry(
                __field(        pid_t,        pid)
                __array(        char, oldcomm,  TASK_COMM_LEN)
                __array(        char, newcomm,  TASK_COMM_LEN)
                __field(        short,        oom_score_adj)
        ),

        /* Use the __entry accessor throughout, like every other event. */
        TP_fast_assign(
                __entry->pid = task->pid;
                memcpy(__entry->oldcomm, task->comm, TASK_COMM_LEN);
                strlcpy(__entry->newcomm, comm, TASK_COMM_LEN);
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd",
                __entry->pid, __entry->oldcomm,
                __entry->newcomm, __entry->oom_score_adj)
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 *        Generic internet FLOW.
 *
 */

#ifndef _NET_FLOW_H
#define _NET_FLOW_H

#include <linux/socket.h>
#include <linux/in6.h>
#include <linux/atomic.h>
#include <net/flow_dissector.h>
#include <linux/uidgid.h>

/*
 * ifindex generation is per network namespace, and loopback is
 * always the first device in a namespace (see net_dev_init), thus any
 * loopback device should get ifindex 1.
 */

#define LOOPBACK_IFINDEX        1

/* Tunnel part of a flow key: a single 64-bit big-endian tunnel id. */
struct flowi_tunnel {
        __be64                        tun_id;
};

/*
 * Flow key fields that struct flowi4 embeds as its first member
 * (__fl_common) and exposes through the flowi4_* alias macros.
 */
struct flowi_common {
        int        flowic_oif;
        int        flowic_iif;
        int     flowic_l3mdev;
        __u32        flowic_mark;
        __u8        flowic_tos;
        __u8        flowic_scope;
        __u8        flowic_proto;
        __u8        flowic_flags;
/* presumably bit values for flowic_flags -- confirm against users */
#define FLOWI_FLAG_ANYSRC                0x01
#define FLOWI_FLAG_KNOWN_NH                0x02
        __u32        flowic_secid;
        kuid_t  flowic_uid;
        __u32                flowic_multipath_hash;
        struct flowi_tunnel flowic_tun_key;
};

/*
 * Per-protocol selector that follows the addresses in struct flowi4 and
 * struct flowi6.  This is a union: every member below shares the same
 * storage, so only the view matching the flow's protocol is meaningful.
 * Which protocol uses which view is not visible in this header; the member
 * names suggest ports / ICMP / IPsec SPI / GRE key / mobility header --
 * NOTE(review): confirm against users.
 */
union flowi_uli {
        /* destination and source port, both 16-bit big-endian */
        struct {
                __be16        dport;
                __be16        sport;
        } ports;

        /* one-byte type and code */
        struct {
                __u8        type;
                __u8        code;
        } icmpt;

        /* 32-bit big-endian values; they alias each other and the views above */
        __be32                spi;
        __be32                gre_key;

        /* one-byte type */
        struct {
                __u8        type;
        } mht;
};

/*
 * IPv4 flow key.
 *
 * Layout: the shared flowi_common header first, then the address pair,
 * then the upper-layer selector union.  The flowi4_* macros are shorthand
 * for members of the embedded __fl_common; the fl4_* macros are shorthand
 * for members of uli.  They are plain object-like macros, so they are
 * visible file-wide from this point on rather than scoped to the struct.
 *
 * The whole struct is aligned to BITS_PER_LONG/8 bytes.
 */
struct flowi4 {
        struct flowi_common        __fl_common;
#define flowi4_oif                __fl_common.flowic_oif
#define flowi4_iif                __fl_common.flowic_iif
#define flowi4_l3mdev                __fl_common.flowic_l3mdev
#define flowi4_mark                __fl_common.flowic_mark
#define flowi4_tos                __fl_common.flowic_tos
#define flowi4_scope                __fl_common.flowic_scope
#define flowi4_proto                __fl_common.flowic_proto
#define flowi4_flags                __fl_common.flowic_flags
#define flowi4_secid                __fl_common.flowic_secid
#define flowi4_tun_key                __fl_common.flowic_tun_key
#define flowi4_uid                __fl_common.flowic_uid
#define flowi4_multipath_hash        __fl_common.flowic_multipath_hash

        /* (saddr,daddr) must be grouped, same order as in IP header */
        __be32                        saddr;
        __be32                        daddr;

        /* protocol-specific selector; see union flowi_uli */
        union flowi_uli                uli;
#define fl4_sport                uli.ports.sport
#define fl4_dport                uli.ports.dport
#define fl4_icmp_type                uli.icmpt.type
#define fl4_icmp_code                uli.icmpt.code
#define fl4_ipsec_spi                uli.spi
#define fl4_mh_type                uli.mht.type
#define fl4_gre_key                uli.gre_key
} __attribute__((__aligned__(BITS_PER_LONG/8)));

/*
 * Initialise an IPv4 flow key from the caller's arguments.
 *
 * @oif, @mark, @tos, @scope, @proto, @flags, @uid, @daddr, @saddr, @dport
 * and @sport are stored as given.  The remaining fields written here get
 * fixed values: the input interface becomes LOOPBACK_IFINDEX, and l3mdev,
 * secid, the tunnel id and the multipath hash are cleared to zero.
 */
static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
                                      __u32 mark, __u8 tos, __u8 scope,
                                      __u8 proto, __u8 flags,
                                      __be32 daddr, __be32 saddr,
                                      __be16 dport, __be16 sport,
                                      kuid_t uid)
{
        struct flowi_common *fc = &fl4->__fl_common;

        /* Common header: values supplied by the caller. */
        fc->flowic_oif = oif;
        fc->flowic_mark = mark;
        fc->flowic_tos = tos;
        fc->flowic_scope = scope;
        fc->flowic_proto = proto;
        fc->flowic_flags = flags;
        fc->flowic_uid = uid;

        /* Common header: fixed defaults. */
        fc->flowic_iif = LOOPBACK_IFINDEX;
        fc->flowic_l3mdev = 0;
        fc->flowic_secid = 0;
        fc->flowic_tun_key.tun_id = 0;
        fc->flowic_multipath_hash = 0;

        /* IPv4 addresses and the port view of the selector union. */
        fl4->saddr = saddr;
        fl4->daddr = daddr;
        fl4->uli.ports.sport = sport;
        fl4->uli.ports.dport = dport;
}

/*
 * Reset some input parameters after a previous lookup: only the output
 * interface, the TOS and the address pair of @fl4 are overwritten; every
 * other field keeps its current value.
 */
static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos,
                                        __be32 daddr, __be32 saddr)
{
        struct flowi_common *fc = &fl4->__fl_common;

        fl4->saddr = saddr;
        fl4->daddr = daddr;
        fc->flowic_tos = tos;
        fc->flowic_oif = oif;
}


/*
 * IPv6 flow key.
 *
 * Layout: the shared flowi_common header first, then destination and
 * source address (note: daddr precedes saddr here, the reverse of
 * struct flowi4), the flow label, the upper-layer selector union and a
 * separate mp_hash word.  The flowi6_* macros are shorthand for members of
 * the embedded __fl_common; the fl6_* macros are shorthand for members of
 * uli.  No flowi6_tos or flowi6_multipath_hash shorthand is defined in
 * this block.
 *
 * The whole struct is aligned to BITS_PER_LONG/8 bytes.
 */
struct flowi6 {
        struct flowi_common        __fl_common;
#define flowi6_oif                __fl_common.flowic_oif
#define flowi6_iif                __fl_common.flowic_iif
#define flowi6_l3mdev                __fl_common.flowic_l3mdev
#define flowi6_mark                __fl_common.flowic_mark
#define flowi6_scope                __fl_common.flowic_scope
#define flowi6_proto                __fl_common.flowic_proto
#define flowi6_flags                __fl_common.flowic_flags
#define flowi6_secid                __fl_common.flowic_secid
#define flowi6_tun_key                __fl_common.flowic_tun_key
#define flowi6_uid                __fl_common.flowic_uid
        struct in6_addr                daddr;
        struct in6_addr                saddr;
        /* Note: flowi6_tos is encoded in flowlabel, too. */
        __be32                        flowlabel;
        /* protocol-specific selector; see union flowi_uli */
        union flowi_uli                uli;
#define fl6_sport                uli.ports.sport
#define fl6_dport                uli.ports.dport
#define fl6_icmp_type                uli.icmpt.type
#define fl6_icmp_code                uli.icmpt.code
#define fl6_ipsec_spi                uli.spi
#define fl6_mh_type                uli.mht.type
#define fl6_gre_key                uli.gre_key
        __u32                        mp_hash;
} __attribute__((__aligned__(BITS_PER_LONG/8)));

/*
 * Address-family neutral flow key: a union of the bare common header, the
 * IPv4 key and the IPv6 key.  struct flowi4 and struct flowi6 both start
 * with a struct flowi_common, so u.__fl_common overlays the common header
 * of whichever family-specific view is in use.
 *
 * The flowi_* macros are shorthand for the common fields reached through
 * u.__fl_common.  No flowi_multipath_hash shorthand is defined in this
 * block.
 *
 * The whole struct is aligned to BITS_PER_LONG/8 bytes.
 */
struct flowi {
        union {
                struct flowi_common        __fl_common;
                struct flowi4                ip4;
                struct flowi6                ip6;
        } u;
#define flowi_oif        u.__fl_common.flowic_oif
#define flowi_iif        u.__fl_common.flowic_iif
#define flowi_l3mdev        u.__fl_common.flowic_l3mdev
#define flowi_mark        u.__fl_common.flowic_mark
#define flowi_tos        u.__fl_common.flowic_tos
#define flowi_scope        u.__fl_common.flowic_scope
#define flowi_proto        u.__fl_common.flowic_proto
#define flowi_flags        u.__fl_common.flowic_flags
#define flowi_secid        u.__fl_common.flowic_secid
#define flowi_tun_key        u.__fl_common.flowic_tun_key
#define flowi_uid        u.__fl_common.flowic_uid
} __attribute__((__aligned__(BITS_PER_LONG/8)));

/*
 * Convert a pointer to an IPv4 flow key into a pointer to the struct flowi
 * whose u.ip4 member it is.  The result is only meaningful when @fl4 really
 * is embedded in a struct flowi.
 */
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
{
        struct flowi *fl = container_of(fl4, struct flowi, u.ip4);

        return fl;
}

static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4)
{
        return &(flowi4_to_flowi(fl4)->u.__fl_common);
}

/*
 * Convert a pointer to an IPv6 flow key into a pointer to the struct flowi
 * whose u.ip6 member it is.  The result is only meaningful when @fl6 really
 * is embedded in a struct flowi.
 */
static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6)
{
        struct flowi *fl = container_of(fl6, struct flowi, u.ip6);

        return fl;
}

static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6)
{
        return &(flowi6_to_flowi(fl6)->u.__fl_common);
}

/*
 * Declaration only; the definition is not in this header.  Takes a
 * read-only IPv6 flow key and a struct flow_keys and returns a 32-bit
 * value.  NOTE(review): by its name this derives a hash from @fl6 and
 * uses @keys as working storage -- confirm against the implementation.
 */
__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);

#endif





















































































































































































































    1 





    1 







    1 



























































































































    1 
    1 





    1 


























    1 



    1 

    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>        /* for try_to_release_page(),
                                        buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

/*
 * struct scan_control - request parameters and running totals for reclaim.
 *
 * The first group of members describes what the caller asked for and what
 * reclaim is permitted to do; the trailing counters (nr_scanned,
 * nr_reclaimed, nr, reclaim_state) record progress.
 */
struct scan_control {
        /* How many pages shrink_list() should reclaim */
        unsigned long nr_to_reclaim;

        /*
         * Nodemask of nodes allowed by the caller. If NULL, all nodes
         * are scanned.
         */
        nodemask_t        *nodemask;

        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.  cgroup_reclaim()
         * treats a non-NULL value here as "this is cgroup reclaim".
         */
        struct mem_cgroup *target_mem_cgroup;

        /*
         * Scan pressure balancing between anon and file LRUs
         */
        unsigned long        anon_cost;
        unsigned long        file_cost;

        /*
         * Can active pages be deactivated as part of reclaim?
         * may_deactivate is a two-bit mask of the DEACTIVATE_* values below.
         */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
        unsigned int may_deactivate:2;
        unsigned int force_deactivate:1;
        unsigned int skipped_deactivate:1;

        /* Writepage batching in laptop mode; RECLAIM_WRITE */
        unsigned int may_writepage:1;

        /* Can mapped pages be reclaimed? */
        unsigned int may_unmap:1;

        /* Can pages be swapped as part of reclaim? */
        unsigned int may_swap:1;

        /*
         * Cgroup memory below memory.low is protected as long as we
         * don't threaten to OOM. If any cgroup is reclaimed at
         * reduced force or passed over entirely due to its memory.low
         * setting (memcg_low_skipped), and nothing is reclaimed as a
         * result, then go back for one more cycle that reclaims the protected
         * memory (memcg_low_reclaim) to avert OOM.
         */
        unsigned int memcg_low_reclaim:1;
        unsigned int memcg_low_skipped:1;

        /* NOTE(review): presumably set for hibernation-driven reclaim -- confirm against callers */
        unsigned int hibernation_mode:1;

        /* One of the zones is ready for compaction */
        unsigned int compaction_ready:1;

        /* There is easily reclaimable cold cache in the current node */
        unsigned int cache_trim_mode:1;

        /* The file pages on the current node are dangerously low */
        unsigned int file_is_tiny:1;

        /* Allocation order */
        s8 order;

        /* Scan (total_size >> priority) pages at once */
        s8 priority;

        /* The highest zone to isolate pages for reclaim from */
        s8 reclaim_idx;

        /* This context's GFP mask */
        gfp_t gfp_mask;

        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;

        /* Number of pages freed so far during a call to shrink_zones() */
        unsigned long nr_reclaimed;

        /*
         * Page-state tallies.
         * NOTE(review): presumably accumulated by the page-list shrinking
         * code outside this view -- confirm where each one is updated.
         */
        struct {
                unsigned int dirty;
                unsigned int unqueued_dirty;
                unsigned int congested;
                unsigned int writeback;
                unsigned int immediate;
                unsigned int file_taken;
                unsigned int taken;
        } nr;

        /* Records the slab reclaimed so far */
        struct reclaim_state reclaim_state;
};

/*
 * prefetchw_prev_lru_page(_page, _base, _field)
 *
 * When ARCH_HAS_PREFETCHW is defined: unless @_page's lru.prev is the list
 * head @_base, issue prefetchw() on @_field of the page that lru_to_page()
 * yields for @_page's lru node.  Otherwise the macro expands to an empty
 * statement.
 *
 * Every macro argument is parenthesized in the expansion so that callers
 * may pass arbitrary expressions.  Note that @_page is evaluated more than
 * once, so it must not have side effects.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)                        \
        do {                                                                \
                if ((_page)->lru.prev != (_base)) {                        \
                        struct page *prev;                                \
                                                                        \
                        prev = lru_to_page(&((_page)->lru));                \
                        prefetchw(&prev->_field);                        \
                }                                                        \
        } while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * Global swappiness setting; the built-in default is 60.
 * From 0 .. 200.  Higher means more swappy.
 */
int vm_swappiness = 60;

/*
 * Point @task->reclaim_state at @rs, or clear it when @rs is NULL.
 *
 * Emits a one-time warning when the call is redundant or destructive:
 * installing a state while one is already installed, or clearing a
 * state that is already clear.  The assignment is performed regardless.
 */
static void set_task_reclaim_state(struct task_struct *task,
                                   struct reclaim_state *rs)
{
        struct reclaim_state *prev = task->reclaim_state;

        if (rs)
                WARN_ON_ONCE(prev);        /* would overwrite a live state */
        else
                WARN_ON_ONCE(!prev);        /* nulling an already-nulled member */

        task->reclaim_state = rs;
}

/*
 * shrinker_list holds every registered shrinker (added in
 * register_shrinker_prepared(), removed in unregister_shrinker()).
 * shrinker_rwsem is taken for write around those list/IDR updates and
 * for read (trylock) while shrink_slab() walks the shrinkers.
 */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
/*
 * We allow subsystems to populate their shrinker-related
 * LRU lists before register_shrinker_prepared() is called
 * for the shrinker, since we don't want to impose
 * restrictions on their internal registration order.
 * In this case shrink_slab_memcg() may find that the corresponding
 * bit is already set in the shrinkers map.
 *
 * This placeholder value is stored in the IDR slot until registration
 * completes; shrink_slab_memcg() uses it to detect shrinkers that are
 * still registering and to skip do_shrink_slab() calls for them.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

/* Maps shrinker->id to its struct shrinker (or SHRINKER_REGISTERING). */
static DEFINE_IDR(shrinker_idr);
/* One past the highest id handed out so far; bounds the shrinker map scan. */
static int shrinker_nr_max;

/*
 * Reserve an id in shrinker_idr for a memcg-aware @shrinker.
 *
 * The slot is filled with the SHRINKER_REGISTERING placeholder.  If the new
 * id does not fit below shrinker_nr_max, the per-memcg shrinker maps are
 * expanded first; when that fails the id is released again.
 *
 * Returns 0 and sets @shrinker->id on success, -ENOMEM on any failure.
 */
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
        int status = -ENOMEM;
        int slot;

        down_write(&shrinker_rwsem);

        /* This may call shrinker, so it must use down_read_trylock() */
        slot = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
        if (slot >= 0) {
                bool must_grow = slot >= shrinker_nr_max;

                if (must_grow && memcg_expand_shrinker_maps(slot)) {
                        /* Could not grow the maps: give the id back. */
                        idr_remove(&shrinker_idr, slot);
                } else {
                        if (must_grow)
                                shrinker_nr_max = slot + 1;
                        shrinker->id = slot;
                        status = 0;
                }
        }

        up_write(&shrinker_rwsem);
        return status;
}

/*
 * Release the shrinker_idr slot held by @shrinker.  A negative id is
 * treated as a fatal bug.
 */
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
        const int slot = shrinker->id;

        BUG_ON(slot < 0);

        down_write(&shrinker_rwsem);
        idr_remove(&shrinker_idr, slot);
        up_write(&shrinker_rwsem);
}

/* True when this reclaim pass targets a specific memory cgroup. */
static bool cgroup_reclaim(struct scan_control *sc)
{
        return sc->target_mem_cgroup != NULL;
}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * With the legacy memcg, the regular page dirty throttling done in
 * balance_dirty_pages() is completely broken, and direct stalling in
 * shrink_page_list() is used for throttling instead.  That fallback lacks
 * all the niceties such as fairness, adaptive pausing, bandwidth
 * proportional allocation and configurability.
 *
 * Returns true when the vmscan currently in progress can assume that the
 * normal dirty throttling mechanism is operational: either this is not
 * cgroup reclaim, or (with CONFIG_CGROUP_WRITEBACK) the memory controller
 * is on the default hierarchy.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
#ifdef CONFIG_CGROUP_WRITEBACK
        return !cgroup_reclaim(sc) ||
               cgroup_subsys_on_dfl(memory_cgrp_subsys);
#else
        return !cgroup_reclaim(sc);
#endif
}
#else
/* !CONFIG_MEMCG stub: nothing to reserve, always reports success. */
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
        return 0;
}

/* !CONFIG_MEMCG stub: there is no per-memcg shrinker id to release. */
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

/* !CONFIG_MEMCG stub: reclaim is never cgroup-targeted. */
static bool cgroup_reclaim(struct scan_control *sc)
{
        return false;
}

/* !CONFIG_MEMCG stub: always reports the normal dirty throttling as usable. */
static bool writeback_throttling_sane(struct scan_control *sc)
{
        return true;
}
#endif

/*
 * Estimate how many pages in @zone reclaim could work on.
 *
 * File LRU pages always count; anonymous LRU pages count only while swap
 * space is available.  Isolated pages are deliberately not included, to
 * save counters: the result only decides whether reclaim or compaction
 * continues, and isolated pages are not expected to dominate.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
        unsigned long reclaimable;

        reclaimable = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE);
        reclaimable += zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);

        if (get_nr_swap_pages() > 0) {
                reclaimable += zone_page_state_snapshot(zone,
                                                        NR_ZONE_INACTIVE_ANON);
                reclaimable += zone_page_state_snapshot(zone,
                                                        NR_ZONE_ACTIVE_ANON);
        }

        /*
         * With no file-backed or anonymous pages to reclaim, report the
         * free page count instead so that zones which still have free
         * pages (DMA32, for example) are not skipped and can still help
         * alleviate memory pressure.
         */
        if (!reclaimable)
                reclaimable = zone_page_state_snapshot(zone, NR_FREE_PAGES);

        return reclaimable;
}

/**
 * lruvec_lru_size - count the pages on one LRU list of an lruvec
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: highest zone index to include (use MAX_NR_ZONES for the
 *            whole LRU list)
 *
 * Sums the per-zone sizes for zones 0..@zone_idx, skipping zones that are
 * not managed.  The per-zone figure comes from the memcg accounting unless
 * memory cgroups are disabled, in which case the zone vmstat counter is
 * used.
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
        unsigned long total = 0;
        int zid = 0;

        while (zid <= zone_idx && zid < MAX_NR_ZONES) {
                struct zone *zone = lruvec_pgdat(lruvec)->node_zones + zid;

                if (managed_zone(zone)) {
                        if (mem_cgroup_disabled())
                                total += zone_page_state(zone,
                                                         NR_ZONE_LRU_BASE + lru);
                        else
                                total += mem_cgroup_get_zone_lru_size(lruvec,
                                                                      lru, zid);
                }
                zid++;
        }

        return total;
}

/*
 * First half of shrinker registration: allocate what @shrinker needs.
 *
 * Allocates the zeroed nr_deferred counter array (one entry, or one per
 * node id for SHRINKER_NUMA_AWARE shrinkers) and, for SHRINKER_MEMCG_AWARE
 * shrinkers, reserves a memcg shrinker id.
 *
 * Returns 0 on success or -ENOMEM, in which case nothing stays allocated
 * and @shrinker->nr_deferred is NULL.
 */
int prealloc_shrinker(struct shrinker *shrinker)
{
        unsigned int bytes = sizeof(*shrinker->nr_deferred);

        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                bytes *= nr_node_ids;

        shrinker->nr_deferred = kzalloc(bytes, GFP_KERNEL);
        if (!shrinker->nr_deferred)
                return -ENOMEM;

        if ((shrinker->flags & SHRINKER_MEMCG_AWARE) &&
            prealloc_memcg_shrinker(shrinker)) {
                /* Undo the counter allocation before bailing out. */
                kfree(shrinker->nr_deferred);
                shrinker->nr_deferred = NULL;
                return -ENOMEM;
        }

        return 0;
}

/*
 * Undo prealloc_shrinker() for a shrinker that was never registered.
 * Does nothing when @shrinker holds no nr_deferred allocation.
 */
void free_prealloced_shrinker(struct shrinker *shrinker)
{
        if (shrinker->nr_deferred) {
                if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                        unregister_memcg_shrinker(shrinker);

                kfree(shrinker->nr_deferred);
                shrinker->nr_deferred = NULL;
        }
}

/*
 * Second half of shrinker registration: publish a shrinker that has
 * already been through prealloc_shrinker().
 *
 * Under shrinker_rwsem held for write, the IDR placeholder of a
 * memcg-aware shrinker is replaced by the real pointer and the shrinker
 * is appended to shrinker_list.
 */
void register_shrinker_prepared(struct shrinker *shrinker)
{
        down_write(&shrinker_rwsem);

#ifdef CONFIG_MEMCG
        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
        list_add_tail(&shrinker->list, &shrinker_list);

        up_write(&shrinker_rwsem);
}

/*
 * Allocate and publish @shrinker in one step.
 *
 * Returns 0 on success, or the error from prealloc_shrinker() in which
 * case the shrinker is not registered.
 */
int register_shrinker(struct shrinker *shrinker)
{
        int err;

        err = prealloc_shrinker(shrinker);
        if (!err)
                register_shrinker_prepared(shrinker);

        return err;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove a registered shrinker.
 *
 * A shrinker whose nr_deferred is NULL is treated as not registered and
 * left alone.  Otherwise its memcg id (if any) is released, it is unlinked
 * from shrinker_list under the write lock, and its nr_deferred array is
 * freed and reset to NULL.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
        if (shrinker->nr_deferred) {
                if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                        unregister_memcg_shrinker(shrinker);

                down_write(&shrinker_rwsem);
                list_del(&shrinker->list);
                up_write(&shrinker_rwsem);

                kfree(shrinker->nr_deferred);
                shrinker->nr_deferred = NULL;
        }
}
EXPORT_SYMBOL(unregister_shrinker);

/* Batch size used when a shrinker does not specify its own ->batch. */
#define SHRINK_BATCH 128

/*
 * do_shrink_slab - run one shrinker for one shrink_control
 * @shrinkctl: describes the node/memcg/gfp context; nr_to_scan and
 *             nr_scanned are filled in here before each scan call
 * @shrinker: the shrinker to invoke
 * @priority: reclaim priority; the freeable count is shifted right by it
 *
 * Asks the shrinker how many objects are freeable, adds a priority-scaled
 * share of that to the work deferred by earlier calls, and then calls
 * ->scan_objects() in batches until the target drops below one batch (and
 * below freeable) or the shrinker returns SHRINK_STOP.  Work that was
 * planned but not performed is added back to nr_deferred.
 *
 * Returns the sum of the values reported by ->scan_objects(), or the
 * ->count_objects() result itself when that is 0 or SHRINK_EMPTY.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                                    struct shrinker *shrinker, int priority)
{
        unsigned long freed = 0;
        unsigned long long delta;
        long total_scan;
        long freeable;
        long nr;
        long new_nr;
        int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;

        /* Shrinkers that are not NUMA aware keep a single counter, slot 0. */
        if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
                nid = 0;

        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;

        /*
         * copy the current shrinker scan count into a local variable
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

        total_scan = nr;
        if (shrinker->seeks) {
                /* delta = (freeable >> priority) * 4 / seeks */
                delta = freeable >> priority;
                delta *= 4;
                do_div(delta, shrinker->seeks);
        } else {
                /*
                 * These objects don't require any IO to create. Trim
                 * them aggressively under memory pressure to keep
                 * them from causing refetches in the IO caches.
                 */
                delta = freeable / 2;
        }

        total_scan += delta;
        if (total_scan < 0) {
                pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
                       shrinker->scan_objects, total_scan);
                total_scan = freeable;
                next_deferred = nr;
        } else
                next_deferred = total_scan;

        /*
         * We need to avoid excessive windup on filesystem shrinkers
         * due to large numbers of GFP_NOFS allocations causing the
         * shrinkers to return -1 all the time. This results in a large
         * nr being built up so when a shrink that can do some work
         * comes along it empties the entire cache due to nr >>>
         * freeable. This is bad for sustaining a working set in
         * memory.
         *
         * Hence only allow the shrinker to scan the entire cache when
         * a large delta change is calculated directly.
         */
        if (delta < freeable / 4)
                total_scan = min(total_scan, freeable / 2);

        /*
         * Avoid risking looping forever due to too large nr value:
         * never try to free more than twice the estimate number of
         * freeable entries.
         */
        if (total_scan > freeable * 2)
                total_scan = freeable * 2;

        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);

        /*
         * Normally, we should not scan less than batch_size objects in one
         * pass to avoid too frequent shrinker calls, but if the slab has less
         * than batch_size objects in total and we are really tight on memory,
         * we will try to reclaim all available objects, otherwise we can end
         * up failing allocations although there are plenty of reclaimable
         * objects spread over several slabs with usage less than the
         * batch_size.
         *
         * We detect the "tight on memory" situations by looking at the total
         * number of objects we want to scan (total_scan). If it is greater
         * than the total number of objects on slab (freeable), we must be
         * scanning at high prio and therefore should try to reclaim as much as
         * possible.
         */
        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                /*
                 * nr_scanned is preset to the request; it is read back
                 * after the call for the accounting below.
                 */
                shrinkctl->nr_to_scan = nr_to_scan;
                shrinkctl->nr_scanned = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
                total_scan -= shrinkctl->nr_scanned;
                scanned += shrinkctl->nr_scanned;

                cond_resched();
        }

        /* Whatever was planned but not scanned is carried over, never below 0. */
        if (next_deferred >= scanned)
                next_deferred -= scanned;
        else
                next_deferred = 0;
        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates. If we exhausted the
         * scan, there is no need to do an update.
         */
        if (next_deferred > 0)
                new_nr = atomic_long_add_return(next_deferred,
                                                &shrinker->nr_deferred[nid]);
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

        trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
        return freed;
}

#ifdef CONFIG_MEMCG
/*
 * shrink_slab_memcg - run the memcg-aware shrinkers flagged for one memcg
 * @gfp_mask: allocation context, passed on to the shrinkers
 * @nid: node whose shrinker map of @memcg is consulted
 * @memcg: the memory cgroup to shrink; offline cgroups are skipped
 * @priority: the reclaim priority, passed on to do_shrink_slab()
 *
 * Walks the bits set in @memcg's per-node shrinker map and calls
 * do_shrink_slab() for each corresponding registered shrinker.  Bits whose
 * shrinker is gone, or whose shrinker reports SHRINK_EMPTY, are cleared.
 *
 * Returns the number of objects freed.  Returns 0 without doing any work
 * if shrinker_rwsem cannot be read-locked immediately; returns at least 1
 * when the walk is cut short because the rwsem became contended.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        struct memcg_shrinker_map *map;
        unsigned long ret, freed = 0;
        int i;

        if (!mem_cgroup_online(memcg))
                return 0;

        if (!down_read_trylock(&shrinker_rwsem))
                return 0;

        map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
                                        true);
        if (unlikely(!map))
                goto unlock;

        for_each_set_bit(i, map->map, shrinker_nr_max) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
                        .memcg = memcg,
                };
                struct shrinker *shrinker;

                shrinker = idr_find(&shrinker_idr, i);
                if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
                        /*
                         * No shrinker for this id: drop the stale bit.  A
                         * shrinker that is still registering keeps its bit.
                         */
                        if (!shrinker)
                                clear_bit(i, map->map);
                        continue;
                }

                /* Call non-slab shrinkers even though kmem is disabled */
                if (!memcg_kmem_enabled() &&
                    !(shrinker->flags & SHRINKER_NONSLAB))
                        continue;

                ret = do_shrink_slab(&sc, shrinker, priority);
                if (ret == SHRINK_EMPTY) {
                        clear_bit(i, map->map);
                        /*
                         * After the shrinker reported that it had no objects to
                         * free, but before we cleared the corresponding bit in
                         * the memcg shrinker map, a new object might have been
                         * added. To make sure, we have the bit set in this
                         * case, we invoke the shrinker one more time and reset
                         * the bit if it reports that it is not empty anymore.
                         * The memory barrier here pairs with the barrier in
                         * memcg_set_shrinker_bit():
                         *
                         * list_lru_add()     shrink_slab_memcg()
                         *   list_add_tail()    clear_bit()
                         *   <MB>               <MB>
                         *   set_bit()          do_shrink_slab()
                         */
                        smp_mb__after_atomic();
                        ret = do_shrink_slab(&sc, shrinker, priority);
                        if (ret == SHRINK_EMPTY)
                                ret = 0;
                        else
                                memcg_set_shrinker_bit(memcg, nid, i);
                }
                freed += ret;

                /*
                 * Someone is waiting on the rwsem: stop early, and report a
                 * non-zero result even if nothing was freed.
                 */
                if (rwsem_is_contended(&shrinker_rwsem)) {
                        freed = freed ? : 1;
                        break;
                }
        }
unlock:
        up_read(&shrinker_rwsem);
        return freed;
}
#else /* CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: there are no per-memcg shrinkers, so nothing is freed. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Ages shrinkable caches by invoking the registered shrink functions.
 *
 * Shrinkers with SHRINKER_NUMA_AWARE set are handed @nid; the others
 * receive a node id of 0 instead.
 *
 * @memcg selects the memory cgroup to target.  Shrinkers that are not
 * memcg aware are only called when it is the root cgroup.
 *
 * @priority is sc->priority; the number of objects is shifted right by
 * it to obtain the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 struct mem_cgroup *memcg,
                                 int priority)
{
        struct shrinker *pos;
        unsigned long total = 0;

        /*
         * With the "cgroup_disable=memory" boot parameter the root memcg
         * might still be allocated, which could make mem_cgroup_is_root()
         * return false.  Checking mem_cgroup_disabled() first keeps us
         * from running only the memcg slab shrink and skipping the global
         * one, which may result in premature oom.
         */
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

        if (down_read_trylock(&shrinker_rwsem)) {
                list_for_each_entry(pos, &shrinker_list, list) {
                        struct shrink_control sc = {
                                .gfp_mask = gfp_mask,
                                .nid = nid,
                                .memcg = memcg,
                        };
                        unsigned long got;

                        got = do_shrink_slab(&sc, pos, priority);
                        if (got != SHRINK_EMPTY)
                                total += got;

                        /*
                         * Somebody wants to register a new shrinker: bail
                         * out so the registration is not stalled for long
                         * periods by parallel ongoing shrinking.
                         */
                        if (rwsem_is_contended(&shrinker_rwsem)) {
                                if (!total)
                                        total = 1;
                                break;
                        }
                }

                up_read(&shrinker_rwsem);
        }

        cond_resched();
        return total;
}

/*
 * drop_slab_node - repeatedly shrink all slab caches of one node
 * @nid: node to shrink
 *
 * Each pass calls shrink_slab() at priority 0 for every memory cgroup
 * returned by mem_cgroup_iter() and sums what was freed.  Passes repeat
 * while they remain productive, and stop at once if the current task has
 * a fatal signal pending.
 *
 * The bar for "productive" doubles with every pass (freed >> shift > 1).
 * A fixed threshold could loop forever when shrinkers keep reporting a
 * small but steady number of freed objects on every pass; the rising
 * threshold guarantees termination.  Because freed >> 63 is at most 1,
 * the loop exits before shift can reach the width of unsigned long.
 */
void drop_slab_node(int nid)
{
        unsigned long freed;
        int shift = 0;

        do {
                struct mem_cgroup *memcg = NULL;

                if (fatal_signal_pending(current))
                        return;

                freed = 0;
                memcg = mem_cgroup_iter(NULL, NULL, NULL);
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
        } while ((freed >> shift++) > 1);
}

void drop_slab(void)
{
        int nid;

        for_each_online_node(nid)
                drop_slab_node(nid);
}

/*
 * Tell whether @page is held only by the references reclaim expects.
 *
 * A freeable page cache page is referenced only by the caller that
 * isolated the page, the page cache (thp_nr_pages() references) and
 * optional buffer heads at page->private.
 *
 * Returns non-zero when the reference count matches exactly that.
 */
static inline int is_page_cache_freeable(struct page *page)
{
        int expected_refs = 1 + thp_nr_pages(page);
        int observed_refs = page_count(page) - page_has_private(page);

        return observed_refs == expected_refs;
}

/*
 * Decide whether reclaim may start writeback against @inode.
 *
 * Returns 1 when any of the following holds, checked in this order:
 * the current task has PF_SWAPWRITE set, the inode is not write
 * congested, or the inode's bdi is the one recorded in
 * current->backing_dev_info.  Returns 0 otherwise.
 */
static int may_write_to_inode(struct inode *inode)
{
        return (current->flags & PF_SWAPWRITE) ||
               !inode_write_congested(inode) ||
               inode_to_bdi(inode) == current->backing_dev_info;
}

/*
 * A synchronous error (probably -ENOSPC) came back while writing @page
 * out.  Record it in the address_space so that a subsequent fsync(),
 * msync() or close() can report it.
 *
 * After writepage the mapping must not be touched blindly: nothing
 * prevents it from being freed up.  We do hold a ref on the page though,
 * and once that page is locked the mapping is pinned, so the error is
 * only recorded if the locked page still belongs to @mapping.
 *
 * Sleeping in lock_page() is allowed here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
                                struct page *page, int error)
{
        bool still_attached;

        lock_page(page);

        still_attached = (page_mapping(page) == mapping);
        if (still_attached)
                mapping_set_error(mapping, error);

        unlock_page(page);
}

/*
 * Possible outcomes of pageout().  Each value also tells the caller
 * whether the page is still locked on return.
 */
typedef enum {
        /* failed to write page out, page is locked */
        PAGE_KEEP,
        /* move page to the active list, page is locked */
        PAGE_ACTIVATE,
        /* page has been sent to the disk successfully, page is unlocked */
        PAGE_SUCCESS,
        /* page is clean and locked */
        PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 *
 * @page: the dirty page to write out
 * @mapping: the page's address_space, or NULL for an orphaned page
 *
 * Returns:
 *   PAGE_KEEP     - the page has unexpected references, has no mapping and
 *                   its buffers could not be freed, or the inode may not
 *                   be written to right now
 *   PAGE_ACTIVATE - the mapping has no ->writepage, or ->writepage
 *                   returned AOP_WRITEPAGE_ACTIVATE
 *   PAGE_SUCCESS  - ->writepage was called and did not ask for activation
 *                   (a negative result is recorded via
 *                   handle_write_error() and still ends up here)
 *   PAGE_CLEAN    - the page turned out not to need writing
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
        /*
         * If the page is dirty, only perform writeback if that write
         * will be non-blocking.  To prevent this allocation from being
         * stalled by pagecache activity.  But note that there may be
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
         * If this process is currently in __generic_file_write_iter() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
         * If the page is swapcache, write it back even if that would
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
        if (!mapping) {
                /*
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                pr_info("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
                return PAGE_KEEP;
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
        if (!may_write_to_inode(mapping->host))
                return PAGE_KEEP;

        if (clear_page_dirty_for_io(page)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
                        .range_start = 0,
                        .range_end = LLONG_MAX,
                        .for_reclaim = 1,
                };

                /* Flag the page as under reclaim before handing it to ->writepage */
                SetPageReclaim(page);
                res = mapping->a_ops->writepage(page, &wbc);
                if (res < 0)
                        handle_write_error(mapping, page, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }

                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
                trace_mm_vmscan_writepage(page);
                inc_node_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }

        /* The dirty bit was already clear: nothing to write. */
        return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 *
 * @mapping: the address_space @page belongs to (checked with BUG_ON)
 * @page: the locked page to detach (checked with BUG_ON)
 * @reclaimed: when true, a workingset shadow entry may be stored in place
 *             of the page
 * @target_memcg: passed through to workingset_eviction()
 *
 * Returns 1 if the page was detached, 0 if it could not be freed because
 * its refcount did not match the expected value or it was dirty.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed, struct mem_cgroup *target_memcg)
{
        unsigned long flags;
        int refcount;
        void *shadow = NULL;

        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));

        xa_lock_irqsave(&mapping->i_pages, flags);
        /*
         * The non racy check for a busy page.
         *
         * Must be careful with the order of the tests. When someone has
         * a ref to the page, it may be possible that they dirty it then
         * drop the reference. So if PageDirty is tested before page_count
         * here, then the following race may occur:
         *
         * get_user_pages(&page);
         * [user mapping goes away]
         * write_to(page);
         *                                !PageDirty(page)    [good]
         * SetPageDirty(page);
         * put_page(page);
         *                                !page_count(page)   [good, discard it]
         *
         * [oops, our write_to data is lost]
         *
         * Reversing the order of the tests ensures such a situation cannot
         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
         * load is not satisfied before that of page->_refcount.
         *
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under the i_pages lock, then this ordering is not required.
         */
        /* Expected count: one ref for the caller plus compound_nr() cache refs. */
        refcount = 1 + compound_nr(page);
        if (!page_ref_freeze(page, refcount))
                goto cannot_free;
        /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
        if (unlikely(PageDirty(page))) {
                page_ref_unfreeze(page, refcount);
                goto cannot_free;
        }

        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                mem_cgroup_swapout(page, swap);
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_swap_cache(page, swap, shadow);
                xa_unlock_irqrestore(&mapping->i_pages, flags);
                put_swap_page(page, swap);
        } else {
                void (*freepage)(struct page *);

                /* Read ->freepage now; it is called after the lock is dropped. */
                freepage = mapping->a_ops->freepage;
                /*
                 * Remember a shadow entry for reclaimed file cache in
                 * order to detect refaults, thus thrashing, later on.
                 *
                 * But don't store shadows in an address space that is
                 * already exiting.  This is not just an optimization,
                 * inode reclaim needs to empty out the radix tree or
                 * the nodes are lost.  Don't plant shadows behind its
                 * back.
                 *
                 * We also don't store shadows for DAX mappings because the
                 * only page cache pages found in these are zero pages
                 * covering holes, and because we don't want to mix DAX
                 * exceptional entries and shadow exceptional entries in the
                 * same address_space.
                 */
                if (reclaimed && page_is_file_lru(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
                xa_unlock_irqrestore(&mapping->i_pages, flags);

                if (freepage != NULL)
                        freepage(page);
        }

        return 1;

cannot_free:
        xa_unlock_irqrestore(&mapping->i_pages, flags);
        return 0;
}

/*
 * Try to detach a locked page from its ->mapping.
 *
 * Returns 1 if the page was detached.  Returns 0, leaving the page where
 * it is, if the page is dirty or somebody else holds a reference to it.
 * The caller is assumed to hold exactly one reference on the page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
        if (!__remove_mapping(mapping, page, false, NULL))
                return 0;

        /*
         * The page comes back with a frozen refcount.  Unfreezing it to
         * 1 rather than 2 effectively drops the pagecache ref for us
         * without requiring another atomic operation.
         */
        page_ref_unfreeze(page, 1);
        return 1;
}

/**
 * putback_lru_page - return a previously isolated page to an LRU list
 * @page: the isolated page to hand back
 *
 * Re-adds @page to the appropriate LRU list and releases the reference
 * that was taken when the page was isolated.  The page may still turn
 * out to be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
        /* Put the page back on an LRU list. */
        lru_cache_add(page);

        /* Drop the reference obtained by the isolation. */
        put_page(page);
}

/*
 * Verdict returned by page_check_references() and acted upon by
 * shrink_page_list().
 */
enum page_references {
        /* Go ahead and try to reclaim the page. */
        PAGEREF_RECLAIM,
        /*
         * Try to reclaim the page, but shrink_page_list() keeps it
         * rather than writing it out if it turns out to be dirty.
         */
        PAGEREF_RECLAIM_CLEAN,
        /* Keep the page; shrink_page_list() unlocks it and puts it back. */
        PAGEREF_KEEP,
        /* Mark the page active (shrink_page_list()'s activate_locked path). */
        PAGEREF_ACTIVATE,
};

/*
 * Decide what reclaim should do with @page based on how recently it was
 * referenced, either through page tables or through the PageReferenced
 * flag.  The PageReferenced flag is test-and-cleared as a side effect and
 * set again when page table references were found.
 */
static enum page_references page_check_references(struct page *page,
                                                  struct scan_control *sc)
{
        unsigned long vm_flags;
        int pte_refs;
        int had_ref_flag;

        pte_refs = page_referenced(page, 1, sc->target_mem_cgroup, &vm_flags);
        had_ref_flag = TestClearPageReferenced(page);

        /*
         * Mlock lost the isolation race with us.  Report the page as
         * reclaimable so that try_to_unmap() gets to move it to the
         * unevictable list.
         */
        if (vm_flags & VM_LOCKED)
                return PAGEREF_RECLAIM;

        if (!pte_refs) {
                /*
                 * No page table references.  A non-swap-backed page that
                 * carried the referenced flag is only reclaimed if clean;
                 * dirty pages are deferred to writeback.
                 */
                if (had_ref_flag && !PageSwapBacked(page))
                        return PAGEREF_RECLAIM_CLEAN;

                return PAGEREF_RECLAIM;
        }

        /*
         * Every mapped page starts out with a page table reference from
         * the fault that instantiated it, so a single reference does not
         * prove that a mapped file page is used more than once.
         *
         * Mark the page and give it one more trip around the inactive
         * list; another page table reference will then activate it.
         *
         * The mark is also set on pages that get activated below, so
         * that recently deactivated but used pages recover quickly.
         */
        SetPageReferenced(page);

        if (had_ref_flag || pte_refs > 1)
                return PAGEREF_ACTIVATE;

        /* File-backed executable pages are activated on first use. */
        if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
                return PAGEREF_ACTIVATE;

        return PAGEREF_KEEP;
}

/*
 * Report through @dirty and @writeback whether @page is dirty or under
 * writeback, as far as reclaim throttling is concerned.
 */
static void page_check_dirty_writeback(struct page *page,
                                       bool *dirty, bool *writeback)
{
        struct address_space *mapping;
        bool lazyfree_anon = PageAnon(page) && !PageSwapBacked(page);

        /*
         * Anonymous pages are not handled by the flushers and have to be
         * written from reclaim context, so reclaim must not stall on
         * them: report them as neither dirty nor under writeback.
         */
        if (!page_is_file_lru(page) || lazyfree_anon) {
                *dirty = false;
                *writeback = false;
                return;
        }

        /* Start from the page flags, assuming they are accurate. */
        *dirty = PageDirty(page);
        *writeback = PageWriteback(page);

        /* Let the filesystem refine the answer if it supports that. */
        if (page_has_private(page)) {
                mapping = page_mapping(page);
                if (mapping && mapping->a_ops->is_dirty_writeback)
                        mapping->a_ops->is_dirty_writeback(page, dirty,
                                                           writeback);
        }
}

/*
 * shrink_page_list() - try to reclaim every page on an isolated list
 *
 * @page_list: pages taken off the LRU.  On return it holds the pages that
 *             were not freed (kept, or marked for activation).
 * @pgdat: node whose PGDAT_WRITEBACK and PGDAT_DIRTY flags are tested
 *         when deciding how to treat writeback and dirty pages.
 * @sc: scan control.  gfp_mask, may_unmap, may_writepage and
 *      target_mem_cgroup are consulted; nr_scanned is updated.
 * @stat: zeroed on entry, then filled with the counters of this call.
 * @ignore_references: when true, page_check_references() is skipped and
 *                     every page is treated as PAGEREF_RECLAIM.
 *
 * Each page is trylocked and then taken through the writeback, reference,
 * swap, unmap, pageout and buffer-release steps below.  A page leaves the
 * loop through one of the labels: free_it (reclaimed), activate_locked
 * (put back as active), keep_locked or keep (put back unchanged).
 *
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned int shrink_page_list(struct list_head *page_list,
                                     struct pglist_data *pgdat,
                                     struct scan_control *sc,
                                     struct reclaim_stat *stat,
                                     bool ignore_references)
{
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        unsigned int nr_reclaimed = 0;
        unsigned int pgactivate = 0;

        memset(stat, 0, sizeof(*stat));
        cond_resched();

        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
                enum page_references references = PAGEREF_RECLAIM;
                bool dirty, writeback, may_enter_fs;
                unsigned int nr_pages;

                cond_resched();

                page = lru_to_page(page_list);
                list_del(&page->lru);

                /* Never block on the page lock; just put the page back. */
                if (!trylock_page(page))
                        goto keep;

                VM_BUG_ON_PAGE(PageActive(page), page);

                nr_pages = compound_nr(page);

                /* Account the number of base pages even though THP */
                sc->nr_scanned += nr_pages;

                if (unlikely(!page_evictable(page)))
                        goto activate_locked;

                if (!sc->may_unmap && page_mapped(page))
                        goto keep_locked;

                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

                /*
                 * The number of dirty pages determines if a node is marked
                 * reclaim_congested which affects wait_iff_congested. kswapd
                 * will stall and start writing pages if the tail of the LRU
                 * is all dirty unqueued pages.
                 */
                page_check_dirty_writeback(page, &dirty, &writeback);
                if (dirty || writeback)
                        stat->nr_dirty++;

                if (dirty && !writeback)
                        stat->nr_unqueued_dirty++;

                /*
                 * Treat this page as congested if the underlying BDI is or if
                 * pages are cycling through the LRU so quickly that the
                 * pages marked for immediate reclaim are making it to the
                 * end of the LRU a second time.
                 */
                mapping = page_mapping(page);
                if (((dirty || writeback) && mapping &&
                     inode_write_congested(mapping->host)) ||
                    (writeback && PageReclaim(page)))
                        stat->nr_congested++;

                /*
                 * If a page at the tail of the LRU is under writeback, there
                 * are three cases to consider.
                 *
                 * 1) If reclaim is encountering an excessive number of pages
                 *    under writeback and this page is both under writeback and
                 *    PageReclaim then it indicates that pages are being queued
                 *    for IO but are being recycled through the LRU before the
                 *    IO can complete. Waiting on the page itself risks an
                 *    indefinite stall if it is impossible to writeback the
                 *    page due to IO error or disconnected storage so instead
                 *    note that the LRU is being scanned too quickly and the
                 *    caller can stall after page list has been processed.
                 *
                 * 2) Global or new memcg reclaim encounters a page that is
                 *    not marked for immediate reclaim, or the caller does not
                 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
                 *    not to fs). In this case mark the page for immediate
                 *    reclaim and continue scanning.
                 *
                 *    Require may_enter_fs because we would wait on fs, which
                 *    may not have submitted IO yet. And the loop driver might
                 *    enter reclaim, and deadlock if it waits on a page for
                 *    which it is needed to do the write (loop masks off
                 *    __GFP_IO|__GFP_FS for this reason); but more thought
                 *    would probably show more reasons.
                 *
                 * 3) Legacy memcg encounters a page that is already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
                 *    pages are in writeback and there is nothing else to
                 *    reclaim. Wait for the writeback to complete.
                 *
                 * In cases 1) and 2) we activate the pages to get them out of
                 * the way while we continue scanning for clean pages on the
                 * inactive list and refilling from the active list. The
                 * observation here is that waiting for disk writes is more
                 * expensive than potentially causing reloads down the line.
                 * Since they're marked for immediate reclaim, they won't put
                 * memory pressure on the cache working set any longer than it
                 * takes to write them to disk.
                 */
                if (PageWriteback(page)) {
                        /* Case 1 above */
                        if (current_is_kswapd() &&
                            PageReclaim(page) &&
                            test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                                stat->nr_immediate++;
                                goto activate_locked;

                        /* Case 2 above */
                        } else if (writeback_throttling_sane(sc) ||
                            !PageReclaim(page) || !may_enter_fs) {
                                /*
                                 * This is slightly racy - end_page_writeback()
                                 * might have just cleared PageReclaim, then
                                 * setting PageReclaim here end up interpreted
                                 * as PageReadahead - but that does not matter
                                 * enough to care.  What we do want is for this
                                 * page to have PageReclaim set next time memcg
                                 * reclaim reaches the tests above, so it will
                                 * then wait_on_page_writeback() to avoid OOM;
                                 * and it's also appropriate in global reclaim.
                                 */
                                SetPageReclaim(page);
                                stat->nr_writeback++;
                                goto activate_locked;

                        /* Case 3 above */
                        } else {
                                unlock_page(page);
                                wait_on_page_writeback(page);
                                /* then go back and try same page again */
                                list_add_tail(&page->lru, page_list);
                                continue;
                        }
                }

                /* With ignore_references set, the PAGEREF_RECLAIM default stands. */
                if (!ignore_references)
                        references = page_check_references(page, sc);

                switch (references) {
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
                case PAGEREF_KEEP:
                        stat->nr_ref_keep += nr_pages;
                        goto keep_locked;
                case PAGEREF_RECLAIM:
                case PAGEREF_RECLAIM_CLEAN:
                        ; /* try to reclaim the page below */
                }

                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
                 * Lazyfree page could be freed directly
                 */
                if (PageAnon(page) && PageSwapBacked(page)) {
                        if (!PageSwapCache(page)) {
                                if (!(sc->gfp_mask & __GFP_IO))
                                        goto keep_locked;
                                if (page_maybe_dma_pinned(page))
                                        goto keep_locked;
                                if (PageTransHuge(page)) {
                                        /* cannot split THP, skip it */
                                        if (!can_split_huge_page(page, NULL))
                                                goto activate_locked;
                                        /*
                                         * Split pages without a PMD map right
                                         * away. Chances are some or all of the
                                         * tail pages can be freed without IO.
                                         */
                                        if (!compound_mapcount(page) &&
                                            split_huge_page_to_list(page,
                                                                    page_list))
                                                goto activate_locked;
                                }
                                if (!add_to_swap(page)) {
                                        if (!PageTransHuge(page))
                                                goto activate_locked_split;
                                        /* Fallback to swap normal pages */
                                        if (split_huge_page_to_list(page,
                                                                    page_list))
                                                goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                                        count_vm_event(THP_SWPOUT_FALLBACK);
#endif
                                        if (!add_to_swap(page))
                                                goto activate_locked_split;
                                }

                                may_enter_fs = true;

                                /* Adding to swap updated mapping */
                                mapping = page_mapping(page);
                        }
                } else if (unlikely(PageTransHuge(page))) {
                        /* Split file THP */
                        if (split_huge_page_to_list(page, page_list))
                                goto keep_locked;
                }

                /*
                 * THP may get split above, need minus tail pages and update
                 * nr_pages to avoid accounting tail pages twice.
                 *
                 * The tail pages that are added into swap cache successfully
                 * reach here.
                 */
                if ((nr_pages > 1) && !PageTransHuge(page)) {
                        sc->nr_scanned -= (nr_pages - 1);
                        nr_pages = 1;
                }

                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page)) {
                        enum ttu_flags flags = TTU_BATCH_FLUSH;
                        bool was_swapbacked = PageSwapBacked(page);

                        if (unlikely(PageTransHuge(page)))
                                flags |= TTU_SPLIT_HUGE_PMD;

                        if (!try_to_unmap(page, flags)) {
                                stat->nr_unmap_fail += nr_pages;
                                /* PageSwapBacked got set during the failed unmap. */
                                if (!was_swapbacked && PageSwapBacked(page))
                                        stat->nr_lazyfree_fail += nr_pages;
                                goto activate_locked;
                        }
                }

                if (PageDirty(page)) {
                        /*
                         * Only kswapd can writeback filesystem pages
                         * to avoid risk of stack overflow. But avoid
                         * injecting inefficient single-page IO into
                         * flusher writeback as much as possible: only
                         * write pages when we've encountered many
                         * dirty pages, and when we've already scanned
                         * the rest of the LRU for clean pages and see
                         * the same dirty pages again (PageReclaim).
                         */
                        if (page_is_file_lru(page) &&
                            (!current_is_kswapd() || !PageReclaim(page) ||
                             !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                                /*
                                 * Immediately reclaim when written back.
                                 * Similar in principle to deactivate_page()
                                 * except we already have the page isolated
                                 * and know it's dirty
                                 */
                                inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
                                SetPageReclaim(page);

                                goto activate_locked;
                        }

                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
                                goto keep_locked;
                        if (!sc->may_writepage)
                                goto keep_locked;

                        /*
                         * Page is dirty. Flush the TLB if a writable entry
                         * potentially exists to avoid CPU writes after IO
                         * starts and then write it out here.
                         */
                        try_to_unmap_flush_dirty();
                        switch (pageout(page, mapping)) {
                        case PAGE_KEEP:
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                        case PAGE_SUCCESS:
                                stat->nr_pageout += thp_nr_pages(page);

                                if (PageWriteback(page))
                                        goto keep;
                                if (PageDirty(page))
                                        goto keep;

                                /*
                                 * A synchronous write - probably a ramdisk.  Go
                                 * ahead and try to reclaim the page.
                                 */
                                if (!trylock_page(page))
                                        goto keep;
                                if (PageDirty(page) || PageWriteback(page))
                                        goto keep_locked;
                                mapping = page_mapping(page);
                                /* fall through */
                        case PAGE_CLEAN:
                                ; /* try to free the page below */
                        }
                }

                /*
                 * If the page has buffers, try to free the buffer mappings
                 * associated with this page. If we succeed we try to free
                 * the page as well.
                 *
                 * We do this even if the page is PageDirty().
                 * try_to_release_page() does not perform I/O, but it is
                 * possible for a page to have PageDirty set, but it is actually
                 * clean (all its buffers are clean).  This happens if the
                 * buffers were written out directly, with submit_bh(). ext3
                 * will do this, as well as the blockdev mapping.
                 * try_to_release_page() will discover that cleanness and will
                 * drop the buffers and mark the page clean - it can be freed.
                 *
                 * Rarely, pages can have buffers and no ->mapping.  These are
                 * the pages which were not successfully invalidated in
                 * truncate_complete_page().  We try to drop those buffers here
                 * and if that worked, and the page is no longer mapped into
                 * process address space (page_count == 1) it can be freed.
                 * Otherwise, leave the page on the LRU so it is swappable.
                 */
                if (page_has_private(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
                        if (!mapping && page_count(page) == 1) {
                                unlock_page(page);
                                if (put_page_testzero(page))
                                        goto free_it;
                                else {
                                        /*
                                         * rare race with speculative reference.
                                         * the speculative reference will free
                                         * this page shortly, so we may
                                         * increment nr_reclaimed here (and
                                         * leave it off the LRU).
                                         */
                                        nr_reclaimed++;
                                        continue;
                                }
                        }
                }

                if (PageAnon(page) && !PageSwapBacked(page)) {
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
                        if (PageDirty(page)) {
                                page_ref_unfreeze(page, 1);
                                goto keep_locked;
                        }

                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
                                                         sc->target_mem_cgroup))
                        goto keep_locked;

                unlock_page(page);
free_it:
                /*
                 * THP may get swapped out in a whole, need account
                 * all base pages.
                 */
                nr_reclaimed += nr_pages;

                /*
                 * Is there need to periodically free_page_list? It would
                 * appear not as the counts should be low
                 */
                if (unlikely(PageTransHuge(page)))
                        destroy_compound_page(page);
                else
                        list_add(&page->lru, &free_pages);
                continue;

activate_locked_split:
                /*
                 * The tail pages that are failed to add into swap cache
                 * reach here.  Fixup nr_scanned and nr_pages.
                 */
                if (nr_pages > 1) {
                        sc->nr_scanned -= (nr_pages - 1);
                        nr_pages = 1;
                }
activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
                                                PageMlocked(page)))
                        try_to_free_swap(page);
                VM_BUG_ON_PAGE(PageActive(page), page);
                if (!PageMlocked(page)) {
                        /* nr_activate[] is indexed by 0 = anon, 1 = file. */
                        int type = page_is_file_lru(page);
                        SetPageActive(page);
                        stat->nr_activate[type] += nr_pages;
                        count_memcg_page_event(page, PGACTIVATE);
                }
keep_locked:
                /* The page stays with the caller: drop the page lock first. */
                unlock_page(page);
keep:
                /* Collect the page for return through @page_list. */
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }

        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];

        /*
         * Uncharge the reclaimed pages, issue the unmap flush that was
         * batched via TTU_BATCH_FLUSH, and only then free the pages.
         */
        mem_cgroup_uncharge_list(&free_pages);
        try_to_unmap_flush();
        free_unref_page_list(&free_pages);

        /* Hand the pages that were not freed back to the caller. */
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);

        return nr_reclaimed;
}

/*
 * Reclaim the clean file-LRU pages found on @page_list.
 *
 * Pages that are on the file LRU, not dirty, not movable and not
 * unevictable are moved to a private list and passed to
 * shrink_page_list() with reference checks disabled.  Whatever was not
 * reclaimed is spliced back onto @page_list, and the node's isolated page
 * counters are adjusted.  Returns the number of reclaimed pages.
 */
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *page_list)
{
        struct pglist_data *pgdat = zone->zone_pgdat;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
        struct page *page, *next;
        unsigned int nr_reclaimed;
        LIST_HEAD(clean_pages);

        list_for_each_entry_safe(page, next, page_list, lru) {
                if (!page_is_file_lru(page) || PageDirty(page))
                        continue;
                if (__PageMovable(page) || PageUnevictable(page))
                        continue;

                ClearPageActive(page);
                list_move(&page->lru, &clean_pages);
        }

        nr_reclaimed = shrink_page_list(&clean_pages, pgdat, &sc, &stat,
                                        true);

        /* Survivors go back to the caller's list. */
        list_splice(&clean_pages, page_list);

        mod_node_page_state(pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed);

        /*
         * Lazyfree pages are isolated from the file LRU to begin with,
         * but rotate back to the anonymous LRU if discarding them fails,
         * which would leave the isolated counts mismatched.  Move the
         * failed ones from the file count over to the anon count.
         */
        mod_node_page_state(pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail);
        mod_node_page_state(pgdat, NR_ISOLATED_FILE,
                            -(long)stat.nr_lazyfree_fail);

        return nr_reclaimed;
}

/*
 * Attempt to take the specified page off its LRU: on success a page
 * reference is acquired and PageLRU is cleared.  Pages that are being
 * freed elsewhere are ignored.
 *
 * page:        page to consider
 * mode:        bitmask of ISOLATE_* flags restricting which pages qualify
 *
 * returns 0 on success, -ve errno on failure:
 *   -EINVAL    the page is not on an LRU, or it is unevictable and
 *              ISOLATE_UNEVICTABLE was not requested
 *   -EBUSY     the page is excluded by @mode, or is being freed
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return -EINVAL;

        /* Compaction should not handle unevictable pages but CMA can do so */
        if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
                return -EINVAL;

        /*
         * To minimise LRU disruption, the caller can indicate that it
         * only wants pages it will be able to operate on without
         * blocking - clean pages for the most part.
         *
         * ISOLATE_ASYNC_MIGRATE asks for pages that can be migrated
         * without blocking.
         */
        if (mode & ISOLATE_ASYNC_MIGRATE) {
                /* All the caller can do on PageWriteback is block */
                if (PageWriteback(page))
                        return -EBUSY;

                if (PageDirty(page)) {
                        struct address_space *mapping;
                        bool can_migrate;

                        /*
                         * A dirty page can only be migrated without
                         * blocking if it has no mapping or its mapping
                         * provides ->migratepage.  We may be racing with
                         * truncation, which holds the page lock until the
                         * page is gone from the page cache, so the page
                         * must be locked to stabilise the mapping.
                         */
                        if (!trylock_page(page))
                                return -EBUSY;

                        mapping = page_mapping(page);
                        can_migrate = !mapping || mapping->a_ops->migratepage;
                        unlock_page(page);

                        if (!can_migrate)
                                return -EBUSY;
                }
        }

        if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
                return -EBUSY;

        /* A zero refcount means the page is being freed elsewhere. */
        if (unlikely(!get_page_unless_zero(page)))
                return -EBUSY;

        /*
         * PageLRU must not be cleared before we are sure the page is
         * not being freed elsewhere -- the page release code relies
         * on it.
         */
        ClearPageLRU(page);
        return 0;
}


/*
 * Subtract the per-zone page counts in @nr_zone_taken from the size of
 * @lru in @lruvec once pages have been isolated.  Zones from which
 * nothing was taken are left alone.  The LRU size updates must be
 * complete before mem_cgroup_update_lru_size due to a sanity check.
 */
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
                        enum lru_list lru, unsigned long *nr_zone_taken)
{
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                unsigned long taken = nr_zone_taken[zid];

                if (taken)
                        update_lru_size(lruvec, lru, zid, -taken);
        }
}

/**
 * isolate_lru_pages() - Take a batch of pages off one LRU list of a lruvec.
 * @nr_to_scan: The number of eligible pages to look through on the list.
 * @lruvec:     The LRU vector to pull pages from.
 * @dst:        The temp list to put pages on to.
 * @nr_scanned: Output: the number of pages that were scanned, including
 *              pages skipped because of their zone.
 * @sc:         The scan_control struct for this reclaim session
 * @lru:        LRU list id for isolating
 *
 * pgdat->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function; the
 * callers in this file hold pgdat->lru_lock with interrupts disabled.
 *
 * Pages whose zone index is above sc->reclaim_idx are not isolated: they
 * are parked on a private list, counted as PGSCAN_SKIP and spliced back
 * onto the source list once the scan is over.  Such pages are included in
 * *@nr_scanned but do not consume the @nr_to_scan budget.
 *
 * Before returning, the LRU size accounting is reduced by the number of
 * pages taken from each zone.
 *
 * Return: how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct lruvec *lruvec, struct list_head *dst,
                unsigned long *nr_scanned, struct scan_control *sc,
                enum lru_list lru)
{
        struct list_head *src = &lruvec->lists[lru];
        unsigned long nr_taken = 0;
        /* Per-zone tallies, indexed by page_zonenum(). */
        unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
        unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
        unsigned long skipped = 0;
        /*
         * scan counts only pages that were eligible (budgeted against
         * nr_to_scan); total_scan additionally counts zone-skipped pages.
         */
        unsigned long scan, total_scan, nr_pages;
        LIST_HEAD(pages_skipped);
        /* Without permission to unmap, only unmapped pages may be isolated. */
        isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);

        total_scan = 0;
        scan = 0;
        while (scan < nr_to_scan && !list_empty(src)) {
                struct page *page;

                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);

                VM_BUG_ON_PAGE(!PageLRU(page), page);

                /* A compound page is accounted as all of its base pages. */
                nr_pages = compound_nr(page);
                total_scan += nr_pages;

                /* Zone not eligible for this reclaim: set the page aside. */
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
                        nr_skipped[page_zonenum(page)] += nr_pages;
                        continue;
                }

                /*
                 * Do not count skipped pages because that makes the function
                 * return with no isolated pages if the LRU mostly contains
                 * ineligible pages.  This causes the VM to not reclaim any
                 * pages, triggering a premature OOM.
                 *
                 * Account all tail pages of THP.  This would not cause
                 * premature OOM since __isolate_lru_page() returns -EBUSY
                 * only when the page is being freed somewhere else.
                 */
                scan += nr_pages;
                switch (__isolate_lru_page(page, mode)) {
                case 0:
                        nr_taken += nr_pages;
                        nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);
                        break;

                case -EBUSY:
                        /* else it is being freed elsewhere */
                        list_move(&page->lru, src);
                        /* "continue" applies to the enclosing while loop. */
                        continue;

                default:
                        BUG();
                }
        }

        /*
         * Splice any skipped pages to the start of the LRU list. Note that
         * this disrupts the LRU order when reclaiming for lower zones but
         * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
         * scanning would soon rescan the same pages to skip and put the
         * system at risk of premature OOM.
         */
        if (!list_empty(&pages_skipped)) {
                int zid;

                list_splice(&pages_skipped, src);
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        if (!nr_skipped[zid])
                                continue;

                        __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
                        skipped += nr_skipped[zid];
                }
        }
        *nr_scanned = total_scan;
        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
                                    total_scan, skipped, nr_taken, mode, lru);
        /* Drop the isolated pages from the per-zone LRU size accounting. */
        update_lru_sizes(lruvec, lru, nr_zone_taken);
        return nr_taken;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Takes @page off whatever LRU list it currently sits on, clearing
 * PageLRU and adjusting the vmstat statistic that corresponds to that
 * list.  An extra reference is taken on the page when this succeeds.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * On success the page has PageLRU() cleared.  A page that came from the
 * active list keeps PageActive set, and a page that came from the
 * unevictable list keeps PageUnevictable set; the caller may need to
 * clear that flag before letting the page go.
 *
 * Restrictions:
 *
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
        pg_data_t *pgdat;
        struct lruvec *lruvec;
        int ret = -EBUSY;

        VM_BUG_ON_PAGE(!page_count(page), page);
        WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

        /* Unlocked peek: nothing to do for a page that is not on an LRU. */
        if (!PageLRU(page))
                return ret;

        pgdat = page_pgdat(page);

        spin_lock_irq(&pgdat->lru_lock);
        lruvec = mem_cgroup_page_lruvec(page, pgdat);
        /* Re-check under the lock; the flag may have been cleared meanwhile. */
        if (PageLRU(page)) {
                /* The list id must be read before PageLRU is cleared. */
                int lru = page_lru(page);

                get_page(page);
                ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, lru);
                ret = 0;
        }
        spin_unlock_irq(&pgdat->lru_lock);

        return ret;
}

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. With a massive number of tasks allocating pages, such
 * sleeping direct reclaimers may pile up on each CPU; the LRU list then
 * shrinks and is scanned faster than necessary, leading to unnecessary
 * swapping, thrashing and OOM.
 *
 * Returns non-zero when the number of isolated pages of the given type
 * (@file selects file vs. anon) exceeds the allowed share of the inactive
 * list.  kswapd, and reclaim for which writeback_throttling_sane() is
 * false, are never limited here.
 */
static int too_many_isolated(struct pglist_data *pgdat, int file,
                struct scan_control *sc)
{
        unsigned long nr_inactive, nr_isolated;

        if (current_is_kswapd() || !writeback_throttling_sane(sc))
                return 0;

        nr_inactive = node_page_state(pgdat, file ? NR_INACTIVE_FILE :
                                                    NR_INACTIVE_ANON);
        nr_isolated = node_page_state(pgdat, file ? NR_ISOLATED_FILE :
                                                    NR_ISOLATED_ANON);

        /*
         * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
         * won't get blocked by normal direct-reclaimers, forming a circular
         * deadlock.  Callers that may do both IO and FS work are held to
         * one eighth of the inactive list instead.
         */
        if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                nr_inactive >>= 3;

        return nr_isolated > nr_inactive;
}

/*
 * move_pages_to_lru - put the isolated pages on @list back onto LRU lists.
 *
 * The callers in this file invoke this with pgdat->lru_lock held and
 * interrupts disabled.  The lock is dropped and re-taken internally around
 * putback_lru_page() and destroy_compound_page().
 *
 * For every page on @list:
 *  - a page that is no longer evictable is handed to putback_lru_page();
 *  - otherwise the page's lruvec is looked up (overwriting the @lruvec
 *    argument), PageLRU is set, the page is added to the list chosen by
 *    page_lru() and one reference is dropped;
 *  - if that was the last reference, the page is taken off the LRU again
 *    and is either destroyed right away (compound page) or queued on a
 *    local list of pages to free.
 *
 * On return @list holds the queued pages to free; the callers in this file
 * uncharge and free them after dropping the lock.
 *
 * NOTE(review): the next two paragraphs come from the original comment.
 * The remark about referenced pages and page_referenced() seems to describe
 * the caller shrink_active_list() rather than this helper, and the original
 * text named the lock "zone_lru_lock" while the code uses pgdat->lru_lock.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold the LRU lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop the LRU lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_refcount against each page.
 * But we had to alter page->flags anyway.
 *
 * Returns the number of pages moved to the given lruvec.  Pages handed to
 * putback_lru_page() and pages that turned out to be free are not counted.
 */

static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
                                                     struct list_head *list)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        int nr_pages, nr_moved = 0;
        LIST_HEAD(pages_to_free);
        struct page *page;
        enum lru_list lru;

        while (!list_empty(list)) {
                page = lru_to_page(list);
                VM_BUG_ON_PAGE(PageLRU(page), page);
                if (unlikely(!page_evictable(page))) {
                        /*
                         * Unlink first, then call putback_lru_page() with
                         * the LRU lock released.
                         */
                        list_del(&page->lru);
                        spin_unlock_irq(&pgdat->lru_lock);
                        putback_lru_page(page);
                        spin_lock_irq(&pgdat->lru_lock);
                        continue;
                }
                /* Each page is filed under its own lruvec, not the argument. */
                lruvec = mem_cgroup_page_lruvec(page, pgdat);

                SetPageLRU(page);
                lru = page_lru(page);

                nr_pages = thp_nr_pages(page);
                update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
                list_move(&page->lru, &lruvec->lists[lru]);

                /* Drop one reference; non-zero result means it was the last. */
                if (put_page_testzero(page)) {
                        /* Last reference gone: undo the LRU insertion. */
                        __ClearPageLRU(page);
                        __ClearPageActive(page);
                        del_page_from_lru_list(page, lruvec, lru);

                        if (unlikely(PageCompound(page))) {
                                /* Destroyed with the LRU lock released. */
                                spin_unlock_irq(&pgdat->lru_lock);
                                destroy_compound_page(page);
                                spin_lock_irq(&pgdat->lru_lock);
                        } else
                                list_add(&page->lru, &pages_to_free);
                } else {
                        nr_moved += nr_pages;
                        if (PageActive(page))
                                workingset_age_nonresident(lruvec, nr_pages);
                }
        }

        /*
         * To save our caller's stack, now use input list for pages to free.
         */
        list_splice(&pages_to_free, list);

        return nr_moved;
}

/*
 * If a kernel thread (such as nfsd for loop-back mounts) services
 * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
 * Such a task should only be throttled if the backing device it is
 * writing to is congested.  In all other cases throttling is safe.
 *
 * Returns 1 if the current task may be throttled, 0 otherwise.
 */
static int current_may_throttle(void)
{
        /* Ordinary task: always fine to throttle. */
        if (!(current->flags & PF_LOCAL_THROTTLE))
                return 1;

        /* No backing device recorded for this task. */
        if (current->backing_dev_info == NULL)
                return 1;

        return bdi_write_congested(current->backing_dev_info) != 0;
}

/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
 * of reclaimed pages
 *
 * @nr_to_scan: scan budget passed on to isolate_lru_pages()
 * @lruvec:     the lruvec whose @lru list is shrunk
 * @sc:         reclaim state; its sc->nr.* counters are updated on the way out
 * @lru:        which list to isolate from
 *
 * The work is done in three phases: isolate a batch of pages under
 * pgdat->lru_lock, run shrink_page_list() on the batch without the lock,
 * then re-take the lock to put the survivors back and update statistics.
 *
 * Special return values: 0 when too many pages are still isolated after one
 * 100ms sleep or when nothing could be isolated, and SWAP_CLUSTER_MAX when
 * the current task has a fatal signal pending after that sleep.
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                     struct scan_control *sc, enum lru_list lru)
{
        LIST_HEAD(page_list);
        unsigned long nr_scanned;
        unsigned int nr_reclaimed = 0;
        unsigned long nr_taken;
        struct reclaim_stat stat;
        bool file = is_file_lru(lru);
        enum vm_event_item item;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        bool stalled = false;

        /* Sleep at most once; give up if the condition still holds after. */
        while (unlikely(too_many_isolated(pgdat, file, sc))) {
                if (stalled)
                        return 0;

                /* wait a bit for the reclaimer. */
                msleep(100);
                stalled = true;

                /* We are about to die and free our memory. Return now. */
                if (fatal_signal_pending(current))
                        return SWAP_CLUSTER_MAX;
        }

        lru_add_drain();

        spin_lock_irq(&pgdat->lru_lock);

        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                     &nr_scanned, sc, lru);

        /*
         * "+ file" picks the anon or the file flavour of the counter/event;
         * this relies on the file item directly following the anon item in
         * the respective enum (defined outside this chunk).
         */
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
        item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
        /* The global event is skipped for cgroup reclaim. */
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
        __count_vm_events(PGSCAN_ANON + file, nr_scanned);

        spin_unlock_irq(&pgdat->lru_lock);

        if (nr_taken == 0)
                return 0;

        /* The actual reclaim attempt runs without the LRU lock. */
        nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);

        spin_lock_irq(&pgdat->lru_lock);

        /* Afterwards page_list holds only the pages that are to be freed. */
        move_pages_to_lru(lruvec, &page_list);

        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
        lru_note_cost(lruvec, file, stat.nr_pageout);
        item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_reclaimed);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
        __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);

        spin_unlock_irq(&pgdat->lru_lock);

        mem_cgroup_uncharge_list(&page_list);
        free_unref_page_list(&page_list);

        /*
         * If dirty pages are scanned that are not queued for IO, it
         * implies that flushers are not doing their job. This can
         * happen when memory pressure pushes dirty pages to the end of
         * the LRU before the dirty limits are breached and the dirty
         * data has expired. It can also happen when the proportion of
         * dirty pages grows not through writes but through memory
         * pressure reclaiming all the clean cache. And in some cases,
         * the flushers simply cannot keep up with the allocation
         * rate. Nudge the flusher threads in case they are asleep.
         */
        if (stat.nr_unqueued_dirty == nr_taken)
                wakeup_flusher_threads(WB_REASON_VMSCAN);

        /* Fold this batch's statistics into the reclaim-wide totals. */
        sc->nr.dirty += stat.nr_dirty;
        sc->nr.congested += stat.nr_congested;
        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
        sc->nr.writeback += stat.nr_writeback;
        sc->nr.immediate += stat.nr_immediate;
        sc->nr.taken += nr_taken;
        if (file)
                sc->nr.file_taken += nr_taken;

        trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
                        nr_scanned, nr_reclaimed, &stat, sc->priority, file);
        return nr_reclaimed;
}

/*
 * shrink_active_list - age one LRU list by deactivating pages from it.
 *
 * @nr_to_scan: scan budget passed on to isolate_lru_pages()
 * @lruvec:     the lruvec whose @lru list is processed
 * @sc:         reclaim state (target memcg, priority, reclaim_idx, ...)
 * @lru:        which list to isolate from
 *
 * A batch of pages is isolated under pgdat->lru_lock and then examined
 * without the lock.  Pages that are no longer evictable are put back via
 * putback_lru_page().  Referenced, executable, file-backed pages are kept
 * active; every other page has PageActive cleared and PageWorkingset set.
 * Finally the lock is re-taken, both groups are moved back onto LRU lists
 * and any pages that became free in the meantime are uncharged and freed.
 */
static void shrink_active_list(unsigned long nr_to_scan,
                               struct lruvec *lruvec,
                               struct scan_control *sc,
                               enum lru_list lru)
{
        unsigned long nr_taken;
        unsigned long nr_scanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);        /* The pages which were snipped off */
        LIST_HEAD(l_active);        /* Pages that stay active */
        LIST_HEAD(l_inactive);        /* Pages being deactivated */
        struct page *page;
        unsigned nr_deactivate, nr_activate;
        unsigned nr_rotated = 0;        /* Only reported to the tracepoint */
        int file = is_file_lru(lru);
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        lru_add_drain();

        spin_lock_irq(&pgdat->lru_lock);

        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                     &nr_scanned, sc, lru);

        /* "+ file" selects the anon or the file flavour of the counter. */
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

        /* The global event is skipped for cgroup reclaim. */
        if (!cgroup_reclaim(sc))
                __count_vm_events(PGREFILL, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

        spin_unlock_irq(&pgdat->lru_lock);

        /* Classify every isolated page; runs without the LRU lock. */
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
                list_del(&page->lru);

                if (unlikely(!page_evictable(page))) {
                        putback_lru_page(page);
                        continue;
                }

                if (unlikely(buffer_heads_over_limit)) {
                        /*
                         * page_has_private() is tested again once the page
                         * lock is held, before trying to release.
                         */
                        if (page_has_private(page) && trylock_page(page)) {
                                if (page_has_private(page))
                                        try_to_release_page(page, 0);
                                unlock_page(page);
                        }
                }

                if (page_referenced(page, 0, sc->target_mem_cgroup,
                                    &vm_flags)) {
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list. So
                         * that executable code get better chances to stay in
                         * memory under moderate memory pressure.  Anon pages
                         * are not likely to be evicted by use-once streaming
                         * IO, plus JVM can create lots of anon VM_EXEC pages,
                         * so we ignore them here.
                         */
                        if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
                                nr_rotated += thp_nr_pages(page);
                                list_add(&page->lru, &l_active);
                                continue;
                        }
                }

                ClearPageActive(page);        /* we are de-activating */
                SetPageWorkingset(page);
                list_add(&page->lru, &l_inactive);
        }

        /*
         * Move pages back to the lru list.
         */
        spin_lock_irq(&pgdat->lru_lock);

        nr_activate = move_pages_to_lru(lruvec, &l_active);
        nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
        /* Keep all free pages in l_active list */
        list_splice(&l_inactive, &l_active);

        __count_vm_events(PGDEACTIVATE, nr_deactivate);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&pgdat->lru_lock);

        /* l_active now only holds pages whose last reference was dropped. */
        mem_cgroup_uncharge_list(&l_active);
        free_unref_page_list(&l_active);
        trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
                        nr_deactivate, nr_rotated, sc->priority, file);
}

/*
 * reclaim_pages - try to reclaim every page on @page_list.
 *
 * The list is consumed from its tail in runs of pages that belong to the
 * same NUMA node.  Each run has PageActive cleared, is handed to
 * shrink_page_list() for that node, and whatever remains on the run's list
 * afterwards is put back with putback_lru_page().
 *
 * Returns the accumulated result of the shrink_page_list() calls.
 */
unsigned long reclaim_pages(struct list_head *page_list)
{
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
        };
        struct reclaim_stat dummy_stat;
        unsigned int nr_reclaimed = 0;
        LIST_HEAD(node_page_list);
        struct page *page;
        int nid;

        while (!list_empty(page_list)) {
                /* A new run is keyed on the node of the next page. */
                nid = page_to_nid(lru_to_page(page_list));
                INIT_LIST_HEAD(&node_page_list);

                /* Gather consecutive pages that live on that node. */
                do {
                        page = lru_to_page(page_list);
                        if (page_to_nid(page) != nid)
                                break;

                        ClearPageActive(page);
                        list_move(&page->lru, &node_page_list);
                } while (!list_empty(page_list));

                nr_reclaimed += shrink_page_list(&node_page_list,
                                                NODE_DATA(nid),
                                                &sc, &dummy_stat, false);

                /* Pages still on the run's list go back to the LRU. */
                while (!list_empty(&node_page_list)) {
                        page = lru_to_page(&node_page_list);
                        list_del(&page->lru);
                        putback_lru_page(page);
                }
        }

        return nr_reclaimed;
}

/*
 * Dispatch one scan request to the handler for @lru.
 *
 * Inactive lists go to shrink_inactive_list() and its result is returned.
 * Active lists are only shrunk when sc->may_deactivate has the bit for the
 * list's type (anon or file) set; otherwise sc->skipped_deactivate is
 * raised.  Active lists always yield 0.
 */
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
{
        if (!is_active_lru(lru))
                return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);

        if (sc->may_deactivate & (1 << is_file_lru(lru)))
                shrink_active_list(nr_to_scan, lruvec, sc, lru);
        else
                sc->skipped_deactivate = 1;

        return 0;
}

/*
 * Decide whether the inactive list of a given type is too small relative
 * to its active counterpart.
 *
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * page has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU, maintained by the pageout code. An inactive_ratio
 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
 * It is int_sqrt(10 * size-in-GB) of the two lists combined, or 1 when
 * they hold less than 1GB.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{
        enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
        unsigned long nr_inactive, nr_active;
        unsigned long size_gb, ratio;

        nr_inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
        nr_active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);

        /* Combined size of both lists, converted from pages to GB. */
        size_gb = (nr_inactive + nr_active) >> (30 - PAGE_SHIFT);
        ratio = size_gb ? int_sqrt(10 * size_gb) : 1;

        return nr_inactive * ratio < nr_active;
}

/*
 * How get_scan_count() distributes scan pressure between the anon and
 * the file LRU lists.
 */
enum scan_balance {
        /* Scan every list relative to its size, no anon/file weighting. */
        SCAN_EQUAL,
        /* Weight anon vs. file by the computed fraction[]/denominator. */
        SCAN_FRACT,
        /* Scan anon lists only; file lists get a zero scan target. */
        SCAN_ANON,
        /* Scan file lists only; anon lists get a zero scan target. */
        SCAN_FILE,
};

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * A scan_balance mode is chosen first.  Several conditions force scanning
 * of only one type or of both types equally; otherwise (SCAN_FRACT) the
 * pressure is split according to swappiness and the sc->anon_cost /
 * sc->file_cost values.  Then, for every evictable LRU list, a scan target
 * is derived from the list size, the cgroup's memory.min/memory.low
 * protection and sc->priority, and adjusted according to the chosen mode.
 *
 * @lruvec: the lruvec whose lists are sized up
 * @sc:     reclaim state; sc->memcg_low_skipped may be set here
 * @nr:     output array, indexed by LRU list id:
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
{
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
        /* Only filled in, and only read, in SCAN_FRACT mode. */
        u64 fraction[ANON_AND_FILE];
        u64 denominator = 0;        /* gcc */
        enum scan_balance scan_balance;
        unsigned long ap, fp;
        enum lru_list lru;

        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
                scan_balance = SCAN_FILE;
                goto out;
        }

        /*
         * Global reclaim will swap to prevent OOM even with no
         * swappiness, but memcg users want to use this knob to
         * disable swapping for individual groups completely when
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
        if (cgroup_reclaim(sc) && !swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }

        /*
         * Do not apply any pressure balancing cleverness when the
         * system is close to OOM, scan both anon and file equally
         * (unless the swappiness setting disagrees with swapping).
         */
        if (!sc->priority && swappiness) {
                scan_balance = SCAN_EQUAL;
                goto out;
        }

        /*
         * If the system is almost out of file pages, force-scan anon.
         */
        if (sc->file_is_tiny) {
                scan_balance = SCAN_ANON;
                goto out;
        }

        /*
         * If there is enough inactive page cache, we do not reclaim
         * anything from the anonymous working set right now.
         */
        if (sc->cache_trim_mode) {
                scan_balance = SCAN_FILE;
                goto out;
        }

        scan_balance = SCAN_FRACT;
        /*
         * Calculate the pressure balance between anon and file pages.
         *
         * The amount of pressure we put on each LRU is inversely
         * proportional to the cost of reclaiming each list, as
         * determined by the share of pages that are refaulting, times
         * the relative IO cost of bringing back a swapped out
         * anonymous page vs reloading a filesystem page (swappiness).
         *
         * Although we limit that influence to ensure no list gets
         * left behind completely: at least a third of the pressure is
         * applied, before swappiness.
         *
         * With swappiness at 100, anon and file have equal IO cost.
         */
        /*
         * Adding the combined cost to each side keeps either side's share
         * of the new total between one third and two thirds.
         */
        total_cost = sc->anon_cost + sc->file_cost;
        anon_cost = total_cost + sc->anon_cost;
        file_cost = total_cost + sc->file_cost;
        total_cost = anon_cost + file_cost;

        /* The "+ 1" terms keep the divisors non-zero. */
        ap = swappiness * (total_cost + 1);
        ap /= anon_cost + 1;

        fp = (200 - swappiness) * (total_cost + 1);
        fp /= file_cost + 1;

        fraction[0] = ap;
        fraction[1] = fp;
        denominator = ap + fp;
out:
        /*
         * Every early exit above lands here with only scan_balance decided;
         * fraction[] and denominator are consumed solely by SCAN_FRACT.
         */
        for_each_evictable_lru(lru) {
                int file = is_file_lru(lru);
                unsigned long lruvec_size;
                unsigned long low, min;
                unsigned long scan;

                lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
                mem_cgroup_protection(sc->target_mem_cgroup, memcg,
                                      &min, &low);

                if (min || low) {
                        /*
                         * Scale a cgroup's reclaim pressure by proportioning
                         * its current usage to its memory.low or memory.min
                         * setting.
                         *
                         * This is important, as otherwise scanning aggression
                         * becomes extremely binary -- from nothing as we
                         * approach the memory protection threshold, to totally
                         * nominal as we exceed it.  This results in requiring
                         * setting extremely liberal protection thresholds. It
                         * also means we simply get no protection at all if we
                         * set it too low, which is not ideal.
                         *
                         * If there is any protection in place, we reduce scan
                         * pressure by how much of the total memory used is
                         * within protection thresholds.
                         *
                         * There is one special case: in the first reclaim pass,
                         * we skip over all groups that are within their low
                         * protection. If that fails to reclaim enough pages to
                         * satisfy the reclaim goal, we come back and override
                         * the best-effort low protection. However, we still
                         * ideally want to honor how well-behaved groups are in
                         * that case instead of simply punishing them all
                         * equally. As such, we reclaim them based on how much
                         * memory they are using, reducing the scan pressure
                         * again by how much of the total memory used is under
                         * hard protection.
                         */
                        unsigned long cgroup_size = mem_cgroup_size(memcg);
                        unsigned long protection;

                        /* memory.low scaling, make sure we retry before OOM */
                        if (!sc->memcg_low_reclaim && low > min) {
                                protection = low;
                                sc->memcg_low_skipped = 1;
                        } else {
                                protection = min;
                        }

                        /* Avoid TOCTOU with earlier protection check */
                        cgroup_size = max(cgroup_size, protection);

                        /*
                         * Remove the protected share of the list from the
                         * scan target; "+ 1" keeps the divisor non-zero.
                         */
                        scan = lruvec_size - lruvec_size * protection /
                                (cgroup_size + 1);

                        /*
                         * Minimally target SWAP_CLUSTER_MAX pages to keep
                         * reclaim moving forwards, avoiding decrementing
                         * sc->priority further than desirable.
                         */
                        scan = max(scan, SWAP_CLUSTER_MAX);
                } else {
                        scan = lruvec_size;
                }

                /* A smaller priority value leaves a larger scan target. */
                scan >>= sc->priority;

                /*
                 * If the cgroup's already been deleted, make sure to
                 * scrape out the remaining cache.
                 */
                if (!scan && !mem_cgroup_online(memcg))
                        scan = min(lruvec_size, SWAP_CLUSTER_MAX);

                switch (scan_balance) {
                case SCAN_EQUAL:
                        /* Scan lists relative to size */
                        break;
                case SCAN_FRACT:
                        /*
                         * Scan types proportional to swappiness and
                         * their relative recent reclaim efficiency.
                         * Make sure we don't miss the last page on
                         * the offlined memory cgroups because of a
                         * round-off error.
                         */
                        scan = mem_cgroup_online(memcg) ?
                               div64_u64(scan * fraction[file], denominator) :
                               DIV64_U64_ROUND_UP(scan * fraction[file],
                                                  denominator);
                        break;
                case SCAN_FILE:
                case SCAN_ANON:
                        /*
                         * Scan one type exclusively: zero the target of
                         * every list whose type is not the selected one
                         * (assumes is_file_lru() yields 0 or 1).
                         */
                        if ((scan_balance == SCAN_FILE) != file)
                                scan = 0;
                        break;
                default:
                        /* Look ma, no brain */
                        BUG();
                }

                nr[lru] = scan;
        }
}

/*
 * Scan the evictable LRU lists of @lruvec and try to reclaim pages.
 *
 * get_scan_count() fills nr[] with a scan target for every LRU list. The
 * lists are then scanned in batches of at most SWAP_CLUSTER_MAX pages each
 * until the inactive-anon, active-file and inactive-file targets are all
 * exhausted (the active-anon target is not part of the loop condition).
 * Once sc->nr_to_reclaim pages have been reclaimed - and unless
 * proportional_reclaim is set - the remaining targets are trimmed so that
 * anon and file end up scanned in roughly the originally requested ratio.
 *
 * The number of pages reclaimed here is added to sc->nr_reclaimed.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
        /* Remaining scan target per LRU list; consumed as we scan */
        unsigned long nr[NR_LRU_LISTS];
        /* Snapshot of the initial nr[] used for proportional adjustment */
        unsigned long targets[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list lru;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        bool proportional_reclaim;
        struct blk_plug plug;

        get_scan_count(lruvec, sc, nr);

        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));

        /*
         * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
         * event that can occur when there is little memory pressure e.g.
         * multiple streaming readers/writers. Hence, we do not abort scanning
         * when the requested number of pages are reclaimed when scanning at
         * DEF_PRIORITY on the assumption that the fact we are direct
         * reclaiming implies that kswapd is not keeping up and it is best to
         * do a batch of work at once. For memcg reclaim one check is made to
         * abort proportional reclaim if either the file or anon lru has already
         * dropped to zero at the first pass.
         */
        proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
                                sc->priority == DEF_PRIORITY);

        /* The block plug is held for the whole scan loop below */
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                unsigned long nr_anon, nr_file, percentage;
                unsigned long nr_scanned;

                /* One batch (<= SWAP_CLUSTER_MAX pages) from every list */
                for_each_evictable_lru(lru) {
                        if (nr[lru]) {
                                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                                nr[lru] -= nr_to_scan;

                                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                                            lruvec, sc);
                        }
                }

                cond_resched();

                /*
                 * Keep scanning with the unmodified targets while the goal
                 * has not been met, or when proportional_reclaim is set.
                 */
                if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
                        continue;

                /*
                 * For kswapd and memcg, reclaim at least the number of pages
                 * requested. Ensure that the anon and file LRUs are scanned
                 * proportionally what was requested by get_scan_count(). We
                 * stop reclaiming one LRU and reduce the amount scanning
                 * proportional to the original scan target.
                 */
                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

                /*
                 * It's just vindictive to attack the larger once the smaller
                 * has gone to zero.  And given the way we stop scanning the
                 * smaller below, this makes sure that we only make one nudge
                 * towards proportionality once we've got nr_to_reclaim.
                 */
                if (!nr_file || !nr_anon)
                        break;

                /*
                 * Pick the type with the smaller remaining target and work
                 * out which percentage of its original target is still
                 * outstanding. The "+ 1" keeps the divisor non-zero.
                 */
                if (nr_file > nr_anon) {
                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                                                targets[LRU_ACTIVE_ANON] + 1;
                        lru = LRU_BASE;
                        percentage = nr_anon * 100 / scan_target;
                } else {
                        unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
                                                targets[LRU_ACTIVE_FILE] + 1;
                        lru = LRU_FILE;
                        percentage = nr_file * 100 / scan_target;
                }

                /* Stop scanning the smaller of the LRU */
                nr[lru] = 0;
                nr[lru + LRU_ACTIVE] = 0;

                /*
                 * Recalculate the other LRU scan count based on its original
                 * scan target and the percentage scanning already complete
                 */
                lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
                /* Inactive list of the other type */
                nr_scanned = targets[lru] - nr[lru];
                nr[lru] = targets[lru] * (100 - percentage) / 100;
                /* min() clamps so the new target cannot underflow */
                nr[lru] -= min(nr[lru], nr_scanned);

                /* Same recalculation for the active list of that type */
                lru += LRU_ACTIVE;
                nr_scanned = targets[lru] - nr[lru];
                nr[lru] = targets[lru] * (100 - percentage) / 100;
                nr[lru] -= min(nr[lru], nr_scanned);
        }
        blk_finish_plug(&plug);
        sc->nr_reclaimed += nr_reclaimed;

        /*
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
        if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
}

/*
 * Decide whether reclaim should work hand in hand with compaction.
 *
 * That is the case only for non-zero-order requests whose gfp mask passes
 * gfp_compaction_allowed(), and then only if the order is costly or the
 * scan priority has already dropped below DEF_PRIORITY - 2 (i.e. we are
 * under memory pressure).
 */
static bool in_reclaim_compaction(struct scan_control *sc)
{
        if (!gfp_compaction_allowed(sc->gfp_mask))
                return false;

        /* Order-0 requests have no use for compaction */
        if (!sc->order)
                return false;

        return sc->order > PAGE_ALLOC_COSTLY_ORDER ||
               sc->priority < DEF_PRIORITY - 2;
}

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
                                        unsigned long nr_reclaimed,
                                        struct scan_control *sc)
{
        unsigned long pages_for_compaction;
        unsigned long inactive_lru_pages;
        int z;

        /* If not in reclaim/compaction mode, stop */
        if (!in_reclaim_compaction(sc))
                return false;

        /*
         * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
         * number of pages that were scanned. This will return to the caller
         * with the risk reclaim/compaction and the resulting allocation attempt
         * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
         * allocations through requiring that the full LRU list has been scanned
         * first, by assuming that zero delta of sc->nr_scanned means full LRU
         * scan, but that approximation was wrong, and there were corner cases
         * where always a non-zero amount of pages were scanned.
         */
        if (!nr_reclaimed)
                return false;

        /* If compaction would go ahead or the allocation would succeed, stop */
        for (z = 0; z <= sc->reclaim_idx; z++) {
                struct zone *zone = &pgdat->node_zones[z];
                if (!managed_zone(zone))
                        continue;

                switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
                case COMPACT_SUCCESS:
                case COMPACT_CONTINUE:
                        return false;
                default:
                        /* check next zone */
                        ;
                }
        }

        /*
         * If we have not reclaimed enough pages for compaction and the
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
        if (get_nr_swap_pages() > 0)
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);

        return inactive_lru_pages > pages_for_compaction;
}

/*
 * Walk the memcg hierarchy below sc->target_mem_cgroup and shrink each
 * group's lruvec and slab caches on node @pgdat, honouring the memory
 * protection computed by mem_cgroup_calculate_protection().
 */
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
        struct mem_cgroup *root = sc->target_mem_cgroup;
        struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, NULL);

        /*
         * NOTE(review): the do/while form runs the body at least once, even
         * if the first mem_cgroup_iter() call yields NULL - presumably this
         * is relied upon when memcg is disabled; do not turn it into a
         * for/while loop without confirming.
         */
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
                unsigned long reclaimed_before;
                unsigned long scanned_before;

                /*
                 * When the groups being walked are not eligible for
                 * reclaim - no reclaimable pages, or explicitly protected
                 * memory - this loop turns CPU-bound. Yield to avoid soft
                 * lockups.
                 */
                cond_resched();

                mem_cgroup_calculate_protection(root, memcg);

                /*
                 * Hard protection: never reclaim from this group.
                 * If there is no reclaimable memory, OOM.
                 */
                if (mem_cgroup_below_min(memcg))
                        continue;

                if (mem_cgroup_below_low(memcg)) {
                        /*
                         * Soft protection: honoured only while other
                         * cgroups still offer unprotected reclaimable
                         * memory. Note the skip so the caller can retry
                         * with memcg_low_reclaim set.
                         */
                        if (!sc->memcg_low_reclaim) {
                                sc->memcg_low_skipped = 1;
                                continue;
                        }
                        memcg_memory_event(memcg, MEMCG_LOW);
                }

                /* Snapshot the counters to compute this group's deltas */
                reclaimed_before = sc->nr_reclaimed;
                scanned_before = sc->nr_scanned;

                shrink_lruvec(lruvec, sc);

                shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
                            sc->priority);

                /* Report this group's reclaim efficiency */
                vmpressure(sc->gfp_mask, memcg, false,
                           sc->nr_scanned - scanned_before,
                           sc->nr_reclaimed - reclaimed_before);

        } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}

/*
 * Reclaim from node @pgdat on behalf of @sc.
 *
 * Each pass (label "again") resets the per-pass statistics in sc->nr,
 * chooses the anon/file scan balance inputs (costs, deactivation policy,
 * cache_trim_mode, file_is_tiny), shrinks all memcgs via
 * shrink_node_memcgs(), reports vmpressure, and then applies writeback /
 * congestion throttling. The pass is repeated for as long as
 * should_continue_reclaim() asks for it.
 *
 * If any pass reclaimed something, pgdat->kswapd_failures is reset.
 */
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_reclaimed, nr_scanned;
        struct lruvec *target_lruvec;
        bool reclaimable = false;
        unsigned long file;

        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
        /* Per-pass statistics start from zero on every iteration */
        memset(&sc->nr, 0, sizeof(sc->nr));

        /* Snapshot the running totals so this pass's deltas can be derived */
        nr_reclaimed = sc->nr_reclaimed;
        nr_scanned = sc->nr_scanned;

        /*
         * Determine the scan balance between anon and file LRUs.
         */
        spin_lock_irq(&pgdat->lru_lock);
        sc->anon_cost = target_lruvec->anon_cost;
        sc->file_cost = target_lruvec->file_cost;
        spin_unlock_irq(&pgdat->lru_lock);

        /*
         * Target desirable inactive:active list ratios for the anon
         * and file LRU lists.
         */
        if (!sc->force_deactivate) {
                unsigned long refaults;

                /*
                 * refaults[0] holds the anon snapshot and refaults[1] the
                 * file snapshot; a changed counter or a low inactive list
                 * permits deactivation for that type.
                 */
                refaults = lruvec_page_state(target_lruvec,
                                WORKINGSET_ACTIVATE_ANON);
                if (refaults != target_lruvec->refaults[0] ||
                        inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
                        sc->may_deactivate |= DEACTIVATE_ANON;
                else
                        sc->may_deactivate &= ~DEACTIVATE_ANON;

                /*
                 * When refaults are being observed, it means a new
                 * workingset is being established. Deactivate to get
                 * rid of any stale active pages quickly.
                 */
                refaults = lruvec_page_state(target_lruvec,
                                WORKINGSET_ACTIVATE_FILE);
                if (refaults != target_lruvec->refaults[1] ||
                    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
                        sc->may_deactivate |= DEACTIVATE_FILE;
                else
                        sc->may_deactivate &= ~DEACTIVATE_FILE;
        } else
                sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

        /*
         * If we have plenty of inactive file pages that aren't
         * thrashing, try to reclaim those first before touching
         * anonymous pages.
         */
        file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
        if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
                sc->cache_trim_mode = 1;
        else
                sc->cache_trim_mode = 0;

        /*
         * Prevent the reclaimer from falling into the cache trap: as
         * cache pages start out inactive, every cache fault will tip
         * the scan balance towards the file LRU.  And as the file LRU
         * shrinks, so does the window for rotation from references.
         * This means we have a runaway feedback loop where a tiny
         * thrashing file LRU becomes infinitely more attractive than
         * anon pages.  Try to detect this based on file LRU size.
         */
        if (!cgroup_reclaim(sc)) {
                unsigned long total_high_wmark = 0;
                unsigned long free, anon;
                int z;

                free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
                file = node_page_state(pgdat, NR_ACTIVE_FILE) +
                           node_page_state(pgdat, NR_INACTIVE_FILE);

                /* Sum the high watermarks of all managed zones on the node */
                for (z = 0; z < MAX_NR_ZONES; z++) {
                        struct zone *zone = &pgdat->node_zones[z];
                        if (!managed_zone(zone))
                                continue;

                        total_high_wmark += high_wmark_pages(zone);
                }

                /*
                 * Consider anon: if that's low too, this isn't a
                 * runaway file reclaim problem, but rather just
                 * extreme pressure. Reclaim as per usual then.
                 */
                anon = node_page_state(pgdat, NR_INACTIVE_ANON);

                sc->file_is_tiny =
                        file + free <= total_high_wmark &&
                        !(sc->may_deactivate & DEACTIVATE_ANON) &&
                        anon >> sc->priority;
        }

        shrink_node_memcgs(pgdat, sc);

        /* Fold slab pages freed during this pass into the reclaim total */
        if (reclaim_state) {
                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
                reclaim_state->reclaimed_slab = 0;
        }

        /* Record the subtree's reclaim efficiency */
        vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
                   sc->nr_scanned - nr_scanned,
                   sc->nr_reclaimed - nr_reclaimed);

        /* Sticky across "again" passes: any progress at all counts */
        if (sc->nr_reclaimed - nr_reclaimed)
                reclaimable = true;

        if (current_is_kswapd()) {
                /*
                 * If reclaim is isolating dirty pages under writeback,
                 * it implies that the long-lived page allocation rate
                 * is exceeding the page laundering rate. Either the
                 * global limits are not being effective at throttling
                 * processes due to the page distribution throughout
                 * zones or there is heavy usage of a slow backing
                 * device. The only option is to throttle from reclaim
                 * context which is not ideal as there is no guarantee
                 * the dirtying process is throttled in the same way
                 * balance_dirty_pages() manages.
                 *
                 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
                 * count the number of pages under writeback that are
                 * flagged for immediate reclaim and stall if any are
                 * encountered in the nr_immediate check below.
                 */
                if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
                        set_bit(PGDAT_WRITEBACK, &pgdat->flags);

                /* Allow kswapd to start writing pages during reclaim.*/
                if (sc->nr.unqueued_dirty == sc->nr.file_taken)
                        set_bit(PGDAT_DIRTY, &pgdat->flags);

                /*
                 * If kswapd scans pages marked for immediate
                 * reclaim and under writeback (nr_immediate), it
                 * implies that pages are cycling through the LRU
                 * faster than they are written so also forcibly stall.
                 */
                if (sc->nr.immediate)
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }

        /*
         * Tag a node/memcg as congested if all the dirty pages
         * scanned were backed by a congested BDI and
         * wait_iff_congested will stall.
         *
         * Legacy memcg will stall in page writeback so avoid forcibly
         * stalling in wait_iff_congested().
         */
        if ((current_is_kswapd() ||
             (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
            sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
                set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

        /*
         * Stall direct reclaim for IO completions if underlying BDIs
         * and node is congested. Allow kswapd to continue until it
         * starts encountering unqueued dirty pages or cycling through
         * the LRU too quickly.
         */
        if (!current_is_kswapd() && current_may_throttle() &&
            !sc->hibernation_mode &&
            test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
                wait_iff_congested(BLK_RW_ASYNC, HZ/10);

        /* Run another pass if reclaim/compaction still needs more pages */
        if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                    sc))
                goto again;

        /*
         * Kswapd gives up on balancing particular nodes after too
         * many failures to reclaim anything from them and goes to
         * sleep. On reclaim progress, reset the failure counter. A
         * successful direct reclaim run will revive a dormant kswapd.
         */
        if (reclaimable)
                pgdat->kswapd_failures = 0;
}

/*
 * Tell direct reclaim whether @zone can be left alone for a costly-order
 * request.
 *
 * Returns true if compaction should go ahead, or if the allocation would
 * already succeed without it. Returns false if reclaim has to run first.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
        unsigned long wmark;

        if (!gfp_compaction_allowed(sc->gfp_mask))
                return false;

        switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
        case COMPACT_SUCCESS:
                /* Allocation should succeed already. Don't reclaim. */
                return true;
        case COMPACT_SKIPPED:
                /* Compaction cannot yet proceed. Do reclaim. */
                return false;
        default:
                break;
        }

        /*
         * Compaction could run now, but it is slow and other allocators
         * may grab the pages that were just freed. Keep reclaiming to build
         * a buffer of free pages so compaction has a fair chance to finish
         * and hand out the page. A single attempt will not reclaim the
         * whole buffer, because should_continue_reclaim() aims at a lower
         * watermark. If the zone already sits above high + gap, there is
         * nothing to reclaim at all.
         */
        wmark = high_wmark_pages(zone) + compact_gap(sc->order);

        return zone_watermark_ok_safe(zone, 0, wmark, sc->reclaim_idx);
}

/*
 * Direct reclaim path, used by processes that are allocating pages. Only
 * zones able to satisfy the caller's allocation request are reclaimed from.
 *
 * A zone that appears to be full of pinned pages only gets a light scan
 * before we give up on it.
 */
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
        pg_data_t *last_pgdat = NULL;
        gfp_t saved_gfp_mask = sc->gfp_mask;
        struct zoneref *z;
        struct zone *zone;

        /*
         * When the machine holds more buffer_heads than allowed, make
         * direct reclaim scan the highmem zone too: highmem pages may be
         * pinning the lowmem pages that store the buffer_heads.
         */
        if (buffer_heads_over_limit) {
                sc->gfp_mask |= __GFP_HIGHMEM;
                sc->reclaim_idx = gfp_zone(sc->gfp_mask);
        }

        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        sc->reclaim_idx, sc->nodemask) {
                pg_data_t *pgdat = zone->zone_pgdat;

                /*
                 * The checks below apply to global reclaim only, so that
                 * memory controller reclaim has little influence on the
                 * global LRU.
                 */
                if (!cgroup_reclaim(sc)) {
                        unsigned long soft_reclaimed;
                        unsigned long soft_scanned = 0;

                        if (!cpuset_zone_allowed(zone,
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;

                        /*
                         * Do not free more memory from a zone that already
                         * has plenty free for compaction. Compaction runs
                         * for every non-zero order, but only frequent
                         * costly-order reclaim - transparent huge page
                         * allocations, for instance - is disruptive enough
                         * to be noticeable.
                         */
                        if (IS_ENABLED(CONFIG_COMPACTION) &&
                            sc->order > PAGE_ALLOC_COSTLY_ORDER &&
                            compaction_ready(zone, sc)) {
                                sc->compaction_ready = true;
                                continue;
                        }

                        /*
                         * Each node in the zonelist is shrunk once. With a
                         * zone-ordered zonelist (not the default) a node can
                         * be shrunk several times, but then the user prefers
                         * to preserve the lower zones anyway.
                         */
                        if (pgdat == last_pgdat)
                                continue;

                        /*
                         * Take pages from memory cgroups that exceed their
                         * soft limit. Both the reclaimed and the scanned
                         * page counts are reported back. This serves global
                         * memory pressure and balancing, not a memcg's own
                         * limit.
                         */
                        soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat,
                                                sc->order, sc->gfp_mask,
                                                &soft_scanned);
                        sc->nr_reclaimed += soft_reclaimed;
                        sc->nr_scanned += soft_scanned;
                        /* need some check for avoid more shrink_zone() */
                }

                /* Same once-per-node rule as in the global branch above */
                if (pgdat == last_pgdat)
                        continue;

                last_pgdat = pgdat;
                shrink_node(pgdat, sc);
        }

        /*
         * Put the caller's gfp mask back in case it was promoted to
         * __GFP_HIGHMEM above.
         */
        sc->gfp_mask = saved_gfp_mask;
}

/*
 * Store the current anon and file workingset-activation counters of the
 * lruvec belonging to @target_memcg on @pgdat in its refaults[] array
 * (index 0: anon, index 1: file).
 */
static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
{
        struct lruvec *lruvec = mem_cgroup_lruvec(target_memcg, pgdat);

        lruvec->refaults[0] = lruvec_page_state(lruvec,
                                                WORKINGSET_ACTIVATE_ANON);
        lruvec->refaults[1] = lruvec_page_state(lruvec,
                                                WORKINGSET_ACTIVATE_FILE);
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:        0, if no pages reclaimed
 *                 else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                          struct scan_control *sc)
{
        /* Remembered so that a retry can restart from the same priority */
        int initial_priority = sc->priority;
        pg_data_t *last_pgdat;
        struct zoneref *z;
        struct zone *zone;
retry:
        delayacct_freepages_start();

        /* Only non-cgroup reclaim is counted as an allocation stall */
        if (!cgroup_reclaim(sc))
                __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

        /*
         * Scan the zonelist repeatedly, decrementing sc->priority after
         * every pass, until enough pages were reclaimed, compaction is
         * ready to run, or the priority has gone below zero.
         */
        do {
                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
                                sc->priority);
                sc->nr_scanned = 0;
                shrink_zones(zonelist, sc);

                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                        break;

                if (sc->compaction_ready)
                        break;

                /*
                 * If we're getting trouble reclaiming, start doing
                 * writepage even in laptop mode.
                 */
                if (sc->priority < DEF_PRIORITY - 2)
                        sc->may_writepage = 1;
        } while (--sc->priority >= 0);

        /*
         * Post-processing, once per node in the zonelist: take a fresh
         * refault snapshot and, for cgroup reclaim, clear the congestion
         * flag on the target lruvec.
         */
        last_pgdat = NULL;
        for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
                                        sc->nodemask) {
                if (zone->zone_pgdat == last_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;

                snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);

                if (cgroup_reclaim(sc)) {
                        struct lruvec *lruvec;

                        lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
                                                   zone->zone_pgdat);
                        clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
                }
        }

        delayacct_freepages_end();

        if (sc->nr_reclaimed)
                return sc->nr_reclaimed;

        /* Aborted reclaim to try compaction? don't OOM, then */
        if (sc->compaction_ready)
                return 1;

        /*
         * We make inactive:active ratio decisions based on the node's
         * composition of memory, but a restrictive reclaim_idx or a
         * memory.low cgroup setting can exempt large amounts of
         * memory from reclaim. Neither of which are very common, so
         * instead of doing costly eligibility calculations of the
         * entire cgroup subtree up front, we assume the estimates are
         * good, and retry with forcible deactivation if that fails.
         */
        if (sc->skipped_deactivate) {
                sc->priority = initial_priority;
                sc->force_deactivate = 1;
                sc->skipped_deactivate = 0;
                goto retry;
        }

        /* Untapped cgroup reserves?  Don't OOM, retry. */
        if (sc->memcg_low_skipped) {
                sc->priority = initial_priority;
                /* This retry runs without forced deactivation again */
                sc->force_deactivate = 0;
                sc->memcg_low_reclaim = 1;
                sc->memcg_low_skipped = 0;
                goto retry;
        }

        /* Nothing reclaimed and no retry left */
        return 0;
}

/*
 * Check whether direct reclaimers may proceed on @pgdat without being
 * throttled.
 *
 * Returns true when kswapd has given up on the node, when the node has no
 * min-watermark reserve in its reclaimable zones up to ZONE_NORMAL, or when
 * the free pages in those zones exceed half of that reserve. Otherwise
 * kswapd is woken (if it is waiting) and false is returned.
 */
static bool allow_direct_reclaim(pg_data_t *pgdat)
{
        unsigned long reserve = 0;
        unsigned long nr_free = 0;
        int zid;

        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                return true;

        /* Sum reserve and free pages over usable zones up to ZONE_NORMAL */
        for (zid = 0; zid <= ZONE_NORMAL; zid++) {
                struct zone *zone = &pgdat->node_zones[zid];

                if (!managed_zone(zone) || !zone_reclaimable_pages(zone))
                        continue;

                reserve += min_wmark_pages(zone);
                nr_free += zone_page_state(zone, NR_FREE_PAGES);
        }

        /* If there are no reserves (unexpected config) then do not throttle */
        if (!reserve)
                return true;

        if (nr_free > reserve / 2)
                return true;

        /* kswapd must be awake if processes are being throttled */
        if (waitqueue_active(&pgdat->kswapd_wait)) {
                if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
                        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);

                wake_up_interruptible(&pgdat->kswapd_wait);
        }

        return false;
}

/*
 * Throttle direct reclaimers when the backing storage sits behind the
 * network and the PFMEMALLOC reserve of the preferred node is running
 * dangerously low. kswapd keeps making progress and wakes the throttled
 * processes once the low watermark is reached.
 *
 * Returns true if a fatal signal is pending after throttling, in which
 * case the page allocator should not consider triggering the OOM killer.
 * Returns false in every other case.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
{
        pg_data_t *pgdat = NULL;
        struct zoneref *z;
        struct zone *zone;

        /*
         * Never throttle kernel threads: they may be indirectly responsible
         * for cleaning the pages reclaim needs in order to make forward
         * progress. kjournald, for example, may enter direct reclaim while
         * committing a transaction, and throttling it could force other
         * processes to block on log_wait_commit().
         */
        if (current->flags & PF_KTHREAD)
                return false;

        /*
         * A process with a fatal signal pending should not be throttled;
         * let it return quickly so it can exit and free its memory.
         */
        if (fatal_signal_pending(current))
                return false;

        /*
         * Judge the pfmemalloc reserves on the first node that has a usable
         * ZONE_NORMAL or lower zone. Network buffers for swap-over-network
         * are expected to be allocated with GFP_KERNEL, which makes
         * ZONE_HIGHMEM unusable for them.
         *
         * Throttled processes sleep on that node's queue until kswapd makes
         * progress and wakes them, which gives an affinity between where a
         * process wakes up and where reclaim progressed (assuming it wakes
         * on the same node). More importantly, processes running on remote
         * nodes do not compete for remote pfmemalloc reserves, so processes
         * on different nodes should make reasonable progress.
         */
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(gfp_mask), nodemask) {
                if (zone_idx(zone) > ZONE_NORMAL)
                        continue;

                /* Throttle based on the first usable node */
                pgdat = zone->zone_pgdat;
                if (allow_direct_reclaim(pgdat))
                        return false;
                break;
        }

        /* If no zone was usable by the allocation flags then do not throttle */
        if (!pgdat)
                return false;

        /* Account for the throttling */
        count_vm_event(PGSCAN_DIRECT_THROTTLE);

        if (!(gfp_mask & __GFP_FS)) {
                /*
                 * A caller that cannot enter the filesystem may be holding
                 * an FS lock or be in the middle of a journal transaction
                 * (ext[3|4], for instance). Blocking on pfmemalloc_wait is
                 * then unsafe because kswapd could be waiting on the same
                 * lock, so throttle for at most a second and move on.
                 */
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
        } else {
                /*
                 * Throttle until kswapd wakes the process. pgdat is the
                 * node of the zone the loop above stopped on.
                 */
                wait_event_killable(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat));
        }

        /* Report whether a fatal signal is pending after the wait */
        if (fatal_signal_pending(current))
                return true;

        return false;
}

/*
 * Direct reclaim entry point: set up a scan control for the current task
 * and run do_try_to_free_pages() over @zonelist.
 *
 * @zonelist: zonelist handed to the throttle check and to reclaim
 * @order:    order recorded in the scan control and reported to tracing
 * @gfp_mask: allocation flags; filtered through current_gfp_context() for
 *            sc.gfp_mask and used via gfp_zone() to pick sc.reclaim_idx
 * @nodemask: node mask handed to the throttle check and stored in the
 *            scan control
 *
 * Returns whatever do_try_to_free_pages() returns, or 1 if
 * throttle_direct_reclaim() asked us not to enter reclaim.
 */
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                gfp_t gfp_mask, nodemask_t *nodemask)
{
        struct scan_control sc = {
                .gfp_mask = current_gfp_context(gfp_mask),
                .reclaim_idx = gfp_zone(gfp_mask),
                .nodemask = nodemask,
                .order = order,
                .priority = DEF_PRIORITY,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = 1,
        };
        unsigned long freed;

        /*
         * order, priority and reclaim_idx are s8 members of scan_control.
         * Break the build if their maximum values would not fit.
         */
        BUILD_BUG_ON(MAX_ORDER > S8_MAX);
        BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
        BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);

        /*
         * A fatal signal was delivered while throttled: stay out of
         * reclaim. Returning 1 keeps the page allocator from OOM killing
         * at this point.
         */
        if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                return 1;

        set_task_reclaim_state(current, &sc.reclaim_state);
        trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
        freed = do_try_to_free_pages(zonelist, &sc);
        trace_mm_vmscan_direct_reclaim_end(freed);
        set_task_reclaim_state(current, NULL);

        return freed;
}

#ifdef CONFIG_MEMCG

/*
 * Only used by soft limit reclaim. Do not reuse for anything else.
 *
 * Runs one shrink_lruvec() pass over the lruvec of @memcg on @pgdat.
 *
 * @memcg:      cgroup whose lruvec is shrunk; also the reclaim target
 * @gfp_mask:   caller flags; only the GFP_RECLAIM_MASK bits are kept, the
 *              remaining bits are taken from GFP_HIGHUSER_MOVABLE
 * @noswap:     when true, sc.may_swap is cleared
 * @pgdat:      node to reclaim from
 * @nr_scanned: out parameter, set to sc.nr_scanned after the pass
 *
 * Returns sc.nr_reclaimed after the pass.
 */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned)
{
        struct lruvec *target_lruvec = mem_cgroup_lruvec(memcg, pgdat);
        struct scan_control sc = {
                .target_mem_cgroup = memcg,
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                            (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
                .reclaim_idx = MAX_NR_ZONES - 1,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_writepage = !laptop_mode,
                .may_swap = !noswap,
                .may_unmap = 1,
        };

        /* The caller is expected to have installed a reclaim_state. */
        WARN_ON_ONCE(!current->reclaim_state);

        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                      sc.gfp_mask);

        /*
         * NOTE: a priority could be obtained here, but using it is not a
         * good idea because it limits how many pages can be scanned. If
         * nothing is reclaimed here, the shrink_node from balance_pgdat
         * will pick up pages from other mem cgroups as well. So the
         * priority is hacked to zero (sc.priority is left unset above).
         */
        shrink_lruvec(target_lruvec, &sc);

        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

        *nr_scanned = sc.nr_scanned;
        return sc.nr_reclaimed;
}

/*
 * Reclaim pages charged to @memcg by running do_try_to_free_pages() with
 * @memcg as the target cgroup.
 *
 * @memcg:    cgroup to reclaim from
 * @nr_pages: reclaim target; raised to at least SWAP_CLUSTER_MAX
 * @gfp_mask: caller flags; after current_gfp_context() only the
 *            GFP_RECLAIM_MASK bits are kept, the rest come from
 *            GFP_HIGHUSER_MOVABLE
 * @may_swap: copied into sc.may_swap
 *
 * Returns the value of do_try_to_free_pages().
 */
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           unsigned long nr_pages,
                                           gfp_t gfp_mask,
                                           bool may_swap)
{
        struct scan_control sc = {
                .target_mem_cgroup = memcg,
                .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
                .reclaim_idx = MAX_NR_ZONES - 1,
                .priority = DEF_PRIORITY,
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .may_writepage = !laptop_mode,
                .may_swap = may_swap,
                .may_unmap = 1,
        };
        /*
         * Walk the ZONELIST_FALLBACK zonelist of the current node so that
         * all nodes see equal pressure. This relies on the assumption that
         * reclaim does not bail out early.
         */
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        unsigned int saved_noreclaim;
        unsigned long freed;

        set_task_reclaim_state(current, &sc.reclaim_state);
        trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
        saved_noreclaim = memalloc_noreclaim_save();

        freed = do_try_to_free_pages(zonelist, &sc);

        memalloc_noreclaim_restore(saved_noreclaim);
        trace_mm_vmscan_memcg_reclaim_end(freed);
        set_task_reclaim_state(current, NULL);

        return freed;
}
#endif

/*
 * Age the active anon lists of every memory cgroup on @pgdat by one
 * SWAP_CLUSTER_MAX batch each. Does nothing when there is no swap or when
 * the node-level inactive anon list is not low.
 */
static void age_active_anon(struct pglist_data *pgdat,
                                struct scan_control *sc)
{
        struct mem_cgroup *cg;
        struct lruvec *lruvec;

        if (!total_swap_pages)
                return;

        /* Use the node-level (NULL memcg) lruvec for the "is low" check. */
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
        if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                return;

        /*
         * The body runs at least once, including when the first
         * mem_cgroup_iter() call yields NULL.
         */
        cg = mem_cgroup_iter(NULL, NULL, NULL);
        for (;;) {
                lruvec = mem_cgroup_lruvec(cg, pgdat);
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);

                cg = mem_cgroup_iter(NULL, cg, NULL);
                if (!cg)
                        break;
        }
}

/*
 * Returns true if any managed zone of @pgdat at or below @highest_zoneidx
 * has a non-zero watermark_boost.
 */
static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
        int idx = highest_zoneidx;

        /*
         * Walk top-down: higher zones are more likely to be boosted.
         * Watermarks and boosts must not be checked at the same time, as
         * reclaim would then start prematurely when there is no boosting
         * and a lower zone is balanced.
         */
        while (idx >= 0) {
                struct zone *zone = &pgdat->node_zones[idx--];

                if (managed_zone(zone) && zone->watermark_boost)
                        return true;
        }

        return false;
}

/*
 * Returns true if there is an eligible zone balanced for the request order
 * and highest_zoneidx, i.e. a managed zone at or below @highest_zoneidx
 * that passes zone_watermark_ok_safe() against its high watermark.
 * Also returns true when no zone in that range is managed at all.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
        /* Stays at -1 for as long as no managed zone has been seen. */
        unsigned long wmark = -1;
        int idx;

        /* Bottom-up: lower zones are more likely to meet watermarks. */
        for (idx = 0; idx <= highest_zoneidx; idx++) {
                struct zone *zone = &pgdat->node_zones[idx];

                if (managed_zone(zone)) {
                        wmark = high_wmark_pages(zone);
                        if (zone_watermark_ok_safe(zone, order, wmark,
                                                   highest_zoneidx))
                                return true;
                }
        }

        /*
         * A node without a populated zone within highest_zoneidx needs no
         * balancing by definition. This can happen when a zone-restricted
         * allocation tries to wake a remote kswapd.
         */
        return wmark == -1;
}

/*
 * Clear the congested, dirty and writeback state of @pgdat: the
 * LRUVEC_CONGESTED bit of the node-level lruvec plus the PGDAT_DIRTY and
 * PGDAT_WRITEBACK bits of the node itself.
 */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
        clear_bit(LRUVEC_CONGESTED, &mem_cgroup_lruvec(NULL, pgdat)->flags);
        clear_bit(PGDAT_DIRTY, &pgdat->flags);
        clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}

/*
 * Prepare kswapd for sleeping. Wakes anything waiting in
 * throttle_direct_reclaim() and then checks whether the node is either
 * hopeless or balanced for @order / @highest_zoneidx.
 *
 * Returns true if kswapd is ready to sleep.
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
                                int highest_zoneidx)
{
        /*
         * Throttled processes are normally woken in balance_pgdat() as
         * soon as allow_direct_reclaim() is true, but two races exist:
         *  - a process may get throttled right after kswapd checked the
         *    watermarks;
         *  - processes get throttled, kswapd wakes, a large process exits
         *    and thereby balances the zones, and kswapd leaves
         *    balance_pgdat() before reaching its wake-up checks.
         * If kswapd is about to sleep nobody should remain on
         * pfmemalloc_wait, so wake them now if necessary. A premature
         * wake-up is harmless: the processes wake kswapd and get throttled
         * again. Unlike the wake-ups in balance_pgdat(), this one happens
         * under prepare_to_wait().
         */
        if (waitqueue_active(&pgdat->pfmemalloc_wait))
                wake_up_all(&pgdat->pfmemalloc_wait);

        /* Hopeless node, leave it to direct reclaim */
        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                return true;

        if (!pgdat_balanced(pgdat, order, highest_zoneidx))
                return false;

        clear_pgdat_congested(pgdat);
        return true;
}

/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Sets sc->nr_to_reclaim, calls shrink_node() and may drop sc->order to 0.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat,
                               struct scan_control *sc)
{
        unsigned long target = 0;
        int idx;

        /* Reclaim a number of pages proportional to the number of zones */
        for (idx = 0; idx <= sc->reclaim_idx; idx++) {
                struct zone *zone = &pgdat->node_zones[idx];

                if (managed_zone(zone))
                        target += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
        }
        sc->nr_to_reclaim = target;

        /*
         * Pressure used to be spread equally over all zones; nowadays it
         * is applied in node LRU order.
         */
        shrink_node(pgdat, sc);

        /*
         * Fragmentation may make it impossible to rebalance the system for
         * high-order allocations. Once twice the allocation size has been
         * reclaimed, recheck watermarks at order-0 only, to prevent
         * excessive reclaim. The assumption is that a process which
         * requested a high-order page can direct reclaim/compact.
         */
        if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
                sc->order = 0;

        return sc->nr_scanned >= sc->nr_to_reclaim;
}

/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * @pgdat:           node to balance
 * @order:           order to balance for; stored in sc.order, which
 *                   kswapd_shrink_node() may later reset to 0
 * @highest_zoneidx: highest zone index considered when checking balance
 *                   and accounting watermark boosts
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
        int i;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        unsigned long pflags;
        unsigned long nr_boost_reclaim;
        unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
        bool boosted;
        struct zone *zone;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .order = order,
                .may_unmap = 1,
        };

        set_task_reclaim_state(current, &sc.reclaim_state);
        psi_memstall_enter(&pflags);
        __fs_reclaim_acquire();

        count_vm_event(PAGEOUTRUN);

        /*
         * Account for the reclaim boost. Note that the zone boost is left in
         * place so that parallel allocations that are near the watermark will
         * stall or direct reclaim until kswapd is finished.
         */
        nr_boost_reclaim = 0;
        for (i = 0; i <= highest_zoneidx; i++) {
                zone = pgdat->node_zones + i;
                if (!managed_zone(zone))
                        continue;

                nr_boost_reclaim += zone->watermark_boost;
                zone_boosts[i] = zone->watermark_boost;
        }
        /*
         * Remember whether any boost existed on entry; nr_boost_reclaim
         * itself is consumed (or zeroed) by the loop below.
         */
        boosted = nr_boost_reclaim;

restart:
        sc.priority = DEF_PRIORITY;
        do {
                /* Snapshot, turned into this iteration's delta further down */
                unsigned long nr_reclaimed = sc.nr_reclaimed;
                bool raise_priority = true;
                bool balanced;
                bool ret;

                sc.reclaim_idx = highest_zoneidx;

                /*
                 * If the number of buffer_heads exceeds the maximum allowed
                 * then consider reclaiming from all zones. This has a dual
                 * purpose -- on 64-bit systems it is expected that
                 * buffer_heads are stripped during active rotation. On 32-bit
                 * systems, highmem pages can pin lowmem memory and shrinking
                 * buffers can relieve lowmem pressure. Reclaim may still not
                 * go ahead if all eligible zones for the original allocation
                 * request are balanced to avoid excessive reclaim from kswapd.
                 */
                if (buffer_heads_over_limit) {
                        for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
                                zone = pgdat->node_zones + i;
                                if (!managed_zone(zone))
                                        continue;

                                sc.reclaim_idx = i;
                                break;
                        }
                }

                /*
                 * If the pgdat is imbalanced then ignore boosting and preserve
                 * the watermarks for a later time and restart. Note that the
                 * zone watermarks will be still reset at the end of balancing
                 * on the grounds that the normal reclaim should be enough to
                 * re-evaluate if boosting is required when kswapd next wakes.
                 */
                balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
                if (!balanced && nr_boost_reclaim) {
                        nr_boost_reclaim = 0;
                        goto restart;
                }

                /*
                 * If boosting is not active then only reclaim if there are no
                 * eligible zones. Note that sc.reclaim_idx is not used as
                 * buffer_heads_over_limit may have adjusted it.
                 */
                if (!nr_boost_reclaim && balanced)
                        goto out;

                /* Limit the priority of boosting to avoid reclaim writeback */
                if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
                        raise_priority = false;

                /*
                 * Do not writeback or swap pages for boosted reclaim. The
                 * intent is to relieve pressure not issue sub-optimal IO
                 * from reclaim context. If no pages are reclaimed, the
                 * reclaim will be aborted.
                 */
                sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
                sc.may_swap = !nr_boost_reclaim;

                /*
                 * Do some background aging of the anon list, to give
                 * pages a chance to be referenced before reclaiming. All
                 * pages are rotated regardless of classzone as this is
                 * about consistent aging.
                 */
                age_active_anon(pgdat, &sc);

                /*
                 * If we're getting trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
                if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;

                /* Call soft limit reclaim before calling shrink_node. */
                sc.nr_scanned = 0;
                nr_soft_scanned = 0;
                nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
                                                sc.gfp_mask, &nr_soft_scanned);
                sc.nr_reclaimed += nr_soft_reclaimed;

                /*
                 * There should be no need to raise the scanning priority if
                 * enough pages are already being scanned that the high
                 * watermark would be met at 100% efficiency.
                 */
                if (kswapd_shrink_node(pgdat, &sc))
                        raise_priority = false;

                /*
                 * If the low watermark is met there is no need for processes
                 * to be throttled on pfmemalloc_wait as they should now be
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                                allow_direct_reclaim(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);

                /*
                 * Check if kswapd should be suspending. The fs_reclaim
                 * annotation is dropped around try_to_freeze() and taken
                 * again afterwards.
                 */
                __fs_reclaim_release();
                ret = try_to_freeze();
                __fs_reclaim_acquire();
                if (ret || kthread_should_stop())
                        break;

                /*
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
                nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
                nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

                /*
                 * If reclaim made no progress for a boost, stop reclaim as
                 * IO cannot be queued and it could be an infinite loop in
                 * extreme circumstances.
                 */
                if (nr_boost_reclaim && !nr_reclaimed)
                        break;

                if (raise_priority || !nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1);

        /*
         * Nothing was reclaimed during this run: count a failure. Once the
         * count reaches MAX_RECLAIM_RETRIES, prepare_kswapd_sleep() and
         * wakeup_kswapd() treat the node as hopeless. The "goto out" path
         * above skips this accounting.
         */
        if (!sc.nr_reclaimed)
                pgdat->kswapd_failures++;

out:
        /* If reclaim was boosted, account for the reclaim done in this pass */
        if (boosted) {
                unsigned long flags;

                for (i = 0; i <= highest_zoneidx; i++) {
                        if (!zone_boosts[i])
                                continue;

                        /* Increments are under the zone lock */
                        zone = pgdat->node_zones + i;
                        spin_lock_irqsave(&zone->lock, flags);
                        /* min() keeps the subtraction from going below zero */
                        zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
                        spin_unlock_irqrestore(&zone->lock, flags);
                }

                /*
                 * As there is now likely space, wakeup kcompactd to defragment
                 * pageblocks.
                 */
                wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
        }

        snapshot_refaults(NULL, pgdat);
        __fs_reclaim_release();
        psi_memstall_leave(&pflags);
        set_task_reclaim_state(current, NULL);

        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller
         * entered the allocator slow path while kswapd was awake, order will
         * remain at the higher level.
         */
        return sc.order;
}

/*
 * pgdat->kswapd_highest_zoneidx carries the highest zone index that the
 * waker wants kswapd to reclaim. MAX_NR_ZONES is not a valid index; it
 * means either kswapd runs for the first time or kswapd could not sleep
 * after the previous reclaim attempt (node is still unbalanced). In that
 * case the zone index of the previous kswapd reclaim cycle,
 * @prev_highest_zoneidx, is returned instead.
 */
static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
                                           enum zone_type prev_highest_zoneidx)
{
        enum zone_type requested = READ_ONCE(pgdat->kswapd_highest_zoneidx);

        if (requested == MAX_NR_ZONES)
                return prev_highest_zoneidx;

        return requested;
}

/*
 * Try to put kswapd to sleep on pgdat->kswapd_wait.
 *
 * Sleeping is done in two stages: first a short timed sleep (HZ/10), and
 * only if that was not cut short and the node still looks ready for sleep,
 * an untimed schedule() until kswapd is explicitly woken.
 *
 * @pgdat:           node this kswapd serves
 * @alloc_order:     order passed to wakeup_kcompactd() before the short sleep
 * @reclaim_order:   order passed to prepare_kswapd_sleep(); also written back
 *                   to pgdat->kswapd_order after a premature wake-up if the
 *                   stored value is lower
 * @highest_zoneidx: zone index used for the sleep checks and kcompactd
 *
 * Returns immediately if the task is freezing or the kthread should stop.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                                unsigned int highest_zoneidx)
{
        /* Time left over from the short sleep; non-zero means woken early */
        long remaining = 0;
        DEFINE_WAIT(wait);

        if (freezing(current) || kthread_should_stop())
                return;

        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /*
         * Try to sleep for a short interval. Note that kcompactd will only be
         * woken if it is possible to sleep for a short interval. This is
         * deliberate on the assumption that if reclaim cannot keep an
         * eligible zone balanced that it's also unlikely that compaction will
         * succeed.
         */
        if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
                 * When kswapd is going to sleep, it is reasonable to assume
                 * that pages and compaction may succeed so reset the cache.
                 */
                reset_isolation_suitable(pgdat);

                /*
                 * We have freed the memory, now we should compact it to make
                 * allocation of the requested order possible.
                 */
                wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

                remaining = schedule_timeout(HZ/10);

                /*
                 * If woken prematurely then reset kswapd_highest_zoneidx and
                 * order. The values will either be from a wakeup request or
                 * the previous request that slept prematurely.
                 */
                if (remaining) {
                        WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
                                        kswapd_highest_zoneidx(pgdat,
                                                        highest_zoneidx));

                        if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
                                WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
                }

                /* Re-arm the wait entry before the second sleep check below */
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        }

        /*
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
        if (!remaining &&
            prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

                /*
                 * vmstat counters are not perfectly accurate and the estimated
                 * value for counters such as NR_FREE_PAGES can deviate from the
                 * true value by nr_online_cpus * threshold. To avoid the zone
                 * watermarks being breached while under pressure, we reduce the
                 * per-cpu vmstat threshold while kswapd is awake and restore
                 * them before going back to sleep.
                 */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

                if (!kthread_should_stop())
                        schedule();

                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        } else {
                /*
                 * No full sleep. Count which case it was: woken during the
                 * short sleep (remaining != 0), or the short sleep ran to
                 * completion but prepare_kswapd_sleep() then refused.
                 */
                if (remaining)
                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
                else
                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
        }
        finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 *
 * @p: the pg_data_t of the node this thread serves, passed as void *
 *
 * Loops until kthread_should_stop() is true, then clears the task flags
 * set at start-up and returns 0.
 */
static int kswapd(void *p)
{
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

        /* Restrict the thread to the node's CPUs, if the node has any */
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
        set_freezable();

        /*
         * Start with no pending request: order 0 and MAX_NR_ZONES, which
         * kswapd_highest_zoneidx() treats as "no index requested".
         */
        WRITE_ONCE(pgdat->kswapd_order, 0);
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
        for ( ; ; ) {
                bool ret;

                alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                        highest_zoneidx);

                /*
                 * Re-entered from the bottom of the loop when balance_pgdat()
                 * finished at a lower order than requested; in that case
                 * alloc_order and reclaim_order differ.
                 */
kswapd_try_sleep:
                kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                                        highest_zoneidx);

                /* Read the new order and highest_zoneidx */
                alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                        highest_zoneidx);
                /* Request consumed: reset the per-node request fields */
                WRITE_ONCE(pgdat->kswapd_order, 0);
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

                ret = try_to_freeze();
                if (kthread_should_stop())
                        break;

                /*
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
                if (ret)
                        continue;

                /*
                 * Reclaim begins at the requested order but if a high-order
                 * reclaim fails then kswapd falls back to reclaiming for
                 * order-0. If that happens, kswapd will consider sleeping
                 * for the order it finished reclaiming at (reclaim_order)
                 * but kcompactd is woken to compact for the original
                 * request (alloc_order).
                 */
                trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
                                                alloc_order);
                reclaim_order = balance_pgdat(pgdat, alloc_order,
                                                highest_zoneidx);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
        }

        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);

        return 0;
}

/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 *
 * @zone:            zone that triggered the wake-up; its pgdat is the target
 * @gfp_flags:       allocation flags, used for the cpuset check, tracing and
 *                   the __GFP_DIRECT_RECLAIM test
 * @order:           requested order; raises pgdat->kswapd_order if higher
 * @highest_zoneidx: requested zone index; raises
 *                   pgdat->kswapd_highest_zoneidx if higher or unset
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
                   enum zone_type highest_zoneidx)
{
        enum zone_type pending_idx;
        pg_data_t *pgdat;

        if (!managed_zone(zone) || !cpuset_zone_allowed(zone, gfp_flags))
                return;

        pgdat = zone->zone_pgdat;

        /* Record the request; MAX_NR_ZONES means nothing is pending yet. */
        pending_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
        if (pending_idx == MAX_NR_ZONES || pending_idx < highest_zoneidx)
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

        if (READ_ONCE(pgdat->kswapd_order) < order)
                WRITE_ONCE(pgdat->kswapd_order, order);

        /* Nobody is sleeping on kswapd_wait, so there is nobody to wake. */
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;

        /*
         * Wake kswapd unless the node is hopeless, or it is balanced and
         * carries no watermark boost.
         */
        if (pgdat->kswapd_failures < MAX_RECLAIM_RETRIES &&
            (!pgdat_balanced(pgdat, order, highest_zoneidx) ||
             pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
                trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx,
                                              order, gfp_flags);
                wake_up_interruptible(&pgdat->kswapd_wait);
                return;
        }

        /*
         * Hopeless node, leave it to direct reclaim if possible.
         *
         * There may be plenty of free memory available, but it's too
         * fragmented for high-order allocations.  Wake up kcompactd
         * and rely on compaction_suitable() to determine if it's
         * needed.  If it fails, it will defer subsequent attempts to
         * ratelimit its work.
         */
        if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
                wakeup_kcompactd(pgdat, order, highest_zoneidx);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 *
 * The scan control has hibernation_mode set and allows writepage, unmap
 * and swap.
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
        struct scan_control sc = {
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
                .reclaim_idx = MAX_NR_ZONES - 1,
                .priority = DEF_PRIORITY,
                .nr_to_reclaim = nr_to_reclaim,
                .hibernation_mode = 1,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        unsigned int saved_noreclaim;
        unsigned long freed;

        /* Enter: fs_reclaim annotation, noreclaim flag, reclaim state. */
        fs_reclaim_acquire(sc.gfp_mask);
        saved_noreclaim = memalloc_noreclaim_save();
        set_task_reclaim_state(current, &sc.reclaim_state);

        freed = do_try_to_free_pages(zonelist, &sc);

        /* Leave in the reverse order. */
        set_task_reclaim_state(current, NULL);
        memalloc_noreclaim_restore(saved_noreclaim);
        fs_reclaim_release(sc.gfp_mask);

        return freed;
}
#endif /* CONFIG_HIBERNATION */

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 *
 * Returns 0 if the node already has a kswapd or one was started, otherwise
 * the error code from kthread_run() (pgdat->kswapd is reset to NULL then).
 */
int kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        int err;

        if (pgdat->kswapd)
                return 0;

        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (!IS_ERR(pgdat->kswapd))
                return 0;

        /* failure at boot is fatal */
        BUG_ON(system_state < SYSTEM_RUNNING);
        pr_err("Failed to start kswapd on node %d\n", nid);
        err = PTR_ERR(pgdat->kswapd);
        pgdat->kswapd = NULL;

        return err;
}

/*
 * Stop the kswapd thread of node @nid, if there is one, and clear the
 * node's kswapd pointer.
 *
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
 * hold mem_hotplug_begin/end().
 */
void kswapd_stop(int nid)
{
        struct task_struct *task = NODE_DATA(nid)->kswapd;

        if (!task)
                return;

        kthread_stop(task);
        NODE_DATA(nid)->kswapd = NULL;
}

/*
 * Init-time entry point: run swap_setup(), then start kswapd on every node
 * in the N_MEMORY state.  The result of kswapd_run() is not checked here.
 */
static int __init kswapd_init(void)
{
        int node;

        swap_setup();
        for_each_node_state(node, N_MEMORY) {
                kswapd_run(node);
        }

        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.  In this file the value is tested as a bit mask against
 * RECLAIM_WRITE and RECLAIM_UNMAP.
 */
int node_reclaim_mode __read_mostly;

/*
 * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
 * ABI.  New bits are OK, but existing bits can never change.
 */
#define RECLAIM_ZONE  (1<<0)   /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */

/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.  Used as the starting scan priority in __node_reclaim().
 */
#define NODE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

/*
 * Estimate the number of unmapped file pages on @pgdat: pages on the file
 * LRU lists (inactive + active) minus NR_FILE_MAPPED, clamped at zero.
 */
static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
        unsigned long mapped = node_page_state(pgdat, NR_FILE_MAPPED);
        unsigned long on_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
                node_page_state(pgdat, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
         */
        if (on_lru <= mapped)
                return 0;

        return on_lru - mapped;
}

/*
 * Work out how many page cache pages we can reclaim in the current
 * node_reclaim_mode.  Never returns a wrapped-around value: if the excluded
 * count exceeds the candidate count the result is zero.
 */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
        unsigned long candidates;
        unsigned long excluded = 0;

        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
         * pages like swapcache and node_unmapped_file_pages() provides
         * a better estimate
         */
        candidates = (node_reclaim_mode & RECLAIM_UNMAP) ?
                node_page_state(pgdat, NR_FILE_PAGES) :
                node_unmapped_file_pages(pgdat);

        /* If we can't clean pages, remove dirty pages from consideration */
        if (!(node_reclaim_mode & RECLAIM_WRITE))
                excluded = node_page_state(pgdat, NR_FILE_DIRTY);

        /* Watch for any possible underflow caused by the exclusion */
        if (unlikely(excluded > candidates))
                return 0;

        return candidates - excluded;
}

/*
 * Try to free up some pages from this node through reclaim.
 *
 * Builds a scan_control from @gfp_mask, @order and node_reclaim_mode and,
 * if the node has more reclaimable page cache than min_unmapped_pages,
 * calls shrink_node() repeatedly, decrementing the priority each round,
 * until 1 << @order pages were reclaimed or the priority would go below 0.
 *
 * Returns non-zero if at least 1 << @order pages were reclaimed, else 0.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
        /*
         * Minimum pages needed in order to stay on node.  The shift is done
         * on an unsigned long so its width matches the variable it feeds.
         */
        const unsigned long nr_pages = 1UL << order;
        struct task_struct *p = current;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = current_gfp_context(gfp_mask),
                .order = order,
                .priority = NODE_RECLAIM_PRIORITY,
                .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
                .reclaim_idx = gfp_zone(gfp_mask),
        };

        trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
                                           sc.gfp_mask);

        cond_resched();
        fs_reclaim_acquire(sc.gfp_mask);
        /*
         * We need to be able to allocate from the reserves for RECLAIM_UNMAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_UNMAP.
         */
        noreclaim_flag = memalloc_noreclaim_save();
        p->flags |= PF_SWAPWRITE;
        set_task_reclaim_state(p, &sc.reclaim_state);

        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink node with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
                        shrink_node(pgdat, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }

        /* Undo the task state changes through the same task pointer. */
        set_task_reclaim_state(p, NULL);
        p->flags &= ~PF_SWAPWRITE;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);

        trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);

        return sc.nr_reclaimed >= nr_pages;
}

/*
 * Decide whether to reclaim from @pgdat and, if so, do it under the
 * PGDAT_RECLAIM_LOCKED bit.
 *
 * Returns NODE_RECLAIM_FULL or NODE_RECLAIM_NOSCAN when no reclaim was
 * attempted, otherwise the result of __node_reclaim().
 */
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
        int reclaimed;

        /*
         * Node reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the node is overallocated. So we do not reclaim
         * if less than a specified percentage of the node is used by
         * unmapped file backed pages.
         */
        if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
            node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
            pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;

        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!gfpflags_allow_blocking(gfp_mask))
                return NODE_RECLAIM_NOSCAN;
        if (current->flags & PF_MEMALLOC)
                return NODE_RECLAIM_NOSCAN;

        /*
         * Only run node reclaim on the local node or on nodes that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
                return NODE_RECLAIM_NOSCAN;

        /* Somebody else is already reclaiming this node. */
        if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
                return NODE_RECLAIM_NOSCAN;

        reclaimed = __node_reclaim(pgdat, gfp_mask, order);
        clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

        if (reclaimed == 0)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

        return reclaimed;
}
#endif

/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Walks every page in @pvec.  A page that is on the unevictable lru list but
 * is now evictable is moved to the matching evictable lru list.  This
 * function should be only used for lru pages.
 */
void check_move_unevictable_pages(struct pagevec *pvec)
{
        struct pglist_data *locked_pgdat = NULL;
        struct lruvec *lruvec;
        int nr_scanned = 0;
        int nr_rescued = 0;
        int idx;

        for (idx = 0; idx < pvec->nr; idx++) {
                struct page *page = pvec->pages[idx];
                struct pglist_data *page_node = page_pgdat(page);
                enum lru_list lru;
                int nr;

                /* Tail pages are skipped and not counted as scanned. */
                if (PageTransTail(page))
                        continue;

                nr = thp_nr_pages(page);
                nr_scanned += nr;

                /* Switch lru_lock when the page belongs to another node. */
                if (locked_pgdat != page_node) {
                        if (locked_pgdat)
                                spin_unlock_irq(&locked_pgdat->lru_lock);
                        locked_pgdat = page_node;
                        spin_lock_irq(&locked_pgdat->lru_lock);
                }
                lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);

                if (!(PageLRU(page) && PageUnevictable(page)))
                        continue;

                if (!page_evictable(page))
                        continue;

                lru = page_lru_base_type(page);

                VM_BUG_ON_PAGE(PageActive(page), page);
                ClearPageUnevictable(page);
                del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
                add_page_to_lru_list(page, lruvec, lru);
                nr_rescued += nr;
        }

        /* No lock was ever taken, so there is nothing to account or unlock. */
        if (!locked_pgdat)
                return;

        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_rescued);
        __count_vm_events(UNEVICTABLE_PGSCANNED, nr_scanned);
        spin_unlock_irq(&locked_pgdat->lru_lock);
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);





































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NFNETLINK_H
#define _NFNETLINK_H

#include <linux/netlink.h>
#include <linux/capability.h>
#include <net/netlink.h>
#include <uapi/linux/netfilter/nfnetlink.h>

/*
 * Handler description for one nfnetlink message type.
 *
 * Carries three handler pointers that share one signature, plus the
 * attribute policy and attribute count for the message.
 * NOTE(review): which of call/call_rcu/call_batch gets invoked is decided
 * by the dispatcher outside this header -- confirm there.
 */
struct nfnl_callback {
        int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb,
                    const struct nlmsghdr *nlh,
                    const struct nlattr * const cda[],
                    struct netlink_ext_ack *extack);
        int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb,
                        const struct nlmsghdr *nlh,
                        const struct nlattr * const cda[],
                        struct netlink_ext_ack *extack);
        int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb,
                          const struct nlmsghdr *nlh,
                          const struct nlattr * const cda[],
                          struct netlink_ext_ack *extack);
        const struct nla_policy *policy;        /* netlink attribute policy */
        const u_int16_t attr_count;                /* number of nlattr's */
};

/*
 * Action code passed to the abort() hook of struct nfnetlink_subsystem.
 * NOTE(review): the meaning of each value is defined by the callers and
 * the subsystem implementations, not by this header -- confirm there.
 */
enum nfnl_abort_action {
        NFNL_ABORT_NONE                = 0,
        NFNL_ABORT_AUTOLOAD,
        NFNL_ABORT_VALIDATE,
};

/*
 * Description of one nfnetlink subsystem, as handed to
 * nfnetlink_subsys_register() / nfnetlink_subsys_unregister().
 */
struct nfnetlink_subsystem {
        const char *name;
        __u8 subsys_id;                        /* nfnetlink subsystem ID */
        __u8 cb_count;                        /* number of callbacks */
        const struct nfnl_callback *cb;        /* callback for individual types */
        struct module *owner;
        /* presumably batch commit/abort hooks -- TODO confirm with dispatcher */
        int (*commit)(struct net *net, struct sk_buff *skb);
        int (*abort)(struct net *net, struct sk_buff *skb,
                     enum nfnl_abort_action action);
        bool (*valid_genid)(struct net *net, u32 genid);
};

/* Subsystem registration; only declared here, defined elsewhere. */
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);

/* Message delivery helpers; only declared here, defined elsewhere. */
int nfnetlink_has_listeners(struct net *net, unsigned int group);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
                   unsigned int group, int echo, gfp_t flags);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid);

/*
 * Build a 16-bit nfnetlink message type: @subsys goes into the upper byte
 * and @msg_type into the lower byte.
 */
static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
{
        return subsys << 8 | msg_type;
}

/*
 * Fill in the struct nfgenmsg located at the payload of @nlh with
 * @family, @version and @res_id.
 */
static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version,
                                 __be16 res_id)
{
        struct nfgenmsg *hdr = nlmsg_data(nlh);

        hdr->nfgen_family = family;
        hdr->version = version;
        hdr->res_id = res_id;
}

/*
 * Add a netlink message with room for a struct nfgenmsg to @skb via
 * nlmsg_put() and fill the nfgenmsg with @family, @version and @res_id.
 *
 * Returns the new message header, or NULL if nlmsg_put() returned NULL.
 */
static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid,
                                            u32 seq, int type, int flags,
                                            u8 family, u8 version,
                                            __be16 res_id)
{
        struct nlmsghdr *hdr = nlmsg_put(skb, portid, seq, type,
                                         sizeof(struct nfgenmsg), flags);

        if (hdr)
                nfnl_fill_hdr(hdr, family, version, res_id);

        return hdr;
}

/* Per-subsystem lock/unlock; only declared here, defined elsewhere. */
void nfnl_lock(__u8 subsys_id);
void nfnl_unlock(__u8 subsys_id);
#ifdef CONFIG_PROVE_LOCKING
bool lockdep_nfnl_is_held(__u8 subsys_id);
#else
/* Without CONFIG_PROVE_LOCKING the check is a stub that always says "held". */
static inline bool lockdep_nfnl_is_held(__u8 subsys_id)
{
        return true;
}
#endif /* CONFIG_PROVE_LOCKING */

#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
        MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))

#endif        /* _NFNETLINK_H */










































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H

/* Key number returned by execute_only_pkey() when OSPKE is not available. */
#define ARCH_DEFAULT_PKEY        0

/*
 * If more than 16 keys are ever supported, a thorough audit
 * will be necessary to ensure that the types that store key
 * numbers and masks have sufficient capacity.
 *
 * Evaluates to 16 when the CPU reports X86_FEATURE_OSPKE, otherwise 1.
 */
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)

extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

/* Report whether the boot CPU has the X86_FEATURE_OSPKE feature bit set. */
static inline bool arch_pkeys_enabled(void)
{
        return boot_cpu_has(X86_FEATURE_OSPKE);
}

/*
 * Try to dedicate one of the protection keys to be used as an
 * execute-only protection key.
 *
 * Without X86_FEATURE_OSPKE this returns ARCH_DEFAULT_PKEY; otherwise the
 * decision is delegated to __execute_only_pkey().
 */
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
        return boot_cpu_has(X86_FEATURE_OSPKE) ? __execute_only_pkey(mm)
                                               : ARCH_DEFAULT_PKEY;
}

extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey);
/*
 * Returns 0 when X86_FEATURE_OSPKE is absent; otherwise forwards to
 * __arch_override_mprotect_pkey() and returns its result.
 */
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey)
{
        if (boot_cpu_has(X86_FEATURE_OSPKE))
                return __arch_override_mprotect_pkey(vma, prot, pkey);

        return 0;
}

extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

/* All four VMA flag bits that together encode a protection key. */
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)

/*
 * Accessors for the per-mm pkey allocation bitmap (bit N set means pkey N
 * is allocated).  Macro arguments are parenthesized so that expression
 * arguments such as "p + 1" or "a | b" expand with the intended precedence.
 */
#define mm_pkey_allocation_map(mm)        ((mm)->context.pkey_allocation_map)
#define mm_set_pkey_allocated(mm, pkey) do {                \
        mm_pkey_allocation_map(mm) |= (1U << (pkey));        \
} while (0)
#define mm_set_pkey_free(mm, pkey) do {                        \
        mm_pkey_allocation_map(mm) &= ~(1U << (pkey));        \
} while (0)

/*
 * Report whether @pkey is allocated in @mm and usable through the user
 * interfaces.
 */
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
        /*
         * "Allocated" pkeys are those that have been returned
         * from pkey_alloc() or pkey 0 which is allocated
         * implicitly when the mm is created.  Anything outside
         * [0, arch_max_pkey()) can never be allocated.
         */
        if (pkey < 0 || pkey >= arch_max_pkey())
                return false;

        /*
         * The exec-only pkey is set in the allocation map, but
         * is not available to any of the user interfaces like
         * mprotect_pkey().
         */
        if (mm->context.execute_only_pkey == pkey)
                return false;

        return mm_pkey_allocation_map(mm) & (1U << pkey);
}

/*
 * Allocate the lowest-numbered free key in @mm's allocation map.
 *
 * Returns a positive, 4-bit key on success, or -1 on failure.
 */
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
        /*
         * Note: this is the one and only place we make sure
         * that the pkey is valid as far as the hardware is
         * concerned.  The rest of the kernel trusts that
         * only good, valid pkeys come out of here.
         */
        u16 full_mask = ((1U << arch_max_pkey()) - 1);
        int pkey = -1;

        /*
         * Only search when at least one key is free: ffz() behavior
         * is undefined if there are no zeros.
         */
        if (mm_pkey_allocation_map(mm) != full_mask) {
                pkey = ffz(mm_pkey_allocation_map(mm));
                mm_set_pkey_allocated(mm, pkey);
        }

        return pkey;
}

/*
 * Release @pkey in @mm's allocation map.
 *
 * Returns 0 on success, or -EINVAL if mm_pkey_is_allocated() rejects @pkey.
 */
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
        if (mm_pkey_is_allocated(mm, pkey)) {
                mm_set_pkey_free(mm, pkey);
                return 0;
        }

        return -EINVAL;
}

/*
 * NOTE(review): the two *_set_user_pkey_access() prototypes below repeat
 * declarations that already appear earlier in this header.
 */
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);
extern void copy_init_pkru_to_fpregs(void);

/*
 * Extract the protection key stored in the VM_PKEY_BIT0..3 bits of
 * @vma->vm_flags.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
        unsigned long pkey_bits = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
                                  VM_PKEY_BIT2 | VM_PKEY_BIT3;

        /* Isolate the key bits, then shift them down to bit 0. */
        return (vma->vm_flags & pkey_bits) >> VM_PKEY_SHIFT;
}

#endif /*_ASM_X86_PKEYS_H */









































    5 





























   10 














    5 














    6 
















    1 
















    2 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This header provides generic wrappers for memory access instrumentation that
 * the compiler cannot emit for: KASAN, KCSAN.
 */
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
#include <linux/types.h>

/**
 * instrument_read - instrument regular read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read access. The instrumentation should be inserted
 * before the actual read happens.  Forwards @v and @size to
 * kasan_check_read() and kcsan_check_read().
 */
static __always_inline void instrument_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_read(v, size);
}

/**
 * instrument_write - instrument regular write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular write access. The instrumentation should be inserted
 * before the actual write happens.  Forwards @v and @size to
 * kasan_check_write() and kcsan_check_write().
 */
static __always_inline void instrument_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_write(v, size);
}

/**
 * instrument_read_write - instrument regular read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read-write access. The instrumentation should be
 * inserted before the actual write happens.  Forwards @v and @size to
 * kasan_check_write() and kcsan_check_read_write().
 */
static __always_inline void instrument_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_read_write(v, size);
}

/**
 * instrument_atomic_read - instrument atomic read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read access. The instrumentation should be inserted
 * before the actual read happens.  Forwards @v and @size to
 * kasan_check_read() and kcsan_check_atomic_read().
 */
static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_atomic_read(v, size);
}

/**
 * instrument_atomic_write - instrument atomic write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic write access. The instrumentation should be inserted
 * before the actual write happens.  Forwards @v and @size to
 * kasan_check_write() and kcsan_check_atomic_write().
 */
static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_write(v, size);
}

/**
 * instrument_atomic_read_write - instrument atomic read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read-write access. The instrumentation should be
 * inserted before the actual write happens.  Forwards @v and @size to
 * kasan_check_write() and kcsan_check_atomic_read_write().
 */
static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_read_write(v, size);
}

/**
 * instrument_copy_to_user - instrument reads of copy_to_user
 * @to: destination address (not used by the instrumentation itself)
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument reads from kernel memory, that are due to copy_to_user (and
 * variants). The instrumentation must be inserted before the accesses.
 * Only the kernel-side source range (@from, @n) is checked.
 */
static __always_inline void
instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        kasan_check_read(from, n);
        kcsan_check_read(from, n);
}

/**
 * instrument_copy_from_user - instrument writes of copy_from_user
 * @to: destination address
 * @from: source address (not used by the instrumentation itself)
 * @n: number of bytes to copy
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted before the accesses.
 * Only the kernel-side destination range (@to, @n) is checked.
 */
static __always_inline void
instrument_copy_from_user(const void *to, const void __user *from, unsigned long n)
{
        kasan_check_write(to, n);
        kcsan_check_write(to, n);
}

#endif /* _LINUX_INSTRUMENTED_H */













































    1 


















    1 














































    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kref.h - library routines for handling generic reference counted objects
 *
 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Corp.
 *
 * based on kobject.h which was:
 * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (C) 2002-2003 Open Source Development Labs
 */

#ifndef _KREF_H_
#define _KREF_H_

#include <linux/spinlock.h>
#include <linux/refcount.h>

/* Reference counter object: a thin wrapper around a refcount_t. */
struct kref {
        refcount_t refcount;
};

/* Static initializer for a struct kref whose count starts at @n. */
#define KREF_INIT(n)        { .refcount = REFCOUNT_INIT(n), }

/**
 * kref_init - initialize object.
 * @kref: object in question.
 *
 * Sets the reference count of @kref to 1.
 */
static inline void kref_init(struct kref *kref)
{
        refcount_set(&kref->refcount, 1);
}

/**
 * kref_read - read the current reference count of an object.
 * @kref: object.
 *
 * Return the value reported by refcount_read() for @kref's counter.
 */
static inline unsigned int kref_read(const struct kref *kref)
{
        return refcount_read(&kref->refcount);
}

/**
 * kref_get - increment refcount for object.
 * @kref: object.
 *
 * Increments the counter via refcount_inc(); there is no return value.
 */
static inline void kref_get(struct kref *kref)
{
        refcount_inc(&kref->refcount);
}

/**
 * kref_put - decrement refcount for object.
 * @kref: object.
 * @release: pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 *             This pointer is required, and it is not acceptable to pass kfree
 *             in as this function.
 *
 * Decrement the refcount, and if it reaches 0, call release().
 * Return 1 if the object was removed, otherwise return 0.  Beware: a return
 * value of 0 does not guarantee that the kref is still in memory.  Only use
 * the return value to learn that the kref is now gone, never to conclude
 * that it is still present.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
        if (!refcount_dec_and_test(&kref->refcount))
                return 0;

        release(kref);
        return 1;
}

/*
 * Like kref_put(), but the decrement goes through
 * refcount_dec_and_mutex_lock() with @lock; release() runs only when that
 * call reports success.  Returns 1 if release() was called, otherwise 0.
 */
static inline int kref_put_mutex(struct kref *kref,
                                 void (*release)(struct kref *kref),
                                 struct mutex *lock)
{
        if (!refcount_dec_and_mutex_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/*
 * Like kref_put(), but the decrement goes through refcount_dec_and_lock()
 * with the spinlock @lock; release() runs only when that call reports
 * success.  Returns 1 if release() was called, otherwise 0.
 */
static inline int kref_put_lock(struct kref *kref,
                                void (*release)(struct kref *kref),
                                spinlock_t *lock)
{
        if (!refcount_dec_and_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_get_unless_zero - Increment refcount for object unless it is zero.
 * @kref: object.
 *
 * Return non-zero if the increment succeeded. Otherwise return 0.
 * The result comes straight from refcount_inc_not_zero() and, because of
 * __must_check, callers are expected to test it.
 *
 * This function is intended to simplify locking around refcounting for
 * objects that can be looked up from a lookup structure, and which are
 * removed from that lookup structure in the object destructor.
 * Operations on such objects require at least a read lock around
 * lookup + kref_get, and a write lock around kref_put + remove from lookup
 * structure. Furthermore, RCU implementations become extremely tricky.
 * With a lookup followed by a kref_get_unless_zero *with return value check*
 * locking in the kref_put path can be deferred to the actual removal from
 * the lookup structure and RCU lookups become trivial.
 */
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
        return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * async.h: Asynchronous function calls for boot performance
 *
 * (C) Copyright 2009 Intel Corporation
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 */
#ifndef __ASYNC_H__
#define __ASYNC_H__

#include <linux/types.h>
#include <linux/list.h>
#include <linux/numa.h>
#include <linux/device.h>

/* Cookie type returned by the async_schedule*() family. */
typedef u64 async_cookie_t;
/* Signature of a function that can be scheduled asynchronously. */
typedef void (*async_func_t) (void *data, async_cookie_t cookie);
/*
 * A synchronization domain: a list head named "pending" plus a one-bit
 * "registered" flag (set by ASYNC_DOMAIN, clear for ASYNC_DOMAIN_EXCLUSIVE).
 */
struct async_domain {
        struct list_head pending;
        unsigned registered:1;
};

/*
 * Define a domain with registered = 1:
 * domain participates in global async_synchronize_full
 */
#define ASYNC_DOMAIN(_name) \
        struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending),        \
                                      .registered = 1 }

/*
 * Define a domain with registered = 0:
 * domain is free to go out of scope as soon as all pending work is
 * complete, this domain does not participate in async_synchronize_full
 */
#define ASYNC_DOMAIN_EXCLUSIVE(_name) \
        struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \
                                      .registered = 0 }

/*
 * Node-aware scheduling primitives; only declared here, defined elsewhere.
 * The inline async_schedule*() helpers in this header are wrappers around
 * these two.
 */
async_cookie_t async_schedule_node(async_func_t func, void *data,
                                   int node);
async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
                                          int node,
                                          struct async_domain *domain);

/**
 * async_schedule - schedule a function for asynchronous execution
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 *
 * Wrapper around async_schedule_node() that passes NUMA_NO_NODE as the node.
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t async_schedule(async_func_t func, void *data)
{
        return async_schedule_node(func, data, NUMA_NO_NODE);
}

/**
 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 * @domain: the domain
 *
 * Wrapper around async_schedule_node_domain() that passes NUMA_NO_NODE as
 * the node.
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @domain may be used in the async_synchronize_*_domain() functions to
 * wait within a certain synchronization domain rather than globally.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_domain(async_func_t func, void *data,
                      struct async_domain *domain)
{
        return async_schedule_node_domain(func, data, NUMA_NO_NODE, domain);
}

/**
 * async_schedule_dev - A device specific version of async_schedule
 * @func: function to execute asynchronously
 * @dev: device argument to be passed to function
 *
 * Wrapper around async_schedule_node() that passes @dev as the data pointer
 * and dev_to_node(@dev) as the node.
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @dev is used as both the argument for the function and to provide NUMA
 * context for where to run the function. By doing this we can try to
 * provide for the best possible outcome by operating on the device on the
 * CPUs closest to the device.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_dev(async_func_t func, struct device *dev)
{
        return async_schedule_node(func, dev, dev_to_node(dev));
}

/*
 * Only declared here, defined elsewhere.
 * NOTE(review): meaning of the bool result is not visible in this header --
 * confirm against the definition.
 */
bool async_schedule_dev_nocall(async_func_t func, struct device *dev);

/**
 * async_schedule_dev_domain - A device specific version of async_schedule_domain
 * @func: function to execute asynchronously
 * @dev: device argument to be passed to function
 * @domain: the domain
 *
 * Wrapper around async_schedule_node_domain() that passes @dev as the data
 * pointer and dev_to_node(@dev) as the node.
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @dev is used as both the argument for the function and to provide NUMA
 * context for where to run the function. By doing this we can try to
 * provide for the best possible outcome by operating on the device on the
 * CPUs closest to the device.
 * @domain may be used in the async_synchronize_*_domain() functions to
 * wait within a certain synchronization domain rather than globally.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_dev_domain(async_func_t func, struct device *dev,
                          struct async_domain *domain)
{
        return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
}

/* Domain teardown and synchronization entry points; defined elsewhere. */
void async_unregister_domain(struct async_domain *domain);
extern void async_synchronize_full(void);
extern void async_synchronize_full_domain(struct async_domain *domain);
extern void async_synchronize_cookie(async_cookie_t cookie);
extern void async_synchronize_cookie_domain(async_cookie_t cookie,
                                            struct async_domain *domain);
/* NOTE(review): presumably true when called from an async worker -- confirm. */
extern bool current_is_async(void);
#endif





















































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PARAVIRT_H
#define _ASM_X86_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
#include <asm/nospec-branch.h>

#include <asm/paravirt_types.h>

#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/frame.h>

/*
 * Return the result of the time.sched_clock hook, invoked with no
 * arguments through PVOP_CALL0.
 */
static inline unsigned long long paravirt_sched_clock(void)
{
        return PVOP_CALL0(unsigned long long, time.sched_clock);
}

struct static_key;
extern struct static_key paravirt_steal_enabled;
extern struct static_key paravirt_steal_rq_enabled;

__visible void __native_queued_spin_unlock(struct qspinlock *lock);
bool pv_is_native_spin_unlock(void);
__visible bool __native_vcpu_is_preempted(long cpu);
bool pv_is_native_vcpu_is_preempted(void);

/* Return the u64 result of the time.steal_clock hook, called with @cpu. */
static inline u64 paravirt_steal_clock(int cpu)
{
        return PVOP_CALL1(u64, time.steal_clock, cpu);
}

/*
 * The paravirtualized I/O functions.
 *
 * slow_down_io() calls pv_ops.cpu.io_delay() once, or four times in a row
 * when REALLY_SLOW_IO is defined.
 */
static inline void slow_down_io(void)
{
#ifdef REALLY_SLOW_IO
        int remaining = 4;
#else
        int remaining = 1;
#endif

        while (remaining--)
                pv_ops.cpu.io_delay();
}

void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info);

/*
 * Invoke the mmu.flush_tlb_user hook (no arguments, no return value).
 * Note that the hook name differs from this wrapper's name.
 */
static inline void __flush_tlb_local(void)
{
        PVOP_VCALL0(mmu.flush_tlb_user);
}

/*
 * Invoke the mmu.flush_tlb_kernel hook (no arguments, no return value).
 * Note that the hook name differs from this wrapper's name.
 */
static inline void __flush_tlb_global(void)
{
        PVOP_VCALL0(mmu.flush_tlb_kernel);
}

/* Pass @addr to the mmu.flush_tlb_one_user hook. */
static inline void __flush_tlb_one_user(unsigned long addr)
{
        PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}

/* Pass @cpumask and @info unchanged to the mmu.flush_tlb_others hook. */
static inline void __flush_tlb_others(const struct cpumask *cpumask,
                                      const struct flush_tlb_info *info)
{
        PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
}

/* Pass @tlb and @table unchanged to the mmu.tlb_remove_table hook. */
static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
}

/* Pass @mm to the mmu.exit_mmap hook. */
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
        PVOP_VCALL1(mmu.exit_mmap, mm);
}

#ifdef CONFIG_PARAVIRT_XXL
/* Pass @sp0 to the cpu.load_sp0 hook. */
static inline void load_sp0(unsigned long sp0)
{
        PVOP_VCALL1(cpu.load_sp0, sp0);
}

/*
 * The paravirtualized CPUID instruction.
 *
 * All four register pointers are handed to the cpu.cpuid hook as-is.
 * NOTE(review): presumably the hook reads inputs from and writes results
 * to these pointers -- confirm against the hook implementation.
 */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
                           unsigned int *ecx, unsigned int *edx)
{
        PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register.
 *
 * paravirt_get_debugreg() returns the unsigned long result of the
 * cpu.get_debugreg hook, called with @reg.
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
        return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
/*
 * Assigns paravirt_get_debugreg(@reg) to @var.  The expansion is a bare
 * assignment, so use it as a standalone statement.
 */
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
/*
 * Forward to the cpu.set_debugreg hook.  The hook receives (reg, val),
 * i.e. the reverse of this wrapper's (val, reg) parameter order.
 */
static inline void set_debugreg(unsigned long val, int reg)
{
        PVOP_VCALL2(cpu.set_debugreg, reg, val);
}

/* Return the unsigned long result of the cpu.read_cr0 hook. */
static inline unsigned long read_cr0(void)
{
        return PVOP_CALL0(unsigned long, cpu.read_cr0);
}

/* Pass @x to the cpu.write_cr0 hook. */
static inline void write_cr0(unsigned long x)
{
        PVOP_VCALL1(cpu.write_cr0, x);
}

/*
 * Return the unsigned long result of the mmu.read_cr2 hook.  Unlike the
 * other control-register readers here, this one goes through the
 * callee-save variant PVOP_CALLEE0.
 */
static inline unsigned long read_cr2(void)
{
        return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
}

/* Pass @x to the mmu.write_cr2 hook. */
static inline void write_cr2(unsigned long x)
{
        PVOP_VCALL1(mmu.write_cr2, x);
}

/* Return the unsigned long result of the mmu.read_cr3 hook. */
static inline unsigned long __read_cr3(void)
{
        return PVOP_CALL0(unsigned long, mmu.read_cr3);
}

/* Pass @x to the mmu.write_cr3 hook. */
static inline void write_cr3(unsigned long x)
{
        PVOP_VCALL1(mmu.write_cr3, x);
}

/* Pass @x to the cpu.write_cr4 hook. */
static inline void __write_cr4(unsigned long x)
{
        PVOP_VCALL1(cpu.write_cr4, x);
}

/* Invoke the irq.safe_halt hook (no arguments, no return value). */
static inline void arch_safe_halt(void)
{
        PVOP_VCALL0(irq.safe_halt);
}

/* Invoke the irq.halt hook (no arguments, no return value). */
static inline void halt(void)
{
        PVOP_VCALL0(irq.halt);
}

/* Invoke the cpu.wbinvd hook (no arguments, no return value). */
static inline void wbinvd(void)
{
        PVOP_VCALL0(cpu.wbinvd);
}

/* Return the u64 result of the cpu.read_msr hook, called with @msr. */
static inline u64 paravirt_read_msr(unsigned msr)
{
        return PVOP_CALL1(u64, cpu.read_msr, msr);
}

/*
 * Pass @msr and the value, split into its @low and @high halves, to the
 * cpu.write_msr hook.
 */
static inline void paravirt_write_msr(unsigned msr,
                                      unsigned low, unsigned high)
{
        PVOP_VCALL3(cpu.write_msr, msr, low, high);
}

/*
 * Return the u64 result of the cpu.read_msr_safe hook for @msr.  @err is
 * handed to the hook; callers in this header read it back as the status
 * of the access.
 */
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
{
        return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
}

/*
 * Pass @msr, @low and @high to the cpu.write_msr_safe hook and return the
 * hook's int result.
 */
static inline int paravirt_write_msr_safe(unsigned msr,
                                          unsigned low, unsigned high)
{
        return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high);
}

/*
 * Read @msr via paravirt_read_msr() and split the 64-bit result: the low
 * 32 bits are assigned to @val1 and the high 32 bits to @val2.  Both must
 * be lvalues.  The temporary is named _l, so arguments must not use it.
 */
#define rdmsr(msr, val1, val2)                        \
do {                                                \
        u64 _l = paravirt_read_msr(msr);        \
        val1 = (u32)_l;                                \
        val2 = _l >> 32;                        \
} while (0)

/* Statement wrapper: forwards (@msr, @val1, @val2) to paravirt_write_msr(). */
#define wrmsr(msr, val1, val2)                        \
do {                                                \
        paravirt_write_msr(msr, val1, val2);        \
} while (0)

/* Assign the full result of paravirt_read_msr(@msr) to the lvalue @val. */
#define rdmsrl(msr, val)                        \
do {                                                \
        val = paravirt_read_msr(msr);                \
} while (0)

/*
 * Write the 64-bit @val to @msr by handing its low and high 32-bit halves
 * to wrmsr().
 */
static inline void wrmsrl(unsigned msr, u64 val)
{
        const u32 lo = (u32)val;
        const u32 hi = (u32)(val >> 32);

        wrmsr(msr, lo, hi);
}

/* Expands to paravirt_write_msr_safe(@msr, @a, @b) and yields its int result. */
#define wrmsr_safe(msr, a, b)        paravirt_write_msr_safe(msr, a, b)

/*
 * rdmsr with exception handling.
 *
 * Reads @msr through paravirt_read_msr_safe(), stores the low 32 bits of
 * the result through pointer @a and the high 32 bits through pointer @b,
 * and evaluates to the error code reported by paravirt_read_msr_safe().
 *
 * @a and @b are parenthesized before being dereferenced so that pointer
 * expressions such as "buf + 1" are dereferenced as a whole.
 */
#define rdmsr_safe(msr, a, b)                                \
({                                                        \
        int _err;                                        \
        u64 _l = paravirt_read_msr_safe(msr, &_err);        \
        *(a) = (u32)_l;                                        \
        *(b) = _l >> 32;                                \
        _err;                                                \
})

static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
{
        int err;

        *p = paravirt_read_msr_safe(msr, &err);
        return err;
}

/*
 * Return the result of the cpu.read_pmc hook for @counter.  The hook
 * result is typed u64 and is returned as unsigned long long.
 */
static inline unsigned long long paravirt_read_pmc(int counter)
{
        return PVOP_CALL1(u64, cpu.read_pmc, counter);
}

/*
 * Read @counter via paravirt_read_pmc() and split the 64-bit result: the
 * low 32 bits go to the lvalue @low, the high 32 bits to the lvalue @high.
 */
#define rdpmc(counter, low, high)                \
do {                                                \
        u64 _l = paravirt_read_pmc(counter);        \
        low = (u32)_l;                                \
        high = _l >> 32;                        \
} while (0)

/* Assign the full result of paravirt_read_pmc(@counter) to the lvalue @val. */
#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))

/* Pass @ldt and @entries to the cpu.alloc_ldt hook. */
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
        PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
}

/* Pass @ldt and @entries to the cpu.free_ldt hook. */
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
        PVOP_VCALL2(cpu.free_ldt, ldt, entries);
}

/* Invoke the cpu.load_tr_desc hook (no arguments, no return value). */
static inline void load_TR_desc(void)
{
        PVOP_VCALL0(cpu.load_tr_desc);
}
/* Pass the descriptor pointer @dtr to the cpu.load_gdt hook. */
static inline void load_gdt(const struct desc_ptr *dtr)
{
        PVOP_VCALL1(cpu.load_gdt, dtr);
}
/* Pass the descriptor pointer @dtr to the cpu.load_idt hook. */
static inline void load_idt(const struct desc_ptr *dtr)
{
        PVOP_VCALL1(cpu.load_idt, dtr);
}
/* Pass @addr and @entries to the cpu.set_ldt hook. */
static inline void set_ldt(const void *addr, unsigned entries)
{
        PVOP_VCALL2(cpu.set_ldt, addr, entries);
}
/* Return the unsigned long result of the cpu.store_tr hook. */
static inline unsigned long paravirt_store_tr(void)
{
        return PVOP_CALL0(unsigned long, cpu.store_tr);
}

/* Assign the result of paravirt_store_tr() to the lvalue @tr. */
#define store_tr(tr)        ((tr) = paravirt_store_tr())
/* Pass thread state @t and @cpu to the cpu.load_tls hook. */
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
        PVOP_VCALL2(cpu.load_tls, t, cpu);
}

/* Pass @gs to the cpu.load_gs_index hook. */
static inline void load_gs_index(unsigned int gs)
{
        PVOP_VCALL1(cpu.load_gs_index, gs);
}

/* Pass table @dt, index @entry and descriptor @desc to the cpu.write_ldt_entry hook. */
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
                                   const void *desc)
{
        PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc);
}

/*
 * Pass table @dt, index @entry, descriptor @desc and @type to the
 * cpu.write_gdt_entry hook.
 */
static inline void write_gdt_entry(struct desc_struct *dt, int entry,
                                   void *desc, int type)
{
        PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type);
}

/* Pass table @dt, index @entry and gate @g to the cpu.write_idt_entry hook. */
static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
        PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}

#ifdef CONFIG_X86_IOPL_IOPERM
/* Invoke the cpu.invalidate_io_bitmap hook (no arguments, no return value). */
static inline void tss_invalidate_io_bitmap(void)
{
        PVOP_VCALL0(cpu.invalidate_io_bitmap);
}

/* Invoke the cpu.update_io_bitmap hook (no arguments, no return value). */
static inline void tss_update_io_bitmap(void)
{
        PVOP_VCALL0(cpu.update_io_bitmap);
}
#endif

/* Pass @prev and @next to the mmu.activate_mm hook. */
static inline void paravirt_activate_mm(struct mm_struct *prev,
                                        struct mm_struct *next)
{
        PVOP_VCALL2(mmu.activate_mm, prev, next);
}

/*
 * Pass @oldmm and @mm to the mmu.dup_mmap hook.  An empty fallback with
 * the same signature exists later in this header for builds without
 * CONFIG_PARAVIRT_XXL.
 */
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
                                          struct mm_struct *mm)
{
        PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
}

/* Return the int result of the mmu.pgd_alloc hook, called with @mm. */
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
        return PVOP_CALL1(int, mmu.pgd_alloc, mm);
}

/* Pass @mm and @pgd to the mmu.pgd_free hook. */
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        PVOP_VCALL2(mmu.pgd_free, mm, pgd);
}

/* Pass @mm and page frame number @pfn to the mmu.alloc_pte hook. */
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pte, mm, pfn);
}
/* Pass page frame number @pfn to the mmu.release_pte hook. */
static inline void paravirt_release_pte(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pte, pfn);
}

/* Pass @mm and page frame number @pfn to the mmu.alloc_pmd hook. */
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pmd, mm, pfn);
}

/* Pass page frame number @pfn to the mmu.release_pmd hook. */
static inline void paravirt_release_pmd(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pmd, pfn);
}

/* Pass @mm and page frame number @pfn to the mmu.alloc_pud hook. */
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pud, mm, pfn);
}
/* Pass page frame number @pfn to the mmu.release_pud hook. */
static inline void paravirt_release_pud(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pud, pfn);
}

/* Pass @mm and page frame number @pfn to the mmu.alloc_p4d hook. */
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_p4d, mm, pfn);
}

/* Pass page frame number @pfn to the mmu.release_p4d hook. */
static inline void paravirt_release_p4d(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_p4d, pfn);
}

/*
 * Build a pte_t whose first member is the result of the callee-save
 * mmu.make_pte hook applied to @val.
 */
static inline pte_t __pte(pteval_t val)
{
        return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) };
}

/* Return the result of the callee-save mmu.pte_val hook applied to @pte.pte. */
static inline pteval_t pte_val(pte_t pte)
{
        return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte);
}

/*
 * Build a pgd_t whose first member is the result of the callee-save
 * mmu.make_pgd hook applied to @val.
 */
static inline pgd_t __pgd(pgdval_t val)
{
        return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) };
}

/* Return the result of the callee-save mmu.pgd_val hook applied to @pgd.pgd. */
static inline pgdval_t pgd_val(pgd_t pgd)
{
        return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd);
}

#define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Call the mmu.ptep_modify_prot_start hook with (@vma, @addr, @ptep) and
 * wrap the raw pteval_t it returns in a pte_t.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
                                           pte_t *ptep)
{
        const pteval_t raw =
                PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);

        return (pte_t) { .pte = raw };
}

/*
 * Pass (@vma, @addr, @ptep, @pte.pte) to the mmu.ptep_modify_prot_commit
 * hook.  @old_pte is accepted but not used by this wrapper.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{

        PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte);
}

/* Pass @ptep and the raw value @pte.pte to the mmu.set_pte hook. */
static inline void set_pte(pte_t *ptep, pte_t pte)
{
        PVOP_VCALL2(mmu.set_pte, ptep, pte.pte);
}

/*
 * Pass @pmdp and native_pmd_val(@pmd) to the mmu.set_pmd hook.  The value
 * is extracted with the native accessor, not the pmd_val() hook.
 */
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd));
}

/*
 * Build a pmd_t whose first member is the result of the callee-save
 * mmu.make_pmd hook applied to @val.
 */
static inline pmd_t __pmd(pmdval_t val)
{
        return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) };
}

/* Return the result of the callee-save mmu.pmd_val hook applied to @pmd.pmd. */
static inline pmdval_t pmd_val(pmd_t pmd)
{
        return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd);
}

/*
 * Pass @pudp and native_pud_val(@pud) to the mmu.set_pud hook.  The value
 * is extracted with the native accessor, not the pud_val() hook.
 */
static inline void set_pud(pud_t *pudp, pud_t pud)
{
        PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud));
}

/*
 * Build a pud_t whose first member is the result of the callee-save
 * mmu.make_pud hook applied to @val.
 */
static inline pud_t __pud(pudval_t val)
{
        return (pud_t) { PVOP_CALLEE1(pudval_t, mmu.make_pud, val) };
}

/* Return the result of the callee-save mmu.pud_val hook applied to @pud.pud. */
static inline pudval_t pud_val(pud_t pud)
{
        return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
}

/* Store the entry produced by native_make_pud(0) at @pudp via set_pud(). */
static inline void pud_clear(pud_t *pudp)
{
        const pud_t empty = native_make_pud(0);

        set_pud(pudp, empty);
}

/*
 * Pass @p4dp and native_p4d_val(@p4d) to the mmu.set_p4d hook.  The value
 * is extracted with the native accessor.
 */
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        PVOP_VCALL2(mmu.set_p4d, p4dp, native_p4d_val(p4d));
}

#if CONFIG_PGTABLE_LEVELS >= 5

/*
 * Build a p4d_t whose first member is the result of the callee-save
 * mmu.make_p4d hook applied to @val.
 */
static inline p4d_t __p4d(p4dval_t val)
{
        return (p4d_t) { PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val) };
}

/* Return the result of the callee-save mmu.p4d_val hook applied to @p4d.p4d. */
static inline p4dval_t p4d_val(p4d_t p4d)
{
        return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
}

/*
 * Pass @pgdp and native_pgd_val(@pgd) to the mmu.set_pgd hook.  Callers
 * normally go through the set_pgd() macro, which selects this path only
 * when pgtable_l5_enabled() is true.
 */
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd));
}

/*
 * Write @pgdval to the slot @pgdp.  When pgtable_l5_enabled() is true the
 * write goes through __set_pgd(); otherwise the slot is reinterpreted as a
 * p4d_t and written through set_p4d() using the raw .pgd value.
 */
#define set_pgd(pgdp, pgdval) do {                                        \
        if (pgtable_l5_enabled())                                                \
                __set_pgd(pgdp, pgdval);                                \
        else                                                                \
                set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });        \
} while (0)

/*
 * Store native_make_pgd(0) at @pgdp via set_pgd(), but only when
 * pgtable_l5_enabled() is true; otherwise this does nothing.
 */
#define pgd_clear(pgdp) do {                                                \
        if (pgtable_l5_enabled())                                        \
                set_pgd(pgdp, native_make_pgd(0));                        \
} while (0)

#endif  /* CONFIG_PGTABLE_LEVELS >= 5 */

/* Store the entry produced by native_make_p4d(0) at @p4dp via set_p4d(). */
static inline void p4d_clear(p4d_t *p4dp)
{
        const p4d_t empty = native_make_p4d(0);

        set_p4d(p4dp, empty);
}

/* In this configuration set_pte_atomic() is a plain alias for set_pte(). */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_pte(ptep, pte);
}

/*
 * Store the entry produced by native_make_pte(0) at @ptep via set_pte().
 * @mm and @addr are accepted but not used here.
 */
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep)
{
        const pte_t empty = native_make_pte(0);

        set_pte(ptep, empty);
}

/* Store the entry produced by native_make_pmd(0) at @pmdp via set_pmd(). */
static inline void pmd_clear(pmd_t *pmdp)
{
        const pmd_t empty = native_make_pmd(0);

        set_pmd(pmdp, empty);
}

#define  __HAVE_ARCH_START_CONTEXT_SWITCH
/* Pass the outgoing task @prev to the cpu.start_context_switch hook. */
static inline void arch_start_context_switch(struct task_struct *prev)
{
        PVOP_VCALL1(cpu.start_context_switch, prev);
}

/* Pass the incoming task @next to the cpu.end_context_switch hook. */
static inline void arch_end_context_switch(struct task_struct *next)
{
        PVOP_VCALL1(cpu.end_context_switch, next);
}

#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
/* Invoke the mmu.lazy_mode.enter hook (no arguments, no return value). */
static inline void arch_enter_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.enter);
}

/* Invoke the mmu.lazy_mode.leave hook (no arguments, no return value). */
static inline void arch_leave_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.leave);
}

/* Invoke the mmu.lazy_mode.flush hook (no arguments, no return value). */
static inline void arch_flush_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.flush);
}

/*
 * Call pv_ops.mmu.set_fixmap(@idx, @phys, @flags).  Unlike most wrappers
 * in this header this is a direct call through the function pointer and
 * does not use a PVOP_* macro.  Per the inline annotation, @idx is an
 * enum fixed_addresses value passed as unsigned.
 */
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
                                phys_addr_t phys, pgprot_t flags)
{
        pv_ops.mmu.set_fixmap(idx, phys, flags);
}
#endif

#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)

/* Pass @lock and @val to the lock.queued_spin_lock_slowpath hook. */
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
                                                        u32 val)
{
        PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
}

/*
 * Pass @lock to the lock.queued_spin_unlock hook, using the callee-save
 * call variant PVOP_VCALLEE1.
 */
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
        PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
}

/* Pass @ptr and @val to the lock.wait hook. */
static __always_inline void pv_wait(u8 *ptr, u8 val)
{
        PVOP_VCALL2(lock.wait, ptr, val);
}

/* Pass @cpu to the lock.kick hook. */
static __always_inline void pv_kick(int cpu)
{
        PVOP_VCALL1(lock.kick, cpu);
}

/*
 * Return the bool result of the lock.vcpu_is_preempted hook for @cpu,
 * using the callee-save call variant PVOP_CALLEE1.
 */
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
        return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
}

void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
bool __raw_callee_save___native_vcpu_is_preempted(long cpu);

#endif /* SMP && PARAVIRT_SPINLOCKS */

#ifdef CONFIG_X86_32
/*
 * Save and restore all caller-save registers, except return value.
 * On 32-bit only %ecx is pushed/popped.
 * NOTE(review): presumably %eax/%edx are left alone because they may
 * carry the return value -- confirm.
 */
#define PV_SAVE_ALL_CALLER_REGS                "pushl %ecx;"
#define PV_RESTORE_ALL_CALLER_REGS        "popl  %ecx;"
#else
/*
 * Save and restore all caller-save registers, except return value.
 * On 64-bit this pushes %rcx, %rdx, %rsi, %rdi and %r8-%r11; the restore
 * sequence pops them in exactly the reverse order.  %rax is not saved.
 */
#define PV_SAVE_ALL_CALLER_REGS                                                \
        "push %rcx;"                                                        \
        "push %rdx;"                                                        \
        "push %rsi;"                                                        \
        "push %rdi;"                                                        \
        "push %r8;"                                                        \
        "push %r9;"                                                        \
        "push %r10;"                                                        \
        "push %r11;"
#define PV_RESTORE_ALL_CALLER_REGS                                        \
        "pop %r11;"                                                        \
        "pop %r10;"                                                        \
        "pop %r9;"                                                        \
        "pop %r8;"                                                        \
        "pop %rdi;"                                                        \
        "pop %rsi;"                                                        \
        "pop %rdx;"                                                        \
        "pop %rcx;"
#endif

/*
 * Generate a thunk around a function which saves all caller-save
 * registers except for the return value.  This allows C functions to
 * be called from assembler code where fewer than normal registers are
 * available.  It may also help code generation around calls from C
 * code if the common case doesn't use many registers.
 *
 * When a callee is wrapped in a thunk, the caller can assume that all
 * arg regs and all scratch registers are preserved across the
 * call. The return value in rax/eax will not be saved, even for void
 * functions.
 */
#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
/*
 * Declare __raw_callee_save_<func> with the same type as @func and emit
 * its body as top-level asm in .text: a global function symbol that runs
 * FRAME_BEGIN, saves the caller-save registers, calls @func, restores the
 * registers, runs FRAME_END and returns via ASM_RET.  A .size directive
 * records the thunk's length.
 */
#define PV_CALLEE_SAVE_REGS_THUNK(func)                                        \
        extern typeof(func) __raw_callee_save_##func;                        \
                                                                        \
        asm(".pushsection .text;"                                        \
            ".globl " PV_THUNK_NAME(func) ";"                                \
            ".type " PV_THUNK_NAME(func) ", @function;"                        \
            PV_THUNK_NAME(func) ":"                                        \
            FRAME_BEGIN                                                        \
            PV_SAVE_ALL_CALLER_REGS                                        \
            "call " #func ";"                                                \
            PV_RESTORE_ALL_CALLER_REGS                                        \
            FRAME_END                                                        \
            ASM_RET                                                        \
            ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"        \
            ".popsection")

/*
 * Get a reference to a callee-save function: yields a
 * struct paravirt_callee_save initialized with the __raw_callee_save_<func>
 * thunk, which must have been generated for @func.
 */
#define PV_CALLEE_SAVE(func)                                                \
        ((struct paravirt_callee_save) { __raw_callee_save_##func })

/* Promise that "func" already uses the right calling convention */
#define __PV_IS_CALLEE_SAVE(func)                        \
        ((struct paravirt_callee_save) { func })

#ifdef CONFIG_PARAVIRT_XXL
/* Return the unsigned long result of the callee-save irq.save_fl hook. */
static inline notrace unsigned long arch_local_save_flags(void)
{
        return PVOP_CALLEE0(unsigned long, irq.save_fl);
}

/* Pass the saved flags @f to the callee-save irq.restore_fl hook. */
static inline notrace void arch_local_irq_restore(unsigned long f)
{
        PVOP_VCALLEE1(irq.restore_fl, f);
}

/* Invoke the callee-save irq.irq_disable hook. */
static inline notrace void arch_local_irq_disable(void)
{
        PVOP_VCALLEE0(irq.irq_disable);
}

/* Invoke the callee-save irq.irq_enable hook. */
static inline notrace void arch_local_irq_enable(void)
{
        PVOP_VCALLEE0(irq.irq_enable);
}

/*
 * Capture the current flags with arch_local_save_flags(), then call
 * arch_local_irq_disable(), and return the captured flags.
 */
static inline notrace unsigned long arch_local_irq_save(void)
{
        const unsigned long flags = arch_local_save_flags();

        arch_local_irq_disable();
        return flags;
}
#endif


/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

extern void default_banner(void);

#else  /* __ASSEMBLY__ */

/*
 * Emit @ops between local labels 771 and 772 and record the site in the
 * .parainstructions section.  Each record is aligned to @algn and holds:
 * the address of label 771 (emitted with the @word directive), one byte
 * of @ptype, and one byte giving the length of @ops (772b-771b).
 */
#define _PVSITE(ptype, ops, word, algn)                \
771:;                                                \
        ops;                                        \
772:;                                                \
        .pushsection .parainstructions,"a";        \
         .align        algn;                                \
         word 771b;                                \
         .byte ptype;                                \
         .byte 772b-771b;                        \
        .popsection


/*
 * Assembler-time conditional push/pop of %@reg: the instruction is emitted
 * only when the bit(s) in @mask are NOT present in @set.
 */
#define COND_PUSH(set, mask, reg)                        \
        .if ((~(set)) & mask); push %reg; .endif
#define COND_POP(set, mask, reg)                        \
        .if ((~(set)) & mask); pop %reg; .endif

#ifdef CONFIG_X86_64

/*
 * 64-bit: push every register among rax, rcx, rdx, rsi, rdi, r8-r11 whose
 * CLBR_* bit is absent from @set (see COND_PUSH).  PV_RESTORE_REGS pops the
 * same registers in reverse order, so the two must be given the same @set.
 */
#define PV_SAVE_REGS(set)                        \
        COND_PUSH(set, CLBR_RAX, rax);                \
        COND_PUSH(set, CLBR_RCX, rcx);                \
        COND_PUSH(set, CLBR_RDX, rdx);                \
        COND_PUSH(set, CLBR_RSI, rsi);                \
        COND_PUSH(set, CLBR_RDI, rdi);                \
        COND_PUSH(set, CLBR_R8, r8);                \
        COND_PUSH(set, CLBR_R9, r9);                \
        COND_PUSH(set, CLBR_R10, r10);                \
        COND_PUSH(set, CLBR_R11, r11)
#define PV_RESTORE_REGS(set)                        \
        COND_POP(set, CLBR_R11, r11);                \
        COND_POP(set, CLBR_R10, r10);                \
        COND_POP(set, CLBR_R9, r9);                \
        COND_POP(set, CLBR_R8, r8);                \
        COND_POP(set, CLBR_RDI, rdi);                \
        COND_POP(set, CLBR_RSI, rsi);                \
        COND_POP(set, CLBR_RDX, rdx);                \
        COND_POP(set, CLBR_RCX, rcx);                \
        COND_POP(set, CLBR_RAX, rax)

/*
 * 64-bit patch-site helpers.
 * PARA_PATCH: divides the offset @off by 8 (presumably converting a byte
 * offset into a pointer-sized slot index -- confirm).
 * PARA_SITE: _PVSITE with .quad records aligned to 8.
 * PARA_INDIRECT: RIP-relative indirect operand for @addr.
 */
#define PARA_PATCH(off)                ((off) / 8)
#define PARA_SITE(ptype, ops)        _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr)        *addr(%rip)
#else
/*
 * 32-bit: push every register among eax, edi, ecx, edx whose CLBR_* bit is
 * absent from @set (see COND_PUSH).  PV_RESTORE_REGS pops the same
 * registers in reverse order, so the two must be given the same @set.
 */
#define PV_SAVE_REGS(set)                        \
        COND_PUSH(set, CLBR_EAX, eax);                \
        COND_PUSH(set, CLBR_EDI, edi);                \
        COND_PUSH(set, CLBR_ECX, ecx);                \
        COND_PUSH(set, CLBR_EDX, edx)
#define PV_RESTORE_REGS(set)                        \
        COND_POP(set, CLBR_EDX, edx);                \
        COND_POP(set, CLBR_ECX, ecx);                \
        COND_POP(set, CLBR_EDI, edi);                \
        COND_POP(set, CLBR_EAX, eax)

/*
 * 32-bit patch-site helpers.
 * PARA_PATCH: divides the offset @off by 4 (presumably converting a byte
 * offset into a pointer-sized slot index -- confirm).
 * PARA_SITE: _PVSITE with .long records aligned to 4.
 * PARA_INDIRECT: %cs-prefixed indirect operand for @addr.
 */
#define PARA_PATCH(off)                ((off) / 4)
#define PARA_SITE(ptype, ops)        _PVSITE(ptype, ops, .long, 4)
#define PARA_INDIRECT(addr)        *%cs:addr
#endif

#ifdef CONFIG_PARAVIRT_XXL
/*
 * Patch site of type PV_CPU_iret containing an indirect jmp through the
 * pv_ops slot at offset PV_CPU_iret.  No registers are saved around it.
 */
#define INTERRUPT_RETURN                                                \
        PARA_SITE(PARA_PATCH(PV_CPU_iret),                                \
                  ANNOTATE_RETPOLINE_SAFE;                                \
                  jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)

/*
 * Patch site of type PV_IRQ_irq_disable: an indirect call through the
 * pv_ops slot at offset PV_IRQ_irq_disable, bracketed by
 * PV_SAVE_REGS/PV_RESTORE_REGS.  Registers named in @clobbers or in
 * CLBR_CALLEE_SAVE are not saved around the call.
 */
#define DISABLE_INTERRUPTS(clobbers)                                        \
        PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable),                        \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);                \
                  ANNOTATE_RETPOLINE_SAFE;                                \
                  call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable);        \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)

/*
 * Patch site of type PV_IRQ_irq_enable: an indirect call through the
 * pv_ops slot at offset PV_IRQ_irq_enable, bracketed by
 * PV_SAVE_REGS/PV_RESTORE_REGS.  Registers named in @clobbers or in
 * CLBR_CALLEE_SAVE are not saved around the call.
 */
#define ENABLE_INTERRUPTS(clobbers)                                        \
        PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable),                        \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);                \
                  ANNOTATE_RETPOLINE_SAFE;                                \
                  call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable);                \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif

#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
#ifdef CONFIG_DEBUG_ENTRY
/*
 * Patch site of type PV_IRQ_save_fl: an indirect call through the pv_ops
 * slot at offset PV_IRQ_save_fl, bracketed by PV_SAVE_REGS/PV_RESTORE_REGS.
 * Registers named in @clobbers or in CLBR_CALLEE_SAVE are not saved.
 * Only defined for 64-bit CONFIG_PARAVIRT_XXL builds with CONFIG_DEBUG_ENTRY.
 */
#define SAVE_FLAGS(clobbers)                                        \
        PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),                            \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
                  ANNOTATE_RETPOLINE_SAFE;                            \
                  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);            \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif        /* CONFIG_X86_64 */

#ifdef CONFIG_PARAVIRT_XXL

/*
 * Patch site of type PV_MMU_read_cr2: an indirect call through the pv_ops
 * slot at offset PV_MMU_read_cr2, with no register save/restore around it.
 * NOTE(review): per the macro name the result is expected in the AX
 * register -- confirm against the hook's calling convention.
 */
#define GET_CR2_INTO_AX                                                        \
        PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),                                \
                  ANNOTATE_RETPOLINE_SAFE;                                \
                  call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);                \
                 )

#endif /* CONFIG_PARAVIRT_XXL */


#endif /* __ASSEMBLY__ */
#else  /* CONFIG_PARAVIRT */
/* Without CONFIG_PARAVIRT, default_banner is an alias for x86_init_noop. */
# define default_banner x86_init_noop
#endif /* !CONFIG_PARAVIRT */

#ifndef __ASSEMBLY__
#ifndef CONFIG_PARAVIRT_XXL
/* Empty fallback used when CONFIG_PARAVIRT_XXL is not set: does nothing. */
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
                                          struct mm_struct *mm)
{
}
#endif

#ifndef CONFIG_PARAVIRT
/* Empty fallback used when CONFIG_PARAVIRT is not set: does nothing. */
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */





























































    5 





    1 

    1 





























    1 





















































































































































    4 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <uapi/linux/uio.h>

struct page;
struct pipe_inode_info;

/* One (base, length) segment of a vector; the base is a kernel pointer. */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len; /* length of the segment starting at iov_base */
};

/*
 * Kind of backing store an iov_iter walks.  The values are distinct single
 * bits starting at 4; iov_iter_type() recovers them from iov_iter.type by
 * masking off the READ/WRITE direction bits.
 */
enum iter_type {
        /* iter types */
        ITER_IOVEC = 4,
        ITER_KVEC = 8,
        ITER_BVEC = 16,
        ITER_PIPE = 32,
        ITER_DISCARD = 64,
};

/*
 * Snapshot of the iov_iter fields copied by iov_iter_save_state():
 * iov_offset, count and nr_segs.
 */
struct iov_iter_state {
        size_t iov_offset;
        size_t count;
        unsigned long nr_segs;
};

/*
 * Iterator over one of several segment-vector kinds.  Which member of the
 * first union is valid is determined by the type bits (see iov_iter_type()
 * and the iter_is_*() / iov_iter_is_*() predicates).
 */
struct iov_iter {
        /*
         * Bit 0 is the read/write bit, set if we're writing.
         * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
         * the caller isn't expecting to drop a page reference when done.
         *
         * NOTE(review): no BVEC_FLAG_NO_REF constant is defined in the
         * enum iter_type visible in this header -- confirm the bit 1
         * description is still current.
         */
        unsigned int type;
        /* offset used together with the current segment, see iov_iter_iovec() */
        size_t iov_offset;
        /* byte count; iov_iter_iovec() caps the segment length with it */
        size_t count;
        union {
                const struct iovec *iov;
                const struct kvec *kvec;
                const struct bio_vec *bvec;
                struct pipe_inode_info *pipe;
        };
        union {
                unsigned long nr_segs;
                struct {
                        unsigned int head;
                        unsigned int start_head;
                };
        };
};

/* Return @i->type with the READ/WRITE direction bits masked off. */
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        const unsigned int direction_bits = READ | WRITE;

        return i->type & ~direction_bits;
}

/*
 * Copy the position-tracking fields of @iter (nr_segs, count, iov_offset)
 * into @state.  @iter itself is only read.
 */
static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->nr_segs = iter->nr_segs;
        state->count = iter->count;
        state->iov_offset = iter->iov_offset;
}

static inline bool iter_is_iovec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_IOVEC;
}

static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_KVEC;
}

static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_BVEC;
}

static inline bool iov_iter_is_pipe(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_PIPE;
}

static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_DISCARD;
}

/*
 * Return the transfer direction of @i: the READ/WRITE bits of the @type
 * word, with the iterator kind removed.
 */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        const unsigned int rw_bits = READ | WRITE;

        return i->type & rw_bits;
}

/*
 * Sum the iov_len fields of the first @nr_segs entries of @iov.
 *
 * The addition is unchecked: callers must have validated every segment
 * length beforehand, because the individual lengths can overflow a size_t
 * when added together. Returns 0 when @nr_segs is 0.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        const struct iovec *seg = iov;
        size_t total = 0;

        while (nr_segs--)
                total += (seg++)->iov_len;
        return total;
}

static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
{
        return (struct iovec) {
                .iov_base = iter->iov->iov_base + iter->iov_offset,
                .iov_len = min(iter->count,
                               iter->iov->iov_len - iter->iov_offset),
        };
}

/*
 * Iterator primitives; only the prototypes live in this header.
 * NOTE(review): the return-value conventions are not visible from here -
 * check the definitions before relying on them.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);

/*
 * Backends of the copy_*() inline wrappers that follow in this header.
 * Each wrapper calls its underscore-prefixed counterpart only after
 * check_copy_size() has accepted (addr, bytes).
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);

/*
 * Checked front end for _copy_to_iter(): returns 0 without copying when
 * check_copy_size() rejects (addr, bytes), otherwise the result of
 * _copy_to_iter().
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, true)))
                return 0;

        return _copy_to_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter(): returns 0 without copying when
 * check_copy_size() rejects (addr, bytes), otherwise the result of
 * _copy_from_iter().
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_full(): returns false without
 * copying when check_copy_size() rejects (addr, bytes), otherwise the
 * result of _copy_from_iter_full().
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return false;

        return _copy_from_iter_full(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_nocache(): returns 0 without
 * copying when check_copy_size() rejects (addr, bytes), otherwise the
 * result of _copy_from_iter_nocache().
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_full_nocache(): returns false
 * without copying when check_copy_size() rejects (addr, bytes), otherwise
 * the result of _copy_from_iter_full_nocache().
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return false;

        return _copy_from_iter_full_nocache(addr, bytes, i);
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
/* No flushcache support configured: the name aliases the nocache variant. */
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
/* Without CONFIG_ARCH_HAS_COPY_MC the name aliases plain _copy_to_iter. */
#define _copy_mc_to_iter _copy_to_iter
#endif

/*
 * Checked front end for _copy_from_iter_flushcache(), which is an alias of
 * _copy_from_iter_nocache unless CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE is set.
 * Returns 0 without copying when check_copy_size() rejects (addr, bytes).
 */
static __always_inline __must_check
size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter_flushcache(addr, bytes, i);
}

/*
 * Checked front end for _copy_mc_to_iter(), which is an alias of
 * _copy_to_iter unless CONFIG_ARCH_HAS_COPY_MC is set.
 *
 * @addr is only forwarded to callees that take a const pointer, so it is
 * const-qualified here as well, matching copy_to_iter(). Callers passing a
 * non-const pointer are unaffected.
 *
 * Returns 0 without copying when check_copy_size() rejects (addr, bytes),
 * otherwise the result of _copy_mc_to_iter().
 */
static __always_inline __must_check
size_t copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, true)))
                return 0;

        return _copy_mc_to_iter(addr, bytes, i);
}

/*
 * Prototypes only. iov_iter_init/_kvec/_bvec/_pipe/_discard take the
 * iterator to set up plus a direction; their names match the ITER_* kinds
 * (presumably one initializer per kind - confirm against the definitions).
 * iov_iter_restore() takes the struct iov_iter_state that
 * iov_iter_save_state() fills in.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
                        size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's current count field, unmodified. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t amount = i->count;

        return amount;
}

/*
 * Cap the iov_iter at the given limit. The second argument is an upper
 * bound, *not* the new size: if it is greater than or equal to the amount
 * of data already in the iov_iter, nothing changes.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * count doesn't have to fit in size_t - the comparison extends both
         * operands to u64, and any value that would be truncated by the
         * conversion in the assignment is by definition greater than every
         * size_t value, including the old i->count, so it never reaches
         * the assignment.
         */
        if (i->count <= count)
                return;

        i->count = count;
}

/*
 * Re-expand a previously truncated iterator; count must be no more than how
 * much we had shrunk it.
 *
 * The function simply overwrites i->count with @count and performs no
 * validation, so honouring the constraint above is the caller's job.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        i->count = count;
}

/*
 * Checksum value paired with an offset.
 * NOTE(review): presumably this is what callers pass as the void *csstate
 * argument of csum_and_copy_to_iter() - confirm against the definition.
 */
struct csum_state {
        __wsum csum;    /* checksum value */
        size_t off;     /* offset associated with csum; exact meaning not visible here */
};

/*
 * Prototypes only: checksumming/hashing copy variants, helpers that build
 * an iov_iter from a user-supplied iovec array or single buffer (note the
 * __user annotations), and a per-range callback walker.
 * NOTE(review): return-value and ownership conventions (e.g. what *iovp
 * holds after import_iovec()) are not visible here - see the definitions.
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
                struct iov_iter *i);

struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_single_range(int type, void __user *buf, size_t len,
                 struct iovec *iov, struct iov_iter *i);

int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
                            int (*f)(struct kvec *vec, void *context),
                            void *context);

#endif






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM printk

#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PRINTK_H

#include <linux/tracepoint.h>

/*
 * "console" trace event.
 *
 * Takes a text buffer and its length, reserves len + 1 bytes for the "msg"
 * dynamic array, copies the text into it (minus one trailing '\n', if
 * present) and NUL-terminates the copy. The event is printed as "%s" of
 * that string. @text itself need not be NUL-terminated: only @len bytes
 * are read from it.
 */
TRACE_EVENT(console,
        TP_PROTO(const char *text, size_t len),

        TP_ARGS(text, len),

        TP_STRUCT__entry(
                __dynamic_array(char, msg, len + 1)
        ),

        TP_fast_assign(
                /*
                 * Each trace entry is printed in a new line.
                 * If the msg finishes with '\n', cut it off
                 * to avoid blank lines in the trace.
                 */
                if ((len > 0) && (text[len-1] == '\n'))
                        len -= 1;

                memcpy(__get_str(msg), text, len);
                __get_str(msg)[len] = 0;
        ),

        TP_printk("%s", __get_str(msg))
);
#endif /* _TRACE_PRINTK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






















    4 
    4 





























    4 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the symbol table type.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include "symtab.h"

/*
 * Hash a NUL-terminated symbol name.
 *
 * For every character the running value is rotated left by four bits and
 * XORed with the character. An empty string hashes to 0.
 */
static unsigned int symhash(const void *key)
{
        const char *name = key;
        unsigned int len = strlen(name);
        unsigned int hash = 0;
        unsigned int idx;

        for (idx = 0; idx < len; idx++)
                hash = (hash << 4 | (hash >> (8*sizeof(unsigned int)-4))) ^ name[idx];

        return hash;
}

/*
 * Compare two symbol-table keys as NUL-terminated strings.
 * Returns the strcmp() result: 0 when equal, negative or positive otherwise.
 */
static int symcmp(const void *key1, const void *key2)
{
        return strcmp((const char *)key1, (const char *)key2);
}

/*
 * Key callbacks for symbol tables: string hash and string compare.
 * Passed by value to hashtab_insert() and hashtab_search() by the
 * symtab_insert()/symtab_search() wrappers in this file.
 */
static const struct hashtab_key_params symtab_key_params = {
        .hash = symhash,
        .cmp = symcmp,
};

/*
 * Initialise symbol table @s: reset nprim to 0 and set up the embedded
 * hash table with the requested @size. Returns hashtab_init()'s result.
 */
int symtab_init(struct symtab *s, unsigned int size)
{
        int rc;

        s->nprim = 0;
        rc = hashtab_init(&s->table, size);

        return rc;
}

/*
 * Insert @datum under key @name into the table of @s, using the string
 * key callbacks. Returns hashtab_insert()'s result.
 */
int symtab_insert(struct symtab *s, char *name, void *datum)
{
        int rc;

        rc = hashtab_insert(&s->table, name, datum, symtab_key_params);

        return rc;
}

/*
 * Look up @name in the table of @s, using the string key callbacks.
 * Returns whatever hashtab_search() returns for that key.
 */
void *symtab_search(struct symtab *s, const char *name)
{
        void *datum;

        datum = hashtab_search(&s->table, name, symtab_key_params);

        return datum;
}





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Scatterlist Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2002 David S. Miller (davem@redhat.com)
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
 * and Nettle, by Niels Möller.
 */
#ifndef _LINUX_CRYPTO_H
#define _LINUX_CRYPTO_H

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/completion.h>

/*
 * Autoloaded crypto modules should only use a prefixed name to avoid allowing
 * arbitrary modules to be loaded. Loading from userspace may still need the
 * unprefixed names, so retain those aliases as well: the macro emits both
 * the plain @name alias and a "crypto-"-prefixed one.
 * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
 * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
 * expands twice on the same line. Instead, use a separate base name for
 * each alias (alias_userspace and alias_crypto).
 */
#define MODULE_ALIAS_CRYPTO(name)        \
                __MODULE_INFO(alias, alias_userspace, name);        \
                __MODULE_INFO(alias, alias_crypto, "crypto-" name)

/*
 * Algorithm masks and types.
 *
 * The algorithm type lives in the low four bits selected by
 * CRYPTO_ALG_TYPE_MASK. HASH and SHASH share the value 0xe.
 */
#define CRYPTO_ALG_TYPE_MASK                0x0000000f
#define CRYPTO_ALG_TYPE_CIPHER                0x00000001
#define CRYPTO_ALG_TYPE_COMPRESS        0x00000002
#define CRYPTO_ALG_TYPE_AEAD                0x00000003
#define CRYPTO_ALG_TYPE_SKCIPHER        0x00000005
#define CRYPTO_ALG_TYPE_KPP                0x00000008
#define CRYPTO_ALG_TYPE_ACOMPRESS        0x0000000a
#define CRYPTO_ALG_TYPE_SCOMPRESS        0x0000000b
#define CRYPTO_ALG_TYPE_RNG                0x0000000c
#define CRYPTO_ALG_TYPE_AKCIPHER        0x0000000d
#define CRYPTO_ALG_TYPE_HASH                0x0000000e
#define CRYPTO_ALG_TYPE_SHASH                0x0000000e
#define CRYPTO_ALG_TYPE_AHASH                0x0000000f

/*
 * Narrower masks that ignore bit 0 of the type, so that type pairs which
 * differ only in that bit compare equal: SHASH (0xe) with AHASH (0xf),
 * and ACOMPRESS (0xa) with SCOMPRESS (0xb).
 */
#define CRYPTO_ALG_TYPE_HASH_MASK        0x0000000e
#define CRYPTO_ALG_TYPE_AHASH_MASK        0x0000000e
#define CRYPTO_ALG_TYPE_ACOMPRESS_MASK        0x0000000e

/*
 * Single-bit algorithm flags located directly above the four type bits.
 * NOTE(review): their precise meaning is defined by the crypto core, not
 * by this header - consult the users before relying on the names alone.
 */
#define CRYPTO_ALG_LARVAL                0x00000010
#define CRYPTO_ALG_DEAD                        0x00000020
#define CRYPTO_ALG_DYING                0x00000040
#define CRYPTO_ALG_ASYNC                0x00000080

/*
 * Set if the algorithm (or an algorithm which it uses) requires another
 * algorithm of the same type to handle corner cases.
 */
#define CRYPTO_ALG_NEED_FALLBACK        0x00000100

/*
 * Set if the algorithm has passed automated run-time testing.  Note that
 * if there is no run-time testing for a given algorithm it is considered
 * to have passed.
 */

#define CRYPTO_ALG_TESTED                0x00000400

/*
 * Set if the algorithm is an instance that is built from templates.
 */
#define CRYPTO_ALG_INSTANCE                0x00000800

/* Set this bit if the algorithm provided is hardware accelerated but
 * not available to userspace via instruction set or so.
 */
#define CRYPTO_ALG_KERN_DRIVER_ONLY        0x00001000

/*
 * Mark a cipher as a service implementation only usable by another
 * cipher and never by a normal user of the kernel crypto API
 */
#define CRYPTO_ALG_INTERNAL                0x00002000

/*
 * Set if the algorithm has a ->setkey() method but can be used without
 * calling it first, i.e. there is a default key.
 */
#define CRYPTO_ALG_OPTIONAL_KEY                0x00004000

/*
 * Don't trigger module loading
 */
#define CRYPTO_NOLOAD                        0x00008000

/*
 * The algorithm may allocate memory during request processing, i.e. during
 * encryption, decryption, or hashing.  Users can request an algorithm with this
 * flag unset if they can't handle memory allocation failures.
 *
 * This flag is currently only implemented for algorithms of type "skcipher",
 * "aead", "ahash", "shash", and "cipher".  Algorithms of other types might not
 * have this flag set even if they allocate memory.
 *
 * In some edge cases, algorithms can allocate memory regardless of this flag.
 * To avoid these cases, users must obey the following usage constraints:
 *    skcipher:
 *        - The IV buffer and all scatterlist elements must be aligned to the
 *          algorithm's alignmask.
 *        - If the data were to be divided into chunks of size
 *          crypto_skcipher_walksize() (with any remainder going at the end), no
 *          chunk can cross a page boundary or a scatterlist element boundary.
 *    aead:
 *        - The IV buffer and all scatterlist elements must be aligned to the
 *          algorithm's alignmask.
 *        - The first scatterlist element must contain all the associated data,
 *          and its pages must be !PageHighMem.
 *        - If the plaintext/ciphertext were to be divided into chunks of size
 *          crypto_aead_walksize() (with the remainder going at the end), no chunk
 *          can cross a page boundary or a scatterlist element boundary.
 *    ahash:
 *        - The result buffer must be aligned to the algorithm's alignmask.
 *        - crypto_ahash_finup() must not be used unless the algorithm implements
 *          ->finup() natively.
 */
#define CRYPTO_ALG_ALLOCATES_MEMORY        0x00010000

/*
 * Transform masks and values (for crt_flags).
 *
 * CRYPTO_TFM_REQ_MASK selects bits 8-19; the three CRYPTO_TFM_REQ_* flags
 * below all fall inside it, while CRYPTO_TFM_NEED_KEY (bit 0) does not.
 */
#define CRYPTO_TFM_NEED_KEY                0x00000001

#define CRYPTO_TFM_REQ_MASK                0x000fff00
#define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS        0x00000100
#define CRYPTO_TFM_REQ_MAY_SLEEP        0x00000200
#define CRYPTO_TFM_REQ_MAY_BACKLOG        0x00000400

/*
 * Miscellaneous stuff.
 */
#define CRYPTO_MAX_ALG_NAME                128

/*
 * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
 * declaration) is used to ensure that the crypto_tfm context structure is
 * aligned correctly for the given architecture so that there are no alignment
 * faults for C data types.  On architectures that support non-cache coherent
 * DMA, such as ARM or arm64, it also takes into account the minimal alignment
 * that is required to ensure that the context struct member does not share any
 * cachelines with the rest of the struct. This is needed to ensure that cache
 * maintenance for non-coherent DMA (cache invalidation in particular) does not
 * affect data that may be accessed by the CPU concurrently.
 */
#define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN

#define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))

struct scatterlist;
struct crypto_async_request;
struct crypto_tfm;
struct crypto_type;

/*
 * Callback type stored in crypto_async_request.complete: it receives the
 * request and an int error code and returns nothing.
 */
typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err);

/**
 * DOC: Block Cipher Context Data Structures
 *
 * These data structures define the operating context for each block cipher
 * type.
 */

/* Common header describing one asynchronous crypto request. */
struct crypto_async_request {
        struct list_head list;          /* list linkage; which list is not visible here */
        crypto_completion_t complete;   /* callback of type crypto_completion_t */
        void *data;                     /* opaque pointer; presumably context for ->complete - confirm */
        struct crypto_tfm *tfm;         /* transform this request refers to */

        u32 flags;                      /* presumably CRYPTO_TFM_REQ_* bits - confirm */
};

/**
 * DOC: Block Cipher Algorithm Definitions
 *
 * These data structures define modular crypto algorithm implementations,
 * managed via crypto_register_alg() and crypto_unregister_alg().
 */

/**
 * struct cipher_alg - single-block symmetric ciphers definition
 * @cia_min_keysize: Minimum key size supported by the transformation. This is
 *                     the smallest key length supported by this transformation
 *                     algorithm. This must be set to one of the pre-defined
 *                     values as this is not hardware specific. Possible values
 *                     for this field can be found via git grep "_MIN_KEY_SIZE"
 *                     include/crypto/
 * @cia_max_keysize: Maximum key size supported by the transformation. This is
 *                    the largest key length supported by this transformation
 *                    algorithm. This must be set to one of the pre-defined values
 *                    as this is not hardware specific. Possible values for this
 *                    field can be found via git grep "_MAX_KEY_SIZE"
 *                    include/crypto/
 * @cia_setkey: Set key for the transformation. This function is used to either
 *                program a supplied key into the hardware or store the key in the
 *                transformation context for programming it later. Note that this
 *                function does modify the transformation context. This function
 *                can be called multiple times during the existence of the
 *                transformation object, so one must make sure the key is properly
 *                reprogrammed into the hardware. This function is also
 *                responsible for checking the key length for validity.
 * @cia_encrypt: Encrypt a single block. This function is used to encrypt a
 *                 single block of data, which must be @cra_blocksize big. This
 *                 always operates on a full @cra_blocksize and it is not possible
 *                 to encrypt a block of smaller size. The supplied buffers must
 *                 therefore also be at least of @cra_blocksize size. Both the
 *                 input and output buffers are always aligned to @cra_alignmask.
 *                 In case either of the input or output buffer supplied by user
 *                 of the crypto API is not aligned to @cra_alignmask, the crypto
 *                 API will re-align the buffers. The re-alignment means that a
 *                 new buffer will be allocated, the data will be copied into the
 *                 new buffer, then the processing will happen on the new buffer,
 *                 then the data will be copied back into the original buffer and
 *                 finally the new buffer will be freed. In case a software
 *                 fallback was put in place in the @cra_init call, this function
 *                 might need to use the fallback if the algorithm doesn't support
 *                 all of the key sizes. In case the key was stored in
 *                 transformation context, the key might need to be re-programmed
 *                 into the hardware in this function. This function shall not
 *                 modify the transformation context, as this function may be
 *                 called in parallel with the same transformation object.
 * @cia_decrypt: Decrypt a single block. This is a reverse counterpart to
 *                 @cia_encrypt, and the conditions are exactly the same.
 *
 * All fields are mandatory and must be filled.
 */
struct cipher_alg {
        unsigned int cia_min_keysize;
        unsigned int cia_max_keysize;
        /* the only callback of the three that returns a status (int) */
        int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
                          unsigned int keylen);
        /* encrypt/decrypt return void, so they cannot report failure */
        void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
        void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
};

/**
 * struct compress_alg - compression/decompression algorithm
 * @coa_compress: Compress a buffer of specified length, storing the resulting
 *                  data in the specified buffer. Return the length of the
 *                  compressed data in dlen.
 * @coa_decompress: Decompress the source buffer, storing the uncompressed
 *                    data in the specified buffer. The length of the data is
 *                    returned in dlen.
 *
 * All fields are mandatory.
 */
struct compress_alg {
        /*
         * Both callbacks share one signature: source buffer and length in,
         * destination buffer and a pointer through which the resulting
         * length is returned, with an int status as the return value.
         */
        int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
                            unsigned int slen, u8 *dst, unsigned int *dlen);
        int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
                              unsigned int slen, u8 *dst, unsigned int *dlen);
};

#ifdef CONFIG_CRYPTO_STATS
/*
 * struct crypto_istat_aead - statistics for AEAD algorithms
 * @encrypt_cnt:        number of encrypt requests
 * @encrypt_tlen:       total data size handled by encrypt requests
 * @decrypt_cnt:        number of decrypt requests
 * @decrypt_tlen:       total data size handled by decrypt requests
 * @err_cnt:            number of errors for AEAD requests
 */
struct crypto_istat_aead {
        atomic64_t encrypt_cnt;
        atomic64_t encrypt_tlen;
        atomic64_t decrypt_cnt;
        atomic64_t decrypt_tlen;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_akcipher - statistics for akcipher algorithms
 * @encrypt_cnt:        number of encrypt requests
 * @encrypt_tlen:       total data size handled by encrypt requests
 * @decrypt_cnt:        number of decrypt requests
 * @decrypt_tlen:       total data size handled by decrypt requests
 * @verify_cnt:         number of verify operations
 * @sign_cnt:           number of sign requests
 * @err_cnt:            number of errors for akcipher requests
 */
struct crypto_istat_akcipher {
        atomic64_t encrypt_cnt;
        atomic64_t encrypt_tlen;
        atomic64_t decrypt_cnt;
        atomic64_t decrypt_tlen;
        atomic64_t verify_cnt;
        atomic64_t sign_cnt;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_cipher - statistics for cipher algorithms
 * @encrypt_cnt:        number of encrypt requests
 * @encrypt_tlen:       total data size handled by encrypt requests
 * @decrypt_cnt:        number of decrypt requests
 * @decrypt_tlen:       total data size handled by decrypt requests
 * @err_cnt:            number of errors for cipher requests
 */
struct crypto_istat_cipher {
        atomic64_t encrypt_cnt;
        atomic64_t encrypt_tlen;
        atomic64_t decrypt_cnt;
        atomic64_t decrypt_tlen;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_compress - statistics for compress algorithms
 * @compress_cnt:       number of compress requests
 * @compress_tlen:      total data size handled by compress requests
 * @decompress_cnt:     number of decompress requests
 * @decompress_tlen:    total data size handled by decompress requests
 * @err_cnt:            number of errors for compress requests
 */
struct crypto_istat_compress {
        atomic64_t compress_cnt;
        atomic64_t compress_tlen;
        atomic64_t decompress_cnt;
        atomic64_t decompress_tlen;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_hash - statistics for hash algorithms
 * @hash_cnt:           number of hash requests
 * @hash_tlen:          total data size hashed
 * @err_cnt:            number of errors for hash requests
 */
struct crypto_istat_hash {
        atomic64_t hash_cnt;
        atomic64_t hash_tlen;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_kpp - statistics for KPP algorithms
 * @setsecret_cnt:              number of setsecret operations
 * @generate_public_key_cnt:    number of generate_public_key operations
 * @compute_shared_secret_cnt:  number of compute_shared_secret operations
 * @err_cnt:                    number of errors for KPP requests
 */
struct crypto_istat_kpp {
        atomic64_t setsecret_cnt;
        atomic64_t generate_public_key_cnt;
        atomic64_t compute_shared_secret_cnt;
        atomic64_t err_cnt;
};

/*
 * struct crypto_istat_rng - statistics for RNG algorithms
 * @generate_cnt:       number of RNG generate requests
 * @generate_tlen:      total size of the data generated by the RNG
 * @seed_cnt:           number of times the RNG was seeded
 * @err_cnt:            number of errors for RNG requests
 */
struct crypto_istat_rng {
        atomic64_t generate_cnt;
        atomic64_t generate_tlen;
        atomic64_t seed_cnt;
        atomic64_t err_cnt;
};
#endif /* CONFIG_CRYPTO_STATS */

/*
 * Shorthand names that expand to the .cipher and .compress members of a
 * cra_u member (presumably the type-specific union of struct crypto_alg,
 * whose definition lies further down - confirm).
 */
#define cra_cipher        cra_u.cipher
#define cra_compress        cra_u.compress

/**
 * struct crypto_alg - definition of a cryptographic cipher algorithm
 * @cra_flags: Flags describing this transformation. See include/linux/crypto.h
 *               CRYPTO_ALG_* flags for the flags which go in here. Those are
 *               used for fine-tuning the description of the transformation
 *               algorithm.
 * @cra_blocksize: Minimum block size of this transformation. The size in bytes
 *                   of the smallest possible unit which can be transformed with
 *                   this algorithm. The users must respect this value.
 *                   In case of HASH transformation, it is possible for a smaller
 *                   block than @cra_blocksize to be passed to the crypto API for
 *                   transformation, in case of any other transformation type, an
 *                    error will be returned upon any attempt to transform smaller
 *                   than @cra_blocksize chunks.
 * @cra_ctxsize: Size of the operational context of the transformation. This
 *                 value informs the kernel crypto API about the memory size
 *                 needed to be allocated for the transformation context.
 * @cra_alignmask: Alignment mask for the input and output data buffer. The data
 *                   buffer containing the input data for the algorithm must be
 *                   aligned to this alignment mask. The data buffer for the
 *                   output data must be aligned to this alignment mask. Note that
 *                   the Crypto API will do the re-alignment in software, but
 *                   only under special conditions and there is a performance hit.
 *                   The re-alignment happens at these occasions for different
 *                   @cra_u types: cipher -- For both input data and output data
 *                   buffer; ahash -- For output hash destination buf; shash --
 *                   For output hash destination buf.
 *                   This is needed on hardware which is flawed by design and
 *                   cannot pick data from arbitrary addresses.
 * @cra_priority: Priority of this transformation implementation. In case
 *                  multiple transformations with same @cra_name are available to
 *                  the Crypto API, the kernel will use the one with highest
 *                  @cra_priority.
 * @cra_name: Generic name (usable by multiple implementations) of the
 *              transformation algorithm. This is the name of the transformation
 *              itself. This field is used by the kernel when looking up the
 *              providers of particular transformation.
 * @cra_driver_name: Unique name of the transformation provider. This is the
 *                     name of the provider of the transformation. This can be any
 *                     arbitrary value, but in the usual case, this contains the
 *                     name of the chip or provider and the name of the
 *                     transformation algorithm.
 * @cra_type: Type of the cryptographic transformation. This is a pointer to
 *              struct crypto_type, which implements callbacks common for all
 *              transformation types. There are multiple options, such as
 *              &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type.
 *              This field might be empty. In that case, there are no common
 *              callbacks. This is the case for: cipher, compress, shash.
 * @cra_u: Callbacks implementing the transformation. This is a union of
 *           multiple structures. Depending on the type of transformation selected
 *           by @cra_type and @cra_flags above, the associated structure must be
 *           filled with callbacks. This field might be empty. This is the case
 *           for ahash, shash.
 * @cra_init: Initialize the cryptographic transformation object. This function
 *              is used to initialize the cryptographic transformation object.
 *              This function is called only once at the instantiation time, right
 *              after the transformation context was allocated. In case the
 *              cryptographic hardware has some special requirements which need to
 *              be handled by software, this function shall check for the precise
 *              requirement of the transformation and put any software fallbacks
 *              in place.
 * @cra_exit: Deinitialize the cryptographic transformation object. This is a
 *              counterpart to @cra_init, used to remove various changes set in
 *              @cra_init.
 * @cra_u.cipher: Union member which contains a single-block symmetric cipher
 *                  definition. See @struct @cipher_alg.
 * @cra_u.compress: Union member which contains a (de)compression algorithm.
 *                    See @struct @compress_alg.
 * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE
 * @cra_list: internally used
 * @cra_users: internally used
 * @cra_refcnt: internally used
 * @cra_destroy: internally used
 *
 * @stats: union of all possible crypto_istat_xxx structures
 * @stats.aead:                statistics for AEAD algorithm
 * @stats.akcipher:        statistics for akcipher algorithm
 * @stats.cipher:        statistics for cipher algorithm
 * @stats.compress:        statistics for compress algorithm
 * @stats.hash:                statistics for hash algorithm
 * @stats.rng:                statistics for rng algorithm
 * @stats.kpp:                statistics for KPP algorithm
 *
 * The struct crypto_alg describes a generic Crypto API algorithm and is common
 * for all of the transformations. Any variable not documented here shall not
 * be used by a cipher implementation as it is internal to the Crypto API.
 */
struct crypto_alg {
        /* List linkage; documented above as internally used. */
        struct list_head cra_list;
        struct list_head cra_users;

        /* Static properties of the transformation. */
        u32 cra_flags;
        unsigned int cra_blocksize;
        unsigned int cra_ctxsize;
        unsigned int cra_alignmask;

        int cra_priority;
        /* Reference count; documented above as internally used. */
        refcount_t cra_refcnt;

        /* Generic algorithm name and unique provider (driver) name. */
        char cra_name[CRYPTO_MAX_ALG_NAME];
        char cra_driver_name[CRYPTO_MAX_ALG_NAME];

        /* Type-wide callbacks; may be empty (see @cra_type above). */
        const struct crypto_type *cra_type;

        /* Also reachable through the cra_cipher / cra_compress macros. */
        union {
                struct cipher_alg cipher;
                struct compress_alg compress;
        } cra_u;

        /* Per-transform setup and teardown; cra_destroy is internal. */
        int (*cra_init)(struct crypto_tfm *tfm);
        void (*cra_exit)(struct crypto_tfm *tfm);
        void (*cra_destroy)(struct crypto_alg *alg);
        
        struct module *cra_module;

#ifdef CONFIG_CRYPTO_STATS
        /* One crypto_istat_xxx structure per algorithm type. */
        union {
                struct crypto_istat_aead aead;
                struct crypto_istat_akcipher akcipher;
                struct crypto_istat_cipher cipher;
                struct crypto_istat_compress compress;
                struct crypto_istat_hash hash;
                struct crypto_istat_rng rng;
                struct crypto_istat_kpp kpp;
        } stats;
#endif /* CONFIG_CRYPTO_STATS */

} CRYPTO_MINALIGN_ATTR;

/*
 * Statistics hooks.
 *
 * With CONFIG_CRYPTO_STATS these are real functions defined elsewhere;
 * without it every hook is an empty inline stub, so callers need no
 * #ifdef of their own.
 *
 * Note that the position of the @alg and @ret arguments is not uniform
 * across the hooks (compare the aead, kpp and rng variants with the rest);
 * check the prototype before adding a call.
 */
#ifdef CONFIG_CRYPTO_STATS
void crypto_stats_init(struct crypto_alg *alg);
void crypto_stats_get(struct crypto_alg *alg);
void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg);
void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
#else
/* CONFIG_CRYPTO_STATS disabled: all hooks below are no-ops. */
static inline void crypto_stats_init(struct crypto_alg *alg)
{}
static inline void crypto_stats_get(struct crypto_alg *alg)
{}
static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
{}
static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
{}
#endif
/*
 * A helper struct for waiting for completion of async crypto ops.
 *
 * @completion: waited on by crypto_wait_req() when the operation is still
 *              in flight
 * @err:        final status, read by crypto_wait_req() after the wait
 */
struct crypto_wait {
        struct completion completion;
        int err;
};

/*
 * Declare and initialise an on-stack struct crypto_wait named @_wait:
 * the completion is set up for stack use and the error field starts at 0.
 */
#define DECLARE_CRYPTO_WAIT(_wait) \
        struct crypto_wait _wait = { \
                .completion = \
                        COMPLETION_INITIALIZER_ONSTACK((_wait).completion), \
                .err = 0, \
        }

/*
 * Async ops completion helper functions
 *
 * crypto_req_done() is defined elsewhere. NOTE(review): presumably it is the
 * completion callback that records @err in a struct crypto_wait and
 * completes it, pairing with crypto_wait_req() below - confirm against the
 * definition.
 */
void crypto_req_done(struct crypto_async_request *req, int err);

/*
 * Turn the return value of an async crypto call into a final status.
 *
 * Any @err other than -EINPROGRESS or -EBUSY is returned unchanged.  For
 * those two values, sleep until @wait is completed, re-arm the completion
 * for the next use and return the status stored in @wait->err.
 */
static inline int crypto_wait_req(int err, struct crypto_wait *wait)
{
        if (err != -EINPROGRESS && err != -EBUSY)
                return err;

        wait_for_completion(&wait->completion);
        reinit_completion(&wait->completion);

        return wait->err;
}

/*
 * Initialise the completion embedded in @wait.  The err field is left
 * untouched.
 */
static inline void crypto_init_wait(struct crypto_wait *wait)
{
        struct completion *done = &wait->completion;

        init_completion(done);
}

/*
 * Algorithm registration interface.
 *
 * The *_algs() variants take an array @algs together with @count.
 * NOTE(review): @count is presumably the number of array entries - confirm
 * against the definitions, which are not in this header.
 */
int crypto_register_alg(struct crypto_alg *alg);
void crypto_unregister_alg(struct crypto_alg *alg);
int crypto_register_algs(struct crypto_alg *algs, int count);
void crypto_unregister_algs(struct crypto_alg *algs, int count);

/*
 * Algorithm query interface.
 *
 * Used by crypto_has_cipher() and crypto_has_comp() below, which force the
 * type bits in @type and @mask before calling it.
 */
int crypto_has_alg(const char *name, u32 type, u32 mask);

/*
 * Transforms: user-instantiated objects which encapsulate algorithms
 * and core processing logic.  Managed via crypto_alloc_*() and
 * crypto_free_*(), as well as the various helpers below.
 */

struct crypto_tfm {

        /* Read and modified through crypto_tfm_{get,set,clear}_flags(). */
        u32 crt_flags;

        /* NOTE(review): presumably a NUMA node id - not used in this header. */
        int node;
        
        /* NOTE(review): teardown hook; its caller is not in this header. */
        void (*exit)(struct crypto_tfm *tfm);
        
        /* Algorithm behind this transform; read by crypto_tfm_alg_*(). */
        struct crypto_alg *__crt_alg;

        /* Trailing context area; returned by crypto_tfm_ctx(). */
        void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
};

/*
 * Single block cipher handle.  It wraps a struct crypto_tfm as its only
 * member, which is what lets __crypto_cipher_cast() convert by pointer cast.
 */
struct crypto_cipher {
        struct crypto_tfm base;
};

/*
 * Compression handle.  It wraps a struct crypto_tfm as its only member,
 * which is what lets __crypto_comp_cast() convert by pointer cast.
 */
struct crypto_comp {
        struct crypto_tfm base;
};

/*
 * Attribute type identifiers.  __CRYPTOA_MAX is a sentinel that must stay
 * last; CRYPTOA_MAX below is derived from it.
 */
enum {
        CRYPTOA_UNSPEC,
        CRYPTOA_ALG,
        CRYPTOA_TYPE,
        CRYPTOA_U32,
        __CRYPTOA_MAX,
};

/* Highest valid attribute type value (currently CRYPTOA_U32). */
#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)

/* Maximum number of (rtattr) parameters for each template. */
#define CRYPTO_MAX_ATTRS 32

/*
 * Attribute payloads.  NOTE(review): by name these look like the payloads
 * for CRYPTOA_ALG, CRYPTOA_TYPE and CRYPTOA_U32 respectively - confirm
 * against the template code that parses them.
 */

/* Algorithm name, at most CRYPTO_MAX_ALG_NAME bytes. */
struct crypto_attr_alg {
        char name[CRYPTO_MAX_ALG_NAME];
};

/* A type/mask pair. */
struct crypto_attr_type {
        u32 type;
        u32 mask;
};

/* A single 32-bit number. */
struct crypto_attr_u32 {
        u32 num;
};

/*
 * Transform user interface.
 *
 * crypto_alloc_base() is what crypto_alloc_cipher() and crypto_alloc_comp()
 * below call after forcing the type bits in @type and @mask.
 * crypto_destroy_tfm() takes both the memory pointer and the tfm;
 * crypto_free_tfm() passes the same pointer for both.
 */
 
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);

/*
 * Free a transform allocated as a bare struct crypto_tfm.
 *
 * The tfm is itself the start of the allocation, so it is passed to
 * crypto_destroy_tfm() both as the memory to release and as the transform
 * to tear down.
 */
static inline void crypto_free_tfm(struct crypto_tfm *tfm)
{
        /*
         * No "return" here: ISO C forbids a return statement with an
         * expression in a function whose return type is void.
         */
        crypto_destroy_tfm(tfm, tfm);
}

/*
 * NOTE(review): presumably the algorithm self-test entry point for @driver /
 * @alg - defined elsewhere, confirm semantics and return values there.
 */
int alg_test(const char *driver, const char *alg, u32 type, u32 mask);

/*
 * Transform helpers which query the underlying algorithm.
 */
static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_name;
}

static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_driver_name;
}

static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_priority;
}

/* Return the type bits (CRYPTO_ALG_TYPE_MASK) of the algorithm's flags. */
static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
{
        u32 alg_flags = tfm->__crt_alg->cra_flags;

        return alg_flags & CRYPTO_ALG_TYPE_MASK;
}

static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_blocksize;
}

static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_alignmask;
}

/* Return the current transform flags of @tfm. */
static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm)
{
        u32 flags = tfm->crt_flags;

        return flags;
}

/* Set the bits given in @flags in the transform flags of @tfm. */
static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags)
{
        tfm->crt_flags = tfm->crt_flags | flags;
}

/* Clear the bits given in @flags from the transform flags of @tfm. */
static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags)
{
        tfm->crt_flags = tfm->crt_flags & ~flags;
}

/* Return a pointer to the context area that trails @tfm. */
static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
{
        void *ctx = tfm->__crt_ctx;

        return ctx;
}

/*
 * Return the alignment of the __crt_ctx member of struct crypto_tfm.
 * The operand of __alignof__ is not evaluated, so no object is needed.
 */
static inline unsigned int crypto_tfm_ctx_alignment(void)
{
        return __alignof__(((struct crypto_tfm *)0)->__crt_ctx);
}

/**
 * DOC: Single Block Cipher API
 *
 * The single block cipher API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
 *
 * Using the single block cipher API calls, operations with the basic cipher
 * primitive can be implemented. These cipher primitives exclude any block
 * chaining operations including IV handling.
 *
 * The purpose of this single block cipher API is to support the implementation
 * of templates or other concepts that only need to perform the cipher operation
 * on one block at a time. Templates invoke the underlying cipher primitive
 * block-wise and process either the input or the output data of these cipher
 * operations.
 */

/* Reinterpret a generic transform pointer as a single block cipher handle. */
static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
{
        struct crypto_cipher *cipher = (struct crypto_cipher *)tfm;

        return cipher;
}

/**
 * crypto_alloc_cipher() - allocate single block cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             single block cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a handle for a single block cipher. The type bits of @type are
 * replaced with CRYPTO_ALG_TYPE_CIPHER and all type bits are added to @mask
 * before the lookup. The returned struct crypto_cipher is required for any
 * subsequent API invocation for that single block cipher.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
                                                        u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;
        struct crypto_tfm *tfm;

        tfm = crypto_alloc_base(alg_name, cipher_type, cipher_mask);

        return __crypto_cipher_cast(tfm);
}

/* Return the generic transform embedded in the cipher handle @tfm. */
static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_cipher() - zeroize and free the single block cipher handle
 * @tfm: cipher handle to be freed
 *
 * Releases the handle through crypto_free_tfm() on its embedded transform.
 */
static inline void crypto_free_cipher(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_free_tfm(base);
}

/**
 * crypto_has_cipher() - Search for the availability of a single block cipher
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             single block cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * The type bits of @type are replaced with CRYPTO_ALG_TYPE_CIPHER and all
 * type bits are added to @mask before crypto_has_alg() is queried.
 *
 * Return: true when the single block cipher is known to the kernel crypto API;
 *           false otherwise
 */
static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;

        return crypto_has_alg(alg_name, cipher_type, cipher_mask);
}

/**
 * crypto_cipher_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * Returns the block size of the single block cipher referenced by @tfm. The
 * caller may use it to size the buffers handed to the encryption or
 * decryption operation.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/* Return the alignment mask of the algorithm behind the cipher handle. */
static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/* Return the transform flags of the cipher handle. */
static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Set the bits in @flags on the cipher handle's transform flags. */
static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
                                           u32 flags)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Clear the bits in @flags from the cipher handle's transform flags. */
static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
                                             u32 flags)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_cipher_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the single block cipher referenced by the
 * cipher handle.
 *
 * Note, the key length determines the cipher type. Many block ciphers implement
 * different cipher modes depending on the key size, such as AES-128 vs. AES-192
 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
 * is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_cipher_setkey(struct crypto_cipher *tfm,
                         const u8 *key, unsigned int keylen);

/**
 * crypto_cipher_encrypt_one() - encrypt one block of plaintext
 * @tfm: cipher handle
 * @dst: points to the buffer that will be filled with the ciphertext
 * @src: buffer holding the plaintext to be encrypted
 *
 * Invoke the encryption operation of one block. The caller must ensure that
 * the plaintext and ciphertext buffers are at least one block in size (see
 * crypto_cipher_blocksize()).
 */
void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src);

/**
 * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
 * @tfm: cipher handle
 * @dst: points to the buffer that will be filled with the plaintext
 * @src: buffer holding the ciphertext to be decrypted
 *
 * Invoke the decryption operation of one block. The caller must ensure that
 * the plaintext and ciphertext buffers are at least one block in size (see
 * crypto_cipher_blocksize()).
 */
void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src);

/* Reinterpret a generic transform pointer as a compression handle. */
static inline struct crypto_comp *__crypto_comp_cast(struct crypto_tfm *tfm)
{
        struct crypto_comp *comp = (struct crypto_comp *)tfm;

        return comp;
}

/*
 * Allocate a compression handle for @alg_name.  The type bits of @type are
 * replaced with CRYPTO_ALG_TYPE_COMPRESS and all type bits are added to
 * @mask before calling crypto_alloc_base(); its result is returned cast to
 * struct crypto_comp.
 */
static inline struct crypto_comp *crypto_alloc_comp(const char *alg_name,
                                                    u32 type, u32 mask)
{
        u32 comp_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                        CRYPTO_ALG_TYPE_COMPRESS;
        u32 comp_mask = mask | CRYPTO_ALG_TYPE_MASK;
        struct crypto_tfm *tfm;

        tfm = crypto_alloc_base(alg_name, comp_type, comp_mask);

        return __crypto_comp_cast(tfm);
}

/* Return the generic transform embedded in the compression handle @tfm. */
static inline struct crypto_tfm *crypto_comp_tfm(struct crypto_comp *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/* Free a compression handle through crypto_free_tfm() on its base. */
static inline void crypto_free_comp(struct crypto_comp *tfm)
{
        struct crypto_tfm *base = crypto_comp_tfm(tfm);

        crypto_free_tfm(base);
}

/*
 * Query crypto_has_alg() for a compression algorithm named @alg_name.  The
 * type bits of @type are replaced with CRYPTO_ALG_TYPE_COMPRESS and all
 * type bits are added to @mask first.
 */
static inline int crypto_has_comp(const char *alg_name, u32 type, u32 mask)
{
        u32 comp_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                        CRYPTO_ALG_TYPE_COMPRESS;
        u32 comp_mask = mask | CRYPTO_ALG_TYPE_MASK;

        return crypto_has_alg(alg_name, comp_type, comp_mask);
}

/* Return the generic algorithm name behind the compression handle. */
static inline const char *crypto_comp_name(struct crypto_comp *tfm)
{
        struct crypto_tfm *base = crypto_comp_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/*
 * (De)compression entry points, defined elsewhere.
 *
 * @src/@slen describe the input buffer and @dst the output buffer.
 * NOTE(review): @dlen is presumably in/out (capacity of @dst on entry,
 * bytes produced on return) and the return value 0 or a negative errno -
 * confirm against the definitions.
 */
int crypto_comp_compress(struct crypto_comp *tfm,
                         const u8 *src, unsigned int slen,
                         u8 *dst, unsigned int *dlen);

int crypto_comp_decompress(struct crypto_comp *tfm,
                           const u8 *src, unsigned int slen,
                           u8 *dst, unsigned int *dlen);

#endif        /* _LINUX_CRYPTO_H */























    1 





























    1 

    1 
    1 







    1 





    1 














































    1 












    1 




    1 





















































































































































































    1 


















    1 









    1 















    1 



    1 











































































    1 





    1 




















































































































































    1 
    1 










    1 


    1 












    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"

#ifdef CONFIG_MEMCG_KMEM
/*
 * List of registered lrus.  list_lru_register() and list_lru_unregister()
 * add to and remove from it while holding list_lrus_mutex.
 */
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);

/* Link @lru into the list_lrus list under list_lrus_mutex. */
static void list_lru_register(struct list_lru *lru)
{
        struct list_head *entry = &lru->list;

        mutex_lock(&list_lrus_mutex);
        list_add(entry, &list_lrus);
        mutex_unlock(&list_lrus_mutex);
}

/* Unlink @lru from the list_lrus list under list_lrus_mutex. */
static void list_lru_unregister(struct list_lru *lru)
{
        struct list_head *entry = &lru->list;

        mutex_lock(&list_lrus_mutex);
        list_del(entry);
        mutex_unlock(&list_lrus_mutex);
}

/* Return the shrinker id stored in @lru. */
static int lru_shrinker_id(struct list_lru *lru)
{
        int id = lru->shrinker_id;

        return id;
}

/* Report whether @lru has its memcg_aware flag set. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        bool aware = lru->memcg_aware;

        return aware;
}

/*
 * Pick the list for cgroup index @idx on node @nlru.  Falls back to the
 * node's own list when there is no per-cgroup array or @idx is negative.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
        struct list_lru_memcg *per_memcg;

        /*
         * The per cgroup array may be relocated (see
         * memcg_update_list_lru_node); holding either the node lock or
         * the RCU read lock keeps it stable.
         */
        per_memcg = rcu_dereference_check(nlru->memcg_lrus,
                                          lockdep_is_held(&nlru->lock));
        if (!per_memcg || idx < 0)
                return &nlru->lru;

        return per_memcg->lru[idx];
}

/*
 * Find the list that the object at @ptr belongs on for node @nlru.
 *
 * Uses the node's own list unless the node has per-cgroup lists and a memcg
 * can be resolved for @ptr.  When @memcg_ptr is non-NULL it receives the
 * resolved memcg, or NULL if none was found.
 */
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
                   struct mem_cgroup **memcg_ptr)
{
        struct mem_cgroup *owner = NULL;
        struct list_lru_one *one = &nlru->lru;

        if (nlru->memcg_lrus) {
                owner = mem_cgroup_from_obj(ptr);
                if (owner)
                        one = list_lru_from_memcg_idx(nlru,
                                                      memcg_cache_id(owner));
        }

        if (memcg_ptr)
                *memcg_ptr = owner;

        return one;
}
#else
/* No-op when CONFIG_MEMCG_KMEM is not set. */
static void list_lru_register(struct list_lru *lru)
{
}

/* No-op when CONFIG_MEMCG_KMEM is not set. */
static void list_lru_unregister(struct list_lru *lru)
{
}

/* Without CONFIG_MEMCG_KMEM the shrinker id is always reported as -1. */
static int lru_shrinker_id(struct list_lru *lru)
{
        return -1;
}

/* Without CONFIG_MEMCG_KMEM no lru is reported as memcg aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        return false;
}

/* Without CONFIG_MEMCG_KMEM @idx is ignored: always the node's own list. */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
        return &nlru->lru;
}

/*
 * Without CONFIG_MEMCG_KMEM every object lives on the node's own list;
 * @ptr is ignored and *@memcg_ptr, if requested, is set to NULL.
 */
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
                   struct mem_cgroup **memcg_ptr)
{
        struct list_lru_one *one = &nlru->lru;

        if (memcg_ptr != NULL)
                *memcg_ptr = NULL;

        return one;
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Add @item to the tail of the lru list for the NUMA node of the page
 * backing @item.
 *
 * Nothing is done if @item is already linked somewhere (list_empty() is
 * false).  When the destination list goes from empty to non-empty the
 * shrinker bit for the owning memcg is set.
 *
 * Returns true if the item was added, false otherwise.
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        struct list_lru_node *node = &lru->node[nid];
        struct mem_cgroup *memcg;
        struct list_lru_one *one;
        bool added = false;

        spin_lock(&node->lock);
        if (!list_empty(item))
                goto unlock;

        one = list_lru_from_kmem(node, item, &memcg);
        list_add_tail(item, &one->list);
        /* First element on this list: flag the shrinker */
        if (one->nr_items++ == 0)
                memcg_set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
        node->nr_items++;
        added = true;
unlock:
        spin_unlock(&node->lock);
        return added;
}
EXPORT_SYMBOL_GPL(list_lru_add);

/*
 * Remove @item from the lru list of the NUMA node of the page backing it.
 *
 * Nothing is done if @item is not linked (list_empty() is true).
 *
 * Returns true if the item was removed, false otherwise.
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        struct list_lru_node *node = &lru->node[nid];
        struct list_lru_one *one;
        bool removed = false;

        spin_lock(&node->lock);
        if (list_empty(item))
                goto unlock;

        one = list_lru_from_kmem(node, item, NULL);
        list_del_init(item);
        one->nr_items--;
        node->nr_items--;
        removed = true;
unlock:
        spin_unlock(&node->lock);
        return removed;
}
EXPORT_SYMBOL_GPL(list_lru_del);

/*
 * Unlink @item, leaving it re-initialised, and drop the item count of
 * @list by one.  No locking is done here.
 */
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
        list_del_init(item);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate);

/*
 * Move @item onto @head and drop the item count of @list by one.  No
 * locking is done here.
 */
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head)
{
        list_move(item, head);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);

/*
 * Return the number of items on the list selected by @nid and @memcg.
 * The count is sampled with READ_ONCE() under the RCU read lock only, so
 * it is a snapshot.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
                                 int nid, struct mem_cgroup *memcg)
{
        struct list_lru_node *node = &lru->node[nid];
        struct list_lru_one *one;
        unsigned long nr;

        rcu_read_lock();
        one = list_lru_from_memcg_idx(node, memcg_cache_id(memcg));
        nr = READ_ONCE(one->nr_items);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);

/* Return the total item count kept for node @nid of @lru (no locking). */
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
        return lru->node[nid].nr_items;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);

/*
 * Walk the list selected by @memcg_idx on @nlru, calling @isolate on each
 * item until the list is exhausted or *@nr_to_walk reaches zero.
 *
 * @isolate is handed the item, its list, the node lock and @cb_arg; its
 * return value decides what happens to the item (see the switch below).
 * *@nr_to_walk is decremented once per callback invocation, including
 * invocations that end in a restart.
 *
 * Every caller visible in this file takes nlru->lock before calling; the
 * *_RETRY cases assert that the lock is held again when the callback
 * returns.
 *
 * Only the per-node counter is adjusted here.  NOTE(review): the per-list
 * nr_items is presumably dropped by the callback through list_lru_isolate()
 * or list_lru_isolate_move() - confirm against the callbacks.
 *
 * Returns the number of items for which the callback reported removal.
 */
static unsigned long
__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
                    list_lru_walk_cb isolate, void *cb_arg,
                    unsigned long *nr_to_walk)
{

        struct list_lru_one *l;
        struct list_head *item, *n;
        unsigned long isolated = 0;

        l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
        list_for_each_safe(item, n, &l->list) {
                enum lru_status ret;

                /*
                 * decrement nr_to_walk first so that we don't livelock if we
                 * get stuck on large numbers of LRU_RETRY items
                 */
                if (!*nr_to_walk)
                        break;
                --*nr_to_walk;

                ret = isolate(item, l, &nlru->lock, cb_arg);
                switch (ret) {
                case LRU_REMOVED_RETRY:
                        /* Item removed, but the lock was dropped meanwhile. */
                        assert_spin_locked(&nlru->lock);
                        fallthrough;
                case LRU_REMOVED:
                        isolated++;
                        nlru->nr_items--;
                        /*
                         * If the lru lock has been dropped, our list
                         * traversal is now invalid and so we have to
                         * restart from scratch.
                         */
                        if (ret == LRU_REMOVED_RETRY)
                                goto restart;
                        break;
                case LRU_ROTATE:
                        /* Keep the item but move it to the tail of the list. */
                        list_move_tail(item, &l->list);
                        break;
                case LRU_SKIP:
                        /* Leave the item where it is. */
                        break;
                case LRU_RETRY:
                        /*
                         * The lru lock has been dropped, our list traversal is
                         * now invalid and so we have to restart from scratch.
                         */
                        assert_spin_locked(&nlru->lock);
                        goto restart;
                default:
                        /* Any other status from the callback is a bug. */
                        BUG();
                }
        }
        return isolated;
}

/*
 * Walk a single list of @lru on node @nid with the node's lock held.
 *
 * The list is the one selected by memcg_cache_id(@memcg). @isolate, @cb_arg
 * and @nr_to_walk are handed through to __list_lru_walk_one() unchanged, and
 * its return value is returned to the caller. The lock is taken with plain
 * spin_lock(), i.e. interrupts are not disabled here.
 */
unsigned long
list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                  list_lru_walk_cb isolate, void *cb_arg,
                  unsigned long *nr_to_walk)
{
        struct list_lru_node *node = lru->node + nid;
        unsigned long nr_isolated;
        int idx;

        spin_lock(&node->lock);
        idx = memcg_cache_id(memcg);
        nr_isolated = __list_lru_walk_one(node, idx, isolate, cb_arg,
                                          nr_to_walk);
        spin_unlock(&node->lock);

        return nr_isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);

/*
 * Same walk as list_lru_walk_one(), but the node lock is taken with
 * spin_lock_irq()/spin_unlock_irq(), so interrupts are disabled for the
 * duration of the walk and unconditionally re-enabled afterwards.
 */
unsigned long
list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                      list_lru_walk_cb isolate, void *cb_arg,
                      unsigned long *nr_to_walk)
{
        struct list_lru_node *node = lru->node + nid;
        unsigned long nr_isolated;
        int idx;

        spin_lock_irq(&node->lock);
        idx = memcg_cache_id(memcg);
        nr_isolated = __list_lru_walk_one(node, idx, isolate, cb_arg,
                                          nr_to_walk);
        spin_unlock_irq(&node->lock);

        return nr_isolated;
}

/*
 * Walk every list of @lru that belongs to node @nid.
 *
 * The list selected by a NULL memcg is walked first through
 * list_lru_walk_one(). If budget remains in *@nr_to_walk and the lru is
 * memcg aware, each memcg cache index is then walked in turn, taking and
 * dropping the node lock around every individual list, until the budget
 * reaches zero.
 *
 * Returns the total number of items the walks reported as isolated.
 * *@nr_to_walk is decremented by the callee as items are visited.
 */
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        /* The node does not change across memcg indices; look it up once. */
        struct list_lru_node *nlru = &lru->node[nid];
        /* Unsigned to match both the callee's and our own return type. */
        unsigned long isolated;
        int memcg_idx;

        isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                     nr_to_walk);

        /* *nr_to_walk is unsigned: "exhausted" simply means zero. */
        if (!*nr_to_walk || !list_lru_memcg_aware(lru))
                return isolated;

        for_each_memcg_cache_index(memcg_idx) {
                spin_lock(&nlru->lock);
                isolated += __list_lru_walk_one(nlru, memcg_idx,
                                                isolate, cb_arg,
                                                nr_to_walk);
                spin_unlock(&nlru->lock);

                if (!*nr_to_walk)
                        break;
        }
        return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

/* Put @l into its empty state: zero item count and an empty list head. */
static void init_one_lru(struct list_lru_one *l)
{
        l->nr_items = 0;
        INIT_LIST_HEAD(&l->list);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * kfree() the list_lru_one objects stored in slots [@begin, @end) of
 * @memcg_lrus. The slots themselves are not cleared and the array is not
 * freed.
 */
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
                                          int begin, int end)
{
        int idx = begin;

        while (idx < end) {
                kfree(memcg_lrus->lru[idx]);
                idx++;
        }
}

/*
 * Allocate and initialise a list_lru_one for every slot in [@begin, @end)
 * of @memcg_lrus.
 *
 * Returns 0 on success. If an allocation fails, the entries created so far
 * by this call are freed again and -ENOMEM is returned.
 */
static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
                                      int begin, int end)
{
        int idx;

        for (idx = begin; idx < end; idx++) {
                struct list_lru_one *one;

                one = kmalloc(sizeof(*one), GFP_KERNEL);
                if (one) {
                        init_one_lru(one);
                        memcg_lrus->lru[idx] = one;
                        continue;
                }

                /* Slot idx was never filled, so unwind [begin, idx). */
                __memcg_destroy_list_lru_node(memcg_lrus, begin, idx);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Build the per-memcg list array for @nlru, sized for the current
 * memcg_nr_cache_ids, and install it in nlru->memcg_lrus.
 *
 * Returns 0 on success or -ENOMEM, in which case nothing is installed.
 */
static int memcg_init_list_lru_node(struct list_lru_node *nlru)
{
        int nr_ids = memcg_nr_cache_ids;
        struct list_lru_memcg *mlrus;

        mlrus = kvmalloc(sizeof(*mlrus) + nr_ids * sizeof(void *),
                         GFP_KERNEL);
        if (!mlrus)
                return -ENOMEM;

        if (__memcg_init_list_lru_node(mlrus, 0, nr_ids) != 0)
                goto free_array;

        RCU_INIT_POINTER(nlru->memcg_lrus, mlrus);
        return 0;

free_array:
        kvfree(mlrus);
        return -ENOMEM;
}

/*
 * Free every per-memcg list of @nlru (slots 0..memcg_nr_cache_ids) and then
 * the array that held them.
 */
static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
{
        /*
         * By the time we get here the shrinker has been unregistered and
         * nobody can use the lru any more, so the array is freed directly
         * rather than being deferred through kvfree_rcu_local().
         */
        struct list_lru_memcg *mlrus =
                rcu_dereference_protected(nlru->memcg_lrus, true);

        __memcg_destroy_list_lru_node(mlrus, 0, memcg_nr_cache_ids);
        kvfree(mlrus);
}

/*
 * RCU callback: @head is the rcu member embedded in a struct list_lru_memcg;
 * recover the containing object and kvfree() it.
 */
static void kvfree_rcu_local(struct rcu_head *head)
{
        kvfree(container_of(head, struct list_lru_memcg, rcu));
}

/*
 * Replace @nlru's per-memcg list array with one that has room for @new_size
 * entries.
 *
 * The first @old_size pointers are copied over from the current array,
 * slots [@old_size, @new_size) get freshly allocated lists, and the new
 * array is then published under nlru->lock. The old array is handed to
 * call_rcu(); only the array itself is freed there, the list_lru_one objects
 * it pointed to are now referenced from the new array.
 *
 * The rcu_dereference_protected() annotation expects list_lrus_mutex to be
 * held by the caller.
 *
 * Returns 0 on success, or -ENOMEM if an allocation fails, in which case
 * nlru->memcg_lrus is left untouched.
 */
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
                                      int old_size, int new_size)
{
        struct list_lru_memcg *old, *new;

        /* This path only ever grows the array. */
        BUG_ON(old_size > new_size);

        old = rcu_dereference_protected(nlru->memcg_lrus,
                                        lockdep_is_held(&list_lrus_mutex));
        new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        /* Only the additional slots need new list_lru_one objects. */
        if (__memcg_init_list_lru_node(new, old_size, new_size)) {
                kvfree(new);
                return -ENOMEM;
        }

        /* Carry the existing list pointers over; the lists are shared, not copied. */
        memcpy(&new->lru, &old->lru, old_size * sizeof(void *));

        /*
         * The locking below allows readers that hold nlru->lock to avoid
         * taking rcu_read_lock (see list_lru_from_memcg_idx).
         *
         * Since list_lru_{add,del} may be called under an IRQ-safe lock,
         * we have to use IRQ-safe primitives here to avoid deadlock.
         */
        spin_lock_irq(&nlru->lock);
        rcu_assign_pointer(nlru->memcg_lrus, new);
        spin_unlock_irq(&nlru->lock);

        /* Free the old array through kvfree_rcu_local() via call_rcu(). */
        call_rcu(&old->rcu, kvfree_rcu_local);
        return 0;
}

/*
 * Undo the list allocations of a previous size update on @nlru by freeing
 * the lists in slots [@old_size, @new_size) of the currently installed array.
 */
static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
                                              int old_size, int new_size)
{
        struct list_lru_memcg *mlrus =
                rcu_dereference_protected(nlru->memcg_lrus,
                                          lockdep_is_held(&list_lrus_mutex));

        /*
         * The array itself is deliberately left at its enlarged size:
         * shrinking it back would need an allocation, and a failure of that
         * allocation cannot be handled here.
         */
        __memcg_destroy_list_lru_node(mlrus, old_size, new_size);
}

/*
 * Record @memcg_aware in @lru and, if it is set, create the per-memcg list
 * array on every node.
 *
 * Returns 0 on success. On allocation failure the arrays already created
 * on earlier nodes are destroyed again and -ENOMEM is returned.
 */
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
        int i;

        lru->memcg_aware = memcg_aware;
        if (!memcg_aware)
                return 0;

        for_each_node(i) {
                if (memcg_init_list_lru_node(&lru->node[i]))
                        goto unwind;
        }
        return 0;

unwind:
        /* Walk back over the indices below the one that failed. */
        while (--i >= 0) {
                if (lru->node[i].memcg_lrus)
                        memcg_destroy_list_lru_node(&lru->node[i]);
        }
        return -ENOMEM;
}

static void memcg_destroy_list_lru(struct list_lru *lru)
{
        int i;

        if (!list_lru_memcg_aware(lru))
                return;

        for_each_node(i)
                memcg_destroy_list_lru_node(&lru->node[i]);
}

/*
 * Grow the per-memcg list array of every node of @lru from @old_size to
 * @new_size entries. Does nothing (and succeeds) for a non-memcg-aware lru.
 *
 * Returns 0 on success. If a node fails, the update is cancelled on the
 * nodes that were already processed and -ENOMEM is returned.
 */
static int memcg_update_list_lru(struct list_lru *lru,
                                 int old_size, int new_size)
{
        int i;

        if (!list_lru_memcg_aware(lru))
                return 0;

        for_each_node(i) {
                if (memcg_update_list_lru_node(&lru->node[i],
                                               old_size, new_size))
                        goto unwind;
        }
        return 0;

unwind:
        /* Walk back over the indices below the one that failed. */
        while (--i >= 0) {
                if (lru->node[i].memcg_lrus)
                        memcg_cancel_update_list_lru_node(&lru->node[i],
                                                          old_size, new_size);
        }
        return -ENOMEM;
}

static void memcg_cancel_update_list_lru(struct list_lru *lru,
                                         int old_size, int new_size)
{
        int i;

        if (!list_lru_memcg_aware(lru))
                return;

        for_each_node(i)
                memcg_cancel_update_list_lru_node(&lru->node[i],
                                                  old_size, new_size);
}

/*
 * Resize the per-memcg arrays of every lru on the list_lrus list from the
 * current memcg_nr_cache_ids to @new_size, under list_lrus_mutex.
 *
 * Returns 0 on success. If one lru fails, the update is cancelled on all
 * lrus processed before it and the error of the failing update is returned.
 */
int memcg_update_all_list_lrus(int new_size)
{
        int old_size = memcg_nr_cache_ids;
        struct list_lru *lru;
        int err = 0;

        mutex_lock(&list_lrus_mutex);

        list_for_each_entry(lru, &list_lrus, list) {
                err = memcg_update_list_lru(lru, old_size, new_size);
                if (err)
                        break;
        }

        if (err) {
                /* lru is the entry that failed; roll back the ones before it. */
                list_for_each_entry_continue_reverse(lru, &list_lrus, list)
                        memcg_cancel_update_list_lru(lru, old_size, new_size);
        }

        mutex_unlock(&list_lrus_mutex);
        return err;
}

/*
 * On node @nid of @lru, move everything from the list at index @src_idx
 * onto the list at index @dst_memcg->kmemcg_id, transferring the item
 * count as well.
 */
static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
                                      int src_idx, struct mem_cgroup *dst_memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *from, *to;
        int dst_idx = dst_memcg->kmemcg_id;

        /*
         * list_lru_{add,del} may run under an IRQ-safe lock, so the node
         * lock must be taken with the IRQ-safe primitives to avoid deadlock.
         */
        spin_lock_irq(&nlru->lock);

        from = list_lru_from_memcg_idx(nlru, src_idx);
        to = list_lru_from_memcg_idx(nlru, dst_idx);

        list_splice_init(&from->list, &to->list);

        if (from->nr_items != 0) {
                to->nr_items += from->nr_items;
                /* The destination gained items: set its shrinker bit. */
                memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
                from->nr_items = 0;
        }

        spin_unlock_irq(&nlru->lock);
}

static void memcg_drain_list_lru(struct list_lru *lru,
                                 int src_idx, struct mem_cgroup *dst_memcg)
{
        int i;

        if (!list_lru_memcg_aware(lru))
                return;

        for_each_node(i)
                memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}

void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
{
        struct list_lru *lru;

        mutex_lock(&list_lrus_mutex);
        list_for_each_entry(lru, &list_lrus, list)
                memcg_drain_list_lru(lru, src_idx, dst_memcg);
        mutex_unlock(&list_lrus_mutex);
}
#else
/*
 * !CONFIG_MEMCG_KMEM stub: there are no per-memcg lists to set up, so this
 * always reports success. Both arguments are ignored.
 */
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
        return 0;
}

/* !CONFIG_MEMCG_KMEM stub: nothing was allocated, so there is nothing to free. */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Initialise @lru.
 *
 * Allocates one list_lru_node per node id, initialises each node's lock
 * (assigning lockdep class @key when one is given) and its list, sets up
 * the per-memcg arrays through memcg_init_list_lru(), and finally registers
 * the lru. With CONFIG_MEMCG_KMEM, lru->shrinker_id is taken from @shrinker,
 * or set to -1 when @shrinker is NULL.
 *
 * Returns 0 on success or a negative errno. On failure lru->node is NULL,
 * so a later list_lru_destroy() is harmless.
 */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
                    struct lock_class_key *key, struct shrinker *shrinker)
{
        int i;
        int ret;

#ifdef CONFIG_MEMCG_KMEM
        lru->shrinker_id = shrinker ? shrinker->id : -1;
#endif
        memcg_get_cache_ids();

        lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
        if (!lru->node) {
                ret = -ENOMEM;
                goto out;
        }

        for_each_node(i) {
                spin_lock_init(&lru->node[i].lock);
                if (key)
                        lockdep_set_class(&lru->node[i].lock, key);
                init_one_lru(&lru->node[i].lru);
        }

        ret = memcg_init_list_lru(lru, memcg_aware);
        if (!ret) {
                list_lru_register(lru);
                goto out;
        }

        kfree(lru->node);
        /* Leave a NULL behind so that list_lru_destroy() doesn't crash. */
        lru->node = NULL;
out:
        memcg_put_cache_ids();
        return ret;
}
EXPORT_SYMBOL_GPL(__list_lru_init);

/*
 * Tear down @lru: unregister it, free the per-memcg arrays and the node
 * array, and reset lru->node to NULL (and, with CONFIG_MEMCG_KMEM, the
 * shrinker id to -1). Safe to call on an lru whose node pointer is NULL.
 */
void list_lru_destroy(struct list_lru *lru)
{
        /* A NULL node array means never initialised or already destroyed. */
        if (lru->node) {
                memcg_get_cache_ids();

                list_lru_unregister(lru);
                memcg_destroy_list_lru(lru);

                kfree(lru->node);
                lru->node = NULL;
#ifdef CONFIG_MEMCG_KMEM
                lru->shrinker_id = -1;
#endif
                memcg_put_cache_ids();
        }
}
EXPORT_SYMBOL_GPL(list_lru_destroy);

























































































    5 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>

#ifndef nr_cpus_node
/* Default unless already defined: the weight of cpumask_of_node(node). */
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

/*
 * Iterate with for_each_online_node() and run the loop body only for nodes
 * where nr_cpus_node() is non-zero. The expansion ends in a bare "if", so
 * an "else" written directly after the loop body would attach to that "if".
 */
#define for_each_node_with_cpus(node)                        \
        for_each_online_node(node)                        \
                if (nr_cpus_node(node))

int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE                10
#define REMOTE_DISTANCE                20
#define DISTANCE_BITS           8
/*
 * Fallback used when node_distance() is not already defined: a node is
 * LOCAL_DISTANCE from itself and REMOTE_DISTANCE from every other node.
 */
#ifndef node_distance
#define node_distance(from,to)        ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
#define RECLAIM_DISTANCE 30
#endif

/*
 * The following tunable allows platforms to override the default node
 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 * sufficiently fast that the default value actually hurts
 * performance.
 *
 * AMD EPYC machines use this because even though the 2-hop distance
 * is 32 (3.2x slower than a local memory access) performance actually
 * *improves* if allowed to reclaim memory and load balance tasks
 * between NUMA nodes 2-hops apart.
 */
extern int __read_mostly node_reclaim_distance;

#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS        (1)
#endif

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);

#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
        /* Value of the numa_node per-CPU variable as read by raw_cpu_read(). */
        int nid = raw_cpu_read(numa_node);

        return nid;
}
#endif

#ifndef cpu_to_node
/* Return the numa_node per-CPU value that belongs to @cpu. */
static inline int cpu_to_node(int cpu)
{
        int nid = per_cpu(numa_node, cpu);

        return nid;
}
#endif

#ifndef set_numa_node
/* Store @node in the numa_node per-CPU variable via this_cpu_write(). */
static inline void set_numa_node(int node)
{
        this_cpu_write(numa_node, node);
}
#endif

#ifndef set_cpu_numa_node
/* Store @node in the numa_node per-CPU variable that belongs to @cpu. */
static inline void set_cpu_numa_node(int cpu, int node)
{
        per_cpu(numa_node, cpu) = node;
}
#endif

#else        /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */

/* Returns the number of the current Node. */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
        /* No per-CPU node variable in this configuration: map the CPU id instead. */
        int cpu = raw_smp_processor_id();

        return cpu_to_node(cpu);
}
#endif

#endif        /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */

#ifdef CONFIG_HAVE_MEMORYLESS_NODES

/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
 */
DECLARE_PER_CPU(int, _numa_mem_);

#ifndef set_numa_mem
/* Store @node in the _numa_mem_ per-CPU variable via this_cpu_write(). */
static inline void set_numa_mem(int node)
{
        this_cpu_write(_numa_mem_, node);
}
#endif

#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
        /* Value of the _numa_mem_ per-CPU variable as read by raw_cpu_read(). */
        int nid = raw_cpu_read(_numa_mem_);

        return nid;
}
#endif

#ifndef cpu_to_mem
/* Return the _numa_mem_ per-CPU value that belongs to @cpu. */
static inline int cpu_to_mem(int cpu)
{
        int nid = per_cpu(_numa_mem_, cpu);

        return nid;
}
#endif

#ifndef set_cpu_numa_mem
/* Store @node in the _numa_mem_ per-CPU variable that belongs to @cpu. */
static inline void set_cpu_numa_mem(int cpu, int node)
{
        per_cpu(_numa_mem_, cpu) = node;
}
#endif

#else        /* !CONFIG_HAVE_MEMORYLESS_NODES */

#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
        /* Without CONFIG_HAVE_MEMORYLESS_NODES this is simply numa_node_id(). */
        int nid = numa_node_id();

        return nid;
}
#endif

#ifndef cpu_to_mem
/* Without CONFIG_HAVE_MEMORYLESS_NODES this is simply cpu_to_node(@cpu). */
static inline int cpu_to_mem(int cpu)
{
        int nid = cpu_to_node(cpu);

        return nid;
}
#endif

#endif        /* [!]CONFIG_HAVE_MEMORYLESS_NODES */

/*
 * Fallbacks for topology accessors that are not already defined: the
 * package and die ids evaluate to -1, the core id to 0, and the sibling,
 * core and die masks each to cpumask_of(cpu). The ((void)(cpu), x) form
 * still evaluates its argument.
 */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)        ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)                        ((void)(cpu), 0)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)                cpumask_of(cpu)
#endif

#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
/* Default SMT mask for @cpu: whatever topology_sibling_cpumask() yields. */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
        const struct cpumask *siblings = topology_sibling_cpumask(cpu);

        return siblings;
}
#endif

/* Return the CPU mask of the node that @cpu maps to. */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
        int nid = cpu_to_node(cpu);

        return cpumask_of_node(nid);
}


#endif /* _LINUX_TOPOLOGY_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Framework and drivers for configuring and reading different PHYs
 * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c
 *
 * Author: Andy Fleming
 *
 * Copyright (c) 2004 Freescale Semiconductor, Inc.
 */

#ifndef __PHY_H
#define __PHY_H

#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/ethtool.h>
#include <linux/linkmode.h>
#include <linux/netlink.h>
#include <linux/mdio.h>
#include <linux/mii.h>
#include <linux/mii_timestamper.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/mod_devicetable.h>
#include <linux/u64_stats_sync.h>
#include <linux/irqreturn.h>
#include <linux/iopoll.h>
#include <linux/refcount.h>

#include <linux/atomic.h>

/*
 * Feature sets built by OR-ing SUPPORTED_* constants: the default set
 * (autoneg, TP, MII) and the half+full duplex pairs for 10, 100 and
 * 1000baseT.
 */
#define PHY_DEFAULT_FEATURES        (SUPPORTED_Autoneg | \
                                 SUPPORTED_TP | \
                                 SUPPORTED_MII)

#define PHY_10BT_FEATURES        (SUPPORTED_10baseT_Half | \
                                 SUPPORTED_10baseT_Full)

#define PHY_100BT_FEATURES        (SUPPORTED_100baseT_Half | \
                                 SUPPORTED_100baseT_Full)

#define PHY_1000BT_FEATURES        (SUPPORTED_1000baseT_Half | \
                                 SUPPORTED_1000baseT_Full)

extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init;
extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init;

/*
 * Each macro yields the address of the matching phy_*_features link-mode
 * mask declared above, cast to unsigned long *.
 */
#define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features)
#define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features)
#define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features)
#define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features)
#define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features)
#define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features)
#define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features)
#define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features)

extern const int phy_basic_ports_array[3];
extern const int phy_fibre_port_array[1];
extern const int phy_all_ports_features_array[7];
extern const int phy_10_100_features_array[4];
extern const int phy_basic_t1_features_array[2];
extern const int phy_gbit_features_array[2];
extern const int phy_10gbit_features_array[1];

/*
 * Set phydev->irq to PHY_POLL if interrupts are not supported,
 * or not desired for this PHY.  Set to PHY_IGNORE_INTERRUPT if
 * the attached driver handles the interrupt
 */
#define PHY_POLL                -1
#define PHY_IGNORE_INTERRUPT        -2

/*
 * Single-bit flag values; each uses a distinct bit.
 * NOTE(review): the flags word(s) these are stored in are not visible in
 * this part of the header - confirm against the users before combining them.
 */
#define PHY_IS_INTERNAL                0x00000001
#define PHY_RST_AFTER_CLK_EN        0x00000002
#define PHY_POLL_CABLE_TEST        0x00000004
#define MDIO_DEVICE_IS_PHY        0x80000000

/**
 * enum phy_interface_t - Interface Mode definitions
 *
 * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch
 * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined
 * @PHY_INTERFACE_MODE_MII: Media-independent interface
 * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface
 * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface
 * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface
 * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface
 * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface
 * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface
 * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay
 * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay
 * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal TX delay
 * @PHY_INTERFACE_MODE_RTBI: Reduced TBI
 * @PHY_INTERFACE_MODE_SMII: Serial MII
 * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface
 * @PHY_INTERFACE_MODE_XLGMII: 40 gigabit media-independent interface
 * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax
 * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII
 * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII
 * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX
 * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX
 * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
 * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface
 * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR
 * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII
 * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN
 * @PHY_INTERFACE_MODE_MAX: Book keeping; not a real mode, must stay last
 *
 * Describes the interface between the MAC and PHY.
 */
typedef enum {
        PHY_INTERFACE_MODE_NA,
        PHY_INTERFACE_MODE_INTERNAL,
        PHY_INTERFACE_MODE_MII,
        PHY_INTERFACE_MODE_GMII,
        PHY_INTERFACE_MODE_SGMII,
        PHY_INTERFACE_MODE_TBI,
        PHY_INTERFACE_MODE_REVMII,
        PHY_INTERFACE_MODE_RMII,
        PHY_INTERFACE_MODE_RGMII,
        PHY_INTERFACE_MODE_RGMII_ID,
        PHY_INTERFACE_MODE_RGMII_RXID,
        PHY_INTERFACE_MODE_RGMII_TXID,
        PHY_INTERFACE_MODE_RTBI,
        PHY_INTERFACE_MODE_SMII,
        PHY_INTERFACE_MODE_XGMII,
        PHY_INTERFACE_MODE_XLGMII,
        PHY_INTERFACE_MODE_MOCA,
        PHY_INTERFACE_MODE_QSGMII,
        PHY_INTERFACE_MODE_TRGMII,
        PHY_INTERFACE_MODE_1000BASEX,
        PHY_INTERFACE_MODE_2500BASEX,
        PHY_INTERFACE_MODE_RXAUI,
        PHY_INTERFACE_MODE_XAUI,
        /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */
        PHY_INTERFACE_MODE_10GBASER,
        PHY_INTERFACE_MODE_USXGMII,
        /* 10GBASE-KR - with Clause 73 AN */
        PHY_INTERFACE_MODE_10GKR,
        PHY_INTERFACE_MODE_MAX,
} phy_interface_t;

/*
 * phy_supported_speeds - return all speeds currently supported by a PHY device
 *
 * NOTE(review): @speeds/@size look like an output array and its capacity,
 * with the return value being a count - confirm against the definition,
 * which is not part of this header.
 */
unsigned int phy_supported_speeds(struct phy_device *phy,
                                      unsigned int *speeds,
                                      unsigned int size);

/**
 * phy_modes - translate a phy_interface_t value into its 'phy-mode' string
 * @interface: enum phy_interface_t value
 *
 * Description: returns the device tree 'phy-mode' binding string that
 * corresponds to @interface, so that an Ethernet device driver can match
 * the PHY interface against the device tree.  PHY_INTERFACE_MODE_NA maps
 * to the empty string; any value without a case below (including
 * PHY_INTERFACE_MODE_MAX) yields "unknown".
 */
static inline const char *phy_modes(phy_interface_t interface)
{
        /* Result for every value that has no explicit case below */
        const char *mode_name = "unknown";

        switch (interface) {
        case PHY_INTERFACE_MODE_NA:
                mode_name = "";
                break;
        case PHY_INTERFACE_MODE_INTERNAL:
                mode_name = "internal";
                break;
        case PHY_INTERFACE_MODE_MII:
                mode_name = "mii";
                break;
        case PHY_INTERFACE_MODE_GMII:
                mode_name = "gmii";
                break;
        case PHY_INTERFACE_MODE_SGMII:
                mode_name = "sgmii";
                break;
        case PHY_INTERFACE_MODE_TBI:
                mode_name = "tbi";
                break;
        case PHY_INTERFACE_MODE_REVMII:
                mode_name = "rev-mii";
                break;
        case PHY_INTERFACE_MODE_RMII:
                mode_name = "rmii";
                break;
        case PHY_INTERFACE_MODE_RGMII:
                mode_name = "rgmii";
                break;
        case PHY_INTERFACE_MODE_RGMII_ID:
                mode_name = "rgmii-id";
                break;
        case PHY_INTERFACE_MODE_RGMII_RXID:
                mode_name = "rgmii-rxid";
                break;
        case PHY_INTERFACE_MODE_RGMII_TXID:
                mode_name = "rgmii-txid";
                break;
        case PHY_INTERFACE_MODE_RTBI:
                mode_name = "rtbi";
                break;
        case PHY_INTERFACE_MODE_SMII:
                mode_name = "smii";
                break;
        case PHY_INTERFACE_MODE_XGMII:
                mode_name = "xgmii";
                break;
        case PHY_INTERFACE_MODE_XLGMII:
                mode_name = "xlgmii";
                break;
        case PHY_INTERFACE_MODE_MOCA:
                mode_name = "moca";
                break;
        case PHY_INTERFACE_MODE_QSGMII:
                mode_name = "qsgmii";
                break;
        case PHY_INTERFACE_MODE_TRGMII:
                mode_name = "trgmii";
                break;
        case PHY_INTERFACE_MODE_1000BASEX:
                mode_name = "1000base-x";
                break;
        case PHY_INTERFACE_MODE_2500BASEX:
                mode_name = "2500base-x";
                break;
        case PHY_INTERFACE_MODE_RXAUI:
                mode_name = "rxaui";
                break;
        case PHY_INTERFACE_MODE_XAUI:
                mode_name = "xaui";
                break;
        case PHY_INTERFACE_MODE_10GBASER:
                mode_name = "10gbase-r";
                break;
        case PHY_INTERFACE_MODE_USXGMII:
                mode_name = "usxgmii";
                break;
        case PHY_INTERFACE_MODE_10GKR:
                mode_name = "10gbase-kr";
                break;
        default:
                break;
        }

        return mode_name;
}


/* NOTE(review): units of these two timeouts are not visible here - confirm */
#define PHY_INIT_TIMEOUT        100000
#define PHY_FORCE_TIMEOUT        10

/* Number of slots in the per-address arrays of struct mii_bus */
#define PHY_MAX_ADDR        32

/* Used when trying to connect to a specific phy (mii bus id:phy device id) */
#define PHY_ID_FMT "%s:%02x"

/* Size in bytes of the id[] buffer in struct mii_bus */
#define MII_BUS_ID_SIZE        61

/* Forward declarations: only pointers to these types are used below */
struct device;
struct phylink;
struct sfp_bus;
struct sfp_upstream_ops;
struct sk_buff;

/**
 * struct mdio_bus_stats - Statistics counters for MDIO busses
 * @transfers: Total number of transfers, i.e. @writes + @reads
 * @errors: Number of MDIO transfers that returned an error
 * @writes: Number of write transfers
 * @reads: Number of read transfers
 * @syncp: Synchronisation for incrementing statistics
 *
 * struct mii_bus keeps one instance of this structure per bus address
 * (an array of PHY_MAX_ADDR entries).
 */
struct mdio_bus_stats {
        u64_stats_t transfers;
        u64_stats_t errors;
        u64_stats_t writes;
        u64_stats_t reads;
        /* Must be last, add new statistics above */
        struct u64_stats_sync syncp;
};

/**
 * struct phy_package_shared - Shared information in PHY packages
 * @addr: Common PHY address used to combine PHYs in one package
 * @refcnt: Number of PHYs connected to this shared data
 * @flags: Initialization of PHY package
 * @priv_size: Size of the shared private data @priv
 * @priv: Driver private data shared across a PHY package
 *
 * Represents a shared structure between different phydev's in the same
 * package, for example a quad PHY. See phy_package_join() and
 * phy_package_leave().
 */
struct phy_package_shared {
        int addr;
        refcount_t refcnt;
        unsigned long flags;
        size_t priv_size;

        /*
         * Private data pointer.
         *
         * Note that this pointer is shared between different phydevs and
         * the user has to take care of appropriate locking. It is allocated
         * and freed automatically by phy_package_join() and
         * phy_package_leave().
         */
        void *priv;
};

/*
 * Used as bit number in atomic bitops.
 * NOTE(review): presumably bit positions within phy_package_shared.flags -
 * confirm at the users.
 */
#define PHY_SHARED_F_INIT_DONE  0
#define PHY_SHARED_F_PROBE_DONE 1

/**
 * struct mii_bus - Represents an MDIO bus
 *
 * @owner: Who owns this device
 * @name: User friendly name for this MDIO device, or driver name
 * @id: Unique identifier for this bus, typically from bus hierarchy
 * @priv: Driver private data
 *
 * The Bus class for PHYs.  Devices which provide access to
 * PHYs should register using this structure.
 *
 * All arrays sized PHY_MAX_ADDR below hold one entry per bus address.
 */
struct mii_bus {
        struct module *owner;
        const char *name;
        char id[MII_BUS_ID_SIZE];
        void *priv;
        /** @read: Perform a read transfer on the bus */
        int (*read)(struct mii_bus *bus, int addr, int regnum);
        /** @write: Perform a write transfer on the bus */
        int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val);
        /** @reset: Perform a reset of the bus */
        int (*reset)(struct mii_bus *bus);

        /** @stats: Statistic counters per device on the bus */
        struct mdio_bus_stats stats[PHY_MAX_ADDR];

        /**
         * @mdio_lock: A lock to ensure that only one thing can read/write
         * the MDIO bus at a time
         */
        struct mutex mdio_lock;

        /** @parent: Parent device of this bus */
        struct device *parent;
        /** @state: State of bus structure */
        enum {
                MDIOBUS_ALLOCATED = 1,
                MDIOBUS_REGISTERED,
                MDIOBUS_UNREGISTERED,
                MDIOBUS_RELEASED,
        } state;

        /** @dev: Kernel device representation */
        struct device dev;

        /** @mdio_map: list of all MDIO devices on bus */
        struct mdio_device *mdio_map[PHY_MAX_ADDR];

        /** @phy_mask: PHY addresses to be ignored when probing */
        u32 phy_mask;

        /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */
        u32 phy_ignore_ta_mask;

        /**
         * @irq: An array of interrupts, each PHY's interrupt at the index
         * matching its address
         */
        int irq[PHY_MAX_ADDR];

        /** @reset_delay_us: GPIO reset pulse width in microseconds */
        int reset_delay_us;
        /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */
        int reset_post_delay_us;
        /** @reset_gpiod: Reset GPIO descriptor pointer */
        struct gpio_desc *reset_gpiod;

        /** @probe_capabilities: bus capabilities, used for probing */
        enum {
                MDIOBUS_NO_CAP = 0,
                MDIOBUS_C22,
                MDIOBUS_C45,
                MDIOBUS_C22_C45,
        } probe_capabilities;

        /** @shared_lock: protect access to the shared element */
        struct mutex shared_lock;

        /** @shared: shared state across different PHYs */
        struct phy_package_shared *shared[PHY_MAX_ADDR];
};
/* Map a pointer to the embedded @dev member back to its struct mii_bus */
#define to_mii_bus(d) container_of(d, struct mii_bus, dev)

struct mii_bus *mdiobus_alloc_size(size_t size);

/**
 * mdiobus_alloc - Allocate an MDIO bus structure
 *
 * Shorthand for mdiobus_alloc_size() with a size argument of 0.
 *
 * The internal state of the MDIO bus will be set to MDIOBUS_ALLOCATED ready
 * for the driver to register the bus.
 *
 * Return: whatever mdiobus_alloc_size() returns.
 */
static inline struct mii_bus *mdiobus_alloc(void)
{
        const size_t size = 0;

        return mdiobus_alloc_size(size);
}

/*
 * Bus registration entry points.  Drivers normally use the
 * mdiobus_register()/devm_mdiobus_register() wrappers below, which pass
 * THIS_MODULE as @owner.
 */
int __mdiobus_register(struct mii_bus *bus, struct module *owner);
int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus,
                            struct module *owner);
#define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE)
#define devm_mdiobus_register(dev, bus) \
                __devm_mdiobus_register(dev, bus, THIS_MODULE)

void mdiobus_unregister(struct mii_bus *bus);
void mdiobus_free(struct mii_bus *bus);
/* Variant of mdiobus_alloc_size() that additionally takes a struct device */
struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv);
/*
 * devm_mdiobus_alloc - shorthand for devm_mdiobus_alloc_size() with a
 * private-data size of 0.  Returns that function's result unchanged.
 */
static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev)
{
        const int sizeof_priv = 0;

        return devm_mdiobus_alloc_size(dev, sizeof_priv);
}

/* Bus lookup by name and per-address scan; implemented outside this header */
struct mii_bus *mdio_find_bus(const char *mdio_name);
struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr);

/*
 * Boolean interrupt-enable values.
 * NOTE(review): presumably the values stored in phy_device.interrupts -
 * confirm at the users.
 */
#define PHY_INTERRUPT_DISABLED        false
#define PHY_INTERRUPT_ENABLED        true

/**
 * enum phy_state - PHY state machine states:
 *
 * @PHY_DOWN: PHY device and driver are not ready for anything.  probe
 * should be called if and only if the PHY is in this state,
 * given that the PHY device exists.
 * - PHY driver probe function will set the state to @PHY_READY
 *
 * @PHY_READY: PHY is ready to send and receive packets, but the
 * controller is not.  By default, PHYs which do not implement
 * probe will be set to this state by phy_probe().
 * - start will set the state to UP
 *
 * @PHY_UP: The PHY and attached device are ready to do work.
 * Interrupts should be started here.
 * - timer moves to @PHY_NOLINK or @PHY_RUNNING
 *
 * @PHY_NOLINK: PHY is up, but not currently plugged in.
 * - irq or timer will set @PHY_RUNNING if link comes back
 * - phy_stop moves to @PHY_HALTED
 *
 * @PHY_RUNNING: PHY is currently up, running, and possibly sending
 * and/or receiving packets
 * - irq or timer will set @PHY_NOLINK if link goes down
 * - phy_stop moves to @PHY_HALTED
 *
 * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending
 * is not expected to work, carrier will be indicated as down. PHY will be
 * polled once per second, or on interrupt, for its current state.
 * Once complete, move to UP to restart the PHY.
 * - phy_stop aborts the running test and moves to @PHY_HALTED
 *
 * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or
 * PHY is in an error state.
 * - phy_start moves to @PHY_UP
 */
enum phy_state {
        /*
         * Keep this order: phy_is_started() treats every state that
         * compares >= PHY_UP as "started".
         */
        PHY_DOWN = 0,
        PHY_READY,
        PHY_HALTED,
        PHY_UP,
        PHY_RUNNING,
        PHY_NOLINK,
        PHY_CABLETEST,
};

/* Number of entries in phy_c45_device_ids.device_ids[] */
#define MDIO_MMD_NUM 32

/**
 * struct phy_c45_device_ids - 802.3-c45 Device Identifiers
 * @devices_in_package: IEEE 802.3 devices in package register value.
 * @mmds_present: bit vector of MMDs present.
 * @device_ids: The device identifier for each present device.
 */
struct phy_c45_device_ids {
        u32 devices_in_package;
        u32 mmds_present;
        u32 device_ids[MDIO_MMD_NUM];
};

/* Forward declarations: only pointers to these types are used below */
struct macsec_context;
struct macsec_ops;

/**
 * struct phy_device - An instance of a PHY
 *
 * @mdio: Embedded MDIO device; its bus and address are used for register
 *        access
 * @drv: Pointer to the driver for this PHY instance
 * @phy_id: UID for this device found during discovery
 * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
 * @is_c45: Set to true if this PHY uses clause 45 addressing.
 * @is_internal: Set to true if this PHY is internal to a MAC.
 * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc.
 * @is_gigabit_capable: Set to true if PHY supports 1000Mbps
 * @has_fixups: Set to true if this PHY has fixups/quirks.
 * @suspended: Set to true if this PHY has been suspended successfully.
 * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus.
 * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal.
 * @loopback_enabled: Set true if this PHY has been loopbacked successfully.
 * @downshifted_rate: Set true if link speed has been downshifted.
 * @state: State of the PHY for management purposes
 * @dev_flags: Device-specific flags used by the PHY driver.
 * @irq: IRQ number of the PHY's interrupt (-1 if none)
 * @phylink: Pointer to phylink instance for this PHY
 * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached
 * @sfp_bus: SFP bus attached to this PHY's fiber port
 * @attached_dev: The attached enet driver's device instance ptr
 * @adjust_link: Callback for the enet controller to respond to changes in the
 *               link state.
 * @phy_link_change: Callback for phylink for notification of link change
 * @macsec_ops: MACsec offloading ops.
 *
 * @speed: Current link speed
 * @duplex: Current duplex
 * @port: Current port
 * @pause: Current pause
 * @asym_pause: Current asymmetric pause
 * @supported: Combined MAC/PHY supported linkmodes
 * @advertising: Currently advertised linkmodes
 * @adv_old: Saved advertised while power saving for WoL
 * @lp_advertising: Current link partner advertised linkmodes
 * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited
 * @autoneg: Flag autoneg being used
 * @link: Current link state
 * @autoneg_complete: Flag auto negotiation of the link has completed
 * @mdix: Current crossover
 * @mdix_ctrl: User setting of crossover
 * @interrupts: Flag interrupts have been enabled
 * @interface: enum phy_interface_t value
 * @skb: Netlink message for cable diagnostics
 * @nest: Netlink nest used for cable diagnostics
 * @ehdr: Netlink header for cable diagnostics
 * @phy_led_triggers: Array of LED triggers
 * @phy_num_led_triggers: Number of triggers in @phy_led_triggers
 * @led_link_trigger: LED trigger for link up/down
 * @last_triggered: last LED trigger for link speed
 * @master_slave_set: User requested master/slave configuration
 * @master_slave_get: Current master/slave advertisement
 * @master_slave_state: Current master/slave configuration
 * @mii_ts: Pointer to time stamper callbacks
 * @lock: Mutex serializing access to the PHY
 * @state_queue: Work queue for state machine
 * @shared: Pointer to private data shared by phys in one package
 * @priv: Pointer to driver private data
 *
 * interrupts currently only supports enabled or disabled,
 * but could be changed in the future to support enabling
 * and disabling specific interrupts
 *
 * Contains some infrastructure for polling and interrupt
 * handling, as well as handling shifts in PHY hardware state
 */
struct phy_device {
        struct mdio_device mdio;

        /* Information about the PHY type */
        /* And management functions */
        struct phy_driver *drv;

        u32 phy_id;

        struct phy_c45_device_ids c45_ids;
        unsigned is_c45:1;
        unsigned is_internal:1;
        unsigned is_pseudo_fixed_link:1;
        unsigned is_gigabit_capable:1;
        unsigned has_fixups:1;
        unsigned suspended:1;
        unsigned suspended_by_mdio_bus:1;
        unsigned sysfs_links:1;
        unsigned loopback_enabled:1;
        unsigned downshifted_rate:1;

        unsigned autoneg:1;
        /* The most recently read link state */
        unsigned link:1;
        unsigned autoneg_complete:1;

        /* Interrupts are enabled */
        unsigned interrupts:1;

        enum phy_state state;

        u32 dev_flags;

        phy_interface_t interface;

        /*
         * forced speed & duplex (no autoneg)
         * partner speed & duplex & pause (autoneg)
         */
        int speed;
        int duplex;
        int port;
        int pause;
        int asym_pause;
        u8 master_slave_get;
        u8 master_slave_set;
        u8 master_slave_state;

        /* Union of PHY and Attached devices' supported link modes */
        /* See ethtool.h for more info */
        __ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
        __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
        __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
        /* used with phy_speed_down */
        __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old);

        /* Energy efficient ethernet modes which should be prohibited */
        u32 eee_broken_modes;

#ifdef CONFIG_LED_TRIGGER_PHY
        struct phy_led_trigger *phy_led_triggers;
        unsigned int phy_num_led_triggers;
        struct phy_led_trigger *last_triggered;

        struct phy_led_trigger *led_link_trigger;
#endif

        /*
         * Interrupt number for this PHY
         * -1 means no interrupt
         */
        int irq;

        /* private data pointer */
        /* For use by PHYs to maintain extra state */
        void *priv;

        /* shared data pointer */
        /* For use by PHYs inside the same package that need a shared state. */
        struct phy_package_shared *shared;

        /* Reporting cable test results */
        struct sk_buff *skb;
        void *ehdr;
        struct nlattr *nest;

        /* Interrupt and Polling infrastructure */
        struct delayed_work state_queue;

        struct mutex lock;

        /* This may be modified under the rtnl lock */
        bool sfp_bus_attached;
        struct sfp_bus *sfp_bus;
        struct phylink *phylink;
        struct net_device *attached_dev;
        struct mii_timestamper *mii_ts;

        u8 mdix;
        u8 mdix_ctrl;

        void (*phy_link_change)(struct phy_device *phydev, bool up);
        void (*adjust_link)(struct net_device *dev);

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
};
/*
 * Map a pointer accepted by to_mdio_device() back to the struct phy_device
 * that embeds the resulting MDIO device as its @mdio member.
 */
#define to_phy_device(d) container_of(to_mdio_device(d), \
                                      struct phy_device, mdio)

/**
 * struct phy_tdr_config - Configuration of a TDR raw test
 *
 * @first: Distance for first data collection point
 * @last: Distance for last data collection point
 * @step: Step between data collection points
 * @pair: Bitmap of cable pairs to collect data for
 *
 * A structure containing possible configuration parameters
 * for a TDR cable test. The driver does not need to implement
 * all the parameters, but should report what is actually used.
 * All distances are in centimeters.
 */
struct phy_tdr_config {
        u32 first;
        u32 last;
        u32 step;
        s8 pair;
};
/*
 * NOTE(review): presumably the value of phy_tdr_config.pair that selects
 * every cable pair - confirm at the users.
 */
#define PHY_PAIR_ALL -1

/**
 * struct phy_driver - Driver structure for a particular PHY type
 *
 * @mdiodrv: Data common to all MDIO devices
 * @phy_id: The result of reading the UID registers of this PHY
 *   type, and ANDing them with the phy_id_mask.  This driver
 *   only works for PHYs with IDs which match this field
 * @name: The friendly name of this PHY type
 * @phy_id_mask: Defines the important bits of the phy_id
 * @features: A mandatory list of features (speed, duplex, etc)
 *   supported by this PHY
 * @flags: A bitfield defining certain other features this PHY
 *   supports (like interrupts)
 * @driver_data: Static driver data
 *
 * All functions are optional. If config_aneg or read_status
 * are not implemented, the phy core uses the genphy versions.
 * Note that none of these functions should be called from
 * interrupt time. The goal is for the bus read/write functions
 * to be able to block when the bus transaction is happening,
 * and be freed up by an interrupt (The MPC85xx has this ability,
 * though it is not currently supported in the driver).
 */
struct phy_driver {
        struct mdio_driver_common mdiodrv;
        u32 phy_id;
        char *name;
        u32 phy_id_mask;
        const unsigned long * const features;
        u32 flags;
        const void *driver_data;

        /**
         * @soft_reset: Called to issue a PHY software reset
         */
        int (*soft_reset)(struct phy_device *phydev);

        /**
         * @config_init: Called to initialize the PHY,
         * including after a reset
         */
        int (*config_init)(struct phy_device *phydev);

        /**
         * @probe: Called during discovery.  Used to set
         * up device-specific structures, if any
         */
        int (*probe)(struct phy_device *phydev);

        /**
         * @get_features: Probe the hardware to determine what
         * abilities it has.  Should only set phydev->supported.
         */
        int (*get_features)(struct phy_device *phydev);

        /* PHY Power Management */
        /** @suspend: Suspend the hardware, saving state if needed */
        int (*suspend)(struct phy_device *phydev);
        /** @resume: Resume the hardware, restoring state if needed */
        int (*resume)(struct phy_device *phydev);

        /**
         * @config_aneg: Configures the advertisement and resets
         * autonegotiation if phydev->autoneg is on,
         * forces the speed to the current settings in phydev
         * if phydev->autoneg is off
         */
        int (*config_aneg)(struct phy_device *phydev);

        /** @aneg_done: Determines the auto negotiation result */
        int (*aneg_done)(struct phy_device *phydev);

        /** @read_status: Determines the negotiated speed and duplex */
        int (*read_status)(struct phy_device *phydev);

        /** @ack_interrupt: Clears any pending interrupts */
        int (*ack_interrupt)(struct phy_device *phydev);

        /** @config_intr: Enables or disables interrupts */
        int (*config_intr)(struct phy_device *phydev);

        /**
         * @did_interrupt: Checks if the PHY generated an interrupt.
         * For multi-PHY devices with a shared PHY interrupt pin.
         * Set interrupt bits have to be cleared.
         */
        int (*did_interrupt)(struct phy_device *phydev);

        /** @handle_interrupt: Override default interrupt handling */
        irqreturn_t (*handle_interrupt)(struct phy_device *phydev);

        /** @remove: Clears up any memory if needed */
        void (*remove)(struct phy_device *phydev);

        /**
         * @match_phy_device: Returns true if this is a suitable
         * driver for the given phydev.  If NULL, matching is based on
         * phy_id and phy_id_mask.
         */
        int (*match_phy_device)(struct phy_device *phydev);

        /**
         * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY
         * register changes to enable Wake on LAN, so set_wol is
         * provided to be called in the ethernet driver's set_wol
         * function.
         */
        int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol);

        /**
         * @get_wol: See set_wol, but for checking whether Wake on LAN
         * is enabled.
         */
        void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol);

        /**
         * @link_change_notify: Called to inform a PHY device driver
         * when the core is about to change the link state. This
         * callback is supposed to be used as fixup hook for drivers
         * that need to take action when the link state
         * changes. Drivers are by no means allowed to mess with the
         * PHY device structure in their implementations.
         */
        void (*link_change_notify)(struct phy_device *dev);

        /**
         * @read_mmd: PHY specific driver override for reading a MMD
         * register.  This function is optional for PHY specific
         * drivers.  When not provided, the default MMD read function
         * will be used by phy_read_mmd(), which will use either a
         * direct read for Clause 45 PHYs or an indirect read for
         * Clause 22 PHYs.  devnum is the MMD device number within the
         * PHY device, regnum is the register within the selected MMD
         * device.
         */
        int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum);

        /**
         * @write_mmd: PHY specific driver override for writing a MMD
         * register.  This function is optional for PHY specific
         * drivers.  When not provided, the default MMD write function
         * will be used by phy_write_mmd(), which will use either a
         * direct write for Clause 45 PHYs, or an indirect write for
         * Clause 22 PHYs.  devnum is the MMD device number within the
         * PHY device, regnum is the register within the selected MMD
         * device.  val is the value to be written.
         */
        int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum,
                         u16 val);

        /** @read_page: Return the current PHY register page number */
        int (*read_page)(struct phy_device *dev);
        /** @write_page: Set the current PHY register page number */
        int (*write_page)(struct phy_device *dev, int page);

        /**
         * @module_info: Get the size and type of the eeprom contained
         * within a plug-in module
         */
        int (*module_info)(struct phy_device *dev,
                           struct ethtool_modinfo *modinfo);

        /**
         * @module_eeprom: Get the eeprom information from the plug-in
         * module
         */
        int (*module_eeprom)(struct phy_device *dev,
                             struct ethtool_eeprom *ee, u8 *data);

        /** @cable_test_start: Start a cable test */
        int (*cable_test_start)(struct phy_device *dev);

        /** @cable_test_tdr_start: Start a raw TDR cable test */
        int (*cable_test_tdr_start)(struct phy_device *dev,
                                    const struct phy_tdr_config *config);

        /**
         * @cable_test_get_status: Once per second, or on interrupt,
         * request the status of the test.
         */
        int (*cable_test_get_status)(struct phy_device *dev, bool *finished);

        /* Get statistics from the PHY using ethtool */
        /** @get_sset_count: Number of statistic counters */
        int (*get_sset_count)(struct phy_device *dev);
        /** @get_strings: Names of the statistic counters */
        void (*get_strings)(struct phy_device *dev, u8 *data);
        /** @get_stats: Return the statistic counter values */
        void (*get_stats)(struct phy_device *dev,
                          struct ethtool_stats *stats, u64 *data);

        /* Get and Set PHY tunables */
        /** @get_tunable: Return the value of a tunable */
        int (*get_tunable)(struct phy_device *dev,
                           struct ethtool_tunable *tuna, void *data);
        /** @set_tunable: Set the value of a tunable */
        int (*set_tunable)(struct phy_device *dev,
                            struct ethtool_tunable *tuna,
                            const void *data);
        /** @set_loopback: Set the loopback mode of the PHY */
        int (*set_loopback)(struct phy_device *dev, bool enable);
        /** @get_sqi: Get the signal quality indication */
        int (*get_sqi)(struct phy_device *dev);
        /** @get_sqi_max: Get the maximum signal quality indication */
        int (*get_sqi_max)(struct phy_device *dev);
};
/*
 * Map a pointer accepted by to_mdio_common_driver() back to the
 * struct phy_driver that embeds the result as its @mdiodrv member.
 */
#define to_phy_driver(d) container_of(to_mdio_common_driver(d),                \
                                      struct phy_driver, mdiodrv)

/*
 * Wildcards.
 * NOTE(review): presumably a bus id / UID that matches any PHY when
 * registering fixups - confirm at the users.
 */
#define PHY_ANY_ID "MATCH ANY PHY"
#define PHY_ANY_UID 0xffffffff

/*
 * Designated-initializer fragments that set both .phy_id and .phy_id_mask
 * of a struct phy_driver:
 *  - EXACT compares bits 31..0 of the ID,
 *  - MODEL compares bits 31..4 (the low 4 bits are ignored),
 *  - VENDOR compares bits 31..10 (the low 10 bits are ignored).
 */
#define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0)
#define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4)
#define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10)

/*
 * A structure for boards to register fixups with the PHY Lib.
 *
 * @list links the entry into a list; @bus_id is a string buffer three bytes
 * larger than MII_BUS_ID_SIZE; @run is the callback taking the phydev.
 * NOTE(review): @phy_uid/@phy_uid_mask presumably select which PHY IDs the
 * fixup applies to - confirm against the fixup matching code.
 */
struct phy_fixup {
        struct list_head list;
        char bus_id[MII_BUS_ID_SIZE + 3];
        u32 phy_uid;
        u32 phy_uid_mask;
        int (*run)(struct phy_device *phydev);
};

/* String helpers for speed and duplex values; implemented outside this header */
const char *phy_speed_to_str(int speed);
const char *phy_duplex_to_str(unsigned int duplex);

/*
 * A structure for mapping a particular speed and duplex
 * combination to a particular SUPPORTED and ADVERTISED value.
 *
 * NOTE(review): @bit is presumably the link-mode bit number for that
 * combination - confirm against the settings table.
 */
struct phy_setting {
        u32 speed;
        u8 duplex;
        u8 bit;
};

/*
 * Lookup and helper routines operating on link-mode masks and PHY devices;
 * implemented outside this header.
 */
const struct phy_setting *
phy_lookup_setting(int speed, int duplex, const unsigned long *mask,
                   bool exact);
size_t phy_speeds(unsigned int *speeds, size_t size,
                  unsigned long *mask);
void of_set_phy_supported(struct phy_device *phydev);
void of_set_phy_eee_broken(struct phy_device *phydev);
int phy_speed_down_core(struct phy_device *phydev);

/**
 * phy_is_started - Convenience function to check whether PHY is started
 * @phydev: The phy_device struct
 */
static inline bool phy_is_started(struct phy_device *phydev)
{
        return phydev->state >= PHY_UP;
}

/* Helpers operating on a phy_device; implemented outside this header */
void phy_resolve_aneg_pause(struct phy_device *phydev);
void phy_resolve_aneg_linkmode(struct phy_device *phydev);
void phy_check_downshift(struct phy_device *phydev);

/**
 * phy_read - Read a PHY register over the MDIO bus
 * @phydev: the phy_device struct
 * @regnum: register number to read
 *
 * Forwards to mdiobus_read() with the bus and address stored in
 * @phydev->mdio and returns its result unchanged.
 *
 * NOTE: MUST NOT be called from interrupt context,
 * because the bus read/write functions may wait for an interrupt
 * to conclude the operation.
 */
static inline int phy_read(struct phy_device *phydev, u32 regnum)
{
        struct mdio_device *mdiodev = &phydev->mdio;

        return mdiobus_read(mdiodev->bus, mdiodev->addr, regnum);
}

/*
 * phy_read_poll_timeout - poll a PHY register through read_poll_timeout()
 *
 * Repeatedly stores phy_read(phydev, regnum) in @val until @cond is true
 * or @val is negative (the read failed).  The statement expression
 * evaluates to @val when @val is negative, otherwise to the value returned
 * by read_poll_timeout(); any non-zero result is logged via phydev_err().
 */
#define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \
                                timeout_us, sleep_before_read) \
({ \
        int __ret; \
        \
        __ret = read_poll_timeout(phy_read, val, (cond) || (val) < 0, \
                                  sleep_us, timeout_us, sleep_before_read, \
                                  phydev, regnum); \
        if ((val) < 0) \
                __ret = (val); \
        if (__ret != 0) \
                phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \
        __ret; \
})


/**
 * __phy_read - Read a PHY register, MDIO bus lock already held
 * @phydev: the phy_device struct
 * @regnum: register number to read
 *
 * Forwards to __mdiobus_read() with the bus and address stored in
 * @phydev->mdio and returns its result unchanged.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_read(struct phy_device *phydev, u32 regnum)
{
        struct mdio_device *mdiodev = &phydev->mdio;

        return __mdiobus_read(mdiodev->bus, mdiodev->addr, regnum);
}

/**
 * phy_write - Write one register of a PHY
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: value to write to @regnum
 *
 * Forwards to mdiobus_write() using the bus and address stored in
 * @phydev->mdio and returns whatever that call returns.
 *
 * NOTE: MUST NOT be called from interrupt context,
 * because the bus read/write functions may wait for an interrupt
 * to conclude the operation.
 */
static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val)
{
        int ret = mdiobus_write(phydev->mdio.bus, phydev->mdio.addr,
                                regnum, val);

        return ret;
}

/**
 * __phy_write - Write one register of a PHY, bus lock already held
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: value to write to @regnum
 *
 * Forwards to __mdiobus_write() using the bus and address stored in
 * @phydev->mdio and returns whatever that call returns.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val)
{
        int ret = __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr,
                                  regnum, val);

        return ret;
}

/**
 * __phy_modify_changed() - Read-modify-write a PHY register, reporting change
 * @phydev: a pointer to a &struct phy_device
 * @regnum: register number
 * @mask: bit mask of bits to clear
 * @set: bit mask of bits to set
 *
 * Unlocked helper; the work is delegated to __mdiobus_modify_changed() on
 * the bus and address stored in @phydev->mdio.  The register is updated as
 * new register value = (old register value & ~mask) | set
 *
 * Returns negative errno, 0 if there was no change, and 1 in case of change
 */
static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum,
                                       u16 mask, u16 set)
{
        int ret;

        ret = __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr,
                                       regnum, mask, set);

        return ret;
}

/*
 * phy_read_mmd - Convenience function for reading a register
 * from an MMD on a given PHY.
 *
 * @devad selects the MMD and @regnum the register within it.  Declaration
 * only; the definition is outside this header.
 * NOTE(review): presumably takes the MDIO bus lock itself, unlike
 * __phy_read_mmd() -- confirm against the definition.
 */
int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum);

/**
 * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a
 *                             condition is met or a timeout occurs
 *
 * @phydev: The phy_device struct
 * @devaddr: The MMD to read from
 * @regnum: The register on the MMD to read
 * @val: Variable to read the register into
 * @cond: Break condition (usually involving @val)
 * @sleep_us: Maximum time to sleep between reads in us (0
 *            tight-loops).  Should be less than ~20ms since usleep_range
 *            is used (see Documentation/timers/timers-howto.rst).
 * @timeout_us: Timeout in us, 0 means never timeout
 * @sleep_before_read: if it is true, sleep @sleep_us before read.
 *
 * Returns 0 on success, -ETIMEDOUT upon a timeout, or the negative value
 * returned by phy_read_mmd() if the last read failed.  In every case the
 * last value read from @regnum is stored in @val.  Any non-zero result is
 * reported with phydev_err().  Must not be called from atomic context if
 * sleep_us or timeout_us are used.
 *
 * The raw return value of phy_read_mmd() is kept in a private int so that a
 * failed read is still detected when @val has an unsigned or narrower type;
 * @cond is not evaluated for a failed read.
 * NOTE(review): this relies on read_poll_timeout() expanding its first
 * argument as "(val) = op(args)" -- confirm against linux/iopoll.h.
 */
#define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \
                                  sleep_us, timeout_us, sleep_before_read) \
({ \
        int __ret, __val; \
        __ret = read_poll_timeout(__val = phy_read_mmd, val, \
                                  __val < 0 || (cond), \
                                  sleep_us, timeout_us, sleep_before_read, \
                                  phydev, devaddr, regnum); \
        if (__val < 0) \
                __ret = __val; \
        if (__ret) \
                phydev_err((phydev), "%s failed: %d\n", __func__, __ret); \
        __ret; \
})

/*
 * __phy_read_mmd - Convenience function for reading a register
 * from an MMD on a given PHY.
 *
 * NOTE(review): the double-underscore prefix suggests the caller must
 * already hold the MDIO bus lock, as for __phy_read() -- confirm.
 */
int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum);

/*
 * phy_write_mmd - Convenience function for writing a register
 * on an MMD on a given PHY.
 *
 * @devad selects the MMD, @regnum the register and @val the value written.
 */
int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val);

/*
 * __phy_write_mmd - Convenience function for writing a register
 * on an MMD on a given PHY.
 *
 * NOTE(review): the double-underscore prefix suggests the caller must
 * already hold the MDIO bus lock, as for __phy_write() -- confirm.
 */
int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val);

/*
 * Register read-modify-write helpers; declarations only, the definitions
 * are outside this header.  @mask holds the bits to clear and @set the bits
 * to set.
 *
 * __phy_modify_changed() is not re-declared here: it is defined as a static
 * inline earlier in this header, so a second prototype would be redundant
 * and would wrongly suggest an out-of-line definition.
 *
 * NOTE(review): by analogy with __phy_read()/phy_read(), the "__" variant
 * presumably expects the MDIO bus lock to be held -- confirm.
 */
int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask,
                       u16 set);
int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set);
int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set);

/*
 * MMD register read-modify-write helpers; declarations only, the
 * definitions are outside this header.  @devad selects the MMD, @mask holds
 * the bits to clear and @set the bits to set.
 * NOTE(review): the "__" variants presumably expect the MDIO bus lock to be
 * held and the "_changed" variants presumably report whether the register
 * value changed -- confirm against the definitions.
 */
int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum,
                             u16 mask, u16 set);
int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum,
                           u16 mask, u16 set);
int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum,
                     u16 mask, u16 set);
int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum,
                   u16 mask, u16 set);

/**
 * __phy_set_bits - Set bits in a PHY register, bus lock already held
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: bits to set
 *
 * Calls __phy_modify() with an empty clear mask and @val as the set mask,
 * and returns its result.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val)
{
        const u16 no_clear = 0;

        return __phy_modify(phydev, regnum, no_clear, val);
}

/**
 * __phy_clear_bits - Clear bits in a PHY register, bus lock already held
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: bits to clear
 *
 * Calls __phy_modify() with @val as the clear mask and an empty set mask,
 * and returns its result.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_clear_bits(struct phy_device *phydev, u32 regnum,
                                   u16 val)
{
        const u16 no_set = 0;

        return __phy_modify(phydev, regnum, val, no_set);
}

/**
 * phy_set_bits - Set bits in a PHY register
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: bits to set
 *
 * Calls phy_modify() with an empty clear mask and @val as the set mask,
 * and returns its result.
 */
static inline int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val)
{
        const u16 no_clear = 0;

        return phy_modify(phydev, regnum, no_clear, val);
}

/**
 * phy_clear_bits - Clear bits in a PHY register
 * @phydev: the phy_device struct
 * @regnum: register number to write
 * @val: bits to clear
 *
 * Calls phy_modify() with @val as the clear mask and an empty set mask,
 * and returns its result.
 */
static inline int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val)
{
        const u16 no_set = 0;

        return phy_modify(phydev, regnum, val, no_set);
}

/**
 * __phy_set_bits_mmd - Set bits in an MMD register, bus lock already held
 * @phydev: the phy_device struct
 * @devad: the MMD containing register to modify
 * @regnum: register number to modify
 * @val: bits to set
 *
 * Calls __phy_modify_mmd() with an empty clear mask and @val as the set
 * mask, and returns its result.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_set_bits_mmd(struct phy_device *phydev, int devad,
                u32 regnum, u16 val)
{
        const u16 no_clear = 0;

        return __phy_modify_mmd(phydev, devad, regnum, no_clear, val);
}

/**
 * __phy_clear_bits_mmd - Clear bits in an MMD register, bus lock already held
 * @phydev: the phy_device struct
 * @devad: the MMD containing register to modify
 * @regnum: register number to modify
 * @val: bits to clear
 *
 * Calls __phy_modify_mmd() with @val as the clear mask and an empty set
 * mask, and returns its result.
 *
 * The caller must have taken the MDIO bus lock.
 */
static inline int __phy_clear_bits_mmd(struct phy_device *phydev, int devad,
                u32 regnum, u16 val)
{
        const u16 no_set = 0;

        return __phy_modify_mmd(phydev, devad, regnum, val, no_set);
}

/**
 * phy_set_bits_mmd - Set bits in an MMD register
 * @phydev: the phy_device struct
 * @devad: the MMD containing register to modify
 * @regnum: register number to modify
 * @val: bits to set
 *
 * Calls phy_modify_mmd() with an empty clear mask and @val as the set mask,
 * and returns its result.
 */
static inline int phy_set_bits_mmd(struct phy_device *phydev, int devad,
                u32 regnum, u16 val)
{
        const u16 no_clear = 0;

        return phy_modify_mmd(phydev, devad, regnum, no_clear, val);
}

/**
 * phy_clear_bits_mmd - Clear bits in an MMD register
 * @phydev: the phy_device struct
 * @devad: the MMD containing register to modify
 * @regnum: register number to modify
 * @val: bits to clear
 *
 * Calls phy_modify_mmd() with @val as the clear mask and an empty set mask,
 * and returns its result.
 */
static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad,
                u32 regnum, u16 val)
{
        const u16 no_set = 0;

        return phy_modify_mmd(phydev, devad, regnum, val, no_set);
}

/**
 * phy_interrupt_is_valid - Convenience function for testing a given PHY irq
 * @phydev: the phy_device struct
 *
 * NOTE: must be kept in sync with addition/removal of PHY_POLL and
 * PHY_IGNORE_INTERRUPT
 */
static inline bool phy_interrupt_is_valid(struct phy_device *phydev)
{
        return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT;
}

/**
 * phy_polling_mode - Test whether polling is used to detect PHY status
 * changes
 * @phydev: the phy_device struct
 *
 * Return: true while the PHY is in state PHY_CABLETEST and its driver has
 * PHY_POLL_CABLE_TEST set in its flags; otherwise true exactly when
 * @phydev->irq equals PHY_POLL.
 *
 * @phydev->drv is dereferenced without a NULL check in the PHY_CABLETEST
 * state.
 */
static inline bool phy_polling_mode(struct phy_device *phydev)
{
        if (phydev->state == PHY_CABLETEST &&
            (phydev->drv->flags & PHY_POLL_CABLE_TEST))
                return true;

        return phydev->irq == PHY_POLL;
}

/**
 * phy_has_hwtstamp - Tests whether a PHY time stamp configuration.
 * @phydev: the phy_device struct
 */
static inline bool phy_has_hwtstamp(struct phy_device *phydev)
{
        return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp;
}

/**
 * phy_has_rxtstamp - Tests whether a PHY supports receive time stamping.
 * @phydev: the phy_device struct
 */
static inline bool phy_has_rxtstamp(struct phy_device *phydev)
{
        return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp;
}

/**
 * phy_has_tsinfo - Tests whether a PHY reports time stamping and/or
 * PTP hardware clock capabilities.
 * @phydev: the phy_device struct
 */
static inline bool phy_has_tsinfo(struct phy_device *phydev)
{
        return phydev && phydev->mii_ts && phydev->mii_ts->ts_info;
}

/**
 * phy_has_txtstamp - Tests whether a PHY supports transmit time stamping.
 * @phydev: the phy_device struct
 */
static inline bool phy_has_txtstamp(struct phy_device *phydev)
{
        return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp;
}

/*
 * Invoke the hwtstamp callback of the PHY's mii_ts with @ifr and return its
 * result.  Neither @phydev->mii_ts nor the callback is checked for NULL
 * here; phy_has_hwtstamp() performs that test.
 */
static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr)
{
        int ret = phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr);

        return ret;
}

/*
 * Invoke the rxtstamp callback of the PHY's mii_ts with @skb and @type and
 * return its result.  Neither @phydev->mii_ts nor the callback is checked
 * for NULL here; phy_has_rxtstamp() performs that test.
 */
static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb,
                                int type)
{
        bool ret = phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type);

        return ret;
}

/*
 * Invoke the ts_info callback of the PHY's mii_ts with @tsinfo and return
 * its result.  Neither @phydev->mii_ts nor the callback is checked for NULL
 * here; phy_has_tsinfo() performs that test.
 */
static inline int phy_ts_info(struct phy_device *phydev,
                              struct ethtool_ts_info *tsinfo)
{
        int ret = phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo);

        return ret;
}

/*
 * Invoke the txtstamp callback of the PHY's mii_ts with @skb and @type.
 * Neither @phydev->mii_ts nor the callback is checked for NULL here;
 * phy_has_txtstamp() performs that test.
 */
static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb,
                                int type)
{
        phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type);
}

/**
 * phy_is_internal - Test whether a PHY is internal
 * @phydev: the phy_device struct
 *
 * Return: the value of @phydev->is_internal as a bool.
 */
static inline bool phy_is_internal(struct phy_device *phydev)
{
        const bool internal = phydev->is_internal;

        return internal;
}

/**
 * phy_interface_mode_is_rgmii - Convenience function for testing if a
 * PHY interface mode is RGMII (all variants)
 * @mode: the &phy_interface_t enum
 *
 * Return: true when @mode lies in the inclusive range
 * PHY_INTERFACE_MODE_RGMII .. PHY_INTERFACE_MODE_RGMII_TXID, false
 * otherwise.  The range test is only correct while every RGMII variant is
 * numbered inside that range and nothing else is.
 */
static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode)
{
        return mode >= PHY_INTERFACE_MODE_RGMII &&
                mode <= PHY_INTERFACE_MODE_RGMII_TXID;
}

/**
 * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z
 *   negotiation
 * @mode: one of &enum phy_interface_t
 *
 * Returns true if the PHY interface mode uses the 16-bit negotiation
 * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding)
 * The modes accepted are PHY_INTERFACE_MODE_1000BASEX and
 * PHY_INTERFACE_MODE_2500BASEX.
 */
static inline bool phy_interface_mode_is_8023z(phy_interface_t mode)
{
        if (mode == PHY_INTERFACE_MODE_1000BASEX)
                return true;

        return mode == PHY_INTERFACE_MODE_2500BASEX;
}

/**
 * phy_interface_is_rgmii - Convenience function for testing if a PHY interface
 * is RGMII (all variants)
 * @phydev: the phy_device struct
 *
 * Return: the result of phy_interface_mode_is_rgmii() applied to
 * @phydev->interface.
 */
static inline bool phy_interface_is_rgmii(struct phy_device *phydev)
{
        return phy_interface_mode_is_rgmii(phydev->interface);
}

/**
 * phy_is_pseudo_fixed_link - Test whether this PHY is the CPU port facing
 * side of an Ethernet switch, or similar.
 * @phydev: the phy_device struct
 *
 * Return: the value of @phydev->is_pseudo_fixed_link as a bool.
 */
static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev)
{
        const bool pseudo_fixed = phydev->is_pseudo_fixed_link;

        return pseudo_fixed;
}

/*
 * Paged register access; declarations only, the definitions are outside
 * this header.
 * NOTE(review): phy_restore_page() takes the @oldpage and @ret values that
 * presumably come from phy_select_page()/phy_save_page() and the caller's
 * own operation -- confirm the exact contract against the definitions.
 */
int phy_save_page(struct phy_device *phydev);
int phy_select_page(struct phy_device *phydev, int page);
int phy_restore_page(struct phy_device *phydev, int oldpage, int ret);
int phy_read_paged(struct phy_device *phydev, int page, u32 regnum);
int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val);
int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum,
                             u16 mask, u16 set);
int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum,
                     u16 mask, u16 set);

/* Declared unconditionally, unlike the CONFIG_PHYLIB-dependent helpers that follow. */
struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
                                     bool is_c45,
                                     struct phy_c45_device_ids *c45_ids);
/*
 * With CONFIG_PHYLIB enabled these three are real functions defined outside
 * this header.  Without it they are replaced by the inline stubs below so
 * that callers still compile.
 */
#if IS_ENABLED(CONFIG_PHYLIB)
struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
int phy_device_register(struct phy_device *phy);
void phy_device_free(struct phy_device *phydev);
#else
/* Stub: never finds a device, always returns NULL. */
static inline
struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
{
        return NULL;
}

/* Stub: does nothing and reports success (0). */
static inline int phy_device_register(struct phy_device *phy)
{
        return 0;
}

/* Stub: does nothing. */
static inline void phy_device_free(struct phy_device *phydev) { }
#endif /* CONFIG_PHYLIB */
/*
 * PHY device life-cycle, attach/connect and autonegotiation entry points.
 * Declarations only; the definitions are outside this header.
 */
void phy_device_remove(struct phy_device *phydev);
int phy_init_hw(struct phy_device *phydev);
int phy_suspend(struct phy_device *phydev);
int phy_resume(struct phy_device *phydev);
int __phy_resume(struct phy_device *phydev);
int phy_loopback(struct phy_device *phydev, bool enable);
/* SFP upstream hooks; @upstream is an opaque pointer (void *). */
void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
int phy_sfp_probe(struct phy_device *phydev,
                  const struct sfp_upstream_ops *ops);
/* The bus_id variants return a phy_device pointer; the _direct variants take one and return int. */
struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
                              phy_interface_t interface);
struct phy_device *phy_find_first(struct mii_bus *bus);
int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
                      u32 flags, phy_interface_t interface);
/* @handler is a callback taking the net_device. */
int phy_connect_direct(struct net_device *dev, struct phy_device *phydev,
                       void (*handler)(struct net_device *),
                       phy_interface_t interface);
struct phy_device *phy_connect(struct net_device *dev, const char *bus_id,
                               void (*handler)(struct net_device *),
                               phy_interface_t interface);
void phy_disconnect(struct phy_device *phydev);
void phy_detach(struct phy_device *phydev);
void phy_start(struct phy_device *phydev);
void phy_stop(struct phy_device *phydev);
int phy_start_aneg(struct phy_device *phydev);
int phy_aneg_done(struct phy_device *phydev);
int phy_speed_down(struct phy_device *phydev, bool sync);
int phy_speed_up(struct phy_device *phydev);

int phy_restart_aneg(struct phy_device *phydev);
int phy_reset_after_clk_enable(struct phy_device *phydev);

/*
 * Cable test entry points.  With CONFIG_PHYLIB enabled they are real
 * functions defined outside this header; otherwise the inline stubs below
 * record an error message in @extack and fail with -EOPNOTSUPP.
 */
#if IS_ENABLED(CONFIG_PHYLIB)
int phy_start_cable_test(struct phy_device *phydev,
                         struct netlink_ext_ack *extack);
int phy_start_cable_test_tdr(struct phy_device *phydev,
                             struct netlink_ext_ack *extack,
                             const struct phy_tdr_config *config);
#else
/* Stub: reports the missing PHYLIB support through @extack, returns -EOPNOTSUPP. */
static inline
int phy_start_cable_test(struct phy_device *phydev,
                         struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support");
        return -EOPNOTSUPP;
}
/* Stub: reports the missing PHYLIB support through @extack, returns -EOPNOTSUPP. */
static inline
int phy_start_cable_test_tdr(struct phy_device *phydev,
                             struct netlink_ext_ack *extack,
                             const struct phy_tdr_config *config)
{
        NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support");
        return -EOPNOTSUPP;
}
#endif

/*
 * Cable test reporting helpers; declarations only.
 * NOTE(review): @pair presumably identifies the wire pair and @cm a length
 * in centimetres -- confirm against the definitions.
 */
int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result);
int phy_cable_test_fault_length(struct phy_device *phydev, u8 pair,
                                u16 cm);

/*
 * Forward a reset request for the PHY to mdio_device_reset(), passing the
 * PHY's embedded mdio device and @value unchanged.
 * NOTE(review): @value presumably selects assert (non-zero) vs. deassert
 * (zero) -- confirm against mdio_device_reset().
 */
static inline void phy_device_reset(struct phy_device *phydev, int value)
{
        mdio_device_reset(&phydev->mdio, value);
}

/*
 * Logging helpers that route a message through the struct device embedded
 * in the PHY's mdio device (dev_err(), dev_info(), dev_warn(), dev_dbg()).
 * @_phydev is parenthesized so that any pointer-valued expression, not just
 * a plain identifier, can be passed.
 */
#define phydev_err(_phydev, format, args...)        \
        dev_err(&(_phydev)->mdio.dev, format, ##args)

#define phydev_info(_phydev, format, args...)        \
        dev_info(&(_phydev)->mdio.dev, format, ##args)

#define phydev_warn(_phydev, format, args...)        \
        dev_warn(&(_phydev)->mdio.dev, format, ##args)

#define phydev_dbg(_phydev, format, args...)        \
        dev_dbg(&(_phydev)->mdio.dev, format, ##args)

/*
 * Return the name of the PHY, i.e. whatever dev_name() reports for the
 * struct device embedded in the PHY's mdio device.
 */
static inline const char *phydev_name(const struct phy_device *phydev)
{
        const char *name = dev_name(&phydev->mdio.dev);

        return name;
}

/*
 * Take the mdio_lock mutex of the MDIO bus this PHY sits on.  Pair with
 * phy_unlock_mdio_bus().
 */
static inline void phy_lock_mdio_bus(struct phy_device *phydev)
{
        mutex_lock(&phydev->mdio.bus->mdio_lock);
}

/*
 * Release the mdio_lock mutex of the MDIO bus this PHY sits on; the
 * counterpart of phy_lock_mdio_bus().
 */
static inline void phy_unlock_mdio_bus(struct phy_device *phydev)
{
        mutex_unlock(&phydev->mdio.bus->mdio_lock);
}

/*
 * "PHY attached" reporting helpers; declarations only.
 *
 * phy_attached_print() is printf-like: @fmt is argument 2 and the variadic
 * arguments start at argument 3, as declared by __printf(2, 3).
 * phy_attached_info_irq() is marked __malloc.
 * NOTE(review): that marking suggests the returned string is freshly
 * allocated and owned by the caller -- confirm who frees it.
 */
void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
        __printf(2, 3);
char *phy_attached_info_irq(struct phy_device *phydev)
        __malloc;
void phy_attached_info(struct phy_device *phydev);

/*
 * Clause 22 PHY
 *
 * Generic ("genphy") helpers; declarations only, the definitions are
 * outside this header.  __genphy_config_aneg() is wrapped by the inline
 * genphy_config_aneg() in this header, which passes changed == false.
 */
int genphy_read_abilities(struct phy_device *phydev);
int genphy_setup_forced(struct phy_device *phydev);
int genphy_restart_aneg(struct phy_device *phydev);
int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart);
int genphy_config_eee_advert(struct phy_device *phydev);
int __genphy_config_aneg(struct phy_device *phydev, bool changed);
int genphy_aneg_done(struct phy_device *phydev);
int genphy_update_link(struct phy_device *phydev);
int genphy_read_lpa(struct phy_device *phydev);
int genphy_read_status_fixed(struct phy_device *phydev);
int genphy_read_status(struct phy_device *phydev);
int genphy_suspend(struct phy_device *phydev);
int genphy_resume(struct phy_device *phydev);
int genphy_loopback(struct phy_device *phydev, bool enable);
int genphy_soft_reset(struct phy_device *phydev);

static inline int genphy_config_aneg(struct phy_device *phydev)
{
        return __genphy_config_aneg(phydev, false);
}

/*
 * No-op interrupt acknowledge: ignores @phydev and always returns 0.
 * NOTE(review): presumably meant as a driver callback for PHYs that need no
 * acknowledge -- confirm against the users.
 */
static inline int genphy_no_ack_interrupt(struct phy_device *phydev)
{
        return 0;
}
/*
 * No-op interrupt configuration: ignores @phydev and always returns 0.
 * NOTE(review): presumably meant as a driver callback for PHYs that need no
 * interrupt configuration -- confirm against the users.
 */
static inline int genphy_no_config_intr(struct phy_device *phydev)
{
        return 0;
}
/*
 * MMD accessors for PHYs without MMD support; declarations only.
 * NOTE(review): going by the names these presumably fail unconditionally --
 * confirm the returned error code against the definitions.
 */
int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad,
                                u16 regnum);
int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
                                 u16 regnum, u16 val);

/* Clause 37 */
int genphy_c37_config_aneg(struct phy_device *phydev);
int genphy_c37_read_status(struct phy_device *phydev);

/* Clause 45 PHY */
int genphy_c45_restart_aneg(struct phy_device *phydev);
int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart);
int genphy_c45_aneg_done(struct phy_device *phydev);
int genphy_c45_read_link(struct phy_device *phydev);
int genphy_c45_read_lpa(struct phy_device *phydev);
int genphy_c45_read_pma(struct phy_device *phydev);
int genphy_c45_pma_setup_forced(struct phy_device *phydev);
int genphy_c45_an_config_aneg(struct phy_device *phydev);
int genphy_c45_an_disable_aneg(struct phy_device *phydev);
int genphy_c45_read_mdix(struct phy_device *phydev);
int genphy_c45_pma_read_abilities(struct phy_device *phydev);
int genphy_c45_read_status(struct phy_device *phydev);
int genphy_c45_config_aneg(struct phy_device *phydev);

/* Generic C45 PHY driver (object defined outside this header) */
extern struct phy_driver genphy_c45_driver;

/* The gen10g_* functions are the old Clause 45 stub */
int gen10g_config_aneg(struct phy_device *phydev);

/*
 * Read the PHY status through its driver.
 *
 * Returns -EIO when no driver is bound to @phydev.  Otherwise the driver's
 * read_status callback is used when it provides one, and
 * genphy_read_status() when it does not; that call's result is returned.
 */
static inline int phy_read_status(struct phy_device *phydev)
{
        if (!phydev->drv)
                return -EIO;

        if (!phydev->drv->read_status)
                return genphy_read_status(phydev);

        return phydev->drv->read_status(phydev);
}

/*
 * Driver registration, state machine, ethtool/ioctl, interrupt and pause
 * entry points.  Declarations only; the definitions are outside this
 * header.
 */
void phy_driver_unregister(struct phy_driver *drv);
/* The plural variants take an array of @n drivers. */
void phy_drivers_unregister(struct phy_driver *drv, int n);
int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
int phy_drivers_register(struct phy_driver *new_driver, int n,
                         struct module *owner);
void phy_error(struct phy_device *phydev);
void phy_state_machine(struct work_struct *work);
void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies);
void phy_trigger_machine(struct phy_device *phydev);
void phy_mac_interrupt(struct phy_device *phydev);
void phy_start_machine(struct phy_device *phydev);
void phy_stop_machine(struct phy_device *phydev);
void phy_ethtool_ksettings_get(struct phy_device *phydev,
                               struct ethtool_link_ksettings *cmd);
int phy_ethtool_ksettings_set(struct phy_device *phydev,
                              const struct ethtool_link_ksettings *cmd);
int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd);
int phy_disable_interrupts(struct phy_device *phydev);
void phy_request_interrupt(struct phy_device *phydev);
void phy_free_interrupt(struct phy_device *phydev);
void phy_print_status(struct phy_device *phydev);
int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
void phy_advertise_supported(struct phy_device *phydev);
void phy_support_sym_pause(struct phy_device *phydev);
void phy_support_asym_pause(struct phy_device *phydev);
void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
                       bool autoneg);
void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
bool phy_validate_pause(struct phy_device *phydev,
                        struct ethtool_pauseparam *pp);
/* Results are written through @tx_pause and @rx_pause. */
void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause);

/*
 * NOTE(review): @delay_values/@size presumably describe a table of
 * supported delays and @is_rx selects the receive vs. transmit delay --
 * confirm against the definition.
 */
s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev,
                           const int *delay_values, int size, bool is_rx);

/* Results are written through @tx_pause and @rx_pause. */
void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv,
                       bool *tx_pause, bool *rx_pause);

/*
 * Fixup registration: @run is a callback invoked with the phy_device.  A
 * fixup is keyed by @bus_id and/or by @phy_uid together with
 * @phy_uid_mask; the _for_id and _for_uid variants take only one of the two
 * keys.
 */
int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
                       int (*run)(struct phy_device *));
int phy_register_fixup_for_id(const char *bus_id,
                              int (*run)(struct phy_device *));
int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask,
                               int (*run)(struct phy_device *));

/* Counterparts of the registration functions above, taking the same keys. */
int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask);
int phy_unregister_fixup_for_id(const char *bus_id);
int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);

/*
 * EEE, wake-on-LAN and ethtool link-settings entry points, followed by the
 * PHY package join/leave API.  Declarations only; the definitions are
 * outside this header.
 */
int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
int phy_get_eee_err(struct phy_device *phydev);
int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data);
int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data);
int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol);
void phy_ethtool_get_wol(struct phy_device *phydev,
                         struct ethtool_wolinfo *wol);
/* These three take the net_device rather than the phy_device. */
int phy_ethtool_get_link_ksettings(struct net_device *ndev,
                                   struct ethtool_link_ksettings *cmd);
int phy_ethtool_set_link_ksettings(struct net_device *ndev,
                                   const struct ethtool_link_ksettings *cmd);
int phy_ethtool_nway_reset(struct net_device *ndev);
/*
 * NOTE(review): @addr presumably becomes the shared address used by the
 * phy_package_read()/phy_package_write() helpers and @priv_size the size of
 * a private area -- confirm against the definitions.
 */
int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size);
void phy_package_leave(struct phy_device *phydev);
int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
                          int addr, size_t priv_size);

/* MDIO bus init/exit hooks, declared only when CONFIG_PHYLIB is enabled. */
#if IS_ENABLED(CONFIG_PHYLIB)
int __init mdio_bus_init(void);
void mdio_bus_exit(void);
#endif

/*
 * ethtool statistics helpers; declarations only.
 * NOTE(review): @data is presumably a caller-provided output buffer --
 * confirm the required size against the definitions.
 */
int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
int phy_ethtool_get_sset_count(struct phy_device *phydev);
int phy_ethtool_get_stats(struct phy_device *phydev,
                          struct ethtool_stats *stats, u64 *data);

/*
 * Read register @regnum at the shared package address of @phydev.
 *
 * Returns -EIO when @phydev has no shared package data; otherwise the
 * result of mdiobus_read() on the PHY's bus at the shared address.
 */
static inline int phy_package_read(struct phy_device *phydev, u32 regnum)
{
        struct phy_package_shared *pkg = phydev->shared;
        int ret = -EIO;

        if (pkg)
                ret = mdiobus_read(phydev->mdio.bus, pkg->addr, regnum);

        return ret;
}

/*
 * Read register @regnum at the shared package address of @phydev using the
 * unlocked bus accessor.
 *
 * Returns -EIO when @phydev has no shared package data; otherwise the
 * result of __mdiobus_read() on the PHY's bus at the shared address.
 */
static inline int __phy_package_read(struct phy_device *phydev, u32 regnum)
{
        struct phy_package_shared *pkg = phydev->shared;
        int ret = -EIO;

        if (pkg)
                ret = __mdiobus_read(phydev->mdio.bus, pkg->addr, regnum);

        return ret;
}

/*
 * Write @val to register @regnum at the shared package address of @phydev.
 *
 * Returns -EIO when @phydev has no shared package data; otherwise the
 * result of mdiobus_write() on the PHY's bus at the shared address.
 */
static inline int phy_package_write(struct phy_device *phydev,
                                    u32 regnum, u16 val)
{
        struct phy_package_shared *pkg = phydev->shared;
        int ret = -EIO;

        if (pkg)
                ret = mdiobus_write(phydev->mdio.bus, pkg->addr, regnum, val);

        return ret;
}

/*
 * Write @val to register @regnum at the shared package address of @phydev
 * using the unlocked bus accessor.
 *
 * Returns -EIO when @phydev has no shared package data; otherwise the
 * result of __mdiobus_write() on the PHY's bus at the shared address.
 */
static inline int __phy_package_write(struct phy_device *phydev,
                                      u32 regnum, u16 val)
{
        struct phy_package_shared *pkg = phydev->shared;
        int ret = -EIO;

        if (pkg)
                ret = __mdiobus_write(phydev->mdio.bus, pkg->addr, regnum, val);

        return ret;
}

/*
 * Set bit @b in the flags word of the shared package data and report
 * whether this call was the one that set it.
 *
 * Returns false when @phydev has no shared package data; otherwise the
 * negation of test_and_set_bit(), i.e. true only if the bit was previously
 * clear.
 */
static inline bool __phy_package_set_once(struct phy_device *phydev,
                                          unsigned int b)
{
        struct phy_package_shared *pkg = phydev->shared;

        return pkg && !test_and_set_bit(b, &pkg->flags);
}

/*
 * Claim the PHY_SHARED_F_INIT_DONE flag of the package @phydev belongs to.
 * Returns the result of __phy_package_set_once() for that flag.
 */
static inline bool phy_package_init_once(struct phy_device *phydev)
{
        const bool first = __phy_package_set_once(phydev,
                                                  PHY_SHARED_F_INIT_DONE);

        return first;
}

/*
 * Claim the PHY_SHARED_F_PROBE_DONE flag of the package @phydev belongs to.
 * Returns the result of __phy_package_set_once() for that flag.
 */
static inline bool phy_package_probe_once(struct phy_device *phydev)
{
        const bool first = __phy_package_set_once(phydev,
                                                  PHY_SHARED_F_PROBE_DONE);

        return first;
}

/* The MDIO bus type object; defined outside this header. */
extern struct bus_type mdio_bus_type;

/*
 * Board-level description of one MDIO device, as passed to
 * mdiobus_register_board_info().
 */
struct mdio_board_info {
        /* NOTE(review): presumably the id of the bus the device sits on -- confirm. */
        const char        *bus_id;
        /* Fixed-size name buffer of MDIO_NAME_SIZE bytes. */
        char                modalias[MDIO_NAME_SIZE];
        /* NOTE(review): presumably the device's address on that bus -- confirm. */
        int                mdio_addr;
        /* Opaque pointer; its meaning is not visible in this header. */
        const void        *platform_data;
};

/*
 * Register @n entries of board information.  With CONFIG_MDIO_DEVICE
 * enabled this is a real function defined outside this header; otherwise
 * the inline stub below ignores its arguments and returns 0.
 */
#if IS_ENABLED(CONFIG_MDIO_DEVICE)
int mdiobus_register_board_info(const struct mdio_board_info *info,
                                unsigned int n);
#else
static inline int mdiobus_register_board_info(const struct mdio_board_info *i,
                                              unsigned int n)
{
        return 0;
}
#endif


/**
 * phy_module_driver() - Helper macro for registering PHY drivers
 * @__phy_drivers: array of PHY drivers to register
 * @__count: Numbers of members in array
 *
 * Helper macro for PHY drivers which do not do anything special in module
 * init/exit. Each module may only use this macro once, and calling it
 * replaces module_init() and module_exit().
 *
 * The expansion defines two static functions, phy_module_init() and
 * phy_module_exit().  The first calls phy_drivers_register() with
 * THIS_MODULE as owner, the second calls phy_drivers_unregister(); they are
 * hooked up with module_init() and module_exit() respectively.  The
 * expansion ends without a semicolon, so the user supplies the final one.
 */
#define phy_module_driver(__phy_drivers, __count)                        \
static int __init phy_module_init(void)                                        \
{                                                                        \
        return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \
}                                                                        \
module_init(phy_module_init);                                                \
static void __exit phy_module_exit(void)                                \
{                                                                        \
        phy_drivers_unregister(__phy_drivers, __count);                        \
}                                                                        \
module_exit(phy_module_exit)

/*
 * module_phy_driver() - phy_module_driver() with the element count taken
 * from ARRAY_SIZE(@__phy_drivers).  @__phy_drivers therefore has to be an
 * actual array, not a pointer.
 */
#define module_phy_driver(__phy_drivers)                                \
        phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers))

/*
 * Declarations only; the definitions are outside this header.
 * NOTE(review): going by the names these report whether @phydev is bound to
 * one of the generic PHY drivers -- confirm against the definitions.
 */
bool phy_driver_is_genphy(struct phy_device *phydev);
bool phy_driver_is_genphy_10g(struct phy_device *phydev);

#endif /* __PHY_H */










































































































































































































































































































































































































































    1 































    1 




















    1 






























    1 












    1 


    1 































































    1 














































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
// SPDX-License-Identifier: GPL-2.0-or-later
/* Manage a process's keyrings
 *
 * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
#include <linux/keyctl.h>
#include <linux/fs.h>
#include <linux/err.h>
#include <linux/mutex.h>
#include <linux/security.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
#include <linux/init_task.h>
#include <keys/request_key_auth-type.h>
#include "internal.h"

/* Mutex serialising the lookup-or-create of named session keyrings in
 * join_session_keyring().
 */
static DEFINE_MUTEX(key_session_mutex);

/* The root user's tracking struct.  It is defined statically here for
 * GLOBAL_ROOT_UID, with its locks initialised in place.
 *
 * NOTE(review): the initial usage count of 3 and the key counts of 2
 * presumably account for the keyrings set up for root at boot - confirm
 * against the code that consumes them.
 */
struct key_user root_key_user = {
        .usage                = REFCOUNT_INIT(3),
        .cons_lock        = __MUTEX_INITIALIZER(root_key_user.cons_lock),
        .lock                = __SPIN_LOCK_UNLOCKED(root_key_user.lock),
        .nkeys                = ATOMIC_INIT(2),
        .nikeys                = ATOMIC_INIT(2),
        .uid                = GLOBAL_ROOT_UID,
};

/*
 * Return the user register keyring of @user_ns, allocating and publishing it
 * first if the namespace doesn't have one yet.
 *
 * The common case is a lockless read of the already-published pointer.  Only
 * if that comes back empty is user_ns->keyring_sem taken and the pointer
 * checked again before a keyring is allocated.
 *
 * Returns the keyring without taking an extra reference on it, or the
 * ERR_PTR() produced by keyring_alloc() if the allocation fails (in which
 * case nothing is published).
 */
static struct key *get_user_register(struct user_namespace *user_ns)
{
        struct key *reg = READ_ONCE(user_ns->user_keyring_register);

        if (!reg) {
                down_write(&user_ns->keyring_sem);

                /* Recheck now that we hold the semaphore. */
                reg = user_ns->user_keyring_register;
                if (!reg) {
                        /* The register gets owned by the user_namespace's
                         * owner.
                         */
                        reg = keyring_alloc(".user_reg",
                                            user_ns->owner, INVALID_GID,
                                            &init_cred,
                                            KEY_POS_WRITE | KEY_POS_SEARCH |
                                            KEY_USR_VIEW | KEY_USR_READ,
                                            0,
                                            NULL, NULL);
                        if (!IS_ERR(reg))
                                smp_store_release(&user_ns->user_keyring_register,
                                                  reg);
                }

                up_write(&user_ns->keyring_sem);
        }

        /* No ref is handed out: the keyring is pinned by the user_ns. */
        return reg;
}

/*
 * Look up the user and user session keyrings for the current process's UID,
 * creating them if they don't exist.
 *
 * @_user_keyring: if non-NULL, receives the "_uid.<uid>" keyring; if NULL, the
 *        reference obtained on that keyring is dropped before returning.
 * @_user_session_keyring: likewise for the "_uid_ses.<uid>" keyring.
 *
 * Both keyrings are looked for in the user register keyring of the current
 * user namespace, and user_ns->keyring_sem is held for writing across the
 * whole search-or-create sequence.
 *
 * Return: 0 on success or a negative errno.  The output pointers are only
 * written on success.
 */
int look_up_user_keyrings(struct key **_user_keyring,
                          struct key **_user_session_keyring)
{
        const struct cred *cred = current_cred();
        struct user_namespace *user_ns = current_user_ns();
        struct key *reg_keyring, *uid_keyring, *session_keyring;
        key_perm_t user_keyring_perm;
        key_ref_t uid_keyring_r, session_keyring_r;
        uid_t uid = from_kuid(user_ns, cred->user->uid);
        /* Holds "_uid.<uid>" and then "_uid_ses.<uid>" in turn. */
        char buf[20];
        int ret;

        /* Possessor gets everything except SETATTR; the owning user gets all */
        user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL;

        kenter("%u", uid);

        reg_keyring = get_user_register(user_ns);
        if (IS_ERR(reg_keyring))
                return PTR_ERR(reg_keyring);

        down_write(&user_ns->keyring_sem);
        ret = 0;

        /* Get the user keyring.  Note that there may be one in existence
         * already as it may have been pinned by a session, but the user_struct
         * pointing to it may have been destroyed by setuid.
         */
        snprintf(buf, sizeof(buf), "_uid.%u", uid);
        uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true),
                                       &key_type_keyring, buf, false);
        kdebug("_uid %p", uid_keyring_r);
        if (uid_keyring_r == ERR_PTR(-EAGAIN)) {
                /* Not found: allocate it, passing the register keyring as the
                 * final argument to keyring_alloc().
                 */
                uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
                                            cred, user_keyring_perm,
                                            KEY_ALLOC_UID_KEYRING |
                                            KEY_ALLOC_IN_QUOTA,
                                            NULL, reg_keyring);
                if (IS_ERR(uid_keyring)) {
                        ret = PTR_ERR(uid_keyring);
                        goto error;
                }
        } else if (IS_ERR(uid_keyring_r)) {
                /* Any other search failure is passed straight back. */
                ret = PTR_ERR(uid_keyring_r);
                goto error;
        } else {
                uid_keyring = key_ref_to_ptr(uid_keyring_r);
        }

        /* Get a default session keyring (which might also exist already) */
        snprintf(buf, sizeof(buf), "_uid_ses.%u", uid);
        session_keyring_r = keyring_search(make_key_ref(reg_keyring, true),
                                           &key_type_keyring, buf, false);
        kdebug("_uid_ses %p", session_keyring_r);
        if (session_keyring_r == ERR_PTR(-EAGAIN)) {
                /* Not found: allocate it unlinked (NULL final argument) so
                 * that it can be populated before being linked into the
                 * register below.
                 */
                session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID,
                                                cred, user_keyring_perm,
                                                KEY_ALLOC_UID_KEYRING |
                                                KEY_ALLOC_IN_QUOTA,
                                                NULL, NULL);
                if (IS_ERR(session_keyring)) {
                        ret = PTR_ERR(session_keyring);
                        goto error_release;
                }

                /* We install a link from the user session keyring to
                 * the user keyring.
                 */
                ret = key_link(session_keyring, uid_keyring);
                if (ret < 0)
                        goto error_release_session;

                /* And only then link the user-session keyring to the
                 * register.
                 */
                ret = key_link(reg_keyring, session_keyring);
                if (ret < 0)
                        goto error_release_session;
        } else if (IS_ERR(session_keyring_r)) {
                ret = PTR_ERR(session_keyring_r);
                goto error_release;
        } else {
                session_keyring = key_ref_to_ptr(session_keyring_r);
        }

        up_write(&user_ns->keyring_sem);

        /* Hand each keyring to the caller if it was asked for, otherwise drop
         * the reference we hold on it.
         */
        if (_user_session_keyring)
                *_user_session_keyring = session_keyring;
        else
                key_put(session_keyring);
        if (_user_keyring)
                *_user_keyring = uid_keyring;
        else
                key_put(uid_keyring);
        kleave(" = 0");
        return 0;

        /* Unwind in reverse order of acquisition. */
error_release_session:
        key_put(session_keyring);
error_release:
        key_put(uid_keyring);
error:
        up_write(&user_ns->keyring_sem);
        kleave(" = %d", ret);
        return ret;
}

/*
 * Find the user session keyring ("_uid_ses.<uid>") belonging to @cred's user
 * in the register keyring of @cred's user namespace.  Nothing is created if
 * it isn't there.
 *
 * Returns the keyring, or NULL if the namespace has no register keyring or
 * the search fails for any reason.  The caller in this file releases the
 * returned keyring with key_put() when it has finished with it.
 */
struct key *get_user_session_keyring_rcu(const struct cred *cred)
{
        struct key *reg = READ_ONCE(cred->user_ns->user_keyring_register);
        key_ref_t found;
        char name[20];

        /* Direct lookup of a keyring by exact description. */
        struct keyring_search_context ctx = {
                .index_key.type                = &key_type_keyring,
                .index_key.description        = name,
                .cred                        = cred,
                .match_data.cmp                = key_default_cmp,
                .match_data.raw_data        = name,
                .match_data.lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
                .flags                        = KEYRING_SEARCH_DO_STATE_CHECK,
        };

        if (!reg)
                return NULL;

        ctx.index_key.desc_len = snprintf(name, sizeof(name), "_uid_ses.%u",
                                          from_kuid(cred->user_ns,
                                                    cred->user->uid));

        found = keyring_search_rcu(make_key_ref(reg, true), &ctx);

        return IS_ERR(found) ? NULL : key_ref_to_ptr(found);
}

/*
 * Make sure the credentials in @new carry a thread keyring, allocating a
 * "_tid" keyring for them if they don't.  The allocation is permitted to
 * overrun the quota (KEY_ALLOC_QUOTA_OVERRUN).
 *
 * Return: 0 if a thread keyring is now present; -errno on failure.
 */
int install_thread_keyring_to_cred(struct cred *new)
{
        struct key *tid_keyring;

        if (!new->thread_keyring) {
                tid_keyring = keyring_alloc("_tid", new->uid, new->gid, new,
                                            KEY_POS_ALL | KEY_USR_VIEW,
                                            KEY_ALLOC_QUOTA_OVERRUN,
                                            NULL, NULL);
                if (IS_ERR(tid_keyring))
                        return PTR_ERR(tid_keyring);

                new->thread_keyring = tid_keyring;
        }

        return 0;
}

/*
 * Give the current task a thread keyring if it lacks one, by preparing a new
 * set of credentials, installing the keyring there and committing them.
 *
 * Return: 0 if a thread keyring is now present; -errno on failure.
 */
static int install_thread_keyring(void)
{
        struct cred *creds = prepare_creds();
        int err;

        if (!creds)
                return -ENOMEM;

        err = install_thread_keyring_to_cred(creds);
        if (err >= 0)
                return commit_creds(creds);

        /* Installation failed: discard the prepared credentials. */
        abort_creds(creds);
        return err;
}

/*
 * Make sure the credentials in @new carry a process keyring, allocating a
 * "_pid" keyring for them if they don't.  The allocation is permitted to
 * overrun the quota (KEY_ALLOC_QUOTA_OVERRUN).
 *
 * Return: 0 if a process keyring is now present; -errno on failure.
 */
int install_process_keyring_to_cred(struct cred *new)
{
        struct key *pid_keyring;

        if (!new->process_keyring) {
                pid_keyring = keyring_alloc("_pid", new->uid, new->gid, new,
                                            KEY_POS_ALL | KEY_USR_VIEW,
                                            KEY_ALLOC_QUOTA_OVERRUN,
                                            NULL, NULL);
                if (IS_ERR(pid_keyring))
                        return PTR_ERR(pid_keyring);

                new->process_keyring = pid_keyring;
        }

        return 0;
}

/*
 * Give the current task a process keyring if it lacks one, by preparing a new
 * set of credentials, installing the keyring there and committing them.
 *
 * Return: 0 if a process keyring is now present; -errno on failure.
 */
static int install_process_keyring(void)
{
        struct cred *creds = prepare_creds();
        int err;

        if (!creds)
                return -ENOMEM;

        err = install_process_keyring_to_cred(creds);
        if (err >= 0)
                return commit_creds(creds);

        /* Installation failed: discard the prepared credentials. */
        abort_creds(creds);
        return err;
}

/*
 * Set the session keyring of @cred to @keyring, dropping the reference on any
 * session keyring that was there before.  A reference is taken on @keyring
 * for @cred; the caller keeps its own.
 *
 * If @keyring is NULL, a new anonymous "_ses" keyring is allocated and
 * installed instead.  That allocation is charged in-quota when @cred already
 * had a session keyring and is allowed to overrun the quota otherwise.
 *
 * @cred can not be in use by any task yet.  May sleep.
 *
 * Return: 0 on success; -errno on failure.
 */
int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
{
        struct key *previous;
        unsigned long alloc_flags;

        might_sleep();

        if (keyring) {
                __key_get(keyring);
        } else {
                /* No keyring supplied: create an empty session keyring. */
                alloc_flags = cred->session_keyring ? KEY_ALLOC_IN_QUOTA
                                                    : KEY_ALLOC_QUOTA_OVERRUN;

                keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred,
                                        KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
                                        alloc_flags, NULL, NULL);
                if (IS_ERR(keyring))
                        return PTR_ERR(keyring);
        }

        /* Swap the new keyring in, then release the one it displaced. */
        previous = cred->session_keyring;
        cred->session_keyring = keyring;

        if (previous)
                key_put(previous);

        return 0;
}

/*
 * Make @keyring the session keyring of the current task, replacing whatever
 * was there, by preparing new credentials, installing the keyring in them and
 * committing them.  A NULL @keyring requests a new anonymous session keyring.
 *
 * Return: 0 on success; -errno on failure.
 */
static int install_session_keyring(struct key *keyring)
{
        struct cred *creds = prepare_creds();
        int err;

        if (!creds)
                return -ENOMEM;

        err = install_session_keyring_to_cred(creds, keyring);
        if (err >= 0)
                return commit_creds(creds);

        /* Installation failed: discard the prepared credentials. */
        abort_creds(creds);
        return err;
}

/*
 * Handle the fsuid changing.
 */
void key_fsuid_changed(struct cred *new_cred)
{
        /* update the ownership of the thread keyring */
        if (new_cred->thread_keyring) {
                down_write(&new_cred->thread_keyring->sem);
                new_cred->thread_keyring->uid = new_cred->fsuid;
                up_write(&new_cred->thread_keyring->sem);
        }
}

/*
 * Handle the fsgid changing.
 */
void key_fsgid_changed(struct cred *new_cred)
{
        /* update the ownership of the thread keyring */
        if (new_cred->thread_keyring) {
                down_write(&new_cred->thread_keyring->sem);
                new_cred->thread_keyring->gid = new_cred->fsgid;
                up_write(&new_cred->thread_keyring->sem);
        }
}

/*
 * Fold the error from one failed keyring search into the running outcome kept
 * by search_cred_keyrings_rcu().
 *
 * -ENOKEY always replaces *@best.  -EAGAIN is stored in *@best only while
 * nothing has been recorded there yet, so that it never displaces an earlier
 * -ENOKEY.  Any other error goes into *@other.
 */
static void search_cred_note_failure(key_ref_t failed, key_ref_t *best,
                                     key_ref_t *other)
{
        switch (PTR_ERR(failed)) {
        case -EAGAIN: /* no key */
                if (*best)
                        break;
                fallthrough;
        case -ENOKEY: /* negative key */
                *best = failed;
                break;
        default:
                *other = failed;
                break;
        }
}

/*
 * Search the process keyrings attached to the supplied cred for the first
 * matching key under RCU conditions (the caller must be holding the RCU read
 * lock).
 *
 * The search criteria are the type and the match function.  The description is
 * given to the match function as a parameter, but doesn't otherwise influence
 * the search.  Typically the match function will compare the description
 * parameter to the key's description.
 *
 * This can only search keyrings that grant Search permission to the supplied
 * credentials.  Keyrings linked to searched keyrings will also be searched if
 * they grant Search permission too.  Keys can only be found if they grant
 * Search permission to the credentials.
 *
 * The keyrings are tried in the order thread, process, then session.  If the
 * cred has no session keyring, the user session keyring is searched in its
 * place when one exists.
 *
 * Returns a pointer to the key with the key usage count incremented if
 * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only
 * matched negative keys.
 *
 * In the case of a successful return, the possession attribute is set on the
 * returned key reference.
 */
key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx)
{
        const struct cred *cred = ctx->cred;
        struct key *user_session;
        key_ref_t key_ref;

        /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
         * searchable, but we failed to find a key or we found a negative key;
         * otherwise we want to return a sample error (probably -EACCES) if
         * none of the keyrings were searchable
         *
         * in terms of priority: success > -ENOKEY > -EAGAIN > other error
         */
        key_ref_t ret = NULL;
        key_ref_t err = ERR_PTR(-EAGAIN);

        /* search the thread keyring first */
        if (cred->thread_keyring) {
                key_ref = keyring_search_rcu(
                        make_key_ref(cred->thread_keyring, 1), ctx);
                if (!IS_ERR(key_ref))
                        return key_ref;

                search_cred_note_failure(key_ref, &ret, &err);
        }

        /* search the process keyring second */
        if (cred->process_keyring) {
                key_ref = keyring_search_rcu(
                        make_key_ref(cred->process_keyring, 1), ctx);
                if (!IS_ERR(key_ref))
                        return key_ref;

                search_cred_note_failure(key_ref, &ret, &err);
        }

        if (cred->session_keyring) {
                /* search the session keyring */
                key_ref = keyring_search_rcu(
                        make_key_ref(cred->session_keyring, 1), ctx);
                if (!IS_ERR(key_ref))
                        return key_ref;

                search_cred_note_failure(key_ref, &ret, &err);
        } else {
                /* or search the user-session keyring */
                user_session = get_user_session_keyring_rcu(cred);
                if (user_session) {
                        key_ref = keyring_search_rcu(
                                make_key_ref(user_session, 1), ctx);
                        /* The keyring itself is no longer needed, whatever
                         * the search produced.
                         */
                        key_put(user_session);

                        if (!IS_ERR(key_ref))
                                return key_ref;

                        search_cred_note_failure(key_ref, &ret, &err);
                }
        }

        /* no key - decide on the error we're going to go for */
        return ret ? ret : err;
}

/*
 * Search the process keyrings attached to the supplied cred for the first
 * matching key in the manner of search_cred_keyrings_rcu(), but also search
 * the keys attached to the assumed authorisation key using its credentials if
 * one is available.
 *
 * The caller must be holding the RCU read lock.
 *
 * Return same as search_cred_keyrings_rcu().
 */
key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx)
{
        const struct cred *caller_cred;
        struct request_key_auth *rka;
        key_ref_t own_result, auth_result = ERR_PTR(-EACCES);

        own_result = search_cred_keyrings_rcu(ctx);
        if (!IS_ERR(own_result))
                return own_result;

        /* if this process has an instantiation authorisation key, then we also
         * search the keyrings of the process mentioned there
         * - we don't permit access to request_key auth keys via this method
         */
        if (ctx->cred->request_key_auth &&
            ctx->cred == current_cred() &&
            ctx->index_key.type != &key_type_request_key_auth) {
                caller_cred = ctx->cred;

                if (key_validate(caller_cred->request_key_auth) == 0) {
                        rka = caller_cred->request_key_auth->payload.data[0];

                        /* Temporarily search with the requester's creds.
                         * Only that cred's own keyrings are searched - this
                         * does not recurse into a further authorisation key.
                         */
                        ctx->cred = rka->cred;
                        auth_result = search_cred_keyrings_rcu(ctx);
                        ctx->cred = caller_cred;

                        if (!IS_ERR(auth_result))
                                return auth_result;
                }
        }

        /* no key - decide on the error we're going to go for */
        if (own_result == ERR_PTR(-ENOKEY) || auth_result == ERR_PTR(-ENOKEY))
                return ERR_PTR(-ENOKEY);
        if (own_result == ERR_PTR(-EACCES))
                return auth_result;
        return own_result;
}
/*
 * Match function used by lookup_user_key(): a candidate matches only if it is
 * the very key object that was stashed in the match data's raw_data pointer.
 */
bool lookup_user_key_possessed(const struct key *key,
                               const struct key_match_data *match_data)
{
        const void *target = match_data->raw_data;

        return target == key;
}

/*
 * Look up a key ID given us by userspace with a given permissions mask to get
 * the key it refers to.
 *
 * @id: a key serial number (which must be >= 1) or one of the KEY_SPEC_*
 *        special IDs handled below.
 * @lflags: KEY_LOOKUP_CREATE requests that a missing thread, process or
 *        session keyring be created; KEY_LOOKUP_PARTIAL permits a key that
 *        is still under construction to be returned.
 * @need_perm: the permission to check for on the key.  KEY_NEED_UNLINK skips
 *        the construction and validity checks; KEY_AUTHTOKEN_OVERRIDE and
 *        KEY_DEFER_PERM_CHECK tolerate a failed wait for construction.
 *
 * Flags can be passed to request that special keyrings be created if referred
 * to directly, to permit partially constructed keys to be found and to skip
 * validity and permission checks on the found key.
 *
 * Returns a pointer to the key with an incremented usage count if successful;
 * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond
 * to a key or the best found key was a negative key; -EKEYREVOKED or
 * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the
 * found key doesn't grant the requested permit or the LSM denied access to it;
 * or -ENOMEM if a special keyring couldn't be created.
 *
 * In the case of a successful return, the possession attribute is set on the
 * returned key reference.
 */
key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
                          enum key_need_perm need_perm)
{
        struct keyring_search_context ctx = {
                .match_data.cmp                = lookup_user_key_possessed,
                .match_data.lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
                .flags                        = (KEYRING_SEARCH_NO_STATE_CHECK |
                                           KEYRING_SEARCH_RECURSE),
        };
        struct request_key_auth *rka;
        struct key *key, *user_session;
        key_ref_t key_ref, skey_ref;
        int ret;

try_again:
        ctx.cred = get_current_cred();
        /* Default result for the paths below that bail out without setting
         * a more specific error.
         */
        key_ref = ERR_PTR(-ENOKEY);

        switch (id) {
        case KEY_SPEC_THREAD_KEYRING:
                if (!ctx.cred->thread_keyring) {
                        if (!(lflags & KEY_LOOKUP_CREATE))
                                goto error;

                        ret = install_thread_keyring();
                        if (ret < 0) {
                                key_ref = ERR_PTR(ret);
                                goto error;
                        }
                        goto reget_creds;
                }

                key = ctx.cred->thread_keyring;
                __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_PROCESS_KEYRING:
                if (!ctx.cred->process_keyring) {
                        if (!(lflags & KEY_LOOKUP_CREATE))
                                goto error;

                        ret = install_process_keyring();
                        if (ret < 0) {
                                key_ref = ERR_PTR(ret);
                                goto error;
                        }
                        goto reget_creds;
                }

                key = ctx.cred->process_keyring;
                __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_SESSION_KEYRING:
                if (!ctx.cred->session_keyring) {
                        /* always install a session keyring upon access if one
                         * doesn't exist yet */
                        /* NOTE(review): on the failures below (and on the
                         * look_up_user_keyrings() failures in the user and
                         * user-session cases) ret is not propagated, so the
                         * caller sees the -ENOKEY default rather than the
                         * underlying error - confirm that this is intended.
                         */
                        ret = look_up_user_keyrings(NULL, &user_session);
                        if (ret < 0)
                                goto error;
                        if (lflags & KEY_LOOKUP_CREATE)
                                ret = join_session_keyring(NULL);
                        else
                                ret = install_session_keyring(user_session);

                        key_put(user_session);
                        if (ret < 0)
                                goto error;
                        goto reget_creds;
                } else if (test_bit(KEY_FLAG_UID_KEYRING,
                                    &ctx.cred->session_keyring->flags) &&
                           lflags & KEY_LOOKUP_CREATE) {
                        /* The session keyring is flagged as a per-UID keyring
                         * and creation was requested: switch to a fresh
                         * anonymous session keyring.
                         */
                        ret = join_session_keyring(NULL);
                        if (ret < 0)
                                goto error;
                        goto reget_creds;
                }

                key = ctx.cred->session_keyring;
                __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_USER_KEYRING:
                ret = look_up_user_keyrings(&key, NULL);
                if (ret < 0)
                        goto error;
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_USER_SESSION_KEYRING:
                ret = look_up_user_keyrings(NULL, &key);
                if (ret < 0)
                        goto error;
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_GROUP_KEYRING:
                /* group keyrings are not yet supported */
                key_ref = ERR_PTR(-EINVAL);
                goto error;

        case KEY_SPEC_REQKEY_AUTH_KEY:
                key = ctx.cred->request_key_auth;
                if (!key)
                        goto error;

                __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;

        case KEY_SPEC_REQUESTOR_KEYRING:
                if (!ctx.cred->request_key_auth)
                        goto error;

                /* Read the auth key's payload under its semaphore, and only
                 * if the auth key hasn't been revoked.
                 */
                down_read(&ctx.cred->request_key_auth->sem);
                if (test_bit(KEY_FLAG_REVOKED,
                             &ctx.cred->request_key_auth->flags)) {
                        key_ref = ERR_PTR(-EKEYREVOKED);
                        key = NULL;
                } else {
                        rka = ctx.cred->request_key_auth->payload.data[0];
                        key = rka->dest_keyring;
                        __key_get(key);
                }
                up_read(&ctx.cred->request_key_auth->sem);
                if (!key)
                        goto error;
                key_ref = make_key_ref(key, 1);
                break;

        default:
                key_ref = ERR_PTR(-EINVAL);
                if (id < 1)
                        goto error;

                key = key_lookup(id);
                if (IS_ERR(key)) {
                        key_ref = ERR_CAST(key);
                        goto error;
                }

                /* Start out as not possessed. */
                key_ref = make_key_ref(key, 0);

                /* check to see if we possess the key */
                ctx.index_key                        = key->index_key;
                ctx.match_data.raw_data                = key;
                kdebug("check possessed");
                rcu_read_lock();
                skey_ref = search_process_keyrings_rcu(&ctx);
                rcu_read_unlock();
                kdebug("possessed=%p", skey_ref);

                if (!IS_ERR(skey_ref)) {
                        /* The search result carries its own reference, so
                         * drop the one taken by key_lookup() and use the
                         * search result from here on.
                         */
                        key_put(key);
                        key_ref = skey_ref;
                }

                break;
        }

        /* unlink does not use the nominated key in any way, so can skip all
         * the permission checks as it is only concerned with the keyring */
        if (need_perm != KEY_NEED_UNLINK) {
                if (!(lflags & KEY_LOOKUP_PARTIAL)) {
                        ret = wait_for_key_construction(key, true);
                        switch (ret) {
                        case -ERESTARTSYS:
                                goto invalid_key;
                        default:
                                if (need_perm != KEY_AUTHTOKEN_OVERRIDE &&
                                    need_perm != KEY_DEFER_PERM_CHECK)
                                        goto invalid_key;
                                /* Tolerated failure: carry on to the checks
                                 * below.
                                 */
                                break;
                        case 0:
                                break;
                        }
                } else if (need_perm != KEY_DEFER_PERM_CHECK) {
                        ret = key_validate(key);
                        if (ret < 0)
                                goto invalid_key;
                }

                ret = -EIO;
                if (!(lflags & KEY_LOOKUP_PARTIAL) &&
                    key_read_state(key) == KEY_IS_UNINSTANTIATED)
                        goto invalid_key;
        }

        /* check the permissions */
        ret = key_task_permission(key_ref, ctx.cred, need_perm);
        if (ret < 0)
                goto invalid_key;

        key->last_used_at = ktime_get_real_seconds();

        /* Common exit, also used on success: release the creds taken at
         * try_again and return whatever key_ref holds.
         */
error:
        put_cred(ctx.cred);
        return key_ref;

invalid_key:
        key_ref_put(key_ref);
        key_ref = ERR_PTR(ret);
        goto error;

        /* if we attempted to install a keyring, then it may have caused new
         * creds to be installed */
reget_creds:
        put_cred(ctx.cred);
        goto try_again;
}
EXPORT_SYMBOL(lookup_user_key);

/*
 * Join the named keyring as the session keyring if possible else attempt to
 * create a new one of that name and join that.
 *
 * If the name is NULL, an empty anonymous keyring will be installed as the
 * session keyring.
 *
 * Named session keyrings are joined with a mutex held to prevent the
 * keyrings from going away whilst the attempt is made to join them and also
 * to prevent a race in creating compatible session keyrings.
 *
 * Returns the serial number of the session keyring that was installed, 0 if
 * the named keyring already is the session keyring, or a negative errno.
 */
long join_session_keyring(const char *name)
{
        const struct cred *old;
        struct cred *new;
        struct key *keyring;
        long ret, serial;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        /* if no name is provided, install an anonymous keyring */
        if (!name) {
                ret = install_session_keyring_to_cred(new, NULL);
                if (ret < 0)
                        goto abort;

                /* Note the serial before the creds are handed over. */
                serial = new->session_keyring->serial;
                ret = commit_creds(new);
                return ret == 0 ? serial : ret;
        }

        /* allow the user to join or create a named keyring */
        mutex_lock(&key_session_mutex);

        /* look for an existing keyring of this name */
        keyring = find_keyring_by_name(name, false);
        if (IS_ERR(keyring)) {
                ret = PTR_ERR(keyring);
                if (ret != -ENOKEY)
                        goto unlock;

                /* not found - try and create a new one */
                keyring = keyring_alloc(
                        name, old->uid, old->gid, old,
                        KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK,
                        KEY_ALLOC_IN_QUOTA, NULL, NULL);
                if (IS_ERR(keyring)) {
                        ret = PTR_ERR(keyring);
                        goto unlock;
                }
        } else if (keyring == new->session_keyring) {
                /* Already the session keyring: nothing to change. */
                ret = 0;
                goto put_keyring;
        }

        /* we've got a keyring - now to install it */
        ret = install_session_keyring_to_cred(new, keyring);
        if (ret < 0)
                goto put_keyring;

        commit_creds(new);
        mutex_unlock(&key_session_mutex);

        ret = keyring->serial;
        key_put(keyring);
        return ret;

put_keyring:
        key_put(keyring);
unlock:
        mutex_unlock(&key_session_mutex);
abort:
        abort_creds(new);
        return ret;
}

/*
 * Replace a process's session keyring on behalf of one of its children when
 * the target process is about to resume userspace execution.
 *
 * @twork: the callback_head embedded as the rcu member of a struct cred; that
 *        cred is what gets committed as the current task's new credentials.
 *
 * The fields assigned below are copied from the task's current credentials.
 * session_keyring is not among them, so whatever the new cred already carries
 * there is what gets installed.
 * NOTE(review): presumably the session keyring was set in the new cred by
 * whoever queued this work - confirm against the caller.
 */
void key_change_session_keyring(struct callback_head *twork)
{
        const struct cred *old = current_cred();
        struct cred *new = container_of(twork, struct cred, rcu);

        /* The task is exiting: just discard the prepared creds. */
        if (unlikely(current->flags & PF_EXITING)) {
                put_cred(new);
                return;
        }

        /* User and group identity; a reference is taken on each of the user,
         * user namespace and group_info objects.
         */
        new->  uid        = old->  uid;
        new-> euid        = old-> euid;
        new-> suid        = old-> suid;
        new->fsuid        = old->fsuid;
        new->  gid        = old->  gid;
        new-> egid        = old-> egid;
        new-> sgid        = old-> sgid;
        new->fsgid        = old->fsgid;
        new->user        = get_uid(old->user);
        new->user_ns        = get_user_ns(old->user_ns);
        new->group_info        = get_group_info(old->group_info);

        /* Securebits and capability sets */
        new->securebits        = old->securebits;
        new->cap_inheritable        = old->cap_inheritable;
        new->cap_permitted        = old->cap_permitted;
        new->cap_effective        = old->cap_effective;
        new->cap_ambient        = old->cap_ambient;
        new->cap_bset                = old->cap_bset;

        /* Keyring state other than the session keyring; the thread and
         * process keyrings are shared with the old creds via key_get().
         */
        new->jit_keyring        = old->jit_keyring;
        new->thread_keyring        = key_get(old->thread_keyring);
        new->process_keyring        = key_get(old->process_keyring);

        security_transfer_creds(new, old);

        commit_creds(new);
}

/*
 * Late initcall: make sure that the user and user-session keyrings exist for
 * the UID this runs as.  Neither keyring is wanted back, so NULL is passed
 * for both outputs and look_up_user_keyrings() drops its references again.
 */
static int __init init_root_keyring(void)
{
        int ret;

        ret = look_up_user_keyrings(NULL, NULL);
        return ret;
}

late_initcall(init_root_keyring);























































    1 




























































    1 











    1 


    1 



















































































    1 





    1 















    1 

















































    1 
    1 




    1 






    1 
    1 
    1 

    1 








    1 
    1 
















    1 



















    1 


    1 












    1 


























    1 






    1 



    1 









    1 











    1 
















    1 



















    1 


    1 




























    1 


























    1 
    1 





    1 





    1 


    1 

    1 



    1 











    1 




    1 























































































    1 








    1 


    1 






















































































































    1 










































    1 


    1 















    1 
    1 




    1 
    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

/*
 * Report whether direct I/O may be used on @inode.
 *
 * Direct I/O is refused when the inode is encrypted (and CONFIG_FS_ENCRYPTION
 * is enabled), has fs-verity active, journals its data, or holds inline data.
 * The checks are evaluated in that order and stop at the first that matches.
 */
static bool ext4_dio_supported(struct inode *inode)
{
        return !((IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) ||
                 fsverity_active(inode) ||
                 ext4_should_journal_data(inode) ||
                 ext4_has_inline_data(inode));
}

/*
 * Direct I/O read.
 *
 * Takes the inode lock shared (failing with -EAGAIN instead of blocking when
 * IOCB_NOWAIT is set), then either issues the read through iomap_dio_rw() or,
 * if the inode cannot do direct I/O, falls back to the buffered read path.
 */
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        if (ext4_dio_supported(inode)) {
                nread = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
                                     is_sync_kiocb(iocb));
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        /*
         * Direct I/O is not possible on this inode, so the request is served
         * by buffered I/O instead.  IOCB_DIRECT has to be dropped first,
         * otherwise generic_file_read_iter() would take its own direct I/O
         * path.
         */
        inode_unlock_shared(inode);
        iocb->ki_flags &= ~IOCB_DIRECT;
        return generic_file_read_iter(iocb, to);
}

#ifdef CONFIG_FS_DAX
/*
 * DAX read.
 *
 * Takes the inode lock shared (failing with -EAGAIN instead of blocking when
 * IOCB_NOWAIT is set) and reads through dax_iomap_rw(); if the inode turns
 * out not to be DAX once the lock is held, the buffered read path is used.
 */
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        /*
         * The DAX state is rechecked now that the inode lock is held, since
         * from here on it cannot change anymore.
         */
        if (IS_DAX(inode)) {
                nread = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        /* DAX cannot be used after all: serve the read with buffered IO */
        inode_unlock_shared(inode);
        return generic_file_read_iter(iocb, to);
}
#endif

/*
 * Top-level read dispatcher.
 *
 * Fails with -EIO once the filesystem has been forcibly shut down and
 * returns 0 straight away for an empty iterator.  Otherwise the request is
 * routed to the DAX path (only when CONFIG_FS_DAX is built in and the inode
 * is DAX), to the direct I/O path when IOCB_DIRECT is set, or to
 * generic_file_read_iter().
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        if (!iov_iter_count(to))
                return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
        /* DAX takes precedence over IOCB_DIRECT. */
        if (IS_DAX(inode))
                return ext4_dax_read_iter(iocb, to);
#endif
        if (iocb->ki_flags & IOCB_DIRECT)
                return ext4_dio_read_iter(iocb, to);

        return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        unsigned long blockmask = inode->i_sb->s_blocksize - 1;

        /*
         * Any bit set under (s_blocksize - 1), in either the file position
         * or the iterator's alignment, marks the I/O as unaligned.
         */
        return ((pos | iov_iter_alignment(from)) & blockmask) != 0;
}

/*
 * Return true when a write of @len bytes at @offset would end beyond either
 * i_size or EXT4_I(inode)->i_disksize.
 */
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
        return offset + len > i_size_read(inode) ||
               offset + len > EXT4_I(inode)->i_disksize;
}

/*
 * Is IO overwriting allocated and initialized blocks?
 *
 * Returns false for any range reaching past i_size.  Otherwise the range is
 * looked up with ext4_map_blocks() (no handle, no flags) and true is returned
 * only if the whole range comes back as one mapping flagged EXT4_MAP_MAPPED.
 */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
        unsigned int blkbits = inode->i_blkbits;
        struct ext4_map_blocks map;
        int wanted, found;

        if (pos + len > i_size_read(inode))
                return false;

        map.m_lblk = pos >> blkbits;
        map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
        wanted = map.m_len;

        found = ext4_map_blocks(NULL, inode, &map, 0);
        /*
         * A result covering every requested block only tells us the blocks
         * have been preallocated, not that they are initialized.  The
         * m_flags check below is what excludes unwritten extents.
         */
        if (found != wanted)
                return false;

        return (map.m_flags & EXT4_MAP_MAPPED) != 0;
}

/*
 * ext4 wrapper around generic_write_checks().
 *
 * Returns -EPERM for immutable inodes, passes through any result <= 0 from
 * generic_write_checks(), and returns -EFBIG when a file that is not
 * extent-mapped is written at or beyond s_bitmap_maxbytes.  Otherwise returns
 * the number of bytes left in @from, after truncating it to the bitmap limit
 * where that applies.
 */
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
                                         struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct ext4_sb_info *sbi;
        ssize_t ret;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;

        /* Extent-mapped files need no further limiting here. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return iov_iter_count(from);

        /*
         * A bitmap-format file has a size limit smaller than s_maxbytes,
         * which only applies to extent-mapped files.
         */
        sbi = EXT4_SB(inode->i_sb);
        if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
                return -EFBIG;
        iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);

        return iov_iter_count(from);
}

/*
 * Run ext4_generic_write_checks() followed by file_modified().
 *
 * Returns the byte count from the generic checks on success, that same
 * result when it is <= 0, or the non-zero value returned by file_modified().
 */
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t count = ext4_generic_write_checks(iocb, from);
        ssize_t err;

        if (count <= 0)
                return count;

        err = file_modified(iocb->ki_filp);
        return err ? err : count;
}

/*
 * Buffered write.
 *
 * IOCB_NOWAIT is rejected with -EOPNOTSUPP.  The write checks and
 * generic_perform_write() run under the exclusive inode lock; once the lock
 * is dropped, a positive result advances ki_pos and is passed through
 * generic_write_sync().
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
                                        struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t written;

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

        inode_lock(inode);
        written = ext4_write_checks(iocb, from);
        if (written > 0) {
                /* backing_dev_info is set only for the duration of the write */
                current->backing_dev_info = inode_to_bdi(inode);
                written = generic_perform_write(iocb->ki_filp, from,
                                                iocb->ki_pos);
                current->backing_dev_info = NULL;
        }
        inode_unlock(inode);

        if (unlikely(written <= 0))
                return written;

        iocb->ki_pos += written;
        return generic_write_sync(iocb, written);
}

/*
 * Record the new inode size after an extending write.
 *
 * @inode:  inode that was written; its i_rwsem must be held for writing
 * @offset: file position the write started at
 * @count:  value the caller wants handed back on success
 *
 * Returns @count on success, or a negative error if the journal handle could
 * not be started or the inode could not be marked dirty.
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
                                           ssize_t count)
{
        handle_t *handle;

        lockdep_assert_held_write(&inode->i_rwsem);
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* Only dirty the inode when the size update reports a change. */
        if (ext4_update_inode_size(inode, offset + count)) {
                int ret = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(ret)) {
                        ext4_journal_stop(handle);
                        return ret;
                }
        }

        /* Take a still-linked inode off the orphan list under this handle. */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
        ext4_journal_stop(handle);

        return count;
}

/*
 * Clean up the inode after DIO or DAX extending write has completed and the
 * inode size has been updated using ext4_handle_inode_extension().
 *
 * @inode:      inode that was written; its i_rwsem must be held for writing
 * @need_trunc: true when the write did not fully succeed and the blocks past
 *              the current size must be truncated away
 */
static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
{
        lockdep_assert_held_write(&inode->i_rwsem);
        if (need_trunc) {
                ext4_truncate_failed_write(inode);
                /*
                 * If the truncate operation failed early, then the inode may
                 * still be on the orphan list. In that case, we need to try
                 * remove the inode from the in-memory linked list.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
                return;
        }
        /*
         * If i_disksize got extended either due to writeback of delalloc
         * blocks or extending truncate while the DIO was running we could fail
         * to cleanup the orphan list in ext4_handle_inode_extension(). Do it
         * now.
         */
        if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
                handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

                if (IS_ERR(handle)) {
                        /*
                         * The write has successfully completed. Not much to
                         * do with the error here so just cleanup the orphan
                         * list and hope for the best.
                         */
                        ext4_orphan_del(NULL, inode);
                        return;
                }
                ext4_orphan_del(handle, inode);
                ext4_journal_stop(handle);
        }
}

/*
 * Completion callback for direct I/O writes (installed in ext4_dio_write_ops).
 *
 * @iocb:  the I/O control block; ki_pos is read as the start of the write
 * @size:  number of bytes the I/O covered
 * @error: error reported for the I/O, 0 if none
 * @flags: IOMAP_DIO_* flags; IOMAP_DIO_UNWRITTEN requests extent conversion
 *
 * Returns @error (or the conversion error) on failure.  On success returns
 * @size when the write ended within both i_disksize and i_size, otherwise the
 * result of ext4_handle_inode_extension().
 */
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
                                 int error, unsigned int flags)
{
        loff_t pos = iocb->ki_pos;
        struct inode *inode = file_inode(iocb->ki_filp);

        /* Convert unwritten extents only for a successful, non-empty write. */
        if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
                error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
        if (error)
                return error;
        /*
         * Note that EXT4_I(inode)->i_disksize can get extended up to
         * inode->i_size while the I/O was running due to writeback of delalloc
         * blocks. But the code in ext4_iomap_alloc() is careful to use
         * zeroed/unwritten extents if this is possible; thus we won't leave
         * uninitialized blocks in a file even if we didn't succeed in writing
         * as much as we intended. Also we can race with truncate or write
         * expanding the file so we have to be a bit careful here.
         */
        if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
            pos + size <= i_size_read(inode))
                return size;
        return ext4_handle_inode_extension(inode, pos, size);
}

/* Direct I/O write callbacks: only the completion hook is provided. */
static const struct iomap_dio_ops ext4_dio_write_ops = {
        .end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with shared lock acquired then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring exclusive lock.
 *
 * - For unaligned_io we never take shared lock as it may cause data corruption
 *   when two unaligned IO tries to modify the same block e.g. while zeroing.
 *
 * - For extending writes case we don't take the shared lock, since it requires
 *   updating inode i_disksize and/or orphan handling with exclusive lock.
 *
 * - shared locking will only be true mostly with overwrites. Otherwise we will
 *   switch to exclusive i_rwsem lock.
 *
 * @iocb:         the I/O control block of the write
 * @from:         data to be written
 * @ilock_shared: in: whether the caller holds the inode lock shared;
 *                out: cleared if the lock was upgraded to exclusive
 * @extend:       out: set to true when the write extends the file (it is
 *                never reset to false here)
 *
 * Returns the number of bytes to write with the inode lock still held.  On a
 * return value <= 0 the inode lock has already been dropped.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
                                     bool *ilock_shared, bool *extend)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        loff_t offset;
        size_t count;
        ssize_t ret;

restart:
        ret = ext4_generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = ret;
        if (ext4_extending_io(inode, offset, count))
                *extend = true;
        /*
         * Determine whether the IO operation will overwrite allocated
         * and initialized blocks.
         * We need exclusive i_rwsem for changing security info
         * in file_modified().
         */
        if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
             !ext4_overwrite_io(inode, offset, count))) {
                /* Upgrading may block, which a NOWAIT request must not do. */
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        ret = -EAGAIN;
                        goto out;
                }
                /* The lock is dropped in between, so redo all the checks. */
                inode_unlock_shared(inode);
                *ilock_shared = false;
                inode_lock(inode);
                goto restart;
        }

        ret = file_modified(file);
        if (ret < 0)
                goto out;

        return count;
out:
        /* Failure, or nothing to write: release whichever lock mode is held. */
        if (*ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);
        return ret;
}

/*
 * Direct I/O write.
 *
 * Picks a shared or exclusive inode lock, falls back to the buffered path
 * when the inode cannot do direct I/O, and otherwise issues the write through
 * iomap_dio_rw().  Extending writes put the inode on the orphan list first
 * and are cleaned up afterwards.  Any part of @from left unwritten after a
 * non-negative result is completed with a buffered write, which is then
 * flushed and invalidated from the page cache.
 *
 * Returns the number of bytes written or a negative error; -EAGAIN is
 * returned for IOCB_NOWAIT requests that would have to block.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        handle_t *handle;
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(from);
        const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
        bool extend = false, unaligned_io = false;
        bool ilock_shared = true;

        /*
         * We initially start with shared inode lock unless it is
         * unaligned IO which needs exclusive lock anyways.
         */
        if (ext4_unaligned_io(inode, from, offset)) {
                unaligned_io = true;
                ilock_shared = false;
        }
        /*
         * Quick check here without any i_rwsem lock to see if it is extending
         * IO. A more reliable check is done in ext4_dio_write_checks() with
         * proper locking in place.
         */
        if (offset + count > i_size_read(inode))
                ilock_shared = false;

        /* NOWAIT requests only try the lock; everyone else may sleep on it. */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (ilock_shared) {
                        if (!inode_trylock_shared(inode))
                                return -EAGAIN;
                } else {
                        if (!inode_trylock(inode))
                                return -EAGAIN;
                }
        } else {
                if (ilock_shared)
                        inode_lock_shared(inode);
                else
                        inode_lock(inode);
        }

        /* Fallback to buffered I/O if the inode does not support direct I/O. */
        if (!ext4_dio_supported(inode)) {
                if (ilock_shared)
                        inode_unlock_shared(inode);
                else
                        inode_unlock(inode);
                return ext4_buffered_write_iter(iocb, from);
        }

        /* On ret <= 0 the checks have already dropped the inode lock. */
        ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
        if (ret <= 0)
                return ret;

        /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
        if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
                ret = -EAGAIN;
                goto out;
        }
        /*
         * Make sure inline data cannot be created anymore since we are going
         * to allocate blocks for DIO. We know the inode does not have any
         * inline data now because ext4_dio_supported() checked for that.
         */
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

        /* Re-read position and length: the checks may have adjusted them. */
        offset = iocb->ki_pos;
        count = ret;

        /*
         * Unaligned direct IO must be serialized among each other as zeroing
         * of partial blocks of two competing unaligned IOs can result in data
         * corruption.
         *
         * So we make sure we don't allow any unaligned IO in flight.
         * For IOs where we need not wait (like unaligned non-AIO DIO),
         * below inode_dio_wait() may anyway become a no-op, since we start
         * with exclusive lock.
         */
        if (unaligned_io)
                inode_dio_wait(inode);

        /* Put the inode on the orphan list before an extending write. */
        if (extend) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                ext4_journal_stop(handle);
        }

        /* Still holding the lock shared means this is a pure overwrite. */
        if (ilock_shared)
                iomap_ops = &ext4_iomap_overwrite_ops;
        ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
                           is_sync_kiocb(iocb) || unaligned_io || extend);
        /* -ENOTBLK is not an error: the rest goes through buffered I/O below. */
        if (ret == -ENOTBLK)
                ret = 0;
        if (extend) {
                /*
                 * We always perform extending DIO write synchronously so by
                 * now the IO is completed and ext4_handle_inode_extension()
                 * was called. Cleanup the inode in case of error or race with
                 * writeback of delalloc blocks.
                 */
                WARN_ON_ONCE(ret == -EIOCBQUEUED);
                ext4_inode_extension_cleanup(inode, ret < 0);
        }

out:
        if (ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);

        /* Finish whatever direct I/O left unwritten with a buffered write. */
        if (ret >= 0 && iov_iter_count(from)) {
                ssize_t err;
                loff_t endbyte;

                offset = iocb->ki_pos;
                err = ext4_buffered_write_iter(iocb, from);
                if (err < 0)
                        return err;

                /*
                 * We need to ensure that the pages within the page cache for
                 * the range covered by this I/O are written to disk and
                 * invalidated. This is in attempt to preserve the expected
                 * direct I/O semantics in the case we fallback to buffered I/O
                 * to complete off the I/O request.
                 */
                ret += err;
                endbyte = offset + err - 1;
                err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
                                                   offset, endbyte);
                /* A writeback failure skips invalidation but is not returned. */
                if (!err)
                        invalidate_mapping_pages(iocb->ki_filp->f_mapping,
                                                 offset >> PAGE_SHIFT,
                                                 endbyte >> PAGE_SHIFT);
        }

        return ret;
}

#ifdef CONFIG_FS_DAX
/*
 * DAX write.
 *
 * Runs entirely under the exclusive inode lock (IOCB_NOWAIT requests fail
 * with -EAGAIN if the lock is contended).  A write ending beyond i_disksize
 * first puts the inode on the orphan list; after dax_iomap_rw() the size is
 * updated and the inode cleaned up.  A positive result is passed through
 * generic_write_sync() once the lock is dropped.
 */
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        size_t count;
        loff_t offset;
        handle_t *handle;
        bool extend = false;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock(inode))
                        return -EAGAIN;
        } else {
                inode_lock(inode);
        }

        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = iov_iter_count(from);

        /* Extending write: add the inode to the orphan list before writing. */
        if (offset + count > EXT4_I(inode)->i_disksize) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                extend = true;
                ext4_journal_stop(handle);
        }

        ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

        if (extend) {
                /*
                 * The result of dax_iomap_rw() is passed on as the count;
                 * anything short of the full request triggers a truncate in
                 * the cleanup.
                 */
                ret = ext4_handle_inode_extension(inode, offset, ret);
                ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
        }
out:
        inode_unlock(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
#endif

/*
 * Top-level write dispatcher.
 *
 * Fails with -EIO once the filesystem has been forcibly shut down.  Otherwise
 * the write goes to the DAX path (only when CONFIG_FS_DAX is built in and the
 * inode is DAX), to the direct I/O path when IOCB_DIRECT is set, or to the
 * buffered path.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_write_iter(iocb, from);
#endif
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return ext4_buffered_write_iter(iocb, from);

        return ext4_dio_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
/*
 * Page fault handler for DAX mappings, for any entry size.
 *
 * @vmf:     fault description
 * @pe_size: size of the page table entry being faulted
 *
 * A write fault on a shared mapping is bracketed by sb_start_pagefault() /
 * sb_end_pagefault(), updates the file times and runs dax_iomap_fault() under
 * a journal handle; all other faults only take i_mmap_sem for reading.
 * Returns the vm_fault_t from the DAX code, or VM_FAULT_SIGBUS when the
 * journal handle cannot be started.
 */
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
{
        int error = 0;
        vm_fault_t result;
        int retries = 0;
        handle_t *handle = NULL;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct super_block *sb = inode->i_sb;

        /*
         * We have to distinguish real writes from writes which will result in a
         * COW page; COW writes should *not* poke the journal (the file will not
         * be changed). Doing so would cause unintended failures when mounted
         * read-only.
         *
         * We check for VM_SHARED rather than vmf->cow_page since the latter is
         * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
         * other sizes, dax_iomap_fault will handle splitting / fallback so that
         * we eventually come back with a COW page.
         */
        bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                (vmf->vma->vm_flags & VM_SHARED);
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vmf->vma->vm_file);
                down_read(&EXT4_I(inode)->i_mmap_sem);
                /* A retry restarts only the handle; i_mmap_sem stays held. */
retry:
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                               EXT4_DATA_TRANS_BLOCKS(sb));
                if (IS_ERR(handle)) {
                        up_read(&EXT4_I(inode)->i_mmap_sem);
                        sb_end_pagefault(sb);
                        return VM_FAULT_SIGBUS;
                }
        } else {
                down_read(&EXT4_I(inode)->i_mmap_sem);
        }
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ext4_journal_stop(handle);

                /* Out of space: retry if ext4_should_retry_alloc() allows it. */
                if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
                    ext4_should_retry_alloc(sb, &retries))
                        goto retry;
                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, pe_size, pfn);
                up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
        } else {
                up_read(&EXT4_I(inode)->i_mmap_sem);
        }

        return result;
}

/* PTE-sized DAX fault: ext4_dax_huge_fault() with PE_SIZE_PTE. */
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
        return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

/*
 * VM operations installed on DAX mappings by ext4_file_mmap().  The plain
 * fault, page_mkwrite and pfn_mkwrite hooks all go through the PTE-sized
 * handler; huge faults call ext4_dax_huge_fault() directly.
 */
static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault                = ext4_dax_fault,
        .huge_fault        = ext4_dax_huge_fault,
        .page_mkwrite        = ext4_dax_fault,
        .pfn_mkwrite        = ext4_dax_fault,
};
#else
/* Without CONFIG_FS_DAX the DAX ops are just an alias for the regular ones. */
#define ext4_dax_vm_ops        ext4_file_vm_ops
#endif

/* VM operations installed on non-DAX file mappings by ext4_file_mmap(). */
static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault                = ext4_filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

/*
 * Set up a memory mapping of an ext4 file.
 *
 * Returns -EIO once the filesystem has been forcibly shut down and
 * -EOPNOTSUPP when daxdev_mapping_supported() rejects the mapping.  Otherwise
 * marks the file accessed, installs the DAX or the regular VM operations
 * (DAX mappings also get VM_HUGEPAGE) and returns 0.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

        /*
         * Synchronous mappings are not supported for non-DAX files, nor for
         * DAX files whose underlying dax_device is not synchronous.
         */
        if (!daxdev_mapping_supported(vma, sbi->s_daxdev))
                return -EOPNOTSUPP;

        file_accessed(file);

        if (!IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_file_vm_ops;
                return 0;
        }

        vma->vm_ops = &ext4_dax_vm_ops;
        vma->vm_flags |= VM_HUGEPAGE;
        return 0;
}

/*
 * Record the mount point path in the superblock, at most once per mount.
 *
 * @sb:  superblock to update
 * @mnt: mount whose root path is sampled
 *
 * Does nothing and returns 0 when the path was already sampled, the
 * filesystem is read-only, sb_start_intwrite_trylock() fails, or d_path()
 * fails.  Otherwise returns 0 on success or the error from starting the
 * journal handle or getting write access to the superblock buffer.
 */
static int ext4_sample_last_mounted(struct super_block *sb,
                                    struct vfsmount *mnt)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct path path;
        char buf[64], *cp;
        handle_t *handle;
        int err;

        if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
                return 0;

        if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
                return 0;

        /* Set before the attempt, so a failure below is not retried later. */
        ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
        /*
         * Sample where the filesystem has been mounted and
         * store it in the superblock for sysadmin convenience
         * when trying to sort through large numbers of block
         * devices or filesystem images.
         */
        memset(buf, 0, sizeof(buf));
        path.mnt = mnt;
        path.dentry = mnt->mnt_root;
        cp = d_path(&path, buf, sizeof(buf));
        /* A d_path() failure is deliberately not reported to the caller. */
        err = 0;
        if (IS_ERR(cp))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        err = PTR_ERR(handle);
        if (IS_ERR(handle))
                goto out;
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
        if (err)
                goto out_journal;
        /*
         * NOTE(review): strncpy() leaves the destination unterminated when
         * the source fills it; presumably s_last_mounted is a fixed-width
         * on-disk field where that is intended - confirm.
         */
        strncpy(sbi->s_es->s_last_mounted, cp,
                sizeof(sbi->s_es->s_last_mounted));
        ext4_handle_dirty_super(handle, sb);
out_journal:
        ext4_journal_stop(handle);
out:
        sb_end_intwrite(sb);
        return err;
}

/*
 * Open hook for ext4 files.
 *
 * Fails with -EIO after a forced shutdown.  Otherwise it samples the mount
 * point, runs the fscrypt and fsverity open checks, attaches a jbd2 inode
 * for writable opens, flags the file as supporting NOWAIT and buffered
 * async reads, and finally hands over to dquot_file_open().  The first
 * failing step determines the return value.
 */
static int ext4_file_open(struct inode *inode, struct file *filp)
{
        int err;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        /* Each step runs only if every earlier one succeeded. */
        err = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
        if (!err)
                err = fscrypt_file_open(inode, filp);
        if (!err)
                err = fsverity_file_open(inode, filp);
        if (err)
                return err;

        /*
         * When the inode is opened for writing, set up its jbd2_inode
         * (needed if a journal is present).
         */
        if (filp->f_mode & FMODE_WRITE) {
                err = ext4_inode_attach_jinode(inode);
                if (err < 0)
                        return err;
        }

        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
        return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() copes with both block-mapped and extent-mapped files by
 * passing the per-inode maxbytes value to generic_file_llseek_size().
 *
 * SEEK_HOLE and SEEK_DATA are resolved through the iomap helpers while
 * holding the inode lock shared; the resulting offset is then installed
 * with vfs_setpos().  Every other whence value goes straight to
 * generic_file_llseek_size().
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes = ext4_get_maxbytes(inode);

        if (whence != SEEK_HOLE && whence != SEEK_DATA)
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));

        inode_lock_shared(inode);
        if (whence == SEEK_HOLE)
                offset = iomap_seek_hole(inode, offset,
                                         &ext4_iomap_report_ops);
        else
                offset = iomap_seek_data(inode, offset,
                                         &ext4_iomap_report_ops);
        inode_unlock_shared(inode);

        /* A negative result is an error code and is returned unchanged. */
        if (offset < 0)
                return offset;
        return vfs_setpos(file, offset, maxbytes);
}

/*
 * file_operations table for ext4 files.  It wires the VFS entry points to
 * the ext4 handlers (e.g. ext4_llseek() and ext4_file_open() above) and to
 * generic/iomap helpers.  The compat ioctl entry exists only when
 * CONFIG_COMPAT is enabled.
 */
const struct file_operations ext4_file_operations = {
        .llseek                = ext4_llseek,
        .read_iter        = ext4_file_read_iter,
        .write_iter        = ext4_file_write_iter,
        .iopoll                = iomap_dio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = ext4_compat_ioctl,
#endif
        .mmap                = ext4_file_mmap,
        .mmap_supported_flags = MAP_SYNC,
        .open                = ext4_file_open,
        .release        = ext4_release_file,
        .fsync                = ext4_sync_file,
        .get_unmapped_area = thp_get_unmapped_area,
        .splice_read        = generic_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = ext4_fallocate,
};

/*
 * inode_operations table for ext4 files: attribute get/set, extended
 * attribute listing, ACL get/set and fiemap handlers.
 */
const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_file_getattr,
        .listxattr        = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap                = ext4_fiemap,
};


















































































































































    1 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BYTEORDER_GENERIC_H
#define _LINUX_BYTEORDER_GENERIC_H

/*
 * linux/byteorder/generic.h
 * Generic Byte-reordering support
 *
 * The "... p" macros, like le64_to_cpup, can be used with pointers
 * to unaligned data, but there will be a performance penalty on 
 * some architectures.  Use get_unaligned for unaligned data.
 *
 * Francois-Rene Rideau <fare@tunes.org> 19970707
 *    gathered all the good ideas from all asm-foo/byteorder.h into one file,
 *    cleaned them up.
 *    I hope it is compliant with non-GCC compilers.
 *    I decided to put __BYTEORDER_HAS_U64__ in byteorder.h,
 *    because I wasn't sure it would be ok to put it in types.h
 *    Upgraded it to 2.1.43
 * Francois-Rene Rideau <fare@tunes.org> 19971012
 *    Upgraded it to 2.1.57
 *    to please Linus T., replaced huge #ifdef's between little/big endian
 *    by nestedly #include'd files.
 * Francois-Rene Rideau <fare@tunes.org> 19971205
 *    Made it to 2.1.71; now a facelift:
 *    Put files under include/linux/byteorder/
 *    Split swab from generic support.
 *
 * TODO:
 *   = Regular kernel maintainers could also replace all these manual
 *    byteswap macros that remain, disseminated among drivers,
 *    after some grep of the sources...
 *   = Linus might want to rename all these macros and files to fit his taste,
 *    to fit his personal naming scheme.
 *   = it seems that a few drivers would also appreciate
 *    nybble swapping support...
 *   = every architecture could add their byteswap macro in asm/byteorder.h
 *    see how some architectures already do (i386, alpha, ppc, etc)
 *   = cpu_to_beXX and beXX_to_cpu might some day need to be well
 *    distinguished throughout the kernel. This is not the case currently,
 *    since little endian, big endian, and pdp endian machines needn't it.
 *    But this might be the case for, say, a port of Linux to 20/21 bit
 *    architectures (and F21 Linux addict around?).
 */

/*
 * The following macros are to be defined by <asm/byteorder.h>:
 *
 * Conversion of long and short int between network and host format
 *        ntohl(__u32 x)
 *        ntohs(__u16 x)
 *        htonl(__u32 x)
 *        htons(__u16 x)
 * It seems that some programs (which? where? or perhaps a standard? POSIX?)
 * might like the above to be functions, not macros (why?).
 * if that's true, then detect them, and take measures.
 * Anyway, the measure is: define only ___ntohl as a macro instead,
 * and in a separate file, have
 * unsigned long inline ntohl(x){return ___ntohl(x);}
 *
 * The same for constant arguments
 *        __constant_ntohl(__u32 x)
 *        __constant_ntohs(__u16 x)
 *        __constant_htonl(__u32 x)
 *        __constant_htons(__u16 x)
 *
 * Conversion of XX-bit integers (16- 32- or 64-)
 * between native CPU format and little/big endian format
 * 64-bit stuff only defined for proper architectures
 *        cpu_to_[bl]eXX(__uXX x)
 *        [bl]eXX_to_cpu(__uXX x)
 *
 * The same, but takes a pointer to the value to convert
 *        cpu_to_[bl]eXXp(__uXX *x)
 *        [bl]eXX_to_cpup(__uXX *x)
 *
 * The same, but change in situ
 *        cpu_to_[bl]eXXs(__uXX *x)
 *        [bl]eXX_to_cpus(__uXX *x)
 *
 * See asm-foo/byteorder.h for examples of how to provide
 * architecture-optimized versions
 *
 */

/*
 * Public names for the byte-order helpers.  Every un-prefixed macro below
 * is a plain alias for the double-underscore implementation of the same
 * name.  Three groups follow: by-value conversions, the "p" variants and
 * the "s" variants.
 */
#define cpu_to_le64 __cpu_to_le64
#define le64_to_cpu __le64_to_cpu
#define cpu_to_le32 __cpu_to_le32
#define le32_to_cpu __le32_to_cpu
#define cpu_to_le16 __cpu_to_le16
#define le16_to_cpu __le16_to_cpu
#define cpu_to_be64 __cpu_to_be64
#define be64_to_cpu __be64_to_cpu
#define cpu_to_be32 __cpu_to_be32
#define be32_to_cpu __be32_to_cpu
#define cpu_to_be16 __cpu_to_be16
#define be16_to_cpu __be16_to_cpu
#define cpu_to_le64p __cpu_to_le64p
#define le64_to_cpup __le64_to_cpup
#define cpu_to_le32p __cpu_to_le32p
#define le32_to_cpup __le32_to_cpup
#define cpu_to_le16p __cpu_to_le16p
#define le16_to_cpup __le16_to_cpup
#define cpu_to_be64p __cpu_to_be64p
#define be64_to_cpup __be64_to_cpup
#define cpu_to_be32p __cpu_to_be32p
#define be32_to_cpup __be32_to_cpup
#define cpu_to_be16p __cpu_to_be16p
#define be16_to_cpup __be16_to_cpup
#define cpu_to_le64s __cpu_to_le64s
#define le64_to_cpus __le64_to_cpus
#define cpu_to_le32s __cpu_to_le32s
#define le32_to_cpus __le32_to_cpus
#define cpu_to_le16s __cpu_to_le16s
#define le16_to_cpus __le16_to_cpus
#define cpu_to_be64s __cpu_to_be64s
#define be64_to_cpus __be64_to_cpus
#define cpu_to_be32s __cpu_to_be32s
#define be32_to_cpus __be32_to_cpus
#define cpu_to_be16s __cpu_to_be16s
#define be16_to_cpus __be16_to_cpus

/*
 * They have to be macros in order to do the constant folding
 * correctly - if the argument is passed into an inline function
 * it is no longer constant according to gcc.
 */

/* Drop any earlier definitions before (re)defining the helpers below. */
#undef ntohl
#undef ntohs
#undef htonl
#undef htons

/*
 * Network order is treated as big-endian: each helper maps onto the
 * matching 16/32-bit big-endian conversion.
 */
#define ___htonl(x) __cpu_to_be32(x)
#define ___htons(x) __cpu_to_be16(x)
#define ___ntohl(x) __be32_to_cpu(x)
#define ___ntohs(x) __be16_to_cpu(x)

/* Public names, implemented via the triple-underscore macros above. */
#define htonl(x) ___htonl(x)
#define ntohl(x) ___ntohl(x)
#define htons(x) ___htons(x)
#define ntohs(x) ___ntohs(x)

/* Add the CPU-order value @val to the little-endian 16-bit value at @var. */
static inline void le16_add_cpu(__le16 *var, u16 val)
{
        u16 sum = le16_to_cpu(*var) + val;

        *var = cpu_to_le16(sum);
}

/* Add the CPU-order value @val to the little-endian 32-bit value at @var. */
static inline void le32_add_cpu(__le32 *var, u32 val)
{
        u32 sum = le32_to_cpu(*var) + val;

        *var = cpu_to_le32(sum);
}

/* Add the CPU-order value @val to the little-endian 64-bit value at @var. */
static inline void le64_add_cpu(__le64 *var, u64 val)
{
        u64 sum = le64_to_cpu(*var) + val;

        *var = cpu_to_le64(sum);
}

/*
 * Convert @words 32-bit values starting at @buf from little-endian to CPU
 * order, in place, one element at a time.
 * XXX: this stuff can be optimized
 */
static inline void le32_to_cpu_array(u32 *buf, unsigned int words)
{
        unsigned int i;

        for (i = 0; i < words; i++)
                __le32_to_cpus(&buf[i]);
}

/*
 * Convert @words 32-bit values starting at @buf from CPU order to
 * little-endian, in place, one element at a time.
 */
static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
{
        unsigned int i;

        for (i = 0; i < words; i++)
                __cpu_to_le32s(&buf[i]);
}

/* Add the CPU-order value @val to the big-endian 16-bit value at @var. */
static inline void be16_add_cpu(__be16 *var, u16 val)
{
        u16 sum = be16_to_cpu(*var) + val;

        *var = cpu_to_be16(sum);
}

/* Add the CPU-order value @val to the big-endian 32-bit value at @var. */
static inline void be32_add_cpu(__be32 *var, u32 val)
{
        u32 sum = be32_to_cpu(*var) + val;

        *var = cpu_to_be32(sum);
}

/* Add the CPU-order value @val to the big-endian 64-bit value at @var. */
static inline void be64_add_cpu(__be64 *var, u64 val)
{
        u64 sum = be64_to_cpu(*var) + val;

        *var = cpu_to_be64(sum);
}

/*
 * Copy @len 32-bit values from @src to @dst, converting each one from CPU
 * order to big-endian.
 *
 * The index has the same type as @len (size_t): a signed int index would be
 * compared against an unsigned length and would overflow for lengths above
 * INT_MAX.
 */
static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                dst[i] = cpu_to_be32(src[i]);
}

/*
 * Copy @len 32-bit values from @src to @dst, converting each one from
 * big-endian to CPU order.
 *
 * The index has the same type as @len (size_t): a signed int index would be
 * compared against an unsigned length and would overflow for lengths above
 * INT_MAX.
 */
static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                dst[i] = be32_to_cpu(src[i]);
}

#endif /* _LINUX_BYTEORDER_GENERIC_H */

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the UDP protocol.
 *
 * Version:        @(#)udp.h        1.0.2        04/28/93
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_UDP_H
#define _LINUX_UDP_H

#include <net/inet_sock.h>
#include <linux/skbuff.h>
#include <net/netns/hash.h>
#include <uapi/linux/udp.h>

/* Return the transport header of @skb viewed as a UDP header. */
static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
{
        void *th = skb_transport_header(skb);

        return (struct udphdr *)th;
}

/* Return the inner transport header of @skb viewed as a UDP header. */
static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb)
{
        void *th = skb_inner_transport_header(skb);

        return (struct udphdr *)th;
}

/*
 * Evaluates to 128 when CONFIG_BASE_SMALL is non-zero and to 256 otherwise;
 * going by the name, the lower bound on the UDP hash table size.
 */
#define UDP_HTABLE_SIZE_MIN                (CONFIG_BASE_SMALL ? 128 : 256)

/*
 * Hash @num for namespace @net: add the per-namespace hash mix and reduce
 * the sum with @mask.
 */
static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{
        u32 mixed = num + net_hash_mix(net);

        return mixed & mask;
}

/*
 * Per-socket UDP state.  The structure starts with an inet_sock, so a
 * pointer to the sock can be cast to a udp_sock (see udp_sk()).
 */
struct udp_sock {
        /* inet_sock has to be the first member */
        struct inet_sock inet;
/* Shorthands for hash fields stored in the embedded sock's common part */
#define udp_port_hash                inet.sk.__sk_common.skc_u16hashes[0]
#define udp_portaddr_hash        inet.sk.__sk_common.skc_u16hashes[1]
#define udp_portaddr_node        inet.sk.__sk_common.skc_portaddr_node
        int                 pending;        /* Any pending frames ? */
        unsigned int         corkflag;        /* Cork is required */
        __u8                 encap_type;        /* Is this an Encapsulation socket? */
        unsigned char         no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
                         no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
                         encap_enabled:1, /* This socket enabled encap
                                           * processing; UDP tunnels and
                                           * different encapsulation layer set
                                           * this
                                           */
                         gro_enabled:1,        /* Request GRO aggregation */
                         accept_udp_l4:1,        /* checked by udp_unexpected_gso() for SKB_GSO_UDP_L4 packets */
                         accept_udp_fraglist:1;        /* checked by udp_unexpected_gso() for SKB_GSO_FRAGLIST packets */
        /*
         * Following member retains the information to create a UDP header
         * when the socket is uncorked.
         */
        __u16                 len;                /* total length of pending frames */
        __u16                 gso_size;
        /*
         * Fields specific to UDP-Lite.
         */
        __u16                 pcslen;
        __u16                 pcrlen;
/* indicator bits used by pcflag: */
#define UDPLITE_BIT      0x1                  /* set by udplite proto init function */
#define UDPLITE_SEND_CC  0x2                  /* set via udplite setsockopt         */
#define UDPLITE_RECV_CC  0x4                /* set via udplite setsockopt         */
        __u8                 pcflag;        /* marks socket as UDP-Lite if > 0    */
        __u8                 unused[3];
        /*
         * For encapsulation sockets.
         */
        int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
        int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
        void (*encap_destroy)(struct sock *sk);

        /* GRO functions for UDP socket */
        struct sk_buff *        (*gro_receive)(struct sock *sk,
                                               struct list_head *head,
                                               struct sk_buff *skb);
        int                        (*gro_complete)(struct sock *sk,
                                                struct sk_buff *skb,
                                                int nhoff);

        /* udp_recvmsg try to use this before splicing sk_receive_queue */
        struct sk_buff_head        reader_queue ____cacheline_aligned_in_smp;

        /* This field is dirtied by udp_recvmsg() */
        int                forward_deficit;
};

/*
 * Evaluates to 64.
 * NOTE(review): the UL suffix is on the shift count, not on the shifted
 * value, so the expression has type int rather than unsigned long --
 * confirm that this is intended.
 */
#define UDP_MAX_SEGMENTS        (1 << 6UL)

/*
 * View @sk as the udp_sock that contains it.  This is a plain pointer cast
 * (it also drops the const qualifier of the argument).
 */
static inline struct udp_sock *udp_sk(const struct sock *sk)
{
        void *raw = (void *)sk;

        return (struct udp_sock *)raw;
}

/* Set or clear the no_check6_tx bit of the UDP socket @sk according to @val. */
static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
{
        udp_sk(sk)->no_check6_tx = val ? 1 : 0;
}

/* Set or clear the no_check6_rx bit of the UDP socket @sk according to @val. */
static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
{
        udp_sk(sk)->no_check6_rx = val ? 1 : 0;
}

/* Report whether the no_check6_tx bit is set on the UDP socket @sk. */
static inline bool udp_get_no_check6_tx(struct sock *sk)
{
        bool enabled = udp_sk(sk)->no_check6_tx;

        return enabled;
}

/* Report whether the no_check6_rx bit is set on the UDP socket @sk. */
static inline bool udp_get_no_check6_rx(struct sock *sk)
{
        bool enabled = udp_sk(sk)->no_check6_rx;

        return enabled;
}

/*
 * For an skb whose gso_type has SKB_GSO_UDP_L4 set, attach a SOL_UDP /
 * UDP_GRO control message to @msg carrying the skb's gso_size as an int.
 * Other skbs are left alone.  @sk is not used here.
 */
static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
                                 struct sk_buff *skb)
{
        int gso_size;

        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
                return;

        gso_size = skb_shinfo(skb)->gso_size;
        put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size);
}

/*
 * Static keys tested by udp_encap_needed().  The IPv6 key is declared only
 * when CONFIG_IPV6 is enabled.
 */
DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
#endif

/*
 * True when the UDP encap static key is enabled or, with CONFIG_IPV6, when
 * the UDPv6 encap static key is enabled.
 */
static inline bool udp_encap_needed(void)
{
#if IS_ENABLED(CONFIG_IPV6)
        return static_branch_unlikely(&udp_encap_needed_key) ||
               static_branch_unlikely(&udpv6_encap_needed_key);
#else
        return static_branch_unlikely(&udp_encap_needed_key);
#endif
}

/*
 * Decide whether the GSO packet @skb is one that socket @sk did not ask
 * for.  Non-GSO packets are never unexpected.  A GSO packet is unexpected
 * when:
 *  - it is SKB_GSO_UDP_L4 and the socket has accept_udp_l4 clear, or
 *  - it is SKB_GSO_FRAGLIST and the socket has accept_udp_fraglist clear, or
 *  - encapsulation is in use, the socket has an encap_rcv hook, and the
 *    packet carries neither UDP tunnel GSO bit.
 */
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
{
        if (!skb_is_gso(skb))
                return false;

        if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) &&
            !udp_sk(sk)->accept_udp_l4)
                return true;

        if ((skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) &&
            !udp_sk(sk)->accept_udp_fraglist)
                return true;

        /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still
         * land in a tunnel as the socket check in udp_gro_receive cannot be
         * foolproof.
         */
        if (!udp_encap_needed())
                return false;
        if (!READ_ONCE(udp_sk(sk)->encap_rcv))
                return false;

        return !(skb_shinfo(skb)->gso_type &
                 (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM));
}

/*
 * Iterate over the hlist @list with @__sk as cursor, following the
 * __sk_common.skc_portaddr_node links.
 */
#define udp_portaddr_for_each_entry(__sk, list) \
        hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)

/* Same iteration, built on hlist_for_each_entry_rcu(). */
#define udp_portaddr_for_each_entry_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)

/*
 * True when the sock pointed to by @__sk has sk_protocol set to
 * IPPROTO_UDPLITE.  The argument is parenthesised so that expressions such
 * as "p + 1" or "&s" can be passed safely.
 */
#define IS_UDPLITE(__sk) ((__sk)->sk_protocol == IPPROTO_UDPLITE)

#endif        /* _LINUX_UDP_H */































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Robert Elz at The University of Melbourne.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifndef _LINUX_QUOTA_
#define _LINUX_QUOTA_

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/percpu_counter.h>

#include <linux/dqblk_xfs.h>
#include <linux/dqblk_v1.h>
#include <linux/dqblk_v2.h>

#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/projid.h>
#include <uapi/linux/quota.h>

/*
 * Quota types as an enum.  The #undefs remove any macro definitions of the
 * same names first (presumably coming from the uapi header included just
 * above -- confirm) so that they can be declared as enumerators.
 */
#undef USRQUOTA
#undef GRPQUOTA
#undef PRJQUOTA
enum quota_type {
        USRQUOTA = 0,                /* element used for user quotas */
        GRPQUOTA = 1,                /* element used for group quotas */
        PRJQUOTA = 2,                /* element used for project quotas */
};

/*
 * Masks for quota types when used as a bitmask: bit N corresponds to the
 * enum quota_type value N.
 */
#define QTYPE_MASK_USR (1 << USRQUOTA)
#define QTYPE_MASK_GRP (1 << GRPQUOTA)
#define QTYPE_MASK_PRJ (1 << PRJQUOTA)

typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
typedef long long qsize_t;        /* Type in which we store sizes */

/*
 * Kernel-internal quota identifier: one of a kuid, kgid or kprojid stored
 * in an anonymous union, plus a type tag that says which member is valid.
 */
struct kqid {                        /* Type in which we store the quota identifier */
        union {
                kuid_t uid;
                kgid_t gid;
                kprojid_t projid;
        };
        enum quota_type type;  /* USRQUOTA (uid) or GRPQUOTA (gid) or PRJQUOTA (projid) */
};

extern bool qid_eq(struct kqid left, struct kqid right);
extern bool qid_lt(struct kqid left, struct kqid right);
extern qid_t from_kqid(struct user_namespace *to, struct kqid qid);
extern qid_t from_kqid_munged(struct user_namespace *to, struct kqid qid);
extern bool qid_valid(struct kqid qid);

/**
 *        make_kqid - Map a user-namespace, type, qid tuple into a kqid.
 *        @from: User namespace that the qid is in
 *        @type: The type of quota
 *        @qid: Quota identifier
 *
 *        Translates the (user-namespace, type, qid) tuple into a kernel
 *        internal kqid and returns it.  The id is mapped with make_kuid(),
 *        make_kgid() or make_kprojid() depending on @type; any other
 *        @type hits BUG().
 *
 *        When no mapping is defined for the tuple, an invalid kqid is
 *        returned.  Callers are expected to test for and handle invalid
 *        kqids; qid_valid() can be used for that test.
 */
static inline struct kqid make_kqid(struct user_namespace *from,
                                    enum quota_type type, qid_t qid)
{
        struct kqid result;

        result.type = type;
        if (type == USRQUOTA)
                result.uid = make_kuid(from, qid);
        else if (type == GRPQUOTA)
                result.gid = make_kgid(from, qid);
        else if (type == PRJQUOTA)
                result.projid = make_kprojid(from, qid);
        else
                BUG();

        return result;
}

/**
 *        make_kqid_invalid - Explicitly make an invalid kqid
 *        @type: The type of quota identifier
 *
 *        Returns an invalid kqid with the specified type.
 */
static inline struct kqid make_kqid_invalid(enum quota_type type)
{
        struct kqid kqid;

        kqid.type = type;
        switch (type) {
        case USRQUOTA:
                kqid.uid = INVALID_UID;
                break;
        case GRPQUOTA:
                kqid.gid = INVALID_GID;
                break;
        case PRJQUOTA:
                kqid.projid = INVALID_PROJID;
                break;
        default:
                BUG();
        }
        return kqid;
}

/**
 *        make_kqid_uid - Make a kqid from a kuid
 *        @uid: The kuid to make the quota identifier from
 *
 *        Returns a kqid of type USRQUOTA holding @uid.
 */
static inline struct kqid make_kqid_uid(kuid_t uid)
{
        struct kqid result;

        result.uid = uid;
        result.type = USRQUOTA;
        return result;
}

/**
 *        make_kqid_gid - Make a kqid from a kgid
 *        @gid: The kgid to make the quota identifier from
 *
 *        Returns a kqid of type GRPQUOTA holding @gid.
 */
static inline struct kqid make_kqid_gid(kgid_t gid)
{
        struct kqid result;

        result.gid = gid;
        result.type = GRPQUOTA;
        return result;
}

/**
 *        make_kqid_projid - Make a kqid from a projid
 *        @projid: The kprojid to make the quota identifier from
 *
 *        Returns a kqid of type PRJQUOTA holding @projid.
 */
static inline struct kqid make_kqid_projid(kprojid_t projid)
{
        struct kqid result;

        result.projid = projid;
        result.type = PRJQUOTA;
        return result;
}

/**
 *        qid_has_mapping - Report if a qid maps into a user namespace.
 *        @ns:  The user namespace to see if a value maps into.
 *        @qid: The kernel internal quota identifier to test.
 */
static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid)
{
        return from_kqid(ns, qid) != (qid_t) -1;
}


extern spinlock_t dq_data_lock;

/* Maximal numbers of writes for quota operation (insert/delete/update)
 * (over VFS all formats).  Each value is the larger of the corresponding
 * V1_* and V2_* constants. */
#define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC)
#define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE)
#define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC)
#define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE)

/*
 * Data for one user/group kept in memory: block and inode limits, current
 * usage and reservation, and the grace-time deadlines.  All sizes and
 * counts are qsize_t; the two time limits are time64_t.
 */
struct mem_dqblk {
        qsize_t dqb_bhardlimit;        /* absolute limit on disk blks alloc */
        qsize_t dqb_bsoftlimit;        /* preferred limit on disk blks */
        qsize_t dqb_curspace;        /* current used space */
        qsize_t dqb_rsvspace;   /* current reserved space for delalloc*/
        qsize_t dqb_ihardlimit;        /* absolute limit on allocated inodes */
        qsize_t dqb_isoftlimit;        /* preferred inode limit */
        qsize_t dqb_curinodes;        /* current # allocated inodes */
        time64_t dqb_btime;        /* time limit for excessive disk use */
        time64_t dqb_itime;        /* time limit for excessive inode use */
};

/*
 * Data for one quotafile kept in memory.  Names in square brackets in the
 * field comments give the lock protecting that field.
 */
struct quota_format_type;

struct mem_dqinfo {
        struct quota_format_type *dqi_format;
        int dqi_fmt_id;                /* Id of the dqi_format - used when turning
                                 * quotas on after remount RW */
        struct list_head dqi_dirty_list;        /* List of dirty dquots [dq_list_lock] */
        unsigned long dqi_flags;        /* DQF_ flags [dq_data_lock] */
        unsigned int dqi_bgrace;        /* Space grace time [dq_data_lock] */
        unsigned int dqi_igrace;        /* Inode grace time [dq_data_lock] */
        qsize_t dqi_max_spc_limit;        /* Maximum space limit [static] */
        qsize_t dqi_max_ino_limit;        /* Maximum inode limit [static] */
        void *dqi_priv;
};

struct super_block;

/* Mask for flags passed to userspace: DQF_ROOT_SQUASH and DQF_SYS_FILE */
#define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE)
/* Mask for flags modifiable from userspace: DQF_ROOT_SQUASH only */
#define DQF_SETINFO_MASK DQF_ROOT_SQUASH

/*
 * DQF_INFO_DIRTY_B is a bit number (equal to DQF_PRIVATE) and
 * DQF_INFO_DIRTY is the matching mask; info_dirty() tests this bit in
 * mem_dqinfo.dqi_flags.
 */
enum {
        DQF_INFO_DIRTY_B = DQF_PRIVATE,
};
#define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B)        /* Is info dirty? */

extern void mark_info_dirty(struct super_block *sb, int type);
/* Return non-zero when the DQF_INFO_DIRTY_B bit is set in @info->dqi_flags. */
static inline int info_dirty(struct mem_dqinfo *info)
{
        int dirty = test_bit(DQF_INFO_DIRTY_B, &info->dqi_flags);

        return dirty;
}

/*
 * Indices into the arrays of struct dqstats.  _DQST_DQSTAT_LAST is not a
 * statistic itself; it is used as the array size.
 */
enum {
        DQST_LOOKUPS,
        DQST_DROPS,
        DQST_READS,
        DQST_WRITES,
        DQST_CACHE_HITS,
        DQST_ALLOC_DQUOTS,
        DQST_FREE_DQUOTS,
        DQST_SYNCS,
        _DQST_DQSTAT_LAST
};

/*
 * Quota statistics: one slot per DQST_* index in each array.  The percpu
 * counters are the ones updated by dqstats_inc()/dqstats_dec().
 */
struct dqstats {
        unsigned long stat[_DQST_DQSTAT_LAST];
        struct percpu_counter counter[_DQST_DQSTAT_LAST];
};

extern struct dqstats dqstats;

/* Increment the per-cpu counter for statistic @type (a DQST_* index). */
static inline void dqstats_inc(unsigned int type)
{
        struct percpu_counter *ctr = &dqstats.counter[type];

        percpu_counter_inc(ctr);
}

/* Decrement the per-cpu counter for statistic @type (a DQST_* index). */
static inline void dqstats_dec(unsigned int type)
{
        struct percpu_counter *ctr = &dqstats.counter[type];

        percpu_counter_dec(ctr);
}

/* Bit numbers for the dq_flags field of struct dquot */
#define DQ_MOD_B        0        /* dquot modified since read */
#define DQ_BLKS_B        1        /* uid/gid has been warned about blk limit */
#define DQ_INODES_B        2        /* uid/gid has been warned about inode limit */
#define DQ_FAKE_B        3        /* no limits only usage */
#define DQ_READ_B        4        /* dquot was read into memory */
#define DQ_ACTIVE_B        5        /* dquot is active (dquot_release not called) */
#define DQ_RELEASING_B        6        /* dquot is in releasing_dquots list waiting
                                 * to be cleaned up */
#define DQ_LASTSET_B        7        /* Following 6 bits (see QIF_) are reserved\
                                 * for the mask of entries set via SETQUOTA\
                                 * quotactl. They are set under dq_data_lock\
                                 * and the quota format handling dquot can\
                                 * clear them when it sees fit. */

/*
 * In-memory quota structure for one quota identifier on one superblock.
 * Names in square brackets in the field comments give the lock protecting
 * that field.
 */
struct dquot {
        struct hlist_node dq_hash;        /* Hash list in memory [dq_list_lock] */
        struct list_head dq_inuse;        /* List of all quotas [dq_list_lock] */
        struct list_head dq_free;        /* Free list element [dq_list_lock] */
        struct list_head dq_dirty;        /* List of dirty dquots [dq_list_lock] */
        struct mutex dq_lock;                /* dquot IO lock */
        spinlock_t dq_dqb_lock;                /* Lock protecting dq_dqb changes */
        atomic_t dq_count;                /* Use count */
        struct super_block *dq_sb;        /* superblock this applies to */
        struct kqid dq_id;                /* ID this applies to (uid, gid, projid) */
        loff_t dq_off;                        /* Offset of dquot on disk [dq_lock, stable once set] */
        unsigned long dq_flags;                /* See DQ_* */
        struct mem_dqblk dq_dqb;        /* Diskquota usage [dq_dqb_lock] */
};

/*
 * Operations which must be implemented by each quota format.
 * The first four callbacks work on a quota file, identified by superblock
 * and quota type; the next three work on a single dquot; get_next_id takes
 * a superblock and a kqid pointer.  All of them return int.
 */
struct quota_format_ops {
        int (*check_quota_file)(struct super_block *sb, int type);        /* Detect whether file is in our format */
        int (*read_file_info)(struct super_block *sb, int type);        /* Read main info about file - called on quotaon() */
        int (*write_file_info)(struct super_block *sb, int type);        /* Write main info about file */
        int (*free_file_info)(struct super_block *sb, int type);        /* Called on quotaoff() */
        int (*read_dqblk)(struct dquot *dquot);                /* Read structure for one user */
        int (*commit_dqblk)(struct dquot *dquot);        /* Write structure for one user */
        int (*release_dqblk)(struct dquot *dquot);        /* Called when last reference to dquot is being dropped */
        int (*get_next_id)(struct super_block *sb, struct kqid *qid);        /* Get next ID with existing structure in the quota file */
};

/*
 * Operations working with dquots.
 *
 * A table of callbacks; each member's comment states when the callback is
 * used.  Callbacks taking a struct dquot * act on that one dquot, the
 * others are keyed by superblock or inode.
 */
struct dquot_operations {
        int (*write_dquot) (struct dquot *);                /* Ordinary dquot write */
        struct dquot *(*alloc_dquot)(struct super_block *, int);        /* Allocate memory for new dquot */
        void (*destroy_dquot)(struct dquot *);                /* Free memory for dquot */
        int (*acquire_dquot) (struct dquot *);                /* Quota is going to be created on disk */
        int (*release_dquot) (struct dquot *);                /* Quota is going to be deleted from disk */
        int (*mark_dirty) (struct dquot *);                /* Dquot is marked dirty */
        int (*write_info) (struct super_block *, int);        /* Write of quota "superblock" */
        /* get reserved quota for delayed alloc, value returned is managed by
         * quota code only */
        qsize_t *(*get_reserved_space) (struct inode *);
        int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
        /* Get number of inodes that were charged for a given inode */
        int (*get_inode_usage) (struct inode *, qsize_t *);
        /* Get next ID with active quota structure */
        int (*get_next_id) (struct super_block *sb, struct kqid *qid);
};

struct path;

/*
 * Structure for communicating via ->get_dqblk() & ->set_dqblk().
 *
 * Carries limits, usage, timers and warning counts for inodes, regular
 * space and realtime (RT) space.  For ->set_dqblk(), d_fieldmask selects
 * the members to change, using the QC_* field specifiers.
 */
struct qc_dqblk {
        int d_fieldmask;        /* mask of fields to change in ->set_dqblk() */
        u64 d_spc_hardlimit;        /* absolute limit on used space */
        u64 d_spc_softlimit;        /* preferred limit on used space */
        u64 d_ino_hardlimit;        /* maximum # allocated inodes */
        u64 d_ino_softlimit;        /* preferred inode limit */
        u64 d_space;                /* Space owned by the user */
        u64 d_ino_count;        /* # inodes owned by the user */
        s64 d_ino_timer;        /* zero if within inode limits */
                                /* if not, we refuse service */
        s64 d_spc_timer;        /* similar to above; for space */
        int d_ino_warns;        /* # warnings issued wrt num inodes */
        int d_spc_warns;        /* # warnings issued wrt used space */
        u64 d_rt_spc_hardlimit;        /* absolute limit on realtime space */
        u64 d_rt_spc_softlimit;        /* preferred limit on RT space */
        u64 d_rt_space;                /* realtime space owned */
        s64 d_rt_spc_timer;        /* similar to above; for RT space */
        int d_rt_spc_warns;        /* # warnings issued wrt RT space */
};

/*
 * Field specifiers for ->set_dqblk() in struct qc_dqblk and also for
 * ->set_info() in struct qc_info
 *
 * Each specifier is a single bit; the *_MASK macros OR together the
 * specifiers of one group.
 */
/* Soft and hard limits (bits 0-5) */
#define        QC_INO_SOFT        (1<<0)
#define        QC_INO_HARD        (1<<1)
#define        QC_SPC_SOFT        (1<<2)
#define        QC_SPC_HARD        (1<<3)
#define        QC_RT_SPC_SOFT        (1<<4)
#define        QC_RT_SPC_HARD        (1<<5)
#define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \
                       QC_RT_SPC_SOFT | QC_RT_SPC_HARD)
/* Timers (bits 6-8) */
#define        QC_SPC_TIMER        (1<<6)
#define        QC_INO_TIMER        (1<<7)
#define        QC_RT_SPC_TIMER        (1<<8)
#define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER)
/* Warning counters (bits 9-11) */
#define        QC_SPC_WARNS        (1<<9)
#define        QC_INO_WARNS        (1<<10)
#define        QC_RT_SPC_WARNS        (1<<11)
#define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS)
/* Usage accounting values (bits 12-14) */
#define        QC_SPACE        (1<<12)
#define        QC_INO_COUNT        (1<<13)
#define        QC_RT_SPACE        (1<<14)
#define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE)
/* Bit 15; not part of any of the group masks above */
#define QC_FLAGS        (1<<15)

/* QCI_* flag bits, stored in qc_type_state.flags and qc_info.i_flags */
#define QCI_SYSFILE                (1 << 0)        /* Quota file is hidden from userspace */
#define QCI_ROOT_SQUASH                (1 << 1)        /* Root squash turned on */
#define QCI_ACCT_ENABLED        (1 << 2)        /* Quota accounting enabled */
#define QCI_LIMITS_ENFORCED        (1 << 3)        /* Quota limits enforced */

/*
 * Structures for communicating via ->get_state
 *
 * struct qc_type_state holds the state of a single quota type; struct
 * qc_state embeds one of these per type.
 */
struct qc_type_state {
        unsigned int flags;                /* Flags QCI_* */
        unsigned int spc_timelimit;        /* Time after which space softlimit is
                                         * enforced */
        unsigned int ino_timelimit;        /* Ditto for inode softlimit */
        unsigned int rt_spc_timelimit;        /* Ditto for real-time space */
        unsigned int spc_warnlimit;        /* Limit for number of space warnings */
        unsigned int ino_warnlimit;        /* Ditto for inodes */
        unsigned int rt_spc_warnlimit;        /* Ditto for real-time space */
        unsigned long long ino;                /* Inode number of quota file */
        blkcnt_t blocks;                /* Number of 512-byte blocks in the file */
        blkcnt_t nextents;                /* Number of extents in the file */
};

/*
 * Result structure filled in by ->get_state: one global counter plus a
 * struct qc_type_state for each of the MAXQUOTAS quota types.
 */
struct qc_state {
        unsigned int s_incoredqs;        /* Number of dquots in core */
        struct qc_type_state s_state[MAXQUOTAS];  /* Per quota type information */
};

/*
 * Structure for communicating via ->set_info
 *
 * i_fieldmask selects which of the other members are to be applied; per the
 * QC_* field-specifier comment it is built from the QC_* bits.
 */
struct qc_info {
        int i_fieldmask;        /* mask of fields to change in ->set_info() */
        unsigned int i_flags;                /* Flags QCI_* */
        unsigned int i_spc_timelimit;        /* Time after which space softlimit is
                                         * enforced */
        unsigned int i_ino_timelimit;        /* Ditto for inode softlimit */
        unsigned int i_rt_spc_timelimit;/* Ditto for real-time space */
        unsigned int i_spc_warnlimit;        /* Limit for number of space warnings */
        unsigned int i_ino_warnlimit;        /* Limit for number of inode warnings */
        unsigned int i_rt_spc_warnlimit;        /* Ditto for real-time space */
};

/*
 * Operations handling requests from userspace
 *
 * Every callback takes the superblock being operated on as its first
 * argument.  The data-carrying callbacks use the qc_* structures:
 * ->set_info takes a struct qc_info, ->get_dqblk / ->get_nextdqblk /
 * ->set_dqblk take a struct qc_dqblk, and ->get_state fills a
 * struct qc_state.
 * NOTE(review): the unnamed int / unsigned int arguments are presumably a
 * quota type, format id or flag mask depending on the callback - confirm
 * against the callers before relying on that.
 */
struct quotactl_ops {
        int (*quota_on)(struct super_block *, int, int, const struct path *);
        int (*quota_off)(struct super_block *, int);
        int (*quota_enable)(struct super_block *, unsigned int);
        int (*quota_disable)(struct super_block *, unsigned int);
        int (*quota_sync)(struct super_block *, int);
        int (*set_info)(struct super_block *, int, struct qc_info *);
        int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
        /* Takes the kqid by pointer, unlike ->get_dqblk */
        int (*get_nextdqblk)(struct super_block *, struct kqid *,
                             struct qc_dqblk *);
        int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
        int (*get_state)(struct super_block *, struct qc_state *);
        int (*rm_xquota)(struct super_block *, unsigned int);
};

/*
 * Description of one quota format: its id, its operations table and the
 * module providing it.  This is the type passed to register_quota_format()
 * and unregister_quota_format().
 */
struct quota_format_type {
        int qf_fmt_id;        /* Quota format id */
        const struct quota_format_ops *qf_ops;        /* Operations of format */
        struct module *qf_owner;                /* Module implementing quota format */
        struct quota_format_type *qf_next;        /* Next format; presumably links the registered formats - confirm */
};

/*
 * Quota state flags.
 *
 * Each of the per-type states enumerated below owns MAXQUOTAS consecutive
 * bits, one per quota type: the bit for state S and quota type T is
 * 1 << (S * MAXQUOTAS + T).  The DQUOT_* state macros name the bit of
 * type 0; the bits of the other types are reached by shifting left by the
 * type number.
 *
 * Example layout for MAXQUOTAS == 2:
 *                                USRQUOTA        GRPQUOTA
 *  DQUOT_USAGE_ENABLED                0x0001                0x0002
 *  DQUOT_LIMITS_ENABLED        0x0004                0x0008
 *  DQUOT_SUSPENDED                0x0010                0x0020
 *
 * Flags that are not per-type follow the typed ones, starting at bit
 * DQUOT_STATE_LAST.  Again for MAXQUOTAS == 2:
 *  DQUOT_QUOTA_SYS_FILE        0x0040
 *  DQUOT_NEGATIVE_USAGE        0x0080
 *  DQUOT_NOLIST_DIRTY                0x0100
 */
enum {
        /* Track disk usage for users */
        _DQUOT_USAGE_ENABLED = 0,
        /* Enforce quota limits for users */
        _DQUOT_LIMITS_ENABLED,
        /*
         * User diskquotas are off, but we have necessary info in memory to
         * turn them on
         */
        _DQUOT_SUSPENDED,
        /* Number of per-type states; not a state itself */
        _DQUOT_STATE_FLAGS
};

/* Type-0 bit of each per-type state */
#define DQUOT_USAGE_ENABLED        (1 << (_DQUOT_USAGE_ENABLED * MAXQUOTAS))
#define DQUOT_LIMITS_ENABLED        (1 << (_DQUOT_LIMITS_ENABLED * MAXQUOTAS))
#define DQUOT_SUSPENDED                (1 << (_DQUOT_SUSPENDED * MAXQUOTAS))

/* All three type-0 state bits together */
#define DQUOT_STATE_FLAGS \
        (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | DQUOT_SUSPENDED)

/* Other quota flags: first bit number after the per-type state bits */
#define DQUOT_STATE_LAST        (_DQUOT_STATE_FLAGS * MAXQUOTAS)

/*
 * Quota file is a special system file and user cannot touch it.
 * Filesystem is responsible for setting S_NOQUOTA, S_NOATIME flags.
 */
#define DQUOT_QUOTA_SYS_FILE        (1 << DQUOT_STATE_LAST)

/* Allow negative quota usage */
#define DQUOT_NEGATIVE_USAGE        (1 << (DQUOT_STATE_LAST + 1))

/* Do not track dirty dquots in a list */
#define DQUOT_NOLIST_DIRTY        (1 << (DQUOT_STATE_LAST + 2))

/*
 * Turn generic (type-0) state flag bits into the bits for quota type @type
 * by shifting them left by @type positions.
 */
static inline unsigned int dquot_state_flag(unsigned int flags, int type)
{
        unsigned int typed = flags;

        typed <<= type;
        return typed;
}

/*
 * Inverse of dquot_state_flag(): shift @flags right by @type positions and
 * keep only the bits present in DQUOT_STATE_FLAGS.
 */
static inline unsigned int dquot_generic_flag(unsigned int flags, int type)
{
        unsigned int shifted = flags >> type;

        return shifted & DQUOT_STATE_FLAGS;
}

/*
 * Bitmap of quota types where flag is set in flags.
 *
 * @flag must be a power of two (checked at build time).  Dividing by it
 * moves the bit of type 0 down to bit 0; the result is then limited to the
 * low MAXQUOTAS bits, one per quota type.
 */
static __always_inline unsigned dquot_state_types(unsigned flags, unsigned flag)
{
        unsigned per_type;

        BUILD_BUG_ON_NOT_POWER_OF_2(flag);
        per_type = flags / flag;
        return per_type & ((1 << MAXQUOTAS) - 1);
}

#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
/* Declared here and defined elsewhere when the netlink interface is built. */
extern void quota_send_warning(struct kqid qid, dev_t dev,
                               const char warntype);
#else
/* Without CONFIG_QUOTA_NETLINK_INTERFACE this is an empty stub. */
static inline void quota_send_warning(struct kqid qid, dev_t dev,
                                      const char warntype)
{
}
#endif /* CONFIG_QUOTA_NETLINK_INTERFACE */

/*
 * Quota state of one device: a flags word plus, for each of the MAXQUOTAS
 * quota types, the quota file inode, the in-memory info and the format
 * operations in use.
 */
struct quota_info {
        unsigned int flags;                        /* Flags for diskquotas on this device */
        struct rw_semaphore dqio_sem;                /* Lock quota file while I/O in progress */
        struct inode *files[MAXQUOTAS];                /* inodes of quotafiles */
        struct mem_dqinfo info[MAXQUOTAS];        /* Information for each quota type */
        const struct quota_format_ops *ops[MAXQUOTAS];        /* Operations for each type */
};

int register_quota_format(struct quota_format_type *fmt);
void unregister_quota_format(struct quota_format_type *fmt);

/* Pairs a quota format id with the name of a module. */
struct quota_module_name {
        int qm_fmt_id;                /* Quota format id (QFMT_*) */
        char *qm_mod_name;        /* Module name string */
};

/*
 * Initializer for an array of struct quota_module_name.  QFMT_VFS_V0 and
 * QFMT_VFS_V1 both map to "quota_v2"; the list ends with a {0, NULL}
 * terminator entry.
 */
#define INIT_QUOTA_MODULE_NAMES {\
        {QFMT_VFS_OLD, "quota_v1"},\
        {QFMT_VFS_V0, "quota_v2"},\
        {QFMT_VFS_V1, "quota_v2"},\
        {0, NULL}}

#endif /* _QUOTA_ */
























































































































































    1 




    1 




    1 




























































































































































































































































































    1 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Authentication token and access key management
 *
 * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/security/keys/core.rst for information on keys/keyrings.
 */

#ifndef _LINUX_KEY_H
#define _LINUX_KEY_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/sysctl.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/assoc_array.h>
#include <linux/refcount.h>
#include <linux/time64.h>

#ifdef __KERNEL__
#include <linux/uidgid.h>

/* key handle serial number */
typedef int32_t key_serial_t;

/* key handle permissions mask */
typedef uint32_t key_perm_t;

struct key;
struct net;

#ifdef CONFIG_KEYS

#undef KEY_DEBUGGING

/*
 * Permission bits of a key_perm_t mask.
 *
 * The mask holds four 6-bit groups, one per byte: possessor (bits 24-29),
 * user (bits 16-21), group (bits 8-13) and other (bits 0-5).  Within each
 * group the bits are, from the lowest: view, read, write, search, link,
 * setattr.  The *_ALL macros cover all six bits of their group.
 */
#define KEY_POS_VIEW        0x01000000        /* possessor can view a key's attributes */
#define KEY_POS_READ        0x02000000        /* possessor can read key payload / view keyring */
#define KEY_POS_WRITE        0x04000000        /* possessor can update key payload / add link to keyring */
#define KEY_POS_SEARCH        0x08000000        /* possessor can find a key in search / search a keyring */
#define KEY_POS_LINK        0x10000000        /* possessor can create a link to a key/keyring */
#define KEY_POS_SETATTR        0x20000000        /* possessor can set key attributes */
#define KEY_POS_ALL        0x3f000000

#define KEY_USR_VIEW        0x00010000        /* user permissions... */
#define KEY_USR_READ        0x00020000
#define KEY_USR_WRITE        0x00040000
#define KEY_USR_SEARCH        0x00080000
#define KEY_USR_LINK        0x00100000
#define KEY_USR_SETATTR        0x00200000
#define KEY_USR_ALL        0x003f0000

#define KEY_GRP_VIEW        0x00000100        /* group permissions... */
#define KEY_GRP_READ        0x00000200
#define KEY_GRP_WRITE        0x00000400
#define KEY_GRP_SEARCH        0x00000800
#define KEY_GRP_LINK        0x00001000
#define KEY_GRP_SETATTR        0x00002000
#define KEY_GRP_ALL        0x00003f00

#define KEY_OTH_VIEW        0x00000001        /* third party permissions... */
#define KEY_OTH_READ        0x00000002
#define KEY_OTH_WRITE        0x00000004
#define KEY_OTH_SEARCH        0x00000008
#define KEY_OTH_LINK        0x00000010
#define KEY_OTH_SETATTR        0x00000020
#define KEY_OTH_ALL        0x0000003f

/* All 32 bits set, which is not a combination of the groups above */
#define KEY_PERM_UNDEF        0xffffffff

/*
 * The permissions required on a key that we're looking up.
 *
 * Values are assigned implicitly, starting at 0 for KEY_NEED_UNSPECIFIED.
 * The last three enumerators are special cases rather than permissions.
 * This is the type of the need_perm argument of lookup_user_key().
 */
enum key_need_perm {
        KEY_NEED_UNSPECIFIED,        /* Needed permission unspecified */
        KEY_NEED_VIEW,                /* Require permission to view attributes */
        KEY_NEED_READ,                /* Require permission to read content */
        KEY_NEED_WRITE,                /* Require permission to update / modify */
        KEY_NEED_SEARCH,        /* Require permission to search (keyring) or find (key) */
        KEY_NEED_LINK,                /* Require permission to link */
        KEY_NEED_SETATTR,        /* Require permission to change attributes */
        KEY_NEED_UNLINK,        /* Require permission to unlink key */
        KEY_SYSADMIN_OVERRIDE,        /* Special: override by CAP_SYS_ADMIN */
        KEY_AUTHTOKEN_OVERRIDE,        /* Special: override by possession of auth token */
        KEY_DEFER_PERM_CHECK,        /* Special: permission check is deferred */
};

struct seq_file;
struct user_struct;
struct signal_struct;
struct cred;

struct key_type;
struct key_owner;
struct key_tag;
struct keyring_list;
struct keyring_name;

/*
 * Tag identifying a key's domain of operation (see the domain_tag members
 * of struct keyring_index_key and struct key).
 */
struct key_tag {
        struct rcu_head                rcu;                /* presumably for RCU-deferred freeing - confirm */
        refcount_t                usage;                /* reference count on the tag */
        bool                        removed;        /* T when subject removed */
};

/*
 * Index key of a key.  The anonymous union lets the description length and
 * the first few description characters be read as one unsigned long (x).
 */
struct keyring_index_key {
        /* [!] If this structure is altered, the union in struct key must change too! */
        unsigned long                hash;                        /* Hash value */
        union {
                /* Member order depends on endianness so desc_len lands in the LSB of x */
                struct {
#ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */
                        u16        desc_len;
                        char        desc[sizeof(long) - 2];        /* First few chars of description */
#else
                        char        desc[sizeof(long) - 2];        /* First few chars of description */
                        u16        desc_len;
#endif
                };
                unsigned long x;        /* desc_len and desc[] viewed as a single word */
        };
        struct key_type                *type;                /* Type of key */
        struct key_tag                *domain_tag;        /* Domain of operation */
        const char                *description;        /* Full description string */
};

/*
 * Payload storage of a key: either a single RCU-annotated pointer
 * (rcu_data0) or four plain pointers (data[]) sharing the same storage.
 */
union key_payload {
        void __rcu                *rcu_data0;        /* accessed by the dereference_key_*() / rcu_assign_keypointer() macros */
        void                        *data[4];
};

/*****************************************************************************/
/*
 * Key references carrying a "possessed" attribute.
 *
 * key_ref_t is a pointer to a structure type that is deliberately never
 * defined: the bottom bit of the value is borrowed as a flag saying whether
 * the calling process possesses the key in one of its keyrings, so the value
 * must not be dereferenced directly.  Keeping it a distinct type lets the
 * compiler reject any attempt to do so without converting it first.
 *
 * The three helpers below build a reference and take it apart again.
 */
typedef struct __key_reference_with_attributes *key_ref_t;

/* Combine a key pointer and the possession flag into one reference. */
static inline key_ref_t make_key_ref(const struct key *key,
                                     bool possession)
{
        unsigned long bits = (unsigned long) key;

        bits |= possession;
        return (key_ref_t) bits;
}

/* Recover the key pointer by clearing the possession bit. */
static inline struct key *key_ref_to_ptr(const key_ref_t key_ref)
{
        unsigned long bits = (unsigned long) key_ref;

        bits &= ~1UL;
        return (struct key *) bits;
}

/* Report whether the possession bit is set in the reference. */
static inline bool is_key_possessed(const key_ref_t key_ref)
{
        unsigned long bits = (unsigned long) key_ref;

        return (bits & 1UL) != 0;
}

/*
 * Callback type for checking a link to a restricted keyring.  It is given
 * the destination keyring, the type and payload of the key concerned and a
 * restriction key.  restrict_link_reject() has this signature.
 * NOTE(review): presumably returns 0 to permit the link and a negative
 * errno to refuse it - confirm.
 */
typedef int (*key_restrict_link_func_t)(struct key *dest_keyring,
                                        const struct key_type *type,
                                        const union key_payload *payload,
                                        struct key *restriction_key);

/*
 * A keyring link restriction, as pointed to by the restrict_link member of
 * struct key.
 */
struct key_restriction {
        key_restrict_link_func_t check;        /* the checking callback */
        struct key *key;                /* presumably handed to check() as restriction_key - confirm */
        struct key_type *keytype;
};

/*
 * Non-negative values of the state member of struct key.  That member is
 * documented as holding a key state when positive or a rejection error when
 * negative; key_is_negative() tests for the latter.
 */
enum key_state {
        KEY_IS_UNINSTANTIATED,                /* 0 */
        KEY_IS_POSITIVE,                /* Positively instantiated */
};

/*****************************************************************************/
/*
 * authentication token / access credential / keyring
 * - types of key include:
 *   - keyrings
 *   - disk encryption IDs
 *   - Kerberos TGTs and tickets
 */
struct key {
        refcount_t                usage;                /* number of references */
        key_serial_t                serial;                /* key serial number */
        union {
                struct list_head graveyard_link;
                struct rb_node        serial_node;
        };
#ifdef CONFIG_KEY_NOTIFICATIONS
        struct watch_list        *watchers;        /* Entities watching this key for changes */
#endif
        struct rw_semaphore        sem;                /* change vs change sem */
        struct key_user                *user;                /* owner of this key */
        void                        *security;        /* security data for this key */
        union {
                time64_t        expiry;                /* time at which key expires (or 0) */
                time64_t        revoked_at;        /* time at which key was revoked */
        };
        time64_t                last_used_at;        /* last time used for LRU keyring discard */
        kuid_t                        uid;                /* presumably the owning user ID - confirm */
        kgid_t                        gid;                /* presumably the owning group ID - confirm */
        key_perm_t                perm;                /* access permissions */
        unsigned short                quotalen;        /* length added to quota */
        unsigned short                datalen;        /* payload data length
                                                 * - may not match RCU dereferenced payload
                                                 * - payload should contain own length
                                                 */
        short                        state;                /* Key state (+) or rejection error (-);
                                                 * read via key_read_state()
                                                 */

#ifdef KEY_DEBUGGING
        unsigned                magic;
#define KEY_DEBUG_MAGIC                0x18273645u
#endif

        /* The KEY_FLAG_* values below are bit numbers within flags, not masks */
        unsigned long                flags;                /* status flags (change with bitops) */
#define KEY_FLAG_DEAD                0        /* set if key type has been deleted */
#define KEY_FLAG_REVOKED        1        /* set if key had been revoked */
#define KEY_FLAG_IN_QUOTA        2        /* set if key consumes quota */
#define KEY_FLAG_USER_CONSTRUCT        3        /* set if key is being constructed in userspace */
#define KEY_FLAG_ROOT_CAN_CLEAR        4        /* set if key can be cleared by root without permission */
#define KEY_FLAG_INVALIDATED        5        /* set if key has been invalidated */
#define KEY_FLAG_BUILTIN        6        /* set if key is built in to the kernel */
#define KEY_FLAG_ROOT_CAN_INVAL        7        /* set if key can be invalidated by root without permission */
#define KEY_FLAG_KEEP                8        /* set if key should not be removed */
#define KEY_FLAG_UID_KEYRING        9        /* set if key is a user or user session keyring */

        /* the key type and key description string
         * - the desc is used to match a key against search criteria
         * - it should be a printable string
         * - eg: for krb5 AFS, this might be "afs@REDHAT.COM"
         *
         * The anonymous struct mirrors struct keyring_index_key member for
         * member (len_desc overlays its desc_len/desc union), so the two
         * must be changed together.
         */
        union {
                struct keyring_index_key index_key;
                struct {
                        unsigned long        hash;
                        unsigned long        len_desc;
                        struct key_type        *type;                /* type of key */
                        struct key_tag        *domain_tag;        /* Domain of operation */
                        char                *description;
                };
        };

        /* key data
         * - this is used to hold the data actually used in cryptography or
         *   whatever
         */
        union {
                union key_payload payload;
                struct {
                        /* Keyring bits */
                        struct list_head name_link;
                        struct assoc_array keys;
                };
        };

        /* This is set on a keyring to restrict the addition of a link to a key
         * to it.  If this structure isn't provided then it is assumed that the
         * keyring is open to any addition.  It is ignored for non-keyring
         * keys. Only set this value using keyring_restrict(), keyring_alloc(),
         * or key_alloc().
         *
         * This is intended for use with rings of trusted keys whereby addition
         * to the keyring needs to be controlled.  KEY_ALLOC_BYPASS_RESTRICTION
         * overrides this, allowing the kernel to add extra keys without
         * restriction.
         */
        struct key_restriction *restrict_link;
};

extern struct key *key_alloc(struct key_type *type,
                             const char *desc,
                             kuid_t uid, kgid_t gid,
                             const struct cred *cred,
                             key_perm_t perm,
                             unsigned long flags,
                             struct key_restriction *restrict_link);


/*
 * Key allocation flags.  KEY_ALLOC_IN_QUOTA is the zero value; every other
 * flag is a distinct single bit.
 * NOTE(review): presumably these are the values for the flags argument of
 * key_alloc() and keyring_alloc() - confirm against the callers.
 */
#define KEY_ALLOC_IN_QUOTA                0x0000        /* add to quota, reject if would overrun */
#define KEY_ALLOC_QUOTA_OVERRUN                0x0001        /* add to quota, permit even if overrun */
#define KEY_ALLOC_NOT_IN_QUOTA                0x0002        /* not in quota */
#define KEY_ALLOC_BUILT_IN                0x0004        /* Key is built into kernel */
#define KEY_ALLOC_BYPASS_RESTRICTION        0x0008        /* Override the check on restricted keyrings */
#define KEY_ALLOC_UID_KEYRING                0x0010        /* allocating a user or user session keyring */
#define KEY_ALLOC_SET_KEEP                0x0020        /* Set the KEEP flag on the key/keyring */

extern void key_revoke(struct key *key);
extern void key_invalidate(struct key *key);
extern void key_put(struct key *key);
extern bool key_put_tag(struct key_tag *tag);
extern void key_remove_domain(struct key_tag *domain_tag);

static inline struct key *__key_get(struct key *key)
{
        refcount_inc(&key->usage);
        return key;
}

/*
 * NULL-tolerant reference grab: a NULL @key is returned unchanged, any
 * other key has its usage count raised through __key_get().
 */
static inline struct key *key_get(struct key *key)
{
        if (!key)
                return key;

        return __key_get(key);
}

/*
 * Drop a reference held through a key_ref_t: convert the reference back to
 * the plain key pointer and pass that to key_put().
 */
static inline void key_ref_put(key_ref_t key_ref)
{
        struct key *target = key_ref_to_ptr(key_ref);

        key_put(target);
}

extern struct key *request_key_tag(struct key_type *type,
                                   const char *description,
                                   struct key_tag *domain_tag,
                                   const char *callout_info);

extern struct key *request_key_rcu(struct key_type *type,
                                   const char *description,
                                   struct key_tag *domain_tag);

extern struct key *request_key_with_auxdata(struct key_type *type,
                                            const char *description,
                                            struct key_tag *domain_tag,
                                            const void *callout_info,
                                            size_t callout_len,
                                            void *aux);

/**
 * request_key - Request a key and wait for construction
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 *
 * Behaves as request_key_tag() called with a NULL domain tag, i.e. with
 * the default global domain tag.
 */
static inline struct key *request_key(struct key_type *type,
                                      const char *description,
                                      const char *callout_info)
{
        struct key *found;

        found = request_key_tag(type, description, NULL, callout_info);
        return found;
}

#ifdef CONFIG_NET
/**
 * request_key_net - Request a key for a net namespace and wait for construction
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @net: The network namespace that is the key's domain of operation.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 *
 * As for request_key() except that it does not add the returned key to a
 * keyring if found, new keys are always allocated in the user's quota, the
 * callout_info must be a NUL-terminated string and no auxiliary data can be
 * passed.  Only keys that operate in the specified network namespace are used.
 *
 * Furthermore, it then works as wait_for_key_construction() to wait for the
 * completion of keys undergoing construction with a non-interruptible wait.
 *
 * This is a macro that expands to a single request_key_tag() call
 * expression, passing @net's key_domain as the domain tag.  The expansion
 * deliberately carries no trailing semicolon, so the macro can be used
 * inside larger expressions and in if/else bodies, and @net is
 * parenthesized so that any pointer expression may be passed.
 */
#define request_key_net(type, description, net, callout_info) \
        request_key_tag(type, description, (net)->key_domain, callout_info)

/**
 * request_key_net_rcu - Request a key for a net namespace under RCU conditions
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @net: The network namespace that is the key's domain of operation.
 *
 * As for request_key_rcu() except that only keys that operate in the
 * specified network namespace are used.
 *
 * This is a macro that expands to a single request_key_rcu() call
 * expression, passing @net's key_domain as the domain tag.  The expansion
 * deliberately carries no trailing semicolon, so the macro can be used
 * inside larger expressions and in if/else bodies, and @net is
 * parenthesized so that any pointer expression may be passed.
 */
#define request_key_net_rcu(type, description, net) \
        request_key_rcu(type, description, (net)->key_domain)
#endif /* CONFIG_NET */

extern int wait_for_key_construction(struct key *key, bool intr);

extern int key_validate(const struct key *key);

extern key_ref_t key_create_or_update(key_ref_t keyring,
                                      const char *type,
                                      const char *description,
                                      const void *payload,
                                      size_t plen,
                                      key_perm_t perm,
                                      unsigned long flags);

extern int key_update(key_ref_t key,
                      const void *payload,
                      size_t plen);

extern int key_link(struct key *keyring,
                    struct key *key);

extern int key_move(struct key *key,
                    struct key *from_keyring,
                    struct key *to_keyring,
                    unsigned int flags);

extern int key_unlink(struct key *keyring,
                      struct key *key);

extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
                                 const struct cred *cred,
                                 key_perm_t perm,
                                 unsigned long flags,
                                 struct key_restriction *restrict_link,
                                 struct key *dest);

extern int restrict_link_reject(struct key *keyring,
                                const struct key_type *type,
                                const union key_payload *payload,
                                struct key *restriction_key);

extern int keyring_clear(struct key *keyring);

extern key_ref_t keyring_search(key_ref_t keyring,
                                struct key_type *type,
                                const char *description,
                                bool recurse);

extern int keyring_add_key(struct key *keyring,
                           struct key *key);

extern int keyring_restrict(key_ref_t keyring, const char *type,
                            const char *restriction);

extern struct key *key_lookup(key_serial_t id);

/*
 * Serial number of @key, or 0 when @key is NULL.
 */
static inline key_serial_t key_serial(const struct key *key)
{
        if (!key)
                return 0;

        return key->serial;
}

extern void key_set_timeout(struct key *, unsigned);

extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
                                 enum key_need_perm need_perm);
extern void key_free_user_ns(struct user_namespace *);

/*
 * Read @key's state field with an acquire load.
 */
static inline short key_read_state(const struct key *key)
{
        short state;

        /* Barrier versus mark_key_instantiated(). */
        state = smp_load_acquire(&key->state);

        return state;
}

/**
 * key_is_positive - Test whether a key is positively instantiated
 * @key: The key to examine.
 *
 * Return: true when the state read from @key equals KEY_IS_POSITIVE,
 * false for any other state.
 */
static inline bool key_is_positive(const struct key *key)
{
        const short state = key_read_state(key);

        return state == KEY_IS_POSITIVE;
}

/**
 * key_is_negative - Test whether a key is in a negative state
 * @key: The key to examine.
 *
 * Return: true when the state read from @key is below zero, false otherwise.
 */
static inline bool key_is_negative(const struct key *key)
{
        const short state = key_read_state(key);

        return state < 0;
}

/*
 * Accessors for the payload pointer stored in (KEY)->payload.rcu_data0.
 *
 * dereference_key_rcu()    - read the pointer with rcu_dereference().
 * dereference_key_locked() - read the pointer with rcu_dereference_protected(),
 *                            passing rwsem_is_locked() on the key's ->sem as
 *                            the protection condition.
 * rcu_assign_keypointer()  - publish PAYLOAD with rcu_assign_pointer().
 */
#define dereference_key_rcu(KEY)                                        \
        (rcu_dereference((KEY)->payload.rcu_data0))

/* The cast to (struct key *) is applied before taking the address of ->sem. */
#define dereference_key_locked(KEY)                                        \
        (rcu_dereference_protected((KEY)->payload.rcu_data0,                \
                                   rwsem_is_locked(&((struct key *)(KEY))->sem)))

/* Wrapped in do { } while (0) so it behaves as a single statement. */
#define rcu_assign_keypointer(KEY, PAYLOAD)                                \
do {                                                                        \
        rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));        \
} while (0)

#ifdef CONFIG_SYSCTL
extern struct ctl_table key_sysctls[];
#endif
/*
 * the userspace interface
 */
extern int install_thread_keyring_to_cred(struct cred *cred);
extern void key_fsuid_changed(struct cred *new_cred);
extern void key_fsgid_changed(struct cred *new_cred);
extern void key_init(void);

#else /* CONFIG_KEYS */

/*
 * Stub versions of the key API for builds in which this #else branch is
 * selected (CONFIG_KEYS not set, per the surrounding #else/#endif comments).
 * Value-returning calls expand to 0 or NULL; the rest expand to empty
 * statements.  None of the stubs evaluates its arguments.
 */
#define key_validate(k)                        0
#define key_serial(k)                        0
#define key_get(k)                         ({ NULL; })
#define key_revoke(k)                        do { } while(0)
#define key_invalidate(k)                do { } while(0)
#define key_put(k)                        do { } while(0)
#define key_ref_put(k)                        do { } while(0)
#define make_key_ref(k, p)                NULL
#define key_ref_to_ptr(k)                NULL
#define is_key_possessed(k)                0
#define key_fsuid_changed(c)                do { } while(0)
#define key_fsgid_changed(c)                do { } while(0)
#define key_init()                        do { } while(0)
#define key_free_user_ns(ns)                do { } while(0)
#define key_remove_domain(d)                do { } while(0)

#endif /* CONFIG_KEYS */
#endif /* __KERNEL__ */
#endif /* _LINUX_KEY_H */









































    1 













    1 
    1 









    1 





    1 














    1 








    1 


    1 














    1 


    1 














    1 


    1 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * fs/inotify_user.c - inotify support for userspace
 *
 * Authors:
 *        John McCutchan        <ttb@tentacle.dhs.org>
 *        Robert Love        <rml@novell.com>
 *
 * Copyright (C) 2005 John McCutchan
 * Copyright 2006 Hewlett-Packard Development Company, L.P.
 *
 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 * inotify was largely rewritten to make use of the fsnotify infrastructure
 */

#include <linux/dcache.h> /* d_unlinked */
#include <linux/fs.h> /* struct inode */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/path.h> /* struct path */
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
#include <linux/sched/mm.h>

#include "inotify.h"

/*
 * Report whether two events carry identical information: same mask, same
 * watch descriptor and same name.  An old event with FS_IN_IGNORED set
 * never compares equal.
 */
static bool event_compare(struct fsnotify_event *old_fsn,
                          struct fsnotify_event *new_fsn)
{
        struct inotify_event_info *prev = INOTIFY_E(old_fsn);
        struct inotify_event_info *next = INOTIFY_E(new_fsn);

        if (prev->mask & FS_IN_IGNORED)
                return false;

        if (prev->mask != next->mask)
                return false;
        if (prev->wd != next->wd)
                return false;
        if (prev->name_len != next->name_len)
                return false;

        /* Lengths are equal here; nameless events need no string compare. */
        if (prev->name_len && strcmp(prev->name, next->name))
                return false;

        return true;
}

/*
 * Merge callback handed to fsnotify_add_event(): compare @event with the
 * entry reached through notification_list.prev and return the result of
 * event_compare() (non-zero when the two are identical).
 */
static int inotify_merge(struct fsnotify_group *group,
                         struct fsnotify_event *event)
{
        struct fsnotify_event *tail;

        tail = list_entry(group->notification_list.prev,
                          struct fsnotify_event, list);

        return event_compare(tail, event);
}

/*
 * Build an inotify event for the group owning @inode_mark and queue it.
 *
 * @inode_mark: mark embedded in a struct inotify_inode_mark; supplies the
 *              group and the watch descriptor (wd).
 * @mask:       event mask stored in the event; IN_ISDIR is cleared when
 *              IN_MOVE_SELF or IN_DELETE_SELF is set.
 * @inode:      not referenced by this function.
 * @dir:        not referenced by this function.
 * @name:       optional name; when non-NULL and of non-zero length it is
 *              copied into the event.
 * @cookie:     stored in the event's sync_cookie field.
 *
 * Returns -ENOMEM when the event cannot be allocated (after signalling a
 * queue overflow to the group) and 0 otherwise, including when the mark's
 * wd is -1 and when fsnotify_add_event() did not keep the event.
 */
int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
                               struct inode *inode, struct inode *dir,
                               const struct qstr *name, u32 cookie)
{
        struct inotify_inode_mark *i_mark;
        struct inotify_event_info *event;
        struct fsnotify_event *fsn_event;
        struct fsnotify_group *group = inode_mark->group;
        int ret;
        int len = 0, wd;
        int alloc_len = sizeof(struct inotify_event_info);
        struct mem_cgroup *old_memcg;

        /* Reserve room for the name and its terminating NUL. */
        if (name) {
                len = name->len;
                alloc_len += len + 1;
        }

        pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark,
                 mask);

        i_mark = container_of(inode_mark, struct inotify_inode_mark,
                              fsn_mark);

        /*
         * We can be racing with mark being detached. Don't report event with
         * invalid wd.
         */
        wd = READ_ONCE(i_mark->wd);
        if (wd == -1)
                return 0;
        /*
         * Whoever is interested in the event, pays for the allocation. Do not
         * trigger OOM killer in the target monitoring memcg as it may have
         * security repercussion.
         */
        old_memcg = set_active_memcg(group->memcg);
        event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        /* Restore the previously active memcg right after the allocation. */
        set_active_memcg(old_memcg);

        if (unlikely(!event)) {
                /*
                 * Treat lost event due to ENOMEM the same way as queue
                 * overflow to let userspace know event was lost.
                 */
                fsnotify_queue_overflow(group);
                return -ENOMEM;
        }

        /*
         * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
         * for fanotify. inotify never reported IN_ISDIR with those events.
         * It looks like an oversight, but to avoid the risk of breaking
         * existing inotify programs, mask the flag out from those events.
         */
        if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
                mask &= ~IN_ISDIR;

        fsn_event = &event->fse;
        fsnotify_init_event(fsn_event);
        event->mask = mask;
        event->wd = wd;
        event->sync_cookie = cookie;
        event->name_len = len;
        if (len)
                strcpy(event->name, name->name);

        ret = fsnotify_add_event(group, fsn_event, inotify_merge);
        if (ret) {
                /* Our event wasn't used in the end. Free it. */
                fsnotify_destroy_event(group, fsn_event);
        }

        /* A mark flagged IN_ONESHOT is destroyed once an event was handled. */
        if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
                fsnotify_destroy_mark(inode_mark, group);

        /* The outcome of fsnotify_add_event() is not reported to the caller. */
        return 0;
}

/*
 * ->freeing_mark callback (see inotify_fsnotify_ops): forwards the mark and
 * its group to inotify_ignored_and_remove_idr().
 */
static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
{
        inotify_ignored_and_remove_idr(fsn_mark, group);
}

/*
 * idr_for_each() callback used while a group's private data is torn down.
 *
 * It should never actually run: every inotify mark is expected to have left
 * the idr either when its watch was removed or in the
 * fsnotify_destroy_mark_by_group() call made while the inotify instance was
 * being destroyed.  Getting here means the idr is about to be freed with
 * marks still in it.  Only the first such entry is reported; later calls
 * return immediately.
 *
 * @id:   idr id of the leftover entry.
 * @p:    the leftover entry, treated as a struct fsnotify_mark pointer.
 * @data: the group, as passed to idr_for_each().
 *
 * Always returns 0.
 */
static int idr_callback(int id, void *p, void *data)
{
        static bool warned = false;
        struct fsnotify_mark *fsn_mark = p;

        if (warned)
                return 0;
        warned = true;

        WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
                "idr.  Probably leaking memory\n", id, p, data);

        /*
         * The mark is assumed to be a valid address and is dereferenced on
         * purpose: the extra detail may help explain how we got here, and a
         * crash would be no worse than the BUG() this code replaced.
         */
        if (fsn_mark) {
                struct inotify_inode_mark *i_mark =
                        container_of(fsn_mark, struct inotify_inode_mark,
                                     fsn_mark);

                printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n",
                        fsn_mark->group, i_mark->wd);
        }

        return 0;
}

/*
 * ->free_group_priv callback: warn about any mark still in the group's idr,
 * destroy the idr, and drop the instance count held in ucounts if one is set.
 */
static void inotify_free_group_priv(struct fsnotify_group *group)
{
        struct idr *idr = &group->inotify_data.idr;

        /* Ideally the idr is empty and idr_callback() never warns. */
        idr_for_each(idr, idr_callback, group);
        idr_destroy(idr);

        if (!group->inotify_data.ucounts)
                return;

        dec_inotify_instances(group->inotify_data.ucounts);
}

/*
 * ->free_event callback: kfree() the inotify_event_info that INOTIFY_E()
 * yields for @fsn_event.  @group is not used.
 */
static void inotify_free_event(struct fsnotify_group *group,
                               struct fsnotify_event *fsn_event)
{
        struct inotify_event_info *event = INOTIFY_E(fsn_event);

        kfree(event);
}

/*
 * ->free_mark callback: the mark is dead, so hand the inotify_inode_mark that
 * embeds it back to inotify_inode_mark_cachep.
 */
static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
{
        struct inotify_inode_mark *i_mark =
                container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);

        kmem_cache_free(inotify_inode_mark_cachep, i_mark);
}

/*
 * fsnotify callback table for inotify; it wires the handlers defined in this
 * file into a struct fsnotify_ops.  The object is non-static, so it is
 * visible outside this file.
 */
const struct fsnotify_ops inotify_fsnotify_ops = {
        .handle_inode_event = inotify_handle_inode_event,
        .free_group_priv = inotify_free_group_priv,
        .free_event = inotify_free_event,
        .freeing_mark = inotify_freeing_mark,
        .free_mark = inotify_free_mark,
};
























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Generic RTC interface.
 * This version contains the part of the user interface to the Real Time Clock
 * service. It is used with both the legacy mc146818 and also EFI.
 * Struct rtc_time and first 12 ioctl by Paul Gortmaker, 1996 - separated out
 * from <linux/mc146818rtc.h> to this file for 2.4 kernels.
 *
 * Copyright (C) 1999 Hewlett-Packard Co.
 * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 */
#ifndef _LINUX_RTC_H_
#define _LINUX_RTC_H_


#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/nvmem-provider.h>
#include <uapi/linux/rtc.h>

extern int rtc_month_days(unsigned int month, unsigned int year);
extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year);
extern int rtc_valid_tm(struct rtc_time *tm);
extern time64_t rtc_tm_to_time64(struct rtc_time *tm);
extern void rtc_time64_to_tm(time64_t time, struct rtc_time *tm);
ktime_t rtc_tm_to_ktime(struct rtc_time tm);
struct rtc_time rtc_ktime_to_tm(ktime_t kt);

/*
 * rtc_tm_sub - Return @lhs minus @rhs in seconds.
 *
 * Both broken-down times are converted with rtc_tm_to_time64() and the
 * second value is subtracted from the first.
 */
static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs)
{
        time64_t lhs_secs = rtc_tm_to_time64(lhs);
        time64_t rhs_secs = rtc_tm_to_time64(rhs);

        return lhs_secs - rhs_secs;
}

#include <linux/device.h>
#include <linux/seq_file.h>
#include <linux/cdev.h>
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/timerqueue.h>
#include <linux/workqueue.h>

extern struct class *rtc_class;

/*
 * struct rtc_class_ops - table of RTC methods.
 *
 * Every member is a function pointer that returns int and takes a
 * struct device * as its first argument.
 *
 * For these RTC methods the device parameter is the physical device
 * on whatever bus holds the hardware (I2C, Platform, SPI, etc), which
 * was passed to rtc_device_register().  Its driver_data normally holds
 * device state, including the rtc_device pointer for the RTC.
 *
 * Most of these methods are called with rtc_device.ops_lock held,
 * through the rtc_*(struct rtc_device *, ...) calls.
 *
 * The (current) exceptions are mostly filesystem hooks:
 *   - the proc() hook for procfs
 */
struct rtc_class_ops {
        int (*ioctl)(struct device *, unsigned int, unsigned long);
        int (*read_time)(struct device *, struct rtc_time *);
        int (*set_time)(struct device *, struct rtc_time *);
        int (*read_alarm)(struct device *, struct rtc_wkalrm *);
        int (*set_alarm)(struct device *, struct rtc_wkalrm *);
        int (*proc)(struct device *, struct seq_file *);
        int (*alarm_irq_enable)(struct device *, unsigned int enabled);
        int (*read_offset)(struct device *, long *offset);
        int (*set_offset)(struct device *, long offset);
};

struct rtc_device;

/*
 * struct rtc_timer - one entry for an rtc_device's timerqueue.
 *
 * NOTE(review): the per-field remarks below are read off the types and names
 * only; confirm the exact semantics against the users of rtc_timer_start().
 */
struct rtc_timer {
        struct timerqueue_node node;            /* linkage into a timerqueue */
        ktime_t period;                         /* presumably the repeat interval */
        void (*func)(struct rtc_device *rtc);   /* callback taking the rtc */
        struct rtc_device *rtc;                 /* rtc_device pointer kept with the timer */
        int enabled;                            /* presumably non-zero while armed */
};

/* flags */
#define RTC_DEV_BUSY 0

/*
 * struct rtc_device - state kept for one RTC.
 *
 * The embedded struct device is the member that to_rtc_device() uses to get
 * back to the containing rtc_device.  ops_lock is the mutex taken by
 * rtc_lock()/rtc_unlock().
 */
struct rtc_device {
        struct device dev;
        struct module *owner;

        int id;

        /* Method table and the mutex declared alongside it. */
        const struct rtc_class_ops *ops;
        struct mutex ops_lock;

        struct cdev char_dev;
        unsigned long flags;

        unsigned long irq_data;
        spinlock_t irq_lock;
        wait_queue_head_t irq_queue;
        struct fasync_struct *async_queue;

        int irq_freq;
        int max_user_freq;

        struct timerqueue_head timerqueue;
        struct rtc_timer aie_timer;
        struct rtc_timer uie_rtctimer;
        struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */
        int pie_enabled;
        struct work_struct irqwork;
        /* Some hardware can't support UIE mode */
        int uie_unsupported;

        /* Number of nsec it takes to set the RTC clock. This influences when
         * the set ops are called. An offset:
         *   - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s
         *   - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s
         *   - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s
         */
        long set_offset_nsec;

        bool registered;

        /* Old ABI support */
        bool nvram_old_abi;
        struct bin_attribute *nvram;

        /* NOTE(review): range_min is signed while range_max is unsigned. */
        time64_t range_min;
        timeu64_t range_max;
        time64_t start_secs;
        time64_t offset_secs;
        bool set_start_time;

#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
        /* Fields present only when CONFIG_RTC_INTF_DEV_UIE_EMUL is set. */
        struct work_struct uie_task;
        struct timer_list uie_timer;
        /* Those fields are protected by rtc->irq_lock */
        unsigned int oldsecs;
        unsigned int uie_irq_active:1;
        unsigned int stop_uie_polling:1;
        unsigned int uie_task_active:1;
        unsigned int uie_timer_active:1;
#endif
};
#define to_rtc_device(d) container_of(d, struct rtc_device, dev)

/*
 * Take / release the ops_lock mutex of the rtc_device that @d points to.
 * The argument is parenthesised so that any pointer expression (for example
 * "base + i") expands correctly, not just a plain identifier.
 */
#define rtc_lock(d) mutex_lock(&(d)->ops_lock)
#define rtc_unlock(d) mutex_unlock(&(d)->ops_lock)

/*
 * Useful timestamps, in seconds relative to 1970-01-01 00:00:00.
 *
 * All of them are signed (LL) constants, so the pre-1970 values really are
 * negative and compare correctly against each other and against time64_t
 * values.  Negative constants are parenthesised so they expand safely inside
 * larger expressions.
 */
#define RTC_TIMESTAMP_BEGIN_0000        (-62167219200LL) /* 0000-01-01 00:00:00 */
#define RTC_TIMESTAMP_BEGIN_1900        (-2208988800LL) /* 1900-01-01 00:00:00 */
#define RTC_TIMESTAMP_BEGIN_2000        946684800LL /* 2000-01-01 00:00:00 */
#define RTC_TIMESTAMP_END_2063                2966371199LL /* 2063-12-31 23:59:59 */
#define RTC_TIMESTAMP_END_2079                3471292799LL /* 2079-12-31 23:59:59 */
#define RTC_TIMESTAMP_END_2099                4102444799LL /* 2099-12-31 23:59:59 */
#define RTC_TIMESTAMP_END_2199                7258118399LL /* 2199-12-31 23:59:59 */
#define RTC_TIMESTAMP_END_9999                253402300799LL /* 9999-12-31 23:59:59 */

extern struct rtc_device *devm_rtc_device_register(struct device *dev,
                                        const char *name,
                                        const struct rtc_class_ops *ops,
                                        struct module *owner);
struct rtc_device *devm_rtc_allocate_device(struct device *dev);
int __rtc_register_device(struct module *owner, struct rtc_device *rtc);

extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec);
int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
extern int rtc_read_alarm(struct rtc_device *rtc,
                        struct rtc_wkalrm *alrm);
extern int rtc_set_alarm(struct rtc_device *rtc,
                                struct rtc_wkalrm *alrm);
extern int rtc_initialize_alarm(struct rtc_device *rtc,
                                struct rtc_wkalrm *alrm);
extern void rtc_update_irq(struct rtc_device *rtc,
                        unsigned long num, unsigned long events);

extern struct rtc_device *rtc_class_open(const char *name);
extern void rtc_class_close(struct rtc_device *rtc);

extern int rtc_irq_set_state(struct rtc_device *rtc, int enabled);
extern int rtc_irq_set_freq(struct rtc_device *rtc, int freq);
extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled);
extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled);
extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc,
                                                unsigned int enabled);

void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode);
void rtc_aie_update_irq(struct rtc_device *rtc);
void rtc_uie_update_irq(struct rtc_device *rtc);
enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer);

void rtc_timer_init(struct rtc_timer *timer, void (*f)(struct rtc_device *r),
                    struct rtc_device *rtc);
int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer,
                    ktime_t expires, ktime_t period);
void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer);
int rtc_read_offset(struct rtc_device *rtc, long *offset);
int rtc_set_offset(struct rtc_device *rtc, long offset);
void rtc_timer_do_work(struct work_struct *work);

/*
 * Return true when @year is a leap year: divisible by 400, or divisible by 4
 * but not by 100.
 */
static inline bool is_leap_year(unsigned int year)
{
        if (year % 400 == 0)
                return true;
        if (year % 100 == 0)
                return false;

        return year % 4 == 0;
}

/* Decide whether the driver may be called to set the time right now. Drivers
 * can only be handed a second-aligned time value, and @set_offset_nsec says
 * how far away from that second-aligned time the driver should be called.
 *
 * On every call *@to_set is overwritten with @now + @set_offset_nsec. When
 * that sum lies within the allowed fuzz of a whole second, *@to_set is
 * snapped to that second (tv_nsec == 0) and true is returned, so that:
 *    to_set - set_offset_nsec == now +/- FUZZ
 *
 * Otherwise false is returned and *@to_set keeps the unrounded sum.
 */
static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec,
                                  struct timespec64 *to_set,
                                  const struct timespec64 *now)
{
        /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */
        const unsigned long fuzz_nsec = TICK_NSEC * 5;
        struct timespec64 offset = {
                .tv_sec = 0,
                .tv_nsec = set_offset_nsec,
        };

        *to_set = timespec64_add(*now, offset);

        /* Too far from either neighbouring second boundary: not now. */
        if (to_set->tv_nsec >= fuzz_nsec &&
            to_set->tv_nsec <= NSEC_PER_SEC - fuzz_nsec)
                return false;

        /* Not just past a boundary, so just before the next one: round up. */
        if (to_set->tv_nsec >= fuzz_nsec)
                to_set->tv_sec++;
        to_set->tv_nsec = 0;

        return true;
}

/*
 * Convenience wrapper: calls __rtc_register_device() with THIS_MODULE as the
 * owner argument and @device as the rtc.
 */
#define rtc_register_device(device) \
        __rtc_register_device(THIS_MODULE, device)

/*
 * rtc_hctosys_ret is an int variable defined elsewhere when
 * CONFIG_RTC_HCTOSYS_DEVICE is set; otherwise it expands to the constant
 * -ENODEV.
 */
#ifdef CONFIG_RTC_HCTOSYS_DEVICE
extern int rtc_hctosys_ret;
#else
#define rtc_hctosys_ret -ENODEV
#endif

/*
 * With CONFIG_RTC_NVMEM set these are real functions defined elsewhere.
 * Without it they are inline stubs: rtc_nvmem_register() returns 0 without
 * touching its arguments and rtc_nvmem_unregister() does nothing.
 */
#ifdef CONFIG_RTC_NVMEM
int rtc_nvmem_register(struct rtc_device *rtc,
                       struct nvmem_config *nvmem_config);
void rtc_nvmem_unregister(struct rtc_device *rtc);
#else
static inline int rtc_nvmem_register(struct rtc_device *rtc,
                                     struct nvmem_config *nvmem_config)
{
        return 0;
}
static inline void rtc_nvmem_unregister(struct rtc_device *rtc) {}
#endif

/*
 * With CONFIG_RTC_INTF_SYSFS set these are real functions defined elsewhere.
 * Without it both are inline stubs that ignore their arguments and return 0.
 */
#ifdef CONFIG_RTC_INTF_SYSFS
int rtc_add_group(struct rtc_device *rtc, const struct attribute_group *grp);
int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps);
#else
static inline
int rtc_add_group(struct rtc_device *rtc, const struct attribute_group *grp)
{
        return 0;
}

static inline
int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps)
{
        return 0;
}
#endif
#endif /* _LINUX_RTC_H_ */










































































































































































    1 










































































































































































































































































    4 






























































































































































































































































































































































































































































    1 




























    1 











    1 





































































    1 



























































































































    1 

































































    1 
    1 




























































































































































































































    1 

































    1 

















    1 












    1 








    1 
    1 










    1 
































    1 






















    1 

























    1 










































































    1 


































    1 




















































































































































































































































    1 




























    1 








    1 

    1 






    1 







    1 








    1 








    1 








    1 
    1 



    1 





















    1 














    1 









    1 



    1 





    1 

    1 


    1 

















    1 



    1 




















































    1 





    1 
    1 























    1 


    1 



    1 



    1 

















    1 





    1 





    1 
    1 












    1 












    1 













    1 







    1 








    1 








































    1 















    1 



















    1 
    1 




































    1 


    1 



    1 




    1 





    1 










    1 




    1 
    1 

    1 

    1 


    1 

    1 

    1 


    1 


    1 

    1 

    1 

    1 

    1 



    1 



    1 




    1 


    1 













































    1 


































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count. */

static int max_threads;                /* tunable limit on nr_threads; set by set_max_threads() */

/* Designated initializer mapping index x to the spelling of x as a string. */
#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

/*
 * Printable names of the per-mm counters, indexed by counter type.
 * check_mm() uses this table when reporting a non-zero counter and
 * asserts at build time that it has NR_MM_COUNTERS entries.
 */
static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/*
 * Report what lockdep_is_held() says about tasklist_lock.
 * Only built when CONFIG_PROVE_RCU is enabled.
 */
int lockdep_tasklist_lock_is_held(void)
{
        int held = lockdep_is_held(&tasklist_lock);

        return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/*
 * Weak default that does nothing; being __weak, it can be replaced by
 * another definition at link time.  Called from free_task() just before
 * the task_struct itself is freed.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

/*
 * Allocate a task_struct from task_struct_cachep on the given NUMA node.
 * Returns whatever kmem_cache_alloc_node() returns (NULL on failure).
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        struct task_struct *tsk;

        tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
        return tsk;
}

/* Return a task_struct to task_struct_cachep. */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *vm_stack = cached_vm_stacks[i];

                if (!vm_stack)
                        continue;

                vfree(vm_stack->addr);
                cached_vm_stacks[i] = NULL;
        }

        return 0;
}
#endif

/*
 * Allocate a kernel stack for @tsk, preferring memory on NUMA node @node.
 *
 * With CONFIG_VMAP_STACK the per-CPU cached_stacks[] slots are tried
 * first; a cached stack is zeroed before reuse.  Otherwise a fresh area
 * is obtained from __vmalloc_node_range().  Without CONFIG_VMAP_STACK
 * the stack comes straight from the page allocator.
 *
 * On success tsk->stack (and, for vmap stacks, tsk->stack_vm_area) is
 * set and the stack address is returned; on failure NULL is returned
 * and @tsk is left unmodified.
 */
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
        void *stack;
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *s;

                /* Atomically claim the slot so no one else can take it. */
                s = this_cpu_xchg(cached_stacks[i], NULL);

                if (!s)
                        continue;

                /* Clear the KASAN shadow of the stack. */
                kasan_unpoison_shadow(s->addr, THREAD_SIZE);

                /* Clear stale pointers from reused stack. */
                memset(s->addr, 0, THREAD_SIZE);

                tsk->stack_vm_area = s;
                tsk->stack = s->addr;
                return s->addr;
        }

        /*
         * Allocated stacks are cached and later reused by new threads,
         * so memcg accounting is performed manually on assigning/releasing
         * stacks to tasks. Drop __GFP_ACCOUNT.
         */
        stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                     VMALLOC_START, VMALLOC_END,
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     PAGE_KERNEL,
                                     0, node, __builtin_return_address(0));

        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        if (stack) {
                tsk->stack_vm_area = find_vm_area(stack);
                tsk->stack = stack;
        }
        return stack;
#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);

        if (likely(page)) {
                tsk->stack = kasan_reset_tag(page_address(page));
                return tsk->stack;
        }
        return NULL;
#endif
}

/*
 * Release the kernel stack of @tsk.
 *
 * For a vmap stack (task_stack_vm_area() non-NULL) every page is first
 * uncharged from memcg, then the stack is parked in the first empty
 * per-CPU cached_stacks[] slot; if all slots are occupied it is freed
 * with vfree_atomic().  A stack without a vm area is handed back to the
 * page allocator.
 *
 * tsk->stack and tsk->stack_vm_area are not cleared here.
 */
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
        struct vm_struct *vm = task_stack_vm_area(tsk);

        if (vm) {
                int i;

                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        memcg_kmem_uncharge_page(vm->pages[i], 0);

                for (i = 0; i < NR_CACHED_STACKS; i++) {
                        /* Slot already occupied: try the next one. */
                        if (this_cpu_cmpxchg(cached_stacks[i],
                                        NULL, tsk->stack_vm_area) != NULL)
                                continue;

                        /* Stack is now cached for reuse; nothing to free. */
                        return;
                }

                vfree_atomic(tsk->stack);
                return;
        }
#endif

        __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

/*
 * Slab-backed variant: take a stack from thread_stack_cache on @node,
 * strip the KASAN tag from the pointer, record it in tsk->stack and
 * return it.  tsk->stack is written even when the allocation failed.
 */
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
{
        void *raw = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP,
                                          node);
        unsigned long *untagged = kasan_reset_tag(raw);

        tsk->stack = untagged;
        return untagged;
}

/* Slab-backed variant: return tsk->stack to thread_stack_cache. */
static void free_thread_stack(struct task_struct *tsk)
{
        kmem_cache_free(thread_stack_cache, tsk->stack);
}

/*
 * Create the "thread_stack" slab cache used when stacks are smaller
 * than a page.  Objects are THREAD_SIZE bytes, THREAD_SIZE aligned.
 * Failure to create the cache is fatal (BUG).
 */
void thread_stack_cache_init(void)
{
        thread_stack_cache =
                kmem_cache_create_usercopy("thread_stack",
                                           THREAD_SIZE, THREAD_SIZE,
                                           0,
                                           0, THREAD_SIZE,
                                           NULL);

        BUG_ON(!thread_stack_cache);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (vma)
                vma_init(vma, mm);
        return vma;
}

/*
 * Allocate a new vm_area_struct and make it a structure copy of @orig.
 * The copy gets an empty anon_vma_chain and is unlinked from any vma
 * list (vm_next/vm_prev are NULL).  Returns NULL if the allocation fails.
 */
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *copy;

        copy = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!copy)
                return NULL;

        ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
        ASSERT_EXCLUSIVE_WRITER(orig->vm_file);

        /*
         * orig->shared.rb may change underneath us; that is tolerated
         * because the clone's copy of it will be reinitialized.
         */
        *copy = data_race(*orig);

        INIT_LIST_HEAD(&copy->anon_vma_chain);
        copy->vm_prev = NULL;
        copy->vm_next = NULL;

        return copy;
}

/* Return @vma to vm_area_cachep. */
void vm_area_free(struct vm_area_struct *vma)
{
        kmem_cache_free(vm_area_cachep, vma);
}

/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack by
 * @account stacks' worth of KiB (callers here pass -1 on release).
 * vmap stacks are accounted through their first page, other stacks
 * through the stack address itself.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *vm_area = task_stack_vm_area(tsk);
        void *stack_base = task_stack_page(tsk);
        int delta_kb = account * (THREAD_SIZE / 1024);

        if (!vm_area) {
                mod_lruvec_slab_state(stack_base, NR_KERNEL_STACK_KB,
                                      delta_kb);
                return;
        }

        /* All stack pages are in the same node, so page 0 is enough. */
        mod_lruvec_page_state(vm_area->pages[0], NR_KERNEL_STACK_KB,
                              delta_kb);
}

/*
 * Charge every page of @tsk's vmap stack to memcg.
 *
 * Returns 0 on success, or when there is nothing to charge (no
 * CONFIG_VMAP_STACK, or the task has no stack vm area); otherwise the
 * first error returned by memcg_kmem_charge_page().  Pages charged
 * before the failure are not uncharged here.
 */
static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
        struct vm_struct *area = task_stack_vm_area(tsk);
        int page_idx;

        BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

        if (!area)
                return 0;

        BUG_ON(area->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (page_idx = 0; page_idx < THREAD_SIZE / PAGE_SIZE; page_idx++) {
                int err;

                /*
                 * When memcg_kmem_charge_page() fails the page's
                 * page->mem_cgroup pointer stays NULL, so the later
                 * memcg_kmem_uncharge_page() in free_thread_stack()
                 * ignores that page.
                 */
                err = memcg_kmem_charge_page(area->pages[page_idx],
                                             GFP_KERNEL, 0);
                if (err)
                        return err;
        }
#endif
        return 0;
}

/*
 * Un-account and free the kernel stack of a dead task, then clear the
 * task's stack pointers.  If the task is not in TASK_DEAD state a
 * warning is emitted and the stack is deliberately leaked.
 */
static void release_task_stack(struct task_struct *tsk)
{
        if (WARN_ON(tsk->state != TASK_DEAD))
                return;  /* Better to leak the stack than to free prematurely */

        /* Accounting reads the stack pointers, so it must precede the free. */
        account_kernel_stack(tsk, -1);
        free_thread_stack(tsk);
        tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the holder of the last reference
 * releases the stack.
 */
void put_task_stack(struct task_struct *tsk)
{
        if (!refcount_dec_and_test(&tsk->stack_refcount))
                return;

        release_task_stack(tsk);
}
#endif

/*
 * Final teardown of a task_struct: release the shadow call stack, the
 * kernel stack (unless it is separately refcounted, in which case it
 * must already be gone), per-task debug/ftrace/arch state and, for
 * kernel threads, the kthread struct; then free the task_struct itself.
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        /* Warn once if a seccomp filter is still attached at this point. */
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

#ifdef CONFIG_MMU
/*
 * Populate the new @mm with copies of the vmas of @oldmm.
 *
 * Both mmap locks are taken for writing (oldmm killable, mm nested).
 * Each vma of @oldmm that is not VM_DONTCOPY is duplicated, linked into
 * @mm's vma list and rb-tree, inserted into its file's i_mmap tree when
 * file-backed, and - unless VM_WIPEONFORK - has its page table entries
 * copied with copy_page_range().
 *
 * Returns 0 (or arch_dup_mmap()'s result) on success, -EINTR if the
 * lock wait was interrupted or a fatal signal is pending, -ENOMEM for
 * any failure routed through the fail_nomem* labels, or the error
 * returned by ksm_fork(), khugepaged_fork() or copy_page_range().
 * On failure the vmas linked so far are left in @mm.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        LIST_HEAD(uf);

        uprobe_start_dup_mmap();
        if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

        /*
         * Start from the parent's totals; vmas that are not copied are
         * subtracted again inside the loop.
         */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        retval = khugepaged_fork(mm, oldmm);
        if (retval)
                goto out;

        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;

                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Not inherited: back its pages out of the totals. */
                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        /* Remembered so the error path can undo it. */
                        charge = len;
                }
                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                /* The copy never inherits the mlock flags. */
                tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                put_write_access(inode);
                        i_mmap_lock_write(mapping);
                        if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                /*
                 * Clear hugetlb-related page reserves for children. This only
                 * affects MAP_PRIVATE mappings. Faults generated by the child
                 * are not guaranteed to succeed, even if read-only
                 */
                if (is_vm_hugetlb_page(tmp))
                        reset_vma_resv_huge_pages(tmp);

                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
                pprev = &tmp->vm_next;
                tmp->vm_prev = prev;
                prev = tmp;

                /*
                 * Each new vma becomes the right child of the previous
                 * one in the rb-tree.
                 */
                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;

                mm->map_count++;
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                /*
                 * ->open() runs even if copy_page_range() failed; retval
                 * is only examined afterwards.
                 */
                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto out;
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;
        /*
         * Error unwinding for the vma currently being built.  Every path
         * through these labels reports -ENOMEM, overriding any retval set
         * by the failing call, then rejoins the common exit at "out".
         */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        retval = -ENOMEM;
        /* charge is 0 unless the failing vma was VM_ACCOUNT. */
        vm_unacct_memory(charge);
        goto out;
}

/*
 * Allocate the page global directory for @mm and store it in mm->pgd.
 * Returns 0 on success, -ENOMEM if pgd_alloc() returned NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);

        return unlikely(!mm->pgd) ? -ENOMEM : 0;
}

/* Free the page global directory stored in mm->pgd. */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU variant: no vmas are copied.  Only the executable file
 * reference is carried over from @oldmm, under oldmm's mmap write lock.
 * Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct file *exe;

        mmap_write_lock(oldmm);
        exe = get_mm_exe_file(oldmm);
        RCU_INIT_POINTER(mm->exe_file, exe);
        mmap_write_unlock(oldmm);

        return 0;
}
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

/*
 * Sanity-check an mm that is about to be freed: every rss counter and
 * the page-table byte count are expected to be zero.  Anything left
 * over is reported with pr_alert(); nothing is corrected.
 */
static void check_mm(struct mm_struct *mm)
{
        int type;

        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (type = 0; type < NR_MM_COUNTERS; type++) {
                long leftover = atomic_long_read(&mm->rss_stat.count[type]);

                if (!unlikely(leftover))
                        continue;

                pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                         mm, resident_page_types[type], leftover);
        }

        if (mm_pgtables_bytes(mm) != 0)
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                         mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * Dropping init_mm is a BUG; dropping an mm that is still the current
 * task's mm or active_mm triggers a one-time warning.
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);
        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        /* Report any counters that are still non-zero before freeing. */
        check_mm(mm);
        put_user_ns(mm->user_ns);
        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/*
 * Work handler queued by mmdrop_async(): recover the mm_struct that
 * embeds @work and run __mmdrop() on it.
 */
static void mmdrop_async_fn(struct work_struct *work)
{
        __mmdrop(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Drop one mm_count reference on @mm.  If it was the last one, the
 * actual __mmdrop() is deferred to a work item instead of being run
 * in the caller's context.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (!unlikely(atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Tear down and free a signal_struct: release its taskstats and
 * autogroup state, drop the reference on sig->oom_mm if one is held,
 * and return the structure to signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);
        /*
         * __mmdrop is not safe to call from softirq context on x86 due to
         * pgd_dtor so postpone it to the async context
         */
        if (sig->oom_mm)
                mmdrop_async(sig->oom_mm);
        kmem_cache_free(signal_cachep, sig);
}

/*
 * Drop one sigcnt reference on @sig; the last reference frees it.
 */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (!refcount_dec_and_test(&sig->sigcnt))
                return;

        free_signal_struct(sig);
}

/*
 * Release everything still attached to @tsk and free it.
 *
 * Warns if the task has no exit_state, if its usage refcount is not
 * zero, or if it is the currently running task.  The task_struct
 * itself is freed here unless profile_handoff_task() returns non-zero.
 */
void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);

        if (!profile_handoff_task(tsk))
                free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/*
 * RCU-callback form of __put_task_struct(): @rhp is the rcu head
 * embedded in the task_struct to be released.
 */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

/*
 * Weak default that does nothing; being __weak, it can be replaced by
 * another definition at link time.  Called once from fork_init().
 */
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
        u64 threads;
        unsigned long nr_pages = totalram_pages();

        /*
         * The number of threads shall be limited such that the thread
         * structures may only consume a small part of the available memory.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
                threads = MAX_THREADS;
        else
                threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                    (u64) THREAD_SIZE * 8UL);

        if (threads > max_threads_suggested)
                threads = max_threads_suggested;

        max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/*
 * Initialized by the architecture.  fork_init() passes this value as the
 * object size when it creates the "task_struct" slab cache.
 */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
/*
 * Compute the usercopy whitelist window for the task_struct slab.
 *
 * @offset and @size are filled in by arch_thread_struct_whitelist()
 * relative to thread_struct; on return @offset is relative to task_struct.
 */
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* Ask the architecture for its thread_struct whitelist. */
        arch_thread_struct_whitelist(offset, size);

        /* Zero-sized whitelist or empty thread_struct: normalise the offset. */
        if (unlikely(*size == 0)) {
                *offset = 0;
                return;
        }

        /* Shift the offset to where thread_struct sits inside task_struct. */
        *offset += offsetof(struct task_struct, thread);
}
#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */

/*
 * Boot-time initialisation of the fork machinery: creates the task_struct
 * slab cache (unless the architecture provides its own allocator), computes
 * max_threads and seeds the init task's resource limits and the initial
 * user namespace's ucount limits from it.
 */
void __init fork_init(void)
{
        int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);
#endif

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* Both NPROC limits start at half of max_threads; SIGPENDING copies them. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        /* Every ucount limit of the initial user namespace gets the same value. */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                init_user_ns.ucount_max[i] = max_threads/2;
        }

#ifdef CONFIG_VMAP_STACK
        /* Register free_vm_stack_cache as the teardown callback of this hotplug state. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Weak default: duplicate @src into @dst with a plain structure copy.
 * Always returns 0.
 */
int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        *dst = *src;

        return 0;
}

/*
 * Write STACK_END_MAGIC into the word returned by end_of_stack(@tsk) so
 * that a stack overflow can be detected later.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *magic_slot = end_of_stack(tsk);

        *magic_slot = STACK_END_MAGIC;        /* for overflow detection */
}

/*
 * dup_task_struct - allocate a new task_struct as a copy of @orig
 * @orig: task to duplicate
 * @node: NUMA node to allocate on; NUMA_NO_NODE lets tsk_fork_get_node()
 *        choose
 *
 * Allocates the task_struct and its kernel stack, copies @orig into the new
 * task via arch_dup_task_struct() and then re-initialises the fields that
 * must not be inherited.
 *
 * Returns the new task, or NULL if an allocation or setup step failed (in
 * which case everything allocated here has been released again).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        unsigned long *stack;
        struct vm_struct *stack_vm_area __maybe_unused;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        stack = alloc_thread_stack_node(tsk, node);
        if (!stack)
                goto free_tsk;

        if (memcg_charge_kernel_stack(tsk))
                goto free_stack;

        /* Remember the vm area now; the copy below overwrites tsk's fields. */
        stack_vm_area = task_stack_vm_area(tsk);

        err = arch_dup_task_struct(tsk, orig);

        /*
         * arch_dup_task_struct() clobbers the stack-related fields.  Make
         * sure they're properly initialized before using any stack-related
         * functions again.
         */
        tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif

        /* Checked only after the stack fields are restored, so free_stack works. */
        if (err)
                goto free_stack;

        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /* The copy left cpus_ptr aiming at orig's mask; point it at our own. */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* Per-task pointers copied from orig that the new task must not share. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->pf_io_worker = NULL;

        account_kernel_stack(tsk, 1);

        kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_queue = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif
        return tsk;

free_stack:
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

/* Held by __mmput() while it unlinks an mm from its ->mmlist list. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/* mm->flags value given by mm_init() to an mm created while current has no mm. */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter: parse the number in
 * @s, move it into the dump-filter bit range and keep only the filter bits.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long filter = simple_strtoul(s, NULL, 0);

        default_dump_filter = (filter << MMF_DUMP_FILTER_SHIFT) &
                              MMF_DUMP_FILTER_MASK;
        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/* Start @mm with no AIO context table and a fresh ioctx_lock. */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        mm->ioctx_table = NULL;
        spin_lock_init(&mm->ioctx_lock);
#endif
}

/* Reset @mm's owner to NULL, but only if it is currently @p. */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner != p)
                return;

        WRITE_ONCE(mm->owner, NULL);
#endif
}

/* Record @p as the owner of @mm (no-op without CONFIG_MEMCG). */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        mm->owner = p;
#endif
}

/* Reset @mm's pasid to INIT_PASID (no-op without CONFIG_IOMMU_SUPPORT). */
static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
        mm->pasid = INIT_PASID;
#endif
}

/* Start @mm without a uprobes xol_area (no-op without CONFIG_UPROBES). */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
        mm->uprobes_state.xol_area = NULL;
#endif
}

/*
 * mm_init - initialise a freshly allocated mm_struct
 * @mm:      the mm to set up (zeroed by mm_alloc(), or a raw copy in dup_mm())
 * @p:       task recorded as the mm's owner and passed to init_new_context()
 * @user_ns: user namespace the mm takes a reference on
 *
 * Returns @mm on success.  On failure @mm has already been released with
 * free_mm() and NULL is returned, so the caller must not touch it again.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        mm->mmap = NULL;
        mm->mm_rb = RB_ROOT;
        mm->vmacache_seqnum = 0;
        /* The new mm starts with a single user and a single structure reference. */
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm->core_state = NULL;
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic_set(&mm->has_pinned, 0);
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_init_pasid(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        /*
         * Inherit the maskable subset of flags from the current task's mm
         * when it has one; otherwise start from the boot-time dump filter.
         */
        if (current->mm) {
                mm->flags = current->mm->flags & MMF_INIT_MASK;
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
                mm->def_flags = 0;
        }

        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        /* Taken last, so neither failure path has a namespace reference to drop. */
        mm->user_ns = get_user_ns(user_ns);
        return mm;

fail_nocontext:
        mm_free_pgd(mm);
fail_nopgd:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate a zeroed mm_struct and initialise it for the current task and
 * the current user namespace.  Returns NULL if the allocation or mm_init()
 * fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *new_mm = allocate_mm();

        if (new_mm) {
                memset(new_mm, 0, sizeof(*new_mm));
                new_mm = mm_init(new_mm, current, current_user_ns());
        }

        return new_mm;
}

/*
 * Tear down an address space once its last user is gone.  Called from
 * mmput() and mmput_async_fn() after mm_users has dropped to zero; ends by
 * dropping a structure reference with mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_page(mm);
        /* Drops the reference on the executable file, if any. */
        set_mm_exe_file(mm, NULL);
        /* Unlink from the mmlist only if the mm was ever put on one. */
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Release the binfmt module reference (dup_mm() takes one per mm). */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        mmdrop(mm);
}

/*
 * Drop one mm_users reference on @mm.  The caller that drops the last one
 * releases all resources of the mm via __mmput().  May sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
/* Work handler for mmput_async(): @work is the mm's embedded async_put_work. */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last mm_users reference is dropped the
 * teardown is handed to a work item instead of running in the caller.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm:           the mm whose executable file is replaced
 * @new_exe_file: the new executable file, or %NULL to clear it
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 * A reference is taken on @new_exe_file (if non-NULL) and the reference on
 * the previous file (if any) is dropped.
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody else is left alive, in execve the task is
 * single threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set
 * mm->exe_file, but does so without using set_mm_exe_file() in order
 * to avoid the need for any locks.
 */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *old_exe_file;

        /*
         * It is safe to dereference the exe_file without RCU as
         * this function is only called if nobody else can access
         * this mm -- see comment above for justification.
         */
        old_exe_file = rcu_dereference_raw(mm->exe_file);

        /* Pin the new file before publishing it; release the old one afterwards. */
        if (new_exe_file)
                get_file(new_exe_file);
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        if (old_exe_file)
                fput(old_exe_file);
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 * @mm: the mm to query
 *
 * Returns %NULL if mm has no associated executable file, or if a reference
 * to it could not be taken.
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *exe_file;

        rcu_read_lock();
        exe_file = rcu_dereference(mm->exe_file);
        /* get_file_rcu() failing is reported the same way as "no file". */
        if (exe_file && !get_file_rcu(exe_file))
                exe_file = NULL;
        rcu_read_unlock();
        return exe_file;
}
EXPORT_SYMBOL(get_mm_exe_file);

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: the task to query
 *
 * The lookup is done under task_lock().  Returns %NULL when the task has no
 * mm, when the task is a kernel thread (PF_KTHREAD, i.e. any mm it has is
 * only borrowed), or when the mm has no associated executable file.
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct mm_struct *mm;
        struct file *exe_file = NULL;

        task_lock(task);
        mm = task->mm;
        if (mm && !(task->flags & PF_KTHREAD))
                exe_file = get_mm_exe_file(mm);
        task_unlock(task);

        return exe_file;
}
EXPORT_SYMBOL(get_task_exe_file);

/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: the task to query
 *
 * Returns %NULL if the task has no mm, or if PF_KTHREAD is set (a kernel
 * thread that has only transiently adopted a user mm).  Otherwise the use
 * count of the mm is bumped and the mm is returned; the caller must release
 * it with mmput() after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        /* A kernel thread's mm is never handed out. */
        if (mm && (task->flags & PF_KTHREAD))
                mm = NULL;
        if (mm)
                mmget(mm);
        task_unlock(task);

        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * mm_access - get a reference to @task's mm, subject to a ptrace check
 * @task: task whose mm is wanted
 * @mode: access mode handed to ptrace_may_access()
 *
 * Runs under the read side of @task's exec_update_lock.  Returns the mm
 * obtained from get_task_mm() (NULL if there is none), ERR_PTR(-EACCES)
 * when the mm is not the caller's own and ptrace_may_access() denies
 * access, or the ERR_PTR-encoded error of an interrupted lock acquisition.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int err;

        err = down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return ERR_PTR(err);

        mm = get_task_mm(task);
        /* Access to the caller's own mm needs no permission check. */
        if (mm && mm != current->mm) {
                if (!ptrace_may_access(task, mode)) {
                        mmput(mm);
                        mm = ERR_PTR(-EACCES);
                }
        }

        up_read(&task->signal->exec_update_lock);
        return mm;
}

/*
 * Under task_lock(), detach @tsk's vfork_done completion (if still set)
 * and complete it.
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *done;

        task_lock(tsk);
        done = tsk->vfork_done;
        if (likely(done)) {
                tsk->vfork_done = NULL;
                complete(done);
        }
        task_unlock(tsk);
}

/*
 * Wait (killably) for @vfork to be completed on behalf of @child.
 *
 * Returns the result of wait_for_completion_killable(); when that is
 * non-zero, @child's vfork_done pointer is cleared under task_lock().
 * In every case one reference on @child is dropped before returning.
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        int ret;

        freezer_do_not_count();
        cgroup_enter_frozen();
        ret = wait_for_completion_killable(vfork);
        cgroup_leave_frozen(false);
        freezer_count();

        if (ret) {
                /* The wait was cut short: the child must not complete @vfork later. */
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }

        put_task_struct(child);
        return ret;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * whether on error, on success, or whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one, because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         */
        if (tsk->clear_child_tid) {
                /* Also skipped when this task is the only remaining user of the mm. */
                if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                    atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/* Exit-path variant: run futex_exit_release() for @tsk, then mm_release(). */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/* Exec-path variant: run futex_exec_release() for @tsk, then mm_release(). */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *mm;
        int err;

        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Start from a raw copy; mm_init() then resets the per-mm state. */
        memcpy(mm, oldmm, sizeof(*mm));

        /* mm_init() frees the mm itself on failure, so nothing to undo here. */
        if (!mm_init(mm, tsk, mm->user_ns))
                goto fail_nomem;

        err = dup_mmap(mm, oldmm);
        if (err)
                goto free_pt;

        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;

        /* The new mm needs its own reference on the binfmt module. */
        if (mm->binfmt && !try_module_get(mm->binfmt->module))
                goto free_pt;

        return mm;

free_pt:
        /* don't put binfmt in mmput, we haven't got module yet */
        mm->binfmt = NULL;
        mm_init_owner(mm, NULL);
        mmput(mm);

fail_nomem:
        return NULL;
}

/*
 * copy_mm - give the new task @tsk its address space
 *
 * With CLONE_VM the current task's mm is shared (one more user reference);
 * otherwise it is duplicated with dup_mm().  When the current task has no
 * mm, @tsk is left without one.  Returns 0 on success or -ENOMEM if the
 * duplication fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
        struct mm_struct *mm, *oldmm;

        /* Fault and context-switch counters start from zero in the child. */
        tsk->maj_flt = 0;
        tsk->min_flt = 0;
        tsk->nivcsw = 0;
        tsk->nvcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        oldmm = current->mm;
        if (!oldmm)
                return 0;

        /* initialize the new vmacache entries */
        vmacache_flush(tsk);

        if (clone_flags & CLONE_VM) {
                mmget(oldmm);
                mm = oldmm;
        } else {
                mm = dup_mm(tsk, current->mm);
                if (!mm)
                        return -ENOMEM;
        }

        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
}

/*
 * copy_fs - set up the fs_struct of the new task @tsk
 *
 * Without CLONE_FS the current task's fs_struct is copied (-ENOMEM on
 * failure).  With CLONE_FS it is shared by bumping its user count, unless
 * an exec is in progress on it, in which case -EAGAIN is returned.
 */
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                return tsk->fs ? 0 : -ENOMEM;
        }

        /* tsk->fs is already what we want */
        spin_lock(&fs->lock);
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        spin_unlock(&fs->lock);

        return ret;
}

/*
 * copy_files - set up the file descriptor table of the new task @tsk
 *
 * With CLONE_FILES the current table is shared (its count is incremented);
 * otherwise it is duplicated with dup_fd().  Returns 0 on success or the
 * error reported by dup_fd().
 */
static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
        struct files_struct *oldf = current->files;
        struct files_struct *newf;
        int error = 0;

        /*
         * A background process may not have any files ...
         */
        if (!oldf)
                return 0;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                return 0;
        }

        newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                return error;

        tsk->files = newf;
        return 0;
}

/*
 * copy_io - set up the I/O context of the new task @tsk
 *
 * With CLONE_IO the current task's io_context is shared.  Otherwise, when
 * the parent's ioprio is valid, @tsk gets its own io_context carrying the
 * same ioprio.  Returns 0 on success or -ENOMEM.
 */
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
        struct io_context *ioc = current->io_context;
        struct io_context *new_ioc;

        if (!ioc)
                return 0;

        /*
         * Share io context with parent, if CLONE_IO is set
         */
        if (clone_flags & CLONE_IO) {
                ioc_task_link(ioc);
                tsk->io_context = ioc;
                return 0;
        }

        /* Nothing to inherit unless the parent has a valid ioprio. */
        if (!ioprio_valid(ioc->ioprio))
                return 0;

        new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
        if (unlikely(!new_ioc))
                return -ENOMEM;

        new_ioc->ioprio = ioc->ioprio;
        put_io_context(new_ioc);
#endif
        return 0;
}

/*
 * copy_sighand - set up the signal handler table of the new task @tsk
 *
 * With CLONE_SIGHAND the current table is shared (one more reference).
 * Otherwise a new table is allocated and the current actions are copied
 * into it under siglock.  Returns 0 on success or -ENOMEM.
 */
static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *new_sighand;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }

        new_sighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /* Stored before the NULL check, so tsk->sighand is NULL on failure. */
        RCU_INIT_POINTER(tsk->sighand, new_sighand);
        if (!new_sighand)
                return -ENOMEM;

        refcount_set(&new_sighand->count, 1);

        spin_lock_irq(&current->sighand->siglock);
        memcpy(new_sighand->action, current->sighand->action,
               sizeof(new_sighand->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * Drop one reference on @sighand; the last reference runs the signalfd
 * cleanup and returns the structure to its slab cache.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);
        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
         * without an RCU grace period, see __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Initialize POSIX CPU timer handling for a thread group, seeding it with
 * the group's current RLIMIT_CPU soft limit.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, cpu_limit);
}

/*
 * copy_signal - set up the signal_struct of the new task @tsk
 *
 * A CLONE_THREAD child needs nothing here and 0 is returned right away.
 * Otherwise a zeroed signal_struct is allocated and initialised for a new
 * single-threaded group, inheriting the resource limits and OOM score
 * adjustments of the current task.  Returns 0 on success or -ENOMEM.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Stored before the NULL check, so tsk->signal is NULL on failure. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        /* A new group: one thread, one live task, one reference. */
        sig->nr_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
#endif

        /* Copy the parent's resource limits under the group leader's task_lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Must follow the rlim copy: it reads RLIMIT_CPU from sig->rlim. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * copy_seccomp - make the new task @p inherit the current task's seccomp
 * state (no-op without CONFIG_SECCOMP).
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->siglock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after the thread flags
         * were copied but before we took the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_tsk_thread_flag(p, TIF_SECCOMP);
#endif
}

/*
 * set_tid_address() system call: record @tidptr as the calling task's
 * clear_child_tid pointer (mm_release() later writes 0 there and issues a
 * futex wake) and return the value of task_pid_vnr() for the caller.
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * Initialise @p's pi_lock and, with CONFIG_RT_MUTEXES, reset its
 * priority-inheritance state to "no waiters, not blocked".
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        p->pi_blocked_on = NULL;
        p->pi_top_task = NULL;
        p->pi_waiters = RB_ROOT_CACHED;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&task->pid_links[type]);
        }
}

/*
 * Record @pid for @task: PIDTYPE_PID is stored in the task itself, every
 * other type in the pids[] array of the task's signal_struct.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * Reset the per-task RCU bookkeeping of the new task @p, for each RCU
 * flavour that is configured in.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/*
 * Return the struct pid stashed in @file if @file is a pidfd (i.e. uses
 * pidfd_fops), otherwise ERR_PTR(-EBADF).
 */
struct pid *pidfd_pid(const struct file *file)
{
        if (file->f_op != &pidfd_fops)
                return ERR_PTR(-EBADF);

        return file->private_data;
}

/*
 * ->release() of a pidfd file: detach the stashed pid from @file and drop
 * the reference the file held on it.  Always returns 0.
 */
static int pidfd_release(struct inode *inode, struct file *file)
{
        struct pid *stashed = file->private_data;

        file->private_data = NULL;
        put_pid(stashed);

        return 0;
}

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 * If the pid no longer has a task attached, -1 is shown instead.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct pid *pid = f->private_data;
        struct pid_namespace *ns;
        pid_t nr = -1;

        /* ns is only assigned here; it is read below only when nr > 0. */
        if (likely(pid_has_task(pid, PIDTYPE_PID))) {
                ns = proc_pid_ns(file_inode(m->file)->i_sb);
                nr = pid_nr_ns(pid, ns);
        }

        seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
        seq_put_decimal_ll(m, "\nNSpid:\t", nr);
        if (nr > 0) {
                int i;

                /* If nr is positive it means that 'pid' is valid and that
                 * ns, i.e. the pid namespace associated with the procfs
                 * instance, is in the pid namespace hierarchy of pid.
                 * Start at one below the already printed level.
                 */
                for (i = ns->level + 1; i <= pid->level; i++)
                        seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
        }
#endif
        seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification.
 *
 * The caller is queued on the pid's wait_pidfd queue.  Readability
 * (EPOLLIN | EPOLLRDNORM) is reported only once the whole thread group
 * has exited: if the thread group leader exits before the other threads
 * in the group, poll(2) keeps blocking, similar to the wait(2) family.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
        struct pid *pid = file->private_data;

        poll_wait(file, &pid->wait_pidfd, pts);

        if (!thread_group_exited(pid))
                return 0;

        return EPOLLIN | EPOLLRDNORM;
}

/*
 * File operations of a pidfd.  pidfd_pid() also uses the address of this
 * table to recognise whether a struct file is a pidfd.
 */
const struct file_operations pidfd_fops = {
        .release = pidfd_release,
        .poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
        .show_fdinfo = pidfd_show_fdinfo,
#endif
};

/**
 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new pidfd file
 * @ret:   where the new pidfd file is returned on success
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper doesn't perform checks on @pid which makes it useful for pidfds
 * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
 * pidfd file are prepared.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        int pidfd;
        struct file *pidfd_file;

        /* Only these three flags are accepted from the caller. */
        if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
                return -EINVAL;

        /*
         * The descriptor is always reserved with O_RDWR | O_CLOEXEC; the
         * caller's @flags are only applied to the file created below.
         */
        pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        if (pidfd < 0)
                return pidfd;

        pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
                                        flags | O_RDWR | O_CLOEXEC);
        if (IS_ERR(pidfd_file)) {
                /* Give the reserved descriptor back; *ret is left untouched. */
                put_unused_fd(pidfd);
                return PTR_ERR(pidfd_file);
        }
        get_pid(pid); /* held by pidfd_file now */
        *ret = pidfd_file;
        return pidfd;
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @pidfd: the pidfd to return
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper verifies that @pid is used as a thread group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
                return -EINVAL;

        return __pidfd_prepare(pid, flags, ret);
}

/* RCU callback: map the rcu_head back to its task_struct and free the task. */
static void __delayed_free_task(struct rcu_head *rhp)
{
        free_task(container_of(rhp, struct task_struct, rcu));
}

/*
 * Free @tsk. Without CONFIG_MEMCG the task is freed immediately; with it,
 * the free is handed to call_rcu() and performed by __delayed_free_task().
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * Re-copy the parent's oom_score_adj values into a new task that shares the
 * parent's mm without being a thread of it, and flag that mm as used by
 * multiple processes.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        const u64 mask = CLONE_VM | CLONE_THREAD | CLONE_VFORK;

        /*
         * Nothing to do for kernel threads (no mm), and nothing to do unless
         * this is a plain CLONE_VM clone, i.e. CLONE_VM is set while both
         * CLONE_THREAD and CLONE_VFORK are clear.
         */
        if (!tsk->mm || (clone_flags & mask) != CLONE_VM)
                return;

        /* We need to synchronize with __set_oom_adj */
        mutex_lock(&oom_adj_mutex);

        set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);

        /* Refresh the values in case they changed after copy_signal() ran */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_unlock(&oom_adj_mutex);
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @pid:   when this is &init_struct_pid it is used as-is and is neither
 *         allocated nor freed here; for any other value (including NULL)
 *         the argument is overwritten with a pid obtained from alloc_pid().
 * @trace: non-zero value that is combined with CLONE_PTRACE and handed to
 *         ptrace_init_task().
 * @node:  forwarded to dup_task_struct().
 * @args:  clone arguments; args->flags carries the CLONE_* flags.
 *
 * Returns the new task on success, or ERR_PTR() carrying a negative errno.
 * On failure everything set up so far is undone through the bad_fork_*
 * label chain at the end of the function.
 */
static __latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        /*
         * If the new process will be in a different time namespace
         * do not allow it to share VM or a thread group with the forking task.
         */
        if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
                if (nsp->time_ns != nsp->time_ns_for_children)
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 * - CLONE_THREAD is blocked until someone really needs it.
                 */
                if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        if (args->io_thread) {
                /*
                 * Mark us an IO worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_IO_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }

        /*
         * This _must_ happen before we call free_task(), i.e. before we jump
         * to any of the bad_fork_* labels. This is to avoid freeing
         * p->set_child_tid which is (ab)used as a kthread's data pointer for
         * kernel threads (PF_KTHREAD).
         */
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        /*
         * Enforce RLIMIT_NPROC: once the user's process count has reached the
         * limit, the fork is only allowed for INIT_USER or for callers with
         * CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
         */
        retval = -EAGAIN;
        if (atomic_read(&p->real_cred->user->processes) >=
                        task_rlimit(p, RLIMIT_NPROC)) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        /* The new task starts with zeroed CPU time accounting. */
        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

#if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_threadgroup_lock;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
        lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p);
        if (retval)
                goto bad_fork_cleanup_policy;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /*
         * copy all the process information
         *
         * Each step below has a matching bad_fork_cleanup_* label; a failing
         * step jumps to the label that undoes the steps completed before it.
         */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        /* Only &init_struct_pid is used as passed in; otherwise allocate one. */
        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                /* Note that no task has been attached to @pid yet. */
                retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
                if (retval < 0)
                        goto bad_fork_free_pid;
                pidfd = retval;

                /*
                 * Report the reserved descriptor number to userspace now; the
                 * file itself is only installed once the fork has succeeded.
                 */
                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but don't wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        spin_lock(&current->sighand->siglock);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_cancel_cgroup;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_cancel_cgroup;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        /* Hand over the signals collected in 'delayed' during the fork. */
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        /* New thread in current's group: account it in the shared signal struct. */
                        current->signal->nr_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_group,
                                          &p->group_leader->thread_group);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        /* The fork can no longer fail, so the prepared pidfd can be installed. */
        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p, args);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

        /*
         * Error unwinding. The labels fall through into each other, so a jump
         * to one label runs its cleanup and every cleanup listed below it.
         */
bad_fork_cancel_cgroup:
        /* Reached with both siglock and tasklist_lock held; drop them first. */
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
        exit_creds(p);
bad_fork_free:
        p->state = TASK_DEAD;
        put_task_stack(p);
        delayed_free_task(p);
fork_out:
        /* Take 'delayed' back off current's multiprocess list before returning. */
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/*
 * Create the idle task for @cpu: a CLONE_VM copy of the current task that
 * uses init_struct_pid and is allocated on @cpu's memory node.
 *
 * Returns the new task, or the ERR_PTR() value from copy_process().
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args args = {
                .flags = CLONE_VM,
        };
        struct task_struct *idle;

        idle = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
        if (IS_ERR(idle))
                return idle;

        init_idle_pids(idle);
        init_idle(idle, cpu);

        return idle;
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
                                CLONE_IO;
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .stack                = (unsigned long)fn,
                .stack_size        = (unsigned long)arg,
                .io_thread        = 1,
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((args->flags & CLONE_PIDFD) &&
            (args->flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        add_latent_entropy();

        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .stack                = (unsigned long)fn,
                .stack_size        = (unsigned long)arg,
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/* fork(2): clone with no flags, SIGCHLD as exit signal. */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* can not support in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/* vfork(2): clone with CLONE_VFORK | CLONE_VM, SIGCHLD as exit signal. */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args args = {
                .exit_signal        = SIGCHLD,
                .flags                = CLONE_VM | CLONE_VFORK,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone(2). The argument order depends on the architecture's
 * CLONE_BACKWARDS* setting; the body is shared by all variants.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        /* Only the low 32 bits are used; CSIGNAL bits carry the exit signal. */
        u32 low = lower_32_bits(clone_flags);
        struct kernel_clone_args args = {
                .flags                = (low & ~CSIGNAL),
                .exit_signal        = (low & CSIGNAL),
                .stack                = newsp,
                .tls                = tls,
                .child_tid        = child_tidptr,
                /* pidfd and parent_tid share the same user pointer here. */
                .parent_tid        = parent_tidptr,
                .pidfd                = parent_tidptr,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

/*
 * Copy a userspace struct clone_args of @usize bytes into @kargs, validating
 * it on the way. On entry kargs->set_tid must point at the kernel buffer
 * that receives the set_tid array; that pointer is preserved in @kargs.
 *
 * Returns 0 on success, -E2BIG or -EINVAL for a bad size or bad field
 * values, -EFAULT if the set_tid array cannot be copied, or the error from
 * copy_struct_from_user().
 */
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        /* Saved up front: the struct assignment below overwrites kargs->set_tid. */
        pid_t *kset_tid = kargs->set_tid;
        struct clone_args args;
        int err;

        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
        if (err)
                return err;

        if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* set_tid and set_tid_size must be given together or not at all. */
        if (unlikely(!args.set_tid != !args.set_tid_size))
                return -EINVAL;

        /*
         * Verify that higher 32bits of exit_signal are unset and that
         * it is a valid signal
         */
        if (unlikely(args.exit_signal & ~((u64)CSIGNAL)))
                return -EINVAL;
        if (unlikely(!valid_signal(args.exit_signal)))
                return -EINVAL;

        if (args.flags & CLONE_INTO_CGROUP) {
                if (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)
                        return -EINVAL;
        }

        *kargs = (struct kernel_clone_args){
                .flags                = args.flags,
                .exit_signal        = args.exit_signal,
                .pidfd                = u64_to_user_ptr(args.pidfd),
                .child_tid        = u64_to_user_ptr(args.child_tid),
                .parent_tid        = u64_to_user_ptr(args.parent_tid),
                .stack                = args.stack,
                .stack_size        = args.stack_size,
                .tls                = args.tls,
                .set_tid_size        = args.set_tid_size,
                .cgroup                = args.cgroup,
        };

        if (args.set_tid) {
                if (copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
                                   (kargs->set_tid_size * sizeof(pid_t))))
                        return -EFAULT;
        }

        kargs->set_tid = kset_tid;

        return 0;
}

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane: stack and
 * stack_size must be given together, and the range must pass access_ok().
 * Unless the configuration is CONFIG_STACK_GROWSUP or CONFIG_IA64, a valid
 * @kargs->stack is then advanced by @kargs->stack_size, so userspace does
 * not have to account for the stack direction itself.
 *
 * Return: true if the stack arguments are acceptable, false otherwise.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* No stack requested: then no size may be given either. */
        if (!kargs->stack)
                return kargs->stack_size == 0;

        if (!kargs->stack_size)
                return false;

        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

/*
 * Validate the flag/exit_signal/stack combination of a clone3() request.
 * Returns true when the arguments may be passed on, false otherwise.
 */
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        /* Reject any flag bit outside the set clone3 knows about. */
        if (kargs->flags &
            ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;

        /*
         * Keep the CLONE_DETACHED bit and the CSIGNAL bits (except the one
         * shared with CLONE_NEWTIME) free so clone3 can reuse them.
         */
        if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
                return false;

        /* CLONE_SIGHAND and CLONE_CLEAR_SIGHAND must not both be set. */
        if ((CLONE_SIGHAND | CLONE_CLEAR_SIGHAND) ==
            (kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)))
                return false;

        /* An exit signal is refused together with CLONE_THREAD/CLONE_PARENT. */
        if (kargs->exit_signal &&
            (kargs->flags & (CLONE_THREAD | CLONE_PARENT)))
                return false;

        /* Everything else passed; the stack check decides. */
        return clone3_stack_valid(kargs);
}

/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * Its argument is a struct whose version is identified by its size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        pid_t tid_buf[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int ret;

        /* Hand the on-stack pid buffer to the copy helper via kargs. */
        kargs.set_tid = tid_buf;

        ret = copy_clone_args_from_user(&kargs, uargs, size);
        if (ret)
                return ret;

        if (!clone3_args_valid(&kargs))
                return -EINVAL;

        return kernel_clone(&kargs);
}
#endif

/*
 * walk_process_tree - call @visitor on the descendants of @top.
 * @top:     task whose thread group is the root of the walk
 * @visitor: callback invoked as visitor(child, data) for each child found
 * @data:    opaque pointer passed through to @visitor
 *
 * The walk runs with tasklist_lock read-held. For every thread of the
 * current "leader" the children list is scanned and @visitor is called on
 * each child. The visitor's return value steers the walk:
 *   0   - do not descend into this child, continue with its next sibling;
 *   > 0 - descend: the child becomes the new leader and its threads'
 *         children are scanned next;
 *   < 0 - abort the whole walk.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader of @top. */
        leader = top = top->group_leader;
down:
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                /* Positive result: restart the scan one level down. */
                                leader = child;
                                goto down;
                        }
up:
                        /* Re-entry point when returning from a deeper level. */
                        ;
                }
        }

        /*
         * Finished one level. Unless we are back at the root, restore the
         * loop variables of the level above and resume after "child".
         */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

/*
 * Alignment passed to the mm_struct cache in mm_cache_init(); defaults to 0
 * unless something defined ARCH_MIN_MMSTRUCT_ALIGN earlier.
 */
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Slab constructor for sighand_cache objects: initialises the siglock
 * spinlock and the signalfd wait queue head of the new object.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sh = data;

        init_waitqueue_head(&sh->signalfd_wqh);
        spin_lock_init(&sh->siglock);
}

/*
 * Create the slab cache used for mm_struct objects. Only the saved_auxv
 * field is declared as the usercopy region of the cache.
 */
void __init mm_cache_init(void)
{
        /*
         * mm_cpumask sits at the tail of mm_struct and its size depends on
         * the maximum CPU number this system can have, taking hotplug into
         * account (nr_cpu_ids), so add it to the object size.
         */
        const unsigned int obj_size = sizeof(struct mm_struct) + cpumask_size();

        mm_cachep = kmem_cache_create_usercopy("mm_struct",
                        obj_size,
                        ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for the per-process structures handled in this
 * file (sighand, signal, files, fs and vm_area_struct), then run
 * mmap_init() and nsproxy_cache_init().
 */
void __init proc_caches_init(void)
{
        /*
         * sighand_cache is the only cache here created with a constructor
         * (sighand_ctor) and with SLAB_TYPESAFE_BY_RCU.
         */
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
                        SLAB_ACCOUNT, sighand_ctor);
        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);
        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);

        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
        mmap_init();
        nsproxy_cache_init();
}

/*
 * Check constraints on flags passed to the unshare system call.
 * Returns 0 if the request is acceptable, -EINVAL otherwise.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        /* Any bit outside the supported set is an error. */
        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
                                CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
                                CLONE_NEWTIME))
                return -EINVAL;

        /*
         * Not implemented, but pretend it works if there is nothing
         * to unshare.  Note that unsharing the address space or the
         * signal handlers also need to unshare the signal queues (aka
         * CLONE_THREAD).
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        /* Signal handlers must not be shared with anyone else. */
        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        /* The address space must be used by this task only. */
        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Unshare the filesystem structure if it is being shared.
 *
 * On success returns 0; *new_fsp is written only when a copy was made.
 * Returns -ENOMEM if copy_fs_struct() fails.
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *cur_fs = current->fs;
        struct fs_struct *dup;

        /* Nothing requested, or nothing to copy. */
        if (!(unshare_flags & CLONE_FS) || !cur_fs)
                return 0;

        /* don't need lock here; in the worst case we'll do useless copy */
        if (!(unshare_flags & CLONE_NEWNS) && cur_fs->users == 1)
                return 0;

        dup = copy_fs_struct(cur_fs);
        *new_fsp = dup;

        return dup ? 0 : -ENOMEM;
}

/*
 * Unshare file descriptor table if it is being shared
 */
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
               struct files_struct **new_fdp)
{
        struct files_struct *fd = current->files;
        int error = 0;

        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
                *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }

        return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 *
 * @unshare_flags: CLONE_* bits naming what should no longer be shared;
 *                 implied bits are added before validation.
 *
 * All new objects are prepared first; only if every preparation step
 * succeeded are they installed into current. Whatever is left in
 * new_cred/new_fd/new_fs afterwards is released by the cleanup labels,
 * which are shared between the success and the error paths.
 *
 * Returns 0 on success, otherwise the error from the failing step.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *fd, *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing a signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
        /* Preparation phase: nothing below touches current yet. */
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        /* Commit phase: install whatever was prepared above. */
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                if (new_fs) {
                        fs = current->fs;
                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        /*
                         * Drop our use of the old fs_struct. new_fs is
                         * reused to carry the old one to the cleanup code
                         * only when we were its last user.
                         */
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        spin_unlock(&fs->lock);
                }

                if (new_fd) {
                        /* Swap tables; new_fd now names the old one for cleanup. */
                        fd = current->files;
                        current->files = new_fd;
                        new_fd = fd;
                }

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        new_cred = NULL;
                }
        }

        perf_event_namespaces(current);

bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/* unshare() system call entry point: forwards its flags to ksys_unshare(). */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 *        Helper to unshare the files of the current task.
 *        We don't want to expose copy_files internals to
 *        the exec layer of the kernel.
 */

int unshare_files(void)
{
        struct task_struct *task = current;
        struct files_struct *old, *copy = NULL;
        int error;

        error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
        if (error || !copy)
                return error;

        old = task->files;
        task_lock(task);
        task->files = copy;
        task_unlock(task);
        put_files_struct(old);
        return 0;
}

/*
 * sysctl handler for max_threads. Works on a private copy of @table whose
 * data points at a local value bounded to [1, MAX_THREADS]; max_threads is
 * updated only after a successful write.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int sysctl_max_threads(struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int lower = 1;
        int upper = MAX_THREADS;
        int value = max_threads;
        struct ctl_table tmp = *table;
        int rc;

        tmp.data = &value;
        tmp.extra1 = &lower;
        tmp.extra2 = &upper;

        rc = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        if (write)
                max_threads = value;

        return 0;
}











































































    1 



















































































































    1 











    1 






















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement the default iomap interfaces
 *
 * (C) Copyright 2004 Linus Torvalds
 */
#include <linux/pci.h>
#include <linux/io.h>

#include <linux/export.h>

/*
 * Read/write from/to an (offsettable) iomem cookie. It might be a PIO
 * access or a MMIO access, these functions don't care. The info is
 * encoded in the hardware mapping set up by the mapping functions
 * (or the cookie itself, depending on implementation and hw).
 *
 * The generic routines don't assume any hardware mappings, and just
 * encode the PIO/MMIO as part of the cookie. They coldly assume that
 * the MMIO IO mappings are not in the low address range.
 *
 * Architectures for which this is not true can't use this generic
 * implementation and should do their own copy.
 */

#ifndef HAVE_ARCH_PIO_SIZE
/*
 * We encode the physical PIO addresses (0-0xffff) into the
 * pointer by offsetting them with a constant (0x10000) and
 * assuming that all the low addresses are always PIO. That means
 * we can do some sanity checks on the low bits, and don't
 * need to just take things for granted.
 */
/* Added to a port number by ioport_map() to form its cookie. */
#define PIO_OFFSET        0x10000UL
/* Applied by IO_COND() to turn a PIO cookie back into a port number. */
#define PIO_MASK        0x0ffffUL
/* Cookies at or above this value are treated as MMIO by IO_COND(). */
#define PIO_RESERVED        0x40000UL
#endif

/*
 * Report an access through a cookie that is neither PIO nor MMIO.
 * @port:   the offending cookie value
 * @access: text describing the attempted access
 *
 * Only the first 10 calls produce a warning; later ones are silent.
 */
static void bad_io_access(unsigned long port, const char *access)
{
        static int count = 10;

        if (!count)
                return;

        count--;
        WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
}

/*
 * IO_COND(addr, is_pio, is_mmio) - dispatch on the kind of iomap cookie.
 *
 * Declares a local "port" holding the value of @addr and then:
 *   - port >= PIO_RESERVED: executes @is_mmio;
 *   - PIO_OFFSET < port < PIO_RESERVED: masks port with PIO_MASK and
 *     executes @is_pio, which may refer to "port";
 *   - otherwise: calls bad_io_access() with the cookie value and the
 *     stringified text of @is_pio.
 *
 * The comparison against PIO_OFFSET is strict, so a cookie equal to
 * PIO_OFFSET takes the bad_io_access() path.
 *
 * Ugly macros are a way of life.
 */
#define IO_COND(addr, is_pio, is_mmio) do {                        \
        unsigned long port = (unsigned long __force)addr;        \
        if (port >= PIO_RESERVED) {                                \
                is_mmio;                                        \
        } else if (port > PIO_OFFSET) {                                \
                port &= PIO_MASK;                                \
                is_pio;                                                \
        } else                                                        \
                bad_io_access(port, #is_pio );                        \
} while (0)

/* Default big-endian port reads: byte-swap the result of inw()/inl(). */
#ifndef pio_read16be
#define pio_read16be(port) swab16(inw(port))
#define pio_read32be(port) swab32(inl(port))
#endif

/* Default big-endian MMIO reads: byte-swap the result of readw/readl/readq. */
#ifndef mmio_read16be
#define mmio_read16be(addr) swab16(readw(addr))
#define mmio_read32be(addr) swab32(readl(addr))
#define mmio_read64be(addr) swab64(readq(addr))
#endif

/*
 * Read 8 bits through an iomap cookie: inb() for a PIO cookie, readb() for
 * an MMIO cookie. Returns 0xff if the cookie is neither.
 */
unsigned int ioread8(const void __iomem *addr)
{
        IO_COND(addr, return inb(port), return readb(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xff;
}
/*
 * Read 16 bits through an iomap cookie: inw() for a PIO cookie, readw() for
 * an MMIO cookie. Returns 0xffff if the cookie is neither.
 */
unsigned int ioread16(const void __iomem *addr)
{
        IO_COND(addr, return inw(port), return readw(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffff;
}
/*
 * Big-endian 16-bit read through an iomap cookie, using pio_read16be() or
 * mmio_read16be(). Returns 0xffff if the cookie is neither PIO nor MMIO.
 */
unsigned int ioread16be(const void __iomem *addr)
{
        IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffff;
}
/*
 * Read 32 bits through an iomap cookie: inl() for a PIO cookie, readl() for
 * an MMIO cookie. Returns 0xffffffff if the cookie is neither.
 */
unsigned int ioread32(const void __iomem *addr)
{
        IO_COND(addr, return inl(port), return readl(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffff;
}
/*
 * Big-endian 32-bit read through an iomap cookie, using pio_read32be() or
 * mmio_read32be(). Returns 0xffffffff if the cookie is neither PIO nor MMIO.
 */
unsigned int ioread32be(const void __iomem *addr)
{
        IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffff;
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
EXPORT_SYMBOL(ioread16be);
EXPORT_SYMBOL(ioread32);
EXPORT_SYMBOL(ioread32be);

#ifdef readq
/*
 * Assemble a 64-bit value from two 32-bit port reads, reading the low word
 * at @port first and the high word at @port + 4 second.
 */
static u64 pio_read64_lo_hi(unsigned long port)
{
        const u64 low = inl(port);
        const u64 high = inl(port + sizeof(u32));

        return (high << 32) | low;
}

/*
 * Assemble a 64-bit value from two 32-bit port reads, reading the high word
 * at @port + 4 first and the low word at @port second.
 */
static u64 pio_read64_hi_lo(unsigned long port)
{
        const u64 high = inl(port + sizeof(u32));
        const u64 low = inl(port);

        return (high << 32) | low;
}

/*
 * Big-endian 64-bit port read: the low word is read first from @port + 4,
 * then the high word from @port.
 */
static u64 pio_read64be_lo_hi(unsigned long port)
{
        const u64 low = pio_read32be(port + sizeof(u32));
        const u64 high = pio_read32be(port);

        return (high << 32) | low;
}

/*
 * Big-endian 64-bit port read: the high word is read first from @port,
 * then the low word from @port + 4.
 */
static u64 pio_read64be_hi_lo(unsigned long port)
{
        const u64 high = pio_read32be(port);
        const u64 low = pio_read32be(port + sizeof(u32));

        return (high << 32) | low;
}

/*
 * 64-bit read through an iomap cookie. A PIO cookie is read as two 32-bit
 * halves, low half first; an MMIO cookie uses a single readq(). Returns
 * all-ones if the cookie is neither.
 */
u64 ioread64_lo_hi(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffffffffffffULL;
}

/*
 * 64-bit read through an iomap cookie. A PIO cookie is read as two 32-bit
 * halves, high half first; an MMIO cookie uses a single readq(). Returns
 * all-ones if the cookie is neither.
 */
u64 ioread64_hi_lo(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie. A PIO cookie is read as
 * two 32-bit halves, low half first; an MMIO cookie uses mmio_read64be().
 * Returns all-ones if the cookie is neither.
 */
u64 ioread64be_lo_hi(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64be_lo_hi(port),
                return mmio_read64be(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie. A PIO cookie is read as
 * two 32-bit halves, high half first; an MMIO cookie uses mmio_read64be().
 * Returns all-ones if the cookie is neither.
 */
u64 ioread64be_hi_lo(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64be_hi_lo(port),
                return mmio_read64be(addr));
        /* Only reached on the bad-access path of IO_COND(). */
        return 0xffffffffffffffffULL;
}

EXPORT_SYMBOL(ioread64_lo_hi);
EXPORT_SYMBOL(ioread64_hi_lo);
EXPORT_SYMBOL(ioread64be_lo_hi);
EXPORT_SYMBOL(ioread64be_hi_lo);

#endif /* readq */

/* Default big-endian port writes: byte-swap the value, then outw()/outl(). */
#ifndef pio_write16be
#define pio_write16be(val,port) outw(swab16(val),port)
#define pio_write32be(val,port) outl(swab32(val),port)
#endif

/*
 * Default big-endian MMIO writes: byte-swap the value, then
 * writew/writel/writeq. The second parameter is named "port" here but is
 * passed on as the MMIO address.
 */
#ifndef mmio_write16be
#define mmio_write16be(val,port) writew(swab16(val),port)
#define mmio_write32be(val,port) writel(swab32(val),port)
#define mmio_write64be(val,port) writeq(swab64(val),port)
#endif

/*
 * Write 8 bits through an iomap cookie: outb() for a PIO cookie, writeb()
 * for an MMIO cookie. A cookie that is neither is reported and ignored.
 */
void iowrite8(u8 val, void __iomem *addr)
{
        IO_COND(addr, outb(val,port), writeb(val, addr));
}
/*
 * Write 16 bits through an iomap cookie: outw() for a PIO cookie, writew()
 * for an MMIO cookie. A cookie that is neither is reported and ignored.
 */
void iowrite16(u16 val, void __iomem *addr)
{
        IO_COND(addr, outw(val,port), writew(val, addr));
}
/*
 * Big-endian 16-bit write through an iomap cookie, using pio_write16be() or
 * mmio_write16be(). A cookie that is neither is reported and ignored.
 */
void iowrite16be(u16 val, void __iomem *addr)
{
        IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr));
}
/*
 * Write 32 bits through an iomap cookie: outl() for a PIO cookie, writel()
 * for an MMIO cookie. A cookie that is neither is reported and ignored.
 */
void iowrite32(u32 val, void __iomem *addr)
{
        IO_COND(addr, outl(val,port), writel(val, addr));
}
/*
 * Big-endian 32-bit write through an iomap cookie, using pio_write32be() or
 * mmio_write32be(). A cookie that is neither is reported and ignored.
 */
void iowrite32be(u32 val, void __iomem *addr)
{
        IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr));
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
EXPORT_SYMBOL(iowrite16be);
EXPORT_SYMBOL(iowrite32);
EXPORT_SYMBOL(iowrite32be);

#ifdef writeq
/*
 * Write a 64-bit value as two 32-bit port writes: the low word to @port
 * first, then the high word to @port + 4.
 */
static void pio_write64_lo_hi(u64 val, unsigned long port)
{
        const u32 low = val;
        const u32 high = val >> 32;

        outl(low, port);
        outl(high, port + sizeof(u32));
}

/*
 * Write a 64-bit value as two 32-bit port writes: the high word to
 * @port + 4 first, then the low word to @port.
 */
static void pio_write64_hi_lo(u64 val, unsigned long port)
{
        const u32 low = val;
        const u32 high = val >> 32;

        outl(high, port + sizeof(u32));
        outl(low, port);
}

/*
 * Big-endian 64-bit port write: the low word goes to @port + 4 first, then
 * the high word to @port.
 */
static void pio_write64be_lo_hi(u64 val, unsigned long port)
{
        const u32 low = val;
        const u32 high = val >> 32;

        pio_write32be(low, port + sizeof(u32));
        pio_write32be(high, port);
}

/*
 * Big-endian 64-bit port write: the high word goes to @port first, then
 * the low word to @port + 4.
 */
static void pio_write64be_hi_lo(u64 val, unsigned long port)
{
        const u32 low = val;
        const u32 high = val >> 32;

        pio_write32be(high, port);
        pio_write32be(low, port + sizeof(u32));
}

/*
 * 64-bit write through an iomap cookie. A PIO cookie is written as two
 * 32-bit halves, low half first; an MMIO cookie uses a single writeq().
 */
void iowrite64_lo_hi(u64 val, void __iomem *addr)
{
        IO_COND(addr, pio_write64_lo_hi(val, port),
                writeq(val, addr));
}

/*
 * 64-bit write through an iomap cookie. A PIO cookie is written as two
 * 32-bit halves, high half first; an MMIO cookie uses a single writeq().
 */
void iowrite64_hi_lo(u64 val, void __iomem *addr)
{
        IO_COND(addr, pio_write64_hi_lo(val, port),
                writeq(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie. A PIO cookie is written
 * as two 32-bit halves, low half first; an MMIO cookie uses
 * mmio_write64be().
 */
void iowrite64be_lo_hi(u64 val, void __iomem *addr)
{
        IO_COND(addr, pio_write64be_lo_hi(val, port),
                mmio_write64be(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie. A PIO cookie is written
 * as two 32-bit halves, high half first; an MMIO cookie uses
 * mmio_write64be().
 */
void iowrite64be_hi_lo(u64 val, void __iomem *addr)
{
        IO_COND(addr, pio_write64be_hi_lo(val, port),
                mmio_write64be(val, addr));
}

EXPORT_SYMBOL(iowrite64_lo_hi);
EXPORT_SYMBOL(iowrite64_hi_lo);
EXPORT_SYMBOL(iowrite64be_lo_hi);
EXPORT_SYMBOL(iowrite64be_hi_lo);

#endif /* writeq */

/*
 * These are the "repeat MMIO read/write" functions.
 * Note the "__raw" accesses, since we don't want to
 * convert to CPU byte order. We write in "IO byte
 * order" (we also don't have IO barriers).
 */
#ifndef mmio_insb
/*
 * Read @count bytes from the single MMIO location @addr with
 * __raw_readb() and store them consecutively starting at @dst.
 */
static inline void mmio_insb(const void __iomem *addr, u8 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readb(addr);
}
/*
 * Read @count 16-bit values from the single MMIO location @addr with
 * __raw_readw() and store them consecutively starting at @dst.
 */
static inline void mmio_insw(const void __iomem *addr, u16 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readw(addr);
}
/*
 * Read @count 32-bit values from the single MMIO location @addr with
 * __raw_readl() and store them consecutively starting at @dst.
 */
static inline void mmio_insl(const void __iomem *addr, u32 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readl(addr);
}
#endif

#ifndef mmio_outsb
/*
 * Write @count bytes, taken consecutively from @src, to the single MMIO
 * location @addr with __raw_writeb().
 */
static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writeb(*src, addr);
}
/*
 * Write @count 16-bit values, taken consecutively from @src, to the single
 * MMIO location @addr with __raw_writew().
 */
static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writew(*src, addr);
}
/*
 * Write @count 32-bit values, taken consecutively from @src, to the single
 * MMIO location @addr with __raw_writel().
 */
static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writel(*src, addr);
}
#endif

/*
 * Repeated 8-bit read of one iomap cookie into @dst, @count items:
 * insb() for a PIO cookie, mmio_insb() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_insb() takes an int.
 */
void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count));
}
/*
 * Repeated 16-bit read of one iomap cookie into @dst, @count items:
 * insw() for a PIO cookie, mmio_insw() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_insw() takes an int.
 */
void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count));
}
/*
 * Repeated 32-bit read of one iomap cookie into @dst, @count items:
 * insl() for a PIO cookie, mmio_insl() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_insl() takes an int.
 */
void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count));
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
EXPORT_SYMBOL(ioread32_rep);

/*
 * Repeated 8-bit write of @count items from @src to one iomap cookie:
 * outsb() for a PIO cookie, mmio_outsb() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_outsb() takes an int.
 */
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
        IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count));
}
/*
 * Repeated 16-bit write of @count items from @src to one iomap cookie:
 * outsw() for a PIO cookie, mmio_outsw() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_outsw() takes an int.
 */
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
        IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count));
}
/*
 * Repeated 32-bit write of @count items from @src to one iomap cookie:
 * outsl() for a PIO cookie, mmio_outsl() for an MMIO cookie.
 * NOTE(review): @count is unsigned long here but mmio_outsl() takes an int.
 */
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
        IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count));
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
EXPORT_SYMBOL(iowrite32_rep);

#ifdef CONFIG_HAS_IOPORT_MAP
/*
 * Create a virtual mapping cookie for an IO port range.
 *
 * The cookie is simply @port + PIO_OFFSET; @nr is not used. Ports above
 * PIO_MASK cannot be encoded and yield NULL.
 */
void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
        if (port <= PIO_MASK)
                return (void __iomem *) (unsigned long) (port + PIO_OFFSET);

        return NULL;
}

/*
 * Counterpart of ioport_map(). The cookie is only an encoded port number,
 * so there is nothing to release.
 */
void ioport_unmap(void __iomem *addr)
{
        /* Nothing to do */
}
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
#endif /* CONFIG_HAS_IOPORT_MAP */

#ifdef CONFIG_PCI
/*
 * Hide the details of whether this is an MMIO or PIO address space and just
 * do what you expect in the correct way: an MMIO cookie is passed to
 * iounmap(), a PIO cookie needs no action. @dev is not used here.
 */
void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
{
        IO_COND(addr, /* nothing */, iounmap(addr));
}
EXPORT_SYMBOL(pci_iounmap);
#endif /* CONFIG_PCI */


























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JUMP_LABEL_H
#define _LINUX_JUMP_LABEL_H

/*
 * Jump label support
 *
 * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * DEPRECATED API:
 *
 * The use of 'struct static_key' directly, is now DEPRECATED. In addition
 * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
 *
 * struct static_key false = STATIC_KEY_INIT_FALSE;
 * struct static_key true = STATIC_KEY_INIT_TRUE;
 * static_key_true()
 * static_key_false()
 *
 * The updated API replacements are:
 *
 * DEFINE_STATIC_KEY_TRUE(key);
 * DEFINE_STATIC_KEY_FALSE(key);
 * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
 * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count);
 * static_branch_likely()
 * static_branch_unlikely()
 *
 * Jump labels provide an interface to generate dynamic branches using
 * self-modifying code. Assuming toolchain and architecture support, if we
 * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
 * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
 * (which defaults to false - and the true block is placed out of line).
 * Similarly, we can define an initially true key via
 * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
 * "if (static_branch_unlikely(&key))", in which case we will generate an
 * unconditional branch to the out-of-line true branch. Keys that are
 * initially true or false can be used in both static_branch_unlikely()
 * and static_branch_likely() statements.
 *
 * At runtime we can change the branch target by setting the key
 * to true via a call to static_branch_enable(), or false using
 * static_branch_disable(). If the direction of the branch is switched by
 * these calls then we run-time modify the branch target via a
 * no-op -> jump or jump -> no-op conversion. For example, for an
 * initially false key that is used in an "if (static_branch_unlikely(&key))"
 * statement, setting the key to true requires us to patch in a jump
 * to the out-of-line true branch.
 *
 * In addition to static_branch_{enable,disable}, we can also reference count
 * the key or branch direction via static_branch_{inc,dec}. Thus,
 * static_branch_inc() can be thought of as a 'make more true' and
 * static_branch_dec() as a 'make more false'.
 *
 * Since this relies on modifying code, the branch modifying functions
 * must be considered absolute slow paths (machine wide synchronization etc.).
 * OTOH, since the affected branches are unconditional, their runtime overhead
 * will be absolutely minimal, esp. in the default (off) case where the total
 * effect is a single NOP of appropriate size. The on case will patch in a jump
 * to the out-of-line block.
 *
 * When the control is directly exposed to userspace, it is prudent to delay the
 * decrement to avoid high frequency code modifications which can (and do)
 * cause significant performance degradation. Struct static_key_deferred and
 * static_key_slow_dec_deferred() provide for this.
 *
 * Lacking toolchain and/or architecture support, static keys fall back to a
 * simple conditional branch.
 *
 * Additional babbling in: Documentation/staging/static-keys.rst
 */

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <linux/compiler.h>

extern bool static_key_initialized;

/*
 * Warn, naming the calling function and the key, when a static key is used
 * while static_key_initialized is still false.
 */
#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,                      \
                                    "%s(): static key '%pS' used before call to jump_label_init()", \
                                    __func__, (key))

#ifdef CONFIG_JUMP_LABEL

/*
 * struct static_key (CONFIG_JUMP_LABEL variant): the enabled count plus one
 * word that is viewed either as flag bits (type) or as a pointer (entries
 * or next), as described by the bit layout below.
 */
struct static_key {
        atomic_t enabled;
/*
 * Note:
 *   To make anonymous unions work with old compilers, the static
 *   initialization of them requires brackets. This creates a dependency
 *   on the order of the struct with the initializers. If any fields
 *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
 *   to be modified.
 *
 * bit 0 => 1 if key is initially true
 *            0 if initially false
 * bit 1 => 1 if points to struct static_key_mod
 *            0 if points to struct jump_entry
 */
        union {
                unsigned long type;                /* the word seen as flag bits */
                struct jump_entry *entries;        /* used when bit 1 is clear */
                struct static_key_mod *next;        /* used when bit 1 is set */
        };
};

#else
/*
 * struct static_key - fallback layout when CONFIG_JUMP_LABEL is not set.
 * Only the counter is kept; the inline helpers further down in this header
 * read and modify it with atomic operations.
 */
struct static_key {
        atomic_t enabled;
};
#endif        /* CONFIG_JUMP_LABEL */
#endif /* __ASSEMBLY__ */

#ifdef CONFIG_JUMP_LABEL
#include <asm/jump_label.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE

/*
 * struct jump_entry - jump table entry using self-relative references
 * (CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE).
 *
 * @code:   signed 32-bit offset, relative to the address of this field
 *          (decoded by jump_entry_code()).
 * @target: signed 32-bit offset, relative to the address of this field
 *          (decoded by jump_entry_target()).
 * @key:    offset to the struct static_key, relative to the address of this
 *          field, with the two low bits used as flags: bit 0 is reported by
 *          jump_entry_is_branch(), bit 1 by jump_entry_is_init().
 */
struct jump_entry {
        s32 code;
        s32 target;
        long key;        // key may be far away from the core kernel under KASLR
};

/*
 * Decode the 'code' reference of @entry: the stored value is an offset
 * relative to the address of the 'code' field itself.
 */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->code;

        return field_addr + entry->code;
}

/*
 * Decode the 'target' reference of @entry: the stored value is an offset
 * relative to the address of the 'target' field itself.
 */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->target;

        return field_addr + entry->target;
}

/*
 * Return the static key @entry refers to. The two low bits of 'key' are
 * flag bits and are masked off; the remainder is an offset relative to the
 * address of the 'key' field.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->key;
        long rel = entry->key & ~3L;

        return (struct static_key *)(field_addr + rel);
}

#else

/* Return the 'code' field of @entry, stored as a plain value here. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long addr = entry->code;

        return addr;
}

/* Return the 'target' field of @entry, stored as a plain value here. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long addr = entry->target;

        return addr;
}

/*
 * Return the static key @entry refers to: the 'key' field with its two low
 * flag bits cleared, reinterpreted as a pointer.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (struct static_key *)(raw & ~3UL);
}

#endif

/* Report whether bit 0 of @entry's 'key' field is set. */
static inline bool jump_entry_is_branch(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 1UL) != 0;
}

/* Report whether bit 1 of @entry's 'key' field is set. */
static inline bool jump_entry_is_init(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 2UL) != 0;
}

/* Set bit 1 of @entry's 'key' field (the bit jump_entry_is_init() tests). */
static inline void jump_entry_set_init(struct jump_entry *entry)
{
        entry->key = entry->key | 2;
}

#endif
#endif

#ifndef __ASSEMBLY__

/*
 * Instruction selector passed as the 'type' argument to the
 * arch_jump_label_transform*() hooks: NOP or JMP.
 */
enum jump_label_type {
        JUMP_LABEL_NOP = 0,
        JUMP_LABEL_JMP,
};

struct module;

#ifdef CONFIG_JUMP_LABEL

/*
 * Flag values for the low bits of the struct static_key union:
 * bit 0 is the initial value (FALSE/TRUE), bit 1 (LINKED) matches the
 * "points to struct static_key_mod" bit described on struct static_key,
 * and JUMP_TYPE_MASK covers both bits.
 */
#define JUMP_TYPE_FALSE                0UL
#define JUMP_TYPE_TRUE                1UL
#define JUMP_TYPE_LINKED        2UL
#define JUMP_TYPE_MASK                3UL

/*
 * Test @key via arch_static_branch() with the 'false' branch argument and
 * return its result unchanged.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        bool taken = arch_static_branch(key, false);

        return taken;
}

/*
 * Test @key via arch_static_branch() with the 'true' branch argument and
 * return the negation of its result.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        bool jumped = arch_static_branch(key, true);

        return !jumped;
}

/*
 * Start/stop symbols of the jump table array.
 * NOTE(review): presumably provided by the linker script - confirm.
 */
extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];

/*
 * Out-of-line jump label interface used when CONFIG_JUMP_LABEL is set.
 * The !CONFIG_JUMP_LABEL branch of this header provides inline or macro
 * fallbacks for everything here except the arch_jump_label_transform*()
 * hooks.
 *
 * NOTE(review): jump_label_apply_nops() returns void here but the
 * !CONFIG_JUMP_LABEL fallback returns int; callers must not rely on a
 * return value.
 */
extern void jump_label_init(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
                                      enum jump_label_type type);
extern void arch_jump_label_transform_static(struct jump_entry *entry,
                                             enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                            enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
extern int jump_label_text_reserved(void *start, void *end);
extern void static_key_slow_inc(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern void static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
extern void jump_label_apply_nops(struct module *mod);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);

/*
 * Static initializers for struct static_key (CONFIG_JUMP_LABEL layout).
 * Each sets the counter to 1 or 0 and stores the matching JUMP_TYPE_*
 * flag in the union through its 'entries' member.
 *
 * We should be using ATOMIC_INIT() for initializing .enabled, but
 * the inclusion of atomic.h is problematic for inclusion of jump_label.h
 * in 'low-level' headers. Thus, we are initializing .enabled with a
 * raw value, but have added a BUILD_BUG_ON() to catch any issues in
 * jump_label_init() see: kernel/jump_label.c.
 */
#define STATIC_KEY_INIT_TRUE                                        \
        { .enabled = { 1 },                                        \
          { .entries = (void *)JUMP_TYPE_TRUE } }
#define STATIC_KEY_INIT_FALSE                                        \
        { .enabled = { 0 },                                        \
          { .entries = (void *)JUMP_TYPE_FALSE } }

#else  /* !CONFIG_JUMP_LABEL */

#include <linux/atomic.h>
#include <linux/bug.h>

/* Return the current value of @key's 'enabled' counter. */
static __always_inline int static_key_count(struct static_key *key)
{
        int count = arch_atomic_read(&key->enabled);

        return count;
}

/*
 * !CONFIG_JUMP_LABEL version: the only work is to set
 * static_key_initialized, which silences STATIC_KEY_CHECK_USE() from here on.
 */
static __always_inline void jump_label_init(void)
{
        static_key_initialized = true;
}

/*
 * Conditional-branch fallback: true when @key's count is positive, with
 * the compiler hinted that this is the unlikely outcome.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return unlikely(static_key_count(key) > 0);
}

/*
 * Conditional-branch fallback: true when @key's count is positive, with
 * the compiler hinted that this is the likely outcome.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return likely(static_key_count(key) > 0);
}

static inline void static_key_slow_inc(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_inc(&key->enabled);
}

static inline void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_dec(&key->enabled);
}

/* In the fallback build the _cpuslocked variants are plain aliases. */
#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)

/* Fallback stub: ignores [@start, @end] and always returns 0. */
static inline int jump_label_text_reserved(void *start, void *end)
{
        return 0;
}

/* Fallback stubs: locking is a no-op when CONFIG_JUMP_LABEL is not set. */
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}

/*
 * Fallback stub: does nothing with @mod and returns 0.
 * NOTE(review): the CONFIG_JUMP_LABEL declaration of this function returns
 * void, so portable callers cannot use the return value.
 */
static inline int jump_label_apply_nops(struct module *mod)
{
        return 0;
}

/*
 * Fallback boolean enable: set @key's counter to 1 if it is currently 0.
 * If it is already non-zero nothing is written, and a one-time warning is
 * raised when the value is anything other than 1.
 */
static inline void static_key_enable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 0) {
                atomic_set(&key->enabled, 1);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
}

/*
 * Fallback boolean disable: set @key's counter to 0 if it is currently 1.
 * If it holds any other value nothing is written, and a one-time warning
 * is raised when that value is not 0.
 */
static inline void static_key_disable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 1) {
                atomic_set(&key->enabled, 0);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
}

/* In the fallback build the _cpuslocked variants are plain aliases. */
#define static_key_enable_cpuslocked(k)                static_key_enable((k))
#define static_key_disable_cpuslocked(k)        static_key_disable((k))

/* Fallback initializers: only the counter exists, set via ATOMIC_INIT(). */
#define STATIC_KEY_INIT_TRUE        { .enabled = ATOMIC_INIT(1) }
#define STATIC_KEY_INIT_FALSE        { .enabled = ATOMIC_INIT(0) }

#endif        /* CONFIG_JUMP_LABEL */

/* Default initializer is the 'false' one; jump_label_enabled is an alias. */
#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled

/* -------------------------------------------------------------------------- */

/*
 * Two type wrappers around static_key, such that we can use compile time
 * type differentiation to emit the right code.
 *
 * All the below code is macros in order to play type games.
 */

/* Wrapper type marking a key whose initial value is true. */
struct static_key_true {
        struct static_key key;
};

/* Wrapper type marking a key whose initial value is false. */
struct static_key_false {
        struct static_key key;
};

/* Compound-literal initializers for the two wrapper types. */
#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }

/* Define a struct static_key_true called @name, initialised true. */
#define DEFINE_STATIC_KEY_TRUE(name)        \
        struct static_key_true name = STATIC_KEY_TRUE_INIT

/* As DEFINE_STATIC_KEY_TRUE(), with the object marked __ro_after_init. */
#define DEFINE_STATIC_KEY_TRUE_RO(name)        \
        struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT

/* Extern declaration matching DEFINE_STATIC_KEY_TRUE*(). */
#define DECLARE_STATIC_KEY_TRUE(name)        \
        extern struct static_key_true name

/* Define a struct static_key_false called @name, initialised false. */
#define DEFINE_STATIC_KEY_FALSE(name)        \
        struct static_key_false name = STATIC_KEY_FALSE_INIT

/* As DEFINE_STATIC_KEY_FALSE(), with the object marked __ro_after_init. */
#define DEFINE_STATIC_KEY_FALSE_RO(name)        \
        struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT

/* Extern declaration matching DEFINE_STATIC_KEY_FALSE*(). */
#define DECLARE_STATIC_KEY_FALSE(name)        \
        extern struct static_key_false name

/*
 * Define an array @name of @count struct static_key_true elements; the
 * range designator initialises every element with STATIC_KEY_TRUE_INIT.
 */
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)                \
        struct static_key_true name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,        \
        }

/*
 * Define an array @name of @count struct static_key_false elements; the
 * range designator initialises every element with STATIC_KEY_FALSE_INIT.
 */
#define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count)                \
        struct static_key_false name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,        \
        }

/*
 * Called from the type-checking macros when their argument has an
 * unsupported type.
 * NOTE(review): presumably left undefined on purpose so that misuse shows
 * up as a link error - confirm.
 */
extern bool ____wrong_branch_error(void);

/*
 * static_key_enabled(x) - true when the key's count is greater than zero.
 *
 * @x must point to a struct static_key, struct static_key_true or
 * struct static_key_false; any other pointee type reaches
 * ____wrong_branch_error(). The argument is fully parenthesized so that
 * expressions such as "cond ? &a : &b" or "keys + i" are dereferenced and
 * cast as a whole rather than being split by operator precedence.
 */
#define static_key_enabled(x)                                                        \
({                                                                                \
        if (!__builtin_types_compatible_p(typeof(*(x)), struct static_key) &&        \
            !__builtin_types_compatible_p(typeof(*(x)), struct static_key_true) &&\
            !__builtin_types_compatible_p(typeof(*(x)), struct static_key_false))\
                ____wrong_branch_error();                                        \
        static_key_count((struct static_key *)(x)) > 0;                                \
})

#ifdef CONFIG_JUMP_LABEL

/*
 * Combine the right initial value (type) with the right branch order
 * to generate the desired result.
 *
 *
 * type\branch|       likely (1)      |   unlikely (0)
 * -----------+-----------------------+------------------
 *            |                       |
 *  true (1)  |    ...                |    ...
 *            |    NOP                |    JMP L
 *            |    <br-stmts>         | 1: ...
 *            | L: ...                |
 *            |                       |
 *            |                       | L: <br-stmts>
 *            |                       |    jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *            |                       |
 *  false (0) |    ...                |    ...
 *            |    JMP L              |    NOP
 *            |    <br-stmts>         | 1: ...
 *            | L: ...                |
 *            |                       |
 *            |                       | L: <br-stmts>
 *            |                       |    jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *
 * The initial value is encoded in the LSB of static_key::entries,
 * type: 0 = false, 1 = true.
 *
 * The branch type is encoded in the LSB of jump_entry::key,
 * branch: 0 = unlikely, 1 = likely.
 *
 * This gives the following logic table:
 *
 *        enabled        type        branch          instruction
 * -----------------------------+-----------
 *        0        0        0        | NOP
 *        0        0        1        | JMP
 *        0        1        0        | NOP
 *        0        1        1        | JMP
 *
 *        1        0        0        | JMP
 *        1        0        1        | NOP
 *        1        1        0        | JMP
 *        1        1        1        | NOP
 *
 * Which gives the following functions:
 *
 *   dynamic: instruction = enabled ^ branch
 *   static:  instruction = type ^ branch
 *
 * See jump_label_type() / jump_label_init_type().
 */

/*
 * static_branch_likely(x) - test a typed static key, hinting "likely".
 *
 * The arch primitive is chosen at compile time from the pointee type of @x:
 *   struct static_key_true  -> !arch_static_branch(&(x)->key, true)
 *   struct static_key_false -> !arch_static_branch_jump(&(x)->key, true)
 * Any other type reaches ____wrong_branch_error(). The result is wrapped
 * in likely(). @x is parenthesized inside typeof() as well, so compound
 * argument expressions are dereferenced as a whole.
 */
#define static_branch_likely(x)                                                        \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_true))        \
                branch = !arch_static_branch(&(x)->key, true);                        \
        else if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_false)) \
                branch = !arch_static_branch_jump(&(x)->key, true);                \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        likely(branch);                                                                \
})

/*
 * static_branch_unlikely(x) - test a typed static key, hinting "unlikely".
 *
 * The arch primitive is chosen at compile time from the pointee type of @x:
 *   struct static_key_true  -> arch_static_branch_jump(&(x)->key, false)
 *   struct static_key_false -> arch_static_branch(&(x)->key, false)
 * Any other type reaches ____wrong_branch_error(). The result is wrapped
 * in unlikely(). @x is parenthesized inside typeof() as well, so compound
 * argument expressions are dereferenced as a whole.
 */
#define static_branch_unlikely(x)                                                \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_true))        \
                branch = arch_static_branch_jump(&(x)->key, false);                \
        else if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_false)) \
                branch = arch_static_branch(&(x)->key, false);                        \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        unlikely(branch);                                                        \
})

#else /* !CONFIG_JUMP_LABEL */

/*
 * Fallback versions: an ordinary run-time test of the embedded key via
 * static_key_enabled(), wrapped in the matching likely()/unlikely() hint.
 */
#define static_branch_likely(x)                likely(static_key_enabled(&(x)->key))
#define static_branch_unlikely(x)        unlikely(static_key_enabled(&(x)->key))

#endif /* CONFIG_JUMP_LABEL */

/*
 * Advanced usage; refcount, branch is enabled when: count != 0
 *
 * Each macro forwards to the static_key_slow_{inc,dec}[_cpuslocked]()
 * function of the same flavour, passing the key embedded in @x.
 */

#define static_branch_inc(x)                static_key_slow_inc(&(x)->key)
#define static_branch_dec(x)                static_key_slow_dec(&(x)->key)
#define static_branch_inc_cpuslocked(x)        static_key_slow_inc_cpuslocked(&(x)->key)
#define static_branch_dec_cpuslocked(x)        static_key_slow_dec_cpuslocked(&(x)->key)

/*
 * Normal usage; boolean enable/disable.
 *
 * Each macro forwards to the static_key_{enable,disable}[_cpuslocked]()
 * function of the same flavour, passing the key embedded in @x.
 */

#define static_branch_enable(x)                        static_key_enable(&(x)->key)
#define static_branch_disable(x)                static_key_disable(&(x)->key)
#define static_branch_enable_cpuslocked(x)        static_key_enable_cpuslocked(&(x)->key)
#define static_branch_disable_cpuslocked(x)        static_key_disable_cpuslocked(&(x)->key)

#endif /* __ASSEMBLY__ */

#endif        /* _LINUX_JUMP_LABEL_H */
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __NET_CFG80211_H
#define __NET_CFG80211_H
/*
 * 802.11 device and configuration interface
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014 Intel Mobile Communications GmbH
 * Copyright 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2020 Intel Corporation
 */

#include <linux/netdevice.h>
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/nl80211.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
#include <linux/net.h>
#include <net/regulatory.h>

/**
 * DOC: Introduction
 *
 * cfg80211 is the configuration API for 802.11 devices in Linux. It bridges
 * userspace and drivers, and offers some utility functionality associated
 * with 802.11. cfg80211 must, directly or indirectly via mac80211, be used
 * by all modern wireless drivers in Linux, so that they offer a consistent
 * API through nl80211. For backward compatibility, cfg80211 also offers
 * wireless extensions to userspace, but hides them from drivers completely.
 *
 * Additionally, cfg80211 contains code to help enforce regulatory spectrum
 * use restrictions.
 */


/**
 * DOC: Device registration
 *
 * In order for a driver to use cfg80211, it must register the hardware device
 * with cfg80211. This happens through a number of hardware capability structs
 * described below.
 *
 * The fundamental structure for each device is the 'wiphy', of which each
 * instance describes a physical wireless device connected to the system. Each
 * such wiphy can have zero, one, or many virtual interfaces associated with
 * it, which need to be identified as such by pointing the network interface's
 * @ieee80211_ptr pointer to a &struct wireless_dev which further describes
 * the wireless part of the interface. Normally this struct is embedded in the
 * network interface's private data area. Drivers can optionally allow creating
 * or destroying virtual interfaces on the fly, but without at least one
 * interface, or the ability to create some, the wireless device isn't useful.
 *
 * Each wiphy structure contains device capability information, and also has
 * a pointer to the various operations the driver offers. The definitions and
 * structures here describe these capabilities in detail.
 */

struct wiphy;

/*
 * wireless hardware capability structures
 */

/**
 * enum ieee80211_channel_flags - channel flags
 *
 * Channel flags set by the regulatory control code. Every flag occupies a
 * single bit, so several of them can be combined in one bitmask such as the
 * @flags member of &struct ieee80211_channel.
 *
 * @IEEE80211_CHAN_DISABLED: This channel is disabled.
 * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes
 *        sending probe requests or beaconing.
 * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel.
 * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_HT40MINUS: extension channel below this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_OFDM: OFDM is not allowed on this channel.
 * @IEEE80211_CHAN_NO_80MHZ: If the driver supports 80 MHz on the band,
 *        this flag indicates that an 80 MHz channel cannot use this
 *        channel as the control or any of the secondary channels.
 *        This may be due to the driver or due to regulatory bandwidth
 *        restrictions.
 * @IEEE80211_CHAN_NO_160MHZ: If the driver supports 160 MHz on the band,
 *        this flag indicates that a 160 MHz channel cannot use this
 *        channel as the control or any of the secondary channels.
 *        This may be due to the driver or due to regulatory bandwidth
 *        restrictions.
 * @IEEE80211_CHAN_INDOOR_ONLY: see %NL80211_FREQUENCY_ATTR_INDOOR_ONLY
 * @IEEE80211_CHAN_IR_CONCURRENT: see %NL80211_FREQUENCY_ATTR_IR_CONCURRENT
 * @IEEE80211_CHAN_NO_20MHZ: 20 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel.
 * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
 *        on this channel.
 *
 * Note the differing polarity: the %IEEE80211_CHAN_NO_* flags forbid
 * something, while %IEEE80211_CHAN_1MHZ through %IEEE80211_CHAN_16MHZ state
 * that a bandwidth is permitted.
 */
enum ieee80211_channel_flags {
        IEEE80211_CHAN_DISABLED                = 1<<0,
        IEEE80211_CHAN_NO_IR                = 1<<1,
        /* hole at 1<<2 (no flag in this enum uses that bit) */
        IEEE80211_CHAN_RADAR                = 1<<3,
        IEEE80211_CHAN_NO_HT40PLUS        = 1<<4,
        IEEE80211_CHAN_NO_HT40MINUS        = 1<<5,
        IEEE80211_CHAN_NO_OFDM                = 1<<6,
        IEEE80211_CHAN_NO_80MHZ                = 1<<7,
        IEEE80211_CHAN_NO_160MHZ        = 1<<8,
        IEEE80211_CHAN_INDOOR_ONLY        = 1<<9,
        IEEE80211_CHAN_IR_CONCURRENT        = 1<<10,
        IEEE80211_CHAN_NO_20MHZ                = 1<<11,
        IEEE80211_CHAN_NO_10MHZ                = 1<<12,
        IEEE80211_CHAN_NO_HE                = 1<<13,
        IEEE80211_CHAN_1MHZ                = 1<<14,
        IEEE80211_CHAN_2MHZ                = 1<<15,
        IEEE80211_CHAN_4MHZ                = 1<<16,
        IEEE80211_CHAN_8MHZ                = 1<<17,
        IEEE80211_CHAN_16MHZ                = 1<<18,
};

/*
 * Both HT40 restrictions combined: neither the extension channel above nor
 * the one below this channel is permitted.
 */
#define IEEE80211_CHAN_NO_HT40 \
        (IEEE80211_CHAN_NO_HT40PLUS | IEEE80211_CHAN_NO_HT40MINUS)

/* Minimum DFS CAC time: 60 seconds, expressed in milliseconds */
#define IEEE80211_DFS_MIN_CAC_TIME_MS                (60 * 1000)
/* Minimum DFS NOP time: 30 minutes, expressed in milliseconds */
#define IEEE80211_DFS_MIN_NOP_TIME_MS                (30 * 60 * 1000)

/**
 * struct ieee80211_channel - channel definition
 *
 * This structure describes a single channel for use
 * with cfg80211.
 *
 * @center_freq: center frequency in MHz
 * @freq_offset: offset from @center_freq, in kHz
 * @hw_value: hardware-specific value for the channel
 * @flags: channel flags from &enum ieee80211_channel_flags.
 * @orig_flags: channel flags at registration time, used by regulatory
 *        code to support devices with additional restrictions
 * @band: band this channel belongs to.
 * @max_antenna_gain: maximum antenna gain in dBi
 * @max_power: maximum transmission power (in dBm)
 * @max_reg_power: maximum regulatory transmission power (in dBm)
 * @beacon_found: helper to regulatory code to indicate when a beacon
 *        has been found on this channel. Use regulatory_hint_found_beacon()
 *        to enable this. This is useful only on the 5 GHz band.
 * @orig_mag: internal use
 * @orig_mpwr: internal use
 * @dfs_state: current state of this channel. Only relevant if radar is required
 *        on this channel.
 * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered.
 * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels.
 */
struct ieee80211_channel {
        enum nl80211_band band;
        u32 center_freq;
        u16 freq_offset;
        u16 hw_value;
        u32 flags;
        int max_antenna_gain;
        int max_power;
        int max_reg_power;
        bool beacon_found;
        u32 orig_flags;
        int orig_mag, orig_mpwr;
        enum nl80211_dfs_state dfs_state;
        unsigned long dfs_state_entered;
        unsigned int dfs_cac_ms;
};

/**
 * enum ieee80211_rate_flags - rate flags
 *
 * Hardware/specification flags for rates. These are structured
 * in a way that allows using the same bitrate structure for
 * different bands/PHY modes. Each flag is a single bit, so flags
 * can be OR-ed together.
 *
 * @IEEE80211_RATE_SHORT_PREAMBLE: Hardware can send with short
 *        preamble on this bitrate; only relevant in 2.4GHz band and
 *        with CCK rates.
 * @IEEE80211_RATE_MANDATORY_A: This bitrate is a mandatory rate
 *        when used with 802.11a (on the 5 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_B: This bitrate is a mandatory rate
 *        when used with 802.11b (on the 2.4 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_G: This bitrate is a mandatory rate
 *        when used with 802.11g (on the 2.4 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_ERP_G: This is an ERP rate in 802.11g mode.
 * @IEEE80211_RATE_SUPPORTS_5MHZ: Rate can be used in 5 MHz mode
 * @IEEE80211_RATE_SUPPORTS_10MHZ: Rate can be used in 10 MHz mode
 */
enum ieee80211_rate_flags {
        IEEE80211_RATE_SHORT_PREAMBLE        = 1<<0,
        IEEE80211_RATE_MANDATORY_A        = 1<<1,
        IEEE80211_RATE_MANDATORY_B        = 1<<2,
        IEEE80211_RATE_MANDATORY_G        = 1<<3,
        IEEE80211_RATE_ERP_G                = 1<<4,
        IEEE80211_RATE_SUPPORTS_5MHZ        = 1<<5,
        IEEE80211_RATE_SUPPORTS_10MHZ        = 1<<6,
};

/**
 * enum ieee80211_bss_type - BSS type filter
 *
 * Value used to filter BSSes by their type; %IEEE80211_BSS_TYPE_ANY
 * matches every type.
 *
 * @IEEE80211_BSS_TYPE_ESS: Infrastructure BSS
 * @IEEE80211_BSS_TYPE_PBSS: Personal BSS
 * @IEEE80211_BSS_TYPE_IBSS: Independent BSS
 * @IEEE80211_BSS_TYPE_MBSS: Mesh BSS
 * @IEEE80211_BSS_TYPE_ANY: Wildcard value for matching any BSS type
 */
enum ieee80211_bss_type {
        IEEE80211_BSS_TYPE_ESS,
        IEEE80211_BSS_TYPE_PBSS,
        IEEE80211_BSS_TYPE_IBSS,
        IEEE80211_BSS_TYPE_MBSS,
        IEEE80211_BSS_TYPE_ANY
};

/**
 * enum ieee80211_privacy - BSS privacy filter
 *
 * Value used to filter BSSes by their privacy bit; %IEEE80211_PRIVACY_ANY
 * matches both settings.
 *
 * @IEEE80211_PRIVACY_ON: privacy bit set
 * @IEEE80211_PRIVACY_OFF: privacy bit clear
 * @IEEE80211_PRIVACY_ANY: Wildcard value for matching any privacy setting
 */
enum ieee80211_privacy {
        IEEE80211_PRIVACY_ON,
        IEEE80211_PRIVACY_OFF,
        IEEE80211_PRIVACY_ANY
};

/*
 * Map a truth value onto a privacy filter: a non-zero @x yields
 * IEEE80211_PRIVACY_ON, zero yields IEEE80211_PRIVACY_OFF.
 */
#define IEEE80211_PRIVACY(x)        \
        ((x) ? IEEE80211_PRIVACY_ON : IEEE80211_PRIVACY_OFF)

/**
 * struct ieee80211_rate - bitrate definition
 *
 * This structure describes a bitrate that an 802.11 PHY can
 * operate with. The two values @hw_value and @hw_value_short
 * are only for driver use when pointers to this structure are
 * passed around.
 *
 * @flags: rate-specific flags
 * @bitrate: bitrate in units of 100 kbps (so 10 means 1 Mbps)
 * @hw_value: driver/hardware value for this rate
 * @hw_value_short: driver/hardware value for this rate when
 *        short preamble is used
 */
struct ieee80211_rate {
        u32 flags;
        u16 bitrate;
        u16 hw_value, hw_value_short;
};

/**
 * struct ieee80211_he_obss_pd - AP settings for spatial reuse
 *
 * @enable: is the feature enabled.
 * @sr_ctrl: The SR Control field of SRP element.
 * @non_srg_max_offset: non-SRG maximum tx power offset
 * @min_offset: minimal tx power offset an associated station shall use
 * @max_offset: maximum tx power offset an associated station shall use
 * @bss_color_bitmap: bitmap (8 bytes) that indicates the BSS color values
 *        used by members of the SRG
 * @partial_bssid_bitmap: bitmap (8 bytes) that indicates the partial BSSID
 *        values used by members of the SRG
 */
struct ieee80211_he_obss_pd {
        bool enable;
        u8 sr_ctrl;
        u8 non_srg_max_offset;
        u8 min_offset;
        u8 max_offset;
        u8 bss_color_bitmap[8];
        u8 partial_bssid_bitmap[8];
};

/**
 * struct cfg80211_he_bss_color - AP settings for BSS coloring
 *
 * Note that this structure carries a positive @enabled flag, unlike
 * &struct ieee80211_he_bss_color which carries a @disabled flag.
 *
 * @color: the current color.
 * @enabled: HE BSS color is used
 * @partial: define the AID equation.
 */
struct cfg80211_he_bss_color {
        u8 color;
        bool enabled;
        bool partial;
};

/**
 * struct ieee80211_he_bss_color - AP settings for BSS coloring
 *
 * Note that this structure carries a negative @disabled flag, unlike
 * &struct cfg80211_he_bss_color which carries an @enabled flag.
 *
 * @color: the current color.
 * @disabled: is the feature disabled.
 * @partial: define the AID equation.
 */
struct ieee80211_he_bss_color {
        u8 color;
        bool disabled;
        bool partial;
};

/**
 * struct ieee80211_sta_ht_cap - STA's HT capabilities
 *
 * This structure describes most essential parameters needed
 * to describe 802.11n HT capabilities for an STA.
 *
 * @ht_supported: is HT supported by the STA
 * @cap: HT capabilities map as described in 802.11n spec; built from the
 *        IEEE80211_HT_CAP_* values
 * @ampdu_factor: Maximum A-MPDU length factor
 * @ampdu_density: Minimum A-MPDU spacing
 * @mcs: Supported MCS rates
 */
struct ieee80211_sta_ht_cap {
        u16 cap; /* use IEEE80211_HT_CAP_ */
        bool ht_supported;
        u8 ampdu_factor;
        u8 ampdu_density;
        struct ieee80211_mcs_info mcs;
};

/**
 * struct ieee80211_sta_vht_cap - STA's VHT capabilities
 *
 * This structure describes most essential parameters needed
 * to describe 802.11ac VHT capabilities for an STA.
 *
 * @vht_supported: is VHT supported by the STA
 * @cap: VHT capabilities map as described in 802.11ac spec; built from the
 *        IEEE80211_VHT_CAP_* values
 * @vht_mcs: Supported VHT MCS rates
 */
struct ieee80211_sta_vht_cap {
        bool vht_supported;
        u32 cap; /* use IEEE80211_VHT_CAP_ */
        struct ieee80211_vht_mcs_info vht_mcs;
};

/* Size in bytes of the ppe_thres buffer in struct ieee80211_sta_he_cap */
#define IEEE80211_HE_PPE_THRES_MAX_LEN                25

/**
 * struct ieee80211_sta_he_cap - STA's HE capabilities
 *
 * This structure describes most essential parameters needed
 * to describe 802.11ax HE capabilities for a STA.
 *
 * @has_he: true iff HE data is valid.
 * @he_cap_elem: Fixed portion of the HE capabilities element.
 * @he_mcs_nss_supp: The supported NSS/MCS combinations.
 * @ppe_thres: Holds the PPE Thresholds data, in a buffer of
 *        %IEEE80211_HE_PPE_THRES_MAX_LEN bytes.
 */
struct ieee80211_sta_he_cap {
        bool has_he;
        struct ieee80211_he_cap_elem he_cap_elem;
        struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp;
        u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
};

/**
 * struct ieee80211_sband_iftype_data - sband data per interface type
 *
 * This structure encapsulates sband data that is relevant for the
 * interface types defined in @types_mask.  Each type in the
 * @types_mask must be unique across all instances of iftype_data.
 *
 * @types_mask: interface types mask; bit N set means the entry applies to
 *        interface type N (see ieee80211_get_sband_iftype_data())
 * @he_cap: holds the HE capabilities
 * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
 *        6 GHz band channel (and 0 may be valid value).
 */
struct ieee80211_sband_iftype_data {
        u16 types_mask;
        struct ieee80211_sta_he_cap he_cap;
        struct ieee80211_he_6ghz_capa he_6ghz_capa;
};

/**
 * enum ieee80211_edmg_bw_config - allowed channel bandwidth configurations
 *
 * The numeric value of each enumerator equals the number in its name
 * (4 through 15).
 *
 * @IEEE80211_EDMG_BW_CONFIG_4: 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_5: 2.16GHz and 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_6: 2.16GHz, 4.32GHz and 6.48GHz
 * @IEEE80211_EDMG_BW_CONFIG_7: 2.16GHz, 4.32GHz, 6.48GHz and 8.64GHz
 * @IEEE80211_EDMG_BW_CONFIG_8: 2.16GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_9: 2.16GHz, 4.32GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_10: 2.16GHz, 4.32GHz, 6.48GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_11: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz and
 *        2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_12: 2.16GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_13: 2.16GHz, 4.32GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_14: 2.16GHz, 4.32GHz, 6.48GHz, 2.16GHz + 2.16GHz
 *        and 4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_15: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz,
 *        2.16GHz + 2.16GHz and 4.32GHz + 4.32GHz
 */
enum ieee80211_edmg_bw_config {
        IEEE80211_EDMG_BW_CONFIG_4        = 4,
        IEEE80211_EDMG_BW_CONFIG_5        = 5,
        IEEE80211_EDMG_BW_CONFIG_6        = 6,
        IEEE80211_EDMG_BW_CONFIG_7        = 7,
        IEEE80211_EDMG_BW_CONFIG_8        = 8,
        IEEE80211_EDMG_BW_CONFIG_9        = 9,
        IEEE80211_EDMG_BW_CONFIG_10        = 10,
        IEEE80211_EDMG_BW_CONFIG_11        = 11,
        IEEE80211_EDMG_BW_CONFIG_12        = 12,
        IEEE80211_EDMG_BW_CONFIG_13        = 13,
        IEEE80211_EDMG_BW_CONFIG_14        = 14,
        IEEE80211_EDMG_BW_CONFIG_15        = 15,
};

/**
 * struct ieee80211_edmg - EDMG configuration
 *
 * This structure describes most essential parameters needed
 * to describe 802.11ay EDMG configuration
 *
 * @channels: bitmap that indicates the 2.16 GHz channel(s)
 *        that are allowed to be used for transmissions.
 *        Bit 0 indicates channel 1, bit 1 indicates channel 2, etc.
 *        Set to 0 to indicate that EDMG is not supported.
 * @bw_config: Channel BW Configuration subfield encodes
 *        the allowed channel bandwidth configurations
 */
struct ieee80211_edmg {
        u8 channels;
        enum ieee80211_edmg_bw_config bw_config;
};

/**
 * struct ieee80211_sta_s1g_cap - STA's S1G capabilities
 *
 * This structure describes most essential parameters needed
 * to describe 802.11ah S1G capabilities for a STA.
 *
 * @s1g: is STA an S1G STA
 * @cap: S1G capabilities information (10 bytes); built from the
 *        S1G_CAPAB_* values
 * @nss_mcs: Supported NSS MCS set (5 bytes)
 */
struct ieee80211_sta_s1g_cap {
        bool s1g;
        u8 cap[10]; /* use S1G_CAPAB_ */
        u8 nss_mcs[5];
};

/**
 * struct ieee80211_supported_band - frequency band definition
 *
 * This structure describes a frequency band a wiphy
 * is able to operate in.
 *
 * @channels: Array of channels the hardware can operate with
 *        in this band.
 * @band: the band this structure represents
 * @n_channels: Number of channels in @channels
 * @bitrates: Array of bitrates the hardware can operate with
 *        in this band. Must be sorted to give a valid "supported
 *        rates" IE, i.e. CCK rates first, then OFDM.
 * @n_bitrates: Number of bitrates in @bitrates
 * @ht_cap: HT capabilities in this band
 * @vht_cap: VHT capabilities in this band
 * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
 * @edmg_cap: EDMG capabilities in this band
 * @n_iftype_data: number of iftype data entries
 * @iftype_data: interface type data entries.  Note that the bits in
 *        @types_mask inside this structure cannot overlap (i.e. only
 *        one occurrence of each type is allowed across all instances of
 *        iftype_data).
 */
struct ieee80211_supported_band {
        struct ieee80211_channel *channels;
        struct ieee80211_rate *bitrates;
        enum nl80211_band band;
        int n_channels;
        int n_bitrates;
        struct ieee80211_sta_ht_cap ht_cap;
        struct ieee80211_sta_vht_cap vht_cap;
        struct ieee80211_sta_s1g_cap s1g_cap;
        struct ieee80211_edmg edmg_cap;
        u16 n_iftype_data;
        const struct ieee80211_sband_iftype_data *iftype_data;
};

/**
 * ieee80211_get_sband_iftype_data - return sband data for a given iftype
 * @sband: the sband whose iftype data entries are searched
 * @iftype: enum nl80211_iftype
 *
 * %NL80211_IFTYPE_AP_VLAN is looked up as %NL80211_IFTYPE_AP. The first
 * entry whose @types_mask has the bit for @iftype set is returned.
 *
 * Return: pointer to struct ieee80211_sband_iftype_data, or NULL if none is
 * found or if @iftype is out of range (in which case a warning is emitted)
 */
static inline const struct ieee80211_sband_iftype_data *
ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
                                u8 iftype)
{
        const struct ieee80211_sband_iftype_data *entry;
        int idx;

        if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
                return NULL;

        /* AP_VLAN has no entry of its own; use the AP one instead */
        if (iftype == NL80211_IFTYPE_AP_VLAN)
                iftype = NL80211_IFTYPE_AP;

        for (idx = 0; idx < sband->n_iftype_data; idx++) {
                entry = &sband->iftype_data[idx];
                if (entry->types_mask & BIT(iftype))
                        return entry;
        }

        return NULL;
}

/**
 * ieee80211_get_he_iftype_cap - return HE capabilities for an sband's iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the struct ieee80211_sta_he_cap, or NULL if there is no
 * iftype data for @iftype or its HE data is not marked valid
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_iftype_cap(const struct ieee80211_supported_band *sband,
                            u8 iftype)
{
        const struct ieee80211_sband_iftype_data *entry;

        entry = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!entry || !entry->he_cap.has_he)
                return NULL;

        return &entry->he_cap;
}

/**
 * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA
 * @sband: the sband to search for the STA on
 *
 * Convenience wrapper around ieee80211_get_he_iftype_cap() for
 * %NL80211_IFTYPE_STATION.
 *
 * Return: pointer to the struct ieee80211_sta_he_cap, or NULL if none found
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
{
        return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION);
}

/**
 * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities
 * @sband: the sband to search for the iftype on
 * @iftype: the iftype to search for
 *
 * Return: the 6GHz capabilities, or 0 (after emitting a warning) if there is
 * no iftype data for @iftype or its HE data is not marked valid
 */
static inline __le16
ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
                           enum nl80211_iftype iftype)
{
        const struct ieee80211_sband_iftype_data *entry;

        entry = ieee80211_get_sband_iftype_data(sband, iftype);
        if (WARN_ON(!(entry && entry->he_cap.has_he)))
                return 0;

        return entry->he_6ghz_capa.capa;
}

/**
 * wiphy_read_of_freq_limits - read frequency limits from device tree
 *
 * @wiphy: the wireless device to get extra limits for
 *
 * Some devices may have extra limitations specified in DT. This may be useful
 * for chipsets that normally support more bands but are limited due to board
 * design (e.g. by antennas or external power amplifier).
 *
 * This function reads info from DT and uses it to *modify* channels (disable
 * unavailable ones). It's usually a *bad* idea to use it in drivers with
 * shared channel data as DT limitations are device specific. You should make
 * sure to call it only if channels in wiphy are copied and can be modified
 * without affecting other devices.
 *
 * As this function accesses the device node, it has to be called after
 * set_wiphy_dev. It also modifies channels, so they have to be set first.
 * If using this helper, call it before wiphy_register().
 *
 * When CONFIG_OF is not set, this is an empty inline stub.
 */
#ifdef CONFIG_OF
void wiphy_read_of_freq_limits(struct wiphy *wiphy);
#else /* CONFIG_OF */
static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy)
{
}
#endif /* !CONFIG_OF */


/*
 * Wireless hardware/device configuration structures and methods
 */

/**
 * DOC: Actions and configuration
 *
 * Each wireless device and each virtual interface offer a set of configuration
 * operations and other actions that are invoked by userspace. Each of these
 * actions is described in the operations structure, and the parameters these
 * operations use are described separately.
 *
 * Additionally, some operations are asynchronous and expect to get status
 * information via some functions that drivers need to call.
 *
 * Scanning and BSS list handling with its associated functionality is described
 * in a separate chapter.
 */

/* Combined length of the membership and user position data */
#define VHT_MUMIMO_GROUPS_DATA_LEN \
        (WLAN_MEMBERSHIP_LEN + WLAN_USER_POSITION_LEN)

/**
 * struct vif_params - describes virtual interface parameters
 * @flags: monitor interface flags, unchanged if 0, otherwise
 *        %MONITOR_FLAG_CHANGED will be set
 * @use_4addr: use 4-address frames
 * @macaddr: address to use for this virtual interface.
 *        If this parameter is set to zero address the driver may
 *        determine the address as needed.
 *        This feature is only fully supported by drivers that enable the
 *        %NL80211_FEATURE_MAC_ON_CREATE flag.  Others may support creating
 *        only p2p devices with specified MAC.
 * @vht_mumimo_groups: MU-MIMO groupID, used for monitoring MU-MIMO packets
 *        belonging to that MU-MIMO groupID; %NULL if not changed
 * @vht_mumimo_follow_addr: MU-MIMO follow address, used for monitoring
 *        MU-MIMO packets going to the specified station; %NULL if not changed
 */
struct vif_params {
        u32 flags;
        int use_4addr;
        u8 macaddr[ETH_ALEN];
        const u8 *vht_mumimo_groups;
        const u8 *vht_mumimo_follow_addr;
};

/**
 * struct key_params - key information
 *
 * Information about a key
 *
 * @key: key material
 * @key_len: length of key material
 * @cipher: cipher suite selector
 * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used
 *        with the get_key() callback, must be in little endian,
 *        length given by @seq_len.
 * @seq_len: length of @seq.
 * @vlan_id: vlan_id for VLAN group key (if nonzero)
 * @mode: key install mode (RX_TX, NO_TX or SET_TX), a value of
 *        &enum nl80211_key_mode
 */
struct key_params {
        const u8 *key;
        const u8 *seq;
        int key_len;
        int seq_len;
        u16 vlan_id;
        u32 cipher;
        enum nl80211_key_mode mode;
};

/**
 * struct cfg80211_chan_def - channel definition
 * @chan: the (control) channel
 * @width: channel width
 * @center_freq1: center frequency of first segment
 * @center_freq2: center frequency of second segment
 *        (only with 80+80 MHz)
 * @edmg: define the EDMG channels configuration.
 *        If edmg is requested (i.e. the .channels member is non-zero),
 *        chan will define the primary channel and all other
 *        parameters are ignored.
 * @freq1_offset: offset from @center_freq1, in kHz
 */
struct cfg80211_chan_def {
        struct ieee80211_channel *chan;
        enum nl80211_chan_width width;
        u32 center_freq1;
        u32 center_freq2;
        struct ieee80211_edmg edmg;
        u16 freq1_offset;
};

/*
 * cfg80211_bitrate_mask - masks for bitrate control
 *
 * The control array holds one set of masks per band (it has
 * NUM_NL80211_BANDS entries). Each entry carries a legacy rate mask, MCS
 * masks for HT, VHT and HE, plus the guard interval and HE LTF settings.
 */
struct cfg80211_bitrate_mask {
        struct {
                u32 legacy;
                u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN];
                u16 vht_mcs[NL80211_VHT_NSS_MAX];
                u16 he_mcs[NL80211_HE_NSS_MAX];
                enum nl80211_txrate_gi gi;
                enum nl80211_he_gi he_gi;
                enum nl80211_he_ltf he_ltf;
        } control[NUM_NL80211_BANDS];
};


/**
 * struct cfg80211_tid_cfg - TID specific configuration
 * @config_override: Flag to notify driver to reset TID configuration
 *        of the peer.
 * @tids: bitmap of TIDs to modify
 * @mask: bitmap of attributes indicating which parameter changed,
 *        similar to &nl80211_tid_config_supp.
 * @noack: noack configuration value for the TID
 * @retry_long: long retry count value
 * @retry_short: short retry count value
 * @ampdu: Enable/Disable MPDU aggregation
 * @rtscts: Enable/Disable RTS/CTS
 * @amsdu: Enable/Disable MSDU aggregation
 * @txrate_type: Tx bitrate mask type
 * @txrate_mask: Tx bitrate to be applied for the TID
 */
struct cfg80211_tid_cfg {
        bool config_override;
        u8 tids;
        u64 mask;
        enum nl80211_tid_config noack;
        u8 retry_long, retry_short;
        enum nl80211_tid_config ampdu;
        enum nl80211_tid_config rtscts;
        enum nl80211_tid_config amsdu;
        enum nl80211_tx_rate_setting txrate_type;
        struct cfg80211_bitrate_mask txrate_mask;
};

/**
 * struct cfg80211_tid_config - TID configuration
 * @peer: Station's MAC address
 * @n_tid_conf: Number of TID specific configurations to be applied
 * @tid_conf: Configuration change info; flexible array member, so the
 *        entries are stored directly after this structure
 */
struct cfg80211_tid_config {
        const u8 *peer;
        u32 n_tid_conf;
        struct cfg80211_tid_cfg tid_conf[];
};

/**
 * cfg80211_get_chandef_type - return old channel type from chandef
 * @chandef: the channel definition
 *
 * Return: The old channel type (NOHT, HT20, HT40+/-) from a given
 * chandef, which must have a bandwidth allowing this conversion.
 */
static inline enum nl80211_channel_type
cfg80211_get_chandef_type(const struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
                return NL80211_CHAN_NO_HT;
        case NL80211_CHAN_WIDTH_20:
                return NL80211_CHAN_HT20;
        case NL80211_CHAN_WIDTH_40:
                if (chandef->center_freq1 > chandef->chan->center_freq)
                        return NL80211_CHAN_HT40PLUS;
                return NL80211_CHAN_HT40MINUS;
        default:
                WARN_ON(1);
                return NL80211_CHAN_NO_HT;
        }
}

/**
 * cfg80211_chandef_create - create channel definition using channel type
 * @chandef: the channel definition struct to fill
 * @channel: the control channel
 * @chantype: the channel type
 *
 * Given a channel type, create a channel definition. The result is written
 * to @chandef; nothing is returned.
 */
void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
                             struct ieee80211_channel *channel,
                             enum nl80211_channel_type chantype);

/**
 * cfg80211_chandef_identical - check if two channel definitions are identical
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Return: %true if the channels defined by the channel definitions are
 * identical, %false otherwise.
 */
static inline bool
cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
                           const struct cfg80211_chan_def *chandef2)
{
        return (chandef1->chan == chandef2->chan &&
                chandef1->width == chandef2->width &&
                chandef1->center_freq1 == chandef2->center_freq1 &&
                chandef1->freq1_offset == chandef2->freq1_offset &&
                chandef1->center_freq2 == chandef2->center_freq2);
}

/**
 * cfg80211_chandef_is_edmg - check if chandef represents an EDMG channel
 *
 * @chandef: the channel definition
 *
 * Return: %true if EDMG defined, %false otherwise.
 */
static inline bool
cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef)
{
        return chandef->edmg.channels || chandef->edmg.bw_config;
}

/**
 * cfg80211_chandef_compatible - check if two channel definitions are compatible
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Return: %NULL if the given channel definitions are incompatible,
 * otherwise one of the two arguments (@chandef1 or @chandef2).
 */
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
                            const struct cfg80211_chan_def *chandef2);

/**
 * cfg80211_chandef_valid - check if a channel definition is valid
 * @chandef: the channel definition to check
 *
 * Return: %true if the channel definition is valid, %false otherwise.
 */
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_usable - check if secondary channels can be used
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @prohibited_flags: the regulatory channel flags that must not be set
 *
 * Return: %true if secondary channels are usable, %false otherwise.
 */
bool cfg80211_chandef_usable(struct wiphy *wiphy,
                             const struct cfg80211_chan_def *chandef,
                             u32 prohibited_flags);

/**
 * cfg80211_chandef_dfs_required - checks if radar detection is required
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @iftype: the interface type as specified in &enum nl80211_iftype
 *
 * Return:
 *        1 if radar detection is required, 0 if it is not, < 0 on error
 */
int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
                                  const struct cfg80211_chan_def *chandef,
                                  enum nl80211_iftype iftype);

/**
 * ieee80211_chandef_rate_flags - returns rate flags for a channel
 *
 * In some channel types, not all rates may be used - for example CCK
 * rates may not be used in 5/10 MHz channels.
 *
 * @chandef: channel definition for the channel
 *
 * Return: %IEEE80211_RATE_SUPPORTS_5MHZ for a 5 MHz wide channel,
 * %IEEE80211_RATE_SUPPORTS_10MHZ for a 10 MHz wide channel, and 0 for every
 * other width
 */
static inline enum ieee80211_rate_flags
ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
{
        if (chandef->width == NL80211_CHAN_WIDTH_5)
                return IEEE80211_RATE_SUPPORTS_5MHZ;

        if (chandef->width == NL80211_CHAN_WIDTH_10)
                return IEEE80211_RATE_SUPPORTS_10MHZ;

        return 0;
}

/**
 * ieee80211_chandef_max_power - maximum transmission power for the chandef
 *
 * In some regulations, the transmit power may depend on the configured channel
 * bandwidth which may be defined as dBm/MHz. This function returns the actual
 * max_power for non-standard (20 MHz) channels.
 *
 * @chandef: channel definition for the channel
 *
 * Returns: maximum allowed transmission power in dBm for the chandef
 */
static inline int
ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
                return min(chandef->chan->max_reg_power - 6,
                           chandef->chan->max_power);
        case NL80211_CHAN_WIDTH_10:
                return min(chandef->chan->max_reg_power - 3,
                           chandef->chan->max_power);
        default:
                break;
        }
        return chandef->chan->max_power;
}

/**
 * enum survey_info_flags - survey information flags
 *
 * @SURVEY_INFO_NOISE_DBM: noise (in dBm) was filled in
 * @SURVEY_INFO_IN_USE: channel is currently being used
 * @SURVEY_INFO_TIME: active time (in ms) was filled in
 * @SURVEY_INFO_TIME_BUSY: busy time was filled in
 * @SURVEY_INFO_TIME_EXT_BUSY: extension channel busy time was filled in
 * @SURVEY_INFO_TIME_RX: receive time was filled in
 * @SURVEY_INFO_TIME_TX: transmit time was filled in
 * @SURVEY_INFO_TIME_SCAN: scan time was filled in
 * @SURVEY_INFO_TIME_BSS_RX: local BSS receive time was filled in
 *
 * Used by the driver to indicate which info in &struct survey_info
 * it has filled in during the get_survey(). The flags are single bits
 * that are combined in the @filled member of &struct survey_info.
 */
enum survey_info_flags {
        SURVEY_INFO_NOISE_DBM                = BIT(0),
        SURVEY_INFO_IN_USE                = BIT(1),
        SURVEY_INFO_TIME                = BIT(2),
        SURVEY_INFO_TIME_BUSY                = BIT(3),
        SURVEY_INFO_TIME_EXT_BUSY        = BIT(4),
        SURVEY_INFO_TIME_RX                = BIT(5),
        SURVEY_INFO_TIME_TX                = BIT(6),
        SURVEY_INFO_TIME_SCAN                = BIT(7),
        SURVEY_INFO_TIME_BSS_RX                = BIT(8),
};

/**
 * struct survey_info - channel survey response
 *
 * @channel: the channel this survey record reports, may be %NULL for a single
 *        record to report global statistics
 * @filled: bitflag of flags from &enum survey_info_flags
 * @noise: channel noise in dBm. This and all following fields are
 *        optional
 * @time: amount of time in ms the radio was turned on (on the channel)
 * @time_busy: amount of time the primary channel was sensed busy
 * @time_ext_busy: amount of time the extension channel was sensed busy
 * @time_rx: amount of time the radio spent receiving data
 * @time_tx: amount of time the radio spent transmitting data
 * @time_scan: amount of time the radio spent for scanning
 * @time_bss_rx: amount of time the radio spent receiving data on a local BSS
 *
 * Used by dump_survey() to report back per-channel survey information.
 *
 * This structure can later be expanded with things like
 * channel duty cycle etc.
 */
struct survey_info {
        struct ieee80211_channel *channel;
        u64 time;
        u64 time_busy;
        u64 time_ext_busy;
        u64 time_rx;
        u64 time_tx;
        u64 time_scan;
        u64 time_bss_rx;
        u32 filled;
        s8 noise;
};

/* number of entries in the static WEP key array (wep_keys below) */
#define CFG80211_MAX_WEP_KEYS        4

/**
 * struct cfg80211_crypto_settings - Crypto settings
 * @wpa_versions: indicates which, if any, WPA versions are enabled
 *        (from enum nl80211_wpa_versions)
 * @cipher_group: group key cipher suite (or 0 if unset)
 * @n_ciphers_pairwise: number of AP supported unicast ciphers
 * @ciphers_pairwise: unicast key cipher suites
 * @n_akm_suites: number of AKM suites
 * @akm_suites: AKM suites
 * @control_port: Whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_ethertype: the control port protocol that should be
 *        allowed through even on unauthorized ports
 * @control_port_no_encrypt: TRUE to prevent encryption of control port
 *        protocol frames.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @control_port_no_preauth: disables pre-auth rx over the nl80211 control
 *        port for mac80211
 * @wep_keys: static WEP keys, if not %NULL points to an array of
 *        %CFG80211_MAX_WEP_KEYS WEP keys
 * @wep_tx_key: key index (0..3) of the default TX static WEP key
 * @psk: PSK (for devices supporting 4-way-handshake offload)
 * @sae_pwd: password for SAE authentication (for devices supporting SAE
 *        offload)
 * @sae_pwd_len: length of @sae_pwd (for devices supporting SAE offload)
 */
struct cfg80211_crypto_settings {
        u32 wpa_versions;
        u32 cipher_group;
        int n_ciphers_pairwise;
        u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES];
        int n_akm_suites;
        u32 akm_suites[NL80211_MAX_NR_AKM_SUITES];
        bool control_port;
        __be16 control_port_ethertype;
        bool control_port_no_encrypt;
        bool control_port_over_nl80211;
        bool control_port_no_preauth;
        struct key_params *wep_keys;
        int wep_tx_key;
        const u8 *psk;
        const u8 *sae_pwd;
        u8 sae_pwd_len;
};

/**
 * struct cfg80211_beacon_data - beacon data
 * @head: head portion of beacon (before TIM IE)
 *        or %NULL if not changed
 * @tail: tail portion of beacon (after TIM IE)
 *        or %NULL if not changed
 * @head_len: length of @head
 * @tail_len: length of @tail
 * @beacon_ies: extra information element(s) to add into Beacon frames or %NULL
 * @beacon_ies_len: length of @beacon_ies in octets
 * @proberesp_ies: extra information element(s) to add into Probe Response
 *        frames or %NULL
 * @proberesp_ies_len: length of @proberesp_ies in octets
 * @assocresp_ies: extra information element(s) to add into (Re)Association
 *        Response frames or %NULL
 * @assocresp_ies_len: length of @assocresp_ies in octets
 * @probe_resp_len: length of probe response template (@probe_resp)
 * @probe_resp: probe response template (AP mode only)
 * @ftm_responder: enable FTM responder functionality; -1 for no change
 *        (which also implies no change in LCI/civic location data)
 * @lci: Measurement Report element content, starting with Measurement Token
 *        (measurement type 8)
 * @civicloc: Measurement Report element content, starting with Measurement
 *        Token (measurement type 11)
 * @lci_len: LCI data length (length of @lci)
 * @civicloc_len: Civic location data length (length of @civicloc)
 */
struct cfg80211_beacon_data {
        /* data buffers */
        const u8 *head, *tail;
        const u8 *beacon_ies;
        const u8 *proberesp_ies;
        const u8 *assocresp_ies;
        const u8 *probe_resp;
        const u8 *lci;
        const u8 *civicloc;
        s8 ftm_responder;

        /* lengths of the correspondingly named buffers above */
        size_t head_len, tail_len;
        size_t beacon_ies_len;
        size_t proberesp_ies_len;
        size_t assocresp_ies_len;
        size_t probe_resp_len;
        size_t lci_len;
        size_t civicloc_len;
};

/**
 * struct mac_address - a single MAC address
 * @addr: the address, %ETH_ALEN octets long
 */
struct mac_address {
        u8 addr[ETH_ALEN];
};

/**
 * struct cfg80211_acl_data - Access control list data
 *
 * @acl_policy: ACL policy to be applied on the station's
 *        entry specified by mac_addr
 * @n_acl_entries: Number of MAC address entries passed
 * @mac_addrs: List of MAC addresses of stations to be used for ACL
 *        (flexible array member, so it has to remain the last member)
 */
struct cfg80211_acl_data {
        enum nl80211_acl_policy acl_policy;
        int n_acl_entries;

        /* Keep it last */
        struct mac_address mac_addrs[];
};

/**
 * struct cfg80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 * @tmpl_len: Template length (length of @tmpl)
 * @tmpl: Template data for FILS discovery frame including the action
 *        frame headers.
 */
struct cfg80211_fils_discovery {
        u32 min_interval;
        u32 max_interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe
 *        response parameters in 6GHz.
 *
 * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned
 *        in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive
 *        scanning
 * @tmpl_len: Template length (length of @tmpl)
 * @tmpl: Template data for probe response
 */
struct cfg80211_unsol_bcast_probe_resp {
        u32 interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * enum cfg80211_ap_settings_flags - AP settings flags
 *
 * Used in the @flags member of &struct cfg80211_ap_settings.
 *
 * @AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external authentication
 */
enum cfg80211_ap_settings_flags {
        AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = BIT(0),
};

/**
 * struct cfg80211_ap_settings - AP configuration
 *
 * Used to configure an AP interface.
 *
 * @chandef: defines the channel to use
 * @beacon: beacon data
 * @beacon_interval: beacon interval
 * @dtim_period: DTIM period
 * @ssid: SSID to be used in the BSS (note: may be %NULL if not provided from
 *        user space)
 * @ssid_len: length of @ssid
 * @hidden_ssid: whether to hide the SSID in Beacon/Probe Response frames
 * @crypto: crypto settings
 * @privacy: the BSS uses privacy
 * @auth_type: Authentication type (algorithm)
 * @smps_mode: SMPS mode
 * @inactivity_timeout: time in seconds to determine station's inactivity.
 * @p2p_ctwindow: P2P CT Window
 * @p2p_opp_ps: P2P opportunistic PS
 * @acl: ACL configuration used by the drivers which has support for
 *        MAC address based access control
 * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
 *        networks.
 * @beacon_rate: bitrate to be used for beacons
 * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
 * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
 * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
 * @ht_required: stations must support HT
 * @vht_required: stations must support VHT
 * @he_required: stations must support HE
 * @twt_responder: Enable Target Wake Time
 * @flags: flags, as defined in enum cfg80211_ap_settings_flags
 * @he_obss_pd: OBSS Packet Detection settings
 * @he_bss_color: BSS Color settings
 * @he_oper: HE operation IE (or %NULL if HE isn't enabled)
 * @fils_discovery: FILS discovery transmission parameters
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 */
struct cfg80211_ap_settings {
        struct cfg80211_chan_def chandef;

        struct cfg80211_beacon_data beacon;

        int beacon_interval, dtim_period;
        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_hidden_ssid hidden_ssid;
        struct cfg80211_crypto_settings crypto;
        bool privacy;
        enum nl80211_auth_type auth_type;
        enum nl80211_smps_mode smps_mode;
        int inactivity_timeout;
        u8 p2p_ctwindow;
        bool p2p_opp_ps;
        const struct cfg80211_acl_data *acl;
        bool pbss;
        struct cfg80211_bitrate_mask beacon_rate;

        const struct ieee80211_ht_cap *ht_cap;
        const struct ieee80211_vht_cap *vht_cap;
        const struct ieee80211_he_cap_elem *he_cap;
        const struct ieee80211_he_operation *he_oper;
        bool ht_required, vht_required, he_required;
        bool twt_responder;
        u32 flags;
        struct ieee80211_he_obss_pd he_obss_pd;
        struct cfg80211_he_bss_color he_bss_color;
        struct cfg80211_fils_discovery fils_discovery;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
};

/**
 * struct cfg80211_csa_settings - channel switch settings
 *
 * Used for channel switch
 *
 * @chandef: defines the channel to use after the switch
 * @beacon_csa: beacon data while performing the switch
 * @counter_offsets_beacon: offsets of the counters within the beacon (tail)
 * @counter_offsets_presp: offsets of the counters within the probe response
 * @n_counter_offsets_beacon: number of csa counters in the beacon (tail)
 * @n_counter_offsets_presp: number of csa counters in the probe response
 * @beacon_after: beacon data to be used on the new channel
 * @radar_required: whether radar detection is required on the new channel
 * @block_tx: whether transmissions should be blocked while changing
 * @count: number of beacons until switch
 */
struct cfg80211_csa_settings {
        struct cfg80211_chan_def chandef;
        struct cfg80211_beacon_data beacon_csa;
        const u16 *counter_offsets_beacon;
        const u16 *counter_offsets_presp;
        unsigned int n_counter_offsets_beacon;
        unsigned int n_counter_offsets_presp;
        struct cfg80211_beacon_data beacon_after;
        bool radar_required;
        bool block_tx;
        u8 count;
};

#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10

/**
 * struct iface_combination_params - input parameters for interface combinations
 *
 * Used to pass interface combination parameters
 *
 * @num_different_channels: the number of different channels we want
 *        to use for verification
 * @radar_detect: a bitmap where each bit corresponds to a channel
 *        width where radar detection is needed, as in the definition of
 *        &struct ieee80211_iface_combination.@radar_detect_widths
 * @iftype_num: array with the number of interfaces of each interface
 *        type.  The index is the interface type as specified in &enum
 *        nl80211_iftype, so the array has %NUM_NL80211_IFTYPES entries.
 * @new_beacon_int: set this to the beacon interval of a new interface
 *        that's not operating yet, if such is to be checked as part of
 *        the verification
 */
struct iface_combination_params {
        int num_different_channels;
        u8 radar_detect;
        int iftype_num[NUM_NL80211_IFTYPES];
        u32 new_beacon_int;
};

/**
 * enum station_parameters_apply_mask - station parameter values to apply
 * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
 * @STATION_PARAM_APPLY_CAPABILITY: apply new capability
 * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state
 * @STATION_PARAM_APPLY_STA_TXPOWER: apply new station tx power
 *
 * Not all station parameters have in-band "no change" signalling,
 * for those that don't these flags are used.
 */
enum station_parameters_apply_mask {
        STATION_PARAM_APPLY_UAPSD = BIT(0),
        STATION_PARAM_APPLY_CAPABILITY = BIT(1),
        STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
        STATION_PARAM_APPLY_STA_TXPOWER = BIT(3),
};

/**
 * struct sta_txpwr - station txpower configuration
 *
 * Used to configure txpower for station.
 *
 * @power: tx power (in dBm) to be used for sending data traffic. If tx power
 *        is not provided, the default per-interface tx power setting takes
 *        over. The driver should pick the lowest tx power, either the
 *        per-interface or the per-station one.
 * @type: TPC type. If it is %NL80211_TX_POWER_LIMITED then the tx power
 *        will be less than or equal to the one specified from userspace,
 *        whereas %NL80211_TX_POWER_AUTOMATIC indicates the default tx power.
 *        %NL80211_TX_POWER_FIXED is not a valid configuration option for
 *        per peer TPC.
 */
struct sta_txpwr {
        s16 power;
        enum nl80211_tx_power_setting type;
};

/**
 * struct station_parameters - station parameters
 *
 * Used to change an existing station or to create a new one.
 *
 * @vlan: vlan interface station should belong to
 * @supported_rates: supported rates in IEEE 802.11 format
 *        (or NULL for no change)
 * @supported_rates_len: number of supported rates
 * @sta_flags_mask: station flags that changed
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @sta_flags_set: station flags values
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @listen_interval: listen interval or -1 for no change
 * @aid: AID or zero for no change
 * @vlan_id: VLAN ID for station (if nonzero)
 * @peer_aid: mesh peer AID or zero for no change
 * @plink_action: plink action to take
 * @plink_state: set the peer link state for a station
 * @ht_capa: HT capabilities of station
 * @vht_capa: VHT capabilities of station
 * @uapsd_queues: bitmap of queues configured for uapsd. same format
 *        as the AC bitmap in the QoS info field
 * @max_sp: max Service Period. same format as the MAX_SP in the
 *        QoS info field (but already shifted down)
 * @sta_modify_mask: bitmap indicating which parameters changed
 *        (for those that don't have a natural "no change" value),
 *        see &enum station_parameters_apply_mask
 * @local_pm: local link-specific mesh power save mode (no change when set
 *        to unknown)
 * @capability: station capability
 * @ext_capab: extended capabilities of the station
 * @ext_capab_len: number of extended capabilities
 * @supported_channels: supported channels in IEEE 802.11 format
 * @supported_channels_len: number of supported channels
 * @supported_oper_classes: supported oper classes in IEEE 802.11 format
 * @supported_oper_classes_len: number of supported operating classes
 * @opmode_notif: operating mode field from Operating Mode Notification
 * @opmode_notif_used: information if operating mode field is used
 * @support_p2p_ps: information if station supports P2P PS mechanism
 * @he_capa: HE capabilities of station
 * @he_capa_len: the length of the HE capabilities
 * @airtime_weight: airtime scheduler weight for this station
 * @txpwr: transmit power for an associated station,
 *        see &struct sta_txpwr
 * @he_6ghz_capa: HE 6 GHz Band capabilities of station
 */
struct station_parameters {
        const u8 *supported_rates;
        struct net_device *vlan;
        u32 sta_flags_mask, sta_flags_set;
        u32 sta_modify_mask;
        int listen_interval;
        u16 aid;
        u16 vlan_id;
        u16 peer_aid;
        u8 supported_rates_len;
        u8 plink_action;
        u8 plink_state;
        const struct ieee80211_ht_cap *ht_capa;
        const struct ieee80211_vht_cap *vht_capa;
        u8 uapsd_queues;
        u8 max_sp;
        enum nl80211_mesh_power_mode local_pm;
        u16 capability;
        const u8 *ext_capab;
        u8 ext_capab_len;
        const u8 *supported_channels;
        u8 supported_channels_len;
        const u8 *supported_oper_classes;
        u8 supported_oper_classes_len;
        u8 opmode_notif;
        bool opmode_notif_used;
        int support_p2p_ps;
        const struct ieee80211_he_cap_elem *he_capa;
        u8 he_capa_len;
        u16 airtime_weight;
        struct sta_txpwr txpwr;
        const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
};

/**
 * struct station_del_parameters - station deletion parameters
 *
 * Used to delete a station entry (or all stations, when @mac is %NULL).
 *
 * @mac: MAC address of the station to remove or %NULL to remove all stations
 * @subtype: Management frame subtype to use for indicating removal
 *        (10 = Disassociation, 12 = Deauthentication)
 * @reason_code: Reason code for the Disassociation/Deauthentication frame
 */
struct station_del_parameters {
        const u8 *mac;
        u8 subtype;
        u16 reason_code;
};

/**
 * enum cfg80211_station_type - the type of station being modified
 *
 * Classifies a station for cfg80211_check_station_change().
 *
 * @CFG80211_STA_AP_CLIENT: client of an AP interface
 * @CFG80211_STA_AP_CLIENT_UNASSOC: client of an AP interface that is still
 *        unassociated (update properties for this type of client is permitted)
 * @CFG80211_STA_AP_MLME_CLIENT: client of an AP interface that has
 *        the AP MLME in the device
 * @CFG80211_STA_AP_STA: AP station on managed interface
 * @CFG80211_STA_IBSS: IBSS station
 * @CFG80211_STA_TDLS_PEER_SETUP: TDLS peer on managed interface (dummy entry
 *        while TDLS setup is in progress, it moves out of this state when
 *        being marked authorized; use this only if TDLS with external setup is
 *        supported/used)
 * @CFG80211_STA_TDLS_PEER_ACTIVE: TDLS peer on managed interface (active
 *        entry that is operating, has been marked authorized by userspace)
 * @CFG80211_STA_MESH_PEER_KERNEL: peer on mesh interface (kernel managed)
 * @CFG80211_STA_MESH_PEER_USER: peer on mesh interface (user managed)
 */
enum cfg80211_station_type {
        CFG80211_STA_AP_CLIENT          = 0,
        CFG80211_STA_AP_CLIENT_UNASSOC  = 1,
        CFG80211_STA_AP_MLME_CLIENT     = 2,
        CFG80211_STA_AP_STA             = 3,
        CFG80211_STA_IBSS               = 4,
        CFG80211_STA_TDLS_PEER_SETUP    = 5,
        CFG80211_STA_TDLS_PEER_ACTIVE   = 6,
        CFG80211_STA_MESH_PEER_KERNEL   = 7,
        CFG80211_STA_MESH_PEER_USER     = 8,
};

/**
 * cfg80211_check_station_change - validate parameter changes
 * @wiphy: the wiphy this operates on
 * @params: the new parameters for a station (may be modified, see below)
 * @statype: the type of station being modified
 *
 * Utility function for the @change_station driver method. Call this function
 * with the appropriate station type looking up the station (and checking that
 * it exists). It will verify whether the station change is acceptable, and if
 * not will return an error code. Note that it may modify the parameters for
 * backward compatibility reasons, so don't use them before calling this.
 *
 * Return: an error code if the station change is not acceptable.
 */
int cfg80211_check_station_change(struct wiphy *wiphy,
                                  struct station_parameters *params,
                                  enum cfg80211_station_type statype);

/**
 * enum rate_info_flags - bitrate info flags
 *
 * Used by the driver to indicate the specific rate transmission
 * type (HT, VHT, DMG, HE or EDMG MCS, short guard interval) in the
 * @flags member of &struct rate_info.
 *
 * @RATE_INFO_FLAGS_MCS: mcs field filled with HT MCS
 * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS
 * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
 * @RATE_INFO_FLAGS_DMG: 60GHz MCS
 * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
 * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode
 */
enum rate_info_flags {
        RATE_INFO_FLAGS_MCS                        = BIT(0),
        RATE_INFO_FLAGS_VHT_MCS                        = BIT(1),
        RATE_INFO_FLAGS_SHORT_GI                = BIT(2),
        RATE_INFO_FLAGS_DMG                        = BIT(3),
        RATE_INFO_FLAGS_HE_MCS                        = BIT(4),
        RATE_INFO_FLAGS_EDMG                        = BIT(5),
};

/**
 * enum rate_info_bw - rate bandwidth information
 *
 * Bandwidth of a rate, as indicated by the driver. Listed here in
 * numeric order; note that 20 MHz is the zero value.
 *
 * @RATE_INFO_BW_20: 20 MHz bandwidth
 * @RATE_INFO_BW_5: 5 MHz bandwidth
 * @RATE_INFO_BW_10: 10 MHz bandwidth
 * @RATE_INFO_BW_40: 40 MHz bandwidth
 * @RATE_INFO_BW_80: 80 MHz bandwidth
 * @RATE_INFO_BW_160: 160 MHz bandwidth
 * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
 */
enum rate_info_bw {
        RATE_INFO_BW_20         = 0,
        RATE_INFO_BW_5          = 1,
        RATE_INFO_BW_10         = 2,
        RATE_INFO_BW_40         = 3,
        RATE_INFO_BW_80         = 4,
        RATE_INFO_BW_160        = 5,
        RATE_INFO_BW_HE_RU      = 6,
};

/**
 * struct rate_info - bitrate information
 *
 * Information about a receiving or transmitting bitrate
 *
 * @flags: bitflag of flags from &enum rate_info_flags
 * @mcs: mcs index if struct describes an HT/VHT/HE rate
 *        (which one is indicated by @flags)
 * @legacy: bitrate in 100kbit/s for 802.11abg
 * @nss: number of streams (VHT & HE only)
 * @bw: bandwidth (from &enum rate_info_bw)
 * @he_gi: HE guard interval (from &enum nl80211_he_gi)
 * @he_dcm: HE DCM value
 * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
 *        only valid if bw is %RATE_INFO_BW_HE_RU)
 * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4)
 */
struct rate_info {
        u8 flags;
        u8 mcs;
        u16 legacy;
        u8 nss;
        u8 bw;
        u8 he_gi;
        u8 he_dcm;
        u8 he_ru_alloc;
        u8 n_bonded_ch;
};

/**
 * enum bss_param_flags - BSS parameter flags
 *
 * Bits for the @flags member of &struct sta_bss_parameters, each one
 * telling whether the named BSS feature is enabled.
 *
 * @BSS_PARAM_FLAGS_CTS_PROT: whether CTS protection is enabled
 * @BSS_PARAM_FLAGS_SHORT_PREAMBLE: whether short preamble is enabled
 * @BSS_PARAM_FLAGS_SHORT_SLOT_TIME: whether short slot time is enabled
 */
enum bss_param_flags {
        BSS_PARAM_FLAGS_CTS_PROT         = (1 << 0),
        BSS_PARAM_FLAGS_SHORT_PREAMBLE   = (1 << 1),
        BSS_PARAM_FLAGS_SHORT_SLOT_TIME  = (1 << 2),
};

/**
 * struct sta_bss_parameters - BSS parameters for the attached station
 *
 * Information about the currently associated BSS; embedded in
 * &struct station_info as @bss_param.
 *
 * @flags: bitflag of flags from &enum bss_param_flags
 * @dtim_period: DTIM period for the BSS
 * @beacon_interval: beacon interval
 */
struct sta_bss_parameters {
        u8 flags;
        u8 dtim_period;
        u16 beacon_interval;
};

/**
 * struct cfg80211_txq_stats - TXQ statistics for this TID
 *
 * Embedded in &struct cfg80211_tid_stats as @txq_stats.
 *
 * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to
 *        indicate the relevant values in this struct are filled
 * @backlog_bytes: total number of bytes currently backlogged
 * @backlog_packets: total number of packets currently backlogged
 * @flows: number of new flows seen
 * @drops: total number of packets dropped
 * @ecn_marks: total number of packets marked with ECN CE
 * @overlimit: number of drops due to queue space overflow
 * @overmemory: number of drops due to memory limit overflow
 * @collisions: number of hash collisions
 * @tx_bytes: total number of bytes dequeued
 * @tx_packets: total number of packets dequeued
 * @max_flows: maximum number of flows supported
 */
struct cfg80211_txq_stats {
        u32 filled;
        u32 backlog_bytes;
        u32 backlog_packets;
        u32 flows;
        u32 drops;
        u32 ecn_marks;
        u32 overlimit;
        u32 overmemory;
        u32 collisions;
        u32 tx_bytes;
        u32 tx_packets;
        u32 max_flows;
};

/**
 * struct cfg80211_tid_stats - per-TID statistics
 *
 * Pointed to by the @pertid member of &struct station_info.
 *
 * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to
 *        indicate the relevant values in this struct are filled
 * @rx_msdu: number of received MSDUs
 * @tx_msdu: number of (attempted) transmitted MSDUs
 * @tx_msdu_retries: number of retries (not counting the first) for
 *        transmitted MSDUs
 * @tx_msdu_failed: number of failed transmitted MSDUs
 * @txq_stats: TXQ statistics, see &struct cfg80211_txq_stats
 */
struct cfg80211_tid_stats {
        u32 filled;
        u64 rx_msdu;
        u64 tx_msdu;
        u64 tx_msdu_retries;
        u64 tx_msdu_failed;
        struct cfg80211_txq_stats txq_stats;
};

/* number of entries in the per-chain signal arrays of struct station_info */
#define IEEE80211_MAX_CHAINS        4

/**
 * struct station_info - station information
 *
 * Station information filled by driver for get_station() and dump_station.
 *
 * @filled: bitflag of flags using the bits of &enum nl80211_sta_info to
 *        indicate the relevant values in this struct for them
 * @connected_time: time(in secs) since a station is last connected
 * @inactive_time: time since last station activity (tx/rx) in milliseconds
 * @assoc_at: boottime (ns) of the last association
 * @rx_bytes: bytes (size of MPDUs) received from this station
 * @tx_bytes: bytes (size of MPDUs) transmitted to this station
 * @llid: mesh local link id
 * @plid: mesh peer link id
 * @plink_state: mesh peer link state
 * @signal: The signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @signal_avg: Average signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
 * @chain_signal: per-chain signal strength of last received packet in dBm
 * @chain_signal_avg: per-chain signal strength average in dBm
 * @txrate: current unicast bitrate from this station
 * @rxrate: current unicast bitrate to this station
 * @rx_packets: packets (MSDUs & MMPDUs) received from this station
 * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this station
 * @tx_retries: cumulative retry counts (MPDUs)
 * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
 * @rx_dropped_misc: Dropped for un-specified reason.
 * @bss_param: current BSS parameters
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of stations
 *        changes, i.e. when a station is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @assoc_req_ies: IEs from (Re)Association Request.
 *        This is used only when in AP mode with drivers that do not use
 *        user space MLME/SME implementation. The information is provided for
 *        the cfg80211_new_sta() calls to notify user space of the IEs.
 * @assoc_req_ies_len: Length of assoc_req_ies buffer in octets.
 * @sta_flags: station flags mask & values
 * @beacon_loss_count: Number of times beacon loss event has triggered.
 * @t_offset: Time offset of the station relative to this host.
 * @local_pm: local mesh STA power save mode
 * @peer_pm: peer mesh STA power save mode
 * @nonpeer_pm: non-peer mesh STA power save mode
 * @expected_throughput: expected throughput in kbps (including 802.11 headers)
 *        towards this station.
 * @rx_beacon: number of beacons received from this peer
 * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
 *        from this peer
 * @connected_to_gate: true if mesh STA has a path to mesh gate
 * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
 * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
 * @airtime_weight: current airtime scheduling weight
 * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
 *        (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
 *        Note that this doesn't use the @filled bit, but is used if non-NULL.
 * @ack_signal: signal strength (in dBm) of the last ACK frame.
 * @avg_ack_signal: average RSSI value of the ACK packets for the number of
 *        MSDUs that have been sent.
 * @rx_mpdu_count: number of MPDUs received from this station
 * @fcs_err_count: number of packets (MPDUs) received from this station with
 *        an FCS error. This counter should be incremented only when TA of the
 *        received packet with an FCS error matches the peer MAC address.
 * @airtime_link_metric: mesh airtime link metric.
 * @connected_to_as: true if mesh STA has a path to authentication server
 */
struct station_info {
        u64 filled;
        u32 connected_time;
        u32 inactive_time;
        u64 assoc_at;
        u64 rx_bytes;
        u64 tx_bytes;
        u16 llid;
        u16 plid;
        u8 plink_state;
        s8 signal;
        s8 signal_avg;

        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
        s8 chain_signal_avg[IEEE80211_MAX_CHAINS];

        struct rate_info txrate;
        struct rate_info rxrate;
        u32 rx_packets;
        u32 tx_packets;
        u32 tx_retries;
        u32 tx_failed;
        u32 rx_dropped_misc;
        struct sta_bss_parameters bss_param;
        struct nl80211_sta_flag_update sta_flags;

        int generation;

        const u8 *assoc_req_ies;
        size_t assoc_req_ies_len;

        u32 beacon_loss_count;
        s64 t_offset;
        enum nl80211_mesh_power_mode local_pm;
        enum nl80211_mesh_power_mode peer_pm;
        enum nl80211_mesh_power_mode nonpeer_pm;

        u32 expected_throughput;

        u64 tx_duration;
        u64 rx_duration;
        u64 rx_beacon;
        u8 rx_beacon_signal_avg;
        u8 connected_to_gate;

        struct cfg80211_tid_stats *pertid;
        s8 ack_signal;
        s8 avg_ack_signal;

        u16 airtime_weight;

        u32 rx_mpdu_count;
        u32 fcs_err_count;

        u32 airtime_link_metric;

        u8 connected_to_as;
};

#if IS_ENABLED(CONFIG_CFG80211)
/**
 * cfg80211_get_station - retrieve information about a given station
 * @dev: the device where the station is supposed to be connected to
 * @mac_addr: the mac address of the station of interest
 * @sinfo: pointer to the structure to fill with the information
 *
 * Return: 0 on success, in which case @sinfo is filled with the available
 * information; otherwise a negative error code, and the content of @sinfo
 * has to be considered undefined.
 */
int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
                         struct station_info *sinfo);
#else
/*
 * Stub used when CONFIG_CFG80211 is not enabled: it never touches @sinfo
 * and always returns -ENOENT.
 */
static inline int cfg80211_get_station(struct net_device *dev,
                                       const u8 *mac_addr,
                                       struct station_info *sinfo)
{
        return -ENOENT;
}
#endif

/**
 * enum monitor_flags - monitor flags
 *
 * Monitor interface configuration flags. Note that these must be the bits
 * according to the nl80211 flags: each value is 1 shifted by the
 * corresponding NL80211_MNTR_FLAG_* number.
 *
 * @MONITOR_FLAG_CHANGED: set if the flags were changed (occupies the bit
 *        position of __NL80211_MNTR_FLAG_INVALID)
 * @MONITOR_FLAG_FCSFAIL: pass frames with bad FCS
 * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP
 * @MONITOR_FLAG_CONTROL: pass control frames
 * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
 * @MONITOR_FLAG_COOK_FRAMES: report frames after processing
 * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
 */
enum monitor_flags {
        MONITOR_FLAG_CHANGED                = 1<<__NL80211_MNTR_FLAG_INVALID,
        MONITOR_FLAG_FCSFAIL                = 1<<NL80211_MNTR_FLAG_FCSFAIL,
        MONITOR_FLAG_PLCPFAIL                = 1<<NL80211_MNTR_FLAG_PLCPFAIL,
        MONITOR_FLAG_CONTROL                = 1<<NL80211_MNTR_FLAG_CONTROL,
        MONITOR_FLAG_OTHER_BSS                = 1<<NL80211_MNTR_FLAG_OTHER_BSS,
        MONITOR_FLAG_COOK_FRAMES        = 1<<NL80211_MNTR_FLAG_COOK_FRAMES,
        MONITOR_FLAG_ACTIVE                = 1<<NL80211_MNTR_FLAG_ACTIVE,
};

/**
 * enum mpath_info_flags - mesh path information flags
 *
 * Used by the driver to indicate which info in &struct mpath_info it has filled
 * in during get_mpath() or dump_mpath().
 *
 * @MPATH_INFO_FRAME_QLEN: @frame_qlen filled
 * @MPATH_INFO_SN: @sn filled
 * @MPATH_INFO_METRIC: @metric filled
 * @MPATH_INFO_EXPTIME: @exptime filled
 * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled
 * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled
 * @MPATH_INFO_FLAGS: @flags filled
 * @MPATH_INFO_HOP_COUNT: @hop_count filled
 * @MPATH_INFO_PATH_CHANGE: @path_change_count filled
 */
enum mpath_info_flags {
        MPATH_INFO_FRAME_QLEN                = BIT(0),
        MPATH_INFO_SN                        = BIT(1),
        MPATH_INFO_METRIC                = BIT(2),
        MPATH_INFO_EXPTIME                = BIT(3),
        MPATH_INFO_DISCOVERY_TIMEOUT        = BIT(4),
        MPATH_INFO_DISCOVERY_RETRIES        = BIT(5),
        MPATH_INFO_FLAGS                = BIT(6),
        MPATH_INFO_HOP_COUNT                = BIT(7),
        MPATH_INFO_PATH_CHANGE                = BIT(8),
};

/**
 * struct mpath_info - mesh path information
 *
 * Mesh path information filled by driver for get_mpath() and dump_mpath().
 *
 * @filled: bitfield of flags from &enum mpath_info_flags
 * @frame_qlen: number of queued frames for this destination
 * @sn: target sequence number
 * @metric: metric (cost) of this mesh path
 * @exptime: expiration time for the mesh path from now, in msecs
 * @flags: mesh path flags
 * @discovery_timeout: total mesh path discovery timeout, in msecs
 * @discovery_retries: mesh path discovery retries
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of mesh paths
 *        changes, i.e. when a mesh path is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @hop_count: hops to destination
 * @path_change_count: total number of path changes to destination
 */
struct mpath_info {
        u32 filled;
        u32 frame_qlen;
        u32 sn;
        u32 metric;
        u32 exptime;
        u32 discovery_timeout;
        u8 discovery_retries;
        u8 flags;
        u8 hop_count;
        u32 path_change_count;

        int generation;
};

/**
 * struct bss_parameters - BSS parameters
 *
 * Used to change BSS parameters (mainly for AP mode). Each setting has a
 * "do not change" value (-1, or %NULL for @basic_rates), so a caller can
 * update only a subset of them.
 *
 * @use_cts_prot: Whether to use CTS protection
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_preamble: Whether the use of short preambles is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_slot_time: Whether the use of short slot time is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @basic_rates: basic rates in IEEE 802.11 format
 *        (or NULL for no change)
 * @basic_rates_len: number of basic rates in @basic_rates
 * @ap_isolate: do not forward packets between connected stations
 *        (0 = no, 1 = yes, -1 = do not change)
 * @ht_opmode: HT Operation mode
 *        (u16 = opmode, -1 = do not change)
 * @p2p_ctwindow: P2P CT Window (-1 = no change)
 * @p2p_opp_ps: P2P opportunistic PS (-1 = no change)
 */
struct bss_parameters {
        int use_cts_prot;
        int use_short_preamble;
        int use_short_slot_time;
        const u8 *basic_rates;
        u8 basic_rates_len;
        int ap_isolate;
        int ht_opmode;
        s8 p2p_ctwindow, p2p_opp_ps;
};

/**
 * struct mesh_config - 802.11s mesh configuration
 *
 * These parameters can be changed while the mesh is active.
 *
 * @dot11MeshRetryTimeout: the initial retry timeout in millisecond units used
 *        by the Mesh Peering Open message
 * @dot11MeshConfirmTimeout: the initial retry timeout in millisecond units
 *        used by the Mesh Peering Open message
 * @dot11MeshHoldingTimeout: the confirm timeout in millisecond units used by
 *        the mesh peering management to close a mesh peering
 * @dot11MeshMaxPeerLinks: the maximum number of peer links allowed on this
 *        mesh interface
 * @dot11MeshMaxRetries: the maximum number of peer link open retries that can
 *        be sent to establish a new peer link instance in a mesh
 * @dot11MeshTTL: the value of TTL field set at a source mesh STA
 * @element_ttl: the value of TTL field set at a mesh STA for path selection
 *        elements
 * @auto_open_plinks: whether we should automatically open peer links when we
 *        detect compatible mesh peers
 * @dot11MeshNbrOffsetMaxNeighbor: the maximum number of neighbors to
 *        synchronize to for 11s default synchronization method
 * @dot11MeshHWMPmaxPREQretries: the number of action frames containing a PREQ
 *        that an originator mesh STA can send to a particular path target
 * @path_refresh_time: how frequently to refresh mesh paths in milliseconds
 * @min_discovery_timeout: the minimum length of time to wait until giving up on
 *        a path discovery in milliseconds
 * @dot11MeshHWMPactivePathTimeout: the time (in TUs) for which mesh STAs
 *        receiving a PREQ shall consider the forwarding information from the
 *        root to be valid. (TU = time unit)
 * @dot11MeshHWMPpreqMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one action frame containing a PREQ
 *        element
 * @dot11MeshHWMPperrMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one Action frame containing a PERR
 *        element
 * @dot11MeshHWMPnetDiameterTraversalTime: the interval of time (in TUs) that
 *        it takes for an HWMP information element to propagate across the mesh
 * @dot11MeshHWMPRootMode: the configuration of a mesh STA as root mesh STA
 * @dot11MeshHWMPRannInterval: the interval of time (in TUs) between
 *        transmissions of root announcements
 * @dot11MeshGateAnnouncementProtocol: whether to advertise that this mesh
 *        station has access to a broader network beyond the MBSS. (This is
 *        misnamed in draft 12.0: dot11MeshGateAnnouncementProtocol set to true
 *        only means that the station will announce to others that it's a mesh
 *        gate, but not necessarily using the gate announcement protocol. Still
 *        keeping the same nomenclature to be in sync with the spec)
 * @dot11MeshForwarding: whether the Mesh STA is forwarding or non-forwarding
 *        entity (default is TRUE - forwarding entity)
 * @rssi_threshold: the threshold for average signal strength of candidate
 *        station to establish a peer link
 * @ht_opmode: mesh HT protection mode
 *
 * @dot11MeshHWMPactivePathToRootTimeout: The time (in TUs) for which mesh STAs
 *        receiving a proactive PREQ shall consider the forwarding information to
 *        the root mesh STA to be valid.
 *
 * @dot11MeshHWMProotInterval: The interval of time (in TUs) between
 *        transmissions of proactive PREQs.
 * @dot11MeshHWMPconfirmationInterval: The minimum interval of time (in TUs)
 *        during which a mesh STA can send only one Action frame containing
 *        a PREQ element for root path confirmation.
 * @power_mode: The default mesh power save mode which will be the initial
 *        setting for new peer links.
 * @dot11MeshAwakeWindowDuration: The duration in TUs the STA will remain awake
 *        after transmitting its beacon.
 * @plink_timeout: If no tx activity is seen from a STA we've established
 *        peering with for longer than this time (in seconds), then remove it
 *        from the STA's list of peers.  Default is 30 minutes.
 * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is
 *      connected to a mesh gate in mesh formation info.  If false, the
 *      value in mesh formation is determined by the presence of root paths
 *      in the mesh path table
 * @dot11MeshConnectedToAuthServer: presumably the authentication-server
 *      counterpart of @dot11MeshConnectedToMeshGate, i.e. whether to
 *      advertise that this STA is connected to an authentication server.
 *      NOTE(review): this member was undocumented; confirm the exact
 *      semantics against the mesh implementation.
 * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP
 *      for HWMP) if the destination is a direct neighbor. Note that this might
 *      not be the optimal decision as a multi-hop route might be better. So
 *      if using this setting you will likely also want to disable
 *      dot11MeshForwarding and use another mesh routing protocol on top.
 */
struct mesh_config {
        u16 dot11MeshRetryTimeout;
        u16 dot11MeshConfirmTimeout;
        u16 dot11MeshHoldingTimeout;
        u16 dot11MeshMaxPeerLinks;
        u8 dot11MeshMaxRetries;
        u8 dot11MeshTTL;
        u8 element_ttl;
        bool auto_open_plinks;
        u32 dot11MeshNbrOffsetMaxNeighbor;
        u8 dot11MeshHWMPmaxPREQretries;
        u32 path_refresh_time;
        u16 min_discovery_timeout;
        u32 dot11MeshHWMPactivePathTimeout;
        u16 dot11MeshHWMPpreqMinInterval;
        u16 dot11MeshHWMPperrMinInterval;
        u16 dot11MeshHWMPnetDiameterTraversalTime;
        u8 dot11MeshHWMPRootMode;
        bool dot11MeshConnectedToMeshGate;
        bool dot11MeshConnectedToAuthServer;
        u16 dot11MeshHWMPRannInterval;
        bool dot11MeshGateAnnouncementProtocol;
        bool dot11MeshForwarding;
        s32 rssi_threshold;
        u16 ht_opmode;
        u32 dot11MeshHWMPactivePathToRootTimeout;
        u16 dot11MeshHWMProotInterval;
        u16 dot11MeshHWMPconfirmationInterval;
        enum nl80211_mesh_power_mode power_mode;
        u16 dot11MeshAwakeWindowDuration;
        u32 plink_timeout;
        bool dot11MeshNolearn;
};

/**
 * struct mesh_setup - 802.11s mesh setup configuration
 * @chandef: defines the channel to use
 * @mesh_id: the mesh ID
 * @mesh_id_len: length of the mesh ID, at least 1 and at most 32 bytes
 * @sync_method: which synchronization method to use
 * @path_sel_proto: which path selection protocol to use
 * @path_metric: which metric to use
 * @auth_id: which authentication method this mesh is using
 * @ie: vendor information elements (optional)
 * @ie_len: length of vendor information elements
 * @is_authenticated: this mesh requires authentication
 * @is_secure: this mesh uses security
 * @user_mpm: userspace handles all MPM functions
 * @dtim_period: DTIM period to use
 * @beacon_interval: beacon interval to use
 * @mcast_rate: multicast rate for Mesh Node, one entry per band
 *        [6Mbps is the default for 802.11a]
 * @basic_rates: basic rates to use when creating the mesh
 * @beacon_rate: bitrate to be used for beacons
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 *
 * These parameters are fixed when the mesh is created.
 */
struct mesh_setup {
        struct cfg80211_chan_def chandef;
        const u8 *mesh_id;
        u8 mesh_id_len;
        u8 sync_method;
        u8 path_sel_proto;
        u8 path_metric;
        u8 auth_id;
        const u8 *ie;
        u8 ie_len;
        bool is_authenticated;
        bool is_secure;
        bool user_mpm;
        u8 dtim_period;
        u16 beacon_interval;
        int mcast_rate[NUM_NL80211_BANDS];
        u32 basic_rates;
        struct cfg80211_bitrate_mask beacon_rate;
        bool userspace_handles_dfs;
        bool control_port_over_nl80211;
};

/**
 * struct ocb_setup - 802.11p OCB mode setup configuration
 * @chandef: defines the channel to use
 *
 * These parameters are fixed when connecting to the network.
 */
struct ocb_setup {
        struct cfg80211_chan_def chandef;
};

/**
 * struct ieee80211_txq_params - TX queue parameters
 * @ac: access category (AC) identifier
 * @txop: Maximum burst time in units of 32 usecs, 0 meaning disabled
 * @cwmin: Minimum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @aifs: Arbitration interframe space [0..255]
 */
struct ieee80211_txq_params {
        enum nl80211_ac ac;
        u16 txop;
        u16 cwmin;
        u16 cwmax;
        u8 aifs;
};

/**
 * DOC: Scanning and BSS list handling
 *
 * The scanning process itself is fairly simple, but cfg80211 offers quite
 * a bit of helper functionality. To start a scan, the scan operation will
 * be invoked with a scan definition. This scan definition contains the
 * channels to scan, and the SSIDs to send probe requests for (including the
 * wildcard, if desired). A passive scan is indicated by having no SSIDs to
 * probe. Additionally, a scan request may contain extra information elements
 * that should be added to the probe request. The IEs are guaranteed to be
 * well-formed, and will not exceed the maximum length the driver advertised
 * in the wiphy structure.
 *
 * When scanning finds a BSS, cfg80211 needs to be notified of that, because
 * it is responsible for maintaining the BSS list; the driver should not
 * maintain a list itself. For this notification, various functions exist.
 *
 * Since drivers do not maintain a BSS list, there are also a number of
 * functions to search for a BSS and obtain information about it from the
 * BSS structure cfg80211 maintains. The BSS list is also made available
 * to userspace.
 */

/**
 * struct cfg80211_ssid - SSID description
 * @ssid: the SSID, stored in a buffer of %IEEE80211_MAX_SSID_LEN octets
 * @ssid_len: length of the SSID held in @ssid
 */
struct cfg80211_ssid {
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len;
};

/**
 * struct cfg80211_scan_info - information about completed scan
 * @scan_start_tsf: scan start time in terms of the TSF of the BSS that the
 *        wireless device that requested the scan is connected to. If this
 *        information is not available, this field is left zero.
 * @tsf_bssid: the BSSID according to which @scan_start_tsf is set.
 * @aborted: set to true if the scan was aborted for any reason,
 *        userspace will be notified of that
 */
struct cfg80211_scan_info {
        u64 scan_start_tsf;
        u8 tsf_bssid[ETH_ALEN] __aligned(2);
        bool aborted;
};

/**
 * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only
 *
 * @short_ssid: short ssid to scan for
 * @bssid: bssid to scan for
 * @channel_idx: idx of the channel in the channel array in the scan request
 *         which the above info is relevant to
 * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU
 * @short_ssid_valid: @short_ssid is valid and can be used
 * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait
 *       20 TUs before starting to send probe requests.
 */
struct cfg80211_scan_6ghz_params {
        u32 short_ssid;
        u32 channel_idx;
        u8 bssid[ETH_ALEN];
        bool unsolicited_probe;
        bool short_ssid_valid;
        bool psc_no_listen;
};

/**
 * struct cfg80211_scan_request - scan request description
 *
 * @ssids: SSIDs to scan for (active scan only)
 * @n_ssids: number of SSIDs
 * @channels: channels to scan on.
 * @n_channels: total number of channels to scan
 * @scan_width: channel width for scanning
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of @ie in octets
 * @duration: how long to listen on each channel, in TUs. If
 *        @duration_mandatory is not set, this is the maximum dwell time and
 *        the actual dwell time may be shorter.
 * @duration_mandatory: if set, the scan duration must be as specified by the
 *        @duration field.
 * @flags: bit field of flags controlling operation
 * @rates: bitmap of rates to advertise for each band
 * @wiphy: the wiphy this was for
 * @scan_start: time (in jiffies) when the scan started
 * @wdev: the wireless device to scan for
 * @info: (internal) information about completed scan
 * @notified: (internal) scan request was notified as done or aborted
 * @no_cck: used to send probe requests at non CCK rate in 2GHz band
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_6ghz: relevant for split scan request only,
 *        true if this is the second scan request
 * @n_6ghz_params: number of 6 GHz params
 * @scan_6ghz_params: 6 GHz params
 * @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
 */
struct cfg80211_scan_request {
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;
        enum nl80211_bss_scan_width scan_width;
        const u8 *ie;
        size_t ie_len;
        u16 duration;
        bool duration_mandatory;
        u32 flags;

        u32 rates[NUM_NL80211_BANDS];

        struct wireless_dev *wdev;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);
        u8 bssid[ETH_ALEN] __aligned(2);

        /* internal */
        struct wiphy *wiphy;
        unsigned long scan_start;
        struct cfg80211_scan_info info;
        bool notified;
        bool no_cck;
        bool scan_6ghz;
        u32 n_6ghz_params;
        struct cfg80211_scan_6ghz_params *scan_6ghz_params;

        /* keep last: flexible array member */
        struct ieee80211_channel *channels[];
};

/**
 * get_random_mask_addr - build a partly randomised MAC address
 * @buf: output buffer of ETH_ALEN octets
 * @addr: address providing the bits that are not randomised
 * @mask: per-bit selector; 1 bits are taken from @addr, 0 bits stay random
 *
 * @buf is first filled with random bytes, then every bit selected by @mask
 * is overwritten with the corresponding bit of @addr.
 */
static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
{
        unsigned int octet = 0;

        get_random_bytes(buf, ETH_ALEN);

        while (octet < ETH_ALEN) {
                /* clear the random bits in the positions the mask selects */
                buf[octet] &= ~mask[octet];
                /* ... and fill those positions from the fixed address */
                buf[octet] |= addr[octet] & mask[octet];
                octet++;
        }
}

/**
 * struct cfg80211_match_set - sets of attributes to match
 *
 * @ssid: SSID to be matched; may be zero-length in case of BSSID match
 *        or no match (RSSI only)
 * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
 *        or no match (RSSI only)
 * @rssi_thold: don't report scan results below this threshold (in s32 dBm)
 * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied
 *        for filtering out scan results received. Drivers advertise this support
 *        of band specific rssi based filtering through the feature capability
 *        %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band
 *        specific rssi thresholds take precedence over @rssi_thold, if specified.
 *        If not specified for any band, it will be assigned with @rssi_thold of
 *        corresponding matchset.
 */
struct cfg80211_match_set {
        struct cfg80211_ssid ssid;
        u8 bssid[ETH_ALEN];
        s32 rssi_thold;
        s32 per_band_rssi_thold[NUM_NL80211_BANDS];
};

/**
 * struct cfg80211_sched_scan_plan - scan plan for scheduled scan
 *
 * @interval: interval between scheduled scan iterations, in seconds.
 * @iterations: number of scan iterations in this scan plan. Zero means
 *        infinite loop.
 *        The last scan plan will always have this parameter set to zero,
 *        all other scan plans will have a finite number of iterations.
 */
struct cfg80211_sched_scan_plan {
        u32 interval;
        u32 iterations;
};

/**
 * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment.
 *
 * @band: band of BSS which should match for RSSI level adjustment.
 * @delta: value of RSSI level adjustment; may be negative.
 */
struct cfg80211_bss_select_adjust {
        enum nl80211_band band;
        s8 delta;
};

/**
 * struct cfg80211_sched_scan_request - scheduled scan request description
 *
 * @reqid: identifies this request.
 * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
 * @n_ssids: number of SSIDs
 * @n_channels: total number of channels to scan
 * @scan_width: channel width for scanning
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of @ie in octets
 * @flags: bit field of flags controlling operation
 * @match_sets: sets of parameters to be matched for a scan result
 *        entry to be considered valid and to be passed to the host
 *        (others are filtered out).
 *        If omitted, all results are passed.
 * @n_match_sets: number of match sets
 * @report_results: indicates that results were reported for this request
 * @wiphy: the wiphy this was for
 * @dev: the interface
 * @scan_start: start time of the scheduled scan
 * @channels: channels to scan
 * @min_rssi_thold: for drivers only supporting a single threshold, this
 *        contains the minimum over all matchsets
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_plans: scan plans to be executed in this scheduled scan. Lowest
 *        index must be executed first.
 * @n_scan_plans: number of scan plans, at least 1.
 * @rcu_head: RCU callback used to free the struct
 * @owner_nlportid: netlink portid of owner (if this is a request
 *        owned by a particular socket)
 * @nl_owner_dead: netlink owner socket was closed - this request should
 *        be freed
 * @list: for keeping list of requests.
 * @delay: delay in seconds to use before starting the first scan
 *        cycle.  The driver may ignore this parameter and start
 *        immediately (or at any other time), if this feature is not
 *        supported.
 * @relative_rssi_set: Indicates whether @relative_rssi is set or not.
 * @relative_rssi: Relative RSSI threshold in dB to restrict scan result
 *        reporting in connected state to cases where a matching BSS is determined
 *        to have better or slightly worse RSSI than the current connected BSS.
 *        The relative RSSI threshold values are ignored in disconnected state.
 * @rssi_adjust: delta dB of RSSI preference to be given to the BSSs that belong
 *        to the specified band while deciding whether a better BSS is reported
 *        using @relative_rssi. If delta is a negative number, the BSSs that
 *        belong to the specified band will be penalized by delta dB in relative
 *        comparisons.
 */
struct cfg80211_sched_scan_request {
        u64 reqid;
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;
        enum nl80211_bss_scan_width scan_width;
        const u8 *ie;
        size_t ie_len;
        u32 flags;
        struct cfg80211_match_set *match_sets;
        int n_match_sets;
        s32 min_rssi_thold;
        u32 delay;
        struct cfg80211_sched_scan_plan *scan_plans;
        int n_scan_plans;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        bool relative_rssi_set;
        s8 relative_rssi;
        struct cfg80211_bss_select_adjust rssi_adjust;

        /* internal */
        struct wiphy *wiphy;
        struct net_device *dev;
        unsigned long scan_start;
        bool report_results;
        struct rcu_head rcu_head;
        u32 owner_nlportid;
        bool nl_owner_dead;
        struct list_head list;

        /* keep last: flexible array member */
        struct ieee80211_channel *channels[];
};

/**
 * enum cfg80211_signal_type - signal type
 *
 * Selects how a reported signal strength value is to be interpreted.
 *
 * @CFG80211_SIGNAL_TYPE_NONE: no signal strength information available
 * @CFG80211_SIGNAL_TYPE_MBM: signal strength in mBm (100*dBm)
 * @CFG80211_SIGNAL_TYPE_UNSPEC: signal strength, increasing from 0 through 100
 */
enum cfg80211_signal_type {
        CFG80211_SIGNAL_TYPE_NONE,
        CFG80211_SIGNAL_TYPE_MBM,
        CFG80211_SIGNAL_TYPE_UNSPEC,
};

/**
 * struct cfg80211_inform_bss - BSS inform data
 * @chan: channel the frame was received on
 * @scan_width: scan width that was used
 * @signal: signal strength value, according to the wiphy's
 *        signal type
 * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was
 *        received; should match the time when the frame was actually
 *        received by the device (not just by the host, in case it was
 *        buffered on the device) and be accurate to about 10ms.
 *        If the frame isn't buffered, just passing the return value of
 *        ktime_get_boottime_ns() is likely appropriate.
 * @parent_tsf: the time at the start of reception of the first octet of the
 *        timestamp field of the frame. The time is the TSF of the BSS specified
 *        by @parent_bssid.
 * @parent_bssid: the BSS according to which @parent_tsf is set. This is set to
 *        the BSS that requested the scan in which the beacon/probe was received.
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 */
struct cfg80211_inform_bss {
        struct ieee80211_channel *chan;
        enum nl80211_bss_scan_width scan_width;
        s32 signal;
        u64 boottime_ns;
        u64 parent_tsf;
        u8 parent_bssid[ETH_ALEN] __aligned(2);
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
};

/**
 * struct cfg80211_bss_ies - BSS entry IE data
 * @tsf: TSF contained in the frame that carried these IEs
 * @rcu_head: internal use, for freeing
 * @len: length of the IEs held in @data
 * @from_beacon: these IEs are known to come from a beacon
 * @data: IE data (flexible array member, must stay last)
 */
struct cfg80211_bss_ies {
        u64 tsf;
        struct rcu_head rcu_head;
        int len;
        bool from_beacon;
        u8 data[];
};

/**
 * struct cfg80211_bss - BSS description
 *
 * This structure describes a BSS (which may also be a mesh network)
 * for use in scan results and similar.
 *
 * @channel: channel this BSS is on
 * @scan_width: width of the control channel
 * @bssid: BSSID of the BSS
 * @beacon_interval: the beacon interval as from the frame
 * @capability: the capability field in host byte order
 * @ies: the information elements (Note that there is no guarantee that these
 *        are well-formed!); this is a pointer to either the beacon_ies or
 *        proberesp_ies depending on whether Probe Response frame has been
 *        received. It is always non-%NULL.
 * @beacon_ies: the information elements from the last Beacon frame
 *        (implementation note: if @hidden_beacon_bss is set this struct doesn't
 *        own the beacon_ies, but they're just pointers to the ones from the
 *        @hidden_beacon_bss struct)
 * @proberesp_ies: the information elements from the last Probe Response frame
 * @hidden_beacon_bss: in case this BSS struct represents a probe response from
 *        a BSS that hides the SSID in its beacon, this points to the BSS struct
 *        that holds the beacon data. @beacon_ies is still valid, of course, and
 *        points to the same data as hidden_beacon_bss->beacon_ies in that case.
 * @transmitted_bss: pointer to the transmitted BSS, if this is a
 *        non-transmitted one (multi-BSSID support)
 * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one
 *        (multi-BSSID support)
 * @signal: signal strength value (type depends on the wiphy's signal_type)
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 * @bssid_index: index in the multiple BSS set
 * @max_bssid_indicator: max number of members in the BSS set
 * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes;
 *        aligned to the size of a pointer
 */
struct cfg80211_bss {
        struct ieee80211_channel *channel;
        enum nl80211_bss_scan_width scan_width;

        const struct cfg80211_bss_ies __rcu *ies;
        const struct cfg80211_bss_ies __rcu *beacon_ies;
        const struct cfg80211_bss_ies __rcu *proberesp_ies;

        struct cfg80211_bss *hidden_beacon_bss;
        struct cfg80211_bss *transmitted_bss;
        struct list_head nontrans_list;

        s32 signal;

        u16 beacon_interval;
        u16 capability;

        u8 bssid[ETH_ALEN];
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];

        u8 bssid_index;
        u8 max_bssid_indicator;

        u8 priv[] __aligned(sizeof(void *));
};

/**
 * ieee80211_bss_get_elem - find element with given ID
 * @bss: the bss to search
 * @id: the element ID
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: the element found, or %NULL if not found.
 */
const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id);

/**
 * ieee80211_bss_get_ie - find IE with given ID
 * @bss: the bss to search
 * @id: the element ID
 *
 * Thin wrapper around ieee80211_bss_get_elem() that hands its result back
 * as a byte pointer instead of a &struct element pointer.
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: %NULL if not found.
 */
static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id)
{
        /* cast through a const pointer so the qualifier is never dropped */
        return (const void *)ieee80211_bss_get_elem(bss, id);
}


/**
 * struct cfg80211_auth_request - Authentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication.
 *
 * @bss: The BSS to authenticate with, the callee must obtain a reference
 *        to it if it needs to keep it.
 * @auth_type: Authentication type (algorithm)
 * @ie: Extra IEs to add to Authentication frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @auth_data: Fields and elements in Authentication frames. This contains
 *        the authentication frame body (non-IE and IE data), excluding the
 *        Authentication algorithm number, i.e., starting at the Authentication
 *        transaction sequence number field.
 * @auth_data_len: Length of @auth_data buffer in octets
 */
struct cfg80211_auth_request {
        struct cfg80211_bss *bss;
        const u8 *ie;
        size_t ie_len;
        enum nl80211_auth_type auth_type;
        const u8 *key;
        u8 key_len, key_idx;
        const u8 *auth_data;
        size_t auth_data_len;
};

/**
 * enum cfg80211_assoc_req_flags - Over-ride default behaviour in association.
 *
 * Each value is a single bit, so several flags can be combined.
 *
 * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n)
 * @ASSOC_REQ_DISABLE_VHT: Disable VHT
 * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association
 * @CONNECT_REQ_EXTERNAL_AUTH_SUPPORT: User space indicates external
 *        authentication capability. Drivers can offload authentication to
 *        userspace if this flag is set. Only applicable for cfg80211_connect()
 *        request (connect callback).
 */
enum cfg80211_assoc_req_flags {
        ASSOC_REQ_DISABLE_HT                        = BIT(0),
        ASSOC_REQ_DISABLE_VHT                        = BIT(1),
        ASSOC_REQ_USE_RRM                        = BIT(2),
        CONNECT_REQ_EXTERNAL_AUTH_SUPPORT        = BIT(3),
};

/**
 * struct cfg80211_assoc_request - (Re)Association request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * (re)association.
 * @bss: The BSS to associate with. If the call is successful the driver is
 *        given a reference that it must give back to cfg80211_send_rx_assoc()
 *        or to cfg80211_assoc_timeout(). To ensure proper refcounting, new
 *        association requests while already associating must be rejected.
 * @ie: Extra IEs to add to (Re)Association Request frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @use_mfp: Use management frame protection (IEEE 802.11w) in this association
 * @crypto: crypto settings
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a request
 *        to do the initial association with the ESS. When included, this is set
 *        to the BSSID of the current association, i.e., to the value that is
 *        included in the Current AP address field of the Reassociation Request
 *        frame.
 * @flags: See &enum cfg80211_assoc_req_flags
 * @ht_capa: HT Capabilities over-rides.  Values set in @ht_capa_mask
 *        will be used in @ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of @ht_capa which are to be used.
 * @vht_capa: VHT capability override
 * @vht_capa_mask: VHT capability mask indicating which fields to use
 * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
 *        %NULL if FILS is not used.
 * @fils_kek_len: Length of @fils_kek in octets
 * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
 *        Request/Response frame or %NULL if FILS is not used. This field starts
 *        with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
 * @s1g_capa: S1G capability override
 * @s1g_capa_mask: S1G capability override mask
 */
struct cfg80211_assoc_request {
        struct cfg80211_bss *bss;
        const u8 *ie, *prev_bssid;
        size_t ie_len;
        struct cfg80211_crypto_settings crypto;
        bool use_mfp;
        u32 flags;
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa, vht_capa_mask;
        const u8 *fils_kek;
        size_t fils_kek_len;
        const u8 *fils_nonces;
        struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask;
};

/**
 * struct cfg80211_deauth_request - Deauthentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * deauthentication.
 *
 * @bssid: the BSSID of the BSS to deauthenticate from
 * @ie: Extra IEs to add to Deauthentication frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @reason_code: The reason code for the deauthentication
 * @local_state_change: if set, change local state only and
 *        do not send a deauth frame
 */
struct cfg80211_deauth_request {
        const u8 *bssid;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;
};

/**
 * struct cfg80211_disassoc_request - Disassociation request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * disassociation.
 *
 * @bss: the BSS to disassociate from
 * @ie: Extra IEs to add to Disassociation frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @reason_code: The reason code for the disassociation
 * @local_state_change: This is a request for a local state change only,
 *        i.e., no Disassociation frame is to be transmitted.
 */
struct cfg80211_disassoc_request {
        struct cfg80211_bss *bss;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;
};

/**
 * struct cfg80211_ibss_params - IBSS parameters
 *
 * Parameters handed to the join_ibss() method.
 *
 * @ssid: The SSID, will always be non-null.
 * @ssid_len: The length of the SSID, will always be non-zero.
 * @bssid: Fixed BSSID requested, may be %NULL. If set, do not
 *        search for IBSSs with a different BSSID.
 * @chandef: defines the channel to use if no other IBSS to join can be found
 * @channel_fixed: The channel should be fixed -- do not search for
 *        IBSSs to join on other channels.
 * @ie: information element(s) to include in the beacon
 * @ie_len: length of @ie
 * @beacon_interval: beacon interval to use
 * @privacy: this is a protected network, keys will be configured
 *        after joining
 * @control_port: whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @basic_rates: bitmap of basic rates to use when creating the IBSS
 * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
 * @ht_capa: HT Capabilities over-rides. Values set in @ht_capa_mask
 *        will be used in @ht_capa. Un-supported values will be ignored.
 * @ht_capa_mask: The bits of @ht_capa which are to be used.
 * @wep_keys: static WEP keys, if not NULL points to an array of
 *        CFG80211_MAX_WEP_KEYS WEP keys
 * @wep_tx_key: key index (0..3) of the default TX static WEP key
 */
struct cfg80211_ibss_params {
        const u8 *ssid;
        const u8 *bssid;
        struct cfg80211_chan_def chandef;
        const u8 *ie;
        u8 ssid_len;
        u8 ie_len;
        u16 beacon_interval;
        u32 basic_rates;
        bool channel_fixed;
        bool privacy;
        bool control_port;
        bool control_port_over_nl80211;
        bool userspace_handles_dfs;
        int mcast_rate[NUM_NL80211_BANDS];
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct key_params *wep_keys;
        int wep_tx_key;
};

/**
 * struct cfg80211_bss_selection - connection parameters for BSS selection.
 *
 * @behaviour: requested BSS selection behaviour.
 * @param: parameters for the requested behaviour. This is a union, so its
 *        members share storage.
 * @band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF.
 * @adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST.
 */
struct cfg80211_bss_selection {
        enum nl80211_bss_select_attr behaviour;
        union {
                enum nl80211_band band_pref;
                struct cfg80211_bss_select_adjust adjust;
        } param;
};

/**
 * struct cfg80211_connect_params - Connection parameters
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication and association.
 *
 * @channel: The channel to use or %NULL if not specified (auto-select based
 *        on scan results)
 * @channel_hint: The channel of the recommended BSS for initial connection or
 *        %NULL if not specified
 * @bssid: The AP BSSID or %NULL if not specified (auto-select based on scan
 *        results)
 * @bssid_hint: The recommended AP BSSID for initial connection to the BSS or
 *        %NULL if not specified. Unlike the @bssid parameter, the driver is
 *        allowed to ignore this @bssid_hint if it has knowledge of a better BSS
 *        to use.
 * @ssid: SSID
 * @ssid_len: Length of @ssid in octets
 * @auth_type: Authentication type (algorithm)
 * @ie: IEs for association request
 * @ie_len: Length of @ie in octets
 * @privacy: indicates whether privacy-enabled APs should be used
 * @mfp: indicate whether management frame protection is used
 * @crypto: crypto settings
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @flags: See &enum cfg80211_assoc_req_flags
 * @bg_scan_period: Background scan period in seconds
 *        or -1 to indicate that default value is to be used.
 * @ht_capa: HT Capabilities over-rides. Values set in @ht_capa_mask
 *        will be used in @ht_capa. Un-supported values will be ignored.
 * @ht_capa_mask: The bits of @ht_capa which are to be used.
 * @vht_capa: VHT Capability overrides
 * @vht_capa_mask: The bits of @vht_capa which are to be used.
 * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
 *        networks.
 * @bss_select: criteria to be used for BSS selection.
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a request
 *        to do the initial association with the ESS. When included, this is set
 *        to the BSSID of the current association, i.e., to the value that is
 *        included in the Current AP address field of the Reassociation Request
 *        frame.
 * @fils_erp_username: EAP re-authentication protocol (ERP) username part of the
 *        NAI or %NULL if not specified. This is used to construct FILS wrapped
 *        data IE.
 * @fils_erp_username_len: Length of @fils_erp_username in octets.
 * @fils_erp_realm: EAP re-authentication protocol (ERP) realm part of NAI or
 *        %NULL if not specified. This specifies the domain name of ER server and
 *        is used to construct FILS wrapped data IE.
 * @fils_erp_realm_len: Length of @fils_erp_realm in octets.
 * @fils_erp_next_seq_num: The next sequence number to use in the FILS ERP
 *        messages. This is also used to construct FILS wrapped data IE.
 * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional
 *        keys in FILS or %NULL if not specified.
 * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets.
 * @want_1x: indicates user-space supports and wants to use 802.1X driver
 *        offload of 4-way handshake.
 * @edmg: define the EDMG channels.
 *        This may specify multiple channels and bonding options for the driver
 *        to choose from, based on BSS configuration.
 */
struct cfg80211_connect_params {
        struct ieee80211_channel *channel;
        struct ieee80211_channel *channel_hint;
        const u8 *bssid;
        const u8 *bssid_hint;
        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_auth_type auth_type;
        const u8 *ie;
        size_t ie_len;
        bool privacy;
        enum nl80211_mfp mfp;
        struct cfg80211_crypto_settings crypto;
        const u8 *key;
        u8 key_len;
        u8 key_idx;
        u32 flags;
        int bg_scan_period;
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa;
        struct ieee80211_vht_cap vht_capa_mask;
        bool pbss;
        struct cfg80211_bss_selection bss_select;
        const u8 *prev_bssid;
        const u8 *fils_erp_username;
        size_t fils_erp_username_len;
        const u8 *fils_erp_realm;
        size_t fils_erp_realm_len;
        u16 fils_erp_next_seq_num;
        const u8 *fils_erp_rrk;
        size_t fils_erp_rrk_len;
        bool want_1x;
        struct ieee80211_edmg edmg;
};

/**
 * enum cfg80211_connect_params_changed - Connection parameters being updated
 *
 * This enum provides information of all connect parameters that
 * have to be updated as part of update_connect_params() call.
 * Each value is a distinct bit, so several of them can be combined.
 *
 * @UPDATE_ASSOC_IES: Indicates that association request IEs are updated
 * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm,
 *        username, erp sequence number and rrk) are updated
 * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated
 */
enum cfg80211_connect_params_changed {
        UPDATE_ASSOC_IES                = BIT(0),
        UPDATE_FILS_ERP_INFO                = BIT(1),
        UPDATE_AUTH_TYPE                = BIT(2),
};

/**
 * enum wiphy_params_flags - set_wiphy_params bitfield values
 *
 * Each value is a distinct bit, so several of them can be combined.
 *
 * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed
 * @WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed
 * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed
 * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed
 * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed
 * @WIPHY_PARAM_DYN_ACK: dynack has been enabled
 * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed
 * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed
 * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum
 */
enum wiphy_params_flags {
        WIPHY_PARAM_RETRY_SHORT                = 1 << 0,
        WIPHY_PARAM_RETRY_LONG                = 1 << 1,
        WIPHY_PARAM_FRAG_THRESHOLD        = 1 << 2,
        WIPHY_PARAM_RTS_THRESHOLD        = 1 << 3,
        WIPHY_PARAM_COVERAGE_CLASS        = 1 << 4,
        WIPHY_PARAM_DYN_ACK                = 1 << 5,
        WIPHY_PARAM_TXQ_LIMIT                = 1 << 6,
        WIPHY_PARAM_TXQ_MEMORY_LIMIT        = 1 << 7,
        WIPHY_PARAM_TXQ_QUANTUM                = 1 << 8,
};

/*
 * Default airtime weight.
 * NOTE(review): presumably the initial weight used by airtime scheduling --
 * confirm against the users of this macro.
 */
#define IEEE80211_DEFAULT_AIRTIME_WEIGHT        256

/*
 * The per TXQ device queue limit in airtime.
 * NOTE(review): _L / _H presumably denote the lower and higher limit, and
 * the unit is not visible here -- confirm against the users of these macros.
 */
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L        5000
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H        12000

/* The per interface airtime threshold to switch to lower queue limit */
#define IEEE80211_AQL_THRESHOLD                        24000

/**
 * struct cfg80211_pmksa - PMK Security Association
 *
 * This structure is passed to the set/del_pmksa() method for PMKSA
 * caching.
 *
 * @bssid: The AP's BSSID (may be %NULL).
 * @pmkid: The identifier used to refer to a PMKSA.
 * @pmk: The PMK for the PMKSA identified by @pmkid. This is used for key
 *        derivation by a FILS STA. Otherwise, %NULL.
 * @pmk_len: Length of the @pmk. The length of @pmk can differ depending on
 *        the hash algorithm used to generate this.
 * @ssid: SSID to specify the ESS within which a PMKSA is valid when using FILS
 *        cache identifier (may be %NULL).
 * @ssid_len: Length of the @ssid in octets.
 * @cache_id: 2-octet cache identifier advertised by a FILS AP identifying the
 *        scope of PMKSA. This is valid only if @ssid_len is non-zero (may be
 *        %NULL).
 * @pmk_lifetime: Maximum lifetime for PMKSA in seconds
 *        (dot11RSNAConfigPMKLifetime) or 0 if not specified.
 *        The configured PMKSA must not be used for PMKSA caching after
 *        expiration and any keys derived from this PMK become invalid on
 *        expiration, i.e., the current association must be dropped if the PMK
 *        used for it expires.
 * @pmk_reauth_threshold: Threshold time for reauthentication (percentage of
 *        PMK lifetime, dot11RSNAConfigPMKReauthThreshold) or 0 if not specified.
 *        Drivers are expected to trigger a full authentication instead of using
 *        this PMKSA for caching when reassociating to a new BSS after this
 *        threshold to generate a new PMK before the current one expires.
 */
struct cfg80211_pmksa {
        const u8 *bssid;
        const u8 *pmkid;
        const u8 *pmk;
        size_t pmk_len;
        const u8 *ssid;
        size_t ssid_len;
        const u8 *cache_id;
        u32 pmk_lifetime;
        u8 pmk_reauth_threshold;
};

/**
 * struct cfg80211_pkt_pattern - packet pattern
 * @mask: bitmask where to match pattern and where to ignore bytes,
 *        one bit per byte, in same format as nl80211
 * @pattern: bytes to match where bitmask is 1
 * @pattern_len: length of @pattern (in bytes)
 * @pkt_offset: packet offset (in bytes)
 *
 * Internal note: @mask and @pattern are allocated in one chunk of
 * memory, free @mask only!
 */
struct cfg80211_pkt_pattern {
        const u8 *mask;
        const u8 *pattern;
        int pattern_len;
        int pkt_offset;
};

/**
 * struct cfg80211_wowlan_tcp - TCP connection parameters
 *
 * @sock: (internal) socket for source port allocation
 * @src: source IP address
 * @dst: destination IP address
 * @dst_mac: destination MAC address
 * @src_port: source port
 * @dst_port: destination port
 * @payload_len: data payload length
 * @payload: data payload buffer
 * @payload_seq: payload sequence stamping configuration
 * @data_interval: interval at which to send data packets
 * @wake_len: wakeup payload match length
 * @wake_data: wakeup payload match data
 * @wake_mask: wakeup payload match mask
 * @tokens_size: length of the tokens buffer
 * @payload_tok: payload token usage configuration
 */
struct cfg80211_wowlan_tcp {
        struct socket *sock;
        __be32 src;
        __be32 dst;
        u16 src_port;
        u16 dst_port;
        u8 dst_mac[ETH_ALEN];
        int payload_len;
        const u8 *payload;
        struct nl80211_wowlan_tcp_data_seq payload_seq;
        u32 data_interval;
        u32 wake_len;
        const u8 *wake_data;
        const u8 *wake_mask;
        u32 tokens_size;
        /* must be last, variable member */
        struct nl80211_wowlan_tcp_data_token payload_tok;
};

/**
 * struct cfg80211_wowlan - Wake on Wireless-LAN support info
 *
 * This structure defines the enabled WoWLAN triggers for the device.
 *
 * @any: wake up on any activity -- special trigger if device continues
 *        operating as normal during suspend
 * @disconnect: wake up if getting disconnected
 * @magic_pkt: wake up on receiving magic packet
 * @patterns: wake up on receiving packet matching a pattern
 * @n_patterns: number of patterns
 * @gtk_rekey_failure: wake up on GTK rekey failure
 * @eap_identity_req: wake up on EAP identity request packet
 * @four_way_handshake: wake up on 4-way handshake
 * @rfkill_release: wake up when rfkill is released
 * @tcp: TCP connection establishment/wakeup parameters, see nl80211.h.
 *        NULL if not configured.
 * @nd_config: configuration for the scan to be used for net detect wake.
 */
struct cfg80211_wowlan {
        bool any;
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;
        struct cfg80211_pkt_pattern *patterns;
        struct cfg80211_wowlan_tcp *tcp;
        int n_patterns;
        struct cfg80211_sched_scan_request *nd_config;
};

/**
 * struct cfg80211_coalesce_rules - Coalesce rule parameters
 *
 * This structure defines coalesce rule for the device.
 *
 * @delay: maximum coalescing delay in msecs.
 * @condition: condition for packet coalescence.
 *        see &enum nl80211_coalesce_condition.
 * @patterns: array of packet patterns
 * @n_patterns: number of patterns in @patterns
 */
struct cfg80211_coalesce_rules {
        int delay;
        enum nl80211_coalesce_condition condition;
        struct cfg80211_pkt_pattern *patterns;
        int n_patterns;
};

/**
 * struct cfg80211_coalesce - Packet coalescing settings
 *
 * This structure defines coalescing settings.
 *
 * @rules: array of coalesce rules
 * @n_rules: number of rules in @rules
 */
struct cfg80211_coalesce {
        struct cfg80211_coalesce_rules *rules;
        int n_rules;
};

/**
 * struct cfg80211_wowlan_nd_match - information about the match
 *
 * @ssid: SSID of the match that triggered the wake up
 * @n_channels: Number of channels where the match occurred.  This
 *        value may be zero if the driver can't report the channels.
 * @channels: center frequencies of the channels where a match
 *        occurred (in MHz). Declared as a flexible array member.
 */
struct cfg80211_wowlan_nd_match {
        struct cfg80211_ssid ssid;
        int n_channels;
        u32 channels[];
};

/**
 * struct cfg80211_wowlan_nd_info - net detect wake up information
 *
 * @n_matches: Number of match information instances provided in
 *        @matches.  This value may be zero if the driver can't provide
 *        match information.
 * @matches: Array of pointers to matches containing information about
 *        the matches that triggered the wake up. Declared as a flexible
 *        array member.
 */
struct cfg80211_wowlan_nd_info {
        int n_matches;
        struct cfg80211_wowlan_nd_match *matches[];
};

/**
 * struct cfg80211_wowlan_wakeup - wakeup report
 * @disconnect: woke up by getting disconnected
 * @magic_pkt: woke up by receiving magic packet
 * @gtk_rekey_failure: woke up by GTK rekey failure
 * @eap_identity_req: woke up by EAP identity request packet
 * @four_way_handshake: woke up by 4-way handshake
 * @rfkill_release: woke up by rfkill being released
 * @pattern_idx: pattern that caused wakeup, -1 if not due to pattern
 * @packet_present_len: copied wakeup packet data
 * @packet_len: original wakeup packet length
 * @packet: The packet causing the wakeup, if any.
 * @packet_80211: For pattern match, magic packet and other data
 *        frame triggers an 802.3 frame should be reported, for
 *        disconnect due to deauth 802.11 frame. This indicates which
 *        it is.
 * @tcp_match: TCP wakeup packet received
 * @tcp_connlost: TCP connection lost or failed to establish
 * @tcp_nomoretokens: TCP data ran out of tokens
 * @net_detect: if not %NULL, woke up because of net detect
 */
struct cfg80211_wowlan_wakeup {
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;
        bool packet_80211;
        bool tcp_match;
        bool tcp_connlost;
        bool tcp_nomoretokens;
        s32 pattern_idx;
        u32 packet_present_len;
        u32 packet_len;
        const void *packet;
        struct cfg80211_wowlan_nd_info *net_detect;
};

/**
 * struct cfg80211_gtk_rekey_data - rekey data
 * @kek: key encryption key (@kek_len bytes)
 * @kck: key confirmation key (@kck_len bytes)
 * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes)
 * @kek_len: length of @kek
 * @kck_len: length of @kck
 * @akm: akm (oui, id)
 */
struct cfg80211_gtk_rekey_data {
        const u8 *kek;
        const u8 *kck;
        const u8 *replay_ctr;
        u32 akm;
        u8 kek_len;
        u8 kck_len;
};

/**
 * struct cfg80211_update_ft_ies_params - FT IE Information
 *
 * This structure provides information needed to update the fast transition IE
 *
 * @md: The Mobility Domain ID, 2 Octet value
 * @ie: Fast Transition IEs
 * @ie_len: Length of @ie in octets
 */
struct cfg80211_update_ft_ies_params {
        u16 md;
        const u8 *ie;
        size_t ie_len;
};

/**
 * struct cfg80211_mgmt_tx_params - mgmt tx parameters
 *
 * This structure provides information needed to transmit a mgmt frame
 *
 * @chan: channel to use
 * @offchan: indicates whether off channel operation is required
 * @wait: duration for ROC
 * @buf: buffer to transmit
 * @len: length of @buf
 * @no_cck: don't use cck rates for this frame
 * @dont_wait_for_ack: tells the low level not to wait for an ack
 * @n_csa_offsets: length of the @csa_offsets array
 * @csa_offsets: array of all the csa offsets in the frame
 */
struct cfg80211_mgmt_tx_params {
        struct ieee80211_channel *chan;
        bool offchan;
        unsigned int wait;
        const u8 *buf;
        size_t len;
        bool no_cck;
        bool dont_wait_for_ack;
        int n_csa_offsets;
        const u16 *csa_offsets;
};

/**
 * struct cfg80211_dscp_exception - DSCP exception
 *
 * Element type of the dscp_exception array in &struct cfg80211_qos_map.
 *
 * @dscp: DSCP value that does not adhere to the user priority range definition
 * @up: user priority value to which the corresponding DSCP value belongs
 */
struct cfg80211_dscp_exception {
        u8 dscp;
        u8 up;
};

/**
 * struct cfg80211_dscp_range - DSCP range definition for user priority
 *
 * Element type of the up array in &struct cfg80211_qos_map.
 *
 * @low: lowest DSCP value of this user priority range, inclusive
 * @high: highest DSCP value of this user priority range, inclusive
 */
struct cfg80211_dscp_range {
        u8 low;
        u8 high;
};

/*
 * QoS Map Set element length defined in IEEE Std 802.11-2012, 8.4.2.97
 *
 * IEEE80211_QOS_MAP_LEN_MAX is the minimum length plus two octets for each
 * of the IEEE80211_QOS_MAP_MAX_EX entries, i.e. 16 + 2 * 21 = 58.
 */
#define IEEE80211_QOS_MAP_MAX_EX        21
#define IEEE80211_QOS_MAP_LEN_MIN        16
#define IEEE80211_QOS_MAP_LEN_MAX \
        (IEEE80211_QOS_MAP_LEN_MIN + 2 * IEEE80211_QOS_MAP_MAX_EX)

/**
 * struct cfg80211_qos_map - QoS Map Information
 *
 * This struct defines the Interworking QoS map setting for DSCP values
 *
 * @num_des: number of DSCP exceptions (0..21)
 * @dscp_exception: optionally up to maximum of 21 DSCP exceptions from
 *        the user priority DSCP range definition; the array holds
 *        %IEEE80211_QOS_MAP_MAX_EX entries
 * @up: DSCP range definition for a particular user priority (array of 8
 *        ranges)
 */
struct cfg80211_qos_map {
        u8 num_des;
        struct cfg80211_dscp_exception dscp_exception[IEEE80211_QOS_MAP_MAX_EX];
        struct cfg80211_dscp_range up[8];
};

/**
 * struct cfg80211_nan_conf - NAN configuration
 *
 * This struct defines NAN configuration parameters.
 *
 * @master_pref: master preference (1 - 255)
 * @bands: operating bands, a bitmap of &enum nl80211_band values.
 *        For instance, for NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 */
struct cfg80211_nan_conf {
        u8 master_pref;
        u8 bands;
};

/**
 * enum cfg80211_nan_conf_changes - indicates changed fields in NAN
 * configuration
 *
 * Each value is a distinct bit, so both can be set at the same time.
 *
 * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
 * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
 */
enum cfg80211_nan_conf_changes {
        CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
        CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1),
};

/**
 * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter
 *
 * @filter: the content of the filter
 * @len: the length of @filter
 */
struct cfg80211_nan_func_filter {
        const u8 *filter;
        u8 len;
};

/**
 * struct cfg80211_nan_func - a NAN function
 *
 * @type: &enum nl80211_nan_function_type
 * @service_id: the service ID of the function
 * @publish_type: &enum nl80211_nan_publish_type
 * @close_range: if true, the range should be limited. Threshold is
 *        implementation specific.
 * @publish_bcast: if true, the solicited publish should be broadcasted
 * @subscribe_active: if true, the subscribe is active
 * @followup_id: the instance ID for follow up
 * @followup_reqid: the requestor instance ID for follow up
 * @followup_dest: MAC address of the recipient of the follow up
 * @ttl: time to live counter in DW.
 * @serv_spec_info: Service Specific Info
 * @serv_spec_info_len: Service Specific Info length
 * @srf_include: if true, SRF is inclusive
 * @srf_bf: Bloom Filter
 * @srf_bf_len: Bloom Filter length
 * @srf_bf_idx: Bloom Filter index
 * @srf_macs: SRF MAC addresses
 * @srf_num_macs: number of MAC addresses in SRF
 * @rx_filters: rx filters that are matched with corresponding peer's tx_filter
 * @tx_filters: filters that should be transmitted in the SDF.
 * @num_rx_filters: length of @rx_filters.
 * @num_tx_filters: length of @tx_filters.
 * @instance_id: driver allocated id of the function.
 * @cookie: unique NAN function identifier.
 */
struct cfg80211_nan_func {
        enum nl80211_nan_function_type type;
        u8 service_id[NL80211_NAN_FUNC_SERVICE_ID_LEN];
        u8 publish_type;
        bool close_range;
        bool publish_bcast;
        bool subscribe_active;
        u8 followup_id;
        u8 followup_reqid;
        struct mac_address followup_dest;
        u32 ttl;
        const u8 *serv_spec_info;
        u8 serv_spec_info_len;
        bool srf_include;
        const u8 *srf_bf;
        u8 srf_bf_len;
        u8 srf_bf_idx;
        struct mac_address *srf_macs;
        int srf_num_macs;
        struct cfg80211_nan_func_filter *rx_filters;
        struct cfg80211_nan_func_filter *tx_filters;
        u8 num_tx_filters;
        u8 num_rx_filters;
        u8 instance_id;
        u64 cookie;
};

/**
 * struct cfg80211_pmk_conf - PMK configuration
 *
 * @aa: authenticator address
 * @pmk_len: length of @pmk in bytes.
 * @pmk: the PMK material
 * @pmk_r0_name: PMK-R0 Name. NULL if not applicable (i.e., the PMK
 *        is not PMK-R0). When @pmk_r0_name is not NULL, the @pmk field
 *        holds PMK-R0.
 */
struct cfg80211_pmk_conf {
        const u8 *aa;
        u8 pmk_len;
        const u8 *pmk;
        const u8 *pmk_r0_name;
};

/**
 * struct cfg80211_external_auth_params - Trigger External authentication.
 *
 * Commonly used across the external auth request and event interfaces.
 *
 * @action: action type / trigger for external authentication. Only significant
 *        for the authentication request event interface (driver to user space).
 * @bssid: BSSID of the peer with which the authentication has
 *        to happen. Used by both the authentication request event and
 *        authentication response command interface.
 * @ssid: SSID of the AP.  Used by both the authentication request event and
 *        authentication response command interface.
 * @key_mgmt_suite: AKM suite of the respective authentication. Used by the
 *        authentication request event interface.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful authentication,
 *        use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you
 *        the real status code for failures. Used only for the authentication
 *        response command interface (user space to driver).
 * @pmkid: The identifier used to refer to a PMKSA.
 */
struct cfg80211_external_auth_params {
        enum nl80211_external_auth_action action;
        u8 bssid[ETH_ALEN] __aligned(2);
        struct cfg80211_ssid ssid;
        unsigned int key_mgmt_suite;
        u16 status;
        const u8 *pmkid;
};

/**
 * struct cfg80211_ftm_responder_stats - FTM responder statistics
 *
 * @filled: bitflag of flags using the bits of &enum nl80211_ftm_stats to
 *        indicate the relevant values in this struct for them
 * @success_num: number of FTM sessions in which all frames were successfully
 *        answered
 * @partial_num: number of FTM sessions in which part of frames were
 *        successfully answered
 * @failed_num: number of failed FTM sessions
 * @asap_num: number of ASAP FTM sessions
 * @non_asap_num: number of non-ASAP FTM sessions
 * @total_duration_ms: total duration of the sessions - gives an indication
 *        of how much time the responder was busy
 * @unknown_triggers_num: number of unknown FTM triggers - triggers from
 *        initiators that didn't successfully finish the negotiation phase with
 *        the responder
 * @reschedule_requests_num: number of FTM reschedule requests - initiator asks
 *        for a new scheduling although it already has a scheduled FTM slot
 * @out_of_window_triggers_num: total FTM triggers out of scheduled window
 */
struct cfg80211_ftm_responder_stats {
        u32 filled;
        u32 success_num;
        u32 partial_num;
        u32 failed_num;
        u32 asap_num;
        u32 non_asap_num;
        u64 total_duration_ms;
        u32 unknown_triggers_num;
        u32 reschedule_requests_num;
        u32 out_of_window_triggers_num;
};

/**
 * struct cfg80211_pmsr_ftm_result - FTM result
 * @failure_reason: if this measurement failed (PMSR status is
 *        %NL80211_PMSR_STATUS_FAILURE), this gives a more precise
 *        reason than just "failure"
 * @burst_index: if reporting partial results, this is the index
 *        in [0 .. num_bursts-1] of the burst that's being reported
 * @num_ftmr_attempts: number of FTM request frames transmitted
 * @num_ftmr_successes: number of FTM request frames acked
 * @busy_retry_time: if @failure_reason is
 *        %NL80211_PMSR_FTM_FAILURE_PEER_BUSY, fill this to indicate in how
 *        many seconds a retry is deemed possible by the responder
 * @num_bursts_exp: actual number of bursts exponent negotiated
 * @burst_duration: actual burst duration negotiated
 * @ftms_per_burst: actual FTMs per burst negotiated
 * @lci_len: length of LCI information (if present)
 * @civicloc_len: length of civic location information (if present)
 * @lci: LCI data (may be %NULL)
 * @civicloc: civic location data (may be %NULL)
 * @rssi_avg: average RSSI over FTM action frames reported
 * @rssi_spread: spread of the RSSI over FTM action frames reported
 * @tx_rate: bitrate for transmitted FTM action frame response
 * @rx_rate: bitrate of received FTM action frame
 * @rtt_avg: average of RTTs measured (must have either this or @dist_avg)
 * @rtt_variance: variance of RTTs measured (note that standard deviation is
 *        the square root of the variance)
 * @rtt_spread: spread of the RTTs measured
 * @dist_avg: average of distances (mm) measured
 *        (must have either this or @rtt_avg)
 * @dist_variance: variance of distances measured (see also @rtt_variance)
 * @dist_spread: spread of distances measured (see also @rtt_spread)
 * @num_ftmr_attempts_valid: @num_ftmr_attempts is valid
 * @num_ftmr_successes_valid: @num_ftmr_successes is valid
 * @rssi_avg_valid: @rssi_avg is valid
 * @rssi_spread_valid: @rssi_spread is valid
 * @tx_rate_valid: @tx_rate is valid
 * @rx_rate_valid: @rx_rate is valid
 * @rtt_avg_valid: @rtt_avg is valid
 * @rtt_variance_valid: @rtt_variance is valid
 * @rtt_spread_valid: @rtt_spread is valid
 * @dist_avg_valid: @dist_avg is valid
 * @dist_variance_valid: @dist_variance is valid
 * @dist_spread_valid: @dist_spread is valid
 */
struct cfg80211_pmsr_ftm_result {
        const u8 *lci;
        const u8 *civicloc;
        unsigned int lci_len;
        unsigned int civicloc_len;
        enum nl80211_peer_measurement_ftm_failure_reasons failure_reason;
        u32 num_ftmr_attempts;
        u32 num_ftmr_successes;
        s16 burst_index;
        u8 busy_retry_time;
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;
        s32 rssi_avg;
        s32 rssi_spread;
        struct rate_info tx_rate;
        struct rate_info rx_rate;
        s64 rtt_avg;
        s64 rtt_variance;
        s64 rtt_spread;
        s64 dist_avg;
        s64 dist_variance;
        s64 dist_spread;

        /* one validity bit per optional value above */
        u16 num_ftmr_attempts_valid:1,
            num_ftmr_successes_valid:1,
            rssi_avg_valid:1,
            rssi_spread_valid:1,
            tx_rate_valid:1,
            rx_rate_valid:1,
            rtt_avg_valid:1,
            rtt_variance_valid:1,
            rtt_spread_valid:1,
            dist_avg_valid:1,
            dist_variance_valid:1,
            dist_spread_valid:1;
};

/**
 * struct cfg80211_pmsr_result - peer measurement result
 * @addr: address of the peer
 * @host_time: host time (use ktime_get_boottime(), adjusted to the time when
 *        the measurement was made)
 * @ap_tsf: AP's TSF at measurement time
 * @status: status of the measurement
 * @final: if reporting partial results, mark this as the last one; if not
 *        reporting partial results always set this flag
 * @ap_tsf_valid: indicates the @ap_tsf value is valid
 * @type: type of the measurement reported, note that we only support reporting
 *        one type at a time, but you can report multiple results separately and
 *        they're all aggregated for userspace.
 */
struct cfg80211_pmsr_result {
        u64 host_time;
        u64 ap_tsf;
        enum nl80211_peer_measurement_status status;

        u8 addr[ETH_ALEN];

        u8 final:1,
           ap_tsf_valid:1;

        enum nl80211_peer_measurement_type type;

        /* per-type result data; FTM is the only member declared here */
        union {
                struct cfg80211_pmsr_ftm_result ftm;
        };
};

/**
 * struct cfg80211_pmsr_ftm_request_peer - FTM request data
 * @requested: indicates FTM is requested
 * @preamble: frame preamble to use
 * @burst_period: burst period to use
 * @asap: indicates to use ASAP mode
 * @num_bursts_exp: number of bursts exponent
 * @burst_duration: burst duration
 * @ftms_per_burst: number of FTMs per burst
 * @ftmr_retries: number of retries for FTM request
 * @request_lci: request LCI information
 * @request_civicloc: request civic location information
 * @trigger_based: use trigger based ranging for the measurement
 *                 If neither @trigger_based nor @non_trigger_based is set,
 *                 EDCA based ranging will be used.
 * @non_trigger_based: use non trigger based ranging for the measurement
 *                 If neither @trigger_based nor @non_trigger_based is set,
 *                 EDCA based ranging will be used.
 *
 * See also nl80211 for the respective attribute documentation.
 */
struct cfg80211_pmsr_ftm_request_peer {
        enum nl80211_preamble preamble;
        u16 burst_period;
        /* single-bit request flags */
        u8 requested:1,
           asap:1,
           request_lci:1,
           request_civicloc:1,
           trigger_based:1,
           non_trigger_based:1;
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;
        u8 ftmr_retries;
};

/**
 * struct cfg80211_pmsr_request_peer - peer data for a peer measurement request
 * @addr: MAC address
 * @chandef: channel to use
 * @report_ap_tsf: report the associated AP's TSF
 * @ftm: FTM data, see &struct cfg80211_pmsr_ftm_request_peer
 *
 * Entries of this type form the @peers array of
 * &struct cfg80211_pmsr_request.
 */
struct cfg80211_pmsr_request_peer {
        u8 addr[ETH_ALEN];
        struct cfg80211_chan_def chandef;
        u8 report_ap_tsf:1;
        struct cfg80211_pmsr_ftm_request_peer ftm;
};

/**
 * struct cfg80211_pmsr_request - peer measurement request
 * @cookie: cookie, set by cfg80211
 * @nl_portid: netlink portid - used by cfg80211
 * @drv_data: driver data for this request, if required for aborting,
 *        not otherwise freed or anything by cfg80211
 * @mac_addr: MAC address used for (randomised) request
 * @mac_addr_mask: MAC address mask used for randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @list: used by cfg80211 to hold on to the request
 * @timeout: timeout (in milliseconds) for the whole operation, if
 *        zero it means there's no timeout
 * @n_peers: number of peers to do measurements with
 * @peers: per-peer measurement request data; this is a flexible array
 *        member and is expected to hold @n_peers entries
 */
struct cfg80211_pmsr_request {
        u64 cookie;
        void *drv_data;
        u32 n_peers;
        u32 nl_portid;

        u32 timeout;

        /*
         * NOTE(review): the 2-byte alignment is presumably there so that
         * u16-based Ethernet address helpers can be used on these - confirm.
         */
        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        struct list_head list;

        /* flexible array member, must remain the last member */
        struct cfg80211_pmsr_request_peer peers[];
};

/**
 * struct cfg80211_update_owe_info - OWE Information
 *
 * This structure provides information needed for the drivers to offload OWE
 * (Opportunistic Wireless Encryption) processing to the user space.
 *
 * Commonly used across update_owe_info request and event interfaces.
 *
 * @peer: MAC address of the peer device for which the OWE processing
 *        has to be done.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info
 *        processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space
 *        cannot give you the real status code for failures. Used only for
 *        OWE update request command interface (user space to driver).
 * @ie: IEs obtained from the peer or constructed by the user space. These are
 *        the IEs of the remote peer in the event from the host driver and
 *        the constructed IEs by the user space in the request interface.
 * @ie_len: Length of the IEs pointed to by @ie, in octets.
 */
struct cfg80211_update_owe_info {
        /*
         * NOTE(review): the 2-byte alignment is presumably there so that
         * u16-based Ethernet address helpers can be used on @peer - confirm.
         */
        u8 peer[ETH_ALEN] __aligned(2);
        u16 status;
        const u8 *ie;
        size_t ie_len;
};

/**
 * struct mgmt_frame_regs - management frame registrations data
 * @global_stypes: bitmap of management frame subtypes registered
 *        for the entire device
 * @interface_stypes: bitmap of management frame subtypes registered
 *        for the given interface
 * @global_mcast_stypes: mcast RX is needed globally for these subtypes
 * @interface_mcast_stypes: mcast RX is needed on this interface
 *        for these subtypes
 */
struct mgmt_frame_regs {
        u32 global_stypes, interface_stypes;
        u32 global_mcast_stypes, interface_mcast_stypes;
};

/**
 * struct cfg80211_ops - backend description for wireless configuration
 *
 * This struct is registered by fullmac card drivers and/or wireless stacks
 * in order to handle configuration requests on their interfaces.
 *
 * All callbacks except where otherwise noted should return 0
 * on success or a negative error code.
 *
 * All operations are currently invoked under rtnl for consistency with the
 * wireless extensions but this is subject to reevaluation as soon as this
 * code is used more widely and we have a first user without wext.
 *
 * @suspend: wiphy device needs to be suspended. The variable @wow will
 *        be %NULL or contain the enabled Wake-on-Wireless triggers that are
 *        configured for the device.
 * @resume: wiphy device needs to be resumed
 * @set_wakeup: Called when WoWLAN is enabled/disabled, use this callback
 *        to call device_set_wakeup_enable() to enable/disable wakeup from
 *        the device.
 *
 * @add_virtual_intf: create a new virtual interface with the given name,
 *        must set the struct wireless_dev's iftype. Beware: You must create
 *        the new netdev in the wiphy's network namespace! Returns the struct
 *        wireless_dev, or an ERR_PTR. For P2P device wdevs, the driver must
 *        also set the address member in the wdev.
 *
 * @del_virtual_intf: remove the virtual interface
 *
 * @change_virtual_intf: change type/configuration of virtual interface,
 *        keep the struct wireless_dev's iftype updated.
 *
 * @add_key: add a key with the given parameters. @mac_addr will be %NULL
 *        when adding a group key.
 *
 * @get_key: get information about the key with the given parameters.
 *        @mac_addr will be %NULL when requesting information for a group
 *        key. All pointers given to the @callback function need not be valid
 *        after it returns. This function should return an error if it is
 *        not possible to retrieve the key, -ENOENT if it doesn't exist.
 *
 * @del_key: remove a key given the @mac_addr (%NULL for a group key)
 *        and @key_index, return -ENOENT if the key doesn't exist.
 *
 * @set_default_key: set the default key on an interface
 *
 * @set_default_mgmt_key: set the default management frame key on an interface
 *
 * @set_default_beacon_key: set the default Beacon frame key on an interface
 *
 * @set_rekey_data: give the data necessary for GTK rekeying to the driver
 *
 * @start_ap: Start acting in AP mode defined by the parameters.
 * @change_beacon: Change the beacon parameters for an access point mode
 *        interface. This should reject the call when AP mode wasn't started.
 * @stop_ap: Stop being an AP, including stopping beaconing.
 *
 * @add_station: Add a new station.
 * @del_station: Remove a station
 * @change_station: Modify a given station. Note that flags changes are not much
 *        validated in cfg80211, in particular the auth/assoc/authorized flags
 *        might come to the driver in invalid combinations -- make sure to check
 *        them, also against the existing state! Drivers must call
 *        cfg80211_check_station_change() to validate the information.
 * @get_station: get station information for the station identified by @mac
 * @dump_station: dump station callback -- resume dump at index @idx
 *
 * @add_mpath: add a fixed mesh path
 * @del_mpath: delete a given mesh path
 * @change_mpath: change a given mesh path
 * @get_mpath: get a mesh path for the given parameters
 * @dump_mpath: dump mesh path callback -- resume dump at index @idx
 * @get_mpp: get a mesh proxy path for the given parameters
 * @dump_mpp: dump mesh proxy path callback -- resume dump at index @idx
 * @join_mesh: join the mesh network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_mesh: leave the current mesh network
 *        (invoked with the wireless_dev mutex held)
 *
 * @get_mesh_config: Get the current mesh configuration
 *
 * @update_mesh_config: Update mesh parameters on a running mesh.
 *        The mask is a bitfield which tells us which parameters to
 *        set, and which to leave alone.
 *
 * @change_bss: Modify parameters for a given BSS.
 *
 * @set_txq_params: Set TX queue parameters
 *
 * @libertas_set_mesh_channel: Only for backward compatibility for libertas,
 *        as it doesn't implement join_mesh and needs to set the channel to
 *        join the mesh instead.
 *
 * @set_monitor_channel: Set the monitor mode channel for the device. If other
 *        interfaces are active this callback should reject the configuration.
 *        If no interfaces are active or the device is down, the channel should
 *        be stored for when a monitor interface becomes active.
 *
 * @scan: Request to do a scan. If returning zero, the scan request is given
 *        to the driver, and will be valid until passed to cfg80211_scan_done().
 *        For scan results, call cfg80211_inform_bss(); you can call this outside
 *        the scan/scan_done bracket too.
 * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall
 *        indicate the status of the scan through cfg80211_scan_done().
 *
 * @auth: Request to authenticate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @assoc: Request to (re)associate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @deauth: Request to deauthenticate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @disassoc: Request to disassociate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 *
 * @connect: Connect to the ESS with the specified parameters. When connected,
 *        call cfg80211_connect_result()/cfg80211_connect_bss() with status code
 *        %WLAN_STATUS_SUCCESS. If the connection fails for some reason, call
 *        cfg80211_connect_result()/cfg80211_connect_bss() with the status code
 *        from the AP or cfg80211_connect_timeout() if no frame with status code
 *        was received.
 *        The driver is allowed to roam to other BSSes within the ESS when the
 *        other BSS matches the connect parameters. When such roaming is initiated
 *        by the driver, the driver is expected to verify that the target matches
 *        the configured security parameters and to use Reassociation Request
 *        frame instead of Association Request frame.
 *        The connect function can also be used to request the driver to perform a
 *        specific roam when connected to an ESS. In that case, the prev_bssid
 *        parameter is set to the BSSID of the currently associated BSS as an
 *        indication of requesting reassociation.
 *        In both the driver-initiated and new connect() call initiated roaming
 *        cases, the result of roaming is indicated with a call to
 *        cfg80211_roamed(). (invoked with the wireless_dev mutex held)
 * @update_connect_params: Update the connect parameters while connected to a
 *        BSS. The updated parameters can be used by driver/firmware for
 *        subsequent BSS selection (roaming) decisions and to form the
 *        Authentication/(Re)Association Request frames. This call does not
 *        request an immediate disassociation or reassociation with the current
 *        BSS, i.e., this impacts only subsequent (re)associations. The bits in
 *        changed are defined in &enum cfg80211_connect_params_changed.
 *        (invoked with the wireless_dev mutex held)
 * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
 *      connection is in progress. Once done, call cfg80211_disconnected() in
 *      case connection was already established (invoked with the
 *      wireless_dev mutex held), otherwise call cfg80211_connect_timeout().
 *
 * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call
 *        cfg80211_ibss_joined(), also call that function when changing BSSID due
 *        to a merge.
 *        (invoked with the wireless_dev mutex held)
 * @leave_ibss: Leave the IBSS.
 *        (invoked with the wireless_dev mutex held)
 *
 * @set_mcast_rate: Set the specified multicast rate (only if vif is in ADHOC or
 *        MESH mode)
 *
 * @set_wiphy_params: Notify that wiphy parameters have changed;
 *        @changed bitfield (see &enum wiphy_params_flags) describes which values
 *        have changed. The actual parameter values are available in
 *        struct wiphy. If returning an error, no value should be changed.
 *
 * @set_tx_power: set the transmit power according to the parameters,
 *        the power passed is in mBm, to get dBm use MBM_TO_DBM(). The
 *        wdev may be %NULL if power was set for the wiphy, and will
 *        always be %NULL unless the driver supports per-vif TX power
 *        (as advertised by the nl80211 feature flag.)
 * @get_tx_power: store the current TX power into the dbm variable;
 *        return 0 if successful
 *
 * @set_wds_peer: set the WDS peer for a WDS interface
 *
 * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
 *        functions to adjust rfkill hw state
 *
 * @dump_survey: get site survey information.
 *
 * @remain_on_channel: Request the driver to remain awake on the specified
 *        channel for the specified duration to complete an off-channel
 *        operation (e.g., public action frame exchange). When the driver is
 *        ready on the requested channel, it must indicate this with an event
 *        notification by calling cfg80211_ready_on_channel().
 * @cancel_remain_on_channel: Cancel an on-going remain-on-channel operation.
 *        This allows the operation to be terminated prior to timeout based on
 *        the duration value.
 * @mgmt_tx: Transmit a management frame.
 * @mgmt_tx_cancel_wait: Cancel the wait time from transmitting a management
 *        frame on another channel
 *
 * @testmode_cmd: run a test mode command; @wdev may be %NULL
 * @testmode_dump: Implement a test mode dump. The cb->args[2] and up may be
 *        used by the function, but 0 and 1 must not be touched. Additionally,
 *        return error codes other than -ENOBUFS and -ENOENT will terminate the
 *        dump and return to userspace with an error, so be careful. If any data
 *        was passed in from userspace then the data/len arguments will be present
 *        and point to the data contained in %NL80211_ATTR_TESTDATA.
 *
 * @set_bitrate_mask: set the bitrate mask configuration
 *
 * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac
 *        devices running firmwares capable of generating the (re) association
 *        RSN IE. It allows for faster roaming between WPA2 BSSIDs.
 * @del_pmksa: Delete a cached PMKID.
 * @flush_pmksa: Flush all cached PMKIDs.
 * @set_power_mgmt: Configure WLAN power management. A timeout value of -1
 *        allows the driver to adjust the dynamic ps timeout value.
 * @set_cqm_rssi_config: Configure connection quality monitor RSSI threshold.
 *        After configuration, the driver should (soon) send an event indicating
 *        the current level is above/below the configured threshold; this may
 *        need some care when the configuration is changed (without first being
 *        disabled.)
 * @set_cqm_rssi_range_config: Configure two RSSI thresholds in the
 *        connection quality monitor.  An event is to be sent only when the
 *        signal level is found to be outside the two values.  The driver should
 *        set %NL80211_EXT_FEATURE_CQM_RSSI_LIST if this method is implemented.
 *        If it is provided then there's no point providing @set_cqm_rssi_config.
 * @set_cqm_txe_config: Configure connection quality monitor TX error
 *        thresholds.
 * @sched_scan_start: Tell the driver to start a scheduled scan.
 * @sched_scan_stop: Tell the driver to stop an ongoing scheduled scan with
 *        given request id. This call must stop the scheduled scan and be ready
 *        for starting a new one before it returns, i.e. @sched_scan_start may be
 *        called immediately after that again and should not fail in that case.
 *        The driver should not call cfg80211_sched_scan_stopped() for a requested
 *        stop (when this method returns 0).
 *
 * @update_mgmt_frame_registrations: Notify the driver that management frame
 *        registrations were updated. The callback is allowed to sleep.
 *
 * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device.
 *        Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may
 *        reject TX/RX mask combinations they cannot support by returning -EINVAL
 *        (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX).
 *
 * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant).
 *
 * @tdls_mgmt: Transmit a TDLS management frame.
 * @tdls_oper: Perform a high-level TDLS operation (e.g. TDLS link setup).
 *
 * @probe_client: probe an associated client, must return a cookie that it
 *        later passes to cfg80211_probe_status().
 *
 * @set_noack_map: Set the NoAck Map for the TIDs.
 *
 * @get_channel: Get the current operating channel for the virtual interface.
 *        For monitor interfaces, it should return %NULL unless there's a single
 *        current monitoring channel.
 *
 * @start_p2p_device: Start the given P2P device.
 * @stop_p2p_device: Stop the given P2P device.
 *
 * @set_mac_acl: Sets MAC address control list in AP and P2P GO mode.
 *        Parameters include ACL policy, an array of MAC address of stations
 *        and the number of MAC addresses. If there is already a list in driver
 *        this new list replaces the existing one. Driver has to clear its ACL
 *        when number of MAC addresses entries is passed as 0. Drivers which
 *        advertise the support for MAC based ACL have to implement this callback.
 *
 * @start_radar_detection: Start radar detection in the driver.
 *
 * @end_cac: End running CAC, probably because a related CAC
 *        was finished on another phy.
 *
 * @update_ft_ies: Provide updated Fast BSS Transition information to the
 *        driver. If the SME is in the driver/firmware, this information can be
 *        used in building Authentication and Reassociation Request frames.
 *
 * @crit_proto_start: Indicates a critical protocol needs more link reliability
 *        for a given duration (milliseconds). The protocol is provided so the
 *        driver can take the most appropriate actions.
 * @crit_proto_stop: Indicates critical protocol no longer needs increased link
 *        reliability. This operation can not fail.
 * @set_coalesce: Set coalesce parameters.
 *
 * @channel_switch: initiate channel-switch procedure (with CSA). Driver is
 *        responsible for verifying if the switch is possible. Since this is
 *        inherently tricky, the driver may decide to disconnect an interface
 *        later with cfg80211_stop_iface(). This doesn't mean the driver can
 *        accept everything. It should do its best to verify requests and
 *        reject them as soon as possible.
 *
 * @set_qos_map: Set QoS mapping information to the driver
 *
 * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the
 *        given interface. This is used e.g. for dynamic HT 20/40 MHz channel
 *        width changes during the lifetime of the BSS.
 *
 * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device
 *        with the given parameters; action frame exchange has been handled by
 *        userspace so this just has to modify the TX path to take the TS into
 *        account.
 *        If the admitted time is 0 just validate the parameters to make sure
 *        the session can be created at all; it is valid to just always return
 *        success for that but that may result in inefficient behaviour (handshake
 *        with the peer followed by immediate teardown when the addition is later
 *        rejected)
 * @del_tx_ts: remove an existing TX TS
 *
 * @join_ocb: join the OCB network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_ocb: leave the current OCB network
 *        (invoked with the wireless_dev mutex held)
 *
 * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver
 *        is responsible for continually initiating channel-switching operations
 *        and returning to the base channel for communication with the AP.
 * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both
 *        peers must be on the base channel when the call completes.
 * @start_nan: Start the NAN interface.
 * @stop_nan: Stop the NAN interface.
 * @add_nan_func: Add a NAN function. Returns negative value on failure.
 *        On success @nan_func ownership is transferred to the driver and
 *        it may access it outside of the scope of this function. The driver
 *        should free the @nan_func when no longer needed by calling
 *        cfg80211_free_nan_func().
 *        On success the driver should assign an instance_id in the
 *        provided @nan_func.
 * @del_nan_func: Delete a NAN function.
 * @nan_change_conf: changes NAN configuration. The changed parameters must
 *        be specified in @changes (using &enum cfg80211_nan_conf_changes);
 *        All other parameters must be ignored.
 *
 * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
 *
 * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this
 *      function should return phy stats, and interface stats otherwise.
 *
 * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake.
 *        If not deleted through @del_pmk the PMK remains valid until disconnect
 *        upon which the driver should clear it.
 *        (invoked with the wireless_dev mutex held)
 * @del_pmk: delete the previously configured PMK for the given authenticator.
 *        (invoked with the wireless_dev mutex held)
 *
 * @external_auth: indicates result of offloaded authentication processing from
 *     user space
 *
 * @tx_control_port: TX a control port frame (EAPoL).  The noencrypt parameter
 *        tells the driver that the frame should not be encrypted.
 *
 * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
 *        Statistics should be cumulative, currently no way to reset is provided.
 * @start_pmsr: start peer measurement (e.g. FTM)
 * @abort_pmsr: abort peer measurement
 *
 * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
 *        but offloading OWE processing to the user space will get the updated
 *        DH IE through this interface.
 *
 * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame
 *        and overrule HWMP path selection algorithm.
 * @set_tid_config: TID specific configuration; this can be peer or BSS
 *        specific. This callback may sleep.
 * @reset_tid_config: Reset TID specific configuration for the peer, for the
 *        given TIDs. This callback may sleep.
 */
struct cfg80211_ops {
        int        (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
        int        (*resume)(struct wiphy *wiphy);
        void        (*set_wakeup)(struct wiphy *wiphy, bool enabled);

        struct wireless_dev * (*add_virtual_intf)(struct wiphy *wiphy,
                                                  const char *name,
                                                  unsigned char name_assign_type,
                                                  enum nl80211_iftype type,
                                                  struct vif_params *params);
        int        (*del_virtual_intf)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        int        (*change_virtual_intf)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       enum nl80211_iftype type,
                                       struct vif_params *params);

        int        (*add_key)(struct wiphy *wiphy, struct net_device *netdev,
                           u8 key_index, bool pairwise, const u8 *mac_addr,
                           struct key_params *params);
        int        (*get_key)(struct wiphy *wiphy, struct net_device *netdev,
                           u8 key_index, bool pairwise, const u8 *mac_addr,
                           void *cookie,
                           void (*callback)(void *cookie, struct key_params*));
        int        (*del_key)(struct wiphy *wiphy, struct net_device *netdev,
                           u8 key_index, bool pairwise, const u8 *mac_addr);
        int        (*set_default_key)(struct wiphy *wiphy,
                                   struct net_device *netdev,
                                   u8 key_index, bool unicast, bool multicast);
        int        (*set_default_mgmt_key)(struct wiphy *wiphy,
                                        struct net_device *netdev,
                                        u8 key_index);
        int        (*set_default_beacon_key)(struct wiphy *wiphy,
                                          struct net_device *netdev,
                                          u8 key_index);

        int        (*start_ap)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_ap_settings *settings);
        int        (*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_beacon_data *info);
        int        (*stop_ap)(struct wiphy *wiphy, struct net_device *dev);


        int        (*add_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac,
                               struct station_parameters *params);
        int        (*del_station)(struct wiphy *wiphy, struct net_device *dev,
                               struct station_del_parameters *params);
        int        (*change_station)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *mac,
                                  struct station_parameters *params);
        int        (*get_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac, struct station_info *sinfo);
        int        (*dump_station)(struct wiphy *wiphy, struct net_device *dev,
                                int idx, u8 *mac, struct station_info *sinfo);

        int        (*add_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst, const u8 *next_hop);
        int        (*del_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst);
        int        (*change_mpath)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *dst, const u8 *next_hop);
        int        (*get_mpath)(struct wiphy *wiphy, struct net_device *dev,
                             u8 *dst, u8 *next_hop, struct mpath_info *pinfo);
        int        (*dump_mpath)(struct wiphy *wiphy, struct net_device *dev,
                              int idx, u8 *dst, u8 *next_hop,
                              struct mpath_info *pinfo);
        int        (*get_mpp)(struct wiphy *wiphy, struct net_device *dev,
                           u8 *dst, u8 *mpp, struct mpath_info *pinfo);
        int        (*dump_mpp)(struct wiphy *wiphy, struct net_device *dev,
                            int idx, u8 *dst, u8 *mpp,
                            struct mpath_info *pinfo);
        int        (*get_mesh_config)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct mesh_config *conf);
        int        (*update_mesh_config)(struct wiphy *wiphy,
                                      struct net_device *dev, u32 mask,
                                      const struct mesh_config *nconf);
        int        (*join_mesh)(struct wiphy *wiphy, struct net_device *dev,
                             const struct mesh_config *conf,
                             const struct mesh_setup *setup);
        int        (*leave_mesh)(struct wiphy *wiphy, struct net_device *dev);

        int        (*join_ocb)(struct wiphy *wiphy, struct net_device *dev,
                            struct ocb_setup *setup);
        int        (*leave_ocb)(struct wiphy *wiphy, struct net_device *dev);

        int        (*change_bss)(struct wiphy *wiphy, struct net_device *dev,
                              struct bss_parameters *params);

        int        (*set_txq_params)(struct wiphy *wiphy, struct net_device *dev,
                                  struct ieee80211_txq_params *params);

        int        (*libertas_set_mesh_channel)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             struct ieee80211_channel *chan);

        int        (*set_monitor_channel)(struct wiphy *wiphy,
                                       struct cfg80211_chan_def *chandef);

        int        (*scan)(struct wiphy *wiphy,
                        struct cfg80211_scan_request *request);
        void        (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev);

        int        (*auth)(struct wiphy *wiphy, struct net_device *dev,
                        struct cfg80211_auth_request *req);
        int        (*assoc)(struct wiphy *wiphy, struct net_device *dev,
                         struct cfg80211_assoc_request *req);
        int        (*deauth)(struct wiphy *wiphy, struct net_device *dev,
                          struct cfg80211_deauth_request *req);
        int        (*disassoc)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_disassoc_request *req);

        int        (*connect)(struct wiphy *wiphy, struct net_device *dev,
                           struct cfg80211_connect_params *sme);
        int        (*update_connect_params)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_connect_params *sme,
                                         u32 changed);
        int        (*disconnect)(struct wiphy *wiphy, struct net_device *dev,
                              u16 reason_code);

        int        (*join_ibss)(struct wiphy *wiphy, struct net_device *dev,
                             struct cfg80211_ibss_params *params);
        int        (*leave_ibss)(struct wiphy *wiphy, struct net_device *dev);

        int        (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
                                  int rate[NUM_NL80211_BANDS]);

        int        (*set_wiphy_params)(struct wiphy *wiphy, u32 changed);

        int        (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                enum nl80211_tx_power_setting type, int mbm);
        int        (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                int *dbm);

        int        (*set_wds_peer)(struct wiphy *wiphy, struct net_device *dev,
                                const u8 *addr);

        void        (*rfkill_poll)(struct wiphy *wiphy);

#ifdef CONFIG_NL80211_TESTMODE
        int        (*testmode_cmd)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                void *data, int len);
        int        (*testmode_dump)(struct wiphy *wiphy, struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 void *data, int len);
#endif

        int        (*set_bitrate_mask)(struct wiphy *wiphy,
                                    struct net_device *dev,
                                    const u8 *peer,
                                    const struct cfg80211_bitrate_mask *mask);

        int        (*dump_survey)(struct wiphy *wiphy, struct net_device *netdev,
                        int idx, struct survey_info *info);

        int        (*set_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*del_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev);

        int        (*remain_on_channel)(struct wiphy *wiphy,
                                     struct wireless_dev *wdev,
                                     struct ieee80211_channel *chan,
                                     unsigned int duration,
                                     u64 *cookie);
        int        (*cancel_remain_on_channel)(struct wiphy *wiphy,
                                            struct wireless_dev *wdev,
                                            u64 cookie);

        int        (*mgmt_tx)(struct wiphy *wiphy, struct wireless_dev *wdev,
                           struct cfg80211_mgmt_tx_params *params,
                           u64 *cookie);
        int        (*mgmt_tx_cancel_wait)(struct wiphy *wiphy,
                                       struct wireless_dev *wdev,
                                       u64 cookie);

        int        (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                                  bool enabled, int timeout);

        int        (*set_cqm_rssi_config)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       s32 rssi_thold, u32 rssi_hyst);

        int        (*set_cqm_rssi_range_config)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             s32 rssi_low, s32 rssi_high);

        int        (*set_cqm_txe_config)(struct wiphy *wiphy,
                                      struct net_device *dev,
                                      u32 rate, u32 pkts, u32 intvl);

        void        (*update_mgmt_frame_registrations)(struct wiphy *wiphy,
                                                   struct wireless_dev *wdev,
                                                   struct mgmt_frame_regs *upd);

        int        (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant);
        int        (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant);

        int        (*sched_scan_start)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_sched_scan_request *request);
        int        (*sched_scan_stop)(struct wiphy *wiphy, struct net_device *dev,
                                   u64 reqid);

        int        (*set_rekey_data)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_gtk_rekey_data *data);

        int        (*tdls_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, u8 action_code,  u8 dialog_token,
                             u16 status_code, u32 peer_capability,
                             bool initiator, const u8 *buf, size_t len);
        int        (*tdls_oper)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, enum nl80211_tdls_operation oper);

        int        (*probe_client)(struct wiphy *wiphy, struct net_device *dev,
                                const u8 *peer, u64 *cookie);

        int        (*set_noack_map)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  u16 noack_map);

        int        (*get_channel)(struct wiphy *wiphy,
                               struct wireless_dev *wdev,
                               struct cfg80211_chan_def *chandef);

        int        (*start_p2p_device)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        void        (*stop_p2p_device)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);

        int        (*set_mac_acl)(struct wiphy *wiphy, struct net_device *dev,
                               const struct cfg80211_acl_data *params);

        int        (*start_radar_detection)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_chan_def *chandef,
                                         u32 cac_time_ms);
        void        (*end_cac)(struct wiphy *wiphy,
                                struct net_device *dev);
        int        (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_update_ft_ies_params *ftie);
        int        (*crit_proto_start)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev,
                                    enum nl80211_crit_proto_id protocol,
                                    u16 duration);
        void        (*crit_proto_stop)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);
        int        (*set_coalesce)(struct wiphy *wiphy,
                                struct cfg80211_coalesce *coalesce);

        int        (*channel_switch)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  struct cfg80211_csa_settings *params);

        int     (*set_qos_map)(struct wiphy *wiphy,
                               struct net_device *dev,
                               struct cfg80211_qos_map *qos_map);

        int        (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev,
                                    struct cfg80211_chan_def *chandef);

        int        (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer, u8 user_prio,
                             u16 admitted_time);
        int        (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer);

        int        (*tdls_channel_switch)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       const u8 *addr, u8 oper_class,
                                       struct cfg80211_chan_def *chandef);
        void        (*tdls_cancel_channel_switch)(struct wiphy *wiphy,
                                              struct net_device *dev,
                                              const u8 *addr);
        int        (*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev,
                             struct cfg80211_nan_conf *conf);
        void        (*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev);
        int        (*add_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                struct cfg80211_nan_func *nan_func);
        void        (*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                               u64 cookie);
        int        (*nan_change_conf)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev,
                                   struct cfg80211_nan_conf *conf,
                                   u32 changes);

        int        (*set_multicast_to_unicast)(struct wiphy *wiphy,
                                            struct net_device *dev,
                                            const bool enabled);

        int        (*get_txq_stats)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 struct cfg80211_txq_stats *txqstats);

        int        (*set_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const struct cfg80211_pmk_conf *conf);
        int        (*del_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const u8 *aa);
        int     (*external_auth)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_external_auth_params *params);

        int        (*tx_control_port)(struct wiphy *wiphy,
                                   struct net_device *dev,
                                   const u8 *buf, size_t len,
                                   const u8 *dest, const __be16 proto,
                                   const bool noencrypt,
                                   u64 *cookie);

        int        (*get_ftm_responder_stats)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_ftm_responder_stats *ftm_stats);

        int        (*start_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        void        (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        int        (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
                                   struct cfg80211_update_owe_info *owe_info);
        int        (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev,
                                   const u8 *buf, size_t len);
        int     (*set_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_tid_config *tid_conf);
        int        (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                    const u8 *peer, u8 tids);
};

/*
 * wireless hardware and networking interfaces structures
 * and registration/helper functions
 */

/**
 * enum wiphy_flags - wiphy capability flags
 *
 * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split
 *         into two, first for legacy bands and second for UHB.
 * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this
 *        wiphy at all
 * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled
 *        by default -- this flag will be set depending on the kernel's default
 *        on wiphy_new(), but can be changed by the driver if it has a good
 *        reason to override the default
 * @WIPHY_FLAG_4ADDR_AP: supports 4addr mode even on AP (with a single station
 *        on a VLAN interface). This flag also serves an extra purpose of
 *        supporting 4ADDR AP mode on devices which do not support AP/VLAN iftype.
 * @WIPHY_FLAG_4ADDR_STATION: supports 4addr mode even as a station
 * @WIPHY_FLAG_CONTROL_PORT_PROTOCOL: This device supports setting the
 *        control port protocol ethertype. The device also honours the
 *        control_port_no_encrypt flag.
 * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN.
 * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing
 *        auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH.
 * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the
 *        firmware.
 * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP.
 * @WIPHY_FLAG_SUPPORTS_TDLS: The device supports TDLS (802.11z) operation.
 * @WIPHY_FLAG_TDLS_EXTERNAL_SETUP: The device does not handle TDLS (802.11z)
 *        link setup/discovery operations internally. Setup, discovery and
 *        teardown packets should be sent through the @NL80211_CMD_TDLS_MGMT
 *        command. When this flag is not set, @NL80211_CMD_TDLS_OPER should be
 *        used for asking the driver/firmware to perform a TDLS operation.
 * @WIPHY_FLAG_HAVE_AP_SME: device integrates AP SME
 * @WIPHY_FLAG_REPORTS_OBSS: the device will report beacons from other BSSes
 *        when there are virtual interfaces in AP mode by calling
 *        cfg80211_report_obss_beacon().
 * @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD: When operating as an AP, the device
 *        responds to probe-requests in hardware.
 * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
 * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
 * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
 * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in
 *        beaconing mode (AP, IBSS, Mesh, ...).
 * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation
 *        before connection.
 * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys
 */
enum wiphy_flags {
        WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK                = BIT(0),
        /* use hole at 1 */
        WIPHY_FLAG_SPLIT_SCAN_6GHZ                = BIT(2),
        WIPHY_FLAG_NETNS_OK                        = BIT(3),
        WIPHY_FLAG_PS_ON_BY_DEFAULT                = BIT(4),
        WIPHY_FLAG_4ADDR_AP                        = BIT(5),
        WIPHY_FLAG_4ADDR_STATION                = BIT(6),
        WIPHY_FLAG_CONTROL_PORT_PROTOCOL        = BIT(7),
        WIPHY_FLAG_IBSS_RSN                        = BIT(8),
        /* use hole at 9 */
        WIPHY_FLAG_MESH_AUTH                        = BIT(10),
        /* use hole at 11 */
        /* use hole at 12 */
        WIPHY_FLAG_SUPPORTS_FW_ROAM                = BIT(13),
        WIPHY_FLAG_AP_UAPSD                        = BIT(14),
        WIPHY_FLAG_SUPPORTS_TDLS                = BIT(15),
        WIPHY_FLAG_TDLS_EXTERNAL_SETUP                = BIT(16),
        WIPHY_FLAG_HAVE_AP_SME                        = BIT(17),
        WIPHY_FLAG_REPORTS_OBSS                        = BIT(18),
        WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD        = BIT(19),
        WIPHY_FLAG_OFFCHAN_TX                        = BIT(20),
        WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL        = BIT(21),
        WIPHY_FLAG_SUPPORTS_5_10_MHZ                = BIT(22),
        WIPHY_FLAG_HAS_CHANNEL_SWITCH                = BIT(23),
        WIPHY_FLAG_HAS_STATIC_WEP                = BIT(24),
};

/**
 * struct ieee80211_iface_limit - limit on certain interface types
 * @max: maximum number of interfaces of these types
 * @types: interface types this limit applies to, as a bitmask built from
 *        BIT(NL80211_IFTYPE_*) values, e.g.
 *        BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO)
 *
 * An array of these limits is referenced from
 * &struct ieee80211_iface_combination.@limits.
 */
struct ieee80211_iface_limit {
        u16 max;
        u16 types;
};

/**
 * struct ieee80211_iface_combination - possible interface combination
 *
 * With this structure the driver can describe which interface
 * combinations it supports concurrently.
 *
 * Examples:
 *
 * 1. Allow #STA <= 1, #AP <= 1, matching BI, channels = 1, 2 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits1[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_AP), },
 *        };
 *        struct ieee80211_iface_combination combination1 = {
 *                .limits = limits1,
 *                .n_limits = ARRAY_SIZE(limits1),
 *                .max_interfaces = 2,
 *                .num_different_channels = 1,
 *                .beacon_int_infra_match = true,
 *        };
 *
 *
 * 2. Allow #{AP, P2P-GO} <= 8, channels = 1, 8 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits2[] = {
 *                { .max = 8, .types = BIT(NL80211_IFTYPE_AP) |
 *                                     BIT(NL80211_IFTYPE_P2P_GO), },
 *        };
 *        struct ieee80211_iface_combination combination2 = {
 *                .limits = limits2,
 *                .n_limits = ARRAY_SIZE(limits2),
 *                .max_interfaces = 8,
 *                .num_different_channels = 1,
 *        };
 *
 *
 * 3. Allow #STA <= 1, #{P2P-client,P2P-GO} <= 3 on two channels, 4 total.
 *
 *    This allows for an infrastructure connection and three P2P connections.
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits3[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 3, .types = BIT(NL80211_IFTYPE_P2P_GO) |
 *                                     BIT(NL80211_IFTYPE_P2P_CLIENT), },
 *        };
 *        struct ieee80211_iface_combination combination3 = {
 *                .limits = limits3,
 *                .n_limits = ARRAY_SIZE(limits3),
 *                .max_interfaces = 4,
 *                .num_different_channels = 2,
 *        };
 *
 */
struct ieee80211_iface_combination {
        /**
         * @limits:
         * limits for the given interface types; points to an array of
         * @n_limits entries
         */
        const struct ieee80211_iface_limit *limits;

        /**
         * @num_different_channels:
         * can use up to this many different channels
         */
        u32 num_different_channels;

        /**
         * @max_interfaces:
         * maximum number of interfaces in total allowed in this group
         */
        u16 max_interfaces;

        /**
         * @n_limits:
         * number of limitations, i.e. number of entries in @limits
         */
        u8 n_limits;

        /**
         * @beacon_int_infra_match:
         * In this combination, the beacon intervals between infrastructure
         * and AP types must match. This is required only in special cases.
         */
        bool beacon_int_infra_match;

        /**
         * @radar_detect_widths:
         * bitmap of channel widths supported for radar detection
         */
        u8 radar_detect_widths;

        /**
         * @radar_detect_regions:
         * bitmap of regions supported for radar detection
         */
        u8 radar_detect_regions;

        /**
         * @beacon_int_min_gcd:
         * This interface combination supports different beacon intervals.
         *
         * = 0
         *   all beacon intervals for different interfaces must be the same.
         * > 0
         *   any beacon interval for an interface that is part of this
         *   combination AND the GCD of all beacon intervals from beaconing
         *   interfaces of this combination must be greater than or equal to
         *   this value.
         */
        u32 beacon_int_min_gcd;
};

/*
 * struct ieee80211_txrx_stypes - a pair of 16-bit values, one for the
 * transmit direction (tx) and one for the receive direction (rx).
 *
 * NOTE(review): going by the name, each value is presumably a bitmask of
 * frame subtypes -- confirm against the users of this structure.
 */
struct ieee80211_txrx_stypes {
        u16 tx;
        u16 rx;
};

/**
 * enum wiphy_wowlan_support_flags - WoWLAN support flags
 * @WIPHY_WOWLAN_ANY: supports wakeup for the special "any"
 *        trigger that keeps the device operating as-is and
 *        wakes up the host on any activity, for example a
 *        received packet that passed filtering; note that the
 *        packet should be preserved in that case
 * @WIPHY_WOWLAN_MAGIC_PKT: supports wakeup on magic packet
 *        (see nl80211.h)
 * @WIPHY_WOWLAN_DISCONNECT: supports wakeup on disconnect
 * @WIPHY_WOWLAN_SUPPORTS_GTK_REKEY: supports GTK rekeying while asleep
 * @WIPHY_WOWLAN_GTK_REKEY_FAILURE: supports wakeup on GTK rekey failure
 * @WIPHY_WOWLAN_EAP_IDENTITY_REQ: supports wakeup on EAP identity request
 * @WIPHY_WOWLAN_4WAY_HANDSHAKE: supports wakeup on 4-way handshake failure
 * @WIPHY_WOWLAN_RFKILL_RELEASE: supports wakeup on RF-kill release
 * @WIPHY_WOWLAN_NET_DETECT: supports wakeup on network detection
 *
 * Each value is a single bit; &struct wiphy_wowlan_support.@flags refers
 * to this enumeration.
 */
enum wiphy_wowlan_support_flags {
        WIPHY_WOWLAN_ANY                = BIT(0),
        WIPHY_WOWLAN_MAGIC_PKT                = BIT(1),
        WIPHY_WOWLAN_DISCONNECT                = BIT(2),
        WIPHY_WOWLAN_SUPPORTS_GTK_REKEY        = BIT(3),
        WIPHY_WOWLAN_GTK_REKEY_FAILURE        = BIT(4),
        WIPHY_WOWLAN_EAP_IDENTITY_REQ        = BIT(5),
        WIPHY_WOWLAN_4WAY_HANDSHAKE        = BIT(6),
        WIPHY_WOWLAN_RFKILL_RELEASE        = BIT(7),
        WIPHY_WOWLAN_NET_DETECT                = BIT(8),
};

/*
 * struct wiphy_wowlan_tcp_support - TCP wakeup support information
 *
 * Referenced through &struct wiphy_wowlan_support.@tcp.
 *
 * NOTE(review): the members are not documented in this header. Going by
 * their names, @tok describes a data token feature, the *_max members are
 * upper limits (data payload, data interval, wake payload) and @seq is a
 * boolean capability -- confirm exact meaning and units against nl80211.h.
 */
struct wiphy_wowlan_tcp_support {
        const struct nl80211_wowlan_tcp_data_token_feature *tok;
        u32 data_payload_max;
        u32 data_interval_max;
        u32 wake_payload_max;
        bool seq;
};

/**
 * struct wiphy_wowlan_support - WoWLAN support data
 * @flags: bitmask of &enum wiphy_wowlan_support_flags values
 * @n_patterns: number of supported wakeup patterns
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 * @max_nd_match_sets: maximum number of matchsets for net-detect,
 *        similar, but not necessarily identical, to max_match_sets for
 *        scheduled scans.
 *        See &struct cfg80211_sched_scan_request.@match_sets for more
 *        details.
 * @tcp: TCP wakeup support information, see
 *        &struct wiphy_wowlan_tcp_support
 */
struct wiphy_wowlan_support {
        u32 flags;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
        int max_nd_match_sets;
        const struct wiphy_wowlan_tcp_support *tcp;
};

/**
 * struct wiphy_coalesce_support - coalesce support data
 * @n_rules: maximum number of coalesce rules
 * @max_delay: maximum supported coalescing delay in msecs
 * @n_patterns: number of supported patterns in a rule
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 *
 * NOTE(review): the units of the pattern lengths and of the packet offset
 * are not stated here (presumably bytes) -- confirm against nl80211.h.
 */
struct wiphy_coalesce_support {
        int n_rules;
        int max_delay;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
};

/**
 * enum wiphy_vendor_command_flags - validation flags for vendor commands
 * @WIPHY_VENDOR_CMD_NEED_WDEV: vendor command requires wdev
 * @WIPHY_VENDOR_CMD_NEED_NETDEV: vendor command requires netdev
 * @WIPHY_VENDOR_CMD_NEED_RUNNING: interface/wdev must be up & running
 *        (must be combined with %WIPHY_VENDOR_CMD_NEED_WDEV or
 *        %WIPHY_VENDOR_CMD_NEED_NETDEV)
 *
 * These values are used in &struct wiphy_vendor_command.@flags.
 */
enum wiphy_vendor_command_flags {
        WIPHY_VENDOR_CMD_NEED_WDEV = BIT(0),
        WIPHY_VENDOR_CMD_NEED_NETDEV = BIT(1),
        WIPHY_VENDOR_CMD_NEED_RUNNING = BIT(2),
};

/**
 * enum wiphy_opmode_flag - Station's ht/vht operation mode information flags
 *
 * @STA_OPMODE_MAX_BW_CHANGED: Max Bandwidth changed
 * @STA_OPMODE_SMPS_MODE_CHANGED: SMPS mode changed
 * @STA_OPMODE_N_SS_CHANGED: max N_SS (number of spatial streams) changed
 *
 * These values are carried in &struct sta_opmode_info.@changed.
 */
enum wiphy_opmode_flag {
        STA_OPMODE_MAX_BW_CHANGED        = BIT(0),
        STA_OPMODE_SMPS_MODE_CHANGED        = BIT(1),
        STA_OPMODE_N_SS_CHANGED                = BIT(2),
};

/**
 * struct sta_opmode_info - Station's ht/vht operation mode information
 * @changed: contains value from &enum wiphy_opmode_flag
 * @smps_mode: New SMPS mode value from &enum nl80211_smps_mode of a station
 * @bw: new max bandwidth value from &enum nl80211_chan_width of a station
 * @rx_nss: new rx_nss value of a station
 */
struct sta_opmode_info {
        u32 changed;
        enum nl80211_smps_mode smps_mode;
        enum nl80211_chan_width bw;
        u8 rx_nss;
};

/*
 * Sentinel value for &struct wiphy_vendor_command.@policy: the error code
 * -ENODATA cast to a policy pointer. It is meant for vendor commands for
 * which no policy can be given because the attribute is just raw data.
 */
#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)(long)(-ENODATA))

/**
 * struct wiphy_vendor_command - vendor command definition
 * @info: vendor command identifying information, as used in nl80211
 * @flags: flags, see &enum wiphy_vendor_command_flags
 * @doit: callback for the operation, note that wdev is %NULL if the
 *        flags didn't ask for a wdev and non-%NULL otherwise; the data
 *        pointer may be %NULL if userspace provided no data at all
 * @dumpit: dump callback, for transferring bigger/multiple items. The
 *        @storage points to cb->args[5], i.e. it is preserved over the
 *        multiple dumpit calls.
 * @policy: policy pointer for attributes within %NL80211_ATTR_VENDOR_DATA.
 *        Set this to %VENDOR_CMD_RAW_DATA if no policy can be given and the
 *        attribute is just raw data (e.g. a firmware command).
 * @maxattr: highest attribute number in policy
 *
 * It's recommended to not have the same sub command with both @doit and
 * @dumpit, so that userspace can assume certain ones are get and others
 * are used with dump requests.
 */
struct wiphy_vendor_command {
        struct nl80211_vendor_cmd_info info;
        u32 flags;
        int (*doit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                    const void *data, int data_len);
        int (*dumpit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                      struct sk_buff *skb, const void *data, int data_len,
                      unsigned long *storage);
        const struct nla_policy *policy;
        unsigned int maxattr;
};

/**
 * struct wiphy_iftype_ext_capab - extended capabilities per interface type
 * @iftype: interface type these capabilities apply to
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are the
 *        802.11 extended capabilities ("Extended Capabilities element") and are
 *        in the same format as in the information element. See IEEE Std
 *        802.11-2012 8.4.2.29 for the defined fields.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 */
struct wiphy_iftype_ext_capab {
        enum nl80211_iftype iftype;
        const u8 *extended_capabilities;
        const u8 *extended_capabilities_mask;
        u8 extended_capabilities_len;
};

/**
 * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities
 * @max_peers: maximum number of peers in a single measurement
 * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement
 * @randomize_mac_addr: can randomize MAC address for measurement
 * @ftm: FTM measurement capabilities
 * @ftm.supported: FTM measurement is supported
 * @ftm.asap: ASAP-mode is supported
 * @ftm.non_asap: non-ASAP-mode is supported
 * @ftm.request_lci: can request LCI data
 * @ftm.request_civicloc: can request civic location data
 * @ftm.preambles: bitmap of preambles supported (&enum nl80211_preamble)
 * @ftm.bandwidths: bitmap of bandwidths supported (&enum nl80211_chan_width)
 * @ftm.max_bursts_exponent: maximum burst exponent supported
 *        (set to -1 if not limited; note that setting this will necessarily
 *        forbid using the value 15 to let the responder pick)
 * @ftm.max_ftms_per_burst: maximum FTMs per burst supported (set to 0 if
 *        not limited)
 * @ftm.trigger_based: trigger based ranging measurement is supported
 * @ftm.non_trigger_based: non trigger based ranging measurement is supported
 */
struct cfg80211_pmsr_capabilities {
        unsigned int max_peers;
        u8 report_ap_tsf:1,
           randomize_mac_addr:1;

        struct {
                u32 preambles;
                u32 bandwidths;
                s8 max_bursts_exponent;
                u8 max_ftms_per_burst;
                u8 supported:1,
                   asap:1,
                   non_asap:1,
                   request_lci:1,
                   request_civicloc:1,
                   trigger_based:1,
                   non_trigger_based:1;
        } ftm;
};

/**
 * struct wiphy_iftype_akm_suites - supported AKM suites per interface type
 *
 * @iftypes_mask: bitmask of interfaces types
 * @akm_suites: points to an array of supported akm suites
 * @n_akm_suites: number of supported AKM suites
 *
 * This structure encapsulates the supported AKM suites for the interface
 * types defined in @iftypes_mask. Each type in the @iftypes_mask must be
 * unique across all instances of iftype_akm_suites.
 */
struct wiphy_iftype_akm_suites {
        u16 iftypes_mask;
        const u32 *akm_suites;
        int n_akm_suites;
};

/**
 * struct wiphy - wireless hardware description
 * @reg_notifier: the driver's regulatory notification callback,
 *        note that if your driver uses wiphy_apply_custom_regulatory()
 *        the reg_notifier's request can be passed as NULL
 * @regd: the driver's regulatory domain, if one was requested via
 *        the regulatory_hint() API. This can be used by the driver
 *        on the reg_notifier() if it chooses to ignore future
 *        regulatory domain changes caused by other drivers.
 * @signal_type: signal type reported in &struct cfg80211_bss.
 * @cipher_suites: supported cipher suites
 * @n_cipher_suites: number of supported cipher suites
 * @akm_suites: supported AKM suites. These are the default AKMs supported if
 *        the supported AKMs are not advertised for a specific interface type
 *        in iftype_akm_suites.
 * @n_akm_suites: number of supported AKM suites
 * @iftype_akm_suites: array of supported akm suites info per interface type.
 *        Note that the bits in @iftypes_mask inside this structure cannot
 *        overlap (i.e. only one occurrence of each type is allowed across all
 *        instances of iftype_akm_suites).
 * @num_iftype_akm_suites: number of interface types for which supported akm
 *        suites are specified separately.
 * @retry_short: Retry limit for short frames (dot11ShortRetryLimit)
 * @retry_long: Retry limit for long frames (dot11LongRetryLimit)
 * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold);
 *        -1 = fragmentation disabled, only odd values >= 256 used
 * @rts_threshold: RTS threshold (dot11RTSThreshold); -1 = RTS/CTS disabled
 * @_net: the network namespace this wiphy currently lives in
 * @perm_addr: permanent MAC address of this device
 * @addr_mask: If the device supports multiple MAC addresses by masking,
 *        set this to a mask with variable bits set to 1, e.g. if the last
 *        four bits are variable then set it to 00-00-00-00-00-0f. The actual
 *        variable bits shall be determined by the interfaces added, with
 *        interfaces not matching the mask being rejected to be brought up.
 * @n_addresses: number of addresses in @addresses.
 * @addresses: If the device has more than one address, set this pointer
 *        to a list of addresses (6 bytes each). The first one will be used
 *        by default for perm_addr. In this case, the mask should be set to
 *        all-zeroes. In this case it is assumed that the device can handle
 *        the same number of arbitrary MAC addresses.
 * @registered: protects ->resume and ->suspend sysfs callbacks against
 *        unregister hardware
 * @debugfsdir: debugfs directory used for this wiphy (ieee80211/<wiphyname>).
 *        It will be renamed automatically on wiphy renames
 * @dev: (virtual) struct device for this wiphy. The item in
 *        /sys/class/ieee80211/ points to this. You need to use
 *        set_wiphy_dev() (see below).
 * @wext: wireless extension handlers
 * @priv: driver private data (sized according to wiphy_new() parameter)
 * @interface_modes: bitmask of interfaces types valid for this wiphy,
 *        must be set by driver
 * @iface_combinations: Valid interface combinations array, should not
 *        list single interface types.
 * @n_iface_combinations: number of entries in @iface_combinations array.
 * @software_iftypes: bitmask of software interface types, these are not
 *        subject to any restrictions since they are purely managed in SW.
 * @flags: wiphy flags, see &enum wiphy_flags
 * @regulatory_flags: wiphy regulatory flags, see
 *        &enum ieee80211_regulatory_flags
 * @features: features advertised to nl80211, see &enum nl80211_feature_flags.
 * @ext_features: extended features advertised to nl80211, see
 *        &enum nl80211_ext_feature_index.
 * @bss_priv_size: each BSS struct has private data allocated with it,
 *        this variable determines its size
 * @max_scan_ssids: maximum number of SSIDs the device can scan for in
 *        any given scan
 * @max_sched_scan_reqs: maximum number of scheduled scan requests that
 *        the device can run concurrently.
 * @max_sched_scan_ssids: maximum number of SSIDs the device can scan
 *        for in any given scheduled scan
 * @max_match_sets: maximum number of match sets the device can handle
 *        when performing a scheduled scan, 0 if filtering is not
 *        supported.
 * @max_scan_ie_len: maximum length of user-controlled IEs device can
 *        add to probe request frames transmitted during a scan, must not
 *        include fixed IEs like supported rates
 * @max_sched_scan_ie_len: same as max_scan_ie_len, but for scheduled
 *        scans
 * @max_sched_scan_plans: maximum number of scan plans (scan interval and number
 *        of iterations) for scheduled scan supported by the device.
 * @max_sched_scan_plan_interval: maximum interval (in seconds) for a
 *        single scan plan supported by the device.
 * @max_sched_scan_plan_iterations: maximum number of iterations for a single
 *        scan plan supported by the device.
 * @coverage_class: current coverage class
 * @fw_version: firmware version for ethtool reporting
 * @hw_version: hardware version for ethtool reporting
 * @max_num_pmkids: maximum number of PMKIDs supported by device
 * @privid: a pointer that drivers can use to identify if an arbitrary
 *        wiphy is theirs, e.g. in global notifiers
 * @bands: information about bands/channels supported by this device
 *
 * @mgmt_stypes: bitmasks of frame subtypes that can be subscribed to or
 *        transmitted through nl80211, points to an array indexed by interface
 *        type
 *
 * @available_antennas_tx: bitmap of antennas which are available to be
 *        configured as TX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_rx is set.
 *
 * @available_antennas_rx: bitmap of antennas which are available to be
 *        configured as RX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_tx is set.
 *
 * @probe_resp_offload:
 *         Bitmap of supported protocols for probe response offloading.
 *         See &enum nl80211_probe_resp_offload_support_attr. Only valid
 *         when the wiphy flag @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD is set.
 *
 * @max_remain_on_channel_duration: Maximum time a remain-on-channel operation
 *        may request, if implemented.
 *
 * @wowlan: WoWLAN support information
 * @wowlan_config: current WoWLAN configuration; this should usually not be
 *        used since access to it is necessarily racy, use the parameter passed
 *        to the suspend() operation instead.
 *
 * @ap_sme_capa: AP SME capabilities, flags from &enum nl80211_ap_sme_features.
 * @ht_capa_mod_mask:  Specify what ht_cap values can be over-ridden.
 *        If null, then none can be over-ridden.
 * @vht_capa_mod_mask:  Specify what VHT capabilities can be over-ridden.
 *        If null, then none can be over-ridden.
 *
 * @wdev_list: the list of associated (virtual) interfaces; this list must
 *        not be modified by the driver, but can be read with RTNL/RCU protection.
 *
 * @max_acl_mac_addrs: Maximum number of MAC addresses that the device
 *        supports for ACL.
 *
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are
 *        the 802.11 extended capabilities ("Extended Capabilities element")
 *        and are in the same format as in the information element. See
 *        802.11-2012 8.4.2.29 for the defined fields. These are the default
 *        extended capabilities to be used if the capabilities are not specified
 *        for a specific interface type in iftype_ext_capab.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 * @iftype_ext_capab: array of extended capabilities per interface type
 * @num_iftype_ext_capab: number of interface types for which extended
 *        capabilities are specified separately.
 * @coalesce: packet coalescing support information
 *
 * @vendor_commands: array of vendor commands supported by the hardware
 * @n_vendor_commands: number of vendor commands
 * @vendor_events: array of vendor events supported by the hardware
 * @n_vendor_events: number of vendor events
 *
 * @max_ap_assoc_sta: maximum number of associated stations supported in AP mode
 *        (including P2P GO) or 0 to indicate no such limit is advertised. The
 *        driver is allowed to advertise a theoretical limit that it can reach in
 *        some cases, but may not always reach.
 *
 * @max_num_csa_counters: Number of supported csa_counters in beacons
 *        and probe responses.  This value should be set if the driver
 *        wishes to limit the number of csa counters. Default (0) means
 *        infinite.
 * @bss_select_support: bitmask indicating the BSS selection criteria supported
 *        by the driver in the .connect() callback. The bit position maps to the
 *        attribute indices defined in &enum nl80211_bss_select_attr.
 *
 * @nan_supported_bands: bands supported by the device in NAN mode, a
 *        bitmap of &enum nl80211_band values.  For instance, for
 *        NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 *
 * @txq_limit: configuration of internal TX queue frame limit
 * @txq_memory_limit: configuration internal TX queue memory limit
 * @txq_quantum: configuration of internal TX queue scheduler quantum
 *
 * @tx_queue_len: allow setting transmit queue len for drivers not using
 *        wake_tx_queue
 *
 * @support_mbssid: can HW support association with nontransmitted AP
 * @support_only_he_mbssid: don't parse MBSSID elements if it is not
 *        HE AP, in order to avoid compatibility issues.
 *        @support_mbssid must be set for this to have any effect.
 *
 * @pmsr_capa: peer measurement capabilities
 *
 * @tid_config_support: describes the per-TID config support that the
 *        device has
 * @tid_config_support.vif: bitmap of attributes (configurations)
 *        supported by the driver for each vif
 * @tid_config_support.peer: bitmap of attributes (configurations)
 *        supported by the driver for each peer
 * @tid_config_support.max_retry: maximum supported retry count for
 *        long/short retry configuration
 *
 * @max_data_retry_count: maximum supported per TID retry count for
 *        configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and
 *        %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes
 */
struct wiphy {
        /* assign these fields before you register the wiphy */

        /* permanent MAC address and mask of the variable address bits */
        u8 perm_addr[ETH_ALEN];
        u8 addr_mask[ETH_ALEN];

        /* optional address list, @n_addresses entries long */
        struct mac_address *addresses;

        /* per-interface-type management frame subtype bitmasks */
        const struct ieee80211_txrx_stypes *mgmt_stypes;

        /* valid interface combinations and number of entries */
        const struct ieee80211_iface_combination *iface_combinations;
        int n_iface_combinations;
        u16 software_iftypes;

        u16 n_addresses;

        /* Supported interface modes, OR together BIT(NL80211_IFTYPE_...) */
        u16 interface_modes;

        u16 max_acl_mac_addrs;

        /* flag/feature words, see the enums named in the kerneldoc above */
        u32 flags, regulatory_flags, features;
        /* one bit per extended feature, rounded up to whole bytes */
        u8 ext_features[DIV_ROUND_UP(NUM_NL80211_EXT_FEATURES, 8)];

        u32 ap_sme_capa;

        enum cfg80211_signal_type signal_type;

        /* scan and scheduled-scan limits */
        int bss_priv_size;
        u8 max_scan_ssids;
        u8 max_sched_scan_reqs;
        u8 max_sched_scan_ssids;
        u8 max_match_sets;
        u16 max_scan_ie_len;
        u16 max_sched_scan_ie_len;
        u32 max_sched_scan_plans;
        u32 max_sched_scan_plan_interval;
        u32 max_sched_scan_plan_iterations;

        /* supported cipher suites (count, then array) */
        int n_cipher_suites;
        const u32 *cipher_suites;

        /* default AKM suites (count, then array) */
        int n_akm_suites;
        const u32 *akm_suites;

        /* per-interface-type AKM suites overriding the defaults above */
        const struct wiphy_iftype_akm_suites *iftype_akm_suites;
        unsigned int num_iftype_akm_suites;

        /*
         * Retry limits and thresholds. The kerneldoc documents -1 as
         * "disabled" for both thresholds although the fields are u32.
         */
        u8 retry_short;
        u8 retry_long;
        u32 frag_threshold;
        u32 rts_threshold;
        u8 coverage_class;

        /* versions for ethtool reporting */
        char fw_version[ETHTOOL_FWVERS_LEN];
        u32 hw_version;

#ifdef CONFIG_PM
        /* WoWLAN support information and current configuration */
        const struct wiphy_wowlan_support *wowlan;
        struct cfg80211_wowlan *wowlan_config;
#endif

        u16 max_remain_on_channel_duration;

        u8 max_num_pmkids;

        /* antenna bitmaps available for TX/RX configuration */
        u32 available_antennas_tx;
        u32 available_antennas_rx;

        u32 probe_resp_offload;

        /* default extended capabilities, their mask and common length */
        const u8 *extended_capabilities, *extended_capabilities_mask;
        u8 extended_capabilities_len;

        /* per-interface-type extended capabilities */
        const struct wiphy_iftype_ext_capab *iftype_ext_capab;
        unsigned int num_iftype_ext_capab;

        /* lets a driver recognize its own wiphys */
        const void *privid;

        /* band/channel information, one slot per band */
        struct ieee80211_supported_band *bands[NUM_NL80211_BANDS];

        /* the driver's regulatory notification callback */
        void (*reg_notifier)(struct wiphy *wiphy,
                             struct regulatory_request *request);

        /* fields below are read-only, assigned by cfg80211 */

        const struct ieee80211_regdomain __rcu *regd;

        /* embedded (virtual) device; see set_wiphy_dev() and wiphy_dev() */
        struct device dev;

        bool registered;

        struct dentry *debugfsdir;

        /* which HT/VHT capability values can be overridden */
        const struct ieee80211_ht_cap *ht_capa_mod_mask;
        const struct ieee80211_vht_cap *vht_capa_mod_mask;

        /* list of associated (virtual) interfaces; drivers must not modify */
        struct list_head wdev_list;

        /* network namespace; access via wiphy_net() and wiphy_net_set() */
        possible_net_t _net;

#ifdef CONFIG_CFG80211_WEXT
        const struct iw_handler_def *wext;
#endif

        const struct wiphy_coalesce_support *coalesce;

        /* vendor command/event tables and their sizes */
        const struct wiphy_vendor_command *vendor_commands;
        const struct nl80211_vendor_cmd_info *vendor_events;
        int n_vendor_commands, n_vendor_events;

        u16 max_ap_assoc_sta;

        u8 max_num_csa_counters;

        u32 bss_select_support;

        /* bitmap of &enum nl80211_band values */
        u8 nan_supported_bands;

        /* internal TX queue configuration */
        u32 txq_limit;
        u32 txq_memory_limit;
        u32 txq_quantum;

        unsigned long tx_queue_len;

        /* single-bit MBSSID support flags */
        u8 support_mbssid:1,
           support_only_he_mbssid:1;

        const struct cfg80211_pmsr_capabilities *pmsr_capa;

        /* per-TID configuration support */
        struct {
                u64 peer, vif;
                u8 max_retry;
        } tid_config_support;

        u8 max_data_retry_count;

        /* driver private data, sized by the wiphy_new() parameter */
        char priv[] __aligned(NETDEV_ALIGN);
};

/* Read the network namespace stored in @wiphy's _net field. */
static inline struct net *wiphy_net(struct wiphy *wiphy)
{
        struct net *ns = read_pnet(&wiphy->_net);

        return ns;
}

static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net)
{
        write_pnet(&wiphy->_net, net);
}

/**
 * wiphy_priv - get the driver private area of a wiphy
 *
 * @wiphy: the wiphy whose private area is wanted; must not be %NULL
 *        (a %NULL pointer triggers BUG_ON())
 *
 * Return: The address of the trailing priv[] member of @wiphy.
 */
static inline void *wiphy_priv(struct wiphy *wiphy)
{
        void *area;

        BUG_ON(!wiphy);
        area = wiphy->priv;

        return area;
}

/**
 * priv_to_wiphy - map a private area pointer back to its wiphy
 *
 * @priv: a pointer previously returned by wiphy_priv(); must not be
 *        %NULL (a %NULL pointer triggers BUG_ON())
 *
 * Return: The &struct wiphy whose priv member @priv points to.
 */
static inline struct wiphy *priv_to_wiphy(void *priv)
{
        struct wiphy *owner;

        BUG_ON(!priv);
        owner = container_of(priv, struct wiphy, priv);

        return owner;
}

/**
 * set_wiphy_dev - set device pointer for wiphy
 *
 * @wiphy: The wiphy whose device to bind
 * @dev: The device to parent it to
 */
static inline void set_wiphy_dev(struct wiphy *wiphy, struct device *dev)
{
        wiphy->dev.parent = dev;
}

/**
 * wiphy_dev - get wiphy dev pointer
 *
 * @wiphy: The wiphy whose device struct to look up
 * Return: The dev of @wiphy.
 */
static inline struct device *wiphy_dev(struct wiphy *wiphy)
{
        return wiphy->dev.parent;
}

/**
 * wiphy_name - get the name of a wiphy
 *
 * @wiphy: The wiphy whose name is wanted
 *
 * Return: The dev_name() of the wiphy's embedded device.
 */
static inline const char *wiphy_name(const struct wiphy *wiphy)
{
        const struct device *self = &wiphy->dev;

        return dev_name(self);
}

/**
 * wiphy_new_nm - create a new wiphy for use with cfg80211
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 * @requested_name: Request a particular name.
 *        NULL is a valid value, and means use the default phy%d naming.
 *
 * Create a new wiphy and associate the given operations with it.
 * @sizeof_priv bytes are allocated for private use; that area is
 * reachable through wiphy_priv().
 *
 * Return: A pointer to the new wiphy. This pointer must be
 * assigned to each netdev's ieee80211_ptr for proper operation.
 * NOTE(review): the failure return value is not documented here;
 * presumably %NULL - confirm against the implementation.
 */
struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
                           const char *requested_name);

/**
 * wiphy_new - create a new wiphy with the default name
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 *
 * Convenience wrapper around wiphy_new_nm() that requests no particular
 * name, so the default phy%d naming is used. @sizeof_priv bytes are
 * allocated for private use.
 *
 * Return: A pointer to the new wiphy. This pointer must be
 * assigned to each netdev's ieee80211_ptr for proper operation.
 */
static inline struct wiphy *wiphy_new(const struct cfg80211_ops *ops,
                                      int sizeof_priv)
{
        /* no requested name: wiphy_new_nm() falls back to phy%d */
        const char *default_name = NULL;

        return wiphy_new_nm(ops, sizeof_priv, default_name);
}

/**
 * wiphy_register - register a wiphy with cfg80211
 *
 * @wiphy: The wiphy to register.
 *
 * The members of &struct wiphy marked "assign these fields before you
 * register the wiphy" are expected to be filled in before this call.
 *
 * Return: A non-negative wiphy index or a negative error code.
 */
int wiphy_register(struct wiphy *wiphy);

/**
 * wiphy_unregister - deregister a wiphy from cfg80211
 *
 * @wiphy: The wiphy to unregister.
 *
 * After this call, no more requests can be made with this priv
 * pointer, but the call may sleep to wait for an outstanding
 * request that is being handled.
 *
 * Context: May sleep.
 */
void wiphy_unregister(struct wiphy *wiphy);

/**
 * wiphy_free - free wiphy
 *
 * @wiphy: The wiphy to free
 *
 * NOTE(review): presumably the counterpart of wiphy_new()/wiphy_new_nm(),
 * to be called once the wiphy is no longer registered - confirm against
 * the implementation.
 */
void wiphy_free(struct wiphy *wiphy);

/*
 * internal structs
 *
 * Forward declarations only: this header refers to these types solely
 * through pointers (see struct wireless_dev).
 */
struct cfg80211_conn;
struct cfg80211_internal_bss;
struct cfg80211_cached_keys;
struct cfg80211_cqm_config;

/**
 * struct wireless_dev - wireless device state
 *
 * For netdevs, this structure must be allocated by the driver
 * that uses the ieee80211_ptr field in struct net_device (this
 * is intentional so it can be allocated along with the netdev.)
 * It need not be registered then as netdev registration will
 * be intercepted by cfg80211 to see the new wireless device.
 *
 * For non-netdev uses, it must also be allocated by the driver
 * in response to the cfg80211 callbacks that require it, as
 * there's no netdev registration in that case it may not be
 * allocated outside of callback operations that return it.
 *
 * @wiphy: pointer to hardware description
 * @iftype: interface type
 * @list: (private) Used to collect the interfaces
 * @netdev: (private) Used to reference back to the netdev, may be %NULL
 * @identifier: (private) Identifier used in nl80211 to identify this
 *        wireless device if it has no netdev
 * @current_bss: (private) Used by the internal configuration code
 * @chandef: (private) Used by the internal configuration code to track
 *        the user-set channel definition.
 * @preset_chandef: (private) Used by the internal configuration code to
 *        track the channel to be used for AP later
 * @bssid: (private) Used by the internal configuration code
 * @ssid: (private) Used by the internal configuration code
 * @ssid_len: (private) Used by the internal configuration code
 * @mesh_id_len: (private) Used by the internal configuration code
 * @mesh_id_up_len: (private) Used by the internal configuration code
 * @wext: (private) Used by the internal wireless extensions compat code
 * @wext.ibss: (private) IBSS data part of wext handling
 * @wext.connect: (private) connection handling data
 * @wext.keys: (private) (WEP) key data
 * @wext.ie: (private) extra elements for association
 * @wext.ie_len: (private) length of extra elements
 * @wext.bssid: (private) selected network BSSID
 * @wext.ssid: (private) selected network SSID
 * @wext.default_key: (private) selected default key index
 * @wext.default_mgmt_key: (private) selected default management key index
 * @wext.prev_bssid: (private) previous BSSID for reassociation
 * @wext.prev_bssid_valid: (private) previous BSSID validity
 * @use_4addr: indicates 4addr mode is used on this interface, must be
 *        set by driver (if supported) on add_interface BEFORE registering the
 *        netdev and may otherwise be used by driver read-only, will be updated
 *        by cfg80211 on change_interface
 * @mgmt_registrations: list of registrations for management frames
 * @mgmt_registrations_need_update: mgmt registrations were updated,
 *        need to propagate the update to the driver
 * @mtx: mutex used to lock data in this struct, may be used by drivers
 *        and some API functions require it held
 * @beacon_interval: beacon interval used on this device for transmitting
 *        beacons, 0 when not valid
 * @address: The address for this device, valid only if @netdev is %NULL
 * @is_running: true if this is a non-netdev device that has been started, e.g.
 *        the P2P Device.
 * @cac_started: true if DFS channel availability check has been started
 * @cac_start_time: timestamp (jiffies) when the dfs state was entered.
 * @cac_time_ms: CAC time in ms
 * @ps: powersave mode is enabled
 * @ps_timeout: dynamic powersave timeout
 * @ap_unexpected_nlportid: (private) netlink port ID of application
 *        registered for unexpected class 3 frames (AP mode)
 * @conn: (private) cfg80211 software SME connection state machine data
 * @connect_keys: (private) keys to set after connection is established
 * @conn_bss_type: connecting/connected BSS type
 * @conn_owner_nlportid: (private) connection owner socket port ID
 * @disconnect_wk: (private) auto-disconnect work
 * @disconnect_bssid: (private) the BSSID to use for auto-disconnect
 * @ibss_fixed: (private) IBSS is using fixed BSSID
 * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
 * @event_list: (private) list for internal event processing
 * @event_lock: (private) lock for event list
 * @owner_nlportid: (private) owner socket port ID
 * @nl_owner_dead: (private) owner socket went away
 * @cqm_config: (private) nl80211 RSSI monitor state
 * @pmsr_list: (private) peer measurement requests
 * @pmsr_lock: (private) peer measurements requests/results lock
 * @pmsr_free_wk: (private) peer measurements cleanup work
 * @unprot_beacon_reported: (private) timestamp of last
 *        unprotected beacon report
 */
struct wireless_dev {
        /* hardware description and interface type */
        struct wiphy *wiphy;
        enum nl80211_iftype iftype;

        /* the remainder of this struct should be private to cfg80211 */
        struct list_head list;
        /* back-reference to the netdev, may be NULL */
        struct net_device *netdev;

        /* nl80211 identifier, used when there is no netdev */
        u32 identifier;

        /* management frame registrations and their "needs update" flag */
        struct list_head mgmt_registrations;
        u8 mgmt_registrations_need_update:1;

        /* locks data in this struct */
        struct mutex mtx;

        bool use_4addr, is_running;

        /* device address, valid only if @netdev is NULL */
        u8 address[ETH_ALEN] __aligned(sizeof(u16));

        /* currently used for IBSS and SME - might be rearranged later */
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len, mesh_id_len, mesh_id_up_len;
        struct cfg80211_conn *conn;
        struct cfg80211_cached_keys *connect_keys;
        enum ieee80211_bss_type conn_bss_type;
        u32 conn_owner_nlportid;

        /* auto-disconnect work and the BSSID it uses */
        struct work_struct disconnect_wk;
        u8 disconnect_bssid[ETH_ALEN];

        /* internal event processing list and its lock */
        struct list_head event_list;
        spinlock_t event_lock;

        struct cfg80211_internal_bss *current_bss; /* associated / joined */
        struct cfg80211_chan_def preset_chandef;
        struct cfg80211_chan_def chandef;

        bool ibss_fixed;
        bool ibss_dfs_possible;

        /* powersave mode and dynamic powersave timeout */
        bool ps;
        int ps_timeout;

        /* 0 when not valid */
        int beacon_interval;

        u32 ap_unexpected_nlportid;

        /* owner socket port ID and whether that socket went away */
        u32 owner_nlportid;
        bool nl_owner_dead;

        /* DFS channel availability check (CAC) state */
        bool cac_started;
        unsigned long cac_start_time;
        unsigned int cac_time_ms;

#ifdef CONFIG_CFG80211_WEXT
        /* wext data */
        struct {
                struct cfg80211_ibss_params ibss;
                struct cfg80211_connect_params connect;
                struct cfg80211_cached_keys *keys;
                const u8 *ie;
                size_t ie_len;
                u8 bssid[ETH_ALEN];
                u8 prev_bssid[ETH_ALEN];
                u8 ssid[IEEE80211_MAX_SSID_LEN];
                s8 default_key, default_mgmt_key;
                bool prev_bssid_valid;
        } wext;
#endif

        /* nl80211 RSSI monitor state */
        struct cfg80211_cqm_config *cqm_config;

        /* peer measurement requests, their lock and cleanup work */
        struct list_head pmsr_list;
        spinlock_t pmsr_lock;
        struct work_struct pmsr_free_wk;

        /* timestamp of last unprotected beacon report */
        unsigned long unprot_beacon_reported;
};

static inline u8 *wdev_address(struct wireless_dev *wdev)
{
        if (wdev->netdev)
                return wdev->netdev->dev_addr;
        return wdev->address;
}

/*
 * Report whether @wdev is running: netif_running() of the netdev when
 * one is attached, otherwise the wireless_dev's own is_running flag.
 */
static inline bool wdev_running(struct wireless_dev *wdev)
{
        struct net_device *ndev = wdev->netdev;

        if (!ndev)
                return wdev->is_running;

        return netif_running(ndev);
}

/**
 * wdev_priv - get the wiphy private area via a wireless_dev
 *
 * @wdev: The wireless device whose wiphy's priv pointer is wanted;
 *        must not be %NULL (a %NULL pointer triggers BUG_ON())
 *
 * Return: The result of wiphy_priv() on the wiphy of @wdev.
 */
static inline void *wdev_priv(struct wireless_dev *wdev)
{
        struct wiphy *hw;

        BUG_ON(!wdev);
        hw = wdev->wiphy;

        return wiphy_priv(hw);
}

/**
 * DOC: Utility functions
 *
 * cfg80211 offers a number of utility functions that can be useful.
 */

/**
 * ieee80211_channel_equal - compare two struct ieee80211_channel
 *
 * @a: 1st struct ieee80211_channel
 * @b: 2nd struct ieee80211_channel
 * Return: true if center frequency of @a == @b
 */
static inline bool
ieee80211_channel_equal(struct ieee80211_channel *a,
                        struct ieee80211_channel *b)
{
        return (a->center_freq == b->center_freq &&
                a->freq_offset == b->freq_offset);
}

/**
 * ieee80211_channel_to_khz - convert ieee80211_channel to frequency in KHz
 * @chan: struct ieee80211_channel to convert
 *
 * Return: The center frequency of @chan converted with MHZ_TO_KHZ(),
 * plus the channel's frequency offset (in KHz).
 */
static inline u32
ieee80211_channel_to_khz(const struct ieee80211_channel *chan)
{
        const u32 offset = chan->freq_offset;

        return MHZ_TO_KHZ(chan->center_freq) + offset;
}

/**
 * ieee80211_s1g_channel_width - get allowed channel width from @chan
 * @chan: channel
 *
 * Only allowed for band NL80211_BAND_S1GHZ.
 *
 * Return: The allowed channel width for this center_freq
 */
enum nl80211_chan_width
ieee80211_s1g_channel_width(const struct ieee80211_channel *chan);

/**
 * ieee80211_channel_to_freq_khz - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 *
 * This is the KHz-resolution variant; ieee80211_channel_to_frequency()
 * wraps it and reports MHz.
 *
 * Return: The corresponding frequency (in KHz), or 0 if the conversion failed.
 */
u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band);

/**
 * ieee80211_channel_to_frequency - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 * Return: The corresponding frequency (in MHz), or 0 if the conversion failed.
 */
static inline int
ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
{
        return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band));
}

/**
 * ieee80211_freq_khz_to_channel - convert frequency to channel number
 * @freq: center frequency in KHz
 *
 * This is the KHz-resolution variant; ieee80211_frequency_to_channel()
 * wraps it and takes MHz.
 *
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
int ieee80211_freq_khz_to_channel(u32 freq);

/**
 * ieee80211_frequency_to_channel - convert frequency to channel number
 * @freq: center frequency in MHz
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
static inline int
ieee80211_frequency_to_channel(int freq)
{
        return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq));
}

/**
 * ieee80211_get_channel_khz - get channel struct from wiphy for specified
 * frequency
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in KHz) of the channel
 *
 * Return: The channel struct from @wiphy at @freq.
 * NOTE(review): the result when @wiphy has no channel at @freq is not
 * documented here; presumably %NULL - confirm against the implementation.
 */
struct ieee80211_channel *
ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq);

/**
 * ieee80211_get_channel - get channel struct from wiphy for specified frequency
 *
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in MHz) of the channel
 * Return: The channel struct from @wiphy at @freq.
 */
static inline struct ieee80211_channel *
ieee80211_get_channel(struct wiphy *wiphy, int freq)
{
        return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq));
}

/**
 * cfg80211_channel_is_psc - Check if the channel is a 6 GHz PSC
 * @chan: control channel to check
 *
 * The Preferred Scanning Channels (PSC) are defined in
 * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3
 *
 * Return: true if @chan is in the 6 GHz band and its channel number
 * modulo 16 equals 5, false otherwise.
 */
static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
{
        /* the channel number is only looked up for 6 GHz channels */
        return chan->band == NL80211_BAND_6GHZ &&
               ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5;
}

/**
 * ieee80211_get_response_rate - get basic rate for a given rate
 *
 * @sband: the band to look for rates in
 * @basic_rates: bitmap of basic rates; for this function the bits are
 *        indices of rates in the band's bitrate table
 * @bitrate: the bitrate for which to find the basic rate
 *
 * Return: The basic rate corresponding to a given bitrate, that
 * is the next lower bitrate contained in the basic rate map,
 * which is, for this function, given as a bitmap of indices of
 * rates in the band's bitrate table.
 */
struct ieee80211_rate *
ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
                            u32 basic_rates, int bitrate);

/**
 * ieee80211_mandatory_rates - get mandatory rates for a given band
 * @sband: the band to look for rates in
 * @scan_width: width of the control channel
 *
 * Return: A bitmap of the mandatory rates for the given band; bits are
 * set according to the rate position in the bitrates array.
 */
u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
                              enum nl80211_bss_scan_width scan_width);

/*
 * Radiotap parsing functions -- for controlled injection support
 *
 * Implemented in net/wireless/radiotap.c
 * Documentation in Documentation/networking/radiotap-headers.rst
 */

/*
 * Alignment and size packed into one byte, 4 bits each.
 * NOTE(review): presumably describes one radiotap field (argument) -
 * confirm against net/wireless/radiotap.c.
 */
struct radiotap_align_size {
        uint8_t align:4, size:4;
};

/*
 * Description of one radiotap namespace.
 * NOTE(review): field meanings below are inferred from names only -
 * confirm against net/wireless/radiotap.c.
 */
struct ieee80211_radiotap_namespace {
        /* presumably a table of @n_bits align/size entries */
        const struct radiotap_align_size *align_size;
        int n_bits;
        /* presumably the vendor OUI and sub-namespace identifying this entry */
        uint32_t oui;
        uint8_t subns;
};

/*
 * Set of vendor namespace definitions, as passed to
 * ieee80211_radiotap_iterator_init().
 */
struct ieee80211_radiotap_vendor_namespaces {
        /* presumably an array of @n_ns namespace descriptions - confirm */
        const struct ieee80211_radiotap_namespace *ns;
        int n_ns;
};

/**
 * struct ieee80211_radiotap_iterator - tracks walk through present radiotap args
 * @this_arg_index: index of current arg, valid after each successful call
 *        to ieee80211_radiotap_iterator_next()
 * @this_arg: pointer to current radiotap arg; it is valid after each
 *        call to ieee80211_radiotap_iterator_next() but also after
 *        ieee80211_radiotap_iterator_init() where it will point to
 *        the beginning of the actual data portion
 * @this_arg_size: length of the current arg, for convenience
 * @current_namespace: pointer to the current namespace definition
 *        (or internally %NULL if the current namespace is unknown)
 * @is_radiotap_ns: indicates whether the current namespace is the default
 *        radiotap namespace or not
 *
 * @_rtheader: pointer to the radiotap header we are walking through
 * @_max_length: length of radiotap header in cpu byte ordering
 * @_arg_index: next argument index
 * @_arg: next argument pointer
 * @_next_bitmap: internal pointer to next present u32
 * @_bitmap_shifter: internal shifter for curr u32 bitmap, b0 set == arg present
 * @_vns: vendor namespace definitions
 * @_next_ns_data: beginning of the next namespace's data
 * @_reset_on_ext: internal; reset the arg index to 0 when going to the
 *        next bitmap word
 *
 * Describes the radiotap parser state. Fields prefixed with an underscore
 * must not be used by users of the parser, only by the parser internally.
 */

struct ieee80211_radiotap_iterator {
        /* parser-internal: header being walked and namespace tables */
        struct ieee80211_radiotap_header *_rtheader;
        const struct ieee80211_radiotap_vendor_namespaces *_vns;
        const struct ieee80211_radiotap_namespace *current_namespace;

        /* parser-internal cursors */
        unsigned char *_arg, *_next_ns_data;
        __le32 *_next_bitmap;

        /* current argument, for users of the parser */
        unsigned char *this_arg;
        int this_arg_index;
        int this_arg_size;

        int is_radiotap_ns;

        /* parser-internal bookkeeping */
        int _max_length;
        int _arg_index;
        uint32_t _bitmap_shifter;
        int _reset_on_ext;
};

int
ieee80211_radiotap_iterator_init(struct ieee80211_radiotap_iterator *iterator,
                                 struct ieee80211_radiotap_header *radiotap_header,
                                 int max_length,
                                 const struct ieee80211_radiotap_vendor_namespaces *vns);

int
ieee80211_radiotap_iterator_next(struct ieee80211_radiotap_iterator *iterator);


extern const unsigned char rfc1042_header[6];
extern const unsigned char bridge_tunnel_header[6];

/**
 * ieee80211_get_hdrlen_from_skb - get header length from data
 *
 * @skb: the frame
 *
 * Given an skb with a raw 802.11 header at the data pointer this function
 * returns the 802.11 header length.
 *
 * Return: The 802.11 header length in bytes (not including encryption
 * headers). Or 0 if the data in the sk_buff is too short to contain a valid
 * 802.11 header.
 */
unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb);

/**
 * ieee80211_hdrlen - get header length in bytes from frame control
 * @fc: frame control field in little-endian format
 * Return: The header length in bytes.
 */
unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc);

/**
 * ieee80211_get_mesh_hdrlen - get mesh extension header length
 * @meshhdr: the mesh extension header, only the flags field
 *        (first byte) will be accessed
 * Return: The length of the extension header, which is always at
 * least 6 bytes and at most 18 if address 5 and 6 are present.
 */
unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr);

/**
 * DOC: Data path helpers
 *
 * In addition to generic utilities, cfg80211 also offers
 * functions that help implement the data path for devices
 * that do not do the 802.11/802.3 conversion on the device.
 */

/**
 * ieee80211_data_to_8023_exthdr - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @ehdr: pointer to a &struct ethhdr that will get the header, instead
 *        of it being pushed into the SKB
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 * @data_offset: offset of payload after the 802.11 header
 * @is_amsdu: presumably whether @skb is to be handled as (part of) an
 *        A-MSDU; NOTE(review): inferred from the parameter name only,
 *        confirm against the implementation
 * Return: 0 on success. Non-zero on error.
 */
int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
                                  const u8 *addr, enum nl80211_iftype iftype,
                                  u8 data_offset, bool is_amsdu);

/**
 * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 *
 * Convenience wrapper around ieee80211_data_to_8023_exthdr() that passes
 * no external &struct ethhdr (%NULL), a data offset of 0 and
 * is_amsdu = false.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
                                         enum nl80211_iftype iftype)
{
        struct ethhdr *no_ehdr = NULL;
        const u8 no_offset = 0;

        return ieee80211_data_to_8023_exthdr(skb, no_ehdr, addr, iftype,
                                             no_offset, false);
}

/**
 * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame
 *
 * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames.
 * The @list will be empty if the decode fails. The @skb must be fully
 * header-less before being passed in here; it is freed in this function.
 *
 * @skb: The input A-MSDU frame without any headers.
 * @list: The output list of 802.3 frames. It must be allocated and
 *        initialized by the caller.
 * @addr: The device MAC address.
 * @iftype: The device interface type.
 * @extra_headroom: The hardware extra headroom for SKBs in the @list.
 * @check_da: DA to check in the inner ethernet header, or NULL
 * @check_sa: SA to check in the inner ethernet header, or NULL
 */
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
                              const u8 *addr, enum nl80211_iftype iftype,
                              const unsigned int extra_headroom,
                              const u8 *check_da, const u8 *check_sa);

/**
 * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame
 * @skb: the data frame
 * @qos_map: Interworking QoS mapping or %NULL if not in use
 * Return: The 802.1p/1d tag.
 */
unsigned int cfg80211_classify8021d(struct sk_buff *skb,
                                    struct cfg80211_qos_map *qos_map);

/**
 * cfg80211_find_elem_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE data where the byte array should match.
 *        Note the difference to cfg80211_find_ie_match() which considers
 *        the offset to start from the element ID byte, but here we take
 *        the data portion instead.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the byte array doesn't match; otherwise return the
 * requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
const struct element *
cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len,
                         const u8 *match, unsigned int match_len,
                         unsigned int match_offset);

/**
 * cfg80211_find_ie_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE where the byte array should match,
 *        counted from the element ID byte. Must be zero when @match_len
 *        is zero; otherwise it must be 2 or more, because the first byte
 *        is the element id (already compared to @eid) and the second
 *        byte is the IE length.
 *
 * Wrapper around cfg80211_find_elem_match(): @match_offset is rebased to
 * the element data by subtracting the two header bytes, and the result is
 * handed back as a raw byte pointer.
 *
 * Return: %NULL if the element ID could not be found, if the element is
 * invalid (claims to be longer than the given data), if the byte array
 * doesn't match, or if the @match_len / @match_offset combination is
 * invalid (a warning is raised as well in that case); otherwise a pointer
 * to the first byte of the requested element, that is the byte containing
 * the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
static inline const u8 *
cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len,
                       const u8 *match, unsigned int match_len,
                       unsigned int match_offset)
{
        const struct element *elem;
        unsigned int data_offset = 0;

        /* With a pattern, the offset must skip the ID and length bytes;
         * without a pattern, only an offset of zero is accepted.
         */
        if (WARN_ON(match_len ? match_offset < 2 : match_offset != 0))
                return NULL;

        /* Rebase from "start of element" to "start of element data". */
        if (match_offset)
                data_offset = match_offset - 2;

        elem = cfg80211_find_elem_match(eid, ies, len, match, match_len,
                                        data_offset);

        return (void *)elem;
}

/**
 * cfg80211_find_elem - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data; it is converted to unsigned int for
 *        cfg80211_find_elem_match(), so it must not be negative
 *
 * Looks the element up by ID only: cfg80211_find_elem_match() is called
 * with no byte array (%NULL, length 0, offset 0).
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data); otherwise return the requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_elem(u8 eid, const u8 *ies, int len)
{
        const u8 *no_pattern = NULL;

        return cfg80211_find_elem_match(eid, ies, len, no_pattern, 0, 0);
}

/**
 * cfg80211_find_ie - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data; it is converted to unsigned int for
 *        cfg80211_find_ie_match(), so it must not be negative
 *
 * Looks the element up by ID only: cfg80211_find_ie_match() is called
 * with no byte array (%NULL, length 0, offset 0).
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
{
        const u8 *no_pattern = NULL;

        return cfg80211_find_ie_match(eid, ies, len, no_pattern, 0, 0);
}

/**
 * cfg80211_find_ext_elem - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Searches for a %WLAN_EID_EXTENSION element whose first data byte equals
 * @ext_eid, by handing @ext_eid to cfg80211_find_elem_match() as a
 * one-byte pattern at data offset 0.
 *
 * Return: %NULL if the extended element could not be found or if
 * the element is invalid (claims to be longer than the given
 * data); otherwise return the requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_ext_elem(u8 ext_eid, const u8 *ies, int len)
{
        const u8 *pattern = &ext_eid;

        return cfg80211_find_elem_match(WLAN_EID_EXTENSION, ies, len,
                                        pattern, 1, 0);
}

/**
 * cfg80211_find_ext_ie - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Searches for a %WLAN_EID_EXTENSION element by handing @ext_eid to
 * cfg80211_find_ie_match() as a one-byte pattern at offset 2 from the
 * start of the element, i.e. right after the ID and length bytes.
 *
 * Return: %NULL if the extended element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
{
        const u8 *pattern = &ext_eid;

        return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
                                      pattern, 1, 2);
}

/**
 * cfg80211_find_vendor_elem - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data); otherwise
 * return the element structure for the requested element.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type,
                                                const u8 *ies,
                                                unsigned int len);

/**
 * cfg80211_find_vendor_ie - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Byte-pointer flavour of cfg80211_find_vendor_elem(): the element found
 * there is returned as a pointer to its first byte.
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data), or a pointer to
 * the first byte of the requested element, that is the byte containing the
 * element ID.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
static inline const u8 *
cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
                        const u8 *ies, unsigned int len)
{
        const struct element *elem;

        elem = cfg80211_find_vendor_elem(oui, oui_type, ies, len);

        return (void *)elem;
}

/**
 * cfg80211_send_layer2_update - send layer 2 update frame
 *
 * @dev: network device
 * @addr: STA MAC address
 *
 * Wireless drivers can use this function to update forwarding tables in bridge
 * devices upon STA association.
 */
void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr);

/**
 * DOC: Regulatory enforcement infrastructure
 *
 * TODO
 */

/**
 * regulatory_hint - driver hint to the wireless core a regulatory domain
 * @wiphy: the wireless device giving the hint (used only for reporting
 *        conflicts)
 * @alpha2: the ISO/IEC 3166 alpha2 the driver claims its regulatory domain
 *        should be in. If @rd is set this should be NULL. Note that if you
 *        set this to NULL you should still set rd->alpha2 to some accepted
 *        alpha2.
 *
 * Wireless drivers can use this function to hint to the wireless core
 * what it believes should be the current regulatory domain by
 * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory
 * domain should be in or by providing a completely built regulatory domain.
 * If the driver provides an ISO/IEC 3166 alpha2 userspace will be queried
 * for a regulatory domain structure for the respective country.
 *
 * The wiphy must have been registered to cfg80211 prior to this call.
 * For cfg80211 drivers this means you must first use wiphy_register(),
 * for mac80211 drivers you must first use ieee80211_register_hw().
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENOMEM.
 *
 * Return: 0 on success. -ENOMEM.
 */
int regulatory_hint(struct wiphy *wiphy, const char *alpha2);

/**
 * regulatory_set_wiphy_regd - set regdom info for self managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * Set the regulatory domain information for self-managed wiphys, only they
 * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more
 * information.
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd(struct wiphy *wiphy,
                              struct ieee80211_regdomain *rd);

/**
 * regulatory_set_wiphy_regd_sync_rtnl - set regdom for self-managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * This function requires the RTNL to be held and applies the new regdomain
 * synchronously to this wiphy. For more details see
 * regulatory_set_wiphy_regd().
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy,
                                        struct ieee80211_regdomain *rd);

/**
 * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @regd: the custom regulatory domain to use for this wiphy
 *
 * Drivers can sometimes have custom regulatory domains which do not apply
 * to a specific country. Drivers can use this to apply such custom regulatory
 * domains. This routine must be called prior to wiphy registration. The
 * custom regulatory domain will be trusted completely and as such previous
 * default channel settings will be disregarded. If no rule is found for a
 * channel on the regulatory domain the channel will be disabled.
 * Drivers using this for a wiphy should also set the wiphy flag
 * REGULATORY_CUSTOM_REG or cfg80211 will set it for the wiphy
 * that called this helper.
 */
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
                                   const struct ieee80211_regdomain *regd);

/**
 * freq_reg_info - get regulatory information for the given frequency
 * @wiphy: the wiphy for which we want to process this rule for
 * @center_freq: Frequency in KHz for which we want regulatory information for
 *
 * Use this function to get the regulatory rule for a specific frequency on
 * a given wireless device. If the device has a specific regulatory domain
 * it wants to follow we respect that unless a country IE has been received
 * and processed already.
 *
 * Return: A valid pointer, or, when an error occurs, for example if no rule
 * can be found, the return value is encoded using ERR_PTR(). Use IS_ERR() to
 * check and PTR_ERR() to obtain the numeric return value. The numeric return
 * value will be -ERANGE if we determine the given center_freq does not even
 * have a regulatory rule for a frequency range in the center_freq's band.
 * See freq_in_rule_band() for our current definition of a band -- this is
 * purely subjective and right now it's 802.11 specific.
 */
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
                                               u32 center_freq);

/**
 * reg_initiator_name - map regulatory request initiator enum to name
 * @initiator: the regulatory request initiator
 *
 * You can use this to map the regulatory request initiator enum to a
 * proper string representation.
 */
const char *reg_initiator_name(enum nl80211_reg_initiator initiator);

/**
 * regulatory_pre_cac_allowed - check if pre-CAC allowed in the current regdom
 * @wiphy: wiphy for which pre-CAC capability is checked.
 *
 * Pre-CAC is allowed only in some regdomains (notably ETSI).
 */
bool regulatory_pre_cac_allowed(struct wiphy *wiphy);

/**
 * DOC: Internal regulatory db functions
 *
 */

/**
 * reg_query_regdb_wmm -  Query internal regulatory db for wmm rule
 * Regulatory self-managed driver can use it to proactively
 *
 * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried.
 * @freq: the frequency (in MHz) to be queried.
 * @rule: pointer to store the wmm rule from the regulatory db.
 *
 * Self-managed wireless drivers can use this function to  query
 * the internal regulatory database to check whether the given
 * ISO/IEC 3166 alpha2 country and freq have wmm rule limitations.
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENODATA.
 *
 * Return: 0 on success. -ENODATA.
 */
int reg_query_regdb_wmm(char *alpha2, int freq,
                        struct ieee80211_reg_rule *rule);

/*
 * callbacks for asynchronous cfg80211 methods, notification
 * functions and BSS handling helpers
 */

/**
 * cfg80211_scan_done - notify that scan finished
 *
 * @request: the corresponding scan request
 * @info: information about the completed scan
 */
void cfg80211_scan_done(struct cfg80211_scan_request *request,
                        struct cfg80211_scan_info *info);

/**
 * cfg80211_sched_scan_results - notify that new scan results are available
 *
 * @wiphy: the wiphy which got scheduled scan results
 * @reqid: identifier for the related scheduled scan request
 */
void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 */
void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped_rtnl - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 * This function should be called with rtnl locked.
 */
void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_inform_bss_frame_data - inform cfg80211 of a received BSS frame
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @mgmt: the management frame (probe response or beacon)
 * @len: length of the management frame
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
                               struct cfg80211_inform_bss *data,
                               struct ieee80211_mgmt *mgmt, size_t len,
                               gfp_t gfp);

/*
 * Wrapper around cfg80211_inform_bss_frame_data(): packs @rx_channel,
 * @scan_width and @signal into a struct cfg80211_inform_bss (all other
 * members zero-initialized) and forwards the frame with it.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
                                struct ieee80211_channel *rx_channel,
                                enum nl80211_bss_scan_width scan_width,
                                struct ieee80211_mgmt *mgmt, size_t len,
                                s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss bss_meta = {
                .signal = signal,
                .scan_width = scan_width,
                .chan = rx_channel,
        };

        return cfg80211_inform_bss_frame_data(wiphy, &bss_meta, mgmt, len,
                                              gfp);
}

/*
 * Same as cfg80211_inform_bss_width_frame() with the scan width fixed to
 * NL80211_BSS_CHAN_WIDTH_20.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame(struct wiphy *wiphy,
                          struct ieee80211_channel *rx_channel,
                          struct ieee80211_mgmt *mgmt, size_t len,
                          s32 signal, gfp_t gfp)
{
        return cfg80211_inform_bss_width_frame(wiphy, rx_channel,
                                               NL80211_BSS_CHAN_WIDTH_20,
                                               mgmt, len, signal, gfp);
}

/**
 * cfg80211_gen_new_bssid - generate a nontransmitted BSSID for multi-BSSID
 * @bssid: transmitter BSSID
 * @max_bssid: max BSSID indicator, taken from Multiple BSSID element
 * @mbssid_index: BSSID index, taken from Multiple BSSID index element
 * @new_bssid: calculated nontransmitted BSSID
 *
 * The low @max_bssid bits of the transmitter BSSID are replaced by
 * (those bits + @mbssid_index) modulo 2^@max_bssid; all higher bits are
 * copied unchanged.
 *
 * A @max_bssid of 0 selects no bits, so @new_bssid equals @bssid; a value
 * of 64 or more selects every bit of the 64-bit intermediate.
 */
static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
                                          u8 mbssid_index, u8 *new_bssid)
{
        u64 bssid_u64 = ether_addr_to_u64(bssid);
        u64 mask = 0;
        u64 new_bssid_u64;

        /* GENMASK_ULL(max_bssid - 1, 0) shifts by an out-of-range count
         * (undefined behaviour) when max_bssid is 0 or above 64, and the
         * value may come straight from a received element, so handle
         * those cases explicitly.
         */
        if (max_bssid >= 64)
                mask = ~0ULL;
        else if (max_bssid)
                mask = GENMASK_ULL(max_bssid - 1, 0);

        /* keep the bits above the mask from the transmitter BSSID */
        new_bssid_u64 = bssid_u64 & ~mask;

        /* add the index within the masked bits, wrapping inside the mask */
        new_bssid_u64 |= ((bssid_u64 & mask) + mbssid_index) & mask;

        u64_to_ether_addr(new_bssid_u64, new_bssid);
}

/**
 * cfg80211_is_element_inherited - returns if element ID should be inherited
 * @element: element to check
 * @non_inherit_element: non inheritance element
 */
bool cfg80211_is_element_inherited(const struct element *element,
                                   const struct element *non_inherit_element);

/**
 * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs
 * @ie: ies
 * @ielen: length of IEs
 * @mbssid_elem: current MBSSID element
 * @sub_elem: current MBSSID subelement (profile)
 * @merged_ie: location of the merged profile
 * @max_copy_len: max merged profile length
 */
size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
                              const struct element *mbssid_elem,
                              const struct element *sub_elem,
                              u8 *merged_ie, size_t max_copy_len);

/**
 * enum cfg80211_bss_frame_type - frame type that the BSS data came from
 * @CFG80211_BSS_FTYPE_UNKNOWN: the driver cannot tell whether the data
 *        came from a beacon or from a probe response
 * @CFG80211_BSS_FTYPE_BEACON: the data was taken from a beacon
 * @CFG80211_BSS_FTYPE_PRESP: the data was taken from a probe response
 */
enum cfg80211_bss_frame_type {
        CFG80211_BSS_FTYPE_UNKNOWN = 0,
        CFG80211_BSS_FTYPE_BEACON = 1,
        CFG80211_BSS_FTYPE_PRESP = 2,
};

/**
 * cfg80211_inform_bss_data - inform cfg80211 of a new BSS
 *
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @ftype: frame type (if known)
 * @bssid: the BSSID of the BSS
 * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
 * @capability: the capability field sent by the peer
 * @beacon_interval: the beacon interval announced by the peer
 * @ie: additional IEs sent by the peer
 * @ielen: length of the additional IEs
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_data(struct wiphy *wiphy,
                         struct cfg80211_inform_bss *data,
                         enum cfg80211_bss_frame_type ftype,
                         const u8 *bssid, u64 tsf, u16 capability,
                         u16 beacon_interval, const u8 *ie, size_t ielen,
                         gfp_t gfp);

/*
 * Wrapper around cfg80211_inform_bss_data(): packs @rx_channel,
 * @scan_width and @signal into a struct cfg80211_inform_bss (all other
 * members zero-initialized) and forwards the remaining arguments as is.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss_width(struct wiphy *wiphy,
                          struct ieee80211_channel *rx_channel,
                          enum nl80211_bss_scan_width scan_width,
                          enum cfg80211_bss_frame_type ftype,
                          const u8 *bssid, u64 tsf, u16 capability,
                          u16 beacon_interval, const u8 *ie, size_t ielen,
                          s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss bss_meta = {
                .signal = signal,
                .scan_width = scan_width,
                .chan = rx_channel,
        };

        return cfg80211_inform_bss_data(wiphy, &bss_meta, ftype, bssid,
                                        tsf, capability, beacon_interval,
                                        ie, ielen, gfp);
}

/*
 * Same as cfg80211_inform_bss_width() with the scan width fixed to
 * NL80211_BSS_CHAN_WIDTH_20.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss(struct wiphy *wiphy,
                    struct ieee80211_channel *rx_channel,
                    enum cfg80211_bss_frame_type ftype,
                    const u8 *bssid, u64 tsf, u16 capability,
                    u16 beacon_interval, const u8 *ie, size_t ielen,
                    s32 signal, gfp_t gfp)
{
        return cfg80211_inform_bss_width(wiphy, rx_channel,
                                         NL80211_BSS_CHAN_WIDTH_20, ftype,
                                         bssid, tsf, capability,
                                         beacon_interval, ie, ielen,
                                         signal, gfp);
}

/**
 * cfg80211_get_bss - get a BSS reference
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @bssid: the desired BSSID (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 * @bss_type: type of BSS, see &enum ieee80211_bss_type
 * @privacy: privacy filter, see &enum ieee80211_privacy
 */
struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
                                      struct ieee80211_channel *channel,
                                      const u8 *bssid,
                                      const u8 *ssid, size_t ssid_len,
                                      enum ieee80211_bss_type bss_type,
                                      enum ieee80211_privacy privacy);
/*
 * Look up an IBSS by channel and SSID: calls cfg80211_get_bss() with no
 * BSSID (%NULL), BSS type IEEE80211_BSS_TYPE_IBSS and privacy
 * IEEE80211_PRIVACY_ANY.
 */
static inline struct cfg80211_bss *
cfg80211_get_ibss(struct wiphy *wiphy,
                  struct ieee80211_channel *channel,
                  const u8 *ssid, size_t ssid_len)
{
        const u8 *any_bssid = NULL;

        return cfg80211_get_bss(wiphy, channel, any_bssid, ssid, ssid_len,
                                IEEE80211_BSS_TYPE_IBSS,
                                IEEE80211_PRIVACY_ANY);
}

/**
 * cfg80211_ref_bss - reference BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct to reference
 *
 * Increments the refcount of the given BSS struct.
 */
void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_put_bss - unref BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct
 *
 * Decrements the refcount of the given BSS struct.
 */
void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_unlink_bss - unlink BSS from internal data structures
 * @wiphy: the wiphy
 * @bss: the bss to remove
 *
 * This function removes the given BSS from the internal data structures
 * thereby making it no longer show up in scan results etc. Use this
 * function when you detect a BSS is gone. Normally BSSes will also time
 * out, so it is not necessary to use this function at all.
 */
void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_bss_iter - iterate all BSS entries
 *
 * This function iterates over the BSS entries associated with the given wiphy
 * and calls the callback for the iterated BSS. The iterator function is not
 * allowed to call functions that might modify the internal state of the BSS DB.
 *
 * @wiphy: the wiphy
 * @chandef: if given, the iterator function will be called only if the channel
 *     of the currently iterated BSS is a subset of the given channel.
 * @iter: the iterator function to call
 * @iter_data: an argument to the iterator function
 */
void cfg80211_bss_iter(struct wiphy *wiphy,
                       struct cfg80211_chan_def *chandef,
                       void (*iter)(struct wiphy *wiphy,
                                    struct cfg80211_bss *bss,
                                    void *data),
                       void *iter_data);

/*
 * Map a channel definition's width to a BSS scan width: the 5 and 10
 * widths map to their NL80211_BSS_CHAN_WIDTH_* counterparts, every other
 * width maps to NL80211_BSS_CHAN_WIDTH_20.
 */
static inline enum nl80211_bss_scan_width
cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef)
{
        if (chandef->width == NL80211_CHAN_WIDTH_5)
                return NL80211_BSS_CHAN_WIDTH_5;

        if (chandef->width == NL80211_CHAN_WIDTH_10)
                return NL80211_BSS_CHAN_WIDTH_10;

        return NL80211_BSS_CHAN_WIDTH_20;
}

/**
 * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
 * @dev: network device
 * @buf: authentication frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever an authentication, disassociation or
 * deauthentication frame has been received and processed in station mode.
 * After being asked to authenticate via cfg80211_ops::auth() the driver must
 * call either this function or cfg80211_auth_timeout().
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_auth_timeout().
 * While connected, the driver must call this for received and processed
 * disassociation and deauthentication frames. If the frame couldn't be used
 * because it was unprotected, the driver must call the function
 * cfg80211_rx_unprot_mlme_mgmt() instead.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);

/**
 * cfg80211_auth_timeout - notification of timed out authentication
 * @dev: network device
 * @addr: The MAC address of the device with which the authentication timed out
 *
 * This function may sleep. The caller must hold the corresponding wdev's
 * mutex.
 */
void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);

/**
 * cfg80211_rx_assoc_resp - notification of processed association response
 * @dev: network device
 * @bss: the BSS that association was requested with, ownership of the pointer
 *        moves to cfg80211 in this call
 * @buf: (Re)Association Response frame (header + body)
 * @len: length of the frame data
 * @uapsd_queues: bitmap of queues configured for uapsd. Same format
 *        as the AC bitmap in the QoS info field
 * @req_ies: information elements from the (Re)Association Request frame
 * @req_ies_len: length of req_ies data
 *
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_assoc_timeout().
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_assoc_resp(struct net_device *dev,
                            struct cfg80211_bss *bss,
                            const u8 *buf, size_t len,
                            int uapsd_queues,
                            const u8 *req_ies, size_t req_ies_len);

/**
 * cfg80211_assoc_timeout - notification of timed out association
 * @dev: network device
 * @bss: The BSS entry with which association timed out.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss);

/**
 * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt
 * @dev: network device
 * @bss: The BSS entry with which association was abandoned.
 *
 * Call this whenever an association attempt was abandoned for reasons
 * reported through other API, like deauth RX.
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss);

/**
 * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
 * @dev: network device
 * @buf: 802.11 frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever deauthentication has been processed in
 * station mode. This includes both received deauthentication frames and
 * locally generated ones. This function may sleep. The caller must hold the
 * corresponding wdev's mutex.
 */
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);

/**
 * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame
 * @dev: network device
 * @buf: received management frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever a received deauthentication or disassoc
 * frame has been dropped in station mode because of MFP being used but the
 * frame was not protected. This is also used to notify reception of a Beacon
 * frame that was dropped because it did not include a valid MME MIC while
 * beacon protection was enabled (BIGTK configured in station mode).
 *
 * This function may sleep.
 */
void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev,
                                  const u8 *buf, size_t len);

/**
 * cfg80211_michael_mic_failure - notification of Michael MIC failure (TKIP)
 * @dev: network device
 * @addr: The source MAC address of the frame
 * @key_type: The key type that the received frame used
 * @key_id: Key identifier (0..3). Can be -1 if missing.
 * @tsc: The TSC value of the frame that generated the MIC failure (6 octets)
 * @gfp: allocation flags
 *
 * This function is called whenever the local MAC detects a MIC failure in a
 * received frame. This matches with MLME-MICHAELMICFAILURE.indication()
 * primitive.
 */
void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
                                  enum nl80211_key_type key_type, int key_id,
                                  const u8 *tsc, gfp_t gfp);

/**
 * cfg80211_ibss_joined - notify cfg80211 that device joined an IBSS
 *
 * @dev: network device
 * @bssid: the BSSID of the IBSS joined
 * @channel: the channel of the IBSS joined
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the device joined an IBSS or
 * switched to a different BSSID. Before this function can be called,
 * either a beacon has to have been received from the IBSS, or one of
 * the cfg80211_inform_bss{,_frame} functions must have been called
 * with the locally generated beacon -- this guarantees that there is
 * always a scan result for this IBSS. cfg80211 will handle the rest.
 */
void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                          struct ieee80211_channel *channel, gfp_t gfp);

/**
 * cfg80211_notify_new_peer_candidate - notify cfg80211 of a new mesh peer
 *                                         candidate
 *
 * @dev: network device
 * @macaddr: the MAC address of the new candidate
 * @ie: information elements advertised by the peer candidate
 * @ie_len: length of the information elements buffer
 * @sig_dbm: signal level in dBm
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the mesh peer candidate has been
 * detected, most likely via a beacon or, less likely, via a probe response.
 * cfg80211 then sends a notification to userspace.
 */
void cfg80211_notify_new_peer_candidate(struct net_device *dev,
                const u8 *macaddr, const u8 *ie, u8 ie_len,
                int sig_dbm, gfp_t gfp);

/**
 * DOC: RFkill integration
 *
 * RFkill integration in cfg80211 is almost invisible to drivers,
 * as cfg80211 automatically registers an rfkill instance for each
 * wireless device it knows about. Soft kill is also translated
 * into disconnecting and turning all interfaces off, drivers are
 * expected to turn off the device when all interfaces are down.
 *
 * However, devices may have a hard RFkill line, in which case they
 * also need to interact with the rfkill subsystem, via cfg80211.
 * They can do this with a few helper functions documented here.
 */

/**
 * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state
 * @wiphy: the wiphy
 * @blocked: block status
 */
void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked);

/**
 * wiphy_rfkill_start_polling - start polling rfkill
 * @wiphy: the wiphy
 */
void wiphy_rfkill_start_polling(struct wiphy *wiphy);

/**
 * wiphy_rfkill_stop_polling - stop polling rfkill
 * @wiphy: the wiphy
 */
void wiphy_rfkill_stop_polling(struct wiphy *wiphy);

/**
 * DOC: Vendor commands
 *
 * Occasionally, there are special protocol or firmware features that
 * can't be implemented very openly. For this and similar cases, the
 * vendor command functionality allows implementing the features with
 * (typically closed-source) userspace and firmware, using nl80211 as
 * the configuration mechanism.
 *
 * A driver supporting vendor commands must register them as an array
 * in struct wiphy, with handlers for each one, each command has an
 * OUI and sub command ID to identify it.
 *
 * Note that this feature should not be (ab)used to implement protocol
 * features that could openly be shared across drivers. In particular,
 * it must never be required to use vendor commands to implement any
 * "normal" functionality that higher-level userspace like connection
 * managers etc. need.
 */

struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           int approxlen);

struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy,
                                           struct wireless_dev *wdev,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           unsigned int portid,
                                           int vendor_event_idx,
                                           int approxlen, gfp_t gfp);

void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp);

/**
 * cfg80211_vendor_cmd_alloc_reply_skb - allocate vendor command reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * This function allocates and pre-fills an skb for a reply to
 * a vendor command. Since it is intended for a reply, calling
 * it outside of a vendor command's doit() operation is invalid.
 *
 * The returned skb is pre-filled with some identifying data in
 * a way that any data that is put into the skb (with skb_put(),
 * nla_put() or similar) will end up being within the
 * %NL80211_ATTR_VENDOR_DATA attribute, so all that needs to be done
 * with the skb is adding data for the corresponding userspace tool
 * which can then read that data out of the vendor data attribute.
 * You must not modify the skb in any other way.
 *
 * When done, call cfg80211_vendor_cmd_reply() with the skb and return
 * its error code as the result of the doit() operation.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_cmd_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        struct sk_buff *reply;

        /* Data the caller adds later ends up in NL80211_ATTR_VENDOR_DATA. */
        reply = __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA,
                                           approxlen);
        return reply;
}

/**
 * cfg80211_vendor_cmd_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_vendor_cmd_alloc_reply_skb()
 *
 * Since calling this function will usually be the last thing
 * before returning from the vendor command doit() you should
 * return the error code.  Note that this function consumes the
 * skb regardless of the return value.
 *
 * Return: An error code or 0 on success.
 */
int cfg80211_vendor_cmd_reply(struct sk_buff *skb);

/**
 * cfg80211_vendor_cmd_get_sender - get the netlink port ID of the sender
 * @wiphy: the wiphy
 *
 * Return the current netlink port ID in a vendor command handler.
 * Valid to call only there.
 */
unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy);

/**
 * cfg80211_vendor_event_alloc - allocate vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event on the
 * vendor-specific multicast group.
 *
 * If wdev != NULL, both the ifindex and identifier of the specified
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * When done filling the skb, call cfg80211_vendor_event() with the
 * skb to send the event.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc(struct wiphy *wiphy, struct wireless_dev *wdev,
                             int approxlen, int event_idx, gfp_t gfp)
{
        /*
         * No receiver port is given here (contrast the _ucast variant);
         * note the helper takes event_idx before approxlen.
         */
        const unsigned int portid = 0;
        struct sk_buff *event;

        event = __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA,
                                           portid, event_idx, approxlen, gfp);
        return event;
}

/**
 * cfg80211_vendor_event_alloc_ucast - alloc unicast vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @portid: port ID of the receiver
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event to send to
 * a specific (userland) socket. This socket would previously have been
 * obtained by cfg80211_vendor_cmd_get_sender(), and the caller MUST take
 * care to register a netlink notifier to see when the socket closes.
 *
 * If wdev != NULL, both the ifindex and identifier of the specified
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * When done filling the skb, call cfg80211_vendor_event() with the
 * skb to send the event.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc_ucast(struct wiphy *wiphy,
                                  struct wireless_dev *wdev,
                                  unsigned int portid, int approxlen,
                                  int event_idx, gfp_t gfp)
{
        struct sk_buff *event;

        /* The helper takes event_idx before approxlen, unlike this wrapper. */
        event = __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA,
                                           portid, event_idx, approxlen, gfp);
        return event;
}

/**
 * cfg80211_vendor_event - send the event
 * @skb: The skb, must have been allocated with cfg80211_vendor_event_alloc()
 *        or cfg80211_vendor_event_alloc_ucast()
 * @gfp: allocation flags
 *
 * This function sends the given @skb, which must have been allocated
 * by cfg80211_vendor_event_alloc() or cfg80211_vendor_event_alloc_ucast(),
 * as an event. It always consumes it.
 */
static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp)
{
        __cfg80211_send_event_skb(skb, gfp);
}

#ifdef CONFIG_NL80211_TESTMODE
/**
 * DOC: Test mode
 *
 * Test mode is a set of utility functions to allow drivers to
 * interact with driver-specific tools to aid, for instance,
 * factory programming.
 *
 * This chapter describes how drivers interact with it, for more
 * information see the nl80211 book's chapter on it.
 */

/**
 * cfg80211_testmode_alloc_reply_skb - allocate testmode reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * This function allocates and pre-fills an skb for a reply to
 * the testmode command. Since it is intended for a reply, calling
 * it outside of the @testmode_cmd operation is invalid.
 *
 * The returned skb is pre-filled with the wiphy index and set up in
 * a way that any data that is put into the skb (with skb_put(),
 * nla_put() or similar) will end up being within the
 * %NL80211_ATTR_TESTDATA attribute, so all that needs to be done
 * with the skb is adding data for the corresponding userspace tool
 * which can then read that data out of the testdata attribute. You
 * must not modify the skb in any other way.
 *
 * When done, call cfg80211_testmode_reply() with the skb and return
 * its error code as the result of the @testmode_cmd operation.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        struct sk_buff *reply;

        /* Data the caller adds later ends up in NL80211_ATTR_TESTDATA. */
        reply = __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_TESTMODE,
                                           NL80211_ATTR_TESTDATA,
                                           approxlen);
        return reply;
}

/**
 * cfg80211_testmode_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_reply_skb()
 *
 * Since calling this function will usually be the last thing
 * before returning from the @testmode_cmd you should return
 * the error code.  Note that this function consumes the skb
 * regardless of the return value.
 *
 * Return: An error code or 0 on success.
 */
static inline int cfg80211_testmode_reply(struct sk_buff *skb)
{
        /* Testmode replies are sent through the vendor command reply call. */
        int err = cfg80211_vendor_cmd_reply(skb);

        return err;
}

/**
 * cfg80211_testmode_alloc_event_skb - allocate testmode event
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event on the
 * testmode multicast group.
 *
 * The returned skb is set up in the same way as with
 * cfg80211_testmode_alloc_reply_skb() but prepared for an event. As
 * there, you should simply add data to it that will then end up in the
 * %NL80211_ATTR_TESTDATA attribute. Again, you must not modify the skb
 * in any other way.
 *
 * When done filling the skb, call cfg80211_testmode_event() with the
 * skb to send the event.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy, int approxlen, gfp_t gfp)
{
        /*
         * Testmode events pass no wireless_dev, port ID 0 and -1 as the
         * vendor event index to the common allocator.
         */
        struct wireless_dev *const no_wdev = NULL;
        const unsigned int portid = 0;
        const int vendor_event_idx = -1;

        return __cfg80211_alloc_event_skb(wiphy, no_wdev, NL80211_CMD_TESTMODE,
                                          NL80211_ATTR_TESTDATA, portid,
                                          vendor_event_idx, approxlen, gfp);
}

/**
 * cfg80211_testmode_event - send the event
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_event_skb()
 * @gfp: allocation flags
 *
 * This function sends the given @skb, which must have been allocated
 * by cfg80211_testmode_alloc_event_skb(), as an event. It always
 * consumes it.
 */
static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
{
        /* Same send helper that cfg80211_vendor_event() uses. */
        __cfg80211_send_event_skb(skb, gfp);
}

/*
 * Helpers for use inside a designated initializer: with
 * CONFIG_NL80211_TESTMODE enabled they expand to the ".testmode_cmd = ..."
 * and ".testmode_dump = ..." member initializers, trailing comma included.
 */
#define CFG80211_TESTMODE_CMD(cmd)        .testmode_cmd = (cmd),
#define CFG80211_TESTMODE_DUMP(cmd)        .testmode_dump = (cmd),
#else
/* Without CONFIG_NL80211_TESTMODE both helpers expand to nothing. */
#define CFG80211_TESTMODE_CMD(cmd)
#define CFG80211_TESTMODE_DUMP(cmd)
#endif

/**
 * struct cfg80211_fils_resp_params - FILS connection response params
 * @kek: KEK derived from a successful FILS connection (may be %NULL)
 * @kek_len: Length of @kek in octets
 * @update_erp_next_seq_num: Boolean value to specify whether the value in
 *        @erp_next_seq_num is valid.
 * @erp_next_seq_num: The next sequence number to use in ERP message in
 *        FILS Authentication. This value should be specified irrespective of the
 *        status for a FILS connection.
 * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL).
 * @pmk_len: Length of @pmk in octets
 * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID
 *        used for this FILS connection (may be %NULL).
 */
struct cfg80211_fils_resp_params {
        const u8 *kek;
        size_t kek_len;
        bool update_erp_next_seq_num;
        u16 erp_next_seq_num;
        const u8 *pmk;
        size_t pmk_len;
        const u8 *pmkid;
};

/**
 * struct cfg80211_connect_resp_params - Connection response params
 * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @bssid: The BSSID of the AP (may be %NULL)
 * @bss: Entry of bss to which STA got connected, can be obtained through
 *        cfg80211_get_bss() (may be %NULL). But it is recommended to store the
 *        bss from the connect_request and hold a reference to it and return
 *        through this param to avoid a warning if the bss is expired during the
 *        connection, esp. for those drivers implementing connect op.
 *        Only one parameter among @bssid and @bss needs to be specified.
 * @req_ie: Association request IEs (may be %NULL)
 * @req_ie_len: Association request IEs length
 * @resp_ie: Association response IEs (may be %NULL)
 * @resp_ie_len: Association response IEs length
 * @fils: FILS connection response parameters.
 * @timeout_reason: Reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 */
struct cfg80211_connect_resp_params {
        int status;
        const u8 *bssid;
        struct cfg80211_bss *bss;
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;
        enum nl80211_timeout_reason timeout_reason;
};

/**
 * cfg80211_connect_done - notify cfg80211 of connection result
 *
 * @dev: network device
 * @params: connection response parameters
 * @gfp: allocation flags
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_bss(), but takes a structure pointer for connection response
 * parameters. Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(),
 * and cfg80211_connect_done() should be called.
 */
void cfg80211_connect_done(struct net_device *dev,
                           struct cfg80211_connect_resp_params *params,
                           gfp_t gfp);

/**
 * cfg80211_connect_bss - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @bss: Entry of bss to which STA got connected, can be obtained through
 *        cfg80211_get_bss() (may be %NULL). But it is recommended to store the
 *        bss from the connect_request and hold a reference to it and return
 *        through this param to avoid a warning if the bss is expired during the
 *        connection, esp. for those drivers implementing connect op.
 *        Only one parameter among @bssid and @bss needs to be specified.
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_result(), but with the option of identifying the exact bss
 * entry for the connection. Only one of the functions among
 * cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
                     struct cfg80211_bss *bss, const u8 *req_ie,
                     size_t req_ie_len, const u8 *resp_ie,
                     size_t resp_ie_len, int status, gfp_t gfp,
                     enum nl80211_timeout_reason timeout_reason)
{
        /* Members not named here (notably .fils) are zero-initialized. */
        struct cfg80211_connect_resp_params params = {
                .status = status,
                .bssid = bssid,
                .bss = bss,
                .req_ie = req_ie,
                .req_ie_len = req_ie_len,
                .resp_ie = resp_ie,
                .resp_ie_len = resp_ie_len,
                .timeout_reason = timeout_reason,
        };

        cfg80211_connect_done(dev, &params, gfp);
}

/**
 * cfg80211_connect_result - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures.
 * @gfp: allocation flags
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_bss() which allows the exact bss entry to be specified. Only
 * one of the functions among cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
                        const u8 *req_ie, size_t req_ie_len,
                        const u8 *resp_ie, size_t resp_ie_len,
                        u16 status, gfp_t gfp)
{
        /* This variant identifies the AP by BSSID only; no bss entry given. */
        struct cfg80211_bss *const no_bss = NULL;

        cfg80211_connect_bss(dev, bssid, no_bss,
                             req_ie, req_ie_len,
                             resp_ie, resp_ie_len,
                             status, gfp, NL80211_TIMEOUT_UNSPECIFIED);
}

/**
 * cfg80211_connect_timeout - notify cfg80211 of connection timeout
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout.
 *
 * It should be called by the underlying driver whenever connect() has failed
 * in a sequence where no explicit authentication/association rejection was
 * received from the AP. This could happen, e.g., due to not being able to send
 * out the Authentication or Association Request frame or timing out while
 * waiting for the response. Only one of the functions among
 * cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
                         const u8 *req_ie, size_t req_ie_len, gfp_t gfp,
                         enum nl80211_timeout_reason timeout_reason)
{
        /*
         * A timeout is passed on as status -1, with no bss entry and no
         * association response IEs.
         */
        const int timed_out = -1;

        cfg80211_connect_bss(dev, bssid, NULL,
                             req_ie, req_ie_len,
                             NULL, 0,
                             timed_out, gfp, timeout_reason);
}

/**
 * struct cfg80211_roam_info - driver initiated roaming information
 *
 * @channel: the channel of the new AP
 * @bss: entry of bss to which STA got roamed (may be %NULL if @bssid is set)
 * @bssid: the BSSID of the new AP (may be %NULL if @bss is set)
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @fils: FILS related roaming information.
 */
struct cfg80211_roam_info {
        struct ieee80211_channel *channel;
        struct cfg80211_bss *bss;
        const u8 *bssid;
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;
};

/**
 * cfg80211_roamed - notify cfg80211 of roaming
 *
 * @dev: network device
 * @info: information about the new BSS. struct &cfg80211_roam_info.
 * @gfp: allocation flags
 *
 * This function may be called with the driver passing either the BSSID of the
 * new AP or passing the bss entry to avoid a race in timeout of the bss entry.
 * It should be called by the underlying driver whenever it roamed from one AP
 * to another while connected. Drivers which have roaming implemented in
 * firmware should pass the bss entry to avoid a race in bss entry timeout where
 * the bss entry of the new AP is seen in the driver, but gets timed out by the
 * time it is accessed in __cfg80211_roamed() due to delay in scheduling
 * rdev->event_work. In case of any failures, the reference is released
 * either in cfg80211_roamed() or in __cfg80211_roamed(). Otherwise, it will be
 * released while disconnecting from the current bss.
 */
void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
                     gfp_t gfp);

/**
 * cfg80211_port_authorized - notify cfg80211 of successful security association
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @gfp: allocation flags
 *
 * This function should be called by a driver that supports 4 way handshake
 * offload after a security association was successfully established (i.e.,
 * the 4 way handshake was completed successfully). The call to this function
 * should be preceded with a call to cfg80211_connect_result(),
 * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to
 * indicate the 802.11 association.
 */
void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
                              gfp_t gfp);

/**
 * cfg80211_disconnected - notify cfg80211 that connection was dropped
 *
 * @dev: network device
 * @ie: information elements of the deauth/disassoc frame (may be %NULL)
 * @ie_len: length of IEs
 * @reason: reason code for the disconnection, set it to 0 if unknown
 * @locally_generated: disconnection was requested locally
 * @gfp: allocation flags
 *
 * After it calls this function, the driver should enter an idle state
 * and not try to connect to any AP any more.
 */
void cfg80211_disconnected(struct net_device *dev, u16 reason,
                           const u8 *ie, size_t ie_len,
                           bool locally_generated, gfp_t gfp);

/**
 * cfg80211_ready_on_channel - notification of remain_on_channel start
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @duration: Duration in milliseconds that the driver intends to remain on the
 *        channel
 * @gfp: allocation flags
 */
void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie,
                               struct ieee80211_channel *chan,
                               unsigned int duration, gfp_t gfp);

/**
 * cfg80211_remain_on_channel_expired - remain_on_channel duration expired
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @gfp: allocation flags
 */
void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
                                        struct ieee80211_channel *chan,
                                        gfp_t gfp);

/**
 * cfg80211_tx_mgmt_expired - tx_mgmt duration expired
 * @wdev: wireless device
 * @cookie: the requested cookie
 * @chan: The current channel (from tx_mgmt request)
 * @gfp: allocation flags
 */
void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
                              struct ieee80211_channel *chan, gfp_t gfp);

/**
 * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics.
 *
 * @sinfo: the station information
 * @gfp: allocation flags
 */
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_sinfo_release_content - release contents of station info
 * @sinfo: the station information
 *
 * Releases any potentially allocated sub-information of the station
 * information, but not the struct itself (since it's typically on
 * the stack.)
 */
static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
{
        /* Only the pertid allocation is freed; @sinfo itself is not. */
        kfree(sinfo->pertid);
}

/**
 * cfg80211_new_sta - notify userspace about station
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @sinfo: the station information
 * @gfp: allocation flags
 */
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
                      struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta_sinfo - notify userspace about deletion of a station
 * @dev: the netdev
 * @mac_addr: the station's address
 * @sinfo: the station information/statistics
 * @gfp: allocation flags
 */
void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
                            struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta - notify userspace about deletion of a station
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @gfp: allocation flags
 */
static inline void cfg80211_del_sta(struct net_device *dev,
                                    const u8 *mac_addr, gfp_t gfp)
{
        /* No station information/statistics accompany this notification. */
        struct station_info *const no_sinfo = NULL;

        cfg80211_del_sta_sinfo(dev, mac_addr, no_sinfo, gfp);
}

/**
 * cfg80211_conn_failed - connection request failed notification
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @reason: the reason for connection failure
 * @gfp: allocation flags
 *
 * Whenever a station tries to connect to an AP and if the station
 * could not connect to the AP as the AP has rejected the connection
 * for some reasons, this function is called.
 *
 * The reason for connection failure can be any of the value from
 * nl80211_connect_failed_reason enum
 */
void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
                          enum nl80211_connect_failed_reason reason,
                          gfp_t gfp);

/**
 * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in KHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
                          const u8 *buf, size_t len, u32 flags);

/**
 * cfg80211_rx_mgmt - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * MHz variant of cfg80211_rx_mgmt_khz(): @freq is converted with
 * MHZ_TO_KHZ() and all other arguments are forwarded unchanged.
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq,
                                    int sig_dbm, const u8 *buf, size_t len,
                                    u32 flags)
{
        int freq_khz = MHZ_TO_KHZ(freq);

        return cfg80211_rx_mgmt_khz(wdev, freq_khz, sig_dbm, buf, len, flags);
}

/**
 * cfg80211_mgmt_tx_status - notification of TX status for management frame
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a management frame was requested to be
 * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
 * transmission attempt.
 */
void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
                             const u8 *buf, size_t len, bool ack, gfp_t gfp);

/**
 * cfg80211_control_port_tx_status - notification of TX status for control
 *                                   port frames
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::tx_control_port()
 * @buf: Data frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a control port frame was requested to be
 * transmitted with cfg80211_ops::tx_control_port() to report the TX status of
 * the transmission attempt.
 */
void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
                                     const u8 *buf, size_t len, bool ack,
                                     gfp_t gfp);

/**
 * cfg80211_rx_control_port - notification about a received control port frame
 * @dev: The device the frame matched to
 * @skb: The skbuf with the control port frame.  It is assumed that the skbuf
 *        is 802.3 formatted (with 802.3 header).  The skb can be non-linear.
 *        This function does not take ownership of the skb, so the caller is
 *        responsible for any cleanup.  The caller must also ensure that
 *        skb->protocol is set appropriately.
 * @unencrypted: Whether the frame was received unencrypted
 *
 * This function is used to inform userspace about a received control port
 * frame.  It should only be used if userspace indicated it wants to receive
 * control port frames over nl80211.
 *
 * The frame is the data portion of the 802.3 or 802.11 data frame with all
 * network layer headers removed (e.g. the raw EAPoL frame).
 *
 * Return: %true if the frame was passed to userspace
 */
bool cfg80211_rx_control_port(struct net_device *dev,
                              struct sk_buff *skb, bool unencrypted);

/**
 * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
 * @dev: network device
 * @rssi_event: the triggered RSSI event
 * @rssi_level: new RSSI level value or 0 if not available
 * @gfp: context flags
 *
 * This function is called when a configured connection quality monitoring
 * rssi threshold reached event occurs.
 */
void cfg80211_cqm_rssi_notify(struct net_device *dev,
                              enum nl80211_cqm_rssi_threshold_event rssi_event,
                              s32 rssi_level, gfp_t gfp);

/**
 * cfg80211_cqm_pktloss_notify - notify userspace about packetloss to peer
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost -- should be a fixed threshold
 *        but probably no less than maybe 50, or maybe a throughput dependent
 *        threshold (to account for temporary interference)
 * @gfp: context flags
 */
void cfg80211_cqm_pktloss_notify(struct net_device *dev,
                                 const u8 *peer, u32 num_packets, gfp_t gfp);

/**
 * cfg80211_cqm_txe_notify - TX error rate event
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost
 * @rate: % of packets which failed transmission
 * @intvl: interval (in s) over which the TX failure threshold was breached.
 * @gfp: context flags
 *
 * Notify userspace when configured % TX failures over number of packets in a
 * given interval is exceeded.
 */
void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer,
                             u32 num_packets, u32 rate, u32 intvl, gfp_t gfp);

/**
 * cfg80211_cqm_beacon_loss_notify - beacon loss event
 * @dev: network device
 * @gfp: context flags
 *
 * Notify userspace about beacon loss from the connected AP.
 */
void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp);

/**
 * cfg80211_radar_event - radar detection event
 * @wiphy: the wiphy
 * @chandef: chandef for the current channel
 * @gfp: context flags
 *
 * This function is called when a radar is detected on the current channel.
 */
void cfg80211_radar_event(struct wiphy *wiphy,
                          struct cfg80211_chan_def *chandef, gfp_t gfp);

/**
 * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event
 * @dev: network device
 * @mac: MAC address of a station which opmode got modified
 * @sta_opmode: station's current opmode value
 * @gfp: context flags
 *
 * Driver should call this function when station's opmode modified via action
 * frame.
 */
void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
                                       struct sta_opmode_info *sta_opmode,
                                       gfp_t gfp);

/**
 * cfg80211_cac_event - Channel availability check (CAC) event
 * @netdev: network device
 * @chandef: chandef for the current channel
 * @event: type of event
 * @gfp: context flags
 *
 * This function is called when a Channel availability check (CAC) is finished
 * or aborted. This must be called to notify the completion of a CAC process,
 * also by full-MAC drivers.
 */
void cfg80211_cac_event(struct net_device *netdev,
                        const struct cfg80211_chan_def *chandef,
                        enum nl80211_radar_event event, gfp_t gfp);


/**
 * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying
 * @dev: network device
 * @bssid: BSSID of AP (to avoid races)
 * @replay_ctr: new replay counter
 * @gfp: allocation flags
 */
void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
                               const u8 *replay_ctr, gfp_t gfp);

/**
 * cfg80211_pmksa_candidate_notify - notify about PMKSA caching candidate
 * @dev: network device
 * @index: candidate index (the smaller the index, the higher the priority)
 * @bssid: BSSID of AP
 * @preauth: Whether AP advertises support for RSN pre-authentication
 * @gfp: allocation flags
 */
void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
                                     const u8 *bssid, bool preauth, gfp_t gfp);

/**
 * cfg80211_rx_spurious_frame - inform userspace about a spurious frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * a spurious class 3 frame was received, to be able to deauth the
 * sender.
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_spurious_frame(struct net_device *dev,
                                const u8 *addr, gfp_t gfp);

/**
 * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * an associated station sent a 4addr frame but that wasn't expected.
 * It is allowed and desirable to send this event only once for each
 * station to avoid event flooding.
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
                                        const u8 *addr, gfp_t gfp);

/**
 * cfg80211_probe_status - notify userspace about probe status
 * @dev: the device the probe was sent on
 * @addr: the address of the peer
 * @cookie: the cookie filled in @probe_client previously
 * @acked: indicates whether probe was acked or not
 * @ack_signal: signal strength (in dBm) of the ACK frame.
 * @is_valid_ack_signal: indicates the ack_signal is valid or not.
 * @gfp: allocation flags
 */
void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
                           u64 cookie, bool acked, s32 ack_signal,
                           bool is_valid_ack_signal, gfp_t gfp);

/**
 * cfg80211_report_obss_beacon_khz - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on in KHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 */
void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame,
                                     size_t len, int freq, int sig_dbm);

/**
 * cfg80211_report_obss_beacon - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * MHz variant of cfg80211_report_obss_beacon_khz(): @freq is converted
 * with MHZ_TO_KHZ() before being handed on.
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 */
static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy,
                                               const u8 *frame, size_t len,
                                               int freq, int sig_dbm)
{
        int freq_khz = MHZ_TO_KHZ(freq);

        cfg80211_report_obss_beacon_khz(wiphy, frame, len, freq_khz, sig_dbm);
}

/**
 * cfg80211_reg_can_beacon - check if beaconing is allowed
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.)
 */
bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
                             struct cfg80211_chan_def *chandef,
                             enum nl80211_iftype iftype);

/**
 * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.). This version
 * also checks if IR-relaxation conditions apply, to allow beaconing under
 * more permissive conditions.
 *
 * Requires the RTNL to be held.
 */
bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
                                   struct cfg80211_chan_def *chandef,
                                   enum nl80211_iftype iftype);

/**
 * cfg80211_ch_switch_notify - update wdev channel and notify userspace
 * @dev: the device which switched channels
 * @chandef: the new channel definition
 *
 * Caller must acquire wdev_lock, therefore must only be called from sleepable
 * driver context!
 */
void cfg80211_ch_switch_notify(struct net_device *dev,
                               struct cfg80211_chan_def *chandef);

/**
 * cfg80211_ch_switch_started_notify - notify channel switch start
 * @dev: the device on which the channel switch started
 * @chandef: the future channel definition
 * @count: the number of TBTTs until the channel switch happens
 *
 * Inform the userspace about the channel switch that has just
 * started, so that it can take appropriate actions (eg. starting
 * channel switch on other vifs), if necessary.
 */
void cfg80211_ch_switch_started_notify(struct net_device *dev,
                                       struct cfg80211_chan_def *chandef,
                                       u8 count);

/**
 * ieee80211_operating_class_to_band - convert operating class to band
 *
 * @operating_class: the operating class to convert
 * @band: band pointer to fill
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_operating_class_to_band(u8 operating_class,
                                       enum nl80211_band *band);

/**
 * ieee80211_chandef_to_operating_class - convert chandef to operation class
 *
 * @chandef: the chandef to convert
 * @op_class: a pointer to the resulting operating class
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
                                          u8 *op_class);

/**
 * ieee80211_chandef_to_khz - convert chandef to frequency in KHz
 *
 * @chandef: the chandef to convert
 *
 * The result is @chandef->center_freq1 scaled by MHZ_TO_KHZ() plus
 * @chandef->freq1_offset.
 *
 * Return: the center frequency of chandef (1st segment) in KHz.
 */
static inline u32
ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef)
{
        u32 khz = MHZ_TO_KHZ(chandef->center_freq1);

        khz += chandef->freq1_offset;
        return khz;
}

/**
 * cfg80211_tdls_oper_request - request userspace to perform TDLS operation
 * @dev: the device on which the operation is requested
 * @peer: the MAC address of the peer device
 * @oper: the requested TDLS operation (NL80211_TDLS_SETUP or
 *        NL80211_TDLS_TEARDOWN)
 * @reason_code: the reason code for teardown request
 * @gfp: allocation flags
 *
 * This function is used to request userspace to perform TDLS operation that
 * requires knowledge of keys, i.e., link setup or teardown when the AP
 * connection uses encryption. This is optional mechanism for the driver to use
 * if it can automatically determine when a TDLS link could be useful (e.g.,
 * based on traffic and signal strength for a peer).
 */
void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
                                enum nl80211_tdls_operation oper,
                                u16 reason_code, gfp_t gfp);

/**
 * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units)
 * @rate: given rate_info to calculate bitrate from
 *
 * Return: the bitrate in 100Kbps units, or 0 if MCS index >= 32
 */
u32 cfg80211_calculate_bitrate(struct rate_info *rate);

/**
 * cfg80211_unregister_wdev - remove the given wdev
 * @wdev: struct wireless_dev to remove
 *
 * Call this function only for wdevs that have no netdev assigned,
 * e.g. P2P Devices. It removes the device from the list so that
 * it can no longer be used. It is necessary to call this function
 * even when cfg80211 requests the removal of the interface by
 * calling the del_virtual_intf() callback. The function must also
 * be called when the driver wishes to unregister the wdev, e.g.
 * when the device is unbound from the driver.
 *
 * Requires the RTNL to be held.
 */
void cfg80211_unregister_wdev(struct wireless_dev *wdev);

/**
 * struct cfg80211_ft_event_params - FT Information Elements
 * @ies: FT IEs
 * @ies_len: length of the FT IE in bytes (size of the @ies buffer)
 * @target_ap: target AP's MAC address
 * @ric_ies: RIC IE
 * @ric_ies_len: length of the RIC IE in bytes (size of the @ric_ies buffer)
 *
 * Parameter block passed to cfg80211_ft_event(). All pointer members are
 * const, i.e. the data is not modified through this structure.
 */
struct cfg80211_ft_event_params {
        const u8 *ies;
        size_t ies_len;
        const u8 *target_ap;
        const u8 *ric_ies;
        size_t ric_ies_len;
};

/**
 * cfg80211_ft_event - notify userspace about FT IE and RIC IE
 * @netdev: network device
 * @ft_event: IE information
 */
void cfg80211_ft_event(struct net_device *netdev,
                       struct cfg80211_ft_event_params *ft_event);

/**
 * cfg80211_get_p2p_attr - find and copy a P2P attribute from IE buffer
 * @ies: the input IE buffer
 * @len: the input length
 * @attr: the attribute ID to find
 * @buf: output buffer, can be %NULL if the data isn't needed, e.g.
 *        if the function is only called to get the needed buffer size
 * @bufsize: size of the output buffer
 *
 * The function finds a given P2P attribute in the (vendor) IEs and
 * copies its contents to the given buffer.
 *
 * Return: A negative error code (-%EILSEQ or -%ENOENT) if the data is
 * malformed or the attribute can't be found (respectively), or the
 * length of the found attribute (which can be zero).
 */
int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
                          enum ieee80211_p2p_attr_id attr,
                          u8 *buf, unsigned int bufsize);

/**
 * ieee80211_ie_split_ric - split an IE buffer according to ordering (with RIC)
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @after_ric: array IE types that come after the RIC element
 * @n_after_ric: size of the @after_ric array
 * @offset: offset where to start splitting in the buffer
 *
 * This function splits an IE buffer by updating the @offset
 * variable to point to the location where the buffer should be
 * split.
 *
 * It assumes that the given IE buffer is well-formed, this
 * has to be guaranteed by the caller!
 *
 * It also assumes that the IEs in the buffer are ordered
 * correctly, if not the result of using this function will not
 * be ordered correctly either, i.e. it does no reordering.
 *
 * The function returns the offset where the next part of the
 * buffer starts, which may be @ielen if the entire (remainder)
 * of the buffer should be used.
 */
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
                              const u8 *ids, int n_ids,
                              const u8 *after_ric, int n_after_ric,
                              size_t offset);

/**
 * ieee80211_ie_split - split an IE buffer according to ordering
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @offset: offset where to start splitting in the buffer
 *
 * Same as ieee80211_ie_split_ric(), called with an empty "after RIC"
 * list (%NULL array, zero entries).
 *
 * The given IE buffer is assumed to be well-formed; this has to be
 * guaranteed by the caller!
 *
 * The IEs in the buffer are also assumed to be ordered correctly. If
 * they are not, the result of using this function will not be ordered
 * correctly either, i.e. no reordering is done.
 *
 * Return: the offset where the next part of the buffer starts, which
 * may be @ielen if the entire (remainder) of the buffer should be used.
 */
static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
                                        const u8 *ids, int n_ids, size_t offset)
{
        /* No RIC handling requested: hand over an empty after-RIC list. */
        const u8 *after_ric = NULL;
        int n_after_ric = 0;

        return ieee80211_ie_split_ric(ies, ielen, ids, n_ids,
                                      after_ric, n_after_ric, offset);
}

/**
 * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN
 * @wdev: the wireless device reporting the wakeup
 * @wakeup: the wakeup report
 * @gfp: allocation flags
 *
 * This function reports that the given device woke up. If it
 * caused the wakeup, report the reason(s), otherwise you may
 * pass %NULL as the @wakeup parameter to advertise that something
 * else caused the wakeup.
 */
void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
                                   struct cfg80211_wowlan_wakeup *wakeup,
                                   gfp_t gfp);

/**
 * cfg80211_crit_proto_stopped() - indicate critical protocol stopped by driver.
 *
 * @wdev: the wireless device for which critical protocol is stopped.
 * @gfp: allocation flags
 *
 * This function can be called by the driver to indicate it has reverted
 * operation back to normal. One reason could be that the duration given
 * by .crit_proto_start() has expired.
 */
void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp);

/**
 * ieee80211_get_num_supported_channels - get number of channels device has
 * @wiphy: the wiphy
 *
 * Return: the number of channels supported by the device.
 */
unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);

/**
 * cfg80211_check_combinations - check interface combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 *
 * This function can be called by the driver to check whether a
 * combination of interfaces and their types are allowed according to
 * the interface combinations.
 */
int cfg80211_check_combinations(struct wiphy *wiphy,
                                struct iface_combination_params *params);

/**
 * cfg80211_iter_combinations - iterate over matching combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 * @iter: function to call for each matching combination
 * @data: pointer to pass to iter function
 *
 * This function can be called by the driver to check what possible
 * combinations it fits in at a given moment, e.g. for channel switching
 * purposes.
 */
int cfg80211_iter_combinations(struct wiphy *wiphy,
                               struct iface_combination_params *params,
                               void (*iter)(const struct ieee80211_iface_combination *c,
                                            void *data),
                               void *data);

/**
 * cfg80211_stop_iface - trigger interface disconnection
 *
 * @wiphy: the wiphy
 * @wdev: wireless device
 * @gfp: context flags
 *
 * Trigger interface to be stopped as if AP was stopped, IBSS/mesh left, STA
 * disconnected.
 *
 * Note: This doesn't need any locks and is asynchronous.
 */
void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
                         gfp_t gfp);

/**
 * cfg80211_shutdown_all_interfaces - shut down all interfaces for a wiphy
 * @wiphy: the wiphy to shut down
 *
 * This function shuts down all interfaces belonging to this wiphy by
 * calling dev_close() (and treating non-netdev interfaces as needed).
 * It shouldn't really be used unless there are some fatal device errors
 * that really can't be recovered in any other way.
 *
 * Callers must hold the RTNL and be able to deal with callbacks into
 * the driver while the function is running.
 */
void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy);

/**
 * wiphy_ext_feature_set - set the extended feature flag
 *
 * @wiphy: the wiphy to modify.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features): @ftidx / 8 selects the byte and
 * @ftidx % 8 the bit within that byte.
 */
static inline void wiphy_ext_feature_set(struct wiphy *wiphy,
                                         enum nl80211_ext_feature_index ftidx)
{
        wiphy->ext_features[ftidx / 8] |= BIT(ftidx % 8);
}

/**
 * wiphy_ext_feature_isset - check the extended feature flag
 *
 * @wiphy: the wiphy to modify.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features)
 */
static inline bool
wiphy_ext_feature_isset(struct wiphy *wiphy,
                        enum nl80211_ext_feature_index ftidx)
{
        u8 ft_byte;

        ft_byte = wiphy->ext_features[ftidx / 8];
        return (ft_byte & BIT(ftidx % 8)) != 0;
}

/**
 * cfg80211_free_nan_func - free NAN function
 * @f: NAN function that should be freed
 *
 * Frees the NAN function and all its allocated members.
 */
void cfg80211_free_nan_func(struct cfg80211_nan_func *f);

/**
 * struct cfg80211_nan_match_params - NAN match parameters
 * @type: the type of the function that triggered a match. If it is
 *         %NL80211_NAN_FUNC_SUBSCRIBE it means that we replied to a subscriber.
 *         If it is %NL80211_NAN_FUNC_PUBLISH, it means that we got a discovery
 *         result.
 *         If it is %NL80211_NAN_FUNC_FOLLOW_UP, we received a follow up.
 * @inst_id: the local instance id
 * @peer_inst_id: the instance id of the peer's function
 * @addr: the MAC address of the peer
 * @info_len: the length of @info
 * @info: the Service Specific Info from the peer (if any)
 * @cookie: unique identifier of the corresponding function
 *
 * Passed to cfg80211_nan_match() to describe the match being reported.
 */
struct cfg80211_nan_match_params {
        enum nl80211_nan_function_type type;
        u8 inst_id;
        u8 peer_inst_id;
        const u8 *addr;
        u8 info_len;
        const u8 *info;
        u64 cookie;
};

/**
 * cfg80211_nan_match - report a match for a NAN function.
 * @wdev: the wireless device reporting the match
 * @match: match notification parameters
 * @gfp: allocation flags
 *
 * This function reports that a NAN function had a match. This
 * can be a subscribe that had a match or a solicited publish that
 * was sent. It can also be a follow up that was received.
 */
void cfg80211_nan_match(struct wireless_dev *wdev,
                        struct cfg80211_nan_match_params *match, gfp_t gfp);

/**
 * cfg80211_nan_func_terminated - notify about NAN function termination.
 *
 * @wdev: the wireless device reporting the match
 * @inst_id: the local instance id
 * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
 * @cookie: unique NAN function identifier
 * @gfp: allocation flags
 *
 * This function reports that a NAN function is terminated.
 */
void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
                                  u8 inst_id,
                                  enum nl80211_nan_func_term_reason reason,
                                  u64 cookie, gfp_t gfp);

/* ethtool helper */
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);

/**
 * cfg80211_external_auth_request - userspace request for authentication
 * @netdev: network device
 * @params: External authentication parameters
 * @gfp: allocation flags
 * Returns: 0 on success, < 0 on error
 */
int cfg80211_external_auth_request(struct net_device *netdev,
                                   struct cfg80211_external_auth_params *params,
                                   gfp_t gfp);

/**
 * cfg80211_pmsr_report - report peer measurement result data
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @result: the result data
 * @gfp: allocation flags
 */
void cfg80211_pmsr_report(struct wireless_dev *wdev,
                          struct cfg80211_pmsr_request *req,
                          struct cfg80211_pmsr_result *result,
                          gfp_t gfp);

/**
 * cfg80211_pmsr_complete - report peer measurement completed
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @gfp: allocation flags
 *
 * Report that the entire measurement completed, after this
 * the request pointer will no longer be valid.
 */
void cfg80211_pmsr_complete(struct wireless_dev *wdev,
                            struct cfg80211_pmsr_request *req,
                            gfp_t gfp);

/**
 * cfg80211_iftype_allowed - check whether the interface can be allowed
 * @wiphy: the wiphy
 * @iftype: interface type
 * @is_4addr: use_4addr flag, must be '0' when check_swif is '1'
 * @check_swif: check iftype against software interfaces
 *
 * Check whether the interface is allowed to operate; additionally, this API
 * can be used to check iftype against the software interfaces when
 * check_swif is '1'.
 */
bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
                             bool is_4addr, u8 check_swif);


/* Logging, debugging and troubleshooting/diagnostic helpers. */

/*
 * wiphy_printk helpers, similar to dev_printk.
 *
 * Every macro below takes a wiphy pointer plus a printf-style format and
 * arguments, and forwards them to the dev_*() logging macro with the same
 * suffix, passing the address of the wiphy's dev member (&(wiphy)->dev) as
 * the device.
 */

/* Generic form: the caller supplies the log level explicitly. */
#define wiphy_printk(level, wiphy, format, args...)                \
        dev_printk(level, &(wiphy)->dev, format, ##args)
/* One wrapper per dev_<level>() macro. */
#define wiphy_emerg(wiphy, format, args...)                        \
        dev_emerg(&(wiphy)->dev, format, ##args)
#define wiphy_alert(wiphy, format, args...)                        \
        dev_alert(&(wiphy)->dev, format, ##args)
#define wiphy_crit(wiphy, format, args...)                        \
        dev_crit(&(wiphy)->dev, format, ##args)
#define wiphy_err(wiphy, format, args...)                        \
        dev_err(&(wiphy)->dev, format, ##args)
#define wiphy_warn(wiphy, format, args...)                        \
        dev_warn(&(wiphy)->dev, format, ##args)
#define wiphy_notice(wiphy, format, args...)                        \
        dev_notice(&(wiphy)->dev, format, ##args)
#define wiphy_info(wiphy, format, args...)                        \
        dev_info(&(wiphy)->dev, format, ##args)

/* Wrappers around the rate-limited dev_*() variants. */
#define wiphy_err_ratelimited(wiphy, format, args...)                \
        dev_err_ratelimited(&(wiphy)->dev, format, ##args)
#define wiphy_warn_ratelimited(wiphy, format, args...)                \
        dev_warn_ratelimited(&(wiphy)->dev, format, ##args)

/* wiphy_debug() goes through wiphy_printk() with KERN_DEBUG ... */
#define wiphy_debug(wiphy, format, args...)                        \
        wiphy_printk(KERN_DEBUG, wiphy, format, ##args)

/* ... whereas wiphy_dbg() maps onto dev_dbg(). */
#define wiphy_dbg(wiphy, format, args...)                        \
        dev_dbg(&(wiphy)->dev, format, ##args)

/*
 * wiphy_vdbg() - verbose debug output.
 *
 * With VERBOSE_DEBUG defined this is simply an alias for wiphy_dbg().
 * Otherwise it expands to a statement expression that evaluates to 0: the
 * wiphy_printk() call sits under "if (0)", so it is never executed, but its
 * arguments are still seen (and checked) by the compiler.
 */
#if defined(VERBOSE_DEBUG)
#define wiphy_vdbg        wiphy_dbg
#else
#define wiphy_vdbg(wiphy, format, args...)                                \
({                                                                        \
        if (0)                                                                \
                wiphy_printk(KERN_DEBUG, wiphy, format, ##args);        \
        0;                                                                \
})
#endif

/*
 * wiphy_WARN() acts like wiphy_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "wiphy: <name>\n", where <name> comes from
 * wiphy_name(wiphy); @format and the remaining arguments follow.
 *
 * The expansion carries no trailing semicolon: the caller supplies it, so
 * the macro behaves like a single expression and can be used as the body
 * of an unbraced if/else.
 */
#define wiphy_WARN(wiphy, format, args...)                        \
        WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args)

/**
 * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space
 * @netdev: network device
 * @owe_info: peer's owe info
 * @gfp: allocation flags
 */
void cfg80211_update_owe_info_event(struct net_device *netdev,
                                    struct cfg80211_update_owe_info *owe_info,
                                    gfp_t gfp);

/**
 * cfg80211_bss_flush - resets all the scan entries
 * @wiphy: the wiphy
 */
void cfg80211_bss_flush(struct wiphy *wiphy);

#endif /* __NET_CFG80211_H */

































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fs-verity: read-only file-based authenticity protection
 *
 * This header declares the interface between the fs/verity/ support layer and
 * filesystems that support fs-verity.
 *
 * Copyright 2019 Google LLC
 */

#ifndef _LINUX_FSVERITY_H
#define _LINUX_FSVERITY_H

#include <linux/fs.h>
#include <uapi/linux/fsverity.h>

/*
 * Verity operations for filesystems.
 *
 * Table of callbacks that a filesystem supporting fs-verity provides to the
 * fs/verity/ support layer.  The contract of each callback (arguments,
 * locking, return value) is documented directly above the member.
 */
struct fsverity_operations {

        /**
         * Begin enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         *
         * The filesystem must do any needed filesystem-specific preparations
         * for enabling verity, e.g. evicting inline data.  It also must return
         * -EBUSY if verity is already being enabled on the given file.
         *
         * i_rwsem is held for write.
         *
         * Return: 0 on success, -errno on failure
         */
        int (*begin_enable_verity)(struct file *filp);

        /**
         * End enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         * @desc: the verity descriptor to write, or NULL on failure
         * @desc_size: size of verity descriptor, or 0 on failure
         * @merkle_tree_size: total bytes the Merkle tree took up
         *
         * If desc == NULL, then enabling verity failed and the filesystem only
         * must do any necessary cleanups.  Else, it must also store the given
         * verity descriptor to a fs-specific location associated with the inode
         * and do any fs-specific actions needed to mark the inode as a verity
         * inode, e.g. setting a bit in the on-disk inode.  The filesystem is
         * also responsible for setting the S_VERITY flag in the VFS inode.
         *
         * i_rwsem is held for write, but it may have been dropped between
         * ->begin_enable_verity() and ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*end_enable_verity)(struct file *filp, const void *desc,
                                 size_t desc_size, u64 merkle_tree_size);

        /**
         * Get the verity descriptor of the given inode.
         *
         * @inode: an inode with the S_VERITY flag set
         * @buf: buffer in which to place the verity descriptor
         * @bufsize: size of @buf, or 0 to retrieve the size only
         *
         * If bufsize == 0, then the size of the verity descriptor is returned.
         * Otherwise the verity descriptor is written to 'buf' and its actual
         * size is returned; -ERANGE is returned if it's too large.  This may be
         * called by multiple processes concurrently on the same inode.
         *
         * Return: the size on success, -errno on failure
         */
        int (*get_verity_descriptor)(struct inode *inode, void *buf,
                                     size_t bufsize);

        /**
         * Read a Merkle tree page of the given inode.
         *
         * @inode: the inode
         * @index: 0-based index of the page within the Merkle tree
         * @num_ra_pages: The number of Merkle tree pages that should be
         *                  prefetched starting at @index if the page at @index
         *                  isn't already cached.  Implementations may ignore this
         *                  argument; it's only a performance optimization.
         *
         * This can be called at any time on an open verity file, as well as
         * between ->begin_enable_verity() and ->end_enable_verity().  It may be
         * called by multiple processes concurrently, even with the same page.
         *
         * Note that this must retrieve a *page*, not necessarily a *block*.
         *
         * Return: the page on success, ERR_PTR() on failure
         */
        struct page *(*read_merkle_tree_page)(struct inode *inode,
                                              pgoff_t index,
                                              unsigned long num_ra_pages);

        /**
         * Write a Merkle tree block to the given inode.
         *
         * @inode: the inode for which the Merkle tree is being built
         * @buf: block to write
         * @index: 0-based index of the block within the Merkle tree
         * @log_blocksize: log base 2 of the Merkle tree block size
         *
         * This is only called between ->begin_enable_verity() and
         * ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
                                       u64 index, int log_blocksize);
};

#ifdef CONFIG_FS_VERITY

static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        struct fsverity_info *vi;

        /*
         * ->i_verity_info can be published by another task at any moment
         * through the cmpxchg_release() in fsverity_set_info().  Reading it
         * with smp_load_acquire() is the ACQUIRE half of that pairing, so
         * the memory the publisher initialized is safely visible to us.
         */
        vi = smp_load_acquire(&inode->i_verity_info);

        return vi;
}

/* enable.c */

int fsverity_ioctl_enable(struct file *filp, const void __user *arg);

/* measure.c */

int fsverity_ioctl_measure(struct file *filp, void __user *arg);

/* open.c */

int fsverity_file_open(struct inode *inode, struct file *filp);
int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
void fsverity_cleanup_inode(struct inode *inode);

/* verify.c */

bool fsverity_verify_page(struct page *page);
void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);

#else /* !CONFIG_FS_VERITY */

/* !CONFIG_FS_VERITY stub: there is never any verity info, so return NULL. */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        return NULL;
}

/* enable.c */

/* !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP. */
static inline int fsverity_ioctl_enable(struct file *filp,
                                        const void __user *arg)
{
        return -EOPNOTSUPP;
}

/* measure.c */

/* !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP. */
static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* open.c */

/*
 * !CONFIG_FS_VERITY stub: opening an inode for which IS_VERITY() is true
 * fails with -EOPNOTSUPP; any other inode opens successfully (0).
 */
static inline int fsverity_file_open(struct inode *inode, struct file *filp)
{
        if (IS_VERITY(inode))
                return -EOPNOTSUPP;

        return 0;
}

/*
 * !CONFIG_FS_VERITY stub: refuses (-EOPNOTSUPP) when the dentry's inode
 * satisfies IS_VERITY(), and returns 0 otherwise.  @attr is not examined.
 */
static inline int fsverity_prepare_setattr(struct dentry *dentry,
                                           struct iattr *attr)
{
        if (IS_VERITY(d_inode(dentry)))
                return -EOPNOTSUPP;

        return 0;
}

/* !CONFIG_FS_VERITY stub: nothing to clean up, so this is a no-op. */
static inline void fsverity_cleanup_inode(struct inode *inode)
{
}

/* verify.c */

/*
 * !CONFIG_FS_VERITY stub: unconditionally warns and reports failure.
 * NOTE(review): presumably this should be unreachable without fs-verity
 * support, hence the WARN_ON(1) -- confirm against the callers.
 */
static inline bool fsverity_verify_page(struct page *page)
{
        WARN_ON(1);
        return false;
}

/*
 * !CONFIG_FS_VERITY stub: only emits a warning; @bio is not touched.
 * NOTE(review): presumably unreachable without fs-verity support -- confirm.
 */
static inline void fsverity_verify_bio(struct bio *bio)
{
        WARN_ON(1);
}

/*
 * !CONFIG_FS_VERITY stub: only emits a warning; @work is never queued.
 * NOTE(review): presumably unreachable without fs-verity support -- confirm.
 */
static inline void fsverity_enqueue_verify_work(struct work_struct *work)
{
        WARN_ON(1);
}

#endif        /* !CONFIG_FS_VERITY */

/**
 * fsverity_active() - do reads from the inode need to go through fs-verity?
 * @inode: inode to check
 *
 * The answer is derived from ->i_verity_info (via fsverity_get_info()): a
 * non-NULL value means verity is active for this inode.
 *
 * Filesystems call this from ->readpages() to decide whether pages must be
 * verified.  IS_VERITY() must not be used for that decision: S_VERITY is set
 * before ->i_verity_info, so a read racing with the completion of
 * FS_IOC_ENABLE_VERITY could observe the flag without the info.
 *
 * Return: true if reads need to go through fs-verity, otherwise false
 */
static inline bool fsverity_active(const struct inode *inode)
{
        const struct fsverity_info *vi = fsverity_get_info(inode);

        return vi != NULL;
}

#endif        /* _LINUX_FSVERITY_H */


















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016 Qualcomm Atheros, Inc
 *
 * Based on net/sched/sch_fq_codel.c
 */
#ifndef __NET_SCHED_FQ_IMPL_H
#define __NET_SCHED_FQ_IMPL_H

#include <net/fq.h>

/* functions that are embedded into includer */

/*
 * Undo the accounting for @skb after it has been taken out of @flow: the
 * fq-wide packet count and memory usage, the flow's byte backlog, and the
 * byte/packet backlog of the tin the flow currently belongs to.
 */
static void fq_adjust_removal(struct fq *fq,
                              struct fq_flow *flow,
                              struct sk_buff *skb)
{
        /* global counters */
        fq->memory_usage -= skb->truesize;
        fq->backlog--;

        /* per-flow counter */
        flow->backlog -= skb->len;

        /* counters of the owning tin */
        flow->tin->backlog_packets--;
        flow->tin->backlog_bytes -= skb->len;
}

static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow)
{
        struct fq_flow *i;

        if (flow->backlog == 0) {
                list_del_init(&flow->backlogchain);
        } else {
                i = flow;

                list_for_each_entry_continue(i, &fq->backlogs, backlogchain)
                        if (i->backlog < flow->backlog)
                                break;

                list_move_tail(&flow->backlogchain,
                               &i->backlogchain);
        }
}

/*
 * Take the first skb off @flow's queue, fix up the accounting and re-position
 * the flow on the backlog list.  Must be called with fq->lock held.
 *
 * Returns the dequeued skb, or NULL when the flow's queue is empty (in which
 * case nothing else is touched).
 */
static struct sk_buff *fq_flow_dequeue(struct fq *fq,
                                       struct fq_flow *flow)
{
        struct sk_buff *head;

        lockdep_assert_held(&fq->lock);

        head = __skb_dequeue(&flow->queue);
        if (head) {
                fq_adjust_removal(fq, flow, head);
                fq_rejigger_backlog(fq, flow);
        }

        return head;
}

/*
 * Pick the next skb to transmit from @tin.  Must be called with fq->lock held.
 *
 * Flows on tin->new_flows are served before those on tin->old_flows.  A flow
 * whose deficit is used up gets fq->quantum added and is rotated to the end of
 * old_flows.  A flow for which @dequeue_func yields nothing is either rotated
 * to old_flows (if it came from new_flows and old_flows is not empty, so the
 * old flows get a turn) or dropped from the tin altogether.
 *
 * Returns the skb, after charging its length to the flow's deficit and the
 * tin's tx counters, or NULL once both flow lists are empty.
 */
static struct sk_buff *fq_tin_dequeue(struct fq *fq,
                                      struct fq_tin *tin,
                                      fq_tin_dequeue_t dequeue_func)
{
        struct list_head *head;
        struct fq_flow *flow;
        struct sk_buff *skb;

        lockdep_assert_held(&fq->lock);

        for (;;) {
                head = &tin->new_flows;
                if (list_empty(head)) {
                        head = &tin->old_flows;
                        if (list_empty(head))
                                return NULL;
                }

                flow = list_first_entry(head, struct fq_flow, flowchain);

                if (flow->deficit <= 0) {
                        /* out of credit: refill and send to the back */
                        flow->deficit += fq->quantum;
                        list_move_tail(&flow->flowchain, &tin->old_flows);
                        continue;
                }

                skb = dequeue_func(fq, tin, flow);
                if (skb)
                        break;

                /* force a pass through old_flows to prevent starvation */
                if (head == &tin->new_flows && !list_empty(&tin->old_flows)) {
                        list_move_tail(&flow->flowchain, &tin->old_flows);
                } else {
                        list_del_init(&flow->flowchain);
                        flow->tin = NULL;
                }
        }

        flow->deficit -= skb->len;
        tin->tx_bytes += skb->len;
        tin->tx_packets++;

        return skb;
}

/*
 * Map @skb to a flow index: its skb_get_hash() value scaled into the range
 * given by fq->flows_cnt with reciprocal_scale().
 */
static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb)
{
        return reciprocal_scale(skb_get_hash(skb), fq->flows_cnt);
}

/*
 * Choose the flow that @skb should be queued on for @tin.  Must be called
 * with fq->lock held.
 *
 * The candidate is fq->flows[@idx].  If that flow is already owned by a
 * different tin, @get_default_func supplies the flow to use instead and the
 * collision is counted on both @tin and @fq.  If the chosen flow has no tin
 * yet, @tin's flow count is bumped (the caller is the one that assigns
 * flow->tin).
 */
static struct fq_flow *fq_flow_classify(struct fq *fq,
                                        struct fq_tin *tin, u32 idx,
                                        struct sk_buff *skb,
                                        fq_flow_get_default_t get_default_func)
{
        struct fq_flow *flow = &fq->flows[idx];

        lockdep_assert_held(&fq->lock);

        if (flow->tin != NULL && flow->tin != tin) {
                flow = get_default_func(fq, tin, idx, skb);
                tin->collisions++;
                fq->collisions++;
        }

        if (flow->tin == NULL)
                tin->flows++;

        return flow;
}

static void fq_recalc_backlog(struct fq *fq,
                              struct fq_tin *tin,
                              struct fq_flow *flow)
{
        struct fq_flow *i;

        if (list_empty(&flow->backlogchain))
                list_add_tail(&flow->backlogchain, &fq->backlogs);

        i = flow;
        list_for_each_entry_continue_reverse(i, &fq->backlogs,
                                             backlogchain)
                if (i->backlog > flow->backlog)
                        break;

        list_move(&flow->backlogchain, &i->backlogchain);
}

/*
 * Queue @skb on the flow selected for (@tin, @idx) and then enforce the
 * packet and memory limits.  Must be called with fq->lock held.
 *
 * While fq->backlog exceeds fq->limit, or memory usage exceeds
 * fq->memory_limit, one skb at a time is dequeued from the first flow on
 * fq->backlogs and handed to @free_func, with the overlimit/overmemory
 * counters updated.  Once the memory check has passed it is not re-evaluated.
 */
static void fq_tin_enqueue(struct fq *fq,
                           struct fq_tin *tin, u32 idx,
                           struct sk_buff *skb,
                           fq_skb_free_t free_func,
                           fq_flow_get_default_t get_default_func)
{
        struct fq_flow *flow;
        bool oom;

        lockdep_assert_held(&fq->lock);

        flow = fq_flow_classify(fq, tin, idx, skb, get_default_func);

        /* charge the new skb to the fq, the tin and the flow */
        fq->backlog++;
        fq->memory_usage += skb->truesize;
        tin->backlog_packets++;
        tin->backlog_bytes += skb->len;
        flow->backlog += skb->len;
        flow->tin = tin;

        fq_recalc_backlog(fq, tin, flow);

        /* a flow that is on neither flow list starts out as a new flow */
        if (list_empty(&flow->flowchain)) {
                flow->deficit = fq->quantum;
                list_add_tail(&flow->flowchain, &tin->new_flows);
        }

        __skb_queue_tail(&flow->queue, skb);

        oom = fq->memory_usage > fq->memory_limit;
        while (oom || fq->backlog > fq->limit) {
                struct fq_flow *victim;
                struct sk_buff *dropped;

                victim = list_first_entry_or_null(&fq->backlogs,
                                                  struct fq_flow,
                                                  backlogchain);
                if (!victim)
                        return;

                dropped = fq_flow_dequeue(fq, victim);
                if (!dropped)
                        return;

                free_func(fq, victim->tin, victim, dropped);

                victim->tin->overlimit++;
                fq->overlimit++;
                if (oom) {
                        fq->overmemory++;
                        oom = fq->memory_usage > fq->memory_limit;
                }
        }
}

/*
 * Walk @flow's queue and remove every skb for which @filter_func returns
 * true: the skb is unlinked, its accounting is undone and it is passed to
 * @free_func.  The flow is re-positioned on the backlog list once at the end.
 * Must be called with fq->lock held.
 */
static void fq_flow_filter(struct fq *fq,
                           struct fq_flow *flow,
                           fq_skb_filter_t filter_func,
                           void *filter_data,
                           fq_skb_free_t free_func)
{
        struct sk_buff *cur, *next;
        struct fq_tin *tin = flow->tin;

        lockdep_assert_held(&fq->lock);

        skb_queue_walk_safe(&flow->queue, cur, next) {
                if (filter_func(fq, tin, flow, cur, filter_data)) {
                        __skb_unlink(cur, &flow->queue);
                        fq_adjust_removal(fq, flow, cur);
                        free_func(fq, tin, flow, cur);
                }
        }

        fq_rejigger_backlog(fq, flow);
}

/*
 * Apply fq_flow_filter() to every flow of @tin: first those on new_flows,
 * then those on old_flows.  Must be called with fq->lock held.
 */
static void fq_tin_filter(struct fq *fq,
                          struct fq_tin *tin,
                          fq_skb_filter_t filter_func,
                          void *filter_data,
                          fq_skb_free_t free_func)
{
        struct fq_flow *cur;

        lockdep_assert_held(&fq->lock);

        list_for_each_entry(cur, &tin->new_flows, flowchain) {
                fq_flow_filter(fq, cur, filter_func, filter_data, free_func);
        }

        list_for_each_entry(cur, &tin->old_flows, flowchain) {
                fq_flow_filter(fq, cur, filter_func, filter_data, free_func);
        }
}

/*
 * Empty @flow: every queued skb is dequeued and handed to @free_func, the
 * flow is taken off the flow list and the backlog list if it is on them, and
 * its tin pointer is cleared.  Warns once if a backlog remains afterwards.
 */
static void fq_flow_reset(struct fq *fq,
                          struct fq_flow *flow,
                          fq_skb_free_t free_func)
{
        struct sk_buff *skb;

        for (;;) {
                skb = fq_flow_dequeue(fq, flow);
                if (!skb)
                        break;

                free_func(fq, flow->tin, flow, skb);
        }

        if (!list_empty(&flow->flowchain))
                list_del_init(&flow->flowchain);

        if (!list_empty(&flow->backlogchain))
                list_del_init(&flow->backlogchain);

        flow->tin = NULL;

        WARN_ON_ONCE(flow->backlog);
}

/*
 * Reset every flow attached to @tin.  The first flow of new_flows (or of
 * old_flows once new_flows is empty) is reset, which also unlinks it, until
 * both lists are empty.  Warns once if the tin still reports a byte or packet
 * backlog afterwards.
 */
static void fq_tin_reset(struct fq *fq,
                         struct fq_tin *tin,
                         fq_skb_free_t free_func)
{
        struct list_head *head;
        struct fq_flow *flow;

        for (;;) {
                if (!list_empty(&tin->new_flows))
                        head = &tin->new_flows;
                else if (!list_empty(&tin->old_flows))
                        head = &tin->old_flows;
                else
                        break;

                flow = list_first_entry(head, struct fq_flow, flowchain);
                fq_flow_reset(fq, flow, free_func);
        }

        WARN_ON_ONCE(tin->backlog_bytes);
        WARN_ON_ONCE(tin->backlog_packets);
}

/*
 * Initialize the list linkage and the skb queue of @flow.  No other member
 * is written here.
 */
static void fq_flow_init(struct fq_flow *flow)
{
        __skb_queue_head_init(&flow->queue);
        INIT_LIST_HEAD(&flow->backlogchain);
        INIT_LIST_HEAD(&flow->flowchain);
}

/* Initialize the (empty) new/old flow lists of @tin. */
static void fq_tin_init(struct fq_tin *tin)
{
        INIT_LIST_HEAD(&tin->old_flows);
        INIT_LIST_HEAD(&tin->new_flows);
}

/*
 * Set up @fq from scratch: zero it, initialize the lock and backlog list,
 * apply the default quantum (300), packet limit (8192) and memory limit
 * (16 MBytes), and allocate and initialize the flow table.  The table has
 * @flows_cnt entries, but never fewer than one.
 *
 * Returns 0 on success or -ENOMEM if the flow table cannot be allocated.
 */
static int fq_init(struct fq *fq, int flows_cnt)
{
        int i;

        memset(fq, 0, sizeof(*fq));

        spin_lock_init(&fq->lock);
        INIT_LIST_HEAD(&fq->backlogs);

        fq->memory_limit = 16 << 20; /* 16 MBytes */
        fq->limit = 8192;
        fq->quantum = 300;
        fq->flows_cnt = max_t(u32, flows_cnt, 1);

        fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL);
        if (!fq->flows)
                return -ENOMEM;

        for (i = 0; i < fq->flows_cnt; i++) {
                fq_flow_init(&fq->flows[i]);
        }

        return 0;
}

/*
 * Tear down @fq: reset each flow in table order (releasing queued skbs via
 * @free_func), then free the flow table and clear the pointer to it.
 */
static void fq_reset(struct fq *fq,
                     fq_skb_free_t free_func)
{
        int i = 0;

        while (i < fq->flows_cnt) {
                fq_flow_reset(fq, &fq->flows[i], free_func);
                i++;
        }

        kvfree(fq->flows);
        fq->flows = NULL;
}

#endif
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Devicescape Software, Inc.
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright(c) 2015-2017 Intel Deutschland GmbH
 * Copyright(c) 2020-2021 Intel Corporation
 */

#ifndef STA_INFO_H
#define STA_INFO_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/if_ether.h>
#include <linux/workqueue.h>
#include <linux/average.h>
#include <linux/bitfield.h>
#include <linux/etherdevice.h>
#include <linux/rhashtable.h>
#include <linux/u64_stats_sync.h>
#include "key.h"

/**
 * enum ieee80211_sta_info_flags - Station flags
 *
 * These flags are used with &struct sta_info's @flags member, but
 * only indirectly with set_sta_flag() and friends.
 *
 * The enumerators take consecutive values starting at 0, so
 * @NUM_WLAN_STA_FLAGS equals the number of flags defined before it.
 *
 * @WLAN_STA_AUTH: Station is authenticated.
 * @WLAN_STA_ASSOC: Station is associated.
 * @WLAN_STA_PS_STA: Station is in power-save mode.
 * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic.
 *        This bit is always checked so needs to be enabled for all stations
 *        when virtual port control is not in use.
 * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble
 *        frames.
 * @WLAN_STA_WDS: Station is one of our WDS peers.
 * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the
 *        IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next
 *        frame to this station is transmitted.
 * @WLAN_STA_MFP: Management frame protection is used with this STA.
 * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX)
 *        during suspend/resume and station removal.
 * @WLAN_STA_PS_DRIVER: Driver requires keeping this station in
 *        power-save mode logically to flush frames that might still
 *        be in the queues.
 * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping
 *        station in power-save mode, reply when the driver unblocks.
 * @WLAN_STA_TDLS_PEER: Station is a TDLS peer.
 * @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct
 *        packets. This means the link is enabled.
 * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this
 *        station.
 * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching.
 * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this
 *        TDLS peer.
 * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on
 *        the BSS base channel.
 * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was
 *        keeping station in power-save mode, reply when the driver
 *        unblocks the station.
 * @WLAN_STA_SP: Station is in a service period, so don't try to
 *        reply to other uAPSD trigger frames or PS-Poll.
 * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame.
 * @WLAN_STA_INSERTED: This station is inserted into the hash table.
 * @WLAN_STA_RATE_CONTROL: Rate control was initialized for this station.
 * @WLAN_STA_TOFFSET_KNOWN: Toffset calculated for this station is valid.
 * @WLAN_STA_MPSP_OWNER: Local STA is owner of a mesh Peer Service Period.
 * @WLAN_STA_MPSP_RECIPIENT: Local STA is recipient of a MPSP.
 * @WLAN_STA_PS_DELIVER: Station woke up, but we're still blocking TX
 *        until pending frames are delivered.
 * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption,
 *        so drop all packets without a key later.
 *
 * @NUM_WLAN_STA_FLAGS: number of defined flags
 */
enum ieee80211_sta_info_flags {
        WLAN_STA_AUTH,
        WLAN_STA_ASSOC,
        WLAN_STA_PS_STA,
        WLAN_STA_AUTHORIZED,
        WLAN_STA_SHORT_PREAMBLE,
        WLAN_STA_WDS,
        WLAN_STA_CLEAR_PS_FILT,
        WLAN_STA_MFP,
        WLAN_STA_BLOCK_BA,
        WLAN_STA_PS_DRIVER,
        WLAN_STA_PSPOLL,
        WLAN_STA_TDLS_PEER,
        WLAN_STA_TDLS_PEER_AUTH,
        WLAN_STA_TDLS_INITIATOR,
        WLAN_STA_TDLS_CHAN_SWITCH,
        WLAN_STA_TDLS_OFF_CHANNEL,
        WLAN_STA_TDLS_WIDER_BW,
        WLAN_STA_UAPSD,
        WLAN_STA_SP,
        WLAN_STA_4ADDR_EVENT,
        WLAN_STA_INSERTED,
        WLAN_STA_RATE_CONTROL,
        WLAN_STA_TOFFSET_KNOWN,
        WLAN_STA_MPSP_OWNER,
        WLAN_STA_MPSP_RECIPIENT,
        WLAN_STA_PS_DELIVER,
        WLAN_STA_USES_ENCRYPTION,

        NUM_WLAN_STA_FLAGS,
};

/*
 * Aggregation timing/retry constants; the two time values are in jiffies
 * (expressed via HZ).
 * NOTE(review): the users of these constants are outside this header;
 * presumably ADDBA_RESP_INTERVAL bounds the wait for a peer's ADDBA
 * response and the HT_AGG_*RETRIES* values govern session setup retries --
 * confirm against the aggregation code.
 */
#define ADDBA_RESP_INTERVAL HZ
#define HT_AGG_MAX_RETRIES                15
#define HT_AGG_BURST_RETRIES                3
#define HT_AGG_RETRIES_PERIOD                (15 * HZ)

/*
 * Aggregation session states.  The kerneldoc of struct tid_ampdu_tx refers
 * to these for its @state member and describes them as flags of that word
 * (e.g. "the state has the flag %HT_AGG_STATE_OPERATIONAL set").
 */
#define HT_AGG_STATE_DRV_READY                0
#define HT_AGG_STATE_RESPONSE_RECEIVED        1
#define HT_AGG_STATE_OPERATIONAL        2
#define HT_AGG_STATE_STOPPING                3
#define HT_AGG_STATE_WANT_START                4
#define HT_AGG_STATE_WANT_STOP                5
#define HT_AGG_STATE_START_CB                6
#define HT_AGG_STATE_STOP_CB                7
#define HT_AGG_STATE_SENT_ADDBA                8

/*
 * Instantiate the "avg_signal" EWMA type and helpers via DECLARE_EWMA().
 * NOTE(review): the meaning of the numeric arguments (10, 8) is defined by
 * DECLARE_EWMA() in <linux/average.h>, not here -- consult that header.
 */
DECLARE_EWMA(avg_signal, 10, 8)
/*
 * Reasons for stopping an aggregation session.
 * NOTE(review): semantics inferred from the enumerator names only (declined,
 * local request, peer request, station being destroyed) -- confirm against
 * the code that passes these values.
 */
enum ieee80211_agg_stop_reason {
        AGG_STOP_DECLINED,
        AGG_STOP_LOCAL_REQUEST,
        AGG_STOP_PEER_REQUEST,
        AGG_STOP_DESTROY_STA,
};

/* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */
#define AIRTIME_USE_TX                BIT(0)
#define AIRTIME_USE_RX                BIT(1)

/*
 * Airtime accounting state: accumulated RX and TX airtime, a signed deficit,
 * an atomic estimate of the airtime of frames still pending, and a low/high
 * pair of AQL limits.
 * NOTE(review): units and the object that embeds this struct are not visible
 * in this part of the header -- confirm before relying on them.
 */
struct airtime_info {
        u64 rx_airtime;
        u64 tx_airtime;
        s64 deficit;
        atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
        u32 aql_limit_low;
        u32 aql_limit_high;
};

/*
 * Forward declarations.  They precede the prototype below so that the struct
 * tags it uses have file scope even when no earlier header has declared them:
 * a tag that is first seen inside a parameter list is scoped to that
 * prototype only and would not match the real type later on.
 */
struct ieee80211_local;
struct sta_info;

/*
 * Defined outside this header.
 * NOTE(review): judging by the name and parameters this adjusts the pending
 * (estimated) TX airtime of @sta for queue/category @ac by @tx_airtime, with
 * @tx_completed telling whether the frame has completed -- confirm against
 * the definition.
 */
void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
                                          struct sta_info *sta, u8 ac,
                                          u16 tx_airtime, bool tx_completed);

/**
 * struct tid_ampdu_tx - TID aggregation information (Tx).
 *
 * @rcu_head: rcu head for freeing structure
 * @session_timer: check if we keep Tx-ing on the TID (by timeout value)
 * @addba_resp_timer: timer for peer's response to addba request
 * @pending: pending frames queue -- use sta's spinlock to protect
 * @sta: station we are attached to
 * @dialog_token: dialog token for aggregation session
 * @timeout: session timeout value to be filled in ADDBA requests
 * @tid: TID number
 * @state: session state (see above)
 * @last_tx: jiffies of last tx activity
 * @stop_initiator: initiator of a session stop
 * @tx_stop: TX DelBA frame when stopping
 * @buf_size: reorder buffer size at receiver
 * @ssn: NOTE(review): this member had no kerneldoc entry; presumably the
 *        starting sequence number of the session, by analogy with @ssn of
 *        &struct tid_ampdu_rx -- confirm
 * @failed_bar_ssn: ssn of the last failed BAR tx attempt
 * @bar_pending: BAR needs to be re-sent
 * @amsdu: support A-MSDU within A-MPDU
 *
 * This structure's lifetime is managed by RCU, assignments to
 * the array holding it must hold the aggregation mutex.
 *
 * The TX path can access it under RCU lock-free if, and
 * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL
 * set. Otherwise, the TX path must also acquire the spinlock
 * and re-check the state, see comments in the tx code
 * touching it.
 */
struct tid_ampdu_tx {
        struct rcu_head rcu_head;
        struct timer_list session_timer;
        struct timer_list addba_resp_timer;
        struct sk_buff_head pending;
        struct sta_info *sta;
        unsigned long state;
        unsigned long last_tx;
        u16 timeout;
        u8 dialog_token;
        u8 stop_initiator;
        bool tx_stop;
        u16 buf_size;
        u16 ssn;

        u16 failed_bar_ssn;
        bool bar_pending;
        bool amsdu;
        u8 tid;
};

/**
 * struct tid_ampdu_rx - TID aggregation information (Rx).
 *
 * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
 *        A-MSDU with individually reported subframes.
 * @reorder_buf_filtered: bitmap indicating where there are filtered frames in
 *        the reorder buffer that should be ignored when releasing frames
 * @reorder_time: jiffies when skb was added
 * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
 * @reorder_timer: releases expired frames from the reorder buffer.
 * @sta: station we are attached to
 * @last_rx: jiffies of last rx activity
 * @head_seq_num: head sequence number in reordering buffer.
 * @stored_mpdu_num: number of MPDUs in reordering buffer
 * @ssn: Starting Sequence Number expected to be aggregated.
 * @buf_size: buffer size for incoming A-MPDUs
 * @timeout: reset timer value (in TUs).
 * @tid: TID number
 * @rcu_head: RCU head used for freeing this struct
 * @reorder_lock: serializes access to reorder buffer, see below.
 * @auto_seq: used for offloaded BA sessions to automatically pick
 *        head_seq_num and ssn.
 * @removed: this session is removed (but might have been found due to RCU)
 * @started: this session has started (head ssn or higher was received)
 *
 * This structure's lifetime is managed by RCU, assignments to
 * the array holding it must hold the aggregation mutex.
 *
 * The @reorder_lock is used to protect the members of this
 * struct, except for @timeout, @buf_size and @dialog_token,
 * which are constant across the lifetime of the struct (the
 * dialog token being used only for debugging).
 *
 * Note that the dialog token is not a member of this struct; the
 * per-TID tokens are kept in &struct sta_ampdu_mlme (@tid_rx_token).
 */
struct tid_ampdu_rx {
        struct rcu_head rcu_head;
        spinlock_t reorder_lock;
        u64 reorder_buf_filtered;
        struct sk_buff_head *reorder_buf;
        unsigned long *reorder_time;
        struct sta_info *sta;
        struct timer_list session_timer;
        struct timer_list reorder_timer;
        unsigned long last_rx;
        u16 head_seq_num;
        u16 stored_mpdu_num;
        u16 ssn;
        u16 buf_size;
        u16 timeout;
        u8 tid;
        u8 auto_seq:1,
           removed:1,
           started:1;
};

/**
 * struct sta_ampdu_mlme - STA aggregation information.
 *
 * @mtx: mutex to protect all TX data (except non-NULL assignments
 *        to tid_tx[idx], which are protected by the sta spinlock)
 *        tid_start_tx is also protected by sta->lock.
 * @tid_rx: aggregation info for Rx per TID -- RCU protected
 * @tid_rx_token: dialog tokens for valid aggregation sessions
 * @tid_rx_timer_expired: bitmap indicating on which TIDs the
 *        RX timer expired until the work for it runs
 * @tid_rx_stop_requested:  bitmap indicating which BA sessions per TID the
 *        driver requested to close until the work for it runs
 * @tid_rx_manage_offl: bitmap indicating which BA sessions were requested
 *        to be treated as started/stopped due to offloading
 * @agg_session_valid: bitmap indicating which TID has a rx BA session open on
 * @unexpected_agg: bitmap indicating which TID already sent a delBA due to
 *        unexpected aggregation related frames outside a session
 * @work: work struct for starting/stopping aggregation
 * @tid_tx: aggregation info for Tx per TID
 * @tid_start_tx: sessions where start was requested
 * @last_addba_req_time: timestamp of the last addBA request.
 * @addba_req_num: number of times addBA request has been sent.
 * @dialog_token_allocator: dialog token enumerator for each new session.
 */
struct sta_ampdu_mlme {
        struct mutex mtx;
        /* rx */
        struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
        u8 tid_rx_token[IEEE80211_NUM_TIDS];
        unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        /*
         * Twice as many bits as there are TIDs.
         * NOTE(review): presumably one bit range for "start" and one for
         * "stop" requests -- confirm against the users of this bitmap.
         */
        unsigned long tid_rx_manage_offl[BITS_TO_LONGS(2 * IEEE80211_NUM_TIDS)];
        unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        /* tx */
        struct work_struct work;
        struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
        struct tid_ampdu_tx *tid_start_tx[IEEE80211_NUM_TIDS];
        unsigned long last_addba_req_time[IEEE80211_NUM_TIDS];
        u8 addba_req_num[IEEE80211_NUM_TIDS];
        u8 dialog_token_allocator;
};


/* Value to indicate no TID reservation */
#define IEEE80211_TID_UNRESERVED        0xff

/*
 * One of the terms in the size of ieee80211_fast_tx.hdr.
 * NOTE(review): presumably the maximum room, in bytes, reserved there for
 * the crypto IV/header -- confirm.
 */
#define IEEE80211_FAST_XMIT_MAX_IV        18

/**
 * struct ieee80211_fast_tx - TX fastpath information
 * @key: key to use for hw crypto
 * @hdr: the 802.11 header to put with the frame
 * @hdr_len: actual 802.11 header length
 * @sa_offs: offset of the SA
 * @da_offs: offset of the DA
 * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
 * @band: band this will be transmitted on, for tx_info
 * @rcu_head: RCU head to free this struct
 *
 * This struct is small enough so that the common case (maximum crypto
 * header length of 8 like for CCMP/GCMP) fits into a single 64-byte
 * cache line.
 */
struct ieee80211_fast_tx {
        struct ieee80211_key *key;
        u8 hdr_len;
        u8 sa_offs, da_offs, pn_offs;
        u8 band;
        /*
         * Sized for the largest header this path can build; @hdr_len says
         * how much of it is in use.
         * NOTE(review): 30 + 2 presumably covers the 802.11 header fields,
         * followed by IV room and the RFC 1042 header -- confirm.
         */
        u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
               sizeof(rfc1042_header)] __aligned(2);

        struct rcu_head rcu_head;
};

/**
 * struct ieee80211_fast_rx - RX fastpath information
 * @dev: netdevice for reporting the SKB
 * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type)
 * @vif_addr: interface address
 * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache)
 * @control_port_protocol: control port protocol copied from sdata
 * @expected_ds_bits: from/to DS bits expected
 * @icv_len: length of the MIC if present
 * @key: bool indicating encryption is expected (key is set)
 * @internal_forward: forward frames internally on AP/VLAN type interfaces
 * @uses_rss: copy of USES_RSS hw flag
 * @da_offs: offset of the DA in the header (for header conversion)
 * @sa_offs: offset of the SA in the header (for header conversion)
 * @rcu_head: RCU head for freeing this structure
 */
struct ieee80211_fast_rx {
        struct net_device *dev;
        enum nl80211_iftype vif_type;
        u8 vif_addr[ETH_ALEN] __aligned(2);
        u8 rfc1042_hdr[6] __aligned(2);
        __be16 control_port_protocol;
        __le16 expected_ds_bits;
        u8 icv_len;
        /* single-bit flags, see the kerneldoc above */
        u8 key:1,
           internal_forward:1,
           uses_rss:1;
        u8 da_offs, sa_offs;

        struct rcu_head rcu_head;
};

/*
 * Moving-average types used by struct mesh_sta below (its members are of
 * type struct ewma_mesh_fail_avg and struct ewma_mesh_tx_rate_avg).
 *
 * we use only values in the range 0-100, so pick a large precision
 *
 * NOTE(review): per linux/average.h the second argument is presumably the
 * precision and the third the weight reciprocal -- confirm.
 */
DECLARE_EWMA(mesh_fail_avg, 20, 8)
DECLARE_EWMA(mesh_tx_rate_avg, 8, 16)

/**
 * struct mesh_sta - mesh STA information
 * @plink_lock: serialize access to plink fields
 * @llid: Local link ID
 * @plid: Peer link ID
 * @aid: local aid supplied by peer
 * @reason: Cancel reason on PLINK_HOLDING state
 * @plink_retries: Retries in establishment
 * @plink_state: peer link state
 * @plink_timeout: timeout of peer link
 * @plink_timer: peer link watch timer
 * @plink_sta: peer link watch timer's sta_info
 * @t_offset: timing offset relative to this host
 * @t_offset_setpoint: reference timing offset of this sta to be used when
 *         calculating clockdrift
 * @local_pm: local link-specific power save mode
 * @peer_pm: peer-specific power save mode towards local STA
 * @nonpeer_pm: STA power save mode towards non-peer neighbors
 * @processed_beacon: set to true after peer rates and capabilities are
 *        processed
 * @connected_to_gate: true if mesh STA has a path to a mesh gate
 * @connected_to_as: true if mesh STA has a path to an authentication server
 * @fail_avg: moving percentage of failed MSDUs
 * @tx_rate_avg: moving average of tx bitrate
 */
struct mesh_sta {
        struct timer_list plink_timer;
        struct sta_info *plink_sta;

        s64 t_offset;
        s64 t_offset_setpoint;

        spinlock_t plink_lock;
        u16 llid;
        u16 plid;
        u16 aid;
        u16 reason;
        u8 plink_retries;

        bool processed_beacon;
        bool connected_to_gate;
        bool connected_to_as;

        enum nl80211_plink_state plink_state;
        u32 plink_timeout;

        /* mesh power save */
        enum nl80211_mesh_power_mode local_pm;
        enum nl80211_mesh_power_mode peer_pm;
        enum nl80211_mesh_power_mode nonpeer_pm;

        /* moving percentage of failed MSDUs */
        struct ewma_mesh_fail_avg fail_avg;
        /* moving average of tx bitrate */
        struct ewma_mesh_tx_rate_avg tx_rate_avg;
};

/* Declares struct ewma_signal, used by sta_info.rx_stats_avg below. */
DECLARE_EWMA(signal, 10, 8)

/*
 * RX statistics for one station. struct sta_info embeds one instance
 * (rx_stats) and may additionally point to per-CPU copies (pcpu_rx_stats).
 *
 * NOTE(review): the per-field notes below are inferred from names and
 * types only; confirm against the RX path before relying on them.
 */
struct ieee80211_sta_rx_stats {
        unsigned long packets;
        unsigned long last_rx;          /* presumably jiffies of last RX */
        unsigned long num_duplicates;
        unsigned long fragments;
        unsigned long dropped;
        int last_signal;
        u8 chains;                      /* presumably a chain bitmask */
        s8 chain_signal_last[IEEE80211_MAX_CHAINS];
        u32 last_rate;                  /* presumably STA_STATS_FIELD_* encoded */
        struct u64_stats_sync syncp;    /* presumably guards the u64 counters */
        u64 bytes;
        u64 msdu[IEEE80211_NUM_TIDS + 1]; /* one slot per TID plus one extra */
};

/*
 * IEEE 802.11-2016 (10.6 "Defragmentation") recommends support for "concurrent
 * reception of at least one MSDU per access category per associated STA"
 * on APs, or "at least one MSDU per access category" on other interface types.
 *
 * This limit can be increased by changing this define, at the cost of slower
 * frame reassembly and increased memory use while fragments are pending.
 */
#define IEEE80211_FRAGMENT_MAX 4

/*
 * One slot of the fragment cache.
 * NOTE(review): field meanings not covered by the inline comments are
 * inferred from the names only -- confirm against the defragmentation code.
 */
struct ieee80211_fragment_entry {
        struct sk_buff_head skb_list;   /* presumably the fragments so far */
        unsigned long first_frag_time;  /* presumably jiffies of first fragment */
        u16 seq;
        u16 extra_len;
        u16 last_frag;
        u8 rx_queue;
        u8 check_sequential_pn:1, /* needed for CCMP/GCMP */
           is_protected:1;
        u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
        unsigned int key_color;
};

/* Fixed-size set of IEEE80211_FRAGMENT_MAX reassembly slots. */
struct ieee80211_fragment_cache {
        struct ieee80211_fragment_entry        entries[IEEE80211_FRAGMENT_MAX];
        unsigned int next; /* NOTE(review): presumably the next slot to use */
};

/*
 * The bandwidth threshold below which the per-station CoDel parameters will be
 * scaled to be more lenient (to prevent starvation of slow stations). This
 * value will be scaled by the number of active stations when it is being
 * applied.
 */
#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */

/**
 * struct sta_info - STA information
 *
 * This structure collects information about a station that
 * mac80211 is communicating with.
 *
 * @list: global linked list entry
 * @free_list: list entry for keeping track of stations to free
 * @hash_node: hash node for rhashtable
 * @addr: station's MAC address - duplicated from public part to
 *        let the hash table work with just a single cacheline
 * @local: pointer to the global information
 * @sdata: virtual interface this station belongs to
 * @ptk: peer keys negotiated with this station, if any
 * @ptk_idx: last installed peer key index
 * @gtk: group keys negotiated with this station, if any
 * @rate_ctrl: rate control algorithm reference
 * @rate_ctrl_lock: spinlock used to protect rate control data
 *        (data inside the algorithm, so serializes calls there)
 * @rate_ctrl_priv: rate control private per-STA pointer
 * @lock: used for locking all fields that require locking, see comments
 *        in the header file.
 * @drv_deliver_wk: used for delivering frames after driver PS unblocking
 * @listen_interval: listen interval of this station, when we're acting as AP
 * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly
 * @ps_lock: used for powersave (when mac80211 is the AP) related locking
 * @ps_tx_buf: buffers (per AC) of frames to transmit to this station
 *        when it leaves power saving state or polls
 * @tx_filtered: buffers (per AC) of frames we already tried to
 *        transmit but were filtered by hardware due to STA having
 *        entered power saving state, these are also delivered to
 *        the station when it leaves powersave or polls for frames
 * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on
 * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on
 * @assoc_at: clock boottime (in ns) of last association
 * @last_connected: time (in seconds) when a station got connected
 * @last_seq_ctrl: last received seq/frag number from this STA (per TID
 *        plus one for non-QoS frames)
 * @tid_seq: per-TID sequence numbers for sending to this STA
 * @airtime: per-AC struct airtime_info describing airtime statistics for this
 *        station
 * @airtime_weight: station weight for airtime fairness calculation purposes
 * @ampdu_mlme: A-MPDU state machine state
 * @mesh: mesh STA information
 * @debugfs_dir: debug filesystem directory dentry
 * @dead: set to true when sta is unlinked
 * @removed: set to true when sta is being removed from sta_list
 * @uploaded: set to true when sta is uploaded to the driver
 * @sta: station information we share with the driver
 * @sta_state: duplicates information about station state (for debug)
 * @rcu_head: RCU head used for freeing this station struct
 * @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
 *        taken from HT/VHT capabilities or VHT operating mode notification
 * @known_smps_mode: the smps_mode the client thinks we are in. Relevant for
 *        AP only.
 * @cipher_scheme: optional cipher scheme for this station
 * @cparams: CoDel parameters for this station.
 * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
 * @fast_tx: TX fastpath information
 * @fast_rx: RX fastpath information
 * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
 *        the BSS one.
 * @tx_stats: TX statistics
 * @tx_stats.packets: # of packets transmitted (per AC)
 * @tx_stats.bytes: # of bytes in all packets transmitted (per AC)
 * @tx_stats.last_rate: last TX rate
 * @tx_stats.last_rate_info: last TX rate as a &struct rate_info
 *        (NOTE(review): was undocumented; confirm when each of
 *        last_rate / last_rate_info is filled in)
 * @tx_stats.msdu: # of transmitted MSDUs per TID
 * @rx_stats: RX statistics
 * @rx_stats_avg: averaged RX statistics
 * @rx_stats_avg.signal: averaged signal
 * @rx_stats_avg.chain_signal: averaged per-chain signal
 * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs
 *        this (by advertising the USES_RSS hw flag)
 * @status_stats: TX status statistics
 * @status_stats.filtered: # of filtered frames
 * @status_stats.retry_failed: # of frames that failed after retry
 * @status_stats.retry_count: # of retries attempted
 * @status_stats.lost_packets: # of lost packets
 * @status_stats.last_pkt_time: timestamp of last ACKed packet
 * @status_stats.msdu_retries: # of MSDU retries
 * @status_stats.msdu_failed: # of failed MSDUs
 * @status_stats.last_ack: last ack timestamp (jiffies)
 * @status_stats.last_ack_signal: last ACK signal
 * @status_stats.ack_signal_filled: last ACK signal validity
 * @status_stats.avg_ack_signal: average ACK signal
 * @frags: fragment cache
 */
struct sta_info {
        /* General information, mostly static */
        struct list_head list, free_list;
        struct rcu_head rcu_head;
        struct rhlist_head hash_node;
        u8 addr[ETH_ALEN];
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS +
                                        NUM_DEFAULT_MGMT_KEYS +
                                        NUM_DEFAULT_BEACON_KEYS];
        struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS];
        u8 ptk_idx;
        struct rate_control_ref *rate_ctrl;
        void *rate_ctrl_priv;
        spinlock_t rate_ctrl_lock;
        spinlock_t lock;

        struct ieee80211_fast_tx __rcu *fast_tx;
        struct ieee80211_fast_rx __rcu *fast_rx;
        struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats;

#ifdef CONFIG_MAC80211_MESH
        /* only present when mesh support is compiled in */
        struct mesh_sta *mesh;
#endif

        struct work_struct drv_deliver_wk;

        u16 listen_interval;

        bool dead;
        bool removed;

        bool uploaded;

        enum ieee80211_sta_state sta_state;

        /* use the accessors defined below */
        unsigned long _flags;

        /* STA powersave lock and frame queues */
        spinlock_t ps_lock;
        struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS];
        struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS];
        unsigned long driver_buffered_tids;
        unsigned long txq_buffered_tids;

        u64 assoc_at;
        long last_connected;

        /* Updated from RX path only, no locking requirements */
        struct ieee80211_sta_rx_stats rx_stats;
        struct {
                struct ewma_signal signal;
                struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS];
        } rx_stats_avg;

        /* Plus 1 for non-QoS frames */
        __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1];

        /* Updated from TX status path only, no locking requirements */
        struct {
                unsigned long filtered;
                unsigned long retry_failed, retry_count;
                unsigned int lost_packets;
                unsigned long last_pkt_time;
                u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
                u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
                unsigned long last_ack;
                s8 last_ack_signal;
                bool ack_signal_filled;
                struct ewma_avg_signal avg_ack_signal;
        } status_stats;

        /* Updated from TX path only, no locking requirements */
        struct {
                u64 packets[IEEE80211_NUM_ACS];
                u64 bytes[IEEE80211_NUM_ACS];
                struct ieee80211_tx_rate last_rate;
                struct rate_info last_rate_info;
                u64 msdu[IEEE80211_NUM_TIDS + 1];
        } tx_stats;
        u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];

        struct airtime_info airtime[IEEE80211_NUM_ACS];
        u16 airtime_weight;

        /*
         * Aggregation information, locked with lock.
         */
        struct sta_ampdu_mlme ampdu_mlme;

#ifdef CONFIG_MAC80211_DEBUGFS
        /* only present when mac80211 debugfs support is compiled in */
        struct dentry *debugfs_dir;
#endif

        enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;

        enum ieee80211_smps_mode known_smps_mode;
        const struct ieee80211_cipher_scheme *cipher_scheme;

        struct codel_params cparams;

        u8 reserved_tid;

        struct cfg80211_chan_def tdls_chandef;

        struct ieee80211_fragment_cache frags;

        /* keep last! */
        struct ieee80211_sta sta;
};

/*
 * Peer link state of @sta: read from sta->mesh when mesh support is
 * compiled in, otherwise always NL80211_PLINK_LISTEN.
 */
static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_MESH
        return sta->mesh->plink_state;
#else
        return NL80211_PLINK_LISTEN;
#endif
}

/*
 * Set @flag in the station's flag word.
 *
 * WLAN_STA_AUTH, WLAN_STA_ASSOC and WLAN_STA_AUTHORIZED trigger a warning
 * here (presumably because they are meant to change only through
 * sta_info_move_state(); confirm). The bit is set regardless.
 */
static inline void set_sta_flag(struct sta_info *sta,
                                enum ieee80211_sta_info_flags flag)
{
        const bool state_flag = flag == WLAN_STA_AUTH ||
                                flag == WLAN_STA_ASSOC ||
                                flag == WLAN_STA_AUTHORIZED;

        WARN_ON(state_flag);
        set_bit(flag, &sta->_flags);
}

/*
 * Clear @flag in the station's flag word.
 *
 * WLAN_STA_AUTH, WLAN_STA_ASSOC and WLAN_STA_AUTHORIZED trigger a warning
 * here (presumably because they are meant to change only through
 * sta_info_move_state(); confirm). The bit is cleared regardless.
 */
static inline void clear_sta_flag(struct sta_info *sta,
                                  enum ieee80211_sta_info_flags flag)
{
        const bool state_flag = flag == WLAN_STA_AUTH ||
                                flag == WLAN_STA_ASSOC ||
                                flag == WLAN_STA_AUTHORIZED;

        WARN_ON(state_flag);
        clear_bit(flag, &sta->_flags);
}

/* Return the test_bit() result for @flag in the station's flag word. */
static inline int test_sta_flag(struct sta_info *sta,
                                enum ieee80211_sta_info_flags flag)
{
        int is_set = test_bit(flag, &sta->_flags);

        return is_set;
}

/*
 * Clear @flag in the station's flag word and return the
 * test_and_clear_bit() result.
 *
 * WLAN_STA_AUTH, WLAN_STA_ASSOC and WLAN_STA_AUTHORIZED trigger a warning
 * here (presumably because they are meant to change only through
 * sta_info_move_state(); confirm). The operation is performed regardless.
 */
static inline int test_and_clear_sta_flag(struct sta_info *sta,
                                          enum ieee80211_sta_info_flags flag)
{
        const bool state_flag = flag == WLAN_STA_AUTH ||
                                flag == WLAN_STA_ASSOC ||
                                flag == WLAN_STA_AUTHORIZED;

        WARN_ON(state_flag);
        return test_and_clear_bit(flag, &sta->_flags);
}

/*
 * Set @flag in the station's flag word and return the
 * test_and_set_bit() result.
 *
 * WLAN_STA_AUTH, WLAN_STA_ASSOC and WLAN_STA_AUTHORIZED trigger a warning
 * here (presumably because they are meant to change only through
 * sta_info_move_state(); confirm). The operation is performed regardless.
 */
static inline int test_and_set_sta_flag(struct sta_info *sta,
                                        enum ieee80211_sta_info_flags flag)
{
        const bool state_flag = flag == WLAN_STA_AUTH ||
                                flag == WLAN_STA_ASSOC ||
                                flag == WLAN_STA_AUTHORIZED;

        WARN_ON(state_flag);
        return test_and_set_bit(flag, &sta->_flags);
}

int sta_info_move_state(struct sta_info *sta,
                        enum ieee80211_sta_state new_state);

/*
 * Move @sta to @new_state via sta_info_move_state().
 *
 * Warns (once) if the station already has WLAN_STA_INSERTED set, and
 * warns (once) if sta_info_move_state() returns non-zero; the return
 * value is otherwise discarded.
 */
static inline void sta_info_pre_move_state(struct sta_info *sta,
                                           enum ieee80211_sta_state new_state)
{
        int already_inserted = test_sta_flag(sta, WLAN_STA_INSERTED);
        int err;

        WARN_ON_ONCE(already_inserted);

        err = sta_info_move_state(sta, new_state);
        WARN_ON_ONCE(err);
}


void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
                             struct tid_ampdu_tx *tid_tx);

/*
 * Fetch sta->ampdu_mlme.tid_tx[tid] via rcu_dereference_protected().
 *
 * The lockdep condition passed along states that the caller holds either
 * sta->lock or sta->ampdu_mlme.mtx. @tid is used as an array index without
 * a range check here.
 */
static inline struct tid_ampdu_tx *
rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
{
        return rcu_dereference_protected(sta->ampdu_mlme.tid_tx[tid],
                                         lockdep_is_held(&sta->lock) ||
                                         lockdep_is_held(&sta->ampdu_mlme.mtx));
}

/* Maximum number of frames to buffer per power saving station per AC */
#define STA_MAX_TX_BUFFER        64

/* Minimum buffered frame expiry time. If STA uses listen interval that is
 * smaller than this value, the minimum value here is used instead. */
#define STA_TX_BUFFER_EXPIRE (10 * HZ)

/* How often station data is cleaned up (e.g., expiration of buffered frames)
 */
#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)

struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
                                         const u8 *addr);

/*
 * Get a STA info, must be under RCU read lock.
 */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr);

struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
                                  const u8 *addr);

/* user must hold sta_mtx or be in RCU critical section */
struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
                                       const u8 *sta_addr, const u8 *vif_addr);

/*
 * Iterate over the sta_info entries that sta_info_hash_lookup() returns
 * for @_addr, with @_sta as the cursor and @_tmp as the rhlist position.
 * Built on rhl_for_each_entry_rcu(); NOTE(review): presumably must run
 * inside an RCU read-side critical section -- confirm.
 */
#define for_each_sta_info(local, _addr, _sta, _tmp)                        \
        rhl_for_each_entry_rcu(_sta, _tmp,                                \
                               sta_info_hash_lookup(local, _addr), hash_node)

/*
 * Get STA info by index, BROKEN!
 */
struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
                                     int idx);
/*
 * Create a new STA info, caller owns returned structure
 * until sta_info_insert().
 */
struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
                                const u8 *addr, gfp_t gfp);

void sta_info_free(struct ieee80211_local *local, struct sta_info *sta);

/*
 * Insert STA info into hash table/list, returns zero or
 * -EEXIST if the same MAC address is already present.
 *
 * Calling the non-rcu version makes the caller relinquish ownership of
 * the structure; the _rcu version calls rcu_read_lock() and must be
 * called without it held.
 */
int sta_info_insert(struct sta_info *sta);
int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU);

int __must_check __sta_info_destroy(struct sta_info *sta);
int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata,
                          const u8 *addr);
int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr);

void sta_info_recalc_tim(struct sta_info *sta);

int sta_info_init(struct ieee80211_local *local);
void sta_info_stop(struct ieee80211_local *local);

/**
 * __sta_info_flush - flush matching STA entries from the STA table
 *
 * Returns the number of removed STA entries.
 *
 * @sdata: sdata to remove all stations from
 * @vlans: if the given interface is an AP interface, also flush VLANs
 */
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans);

/**
 * sta_info_flush - flush matching STA entries from the STA table
 * @sdata: sdata to remove all stations from
 *
 * Thin wrapper around __sta_info_flush() that passes false for its
 * vlans argument.
 *
 * Return: the number of removed STA entries.
 */
static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
{
        const bool flush_vlans = false;

        return __sta_info_flush(sdata, flush_vlans);
}

void sta_set_rate_info_tx(struct sta_info *sta,
                          const struct ieee80211_tx_rate *rate,
                          struct rate_info *rinfo);
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
                   bool tidstats);

u32 sta_get_expected_throughput(struct sta_info *sta);

void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
                          unsigned long exp_time);
u8 sta_info_tx_streams(struct sta_info *sta);

void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);

unsigned long ieee80211_sta_last_active(struct sta_info *sta);

/*
 * Rate type stored in the STA_STATS_FIELD_TYPE bits of an encoded rate.
 * That field is three bits wide (GENMASK(15, 13)), so values must stay
 * within 0..7. STA_STATS_RATE_TYPE_INVALID is 0, matching
 * STA_STATS_RATE_INVALID.
 *
 * sta_stats_encode_rate() in this header produces the LEGACY, HT, VHT and
 * HE types; it has no case that produces S1G.
 */
enum sta_stats_type {
        STA_STATS_RATE_TYPE_INVALID = 0,
        STA_STATS_RATE_TYPE_LEGACY,
        STA_STATS_RATE_TYPE_HT,
        STA_STATS_RATE_TYPE_VHT,
        STA_STATS_RATE_TYPE_HE,
        STA_STATS_RATE_TYPE_S1G,
};

/*
 * Bit layout of an encoded rate word (see sta_stats_encode_rate()):
 *
 *   bits  7..0   rate bits, interpreted according to TYPE:
 *                  HT:      MCS in bits 7..0
 *                  legacy:  rate index in bits 3..0, band in bits 7..4
 *                  VHT, HE: MCS in bits 3..0, NSS in bits 7..4
 *   bits 11..8   bandwidth
 *   bit  12      short GI
 *   bits 15..13  rate type (enum sta_stats_type)
 *   bits 18..16  HE RU
 *   bits 20..19  HE GI
 *   bit  21      HE DCM
 */
#define STA_STATS_FIELD_HT_MCS                GENMASK( 7,  0)
#define STA_STATS_FIELD_LEGACY_IDX        GENMASK( 3,  0)
#define STA_STATS_FIELD_LEGACY_BAND        GENMASK( 7,  4)
#define STA_STATS_FIELD_VHT_MCS                GENMASK( 3,  0)
#define STA_STATS_FIELD_VHT_NSS                GENMASK( 7,  4)
#define STA_STATS_FIELD_HE_MCS                GENMASK( 3,  0)
#define STA_STATS_FIELD_HE_NSS                GENMASK( 7,  4)
#define STA_STATS_FIELD_BW                GENMASK(11,  8)
#define STA_STATS_FIELD_SGI                GENMASK(12, 12)
#define STA_STATS_FIELD_TYPE                GENMASK(15, 13)
#define STA_STATS_FIELD_HE_RU                GENMASK(18, 16)
#define STA_STATS_FIELD_HE_GI                GENMASK(20, 19)
#define STA_STATS_FIELD_HE_DCM                GENMASK(21, 21)

/*
 * Build (STA_STATS_FIELD) or extract (STA_STATS_GET) the field whose mask
 * is STA_STATS_FIELD_<_n>; _n is the mask name without that prefix.
 */
#define STA_STATS_FIELD(_n, _v)                FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
#define STA_STATS_GET(_n, _v)                FIELD_GET(STA_STATS_FIELD_ ## _n, _v)

/* Encoded value returned by sta_stats_encode_rate() for an unknown encoding */
#define STA_STATS_RATE_INVALID                0

static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
{
        u32 r;

        r = STA_STATS_FIELD(BW, s->bw);

        if (s->enc_flags & RX_ENC_FLAG_SHORT_GI)
                r |= STA_STATS_FIELD(SGI, 1);

        switch (s->encoding) {
        case RX_ENC_VHT:
                r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_VHT);
                r |= STA_STATS_FIELD(VHT_NSS, s->nss);
                r |= STA_STATS_FIELD(VHT_MCS, s->rate_idx);
                break;
        case RX_ENC_HT:
                r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HT);
                r |= STA_STATS_FIELD(HT_MCS, s->rate_idx);
                break;
        case RX_ENC_LEGACY:
                r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_LEGACY);
                r |= STA_STATS_FIELD(LEGACY_BAND, s->band);
                r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx);
                break;
        case RX_ENC_HE:
                r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE);
                r |= STA_STATS_FIELD(HE_NSS, s->nss);
                r |= STA_STATS_FIELD(HE_MCS, s->rate_idx);
                r |= STA_STATS_FIELD(HE_GI, s->he_gi);
                r |= STA_STATS_FIELD(HE_RU, s->he_ru);
                r |= STA_STATS_FIELD(HE_DCM, s->he_dcm);
                break;
        default:
                WARN_ON(1);
                return STA_STATS_RATE_INVALID;
        }

        return r;
}

#endif /* STA_INFO_H */






























































































































    1 






    1 






    1 


























































































































































































































































































































































































































































































































































































































































































































































    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * Render an inode state/flags bitmask as a "|"-separated list of the
 * I_* flag names in the table below, using __print_flags().
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"}                \
        )

/* enums need to be exported to user space */
#undef EM
#undef EMe
/* First pass: every table entry expands to TRACE_DEFINE_ENUM() of its enum. */
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

/*
 * Table pairing each WB_REASON_* value with the string printed for it.
 * EM() is used for every entry except the last, which uses EMe().
 */
#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EMe(WB_REASON_FORKER_THREAD,                "forker_thread")

/* Expand the table once with the TRACE_DEFINE_ENUM() flavour of EM()/EMe(). */
WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 *
 * EM() emits a trailing comma and EMe() does not, so WB_WORK_REASON expands
 * to a comma-separated list of { value, "name" } pairs, which is passed to
 * __print_symbolic() further down.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

struct wb_writeback_work;

/*
 * writeback_page_template - event class for per-page events
 * @page:    page whose ->index is recorded
 * @mapping: address space used to find the bdi and host inode; may be NULL
 *
 * Records the bdi device name, the host inode number and the page index.
 * When @mapping is NULL the name is looked up from a NULL bdi; the inode
 * number is recorded as 0 when @mapping or @mapping->host is NULL.
 */
DECLARE_EVENT_CLASS(writeback_page_template,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = page->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* writeback_dirty_page: instance of writeback_page_template. */
DEFINE_EVENT(writeback_page_template, writeback_dirty_page,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping)
);

/* wait_on_page_writeback: instance of writeback_page_template. */
DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping)
);

/*
 * writeback_dirty_inode_template - event class for inode dirtying events
 * @inode: inode whose bdi name, i_ino and i_state are recorded
 * @flags: flag bitmask recorded alongside the inode state
 *
 * Both the stored state and the stored flags are decoded with
 * show_inode_state() when the event is printed.
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/* writeback_mark_inode_dirty: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode_start: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

/* Return the inode number of the cgroup behind @wb's memcg css. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return cgroup_ino(wb->memcg_css->cgroup);
}

/*
 * Return the cgroup inode number for @wbc's writeback context, or 1 when
 * @wbc has no bdi_writeback attached.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return wbc->wb ? __trace_wb_assign_cgroup(wbc->wb) : 1;
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Without cgroup writeback every bdi_writeback reports cgroup inode 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return 1;
}

/* Without cgroup writeback every writeback_control reports cgroup inode 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return 1;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * inode_foreign_history - record an inode's foreign-writeback history value
 * @inode:   inode whose bdi name and i_ino are recorded
 * @wbc:     writeback control used to derive the cgroup inode number
 * @history: value stored as-is and printed in hex
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * inode_switch_wbs - record an inode together with two bdi_writebacks
 * @inode:  inode whose i_ino is recorded
 * @old_wb: first bdi_writeback; its bdi supplies the recorded device name
 * @new_wb: second bdi_writeback
 *
 * The cgroup inode numbers of both @old_wb and @new_wb are recorded.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * track_foreign_dirty - record a page together with a bdi_writeback
 * @page: page whose mapping host and mem_cgroup are examined
 * @wb:   bdi_writeback supplying the bdi name/id, memcg css id and cgroup
 *
 * The inode number is recorded as 0 when the page has no mapping.
 * page_cgroup_ino is the cgroup inode number of the page's own mem_cgroup,
 * recorded next to the cgroup inode number derived from @wb.
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct page *page, struct bdi_writeback *wb),

        TP_ARGS(page, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = page_mapping(page);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * flush_foreign - record a bdi_writeback with a foreign bdi/memcg id pair
 * @wb:           bdi_writeback supplying the bdi name and cgroup inode number
 * @frn_bdi_id:   value stored as-is in frn_bdi_id
 * @frn_memcg_id: value stored as-is in frn_memcg_id
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * writeback_write_inode_template - event class for inode write events
 * @inode: inode whose bdi name and i_ino are recorded
 * @wbc:   writeback control supplying sync_mode and the cgroup inode number
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_write_inode_start: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/* writeback_write_inode: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * writeback_work_class - event class describing one wb_writeback_work item
 * @wb:   bdi_writeback supplying the bdi name and cgroup inode number
 * @work: work item whose nr_pages, sb, sync_mode, for_kupdate,
 *        range_cyclic, for_background and reason are recorded
 *
 * sb_dev is recorded as 0 when @work->sb is NULL.  The reason is printed
 * symbolically through the WB_WORK_REASON table.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/* Declare event @name as an instance of writeback_work_class. */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
/* The five events sharing the writeback_work_class layout. */
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/*
 * writeback_pages_written - record a single page count
 * @pages_written: value stored in the "pages" field and printed as "%ld"
 */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * writeback_class - event class identifying a bdi_writeback
 * @wb: bdi_writeback whose bdi name and cgroup inode number are recorded
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/* Declare event @name as an instance of writeback_class. */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

/* writeback_wake_background: instance of writeback_class. */
DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/*
 * writeback_bdi_register - record the device name of a backing_dev_info
 * @bdi: backing device whose name (via bdi_dev_name()) is recorded
 */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * wbc_class - event class capturing a snapshot of a writeback_control
 * @wbc: writeback control whose nr_to_write, pages_skipped, sync_mode,
 *       for_kupdate, for_background, for_reclaim, range_cyclic,
 *       range_start and range_end are recorded
 * @bdi: backing device whose name is recorded
 *
 * range_start and range_end are cast to long when stored and are printed
 * in hex.  The cgroup inode number is derived from @wbc.
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, for_reclaim)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
                "start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
);

/* Declare event @name as an instance of wbc_class. */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
/* wbc_writepage: instance of wbc_class. */
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * writeback_queue_io - record one queue_io pass over a bdi_writeback
 * @wb:             bdi_writeback supplying the bdi name and cgroup inode number
 * @work:           work item whose reason is recorded
 * @dirtied_before: jiffies value stored as "older"
 * @moved:          count stored as-is and printed as "enqueue"
 *
 * "age" is the distance from @dirtied_before to the current jiffies,
 * converted to milliseconds when the event is recorded.
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * global_dirty_state - snapshot of global dirty/writeback page counters
 * @background_thresh: value stored as-is and printed as "bg_thresh"
 * @dirty_thresh:      value stored as-is and printed as "thresh"
 *
 * The remaining fields are sampled at record time: the NR_FILE_DIRTY,
 * NR_WRITEBACK, NR_DIRTIED and NR_WRITTEN node page counters, and
 * global_wb_domain.dirty_limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * Scale x by 2^(PAGE_SHIFT - 10).  Presumably x is a rate in pages per
 * second, making the result KB per second (as the name suggests) - confirm
 * against the callers' units.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit - record bandwidth and ratelimit values of a wb
 * @wb:             bdi_writeback supplying the bdi name, write_bandwidth,
 *                  avg_write_bandwidth, dirty_ratelimit,
 *                  balanced_dirty_ratelimit and cgroup inode number
 * @dirty_rate:     rate passed in by the caller
 * @task_ratelimit: ratelimit passed in by the caller
 *
 * Every rate is converted with KBps() before being stored.
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * balance_dirty_pages - record the inputs and derived values of one
 * dirty-throttling decision
 * @wb:              bdi_writeback supplying the bdi name and cgroup inode number
 * @thresh:          used with @bg_thresh to compute the freerun midpoint
 * @bg_thresh:       see @thresh
 * @dirty:           stored as-is
 * @bdi_thresh:      used to scale the setpoint into bdi_setpoint
 * @bdi_dirty:       stored as-is
 * @dirty_ratelimit: stored after KBps() conversion
 * @task_ratelimit:  stored after KBps() conversion
 * @dirtied:         stored as-is (in an unsigned int field)
 * @period:          jiffies, stored in milliseconds
 * @pause:           jiffies, stored in milliseconds
 * @start_time:      jiffies timestamp; "paused" is the time since it, in ms
 *
 * Derived fields:
 *   limit        = global_wb_domain.dirty_limit
 *   setpoint     = (limit + (thresh + bg_thresh) / 2) / 2
 *   bdi_setpoint = setpoint * bdi_thresh / (thresh + 1)
 *   think        = ms since current->dirty_paused_when, or 0 when that is 0
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
                 unsigned long bdi_thresh,
                 unsigned long bdi_dirty,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        bdi_setpoint)
                __field(unsigned long,        bdi_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = global_wb_domain.dirty_limit;
                __entry->setpoint        = (global_wb_domain.dirty_limit +
                                                freerun) / 2;
                __entry->dirty                = dirty;
                /* thresh + 1 keeps the divisor non-zero when thresh is 0 */
                __entry->bdi_setpoint        = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
                __entry->bdi_dirty        = bdi_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->bdi_setpoint,
                  __entry->bdi_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue - record the state of one inode
 * @inode: inode whose bdi name, i_ino, i_state and dirtied_when are
 *         recorded; the cgroup inode number comes from inode_to_wb(inode)
 *
 * NOTE: "age" is not stored in the record.  It is computed from jiffies
 * inside TP_printk, i.e. where the record is formatted, not in
 * TP_fast_assign.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_congest_waited_template - event class for congestion waits
 * @usec_timeout: stored as-is and printed as "usec_timeout"
 * @usec_delayed: stored as-is and printed as "usec_delayed"
 */
DECLARE_EVENT_CLASS(writeback_congest_waited_template,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed),

        TP_STRUCT__entry(
                __field(        unsigned int,        usec_timeout        )
                __field(        unsigned int,        usec_delayed        )
        ),

        TP_fast_assign(
                __entry->usec_timeout        = usec_timeout;
                __entry->usec_delayed        = usec_delayed;
        ),

        TP_printk("usec_timeout=%u usec_delayed=%u",
                        __entry->usec_timeout,
                        __entry->usec_delayed)
);

/* writeback_congestion_wait: instance of writeback_congest_waited_template. */
DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed)
);

/* writeback_wait_iff_congested: instance of writeback_congest_waited_template. */
DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed)
);

/*
 * writeback_single_inode_template - event class for single-inode writeback
 * @inode:       inode whose bdi name, i_ino, i_state, dirtied_when and
 *               i_mapping->writeback_index are recorded
 * @wbc:         writeback control; supplies the cgroup inode number and the
 *               current nr_to_write
 * @nr_to_write: value stored as "to_write"
 *
 * "wrote" is @nr_to_write minus @wbc->nr_to_write at record time.
 *
 * NOTE: "age" is not stored in the record.  It is computed from jiffies
 * inside TP_printk, i.e. where the record is formatted, not in
 * TP_fast_assign.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_single_inode_start: instance of writeback_single_inode_template. */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/* writeback_single_inode: instance of writeback_single_inode_template. */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_inode_template - event class identifying an inode by device
 * @inode: inode whose superblock device, i_ino, i_state, i_mode and
 *         dirtied_when are recorded
 *
 * Unlike the other inode classes in this file, this one records the
 * superblock's dev_t (printed as "major,minor") instead of a bdi name.
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/* writeback_lazytime: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_lazytime_iput: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_dirty_inode_enqueue: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

/* sb_mark_inode_writeback: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/* sb_clear_inode_writeback: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






























































    4 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
/*
 * Map the long-sized names onto the fixed-width ones: hash_long() and
 * GOLDEN_RATIO_PRIME resolve to the 32-bit or the 64-bit hash and constant
 * according to BITS_PER_LONG.  Any other word size is a build error.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
/* Multipliers used by the 32-bit and 64-bit hash functions below. */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
/*
 * Generic multiplicative mixing step: multiply @val by GOLDEN_RATIO_32.
 * The arithmetic is done in u32, so the product wraps.  The caller is
 * expected to pick the bits it wants from the result.
 */
static inline u32 __hash_32_generic(u32 val)
{
        const u32 product = val * GOLDEN_RATIO_32;

        return product;
}

#ifndef HAVE_ARCH_HASH_32
#define hash_32 hash_32_generic
#endif
/*
 * Hash a 32-bit value down to @bits bits.
 *
 * The value is mixed with __hash_32() and the top @bits bits of the result
 * are returned.  @bits must be non-zero: the shift count is (32 - @bits),
 * and shifting a 32-bit value by 32 is undefined.
 */
static inline u32 hash_32_generic(u32 val, unsigned int bits)
{
        const unsigned int drop = 32 - bits;

        /* The upper end of the product is the better mixed one; keep it. */
        return __hash_32(val) >> drop;
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
/*
 * Hash a 64-bit value down to @bits bits (result returned as u32).
 *
 * With 64-bit longs this is a single 64x64 multiply by GOLDEN_RATIO_64
 * followed by taking the top @bits bits.  Otherwise the upper half is mixed
 * with __hash_32(), xor-ed into the lower half, and the result is hashed
 * with hash_32().
 */
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
        /* A full 64x64-bit multiply is cheap on every 64-bit processor. */
        u64 product = val * GOLDEN_RATIO_64;

        return product >> (64 - bits);
#else
        /* Only 32x32-bit multiplies are used on this path. */
        u32 upper = __hash_32(val >> 32);
        u32 folded = (u32)val ^ upper;

        return hash_32(folded, bits);
#endif
}

/*
 * Hash a pointer down to @bits bits: the address is converted to
 * unsigned long and passed to hash_long().
 */
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
        unsigned long addr = (unsigned long)ptr;

        return hash_long(addr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
        unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
        val ^= (val >> 32);
#endif
        return (u32)val;
}

#endif /* _LINUX_HASH_H */






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * TTY core internal functions
 */

#ifndef _TTY_INTERNAL_H
#define _TTY_INTERNAL_H

/*
 * tty_msg() - emit a message prefixed with the tty's driver name and name.
 * @fn:  printk-style function or macro to call (pr_info, pr_err, ...)
 * @tty: tty the message is about; note it is expanded twice
 * @f:   format string; must be a string literal because it is pasted after
 *       the "%s %s: " prefix by literal concatenation
 *
 * The tty_debug()/tty_info()/... wrappers below only select @fn.
 */
#define tty_msg(fn, tty, f, ...) \
        fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__)

#define tty_debug(tty, f, ...)        tty_msg(pr_debug, tty, f, ##__VA_ARGS__)
#define tty_info(tty, f, ...)        tty_msg(pr_info, tty, f, ##__VA_ARGS__)
#define tty_notice(tty, f, ...)        tty_msg(pr_notice, tty, f, ##__VA_ARGS__)
#define tty_warn(tty, f, ...)        tty_msg(pr_warn, tty, f, ##__VA_ARGS__)
#define tty_err(tty, f, ...)        tty_msg(pr_err, tty, f, ##__VA_ARGS__)

/* Same as tty_info(), but routed through pr_info_ratelimited(). */
#define tty_info_ratelimited(tty, f, ...) \
                tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)

/*
 * Lock subclasses for tty locks
 *
 * TTY_LOCK_NORMAL is for normal ttys and master ptys.
 * TTY_LOCK_SLAVE is for slave ptys only.
 *
 * Lock subclasses are necessary for handling nested locking with pty pairs.
 * tty locks which use nested locking:
 *
 * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
 *                  The stable lock order is master pty first, then slave pty.
 * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem.
 *                   Subclassing this lock enables the slave pty to hold its
 *                   termios_rwsem when claiming the master tty_buffer lock.
 * tty_buffer lock - slave ptys can claim nested buffer lock when handling
 *                     signal chars. The stable lock order is slave pty, then
 *                     master.
 */
enum {
        TTY_LOCK_NORMAL = 0,        /* normal ttys and master ptys */
        TTY_LOCK_SLAVE,                /* slave ptys only */
};

/* Values for tty->flow_change */
#define TTY_THROTTLE_SAFE        1
#define TTY_UNTHROTTLE_SAFE        2

/*
 * Store @val in tty->flow_change.  Unlike tty_set_flow_change(), no memory
 * barrier is issued after the store.
 */
static inline void __tty_set_flow_change(struct tty_struct *tty, int val)
{
        tty->flow_change = val;
}

/*
 * Store @val in tty->flow_change and follow the store with a full memory
 * barrier (smp_mb()).
 */
static inline void tty_set_flow_change(struct tty_struct *tty, int val)
{
        __tty_set_flow_change(tty, val);
        smp_mb();
}

int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
void tty_ldisc_unlock(struct tty_struct *tty);

int __tty_check_change(struct tty_struct *tty, int sig);
int tty_check_change(struct tty_struct *tty);
void __stop_tty(struct tty_struct *tty);
void __start_tty(struct tty_struct *tty);
void tty_write_unlock(struct tty_struct *tty);
int tty_write_lock(struct tty_struct *tty, bool ndelay);
void tty_vhangup_session(struct tty_struct *tty);
void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty);
int tty_signal_session_leader(struct tty_struct *tty, int exit_session);
void session_clear_tty(struct pid *session);
void tty_buffer_free_all(struct tty_port *port);
void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld);
void tty_buffer_init(struct tty_port *port);
void tty_buffer_set_lock_subclass(struct tty_port *port);
bool tty_buffer_restart_work(struct tty_port *port);
bool tty_buffer_cancel_work(struct tty_port *port);
void tty_buffer_flush_work(struct tty_port *port);
speed_t tty_termios_input_baud_rate(struct ktermios *termios);
void tty_ldisc_hangup(struct tty_struct *tty, bool reset);
int tty_ldisc_reinit(struct tty_struct *tty, int disc);
long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty,
                       struct file *file, unsigned int cmd, unsigned long arg);
void tty_default_fops(struct file_operations *fops);
struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx);
int tty_alloc_file(struct file *file);
void tty_add_file(struct tty_struct *tty, struct file *file);
void tty_free_file(struct file *file);
int tty_release(struct inode *inode, struct file *filp);

/*
 * Evaluates mutex_is_locked() on tty->atomic_write_lock.
 * NOTE(review): presumably this reports that the mutex is held by anyone,
 * not specifically by the caller - confirm against mutex_is_locked().
 */
#define tty_is_writelocked(tty)  (mutex_is_locked(&tty->atomic_write_lock))

int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty);
void tty_ldisc_release(struct tty_struct *tty);
int __must_check tty_ldisc_init(struct tty_struct *tty);
void tty_ldisc_deinit(struct tty_struct *tty);

void tty_sysctl_init(void);

/*
 * tty_audit.c
 *
 * With CONFIG_AUDIT the two hooks below are real functions; without it they
 * are empty inline stubs, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_AUDIT
void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size);
void tty_audit_tiocsti(struct tty_struct *tty, char ch);
#else
static inline void tty_audit_add_data(struct tty_struct *tty, const void *data,
                                      size_t size)
{
}
static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
{
}
#endif

/*
 * Write entry point taking a kiocb and an iov_iter.  Parameters are named
 * like every other prototype in this header.
 */
ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter);

#endif

































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
/* SPDX-License-Identifier: GPL-2.0 */
/* interrupt.h */
#ifndef _LINUX_INTERRUPT_H
#define _LINUX_INTERRUPT_H

#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/irqreturn.h>
#include <linux/irqnr.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/hrtimer.h>
#include <linux/kref.h>
#include <linux/workqueue.h>

#include <linux/atomic.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/sections.h>

/*
 * These correspond to the IORESOURCE_IRQ_* defines in
 * linux/ioport.h to select the interrupt line behaviour.  When
 * requesting an interrupt without specifying a IRQF_TRIGGER, the
 * setting should be assumed to be "as already configured", which
 * may be as per machine or firmware initialisation.
 */
#define IRQF_TRIGGER_NONE        0x00000000
#define IRQF_TRIGGER_RISING        0x00000001
#define IRQF_TRIGGER_FALLING        0x00000002
#define IRQF_TRIGGER_HIGH        0x00000004
#define IRQF_TRIGGER_LOW        0x00000008
#define IRQF_TRIGGER_MASK        (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \
                                 IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)
#define IRQF_TRIGGER_PROBE        0x00000010

/*
 * These flags are used only by the kernel as part of the
 * irq handling routines.
 *
 * IRQF_SHARED - allow sharing the irq among several devices
 * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur
 * IRQF_TIMER - Flag to mark this interrupt as timer interrupt
 * IRQF_PERCPU - Interrupt is per cpu
 * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing
 * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is
 *                registered first in a shared interrupt is considered for
 *                performance reasons)
 * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
 *                Used by threaded interrupts which need to keep the
 *                irq line disabled until the threaded handler has been run.
 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend.  Does not guarantee
 *                   that this interrupt will wake the system from a suspended
 *                   state.  See Documentation/power/suspend-and-interrupts.rst
 * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
 * IRQF_NO_THREAD - Interrupt cannot be threaded
 * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
 *                resume time.
 * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this
 *                interrupt handler after suspending interrupts. For system
 *                wakeup devices users need to implement wakeup detection in
 *                their interrupt handlers.
 * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it.
 *                Users will enable it explicitly by enable_irq() or enable_nmi()
 *                later.
 */
#define IRQF_SHARED                0x00000080
#define IRQF_PROBE_SHARED        0x00000100
#define __IRQF_TIMER                0x00000200
#define IRQF_PERCPU                0x00000400
#define IRQF_NOBALANCING        0x00000800
#define IRQF_IRQPOLL                0x00001000
#define IRQF_ONESHOT                0x00002000
#define IRQF_NO_SUSPEND                0x00004000
#define IRQF_FORCE_RESUME        0x00008000
#define IRQF_NO_THREAD                0x00010000
#define IRQF_EARLY_RESUME        0x00020000
#define IRQF_COND_SUSPEND        0x00040000
#define IRQF_NO_AUTOEN                0x00080000

/*
 * IRQF_TIMER is a composite: the timer marker bit (__IRQF_TIMER) together
 * with IRQF_NO_SUSPEND and IRQF_NO_THREAD.
 */
#define IRQF_TIMER                (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)

/*
 * These values can be returned by request_any_context_irq() and
 * describe the context the interrupt will be run in.
 *
 * IRQC_IS_HARDIRQ - interrupt runs in hardirq context
 * IRQC_IS_NESTED - interrupt runs in a nested threaded context
 */
enum {
        IRQC_IS_HARDIRQ        = 0,
        IRQC_IS_NESTED,
};

/*
 * Interrupt handler callback type: takes an int and an opaque pointer and
 * returns an irqreturn_t.
 * NOTE(review): presumably the int is the interrupt number and the pointer
 * is the dev_id cookie supplied at request time - confirm against callers.
 */
typedef irqreturn_t (*irq_handler_t)(int, void *);

/**
 * struct irqaction - per interrupt action descriptor
 * @handler:        interrupt handler function
 * @name:        name of the device
 * @dev_id:        cookie to identify the device
 * @percpu_dev_id:        per-CPU cookie to identify the device
 * @next:        pointer to the next irqaction for shared interrupts
 * @irq:        interrupt number
 * @flags:        flags (see IRQF_* above)
 * @thread_fn:        interrupt handler function for threaded interrupts
 * @thread:        thread pointer for threaded interrupts
 * @secondary:        pointer to secondary irqaction (force threading)
 * @thread_flags:        flags related to @thread
 * @thread_mask:        bitmask for keeping track of @thread activity
 * @dir:        pointer to the proc/irq/NN/name entry
 */
struct irqaction {
        irq_handler_t                handler;
        void                        *dev_id;
        void __percpu                *percpu_dev_id;
        struct irqaction        *next;
        irq_handler_t                thread_fn;
        struct task_struct        *thread;
        struct irqaction        *secondary;
        unsigned int                irq;
        unsigned int                flags;
        unsigned long                thread_flags;
        unsigned long                thread_mask;
        const char                *name;
        struct proc_dir_entry        *dir;
} ____cacheline_internodealigned_in_smp;

extern irqreturn_t no_action(int cpl, void *dev_id);

/*
 * If a (PCI) device interrupt is not connected we set dev->irq to
 * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we
 * can distinguish that case from other error returns.
 *
 * 0x80000000 is guaranteed to be outside the available range of interrupts
 * and easy to distinguish from other possible incorrect values.
 */
#define IRQ_NOTCONNECTED        (1U << 31)

extern int __must_check
request_threaded_irq(unsigned int irq, irq_handler_t handler,
                     irq_handler_t thread_fn,
                     unsigned long flags, const char *name, void *dev);

/**
 * request_irq - Add a handler for an interrupt line
 * @irq:        Interrupt line to allocate
 * @handler:        Function invoked when the IRQ occurs; acts as the primary
 *                handler for threaded interrupts. When NULL, the default
 *                primary handler is installed
 * @flags:        Handling flags
 * @name:        Name of the device generating this interrupt
 * @dev:        Cookie handed to the handler function
 *
 * Allocates an interrupt and establishes a handler by forwarding to
 * request_threaded_irq() without a thread function; see the documentation
 * for request_threaded_irq() for details.
 */
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
            const char *name, void *dev)
{
        /* Plain (non-threaded) request: there is no thread function. */
        irq_handler_t thread_fn = NULL;

        return request_threaded_irq(irq, handler, thread_fn, flags, name, dev);
}

extern int __must_check
request_any_context_irq(unsigned int irq, irq_handler_t handler,
                        unsigned long flags, const char *name, void *dev_id);

extern int __must_check
__request_percpu_irq(unsigned int irq, irq_handler_t handler,
                     unsigned long flags, const char *devname,
                     void __percpu *percpu_dev_id);

extern int __must_check
request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags,
            const char *name, void *dev);

/*
 * Request a per-CPU interrupt with no extra flags: forwards to
 * __request_percpu_irq() with @flags set to 0.
 */
static inline int __must_check
request_percpu_irq(unsigned int irq, irq_handler_t handler,
                   const char *devname, void __percpu *percpu_dev_id)
{
        const unsigned long no_flags = 0;

        return __request_percpu_irq(irq, handler, no_flags, devname,
                                    percpu_dev_id);
}

extern int __must_check
request_percpu_nmi(unsigned int irq, irq_handler_t handler,
                   const char *devname, void __percpu *dev);

/*
 * Release counterparts of the request_irq() and request_percpu_irq()
 * families.  Parameters are named to match free_nmi()/free_percpu_nmi().
 */
extern const void *free_irq(unsigned int irq, void *dev_id);
extern void free_percpu_irq(unsigned int irq, void __percpu *percpu_dev_id);

extern const void *free_nmi(unsigned int irq, void *dev_id);
extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id);

struct device;

extern int __must_check
devm_request_threaded_irq(struct device *dev, unsigned int irq,
                          irq_handler_t handler, irq_handler_t thread_fn,
                          unsigned long irqflags, const char *devname,
                          void *dev_id);

/*
 * Device-managed request of a non-threaded interrupt: forwards to
 * devm_request_threaded_irq() with a NULL thread function.
 */
static inline int __must_check
devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler,
                 unsigned long irqflags, const char *devname, void *dev_id)
{
        irq_handler_t thread_fn = NULL;

        return devm_request_threaded_irq(dev, irq, handler, thread_fn,
                                         irqflags, devname, dev_id);
}

extern int __must_check
devm_request_any_context_irq(struct device *dev, unsigned int irq,
                 irq_handler_t handler, unsigned long irqflags,
                 const char *devname, void *dev_id);

extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);

/*
 * On lockdep we don't want to enable hardirqs in hardirq
 * context. Use local_irq_enable_in_hardirq() to annotate
 * kernel code that has to do this nevertheless (pretty much
 * the only valid case is for old/broken hardware that is
 * insanely slow).
 *
 * NOTE: in theory this might break fragile code that relies
 * on hardirq delivery - in practice we don't seem to have such
 * places left. So the only effect should be slightly increased
 * irqs-off latencies.
 */
#ifdef CONFIG_LOCKDEP
# define local_irq_enable_in_hardirq()        do { } while (0)
#else
# define local_irq_enable_in_hardirq()        local_irq_enable()
#endif

extern void disable_irq_nosync(unsigned int irq);
extern bool disable_hardirq(unsigned int irq);
extern void disable_irq(unsigned int irq);
extern void disable_percpu_irq(unsigned int irq);
extern void enable_irq(unsigned int irq);
extern void enable_percpu_irq(unsigned int irq, unsigned int type);
extern bool irq_percpu_is_enabled(unsigned int irq);
extern void irq_wake_thread(unsigned int irq, void *dev_id);

extern void disable_nmi_nosync(unsigned int irq);
extern void disable_percpu_nmi(unsigned int irq);
extern void enable_nmi(unsigned int irq);
extern void enable_percpu_nmi(unsigned int irq, unsigned int type);
extern int prepare_percpu_nmi(unsigned int irq);
extern void teardown_percpu_nmi(unsigned int irq);

extern int irq_inject_interrupt(unsigned int irq);

/* The following three functions are for the core kernel use only. */
extern void suspend_device_irqs(void);
extern void resume_device_irqs(void);
extern void rearm_wake_irq(unsigned int irq);

/**
 * struct irq_affinity_notify - context for notification of IRQ affinity changes
 * @irq:                Interrupt to which notification applies
 * @kref:                Reference count, for internal use
 * @work:                Work item, for internal use
 * @notify:                Function to be called on change.  This will be
 *                        called in process context.
 * @release:                Function to be called on release.  This will be
 *                        called in process context.  Once registered, the
 *                        structure must only be freed when this function is
 *                        called or later.
 */
struct irq_affinity_notify {
        unsigned int irq;
        struct kref kref;
        struct work_struct work;
        void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
        void (*release)(struct kref *ref);
};

#define        IRQ_AFFINITY_MAX_SETS  4

/**
 * struct irq_affinity - Description for automatic irq affinity assignments
 * @pre_vectors:        Don't apply affinity to @pre_vectors at beginning of
 *                        the MSI(-X) vector space
 * @post_vectors:        Don't apply affinity to @post_vectors at end of
 *                        the MSI(-X) vector space
 * @nr_sets:                The number of interrupt sets for which affinity
 *                        spreading is required
 * @set_size:                Array holding the size of each interrupt set
 * @calc_sets:                Callback for calculating the number and size
 *                        of interrupt sets
 * @priv:                Private data for usage by @calc_sets, usually a
 *                        pointer to driver/device specific data.
 */
struct irq_affinity {
        unsigned int        pre_vectors;
        unsigned int        post_vectors;
        unsigned int        nr_sets;
        unsigned int        set_size[IRQ_AFFINITY_MAX_SETS];
        void                (*calc_sets)(struct irq_affinity *, unsigned int nvecs);
        void                *priv;
};

/**
 * struct irq_affinity_desc - Interrupt affinity descriptor
 * @mask:        cpumask to hold the affinity assignment
 * @is_managed: 1 if the interrupt is managed internally
 */
struct irq_affinity_desc {
        struct cpumask        mask;
        unsigned int        is_managed : 1;
};

#if defined(CONFIG_SMP)

extern cpumask_var_t irq_default_affinity;

extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask);

extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);

extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
                                     bool setaffinity);

/**
 * irq_update_affinity_hint - Update the affinity hint
 * @irq:        Interrupt to update
 * @m:                cpumask pointer (NULL to clear the hint)
 *
 * Updates the affinity hint, but does not change the affinity of the interrupt.
 */
static inline int
irq_update_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        return __irq_apply_affinity_hint(irq, m, false);
}

/**
 * irq_set_affinity_and_hint - Update the affinity hint and apply the provided
 *                             cpumask to the interrupt
 * @irq:        Interrupt to update
 * @m:                cpumask pointer (NULL to clear the hint)
 *
 * Updates the affinity hint and if @m is not NULL it applies it as the
 * affinity of that interrupt.
 */
static inline int
irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m)
{
        return __irq_apply_affinity_hint(irq, m, true);
}

/*
 * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint()
 * instead.
 */
static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        return irq_set_affinity_and_hint(irq, m);
}

extern int irq_update_affinity_desc(unsigned int irq,
                                    struct irq_affinity_desc *affinity);

extern int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);

struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);

unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
                                       const struct irq_affinity *affd);

#else /* CONFIG_SMP */

/* !CONFIG_SMP stub: both arguments are ignored; always returns -EINVAL. */
static inline int
irq_set_affinity(unsigned int irq, const struct cpumask *m)
{
        const int err = -EINVAL;

        return err;
}

/* !CONFIG_SMP stub: both arguments are ignored; always returns 0. */
static inline int
irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
{
        const int ret = 0;

        return ret;
}

/* !CONFIG_SMP stub: always returns 0, whatever @irq is. */
static inline int irq_can_set_affinity(unsigned int irq) { return 0; }

/* !CONFIG_SMP stub: always returns 0, whatever @irq is. */
static inline int irq_select_affinity(unsigned int irq)
{
        return 0;
}

/* !CONFIG_SMP stub: both arguments are ignored; always returns -EINVAL. */
static inline int
irq_update_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        const int err = -EINVAL;

        return err;
}

/* !CONFIG_SMP stub: both arguments are ignored; always returns -EINVAL. */
static inline int
irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m)
{
        const int err = -EINVAL;

        return err;
}

/* !CONFIG_SMP stub: both arguments are ignored; always returns -EINVAL. */
static inline int
irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        const int err = -EINVAL;

        return err;
}

/* !CONFIG_SMP stub: both arguments are ignored; always returns -EINVAL. */
static inline int
irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
{
        const int err = -EINVAL;

        return err;
}

/* !CONFIG_SMP stub: @notify is ignored and 0 is returned. */
static inline int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
        const int ret = 0;

        return ret;
}

/* !CONFIG_SMP stub: no masks are created; always returns NULL. */
static inline struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd)
{
        struct irq_affinity_desc *masks = NULL;

        return masks;
}

/*
 * !CONFIG_SMP stub: @minvec and @affd are ignored; @maxvec is returned
 * unchanged.
 */
static inline unsigned int
irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
                          const struct irq_affinity *affd)
{
        const unsigned int nvecs = maxvec;

        return nvecs;
}

#endif /* CONFIG_SMP */

/*
 * Special lockdep variants of irq disabling/enabling.
 * These should be used for locking constructs that
 * know that a particular irq context which is disabled,
 * and which is the only irq-context user of a lock,
 * that it's safe to take the lock in the irq-disabled
 * section without disabling hardirqs.
 *
 * On !CONFIG_LOCKDEP they are equivalent to the normal
 * irq disable/enable methods.
 */
/*
 * Calls disable_irq_nosync(@irq).  When lockdep is enabled on a kernel
 * that is not PREEMPT_RT, local interrupts are additionally disabled
 * afterwards.
 */
static inline void disable_irq_nosync_lockdep(unsigned int irq)
{
        disable_irq_nosync(irq);
#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
        local_irq_disable();
#endif
}

/*
 * Calls disable_irq_nosync(@irq).  When lockdep is enabled on a kernel
 * that is not PREEMPT_RT, the local interrupt state is additionally saved
 * into *@flags and local interrupts are disabled.  In every other
 * configuration *@flags is not written.
 */
static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags)
{
        disable_irq_nosync(irq);
#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
        local_irq_save(*flags);
#endif
}

/*
 * Calls disable_irq(@irq).  When lockdep is enabled on a kernel that is not
 * PREEMPT_RT, local interrupts are additionally disabled afterwards.
 *
 * The guard must match the one in enable_irq_lockdep(): that function only
 * re-enables local interrupts for LOCKDEP && !PREEMPT_RT, so disabling them
 * here under any wider condition would leave them off after a
 * disable/enable pair.
 */
static inline void disable_irq_lockdep(unsigned int irq)
{
        disable_irq(irq);
#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
        local_irq_disable();
#endif
}

/*
 * When lockdep is enabled on a kernel that is not PREEMPT_RT, local
 * interrupts are enabled first.  In all configurations enable_irq(@irq) is
 * then called.
 */
static inline void enable_irq_lockdep(unsigned int irq)
{
#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
        local_irq_enable();
#endif
        enable_irq(irq);
}

/*
 * When lockdep is enabled on a kernel that is not PREEMPT_RT, the local
 * interrupt state is restored from *@flags first.  In all configurations
 * enable_irq(@irq) is then called.
 */
static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags)
{
#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
        local_irq_restore(*flags);
#endif
        enable_irq(irq);
}

/* IRQ wakeup (PM) control: */
extern int irq_set_irq_wake(unsigned int irq, unsigned int on);

/* Turn wakeup on for @irq: irq_set_irq_wake() with on == 1. */
static inline int enable_irq_wake(unsigned int irq)
{
        const unsigned int on = 1;

        return irq_set_irq_wake(irq, on);
}

/* Turn wakeup off for @irq: irq_set_irq_wake() with on == 0. */
static inline int disable_irq_wake(unsigned int irq)
{
        const unsigned int on = 0;

        return irq_set_irq_wake(irq, on);
}

/*
 * irq_get_irqchip_state/irq_set_irqchip_state specific flags
 */
enum irqchip_irq_state {
        IRQCHIP_STATE_PENDING,                /* Is interrupt pending? */
        IRQCHIP_STATE_ACTIVE,                /* Is interrupt in progress? */
        IRQCHIP_STATE_MASKED,                /* Is interrupt masked? */
        IRQCHIP_STATE_LINE_LEVEL,        /* Is IRQ line high? */
};

extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                                 bool *state);
extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                                 bool state);

/*
 * force_irqthreads has three forms:
 *  - CONFIG_IRQ_FORCED_THREADING with CONFIG_PREEMPT_RT: constant true;
 *  - CONFIG_IRQ_FORCED_THREADING without PREEMPT_RT: a bool variable
 *    defined elsewhere;
 *  - otherwise: constant 0.
 */
#ifdef CONFIG_IRQ_FORCED_THREADING
# ifdef CONFIG_PREEMPT_RT
#  define force_irqthreads        (true)
# else
extern bool force_irqthreads;
# endif
#else
#define force_irqthreads        (0)
#endif

/*
 * Default accessors for this CPU's pending-softirq word.  They are skipped
 * entirely if local_softirq_pending is already defined.  The word itself is
 * named by local_softirq_pending_ref, which defaults to
 * irq_stat.__softirq_pending unless it was defined beforehand.
 */
#ifndef local_softirq_pending

#ifndef local_softirq_pending_ref
#define local_softirq_pending_ref irq_stat.__softirq_pending
#endif

/* Read, overwrite with (x), and OR (x) into the per-CPU word, respectively. */
#define local_softirq_pending()        (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x)        (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x)        (__this_cpu_or(local_softirq_pending_ref, (x)))

#endif /* local_softirq_pending */

/* Some architectures might implement lazy enabling/disabling of
 * interrupts. In some cases, such as stop_machine, we might want
 * to ensure that after a local_irq_disable(), interrupts have
 * really been disabled in hardware. Such architectures need to
 * implement the following hook.
 *
 * If no definition of hard_irq_disable exists yet, the fallback below
 * makes it an empty statement.
 */
#ifndef hard_irq_disable
#define hard_irq_disable()        do { } while(0)
#endif

/* PLEASE avoid allocating new softirqs unless you _really_ need high
   frequency threaded job scheduling. For almost all purposes
   tasklets are more than enough. E.g. all serial device BHs et
   al. should be converted to tasklets, not to softirqs.
 */

/*
 * Softirq vector numbers, starting at 0.  NR_SOFTIRQS is not a vector: as the
 * final enumerator it equals the number of vectors listed before it.
 */
enum
{
        HI_SOFTIRQ=0,
        TIMER_SOFTIRQ,
        NET_TX_SOFTIRQ,
        NET_RX_SOFTIRQ,
        BLOCK_SOFTIRQ,
        IRQ_POLL_SOFTIRQ,
        TASKLET_SOFTIRQ,
        SCHED_SOFTIRQ,
        HRTIMER_SOFTIRQ,
        RCU_SOFTIRQ,    /* Preferably RCU should always be the last softirq */

        NR_SOFTIRQS
};

/* Bit mask with every bit set except the one for RCU_SOFTIRQ. */
#define SOFTIRQ_STOP_IDLE_MASK (~(1 << RCU_SOFTIRQ))

/* Maps a softirq index to its name; the table has NR_SOFTIRQS entries.
 * Update 'softirq_to_name' in kernel/softirq.c when adding a new softirq.
 */
extern const char * const softirq_to_name[NR_SOFTIRQS];

/* softirq mask and active fields moved to irq_cpustat_t in
 * asm/hardirq.h to get better cache usage.  KAO
 */

/*
 * A softirq handler slot: @action is a function pointer that takes a pointer
 * to a struct softirq_action.  open_softirq() accepts a function of this
 * same type.
 */
struct softirq_action
{
        void        (*action)(struct softirq_action *);
};

/*
 * Softirq processing entry points, defined elsewhere.  __do_softirq() is
 * what the generic do_softirq_own_stack() fallback in this header calls.
 */
asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);

/*
 * do_softirq_own_stack(): when __ARCH_HAS_DO_SOFTIRQ is defined only the
 * prototype is given here and the implementation lives elsewhere; otherwise
 * the generic version simply calls __do_softirq().
 */
#ifndef __ARCH_HAS_DO_SOFTIRQ
static inline void do_softirq_own_stack(void)
{
        __do_softirq();
}
#else
void do_softirq_own_stack(void);
#endif

/*
 * Softirq management API, defined elsewhere.  open_softirq() takes a softirq
 * number and an action function; the raise helpers take a softirq number.
 * NOTE(review): going by the names, the *_irqoff variants expect interrupts
 * to be disabled by the caller -- confirm against kernel/softirq.c.
 */
extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr);

extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);

DECLARE_PER_CPU(struct task_struct *, ksoftirqd);

/* Return the current CPU's value of the per-CPU 'ksoftirqd' task pointer. */
static inline struct task_struct *this_cpu_ksoftirqd(void)
{
        struct task_struct *tsk = this_cpu_read(ksoftirqd);

        return tsk;
}

/* Tasklets --- multithreaded analogue of BHs.

   This API is deprecated. Please consider using threaded IRQs instead:
   https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de

   Main feature distinguishing them from generic softirqs: a tasklet
   runs on only one CPU at a time.

   Main feature distinguishing them from BHs: different tasklets
   may run simultaneously on different CPUs.

   Properties:
   * If tasklet_schedule() is called, then the tasklet is guaranteed
     to be executed on some CPU at least once after this.
   * If the tasklet is already scheduled, but its execution has not yet
     started, it will be executed only once.
   * If this tasklet is already running on another CPU (or schedule is called
     from the tasklet itself), it is rescheduled for later.
   * A tasklet is strictly serialized with respect to itself, but not
     with respect to other tasklets. If a client needs some inter-task
     synchronization, it must provide it with spinlocks.
 */

/*
 * One tasklet.
 *
 * @next:         link to another tasklet (list linkage; users not visible here)
 * @state:        bit field holding the TASKLET_STATE_* bits
 * @count:        incremented by tasklet_disable_nosync(), decremented by
 *                tasklet_enable(); DECLARE_TASKLET_DISABLED() starts it at 1
 * @use_callback: set to true by DECLARE_TASKLET()/DECLARE_TASKLET_DISABLED(),
 *                which fill in @callback; the *_OLD initializers fill in
 *                @func and leave this false
 * @func:         old-style handler, receives an unsigned long
 *                (presumably @data -- confirm)
 * @callback:     new-style handler, receives the tasklet itself
 * @data:         unsigned long argument for the old-style handler
 */
struct tasklet_struct
{
        struct tasklet_struct *next;
        unsigned long state;
        atomic_t count;
        bool use_callback;
        union {
                void (*func)(unsigned long data);
                void (*callback)(struct tasklet_struct *t);
        };
        unsigned long data;
};

/*
 * Static tasklet definitions.
 *
 * DECLARE_TASKLET() / DECLARE_TASKLET_DISABLED() define a tasklet_struct
 * called @name that uses the new-style @_callback (use_callback = true), with
 * count initialised to 0 or 1 respectively.
 */
#define DECLARE_TASKLET(name, _callback)                \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(0),                        \
        .callback = _callback,                                \
        .use_callback = true,                                \
}

#define DECLARE_TASKLET_DISABLED(name, _callback)        \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(1),                        \
        .callback = _callback,                                \
        .use_callback = true,                                \
}

/*
 * Inside a new-style callback, recover the structure that embeds the tasklet:
 * container_of() on @callback_tasklet, using the type @var points to and the
 * member name @tasklet_fieldname.
 */
#define from_tasklet(var, callback_tasklet, tasklet_fieldname)        \
        container_of(callback_tasklet, typeof(*var), tasklet_fieldname)

/*
 * Old-style variants: same as above but they set .func and leave
 * use_callback and data zero-initialised.
 */
#define DECLARE_TASKLET_OLD(name, _func)                \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(0),                        \
        .func = _func,                                        \
}

#define DECLARE_TASKLET_DISABLED_OLD(name, _func)        \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(1),                        \
        .func = _func,                                        \
}

/* Bit numbers within tasklet_struct::state. */
enum
{
        TASKLET_STATE_SCHED,        /* Tasklet is scheduled for execution */
        TASKLET_STATE_RUN        /* Tasklet is running (SMP only) */
};

/*
 * TASKLET_STATE_RUN helpers.  On CONFIG_SMP they operate on the RUN bit of
 * t->state; without CONFIG_SMP they collapse to constants / empty statements.
 */
#ifdef CONFIG_SMP
/* Atomically set the RUN bit; returns non-zero only if it was clear before. */
static inline int tasklet_trylock(struct tasklet_struct *t)
{
        return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
}

/* Clear the RUN bit, with a barrier issued before the atomic clear. */
static inline void tasklet_unlock(struct tasklet_struct *t)
{
        smp_mb__before_atomic();
        clear_bit(TASKLET_STATE_RUN, &(t)->state);
}

/* Busy-wait until the RUN bit is observed clear. */
static inline void tasklet_unlock_wait(struct tasklet_struct *t)
{
        while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
}
#else
#define tasklet_trylock(t) 1
#define tasklet_unlock_wait(t) do { } while (0)
#define tasklet_unlock(t) do { } while (0)
#endif

extern void __tasklet_schedule(struct tasklet_struct *t);

/*
 * Mark @t as scheduled and hand it to __tasklet_schedule().  If the SCHED bit
 * was already set, nothing further is done.
 */
static inline void tasklet_schedule(struct tasklet_struct *t)
{
        /* Bit was already set: the tasklet is queued, so bail out. */
        if (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                return;

        __tasklet_schedule(t);
}

extern void __tasklet_hi_schedule(struct tasklet_struct *t);

/*
 * Mark @t as scheduled and hand it to __tasklet_hi_schedule().  If the SCHED
 * bit was already set, nothing further is done.
 */
static inline void tasklet_hi_schedule(struct tasklet_struct *t)
{
        /* Bit was already set: the tasklet is queued, so bail out. */
        if (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                return;

        __tasklet_hi_schedule(t);
}

/*
 * Increment t->count and issue a barrier after the atomic increment.  Unlike
 * tasklet_disable(), this does not wait for the RUN bit to clear.
 */
static inline void tasklet_disable_nosync(struct tasklet_struct *t)
{
        atomic_inc(&t->count);
        smp_mb__after_atomic();
}

/*
 * Increment t->count via tasklet_disable_nosync(), wait via
 * tasklet_unlock_wait() until the RUN bit is clear, then issue a full
 * memory barrier.
 */
static inline void tasklet_disable(struct tasklet_struct *t)
{
        tasklet_disable_nosync(t);
        tasklet_unlock_wait(t);
        smp_mb();
}

/*
 * Undo one tasklet_disable*(): decrement t->count, with a barrier issued
 * before the atomic decrement.
 */
static inline void tasklet_enable(struct tasklet_struct *t)
{
        smp_mb__before_atomic();
        atomic_dec(&t->count);
}

/*
 * Tasklet lifetime API, defined elsewhere.  tasklet_init() takes an old-style
 * handler plus its unsigned long argument; tasklet_setup() takes a new-style
 * callback that receives the tasklet itself.
 */
extern void tasklet_kill(struct tasklet_struct *t);
extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
extern void tasklet_init(struct tasklet_struct *t,
                         void (*func)(unsigned long), unsigned long data);
extern void tasklet_setup(struct tasklet_struct *t,
                          void (*callback)(struct tasklet_struct *));

/*
 * Autoprobing for irqs:
 *
 * probe_irq_on() and probe_irq_off() provide robust primitives
 * for accurate IRQ probing during kernel initialization.  They are
 * reasonably simple to use, are not "fooled" by spurious interrupts,
 * and, unlike other attempts at IRQ probing, they do not get hung on
 * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards).
 *
 * For reasonably foolproof probing, use them as follows:
 *
 * 1. clear and/or mask the device's internal interrupt.
 * 2. sti();
 * 3. irqs = probe_irq_on();      // "take over" all unassigned idle IRQs
 * 4. enable the device and cause it to trigger an interrupt.
 * 5. wait for the device to interrupt, using non-intrusive polling or a delay.
 * 6. irq = probe_irq_off(irqs);  // get IRQ number, 0=none, negative=multiple
 * 7. service the device to clear its pending interrupt.
 * 8. loop again if paranoia is required.
 *
 * probe_irq_on() returns a mask of allocated irq's.
 *
 * probe_irq_off() takes the mask as a parameter,
 * and returns the irq number which occurred,
 * or zero if none occurred, or a negative irq number
 * if more than one irq occurred.
 */

/*
 * With CONFIG_GENERIC_IRQ_PROBE the probing primitives are real functions
 * defined elsewhere; without it they are inline stubs that always return 0.
 */
#ifdef CONFIG_GENERIC_IRQ_PROBE
extern unsigned long probe_irq_on(void);        /* returns 0 on failure */
extern int probe_irq_off(unsigned long);        /* returns 0 or negative on failure */
extern unsigned int probe_irq_mask(unsigned long);        /* returns mask of ISA interrupts */
#else
static inline unsigned long probe_irq_on(void)
{
        return 0UL;
}

static inline int probe_irq_off(unsigned long val)
{
        (void)val;
        return 0;
}

static inline unsigned int probe_irq_mask(unsigned long val)
{
        (void)val;
        return 0U;
}
#endif

/*
 * init_irq_proc(): without CONFIG_PROC_FS this is an empty inline stub;
 * with it, the real function (defined elsewhere) initializes /proc/irq/.
 */
#ifndef CONFIG_PROC_FS
static inline void init_irq_proc(void)
{
        /* nothing to set up without procfs */
}
#else
/* Initialize /proc/irq/ */
extern void init_irq_proc(void);
#endif

/* IRQ timings API; only declared when CONFIG_IRQ_TIMINGS is set. */
#ifdef CONFIG_IRQ_TIMINGS
void irq_timings_enable(void);
void irq_timings_disable(void);
u64 irq_timings_next_event(u64 now);
#endif

/* seq_file-based interrupt listing hooks; only a pointer is needed here. */
struct seq_file;
int show_interrupts(struct seq_file *p, void *v);
int arch_show_interrupts(struct seq_file *p, int prec);

/* Early IRQ initialisation entry points, defined elsewhere. */
extern int early_irq_init(void);
extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);

/*
 * We want to know which function is an entrypoint of a hardirq or a softirq.
 *
 * Functions tagged with these attributes are placed in the sections
 * ".irqentry.text" and ".softirqentry.text".  __irq_entry may be defined
 * before this point, in which case that definition is kept.
 */
#ifndef __irq_entry
# define __irq_entry         __section(".irqentry.text")
#endif

#define __softirq_entry  __section(".softirqentry.text")

#endif





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM fib6

#if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FIB6_H

#include <linux/in6.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <linux/tracepoint.h>

/*
 * fib6_table_lookup - record the inputs and outcome of an IPv6 FIB lookup
 *
 * @net:   network namespace; its fib6_null_entry is compared against the result
 * @res:   lookup result (route type, route entry and nexthop)
 * @table: table that was consulted; its id is recorded
 * @flp:   flow key the lookup was performed for
 *
 * Recorded: table id, error derived from the route type, oif/iif, traffic
 * class, scope, flags, source/destination address, protocol, ports (TCP and
 * UDP only, otherwise 0), nexthop device name ("-" when there is none) and
 * the nexthop gateway (all-zero when the result is the null entry or carries
 * no nexthop).
 */
TRACE_EVENT(fib6_table_lookup,

        TP_PROTO(const struct net *net, const struct fib6_result *res,
                 struct fib6_table *table, const struct flowi6 *flp),

        TP_ARGS(net, res, table, flp),

        TP_STRUCT__entry(
                __field(        u32,        tb_id                )
                __field(        int,        err                )
                __field(        int,        oif                )
                __field(        int,        iif                )
                __field(        __u8,        tos                )
                __field(        __u8,        scope                )
                __field(        __u8,        flags                )
                __array(        __u8,        src,        16        )
                __array(        __u8,        dst,        16        )
                __field(        u16,        sport                )
                __field(        u16,        dport                )
                __field(        u8,        proto                )
                __field(        u8,        rt_type                )
                __dynamic_array(        char,        name,        IFNAMSIZ )
                __array(                __u8,        gw,        16         )
        ),

        TP_fast_assign(
                struct in6_addr *in6;

                __entry->tb_id = table->tb6_id;
                __entry->err = ip6_rt_type_to_error(res->fib6_type);
                __entry->oif = flp->flowi6_oif;
                __entry->iif = flp->flowi6_iif;
                __entry->tos = ip6_tclass(flp->flowlabel);
                __entry->scope = flp->flowi6_scope;
                __entry->flags = flp->flowi6_flags;

                in6 = (struct in6_addr *)__entry->src;
                *in6 = flp->saddr;

                in6 = (struct in6_addr *)__entry->dst;
                *in6 = flp->daddr;

                __entry->proto = flp->flowi6_proto;
                if (__entry->proto == IPPROTO_TCP ||
                    __entry->proto == IPPROTO_UDP) {
                        __entry->sport = ntohs(flp->fl6_sport);
                        __entry->dport = ntohs(flp->fl6_dport);
                } else {
                        __entry->sport = 0;
                        __entry->dport = 0;
                }

                /* Copy the device's name string, not the device pointer. */
                if (res->nh && res->nh->fib_nh_dev) {
                        __assign_str(name, res->nh->fib_nh_dev->name);
                } else {
                        __assign_str(name, "-");
                }

                /*
                 * Always write gw so the record never carries stale buffer
                 * contents: real gateway only for a non-null entry that has
                 * a nexthop, zero in every other case.
                 */
                in6 = (struct in6_addr *)__entry->gw;
                if (res->f6i != net->ipv6.fib6_null_entry && res->nh) {
                        *in6 = res->nh->fib_nh_gw6;
                } else {
                        struct in6_addr in6_zero = {};

                        *in6 = in6_zero;
                }
        ),

        TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d",
                  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
                  __entry->src, __entry->sport, __entry->dst, __entry->dport,
                  __entry->tos, __entry->scope, __entry->flags,
                  __get_str(name), __entry->gw, __entry->err)
);

#endif /* _TRACE_FIB6_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * workqueue.h --- work queue handling for Linux.
 */

#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H

#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>

/* Opaque here; only pointers to a workqueue are used in this header. */
struct workqueue_struct;

struct work_struct;
/* Signature of a work handler: it receives the work item being run. */
typedef void (*work_func_t)(struct work_struct *work);
/* Timer callback installed by the delayed-work initializers in this header. */
void delayed_work_timer_fn(struct timer_list *t);

/*
 * The first word is the work queue pointer and the flags rolled into
 * one.  work_data_bits() yields the address of work->data cast to
 * 'unsigned long *' so the bit operations can be applied to it.
 */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/*
 * Layout of work_struct::data and assorted work-item constants.
 *
 * The *_BIT values are bit numbers, the matching names without the suffix
 * are the corresponding masks.  With CONFIG_DEBUG_OBJECTS_WORK one extra
 * flag bit (STATIC) exists and the colour field starts one bit higher.
 */
enum {
        WORK_STRUCT_PENDING_BIT        = 0,        /* work item is pending execution */
        WORK_STRUCT_INACTIVE_BIT= 1,        /* work item is inactive */
        WORK_STRUCT_PWQ_BIT        = 2,        /* data points to pwq */
        WORK_STRUCT_LINKED_BIT        = 3,        /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC_BIT        = 4,        /* static initializer (debugobjects) */
        WORK_STRUCT_COLOR_SHIFT        = 5,        /* color for workqueue flushing */
#else
        WORK_STRUCT_COLOR_SHIFT        = 4,        /* color for workqueue flushing */
#endif

        WORK_STRUCT_COLOR_BITS        = 4,

        WORK_STRUCT_PENDING        = 1 << WORK_STRUCT_PENDING_BIT,
        WORK_STRUCT_INACTIVE        = 1 << WORK_STRUCT_INACTIVE_BIT,
        WORK_STRUCT_PWQ                = 1 << WORK_STRUCT_PWQ_BIT,
        WORK_STRUCT_LINKED        = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC        = 1 << WORK_STRUCT_STATIC_BIT,
#else
        WORK_STRUCT_STATIC        = 0,
#endif

        /*
         * The last color is no color used for works which don't
         * participate in workqueue flushing.
         */
        WORK_NR_COLORS                = (1 << WORK_STRUCT_COLOR_BITS) - 1,
        WORK_NO_COLOR                = WORK_NR_COLORS,

        /* not bound to any CPU, prefer the local CPU */
        WORK_CPU_UNBOUND        = NR_CPUS,

        /*
         * Reserve 8 bits off of pwq pointer w/ debugobjects turned off.
         * This makes pwqs aligned to 256 bytes and allows 15 workqueue
         * flush colors.
         */
        WORK_STRUCT_FLAG_BITS        = WORK_STRUCT_COLOR_SHIFT +
                                  WORK_STRUCT_COLOR_BITS,

        /* data contains off-queue information when !WORK_STRUCT_PWQ */
        WORK_OFFQ_FLAG_BASE        = WORK_STRUCT_COLOR_SHIFT,

        __WORK_OFFQ_CANCELING        = WORK_OFFQ_FLAG_BASE,

        /*
         * When a work item is off queue, its high bits point to the last
         * pool it was on.  Cap at 31 bits and use the highest number to
         * indicate that no pool is associated.
         */
        WORK_OFFQ_FLAG_BITS        = 1,
        WORK_OFFQ_POOL_SHIFT        = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS,
        WORK_OFFQ_LEFT                = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
        WORK_OFFQ_POOL_BITS        = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,

        /* bit mask for work_busy() return values */
        WORK_BUSY_PENDING        = 1 << 0,
        WORK_BUSY_RUNNING        = 1 << 1,

        /* maximum string length for set_worker_desc() */
        WORKER_DESC_LEN                = 24,
};

/* Convenience constants - of type 'unsigned long', not 'enum'! */
#define WORK_OFFQ_CANCELING        (1ul << __WORK_OFFQ_CANCELING)
#define WORK_OFFQ_POOL_NONE        ((1ul << WORK_OFFQ_POOL_BITS) - 1)
#define WORK_STRUCT_NO_POOL        (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)

#define WORK_STRUCT_FLAG_MASK    ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)

/*
 * A single work item.
 *
 * @data:        flag bits plus pointer/pool information (see the WORK_STRUCT_*
 *               and WORK_OFFQ_* constants)
 * @entry:       list linkage; initialised empty by the initializers below
 * @func:        handler to run for this item
 * @lockdep_map: lockdep annotation, present only with CONFIG_LOCKDEP
 */
struct work_struct {
        atomic_long_t data;
        struct list_head entry;
        work_func_t func;
#ifdef CONFIG_LOCKDEP
        struct lockdep_map lockdep_map;
#endif
};

/*
 * Initial values for work_struct::data: "no pool", and for statically
 * initialised items additionally the STATIC flag (which is 0 unless
 * CONFIG_DEBUG_OBJECTS_WORK is enabled).
 */
#define WORK_DATA_INIT()        ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT()        \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))

/* A work item paired with the timer that is used to queue it later. */
struct delayed_work {
        struct work_struct work;
        struct timer_list timer;

        /* target workqueue and CPU ->timer uses to queue ->work */
        struct workqueue_struct *wq;
        int cpu;
};

/* A work item paired with an rcu_head that is used to queue it. */
struct rcu_work {
        struct work_struct work;
        struct rcu_head rcu;

        /* target workqueue ->rcu uses to queue ->work */
        struct workqueue_struct *wq;
};

/**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 */
struct workqueue_attrs {
        /**
         * @nice: nice level
         */
        int nice;

        /**
         * @cpumask: allowed CPUs
         */
        cpumask_var_t cpumask;

        /**
         * @no_numa: disable NUMA affinity
         *
         * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It
         * only modifies how :c:func:`apply_workqueue_attrs` selects pools and thus
         * doesn't participate in pool hash calculations or equality comparisons.
         */
        bool no_numa;
};

/* Map an embedded work item back to the delayed_work that contains it. */
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
        struct delayed_work *dwork;

        dwork = container_of(work, struct delayed_work, work);
        return dwork;
}

/* Map an embedded work item back to the rcu_work that contains it. */
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
        struct rcu_work *rwork;

        rwork = container_of(work, struct rcu_work, work);
        return rwork;
}

/* Thin wrapper holding just a work item (users are not visible in this chunk). */
struct execute_work {
        struct work_struct work;
};

/*
 * __WORK_INIT_LOCKDEP_MAP(n, k): designated initializer for the lockdep_map
 * member of a static work item, with name @n and key @k.  Expands to nothing
 * without CONFIG_LOCKDEP.
 */
#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting _key
 * here is required, otherwise it could get initialised to the
 * copy of the lockdep_map!
 */
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif

/*
 * Static initializer for work item @n with handler @f: static-init data
 * word, list entry pointing at itself (empty), and the lockdep map named
 * after @n and keyed on its address.
 */
#define __WORK_INITIALIZER(n, f) {                                        \
        .data = WORK_DATA_STATIC_INIT(),                                \
        .entry        = { &(n).entry, &(n).entry },                                \
        .func = (f),                                                        \
        __WORK_INIT_LOCKDEP_MAP(#n, &(n))                                \
        }

/*
 * Static initializer for delayed work @n with handler @f.  The timer runs
 * delayed_work_timer_fn and always carries TIMER_IRQSAFE in addition to
 * the caller-supplied @tflags.
 */
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) {                        \
        .work = __WORK_INITIALIZER((n).work, (f)),                        \
        .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
                                     (tflags) | TIMER_IRQSAFE),                \
        }

/* Define a statically initialised work item @n with handler @f. */
#define DECLARE_WORK(n, f)                                                \
        struct work_struct n = __WORK_INITIALIZER(n, f)

/* Define a statically initialised delayed work @n (no extra timer flags). */
#define DECLARE_DELAYED_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)

/* As DECLARE_DELAYED_WORK(), but the timer also gets TIMER_DEFERRABLE. */
#define DECLARE_DEFERRABLE_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)

/*
 * debugobjects hooks for work items.  Without CONFIG_DEBUG_OBJECTS_WORK they
 * are empty inline stubs and work_static() is constantly 0; with it, the
 * hooks are real functions defined elsewhere and work_static() reports the
 * WORK_STRUCT_STATIC bit of the work's data word.
 */
#ifndef CONFIG_DEBUG_OBJECTS_WORK
static inline void __init_work(struct work_struct *work, int onstack)
{
}

static inline void destroy_work_on_stack(struct work_struct *work)
{
}

static inline void destroy_delayed_work_on_stack(struct delayed_work *work)
{
}

static inline unsigned int work_static(struct work_struct *work)
{
        return 0;
}
#else
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);
static inline unsigned int work_static(struct work_struct *work)
{
        return *work_data_bits(work) & WORK_STRUCT_STATIC;
}
#endif

/*
 * initialize all of a work item in one go
 *
 * NOTE! No point in using "atomic_long_set()": using a direct
 * assignment of the work data initializer allows the compiler
 * to generate better code.
 *
 * @_work:    pointer to the work item
 * @_func:    handler to install
 * @_onstack: passed through to __init_work()
 *
 * The CONFIG_LOCKDEP variant additionally creates one static lock class key
 * per expansion site and initialises the item's lockdep map with it.
 */
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#else
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#endif

/*
 * Runtime initializers.  The *_ONSTACK forms pass onstack = 1 to
 * __INIT_WORK() (and use __init_timer_on_stack() for the timer); the plain
 * forms pass 0.
 */
#define INIT_WORK(_work, _func)                                                \
        __INIT_WORK((_work), (_func), 0)

#define INIT_WORK_ONSTACK(_work, _func)                                        \
        __INIT_WORK((_work), (_func), 1)

/*
 * Initialise the embedded work item and set up the timer to run
 * delayed_work_timer_fn with @_tflags | TIMER_IRQSAFE.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)                        \
        do {                                                                \
                INIT_WORK(&(_work)->work, (_func));                        \
                __init_timer(&(_work)->timer,                                \
                             delayed_work_timer_fn,                        \
                             (_tflags) | TIMER_IRQSAFE);                \
        } while (0)

#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)                \
        do {                                                                \
                INIT_WORK_ONSTACK(&(_work)->work, (_func));                \
                __init_timer_on_stack(&(_work)->timer,                        \
                                      delayed_work_timer_fn,                \
                                      (_tflags) | TIMER_IRQSAFE);        \
        } while (0)

/* Delayed work without extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                                        \
        __INIT_DELAYED_WORK(_work, _func, 0)

#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                                \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* Delayed work whose timer additionally carries TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                                \
        __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)                        \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

/* RCU work: only the embedded work item is initialised here. */
#define INIT_RCU_WORK(_work, _func)                                        \
        INIT_WORK(&(_work)->work, (_func))

#define INIT_RCU_WORK_ONSTACK(_work, _func)                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func))

/**
 * work_pending - Find out whether a work item is currently pending
 * @work: The work item in question
 *
 * Evaluates to the result of testing WORK_STRUCT_PENDING_BIT in the work
 * item's data word.
 */
#define work_pending(work) \
        test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))

/**
 * delayed_work_pending - Find out whether a delayable work item is currently
 * pending
 * @w: The work item in question
 *
 * Applies work_pending() to the work item embedded in @w.
 */
#define delayed_work_pending(w) \
        work_pending(&(w)->work)

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * The WQ_* values are flag bits for alloc_workqueue(); names starting with
 * a double underscore are marked internal.
 */
enum {
        WQ_UNBOUND                = 1 << 1, /* not bound to any cpu */
        WQ_FREEZABLE                = 1 << 2, /* freeze during suspend */
        WQ_MEM_RECLAIM                = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI                = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu intensive workqueue */
        WQ_SYSFS                = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */

        /*
         * Per-cpu workqueues are generally preferred because they tend to
         * show better performance thanks to cache locality.  Per-cpu
         * workqueues exclude the scheduler from choosing the CPU to
         * execute the worker threads, which has an unfortunate side effect
         * of increasing power consumption.
         *
         * The scheduler considers a CPU idle if it doesn't have any task
         * to execute and tries to keep idle cores idle to conserve power;
         * however, for example, a per-cpu work item scheduled from an
         * interrupt handler on an idle CPU will force the scheduler to
         * execute the work item on that CPU breaking the idleness, which in
         * turn may lead to more scheduling choices which are sub-optimal
         * in terms of power consumption.
         *
         * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
         * but become unbound if workqueue.power_efficient kernel param is
         * specified.  Per-cpu workqueues which are identified to
         * contribute significantly to power-consumption are identified and
         * marked with this flag and enabling the power_efficient mode
         * leads to noticeable power saving at the cost of small
         * performance disadvantage.
         *
         * http://thread.gmane.org/gmane.linux.kernel/1480396
         */
        WQ_POWER_EFFICIENT        = 1 << 7,

        __WQ_DRAINING                = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED                = 1 << 17, /* internal: workqueue is ordered */
        __WQ_LEGACY                = 1 << 18, /* internal: create*_workqueue() */
        __WQ_ORDERED_EXPLICIT        = 1 << 19, /* internal: alloc_ordered_workqueue() */

        WQ_MAX_ACTIVE                = 512,          /* I like 512, better ideas? */
        WQ_MAX_UNBOUND_PER_CPU        = 4,          /* 4 * #cpus for unbound wq */
        WQ_DFL_ACTIVE                = WQ_MAX_ACTIVE / 2,
};

/*
 * unbound wq's aren't per-cpu, scale max_active according to #cpus:
 * the larger of WQ_MAX_ACTIVE and
 * num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU, compared as int.
 */
#define WQ_UNBOUND_MAX_ACTIVE        \
        max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)

/*
 * System-wide workqueues which are always present.
 *
 * system_wq is the one used by schedule[_delayed]_work[_on]().
 * Multi-CPU multi-threaded.  There are users which expect relatively
 * short queue flush time.  Don't queue works which can run for too
 * long.
 *
 * system_highpri_wq is similar to system_wq but for work items which
 * require WQ_HIGHPRI.
 *
 * system_long_wq is similar to system_wq but may host long running
 * works.  Queue flushing might take relatively long.
 *
 * system_unbound_wq is an unbound workqueue.  Workers are not bound to
 * any specific CPU, not concurrency managed, and all queued works are
 * executed immediately as long as the max_active limit is not reached and
 * resources are available.
 *
 * system_freezable_wq is equivalent to system_wq except that it's
 * freezable.
 *
 * *_power_efficient_wq are inclined towards saving power and converted
 * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
 * they are the same as their non-power-efficient counterparts - e.g.
 * system_power_efficient_wq is identical to system_wq if
 * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 */
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @...: args for @fmt
 *
 * Allocate a workqueue with the specified parameters.  For detailed
 * information on WQ_* flags, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...);

/**
 * alloc_ordered_workqueue - allocate an ordered workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @args...: args for @fmt
 *
 * Allocate an ordered workqueue.  An ordered workqueue executes at
 * most one work item at any given time in the queued order.  They are
 * implemented as unbound workqueues with @max_active of one.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue(fmt, flags, args...)                        \
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |                \
                        __WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

/*
 * Legacy creation helpers.  Each one forwards to alloc_workqueue() or
 * alloc_ordered_workqueue() with __WQ_LEGACY and WQ_MEM_RECLAIM set and
 * uses "%s" as the format so that @name is taken verbatim.  The two
 * alloc_workqueue() users pass a max_active of 1;
 * create_freezable_workqueue() additionally sets WQ_FREEZABLE and
 * WQ_UNBOUND.
 */
#define create_workqueue(name)                                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name)                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |        \
                        WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)                                \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

extern void destroy_workqueue(struct workqueue_struct *wq);

struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);

extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
                            struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);

extern void flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);

extern int schedule_on_each_cpu(work_func_t func);

int execute_in_process_context(work_func_t fn, struct execute_work *);

extern bool flush_work(struct work_struct *work);
extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);

extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);

extern bool flush_rcu_work(struct rcu_work *rwork);

extern void workqueue_set_max_active(struct workqueue_struct *wq,
                                     int max_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_workqueue_state(void);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Thin wrapper around queue_work_on() that passes %WORK_CPU_UNBOUND as the
 * CPU.  The work is queued to the CPU on which it was submitted, but if
 * that CPU dies it can be processed by another CPU.
 *
 * Memory-ordering properties:  If it returns %true, guarantees that all stores
 * preceding the call to queue_work() in the program order will be visible from
 * the CPU which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0                                CPU1
 *
 *   WRITE_ONCE(x, 1);                        [ @work is being executed ]
 *   r0 = queue_work(wq, work);                  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        const bool queued = queue_work_on(WORK_CPU_UNBOUND, wq, work);

        return queued;
}

/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Same as queue_delayed_work_on() with %WORK_CPU_UNBOUND as the CPU, i.e.
 * it tries to use the local CPU.
 *
 * Return: the value returned by queue_delayed_work_on().
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                                      struct delayed_work *dwork,
                                      unsigned long delay)
{
        const bool queued = queue_delayed_work_on(WORK_CPU_UNBOUND, wq,
                                                  dwork, delay);

        return queued;
}

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Forwards to mod_delayed_work_on() with %WORK_CPU_UNBOUND as the CPU,
 * i.e. mod_delayed_work_on() on the local CPU.
 *
 * Return: the value returned by mod_delayed_work_on().
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        const bool ret = mod_delayed_work_on(WORK_CPU_UNBOUND, wq,
                                             dwork, delay);

        return ret;
}

/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * Queues @work on system_wq for the given @cpu via queue_work_on().
 *
 * Return: the value returned by queue_work_on().
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        const bool queued = queue_work_on(cpu, system_wq, work);

        return queued;
}

/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Queues @work on the kernel-global workqueue (system_wq) through
 * queue_work().  If @work is already queued there it is left in the same
 * position.
 *
 * Shares the same memory-ordering properties of queue_work(), cf. the
 * DocBook header of queue_work().
 *
 * Return: %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
 */
static inline bool schedule_work(struct work_struct *work)
{
        const bool queued = queue_work(system_wq, work);

        return queued;
}

/**
 * flush_scheduled_work - ensure that any scheduled work has run to completion.
 *
 * Flushes the kernel-global workqueue (system_wq) with flush_workqueue(),
 * forcing its execution and blocking until it completes.
 *
 * Think twice before calling this function!  It's very easy to get into
 * trouble if you don't take great care.  Either of the following situations
 * will lead to deadlock:
 *
 *        One of the work items currently on the workqueue needs to acquire
 *        a lock held by your code or its caller.
 *
 *        Your code is running in the context of a work routine.
 *
 * Lockdep will detect both when they occur, but the first might not occur
 * very often: it depends on which work items are on the workqueue and
 * which locks they need, and you have no control over either.
 *
 * Flushing the entire workqueue is overkill in most situations; usually it
 * is enough to know that one particular work item isn't queued and isn't
 * running.  Use cancel_delayed_work_sync() or cancel_work_sync() for that
 * instead.
 */
static inline void flush_scheduled_work(void)
{
        struct workqueue_struct *global_wq = system_wq;

        flush_workqueue(global_wq);
}

/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
 * @delay: number of jiffies to wait
 *
 * Once @delay has elapsed the job is put in the kernel-global workqueue
 * (system_wq) on the specified CPU.
 *
 * Return: the value returned by queue_delayed_work_on().
 */
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                            unsigned long delay)
{
        const bool queued = queue_delayed_work_on(cpu, system_wq, dwork,
                                                  delay);

        return queued;
}

/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * Once @delay has elapsed the job is put in the kernel-global workqueue
 * (system_wq).
 *
 * Return: the value returned by queue_delayed_work().
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                                         unsigned long delay)
{
        const bool queued = queue_delayed_work(system_wq, dwork, delay);

        return queued;
}

#ifdef CONFIG_SMP
long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
#else /* !CONFIG_SMP */
/*
 * Without CONFIG_SMP both helpers ignore @cpu and simply call @fn with
 * @arg in the caller's context, returning whatever @fn returns.
 */
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        long ret = fn(arg);

        return ret;
}

static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        long ret = fn(arg);

        return ret;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */

#ifndef CONFIG_SYSFS
/* Stub used when CONFIG_SYSFS is not set: does nothing and returns 0. */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        return 0;
}
#else        /* CONFIG_SYSFS */
int workqueue_sysfs_register(struct workqueue_struct *wq);
#endif        /* CONFIG_SYSFS */

#ifndef CONFIG_WQ_WATCHDOG
/* Stub used when CONFIG_WQ_WATCHDOG is not set: intentionally empty. */
static inline void wq_watchdog_touch(int cpu)
{
}
#else        /* CONFIG_WQ_WATCHDOG */
void wq_watchdog_touch(int cpu);
#endif        /* CONFIG_WQ_WATCHDOG */

#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif

void __init workqueue_init_early(void);
void __init workqueue_init(void);

#endif













































































































































































































































































































































































































































    3 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H

#include <linux/const.h>
#include <linux/mem_encrypt.h>

#include <asm/page_types.h>

#define FIRST_USER_ADDRESS        0UL

#define _PAGE_BIT_PRESENT        0        /* is present */
#define _PAGE_BIT_RW                1        /* writeable */
#define _PAGE_BIT_USER                2        /* userspace addressable */
#define _PAGE_BIT_PWT                3        /* page write through */
#define _PAGE_BIT_PCD                4        /* page cache disabled */
#define _PAGE_BIT_ACCESSED        5        /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY                6        /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE                7        /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT                7        /* on 4KB pages */
#define _PAGE_BIT_GLOBAL        8        /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1        9        /* available for programmer */
#define _PAGE_BIT_SOFTW2        10        /* " */
#define _PAGE_BIT_SOFTW3        11        /* " */
#define _PAGE_BIT_PAT_LARGE        12        /* On 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4        58        /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0        59        /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1        60        /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2        61        /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3        62        /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX                63        /* No execute: only valid after cpuid check */

#define _PAGE_BIT_SPECIAL        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP        _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY        _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP        _PAGE_BIT_SOFTW4

/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE        _PAGE_BIT_GLOBAL

#define _PAGE_PRESENT        (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW        (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER        (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT        (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD        (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE        (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL        (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT        (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL        (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 0))
#endif

#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
                         _PAGE_PKEY_BIT1 | \
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * Tracking soft dirty bit when a page goes to a swap is tricky.
 * We need a bit which can be stored in pte _and_ not conflict
 * with swap entry format. On x86 bits 1-4 are *not* involved
 * in swap entry computation, but bit 7 is used for thp migration,
 * so we borrow bit 1 for soft dirty tracking.
 *
 * Please note that this bit must be treated as swap dirty page
 * mark if and only if the PTE/PMD has present bit clear!
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY        _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define _PAGE_UFFD_WP                (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
#define _PAGE_SWP_UFFD_WP        _PAGE_USER
#else
#define _PAGE_UFFD_WP                (_AT(pteval_t, 0))
#define _PAGE_SWP_UFFD_WP        (_AT(pteval_t, 0))
#endif

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX        (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP        (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#else
#define _PAGE_NX        (_AT(pteval_t, 0))
#define _PAGE_DEVMAP        (_AT(pteval_t, 0))
#endif

#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

/*
 * Set of bits not changed in pte_modify.  The pte's
 * protection key is treated like _PAGE_RW, for
 * instance, and is *not* included in this mask since
 * pte_modify() does modify it.
 */
#define _COMMON_PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |               \
                                 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |\
                                 _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
                                 _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK        (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 */
#ifndef __ASSEMBLY__
/*
 * Software cache-mode identifiers.  _PAGE_CACHE_MODE_WB is 0 so that the
 * default mode corresponds to all cache bits clear.  Values 6 and 7 are
 * not assigned a name here.
 * NOTE(review): the suffixes presumably stand for write-back,
 * write-combining, uncached(-minus), write-through and write-protect --
 * confirm against the PAT documentation.
 */
enum page_cache_mode {
        _PAGE_CACHE_MODE_WB       = 0,
        _PAGE_CACHE_MODE_WC       = 1,
        _PAGE_CACHE_MODE_UC_MINUS = 2,
        _PAGE_CACHE_MODE_UC       = 3,
        _PAGE_CACHE_MODE_WT       = 4,
        _PAGE_CACHE_MODE_WP       = 5,

        /*
         * Presumably the table size: 8 matches the number of values of
         * the 3-bit index handled by __cm_idx2pte() -- TODO confirm.
         */
        _PAGE_CACHE_MODE_NUM      = 8
};
#endif

#define _PAGE_ENC                (_AT(pteval_t, sme_me_mask))

#define _PAGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)

#define _PAGE_NOCACHE                (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP                (cachemode2protval(_PAGE_CACHE_MODE_WP))

#define __PP _PAGE_PRESENT
#define __RW _PAGE_RW
#define _USR _PAGE_USER
#define ___A _PAGE_ACCESSED
#define ___D _PAGE_DIRTY
#define ___G _PAGE_GLOBAL
#define __NX _PAGE_NX

#define _ENC _PAGE_ENC
#define __WP _PAGE_CACHE_WP
#define __NC _PAGE_NOCACHE
#define _PSE _PAGE_PSE

/*
 * pgprot_t helpers: pgprot_val() names the raw value stored in the
 * .pgprot member (usable as an lvalue), __pgprot() builds a pgprot_t
 * from a raw value, and __pg() is a short alias for __pgprot().
 *
 * _PAGE_PAT_LARGE is not defined again here: it is already defined,
 * with the identical expansion, next to the other _PAGE_* flag masks
 * earlier in this header.
 */
#define pgprot_val(x)                ((x).pgprot)
#define __pgprot(x)                ((pgprot_t) { (x) } )
#define __pg(x)                        __pgprot(x)

#define PAGE_NONE             __pg(   0|   0|   0|___A|   0|   0|   0|___G)
#define PAGE_SHARED             __pg(__PP|__RW|_USR|___A|__NX|   0|   0|   0)
#define PAGE_SHARED_EXEC     __pg(__PP|__RW|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY_NOEXEC     __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_COPY_EXEC             __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY_EXEC   __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)

#define __PAGE_KERNEL                 (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC         (__PP|__RW|   0|___A|   0|___D|   0|___G)
#define _KERNPG_TABLE_NOENC         (__PP|__RW|   0|___A|   0|___D|   0|   0)
#define _KERNPG_TABLE                 (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
#define _PAGE_TABLE_NOENC         (__PP|__RW|_USR|___A|   0|___D|   0|   0)
#define _PAGE_TABLE                 (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
#define __PAGE_KERNEL_RO         (__PP|   0|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_ROX         (__PP|   0|   0|___A|   0|___D|   0|___G)
#define __PAGE_KERNEL_NOCACHE         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
#define __PAGE_KERNEL_VVAR         (__PP|   0|_USR|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_LARGE         (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)


#define __PAGE_KERNEL_IO                __PAGE_KERNEL
#define __PAGE_KERNEL_IO_NOCACHE        __PAGE_KERNEL_NOCACHE


#ifndef __ASSEMBLY__

#define __PAGE_KERNEL_ENC        (__PAGE_KERNEL    | _ENC)
#define __PAGE_KERNEL_ENC_WP        (__PAGE_KERNEL_WP | _ENC)
#define __PAGE_KERNEL_NOENC        (__PAGE_KERNEL    |    0)
#define __PAGE_KERNEL_NOENC_WP        (__PAGE_KERNEL_WP |    0)

#define __pgprot_mask(x)        __pgprot((x) & __default_kernel_pte_mask)

#define PAGE_KERNEL                __pgprot_mask(__PAGE_KERNEL            | _ENC)
#define PAGE_KERNEL_NOENC        __pgprot_mask(__PAGE_KERNEL            |    0)
#define PAGE_KERNEL_RO                __pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
#define PAGE_KERNEL_EXEC        __pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
#define PAGE_KERNEL_EXEC_NOENC        __pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
#define PAGE_KERNEL_ROX                __pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
#define PAGE_KERNEL_NOCACHE        __pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
#define PAGE_KERNEL_LARGE        __pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
#define PAGE_KERNEL_LARGE_EXEC        __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
#define PAGE_KERNEL_VVAR        __pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)

#define PAGE_KERNEL_IO                __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE        __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)

#endif        /* __ASSEMBLY__ */

/*         xwr */
#define __P000        PAGE_NONE
#define __P001        PAGE_READONLY
#define __P010        PAGE_COPY
#define __P011        PAGE_COPY
#define __P100        PAGE_READONLY_EXEC
#define __P101        PAGE_READONLY_EXEC
#define __P110        PAGE_COPY_EXEC
#define __P111        PAGE_COPY_EXEC

#define __S000        PAGE_NONE
#define __S001        PAGE_READONLY
#define __S010        PAGE_SHARED
#define __S011        PAGE_SHARED
#define __S100        PAGE_READONLY_EXEC
#define __S101        PAGE_READONLY_EXEC
#define __S110        PAGE_SHARED_EXEC
#define __S111        PAGE_SHARED_EXEC

/*
 * early identity mapping  pte attrib macros.
 */
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR         0x003                /* PRESENT+RW */
#define PDE_IDENT_ATTR         0x063                /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR         0x001                /* PRESENT (no other attributes) */
#endif

#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif

#ifndef __ASSEMBLY__

#include <linux/types.h>

/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK                ((pteval_t)PHYSICAL_PAGE_MASK)

/*
 *  Extracts the flags from a (pte|pmd|pud|pgd)val_t
 *  This includes the protection key value.
 */
#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)

typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

typedef struct { pgdval_t pgd; } pgd_t;

/* Return a copy of @prot with _PAGE_NX OR-ed into its value. */
static inline pgprot_t pgprot_nx(pgprot_t prot)
{
        /* @prot is passed by value, so updating it only affects the copy. */
        pgprot_val(prot) |= _PAGE_NX;

        return prot;
}
/*
 * Make the name visible to the preprocessor as well.
 * NOTE(review): presumably so that other code can test for this
 * definition with #ifdef -- confirm.
 */
#define pgprot_nx pgprot_nx

#ifdef CONFIG_X86_PAE

/*
 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
 * use it here.
 */

#define PGD_PAE_PAGE_MASK        ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK        (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)

/*
 * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
 * All other bits are Reserved MBZ
 */
#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                                 _PAGE_PWT | _PAGE_PCD | \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS        (~0ULL)
#endif

/* Build a pgd_t from @val, keeping only the bits in PGD_ALLOWED_BITS. */
static inline pgd_t native_make_pgd(pgdval_t val)
{
        pgd_t entry = { .pgd = val & PGD_ALLOWED_BITS };

        return entry;
}

/* Return the raw value of @pgd restricted to PGD_ALLOWED_BITS. */
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
        pgdval_t raw = pgd.pgd;

        return raw & PGD_ALLOWED_BITS;
}

/* Return the bits of @pgd that fall under PTE_FLAGS_MASK. */
static inline pgdval_t pgd_flags(pgd_t pgd)
{
        pgdval_t val = native_pgd_val(pgd);

        return val & PTE_FLAGS_MASK;
}

#if CONFIG_PGTABLE_LEVELS > 4
/* More than four levels configured: p4d_t is a distinct wrapper type. */
typedef struct { p4dval_t p4d; } p4d_t;

/*
 * Wrap @val in a p4d_t.
 * NOTE(review): the parameter is typed pudval_t rather than p4dval_t;
 * kept as-is to preserve the interface.
 */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .p4d = val };

        return entry;
}

/* Return the raw value stored in @p4d. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        p4dval_t raw = p4d.p4d;

        return raw;
}
#else
#include <asm-generic/pgtable-nop4d.h>

/* Here p4d_t carries a pgd_t in its .pgd member; build it via the pgd helper. */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .pgd = native_make_pgd((pgdval_t)val) };

        return entry;
}

/* Return the value of the pgd embedded in @p4d. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        p4dval_t raw = native_pgd_val(p4d.pgd);

        return raw;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 3
/* More than three levels configured: pud_t is a distinct wrapper type. */
typedef struct { pudval_t pud; } pud_t;

/*
 * Wrap @val in a pud_t.
 * NOTE(review): the parameter is typed pmdval_t rather than pudval_t;
 * kept as-is to preserve the interface.
 */
static inline pud_t native_make_pud(pmdval_t val)
{
        pud_t entry = { .pud = val };

        return entry;
}

/* Return the raw value stored in @pud. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pudval_t raw = pud.pud;

        return raw;
}
#else
#include <asm-generic/pgtable-nopud.h>

/* Here pud_t nests a pgd at .p4d.pgd; build it via the pgd helper. */
static inline pud_t native_make_pud(pudval_t val)
{
        pud_t entry = { .p4d.pgd = native_make_pgd(val) };

        return entry;
}

/* Return the value of the pgd embedded in @pud. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pudval_t raw = native_pgd_val(pud.p4d.pgd);

        return raw;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2
/* More than two levels configured: pmd_t is a distinct wrapper type. */
typedef struct { pmdval_t pmd; } pmd_t;

/* Wrap @val in a pmd_t. */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pmd = val };

        return entry;
}

/* Return the raw value stored in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = pmd.pmd;

        return raw;
}
#else
#include <asm-generic/pgtable-nopmd.h>

/* Here pmd_t nests a pgd at .pud.p4d.pgd; build it via the pgd helper. */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pud.p4d.pgd = native_make_pgd(val) };

        return entry;
}

/* Return the value of the pgd embedded in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = native_pgd_val(pmd.pud.p4d.pgd);

        return raw;
}
#endif

/* PFN mask for a p4d entry; @p4d itself is not inspected. */
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
        /* No 512 GiB huge pages yet, so the mask is always PTE_PFN_MASK. */
        const p4dval_t mask = PTE_PFN_MASK;

        return mask;
}

/* Flags mask for @p4d: the complement of p4d_pfn_mask(). */
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
        p4dval_t pfn_bits = p4d_pfn_mask(p4d);

        return ~pfn_bits;
}

/* Return the bits of @p4d selected by p4d_flags_mask(). */
static inline p4dval_t p4d_flags(p4d_t p4d)
{
        p4dval_t val = native_p4d_val(p4d);
        p4dval_t mask = p4d_flags_mask(p4d);

        return val & mask;
}

/*
 * PFN mask for @pud: PHYSICAL_PUD_PAGE_MASK when _PAGE_PSE is set in the
 * entry, PTE_PFN_MASK otherwise.
 */
static inline pudval_t pud_pfn_mask(pud_t pud)
{
        if (!(native_pud_val(pud) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PUD_PAGE_MASK;
}

/* Flags mask for @pud: the complement of pud_pfn_mask(). */
static inline pudval_t pud_flags_mask(pud_t pud)
{
        pudval_t pfn_bits = pud_pfn_mask(pud);

        return ~pfn_bits;
}

/* Return the bits of @pud selected by pud_flags_mask(). */
static inline pudval_t pud_flags(pud_t pud)
{
        pudval_t val = native_pud_val(pud);
        pudval_t mask = pud_flags_mask(pud);

        return val & mask;
}

/*
 * PFN mask for @pmd: PHYSICAL_PMD_PAGE_MASK when _PAGE_PSE is set in the
 * entry, PTE_PFN_MASK otherwise.
 */
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
        if (!(native_pmd_val(pmd) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PMD_PAGE_MASK;
}

/* Flags mask for @pmd: the complement of pmd_pfn_mask(). */
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
        pmdval_t pfn_bits = pmd_pfn_mask(pmd);

        return ~pfn_bits;
}

/* Return the bits of @pmd selected by pmd_flags_mask(). */
static inline pmdval_t pmd_flags(pmd_t pmd)
{
        pmdval_t val = native_pmd_val(pmd);
        pmdval_t mask = pmd_flags_mask(pmd);

        return val & mask;
}

/* Wrap the raw value @val in a pte_t. */
static inline pte_t native_make_pte(pteval_t val)
{
        pte_t entry = { .pte = val };

        return entry;
}

/* Return the raw value stored in @pte. */
static inline pteval_t native_pte_val(pte_t pte)
{
        pteval_t raw = pte.pte;

        return raw;
}

/* Return the bits of @pte that fall under PTE_FLAGS_MASK. */
static inline pteval_t pte_flags(pte_t pte)
{
        pteval_t val = native_pte_val(pte);

        return val & PTE_FLAGS_MASK;
}

/*
 * Pack the PAT, PCD and PWT bits of the protection value @cb into a 3-bit
 * index: PAT -> bit 2, PCD -> bit 1, PWT -> bit 0.  @cb is evaluated
 * three times, so it must be free of side effects.
 */
#define __pte2cm_idx(cb)                                \
        ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |                \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |                \
         (((cb) >> _PAGE_BIT_PWT) & 1))
/*
 * Inverse of __pte2cm_idx(): move bits 2, 1 and 0 of the index @i back to
 * the PAT, PCD and PWT bit positions.  @i is evaluated three times, so it
 * must be free of side effects.
 */
#define __cm_idx2pte(i)                                        \
        ((((i) & 4) << (_PAGE_BIT_PAT - 2)) |                \
         (((i) & 2) << (_PAGE_BIT_PCD - 1)) |                \
         (((i) & 1) << _PAGE_BIT_PWT))

unsigned long cachemode2protval(enum page_cache_mode pcm);

/*
 * Convert a 4k protection value to its large-page form: clear both
 * _PAGE_PAT and _PAGE_PAT_LARGE, then place the original _PAGE_PAT bit
 * at the _PAGE_BIT_PAT_LARGE position.
 */
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
        pgprotval_t pat = val & _PAGE_PAT;
        pgprotval_t rest = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);

        return rest | (pat << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_4k_2_large(). */
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
        /* @pgprot is a by-value copy; rewrite its value and hand it back. */
        pgprot_val(pgprot) = protval_4k_2_large(pgprot_val(pgprot));

        return pgprot;
}
/*
 * Convert a large-page protection value to its 4k form: clear both
 * _PAGE_PAT and _PAGE_PAT_LARGE, then place the original _PAGE_PAT_LARGE
 * bit at the _PAGE_BIT_PAT position.
 */
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
        pgprotval_t pat_large = val & _PAGE_PAT_LARGE;
        pgprotval_t rest = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);

        return rest | (pat_large >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_large_2_4k(). */
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
        /* @pgprot is a by-value copy; rewrite its value and hand it back. */
        pgprot_val(pgprot) = protval_large_2_4k(pgprot_val(pgprot));

        return pgprot;
}


typedef struct page *pgtable_t;

extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
extern void set_nx(void);
extern int nx_enabled;

#define pgprot_writecombine        pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

#define pgprot_writethrough        pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);

/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING

#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot);

/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);

#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init        paging_init
#endif

struct seq_file;
extern void arch_report_meminfo(struct seq_file *m);

/*
 * Page-table level identifiers.  Values are implicit, so PG_LEVEL_NONE is
 * 0 and PG_LEVEL_NUM (5) equals the number of enumerators before it.
 * NOTE(review): the 4K/2M/1G/512G suffixes presumably name the size
 * mapped by one entry at that level -- confirm against the users of
 * lookup_address().
 */
enum pg_level {
        PG_LEVEL_NONE,
        PG_LEVEL_4K,
        PG_LEVEL_2M,
        PG_LEVEL_1G,
        PG_LEVEL_512G,
        PG_LEVEL_NUM
};

#ifndef CONFIG_PROC_FS
/* Stub used when CONFIG_PROC_FS is not set: intentionally empty. */
static inline void update_page_count(int level, unsigned long pages)
{
}
#else
extern void update_page_count(int level, unsigned long pages);
#endif

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 */
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                                    unsigned int *level);

struct mm_struct;
extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
                                   unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
                                          unsigned long address,
                                          unsigned numpages,
                                          unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                            unsigned long numpages);
#endif        /* !__ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */






































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cryptographic API for algorithms (i.e., low-level API).
 *
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_ALGAPI_H
#define _CRYPTO_ALGAPI_H

#include <linux/crypto.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

/*
 * Maximum values for blocksize and alignmask, used to allocate
 * static buffers that are big enough for any combination of
 * algs and architectures. Ciphers have a lower maximum size.
 */
#define MAX_ALGAPI_BLOCKSIZE                160
#define MAX_ALGAPI_ALIGNMASK                63
#define MAX_CIPHER_BLOCKSIZE                16
#define MAX_CIPHER_ALIGNMASK                15

struct crypto_aead;
struct crypto_instance;
struct module;
struct rtattr;
struct seq_file;
struct sk_buff;

/*
 * Callback table plus flag/size fields describing one kind of crypto
 * algorithm frontend (crypto_spawn.frontend points at one of these).
 * The callbacks are invoked from code outside this header, so their exact
 * contracts are not restated here.
 */
struct crypto_type {
        unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
        unsigned int (*extsize)(struct crypto_alg *alg);
        int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask);
        int (*init_tfm)(struct crypto_tfm *tfm);
        void (*show)(struct seq_file *m, struct crypto_alg *alg);
        int (*report)(struct sk_buff *skb, struct crypto_alg *alg);
        void (*free)(struct crypto_instance *inst);

        /*
         * NOTE(review): presumably the CRYPTO_ALG_TYPE_* value and the mask
         * bits applied when looking up an algorithm of this type -- confirm
         * against the lookup code.
         */
        unsigned int type;
        unsigned int maskclear;
        unsigned int maskset;
        unsigned int tfmsize;
};

/*
 * An algorithm instance belonging to a template. The embedded 'alg' is what
 * crypto_tfm_alg_instance() uses (via container_of) to get back from a
 * transform's algorithm pointer to the enclosing instance.
 */
struct crypto_instance {
        struct crypto_alg alg;

        struct crypto_template *tmpl;

        union {
                /* Node in list of instances after registration. */
                struct hlist_node list;
                /* List of attached spawns before registration. */
                struct crypto_spawn *spawns;
        };

        struct work_struct free_work;

        /* Trailing per-instance context area; returned by crypto_instance_ctx(). */
        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/*
 * A named template, registered with crypto_register_template() and looked
 * up by name with crypto_lookup_template().
 */
struct crypto_template {
        struct list_head list;
        /* NOTE(review): presumably the hlist that crypto_instance.list nodes are linked on -- confirm. */
        struct hlist_head instances;
        struct module *module;

        /* Instance constructor; receives the template and an rtattr table. */
        int (*create)(struct crypto_template *tmpl, struct rtattr **tb);

        char name[CRYPTO_MAX_ALG_NAME];
};

/*
 * Link between an instance and an algorithm it uses. Set up with
 * crypto_grab_spawn() and released with crypto_drop_spawn(); the union
 * member in use depends on whether the owning instance is registered yet.
 */
struct crypto_spawn {
        struct list_head list;
        struct crypto_alg *alg;
        union {
                /* Back pointer to instance after registration. */
                struct crypto_instance *inst;
                /* Spawn list pointer prior to registration. */
                struct crypto_spawn *next;
        };
        const struct crypto_type *frontend;
        u32 mask;
        bool dead;
        bool registered;
};

/*
 * Request queue used by crypto_enqueue_request()/crypto_dequeue_request().
 */
struct crypto_queue {
        struct list_head list;
        /* Equals &list when nothing is backlogged (see crypto_get_backlog()). */
        struct list_head *backlog;

        /* Current length, as reported by crypto_queue_len(). */
        unsigned int qlen;
        /* Limit passed to crypto_init_queue(). */
        unsigned int max_qlen;
};

/*
 * Cursor into a scatterlist: the current entry plus an offset.
 * NOTE(review): offset is presumably a byte offset within 'sg' -- confirm
 * against the scatterwalk helpers.
 */
struct scatter_walk {
        struct scatterlist *sg;
        unsigned int offset;
};

void crypto_mod_put(struct crypto_alg *alg);

int crypto_register_template(struct crypto_template *tmpl);
int crypto_register_templates(struct crypto_template *tmpls, int count);
void crypto_unregister_template(struct crypto_template *tmpl);
void crypto_unregister_templates(struct crypto_template *tmpls, int count);
struct crypto_template *crypto_lookup_template(const char *name);

int crypto_register_instance(struct crypto_template *tmpl,
                             struct crypto_instance *inst);
void crypto_unregister_instance(struct crypto_instance *inst);

int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);
void crypto_drop_spawn(struct crypto_spawn *spawn);
struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
                                    u32 mask);
void *crypto_spawn_tfm2(struct crypto_spawn *spawn);

struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb);
int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret);
const char *crypto_attr_alg_name(struct rtattr *rta);
int crypto_attr_u32(struct rtattr *rta, u32 *num);
int crypto_inst_setname(struct crypto_instance *inst, const char *name,
                        struct crypto_alg *alg);

void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen);
int crypto_enqueue_request(struct crypto_queue *queue,
                           struct crypto_async_request *request);
void crypto_enqueue_request_head(struct crypto_queue *queue,
                                 struct crypto_async_request *request);
struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue);
/* Report the number of requests currently accounted in @queue (its qlen). */
static inline unsigned int crypto_queue_len(struct crypto_queue *queue)
{
        unsigned int queued = queue->qlen;

        return queued;
}

void crypto_inc(u8 *a, unsigned int size);
void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int size);

/*
 * crypto_xor - XOR @size bytes of @src into @dst in place.
 *
 * @dst:  buffer that is both an input and the output
 * @src:  read-only second operand
 * @size: number of bytes to process
 *
 * When the kernel has CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS and @size is a
 * compile-time constant that is a whole number of unsigned longs, the XOR is
 * done inline one word at a time; the buffers are accessed as unsigned long
 * without any alignment check on that path. Every other case is handed to
 * __crypto_xor().
 */
static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
{
        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            __builtin_constant_p(size) &&
            (size % sizeof(unsigned long)) == 0) {
                unsigned long *d = (unsigned long *)dst;
                /* @src is only read: keep its const qualifier through the cast. */
                const unsigned long *s = (const unsigned long *)src;

                while (size > 0) {
                        *d++ ^= *s++;
                        size -= sizeof(unsigned long);
                }
        } else {
                __crypto_xor(dst, dst, src, size);
        }
}

/*
 * crypto_xor_cpy - store @src1 XOR @src2 into @dst, @size bytes.
 *
 * @dst:  output buffer
 * @src1: read-only first operand
 * @src2: read-only second operand
 * @size: number of bytes to process
 *
 * Uses the same fast path as crypto_xor(): with
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS and a compile-time-constant @size
 * that is a multiple of sizeof(unsigned long), the work is done inline one
 * word at a time without alignment checks. Otherwise __crypto_xor() is
 * called.
 */
static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
                                  unsigned int size)
{
        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            __builtin_constant_p(size) &&
            (size % sizeof(unsigned long)) == 0) {
                unsigned long *d = (unsigned long *)dst;
                /* Both sources are only read: keep them const-qualified. */
                const unsigned long *s1 = (const unsigned long *)src1;
                const unsigned long *s2 = (const unsigned long *)src2;

                while (size > 0) {
                        *d++ = *s1++ ^ *s2++;
                        size -= sizeof(unsigned long);
                }
        } else {
                __crypto_xor(dst, src1, src2, size);
        }
}

/*
 * Return the transform's context pointer rounded up with PTR_ALIGN() to the
 * algorithm's alignment (alignmask + 1).
 */
static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm)
{
        void *ctx = crypto_tfm_ctx(tfm);
        unsigned int align = crypto_tfm_alg_alignmask(tfm) + 1;

        return PTR_ALIGN(ctx, align);
}

/*
 * Map a transform to the crypto_instance that embeds its algorithm. Only
 * meaningful when tfm->__crt_alg really is the 'alg' member of an instance.
 */
static inline struct crypto_instance *crypto_tfm_alg_instance(
        struct crypto_tfm *tfm)
{
        struct crypto_alg *alg = tfm->__crt_alg;

        return container_of(alg, struct crypto_instance, alg);
}

/* Return the start of the instance's trailing __ctx area. */
static inline void *crypto_instance_ctx(struct crypto_instance *inst)
{
        void *ctx = inst->__ctx;

        return ctx;
}

/*
 * Typed wrapper around struct crypto_spawn; the crypto_*_cipher() helpers
 * operate on its 'base' member.
 */
struct crypto_cipher_spawn {
        struct crypto_spawn base;
};

/*
 * Grab a spawn restricted to single-block ciphers: the type bits of @type
 * are replaced by CRYPTO_ALG_TYPE_CIPHER and the whole type field is added
 * to @mask before delegating to crypto_grab_spawn().
 */
static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn,
                                     struct crypto_instance *inst,
                                     const char *name, u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;

        return crypto_grab_spawn(&spawn->base, inst, name,
                                 cipher_type, cipher_mask);
}

/* Release a cipher spawn by dropping its underlying crypto_spawn. */
static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

/* Return the algorithm recorded in the cipher spawn's base spawn. */
static inline struct crypto_alg *crypto_spawn_cipher_alg(
        struct crypto_cipher_spawn *spawn)
{
        struct crypto_alg *alg = spawn->base.alg;

        return alg;
}

/*
 * Create a transform from the cipher spawn, requesting type
 * CRYPTO_ALG_TYPE_CIPHER under mask CRYPTO_ALG_TYPE_MASK, and return it
 * cast to struct crypto_cipher. Whatever crypto_spawn_tfm() returns is
 * passed through the cast unchanged.
 */
static inline struct crypto_cipher *crypto_spawn_cipher(
        struct crypto_cipher_spawn *spawn)
{
        struct crypto_tfm *tfm;

        tfm = crypto_spawn_tfm(&spawn->base, CRYPTO_ALG_TYPE_CIPHER,
                               CRYPTO_ALG_TYPE_MASK);

        return __crypto_cipher_cast(tfm);
}

static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
{
        return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
}

/*
 * Return the request the queue's backlog pointer refers to, or NULL when
 * the backlog pointer is the queue's own list head (nothing backlogged).
 */
static inline struct crypto_async_request *crypto_get_backlog(
        struct crypto_queue *queue)
{
        struct list_head *entry = queue->backlog;

        if (entry == &queue->list)
                return NULL;

        return container_of(entry, struct crypto_async_request, list);
}

/*
 * Return the subset of the @off bits that are covered by algt->mask and are
 * clear in algt->type.
 */
static inline u32 crypto_requires_off(struct crypto_attr_type *algt, u32 off)
{
        u32 differing = algt->type ^ off;
        u32 masked = differing & algt->mask;

        return masked & off;
}

/*
 * When an algorithm uses another algorithm (e.g., if it's an instance of a
 * template), these are the flags that should always be set on the "outer"
 * algorithm if any "inner" algorithm has them set.
 */
#define CRYPTO_ALG_INHERITED_FLAGS        \
        (CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK |        \
         CRYPTO_ALG_ALLOCATES_MEMORY)

/*
 * Compute the mask to hand to crypto_grab_*() (together with type=0) for a
 * template instance under construction, so that any request by the user to
 * have some of the CRYPTO_ALG_INHERITED_FLAGS clear is honored. @algt carries
 * the type and mask describing those flag restrictions.
 */
static inline u32 crypto_algt_inherited_mask(struct crypto_attr_type *algt)
{
        u32 inherited = crypto_requires_off(algt, CRYPTO_ALG_INHERITED_FLAGS);

        return inherited;
}

noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);

/**
 * crypto_memneq - Compare two areas of memory without leaking
 *                   timing information.
 *
 * @a: One area of memory
 * @b: Another area of memory
 * @size: The size of the area.
 *
 * Thin wrapper around __crypto_memneq(), which does the actual comparison.
 *
 * Returns 0 when data is equal, 1 otherwise.
 */
static inline int crypto_memneq(const void *a, const void *b, size_t size)
{
        /* Collapse the unsigned long result of __crypto_memneq() to 0 or 1. */
        return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
}

int crypto_register_notifier(struct notifier_block *nb);
int crypto_unregister_notifier(struct notifier_block *nb);

/*
 * Crypto notification events.
 * NOTE(review): presumably the event codes delivered to notifier blocks
 * registered with crypto_register_notifier() -- confirm against the callers.
 */
enum {
        CRYPTO_MSG_ALG_REQUEST,
        CRYPTO_MSG_ALG_REGISTER,
        CRYPTO_MSG_ALG_LOADED,
};

#endif        /* _CRYPTO_ALGAPI_H */
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_X86_XSAVE_H
#define __ASM_X86_XSAVE_H

#include <linux/uaccess.h>
#include <linux/types.h>

#include <asm/processor.h>
#include <asm/user.h>

/* Bit 63 of XCR0 is reserved for future expansion */
#define XFEATURE_MASK_EXTEND        (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))

#define XSTATE_CPUID                0x0000000d

#define FXSAVE_SIZE        512

#define XSAVE_HDR_SIZE            64
#define XSAVE_HDR_OFFSET    FXSAVE_SIZE

#define XSAVE_YMM_SIZE            256
#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

#define XSAVE_ALIGNMENT     64

/* All currently supported user features */
#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
                                      XFEATURE_MASK_SSE | \
                                      XFEATURE_MASK_YMM | \
                                      XFEATURE_MASK_OPMASK | \
                                      XFEATURE_MASK_ZMM_Hi256 | \
                                      XFEATURE_MASK_Hi16_ZMM         | \
                                      XFEATURE_MASK_PKRU | \
                                      XFEATURE_MASK_BNDREGS | \
                                      XFEATURE_MASK_BNDCSR)

/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)

/*
 * A supervisor state component may not always contain valuable information,
 * and its size may be huge. Saving/restoring such supervisor state components
 * at each context switch can cause high CPU and space overhead, which should
 * be avoided. Such supervisor state components should only be saved/restored
 * on demand. The on-demand dynamic supervisor features are set in this mask.
 *
 * Unlike the existing supported supervisor features, a dynamic supervisor
 * feature does not allocate a buffer in task->fpu, and the corresponding
 * supervisor state component cannot be saved/restored at each context switch.
 *
 * To support a dynamic supervisor feature, a developer should follow the
 * dos and don'ts as below:
 * - Do dynamically allocate a buffer for the supervisor state component.
 * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
 *   state component to/from the buffer.
 * - Don't set the bit corresponding to the dynamic supervisor feature in
 *   IA32_XSS at run time, since it has been set at boot time.
 */
#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)

/*
 * Unsupported supervisor features. When a supervisor feature in this mask is
 * supported in the future, move it to the supported supervisor feature mask.
 */
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)

/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
                                      XFEATURE_MASK_DYNAMIC | \
                                      XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)

#ifdef CONFIG_X86_64
#define REX_PREFIX        "0x48, "
#else
#define REX_PREFIX
#endif

extern u64 xfeatures_mask_all;

/* Bits of xfeatures_mask_all that are supported supervisor features. */
static inline u64 xfeatures_mask_supervisor(void)
{
        u64 enabled = xfeatures_mask_all;

        enabled &= XFEATURE_MASK_SUPERVISOR_SUPPORTED;
        return enabled;
}

/* Bits of xfeatures_mask_all that are supported user features. */
static inline u64 xfeatures_mask_user(void)
{
        u64 enabled = xfeatures_mask_all;

        enabled &= XFEATURE_MASK_USER_SUPPORTED;
        return enabled;
}

/*
 * Return the dynamic supervisor feature mask, with the LBR bit removed when
 * the boot CPU does not report X86_FEATURE_ARCH_LBR.
 */
static inline u64 xfeatures_mask_dynamic(void)
{
        u64 mask = XFEATURE_MASK_DYNAMIC;

        if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
                mask &= ~XFEATURE_MASK_LBR;

        return mask;
}

extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];

extern void __init update_regset_xstate_info(unsigned int size,
                                             u64 xstate_mask);

void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
const void *get_xsave_field_ptr(int xfeature_nr);
int using_compacted_format(void);
int xfeature_size(int xfeature_nr);
struct membuf;
void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
void copy_supervisor_to_kernel(struct xregs_state *xsave);
void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);


/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
int validate_user_xstate_header(const struct xstate_header *hdr);

#endif





















    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DMA_MAPPING_H
#define _ASM_X86_DMA_MAPPING_H

/*
 * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and
 * Documentation/core-api/dma-api.rst for documentation.
 */

#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/swiotlb.h>

extern int iommu_merge;
extern int panic_on_overflow;

extern const struct dma_map_ops *dma_ops;

static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
        return dma_ops;
}

#endif
























































    4 




    3 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

#include <linux/mmdebug.h>

/*
 * Designated-initializer fragment for the .mmap_lock member of the object
 * called 'name'. The expansion ends in a comma, so it is meant to be placed
 * inside a larger initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

/* Initialise @mm's mmap_lock rwsem with init_rwsem(). */
static inline void mmap_init_lock(struct mm_struct *mm)
{
        init_rwsem(&mm->mmap_lock);
}

/* Acquire @mm's mmap_lock for writing via down_write(). */
static inline void mmap_write_lock(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        down_write(sem);
}

/*
 * Acquire @mm's mmap_lock for writing via down_write_nested(), passing
 * @subclass through unchanged.
 */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        down_write_nested(sem, subclass);
}

/*
 * Acquire @mm's mmap_lock for writing via down_write_killable() and return
 * that call's result unchanged.
 */
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int ret = down_write_killable(&mm->mmap_lock);

        return ret;
}

static inline bool mmap_write_trylock(struct mm_struct *mm)
{
        return down_write_trylock(&mm->mmap_lock) != 0;
}

/* Release @mm's write-held mmap_lock via up_write(). */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        up_write(sem);
}

/* Convert @mm's write-held mmap_lock with downgrade_write(). */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        downgrade_write(sem);
}

/* Acquire @mm's mmap_lock for reading via down_read(). */
static inline void mmap_read_lock(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        down_read(sem);
}

/*
 * Acquire @mm's mmap_lock for reading via down_read_killable() and return
 * that call's result unchanged.
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int ret = down_read_killable(&mm->mmap_lock);

        return ret;
}

static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        return down_read_trylock(&mm->mmap_lock) != 0;
}

/* Release @mm's read-held mmap_lock via up_read(). */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        up_read(sem);
}

/*
 * Try to take @mm's mmap_lock for reading; on success immediately call
 * rwsem_release() on the lock's dep_map and return true. Returns false if
 * the trylock fails.
 * NOTE(review): the release presumably lets the lock be dropped later with
 * mmap_read_unlock_non_owner() from a different context -- confirm.
 */
static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
{
        if (!down_read_trylock(&mm->mmap_lock))
                return false;

        rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
        return true;
}

/* Release @mm's read-held mmap_lock via up_read_non_owner(). */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        up_read_non_owner(sem);
}

/*
 * Debug assertion that @mm's mmap_lock is held: a lockdep held-check plus a
 * VM_BUG_ON_MM() on rwsem_is_locked(). Both are macros defined elsewhere, so
 * what they compile to depends on the kernel's debug configuration.
 */
static inline void mmap_assert_locked(struct mm_struct *mm)
{
        lockdep_assert_held(&mm->mmap_lock);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}

/*
 * Debug assertion that @mm's mmap_lock is held for writing. Only the lockdep
 * check is write-specific; the VM_BUG_ON_MM() uses the same
 * rwsem_is_locked() test as mmap_assert_locked().
 */
static inline void mmap_assert_write_locked(struct mm_struct *mm)
{
        lockdep_assert_held_write(&mm->mmap_lock);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}

/* Return rwsem_is_contended()'s verdict for @mm's mmap_lock. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        struct rw_semaphore *sem = &mm->mmap_lock;

        return rwsem_is_contended(sem);
}

#endif /* _LINUX_MMAP_LOCK_H */













































































































































    1 




    1 












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2009-2021 Christoph Hellwig
 *
 * NOTE: none of these tracepoints shall be considered a stable kernel ABI
 * as they can change at any time.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM iomap

#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _IOMAP_TRACE_H

#include <linux/tracepoint.h>

struct inode;

/*
 * Event class for read-side events: records the inode's superblock device,
 * the inode number and the caller-supplied page count.
 */
DECLARE_EVENT_CLASS(iomap_readpage_class,
        TP_PROTO(struct inode *inode, int nr_pages),
        TP_ARGS(inode, nr_pages),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(int, nr_pages)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->nr_pages = nr_pages;
        ),
        TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->nr_pages)
)

/* Define one event of class iomap_readpage_class; instantiated twice below. */
#define DEFINE_READPAGE_EVENT(name)                \
DEFINE_EVENT(iomap_readpage_class, name,        \
        TP_PROTO(struct inode *inode, int nr_pages), \
        TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readahead);

/*
 * Event class for range-based events: records device, inode number, the
 * inode size read with i_size_read() at trace time, and the caller-supplied
 * offset and length. Size, offset and length are printed in hex.
 */
DECLARE_EVENT_CLASS(iomap_range_class,
        TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),
        TP_ARGS(inode, off, len),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(loff_t, size)
                __field(unsigned long, offset)
                __field(unsigned int, length)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->size = i_size_read(inode);
                __entry->offset = off;
                __entry->length = len;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx "
                  "length %x",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
                  __entry->length)
)

/* Define one event of class iomap_range_class; instantiated four times below. */
#define DEFINE_RANGE_EVENT(name)                \
DEFINE_EVENT(iomap_range_class, name,        \
        TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\
        TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_releasepage);
DEFINE_RANGE_EVENT(iomap_invalidatepage);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);

/* Value/name pairs used with __print_symbolic() to decode iomap->type. */
#define IOMAP_TYPE_STRINGS \
        { IOMAP_HOLE,                "HOLE" }, \
        { IOMAP_DELALLOC,        "DELALLOC" }, \
        { IOMAP_MAPPED,                "MAPPED" }, \
        { IOMAP_UNWRITTEN,        "UNWRITTEN" }, \
        { IOMAP_INLINE,                "INLINE" }

/*
 * Bit/name pairs used with __print_flags() to decode the operation flags
 * recorded by the iomap_apply and iomap_iter events.
 */
#define IOMAP_FLAGS_STRINGS \
        { IOMAP_WRITE,                "WRITE" }, \
        { IOMAP_ZERO,                "ZERO" }, \
        { IOMAP_REPORT,                "REPORT" }, \
        { IOMAP_FAULT,                "FAULT" }, \
        { IOMAP_DIRECT,                "DIRECT" }, \
        { IOMAP_NOWAIT,                "NOWAIT" }

/*
 * Bit/name pairs used with __print_flags() to decode iomap->flags in the
 * iomap_class events.
 */
#define IOMAP_F_FLAGS_STRINGS \
        { IOMAP_F_NEW,                "NEW" }, \
        { IOMAP_F_DIRTY,        "DIRTY" }, \
        { IOMAP_F_SHARED,        "SHARED" }, \
        { IOMAP_F_MERGED,        "MERGED" }, \
        { IOMAP_F_BUFFER_HEAD,        "BH" }, \
        { IOMAP_F_SIZE_CHANGED,        "SIZE_CHANGED" }

/*
 * Event class that snapshots a struct iomap: addr, offset, length, type and
 * flags, plus the device of the inode's superblock and of the mapping's
 * block device (recorded as 0 when iomap->bdev is NULL). The type is printed
 * symbolically via IOMAP_TYPE_STRINGS and the flags via
 * IOMAP_F_FLAGS_STRINGS.
 */
DECLARE_EVENT_CLASS(iomap_class,
        TP_PROTO(struct inode *inode, struct iomap *iomap),
        TP_ARGS(inode, iomap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(u64, addr)
                __field(loff_t, offset)
                __field(u64, length)
                __field(u16, type)
                __field(u16, flags)
                __field(dev_t, bdev)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->addr = iomap->addr;
                __entry->offset = iomap->offset;
                __entry->length = iomap->length;
                __entry->type = iomap->type;
                __entry->flags = iomap->flags;
                __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld "
                  "length %llu type %s flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  MAJOR(__entry->bdev), MINOR(__entry->bdev),
                  __entry->addr,
                  __entry->offset,
                  __entry->length,
                  __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
                  __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
)

/* Define one event of class iomap_class; instantiated four times below. */
#define DEFINE_IOMAP_EVENT(name)                \
DEFINE_EVENT(iomap_class, name,        \
        TP_PROTO(struct inode *inode, struct iomap *iomap), \
        TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_apply_dstmap);
DEFINE_IOMAP_EVENT(iomap_apply_srcmap);
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);

/*
 * Standalone event recording the arguments it is given: position, length,
 * flags, the ops and actor pointers and the caller address. Flags are
 * printed both decoded (IOMAP_FLAGS_STRINGS) and as raw hex; ops, caller and
 * actor are printed with the %ps/%pS symbol formats.
 */
TRACE_EVENT(iomap_apply,
        TP_PROTO(struct inode *inode, loff_t pos, loff_t length,
                unsigned int flags, const void *ops, void *actor,
                unsigned long caller),
        TP_ARGS(inode, pos, length, flags, ops, actor, caller),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(loff_t, pos)
                __field(loff_t, length)
                __field(unsigned int, flags)
                __field(const void *, ops)
                __field(void *, actor)
                __field(unsigned long, caller)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pos = pos;
                __entry->length = length;
                __entry->flags = flags;
                __entry->ops = ops;
                __entry->actor = actor;
                __entry->caller = caller;
        ),
        TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) "
                  "ops %ps caller %pS actor %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                   __entry->ino,
                   __entry->pos,
                   __entry->length,
                   __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
                   __entry->flags,
                   __entry->ops,
                   (void *)__entry->caller,
                   __entry->actor)
);

/*
 * Standalone event recording the state of a struct iomap_iter: the inode's
 * device and number, iter->pos, the value of iomap_length(iter), iter->flags,
 * plus the ops pointer and caller address passed in. Flags are printed both
 * decoded (IOMAP_FLAGS_STRINGS) and as raw hex.
 */
TRACE_EVENT(iomap_iter,
        TP_PROTO(struct iomap_iter *iter, const void *ops,
                 unsigned long caller),
        TP_ARGS(iter, ops, caller),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(loff_t, pos)
                __field(loff_t, length)
                __field(unsigned int, flags)
                __field(const void *, ops)
                __field(unsigned long, caller)
        ),
        TP_fast_assign(
                __entry->dev = iter->inode->i_sb->s_dev;
                __entry->ino = iter->inode->i_ino;
                __entry->pos = iter->pos;
                __entry->length = iomap_length(iter);
                __entry->flags = iter->flags;
                __entry->ops = ops;
                __entry->caller = caller;
        ),
        TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) ops %ps caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                   __entry->ino,
                   __entry->pos,
                   __entry->length,
                   __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
                   __entry->flags,
                   __entry->ops,
                   (void *)__entry->caller)
);

#endif /* _IOMAP_TRACE_H */

/*
 * Point the trace infrastructure at this header: path "." and file name
 * "trace".  NOTE(review): presumably <trace/define_trace.h> uses these two
 * macros to re-include this file and expand the TRACE_EVENT() definitions
 * above -- confirm against define_trace.h.  These directives sit after the
 * _IOMAP_TRACE_H include guard's #endif, so they are processed on every
 * inclusion of this header.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>

perf_event_open(&(0x7f0000000480)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4304, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext, 0x0, 0x4}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
write(0xffffffffffffffff, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x105142, 0x0)
symlinkat(&(0x7f0000000240)='./file1\x00', r0, &(0x7f00000002c0)='./file1\x00')
lseek(0xffffffffffffffff, 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000280)='/proc/self/exe\x00', 0x145140, 0x0)
r2 = socket$nl_audit(0x10, 0x3, 0x9)
r3 = openat$dir(0xffffffffffffff9c, &(0x7f0000000180)='./file1\x00', 0x202102, 0x0)
mknodat(r3, &(0x7f0000000200)='./file1\x00', 0x10, 0x1)
r4 = socket$nl_audit(0x10, 0x3, 0x9)
open_by_handle_at(r1, &(0x7f0000000140)=@FILEID_NILFS_WITHOUT_PARENT={0x20, 0x61, {0x0, 0x3f, 0x3, 0x4, 0xfffffffffffffffa}}, 0x400)
sendmsg$AUDIT_USER_AVC(r4, &(0x7f0000003280)={0x0, 0x0, &(0x7f0000003240)={&(0x7f00000001c0)=ANY=[@ANYBLOB="10f920021071000000000000e1"], 0x10}}, 0x0)
inotify_init1(0x0)
r5 = openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
signalfd(r0, &(0x7f0000000300)={[0x8]}, 0x8)
timerfd_create(0x7, 0x0)
io_uring_register$IORING_REGISTER_FILES_UPDATE(r5, 0x6, &(0x7f00000000c0)={0x951, 0x0, 0x0}, 0x0)
syz_open_procfs(0xffffffffffffffff, &(0x7f0000000040)='net/rt6_stats\x00')
io_uring_register$IORING_REGISTER_FILES_UPDATE(r2, 0x6, &(0x7f0000000080)={0xc13, 0x0, &(0x7f0000000000)=[r0]}, 0x1)
sendfile(r0, r1, 0x0, 0x100000001)

r0 = openat$sr(0xffffffffffffff9c, &(0x7f00000035c0), 0xe8b02, 0x0)
mmap(&(0x7f0000000000/0xc00000)=nil, 0xc00000, 0x0, 0x1f012, r0, 0x0)
signalfd(0xffffffffffffffff, &(0x7f0000000200), 0x8)

seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x1, &(0x7f00000001c0)=[{0x6, 0x0, 0x0, 0x7ffc0000}]})
signalfd(0xffffffffffffffff, &(0x7f0000000180), 0x8)

mknod$loop(&(0x7f0000000300)='./file0\x00', 0x6000, 0x0)
setxattr$trusted_overlay_opaque(&(0x7f0000000400)='./file0\x00', &(0x7f0000000440), &(0x7f0000000480), 0x2, 0x1)
r0 = creat(&(0x7f00000000c0)='./file0\x00', 0x0)
perf_event_open(&(0x7f0000001d80)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5e, 0x0, 0xc, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffd, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
ioctl$BLKTRACESETUP(r0, 0xc0481273, &(0x7f0000000000)={'\x00', 0x0, 0x100, 0xb91})
r1 = openat$sysfs(0xffffffffffffff9c, &(0x7f00000003c0)='/sys/module/8250', 0x460800, 0x8)
ioctl$BLKTRACETEARDOWN(r1, 0x1276, 0x0)
signalfd(0xffffffffffffffff, &(0x7f0000000140), 0x8)
ioctl$BLKTRACETEARDOWN(r0, 0x1276, 0x0)
fcntl$getown(0xffffffffffffffff, 0x9)
r2 = openat$nvram(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$FS_IOC_FSSETXATTR(0xffffffffffffffff, 0x401c5820, &(0x7f0000000180)={0xb0})
openat(r2, 0x0, 0x18000, 0x80)
accept$packet(0xffffffffffffffff, &(0x7f0000000180)={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @dev}, &(0x7f00000001c0)=0x14)
ioctl$BLKBSZGET(0xffffffffffffffff, 0x80081270, &(0x7f0000000380))
ioctl$KDGKBTYPE(r2, 0x4b33, &(0x7f0000000080))

r0 = perf_event_open(&(0x7f0000001d80)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4307, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r1 = syz_io_uring_setup(0x133, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x4000}, &(0x7f00000a0000)=nil, &(0x7f0000ffc000/0x3000)=nil, &(0x7f0000000200)=<r2=>0x0, &(0x7f0000000140)=<r3=>0x0)
ioctl$AUTOFS_DEV_IOCTL_REQUESTER(0xffffffffffffffff, 0xc018937b, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x40)
copy_file_range(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0)
ioctl$FS_IOC_GETFSMAP(0xffffffffffffffff, 0xc0c0583b, 0x0)
socketpair$unix(0x1, 0x2, 0x0, 0x0)
syz_io_uring_submit(r2, r3, &(0x7f0000000000)=@IORING_OP_WRITE={0x17, 0x0, 0x0, @fd_index=0x2, 0x2, 0x0}, 0x0)
r4 = mmap$IORING_OFF_SQ_RING(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x100000e, 0x13, r1, 0x0)
syz_io_uring_submit(r4, r3, &(0x7f0000000100)=@IORING_OP_ACCEPT={0xd, 0x0, 0x0, 0xffffffffffffffff, 0x0}, 0x8001)
syz_io_uring_submit(r4, r3, &(0x7f0000000240)=@IORING_OP_FALLOCATE={0x11, 0x2, 0x0, @fd_index=0x3, 0x4, 0x0, 0xe7ec, 0x0, 0x1}, 0x80d5)
signalfd(r0, &(0x7f0000000280)={[0x4]}, 0x8)
io_uring_enter(r1, 0x2c36, 0x0, 0x0, 0x0, 0x0)

r0 = perf_event_open(&(0x7f0000001d80)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5e, 0x10240, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, @perf_config_ext={0x0, 0x400}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000000)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=@newpolicy={0xb8, 0x13, 0x1, 0x0, 0x0, {{@in=@broadcast, @in=@multicast1, 0x4e22, 0x0, 0x2000, 0x0, 0xa, 0x40, 0x3c, 0x3b}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x2}}}, 0xb8}}, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x42, 0x0)
fcntl$lock(r2, 0x6, &(0x7f0000000180))
sendmsg$NL80211_CMD_GET_REG(0xffffffffffffffff, &(0x7f0000000580)={&(0x7f0000000140)={0x10, 0x0, 0x0, 0x4}, 0xc, &(0x7f0000000440)={&(0x7f0000000380)={0x80, 0x0, 0x406, 0x70bd26, 0x25dfdbfe, {}, [@NL80211_ATTR_WIPHY={0x8, 0x1, 0x73}, @NL80211_ATTR_SOCKET_OWNER={0x4}, @NL80211_ATTR_REG_RULES={0x60, 0x22, 0x0, 0x1, [{0x14, 0x0, 0x0, 0x1, [@NL80211_ATTR_FREQ_RANGE_MAX_BW={0x8, 0x4, 0x30a}, @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN={0x8, 0x5, 0x3}]}, {0xc, 0x0, 0x0, 0x1, [@NL80211_ATTR_FREQ_RANGE_START={0x8, 0x2, 0x101}]}, {0x3c, 0x0, 0x0, 0x1, [@NL80211_ATTR_REG_RULE_FLAGS={0x8, 0x1, 0xfff}, @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN={0x8, 0x5, 0xffff296c}, @NL80211_ATTR_REG_RULE_FLAGS={0x8, 0x1, 0x1}, @NL80211_ATTR_DFS_CAC_TIME={0x8, 0x7, 0x3f}, @NL80211_ATTR_FREQ_RANGE_END={0x8, 0x3, 0x7}, @NL80211_ATTR_FREQ_RANGE_START={0x8, 0x2, 0x3}, @NL80211_ATTR_FREQ_RANGE_MAX_BW={0x8, 0x4, 0x8001}]}]}]}, 0x80}, 0x1, 0x0, 0x0, 0x4004004}, 0x20000080)
ioctl$AUTOFS_DEV_IOCTL_PROTOSUBVER(0xffffffffffffffff, 0xc0189373, &(0x7f0000000240)=ANY=[@ANYBLOB="080000007972e40348de0100000001000000cc7141f8ffffffff", @ANYRES32=<r3=>r0, @ANYBLOB="02000000000000002e2f66696c653100"])
perf_event_open(0xfffffffffffffffc, 0x0, 0xffffffffffffffff, r3, 0x8)
bind$inet6(0xffffffffffffffff, &(0x7f0000006e40)={0xa, 0x0, 0x0, @mcast1, 0x5}, 0x1c)
signalfd(0xffffffffffffffff, &(0x7f00000001c0)={[0x9]}, 0x8)
sendmsg$NL80211_CMD_SET_MPATH(0xffffffffffffffff, &(0x7f0000000800)={&(0x7f0000000200)={0x10, 0x0, 0x0, 0x2000000}, 0xc, &(0x7f00000007c0)={&(0x7f0000000a00)=ANY=[@ANYBLOB="80000000", @ANYRES16=0x0, @ANYBLOB="200028bd7000fcdbdf25160000000a000600ffffffff9f7dd263f6996879ffffffffffff00000a000600ffffffffffff00000a000600ffffffffffff00000a000600ffffffffffff00000a001a0008021100000000000a000600fc888af5df5fcb030a001a00afffffffffff00000a001a00080211cd69303800000000001a398a29df5072808839623fc731ac89551599358e1167976556f7ca1dcd49854934335e0b5b14c23905735a06074ed05f00729c093d152781359ff3f183fb43096cf8cc8babe75816d64c1ac5367dfc7788f79f5130e8c68e3c824b352447c940ef7272746e897558c3b9dc50e8b9b38194831c6d5126739852bc9dcdc8fb635cbf94dbe26523dc2c8417b40a5b66049196e2f8933076fc780c9f3b23bf7906ab55838c4e140d5bfe468fb195ffa185e888d579c1a06a57487b4791093501a4cc112a82f0898eccf1e08c34fb246ae10fa9ae6995fe3c053a21a33f1ffe5eaa43f8be76389e2a862862a208a0ba0ba2c017da326dd1de49a857a2e95529033aea9645d45148212d9f71795106a98f6d8c69c228a7945df2cf9947fee0"], 0x80}, 0x1, 0x0, 0x0, 0x80}, 0x20000080)
ioctl$FS_IOC_GET_ENCRYPTION_POLICY(0xffffffffffffffff, 0x400c6615, &(0x7f0000000180)={0x0, @aes256, 0x0, @desc4})
syz_init_net_socket$nl_generic(0x10, 0x3, 0x10)
sendmsg$NL80211_CMD_GET_REG(0xffffffffffffffff, &(0x7f0000000600)={0x0, 0x0, &(0x7f0000000480)={&(0x7f00000004c0)=ANY=[@ANYBLOB="2c365d00bac006a872e83e0110d50df05898e40cd1130e1859be045bd16e529630c979c8e3983cdbb6f473c5efb7257b290d6669b550228fe7dab5f23d3b7c43d56e7ac241afa7c96112cceba482127b61a2a262a41fdecee3ccf1aa0d8b0bf9264cebe5cbe0db1367d4b2e62e1bf3a667f85e55d7c712708e22e60bee99cc", @ANYRES16=0x0, @ANYBLOB="010000000000000000001f00000008000100000000000400cc00060021006200000004002280"], 0x2c}}, 0x0)

r0 = perf_event_open(&(0x7f0000001d80)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5e, 0x10240, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, @perf_config_ext={0x0, 0x400}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000000)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=@newpolicy={0xb8, 0x13, 0x1, 0x0, 0x0, {{@in=@broadcast, @in=@multicast1, 0x4e22, 0x0, 0x2000, 0x0, 0xa, 0x40, 0x3c, 0x3b}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x2}}}, 0xb8}}, 0x0)
setsockopt$inet6_mreq(0xffffffffffffffff, 0x29, 0x1b, &(0x7f0000000400)={@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01'}, 0x14)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x42, 0x0)
fcntl$lock(r2, 0x6, &(0x7f0000000180))
r3 = syz_genetlink_get_family_id$nl80211(&(0x7f0000000340), r2)
sendmsg$NL80211_CMD_GET_REG(0xffffffffffffffff, &(0x7f0000000580)={&(0x7f0000000140)={0x10, 0x0, 0x0, 0x4}, 0xc, &(0x7f0000000440)={&(0x7f0000000380)={0x80, r3, 0x406, 0x70bd26, 0x25dfdbfe, {}, [@NL80211_ATTR_WIPHY={0x8, 0x1, 0x73}, @NL80211_ATTR_SOCKET_OWNER={0x4}, @NL80211_ATTR_REG_RULES={0x60, 0x22, 0x0, 0x1, [{0x14, 0x0, 0x0, 0x1, [@NL80211_ATTR_FREQ_RANGE_MAX_BW={0x8, 0x4, 0x30a}, @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN={0x8, 0x5, 0x3}]}, {0xc, 0x0, 0x0, 0x1, [@NL80211_ATTR_FREQ_RANGE_START={0x8, 0x2, 0x101}]}, {0x3c, 0x0, 0x0, 0x1, [@NL80211_ATTR_REG_RULE_FLAGS={0x8, 0x1, 0xfff}, @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN={0x8, 0x5, 0xffff296c}, @NL80211_ATTR_REG_RULE_FLAGS={0x8, 0x1, 0x1}, @NL80211_ATTR_DFS_CAC_TIME={0x8, 0x7, 0x3f}, @NL80211_ATTR_FREQ_RANGE_END={0x8, 0x3, 0x7}, @NL80211_ATTR_FREQ_RANGE_START={0x8, 0x2, 0x3}, @NL80211_ATTR_FREQ_RANGE_MAX_BW={0x8, 0x4, 0x8001}]}]}]}, 0x80}, 0x1, 0x0, 0x0, 0x4004004}, 0x20000080)
ioctl$TIOCGPGRP(r2, 0x540f, &(0x7f0000000040)=<r4=>0x0)
ioctl$AUTOFS_DEV_IOCTL_PROTOSUBVER(0xffffffffffffffff, 0xc0189373, &(0x7f0000000240)=ANY=[@ANYBLOB="080000007972e40348de0100000001000000cc7141f8ffffffff", @ANYRES32=<r5=>r0, @ANYBLOB="02000000000000002e2f66696c653100"])
perf_event_open(0xfffffffffffffffc, r4, 0xffffffffffffffff, r5, 0x8)
perf_event_open(&(0x7f0000001d80)={0x6, 0x80, 0x7, 0x0, 0xb, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={&(0x7f00000009c0)}, 0x8101, 0x3, 0x0, 0x0, 0x4, 0x0, 0x9, 0x0, 0x1, 0x0, 0x8000000000000}, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x8)
ioctl$sock_SIOCGIFINDEX_802154(0xffffffffffffffff, 0x8933, &(0x7f0000000680)={'wpan0\x00'})
r6 = socket$inet6_udp(0xa, 0x2, 0x0)
bind$inet6(r6, &(0x7f0000006e40)={0xa, 0x0, 0x0, @mcast1, 0x5}, 0x1c)
signalfd(r6, &(0x7f00000001c0)={[0x9]}, 0x8)
sendmsg$NL80211_CMD_SET_MPATH(0xffffffffffffffff, &(0x7f0000000800)={&(0x7f0000000200)={0x10, 0x0, 0x0, 0x2000000}, 0xc, &(0x7f00000007c0)={&(0x7f0000000a00)=ANY=[@ANYBLOB="80000000", @ANYRES16=0x0, @ANYBLOB="200028bd7000fcdbdf25160000000a000600ffffffff9f7dd263f6996879ffffffffffff00000a000600ffffffffffff00000a000600ffffffffffff00000a000600ffffffffffff00000a001a0008021100000000000a000600fc888af5df5fcb030a001a00afffffffffff00000a001a00080211cd69303800000000001a398a29df5072808839623fc731ac89551599358e1167976556f7ca1dcd49854934335e0b5b14c23905735a06074ed05f00729c093d152781359ff3f183fb43096cf8cc8babe75816d64c1ac5367dfc7788f79f5130e8c68e3c824b352447c940ef7272746e897558c3b9dc50e8b9b38194831c6d5126739852bc9dcdc8fb635cbf94dbe26523dc2c8417b40a5b66049196e2f8933076fc780c9f3b23bf7906ab55838c4e140d5bfe468fb195ffa185e888d579c1a06a57487b4791093501a4cc112a82f0898eccf1e08c34fb246ae10fa9ae6995fe3c053a21a33f1ffe5eaa43f8be76389e2a862862a208a0ba0ba2c017da326dd1de49a857a2e95529033aea9645d45148212d9f71795106a98f6d8c69c228a7945df2cf9947fee0"], 0x80}, 0x1, 0x0, 0x0, 0x80}, 0x20000080)
ioctl$FS_IOC_GET_ENCRYPTION_POLICY(0xffffffffffffffff, 0x400c6615, &(0x7f0000000180)={0x0, @aes256, 0x0, @desc4})
syz_init_net_socket$nl_generic(0x10, 0x3, 0x10)

perf_event_open(&(0x7f0000001d80)={0x2, 0x80, 0x96, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
signalfd(0xffffffffffffffff, &(0x7f0000000200), 0x8)

r0 = perf_event_open(&(0x7f0000000480)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4307, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r1 = openat$procfs(0xffffffffffffff9c, &(0x7f0000000000)='/proc/keys\x00', 0x0, 0x0)
r2 = fsmount(r1, 0x0, 0x8)
preadv(r1, &(0x7f0000001580)=[{&(0x7f0000000500)=""/4102, 0x1006}], 0x10000000000001c7, 0x3, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x10f6e2, 0x0)
r3 = signalfd(r0, &(0x7f0000000100)={[0xd9]}, 0x8)
ioctl$BTRFS_IOC_QUOTA_RESCAN_STATUS(r3, 0x8040942d, &(0x7f0000000140))
mmap$usbmon(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r2, 0x6)
fsetxattr$trusted_overlay_opaque(0xffffffffffffffff, &(0x7f00000000c0), &(0x7f0000000100), 0x2, 0x0)
openat(r1, &(0x7f0000000400)='./file0\x00', 0x111802, 0x111)
r4 = syz_genetlink_get_family_id$nl80211(&(0x7f0000000080), 0xffffffffffffffff)
sendmsg$NL80211_CMD_UPDATE_OWE_INFO(0xffffffffffffffff, &(0x7f0000000280)={&(0x7f0000000180)={0x10, 0x0, 0x0, 0x40000000}, 0xc, &(0x7f0000000240)={&(0x7f0000002680)=ANY=[@ANYBLOB="959c0e07bb4e3a37d7d0363a97de5415984b684ac62fdad9d7e7ab30e6bbda197f17fa11e29272492d8d6a33b51a5b13533e000780b14571e36c0ea15be8dcb41e684eab38ea8aa73617475f314442868e8cd7130b78f54963990a9b1bcee9e264a8e14d2a46f5e4aa1d98fb28e670efaba70402a9746cf6a453ddea8eff49535e972b980292f8bc4212ba9bbdef93e290ddbd5b6f510c877af9560c9e615ee3ff51d07e1f6b03ac87ccf260a39e8caddb79edb78bb6c14d54c9cc0201", @ANYRES16=r4, @ANYBLOB, @ANYRES32=0x0, @ANYBLOB="0c0099003fffffff11000000060048005d0000000600480002000000"], 0x38}, 0x1, 0x0, 0x0, 0x80}, 0x20000008)
socket$inet(0x2, 0x2, 0x0)
preadv(0xffffffffffffffff, &(0x7f00000002c0)=[{&(0x7f0000000200)=""/33, 0x21}, {&(0x7f00000015c0)=""/178, 0xb2}], 0x2, 0xff, 0x1f)

r0 = fsopen(&(0x7f0000000040)='binfmt_misc\x00', 0x0)
fsconfig$FSCONFIG_CMD_CREATE(r0, 0x6, 0x0, 0x0, 0x0)
r1 = fsmount(r0, 0x0, 0x0)
signalfd(r1, &(0x7f0000001300), 0x8)

close_range(0xffffffffffffffff, 0xffffffffffffffff, 0x2)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
signalfd(r0, &(0x7f0000000080), 0x8)

perf_event_open(&(0x7f0000001d80)={0x2, 0x80, 0x97, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
signalfd(0xffffffffffffffff, &(0x7f00000000c0), 0x8)

setrlimit(0x7, &(0x7f0000000000))
signalfd(0xffffffffffffffff, &(0x7f0000000200), 0x8)

perf_event_open(&(0x7f0000000480)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4304, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
mmap(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x8, 0x10, 0xffffffffffffffff, 0x1fe13000)
signalfd(0xffffffffffffffff, 0x0, 0x0)
r0 = syz_mount_image$iso9660(&(0x7f0000000000), &(0x7f0000000100)='./file0\x00', 0x0, 0x4, &(0x7f0000000200)=[{&(0x7f0000010000)="01434430303101004c494e55582020202020202020202020202020202020202020202020202020204344524f4d20202020202020202020202020202020202020202020202020202000000000000000002301000000000123000000000000000000000000000000000000000000000000000000000000000001000001010000010008080018000000000000180d010000000000000000010f0000000022001501", 0xa0, 0x8000}, {0x0}, {0x0}, {&(0x7f0000014000)="8800150100000000011500080000000008007809140b2a3a0802", 0x1a, 0x8a800}], 0x0, &(0x7f0000015200))
fsconfig$FSCONFIG_SET_PATH_EMPTY(0xffffffffffffffff, 0x4, &(0x7f0000000500)='{\'%./(*}/@,\'!\\!-%\x00', &(0x7f0000000540)='./file0\x00', r0)

proc_setup_thread_self---of 7
proc_thread_self_get_link84%of 6

refcount_dec_and_lock---of 10
refcount_dec_and_lock_irqsave---of 9
refcount_dec_and_mutex_lock---of 10
refcount_dec_if_one---of 1
refcount_dec_not_one67%of 9
refcount_warn_saturate---of 13

__elevator_exit---of 1
elevator_alloc---of 4
elevator_find---of 7
elevator_get---of 5
elevator_init_mq---of 19
elevator_init_mq.cold---of 1
elevator_release---of 1
elevator_switch---of 4
elevator_switch_mq---of 58
elv_attempt_insert_merge86%of 7
elv_attr_show---of 5
elv_attr_store---of 5
elv_bio_merge_ok75%of 4
elv_former_request---of 3
elv_iosched_show---of 21
elv_iosched_store---of 21
elv_latter_request---of 3
elv_merge72%of 14
elv_merge_requests---of 7
elv_merged_request---of 9
elv_rb_add34%of 9
elv_rb_del67%of 3
elv_rb_find80%of 10
elv_rb_former_request---of 3
elv_rb_latter_request---of 3
elv_register---of 10
elv_register_queue---of 12
elv_rqhash_add60%of 5
elv_rqhash_del84%of 6
elv_rqhash_find12%of 18
elv_rqhash_reposition---of 7
elv_unregister---of 2
elv_unregister_queue---of 5

cpumask_any_and_distribute---of 7
cpumask_any_but---of 4
cpumask_local_spread---of 11
cpumask_next100%of 1
cpumask_next_and100%of 1
cpumask_next_wrap---of 6

errseq_check50%of 4
errseq_check_and_advance---of 5
errseq_sample100%of 2
errseq_set---of 9

FUA_show---of 1
allow_restart_show---of 1
allow_restart_store---of 6
app_tag_own_show---of 1
bytes_to_logical---of 1
cache_type_show---of 1
cache_type_store---of 19
logical_to_sectors100%of 1
manage_start_stop_show---of 1
manage_start_stop_store---of 4
max_medium_access_timeouts_show---of 1
max_medium_access_timeouts_store---of 6
max_retries_show---of 1
max_retries_store---of 6
max_write_same_blocks_show---of 1
max_write_same_blocks_store---of 11
protection_mode_show---of 6
protection_type_show---of 1
protection_type_store---of 7
provisioning_mode_show---of 1
provisioning_mode_store---of 8
read_capacity_10---of 21
read_capacity_16.part.0---of 55
read_capacity_error---of 15
scsi_disk_release---of 1
sd_check_events---of 30
sd_compat_ioctl---of 4
sd_completed_bytes---of 6
sd_config_discard---of 13
sd_config_write_same---of 21
sd_default_probe---of 1
sd_done---of 30
sd_eh_action---of 11
sd_eh_reset---of 1
sd_getgeo---of 4
sd_init_command30%of 77
sd_ioctl---of 4
sd_ioctl_common---of 6
sd_major---of 7
sd_major.part.0---of 1
sd_open---of 22
sd_pr_clear---of 1
sd_pr_command.isra.0---of 4
sd_pr_preempt---of 4
sd_pr_register---of 4
sd_pr_release---of 3
sd_pr_reserve---of 5
sd_print_result---of 13
sd_print_sense_hdr---of 3
sd_probe---of 39
sd_probe.cold---of 1
sd_release---of 5
sd_remove---of 1
sd_rescan---of 1
sd_resume---of 8
sd_revalidate_disk.isra.0---of 386
sd_revalidate_disk.isra.0.cold---of 6
sd_setup_write_same10_cmnd---of 6
sd_setup_write_same16_cmnd---of 6
sd_shutdown---of 15
sd_start_stop_device---of 14
sd_suspend_common.isra.0---of 17
sd_suspend_runtime---of 1
sd_suspend_system---of 1
sd_sync_cache---of 18
sd_uninit_command---of 4
sd_unlock_native_capacity---of 2
sectors_to_logical100%of 1
thin_provisioning_show---of 1
zeroing_mode_show---of 1
zeroing_mode_store---of 4
zoned_cap_show---of 11

__check_object_size85%of 20
__check_object_size.cold---of 5
check_stack_object100%of 6
usercopy_abort---of 4
usercopy_warn---of 5

cmp_ex_search100%of 3
cmp_ex_sort---of 3
search_extable100%of 1
sort_extable---of 1
swap_ex---of 1
trim_init_extable---of 9

__put_cred---of 9
abort_creds---of 5
commit_creds---of 47
copy_creds40%of 20
cred_alloc_blank---of 4
cred_fscmp---of 15
exit_creds67%of 6
get_task_cred---of 24
override_creds---of 1
prepare_creds---of 35
prepare_exec_creds---of 3
prepare_kernel_cred---of 23
put_cred_rcu---of 9
revert_creds---of 3
set_create_files_as---of 4
set_security_override---of 1
set_security_override_from_ctx---of 3

arch_get_random_long---of 5
prandom_bytes---of 6
prandom_bytes_state---of 5
prandom_reseed---of 13
prandom_seed---of 9
prandom_seed_full_state---of 11
prandom_timer_start---of 1
prandom_u32100%of 1
prandom_u32_state---of 1

__ia32_sys_seccomp---of 1
__put_seccomp_filter---of 6
__seccomp_filter17%of 77
__seccomp_filter_orphan---of 8
__secure_computing34%of 15
__x64_sys_seccomp---of 1
audit_actions_logged---of 8
do_seccomp---of 131
get_seccomp_filter10%of 11
prctl_get_seccomp---of 1
prctl_set_seccomp---of 3
read_actions_logged---of 3
seccomp_actions_logged_handler---of 4
seccomp_check_filter---of 8
seccomp_do_user_notification.constprop.0---of 23
seccomp_filter_release---of 1
seccomp_names_from_actions_logged.constprop.0---of 12
seccomp_notify_addfd---of 31
seccomp_notify_detach.part.0---of 5
seccomp_notify_ioctl---of 49
seccomp_notify_poll---of 15
seccomp_notify_release---of 3
write_actions_logged.constprop.0---of 12

__find_next_entry---of 17
__ftrace_trace_stack---of 15
__remove_instance.part.0---of 13
__set_tracer_option.isra.0---of 5
__trace_array_vprintk.part.0---of 16
__trace_bputs---of 11
__trace_find_cmdline---of 11
__trace_find_cmdline.cold---of 1
__trace_puts---of 14
__trace_stack---of 4
alloc_percpu_trace_buffer.part.0---of 5
alloc_percpu_trace_buffer.part.0.cold---of 1
allocate_cmdlines_buffer---of 3
allocate_trace_buffer---of 7
buffer_percent_read---of 1
buffer_percent_write---of 5
buffer_pipe_buf_get---of 8
buffer_pipe_buf_release---of 1
buffer_ref_release---of 5
buffer_spd_release---of 1
call_filter_check_discard---of 6
clear_tracing_err_log---of 3
close_pipe_on_cpu---of 7
create_trace_option_files---of 25
create_trace_option_files.cold---of 2
disable_trace_buffered_event---of 1
disable_trace_on_warning---of 4
dummy_set_flag---of 1
enable_trace_buffered_event---of 1
err_pos---of 6
ftrace_dump---of 18
ftrace_dump.cold---of 13
ftrace_exports---of 5
ftrace_now---of 4
get_order---of 1
get_total_entries---of 6
init_tracer_tracefs---of 49
init_tracer_tracefs.cold---of 25
instance_mkdir---of 8
instance_rmdir---of 10
is_tracing_stopped---of 1
ns2usecs---of 1
nsecs_to_usecs---of 1
peek_next_entry---of 10
print_event_info---of 1
print_trace_header---of 3
print_trace_line---of 63
rb_simple_read---of 4
rb_simple_write---of 19
register_ftrace_export---of 11
s_next---of 10
s_show---of 15
s_start---of 22
s_stop---of 5
saved_cmdlines_next---of 7
saved_cmdlines_show---of 1
saved_cmdlines_start---of 13
saved_cmdlines_stop---of 1
saved_tgids_next---of 5
saved_tgids_show---of 3
saved_tgids_start---of 5
set_tracer_flag---of 35
show_traces_open---of 10
show_traces_release---of 5
t_next---of 8
t_show---of 6
t_start---of 15
t_stop---of 1
trace_array_create---of 13
trace_array_create_dir---of 9
trace_array_destroy---of 11
trace_array_find---of 5
trace_array_find_get---of 7
trace_array_get---of 6
trace_array_get_by_name---of 8
trace_array_init_printk---of 6
trace_array_printk---of 8
trace_array_printk_buf---of 6
trace_array_put---of 4
trace_array_vprintk---of 4
trace_automount---of 5
trace_buffer_lock_reserve---of 3
trace_buffer_unlock_commit_nostack---of 3
trace_buffer_unlock_commit_regs---of 20
trace_buffered_event_disable---of 9
trace_buffered_event_enable---of 14
trace_buffered_event_enable.cold---of 1
trace_clock_in_ns---of 1
trace_create_file---of 2
trace_create_file.cold---of 1
trace_default_header---of 13
trace_die_handler---of 5
trace_dump_stack---of 3
trace_empty---of 11
trace_event_buffer_commit---of 29
trace_event_buffer_lock_reserve---of 12
trace_event_format---of 18
trace_filter_add_remove_task---of 8
trace_find_cmdline---of 4
trace_find_filtered_pid---of 4
trace_find_next_entry---of 13
trace_find_next_entry_inc---of 6
trace_find_tgid---of 6
trace_free_pid_list---of 1
trace_function---of 12
trace_get_user---of 33
trace_handle_return---of 3
trace_ignore_this_task---of 7
trace_init_global_iter---of 8
trace_keep_overwrite---of 5
trace_latency_header---of 5
trace_module_notify---of 6
trace_options_core_read---of 2
trace_options_core_write---of 6
trace_options_init_dentry.part.0---of 7
trace_options_init_dentry.part.0.cold---of 1
trace_options_read---of 2
trace_options_write---of 8
trace_panic_handler---of 4
trace_parse_run_command---of 18
trace_parse_run_command.cold---of 1
trace_parser_get_init---of 3
trace_parser_put---of 1
trace_pid_next---of 3
trace_pid_show---of 1
trace_pid_start---of 7
trace_pid_write---of 24
trace_printk_init_buffers---of 3
trace_printk_init_buffers.cold---of 2
trace_printk_seq---of 4
trace_printk_seq.cold---of 1
trace_printk_start_comm---of 2
trace_run_command---of 5
trace_save_cmdline---of 14
trace_set_options---of 11
trace_total_entries---of 2
trace_total_entries_cpu---of 5
trace_vbprintk---of 21
trace_vprintk---of 5
tracepoint_printk_sysctl---of 7
tracer_init---of 1
tracer_tracing_is_on---of 5
tracer_tracing_off---of 3
tracer_tracing_on---of 3
tracing_alloc_snapshot---of 4
tracing_buffers_open---of 14
tracing_buffers_poll---of 5
tracing_buffers_read---of 26
tracing_buffers_release---of 5
tracing_buffers_splice_read---of 38
tracing_check_open_get_tr---of 11
tracing_clock_open---of 7
tracing_clock_show---of 7
tracing_clock_write---of 6
tracing_cond_snapshot_data---of 1
tracing_cpumask_read---of 5
tracing_cpumask_write---of 6
tracing_entries_read---of 13
tracing_entries_write---of 8
tracing_err_log_open---of 11
tracing_err_log_release---of 8
tracing_err_log_seq_next---of 1
tracing_err_log_seq_show---of 7
tracing_err_log_seq_start---of 1
tracing_err_log_seq_stop---of 1
tracing_err_log_write---of 1
tracing_free_buffer_release---of 9
tracing_free_buffer_write---of 1
tracing_generic_entry_update78%of 9
tracing_init_dentry---of 7
tracing_init_dentry.cold---of 1
tracing_is_disabled---of 1
tracing_is_enabled---of 1
tracing_is_on---of 5
tracing_iter_reset---of 10
tracing_log_err---of 7
tracing_lseek---of 5
tracing_mark_raw_write---of 15
tracing_mark_write---of 26
tracing_off---of 3
tracing_on---of 3
tracing_open---of 43
tracing_open_file_tr---of 9
tracing_open_generic---of 4
tracing_open_generic_tr---of 3
tracing_open_options---of 3
tracing_open_pipe---of 20
tracing_poll_pipe---of 5
tracing_read_pipe---of 26
tracing_readme_read---of 1
tracing_record_cmdline---of 3
tracing_record_taskinfo---of 13
tracing_record_taskinfo_sched_switch---of 20
tracing_record_tgid---of 7
tracing_release---of 20
tracing_release_file_tr---of 5
tracing_release_generic_tr---of 5
tracing_release_options---of 5
tracing_release_pipe---of 7
tracing_reset_all_online_cpus---of 1
tracing_reset_all_online_cpus_unlocked---of 7
tracing_reset_online_cpus---of 6
tracing_resize_ring_buffer---of 13
tracing_saved_cmdlines_open---of 4
tracing_saved_cmdlines_size_read---of 4
tracing_saved_cmdlines_size_write---of 9
tracing_saved_tgids_open---of 4
tracing_set_clock---of 5
tracing_set_cpumask---of 16
tracing_set_time_stamp_abs---of 7
tracing_set_trace_read---of 1
tracing_set_trace_write---of 8
tracing_set_tracer---of 25
tracing_set_tracer.cold---of 1
tracing_single_release_tr---of 5
tracing_snapshot---of 2
tracing_snapshot_alloc---of 2
tracing_snapshot_cond---of 2
tracing_snapshot_cond_disable---of 1
tracing_snapshot_cond_enable---of 1
tracing_spd_release_pipe---of 1
tracing_splice_read_pipe---of 37
tracing_start---of 2
tracing_start_tr.part.0---of 10
tracing_stats_read---of 8
tracing_stop---of 1
tracing_stop_tr---of 8
tracing_thresh_read---of 5
tracing_thresh_write---of 8
tracing_time_stamp_mode_open---of 7
tracing_time_stamp_mode_show---of 4
tracing_total_entries_read---of 8
tracing_trace_options_open---of 7
tracing_trace_options_show---of 11
tracing_trace_options_write---of 6
tracing_update_buffers---of 9
tracing_wait_pipe---of 12
tracing_write_stub---of 1
unregister_ftrace_export---of 13

PageHeadHuge---of 3
PageHuge43%of 7
__free_huge_page---of 30
__nodes_weight.constprop.0---of 1
__unmap_hugepage_range---of 45
__unmap_hugepage_range_final---of 1
__vma_reservation_common---of 23
add_reservation_in_range.constprop.0---of 25
adjust_range_if_pmd_sharing_possible---of 8
alloc_fresh_huge_page---of 18
alloc_huge_page---of 56
alloc_huge_page_nodemask---of 7
alloc_huge_page_vma---of 7
alloc_pool_huge_page---of 13
alloc_surplus_huge_page---of 13
allocate_file_region_entries---of 14
allowed_mems_nr---of 9
clear_page_huge_active---of 4
coalesce_file_region---of 6
copy_hugetlb_page_range---of 54
dequeue_huge_page_nodemask---of 34
dissolve_free_huge_page---of 20
dissolve_free_huge_pages---of 4
follow_huge_addr---of 1
follow_huge_pd---of 1
follow_huge_pgd---of 6
follow_huge_pmd_pte---of 21
follow_huge_pud---of 6
follow_hugetlb_page---of 71
free_hpage_workfn---of 2
free_huge_page---of 4
free_hugepages_show---of 4
free_pool_huge_page---of 17
gather_surplus_pages---of 31
hstate_next_node_to_alloc---of 8
huge_add_to_page_cache---of 8
huge_pmd_share---of 49
huge_pmd_unshare---of 19
huge_pmd_unshare_flush---of 23
huge_pte_alloc---of 24
huge_pte_offset---of 12
hugepage_new_subpool---of 6
hugepage_put_subpool---of 8
hugepage_subpool_get_pages.part.0---of 9
hugepage_subpool_put_pages.part.0---of 18
hugetlb_acct_memory---of 7
hugetlb_basepage_index---of 15
hugetlb_change_protection---of 64
hugetlb_cow---of 71
hugetlb_fault---of 126
hugetlb_fault.cold---of 1
hugetlb_fault_mutex_hash---of 1
hugetlb_fix_reserve_counts---of 10
hugetlb_mcopy_atomic_pte---of 44
hugetlb_mempolicy_sysctl_handler---of 1
hugetlb_overcommit_handler---of 6
hugetlb_page_mapping_lock_write---of 4
hugetlb_register_node---of 7
hugetlb_register_node.cold---of 1
hugetlb_report_meminfo---of 6
hugetlb_report_node_meminfo---of 1
hugetlb_report_usage---of 1
hugetlb_reserve_pages---of 48
hugetlb_show_meminfo---of 3
hugetlb_show_meminfo.cold---of 1
hugetlb_split---of 12
hugetlb_sysctl_handler---of 1
hugetlb_sysctl_handler_common---of 9
hugetlb_sysfs_add_hstate---of 5
hugetlb_total_pages---of 3
hugetlb_unregister_node---of 7
hugetlb_unreserve_pages---of 6
hugetlb_unshare_pmds.constprop.0---of 28
hugetlb_vm_op_close---of 19
hugetlb_vm_op_fault---of 1
hugetlb_vm_op_open---of 12
hugetlb_vm_op_pagesize---of 1
hugetlb_vm_op_split---of 2
is_hugetlb_entry_migration---of 5
isolate_hugetlb---of 10
kobj_to_hstate---of 14
linear_hugepage_index---of 1
make_huge_pte.isra.0---of 18
move_hugetlb_state---of 12
nr_hugepages_mempolicy_show---of 4
nr_hugepages_mempolicy_store---of 1
nr_hugepages_show---of 4
nr_hugepages_store---of 1
nr_hugepages_store_common---of 10
nr_overcommit_hugepages_show---of 1
nr_overcommit_hugepages_store---of 5
page_huge_active---of 4
pfn_valid---of 36
prep_compound_gigantic_page---of 8
putback_active_hugepage---of 15
region_add.constprop.0.isra.0---of 9
region_chg---of 6
region_del---of 20
reset_vma_resv_huge_pages---of 4
restore_reserve_on_error---of 6
resv_hugepages_show---of 1
resv_map_alloc---of 5
resv_map_release---of 5
return_unused_surplus_pages---of 5
set_max_huge_pages---of 46
set_page_huge_active---of 4
set_vma_resv_flags---of 5
size_to_hstate---of 4
surplus_hugepages_show---of 4
unmap_hugepage_range---of 1
update_and_free_page---of 23
vma_has_reserves---of 8
vma_kernel_pagesize---of 5
vma_mmu_pagesize---of 5
vma_resv_map---of 7

__check_sticky---of 5
__ia32_sys_link---of 1
__ia32_sys_linkat---of 1
__ia32_sys_mkdir---of 1
__ia32_sys_mkdirat---of 1
__ia32_sys_mknod---of 1
__ia32_sys_mknodat---of 1
__ia32_sys_rename---of 1
__ia32_sys_renameat---of 1
__ia32_sys_renameat2---of 1
__ia32_sys_rmdir---of 5
__ia32_sys_symlink---of 1
__ia32_sys_symlinkat---of 1
__ia32_sys_unlink---of 5
__ia32_sys_unlinkat---of 6
__legitimize_path50%of 8
__lookup_hash84%of 6
__lookup_slow47%of 13
__page_get_link.isra.0---of 17
__page_symlink---of 7
__traverse_mounts---of 35
__x64_sys_link---of 1
__x64_sys_linkat---of 1
__x64_sys_mkdir---of 1
__x64_sys_mkdirat---of 1
__x64_sys_mknod100%of 1
__x64_sys_mknodat---of 1
__x64_sys_rename---of 1
__x64_sys_renameat---of 1
__x64_sys_renameat2---of 1
__x64_sys_rmdir---of 5
__x64_sys_symlink---of 1
__x64_sys_symlinkat---of 1
__x64_sys_unlink---of 5
__x64_sys_unlinkat---of 6
choose_mountpoint_rcu---of 11
complete_walk70%of 13
do_file_open_root---of 15
do_filp_open50%of 12
do_linkat---of 25
do_mkdirat---of 8
do_mknodat60%of 15
do_renameat2---of 57
do_rmdir---of 19
do_symlinkat---of 9
do_unlinkat---of 29
done_path_create---of 1
filename_create65%of 14
filename_lookup56%of 20
filename_parentat53%of 17
follow_down---of 6
follow_down_one---of 6
follow_up---of 7
fsnotify_move---of 17
full_name_hash---of 6
generic_permission24%of 30
getname60%of 5
getname_flags60%of 5
getname_flags.part.032%of 19
getname_kernel42%of 12
handle_dots.part.0---of 83
hashlen_string---of 4
inode_permission87%of 15
kern_path100%of 1
kern_path_create---of 1
kern_path_locked---of 7
legitimize_links34%of 15
legitimize_root100%of 5
link_path_walk.part.074%of 38
lock_rename---of 7
lookup_dcache29%of 7
lookup_fast77%of 21
lookup_one_len---of 7
lookup_one_len_common---of 13
lookup_one_len_unlocked---of 5
lookup_open.isra.041%of 64
lookup_positive_unlocked---of 5
may_delete---of 26
may_linkat---of 10
may_open50%of 18
may_open_dev100%of 3
nd_alloc_stack---of 5
nd_jump_link---of 7
nd_jump_root60%of 15
page_get_link---of 3
page_get_link_raw---of 1
page_put_link---of 6
page_readlink---of 5
page_symlink---of 1
path_get100%of 2
path_init31%of 71
path_lookupat45%of 27
path_openat49%of 148
path_parentat86%of 7
path_pts---of 7
path_put100%of 1
putname67%of 6
readlink_copy---of 8
set_root57%of 16
step_into45%of 99
terminate_walk36%of 17
try_lookup_one_len---of 6
try_to_unlazy41%of 22
try_to_unlazy_next38%of 32
unlock_rename---of 2
user_path_at_empty60%of 5
user_path_create---of 5
vfs_create---of 19
vfs_get_link---of 5
vfs_link---of 43
vfs_mkdir---of 21
vfs_mknod75%of 24
vfs_mkobj---of 16
vfs_path_lookup---of 1
vfs_readlink---of 11
vfs_rename---of 96
vfs_rmdir---of 4
vfs_rmdir.part.0---of 16
vfs_symlink---of 17
vfs_tmpfile---of 13
vfs_unlink---of 32
walk_component67%of 21

__ext4_forget---of 21
__ext4_handle_dirty_metadata35%of 23
__ext4_handle_dirty_metadata.cold---of 3
__ext4_handle_dirty_super60%of 5
__ext4_journal_ensure_credits---of 9
__ext4_journal_get_create_access75%of 4
__ext4_journal_get_write_access63%of 8
__ext4_journal_start_reserved---of 21
__ext4_journal_start_sb50%of 18
__ext4_journal_stop50%of 10
ext4_inode_journal_mode60%of 15
ext4_journal_abort_handle.constprop.0---of 9
ext4_journal_abort_handle.constprop.0.cold---of 1
ext4_journal_check_start67%of 9

__ia32_sys_readahead---of 1
__x64_sys_readahead---of 1
do_page_cache_ra---of 5
file_ra_state_init80%of 5
force_page_cache_ra---of 18
ksys_readahead---of 9
ondemand_readahead---of 80
page_cache_async_ra---of 10
page_cache_ra_unbounded---of 22
page_cache_sync_ra---of 7
read_cache_pages---of 15
read_cache_pages_invalidate_page---of 12
read_pages---of 43

avtab_alloc---of 3
avtab_alloc.part.0---of 8
avtab_alloc_dup---of 4
avtab_destroy---of 2
avtab_destroy.part.0---of 8
avtab_hash_eval---of 5
avtab_init---of 1
avtab_insert_node.constprop.0---of 12
avtab_insert_nonunique---of 17
avtab_insertf---of 17
avtab_read---of 10
avtab_read.cold---of 4
avtab_read_item---of 41
avtab_read_item.cold---of 13
avtab_search83%of 17
avtab_search_node88%of 16
avtab_search_node_next---of 12
avtab_write---of 7
avtab_write_item---of 12

__import_iovec62%of 26
_copy_from_iter---of 57
_copy_from_iter_flushcache---of 49
_copy_from_iter_full32%of 47
_copy_from_iter_full_nocache---of 39
_copy_from_iter_nocache---of 49
_copy_mc_to_iter---of 62
_copy_to_iter30%of 62
copy_overflow---of 1
copy_page_from_iter---of 31
copy_page_to_iter---of 50
copyin.part.064%of 11
copyout.part.064%of 11
copyout_mc---of 11
csum_and_copy_from_iter---of 56
csum_and_copy_from_iter_full---of 45
csum_and_copy_to_iter---of 69
dup_iter---of 7
hash_and_copy_to_iter---of 4
import_iovec100%of 1
import_single_range62%of 13
iov_iter_advance57%of 55
iov_iter_alignment42%of 36
iov_iter_bvec67%of 3
iov_iter_copy_from_user_atomic26%of 55
iov_iter_discard---of 3
iov_iter_fault_in_readable50%of 22
iov_iter_gap_alignment---of 45
iov_iter_get_pages29%of 49
iov_iter_get_pages_alloc---of 62
iov_iter_init67%of 3
iov_iter_kvec---of 3
iov_iter_npages43%of 47
iov_iter_pipe60%of 5
iov_iter_restore72%of 7
iov_iter_revert---of 25
iov_iter_single_seg_count---of 8
iov_iter_zero---of 52
iovec_from_user34%of 27
memzero_page---of 1
push_pipe50%of 14
sanity50%of 6
sanity.cold---of 4

__attach_mnt67%of 3
__cleanup_mnt---of 1
__detach_mounts---of 13
__do_loopback---of 14
__do_sys_fsmount36%of 42
__do_sys_fsmount.cold---of 2
__do_sys_pivot_root---of 52
__ia32_sys_fsmount---of 1
__ia32_sys_mount---of 15
__ia32_sys_move_mount---of 21
__ia32_sys_oldumount---of 3
__ia32_sys_open_tree---of 21
__ia32_sys_pivot_root---of 1
__ia32_sys_umount---of 6
__is_local_mountpoint---of 5
__legitimize_mnt50%of 8
__lookup_mnt80%of 5
__mnt_drop_write---of 1
__mnt_drop_write_file100%of 1
__mnt_is_readonly---of 3
__mnt_want_write72%of 7
__mnt_want_write_file63%of 8
__put_mountpoint.part.060%of 5
__x64_sys_fsmount100%of 1
__x64_sys_mount74%of 15
__x64_sys_move_mount---of 21
__x64_sys_oldumount---of 3
__x64_sys_open_tree---of 21
__x64_sys_pivot_root---of 1
__x64_sys_umount---of 6
alloc_mnt_ns---of 12
alloc_vfsmnt70%of 10
attach_mnt---of 5
attach_recursive_mnt35%of 41
can_change_locked_flags.isra.0---of 13
cleanup_group_ids---of 11
cleanup_mnt---of 19
clone_mnt---of 28
clone_private_mount---of 11
collect_mounts---of 6
commit_tree89%of 9
copy_mnt_ns---of 45
copy_mount_options75%of 8
copy_tree---of 35
count_mounts75%of 12
current_chrooted---of 6
delayed_free_vfsmnt---of 1
delayed_mntput---of 2
dissolve_on_fput---of 5
do_add_mount63%of 8
do_mount---of 4
do_move_mount---of 50
do_set_group---of 22
drop_collected_mounts---of 1
fc_mount---of 5
finish_automount---of 46
free_mnt_ns---of 6
from_mnt_ns---of 1
get_mountpoint65%of 17
graft_tree72%of 7
invent_group_ids---of 13
is_path_reachable---of 5
iterate_mounts---of 5
kern_mount---of 4
kern_unmount---of 5
kern_unmount_array---of 11
legitimize_mnt20%of 21
lock_mnt_tree---of 16
lock_mount50%of 10
lookup_mnt58%of 21
lookup_mountpoint---of 5
m_next---of 5
m_show---of 1
m_start---of 10
m_stop---of 4
mark_mounts_for_expiry---of 13
may_umount---of 1
may_umount_tree---of 11
mnt_change_mountpoint---of 13
mnt_clone_internal---of 3
mnt_clone_write---of 5
mnt_cursor_del---of 1
mnt_drop_write56%of 9
mnt_drop_write_file---of 9
mnt_get_count---of 3
mnt_may_suid---of 4
mnt_release_group_id---of 1
mnt_set_expiry---of 1
mnt_set_mountpoint---of 3
mnt_want_write32%of 19
mnt_want_write_file---of 23
mnt_warn_timestamp_expiry34%of 6
mnt_warn_timestamp_expiry.cold---of 7
mntget100%of 3
mntns_get---of 3
mntns_install---of 9
mntns_owner---of 1
mntns_put---of 1
mntput75%of 4
mntput_no_expire18%of 57
mount_subtree---of 13
mount_too_revealing9%of 23
namespace_unlock23%of 18
open_detached_copy---of 19
our_mnt---of 1
path_is_mountpoint---of 23
path_is_under---of 5
path_mount30%of 141
path_mount.cold---of 1
path_umount---of 58
put_mnt_ns---of 2
sb_prepare_remount_readonly---of 16
umount_tree---of 35
unhash_mnt---of 9
vfs_create_mount89%of 9
vfs_kern_mount---of 3
vfs_kern_mount.part.0---of 9
vfs_submount---of 4

__generic_block_fiemap---of 27
__ia32_compat_sys_ioctl---of 16
__ia32_sys_ioctl---of 11
__x64_sys_ioctl82%of 11
compat_ioctl_preallocate---of 7
compat_ptr_ioctl---of 3
do_vfs_ioctl5%of 63
do_vfs_ioctl.cold---of 1
fiemap_fill_next_extent---of 13
fiemap_prep---of 10
generic_block_fiemap---of 1
ioctl_file_clone---of 9
ioctl_preallocate---of 7
vfs_ioctl---of 5

ex_get_fault_handler_type---of 8
ex_handler_clear_fs---of 4
ex_handler_copy50%of 4
ex_handler_default---of 1
ex_handler_fault---of 1
ex_handler_fprestore---of 7
ex_handler_rdmsr_unsafe---of 2
ex_handler_rdmsr_unsafe.cold---of 1
ex_handler_uaccess50%of 4
ex_handler_wrmsr_unsafe---of 2
ex_handler_wrmsr_unsafe.cold---of 1
fixup_exception100%of 3

pids_can_attach---of 16
pids_can_fork57%of 16
pids_can_fork.cold---of 1
pids_cancel.constprop.050%of 2
pids_cancel_attach---of 16
pids_cancel_fork---of 12
pids_css_alloc---of 3
pids_css_free---of 1
pids_current_read---of 1
pids_events_show---of 1
pids_max_show---of 5
pids_max_write---of 7
pids_release---of 13

sg_alloc_table_chained73%of 11
sg_free_table_chained---of 3
sg_pool_alloc80%of 5
sg_pool_free---of 5

fsnotify_destroy_event---of 7
fsnotify_flush_notify---of 8
fsnotify_get_cookie---of 1
fsnotify_insert_event59%of 17
fsnotify_peek_first_event---of 8
fsnotify_remove_first_event---of 5
fsnotify_remove_queued_event---of 3

chroot_fs_refs---of 20
copy_fs_struct---of 3
current_umask100%of 1
exit_fs67%of 3
free_fs_struct---of 1
set_fs_pwd---of 6
set_fs_root---of 6
unshare_fs_struct---of 4

__f_setown---of 12
__ia32_compat_sys_fcntl---of 3
__ia32_compat_sys_fcntl64---of 1
__ia32_sys_fcntl---of 8
__x64_sys_fcntl---of 8
do_compat_fcntl64---of 30
do_fcntl---of 88
f_delown---of 1
f_getown---of 16
f_setown---of 20
fasync_alloc---of 1
fasync_free---of 1
fasync_free_rcu---of 1
fasync_helper---of 6
fasync_insert_entry---of 6
fasync_remove_entry---of 5
kill_fasync4%of 31
kill_fasync.cold---of 1
send_sigio---of 23
send_sigio_to_task---of 37
send_sigurg---of 73

mls_compute_context_len---of 29
mls_compute_sid38%of 37
mls_context_isvalid100%of 10
mls_context_to_sid---of 5
mls_context_to_sid.part.0---of 34
mls_convert_context---of 23
mls_export_netlbl_cat80%of 5
mls_export_netlbl_lvl100%of 2
mls_from_string---of 6
mls_import_netlbl_cat---of 6
mls_import_netlbl_lvl---of 2
mls_level_isvalid100%of 5
mls_range_isvalid100%of 8
mls_range_set---of 3
mls_setup_user_range---of 25
mls_sid_to_context---of 34

__hrtimer_get_remaining---of 5
__hrtimer_init67%of 9
__hrtimer_next_event_base---of 12
__hrtimer_run_queues---of 42
__ia32_sys_nanosleep---of 7
__ia32_sys_nanosleep_time32---of 8
__remove_hrtimer---of 6
__x64_sys_nanosleep---of 7
__x64_sys_nanosleep_time32---of 8
clock_was_set---of 1
clock_was_set_delayed---of 1
clock_was_set_work---of 1
do_nanosleep---of 19
enqueue_hrtimer---of 12
hrtimer_active---of 7
hrtimer_cancel---of 4
hrtimer_force_reprogram---of 9
hrtimer_forward---of 14
hrtimer_get_next_event---of 5
hrtimer_init50%of 10
hrtimer_init_sleeper---of 10
hrtimer_interrupt---of 18
hrtimer_interrupt.cold---of 1
hrtimer_nanosleep---of 22
hrtimer_nanosleep_restart---of 10
hrtimer_next_event_without---of 5
hrtimer_reprogram---of 14
hrtimer_run_queues---of 7
hrtimer_run_queues.cold---of 1
hrtimer_run_softirq---of 3
hrtimer_sleeper_start_expires---of 1
hrtimer_start_range_ns---of 43
hrtimer_try_to_cancel---of 3
hrtimer_try_to_cancel.part.0---of 17
hrtimer_wakeup---of 3
hrtimers_cpu_dying---of 22
hrtimers_cpu_starting---of 1
hrtimers_prepare_cpu---of 3
hrtimers_resume---of 5
ktime_add_safe---of 4
ktime_get_boottime---of 1
ktime_get_clocktai---of 1
ktime_get_real---of 1
nanosleep_copyout---of 5
retrigger_next_event---of 2
schedule_hrtimeout---of 1
schedule_hrtimeout_range---of 1
schedule_hrtimeout_range_clock---of 26

context_compute_hash60%of 5

__nla_parse100%of 1
__nla_put---of 1
__nla_put_64bit---of 1
__nla_put_nohdr---of 1
__nla_reserve---of 1
__nla_reserve_64bit---of 1
__nla_reserve_nohdr---of 1
__nla_validate---of 1
__nla_validate_parse4%of 169
__nla_validate_parse.cold---of 3
nla_append---of 5
nla_find---of 6
nla_get_range_signed---of 11
nla_get_range_unsigned---of 15
nla_memcmp---of 3
nla_memcpy---of 3
nla_policy_len---of 7
nla_put---of 5
nla_put_64bit---of 5
nla_put_nohdr---of 5
nla_reserve---of 5
nla_reserve_64bit---of 5
nla_reserve_nohdr---of 6
nla_strcmp---of 6
nla_strdup---of 6
nla_strlcpy---of 7

ebitmap_and---of 19
ebitmap_cmp45%of 9
ebitmap_contains24%of 21
ebitmap_cpy25%of 12
ebitmap_destroy75%of 4
ebitmap_get_bit100%of 3
ebitmap_get_bit.part.0.isra.088%of 8
ebitmap_hash12%of 17
ebitmap_netlbl_export24%of 13
ebitmap_netlbl_import---of 18
ebitmap_read---of 25
ebitmap_read.cold---of 5
ebitmap_set_bit---of 28
ebitmap_write---of 49

audit_comparator---of 16
audit_compare_dname_path---of 6
audit_compare_rule.part.0---of 16
audit_data_to_entry---of 121
audit_data_to_entry.cold---of 3
audit_del_rule---of 16
audit_dupe_rule---of 26
audit_dupe_rule.cold---of 1
audit_filter24%of 34
audit_find_rule---of 20
audit_free_rule_rcu---of 8
audit_gid_comparator---of 8
audit_init_entry---of 5
audit_list_rules_send---of 38
audit_log_rule_change.part.0---of 2
audit_match_class---of 6
audit_match_signal---of 17
audit_rule_change---of 45
audit_uid_comparator---of 8
audit_unpack_string---of 7
audit_update_lsm_rules---of 18
get_order---of 1
parent_len---of 11

__address_space_init_once---of 1
__destroy_inode34%of 21
__iget---of 1
__insert_inode_hash---of 3
__remove_inode_hash---of 5
__wait_on_freeing_inode---of 1
address_space_init_once---of 1
alloc_inode39%of 13
atime_needs_update81%of 21
bmap---of 3
clear_inode54%of 13
clear_nlink100%of 2
current_time75%of 4
dentry_needs_remove_privs58%of 7
destroy_inode67%of 6
discard_new_inode---of 8
dispose_list---of 2
drop_nlink---of 4
evict42%of 31
evict_inodes---of 12
file_modified100%of 4
file_remove_privs77%of 17
file_update_time77%of 17
find_inode19%of 11
find_inode_by_ino_rcu---of 13
find_inode_fast---of 11
find_inode_nowait---of 8
find_inode_rcu---of 13
free_inode_nonrcu---of 1
generic_delete_inode100%of 1
generic_update_time65%of 14
get_next_ino67%of 3
get_nr_dirty_inodes---of 6
i_callback---of 3
iget5_locked37%of 11
iget_locked---of 21
igrab60%of 5
ihold50%of 2
ilookup---of 10
ilookup5---of 8
ilookup5_nowait100%of 2
in_group_or_capable---of 3
inc_nlink---of 5
init_once---of 1
init_special_inode43%of 7
init_special_inode.cold---of 1
inode_add_lru---of 6
inode_dio_wait---of 7
inode_init_always100%of 7
inode_init_once---of 1
inode_init_owner67%of 6
inode_insert553%of 17
inode_lru_isolate---of 26
inode_needs_sync---of 7
inode_nohighmem---of 1
inode_owner_or_capable---of 4
inode_sb_list_add---of 1
inode_set_ctime_current---of 1
inode_set_flags75%of 4
inode_update_time---of 5
insert_inode_locked40%of 23
insert_inode_locked4---of 6
invalidate_inodes---of 14
iput75%of 4
iput.part.028%of 33
iunique---of 22
lock_two_inodes---of 11
lock_two_nondirectories---of 8
lockdep_annotate_inode_mutex_key34%of 3
lockdep_annotate_inode_mutex_key.part.0100%of 2
mode_strip_sgid19%of 11
new_inode100%of 3
new_inode_pseudo100%of 3
no_open---of 1
proc_nr_inodes---of 6
prune_icache_sb---of 1
set_nlink50%of 6
timestamp_truncate56%of 9
touch_atime---of 26
unlock_new_inode80%of 5
unlock_two_nondirectories---of 7
vfs_ioc_fssetxattr_check---of 22
vfs_ioc_setflags_prepare---of 4

strnlen_user72%of 21

__traceiter_irq_disable---of 4
__traceiter_irq_enable---of 4
perf_trace_preemptirq_template---of 6
trace_event_raw_event_preemptirq_template---of 10
trace_hardirqs_off40%of 10
trace_hardirqs_off_caller---of 10
trace_hardirqs_off_finish55%of 11
trace_hardirqs_on50%of 12
trace_hardirqs_on_caller---of 12
trace_hardirqs_on_prepare59%of 12
trace_raw_output_preemptirq_template---of 4

__set_oom_adj.isra.0---of 61
__set_oom_adj.isra.0.cold---of 1
auxv_open---of 5
auxv_read---of 4
comm_open---of 1
comm_show---of 9
comm_write---of 10
copy_overflow---of 1
dname_to_vma_addr.isra.0---of 10
do_io_accounting---of 10
environ_open---of 5
environ_read---of 19
map_files_d_revalidate---of 19
map_files_get_link---of 17
mem_lseek---of 5
mem_open---of 4
mem_read---of 1
mem_release---of 4
mem_rw---of 33
mem_write---of 1
next_tgid---of 27
oom_adj_read---of 10
oom_adj_write---of 11
oom_score_adj_read---of 8
oom_score_adj_write---of 8
pid_delete_dentry---of 1
pid_getattr---of 29
pid_revalidate75%of 8
pid_update_inode---of 1
proc_attr_dir_lookup---of 1
proc_attr_dir_readdir---of 1
proc_coredump_filter_read---of 9
proc_coredump_filter_write---of 18
proc_cwd_link---of 9
proc_exe_link---of 9
proc_fail_nth_read---of 8
proc_fail_nth_write---of 10
proc_fault_inject_read---of 8
proc_fault_inject_write---of 13
proc_fill_cache---of 11
proc_flush_pid---of 1
proc_gid_map_open---of 1
proc_id_map_open---of 43
proc_id_map_release---of 4
proc_loginuid_read---of 8
proc_loginuid_write---of 29
proc_map_files_get_link---of 6
proc_map_files_instantiate---of 3
proc_map_files_lookup---of 18
proc_map_files_readdir---of 29
proc_mem_open---of 10
proc_oom_score---of 3
proc_pid_attr_open---of 3
proc_pid_attr_read---of 10
proc_pid_attr_write---of 45
proc_pid_cmdline_read---of 44
proc_pid_cmdline_read.cold---of 1
proc_pid_evict_inode---of 7
proc_pid_get_link---of 3
proc_pid_get_link.part.0---of 11
proc_pid_instantiate---of 3
proc_pid_limits---of 15
proc_pid_lookup---of 29
proc_pid_make_base_inode.constprop.080%of 5
proc_pid_make_inode80%of 5
proc_pid_permission65%of 14
proc_pid_personality---of 5
proc_pid_readdir---of 25
proc_pid_readlink---of 17
proc_pid_schedstat---of 1
proc_pid_stack---of 10
proc_pid_syscall---of 10
proc_pid_wchan---of 6
proc_pident_instantiate100%of 9
proc_pident_lookup84%of 12
proc_pident_readdir---of 18
proc_projid_map_open---of 1
proc_root_link---of 9
proc_sessionid_read---of 8
proc_setattr---of 4
proc_setgroups_open---of 44
proc_setgroups_release---of 5
proc_single_open---of 1
proc_single_show---of 7
proc_task_getattr---of 8
proc_task_instantiate100%of 3
proc_task_lookup65%of 34
proc_task_readdir---of 70
proc_tgid_base_lookup100%of 1
proc_tgid_base_readdir---of 1
proc_tgid_io_accounting---of 1
proc_tid_base_lookup100%of 1
proc_tid_base_readdir---of 1
proc_tid_comm_permission---of 10
proc_tid_io_accounting---of 1
proc_uid_map_open---of 1
task_dump_owner50%of 28
tgid_pidfd_to_pid---of 4
timens_offsets_open---of 1
timens_offsets_show---of 9
timens_offsets_write---of 33
timerslack_ns_open---of 1
timerslack_ns_show---of 37
timerslack_ns_write---of 40

__ext4_check_dir_entry74%of 15
call_filldir---of 16
ext4_check_all_de---of 5
ext4_dir_llseek---of 7
ext4_dir_open---of 2
ext4_htree_free_dir_info---of 1
ext4_htree_store_dirent---of 14
ext4_readdir---of 138
ext4_release_dir---of 3
free_rb_tree_fname---of 7
is_dx_dir---of 7

iso_date72%of 7

dma_alloc_attrs---of 9
dma_alloc_noncoherent---of 7
dma_alloc_pages---of 11
dma_can_mmap---of 6
dma_free_attrs---of 9
dma_free_noncoherent---of 8
dma_free_pages---of 5
dma_get_merge_boundary---of 6
dma_get_required_mask---of 6
dma_get_sgtable_attrs---of 6
dma_map_page_attrs---of 29
dma_map_resource---of 46
dma_map_sg_attrs67%of 12
dma_max_mapping_size---of 6
dma_mmap_attrs---of 6
dma_need_sync---of 6
dma_pgprot---of 1
dma_set_coherent_mask---of 8
dma_set_mask---of 10
dma_supported---of 7
dma_sync_sg_for_cpu---of 8
dma_sync_sg_for_device---of 8
dma_sync_single_for_cpu---of 16
dma_sync_single_for_device---of 16
dma_unmap_page_attrs---of 30
dma_unmap_resource---of 7
dma_unmap_sg_attrs---of 8
dmam_alloc_attrs---of 5
dmam_free_coherent---of 3
dmam_match---of 6
dmam_release---of 1

__es_find_extent_range34%of 12
__es_insert_extent67%of 36
__es_remove_extent13%of 99
__es_tree_search.isra.057%of 16
__insert_pending---of 11
__remove_pending---of 10
count_rsvd.isra.0---of 22
es_do_reclaim_extents---of 13
ext4_clear_inode_es---of 5
ext4_es_cache_extent58%of 19
ext4_es_can_be_merged.isra.089%of 9
ext4_es_can_be_merged.isra.0.cold---of 1
ext4_es_count---of 10
ext4_es_delayed_clu40%of 23
ext4_es_find_extent_range48%of 19
ext4_es_free_extent---of 10
ext4_es_init_tree100%of 1
ext4_es_insert_delayed_block---of 42
ext4_es_insert_extent24%of 91
ext4_es_is_delonly---of 3
ext4_es_lookup_extent61%of 41
ext4_es_register_shrinker---of 10
ext4_es_remove_extent---of 23
ext4_es_scan---of 60
ext4_es_scan_clu---of 8
ext4_es_scan_range---of 8
ext4_es_unregister_shrinker---of 1
ext4_exit_es---of 1
ext4_exit_pending---of 1
ext4_init_pending_tree100%of 1
ext4_is_pending---of 11
ext4_remove_pending---of 1
ext4_seq_es_shrinker_info_show---of 10

__blk_mq_sched_bio_merge34%of 12
__blk_mq_sched_dispatch_requests83%of 17
blk_mq_do_dispatch_ctx---of 14
blk_mq_do_dispatch_sched61%of 38
blk_mq_exit_sched---of 12
blk_mq_init_sched---of 29
blk_mq_sched_assign_ioc---of 4
blk_mq_sched_dispatch_requests50%of 6
blk_mq_sched_free_requests---of 4
blk_mq_sched_insert_request---of 13
blk_mq_sched_insert_requests53%of 40
blk_mq_sched_mark_restart_hctx100%of 2
blk_mq_sched_request_inserted89%of 9
blk_mq_sched_restart---of 2
blk_mq_sched_try_insert_merge88%of 8
sched_rq_cmp---of 1

__do_sys_setns---of 118
__ia32_sys_setns---of 1
__x64_sys_setns---of 1
copy_namespaces43%of 28
create_new_namespaces---of 45
exit_task_namespaces100%of 1
free_nsproxy---of 36
put_nsset---of 10
switch_task_namespaces67%of 3
unshare_nsproxy_namespaces---of 10

cap_bprm_creds_from_file---of 93
cap_bprm_creds_from_file.cold---of 2
cap_capable34%of 9
cap_capget---of 18
cap_capset---of 24
cap_convert_nscap---of 14
cap_inode_getsecurity---of 34
cap_inode_killpriv---of 2
cap_inode_need_killpriv100%of 1
cap_inode_removexattr---of 7
cap_inode_setxattr---of 5
cap_mmap_addr---of 11
cap_mmap_file---of 1
cap_ptrace_access_check---of 27
cap_ptrace_traceme---of 24
cap_safe_nice---of 28
cap_settime---of 1
cap_task_fix_setuid---of 30
cap_task_prctl---of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory34%of 9
get_vfs_caps_from_disk---of 20

__get_super.part.0---of 15
__get_super_thawed---of 15
__iterate_supers---of 10
__put_super.part.0---of 10
alloc_super86%of 14
compare_single---of 1
deactivate_locked_super---of 5
deactivate_super75%of 4
destroy_super_rcu---of 1
destroy_super_work---of 1
destroy_unused_super.part.0---of 4
do_emergency_remount---of 1
do_emergency_remount_callback---of 9
do_thaw_all---of 1
do_thaw_all_callback---of 4
drop_super---of 3
drop_super_exclusive---of 3
emergency_remount---of 2
emergency_thaw_all---of 2
free_anon_bdev---of 1
freeze_super---of 13
freeze_super.cold---of 1
generic_shutdown_super---of 14
generic_shutdown_super.cold---of 1
get_active_super---of 9
get_anon_bdev---of 5
get_super---of 4
get_super_exclusive_thawed---of 1
get_super_thawed---of 1
get_tree_bdev---of 22
get_tree_keyed---of 10
get_tree_nodev---of 10
get_tree_single---of 10
get_tree_single_reconf---of 1
grab_super---of 11
iterate_supers---of 13
iterate_supers_type---of 14
kill_anon_super---of 1
kill_block_super---of 3
kill_litter_super---of 3
mount_bdev59%of 17
mount_capable60%of 5
mount_nodev---of 7
mount_single---of 14
reconfigure_single---of 7
reconfigure_super---of 30
set_anon_super---of 5
set_anon_super_fc---of 5
set_bdev_super63%of 8
set_bdev_super_fc---of 1
sget43%of 21
sget_fc---of 23
super_cache_count---of 7
super_cache_scan---of 14
super_setup_bdi---of 1
super_setup_bdi_name---of 7
test_bdev_super---of 1
test_bdev_super_fc---of 1
test_keyed_super---of 1
test_single_super---of 1
thaw_super---of 1
thaw_super_locked---of 11
thaw_super_locked.cold---of 2
trylock_super---of 6
user_get_super---of 11
vfs_get_super---of 14
vfs_get_tree70%of 10
vfs_get_tree.cold---of 1

__ext4_xattr_set_credits42%of 17
__xattr_check_inode---of 6
ext4_evict_ea_inode---of 7
ext4_expand_extra_isize_ea---of 89
ext4_get_inode_usage---of 37
ext4_listxattr---of 28
ext4_xattr_block_csum---of 17
ext4_xattr_block_csum_set---of 4
ext4_xattr_block_csum_verify---of 7
ext4_xattr_block_find13%of 16
ext4_xattr_block_set---of 150
ext4_xattr_check_entries---of 14
ext4_xattr_check_entries.cold---of 1
ext4_xattr_create_cache---of 1
ext4_xattr_delete_inode---of 42
ext4_xattr_destroy_cache---of 2
ext4_xattr_free_space---of 9
ext4_xattr_get23%of 31
ext4_xattr_get_block16%of 13
ext4_xattr_ibody_find58%of 7
ext4_xattr_ibody_get43%of 14
ext4_xattr_ibody_set86%of 7
ext4_xattr_inode_array_free---of 4
ext4_xattr_inode_dec_ref_all---of 43
ext4_xattr_inode_free_quota---of 3
ext4_xattr_inode_get---of 21
ext4_xattr_inode_iget---of 11
ext4_xattr_inode_read---of 21
ext4_xattr_inode_set_class---of 1
ext4_xattr_inode_update_ref---of 24
ext4_xattr_list_entries---of 17
ext4_xattr_release_block---of 28
ext4_xattr_set---of 10
ext4_xattr_set_credits---of 7
ext4_xattr_set_entry15%of 188
ext4_xattr_set_handle46%of 72
ext4_xattr_value_same.isra.0---of 4
get_order---of 1
xattr_find_entry57%of 16

__lock_sock---of 7
__receive_sock---of 1
__release_sock---of 20
__sk_backlog_rcv---of 3
__sk_destruct---of 34
__sk_dst_check---of 12
__sk_flush_backlog---of 1
__sk_free---of 11
__sk_mem_raise_allocated---of 57
__sk_mem_reclaim---of 1
__sk_mem_reduce_allocated---of 12
__sk_mem_schedule---of 3
__sk_receive_skb---of 34
__sock_cmsg_send---of 11
__sock_i_ino---of 3
__sock_queue_rcv_skb---of 26
__sock_set_timestamps---of 9
__sock_wfree---of 5
get_order---of 1
lock_sock_fast---of 4
lock_sock_nested75%of 4
proto_exit_net---of 1
proto_init_net---of 2
proto_register---of 17
proto_register.cold---of 4
proto_seq_next---of 1
proto_seq_show---of 33
proto_seq_start---of 1
proto_seq_stop---of 1
proto_unregister---of 6
release_sock67%of 9
sk_alloc80%of 10
sk_busy_loop_end---of 4
sk_capable---of 4
sk_clear_memalloc---of 3
sk_clone_lock---of 46
sk_common_release---of 14
sk_destruct---of 5
sk_dst_check---of 26
sk_free---of 5
sk_free_unlock_clone---of 5
sk_get_meminfo---of 1
sk_getsockopt.constprop.0---of 139
sk_getsockopt.constprop.0.cold---of 2
sk_init_common100%of 1
sk_mc_loop---of 12
sk_net_capable---of 4
sk_ns_capable---of 4
sk_page_frag_refill---of 6
sk_prot_alloc57%of 16
sk_reset_timer---of 6
sk_send_sigurg---of 22
sk_set_memalloc---of 1
sk_set_peek_off---of 1
sk_setup_caps---of 8
sk_stop_timer---of 3
sk_stop_timer_sync---of 3
sk_wait_data---of 9
skb_dst_force.isra.0---of 17
skb_orphan_partial---of 27
skb_page_frag_refill---of 17
skb_set_owner_w---of 17
sock_alloc_send_pskb---of 25
sock_alloc_send_skb---of 1
sock_bind_add---of 3
sock_bindtoindex---of 5
sock_bindtoindex_locked---of 7
sock_cmsg_send---of 10
sock_common_getsockopt---of 1
sock_common_recvmsg---of 3
sock_common_setsockopt---of 1
sock_def_error_report---of 40
sock_def_readable30%of 40
sock_def_wakeup---of 21
sock_def_write_space---of 42
sock_efree---of 8
sock_enable_timestamp---of 4
sock_enable_timestamps---of 1
sock_get_timeout---of 8
sock_getsockopt---of 1
sock_gettstamp---of 9
sock_i_ino---of 4
sock_i_uid---of 3
sock_init_data75%of 4
sock_init_data_uid72%of 7
sock_inuse_exit_net---of 1
sock_inuse_get---of 3
sock_inuse_init_net---of 4
sock_kfree_s---of 3
sock_kmalloc---of 5
sock_kzfree_s---of 3
sock_load_diag_module---of 9
sock_lock_init67%of 3
sock_no_accept---of 1
sock_no_bind---of 1
sock_no_connect---of 1
sock_no_getname---of 1
sock_no_ioctl---of 1
sock_no_linger---of 1
sock_no_listen---of 1
sock_no_mmap---of 1
sock_no_recvmsg---of 1
sock_no_sendmsg---of 1
sock_no_sendmsg_locked---of 1
sock_no_sendpage---of 1
sock_no_sendpage_locked---of 1
sock_no_shutdown---of 1
sock_no_socketpair---of 1
sock_ofree---of 1
sock_omalloc---of 4
sock_pfree---of 4
sock_prot_inuse_add---of 1
sock_prot_inuse_get---of 4
sock_queue_rcv_skb---of 3
sock_recv_errqueue---of 18
sock_rfree34%of 3
sock_set_keepalive---of 3
sock_set_mark---of 3
sock_set_priority---of 1
sock_set_rcvbuf---of 1
sock_set_reuseaddr---of 1
sock_set_reuseport---of 1
sock_set_sndtimeo---of 4
sock_set_timeout---of 27
sock_set_timeout.cold---of 1
sock_setsockopt---of 197
sock_wfree---of 13
sock_wmalloc---of 5

__smp_call_single_queue100%of 2
flush_smp_call_function_from_idle---of 8
flush_smp_call_function_queue---of 33
flush_smp_call_function_queue.cold---of 6
generic_exec_single40%of 15
generic_smp_call_function_single_interrupt---of 1
kick_all_cpus_sync---of 1
on_each_cpu---of 4
on_each_cpu_cond100%of 1
on_each_cpu_cond_mask86%of 7
on_each_cpu_mask---of 6
smp_call_function---of 1
smp_call_function_any---of 6
smp_call_function_many---of 1
smp_call_function_many_cond38%of 32
smp_call_function_single58%of 14
smp_call_function_single_async---of 3
smp_call_on_cpu---of 6
smp_call_on_cpu_callback---of 5
smpcfd_dead_cpu---of 1
smpcfd_dying_cpu---of 1
smpcfd_prepare_cpu---of 2
wake_up_all_idle_cpus---of 5

__radix_tree_delete58%of 14
__radix_tree_lookup92%of 12
__radix_tree_preload20%of 26
__radix_tree_replace46%of 22
delete_node18%of 34
idr_destroy---of 12
idr_get_free67%of 36
idr_preload13%of 8
node_tag_clear58%of 7
radix_tree_cpu_dead---of 3
radix_tree_delete---of 1
radix_tree_delete_item42%of 12
radix_tree_extend---of 22
radix_tree_gang_lookup---of 14
radix_tree_gang_lookup_tag---of 14
radix_tree_gang_lookup_tag_slot---of 11
radix_tree_insert---of 26
radix_tree_iter_delete---of 2
radix_tree_iter_replace100%of 1
radix_tree_iter_resume---of 1
radix_tree_iter_tag_clear100%of 1
radix_tree_lookup100%of 1
radix_tree_lookup_slot---of 3
radix_tree_maybe_preload---of 11
radix_tree_next_chunk---of 37
radix_tree_node_alloc.constprop.0---of 13
radix_tree_node_ctor---of 1
radix_tree_node_rcu_free---of 1
radix_tree_preload---of 3
radix_tree_replace_slot---of 9
radix_tree_tag_clear---of 9
radix_tree_tag_get100%of 9
radix_tree_tag_set---of 13
radix_tree_tagged---of 1

___pskb_trim---of 46
__alloc_skb57%of 16
__build_skb---of 3
__build_skb_around---of 3
__consume_stateless_skb---of 10
__copy_skb_header---of 20
__kfree_skb---of 3
__kfree_skb_defer---of 4
__kfree_skb_flush---of 2
__napi_alloc_skb---of 15
__netdev_alloc_skb---of 20
__pskb_copy_fclone---of 23
__pskb_pull_tail---of 61
__skb_checksum---of 38
__skb_checksum_complete---of 7
__skb_checksum_complete_head---of 7
__skb_clone---of 4
__skb_complete_tx_timestamp---of 7
__skb_ext_alloc---of 3
__skb_ext_del---of 12
__skb_ext_put---of 15
__skb_ext_set---of 3
__skb_pad---of 17
__skb_splice_bits.isra.0---of 18
__skb_to_sgvec---of 33
__skb_tstamp_tx---of 27
__skb_vlan_pop---of 20
__skb_warn_lro_forwarding---of 1
__skb_warn_lro_forwarding.cold---of 1
__splice_segment.part.0.isra.0---of 23
alloc_skb_for_msg---of 3
alloc_skb_with_frags---of 17
build_skb---of 10
build_skb_around---of 12
consume_skb53%of 19
csum_block_add_ext---of 3
csum_partial_ext---of 1
kfree_skb13%of 8
kfree_skb.part.0---of 12
kfree_skb_list---of 9
kfree_skb_partial---of 5
kfree_skbmem20%of 10
mm_account_pinned_pages---of 5
mm_account_pinned_pages.part.0---of 14
mm_unaccount_pinned_pages---of 2
napi_alloc_frag---of 1
napi_consume_skb---of 26
netdev_alloc_frag---of 7
pskb_carve_inside_header---of 30
pskb_carve_inside_nonlinear---of 47
pskb_carve_inside_nonlinear.cold---of 1
pskb_expand_head---of 46
pskb_extract---of 11
pskb_put---of 6
pskb_put.cold---of 1
pskb_trim_rcsum_slow---of 13
skb_abort_seq_read---of 2
skb_add_rx_frag---of 5
skb_append---of 1
skb_append_pagefrags---of 18
skb_checksum---of 1
skb_checksum_setup---of 33
skb_checksum_setup_ip---of 12
skb_checksum_trimmed---of 17
skb_checksum_trimmed.cold---of 1
skb_clone---of 16
skb_clone_fraglist.isra.0---of 7
skb_clone_sk---of 19
skb_coalesce_rx_frag---of 1
skb_complete_tx_timestamp---of 23
skb_complete_wifi_ack---of 18
skb_condense---of 7
skb_copy---of 8
skb_copy_and_csum_bits---of 26
skb_copy_and_csum_dev---of 8
skb_copy_bits---of 23
skb_copy_expand---of 10
skb_copy_header---of 1
skb_copy_ubufs---of 61
skb_cow_data---of 46
skb_dequeue---of 4
skb_dequeue_tail---of 4
skb_dump---of 32
skb_ensure_writable---of 11
skb_eth_pop---of 8
skb_eth_push---of 13
skb_eth_push.cold---of 1
skb_expand_head---of 24
skb_ext_add---of 19
skb_find_text---of 4
skb_gro_receive---of 27
skb_gro_receive_list---of 10
skb_gso_transport_seglen---of 8
skb_gso_validate_mac_len---of 6
skb_gso_validate_network_len---of 6
skb_headers_offset_update---of 5
skb_morph---of 3
skb_mpls_dec_ttl---of 8
skb_mpls_pop---of 16
skb_mpls_push---of 21
skb_mpls_push.cold---of 1
skb_mpls_update_lse---of 6
skb_panic---of 3
skb_partial_csum_set---of 5
skb_partial_csum_set.cold---of 1
skb_prepare_for_shift---of 5
skb_prepare_seq_read---of 1
skb_pull67%of 6
skb_pull_rcsum---of 10
skb_push---of 2
skb_push.cold---of 1
skb_put75%of 4
skb_put.cold---of 1
skb_queue_head---of 1
skb_queue_purge---of 2
skb_queue_tail100%of 1
skb_rbtree_purge---of 3
skb_realloc_headroom---of 7
skb_release_data30%of 24
skb_release_head_state38%of 16
skb_scrub_packet---of 14
skb_segment---of 152
skb_segment.cold---of 1
skb_segment_list---of 41
skb_segment_list.cold---of 2
skb_send_sock_locked---of 24
skb_seq_read---of 32
skb_shift---of 48
skb_splice_bits---of 3
skb_split---of 18
skb_store_bits---of 23
skb_to_sgvec---of 3
skb_to_sgvec_nomark---of 1
skb_trim---of 4
skb_try_coalesce---of 50
skb_ts_finish---of 2
skb_ts_get_next_block---of 1
skb_tstamp_tx---of 1
skb_tx_error---of 8
skb_unlink---of 1
skb_vlan_pop---of 9
skb_vlan_push---of 16
skb_vlan_push.cold---of 1
skb_vlan_untag---of 28
skb_zerocopy---of 48
skb_zerocopy_clone---of 20
skb_zerocopy_headlen---of 6
skb_zerocopy_iter_dgram---of 1
skb_zerocopy_iter_stream---of 21
sock_dequeue_err_skb---of 15
sock_queue_err_skb---of 25
sock_rmem_free---of 1
sock_spd_release---of 6
sock_zerocopy_alloc---of 13
sock_zerocopy_callback---of 19
sock_zerocopy_put---of 8
sock_zerocopy_put_abort---of 3
sock_zerocopy_realloc---of 22
warn_crc32c_csum_combine---of 2
warn_crc32c_csum_combine.cold---of 1
warn_crc32c_csum_update---of 2
warn_crc32c_csum_update.cold---of 1

chacha_block_generic100%of 3
chacha_permute80%of 5
hchacha_block_generic---of 1

get_order---of 1
public_key_describe100%of 2
public_key_destroy---of 3
public_key_free---of 2
public_key_verify_signature---of 25
public_key_verify_signature_2---of 1
software_key_determine_akcipher---of 8
software_key_eds_op---of 22
software_key_query---of 13

__rhashtable_walk_find_next---of 40
__rht_bucket_nested---of 27
bucket_table_alloc.isra.0---of 12
bucket_table_free---of 5
bucket_table_free_rcu---of 1
jhash---of 16
lockdep_rht_bucket_is_held80%of 5
lockdep_rht_mutex_is_held75%of 4
nested_table_alloc.constprop.0---of 16
nested_table_free---of 15
rhashtable_destroy---of 1
rhashtable_free_and_destroy---of 50
rhashtable_init---of 20
rhashtable_insert_slow---of 136
rhashtable_jhash286%of 7
rhashtable_last_table---of 9
rhashtable_rehash_alloc---of 8
rhashtable_walk_enter---of 6
rhashtable_walk_exit---of 3
rhashtable_walk_next---of 22
rhashtable_walk_peek---of 9
rhashtable_walk_start_check---of 58
rhashtable_walk_stop---of 14
rhltable_init---of 1
rht_bucket_nested---of 2
rht_bucket_nested_insert---of 5
rht_deferred_worker---of 139
rht_head_hashfn---of 7

__ia32_compat_sys_signalfd---of 4
__ia32_compat_sys_signalfd4---of 4
__ia32_sys_signalfd---of 4
__ia32_sys_signalfd4---of 4
__x64_sys_signalfd100%of 4
__x64_sys_signalfd4---of 4
do_signalfd492%of 12
signalfd_cleanup---of 2
signalfd_copyinfo---of 11
signalfd_poll---of 7
signalfd_read---of 18
signalfd_release---of 1
signalfd_show_fdinfo---of 1

add_system_zone---of 16
ext4_check_blockref---of 8
ext4_destroy_system_zone---of 5
ext4_exit_system_zone---of 1
ext4_inode_block_valid100%of 1
ext4_release_system_zone---of 7
ext4_sb_block_valid69%of 35
ext4_setup_system_zone---of 34
ext4_setup_system_zone.cold---of 22

__kthread_bind_mask---of 3
__kthread_cancel_work_sync---of 11
__kthread_create_on_node---of 8
__kthread_create_worker---of 8
__kthread_init_worker---of 1
__kthread_parkme---of 5
__kthread_queue_delayed_work---of 5
__kthread_should_park67%of 3
free_kthread_struct---of 3
kthread---of 8
kthread_bind---of 1
kthread_bind_mask---of 1
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_work_sync---of 1
kthread_create_on_cpu---of 5
kthread_create_on_node---of 1
kthread_create_worker---of 1
kthread_create_worker_on_cpu---of 1
kthread_data---of 3
kthread_delayed_work_timer_fn---of 9
kthread_destroy_worker---of 5
kthread_exit---of 1
kthread_flush_work---of 11
kthread_flush_work_fn---of 1
kthread_flush_worker---of 1
kthread_freezable_should_stop---of 8
kthread_func---of 5
kthread_insert_work59%of 12
kthread_insert_work_sanity_check63%of 8
kthread_is_per_cpu---of 5
kthread_mod_delayed_work---of 8
kthread_park---of 11
kthread_parkme---of 3
kthread_probe_data---of 5
kthread_queue_delayed_work---of 7
kthread_queue_work86%of 7
kthread_set_per_cpu---of 8
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_stop---of 31
kthread_unpark---of 5
kthread_unuse_mm---of 8
kthread_use_mm---of 13
kthread_worker_fn---of 42
kthreadd---of 17
tsk_fork_get_node---of 3

context_to_sid64%of 30
sidtab_cancel_convert---of 1
sidtab_context_to_sid8%of 26
sidtab_context_to_sid.cold---of 1
sidtab_convert---of 14
sidtab_convert_tree---of 14
sidtab_destroy---of 7
sidtab_destroy_tree---of 9
sidtab_do_lookup22%of 28
sidtab_freeze_begin---of 1
sidtab_freeze_end---of 1
sidtab_hash_stats---of 21
sidtab_init---of 5
sidtab_search_entry82%of 11
sidtab_search_entry_force60%of 10
sidtab_set_initial---of 14
sidtab_sid2str_get34%of 45
sidtab_sid2str_put---of 2
sidtab_sid2str_put.part.029%of 14

__bpf_redirect---of 37
__bpf_redirect.cold---of 1
__bpf_sk_lookup.constprop.0---of 11
__bpf_skc_lookup.constprop.0---of 13
__get_filter---of 10
__sk_filter_charge.isra.0---of 4
_bpf_getsockopt---of 30
_bpf_setsockopt---of 78
_bpf_setsockopt.cold---of 1
bpf_bind---of 10
bpf_clear_redirect_map---of 3
bpf_clone_redirect---of 9
bpf_convert_ctx_access---of 47
bpf_convert_filter---of 120
bpf_convert_filter.cold---of 1
bpf_csum_diff---of 10
bpf_csum_level---of 15
bpf_csum_update---of 4
bpf_fib_set_fwd_params---of 1
bpf_flow_dissector_load_bytes---of 10
bpf_gen_ld_abs---of 10
bpf_get_cgroup_classid---of 1
bpf_get_hash_recalc---of 3
bpf_get_listener_sock---of 7
bpf_get_netns_cookie_sock---of 3
bpf_get_netns_cookie_sock_addr---of 5
bpf_get_route_realm---of 1
bpf_get_skb_set_tunnel_proto---of 6
bpf_get_socket_cookie---of 3
bpf_get_socket_cookie_sock---of 1
bpf_get_socket_cookie_sock_addr---of 1
bpf_get_socket_cookie_sock_ops---of 1
bpf_get_socket_uid---of 8
bpf_helper_changes_pkt_data---of 25
bpf_ipv4_fib_lookup---of 113
bpf_ipv6_fib_lookup---of 50
bpf_l3_csum_replace---of 10
bpf_l4_csum_replace---of 16
bpf_lwt_in_push_encap---of 1
bpf_lwt_xmit_push_encap---of 1
bpf_msg_apply_bytes---of 1
bpf_msg_cork_bytes---of 1
bpf_msg_pop_data---of 67
bpf_msg_pull_data---of 47
bpf_msg_push_data---of 87
bpf_noop_prologue---of 1
bpf_prepare_filter---of 56
bpf_prog_create---of 7
bpf_prog_create_from_user---of 12
bpf_prog_destroy---of 4
bpf_prog_store_orig_filter---of 4
bpf_prog_test_run_flow_dissector---of 1
bpf_prog_test_run_sk_lookup---of 1
bpf_prog_test_run_skb---of 1
bpf_prog_test_run_xdp---of 1
bpf_redirect---of 4
bpf_redirect_neigh---of 6
bpf_redirect_peer---of 4
bpf_run_sk_reuseport---of 6
bpf_search_tcp_opt---of 16
bpf_set_hash---of 1
bpf_set_hash_invalid---of 1
bpf_sk_assign---of 25
bpf_sk_fullsock---of 2
bpf_sk_lookup---of 14
bpf_sk_lookup_assign---of 13
bpf_sk_lookup_tcp---of 1
bpf_sk_lookup_udp---of 1
bpf_sk_release---of 5
bpf_skb_adjust_room---of 110
bpf_skb_change_head---of 18
bpf_skb_change_proto---of 45
bpf_skb_change_tail---of 31
bpf_skb_change_type---of 5
bpf_skb_copy---of 10
bpf_skb_ecn_set_ce---of 40
bpf_skb_event_output---of 5
bpf_skb_fib_lookup---of 10
bpf_skb_generic_pop---of 13
bpf_skb_generic_push---of 1
bpf_skb_get_nlattr---of 6
bpf_skb_get_nlattr_nest---of 7
bpf_skb_get_pay_offset---of 1
bpf_skb_get_tunnel_key---of 37
bpf_skb_get_tunnel_opt---of 28
bpf_skb_get_xfrm_state---of 10
bpf_skb_grow_rcsum---of 12
bpf_skb_is_valid_access.part.0---of 15
bpf_skb_load_bytes---of 10
bpf_skb_load_bytes_relative---of 9
bpf_skb_load_helper_16---of 12
bpf_skb_load_helper_16_no_cache---of 12
bpf_skb_load_helper_32---of 12
bpf_skb_load_helper_32_no_cache---of 12
bpf_skb_load_helper_8---of 12
bpf_skb_load_helper_8_no_cache---of 12
bpf_skb_pull_data---of 3
bpf_skb_set_tunnel_key---of 27
bpf_skb_set_tunnel_key.cold---of 1
bpf_skb_set_tunnel_opt---of 26
bpf_skb_store_bytes---of 19
bpf_skb_under_cgroup---of 7
bpf_skb_vlan_pop---of 11
bpf_skb_vlan_push---of 12
bpf_skc_lookup_tcp---of 4
bpf_skc_to_tcp6_sock---of 5
bpf_skc_to_tcp_request_sock---of 6
bpf_skc_to_tcp_sock---of 4
bpf_skc_to_tcp_timewait_sock---of 6
bpf_skc_to_udp6_sock---of 5
bpf_sock_addr_getsockopt---of 1
bpf_sock_addr_setsockopt---of 1
bpf_sock_addr_sk_lookup_tcp---of 1
bpf_sock_addr_sk_lookup_udp---of 1
bpf_sock_addr_skc_lookup_tcp---of 1
bpf_sock_common_is_valid_access---of 5
bpf_sock_convert_ctx_access---of 22
bpf_sock_is_valid_access---of 12
bpf_sock_is_valid_access.part.0---of 12
bpf_sock_ops_cb_flags_set---of 3
bpf_sock_ops_get_syn.isra.0---of 18
bpf_sock_ops_getsockopt---of 11
bpf_sock_ops_load_hdr_opt---of 21
bpf_sock_ops_reserve_hdr_opt---of 6
bpf_sock_ops_setsockopt---of 1
bpf_sock_ops_store_hdr_opt---of 14
bpf_tcp_check_syncookie---of 24
bpf_tcp_gen_syncookie---of 17
bpf_tcp_sock---of 3
bpf_tcp_sock_convert_ctx_access---of 28
bpf_tcp_sock_is_valid_access---of 8
bpf_warn_invalid_xdp_action---of 1
bpf_warn_invalid_xdp_action.cold---of 2
bpf_xdp_adjust_head---of 8
bpf_xdp_adjust_meta---of 7
bpf_xdp_adjust_tail---of 9
bpf_xdp_copy---of 1
bpf_xdp_event_output---of 5
bpf_xdp_fib_lookup---of 9
bpf_xdp_redirect---of 4
bpf_xdp_redirect_map---of 4
bpf_xdp_sk_lookup_tcp---of 1
bpf_xdp_sk_lookup_udp---of 1
bpf_xdp_skc_lookup_tcp---of 1
bpf_xdp_sock_convert_ctx_access---of 3
bpf_xdp_sock_is_valid_access---of 5
cg_skb_func_proto---of 9
cg_skb_is_valid_access---of 19
convert_bpf_ld_abs---of 26
copy_bpf_fprog_from_user---of 12
copy_overflow---of 1
flow_dissector_convert_ctx_access---of 6
flow_dissector_func_proto---of 6
flow_dissector_is_valid_access---of 11
get_order---of 1
lwt_in_func_proto---of 9
lwt_is_valid_access---of 14
lwt_out_func_proto---of 7
lwt_seg6local_func_proto---of 7
lwt_xmit_func_proto---of 6
neigh_hh_output---of 16
sk_attach_bpf---of 2
sk_attach_filter---of 18
sk_detach_filter---of 10
sk_filter_charge---of 17
sk_filter_func_proto---of 7
sk_filter_is_valid_access---of 10
sk_filter_release_rcu---of 5
sk_filter_trim_cap37%of 36
sk_filter_uncharge---of 5
sk_get_filter---of 16
sk_lookup.constprop.0---of 15
sk_lookup_convert_ctx_access---of 13
sk_lookup_func_proto---of 6
sk_lookup_is_valid_access---of 8
sk_msg_convert_ctx_access---of 15
sk_msg_func_proto---of 7
sk_msg_is_valid_access---of 11
sk_reuseport_attach_bpf---of 2
sk_reuseport_attach_filter---of 9
sk_reuseport_convert_ctx_access---of 11
sk_reuseport_func_proto---of 2
sk_reuseport_is_valid_access---of 12
sk_reuseport_load_bytes---of 10
sk_reuseport_load_bytes_relative---of 9
sk_reuseport_prog_free---of 6
sk_select_reuseport---of 20
sk_skb_adjust_room---of 25
sk_skb_change_head---of 18
sk_skb_change_tail---of 27
sk_skb_convert_ctx_access---of 7
sk_skb_func_proto---of 7
sk_skb_is_valid_access---of 16
sk_skb_prologue---of 3
sk_skb_pull_data---of 3
skb_do_redirect---of 280
skb_do_redirect.cold---of 2
sock_addr_convert_ctx_access---of 63
sock_addr_func_proto---of 10
sock_addr_is_valid_access---of 36
sock_filter_func_proto---of 3
sock_filter_is_valid_access---of 10
sock_ops_convert_ctx_access---of 366
sock_ops_func_proto---of 7
sock_ops_is_valid_access---of 16
tc_cls_act_convert_ctx_access---of 4
tc_cls_act_func_proto---of 9
tc_cls_act_is_valid_access---of 12
tc_cls_act_prologue---of 3
xdp_convert_ctx_access---of 11
xdp_do_generic_redirect---of 40
xdp_do_redirect---of 22
xdp_func_proto---of 7
xdp_is_valid_access---of 12

perf_kprobe_destroy---of 1
perf_kprobe_init---of 13
perf_trace_add---of 10
perf_trace_buf_alloc67%of 9
perf_trace_buf_update100%of 1
perf_trace_del---of 4
perf_trace_destroy---of 1
perf_trace_event_init---of 35
perf_trace_event_unreg.isra.0---of 4
perf_trace_init---of 9
perf_uprobe_destroy---of 1
perf_uprobe_init---of 11

logon_vet_description---of 3
user_describe100%of 2
user_destroy---of 1
user_free_payload_rcu---of 1
user_free_preparse---of 1
user_preparse---of 6
user_read---of 9
user_revoke---of 7
user_update---of 11

_netlbl_catmap_getnode---of 17
netlbl_audit_start---of 1
netlbl_bitmap_setbit---of 3
netlbl_bitmap_walk---of 10
netlbl_cache_add---of 9
netlbl_cache_invalidate---of 1
netlbl_catmap_getlong---of 14
netlbl_catmap_setbit---of 3
netlbl_catmap_setlong---of 4
netlbl_catmap_setrng---of 7
netlbl_catmap_walk---of 17
netlbl_catmap_walkrng---of 23
netlbl_cfg_calipso_add---of 1
netlbl_cfg_calipso_del---of 1
netlbl_cfg_calipso_map_add---of 19
netlbl_cfg_cipsov4_add---of 1
netlbl_cfg_cipsov4_del---of 1
netlbl_cfg_cipsov4_map_add---of 19
netlbl_cfg_map_del---of 8
netlbl_cfg_unlbl_map_add---of 17
netlbl_cfg_unlbl_static_add---of 3
netlbl_cfg_unlbl_static_del---of 3
netlbl_conn_setattr---of 25
netlbl_enabled---of 1
netlbl_req_delattr---of 3
netlbl_req_setattr---of 24
netlbl_skbuff_err---of 3
netlbl_skbuff_getattr---of 7
netlbl_skbuff_setattr---of 22
netlbl_sock_delattr---of 3
netlbl_sock_getattr---of 6
netlbl_sock_setattr53%of 19

vmacache_find91%of 11
vmacache_update100%of 3

__bitmap_and---of 5
__bitmap_andnot---of 5
__bitmap_clear75%of 4
__bitmap_complement---of 2
__bitmap_equal---of 6
__bitmap_intersects---of 6
__bitmap_or---of 2
__bitmap_or_equal---of 6
__bitmap_replace---of 2
__bitmap_set75%of 4
__bitmap_shift_left---of 8
__bitmap_shift_right---of 12
__bitmap_subset---of 6
__bitmap_weight---of 5
__bitmap_xor---of 2
bitmap_alloc---of 1
bitmap_allocate_region---of 5
bitmap_bitremap---of 9
bitmap_cut---of 11
bitmap_find_free_region---of 9
bitmap_find_next_zero_area_off---of 5
bitmap_fold---of 3
bitmap_free---of 1
bitmap_from_arr32---of 6
bitmap_onto---of 5
bitmap_ord_to_pos---of 4
bitmap_parse---of 21
bitmap_parse_user---of 5
bitmap_parselist---of 59
bitmap_parselist_user---of 5
bitmap_print_to_pagebuf---of 4
bitmap_release_region---of 3
bitmap_remap---of 11
bitmap_to_arr32---of 6
bitmap_zalloc---of 1
devm_bitmap_alloc---of 4
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 1
get_order---of 1

crypto_alloc_shash---of 1
crypto_exit_shash_ops_async---of 1
crypto_grab_shash---of 1
crypto_init_shash_ops_async---of 7
crypto_register_shash---of 3
crypto_register_shashes---of 7
crypto_shash_alg_has_setkey---of 1
crypto_shash_digest---of 6
crypto_shash_exit_tfm---of 1
crypto_shash_final---of 5
crypto_shash_finup---of 10
crypto_shash_free_instance---of 1
crypto_shash_init_tfm---of 11
crypto_shash_report---of 1
crypto_shash_setkey---of 9
crypto_shash_show---of 1
crypto_shash_tfm_digest---of 2
crypto_shash_tfm_digest.cold---of 1
crypto_shash_update75%of 4
crypto_unregister_shash---of 1
crypto_unregister_shashes---of 2
shash_ahash_digest---of 7
shash_ahash_finup---of 11
shash_ahash_update---of 6
shash_async_digest---of 1
shash_async_export---of 1
shash_async_final---of 5
shash_async_finup---of 1
shash_async_import---of 3
shash_async_init---of 3
shash_async_setkey---of 9
shash_async_update---of 6
shash_default_export---of 1
shash_default_import---of 1
shash_digest_unaligned---of 4
shash_final_unaligned---of 6
shash_finup_unaligned---of 8
shash_free_singlespawn_instance---of 1
shash_no_setkey---of 1
shash_prepare_alg---of 16
shash_register_instance---of 7
shash_setkey_unaligned---of 3
shash_update_unaligned---of 5

__inode_wait_for_writeback50%of 4
__mark_inode_dirty60%of 71
__traceiter_balance_dirty_pages---of 4
__traceiter_bdi_dirty_ratelimit---of 4
__traceiter_global_dirty_state---of 4
__traceiter_sb_clear_inode_writeback---of 4
__traceiter_sb_mark_inode_writeback---of 4
__traceiter_wait_on_page_writeback---of 4
__traceiter_wbc_writepage---of 4
__traceiter_writeback_bdi_register---of 4
__traceiter_writeback_congestion_wait---of 4
__traceiter_writeback_dirty_inode---of 4
__traceiter_writeback_dirty_inode_enqueue---of 4
__traceiter_writeback_dirty_inode_start---of 4
__traceiter_writeback_dirty_page---of 4
__traceiter_writeback_exec---of 4
__traceiter_writeback_lazytime---of 4
__traceiter_writeback_lazytime_iput---of 4
__traceiter_writeback_mark_inode_dirty---of 4
__traceiter_writeback_pages_written---of 4
__traceiter_writeback_queue---of 4
__traceiter_writeback_queue_io---of 4
__traceiter_writeback_sb_inodes_requeue---of 4
__traceiter_writeback_single_inode---of 4
__traceiter_writeback_single_inode_start---of 4
__traceiter_writeback_start---of 4
__traceiter_writeback_wait---of 4
__traceiter_writeback_wait_iff_congested---of 4
__traceiter_writeback_wake_background---of 4
__traceiter_writeback_write_inode---of 4
__traceiter_writeback_write_inode_start---of 4
__traceiter_writeback_written---of 4
__wakeup_flusher_threads_bdi.part.0---of 8
__writeback_inodes_sb_nr---of 10
__writeback_inodes_wb---of 8
__writeback_single_inode---of 61
dirtytime_interval_handler---of 6
finish_writeback_work.constprop.0---of 5
inode_io_list_del---of 5
inode_io_list_del_locked---of 5
inode_io_list_move_locked67%of 6
inode_sleep_on_writeback---of 3
inode_wait_for_writeback100%of 1
move_expired_inodes---of 18
perf_trace_balance_dirty_pages---of 8
perf_trace_bdi_dirty_ratelimit---of 6
perf_trace_global_dirty_state---of 6
perf_trace_wbc_class---of 6
perf_trace_writeback_bdi_register---of 6
perf_trace_writeback_class---of 6
perf_trace_writeback_congest_waited_template---of 6
perf_trace_writeback_dirty_inode_template---of 10
perf_trace_writeback_inode_template---of 6
perf_trace_writeback_page_template---of 14
perf_trace_writeback_pages_written---of 6
perf_trace_writeback_queue_io---of 6
perf_trace_writeback_sb_inodes_requeue---of 13
perf_trace_writeback_single_inode_template---of 10
perf_trace_writeback_work_class---of 8
perf_trace_writeback_write_inode_template---of 10
queue_io---of 17
redirty_tail_locked---of 6
sb_clear_inode_writeback---of 12
sb_mark_inode_writeback---of 12
sync_inode---of 1
sync_inode_metadata---of 1
sync_inodes_sb---of 39
trace_event_raw_event_balance_dirty_pages---of 12
trace_event_raw_event_bdi_dirty_ratelimit---of 10
trace_event_raw_event_global_dirty_state---of 10
trace_event_raw_event_wbc_class---of 10
trace_event_raw_event_writeback_bdi_register---of 10
trace_event_raw_event_writeback_class---of 10
trace_event_raw_event_writeback_congest_waited_template---of 10
trace_event_raw_event_writeback_dirty_inode_template---of 14
trace_event_raw_event_writeback_inode_template---of 10
trace_event_raw_event_writeback_page_template---of 18
trace_event_raw_event_writeback_pages_written---of 10
trace_event_raw_event_writeback_queue_io---of 10
trace_event_raw_event_writeback_sb_inodes_requeue---of 17
trace_event_raw_event_writeback_single_inode_template---of 14
trace_event_raw_event_writeback_work_class---of 12
trace_event_raw_event_writeback_write_inode_template---of 14
trace_raw_output_balance_dirty_pages---of 4
trace_raw_output_bdi_dirty_ratelimit---of 4
trace_raw_output_global_dirty_state---of 4
trace_raw_output_wbc_class---of 4
trace_raw_output_writeback_bdi_register---of 5
trace_raw_output_writeback_class---of 5
trace_raw_output_writeback_congest_waited_template---of 4
trace_raw_output_writeback_dirty_inode_template---of 4
trace_raw_output_writeback_inode_template---of 4
trace_raw_output_writeback_page_template---of 4
trace_raw_output_writeback_pages_written---of 5
trace_raw_output_writeback_queue_io---of 4
trace_raw_output_writeback_sb_inodes_requeue---of 4
trace_raw_output_writeback_single_inode_template---of 4
trace_raw_output_writeback_work_class---of 4
trace_raw_output_writeback_write_inode_template---of 4
try_to_writeback_inodes_sb---of 2
wakeup_dirtytime_writeback---of 22
wakeup_flusher_threads---of 22
wakeup_flusher_threads_bdi---of 15
wb_io_lists_depopulated---of 6
wb_io_lists_populated34%of 6
wb_queue_work---of 15
wb_start_background_writeback---of 12
wb_wait_for_completion---of 6
wb_workfn---of 56
wb_writeback---of 47
write_inode_now---of 7
writeback_inodes_sb---of 1
writeback_inodes_sb_nr---of 1
writeback_sb_inodes---of 55
writeback_single_inode---of 20

should_fail_usercopy100%of 1

__break_lease---of 105
__ia32_sys_flock---of 17
__locks_insert_block---of 13
__locks_wake_up_blocks---of 6
__show_fd_locks---of 6
__traceiter_break_lease_block---of 4
__traceiter_break_lease_noblock---of 4
__traceiter_break_lease_unblock---of 4
__traceiter_fcntl_setlk---of 4
__traceiter_flock_lock_inode---of 4
__traceiter_generic_add_lease---of 4
__traceiter_generic_delete_lease---of 4
__traceiter_leases_conflict---of 4
__traceiter_locks_get_lock_context---of 4
__traceiter_locks_remove_posix---of 4
__traceiter_posix_lock_inode---of 4
__traceiter_time_out_leases---of 4
__x64_sys_flock---of 17
any_leases_conflict.isra.0---of 7
check_conflicting_open---of 9
fcntl_getlease---of 28
fcntl_getlk---of 18
fcntl_setlease---of 14
fcntl_setlk---of 51
flock64_to_posix_lock---of 17
flock_lock_inode40%of 64
flock_locks_conflict---of 7
flock_make_lock---of 13
generic_setlease---of 104
lease_alloc---of 5
lease_break_callback---of 1
lease_get_mtime---of 8
lease_modify---of 17
lease_modify.cold---of 1
lease_register_notifier---of 1
lease_setup---of 3
lease_unregister_notifier---of 1
leases_conflict---of 17
lock_get_status---of 51
locks_alloc_lock---of 3
locks_check_ctx_file_list100%of 3
locks_check_ctx_file_list.cold---of 1
locks_copy_conflock---of 3
locks_copy_lock---of 8
locks_delete_block---of 5
locks_delete_global_blocked---of 7
locks_delete_global_locks---of 9
locks_dump_ctx_list---of 3
locks_free_lock---of 1
locks_free_lock_context20%of 5
locks_free_lock_context.cold---of 1
locks_get_lock_context40%of 15
locks_init_lock---of 1
locks_insert_global_locks---of 6
locks_lock_inode_wait---of 22
locks_mandatory_area---of 17
locks_mandatory_locked---of 9
locks_move_blocks---of 6
locks_next---of 1
locks_owner_has_blockers---of 7
locks_release_private---of 18
locks_remove_file16%of 26
locks_remove_flock67%of 9
locks_remove_posix22%of 19
locks_show---of 7
locks_start---of 1
locks_stop---of 1
locks_translate_pid.part.0---of 13
perf_trace_filelock_lease---of 9
perf_trace_filelock_lock---of 9
perf_trace_generic_add_lease---of 6
perf_trace_leases_conflict---of 6
perf_trace_locks_get_lock_context---of 6
posix_lock_file---of 1
posix_lock_inode---of 146
posix_locks_conflict---of 7
posix_test_lock---of 20
show_fd_locks---of 4
time_out_leases---of 21
trace_event_raw_event_filelock_lease---of 13
trace_event_raw_event_filelock_lock---of 13
trace_event_raw_event_generic_add_lease---of 10
trace_event_raw_event_leases_conflict---of 10
trace_event_raw_event_locks_get_lock_context---of 10
trace_raw_output_filelock_lease---of 4
trace_raw_output_filelock_lock---of 4
trace_raw_output_generic_add_lease---of 4
trace_raw_output_leases_conflict---of 4
trace_raw_output_locks_get_lock_context---of 4
vfs_cancel_lock---of 3
vfs_inode_has_locks---of 5
vfs_lock_file---of 4
vfs_setlease---of 7
vfs_test_lock---of 10

__get_vm_area_caller---of 1
__get_vm_area_node---of 12
__purge_vmap_area_lazy---of 113
__vmalloc---of 1
__vmalloc_node---of 3
__vmalloc_node_range---of 40
__vunmap---of 46
_vm_unmap_aliases.part.0---of 25
alloc_vmap_area---of 152
alloc_vmap_area.cold---of 1
find_vm_area---of 11
free_vm_area---of 3
free_vmap_area_noflush---of 6
free_vmap_area_rb_augment_cb_copy---of 1
free_vmap_area_rb_augment_cb_propagate---of 7
free_vmap_area_rb_augment_cb_rotate---of 5
free_vmap_block---of 4
free_work---of 2
get_order---of 1
get_vm_area---of 3
get_vm_area_caller---of 3
insert_vmap_area.constprop.0---of 17
insert_vmap_area_augment.constprop.0---of 34
is_vmalloc_addr40%of 5
is_vmalloc_or_module_addr---of 3
map_kernel_range---of 1
map_kernel_range_noflush---of 51
pcpu_free_vm_areas---of 5
pcpu_get_vm_areas---of 250
purge_fragmented_blocks---of 23
purge_vmap_area_lazy---of 4
pvm_determine_end_from_reverse---of 7
register_vmap_purge_notifier---of 1
remap_vmalloc_range---of 1
remap_vmalloc_range_partial---of 22
remove_vm_area---of 12
s_next---of 1
s_show---of 33
s_start---of 1
s_stop---of 1
set_iounmap_nonlazy---of 1
unmap_kernel_range---of 1
unmap_kernel_range_noflush---of 55
unregister_vmap_purge_notifier---of 1
vfree---of 9
vfree_atomic---of 5
vm_map_ram---of 132
vm_unmap_aliases---of 2
vm_unmap_ram---of 27
vmalloc---of 1
vmalloc_32---of 1
vmalloc_32_user---of 3
vmalloc_node---of 1
vmalloc_nr_pages---of 1
vmalloc_to_page---of 20
vmalloc_to_pfn---of 1
vmalloc_user---of 3
vmap---of 8
vmap_pfn---of 7
vmap_pfn_apply---of 44
vread---of 33
vunmap---of 4
vwrite---of 27
vzalloc---of 1
vzalloc_node---of 1

__ata_port_probe---of 1
__ata_qc_complete---of 18
__traceiter_ata_eh_link_autopsy---of 4
__traceiter_ata_eh_link_autopsy_qc---of 4
__traceiter_ata_qc_complete_done---of 4
__traceiter_ata_qc_complete_failed---of 4
__traceiter_ata_qc_complete_internal---of 4
__traceiter_ata_qc_issue---of 4
async_port_probe---of 7
ata_build_rw_tf30%of 27
ata_bus_probe---of 43
ata_cable_40wire---of 1
ata_cable_80wire---of 1
ata_cable_ignore---of 1
ata_cable_sata---of 1
ata_cable_unknown---of 1
ata_dev_blacklisted---of 8
ata_dev_classify---of 6
ata_dev_configure---of 275
ata_dev_configure.cold---of 48
ata_dev_init---of 5
ata_dev_next---of 19
ata_dev_pair---of 7
ata_dev_phys_link---of 5
ata_dev_printk---of 1
ata_dev_read_id---of 52
ata_dev_read_id.cold---of 7
ata_dev_reread_id---of 4
ata_dev_revalidate---of 12
ata_dev_revalidate.cold---of 11
ata_dev_same_device---of 4
ata_dev_same_device.cold---of 3
ata_dev_set_feature---of 5
ata_devres_release---of 10
ata_do_dev_read_id---of 3
ata_do_set_mode---of 137
ata_do_set_mode.cold---of 13
ata_down_xfermask_limit---of 21
ata_down_xfermask_limit.cold---of 10
ata_dummy_qc_issue---of 1
ata_exec_internal---of 5
ata_exec_internal_sg---of 43
ata_exec_internal_sg.cold---of 1
ata_finalize_port_ops---of 14
ata_force_cbl---of 6
ata_force_cbl.cold---of 1
ata_host_activate---of 15
ata_host_alloc---of 11
ata_host_alloc_pinfo---of 9
ata_host_detach---of 3
ata_host_get---of 5
ata_host_init---of 1
ata_host_put---of 5
ata_host_register---of 21
ata_host_register.cold---of 10
ata_host_release---of 5
ata_host_resume---of 1
ata_host_start---of 3
ata_host_start.part.0---of 31
ata_host_start.part.0.cold---of 1
ata_host_stop---of 8
ata_host_suspend---of 1
ata_id_c_string---of 5
ata_id_c_string.cold---of 1
ata_id_n_sectors---of 12
ata_id_string---of 4
ata_id_xfermask---of 17
ata_identify_page_supported---of 10
ata_identify_page_supported.cold---of 1
ata_link_init---of 1
ata_link_next---of 18
ata_link_offline---of 10
ata_link_online---of 10
ata_link_printk---of 5
ata_mode_string---of 4
ata_msleep---of 8
ata_noop_qc_prep---of 1
ata_pack_xfermask---of 1
ata_pci_device_do_resume---of 3
ata_pci_device_do_resume.cold---of 1
ata_pci_device_do_suspend---of 3
ata_pci_device_resume---of 3
ata_pci_device_resume.cold---of 1
ata_pci_device_suspend---of 3
ata_pci_remove_one---of 3
ata_pci_shutdown_one---of 6
ata_phys_link_offline---of 3
ata_phys_link_online---of 3
ata_pio_need_iordy---of 9
ata_platform_remove_one---of 3
ata_port_alloc---of 3
ata_port_detach---of 22
ata_port_pm_freeze---of 5
ata_port_pm_poweroff---of 1
ata_port_pm_resume---of 1
ata_port_pm_suspend---of 5
ata_port_printk---of 1
ata_port_probe---of 5
ata_port_request_pm---of 12
ata_port_runtime_idle---of 6
ata_port_runtime_resume---of 1
ata_port_runtime_suspend---of 1
ata_print_version---of 1
ata_qc_complete---of 56
ata_qc_complete_internal---of 1
ata_qc_free---of 5
ata_qc_get_active---of 3
ata_qc_issue56%of 36
ata_qc_new_init78%of 9
ata_ratelimit---of 1
ata_read_log_page---of 3
ata_read_log_page.part.0---of 14
ata_read_log_page.part.0.cold---of 1
ata_rwcmd_protocol75%of 8
ata_sas_port_resume---of 1
ata_sas_port_suspend---of 1
ata_sg_init100%of 1
ata_std_postreset---of 5
ata_std_postreset.cold---of 5
ata_std_prereset---of 10
ata_std_prereset.cold---of 1
ata_std_qc_defer---of 7
ata_tf_read_block---of 8
ata_tf_read_block.cold---of 1
ata_tf_to_lba---of 1
ata_tf_to_lba48---of 1
ata_timing_cycle2mode---of 18
ata_unpack_xfermask---of 6
ata_wait_after_reset---of 5
ata_wait_ready---of 23
ata_wait_ready.cold---of 1
ata_wait_register---of 4
ata_xfer_mask2mode---of 6
ata_xfer_mode2mask---of 6
ata_xfer_mode2shift---of 5
atapi_check_dma---of 5
atapi_cmd_type---of 5
get_order---of 1
pci_test_config_bits---of 6
perf_trace_ata_eh_link_autopsy---of 6
perf_trace_ata_eh_link_autopsy_qc---of 6
perf_trace_ata_qc_complete_template---of 6
perf_trace_ata_qc_issue---of 6
sata_down_spd_limit---of 11
sata_down_spd_limit.cold---of 6
sata_link_init_spd---of 17
sata_link_init_spd.cold---of 2
sata_spd_string---of 3
sata_std_hardreset---of 3
trace_event_raw_event_ata_eh_link_autopsy---of 10
trace_event_raw_event_ata_eh_link_autopsy_qc---of 10
trace_event_raw_event_ata_qc_complete_template---of 10
trace_event_raw_event_ata_qc_issue---of 10
trace_raw_output_ata_eh_link_autopsy---of 4
trace_raw_output_ata_eh_link_autopsy_qc---of 4
trace_raw_output_ata_qc_complete_template---of 4
trace_raw_output_ata_qc_issue---of 4

count_shadow_nodes---of 31
scan_shadow_nodes---of 1
shadow_lru_isolate---of 9
workingset_activation62%of 18
workingset_age_nonresident82%of 11
workingset_eviction---of 28
workingset_refault---of 64
workingset_update_node50%of 8

____fput100%of 1
__alloc_file75%of 8
__fput63%of 35
__fput_sync---of 4
alloc_empty_file58%of 7
alloc_empty_file.cold---of 1
alloc_empty_file_noaccount---of 3
alloc_file92%of 12
alloc_file_clone---of 3
alloc_file_pseudo84%of 6
delayed_fput---of 2
file_free_rcu---of 4
flush_delayed_fput---of 2
fput100%of 1
fput_many67%of 6
get_max_files---of 1
proc_nr_files---of 1

__blk_mq_alloc_map_and_request---of 4
__blk_mq_alloc_request65%of 17
__blk_mq_complete_request_remote---of 3
__blk_mq_delay_run_hw_queue37%of 19
__blk_mq_end_request---of 17
__blk_mq_free_request---of 8
__blk_mq_insert_request---of 20
__blk_mq_requeue_request---of 19
__blk_mq_run_hw_queue45%of 9
__blk_mq_try_issue_directly48%of 25
blk_add_rq_to_plug60%of 5
blk_done_softirq---of 5
blk_freeze_queue---of 1
blk_freeze_queue_start---of 4
blk_mq_add_to_requeue_list---of 7
blk_mq_alloc_request---of 6
blk_mq_alloc_request_hctx---of 17
blk_mq_alloc_rq_map---of 8
blk_mq_alloc_rqs---of 23
blk_mq_alloc_tag_set---of 50
blk_mq_alloc_tag_set.cold---of 2
blk_mq_check_expired---of 12
blk_mq_check_inflight---of 5
blk_mq_complete_request---of 2
blk_mq_complete_request_remote---of 13
blk_mq_delay_kick_requeue_list---of 1
blk_mq_delay_run_hw_queue100%of 1
blk_mq_delay_run_hw_queues---of 5
blk_mq_dequeue_from_ctx---of 18
blk_mq_dispatch_rq_list57%of 80
blk_mq_dispatch_wake---of 3
blk_mq_end_request---of 19
blk_mq_exit_hctx---of 17
blk_mq_exit_queue---of 9
blk_mq_flush_busy_ctxs---of 10
blk_mq_flush_plug_list68%of 28
blk_mq_free_map_and_requests---of 3
blk_mq_free_request---of 22
blk_mq_free_rq_map---of 1
blk_mq_free_rqs---of 17
blk_mq_free_tag_set---of 7
blk_mq_freeze_queue---of 1
blk_mq_freeze_queue_wait---of 6
blk_mq_freeze_queue_wait_timeout---of 12
blk_mq_get_driver_tag38%of 27
blk_mq_has_request---of 4
blk_mq_hctx_mark_pending---of 2
blk_mq_hctx_notify_dead---of 12
blk_mq_hctx_notify_offline---of 30
blk_mq_hctx_notify_online---of 6
blk_mq_in_flight---of 1
blk_mq_in_flight_rw---of 1
blk_mq_init_allocated_queue---of 31
blk_mq_init_queue---of 4
blk_mq_init_queue_data---of 4
blk_mq_init_sq_queue---of 7
blk_mq_insert_requests---of 16
blk_mq_kick_requeue_list---of 1
blk_mq_map_swqueue---of 28
blk_mq_poll_stats_bkt---of 4
blk_mq_poll_stats_fn---of 4
blk_mq_put_rq_ref---of 7
blk_mq_queue_inflight---of 1
blk_mq_queue_stopped---of 5
blk_mq_quiesce_queue---of 6
blk_mq_quiesce_queue_nowait---of 1
blk_mq_realloc_hw_ctxs---of 56
blk_mq_realloc_hw_ctxs.cold---of 1
blk_mq_realloc_tag_set_tags.part.0---of 6
blk_mq_release---of 9
blk_mq_request_bypass_insert---of 5
blk_mq_request_issue_directly---of 1
blk_mq_requeue_request---of 5
blk_mq_requeue_work---of 11
blk_mq_rq_cpu---of 1
blk_mq_rq_ctx_init.constprop.077%of 21
blk_mq_rq_inflight---of 4
blk_mq_run_hw_queue100%of 10
blk_mq_run_hw_queues---of 5
blk_mq_run_work_fn---of 3
blk_mq_start_hw_queue---of 1
blk_mq_start_hw_queues---of 2
blk_mq_start_request74%of 15
blk_mq_start_stopped_hw_queue---of 3
blk_mq_start_stopped_hw_queues---of 6
blk_mq_stop_hw_queue---of 1
blk_mq_stop_hw_queues---of 2
blk_mq_submit_bio47%of 94
blk_mq_tag_to_rq---of 3
blk_mq_timeout_work---of 13
blk_mq_trigger_softirq---of 7
blk_mq_try_issue_directly58%of 7
blk_mq_try_issue_list_directly---of 33
blk_mq_unfreeze_queue---of 5
blk_mq_unquiesce_queue---of 1
blk_mq_update_nr_hw_queues---of 37
blk_mq_update_nr_hw_queues.cold---of 1
blk_mq_update_nr_requests---of 16
blk_mq_update_queue_map---of 17
blk_mq_update_tag_set_shared---of 10
blk_mq_wake_waiters---of 5
blk_poll---of 45
blk_softirq_cpu_dead---of 5
hctx_lock45%of 9
hctx_unlock42%of 12
percpu_ref_tryget_many.constprop.0---of 19
plug_rq_cmp---of 6

___d_drop54%of 13
__d_alloc60%of 15
__d_drop---of 2
__d_free---of 1
__d_free_external---of 1
__d_instantiate62%of 13
__d_instantiate_anon---of 20
__d_lookup33%of 34
__d_lookup_done50%of 10
__d_lookup_rcu68%of 25
__d_move---of 76
__d_obtain_alias---of 11
__d_rehash50%of 10
__dentry_kill75%of 24
__dput_to_list---of 8
__lock_parent---of 18
d_add35%of 23
d_add_ci---of 10
d_alloc80%of 5
d_alloc_anon---of 1
d_alloc_cursor---of 5
d_alloc_name---of 1
d_alloc_parallel18%of 128
d_alloc_pseudo100%of 3
d_ancestor---of 4
d_delete---of 5
d_drop---of 3
d_exact_alias---of 20
d_exchange---of 9
d_find_alias---of 13
d_find_any_alias---of 3
d_flags_for_inode85%of 13
d_genocide---of 1
d_genocide_kill---of 8
d_hash_and_lookup---of 5
d_instantiate75%of 4
d_instantiate_anon---of 1
d_instantiate_new58%of 7
d_invalidate---of 12
d_lookup75%of 8
d_lru_add50%of 6
d_lru_del---of 6
d_lru_shrink_move---of 5
d_make_root67%of 6
d_mark_dontcache---of 5
d_move---of 1
d_obtain_alias---of 1
d_obtain_root---of 1
d_prune_aliases---of 13
d_rehash---of 1
d_set_d_op64%of 19
d_set_fallthru---of 1
d_set_mounted90%of 10
d_shrink_del---of 3
d_splice_alias43%of 52
d_splice_alias.cold---of 1
d_tmpfile---of 6
d_walk---of 60
dentry_free63%of 8
dentry_lru_isolate---of 11
dentry_lru_isolate_shrink---of 3
dentry_unlink_inode67%of 15
dget_parent---of 40
dput46%of 79
dput_to_list29%of 46
find_submount---of 4
is_subdir---of 30
path_check_mount---of 5
path_has_submounts---of 1
proc_nr_dentry---of 9
prune_dcache_sb---of 1
read_word_at_a_time100%of 1
release_dentry_name_snapshot---of 4
select_collect---of 12
select_collect2---of 20
shrink_dcache_for_umount---of 10
shrink_dcache_parent---of 27
shrink_dcache_sb---of 8
shrink_dentry_list4%of 30
shrink_lock_dentry.part.0---of 14
take_dentry_name_snapshot---of 4
umount_check---of 4
umount_check.cold---of 3

__blk_should_fake_timeout---of 1
blk_abort_request---of 1
blk_add_timer60%of 10
blk_rq_timeout---of 2
part_timeout_show---of 1
part_timeout_store---of 6

cn_proc_mcast_ctl---of 23
proc_comm_connector---of 16
proc_coredump_connector---of 35
proc_exec_connector---of 16
proc_exit_connector---of 35
proc_fork_connector10%of 33
proc_id_connector---of 44
proc_ptrace_connector---of 20
proc_sid_connector---of 16

build_aevent---of 22
copy_from_user_policy100%of 1
copy_sec_ctx---of 3
copy_templates---of 2
copy_to_user_policy---of 1
copy_to_user_state---of 1
copy_to_user_state_extra---of 39
copy_to_user_tmpl.part.0---of 3
dump_one_policy---of 17
dump_one_state---of 9
get_order---of 1
validate_tmpl.part.0---of 10
verify_newpolicy_info.isra.073%of 11
xfrm_add_acquire---of 15
xfrm_add_pol_expire---of 26
xfrm_add_policy43%of 14
xfrm_add_sa---of 147
xfrm_add_sa_expire---of 13
xfrm_alloc_userspi---of 27
xfrm_compile_policy---of 14
xfrm_del_sa---of 13
xfrm_do_migrate---of 1
xfrm_dump_policy---of 1
xfrm_dump_policy_done---of 1
xfrm_dump_policy_start---of 1
xfrm_dump_sa---of 16
xfrm_dump_sa_done---of 4
xfrm_flush_policy---of 6
xfrm_flush_sa---of 4
xfrm_get_ae---of 20
xfrm_get_default---of 7
xfrm_get_policy---of 33
xfrm_get_sa---of 10
xfrm_get_sadinfo---of 14
xfrm_get_spdinfo---of 23
xfrm_is_alive---of 20
xfrm_netlink_rcv100%of 1
xfrm_new_ae---of 24
xfrm_policy_construct50%of 16
xfrm_send_acquire---of 31
xfrm_send_mapping---of 16
xfrm_send_migrate---of 1
xfrm_send_policy_notify---of 72
xfrm_send_policy_notify.cold---of 1
xfrm_send_report---of 20
xfrm_send_state_notify---of 95
xfrm_send_state_notify.cold---of 1
xfrm_set_default---of 36
xfrm_set_spdinfo---of 18
xfrm_state_netlink---of 4
xfrm_update_ae_params---of 15
xfrm_user_net_exit---of 4
xfrm_user_net_init---of 3
xfrm_user_rcv_msg60%of 22
xfrm_user_state_lookup.constprop.0---of 14

__virt_addr_valid53%of 40

__fsnotify_inode_delete100%of 1
__fsnotify_parent36%of 51
__fsnotify_vfsmount_delete---of 1
fsnotify58%of 118
fsnotify_first_mark65%of 17
fsnotify_handle_inode_event.isra.045%of 18
fsnotify_sb_delete---of 15
fsnotify_set_children_dentry_flags---of 10

iint_init_once---of 1
integrity_iint_find---of 10
integrity_inode_free12%of 9
integrity_inode_get---of 16
integrity_inode_get.cold---of 1
integrity_kernel_read---of 1

__do_sys_adjtimex---of 4
__do_sys_adjtimex_time32---of 5
__ia32_compat_sys_gettimeofday---of 8
__ia32_compat_sys_settimeofday---of 12
__ia32_sys_adjtimex---of 1
__ia32_sys_adjtimex_time32---of 1
__ia32_sys_gettimeofday---of 8
__ia32_sys_settimeofday---of 12
__ia32_sys_stime---of 5
__ia32_sys_stime32---of 5
__ia32_sys_time---of 3
__ia32_sys_time32---of 4
__msecs_to_jiffies100%of 3
__usecs_to_jiffies---of 3
__x64_sys_adjtimex---of 1
__x64_sys_adjtimex_time32---of 1
__x64_sys_gettimeofday---of 8
__x64_sys_settimeofday---of 12
__x64_sys_stime---of 5
__x64_sys_stime32---of 5
__x64_sys_time---of 3
__x64_sys_time32---of 4
clock_t_to_jiffies---of 3
do_sys_settimeofday64---of 15
get_itimerspec64---of 3
get_old_itimerspec32---of 5
get_old_timespec32---of 3
get_old_timex32---of 3
get_timespec64---of 5
jiffies64_to_msecs---of 1
jiffies64_to_nsecs---of 1
jiffies_64_to_clock_t---of 1
jiffies_to_clock_t---of 1
jiffies_to_msecs---of 1
jiffies_to_timespec64---of 1
jiffies_to_usecs---of 1
mktime64100%of 3
ns_to_kernel_old_timeval---of 5
ns_to_timespec64---of 5
nsec_to_clock_t---of 1
nsecs_to_jiffies---of 1
nsecs_to_jiffies64---of 1
put_itimerspec64---of 5
put_old_itimerspec32---of 5
put_old_timespec32---of 2
put_old_timex32---of 2
put_timespec64---of 2
set_normalized_timespec64---of 5
timespec64_add_safe---of 7
timespec64_to_jiffies---of 3

__blk_add_trace.constprop.0---of 36
__blk_trace_remove---of 12
__blk_trace_setup---of 5
__blk_trace_startstop---of 15
__trace_note_message---of 10
blk_add_driver_data---of 28
blk_add_trace_bio39%of 26
blk_add_trace_bio_backmerge---of 1
blk_add_trace_bio_bounce---of 1
blk_add_trace_bio_complete---of 1
blk_add_trace_bio_frontmerge---of 1
blk_add_trace_bio_queue100%of 1
blk_add_trace_bio_remap---of 27
blk_add_trace_getrq10%of 22
blk_add_trace_plug50%of 20
blk_add_trace_rq.constprop.033%of 31
blk_add_trace_rq_complete---of 1
blk_add_trace_rq_insert100%of 1
blk_add_trace_rq_issue100%of 1
blk_add_trace_rq_merge---of 1
blk_add_trace_rq_remap---of 27
blk_add_trace_rq_requeue---of 1
blk_add_trace_sleeprq---of 22
blk_add_trace_split---of 20
blk_add_trace_unplug48%of 21
blk_create_buf_file_callback---of 1
blk_dropped_read---of 4
blk_dropped_read.cold---of 1
blk_fill_rwbs---of 18
blk_log_action---of 6
blk_log_action_classic---of 1
blk_log_dump_pdu---of 15
blk_log_generic---of 6
blk_log_plug---of 1
blk_log_remap---of 2
blk_log_split---of 2
blk_log_unplug---of 2
blk_log_with_error---of 5
blk_msg_write---of 5
blk_register_tracepoints---of 34
blk_remove_buf_file_callback---of 1
blk_subbuf_start_callback---of 4
blk_trace_event_print---of 1
blk_trace_event_print_binary---of 1
blk_trace_free---of 1
blk_trace_init_sysfs---of 1
blk_trace_ioctl---of 8
blk_trace_remove---of 1
blk_trace_remove_sysfs---of 1
blk_trace_setup---of 1
blk_trace_setup_queue---of 11
blk_trace_shutdown---of 8
blk_trace_startstop---of 1
blk_tracer_init---of 1
blk_tracer_print_header---of 2
blk_tracer_print_line---of 5
blk_tracer_reset---of 1
blk_tracer_set_flag---of 7
blk_tracer_start---of 1
blk_tracer_stop---of 1
blk_unregister_tracepoints---of 1
compat_blk_trace_setup---of 5
do_blk_trace_setup---of 37
do_blk_trace_setup.cold---of 2
fill_rwbs---of 20
print_one_line---of 11
sysfs_blk_trace_attr_show---of 29
sysfs_blk_trace_attr_store---of 63
trace_note.constprop.0---of 12

__scsi_execute---of 17
__scsi_init_queue---of 14
device_block---of 7
device_quiesce_fn---of 1
device_resume_fn---of 5
device_unblock---of 1
get_order---of 1
scsi_alloc_sgtables40%of 30
scsi_block_requests---of 1
scsi_cleanup_rq---of 7
scsi_cmd_runtime_exceeced---of 5
scsi_commit_rqs---of 1
scsi_dec_host_busy---of 19
scsi_device_from_queue---of 5
scsi_device_quiesce---of 9
scsi_device_resume---of 5
scsi_device_set_state---of 12
scsi_device_unbusy---of 3
scsi_end_request---of 45
scsi_evt_thread---of 21
scsi_exit_queue---of 1
scsi_free_sgtables---of 5
scsi_host_block---of 11
scsi_host_unblock---of 6
scsi_init_command78%of 9
scsi_init_sense_cache---of 6
scsi_initialize_rq---of 1
scsi_internal_device_block_nowait---of 7
scsi_internal_device_unblock_nowait---of 8
scsi_io_completion---of 78
scsi_kmap_atomic_sg---of 10
scsi_kmap_atomic_sg.cold---of 2
scsi_kunmap_atomic_sg---of 1
scsi_map_queues---of 5
scsi_mode_select---of 12
scsi_mode_sense---of 39
scsi_mq_alloc_queue---of 3
scsi_mq_done---of 13
scsi_mq_exit_request---of 6
scsi_mq_free_tags---of 1
scsi_mq_get_budget50%of 8
scsi_mq_init_request---of 13
scsi_mq_lld_busy---of 7
scsi_mq_put_budget100%of 1
scsi_mq_requeue_cmd---of 9
scsi_mq_setup_tags---of 7
scsi_queue_insert---of 1
scsi_queue_rq30%of 113
scsi_requeue_run_queue---of 1
scsi_result_to_blk_status---of 15
scsi_run_host_queues---of 2
scsi_run_queue---of 26
scsi_run_queue_async---of 6
scsi_set_blocked---of 4
scsi_softirq_done---of 9
scsi_start_queue---of 1
scsi_target_block---of 3
scsi_target_quiesce---of 1
scsi_target_resume---of 1
scsi_target_unblock---of 4
scsi_test_unit_ready---of 13
scsi_timeout---of 3
scsi_unblock_requests---of 2
scsi_uninit_cmd---of 3
scsi_vpd_lun_id---of 74
scsi_vpd_tpg_id---of 35
sdev_disable_disk_events---of 1
sdev_enable_disk_events---of 3
sdev_evt_alloc---of 5
sdev_evt_send---of 1
sdev_evt_send_simple---of 5
target_block---of 4
target_unblock---of 4

__change_pid---of 9
__ia32_sys_pidfd_getfd---of 9
__ia32_sys_pidfd_open---of 5
__task_pid_nr_ns69%of 29
__x64_sys_pidfd_getfd---of 9
__x64_sys_pidfd_open---of 5
alloc_pid52%of 43
attach_pid60%of 5
change_pid---of 5
cpumask_weight.constprop.0---of 1
delayed_put_pid---of 2
detach_pid---of 1
disable_pid_allocation---of 1
exchange_tids---of 5
find_ge_pid---of 1
find_get_pid---of 21
find_get_task_by_vpid---of 21
find_pid_ns---of 1
find_task_by_pid_ns50%of 6
find_task_by_vpid---of 3
find_vpid---of 3
free_pid50%of 8
get_pid_task58%of 19
get_task_pid56%of 27
pid_nr_ns---of 6
pid_task64%of 11
pid_vnr100%of 7
pidfd_create---of 12
pidfd_get_pid---of 12
pidfd_getfd---of 15
put_pid100%of 2
put_pid.part.060%of 5
task_active_pid_ns100%of 3
transfer_pid---of 5

__ata_sff_port_intr---of 18
ata_bmdma_dumb_qc_prep---of 12
ata_bmdma_error_handler---of 18
ata_bmdma_interrupt---of 26
ata_bmdma_irq_clear---of 2
ata_bmdma_nodma---of 1
ata_bmdma_port_intr---of 10
ata_bmdma_port_start---of 4
ata_bmdma_port_start32---of 4
ata_bmdma_post_internal_cmd---of 2
ata_bmdma_qc_issue50%of 12
ata_bmdma_qc_prep90%of 10
ata_bmdma_setup100%of 3
ata_bmdma_start100%of 1
ata_bmdma_status---of 1
ata_bmdma_stop---of 5
ata_dev_select.constprop.067%of 9
ata_devchk---of 3
ata_hsm_qc_complete---of 12
ata_pci_bmdma_clear_simplex---of 3
ata_pci_bmdma_init---of 12
ata_pci_bmdma_init.cold---of 3
ata_pci_bmdma_init_one---of 1
ata_pci_bmdma_prepare_host---of 4
ata_pci_init_one---of 15
ata_pci_init_one.cold---of 1
ata_pci_sff_activate_host---of 24
ata_pci_sff_init_host---of 12
ata_pci_sff_init_host.cold---of 4
ata_pci_sff_init_one---of 1
ata_pci_sff_prepare_host---of 7
ata_pci_sff_prepare_host.cold---of 1
ata_pio_sector---of 13
ata_pio_sectors---of 12
ata_pio_xfer---of 4
ata_sff_busy_sleep---of 22
ata_sff_check_ready---of 3
ata_sff_check_status100%of 1
ata_sff_data_xfer---of 7
ata_sff_data_xfer32---of 15
ata_sff_dev_classify---of 13
ata_sff_dev_select84%of 6
ata_sff_dma_pause---of 5
ata_sff_dma_pause.part.0---of 1
ata_sff_drain_fifo---of 8
ata_sff_error_handler---of 9
ata_sff_exec_command80%of 5
ata_sff_exit---of 1
ata_sff_flush_pio_task---of 2
ata_sff_freeze---of 6
ata_sff_hsm_move---of 93
ata_sff_interrupt---of 26
ata_sff_irq_on---of 11
ata_sff_lost_interrupt---of 10
ata_sff_pause---of 5
ata_sff_pio_task---of 20
ata_sff_port_init---of 1
ata_sff_port_intr---of 1
ata_sff_postreset---of 11
ata_sff_prereset---of 7
ata_sff_qc_fill_rtf---of 1
ata_sff_qc_issue30%of 17
ata_sff_queue_delayed_work---of 1
ata_sff_queue_pio_task---of 4
ata_sff_queue_work---of 1
ata_sff_softreset---of 13
ata_sff_std_ports---of 1
ata_sff_tf_load43%of 19
ata_sff_tf_read---of 4
ata_sff_thaw---of 3
ata_sff_wait_after_reset---of 13
ata_sff_wait_ready---of 1
sata_sff_hardreset---of 4

____sys_recvmsg---of 27
____sys_sendmsg25%of 32
___sys_recvmsg---of 9
___sys_sendmsg100%of 3
__copy_msghdr_from_user40%of 23
__do_sys_socketcall---of 29
__ia32_sys_accept---of 1
__ia32_sys_accept4---of 1
__ia32_sys_bind---of 1
__ia32_sys_connect---of 1
__ia32_sys_getpeername---of 1
__ia32_sys_getsockname---of 1
__ia32_sys_getsockopt---of 1
__ia32_sys_listen---of 1
__ia32_sys_recv---of 1
__ia32_sys_recvfrom---of 1
__ia32_sys_recvmmsg---of 9
__ia32_sys_recvmmsg_time32---of 9
__ia32_sys_recvmsg---of 1
__ia32_sys_send---of 1
__ia32_sys_sendmmsg---of 1
__ia32_sys_sendmsg---of 1
__ia32_sys_sendto---of 1
__ia32_sys_setsockopt---of 1
__ia32_sys_shutdown---of 1
__ia32_sys_socket---of 1
__ia32_sys_socketcall---of 1
__ia32_sys_socketpair---of 1
__sock_create52%of 43
__sock_create.cold---of 2
__sock_recv_timestamp---of 48
__sock_recv_ts_and_drops---of 15
__sock_recv_wifi_status---of 4
__sock_release---of 9
__sock_release.cold---of 1
__sock_sendmsg67%of 9
__sock_tx_timestamp---of 7
__sys_accept4---of 4
__sys_accept4_file---of 6
__sys_bind---of 11
__sys_connect---of 9
__sys_connect_file---of 5
__sys_getpeername---of 10
__sys_getsockname---of 7
__sys_getsockopt---of 12
__sys_listen---of 7
__sys_recvfrom---of 13
__sys_recvmmsg---of 15
__sys_recvmsg---of 8
__sys_recvmsg_sock---of 1
__sys_sendmmsg---of 26
__sys_sendmsg88%of 8
__sys_sendmsg_sock---of 1
__sys_sendto---of 16
__sys_setsockopt---of 13
__sys_shutdown---of 7
__sys_shutdown_sock---of 3
__sys_socket75%of 8
__sys_socketpair---of 23
__x64_sys_accept---of 1
__x64_sys_accept4---of 1
__x64_sys_bind---of 1
__x64_sys_connect---of 1
__x64_sys_getpeername---of 1
__x64_sys_getsockname---of 1
__x64_sys_getsockopt---of 1
__x64_sys_listen---of 1
__x64_sys_recv---of 1
__x64_sys_recvfrom---of 1
__x64_sys_recvmmsg---of 9
__x64_sys_recvmmsg_time32---of 9
__x64_sys_recvmsg---of 1
__x64_sys_send---of 1
__x64_sys_sendmmsg---of 1
__x64_sys_sendmsg100%of 1
__x64_sys_sendto---of 1
__x64_sys_setsockopt---of 1
__x64_sys_shutdown---of 1
__x64_sys_socket100%of 1
__x64_sys_socketcall---of 1
__x64_sys_socketpair---of 1
brioctl_set---of 1
compat_ifr_data_ioctl---of 5
compat_sock_ioctl---of 53
copy_overflow---of 1
dlci_ioctl_set---of 1
do_accept---of 12
do_recvmmsg---of 32
init_once---of 1
kernel_accept---of 5
kernel_bind---of 2
kernel_bind.cold---of 1
kernel_connect---of 2
kernel_connect.cold---of 1
kernel_getpeername---of 1
kernel_getsockname---of 1
kernel_listen---of 1
kernel_recvmsg---of 1
kernel_sendmsg---of 1
kernel_sendmsg_locked---of 4
kernel_sendpage---of 4
kernel_sendpage.part.0---of 9
kernel_sendpage_locked---of 4
kernel_sock_ip_overhead---of 30
kernel_sock_shutdown---of 1
move_addr_to_kernel---of 4
move_addr_to_kernel.part.0---of 5
move_addr_to_user---of 11
recvmsg_copy_msghdr---of 6
sendmsg_copy_msghdr84%of 6
sock_alloc100%of 3
sock_alloc_file75%of 8
sock_alloc_inode100%of 3
sock_close---of 1
sock_create---of 1
sock_create_kern---of 1
sock_create_lite---of 5
sock_do_ioctl---of 15
sock_fasync---of 6
sock_free_inode---of 1
sock_from_file---of 5
sock_ioctl---of 34
sock_is_registered---of 3
sock_mmap---of 1
sock_poll---of 15
sock_read_iter---of 7
sock_recvmsg---of 7
sock_register---of 10
sock_register.cold---of 1
sock_release---of 6
sock_release.cold---of 1
sock_sendmsg---of 12
sock_sendmsg.cold---of 1
sock_sendpage---of 4
sock_show_fdinfo---of 2
sock_splice_read---of 4
sock_unregister---of 3
sock_wake_async---of 8
sock_write_iter---of 8
socket_seq_show---of 1
sockfd_lookup---of 8
sockfd_lookup_light63%of 8
sockfs_dname---of 1
sockfs_init_fs_context---of 3
sockfs_listxattr---of 9
sockfs_security_xattr_set---of 1
sockfs_setattr---of 5
sockfs_xattr_get---of 5
vlan_ioctl_set---of 1

__ext4_abort---of 6
__ext4_abort.cold---of 7
__ext4_error---of 17
__ext4_error.cold---of 1
__ext4_error_file---of 17
__ext4_error_file.cold---of 4
__ext4_error_inode---of 17
__ext4_error_inode.cold---of 4
__ext4_grp_locked_error---of 24
__ext4_grp_locked_error.cold---of 5
__ext4_msg---of 3
__ext4_msg.cold---of 1
__ext4_sb_bread_gfp.isra.0---of 9
__ext4_std_error---of 13
__ext4_warning---of 3
__ext4_warning.cold---of 1
__ext4_warning_inode---of 3
__ext4_warning_inode.cold---of 1
__save_error_info---of 12
__traceiter_ext4_alloc_da_blocks---of 4
__traceiter_ext4_allocate_blocks---of 4
__traceiter_ext4_allocate_inode---of 4
__traceiter_ext4_begin_ordered_truncate---of 4
__traceiter_ext4_collapse_range---of 4
__traceiter_ext4_da_release_space---of 4
__traceiter_ext4_da_reserve_space---of 4
__traceiter_ext4_da_update_reserve_space---of 4
__traceiter_ext4_da_write_begin---of 4
__traceiter_ext4_da_write_end---of 4
__traceiter_ext4_da_write_pages---of 4
__traceiter_ext4_da_write_pages_extent---of 4
__traceiter_ext4_direct_IO_enter---of 4
__traceiter_ext4_direct_IO_exit---of 4
__traceiter_ext4_discard_blocks---of 4
__traceiter_ext4_discard_preallocations---of 4
__traceiter_ext4_drop_inode---of 4
__traceiter_ext4_error---of 4
__traceiter_ext4_es_cache_extent---of 4
__traceiter_ext4_es_find_extent_range_enter---of 4
__traceiter_ext4_es_find_extent_range_exit---of 4
__traceiter_ext4_es_insert_delayed_block---of 4
__traceiter_ext4_es_insert_extent---of 4
__traceiter_ext4_es_lookup_extent_enter---of 4
__traceiter_ext4_es_lookup_extent_exit---of 4
__traceiter_ext4_es_remove_extent---of 4
__traceiter_ext4_es_shrink---of 4
__traceiter_ext4_es_shrink_count---of 4
__traceiter_ext4_es_shrink_scan_enter---of 4
__traceiter_ext4_es_shrink_scan_exit---of 4
__traceiter_ext4_evict_inode---of 4
__traceiter_ext4_ext_convert_to_initialized_enter---of 4
__traceiter_ext4_ext_convert_to_initialized_fastpath---of 4
__traceiter_ext4_ext_handle_unwritten_extents---of 4
__traceiter_ext4_ext_in_cache---of 4
__traceiter_ext4_ext_load_extent---of 4
__traceiter_ext4_ext_map_blocks_enter---of 4
__traceiter_ext4_ext_map_blocks_exit---of 4
__traceiter_ext4_ext_put_in_cache---of 4
__traceiter_ext4_ext_remove_space---of 4
__traceiter_ext4_ext_remove_space_done---of 4
__traceiter_ext4_ext_rm_idx---of 4
__traceiter_ext4_ext_rm_leaf---of 4
__traceiter_ext4_ext_show_extent---of 4
__traceiter_ext4_fallocate_enter---of 4
__traceiter_ext4_fallocate_exit---of 4
__traceiter_ext4_fc_commit_start---of 4
__traceiter_ext4_fc_commit_stop---of 4
__traceiter_ext4_fc_replay---of 4
__traceiter_ext4_fc_replay_scan---of 4
__traceiter_ext4_fc_stats---of 4
__traceiter_ext4_fc_track_create---of 4
__traceiter_ext4_fc_track_inode---of 4
__traceiter_ext4_fc_track_link---of 4
__traceiter_ext4_fc_track_range---of 4
__traceiter_ext4_fc_track_unlink---of 4
__traceiter_ext4_find_delalloc_range---of 4
__traceiter_ext4_forget---of 4
__traceiter_ext4_free_blocks---of 4
__traceiter_ext4_free_inode---of 4
__traceiter_ext4_fsmap_high_key---of 4
__traceiter_ext4_fsmap_low_key---of 4
__traceiter_ext4_fsmap_mapping---of 4
__traceiter_ext4_get_implied_cluster_alloc_exit---of 4
__traceiter_ext4_get_reserved_cluster_alloc---of 4
__traceiter_ext4_getfsmap_high_key---of 4
__traceiter_ext4_getfsmap_low_key---of 4
__traceiter_ext4_getfsmap_mapping---of 4
__traceiter_ext4_ind_map_blocks_enter---of 4
__traceiter_ext4_ind_map_blocks_exit---of 4
__traceiter_ext4_insert_range---of 4
__traceiter_ext4_invalidatepage---of 4
__traceiter_ext4_journal_start---of 4
__traceiter_ext4_journal_start_reserved---of 4
__traceiter_ext4_journalled_invalidatepage---of 4
__traceiter_ext4_journalled_write_end---of 4
__traceiter_ext4_lazy_itable_init---of 4
__traceiter_ext4_load_inode---of 4
__traceiter_ext4_load_inode_bitmap---of 4
__traceiter_ext4_mark_inode_dirty---of 4
__traceiter_ext4_mb_bitmap_load---of 4
__traceiter_ext4_mb_buddy_bitmap_load---of 4
__traceiter_ext4_mb_discard_preallocations---of 4
__traceiter_ext4_mb_new_group_pa---of 4
__traceiter_ext4_mb_new_inode_pa---of 4
__traceiter_ext4_mb_release_group_pa---of 4
__traceiter_ext4_mb_release_inode_pa---of 4
__traceiter_ext4_mballoc_alloc---of 4
__traceiter_ext4_mballoc_discard---of 4
__traceiter_ext4_mballoc_free---of 4
__traceiter_ext4_mballoc_prealloc---of 4
__traceiter_ext4_nfs_commit_metadata---of 4
__traceiter_ext4_other_inode_update_time---of 4
__traceiter_ext4_prefetch_bitmaps---of 4
__traceiter_ext4_punch_hole---of 4
__traceiter_ext4_read_block_bitmap_load---of 4
__traceiter_ext4_readpage---of 4
__traceiter_ext4_releasepage---of 4
__traceiter_ext4_remove_blocks---of 4
__traceiter_ext4_request_blocks---of 4
__traceiter_ext4_request_inode---of 4
__traceiter_ext4_shutdown---of 4
__traceiter_ext4_sync_file_enter---of 4
__traceiter_ext4_sync_file_exit---of 4
__traceiter_ext4_sync_fs---of 4
__traceiter_ext4_trim_all_free---of 4
__traceiter_ext4_trim_extent---of 4
__traceiter_ext4_truncate_enter---of 4
__traceiter_ext4_truncate_exit---of 4
__traceiter_ext4_unlink_enter---of 4
__traceiter_ext4_unlink_exit---of 4
__traceiter_ext4_write_begin---of 4
__traceiter_ext4_write_end---of 4
__traceiter_ext4_writepage---of 4
__traceiter_ext4_writepages---of 4
__traceiter_ext4_writepages_result---of 4
__traceiter_ext4_zero_range---of 4
_ext4_show_options---of 113
bdev_try_to_free_page---of 8
clear_qf_name---of 10
descriptor_loc---of 9
ext4_acquire_dquot---of 9
ext4_alloc_flex_bg_array---of 32
ext4_alloc_inode100%of 3
ext4_block_bitmap100%of 3
ext4_block_bitmap_set---of 2
ext4_calculate_overhead---of 50
ext4_clear_inode---of 2
ext4_clear_journal_err---of 5
ext4_clear_request_list---of 5
ext4_commit_super---of 29
ext4_decode_error---of 8
ext4_destroy_inode---of 4
ext4_drop_inode---of 12
ext4_enable_quotas---of 28
ext4_feature_set_ok---of 15
ext4_fh_to_dentry---of 1
ext4_fh_to_parent---of 1
ext4_fill_super---of 642
ext4_fill_super.cold---of 3
ext4_force_commit---of 5
ext4_free_group_clusters100%of 3
ext4_free_group_clusters_set100%of 2
ext4_free_in_core_inode---of 2
ext4_free_in_core_inode.cold---of 1
ext4_free_inodes_count100%of 3
ext4_free_inodes_set100%of 2
ext4_freeze---of 7
ext4_get_dquots---of 1
ext4_get_journal_inode---of 10
ext4_group_desc_csum52%of 27
ext4_group_desc_csum_set80%of 5
ext4_group_desc_csum_verify---of 6
ext4_handle_error---of 10
ext4_handle_error.cold---of 1
ext4_init_journal_params---of 7
ext4_inode_bitmap100%of 3
ext4_inode_bitmap_set---of 2
ext4_inode_table100%of 3
ext4_inode_table_set---of 2
ext4_itable_unused_count100%of 3
ext4_itable_unused_set---of 2
ext4_journal_commit_callback---of 5
ext4_journal_finish_inode_data_buffers---of 4
ext4_journal_submit_inode_data_buffers---of 4
ext4_journalled_writepage_callback---of 10
ext4_lazyinit_thread---of 79
ext4_mark_dquot_dirty---of 6
ext4_mark_group_bitmap_corrupted---of 10
ext4_mark_recovery_complete.constprop.0---of 9
ext4_mount---of 1
ext4_nfs_commit_metadata---of 10
ext4_nfs_get_inode---of 6
ext4_put_super---of 39
ext4_put_super.cold---of 1
ext4_quota_off---of 13
ext4_quota_on---of 20
ext4_quota_read---of 14
ext4_quota_write---of 22
ext4_read_bh---of 14
ext4_read_bh_lock---of 7
ext4_read_bh_nowait---of 10
ext4_register_li_request---of 23
ext4_release_dquot---of 38
ext4_remount---of 114
ext4_sb_bread---of 1
ext4_sb_bread_unmovable---of 1
ext4_sb_breadahead_unmovable---of 4
ext4_seq_options_show---of 2
ext4_setup_super---of 25
ext4_setup_super.cold---of 1
ext4_show_options---of 1
ext4_statfs---of 31
ext4_superblock_csum_set67%of 9
ext4_sync_fs---of 27
ext4_unfreeze---of 9
ext4_unregister_li_request---of 5
ext4_update_dynamic_rev---of 2
ext4_used_dirs_count---of 3
ext4_used_dirs_set---of 2
ext4_write_dquot---of 8
ext4_write_info---of 5
init_once---of 1
parse_options---of 175
perf_trace_ext4__bitmap_load---of 6
perf_trace_ext4__es_extent---of 7
perf_trace_ext4__es_shrink_enter---of 6
perf_trace_ext4__fallocate_mode---of 6
perf_trace_ext4__map_blocks_enter---of 6
perf_trace_ext4__map_blocks_exit---of 6
perf_trace_ext4__mb_new_pa---of 6
perf_trace_ext4__mballoc---of 8
perf_trace_ext4__page_op---of 6
perf_trace_ext4__trim---of 6
perf_trace_ext4__truncate---of 6
perf_trace_ext4__write_begin---of 6
perf_trace_ext4__write_end---of 6
perf_trace_ext4_alloc_da_blocks---of 6
perf_trace_ext4_allocate_blocks---of 6
perf_trace_ext4_allocate_inode---of 6
perf_trace_ext4_begin_ordered_truncate---of 6
perf_trace_ext4_collapse_range---of 6
perf_trace_ext4_da_release_space---of 6
perf_trace_ext4_da_reserve_space---of 6
perf_trace_ext4_da_update_reserve_space---of 6
perf_trace_ext4_da_write_pages---of 6
perf_trace_ext4_da_write_pages_extent---of 6
perf_trace_ext4_direct_IO_enter---of 6
perf_trace_ext4_direct_IO_exit---of 6
perf_trace_ext4_discard_blocks---of 6
perf_trace_ext4_discard_preallocations---of 6
perf_trace_ext4_drop_inode---of 6
perf_trace_ext4_error---of 6
perf_trace_ext4_es_find_extent_range_enter---of 6
perf_trace_ext4_es_find_extent_range_exit---of 7
perf_trace_ext4_es_insert_delayed_block---of 7
perf_trace_ext4_es_lookup_extent_enter---of 6
perf_trace_ext4_es_lookup_extent_exit---of 7
perf_trace_ext4_es_remove_extent---of 6
perf_trace_ext4_es_shrink---of 6
perf_trace_ext4_es_shrink_scan_exit---of 6
perf_trace_ext4_evict_inode---of 6
perf_trace_ext4_ext_convert_to_initialized_enter---of 9
perf_trace_ext4_ext_convert_to_initialized_fastpath---of 12
perf_trace_ext4_ext_handle_unwritten_extents---of 6
perf_trace_ext4_ext_in_cache---of 6
perf_trace_ext4_ext_load_extent---of 6
perf_trace_ext4_ext_put_in_cache---of 6
perf_trace_ext4_ext_remove_space---of 6
perf_trace_ext4_ext_remove_space_done---of 6
perf_trace_ext4_ext_rm_idx---of 6
perf_trace_ext4_ext_rm_leaf---of 9
perf_trace_ext4_ext_show_extent---of 6
perf_trace_ext4_fallocate_exit---of 6
perf_trace_ext4_fc_commit_start---of 6
perf_trace_ext4_fc_commit_stop---of 6
perf_trace_ext4_fc_replay---of 6
perf_trace_ext4_fc_replay_scan---of 6
perf_trace_ext4_fc_stats---of 8
perf_trace_ext4_fc_track_create---of 6
perf_trace_ext4_fc_track_inode---of 6
perf_trace_ext4_fc_track_link---of 6
perf_trace_ext4_fc_track_range---of 6
perf_trace_ext4_fc_track_unlink---of 6
perf_trace_ext4_find_delalloc_range---of 6
perf_trace_ext4_forget---of 6
perf_trace_ext4_free_blocks---of 6
perf_trace_ext4_free_inode---of 6
perf_trace_ext4_fsmap_class---of 6
perf_trace_ext4_get_implied_cluster_alloc_exit---of 6
perf_trace_ext4_get_reserved_cluster_alloc---of 6
perf_trace_ext4_getfsmap_class---of 6
perf_trace_ext4_insert_range---of 6
perf_trace_ext4_invalidatepage_op---of 6
perf_trace_ext4_journal_start---of 6
perf_trace_ext4_journal_start_reserved---of 6
perf_trace_ext4_lazy_itable_init---of 6
perf_trace_ext4_load_inode---of 6
perf_trace_ext4_mark_inode_dirty---of 6
perf_trace_ext4_mb_discard_preallocations---of 6
perf_trace_ext4_mb_release_group_pa---of 6
perf_trace_ext4_mb_release_inode_pa---of 6
perf_trace_ext4_mballoc_alloc---of 6
perf_trace_ext4_mballoc_prealloc---of 6
perf_trace_ext4_nfs_commit_metadata---of 6
perf_trace_ext4_other_inode_update_time---of 6
perf_trace_ext4_prefetch_bitmaps---of 6
perf_trace_ext4_read_block_bitmap_load---of 6
perf_trace_ext4_remove_blocks---of 9
perf_trace_ext4_request_blocks---of 6
perf_trace_ext4_request_inode---of 6
perf_trace_ext4_shutdown---of 6
perf_trace_ext4_sync_file_enter---of 6
perf_trace_ext4_sync_file_exit---of 6
perf_trace_ext4_sync_fs---of 6
perf_trace_ext4_unlink_enter---of 6
perf_trace_ext4_unlink_exit---of 6
perf_trace_ext4_writepages---of 6
perf_trace_ext4_writepages_result---of 6
print_daily_error_info---of 5
print_daily_error_info.cold---of 10
ratelimit_state_init---of 1
rcu_lock_acquire.constprop.0---of 1
rcu_read_unlock---of 7
set_qf_name---of 20
trace_event_raw_event_ext4__bitmap_load---of 10
trace_event_raw_event_ext4__es_extent---of 11
trace_event_raw_event_ext4__es_shrink_enter---of 10
trace_event_raw_event_ext4__fallocate_mode---of 10
trace_event_raw_event_ext4__map_blocks_enter---of 10
trace_event_raw_event_ext4__map_blocks_exit---of 10
trace_event_raw_event_ext4__mb_new_pa---of 10
trace_event_raw_event_ext4__mballoc---of 12
trace_event_raw_event_ext4__page_op---of 10
trace_event_raw_event_ext4__trim---of 10
trace_event_raw_event_ext4__truncate---of 10
trace_event_raw_event_ext4__write_begin---of 10
trace_event_raw_event_ext4__write_end---of 10
trace_event_raw_event_ext4_alloc_da_blocks---of 10
trace_event_raw_event_ext4_allocate_blocks---of 10
trace_event_raw_event_ext4_allocate_inode---of 10
trace_event_raw_event_ext4_begin_ordered_truncate---of 10
trace_event_raw_event_ext4_collapse_range---of 10
trace_event_raw_event_ext4_da_release_space---of 10
trace_event_raw_event_ext4_da_reserve_space---of 10
trace_event_raw_event_ext4_da_update_reserve_space---of 10
trace_event_raw_event_ext4_da_write_pages---of 10
trace_event_raw_event_ext4_da_write_pages_extent---of 10
trace_event_raw_event_ext4_direct_IO_enter---of 10
trace_event_raw_event_ext4_direct_IO_exit---of 10
trace_event_raw_event_ext4_discard_blocks---of 10
trace_event_raw_event_ext4_discard_preallocations---of 10
trace_event_raw_event_ext4_drop_inode---of 10
trace_event_raw_event_ext4_error---of 10
trace_event_raw_event_ext4_es_find_extent_range_enter---of 10
trace_event_raw_event_ext4_es_find_extent_range_exit---of 11
trace_event_raw_event_ext4_es_insert_delayed_block---of 11
trace_event_raw_event_ext4_es_lookup_extent_enter---of 10
trace_event_raw_event_ext4_es_lookup_extent_exit---of 11
trace_event_raw_event_ext4_es_remove_extent---of 10
trace_event_raw_event_ext4_es_shrink---of 10
trace_event_raw_event_ext4_es_shrink_scan_exit---of 10
trace_event_raw_event_ext4_evict_inode---of 10
trace_event_raw_event_ext4_ext_convert_to_initialized_enter---of 13
trace_event_raw_event_ext4_ext_convert_to_initialized_fastpath---of 16
trace_event_raw_event_ext4_ext_handle_unwritten_extents---of 10
trace_event_raw_event_ext4_ext_in_cache---of 10
trace_event_raw_event_ext4_ext_load_extent---of 10
trace_event_raw_event_ext4_ext_put_in_cache---of 10
trace_event_raw_event_ext4_ext_remove_space---of 10
trace_event_raw_event_ext4_ext_remove_space_done---of 10
trace_event_raw_event_ext4_ext_rm_idx---of 10
trace_event_raw_event_ext4_ext_rm_leaf---of 13
trace_event_raw_event_ext4_ext_show_extent---of 10
trace_event_raw_event_ext4_fallocate_exit---of 10
trace_event_raw_event_ext4_fc_commit_start---of 10
trace_event_raw_event_ext4_fc_commit_stop---of 10
trace_event_raw_event_ext4_fc_replay---of 10
trace_event_raw_event_ext4_fc_replay_scan---of 10
trace_event_raw_event_ext4_fc_stats---of 12
trace_event_raw_event_ext4_fc_track_create---of 10
trace_event_raw_event_ext4_fc_track_inode---of 10
trace_event_raw_event_ext4_fc_track_link---of 10
trace_event_raw_event_ext4_fc_track_range---of 10
trace_event_raw_event_ext4_fc_track_unlink---of 10
trace_event_raw_event_ext4_find_delalloc_range---of 10
trace_event_raw_event_ext4_forget---of 10
trace_event_raw_event_ext4_free_blocks---of 10
trace_event_raw_event_ext4_free_inode---of 10
trace_event_raw_event_ext4_fsmap_class---of 10
trace_event_raw_event_ext4_get_implied_cluster_alloc_exit---of 10
trace_event_raw_event_ext4_get_reserved_cluster_alloc---of 10
trace_event_raw_event_ext4_getfsmap_class---of 10
trace_event_raw_event_ext4_insert_range---of 10
trace_event_raw_event_ext4_invalidatepage_op---of 10
trace_event_raw_event_ext4_journal_start---of 10
trace_event_raw_event_ext4_journal_start_reserved---of 10
trace_event_raw_event_ext4_lazy_itable_init---of 10
trace_event_raw_event_ext4_load_inode---of 10
trace_event_raw_event_ext4_mark_inode_dirty---of 10
trace_event_raw_event_ext4_mb_discard_preallocations---of 10
trace_event_raw_event_ext4_mb_release_group_pa---of 10
trace_event_raw_event_ext4_mb_release_inode_pa---of 10
trace_event_raw_event_ext4_mballoc_alloc---of 10
trace_event_raw_event_ext4_mballoc_prealloc---of 10
trace_event_raw_event_ext4_nfs_commit_metadata---of 10
trace_event_raw_event_ext4_other_inode_update_time---of 10
trace_event_raw_event_ext4_prefetch_bitmaps---of 10
trace_event_raw_event_ext4_read_block_bitmap_load---of 10
trace_event_raw_event_ext4_remove_blocks---of 13
trace_event_raw_event_ext4_request_blocks---of 10
trace_event_raw_event_ext4_request_inode---of 10
trace_event_raw_event_ext4_shutdown---of 10
trace_event_raw_event_ext4_sync_file_enter---of 10
trace_event_raw_event_ext4_sync_file_exit---of 10
trace_event_raw_event_ext4_sync_fs---of 10
trace_event_raw_event_ext4_unlink_enter---of 10
trace_event_raw_event_ext4_unlink_exit---of 10
trace_event_raw_event_ext4_writepages---of 10
trace_event_raw_event_ext4_writepages_result---of 10
trace_raw_output_ext4__bitmap_load---of 4
trace_raw_output_ext4__es_extent---of 4
trace_raw_output_ext4__es_shrink_enter---of 4
trace_raw_output_ext4__fallocate_mode---of 4
trace_raw_output_ext4__map_blocks_enter---of 4
trace_raw_output_ext4__map_blocks_exit---of 4
trace_raw_output_ext4__mb_new_pa---of 4
trace_raw_output_ext4__mballoc---of 4
trace_raw_output_ext4__page_op---of 4
trace_raw_output_ext4__trim---of 4
trace_raw_output_ext4__truncate---of 4
trace_raw_output_ext4__write_begin---of 4
trace_raw_output_ext4__write_end---of 4
trace_raw_output_ext4_alloc_da_blocks---of 4
trace_raw_output_ext4_allocate_blocks---of 4
trace_raw_output_ext4_allocate_inode---of 4
trace_raw_output_ext4_begin_ordered_truncate---of 4
trace_raw_output_ext4_collapse_range---of 4
trace_raw_output_ext4_da_release_space---of 4
trace_raw_output_ext4_da_reserve_space---of 4
trace_raw_output_ext4_da_update_reserve_space---of 4
trace_raw_output_ext4_da_write_pages---of 4
trace_raw_output_ext4_da_write_pages_extent---of 4
trace_raw_output_ext4_direct_IO_enter---of 4
trace_raw_output_ext4_direct_IO_exit---of 4
trace_raw_output_ext4_discard_blocks---of 4
trace_raw_output_ext4_discard_preallocations---of 4
trace_raw_output_ext4_drop_inode---of 4
trace_raw_output_ext4_error---of 4
trace_raw_output_ext4_es_find_extent_range_enter---of 4
trace_raw_output_ext4_es_find_extent_range_exit---of 4
trace_raw_output_ext4_es_insert_delayed_block---of 4
trace_raw_output_ext4_es_lookup_extent_enter---of 4
trace_raw_output_ext4_es_lookup_extent_exit---of 8
trace_raw_output_ext4_es_remove_extent---of 4
trace_raw_output_ext4_es_shrink---of 4
trace_raw_output_ext4_es_shrink_scan_exit---of 4
trace_raw_output_ext4_evict_inode---of 4
trace_raw_output_ext4_ext_convert_to_initialized_enter---of 4
trace_raw_output_ext4_ext_convert_to_initialized_fastpath---of 4
trace_raw_output_ext4_ext_handle_unwritten_extents---of 4
trace_raw_output_ext4_ext_in_cache---of 4
trace_raw_output_ext4_ext_load_extent---of 4
trace_raw_output_ext4_ext_put_in_cache---of 4
trace_raw_output_ext4_ext_remove_space---of 4
trace_raw_output_ext4_ext_remove_space_done---of 4
trace_raw_output_ext4_ext_rm_idx---of 4
trace_raw_output_ext4_ext_rm_leaf---of 4
trace_raw_output_ext4_ext_show_extent---of 4
trace_raw_output_ext4_fallocate_exit---of 4
trace_raw_output_ext4_fc_commit_start---of 5
trace_raw_output_ext4_fc_commit_stop---of 4
trace_raw_output_ext4_fc_replay---of 4
trace_raw_output_ext4_fc_replay_scan---of 4
trace_raw_output_ext4_fc_stats---of 4
trace_raw_output_ext4_fc_track_create---of 4
trace_raw_output_ext4_fc_track_inode---of 4
trace_raw_output_ext4_fc_track_link---of 4
trace_raw_output_ext4_fc_track_range---of 4
trace_raw_output_ext4_fc_track_unlink---of 4
trace_raw_output_ext4_find_delalloc_range---of 4
trace_raw_output_ext4_forget---of 4
trace_raw_output_ext4_free_blocks---of 4
trace_raw_output_ext4_free_inode---of 4
trace_raw_output_ext4_fsmap_class---of 4
trace_raw_output_ext4_get_implied_cluster_alloc_exit---of 4
trace_raw_output_ext4_get_reserved_cluster_alloc---of 4
trace_raw_output_ext4_getfsmap_class---of 4
trace_raw_output_ext4_insert_range---of 4
trace_raw_output_ext4_invalidatepage_op---of 4
trace_raw_output_ext4_journal_start---of 4
trace_raw_output_ext4_journal_start_reserved---of 4
trace_raw_output_ext4_lazy_itable_init---of 4
trace_raw_output_ext4_load_inode---of 4
trace_raw_output_ext4_mark_inode_dirty---of 4
trace_raw_output_ext4_mb_discard_preallocations---of 4
trace_raw_output_ext4_mb_release_group_pa---of 4
trace_raw_output_ext4_mb_release_inode_pa---of 4
trace_raw_output_ext4_mballoc_alloc---of 6
trace_raw_output_ext4_mballoc_prealloc---of 4
trace_raw_output_ext4_nfs_commit_metadata---of 4
trace_raw_output_ext4_other_inode_update_time---of 4
trace_raw_output_ext4_prefetch_bitmaps---of 4
trace_raw_output_ext4_read_block_bitmap_load---of 4
trace_raw_output_ext4_remove_blocks---of 4
trace_raw_output_ext4_request_blocks---of 4
trace_raw_output_ext4_request_inode---of 4
trace_raw_output_ext4_shutdown---of 4
trace_raw_output_ext4_sync_file_enter---of 4
trace_raw_output_ext4_sync_file_exit---of 4
trace_raw_output_ext4_sync_fs---of 4
trace_raw_output_ext4_unlink_enter---of 4
trace_raw_output_ext4_unlink_exit---of 4
trace_raw_output_ext4_writepages---of 4
trace_raw_output_ext4_writepages_result---of 4

__mod_timer47%of 58
__next_timer_interrupt---of 10
__round_jiffies---of 6
__round_jiffies_relative---of 6
__round_jiffies_up---of 3
__round_jiffies_up_relative---of 3
__run_timers.part.0---of 35
__traceiter_hrtimer_cancel---of 4
__traceiter_hrtimer_expire_entry---of 4
__traceiter_hrtimer_expire_exit---of 4
__traceiter_hrtimer_init---of 4
__traceiter_hrtimer_start---of 4
__traceiter_itimer_expire---of 4
__traceiter_itimer_state---of 4
__traceiter_tick_stop---of 4
__traceiter_timer_cancel---of 4
__traceiter_timer_expire_entry---of 4
__traceiter_timer_expire_exit---of 4
__traceiter_timer_init---of 4
__traceiter_timer_start---of 4
add_timer67%of 3
add_timer_on---of 15
calc_wheel_index38%of 24
call_timer_fn---of 25
del_timer100%of 3
detach_if_pending74%of 19
enqueue_timer42%of 17
get_next_timer_interrupt---of 20
init_timer_key50%of 12
lock_timer_base63%of 8
mod_timer---of 1
mod_timer_pending---of 1
msleep---of 4
msleep_interruptible---of 6
perf_trace_hrtimer_class---of 6
perf_trace_hrtimer_expire_entry---of 6
perf_trace_hrtimer_init---of 6
perf_trace_hrtimer_start---of 6
perf_trace_itimer_expire---of 8
perf_trace_itimer_state---of 6
perf_trace_tick_stop---of 6
perf_trace_timer_class---of 6
perf_trace_timer_expire_entry---of 6
perf_trace_timer_start---of 6
process_timeout---of 1
round_jiffies---of 6
round_jiffies_relative---of 6
round_jiffies_up100%of 3
round_jiffies_up_relative---of 3
run_local_timers---of 3
run_timer_softirq---of 4
schedule_timeout54%of 15
schedule_timeout_idle---of 1
schedule_timeout_interruptible---of 1
schedule_timeout_killable---of 1
schedule_timeout_uninterruptible---of 1
timer_clear_idle---of 1
timer_delete_sync70%of 13
timer_migration_handler---of 7
timer_reduce100%of 1
timer_update_keys---of 5
timers_dead_cpu---of 27
timers_prepare_cpu---of 1
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 10
trace_event_raw_event_hrtimer_expire_entry---of 10
trace_event_raw_event_hrtimer_init---of 10
trace_event_raw_event_hrtimer_start---of 10
trace_event_raw_event_itimer_expire---of 12
trace_event_raw_event_itimer_state---of 10
trace_event_raw_event_tick_stop---of 10
trace_event_raw_event_timer_class---of 10
trace_event_raw_event_timer_expire_entry---of 10
trace_event_raw_event_timer_start---of 10
trace_raw_output_hrtimer_class---of 5
trace_raw_output_hrtimer_expire_entry---of 4
trace_raw_output_hrtimer_init---of 4
trace_raw_output_hrtimer_start---of 4
trace_raw_output_itimer_expire---of 4
trace_raw_output_itimer_state---of 4
trace_raw_output_tick_stop---of 4
trace_raw_output_timer_class---of 5
trace_raw_output_timer_expire_entry---of 4
trace_raw_output_timer_start---of 4
try_to_del_timer_sync---of 3
update_process_times---of 6
usleep_range---of 3

__fs_parse24%of 26
fs_lookup_param---of 13
fs_param_is_blob---of 3
fs_param_is_blockdev---of 1
fs_param_is_bool---of 8
fs_param_is_enum---of 8
fs_param_is_fd---of 6
fs_param_is_path---of 1
fs_param_is_s32---of 4
fs_param_is_string---of 4
fs_param_is_u32---of 4
fs_param_is_u64---of 4
lookup_constant67%of 6

__do_sys_io_uring_enter16%of 95
__do_sys_io_uring_register---of 214
__ia32_sys_io_uring_enter---of 1
__ia32_sys_io_uring_register---of 1
__ia32_sys_io_uring_setup---of 1
__io_arm_poll_handler32%of 19
__io_compat_recvmsg_copy_hdr---of 13
__io_cqring_overflow_flush---of 25
__io_file_supports_nowait36%of 17
__io_free_req---of 10
__io_openat_prep---of 12
__io_poll_add---of 11
__io_poll_execute---of 11
__io_prep_linked_timeout---of 8
__io_queue_proc28%of 11
__io_queue_sqe43%of 42
__io_recvmsg_copy_hdr---of 7
__io_req_find_next---of 10
__io_sqe_files_unregister---of 5
__io_sqe_files_update---of 27
__io_uring_add_tctx_node---of 15
__io_uring_cancel---of 1
__io_uring_free---of 7
__traceiter_io_uring_complete---of 4
__traceiter_io_uring_cqring_wait---of 4
__traceiter_io_uring_create---of 4
__traceiter_io_uring_defer---of 4
__traceiter_io_uring_fail_link---of 4
__traceiter_io_uring_file_get---of 4
__traceiter_io_uring_link---of 4
__traceiter_io_uring_poll_arm---of 4
__traceiter_io_uring_poll_wake---of 4
__traceiter_io_uring_queue_async_work---of 4
__traceiter_io_uring_register---of 4
__traceiter_io_uring_submit_sqe---of 4
__traceiter_io_uring_task_add---of 4
__traceiter_io_uring_task_run---of 4
__x64_sys_io_uring_enter100%of 1
__x64_sys_io_uring_register---of 1
__x64_sys_io_uring_setup---of 1
get_order---of 1
io_apoll_task_func---of 10
io_async_buf_func---of 4
io_async_queue_proc100%of 1
io_buffer_select.part.0---of 15
io_buffer_unmap---of 10
io_cancel_cb---of 4
io_cancel_ctx_cb---of 1
io_cancel_task_cb---of 1
io_clean_op48%of 23
io_complete_rw---of 12
io_complete_rw_iopoll---of 7
io_connect---of 22
io_copy_iov---of 6
io_cqring_ev_posted29%of 14
io_cqring_event_overflow---of 6
io_cqring_overflow_flush---of 7
io_disarm_next---of 42
io_dismantle_req78%of 9
io_do_iopoll---of 58
io_drain_req---of 36
io_fallback_req_func---of 22
io_file_get_normal50%of 14
io_fill_cqe_aux---of 14
io_fill_cqe_req.constprop.0.isra.0---of 13
io_fixed_file_set---of 7
io_flush_timeouts---of 7
io_free_req_work---of 4
io_import_iovec11%of 59
io_install_fixed_file.isra.0---of 27
io_iopoll_try_reap_events.part.0---of 7
io_is_uring_fops---of 1
io_issue_sqe2%of 418
io_kill_timeout.part.0---of 7
io_kill_timeouts---of 21
io_link_timeout_fn---of 7
io_match_task_safe---of 13
io_mem_free.part.0---of 9
io_openat2---of 35
io_poll_check_events---of 26
io_poll_find.isra.0---of 8
io_poll_get_ownership_slowpath---of 5
io_poll_queue_proc---of 1
io_poll_remove_all---of 17
io_poll_remove_entries58%of 21
io_poll_task_func---of 9
io_poll_wake---of 15
io_prep_async_link50%of 6
io_prep_async_work74%of 15
io_prep_rw35%of 64
io_put_sq_data---of 13
io_queue_async_work50%of 20
io_queue_deferred---of 7
io_queue_linked_timeout---of 13
io_queue_rsrc_removal.isra.0---of 3
io_read---of 55
io_recv---of 42
io_recvmsg---of 50
io_register_rsrc---of 9
io_register_rsrc_update---of 33
io_req_complete_post44%of 41
io_req_end_write---of 11
io_req_free_batch---of 20
io_req_io_end---of 13
io_req_prep_async---of 26
io_req_prep_async.cold---of 1
io_req_rw_complete---of 21
io_req_task_cancel67%of 3
io_req_task_link_timeout---of 13
io_req_task_submit---of 5
io_req_task_timeout---of 1
io_req_task_work_add65%of 14
io_ring_ctx_ref_free---of 1
io_ring_ctx_wait_and_kill---of 11
io_ring_ctx_wait_and_kill.cold---of 1
io_ring_exit_work---of 76
io_rsrc_buf_put---of 1
io_rsrc_data_alloc---of 17
io_rsrc_data_free---of 5
io_rsrc_file_put---of 1
io_rsrc_node_ref_zero---of 7
io_rsrc_node_switch---of 8
io_rsrc_node_switch_start.part.0---of 7
io_rsrc_put_work---of 17
io_rsrc_ref_quiesce.part.0.constprop.0---of 8
io_run_task_work40%of 10
io_run_task_work_sig---of 7
io_rw_should_reissue---of 10
io_send---of 30
io_sendmsg---of 34
io_setup_async_msg---of 10
io_setup_async_rw19%of 16
io_sq_thread---of 57
io_sq_thread_finish---of 4
io_sq_thread_park---of 4
io_sq_thread_unpark---of 5
io_sqd_handle_event---of 9
io_sqe_buffer_register---of 63
io_sqe_buffers_register---of 28
io_sqe_files_register---of 24
io_submit_flush_completions---of 42
io_submit_sqes17%of 362
io_submit_sqes.cold---of 1
io_tctx_exit_cb---of 4
io_timeout_extract---of 6
io_timeout_fn---of 1
io_timeout_prep---of 24
io_try_cancel_userdata---of 26
io_uring_alloc_task_context---of 12
io_uring_cancel_generic---of 8
io_uring_cancel_generic.cold---of 27
io_uring_del_tctx_node---of 9
io_uring_drop_tctx_refs---of 6
io_uring_mmap---of 19
io_uring_poll---of 9
io_uring_release---of 1
io_uring_setup---of 120
io_uring_show_fdinfo---of 38
io_uring_try_cancel_requests---of 41
io_wake_function---of 5
io_wq_free_work37%of 11
io_wq_submit_work30%of 17
io_write14%of 65
kiocb_done---of 45
loop_rw_iter---of 18
percpu_ref_put_many53%of 17
percpu_ref_tryget_many48%of 19
perf_trace_io_uring_complete---of 6
perf_trace_io_uring_cqring_wait---of 6
perf_trace_io_uring_create---of 6
perf_trace_io_uring_defer---of 6
perf_trace_io_uring_fail_link---of 6
perf_trace_io_uring_file_get---of 6
perf_trace_io_uring_link---of 6
perf_trace_io_uring_poll_arm---of 6
perf_trace_io_uring_poll_wake---of 6
perf_trace_io_uring_queue_async_work---of 6
perf_trace_io_uring_register---of 6
perf_trace_io_uring_submit_sqe---of 6
perf_trace_io_uring_task_add---of 6
perf_trace_io_uring_task_run---of 6
tctx_task_work56%of 47
tctx_task_work.cold---of 1
trace_event_raw_event_io_uring_complete---of 10
trace_event_raw_event_io_uring_cqring_wait---of 10
trace_event_raw_event_io_uring_create---of 10
trace_event_raw_event_io_uring_defer---of 10
trace_event_raw_event_io_uring_fail_link---of 10
trace_event_raw_event_io_uring_file_get---of 10
trace_event_raw_event_io_uring_link---of 10
trace_event_raw_event_io_uring_poll_arm---of 10
trace_event_raw_event_io_uring_poll_wake---of 10
trace_event_raw_event_io_uring_queue_async_work---of 10
trace_event_raw_event_io_uring_register---of 10
trace_event_raw_event_io_uring_submit_sqe---of 10
trace_event_raw_event_io_uring_task_add---of 10
trace_event_raw_event_io_uring_task_run---of 10
trace_raw_output_io_uring_complete---of 4
trace_raw_output_io_uring_cqring_wait---of 4
trace_raw_output_io_uring_create---of 4
trace_raw_output_io_uring_defer---of 4
trace_raw_output_io_uring_fail_link---of 4
trace_raw_output_io_uring_file_get---of 4
trace_raw_output_io_uring_link---of 4
trace_raw_output_io_uring_poll_arm---of 4
trace_raw_output_io_uring_poll_wake---of 4
trace_raw_output_io_uring_queue_async_work---of 5
trace_raw_output_io_uring_register---of 4
trace_raw_output_io_uring_submit_sqe---of 4
trace_raw_output_io_uring_task_add---of 4
trace_raw_output_io_uring_task_run---of 4

__part_end_io_acct---of 9
__part_start_io_acct---of 11
__traceiter_block_bio_backmerge---of 4
__traceiter_block_bio_bounce---of 4
__traceiter_block_bio_complete---of 4
__traceiter_block_bio_frontmerge---of 4
__traceiter_block_bio_queue---of 4
__traceiter_block_bio_remap---of 4
__traceiter_block_dirty_buffer---of 4
__traceiter_block_getrq---of 4
__traceiter_block_plug---of 4
__traceiter_block_rq_complete---of 4
__traceiter_block_rq_insert---of 4
__traceiter_block_rq_issue---of 4
__traceiter_block_rq_merge---of 4
__traceiter_block_rq_remap---of 4
__traceiter_block_rq_requeue---of 4
__traceiter_block_sleeprq---of 4
__traceiter_block_split---of 4
__traceiter_block_touch_buffer---of 4
__traceiter_block_unplug---of 4
bio_cur_bytes---of 6
blk_account_io_done---of 16
blk_account_io_start100%of 3
blk_alloc_queue---of 13
blk_check_plugged---of 10
blk_cleanup_queue---of 7
blk_clear_pm_only---of 4
blk_dump_rq_flags---of 5
blk_finish_plug100%of 2
blk_flush_plug_list45%of 9
blk_get_queue---of 4
blk_get_request---of 8
blk_insert_cloned_request---of 17
blk_insert_cloned_request.cold---of 2
blk_io_schedule67%of 3
blk_lld_busy---of 5
blk_op_str---of 4
blk_put_queue---of 1
blk_put_request---of 1
blk_queue_enter29%of 71
blk_queue_exit---of 1
blk_queue_flag_clear---of 1
blk_queue_flag_set---of 1
blk_queue_flag_test_and_set---of 1
blk_queue_usage_counter_release---of 1
blk_rq_err_bytes---of 10
blk_rq_init---of 1
blk_rq_prep_clone---of 16
blk_rq_timed_out_timer---of 1
blk_rq_unprep_clone---of 2
blk_set_pm_only---of 1
blk_set_queue_dying---of 3
blk_start_plug100%of 2
blk_status_to_errno---of 5
blk_steal_bios---of 6
blk_sync_queue---of 1
blk_update_request---of 61
blk_update_request.cold---of 8
disk_end_io_acct---of 1
disk_start_io_acct---of 1
errno_to_blk_status---of 5
handle_bad_sector---of 3
kblockd_mod_delayed_work_on100%of 1
kblockd_schedule_work---of 1
part_end_io_acct---of 2
part_start_io_acct---of 1
percpu_ref_put_many.constprop.0---of 17
perf_trace_block_bio_bounce---of 6
perf_trace_block_bio_complete---of 9
perf_trace_block_bio_merge---of 6
perf_trace_block_bio_queue---of 6
perf_trace_block_bio_remap---of 6
perf_trace_block_buffer---of 6
perf_trace_block_get_rq---of 9
perf_trace_block_plug---of 6
perf_trace_block_rq---of 12
perf_trace_block_rq_complete---of 8
perf_trace_block_rq_remap---of 8
perf_trace_block_rq_requeue---of 12
perf_trace_block_split---of 6
perf_trace_block_unplug---of 6
should_fail_bio50%of 4
submit_bio74%of 19
submit_bio_checks33%of 92
submit_bio_noacct12%of 78
trace_event_raw_event_block_bio_bounce---of 10
trace_event_raw_event_block_bio_complete---of 13
trace_event_raw_event_block_bio_merge---of 10
trace_event_raw_event_block_bio_queue---of 10
trace_event_raw_event_block_bio_remap---of 10
trace_event_raw_event_block_buffer---of 10
trace_event_raw_event_block_get_rq---of 13
trace_event_raw_event_block_plug---of 10
trace_event_raw_event_block_rq---of 16
trace_event_raw_event_block_rq_complete---of 12
trace_event_raw_event_block_rq_remap---of 12
trace_event_raw_event_block_rq_requeue---of 16
trace_event_raw_event_block_split---of 10
trace_event_raw_event_block_unplug---of 10
trace_raw_output_block_bio_bounce---of 4
trace_raw_output_block_bio_complete---of 4
trace_raw_output_block_bio_merge---of 4
trace_raw_output_block_bio_queue---of 4
trace_raw_output_block_bio_remap---of 4
trace_raw_output_block_buffer---of 4
trace_raw_output_block_get_rq---of 4
trace_raw_output_block_plug---of 5
trace_raw_output_block_rq---of 4
trace_raw_output_block_rq_complete---of 4
trace_raw_output_block_rq_remap---of 4
trace_raw_output_block_rq_requeue---of 4
trace_raw_output_block_split---of 4
trace_raw_output_block_unplug---of 5
update_io_ticks56%of 9

bad_file_open---of 1
bad_inode_atomic_open---of 1
bad_inode_create---of 1
bad_inode_fiemap---of 1
bad_inode_get_acl---of 1
bad_inode_get_link---of 1
bad_inode_getattr---of 1
bad_inode_link---of 1
bad_inode_listxattr---of 1
bad_inode_lookup---of 1
bad_inode_mkdir---of 1
bad_inode_mknod---of 1
bad_inode_permission---of 1
bad_inode_readlink---of 1
bad_inode_rename2---of 1
bad_inode_rmdir---of 1
bad_inode_set_acl---of 1
bad_inode_setattr---of 1
bad_inode_symlink---of 1
bad_inode_tmpfile---of 1
bad_inode_unlink---of 1
bad_inode_update_time---of 1
iget_failed---of 1
is_bad_inode100%of 1
make_bad_inode---of 4

__do_sys_memfd_create69%of 22
__do_sys_memfd_create.cold---of 1
__ia32_sys_memfd_create---of 1
__x64_sys_memfd_create100%of 1
memfd_fcntl---of 83
memfd_file_seals_ptr---of 10

cat_destroy---of 1
cat_index---of 5
cat_read---of 6
cat_write---of 4
class_index---of 4
class_read---of 26
class_read.cold---of 1
class_write---of 23
cls_destroy---of 15
common_destroy---of 3
common_index---of 4
common_read---of 10
common_write---of 4
context_read_and_validate---of 5
context_read_and_validate.cold---of 3
context_write.constprop.0---of 3
filename_write_helper---of 10
filename_write_helper_compat---of 17
filenametr_cmp---of 4
filenametr_destroy---of 3
filenametr_hash---of 4
get_order---of 1
mls_read_level---of 3
mls_read_level.cold---of 2
mls_read_range_helper---of 9
mls_read_range_helper.cold---of 6
mls_write_range_helper---of 10
next_entry---of 3
ocontext_destroy.part.0---of 4
perm_destroy---of 1
perm_read.constprop.0---of 6
perm_write---of 4
policydb_class_isvalid---of 4
policydb_context_isvalid100%of 13
policydb_destroy---of 19
policydb_filenametr_search---of 12
policydb_load_isids---of 7
policydb_load_isids.cold---of 3
policydb_rangetr_search78%of 9
policydb_read---of 319
policydb_read.cold---of 16
policydb_role_isvalid---of 4
policydb_roletr_search34%of 9
policydb_type_isvalid---of 4
policydb_write---of 102
policydb_write.cold---of 1
range_tr_destroy---of 1
range_write_helper---of 6
rangetr_cmp---of 4
rangetr_hash---of 1
read_cons_helper---of 33
role_bounds_sanity_check---of 21
role_bounds_sanity_check.cold---of 2
role_destroy---of 3
role_index---of 5
role_read---of 15
role_read.cold---of 1
role_tr_destroy---of 1
role_trans_cmp---of 4
role_trans_hash---of 1
role_trans_write_one---of 5
role_write---of 7
sens_destroy---of 5
sens_index---of 5
sens_read---of 10
sens_write---of 7
str_read---of 6
string_to_av_perm---of 8
string_to_security_class---of 4
type_bounds_sanity_check---of 6
type_bounds_sanity_check.cold---of 2
type_destroy---of 1
type_index---of 6
type_read---of 14
type_write---of 9
user_bounds_sanity_check---of 21
user_bounds_sanity_check.cold---of 2
user_destroy---of 3
user_index---of 5
user_read---of 13
user_write---of 11
write_cons_helper---of 18

__cgroup_procs_start---of 12
__cgroup_task_count---of 6
__traceiter_cgroup_attach_task---of 4
__traceiter_cgroup_destroy_root---of 4
__traceiter_cgroup_freeze---of 4
__traceiter_cgroup_mkdir---of 4
__traceiter_cgroup_notify_frozen---of 4
__traceiter_cgroup_notify_populated---of 4
__traceiter_cgroup_release---of 4
__traceiter_cgroup_remount---of 4
__traceiter_cgroup_rename---of 4
__traceiter_cgroup_rmdir---of 4
__traceiter_cgroup_setup_root---of 4
__traceiter_cgroup_transfer_tasks---of 4
__traceiter_cgroup_unfreeze---of 4
allocate_cgrp_cset_links---of 6
apply_cgroup_root_flags.part.0---of 11
cgroup2_parse_param---of 6
cgroup_add_cftypes---of 8
cgroup_add_dfl_cftypes---of 4
cgroup_add_legacy_cftypes---of 4
cgroup_addrm_files---of 44
cgroup_addrm_files.cold---of 2
cgroup_apply_cftypes---of 14
cgroup_apply_control_disable---of 23
cgroup_apply_control_enable---of 38
cgroup_apply_control_enable.cold---of 3
cgroup_attach_task---of 39
cgroup_can_fork20%of 106
cgroup_cancel_fork---of 4
cgroup_control---of 9
cgroup_controllers_show---of 4
cgroup_css.part.0.isra.0---of 7
cgroup_css_set_put_fork25%of 33
cgroup_destroy_locked---of 27
cgroup_do_get_tree---of 26
cgroup_e_css---of 12
cgroup_events_show---of 4
cgroup_exit---of 51
cgroup_exit_cftypes---of 4
cgroup_exit_root_id---of 4
cgroup_file_name---of 8
cgroup_file_notify100%of 6
cgroup_file_notify_timer---of 1
cgroup_file_open---of 17
cgroup_file_poll---of 5
cgroup_file_release---of 9
cgroup_file_write---of 33
cgroup_fork100%of 1
cgroup_free---of 11
cgroup_free_root---of 2
cgroup_freeze_show---of 4
cgroup_freeze_write---of 5
cgroup_fs_context_free---of 7
cgroup_get_e_css---of 56
cgroup_get_from_fd---of 23
cgroup_get_from_path---of 5
cgroup_get_live---of 19
cgroup_get_tree---of 4
cgroup_idr_alloc.constprop.0---of 7
cgroup_init_cftypes---of 11
cgroup_init_fs_context---of 15
cgroup_is_thread_root---of 6
cgroup_is_threaded---of 1
cgroup_is_valid_domain.part.0---of 10
cgroup_kill_sb---of 23
cgroup_kn_lock_live---of 29
cgroup_kn_set_ugid---of 3
cgroup_kn_unlock---of 21
cgroup_lock_and_drain_offline---of 33
cgroup_max_depth_show---of 8
cgroup_max_depth_write---of 7
cgroup_max_descendants_show---of 8
cgroup_max_descendants_write---of 7
cgroup_may_write---of 6
cgroup_migrate---of 15
cgroup_migrate_add_src---of 22
cgroup_migrate_add_task---of 20
cgroup_migrate_execute---of 45
cgroup_migrate_finish---of 8
cgroup_migrate_prepare_dst---of 23
cgroup_migrate_vet_dst---of 10
cgroup_migrate_vet_dst.part.0---of 9
cgroup_mkdir---of 56
cgroup_on_dfl---of 1
cgroup_parse_float---of 16
cgroup_path_from_kernfs_id---of 2
cgroup_path_ns---of 1
cgroup_path_ns_locked---of 1
cgroup_post_fork40%of 43
cgroup_print_ss_mask---of 7
cgroup_procs_next---of 3
cgroup_procs_release---of 2
cgroup_procs_show---of 1
cgroup_procs_start---of 7
cgroup_procs_write---of 22
cgroup_procs_write_finish---of 11
cgroup_procs_write_permission---of 17
cgroup_procs_write_start---of 39
cgroup_propagate_control---of 19
cgroup_reconfigure---of 4
cgroup_release---of 12
cgroup_restore_control---of 7
cgroup_rm_cftypes---of 1
cgroup_rm_cftypes_locked---of 7
cgroup_rmdir---of 16
cgroup_root_from_kf---of 1
cgroup_save_control---of 7
cgroup_seqfile_next---of 1
cgroup_seqfile_show---of 10
cgroup_seqfile_start---of 1
cgroup_seqfile_stop---of 2
cgroup_setup_root---of 42
cgroup_show_options---of 8
cgroup_show_path---of 28
cgroup_ssid_enabled---of 1
cgroup_stat_show---of 4
cgroup_subtree_control_show---of 4
cgroup_subtree_control_write---of 55
cgroup_task_count---of 1
cgroup_taskset_first---of 1
cgroup_taskset_next---of 10
cgroup_threads_start---of 1
cgroup_threads_write---of 24
cgroup_type_show---of 15
cgroup_type_write---of 30
cgroup_update_dfl_csses---of 23
cgroup_update_populated---of 23
cpu_stat_show---of 62
cpuset_init_fs_context---of 5
cset_cgroup_from_root---of 14
css_clear_dir---of 6
css_free_rwork_fn---of 81
css_from_id---of 3
css_has_online_children---of 16
css_killed_ref_fn---of 2
css_killed_work_fn---of 29
css_next_child---of 16
css_next_descendant_post---of 15
css_next_descendant_pre---of 12
css_populate_dir---of 16
css_release---of 1
css_release_work_fn---of 24
css_rightmost_descendant---of 10
css_set_move_task56%of 20
css_set_populated58%of 7
css_set_skip_task_iters---of 8
css_set_update_populated---of 6
css_task_iter_advance---of 25
css_task_iter_advance_css_set---of 42
css_task_iter_end---of 11
css_task_iter_next---of 17
css_task_iter_start---of 4
css_tryget_online_from_dir---of 48
css_visible.isra.0---of 13
delegate_show---of 4
features_show---of 1
find_css_set---of 84
init_and_link_css---of 33
init_cgroup_housekeeping---of 3
init_cgroup_root---of 6
kill_css---of 22
link_css_set---of 6
of_css---of 5
online_css---of 8
perf_trace_cgroup---of 7
perf_trace_cgroup_event---of 7
perf_trace_cgroup_migrate---of 8
perf_trace_cgroup_root---of 7
proc_cgroup_show---of 35
put_css_set_locked---of 57
rebind_subsystems---of 52
rebind_subsystems.cold---of 1
show_delegatable_files---of 9
task_cgroup_from_root---of 10
task_cgroup_path---of 17
task_css_set.part.0---of 1
trace_event_raw_event_cgroup---of 11
trace_event_raw_event_cgroup_event---of 11
trace_event_raw_event_cgroup_migrate---of 12
trace_event_raw_event_cgroup_root---of 11
trace_raw_output_cgroup---of 4
trace_raw_output_cgroup_event---of 4
trace_raw_output_cgroup_migrate---of 4
trace_raw_output_cgroup_root---of 4

__rhashtable_lookup.constprop.0---of 28
__xfrm_decode_session---of 6
__xfrm_dst_lookup---of 10
__xfrm_policy_bysel_ctx.constprop.0---of 13
__xfrm_policy_check---of 139
__xfrm_policy_inexact_flush---of 5
__xfrm_policy_inexact_prune_bin---of 87
__xfrm_policy_link---of 6
__xfrm_policy_unlink---of 13
__xfrm_route_forward---of 37
__xfrm_sk_clone_policy---of 27
decode_session4---of 64
decode_session6---of 65
dst_discard---of 1
jhash---of 16
policy_hash_bysel38%of 43
rt6_get_cookie---of 30
xdst_queue_output---of 47
xfrm_audit_common_policyinfo---of 10
xfrm_audit_policy_add17%of 6
xfrm_audit_policy_delete---of 6
xfrm_bundle_create---of 135
xfrm_confirm_neigh---of 11
xfrm_default_advmss---of 6
xfrm_dst_check---of 45
xfrm_dst_ifdown---of 9
xfrm_gen_index---of 9
xfrm_hash_rebuild---of 61
xfrm_hash_resize---of 77
xfrm_if_register_cb---of 1
xfrm_if_unregister_cb---of 1
xfrm_lookup---of 1
xfrm_lookup_route---of 13
xfrm_lookup_with_ifid---of 138
xfrm_lookup_with_ifid.cold---of 1
xfrm_mtu---of 6
xfrm_negative_advice---of 2
xfrm_neigh_lookup---of 13
xfrm_net_exit---of 1
xfrm_net_init---of 15
xfrm_pol_bin_cmp---of 6
xfrm_pol_bin_key---of 1
xfrm_pol_bin_obj---of 1
xfrm_pol_inexact_addr_use_any_list---of 14
xfrm_policy_addr_delta---of 9
xfrm_policy_alloc80%of 5
xfrm_policy_byid---of 21
xfrm_policy_bysel_ctx---of 39
xfrm_policy_delete---of 3
xfrm_policy_destroy---of 6
xfrm_policy_destroy_rcu---of 1
xfrm_policy_find_inexact_candidates---of 7
xfrm_policy_fini---of 12
xfrm_policy_flush---of 10
xfrm_policy_get_afinfo---of 22
xfrm_policy_hash_rebuild---of 1
xfrm_policy_inexact_alloc_bin---of 102
xfrm_policy_inexact_alloc_chain.isra.0---of 23
xfrm_policy_inexact_gc_tree---of 8
xfrm_policy_inexact_insert---of 39
xfrm_policy_inexact_insert_node.constprop.0---of 47
xfrm_policy_inexact_list_reinsert---of 30
xfrm_policy_insert19%of 27
xfrm_policy_insert_list55%of 24
xfrm_policy_kill---of 17
xfrm_policy_lookup_bytype.constprop.0---of 106
xfrm_policy_lookup_inexact_addr---of 15
xfrm_policy_queue_process---of 101
xfrm_policy_register_afinfo---of 23
xfrm_policy_requeue---of 21
xfrm_policy_timer---of 38
xfrm_policy_unregister_afinfo---of 5
xfrm_policy_walk---of 18
xfrm_policy_walk_done---of 2
xfrm_policy_walk_init---of 1
xfrm_resolve_and_create_bundle---of 6
xfrm_selector_match---of 49
xfrm_sk_policy_insert---of 13
xfrm_sk_policy_lookup---of 34
xfrm_spd_getinfo---of 1
xfrm_tmpl_resolve---of 15
xfrm_tmpl_resolve_one---of 45

name_to_int100%of 7

__do_compat_sys_wait4---of 5
__do_compat_sys_waitid---of 20
__do_sys_wait4---of 5
__do_sys_waitid---of 20
__ia32_compat_sys_wait4---of 1
__ia32_compat_sys_waitid---of 1
__ia32_sys_exit---of 1
__ia32_sys_exit_group---of 1
__ia32_sys_wait4---of 1
__ia32_sys_waitid---of 1
__ia32_sys_waitpid---of 1
__wake_up_parent---of 1
__x64_sys_exit---of 1
__x64_sys_exit_group---of 1
__x64_sys_wait4---of 1
__x64_sys_waitid---of 1
__x64_sys_waitpid---of 1
abort---of 1
child_wait_callback---of 9
complete_and_exit---of 3
delayed_put_task_struct---of 14
do_exit---of 131
do_exit.cold---of 10
do_group_exit---of 12
do_wait---of 30
find_alive_thread---of 5
is_current_pgrp_orphaned---of 1
kernel_wait---of 4
kernel_wait4---of 12
kernel_waitid---of 21
kill_orphaned_pgrp---of 12
make_task_dead---of 3
make_task_dead.cold---of 1
mm_update_next_owner---of 39
oops_count_show---of 1
put_task_struct_rcu_user40%of 5
rcuwait_wake_up50%of 20
release_task---of 53
test_bit---of 1
thread_group_exited---of 16
wait_consider_task---of 172
will_become_orphaned_pgrp---of 12

_copy_from_user72%of 14
_copy_to_user---of 12
check_zeroed_user---of 21

___bpf_prog_run3%of 223
___bpf_prog_run.cold---of 1
__bpf_call_base---of 1
__bpf_free_used_maps---of 4
__bpf_prog_free---of 3
__bpf_prog_ret1---of 1
__bpf_prog_run128---of 1
__bpf_prog_run160---of 1
__bpf_prog_run192---of 1
__bpf_prog_run224---of 1
__bpf_prog_run256---of 1
__bpf_prog_run288---of 1
__bpf_prog_run32100%of 1
__bpf_prog_run320---of 1
__bpf_prog_run352---of 1
__bpf_prog_run384---of 1
__bpf_prog_run416---of 1
__bpf_prog_run448---of 1
__bpf_prog_run480---of 1
__bpf_prog_run512---of 1
__bpf_prog_run64---of 1
__bpf_prog_run96---of 1
__bpf_prog_run_args128---of 1
__bpf_prog_run_args160---of 1
__bpf_prog_run_args192---of 1
__bpf_prog_run_args224---of 1
__bpf_prog_run_args256---of 1
__bpf_prog_run_args288---of 1
__bpf_prog_run_args32---of 1
__bpf_prog_run_args320---of 1
__bpf_prog_run_args352---of 1
__bpf_prog_run_args384---of 1
__bpf_prog_run_args416---of 1
__bpf_prog_run_args448---of 1
__bpf_prog_run_args480---of 1
__bpf_prog_run_args512---of 1
__bpf_prog_run_args64---of 1
__bpf_prog_run_args96---of 1
__traceiter_mem_connect---of 4
__traceiter_mem_disconnect---of 4
__traceiter_mem_return_failed---of 4
__traceiter_xdp_bulk_tx---of 4
__traceiter_xdp_cpumap_enqueue---of 4
__traceiter_xdp_cpumap_kthread---of 4
__traceiter_xdp_devmap_xmit---of 4
__traceiter_xdp_exception---of 4
__traceiter_xdp_redirect---of 4
__traceiter_xdp_redirect_err---of 4
__traceiter_xdp_redirect_map---of 4
__traceiter_xdp_redirect_map_err---of 4
bpf_adj_branches---of 31
bpf_arch_text_poke---of 1
bpf_event_output---of 1
bpf_get_raw_cpu_id---of 1
bpf_get_trace_printk_proto---of 1
bpf_int_jit_compile---of 1
bpf_internal_load_pointer_neg_helper---of 10
bpf_jit_needs_zext---of 1
bpf_opcode_in_insntable---of 1
bpf_patch_call_args---of 1
bpf_patch_insn_single---of 16
bpf_probe_read_kernel---of 1
bpf_prog_alloc---of 6
bpf_prog_alloc_jited_linfo---of 4
bpf_prog_alloc_no_stats---of 7
bpf_prog_array_alloc---of 4
bpf_prog_array_compatible---of 7
bpf_prog_array_copy---of 29
bpf_prog_array_copy_info---of 14
bpf_prog_array_copy_to_user---of 11
bpf_prog_array_delete_safe---of 4
bpf_prog_array_delete_safe_at---of 10
bpf_prog_array_free---of 3
bpf_prog_array_is_empty---of 5
bpf_prog_array_length---of 6
bpf_prog_array_update_at---of 8
bpf_prog_calc_tag---of 21
bpf_prog_fill_jited_linfo---of 4
bpf_prog_free---of 1
bpf_prog_free_deferred---of 12
bpf_prog_free_jited_linfo---of 1
bpf_prog_free_linfo---of 1
bpf_prog_free_unused_jited_linfo---of 3
bpf_prog_kallsyms_del_all---of 2
bpf_prog_realloc---of 4
bpf_prog_select_runtime---of 27
bpf_remove_insns---of 3
bpf_user_rnd_init_once---of 6
bpf_user_rnd_u32---of 1
get_order---of 1
perf_trace_mem_connect---of 6
perf_trace_mem_disconnect---of 6
perf_trace_mem_return_failed---of 6
perf_trace_xdp_bulk_tx---of 6
perf_trace_xdp_cpumap_enqueue---of 6
perf_trace_xdp_cpumap_kthread---of 6
perf_trace_xdp_devmap_xmit---of 6
perf_trace_xdp_exception---of 6
perf_trace_xdp_redirect_template---of 12
trace_event_raw_event_mem_connect---of 10
trace_event_raw_event_mem_disconnect---of 10
trace_event_raw_event_mem_return_failed---of 10
trace_event_raw_event_xdp_bulk_tx---of 10
trace_event_raw_event_xdp_cpumap_enqueue---of 10
trace_event_raw_event_xdp_cpumap_kthread---of 10
trace_event_raw_event_xdp_devmap_xmit---of 10
trace_event_raw_event_xdp_exception---of 10
trace_event_raw_event_xdp_redirect_template---of 16
trace_raw_output_mem_connect---of 4
trace_raw_output_mem_disconnect---of 4
trace_raw_output_mem_return_failed---of 4
trace_raw_output_xdp_bulk_tx---of 4
trace_raw_output_xdp_cpumap_enqueue---of 4
trace_raw_output_xdp_cpumap_kthread---of 4
trace_raw_output_xdp_devmap_xmit---of 4
trace_raw_output_xdp_exception---of 4
trace_raw_output_xdp_redirect_template---of 4

__jbd2_fc_end_commit---of 6
__jbd2_journal_force_commit---of 11
__jbd2_log_start_commit50%of 10
__jbd2_update_log_tail---of 16
__order_base_2.part.0---of 1
__traceiter_jbd2_checkpoint---of 4
__traceiter_jbd2_checkpoint_stats---of 4
__traceiter_jbd2_commit_flushing---of 4
__traceiter_jbd2_commit_locking---of 4
__traceiter_jbd2_commit_logging---of 4
__traceiter_jbd2_drop_transaction---of 4
__traceiter_jbd2_end_commit---of 4
__traceiter_jbd2_handle_extend---of 4
__traceiter_jbd2_handle_restart---of 4
__traceiter_jbd2_handle_start---of 4
__traceiter_jbd2_handle_stats---of 4
__traceiter_jbd2_lock_buffer_stall---of 4
__traceiter_jbd2_run_stats---of 4
__traceiter_jbd2_start_commit---of 4
__traceiter_jbd2_submit_inode_data---of 4
__traceiter_jbd2_update_log_tail---of 4
__traceiter_jbd2_write_superblock---of 4
commit_timeout---of 1
get_order100%of 1
get_slab---of 7
jbd2_alloc63%of 8
jbd2_complete_transaction82%of 11
jbd2_descriptor_block_csum_set---of 12
jbd2_fc_begin_commit---of 8
jbd2_fc_end_commit---of 1
jbd2_fc_end_commit_fallback---of 3
jbd2_fc_get_buf---of 5
jbd2_fc_release_bufs---of 4
jbd2_fc_wait_bufs---of 7
jbd2_free---of 3
jbd2_journal_abort---of 5
jbd2_journal_abort.cold---of 3
jbd2_journal_ack_err---of 3
jbd2_journal_add_journal_head58%of 28
jbd2_journal_add_journal_head.cold---of 1
jbd2_journal_blocks_per_page---of 1
jbd2_journal_bmap---of 6
jbd2_journal_bmap.cold---of 1
jbd2_journal_check_available_features---of 6
jbd2_journal_check_used_features---of 9
jbd2_journal_clear_err---of 3
jbd2_journal_clear_features---of 14
jbd2_journal_destroy---of 32
jbd2_journal_destroy_caches---of 2
jbd2_journal_errno---of 3
jbd2_journal_flush---of 25
jbd2_journal_force_commit---of 3
jbd2_journal_force_commit_nested---of 1
jbd2_journal_get_descriptor_buffer---of 8
jbd2_journal_get_log_tail---of 8
jbd2_journal_grab_journal_head60%of 10
jbd2_journal_init_dev---of 4
jbd2_journal_init_inode---of 6
jbd2_journal_init_inode.cold---of 1
jbd2_journal_init_jbd_inode100%of 1
jbd2_journal_load---of 28
jbd2_journal_load.cold---of 7
jbd2_journal_next_log_block---of 5
jbd2_journal_put_journal_head22%of 28
jbd2_journal_put_journal_head.cold---of 2
jbd2_journal_release_jbd_inode---of 8
jbd2_journal_set_features---of 51
jbd2_journal_set_features.cold---of 2
jbd2_journal_start_commit---of 7
jbd2_journal_update_sb_errno---of 5
jbd2_journal_update_sb_log_tail---of 8
jbd2_journal_wipe---of 6
jbd2_journal_wipe.cold---of 3
jbd2_journal_write_metadata_buffer---of 37
jbd2_log_start_commit---of 1
jbd2_log_wait_commit100%of 15
jbd2_mark_journal_empty---of 11
jbd2_seq_info_next---of 1
jbd2_seq_info_open---of 7
jbd2_seq_info_release---of 1
jbd2_seq_info_show---of 6
jbd2_seq_info_start---of 1
jbd2_superblock_csum---of 5
jbd2_trans_will_send_data_barrier80%of 10
jbd2_transaction_committed100%of 5
jbd2_update_log_tail---of 3
jbd2_write_superblock---of 28
jbd2_write_superblock.cold---of 4
journal_get_superblock---of 46
journal_get_superblock.cold---of 13
journal_init_common---of 10
journal_init_common.cold---of 1
journal_revoke_records_per_block---of 12
journal_tag_bytes34%of 9
kjournald2---of 25
load_superblock.part.0---of 8
perf_trace_jbd2_checkpoint---of 6
perf_trace_jbd2_checkpoint_stats---of 6
perf_trace_jbd2_commit---of 6
perf_trace_jbd2_end_commit---of 6
perf_trace_jbd2_handle_extend---of 6
perf_trace_jbd2_handle_start_class---of 6
perf_trace_jbd2_handle_stats---of 6
perf_trace_jbd2_lock_buffer_stall---of 6
perf_trace_jbd2_run_stats---of 6
perf_trace_jbd2_submit_inode_data---of 6
perf_trace_jbd2_update_log_tail---of 6
perf_trace_jbd2_write_superblock---of 6
trace_event_raw_event_jbd2_checkpoint---of 10
trace_event_raw_event_jbd2_checkpoint_stats---of 10
trace_event_raw_event_jbd2_commit---of 10
trace_event_raw_event_jbd2_end_commit---of 10
trace_event_raw_event_jbd2_handle_extend---of 10
trace_event_raw_event_jbd2_handle_start_class---of 10
trace_event_raw_event_jbd2_handle_stats---of 10
trace_event_raw_event_jbd2_lock_buffer_stall---of 10
trace_event_raw_event_jbd2_run_stats---of 10
trace_event_raw_event_jbd2_submit_inode_data---of 10
trace_event_raw_event_jbd2_update_log_tail---of 10
trace_event_raw_event_jbd2_write_superblock---of 10
trace_raw_output_jbd2_checkpoint---of 4
trace_raw_output_jbd2_checkpoint_stats---of 4
trace_raw_output_jbd2_commit---of 4
trace_raw_output_jbd2_end_commit---of 4
trace_raw_output_jbd2_handle_extend---of 4
trace_raw_output_jbd2_handle_start_class---of 4
trace_raw_output_jbd2_handle_stats---of 4
trace_raw_output_jbd2_lock_buffer_stall---of 4
trace_raw_output_jbd2_run_stats---of 4
trace_raw_output_jbd2_submit_inode_data---of 4
trace_raw_output_jbd2_update_log_tail---of 4
trace_raw_output_jbd2_write_superblock---of 4

__ia32_sys_fgetxattr---of 7
__ia32_sys_flistxattr---of 7
__ia32_sys_fremovexattr---of 10
__ia32_sys_fsetxattr---of 10
__ia32_sys_getxattr---of 1
__ia32_sys_lgetxattr---of 1
__ia32_sys_listxattr---of 1
__ia32_sys_llistxattr---of 1
__ia32_sys_lremovexattr---of 1
__ia32_sys_lsetxattr---of 1
__ia32_sys_removexattr---of 1
__ia32_sys_setxattr---of 1
__vfs_getxattr80%of 5
__vfs_removexattr---of 5
__vfs_removexattr_locked---of 17
__vfs_setxattr---of 6
__vfs_setxattr_locked---of 11
__vfs_setxattr_noperm---of 22
__x64_sys_fgetxattr---of 7
__x64_sys_flistxattr---of 7
__x64_sys_fremovexattr---of 10
__x64_sys_fsetxattr---of 10
__x64_sys_getxattr---of 1
__x64_sys_lgetxattr---of 1
__x64_sys_listxattr---of 1
__x64_sys_llistxattr---of 1
__x64_sys_lremovexattr---of 1
__x64_sys_lsetxattr---of 1
__x64_sys_removexattr---of 1
__x64_sys_setxattr---of 1
copy_overflow---of 1
generic_listxattr---of 21
getxattr---of 16
listxattr---of 11
path_getxattr---of 5
path_listxattr---of 5
path_removexattr---of 7
path_setxattr---of 7
removexattr---of 5
setxattr---of 18
simple_xattr_alloc---of 4
simple_xattr_get29%of 7
simple_xattr_list---of 14
simple_xattr_list_add---of 1
simple_xattr_set---of 20
vfs_getxattr---of 17
vfs_getxattr_alloc---of 12
vfs_listxattr---of 7
vfs_removexattr---of 8
vfs_setxattr---of 8
xattr_full_name67%of 3
xattr_list_one---of 5
xattr_permission---of 16
xattr_resolve_name88%of 16
xattr_supported_namespace---of 9

ext4_discard_allocated_blocks---of 17
ext4_discard_preallocations---of 40
ext4_exit_mballoc---of 2
ext4_free_blocks---of 146
ext4_free_blocks.cold---of 1
ext4_group_add_blocks---of 52
ext4_mb_add_groupinfo---of 70
ext4_mb_alloc_groupinfo---of 25
ext4_mb_check_limits84%of 12
ext4_mb_complex_scan_group74%of 30
ext4_mb_discard_group_preallocations---of 30
ext4_mb_discard_lg_preallocations---of 36
ext4_mb_find_by_goal50%of 40
ext4_mb_free_metadata.isra.0---of 34
ext4_mb_generate_buddy---of 19
ext4_mb_generate_from_pa---of 9
ext4_mb_good_group84%of 18
ext4_mb_init---of 65
ext4_mb_init.cold---of 1
ext4_mb_init_cache---of 85
ext4_mb_init_group---of 34
ext4_mb_initialize_context74%of 19
ext4_mb_load_buddy_gfp22%of 73
ext4_mb_mark_bb---of 60
ext4_mb_mark_diskspace_used51%of 57
ext4_mb_mark_pa_deleted75%of 4
ext4_mb_new_blocks40%of 194
ext4_mb_new_blocks.cold---of 1
ext4_mb_new_group_pa---of 21
ext4_mb_new_inode_pa40%of 30
ext4_mb_normalize_request.constprop.044%of 97
ext4_mb_pa_callback---of 5
ext4_mb_pa_free60%of 5
ext4_mb_prefetch35%of 20
ext4_mb_prefetch_fini---of 14
ext4_mb_regular_allocator43%of 115
ext4_mb_release---of 24
ext4_mb_release_group_pa.isra.0---of 25
ext4_mb_release_inode_pa.isra.0---of 30
ext4_mb_scan_aligned---of 10
ext4_mb_seq_groups_next---of 5
ext4_mb_seq_groups_show---of 16
ext4_mb_seq_groups_start---of 5
ext4_mb_simple_scan_group65%of 14
ext4_mb_try_best_found.isra.0---of 16
ext4_mb_unload_buddy58%of 14
ext4_mb_use_best_found62%of 18
ext4_mb_use_inode_pa56%of 9
ext4_mb_use_preallocated.constprop.065%of 56
ext4_mballoc_query_range---of 38
ext4_process_freed_data---of 71
ext4_seq_mb_stats_show---of 5
ext4_set_bits100%of 7
ext4_trim_fs---of 111
ext4_trim_interrupted---of 5
ext4_try_merge_freed_extent.part.0---of 5
get_order---of 1
mb_clear_bits---of 7
mb_find_buddy75%of 8
mb_find_extent60%of 20
mb_find_order_for_block72%of 7
mb_free_blocks---of 67
mb_mark_used82%of 38
mb_test_and_clear_bits---of 13

selinux_netlbl_cache_invalidate---of 1
selinux_netlbl_err---of 1
selinux_netlbl_inet_conn_request---of 18
selinux_netlbl_inet_csk_clone---of 2
selinux_netlbl_sctp_assoc_request---of 23
selinux_netlbl_sctp_sk_clone---of 1
selinux_netlbl_sk_security_free---of 16
selinux_netlbl_sk_security_reset100%of 1
selinux_netlbl_skbuff_getsid---of 26
selinux_netlbl_skbuff_setsid---of 25
selinux_netlbl_sock_genattr25%of 20
selinux_netlbl_sock_rcv_skb---of 33
selinux_netlbl_socket_connect---of 1
selinux_netlbl_socket_connect_locked---of 7
selinux_netlbl_socket_post_create63%of 8
selinux_netlbl_socket_setsockopt---of 22

__bad_area_nosemaphore11%of 19
__bad_area_nosemaphore.cold---of 1
__traceiter_page_fault_kernel---of 4
__traceiter_page_fault_user---of 4
bad_area---of 1
bad_area_access_error17%of 12
bad_area_nosemaphore---of 1
do_kern_addr_fault---of 9
do_user_addr_fault45%of 88
do_user_addr_fault.cold---of 1
dump_pagetable---of 1
dump_pagetable.cold---of 21
fault_in_kernel_space67%of 6
is_kmmio_active100%of 1
is_prefetch.constprop.0---of 25
kmmio_handler.constprop.0---of 1
no_context.constprop.016%of 19
no_context.constprop.0.cold---of 23
perf_trace_x86_exceptions---of 6
pgtable_bad---of 2
set_signal_archinfo---of 5
show_ldttss---of 9
spurious_kernel_fault---of 42
trace_event_raw_event_x86_exceptions---of 10
trace_page_fault_kernel---of 9
trace_page_fault_user---of 9
trace_raw_output_x86_exceptions---of 4

change_clocksource---of 11
do_adjtimex---of 46
do_settimeofday64---of 18
do_timer---of 1
dummy_clock_read---of 5
get_device_system_crosststamp---of 39
getboottime64---of 1
ktime_get80%of 10
ktime_get_boot_fast_ns---of 1
ktime_get_coarse_real_ts6467%of 6
ktime_get_coarse_ts64---of 7
ktime_get_coarse_with_offset---of 9
ktime_get_fast_timestamps---of 7
ktime_get_mono_fast_ns---of 3
ktime_get_raw---of 8
ktime_get_raw_fast_ns---of 3
ktime_get_raw_ts64---of 11
ktime_get_real_fast_ns---of 3
ktime_get_real_seconds100%of 1
ktime_get_real_ts64---of 13
ktime_get_resolution_ns---of 9
ktime_get_seconds---of 4
ktime_get_snapshot---of 11
ktime_get_ts64---of 13
ktime_get_update_offsets_now---of 12
ktime_get_with_offset70%of 10
ktime_mono_to_any---of 8
pvclock_gtod_register_notifier---of 1
pvclock_gtod_unregister_notifier---of 1
random_get_entropy_fallback---of 5
scale64_check_overflow---of 4
timekeeping_advance---of 56
timekeeping_advance.cold---of 1
timekeeping_forward_now.constprop.0---of 9
timekeeping_inject_offset---of 19
timekeeping_max_deferment---of 7
timekeeping_notify---of 3
timekeeping_resume---of 23
timekeeping_resume.cold---of 1
timekeeping_suspend---of 10
timekeeping_update---of 10
timekeeping_valid_for_hres---of 7
timekeeping_warp_clock---of 3
tk_set_wall_to_mono---of 7
tk_setup_internals.constprop.0---of 7
update_fast_timekeeper---of 1
update_wall_time---of 1
xtime_update---of 6

__invalidate_mapping_pages---of 24
do_invalidatepage---of 3
generic_error_remove_page---of 7
invalidate_inode_page---of 13
invalidate_inode_pages2---of 1
invalidate_inode_pages2_range6%of 56
invalidate_mapping_pages---of 1
invalidate_mapping_pagevec---of 1
pagecache_isize_extended---of 15
truncate_cleanup_page77%of 13
truncate_exceptional_pvec_entries.part.023%of 22
truncate_inode_page80%of 5
truncate_inode_pages100%of 1
truncate_inode_pages_final---of 3
truncate_inode_pages_range37%of 82
truncate_pagecache---of 1
truncate_pagecache_range---of 3
truncate_setsize---of 3

__isofs_iget33%of 74
__isofs_iget.cold---of 6
_isofs_bmap---of 1
init_once---of 1
isofs_alloc_inode100%of 3
isofs_bmap---of 3
isofs_bread---of 3
isofs_dentry_cmp_ms---of 8
isofs_dentry_cmpi---of 3
isofs_dentry_cmpi_ms---of 8
isofs_fill_super33%of 134
isofs_fill_super.cold---of 26
isofs_free_inode---of 1
isofs_get_block---of 3
isofs_get_block.cold---of 1
isofs_get_blocks---of 17
isofs_get_blocks.cold---of 2
isofs_hash_ms---of 5
isofs_hashi---of 5
isofs_hashi_ms---of 9
isofs_iget5_set100%of 1
isofs_iget5_test---of 4
isofs_mount100%of 1
isofs_put_super---of 1
isofs_readahead---of 1
isofs_readpage---of 1
isofs_remount---of 2
isofs_show_options---of 36
isofs_statfs---of 1

dd_bio_merge67%of 3
dd_dispatch_request72%of 28
dd_exit_queue---of 5
dd_has_work100%of 8
dd_init_queue---of 5
dd_insert_requests84%of 18
dd_merged_requests---of 5
dd_request_merge38%of 8
dd_request_merged---of 2
deadline_batching_show---of 1
deadline_dispatch_next---of 1
deadline_dispatch_start---of 1
deadline_dispatch_stop---of 1
deadline_fifo_batch_show---of 1
deadline_fifo_batch_store---of 1
deadline_front_merges_show---of 1
deadline_front_merges_store---of 1
deadline_read_expire_show---of 1
deadline_read_expire_store---of 3
deadline_read_fifo_next---of 1
deadline_read_fifo_start---of 1
deadline_read_fifo_stop---of 1
deadline_read_next_rq_show---of 3
deadline_remove_request63%of 8
deadline_starved_show---of 1
deadline_write_expire_show---of 1
deadline_write_expire_store---of 3
deadline_write_fifo_next---of 1
deadline_write_fifo_start---of 1
deadline_write_fifo_stop---of 1
deadline_write_next_rq_show---of 3
deadline_writes_starved_show---of 1
deadline_writes_starved_store---of 1

assoc_array_apply_edit---of 24
assoc_array_cancel_edit---of 4
assoc_array_clear---of 5
assoc_array_delete---of 35
assoc_array_delete_collapse_iterator---of 5
assoc_array_destroy---of 3
assoc_array_destroy_subtree.part.0---of 30
assoc_array_find100%of 7
assoc_array_gc---of 72
assoc_array_insert---of 104
assoc_array_insert_set_object---of 3
assoc_array_iterate---of 3
assoc_array_rcu_cleanup---of 13
assoc_array_subtree_iterate---of 16
assoc_array_walk.isra.039%of 18
get_order---of 1

llist_add_batch100%of 3
llist_del_first---of 4
llist_reverse_order---of 3

dnotify_flush7%of 16
dnotify_free_mark---of 3
dnotify_handle_event---of 11
dnotify_recalc_inode_mask---of 6
fcntl_dirnotify---of 68

__dispose_buffer---of 5
__jbd2_journal_file_buffer68%of 37
__jbd2_journal_file_buffer.cold---of 1
__jbd2_journal_refile_buffer---of 18
__jbd2_journal_temp_unlink_buffer58%of 26
__jbd2_journal_unfile_buffer---of 5
__jbd2_journal_unreserve_handle---of 4
add_transaction_credits28%of 33
do_get_write_access65%of 56
do_get_write_access.cold---of 3
jbd2__journal_restart---of 17
jbd2__journal_start50%of 24
jbd2_buffer_abort_trigger---of 3
jbd2_buffer_frozen_trigger---of 3
jbd2_journal_begin_ordered_truncate---of 5
jbd2_journal_destroy_transaction_cache---of 1
jbd2_journal_dirty_metadata68%of 37
jbd2_journal_dirty_metadata.cold---of 14
jbd2_journal_extend---of 19
jbd2_journal_file_buffer---of 1
jbd2_journal_file_inode69%of 22
jbd2_journal_forget---of 34
jbd2_journal_forget.cold---of 1
jbd2_journal_free_reserved---of 1
jbd2_journal_free_transaction---of 2
jbd2_journal_get_create_access65%of 17
jbd2_journal_get_undo_access---of 17
jbd2_journal_get_write_access88%of 8
jbd2_journal_inode_ranged_wait---of 1
jbd2_journal_inode_ranged_write100%of 1
jbd2_journal_invalidatepage---of 52
jbd2_journal_lock_updates---of 15
jbd2_journal_refile_buffer---of 2
jbd2_journal_restart---of 1
jbd2_journal_set_triggers---of 3
jbd2_journal_start---of 1
jbd2_journal_start_reserved---of 16
jbd2_journal_stop50%of 34
jbd2_journal_try_to_free_buffers---of 22
jbd2_journal_try_to_free_buffers.cold---of 1
jbd2_journal_unfile_buffer---of 1
jbd2_journal_unlock_updates---of 3
jbd2_write_access_granted69%of 22
start_this_handle63%of 51
start_this_handle.cold---of 1
stop_this_handle58%of 14
wait_transaction_locked75%of 4

__cpuset_memory_pressure_bump---of 22
__cpuset_node_allowed10%of 32
alloc_trial_cpuset---of 3
bitmap_fill.constprop.0---of 1
cpumask_weight---of 1
cpuset_attach---of 28
cpuset_bind---of 6
cpuset_can_attach---of 28
cpuset_cancel_attach---of 5
cpuset_change_task_nodemask---of 6
cpuset_common_seq_show---of 8
cpuset_cpus_allowed---of 27
cpuset_cpus_allowed_fallback---of 26
cpuset_css_alloc---of 4
cpuset_css_free---of 1
cpuset_css_offline---of 9
cpuset_css_online---of 39
cpuset_force_rebuild---of 1
cpuset_fork100%of 6
cpuset_hotplug_workfn---of 169
cpuset_hotplug_workfn.cold---of 2
cpuset_lock---of 1
cpuset_mem_spread_node---of 3
cpuset_mems_allowed---of 25
cpuset_mems_allowed_intersects---of 1
cpuset_migrate_mm---of 3
cpuset_migrate_mm_workfn---of 1
cpuset_nodemask_valid_mems_allowed---of 1
cpuset_post_attach---of 1
cpuset_print_current_mems_allowed---of 15
cpuset_print_current_mems_allowed.cold---of 8
cpuset_read_s64---of 3
cpuset_read_u64---of 20
cpuset_slab_spread_node---of 3
cpuset_task_status_allowed---of 2
cpuset_unlock---of 1
cpuset_update_active_cpus---of 1
cpuset_update_task_spread_flag---of 6
cpuset_wait_for_hotplug---of 1
cpuset_write_resmask---of 160
cpuset_write_resmask.cold---of 3
cpuset_write_s64---of 11
cpuset_write_u64---of 13
current_cpuset_is_being_rebound---of 22
dec_dl_tasks_cs---of 10
dl_update_tasks_root_domain---of 7
fmeter_update---of 4
get_order---of 1
inc_dl_tasks_cs---of 10
is_cpuset_subset---of 5
proc_cpuset_show---of 27
rebuild_sched_domains---of 1
rebuild_sched_domains_locked---of 179
rebuild_sched_domains_locked.cold---of 1
sched_partition_show---of 8
sched_partition_write---of 45
update_cpumasks_hier---of 111
update_domain_attr_tree---of 23
update_flag---of 16
update_parent_subparts_cpumask---of 48
update_prstate---of 18
update_sibling_cpumasks---of 74
update_tasks_cpumask---of 7
update_tasks_flags---of 4
update_tasks_nodemask---of 10
validate_change---of 49

ext4_bg_has_super---of 25
ext4_bg_num_gdb---of 9
ext4_claim_free_clusters100%of 3
ext4_count_free_clusters---of 8
ext4_free_clusters_after_init---of 38
ext4_get_group_desc55%of 24
ext4_get_group_info56%of 20
ext4_get_group_no_and_offset100%of 4
ext4_get_group_number75%of 4
ext4_has_free_clusters36%of 14
ext4_inode_to_goal_block59%of 12
ext4_new_meta_blocks72%of 7
ext4_num_base_meta_blocks---of 6
ext4_read_block_bitmap60%of 5
ext4_read_block_bitmap_nowait14%of 86
ext4_should_retry_alloc---of 10
ext4_validate_block_bitmap.part.06%of 35
ext4_wait_block_bitmap23%of 9
num_clusters_in_group---of 6

task_work_add78%of 9
task_work_cancel---of 9
task_work_cancel_func---of 9
task_work_cancel_match---of 9
task_work_run100%of 7

__bforget---of 3
__block_commit_write.constprop.0.isra.0---of 14
__block_write_begin---of 1
__block_write_begin_int---of 93
__block_write_full_page---of 70
__bread_gfp72%of 14
__breadahead---of 6
__breadahead_gfp---of 6
__brelse67%of 3
__find_get_block74%of 53
__find_get_block.cold---of 1
__getblk_gfp67%of 3
__getblk_slow65%of 34
__getblk_slow.cold---of 4
__ia32_sys_bdflush---of 5
__ia32_sys_bdflush.cold---of 1
__lock_buffer---of 2
__set_page_dirty---of 15
__set_page_dirty_buffers---of 17
__sync_dirty_buffer---of 16
__wait_on_buffer---of 2
__x64_sys_bdflush---of 5
__x64_sys_bdflush.cold---of 1
alloc_buffer_head80%of 5
alloc_page_buffers62%of 34
attach_nobh_buffers---of 18
bh_submit_read---of 10
bh_uptodate_or_lock25%of 8
block_commit_write---of 1
block_invalidatepage70%of 23
block_is_partially_uptodate---of 11
block_page_mkwrite---of 14
block_read_full_page---of 45
block_size_bits---of 1
block_truncate_page---of 38
block_write_begin---of 10
block_write_end---of 8
block_write_full_page---of 7
buffer_check_dirty_writeback---of 17
buffer_exit_cpu_dead---of 5
buffer_io_error---of 2
buffer_io_error.cold---of 1
clean_bdev_aliases---of 28
cont_write_begin---of 33
create_empty_buffers---of 25
create_page_buffers---of 9
drop_buffers62%of 18
emergency_thaw_bdev---of 2
emergency_thaw_bdev.cold---of 1
end_bio_bh_io_sync---of 3
end_buffer_async_read---of 26
end_buffer_async_read_io---of 1
end_buffer_async_write---of 17
end_buffer_read_nobh---of 5
end_buffer_read_sync---of 5
end_buffer_write_sync---of 5
free_buffer_head60%of 5
generic_block_bmap---of 1
generic_cont_expand_simple---of 5
generic_write_end---of 20
has_bh_in_lru60%of 5
init_page_buffers74%of 19
inode_has_buffers100%of 1
invalidate_bh_lru100%of 5
invalidate_bh_lrus100%of 1
invalidate_inode_buffers---of 6
ll_rw_block---of 9
mark_buffer_async_write---of 2
mark_buffer_dirty---of 20
mark_buffer_dirty_inode---of 6
mark_buffer_write_io_error---of 26
nobh_truncate_page---of 44
nobh_write_begin---of 55
nobh_write_end---of 21
nobh_writepage---of 8
page_zero_new_buffers---of 23
recalc_bh_state.part.0---of 4
remove_inode_buffers---of 8
set_bh_page---of 3
submit_bh---of 1
submit_bh_wbc.constprop.055%of 22
sync_dirty_buffer---of 1
sync_mapping_buffers---of 38
touch_buffer---of 10
try_to_free_buffers67%of 18
unlock_buffer100%of 1
write_boundary_block---of 5
write_dirty_buffer---of 6

__create_xol_area---of 19
__find_uprobe---of 14
__replace_page---of 64
__update_ref_ctr---of 13
__update_ref_ctr.cold---of 1
__uprobe_register---of 33
__uprobe_register.cold---of 1
__uprobe_unregister---of 12
arch_uprobe_copy_ixol---of 1
arch_uprobe_ignore---of 1
copy_from_page---of 1
copy_to_page---of 1
delayed_uprobe_delete---of 3
dup_xol_work---of 4
dup_xol_work.cold---of 1
filter_chain---of 5
install_breakpoint.isra.0---of 31
is_swbp_insn---of 1
is_trap_insn---of 1
put_uprobe---of 10
register_for_each_vma---of 48
set_orig_insn---of 1
set_swbp---of 1
update_ref_ctr---of 31
update_ref_ctr.cold---of 2
uprobe_apply---of 10
uprobe_clear_state---of 13
uprobe_copy_process7%of 16
uprobe_copy_process.cold---of 2
uprobe_deny_signal---of 9
uprobe_dup_mmap---of 2
uprobe_end_dup_mmap---of 9
uprobe_free_utask---of 6
uprobe_get_swbp_addr---of 1
uprobe_get_trap_addr---of 5
uprobe_mmap---of 56
uprobe_mmap.cold---of 1
uprobe_munmap---of 18
uprobe_notify_resume---of 139
uprobe_notify_resume.cold---of 4
uprobe_post_sstep_notifier---of 6
uprobe_pre_sstep_notifier---of 7
uprobe_register---of 1
uprobe_register_refctr---of 1
uprobe_start_dup_mmap---of 9
uprobe_unregister---of 3
uprobe_warn.constprop.0---of 1
uprobe_write_opcode---of 77
xol_free_insn_slot---of 10

page_counter_cancel50%of 2
page_counter_charge---of 5
page_counter_memparse---of 5
page_counter_set_low---of 2
page_counter_set_max---of 5
page_counter_set_min---of 2
page_counter_try_charge67%of 9
page_counter_uncharge100%of 2
propagate_protected_usage78%of 9

__check_block_validity.constprop.080%of 5
__ext4_block_zero_page_range---of 35
__ext4_expand_extra_isize---of 13
__ext4_get_inode_loc20%of 55
__ext4_get_inode_loc_noinmem---of 3
__ext4_iget---of 190
__ext4_journalled_invalidatepage---of 13
__ext4_mark_inode_dirty31%of 26
_ext4_get_block---of 9
check_igot_inode---of 10
do_journal_get_write_access---of 7
ext4_alloc_da_blocks---of 13
ext4_bmap---of 10
ext4_bread50%of 10
ext4_bread_batch38%of 24
ext4_break_layouts---of 4
ext4_can_truncate---of 4
ext4_change_inode_journal_flag---of 16
ext4_chunk_trans_blocks---of 1
ext4_da_get_block_prep---of 57
ext4_da_release_space---of 14
ext4_da_reserve_space---of 14
ext4_da_update_reserve_space---of 19
ext4_da_write_begin---of 54
ext4_da_write_end---of 37
ext4_dirty_inode100%of 3
ext4_es_is_delayed---of 1
ext4_es_is_delonly---of 3
ext4_es_is_mapped---of 1
ext4_evict_inode---of 108
ext4_expand_extra_isize---of 16
ext4_file_getattr---of 5
ext4_filemap_fault---of 1
ext4_get_block---of 1
ext4_get_block_unwritten---of 1
ext4_get_fc_inode_loc---of 1
ext4_get_inode_loc67%of 3
ext4_get_projid---of 3
ext4_get_reserved_space---of 1
ext4_getattr---of 16
ext4_getblk31%of 26
ext4_inode_attach_jinode67%of 9
ext4_inode_csum59%of 29
ext4_inode_csum_set86%of 7
ext4_inode_is_fast_symlink---of 11
ext4_invalidatepage---of 15
ext4_iomap_begin59%of 29
ext4_iomap_begin_report---of 20
ext4_iomap_end100%of 3
ext4_iomap_overwrite_begin---of 4
ext4_iomap_swap_activate---of 1
ext4_issue_zeroout---of 4
ext4_journalled_invalidatepage---of 2
ext4_journalled_set_page_dirty---of 1
ext4_journalled_write_end---of 79
ext4_journalled_zero_new_buffers---of 17
ext4_map_blocks54%of 79
ext4_map_query_blocks.constprop.0---of 9
ext4_mark_iloc_dirty45%of 135
ext4_meta_trans_blocks88%of 8
ext4_nonda_switch---of 6
ext4_page_mkwrite---of 92
ext4_punch_hole---of 57
ext4_readahead---of 3
ext4_readpage---of 15
ext4_releasepage---of 15
ext4_reserve_inode_write50%of 8
ext4_set_aops50%of 6
ext4_set_inode_flags64%of 25
ext4_set_iomap62%of 21
ext4_set_page_dirty---of 10
ext4_setattr---of 115
ext4_truncate---of 59
ext4_update_bh_state---of 4
ext4_update_disksize_before_punch---of 13
ext4_walk_page_buffers---of 11
ext4_write_begin---of 72
ext4_write_end---of 58
ext4_write_inode---of 18
ext4_write_inode.cold---of 1
ext4_writepage---of 113
ext4_writepage_trans_blocks---of 5
ext4_writepages---of 192
ext4_zero_partial_blocks---of 14
mpage_prepare_extent_to_map---of 43
mpage_process_page_bufs---of 26
mpage_release_unused_pages---of 26
mpage_submit_page---of 7
write_end_fn---of 7

selinux_nlmsg_lookup29%of 21

__ia32_compat_sys_ftruncate---of 1
__ia32_compat_sys_open---of 5
__ia32_compat_sys_openat---of 5
__ia32_compat_sys_truncate---of 3
__ia32_sys_access---of 1
__ia32_sys_chdir---of 8
__ia32_sys_chmod---of 1
__ia32_sys_chown---of 1
__ia32_sys_chroot---of 10
__ia32_sys_close---of 5
__ia32_sys_close_range---of 1
__ia32_sys_creat---of 1
__ia32_sys_faccessat---of 1
__ia32_sys_faccessat2---of 1
__ia32_sys_fallocate---of 5
__ia32_sys_fchdir---of 8
__ia32_sys_fchmod---of 9
__ia32_sys_fchmodat---of 1
__ia32_sys_fchown---of 1
__ia32_sys_fchownat---of 1
__ia32_sys_ftruncate---of 1
__ia32_sys_lchown---of 1
__ia32_sys_open---of 5
__ia32_sys_openat---of 5
__ia32_sys_openat2---of 11
__ia32_sys_truncate---of 3
__ia32_sys_vhangup---of 3
__x64_sys_access---of 1
__x64_sys_chdir---of 8
__x64_sys_chmod---of 1
__x64_sys_chown---of 1
__x64_sys_chroot---of 10
__x64_sys_close80%of 5
__x64_sys_close_range---of 1
__x64_sys_creat---of 1
__x64_sys_faccessat---of 1
__x64_sys_faccessat2---of 1
__x64_sys_fallocate---of 5
__x64_sys_fchdir---of 8
__x64_sys_fchmod---of 9
__x64_sys_fchmodat---of 1
__x64_sys_fchown---of 1
__x64_sys_fchownat---of 1
__x64_sys_ftruncate100%of 1
__x64_sys_lchown---of 1
__x64_sys_open---of 5
__x64_sys_openat80%of 5
__x64_sys_openat2---of 11
__x64_sys_truncate---of 3
build_open_flags64%of 47
build_open_how---of 5
chmod_common---of 10
chown_common---of 16
dentry_create---of 5
dentry_open---of 7
do_dentry_open66%of 50
do_faccessat---of 34
do_fchmodat---of 4
do_fchownat---of 10
do_sys_ftruncate59%of 36
do_sys_open---of 5
do_sys_openat278%of 18
do_sys_truncate---of 3
do_sys_truncate.part.0---of 5
do_truncate88%of 8
file_open_name---of 32
file_open_root---of 32
file_path---of 1
filp_close86%of 7
filp_close.cold---of 1
filp_open---of 4
finish_no_open---of 5
finish_open---of 3
generic_file_open67%of 3
ksys_fallocate---of 4
ksys_fchown---of 9
nonseekable_open---of 1
open_with_fake_path---of 4
stream_open100%of 1
vfs_fallocate---of 55
vfs_fchmod---of 4
vfs_fchown---of 7
vfs_open100%of 1
vfs_truncate---of 23

__loop_clr_fd---of 30
__loop_clr_fd.cold---of 1
__loop_update_dio---of 20
find_free_cb---of 4
lo_compat_ioctl---of 6
lo_complete_rq---of 11
lo_fallocate.isra.0---of 5
lo_ioctl8%of 79
lo_ioctl.cold---of 1
lo_open100%of 5
lo_release29%of 7
lo_rw_aio.isra.0---of 41
lo_rw_aio_complete---of 20
lo_rw_aio_do_completion---of 4
lo_write_bvec---of 25
lo_write_bvec.cold---of 1
loop_add---of 18
loop_attr_do_show_autoclear---of 2
loop_attr_do_show_backing_file---of 8
loop_attr_do_show_dio---of 2
loop_attr_do_show_offset---of 1
loop_attr_do_show_partscan---of 2
loop_attr_do_show_sizelimit---of 1
loop_config_discard---of 12
loop_configure---of 58
loop_control_ioctl---of 17
loop_exit_cb---of 1
loop_get_status.part.0---of 8
loop_get_status_compat---of 7
loop_get_status_old---of 14
loop_info64_from_compat---of 6
loop_info64_to_compat---of 11
loop_init_request---of 1
loop_kthread_worker_fn---of 1
loop_lookup---of 7
loop_probe---of 4
loop_queue_rq84%of 6
loop_queue_work---of 110
loop_queue_work.cold---of 2
loop_register_transfer---of 4
loop_reread_partitions---of 1
loop_reread_partitions.cold---of 1
loop_set_status---of 27
loop_set_status.cold---of 1
loop_set_status_compat---of 3
loop_set_status_from_info---of 20
loop_set_status_old---of 6
loop_unregister_transfer---of 4
loop_validate_file---of 8
transfer_xor---of 4
unregister_transfer_cb---of 6
xor_init---of 2

__xa_alloc---of 15
__xa_alloc_cyclic---of 10
__xa_clear_mark---of 3
__xa_cmpxchg---of 14
__xa_erase---of 5
__xa_insert---of 13
__xa_set_mark---of 3
__xa_store---of 13
__xas_next---of 27
__xas_nomem---of 16
__xas_prev---of 27
xa_clear_mark---of 3
xa_delete_node---of 6
xa_destroy---of 17
xa_erase---of 1
xa_extract---of 74
xa_find---of 21
xa_find_after---of 23
xa_get_mark---of 35
xa_load---of 16
xa_set_mark---of 3
xa_store---of 1
xas_alloc82%of 11
xas_clear_mark31%of 13
xas_create65%of 87
xas_create_range30%of 20
xas_find75%of 40
xas_find_conflict32%of 47
xas_find_marked50%of 70
xas_free_nodes---of 22
xas_get_mark---of 5
xas_init_marks67%of 6
xas_load75%of 12
xas_nomem38%of 8
xas_pause---of 17
xas_set_mark---of 12
xas_start50%of 30
xas_store39%of 112

__netlink_change_ngroups---of 11
__netlink_clear_multicast_users---of 4
__netlink_create---of 6
__netlink_dump_start---of 25
__netlink_kernel_create---of 28
__netlink_ns_capable---of 4
__netlink_seq_next---of 8
__nlmsg_put100%of 1
__rhashtable_lookup.constprop.066%of 29
copy_overflow---of 1
deferred_put_nlk_sk---of 5
get_order---of 1
jhash---of 16
netlink_ack25%of 49
netlink_add_tap---of 20
netlink_attachskb16%of 32
netlink_autobind.isra.052%of 35
netlink_bind---of 43
netlink_broadcast---of 1
netlink_broadcast_filtered---of 64
netlink_capable---of 4
netlink_change_ngroups---of 3
netlink_compare---of 3
netlink_connect---of 13
netlink_create---of 17
netlink_data_ready---of 1
netlink_deliver_tap31%of 53
netlink_detachskb---of 5
netlink_dump---of 41
netlink_getname---of 10
netlink_getsockbyfilp---of 8
netlink_getsockopt---of 16
netlink_has_listeners---of 23
netlink_hash---of 1
netlink_insert46%of 101
netlink_ioctl---of 1
netlink_kernel_release---of 3
netlink_lookup58%of 33
netlink_net_capable100%of 4
netlink_net_exit---of 1
netlink_net_init---of 2
netlink_ns_capable---of 4
netlink_overrun---of 4
netlink_rcv_skb93%of 13
netlink_realloc_groups---of 7
netlink_recvmsg---of 42
netlink_register_notifier---of 1
netlink_release---of 123
netlink_remove_tap---of 23
netlink_remove_tap.cold---of 1
netlink_sendmsg46%of 46
netlink_sendmsg.cold---of 1
netlink_sendskb---of 8
netlink_seq_next---of 1
netlink_seq_show---of 6
netlink_seq_start---of 5
netlink_seq_stop---of 2
netlink_set_err---of 15
netlink_setsockopt---of 47
netlink_skb_destructor38%of 8
netlink_sock_destruct---of 7
netlink_sock_destruct.cold---of 1
netlink_strict_get_check---of 1
netlink_table_grab---of 2
netlink_table_grab.part.0---of 4
netlink_table_ungrab---of 1
netlink_tap_init_net---of 18
netlink_trim45%of 9
netlink_unicast63%of 37
netlink_unregister_notifier---of 1
netlink_update_listeners---of 15
netlink_update_socket_mc---of 4
netlink_update_subscriptions---of 9
nlmsg_notify---of 16

__access_remote_vm---of 25
__apply_to_page_range---of 102
__do_fault---of 15
__get_locked_pte---of 11
__might_fault100%of 4
__p4d_alloc---of 21
__pmd_alloc---of 25
__pte_alloc---of 10
__pte_alloc_kernel---of 9
__pud_alloc---of 18
__vm_insert_mixed---of 9
__vm_map_pages---of 6
access_process_vm---of 3
access_remote_vm---of 1
add_mm_counter_fast19%of 11
alloc_set_pte---of 39
apply_to_existing_page_range---of 1
apply_to_page_range---of 1
clear_huge_page---of 15
clear_subpage---of 1
copy_huge_page_from_user---of 11
copy_page_range---of 237
copy_subpage---of 1
copy_user_huge_page---of 18
count_memcg_event_mm.part.057%of 23
do_page_mkwrite---of 16
do_swap_page---of 106
do_wp_page---of 72
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 6
fault_dirty_shared_page---of 15
finish_fault---of 10
finish_mkwrite_fault---of 16
follow_invalidate_pte---of 51
follow_pfn---of 7
follow_phys---of 11
follow_pte---of 1
free_pgd_range---of 82
free_pgtables---of 14
generic_access_phys---of 7
handle_mm_fault33%of 200
insert_page_into_pte_locked.constprop.0---of 16
insert_pfn---of 34
mm_trace_rss_stat---of 9
pfn_valid---of 36
print_bad_pte---of 18
print_bad_pte.cold---of 10
print_vma_addr---of 6
print_vma_addr.cold---of 4
ptlock_alloc---of 3
ptlock_free---of 1
remap_pfn_range---of 5
remap_pfn_range_internal---of 65
remap_pfn_range_notrack---of 3
sync_mm_rss---of 13
unmap_mapping_page---of 10
unmap_mapping_pages---of 8
unmap_mapping_range---of 1
unmap_page_range---of 146
unmap_single_vma---of 11
unmap_vmas---of 11
validate_page_before_insert---of 8
vm_insert_page---of 16
vm_insert_pages---of 34
vm_iomap_memory---of 6
vm_map_pages---of 1
vm_map_pages_zero---of 1
vm_normal_page---of 13
vmf_insert_mixed---of 1
vmf_insert_mixed_mkwrite---of 1
vmf_insert_mixed_prot---of 1
vmf_insert_pfn---of 1
vmf_insert_pfn_prot---of 15
walk_to_pmd---of 18
wp_page_copy---of 103
zap_page_range---of 13
zap_page_range_single---of 10
zap_vma_ptes---of 4

__inet_bind---of 33
__inet_stream_connect---of 37
inet_accept---of 32
inet_autobind---of 6
inet_bind---of 6
inet_compat_ioctl---of 6
inet_compat_routing_ioctl---of 9
inet_create37%of 57
inet_ctl_sock_create---of 3
inet_current_timestamp---of 1
inet_dgram_connect---of 7
inet_getname---of 10
inet_gro_complete---of 16
inet_gro_receive---of 41
inet_gso_segment---of 46
inet_init_net---of 1
inet_ioctl---of 12
inet_listen---of 14
inet_recv_error---of 5
inet_recvmsg---of 34
inet_register_protosw---of 9
inet_register_protosw.cold---of 2
inet_release---of 6
inet_send_prepare---of 29
inet_sendmsg---of 8
inet_sendpage---of 5
inet_shutdown---of 14
inet_sk_rebuild_header---of 50
inet_sk_rebuild_header.cold---of 1
inet_sk_set_state---of 10
inet_sk_state_store---of 10
inet_sock_destruct---of 23
inet_sock_destruct.cold---of 2
inet_stream_connect---of 1
inet_unregister_protosw---of 2
inet_unregister_protosw.cold---of 1
ipip_gro_complete---of 1
ipip_gro_receive---of 5
ipip_gso_segment---of 3
ipv4_mib_exit_net---of 1
ipv4_mib_init_net---of 17
snmp_fold_field---of 4
snmp_get_cpu_field---of 1

__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
copy_fpregs_to_fpstate60%of 10
copy_init_fpstate_to_fpregs---of 6
copy_kernel_to_fpregs63%of 8
fpregs_assert_state_consistent75%of 4
fpregs_mark_activate---of 10
fpstate_init---of 4
fpu__clear---of 12
fpu__clear_all---of 1
fpu__clear_user_states---of 1
fpu__copy54%of 26
fpu__drop30%of 20
fpu__exception_code---of 9
fpu__prepare_read---of 2
fpu__prepare_write---of 3
fpu__save---of 25
irq_fpu_usable---of 7
kernel_fpu_begin_mask---of 12
kernel_fpu_end---of 3
perf_trace_x86_fpu---of 8
switch_fpu_return54%of 15
trace_event_raw_event_x86_fpu---of 12
trace_raw_output_x86_fpu---of 4

acct_account_cputime---of 3
acct_clear_integrals100%of 1
acct_update_integrals---of 8
bacct_add_tsk---of 36
xacct_add_tsk---of 3

__dma_direct_alloc_pages---of 45
dma_direct_alloc---of 20
dma_direct_alloc_pages---of 9
dma_direct_can_mmap---of 1
dma_direct_free---of 10
dma_direct_free_pages---of 1
dma_direct_get_required_mask---of 7
dma_direct_get_sgtable---of 12
dma_direct_map_resource---of 8
dma_direct_map_resource.cold---of 1
dma_direct_map_sg38%of 27
dma_direct_max_mapping_size---of 10
dma_direct_mmap---of 11
dma_direct_need_sync---of 9
dma_direct_set_offset---of 5
dma_direct_set_offset.cold---of 1
dma_direct_supported---of 11
dma_direct_sync_sg_for_cpu---of 11
dma_direct_sync_sg_for_device---of 11
dma_direct_unmap_sg---of 26
get_order---of 1

__blkdev_driver_ioctl---of 3
blk_ioctl_discard---of 8
blkdev_bszset---of 8
blkdev_common_ioctl3%of 80
blkdev_compat_ptr_ioctl---of 3
blkdev_ioctl25%of 20
blkdev_pr_preempt---of 7
blkpg_do_ioctl---of 17
compat_blkdev_ioctl---of 28

__kobject_del---of 7
dynamic_kobj_release---of 1
get_order---of 1
kobj_attr_show---of 3
kobj_attr_store---of 3
kobj_child_ns_ops---of 6
kobj_ns_current_may_mount---of 4
kobj_ns_drop---of 5
kobj_ns_grab_current---of 4
kobj_ns_initial---of 4
kobj_ns_netlink---of 4
kobj_ns_ops---of 6
kobj_ns_type_register---of 4
kobj_ns_type_registered---of 3
kobject_add---of 6
kobject_add.cold---of 2
kobject_add_internal---of 39
kobject_add_internal.cold---of 4
kobject_create---of 4
kobject_create.cold---of 1
kobject_create_and_add---of 3
kobject_create_and_add.cold---of 1
kobject_del---of 2
kobject_get---of 10
kobject_get_ownership---of 2
kobject_get_path---of 9
kobject_get_unless_zero75%of 12
kobject_init---of 4
kobject_init.cold---of 2
kobject_init_and_add---of 3
kobject_init_and_add.cold---of 1
kobject_move---of 10
kobject_namespace---of 8
kobject_put34%of 15
kobject_rename---of 10
kobject_set_name---of 1
kobject_set_name_vargs---of 9
kset_create_and_add---of 5
kset_find_obj---of 6
kset_get_ownership---of 3
kset_init---of 3
kset_register---of 7
kset_register.cold---of 1
kset_release---of 1
kset_unregister---of 4

__account_locked_vm---of 13
__page_mapcount---of 9
__vcalloc---of 4
__vm_enough_memory31%of 13
__vma_link_list---of 5
__vma_unlink_list---of 5
__vmalloc_array---of 4
account_locked_vm---of 5
get_cmdline---of 13
get_cmdline.cold---of 1
kfree_const---of 3
kmemdup100%of 3
kmemdup_nul80%of 5
kstrdup100%of 4
kstrdup_const60%of 5
kstrndup---of 8
kstrndup.cold---of 1
kvfree67%of 3
kvfree_sensitive---of 4
kvmalloc_node31%of 13
kvrealloc---of 8
memcmp_pages---of 1
memdup_user50%of 8
memdup_user_nul---of 9
mmap_file---of 4
overcommit_kbytes_handler---of 4
overcommit_policy_handler---of 8
overcommit_ratio_handler---of 4
page_anon_vma---of 5
page_mapped29%of 14
page_mapping70%of 13
page_mapping_file---of 7
page_rmapping---of 3
randomize_page---of 7
randomize_stack_top---of 4
strndup_user84%of 6
sync_overcommit_as---of 1
vcalloc---of 4
vm_commit_limit---of 4
vm_memory_committed---of 1
vm_mmap---of 4
vm_mmap_pgoff---of 5
vma_close---of 3
vma_is_stack_for_current---of 3
vmalloc_array---of 4
vmemdup_user---of 9

__do_pipe_flags.part.0---of 10
__ia32_sys_pipe---of 1
__ia32_sys_pipe2---of 1
__x64_sys_pipe---of 1
__x64_sys_pipe2---of 1
account_pipe_buffers---of 1
alloc_pipe_info48%of 23
anon_pipe_buf_release---of 11
anon_pipe_buf_try_steal---of 9
create_pipe_files---of 14
do_pipe2---of 7
do_pipe_flags---of 5
fifo_open---of 34
free_pipe_info---of 7
generic_pipe_buf_get---of 7
generic_pipe_buf_release67%of 6
generic_pipe_buf_try_steal---of 8
get_order---of 1
get_pipe_info---of 2
pipe_double_lock---of 11
pipe_fasync---of 8
pipe_fcntl---of 22
pipe_ioctl---of 6
pipe_is_unprivileged_user---of 4
pipe_lock50%of 2
pipe_poll44%of 30
pipe_read---of 49
pipe_release---of 12
pipe_resize_ring---of 11
pipe_unlock50%of 2
pipe_wait_readable---of 14
pipe_wait_writable---of 14
pipe_write---of 80
pipefs_dname---of 1
pipefs_init_fs_context---of 3
round_pipe_size---of 4
round_pipe_size.part.0---of 1
too_many_pipe_buffers_hard---of 3
too_many_pipe_buffers_soft---of 3
wait_for_partner---of 10

__disable_kprobe---of 16
__disarm_kprobe---of 5
__free_insn_slot---of 27
__get_insn_slot---of 34
__get_valid_kprobe---of 13
__is_insn_slot_addr59%of 17
__kretprobe_trampoline_handler---of 20
__kretprobe_trampoline_handler.cold---of 2
__unregister_kprobe_bottom---of 5
__unregister_kprobe_top---of 26
aggr_fault_handler---of 5
aggr_post_handler---of 5
aggr_pre_handler---of 6
alloc_aggr_kprobe---of 5
arch_check_ftrace_location---of 2
arch_deref_entry_point---of 1
arch_kprobe_get_kallsym---of 1
arch_kprobe_on_func_entry---of 1
arch_within_kprobe_blacklist---of 3
arm_kprobe---of 6
cleanup_rp_inst---of 11
collect_garbage_slots---of 9
collect_one_slot.part.0---of 4
disable_kprobe---of 3
dump_kprobe---of 1
enable_kprobe---of 10
get_kprobe---of 4
get_optimized_kprobe---of 10
init_aggr_kprobe---of 6
is_cfi_preamble_symbol---of 6
kill_kprobe---of 20
kprobe_add_area_blacklist---of 7
kprobe_add_ksym_blacklist---of 5
kprobe_blacklist_open---of 4
kprobe_blacklist_seq_next---of 1
kprobe_blacklist_seq_show---of 5
kprobe_blacklist_seq_start---of 1
kprobe_blacklist_seq_stop---of 1
kprobe_busy_begin---of 1
kprobe_busy_end---of 1
kprobe_cache_get_kallsym---of 18
kprobe_disarmed---of 6
kprobe_exceptions_notify---of 1
kprobe_flush_task43%of 7
kprobe_free_init_mem---of 8
kprobe_get_kallsym---of 4
kprobe_lookup_name---of 1
kprobe_on_func_entry---of 10
kprobe_optimizer---of 36
kprobe_remove_area_blacklist---of 5
kprobe_seq_next---of 2
kprobe_seq_start---of 2
kprobes_inc_nmissed_count---of 4
kprobes_module_callback---of 41
kprobes_open---of 4
kretprobe_hash_lock---of 1
kretprobe_hash_unlock---of 1
kretprobe_table_lock---of 1
kretprobe_table_unlock---of 1
opt_pre_handler---of 5
optimize_all_kprobes---of 9
optimize_kprobe---of 16
optprobe_queued_unopt---of 6
pre_handler_kretprobe---of 13
proc_kprobes_optimization_handler---of 12
read_enabled_file_bool---of 2
recycle_rp_inst---of 8
register_kprobe---of 76
register_kprobes---of 6
register_kretprobe---of 34
register_kretprobes---of 6
report_probe---of 13
show_kprobe_addr---of 7
unoptimize_kprobe---of 15
unregister_kprobe---of 4
unregister_kprobes---of 2
unregister_kprobes.part.0---of 9
unregister_kretprobe---of 1
unregister_kretprobes---of 2
unregister_kretprobes.part.0---of 9
wait_for_kprobe_optimizer---of 5
within_kprobe_blacklist---of 8
within_kprobe_blacklist.part.0---of 9
write_enabled_file_bool---of 25
write_enabled_file_bool.cold---of 10

__get_vma_policy50%of 10
__ia32_compat_sys_get_mempolicy---of 5
__ia32_compat_sys_mbind---of 5
__ia32_compat_sys_migrate_pages---of 14
__ia32_compat_sys_set_mempolicy---of 5
__ia32_sys_get_mempolicy---of 1
__ia32_sys_mbind---of 1
__ia32_sys_migrate_pages---of 1
__ia32_sys_set_mempolicy---of 1
__mpol_dup---of 12
__mpol_equal---of 12
__mpol_put---of 2
__nodes_weight.constprop.0---of 1
__x64_sys_get_mempolicy---of 9
__x64_sys_mbind---of 7
__x64_sys_migrate_pages---of 1
__x64_sys_set_mempolicy---of 7
alloc_page_interleave---of 5
alloc_pages_current73%of 11
alloc_pages_vma45%of 18
copy_nodes_to_user---of 5
do_get_mempolicy---of 57
do_mbind---of 62
do_migrate_pages---of 3
do_migrate_pages.part.0---of 16
do_set_mempolicy---of 11
get_nodes---of 16
get_task_policy---of 7
huge_node---of 16
init_nodemask_of_mempolicy---of 12
kernel_get_mempolicy---of 8
kernel_mbind---of 7
kernel_migrate_pages---of 58
kernel_set_mempolicy---of 7
mempolicy_nodemask_intersects---of 8
mempolicy_slab_node22%of 14
migrate_to_node---of 6
mpol_free_shared_policy17%of 6
mpol_misplaced---of 24
mpol_new---of 16
mpol_new_bind---of 3
mpol_new_interleave---of 3
mpol_new_preferred---of 5
mpol_parse_str---of 56
mpol_put_task_policy---of 2
mpol_rebind_mm---of 8
mpol_rebind_nodemask---of 8
mpol_rebind_preferred---of 9
mpol_rebind_task---of 5
mpol_set_nodemask.part.0---of 13
mpol_set_shared_policy---of 39
mpol_shared_policy_init16%of 13
mpol_shared_policy_lookup34%of 6
mpol_to_str---of 17
new_page---of 13
numa_default_policy---of 1
numa_map_to_online_node---of 6
offset_il_node---of 5
policy_node50%of 8
policy_nodemask23%of 9
queue_pages_hugetlb---of 28
queue_pages_pte_range---of 30
queue_pages_test_walk---of 16
set_bit---of 1
sp_insert---of 10
sp_lookup.isra.0---of 10
vma_dup_policy---of 6
vma_migratable---of 12
vma_policy_mof---of 13

get_order---of 1
mempool_alloc43%of 14
mempool_alloc_pages---of 1
mempool_alloc_slab67%of 3
mempool_create---of 4
mempool_create_node---of 6
mempool_destroy---of 2
mempool_exit---of 10
mempool_free---of 11
mempool_free_pages---of 1
mempool_free_slab---of 1
mempool_init---of 1
mempool_init_node---of 15
mempool_kfree---of 1
mempool_kmalloc---of 1
mempool_resize---of 21
remove_element---of 9

__dquot_alloc_space9%of 62
__dquot_drop---of 14
__dquot_free_space---of 51
__dquot_initialize7%of 47
__dquot_transfer---of 87
__quota_error---of 2
__quota_error.cold---of 1
do_get_dqblk---of 1
do_proc_dqstats---of 4
dqcache_shrink_count---of 1
dqcache_shrink_scan---of 13
dqget---of 34
dqput---of 2
dqput.part.0---of 7
dquot_acquire---of 10
dquot_add_inodes---of 24
dquot_add_space---of 29
dquot_alloc---of 1
dquot_alloc_inode7%of 43
dquot_claim_space_nodirty---of 36
dquot_commit---of 8
dquot_commit_info---of 1
dquot_decr_inodes---of 6
dquot_decr_space---of 6
dquot_destroy---of 1
dquot_disable---of 74
dquot_drop---of 5
dquot_file_open80%of 5
dquot_free_inode---of 34
dquot_get_dqblk---of 6
dquot_get_next_dqblk---of 8
dquot_get_next_id---of 5
dquot_get_state---of 14
dquot_initialize100%of 1
dquot_initialize_needed---of 8
dquot_load_quota_inode---of 4
dquot_load_quota_sb---of 49
dquot_mark_dquot_dirty---of 8
dquot_quota_disable---of 15
dquot_quota_enable---of 14
dquot_quota_off---of 1
dquot_quota_on---of 6
dquot_quota_on_mount---of 8
dquot_quota_sync---of 12
dquot_reclaim_space_nodirty---of 35
dquot_release---of 8
dquot_resume---of 9
dquot_scan_active---of 16
dquot_set_dqblk---of 54
dquot_set_dqinfo---of 16
dquot_transfer---of 19
dquot_writeback_dquots---of 38
info_bdq_free---of 6
info_idq_free---of 7
mark_info_dirty---of 1
prepare_warning---of 4
quota_release_workfn---of 19
register_quota_format---of 1
unregister_quota_format---of 7
vfs_cleanup_quota_inode---of 4
vfs_setup_quota_inode---of 9

chksum_digest---of 1
chksum_final---of 1
chksum_finup---of 1
chksum_init---of 1
chksum_setkey---of 3
chksum_update100%of 1
crc32c_cra_init---of 1

__cgroup_account_cputime100%of 1
__cgroup_account_cputime_field---of 4
cgroup_base_stat_cputime_show---of 4
cgroup_rstat_exit---of 7
cgroup_rstat_flush---of 1
cgroup_rstat_flush_hold---of 1
cgroup_rstat_flush_irqsafe---of 1
cgroup_rstat_flush_locked---of 39
cgroup_rstat_flush_release---of 1
cgroup_rstat_init---of 5
cgroup_rstat_updated34%of 6
root_cgroup_cputime---of 3

audit_ctl_lock---of 1
audit_ctl_unlock---of 1
audit_get_tty---of 8
audit_log---of 5
audit_log_common_recv_msg---of 7
audit_log_config_change---of 6
audit_log_d_path70%of 10
audit_log_d_path_exe75%of 4
audit_log_end45%of 9
audit_log_feature_change.part.0---of 4
audit_log_format75%of 4
audit_log_key---of 3
audit_log_lost---of 8
audit_log_multicast---of 13
audit_log_n_hex---of 18
audit_log_n_string32%of 16
audit_log_n_untrustedstring84%of 6
audit_log_path_denied---of 7
audit_log_session_info---of 1
audit_log_start80%of 5
audit_log_start.part.044%of 30
audit_log_start.part.0.cold---of 1
audit_log_task_context67%of 6
audit_log_task_info---of 2
audit_log_task_info.part.0---of 26
audit_log_untrustedstring100%of 1
audit_log_vformat36%of 25
audit_make_reply---of 10
audit_multicast_bind---of 1
audit_multicast_unbind---of 1
audit_net_exit---of 18
audit_net_init---of 21
audit_panic---of 2
audit_panic.cold---of 2
audit_put_tty---of 1
audit_receive---of 12
audit_receive_msg---of 197
audit_send_list_thread---of 11
audit_send_reply.constprop.0---of 17
audit_send_reply_thread---of 11
audit_serial---of 1
audit_set_enabled---of 9
audit_set_loginuid---of 34
audit_signal_info---of 8
audit_string_contains_control---of 6
auditd_conn_free---of 6
auditd_pid_vnr---of 21
auditd_reset---of 13
auditd_test_task50%of 20
is_audit_feature_set---of 1
kauditd_hold_skb---of 13
kauditd_hold_skb.cold---of 1
kauditd_rehold_skb---of 1
kauditd_retry_skb---of 6
kauditd_retry_skb.cold---of 1
kauditd_send_multicast_skb---of 22
kauditd_send_queue---of 23
kauditd_thread---of 79
net_generic---of 18
skb_queue_head_init---of 1

__do_sys_capget---of 34
__do_sys_capset---of 20
__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 11
capable80%of 5
capable.cold---of 1
capable_wrt_inode_uidgid84%of 6
capable_wrt_inode_uidgid.cold---of 1
file_ns_capable60%of 5
has_capability---of 1
has_capability_noaudit---of 1
has_ns_capability---of 18
has_ns_capability_noaudit---of 18
ns_capable80%of 5
ns_capable.cold---of 1
ns_capable_noaudit---of 5
ns_capable_noaudit.cold---of 1
ns_capable_setid---of 5
ns_capable_setid.cold---of 1
privileged_wrt_inode_uidgid---of 4
ptracer_capable---of 20

__compat_save_altstack---of 4
__copy_siginfo_from_user---of 8
__copy_siginfo_from_user32---of 3
__copy_siginfo_to_user32---of 2
__do_sys_pidfd_send_signal---of 25
__flush_itimer_signals---of 8
__group_send_sig_info---of 1
__ia32_compat_sys_rt_sigaction---of 17
__ia32_compat_sys_rt_sigpending---of 8
__ia32_compat_sys_rt_sigprocmask---of 10
__ia32_compat_sys_rt_sigqueueinfo---of 7
__ia32_compat_sys_rt_sigsuspend---of 4
__ia32_compat_sys_rt_sigtimedwait_time32---of 8
__ia32_compat_sys_rt_sigtimedwait_time64---of 8
__ia32_compat_sys_rt_tgsigqueueinfo---of 4
__ia32_compat_sys_sigaction---of 31
__ia32_compat_sys_sigaltstack---of 1
__ia32_compat_sys_sigpending---of 1
__ia32_sys_kill---of 1
__ia32_sys_pause---of 4
__ia32_sys_pidfd_send_signal---of 1
__ia32_sys_rt_sigaction---of 16
__ia32_sys_rt_sigpending---of 3
__ia32_sys_rt_sigprocmask---of 8
__ia32_sys_rt_sigqueueinfo---of 7
__ia32_sys_rt_sigsuspend---of 4
__ia32_sys_rt_sigtimedwait---of 10
__ia32_sys_rt_sigtimedwait_time32---of 10
__ia32_sys_rt_tgsigqueueinfo---of 4
__ia32_sys_sigaltstack---of 15
__ia32_sys_signal---of 4
__ia32_sys_sigpending---of 2
__ia32_sys_sigprocmask---of 11
__ia32_sys_sigsuspend---of 1
__ia32_sys_ssetmask---of 3
__ia32_sys_tgkill---of 4
__ia32_sys_tkill---of 3
__kill_pgrp_info---of 7
__lock_task_sighand---of 21
__save_altstack---of 4
__send_signal---of 75
__set_current_blocked---of 2
__set_task_blocked---of 4
__sigqueue_alloc---of 32
__sigqueue_alloc.cold---of 1
__sigqueue_free.part.0---of 3
__traceiter_signal_deliver---of 4
__traceiter_signal_generate---of 4
__x64_sys_kill---of 1
__x64_sys_pidfd_send_signal---of 1
__x64_sys_restart_syscall---of 1
__x64_sys_rt_sigaction---of 16
__x64_sys_rt_sigpending---of 3
__x64_sys_rt_sigprocmask---of 8
__x64_sys_rt_sigqueueinfo---of 7
__x64_sys_rt_sigsuspend---of 4
__x64_sys_rt_sigtimedwait---of 10
__x64_sys_rt_sigtimedwait_time32---of 10
__x64_sys_rt_tgsigqueueinfo---of 4
__x64_sys_sgetmask---of 1
__x64_sys_sigaltstack---of 15
__x64_sys_signal---of 4
__x64_sys_sigpending---of 2
__x64_sys_sigprocmask---of 11
__x64_sys_sigsuspend---of 1
__x64_sys_ssetmask---of 3
__x64_sys_tgkill---of 4
__x64_sys_tkill---of 3
calculate_sigpending---of 1
check_kill_permission---of 21
collect_signal---of 12
compat_restore_altstack---of 2
complete_signal---of 33
copy_siginfo_from_user---of 8
copy_siginfo_from_user32---of 3
copy_siginfo_to_external32---of 11
copy_siginfo_to_user---of 3
dequeue_signal---of 19
do_compat_sigaltstack---of 10
do_no_restart_syscall---of 1
do_notify_parent---of 70
do_notify_parent_cldstop---of 58
do_rt_tgsigqueueinfo---of 7
do_send_sig_info---of 3
do_send_specific---of 21
do_sigaction---of 21
do_sigaltstack.constprop.0---of 16
do_signal_stop---of 29
do_sigpending---of 1
do_sigtimedwait---of 19
do_tkill---of 1
exit_signals---of 35
flush_itimer_signals---of 1
flush_signal_handlers---of 5
flush_signals---of 11
flush_sigqueue---of 5
flush_sigqueue_mask---of 6
force_sig---of 1
force_sig_bnderr---of 1
force_sig_fault---of 1
force_sig_fault_to_task---of 1
force_sig_info---of 1
force_sig_info_to_task---of 9
force_sig_mceerr---of 3
force_sig_pkuerr---of 1
force_sig_ptrace_errno_trap---of 1
force_sigsegv---of 3
get_signal---of 138
get_signal.cold---of 1
group_send_sig_info---of 18
ignore_signals---of 2
kernel_sigaction---of 3
kill_pgrp---of 1
kill_pid---of 1
kill_pid_info---of 18
kill_pid_usb_asyncio---of 29
kill_proc_info---of 13
kill_something_info---of 16
known_siginfo_layout---of 12
next_signal---of 4
perf_trace_signal_deliver---of 11
perf_trace_signal_generate---of 11
post_copy_siginfo_from_user32---of 14
prepare_signal---of 44
ptrace_do_notify---of 1
ptrace_notify---of 5
ptrace_stop---of 27
ptrace_trap_notify---of 6
recalc_sigpending75%of 4
recalc_sigpending_and_wake---of 5
recalc_sigpending_tsk84%of 6
restore_altstack---of 10
retarget_shared_pending.isra.0---of 8
send_sig---of 5
send_sig_fault---of 5
send_sig_info---of 5
send_sig_mceerr---of 5
send_signal---of 39
send_sigqueue---of 39
set_compat_user_sigmask---of 6
set_current_blocked---of 2
set_user_sigmask---of 6
siginfo_layout---of 15
signal_setup_done---of 11
signal_wake_up_state---of 2
sigprocmask---of 9
sigqueue_alloc---of 3
sigqueue_free---of 7
sigsuspend---of 6
task_clear_jobctl_pending---of 7
task_clear_jobctl_trapping---of 2
task_join_group_stop50%of 4
task_participate_group_stop---of 14
task_set_jobctl_pending---of 11
trace_event_raw_event_signal_deliver---of 15
trace_event_raw_event_signal_generate---of 15
trace_raw_output_signal_deliver---of 4
trace_raw_output_signal_generate---of 4
unhandled_signal---of 4
zap_other_threads---of 7

I_BDEV100%of 1
__blkdev_direct_IO_simple---of 30
__blkdev_get29%of 92
__blkdev_put30%of 31
__invalidate_device---of 6
__sync_blockdev---of 7
bd_abort_claiming---of 1
bd_acquire29%of 14
bd_clear_claiming67%of 6
bd_forget---of 4
bd_init_fs_context---of 3
bd_link_disk_holder---of 17
bd_may_claim---of 6
bd_prepare_to_claim60%of 10
bd_set_nr_sectors---of 1
bd_unlink_disk_holder---of 8
bdev_alloc_inode---of 3
bdev_disk_changed---of 18
bdev_evict_inode---of 2
bdev_free_inode---of 1
bdev_read_page---of 4
bdev_set---of 1
bdev_test---of 1
bdev_write_page---of 7
bdget---of 4
bdget_part---of 1
bdgrab---of 1
bdput---of 1
blkdev_bio_end_io---of 17
blkdev_bio_end_io_simple---of 3
blkdev_close100%of 1
blkdev_direct_IO---of 54
blkdev_fallocate---of 20
blkdev_fsync---of 3
blkdev_get_block---of 4
blkdev_get_by_dev---of 5
blkdev_get_by_path50%of 8
blkdev_iopoll---of 1
blkdev_open---of 11
blkdev_put19%of 16
blkdev_read_iter---of 5
blkdev_readahead---of 1
blkdev_readpage---of 1
blkdev_releasepage50%of 6
blkdev_write_begin---of 1
blkdev_write_end---of 7
blkdev_write_iter---of 14
blkdev_writepage---of 1
blkdev_writepages---of 1
block_ioctl75%of 4
block_llseek---of 1
check_disk_size_change---of 5
check_disk_size_change.cold---of 2
freeze_bdev---of 14
fsync_bdev---of 6
init_once---of 1
invalidate_bdev---of 2
iterate_bdevs---of 9
lookup_bdev90%of 10
nr_blockdev_pages---of 3
revalidate_disk_size---of 3
sb_min_blocksize89%of 9
sb_set_blocksize100%of 5
set_blocksize100%of 15
set_init_blocksize.isra.0---of 9
sync_blockdev---of 4
thaw_bdev---of 9
truncate_bdev_range---of 7

__do_splice---of 14
__do_sys_vmsplice---of 43
__ia32_sys_splice---of 9
__ia32_sys_tee---of 10
__ia32_sys_vmsplice---of 1
__splice_from_pipe---of 26
__x64_sys_splice---of 9
__x64_sys_tee---of 10
__x64_sys_vmsplice---of 1
add_to_pipe---of 6
direct_splice_actor100%of 3
do_splice---of 92
do_splice_direct67%of 6
do_splice_to84%of 6
do_tee---of 29
generic_file_splice_read64%of 22
generic_splice_sendpage---of 1
get_order---of 1
ipipe_prep.part.0---of 7
iter_file_splice_write65%of 42
opipe_prep.part.0---of 8
page_cache_pipe_buf_confirm---of 15
page_cache_pipe_buf_release---of 7
page_cache_pipe_buf_try_steal---of 16
pipe_to_sendpage---of 6
pipe_to_user---of 2
splice_direct_to_actor52%of 27
splice_from_pipe---of 1
splice_from_pipe_next.part.027%of 15
splice_grow_spd---of 5
splice_shrink_spd---of 2
splice_to_pipe---of 11
user_page_pipe_buf_try_steal---of 3
wait_for_space---of 8

__kernfs_new_node---of 23
__kernfs_remove---of 43
kernfs_activate---of 10
kernfs_active---of 4
kernfs_add_one---of 16
kernfs_break_active_protection---of 1
kernfs_create_dir_ns---of 4
kernfs_create_empty_dir---of 5
kernfs_create_root---of 6
kernfs_destroy_root---of 1
kernfs_dir_fop_release---of 4
kernfs_dir_pos---of 19
kernfs_dop_revalidate---of 13
kernfs_find_and_get_node_by_id---of 10
kernfs_find_and_get_ns---of 3
kernfs_find_ns---of 19
kernfs_fop_readdir---of 20
kernfs_get100%of 2
kernfs_get.part.067%of 3
kernfs_get_active---of 9
kernfs_get_parent---of 3
kernfs_iop_lookup---of 7
kernfs_iop_mkdir---of 6
kernfs_iop_rename---of 11
kernfs_iop_rmdir---of 8
kernfs_link_sibling---of 15
kernfs_name---of 1
kernfs_name_hash---of 5
kernfs_name_locked---of 11
kernfs_new_node---of 11
kernfs_next_descendant_post---of 15
kernfs_node_from_dentry---of 5
kernfs_path_from_node---of 1
kernfs_path_from_node_locked---of 76
kernfs_path_from_node_locked.cold---of 1
kernfs_put---of 3
kernfs_put.part.0---of 18
kernfs_put_active---of 6
kernfs_remove---of 1
kernfs_remove_by_name_ns---of 6
kernfs_remove_self---of 11
kernfs_rename_ns---of 21
kernfs_unbreak_active_protection---of 2
kernfs_walk_and_get_ns---of 13
pr_cont_kernfs_name---of 1
pr_cont_kernfs_path---of 1
pr_cont_kernfs_path.cold---of 5

__traceiter_selinux_audited---of 4
avc_alloc_node---of 34
avc_audit_post_callback---of 30
avc_audit_pre_callback---of 11
avc_compute_av.isra.0---of 28
avc_compute_av.isra.0.cold---of 1
avc_copy_xperms_decision---of 6
avc_denied---of 6
avc_disable---of 2
avc_flush---of 18
avc_get_cache_threshold---of 1
avc_get_hash_stats---of 19
avc_has_extended_perms24%of 64
avc_has_perm72%of 7
avc_has_perm_flags63%of 8
avc_has_perm_noaudit58%of 21
avc_lookup89%of 9
avc_node_delete---of 3
avc_node_free---of 1
avc_node_kill---of 1
avc_node_replace---of 3
avc_policy_seqno100%of 1
avc_set_cache_threshold---of 1
avc_ss_reset---of 7
avc_update_node.part.0.isra.0---of 26
avc_xperms_decision_alloc---of 9
avc_xperms_decision_free---of 7
avc_xperms_free---of 4
avc_xperms_populate.part.0---of 7
perf_trace_selinux_audited---of 9
selinux_avc_init---of 3
slow_avc_audit---of 6
trace_event_get_offsets_selinux_audited.constprop.0---of 4
trace_event_raw_event_selinux_audited---of 13
trace_raw_output_selinux_audited---of 4

arm_timer---of 6
bump_cpu_timer---of 9
check_cpu_itimer---of 16
clear_posix_cputimers_work100%of 1
collect_posix_cputimers---of 9
cpu_clock_sample---of 8
cpu_clock_sample_group---of 17
cpu_timer_fire---of 8
do_cpu_nanosleep---of 29
pid_for_clock---of 14
posix_cpu_clock_get---of 25
posix_cpu_clock_getres---of 19
posix_cpu_clock_set---of 15
posix_cpu_nsleep---of 9
posix_cpu_nsleep_restart---of 1
posix_cpu_timer_create---of 27
posix_cpu_timer_del---of 37
posix_cpu_timer_get---of 21
posix_cpu_timer_rearm---of 19
posix_cpu_timer_set---of 62
posix_cpu_timer_wait_running---of 28
posix_cpu_timers_exit---of 6
posix_cpu_timers_exit_group---of 6
posix_cpu_timers_work---of 37
posix_cpu_timers_work.cold---of 4
posix_cputimers_group_init---of 2
process_cpu_clock_get---of 1
process_cpu_clock_getres---of 16
process_cpu_nsleep---of 6
process_cpu_timer_create---of 1
run_posix_cpu_timers---of 27
set_process_cpu_timer---of 12
thread_cpu_clock_get---of 1
thread_cpu_clock_getres---of 16
thread_cpu_timer_create---of 1
thread_group_sample_cputime---of 3
update_rlimit_cpu---of 3

iomap_apply43%of 45
iomap_iter---of 52

call_blocking_lsm_notifier---of 1
fsnotify_perm.part.043%of 21
get_order---of 1
inode_free_by_rcu---of 1
lsm_append.constprop.0---of 10
lsm_inode_alloc---of 5
register_blocking_lsm_notifier---of 1
security_add_mnt_opt---of 4
security_audit_rule_free---of 2
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bprm_check---of 5
security_bprm_committed_creds---of 2
security_bprm_committing_creds---of 2
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable100%of 4
security_capget---of 4
security_capset---of 4
security_cred_alloc_blank---of 9
security_cred_free---of 5
security_cred_getsecid---of 2
security_d_instantiate100%of 4
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc78%of 9
security_file_fcntl---of 4
security_file_free75%of 4
security_file_ioctl100%of 4
security_file_ioctl_compat---of 4
security_file_lock---of 4
security_file_mprotect---of 5
security_file_open100%of 6
security_file_permission100%of 7
security_file_receive---of 4
security_file_send_sigiotask---of 4
security_file_set_fowner---of 2
security_free_mnt_opts20%of 5
security_fs_context_dup---of 4
security_fs_context_parse_param100%of 5
security_getprocattr---of 6
security_inet_conn_established---of 2
security_inet_conn_request---of 4
security_inet_csk_clone---of 2
security_inode_alloc78%of 9
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 5
security_inode_create100%of 6
security_inode_follow_link100%of 6
security_inode_free100%of 4
security_inode_getattr---of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 2
security_inode_getsecurity---of 6
security_inode_getxattr---of 5
security_inode_init_security74%of 15
security_inode_invalidate_secctx---of 2
security_inode_killpriv---of 4
security_inode_link---of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir---of 6
security_inode_mknod100%of 6
security_inode_need_killpriv100%of 4
security_inode_notifysecctx---of 4
security_inode_permission100%of 5
security_inode_post_setxattr---of 3
security_inode_readlink---of 5
security_inode_removexattr---of 8
security_inode_rename---of 12
security_inode_rmdir---of 5
security_inode_setattr100%of 6
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 8
security_inode_symlink---of 5
security_inode_unlink---of 5
security_ipc_getsecid---of 2
security_ipc_permission---of 5
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 5
security_kernel_module_request---of 5
security_kernel_post_load_data---of 5
security_kernel_post_read_file---of 5
security_kernel_read_file---of 5
security_kernfs_init_security---of 4
security_key_alloc---of 4
security_key_free---of 2
security_key_getsecurity---of 4
security_key_permission100%of 4
security_locked_down---of 4
security_mmap_addr---of 4
security_mmap_file---of 10
security_move_mount---of 4
security_msg_msg_alloc---of 11
security_msg_msg_free---of 3
security_msg_queue_alloc---of 11
security_msg_queue_associate---of 4
security_msg_queue_free---of 3
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send100%of 4
security_old_inode_init_security---of 5
security_path_notify---of 4
security_perf_event_alloc---of 4
security_perf_event_free---of 2
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_prepare_creds---of 9
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl---of 4
security_release_secctx100%of 2
security_req_classify_flow---of 2
security_sb_alloc100%of 4
security_sb_clone_mnt_opts---of 4
security_sb_eat_lsm_opts100%of 4
security_sb_free---of 2
security_sb_kern_mount100%of 4
security_sb_mount100%of 4
security_sb_pivotroot---of 4
security_sb_remount---of 4
security_sb_set_mnt_opts100%of 5
security_sb_show_options---of 4
security_sb_statfs---of 4
security_sb_umount---of 4
security_sctp_assoc_request---of 4
security_sctp_bind_connect---of 4
security_sctp_sk_clone---of 2
security_secctx_to_secid---of 4
security_secid_to_secctx75%of 4
security_secmark_refcount_dec---of 2
security_secmark_refcount_inc---of 2
security_secmark_relabel_packet---of 4
security_sem_alloc---of 11
security_sem_associate---of 4
security_sem_free---of 3
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 6
security_settime64---of 4
security_shm_alloc---of 11
security_shm_associate---of 4
security_shm_free---of 3
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc100%of 4
security_sk_classify_flow---of 3
security_sk_clone---of 2
security_sk_free---of 2
security_sock_graft---of 2
security_sock_rcv_skb100%of 4
security_socket_accept---of 4
security_socket_bind---of 4
security_socket_connect---of 4
security_socket_create100%of 4
security_socket_getpeername---of 4
security_socket_getpeersec_dgram75%of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname---of 4
security_socket_getsockopt---of 4
security_socket_listen---of 4
security_socket_post_create100%of 4
security_socket_recvmsg---of 4
security_socket_sendmsg100%of 4
security_socket_setsockopt---of 4
security_socket_shutdown---of 4
security_socket_socketpair---of 4
security_syslog---of 4
security_task_alloc64%of 11
security_task_fix_setgid---of 4
security_task_fix_setuid---of 4
security_task_free67%of 3
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid100%of 2
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl---of 5
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode100%of 2
security_transfer_creds---of 2
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 2
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_vm_enough_memory_mm100%of 4
unregister_blocking_lsm_notifier---of 1

__inode_security_revalidate78%of 9
audit_inode_permission---of 3
backing_inode_security---of 8
check_nnp_nosuid.isra.0---of 10
cred_has_capability.isra.073%of 11
cred_has_capability.isra.0.cold---of 1
delayed_superblock_init---of 1
file_has_perm80%of 5
file_map_prot_check---of 24
get_order---of 1
has_cap_mac_admin---of 5
inode_doinit_use_xattr---of 13
inode_doinit_use_xattr.cold---of 3
inode_doinit_with_dentry56%of 58
inode_has_perm100%of 5
inode_security88%of 8
ioctl_has_perm.constprop.0.isra.080%of 5
ipc_has_perm---of 1
match_file---of 8
may_context_mount_inode_relabel.isra.0---of 3
may_context_mount_sb_relabel.isra.0---of 4
may_create100%of 5
may_link---of 6
ptrace_parent_sid---of 21
sb_finish_set_opts83%of 23
sb_finish_set_opts.cold---of 4
selinux_add_mnt_opt---of 11
selinux_add_opt---of 19
selinux_add_opt.cold---of 1
selinux_binder_set_context_mgr---of 1
selinux_binder_transaction---of 4
selinux_binder_transfer_binder---of 1
selinux_binder_transfer_file---of 10
selinux_bprm_committed_creds---of 7
selinux_bprm_committing_creds---of 16
selinux_bprm_creds_for_exec---of 17
selinux_capable100%of 1
selinux_capget---of 1
selinux_capset---of 1
selinux_complete_init---of 1
selinux_cred_getsecid---of 1
selinux_cred_prepare---of 1
selinux_cred_transfer---of 1
selinux_d_instantiate100%of 2
selinux_dentry_create_files_as---of 4
selinux_dentry_init_security---of 4
selinux_determine_inode_label75%of 8
selinux_disable---of 3
selinux_disable.cold---of 5
selinux_file_alloc_security100%of 1
selinux_file_fcntl---of 7
selinux_file_ioctl34%of 9
selinux_file_ioctl_compat---of 2
selinux_file_lock---of 4
selinux_file_mprotect---of 24
selinux_file_open89%of 9
selinux_file_permission60%of 22
selinux_file_receive---of 6
selinux_file_send_sigiotask---of 4
selinux_file_set_fowner---of 1
selinux_free_mnt_opts---of 1
selinux_fs_context_dup---of 11
selinux_fs_context_parse_param50%of 4
selinux_getprocattr---of 42
selinux_inet_conn_established---of 4
selinux_inet_conn_request---of 7
selinux_inet_csk_clone---of 1
selinux_inet_sys_rcv_skb---of 5
selinux_inode_alloc_security100%of 3
selinux_inode_copy_up---of 6
selinux_inode_copy_up_xattr---of 2
selinux_inode_create100%of 1
selinux_inode_follow_link78%of 9
selinux_inode_free_security80%of 5
selinux_inode_getattr---of 1
selinux_inode_getsecctx---of 3
selinux_inode_getsecid---of 3
selinux_inode_getsecurity---of 10
selinux_inode_getxattr---of 1
selinux_inode_init_security100%of 16
selinux_inode_invalidate_secctx---of 3
selinux_inode_link---of 1
selinux_inode_listsecurity---of 5
selinux_inode_listxattr---of 1
selinux_inode_mkdir---of 1
selinux_inode_mknod100%of 2
selinux_inode_notifysecctx---of 2
selinux_inode_permission74%of 34
selinux_inode_post_setxattr---of 6
selinux_inode_post_setxattr.cold---of 1
selinux_inode_readlink---of 1
selinux_inode_removexattr---of 5
selinux_inode_rename---of 11
selinux_inode_rmdir---of 1
selinux_inode_setattr78%of 9
selinux_inode_setsecctx---of 1
selinux_inode_setsecurity---of 10
selinux_inode_setxattr---of 24
selinux_inode_symlink---of 1
selinux_inode_unlink---of 1
selinux_ip_forward---of 16
selinux_ip_postroute---of 50
selinux_ip_postroute_compat---of 9
selinux_ipc_getsecid---of 1
selinux_ipc_permission---of 6
selinux_ipv4_forward---of 1
selinux_ipv4_output---of 7
selinux_ipv4_postroute---of 1
selinux_ipv6_forward---of 1
selinux_ipv6_output---of 7
selinux_ipv6_postroute---of 1
selinux_ismaclabel---of 1
selinux_kernel_act_as---of 3
selinux_kernel_create_files_as---of 5
selinux_kernel_load_data---of 4
selinux_kernel_module_request---of 1
selinux_kernel_read_file---of 10
selinux_kernfs_init_security---of 15
selinux_key_alloc---of 5
selinux_key_free---of 1
selinux_key_getsecurity---of 3
selinux_key_permission75%of 4
selinux_lockdown---of 7
selinux_lsm_notifier_avc_callback---of 4
selinux_mmap_addr---of 4
selinux_mmap_file---of 5
selinux_mount75%of 4
selinux_move_mount---of 1
selinux_msg_msg_alloc_security---of 1
selinux_msg_queue_alloc_security---of 1
selinux_msg_queue_associate---of 1
selinux_msg_queue_msgctl---of 6
selinux_msg_queue_msgrcv---of 3
selinux_msg_queue_msgsnd---of 6
selinux_netcache_avc_callback---of 4
selinux_netlink_send54%of 13
selinux_netlink_send.cold---of 1
selinux_nf_register---of 1
selinux_nf_unregister---of 1
selinux_parse_skb.constprop.0---of 62
selinux_path_notify---of 8
selinux_perf_event_alloc---of 3
selinux_perf_event_free---of 1
selinux_perf_event_open---of 6
selinux_perf_event_read---of 1
selinux_perf_event_write---of 1
selinux_ptrace_access_check---of 4
selinux_ptrace_traceme---of 1
selinux_quota_on---of 1
selinux_quotactl---of 5
selinux_release_secctx100%of 1
selinux_req_classify_flow---of 1
selinux_sb_alloc_security100%of 3
selinux_sb_clone_mnt_opts---of 38
selinux_sb_clone_mnt_opts.cold---of 1
selinux_sb_eat_lsm_opts28%of 37
selinux_sb_free_security---of 1
selinux_sb_kern_mount100%of 1
selinux_sb_remount---of 27
selinux_sb_remount.cold---of 5
selinux_sb_show_options---of 17
selinux_sb_statfs---of 1
selinux_sctp_assoc_request---of 19
selinux_sctp_bind_connect---of 13
selinux_sctp_sk_clone---of 3
selinux_secctx_to_secid---of 1
selinux_secid_to_secctx100%of 1
selinux_secmark_refcount_dec---of 1
selinux_secmark_refcount_inc---of 1
selinux_secmark_relabel_packet---of 1
selinux_sem_alloc_security---of 1
selinux_sem_associate---of 1
selinux_sem_semctl---of 6
selinux_sem_semop---of 2
selinux_set_mnt_opts30%of 92
selinux_set_mnt_opts.cold---of 8
selinux_setprocattr---of 46
selinux_shm_alloc_security---of 1
selinux_shm_associate---of 1
selinux_shm_shmat---of 2
selinux_shm_shmctl---of 6
selinux_sk_alloc_security80%of 5
selinux_sk_clone_security---of 1
selinux_sk_free_security---of 1
selinux_sk_getsecid---of 3
selinux_skb_peerlbl_sid---of 3
selinux_skb_peerlbl_sid.cold---of 1
selinux_sock_graft---of 6
selinux_sock_rcv_skb_compat---of 6
selinux_socket_accept---of 7
selinux_socket_bind---of 31
selinux_socket_connect---of 3
selinux_socket_connect_helper.isra.0---of 17
selinux_socket_create38%of 24
selinux_socket_getpeername---of 1
selinux_socket_getpeersec_dgram43%of 14
selinux_socket_getpeersec_stream---of 15
selinux_socket_getsockname---of 1
selinux_socket_getsockopt---of 1
selinux_socket_listen---of 1
selinux_socket_post_create45%of 29
selinux_socket_recvmsg---of 1
selinux_socket_sendmsg100%of 1
selinux_socket_setsockopt---of 3
selinux_socket_shutdown---of 1
selinux_socket_sock_rcv_skb15%of 21
selinux_socket_socketpair---of 1
selinux_socket_unix_may_send---of 1
selinux_socket_unix_stream_connect---of 4
selinux_syslog---of 5
selinux_task_alloc100%of 1
selinux_task_getioprio---of 1
selinux_task_getpgid---of 1
selinux_task_getscheduler---of 1
selinux_task_getsecid100%of 1
selinux_task_getsid---of 1
selinux_task_kill---of 8
selinux_task_movememory---of 1
selinux_task_prlimit---of 6
selinux_task_setioprio---of 1
selinux_task_setnice---of 1
selinux_task_setpgid---of 1
selinux_task_setrlimit---of 4
selinux_task_setscheduler---of 1
selinux_task_to_inode100%of 4
selinux_tun_dev_alloc_security---of 3
selinux_tun_dev_attach---of 1
selinux_tun_dev_attach_queue---of 1
selinux_tun_dev_create---of 1
selinux_tun_dev_free_security---of 1
selinux_tun_dev_open---of 5
selinux_umount---of 1
selinux_vm_enough_memory100%of 1
show_sid---of 6
sock_has_perm67%of 3
task_sid50%of 18

__anon_vma_prepare---of 13
__page_set_anon_rmap64%of 11
__put_anon_vma---of 11
anon_vma_clone---of 25
anon_vma_ctor---of 1
anon_vma_fork---of 10
do_page_add_anon_rmap---of 32
flush_tlb_batched_pending---of 2
hugepage_add_anon_rmap---of 8
hugepage_add_new_anon_rmap---of 11
invalid_migration_vma---of 3
invalid_mkclean_vma---of 1
invalid_page_referenced_vma---of 24
mm_find_pmd---of 12
page_add_anon_rmap---of 2
page_add_file_rmap---of 14
page_add_new_anon_rmap62%of 18
page_address_in_vma---of 22
page_get_anon_vma---of 28
page_lock_anon_vma_read---of 39
page_mkclean---of 13
page_mkclean_one---of 21
page_move_anon_rmap---of 9
page_not_mapped---of 1
page_referenced---of 24
page_referenced_one---of 20
page_remove_rmap---of 41
page_unlock_anon_vma_read---of 1
rmap_walk---of 5
rmap_walk_anon---of 32
rmap_walk_file---of 33
rmap_walk_locked---of 5
try_to_munlock---of 15
try_to_unmap---of 21
try_to_unmap_flush---of 2
try_to_unmap_flush_dirty---of 3
try_to_unmap_one---of 119
unlink_anon_vmas---of 26

close_pdeo---of 4
close_pdeo.part.0---of 3
init_once---of 1
proc_alloc_inode100%of 3
proc_entry_rundown---of 5
proc_evict_inode---of 6
proc_free_inode---of 1
proc_get_inode48%of 23
proc_get_link---of 6
proc_invalidate_siblings_dcache---of 48
proc_put_link---of 2
proc_reg_compat_ioctl---of 11
proc_reg_get_unmapped_area---of 13
proc_reg_llseek---of 10
proc_reg_mmap---of 11
proc_reg_open67%of 18
proc_reg_poll---of 11
proc_reg_read---of 11
proc_reg_read_iter75%of 8
proc_reg_release---of 11
proc_reg_unlocked_ioctl---of 11
proc_reg_write---of 11
proc_show_options---of 11

__ext4_new_inode43%of 297
ext4_count_dirs---of 5
ext4_count_free_inodes---of 5
ext4_end_bitmap_read---of 5
ext4_free_inode---of 75
ext4_free_inode.cold---of 1
ext4_init_inode_table---of 30
ext4_mark_bitmap_end---of 2
ext4_mark_bitmap_end.part.0---of 4
ext4_mark_inode_used---of 76
ext4_orphan_get---of 27
ext4_read_inode_bitmap16%of 70
find_group_orlov---of 43
find_inode_bit20%of 21
get_orlov_stats---of 22
test_bit---of 1

__audit_bprm---of 1
__audit_fanotify---of 1
__audit_fd_pair---of 1
__audit_file---of 1
__audit_free---of 27
__audit_getcwd---of 3
__audit_getname---of 4
__audit_inode---of 93
__audit_inode.cold---of 2
__audit_inode_child---of 89
__audit_inode_child.cold---of 3
__audit_ipc_obj---of 1
__audit_ipc_set_perm---of 1
__audit_log_bprm_fcaps---of 3
__audit_log_capset---of 1
__audit_log_kern_module---of 3
__audit_log_nfcfg---of 4
__audit_mmap_fd---of 1
__audit_mq_getsetattr---of 1
__audit_mq_notify---of 3
__audit_mq_open---of 4
__audit_mq_sendrecv---of 4
__audit_ntp_log---of 4
__audit_ptrace---of 18
__audit_reusename---of 6
__audit_sockaddr---of 5
__audit_socketcall---of 4
__audit_syscall_entry---of 11
__audit_syscall_exit---of 29
__audit_tk_injoffset---of 3
_audit_getcwd---of 2
audit_alloc7%of 31
audit_alloc_name---of 6
audit_compare_gid---of 7
audit_compare_uid---of 7
audit_copy_inode---of 7
audit_core_dumps---of 4
audit_filter_inodes---of 2
audit_filter_inodes.part.0---of 22
audit_filter_rules---of 224
audit_filter_syscall.part.0.constprop.0.isra.0---of 27
audit_killed_trees---of 4
audit_log_cap---of 3
audit_log_execve_info---of 48
audit_log_exit---of 118
audit_log_pid_context---of 7
audit_log_task100%of 1
audit_seccomp100%of 3
audit_seccomp_actions_logged---of 3
audit_signal_info_syscall---of 30
auditsc_get_stamp---of 6
grow_tree_refs---of 8
unroll_tree_refs---of 11

__put_user_ns---of 1
cmp_extents_forward---of 3
cmp_extents_reverse---of 3
cmp_map_id---of 8
create_user_ns---of 15
current_in_userns---of 3
free_user_ns---of 8
from_kgid100%of 1
from_kgid_munged100%of 3
from_kprojid100%of 1
from_kprojid_munged---of 2
from_kuid100%of 1
from_kuid_munged67%of 3
gid_m_show---of 3
gid_m_start---of 5
in_userns---of 3
m_next---of 1
make_kgid100%of 1
make_kprojid100%of 1
make_kuid100%of 1
map_id_range_down84%of 12
map_id_up90%of 10
map_write---of 98
map_write.cold---of 1
ns_get_owner---of 7
proc_gid_map_write---of 5
proc_projid_map_write---of 5
proc_setgroups_show---of 2
proc_setgroups_write---of 13
proc_uid_map_write---of 5
projid_m_show---of 3
projid_m_start---of 5
uid_m_show---of 3
uid_m_start---of 7
unshare_userns---of 8
userns_get---of 22
userns_install---of 13
userns_may_setgroups---of 3
userns_owner---of 1
userns_put---of 3

__sbitmap_get_word58%of 7
__sbitmap_queue_get73%of 11
__sbitmap_queue_get_shallow---of 13
__sbitmap_weight---of 6
__sbq_wake_up---of 11
sbitmap_add_wait_queue---of 2
sbitmap_any_bit_set100%of 5
sbitmap_bitmap_show---of 22
sbitmap_del_wait_queue---of 2
sbitmap_finish_wait100%of 2
sbitmap_get100%of 14
sbitmap_get_shallow---of 12
sbitmap_init_node---of 15
sbitmap_prepare_to_wait100%of 3
sbitmap_queue_clear---of 5
sbitmap_queue_init_node---of 19
sbitmap_queue_min_shallow_depth---of 1
sbitmap_queue_resize---of 1
sbitmap_queue_show---of 9
sbitmap_queue_update_wake_batch---of 5
sbitmap_queue_wake_all---of 4
sbitmap_queue_wake_up---of 1
sbitmap_resize---of 9
sbitmap_show---of 1

__posix_acl_chmod---of 23
__posix_acl_create---of 14
acl_by_type.part.0---of 1
forget_all_cached_acls---of 14
forget_cached_acl---of 11
get_acl40%of 5
get_acl.part.0---of 22
get_cached_acl31%of 43
get_cached_acl_rcu---of 10
posix_acl_alloc---of 3
posix_acl_chmod---of 18
posix_acl_clone---of 5
posix_acl_create100%of 4
posix_acl_create.part.012%of 35
posix_acl_create_masq---of 11
posix_acl_equiv_mode---of 11
posix_acl_fix_xattr_from_user---of 2
posix_acl_fix_xattr_to_user---of 2
posix_acl_fix_xattr_userns---of 11
posix_acl_from_mode---of 5
posix_acl_from_xattr---of 18
posix_acl_init---of 1
posix_acl_permission---of 18
posix_acl_to_xattr---of 9
posix_acl_update_mode---of 9
posix_acl_valid---of 16
posix_acl_xattr_get---of 14
posix_acl_xattr_list---of 1
posix_acl_xattr_set---of 13
set_cached_acl---of 17
set_posix_acl---of 10
simple_acl_create---of 17
simple_set_acl---of 4

PDE_DATA---of 1
__proc_create---of 26
__xlate_proc_name---of 7
_proc_mkdir---of 7
get_order---of 1
pde_free---of 5
pde_put---of 9
pde_set_flags---of 6
pde_subdir_find100%of 11
proc_alloc_inum---of 3
proc_create---of 3
proc_create_data---of 3
proc_create_mount_point---of 3
proc_create_reg---of 10
proc_create_seq_private---of 5
proc_create_single_data---of 5
proc_free_inum---of 1
proc_get_parent_data---of 1
proc_getattr---of 4
proc_lookup---of 3
proc_lookup_de70%of 10
proc_misc_d_delete---of 1
proc_misc_d_revalidate---of 3
proc_mkdir---of 3
proc_mkdir_data---of 5
proc_mkdir_mode---of 5
proc_net_d_revalidate---of 1
proc_notify_change---of 3
proc_readdir---of 3
proc_readdir_de---of 25
proc_register---of 20
proc_remove---of 2
proc_seq_open---of 5
proc_seq_release---of 5
proc_set_size---of 1
proc_set_user---of 1
proc_simple_write---of 8
proc_single_open---of 1
proc_symlink---of 9
remove_proc_entry---of 17
remove_proc_subtree---of 17

__blk_mq_all_tag_iter---of 24
__blk_mq_get_tag37%of 19
__blk_mq_tag_busy---of 8
__blk_mq_tag_idle---of 6
blk_mq_all_tag_iter---of 24
blk_mq_exit_shared_sbitmap---of 1
blk_mq_find_and_get_req---of 13
blk_mq_free_tags---of 3
blk_mq_get_tag81%of 26
blk_mq_init_shared_sbitmap---of 6
blk_mq_init_tags---of 9
blk_mq_init_tags.cold---of 1
blk_mq_put_tag---of 5
blk_mq_queue_tag_busy_iter---of 47
blk_mq_tag_resize_shared_sbitmap---of 1
blk_mq_tag_update_depth---of 10
blk_mq_tag_wakeup_all---of 2
blk_mq_tagset_busy_iter---of 5
blk_mq_tagset_count_completed_rqs---of 4
blk_mq_tagset_wait_completed_request---of 8
blk_mq_unique_tag---of 1
bt_iter---of 8
bt_tags_iter---of 13

__shmem_file_setup60%of 10
shmem_add_to_page_cache46%of 72
shmem_alloc_inode100%of 3
shmem_alloc_page50%of 4
shmem_charge---of 9
shmem_create---of 1
shmem_destroy_inode100%of 2
shmem_encode_fh---of 8
shmem_evict_inode34%of 24
shmem_fallocate---of 42
shmem_fault---of 18
shmem_fh_to_dentry---of 7
shmem_file_llseek---of 11
shmem_file_read_iter---of 49
shmem_file_setup100%of 1
shmem_file_setup_with_mnt---of 1
shmem_fill_super---of 19
shmem_free_fc---of 4
shmem_free_in_core_inode---of 3
shmem_free_swap---of 3
shmem_get_inode33%of 28
shmem_get_inode.cold---of 1
shmem_get_link---of 17
shmem_get_parent---of 1
shmem_get_policy---of 1
shmem_get_tree---of 1
shmem_get_unmapped_area---of 7
shmem_getattr---of 3
shmem_getpage---of 1
shmem_getpage_gfp.constprop.027%of 103
shmem_init_fs_context---of 3
shmem_init_inode---of 1
shmem_initxattrs---of 6
shmem_kernel_file_setup---of 1
shmem_link---of 11
shmem_listxattr---of 1
shmem_lock---of 8
shmem_mapping100%of 1
shmem_match---of 4
shmem_mcopy_atomic_pte---of 1
shmem_mfill_atomic_pte---of 79
shmem_mfill_zeropage_pte---of 1
shmem_mkdir---of 3
shmem_mknod---of 8
shmem_mmap---of 6
shmem_parse_one---of 27
shmem_parse_options---of 11
shmem_partial_swap_usage---of 49
shmem_put_link---of 6
shmem_put_super---of 3
shmem_read_mapping_page_gfp---of 6
shmem_recalc_inode80%of 5
shmem_reconfigure---of 28
shmem_rename2---of 20
shmem_replace_entry---of 7
shmem_rmdir---of 3
shmem_seek_hole_data---of 25
shmem_set_policy---of 1
shmem_setattr36%of 28
shmem_show_options---of 17
shmem_statfs---of 5
shmem_swap_usage---of 11
shmem_swapin---of 4
shmem_swapin_page---of 115
shmem_symlink---of 21
shmem_tmpfile---of 7
shmem_truncate_range---of 1
shmem_uncharge---of 4
shmem_undo_range31%of 92
shmem_unlink---of 5
shmem_unlock_mapping---of 8
shmem_unuse---of 12
shmem_unuse_inode---of 80
shmem_write_begin50%of 6
shmem_write_end66%of 23
shmem_writepage---of 69
shmem_xattr_handler_get100%of 1
shmem_xattr_handler_set---of 1
shmem_zero_setup---of 6
synchronous_wake_function---of 1
vma_is_shmem---of 1

__generic_file_fsync---of 8
alloc_anon_inode---of 3
always_delete_dentry100%of 1
anon_set_page_dirty---of 1
dcache_dir_close---of 1
dcache_dir_lseek---of 11
dcache_dir_open---of 2
dcache_readdir---of 15
empty_dir_getattr---of 1
empty_dir_listxattr---of 1
empty_dir_llseek---of 1
empty_dir_lookup---of 1
empty_dir_readdir---of 7
empty_dir_setattr---of 1
generic_check_addressable---of 4
generic_fh_to_dentry---of 5
generic_fh_to_parent---of 7
generic_file_fsync---of 3
generic_read_dir---of 1
init_pseudo---of 3
is_empty_dir_inode---of 4
kfree_link100%of 1
make_empty_dir_inode---of 1
memory_read_from_buffer---of 4
noop_direct_IO---of 1
noop_fsync---of 1
noop_set_page_dirty---of 1
pseudo_fs_fill_super---of 5
pseudo_fs_free---of 1
pseudo_fs_get_tree---of 1
scan_positives---of 14
simple_attr_open---of 3
simple_attr_read---of 11
simple_attr_release---of 1
simple_attr_write---of 1
simple_attr_write_signed---of 1
simple_attr_write_xsigned.constprop.0.isra.0---of 11
simple_empty---of 6
simple_fill_super---of 13
simple_fill_super.cold---of 1
simple_get_link---of 1
simple_getattr---of 1
simple_link---of 3
simple_lookup84%of 6
simple_nosetlease---of 1
simple_open---of 3
simple_pin_fs---of 7
simple_read_from_buffer---of 8
simple_readpage---of 3
simple_recursive_removal---of 33
simple_release_fs---of 3
simple_rename---of 9
simple_rmdir---of 3
simple_setattr---of 5
simple_statfs---of 1
simple_transaction_get---of 6
simple_transaction_read---of 3
simple_transaction_release---of 1
simple_transaction_set---of 3
simple_unlink---of 1
simple_write_begin---of 12
simple_write_end---of 22
simple_write_to_buffer---of 8

anon_inode_getfd80%of 5
anon_inode_getfile63%of 8
anon_inodefs_dname---of 1
anon_inodefs_init_fs_context---of 3

__ia32_compat_sys_old_semctl---of 1
__ia32_compat_sys_semctl---of 1
__ia32_sys_semctl---of 1
__ia32_sys_semget---of 4
__ia32_sys_semop---of 1
__ia32_sys_semtimedop---of 6
__ia32_sys_semtimedop_time32---of 6
__x64_sys_semctl---of 1
__x64_sys_semget---of 4
__x64_sys_semop---of 1
__x64_sys_semtimedop---of 6
__x64_sys_semtimedop_time32---of 6
compat_ksys_old_semctl---of 1
compat_ksys_semctl---of 15
compat_ksys_semtimedop---of 6
copy_compat_semid_to_user---of 4
copy_overflow---of 1
copy_semid_from_user.constprop.0---of 2
copy_semid_to_user.constprop.0---of 1
copy_semundo22%of 14
count_semcnt---of 19
count_semcnt.cold---of 2
do_semtimedop---of 281
do_smart_wakeup_zero---of 12
exit_sem2%of 124
freeary---of 62
ksys_semctl.constprop.0---of 12
ksys_semget---of 4
ksys_semtimedop---of 6
lookup_undo---of 8
newary---of 27
perform_atomic_semop---of 50
sem_exit_ns---of 1
sem_init_ns---of 1
sem_more_checks---of 2
sem_rcu_free---of 1
semctl_down---of 39
semctl_info.part.0---of 10
semctl_main---of 164
semctl_setval---of 103
semctl_stat---of 31
sysvipc_sem_proc_show---of 12
update_queue---of 20
wake_const_ops---of 15

inode_newsize_ok72%of 7
notify_change45%of 79
setattr_copy58%of 14
setattr_prepare33%of 28
setattr_should_drop_sgid---of 5
setattr_should_drop_suidgid30%of 10

kobj_lookup85%of 13
kobj_map---of 11
kobj_map_init---of 7
kobj_unmap---of 8

__key_instantiate_and_link---of 15
generic_key_instantiate---of 3
key_alloc---of 60
key_create_or_update---of 56
key_instantiate_and_link---of 14
key_invalidate---of 4
key_lookup---of 19
key_payload_reserve---of 13
key_put100%of 2
key_put.part.060%of 5
key_reject_and_link---of 33
key_revoke---of 7
key_set_timeout---of 3
key_type_lookup---of 6
key_type_put---of 1
key_update---of 10
key_user_lookup---of 17
key_user_put---of 2
register_key_type---of 5
register_key_type.cold---of 1
unregister_key_type---of 1

get_rock_ridge_filename---of 26
get_rock_ridge_filename.cold---of 1
parse_rock_ridge_inode86%of 7
parse_rock_ridge_inode_internal.part.013%of 56
parse_rock_ridge_inode_internal.part.0.cold---of 11
rock_check_overflow---of 3
rock_check_overflow.cold---of 1
rock_continue---of 10
rock_continue.cold---of 2
rock_ridge_symlink_readpage---of 50
rock_ridge_symlink_readpage.cold---of 3
setup_rock_ridge56%of 9

__fsnotify_recalc_mask---of 25
fsnotify_add_mark---of 5
fsnotify_add_mark_locked---of 58
fsnotify_add_mark_locked.cold---of 1
fsnotify_clear_marks_by_group---of 19
fsnotify_compare_groups34%of 9
fsnotify_conn_mask---of 10
fsnotify_connector_destroy_workfn---of 2
fsnotify_destroy_mark---of 5
fsnotify_destroy_marks17%of 18
fsnotify_detach_connector_from_object---of 10
fsnotify_detach_mark---of 13
fsnotify_drop_object---of 5
fsnotify_final_mark_destroy---of 3
fsnotify_find_mark---of 10
fsnotify_finish_user_wait---of 5
fsnotify_free_mark---of 4
fsnotify_get_mark---of 7
fsnotify_grab_connector50%of 16
fsnotify_init_mark---of 1
fsnotify_mark_destroy_workfn---of 3
fsnotify_prepare_user_wait---of 23
fsnotify_put_mark---of 17
fsnotify_put_mark_wake.part.0---of 3
fsnotify_put_sb_connectors---of 7
fsnotify_recalc_mask---of 2
fsnotify_recalc_mask.part.0---of 9
fsnotify_wait_marks_destroyed---of 1

__ext4_ext_check56%of 47
__ext4_ext_dirty88%of 8
__read_extent_tree_block50%of 22
ext4_alloc_file_blocks.isra.0---of 35
ext4_cache_extents55%of 11
ext4_can_extents_be_merged.constprop.077%of 13
ext4_clu_mapped---of 25
ext4_convert_unwritten_extents---of 18
ext4_convert_unwritten_io_end_vec---of 13
ext4_datasem_ensure_credits---of 7
ext4_es_is_delayed---of 1
ext4_ext_calc_credits_for_single_extent---of 7
ext4_ext_check_inode---of 1
ext4_ext_clear_bb---of 31
ext4_ext_correct_indexes42%of 12
ext4_ext_drop_refs---of 5
ext4_ext_find_goal78%of 9
ext4_ext_get_access.constprop.0.isra.0100%of 5
ext4_ext_index_trans_blocks60%of 5
ext4_ext_insert_extent30%of 202
ext4_ext_map_blocks24%of 331
ext4_ext_mark_unwritten---of 3
ext4_ext_next_allocated_block75%of 12
ext4_ext_precache---of 23
ext4_ext_remove_space---of 237
ext4_ext_replay_set_iblocks---of 73
ext4_ext_replay_shrink_inode---of 20
ext4_ext_replay_update_ex---of 41
ext4_ext_rm_idx---of 21
ext4_ext_search_right36%of 31
ext4_ext_shift_extents---of 83
ext4_ext_tree_init100%of 1
ext4_ext_truncate---of 8
ext4_ext_try_to_merge50%of 14
ext4_ext_try_to_merge_right24%of 21
ext4_extent_block_csum_set67%of 9
ext4_fallocate---of 203
ext4_fiemap---of 8
ext4_fiemap_check_ranges---of 9
ext4_find_extent55%of 53
ext4_get_es_cache---of 28
ext4_iomap_xattr_begin---of 10
ext4_rereserve_cluster---of 1
ext4_split_convert_extents---of 9
ext4_split_extent---of 20
ext4_split_extent_at---of 70
ext4_swap_extents---of 78
get_implied_cluster_alloc.isra.0---of 30
get_order---of 1
skip_hole---of 4
trace_ext4_ext_convert_to_initialized_fastpath---of 9

__iomap_dio_rw53%of 59
iomap_dio_actor14%of 23
iomap_dio_actor.cold---of 1
iomap_dio_bio_actor71%of 44
iomap_dio_bio_end_io---of 13
iomap_dio_complete77%of 21
iomap_dio_complete_work---of 1
iomap_dio_iopoll---of 3
iomap_dio_rw80%of 5
iomap_dio_submit_bio58%of 7
iomap_dio_zero---of 10

__alloc_percpu100%of 1
__alloc_percpu_gfp---of 1
__alloc_reserved_percpu---of 1
__is_kernel_percpu_address---of 7
__pcpu_chunk_move---of 4
__traceiter_percpu_alloc_percpu---of 4
__traceiter_percpu_alloc_percpu_fail---of 4
__traceiter_percpu_create_chunk---of 4
__traceiter_percpu_destroy_chunk---of 4
__traceiter_percpu_free_percpu---of 4
cpumask_weight.constprop.0---of 1
free_percpu---of 44
is_kernel_percpu_address---of 5
pcpu_alloc32%of 105
pcpu_alloc.cold---of 2
pcpu_alloc_area80%of 29
pcpu_balance_workfn---of 70
pcpu_block_refresh_hint80%of 5
pcpu_block_update47%of 26
pcpu_block_update_hint_alloc43%of 33
pcpu_chunk_populated---of 5
pcpu_chunk_refresh_hint---of 7
pcpu_chunk_relocate72%of 7
pcpu_create_chunk---of 22
pcpu_dump_alloc_info---of 22
pcpu_find_block_fit58%of 14
pcpu_free_area---of 36
pcpu_free_pages.constprop.0---of 6
pcpu_get_pages---of 7
pcpu_init_md_blocks---of 2
pcpu_mem_zalloc---of 8
pcpu_next_fit_region.constprop.077%of 17
pcpu_next_md_free_region---of 11
pcpu_nr_pages---of 1
pcpu_populate_chunk---of 30
per_cpu_ptr_to_phys---of 13
percpu_ref_put_many.constprop.0---of 17
perf_trace_percpu_alloc_percpu---of 6
perf_trace_percpu_alloc_percpu_fail---of 6
perf_trace_percpu_create_chunk---of 6
perf_trace_percpu_destroy_chunk---of 6
perf_trace_percpu_free_percpu---of 6
set_bit---of 1
trace_event_raw_event_percpu_alloc_percpu---of 10
trace_event_raw_event_percpu_alloc_percpu_fail---of 10
trace_event_raw_event_percpu_create_chunk---of 10
trace_event_raw_event_percpu_destroy_chunk---of 10
trace_event_raw_event_percpu_free_percpu---of 10
trace_raw_output_percpu_alloc_percpu---of 4
trace_raw_output_percpu_alloc_percpu_fail---of 4
trace_raw_output_percpu_create_chunk---of 5
trace_raw_output_percpu_destroy_chunk---of 5
trace_raw_output_percpu_free_percpu---of 4

__do_munmap---of 76
__do_sys_brk---of 25
__do_sys_remap_file_pages---of 27
__do_sys_remap_file_pages.cold---of 1
__ia32_sys_brk---of 1
__ia32_sys_mmap_pgoff---of 1
__ia32_sys_munmap---of 1
__ia32_sys_remap_file_pages---of 1
__install_special_mapping---of 13
__remove_shared_vm_struct---of 6
__split_vma---of 25
__traceiter_vm_unmapped_area---of 4
__vm_munmap---of 5
__vma_adjust---of 182
__vma_link_file---of 6
__vma_link_rb---of 30
__vma_rb_erase---of 69
__x64_sys_brk---of 1
__x64_sys_mmap_pgoff---of 1
__x64_sys_munmap---of 1
__x64_sys_remap_file_pages---of 1
_install_special_mapping---of 1
anon_vma_compatible---of 7
can_vma_merge_before---of 12
copy_vma---of 32
do_brk_flags---of 32
do_mmap---of 76
do_munmap---of 1
exit_mmap---of 17
expand_downwards---of 48
expand_stack---of 1
find_extend_vma---of 7
find_mergeable_anon_vma---of 11
find_vma100%of 10
find_vma_prev---of 14
get_unmapped_area---of 17
init_admin_reserve---of 1
init_user_reserve---of 1
insert_vm_struct---of 17
install_special_mapping---of 3
ksys_mmap_pgoff---of 21
may_expand_vm---of 4
may_expand_vm.part.0---of 5
may_expand_vm.part.0.cold---of 2
mm_drop_all_locks---of 19
mm_take_all_locks---of 30
mmap_region---of 86
perf_trace_vm_unmapped_area---of 6
remove_vma---of 5
special_mapping_fault---of 14
special_mapping_mremap---of 6
special_mapping_name---of 1
split_vma---of 3
trace_event_raw_event_vm_unmapped_area---of 10
trace_raw_output_vm_unmapped_area---of 5
unlink_file_vma---of 2
unmap_region---of 14
vm_brk---of 7
vm_brk_flags---of 7
vm_get_page_prot---of 3
vm_lock_mapping---of 4
vm_munmap---of 1
vm_stat_account---of 6
vm_unmapped_area---of 72
vma_gap_callbacks_rotate---of 10
vma_is_special_mapping---of 6
vma_link---of 34
vma_merge---of 67
vma_set_page_prot---of 7
vma_wants_writenotify---of 16

__asymmetric_key_hex_to_key_id---of 1
asymmetric_key_cmp---of 7
asymmetric_key_cmp_partial---of 7
asymmetric_key_describe58%of 7
asymmetric_key_destroy---of 4
asymmetric_key_eds_op---of 6
asymmetric_key_free_preparse---of 5
asymmetric_key_generate_id---of 5
asymmetric_key_hex_to_key_id---of 3
asymmetric_key_hex_to_key_id.part.0---of 5
asymmetric_key_id_partial---of 6
asymmetric_key_id_same---of 6
asymmetric_key_match_free---of 1
asymmetric_key_match_preparse---of 15
asymmetric_key_preparse---of 6
asymmetric_key_verify_signature---of 1
asymmetric_lookup_restriction---of 22
find_asymmetric_key---of 17
get_order---of 1
register_asymmetric_key_parser---of 4
register_asymmetric_key_parser.cold---of 3
unregister_asymmetric_key_parser---of 1

__ext4_set_acl---of 21
ext4_get_acl---of 35
ext4_init_acl27%of 19
ext4_set_acl---of 20
get_order---of 1

ext4_init_security100%of 1
ext4_initxattrs100%of 4
ext4_xattr_security_get100%of 1
ext4_xattr_security_set---of 1

ext4_sync_file38%of 54

__ia32_compat_sys_epoll_pwait---of 8
__ia32_sys_epoll_create---of 9
__ia32_sys_epoll_create1---of 1
__ia32_sys_epoll_ctl---of 4
__ia32_sys_epoll_pwait---of 9
__ia32_sys_epoll_wait---of 1
__x64_sys_epoll_create---of 9
__x64_sys_epoll_create1---of 1
__x64_sys_epoll_ctl---of 4
__x64_sys_epoll_pwait---of 9
__x64_sys_epoll_wait---of 1
do_epoll_create---of 8
do_epoll_ctl---of 149
do_epoll_wait---of 61
ep_alloc.constprop.0---of 10
ep_autoremove_wake_function---of 1
ep_busy_loop_end---of 6
ep_call_nested.constprop.0---of 7
ep_create_wakeup_source---of 5
ep_destroy_wakeup_source---of 7
ep_eventpoll_poll---of 5
ep_eventpoll_release---of 3
ep_free---of 7
ep_item_poll---of 9
ep_loop_check_proc---of 11
ep_pm_stay_awake_rcu---of 20
ep_poll_callback33%of 37
ep_ptable_queue_proc---of 7
ep_read_events_proc---of 10
ep_remove---of 15
ep_scan_ready_list---of 27
ep_send_events_proc---of 41
ep_show_fdinfo---of 4
ep_unregister_pollwait.constprop.0---of 17
epi_rcu_free---of 1
eventpoll_release_file---of 3
get_epoll_tfile_raw_ptr---of 9
reverse_path_check_proc---of 20
reverse_path_check_proc.cold---of 1

ext4_block_bitmap_csum_set70%of 10
ext4_block_bitmap_csum_verify---of 12
ext4_count_free---of 1
ext4_inode_bitmap_csum_set70%of 10
ext4_inode_bitmap_csum_verify---of 12

_find_next_bit.constprop.092%of 12
find_first_bit80%of 5
find_first_zero_bit100%of 5
find_last_bit84%of 6
find_next_and_bit100%of 1
find_next_bit100%of 1
find_next_clump8---of 3
find_next_zero_bit100%of 1

__first_packet_length---of 20
__skb_recv_udp---of 33
__udp4_lib_err---of 40
__udp4_lib_lookup---of 51
__udp4_lib_rcv---of 159
__udp_disconnect---of 13
__udp_enqueue_schedule_skb---of 25
first_packet_length---of 9
skb_consume_udp---of 16
udp4_hwcsum---of 7
udp4_lib_lookup2.isra.0---of 48
udp4_lib_lookup_skb---of 10
udp4_proc_exit---of 1
udp4_proc_exit_net---of 1
udp4_proc_init_net---of 2
udp4_seq_show---of 4
udp_abort---of 3
udp_cmsg_send---of 12
udp_destroy_sock---of 12
udp_destruct_common---of 6
udp_destruct_sock---of 1
udp_disconnect---of 1
udp_ehashfn---of 6
udp_encap_disable---of 1
udp_encap_enable---of 1
udp_err---of 1
udp_flow_hashrnd---of 6
udp_flush_pending_frames---of 2
udp_get_first.isra.0---of 13
udp_get_next.isra.0---of 12
udp_getsockopt---of 5
udp_init_sock100%of 1
udp_ioctl---of 6
udp_lib_close---of 1
udp_lib_get_port---of 62
udp_lib_getsockopt---of 14
udp_lib_hash---of 1
udp_lib_lport_inuse---of 22
udp_lib_lport_inuse2---of 18
udp_lib_rehash---of 14
udp_lib_setsockopt---of 44
udp_lib_unhash---of 16
udp_poll---of 8
udp_pre_connect---of 2
udp_push_pending_frames---of 3
udp_queue_rcv_one_skb---of 80
udp_queue_rcv_skb---of 31
udp_rcv---of 1
udp_recvmsg---of 73
udp_rmem_release---of 11
udp_send_skb---of 43
udp_sendmsg---of 126
udp_sendmsg.cold---of 1
udp_sendpage---of 21
udp_seq_next---of 4
udp_seq_start---of 6
udp_seq_stop---of 4
udp_set_csum---of 9
udp_setsockopt---of 5
udp_sk_rx_dst_set---of 5
udp_skb_destructor---of 1
udp_sysctl_init---of 1
udp_unicast_rcv_skb---of 8
udp_v4_early_demux---of 66
udp_v4_get_port---of 1
udp_v4_rehash---of 1
udplite_getfrag---of 4

__ext4_fc_track_create50%of 10
__ext4_fc_track_link---of 10
__ext4_fc_track_unlink---of 10
__track_dentry_update---of 14
__track_inode---of 3
__track_range---of 6
ext4_end_buffer_io_sync---of 7
ext4_fc_add_dentry_tlv---of 3
ext4_fc_add_tlv---of 3
ext4_fc_cleanup---of 25
ext4_fc_commit20%of 87
ext4_fc_del---of 8
ext4_fc_destroy_dentry_cache---of 1
ext4_fc_info_show---of 5
ext4_fc_init---of 2
ext4_fc_init_inode100%of 1
ext4_fc_mark_ineligible20%of 5
ext4_fc_memcpy---of 7
ext4_fc_record_modified_inode.isra.0---of 8
ext4_fc_record_regions---of 9
ext4_fc_replay---of 220
ext4_fc_replay_check_excluded---of 8
ext4_fc_replay_cleanup---of 1
ext4_fc_replay_link_internal.isra.0---of 15
ext4_fc_reserve_space---of 19
ext4_fc_set_bitmaps_and_counters---of 14
ext4_fc_start_ineligible---of 5
ext4_fc_start_update---of 7
ext4_fc_stop_ineligible---of 3
ext4_fc_stop_update---of 4
ext4_fc_submit_bh---of 8
ext4_fc_track_create100%of 1
ext4_fc_track_inode59%of 12
ext4_fc_track_link---of 1
ext4_fc_track_range59%of 12
ext4_fc_track_template15%of 14
ext4_fc_track_unlink---of 1
ext4_fc_wait_committing_inode---of 4
ext4_fc_write_inode---of 10
ext4_fc_write_inode_data---of 15

__activate_page62%of 26
__page_cache_release12%of 26
__pagevec_lru_add---of 1
__pagevec_lru_add_fn52%of 54
__pagevec_release60%of 15
__put_compound_page---of 5
__put_page75%of 4
__traceiter_mm_lru_activate---of 4
__traceiter_mm_lru_insertion---of 4
deactivate_file_page---of 23
deactivate_page---of 30
get_kernel_page---of 8
get_kernel_pages---of 12
lru_add_drain---of 13
lru_add_drain_all---of 15
lru_add_drain_cpu33%of 28
lru_add_drain_cpu_zone---of 13
lru_add_drain_per_cpu---of 13
lru_cache_add54%of 32
lru_cache_add_inactive_or_unevictable40%of 10
lru_deactivate_file_fn---of 31
lru_deactivate_fn---of 20
lru_lazyfree_fn67%of 27
lru_note_cost---of 28
lru_note_cost_page---of 3
mark_page_accessed62%of 71
mark_page_lazyfree---of 37
pagevec_lookup_entries100%of 1
pagevec_lookup_range---of 1
pagevec_lookup_range_nr_tag---of 1
pagevec_lookup_range_tag100%of 1
pagevec_lru_move_fn89%of 9
pagevec_move_tail---of 1
pagevec_move_tail_fn---of 30
pagevec_remove_exceptionals100%of 6
perf_trace_mm_lru_activate---of 6
perf_trace_mm_lru_insertion---of 24
put_pages_list---of 7
release_pages60%of 52
rotate_reclaimable_page---of 37
trace_event_raw_event_mm_lru_activate---of 10
trace_event_raw_event_mm_lru_insertion---of 28
trace_raw_output_mm_lru_activate---of 4
trace_raw_output_mm_lru_insertion---of 10

__blk_queue_split46%of 66
__blk_rq_map_sg43%of 63
attempt_merge---of 14
attempt_merge.part.02%of 121
bio_attempt_back_merge---of 16
bio_attempt_discard_merge---of 29
bio_attempt_front_merge---of 69
blk_account_io_merge_bio---of 11
blk_attempt_bio_merge.part.0---of 16
blk_attempt_plug_merge34%of 9
blk_attempt_req_merge80%of 15
blk_bio_list_merge---of 6
blk_mq_sched_try_merge10%of 21
blk_queue_split---of 1
blk_recalc_rq_segments---of 18
blk_rq_merge_ok79%of 14
blk_rq_set_mixed_merge---of 9
blk_try_merge58%of 7
bvec_split_segs---of 13
ll_back_merge_fn---of 59

__crc32c_le100%of 1
__crc32c_le_shift---of 1
crc32_be---of 1
crc32_body100%of 12
crc32_generic_shift---of 20
crc32_le_base---of 1
crc32_le_shift---of 1

__blk_send_generic.constprop.0---of 5
blk_verify_command---of 6
copy_overflow---of 1
get_order---of 1
get_sg_io_hdr---of 5
put_sg_io_hdr---of 5
scsi_cdrom_send_packet---of 19
scsi_cmd_blk_ioctl---of 5
scsi_cmd_ioctl---of 21
scsi_cmd_ioctl.cold---of 2
scsi_req_init100%of 1
scsi_verify_blk_ioctl---of 4
sg_io---of 41
sg_scsi_ioctl---of 32

proc_key_users_next---of 5
proc_key_users_show---of 4
proc_key_users_start---of 14
proc_key_users_stop---of 1
proc_keys_next67%of 6
proc_keys_show67%of 53
proc_keys_start78%of 18
proc_keys_stop100%of 1

__ia32_sys_fdatasync---of 7
__ia32_sys_fsync---of 6
__ia32_sys_sync_file_range---of 5
__ia32_sys_sync_file_range2---of 5
__ia32_sys_syncfs---of 12
__x64_sys_fdatasync---of 7
__x64_sys_fsync---of 6
__x64_sys_sync---of 1
__x64_sys_sync_file_range---of 5
__x64_sys_sync_file_range2---of 5
__x64_sys_syncfs---of 12
do_sync_work---of 1
emergency_sync---of 2
fdatawait_one_bdev---of 1
fdatawrite_one_bdev---of 1
ksys_sync---of 3
ksys_sync_file_range---of 5
sync_file_range---of 15
sync_filesystem---of 5
sync_filesystem.part.0---of 8
sync_fs_one_sb---of 4
sync_inodes_one_sb---of 2
vfs_fsync---of 6
vfs_fsync_range84%of 6

__ia32_sys_getrandom---of 11
__x64_sys_getrandom---of 11
_credit_init_bits---of 12
_get_random_bytes.part.072%of 7
add_bootloader_randomness---of 3
add_device_randomness---of 1
add_disk_randomness---of 3
add_hwgenerator_randomness---of 4
add_hwgenerator_randomness.cold---of 1
add_input_randomness---of 2
add_interrupt_randomness---of 8
add_timer_randomness---of 12
add_timer_randomness.cold---of 1
blake2s.constprop.0---of 1
crng_fast_key_erasure67%of 3
crng_make_state43%of 33
crng_reseed---of 4
entropy_timer---of 2
extract_entropy.constprop.0---of 10
fast_mix---of 1
get_random_bytes---of 2
get_random_bytes_arch---of 8
get_random_bytes_user---of 15
get_random_u32---of 24
get_random_u6471%of 24
mix_interrupt_randomness---of 7
mix_interrupt_randomness.cold---of 3
mix_pool_bytes---of 1
proc_do_rointvec---of 3
proc_do_uuid---of 8
rand_initialize_disk---of 2
random_fasync---of 1
random_ioctl---of 21
random_ioctl.cold---of 2
random_online_cpu---of 1
random_poll---of 5
random_prepare_cpu---of 1
random_read_iter---of 8
random_write_iter---of 4
register_random_ready_notifier---of 5
rng_is_initialized---of 1
try_to_generate_entropy---of 11
unregister_random_ready_notifier---of 1
urandom_read_iter---of 6
urandom_read_iter.cold---of 2
wait_for_random_bytes---of 2
wait_for_random_bytes.cold---of 10
write_pool_user.part.0---of 8

__seq_open_private---of 5
copy_overflow---of 1
mangle_path---of 9
seq_dentry---of 15
seq_escape---of 11
seq_escape_mem_ascii---of 11
seq_file_path---of 1
seq_hex_dump---of 20
seq_hlist_next---of 5
seq_hlist_next_percpu---of 6
seq_hlist_next_rcu---of 12
seq_hlist_start---of 5
seq_hlist_start_head---of 6
seq_hlist_start_head_rcu---of 3
seq_hlist_start_percpu---of 7
seq_hlist_start_rcu---of 16
seq_list_next---of 2
seq_list_start---of 5
seq_list_start_head---of 6
seq_lseek---of 15
seq_open80%of 5
seq_open_private---of 2
seq_pad---of 7
seq_path---of 15
seq_path_root---of 17
seq_printf100%of 3
seq_put_decimal_ll---of 16
seq_put_decimal_ull---of 1
seq_put_decimal_ull_width---of 12
seq_put_hex_ll---of 16
seq_putc100%of 2
seq_puts67%of 3
seq_read---of 16
seq_read_iter57%of 62
seq_read_iter.cold---of 1
seq_release---of 1
seq_release_private---of 1
seq_vprintf---of 3
seq_write---of 4
single_next---of 1
single_open67%of 6
single_open_size---of 6
single_release---of 1
single_start---of 1
traverse.part.058%of 19

__get_fs_type100%of 7
__ia32_sys_sysfs---of 11
__x64_sys_sysfs---of 11
filesystems_proc_show---of 4
fs_index---of 8
fs_name---of 11
get_filesystem100%of 1
get_fs_type28%of 11
get_fs_type.cold---of 1
put_filesystem100%of 1
register_filesystem---of 12
unregister_filesystem---of 9

__d_path---of 7
__dentry_path66%of 29
__do_sys_getcwd---of 43
__ia32_sys_getcwd---of 1
__x64_sys_getcwd---of 1
d_absolute_path---of 7
d_path55%of 35
dentry_path---of 8
dentry_path_raw100%of 1
dynamic_dname---of 4
prepend_name100%of 5
prepend_path68%of 70
simple_dname---of 5

netlbl_domhsh_add---of 3
netlbl_domhsh_add.part.0---of 145
netlbl_domhsh_add_default---of 3
netlbl_domhsh_audit_add---of 15
netlbl_domhsh_free_entry---of 25
netlbl_domhsh_getentry80%of 5
netlbl_domhsh_getentry_af4---of 9
netlbl_domhsh_getentry_af6---of 9
netlbl_domhsh_hash67%of 9
netlbl_domhsh_remove---of 26
netlbl_domhsh_remove_af4---of 51
netlbl_domhsh_remove_af6---of 51
netlbl_domhsh_remove_default---of 1
netlbl_domhsh_remove_entry---of 70
netlbl_domhsh_search40%of 15
netlbl_domhsh_search_def.part.032%of 22
netlbl_domhsh_validate---of 34
netlbl_domhsh_walk---of 31

__delayacct_add_tsk---of 9
__delayacct_blkio_end---of 5
__delayacct_blkio_start100%of 1
__delayacct_blkio_ticks---of 1
__delayacct_freepages_end---of 2
__delayacct_freepages_start---of 1
__delayacct_thrashing_end---of 2
__delayacct_thrashing_start---of 1
__delayacct_tsk_init100%of 2
delayacct_init---of 3

__do_compat_sys_newfstat---of 3
__do_compat_sys_newfstatat---of 4
__do_compat_sys_newlstat---of 4
__do_compat_sys_newstat---of 4
__do_sys_fstat---of 3
__do_sys_lstat---of 4
__do_sys_newfstat---of 3
__do_sys_newfstatat---of 4
__do_sys_newlstat---of 4
__do_sys_newstat---of 4
__do_sys_stat---of 4
__ia32_compat_sys_newfstat---of 1
__ia32_compat_sys_newfstatat---of 1
__ia32_compat_sys_newlstat---of 1
__ia32_compat_sys_newstat---of 1
__ia32_sys_fstat---of 1
__ia32_sys_lstat---of 1
__ia32_sys_newfstat---of 1
__ia32_sys_newfstatat---of 1
__ia32_sys_newlstat---of 1
__ia32_sys_newstat---of 1
__ia32_sys_readlink---of 1
__ia32_sys_readlinkat---of 1
__ia32_sys_stat---of 1
__ia32_sys_statx---of 1
__inode_add_bytes---of 2
__inode_sub_bytes---of 3
__x64_sys_fstat---of 1
__x64_sys_lstat---of 1
__x64_sys_newfstat---of 1
__x64_sys_newfstatat---of 1
__x64_sys_newlstat---of 1
__x64_sys_newstat---of 1
__x64_sys_readlink---of 1
__x64_sys_readlinkat---of 1
__x64_sys_stat---of 1
__x64_sys_statx---of 1
cp_compat_stat---of 11
cp_new_stat---of 2
cp_old_stat---of 13
cp_old_stat.cold---of 1
cp_statx---of 2
do_readlinkat---of 11
do_statx---of 5
generic_fillattr---of 1
inode_add_bytes67%of 3
inode_get_bytes---of 1
inode_set_bytes---of 1
inode_sub_bytes---of 4
vfs_fstat---of 6
vfs_fstatat---of 1
vfs_getattr---of 3
vfs_getattr_nosec---of 8
vfs_statx---of 14

cgroup_freezing---of 22
freeze_cgroup---of 4
freezer_apply_state---of 12
freezer_attach---of 9
freezer_css_alloc---of 4
freezer_css_free---of 1
freezer_css_offline---of 3
freezer_css_online---of 5
freezer_fork52%of 29
freezer_parent_freezing_read---of 1
freezer_read---of 68
freezer_self_freezing_read---of 1
freezer_write---of 72
unfreeze_cgroup---of 4
update_if_frozen---of 38

bsearch100%of 5

_atomic_dec_and_lock72%of 7
_atomic_dec_and_lock_irqsave---of 6

__bio_add_page60%of 10
__bio_clone_fast---of 6
__bio_try_merge_page40%of 10
bio_add_hw_page---of 17
bio_add_page100%of 6
bio_add_pc_page---of 1
bio_advance---of 11
bio_alloc_bioset45%of 27
bio_alloc_rescue---of 6
bio_chain---of 4
bio_chain_endio---of 7
bio_check_pages_dirty---of 16
bio_clone_fast---of 8
bio_copy_data---of 1
bio_copy_data_iter---of 26
bio_devname---of 1
bio_dirty_fn---of 8
bio_endio---of 30
bio_free---of 3
bio_free_pages---of 7
bio_init100%of 1
bio_iov_iter_get_pages29%of 71
bio_list_copy_data---of 9
bio_put---of 3
bio_put.part.0---of 4
bio_put_slab---of 11
bio_release_pages---of 2
bio_release_pages.part.0---of 14
bio_reset---of 1
bio_set_pages_dirty---of 7
bio_split---of 22
bio_trim---of 12
bio_truncate---of 2
bio_truncate.part.0---of 21
bioset_exit---of 5
bioset_init---of 30
bioset_init_from_src---of 3
biovec_init_pool---of 1
bvec_alloc39%of 13
bvec_free---of 6
bvec_nr_vecs---of 1
get_order---of 1
guard_bio_eod56%of 20
punt_bios_to_rescuer---of 25
submit_bio_wait---of 5
submit_bio_wait_endio---of 1
zero_fill_bio_iter---of 12

find_revoke_record---of 6
flush_descriptor.part.0---of 5
insert_revoke_hash---of 3
jbd2_clear_buffer_revoked_flags---of 6
jbd2_journal_cancel_revoke30%of 10
jbd2_journal_clear_revoke---of 4
jbd2_journal_destroy_revoke---of 4
jbd2_journal_destroy_revoke_record_cache---of 1
jbd2_journal_destroy_revoke_table---of 5
jbd2_journal_destroy_revoke_table_cache---of 1
jbd2_journal_init_revoke---of 10
jbd2_journal_init_revoke_table---of 11
jbd2_journal_revoke---of 21
jbd2_journal_revoke.cold---of 3
jbd2_journal_set_revoke---of 6
jbd2_journal_switch_revoke_table---of 4
jbd2_journal_test_revoke---of 4
jbd2_journal_write_revoke_records---of 32

create_task_io_context75%of 8
exit_io_context---of 1
get_io_context---of 3
get_task_io_context---of 7
icq_free_icq_rcu---of 1
ioc_clear_queue---of 21
ioc_create_icq---of 17
ioc_create_icq.cold---of 1
ioc_destroy_icq---of 15
ioc_lookup_icq---of 26
ioc_release_fn---of 20
put_io_context---of 2
put_io_context.part.0---of 6
put_io_context_active---of 12

__ata_scsi_find_dev50%of 16
__ata_scsi_queuecmd49%of 37
ata_cmd_ioctl---of 23
ata_gen_passthru_sense---of 20
ata_msense_caching---of 8
ata_sas_scsi_ioctl---of 27
ata_scsi_add_hosts---of 7
ata_scsi_dev_config---of 24
ata_scsi_dev_rescan---of 8
ata_scsi_dma_need_drain---of 1
ata_scsi_find_dev84%of 6
ata_scsi_flush_xlat100%of 2
ata_scsi_handle_link_detach---of 10
ata_scsi_hotplug---of 5
ata_scsi_ioctl---of 1
ata_scsi_media_change_notify---of 2
ata_scsi_mode_select_xlat---of 69
ata_scsi_offline_dev---of 3
ata_scsi_park_show---of 13
ata_scsi_park_store---of 19
ata_scsi_pass_thru---of 68
ata_scsi_qc_complete---of 30
ata_scsi_qc_complete.cold---of 31
ata_scsi_queuecmd75%of 4
ata_scsi_rbuf_fill---of 3
ata_scsi_report_zones_complete---of 12
ata_scsi_rw_xlat27%of 38
ata_scsi_scan_host---of 25
ata_scsi_sdev_config---of 1
ata_scsi_security_inout_xlat---of 25
ata_scsi_set_sense---of 2
ata_scsi_set_sense_information---of 3
ata_scsi_simulate---of 45
ata_scsi_slave_config---of 3
ata_scsi_slave_destroy---of 5
ata_scsi_start_stop_xlat---of 18
ata_scsi_unlock_native_capacity---of 4
ata_scsi_user_scan---of 24
ata_scsi_var_len_cdb_xlat---of 4
ata_scsi_verify_xlat---of 29
ata_scsi_write_same_xlat---of 34
ata_scsi_zbc_in_xlat---of 21
ata_scsi_zbc_out_xlat---of 18
ata_scsiop_inq_00---of 2
ata_scsiop_inq_80---of 1
ata_scsiop_inq_83---of 3
ata_scsiop_inq_89---of 1
ata_scsiop_inq_b0---of 11
ata_scsiop_inq_b1---of 19
ata_scsiop_inq_b2---of 1
ata_scsiop_inq_b6---of 3
ata_scsiop_inq_std---of 14
ata_scsiop_maint_in---of 7
ata_scsiop_mode_sense---of 37
ata_scsiop_read_cap---of 36
ata_scsiop_report_luns---of 1
ata_std_bios_param---of 1
ata_task_ioctl---of 13
ata_to_sense_error---of 10
ata_to_sense_error.cold---of 1
atapi_qc_complete---of 27
atapi_sense_complete---of 4
atapi_xlat---of 17
get_order---of 1
modecpy---of 3
scsi_16_lba_len---of 1

___perf_sw_event---of 31
__do_sys_perf_event_open---of 216
__ia32_sys_perf_event_open---of 1
__perf_addr_filters_adjust---of 18
__perf_event__output_id_sample---of 12
__perf_event_account_interrupt---of 10
__perf_event_disable---of 9
__perf_event_enable---of 21
__perf_event_exit_context---of 11
__perf_event_header__init_id---of 18
__perf_event_init_context---of 1
__perf_event_output_stop---of 11
__perf_event_overflow---of 9
__perf_event_period---of 11
__perf_event_read---of 20
__perf_event_read_size---of 9
__perf_event_read_value---of 3
__perf_event_stop---of 5
__perf_event_task_sched_in---of 26
__perf_event_task_sched_out---of 111
__perf_install_in_context---of 15
__perf_pmu_output_stop---of 15
__perf_pmu_sched_task---of 11
__perf_read_group_add---of 25
__perf_remove_from_context---of 10
__perf_sw_event---of 3
__refcount_add.constprop.0---of 5
__x64_sys_perf_event_open---of 1
_free_event---of 48
_perf_event_disable---of 3
_perf_event_enable---of 7
_perf_event_period---of 8
_perf_event_reset---of 1
_perf_ioctl---of 134
calc_timer_values---of 12
cpu_clock_event_add---of 4
cpu_clock_event_del---of 3
cpu_clock_event_init---of 7
cpu_clock_event_read---of 1
cpu_clock_event_start---of 2
cpu_clock_event_stop---of 3
ctx_resched---of 15
ctx_sched_in.constprop.0---of 18
ctx_sched_out---of 36
event_function---of 18
event_function_call---of 15
event_function_local.constprop.0---of 25
event_sched_in---of 34
event_sched_out---of 30
exclusive_event_destroy---of 4
exclusive_event_installable---of 12
find_get_context---of 47
free_ctx---of 4
free_event---of 3
free_event_rcu---of 3
get_event_type---of 8
get_order---of 1
group_sched_out.part.0---of 7
inherit_event.constprop.0---of 24
inherit_task_group.isra.012%of 27
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_add_event---of 15
list_del_event---of 19
local_clock---of 1
merge_sched_in---of 51
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 11
perf_adjust_period---of 34
perf_bp_event---of 4
perf_callchain---of 9
perf_compat_ioctl---of 6
perf_copy_attr---of 45
perf_cpu_time_max_percent_handler---of 8
perf_cpu_time_max_percent_handler.cold---of 1
perf_duration_warn---of 1
perf_duration_warn.cold---of 1
perf_event__header_size---of 18
perf_event__id_header_size---of 12
perf_event__output_id_sample---of 2
perf_event_account_interrupt---of 1
perf_event_addr_filters_apply---of 24
perf_event_addr_filters_exec---of 11
perf_event_addr_filters_sync---of 7
perf_event_alloc.part.0---of 151
perf_event_attrs---of 4
perf_event_aux_event---of 8
perf_event_bpf_event---of 9
perf_event_bpf_output---of 9
perf_event_comm---of 8
perf_event_comm.cold---of 1
perf_event_comm_output---of 20
perf_event_create_kernel_counter---of 18
perf_event_ctx_lock_nested---of 33
perf_event_delayed_put---of 4
perf_event_disable---of 4
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 1
perf_event_enable---of 1
perf_event_exec---of 35
perf_event_exit_cpu---of 1
perf_event_exit_cpu_context---of 3
perf_event_exit_task---of 44
perf_event_for_each_child---of 5
perf_event_fork50%of 2
perf_event_free_task15%of 21
perf_event_get---of 5
perf_event_groups_insert---of 9
perf_event_header__init_id---of 2
perf_event_idx_default---of 1
perf_event_init_cpu---of 13
perf_event_init_task47%of 28
perf_event_itrace_started---of 1
perf_event_ksymbol---of 12
perf_event_ksymbol.cold---of 1
perf_event_ksymbol_output---of 13
perf_event_mmap---of 58
perf_event_mmap_output---of 27
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 9
perf_event_namespaces---of 2
perf_event_namespaces.part.0---of 1
perf_event_namespaces_output---of 16
perf_event_nop_int---of 1
perf_event_output---of 15
perf_event_output_backward---of 15
perf_event_output_forward---of 15
perf_event_overflow---of 1
perf_event_pause---of 8
perf_event_period---of 1
perf_event_read---of 23
perf_event_read_event---of 14
perf_event_read_local---of 19
perf_event_read_value---of 1
perf_event_refresh---of 4
perf_event_release_kernel---of 52
perf_event_sched_in.isra.0---of 3
perf_event_set_output---of 22
perf_event_set_state.part.0---of 11
perf_event_stop.isra.0---of 4
perf_event_switch_output---of 21
perf_event_sysfs_show---of 3
perf_event_task72%of 7
perf_event_task_disable---of 13
perf_event_task_enable---of 7
perf_event_task_output---of 31
perf_event_task_tick---of 36
perf_event_text_poke---of 4
perf_event_text_poke_output---of 22
perf_event_update_time---of 7
perf_event_update_userpage---of 25
perf_event_wakeup---of 3
perf_exclude_event---of 8
perf_fasync---of 1
perf_fill_ns_link_info---of 3
perf_get_aux_event---of 14
perf_get_event---of 4
perf_group_attach---of 10
perf_group_detach---of 40
perf_install_in_context---of 26
perf_ioctl---of 3
perf_iterate_ctx---of 14
perf_iterate_sb---of 37
perf_kprobe_event_init---of 8
perf_lock_task_context42%of 63
perf_log_itrace_start---of 17
perf_log_lost_samples---of 8
perf_log_throttle---of 12
perf_mmap---of 94
perf_mmap_close---of 67
perf_mmap_fault---of 30
perf_mmap_may_split---of 1
perf_mmap_open---of 4
perf_mux_hrtimer_handler---of 55
perf_mux_hrtimer_restart_ipi---of 5
perf_output_read---of 57
perf_output_sample---of 96
perf_output_sample_regs---of 3
perf_pending_event---of 12
perf_pmu_cancel_txn---of 3
perf_pmu_commit_txn---of 5
perf_pmu_disable---of 2
perf_pmu_enable---of 2
perf_pmu_migrate_context---of 24
perf_pmu_name---of 1
perf_pmu_nop_int---of 1
perf_pmu_register---of 52
perf_pmu_resched---of 4
perf_pmu_snapshot_aux---of 5
perf_pmu_start_txn---of 3
perf_pmu_unregister---of 9
perf_poll---of 10
perf_prepare_sample---of 94
perf_proc_update_handler---of 9
perf_read---of 35
perf_reboot---of 3
perf_register_guest_info_callbacks---of 5
perf_release---of 1
perf_remove_from_context---of 8
perf_sample_event_took---of 6
perf_sched_cb_dec---of 2
perf_sched_cb_inc---of 3
perf_sched_delayed---of 3
perf_swevent_add---of 14
perf_swevent_del---of 3
perf_swevent_event12%of 18
perf_swevent_get_recursion_context100%of 3
perf_swevent_hrtimer---of 8
perf_swevent_init---of 23
perf_swevent_put_recursion_context---of 1
perf_swevent_set_period---of 4
perf_swevent_start---of 1
perf_swevent_start_hrtimer.part.0---of 5
perf_swevent_stop---of 1
perf_tp_event23%of 45
perf_tp_event_init---of 5
perf_trace_run_bpf_submit50%of 4
perf_try_init_event---of 21
perf_unregister_guest_info_callbacks---of 5
perf_uprobe_event_init---of 6
pmu_dev_alloc---of 8
pmu_dev_is_visible---of 5
pmu_dev_release---of 1
put_ctx24%of 13
rb_free_rcu---of 1
ref_ctr_offset_show---of 1
remote_function---of 4
retprobe_show---of 1
ring_buffer_attach---of 15
ring_buffer_get---of 30
ring_buffer_put---of 7
ring_buffer_wakeup---of 22
sw_perf_event_destroy---of 5
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 4
task_clock_event_del---of 3
task_clock_event_init---of 7
task_clock_event_read---of 1
task_clock_event_start---of 2
task_clock_event_stop---of 3
task_function_call---of 6
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx---of 6
visit_groups_merge.constprop.0.isra.0---of 52

blk_queue_bounce8%of 83
bounce_end_io---of 20
bounce_end_io_read---of 3
bounce_end_io_read_isa---of 3
bounce_end_io_write---of 1
bounce_end_io_write_isa---of 1
copy_to_high_bio_irq---of 22
init_emergency_isa_pool---of 5
init_emergency_isa_pool.cold---of 6
mempool_alloc_pages_isa---of 1

__cancel_work---of 12
__cancel_work_timer---of 20
__flush_work---of 50
__queue_delayed_work62%of 13
__queue_work47%of 77
__queue_work.cold---of 1
__traceiter_workqueue_activate_work---of 4
__traceiter_workqueue_execute_end---of 4
__traceiter_workqueue_execute_start---of 4
__traceiter_workqueue_queue_work---of 4
alloc_unbound_pwq---of 32
alloc_workqueue---of 43
alloc_workqueue_attrs---of 3
apply_workqueue_attrs---of 1
apply_workqueue_attrs_locked---of 10
apply_wqattrs_cleanup.part.0---of 9
apply_wqattrs_commit---of 3
apply_wqattrs_prepare---of 25
bitmap_copy.constprop.0---of 1
cancel_delayed_work---of 1
cancel_delayed_work_sync---of 1
cancel_work---of 1
cancel_work_sync---of 1
check_flush_dependency---of 14
cpumask_weight.constprop.0---of 1
create_worker---of 11
current_is_workqueue_rescuer---of 6
current_work---of 6
cwt_wakefn---of 3
delayed_work_timer_fn---of 1
destroy_worker---of 10
destroy_workqueue---of 21
destroy_workqueue.cold---of 1
drain_workqueue---of 15
drain_workqueue.cold---of 1
execute_in_process_context---of 5
flush_delayed_work---of 5
flush_rcu_work---of 5
flush_work---of 1
flush_workqueue---of 50
flush_workqueue_prep_pwqs---of 20
free_workqueue_attrs---of 2
freeze_workqueues_begin---of 7
freeze_workqueues_busy---of 31
get_order---of 1
get_pwq67%of 6
get_work_pool70%of 13
idle_worker_timeout---of 7
init_pwq---of 3
init_rescuer---of 7
init_worker_pool---of 4
insert_work84%of 6
link_pwq---of 5
max_active_show---of 1
max_active_store---of 4
mod_delayed_work_on100%of 10
numa_pwq_tbl_install---of 7
per_cpu_show---of 1
perf_trace_workqueue_activate_work---of 6
perf_trace_workqueue_execute_end---of 6
perf_trace_workqueue_execute_start---of 6
perf_trace_workqueue_queue_work---of 6
pool_mayday_timeout---of 14
pr_cont_pool_info---of 3
pr_cont_work---of 5
print_worker_info---of 5
print_worker_info.cold---of 3
process_one_work---of 64
process_one_work.cold---of 1
put_pwq43%of 7
put_unbound_pool---of 30
pwq_activate_inactive_work---of 17
pwq_adjust_max_active---of 18
pwq_dec_nr_in_flight45%of 9
pwq_unbound_release_workfn---of 6
queue_delayed_work_on---of 7
queue_rcu_work---of 4
queue_work_node---of 14
queue_work_on100%of 7
rcu_free_pool---of 3
rcu_free_pwq---of 1
rcu_free_wq---of 7
rcu_work_rcufn---of 3
rescuer_thread---of 39
schedule_on_each_cpu---of 8
set_worker_desc---of 5
show_pwq---of 38
show_workqueue_state---of 6
show_workqueue_state.cold---of 40
thaw_workqueues---of 6
trace_event_raw_event_workqueue_activate_work---of 10
trace_event_raw_event_workqueue_execute_end---of 10
trace_event_raw_event_workqueue_execute_start---of 10
trace_event_raw_event_workqueue_queue_work---of 10
trace_raw_output_workqueue_activate_work---of 5
trace_raw_output_workqueue_execute_end---of 4
trace_raw_output_workqueue_execute_start---of 4
trace_raw_output_workqueue_queue_work---of 4
try_to_grab_pending---of 7
try_to_grab_pending.part.045%of 34
unbound_pwq_by_node---of 12
work_busy---of 21
work_for_cpu_fn---of 1
work_on_cpu---of 1
work_on_cpu_safe---of 3
worker_attach_to_pool---of 5
worker_detach_from_pool---of 4
worker_enter_idle---of 14
worker_pool_assign_id---of 7
worker_thread---of 53
workqueue_congested---of 18
workqueue_offline_cpu---of 17
workqueue_online_cpu---of 46
workqueue_prepare_cpu---of 5
workqueue_set_max_active---of 5
workqueue_set_unbound_cpumask---of 27
workqueue_sysfs_register---of 12
wq_barrier_func---of 1
wq_calc_node_cpumask---of 10
wq_calc_node_cpumask.cold---of 1
wq_clamp_max_active---of 7
wq_clamp_max_active.cold---of 1
wq_cpumask_show---of 1
wq_cpumask_store---of 9
wq_device_release---of 1
wq_nice_show---of 1
wq_nice_store---of 12
wq_numa_show---of 1
wq_numa_store---of 8
wq_pool_ids_show---of 16
wq_sysfs_prep_attrs---of 6
wq_unbound_cpumask_show---of 1
wq_unbound_cpumask_store---of 5
wq_update_unbound_numa---of 14
wq_update_unbound_numa.cold---of 1
wq_watchdog_param_set_thresh---of 9
wq_watchdog_timer_fn---of 35
wq_watchdog_timer_fn.cold---of 1
wq_watchdog_touch---of 3
wq_worker_comm---of 9
wq_worker_last_func---of 1
wq_worker_running---of 4
wq_worker_sleeping---of 8

bpf_iter_fini_seq_net---of 5
bpf_iter_init_seq_net---of 6
get_proc_task_net62%of 21
proc_create_net_data---of 3
proc_create_net_data_write---of 3
proc_create_net_single---of 3
proc_create_net_single_write---of 3
proc_net_ns_exit---of 1
proc_net_ns_init---of 9
proc_tgid_net_getattr---of 8
proc_tgid_net_lookup72%of 7
proc_tgid_net_readdir---of 9
seq_open_net---of 23
seq_release_net---of 6
single_open_net56%of 18
single_release_net---of 6

__ext4_find_entry44%of 53
__ext4_link---of 18
__ext4_read_dirblock.part.025%of 40
__ext4_unlink---of 33
add_dirent_to_buf59%of 17
do_split---of 85
dx_insert_block.isra.0---of 5
dx_probe---of 64
ext4_add_entry34%of 45
ext4_add_nondir63%of 8
ext4_append---of 12
ext4_create77%of 17
ext4_cross_rename---of 65
ext4_delete_entry---of 14
ext4_dirblock_csum_verify---of 14
ext4_dx_add_entry---of 63
ext4_dx_csum---of 13
ext4_dx_csum_set---of 13
ext4_dx_find_entry---of 16
ext4_empty_dir---of 38
ext4_find_dest_de77%of 13
ext4_find_entry---of 1
ext4_generic_delete_entry---of 14
ext4_get_parent---of 8
ext4_handle_dirty_dirblock72%of 14
ext4_htree_fill_tree---of 34
ext4_htree_next_block---of 18
ext4_inc_count---of 5
ext4_init_dot_dotdot---of 15
ext4_init_dot_dotdot.cold---of 1
ext4_init_new_dir---of 18
ext4_initialize_dirent_tail---of 3
ext4_insert_dentry82%of 11
ext4_link---of 7
ext4_lookup30%of 20
ext4_mkdir---of 32
ext4_mknod77%of 17
ext4_orphan_add60%of 22
ext4_orphan_del61%of 23
ext4_rec_len_to_disk.part.0---of 1
ext4_rename---of 144
ext4_rename2---of 8
ext4_rename_dir_finish---of 10
ext4_rename_dir_prepare---of 24
ext4_resetent---of 11
ext4_rmdir---of 39
ext4_search_dir73%of 11
ext4_setent.part.0---of 8
ext4_symlink---of 42
ext4_tmpfile---of 14
ext4_unlink---of 23
ext4_update_dir_count.isra.0---of 7
htree_dirblock_to_tree---of 20
make_indexed_dir---of 49

lockref_get100%of 1
lockref_get_not_dead100%of 3
lockref_get_not_zero---of 3
lockref_get_or_lock---of 3
lockref_mark_dead67%of 3
lockref_put_not_zero---of 3
lockref_put_or_lock100%of 3
lockref_put_return100%of 1

__add_wb_stat---of 1
__cancel_dirty_page27%of 19
__set_page_dirty_no_writeback72%of 7
__set_page_dirty_nobuffers---of 23
__test_set_page_writeback---of 51
__wb_calc_thresh---of 1
__wb_update_bandwidth.constprop.0---of 51
__writepage---of 7
account_page_cleaned---of 20
account_page_dirtied---of 29
account_page_redirty---of 13
balance_dirty_pages_ratelimited5%of 120
bdi_set_max_ratio---of 5
bdi_set_min_ratio---of 4
clear_page_dirty_for_io---of 38
dirty_background_bytes_handler---of 6
dirty_background_ratio_handler---of 4
dirty_bytes_handler---of 7
dirty_ratio_handler---of 5
dirty_writeback_centisecs_handler---of 6
do_writepages---of 9
domain_dirty_limits---of 19
generic_writepages---of 3
global_dirty_limits---of 1
global_dirtyable_memory---of 1
laptop_io_completion---of 1
laptop_mode_timer_fn---of 1
laptop_sync_completion---of 15
node_dirty_ok---of 10
page_writeback_cpu_online---of 1
redirty_page_for_writepage---of 1
set_page_dirty39%of 18
set_page_dirty_lock---of 5
tag_pages_for_writeback---of 22
test_clear_page_writeback---of 35
wait_for_stable_page---of 2
wait_on_page_writeback---of 13
wb_calc_thresh---of 1
wb_domain_init---of 1
wb_over_bg_thresh---of 3
wb_stat_error---of 1
wb_update_bandwidth---of 1
wb_writeout_inc---of 7
write_cache_pages---of 65
write_one_page---of 21
writeback_set_ratelimit---of 4
writeout_period---of 3

strncpy_from_user56%of 29

__do_sys_copy_file_range---of 22
__ia32_compat_sys_lseek---of 1
__ia32_compat_sys_preadv---of 1
__ia32_compat_sys_preadv2---of 4
__ia32_compat_sys_preadv64---of 9
__ia32_compat_sys_preadv64v2---of 1
__ia32_compat_sys_pwritev---of 1
__ia32_compat_sys_pwritev2---of 4
__ia32_compat_sys_pwritev64---of 9
__ia32_compat_sys_pwritev64v2---of 1
__ia32_compat_sys_sendfile---of 5
__ia32_compat_sys_sendfile64---of 5
__ia32_sys_copy_file_range---of 1
__ia32_sys_llseek---of 12
__ia32_sys_lseek---of 1
__ia32_sys_pread64---of 6
__ia32_sys_preadv---of 1
__ia32_sys_preadv2---of 1
__ia32_sys_pwrite64---of 6
__ia32_sys_pwritev---of 1
__ia32_sys_pwritev2---of 1
__ia32_sys_read---of 1
__ia32_sys_readv---of 1
__ia32_sys_sendfile---of 5
__ia32_sys_sendfile64---of 5
__ia32_sys_write---of 1
__ia32_sys_writev---of 1
__kernel_read---of 35
__kernel_read.cold---of 1
__kernel_write---of 35
__kernel_write.cold---of 1
__x64_sys_copy_file_range---of 1
__x64_sys_llseek---of 12
__x64_sys_lseek---of 1
__x64_sys_pread64---of 7
__x64_sys_preadv82%of 11
__x64_sys_preadv2---of 4
__x64_sys_pwrite64100%of 7
__x64_sys_pwritev---of 11
__x64_sys_pwritev2---of 4
__x64_sys_read---of 1
__x64_sys_readv---of 1
__x64_sys_sendfile---of 5
__x64_sys_sendfile6440%of 5
__x64_sys_write---of 1
__x64_sys_writev---of 1
default_llseek---of 11
do_iter_read46%of 24
do_iter_readv_writev70%of 33
do_iter_write60%of 22
do_preadv---of 11
do_pwritev---of 11
do_readv---of 13
do_sendfile31%of 59
do_writev---of 13
fixed_size_llseek---of 3
generic_copy_file_range---of 1
generic_file_llseek---of 1
generic_file_llseek_size---of 18
generic_file_rw_checks---of 7
generic_write_check_limits88%of 8
generic_write_checks73%of 11
kernel_read---of 3
kernel_write---of 22
ksys_lseek---of 10
ksys_pread64---of 7
ksys_pwrite64---of 7
ksys_read---of 12
ksys_write---of 12
new_sync_read---of 21
new_sync_write70%of 23
no_llseek---of 1
no_seek_end_llseek---of 3
no_seek_end_llseek_size---of 3
noop_llseek---of 1
rw_verify_area58%of 14
vfs_copy_file_range---of 67
vfs_iocb_iter_read---of 15
vfs_iocb_iter_write---of 14
vfs_iter_read---of 3
vfs_iter_write100%of 3
vfs_llseek---of 3
vfs_read---of 27
vfs_readv100%of 3
vfs_setpos---of 6
vfs_write61%of 46
vfs_writev---of 23

__alloc_disk_node---of 11
__alloc_disk_node.cold---of 1
__device_add_disk---of 49
__device_add_disk.cold---of 2
__disk_get_part50%of 14
__disk_unblock_events---of 12
base_probe---of 3
bdev_check_media_change---of 8
bdev_check_media_change.cold---of 1
bdev_read_only---of 4
bdevname---of 5
bdget_disk---of 3
blk_alloc_devt---of 16
blk_free_devt---of 3
blk_invalidate_devt---of 2
blk_lookup_devt---of 10
blk_register_region---of 1
blk_unregister_region---of 1
blkdev_show---of 5
block_devnode---of 3
del_gendisk---of 25
device_add_disk---of 1
device_add_disk_no_queue_reg---of 1
disk_alignment_offset_show---of 3
disk_badblocks_show---of 5
disk_badblocks_store---of 3
disk_block_events25%of 4
disk_capability_show---of 1
disk_check_events---of 15
disk_discard_alignment_show---of 3
disk_events_async_show---of 1
disk_events_poll_msecs_show---of 5
disk_events_poll_msecs_store---of 6
disk_events_set_dfl_poll_msecs---of 5
disk_events_show---of 9
disk_events_workfn---of 1
disk_expand_part_tbl---of 13
disk_ext_range_show---of 3
disk_flush_events25%of 4
disk_get_part---of 15
disk_has_partitions---of 27
disk_hidden_show---of 1
disk_map_sector_rcu34%of 42
disk_name---of 5
disk_name.part.0---of 3
disk_part_iter_exit---of 3
disk_part_iter_init---of 23
disk_part_iter_next---of 36
disk_range_show---of 1
disk_release---of 11
disk_removable_show---of 1
disk_ro_show---of 1
disk_seqf_next---of 3
disk_seqf_start---of 7
disk_seqf_stop---of 2
disk_unblock_events50%of 2
disk_visible---of 4
diskstats_show---of 12
exact_lock58%of 7
exact_match100%of 1
get_disk_and_module---of 7
get_gendisk39%of 18
invalidate_partition---of 6
part_fail_show---of 1
part_fail_store---of 4
part_in_flight---of 4
part_inflight_show---of 14
part_size_show---of 1
part_stat_read_all---of 5
part_stat_show---of 8
percpu_ref_tryget_live---of 20
put_disk---of 2
put_disk_and_module100%of 2
register_blkdev---of 14
register_blkdev.cold---of 2
set_capacity_and_notify---of 5
set_device_ro---of 1
set_disk_ro---of 10
show_partition---of 14
show_partition_start---of 6
unregister_blkdev---of 7

__kernfs_create_file---of 12
get_order---of 1
kernfs_drain_open_files---of 10
kernfs_fop_mmap---of 12
kernfs_fop_open---of 45
kernfs_fop_poll---of 9
kernfs_fop_read_iter---of 19
kernfs_fop_release---of 3
kernfs_fop_write_iter---of 20
kernfs_generic_poll---of 8
kernfs_notify72%of 7
kernfs_notify_workfn---of 18
kernfs_ops---of 5
kernfs_put_open_node---of 5
kernfs_release_file---of 5
kernfs_seq_next---of 7
kernfs_seq_show---of 1
kernfs_seq_start---of 8
kernfs_seq_stop---of 5
kernfs_vma_access---of 6
kernfs_vma_fault---of 7
kernfs_vma_get_policy---of 10
kernfs_vma_open---of 5
kernfs_vma_page_mkwrite---of 8
kernfs_vma_set_policy---of 6

__io_wq_cpu_online---of 37
create_io_worker65%of 14
create_worker_cb---of 5
create_worker_cont82%of 16
io_acct_cancel_pending_work.isra.058%of 19
io_flush_signals---of 5
io_init_new_worker100%of 3
io_queue_worker_create---of 27
io_task_work_match---of 5
io_task_worker_match---of 3
io_worker_cancel_cb---of 3
io_worker_handle_work---of 58
io_worker_release60%of 5
io_workqueue_create---of 2
io_wq_cancel_cb---of 24
io_wq_cpu_affinity---of 18
io_wq_cpu_offline---of 3
io_wq_cpu_online---of 3
io_wq_create---of 34
io_wq_enqueue100%of 1
io_wq_exit_start---of 1
io_wq_for_each_worker.isra.0---of 16
io_wq_hash_work---of 1
io_wq_max_workers---of 27
io_wq_put_and_exit---of 39
io_wq_work_match_all100%of 1
io_wq_work_match_item---of 1
io_wq_worker_cancel---of 9
io_wq_worker_running---of 4
io_wq_worker_sleeping---of 3
io_wq_worker_stopped---of 7
io_wq_worker_wake---of 4
io_wqe_activate_free_worker83%of 17
io_wqe_cancel_pending_work---of 5
io_wqe_dec_running---of 5
io_wqe_enqueue50%of 38
io_wqe_enqueue.cold---of 1
io_wqe_hash_wake---of 17
io_wqe_worker---of 39

__sg_alloc_table87%of 15
__sg_alloc_table_from_pages---of 51
__sg_free_table---of 11
__sg_page_iter_dma_next---of 11
__sg_page_iter_next---of 11
__sg_page_iter_start---of 1
get_order---of 1
sg_alloc_table---of 11
sg_alloc_table_from_pages---of 3
sg_copy_buffer---of 10
sg_copy_from_buffer---of 1
sg_copy_to_buffer---of 1
sg_free_table---of 9
sg_init_one---of 6
sg_init_table100%of 1
sg_kmalloc---of 5
sg_last---of 9
sg_miter_next---of 10
sg_miter_skip---of 10
sg_miter_start---of 3
sg_miter_stop---of 13
sg_nents---of 6
sg_nents_for_len---of 9
sg_next100%of 5
sg_pcopy_from_buffer---of 1
sg_pcopy_to_buffer---of 1
sg_zero_buffer---of 6
sgl_alloc---of 1
sgl_alloc_order---of 24
sgl_free---of 8
sgl_free_n_order---of 10
sgl_free_order---of 8

__add_to_page_cache_locked44%of 66
__delete_from_page_cache---of 10
__filemap_fdatawait_range46%of 11
__filemap_fdatawrite_range88%of 8
__filemap_set_wb_err---of 9
__generic_file_write_iter53%of 19
__lock_page---of 4
__lock_page_async---of 1
__lock_page_killable---of 4
__lock_page_or_retry---of 28
__page_cache_alloc15%of 20
__traceiter_file_check_and_advance_wb_err---of 4
__traceiter_filemap_set_wb_err---of 4
__traceiter_mm_filemap_add_to_page_cache---of 4
__traceiter_mm_filemap_delete_from_page_cache---of 4
__wait_on_page_locked_async---of 10
add_page_wait_queue---of 1
add_to_page_cache_locked---of 1
add_to_page_cache_lru54%of 15
delete_from_page_cache48%of 21
delete_from_page_cache_batch69%of 44
dio_warn_stale_pagecache---of 2
dio_warn_stale_pagecache.part.0---of 1
dio_warn_stale_pagecache.part.0.cold---of 1
do_read_cache_page---of 75
end_page_writeback---of 20
file_check_and_advance_wb_err17%of 12
file_fdatawait_range---of 1
file_write_and_wait_range50%of 6
filemap_check_errors50%of 6
filemap_fault---of 139
filemap_fdatawait_keep_errors---of 3
filemap_fdatawait_range---of 1
filemap_fdatawait_range_keep_errors---of 3
filemap_fdatawrite---of 8
filemap_fdatawrite_range---of 8
filemap_flush---of 8
filemap_map_pages---of 66
filemap_page_mkwrite---of 27
filemap_range_has_page---of 20
filemap_write_and_wait_range86%of 7
find_get_entries52%of 47
find_get_entry48%of 42
find_get_pages_contig---of 58
find_get_pages_range---of 51
find_get_pages_range_tag18%of 68
find_lock_entry8%of 27
generic_file_buffered_read---of 180
generic_file_direct_write---of 16
generic_file_mmap---of 5
generic_file_read_iter---of 13
generic_file_readonly_mmap---of 6
generic_file_write_iter72%of 7
generic_perform_write77%of 13
grab_cache_page_write_begin---of 4
page_cache_delete50%of 22
page_cache_next_miss---of 18
page_cache_prev_miss---of 18
page_endio---of 21
pagecache_get_page27%of 69
pagecache_write_begin---of 1
pagecache_write_end---of 1
perf_trace_file_check_and_advance_wb_err---of 9
perf_trace_filemap_set_wb_err---of 9
perf_trace_mm_filemap_op_page_cache---of 9
put_and_wait_on_page_locked---of 4
read_cache_page---of 1
read_cache_page_gfp---of 1
replace_page_cache_page---of 71
trace_event_raw_event_file_check_and_advance_wb_err---of 13
trace_event_raw_event_filemap_set_wb_err---of 13
trace_event_raw_event_mm_filemap_op_page_cache---of 13
trace_raw_output_file_check_and_advance_wb_err---of 4
trace_raw_output_filemap_set_wb_err---of 4
trace_raw_output_mm_filemap_op_page_cache---of 4
try_to_release_page67%of 12
unaccount_page_cache_page55%of 42
unlock_page50%of 8
wait_on_page_bit---of 1
wait_on_page_bit_common---of 54
wait_on_page_bit_killable---of 1
wake_page_function---of 9
wake_up_page_bit---of 6
xas_next_entry82%of 16

__percpu_counter_compare---of 5
__percpu_counter_init---of 3
__percpu_counter_sum---of 4
compute_batch_value---of 1
percpu_counter_add_batch100%of 4
percpu_counter_cpu_dead---of 3
percpu_counter_destroy---of 2
percpu_counter_set---of 4
percpu_counter_sync---of 1

ida_alloc_range37%of 49
ida_destroy---of 20
ida_free---of 15
idr_alloc72%of 7
idr_alloc_cyclic73%of 11
idr_alloc_u3278%of 9
idr_find100%of 1
idr_for_each---of 10
idr_get_next---of 4
idr_get_next_ul---of 16
idr_remove100%of 1
idr_replace100%of 4

__close_fd_get_file---of 11
__close_range---of 33
__f_unlock_pos---of 1
__fdget100%of 1
__fdget_pos---of 7
__fdget_raw---of 1
__fget_files60%of 27
__fget_light100%of 10
__get_unused_fd_flags---of 1
__ia32_sys_dup---of 6
__ia32_sys_dup2---of 25
__ia32_sys_dup3---of 1
__put_unused_fd50%of 8
__receive_fd---of 13
__x64_sys_dup---of 6
__x64_sys_dup2---of 25
__x64_sys_dup3---of 1
alloc_fd74%of 23
alloc_fd.cold---of 1
alloc_fdtable---of 10
close_fd100%of 3
close_fd_get_file---of 1
do_close_on_exec---of 16
do_dup2---of 20
dup_fd---of 38
exit_files100%of 2
expand_files12%of 36
f_dupfd---of 4
fd_install30%of 34
fget100%of 1
fget_many---of 1
fget_raw---of 1
fget_task---of 3
free_fdtable_rcu---of 1
get_close_on_exec---of 19
get_files_struct---of 3
get_unused_fd_flags100%of 1
iterate_fd---of 19
ksys_dup3---of 15
pick_file60%of 10
put_files_struct10%of 11
put_unused_fd100%of 1
replace_fd---of 8
set_close_on_exec---of 11
task_lookup_fd_rcu---of 9
task_lookup_next_fd_rcu---of 17

__key_link---of 8
__key_link_begin---of 12
__key_link_check_live_key---of 3
__key_link_end---of 8
__key_link_lock---of 5
__key_move_lock---of 8
find_key_to_update---of 10
find_keyring_by_name---of 22
key_default_cmp---of 1
key_free_user_ns---of 1
key_link---of 13
key_move---of 22
key_put_tag---of 9
key_remove_domain---of 2
key_set_index_key72%of 14
key_unlink---of 7
keyring_alloc---of 4
keyring_clear---of 8
keyring_compare_object100%of 5
keyring_describe86%of 7
keyring_destroy---of 8
keyring_detect_cycle---of 15
keyring_detect_cycle_iterator---of 4
keyring_diff_objects---of 11
keyring_free_object---of 1
keyring_gc---of 16
keyring_gc_check_iterator---of 7
keyring_gc_select_iterator---of 14
keyring_get_key_chunk25%of 12
keyring_get_object_key_chunk---of 12
keyring_instantiate---of 6
keyring_preparse---of 2
keyring_read---of 6
keyring_read_iterator---of 3
keyring_restrict---of 24
keyring_restriction_gc---of 4
keyring_revoke---of 4
keyring_search---of 21
keyring_search_iterator48%of 19
keyring_search_rcu75%of 12
restrict_link_reject---of 1
search_nested_keyrings77%of 52

address_val---of 12
bdev_name.constprop.024%of 17
bitmap_list_string.constprop.0---of 23
bitmap_string.constprop.0---of 18
bprintf---of 1
bstr_printf---of 60
check_pointer---of 11
clock.constprop.0---of 12
date_str---of 7
dentry_name---of 48
device_node_string.constprop.0.isra.0---of 19
enable_ptr_key_workfn---of 1
escaped_string---of 18
file_dentry_name---of 12
fill_random_ptr_key---of 1
flags_string---of 32
format_decode65%of 68
fwnode_full_name_string---of 5
fwnode_string---of 17
hex_string---of 19
ip4_addr_string---of 6
ip4_addr_string_sa---of 14
ip4_string---of 14
ip6_addr_string---of 10
ip6_addr_string_sa---of 22
ip6_compressed_string---of 30
ip6_string---of 5
ip_addr_string---of 34
mac_address_string---of 26
netdev_bits---of 21
num_to_str---of 13
number57%of 65
pointer6%of 50
ptr_to_hashval---of 3
ptr_to_id---of 23
put_dec75%of 4
put_dec_full8100%of 1
put_dec_trunc888%of 8
resource_string.isra.0---of 96
restricted_pointer---of 24
rtc_str.constprop.0---of 7
scnprintf---of 4
set_field_width---of 4
set_precision---of 4
simple_strntoull---of 6
simple_strtol---of 9
simple_strtoll---of 9
simple_strtoul---of 6
simple_strtoull---of 6
skip_atoi100%of 3
snprintf100%of 1
special_hex_number.constprop.0---of 1
sprintf100%of 1
sscanf---of 1
string53%of 17
symbol_string---of 13
time64_str.isra.0---of 1
time_and_date---of 22
time_str.constprop.0---of 5
uuid_string---of 24
va_format.constprop.0---of 14
vbin_printf---of 91
vscnprintf---of 4
vsnprintf49%of 106
vsprintf---of 1
vsscanf---of 157
widen_string88%of 16

alloc_fs_context57%of 25
fc_drop_locked---of 1
finish_clean_context---of 8
fs_context_for_mount100%of 1
fs_context_for_reconfigure---of 1
fs_context_for_submount---of 1
generic_parse_monolithic---of 9
legacy_fs_context_dup---of 6
legacy_fs_context_free75%of 4
legacy_get_tree67%of 6
legacy_init_fs_context100%of 3
legacy_parse_monolithic84%of 6
legacy_parse_monolithic.cold---of 1
legacy_parse_param20%of 26
legacy_reconfigure---of 6
logfc---of 12
logfc.cold---of 7
parse_monolithic_mount_data100%of 2
put_fs_context54%of 28
vfs_clean_context---of 5
vfs_dup_fs_context---of 21
vfs_parse_fs_param39%of 18
vfs_parse_fs_string100%of 5

__security_genfs_sid85%of 20
aurule_avc_callback---of 5
constraint_expr_eval---of 73
context_struct_compute_av---of 57
context_struct_compute_av.cold---of 1
context_struct_to_string---of 9
convert_context---of 27
convert_context.cold---of 3
dump_masked_av_helper---of 3
get_classes_callback---of 2
get_order---of 1
get_permissions_callback---of 2
security_bounded_transition---of 32
security_bounded_transition.cold---of 2
security_change_sid---of 6
security_compute_av---of 54
security_compute_av.cold---of 2
security_compute_av_user---of 31
security_compute_av_user.cold---of 2
security_compute_sid.part.047%of 101
security_compute_sid.part.0.cold---of 2
security_compute_validatetrans.part.0---of 36
security_compute_validatetrans.part.0.cold---of 3
security_compute_xperms_decision---of 60
security_compute_xperms_decision.cold---of 3
security_context_str_to_sid---of 1
security_context_to_sid---of 1
security_context_to_sid_core---of 47
security_context_to_sid_default---of 1
security_context_to_sid_force---of 1
security_dump_masked_av.constprop.0---of 16
security_fs_use38%of 48
security_genfs_sid55%of 22
security_get_allow_unknown---of 21
security_get_bool_value---of 23
security_get_bools---of 16
security_get_classes---of 8
security_get_initial_sid_context---of 4
security_get_permissions---of 12
security_get_permissions.cold---of 1
security_get_reject_unknown---of 21
security_get_user_sids---of 78
security_ib_endport_sid---of 39
security_ib_pkey_sid---of 40
security_load_policy---of 52
security_load_policy.cold---of 10
security_member_sid---of 6
security_mls_enabled---of 21
security_net_peersid_resolve---of 35
security_net_peersid_resolve.cold---of 2
security_netif_sid---of 38
security_netlbl_secattr_to_sid---of 47
security_netlbl_sid_to_secattr63%of 24
security_node_sid---of 47
security_policycap_supported---of 21
security_port_sid---of 40
security_read_policy---of 10
security_set_bools---of 17
security_sid_mls_copy---of 45
security_sid_mls_copy.cold---of 3
security_sid_to_context100%of 1
security_sid_to_context_core56%of 34
security_sid_to_context_core.cold---of 2
security_sid_to_context_force100%of 1
security_sid_to_context_inval---of 1
security_sidtab_hash_stats---of 20
security_sidtab_hash_stats.cold---of 1
security_transition_sid63%of 8
security_transition_sid_user---of 6
security_validate_transition---of 3
security_validate_transition_user---of 3
selinux_audit_rule_free---of 2
selinux_audit_rule_init---of 64
selinux_audit_rule_known---of 5
selinux_audit_rule_match---of 60
selinux_policy_cancel---of 8
selinux_policy_commit---of 35
selinux_policy_commit.cold---of 3
selinux_policy_genfs_sid---of 1
services_compute_xperms_decision---of 24
services_compute_xperms_decision.cold---of 2
services_compute_xperms_drivers---of 7
sidtab_entry_to_string40%of 5
string_to_context_struct---of 19
type_attribute_bounds_av---of 11

key_task_permission63%of 16
key_validate---of 5

__do_sys_uselib---of 25
__get_task_comm100%of 1
__ia32_compat_sys_execve---of 1
__ia32_compat_sys_execveat---of 1
__ia32_sys_execve---of 1
__ia32_sys_execveat---of 1
__ia32_sys_uselib---of 1
__register_binfmt---of 8
__set_task_comm---of 10
__x64_sys_execve---of 1
__x64_sys_execveat---of 1
__x64_sys_uselib---of 1
acct_arg_size---of 3
alloc_bprm---of 23
begin_new_exec---of 100
bprm_change_interp---of 4
bprm_execve---of 105
copy_string_kernel---of 16
copy_string_kernel.cold---of 1
copy_strings.isra.0---of 30
copy_strings_kernel---of 6
count.constprop.0---of 9
count_strings_kernel.part.0---of 6
do_execveat_common---of 21
do_execveat_common.cold---of 1
do_open_execat---of 24
finalize_exec---of 1
free_bprm---of 11
get_arg_page---of 6
get_user_arg_ptr.isra.0---of 6
kernel_execve---of 19
open_exec---of 4
path_noexec---of 3
remove_arg_zero---of 16
set_binfmt---of 4
set_dumpable---of 4
setup_arg_pages---of 21
setup_arg_pages.cold---of 1
setup_new_exec---of 5
shift_arg_pages---of 14
unregister_binfmt---of 1
would_dump---of 10

__isolate_lru_page---of 30
__node_reclaim---of 29
__remove_mapping---of 37
__traceiter_mm_shrink_slab_end---of 4
__traceiter_mm_shrink_slab_start---of 4
__traceiter_mm_vmscan_direct_reclaim_begin---of 4
__traceiter_mm_vmscan_direct_reclaim_end---of 4
__traceiter_mm_vmscan_inactive_list_is_low---of 4
__traceiter_mm_vmscan_kswapd_sleep---of 4
__traceiter_mm_vmscan_kswapd_wake---of 4
__traceiter_mm_vmscan_lru_isolate---of 4
__traceiter_mm_vmscan_lru_shrink_active---of 4
__traceiter_mm_vmscan_lru_shrink_inactive---of 4
__traceiter_mm_vmscan_memcg_reclaim_begin---of 4
__traceiter_mm_vmscan_memcg_reclaim_end---of 4
__traceiter_mm_vmscan_memcg_softlimit_reclaim_begin---of 4
__traceiter_mm_vmscan_memcg_softlimit_reclaim_end---of 4
__traceiter_mm_vmscan_node_reclaim_begin---of 4
__traceiter_mm_vmscan_node_reclaim_end---of 4
__traceiter_mm_vmscan_wakeup_kswapd---of 4
__traceiter_mm_vmscan_writepage---of 4
allow_direct_reclaim.part.0---of 11
balance_pgdat---of 83
check_move_unevictable_pages---of 25
do_shrink_slab---of 37
do_shrink_slab.cold---of 1
do_try_to_free_pages---of 77
drop_slab---of 2
drop_slab_node---of 5
free_prealloced_shrinker---of 4
inactive_is_low---of 9
isolate_lru_page---of 32
isolate_lru_pages---of 36
kswapd---of 49
kswapd_run---of 7
kswapd_run.cold---of 1
kswapd_stop---of 2
lruvec_lru_size---of 8
mem_cgroup_shrink_node---of 28
move_pages_to_lru.isra.0---of 48
node_pagecache_reclaimable---of 8
node_reclaim---of 10
page_evictable---of 18
perf_trace_mm_shrink_slab_end---of 6
perf_trace_mm_shrink_slab_start---of 6
perf_trace_mm_vmscan_direct_reclaim_begin_template---of 6
perf_trace_mm_vmscan_direct_reclaim_end_template---of 6
perf_trace_mm_vmscan_inactive_list_is_low---of 7
perf_trace_mm_vmscan_kswapd_sleep---of 6
perf_trace_mm_vmscan_kswapd_wake---of 6
perf_trace_mm_vmscan_lru_isolate---of 6
perf_trace_mm_vmscan_lru_shrink_active---of 7
perf_trace_mm_vmscan_lru_shrink_inactive---of 7
perf_trace_mm_vmscan_node_reclaim_begin---of 6
perf_trace_mm_vmscan_wakeup_kswapd---of 6
perf_trace_mm_vmscan_writepage---of 9
pgdat_balanced---of 7
prealloc_shrinker62%of 13
prepare_kswapd_sleep---of 11
putback_lru_page---of 6
reclaim_clean_pages_from_list---of 18
reclaim_pages---of 31
register_shrinker---of 6
register_shrinker_prepared100%of 3
remove_mapping---of 8
set_task_reclaim_state---of 6
shrink_active_list---of 47
shrink_all_memory---of 1
shrink_inactive_list---of 37
shrink_lruvec---of 64
shrink_node---of 99
shrink_page_list---of 302
shrink_slab---of 32
throttle_direct_reclaim---of 41
trace_event_raw_event_mm_shrink_slab_end---of 10
trace_event_raw_event_mm_shrink_slab_start---of 10
trace_event_raw_event_mm_vmscan_direct_reclaim_begin_template---of 10
trace_event_raw_event_mm_vmscan_direct_reclaim_end_template---of 10
trace_event_raw_event_mm_vmscan_inactive_list_is_low---of 11
trace_event_raw_event_mm_vmscan_kswapd_sleep---of 10
trace_event_raw_event_mm_vmscan_kswapd_wake---of 10
trace_event_raw_event_mm_vmscan_lru_isolate---of 10
trace_event_raw_event_mm_vmscan_lru_shrink_active---of 11
trace_event_raw_event_mm_vmscan_lru_shrink_inactive---of 11
trace_event_raw_event_mm_vmscan_node_reclaim_begin---of 10
trace_event_raw_event_mm_vmscan_wakeup_kswapd---of 10
trace_event_raw_event_mm_vmscan_writepage---of 13
trace_raw_output_mm_shrink_slab_end---of 4
trace_raw_output_mm_shrink_slab_start---of 6
trace_raw_output_mm_vmscan_direct_reclaim_begin_template---of 7
trace_raw_output_mm_vmscan_direct_reclaim_end_template---of 5
trace_raw_output_mm_vmscan_inactive_list_is_low---of 6
trace_raw_output_mm_vmscan_kswapd_sleep---of 5
trace_raw_output_mm_vmscan_kswapd_wake---of 4
trace_raw_output_mm_vmscan_lru_isolate---of 4
trace_raw_output_mm_vmscan_lru_shrink_active---of 6
trace_raw_output_mm_vmscan_lru_shrink_inactive---of 6
trace_raw_output_mm_vmscan_node_reclaim_begin---of 6
trace_raw_output_mm_vmscan_wakeup_kswapd---of 6
trace_raw_output_mm_vmscan_writepage---of 6
try_to_free_mem_cgroup_pages---of 24
try_to_free_pages---of 28
unregister_memcg_shrinker.isra.0---of 3
unregister_shrinker---of 4
wakeup_kswapd---of 29
zone_reclaimable_pages---of 20

symcmp---of 1
symhash100%of 3
symtab_init---of 1
symtab_insert---of 12
symtab_search72%of 7

__list_lru_init60%of 22
__list_lru_walk_one---of 23
__memcg_init_list_lru_node67%of 6
kvfree_rcu_local---of 1
list_lru_add65%of 20
list_lru_count_node---of 1
list_lru_count_one---of 25
list_lru_del---of 18
list_lru_destroy---of 7
list_lru_isolate---of 1
list_lru_isolate_move---of 1
list_lru_walk_node---of 6
list_lru_walk_one---of 3
list_lru_walk_one_irq---of 3
memcg_cancel_update_list_lru_node---of 7
memcg_drain_all_list_lrus---of 30
memcg_update_all_list_lrus---of 27

get_user_session_keyring_rcu---of 4
install_process_keyring_to_cred---of 6
install_session_keyring_to_cred---of 11
install_thread_keyring_to_cred---of 6
join_session_keyring---of 17
key_change_session_keyring---of 24
key_fsgid_changed---of 2
key_fsuid_changed---of 2
look_up_user_keyrings---of 33
lookup_user_key---of 96
lookup_user_key_possessed100%of 1
search_cred_keyrings_rcu50%of 16
search_process_keyrings_rcu---of 11

ext4_buffered_write_iter---of 13
ext4_dio_write_end_io71%of 27
ext4_file_mmap---of 7
ext4_file_open28%of 33
ext4_file_read_iter65%of 14
ext4_file_write_iter37%of 90
ext4_generic_write_checks42%of 12
ext4_llseek---of 10
ext4_release_file---of 11

idr_callback---of 4
idr_callback.cold---of 1
inotify_free_event---of 1
inotify_free_group_priv---of 2
inotify_free_mark---of 1
inotify_freeing_mark---of 1
inotify_handle_inode_event60%of 20
inotify_merge50%of 6

__cleanup_sighand60%of 5
__delayed_free_task---of 1
__do_sys_clone---of 1
__do_sys_clone3---of 19
__ia32_sys_clone---of 1
__ia32_sys_clone3---of 1
__ia32_sys_fork---of 1
__ia32_sys_set_tid_address---of 1
__ia32_sys_unshare---of 1
__mmdrop---of 16
__mmdrop.cold---of 1
__pidfd_prepare---of 12
__put_task_struct---of 21
__put_task_struct_rcu_cb---of 1
__refcount_add.constprop.060%of 5
__traceiter_task_newtask---of 4
__traceiter_task_rename---of 4
__x64_sys_clone---of 1
__x64_sys_clone3---of 1
__x64_sys_set_tid_address---of 1
__x64_sys_unshare---of 1
__x64_sys_vfork---of 1
copy_clone_args_from_user---of 21
copy_process48%of 275
create_io_thread100%of 1
exec_mm_release---of 1
exit_mm_release---of 1
free_task---of 7
get_mm_exe_file60%of 22
get_task_exe_file---of 4
get_task_mm---of 4
kernel_clone---of 45
kernel_thread---of 1
ksys_unshare---of 53
lockdep_tasklist_lock_is_held100%of 1
mm_access---of 8
mm_alloc---of 3
mm_init---of 12
mm_release---of 12
mmdrop_async---of 2
mmdrop_async_fn---of 1
mmput10%of 11
mmput_async---of 2
mmput_async_fn---of 10
nr_processes---of 3
perf_trace_task_newtask---of 6
perf_trace_task_rename---of 6
pidfd_pid---of 4
pidfd_poll---of 6
pidfd_prepare---of 4
pidfd_release---of 1
pidfd_show_fdinfo---of 6
ptrace_event_pid---of 24
put_task_stack54%of 13
set_mm_exe_file---of 4
set_task_stack_end_magic---of 1
sighand_ctor---of 1
sysctl_max_threads---of 4
trace_event_raw_event_task_newtask---of 10
trace_event_raw_event_task_rename---of 10
trace_raw_output_task_newtask---of 4
trace_raw_output_task_rename---of 4
unshare_fd---of 6
unshare_files---of 7
vm_area_alloc---of 3
vm_area_dup---of 3
vm_area_free---of 1
walk_process_tree---of 12

ioport_map---of 4
ioread16---of 8
ioread16_rep---of 7
ioread16be---of 8
ioread32---of 8
ioread32_rep---of 7
ioread32be---of 8
ioread64_hi_lo---of 8
ioread64_lo_hi---of 8
ioread64be_hi_lo---of 8
ioread64be_lo_hi---of 8
ioread850%of 8
ioread8_rep---of 7
iowrite16---of 6
iowrite16_rep---of 7
iowrite16be---of 6
iowrite3250%of 6
iowrite32_rep---of 7
iowrite32be---of 6
iowrite64_hi_lo---of 6
iowrite64_lo_hi---of 6
iowrite64be_hi_lo---of 6
iowrite64be_lo_hi---of 6
iowrite850%of 6
iowrite8_rep---of 7
pci_iounmap---of 5